Diffstat (limited to 'src/core')
-rw-r--r--  src/core/AccessWindowAutoPadding.cpp | 16
-rw-r--r--  src/core/AccessWindowAutoPadding.h | 9
-rw-r--r--  src/core/AccessWindowStatic.cpp | 45
-rw-r--r--  src/core/AccessWindowStatic.h | 9
-rw-r--r--  src/core/AccessWindowTranspose.cpp | 54
-rw-r--r--  src/core/AccessWindowTranspose.h | 5
-rw-r--r--  src/core/CL/CLCommandBuffer.cpp | 66
-rw-r--r--  src/core/CL/CLCommandBuffer.h | 163
-rw-r--r--  src/core/CL/CLCompatCommandBuffer.cpp | 108
-rw-r--r--  src/core/CL/CLCompatCommandBuffer.h | 94
-rw-r--r--  src/core/CL/CLCompileContext.cpp | 135
-rw-r--r--  src/core/CL/CLHelpers.cpp | 158
-rw-r--r--  src/core/CL/CLKernelLibrary.cpp | 45
-rw-r--r--  src/core/CL/CLKernels.h | 28
-rw-r--r--  src/core/CL/CLMutableCommandBuffer.cpp | 151
-rw-r--r--  src/core/CL/CLMutableCommandBuffer.h | 85
-rw-r--r--  src/core/CL/CLUtils.cpp | 81
-rw-r--r--  src/core/CL/CLUtils.h | 47
-rw-r--r--  src/core/CL/CLValidate.h | 18
-rw-r--r--  src/core/CL/DefaultLWSHeuristics.cpp | 127
-rw-r--r--  src/core/CL/DefaultLWSHeuristics.h | 34
-rw-r--r--  src/core/CL/ICLKernel.cpp | 114
-rw-r--r--  src/core/CL/ICLKernel.h | 178
-rw-r--r--  src/core/CL/ICLSimple2DKernel.cpp | 3
-rw-r--r--  src/core/CL/ICLSimple2DKernel.h | 2
-rw-r--r--  src/core/CL/ICLSimple3DKernel.cpp | 3
-rw-r--r--  src/core/CL/ICLSimple3DKernel.h | 2
-rw-r--r--  src/core/CL/ICLSimpleKernel.cpp | 17
-rw-r--r--  src/core/CL/ICLSimpleKernel.h | 9
-rw-r--r--  src/core/CL/ICLTensor.cpp | 3
-rw-r--r--  src/core/CL/OpenCL.cpp | 686
-rw-r--r--  src/core/CL/cl_kernels/activation_float_helpers.h | 16
-rw-r--r--  src/core/CL/cl_kernels/activation_layer.cl | 85
-rw-r--r--  src/core/CL/cl_kernels/activation_layer_quant.cl | 162
-rw-r--r--  src/core/CL/cl_kernels/activation_quant_helpers.h | 24
-rw-r--r--  src/core/CL/cl_kernels/arg_min_max.cl | 451
-rw-r--r--  src/core/CL/cl_kernels/batch_to_space.cl | 232
-rw-r--r--  src/core/CL/cl_kernels/batchnormalization_layer.cl | 418
-rw-r--r--  src/core/CL/cl_kernels/bitwise_op.cl | 218
-rw-r--r--  src/core/CL/cl_kernels/channel_shuffle.cl | 222
-rw-r--r--  src/core/CL/cl_kernels/col2im.cl | 111
-rw-r--r--  src/core/CL/cl_kernels/common/activation_layer.cl | 85
-rw-r--r--  src/core/CL/cl_kernels/common/activation_layer_quant.cl | 162
-rw-r--r--  src/core/CL/cl_kernels/common/arg_min_max.cl | 388
-rw-r--r--  src/core/CL/cl_kernels/common/batchnormalization_layer.cl | 183
-rw-r--r--  src/core/CL/cl_kernels/common/bitwise_op.cl | 218
-rw-r--r--  src/core/CL/cl_kernels/common/bounding_box_transform.cl (renamed from src/core/CL/cl_kernels/bounding_box_transform.cl) | 0
-rw-r--r--  src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl (renamed from src/core/CL/cl_kernels/bounding_box_transform_quantized.cl) | 0
-rw-r--r--  src/core/CL/cl_kernels/common/cast.cl (renamed from src/core/CL/cl_kernels/cast.cl) | 0
-rw-r--r--  src/core/CL/cl_kernels/common/col2im.cl | 111
-rw-r--r--  src/core/CL/cl_kernels/common/comparisons.cl | 123
-rw-r--r--  src/core/CL/cl_kernels/common/concatenate.cl | 425
-rw-r--r--  src/core/CL/cl_kernels/common/convert_fc_weights.cl | 58
-rw-r--r--  src/core/CL/cl_kernels/common/convolution_layer.cl | 112
-rw-r--r--  src/core/CL/cl_kernels/common/copy_tensor.cl | 72
-rw-r--r--  src/core/CL/cl_kernels/common/crop_tensor.cl (renamed from src/core/CL/cl_kernels/crop_tensor.cl) | 0
-rw-r--r--  src/core/CL/cl_kernels/common/deconvolution_layer.cl | 130
-rw-r--r--  src/core/CL/cl_kernels/common/dequantization_layer.cl | 90
-rw-r--r--  src/core/CL/cl_kernels/common/elementwise_operation.cl | 146
-rw-r--r--  src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl | 138
-rw-r--r--  src/core/CL/cl_kernels/common/elementwise_unary.cl | 92
-rw-r--r--  src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl | 77
-rw-r--r--  src/core/CL/cl_kernels/common/fft.cl | 1880
-rw-r--r--  src/core/CL/cl_kernels/common/fft_digit_reverse.cl | 154
-rw-r--r--  src/core/CL/cl_kernels/common/fft_scale.cl | 81
-rw-r--r--  src/core/CL/cl_kernels/common/fill_border.cl | 165
-rw-r--r--  src/core/CL/cl_kernels/common/floor.cl (renamed from src/core/CL/cl_kernels/floor.cl) | 0
-rw-r--r--  src/core/CL/cl_kernels/common/gather.cl | 130
-rw-r--r--  src/core/CL/cl_kernels/common/gemm.cl | 3594
-rw-r--r--  src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl | 556
-rw-r--r--  src/core/CL/cl_kernels/common/gemm_utils.cl | 458
-rw-r--r--  src/core/CL/cl_kernels/common/gemmlowp.cl | 2162
-rw-r--r--  src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl | 309
-rw-r--r--  src/core/CL/cl_kernels/common/gemv.cl | 200
-rw-r--r--  src/core/CL/cl_kernels/common/generate_proposals.cl | 86
-rw-r--r--  src/core/CL/cl_kernels/common/generate_proposals_quantized.cl | 87
-rw-r--r--  src/core/CL/cl_kernels/common/instance_normalization.cl | 254
-rw-r--r--  src/core/CL/cl_kernels/common/l2_normalize.cl (renamed from src/core/CL/cl_kernels/l2_normalize.cl) | 0
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul.cl | 708
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul_mmul.cl | 946
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul_quantized.cl | 833
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl | 832
-rw-r--r--  src/core/CL/cl_kernels/common/mean_stddev_normalization.cl | 128
-rw-r--r--  src/core/CL/cl_kernels/common/memset.cl | 67
-rw-r--r--  src/core/CL/cl_kernels/common/minmax_layer.cl | 101
-rw-r--r--  src/core/CL/cl_kernels/common/nonmax.cl | 70
-rw-r--r--  src/core/CL/cl_kernels/common/pad_layer.cl | 263
-rw-r--r--  src/core/CL/cl_kernels/common/permute.cl | 74
-rw-r--r--  src/core/CL/cl_kernels/common/pixelwise_mul_float.cl | 179
-rw-r--r--  src/core/CL/cl_kernels/common/pixelwise_mul_int.cl | 203
-rw-r--r--  src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl | 260
-rw-r--r--  src/core/CL/cl_kernels/common/quantization_layer.cl | 108
-rw-r--r--  src/core/CL/cl_kernels/common/range.cl (renamed from src/core/CL/cl_kernels/range.cl) | 0
-rw-r--r--  src/core/CL/cl_kernels/common/reduction_operation.cl | 471
-rw-r--r--  src/core/CL/cl_kernels/common/reshape_layer.cl | 70
-rw-r--r--  src/core/CL/cl_kernels/common/reverse.cl | 108
-rw-r--r--  src/core/CL/cl_kernels/common/roi_align_layer.cl | 200
-rw-r--r--  src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl | 206
-rw-r--r--  src/core/CL/cl_kernels/common/roi_pooling_layer.cl (renamed from src/core/CL/cl_kernels/roi_pooling_layer.cl) | 0
-rw-r--r--  src/core/CL/cl_kernels/common/scatter.cl | 173
-rw-r--r--  src/core/CL/cl_kernels/common/select.cl (renamed from src/core/CL/cl_kernels/select.cl) | 0
-rw-r--r--  src/core/CL/cl_kernels/common/slice_ops.cl | 135
-rw-r--r--  src/core/CL/cl_kernels/common/softmax_layer.cl | 371
-rw-r--r--  src/core/CL/cl_kernels/common/stack_layer.cl | 113
-rw-r--r--  src/core/CL/cl_kernels/common/tile.cl | 94
-rw-r--r--  src/core/CL/cl_kernels/common/transpose.cl | 245
-rw-r--r--  src/core/CL/cl_kernels/common/unpooling_layer.cl | 72
-rw-r--r--  src/core/CL/cl_kernels/comparisons.cl | 150
-rw-r--r--  src/core/CL/cl_kernels/concatenate.cl | 415
-rw-r--r--  src/core/CL/cl_kernels/convert_fc_weights.cl | 58
-rw-r--r--  src/core/CL/cl_kernels/convolution_layer.cl | 112
-rw-r--r--  src/core/CL/cl_kernels/copy_tensor.cl | 72
-rw-r--r--  src/core/CL/cl_kernels/deconvolution_layer.cl | 130
-rw-r--r--  src/core/CL/cl_kernels/depth_to_space.cl | 111
-rw-r--r--  src/core/CL/cl_kernels/depthwise_convolution.cl | 1781
-rw-r--r--  src/core/CL/cl_kernels/depthwise_convolution_quantized.cl | 961
-rw-r--r--  src/core/CL/cl_kernels/dequantization_layer.cl | 212
-rw-r--r--  src/core/CL/cl_kernels/direct_convolution.cl | 276
-rw-r--r--  src/core/CL/cl_kernels/direct_convolution1x1.cl | 316
-rw-r--r--  src/core/CL/cl_kernels/direct_convolution3x3.cl | 291
-rw-r--r--  src/core/CL/cl_kernels/direct_convolution5x5.cl | 313
-rw-r--r--  src/core/CL/cl_kernels/direct_convolution_quantized.cl | 308
-rw-r--r--  src/core/CL/cl_kernels/elementwise_operation.cl | 136
-rw-r--r--  src/core/CL/cl_kernels/elementwise_operation_quantized.cl | 123
-rw-r--r--  src/core/CL/cl_kernels/elementwise_unary.cl | 93
-rw-r--r--  src/core/CL/cl_kernels/fft.cl | 1880
-rw-r--r--  src/core/CL/cl_kernels/fft_digit_reverse.cl | 154
-rw-r--r--  src/core/CL/cl_kernels/fft_scale.cl | 81
-rw-r--r--  src/core/CL/cl_kernels/fill_border.cl | 165
-rw-r--r--  src/core/CL/cl_kernels/gather.cl | 91
-rw-r--r--  src/core/CL/cl_kernels/gemm.cl | 4386
-rw-r--r--  src/core/CL/cl_kernels/gemm_helpers.h | 481
-rw-r--r--  src/core/CL/cl_kernels/gemm_v1.cl | 3243
-rw-r--r--  src/core/CL/cl_kernels/gemmlowp.cl | 2316
-rw-r--r--  src/core/CL/cl_kernels/gemv.cl | 200
-rw-r--r--  src/core/CL/cl_kernels/generate_proposals.cl | 88
-rw-r--r--  src/core/CL/cl_kernels/generate_proposals_quantized.cl | 87
-rw-r--r--  src/core/CL/cl_kernels/helpers.h | 900
-rw-r--r--  src/core/CL/cl_kernels/helpers_asymm.h | 337
-rw-r--r--  src/core/CL/cl_kernels/im2col.cl | 1360
-rw-r--r--  src/core/CL/cl_kernels/instance_normalization.cl | 254
-rw-r--r--  src/core/CL/cl_kernels/load_store_utility.h | 84
-rw-r--r--  src/core/CL/cl_kernels/mean_stddev_normalization.cl | 118
-rw-r--r--  src/core/CL/cl_kernels/memset.cl | 67
-rw-r--r--  src/core/CL/cl_kernels/minmax_layer.cl | 101
-rw-r--r--  src/core/CL/cl_kernels/nchw/batch_to_space.cl | 131
-rw-r--r--  src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl | 147
-rw-r--r--  src/core/CL/cl_kernels/nchw/channel_shuffle.cl | 103
-rw-r--r--  src/core/CL/cl_kernels/nchw/depth_to_space.cl | 69
-rw-r--r--  src/core/CL/cl_kernels/nchw/dequantization_layer.cl | 86
-rw-r--r--  src/core/CL/cl_kernels/nchw/direct_convolution.cl | 147
-rw-r--r--  src/core/CL/cl_kernels/nchw/im2col.cl | 863
-rw-r--r--  src/core/CL/cl_kernels/nchw/normalization_layer.cl | 174
-rw-r--r--  src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl | 82
-rw-r--r--  src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl | 101
-rw-r--r--  src/core/CL/cl_kernels/nchw/pooling_layer.cl | 285
-rw-r--r--  src/core/CL/cl_kernels/nchw/prior_box_layer.cl | 139
-rw-r--r--  src/core/CL/cl_kernels/nchw/reorg_layer.cl | 75
-rw-r--r--  src/core/CL/cl_kernels/nchw/scale.cl | 271
-rw-r--r--  src/core/CL/cl_kernels/nchw/space_to_batch.cl | 156
-rw-r--r--  src/core/CL/cl_kernels/nchw/space_to_depth.cl | 69
-rw-r--r--  src/core/CL/cl_kernels/nchw/upsample_layer.cl | 79
-rw-r--r--  src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl | 911
-rw-r--r--  src/core/CL/cl_kernels/nchw/winograd_input_transform.cl | 1346
-rw-r--r--  src/core/CL/cl_kernels/nchw/winograd_output_transform.cl | 1082
-rw-r--r--  src/core/CL/cl_kernels/nhwc/batch_to_space.cl | 131
-rw-r--r--  src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl | 146
-rw-r--r--  src/core/CL/cl_kernels/nhwc/channel_shuffle.cl | 160
-rw-r--r--  src/core/CL/cl_kernels/nhwc/depth_to_space.cl | 69
-rw-r--r--  src/core/CL/cl_kernels/nhwc/dequantization_layer.cl | 87
-rw-r--r--  src/core/CL/cl_kernels/nhwc/direct_convolution.cl | 295
-rw-r--r--  src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl | 281
-rw-r--r--  src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl | 211
-rw-r--r--  src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl | 275
-rw-r--r--  src/core/CL/cl_kernels/nhwc/im2col.cl | 526
-rw-r--r--  src/core/CL/cl_kernels/nhwc/indirect_convolution.cl | 305
-rw-r--r--  src/core/CL/cl_kernels/nhwc/normalization_layer.cl | 177
-rw-r--r--  src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl | 81
-rw-r--r--  src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl | 96
-rw-r--r--  src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl | 197
-rw-r--r--  src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl | 185
-rw-r--r--  src/core/CL/cl_kernels/nhwc/pooling_layer.cl | 364
-rw-r--r--  src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl | 164
-rw-r--r--  src/core/CL/cl_kernels/nhwc/reorg_layer.cl | 76
-rw-r--r--  src/core/CL/cl_kernels/nhwc/scale.cl | 245
-rw-r--r--  src/core/CL/cl_kernels/nhwc/space_to_batch.cl | 155
-rw-r--r--  src/core/CL/cl_kernels/nhwc/space_to_depth.cl | 69
-rw-r--r--  src/core/CL/cl_kernels/nhwc/transposed_convolution.cl | 297
-rw-r--r--  src/core/CL/cl_kernels/nhwc/upsample_layer.cl | 80
-rw-r--r--  src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl | 1107
-rw-r--r--  src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl | 1050
-rw-r--r--  src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl | 1109
-rw-r--r--  src/core/CL/cl_kernels/nonmax.cl | 70
-rw-r--r--  src/core/CL/cl_kernels/normalization_layer.cl | 319
-rw-r--r--  src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl | 134
-rw-r--r--  src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl | 166
-rw-r--r--  src/core/CL/cl_kernels/pad_layer.cl | 263
-rw-r--r--  src/core/CL/cl_kernels/permute.cl | 74
-rw-r--r--  src/core/CL/cl_kernels/pixelwise_mul_float.cl | 168
-rw-r--r--  src/core/CL/cl_kernels/pixelwise_mul_int.cl | 181
-rw-r--r--  src/core/CL/cl_kernels/pooling_layer.cl | 971
-rw-r--r--  src/core/CL/cl_kernels/pooling_layer_quantized.cl | 266
-rw-r--r--  src/core/CL/cl_kernels/prior_box_layer.cl | 139
-rw-r--r--  src/core/CL/cl_kernels/qlstm_layer_normalization.cl | 260
-rw-r--r--  src/core/CL/cl_kernels/quantization_layer.cl | 108
-rw-r--r--  src/core/CL/cl_kernels/reduction_operation.cl | 460
-rw-r--r--  src/core/CL/cl_kernels/remap.cl | 286
-rw-r--r--  src/core/CL/cl_kernels/reorg_layer.cl | 116
-rw-r--r--  src/core/CL/cl_kernels/repeat.h | 42
-rw-r--r--  src/core/CL/cl_kernels/reshape_layer.cl | 70
-rw-r--r--  src/core/CL/cl_kernels/reverse.cl | 102
-rw-r--r--  src/core/CL/cl_kernels/roi_align_layer.cl | 200
-rw-r--r--  src/core/CL/cl_kernels/roi_align_layer_quantized.cl | 206
-rw-r--r--  src/core/CL/cl_kernels/scale.cl | 297
-rw-r--r--  src/core/CL/cl_kernels/scale_quantized.cl | 185
-rw-r--r--  src/core/CL/cl_kernels/slice_ops.cl | 135
-rw-r--r--  src/core/CL/cl_kernels/sobel_filter.cl | 541
-rw-r--r--  src/core/CL/cl_kernels/softmax_layer.cl | 531
-rw-r--r--  src/core/CL/cl_kernels/softmax_layer_quantized.cl | 530
-rw-r--r--  src/core/CL/cl_kernels/space_to_batch.cl | 280
-rw-r--r--  src/core/CL/cl_kernels/space_to_depth.cl | 111
-rw-r--r--  src/core/CL/cl_kernels/stack_layer.cl | 113
-rw-r--r--  src/core/CL/cl_kernels/tile.cl | 97
-rw-r--r--  src/core/CL/cl_kernels/tile_helpers.h | 1002
-rw-r--r--  src/core/CL/cl_kernels/transpose.cl | 240
-rw-r--r--  src/core/CL/cl_kernels/unpooling_layer.cl | 72
-rw-r--r--  src/core/CL/cl_kernels/upsample_layer.cl | 135
-rw-r--r--  src/core/CL/cl_kernels/warp_helpers.h | 65
-rw-r--r--  src/core/CL/cl_kernels/warp_helpers_quantized.h | 136
-rw-r--r--  src/core/CL/cl_kernels/winograd_filter_transform.cl | 1952
-rw-r--r--  src/core/CL/cl_kernels/winograd_input_transform.cl | 2233
-rw-r--r--  src/core/CL/cl_kernels/winograd_output_transform.cl | 2063
-rw-r--r--  src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp | 138
-rw-r--r--  src/core/CL/kernels/CLArgMinMaxLayerKernel.h | 41
-rw-r--r--  src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp | 129
-rw-r--r--  src/core/CL/kernels/CLBatchNormalizationLayerKernel.h | 35
-rw-r--r--  src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp | 122
-rw-r--r--  src/core/CL/kernels/CLBatchToSpaceLayerKernel.h | 36
-rw-r--r--  src/core/CL/kernels/CLBitwiseKernel.cpp | 28
-rw-r--r--  src/core/CL/kernels/CLBitwiseKernel.h | 6
-rw-r--r--  src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp | 46
-rw-r--r--  src/core/CL/kernels/CLBoundingBoxTransformKernel.h | 16
-rw-r--r--  src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp | 70
-rw-r--r--  src/core/CL/kernels/CLChannelShuffleLayerKernel.h | 5
-rw-r--r--  src/core/CL/kernels/CLCol2ImKernel.cpp | 172
-rw-r--r--  src/core/CL/kernels/CLCol2ImKernel.h | 106
-rw-r--r--  src/core/CL/kernels/CLComparisonKernel.cpp | 106
-rw-r--r--  src/core/CL/kernels/CLComparisonKernel.h | 21
-rw-r--r--  src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp | 29
-rw-r--r--  src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h | 5
-rw-r--r--  src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp | 89
-rw-r--r--  src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h | 23
-rw-r--r--  src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp | 33
-rw-r--r--  src/core/CL/kernels/CLDepthToSpaceLayerKernel.h | 4
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp | 432
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h | 131
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp | 238
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h | 110
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp | 418
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h | 114
-rw-r--r--  src/core/CL/kernels/CLFFTDigitReverseKernel.cpp | 47
-rw-r--r--  src/core/CL/kernels/CLFFTDigitReverseKernel.h | 18
-rw-r--r--  src/core/CL/kernels/CLFFTRadixStageKernel.cpp | 50
-rw-r--r--  src/core/CL/kernels/CLFFTRadixStageKernel.h | 9
-rw-r--r--  src/core/CL/kernels/CLFFTScaleKernel.cpp | 31
-rw-r--r--  src/core/CL/kernels/CLFFTScaleKernel.h | 9
-rw-r--r--  src/core/CL/kernels/CLFillBorderKernel.cpp | 63
-rw-r--r--  src/core/CL/kernels/CLFillBorderKernel.h | 18
-rw-r--r--  src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp | 133
-rw-r--r--  src/core/CL/kernels/CLFuseBatchNormalizationKernel.h | 41
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp | 334
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h | 108
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp | 297
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h | 125
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp | 573
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h | 155
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp | 214
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h | 116
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp | 273
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h | 136
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp | 152
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h | 89
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp | 159
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h | 101
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp | 157
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h | 102
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp | 220
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpReductionKernel.h | 176
-rw-r--r--  src/core/CL/kernels/CLGatherKernel.cpp | 48
-rw-r--r--  src/core/CL/kernels/CLGatherKernel.h | 10
-rw-r--r--  src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp | 37
-rw-r--r--  src/core/CL/kernels/CLGenerateProposalsLayerKernel.h | 7
-rw-r--r--  src/core/CL/kernels/CLIm2ColKernel.cpp | 427
-rw-r--r--  src/core/CL/kernels/CLIm2ColKernel.h | 136
-rw-r--r--  src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp | 59
-rw-r--r--  src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h | 16
-rw-r--r--  src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp | 49
-rw-r--r--  src/core/CL/kernels/CLL2NormalizeLayerKernel.h | 11
-rw-r--r--  src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp | 50
-rw-r--r--  src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h | 11
-rw-r--r--  src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp | 25
-rw-r--r--  src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h | 5
-rw-r--r--  src/core/CL/kernels/CLMinMaxLayerKernel.cpp | 168
-rw-r--r--  src/core/CL/kernels/CLMinMaxLayerKernel.h | 87
-rw-r--r--  src/core/CL/kernels/CLNormalizationLayerKernel.cpp | 103
-rw-r--r--  src/core/CL/kernels/CLNormalizationLayerKernel.h | 7
-rw-r--r--  src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp | 61
-rw-r--r--  src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h | 9
-rw-r--r--  src/core/CL/kernels/CLPadLayerKernel.cpp | 99
-rw-r--r--  src/core/CL/kernels/CLPadLayerKernel.h | 20
-rw-r--r--  src/core/CL/kernels/CLPriorBoxLayerKernel.cpp | 86
-rw-r--r--  src/core/CL/kernels/CLPriorBoxLayerKernel.h | 27
-rw-r--r--  src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp | 50
-rw-r--r--  src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h | 9
-rw-r--r--  src/core/CL/kernels/CLROIAlignLayerKernel.cpp | 56
-rw-r--r--  src/core/CL/kernels/CLROIAlignLayerKernel.h | 14
-rw-r--r--  src/core/CL/kernels/CLROIPoolingLayerKernel.cpp | 42
-rw-r--r--  src/core/CL/kernels/CLROIPoolingLayerKernel.h | 14
-rw-r--r--  src/core/CL/kernels/CLRangeKernel.cpp | 45
-rw-r--r--  src/core/CL/kernels/CLRangeKernel.h | 1
-rw-r--r--  src/core/CL/kernels/CLReductionOperationKernel.cpp | 208
-rw-r--r--  src/core/CL/kernels/CLReductionOperationKernel.h | 10
-rw-r--r--  src/core/CL/kernels/CLRemapKernel.cpp | 164
-rw-r--r--  src/core/CL/kernels/CLRemapKernel.h | 94
-rw-r--r--  src/core/CL/kernels/CLReorgLayerKernel.cpp | 45
-rw-r--r--  src/core/CL/kernels/CLReorgLayerKernel.h | 1
-rw-r--r--  src/core/CL/kernels/CLReverseKernel.cpp | 48
-rw-r--r--  src/core/CL/kernels/CLReverseKernel.h | 48
-rw-r--r--  src/core/CL/kernels/CLSelectKernel.cpp | 37
-rw-r--r--  src/core/CL/kernels/CLSelectKernel.h | 7
-rw-r--r--  src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp | 104
-rw-r--r--  src/core/CL/kernels/CLSpaceToBatchLayerKernel.h | 35
-rw-r--r--  src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp | 27
-rw-r--r--  src/core/CL/kernels/CLSpaceToDepthLayerKernel.h | 4
-rw-r--r--  src/core/CL/kernels/CLStackLayerKernel.cpp | 41
-rw-r--r--  src/core/CL/kernels/CLStackLayerKernel.h | 17
-rw-r--r--  src/core/CL/kernels/CLStridedSliceKernel.cpp | 109
-rw-r--r--  src/core/CL/kernels/CLStridedSliceKernel.h | 29
-rw-r--r--  src/core/CL/kernels/CLTileKernel.cpp | 47
-rw-r--r--  src/core/CL/kernels/CLTileKernel.h | 5
-rw-r--r--  src/core/CL/kernels/CLWeightsReshapeKernel.cpp | 166
-rw-r--r--  src/core/CL/kernels/CLWeightsReshapeKernel.h | 121
-rw-r--r--  src/core/CPP/CPPTypes.cpp | 89
-rw-r--r--  src/core/CPP/ICPPSimpleKernel.cpp | 75
-rw-r--r--  src/core/CPP/Validate.h | 47
-rw-r--r--  src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp | 176
-rw-r--r--  src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp | 110
-rw-r--r--  src/core/CPP/kernels/CPPPermuteKernel.cpp | 45
-rw-r--r--  src/core/CPP/kernels/CPPTopKVKernel.cpp | 43
-rw-r--r--  src/core/CPP/kernels/CPPUpsampleKernel.cpp | 17
-rw-r--r--  src/core/Error.cpp | 5
-rw-r--r--  src/core/GPUTarget.cpp | 146
-rw-r--r--  src/core/Helpers.cpp | 34
-rw-r--r--  src/core/IAccessWindow.cpp | 79
-rw-r--r--  src/core/IKernel.cpp | 3
-rw-r--r--  src/core/ITensor.cpp | 44
-rw-r--r--  src/core/ITensorPack.cpp | 9
-rw-r--r--  src/core/NEON/INESimpleKernel.h | 34
-rw-r--r--  src/core/NEON/NEAsymm.h | 323
-rw-r--r--  src/core/NEON/NEAsymm.inl | 60
-rw-r--r--  src/core/NEON/NEFixedPoint.inl | 8
-rw-r--r--  src/core/NEON/NEKernels.h | 32
-rw-r--r--  src/core/NEON/NEMath.h | 33
-rw-r--r--  src/core/NEON/NEMath.inl | 351
-rw-r--r--  src/core/NEON/NESymm.h | 95
-rw-r--r--  src/core/NEON/SVEAsymm.h | 81
-rw-r--r--  src/core/NEON/SVEAsymm.inl | 6
-rw-r--r--  src/core/NEON/SVEMath.h | 73
-rw-r--r--  src/core/NEON/SVEMath.inl | 256
-rw-r--r--  src/core/NEON/SVESymm.h | 31
-rw-r--r--  src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp | 309
-rw-r--r--  src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h | 60
-rw-r--r--  src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp | 159
-rw-r--r--  src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h | 28
-rw-r--r--  src/core/NEON/kernels/NEBitwiseAndKernel.cpp | 17
-rw-r--r--  src/core/NEON/kernels/NEBitwiseNotKernel.cpp | 14
-rw-r--r--  src/core/NEON/kernels/NEBitwiseOrKernel.cpp | 18
-rw-r--r--  src/core/NEON/kernels/NEBitwiseXorKernel.cpp | 18
-rw-r--r--  src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp | 224
-rw-r--r--  src/core/NEON/kernels/NEBoundingBoxTransformKernel.h | 13
-rw-r--r--  src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp | 101
-rw-r--r--  src/core/NEON/kernels/NECol2ImKernel.cpp | 151
-rw-r--r--  src/core/NEON/kernels/NECol2ImKernel.h | 6
-rw-r--r--  src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp | 138
-rw-r--r--  src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h | 78
-rw-r--r--  src/core/NEON/kernels/NECropKernel.cpp | 357
-rw-r--r--  src/core/NEON/kernels/NECropKernel.h | 23
-rw-r--r--  src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp | 160
-rw-r--r--  src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h | 20
-rw-r--r--  src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp | 149
-rw-r--r--  src/core/NEON/kernels/NEFFTDigitReverseKernel.h | 6
-rw-r--r--  src/core/NEON/kernels/NEFFTRadixStageKernel.cpp | 594
-rw-r--r--  src/core/NEON/kernels/NEFFTRadixStageKernel.h | 14
-rw-r--r--  src/core/NEON/kernels/NEFFTScaleKernel.cpp | 21
-rw-r--r--  src/core/NEON/kernels/NEFFTScaleKernel.h | 4
-rw-r--r--  src/core/NEON/kernels/NEFillBorderKernel.cpp | 225
-rw-r--r--  src/core/NEON/kernels/NEFillBorderKernel.h | 11
-rw-r--r--  src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp | 555
-rw-r--r--  src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h | 39
-rw-r--r--  src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 187
-rw-r--r--  src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h | 112
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp | 1052
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h | 92
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp | 413
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h | 105
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp | 959
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h | 135
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp | 320
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h | 114
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp | 234
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h | 118
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp | 243
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h | 121
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp | 245
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h | 121
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp | 382
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpReductionKernel.h | 196
-rw-r--r--  src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp | 164
-rw-r--r--  src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h | 98
-rw-r--r--  src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp | 1170
-rw-r--r--  src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h | 94
-rw-r--r--  src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 144
-rw-r--r--  src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h | 107
-rw-r--r--  src/core/NEON/kernels/NEGatherKernel.cpp | 183
-rw-r--r--  src/core/NEON/kernels/NEGatherKernel.h | 41
-rw-r--r--  src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp | 156
-rw-r--r--  src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h | 7
-rw-r--r--  src/core/NEON/kernels/NEIm2ColKernel.cpp | 460
-rw-r--r--  src/core/NEON/kernels/NEIm2ColKernel.h | 139
-rw-r--r--  src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp | 219
-rw-r--r--  src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h | 21
-rw-r--r--  src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp | 167
-rw-r--r--  src/core/NEON/kernels/NEL2NormalizeLayerKernel.h | 3
-rw-r--r--  src/core/NEON/kernels/NELogicalKernel.cpp | 114
-rw-r--r--  src/core/NEON/kernels/NELogicalKernel.h | 5
-rw-r--r--  src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp | 142
-rw-r--r--  src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h | 97
-rw-r--r--  src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp | 165
-rw-r--r--  src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h | 4
-rw-r--r--  src/core/NEON/kernels/NEMinMaxLayerKernel.cpp | 224
-rw-r--r--  src/core/NEON/kernels/NEMinMaxLayerKernel.h | 90
-rw-r--r--  src/core/NEON/kernels/NENormalizationLayerKernel.cpp | 178
-rw-r--r--  src/core/NEON/kernels/NENormalizationLayerKernel.h | 32
-rw-r--r--  src/core/NEON/kernels/NEPadLayerKernel.cpp | 115
-rw-r--r--  src/core/NEON/kernels/NEPadLayerKernel.h | 25
-rw-r--r--  src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp | 166
-rw-r--r--  src/core/NEON/kernels/NEPriorBoxLayerKernel.h | 14
-rw-r--r--  src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp | 123
-rw-r--r--  src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h | 33
-rw-r--r--  src/core/NEON/kernels/NEROIAlignLayerKernel.cpp | 407
-rw-r--r--  src/core/NEON/kernels/NEROIAlignLayerKernel.h | 10
-rw-r--r--  src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp | 85
-rw-r--r--  src/core/NEON/kernels/NEROIPoolingLayerKernel.h | 8
-rw-r--r--  src/core/NEON/kernels/NERangeKernel.cpp | 155
-rw-r--r--  src/core/NEON/kernels/NERangeKernel.h | 12
-rw-r--r--  src/core/NEON/kernels/NEReductionOperationKernel.cpp | 1668
-rw-r--r--  src/core/NEON/kernels/NEReductionOperationKernel.h | 23
-rw-r--r--  src/core/NEON/kernels/NERemapKernel.cpp | 326
-rw-r--r--  src/core/NEON/kernels/NERemapKernel.h | 86
-rw-r--r--  src/core/NEON/kernels/NEReorderKernel.cpp | 277
-rw-r--r--  src/core/NEON/kernels/NEReorderKernel.h | 102
-rw-r--r--  src/core/NEON/kernels/NEReorgLayerKernel.cpp | 56
-rw-r--r--  src/core/NEON/kernels/NEReorgLayerKernel.h | 10
-rw-r--r--  src/core/NEON/kernels/NEReverseKernel.cpp | 140
-rw-r--r--  src/core/NEON/kernels/NEReverseKernel.h | 28
-rw-r--r--  src/core/NEON/kernels/NESelectKernel.cpp | 246
-rw-r--r--  src/core/NEON/kernels/NESelectKernel.h | 24
-rw-r--r--  src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp | 161
-rw-r--r--  src/core/NEON/kernels/NESpaceToBatchLayerKernel.h | 20
-rw-r--r--  src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp | 59
-rw-r--r--  src/core/NEON/kernels/NESpaceToDepthLayerKernel.h | 1
-rw-r--r--  src/core/NEON/kernels/NEStackLayerKernel.cpp | 195
-rw-r--r--  src/core/NEON/kernels/NEStackLayerKernel.h | 58
-rw-r--r--  src/core/NEON/kernels/NEStridedSliceKernel.cpp | 206
-rw-r--r--  src/core/NEON/kernels/NEStridedSliceKernel.h | 23
-rw-r--r--  src/core/NEON/kernels/NETileKernel.cpp | 47
-rw-r--r--  src/core/NEON/kernels/NEWeightsReshapeKernel.cpp | 177
-rw-r--r--  src/core/NEON/kernels/NEWeightsReshapeKernel.h | 109
-rw-r--r--  src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp | 548
-rw-r--r--  src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h | 597
-rw-r--r--  src/core/NEON/kernels/arm_conv/addressing.cpp | 333
-rw-r--r--  src/core/NEON/kernels/arm_conv/addressing.hpp | 263
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp | 308
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp | 66
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp | 700
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp | 315
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp | 604
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp | 356
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp | 539
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp | 148
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp | 165
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp | 408
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp | 351
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp | 62
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp | 97
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp | 362
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp | 234
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp | 244
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp | 244
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp | 152
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp | 82
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp | 161
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp | 53
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp | 59
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp | 135
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp | 135
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 723
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 697
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 1158
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 1291
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 1736
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 2007
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 895
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 897
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 1387
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 1427
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp | 51
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp | 520
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp | 52
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 1044
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 527
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 513
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 828
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 905
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 1232
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 1397
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 615
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 629
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 991
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 1043
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp | 51
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp | 376
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp | 61
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp | 533
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp | 61
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp | 917
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp | 52
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 850
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 76
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 1658
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 1166
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 1397
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 73
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 2187
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp | 51
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp | 618
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp | 61
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp | 519
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp | 61
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp | 640
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp | 52
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 1480
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 76
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 1484
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 76
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 1658
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 1166
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 1397
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 2187
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp | 51
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp | 618
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp | 61
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp | 519
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp | 61
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp | 640
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp | 52
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 1480
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 1164
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 1395
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 2185
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 1166
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 73
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 1397
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 2187
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp | 51
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp | 618
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp | 52
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 1480
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 67
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 336
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 277
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 67
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 483
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 444
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 67
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 672
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 653
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 67
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 374
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 318
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 67
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 586
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 537
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 336
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 277
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 483
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 444
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 672
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 653
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 374
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 318
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp | 74
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp | 455
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp | 74
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp | 650
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp | 74
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp | 883
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp | 74
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp | 1172
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp | 74
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp | 560
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp | 74
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp | 763
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp | 74
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp | 1151
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp | 74
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp | 1246
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp | 664
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp | 881
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp | 1204
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp | 1354
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp | 664
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp | 881
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp | 1204
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp | 1354
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp | 664
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp | 881
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp | 1204
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp | 1354
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 316
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 296
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 460
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 477
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 656
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 714
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 337
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 337
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 523
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 551
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 316
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 296
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 460
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 477
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 656
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 714
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 337
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 337
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 523
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 551
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp | 51
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp | 166
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp | 61
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp | 259
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp | 61
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp | 392
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp | 52
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 454
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 76
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 497
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 62
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 410
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 62
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 451
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 62
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 652
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp | 61
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp | 339
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp | 61
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp | 402
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 76
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 436
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 76
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 497
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 62
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 410
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 62
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 451
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 62
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 652
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp | 61
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp | 339
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp | 61
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp | 402
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 62
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 410
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 62
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 451
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 62
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 652
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp | 84
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp | 461
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp | 299
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp | 31
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp | 295
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp | 16
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp | 409
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp | 31
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 179
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp | 16
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp | 399
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp | 31
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp | 295
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp | 16
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp | 361
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp | 31
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 179
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp | 16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp351
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp428
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp179
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp491
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp434
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp734
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp424
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp179
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp491
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp493
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp916
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp15
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp209
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp233
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp225
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp209
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp233
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp225
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp419
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp225
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp460
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp388
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp419
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp225
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp489
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp418
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp208
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp264
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp132
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp276
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp208
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp264
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp132
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp276
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp365
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp132
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp276
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp449
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp560
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp365
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp132
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp276
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp461
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp658
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp454
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp312
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp412
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp256
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp126
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp126
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp23
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp107
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp113
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp74
-rw-r--r--src/core/NEON/kernels/arm_gemm/asmlib.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/convolver.hpp88
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp200
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp79
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp123
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp306
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp29
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp347
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp56
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp104
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_int16.cpp3
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_int8.cpp119
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp486
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp566
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp173
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp150
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp142
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp3
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp89
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemv_batched.hpp18
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp120
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp171
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp198
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp198
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp235
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp289
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp231
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp207
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp289
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp326
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp319
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp386
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp326
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp319
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp386
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp279
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp181
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp241
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp168
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp399
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp452
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp452
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp361
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp416
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp416
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/list-sve.hpp53
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp1
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp154
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp185
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp160
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp207
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp218
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_fp16_fp16.hpp218
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp221
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp252
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp221
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp252
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp207
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp206
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp98
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp301
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp301
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp297
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp353
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp297
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp353
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp98
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp288
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp125
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_fp16_fp16.hpp125
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp124
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp150
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp124
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp150
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp124
-rw-r--r--src/core/NEON/kernels/arm_gemm/interleave-8way.cpp267
-rw-r--r--src/core/NEON/kernels/arm_gemm/interleave_indirect-sve.cpp155
-rw-r--r--src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp24
-rw-r--r--src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp291
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernel_traits.hpp53
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernel_weight_format.hpp60
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp109
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp3807
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp108
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp5429
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp108
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp3461
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp111
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp2561
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16.hpp111
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16/generic.cpp3240
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp101
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp269
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp111
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp314
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp100
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp264
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp100
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp332
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp30
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp35
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp32
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp39
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp82
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp1547
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp4
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp4
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp4
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp34
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp4706
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp102
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp3687
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp41
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp5800
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp7142
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp116
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp2805
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp2593
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp49
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp3656
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp4424
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp17
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp2796
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp2322
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp102
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp2424
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp102
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp3135
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp39
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp2742
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp2318
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp101
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp2098
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp39
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp5136
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp4782
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp101
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp3626
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp52
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp5182
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp4840
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp112
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp3449
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp39
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp2742
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp2318
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp101
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp2098
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp52
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp5182
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp4840
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp112
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp3449
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp53
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp513
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp330
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp75
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp274
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp652
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp73
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp274
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp619
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp73
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp274
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp649
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp42
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp1
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp1
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp86
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp553
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL/generic.cpp666
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp552
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp86
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp610
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp675
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp675
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp418
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp484
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp616
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL/generic.cpp341
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL/generic.cpp452
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL/generic.cpp506
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp416
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp482
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp614
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp406
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp453
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp505
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL/generic.cpp417
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL/generic.cpp448
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL/generic.cpp513
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp341
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp374
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp440
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp406
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp453
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp93
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp505
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp109
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp2227
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp116
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp1530
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp3318
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp116
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp1530
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp2310
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp111
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp1464
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp111
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp319
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp108
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp297
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp269
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp108
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp297
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp273
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp82
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp1372
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp38
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp3246
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp102
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp2044
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp50
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp1365
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp5022
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp50
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp1365
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp3246
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp31
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp1142
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp2121
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp102
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp1305
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp102
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp1792
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp36
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp2147
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp99
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp1417
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp36
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp3934
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp99
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp2430
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp63
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp1032
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp2838
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp112
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp1674
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp36
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp2147
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp99
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp1417
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp63
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp1032
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp2838
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp112
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp1674
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp57
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp513
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp69
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp628
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp68
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp273
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp500
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp68
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp273
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp513
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp4
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp81
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp274
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp513
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp67
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp628
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp81
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp274
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp513
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp67
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp628
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp88
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp18807
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp88
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp8971
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp88
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp8971
-rw-r--r--src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp41
-rw-r--r--src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp42
-rw-r--r--src/core/NEON/kernels/arm_gemm/mergeresults.cpp14
-rw-r--r--src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp10
-rw-r--r--src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp38
-rw-r--r--src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_bf16_8x12.hpp2809
-rw-r--r--src/core/NEON/kernels/arm_gemm/merges/list-fp16.hpp24
-rw-r--r--src/core/NEON/kernels/arm_gemm/merges/list-sve.hpp29
-rw-r--r--src/core/NEON/kernels/arm_gemm/merges/list.hpp10
-rw-r--r--src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp604
-rw-r--r--src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp604
-rw-r--r--src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_bf16_8x3VL.hpp2137
-rw-r--r--src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp344
-rw-r--r--src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp344
-rw-r--r--src/core/NEON/kernels/arm_gemm/misc-sve.cpp55
-rw-r--r--src/core/NEON/kernels/arm_gemm/misc.cpp48
-rw-r--r--src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp30
-rw-r--r--src/core/NEON/kernels/arm_gemm/quantized.cpp60
-rw-r--r--src/core/NEON/kernels/arm_gemm/quantized.hpp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp1310
-rw-r--r--src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp1310
-rw-r--r--src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp9
-rw-r--r--src/core/NEON/kernels/arm_gemm/std_transforms_fixed_trB.hpp87
-rw-r--r--src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp80
-rw-r--r--src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp9
-rw-r--r--src/core/NEON/kernels/arm_gemm/transform-sve.cpp38
-rw-r--r--src/core/NEON/kernels/arm_gemm/transform.cpp147
-rw-r--r--src/core/NEON/kernels/arm_gemm/transform.hpp90
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp14
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp288
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp431
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp335
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp343
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp444
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp734
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp274
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp274
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp145
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp120
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp141
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp331
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp291
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp245
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp510
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp446
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp271
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp786
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp294
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp294
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp130
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp507
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp451
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp244
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp319
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp337
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_2x4_fp32bf16.hpp346
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp254
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp147
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp268
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp61
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/list.hpp32
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp143
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp181
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp166
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp200
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp213
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp149
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp208
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp222
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp225
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp149
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp211
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp239
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp188
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp124
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp171
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp189
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp208
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp143
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp132
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp376
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp161
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp308
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_2VL_2x4_fp32bf16.hpp149
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp172
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp366
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp316
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp186
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp320
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp346
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp295
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp409
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp238
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp320
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp305
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp286
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp259
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp378
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp463
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp282
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp4
-rw-r--r--src/core/NEON/kernels/arm_gemm/utils.hpp129
-rw-r--r--src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h88
-rw-r--r--src/core/NEON/kernels/assembly/common.hpp34
-rw-r--r--src/core/NEON/kernels/assembly/depthwise.hpp351
-rw-r--r--src/core/NEON/kernels/assembly/depthwise_common.hpp146
-rw-r--r--src/core/NEON/kernels/assembly/pool_common.hpp85
-rw-r--r--src/core/NEON/kernels/assembly/pooling.hpp168
-rw-r--r--src/core/NEON/kernels/assembly/premultiply.hpp78
-rw-r--r--src/core/NEON/kernels/assembly/winograd.hpp265
-rw-r--r--src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp166
-rw-r--r--src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp166
-rw-r--r--src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp120
-rw-r--r--src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp120
-rw-r--r--src/core/NEON/kernels/batchnormalization/impl/list.h31
-rw-r--r--src/core/NEON/kernels/convolution/common/padding.cpp2
-rw-r--r--src/core/NEON/kernels/convolution/common/padding.hpp28
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise.hpp551
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp1168
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp2809
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp2341
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp769
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp6018
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp42
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp156
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp144
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp34
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp31
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp102
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp32
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp31
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp291
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp88
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_base.hpp505
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp295
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp439
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp438
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp511
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp457
-rw-r--r--src/core/NEON/kernels/convolution/winograd/input_transform.hpp381
-rw-r--r--src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp280
-rw-r--r--src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp32_6x6.cpp1140
-rw-r--r--src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_1x8.cpp155
-rw-r--r--src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_4x4.cpp251
-rw-r--r--src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_6x6.cpp202
-rw-r--r--src/core/NEON/kernels/convolution/winograd/input_transforms/sme_fp32_mla_6x6.cpp363
-rw-r--r--src/core/NEON/kernels/convolution/winograd/input_transforms/sve_fp32_6x6.cpp361
-rw-r--r--src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp56
-rw-r--r--src/core/NEON/kernels/convolution/winograd/input_transforms_fp32.cpp77
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transform.hpp302
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp260
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x2_1x7.cpp134
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x4_1x5.cpp145
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x6_1x3.cpp149
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_3x3.cpp220
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_5x5.cpp212
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_4x4_3x3.cpp242
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms/sme_fp32_mopa_4x4_3x3.cpp891
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp55
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transforms_fp32.cpp76
-rw-r--r--src/core/NEON/kernels/convolution/winograd/padding.cpp192
-rw-r--r--src/core/NEON/kernels/convolution/winograd/weight_transform.hpp145
-rw-r--r--src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp242
-rw-r--r--src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_3x3.cpp200
-rw-r--r--src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_5x5.cpp381
-rw-r--r--src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_4x4_3x3.cpp236
-rw-r--r--src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x2_1x7.cpp71
-rw-r--r--src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x4_1x5.cpp77
-rw-r--r--src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x6_1x3.cpp71
-rw-r--r--src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp54
-rw-r--r--src/core/NEON/kernels/convolution/winograd/weight_transforms_fp32.cpp74
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd.cpp182
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd.hpp621
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_fp16.cpp45
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_fp32.cpp41
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp342
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp207
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp268
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp158
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp257
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp255
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp277
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp1308
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp78
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp252
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp143
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp231
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp225
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp152
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp255
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp254
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp155
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp90
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp220
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp401
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp90
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp259
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp257
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp90
-rw-r--r--src/core/NEON/kernels/detail/NEActivationFunctionDetail.h10
-rw-r--r--src/core/NEON/kernels/detail/NEColorConvertHelper.inl735
-rw-r--r--src/core/NEON/kernels/detail/NEDirectConvolution3x3.h80
-rw-r--r--src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h511
-rw-r--r--src/core/NEON/wrapper/intrinsics/cvt.h61
-rw-r--r--src/core/NEON/wrapper/intrinsics/div.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/erf.h52
-rw-r--r--src/core/NEON/wrapper/intrinsics/exp.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/getlane.h14
-rw-r--r--src/core/NEON/wrapper/intrinsics/intrinsics.h4
-rw-r--r--src/core/NEON/wrapper/intrinsics/inv.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/invsqrt.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/log.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/max.h41
-rw-r--r--src/core/NEON/wrapper/intrinsics/pow.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/qmov.h6
-rw-r--r--src/core/NEON/wrapper/intrinsics/reinterpret.h2
-rw-r--r--src/core/NEON/wrapper/intrinsics/round.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/setlane.h12
-rw-r--r--src/core/NEON/wrapper/intrinsics/shr.h148
-rw-r--r--src/core/NEON/wrapper/intrinsics/sin.h3
-rw-r--r--src/core/NEON/wrapper/intrinsics/store.h6
-rw-r--r--src/core/NEON/wrapper/intrinsics/sub.h17
-rw-r--r--src/core/NEON/wrapper/intrinsics/svcnt.h4
-rw-r--r--src/core/NEON/wrapper/intrinsics/svcvt.h35
-rw-r--r--src/core/NEON/wrapper/intrinsics/svdup_n.h6
-rw-r--r--src/core/NEON/wrapper/intrinsics/svexp.h3
-rw-r--r--src/core/NEON/wrapper/intrinsics/svlog.h3
-rw-r--r--src/core/NEON/wrapper/intrinsics/svptrue.h4
-rw-r--r--src/core/NEON/wrapper/intrinsics/svwhilelt.h4
-rw-r--r--src/core/NEON/wrapper/intrinsics/tanh.h1
-rw-r--r--src/core/NEON/wrapper/scalar/add.h12
-rw-r--r--src/core/NEON/wrapper/scalar/sub.h12
-rw-r--r--src/core/NEON/wrapper/svtraits.h10
-rw-r--r--src/core/NEON/wrapper/traits.h25
-rw-r--r--src/core/NEON/wrapper/wrapper.h10
-rw-r--r--src/core/Rounding.cpp7
-rw-r--r--src/core/Size2D.cpp3
-rw-r--r--src/core/Size3D.cpp35
-rw-r--r--src/core/SubTensorInfo.cpp52
-rw-r--r--src/core/TensorInfo.cpp131
-rw-r--r--src/core/Utils.cpp438
-rw-r--r--src/core/Validate.cpp115
-rw-r--r--src/core/common/Macros.h4
-rw-r--r--src/core/common/Registrars.h162
-rw-r--r--src/core/cpu/ICpuKernel.h36
-rw-r--r--src/core/cpu/kernels/CpuActivationKernel.cpp257
-rw-r--r--src/core/cpu/kernels/CpuActivationKernel.h73
-rw-r--r--src/core/cpu/kernels/CpuAddKernel.cpp342
-rw-r--r--src/core/cpu/kernels/CpuAddKernel.h85
-rw-r--r--src/core/cpu/kernels/CpuCastKernel.cpp1367
-rw-r--r--src/core/cpu/kernels/CpuCastKernel.h82
-rw-r--r--src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp216
-rw-r--r--src/core/cpu/kernels/CpuConcatenateBatchKernel.h78
-rw-r--r--src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp212
-rw-r--r--src/core/cpu/kernels/CpuConcatenateDepthKernel.h83
-rw-r--r--src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp183
-rw-r--r--src/core/cpu/kernels/CpuConcatenateHeightKernel.h72
-rw-r--r--src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp180
-rw-r--r--src/core/cpu/kernels/CpuConcatenateWidthKernel.h72
-rw-r--r--src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp137
-rw-r--r--src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h89
-rw-r--r--src/core/cpu/kernels/CpuCopyKernel.cpp166
-rw-r--r--src/core/cpu/kernels/CpuCopyKernel.h69
-rw-r--r--src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp919
-rw-r--r--src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h109
-rw-r--r--src/core/cpu/kernels/CpuDequantizeKernel.cpp400
-rw-r--r--src/core/cpu/kernels/CpuDequantizeKernel.h64
-rw-r--r--src/core/cpu/kernels/CpuDirectConv2dKernel.cpp1385
-rw-r--r--src/core/cpu/kernels/CpuDirectConv2dKernel.h93
-rw-r--r--src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp513
-rw-r--r--src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h87
-rw-r--r--src/core/cpu/kernels/CpuElementwiseKernel.cpp354
-rw-r--r--src/core/cpu/kernels/CpuElementwiseKernel.h239
-rw-r--r--src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp174
-rw-r--r--src/core/cpu/kernels/CpuElementwiseUnaryKernel.h90
-rw-r--r--src/core/cpu/kernels/CpuFillKernel.cpp90
-rw-r--r--src/core/cpu/kernels/CpuFillKernel.h60
-rw-r--r--src/core/cpu/kernels/CpuFloorKernel.cpp175
-rw-r--r--src/core/cpu/kernels/CpuFloorKernel.h72
-rw-r--r--src/core/cpu/kernels/CpuMulKernel.cpp1729
-rw-r--r--src/core/cpu/kernels/CpuMulKernel.h150
-rw-r--r--src/core/cpu/kernels/CpuPermuteKernel.cpp301
-rw-r--r--src/core/cpu/kernels/CpuPermuteKernel.h73
-rw-r--r--src/core/cpu/kernels/CpuPool2dKernel.cpp514
-rw-r--r--src/core/cpu/kernels/CpuPool2dKernel.h78
-rw-r--r--src/core/cpu/kernels/CpuQuantizeKernel.cpp266
-rw-r--r--src/core/cpu/kernels/CpuQuantizeKernel.h90
-rw-r--r--src/core/cpu/kernels/CpuReshapeKernel.cpp140
-rw-r--r--src/core/cpu/kernels/CpuReshapeKernel.h65
-rw-r--r--src/core/cpu/kernels/CpuScaleKernel.cpp621
-rw-r--r--src/core/cpu/kernels/CpuScaleKernel.h111
-rw-r--r--src/core/cpu/kernels/CpuSoftmaxKernel.cpp389
-rw-r--r--src/core/cpu/kernels/CpuSoftmaxKernel.h107
-rw-r--r--src/core/cpu/kernels/CpuSubKernel.cpp246
-rw-r--r--src/core/cpu/kernels/CpuSubKernel.h98
-rw-r--r--src/core/cpu/kernels/CpuTransposeKernel.cpp510
-rw-r--r--src/core/cpu/kernels/CpuTransposeKernel.h64
-rw-r--r--src/core/cpu/kernels/activation/list.h49
-rw-r--r--src/core/cpu/kernels/activation/neon/fp16.cpp217
-rw-r--r--src/core/cpu/kernels/activation/neon/fp32.cpp212
-rw-r--r--src/core/cpu/kernels/activation/neon/qasymm8.cpp262
-rw-r--r--src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp261
-rw-r--r--src/core/cpu/kernels/activation/neon/qsymm16.cpp138
-rw-r--r--src/core/cpu/kernels/activation/sve/fp16.cpp130
-rw-r--r--src/core/cpu/kernels/activation/sve/fp32.cpp131
-rw-r--r--src/core/cpu/kernels/activation/sve/qasymm8.cpp254
-rw-r--r--src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp253
-rw-r--r--src/core/cpu/kernels/activation/sve/qsymm16.cpp120
-rw-r--r--src/core/cpu/kernels/add/neon/integer.cpp170
-rw-r--r--src/core/cpu/kernels/add/neon/list.h146
-rw-r--r--src/core/cpu/kernels/add/neon/qasymm8.cpp209
-rw-r--r--src/core/cpu/kernels/add/neon/qasymm8_signed.cpp208
-rw-r--r--src/core/cpu/kernels/add/neon/qsymm16.cpp174
-rw-r--r--src/core/cpu/kernels/add/sve/impl.cpp139
-rw-r--r--src/core/cpu/kernels/add/sve/impl.h40
-rw-r--r--src/core/cpu/kernels/add/sve/integer.cpp201
-rw-r--r--src/core/cpu/kernels/add/sve/list.h54
-rw-r--r--src/core/cpu/kernels/add/sve/qasymm8.cpp182
-rw-r--r--src/core/cpu/kernels/add/sve/qasymm8_signed.cpp181
-rw-r--r--src/core/cpu/kernels/add/sve/qsymm16.cpp156
-rw-r--r--src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h126
-rw-r--r--src/core/cpu/kernels/assembly/arm_gemm.hpp190
-rw-r--r--src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp130
-rw-r--r--src/core/cpu/kernels/assembly/arm_gemm_local.hpp31
-rw-r--r--src/core/cpu/kernels/assembly/convolution_parameters.hpp65
-rw-r--r--src/core/cpu/kernels/assembly/gemm_common.hpp229
-rw-r--r--src/core/cpu/kernels/assembly/ndrange.hpp199
-rw-r--r--src/core/cpu/kernels/elementwise/neon/elementwise_list.h486
-rw-r--r--src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h654
-rw-r--r--src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h116
-rw-r--r--src/core/cpu/kernels/elementwise/sve/elementwise.cpp311
-rw-r--r--src/core/cpu/kernels/elementwise/sve/elementwise_list.h171
-rw-r--r--src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h366
-rw-r--r--src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp113
-rw-r--r--src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h39
-rw-r--r--src/core/cpu/kernels/floor/list.h41
-rw-r--r--src/core/cpu/kernels/floor/neon/fp16.cpp64
-rw-r--r--src/core/cpu/kernels/floor/neon/fp32.cpp61
-rw-r--r--src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp276
-rw-r--r--src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h119
-rw-r--r--src/core/cpu/kernels/pooling/neon/fp16.cpp317
-rw-r--r--src/core/cpu/kernels/pooling/neon/fp32.cpp314
-rw-r--r--src/core/cpu/kernels/pooling/neon/list.h97
-rw-r--r--src/core/cpu/kernels/pooling/neon/nchw/all.cpp700
-rw-r--r--src/core/cpu/kernels/pooling/neon/qasymm8.cpp41
-rw-r--r--src/core/cpu/kernels/pooling/neon/qasymm8_signed.cpp41
-rw-r--r--src/core/cpu/kernels/pooling/neon/quantized.h863
-rw-r--r--src/core/cpu/kernels/scale/neon/fp16.cpp174
-rw-r--r--src/core/cpu/kernels/scale/neon/integer.cpp293
-rw-r--r--src/core/cpu/kernels/scale/neon/list.h185
-rw-r--r--src/core/cpu/kernels/scale/neon/qasymm8.cpp145
-rw-r--r--src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp145
-rw-r--r--src/core/cpu/kernels/scale/sve/fp16.cpp176
-rw-r--r--src/core/cpu/kernels/scale/sve/fp32.cpp174
-rw-r--r--src/core/cpu/kernels/scale/sve/integer.cpp300
-rw-r--r--src/core/cpu/kernels/scale/sve/list.h47
-rw-r--r--src/core/cpu/kernels/scale/sve/qasymm8.cpp207
-rw-r--r--src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp207
-rw-r--r--src/core/cpu/kernels/softmax/impl/neon/list.h388
-rw-r--r--src/core/cpu/kernels/softmax/impl/sve/impl.cpp185
-rw-r--r--src/core/cpu/kernels/softmax/impl/sve/list.h223
-rw-r--r--src/core/cpu/kernels/sub/neon/integer.cpp183
-rw-r--r--src/core/cpu/kernels/sub/neon/list.h162
-rw-r--r--src/core/cpu/kernels/sub/neon/qasymm8.cpp230
-rw-r--r--src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp229
-rw-r--r--src/core/cpu/kernels/sub/neon/qsymm16.cpp201
-rw-r--r--src/core/gpu/cl/ClCompileContext.h36
-rw-r--r--src/core/gpu/cl/ClKernelLibrary.cpp943
-rw-r--r--src/core/gpu/cl/ClKernelLibrary.h95
-rw-r--r--src/core/gpu/cl/IClKernel.h37
-rw-r--r--src/core/gpu/cl/kernels/ClActivationKernel.cpp255
-rw-r--r--src/core/gpu/cl/kernels/ClActivationKernel.h74
-rw-r--r--src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp153
-rw-r--r--src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h77
-rw-r--r--src/core/gpu/cl/kernels/ClCastKernel.cpp163
-rw-r--r--src/core/gpu/cl/kernels/ClCastKernel.h79
-rw-r--r--src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp119
-rw-r--r--src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h76
-rw-r--r--src/core/gpu/cl/kernels/ClCopyKernel.cpp170
-rw-r--r--src/core/gpu/cl/kernels/ClCopyKernel.h71
-rw-r--r--src/core/gpu/cl/kernels/ClCropKernel.cpp136
-rw-r--r--src/core/gpu/cl/kernels/ClCropKernel.h98
-rw-r--r--src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp138
-rw-r--r--src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h77
-rw-r--r--src/core/gpu/cl/kernels/ClDequantizeKernel.cpp153
-rw-r--r--src/core/gpu/cl/kernels/ClDequantizeKernel.h65
-rw-r--r--src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp667
-rw-r--r--src/core/gpu/cl/kernels/ClDirectConv2dKernel.h89
-rw-r--r--src/core/gpu/cl/kernels/ClElementwiseKernel.cpp536
-rw-r--r--src/core/gpu/cl/kernels/ClElementwiseKernel.h219
-rw-r--r--src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp163
-rw-r--r--src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h67
-rw-r--r--src/core/gpu/cl/kernels/ClFillKernel.cpp122
-rw-r--r--src/core/gpu/cl/kernels/ClFillKernel.h77
-rw-r--r--src/core/gpu/cl/kernels/ClFloorKernel.cpp119
-rw-r--r--src/core/gpu/cl/kernels/ClFloorKernel.h66
-rw-r--r--src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp533
-rw-r--r--src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h88
-rw-r--r--src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp411
-rw-r--r--src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h88
-rw-r--r--src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp416
-rw-r--r--src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h113
-rw-r--r--src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp438
-rw-r--r--src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h104
-rw-r--r--src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp219
-rw-r--r--src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h78
-rw-r--r--src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp170
-rw-r--r--src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h84
-rw-r--r--src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp131
-rw-r--r--src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h74
-rw-r--r--src/core/gpu/cl/kernels/ClMulKernel.cpp403
-rw-r--r--src/core/gpu/cl/kernels/ClMulKernel.h116
-rw-r--r--src/core/gpu/cl/kernels/ClPermuteKernel.cpp147
-rw-r--r--src/core/gpu/cl/kernels/ClPermuteKernel.h78
-rw-r--r--src/core/gpu/cl/kernels/ClPool2dKernel.cpp509
-rw-r--r--src/core/gpu/cl/kernels/ClPool2dKernel.h76
-rw-r--r--src/core/gpu/cl/kernels/ClQuantizeKernel.cpp175
-rw-r--r--src/core/gpu/cl/kernels/ClQuantizeKernel.h70
-rw-r--r--src/core/gpu/cl/kernels/ClReshapeKernel.cpp129
-rw-r--r--src/core/gpu/cl/kernels/ClReshapeKernel.h66
-rw-r--r--src/core/gpu/cl/kernels/ClScaleKernel.cpp266
-rw-r--r--src/core/gpu/cl/kernels/ClScaleKernel.h79
-rw-r--r--src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp355
-rw-r--r--src/core/gpu/cl/kernels/ClSoftmaxKernel.h126
-rw-r--r--src/core/gpu/cl/kernels/ClTransposeKernel.cpp119
-rw-r--r--src/core/gpu/cl/kernels/ClTransposeKernel.h65
-rw-r--r--src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp154
-rw-r--r--src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h70
-rw-r--r--src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp184
-rw-r--r--src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h75
-rw-r--r--src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp126
-rw-r--r--src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h71
-rw-r--r--src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp151
-rw-r--r--src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h78
-rw-r--r--src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp278
-rw-r--r--src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h88
-rw-r--r--src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp263
-rw-r--r--src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h87
-rw-r--r--src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp116
-rw-r--r--src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h95
-rw-r--r--src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h123
-rw-r--r--src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp246
-rw-r--r--src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h62
-rw-r--r--src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp73
-rw-r--r--src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h57
-rw-r--r--src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp168
-rw-r--r--src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h59
-rw-r--r--src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h71
-rw-r--r--src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp356
-rw-r--r--src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h64
-rw-r--r--src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp538
-rw-r--r--src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h61
-rw-r--r--src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h69
-rw-r--r--src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp518
-rw-r--r--src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h67
-rw-r--r--src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp570
-rw-r--r--src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h61
-rw-r--r--src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultReshapedRhsOnlyBifrost.cpp518
-rw-r--r--src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultReshapedRhsOnlyValhall.cpp570
-rw-r--r--src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h69
-rw-r--r--src/core/helpers/AutoConfiguration.h39
-rw-r--r--src/core/helpers/LUTManager.cpp79
-rw-r--r--src/core/helpers/LUTManager.h73
-rw-r--r--src/core/helpers/MemoryHelpers.h74
-rw-r--r--src/core/helpers/PoolingHelpers.h219
-rw-r--r--src/core/helpers/ScaleHelpers.h23
-rw-r--r--src/core/helpers/SoftmaxHelpers.cpp2
-rw-r--r--src/core/helpers/Utils.cpp49
-rw-r--r--src/core/helpers/Utils.h35
-rw-r--r--src/core/helpers/WindowHelpers.cpp239
-rw-r--r--src/core/helpers/WindowHelpers.h86
-rw-r--r--src/core/utils/ActivationFunctionUtils.cpp55
-rw-r--r--src/core/utils/AssemblyUtils.cpp310
-rw-r--r--src/core/utils/AssemblyUtils.h71
-rw-r--r--src/core/utils/DataLayoutUtils.cpp41
-rw-r--r--src/core/utils/DataTypeUtils.cpp82
-rw-r--r--src/core/utils/FormatUtils.cpp46
-rw-r--r--src/core/utils/InterpolationPolicyUtils.cpp41
-rw-r--r--src/core/utils/Math.cpp546
-rw-r--r--src/core/utils/Math.h49
-rw-r--r--src/core/utils/ScaleUtils.cpp37
-rw-r--r--src/core/utils/ScaleUtils.h22
-rw-r--r--src/core/utils/StringUtils.cpp75
-rw-r--r--src/core/utils/helpers/bit_ops.h4
-rw-r--r--src/core/utils/helpers/fft.cpp19
-rw-r--r--src/core/utils/helpers/float_ops.h15
-rw-r--r--src/core/utils/helpers/tensor_info.h14
-rw-r--r--src/core/utils/helpers/tensor_transform.cpp70
-rw-r--r--src/core/utils/io/FileHandler.cpp7
-rw-r--r--src/core/utils/logging/FilePrinter.cpp5
-rw-r--r--src/core/utils/logging/Helpers.cpp13
-rw-r--r--src/core/utils/logging/Logger.cpp17
-rw-r--r--src/core/utils/logging/LoggerRegistry.cpp18
-rw-r--r--src/core/utils/misc/MMappedFile.cpp30
-rw-r--r--src/core/utils/quantization/AsymmHelpers.cpp140
-rw-r--r--src/core/utils/quantization/AsymmHelpers.h52
1666 files changed, 386445 insertions, 242407 deletions
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
index ca2f7d238f..52be6990ab 100644
--- a/src/core/AccessWindowAutoPadding.cpp
+++ b/src/core/AccessWindowAutoPadding.cpp
@@ -28,12 +28,14 @@
using namespace arm_compute;
-AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info)
- : _info(info)
+AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info) : _info(info)
{
}
-ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const
{
ARM_COMPUTE_UNUSED(window);
ARM_COMPUTE_UNUSED(input_valid_region);
@@ -45,17 +47,17 @@ ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window,
ValidRegion AccessWindowAutoPadding::compute_valid_region() const
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return ValidRegion{};
}
- return ValidRegion{ Coordinates(), _info->tensor_shape() };
+ return ValidRegion{Coordinates(), _info->tensor_shape()};
}
void AccessWindowAutoPadding::set_valid_region()
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return;
}
@@ -75,7 +77,7 @@ bool AccessWindowAutoPadding::update_padding_if_needed(const Window &window)
ARM_COMPUTE_UNUSED(window);
// Only update the padding if the tensor allows it
- if(_info == nullptr || !_info->is_resizable())
+ if (_info == nullptr || !_info->is_resizable())
{
return false;
}
diff --git a/src/core/AccessWindowAutoPadding.h b/src/core/AccessWindowAutoPadding.h
index b8d1508679..406bdba0d8 100644
--- a/src/core/AccessWindowAutoPadding.h
+++ b/src/core/AccessWindowAutoPadding.h
@@ -74,9 +74,12 @@ public:
ValidRegion compute_valid_region() const;
// Inherited methods overridden:
- bool update_window_if_needed(Window &window) const override;
- bool update_padding_if_needed(const Window &window) override;
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+ bool update_window_if_needed(Window &window) const override;
+ bool update_padding_if_needed(const Window &window) override;
+ ValidRegion compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const override;
private:
ITensorInfo *_info;
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index 0607011bc5..98182b1202 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp
@@ -34,7 +34,10 @@ AccessWindowStatic::AccessWindowStatic(ITensorInfo *info, int start_x, int start
{
}
-ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowStatic::compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const
{
ARM_COMPUTE_UNUSED(border_undefined);
ARM_COMPUTE_UNUSED(border_size);
@@ -44,7 +47,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid
ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region) const
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return input_valid_region;
}
@@ -57,7 +60,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid
// Start of the valid region is equal to the start of the static access but
// never outside of the tensor.
anchor.set(0, std::max<int>(0, _start_x));
- if(_info->num_dimensions() > 1)
+ if (_info->num_dimensions() > 1)
{
anchor.set(1, std::max<int>(0, _start_y));
}
@@ -65,7 +68,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid
// End of the valid region is equal to the end of the static access but
// never outside of the tensor.
shape.set(0, std::min<int>(_end_x, _info->tensor_shape()[0]));
- if(_info->num_dimensions() > 1)
+ if (_info->num_dimensions() > 1)
{
shape.set(1, std::min<int>(_end_y, _info->tensor_shape()[1]));
}
@@ -75,7 +78,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid
void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegion &input_valid_region)
{
- if(_info != nullptr)
+ if (_info != nullptr)
{
_info->set_valid_region(compute_valid_region(window, input_valid_region));
}
@@ -84,7 +87,7 @@ void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegio
bool AccessWindowStatic::update_window_if_needed(Window &window) const
{
// If the padding is not enough and the tensor is not resizable, shrink the window to size 0
- if(_info == nullptr || _info->is_resizable())
+ if (_info == nullptr || _info->is_resizable())
{
return false;
}
@@ -96,48 +99,50 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const
bool window_modified = false;
// Calculate if padding is enough
- if(_start_y < 0)
+ if (_start_y < 0)
{
const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
- if(_start_y < front_pad_y_available)
+ if (_start_y < front_pad_y_available)
{
window_modified = true;
}
}
- if(!window_modified)
+ if (!window_modified)
{
- if(_end_y > static_cast<int>(shape[1]))
+ if (_end_y > static_cast<int>(shape[1]))
{
const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
const int tail_pad_y_available = (stride_z / strides[1]) - shape[1];
- if(static_cast<int>(shape[1]) + tail_pad_y_available < _end_y)
+ if (static_cast<int>(shape[1]) + tail_pad_y_available < _end_y)
{
window_modified = true;
}
}
- if(!window_modified)
+ if (!window_modified)
{
const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
- if(_start_x < 0)
+ if (_start_x < 0)
{
- const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element), stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+ const int front_pad_x_available =
+ -std::min<int>(static_cast<int>(offset_first_element), stride_y - shape[0] * strides[0]) /
+ static_cast<int>(strides[0]);
- if(_start_x < front_pad_x_available)
+ if (_start_x < front_pad_x_available)
{
window_modified = true;
}
}
- if(!window_modified && _end_x > static_cast<int>(shape[0]))
+ if (!window_modified && _end_x > static_cast<int>(shape[0]))
{
const int tail_pad_x_available = (stride_y / strides[0]) - shape[0];
- if(static_cast<int>(shape[0]) + tail_pad_x_available < _end_x)
+ if (static_cast<int>(shape[0]) + tail_pad_x_available < _end_x)
{
window_modified = true;
}
@@ -146,9 +151,9 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const
}
// If padding is not enough
- if(window_modified)
+ if (window_modified)
{
- for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
{
window.set(i, Window::Dimension(0, 0, 1));
}
@@ -162,7 +167,7 @@ bool AccessWindowStatic::update_padding_if_needed(const Window &window)
ARM_COMPUTE_UNUSED(window);
// Only update the padding if the tensor allows it
- if(_info == nullptr || !_info->is_resizable())
+ if (_info == nullptr || !_info->is_resizable())
{
return false;
}
diff --git a/src/core/AccessWindowStatic.h b/src/core/AccessWindowStatic.h
index f7d43cbb55..5c6d2c7db0 100644
--- a/src/core/AccessWindowStatic.h
+++ b/src/core/AccessWindowStatic.h
@@ -86,9 +86,12 @@ public:
ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region) const;
// Inherited methods overridden:
- bool update_window_if_needed(Window &window) const override;
- bool update_padding_if_needed(const Window &window) override;
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+ bool update_window_if_needed(Window &window) const override;
+ bool update_padding_if_needed(const Window &window) override;
+ ValidRegion compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const override;
private:
ITensorInfo *_info;
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index d8bd4c4de1..42f0081c14 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -29,9 +29,12 @@
using namespace arm_compute;
-ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return input_valid_region;
}
@@ -41,7 +44,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va
Coordinates old_anchor(anchor);
TensorShape old_shape(shape);
- if(!border_undefined)
+ if (!border_undefined)
{
border_size = BorderSize(0);
}
@@ -53,7 +56,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va
// the kernel to write back output values.
// As the relation between input and output is transposed window.y() is
// used for x anchor and window.x() for y anchor.
- if(_info->dimension(0) > 1)
+ if (_info->dimension(0) > 1)
{
anchor.set(0, std::max<int>(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x);
}
@@ -69,15 +72,19 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va
// a size of the region.
// As the relation between input and output is transposed window.y() is
// used for x shape and window.x() for y shape.
- if(_info->dimension(0) > 1)
+ if (_info->dimension(0) > 1)
{
- shape.set(0, std::min<int>((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
+ shape.set(0, std::min<int>((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right,
+ (window.y().end() - window.y().step()) * _scale_x + _width) -
+ anchor[0]);
}
- shape.set(1, std::min<int>((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
+ shape.set(1, std::min<int>((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom,
+ (window.x().end() - window.x().step()) * _scale_y + _height) -
+ anchor[1]);
// For higher dimensions use the intersection of the window size and the
// valid region of the input
- for(size_t d = 2; d < _info->num_dimensions(); ++d)
+ for (size_t d = 2; d < _info->num_dimensions(); ++d)
{
anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
@@ -89,7 +96,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va
bool AccessWindowTranspose::update_window_if_needed(Window &window) const
{
// Only update the window size if we can't use padding
- if(_info == nullptr || _info->is_resizable())
+ if (_info == nullptr || _info->is_resizable())
{
return false;
}
@@ -107,12 +114,12 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
const int max_y = window.x().end() * _scale_y + _y;
// Adjust window start for output's Y dimension (so X in (input) window)
- if(min_y < 0)
+ if (min_y < 0)
{
// Calculate rows available above the tensor
const int front_pad_y_available = -offset_first_element / strides[1];
- if(min_y < front_pad_y_available)
+ if (min_y < front_pad_y_available)
{
// Not enough padding available, need to shrink the window
const int start = adjust_up(min_y, front_pad_y_available, window.x().step() * _scale_y) - _y;
@@ -126,17 +133,18 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
}
// Adjust window end for Y dimension
- if(max_y > static_cast<int>(shape[1]))
+ if (max_y > static_cast<int>(shape[1]))
{
const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
// Calculate rows available below the tensor
const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
- if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
+ if (static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
{
// Not enough padding available, need to shrink the window
- const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) + window.x().step() * _scale_y - _y - _height;
+ const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) +
+ window.x().step() * _scale_y - _y - _height;
window.set(0, Window::Dimension(window.x().start(), end / _scale_y, window.x().step()));
window_modified = true;
}
@@ -151,11 +159,14 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
// Adjust window start for X dimension
- if(min_x < 0)
+ if (min_x < 0)
{
- const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+ const int front_pad_x_available =
+ -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1],
+ stride_y - shape[0] * strides[0]) /
+ static_cast<int>(strides[0]);
- if(min_x < front_pad_x_available)
+ if (min_x < front_pad_x_available)
{
// Not enough padding available, need to shrink the window
const int start = adjust_up(min_x, front_pad_x_available, window.y().step() * _scale_x) - _x;
@@ -168,14 +179,15 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
}
// Adjust window end for X dimension
- if(max_x > static_cast<int>(shape[0]))
+ if (max_x > static_cast<int>(shape[0]))
{
const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
- if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
+ if (static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
{
// Not enough padding available, need to shrink the window
- const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) + window.y().step() * _scale_x - _x - _width;
+ const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) +
+ window.y().step() * _scale_x - _x - _width;
window.set(1, Window::Dimension(window.y().start(), end / _scale_x, window.y().step()));
window_modified = true;
}
@@ -189,7 +201,7 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
bool AccessWindowTranspose::update_padding_if_needed(const Window &window)
{
// Only update the padding if the tensor allows it
- if(_info == nullptr || !_info->is_resizable())
+ if (_info == nullptr || !_info->is_resizable())
{
return false;
}
diff --git a/src/core/AccessWindowTranspose.h b/src/core/AccessWindowTranspose.h
index 0306076d6e..12bb9a535b 100644
--- a/src/core/AccessWindowTranspose.h
+++ b/src/core/AccessWindowTranspose.h
@@ -42,7 +42,10 @@ public:
bool update_window_if_needed(Window &window) const override;
bool update_padding_if_needed(const Window &window) override;
using AccessWindowRectangle::compute_valid_region;
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+ ValidRegion compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const override;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H*/
diff --git a/src/core/CL/CLCommandBuffer.cpp b/src/core/CL/CLCommandBuffer.cpp
new file mode 100644
index 0000000000..d094dcdaea
--- /dev/null
+++ b/src/core/CL/CLCommandBuffer.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/CL/CLCommandBuffer.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+
+#include "src/core/CL/CLCompatCommandBuffer.h"
+#include "src/core/CL/CLMutableCommandBuffer.h"
+
+namespace arm_compute
+{
+
+std::unique_ptr<CLCommandBuffer> CLCommandBuffer::create(cl_command_queue queue)
+{
+ const auto &cl_device = CLKernelLibrary::get().get_device();
+ const auto has_mutable_dispatch = command_buffer_mutable_dispatch_supported(cl_device);
+
+ if (has_mutable_dispatch)
+ {
+ return std::make_unique<CLMutableCommandBuffer>(queue);
+ }
+ else
+ {
+ return std::make_unique<CLCompatCommandBuffer>(queue);
+ }
+}
+
+CLCommandBuffer::CLCommandBuffer() = default;
+CLCommandBuffer::~CLCommandBuffer() = default;
+
+CLCommandBuffer::State CLCommandBuffer::state() const
+{
+ return _state;
+}
+
+CLCommandBuffer &CLCommandBuffer::state(CLCommandBuffer::State state)
+{
+ _state = state;
+
+ return *this;
+}
+
+} // namespace arm_compute
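
CLCommandBuffer::create() dispatches on whether the device supports mutable dispatch. The helper command_buffer_mutable_dispatch_supported() is declared in CLHelpers and its body is not part of this patch; the sketch below is only an assumption of how such a capability check typically works, by scanning the device extension string for cl_khr_command_buffer_mutable_dispatch.

    // Hypothetical stand-in for command_buffer_mutable_dispatch_supported();
    // the real helper lives in CLHelpers and may differ. This version simply
    // scans the device extension string for the mutable-dispatch extension.
    #include "arm_compute/core/CL/OpenCL.h"

    #include <string>

    bool mutable_dispatch_supported_sketch(const cl::Device &device)
    {
        const std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
        return extensions.find("cl_khr_command_buffer_mutable_dispatch") != std::string::npos;
    }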
diff --git a/src/core/CL/CLCommandBuffer.h b/src/core/CL/CLCommandBuffer.h
new file mode 100644
index 0000000000..90e434161e
--- /dev/null
+++ b/src/core/CL/CLCommandBuffer.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CORE_CL_CLCOMMANDBUFFER_H
+#define ACL_SRC_CORE_CL_CLCOMMANDBUFFER_H
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+namespace arm_compute
+{
+
+/** A command buffer contains a list of commands that is constructed once and can later be enqueued multiple times.
+ *
+ * To prepare a command buffer:
+ * - Construct a new command buffer targeting a command queue using @ref CLCommandBuffer::create.
+ * - Add kernel enqueue commands to the buffer using @ref CLCommandBuffer::add_kernel.
+ * The kernel must be ready to be enqueued, with all of its arguments set.
+ * - Specify which kernel arguments remain mutable after the command buffer has been finalized.
+ * - When all the kernel enqueue commands have been added, call @ref CLCommandBuffer::finalize.
+ * After this point the command buffer is ready to be executed.
+ *
+ * To execute the command buffer:
+ * - Make any changes to the values that the mutable arguments point to.
+ * - Call @ref CLCommandBuffer::update to apply the argument value changes.
+ * - Call @ref CLCommandBuffer::enqueue to enqueue the command buffer to execute.
+ */
+class CLCommandBuffer
+{
+public:
+ /** Create a new command buffer targeting the specified command queue.
+ *
+ * @param[in] queue The command queue to execute the command buffer.
+ *
+ * @return A unique pointer to the newly created command buffer.
+ */
+ static std::unique_ptr<CLCommandBuffer> create(cl_command_queue queue);
+
+ /** Constructor. */
+ CLCommandBuffer();
+
+ /** Destructor. */
+ virtual ~CLCommandBuffer();
+
+ /** Disallow copy constructor. */
+ CLCommandBuffer(const CLCommandBuffer &) = delete;
+
+ /** Disallow copy assignment. */
+ CLCommandBuffer &operator=(const CLCommandBuffer &) = delete;
+
+ /** Disallow move constructor. */
+ CLCommandBuffer(CLCommandBuffer &&other) = delete;
+
+ /** Disallow move assignment. */
+ CLCommandBuffer &operator=(CLCommandBuffer &&other) = delete;
+
+ /** Add a kernel enqueue command to the command buffer.
+ *
+ * This function must be called before the command buffer has been finalized.
+ *
+ * @param[in] kernel The CL kernel.
+ * @param[in] offset The global work offset.
+ * @param[in] global The global work size.
+ * @param[in] local The local work size.
+ */
+ virtual void
+ add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0;
+
+ /** Add the mutable argument to the current kernel enqueue command.
+ *
+ * This function must be called after @ref CLCommandBuffer::add_kernel but before the command buffer
+ * has been finalized.
+ *
+ * The pointer must remain valid and point to the correct value at the time
+ * @ref CLCommandBuffer::update is called, so that the argument value
+ * can be applied successfully to the kernel enqueue command.
+ *
+ * @param[in] arg_idx The index of the argument in the current kernel program.
+ * @param[in] value The pointer to the value of the argument.
+ */
+ template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value || std::is_pointer<T>::value>>
+ void add_mutable_argument(cl_uint arg_idx, const T *value)
+ {
+ add_mutable_argument_generic(arg_idx, value, sizeof(T));
+ }
+
+ /** Finalize the command buffer. */
+ virtual void finalize() = 0;
+
+ /** Update the command buffer with new kernel argument values.
+ *
+ * This function must be called after the command buffer has been finalized.
+ *
+ * All the values pointed to by the mutable arguments will be applied to the command buffer.
+ */
+ virtual void update() = 0;
+
+ /** Enqueue the command buffer.
+ *
+ * This function must be called after the command buffer has been finalized.
+ */
+ virtual void enqueue() = 0;
+
+ /** Check if the command buffer has been finalized.
+ *
+ * @return true if the command buffer has been finalized.
+ */
+ virtual bool is_finalized() const = 0;
+
+protected:
+ /** Add the mutable argument to the current kernel enqueue command.
+ *
+ * @see CLCommandBuffer::add_mutable_argument for more information.
+ */
+ virtual void add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) = 0;
+
+ /** The state of the command buffer. */
+ enum class State : int32_t
+ {
+ /** The command buffer has been created and is being specified. */
+ Created,
+
+ /** The command buffer has been finalized and is ready to be executed. */
+ Finalized,
+ };
+
+ /** Get the state of the command buffer. */
+ State state() const;
+
+ /** Set the state of the command buffer. */
+ CLCommandBuffer &state(State state);
+
+private:
+ State _state{State::Created};
+};
+
+} // namespace arm_compute
+
+#endif // ACL_SRC_CORE_CL_CLCOMMANDBUFFER_H
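
The class comment in this header describes the prepare/execute workflow; the following is a minimal usage sketch of that workflow. The queue, kernel, ND-ranges and the `scale` argument are illustrative assumptions, not part of this patch; in practice the kernel must already have all of its non-mutable arguments set before it is recorded.

    // Minimal usage sketch of the CLCommandBuffer workflow described above.
    // The queue, kernel, ND-ranges and the `scale` argument are illustrative.
    #include "src/core/CL/CLCommandBuffer.h"

    void run_twice(cl_command_queue queue, cl_kernel kernel)
    {
        auto cmd_buffer = arm_compute::CLCommandBuffer::create(queue);

        // Record one kernel enqueue; all non-mutable arguments are already set.
        cmd_buffer->add_kernel(kernel, cl::NullRange, cl::NDRange(256), cl::NullRange);

        // Argument 0 stays mutable after finalize(); the pointer must outlive
        // every later call to update().
        float scale = 1.0f;
        cmd_buffer->add_mutable_argument(0, &scale);

        cmd_buffer->finalize();

        cmd_buffer->update();  // apply scale == 1.0f
        cmd_buffer->enqueue(); // first run

        scale = 0.5f;          // change the value the mutable argument points to
        cmd_buffer->update();  // apply scale == 0.5f
        cmd_buffer->enqueue(); // second run
    }

On platforms with cl_khr_command_buffer_mutable_dispatch this maps onto a real command buffer; otherwise the CLCompatCommandBuffer path below replays clSetKernelArg and clEnqueueNDRangeKernel on every enqueue, so the calling code stays the same either way.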
diff --git a/src/core/CL/CLCompatCommandBuffer.cpp b/src/core/CL/CLCompatCommandBuffer.cpp
new file mode 100644
index 0000000000..242fd7719c
--- /dev/null
+++ b/src/core/CL/CLCompatCommandBuffer.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/CL/CLCompatCommandBuffer.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/core/CL/CLUtils.h"
+
+namespace arm_compute
+{
+
+CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue) : _queue(queue)
+{
+}
+
+CLCompatCommandBuffer::~CLCompatCommandBuffer()
+{
+}
+
+void CLCompatCommandBuffer::add_kernel(cl_kernel kernel,
+ const cl::NDRange &offset,
+ const cl::NDRange &global,
+ const cl::NDRange &local)
+{
+ ARM_COMPUTE_ERROR_ON(state() != State::Created);
+
+ _kernel_cmds.push_back(KernelCommand{kernel, offset, global, local, {}});
+}
+
+void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size)
+{
+ ARM_COMPUTE_ERROR_ON(state() != State::Created);
+ ARM_COMPUTE_ERROR_ON(_kernel_cmds.empty());
+
+ _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{arg_idx, size, value});
+}
+
+void CLCompatCommandBuffer::finalize()
+{
+ ARM_COMPUTE_ERROR_ON(state() != State::Created);
+
+ _kernel_cmds.shrink_to_fit();
+
+ for (auto &cmd : _kernel_cmds)
+ {
+ cmd.mutable_args.shrink_to_fit();
+ }
+
+ state(State::Finalized);
+}
+
+void CLCompatCommandBuffer::update()
+{
+ ARM_COMPUTE_ERROR_ON(state() != State::Finalized);
+
+ // Nothing to do here: the kernel arguments will be updated when each command is enqueued.
+}
+
+void CLCompatCommandBuffer::enqueue()
+{
+ ARM_COMPUTE_ERROR_ON(state() != State::Finalized);
+
+ for (const auto &cmd : _kernel_cmds)
+ {
+ for (const auto &arg : cmd.mutable_args)
+ {
+ const auto error = clSetKernelArg(cmd.kernel, arg.arg_index, arg.arg_size, arg.arg_value);
+
+ handle_cl_error("clSetKernelArg", error);
+ }
+
+ const auto error =
+ clEnqueueNDRangeKernel(_queue, cmd.kernel, static_cast<cl_uint>(cmd.global.dimensions()),
+ cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr, cmd.global.get(),
+ cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr, 0, nullptr, nullptr);
+
+ handle_cl_error("clEnqueueNDRangeKernel", error);
+ }
+}
+
+bool CLCompatCommandBuffer::is_finalized() const
+{
+ return state() == State::Finalized;
+}
+
+} // namespace arm_compute
diff --git a/src/core/CL/CLCompatCommandBuffer.h b/src/core/CL/CLCompatCommandBuffer.h
new file mode 100644
index 0000000000..d5df106425
--- /dev/null
+++ b/src/core/CL/CLCompatCommandBuffer.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CORE_CL_CLCOMPATCOMMANDBUFFER_H
+#define ACL_SRC_CORE_CL_CLCOMPATCOMMANDBUFFER_H
+
+#include "src/core/CL/CLCommandBuffer.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+
+/** Command buffer implementation for platforms without the mutable dispatch command buffer extension. */
+class CLCompatCommandBuffer final : public CLCommandBuffer
+{
+public:
+ /** Create a new command buffer targeting the specified command queue.
+ *
+ * @param[in] queue The command queue to execute the command buffer.
+ */
+ CLCompatCommandBuffer(cl_command_queue queue);
+
+ /** Destructor. */
+ virtual ~CLCompatCommandBuffer();
+
+ /** Disallow copy constructor. */
+ CLCompatCommandBuffer(const CLCompatCommandBuffer &) = delete;
+
+ /** Disallow copy assignment. */
+ CLCompatCommandBuffer &operator=(const CLCompatCommandBuffer &) = delete;
+
+ /** Disallow move constructor. */
+ CLCompatCommandBuffer(CLCompatCommandBuffer &&) = delete;
+
+ /** Disallow move assignment. */
+ CLCompatCommandBuffer &operator=(CLCompatCommandBuffer &&) = delete;
+
+ void add_kernel(cl_kernel kernel,
+ const cl::NDRange &offset,
+ const cl::NDRange &global,
+ const cl::NDRange &local) override;
+
+ void finalize() override;
+
+ void update() override;
+
+ void enqueue() override;
+
+ bool is_finalized() const override;
+
+protected:
+ void add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) override;
+
+private:
+ struct KernelCommand
+ {
+ cl_kernel kernel;
+ cl::NDRange offset;
+ cl::NDRange global;
+ cl::NDRange local;
+
+ std::vector<cl_mutable_dispatch_arg_khr> mutable_args;
+ };
+
+private:
+ cl_command_queue _queue{};
+ std::vector<KernelCommand> _kernel_cmds{};
+};
+
+} // namespace arm_compute
+
+#endif // ACL_SRC_CORE_CL_CLCOMPATCOMMANDBUFFER_H
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp
index 3f2975dc15..9bbc32657e 100644
--- a/src/core/CL/CLCompileContext.cpp
+++ b/src/core/CL/CLCompileContext.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
#include "arm_compute/core/CL/CLCompileContext.h"
-#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
+
#include "support/StringSupport.h"
#include <regex>
namespace arm_compute
{
-CLBuildOptions::CLBuildOptions()
- : _build_opts()
+CLBuildOptions::CLBuildOptions() : _build_opts()
{
}
@@ -45,7 +45,7 @@ void CLBuildOptions::add_option(std::string option)
void CLBuildOptions::add_option_if(bool cond, std::string option)
{
- if(cond)
+ if (cond)
{
add_option(std::move(option));
}
@@ -63,7 +63,7 @@ void CLBuildOptions::add_options(const StringSet &options)
void CLBuildOptions::add_options_if(bool cond, const StringSet &options)
{
- if(cond)
+ if (cond)
{
add_options(options);
}
@@ -74,26 +74,40 @@ const CLBuildOptions::StringSet &CLBuildOptions::options() const
return _build_opts;
}
-Program::Program()
- : _context(), _device(), _is_binary(false), _name(), _source(), _binary()
+bool CLBuildOptions::operator==(const CLBuildOptions &other) const
+{
+ return _build_opts == other._build_opts;
+}
+
+Program::Program() : _context(), _device(), _is_binary(false), _name(), _source(), _binary()
{
}
Program::Program(cl::Context context, std::string name, std::string source)
- : _context(std::move(context)), _device(), _is_binary(false), _name(std::move(name)), _source(std::move(source)), _binary()
+ : _context(std::move(context)),
+ _device(),
+ _is_binary(false),
+ _name(std::move(name)),
+ _source(std::move(source)),
+ _binary()
{
}
Program::Program(cl::Context context, cl::Device device, std::string name, std::vector<unsigned char> binary)
- : _context(std::move(context)), _device(std::move(device)), _is_binary(true), _name(std::move(name)), _source(), _binary(std::move(binary))
+ : _context(std::move(context)),
+ _device(std::move(device)),
+ _is_binary(true),
+ _name(std::move(name)),
+ _source(),
+ _binary(std::move(binary))
{
}
Program::operator cl::Program() const
{
- if(_is_binary)
+ if (_is_binary)
{
- return cl::Program(_context, { _device }, { _binary });
+ return cl::Program(_context, {_device}, {_binary});
}
else
{
@@ -107,12 +121,12 @@ bool Program::build(const cl::Program &program, const std::string &build_options
{
return program.build(build_options.c_str()) == CL_SUCCESS;
}
- catch(const cl::Error &e)
+ catch (const cl::Error &e)
{
cl_int err = CL_SUCCESS;
const auto build_info = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&err);
- for(auto &pair : build_info)
+ for (auto &pair : build_info)
{
std::cerr << pair.second << std::endl;
}
@@ -128,14 +142,12 @@ cl::Program Program::build(const std::string &build_options) const
return cl_program;
}
-Kernel::Kernel()
- : _name(), _kernel()
+Kernel::Kernel() : _name(), _kernel()
{
}
Kernel::Kernel(std::string name, const cl::Program &program)
- : _name(std::move(name)),
- _kernel(cl::Kernel(program, _name.c_str()))
+ : _name(std::move(name)), _kernel(cl::Kernel(program, _name.c_str()))
{
}
CLCompileContext::CLCompileContext()
@@ -151,15 +163,19 @@ CLCompileContext::CLCompileContext(cl::Context context, const cl::Device &device
_is_wbsm_supported = get_wbsm_support_info(device);
}
-Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source,
- const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const
+Kernel CLCompileContext::create_kernel(const std::string &kernel_name,
+ const std::string &program_name,
+ const std::string &program_source,
+ const std::string &kernel_path,
+ const StringSet &build_options_set,
+ bool is_binary) const
{
const std::string build_options = generate_build_options(build_options_set, kernel_path);
const std::string built_program_name = program_name + "_" + build_options;
auto built_program_it = _built_programs_map.find(built_program_name);
cl::Program cl_program;
- if(_built_programs_map.end() != built_program_it)
+ if (_built_programs_map.end() != built_program_it)
{
// If program has been built, retrieve to create kernel from it
cl_program = built_program_it->second;
@@ -179,11 +195,12 @@ Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std
return Kernel(kernel_name, cl_program);
}
-const Program &CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const
+const Program &
+CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const
{
const auto program_it = _programs_map.find(program_name);
- if(program_it != _programs_map.end())
+ if (program_it != _programs_map.end())
{
return program_it->second;
}
@@ -194,9 +211,10 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c
ARM_COMPUTE_UNUSED(is_binary);
program = Program(_context, program_name, program_source);
#else /* EMBEDDED_KERNELS */
- if(is_binary)
+ if (is_binary)
{
- program = Program(_context, _device.cl_device(), program_name, std::vector<unsigned char>(program_source.begin(), program_source.end()));
+ program = Program(_context, _device.cl_device(), program_name,
+ std::vector<unsigned char>(program_source.begin(), program_source.end()));
}
else
{
@@ -213,20 +231,23 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c
void CLCompileContext::set_context(cl::Context context)
{
_context = std::move(context);
- if(_context.get() != nullptr)
+ if (_context.get() != nullptr)
{
const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>();
- if(!cl_devices.empty())
+ if (!cl_devices.empty())
{
_device = CLDevice(cl_devices[0]);
}
}
}
-std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, const std::string &kernel_path) const
+std::string CLCompileContext::generate_build_options(const StringSet &build_options_set,
+ const std::string &kernel_path) const
{
std::string concat_str;
+ bool ext_supported = false;
+ std::string ext_buildopts;
#if defined(ARM_COMPUTE_DEBUG_ENABLED)
// Enable debug properties in CL kernels
@@ -234,47 +255,38 @@ std::string CLCompileContext::generate_build_options(const StringSet &build_opti
#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
GPUTarget gpu_arch = get_arch_from_target(_device.target());
- concat_str += " -DGPU_ARCH=" + support::cpp11::to_string(
- static_cast<std::underlying_type<GPUTarget>::type>(gpu_arch));
+ concat_str +=
+ " -DGPU_ARCH=" + support::cpp11::to_string(static_cast<std::underlying_type<GPUTarget>::type>(gpu_arch));
- if(_device.supported("cl_khr_fp16"))
+ if (_device.supported("cl_khr_fp16"))
{
concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
}
- if(_device.supported("cl_arm_integer_dot_product_int8"))
+ if (_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product"))
{
concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ENABLED=1 ";
}
- if(_device.supported("cl_arm_integer_dot_product_accumulate_int8"))
+ if (_device.supported("cl_arm_integer_dot_product_accumulate_int8"))
{
concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED=1 ";
}
- if(_device.version() == CLVersion::CL20)
- {
- concat_str += " -cl-std=CL2.0 ";
- }
- else if(_device.supported("cl_arm_non_uniform_work_group_size"))
+ std::tie(ext_supported, ext_buildopts) = _device.is_non_uniform_workgroup_supported();
+
+ if (ext_supported)
{
- concat_str += " -cl-arm-non-uniform-work-group-size ";
+ concat_str += ext_buildopts;
}
else
{
ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
}
- if(gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD)
+ if (gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11)
{
- const std::string device_vers = _device.device_version();
- const std::regex ddk_regex("r([0-9]*)p[0-9]");
- std::smatch ddk_match;
-
- if(std::regex_search(device_vers, ddk_match, ddk_regex) && std::stoi(ddk_match[1]) >= 9)
- {
- concat_str += " -DUNROLL_WITH_PRAGMA ";
- }
+ concat_str += " -DUNROLL_WITH_PRAGMA ";
}
std::string build_options = stringify_set(build_options_set, kernel_path) + concat_str;
@@ -297,7 +309,7 @@ std::string CLCompileContext::stringify_set(const StringSet &s, const std::strin
#endif /* EMBEDDED_KERNELS */
// Concatenate set
- for(const auto &el : s)
+ for (const auto &el : s)
{
concat_set += " " + el;
}
@@ -333,8 +345,8 @@ const cl::Device &CLCompileContext::get_device() const
void CLCompileContext::set_device(cl::Device device)
{
- _device = std::move(device);
_is_wbsm_supported = get_wbsm_support_info(device);
+ _device = std::move(device);
}
cl::NDRange CLCompileContext::default_ndrange() const
@@ -342,7 +354,7 @@ cl::NDRange CLCompileContext::default_ndrange() const
GPUTarget _target = get_target_from_device(_device.cl_device());
cl::NDRange default_range;
- switch(_target)
+ switch (_target)
{
case GPUTarget::MIDGARD:
case GPUTarget::T600:
@@ -372,7 +384,8 @@ size_t CLCompileContext::max_local_workgroup_size(const cl::Kernel &kernel) cons
size_t result;
size_t err = kernel.getWorkGroupInfo(_device.cl_device(), CL_KERNEL_WORK_GROUP_SIZE, &result);
- ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0,
+ "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
ARM_COMPUTE_UNUSED(err);
return result;
@@ -387,4 +400,22 @@ cl_uint CLCompileContext::get_num_compute_units() const
{
return _device.compute_units();
}
+
+int32_t CLCompileContext::get_ddk_version() const
+{
+ const std::string device_version = _device.device_version();
+ const std::regex ddk_regex("r([0-9]*)p[0-9]");
+ std::smatch ddk_match;
+
+ if (std::regex_search(device_version, ddk_match, ddk_regex))
+ {
+ return std::stoi(ddk_match[1]);
+ }
+
+ return -1;
+}
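
A minimal sketch of how the regex above extracts the DDK revision, assuming the usual "rXpY" driver version substring (the version string here is hypothetical, not taken from the patch):

    const std::string device_version = "OpenCL 3.0 v1.r38p1-01eac0"; // hypothetical
    const std::regex  ddk_regex("r([0-9]*)p[0-9]");
    std::smatch       ddk_match;
    if (std::regex_search(device_version, ddk_match, ddk_regex))
    {
        const int32_t ddk = std::stoi(ddk_match[1]); // ddk == 38
    }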
+GPUTarget CLCompileContext::get_gpu_target() const
+{
+ return _device.target();
+}
} // namespace arm_compute
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 6af378c7ab..5ea99d360a 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,13 +22,16 @@
* SOFTWARE.
*/
#include "arm_compute/core/CL/CLHelpers.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLTypes.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Log.h"
#include "arm_compute/core/Types.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
-#include "src/core/gpu/cl/ClKernelLibrary.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/ClKernelLibrary.h"
#include <utility>
#include <vector>
@@ -37,7 +40,7 @@ namespace arm_compute
{
std::string get_cl_type_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -73,7 +76,7 @@ std::string get_cl_type_from_data_type(const DataType &dt)
std::string get_cl_promoted_type_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -103,7 +106,7 @@ std::string get_cl_promoted_type_from_data_type(const DataType &dt)
std::string get_cl_unsigned_type_from_element_size(size_t element_size)
{
- switch(element_size)
+ switch (element_size)
{
case 1:
return "uchar";
@@ -121,7 +124,7 @@ std::string get_cl_unsigned_type_from_element_size(size_t element_size)
std::string get_cl_signed_type_from_element_size(size_t element_size)
{
- switch(element_size)
+ switch (element_size)
{
case 1:
return "char";
@@ -139,11 +142,10 @@ std::string get_cl_signed_type_from_element_size(size_t element_size)
std::string get_cl_select_type_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
- return "uchar";
case DataType::S8:
case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8:
@@ -173,7 +175,7 @@ std::string get_cl_select_type_from_data_type(const DataType &dt)
std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -191,7 +193,7 @@ std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt)
std::string get_data_size_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::S8:
@@ -243,8 +245,9 @@ bool dot8_supported(const cl::Device &device)
const GPUTarget gpu_target = get_target_from_name(device_name);
// SW_WORKAROUND: Workaround for DDK revision r14p0 to enable cl_arm_integer_dot_product_int8
- std::set<GPUTarget> sw_workaround_issue = { GPUTarget::G76 };
- return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || sw_workaround_issue.count(gpu_target) != 0);
+ std::set<GPUTarget> sw_workaround_issue = {GPUTarget::G76};
+ return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") ||
+ sw_workaround_issue.count(gpu_target) != 0);
}
bool dot8_acc_supported(const cl::Device &device)
@@ -255,19 +258,23 @@ bool dot8_acc_supported(const cl::Device &device)
CLVersion get_cl_version(const cl::Device &device)
{
std::string version_str = device.getInfo<CL_DEVICE_VERSION>();
- if(version_str.find("OpenCL 2") != std::string::npos)
+ if (version_str.find("OpenCL 3") != std::string::npos)
+ {
+ return CLVersion::CL30;
+ }
+ else if (version_str.find("OpenCL 2") != std::string::npos)
{
return CLVersion::CL20;
}
- else if(version_str.find("OpenCL 1.2") != std::string::npos)
+ else if (version_str.find("OpenCL 1.2") != std::string::npos)
{
return CLVersion::CL12;
}
- else if(version_str.find("OpenCL 1.1") != std::string::npos)
+ else if (version_str.find("OpenCL 1.1") != std::string::npos)
{
return CLVersion::CL11;
}
- else if(version_str.find("OpenCL 1.0") != std::string::npos)
+ else if (version_str.find("OpenCL 1.0") != std::string::npos)
{
return CLVersion::CL10;
}
@@ -282,14 +289,15 @@ bool device_supports_extension(const cl::Device &device, const char *extension_n
return (pos != std::string::npos);
}
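
A usage sketch for the extension query above (the check is a substring search on CL_DEVICE_EXTENSIONS; assumes a valid cl::Device):

    cl::Device device = cl::Device::getDefault();
    if (arm_compute::device_supports_extension(device, "cl_khr_fp16"))
    {
        // fp16 kernels may be compiled with -DARM_COMPUTE_OPENCL_FP16_ENABLED=1
    }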
-bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout)
+bool cl_winograd_convolution_layer_supported(const Size2D &output_tile,
+ const Size2D &kernel_size,
+ DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN);
using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
- std::vector<WinogradConfiguration> winograd_configs_nchw =
- {
+ std::vector<WinogradConfiguration> winograd_configs_nchw = {
WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 3)),
WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
WinogradConfiguration(std::pair<int, int>(2, 1), std::pair<int, int>(3, 1)),
@@ -298,11 +306,9 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si
WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)),
WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1)),
- WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))
- };
+ WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))};
- std::vector<WinogradConfiguration> winograd_configs_nhwc =
- {
+ std::vector<WinogradConfiguration> winograd_configs_nhwc = {
WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3)),
WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1)),
@@ -319,19 +325,21 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si
std::pair<int, int>(kernel_size.width, kernel_size.height));
// Return true if supported
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != winograd_configs_nchw.end());
+ return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) !=
+ winograd_configs_nchw.end());
}
else
{
- return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != winograd_configs_nhwc.end());
+ return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) !=
+ winograd_configs_nhwc.end());
}
}
size_t preferred_vector_width(const cl::Device &device, const DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::S8:
@@ -377,7 +385,7 @@ size_t get_cl_image_pitch_alignment(const cl::Device &device)
cl_int err = clGetDeviceInfo(device(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &pixel_aligment, nullptr);
- if(err == CL_SUCCESS)
+ if (err == CL_SUCCESS)
{
return pixel_aligment;
}
@@ -387,7 +395,18 @@ size_t get_cl_image_pitch_alignment(const cl::Device &device)
}
}
-cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts)
+bool get_cl_non_uniform_work_group_supported(const cl::Device &device)
+{
+ cl_bool supported = CL_FALSE;
+
+ cl_int err =
+ clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr);
+
+ return (err == CL_SUCCESS && supported == CL_TRUE);
+}
+
+cl::Kernel
+create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts)
{
opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
@@ -395,7 +414,8 @@ cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_
auto kernel_src = klib.program(program_name);
const std::string kernel_path = klib.kernel_path();
- return static_cast<cl::Kernel>(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path, build_opts, kernel_src.is_binary));
+ return static_cast<cl::Kernel>(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path,
+ build_opts, kernel_src.is_binary));
}
cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size)
@@ -409,8 +429,9 @@ cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimensio
bool get_wbsm_support_info(const cl::Device &device)
{
cl_bitfield capabilities = 0;
- cl_int err = clGetDeviceInfo(device.get(), ARM_COMPUTE_LIBRARY_OPENCL_DEVICE_CAPABILITIES_ARM, sizeof(cl_bitfield), &capabilities, nullptr);
- if((err == CL_SUCCESS) && (capabilities & ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM))
+ cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield),
+ &capabilities, nullptr);
+ if ((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM))
{
return true;
}
@@ -419,12 +440,75 @@ bool get_wbsm_support_info(const cl::Device &device)
void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint)
{
- cl_int err = clSetKernelExecInfo(kernel.get(),
- ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM,
- sizeof(cl_int),
- &wbsm_hint);
+ cl_int err = clSetKernelExecInfo(kernel.get(), CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM,
+ sizeof(cl_int), &wbsm_hint);
ARM_COMPUTE_UNUSED(err);
ARM_COMPUTE_ERROR_ON(err != CL_SUCCESS);
}
+bool export_to_cl_image(const ITensorInfo *tensor)
+{
+ if (tensor->tensor_shape()[0] % 4 != 0)
+ {
+ return false;
+ }
+
+ // If not floating point
+ if (!is_data_type_float(tensor->data_type()))
+ {
+ return false;
+ }
+
+ // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
+ if (!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
+ {
+ return false;
+ }
+
+ // Check cl image pitch alignment
+ if (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0)
+ {
+ return false;
+ }
+
+ const size_t image_w = tensor->tensor_shape()[0] / 4;
+ const size_t image_h = tensor->tensor_shape().total_size() / tensor->tensor_shape()[0];
+ const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+ const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
+ if (image_w > max_image_w || image_h > max_image_h)
+ {
+ return false;
+ }
+
+ return true;
+}
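
The width check above reflects that the cl_image path packs four elements per RGBA texel. A caller-side sketch (tensor_info is an assumed ITensorInfo pointer; the real call sites are the kernel configure functions):

    if (arm_compute::export_to_cl_image(tensor_info))
    {
        // The kernel can read this tensor through a cl_image object,
        // typically improving cache behaviour on the texture path.
    }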
+
+void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values)
+{
+ for (const int value : values)
+ {
+ if (value > max_manual_loop_unrolling)
+ {
+ built_opts.add_option("-DUNROLL_WITH_PRAGMA");
+ return;
+ }
+ }
+}
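
A usage sketch, assuming max_manual_loop_unrolling is the library's internal threshold and kernel_h/kernel_w are hypothetical loop bounds:

    CLBuildOptions build_opts;
    set_unroll_with_pragma(build_opts, {kernel_h, kernel_w});
    // -DUNROLL_WITH_PRAGMA is added only if some bound exceeds the threshold.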
+
+bool arm_matrix_multiply_supported(const cl::Device &device)
+{
+ return device_supports_extension(device, "cl_arm_matrix_multiply");
+}
+
+bool command_buffer_supported(const cl::Device &device)
+{
+ return device_supports_extension(device, "cl_khr_command_buffer");
+}
+
+bool command_buffer_mutable_dispatch_supported(const cl::Device &device)
+{
+ return device_supports_extension(device, "cl_khr_command_buffer_mutable_dispatch");
+}
+
} // namespace arm_compute
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index bbd4009389..e69d006750 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -24,136 +24,113 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
-#include "src/core/gpu/cl/ClKernelLibrary.h"
+
+#include "src/gpu/cl/ClKernelLibrary.h"
#include <algorithm>
#include <array>
#include <fstream>
#include <utility>
#include <vector>
-
namespace arm_compute
{
-CLKernelLibrary::CLKernelLibrary()
- : _compile_context()
+CLKernelLibrary::CLKernelLibrary() : _compile_context()
{
opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the CLKernelLibrary is built
}
-
CLKernelLibrary &CLKernelLibrary::get()
{
static CLKernelLibrary _kernel_library;
return _kernel_library;
}
-
-Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const std::set<std::string> &build_options_set) const
+Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name,
+ const std::set<std::string> &build_options_set) const
{
- const opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
-
- const std::string program_name = klib.program_name(kernel_name);
- auto program = klib.program(program_name);
- const std::string &kernel_path = CLKernelLibrary::get().get_kernel_path();
-
- return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set, program.is_binary);
+ const opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
+ const std::string program_name = klib.program_name(kernel_name);
+ auto program = klib.program(program_name);
+ const std::string &kernel_path = CLKernelLibrary::get().get_kernel_path();
+ return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set,
+ program.is_binary);
}
-
std::string CLKernelLibrary::get_program_name(const std::string &kernel_name) const
{
return opencl::ClKernelLibrary::get().program_name(kernel_name);
}
-
void CLKernelLibrary::init(std::string kernel_path, cl::Context context, cl::Device device)
{
_compile_context = CLCompileContext(context, device);
opencl::ClKernelLibrary::get().set_kernel_path(kernel_path);
}
-
void CLKernelLibrary::set_kernel_path(const std::string &kernel_path)
{
opencl::ClKernelLibrary::get().set_kernel_path(kernel_path);
}
-
cl::Context &CLKernelLibrary::context()
{
return _compile_context.context();
}
-
const cl::Device &CLKernelLibrary::get_device()
{
return _compile_context.get_device();
}
-
void CLKernelLibrary::set_device(cl::Device device)
{
_compile_context.set_device(device);
}
-
void CLKernelLibrary::set_context(cl::Context context)
{
_compile_context.set_context(context);
}
-
std::string CLKernelLibrary::get_kernel_path()
{
return opencl::ClKernelLibrary::get().kernel_path();
}
-
void CLKernelLibrary::clear_programs_cache()
{
_compile_context.clear_programs_cache();
}
-
const std::map<std::string, cl::Program> &CLKernelLibrary::get_built_programs() const
{
return _compile_context.get_built_programs();
}
-
void CLKernelLibrary::add_built_program(const std::string &built_program_name, const cl::Program &program)
{
_compile_context.add_built_program(built_program_name, program);
}
-
bool CLKernelLibrary::fp16_supported() const
{
return _compile_context.fp16_supported();
}
-
bool CLKernelLibrary::int64_base_atomics_supported() const
{
return _compile_context.int64_base_atomics_supported();
}
-
bool CLKernelLibrary::is_wbsm_supported()
{
return _compile_context.is_wbsm_supported();
}
-
std::pair<std::string, bool> CLKernelLibrary::get_program(const std::string &program_name) const
{
auto program_info = opencl::ClKernelLibrary::get().program(program_name);
return std::make_pair(std::move(program_info.program), program_info.is_binary);
}
-
size_t CLKernelLibrary::max_local_workgroup_size(const cl::Kernel &kernel) const
{
return _compile_context.max_local_workgroup_size(kernel);
}
-
cl::NDRange CLKernelLibrary::default_ndrange() const
{
return _compile_context.default_ndrange();
}
-
std::string CLKernelLibrary::get_device_version()
{
return _compile_context.get_device_version();
}
-
cl_uint CLKernelLibrary::get_num_compute_units()
{
return _compile_context.get_num_compute_units();
}
-
CLCompileContext &CLKernelLibrary::get_compile_context()
{
return _compile_context;
diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h
index 5dc95dae27..9a681a4f45 100644
--- a/src/core/CL/CLKernels.h
+++ b/src/core/CL/CLKernels.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLKERNELS_H
-#define ARM_COMPUTE_CLKERNELS_H
+#ifndef ACL_SRC_CORE_CL_CLKERNELS_H
+#define ACL_SRC_CORE_CL_CLKERNELS_H
/* Header regrouping all the CL kernels */
#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
@@ -31,54 +31,38 @@
#include "src/core/CL/kernels/CLBitwiseKernel.h"
#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
-#include "src/core/CL/kernels/CLCol2ImKernel.h"
#include "src/core/CL/kernels/CLComparisonKernel.h"
#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
#include "src/core/CL/kernels/CLFFTScaleKernel.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
#include "src/core/CL/kernels/CLGatherKernel.h"
#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
-#include "src/core/CL/kernels/CLIm2ColKernel.h"
#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"
#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
-#include "src/core/CL/kernels/CLMinMaxLayerKernel.h"
#include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
#include "src/core/CL/kernels/CLPadLayerKernel.h"
#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h"
#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
-#include "src/core/CL/kernels/CLROIAlignLayerKernel.h"
-#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
#include "src/core/CL/kernels/CLRangeKernel.h"
#include "src/core/CL/kernels/CLReductionOperationKernel.h"
-#include "src/core/CL/kernels/CLRemapKernel.h"
#include "src/core/CL/kernels/CLReorgLayerKernel.h"
#include "src/core/CL/kernels/CLReverseKernel.h"
+#include "src/core/CL/kernels/CLROIAlignLayerKernel.h"
+#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
#include "src/core/CL/kernels/CLSelectKernel.h"
#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
#include "src/core/CL/kernels/CLStackLayerKernel.h"
#include "src/core/CL/kernels/CLStridedSliceKernel.h"
#include "src/core/CL/kernels/CLTileKernel.h"
-#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
-#endif /* ARM_COMPUTE_CLKERNELS_H */
+#endif // ACL_SRC_CORE_CL_CLKERNELS_H
diff --git a/src/core/CL/CLMutableCommandBuffer.cpp b/src/core/CL/CLMutableCommandBuffer.cpp
new file mode 100644
index 0000000000..0e078d8416
--- /dev/null
+++ b/src/core/CL/CLMutableCommandBuffer.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/CL/CLMutableCommandBuffer.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/CLUtils.h"
+
+namespace arm_compute
+{
+
+CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) : CLCommandBuffer()
+{
+ cl_int status = CL_SUCCESS;
+
+ cl_command_buffer_properties_khr properties[] = {
+ CL_COMMAND_BUFFER_FLAGS_KHR,
+ CL_COMMAND_BUFFER_MUTABLE_KHR,
+ 0,
+ };
+
+ _cb = clCreateCommandBufferKHR(1, &queue, properties, &status);
+ handle_cl_error("clCreateCommandBufferKHR", status);
+}
+
+CLMutableCommandBuffer::~CLMutableCommandBuffer()
+{
+ const auto status = clReleaseCommandBufferKHR(_cb);
+ if (status != CL_SUCCESS)
+ {
+ const std::string error_message = "clReleaseCommandBufferKHR - Error code: " + std::to_string(status);
+ ARM_COMPUTE_LOG_ERROR_ACL(error_message);
+ }
+}
+
+void CLMutableCommandBuffer::add_kernel(cl_kernel kernel,
+ const cl::NDRange &offset,
+ const cl::NDRange &global,
+ const cl::NDRange &local)
+{
+ ARM_COMPUTE_ERROR_ON(state() != State::Created);
+
+ cl_mutable_command_khr mutable_handle = nullptr;
+
+ cl_ndrange_kernel_command_properties_khr properties[] = {
+ CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
+ CL_MUTABLE_DISPATCH_ARGUMENTS_KHR,
+ 0,
+ };
+
+ const auto error = clCommandNDRangeKernelKHR(
+ _cb, nullptr, properties, kernel, global.dimensions(), offset.dimensions() != 0 ? offset.get() : nullptr,
+ global.get(), local.dimensions() != 0 ? local.get() : nullptr, 0, nullptr, nullptr, &mutable_handle);
+
+ handle_cl_error("clCommandNDRangeKernelKHR", error);
+
+ cl_mutable_dispatch_config_khr mut_dispatch_cfg{};
+ mut_dispatch_cfg.type = CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR;
+ mut_dispatch_cfg.command = mutable_handle;
+
+ _mut_dispatch_cfgs.emplace_back(mut_dispatch_cfg);
+}
+
+void CLMutableCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size)
+{
+ ARM_COMPUTE_ERROR_ON(state() != State::Created);
+
+ cl_mutable_dispatch_arg_khr cfg{};
+ cfg.arg_index = arg_idx;
+ cfg.arg_size = size;
+ cfg.arg_value = value;
+
+ _mut_arg_cfgs.emplace_back(cfg);
+ ++_mut_dispatch_cfgs.back().num_args;
+}
+
+void CLMutableCommandBuffer::finalize()
+{
+ ARM_COMPUTE_ERROR_ON(state() != State::Created);
+
+ const auto error = clFinalizeCommandBufferKHR(_cb);
+ handle_cl_error("clFinalizeCommandBufferKHR", error);
+
+ state(State::Finalized);
+
+ _mut_dispatch_cfgs.shrink_to_fit();
+ _mut_arg_cfgs.shrink_to_fit();
+
+ size_t arg_no = 0;
+
+ for (auto &mut_dispatch_cfg : _mut_dispatch_cfgs)
+ {
+ ARM_COMPUTE_ERROR_ON(arg_no >= _mut_arg_cfgs.size());
+ mut_dispatch_cfg.arg_list = &_mut_arg_cfgs[arg_no];
+
+ arg_no += mut_dispatch_cfg.num_args;
+ }
+
+ _mut_cfg.type = CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR;
+ _mut_cfg.next = nullptr;
+ _mut_cfg.num_mutable_dispatch = _mut_dispatch_cfgs.size();
+ _mut_cfg.mutable_dispatch_list = &_mut_dispatch_cfgs[0];
+}
+
+void CLMutableCommandBuffer::update()
+{
+ ARM_COMPUTE_ERROR_ON(state() != State::Finalized);
+
+ const auto error = clUpdateMutableCommandsKHR(_cb, &_mut_cfg);
+
+ handle_cl_error("clUpdateMutableCommandsKHR", error);
+}
+
+void CLMutableCommandBuffer::enqueue()
+{
+ ARM_COMPUTE_ERROR_ON(state() != State::Finalized);
+
+ const auto error = clEnqueueCommandBufferKHR(0, nullptr, _cb, 0, nullptr, nullptr);
+
+ handle_cl_error("clEnqueueCommandBufferKHR", error);
+}
+
+bool CLMutableCommandBuffer::is_finalized() const
+{
+ return state() == State::Finalized;
+}
+
+} // namespace arm_compute
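
A hedged lifecycle sketch for the class above (assumes a valid cl_command_queue and cl_kernel, and that the CLCommandBuffer base class exposes a typed add_mutable_argument() wrapper over add_mutable_argument_generic()):

    CLMutableCommandBuffer cb(queue);
    cb.add_kernel(kernel, cl::NullRange, cl::NDRange(256), cl::NullRange);
    int scale = 1;
    cb.add_mutable_argument(0, scale); // argument 0 remains updatable
    cb.finalize();                     // recording ends here
    cb.enqueue();                      // first execution
    scale = 2;                         // mutate the argument value...
    cb.update();                       // ...push it via clUpdateMutableCommandsKHR
    cb.enqueue();                      // second execution sees the new value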
diff --git a/src/core/CL/CLMutableCommandBuffer.h b/src/core/CL/CLMutableCommandBuffer.h
new file mode 100644
index 0000000000..8997d7d1fd
--- /dev/null
+++ b/src/core/CL/CLMutableCommandBuffer.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CORE_CL_CLMUTABLECOMMANDBUFFER_H
+#define ACL_SRC_CORE_CL_CLMUTABLECOMMANDBUFFER_H
+
+#include "src/core/CL/CLCommandBuffer.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+
+/** Command buffer implementation based on the CL mutable dispatch command buffer extension. */
+class CLMutableCommandBuffer : public CLCommandBuffer
+{
+public:
+ /** Create a new mutable dispatch command buffer targeting the specified command queue.
+ *
+ * @param[in] queue The command queue to execute the command buffer.
+ */
+ CLMutableCommandBuffer(cl_command_queue queue);
+
+ /** Destructor. */
+ virtual ~CLMutableCommandBuffer();
+
+ /** Disallow copy constructor. */
+ CLMutableCommandBuffer(const CLMutableCommandBuffer &) = delete;
+
+ /** Disallow copy assignment. */
+ CLMutableCommandBuffer &operator=(const CLMutableCommandBuffer &) = delete;
+
+ /** Disallow move constructor. */
+ CLMutableCommandBuffer(CLMutableCommandBuffer &&) = delete;
+
+ /** Disallow move assignment. */
+ CLMutableCommandBuffer &operator=(CLMutableCommandBuffer &&) = delete;
+
+ void add_kernel(cl_kernel kernel,
+ const cl::NDRange &offset,
+ const cl::NDRange &global,
+ const cl::NDRange &local) override;
+
+ void finalize() override;
+
+ void update() override;
+
+ void enqueue() override;
+
+ bool is_finalized() const override;
+
+protected:
+ void add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) override;
+
+private:
+ cl_command_buffer_khr _cb{};
+ cl_mutable_base_config_khr _mut_cfg{};
+ std::vector<cl_mutable_dispatch_config_khr> _mut_dispatch_cfgs{};
+ std::vector<cl_mutable_dispatch_arg_khr> _mut_arg_cfgs{};
+};
+
+} // namespace arm_compute
+
+#endif // ACL_SRC_CORE_CL_CLMUTABLECOMMANDBUFFER_H
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
index 67af240044..290ed32648 100644
--- a/src/core/CL/CLUtils.cpp
+++ b/src/core/CL/CLUtils.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,16 +21,60 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Types.h"
-
#include "src/core/CL/CLUtils.h"
-cl::Image2D arm_compute::create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch)
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType image_type)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+
+ const cl::Context &ctx = CLKernelLibrary::get().context();
+ const cl::Buffer &buffer = tensor->cl_buffer();
+ const ITensorInfo *info = tensor->info();
+ ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(), "Tensor paddings must not be locked to allow extending paddings to "
+ "satisfy cl_image pitch alignment requirement");
+
+ const size_t image_w{info->dimension(0) / 4};
+ const size_t image_h{info->tensor_shape().total_size() / info->dimension(0)};
+ const size_t max_image_w{CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>()};
+ const size_t max_image_h{CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>()};
+
+ ARM_COMPUTE_UNUSED(max_image_w, max_image_h);
+ ARM_COMPUTE_ERROR_ON_MSG(image_w > max_image_w, "Image width exceeds maximum width for exporting to cl_image");
+ ARM_COMPUTE_ERROR_ON_MSG(image_h > max_image_h, "Image height exceeds maximum height for exporting to cl_image");
+
+ const TensorShape shape2d(image_w, image_h);
+ const size_t image_row_pitch = info->strides_in_bytes()[1];
+
+ return create_image2d_from_buffer(ctx, buffer, shape2d, info->data_type(), image_row_pitch, image_type);
+}
+
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx,
+ const cl::Buffer &buffer,
+ const TensorShape &shape2d,
+ DataType data_type,
+ size_t image_row_pitch,
+ CLImage2DType image_type)
{
+ ARM_COMPUTE_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()),
+ "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
+ ARM_COMPUTE_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0,
+ "Impossible to retrieve the cl_image pitch alignment");
+ ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr, "Cannot create cl_image from empty cl_buffer");
+
cl_channel_type cl_data_type;
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
cl_data_type = CL_FLOAT;
@@ -45,7 +89,7 @@ cl::Image2D arm_compute::create_image2d_from_buffer(const cl::Context &ctx, cons
cl_mem cl_image;
cl_int err = CL_SUCCESS;
- const cl_image_format format = { CL_RGBA, cl_data_type };
+ const cl_image_format format = {CL_RGBA, cl_data_type};
cl_image_desc desc;
memset(&desc, 0, sizeof(desc));
@@ -55,10 +99,31 @@ cl::Image2D arm_compute::create_image2d_from_buffer(const cl::Context &ctx, cons
desc.image_width = shape2d[0];
desc.image_height = shape2d[1];
- cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err);
+ switch (image_type)
+ {
+ case CLImage2DType::ReadOnly:
+ cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err);
+ break;
+ case CLImage2DType::WriteOnly:
+ cl_image = clCreateImage(ctx(), CL_MEM_WRITE_ONLY, &format, &desc, nullptr, &err);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported CLImage2DType");
+ }
ARM_COMPUTE_UNUSED(err);
ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Error during the creation of CL image from buffer");
return cl::Image2D(cl_image);
}
+
+void handle_cl_error(const std::string &function_name, cl_int error_code)
+{
+ if (error_code != CL_SUCCESS)
+ {
+ std::string error_message = function_name + " - Error code: " + std::to_string(error_code);
+ ARM_COMPUTE_ERROR(error_message.c_str());
+ }
+}
+
+} // namespace arm_compute
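
A sketch of the buffer-to-image path added above (ctx and buffer are assumed to exist; the 2D shape packs four F32 elements per RGBA texel, and the row pitch must satisfy the device's pitch alignment):

    const TensorShape shape2d(128 / 4, 64);      // image_w = width / 4, image_h = 64
    const size_t      row_pitch = 128 * sizeof(float);
    cl::Image2D image = create_image2d_from_buffer(ctx, buffer, shape2d, DataType::F32,
                                                   row_pitch, CLImage2DType::ReadOnly);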
diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h
index b65d547756..f9dcfeac3a 100644
--- a/src/core/CL/CLUtils.h
+++ b/src/core/CL/CLUtils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,36 @@
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_CLUTILS_H
-#define ARM_COMPUTE_CL_CLUTILS_H
+#ifndef ACL_SRC_CORE_CL_CLUTILS_H
+#define ACL_SRC_CORE_CL_CLUTILS_H
#include "arm_compute/core/CL/OpenCL.h"
+#include <map>
+
namespace arm_compute
{
class TensorShape;
+class CLBuildOptions;
+class ITensorInfo;
+class ICLTensor;
+enum class DataType;
+
+/** OpenCL Image2D types */
+enum class CLImage2DType
+{
+ ReadOnly,
+ WriteOnly
+};
+
+/** Create a cl::Image2D object from a tensor
+ *
+ * @param[in] tensor Tensor from which to construct Image 2D object
+ * @param[in] image_type Image 2D type (@ref CLImage2DType)
+ *
+ * @return cl::Image2D object
+ */
+cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType image_type);
/** Create a cl::Image2D object from an OpenCL buffer
*
@@ -46,11 +68,24 @@ class TensorShape;
* @param[in] shape2d 2D tensor shape
* @param[in] data_type DataType to use. Only supported: F32,F16
* @param[in] image_row_pitch Image row pitch (a.k.a. stride Y) to be used in the image2d object
+ * @param[in] image_type Image 2D type (@ref CLImage2DType)
*
* @return cl::Image2D object
*/
-cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch);
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx,
+ const cl::Buffer &buffer,
+ const TensorShape &shape2d,
+ DataType data_type,
+ size_t image_row_pitch,
+ CLImage2DType image_type);
+
+/** Check for CL error code and throw exception accordingly.
+ *
+ * @param[in] function_name The name of the CL function being called.
+ * @param[in] error_code The error returned by the CL function.
+ */
+void handle_cl_error(const std::string &function_name, cl_int error_code);
-} // arm_compute
+} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_CLUTILS_H */
+#endif // ACL_SRC_CORE_CL_CLUTILS_H
diff --git a/src/core/CL/CLValidate.h b/src/core/CL/CLValidate.h
index 7b5294e452..50d224f1c0 100644
--- a/src/core/CL/CLValidate.h
+++ b/src/core/CL/CLValidate.h
@@ -29,11 +29,13 @@
namespace arm_compute
{
-#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
+#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \
+ CLKernelLibrary::get().fp16_supported()))
-#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
+#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \
+ CLKernelLibrary::get().fp16_supported()))
/** Return an error if int64_base_atomics extension is not supported by the device.
*
@@ -43,11 +45,13 @@ namespace arm_compute
*
* @return Status
*/
-inline arm_compute::Status error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line)
+inline arm_compute::Status
+error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line)
{
- if(!CLKernelLibrary::get().int64_base_atomics_supported())
+ if (!CLKernelLibrary::get().int64_base_atomics_supported())
{
- return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, "Atomic functions are not supported");
+ return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line,
+ "Atomic functions are not supported");
}
return arm_compute::Status{};
}
diff --git a/src/core/CL/DefaultLWSHeuristics.cpp b/src/core/CL/DefaultLWSHeuristics.cpp
new file mode 100644
index 0000000000..f96b24d2a9
--- /dev/null
+++ b/src/core/CL/DefaultLWSHeuristics.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/CL/DefaultLWSHeuristics.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+
+namespace
+{
+cl::NDRange get_gemm_lws(size_t gws_x, size_t gws_y, size_t gws_z)
+{
+ ARM_COMPUTE_UNUSED(gws_y);
+
+ if (gws_z != 1)
+ {
+ return cl::NDRange(4, 4, 2);
+ }
+ else
+ {
+ if (gws_x > 256)
+ {
+ return cl::NDRange(2, 16, 1);
+ }
+ else
+ {
+ return cl::NDRange(32, 4, 1);
+ }
+ }
+}
+
+cl::NDRange get_winograd_lws(size_t gws_x, size_t gws_y, size_t gws_z)
+{
+ ARM_COMPUTE_UNUSED(gws_x, gws_y, gws_z);
+
+ return cl::NDRange(4, 2, 1);
+}
+
+cl::NDRange get_direct_lws(size_t gws_x, size_t gws_y, size_t gws_z)
+{
+ ARM_COMPUTE_UNUSED(gws_z);
+
+ if (gws_x < gws_y)
+ {
+ if (gws_x < 4)
+ {
+ return cl::NDRange(std::min(gws_x, static_cast<size_t>(2u)), 32, 1);
+ }
+ else
+ {
+ return cl::NDRange(std::min(gws_x, static_cast<size_t>(4u)), 8, 1);
+ }
+ }
+ else
+ {
+ return cl::NDRange(8, 4, 1);
+ }
+}
+
+cl::NDRange get_dwc_lws(size_t gws_x, size_t gws_y, size_t gws_z)
+{
+ ARM_COMPUTE_UNUSED(gws_y);
+ ARM_COMPUTE_UNUSED(gws_z);
+
+ if (gws_x < 32)
+ {
+ return cl::NDRange(gws_x, 4, 4);
+ }
+ else
+ {
+ return cl::NDRange(8, 4, 2);
+ }
+}
+} // namespace
+
+namespace arm_compute
+{
+cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws)
+{
+ const size_t gws_x = gws[0];
+ const size_t gws_y = gws[1];
+ const size_t gws_z = gws[2];
+
+ switch (kernel_type)
+ {
+ case CLKernelType::GEMM:
+ {
+ return get_gemm_lws(gws_x, gws_y, gws_z);
+ }
+ case CLKernelType::DIRECT:
+ {
+ return get_direct_lws(gws_x, gws_y, gws_z);
+ }
+ case CLKernelType::WINOGRAD:
+ {
+ return get_winograd_lws(gws_x, gws_y, gws_z);
+ }
+ case CLKernelType::DEPTHWISE:
+ {
+ return get_dwc_lws(gws_x, gws_y, gws_z);
+ }
+ default:
+ {
+ return CLKernelLibrary::get().default_ndrange();
+ }
+ }
+}
+} // namespace arm_compute
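
A worked example of the heuristic above: for a GEMM kernel with gws = (512, 64, 1), gws_z == 1 and gws_x > 256, so get_gemm_lws() returns (2, 16, 1):

    cl::NDRange lws = arm_compute::get_default_lws_for_type(CLKernelType::GEMM,
                                                            cl::NDRange(512, 64, 1));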
diff --git a/src/core/CL/DefaultLWSHeuristics.h b/src/core/CL/DefaultLWSHeuristics.h
new file mode 100644
index 0000000000..e646b9acb8
--- /dev/null
+++ b/src/core/CL/DefaultLWSHeuristics.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DEFAULT_LWS_HEURISTICS_H
+#define ARM_COMPUTE_DEFAULT_LWS_HEURISTICS_H
+
+#include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/CL/OpenCL.h"
+
+namespace arm_compute
+{
+cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws);
+} // namespace arm_compute
+#endif // ARM_COMPUTE_DEFAULT_LWS_HEURISTICS_H
\ No newline at end of file
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 9ba17d0e03..ac53e7f1d2 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,41 +25,41 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
+
#include "src/core/helpers/Utils.h"
#include <cstddef>
-void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items)
+void arm_compute::enqueue(cl::CommandQueue &queue,
+ ICLKernel &kernel,
+ const Window &window,
+ const cl::NDRange &lws_hint,
+ bool use_dummy_work_items)
{
- if(kernel.kernel()() == nullptr)
+ if (kernel.kernel()() == nullptr)
{
return;
}
- for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_ERROR_ON(window[i].step() == 0);
// Make sure that dimensions > Z are 1
ARM_COMPUTE_ERROR_ON((i >= 3) && ((window[i].end() - window[i].start()) != 1));
}
- cl::NDRange gws = ICLKernel::gws_from_window(window);
+ cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items);
// Check for empty NDRange
- if(gws.dimensions() == 0)
+ if (gws.dimensions() == 0)
{
return;
}
- // Use dummy work-items
- if(use_dummy_work_items)
- {
- gws.get()[0] = get_next_power_two(gws[0]);
- gws.get()[1] = get_next_power_two(gws[1]);
- }
+ kernel.cache_gws(gws);
cl::NDRange valid_lws;
- if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
+ if (lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
{
valid_lws = cl::NullRange;
}
@@ -70,12 +70,12 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
cl::NDRange lws = cl::NullRange;
- if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2]))
+ if ((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2]))
{
lws = valid_lws;
}
- if(CLKernelLibrary::get().is_wbsm_supported())
+ if (CLKernelLibrary::get().is_wbsm_supported())
{
set_wbsm(kernel.kernel(), kernel.wbsm_hint());
}
@@ -95,7 +95,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
// Calculate offset to the start of the window
unsigned int offset_first_element = info->offset_first_element_in_bytes();
- for(unsigned int n = 0; n < info->num_dimensions(); ++n)
+ for (unsigned int n = 0; n < info->num_dimensions(); ++n)
{
offset_first_element += (window.is_broadcasted(n) ? 0 : window[n].start()) * strides[n];
}
@@ -103,7 +103,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
unsigned int idx_start = idx;
_kernel.setArg(idx++, tensor->cl_buffer());
- for(unsigned int d = 0; d < dimension_size; ++d)
+ for (unsigned int d = 0; d < dimension_size; ++d)
{
_kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : strides[d]);
_kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : (strides[d] * window[d].step()));
@@ -112,15 +112,69 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
_kernel.setArg<cl_uint>(idx++, offset_first_element);
ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_tensor<dimension_size>() != idx,
- "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>());
+ "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel",
+ dimension_size, num_arguments_per_tensor<dimension_size>());
ARM_COMPUTE_UNUSED(idx_start);
}
+void ICLKernel::add_3d_tensor_nhw_argument(unsigned int &idx, const ICLTensor *tensor)
+{
+ ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+ const ITensorInfo *info = tensor->info();
+ ARM_COMPUTE_ERROR_ON(info == nullptr);
+ const Strides &strides = info->strides_in_bytes();
+
+ // Tensor pointer
+ _kernel.setArg(idx++, tensor->cl_buffer());
+
+ // Add stride_y, stride_z
+ _kernel.setArg<cl_uint>(idx++, strides[1]);
+ _kernel.setArg<cl_uint>(idx++, strides[2]);
+
+ // Tensor dimensions
+ _kernel.setArg<cl_uint>(idx++, info->dimension(0));
+ _kernel.setArg<cl_uint>(idx++, info->dimension(1));
+ _kernel.setArg<cl_uint>(idx++, info->dimension(2));
+
+ // Offset of first element
+ unsigned int offset_first_element = info->offset_first_element_in_bytes();
+ _kernel.setArg<cl_uint>(idx++, offset_first_element);
+}
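+
+    // The seven arguments set above (matching num_arguments_per_3d_tensor_nhw() == 7)
+    // map onto a kernel-side parameter list of roughly this hypothetical form:
+    //   __kernel void example(__global uchar *ptr,
+    //                         uint stride_y, uint stride_z,
+    //                         uint dim0, uint dim1, uint dim2,
+    //                         uint offset_first_element_in_bytes);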
+
+void ICLKernel::add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor)
+{
+ ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+ const ITensorInfo *info = tensor->info();
+ ARM_COMPUTE_ERROR_ON(info == nullptr);
+ const Strides &strides = info->strides_in_bytes();
+
+ // Tensor pointer
+ _kernel.setArg(idx++, tensor->cl_buffer());
+
+ // Add stride_y, stride_z and stride_w
+ _kernel.setArg<cl_uint>(idx++, strides[1]);
+ _kernel.setArg<cl_uint>(idx++, strides[2]);
+ _kernel.setArg<cl_uint>(idx++, strides[3]);
+
+ // Tensor dimensions
+ _kernel.setArg<cl_uint>(idx++, info->dimension(0));
+ _kernel.setArg<cl_uint>(idx++, info->dimension(1));
+ _kernel.setArg<cl_uint>(idx++, info->dimension(2));
+ _kernel.setArg<cl_uint>(idx++, info->dimension(3));
+
+ // Offset of first element
+ unsigned int offset_first_element = info->offset_first_element_in_bytes();
+ _kernel.setArg<cl_uint>(idx++, offset_first_element);
+}
+
#ifndef DOXYGEN_SKIP_THIS
template void ICLKernel::add_tensor_argument<1>(unsigned &idx, const ICLTensor *tensor, const Window &window);
template void ICLKernel::add_tensor_argument<2>(unsigned &idx, const ICLTensor *tensor, const Window &window);
template void ICLKernel::add_tensor_argument<3>(unsigned &idx, const ICLTensor *tensor, const Window &window);
template void ICLKernel::add_tensor_argument<4>(unsigned &idx, const ICLTensor *tensor, const Window &window);
+template void ICLKernel::add_tensor_argument<5>(unsigned &idx, const ICLTensor *tensor, const Window &window);
#endif /* DOXYGEN_SKIP_THIS */
void ICLKernel::set_target(cl::Device &device)
@@ -130,16 +184,16 @@ void ICLKernel::set_target(cl::Device &device)
size_t ICLKernel::get_max_workgroup_size()
{
- if(_max_workgroup_size == 0)
+ if (_max_workgroup_size == 0)
{
_max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel);
}
return _max_workgroup_size;
}
-cl::NDRange ICLKernel::gws_from_window(const Window &window)
+cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items)
{
- if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
+ if ((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
{
return cl::NullRange;
}
@@ -148,6 +202,22 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window)
(window.y().end() - window.y().start()) / window.y().step(),
(window.z().end() - window.z().start()) / window.z().step());
+ if (use_dummy_work_items)
+ {
+ gws.get()[0] = get_next_power_two(gws[0]);
+ gws.get()[1] = get_next_power_two(gws[1]);
+ }
+
return gws;
}
-} // namespace arm_compute
\ No newline at end of file
+
+cl::NDRange ICLKernel::get_cached_gws() const
+{
+ return _cached_gws;
+}
+
+void ICLKernel::cache_gws(const cl::NDRange &gws)
+{
+ _cached_gws = gws;
+}
+} // namespace arm_compute
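
A behavioural note on the relocated dummy-work-items handling, with a worked example: get_next_power_two rounds the x/y sizes up, so a 3x5x1 window with unit steps yields gws (4, 8, 1) rather than (3, 5, 1), and kernels run with dummy work items must bounds-check their global IDs (window is an assumed configured Window):

    cl::NDRange gws = ICLKernel::gws_from_window(window, /* use_dummy_work_items */ true);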
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index 6737109f34..6aebef15a5 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,21 +27,42 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLTypes.h"
#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/IKernel.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/CL/CLTuningParams.h"
+#include "src/core/CL/DefaultLWSHeuristics.h"
+
#include <string>
namespace arm_compute
{
+namespace
+{
+bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1)
+{
+ if (lws0.dimensions() != lws1.dimensions())
+ {
+ return false;
+ }
+
+ for (size_t i = 0; i < lws0.dimensions(); ++i)
+ {
+ if (lws0.get()[i] != lws1.get()[i])
+ {
+ return false;
+ }
+ }
+
+ return true;
+}
+} // namespace
template <typename T>
class ICLArray;
class ICLTensor;
class Window;
-
/** Common interface for all the OpenCL kernels */
class ICLKernel : public IKernel
{
@@ -50,7 +71,7 @@ private:
*
* @return The number of arguments enqueued per array object.
*/
- template <unsigned int dimension_size>
+ template <unsigned int dimension_size>
constexpr static unsigned int num_arguments_per_array()
{
return num_arguments_per_tensor<dimension_size>();
@@ -59,11 +80,24 @@ private:
*
* @return The number of arguments enqueued per tensor object.
*/
- template <unsigned int dimension_size>
+ template <unsigned int dimension_size>
constexpr static unsigned int num_arguments_per_tensor()
{
return 2 + 2 * dimension_size;
}
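    // e.g. dimension_size == 3 gives 2 + 2 * 3 = 8 arguments: the buffer, a
    // (stride, stride * step) pair per dimension, and the first-element byte offset.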
+
+ /** Get default lws for the kernel
+ *
+ * @param[in] window Execution window used by the kernel
+ * @param[in] use_dummy_work_items If the kernel uses dummy work items
+ *
+ * @return cl::NDRange
+ */
+ cl::NDRange default_lws_tune(const Window &window, bool use_dummy_work_items)
+ {
+ return get_default_lws_for_type(_type, gws_from_window(window, use_dummy_work_items));
+ }
+
using IKernel::configure; //Prevent children from calling IKernel::configure() directly
protected:
/** Configure the kernel's window and local workgroup size hint.
@@ -82,16 +116,32 @@ protected:
* @param[in] window The maximum window which will be returned by window()
* @param[in] tuning_params_hint (Optional) Tuning parameters to use.
*/
- void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0))
+ void configure_internal(const Window &window,
+ CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(),
+ 0))
{
_tuning_params_hint = tuning_params_hint;
+
+ if (is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange()))
+ {
+ // Disable use_dummy_work_items at configure time: dummy work items only affect the gws size,
+ // which is recalculated with the use_dummy_work_items flag at run time anyway.
+ _tuning_params_hint.set_lws(default_lws_tune(window, false /* use_dummy_work_items */));
+ }
+
IKernel::configure(window);
}
public:
/** Constructor */
ICLKernel()
- : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _tuning_params_hint()
+ : _kernel(nullptr),
+ _target(GPUTarget::MIDGARD),
+ _config_id(arm_compute::default_config_id),
+ _max_workgroup_size(0),
+ _type(CLKernelType::UNKNOWN),
+ _tuning_params_hint(),
+ _cached_gws(cl::NullRange)
{
}
/** Returns a reference to the OpenCL kernel of this object.
@@ -102,6 +152,14 @@ public:
{
return _kernel;
}
+ /** Returns the CL kernel type
+ *
+ * @return The CL kernel type
+ */
+ CLKernelType type() const
+ {
+ return _type;
+ }
/** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
*
* @param[in,out] idx Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
@@ -111,7 +169,11 @@ public:
* @param[in] window Window the kernel will be executed on.
*/
template <typename T>
- void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+ void add_1D_array_argument(unsigned int &idx,
+ const ICLArray<T> *array,
+ const Strides &strides,
+ unsigned int num_dimensions,
+ const Window &window)
{
add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
}
@@ -134,7 +196,7 @@ public:
*/
void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
{
- if(cond)
+ if (cond)
{
add_1D_tensor_argument(idx, tensor, window);
}
@@ -158,7 +220,7 @@ public:
*/
void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
{
- if(cond)
+ if (cond)
{
add_2D_tensor_argument(idx, tensor, window);
}
@@ -183,6 +245,51 @@ public:
{
add_tensor_argument<4>(idx, tensor, window);
}
+ /** Add the passed 5D tensor's parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ * @param[in] window Window the kernel will be executed on.
+ */
+ void add_5D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+ {
+ add_tensor_argument<5>(idx, tensor, window);
+ }
+
+ /** Add the passed NHW 3D tensor's parameters to the object's kernel's arguments by passing strides, dimensions and the offset to the first valid element in bytes.
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ */
+ void add_3d_tensor_nhw_argument(unsigned int &idx, const ICLTensor *tensor);
+
+ /** Returns the number of arguments enqueued per NHW 3D Tensor object.
+ *
+ * @return The number of arguments enqueued per NHW 3D Tensor object.
+ */
+ constexpr static unsigned int num_arguments_per_3d_tensor_nhw()
+ {
+ constexpr unsigned int no_args_per_3d_tensor_nhw = 7u;
+ return no_args_per_3d_tensor_nhw;
+ }
+
+ /** Add the passed NHWC 4D tensor's parameters to the object's kernel's arguments by passing strides, dimensions and the offset to the first valid element in bytes.
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ */
+ void add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor);
+
+ /** Returns the number of arguments enqueued per NHWC 4D Tensor object.
+ *
+ * @return The number of arguments enqueued per NHWC 4D Tensor object.
+ */
+ constexpr static unsigned int num_arguments_per_4d_tensor_nhwc()
+ {
+ constexpr unsigned int no_args_per_4d_tensor_nhwc = 9u;
+ return no_args_per_4d_tensor_nhwc;
+ }
+
/** Returns the number of arguments enqueued per 1D array object.
*
 * @return The number of arguments enqueued per 1D array object.
@@ -345,11 +452,24 @@ public:
size_t get_max_workgroup_size();
/** Get the global work size given an execution window
*
- * @param[in] window Execution window
+ * @param[in] window Execution window
+ * @param[in] use_dummy_work_items If the kernel uses dummy work items
*
* @return Global work size of the given execution window
*/
- static cl::NDRange gws_from_window(const Window &window);
+ static cl::NDRange gws_from_window(const Window &window, bool use_dummy_work_items);
+
+ /** Get the cached gws used to enqueue this kernel
+ *
+ * @return Latest global work size of the kernel
+ */
+ cl::NDRange get_cached_gws() const;
+
+ /** Cache the latest gws used to enqueue this kernel
+ *
+ * @param[in] gws Latest global work size of the kernel
+ */
+ void cache_gws(const cl::NDRange &gws);
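A hedged sketch of the intended round trip (the tuner-side caller is an assumption):

    // At run time, the freshly computed gws can be stored on the kernel...
    const cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items);
    kernel.cache_gws(gws);
    // ...so a tuner or profiler can later query what was actually enqueued:
    const cl::NDRange last_gws = kernel.get_cached_gws();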
private:
/** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
@@ -361,7 +481,11 @@ private:
* @param[in] window Window the kernel will be executed on.
*/
template <typename T, unsigned int dimension_size>
- void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
+ void add_array_argument(unsigned int &idx,
+ const ICLArray<T> *array,
+ const Strides &strides,
+ unsigned int num_dimensions,
+ const Window &window);
/** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
*
* @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
@@ -372,12 +496,14 @@ private:
void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
protected:
- cl::Kernel _kernel; /**< OpenCL kernel to run */
- GPUTarget _target; /**< The targeted GPU */
- std::string _config_id; /**< Configuration ID */
- size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */
+ cl::Kernel _kernel; /**< OpenCL kernel to run */
+ GPUTarget _target; /**< The targeted GPU */
+ std::string _config_id; /**< Configuration ID */
+ size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */
+ CLKernelType _type; /**< The CL kernel type */
private:
CLTuningParams _tuning_params_hint; /**< Tuning parameters hint for the OpenCL kernel */
+ cl::NDRange _cached_gws; /**< Latest GWS used to enqueue this kernel */
};
/** Add the kernel to the command queue with the given window.
@@ -395,7 +521,11 @@ private:
*
* @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
*/
-void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false);
+void enqueue(cl::CommandQueue &queue,
+ ICLKernel &kernel,
+ const Window &window,
+ const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(),
+ bool use_dummy_work_items = false);
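An example call with the new flag (a sketch from inside a kernel's run() override; lws_hint() returns the tuned hint):

    enqueue(queue, *this, slice, lws_hint(), /* use_dummy_work_items */ true);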
/** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
*
@@ -406,14 +536,15 @@ void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, c
* @param[in] window Window the kernel will be executed on.
*/
template <typename T, unsigned int dimension_size>
-void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+void ICLKernel::add_array_argument(
+ unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
{
ARM_COMPUTE_ERROR_ON(array == nullptr);
// Calculate offset to the start of the window
unsigned int offset_first_element = 0;
- for(unsigned int n = 0; n < num_dimensions; ++n)
+ for (unsigned int n = 0; n < num_dimensions; ++n)
{
offset_first_element += window[n].start() * strides[n];
}
@@ -421,7 +552,7 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons
unsigned int idx_start = idx;
_kernel.setArg(idx++, array->cl_buffer());
- for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
+ for (unsigned int dimension = 0; dimension < dimension_size; dimension++)
{
_kernel.setArg<cl_uint>(idx++, strides[dimension]);
_kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
@@ -430,8 +561,9 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons
_kernel.setArg<cl_uint>(idx++, offset_first_element);
ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array<dimension_size>() != idx,
- "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
+ "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel",
+ dimension_size, num_arguments_per_array<dimension_size>());
ARM_COMPUTE_UNUSED(idx_start);
}
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLKERNEL_H */
diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp
index 5d8295bdfe..3f7edbb88d 100644
--- a/src/core/CL/ICLSimple2DKernel.cpp
+++ b/src/core/CL/ICLSimple2DKernel.cpp
@@ -40,6 +40,5 @@ void ICLSimple2DKernel::run(const Window &window, cl::CommandQueue &queue)
add_2D_tensor_argument(idx, _input, slice);
add_2D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
diff --git a/src/core/CL/ICLSimple2DKernel.h b/src/core/CL/ICLSimple2DKernel.h
index 5246492401..97bc1e58c2 100644
--- a/src/core/CL/ICLSimple2DKernel.h
+++ b/src/core/CL/ICLSimple2DKernel.h
@@ -37,5 +37,5 @@ public:
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLSIMPLE2DKERNEL_H */
diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp
index fef1a86125..71d7d1f07b 100644
--- a/src/core/CL/ICLSimple3DKernel.cpp
+++ b/src/core/CL/ICLSimple3DKernel.cpp
@@ -42,6 +42,5 @@ void ICLSimple3DKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/ICLSimple3DKernel.h b/src/core/CL/ICLSimple3DKernel.h
index ff0b274663..5071b6b339 100644
--- a/src/core/CL/ICLSimple3DKernel.h
+++ b/src/core/CL/ICLSimple3DKernel.h
@@ -39,5 +39,5 @@ public:
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLSIMPLE3DKERNEL_H */
diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp
index d67fefdf71..c31db8355f 100644
--- a/src/core/CL/ICLSimpleKernel.cpp
+++ b/src/core/CL/ICLSimpleKernel.cpp
@@ -22,30 +22,35 @@
* SOFTWARE.
*/
#include "src/core/CL/ICLSimpleKernel.h"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/helpers/WindowHelpers.h"
using namespace arm_compute;
-ICLSimpleKernel::ICLSimpleKernel()
- : _input(nullptr), _output(nullptr)
+ICLSimpleKernel::ICLSimpleKernel() : _input(nullptr), _output(nullptr)
{
}
-void ICLSimpleKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size)
+void ICLSimpleKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_elems_processed_per_iteration,
+ bool border_undefined,
+ const BorderSize &border_size)
{
_input = input;
_output = output;
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
+ Window win =
+ calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win,
- AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
output_access);
output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size);
diff --git a/src/core/CL/ICLSimpleKernel.h b/src/core/CL/ICLSimpleKernel.h
index b35547a217..6afd7309aa 100644
--- a/src/core/CL/ICLSimpleKernel.h
+++ b/src/core/CL/ICLSimpleKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -55,12 +56,16 @@ public:
* @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant.
* @param[in] border_size (Optional) Size of the border.
*/
- void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_elems_processed_per_iteration,
+ bool border_undefined = false,
+ const BorderSize &border_size = BorderSize());
protected:
const ICLTensor *_input;
ICLTensor *_output;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLSIMPLEKERNEL_H */
diff --git a/src/core/CL/ICLTensor.cpp b/src/core/CL/ICLTensor.cpp
index b541bff04a..0771db7f50 100644
--- a/src/core/CL/ICLTensor.cpp
+++ b/src/core/CL/ICLTensor.cpp
@@ -27,8 +27,7 @@
using namespace arm_compute;
-ICLTensor::ICLTensor()
- : _mapping(nullptr)
+ICLTensor::ICLTensor() : _mapping(nullptr)
{
}
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index d8c2736ef7..2ebc3274aa 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,16 +29,14 @@
#include "arm_compute/core/Error.h"
+#include <algorithm>
#include <dlfcn.h>
#include <iostream>
+#include <sstream>
namespace arm_compute
{
-CLSymbols::CLSymbols() noexcept(false)
- : _loaded(
-{
- false, false
-})
+CLSymbols::CLSymbols() noexcept(false) : _loaded({false, false})
{
}
@@ -50,9 +48,9 @@ CLSymbols &CLSymbols::get()
bool CLSymbols::load_default()
{
- static const std::vector<std::string> libraries{ "libOpenCL.so", "libGLES_mali.so", "libmali.so" };
+ static const std::vector<std::string> libraries_filenames{"libOpenCL.so", "libGLES_mali.so", "libmali.so"};
- if(_loaded.first)
+ if (_loaded.first)
{
return _loaded.second;
}
@@ -60,33 +58,83 @@ bool CLSymbols::load_default()
// Indicate that default loading has been tried
_loaded.first = true;
- for(const auto &lib : libraries)
+ if (load(libraries_filenames, /* use_loader */ false))
{
- if(load(lib))
- {
- ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from shared library");
- return true;
- }
+ ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr,
+ "Failed to load OpenCL symbols from shared library");
+ return true;
+ }
+
+#ifdef __ANDROID__
+ // When running in NDK environment, the above libraries are not accessible.
+ static const std::vector<std::string> android_libraries_filenames{"libOpenCL-pixel.so", "libOpenCL-car.so"};
+
+ if (load(android_libraries_filenames, /* use_loader */ true))
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr,
+ "Failed to load OpenCL symbols from android shared library");
+ return true;
}
+#endif // __ANDROID__
- std::cerr << "Couldn't find any OpenCL library.\n";
+ // If we have not returned by this point, none of the libraries could be found
+ std::stringstream ss;
+ std::for_each(libraries_filenames.begin(), libraries_filenames.end(),
+ [&ss](const std::string &s) { ss << s << " "; });
+#ifdef __ANDROID__
+ std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(),
+ [&ss](const std::string &s) { ss << s << " "; });
+#endif // __ANDROID__
+ std::cerr << "Couldn't find any of the following OpenCL library: " << ss.str() << std::endl;
return false;
}
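For context, a typical caller-side sketch (opencl_is_available() is defined further down in this file):

    arm_compute::CLSymbols::get().load_default(); // idempotent; dlopen is attempted once
    if (!arm_compute::opencl_is_available())
    {
        // No usable OpenCL runtime on this system
    }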
-bool CLSymbols::load(const std::string &library)
+bool CLSymbols::load(const std::vector<std::string> &libraries_filenames, bool use_loader)
{
- void *handle = dlopen(library.c_str(), RTLD_LAZY | RTLD_LOCAL);
-
- if(handle == nullptr)
+ void *handle = nullptr;
+ unsigned int index = 0;
+ for (index = 0; index < libraries_filenames.size(); ++index)
+ {
+ handle = dlopen(libraries_filenames[index].c_str(), RTLD_LAZY | RTLD_LOCAL);
+ if (handle != nullptr)
+ {
+ break;
+ }
+ }
+ if (index == libraries_filenames.size())
{
- std::cerr << "Can't load " << library << ": " << dlerror() << "\n";
// Set status of loading to failed
_loaded.second = false;
return false;
}
+#ifdef __ANDROID__
+ typedef void *(*loadOpenCLPointer_t)(const char *name);
+ loadOpenCLPointer_t loadOpenCLPointer;
+ if (use_loader)
+ {
+ typedef void (*enableOpenCL_t)();
+ enableOpenCL_t enableOpenCL = reinterpret_cast<enableOpenCL_t>(dlsym(handle, "enableOpenCL"));
+ enableOpenCL();
+
+ loadOpenCLPointer = reinterpret_cast<loadOpenCLPointer_t>(dlsym(handle, "loadOpenCLPointer"));
+ }
+ else
+ {
+ loadOpenCLPointer = nullptr;
+ }
+#define LOAD_FUNCTION_PTR(func_name, _handle) \
+ func_name##_ptr = reinterpret_cast<decltype(func_name) *>(use_loader ? loadOpenCLPointer(#func_name) \
+ : dlsym(handle, #func_name));
+#else /* __ANDROID__ */
+ (void)use_loader; // Avoid unused warning
#define LOAD_FUNCTION_PTR(func_name, handle) \
func_name##_ptr = reinterpret_cast<decltype(func_name) *>(dlsym(handle, #func_name));
+#endif /* __ANDROID__ */
+
+#define LOAD_EXTENSION_FUNCTION_PTR(func_name, platform_id) \
+ func_name##_ptr = \
+ reinterpret_cast<decltype(func_name) *>(clGetExtensionFunctionAddressForPlatform(platform_id, #func_name));
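To make the two macros concrete, the expansions for one core symbol and one extension symbol look like this (non-Android path):

    // LOAD_FUNCTION_PTR(clFinish, handle) expands to:
    clFinish_ptr = reinterpret_cast<decltype(clFinish) *>(dlsym(handle, "clFinish"));
    // LOAD_EXTENSION_FUNCTION_PTR(clImportMemoryARM, platform_ids[0]) expands to:
    clImportMemoryARM_ptr = reinterpret_cast<decltype(clImportMemoryARM) *>(
        clGetExtensionFunctionAddressForPlatform(platform_ids[0], "clImportMemoryARM"));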
LOAD_FUNCTION_PTR(clCreateContext, handle);
LOAD_FUNCTION_PTR(clCreateContextFromType, handle);
@@ -137,11 +185,45 @@ bool CLSymbols::load(const std::string &library)
LOAD_FUNCTION_PTR(clWaitForEvents, handle);
LOAD_FUNCTION_PTR(clCreateImage, handle);
LOAD_FUNCTION_PTR(clSetKernelExecInfo, handle);
+ LOAD_FUNCTION_PTR(clGetExtensionFunctionAddressForPlatform, handle);
+
+ // Load Extensions
+
+ // Number of platforms is assumed to be 1. For this to be greater than 1,
+ // the system must have more than one OpenCL implementation provided by
+ // different vendors. This is not our use case. Besides, the library
+ // already assumes one implementation as it uses one handle to load core
+ // functions.
+ constexpr unsigned int num_platforms = 1U;
+ std::vector<cl_platform_id> platform_ids(num_platforms);
+ cl_int err = clGetPlatformIDs(num_platforms, platform_ids.data(), nullptr);
+ if (err != CL_SUCCESS)
+ {
+ return false;
+ }
+
+ // Command buffer and mutable dispatch command buffer extensions
+ /// TODO: (COMPMID-6742) Load Command Buffer extensions in a portable way
+ /// using clGetExtensionFunctionAddressForPlatform().
+ /// The details can be found here:
+ /// https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_Ext.html#getting-opencl-api-extension-function-pointers
+ ///
+ /// @note: There are some problems reported while loading these extensions in the recommended way.
+ /// For details, please see COMPUTE-16545
+ LOAD_FUNCTION_PTR(clCreateCommandBufferKHR, handle);
+ LOAD_FUNCTION_PTR(clRetainCommandBufferKHR, handle);
+ LOAD_FUNCTION_PTR(clReleaseCommandBufferKHR, handle);
+ LOAD_FUNCTION_PTR(clFinalizeCommandBufferKHR, handle);
+ LOAD_FUNCTION_PTR(clEnqueueCommandBufferKHR, handle);
+ LOAD_FUNCTION_PTR(clCommandNDRangeKernelKHR, handle);
+
+ LOAD_FUNCTION_PTR(clUpdateMutableCommandsKHR, handle);
// Third-party extensions
- LOAD_FUNCTION_PTR(clImportMemoryARM, handle);
+ LOAD_EXTENSION_FUNCTION_PTR(clImportMemoryARM, platform_ids[0]);
#undef LOAD_FUNCTION_PTR
+#undef LOAD_EXTENSION_FUNCTION_PTR
 // Don't call dlclose(handle) or all the symbols will be unloaded!
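For reference, the portable pattern the TODO above points to would look roughly like this (a sketch; clCreateCommandBufferKHR_fn is the function-pointer typedef from the Khronos headers):

    auto create_cmd_buf = reinterpret_cast<clCreateCommandBufferKHR_fn>(
        clGetExtensionFunctionAddressForPlatform(platform_ids[0], "clCreateCommandBufferKHR"));
    if (create_cmd_buf == nullptr)
    {
        // The platform does not expose cl_khr_command_buffer
    }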
@@ -163,7 +245,7 @@ bool opencl_is_available()
// hold their state, we call a harmless OpenCL function (clGetPlatformIDs
// with invalid parameters must result in CL_INVALID_VALUE) to ensure the
// runtimes have a chance to initialize their static objects first. Thanks
- // to C++11 rules about normal program termination (cf [basic.start]), this
+ // to C++11 rules about normal program completion (cf [basic.start]), this
// ensures their static objects are destroyed last, i.e. after the
// singleton CLScheduler is destroyed.
//
@@ -175,12 +257,11 @@ bool opencl_is_available()
}
} // namespace arm_compute
-cl_int clEnqueueMarker(cl_command_queue command_queue,
- cl_event *event)
+cl_int clEnqueueMarker(cl_command_queue command_queue, cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueMarker_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, event);
}
@@ -190,12 +271,11 @@ cl_int clEnqueueMarker(cl_command_queue command_queue,
}
}
-cl_int clWaitForEvents(cl_uint num_events,
- const cl_event *event_list)
+cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clWaitForEvents_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(num_events, event_list);
}
@@ -205,12 +285,18 @@ cl_int clWaitForEvents(cl_uint num_events,
}
}
-cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void *svm_ptr,
- size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
+cl_int clEnqueueSVMMap(cl_command_queue command_queue,
+ cl_bool blocking_map,
+ cl_map_flags flags,
+ void *svm_ptr,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueSVMMap_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, blocking_map, flags, svm_ptr, size, num_events_in_wait_list, event_wait_list, event);
}
@@ -220,12 +306,15 @@ cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_
}
}
-cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr, cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list, cl_event *event)
+cl_int clEnqueueSVMUnmap(cl_command_queue command_queue,
+ void *svm_ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueSVMUnmap_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event);
}
@@ -239,7 +328,7 @@ void *clSVMAlloc(cl_context context, cl_svm_mem_flags_arm flags, size_t size, cl
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clSVMAlloc_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, flags, size, alignment);
}
@@ -253,7 +342,7 @@ void clSVMFree(cl_context context, void *svm_pointer)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clSVMFree_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
func(context, svm_pointer);
}
@@ -267,7 +356,7 @@ cl_int clGetContextInfo(cl_context context,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetContextInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -284,7 +373,7 @@ cl_command_queue clCreateCommandQueue(cl_context context,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateCommandQueue_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, device, properties, errcode_ret);
}
@@ -301,7 +390,7 @@ cl_command_queue clCreateCommandQueueWithProperties(cl_context c
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateCommandQueueWithProperties_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, device, properties, errcode_ret);
}
@@ -311,17 +400,16 @@ cl_command_queue clCreateCommandQueueWithProperties(cl_context c
}
}
-cl_context clCreateContext(
- const cl_context_properties *properties,
- cl_uint num_devices,
- const cl_device_id *devices,
- void (*pfn_notify)(const char *, const void *, size_t, void *),
- void *user_data,
- cl_int *errcode_ret)
+cl_context clCreateContext(const cl_context_properties *properties,
+ cl_uint num_devices,
+ const cl_device_id *devices,
+ void (*pfn_notify)(const char *, const void *, size_t, void *),
+ void *user_data,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateContext_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(properties, num_devices, devices, pfn_notify, user_data, errcode_ret);
}
@@ -339,7 +427,7 @@ cl_context clCreateContextFromType(const cl_context_properties *properties,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateContextFromType_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(properties, device_type, pfn_notify, user_data, errcode_ret);
}
@@ -349,17 +437,16 @@ cl_context clCreateContextFromType(const cl_context_properties *properties,
}
}
-cl_int clBuildProgram(
- cl_program program,
- cl_uint num_devices,
- const cl_device_id *device_list,
- const char *options,
- void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
- void *user_data)
+cl_int clBuildProgram(cl_program program,
+ cl_uint num_devices,
+ const cl_device_id *device_list,
+ const char *options,
+ void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+ void *user_data)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clBuildProgram_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program, num_devices, device_list, options, pfn_notify, user_data);
}
@@ -369,22 +456,22 @@ cl_int clBuildProgram(
}
}
-cl_int clEnqueueNDRangeKernel(
- cl_command_queue command_queue,
- cl_kernel kernel,
- cl_uint work_dim,
- const size_t *global_work_offset,
- const size_t *global_work_size,
- const size_t *local_work_size,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event);
+ return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size,
+ num_events_in_wait_list, event_wait_list, event);
}
else
{
@@ -392,15 +479,11 @@ cl_int clEnqueueNDRangeKernel(
}
}
-cl_int clSetKernelArg(
- cl_kernel kernel,
- cl_uint arg_index,
- size_t arg_size,
- const void *arg_value)
+cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clSetKernelArg_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel, arg_index, arg_size, arg_value);
}
@@ -414,7 +497,7 @@ cl_int clRetainMemObject(cl_mem memobj)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainMemObject_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(memobj);
}
@@ -428,7 +511,7 @@ cl_int clReleaseMemObject(cl_mem memobj)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseMemObject_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(memobj);
}
@@ -438,17 +521,16 @@ cl_int clReleaseMemObject(cl_mem memobj)
}
}
-cl_int clEnqueueUnmapMemObject(
- cl_command_queue command_queue,
- cl_mem memobj,
- void *mapped_ptr,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue,
+ cl_mem memobj,
+ void *mapped_ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueUnmapMemObject_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event);
}
@@ -462,7 +544,7 @@ cl_int clRetainCommandQueue(cl_command_queue command_queue)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainCommandQueue_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue);
}
@@ -476,7 +558,7 @@ cl_int clReleaseContext(cl_context context)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseContext_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context);
}
@@ -489,7 +571,7 @@ cl_int clReleaseEvent(cl_event event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseEvent_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(event);
}
@@ -499,22 +581,22 @@ cl_int clReleaseEvent(cl_event event)
}
}
-cl_int clEnqueueWriteBuffer(
- cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_write,
- size_t offset,
- size_t size,
- const void *ptr,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+cl_int clEnqueueWriteBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ size_t offset,
+ size_t size,
+ const void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueWriteBuffer_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+ return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list,
+ event);
}
else
{
@@ -522,22 +604,22 @@ cl_int clEnqueueWriteBuffer(
}
}
-cl_int clEnqueueReadBuffer(
- cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_read,
- size_t offset,
- size_t size,
- void *ptr,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+cl_int clEnqueueReadBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ size_t offset,
+ size_t size,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueReadBuffer_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+ return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list,
+ event);
}
else
{
@@ -545,17 +627,16 @@ cl_int clEnqueueReadBuffer(
}
}
-cl_int clGetProgramBuildInfo(
- cl_program program,
- cl_device_id device,
- cl_program_build_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetProgramBuildInfo(cl_program program,
+ cl_device_id device,
+ cl_program_build_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetProgramBuildInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program, device, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -569,7 +650,7 @@ cl_int clRetainProgram(cl_program program)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainProgram_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program);
}
@@ -579,27 +660,27 @@ cl_int clRetainProgram(cl_program program)
}
}
-void *clEnqueueMapBuffer(
- cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_map,
- cl_map_flags map_flags,
- size_t offset,
- size_t size,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event,
- cl_int *errcode_ret)
+void *clEnqueueMapBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ size_t offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueMapBuffer_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
+ return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list,
+ event_wait_list, event, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -611,7 +692,7 @@ cl_int clReleaseCommandQueue(cl_command_queue command_queue)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseCommandQueue_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue);
}
@@ -621,24 +702,23 @@ cl_int clReleaseCommandQueue(cl_command_queue command_queue)
}
}
-cl_program clCreateProgramWithBinary(
- cl_context context,
- cl_uint num_devices,
- const cl_device_id *device_list,
- const size_t *lengths,
- const unsigned char **binaries,
- cl_int *binary_status,
- cl_int *errcode_ret)
+cl_program clCreateProgramWithBinary(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id *device_list,
+ const size_t *lengths,
+ const unsigned char **binaries,
+ cl_int *binary_status,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateProgramWithBinary_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -650,7 +730,7 @@ cl_int clRetainContext(cl_context context)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainContext_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context);
}
@@ -664,7 +744,7 @@ cl_int clReleaseProgram(cl_program program)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseProgram_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program);
}
@@ -678,7 +758,7 @@ cl_int clFlush(cl_command_queue command_queue)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clFlush_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue);
}
@@ -692,7 +772,7 @@ cl_int clFinish(cl_command_queue command_queue)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clFinish_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue);
}
@@ -702,16 +782,15 @@ cl_int clFinish(cl_command_queue command_queue)
}
}
-cl_int clGetProgramInfo(
- cl_program program,
- cl_program_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetProgramInfo(cl_program program,
+ cl_program_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetProgramInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -721,20 +800,17 @@ cl_int clGetProgramInfo(
}
}
-cl_kernel clCreateKernel(
- cl_program program,
- const char *kernel_name,
- cl_int *errcode_ret)
+cl_kernel clCreateKernel(cl_program program, const char *kernel_name, cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateKernel_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program, kernel_name, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -746,7 +822,7 @@ cl_int clRetainKernel(cl_kernel kernel)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainKernel_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel);
}
@@ -756,22 +832,17 @@ cl_int clRetainKernel(cl_kernel kernel)
}
}
-cl_mem clCreateBuffer(
- cl_context context,
- cl_mem_flags flags,
- size_t size,
- void *host_ptr,
- cl_int *errcode_ret)
+cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateBuffer_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, flags, size, host_ptr, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -780,21 +851,17 @@ cl_mem clCreateBuffer(
}
cl_program clCreateProgramWithSource(
- cl_context context,
- cl_uint count,
- const char **strings,
- const size_t *lengths,
- cl_int *errcode_ret)
+ cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateProgramWithSource_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, count, strings, lengths, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -806,7 +873,7 @@ cl_int clReleaseKernel(cl_kernel kernel)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseKernel_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel);
}
@@ -819,12 +886,12 @@ cl_int clReleaseKernel(cl_kernel kernel)
cl_int clGetDeviceIDs(cl_platform_id platform,
cl_device_type device_type,
cl_uint num_entries,
- cl_device_id *devices,
+ cl_device_id *devices,
cl_uint *num_devices)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetDeviceIDs_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(platform, device_type, num_entries, devices, num_devices);
}
@@ -842,7 +909,7 @@ cl_int clGetDeviceInfo(cl_device_id device,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetDeviceInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(device, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -852,15 +919,12 @@ cl_int clGetDeviceInfo(cl_device_id device,
}
}
-cl_int clGetMemObjectInfo(cl_mem memobj,
- cl_mem_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetMemObjectInfo(
+ cl_mem memobj, cl_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetMemObjectInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(memobj, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -874,7 +938,7 @@ cl_int clRetainEvent(cl_event event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainEvent_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(event);
}
@@ -892,7 +956,7 @@ cl_int clGetPlatformInfo(cl_platform_id platform,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetPlatformInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(platform, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -906,7 +970,7 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetPlatformIDs_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(num_entries, platforms, num_platforms);
}
@@ -916,17 +980,16 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint
}
}
-cl_int
-clGetKernelWorkGroupInfo(cl_kernel kernel,
- cl_device_id device,
- cl_kernel_work_group_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -936,16 +999,15 @@ clGetKernelWorkGroupInfo(cl_kernel kernel,
}
}
-cl_int
-clGetCommandQueueInfo(cl_command_queue command_queue,
- cl_command_queue_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetCommandQueueInfo(cl_command_queue command_queue,
+ cl_command_queue_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetCommandQueueInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -955,16 +1017,15 @@ clGetCommandQueueInfo(cl_command_queue command_queue,
}
}
-cl_int
-clGetKernelInfo(cl_kernel kernel,
- cl_kernel_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetKernelInfo(cl_kernel kernel,
+ cl_kernel_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetKernelInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -974,16 +1035,15 @@ clGetKernelInfo(cl_kernel kernel,
}
}
-cl_int
-clGetEventProfilingInfo(cl_event event,
- cl_profiling_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetEventProfilingInfo(cl_event event,
+ cl_profiling_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetEventProfilingInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(event, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -993,23 +1053,22 @@ clGetEventProfilingInfo(cl_event event,
}
}
-cl_mem
-clCreateImage(cl_context context,
- cl_mem_flags flags,
- const cl_image_format *image_format,
- const cl_image_desc *image_desc,
- void *host_ptr,
- cl_int *errcode_ret)
+cl_mem clCreateImage(cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format *image_format,
+ const cl_image_desc *image_desc,
+ void *host_ptr,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateImage_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, flags, image_format, image_desc, host_ptr, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -1017,14 +1076,12 @@ clCreateImage(cl_context context,
}
}
-cl_int clSetKernelExecInfo(cl_kernel kernel,
- cl_kernel_exec_info param_name,
- size_t param_value_size,
- const void *param_value)
+cl_int
+clSetKernelExecInfo(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void *param_value)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clSetKernelExecInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel, param_name, param_value_size, param_value);
}
@@ -1034,23 +1091,166 @@ cl_int clSetKernelExecInfo(cl_kernel kernel,
}
}
-cl_mem
-clImportMemoryARM(cl_context context,
- cl_mem_flags flags,
- const cl_import_properties_arm *properties,
- void *memory,
- size_t size,
- cl_int *errcode_ret)
+void *clGetExtensionFunctionAddressForPlatform(cl_platform_id platform, const char *funcname)
+{
+ arm_compute::CLSymbols::get().load_default();
+ const auto func = arm_compute::CLSymbols::get().clGetExtensionFunctionAddressForPlatform_ptr;
+
+ if (func != nullptr)
+ {
+ return func(platform, funcname);
+ }
+
+ return nullptr;
+}
+
+cl_command_buffer_khr clCreateCommandBufferKHR(cl_uint num_queues,
+ const cl_command_queue *queues,
+ const cl_command_buffer_properties_khr *properties,
+ cl_int *errcode_ret)
+{
+ arm_compute::CLSymbols::get().load_default();
+ const auto func = arm_compute::CLSymbols::get().clCreateCommandBufferKHR_ptr;
+
+ if (func != nullptr)
+ {
+ return func(num_queues, queues, properties, errcode_ret);
+ }
+ else
+ {
+ if (errcode_ret != nullptr)
+ {
+ *errcode_ret = CL_INVALID_OPERATION;
+ }
+
+ return {};
+ }
+}
+
+cl_int clFinalizeCommandBufferKHR(cl_command_buffer_khr command_buffer)
+{
+ arm_compute::CLSymbols::get().load_default();
+ const auto func = arm_compute::CLSymbols::get().clFinalizeCommandBufferKHR_ptr;
+
+ if (func != nullptr)
+ {
+ return func(command_buffer);
+ }
+ else
+ {
+ return CL_INVALID_OPERATION;
+ }
+}
+
+cl_int clRetainCommandBufferKHR(cl_command_buffer_khr command_buffer)
+{
+ arm_compute::CLSymbols::get().load_default();
+ const auto func = arm_compute::CLSymbols::get().clRetainCommandBufferKHR_ptr;
+
+ if (func != nullptr)
+ {
+ return func(command_buffer);
+ }
+ else
+ {
+ return CL_INVALID_OPERATION;
+ }
+}
+
+cl_int clReleaseCommandBufferKHR(cl_command_buffer_khr command_buffer)
+{
+ arm_compute::CLSymbols::get().load_default();
+ const auto func = arm_compute::CLSymbols::get().clReleaseCommandBufferKHR_ptr;
+
+ if (func != nullptr)
+ {
+ return func(command_buffer);
+ }
+ else
+ {
+ return CL_INVALID_OPERATION;
+ }
+}
+
+cl_int clEnqueueCommandBufferKHR(cl_uint num_queues,
+ cl_command_queue *queues,
+ cl_command_buffer_khr command_buffer,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ arm_compute::CLSymbols::get().load_default();
+ const auto func = arm_compute::CLSymbols::get().clEnqueueCommandBufferKHR_ptr;
+
+ if (func != nullptr)
+ {
+ return func(num_queues, queues, command_buffer, num_events_in_wait_list, event_wait_list, event);
+ }
+ else
+ {
+ return CL_INVALID_OPERATION;
+ }
+}
+
+cl_int clCommandNDRangeKernelKHR(cl_command_buffer_khr command_buffer,
+ cl_command_queue command_queue,
+ const cl_ndrange_kernel_command_properties_khr *properties,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_sync_points_in_wait_list,
+ const cl_sync_point_khr *sync_point_wait_list,
+ cl_sync_point_khr *sync_point,
+ cl_mutable_command_khr *mutable_handle)
+{
+ arm_compute::CLSymbols::get().load_default();
+ const auto func = arm_compute::CLSymbols::get().clCommandNDRangeKernelKHR_ptr;
+
+ if (func != nullptr)
+ {
+ return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size,
+ local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle);
+ }
+ else
+ {
+ return CL_INVALID_OPERATION;
+ }
+}
+
+cl_int clUpdateMutableCommandsKHR(cl_command_buffer_khr command_buffer,
+ const cl_mutable_base_config_khr *mutable_config)
+{
+ arm_compute::CLSymbols::get().load_default();
+ const auto func = arm_compute::CLSymbols::get().clUpdateMutableCommandsKHR_ptr;
+
+ if (func != nullptr)
+ {
+ return func(command_buffer, mutable_config);
+ }
+ else
+ {
+ return CL_INVALID_OPERATION;
+ }
+}
+
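Taken together, these wrappers support the standard cl_khr_command_buffer flow; a minimal sketch (assumes a valid queue, kernel, gws and lws; error handling elided):

    cl_int err = CL_SUCCESS;
    cl_command_buffer_khr cmd_buf = clCreateCommandBufferKHR(1, &queue, nullptr, &err);
    cl_mutable_command_khr mutable_handle = nullptr;
    // Record one ND-range kernel; the queue argument is typically NULL when the
    // buffer was created for a single queue.
    clCommandNDRangeKernelKHR(cmd_buf, nullptr, nullptr, kernel, 2, nullptr, gws, lws,
                              0, nullptr, nullptr, &mutable_handle);
    clFinalizeCommandBufferKHR(cmd_buf); // no further commands can be recorded
    clEnqueueCommandBufferKHR(0, nullptr, cmd_buf, 0, nullptr, nullptr);
    clReleaseCommandBufferKHR(cmd_buf);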
+cl_mem clImportMemoryARM(cl_context context,
+ cl_mem_flags flags,
+ const cl_import_properties_arm *properties,
+ void *memory,
+ size_t size,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clImportMemoryARM_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, flags, properties, memory, size, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
diff --git a/src/core/CL/cl_kernels/activation_float_helpers.h b/src/core/CL/cl_kernels/activation_float_helpers.h
index 91d7197889..02faae2369 100644
--- a/src/core/CL/cl_kernels/activation_float_helpers.h
+++ b/src/core/CL/cl_kernels/activation_float_helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,7 +31,8 @@
#endif // GPU_ARCH == GPU_ARCH_BIFROST
// Hard-Swish
-#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
+#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
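Equivalently, this computes hard_swish(x) = x * min(max(x + 3, 0), 6) / 6, with 0.166666667 ≈ 1/6.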
// Logistic Activation
#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
@@ -49,13 +50,16 @@
#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
// Leaky RELU Activation
-#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
// Soft RELU Activation
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
// ELU Activation
-#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
+#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, \
+ (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
// Absolute Activation
#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))
@@ -69,6 +73,10 @@
// Linear Activation
#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+// GELU Activation
+#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
+
// Identity Activation
#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
deleted file mode 100644
index bc2c99b6c8..0000000000
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ACT) && defined(DATA_TYPE) && defined(VEC_SIZE)
-
-#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-
-#include "activation_float_helpers.h"
-
-/** This performs an activation function floating point inputs.
- *
- * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void activation_layer(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
-
- // Get pixels pointer
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
-#ifdef IN_PLACE
- __global uchar *output_addr = input_addr;
-#else /* IN_PLACE */
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
-#endif /* IN_PLACE */
-
- // Load data
- TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
-
- // Perform activation
- data0 = ACTIVATION(ACT, DATA_TYPE, VEC_SIZE, data0, A_VAL, B_VAL);
-
- // Store result
- STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-
-#endif /* defined(ACT) && defined(DATA_TYPE) && defined(VEC_SIZE) */
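For reference, the defines documented above are plain OpenCL build options supplied by the host. With VEC_SIZE=16 and a tensor width of 35, VEC_SIZE_LEFTOVER is 3: work-item 0 loads elements 0..15 but stores only the first 3, while every later work-item shifts its x_offs back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements, so work-item 1 covers 3..18, work-item 2 covers 19..34, and the final vector load never runs out of bounds. A minimal host-side sketch of the build step (illustrative option values, standard OpenCL C API):

#include <CL/cl.h>

/* Build the activation program with the defines the kernel doc requires.
 * VEC_SIZE_LEFTOVER = width % VEC_SIZE, e.g. 35 % 16 = 3. */
cl_int build_activation(cl_program program, cl_device_id device)
{
    const char *opts =
        "-DACT=tanh -DDATA_TYPE=float -DVEC_SIZE=16 "
        "-DVEC_SIZE_LEFTOVER=3 -DA_VAL=1.0f -DB_VAL=1.0f";
    return clBuildProgram(program, 1, &device, opts, NULL, NULL);
}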
diff --git a/src/core/CL/cl_kernels/activation_layer_quant.cl b/src/core/CL/cl_kernels/activation_layer_quant.cl
deleted file mode 100644
index 66261019ab..0000000000
--- a/src/core/CL/cl_kernels/activation_layer_quant.cl
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "activation_quant_helpers.h"
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-
-#if defined(FLOAT_DOMAIN)
-// Activations performed in the float domain
-
-#include "activation_float_helpers.h"
-
-/** This performs an activation function on quantized inputs with float transformations.
- *
- * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
- * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
- * @note Quantization offsets of the input/output tensors are passed in only if asymmetric with -DO1_VAL= and -DO2_VAL= respectively.
- * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
- */
-__kernel void activation_layer_quant_f32(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
-
- // Get pixels pointer
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
-#ifdef IN_PLACE
- __global uchar *output_addr = input_addr;
-#else /* IN_PLACE */
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
-#endif /* IN_PLACE */
-
- // Load data
- TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
-
- VEC_FLOAT data_flt = CONVERT(data0, VEC_FLOAT);
-#if defined(O1_VAL)
- data_flt = round(data_flt - (float)O1_VAL) * ((float)S1_VAL);
-#else // defined(O1_VAL)
- data_flt = round(data_flt) * ((float)S1_VAL);
-#endif // defined(O1_VAL)
- data_flt = ACTIVATION(ACT, float, VEC_SIZE, data_flt, A_VAL, B_VAL);
-
-#if defined(O2_VAL)
- data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)) + (float)O2_VAL, TYPE);
-#else // defined(O2_VAL)
- data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)), TYPE);
-#endif // defined(O2_VAL)
-
- // Store result
- STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-
-#else // defined(FLOAT_DOMAIN)
-// Activations performed in the quantized domain
-
-#if defined(ACT)
-/** This performs an activation function on quantized inputs.
- *
- * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
- * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
- * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
- * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
- */
-__kernel void activation_layer_quant(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
-
- // Get pixels pointer
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
-#ifdef IN_PLACE
- __global uchar *output_addr = input_addr;
-#else /* IN_PLACE */
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
-#endif /* IN_PLACE */
-
- // Load data
- TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
-
- data0 = PERFORM_ACTIVATION_QUANT(ACT, data0);
-
- // Store result
- STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif // defined(ACT)
-#endif // defined(FLOAT_DOMAIN)
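The float-domain path above is a dequantize / activate / requantize round trip. A scalar C sketch of the same arithmetic for one QASYMM8 value (hypothetical helper; s1/o1 and s2/o2 correspond to the -DS1_VAL/-DO1_VAL and -DS2_VAL/-DO2_VAL defines):

#include <math.h>
#include <stdint.h>

static uint8_t quant_act_f32(uint8_t q, float s1, int o1, float s2, int o2,
                             float (*act)(float))
{
    float x = ((float)q - (float)o1) * s1; /* dequantize */
    float y = act(x);                      /* activation in the float domain */
    float r = roundf(y / s2) + (float)o2;  /* requantize */
    if (r < 0.f)   r = 0.f;                /* saturate, as CONVERT_SAT does */
    if (r > 255.f) r = 255.f;
    return (uint8_t)r;
}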
diff --git a/src/core/CL/cl_kernels/activation_quant_helpers.h b/src/core/CL/cl_kernels/activation_quant_helpers.h
index a32e4e94a3..c758ff1278 100644
--- a/src/core/CL/cl_kernels/activation_quant_helpers.h
+++ b/src/core/CL/cl_kernels/activation_quant_helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,21 +51,26 @@ inline TYPE lu_brelu_op(TYPE x)
// Hard Swish Activation
inline TYPE hard_swish_op(TYPE x)
{
- return (x * ((min(max((TYPE)(x + (TYPE)3.f), (TYPE)0.f), (TYPE)6.f)) * (TYPE)0.166666667f));
+ return (x * ((min(max((TYPE)(x + (TYPE)3.f), (TYPE)0.f), (TYPE)6.f)) * (TYPE)0.166666667f));
+}
+
+inline TYPE identity_op(TYPE x)
+{
+ return x;
}
#define ACTIVATION_OP2(op, x) op##_op(x)
-#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
#if defined(S1_VAL) && defined(S2_VAL)
#if defined(O1_VAL) && defined(O2_VAL)
#define PERFORM_ACTIVATION_QUANT(act, data) \
({ \
data = ACTIVATION_OP(act, data); \
- \
+ \
VEC_DATA_TYPE(float, VEC_SIZE) \
fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \
- \
+ \
fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
})
@@ -73,17 +78,14 @@ inline TYPE hard_swish_op(TYPE x)
#define PERFORM_ACTIVATION_QUANT(act, data) \
({ \
data = ACTIVATION_OP(act, data); \
- \
+ \
VEC_DATA_TYPE(float, VEC_SIZE) \
fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \
- \
+ \
fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL)); \
data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
})
#endif /* defined(O1_VAL) && defined(O2_VAL) */
#else /* defined(S1_VAL) && defined(S2_VAL) */
-#define PERFORM_ACTIVATION_QUANT(act, data) \
- ({ \
- data = ACTIVATION_OP(act, data); \
- })
+#define PERFORM_ACTIVATION_QUANT(act, data) ({ data = ACTIVATION_OP(act, data); })
#endif /* defined(S1_VAL) && defined(S2_VAL) */
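When both scales are known at compile time, PERFORM_ACTIVATION_QUANT above folds the dequantize/requantize pair into a single rescale by S1_VAL / S2_VAL. A scalar C equivalent of the offset-aware branch (hypothetical helper name):

#include <math.h>
#include <stdint.h>

static uint8_t requant(uint8_t q, float s1, int o1, float s2, int o2)
{
    /* fdata = round((q - O1) * (S1 / S2) + O2), then saturate to the type. */
    float f = roundf(((float)q - (float)o1) * (s1 / s2) + (float)o2);
    if (f < 0.f)   f = 0.f;
    if (f > 255.f) f = 255.f;
    return (uint8_t)f;
}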
diff --git a/src/core/CL/cl_kernels/arg_min_max.cl b/src/core/CL/cl_kernels/arg_min_max.cl
deleted file mode 100644
index 6e57ed0af1..0000000000
--- a/src/core/CL/cl_kernels/arg_min_max.cl
+++ /dev/null
@@ -1,451 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT)
-
-#define VEC_TYPE_IN VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#define VEC_TYPE_OUT VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
-#define VEC_SELECT_IN SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#define VEC_SIGNED_INT_IN SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-
-#if defined(FLOAT_DATA_TYPE)
-#define ISGREATER(x, y) (VEC_SELECT_IN) isgreater(x, y)
-#define ISLESS(x, y) (VEC_SELECT_IN) isless(x, y)
-#else // !FLOAT_DATA_TYPE
-#if defined(WIDTH)
-#define ISGREATER(x, y) ((x > y) ? 1 : 0)
-#define ISLESS(x, y) ((x < y) ? 1 : 0)
-#else // !defined(WIDTH)
-#define ISGREATER(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x > y))
-#define ISLESS(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x < y))
-#endif // defined(WIDTH)
-#endif // defined(FLOAT_DATA_TYPE)
-
-#if defined(ARG_MAX)
-#define CONDITION_TO_USE(x, y) ISGREATER(x, y)
-#elif defined(ARG_MIN)
-#define CONDITION_TO_USE(x, y) ISLESS(x, y)
-#else // !(defined(ARG_MAX) || defined(ARG_MIN))
-#error "Unsupported reduction operation!"
-#endif // defined(ARG_MAX)
-
-#if defined(WIDTH)
-#if defined(ARG_MIN)
-#if defined(PREV_OUTPUT)
-/** Find the index of the minimum value of a vector, seeded with the results of a previous stage.
- *
- * @param[in] input    Pointer to the first value.
- * @param[in] prev_res Pointer to the per-chunk indices computed by the previous stage.
- * @param[in] x_idx    Index of the 16-element chunk to process.
- *
- * @return Index of the minimum value.
- */
-inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input, __global const DATA_TYPE_OUTPUT *prev_res, const int x_idx)
-{
- int end_elem = (x_idx + 1) * 16;
- if(end_elem > WIDTH)
- {
- end_elem = WIDTH - x_idx * 16;
- }
- DATA_TYPE_OUTPUT res = prev_res[0];
- for(int x_v = 1; x_v < end_elem; ++x_v)
- {
- res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < * (input + res));
- }
- return res;
-}
-#else // !defined(PREV_OUTPUT)
-/** Find the index of the minimum value of a vector.
- *
- * @param[in] input Pointer to the first value.
- * @param[in] x_idx Index of the 16-element chunk to process.
- *
- * @return Index of the minimum value.
- */
-inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx)
-{
-#if WIDTH < 16
- DATA_TYPE_OUTPUT res = 0;
- for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
- {
- res = select(res, x_v, *(input + x_v) < * (input + res));
- }
- return res;
-#else // WIDTH >= 16
- int x_elem = x_idx * 16;
- const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
- x_elem -= x_goback;
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in = vload16(0, input - x_goback);
- VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
- res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-
- SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 8)
- idx_sel = (in.s01234567 <= in.s89abcdef);
- in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
- res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
-
- idx_sel.s0123 = (in.s0123 < in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 4)));
- in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
- res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
-
- idx_sel.s01 = (in.s01 < in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 2)));
- in.s01 = select(in.s23, in.s01, idx_sel.s01);
- res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
-
- idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), SIGNED_INT_DATA_TYPE(DATA_TYPE)));
- res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
-
- return res.s0 + x_elem;
-#endif // WIDTH < 16
-}
-#endif // defined(PREV_OUTPUT)
-#endif // defined(ARG_MIN)
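The vload16 branch above is a four-step tournament: both halves of the vector are compared lane-by-lane, and select keeps the winning values together with their indices, breaking ties towards the lower index. The same idea over four scalars, in illustrative C:

/* Tournament argmin over 4 values, mirroring the halving select chain. */
static int argmin4(const float v[4])
{
    float val[4] = { v[0], v[1], v[2], v[3] };
    int   idx[4] = { 0, 1, 2, 3 };

    for (int i = 0; i < 2; ++i) /* compare halves {0,1} vs {2,3} */
    {
        int keep = (val[i] < val[i + 2]) ||
                   (val[i] == val[i + 2] && idx[i] < idx[i + 2]);
        if (!keep) { val[i] = val[i + 2]; idx[i] = idx[i + 2]; }
    }
    /* Final round between the two survivors. */
    int keep = (val[0] < val[1]) || (val[0] == val[1] && idx[0] < idx[1]);
    return keep ? idx[0] : idx[1];
}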
-#if defined(ARG_MAX)
-#if defined(PREV_OUTPUT)
-/** Find the index of the maximum value of a vector, seeded with the results of a previous stage.
- *
- * @param[in] input    Pointer to the first value.
- * @param[in] prev_res Pointer to the per-chunk indices computed by the previous stage.
- * @param[in] x_idx    Index of the 16-element chunk to process.
- *
- * @return Index of the maximum value.
- */
-inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input, __global const DATA_TYPE_OUTPUT *prev_res, const int x_idx)
-{
- int end_elem = (x_idx + 1) * 16;
- if(end_elem > WIDTH)
- {
- end_elem = WIDTH - x_idx * 16;
- }
- DATA_TYPE_OUTPUT res = prev_res[0];
- for(int x_v = 1; x_v < end_elem; ++x_v)
- {
- res = select(res, prev_res[x_v], *(input + prev_res[x_v]) > *(input + res));
- }
- return res;
-}
-#else // !defined(PREV_OUTPUT)
-/** Find the index of the maximum value of a vector.
- *
- * @param[in] input Pointer to the first value.
- * @param[in] x_idx Index of the 16-element chunk to process.
- *
- * @return Index of the maximum value.
- */
-inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx)
-{
-#if WIDTH < 16
- DATA_TYPE_OUTPUT res = 0;
- for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
- {
- res = select(res, x_v, *(input + x_v) > *(input + res));
- }
- return res;
-#else // WIDTH >= 16
- int x_elem = x_idx * 16;
- const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
- x_elem -= x_goback;
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in = vload16(0, input - x_goback);
- VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
- res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-
- SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 8)
- idx_sel = (in.s01234567 >= in.s89abcdef);
- in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
- res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
-
- idx_sel.s0123 = (in.s0123 > in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 4)));
- in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
- res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
-
- idx_sel.s01 = (in.s01 > in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 2)));
- in.s01 = select(in.s23, in.s01, idx_sel.s01);
- res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
-
- idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), SIGNED_INT_DATA_TYPE(DATA_TYPE)));
- res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
-
- return res.s0 + x_elem;
-#endif // WIDTH < 16
-}
-#endif // defined(PREV_OUTPUT)
-#endif // defined(ARG_MAX)
-
-/** This kernel performs a parallel reduction on the x-axis, given a comparison operation.
- *
- * @note If the results of previous stages are passed, the flag PREV_OUTPUT has to be set using -DPREV_OUTPUT
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
- * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the ArgMax
- * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the ArgMin
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] prev_res_ptr (Optional) Pointer to previous results tensor. Supported data types: U32/S32
- * @param[in] prev_res_stride_x (Optional) Stride of the output tensor in X dimension (in bytes)
- * @param[in] prev_res_step_x (Optional) prev_res_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] prev_res_stride_y (Optional) Stride of the output tensor in Y dimension (in bytes)
- * @param[in] prev_res_step_y (Optional) prev_res_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] prev_res_offset_first_element_in_bytes (Optional) The offset of the first element in the previous results tensor
- * @param[in] partial_res_ptr Pointer to the partial results tensor. Supported data types: U32/S32
- * @param[in] partial_res_stride_x Stride of the partial results tensor in X dimension (in bytes)
- * @param[in] partial_res_step_x partial_res_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] partial_res_stride_y Stride of the partial results tensor in Y dimension (in bytes)
- * @param[in] partial_res_step_y partial_res_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the partial results tensor
- * @param[in] local_results Local buffer for storing the partial result
- */
-__kernel void arg_min_max_x(
- IMAGE_DECLARATION(src),
-#if defined(PREV_OUTPUT)
- IMAGE_DECLARATION(prev_res),
-#endif // defined(PREV_OUTPUT)
- IMAGE_DECLARATION(partial_res),
- __local DATA_TYPE_OUTPUT *local_results)
-{
-#if defined(PREV_OUTPUT)
- Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
- Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res);
-#else // !defined(PREV_OUTPUT)
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#endif // defined(PREV_OUTPUT)
- Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
-
- unsigned int lsize = get_local_size(0);
- unsigned int lid = get_local_id(0);
-
- const uint x_idx = get_global_id(0);
- const uint y_idx = get_global_id(1);
- const __global DATA_TYPE *src_in_row = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y);
-
- for(unsigned int y = 0; y < get_local_size(1); ++y)
- {
-#if defined(ARG_MAX)
-#if defined(PREV_OUTPUT)
- local_results[lid] = arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
-#else // !defined(PREV_OUTPUT)
- local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
-#endif // defined(PREV_OUTPUT)
-#else // defined(ARG_MIN)
-#if defined(PREV_OUTPUT)
- local_results[lid] = arg_idx_min_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
-#else // !defined(PREV_OUTPUT)
- local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
-#endif // defined(PREV_OUTPUT)
-#endif // defined(ARG_MAX) || defined(ARG_MIN)
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Looking for the next highest power of 2 (maximum value of lsize is 8)
- unsigned int middle = lsize - 1;
- middle |= middle >> 1;
- middle |= middle >> 2;
- middle += 1;
- // Perform parallel reduction
- for(unsigned int i = middle; i > 0; i >>= 1)
- {
- if(lid < i && lid + i < lsize)
- {
- DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
- DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
-#if defined(ARG_MAX)
- local_results[lid] = select(
- local_results[lid],
- local_results[lid + i],
- ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1));
-#else // defined(ARG_MIN)
- local_results[lid] = select(
- local_results[lid],
- local_results[lid + i],
- ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
-#endif // defined(ARG_MAX) || defined(ARG_MIN)
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- if(lid == 0)
- {
- ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
- }
- }
-}
-#endif // defined(WIDTH)
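The "next highest power of 2" sequence above is the classic bit-smearing trick, truncated to two shifts because lsize never exceeds 8 (e.g. lsize = 5: middle = 4 -> 6 -> 7, then +1 gives 8). The full 32-bit version, as a generic C sketch:

#include <stdint.h>

/* Round up to the next power of two, e.g. 5 -> 8, 8 -> 8. */
static uint32_t next_pow2(uint32_t n)
{
    n -= 1;
    n |= n >> 1;
    n |= n >> 2;
    n |= n >> 4;
    n |= n >> 8;
    n |= n >> 16;
    return n + 1;
}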
-
-#if defined(HEIGHT)
-/** This kernel performs a reduction on the y-axis.
- *
- * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
- * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor, which holds the computed indices. Supported data types: U32/S32
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void arg_min_max_y(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output))
-{
- const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y;
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y;
-
- VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN);
-
- VEC_TYPE_OUT indx0 = 0;
- for(DATA_TYPE_OUTPUT y = 1; y < HEIGHT; ++y)
- {
- VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_TYPE_IN);
-
- VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
- indx0 = select(indx0, (VEC_TYPE_OUT)y, cond_conv);
- res = select(res, in, CONDITION_TO_USE(in, res));
- }
-
- // Store result
- STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif // defined(HEIGHT)
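Each work-item above walks down HEIGHT rows keeping, per vector lane, the best value seen so far and its row index, both updated branch-free with select. One lane of the same loop in scalar C (illustrative):

/* Running argmin down a column, as one lane of arg_min_max_y with ARG_MIN. */
static int argmin_column(const float *col, int height)
{
    float best = col[0];
    int   idx  = 0;
    for (int y = 1; y < height; ++y)
    {
        if (col[y] < best) /* CONDITION_TO_USE is ISLESS for ARG_MIN */
        {
            best = col[y];
            idx  = y;
        }
    }
    return idx;
}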
-
-#if defined(DEPTH) && !defined(BATCH)
-/** This kernel performs a reduction on the z-axis.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor, which holds the computed indices. Supported data types: U32/S32
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void arg_min_max_z(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
-
- VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN);
-
- VEC_TYPE_OUT indx0 = 0;
- for(DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
- {
- VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_TYPE_IN);
-
- VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
- indx0 = select(indx0, (VEC_TYPE_OUT)z, cond_conv);
- res = select(res, in, CONDITION_TO_USE(in, res));
- }
-
- // Store result
- STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif /* defined(DEPTH) && !defined(BATCH) */
-
-#if defined(BATCH) && defined(DEPTH)
-/** This kernel performs a reduction on the w-axis.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
- * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor, which holds the computed indices. Supported data types: U32/S32
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void arg_min_max_w(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
- const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + (get_global_id(2) % DEPTH) * input_stride_z +
- (get_global_id(2) / DEPTH) * input_stride_w;
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + (get_global_id(
- 2) % DEPTH) * output_stride_z + (get_global_id(2) / DEPTH) * output_stride_w;
-
- VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN);
-
- VEC_TYPE_OUT indx0 = 0;
- for(DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
- {
- VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + w * input_stride_w)), VEC_TYPE_IN);
-
- VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
- indx0 = select(indx0, (VEC_TYPE_OUT)w, cond_conv);
- res = select(res, in, CONDITION_TO_USE(in, res));
- }
-
- // Store result
- STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif /* defined(BATCH) && defined(DEPTH) */
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/batch_to_space.cl b/src/core/CL/cl_kernels/batch_to_space.cl
deleted file mode 100644
index 8a71985b02..0000000000
--- a/src/core/CL/cl_kernels/batch_to_space.cl
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(BATCH_SIZE)
-/** Batch to space transformation. (NCHW)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
- * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
- * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] block_shape_stride_y Stride of the block shape tensor in Y dimension (in bytes)
- * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_nchw(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- VECTOR_DECLARATION(block_shape),
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
- Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
-
- const int block_x = *((__global int *)vector_offset(&block, 0));
- const int block_y = *((__global int *)vector_offset(&block, 1));
-
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int x = get_global_id(0);
- const int y = get_global_id(1);
- const int z = get_global_id(2);
- const int w = batch_id % r;
-
- const int out_x = x * block_x + (batch_id / r) % block_x;
- const int out_y = y * block_y + (batch_id / r) / block_x;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-/** Batch to space transformation. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
- * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
- * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] block_shape_stride_y Stride of the block shape tensor in Y dimension (in bytes)
- * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_nhwc(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- VECTOR_DECLARATION(block_shape),
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
- Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
-
- const int block_x = *((__global int *)vector_offset(&block, 0));
- const int block_y = *((__global int *)vector_offset(&block, 1));
-
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int x = get_global_id(1);
- const int y = get_global_id(2);
- const int z = get_global_id(0);
- const int w = batch_id % r;
-
- const int out_x = x * block_x + (batch_id / r) % block_x;
- const int out_y = y * block_y + (batch_id / r) / block_x;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
-
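The coordinate arithmetic above inverts the space-to-batch packing: with r = BATCH_SIZE / (block_x * block_y), input batch batch_id lands in output batch w = batch_id % r, and the quotient batch_id / r is the phase within the block. For BATCH_SIZE = 8 and a 2x2 block, r = 2, so batch 5 maps to output batch 1 with column offset (5/2) % 2 = 0 and row offset (5/2) / 2 = 1. The mapping as a small C sketch (hypothetical helper):

/* batch_to_space output coordinates for one input element (NCHW variant). */
static void b2s_coords(int x, int y, int batch_id,
                       int batch_size, int block_x, int block_y,
                       int *out_x, int *out_y, int *out_w)
{
    const int r = batch_size / (block_x * block_y);
    *out_w = batch_id % r;
    *out_x = x * block_x + (batch_id / r) % block_x;
    *out_y = y * block_y + (batch_id / r) / block_x;
}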
-#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
-/** Batch to space transformation. (NCHW)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
- * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_static_nchw(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- const int block_x = BLOCK_SHAPE_X;
- const int block_y = BLOCK_SHAPE_Y;
-
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int x = get_global_id(0);
- const int y = get_global_id(1);
- const int z = get_global_id(2);
- const int w = batch_id % r;
-
- const int out_x = x * block_x + (batch_id / r) % block_x;
- const int out_y = y * block_y + (batch_id / r) / block_x;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-/** Batch to space transformation. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
- * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_static_nhwc(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- const int block_x = BLOCK_SHAPE_X;
- const int block_y = BLOCK_SHAPE_Y;
-
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int x = get_global_id(1);
- const int y = get_global_id(2);
- const int z = get_global_id(0);
- const int w = batch_id % r;
-
- const int out_x = x * block_x + (batch_id / r) % block_x;
- const int out_y = y * block_y + (batch_id / r) / block_x;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
\ No newline at end of file
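The _static variants above replace the runtime block_shape tensor with the -DBLOCK_SHAPE_X/-DBLOCK_SHAPE_Y defines, so block_x, block_y and hence r fold to compile-time constants and the kernel takes one argument fewer. Illustrative build-option strings for the two flavours (values made up for the example):

/* Dynamic block shape: read from the block_shape tensor at run time. */
static const char *opts_dynamic = "-DDATA_TYPE=float -DBATCH_SIZE=8";

/* Static block shape: folded into the kernel at compile time. */
static const char *opts_static =
    "-DDATA_TYPE=float -DBATCH_SIZE=8 -DBLOCK_SHAPE_X=2 -DBLOCK_SHAPE_Y=2";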
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
deleted file mode 100644
index 89cbe4440e..0000000000
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ /dev/null
@@ -1,418 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#define ADD_OP(a, b) ((a) + (b))
-#define SUB_OP(a, b) ((a) - (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define INVSQRT_OP(a) rsqrt((a))
-#define SQCVT_SAT(a) (a)
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
-#include "activation_float_helpers.h"
-
-/** Apply batch normalization on tensors with NCHW format.
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
- * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
- * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
- * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
- * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
- * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
- * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
- * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
- * @param[in] epsilon Epsilon parameter in the batch normalization equation
- */
-__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
-#ifndef IN_PLACE
- TENSOR3D_DECLARATION(output),
-#endif /* not IN_PLACE */
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(var),
-#ifndef USE_DEFAULT_BETA
- VECTOR_DECLARATION(beta),
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- VECTOR_DECLARATION(gamma),
-#endif /* USE_DEFAULT_GAMMA */
- float epsilon)
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D out = in;
-#else /* IN_PLACE */
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector var = CONVERT_TO_VECTOR_STRUCT(var);
-#ifndef USE_DEFAULT_BETA
- Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
-#endif /* USE_DEFAULT_GAMMA */
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- denominator = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- numerator = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- x_bar = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res = 0;
-
- const int current_slice = get_global_id(2);
-
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
- denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
- denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
-
- // Calculate x bar and store results
- numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
- numerator = SUB_OP(data, numerator);
- x_bar = MUL_OP(numerator, denominator);
-
-#ifndef USE_DEFAULT_GAMMA
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
-
- res = MUL_OP(gamma_vec, x_bar);
-#else /* USE_DEFAULT_GAMMA */
- // gamma is equal to 1, no need to perform multiplications
- res = x_bar;
-#endif /* USE_DEFAULT_GAMMA */
-
-#ifndef USE_DEFAULT_BETA
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
- // beta is not zero, hence we need to perform the addition
- res = ADD_OP(res, beta_vec);
-#endif /* USE_DEFAULT_BETA */
-
- res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
-
- VSTORE(VEC_SIZE)
- (res, 0, (__global DATA_TYPE *)out.ptr);
-}
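For reference, the per-element arithmetic of the kernel above (ignoring vectorization and the USE_DEFAULT_BETA/USE_DEFAULT_GAMMA shortcuts) can be sketched in plain C; the function name and float types are illustrative:

    #include <math.h>

    /* Scalar sketch of one batch normalization element:
     * out = gamma * (x - mean) / sqrt(var + epsilon) + beta */
    static float batchnorm_ref(float x, float mean, float var,
                               float gamma, float beta, float epsilon)
    {
        const float denominator = 1.0f / sqrtf(var + epsilon); /* INVSQRT_OP(ADD_OP(...)) */
        const float x_bar       = (x - mean) * denominator;    /* SUB_OP then MUL_OP */
        return gamma * x_bar + beta;                           /* gamma_vec / beta_vec paths */
    }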
-
-/** Apply batch normalization on tensors with NHWC format.
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
- * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
- * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
- * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
- * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
- * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
- * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
- * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
- * @param[in] epsilon Epsilon parameter in the batch normalization equation
- */
-__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
-#ifndef IN_PLACE
- TENSOR3D_DECLARATION(output),
-#endif /* not IN_PLACE */
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(var),
-#ifndef USE_DEFAULT_BETA
- VECTOR_DECLARATION(beta),
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- VECTOR_DECLARATION(gamma),
-#endif /* USE_DEFAULT_GAMMA */
- float epsilon)
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
-
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
-#ifdef IN_PLACE
- __global uchar *output_addr = input_addr;
-#else /* IN_PLACE */
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
-#endif /* IN_PLACE */
- __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
- __global uchar *var_addr = var_ptr + var_offset_first_element_in_bytes + x_offs;
-#ifndef USE_DEFAULT_BETA
- __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
-#endif /* USE_DEFAULT_GAMMA */
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- denominator = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- numerator = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- x_bar = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res0 = 0;
-
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
- denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
- denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
-
- // Calculate x bar and store results
- numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
- numerator = SUB_OP(data, numerator);
- x_bar = MUL_OP(numerator, denominator);
-
-#ifndef USE_DEFAULT_GAMMA
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
-
- res0 = MUL_OP(gamma_vec, x_bar);
-#else /* USE_DEFAULT_GAMMA */
- // gamma is equal to 1, no need to perform multiplications
- res0 = x_bar;
-#endif /* USE_DEFAULT_GAMMA */
-
-#ifndef USE_DEFAULT_BETA
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
- // beta is not zero, hence we need to perform the addition
- res0 = ADD_OP(res0, beta_vec);
-#endif /* USE_DEFAULT_BETA */
-
- res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
-
- STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) */
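Both kernels are specialized entirely through build options. A minimal sketch of the host-side option string, assuming a float tensor whose width is 9 and a vector size of 4 (the exact string is assembled by the library's configure step, so these values are illustrative):

    #include <stdio.h>

    int main(void)
    {
        /* Illustrative compile-time options for batchnormalization_layer_nhwc;
         * VEC_SIZE_LEFTOVER is width % VEC_SIZE (9 % 4 == 1 here). */
        const char *build_opts =
            "-DDATA_TYPE=float -DVEC_SIZE=4 -DVEC_SIZE_LEFTOVER=1 "
            "-DACTIVATION_TYPE=relu -DA_VAL=0.0f -DB_VAL=0.0f";
        printf("%s\n", build_opts);
        return 0;
    }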
-
-#if defined(DATA_TYPE) && defined(EPSILON)
-/** OpenCL kernel to fuse the weights of a convolution or depthwise convolution layer with batch normalization, when the data layout is either NCHW or NHWC
- *
- * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension
- * @note Data type should be passed at compile time using the -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16.
- * For depthwise convolution weights, do not pass DIM2.
- * @note Data layout NHWC should be passed at compile time with -DNHWC. For data layout NCHW it is not required to pass any parameter
- * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f
- *
- * @param[in] w_ptr Pointer to the weights tensor. Supported data types: F16/F32
- * @param[in] w_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] w_step_x w_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] w_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] w_step_y w_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] w_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] w_step_z w_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] w_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] b_ptr (Optional) Pointer to the bias tensor. Supported data types: same as @p w_ptr
- * @param[in] b_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
- * @param[in] b_step_x (Optional) b_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] b_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
- * @param[in] b_step_y (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] b_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
- * @param[in] b_step_z (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] b_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p w_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p w_ptr
- * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
- * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
- * @param[out] w_fused_ptr (Optional) Pointer to the destination weights tensors. Supported data types: same as @p w_ptr
- * @param[in] w_fused_stride_x (Optional) Stride of the destination weights tensor in X dimension (in bytes)
- * @param[in] w_fused_step_x (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] w_fused_stride_y (Optional) Stride of the destination weights tensor in Y dimension (in bytes)
- * @param[in] w_fused_step_y (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] w_fused_stride_z (Optional) Stride of the destination weights tensor in Z dimension (in bytes)
- * @param[in] w_fused_step_z (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor
- * @param[in] b_fused_ptr (Optional) Pointer to the destination bias tensor. Supported data types: same as @p w_ptr
- * @param[in] b_fused_stride_x (Optional) Stride of the destination bias tensor in X dimension (in bytes)
- * @param[in] b_fused_step_x (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor
- * @param[in] beta_ptr (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr
- * @param[in] beta_stride_x (Optional) Stride of the beta source tensor in X dimension (in bytes)
- * @param[in] beta_step_x (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in the beta source tensor
- * @param[in] gamma_ptr (Optional) Pointer to the gamma source tensor. Supported data types: same as @p w_ptr
- * @param[in] gamma_stride_x (Optional) Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in] gamma_step_x (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in the gamma source tensor
- */
-__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w),
-#if defined(BIAS)
- VECTOR_DECLARATION(b),
-#endif // defined(BIAS)
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(var)
-#ifndef IN_PLACE_W
- ,
- TENSOR3D_DECLARATION(w_fused)
-#endif // ifndef IN_PLACE_W
-#ifndef IN_PLACE_B
- ,
- VECTOR_DECLARATION(b_fused)
-#endif // ifndef IN_PLACE_B
-#if defined(BETA)
- ,
- VECTOR_DECLARATION(beta)
-#endif // defined(BETA)
-#if defined(GAMMA)
- ,
- VECTOR_DECLARATION(gamma)
-#endif // defined(GAMMA)
- )
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- int z = get_global_id(2);
-
-#if defined(DIM2)
- int c0 = z % DIM2;
- int c1 = z / DIM2;
-#else // ! defined(DIM2)
- int c0 = 0;
-#if defined(NHWC)
- int c1 = x;
-#else // defined(NHWC)
- int c1 = z;
-#endif // defined(NHWC)
-#endif // defined(DIM2)
-
- int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z;
- int v_offset = c1 * sizeof(DATA_TYPE);
-
- DATA_TYPE w_old = 0.0f;
- DATA_TYPE b_old = 0.0f;
- DATA_TYPE w_new = 0.0f;
- DATA_TYPE b_new = 0.0f;
- DATA_TYPE gamma = 1.0f;
- DATA_TYPE mean = 0.0f;
- DATA_TYPE var = 1.0f;
- DATA_TYPE beta = 0.0f;
-
- w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes));
- var = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes));
- mean = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes));
-
-#if defined(GAMMA)
- gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes));
-#endif // defined(GAMMA)
-
- // Compute new weight
- w_new = (gamma * w_old) / (sqrt(var + EPSILON));
-
-#if defined(IN_PLACE_W)
- *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new;
-#else // defined(IN_PLACE_W)
- *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new;
-#endif // defined(IN_PLACE_W)
-
- // Compute bias
-#if !defined(DIM2) && defined(NHWC)
- if(z == 0 && y == 0)
-#else // !defined(DIM2) && defined(NHWC)
- if(x == 0 && y == 0 && c0 == 0)
-#endif // !defined(DIM2) && defined(NHWC)
- {
-#if defined(BIAS)
- b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes));
-#endif // defined(BIAS)
-#if defined(BETA)
- beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes));
-#endif // defined(BETA)
-
- b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta;
-
-#if defined(BIAS)
-
-#if defined(IN_PLACE_B)
- *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new;
-#else // defined(IN_PLACE_B)
- *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
-#endif // defined(IN_PLACE_B)
-
-#else // defined(BIAS)
-
-#ifndef IN_PLACE_B
- *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
-#endif // ifndef IN_PLACE_B
-
-#endif // defined(BIAS)
- }
-}
-#endif // defined(DATA_TYPE) && defined(EPSILON)
\ No newline at end of file
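The fusion above collapses to two scalar formulas per output channel; a minimal C sketch mirroring the kernel's math (the default values for gamma, beta and bias match the kernel's fallbacks):

    #include <math.h>

    /* Fold batch normalization into convolution weights and bias:
     * w_new = gamma * w / sqrt(var + eps)
     * b_new = gamma * (b - mean) / sqrt(var + eps) + beta */
    static void fuse_bn_ref(float w_old, float b_old, float mean, float var,
                            float gamma, float beta, float epsilon,
                            float *w_new, float *b_new)
    {
        const float inv_std = 1.0f / sqrtf(var + epsilon);
        *w_new = gamma * w_old * inv_std;
        *b_new = gamma * (b_old - mean) * inv_std + beta;
    }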
diff --git a/src/core/CL/cl_kernels/bitwise_op.cl b/src/core/CL/cl_kernels/bitwise_op.cl
deleted file mode 100644
index a600bced9e..0000000000
--- a/src/core/CL/cl_kernels/bitwise_op.cl
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
-
-/** This function computes the bitwise OR of two input images.
- *
- * @note The following variables must be passed at compile time:
- * -# -DVEC_SIZE : The number of elements processed in X dimension
- * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
- *
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void bitwise_or(
- IMAGE_DECLARATION(in1),
- IMAGE_DECLARATION(in2),
- IMAGE_DECLARATION(out))
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-
- // Get pixels pointer
- __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
- __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y;
- __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
-
- // Load data
- VEC_DATA_TYPE(uchar, VEC_SIZE)
- in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
- VEC_DATA_TYPE(uchar, VEC_SIZE)
- in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr);
-
- VEC_DATA_TYPE(uchar, VEC_SIZE)
- data0 = in_a | in_b;
-
- // Boundary-aware store
- STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
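The x_offs expression above implements the boundary-aware pattern shared by every kernel in this file: when the row width is not a multiple of VEC_SIZE, work-item 0 keeps a partial store of VEC_SIZE_LEFTOVER elements while every later work-item is shifted left so its full-vector load and store stay in bounds. A C sketch of the offset computation:

    /* Offset of the first element handled by work-item gid0 along X. */
    static unsigned int x_offset(unsigned int gid0, unsigned int vec_size,
                                 unsigned int leftover)
    {
        const int shift = (int)((vec_size - leftover) % vec_size);
        const int offs  = (int)(gid0 * vec_size) - shift;
        return (unsigned int)(offs < 0 ? 0 : offs); /* max(..., 0) in the kernel */
    }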
-
-/** This function computes the bitwise AND of two input images.
- *
- * @note The following variables must be passed at compile time:
- * -# -DVEC_SIZE : The number of elements processed in X dimension
- * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
- *
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void bitwise_and(
- IMAGE_DECLARATION(in1),
- IMAGE_DECLARATION(in2),
- IMAGE_DECLARATION(out))
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-
- // Get pixels pointer
- __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
- __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y;
- __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
-
- // Load data
- VEC_DATA_TYPE(uchar, VEC_SIZE)
- in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
- VEC_DATA_TYPE(uchar, VEC_SIZE)
- in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr);
-
- VEC_DATA_TYPE(uchar, VEC_SIZE)
- data0 = in_a & in_b;
-
- // Boundary-aware store
- STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-
-/** This function computes the bitwise XOR of two input images.
- *
- * @note The following variables must be passed at compile time:
- * -# -DVEC_SIZE : The number of elements processed in X dimension
- * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
- *
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void bitwise_xor(
- IMAGE_DECLARATION(in1),
- IMAGE_DECLARATION(in2),
- IMAGE_DECLARATION(out))
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-
- // Get pixels pointer
- __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
- __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y;
- __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
-
- // Load data
- VEC_DATA_TYPE(uchar, VEC_SIZE)
- in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
- VEC_DATA_TYPE(uchar, VEC_SIZE)
- in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr);
-
- VEC_DATA_TYPE(uchar, VEC_SIZE)
- data0 = in_a ^ in_b;
-
- // Boundary-aware store
- STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-
-/** This function computes the bitwise NOT of an image.
- *
- * @note The following variables must be passed at compile time:
- * -# -DVEC_SIZE : The number of elements processed in X dimension
- * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
- *
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void bitwise_not(
- IMAGE_DECLARATION(in1),
- IMAGE_DECLARATION(out))
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-
- // Get pixels pointer
- __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
- __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
-
- // Load data
- VEC_DATA_TYPE(uchar, VEC_SIZE)
- in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
-
- VEC_DATA_TYPE(uchar, VEC_SIZE)
- data0 = ~in_a;
-
- // Boundary-aware store
- STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-
-#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/channel_shuffle.cl
deleted file mode 100644
index 63af2c6137..0000000000
--- a/src/core/CL/cl_kernels/channel_shuffle.cl
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "tile_helpers.h"
-
-#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
-
-// Check valid VEC_SIZES
-#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
-#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported"
-#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
-
-#define DIV_MOD_UINT(x, y, div_res, mod_res) \
- ({ \
- div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
- uint r = div_res * (y); \
- mod_res = (x)-r; \
- })
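DIV_MOD_UINT trades an integer division for a multiply by a float reciprocal, which is cheaper on most GPUs; a C equivalent (valid while x fits in the float mantissa):

    /* Compute div_res = x / y and mod_res = x % y via a float reciprocal. */
    static void div_mod_uint(unsigned int x, unsigned int y,
                             unsigned int *div_res, unsigned int *mod_res)
    {
        *div_res = (unsigned int)((float)x * (1.0f / (float)y));
        *mod_res = x - *div_res * y;
    }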
-
-/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
- *
- * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
- * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
- * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
- * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
- * K is equal to num_channels / num_groups.
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst))
-{
- uint curr_channel = 0; // channel id of input
- uint batch_id = 0; // batch id
- uint group_id = 0; // group id
- uint channel_id = 0; // channel id within the group
-
- // Compute curr_channel and batch_id
- DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
-
- // Compute group_id and channel_id
- DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
-
- const uint x = get_global_id(0) * VEC_SIZE;
- const uint y = get_global_id(1) * 2;
- const uint z = channel_id * NUM_GROUPS + group_id;
-
- // Load the Nx2 block
- const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
-
- // Store blocks
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
- VSTORE(VEC_SIZE)
- (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
- VSTORE(VEC_SIZE)
- (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
-}
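The index arithmetic above is a transpose of the (group, channel) pair: input channel group_id * K + channel_id is written to output channel channel_id * NUM_GROUPS + group_id. A scalar C sketch of the mapping:

    /* Destination channel for source channel c, given num_groups groups of k channels. */
    static unsigned int shuffled_channel(unsigned int c,
                                         unsigned int num_groups,
                                         unsigned int k)
    {
        const unsigned int group_id   = c / k;
        const unsigned int channel_id = c % k;
        return channel_id * num_groups + group_id;
    }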
-
-#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
-
-/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
- *
- * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
- * @note The third dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
- * @note The first dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_X=num. e.g. -DSRC_DIM_X=64
- * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
- * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
- * K is equal to num_channels / num_groups.
- * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER; it is defined as x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst))
-{
- // Offset computation
- const uint curr_out_channel = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); // output feature map
-
- uint z = 0;
- uint batch_id = 0;
- // Compute curr_channel and batch_id
- DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, z);
-
- VEC_DATA_TYPE(uint, VEC_SIZE)
- curr_out_channels = (VEC_DATA_TYPE(uint, VEC_SIZE))(curr_out_channel) + VEC_OFFS(uint, VEC_SIZE);
-
- VEC_DATA_TYPE(uint, VEC_SIZE)
- in_channels = (curr_out_channels * (VEC_DATA_TYPE(uint, VEC_SIZE))(K)) % (VEC_DATA_TYPE(uint, VEC_SIZE))(SRC_DIM_X) + (curr_out_channels / (VEC_DATA_TYPE(uint, VEC_SIZE))(NUM_GROUPS));
-
- // Load the values
- const __global DATA_TYPE *input_ptr = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + z * src_stride_z + batch_id * src_stride_w);
-
-#if VEC_SIZE == 1
- DATA_TYPE out0 = *(input_ptr + in_channels);
-#elif VEC_SIZE == 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out0 =
- {
- *(input_ptr + in_channels.s0),
- *(input_ptr + in_channels.s1)
- };
-#elif VEC_SIZE == 3
- VEC_DATA_TYPE(DATA_TYPE, 3)
- out0 =
- {
- *(input_ptr + in_channels.s0),
- *(input_ptr + in_channels.s1),
- *(input_ptr + in_channels.s2)
- };
-#elif VEC_SIZE == 4
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out0 =
- {
- *(input_ptr + in_channels.s0),
- *(input_ptr + in_channels.s1),
- *(input_ptr + in_channels.s2),
- *(input_ptr + in_channels.s3)
- };
-#elif VEC_SIZE == 8
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0 =
- {
- *(input_ptr + in_channels.s0),
- *(input_ptr + in_channels.s1),
- *(input_ptr + in_channels.s2),
- *(input_ptr + in_channels.s3),
- *(input_ptr + in_channels.s4),
- *(input_ptr + in_channels.s5),
- *(input_ptr + in_channels.s6),
- *(input_ptr + in_channels.s7)
- };
-#elif VEC_SIZE == 16
- VEC_DATA_TYPE(DATA_TYPE, 16)
- out0 =
- {
- *(input_ptr + in_channels.s0),
- *(input_ptr + in_channels.s1),
- *(input_ptr + in_channels.s2),
- *(input_ptr + in_channels.s3),
- *(input_ptr + in_channels.s4),
- *(input_ptr + in_channels.s5),
- *(input_ptr + in_channels.s6),
- *(input_ptr + in_channels.s7),
- *(input_ptr + in_channels.s8),
- *(input_ptr + in_channels.s9),
- *(input_ptr + in_channels.sa),
- *(input_ptr + in_channels.sb),
- *(input_ptr + in_channels.sc),
- *(input_ptr + in_channels.sd),
- *(input_ptr + in_channels.se),
- *(input_ptr + in_channels.sf)
- };
-#endif // VEC_SIZE == 1
-
- __global uchar *output_ptr = dst_ptr + curr_out_channel * sizeof(DATA_TYPE) + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
- STORE_VECTOR_SELECT(out, DATA_TYPE, output_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/col2im.cl
deleted file mode 100644
index 59c2d8a3aa..0000000000
--- a/src/core/CL/cl_kernels/col2im.cl
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
-
-#if ELEMENT_SIZE == 1
-#define COND_DATA_TYPE char
-#elif ELEMENT_SIZE == 2
-#define COND_DATA_TYPE short
-#elif ELEMENT_SIZE == 4
-#define COND_DATA_TYPE int
-#else // ELEMENT_SIZE
-#error "Element size not support"
-#endif // ELEMENT_SIZE
-
-/** This kernel performs a reshaping of the output of the convolution layer
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320
- * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600
- * @note The element size must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4
- * @note The number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void col2im(
- TENSOR3D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, 0);
-
- const uint xd = get_global_id(1) % WIDTH_OUTPUT; // x coordinate of the destination tensor
- const uint yd = get_global_id(1) / WIDTH_OUTPUT; // y coordinate of the destination tensor
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data = vload8(0, (__global DATA_TYPE *)src.ptr);
-
- uint x = get_global_id(0) * 8;
- uint8 x_clamped = x + (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
-
- VEC_DATA_TYPE(COND_DATA_TYPE, 8)
- cond0 = CONVERT((x_clamped < WIDTH_INPUT), VEC_DATA_TYPE(COND_DATA_TYPE, 8));
-
- // Clamp x if out-of-bounds
- x_clamped = select((uint8)x, x_clamped, convert_int8(cond0));
-
- // If out-of-bound, overwrite with the first element
- data = select((VEC_DATA_TYPE(DATA_TYPE, 8))data.s0, data, cond0);
-
-#if NUM_GROUPS > 1
- // Compute output offset (batches on 4th dimension)
- int idx = yd * dst_stride_y + xd * dst_stride_x + (get_global_id(2) / NUM_GROUPS) * dst.stride_w;
-
- const uint group = get_global_id(2) % NUM_GROUPS; // group ID
- x_clamped += group * WIDTH_INPUT;
-#else /* NUM_GROUPS > 1 */
- // Compute output offset (batches on 3rd dimension)
- int idx = yd * dst.stride_y + xd * dst.stride_x + get_global_id(2) * dst.stride_w;
-#endif /* NUM_GROUPS > 1 */
-
- // Store value
- *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s0 * dst.stride_z)) = data.s0;
- *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s1 * dst.stride_z)) = data.s1;
- *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s2 * dst.stride_z)) = data.s2;
- *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s3 * dst.stride_z)) = data.s3;
- *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s4 * dst.stride_z)) = data.s4;
- *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s5 * dst.stride_z)) = data.s5;
- *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s6 * dst.stride_z)) = data.s6;
- *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s7 * dst.stride_z)) = data.s7;
-}
-#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
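col2im undoes the im2col flattening: the X coordinate of the source indexes output channels, while the Y coordinate holds a flattened spatial position. A C sketch of the coordinate mapping used above:

    /* Map GEMM output row y back to spatial coordinates of the destination. */
    static void col2im_coords(unsigned int y, unsigned int width_output,
                              unsigned int *xd, unsigned int *yd)
    {
        *xd = y % width_output; /* x coordinate in the destination tensor */
        *yd = y / width_output; /* y coordinate in the destination tensor */
    }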
diff --git a/src/core/CL/cl_kernels/common/activation_layer.cl b/src/core/CL/cl_kernels/common/activation_layer.cl
new file mode 100644
index 0000000000..a04556a1ed
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/activation_layer.cl
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ACT) && defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#include "activation_float_helpers.h"
+
+/** This performs an activation function on floating point inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void activation_layer(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+ // Get pixels pointer
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+ __global uchar *output_addr = input_addr;
+#else /* IN_PLACE */
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+
+ // Load data
+ TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+ // Perform activation
+ data0 = ACTIVATION(ACT, DATA_TYPE, VEC_SIZE, data0, A_VAL, B_VAL);
+
+ // Store result
+ STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+
+#endif /* defined(ACT) && defined(DATA_TYPE) && defined(VEC_SIZE) */
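The ACT define selects among the activations provided by activation_float_helpers.h. Scalar C sketches of a few of them, assuming the library's usual parameterization where A_VAL and B_VAL supply the a and b constants:

    #include <math.h>

    static float act_relu(float x)                   { return fmaxf(0.0f, x); }
    static float act_lrelu(float x, float a)         { return x > 0.0f ? x : a * x; } /* leaky ReLU */
    static float act_tanh(float x, float a, float b) { return a * tanhf(b * x); }     /* scaled tanh */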
diff --git a/src/core/CL/cl_kernels/common/activation_layer_quant.cl b/src/core/CL/cl_kernels/common/activation_layer_quant.cl
new file mode 100644
index 0000000000..38ee00b17a
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/activation_layer_quant.cl
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_quant_helpers.h"
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+
+#if defined(FLOAT_DOMAIN)
+// Activations performed in the float domain
+
+#include "activation_float_helpers.h"
+
+/** This performs an activation function on quantized inputs with float transformations.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in only if asymmetric with -DO1_VAL= and -DO2_VAL= respectively.
+ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void activation_layer_quant_f32(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+ // Get pixels pointer
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+ __global uchar *output_addr = input_addr;
+#else /* IN_PLACE */
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+
+ // Load data
+ TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+ VEC_FLOAT data_flt = CONVERT(data0, VEC_FLOAT);
+#if defined(O1_VAL)
+ data_flt = round(data_flt - (float)O1_VAL) * ((float)S1_VAL);
+#else // defined(O1_VAL)
+ data_flt = round(data_flt) * ((float)S1_VAL);
+#endif // defined(O1_VAL)
+ data_flt = ACTIVATION(ACT, float, VEC_SIZE, data_flt, A_VAL, B_VAL);
+
+#if defined(O2_VAL)
+ data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)) + (float)O2_VAL, TYPE);
+#else // defined(O2_VAL)
+ data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)), TYPE);
+#endif // defined(O2_VAL)
+
+ // Store result
+ STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
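+
+// Illustrative sketch (editorial, not used by the kernel): for one QASYMM8 value q,
+// the float-domain path above amounts to the following scalar round-trip, where
+// `activation` stands for whichever function ACT selects:
+//   float deq = round(q - O1_VAL) * S1_VAL;                // dequantize with input offset/scale
+//   float act = activation(deq);                           // apply ACT, parameterized by A_VAL/B_VAL
+//   q_out     = convert_sat(round(act / S2_VAL) + O2_VAL); // requantize, add output offset, saturate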
+
+#else // defined(FLOAT_DOMAIN)
+// Activations performed in the quantized domain
+
+#if defined(ACT)
+/** This kernel performs an activation function on quantized inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
+ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void activation_layer_quant(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+ // Get pixels pointer
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+ __global uchar *output_addr = input_addr;
+#else /* IN_PLACE */
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+
+ // Load data
+ TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+ data0 = PERFORM_ACTIVATION_QUANT(ACT, data0);
+
+ // Store result
+ STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(ACT)
+#endif // defined(FLOAT_DOMAIN)
diff --git a/src/core/CL/cl_kernels/common/arg_min_max.cl b/src/core/CL/cl_kernels/common/arg_min_max.cl
new file mode 100644
index 0000000000..413fcf5333
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/arg_min_max.cl
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT)
+
+#define VEC_TYPE_IN VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_TYPE_OUT VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+#define VEC_SELECT_IN SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_SIGNED_INT_IN SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(FLOAT_DATA_TYPE)
+#define ISGREATER(x, y) (VEC_SELECT_IN) isgreater(x, y)
+#define ISLESS(x, y) (VEC_SELECT_IN) isless(x, y)
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) ((x > y) ? 1 : 0)
+#define ISLESS(x, y) ((x < y) ? 1 : 0)
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x > y))
+#define ISLESS(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x < y))
+#endif // defined(WIDTH)
+#endif // defined(FLOAT_DATA_TYPE)
+
+#if defined(ARG_MAX)
+#define CONDITION_TO_USE(x, y) ISGREATER(x, y)
+#elif defined(ARG_MIN)
+#define CONDITION_TO_USE(x, y) ISLESS(x, y)
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
+#error "Unsupported reduction operation!"
+#endif // defined(ARG_MAX)
+
+#if defined(WIDTH)
+
+#if defined(ARG_MAX)
+#define VECTOR_PREDICATE_EQ(x, y) ((x) >= (y))
+#define VECTOR_PREDICATE(x, y) ((x) > (y))
+#define SCALAR_SELECT_OP(x, y) (((x) > (y)) ? (x) : (y))
+#elif defined(ARG_MIN)
+#define VECTOR_PREDICATE_EQ(x, y) ((x) <= (y))
+#define VECTOR_PREDICATE(x, y) ((x) < (y))
+#define SCALAR_SELECT_OP(x, y) (((x) < (y)) ? (x) : (y))
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
+#error "Unsupported reduction operation!"
+#endif // defined(ARG_MAX)
+
+inline void vectorized_compute_arg_min_max_2(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx, VEC_DATA_TYPE(DATA_TYPE, 2) in, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) res)
+{
+ if( VECTOR_PREDICATE_EQ(in.s0,in.s1) )
+ {
+ *min_max_val = in.s0;
+ *min_max_idx = res.s0;
+ }
+ else
+ {
+ *min_max_val = in.s1;
+ *min_max_idx = res.s1;
+ }
+}
+
+inline void vectorized_compute_arg_min_max_4(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx, VEC_DATA_TYPE(DATA_TYPE, 4) in, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 4) res)
+{
+ VEC_DATA_TYPE(COND_DATA_TYPE, 2)
+ idx_sel = VECTOR_PREDICATE_EQ(in.s01, in.s23);
+ in.s01 = select(in.s23, in.s01, idx_sel);
+ res.s01 = select(res.s23, res.s01, CONVERT(idx_sel, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) ));
+ idx_sel.s0 = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
+ res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
+ *min_max_val = SCALAR_SELECT_OP(in.s0, in.s1);
+ *min_max_idx = res.s0;
+}
+
+inline void vectorized_compute_arg_min_max_8(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx, VEC_DATA_TYPE(DATA_TYPE, 8) in, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 8) res)
+{
+ VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+ idx_sel = VECTOR_PREDICATE_EQ(in.s0123, in.s4567);
+ in.s0123 = select(in.s4567, in.s0123, idx_sel);
+ res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 4) ));
+ idx_sel.s01 = (VECTOR_PREDICATE(in.s01, in.s23)) || (in.s01 == in.s23 && CONVERT(((res.s01 < res.s23)), VEC_DATA_TYPE(COND_DATA_TYPE, 2)));
+ in.s01 = select(in.s23, in.s01, idx_sel.s01);
+ res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) ));
+ idx_sel.s0 = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
+ res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
+ *min_max_val = SCALAR_SELECT_OP(in.s0, in.s1);
+ *min_max_idx = res.s0;
+}
+
+inline void vectorized_compute_arg_min_max_16(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx, VEC_DATA_TYPE(DATA_TYPE, 16) in, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) res)
+{
+ VEC_DATA_TYPE(COND_DATA_TYPE, 8)
+ idx_sel = VECTOR_PREDICATE_EQ(in.s01234567, in.s89abcdef);
+ in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
+ res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 8) ));
+ idx_sel.s0123 = VECTOR_PREDICATE(in.s0123, in.s4567) || (in.s0123 == in.s4567 && CONVERT(((res.s0123 < res.s4567)), VEC_DATA_TYPE(COND_DATA_TYPE, 4)));
+ in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
+ res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 4) ));
+ idx_sel.s01 = (VECTOR_PREDICATE(in.s01, in.s23)) || (in.s01 == in.s23 && CONVERT(((res.s01 < res.s23)), VEC_DATA_TYPE(COND_DATA_TYPE, 2)));
+ in.s01 = select(in.s23, in.s01, idx_sel.s01);
+ res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) ));
+ idx_sel.s0 = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
+ res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
+ *min_max_val = SCALAR_SELECT_OP(in.s0, in.s1);
+ *min_max_idx = res.s0;
+}
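+
+// Illustrative note: the helpers above implement a pairwise tree reduction over
+// the vector lanes. For ARG_MIN with VEC_SIZE == 4, for instance:
+//   step 1: lanes {0,1} are compared against lanes {2,3}; the smaller values
+//           and their indices survive in lanes {0,1}
+//   step 2: lane 0 is compared against lane 1; on equal values the lower
+//           index wins
+// leaving the reduced value/index pair in lane 0.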
+
+inline void scalar_compute_global_min_max(DATA_TYPE in_val, int idx, DATA_TYPE *out_min_max_val, DATA_TYPE_OUTPUT *out_idx)
+{
+#if defined(ARG_MAX)
+ if(in_val > *out_min_max_val)
+#else // defined(ARG_MAX)
+ if(in_val < *out_min_max_val)
+#endif // defined(ARG_MAX)
+ {
+ *out_min_max_val = in_val;
+ *out_idx = idx;
+ }
+}
+
+#if VEC_SIZE > 1
+#if VEC_SIZE == 16
+ #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_16(min_max_val,min_max_idx,in,res)
+#elif VEC_SIZE == 8 // #if VEC_SIZE == 16
+ #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_8(min_max_val,min_max_idx,in,res)
+#elif VEC_SIZE == 4 // # elif VEC_SIZE == 8
+ #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_4(min_max_val,min_max_idx,in,res)
+#elif VEC_SIZE == 2 // elif VEC_SIZE == 4
+ #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_2(min_max_val,min_max_idx,in,res)
+#else // elif VEC_SIZE == 2
+ #error "Not supported"
+#endif // #if VEC_SIZE == 16
+
+inline VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE) init_idx_vector()
+{
+#if VEC_SIZE == 16
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+ vidx = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+#elif VEC_SIZE == 8 // #if VEC_SIZE == 16
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+ vidx = { 0, 1, 2, 3, 4, 5, 6, 7 };
+#elif VEC_SIZE == 4 // elif VEC_SIZE == 8
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+ vidx = { 0, 1, 2, 3 };
+#elif VEC_SIZE == 2 // elif VEC_SIZE == 4
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+ vidx = { 0, 1 };
+#else // elif VEC_SIZE == 2
+#error "Not supported"
+#endif // #if VEC_SIZE == 16
+ return vidx;
+}
+#endif // VEC_SIZE > 1
+
+/** This kernel performs a reduction on the x-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
+ * @note The data type used for comparing the indices must be passed at compile time using -DCOND_DATA_TYPE: e.g. -DCOND_DATA_TYPE=uint
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arg_min_max_x(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ __global DATA_TYPE *input_addr = (__global DATA_TYPE *)(input_ptr + input_offset_first_element_in_bytes + get_global_id(1) * input_stride_y);
+ __global DATA_TYPE_OUTPUT *output_addr = (__global DATA_TYPE_OUTPUT *)(output_ptr + output_offset_first_element_in_bytes + get_global_id(1) * output_stride_y);
+
+ DATA_TYPE final_value = input_addr[0];
+ DATA_TYPE_OUTPUT final_idx = 0;
+
+#if VEC_SIZE > 1
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+ vidx = init_idx_vector();
+
+ int x = 0;
+ for(; x <= (WIDTH - VEC_SIZE); x += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ vals = VLOAD(VEC_SIZE)(0, (input_addr + x));
+ DATA_TYPE local_min_max_value;
+ DATA_TYPE_OUTPUT local_min_max_idx;
+
+ VECTORIZED_OP(&local_min_max_value, &local_min_max_idx, vals, vidx);
+ local_min_max_idx += x;
+ scalar_compute_global_min_max(local_min_max_value, local_min_max_idx, &final_value, &final_idx);
+ }
+#endif // VEC_SIZE > 1
+
+#if(WIDTH % VEC_SIZE)
+ LOOP_UNROLLING(int, j, 0, 1, WIDTH % VEC_SIZE,
+ {
+ scalar_compute_global_min_max(*(input_addr + j + x), j + x, &final_value, &final_idx);
+ })
+#endif // (WIDTH % VEC_SIZE)
+
+ output_addr[0] = final_idx;
+}
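+
+// Illustrative note: the tail loop above visits the last WIDTH % VEC_SIZE
+// elements one by one; since the trip count is a compile-time constant,
+// LOOP_UNROLLING expands the loop fully at compile time.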
+#endif // defined(WIDTH)
+
+#if defined(HEIGHT)
+/** This kernel performs a reduction on the y-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arg_min_max_y(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y;
+
+ VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN);
+
+ VEC_TYPE_OUT indx0 = 0;
+ for(DATA_TYPE_OUTPUT y = 1; y < HEIGHT; ++y)
+ {
+ VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_TYPE_IN);
+
+ VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
+ indx0 = select(indx0, (VEC_TYPE_OUT)y, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
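+
+// Worked example (illustrative) of the x_offs computation above, assuming a
+// first dimension of 10 with VEC_SIZE = 4, i.e. VEC_SIZE_LEFTOVER = 2:
+//   work-item 0: x_offs = max(0 - 2, 0) = 0 -> partial store of 2 lanes
+//   work-item 1: x_offs = max(4 - 2, 0) = 2 -> full store of lanes 2..5
+//   work-item 2: x_offs = max(8 - 2, 0) = 6 -> full store of lanes 6..9
+// Later work-items shift back by (VEC_SIZE - VEC_SIZE_LEFTOVER), so every
+// element is covered without out-of-bounds accesses.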
+#endif // defined(HEIGHT)
+
+#if defined(DEPTH) && !defined(BATCH)
+/** This kernel performs a reduction on the z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arg_min_max_z(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+
+ VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN);
+
+ VEC_TYPE_OUT indx0 = 0;
+ for(DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
+ {
+ VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_TYPE_IN);
+
+ VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
+ indx0 = select(indx0, (VEC_TYPE_OUT)z, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(DEPTH) && !defined(BATCH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs a reduction on the w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arg_min_max_w(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + (get_global_id(2) % DEPTH) * input_stride_z +
+ (get_global_id(2) / DEPTH) * input_stride_w;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y +
+ (get_global_id(2) % DEPTH) * output_stride_z + (get_global_id(2) / DEPTH) * output_stride_w;
+
+ VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN);
+
+ VEC_TYPE_OUT indx0 = 0;
+ for(DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
+ {
+ VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + w * input_stride_w)), VEC_TYPE_IN);
+
+ VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
+ indx0 = select(indx0, (VEC_TYPE_OUT)w, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT)
diff --git a/src/core/CL/cl_kernels/common/batchnormalization_layer.cl b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl
new file mode 100644
index 0000000000..18f54907df
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(EPSILON)
+/** OpenCL kernel to fuse the weights of convolution or depthwise convolution layer with batch normalization when the data layout is either NCHW or NHWC
+ *
+ * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension
+ * @note Data type should be passed at compile time using -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16.
+ * For depthwise convolution weights, do not pass DIM2
+ * @note Data layout NHWC should be passed at compile time with -DNHWC. For data layout NCHW it is not required to pass any parameter
+ * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f
+ *
+ * @param[in] w_ptr Pointer to the weights tensor. Supported data types: F16/F32
+ * @param[in] w_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] w_step_x w_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] w_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] w_step_y w_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] w_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] w_step_z w_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] w_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] b_ptr (Optional) Pointer to the bias tensor. Supported data types: same as @p w_ptr
+ * @param[in] b_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] b_step_x (Optional) b_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] b_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] b_step_y (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] b_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] b_step_z (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] b_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p w_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p w_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[out] w_fused_ptr (Optional) Pointer to the destination weights tensors. Supported data types: same as @p w_ptr
+ * @param[in] w_fused_stride_x (Optional) Stride of the destination weights tensor in X dimension (in bytes)
+ * @param[in] w_fused_step_x (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] w_fused_stride_y (Optional) Stride of the destination weights tensor in Y dimension (in bytes)
+ * @param[in] w_fused_step_y (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] w_fused_stride_z (Optional) Stride of the destination weights tensor in Z dimension (in bytes)
+ * @param[in] w_fused_step_z (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor
+ * @param[in] b_fused_ptr (Optional) Pointer to the destination bias tensor. Supported data types: same as @p w_ptr
+ * @param[in] b_fused_stride_x (Optional) Stride of the destination bias tensor in X dimension (in bytes)
+ * @param[in] b_fused_step_x (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor
+ * @param[in] beta_ptr (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr
+ * @param[in] beta_stride_x (Optional) Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr (Optional) Pointer to the gamma source tensor. Supported data types: same as @p w_ptr
+ * @param[in] gamma_stride_x (Optional) Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in the gamma source tensor
+ */
+__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w),
+#if defined(BIAS)
+ VECTOR_DECLARATION(b),
+#endif // defined(BIAS)
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var)
+#ifndef IN_PLACE_W
+ ,
+ TENSOR3D_DECLARATION(w_fused)
+#endif // ifndef IN_PLACE_W
+#ifndef IN_PLACE_B
+ ,
+ VECTOR_DECLARATION(b_fused)
+#endif // ifndef IN_PLACE_B
+#if defined(BETA)
+ ,
+ VECTOR_DECLARATION(beta)
+#endif // defined(BETA)
+#if defined(GAMMA)
+ ,
+ VECTOR_DECLARATION(gamma)
+#endif // defined(GAMMA)
+ )
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+#if defined(DIM2)
+ int c0 = z % DIM2;
+ int c1 = z / DIM2;
+#else // ! defined(DIM2)
+ int c0 = 0;
+#if defined(NHWC)
+ int c1 = x;
+#else // defined(NHWC)
+ int c1 = z;
+#endif // defined(NHWC)
+#endif // defined(DIM2)
+
+ int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z;
+ int v_offset = c1 * sizeof(DATA_TYPE);
+
+ DATA_TYPE w_old = 0.0f;
+ DATA_TYPE b_old = 0.0f;
+ DATA_TYPE w_new = 0.0f;
+ DATA_TYPE b_new = 0.0f;
+ DATA_TYPE gamma = 1.0f;
+ DATA_TYPE mean = 0.0f;
+ DATA_TYPE var = 1.0f;
+ DATA_TYPE beta = 0.0f;
+
+ w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes));
+ var = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes));
+ mean = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes));
+
+#if defined(GAMMA)
+ gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes));
+#endif // defined(GAMMA)
+
+ // Compute new weight
+ w_new = (gamma * w_old) / (sqrt(var + EPSILON));
+
+#if defined(IN_PLACE_W)
+ *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new;
+#else // defined(IN_PLACE_W)
+ *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new;
+#endif // defined(IN_PLACE_W)
+
+ // Compute bias
+#if !defined(DIM2) && defined(NHWC)
+ if(z == 0 && y == 0)
+#else // !defined(DIM2) && defined(NHWC)
+ if(x == 0 && y == 0 && c0 == 0)
+#endif // !defined(DIM2) && defined(NHWC)
+ {
+#if defined(BIAS)
+ b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes));
+#endif // defined(BIAS)
+#if defined(BETA)
+ beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes));
+#endif // defined(BETA)
+
+ b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta;
+
+#if defined(BIAS)
+
+#if defined(IN_PLACE_B)
+ *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new;
+#else // defined(IN_PLACE_B)
+ *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
+#endif // defined(IN_PLACE_B)
+
+#else // defined(BIAS)
+
+#ifndef IN_PLACE_B
+ *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
+#endif // ifndef IN_PLACE_B
+
+#endif // defined(BIAS)
+ }
+}
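+
+// Illustrative derivation (editorial): the folding above follows from
+// substituting y = w * x + b into the batch normalization formula:
+//   BN(y) = gamma * (y - mean) / sqrt(var + EPSILON) + beta
+//         = (gamma * w / sqrt(var + EPSILON)) * x
+//           + gamma * (b - mean) / sqrt(var + EPSILON) + beta
+// which gives w_new and b_new exactly as computed in the kernel body.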
+#endif // defined(DATA_TYPE) && defined(EPSILON) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/bitwise_op.cl b/src/core/CL/cl_kernels/common/bitwise_op.cl
new file mode 100644
index 0000000000..e142c1d275
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/bitwise_op.cl
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+
+/** This function computes the bitwise OR of two input images.
+ *
+ * @note The following variables must be passed at compile time:
+ * -# -DVEC_SIZE : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_or(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+ // Get pixels pointer
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
+
+ // Load data
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr);
+
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ data0 = in_a | in_b;
+
+ // Boundary-aware store
+ STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+
+/** This function computes the bitwise AND of two input images.
+ *
+ * @note The following variables must be passed at compile time:
+ * -# -DVEC_SIZE : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_and(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+ // Get pixels pointer
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
+
+ // Load data
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr);
+
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ data0 = in_a & in_b;
+
+ // Boundary-aware store
+ STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+
+/** This function computes the bitwise XOR of two input images.
+ *
+ * @note The following variables must be passed at compile time:
+ * -# -DVEC_SIZE : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_xor(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+ // Get pixels pointer
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
+
+ // Load data
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr);
+
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ data0 = in_a ^ in_b;
+
+ // Boundary-aware store
+ STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+
+/** This function computes the bitwise NOT of an image.
+ *
+ * @note The following variables must be passed at compile time:
+ * -# -DVEC_SIZE : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_not(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(out))
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+ // Get pixels pointer
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
+
+ // Load data
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
+
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ data0 = ~in_a;
+
+ // Boundary-aware store
+ STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/bounding_box_transform.cl b/src/core/CL/cl_kernels/common/bounding_box_transform.cl
index f2e9cb0ed0..f2e9cb0ed0 100644
--- a/src/core/CL/cl_kernels/bounding_box_transform.cl
+++ b/src/core/CL/cl_kernels/common/bounding_box_transform.cl
diff --git a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl
index c1d45a56b9..c1d45a56b9 100644
--- a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl
+++ b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl
diff --git a/src/core/CL/cl_kernels/cast.cl b/src/core/CL/cl_kernels/common/cast.cl
index 036a683ec7..036a683ec7 100644
--- a/src/core/CL/cl_kernels/cast.cl
+++ b/src/core/CL/cl_kernels/common/cast.cl
diff --git a/src/core/CL/cl_kernels/common/col2im.cl b/src/core/CL/cl_kernels/common/col2im.cl
new file mode 100644
index 0000000000..4dc005fd43
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/col2im.cl
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not supported"
+#endif // ELEMENT_SIZE
+
+/** This kernel performs a reshaping of the output of the convolution layer
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320
+ * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600
+ * @note The element size must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4
+ * @note The number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void col2im(
+ TENSOR3D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst);
+
+ const uint xd = get_global_id(1) % WIDTH_OUTPUT; // x coordinate of the destination tensor
+ const uint yd = get_global_id(1) / WIDTH_OUTPUT; // y coordinate of the destination tensor
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+ uint x = get_global_id(0) * 8;
+ uint8 x_clamped = x + (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
+
+ VEC_DATA_TYPE(COND_DATA_TYPE, 8)
+ cond0 = CONVERT((x_clamped < WIDTH_INPUT), VEC_DATA_TYPE(COND_DATA_TYPE, 8));
+
+ // Clamp x if out-of-bounds
+ x_clamped = select((uint8)x, x_clamped, convert_int8(cond0));
+
+ // If out-of-bound, overwrite with the first element
+ data = select((VEC_DATA_TYPE(DATA_TYPE, 8))data.s0, data, cond0);
+
+#if NUM_GROUPS > 1
+ // Compute output offset (batches on 4th dimension)
+ int idx = yd * dst_stride_y + xd * dst_stride_x + (get_global_id(2) / NUM_GROUPS) * dst.stride_w;
+
+ const uint group = get_global_id(2) % NUM_GROUPS; // group ID
+ x_clamped += group * WIDTH_INPUT;
+#else /* NUM_GROUPS > 1 */
+ // Compute output offset (batches on 3rd dimension)
+ int idx = yd * dst.stride_y + xd * dst.stride_x + get_global_id(2) * dst.stride_w;
+#endif /* NUM_GROUPS > 1 */
+
+ // Store value
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s0 * dst.stride_z)) = data.s0;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s1 * dst.stride_z)) = data.s1;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s2 * dst.stride_z)) = data.s2;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s3 * dst.stride_z)) = data.s3;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s4 * dst.stride_z)) = data.s4;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s5 * dst.stride_z)) = data.s5;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s6 * dst.stride_z)) = data.s6;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s7 * dst.stride_z)) = data.s7;
+}
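+
+// Illustrative note: each work-item loads 8 consecutive values of one GEMM
+// output row (one value per output channel) and scatters them along the
+// channel (z) dimension of the destination at spatial position (xd, yd);
+// lanes past WIDTH_INPUT are clamped so that they harmlessly re-write the
+// block's first element instead of accessing out-of-bounds memory.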
+#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
diff --git a/src/core/CL/cl_kernels/common/comparisons.cl b/src/core/CL/cl_kernels/common/comparisons.cl
new file mode 100644
index 0000000000..00bb491f85
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/comparisons.cl
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define EQUAL(x, y) ((x) == (y))
+#define NOTEQUAL(x, y) ((x) != (y))
+#define GREATER(x, y) ((x) > (y))
+#define GREATEREQUAL(x, y) ((x) >= (y))
+#define LESS(x, y) ((x) < (y))
+#define LESSEQUAL(x, y) ((x) <= (y))
+
+#ifdef IS_QUANTIZED
+# define DEFINE_KERNEL_STR(name) compare_##name##_quantized
+#else // IS_QUANTIZED
+# define DEFINE_KERNEL_STR(name) compare_##name
+#endif // IS_QUANTIZED
+
+#define DEFINE_KERNEL(name) DEFINE_KERNEL_STR(name)
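+// Example expansion: with -DOP_NAME=less, DEFINE_KERNEL(OP_NAME) yields the kernel name
+// compare_less, or compare_less_quantized when -DIS_QUANTIZED is also defined.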
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME)
+/** This function compares two tensors.
+ *
+ * @attention The inputs' data type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The comparison operation and its name should be given as preprocessor arguments using -DOP=operation and -DOP_NAME=name. e.g. -DOP=LESS -DOP_NAME=less
+ *
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All non-quantized data types.
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void DEFINE_KERNEL(OP_NAME)(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ int dst_x = max((int)get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0);
+
+#if VEC_SIZE_IN1 == 1
+ int in1_x = 0;
+#else // VEC_SIZE_IN1 == 1
+ int in1_x = dst_x;
+#endif // VEC_SIZE_IN1 == 1
+
+#if VEC_SIZE_IN2 == 1
+ int in2_x = 0;
+#else // VEC_SIZE_IN2 == 1
+ int in2_x = dst_x;
+#endif // VEC_SIZE_IN2 == 1
+
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ in1_ptr += in1_offset_first_element_in_bytes + z * in1_stride_z + y * in1_stride_y + in1_x * sizeof(DATA_TYPE);
+ in2_ptr += in2_offset_first_element_in_bytes + z * in2_stride_z + y * in2_stride_y + in2_x * sizeof(DATA_TYPE);
+ out_ptr += out_offset_first_element_in_bytes + z * out_stride_z + y * out_stride_y + dst_x * sizeof(uchar);
+
+ // Load values
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_a = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_b = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_ptr);
+
+ // Calculate and store result
+#ifdef IS_QUANTIZED
+ VEC_DATA_TYPE(int, VEC_SIZE) in_a_i32 = CONVERT(in_a, VEC_DATA_TYPE(int, VEC_SIZE));
+ VEC_DATA_TYPE(int, VEC_SIZE) in_b_i32 = CONVERT(in_b, VEC_DATA_TYPE(int, VEC_SIZE));
+
+ VEC_DATA_TYPE(float, VEC_SIZE) in_a_fp = CONVERT(in_a_i32 - OFFSET_IN1, VEC_DATA_TYPE(float, VEC_SIZE)) * SCALE_IN1;
+ VEC_DATA_TYPE(float, VEC_SIZE) in_b_fp = CONVERT(in_b_i32 - OFFSET_IN2, VEC_DATA_TYPE(float, VEC_SIZE)) * SCALE_IN2;
+#else // IS_QUANTIZED
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_a_fp = in_a;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_b_fp = in_b;
+#endif // IS_QUANTIZED
+
+#if VEC_SIZE == 1
+ uchar res0 = (uchar)select(0, 255, OP(in_a_fp, in_b_fp));
+#else // VEC_SIZE == 1
+ VEC_DATA_TYPE(uchar, VEC_SIZE) res0 = CONVERT(OP(in_a_fp, in_b_fp), VEC_DATA_TYPE(uchar, VEC_SIZE));
+#endif // VEC_SIZE == 1
+
+ STORE_VECTOR_SELECT(res, uchar, out_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
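+
+// Illustrative build options (values chosen for the example): an elementwise '<' between two
+// F32 tensors could be compiled with
+//   -DDATA_TYPE=float -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=0 -DVEC_SIZE_IN1=16 -DVEC_SIZE_IN2=16 -DOP=LESS -DOP_NAME=less
+// giving a kernel named compare_less that writes 255 where in1 < in2 and 0 elsewhere.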
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) */
diff --git a/src/core/CL/cl_kernels/common/concatenate.cl b/src/core/CL/cl_kernels/common/concatenate.cl
new file mode 100644
index 0000000000..dc7210a4c4
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/concatenate.cl
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_QUANT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset, float in_scale, float out_scale)
+{
+ const VEC_FLOAT in_f32 = (CONVERT(input, VEC_FLOAT) - (VEC_FLOAT)((float)in_offset)) * (VEC_FLOAT)((float)in_scale);
+ const VEC_FLOAT out_f32 = in_f32 / ((VEC_FLOAT)(float)out_scale) + ((VEC_FLOAT)((float)out_offset));
+ const VEC_QUANT res_q8 = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT), VEC_QUANT);
+ return res_q8;
+}
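+// Worked example (illustrative values): requantizing q = 10 from (offset 0, scale 0.5) to
+// (offset 3, scale 0.25): real = (10 - 0) * 0.5 = 5.0, then q' = 5.0 / 0.25 + 3 = 23,
+// rounded to nearest even and saturated to the range of DATA_TYPE.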
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+#if defined(DATA_TYPE)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(ELEMENT_SIZE)
+
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define SEQ VEC_OFFS(int, VEC_SIZE)
+
+#if defined(CONCATENATE_WIDTH_X2)
+/** This kernel concatenates two input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ *
+ * @param[in] src1_ptr Pointer to the source tensor. Supported data types: All.
+ * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src1_step_w src1_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src2_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src2_step_w src2_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] DEPTH Tensor depth
+ * @param[in] INPUT1_WIDTH First input tensor width
+ */
+__kernel void concatenate_width_x2(
+ TENSOR4D_DECLARATION(src1),
+ TENSOR4D_DECLARATION(src2),
+ TENSOR4D_DECLARATION(dst),
+ const int DEPTH,
+ const int INPUT1_WIDTH)
+{
+ // Calculate input indices
+ const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % (int)DEPTH;
+ const int w = get_global_id(2) / (int)DEPTH;
+ const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE);
+ const int x2 = max(x - (int)INPUT1_WIDTH, 0);
+
+ // Calculate inputs and output addresses
+ const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w;
+ const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+ const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+
+ VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr);
+ VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
+ src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+ src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
+ const VEC_INT x_coords = SEQ + (VEC_INT)(x);
+
+ // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values.
+ SELECT_TYPE cond = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH) && ((VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE);
+ src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond);
+ src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond);
+
+ cond = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE);
+ const VEC_TYPE values0 = select(src2_values, src1_values, cond);
+
+ STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
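+
+// Illustrative walkthrough: with VEC_SIZE = 4 and INPUT1_WIDTH = 6, the work-item covering
+// x = 4 needs columns 4-5 from src1 and 6-7 from src2. Since x1 = min(4, 6 - 4) = 2, the src1
+// load starts at column 2; rotating both vectors by INPUT1_ROTATE_N (computed host-side)
+// re-aligns the wanted lanes before the final select keeps src1 lanes where x_coords < INPUT1_WIDTH.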
+#endif // defined(CONCATENATE_WIDTH_X2)
+
+#if defined(CONCATENATE_WIDTH_X4)
+/** This kernel concatenates four input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ *
+ * @param[in] src1_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src1_step_w src1_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src2_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src2_step_w src2_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] src3_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in] src3_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src3_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src3_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src3_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src3_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src3_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src3_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src3_step_w src3_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src3_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] src4_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in] src4_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src4_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src4_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src4_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src4_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src4_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src4_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src4_step_w src4_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src4_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] DEPTH Tensor depth
+ * @param[in] INPUT1_WIDTH First input tensor width
+ * @param[in] INPUT2_WIDTH Second input tensor width
+ * @param[in] INPUT3_WIDTH Third input tensor width
+ */
+__kernel void concatenate_width_x4(
+ TENSOR4D_DECLARATION(src1),
+ TENSOR4D_DECLARATION(src2),
+ TENSOR4D_DECLARATION(src3),
+ TENSOR4D_DECLARATION(src4),
+ TENSOR4D_DECLARATION(dst),
+ const int DEPTH,
+ const int INPUT1_WIDTH,
+ const int INPUT2_WIDTH,
+ const int INPUT3_WIDTH)
+{
+ // Calculate input indices
+ const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % (int)DEPTH;
+ const int w = get_global_id(2) / (int)DEPTH;
+
+ const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE);
+ const int x2 = min(max(x - (int)INPUT1_WIDTH, 0), (int)INPUT2_WIDTH - (int)VEC_SIZE);
+ const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, 0), (int)INPUT3_WIDTH - (int)VEC_SIZE);
+ const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, 0);
+
+ // Calculate inputs and output addresses
+ const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w;
+ const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+ const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+ const __global uchar *src3_addr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * sizeof(DATA_TYPE) + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w;
+ const __global uchar *src4_addr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * sizeof(DATA_TYPE) + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w;
+
+ VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr);
+ VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr);
+ VEC_TYPE src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src3_addr);
+ VEC_TYPE src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src4_addr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4)
+ src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+ src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
+ src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT);
+ src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, SCALE_IN4, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */
+
+ const VEC_INT x_coords = SEQ + (VEC_INT)(x);
+
+ SELECT_TYPE cond_in2 = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE);
+ SELECT_TYPE cond_in3 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH - VEC_SIZE)), SELECT_TYPE);
+ SELECT_TYPE cond_in4 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH - VEC_SIZE)), SELECT_TYPE);
+
+ // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values.
+ src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2);
+ src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2);
+ // Rotate src2/3_values, if values0 is a combination of src2_values and src3_values.
+ src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3);
+ src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3);
+ // Rotate src3/4_values, if values0 is a combination of src3_values and src4_values.
+ src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4);
+ src4_values = select(src4_values, ROTATE(src4_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4);
+
+ cond_in2 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE);
+ cond_in3 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH), SELECT_TYPE);
+ cond_in4 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), SELECT_TYPE);
+
+ VEC_TYPE values0 = select(src2_values, src1_values, cond_in2);
+ values0 = select(src3_values, values0, cond_in3);
+ values0 = select(src4_values, values0, cond_in4);
+
+ STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(CONCATENATE_WIDTH_X4) */
+#endif /* defined(ELEMENT_SIZE) */
+
+#if defined(WIDTH_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+#if defined(CONCATENATE_WIDTH)
+/** This kernel concatenates the input tensor into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] DEPTH Tensor depth
+ */
+__kernel void concatenate_width(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+ const int DEPTH)
+{
+ // Calculate input indices
+ const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % (int)DEPTH;
+ const int w = get_global_id(2) / (int)DEPTH;
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + z * src_stride_z + w * src_stride_w;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + w * dst_stride_w;
+
+ VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+ const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+ STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+ STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+}
+#endif /* defined(CONCATENATE_WIDTH) */
+#endif /* defined(WIDTH_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)*/
+
+#if defined(VEC_SIZE_LEFTOVER)
+#if defined(CONCATENATE_HEIGHT)
+#if defined(HEIGHT_OFFSET) && defined(VEC_SIZE)
+/** This kernel concatenates the input tensor into the output tensor along the second dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Vector sizes supported are 2, 4, 8 and 16.
+ * @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] DEPTH Tensor depth
+ */
+__kernel void concatenate_height(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+ const int DEPTH)
+{
+ const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + (get_global_id(2) % DEPTH) * src_stride_z + (get_global_id(
+ 2) / DEPTH) * src_stride_w;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + (get_global_id(2) % DEPTH) * dst_stride_z + (get_global_id(
+ 2) / DEPTH) * dst_stride_w;
+
+ VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+ const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+ STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+ STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+}
+#endif /* defined(HEIGHT_OFFSET) && defined(VEC_SIZE) */
+#endif /* defined(CONCATENATE_HEIGHT) */
+
+#if defined(CONCATENATE)
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] offset The offset to the first valid element of the output tensor in bytes
+ */
+__kernel void concatenate(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ int offset)
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
+ VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+ source_values0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+ STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + offset, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
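+
+// Usage note (illustrative): 'offset' is passed from the host in bytes along the concatenation
+// axis; e.g. when concatenating along Z, the third input would receive
+// offset = (depth_of_input1 + depth_of_input2) * dst_stride_z.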
+#endif // defined(CONCATENATE)
+#endif /* defined(VEC_SIZE_LEFTOVER) */
+#endif /* defined(DATA_TYPE) */
+#endif /* defined(VEC_SIZE) */
diff --git a/src/core/CL/cl_kernels/common/convert_fc_weights.cl b/src/core/CL/cl_kernels/common/convert_fc_weights.cl
new file mode 100644
index 0000000000..01ef04a7d6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/convert_fc_weights.cl
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2)
+/** Perform a NCHW -> NHWC or NHWC -> NCHW conversion for Fully Connected 2D weights.
+ *
+ * For NCHW -> NHWC, FACTOR_1 will be equal to the product of the first two dimensions of FullyConnectedLayer's input and FACTOR_2 will represent the number of channels of that tensor.
+ * For NHWC -> NCHW, FACTOR_1 and FACTOR_2 will hold the same values, but swapped.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Original input tensor width*height and depth should be given as a preprocessor argument using -DFACTOR_1=size and -DFACTOR_2=size for NCHW and vice versa for NHWC. e.g. -DFACTOR_1=256 and -DFACTOR_2=128
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: All.
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convert_fc_weights(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_x + (get_global_id(1) % FACTOR_1 * FACTOR_2 + get_global_id(1) / FACTOR_1) * dst_stride_y;
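+ // Illustrative mapping: with FACTOR_1 = 4 and FACTOR_2 = 2, source row y = 5 lands in
+ // destination row (5 % 4) * 2 + 5 / 4 = 3; each group of FACTOR_1 consecutive source rows
+ // is spread FACTOR_2 rows apart in the destination.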
+
+ *((__global DATA_TYPE *)dst_addr) = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2)
diff --git a/src/core/CL/cl_kernels/common/convolution_layer.cl b/src/core/CL/cl_kernels/common/convolution_layer.cl
new file mode 100644
index 0000000000..be76929ac8
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/convolution_layer.cl
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(NUM_GROUPS)
+/** This kernel reshapes the tensor's lowest three dimensions into a single column
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note The number of groups should be given as a preprocessor argument using -DNUM_GROUPS=number. e.g. -DNUM_GROUPS=2
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] bias_ptr Pointer to the bias tensor. Supported data types: F16/F32, for quantized types this must be nullptr
+ * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] width The width of the input tensor
+ * @param[in] height The height of the input tensor
+ * @param[in] depth The depth of the input tensor
+ * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ */
+__kernel void reshape_to_columns(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(bias),
+#endif /* HAS_BIAS */
+ uint width, uint height, uint depth, uint total_filters, uint dst_stride_z)
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ bool is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1));
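+ // When HAS_BIAS is defined, only the last work-item writes the bias row, so the same bias
+ // values are not written redundantly by every work-item in the grid.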
+
+ __global uchar *tmp_src_ptr = src.ptr;
+ __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y + get_global_id(1) * width * dst_stride_y + get_global_id(
+ 2) * width * height * dst_stride_y;
+#ifdef HAS_BIAS
+ __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes;
+#endif /* HAS_BIAS */
+
+ if(is_last_thread)
+ {
+ for(uint g = 0; g < NUM_GROUPS; ++g)
+ {
+ __global uchar *curr_group_dst = tmp_dst_ptr;
+
+ for(uint i = 0; i < total_filters / NUM_GROUPS; ++i)
+ {
+ *((__global DATA_TYPE *)curr_group_dst) = *((__global DATA_TYPE *)tmp_src_ptr);
+
+#ifdef HAS_BIAS
+ *((__global DATA_TYPE *)(curr_group_dst + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr));
+ tmp_bias_ptr += bias_stride_x;
+#endif /* HAS_BIAS */
+ tmp_src_ptr += depth * src_stride_z;
+ curr_group_dst += dst_stride_x;
+ }
+
+ tmp_dst_ptr += dst_stride_z;
+ }
+ }
+ else
+ {
+ for(uint g = 0; g < NUM_GROUPS; ++g)
+ {
+ __global uchar *curr_group_dst = tmp_dst_ptr;
+
+ for(uint i = 0; i < total_filters / NUM_GROUPS; ++i)
+ {
+ *((__global DATA_TYPE *)curr_group_dst) = *((__global DATA_TYPE *)tmp_src_ptr);
+ tmp_src_ptr += depth * src_stride_z;
+ curr_group_dst += dst_stride_x;
+ }
+
+ tmp_dst_ptr += dst_stride_z;
+ }
+ }
+}
+#endif // defined(DATA_TYPE) && defined(NUM_GROUPS)
diff --git a/src/core/CL/cl_kernels/common/copy_tensor.cl b/src/core/CL/cl_kernels/common/copy_tensor.cl
new file mode 100644
index 0000000000..753b98d1b0
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/copy_tensor.cl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+/** Performs a copy of the input tensor to the output tensor.
+ *
+ * @note The following variables must be passed at compile time:
+ * -# -DDATA_TYPE : Input and output datatypes.
+ * -# -DVEC_SIZE : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
+ * @param[in] in_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p in_ptr
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void copy_tensor(
+ TENSOR3D_DECLARATION(in),
+ TENSOR3D_DECLARATION(out))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Boundary-aware access:
+ // If there's a leftover in width (VEC_SIZE_LEFTOVER > 0):
+ // Shift all accesses other than the first to avoid accessing out of bounds
+ const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)VEC_SIZE_LEFTOVER, 0) % VEC_SIZE;
+ in.ptr -= shift * in.stride_x;
+ out.ptr -= shift * out.stride_x;
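+
+ // Illustrative example: a row of 10 elements with VEC_SIZE = 4 gives VEC_SIZE_LEFTOVER = 2;
+ // work-item 0 stores elements 0-1 via the partial store below, while work-items 1 and 2
+ // shift back by 2 and store elements 2-5 and 6-9 with full vector stores.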
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+
+ // Boundary-aware store
+ STORE_VECTOR_SELECT(data, DATA_TYPE, (__global DATA_TYPE *)out.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/crop_tensor.cl b/src/core/CL/cl_kernels/common/crop_tensor.cl
index d9090dc838..d9090dc838 100644
--- a/src/core/CL/cl_kernels/crop_tensor.cl
+++ b/src/core/CL/cl_kernels/common/crop_tensor.cl
diff --git a/src/core/CL/cl_kernels/common/deconvolution_layer.cl b/src/core/CL/cl_kernels/common/deconvolution_layer.cl
new file mode 100644
index 0000000000..4ac5e3f0e9
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/deconvolution_layer.cl
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function applies upsampling to an input image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: All.
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void deconvolution_upsample(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Store result
+ *((__global DATA_TYPE *)dst.ptr) = *((__global DATA_TYPE *)src.ptr);
+}
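+
+// Note: each work-item copies exactly one element. The placement of input samples at strided
+// output positions is assumed to come from the window/step configuration set up on the host
+// side, with the gaps between samples pre-initialized (typically to zero) before this kernel
+// is enqueued.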
+
+#if defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+/** This kernel reshapes the deconvolution output tensor before returning the result of the Deconvolution. The deconvolution output tensor
+ * is the result of a @ref CLGEMM operation between the deconvolution input and the deconvolution filter
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g., -DDATA_TYPE=F32
+ * @note The width of the filter should be given as a preprocessor argument using -DFILTER_WIDTH=width, e.g., -DFILTER_WIDTH=2
+ * @note The height of the filter should be given as a preprocessor argument using -DFILTER_HEIGHT=height, e.g., -DFILTER_HEIGHT=2
+ * @note The width of the input should be given as a preprocessor argument using -DSRC_WIDTH=width, e.g., -DSRC_WIDTH=10
+ * @note The height of the input should be given as a preprocessor argument using -DSRC_HEIGHT=height, e.g., -DSRC_HEIGHT=10
+ * @note The output data layout is NHWC if the preprocessor argument NUM_FILTERS is defined, NCHW if NUM_FILTERS is not defined
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] bias_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32/S32
+ * @param[in] bias_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
+ */
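+// Illustrative build options only (hypothetical values, not taken from this patch):
+//   -DDATA_TYPE=float -DFILTER_WIDTH=2 -DFILTER_HEIGHT=2 -DSRC_WIDTH=10 -DSRC_HEIGHT=10 -DADD_BIAS
+// would compile the NCHW variant; additionally passing -DNUM_FILTERS=<N> selects the NHWC output layout.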
+__kernel void deconvolution_reshape(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(ADD_BIAS)
+)
+{
+#define FILTER_AREA ((FILTER_WIDTH) * (FILTER_HEIGHT))
+
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+ const DATA_TYPE data = *(__global DATA_TYPE *)src.ptr;
+
+ // Store result
+ const int x_in = get_global_id(0);
+ const int y_in = get_global_id(1);
+ const int z_in = get_global_id(2);
+
+#if defined(NUM_FILTERS)
+ const int bias_index = x_in / (FILTER_AREA);
+ const int z_out = bias_index + (NUM_FILTERS) * (z_in / (SRC_HEIGHT));
+ const int x_out = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH);
+ const int y_out = (FILTER_HEIGHT) * (z_in % (SRC_HEIGHT)) + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH));
+#else // defined(NUM_FILTERS)
+ const int x_out = x_in / (FILTER_AREA);
+ const int y_out = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH);
+ const int z_out = (FILTER_HEIGHT) * z_in + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH));
+ const int bias_index = x_out;
+#endif // defined(NUM_FILTERS)
+
+#if defined(ADD_BIAS)
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+ const DATA_TYPE bias_val = *(__global DATA_TYPE *)vector_offset(&bias, bias_index);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data + bias_val;
+#else // defined(ADD_BIAS)
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data;
+#endif // defined(ADD_BIAS)
+
+#undef FILTER_AREA
+}
+#endif // defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/common/dequantization_layer.cl b/src/core/CL/cl_kernels/common/dequantization_layer.cl
new file mode 100644
index 0000000000..7fa62577ce
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/dequantization_layer.cl
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
+
+/** This kernel performs the dequantization of 8-bit quantized integers to floating point.
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Quantization scale of input tensor is passed in with -DSCALE=scale.
+ * @note Quantization offset of input tensor is passed in with -DOFFSET=offset.
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
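+// Worked example with hypothetical values: with -DSCALE=0.5f and -DOFFSET=10, a
+// quantized input value of 14 dequantizes to 0.5f * (14 - 10) = 2.0f.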
+__kernel void dequantization_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
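+ // Example with hypothetical values: with VEC_SIZE=16 and LAST_ACCESSED_X=112, a
+ // work-item starting at xi = 120 is shifted back by 8 elements so that its
+ // 16-element access covers [112, 128) and stays within bounds.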
+
+ // Load data
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+ // Create scale and offset vectors
+ const VEC_DATA_TYPE(float, VEC_SIZE)
+ vscale = SCALE;
+
+ const VEC_DATA_TYPE(int, VEC_SIZE)
+ voffset = OFFSET;
+
+ // Dequantize
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE));
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else // !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/elementwise_operation.cl b/src/core/CL/cl_kernels/common/elementwise_operation.cl
new file mode 100644
index 0000000000..91e51d9d1a
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/elementwise_operation.cl
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE)
+
+/** List of all the operations supported by this kernel.
+ * @note ADD and SUB operations, when executed on integers, support saturation */
+#ifdef SATURATE
+#define ADD(x, y) add_sat((x), (y))
+#define SUB(x, y) sub_sat((x), (y))
+#else /* SATURATE */
+#define ADD(x, y) (x) + (y)
+#define SUB(x, y) (x) - (y)
+#endif /* SATURATE */
+
+#define MAX(x, y) max(x, y)
+#define MIN(x, y) min(x, y)
+#define SQUARED_DIFF(x, y) (x - y) * (x - y)
+#define POWER(x, y) pow(x, y)
+
+#if VEC_SIZE_OUT == 1
+#define PRELU(x, y) (x > 0 ? x : x * y)
+#else // VEC_SIZE_OUT == 1
+#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))))
+#endif // VEC_SIZE_OUT == 1
+
+#define DIV(x, y) (x / y)
+
+#define AND(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))1))
+#define OR(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))1))
+
+#define OP_FUN_NAME_STR(op) elementwise_operation_##op
+#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
+
+#if defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+#endif // defined(ACTIVATION_TYPE)
+
+/** This function executes an element-wise operation among two tensors.
+ *
+ * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT.
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE_OUT
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
+ *
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
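+// Illustrative build options (hypothetical values): -DOP=ADD -DDATA_TYPE=float
+// -DVEC_SIZE_IN1=16 -DVEC_SIZE_IN2=16 -DVEC_SIZE_OUT=16 -DVEC_SIZE_LEFTOVER=3
+// would instantiate this kernel as elementwise_operation_ADD.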
+__kernel void OP_FUN_NAME(OP)(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2)
+#if !defined(IN_PLACE)
+ ,
+ TENSOR3D_DECLARATION(out)
+#endif // !defined(IN_PLACE)
+)
+{
+#if VEC_SIZE_IN1 == 1
+ uint in1_x_offs = 0;
+#else // VEC_SIZE_IN1 == 1
+ uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0);
+#endif // VEC_SIZE_IN1 == 1
+#if VEC_SIZE_IN2 == 1
+ uint in2_x_offs = 0;
+#else // VEC_SIZE_IN2 == 1
+ uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0);
+#endif // VEC_SIZE_IN2 == 1
+#if !defined(IN_PLACE)
+ uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+#endif // !defined(IN_PLACE)
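+ // With this offset scheme the leftover elements land on the first work-item
+ // (global id 0): later work-items are shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE
+ // elements so their full vector accesses stay in bounds, while STORE_VECTOR_SELECT
+ // below writes only VEC_SIZE_LEFTOVER elements for work-item 0.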
+
+ // Get pixels pointer
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z;
+ __global uchar *
+#if !defined(IN_PLACE)
+ out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+ out_addr = in1_addr;
+#else //defined(SRC1_IN_PLACE)
+ out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
+
+ // Load values
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
+ in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_addr)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
+ in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_addr)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT));
+
+ // Calculate and store result
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
+ res0 = OP(in_a, in_b);
+#if defined(ACTIVATION_TYPE)
+ res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE_OUT, res0, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE) */
diff --git a/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl
new file mode 100644
index 0000000000..a11be80875
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define SUB(x, y) (x - y)
+#define ADD(x, y) (x + y)
+#define MAX(x, y) max((x), (y))
+#define MIN(x, y) min((x), (y))
+#define SQUARED_DIFF(x, y) (x - y) * (x - y)
+#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(float, VEC_SIZE_OUT))))
+#define DIV(x, y) (x / y)
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+#define OP_FUN_NAME_STR(op) elementwise_operation_##op##_quantized
+#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
+
+#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
+
+/** This function executes an element-wise operation among two tensors.
+ *
+ * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT.
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE_OUT
+ * @note In case of broadcasting along the X dimension the proper preprocessor argument should be passed depending on the input (e.g. -DIS_IN1_X_BROADCASTING, -DIS_IN2_X_BROADCASTING)
+ * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
+ * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
+ * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
+ * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
+ * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
+ * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
+ * @note For QSYMM16 operations OFFSET_IN1, OFFSET_IN2 and OFFSET_OUT must be set to zero
+ * @note The data type must be passed at compile time using -DDATA_TYPE, i.e. -DDATA_TYPE=uchar
+ *
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8/QSYMM16
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p in1_ptr
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
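+// Worked example with hypothetical values: for -DOP=ADD with -DOFFSET_IN1=10 and
+// -DSCALE_IN1=0.5f, an input value of 14 is dequantized to (14 - 10) * 0.5f = 2.0f;
+// the floating-point result of the operation is then requantized below using
+// SCALE_OUT and OFFSET_OUT.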
+__kernel void OP_FUN_NAME(OP)(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2)
+#if !defined(IN_PLACE)
+ ,
+ TENSOR3D_DECLARATION(out)
+#endif // !defined(IN_PLACE)
+)
+{
+#if VEC_SIZE_IN1 == 1
+ uint in1_x_offs = 0;
+#else // VEC_SIZE_IN1 == 1
+ uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0);
+#endif // VEC_SIZE_IN1 == 1
+#if VEC_SIZE_IN2 == 1
+ uint in2_x_offs = 0;
+#else // VEC_SIZE_IN2 == 1
+ uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0);
+#endif // VEC_SIZE_IN2 == 1
+#if !defined(IN_PLACE)
+ uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+#endif // !defined(IN_PLACE)
+
+ // Get pixels pointer
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z;
+ __global uchar *
+#if !defined(IN_PLACE)
+ out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+ out_addr = in1_addr;
+#else //defined(SRC1_IN_PLACE)
+ out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
+
+ VEC_INT in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_addr)), VEC_INT);
+ VEC_INT in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_addr)), VEC_INT);
+
+ in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
+ in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
+
+ const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+ const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+ const VEC_FLOAT qresf32 = OP(in1f32, in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
+ const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
+
+ // Store result
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE) */
diff --git a/src/core/CL/cl_kernels/common/elementwise_unary.cl b/src/core/CL/cl_kernels/common/elementwise_unary.cl
new file mode 100644
index 0000000000..81835108a3
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/elementwise_unary.cl
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(OPERATION)
+
+// Calculate exponential
+#define exp_op(input) exp(input)
+// Calculate reverse square root
+#define rsqrt_op(input) rsqrt(input)
+// Calculate negative
+#define neg_op(input) (-input)
+// Calculate sine
+#define sin_op(input) sin(input)
+// Calculate abs for floating point values
+#define fabs_op(input) fabs(input)
+// Calculate natural_log
+#define natural_log_op(input) log(input)
+// Calculate round using round to nearest even rounding mode
+#define round_op(input) rint(input)
+
+#if defined(VEC_SIZE)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define logical_not_op(input) CONVERT(CONVERT(!input, VEC_TYPE) & ((VEC_TYPE)0x1), VEC_TYPE)
+#else // defined(VEC_SIZE)
+#define logical_not_op(input) ((!input) & 0x1)
+#endif // defined(VEC_SIZE)
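+// Illustrative build options (hypothetical values): -DOPERATION=exp_op -DDATA_TYPE=float
+// -DVEC_SIZE=16 -DLAST_ACCESSED_X=112 selects the vectorized path below, where
+// LAST_ACCESSED_X is expected to be the last X offset at which a full VEC_SIZE access
+// still fits within the tensor width.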
+
+/** Applies an element-wise unary operator to a tensor.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: F16/F32.
+ * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in_step_z in_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: F16/F32.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void elementwise_unary(
+ TENSOR3D_DECLARATION(in),
+ TENSOR3D_DECLARATION(out))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ in.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * in_stride_x;
+ out.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * out_stride_x;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+
+ VSTORE(VEC_SIZE)
+ (OPERATION(data), 0, (__global DATA_TYPE *)out.ptr);
+#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE *)(out.ptr)) = (DATA_TYPE)(OPERATION(*((__global DATA_TYPE *)in.ptr)));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+#endif // defined(DATA_TYPE) && defined(OPERATION)
diff --git a/src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl b/src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl
new file mode 100644
index 0000000000..2e4cdc53fe
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(OPERATION)
+// Calculate reverse square root
+#define rsqrt_op(input) rsqrt(input)
+#if defined(VEC_SIZE)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#endif // defined(VEC_SIZE)
+
+/** Applies an element-wise unary operator to a tensor.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED.
+ * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in_step_z in_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: QASYMM8/QASYMM8_SIGNED.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
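+// Worked example with hypothetical values: with -DOPERATION=rsqrt_op, -DOFFSET_IN=0,
+// -DSCALE_IN=0.25f, -DSCALE_OUT=0.1f and -DOFFSET_OUT=5, an input of 16 dequantizes
+// to 4.0f, rsqrt gives 0.5f, and requantization yields 0.5f / 0.1f + 5 = 10.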
+__kernel void elementwise_unary_quantized(
+ TENSOR3D_DECLARATION(in),
+ TENSOR3D_DECLARATION(out))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ in.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * in_stride_x;
+ out.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * out_stride_x;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ data_f32 = CONVERT(data, VEC_FLOAT);
+ data_f32 = (data_f32 - (float)OFFSET_IN) * (float)SCALE_IN;
+ VEC_INT qres_int = CONVERT_SAT((OPERATION(data_f32) / ((VEC_FLOAT)(float)SCALE_OUT)), VEC_INT) + ((VEC_INT)((int)OFFSET_OUT));
+ const VEC_TYPE qres = CONVERT_SAT(qres_int, VEC_TYPE);
+ VSTORE(VEC_SIZE)
+ (qres, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(OPERATION)
diff --git a/src/core/CL/cl_kernels/common/fft.cl b/src/core/CL/cl_kernels/common/fft.cl
new file mode 100644
index 0000000000..3f26d0f1a6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/fft.cl
@@ -0,0 +1,1880 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE)
+/** Calculates and applies the twiddle factor to a given input.
+ *
+ * @param[in] phi The angle.
+ * @param[in,out] input The input on which the factor should be applied.
+ */
+#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input) \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ w, tmp; \
+ w.x = cos(phi); \
+ w.y = sin(phi); \
+ tmp.x = (w.x * input.x) - (w.y * input.y); \
+ tmp.y = (w.x * input.y) + (w.y * input.x); \
+ input = tmp; \
+ }
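+// The macro above implements multiplication by the twiddle factor e^(i*phi):
+// (w.x + i*w.y) * (input.x + i*input.y), a standard complex multiply.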
+
+/** Computes radix-2 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ */
+#define DFT_2(c0, c1) \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v0; \
+ v0 = c0; \
+ c0 = v0 + c1; \
+ c1 = v0 - c1; \
+ }
+
+// radix-3 butterfly unit factors
+#define SQRT3DIV2 0.86602540378443f
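+// SQRT3DIV2 = sqrt(3)/2 = sin(2*pi/3), used for the imaginary parts in the radix-3 butterfly.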
+
+/** Computes radix-3 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ */
+#define DFT_3(c0, c1, c2) \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v0 = c1 + c2; \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v1 = c1 - c2; \
+ c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \
+ c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \
+ c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \
+ c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \
+ c0 = c0 + v0; \
+ }
+
+/** Computes radix-4 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ */
+#define DFT_4(c0, c1, c2, c3) \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v0, v1, v2, v3; \
+ v0 = c0 + c2; \
+ v1 = c1 + c3; \
+ v2 = c0 - c2; \
+ v3.x = c1.y - c3.y; \
+ v3.y = c3.x - c1.x; \
+ c0 = v0 + v1; \
+ c2 = v0 - v1; \
+ c1 = v2 + v3; \
+ c3 = v2 - v3; \
+ }
+
+// radix-5 butterfly unit factors
+#define W5_A (DATA_TYPE)0.30901699437494f
+#define W5_B (DATA_TYPE)0.95105651629515f
+#define W5_C (DATA_TYPE)0.80901699437494f
+#define W5_D (DATA_TYPE)0.58778525229247f
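+// W5_A = cos(2*pi/5), W5_B = sin(2*pi/5), W5_C = cos(pi/5), W5_D = sin(pi/5):
+// cosines/sines of the fifth-root-of-unity angles used by the radix-5 butterfly.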
+
+/** Computes radix-5 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ */
+#define DFT_5(c0, c1, c2, c3, c4) \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v0, v1, v2, v3, v4; \
+ v0 = c0; \
+ v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \
+ v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \
+ v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \
+ v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \
+ c0 = v0 + c1 + c2 + c3 + c4; \
+ c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \
+ c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v3.y, -v3.x); \
+ c3 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v3.y, v3.x); \
+ c4 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \
+ }
+
+// radix-7 butterfly unit factors
+#define W7_A (DATA_TYPE)0.62348980185873f
+#define W7_B (DATA_TYPE)0.78183148246802f
+#define W7_C (DATA_TYPE)0.22252093395631f
+#define W7_D (DATA_TYPE)0.97492791218182f
+#define W7_E (DATA_TYPE)0.90096886790241f
+#define W7_F (DATA_TYPE)0.43388373911755f
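+// W7_A..W7_F are cosines/sines of the seventh-root-of-unity angles:
+// W7_A = cos(2*pi/7), W7_B = sin(2*pi/7), W7_C = cos(3*pi/7), W7_D = sin(3*pi/7),
+// W7_E = cos(pi/7), W7_F = sin(pi/7).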
+
+/** Computes radix-7 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ * @param[in,out] c5 Complex input 5.
+ * @param[in,out] c6 Complex input 6.
+ */
+#define DFT_7(c0, c1, c2, c3, c4, c5, c6) \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v0, v1, v2, v3, v4, v5, v6; \
+ v0 = c0; \
+ v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \
+ v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \
+ v3 = W7_E * (c1 + c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \
+ v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \
+ v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \
+ v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \
+ c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6; \
+ c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \
+ c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v5.y, -v5.x); \
+ c3 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v6.y, -v6.x); \
+ c4 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v6.y, v6.x); \
+ c5 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v5.y, v5.x); \
+ c6 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \
+ }
+
+/** Computes radix-8 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ * @param[in,out] c5 Complex input 5.
+ * @param[in,out] c6 Complex input 6.
+ * @param[in,out] c7 Complex input 7.
+ */
+#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v0, v1, v2, v3, v4, v5, v6, v7; \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ s0, s1, s2, s3, s4, s5, s6, s7; \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ t0, t1, t2; \
+ v0 = c0 + c4; \
+ v1 = c1 + c5; \
+ v2 = c2 + c6; \
+ v3 = c3 + c7; \
+ v4 = c0 - c4; \
+ v5 = c1 - c5; \
+ v6 = c2 - c6; \
+ v7 = c3 - c7; \
+ s0 = v0 + v2; \
+ s1 = v1 + v3; \
+ s2 = v0 - v2; \
+ s3 = v1 - v3; \
+ s4.x = v4.x - v6.y; \
+ s4.y = v4.y + v6.x; \
+ s5.x = v5.x - v7.y; \
+ s5.y = v5.y + v7.x; \
+ s6.x = v4.x + v6.y; \
+ s6.y = v4.y - v6.x; \
+ s7.x = v5.x + v7.y; \
+ s7.y = v5.y - v7.x; \
+ t0.x = -s3.y; \
+ t0.y = s3.x; \
+ t1.x = M_SQRT1_2_F * (s5.x - s5.y); \
+ t1.y = M_SQRT1_2_F * (s5.x + s5.y); \
+ t2.x = -M_SQRT1_2_F * (s7.x + s7.y); \
+ t2.y = M_SQRT1_2_F * (s7.x - s7.y); \
+ c0 = s0 + s1; \
+ c1 = s6 - t2; \
+ c2 = s2 - t0; \
+ c3 = s4 - t1; \
+ c4 = s0 - s1; \
+ c5 = s6 + t2; \
+ c6 = s2 + t0; \
+ c7 = s4 + t1; \
+ }
+
+/** Computes the first stage of a radix-2 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_2_first_stage_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load two complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ data = vload4(0, (__global DATA_TYPE *)input.ptr);
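+ // Complex values are stored interleaved as (real, imaginary) pairs, so the
+ // vload4 above fetches two consecutive complex numbers along the X axis.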
+
+ // Compute DFT N = 2
+ DFT_2(data.s01, data.s23);
+
+ // Store two complex output values
+ vstore4(data, 0, (__global DATA_TYPE *)output.ptr);
+}
+
+/** Computes the first stage of a radix-2 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_2_first_stage_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load two complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+
+ // Compute DFT N = 2
+ DFT_2(data1, data2);
+
+ // Store two complex output values
+ vstore2(data1, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+}
+
+/** Computes the first stage of a radix-3 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_3_first_stage_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load three complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ data0 = vload4(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2, 0, 0));
+
+ // Compute DFT N = 3
+ DFT_3(data0.s01, data0.s23, data1.s01);
+
+ // Store three complex output values
+ vstore4(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2, 0, 0));
+}
+
+/** Computes the first stage of a radix-3 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_3_first_stage_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load three complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+
+ // Compute DFT N = 3
+ DFT_3(data0, data1, data2);
+
+ // Store three complex output values
+ vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+}
+
+/** Computes the first stage of a radix-4 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_4_first_stage_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load four complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data = vload8(0, (__global DATA_TYPE *)input.ptr);
+
+ // Compute DFT N = 4
+ DFT_4(data.s01, data.s23, data.s45, data.s67);
+
+ // Store four complex output values
+ vstore8(data, 0, (__global DATA_TYPE *)output.ptr);
+}
+
+/** Computes the first stage of a radix-4 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_4_first_stage_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load four complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
+
+ // Compute DFT N = 4
+ DFT_4(data0, data1, data2, data3);
+
+ // Store four complex output values
+ vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+ vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
+}
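+
+// Axis note: on axis 0 the complex inputs of a butterfly are contiguous, so a
+// single wide vload can fetch them all; on axis 1 they sit in consecutive
+// rows, which is why the axis-1 kernels issue one strided vload2 per complex
+// value through tensor3D_offset.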
+
+/** Computes the first stage of a radix-5 DFT on axis 0.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void fft_radix_5_first_stage_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load five complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data0 = vload8(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0));
+
+ // Compute DFT N = 5
+ DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01);
+
+ // Store five complex output values
+ vstore8(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0));
+}
+
+/** Computes the first stage of a radix-5 DFT on axis 1.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void fft_radix_5_first_stage_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load five complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
+
+ // Compute DFT N = 5
+ DFT_5(data0, data1, data2, data3, data4);
+
+ // Store five complex output values
+ vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+ vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
+ vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
+}
+
+/** Computes the first stage of a radix-7 DFT on axis 0.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void fft_radix_7_first_stage_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load seven complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data0 = vload8(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ data1 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6, 0, 0));
+
+ // Compute DFT N = 7
+ DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01);
+
+ // Store seven complex output values
+ vstore8(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore4(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0));
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6, 0, 0));
+}
+
+/** Computes the first stage of a radix-7 DFT on axis 1.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void fft_radix_7_first_stage_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load seven complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
+
+ // Compute DFT N = 7
+ DFT_7(data0, data1, data2, data3, data4, data5, data6);
+
+ // Store seven complex output values
+ vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+ vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
+ vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
+ vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0));
+ vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0));
+}
+
+/** Computes the first stage of a radix-8 DFT on axis 0.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void fft_radix_8_first_stage_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load eight complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)input.ptr);
+
+ // Compute DFT N = 8
+ DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF);
+
+ // Store eight complex output values
+ vstore16(data, 0, (__global DATA_TYPE *)output.ptr);
+}
+
+/** Computes the first stage of a radix-8 DFT on axis 1.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void fft_radix_8_first_stage_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load eight complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7, 0));
+
+ // Compute DFT N = 8
+ DFT_8(data0, data1, data2, data3, data4, data5, data6, data7);
+
+ // Store eight complex output values
+ vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+ vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
+ vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
+ vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0));
+ vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0));
+ vstore2(data7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7, 0));
+}
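+
+// First-stage note: the *_first_stage_* kernels take no Nx/Ni/exp_const
+// arguments because the first stage has butterfly span Nx = 1, so every
+// twiddle angle phi = nx * exp_const is zero and the twiddle factors reduce
+// to 1; each work-item applies a plain N-point DFT to adjacent elements along
+// the FFT axis. The file is expected to be built with the element type and,
+// optionally, the in-place switch, e.g. -DDATA_TYPE=float -DIN_PLACE (a
+// plausible build line; the exact options are chosen by the host-side kernel
+// configuration).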
+
+/** Computes a stage of a radix-2 FFT on axis 0.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny
+ * @param[in] exp_const Exponent constant used to derive the twiddle angle (phi = nx * exp_const)
+ */
+__kernel void fft_radix_2_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-2 butterfly
+ uint kx = get_global_id(0);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load two complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+
+ // Compute phi
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+
+ // Compute DFT N = 2
+ DFT_2(c0, c1);
+
+ // Store two complex output values
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+}
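+
+// Worked example of the indexing above, assuming Ni = Nx * Ny with Ny the
+// radix of this stage (as the parameter comments suggest): with Nx = 4 and
+// Ni = 8, work-item kx = 6 gives nx = 6 % 4 = 2 and n = 2 + (6 / 4) * 8 = 10,
+// so the butterfly pairs the complex elements at x = 10 and x = 10 + Nx = 14.
+// Assuming DFT_2 (defined earlier in this file) implements the standard
+// 2-point butterfly, the kernel computes, with w = exp(i * phi):
+//   c0' = c0 + w * c1
+//   c1' = c0 - w * c1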
+
+/** Computes a stage of a radix-2 FFT on axis 1.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny
+ * @param[in] exp_const Exponent constant used to derive the twiddle angle (phi = nx * exp_const)
+ */
+__kernel void fft_radix_2_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-2 butterfly
+ uint kx = get_global_id(1);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load two complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+
+ // Compute phi
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+
+ // Compute DFT N = 2
+ DFT_2(c0, c1);
+
+ // Store two complex output values
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+}
+
+/** Computes a stage of a radix-3 FFT on axis 0.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny
+ * @param[in] exp_const Exponent constant used to derive the twiddle angle (phi = nx * exp_const)
+ */
+__kernel void fft_radix_3_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-3 butterfly
+ uint kx = get_global_id(0);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load three complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+
+ // Compute phi
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+
+ // Compute DFT N = 3
+ DFT_3(c0, c1, c2);
+
+ // Store three complex output values
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+}
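+
+// For reference, assuming DFT_3 (defined earlier in this file) implements the
+// textbook 3-point DFT, the stage computes, with w = exp(-2 * pi * i / 3) for
+// a forward transform (the sign convention is carried by exp_const):
+//   c0' = c0 + c1 + c2
+//   c1' = c0 + w   * c1 + w^2 * c2
+//   c2' = c0 + w^2 * c1 + w   * c2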
+
+/** Computes a stage of a radix-3 FFT on axis 1.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny
+ * @param[in] exp_const Exponent constant used to derive the twiddle angle (phi = nx * exp_const)
+ */
+__kernel void fft_radix_3_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-3 butterfly
+ uint kx = get_global_id(1);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load three complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+
+ // Compute phi
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+
+ // Compute DFT N = 3
+ DFT_3(c0, c1, c2);
+
+ // Store three complex output values
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+}
+
+/** Computes a stage of a radix-4 FFT on axis 0.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny
+ * @param[in] exp_const Exponent constant used to derive the twiddle angle (phi = nx * exp_const)
+ */
+__kernel void fft_radix_4_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-4 butterfly
+ uint kx = get_global_id(0);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load four complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+
+ // Compute phi
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+
+ // Compute DFT N = 4
+ DFT_4(c0, c1, c2, c3);
+
+ // Store four complex output values
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+}
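+
+// Twiddle note: the k-th butterfly input is rotated through the angle
+// k * phi, i.e. c_k <- c_k * exp(i * k * phi) = c_k * W^(k * nx) with
+// W = exp(i * exp_const); hence the calls above use phi, 2 * phi and 3 * phi
+// for c1, c2 and c3 (the sign convention is carried by exp_const).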
+
+/** Computes a stage of a radix-4 FFT on axis 1.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny
+ * @param[in] exp_const Exponent constant used to derive the twiddle angle (phi = nx * exp_const)
+ */
+__kernel void fft_radix_4_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-4 butterfly
+ uint kx = get_global_id(1);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load four complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+
+ // Compute phi
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+
+ // Compute DFT N = 4
+ DFT_4(c0, c1, c2, c3);
+
+ // Store four complex output values
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+}
+
+/** Computes a stage of a radix-5 FFT on axis 0.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny
+ * @param[in] exp_const Exponent constant used to derive the twiddle angle (phi = nx * exp_const)
+ */
+__kernel void fft_radix_5_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-5 butterfly
+ uint kx = get_global_id(0);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load five complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+
+ // Compute phi
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+ TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+
+ // Compute DFT N = 5
+ DFT_5(c0, c1, c2, c3, c4);
+
+ // Store five complex output values
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+ vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-5 FFT on axis 1.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny
+ * @param[in] exp_const Exponent constant used to derive the twiddle angle (phi = nx * exp_const)
+ */
+__kernel void fft_radix_5_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-5 butterfly
+ uint kx = get_global_id(1);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load five complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+
+ // Compute phi
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+ TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+
+ // Compute DFT N = 5
+ DFT_5(c0, c1, c2, c3, c4);
+
+ // Store five complex output values
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+ vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+}
+
+/** Computes a stage of a radix-7 FFT on axis 0.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny
+ * @param[in] exp_const Exponent constant used to derive the twiddle angle (phi = nx * exp_const)
+ */
+__kernel void fft_radix_7_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-7 butterfly
+ uint kx = get_global_id(0);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load seven complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+
+ // Compute phi
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+ TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+ TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+ TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+
+ // Compute DFT N = 7
+ DFT_7(c0, c1, c2, c3, c4, c5, c6);
+
+ // Store seven complex output values
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+ vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+ vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+ vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-7 FFT on axis 1.
+ *
+ * @note In order to perform the FFT "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny
+ * @param[in] exp_const Exponent constant used to derive the twiddle angle (phi = nx * exp_const)
+ */
+__kernel void fft_radix_7_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-7 butterfly
+ uint kx = get_global_id(1);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load seven complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+
+ // Compute phi
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+ TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+ TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+ TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+
+ // Compute DFT N = 7
+ DFT_7(c0, c1, c2, c3, c4, c5, c6);
+
+ // Store seven complex output values
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+ vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+ vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+ vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+}
+
+/** Computes a stage of a radix-8 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the preprocessor option -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in] Nx The butterfly span: the product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+__kernel void fft_radix_8_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-8
+ uint kx = get_global_id(0);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load eight complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 7 * Nx, 0, 0));
+
+ // Compute phi
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+ TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+ TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+ TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+ TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
+
+ // Compute DFT N = 8
+ DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
+
+ // Store eight complex output values
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+ vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+ vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+ vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+ vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 7 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-8 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the preprocessor option -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in] Nx The butterfly span: the product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+__kernel void fft_radix_8_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-8
+ uint kx = get_global_id(1);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load eight complex input values
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7 * Nx, 0));
+
+ // Compute phi
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+ TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+ TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+ TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+ TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
+
+ // Compute DFT N = 8
+ DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
+
+ // Store eight complex output values
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+ vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+ vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+ vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+ vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7 * Nx, 0));
+}
+#endif // defined(DATA_TYPE) \ No newline at end of file
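All of the radix kernels above share the same per-work-item indexing and twiddle rotation. The plain-C sketch below restates that arithmetic for a complex value stored as consecutive {re, im} floats, matching the kernels' vload2/vstore2 layout; it is illustrative only, and the helper names cfloat, twiddle and first_element are not part of the library.

#include <math.h>

/* Complex value laid out as two consecutive floats, as loaded by vload2. */
typedef struct { float re, im; } cfloat;

/* Effect of TWIDDLE_FACTOR_MULTIPLICATION(phi, c): rotate c by e^(i*phi).
 * The sign of phi comes from exp_const (negative for a forward transform). */
static cfloat twiddle(float phi, cfloat c)
{
    float w_re = cosf(phi), w_im = sinf(phi);
    cfloat r = { c.re * w_re - c.im * w_im, c.re * w_im + c.im * w_re };
    return r;
}

/* Index mapping used by every fft_radix_N_axis_* kernel: work-item kx owns
 * butterfly nx = kx % Nx, its first input lives at n = nx + (kx / Nx) * Ni,
 * and the remaining N-1 inputs are spaced Nx elements apart on the FFT axis. */
static unsigned first_element(unsigned kx, unsigned Nx, unsigned Ni)
{
    return (kx % Nx) + (kx / Nx) * Ni;
}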
diff --git a/src/core/CL/cl_kernels/common/fft_digit_reverse.cl b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl
new file mode 100644
index 0000000000..5f64d95bf9
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+/** Computes the digit reverse stage on axis X
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] idx_ptr Pointer to the index tensor. Supported data types: U32
+ * @param[in] idx_stride_x Stride of the index tensor in X dimension (in bytes)
+ * @param[in] idx_step_x idx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] idx_offset_first_element_in_bytes The offset of the first element in the index tensor
+ */
+__kernel void fft_digit_reverse_axis_0(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(idx))
+{
+ // Get tensor pointers
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Vector idx = CONVERT_TO_VECTOR_STRUCT(idx);
+
+ const unsigned int iidx = *((__global uint *)(idx.ptr));
+
+ // Load data
+#if VEC_SIZE == 1
+ DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+#elif VEC_SIZE == 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+ // Create result
+#if VEC_SIZE == 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ res = { data, 0 };
+#elif VEC_SIZE == 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ res = data;
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+ // Store result
+#if defined(CONJ)
+ vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(res.s0, -res.s1), 0, (__global DATA_TYPE *)dst.ptr);
+#else // defined(CONJ)
+ vstore2(res, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // defined(CONJ)
+}
+
+/** Computes the digit reverse stage on axis Y
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] idx_ptr Pointer to the index tensor. Supported data types: U32
+ * @param[in] idx_stride_x Stride of the index tensor in X dimension (in bytes)
+ * @param[in] idx_step_x idx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] idx_offset_first_element_in_bytes The offset of the first element in the index tensor
+ */
+__kernel void fft_digit_reverse_axis_1(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(idx))
+{
+ // Get tensor pointers
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Vector idx = CONVERT_TO_VECTOR_STRUCT_NO_STEP(idx);
+
+ const unsigned int iidx = *((__global uint *)vector_offset(&idx, (int)(get_global_id(1))));
+
+ // Load data
+#if VEC_SIZE == 1
+ DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+#elif VEC_SIZE == 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+ // Create result
+#if VEC_SIZE == 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ res = { data, 0 };
+#elif VEC_SIZE == 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ res = data;
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+ // Store result
+#if defined(CONJ)
+ vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(res.s0, -res.s1), 0, (__global DATA_TYPE *)dst.ptr);
+#else // defined(CONJ)
+ vstore2(res, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // defined(CONJ)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE) \ No newline at end of file
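The idx vector consumed by both digit-reverse kernels holds a precomputed permutation; its generator is not part of this diff. The C sketch below shows one plausible host-side construction for a length that factors into the radix stages used, under the stated assumption that the library builds its indices in the same spirit.

/* Hypothetical host-side generator for the digit-reversed index vector:
 * for n = radix[0] * radix[1] * ... * radix[num_stages-1], idx[i] is i with
 * its mixed-radix digits reversed. Plain bit reversal is the all-radix-2 case. */
static void digit_reverse_indices(unsigned *idx, const unsigned *radix, unsigned num_stages)
{
    unsigned n = 1;
    for (unsigned s = 0; s < num_stages; ++s)
        n *= radix[s];

    for (unsigned i = 0; i < n; ++i)
    {
        unsigned rem = i, rev = 0;
        for (unsigned s = 0; s < num_stages; ++s)
        {
            rev = rev * radix[s] + rem % radix[s]; /* append next digit   */
            rem /= radix[s];                       /* strip consumed digit */
        }
        idx[i] = rev;
    }
}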
diff --git a/src/core/CL/cl_kernels/common/fft_scale.cl b/src/core/CL/cl_kernels/common/fft_scale.cl
new file mode 100644
index 0000000000..c799dd3b9e
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/fft_scale.cl
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+/** Computes the FFT scale stage
+ *
+ * @note In order to perform the scale stage in-place, the preprocessor option -DIN_PLACE must be passed at compile time
+ * @note The stored value can be conjugated by passing -DCONJ at compile time (only with VEC_SIZE=2)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x (Optional) dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y (Optional) dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z (Optional) dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in] scale Scale to apply to the complex value
+ */
+__kernel void fft_scale_conj(
+ TENSOR3D_DECLARATION(src)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(dst)
+#endif /* not IN_PLACE */
+ ,
+ float scale)
+{
+ // Get tensor pointers
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+#if defined(IN_PLACE)
+ Tensor3D dst = src;
+#else /* IN_PLACE */
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+#endif /* IN_PLACE */
+
+ // Store result
+#if VEC_SIZE == 1
+ *((__global DATA_TYPE *)dst.ptr) = (*(__global DATA_TYPE *)src.ptr) / (DATA_TYPE)scale;
+#elif VEC_SIZE == 2
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data = vload2(0, (__global DATA_TYPE *)src.ptr);
+ data /= (DATA_TYPE)scale;
+#if defined(CONJ)
+ vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(data.s0, -data.s1), 0, (__global DATA_TYPE *)dst.ptr);
+#else // defined(CONJ)
+ vstore2(data, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // defined(CONJ)
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE) \ No newline at end of file
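The scale stage is what completes an inverse transform computed via the conjugation identity ifft(x) = conj(fft(conj(x))) / N, which is presumably why the kernel pairs the division with an optional conjugate; scale would then be the transform length. A minimal C reference for the VEC_SIZE == 2, -DCONJ path (names illustrative):

typedef struct { float re, im; } cfloat;

/* Reference for fft_scale_conj with VEC_SIZE=2 and -DCONJ defined:
 * divide both components by scale, then negate the imaginary part. */
static cfloat scale_conj(cfloat c, float scale)
{
    cfloat r = { c.re / scale, -c.im / scale };
    return r;
}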
diff --git a/src/core/CL/cl_kernels/common/fill_border.cl b/src/core/CL/cl_kernels/common/fill_border.cl
new file mode 100644
index 0000000000..a43343c9f4
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/fill_border.cl
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Fill N pixels of the padding edge of a single-channel image by replicating the closest valid pixel.
+ *
+ * @attention The DATA_TYPE needs to be passed at compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention The border sizes for top, bottom, left and right need to be passed at compile time.
+ * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[in,out] buf_ptr Pointer to the source image. Supported data types: All
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_stride_z Stride between images if batching images (in bytes)
+ * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos XY coordinate indicating the start point of the valid region
+ */
+__kernel void fill_image_borders_replicate(
+ TENSOR3D_DECLARATION(buf),
+ uint width,
+ uint height,
+ int2 start_pos)
+{
+ Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+ // Update pointer to point to the starting point of the valid region
+ buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+ const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+ const int gid0 = get_global_id(0);
+ const int gidH = gid0 - total_width;
+ const int gidW = gid0 - BORDER_SIZE_LEFT;
+
+ if(gidH >= 0)
+ {
+ // Handle left border
+ DATA_TYPE left_val = *(__global DATA_TYPE *)offset(&buf, 0, gidH);
+ for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, i, gidH) = left_val;
+ }
+ // Handle right border
+ DATA_TYPE right_val = *(__global DATA_TYPE *)offset(&buf, width - 1, gidH);
+ for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = right_val;
+ }
+ }
+ else
+ {
+ // Get value for corners
+ int val_idx = gidW;
+ if(gidW < 0 || gidW > (width - 1))
+ {
+ val_idx = gidW < 0 ? 0 : width - 1;
+ }
+
+ // Handle top border
+ DATA_TYPE top_val = *(__global DATA_TYPE *)offset(&buf, val_idx, 0);
+ for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, gidW, i) = top_val;
+ }
+ // Handle bottom border
+ DATA_TYPE bottom_val = *(__global DATA_TYPE *)offset(&buf, val_idx, height - 1);
+ for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = bottom_val;
+ }
+ }
+}
+
+/** Fill N pixels of the padding edge of a single-channel image with a constant value.
+ *
+ * @attention The DATA_TYPE needs to be passed at compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention The border sizes for top, bottom, left and right need to be passed at compile time.
+ * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[out] buf_ptr Pointer to the source image. Supported data types: All
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_stride_z Stride between images if batching images (in bytes)
+ * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos XY coordinate indicating the start point of the valid region
+ * @param[in] constant_value Constant value to use to fill the edges
+ */
+__kernel void fill_image_borders_constant(
+ TENSOR3D_DECLARATION(buf),
+ uint width,
+ uint height,
+ int2 start_pos,
+ DATA_TYPE constant_value)
+{
+ Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+ // Update pointer to point to the starting point of the valid region
+ buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+ const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+ const int gid0 = get_global_id(0);
+ const int gidH = gid0 - total_width;
+ const int gidW = gid0 - BORDER_SIZE_LEFT;
+
+ if(gidH >= 0)
+ {
+ // Handle left border
+ for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, i, gidH) = constant_value;
+ }
+ // Handle right border
+ for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = constant_value;
+ }
+ }
+ else
+ {
+ // Handle top border
+ for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, gidW, i) = constant_value;
+ }
+ // Handle bottom border
+ for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = constant_value;
+ }
+ }
+}
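The one-dimensional dispatch of both border kernels is easy to misread: ids below BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT each fill one padded column of the top and bottom borders (corners included, via the clamped val_idx), and the remaining ids each fill one row of the left and right borders. A host-side C model of the replicate variant, using hypothetical fixed sizes:

#define W  4   /* width of the valid region  */
#define H  3   /* height of the valid region */
#define BL 1   /* BORDER_SIZE_LEFT           */
#define BR 2   /* BORDER_SIZE_RIGHT          */
#define BT 1   /* BORDER_SIZE_TOP            */
#define BB 1   /* BORDER_SIZE_BOTTOM         */

/* Padded image; the valid W x H region starts at row BT, column BL. */
static int img[BT + H + BB][BL + W + BR];

static void fill_replicate(void)
{
    const int total_width = BL + W + BR;
    for (int gid0 = 0; gid0 < total_width + H; ++gid0) /* one pass = whole NDRange */
    {
        const int gidH = gid0 - total_width;
        const int gidW = gid0 - BL;
        if (gidH >= 0) /* one row of the left and right borders */
        {
            for (int i = -BL; i < 0; ++i)
                img[BT + gidH][BL + i] = img[BT + gidH][BL];
            for (int i = 0; i < BR; ++i)
                img[BT + gidH][BL + W + i] = img[BT + gidH][BL + W - 1];
        }
        else /* one padded column of the top and bottom borders */
        {
            const int v = gidW < 0 ? 0 : (gidW > W - 1 ? W - 1 : gidW); /* corner clamp */
            for (int i = -BT; i < 0; ++i)
                img[BT + i][BL + gidW] = img[BT][BL + v];
            for (int i = 0; i < BB; ++i)
                img[BT + H + i][BL + gidW] = img[BT + H - 1][BL + v];
        }
    }
}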
diff --git a/src/core/CL/cl_kernels/floor.cl b/src/core/CL/cl_kernels/common/floor.cl
index f6dd4edd2e..f6dd4edd2e 100644
--- a/src/core/CL/cl_kernels/floor.cl
+++ b/src/core/CL/cl_kernels/common/floor.cl
diff --git a/src/core/CL/cl_kernels/common/gather.cl b/src/core/CL/cl_kernels/common/gather.cl
new file mode 100644
index 0000000000..e16f4bf315
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gather.cl
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS)
+
+/** Performs the Gather operation along the chosen axis
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention The number of valid indices along the gather axis must be given as a preprocessor argument using -DINDEX_LIMIT=size; out-of-range indices yield zeros in the output. e.g. -DINDEX_LIMIT=8
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source tensor
+ * @param[in] indices_ptr Pointer to the indices vector. Supported data types: S32/U32.
+ * @param[in] indices_stride_x Stride of the indices vector in X dimension (in bytes)
+ * @param[in] indices_step_x input_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the indices vector
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination tensor
+ */
+__kernel void gather(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(indices),
+ TENSOR4D_DECLARATION(output))
+{
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+ const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+ const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ const Tensor4D indices = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(indices);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+#if AXIS == 0
+#if INDICES_DIMS == 1
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, px, 0, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, safe_index, py, pz, pw);
+#elif INDICES_DIMS == 2
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, px, py, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, safe_index, pz, pw, 0);
+#elif INDICES_DIMS == 3
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, px, py, pz, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, safe_index, pw, 0, 0);
+#elif INDICES_DIMS == 4
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, px, py, pz, pw);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, safe_index, 0, 0, 0);
+#endif //INDICES_DIMS
+
+#elif AXIS == 1
+#if INDICES_DIMS == 1
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, py, 0, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, safe_index, pz, pw);
+#elif INDICES_DIMS == 2
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, py, pz, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, safe_index, pw, 0);
+#elif INDICES_DIMS == 3
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, py, pz, pw, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, safe_index, 0, 0);
+#endif //INDICES_DIMS
+
+#elif AXIS == 2
+#if INDICES_DIMS == 1
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, pz, 0, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, safe_index, pw);
+#elif INDICES_DIMS == 2
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, pz, pw, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, safe_index, 0);
+#endif //INDICES_DIMS
+
+#elif AXIS == 3
+#if INDICES_DIMS == 1
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, pw, 0, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, safe_index);
+#endif //INDICES_DIMS
+
+#endif //AXIS
+
+ *(__global DATA_TYPE *)output.ptr = select((DATA_TYPE)0, *((__global const DATA_TYPE *)input_addr), (DATA_TYPE)(index < INDEX_LIMIT));
+}
+
+#endif //defined(DATA_TYPE) && defined(AXIS)
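Stripped of the preprocessor dispatch over AXIS and INDICES_DIMS, each work-item performs a clamped load followed by a zeroing select. A scalar C model of the AXIS == 0, INDICES_DIMS == 1 case (parameter names are illustrative, not the library's):

/* out[y][x] = in[y][idx[x]] when idx[x] is in range, else 0.
 * index_limit plays the role of -DINDEX_LIMIT (size of the gathered axis). */
static void gather_axis0(const float *in, unsigned in_w, unsigned in_h,
                         const unsigned *idx, unsigned out_w,
                         float *out, unsigned index_limit)
{
    for (unsigned y = 0; y < in_h; ++y)
    {
        for (unsigned x = 0; x < out_w; ++x)
        {
            unsigned index      = idx[x];
            unsigned safe_index = index < index_limit ? index : 0; /* clamp the read  */
            float    v          = in[y * in_w + safe_index];
            out[y * out_w + x]  = index < index_limit ? v : 0.0f;  /* zero OOB result */
        }
    }
}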
diff --git a/src/core/CL/cl_kernels/common/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl
new file mode 100644
index 0000000000..0c30c0e626
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemm.cl
@@ -0,0 +1,3594 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "repeat.h"
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
+
+#define CONCAT(a, b) a##b
+
+#define ARM_DOT1(a, b, c) \
+ ({ \
+ c = fma(a, b, c); \
+ })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT2(a, b, c); \
+ c = fma((a.s2), (b.s2), c); \
+ })
+#define ARM_DOT4(a, b, c) \
+ ({ \
+ ARM_DOT3(a, b, c); \
+ c = fma((a.s3), (b.s3), c); \
+ })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##8), (c.s8)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##9), (c.s9)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##A), (c.sA)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##B), (c.sB)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##C), (c.sC)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##D), (c.sD)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##E), (c.sE)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is applied after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of rows of the LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ ,
+ const int M,
+ const int N,
+ const int K)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ const bool cond_y = y == 0;
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+ // Compute RHS reshaped matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for(; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS reshaped matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ }
+
+ // Left-over accumulations
+ for(; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS reshaped matrix
+ LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(1, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(1, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(1, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(1, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(1, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(1, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(1, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(1, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T)
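Behind the macro machinery, each work-item of the kernel above accumulates an M0 x N0 block of alpha*A*B with fma chains; the K0-wide ARM_DOTn unrolling and the H0/RHS_INTERLEAVE addressing only change how the same operands are fetched. A scalar C reference, ignoring bias, activation and the reshaped layout (names are illustrative):

#include <math.h>

/* Scalar model of the M0 x N0 block one work-item accumulates. B is stored
 * transposed (the K0xN0 blocks are transposed), hence the B[n * ldb + k] access. */
static void gemm_block_reference(const float *A, const float *B, float *C,
                                 int M0, int N0, int K,
                                 int lda, int ldb, int ldc, float alpha)
{
    for (int m = 0; m < M0; ++m)
    {
        for (int n = 0; n < N0; ++n)
        {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k)
                acc = fmaf(A[m * lda + k], B[n * ldb + k], acc); /* ARM_DOT chain */
            C[m * ldc + n] = alpha * acc;
        }
    }
}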
+
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
+ *       Since a 3D image cannot be created from a buffer, the third dimension may be collapsed into the second one, so RHS_HEIGHT
+ *       can differ from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 4, 8, 16
+ * - K0 = 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note If the activation type was passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
+ __read_only image2d_t rhs_img,
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ ,
+ const int M,
+ const int N,
+ const int K)
+{
+ // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
+
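+    // Number of K elements left over by the K0-wide main loop; they are accumulated
+    // one at a time in the LEFTOVER_K block further down.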
+ const uint LEFTOVER_K = K % K0;
+
+ // Block size
+#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X (PIXEL_UNIT * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X PIXEL_UNIT
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ const bool cond_y = y == 0;
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
+#else // defined(MATRIX_B_DEPTH)
+ const uint z_rhs = get_global_id(2);
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Compute RHS matrix coordinates
+ uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
+ const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
+
+ int i = 0;
+ for(; i <= (K - K0); i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix stored in a cl_image
+ REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+ LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+ }
+
+ if(LEFTOVER_K != 0)
+ {
+        // Note: We cannot read out-of-bound elements from the RHS matrix because
+        // the RHS width is always a multiple of K0. This is not necessarily true for the LHS matrix.
+        // Left-over accumulations for the LHS matrix
+
+ union UNION_VEC_TYPE
+ {
+ DATA_TYPE s[K0];
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ v;
+ };
+
+ union UNION_VEC_TYPE a0 = {.v = 0 };
+#if M0 > 1
+ union UNION_VEC_TYPE a1 = {.v = 0 };
+#endif // M0 > 1
+#if M0 > 2
+ union UNION_VEC_TYPE a2 = {.v = 0 };
+#endif // M0 > 2
+#if M0 > 3
+ union UNION_VEC_TYPE a3 = {.v = 0 };
+#endif // M0 > 3
+#if M0 > 4
+ union UNION_VEC_TYPE a4 = {.v = 0 };
+#endif // M0 > 4
+#if M0 > 5
+ union UNION_VEC_TYPE a5 = {.v = 0 };
+#endif // M0 > 5
+#if M0 > 6
+ union UNION_VEC_TYPE a6 = {.v = 0 };
+#endif // M0 > 6
+#if M0 > 7
+ union UNION_VEC_TYPE a7 = {.v = 0 };
+#endif // M0 > 7
+
+ REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+
+ // Load from RHS matrix
+ LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+ // Load from LHS matrix
+ for(int k = 0; k < LEFTOVER_K; ++k)
+ {
+ a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
+#if M0 > 1
+ a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
+#endif // M0 > 1
+#if M0 > 2
+ a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
+#endif // M0 > 2
+#if M0 > 3
+ a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
+#endif // M0 > 3
+#if M0 > 4
+ a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
+#endif // M0 > 4
+#if M0 > 5
+ a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
+#endif // M0 > 5
+#if M0 > 6
+ a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
+#endif // M0 > 6
+#if M0 > 7
+ a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
+#endif // M0 > 7
+
+ lhs_offset += sizeof(DATA_TYPE);
+ }
+
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0.v, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(K0, a1.v, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(K0, a2.v, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(K0, a3.v, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(K0, a4.v, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(K0, a5.v, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(K0, a6.v, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(K0, a7.v, b, c7);
+#endif // M0 > 7
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,... zout(M0-1)=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
+#undef PIXEL_UNIT
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)
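+
+// Note (illustrative assumption, not taken from this file): for an RGBA floating-point
+// image one pixel packs 4 elements, so CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0) would map
+// K0 = 4, 8, 16 to PIXEL_UNIT = 1, 2, 4; x_rhs and y_rhs in the kernel above are
+// therefore expressed in pixel coordinates rather than element coordinates.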
+
+#define VFMA(a, b, c) \
+ ({ \
+ c = fma(a, b, c); \
+ })
+
+#if M0 == 1
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ })
+#elif M0 == 2 // M0 == 2
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ })
+#elif M0 == 3 // M0 == 3
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ })
+#elif M0 == 4 // M0 == 4
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ })
+#elif M0 == 5 // M0 == 5
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ })
+#elif M0 == 6 // M0 == 6
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ })
+#elif M0 == 7 // M0 == 7
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ })
+#elif M0 == 8 // M0 == 8
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+ })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
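+
+// For reference, with M0 == 2 a call such as VFMA_M0xN0(0, a, b0, c) expands to:
+//   c0 = fma((VEC_DATA_TYPE(DATA_TYPE, N0))(a0.s0), b0, c0);
+//   c1 = fma((VEC_DATA_TYPE(DATA_TYPE, N0))(a1.s0), b0, c1);
+// i.e. element 0 of each LHS row is broadcast to an N0-wide vector and fused
+// multiply-added with the same RHS row.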
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note If the activation type was passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ ,
+ const int M,
+ const int N,
+ const int K)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
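+
+    // Worked example of the addressing set up above, assuming N0 = 4, K0 = 4 and H0 = 2:
+    // RHS_BLOCK_SIZE is 16 elements. With -DRHS_INTERLEAVE the rows of the H0 blocks
+    // alternate, giving RHS_OFFSET_X = 4 and RHS_STEP_X = 8; without it each block is
+    // stored contiguously, giving RHS_OFFSET_X = 16 and RHS_STEP_X = 4.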
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ const bool cond_y = y == 0;
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+ // Compute RHS reshaped matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+
+    // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for(; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0;
+
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(0, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(1, a, b0, c);
+#if K0 > 2
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(2, a, b0, c);
+#endif // K0 > 2
+#if K0 > 3
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(3, a, b0, c);
+#endif // K0 > 3
+#if K0 > 4
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(4, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(5, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(6, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(7, a, b0, c);
+#endif // K0 > 4
+#if K0 > 8
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(8, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(9, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(A, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(B, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(C, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(D, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(E, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(F, a, b0, c);
+#endif // K0 > 8
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
+ }
+
+ // Left-over accumulations
+ for(; i < K; ++i)
+ {
+        // Load values from LHS matrix (each scalar load below is implicitly widened
+        // to a 2-element vector; only component .s0 is consumed by VFMA_M0xN0(0, ...))
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0;
+
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(0, a, b0, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)
+
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
+ *       Since a 3D image cannot be created from a buffer, the third dimension may be collapsed into the second one, so RHS_HEIGHT
+ *       can differ from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 4, 8, 16
+ * - K0 = 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note If the activation type was passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
+ __read_only image2d_t rhs_img,
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ ,
+ const int M,
+ const int N,
+ const int K)
+{
+ // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
+#define RHS_STEP_LOOP 1
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (PIXEL_UNIT)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ const bool cond_y = y == 0;
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ const uint z_rhs = (z % MATRIX_B_DEPTH);
+#else // defined(MATRIX_B_DEPTH)
+ const uint z_rhs = z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Compute RHS matrix coordinates
+ uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
+ const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+
+    // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
+
+ int i = 0;
+ for(; i <= (K - K0); i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0;
+
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(0, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(1, a, b0, c);
+#if K0 > 2
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(2, a, b0, c);
+#endif // K0 > 2
+#if K0 > 3
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(3, a, b0, c);
+#endif // K0 > 3
+#if K0 > 4
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(4, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(5, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(6, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(7, a, b0, c);
+#endif // K0 > 4
+#if K0 > 8
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(8, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(9, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(A, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(B, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(C, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(D, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(E, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(F, a, b0, c);
+#endif // K0 > 8
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
+ }
+
+ // Left-over accumulations
+ for(; i < K; ++i)
+ {
+        // Load values from LHS matrix (each scalar load below is implicitly widened
+        // to a 2-element vector; only component .s0 is consumed by VFMA_M0xN0(0, ...))
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0;
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+
+ VFMA_M0xN0(0, a, b0, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ x_rhs += RHS_STEP_X;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)
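+
+// Illustrative only: the texture variants additionally require image support and the
+// reshaped RHS height at compile time, e.g. (example values, not library defaults):
+//   -DGEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE -DOPENCL_IMAGE_SUPPORT -DRHS_HEIGHT=32
+//   -DDATA_TYPE=float -DM0=4 -DN0=4 -DK0=4 -DH0=2 -DPARTIAL_STORE_M0=1 -DPARTIAL_STORE_N0=1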
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
+
+#if defined(MIXED_PRECISION)
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ c += a.s3 * b.s3; \
+ })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ c += a.s3 * b.s3; \
+ c += a.s4 * b.s4; \
+ c += a.s5 * b.s5; \
+ c += a.s6 * b.s6; \
+ c += a.s7 * b.s7; \
+ })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ c += a.s3 * b.s3; \
+ c += a.s4 * b.s4; \
+ c += a.s5 * b.s5; \
+ c += a.s6 * b.s6; \
+ c += a.s7 * b.s7; \
+ c += a.s8 * b.s8; \
+ c += a.s9 * b.s9; \
+ c += a.sA * b.sA; \
+ c += a.sB * b.sB; \
+ c += a.sC * b.sC; \
+ c += a.sD * b.sD; \
+ c += a.sE * b.sE; \
+ c += a.sF * b.sF; \
+ })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+#else // defined(MIXED_PRECISION)
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ c = fma(a.s4, b.s4, c); \
+ c = fma(a.s5, b.s5, c); \
+ c = fma(a.s6, b.s6, c); \
+ c = fma(a.s7, b.s7, c); \
+ })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ c = fma(a.s4, b.s4, c); \
+ c = fma(a.s5, b.s5, c); \
+ c = fma(a.s6, b.s6, c); \
+ c = fma(a.s7, b.s7, c); \
+ c = fma(a.s8, b.s8, c); \
+ c = fma(a.s9, b.s9, c); \
+ c = fma(a.sA, b.sA, c); \
+ c = fma(a.sB, b.sB, c); \
+ c = fma(a.sC, b.sC, c); \
+ c = fma(a.sD, b.sD, c); \
+ c = fma(a.sE, b.sE, c); \
+ c = fma(a.sF, b.sF, c); \
+ })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+#endif // defined(MIXED_PRECISION)
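+
+// The two ARM_DOT_K0 variants above differ only in how they accumulate: with
+// -DMIXED_PRECISION the products are summed into c, whose type is DATA_TYPE_ACCUMULATOR
+// (e.g. float accumulators for F16 data), while the default path keeps everything in
+// DATA_TYPE and fuses each multiply-add with fma().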
+
+#if defined(ARM_DOT_K0XN0)
+#undef ARM_DOT_K0XN0
+#endif // defined(ARM_DOT_K0XN0)
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ ARM_DOT_K0((a), (b##4), (c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ ARM_DOT_K0((a), (b##4), (c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ ARM_DOT_K0((a), (b##8), (c.s8)); \
+ ARM_DOT_K0((a), (b##9), (c.s9)); \
+ ARM_DOT_K0((a), (b##A), (c.sA)); \
+ ARM_DOT_K0((a), (b##B), (c.sB)); \
+ ARM_DOT_K0((a), (b##C), (c.sC)); \
+ ARM_DOT_K0((a), (b##D), (c.sD)); \
+ ARM_DOT_K0((a), (b##E), (c.sE)); \
+ ARM_DOT_K0((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
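+
+// For reference, with N0 == 2 the call ARM_DOT_K0XN0(a0, b, c0) expands to
+// ARM_DOT_K0(a0, b0, c0.s0) and ARM_DOT_K0(a0, b1, c0.s1): each of the N0 transposed
+// RHS rows contributes one component of the accumulator vector.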
+
+#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the block M0xK0 must NOT be transposed
+ * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 must be transposed
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
+ * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
+ *
+ * @note If the activation type was passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ ,
+ const int M,
+ const int N,
+ const int K)
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
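+    // Worked example (illustrative values): with K0 = 4, N0 = 4 and H0 = 2, each K0xN0
+    // block holds 16 elements. Without RHS_INTERLEAVE, consecutive blocks on one output
+    // row start RHS_BLOCK_SIZE = 16 elements apart and the N0 rows within a block are
+    // K0 = 4 elements apart; with RHS_INTERLEAVE, the rows of the H0 blocks alternate,
+    // so rows are K0 * H0 = 8 elements apart and block b starts at b * K0 = b * 4.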
+
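+    // When the NDRange has been padded with "dummy" work-items, any work-item whose
+    // output tile starts at or beyond the N x M bounds has nothing to compute and
+    // returns immediately.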
+#if defined(DUMMY_WORK_ITEMS)
+ if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
+ (get_global_id(2) * lhs_stride_z);
+
+ // Compute RHS matrix address
+ __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if matrix B has only 3 dimensions and matrix A has more than 3
+ rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += get_global_id(2) * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+ for(int i = 0; i < K; i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+ lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+ const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zout) is calculated by dividing the output row index (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK(M0, c, bias_hp);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
+#else // defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#else // defined(MIXED_PRECISION)
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP
+}
+#endif // defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T)
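+
+/* Illustrative build-option sketch (not part of the library sources): one plausible set of
+ * compile-time definitions selecting the kernel above for an F32 GEMM whose sizes are exact
+ * multiples of the block sizes; every value below is an assumption for illustration only:
+ *
+ *   -DGEMM_MM_RESHAPED_LHS_NT_RHS_T -DDATA_TYPE=float -DDATA_TYPE_ACCUMULATOR=float
+ *   -DM0=4 -DN0=8 -DK0=4 -DV0=2 -DH0=2 -DLHS_INTERLEAVE -DRHS_INTERLEAVE
+ *   -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0
+ */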
+
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image object.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed
+ * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
+ * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M, N and K must be passed at runtime as kernel parameters.
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
+ *       Since a 3D image cannot be created from a buffer, the third dimension may be collapsed into the second one; as a result, RHS_HEIGHT
+ *       can differ from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 4, 8, 16
+ * - K0 = 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *  (HEIGHT_GEMM3D * DEPTH_GEMM3D) must be equal to M, the number of rows of the LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        lhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        lhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
+ __read_only image2d_t rhs_img,
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ ,
+ const int M,
+ const int N,
+ const int K)
+{
+ // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
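+    // Note: the RHS image is read as 4-channel pixels, so PIXEL_UNIT is the number of
+    // pixels covering K0 elements (e.g. K0 = 8 -> PIXEL_UNIT = 2, assuming 4 elements per pixel).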
+
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X (PIXEL_UNIT * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X PIXEL_UNIT
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
+ (get_global_id(2) * lhs_stride_z);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if matrix B has only 3 dimensions and matrix A has more than 3
+ const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
+#else // defined(MATRIX_B_DEPTH)
+ const uint z_rhs = get_global_id(2);
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Compute RHS matrix coordinates
+ uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
+ const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+ for(int i = 0; i < K; i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
+
+ // Load values from RHS matrix stored in a cl_image
+ REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+ LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+ // Accumulate
+ ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+ lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+
+ x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+ const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zout) is calculated by dividing the output row index (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK(M0, c, bias_hp);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
+#else // defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#else // defined(MIXED_PRECISION)
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef PIXEL_UNIT
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE)
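+
+/* Host-side sketch (illustrative, with assumed variable names; not part of this file): how
+ * the RHS image consumed by the *_texture kernels could be created as a zero-copy view over
+ * an existing OpenCL buffer, assuming the device supports image-from-buffer. The chosen
+ * image_height is the value that must be passed as -DRHS_HEIGHT:
+ *
+ *   cl_image_format fmt  = { CL_RGBA, CL_FLOAT };   // 4 elements per pixel
+ *   cl_image_desc   desc = { 0 };
+ *   desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
+ *   desc.image_width     = rhs_row_elements / 4;    // pixels per row
+ *   desc.image_height    = rhs_rows;                // batches collapsed into rows
+ *   desc.image_row_pitch = rhs_stride_y;            // buffer row pitch, in bytes
+ *   desc.mem_object      = rhs_buffer;              // the source cl_mem buffer
+ *   cl_mem rhs_img = clCreateImage(context, CL_MEM_READ_ONLY, &fmt, &desc, NULL, &err);
+ */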
+
+#if defined(LHS_TRANSPOSE)
+
+#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
+
+#if defined(MIXED_PRECISION)
+
+#if(GPU_ARCH == GPU_ARCH_MIDGARD)
+#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#else // defined(MIXED_PRECISION)
+
+#if(GPU_ARCH == GPU_ARCH_MIDGARD)
+#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#endif // defined(MIXED_PRECISION)
+
+#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
+ })
+#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
+ })
+#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
+ })
+#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
+ })
+#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
+ })
+
+// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
+// a is the column-vector (transposed)
+// b is the row-vector (not transposed)
+// C is the output matrix
+// Lower case is a vector (a, b)
+// Upper case is a matrix (C)
+#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
+
+#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
+ })
+
+// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
+// The dimensions of this matrix multiplication are defined through M0, N0 and K0
+// The dimensions supported are:
+// M0: 1, 2, 3, 4, 8
+// N0: 1, 2, 3, 4, 8, 16
+// K0: 1, 2, 3, 4, 8, 16
+// This macro calls the column-vector by row-vector macro (ARM_VVM_T_NT_M0xN0x1) K0 times
+// A, B and C are matrices
+#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
+ CONCAT(ARM_MM_T_NT_M0xN0x, K0) \
+ (M0, N0, TYPE, A, B, C)
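+
+// Illustrative expansion (assuming DATA_TYPE=float, no MIXED_PRECISION, non-Midgard GPU):
+// ARM_MM_T_NT(2, 4, 1, float, a, b, c) resolves to ARM_MM_T_NT_M0xN0x1 and issues one
+// fused multiply-accumulate per element of the LHS column vector a0:
+//   c0 = fma((float4)(a0.s0), b0, c0);
+//   c1 = fma((float4)(a0.s1), b0, c1);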
+
+#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must be transposed
+ * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must NOT be transposed
+ *
+ * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M, N and K must be passed at runtime as kernel parameters.
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *  (HEIGHT_GEMM3D * DEPTH_GEMM3D) must be equal to M, the number of rows of the LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        lhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        lhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        rhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        rhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ ,
+ const int M,
+ const int N,
+ const int K)
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (M0)
+#define LHS_STEP_X ((M0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (M0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#endif // defined(RHS_INTERLEAVE)
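+    // Addressing note: because the LHS is stored transposed, one k-step advances by M0 LHS
+    // elements (LHS_STEP_X) and N0 RHS elements (RHS_STEP_X); when the blocks are interleaved,
+    // the step is additionally multiplied by V0 (LHS) or H0 (RHS), since rows belonging to
+    // different blocks alternate in memory.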
+
+ const uint x = get_global_id(0);
+ const uint y = get_global_id(1);
+ const uint z = get_global_id(2);
+
+ const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+ // Compute RHS matrix address
+ __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if matrix B has only 3 dimensions and matrix A has more than 3
+ rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+ __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
+ __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
+
+ for(int i = 0; i < K; i += K0)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, M0)
+ a0;
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+#if K0 > 1
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 1
+
+#if K0 > 2
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 2
+
+#if K0 > 3
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 3
+
+#if K0 > 4
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 4
+
+#if K0 > 8
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 8
+
+#ifndef LHS_INTERLEAVE
+ lhs += (M0 * K0 * (V0 - 1));
+#endif // LHS_INTERLEAVE
+
+#ifndef RHS_INTERLEAVE
+ rhs += (N0 * K0 * (H0 - 1));
+#endif // RHS_INTERLEAVE
+ }
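+    // Each iteration above performs K0 unrolled rank-1 updates: an M0-element column of the
+    // transposed LHS block times an N0-element row of the RHS block, accumulated into the
+    // M0xN0 tile c through ARM_MM_T_NT(M0, N0, 1, ...).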
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zout) is calculated by dividing the output row index (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK(M0, c, bias_hp);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
+#else // defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#else // defined(MIXED_PRECISION)
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT)
+
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image object.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must be transposed
+ * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must NOT be transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M, N and K must be passed at runtime as kernel parameters.
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
+ *       Since a 3D image cannot be created from a buffer, the third dimension may be collapsed into the second one; as a result, RHS_HEIGHT
+ *       can differ from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 8
+ * - N0 = 4, 8, 16
+ * - K0 = 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *  (HEIGHT_GEMM3D * DEPTH_GEMM3D) must be equal to M, the number of rows of the LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        lhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        lhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
+ __read_only image2d_t rhs_img,
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ ,
+ const int M,
+ const int N,
+ const int K)
+{
+ // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
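+    // Note: here the RHS blocks are not transposed, so each row of a block is N0 elements
+    // wide and PIXEL_UNIT covers N0 elements rather than K0 (assuming 4 elements per pixel).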
+
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (M0)
+#define LHS_STEP_X ((M0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (M0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (PIXEL_UNIT)
+#endif // defined(RHS_INTERLEAVE)
+
+ const uint x = get_global_id(0);
+ const uint y = get_global_id(1);
+ const uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if matrix B has only 3 dimensions and matrix A has more than 3
+ const uint z_rhs = (z % MATRIX_B_DEPTH);
+#else // defined(MATRIX_B_DEPTH)
+ const uint z_rhs = z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Compute RHS matrix coordinates
+ uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
+ const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+ __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
+
+ for(int i = 0; i < K; i += K0)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, M0)
+ a0;
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+
+#if K0 > 1
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+#endif // K0 > 1
+
+#if K0 > 2
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+#endif // K0 > 2
+
+#if K0 > 3
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+#endif // K0 > 3
+
+#if K0 > 4
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+#endif // K0 > 4
+
+#if K0 > 8
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+#endif // K0 > 8
+
+#ifndef LHS_INTERLEAVE
+ lhs += (M0 * K0 * (V0 - 1));
+#endif // LHS_INTERLEAVE
+
+ x_rhs += K0 * RHS_STEP_X;
+#ifndef RHS_INTERLEAVE
+ x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
+#endif // RHS_INTERLEAVE
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+ const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zout) is calculated by dividing the output row index (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK(M0, c, bias_hp);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
+#else // defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#else // defined(MIXED_PRECISION)
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef PIXEL_UNIT
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE)
+
+#endif // defined(LHS_TRANSPOSE)
+
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)
+
+#define VFMA(a, b, c) \
+ ({ \
+ c = fma(a, b, c); \
+ })
+
+#if M0 == 1
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ })
+#elif M0 == 2 // M0 == 2
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ })
+#elif M0 == 3 // M0 == 3
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ })
+#elif M0 == 4 // M0 == 4
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ })
+#elif M0 == 5 // M0 == 5
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ })
+#elif M0 == 6 // M0 == 6
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ })
+#elif M0 == 7 // M0 == 7
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ })
+#elif M0 == 8 // M0 == 8
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+ })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
+
+#if defined(GEMM_MM_NATIVE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS matrix is NOT reshaped
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)
+ * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition.
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z,
+ const int M,
+ const int N,
+ const int K
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if it has 3 dimensions while matrix A has more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
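+
+    // Added commentary: with, for example, 4 batches of matrix A and
+    // -DMATRIX_B_DEPTH=1, (z % 1) == 0 for every work-item, so all batches
+    // read the same 2D matrix B instead of sliding it along Z.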
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+#if K0 > 1
+ for(; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
+
+ RHS_VFMA_M0xN0(0, a, b0, c);
+ RHS_VFMA_M0xN0(1, a, b1, c);
+#if K0 > 2
+ RHS_VFMA_M0xN0(2, a, b2, c);
+#endif // K0 > 2
+#if K0 > 3
+ RHS_VFMA_M0xN0(3, a, b3, c);
+#endif // K0 > 3
+#if K0 > 4
+ RHS_VFMA_M0xN0(4, a, b4, c);
+ RHS_VFMA_M0xN0(5, a, b5, c);
+ RHS_VFMA_M0xN0(6, a, b6, c);
+ RHS_VFMA_M0xN0(7, a, b7, c);
+#endif // K0 > 4
+#if K0 > 8
+ RHS_VFMA_M0xN0(8, a, b8, c);
+ RHS_VFMA_M0xN0(9, a, b9, c);
+ RHS_VFMA_M0xN0(A, a, bA, c);
+ RHS_VFMA_M0xN0(B, a, bB, c);
+ RHS_VFMA_M0xN0(C, a, bC, c);
+ RHS_VFMA_M0xN0(D, a, bD, c);
+ RHS_VFMA_M0xN0(E, a, bE, c);
+ RHS_VFMA_M0xN0(F, a, bF, c);
+#endif // K0 > 8
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += K0 * rhs_stride_y;
+ }
+#endif // K0 > 1
+ // Left-over accumulations
+ for(; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
+#endif // M0 > 7
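+
+        // Added commentary: each a<i> above is declared as a 2-element vector
+        // but assigned from a scalar load; OpenCL C replicates the scalar
+        // across the vector, so the .s0 swizzle used by RHS_VFMA_M0xN0()
+        // remains valid for this single-column case.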
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
+ RHS_VFMA_M0xN0(0, a, b, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += rhs_stride_y;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ const bool cond_y = y == 0;
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+ // Store output block
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
+#endif // defined(GEMM_MM_NATIVE)
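+
+/* Illustrative build-option sketch (an assumption for documentation purposes,
+ * not the library's exact host-side invocation): compiling gemm_mm_native for
+ * an F32 GEMM with a 4x4 output block and K0 = 4 could use defines such as:
+ *
+ *   -DGEMM_MM_NATIVE -DDATA_TYPE=float -DM0=4 -DN0=4 -DK0=4
+ *   -DPARTIAL_STORE_M0=1 -DPARTIAL_STORE_N0=1
+ *
+ * Optional behaviour is enabled the same way, e.g. -DALPHA=0.5f and
+ * -DBETA=1.0f (with -DUNIT_BETA when beta == 1), or -DACTIVATION_TYPE=RELU
+ * together with -DA_VAL= and -DB_VAL=.
+ */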
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)
+
+#if defined(BETA)
+/** This OpenCL kernel performs an in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta.
+ *
+ * @note The beta value needs to be passed at compile time using -DBETA
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Load values from A x B
+ float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
+
+ // Load values from Matrix C
+ float4 c = vload4(0, (__global float *)src.ptr);
+
+ // Computes alpha * axb + beta * c
+ float4 out = alpha_ab + (float4)BETA * c;
+
+ // Store final result in axb matrix
+ vstore4(out, 0, (__global float *)dst.ptr);
+}
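+
+// Worked example (added commentary, illustrative only): with -DBETA=0.25f, a
+// destination element holding alpha_ab = 2.0f and a source element c = 4.0f,
+// the kernel stores out = 2.0f + 0.25f * 4.0f = 3.0f back into the A x B matrix.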
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel performs an in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta.
+ *
+ * @note The beta value needs to be passed at compile time using -DBETA
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Load values from A x B
+ half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
+
+ // Load values from Matrix C
+ half8 c = vload8(0, (__global half *)src.ptr);
+
+ // Computes alpha * axb + beta * c
+ half8 out = alpha_ab + (half8)BETA * c;
+
+ // Store final result in axb matrix
+ vstore8(out, 0, (__global half *)dst.ptr);
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#endif // defined(BETA)
diff --git a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
new file mode 100644
index 0000000000..09b8956b68
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
@@ -0,0 +1,556 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension:
+ *
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is NOT transposed
+ *
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of output columns processed by the cooperative mmul extension must be passed at compile time using -DMMUL_N0 (e.g., -DMMUL_N0=2)
+ * @note The number of output rows processed by the cooperative mmul extension must be passed at compile time using -DMMUL_M0 (e.g., -DMMUL_M0=2)
+ * @note The number of lhs columns (or rhs rows) processed by the cooperative mmul extension must be passed at compile time using -DMMUL_K0 (e.g., -DMMUL_K0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition.
+ *
+ * @param[in] lhs_ptr Pointer to the LHS tensor. Supported data types: F16/F32
+ * @param[in] lhs_stride_y Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in] lhs_w The size of the width dimension of the LHS tensor
+ * @param[in] lhs_h The size of the height dimension of the LHS tensor
+ * @param[in] lhs_n The size of the depth dimension of the LHS tensor
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
+ * @param[in] rhs_ptr Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in] rhs_w The size of the width dimension of the RHS tensor
+ * @param[in] rhs_h The size of the height dimension of the RHS tensor
+ * @param[in] rhs_n The size of the depth dimension of the RHS tensor
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bia_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bia_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bia_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bia_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bia_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M Number of rows in LHS matrix not reshaped
+ * @param[in] N Number of columns in RHS matrix not reshaped
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt_mmul(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#if defined(BETA)
+ TENSOR3D_T(bia, BUFFER),
+#endif // defined(BETA)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_K0)
+
+ uint x0 = get_global_id(0); // (N / N0) * MMUL_K0
+ uint y0 = get_global_id(1); // (M / M0) / MMUL_M0
+ uint z = get_global_id(2); // Batch
+
+ // Get block ID and thread ID within the block
+ uint block_id = (x0 / MMUL_BLOCK_SIZE);
+ uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+
+ // Coordinate within a block
+ uint block_x = thread_id % MMUL_N0;
+ uint block_y = (thread_id / MMUL_M0);
+
+ // Starting destination coordinates
+ uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
+ uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));
+
+ // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+ // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results
+
+ // Starting LHS coordinates
+ uint lhs_x = block_x;
+ uint lhs_y = dst_y;
+
+ // Starting RHS coordinates
+ uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
+ uint rhs_y = block_id;
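+
+    // Worked example (added commentary, assuming -DMMUL_N0=4 -DMMUL_K0=4
+    // -DMMUL_M0=4 -DN0=4 -DM0=4): MMUL_BLOCK_SIZE = 16, so the work-item with
+    // x0 = 21 belongs to block_id = 1 with thread_id = 5, giving
+    // block_x = 5 % 4 = 1 and block_y = 5 / 4 = 1. Before clamping, its output
+    // tile starts at dst_x = 1 * 4 + 1 * 4 * 4 = 20 and, for y0 = 0,
+    // dst_y = 1 * 4 + 0 = 4.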
+
+ // Compute LHS/RHS/DST matrix address
+#ifdef REINTERPRET_INPUT_AS_3D
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y;
+#else // REINTERPRET_INPUT_AS_3D
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+#endif // REINTERPRET_INPUT_AS_3D
+
+#ifdef BATCHED_RHS
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+#else // BATCHED_RHS
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y;
+#endif // BATCHED_RHS
+
+#ifdef REINTERPRET_OUTPUT_AS_3D
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y;
+#else // REINTERPRET_OUTPUT_AS_3D
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+#endif // REINTERPRET_OUTPUT_AS_3D
+
+    // Note: If RHS derives from the weights of a 2D convolution layer, RHS will always be 2D and rhs_stride_z will always be equal to 0, so
+    // the tensor does not slide along Z
+
+ // Initialize the accumulators
+    // The MMUL extension accumulates the result in F32 for both F32 and F16
+ TILE(float, M0, N0, c_f32);
+
+#if !defined(HALF_PRECISION)
+#define c c_f32
+#endif // !defined(HALF_PRECISION)
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_f32[i].v = 0;
+ })
+
+ for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, 1, N0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, 0, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += MMUL_K0 * MMUL_N0 * N0 * sizeof(DATA_TYPE);
+ }
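+
+    // Added commentary: each arm_matrix_multiply() call above is the
+    // cooperative MMUL primitive, so the MMUL_N0 x MMUL_K0 work-items of a
+    // block jointly accumulate one K-step of the M0 x N0 tile; this is why
+    // every work-item of the block must stay alive until this loop completes,
+    // and the out-of-bound checks only happen below.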
+
+ if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
+ {
+ return;
+ }
+
+ if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
+ {
+ return;
+ }
+
+#if defined(HALF_PRECISION)
+ TILE(DATA_TYPE, M0, N0, c);
+
+ // Conversion required for the half precision
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = c_f32[m0].s[n0];
+ })
+ })
+#endif // defined(HALF_PRECISION)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE);
+
+ TILE(DATA_TYPE, 1, N0, bias0);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ bias0[0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+ }
+ else
+ {
+ VLOAD_PARTIAL(N0, N0_LEFTOVER)
+ (bias0[0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+ }
+
+#ifndef UNIT_BETA
+ T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_X(V_ADD, DATA_TYPE, M0, N0, c, bias0, c);
+#else // defined(BROADCAST_BIAS)
+ TILE(DATA_TYPE, M0, N0, bias0);
+
+ bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * bia_stride_y + z * bia_stride_z;
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ bias0[m0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VLOAD_PARTIAL(N0, N0_LEFTOVER)
+ (bias0[m0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+ }
+ })
+ }
+
+#ifndef UNIT_BETA
+ T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+ // Store
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL)
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL_TEXTURE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension and the OpenCL image for RHS:
+ *
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is NOT transposed
+ *
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of output columns processed by the cooperative mmul extension must be passed at compile time using -DMMUL_N0 (e.g., -DMMUL_N0=2)
+ * @note The number of output rows processed by the cooperative mmul extension must be passed at compile time using -DMMUL_M0 (e.g., -DMMUL_M0=2)
+ * @note The number of lhs columns (or rhs rows) processed by the cooperative mmul extension must be passed at compile time using -DMMUL_K0 (e.g., -DMMUL_K0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition.
+ *
+ * @param[in] lhs_ptr Pointer to the LHS tensor. Supported data types: F16/F32
+ * @param[in] lhs_stride_y Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in] lhs_w The size of the width dimension of the LHS tensor
+ * @param[in] lhs_h The size of the height dimension of the LHS tensor
+ * @param[in] lhs_n The size of the depth dimension of the LHS tensor
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
+ * @param[in] rhs_ptr Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in] rhs_w The size of the width dimension of the RHS tensor
+ * @param[in] rhs_h The size of the height dimension of the RHS tensor
+ * @param[in] rhs_n The size of the depth dimension of the RHS tensor
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bia_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bia_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bia_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bia_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bia_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M Number of rows in LHS matrix not reshaped
+ * @param[in] N Number of columns in RHS matrix not reshaped
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, IMAGE),
+#if defined(BETA)
+ TENSOR3D_T(bia, BUFFER),
+#endif // defined(BETA)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_K0)
+
+ uint x0 = get_global_id(0); // (N / N0) * MMUL_K0
+ uint y0 = get_global_id(1); // (M / M0) / MMUL_M0
+ uint z = get_global_id(2); // Batch
+
+ // Get block ID and thread ID within the block
+ uint block_id = (x0 / MMUL_BLOCK_SIZE);
+ uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+
+ // Coordinate within a block
+ uint block_x = thread_id % MMUL_N0;
+ uint block_y = (thread_id / MMUL_M0);
+
+ // Starting destination coordinates
+ uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
+ uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));
+
+ // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+ // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results
+
+ // Starting LHS coordinates
+ uint lhs_x = block_x;
+ uint lhs_y = dst_y;
+
+ // Starting RHS coordinates
+ uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
+
+#ifdef BATCHED_RHS
+ uint rhs_y = block_id + z * rhs_h;
+#else // BATCHED_RHS
+ uint rhs_y = block_id;
+#endif // BATCHED_RHS
+
+ // Compute LHS/RHS/DST matrix address
+#ifdef REINTERPRET_INPUT_AS_3D
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y;
+#else // REINTERPRET_INPUT_AS_3D
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+#endif // REINTERPRET_INPUT_AS_3D
+
+#ifdef REINTERPRET_OUTPUT_AS_3D
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y;
+#else // REINTERPRET_OUTPUT_AS_3D
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+#endif // REINTERPRET_OUTPUT_AS_3D
+
+ // Initialize the accumulators
+    // The MMUL extension accumulates the result in F32 for both F32 and F16
+ TILE(float, M0, N0, c_f32);
+
+#if !defined(HALF_PRECISION)
+#define c c_f32
+#endif // !defined(HALF_PRECISION)
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_f32[i].v = 0;
+ })
+
+ for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, 1, N0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, 1, N0, IMAGE, rhs, rhs_x, rhs_y, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_x += MMUL_K0 * MMUL_N0 * N0;
+ }
+
+ if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
+ {
+ return;
+ }
+
+ if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
+ {
+ return;
+ }
+
+#if defined(HALF_PRECISION)
+ TILE(DATA_TYPE, M0, N0, c);
+
+ // Conversion required for the half precision
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = c_f32[m0].s[n0];
+ })
+ })
+#endif // defined(HALF_PRECISION)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE);
+
+ TILE(DATA_TYPE, 1, N0, bias0);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ bias0[0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+ }
+ else
+ {
+ VLOAD_PARTIAL(N0, N0_LEFTOVER)
+ (bias0[0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+ }
+
+#ifndef UNIT_BETA
+ T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_X(V_ADD, DATA_TYPE, M0, N0, c, bias0, c);
+#else // defined(BROADCAST_BIAS)
+ TILE(DATA_TYPE, M0, N0, bias0);
+
+ bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * bia_stride_y + z * bia_stride_z;
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ bias0[m0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VLOAD_PARTIAL(N0, N0_LEFTOVER)
+ (bias0[m0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+ }
+ })
+ }
+
+#ifndef UNIT_BETA
+ T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+ // Store
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL_TEXTURE)
diff --git a/src/core/CL/cl_kernels/common/gemm_utils.cl b/src/core/CL/cl_kernels/common/gemm_utils.cl
new file mode 100644
index 0000000000..be57d94ce6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemm_utils.cl
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(RESHAPE_LHS_NT)
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix into blocks of size M0xK0 and stores each one (not transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_M0 (e.g. -DPARTIAL_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_K0 (e.g. -DPARTIAL_K0=1)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,5,6,7,8
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  M                                 The size of the height dimension of the source tensor, affected by reinterpret_input_as_3d
+ * @param[in] V0 The number of blocks to place on the same row. It must be greater than 0.
+ */
+__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_T(src, BUFFER),
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int V0)
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+ const int x = GET_SPATIAL_IDX(0, 1, 0); // K
+ const int y = GET_SPATIAL_IDX(1, 1, 0); // M
+ const int z = GET_SPATIAL_IDX(2, 1, 0); // Batch size
+
+ const int xi = x * K0;
+ const int yi = y * M0;
+
+ const int xo = x * BLOCK_SIZE * V0 + (y % V0) * OUTPUT_OFFSET_X;
+ const int yo = (y / V0);
+
+    // src_stride_z is expressed as M * src_stride_y, to handle the case where reinterpret_input_as_3d=true
+ src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
+ dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
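+
+    // Worked example (added commentary, assuming -DM0=4 -DK0=4 -DINTERLEAVE and
+    // V0 = 2): BLOCK_SIZE = 16, OUTPUT_OFFSET_X = K0 = 4 and
+    // OUTPUT_STEP_X = K0 * V0 = 8, so the block at (x = 1, y = 3) is written at
+    // xo = 1 * 16 * 2 + (3 % 2) * 4 = 36 with yo = 3 / 2 = 1, interleaved K0
+    // elements at a time with the block from y = 2.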
+
+ TILE(DATA_TYPE, M0, K0, in);
+
+ // Initialize the input tile to zero
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ in[_i].v = 0;
+ });
+
+ bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
+ bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
+ // Load input tile
+ TILE(uint, M0, 1, in_indirect_y);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ in_indirect_y[_i].v = _i;
+
+ });
+#if PARTIAL_M0 != 0
+ if(y_cond)
+ {
+ T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+ }
+ else
+#endif // PARTIAL_M0 != 0
+ {
+ T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+ }
+
+ // Store output tile
+ TILE(uint, M0, 1, dst_indirect_y);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ dst_indirect_y[_i].v = _i;
+ });
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(RESHAPE_LHS_NT)
+
+#if defined(RESHAPE_LHS_T)
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix into blocks of size M0xK0 and stores each one (transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_M0 (e.g. -DPARTIAL_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_K0 (e.g. -DPARTIAL_K0=1)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,8,16
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  M                                 The size of the height dimension of the source tensor, affected by reinterpret_input_as_3d
+ * @param[in] V0 The number of blocks to place on the same row. It must be greater than 0
+ */
+__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_T(src, BUFFER),
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int V0)
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (M0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (M0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (M0)
+#endif // defined(INTERLEAVE)
+
+ const int x = GET_SPATIAL_IDX(0, 1, 0); // K
+ const int y = GET_SPATIAL_IDX(1, 1, 0); // M
+ const int z = GET_SPATIAL_IDX(2, 1, 0); // Batch size
+
+ const int xi = x * K0;
+ const int yi = y * M0;
+
+ const int xo = x * BLOCK_SIZE * V0 + ((y % V0) * OUTPUT_OFFSET_X);
+ const int yo = (y / V0);
+
+    // src_stride_z is expressed as M * src_stride_y, to handle the case where reinterpret_input_as_3d=true
+ src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
+ dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+ TILE(DATA_TYPE, M0, K0, in);
+ TILE(DATA_TYPE, K0, M0, in_tr);
+
+ // Initialize the tile to zero
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ in[_i].v = 0;
+ });
+
+ // Load input tile
+ bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
+ bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
+
+ TILE(uint, M0, 1, in_indirect_y);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ in_indirect_y[_i].v = _i;
+
+ });
+#if PARTIAL_M0 != 0
+ if(y_cond)
+ {
+ T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+ }
+ else
+#endif // PARTIAL_M0 != 0
+ {
+ T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+ }
+ // Transpose input tile
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, k0, 0, 1, K0,
+ {
+ in_tr[k0].s[m0] = in[m0].s[k0];
+ })
+ });
+
+ TILE(uint, K0, 1, dst_indirect_y);
+ LOOP_UNROLLING(int, _i, 0, 1, K0,
+ {
+ dst_indirect_y[_i].v = _i;
+ });
+
+ // Store output tile
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, M0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(RESHAPE_LHS_T)
+
+#if defined(RESHAPE_RHS_NT)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix into blocks of size K0xN0 and stores each one (not transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 1,2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] H0 The number of blocks to place on the same row. It must be greater than 0
+ */
+__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_T(src, BUFFER),
+ TENSOR3D_T(dst, BUFFER),
+ const int H0)
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (N0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (N0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (N0)
+#endif // defined(INTERLEAVE)
+
+ const int x = GET_SPATIAL_IDX(0, 1, 0);
+ const int y = GET_SPATIAL_IDX(1, 1, 0);
+ const int z = GET_SPATIAL_IDX(2, 1, 0);
+
+ const int xi = x * N0;
+ const int yi = y * K0;
+
+ const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
+ const int yo = (x / H0);
+
+ src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
+ dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
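+
+    // Worked example (added commentary, assuming -DK0=4 -DN0=4 without
+    // -DINTERLEAVE and H0 = 2): BLOCK_SIZE = 16, OUTPUT_OFFSET_X = 16 and
+    // OUTPUT_STEP_X = N0 = 4, so the K0xN0 block at (x = 3, y = 1) lands at
+    // xo = 1 * 16 * 2 + (3 % 2) * 16 = 48 with yo = 3 / 2 = 1, i.e. H0 = 2
+    // blocks stored back to back on each destination row.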
+
+ TILE(DATA_TYPE, K0, N0, in);
+
+ // Initialize the tile to zero
+ for(int i = 0; i < K0; ++i)
+ {
+ in[i].v = 0;
+ }
+
+ // Load input tile
+ for(int i = 0; i < K0; ++i)
+ {
+ if(yi + i < src_h)
+ {
+ in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
+ }
+ }
+
+ TILE(uint, K0, 1, dst_indirect_y);
+ for(int i = 0; i < K0; ++i)
+ {
+ dst_indirect_y[i].v = i;
+ }
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, N0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(RESHAPE_RHS_NT)
+
+#if defined(RESHAPE_RHS_T)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix into blocks of size K0xN0 and stores each one (transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ * @note The option -DTRANSPOSE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] H0 The number of blocks to place on the same row. It must be greater than 0.
+ */
+__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_T(src, BUFFER),
+ TENSOR3D_T(dst, BUFFER),
+ const int H0)
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X ((K0) * (H0))
+#else // defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+ const int x = GET_SPATIAL_IDX(0, 1, 0);
+ const int y = GET_SPATIAL_IDX(1, 1, 0);
+ const int z = GET_SPATIAL_IDX(2, 1, 0);
+
+ const int xi = x * N0;
+ const int yi = y * K0;
+
+ const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
+ const int yo = (x / H0);
+
+ src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
+ dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+ TILE(DATA_TYPE, K0, N0, in);
+ TILE(DATA_TYPE, N0, K0, in_tr);
+
+ // Initialize the tile to zero
+ for(int i = 0; i < K0; ++i)
+ {
+ in[i].v = 0;
+ }
+
+ // Load input tile
+ for(int i = 0; i < K0; ++i)
+ {
+ if(yi + i < src_h)
+ {
+ in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
+ }
+ }
+
+ // Transpose input tile
+ for(int k0 = 0; k0 < K0; ++k0)
+ {
+ for(int n0 = 0; n0 < N0; ++n0)
+ {
+ in_tr[n0].s[k0] = in[k0].s[n0];
+ }
+ }
+
+ TILE(uint, N0, 1, dst_indirect_y);
+ for(int i = 0; i < N0; ++i)
+ {
+ dst_indirect_y[i].v = i;
+ }
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, N0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
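+// Example (illustrative values, assuming -DK0=4 -DN0=4 -DH0=2 and -DINTERLEAVE):
+// OUTPUT_OFFSET_X = 4 and OUTPUT_STEP_X = 8, so the transposed block at
+// (x, y) = (3, 1) is written to destination row yo = 3 / 2 = 1 starting at column
+// xo = 1 * 16 * 2 + (3 % 2) * 4 = 36, with its N0 columns spaced K0 * H0 = 8
+// elements apart, i.e. interleaved with the block produced by x = 2.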
+
+#endif // defined(RESHAPE_RHS_T)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/gemmlowp.cl b/src/core/CL/cl_kernels/common/gemmlowp.cl
new file mode 100644
index 0000000000..62c4cd31f5
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemmlowp.cl
@@ -0,0 +1,2162 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "helpers_asymm.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
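+// For reference (per the cl_arm_integer_dot_product extensions): arm_dot(a, b)
+// computes a.s0 * b.s0 + a.s1 * b.s1 + a.s2 * b.s2 + a.s3 * b.s3 in a single
+// instruction, while arm_dot_acc(a, b, val) also adds the accumulator "val",
+// which is why the two variants map onto ARM_DOT differently above.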
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#define ARM_DOT1(a, b, c) \
+ ({ \
+ ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \
+ })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \
+ })
+#define ARM_DOT4(a, b, c) \
+ ({ \
+ ARM_DOT(a, b, c); \
+ })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
+
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16] without using the dot8 instruction. */
+#define ARM_DOT1(a, b, c) \
+ ({ \
+ c += (ACC_DATA_TYPE)a * b; \
+ })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ c += (ACC_DATA_TYPE)a.s0 * b.s0; \
+ c += (ACC_DATA_TYPE)a.s1 * b.s1; \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT2(a, b, c); \
+ c += (ACC_DATA_TYPE)a.s2 * b.s2; \
+ })
+#define ARM_DOT4(a, b, c) \
+ ({ \
+ ARM_DOT3(a, b, c); \
+ c += (ACC_DATA_TYPE)a.s3 * b.s3; \
+ })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
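+// Example (illustrative, assuming -DDATA_TYPE=uchar -DACC_DATA_TYPE=uint): without
+// the dot8 extension, ARM_DOT4(a, b, c) expands to four scalar multiply-accumulates:
+//   c += (uint)a.s0 * b.s0; c += (uint)a.s1 * b.s1;
+//   c += (uint)a.s2 * b.s2; c += (uint)a.s3 * b.s3;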
+
+/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */
+#define ARM_DOT_K0X1(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0(k0, (a), (b##0), (c)); \
+ })
+#define ARM_DOT_K0X2(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
+ ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
+ })
+#define ARM_DOT_K0X3(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X2(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
+ })
+#define ARM_DOT_K0X4(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X3(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
+ })
+#define ARM_DOT_K0X8(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X4(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
+ ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
+ ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
+ ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
+ })
+#define ARM_DOT_K0X16(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X8(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
+ ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
+ ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
+ ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
+ ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
+ ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
+ ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
+ ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
+ })
+
+/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_K0XN0X1(n0, k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
+ })
+#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X1(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
+ })
+#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X2(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
+ })
+#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X3(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
+ })
+#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X4(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
+ })
+#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X5(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
+ })
+#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X6(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
+ })
+#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X7(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
+ })
+
+#define ARM_DOT_K0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b), (c)); \
+ })
+
+#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT_K0X, n0) \
+ (k0, (a), b, (c)); \
+ })
+
+#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MM_K0XN0X, m0) \
+ (n0, k0, a, b, c); \
+ })
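+// Example (illustrative values): ARM_MM_K0XN0XM0(4, 8, 16, a, b, c) concatenates to
+// ARM_MM_K0XN0X4(8, 16, a, b, c), which issues one ARM_DOT_K0XN0 per row a0..a3,
+// i.e. a 4x8 output block where every element accumulates a length-16 dot product.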
+
+/** Specialized macros to perform a broadcast multiply-accumulate between the components of one vector "a" and K0 vectors "b" of width N0, with K0 in [1,16] */
+#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; \
+ })
+#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \
+ c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \
+ })
+#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \
+ })
+#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \
+ })
+#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \
+ c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \
+ c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \
+ c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \
+ })
+#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \
+ c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \
+ c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \
+ c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \
+ c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \
+ c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \
+ c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \
+ c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \
+ })
+/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \
+ })
+#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MUL_N0X, k0) \
+ (VECTOR_ACC_TYPE, (a), b, (c)); \
+ })
+#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \
+ (VECTOR_ACC_TYPE, k0, a, b, c); \
+ })
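+// Example (illustrative values): ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(uint, 4), 2, 4, a, b, c)
+// expands to ARM_MUL_N0X4 for the two rows a0 and a1, accumulating
+// c0 += CONVERT(b0) * a0.s0 + ... + CONVERT(b3) * a0.s3 (and likewise c1 from a1),
+// so the RHS rows are consumed directly without being transposed first.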
+
+#if defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM8/QASYMM8_SIGNED data type.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed
+ * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst),
+ uint k,
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global DATA_TYPE *lhs_addr = (__global DATA_TYPE *)(lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z));
+
+ // Compute RHS matrix address
+ __global DATA_TYPE *rhs_addr = (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y);
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has only 3 dimensions while matrix A has more than 3
+ rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ for(int i = 0; i < k; i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+ // Update address
+ lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
+ rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated by dividing the row index (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Convert and store output block
+ const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+ // Store output block
+ REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp);
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
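+// Host-side sketch (hypothetical option string, values shown for illustration only,
+// consistent with the @note requirements above): this kernel would be built with
+// options such as
+//   -DGEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T -DDATA_TYPE=uchar -DACC_DATA_TYPE=uint
+//   -DM=52 -DN=90 -DM0=4 -DN0=8 -DK0=4 -DV0=2 -DH0=2
+//   -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=2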
+#endif // defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T)
+
+#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
+#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
+#define FUSED_OUTPUT_STAGE_FIXED_POINT
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
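+// For illustration (hypothetical requantization parameters): passing e.g.
+// -DRESULT_OFFSET=2 -DRESULT_MULTIPLIER=1073741824 -DRESULT_SHIFT=8 selects the
+// fused fixed-point variant below, while leaving them undefined selects the plain
+// kernel that stores the raw S32 accumulators.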
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with fused output stage using fixed-point arithmetic.
+ * The LHS matrix is NOT reshaped
+ * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @note The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ * These values can be used to implement "rectified linear unit" activation functions
+ * @note In case of per-channel quantization of matrix B, -DPER_CHANNEL_QUANTIZATION must be passed at compile time.
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: S32
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: S32
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: S32
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in] result_multipliers_step_x (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
+ * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector
+ */
+#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT)
+__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint
+#elif defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T) // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT)
+__kernel void gemmlowp_mm_reshaped_only_rhs_t
+#endif // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
+(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+#if defined(PER_CHANNEL_QUANTIZATION)
+ ,
+ VECTOR_DECLARATION(result_multipliers),
+ VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+)
+{
+ // @note: replace with (DIMENSION + PAD) once we pass the relevant info at compile time
+#define FULL_LHS_HEIGHT (lhs_stride_z / lhs_stride_y)
+#define FULL_DST_HEIGHT (dst_stride_z / dst_stride_y)
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X (K0 * H0)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0 * N0)
+#define RHS_STEP_X (K0)
+#endif // defined(RHS_INTERLEAVE)
+#define RHS_STEP_LOOP (N0 * K0 * H0)
+
+ uint x = GET_SPATIAL_IDX(0, 1, 1);
+ uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ uint z = GET_SPATIAL_IDX(2, 1, 1);
+ int xo = (x * N0);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((xo >= N) || (y >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_y = y + z * FULL_LHS_HEIGHT;
+
+ // Compute RHS matrix address
+ uint rhs_offset_x = (x % H0) * RHS_OFFSET_X;
+ uint rhs_offset_y = (x / H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has only 3 dimensions while matrix A has more than 3
+ rhs_offset_y += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset_y += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize the accumulators
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+ int i = 0;
+ for(; i <= (K - K0); i += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ // Load values from LHS matrix
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);
+
+ // Load values from RHS matrix
+ LOOP_UNROLLING(int, _i, 0, 1, N0,
+ {
+ b[_i].v = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X));
+ })
+
+ // Partial matrix multiplication M0,N0,K0
+ T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+ rhs_offset_x += RHS_STEP_LOOP;
+ }
+
+#if((K % K0) != 0)
+
+ // Left-over accumulations
+ for(; i < K; ++i)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ // Load values from LHS matrix
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);
+
+ LOOP_UNROLLING(int, _i, 0, 1, N0,
+ {
+ b[_i].v = *(__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X);
+ })
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+
+ rhs_offset_x += 1;
+ }
+#endif // ((K % K0) != 0)
+
+#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+
+ TILE(int, M0, N0, c_int);
+ TILE(int, M0, N0, offset_s32);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_int[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
+ })
+
+#if defined(A_OFFSET)
+
+#if defined(SUM_COL_HAS_BATCHES)
+ int sum_col_y = z;
+#else // defined(SUM_COL_HAS_BATCHES)
+ int sum_col_y = 0;
+#endif // defined(SUM_COL_HAS_BATCHES)
+ TILE(int, 1, N0, a_offset_s32);
+
+ T_LOAD(int, 1, N0, BUFFER, sum_col, xo, sum_col_y, 1, sum_col_stride_y, a_offset_s32);
+
+ a_offset_s32[0].v *= A_OFFSET;
+
+ T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+ // Compute the offset contribution due to B_OFFSET
+ // Note: The sum_row tensor is generated through CLGEMMLowpMatrixAReductionKernel which
+ // does not introduce paddings. For this reason it is safe to access the tensor in this manner
+ // without considering that the coordinate "y" could come from an input 3D tensor
+ TILE(int, M0, N0, b_offset_s32);
+
+ T_LOAD(int, M0, 1, BUFFER, sum_row, y + z * (sum_row_stride_y / sizeof(int)), 0, 1, sum_row_stride_x, b_offset_s32);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ offset_s32[i].v += b_offset_s32[i].v * B_OFFSET;
+ })
+
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+
+ TILE(int, 1, N0, bias);
+
+ T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias);
+
+ T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32);
+#endif // defined(ADD_BIAS)
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_int[i].v += offset_s32[i].v;
+ })
+
+ TILE(DATA_TYPE, M0, N0, c_lp);
+
+ // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+ TILE(int, 1, N0, res_mul);
+ TILE(int, 1, N0, res_shift);
+
+ T_LOAD(int, 1, N0, BUFFER, result_multipliers, xo, 0, 0, 0, res_mul);
+ T_LOAD(int, 1, N0, BUFFER, result_shifts, xo, 0, 0, 0, res_shift);
+
+ T_QUANTIZE8(int, DATA_TYPE, PER_CHANNEL, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, res_mul, res_shift, c_lp);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+ T_QUANTIZE8(int, DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, 0, 0, c_lp);
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+#if defined(MIN_BOUND)
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MIN_BOUND);
+ })
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MAX_BOUND);
+ })
+#endif // defined(MAX_BOUND)
+
+#else // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+ TILE(int, M0, N0, c_lp);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_lp[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
+ })
+#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ dst_indirect_y[i].v = (uint)min((int)((y + i) % HEIGHT_GEMM3D), (int)HEIGHT_GEMM3D - 1);
+ dst_indirect_y[i].v += (uint)min((int)((y + i) / HEIGHT_GEMM3D), (int)DEPTH_GEMM3D - 1) * FULL_DST_HEIGHT;
+ dst_indirect_y[i].v += z * FULL_DST_HEIGHT * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ dst_indirect_y[i].v = (uint)min((int)y + i, (int)M - 1) + z * FULL_DST_HEIGHT;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+ })
+
+ const bool cond_x = (xo > (N - N0)) & (PARTIAL_STORE_N0 != 0);
+
+#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
+#else // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+ T_STORE_INDIRECT_WIDTH_SELECT(int, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
+#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
+}
+#endif // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
+
+#if defined(GEMMLOWP_MM_NATIVE)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS matrix is NOT reshaped
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2)
+ * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e., -DK0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: QASYMM8
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has only 3 dimensions while matrix A has more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated by dividing the row index (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+
+ for(; i <= (K - K0); i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+#if(GPU_ARCH == GPU_ARCH_MIDGARD)
+ ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ // Transpose the values from RHS matrix
+ TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE);
+
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ // Update the offset
+ lhs_offset += K0;
+ rhs_offset += K0 * rhs_stride_y;
+ }
+
+ // Left-over for loop
+ for(; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+
+ // Partial matrix multiplication M0,N0,1
+#if(GPU_ARCH == GPU_ARCH_MIDGARD)
+ ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ // Transpose the values from RHS matrix
+ TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE);
+
+ ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ // Update the offset
+ lhs_offset += 1;
+ rhs_offset += rhs_stride_y;
+ }
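+ // Example (illustrative): with K = 10 and K0 = 4, the main loop runs for
+ // i = 0 and i = 4 (two full K0 blocks) and the leftover loop above handles the
+ // remaining columns i = 8 and i = 9 one element at a time.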
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated by dividing the row index (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+ const bool cond_y = y == 0;
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+ // Convert and store output block
+ REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res); // resN = CONVERT(cN, VEC_DATA_TYPE(int, N0));
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
+#endif // defined(GEMMLOWP_MM_NATIVE)
+
+#if defined(GEMMLOWP_MATRIX_A_REDUCTION)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ * It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+ sum_row_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0;
+ ACC_DATA_TYPE sum_row = 0;
+
+ __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
+
+ int i = 0;
+
+ // This for loop performs 16 accumulations
+ for(; i <= ((int)COLS_A - 16); i += 16)
+ {
+ const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i);
+
+ sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.sCDEF,
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
+ }
+
+ // This for loop performs the leftover accumulations
+ for(; i < COLS_A; ++i)
+ {
+ sum_row += (ACC_DATA_TYPE)matrix_a[i];
+ }
+
+ sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3;
+
+#if defined(SCALAR)
+ sum_row *= (int)SCALAR;
+#endif // defined(SCALAR)
+ *((__global int *)dst.ptr) = (int)sum_row;
+}
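+// Example (illustrative): with -DCOLS_A=20, the vectorized loop above consumes
+// columns 0..15 with a single vload16 and the scalar loop adds columns 16..19,
+// after which the four partial sums in sum_row_32 are folded into sum_row.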
+#endif // defined(GEMMLOWP_MATRIX_A_REDUCTION)
+
+#if defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction.
+ * It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ ACC_DATA_TYPE sum_row = 0;
+
+ __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
+
+ int i = 0;
+
+ // This for loop performs 32 accumulations
+ for(; i <= ((int)COLS_A - 32); i += 32)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ a0 = vload16(0, matrix_a + i);
+
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+
+ a0 = vload16(1, matrix_a + i);
+
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+ }
+
+ // This for loop performs the leftover accumulations
+ for(; i < COLS_A; ++i)
+ {
+ sum_row += (ACC_DATA_TYPE)matrix_a[i];
+ }
+
+#if defined(SCALAR)
+ sum_row *= (int)SCALAR;
+#endif // defined(SCALAR)
+ *((__global int *)dst.ptr) = (int)sum_row;
+}
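+// Note: dotting each 4-element chunk with a vector of ones collapses four scalar
+// additions into one dot product instruction, so every vload16 above is reduced
+// with four DOT_PRODUCT4_INTEGER8 calls instead of sixteen scalar accumulations.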
+#endif // defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8)
+
+#if defined(GEMMLOWP_MATRIX_B_REDUCTION)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ * It is also possible to multiply each reduced column by a scalar value, if SCALAR is passed at compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e. -DSCALAR=3)
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ const uint y = get_global_id(1);
+
+ __global const DATA_TYPE *matrix_b = (__global const DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + y * src_step_y + y * src_stride_z);
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + y * dst_stride_y;
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))0;
+
+ int i = 0;
+ // This for loop performs 4 accumulations
+ for(; i <= ((int)ROWS_B - 4); i += 4)
+ {
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ b0 = VLOAD(VEC_SIZE)(0, matrix_b + 0 * src_stride_y);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ b1 = VLOAD(VEC_SIZE)(0, matrix_b + 1 * src_stride_y);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ b2 = VLOAD(VEC_SIZE)(0, matrix_b + 2 * src_stride_y);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ b3 = VLOAD(VEC_SIZE)(0, matrix_b + 3 * src_stride_y);
+
+ sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b3,
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+
+ matrix_b += 4 * src_stride_y;
+ }
+
+ // This for loop performs the leftover accumulations
+ for(; i < (int)ROWS_B; ++i)
+ {
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ b0 = VLOAD(VEC_SIZE)(0, matrix_b);
+
+ sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+
+ matrix_b += src_stride_y;
+ }
+
+#if defined(SCALAR)
+ sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))SCALAR;
+#endif // defined(SCALAR)
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ res0 = CONVERT(sum_col_32, VEC_DATA_TYPE(int, VEC_SIZE));
+
+ STORE_VECTOR_SELECT(res, int, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
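+
+/* Illustrative scalar sketch (comment only, not compiled) of what the kernel above
+ * computes for one batch, assuming a plain row-major matrix B:
+ *
+ *   ACC_DATA_TYPE sum = 0;
+ *   for(int r = 0; r < ROWS_B; ++r)
+ *   {
+ *       sum += (ACC_DATA_TYPE)B[r][x]; // accumulate column x over all rows of B
+ *   }
+ *   dst[x] = (int)(sum * SCALAR);      // the SCALAR multiply only happens when -DSCALAR is defined
+ */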
+#endif // defined(GEMMLOWP_MATRIX_B_REDUCTION)
+
+#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
+
+#if defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+
+/* Helper function used to calculate the offset contribution after matrix multiplication.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
+ * and calculates the offset contribution of matrix A and matrix B.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
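+ * For example, with the hypothetical values a_offset = 1, b_offset = 6 and k = 200, k_offset = 1 * 6 * 200 = 1200 (i.e. -DK_OFFSET=1200)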
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] x Global X coordinate, expected to be max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0)
+ * @param[in] y Global Y coordinate, expected to be get_global_id(1)
+ * @param[in] z Global Z coordinate, expected to be get_global_id(2)
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: S32
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ */
+inline VEC_INT offset_contribution(
+ int x,
+ int y,
+ int z
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+)
+{
+ VEC_INT a_offset_s32 = (VEC_INT)0;
+ VEC_INT b_offset_s32 = (VEC_INT)0;
+
+ int batch_id = z;
+#if defined(DEPTH_INPUT3D)
+ batch_id /= (int)DEPTH_INPUT3D;
+#endif // defined(DEPTH_INPUT3D)
+
+#if defined(A_OFFSET)
+ // Compute the address of the column sums
+ __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
+
+ // Load the column sums and compute the offset contribution due to A_OFFSET
+#if defined(SUM_COL_HAS_BATCHES)
+ a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
+#else // defined(SUM_COL_HAS_BATCHES)
+ a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)sum_col_addr);
+#endif // defined(SUM_COL_HAS_BATCHES)
+
+ a_offset_s32 *= (VEC_INT)A_OFFSET;
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+ // Compute the address of the row sums
+ __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
+
+ // Compute the offset contribution due to B_OFFSET
+#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
+#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
+#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 *= (VEC_INT)B_OFFSET;
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ VEC_INT biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+ b_offset_s32 += biases_values;
+#endif // defined(ADD_BIAS)
+
+ return (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32;
+}
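+
+/* Illustrative scalar view (comment only) of the value returned by offset_contribution()
+ * for element (x, y), assuming all options are enabled:
+ *
+ *   int offset_term = K_OFFSET               // a_offset * b_offset * k
+ *                   + sum_col[x] * A_OFFSET  // contribution due to the lhs offset
+ *                   + sum_row[y] * B_OFFSET  // contribution due to the rhs offset
+ *                   + bias[x];               // only when -DADD_BIAS is defined
+ */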
+
+#if defined(GEMMLOWP_OFFSET_CONTRIBUTION)
+/* OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is performed in-place.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ */
+__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+ )
+{
+ const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Compute offset contribution
+ VEC_INT offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr,
+ sum_col_stride_x,
+ sum_col_step_x,
+ sum_col_stride_y,
+ sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr,
+ sum_row_stride_x,
+ sum_row_step_x,
+ sum_row_stride_y,
+ sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr,
+ biases_stride_x,
+ biases_step_x,
+ biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+ VEC_INT in_s32_0 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32_0 += offset_term_s32;
+
+ // Store the result with the offset contribution
+ STORE_VECTOR_SELECT(in_s32_, int, mm_result_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
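+
+/* A hypothetical set of build options for the kernel above, reusing the example values from
+ * its documentation:
+ *
+ *   -DGEMMLOWP_OFFSET_CONTRIBUTION -DK_OFFSET=1200 -DA_OFFSET=1 -DB_OFFSET=6
+ *   -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=3
+ */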
+#endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION)
+
+#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN)
+/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and quantize the result down to QASYMM8/QASYMM8_SIGNED.
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B, and quantizes it to QASYMM8/QASYMM8_SIGNED through the output stage (a scalar sketch of the output stage follows the kernel below).
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * This result is quantized down to uint8/int8 using the output stage. The output stage computes the following operations:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ * These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in] result_multipliers_step_x (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
+ * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+ ,
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst)
+#if defined(PER_CHANNEL_QUANTIZATION)
+ ,
+ VECTOR_DECLARATION(result_multipliers),
+ VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+ )
+{
+ const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ // Compute offset contribution
+ VEC_INT offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr,
+ sum_col_stride_x,
+ sum_col_step_x,
+ sum_col_stride_y,
+ sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr,
+ sum_row_stride_x,
+ sum_row_step_x,
+ sum_row_stride_y,
+ sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr,
+ biases_stride_x,
+ biases_step_x,
+ biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+ VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // -------------- OUTPUT STAGE
+
+ // Add the result offset of the output stage
+ in_s32 += (VEC_INT)RESULT_OFFSET;
+
+ // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+ __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
+ __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
+ VEC_INT result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr);
+ VEC_INT result_shifts_values = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr);
+
+ in_s32 *= result_multipliers_values;
+ in_s32 >>= result_shifts_values;
+#else // defined(PER_CHANNEL_QUANTIZATION)
+ in_s32 *= RESULT_MULTIPLIER;
+
+ in_s32 >>= RESULT_SHIFT;
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+ res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
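+
+/* Scalar sketch (comment only) of the output stage applied above, per element, for the
+ * per-tensor path (the per-channel path instead loads the multiplier and shift per column):
+ *
+ *   int v = mm_result + offset_term;                      // offset contribution
+ *   v     = ((v + RESULT_OFFSET) * RESULT_MULTIPLIER) >> RESULT_SHIFT;
+ *   res   = convert to OUTPUT_DATA_TYPE with saturation;  // then optionally clamped to [MIN_BOUND, MAX_BOUND]
+ */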
+#endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN)
+
+#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT)
+/* OpenCL kernel used to add the offset contribution after matrix multiplication and quantize the result down to QASYMM8/QASYMM8_SIGNED using a fixed-point output stage.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to it the offset contribution of matrix A and matrix B, and quantizes it to QASYMM8/QASYMM8_SIGNED through the output stage (a conceptual sketch of the fixed-point output stage follows the kernel below).
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * This result is quantized down to uint8/int8 using the output stage. The output stage computes the following operations:
+ *
+ * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ * These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in] result_multipliers_step_x (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
+ * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+ ,
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst)
+#if defined(PER_CHANNEL_QUANTIZATION)
+ ,
+ VECTOR_DECLARATION(result_multipliers),
+ VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+ )
+{
+ const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Compute offset contribution
+ VEC_INT offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr,
+ sum_col_stride_x,
+ sum_col_step_x,
+ sum_col_stride_y,
+ sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr,
+ sum_row_stride_x,
+ sum_row_step_x,
+ sum_row_stride_y,
+ sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr,
+ biases_stride_x,
+ biases_step_x,
+ biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // -------------- OUTPUT STAGE
+
+ // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+ __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
+ __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
+ VEC_INT result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr);
+ VEC_INT result_shifts_values = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr);
+
+ VEC_INT in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
+ VEC_INT in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
+ in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+
+#if RESULT_SHIFT < 0
+ in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#else // RESULT_SHIFT >= 0
+ in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#endif // RESULT_SHIFT < 0
+
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+ // Add the result offset of the output stage
+ in_s32 += (VEC_INT)RESULT_OFFSET;
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+ res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT)
+
+#undef VEC_INT
+
+#endif // defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+
+#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ * These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+ input_values += biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Add the result offset of the output stage
+ input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET;
+
+ // Multiply by result_mult_int and shift
+ input_values *= RESULT_MULT_INT;
+
+#if RESULT_SHIFT < 0
+ input_values >>= -RESULT_SHIFT;
+#else // RESULT_SHIFT >= 0
+ input_values >>= RESULT_SHIFT;
+#endif // RESULT_SHIFT < 0
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+ res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
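+
+/* Scalar sketch (comment only) of the computation above, per element, assuming RESULT_SHIFT >= 0:
+ *
+ *   int v = src + bias;                                  // bias only when -DADD_BIAS is defined
+ *   v     = ((v + RESULT_OFFSET) * RESULT_MULT_INT) >> RESULT_SHIFT;
+ *   res   = convert to OUTPUT_DATA_TYPE with saturation, then optionally clamped
+ */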
+#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN)
+
+#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ * These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+ input_values += biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#else // RESULT_SHIFT >= 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#endif // RESULT_SHIFT < 0
+
+ // Add the result offset (applied after the shift)
+ input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT;
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+ res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
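+
+/* Background note, an assumption based on the usual GEMMLowp convention rather than on this
+ * file: the pair (RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT) typically encodes a real-valued
+ * rescale factor m as
+ *
+ *   m ~= (RESULT_FIXEDPOINT_MULTIPLIER / 2^31) * 2^(-RESULT_SHIFT)
+ *
+ * with RESULT_FIXEDPOINT_MULTIPLIER an int32 in [2^30, 2^31).
+ */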
+#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)
+
+#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QSYMM16 value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the [-32768..32767] range and cast to QSYMM16.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ * These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QSYMM16
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(short) + y * dst_stride_y + z * dst_stride_z;
+
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+ input_values += biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#else // RESULT_SHIFT >= 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#endif // RESULT_SHIFT < 0
+
+ VEC_DATA_TYPE(short, VEC_SIZE)
+ res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(short, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+ res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)
+
+#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Requantize
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset and scalar scale factor must be passed at compile time using -DOUTPUT_OFFSET and -DREAL_MULTIPLIER
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ * These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w (Optional) Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w (Optional) dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+#if defined(DST_HEIGHT)
+ TENSOR4D_DECLARATION(dst))
+#else // defined(DST_HEIGHT)
+ TENSOR3D_DECLARATION(dst))
+#endif // defined(DST_HEIGHT)
+{
+ // Compute source and destination addresses
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+ input_values += biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Convert to float
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ input_values_f = CONVERT(input_values, VEC_DATA_TYPE(float, VEC_SIZE));
+ input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+ res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
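+
+/* Scalar sketch (comment only) of the float requantization above, per element:
+ *
+ *   float v = (float)(src + bias);                        // bias only when -DADD_BIAS is defined
+ *   v       = round(v * REAL_MULTIPLIER + OUTPUT_OFFSET); // real-valued rescale plus output offset
+ *   res     = convert to OUTPUT_DATA_TYPE with saturation, then optionally clamped
+ */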
+#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)
diff --git a/src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl b/src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl
new file mode 100644
index 0000000000..72fe3d3b89
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_MMUL)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension:
+ *
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is transposed
+ *
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=1, -DK0=1).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=1)
+ * @note The number of output columns processed by the cooperative mmul extension must be passed at compile time using -DMMUL_N0 (e.g., -DMMUL_N0=4)
+ * @note The number of output rows processed by the cooperative mmul extension must be passed at compile time using -DMMUL_M0 (e.g., -DMMUL_M0=4)
+ * @note The number of lhs columns (or rhs rows) processed by the cooperative mmul extension must be passed at compile time using -DMMUL_K0 (e.g., -DMMUL_K0=16)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 4
+ * - N0 = 1, 4, 8
+ * - K0 = 4
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ *
+ * @param[in] lhs_ptr Pointer to the LHS tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_y Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in] lhs_w The size of the width dimension of the LHS tensor
+ * @param[in] lhs_h The size of the height dimension of the LHS tensor
+ * @param[in] lhs_n The size of the depth dimension of the LHS tensor
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
+ * @param[in] rhs_ptr Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in] rhs_w The size of the width dimension of the RHS tensor
+ * @param[in] rhs_h The size of the height dimension of the RHS tensor
+ * @param[in] rhs_n The size of the depth dimension of the RHS tensor
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in] bia_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bia_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bia_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bia_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bia_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p lhs_ptr or S32
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M Number of rows in LHS matrix not reshaped
+ * @param[in] N Number of columns in RHS matrix not reshaped
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ * @param[in] sum_col_ptr (Optional) Pointer to the vector of column sums of the RHS matrix, used when A_OFFSET is defined. Supported data type: S32
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the vector of row sums of the LHS matrix, used when B_OFFSET is defined. Supported data type: S32
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ */
+__kernel void gemmlowp_mm_reshaped_only_rhs_mmul(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#if defined(ADD_BIAS)
+ TENSOR3D_T(bia, BUFFER),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K
+#if defined(A_OFFSET)
+ ,
+ TENSOR3D_T(sum_col, BUFFER)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ TENSOR3D_T(sum_row, BUFFER)
+#endif // defined(B_OFFSET)
+)
+{
+#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_M0)
+#define VEC_SIZE 4 // For int8 types, the input to the mmul instruction is a vector of length 4
+
+ uint x0 = get_global_id(0);
+ uint y0 = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // Get block ID and thread ID within the block
+ uint block_id = (x0 / MMUL_BLOCK_SIZE);
+ uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+
+ // Coordinate within a block
+ uint block_x = thread_id % MMUL_N0;
+ uint block_y = (thread_id / MMUL_M0);
+
+ // Starting destination coordinates
+ uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
+ uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));
+
+ uint lhs_x = VEC_SIZE * block_x;
+ uint lhs_y = dst_y;
+
+ uint rhs_x = VEC_SIZE * N0 * block_y;
+ uint rhs_y = 4 * block_id + block_x;
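+
+ // Note: the RHS has already been reshaped so that each transposed K0xN0
+ // block is stored contiguously; rhs_x/rhs_y therefore address the reshaped
+ // layout rather than the logical (K, N) matrix.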
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(OUT_DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+ for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
+ {
+ TILE(DATA_TYPE, M0, VEC_SIZE, a);
+ T_LOAD(DATA_TYPE, M0, VEC_SIZE, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+ TILE(DATA_TYPE, N0, VEC_SIZE, b);
+ T_LOAD(DATA_TYPE, N0, VEC_SIZE, BUFFER, rhs, 0, 0, 1, VEC_SIZE, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ VEC_TYPE vec_a = (VEC_TYPE)(a[m0].s[0], a[m0].s[1], a[m0].s[2], a[m0].s[3]);
+ VEC_TYPE vec_b = (VEC_TYPE)(b[n0].s[0], b[n0].s[1], b[n0].s[2], b[n0].s[3]);
+ c[m0].s[n0] = arm_matrix_multiply(vec_a, vec_b, c[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += MMUL_K0 * N0 * sizeof(DATA_TYPE);
+ }
+
+ if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
+ {
+ return;
+ }
+
+ if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
+ {
+ return;
+ }
+
+#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+
+ TILE(int, M0, N0, offset_s32);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET;
+ })
+
+#if defined(A_OFFSET)
+
+ TILE(int, 1, N0, a_offset_s32);
+
+ T_LOAD(int, 1, N0, BUFFER, sum_col, dst_x, z, 1, sum_col_stride_z, a_offset_s32);
+
+ a_offset_s32[0].v *= A_OFFSET;
+
+ T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+
+ TILE(int, M0, 1, b_offset_s32);
+
+ T_LOAD(int, M0, 1, BUFFER, sum_row, dst_y, z * M, 1, 4, b_offset_s32);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ offset_s32[m0].v += b_offset_s32[m0].v * B_OFFSET;
+ })
+
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+#if defined(BROADCAST_BIAS)
+ bia_offset_first_element_in_bytes += dst_x * sizeof(ACC_DATA_TYPE) + z * bia_stride_y;
+
+ TILE(int, M0, N0, bias);
+
+ T_LOAD(int, M0, N0, BUFFER, bia, dst_x, dst_y, 1, 1, bias);
+
+ T_ADD(ACC_DATA_TYPE, M0, N0, offset_s32, bias, offset_s32);
+
+#else // defined(BROADCAST_BIAS)
+ bia_offset_first_element_in_bytes += dst_x * sizeof(ACC_DATA_TYPE);
+
+ TILE(int, 1, N0, bias);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ bias[0].v = VLOAD(N0)(0, (__global ACC_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+ }
+ else
+ {
+ VLOAD_PARTIAL(N0, N0_LEFTOVER)
+ (bias[0].v, 0, (__global ACC_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+ }
+
+ T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(ADD_BIAS)
+
+ T_ADD(ACC_DATA_TYPE, M0, N0, c, offset_s32, c);
+ TILE(OUT_DATA_TYPE, M0, N0, c_lp);
+ T_QUANTIZE8(ACC_DATA_TYPE, OUT_DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c, 0, 0, c_lp);
+
+#if defined(MIN_BOUND)
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(OUT_DATA_TYPE, N0))MIN_BOUND);
+ })
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(OUT_DATA_TYPE, N0))MAX_BOUND);
+ })
+#endif // defined(MAX_BOUND)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c_lp[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c_lp[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#else // FUSED_OUTPUT_STAGE_FIXED_POINT
+ // Store
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+#endif // FUSED_OUTPUT_STAGE_FIXED_POINT
+}
+
+#endif // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_MMUL)
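+
+// Work-item to tile mapping used by the kernel above (illustrative sketch,
+// assuming the supported MMUL_M0 = MMUL_N0 = 4, so MMUL_BLOCK_SIZE = 16):
+//
+//   block_id  = x0 / 16;        // which cooperative block this work-item belongs to
+//   thread_id = x0 % 16;        // lane within the cooperative block
+//   block_x   = thread_id % 4;  // column of the lane inside the block
+//   block_y   = thread_id / 4;  // row of the lane inside the block
+//
+// All 16 lanes of a block issue arm_matrix_multiply() together, each
+// contributing a length-4 (VEC_SIZE) slice of the K dimension per iteration,
+// so one loop iteration consumes MMUL_K0 = 16 values of K.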
diff --git a/src/core/CL/cl_kernels/common/gemv.cl b/src/core/CL/cl_kernels/common/gemv.cl
new file mode 100644
index 0000000000..71a372eb29
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemv.cl
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+/** This kernel applies a dot product between each plane of the input tensor and the corresponding column of the reshaped weight tensor.
+ *
+ * @note The data type and the source width and height should be given as preprocessor arguments using -DDATA_TYPE=type, -DSRC_WIDTH=width and -DSRC_HEIGHT=height. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VECTOR_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ int y = get_global_id(1) * 4;
+ int z = get_global_id(2);
+
+ __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
+ __global uchar *input_ptr = src.ptr;
+
+ DATA_TYPE acc0 = (DATA_TYPE)0;
+ DATA_TYPE acc1 = (DATA_TYPE)0;
+ DATA_TYPE acc2 = (DATA_TYPE)0;
+ DATA_TYPE acc3 = (DATA_TYPE)0;
+
+ // This kernel handles 4 rows per thread so that it can reuse the weights
+ for(int i = 0; i < SRC_WIDTH; i += 4)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ weights = vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x));
+
+ int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp0 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp1 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp2 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp3 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3));
+
+ acc0 += dot(weights, tmp0);
+ acc1 += dot(weights, tmp1);
+ acc2 += dot(weights, tmp2);
+ acc3 += dot(weights, tmp3);
+ }
+
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
+
+ int rows_left = SRC_HEIGHT - (y + 4);
+
+ // This check handles the last few rows when SRC_HEIGHT is not divisible by four
+ if(rows_left >= 0)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out = (VEC_DATA_TYPE(DATA_TYPE, 4))(acc0, acc1, acc2, acc3);
+ vstore4(out, 0, (__global DATA_TYPE *)output_ptr);
+ }
+ else
+ {
+ switch(rows_left)
+ {
+ case -1: // three rows left; one is padding
+ *((__global DATA_TYPE *)(output_ptr + 2 * dst_stride_x)) = acc2;
+ case -2: // two rows left; two are padding
+ *((__global DATA_TYPE *)(output_ptr + 1 * dst_stride_x)) = acc1;
+ case -3: // one row left; three are padding
+ *((__global DATA_TYPE *)(output_ptr + 0 * dst_stride_x)) = acc0;
+ break;
+ }
+ }
+}
+
+/** This kernel applies a dot product between each plane of the input tensor and the corresponding column of the reshaped weight tensor.
+ *
+ * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] input_offset Input's quantization offset
+ * @param[in] weights_offset Weights' quantization offset
+ */
+__kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(weights),
+ VECTOR_DECLARATION(dst),
+ const int input_offset,
+ const int weights_offset)
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ int y = get_global_id(1) * 4;
+ int z = get_global_id(2);
+
+ __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
+ __global uchar *input_ptr = src.ptr;
+
+ int acc0 = 0;
+ int acc1 = 0;
+ int acc2 = 0;
+ int acc3 = 0;
+
+ // This kernel handles 4 rows per thread so that it can reuse the weights
+ for(int i = 0; i < SRC_WIDTH; i += 4)
+ {
+ int4 w = convert_int4(vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x))) + (int4)weights_offset;
+
+ int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
+
+ int4 tmp0 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0))) + (int4)input_offset;
+ int4 tmp1 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1))) + (int4)input_offset;
+ int4 tmp2 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2))) + (int4)input_offset;
+ int4 tmp3 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3))) + (int4)input_offset;
+
+ // Accumulate
+ acc0 += tmp0.s0 * w.s0 + tmp0.s1 * w.s1 + tmp0.s2 * w.s2 + tmp0.s3 * w.s3;
+ acc1 += tmp1.s0 * w.s0 + tmp1.s1 * w.s1 + tmp1.s2 * w.s2 + tmp1.s3 * w.s3;
+ acc2 += tmp2.s0 * w.s0 + tmp2.s1 * w.s1 + tmp2.s2 * w.s2 + tmp2.s3 * w.s3;
+ acc3 += tmp3.s0 * w.s0 + tmp3.s1 * w.s1 + tmp3.s2 * w.s2 + tmp3.s3 * w.s3;
+ }
+
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
+
+ int rows_left = SRC_HEIGHT - (y + 4);
+
+ // This check handles the last few rows when SRC_HEIGHT is not divisible by four
+ if(rows_left >= 0)
+ {
+ vstore4((int4)(acc0, acc1, acc2, acc3), 0, (__global int *)output_ptr);
+ }
+ else
+ {
+ switch(rows_left)
+ {
+ case -1: // three rows left; one is padding
+ *((__global int *)(output_ptr + 2 * dst_stride_x)) = acc2;
+ case -2: // two rows left; two are padding
+ *((__global int *)(output_ptr + 1 * dst_stride_x)) = acc1;
+ case -3: // one row left; three are padding
+ *((__global int *)(output_ptr + 0 * dst_stride_x)) = acc0;
+ break;
+ }
+ }
+}
+#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */
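+
+// Reference computation for the gemm_mv kernels above (illustrative pseudo-C,
+// not compiled; "planes" is a placeholder for the number of Z slices):
+//
+//   for (int z = 0; z < planes; ++z)
+//       for (int y = 0; y < SRC_HEIGHT; ++y) {
+//           acc = 0;
+//           for (int x = 0; x < SRC_WIDTH; ++x)
+//               acc += src[z][y][x] * weights[z][x];
+//           dst[z * SRC_HEIGHT + y] = acc;
+//       }
+//
+// The kernels vectorize this over 4 rows and 4 columns at a time so that each
+// vload4 of the weights is shared by four row accumulators; the quantized
+// variant additionally adds input_offset/weights_offset before accumulating.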
diff --git a/src/core/CL/cl_kernels/common/generate_proposals.cl b/src/core/CL/cl_kernels/common/generate_proposals.cl
new file mode 100644
index 0000000000..bfe1922ac2
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/generate_proposals.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Generate all the regions of interest based on the image size and the anchors passed in. For each element (x,y) of the
+ * grid, it will generate NUM_ANCHORS rois, obtained by shifting the grid position to match the anchor.
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32
+ * -# -DHEIGHT= Height of the feature map on which this kernel is applied
+ * -# -DWIDTH= Width of the feature map on which this kernel is applied
+ * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois per each pixel
+ * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE)
+ * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi
+ *
+ * @param[in] anchors_ptr Pointer to the anchors tensor. Supported data types: F16/F32
+ * @param[in] anchors_stride_x Stride of the anchors tensor in X dimension (in bytes)
+ * @param[in] anchors_step_x anchors_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] anchors_stride_y Stride of the anchors tensor in Y dimension (in bytes)
+ * @param[in] anchors_step_y anchors_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] anchors_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] anchors_step_z anchors_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] anchors_offset_first_element_in_bytes The offset of the first element in the anchors tensor
+ * @param[out] rois_ptr Pointer to the rois. Supported data types: same as @p in_ptr
+ * @param[out] rois_stride_x Stride of the rois in X dimension (in bytes)
+ * @param[out] rois_step_x rois_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[out] rois_stride_y Stride of the rois in Y dimension (in bytes)
+ * @param[out] rois_step_y rois_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[out] rois_stride_z Stride of the rois in Z dimension (in bytes)
+ * @param[out] rois_step_z rois_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] rois_offset_first_element_in_bytes The offset of the first element in the rois
+ */
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
+__kernel void generate_proposals_compute_all_anchors(
+ VECTOR_DECLARATION(anchors),
+ VECTOR_DECLARATION(rois))
+{
+ Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
+ Vector rois = CONVERT_TO_VECTOR_STRUCT(rois);
+
+ const unsigned int idx = get_global_id(0);
+ // Find the index of the anchor
+ const unsigned int anchor_idx = idx % NUM_ANCHORS;
+
+ // Find which shift this thread is using
+ const unsigned int shift_idx = idx / NUM_ANCHORS;
+
+ // Compute the shift on the X and Y direction (the shift depends exclusively on the thread index)
+ const float shift_x = (float)(shift_idx % WIDTH) * STRIDE;
+ const float shift_y = (float)(shift_idx / WIDTH) * STRIDE;
+
+ const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+ shift = (VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
+
+ // Read the given anchor
+ const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+ anchor = vload4(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS));
+
+ // Apply the shift to the anchor
+ const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+ shifted_anchor = anchor + shift;
+
+ vstore4(shifted_anchor, 0, (__global DATA_TYPE *)rois.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
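+
+// The per-roi math above, spelled out (illustrative, with the usual
+// (x1, y1, x2, y2) box layout): work-item idx selects anchor a = idx % NUM_ANCHORS
+// and grid cell s = idx / NUM_ANCHORS, then
+//
+//   shift_x = (s % WIDTH) * STRIDE;
+//   shift_y = (s / WIDTH) * STRIDE;
+//   roi     = anchor[a] + (shift_x, shift_y, shift_x, shift_y);
+//
+// i.e. both corners of the anchor box are translated by the same grid offset.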
diff --git a/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl
new file mode 100644
index 0000000000..70f861c4b7
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+/** Generate all the regions of interest based on the image size and the anchors passed in. For each element (x,y) of the
+ * grid, it will generate NUM_ANCHORS rois, obtained by shifting the grid position to match the anchor.
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: QASYMM8
+ * -# -DHEIGHT= Height of the feature map on which this kernel is applied
+ * -# -DWIDTH= Width of the feature map on which this kernel is applied
+ * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois per each pixel
+ * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE)
+ * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi
+ *
+ * @param[in] anchors_ptr Pointer to the anchors tensor. Supported data types: QASYMM8
+ * @param[in] anchors_stride_x Stride of the anchors tensor in X dimension (in bytes)
+ * @param[in] anchors_step_x anchors_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] anchors_stride_y Stride of the anchors tensor in Y dimension (in bytes)
+ * @param[in] anchors_step_y anchors_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] anchors_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] anchors_step_z anchors_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] anchors_offset_first_element_in_bytes The offset of the first element in the anchors tensor
+ * @param[out] rois_ptr Pointer to the rois. Supported data types: same as @p in_ptr
+ * @param[out] rois_stride_x Stride of the rois in X dimension (in bytes)
+ * @param[out] rois_step_x rois_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[out] rois_stride_y Stride of the rois in Y dimension (in bytes)
+ * @param[out] rois_step_y rois_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[out] rois_stride_z Stride of the rois in Z dimension (in bytes)
+ * @param[out] rois_step_z rois_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] rois_offset_first_element_in_bytes The offset of the first element in the rois
+ */
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS) && defined(OFFSET) && defined(SCALE)
+__kernel void generate_proposals_compute_all_anchors_quantized(
+ VECTOR_DECLARATION(anchors),
+ VECTOR_DECLARATION(rois))
+{
+ Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
+ Vector rois = CONVERT_TO_VECTOR_STRUCT(rois);
+
+ const size_t idx = get_global_id(0);
+ // Find the index of the anchor
+ const size_t anchor_idx = idx % NUM_ANCHORS;
+
+ // Find which shift this thread is using
+ const size_t shift_idx = idx / NUM_ANCHORS;
+
+ // Compute the shift on the X and Y direction (the shift depends exclusively on the thread index)
+ const float shift_x = (float)(shift_idx % WIDTH) * STRIDE;
+ const float shift_y = (float)(shift_idx / WIDTH) * STRIDE;
+
+ VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
+ shift = (VEC_DATA_TYPE(float, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
+
+ // Read the given anchor
+ VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
+ anchor = DEQUANTIZE(VLOAD(NUM_ROI_FIELDS)(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS)), OFFSET, SCALE, DATA_TYPE, NUM_ROI_FIELDS);
+
+ // Apply the shift to the anchor
+ VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
+ shifted_anchor = anchor + shift;
+
+ VSTORE(NUM_ROI_FIELDS)
+ (QUANTIZE(shifted_anchor, OFFSET, SCALE, DATA_TYPE, NUM_ROI_FIELDS), 0, (__global DATA_TYPE *)rois.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS) && defined(OFFSET) && defined(SCALE)
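+
+// Compared to the float variant, the quantized kernel round-trips each anchor
+// through its QASYMM8 representation (illustrative sketch, written for
+// NUM_ROI_FIELDS = 4 and assuming the standard asymmetric-quantization
+// helpers, i.e. dequant = (q - OFFSET) * SCALE and quant = round(x / SCALE + OFFSET)):
+//
+//   float4 a   = (convert_float4(anchor_q8) - OFFSET) * SCALE;      // DEQUANTIZE
+//   float4 r   = a + (float4)(shift_x, shift_y, shift_x, shift_y);
+//   uchar4 roi = convert_uchar4_sat(round(r / SCALE + OFFSET));     // QUANTIZE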
diff --git a/src/core/CL/cl_kernels/common/instance_normalization.cl b/src/core/CL/cl_kernels/common/instance_normalization.cl
new file mode 100644
index 0000000000..f9b3cd3620
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/instance_normalization.cl
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
+/** This function computes the mean and variance of each plane of the input tensor and provides it as output.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
+ * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void compute_mean_var(
+ TENSOR4D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+#if defined(NHWC)
+ const int ch = get_global_id(0); // Current channel
+ const int batch = get_global_id(1); // Current batch
+ const int elements_plane = DIM_Y * DIM_Z;
+ INTERNAL_DATA_TYPE part_sum = 0.f;
+ INTERNAL_DATA_TYPE part_sum_sq = 0.f;
+ const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
+
+ for(int i_w = 0; i_w < DIM_Y; ++i_w)
+ {
+ for(int i_h = 0; i_h < DIM_Z; ++i_h)
+ {
+ INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE) * ((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
+ part_sum += data;
+ part_sum_sq += data * data;
+ }
+ }
+
+ INTERNAL_DATA_TYPE mean = (part_sum / elements_plane);
+ INTERNAL_DATA_TYPE var = (part_sum_sq / elements_plane) - (mean * mean);
+ __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
+ *output_address0 = mean;
+ __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
+ *output_address1 = var;
+#else // !defined(NHWC)
+ const int ch = get_global_id(2) % DIM_Z; // Current channel
+ const int batch = get_global_id(2) / DIM_Z; // Current batch
+ const int elements_plane = DIM_X * DIM_Y;
+
+ VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+ part_sum = 0.f;
+ VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+ part_sum_sq = 0.f;
+ // Calculate partial sum
+ for(int y = 0; y < DIM_Y; ++y)
+ {
+ int x = 0;
+ for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+ {
+ // Load data
+ VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+ data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));
+ part_sum += data;
+ part_sum_sq += data * data;
+ }
+ // Left-overs loop
+ for(; x < DIM_X; ++x)
+ {
+ INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)));
+ part_sum.s0 += data;
+ part_sum_sq.s0 += data * data;
+ }
+ }
+ // Perform reduction
+#if VEC_SIZE > 8
+ part_sum.s01234567 += part_sum.s89abcdef;
+ part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
+#endif // VEC_SIZE > 8
+#if VEC_SIZE > 4
+ part_sum.s0123 += part_sum.s4567;
+ part_sum_sq.s0123 += part_sum_sq.s4567;
+#endif // VEC_SIZE > 4
+#if VEC_SIZE > 2
+ part_sum.s01 += part_sum.s23;
+ part_sum_sq.s01 += part_sum_sq.s23;
+#endif // VEC_SIZE > 2
+ part_sum.s0 += part_sum.s1;
+ part_sum_sq.s0 += part_sum_sq.s1;
+
+ INTERNAL_DATA_TYPE sum = (INTERNAL_DATA_TYPE)part_sum.s0;
+ INTERNAL_DATA_TYPE sum_sq = (INTERNAL_DATA_TYPE)part_sum_sq.s0;
+
+ const INTERNAL_DATA_TYPE mean = (sum / elements_plane);
+ const INTERNAL_DATA_TYPE var = (sum_sq / elements_plane) - (mean * mean);
+
+ __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
+ *output_address0 = mean;
+ __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
+ *output_address1 = var;
+
+#endif // defined(NHWC)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */
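+
+// Statistics produced above, per (channel, batch) plane (illustrative): with
+// n elements in the plane (DIM_Y * DIM_Z for NHWC, DIM_X * DIM_Y otherwise),
+//
+//   mean = sum(x) / n;
+//   var  = sum(x * x) / n - mean * mean;
+//
+// i.e. the biased variance via the sum-of-squares identity, which lets both
+// moments be accumulated in a single pass over the plane.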
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
+/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
+ * @attention The scale scalar value applied to the normalized tensor should be passed using the -DGAMMA=value compile flag, e.g. -DGAMMA=1.3
+ * @attention The offset scalar value applied to the normalized tensor should be passed using the -DBETA=value compile flag, e.g. -DBETA=2.4
+ * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
+ * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void instance_normalization(
+ TENSOR4D_DECLARATION(input),
+ TENSOR3D_DECLARATION(mean_var)
+#ifndef IN_PLACE
+ ,
+ TENSOR4D_DECLARATION(output)
+#endif /* IN_PLACE */
+)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D mean_var = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(mean_var);
+#ifndef IN_PLACE
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
+#endif /* IN_PLACE */
+
+#if defined(NHWC)
+ const int ch = get_global_id(0); // Current channel
+ const int batch = get_global_id(2); // Current batch
+#else /* defined(NHWC) */
+ const int ch = get_global_id(2) % DIM_Z; // Current channel
+ const int batch = get_global_id(2) / DIM_Z; // Current batch
+#endif /* defined(NHWC) */
+
+ const __global INTERNAL_DATA_TYPE *mean_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 0, batch);
+ const __global INTERNAL_DATA_TYPE *var_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 1, batch);
+ const INTERNAL_DATA_TYPE mean = (INTERNAL_DATA_TYPE) * mean_ptr;
+ const INTERNAL_DATA_TYPE var = (INTERNAL_DATA_TYPE) * var_ptr;
+ const INTERNAL_DATA_TYPE multip = GAMMA / sqrt(var + EPSILON);
+ const INTERNAL_DATA_TYPE beta = (INTERNAL_DATA_TYPE)BETA;
+
+#if defined(NHWC)
+ const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
+#ifndef IN_PLACE
+ const int out_offset = output_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
+#endif /* IN_PLACE */
+
+ for(int i_w = 0; i_w < DIM_Y; ++i_w)
+ {
+ for(int i_h = 0; i_h < DIM_Z; ++i_h)
+ {
+ __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
+#ifdef IN_PLACE
+ __global DATA_TYPE *output_address = input_address;
+#else /* !IN_PLACE */
+ __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
+#endif /* IN_PLACE */
+ *(output_address) = (*(input_address) - mean) * multip + beta;
+ }
+ }
+#else // !defined(NHWC)
+ for(int y = 0; y < DIM_Y; ++y)
+ {
+ int x = 0;
+ for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+ {
+ __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+#ifdef IN_PLACE
+ __global DATA_TYPE *output_address = input_address;
+#else /* !IN_PLACE */
+ __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+#endif /* IN_PLACE */
+
+ VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+ data = CONVERT(VLOAD(VEC_SIZE)(0, input_address), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));
+
+ VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+ res = (data - mean) * multip + beta;
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, output_address);
+ }
+ // Left-overs loop
+ for(; x < DIM_X; ++x)
+ {
+ __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+#ifdef IN_PLACE
+ __global DATA_TYPE *output_address = input_address;
+#else /* !IN_PLACE */
+ __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+#endif /* IN_PLACE */
+ *(output_address) = (*(input_address) - mean) * multip + beta;
+ }
+ }
+#endif // defined(NHWC)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */
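+
+// The normalization applied above, per element (illustrative):
+//
+//   out = (in - mean) * GAMMA / sqrt(var + EPSILON) + BETA;
+//
+// where mean/var come from the compute_mean_var pass and GAMMA, BETA and
+// EPSILON are compile-time scalars, so a single scale factor
+// multip = GAMMA / sqrt(var + EPSILON) can be hoisted out per plane.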
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/common/l2_normalize.cl
index fbe3406239..fbe3406239 100644
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ b/src/core/CL/cl_kernels/common/l2_normalize.cl
diff --git a/src/core/CL/cl_kernels/common/mat_mul.cl b/src/core/CL/cl_kernels/common/mat_mul.cl
new file mode 100644
index 0000000000..c7ef8ae52b
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mat_mul.cl
@@ -0,0 +1,708 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#ifdef BIAS
+// This function performs in-place bias addition for float/half datatypes when bias is enabled.
+// Note: The tile's dimensions used for the LHS and RHS matrices (M0 and N0) must be passed at compile time using -DM0 and -DN0 (e.g. -DM0=4, -DN0=8).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(DATA_TYPE, M0, N0, acc), uint x)
+{
+ TILE(DATA_TYPE, 1, N0, bias_tile);
+
+ // below expands to use bias_ptr and bias_offset_first_element_in_bytes
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
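+
+// Example (illustrative): with M0 = 4 and N0 = 8, bias_tile holds a single row
+// of 8 bias values which T_ELTWISE_BROADCAST_ADD_X adds to each of the 4 rows
+// of acc.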
+
+#if defined(MAT_MUL_NATIVE_NT_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
+ * - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_nt_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(DATA_TYPE, M0, N0, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = 0.f;
+ })
+
+ const int rhs_z = z * rhs_h;
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, K0, N0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, b, acc);
+
+ lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+ }
+
+#if K % K0 != 0
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, 1, N0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, b, acc);
+
+ lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+ }
+#endif // K % K0 != 0
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_NT_NT)
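+
+// Illustrative view of the boundary handling above: indirect_buffer clamps the
+// destination row index of the final store. For example, with M0 = 4 and
+// PARTIAL_STORE_M0 = 2, a work-item on the partial edge (y_cond true) stores
+// rows {0, 1, 1, 1}, so the out-of-range tile rows collapse onto the last
+// valid row instead of writing past M.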
+
+#if defined(MAT_MUL_NATIVE_NT_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
+ * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+                                  TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(DATA_TYPE, M0, N0, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = 0.f;
+ })
+
+ const int rhs_z = z * rhs_h;
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+        // This part is written to decrease the number of loop unrollings caused
+        // by T_MMUL. The NT/NT version is partly vectorized, needs fewer loop
+        // unrollings, and the code behaves as expected. Although this is not
+        // a performant solution for the specified architecture, it is necessary
+        // to overcome some limitations.
+ TILE(DATA_TYPE, K0, N0, bt);
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ bt[j].s[i] = b[i].s[j];
+ })
+ })
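+        // Illustration of the transpose just performed (assumed K0 = 2, N0 = 3):
+        //   b = [ b00 b01 ]                       bt = [ b00 b10 b20 ]
+        //       [ b10 b11 ]   is rearranged into       [ b01 b11 b21 ]
+        //       [ b20 b21 ]
+        // so that the NT/NT T_MMUL below can be used instead of the NT/T variant.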
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, bt, acc);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, T, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+ }
+
+#if K % K0 != 0
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ // See the main loop for the explanation of this part
+ TILE(DATA_TYPE, 1, N0, bt);
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ bt[0].s[i] = b[i].s[0];
+ })
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, bt, acc);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, T, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+ }
+#endif // K % K0 != 0
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_NT_T)
+
+#if defined(MAT_MUL_NATIVE_T_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed - buffer only for lhs/dst, buffer or cl_image for rhs
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function should be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used for the min and max bounds of bounded activation functions.
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
+ * - K0 > 0
+ * @note Values > 8 for M0 and K0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_t_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(DATA_TYPE, M0, N0, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = 0.f;
+ })
+
+ const int rhs_z = z * rhs_h;
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, K0, M0, a);
+ TILE(DATA_TYPE, K0, N0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ // For explanation, see mat_mul_native_nt_t
+ TILE(DATA_TYPE, M0, K0, at);
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ at[j].s[i] = a[i].s[j];
+ })
+ })
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, b, acc);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, NT, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
+ }
+
+#if K % K0 != 0
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, 1, M0, a);
+ TILE(DATA_TYPE, 1, N0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ // For explanation, see mat_mul_native_nt_t
+ TILE(DATA_TYPE, M0, 1, at);
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ at[j].s[0] = a[0].s[j];
+ })
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, b, acc);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, NT, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
+ }
+#endif // K % K0 != 0
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_T_NT)
+
+#if defined(MAT_MUL_NATIVE_T_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed - buffer only for lhs/dst, buffer or cl_image for rhs
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function should be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used for the min and max bounds of bounded activation functions.
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
+ * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_t_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(DATA_TYPE, M0, N0, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = 0.f;
+ })
+
+ const int rhs_z = z * rhs_h;
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, K0, M0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ // For explanation, see mat_mul_native_nt_t
+ TILE(DATA_TYPE, M0, K0, at);
+ TILE(DATA_TYPE, K0, N0, bt);
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ at[j].s[i] = a[i].s[j];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ bt[j].s[i] = b[i].s[j];
+ })
+ })
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, bt, acc);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, T, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
+ }
+
+#if K % K0 != 0
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, 1, M0, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ // For explanation, see mat_mul_native_nt_t
+ TILE(DATA_TYPE, M0, 1, at);
+ TILE(DATA_TYPE, 1, N0, bt);
+
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ at[j].s[0] = a[0].s[j];
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ bt[0].s[i] = b[i].s[0];
+ })
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, bt, acc);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, T, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
+ }
+#endif // K % K0 != 0
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_T_T)
diff --git a/src/core/CL/cl_kernels/common/mat_mul_mmul.cl b/src/core/CL/cl_kernels/common/mat_mul_mmul.cl
new file mode 100644
index 0000000000..e549da86d4
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mat_mul_mmul.cl
@@ -0,0 +1,946 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#ifdef BIAS
+// This function performs in-place bias addition for float and half data types when bias is enabled.
+// Note: The tile's dimensions used for the LHS and RHS matrices (M0, N0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(DATA_TYPE, M0, N0, acc), uint x)
+{
+ TILE(DATA_TYPE, 1, N0, bias_tile);
+
+ // below expands to use bias_ptr and bias_offset_first_element_in_bytes
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, M0, N0, acc, bias_tile, acc);
+}
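+// Sketch of the effect (illustrative, not a second implementation): for each accumulator row m,
+// acc[m][n] += bias[x + n] for n in [0, N0), i.e. a single row of the bias tensor, starting at
+// output column x, is broadcast-added across all M0 rows of the tile.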
+#endif // defined(BIAS)
+
+#if defined(MAT_MUL_NATIVE_MMUL_NT_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS non-transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
+ * @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] M Number of rows in LHS matrix
+ * @param[in] N Number of columns in RHS matrix
+ * @param[in]  K                                 Number of columns in LHS matrix and rows in RHS matrix, which is a multiple of MMUL_K0.
+ */
+__kernel void mat_mul_native_mmul_nt_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0) // MMUL block size for the output matrix
+
+ // The output/destination matrix is divided into "sections". Each section is filled by a group of
+ // threads of size MMUL_BLOCK_SIZE, bundled together according to GWS_x.
+ // Each thread writes to a tile of M0 x N0 (the usual output block size for a thread) in the output matrix.
+ // Therefore, the section dimensions are (MMUL_M0 x M0) x (MMUL_N0 x N0).
+
+    // The GWS is constructed in such a way that the y global id is the y section coordinate,
+    // and the x global id is a transformed thread id: MMUL_BLOCK_SIZE consecutive threads
+    // in the x dimension correspond to one section.
+    // This can be visualized as first obtaining the coordinates of all the sections:
+    //   x = [0, (N / N0) / MMUL_N0)  --> (N / N0) / MMUL_N0 is the number of sections in the x dimension
+    //   y = [0, (M / M0) / MMUL_M0)  --> (M / M0) / MMUL_M0 is the number of sections in the y dimension
+    // and then multiplying the x coordinates by MMUL_BLOCK_SIZE (the number of threads per section)
+    // to get the consecutive thread ids in the x dimension:
+    //   x = [0, ((N / N0) / MMUL_N0) * MMUL_N0 * MMUL_M0)
+    //   x = [0, (N / N0) * MMUL_M0)
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Within these sections, each thread writes onto a small output block of size M0 x N0
+ // in row major order. A section divided into tiles can be visualized as below.
+ //
+ // (Figure 1)
+ // A Section in the Output Matrix
+ //
+ // _____N0__________N0____________________N0____
+ // | | | | |
+ // | | | | |
+ // M0 | Thread 1 | Thread 2 | ... | Thread |
+ // | | | | MMUL_N0 |
+ // |___________|__________|_________|___________|
+ // | | | |
+ // | | | |
+ // M0 | Thread | . | |
+ // | MMUL_N0+1 | . | | (M0 x MMUL_M0)
+ // |___________| . | |
+ // | . | |
+ // | . | |
+ // | . | |
+ // | |___________|
+ // | | |
+ // | | Thread |
+ // M0 | | MMUL_N0 x |
+ // | | MMUL_M0 |
+ // |________________________________|___________|
+ // N0 x MMUL_N0
+ //
+    // The output matrix has several of these sections. As shown above, each section
+    // will be filled by a separate thread group of size MMUL_BLOCK_SIZE. The overall
+    // section layout of the output matrix is as below. For instance, S(1,1) will be filled
+    // by MMUL_BLOCK_SIZE (possibly equal to 16) threads, as will S(0,1) and the others.
+ //
+ // (Figure 2)
+ // DST Matrix
+ // ____________________________________
+ // | | | | |
+ // | S(0,0) | S(0,1) | ... | S(0, X) |
+ // |________|________|_______|_________|
+ // | | | | |
+ // | S(1,0) | S(1,1) | ... | S(1, X) |
+ // |________|________|_______|_________|
+ // | . | | |
+ // | . | | | Y = (M / M0) / MMUL_M0 - 1 (Max possible section y coordinate)
+ // | . | | | X = (N / N0) / MMUL_N0 - 1 (Max possible section x coordinate)
+ // |________|________|_________________|
+ // | | | | | S(y, x) denotes the section, and y and x are computed in
+ // | S(Y,0) | S(Y,1) | | S(Y, X) | section_y, section_x respectively.
+ // |________|________|_______|_________|
+ //
+ //
+ //
+ //
+    // A complete view involving the three matrices is given below. It exemplifies how the section S(0,0) is computed.
+ //
+ // (Figure 3)
+ // Complete View
+ //
+ // LHS Matrix RHS Matrix DST Matrix
+ //
+ // ___MMUL_K0___________ __MMUL_N0 x N0____________ ___MMUL_N0 x N0____________________
+ // /|xxxxxxxxxx| | /|xxxxxxxxxxxxxxx| | /|xxxxxxxxxxxxxxxxxxx| |
+    //           /  |xxxxxxxxxx|          |   MMUL_K0 ||xxxxxxxxxxxxxxx|         |          /  |xxxxxxxxxxxxxxxxxxx|         |
+ // MMUL_M0 | |xxxxxxxxxx| ---> | ||xxxxxxxxxxxxxxx| . . . | MMUL_M0 | |xxxxxxxxxxxxxxxxxxx| |
+ // x M0 | |xxxxxxxxxx| | \|_______________|_________| x M0 | |xxxxxxxxxxxxxxxxxxx| ... |
+ // | |xxxxxxxxxx| | | | | |xxxxxxxxxxxxxxxxxxx| |
+ // | |xxxxxxxxxx| | x | | | = \ |xxxxxxxxxxxxxxxxxxx| |
+ // \|__________|_________| | | | \|___________________| |
+ // | | | \/ | | |
+ // | , | |_________________________| | . |
+ // | , | | . |
+ // | , | | . |
+ // |____________________| |_________________________________|
+ //
+    // Horizontal and vertical arrows show the direction of the K loop (the main loop in the kernel).
+    // Each output section shown above is a zoomed-out version of Figure 1.
+ //
+    // In each iteration of the main loop, the LHS matrix is traversed rightward and the RHS matrix downward:
+    // the LHS section of (MMUL_M0 x M0) x MMUL_K0 and the RHS section of MMUL_K0 x (MMUL_N0 x N0) are multiplied
+    // "cooperatively" using arm_matrix_multiply calls, and the result is accumulated over the output (DST) section
+    // of size (MMUL_M0 x M0) x (MMUL_N0 x N0) shown with 'x' signs.
+ //
+ // If it was a single thread, this multiplication would have been straightforward with a T_MMUL call.
+ // However, since it involves multiple threads working together using the aforementioned extension, it
+ // works slightly differently.
+ //
+ // Here is how threads access the LHS and RHS matrices:
+ // (Assume MMUL_K0 = MMUL_N0 = MMUL_M0 = 4 because the following diagram is heavily dependent on this)
+ //
+ // (Figure 4)
+ // Thread Access Layouts in LHS & RHS matrices
+ //
+ // LHS matrix RHS Matrix
+ // ___________________________ __________N0 times______N0 times____________________N0 times_______
+ // |__T0__|__T1__|__T2__|__T3__| |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|
+ // |__T0__| ... | |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|
+ // M0 | . . | |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|
+ // Times | . . | |__T12_|_____|__T12_|__T13_|______|__T13_|_____|__T15_|_____|__T15_|
+ // | . . | X
+ // |__T0__|__T1__|__T2__|__T3__|
+ // |__T4__|__T5__|__T6__|__T7__|
+ // |__T4__|__T5__|__T6__|__T7__|
+ // M0 | . . |
+ // Times | . . |
+ // | . . |
+ // |__T4__|__T5__|__T6__|__T7__|
+ // |__T8__|__T9__|__T10_|__T11_|
+ // M0 | . |
+ // Times | . |
+ // | . |
+ // |__T12_|__T13_|__T14_|__T15_|
+ // M0 | . |
+ // Times | . |
+ // | . |
+ // |__T12_|__T13_|__T14_|__T15_|
+ //
+ //
+    // This access layout is designed such that the threads access contiguous elements of each matrix (in terms of row/column).
+    // To multiply these large sections, the arm_matrix_multiply call is made for each of the M0xN0 elements. So, for each
+    // combination of m0 and n0 (iterators of M0 and N0 from 0 to M0-1 and N0-1 respectively), one arm_matrix_multiply call is
+    // made, and MMUL_BLOCK_SIZE threads compute the result.
+ //
+ // The matrix multiplication taking place in this extension
+ // is an "interleaved" one, because, for example, if m0=0 and n0=0, i.e. the first iteration, we would use the first,
+ // M0-th, 2M0-th and 3M0-th rows of the LHS matrix. Similarly, we jump N0 steps in the RHS matrix. This is how we access
+ // one element for each thread in a single (m0, n0) loop.
+ //
+ // For example, if we have
+ // - a 8 x 4 LHS section
+ // - 4 x 8 RHS section
+ // - Each vector variable ai, bj represent a 4x1 vector
+ // - ^T (superscript T) denotes transpose
+ // - M0 = N0 = 2
+ // - MMUL_N0 = MMUL_M0 = MMUL_K0 = 4
+ //
+ // (Figure 5)
+ // Mathematical view of the Matrix Multiplication
+ //
+    //              LHS                    RHS                                   DST
+    //    [ a1^T ]         [ b1 b2 b3 b4 b5 b6 b7 b8 ]        [ a1^Tb1  a1^Tb2  a1^Tb3  ...  a1^Tb8 ]
+    //    [ a2^T ]                    4 x 8                   [ a2^Tb1  a2^Tb2  a2^Tb3  ...  a2^Tb8 ]
+    //    [ a3^T ]                                            [                                     ]
+    //    [ a4^T ]    =                                       [    .                            .   ]
+    //    [ a5^T ]    X                                       [    .                            .   ]
+    //    [ a6^T ]                                            [    .                            .   ]
+    //    [ a7^T ]                                            [                                     ]
+    //    [ a8^T ]                                            [ a8^Tb1  a8^Tb2  a8^Tb3  ...  a8^Tb8 ]
+    //     8 x 4                                                               8 x 8
+ //
+ //
+ // For the first iteration, i.e. (m0, n0) = (0, 0), the arm_matrix_multiply would multiply the following matrices:
+ //
+    //    [ a1^T ]     [ b1 b3 b5 b7 ]     [ a1^Tb1  a1^Tb3  a1^Tb5  a1^Tb7 ]
+    //    [ a3^T ]  x                   =  [ a3^Tb1  a3^Tb3  a3^Tb5  a3^Tb7 ]
+    //    [ a5^T ]         4 x 4           [ a5^Tb1  a5^Tb3  a5^Tb5  a5^Tb7 ]
+    //    [ a7^T ]                         [ a7^Tb1  a7^Tb3  a7^Tb5  a7^Tb7 ]
+    //     4 x 4                                         4 x 4
+ // The elements calculated in the 4x4 output block are the "interleaved" elements in the DST above.
+ // When we follow for each combination of (m0, n0), every element of the DST matrix "section" is filled.
+ //
+
+    // Get thread coordinates within an mmul block (of size MMUL_BLOCK_SIZE)
+    // Since threads are grouped in the x dimension, the x-dim global id modulo
+    // MMUL_BLOCK_SIZE is the thread id within the group, ranging from 0 to
+    // MMUL_BLOCK_SIZE - 1, because the thread numbering is in row-major order.
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
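+    // Illustration (assumed MMUL_M0 = MMUL_N0 = 4, so MMUL_BLOCK_SIZE = 16): a work-item with
+    // x0 = 37 belongs to section_x = 37 / 16 = 2 and has thread_id = 37 % 16 = 5, which
+    // decomposes into thread_x = 5 % 4 = 1 and thread_y = 5 / 4 = 1.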
+
+ // Starting destination coordinates
+ // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+ // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results
+ // Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.
+ // The unclamped dst coordinates can be calculated easily from the output section coordinates and the thread coordinates (see above figure).
+
+ // See Figure 1 & 2. Thread step size is N0 and M0,
+ // Section step size is N0 x MMUL_N0 and M0 x MMUL_M0
+ // respectively for x, y dimensions.
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
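+    // Illustration of the clamp (assumed N = 10, N0 = 4): a thread with dst_x_unclamped = 8 gets
+    // dst_x = min(8, N - N0) = 6, so its tile covers columns 6..9 instead of 8..11; any overlap
+    // with a neighbouring tile is written with identical values, and nothing is read or written
+    // out of bounds.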
+
+ // Starting LHS coordinates
+ const uint lhs_x = thread_x;
+ const uint lhs_y = dst_y;
+
+ // Starting RHS coordinates
+ const uint rhs_x = dst_x;
+ const uint rhs_y = thread_y;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+    // The MMUL extension accumulates the result in F32 for both F32 and F16
+ TILE(float, M0, N0, c_f32);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_f32[i].v = 0;
+ })
+
+ for(int k = 0; k < K; k += MMUL_K0)
+ {
+ // A tile of M0xK0 but K0 must be set to 1
+ TILE(DATA_TYPE, M0, 1, a);
+ // A tile of K0xN0 but K0 must be set to 1
+ TILE(DATA_TYPE, 1, N0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;
+ }
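+    // Putting the loop together (matching the 8x4 by 4x8 example in Figure 5, with M0 = N0 = 2 and
+    // MMUL_M0 = MMUL_N0 = MMUL_K0 = 4): each k step issues M0 * N0 = 4 arm_matrix_multiply calls per
+    // thread, and the MMUL_BLOCK_SIZE = 16 threads of the block cooperatively consume one
+    // (MMUL_M0 x M0) x MMUL_K0 LHS section and one MMUL_K0 x (MMUL_N0 x N0) RHS section.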
+
+    // Threads "outside" the dst bound do not write, but they still have to "read" (participate in the cooperative arm_matrix_multiply calls). That is why this early exit must happen after arm_matrix_multiply
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if defined(HALF_PRECISION)
+ TILE(DATA_TYPE, M0, N0, c);
+
+ // Conversion required for the half precision
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = c_f32[m0].s[n0];
+ })
+ })
+#else // defined(HALF_PRECISION)
+#define c c_f32
+#endif // defined(HALF_PRECISION)
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(MAT_MUL_NATIVE_MMUL_NT_NT)
+
+#if defined(MAT_MUL_NATIVE_MMUL_T_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
+ * @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=4). K must be a multiple of MMUL_K0
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_T_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] M Number of rows in DST matrix
+ * @param[in] N Number of columns in DST matrix
+ * @param[in]  K                                 Number of rows in LHS and RHS matrices, which is a multiple of MMUL_K0.
+ */
+__kernel void mat_mul_native_mmul_t_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)
+    // For explanations on how this kernel works, please refer to the NT/NT kernel. This kernel differs from it only in minor ways.
+
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates
+ uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ uint thread_x = thread_id % MMUL_N0;
+ uint thread_y = (thread_id / MMUL_N0);
+
+    // See the NT/NT kernel for explanations
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ uint lhs_x = dst_y;
+ uint lhs_y = thread_x;
+
+ // Starting RHS coordinates
+ uint rhs_x = dst_x;
+ uint rhs_y = thread_y;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+    // The MMUL extension accumulates the result in F32 for both F32 and F16
+ TILE(float, M0, N0, c_f32);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_f32[i].v = 0;
+ })
+
+ for(int k = 0; k < K; k += MMUL_K0)
+ {
+ TILE(DATA_TYPE, 1, M0, a);
+ TILE(DATA_TYPE, 1, N0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c_f32[m0].s[n0] = arm_matrix_multiply(a[0].s[m0], b[0].s[n0], c_f32[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;
+ }
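+    // Note on the pointer advances above: the transposed lhs is stored K x M and the rhs is stored
+    // K x N, so for both tensors stepping MMUL_K0 along K means stepping MMUL_K0 rows, hence the
+    // stride_y factors.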
+
+    // Threads "outside" the dst bound do not write, but they still have to "read" (participate in the cooperative arm_matrix_multiply calls). That is why this early exit must happen after arm_matrix_multiply
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if defined(HALF_PRECISION)
+ TILE(DATA_TYPE, M0, N0, c);
+
+ // Conversion required for the half precision
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = c_f32[m0].s[n0];
+ })
+ })
+#else // defined(HALF_PRECISION)
+#define c c_f32
+#endif // defined(HALF_PRECISION)
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(MAT_MUL_NATIVE_MMUL_T_NT)
+
+#if defined(MAT_MUL_NATIVE_MMUL_NT_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS non-transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
+ * @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] M Number of rows in LHS matrix
+ * @param[in]  N                                 Number of rows in RHS matrix (i.e. number of columns in DST matrix)
+ * @param[in]  K                                 Number of columns in LHS matrix and columns in RHS matrix, which is a multiple of MMUL_K0.
+ */
+__kernel void mat_mul_native_mmul_nt_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)
+    // For explanations on how this kernel works, please refer to the NT/NT kernel. This kernel differs from it only in minor ways.
+
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get block coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates within a block
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
+
+ // Starting destination coordinates
+ // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+ // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results
+ // Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ const uint lhs_x = thread_x;
+ const uint lhs_y = dst_y;
+
+ // Starting RHS coordinates
+ const uint rhs_x = thread_y;
+ const uint rhs_y = dst_x;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+    // The MMUL extension accumulates the result in F32 for both F32 and F16
+ TILE(float, M0, N0, c_f32);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_f32[i].v = 0;
+ })
+
+ for(int k = 0; k < K; k += MMUL_K0)
+ {
+ // A tile of M0xK0 but K0 must be set to 1
+ TILE(DATA_TYPE, M0, 1, a);
+ // A tile of N0xK0 but K0 must be set to 1
+ TILE(DATA_TYPE, N0, 1, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[n0].s[0], c_f32[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ }
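+    // Note on the pointer advances above: the lhs is stored M x K and the transposed rhs is stored
+    // N x K, so K is the byte-contiguous x dimension for both tensors, and each step moves
+    // MMUL_K0 elements, i.e. MMUL_K0 * sizeof(DATA_TYPE) bytes.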
+
+    // Threads "outside" the dst bound do not write, but they still have to "read" (participate in the cooperative arm_matrix_multiply calls). That is why this early exit must happen after arm_matrix_multiply
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if defined(HALF_PRECISION)
+ TILE(DATA_TYPE, M0, N0, c);
+
+ // Conversion required for the half precision
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = c_f32[m0].s[n0];
+ })
+ })
+#else // defined(HALF_PRECISION)
+#define c c_f32
+#endif // defined(HALF_PRECISION)
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(MAT_MUL_NATIVE_MMUL_NT_T)
+
+#if defined(MAT_MUL_NATIVE_MMUL_T_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
+ * @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_T_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] M Number of rows in LHS matrix
+ * @param[in] N Number of columns in RHS matrix
+ * @param[in]  K                                  Number of rows in LHS matrix and columns in RHS matrix, which must be a multiple of MMUL_K0.
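+ *
+ * A complete set of build options might look like the following (values are illustrative only,
+ * assuming F16 data with the 4x4x4 MMUL block from the examples above):
+ *   -DMAT_MUL_NATIVE_MMUL_T_T -DDATA_TYPE=half -DHALF_PRECISION -DM0=4 -DN0=4 -DK0=1
+ *   -DMMUL_M0=4 -DMMUL_N0=4 -DMMUL_K0=4 -DM0_LEFTOVER=0 -DN0_LEFTOVER=0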
+ */
+__kernel void mat_mul_native_mmul_t_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)
+    // For an explanation of how this kernel works, please refer to the NT/NT kernel; this kernel makes only minor modifications to it.
+
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get block coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates within a block
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
+
+ // Starting destination coordinates
+ // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+ // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results
+ // Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ const uint lhs_x = dst_y;
+ const uint lhs_y = thread_x;
+
+ // Starting RHS coordinates
+ const uint rhs_x = thread_y;
+ const uint rhs_y = dst_x;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+    // The MMUL extension accumulates the result in F32 for both F32 and F16
+ TILE(float, M0, N0, c_f32);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_f32[i].v = 0;
+ })
+
+ for(int k = 0; k < K; k += MMUL_K0)
+ {
+ // A tile of K0xM0 but K0 must be set to 1
+ TILE(DATA_TYPE, 1, M0, a);
+ // A tile of N0xK0 but K0 must be set to 1
+ TILE(DATA_TYPE, N0, 1, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c_f32[m0].s[n0] = arm_matrix_multiply(a[0].s[m0], b[n0].s[0], c_f32[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += MMUL_N0 * sizeof(DATA_TYPE);
+ }
+
+    // Threads "outside" of the dst bounds do not write, but they must still take part in arm_matrix_multiply()
+    // because it is a cooperative instruction; that is why this early exit can only happen after arm_matrix_multiply
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if defined(HALF_PRECISION)
+ TILE(DATA_TYPE, M0, N0, c);
+
+ // Conversion required for the half precision
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = c_f32[m0].s[n0];
+ })
+ })
+#else // defined(HALF_PRECISION)
+#define c c_f32
+#endif // defined(HALF_PRECISION)
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(MAT_MUL_NATIVE_MMUL_T_T)
diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
new file mode 100644
index 0000000000..7f81ac4549
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
@@ -0,0 +1,833 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#ifdef BIAS
+// This function performs in-place bias addition for integer data types when bias is enabled.
+// Note: The tile's dimensions used for the LHS and RHS matrices (M0, N0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(int, M0, N0, acc), uint x)
+{
+ TILE(int, 1, N0, bias_tile);
+
+ // below expands to use bias_ptr and bias_offset_first_element_in_bytes
+ T_LOAD(int, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, acc, bias_tile, acc);
+}
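+// For example (hypothetical tile sizes), with M0 = 4 and N0 = 4 the function loads a single
+// 1 x 4 row of S32 bias values starting at column x and broadcast-adds it to all 4 rows of
+// the accumulator tile.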
+#endif // defined(BIAS)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for the relu and bounded relu operations.
+ * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
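+ *
+ * A complete set of build options might look like the following (offset and requantization
+ * values are illustrative only), in addition to the activation options described above:
+ *   -DMAT_MUL_NATIVE_QUANTIZED_NT_NT -DDATA_TYPE=char -DM0=4 -DN0=4 -DK0=4 -DK=32
+ *   -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0 -DLHS_OFFSET=10 -DRHS_OFFSET=0 -DDST_OFFSET=-6
+ *   -DDST_MULTIPLIER=2091 -DDST_SHIFT=8 -DZERO_POINT=-6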
+ */
+__kernel void mat_mul_native_quantized_nt_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, acc);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
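+
+    // Together with the a_sum/b_sum corrections applied after the loops, the accumulators implement
+    //   sum_k (a_k - LHS_OFFSET) * (b_k - RHS_OFFSET)
+    //   = sum_k a_k * b_k - RHS_OFFSET * sum_k a_k - LHS_OFFSET * sum_k b_k + K * LHS_OFFSET * RHS_OFFSET
+    // The constant term is the initial value above; T_MMUL adds the raw dot products; the row sums
+    // of A (a_sum) and the column sums of B (b_sum) are subtracted once the loops are done.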
+
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from the lhs tensor
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+        // Load tile from the rhs tensor in a transposed fashion
+        // in order to use the T_MMUL macro in its NT/T configuration, because only this configuration
+        // can utilize the dot product instruction for Int8/UInt8 by
+        // directly multiplying the rows of the Lhs and Rhs tensors.
+ T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ a_sum[0].s[i] += (int)a[i].s[j];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ b_sum[0].s[j] += (int)b[j].s[i];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
+ }
+
+#if((K % K0) != 0)
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from the lhs tensor
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+ // Load tile from the rhs tensor in a transposed fashion.
+ // See the main loop for more explanation
+ T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, 1,
+ {
+ a_sum[0].s[i] += (int)a[i].s[j];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ b_sum[0].s[j] += (int)b[j].s[i];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
+ }
+#endif // ((K % K0) != 0)
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, accq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
+
+ T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);
+
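+    // Build an indirect row-index buffer for the store: in the leftover block in Y (y_cond), row
+    // indices beyond PARTIAL_STORE_M0 - 1 are clamped to the last valid row, so the store below
+    // never writes out of bounds (the clamped rows simply rewrite that last valid row).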
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for bounded activation functions.
+ * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0, N0, K0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_quantized_nt_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, acc);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ a_sum[0].s[i] += (int)a[i].s[j];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ b_sum[0].s[i] += (int)b[i].s[j];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+ }
+
+#if((K % K0) != 0)
+ // Leftover loop
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, 1,
+ {
+ a_sum[0].s[i] += (int)a[i].s[j];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, 1,
+ {
+ b_sum[0].s[i] += (int)b[i].s[j];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+ }
+#endif // ((K % K0) != 0)
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, accq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
+
+ T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for the relu and bounded relu operations.
+ * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_quantized_t_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, acc);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from the lhs/rhs tensors in a transposed fashion
+ // see mat_mul_native_quantized_nt_nt main loop for more explanation
+ T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ a_sum[0].s[j] += (int)a[j].s[i];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ b_sum[0].s[j] += (int)b[j].s[i];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
+ }
+
+#if((K % K0) != 0)
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from the lhs/rhs tensors in a transposed fashion
+ // see mat_mul_native_quantized_nt_nt main loop for more explanation
+ T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ a_sum[0].s[j] += (int)a[j].s[i];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ b_sum[0].s[j] += (int)b[j].s[i];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
+ }
+#endif // ((K % K0) != 0)
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, accq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
+
+ T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for the relu and bounded relu operations.
+ * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_quantized_t_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, acc);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from the lhs tensor in a transposed fashion
+ // see mat_mul_native_quantized_nt_nt main loop for more explanation
+ T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+ // Load tile from the rhs tensor
+ T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ a_sum[0].s[j] += (int)a[j].s[i];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ b_sum[0].s[i] += (int)b[i].s[j];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+ }
+
+#if((K % K0) != 0)
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from the lhs tensor in a transposed fashion
+ // see mat_mul_native_quantized_nt_nt main loop for more explanation
+ T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+ // Load tile from the rhs tensor
+ T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ a_sum[0].s[j] += (int)a[j].s[i];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, 1,
+ {
+ b_sum[0].s[i] += (int)b[i].s[j];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+ }
+#endif // ((K % K0) != 0)
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, accq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
+
+ T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl
new file mode 100644
index 0000000000..fdfb75d39c
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl
@@ -0,0 +1,832 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#ifdef BIAS
+// This function performs in-place bias addition for integer data types when bias is enabled.
+// Note: The tile's dimensions used for the LHS and RHS matrices (M0, N0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(int, M0, N0, acc), uint x)
+{
+ TILE(int, 1, N0, bias_tile);
+
+ // below expands to use bias_ptr and bias_offset_first_element_in_bytes
+ T_LOAD(int, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
+
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0) // MMUL block size for the output matrix
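+// For example, with the MMUL block sizes suggested in the kernel documentation below
+// (MMUL_M0 = 4, MMUL_N0 = 4), MMUL_BLOCK_SIZE is 16: sixteen threads cooperate on one MMUL block.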
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at
+ * compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER
+ * (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The dimensions M, N, K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=5, -DN=8, -DK=6).
+ * K must be a multiple of 16.
+ * @note MMUL block sizes must be passed at compile time using -DMMUL_K0, -DMMUL_M0, -DMMUL_N0
+ * (e.g. -DMMUL_K0=16, -DMMUL_M0=4, -DMMUL_N0=4)
+ * @note If there is bias -DBIAS option must be passed at compile time
+ * @note Quantization offsets of lhs, rhs and dst tensors must be passed at compile time using -DLHS_OFFSET,
+ * -DRHS_OFFSET, -DDST_OFFSET (e.g. -DLHS_OFFSET=10, -DRHS_OFFSET=0, -DDST_OFFSET=-6)
+ * @note Effective quantization multiplier and shift for the destination tensor must be passed at compile time using
+ *       -DDST_MULTIPLIER and -DDST_SHIFT (e.g. -DDST_MULTIPLIER=2091, -DDST_SHIFT=8)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 4
+ * @note For a general view of how the MMUL extension works, see mat_mul_mmul.cl
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
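+ *
+ * Putting the notes above together, an illustrative set of build options could be
+ * (offset and requantization values are placeholders taken from the examples above):
+ *   -DMAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT -DDATA_TYPE=uchar -DM0=2 -DN0=2 -DK0=4
+ *   -DM=8 -DN=8 -DK=32 -DM0_LEFTOVER=0 -DN0_LEFTOVER=0
+ *   -DMMUL_M0=4 -DMMUL_N0=4 -DMMUL_K0=16
+ *   -DLHS_OFFSET=10 -DRHS_OFFSET=0 -DDST_OFFSET=-6 -DDST_MULTIPLIER=2091 -DDST_SHIFT=8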
+ */
+__kernel void mat_mul_native_quantized_mmul_nt_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+    // The explanation of how this kernel works is very similar to the explanation given in
+    // mat_mul_mmul.cl. The MMUL logic and terminology are the same. The only difference is that
+    // in the quantized multiplication the MMUL block sizes are (4 x 16) for the Lhs matrix and
+    // (16 x 4) for the Rhs matrix, resulting in a (4 x 4) MMUL block size for the destination.
+ //
+    // Figures 1, 2 and 3 in the previous explanation work the same. Since the Lhs and Rhs
+    // MMUL block sizes are different in the quantized extension, the thread access pattern is
+    // slightly different. We can redraw Figure 4 (Thread access pattern) as follows:
+ //
+ // (Modified Figure 4 from mat_mul_mmul.cl)
+ // Thread Access Layouts in LHS & RHS matrices
+ //
+ // LHS matrix
+ // 4 times 4 times 4 times 4 times
+ // _______________________________________________________________
+ // |T0_|T0_|T0_|T0_|T1_|T1_|T1_|T1_|T2_|T2_|T2_|T2_|T3_|T3_|T3_|T3_|
+ // |T0_| ... |
+ // M0 | . . |
+ // Times | . . |
+ // | . . |
+ // |T0_|T0_|T0_|T0_|T1_|T1_|T1_|T1_|T2_|T2_|T2_|T2_|T3_|T3_|T3_|T3_|
+ // |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_|
+ // |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_|
+ // M0 | . . |
+ // Times | . . |
+ // | . . |
+ // |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_|
+ // |T8_|T8_|T8_|T8_|T9_|T9_|T9_|T9_|T10|T10|T10|T10|T11|T11|T11|T11|
+ // M0 | . |
+ // Times | . |
+ // | . |
+ // |T8_|T8_|T8_|T8_|T9_|T9_|T9_|T9_|T10|T10|T10|T10|T11|T11|T11|T11|
+ // M0 | . |
+ // Times | . |
+ // | . |
+ // |T12|T12|T12|T12|T13|T13|T13|T13|T14|T14|T14|T14|T15|T15|T15|T15|
+ //
+ //
+ // RHS Matrix
+ //
+ // __________N0 times______N0 times____________________N0 times_______
+ // |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|
+ // 4 times |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|
+ // |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|
+ // |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|
+ // |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|
+ // 4 times |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|
+ // |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|
+ // X |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|
+ // |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|
+ // |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|
+ // 4 times |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|
+ // |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|
+ // |__T12_| ... |__T12_|__T13_| ... |__T13_| ... |__T15_| ... |__T15_|
+ // 4 times |__T12_| ... |__T12_|__T13_| ... |__T13_| ... |__T15_| ... |__T15_|
+ // |__T12_| ... |__T12_|__T13_| ... |__T13_| ... |__T15_| ... |__T15_|
+ // |__T12_|_____|__T12_|__T13_|______|__T13_|_____|__T15_|_____|__T15_|
+ //
+ //
+    // The logic behind this thread access pattern is already described in the explanation
+    // in mat_mul_mmul.cl. The only change is that the thread accesses are extended to 4 elements
+    // from 1, in the rightward direction in Lhs and in the downward direction in Rhs, because they
+    // now operate on four char/uchar values (again 32 bits of data) instead of one 32-bit float.
+ //
+    // The mathematical view of the matrix multiplication explained in Figure 5 also holds here,
+    // except that the inner dimension is 16 instead of 4; the vector notation does not change, i.e. it is as follows:
+ //
+ // Settings:
+    //  - an 8 x 16 LHS section
+    //  - a 16 x 8 RHS section
+ // - Each vector variable ai, bj represent a 16x1 vector
+ // - ^T (superscript T) denotes transpose
+ // - M0 = N0 = 2
+ // - MMUL_N0 = MMUL_M0 = 4, MMUL_K0 = 16
+ //
+ //
+ // (Modified Figure 5)
+ // Mathematical view of the Matrix Multiplication
+ //
+ // LHS RHS DST
+ // [ a1^T ] [ b1 b2 b3 b4 b5 b6 b7 ] [ a1^Tb1 a1^Tb2 a1^Tb3 ... a1^Tb7 ]
+ // [ a2^T ] 16 x 8 [ a2^Tb1 a2^Tb2 a2^Tb3 ... a2^Tb7 ]
+ // [ a3^T ] [ ]
+ // [ a4^T ] = [ . . ]
+ // [ a5^T ] X [ . . ]
+ // [ a6^T ] [ . . ]
+ // [ a7^T ] [ ]
+ // [ a8^T ] [ a7^Tb1 a7^Tb2 a7^Tb3 ... a7^Tb7 ]
+ // 8 x 16 8 x 8
+ //
+ //
+    // For the first iteration, i.e. (m0, n0) = (0, 0), arm_matrix_multiply() would multiply the following matrices:
+ //
+ // [ a1^T ] [ b1 b3 b5 b7 ] [ a1^Tb1 a1^Tb3 a1^Tb5 a1^Tb7 ]
+    //   [ a3^T ]  x     4 x 4     =  [ a3^Tb1  a3^Tb3  a3^Tb5  a3^Tb7 ]
+    //   [ a5^T ]                     [ a5^Tb1  a5^Tb3  a5^Tb5  a5^Tb7 ]
+ // [ a7^T ] [ a7^Tb1 a7^Tb3 a7^Tb5 a7^Tb7 ]
+ // 4 x 4 4 x 4
+ // The elements calculated in the 4x4 output block are the "interleaved" elements in the DST above.
+    // When we repeat this for each combination of (m0, n0), every element of the DST matrix "section" is filled.
+ //
+ // Please refer to mat_mul_mmul.cl for more details.
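+    //
+    // On top of the MMUL view above, this kernel performs the zero-point correction required for
+    // asymmetric quantization:
+    //   sum_k (a_k - LHS_OFFSET) * (b_k - RHS_OFFSET)
+    //   = sum_k a_k * b_k - RHS_OFFSET * sum_k a_k - LHS_OFFSET * sum_k b_k + K * LHS_OFFSET * RHS_OFFSET
+    // The row/column sums themselves are computed with arm_matrix_multiply() against a vector of
+    // ones (vec_1 below), so the correction reuses the same cooperative MMUL path.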
+
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates within an mmul block
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
+
+ // Calculate dst coordinates
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ const uint lhs_x = K0 * thread_x;
+ const uint lhs_y = dst_y;
+
+ // Starting RHS coordinates
+ const uint rhs_x = dst_x;
+ const uint rhs_y = K0 * thread_y;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, c);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ // Calculate row and column sums
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);
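+    // Note: the four-element literal above assumes K0 == 4, the only K0 supported by this
+    // quantized MMUL kernel (see the block configuration notes in the documentation above).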
+
+ for(int k = 0; k < lhs_w; k += MMUL_K0)
+ {
+        // A tile of M0 x K0 elements (K0 must be 4 here)
+ TILE(DATA_TYPE, M0, K0, a);
+        // A tile of K0 x N0 elements (K0 must be 4 here)
+ TILE(DATA_TYPE, K0, N0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ c[m0].s[n0] = arm_matrix_multiply(a[m0].v, vec_b, c[m0].s[n0]);
+ })
+
+#if LHS_OFFSET != 0
+ // Column Sum of B: Calculate the sum of columns by multiplying B
+ // with a matrix of 1's from Left
+ b_sum[0].s[n0] = arm_matrix_multiply(vec_1, vec_b, b_sum[0].s[n0]);
+#endif // LHS_OFFSET != 0
+ })
+
+#if RHS_OFFSET != 0
+ // Row Sum of A: Calculate the sum of rows by multiplying A with
+ // a matrix of 1's from Right
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ a_sum[0].s[m0] = arm_matrix_multiply(a[m0].v, vec_1, a_sum[0].s[m0]);
+ })
+#endif // RHS_OFFSET != 0
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;
+ }
+
+    // Do not write if the coordinates are out of bounds,
+    // but the reads must still happen, as arm_matrix_multiply() expects a fixed number of cooperative calls
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if RHS_OFFSET != 0 || LHS_OFFSET != 0
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ const int A = ((int)RHS_OFFSET) * a_sum[0].s[i];
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, cq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
+ *
+ * Supported block configurations:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 4
+ *
+ * Similar to mat_mul_native_quantized_mmul_nt_nt()
+ */
+__kernel void mat_mul_native_quantized_mmul_nt_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates within an mmul block
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
+
+ // Calculate dst coordinates
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ const uint lhs_x = K0 * thread_x;
+ const uint lhs_y = dst_y;
+
+ // Starting RHS coordinates
+ const uint rhs_x = K0 * thread_y;
+ const uint rhs_y = dst_x;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, c);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ // Calculate row and column sums
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);
+
+ for(int k = 0; k < lhs_w; k += MMUL_K0)
+ {
+ // A tile of M0xK0 elements from the LHS (K0 must be set to 4)
+ TILE(DATA_TYPE, M0, K0, a);
+ // A tile of N0xK0 elements from the transposed RHS (K0 must be set to 4)
+ TILE(DATA_TYPE, N0, K0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = arm_matrix_multiply(a[m0].v, b[n0].v, c[m0].s[n0]);
+ })
+ })
+
+#if RHS_OFFSET != 0
+ // Row Sum of A: Calculate the sum of rows by multiplying A with
+ // a matrix of 1's from Right
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ a_sum[0].s[m0] = arm_matrix_multiply(a[m0].v, vec_1, a_sum[0].s[m0]);
+ })
+#endif // RHS_OFFSET != 0
+
+#if LHS_OFFSET != 0
+ // Column Sum of B: Calculate the sum of columns by multiplying B
+ // with a matrix of 1's from Left
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ b_sum[0].s[n0] = arm_matrix_multiply(vec_1, b[n0].v, b_sum[0].s[n0]);
+ })
+#endif // LHS_OFFSET != 0
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ }
+
+ // Do not write if the coordinates are out of bounds
+ // The reads must still happen, though, as arm_matrix_multiply() expects the same number of calls from every thread
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if RHS_OFFSET != 0 || LHS_OFFSET != 0
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ const int A = ((int)RHS_OFFSET) * a_sum[0].s[i];
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, cq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_T)
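+
+// For reference, a plausible build-options set for the kernel above (the values are
+// illustrative only; the real options are assembled by the host-side kernel
+// configuration code):
+//
+//   -DMAT_MUL_NATIVE_QUANTIZED_MMUL_NT_T -DDATA_TYPE=char
+//   -DM0=4 -DN0=4 -DK0=4 -DMMUL_M0=4 -DMMUL_N0=4 -DMMUL_K0=16 -DMMUL_BLOCK_SIZE=16
+//   -DM=64 -DN=64 -DK=128 -DM0_LEFTOVER=0 -DN0_LEFTOVER=0
+//   -DLHS_OFFSET=3 -DRHS_OFFSET=-5 -DDST_OFFSET=0 -DDST_SHIFT=-8 -DDST_MULTIPLIER=1073741824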
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed
+ *
+ * Supported block configurations:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 4
+ *
+ * Similar to mat_mul_native_quantized_mmul_nt_nt()
+ */
+__kernel void mat_mul_native_quantized_mmul_t_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+ // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates within an mmul block
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
+
+ // Calculate dst coordinates
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ const uint lhs_x = dst_y;
+ const uint lhs_y = K0 * thread_x;
+
+ // Starting RHS coordinates
+ const uint rhs_x = dst_x;
+ const uint rhs_y = K0 * thread_y;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, c);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ // Calculate row and column sums
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);
+
+ for(int k = 0; k < lhs_h; k += MMUL_K0)
+ {
+ TILE(DATA_TYPE, K0, M0, a);
+ TILE(DATA_TYPE, K0, N0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_a = (VEC_DATA_TYPE(DATA_TYPE, K0))(a[0].s[m0], a[1].s[m0], a[2].s[m0], a[3].s[m0]);
+
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);
+
+ c[m0].s[n0] = arm_matrix_multiply(vec_a, vec_b, c[m0].s[n0]);
+ })
+
+#if RHS_OFFSET != 0
+ // Row Sum of A: Calculate the sum of rows by multiplying A with
+ // a matrix of 1's from Right
+ a_sum[0].s[m0] = arm_matrix_multiply(vec_a, vec_1, a_sum[0].s[m0]);
+#endif // RHS_OFFSET != 0
+ })
+
+#if LHS_OFFSET != 0
+ // Column Sum of B: Calculate the sum of columns by multiplying B
+ // with a matrix of 1's from Left
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);
+
+ b_sum[0].s[n0] = arm_matrix_multiply(vec_1, vec_b, b_sum[0].s[n0]);
+ })
+#endif // LHS_OFFSET != 0
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;
+ }
+
+ // Do not write if the coordinates are out of bounds
+ // The reads must still happen, though, as arm_matrix_multiply() expects the same number of calls from every thread
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if RHS_OFFSET != 0 || LHS_OFFSET != 0
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ const int A = ((int)RHS_OFFSET) * a_sum[0].s[i];
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, cq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_NT)
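+
+// Dispatch sketch (illustrative): one MMUL block is MMUL_BLOCK_SIZE = MMUL_M0 * MMUL_N0
+// cooperating threads, so the global work size is laid out roughly as
+//
+//   gws.x = ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE   // == (N / N0) * MMUL_M0
+//   gws.y = (M / M0) / MMUL_M0
+//   gws.z = number of batches
+//
+// e.g. N = 64, M = 64, N0 = M0 = 4, MMUL_M0 = MMUL_N0 = 4, MMUL_BLOCK_SIZE = 16 gives
+// gws = (64, 4, batches), matching the ranges quoted in the comments on x0 and y0.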
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed
+ *
+ * Supported block configurations:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 4
+ *
+ * Similar to mat_mul_native_quantized_mmul_nt_nt()
+ */
+__kernel void mat_mul_native_quantized_mmul_t_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+ // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates within an mmul block
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
+
+ // Calculate dst coordinates
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ const uint lhs_x = dst_y;
+ const uint lhs_y = K0 * thread_x;
+
+ // Starting RHS coordinates
+ const uint rhs_x = K0 * thread_y;
+ const uint rhs_y = dst_x;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, c);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ // Calculate row and column sums
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);
+
+ for(int k = 0; k < lhs_h; k += MMUL_K0)
+ {
+ TILE(DATA_TYPE, K0, M0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_a = (VEC_DATA_TYPE(DATA_TYPE, K0))(a[0].s[m0], a[1].s[m0], a[2].s[m0], a[3].s[m0]);
+
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = arm_matrix_multiply(vec_a, b[n0].v, c[m0].s[n0]);
+ })
+#if RHS_OFFSET != 0
+ // Row Sum of A: Calculate the sum of rows by multiplying A with
+ // a matrix of 1's from Right
+ a_sum[0].s[m0] = arm_matrix_multiply(vec_a, vec_1, a_sum[0].s[m0]);
+#endif // RHS_OFFSET != 0
+ })
+
+#if LHS_OFFSET != 0
+ // Column Sum of B: Calculate the sum of columns by multiplying B
+ // with a matrix of 1's from Left
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ b_sum[0].s[n0] = arm_matrix_multiply(vec_1, b[n0].v, b_sum[0].s[n0]);
+ })
+#endif // LHS_OFFSET != 0
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ }
+
+ // Do not write if the coordinates are out of bounds
+ // The reads must still happen, though, as arm_matrix_multiply() expects the same number of calls from every thread
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if RHS_OFFSET != 0 || LHS_OFFSET != 0
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ const int A = ((int)RHS_OFFSET) * a_sum[0].s[i];
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, cq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_T)
diff --git a/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl
new file mode 100644
index 0000000000..22abf64874
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2019-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(WIDTH)
+/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Width of the input tensor should be passed using the -DWIDTH compile flag, e.g. -DWIDTH=16
+ * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
+ * @attention For F16 input, -DMEANSTDNORM_HALF should be passed so that the sum of squares is accumulated in float
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void mean_stddev_normalization(
+ IMAGE_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ IMAGE_DECLARATION(output)
+#endif /* IN_PLACE */
+)
+{
+ // Get pixels pointer
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+#ifdef IN_PLACE
+ Image out = in;
+#else /* IN_PLACE */
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+#endif /* IN_PLACE */
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ sum = 0.f;
+#ifdef MEANSTDNORM_HALF
+ VEC_DATA_TYPE(float, VEC_SIZE)
+#else /* MEANSTDNORM_HALF */
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#endif /* MEANSTDNORM_HALF */
+ sum_sq = 0.f;
+ // Calculate partial sum
+ int i = 0;
+ for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
+ {
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&in, i, 0));
+
+ sum += data;
+#ifdef MEANSTDNORM_HALF
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ dsq = CONVERT(data * data, VEC_DATA_TYPE(float, VEC_SIZE));
+ sum_sq += dsq;
+#else /* MEANSTDNORM_HALF */
+ sum_sq += data * data;
+#endif /* MEANSTDNORM_HALF */
+ }
+ // Perform reduction
+ sum = SUM_REDUCE(sum, VEC_SIZE);
+ sum_sq = SUM_REDUCE(sum_sq, VEC_SIZE);
+
+#if VEC_SIZE > 1
+#define sum sum.s0
+#define sum_sq sum_sq.s0
+#endif // VEC_SIZE > 1
+
+ // Left-overs loop
+ for(; i < WIDTH; ++i)
+ {
+ DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0));
+
+ sum += data;
+ sum_sq += data * data;
+ }
+
+ DATA_TYPE mean = sum / (DATA_TYPE)WIDTH;
+ DATA_TYPE var = (sum_sq / WIDTH) - (mean * mean);
+ DATA_TYPE stddev_inv = 1.f / sqrt(var + EPSILON);
+
+ i = 0;
+ for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&in, i, 0));
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res = (data - mean) * stddev_inv;
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global DATA_TYPE *)offset(&out, i, 0));
+ }
+ for(; i < WIDTH; ++i)
+ {
+ DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0));
+
+ *((__global DATA_TYPE *)offset(&out, i, 0)) = (data - mean) * stddev_inv;
+ }
+}
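+
+// Worked example (illustrative): for a row {1, 2, 3, 4}, sum = 10 and sum_sq = 30,
+// so mean = 2.5, var = 30 / 4 - 2.5 * 2.5 = 1.25, and each element is mapped to
+// (x - mean) / sqrt(1.25 + EPSILON). The MEANSTDNORM_HALF path keeps sum_sq in float
+// because half saturates at 65504, so x * x overflows once |x| reaches about 256.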
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(WIDTH) */
diff --git a/src/core/CL/cl_kernels/common/memset.cl b/src/core/CL/cl_kernels/common/memset.cl
new file mode 100644
index 0000000000..9ff25f3af4
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/memset.cl
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants
+
+/** Fill the tensor's planes with a given value
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DCONSTANT_VALUE = The value used to fill the tensor's planes
+ * -# -DVEC_SIZE = Vector size
+ * -# -DLAST_ACCESSED_X = The element that is on the X border (threads trying to set this might need to step back a bit)
+ *
+ * @param[in] tensor_ptr Pointer to the source image. Data types supported: All.
+ * @param[in] tensor_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] tensor_step_x tensor_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] tensor_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] tensor_step_y tensor_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the source image
+ */
+__kernel void memset(
+ TENSOR3D_DECLARATION(tensor))
+{
+ Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor);
+
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = (DATA_TYPE)(CONSTANT_VALUE);
+
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)tensor.ptr);
+#else // !defined(VEC_SIZE)
+ *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE);
+#endif // defined(VEC_SIZE)
+}
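+
+// Example build options (illustrative): filling a float tensor of width 10 with zeros,
+// four elements per work-item:
+//
+//   -DDATA_TYPE=float -DCONSTANT_VALUE=0.0f -DVEC_SIZE=4 -DLAST_ACCESSED_X=6
+//
+// LAST_ACCESSED_X is WIDTH - VEC_SIZE here: the work-item that would write elements
+// 8..11 steps back to 6..9 so that the final vector store stays in bounds.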
+
+#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/common/minmax_layer.cl b/src/core/CL/cl_kernels/common/minmax_layer.cl
new file mode 100644
index 0000000000..49356451df
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/minmax_layer.cl
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(WIDTH) && defined(HEIGHT) && defined(DEPTH)
+/** This function identifies the min and maximum value of an input 3D tensor.
+ *
+ * @note The width, height and depth of the input tensor must be provided at compile time using -DWIDTH, -DHEIGHT and -DDEPTH (e.g. -DWIDTH=320, -DHEIGHT=240, -DDEPTH=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] dst_ptr Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32.
+ * @param[in] dst_stride_x Stride of the min/max vector in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ */
+__kernel void minmax_layer(
+ TENSOR3D_DECLARATION(src),
+ VECTOR_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Vector dst = CONVERT_TO_VECTOR_STRUCT(dst);
+
+ float4 min_value = (float4)FLT_MAX;
+ float4 max_value = (float4) - FLT_MAX;
+ float2 min_max_value = (float2)(FLT_MAX, -FLT_MAX);
+
+ for(int z = 0; z < DEPTH; ++z)
+ {
+ for(int y = 0; y < HEIGHT; ++y)
+ {
+ int x = 0;
+ __global float *src_addr = (__global float *)(src.ptr + y * src_stride_y + z * src_stride_z);
+
+ for(; x <= (int)(WIDTH - 8); x += 8)
+ {
+ float8 value = vload8(0, src_addr + x);
+
+ min_value = select(value.s0123, min_value, min_value < value.s0123);
+ min_value = select(value.s4567, min_value, min_value < value.s4567);
+
+ max_value = select(value.s0123, max_value, max_value > value.s0123);
+ max_value = select(value.s4567, max_value, max_value > value.s4567);
+ }
+
+ for(; x < WIDTH; ++x)
+ {
+ float value = *(src_addr + x);
+
+ min_max_value.s0 = min(min_max_value.s0, value);
+ min_max_value.s1 = max(min_max_value.s1, value);
+ }
+ }
+ }
+
+ // Perform min/max reduction
+ min_value.s01 = min(min_value.s01, min_value.s23);
+ min_value.s0 = min(min_value.s0, min_value.s1);
+ max_value.s01 = max(max_value.s01, max_value.s23);
+ max_value.s0 = max(max_value.s0, max_value.s1);
+
+ min_max_value.s0 = min(min_max_value.s0, min_value.s0);
+ min_max_value.s1 = max(min_max_value.s1, max_value.s0);
+
+ if(min_max_value.s0 == min_max_value.s1)
+ {
+ min_max_value.s0 = 0.0f;
+ min_max_value.s1 = 1.0f;
+ }
+
+ // Store min and max
+ vstore2(min_max_value, 0, (__global float *)dst.ptr);
+}
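+
+// If every element has the same value the computed range would be empty, so the
+// kernel falls back to [0, 1]; presumably this keeps a downstream quantization
+// step from dividing by a zero range.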
+#endif // defined(WIDTH) && defined(HEIGHT) && defined(DEPTH) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/nonmax.cl b/src/core/CL/cl_kernels/common/nonmax.cl
new file mode 100644
index 0000000000..702e635a89
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/nonmax.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function performs Non maxima suppression over a 3x3 window on a given image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8/F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_max_suppression(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ vc = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+ if(all(vc == (DATA_TYPE)0))
+ {
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))0, 0, (__global DATA_TYPE *)dst.ptr);
+
+ return;
+ }
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, -1));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out = select((DATA_TYPE)0, vc, (vc >= nc.s01234567) && (vc >= nc.s12345678) && (vc >= nc.s23456789));
+
+ nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, 0));
+ out = select((DATA_TYPE)0, out, (vc >= nc.s01234567) && (vc > nc.s23456789));
+
+ nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, +1));
+ out = select((DATA_TYPE)0, out, (vc > nc.s01234567) && (vc > nc.s12345678) && (vc > nc.s23456789));
+
+ vstore8(out, 0, (__global DATA_TYPE *)dst.ptr);
+}
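+
+// The element type must be supplied at compile time, e.g. -DDATA_TYPE=float (or uchar).
+// Note the asymmetric comparisons above: >= toward the top/left neighbours but strictly
+// > toward the bottom/right ones, so ties between equal neighbouring responses are
+// broken toward one side instead of keeping (or suppressing) both.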
diff --git a/src/core/CL/cl_kernels/common/pad_layer.cl b/src/core/CL/cl_kernels/common/pad_layer.cl
new file mode 100644
index 0000000000..5ae4ec884d
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/pad_layer.cl
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) && defined(PAD_X_BEFORE_REMAINDER) && defined(VEC_SIZE_LEFTOVER_WRITE)
+
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_SELECT SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define OFFSETS VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VEC_SIZE)
+#define SCALAR_COND(x) CONVERT((VEC_SELECT)x == (VEC_SELECT)1, VEC_SELECT)
+
+#if defined(CONST_VAL) && defined(VEC_SIZE_LEFTOVER_READ)
+/** Perform a pad operation when PaddingMode is CONSTANT
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
+ * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5
+ * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224
+ * @note In case pad left is more than the vector size, the number of threads to skip along the X axis must be passed using the
+ * -DTHREADS_TO_SKIP_BEFORE compile flag, e.g. -DTHREADS_TO_SKIP_BEFORE=1. This is defined as (PAD_X_BEFORE / VEC_SIZE)
+ * @note In case pad left is more than the vector size, the thread from which to skip along the X axis for pad right must be passed using the
+ * -DTHREADS_TO_SKIP_AFTER compile flag, e.g. -DTHREADS_TO_SKIP_AFTER=1. This is defined as ((SRC_WIDTH + PAD_X_BEFORE) / VEC_SIZE)
+ * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time:
+ * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
+ * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
+ * @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time:
+ * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3)
+ * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
+ * @note If pad also needs to be added to the batch of the tensor, the following compile flags must be passed at compile time:
+ * -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g. -DPAD_W_BEFORE=3)
+ * -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: All
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] batch (Optional) Batch index if 4D pad must be applied
+ */
+__kernel void pad_layer_constant(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst)
+#if defined(PAD_W_BEFORE)
+ ,
+ uint batch
+#endif // defined(PAD_W_BEFORE)
+ )
+{
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ // If true, write only padding values; no reads performed
+ uint cond = 0;
+#if defined(THREADS_TO_SKIP_BEFORE)
+ cond |= x < THREADS_TO_SKIP_BEFORE || x > THREADS_TO_SKIP_AFTER;
+#endif // defined(THREADS_TO_SKIP_BEFORE)
+#if defined(PAD_Y_BEFORE)
+ cond |= y < PAD_Y_BEFORE || y >= (SRC_HEIGHT + PAD_Y_BEFORE);
+#endif // defined(PAD_Y_BEFORE)
+#if defined(PAD_Z_BEFORE)
+ cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE);
+#endif // defined(PAD_Z_BEFORE)
+#if defined(PAD_W_BEFORE)
+ cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE);
+#endif // defined(PAD_W_BEFORE)
+
+ if(cond)
+ {
+ VEC_TYPE const_vals0 = (VEC_TYPE)CONST_VAL;
+ STORE_VECTOR_SELECT(const_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1));
+ }
+ else
+ {
+ // Calculate input's coordinates based on output's
+ int w = 0;
+#if defined(THREADS_TO_SKIP_BEFORE)
+ x -= THREADS_TO_SKIP_BEFORE;
+#endif // defined(THREADS_TO_SKIP_BEFORE)
+#if defined(PAD_Y_BEFORE)
+ y -= PAD_Y_BEFORE;
+#endif // defined(PAD_Y_BEFORE)
+#if defined(PAD_Z_BEFORE)
+ z -= PAD_Z_BEFORE;
+#endif // defined(PAD_Z_BEFORE)
+#if defined(PAD_W_BEFORE)
+ w -= PAD_W_BEFORE * SRC_DEPTH;
+#endif // defined(PAD_W_BEFORE)
+ x *= VEC_SIZE;
+ x -= PAD_X_BEFORE_REMAINDER;
+
+ // Check for out of bound reads and clamp X coordinate
+ uint cond_left = x < 0;
+ uint cond_right = (x + VEC_SIZE) > SRC_WIDTH;
+ x = clamp(x, 0, (SRC_WIDTH - VEC_SIZE));
+
+ // Calculate input's address
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * src_stride_x + y * src_stride_y + z * src_stride_z + w * (int)src_stride_z;
+
+ // Read values and rotate them properly if they would have been across paddings
+ VEC_TYPE src_vals0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+ src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, PAD_X_BEFORE_REMAINDER), SCALAR_COND(cond_left));
+ src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, VEC_SIZE_LEFTOVER_READ), SCALAR_COND(cond_right));
+
+ // Check what values would be padding and replace them with the constant value
+ VEC_INT xs_out = (VEC_INT)(get_global_id(0) * VEC_SIZE) + VEC_OFFS(int, VEC_SIZE);
+ VEC_INT conds = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE);
+ src_vals0 = select(src_vals0, (VEC_TYPE)CONST_VAL, CONVERT(conds, VEC_SELECT));
+
+ // Store values in bounds
+ STORE_VECTOR_SELECT(src_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1));
+ }
+}
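+
+// Worked example (illustrative): with -DSRC_WIDTH=224 -DPAD_X_BEFORE=5 -DVEC_SIZE=4,
+// the documentation above gives THREADS_TO_SKIP_BEFORE = PAD_X_BEFORE / VEC_SIZE = 1
+// and THREADS_TO_SKIP_AFTER = (SRC_WIDTH + PAD_X_BEFORE) / VEC_SIZE = 57, so work-items
+// 0 and 58 onwards take the write-only padding branch and never read the input.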
+#endif // defined(CONST_VAL) && defined(VEC_SIZE_LEFTOVER_READ)
+
+#if defined(IS_REFLECT) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
+
+#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n)
+#define SYMM_REFL_LEFT(x, n0, n1) select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0)
+#define SYMM_REFL_RIGHT(x, n0, n1) select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0)
+
+/** Perform a pad operation when PaddingMode is SYMMETRIC
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5
+ * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224
+ * @note Number of values to the left when operating across left padding must be passed using the -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5
+ * @note Number of values to the left when operating across right padding must be passed using the -DPAD_X_AFTER_REMAINDER compile flag, e.g. -DPAD_X_AFTER_REMAINDER=6
+ * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g. -DPAD_X_BEFORE_REMAINDER_REFL=6
+ * @note To rearrange the vectors properly, (PAD_X_AFTER_REMAINDER - 1) must be passed using the -DPAD_X_AFTER_REMAINDER_REFL compile flag, e.g. -DPAD_X_AFTER_REMAINDER_REFL=5
+ * @note The starting point from which to read backward once past the X padding must be passed using the -DAFTER_PAD_FACT_X compile flag, e.g. -DAFTER_PAD_FACT_X=253
+ * @note If padding mode is REFLECT, the -DIS_REFLECT compile flag must be set to 1, else it must be set to 0
+ * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time:
+ * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
+ * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
+ * @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time:
+ * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3)
+ * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
+ * @note If the starting point to read backward from is less than the output's last element accessed in the X, the following compile flags must be passed at compile time to avoid negative offsets:
+ * -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: All
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ // Get current thread position
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Define conditions based on the thread X position w.r.t. pad left and right
+ const int x_out_first = x * VEC_SIZE;
+ const int x_out_last = x_out_first + VEC_SIZE;
+ const int is_before_pad_left = (x_out_last <= PAD_X_BEFORE);
+ const int is_across_pad_left = (x_out_first < PAD_X_BEFORE) && (x_out_last > PAD_X_BEFORE);
+ const int is_inside_input = (x_out_first >= PAD_X_BEFORE) && (x_out_last <= (SRC_WIDTH + PAD_X_BEFORE));
+ const int is_across_pad_right = (x_out_first < (SRC_WIDTH + PAD_X_BEFORE)) && (x_out_last > (SRC_WIDTH + PAD_X_BEFORE));
+ const int is_after_pad_right = (x_out_first >= (SRC_WIDTH + PAD_X_BEFORE));
+
+ // Calculate base pointers
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes;
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Calculate input tensor's offset based on the defined conditions
+ int x_offset = 0;
+ x_offset = select(x_offset, PAD_X_BEFORE - x_out_last + IS_REFLECT, is_before_pad_left);
+ x_offset = select(x_offset, x_out_first - PAD_X_BEFORE, is_inside_input);
+ x_offset = select(x_offset, SRC_WIDTH - VEC_SIZE, is_across_pad_right);
+ x_offset = select(x_offset, AFTER_PAD_FACT_X - x_out_last, is_after_pad_right);
+
+#if defined(AFTER_PAD_REM)
+ int neg_offs = x_offset < 0;
+ x_offset = max(x_offset, 0);
+#endif // defined(AFTER_PAD_REM)
+
+ // Load input values from the computed offset
+ int y_in = y;
+ int z_in = z;
+#if defined(PAD_Y_BEFORE)
+ y_in = select(y - PAD_Y_BEFORE, PAD_Y_BEFORE - y + IS_REFLECT - 1, y < PAD_Y_BEFORE);
+ y_in = select(y_in, 2 * SRC_HEIGHT + PAD_Y_BEFORE - y - IS_REFLECT - 1, y >= (SRC_HEIGHT + PAD_Y_BEFORE));
+#endif // defined(PAD_Y_BEFORE)
+#if defined(PAD_Z_BEFORE)
+ z_in = select(z - PAD_Z_BEFORE, PAD_Z_BEFORE - z + IS_REFLECT - 1, z < PAD_Z_BEFORE);
+ z_in = select(z_in, 2 * SRC_DEPTH + PAD_Z_BEFORE - z - IS_REFLECT - 1, z >= (SRC_DEPTH + PAD_Z_BEFORE));
+#endif // defined(PAD_Z_BEFORE)
+
+ src_addr += x_offset * src_stride_x + y_in * src_step_y + z_in * src_step_z;
+
+#if SRC_WIDTH == 1
+ VSTORE(VEC_SIZE)
+ ((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr);
+#else // SRC_WIDTH == 1
+
+ VEC_TYPE src_vals0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+ // Choose rearrangement policy based on the defined conditions
+ src_vals0 = select(src_vals0, SYMM_REFL_LEFT(src_vals0, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL), SCALAR_COND(is_across_pad_left));
+ src_vals0 = select(src_vals0, SYMM_REFL_RIGHT(src_vals0, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL), SCALAR_COND(is_across_pad_right));
+ src_vals0 = select(src_vals0, REVERSE(src_vals0, VEC_SIZE), SCALAR_COND((is_before_pad_left || is_after_pad_right)));
+#if defined(AFTER_PAD_REM)
+ src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs));
+#endif // defined(AFTER_PAD_REM)
+
+ // Store values in bounds
+ STORE_VECTOR_SELECT(src_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1));
+#endif // SRC_WIDTH == 1
+}
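+
+// Illustration of the two modes for a row {a, b, c, d} padded by 2 on the left:
+//   SYMMETRIC (IS_REFLECT=0): b a | a b c d   (mirrored including the edge element)
+//   REFLECT   (IS_REFLECT=1): c b | a b c d   (mirrored about the edge element)
+// The "+ IS_REFLECT" terms in the index arithmetic above shift the mirror axis by
+// one element to switch between the two behaviours.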
+#endif // defined(IS_REFLECT) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) && defined(PAD_X_BEFORE_REMAINDER) && defined(VEC_SIZE_LEFTOVER_WRITE)
diff --git a/src/core/CL/cl_kernels/common/permute.cl b/src/core/CL/cl_kernels/common/permute.cl
new file mode 100644
index 0000000000..1a97ca7495
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/permute.cl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
+/** Perform a permute operation on an input tensor of shape DCHW.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention Permutation vector is passed as a preprocessor argument using -DP1, -DP2, -DP3 and -DP4, e.g. -DP1=2, -DP2=1, -DP3=0 and -DP4=3.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: All
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void permute(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
+
+ int out_index[4] = { 0 };
+ int in_index[4] = { 0 };
+
+ in_index[0] = get_global_id(0); // W
+ in_index[1] = get_global_id(1); // H
+ in_index[2] = get_global_id(2) % DEPTH_IN; // C
+ in_index[3] = get_global_id(2) / DEPTH_IN; // B
+
+ out_index[0] = in_index[P1];
+ out_index[1] = in_index[P2];
+ out_index[2] = in_index[P3];
+ out_index[3] = in_index[P4];
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+}
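+
+// Example (illustrative): the input index vector is ordered (W, H, C, B), so building
+// with -DP1=1 -DP2=0 -DP3=2 -DP4=3 swaps the first two dimensions: the element read
+// at (x, y, c, b) is written to (y, x, c, b).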
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
diff --git a/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl
new file mode 100644
index 0000000000..10875293a9
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else /* SATURATE */
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif /* SATURATE */
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
+
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT)
+
+#if defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+#endif // defined(ACTIVATION_TYPE)
+
+#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT)
+#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
+
+/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the multiplication should be passed as well using -DACC_DATA_TYPE.
+ * e.g. If one of the inputs is S16, -DACC_DATA_TYPE=int should be passed, else -DACC_DATA_TYPE=short.
+ * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_float(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+#if !defined(IN_PLACE)
+ TENSOR3D_DECLARATION(out),
+#endif // !defined(IN_PLACE)
+ const float scale)
+{
+ // Get pixels pointer
+ size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+ size_t y = get_global_id(1);
+ size_t z = get_global_id(2);
+
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
+ __global uchar *
+#if !defined(IN_PLACE)
+ out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+ out_addr = in1_addr;
+#else //defined(SRC1_IN_PLACE)
+ out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
+
+ // Load data
+ VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr)), VEC_ACC_TYPE);
+ VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr)), VEC_ACC_TYPE);
+
+ // Perform multiplication
+#ifdef DATA_TYPE_FLOAT
+ VEC_OUT_TYPE res0 = CONVERT(in1_data * in2_data * (ACC_DATA_TYPE)scale, VEC_OUT_TYPE);
+#else /* DATA_TYPE_FLOAT */
+ VEC_OUT_TYPE res0 = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((CONVERT(in1_data * in2_data, VEC_FLOAT) * scale), VEC_ACC_TYPE, ROUND), VEC_OUT_TYPE, ROUND);
+#endif /* DATA_TYPE_FLOAT */
+
+#if defined(ACTIVATION_TYPE)
+ res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE_OUT, res0, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */
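
The index arithmetic at the top of these kernels implements the library's leftover-handling scheme: when the row length is not a multiple of VEC_SIZE_OUT, work-item 0 performs a partial store of VEC_SIZE_LEFTOVER elements and every later work-item is shifted back so that its full vector stays in bounds. A minimal host-side sketch in plain C, assuming a hypothetical row of WIDTH = 10 elements and VEC_SIZE_OUT = 4 (so VEC_SIZE_LEFTOVER = 2):

    #include <stdio.h>

    int main(void)
    {
        const int width = 10, vec_size = 4;   /* assumed example shape */
        const int leftover = width % vec_size; /* VEC_SIZE_LEFTOVER */
        const int num_items = (width + vec_size - 1) / vec_size;
        for(int gid = 0; gid < num_items; ++gid)
        {
            int x = gid * vec_size - (vec_size - leftover) % vec_size;
            if(x < 0)
            {
                x = 0; /* the max(..., 0) in the kernel */
            }
            /* STORE_VECTOR_SELECT stores only 'leftover' elements for work-item 0 */
            int n = (leftover != 0 && gid == 0) ? leftover : vec_size;
            printf("work-item %d stores %d element(s) at x=%d\n", gid, n, x);
        }
        return 0; /* prints x = 0, 2, 6: every element covered exactly once */
    }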
+
+#if defined(DATA_TYPE)
+
+/** Performs a pixelwise multiplication of complex float values
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pixelwise_mul_complex(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ vin1 = vload2(0, (__global DATA_TYPE *)in1.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ vin2 = vload2(0, (__global DATA_TYPE *)in2.ptr);
+
+ // Perform complex multiplication
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ res = { vin1.x * vin2.x - vin1.y * vin2.y, vin1.x * vin2.y + vin2.x * vin1.y };
+
+#if defined(ACTIVATION_TYPE)
+ vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE_OUT, res, A_VAL, B_VAL), 0, (__global DATA_TYPE *)out.ptr);
+#else // defined(ACTIVATION_TYPE)
+ // Store result
+ vstore2(res, 0, (__global DATA_TYPE *)out.ptr);
+#endif // defined(ACTIVATION_TYPE)
+}
+
+#endif // defined(DATA_TYPE)
\ No newline at end of file
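
The two components of res above are the standard complex product (a + bi)(c + di) = (ac - bd) + (ad + bc)i. A tiny C check with arbitrary example values:

    #include <stdio.h>

    int main(void)
    {
        float a = 1.f, b = 2.f, c = 3.f, d = 4.f; /* example operands */
        float re = a * c - b * d;                 /* vin1.x * vin2.x - vin1.y * vin2.y */
        float im = a * d + c * b;                 /* vin1.x * vin2.y + vin2.x * vin1.y */
        printf("(%g, %g)\n", re, im);             /* prints (-5, 10) */
        return 0;
    }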
diff --git a/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl
new file mode 100644
index 0000000000..6d1c2d0c79
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(SATURATE)
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
+#else // SATURATE
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
+#endif // SATURATE
+#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
+
+#define MUL_OP(x, y, scale, type, size) CONVERT_OP_INT((x) * (y) >> scale, type, size)
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT)
+
+#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT)
+#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
+
+/** Performs a pixelwise multiplication with integer scale of integer inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the multiplication should be passed as well using -DACC_DATA_TYPE.
+ * e.g. If one of the inputs is S16, -DACC_DATA_TYPE=int should be passed; otherwise -DACC_DATA_TYPE=short.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Integer scaling factor. Supported data types: S32.
+ */
+__kernel void pixelwise_mul_int(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+#if !defined(IN_PLACE)
+ TENSOR3D_DECLARATION(out),
+#endif // !defined(IN_PLACE)
+ const uint scale)
+{
+ size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+ size_t y = get_global_id(1);
+ size_t z = get_global_id(2);
+
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
+ __global uchar *
+#if !defined(IN_PLACE)
+ out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+ out_addr = in1_addr;
+#else //defined(SRC1_IN_PLACE)
+ out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
+
+ // Load data
+ VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr), VEC_ACC_TYPE);
+ VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr), VEC_ACC_TYPE);
+ // Perform multiplication and store result
+ VEC_OUT_TYPE out_data0 = MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, VEC_SIZE_OUT);
+ STORE_VECTOR_SELECT(out_data, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */
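
MUL_OP above computes the product in the wider accumulator type, shifts right by scale and converts to the output type, saturating when -DSATURATE is set. A scalar C sketch of the same arithmetic, with example operands and a hand-written saturating convert standing in for convert_uchar_sat:

    #include <stdio.h>

    static unsigned char convert_uchar_sat(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (unsigned char)v;
    }

    int main(void)
    {
        int in1 = 100, in2 = 50, scale = 4; /* example operands */
        int acc = (in1 * in2) >> scale;     /* 5000 >> 4 = 312 */
        printf("%d -> %u\n", acc, convert_uchar_sat(acc)); /* 312 -> 255 */
        return 0;
    }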
+
+#if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
+
+/** Performs a pixelwise multiplication with float scale of quantized inputs.
+ *
+ * @note The quantization offset of the first operand must be passed at compile time only if asymmetric using -DOFFSET_IN1, e.g. -DOFFSET_IN1=10
+ * @note The quantization offset of the second operand must be passed at compile time only if asymmetric using -DOFFSET_IN2, e.g. -DOFFSET_IN2=10
+ * @note The quantization offset of the output must be passed at compile time only if asymmetric using -DOFFSET_OUT, e.g. -DOFFSET_OUT=10
+ * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, e.g. -DSCALE_IN1=10
+ * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, e.g. -DSCALE_IN2=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, e.g. -DSCALE_OUT=10
+ * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @attention The data type must be passed at compile time using -DDATA_TYPE_OUT, e.g. -DDATA_TYPE_OUT=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE_OUT=size. e.g. -DVEC_SIZE_OUT=16
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_quantized(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+#if !defined(IN_PLACE)
+ TENSOR3D_DECLARATION(out),
+#endif // !defined(IN_PLACE)
+ const float scale)
+{
+ size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+ size_t y = get_global_id(1);
+ size_t z = get_global_id(2);
+
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
+ __global uchar *
+#if !defined(IN_PLACE)
+ out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+ out_addr = in1_addr;
+#else //defined(SRC1_IN_PLACE)
+ out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
+
+ // Load data
+ VEC_INT in_a = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT);
+ VEC_INT in_b = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_OUT *)in2_addr)), VEC_INT);
+
+ // Dequantize
+#if defined(OFFSET_IN1)
+ in_a -= (VEC_INT)((int)OFFSET_IN1);
+#endif // defined(OFFSET_IN1)
+#if defined(OFFSET_IN2)
+ in_b -= (VEC_INT)((int)OFFSET_IN2);
+#endif // defined(OFFSET_IN2)
+ const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+ const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+
+#if defined(OFFSET_OUT)
+ const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
+#else // defined(OFFSET_OUT)
+ const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT);
+#endif // defined(OFFSET_OUT)
+ const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
+
+ // Store result
+ STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT) */
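
The quantized path dequantizes both operands, multiplies in float together with the runtime scale, then requantizes against SCALE_OUT/OFFSET_OUT with round-to-nearest-even and a saturating convert. A scalar C walk-through with hypothetical quantization parameters:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const float scale_in1 = 0.5f, scale_in2 = 0.25f, scale_out = 1.0f;
        const int   off_in1 = 10, off_in2 = 20, off_out = 5; /* example params */
        const float scale = 2.0f;              /* the kernel's runtime argument */
        int q1 = 30, q2 = 28;                  /* example quantized inputs */
        float f1 = (q1 - off_in1) * scale_in1;              /* dequantize: 10.0 */
        float f2 = (q2 - off_in2) * scale_in2;              /* dequantize:  2.0 */
        float rf = (f1 * f2 * scale) / scale_out + off_out; /* 45.0 */
        printf("%d\n", (int)rintf(rf)); /* 45, then saturated to the output type */
        return 0;
    }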
diff --git a/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl
new file mode 100644
index 0000000000..4494dd8cec
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+#if VEC_SIZE == 2
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 2)
+#define PERFORM_REDUCTION_IMPL(type) \
+ inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 2) sum) \
+ { \
+ sum.s0 += sum.s1; \
+ return sum.s0; \
+ }
+#elif VEC_SIZE == 4
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 4)
+#define PERFORM_REDUCTION_IMPL(type) \
+ inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 4) sum) \
+ { \
+ sum.s01 += sum.s23; \
+ sum.s0 += sum.s1; \
+ return sum.s0; \
+ }
+#elif VEC_SIZE == 8
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 8)
+#define PERFORM_REDUCTION_IMPL(type) \
+ inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 8) sum) \
+ { \
+ sum.s0123 += sum.s4567; \
+ sum.s01 += sum.s23; \
+ sum.s0 += sum.s1; \
+ return sum.s0; \
+ }
+#else /* VEC_SIZE DEFAULT */
+#define VEC_SIZE 16
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 16)
+#define PERFORM_REDUCTION_IMPL(type) \
+ inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 16) sum) \
+ { \
+ sum.s01234567 += sum.s89abcdef; \
+ sum.s0123 += sum.s4567; \
+ sum.s01 += sum.s23; \
+ sum.s0 += sum.s1; \
+ return sum.s0; \
+ }
+#endif /* VEC_SIZE END */
+
+#define PERFORM_REDUCTION_STR(input, type) perform_reduction_##type(input)
+#define PERFORM_REDUCTION(input, type) PERFORM_REDUCTION_STR(input, type)
+
+PERFORM_REDUCTION_IMPL(int)
+PERFORM_REDUCTION_IMPL(long)
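
PERFORM_REDUCTION_IMPL generates a pairwise halving: each step folds the upper half of the vector onto the lower half until a single lane remains. The two steps of the VEC_SIZE = 4 variant, written out in scalar C:

    #include <stdio.h>

    int main(void)
    {
        int s[4] = { 1, 2, 3, 4 };
        s[0] += s[2]; /* sum.s01 += sum.s23 ... */
        s[1] += s[3]; /* ... leaving { 4, 6 } in the low lanes */
        s[0] += s[1]; /* sum.s0 += sum.s1 -> 10 */
        printf("%d\n", s[0]);
        return 0;
    }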
+
+/** Compute quantized multiplier and shift for the inverse square root of input.
+ * Using 3-bit fixed point and 5 iterations of the Newton-Raphson method.
+ *
+ * @param[in] in Input to use
+ * @param[in] reverse_shift -1 to reverse the shift direction
+ *
+ * @return:
+ * .s0 Quantized multiplier for inverse square root
+ * .s1 Shift for inverse square root
+ *
+ */
+inline int2 get_invsqrt_quantized_multiplier_exp(int in, int reverse_shift)
+{
+ int2 stddev_inv;
+ int stddev_inv_multiplier = INT_MAX;
+ int stddev_inv_shift = 0;
+ int input = in;
+ if(input <= 1)
+ {
+ stddev_inv.s0 = stddev_inv_multiplier;
+ stddev_inv.s1 = stddev_inv_shift;
+ return stddev_inv;
+ }
+
+ stddev_inv_shift = 11;
+ while(input >= (1 << 29))
+ {
+ input /= 4;
+ ++stddev_inv_shift;
+ }
+
+ const unsigned int max_left_shift_bits = clz(input) - 1;
+ const unsigned int max_left_shift_bits_pairs = max_left_shift_bits / 2;
+ const unsigned int left_shift_bit_pairs = max_left_shift_bits_pairs - 1;
+ stddev_inv_shift -= left_shift_bit_pairs;
+ input <<= 2 * left_shift_bit_pairs;
+
+ typedef int FixedPointRawType;
+ const unsigned int fixedpoint_position = 3;
+ const unsigned int fixedpoint_int_position = sizeof(FixedPointRawType) * 8 - 1 - fixedpoint_position;
+ typedef FixedPointRawType FixedPoint3;
+ typedef FixedPointRawType FixedPoint0;
+
+ const FixedPoint3 fixedpoint_input = (input >> 1);
+ const FixedPoint3 fixedpoint_half_input = ASYMM_ROUNDING_DIVIDE_BY_POW2(fixedpoint_input, 1, 1);
+ const FixedPoint3 fixedpoint_half_three = (0x1 << fixedpoint_int_position) + (0x1 << (fixedpoint_int_position - 1));
+ FixedPoint3 x = 0x1 << fixedpoint_int_position;
+
+ const int num_iteration = 5;
+ for(int i = 0; i < num_iteration; i++)
+ {
+ int x3 = ASYMM_RESCALE(ASYMM_MULT(ASYMM_MULT(x, x, 1), x, 1), 9, fixedpoint_position, 1);
+ x = ASYMM_RESCALE(ASYMM_MULT(fixedpoint_half_three, x, 1) - ASYMM_MULT(fixedpoint_half_input, x3, 1), 6, fixedpoint_position, 1);
+ }
+ const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250;
+ x = ASYMM_MULT(fixedpoint_half_sqrt_2, x, 1);
+ stddev_inv_multiplier = x;
+ if(stddev_inv_shift < 0)
+ {
+ stddev_inv_multiplier <<= -stddev_inv_shift;
+ stddev_inv_shift = 0;
+ }
+ stddev_inv_shift *= reverse_shift;
+
+ stddev_inv.s0 = stddev_inv_multiplier;
+ stddev_inv.s1 = stddev_inv_shift;
+ return stddev_inv;
+}
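
The fixed-point loop above is the classical Newton-Raphson recurrence for the inverse square root, x_{n+1} = x_n * (3 - v * x_n^2) / 2, evaluated in Q3 fixed point on the normalised input; the final multiply by fixedpoint_half_sqrt_2 compensates for odd shift amounts. A float reference of the recurrence, using the same iteration count (illustrative only; the kernel's normalisation details are omitted):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double v = 0.7; /* normalised input, example value */
        double x = 1.0; /* initial guess */
        for(int i = 0; i < 5; ++i)
        {
            x = x * (3.0 - v * x * x) / 2.0; /* Newton-Raphson step */
        }
        printf("%f (exact %f)\n", x, 1.0 / sqrt(v)); /* ~1.195229 */
        return 0;
    }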
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+/** This function implements QLSTM layer normalization.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=short
+ * @attention Width of the input tensor should be passed using the -DWIDTH compile flag, e.g. -DWIDTH=16
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QSYMM16
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] weight_ptr Pointer to the weight tensor. Supported data type: same as @p input_ptr
+ * @param[in] weight_stride_x Stride of the weight tensor in X dimension (in bytes)
+ * @param[in] weight_step_x weight_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weight_offset_first_element_in_bytes The offset of the first element in the weight tensor
+ * @param[in] bias_ptr Pointer to the bias tensor. Supported data type: S32
+ * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void qlstm_layer_normalization(
+ IMAGE_DECLARATION(input),
+ VECTOR_DECLARATION(weight),
+ VECTOR_DECLARATION(bias),
+ IMAGE_DECLARATION(output))
+{
+ // Get pixels pointer
+ Image input = CONVERT_TO_IMAGE_STRUCT(input);
+ Vector weight = CONVERT_TO_VECTOR_STRUCT(weight);
+ Vector bias = CONVERT_TO_VECTOR_STRUCT(bias);
+ Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ sum = 0;
+ VEC_DATA_TYPE(long, VEC_SIZE)
+ sum_sq = 0;
+ // Calculate partial sum
+ int i = 0;
+ for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
+ {
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0));
+
+ sum += CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE));
+ sum_sq += CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE)) * CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE));
+ }
+ // Perform reduction
+ sum.s0 = PERFORM_REDUCTION(sum, int);
+ sum_sq.s0 = PERFORM_REDUCTION(sum_sq, long);
+
+ // Left-overs loop
+ for(; i < WIDTH; ++i)
+ {
+ DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0));
+
+ sum.s0 += CONVERT(data, int);
+ sum_sq.s0 += CONVERT(data, long) * CONVERT(data, long);
+ }
+
+ int temp = 0x100000 / WIDTH;
+ int mean = (int)(sum.s0 * 1024 / WIDTH);
+ int var2 = ((sum_sq.s0 * (long)temp) - ((long)mean * (long)mean)) / 0x100000;
+ int2 stddev_inv = get_invsqrt_quantized_multiplier_exp(var2, -1);
+
+ i = 0;
+ for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0));
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ res = CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE)) * 1024 - mean;
+ res = multiply_by_quantized_multiplier(res, stddev_inv.s0, stddev_inv.s1);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ w = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)vector_offset(&weight, i));
+ res = res * CONVERT(w, VEC_DATA_TYPE(int, VEC_SIZE));
+ res = res + VLOAD(VEC_SIZE)(0, (__global int *)vector_offset(&bias, i));
+ // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024;
+ res = (res + 512) >> 10;
+ res = multiply_by_quantized_multiplier(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12);
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(int, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(int, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)offset(&output, i, 0));
+ }
+ for(; i < WIDTH; ++i)
+ {
+ DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0));
+ int res = (int)data * 1024 - mean;
+ res = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, stddev_inv.s0, stddev_inv.s1, 1);
+ DATA_TYPE w = *((__global DATA_TYPE *)vector_offset(&weight, i));
+ res = res * (int)w;
+ int b = *((__global int *)vector_offset(&bias, i));
+ res = res + b;
+ // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024;
+ res = (res + 512) >> 10;
+ res = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12, 1);
+#if defined(MIN_BOUND)
+ res = max(res, MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, MAX_BOUND);
+#endif // defined(MAX_BOUND)
+ *((__global DATA_TYPE *)offset(&output, i, 0)) = (DATA_TYPE)res;
+ }
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */
\ No newline at end of file
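
Stripped of the Q10 fixed-point bookkeeping (the factors of 1024 and the +512 rounding before the >> 10), the kernel computes an ordinary layer normalization per row, y_i = (x_i - mean) / stddev * w_i + b_i, followed by requantization through OUTPUT_MULTIPLIER and OUTPUT_SHIFT. A float reference sketch with made-up values:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const int width = 4;
        float x[4] = { 1.f, 2.f, 3.f, 4.f }; /* example row */
        float w[4] = { 1.f, 1.f, 1.f, 1.f }; /* example weights */
        float b[4] = { 0.f, 0.f, 0.f, 0.f }; /* example bias */
        float sum = 0.f, sum_sq = 0.f;
        for(int i = 0; i < width; ++i)
        {
            sum += x[i];
            sum_sq += x[i] * x[i];
        }
        float mean = sum / width;
        float var  = sum_sq / width - mean * mean;
        for(int i = 0; i < width; ++i)
        {
            printf("%f\n", (x[i] - mean) / sqrtf(var) * w[i] + b[i]);
        }
        return 0;
    }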
diff --git a/src/core/CL/cl_kernels/common/quantization_layer.cl b/src/core/CL/cl_kernels/common/quantization_layer.cl
new file mode 100644
index 0000000000..69cc288c25
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/quantization_layer.cl
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x)))
+#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL)
+
+/** This performs the quantization of floating point inputs or 8-bit quantized integers to 8-bit integers.
+ *
+ * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE_IN=type. e.g. -DDATA_TYPE_IN=short
+ * @note Output data type should be given as a preprocessor argument using -DDATA_TYPE_OUT=type. e.g. -DDATA_TYPE_OUT=uchar
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Quantization scale should be given as a preprocessor argument using -DSCALE=scale. e.g. -DSCALE=0.125
+ * @note Quantization offset should be given as a preprocessor argument using -DOFFSET=offset. e.g. -DOFFSET=125
+ * @note Minimum value for quantized type should be given as a preprocessor argument using -DMIN_QUANT_VAL=value. e.g. -DMIN_QUANT_VAL=0
+ * @note Maximum value for quantized type should be given as a preprocessor argument using -DMAX_QUANT_VAL=value. e.g. -DMAX_QUANT_VAL=255
+ * @note If the input data type is floating point (F16 or F32), the preprocessor argument -DIS_FLOAT should be given
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void quantization_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+ // Load data
+#if defined(IS_FLOAT)
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+ val_float = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+
+ // Create scale and offset vectors
+ const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = SCALE;
+ const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET;
+#else // defined(IS_FLOAT)
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+ val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+
+ const VEC_DATA_TYPE(float, VEC_SIZE)
+ val_float = CONVERT(val, VEC_DATA_TYPE(float, VEC_SIZE));
+
+ // Create scale and offset vectors
+ const VEC_DATA_TYPE(float, VEC_SIZE) vscale = SCALE;
+ const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET;
+#endif // defined(IS_FLOAT)
+
+ // Quantize
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ res = CLAMP(CONVERT_RTE_VEC(val_float / vscale, int, VEC_SIZE) + voffset, MIN_QUANT_VAL, MAX_QUANT_VAL);
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
+#else //!defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(CONVERT_RTE(((float) * (__global DATA_TYPE_IN *)input.ptr) / ((float)SCALE), int) + (int)OFFSET, MIN_QUANT_VAL, MAX_QUANT_VAL);
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL)
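
The affine quantization applied here is q = clamp(round_rte(x / SCALE) + OFFSET, MIN_QUANT_VAL, MAX_QUANT_VAL). A scalar C example with hypothetical QASYMM8 parameters:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const float scale = 0.125f; /* -DSCALE=0.125 */
        const int offset  = 128;    /* -DOFFSET=128 */
        float x = -2.0f;
        int q = (int)rintf(x / scale) + offset; /* -16 + 128 = 112 */
        q = q < 0 ? 0 : q > 255 ? 255 : q;      /* clamp to [0, 255] for U8 */
        printf("%d\n", q);
        return 0;
    }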
diff --git a/src/core/CL/cl_kernels/range.cl b/src/core/CL/cl_kernels/common/range.cl
index d25d10e207..d25d10e207 100644
--- a/src/core/CL/cl_kernels/range.cl
+++ b/src/core/CL/cl_kernels/common/range.cl
diff --git a/src/core/CL/cl_kernels/common/reduction_operation.cl b/src/core/CL/cl_kernels/common/reduction_operation.cl
new file mode 100644
index 0000000000..99369be19a
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/reduction_operation.cl
@@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "helpers_asymm.h"
+
+#if defined(FLOAT_DATA_TYPE)
+#define ISGREATER(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isgreater(x, y))
+#define ISLESS(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isless(x, y))
+#define ISGREATER_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isgreater(x, y))
+#define ISLESS_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isless(x, y))
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) (x > y) ? 1 : 0
+#define ISLESS(x, y) (x < y) ? 1 : 0
+#define ISGREATER_SCALAR ISGREATER
+#define ISLESS_SCALAR ISLESS
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x > y)
+#define ISLESS(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x < y)
+#endif // defined(WIDTH)
+#endif // defined(FLOAT_DATA_TYPE)
+
+#if defined(WIDTH)
+#if defined(OPERATION)
+
+#define sum(in0, in1, size) (in0 + SUM_REDUCE(in1, size))
+#define square_sum(in0, in1, size) (in0 + SUM_REDUCE((in1 * in1), size))
+#define product(in0, in1, size) (in0 * PROD_REDUCE(in1, size))
+#define min_(in0, in1, size) (min(in0, MIN_REDUCE(in1, size)))
+#define max_(in0, in1, size) (max(in0, MAX_REDUCE(in1, size)))
+
+/** This kernel performs parallel reduction given an operation on x-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
+ * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_x(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + y * input_stride_y + z * input_stride_z;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + y * output_stride_y + z * output_stride_z;
+
+#if !defined(MIN) && !defined(MAX)
+#if defined(PROD)
+ DATA_TYPE res = (DATA_TYPE)1;
+#else // defined(PROD)
+ DATA_TYPE res = (DATA_TYPE)0;
+#endif // defined(PROD)
+#else // #if !defined(MIN) && !defined(MAX)
+ DATA_TYPE res = *((__global DATA_TYPE *)input_addr);
+#endif // #if defined(MIN) || defined(MAX)
+ int x = 0;
+
+ for(; x <= (WIDTH - VEC_SIZE); x += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
+ res = OPERATION(res, vals, VEC_SIZE);
+ }
+
+#if(WIDTH % VEC_SIZE)
+ _Pragma("unroll") for(; x < WIDTH; ++x)
+ {
+ DATA_TYPE val = *((__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
+ res = OPERATION(res, val, 1);
+ }
+#endif // (WIDTH % VEC_SIZE)
+
+#if defined(MEAN)
+ res /= WIDTH;
+#endif // defined(MEAN)
+ *((__global DATA_TYPE *)output_addr) = res;
+}
+#endif // defined(OPERATION)
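
The kernel's control flow is a vectorised body that folds VEC_SIZE elements per step through OPERATION, followed by a scalar leftover loop when WIDTH is not a multiple of VEC_SIZE; with -DMEAN the accumulated result is divided by WIDTH at the end. The same shape in scalar C, assuming -DOPERATION=sum and example sizes:

    #include <stdio.h>

    int main(void)
    {
        const int width = 10, vec = 4; /* example sizes */
        float in[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
        float res = 0.f;
        int x = 0;
        for(; x <= width - vec; x += vec) /* OPERATION(res, vals, VEC_SIZE) */
        {
            res += in[x] + in[x + 1] + in[x + 2] + in[x + 3];
        }
        for(; x < width; ++x) /* leftover loop */
        {
            res += in[x];
        }
        printf("%f\n", res / width); /* the -DMEAN division: 4.5 */
        return 0;
    }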
+/** This kernel performs reduction on x-axis. (Non parallel)
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8/QASYMM8_SIGNED for operation MEAN
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_non_parallel_x(
+ VECTOR_DECLARATION(input),
+ VECTOR_DECLARATION(output))
+{
+ Vector input = CONVERT_TO_VECTOR_STRUCT(input);
+ Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+
+ DATA_TYPE_PROMOTED res = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, 0)), DATA_TYPE_PROMOTED);
+
+ // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+ float res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+ for(unsigned int x = 1; x < WIDTH; ++x)
+ {
+ DATA_TYPE_PROMOTED in = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, x)), DATA_TYPE_PROMOTED);
+#if defined(MIN)
+ res = select(res, in, ISLESS_SCALAR(in, res));
+#elif defined(MAX)
+ res = select(res, in, ISGREATER_SCALAR(in, res));
+#elif defined(PROD)
+#if defined(OFFSET) && defined(SCALE)
+ res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#else // !(defined(OFFSET) && defined(SCALE))
+ res *= in;
+#endif // defined(OFFSET) && defined(SCALE)
+#else // defined(SUM)
+ res += in;
+#endif // defined(MAX) || defined(MIN) || defined(PROD)
+ }
+
+ // Store result
+#if defined(MEAN)
+ res /= WIDTH;
+#endif // defined(MEAN)
+
+ // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+ res -= (WIDTH - 1) * OFFSET;
+#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
+
+ // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+ res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+ *((__global DATA_TYPE *)output.ptr) = CONVERT_SAT(res, DATA_TYPE);
+}
+#endif // defined(WIDTH)
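
The '(WIDTH - 1) * OFFSET' correction for quantized SUM follows from the asymmetric encoding q = f / scale + offset: accumulating WIDTH raw quantized values accumulates WIDTH copies of the offset, while a well-formed quantized result must carry exactly one. A worked example in C:

    #include <stdio.h>

    int main(void)
    {
        const int offset = 10, width = 4; /* example parameters */
        int q[4] = { 12, 14, 16, 18 };    /* real values 2, 4, 6, 8 at scale 1 */
        int res = 0;
        for(int i = 0; i < width; ++i)
        {
            res += q[i]; /* 60 = (2 + 4 + 6 + 8) + 4 * offset */
        }
        res -= (width - 1) * offset; /* 30 = real sum 20 carrying one offset */
        printf("%d\n", res);
        return 0;
    }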
+
+#if defined(HEIGHT)
+/** This kernel performs reduction on y-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_y(
+ __global uchar *input_ptr,
+ uint input_stride_y,
+ uint input_stride_z,
+ uint input_offset_first_element_in_bytes,
+
+ __global uchar *output_ptr,
+ uint output_stride_z,
+ uint output_offset_first_element_in_bytes)
+{
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int z = get_global_id(1);
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * input_stride_z;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * output_stride_z;
+
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+ res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+ // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+#if defined(SUM_SQUARE)
+ res *= res;
+#endif // defined(SUM_SQUARE)
+
+ for(unsigned int y = 1; y < HEIGHT; ++y)
+ {
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+ in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+#if defined(MIN)
+ res = select(res, in, ISLESS(in, res));
+#elif defined(MAX)
+ res = select(res, in, ISGREATER(in, res));
+#else // !(defined(MAX) || defined(MIN))
+#if defined(SUM_SQUARE)
+ in *= in;
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+ res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#else // !(defined(OFFSET) && defined(SCALE))
+ res *= in;
+#endif // defined(OFFSET) && defined(SCALE)
+
+#else // !defined(PROD)
+ res += in;
+#endif // defined(PROD)
+#endif // defined(MAX) || defined(MIN)
+ }
+
+#if defined(MEAN)
+ res /= HEIGHT;
+#endif // defined(MEAN)
+
+ // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+ res -= (HEIGHT - 1) * OFFSET;
+#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
+
+ // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+ res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+ // Store result
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(HEIGHT)
+
+#if defined(DEPTH)
+/** This kernel performs reduction on z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_z(
+ __global uchar *input_ptr,
+ uint input_stride_y,
+ uint input_stride_z,
+ uint input_stride_w,
+ uint input_offset_first_element_in_bytes,
+
+ __global uchar *output_ptr,
+ uint output_stride_y,
+ uint output_stride_w,
+ uint output_offset_first_element_in_bytes)
+{
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int y = get_global_id(1);
+ int w = get_global_id(2);
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + w * input_stride_w;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + w * output_stride_w;
+
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+ res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+ // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+#if defined(SUM_SQUARE)
+ res *= res;
+#endif // defined(SUM_SQUARE)
+
+ for(unsigned int z = 1; z < DEPTH; ++z)
+ {
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+ in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+#if defined(MIN)
+ res = select(res, in, ISLESS(in, res));
+#elif defined(MAX)
+ res = select(res, in, ISGREATER(in, res));
+#else // !(defined(MAX) || defined(MIN))
+#if defined(SUM_SQUARE)
+ in *= in;
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+ res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#else // !(defined(OFFSET) && defined(SCALE))
+ res *= in;
+#endif // defined(OFFSET) && defined(SCALE)
+
+#else // !defined(PROD)
+ res += in;
+#endif // defined(PROD)
+#endif // defined(MAX) || defined(MIN)
+ }
+
+#if defined(MEAN)
+ res /= DEPTH;
+#endif // defined(MEAN)
+
+ // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+ res -= (DEPTH - 1) * OFFSET;
+#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
+
+ // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+ res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+ // Store result
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+
+ STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(DEPTH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs reduction on w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_stride_v Stride of the source tensor in V dimension (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_stride_v Stride of the output tensor in V dimension (in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_w(
+ __global uchar *input_ptr,
+ uint input_stride_y,
+ uint input_stride_z,
+ uint input_stride_w,
+ uint input_stride_v,
+ uint input_offset_first_element_in_bytes,
+
+ __global uchar *output_ptr,
+ uint output_stride_y,
+ uint output_stride_z,
+ uint output_stride_v,
+ uint output_offset_first_element_in_bytes)
+{
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int y = get_global_id(1);
+
+ int gid_2 = get_global_id(2);
+ int z = gid_2 % DEPTH;
+ int v = gid_2 / DEPTH;
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z + v * input_stride_v;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z + v * output_stride_v;
+
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+ res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+ // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+#if defined(SUM_SQUARE)
+ res *= res;
+#endif // defined(SUM_SQUARE)
+
+ for(unsigned int w = 1; w < BATCH; ++w)
+ {
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+ in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + w * input_stride_w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+#if defined(MIN)
+ res = select(res, in, ISLESS(in, res));
+#elif defined(MAX)
+ res = select(res, in, ISGREATER(in, res));
+#else // !(defined(MAX) || defined(MIN))
+#if defined(SUM_SQUARE)
+ in *= in;
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+ res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#else // !(defined(OFFSET) && defined(SCALE))
+ res *= in;
+#endif // defined(OFFSET) && defined(SCALE)
+
+#else // !defined(PROD)
+ res += in;
+#endif //defined(PROD)
+#endif // defined(MAX) || defined(MIN)
+ }
+
+#if defined(MEAN)
+ res /= BATCH;
+#endif // defined(MEAN)
+
+ // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+ res -= (BATCH - 1) * OFFSET;
+#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
+
+ // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+ res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+ // Store result
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
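+
+// Illustrative build options only (the macro names come from the checks in this file;
+// the concrete values are hypothetical): a mean reduction over W could be compiled with
+//   -DDATA_TYPE=uchar -DDATA_TYPE_PROMOTED=uint -DBATCH=4 -DDEPTH=8
+//   -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=0 -DMEAN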
+#endif /* defined(BATCH) && defined(DEPTH) */
diff --git a/src/core/CL/cl_kernels/common/reshape_layer.cl b/src/core/CL/cl_kernels/common/reshape_layer.cl
new file mode 100644
index 0000000000..c47664bf85
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/reshape_layer.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform tensor reshape
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] input_shape Input spatial shape
+ * @param[in] output_shape Output spatial shape
+ */
+__kernel void reshape_layer(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ int2 input_shape,
+ int2 output_shape)
+{
+ int out_x = get_global_id(0);
+ int out_y = get_global_id(1);
+ int out_z = get_global_id(2);
+
+ // Compute the output linearized index
+ int out_linear_idx = out_x + out_y * output_shape.x + out_z * output_shape.x * output_shape.y;
+
+ // Translate to input
+ int in_x = out_linear_idx % input_shape.x;
+ int in_y = (out_linear_idx / input_shape.x) % input_shape.y;
+ int in_z = out_linear_idx / (input_shape.x * input_shape.y);
+
+ // Compute addresses and copy the element
+ input_ptr += input_offset_first_element_in_bytes + in_x * input_stride_x + in_y * input_stride_y + in_z * input_stride_z;
+ output_ptr += output_offset_first_element_in_bytes + out_x * output_stride_x + out_y * output_stride_y + out_z * output_stride_z;
+ *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)input_ptr);
+}
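+
+// Worked example (illustrative): reshaping a 4x2x1 input to a 2x2x2 output,
+// i.e. input_shape = (4, 2) and output_shape = (2, 2). For the output element
+// (x, y, z) = (1, 0, 1): out_linear_idx = 1 + 0 * 2 + 1 * 2 * 2 = 5, which maps
+// back to in_x = 5 % 4 = 1, in_y = (5 / 4) % 2 = 1, in_z = 5 / (4 * 2) = 0.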
diff --git a/src/core/CL/cl_kernels/common/reverse.cl b/src/core/CL/cl_kernels/common/reverse.cl
new file mode 100644
index 0000000000..e6df3041c2
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/reverse.cl
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
+
+#if NUM_REVERSE_DIMS > 4
+#error("Reversing more than 4 dimensions is not currently supported")
+#endif /* NUM_REVERSE_DIMS > 4 */
+
+/** Performs reverse along the specified axis.
+ *
+ * @note The data type must be given as a preprocessor argument using -DDATA_TYPE=num. e.g. -DDATA_TYPE=uint
+ * @note The number of dimensions to reverse must be given as a preprocessor argument using -DNUM_REVERSE_DIMS=num, e.g. -DNUM_REVERSE_DIMS=3
+ * @note The number of dimensions of the source tensor must be given as a preprocessor argument using -DRANK=num, e.g. -DRANK=3
+ * @note The values in axis_tensor must be within [-rank, rank-1].
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] axis_ptr Pointer to the axis vector. Supported data types: U32
+ * @param[in] axis_stride_x Stride of the axis vector in X dimension (in bytes)
+ * @param[in] axis_step_x axis_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] axis_offset_first_element_in_bytes The offset of the first element in the axis vector
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width Width of the source tensor (used to flip the X coordinate)
+ * @param[in] height Height of the source tensor (used to flip the Y coordinate)
+ * @param[in] depth Depth of the source tensor (used to flip the Z coordinate)
+ * @param[in] batches Number of batches in the source tensor (used to flip the W coordinate)
+ */
+__kernel void reverse(TENSOR4D_DECLARATION(src),
+ VECTOR_DECLARATION(axis),
+ TENSOR4D_DECLARATION(dst),
+ const uint width,
+ const uint height,
+ const uint depth,
+ const uint batches)
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, depth);
+ Vector axis = CONVERT_TO_VECTOR_STRUCT_NO_STEP(axis);
+ Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst);
+
+ const uint x_in = get_global_id(0);
+ const uint y_in = get_global_id(1);
+ const uint z_in = get_global_id(2) % depth;
+ const uint w_in = get_global_id(2) / depth;
+
+ const uint4 dims = (uint4)(0, 1, 2, 3);
+ int4 to_reverse = (int4)(0, 0, 0, 0);
+
+ VEC_DATA_TYPE(int, NUM_REVERSE_DIMS) indices = VLOAD(NUM_REVERSE_DIMS)(0,(__global int *)axis.ptr);
+#if defined(USE_INVERTED_AXIS)
+ indices = select((VEC_DATA_TYPE(int, NUM_REVERSE_DIMS)) RANK - 1, -1, indices < 0) - indices;
+#else /* defined(USE_INVERTED_AXIS) */
+ indices = select(indices, indices + RANK, indices < 0);
+#endif /* defined(USE_INVERTED_AXIS) */
+
+#if NUM_REVERSE_DIMS == 1
+ to_reverse = ((uint4)indices == dims);
+#elif NUM_REVERSE_DIMS == 2
+ to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims);
+#elif NUM_REVERSE_DIMS == 3
+ to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims);
+#else /* NUM_REVERSE_DIMS == 4 */
+ to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims) || ((uint4)indices.s3 == dims);
+#endif /* NUM_REVERSE_DIMS */
+
+ const uint x_out = to_reverse.s0 ? width - x_in - 1 : x_in;
+ const uint y_out = to_reverse.s1 ? height - y_in - 1 : y_in;
+ const uint z_out = to_reverse.s2 ? depth - z_in - 1 : z_in;
+ const uint w_out = to_reverse.s3 ? batches - w_in - 1 : w_in;
+
+ *((__global DATA_TYPE *)tensor4D_offset(&dst, x_out, y_out, z_out, w_out)) = *((__global DATA_TYPE *)src.ptr);
+}
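+
+// Worked example (illustrative): with -DNUM_REVERSE_DIMS=2 and axis = {0, 2},
+// to_reverse evaluates to (1, 0, 1, 0), so only x and z are flipped:
+// x_out = width - x_in - 1 and z_out = depth - z_in - 1, while y and w copy through.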
+#endif // defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
diff --git a/src/core/CL/cl_kernels/common/roi_align_layer.cl b/src/core/CL/cl_kernels/common/roi_align_layer.cl
new file mode 100644
index 0000000000..8cfe5ddcb6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/roi_align_layer.cl
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+// This specifies the value to shift the result of roi_dims / pooled_dims before ceiling.
+// It is close to machine epsilon (for a floating-point system, x and x + EPS are the same number).
+#define EPS_GRID 0.00001f
+
+#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) // Check for compile time constants
+
+/** Performs a roi align on a single output pixel.
+ *
+ * @param[in] input Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] bin_size_x Size of the pooling bin along x.
+ * @param[in] grid_size_x Number of sampling points along x inside the bin.
+ * @param[in] region_end_x End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] bin_size_y Size of the pooling bin along y.
+ * @param[in] grid_size_y Number of sampling points along y inside the bin.
+ * @param[in] region_end_y End y index projected onto the input tensor.
+ * @param[in] pz z index of the input tensor.
+ *
+ * @return An average pooled value from the region specified in the input tensor.
+ */
+inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
+ float bin_size_x,
+ float grid_size_x,
+ float region_end_x,
+ float region_start_y,
+ float bin_size_y,
+ float grid_size_y,
+ float region_end_y,
+ int pz)
+{
+ // Iterate through the pooling region
+ float sum = 0;
+ for(int iy = 0; iy < grid_size_y; ++iy)
+ {
+ for(int ix = 0; ix < grid_size_x; ++ix)
+ {
+ // Align the window in the middle of every bin
+ const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
+ const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
+
+ // Interpolation in the unit square
+ const int y_low = (int)y;
+ const int x_low = (int)x;
+ const int y_high = y_low + 1;
+ const int x_high = x_low + 1;
+
+ const float ly = y - y_low;
+ const float lx = x - x_low;
+ const float hy = 1.f - ly;
+ const float hx = 1.f - lx;
+
+ const float w1 = hy * hx;
+ const float w2 = hy * lx;
+ const float w3 = ly * hx;
+ const float w4 = ly * lx;
+#if defined(NHWC)
+ const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
+ const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
+ const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
+ const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
+#else // !defined(NHWC)
+ const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
+ const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
+ const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
+ const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+#endif // defined(NHWC)
+ sum += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ }
+
+ return (DATA_TYPE)(sum / (grid_size_x * grid_size_y));
+}
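+
+// Worked example of the bilinear weights (illustrative): for a sampling point
+// (x, y) = (2.3, 4.6): x_low = 2, lx = 0.3, hx = 0.7, y_low = 4, ly = 0.6, hy = 0.4,
+// giving w1 = 0.28, w2 = 0.12, w3 = 0.42, w4 = 0.18, which sum to 1 as expected.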
+
+/** Performs a roi align function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi
+ * will have a default sampling ratio of roi_dims/pooling_dims
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the pooled region of the source tensor as specified by ROI
+ * @param[in] rois_ptr Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr
+ * @param[in] rois_stride_x Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in] rois_step_x Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void roi_align_layer(
+ TENSOR3D_DECLARATION(input),
+ IMAGE_DECLARATION(rois),
+ TENSOR3D_DECLARATION(output),
+ unsigned int input_stride_w, unsigned int output_stride_w)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ Image rois = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+#if defined(NHWC)
+ const int px = get_global_id(1);
+ const int py = get_global_id(2);
+ const int pw = get_global_id(0);
+#else // !defined(NHWC)
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pw = get_global_id(2);
+#endif // defined(NHWC)
+
+ // Load roi parameters
+ // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+ const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
+ const VEC_DATA_TYPE(DATA_TYPE, 4)
+ roi = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
+ const float2 roi_anchor = convert_float2(roi.s01) * convert_float(SPATIAL_SCALE);
+ const float2 roi_dims = fmax(convert_float2(roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
+
+ // Calculate pooled region start and end
+ const float2 spatial_indx = (float2)(px, py);
+ const float2 pooled_dims = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+ const float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
+
+ const float2 bin_size = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
+ float2 region_start = spatial_indx * bin_size + roi_anchor;
+ float2 region_end = (spatial_indx + 1) * bin_size + roi_anchor;
+
+ region_start = clamp(region_start, 0, max_spatial_dims);
+ region_end = clamp(region_end, 0, max_spatial_dims);
+
+#if defined(SAMPLING_RATIO)
+ const float2 roi_bin_grid = SAMPLING_RATIO;
+#else // !defined(SAMPLING_RATIO)
+ // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
+ const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
+#endif // defined(SAMPLING_RATIO)
+
+ // Move input and output pointer across the fourth dimension
+ input.ptr += roi_batch * input_stride_w;
+ output.ptr += pw * output_stride_w;
+ for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+ {
+#if defined(NHWC)
+ __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
+#else // !defined(NHWC)
+ __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
+#endif // defined(NHWC)
+ *_output_ptr = roi_align_1x1(&input,
+ region_start.x,
+ bin_size.x,
+ roi_bin_grid.x,
+ region_end.x,
+ region_start.y,
+ bin_size.y,
+ roi_bin_grid.y,
+ region_end.y, pz);
+ }
+}
+#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl
new file mode 100644
index 0000000000..e75dee06f6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+// This specifies the value to shift the result of roi_dims / pooled_dims before ceiling.
+// It is close to machine epsilon (for a floating-point system, x and x + EPS are the same number).
+#define EPS_GRID 0.00001f
+
+#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) && defined(OFFSET_IN) && defined(OFFSET_OUT) && defined(SCALE_IN) && defined(SCALE_OUT) && defined(OFFSET_ROIS) && defined(SCALE_ROIS) // Check for compile time constants
+
+/** Performs a roi align on a single output pixel.
+ *
+ * @param[in] input Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] bin_size_x Size of the pooling bin along x.
+ * @param[in] grid_size_x Number of sampling points along x inside the bin.
+ * @param[in] region_end_x End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] bin_size_y Size of the pooling bin along y.
+ * @param[in] grid_size_y Number of sampling points along y inside the bin.
+ * @param[in] region_end_y End y index projected onto the input tensor.
+ * @param[in] pz z index of the input tensor.
+ *
+ * @return An average pooled value from the region specified in the input tensor.
+ */
+inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
+ float bin_size_x,
+ float grid_size_x,
+ float region_end_x,
+ float region_start_y,
+ float bin_size_y,
+ float grid_size_y,
+ float region_end_y,
+ int pz)
+{
+ // Iterate through the pooling region
+ float sum = 0;
+ for(int iy = 0; iy < grid_size_y; ++iy)
+ {
+ for(int ix = 0; ix < grid_size_x; ++ix)
+ {
+ // Align the window in the middle of every bin
+ const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
+ const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
+
+ // Interpolation in the unit square
+ const int y_low = (int)y;
+ const int x_low = (int)x;
+ const int y_high = y_low + 1;
+ const int x_high = x_low + 1;
+
+ const float ly = y - y_low;
+ const float lx = x - x_low;
+ const float hy = 1.f - ly;
+ const float hx = 1.f - lx;
+
+ const float w1 = hy * hx;
+ const float w2 = hy * lx;
+ const float w3 = ly * hx;
+ const float w4 = ly * lx;
+#if defined(NHWC)
+ const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
+ const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
+ const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
+ const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
+#else // !defined(NHWC)
+ const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
+ const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
+ const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
+ const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+#endif // defined(NHWC)
+
+ const float data1_f32 = DEQUANTIZE(data1, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+ const float data2_f32 = DEQUANTIZE(data2, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+ const float data3_f32 = DEQUANTIZE(data3, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+ const float data4_f32 = DEQUANTIZE(data4, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+ sum += w1 * data1_f32 + w2 * data2_f32 + w3 * data3_f32 + w4 * data4_f32;
+ }
+ }
+
+ const float res_f32 = sum / (grid_size_x * grid_size_y);
+ return QUANTIZE(res_f32, OFFSET_OUT, SCALE_OUT, DATA_TYPE, 1);
+}
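+
+// Numeric sketch (illustrative, assuming the usual affine mapping
+// real = (q - OFFSET) * SCALE from helpers_asymm.h): with OFFSET_IN = 128 and
+// SCALE_IN = 0.1f, a sample q = 130 dequantizes to 0.2f; after averaging, a result
+// of 0.3f with OFFSET_OUT = 128 and SCALE_OUT = 0.1f requantizes to
+// round(0.3f / 0.1f) + 128 = 131 (saturated to the DATA_TYPE range).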
+
+/** Performs a roi align function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=uchar
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi
+ * will have a default sampling ratio of roi_dims/pooling_dims
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the pooled region of the source tensor as specified by ROI
+ * @param[in] rois_ptr Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }.
+ * Supported data types: QASYMM16 with 0.125f scale and 0 offset
+ * @param[in] rois_stride_x Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in] rois_step_x Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void roi_align_layer_quantized(
+ TENSOR3D_DECLARATION(input),
+ IMAGE_DECLARATION(rois),
+ TENSOR3D_DECLARATION(output),
+ unsigned int input_stride_w, unsigned int output_stride_w)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ Image rois = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+#if defined(NHWC)
+ const int px = get_global_id(1);
+ const int py = get_global_id(2);
+ const int pw = get_global_id(0);
+#else // !defined(NHWC)
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pw = get_global_id(2);
+#endif // defined(NHWC)
+
+ // Load roi parameters
+ // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+ const ushort roi_batch = *((__global ushort *)offset(&rois, 0, pw));
+ float4 roi = DEQUANTIZE(vload4(0, (__global ushort *)offset(&rois, 1, pw)), OFFSET_ROIS, SCALE_ROIS, ushort, 4);
+ float2 roi_anchor = roi.s01 * convert_float(SPATIAL_SCALE);
+ float2 roi_dims = fmax((roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
+
+ // Calculate pooled region start and end
+ float2 spatial_indx = (float2)(px, py);
+ float2 pooled_dims = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+ float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
+
+ float2 bin_size = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
+ float2 region_start = spatial_indx * bin_size + roi_anchor;
+ float2 region_end = (spatial_indx + 1) * bin_size + roi_anchor;
+
+ region_start = clamp(region_start, 0, max_spatial_dims);
+ region_end = clamp(region_end, 0, max_spatial_dims);
+
+#if defined(SAMPLING_RATIO)
+ float2 roi_bin_grid = SAMPLING_RATIO;
+#else // !defined(SAMPLING_RATIO)
+ // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
+ float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
+#endif // defined(SAMPLING_RATIO)
+
+ // Move input and output pointer across the fourth dimension
+ input.ptr += roi_batch * input_stride_w;
+ output.ptr += pw * output_stride_w;
+ for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+ {
+#if defined(NHWC)
+ __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
+#else // !defined(NHWC)
+ __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
+#endif // defined(NHWC)
+ *_output_ptr = roi_align_1x1(&input,
+ region_start.x,
+ bin_size.x,
+ roi_bin_grid.x,
+ region_end.x,
+ region_start.y,
+ bin_size.y,
+ roi_bin_grid.y,
+ region_end.y, pz);
+ }
+}
+#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl
index 6899b952e0..6899b952e0 100644
--- a/src/core/CL/cl_kernels/roi_pooling_layer.cl
+++ b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl
diff --git a/src/core/CL/cl_kernels/common/scatter.cl b/src/core/CL/cl_kernels/common/scatter.cl
new file mode 100644
index 0000000000..e3ec9cc98e
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/scatter.cl
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+// The macros below define the reduce operations used by the scatter kernels,
+// where a is the existing value and b is the new value.
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+
+#ifdef IS_FLOAT
+#define MAX_OP(a, b) fmax(a, b)
+#define MIN_OP(a, b) fmin(a, b)
+#else // ifdef IS_FLOAT
+#define MAX_OP(a, b) max(a, b)
+#define MIN_OP(a, b) min(a, b)
+#endif // ifdef IS_FLOAT
+
+#define UPDATE_OP(a, b) (b)
+
+#ifdef SCATTER_MP1D_2D_MPND
+
+/** This kernel performs scatter operation
+ *
+ * @note Datatype should be given as a compile-time argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Number of indices should be given as a compile-time argument using -DNUM_INDICES, e.g. -DNUM_INDICES=3
+ * @note Index length should be given as a compile-time argument using -DINDEX_LENGTH, e.g. -DINDEX_LENGTH=2
+ * @note Outermost output shapes should be given as a compile-time argument using -DOUT_SHAPE_N_MINUS_X, where
+ * X must be 1,2,3,4,5, e.g. -DOUT_SHAPE_N_MINUS_1=3, ...
+ * @note Number of elements to copy in a row should be given as a compile-time argument using -DN0, e.g. -DN0=4
+ * @note Number of partial elements at the edge to copy in a row should be given as a compile-time argument using
+ * -DPARTIAL_N0, e.g. -DPARTIAL_N0=2
+ * @note Scatter function should be given as a compile-time argument using -DSCATTER_FUNCTION, e.g. -DSCATTER_FUNCTION=ADD
+ * @note If the kernel should skip reading the output tensor, -DSKIP_OUTPUT_READ option should be provided.
+ * @note Kernel name in uppercase letters should be provided as a compile-time argument, e.g. -DSCATTER_MP1D_2D_MPND
+ *
+ * @param[in] updates_ptr Pointer to the updates tensor. Data Types: F32
+ * @param[in] updates_stride_x Stride of the updates tensor in X dimension (in bytes)
+ * @param[in] updates_step_x updates_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] updates_stride_y Stride of the updates tensor in Y dimension (in bytes)
+ * @param[in] updates_step_y updates_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] updates_offset_first_element_in_bytes The offset of the first element in the updates tensor
+ * @param[in] indices_ptr Pointer to the indices tensor. Data Types: S32
+ * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Same as @p updates_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] upt_block_stride Update tensor data block stride in bytes
+ * @param[in] out_block_stride Output tensor data block stride in bytes
+ */
+__kernel void scatter_mp1d_2d_mpnd(
+ IMAGE_DECLARATION(updates),
+ IMAGE_DECLARATION(indices),
+ IMAGE_DECLARATION(output),
+ int upt_block_stride,
+ int out_block_stride
+ )
+{
+ const int out_shape[5] = {OUT_SHAPE_N_MINUS_1, OUT_SHAPE_N_MINUS_2, OUT_SHAPE_N_MINUS_3,
+ OUT_SHAPE_N_MINUS_4, OUT_SHAPE_N_MINUS_5};
+
+ const int x = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // x-coordinate in the tensor
+ const int y = get_global_id(1); // collapsed y-coordinate (ignoring the outermost dimensions)
+
+ const bool x_cond = (PARTIAL_N0 != 0 && get_global_id(0) == 0);
+
+ __global uchar *ind_ptr_raw = indices_ptr + indices_offset_first_element_in_bytes;
+ __global uchar *out_ptr_raw = output_ptr + output_offset_first_element_in_bytes
+ + x * sizeof(DATA_TYPE) + y * output_stride_y;
+
+ const __global uchar *upt_ptr_raw = updates_ptr + updates_offset_first_element_in_bytes
+ + x * sizeof(DATA_TYPE) + y * updates_stride_y;
+
+ for(int index_element = 0; index_element < NUM_INDICES; ++index_element)
+ {
+ const __global int *ind_ptr = (const __global int *)(ind_ptr_raw);
+
+ // Out of bounds check
+ bool out_of_bounds = false;
+ LOOP_UNROLLING(int, i, 0, 1, INDEX_LENGTH,
+ {
+ if(ind_ptr[i] >= out_shape[i] || ind_ptr[i] < 0)
+ {
+ out_of_bounds = true;
+ }
+ });
+
+ ind_ptr_raw += indices_stride_y;
+
+ if(out_of_bounds)
+ {
+ continue;
+ }
+
+ // Index calculation
+ int index = 0;
+ LOOP_UNROLLING(int, i, 0, 1, INDEX_LENGTH,
+ {
+ index = index * out_shape[i] + ind_ptr[i];
+ });
+
+ __global DATA_TYPE *out_ptr = (__global DATA_TYPE *)(out_ptr_raw + index * out_block_stride);
+
+ const __global DATA_TYPE *upt_ptr = (const __global DATA_TYPE *)(upt_ptr_raw + index_element * upt_block_stride);
+
+ VEC_DATA_TYPE(DATA_TYPE, N0) data_in0 = VLOAD(N0)(0, upt_ptr);
+
+#ifdef SKIP_OUTPUT_READ
+ STORE_VECTOR_SELECT(data_in, DATA_TYPE, (__global DATA_TYPE *) out_ptr, N0, PARTIAL_N0, x_cond);
+#else // ifdef SKIP_OUTPUT_READ
+ VEC_DATA_TYPE(DATA_TYPE, N0) data_out0 = VLOAD(N0)(0, (__global DATA_TYPE *) out_ptr);
+ data_out0 = SCATTER_FUNCTION(data_out0, data_in0);
+
+ STORE_VECTOR_SELECT(data_out, DATA_TYPE, (__global DATA_TYPE *) out_ptr, N0, PARTIAL_N0, x_cond);
+#endif // ifdef SKIP_OUTPUT_READ
+ }
+}
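+
+// Worked example (illustrative): with -DINDEX_LENGTH=2 and the two outermost
+// output dimensions being 5 and 4 (out_shape[0] = 5, out_shape[1] = 4), an index
+// tuple {2, 3} linearizes to (0 * 5 + 2) * 4 + 3 = 11, so the update row is
+// combined into the output block at 11 * out_block_stride bytes.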
+
+#endif // SCATTER_MP1D_2D_MPND
+
+#ifdef SCATTER1D_PARALLEL
+
+// NOTE : This code is non-deterministic and can only be executed with the "update" ScatterFunction
+// This code is currently unused as it requires changes to the existing test suite.
+/** Performs the Scatter1D operation with multiple threads.
+ * Similar to @ref scatter1D()
+ */
+__kernel void scatter1D_parallel(
+ TENSOR4D_DECLARATION(updates),
+ TENSOR4D_DECLARATION(indices),
+ TENSOR4D_DECLARATION(output))
+{
+ // Currently 1D - only iterate through x dimension of indices.
+ const int px = get_global_id(0);
+ const int index_value = *(__global int *)(indices_ptr + indices_offset_first_element_in_bytes + (sizeof(int) * px));
+
+ if(index_value < OUT_SHAPE_X)
+ {
+ const DATA_TYPE update = *(__global DATA_TYPE *)(updates_ptr + updates_offset_first_element_in_bytes + (sizeof(DATA_TYPE) * px));
+ __global uchar *out_addr = output_ptr + output_offset_first_element_in_bytes + (sizeof(DATA_TYPE) * index_value);
+ *(__global DATA_TYPE *)(out_addr) = update;
+ }
+}
+
+#endif // SCATTER1D_PARALLEL
diff --git a/src/core/CL/cl_kernels/select.cl b/src/core/CL/cl_kernels/common/select.cl
index 6fd4bd4ce3..6fd4bd4ce3 100644
--- a/src/core/CL/cl_kernels/select.cl
+++ b/src/core/CL/cl_kernels/common/select.cl
diff --git a/src/core/CL/cl_kernels/common/slice_ops.cl b/src/core/CL/cl_kernels/common/slice_ops.cl
new file mode 100644
index 0000000000..189d414aba
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/slice_ops.cl
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform a strided slice operation on a given input.
+ *
+ * @attention Supported tensor rank: up to 4
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDST_DEPTH=size
+ * @attention Absolute start coordinates for each dimension should be given as preprocessor arguments using -DSTART_index=value e.g. -DSTART_0=2
+ * @attention Strides for each dimension should be given as preprocessor arguments using -DSTRIDE_index=value e.g. -DSTRIDE_1=1
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void strided_slice(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+
+ int offset = 0;
+
+ // Offset X
+#if defined(SHRINK_0)
+ input.ptr += (int)START_0 * input_stride_x;
+#elif defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ offset = (int)START_0 + min(xi, (int)LAST_ACCESSED_X);
+ input.ptr += offset * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+#elif defined(START_0) && defined(STRIDE_0)
+ offset = (int)START_0 + (int)get_global_id(0) * (int)STRIDE_0;
+ input.ptr += offset * input_stride_x;
+#endif // defined(START_0) && defined(STRIDE_0)
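+
+ // Worked example of the bounds shift above (illustrative, assuming the host sets
+ // LAST_ACCESSED_X = width - VEC_SIZE): with VEC_SIZE = 4 and width = 10,
+ // LAST_ACCESSED_X = 6; the work-item with xi = 8 would read past the edge, so it
+ // is shifted back to offset START_0 + 6 and the output pointer is rewound by the
+ // same 2 elements, keeping the vector store aligned with the vector load.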
+
+ // Offset Y
+#if defined(SHRINK_1)
+ input.ptr += (int)START_1 * input_stride_y;
+#elif defined(START_1) && defined(STRIDE_1)
+#if defined(SHRINK_0)
+ offset = (int)START_1 + (int)get_global_id(0) * (int)STRIDE_1;
+#else // defined(SHRINK_0)
+ offset = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1;
+#endif // defined(SHRINK_0)
+ input.ptr += offset * input_stride_y;
+#endif // defined(START_1) && defined(STRIDE_1)
+
+ // Offset Z
+#if defined(SHRINK_2)
+ input.ptr += (int)START_2 * input_stride_z;
+#elif defined(START_2) && defined(STRIDE_2)
+
+#if defined(SHRINK_1) && defined(SHRINK_0)
+ offset = (int)START_2 + (int)get_global_id(0) * (int)STRIDE_2;
+#elif defined(SHRINK_1) || defined(SHRINK_0)
+ offset = (int)START_2 + (int)get_global_id(1) * (int)STRIDE_2;
+#else // defined(SHRINK_1) && defined(SHRINK_0)
+ offset = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2;
+#endif // defined(SHRINK_1) && defined(SHRINK_0)
+
+ input.ptr += offset * input_stride_z;
+#endif // defined(START_2) && defined(STRIDE_2)
+
+ // Offset depth
+#if defined(SHRINK_3)
+ input.ptr += (int)START_3 * input_stride_w;
+#elif defined(START_3) && defined(STRIDE_3)
+#if defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
+ offset = (int)START_3 + (int)get_global_id(0) * (int)STRIDE_3;
+#elif !defined(SHRINK_2) && !defined(SHRINK_1) && !defined(SHRINK_0)
+ offset = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3;
+#elif(defined(SHRINK_0) && defined(SHRINK_1)) || (defined(SHRINK_1) && defined(SHRINK_2)) || (defined(SHRINK_0) && defined(SHRINK_2))
+ offset = (int)START_3 + (int)get_global_id(1) * (int)STRIDE_3;
+#else // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
+ offset = (int)START_3 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_3;
+#endif // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
+ input.ptr += offset * input_stride_w;
+#endif // defined(START_3) && defined(STRIDE_3)
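+
+ // Worked example of the SHRINK remapping (illustrative): with only -DSHRINK_1
+ // defined, the Y dimension collapses to the single row START_1, so global ID 1
+ // is re-purposed to drive the Z offset ((int)START_2 + (int)get_global_id(1) * (int)STRIDE_2).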
+
+ // Store result
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input.ptr));
+
+ VSTORE(VEC_SIZE)
+ (val, 0, (__global DATA_TYPE *)(output.ptr));
+#else // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
diff --git a/src/core/CL/cl_kernels/common/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl
new file mode 100644
index 0000000000..bfc0995bb8
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/softmax_layer.cl
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#define MIN_VALUE_float -FLT_MAX
+#define MIN_VALUE_half -HALF_MAX
+#define MIN_VALUE_char CHAR_MIN
+#define MIN_VALUE_uchar 0
+
+#define MIN_VALUE_TYPE_STR(data_type) MIN_VALUE_##data_type
+#define MIN_VALUE_TYPE(data_type) MIN_VALUE_TYPE_STR(data_type)
+#define MIN_VALUE MIN_VALUE_TYPE(DATA_TYPE)
+
+#ifdef SOFTMAX_X
+
+/** 3-pass softmax in the x dimension.
+ *
+ * List of preprocessors:
+ * - DATA_TYPE: the input/output data type.
+ * - TMP_DATA_TYPE: the data type used for computing and temporary tensor storage.
+ * If DATA_TYPE is quantized, TMP_DATA_TYPE is floating-point, otherwise TMP_DATA_TYPE is the same as DATA_TYPE.
+ * - IS_LOG (optional): indicating whether this is log softmax.
+ * - LENGTH: the number of elements in softmax axis in the input/output tensors.
+ * - BETA: the beta coefficient.
+ * - IS_QUANTIZED (optional): indicating whether the input/output data type is quantized data.
+ * - VEC_SIZE: the size of the vector.
+ *
+ * Additional preprocessors in case IS_QUANTIZED is present:
+ * - SRC_SCALE and SRC_OFFSET: the quantization information of the source tensor.
+ * - DST_SCALE and DST_OFFSET: the quantization information of the destination tensor.
+ *
+ * @param[in] src_ptr Pointer to the source tensor.
+ * @param[in] src_stride_0 Stride in bytes of the source tensor in the dimension corresponding to global ID 0.
+ * @param[in] src_stride_1 Stride in bytes of the source tensor in the dimension corresponding to global ID 1.
+ * @param[in] src_stride_2 Stride in bytes of the source tensor in the dimension corresponding to global ID 2.
+ * @param[in] src_offset_first_element Offset of the first element in the source tensor.
+ * @param[in] dst_ptr Pointer to the destination tensor.
+ * @param[in] dst_stride_0 Stride in bytes of the destination tensor in the dimension corresponding to global ID 0.
+ * @param[in] dst_stride_1 Stride in bytes of the destination tensor in the dimension corresponding to global ID 1.
+ * @param[in] dst_stride_2 Stride in bytes of the destination tensor in the dimension corresponding to global ID 2.
+ * @param[in] dst_offset_first_element Offset of the first element in the destination tensor.
+ * @param[in] tmp_ptr Pointer to the temporary tensor.
+ * @param[in] tmp_stride_0 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 0.
+ * @param[in] tmp_stride_1 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 1.
+ * @param[in] tmp_stride_2 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 2.
+ * @param[in] tmp_offset_first_element Offset of the first element in the temporary tensor.
+ */
+__kernel void softmax_x(
+ __global uchar *src_ptr,
+ uint src_stride_0,
+ uint src_stride_1,
+ uint src_stride_2,
+ uint src_offset_first_element,
+
+ __global uchar *dst_ptr,
+ uint dst_stride_0,
+ uint dst_stride_1,
+ uint dst_stride_2,
+ uint dst_offset_first_element
+
+#ifdef IS_QUANTIZED
+ ,
+ __global uchar *tmp_ptr,
+ uint tmp_stride_0,
+ uint tmp_stride_1,
+ uint tmp_stride_2,
+ uint tmp_offset_first_element
+#endif // IS_QUANTIZED
+)
+{
+ const int dim_0 = get_global_id(0);
+ const int dim_1 = get_global_id(1);
+ const int dim_2 = get_global_id(2);
+
+ src_ptr += src_offset_first_element + dim_2 * src_stride_2 + dim_1 * src_stride_1 + dim_0 * src_stride_0;
+ dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0;
+
+#ifdef IS_QUANTIZED
+ tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0;
+#else // IS_QUANTIZED
+ __global uchar *tmp_ptr = dst_ptr;
+#endif // IS_QUANTIZED
+
+ // Calculate max value.
+ DATA_TYPE max_value = MIN_VALUE;
+ int i = 0;
+
+ for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE)));
+
+ max_value = max(max_value, MAX_REDUCE(data, VEC_SIZE));
+ }
+
+ for (; i < LENGTH; ++i)
+ {
+ DATA_TYPE data = *(__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE));
+
+ max_value = max(max_value, data);
+ }
+
+ // Regularize the data.
+ TMP_DATA_TYPE sum_value = 0;
+
+#ifdef IS_QUANTIZED
+ TMP_DATA_TYPE max_value_f = (CONVERT(max_value, TMP_DATA_TYPE) - SRC_OFFSET) * SRC_SCALE;
+ TMP_DATA_TYPE regularize_offset = -SRC_OFFSET * SRC_SCALE * (TMP_DATA_TYPE)BETA - max_value_f * (TMP_DATA_TYPE)BETA;
+# define REGULARIZE(x) ((x) * SRC_SCALE * (TMP_DATA_TYPE)BETA + regularize_offset)
+#else // IS_QUANTIZED
+# define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA)
+#endif // IS_QUANTIZED
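+
+ // Derivation of regularize_offset (quantized path): dequantizing gives
+ // (x - SRC_OFFSET) * SRC_SCALE, so the regularized value
+ // ((x - SRC_OFFSET) * SRC_SCALE - max_value_f) * BETA expands to
+ // x * SRC_SCALE * BETA - SRC_OFFSET * SRC_SCALE * BETA - max_value_f * BETA,
+ // i.e. exactly REGULARIZE(x) with the two constant terms folded into regularize_offset.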
+
+ for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
+
+ data = REGULARIZE(data);
+
+#ifdef IS_LOG
+ sum_value += SUM_REDUCE(exp(data), VEC_SIZE);
+#else // IS_LOG
+ data = exp(data);
+ sum_value += SUM_REDUCE(data, VEC_SIZE);
+#endif // IS_LOG
+
+ VSTORE(VEC_SIZE)(data, 0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)));
+ }
+
+ for (; i < LENGTH; ++i)
+ {
+ TMP_DATA_TYPE data = CONVERT(*(__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE)), TMP_DATA_TYPE);
+
+ data = REGULARIZE(data);
+
+#ifdef IS_LOG
+ sum_value += exp(data);
+#else // IS_LOG
+ data = exp(data);
+ sum_value += data;
+#endif // IS_LOG
+
+ *(__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)) = data;
+ }
+
+#undef REGULARIZE
+
+ // Normalize the data.
+#ifdef IS_QUANTIZED
+# ifdef IS_LOG
+ TMP_DATA_TYPE norm_offset = -log(sum_value) + DST_OFFSET;
+# define NORMALIZE(SIZE, x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, SIZE), rte)
+# else // IS_LOG
+ TMP_DATA_TYPE norm_div = sum_value * DST_SCALE;
+# define NORMALIZE(SIZE, x) CONVERT_SAT(add_sat(CONVERT_SAT_ROUND((x) / norm_div, VEC_DATA_TYPE(int, SIZE), rte), DST_OFFSET), VEC_DATA_TYPE(DATA_TYPE, SIZE))
+# endif // IS_LOG
+#else // IS_QUANTIZED
+# ifdef IS_LOG
+# define NORMALIZE(SIZE, x) ((x) - log(sum_value))
+# else // IS_LOG
+# define NORMALIZE(SIZE, x) ((x) / sum_value)
+# endif // IS_LOG
+#endif // IS_QUANTIZED
+
+ for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)));
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) result = NORMALIZE(VEC_SIZE, data);
+
+ VSTORE(VEC_SIZE)(result, 0, (__global DATA_TYPE *)(dst_ptr + i * sizeof(DATA_TYPE)));
+ }
+
+ for (; i < LENGTH; ++i)
+ {
+ TMP_DATA_TYPE data = *(__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE));
+
+ DATA_TYPE result = NORMALIZE(1, data);
+
+ *(__global DATA_TYPE *)(dst_ptr + i * sizeof(DATA_TYPE)) = result;
+ }
+
+#undef NORMALIZE
+}
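+
+// Numeric sketch of the three passes (illustrative, non-quantized, BETA = 1):
+// for a row {1, 2, 3}: pass 1 finds max = 3; pass 2 stores exp({-2, -1, 0}) =
+// {0.135, 0.368, 1.0} and accumulates sum = 1.503; pass 3 divides through,
+// yielding {0.090, 0.245, 0.665}. With IS_LOG the last pass subtracts
+// log(1.503) = 0.408 instead, giving {-2.408, -1.408, -0.408}.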
+
+#endif // SOFTMAX_X
+
+#ifdef SOFTMAX_NON_X
+
+/** 3-pass softmax in any dimension higher than the x dimension.
+ *
+ * List of preprocessors:
+ * - DATA_TYPE: the input/output data type.
+ * - TMP_DATA_TYPE: the data type used for computing and temporary tensor storage.
+ * If DATA_TYPE is quantized, TMP_DATA_TYPE is floating-point, otherwise TMP_DATA_TYPE is the same as DATA_TYPE.
+ * - IS_LOG (optional): indicating whether this is log softmax.
+ * - LENGTH: the number of elements in softmax axis in the input/output tensors.
+ * - BETA: the beta coefficient.
+ * - IS_QUANTIZED (optional): indicating whether the input/output data type is quantized data.
+ * - VEC_SIZE: the size of the vector.
+ * - VEC_SIZE_LEFTOVER: the size of the leftover part.
+ *
+ * Additional preprocessors in case IS_QUANTIZED is present:
+ * - SRC_SCALE and SRC_OFFSET: the quantization information of the source tensor.
+ * - DST_SCALE and DST_OFFSET: the quantization information of the destination tensor.
+ *
+ * @param[in] src_ptr Pointer to the source tensor.
+ * @param[in] src_stride_0 Stride in bytes of the source tensor in the dimension corresponding to global ID 0.
+ * @param[in] src_stride_1 Stride in bytes of the source tensor in the dimension corresponding to global ID 1.
+ * @param[in] src_stride_2 Stride in bytes of the source tensor in the dimension corresponding to global ID 2.
+ * @param[in] src_offset_first_element Offset of the first element in the source tensor.
+ * @param[in] dst_ptr Pointer to the destination tensor.
+ * @param[in] dst_stride_0 Stride in bytes of the destination tensor in the dimension corresponding to global ID 0.
+ * @param[in] dst_stride_1 Stride in bytes of the destination tensor in the dimension corresponding to global ID 1.
+ * @param[in] dst_stride_2 Stride in bytes of the destination tensor in the dimension corresponding to global ID 2.
+ * @param[in] dst_offset_first_element Offset of the first element in the destination tensor.
+ * @param[in] tmp_ptr Pointer to the temporary tensor.
+ * @param[in] tmp_stride_0 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 0.
+ * @param[in] tmp_stride_1 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 1.
+ * @param[in] tmp_stride_2 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 2.
+ * @param[in] tmp_offset_first_element Offset of the first element in the temporary tensor.
+ * @param[in] src_stride_axis Stride in bytes of the source tensor along the softmax axis.
+ * @param[in] dst_stride_axis Stride in bytes of the destination tensor along the softmax axis.
+ */
+__kernel void softmax_non_x(
+ __global uchar *src_ptr,
+ uint src_stride_0,
+ uint src_stride_1,
+ uint src_stride_2,
+ uint src_offset_first_element,
+
+ __global uchar *dst_ptr,
+ uint dst_stride_0,
+ uint dst_stride_1,
+ uint dst_stride_2,
+ uint dst_offset_first_element,
+
+ __global uchar *tmp_ptr,
+ uint tmp_stride_0,
+ uint tmp_stride_1,
+ uint tmp_stride_2,
+ uint tmp_offset_first_element,
+
+ uint src_stride_axis,
+ uint dst_stride_axis
+)
+{
+ const int dim_0 = max((int)get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0);
+ const int dim_1 = get_global_id(1);
+ const int dim_2 = get_global_id(2);
+
+ src_ptr += src_offset_first_element + dim_2 * src_stride_2 + dim_1 * src_stride_1 + dim_0 * src_stride_0;
+ dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0;
+ tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0;
+
+ // In case of processing quantized data, i.e. DATA_TYPE is smaller than TMP_DATA_TYPE:
+ //
+    // In the first pass (finding max), the quantized data is copied from the input tensor to the temporary tensor.
+    // Dequantization is not needed to find the max value, and since dequantization widens the data, we defer it
+    // to the second pass to reduce the memory bandwidth of the first pass.
+    //
+    // The second pass then reads the quantized data from the temporary tensor and writes the dequantized data
+    // back to the temporary tensor.
+    //
+    // To avoid the dequantized data overwriting the unprocessed quantized data in the temporary tensor,
+    // this extra offset is introduced to store the quantized data at the end of the temporary tensor.
+    //
+    // Note: Another approach is to perform the second pass in reverse order, but for an unexplained reason
+    // it doesn't work on some devices.
+ uint tmp_extra_offset = LENGTH * VEC_SIZE * (sizeof(TMP_DATA_TYPE) - sizeof(DATA_TYPE));
+
+ // Calculate max value and store the input data to the temporary tensor in suitable format.
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) max_value = MIN_VALUE;
+ int i = 0;
+
+ for (i = 0; i < LENGTH; ++i)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * src_stride_axis));
+
+ max_value = max(max_value, data);
+
+ VSTORE(VEC_SIZE)(data, 0, (__global DATA_TYPE *)(tmp_ptr + tmp_extra_offset + i * VEC_SIZE * sizeof(DATA_TYPE)));
+ }
+
+ // Regularize the data.
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) sum_value = 0;
+
+#ifdef IS_QUANTIZED
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) max_value_f = (CONVERT(max_value, VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE)) - SRC_OFFSET) * SRC_SCALE;
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) regularize_offset = -SRC_OFFSET * SRC_SCALE * (TMP_DATA_TYPE)BETA - max_value_f * (TMP_DATA_TYPE)BETA;
+# define REGULARIZE(x) ((x) * SRC_SCALE * (TMP_DATA_TYPE)BETA + regularize_offset)
+#else // IS_QUANTIZED
+# define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA)
+#endif // IS_QUANTIZED
+
+ for (i = 0; i < LENGTH; ++i)
+ {
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(tmp_ptr + tmp_extra_offset + i * VEC_SIZE * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
+
+ data = REGULARIZE(data);
+
+#ifdef IS_LOG
+ sum_value += exp(data);
+#else // IS_LOG
+ data = exp(data);
+ sum_value += data;
+#endif // IS_LOG
+
+ VSTORE(VEC_SIZE)(data, 0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(TMP_DATA_TYPE)));
+ }
+
+#undef REGULARIZE
+
+ // Normalize the data.
+#ifdef IS_QUANTIZED
+# if IS_LOG
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_offset = -log(sum_value) + DST_OFFSET;
+# define NORMALIZE(x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE), rte)
+# else // IS_LOG
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_div = sum_value * DST_SCALE;
+# define NORMALIZE(x) CONVERT_SAT(add_sat(CONVERT_SAT_ROUND((x) / norm_div, VEC_DATA_TYPE(int, VEC_SIZE), rte), DST_OFFSET), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))
+# endif // IS_LOG
+#else // IS_QUANTIZED
+# if IS_LOG
+# define NORMALIZE(x) ((x) - log(sum_value))
+# else // IS_LOG
+# define NORMALIZE(x) ((x) / sum_value)
+# endif // IS_LOG
+#endif // IS_QUANTIZED
+
+ for (i = 0; i < LENGTH; ++i)
+ {
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(TMP_DATA_TYPE)));
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) result0 = NORMALIZE(data);
+
+ STORE_VECTOR_SELECT(result, DATA_TYPE, dst_ptr + i * dst_stride_axis, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+ }
+
+#undef NORMALIZE
+}
+
+#endif // SOFTMAX_NON_X
+
+#undef MIN_VALUE
+#undef MIN_VALUE_TYPE
+#undef MIN_VALUE_TYPE_STR
+
+#undef MIN_VALUE_float
+#undef MIN_VALUE_half
+#undef MIN_VALUE_char
+#undef MIN_VALUE_uchar
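For reference, the quantized non-log path of the softmax kernels above reduces to three scalar passes. The following minimal C sketch is a hypothetical host-side helper (not part of this patch), assuming a QASYMM8-style 8-bit layout; parameter names mirror the kernel's -D defines:

    #include <math.h>
    #include <stdint.h>

    /* Hypothetical scalar reference for the quantized, non-log softmax path;
     * tmp stands in for the kernel's temporary tensor. */
    static void softmax_q8_ref(const uint8_t *src, uint8_t *dst, float *tmp, int length,
                               float src_scale, int src_offset,
                               float dst_scale, int dst_offset, float beta)
    {
        /* Pass 1: find the quantized maximum (no dequantization needed). */
        uint8_t max_q = src[0];
        for (int i = 1; i < length; ++i)
            if (src[i] > max_q)
                max_q = src[i];

        /* Pass 2: regularize (dequantize, subtract the max, scale by beta),
         * exponentiate into the temporary buffer and accumulate the sum. */
        const float max_f = ((float)max_q - src_offset) * src_scale;
        float sum = 0.0f;
        for (int i = 0; i < length; ++i)
        {
            const float x = (((float)src[i] - src_offset) * src_scale - max_f) * beta;
            tmp[i] = expf(x);
            sum += tmp[i];
        }

        /* Pass 3: normalize, requantize with round-to-nearest-even, saturate. */
        for (int i = 0; i < length; ++i)
        {
            const int q = (int)rintf(tmp[i] / (sum * dst_scale)) + dst_offset;
            dst[i] = (uint8_t)(q < 0 ? 0 : (q > 255 ? 255 : q));
        }
    }

The kernel folds the two constant terms into regularize_offset so the regularization in its inner loop is a single multiply-add.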
diff --git a/src/core/CL/cl_kernels/common/stack_layer.cl b/src/core/CL/cl_kernels/common/stack_layer.cl
new file mode 100644
index 0000000000..2468bf750d
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/stack_layer.cl
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3)
+
+#if AXIS == 0
+#define X_DST (idx_input)
+#define Y_DST (x_src)
+#define Z_DST (y_src)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 1 // AXIS == 1
+#define X_DST (x_src)
+#define Y_DST (idx_input)
+#define Z_DST (y_src)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 2 // AXIS == 2
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (idx_input)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 3 // AXIS == 3
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (z_src)
+#define W_DST (idx_input)
+#define K_DST (w_src)
+#elif AXIS == 4 // AXIS == 4
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (z_src)
+#define W_DST (w_src)
+#define K_DST (idx_input)
+#else // AXIS not supported
+#error "Not supported axis"
+#endif // AXIS == 0
+
+/** OpenCL kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note The dimension to stack the tensors along has to be passed at compile time using -DAXIS. i.e. -DAXIS=1
+ * @note Dimension 2 of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=112)
+ * @note Dimension 3 of the output tensor must be passed at compile time using -DDST_DIM3 (e.g. -DDST_DIM3=112)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                           Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                           Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] idx_input Index of the input tensor in the list of tensors to stack
+ */
+__kernel void stack_layer(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+ unsigned int idx_input)
+{
+ uint x_src = get_global_id(0);
+ uint y_src = get_global_id(1);
+ uint z_src = (get_global_id(2) % SRC_DIM2);
+ uint w_src = (get_global_id(2) / SRC_DIM2);
+
+ __global DATA_TYPE *src = (__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_src * sizeof(DATA_TYPE) + y_src * src_stride_y + z_src * src_stride_z + w_src * src_stride_w);
+
+ __global DATA_TYPE *dst = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + X_DST * sizeof(DATA_TYPE) + Y_DST * dst_stride_y + Z_DST * dst_stride_z + W_DST * dst_stride_w + K_DST *
+ dst_stride_w * (uint)DST_DIM3);
+
+ *dst = *src;
+}
+
+#undef X_DST
+#undef Y_DST
+#undef Z_DST
+#undef W_DST
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3)
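The AXIS macro table above is an index splice: the input's position in the stack list is inserted into the source coordinates at position AXIS. A compact C sketch of that mapping (a hypothetical helper, not in this patch):

    /* Sketch of the X_DST..K_DST macro table: splice the input's stack
     * index into the source coordinates (x, y, z, w) at position `axis`. */
    static void stack_dst_coords(int axis, const int src[4], int idx_input, int dst[5])
    {
        for (int d = 0, s = 0; d < 5; ++d)
            dst[d] = (d == axis) ? idx_input : src[s++];
    }

With axis = 2, for example, coordinates (x, y, z, w) map to (x, y, idx_input, z, w), matching the macro table.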
diff --git a/src/core/CL/cl_kernels/common/tile.cl b/src/core/CL/cl_kernels/common/tile.cl
new file mode 100644
index 0000000000..4d8f802ea1
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/tile.cl
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
+/** Performs a tile operation on the input tensor, repeating it along each dimension.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note The source tensor dimensions and the destination depth must be passed using -DSRC_WIDTH, -DSRC_HEIGHT, -DSRC_DEPTH and -DDST_DEPTH.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: All
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                        Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void tile(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
+ Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+
+ // For all coordinates but x, each tile copies from the input
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % DST_DEPTH;
+ const int batch = get_global_id(2) / DST_DEPTH;
+
+#if defined(VEC_SIZE) && defined(OFFSET)
+    // If we are loading/storing multiple elements at a time, we must not
+    // exceed the input boundaries. The last threads need to backtrack
+    // by OFFSET elements; those backtracked elements accumulate over the previous tiles.
+
+ const int id = (int)(get_global_id(0));
+ const int multiple_no = id / SRC_WIDTH_TILES;
+ const int tile_no = id % SRC_WIDTH_TILES;
+ const int last_tile = (int)(tile_no == SRC_WIDTH_TILES - 1);
+
+ const int x_input = tile_no * VEC_SIZE - last_tile * OFFSET;
+ const int x_output = multiple_no * SRC_WIDTH + x_input;
+
+ // Update the input and output pointers.
+ input.ptr = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+ output.ptr = tensor4D_offset(&output, x_output, y, z, batch);
+
+ // Copy the data
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)output.ptr);
+#else // !defined(VEC_SIZE) || !defined(OFFSET)
+ const int x = get_global_id(0);
+
+ // Update the input and output pointers.
+ input.ptr = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+ output.ptr = tensor4D_offset(&output, x, y, z, batch);
+
+ *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
+#endif // defined(VEC_SIZE) && defined(OFFSET)
+}
+#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
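In the scalar fallback above, tiling is pure modulo addressing: every output coordinate wraps around the corresponding source extent. A minimal C sketch under an assumed flat row-major layout (the kernel itself works on byte strides):

    /* Minimal sketch of the non-vectorized path: each output coordinate
     * wraps around the source extent, exactly what the `% SRC_*` indexing
     * does. A dense row-major layout is assumed for illustration. */
    static float tile_read(const float *src,
                           int src_w, int src_h, int src_d, int src_b,
                           int x, int y, int z, int batch)
    {
        const int sx = x % src_w;
        const int sy = y % src_h;
        const int sz = z % src_d;
        const int sb = batch % src_b;
        return src[((sb * src_d + sz) * src_h + sy) * src_w + sx];
    }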
diff --git a/src/core/CL/cl_kernels/common/transpose.cl b/src/core/CL/cl_kernels/common/transpose.cl
new file mode 100644
index 0000000000..5b4c68ca10
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/transpose.cl
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#define PARTIAL_STORE_M0 VEC_SIZE_LEFTOVER_X
+#define PARTIAL_STORE_N0 VEC_SIZE_LEFTOVER_Y
+
+#include "helpers.h"
+#include "repeat.h"
+
+#if defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y)
+
+#if VEC_SIZE_X == 1
+#if VEC_SIZE_Y == 1
+#define TRANSPOSED_U(val) \
+ { \
+ u0 \
+ }
+#elif VEC_SIZE_Y == 2
+#define TRANSPOSED_U(val) \
+ { \
+ u0, u1 \
+ }
+#elif VEC_SIZE_Y == 3
+#define TRANSPOSED_U(val) \
+ { \
+ u0, u1, u2 \
+ }
+#elif VEC_SIZE_Y == 4
+#define TRANSPOSED_U(val) \
+ { \
+ u0, u1, u2, u3 \
+ }
+#elif VEC_SIZE_Y == 8
+#define TRANSPOSED_U(val) \
+ { \
+ u0, u1, u2, u3, u4, u5, u6, u7 \
+ }
+#elif VEC_SIZE_Y == 16
+#define TRANSPOSED_U(val) \
+ { \
+ u0, u1, u2, u3, u4, u5, u6, u7, \
+ u8, u9, u10, u11, u12, u13, u14, u15 \
+ }
+#endif /* switch VEC_SIZE_Y */
+#else // VEC_SIZE_X == 1
+#if VEC_SIZE_Y == 1
+#define TRANSPOSED_U(val) \
+ { \
+ u0.val \
+ }
+#elif VEC_SIZE_Y == 2
+#define TRANSPOSED_U(val) \
+ { \
+ u0.val, u1.val \
+ }
+#elif VEC_SIZE_Y == 3
+#define TRANSPOSED_U(val) \
+ { \
+ u0.val, u1.val, u2.val \
+ }
+#elif VEC_SIZE_Y == 4
+#define TRANSPOSED_U(val) \
+ { \
+ u0.val, u1.val, u2.val, u3.val \
+ }
+#elif VEC_SIZE_Y == 8
+#define TRANSPOSED_U(val) \
+ { \
+ u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val \
+ }
+#elif VEC_SIZE_Y == 16
+#define TRANSPOSED_U(val) \
+ { \
+ u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val, \
+ u8.val, u9.val, u10.val, u11.val, u12.val, u13.val, u14.val, u15.val \
+ }
+#endif /* switch VEC_SIZE_Y */
+#endif // VEC_SIZE_X == 1
+
+#if DATA_TYPE_IN_BYTES == 4
+#define DATA_TYPE uint
+#elif DATA_TYPE_IN_BYTES == 2
+#define DATA_TYPE ushort
+#elif DATA_TYPE_IN_BYTES == 1
+#define DATA_TYPE uchar
+#else /* switch DATA_TYPE_IN_BYTES */
+#error DATA_TYPE_IN_BYTES not supported for transpose
+#endif /* switch DATA_TYPE_IN_BYTES */
+
+/** This OpenCL kernel computes the transposition of the input matrix
+ *
+ * @note The number of bytes of the data type needs to be passed at compile time using -DDATA_TYPE_IN_BYTES. DATA_TYPE_IN_BYTES can be:
+ * -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices
+ * -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices
+ * -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices
+ * -# -DVEC_SIZE_X is the number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER_X is the leftover size in the X dimension; x_dimension % VEC_SIZE_X
+ * -# -DVEC_SIZE_Y is the number of elements processed in Y dimension
+ * -# -DVEC_SIZE_LEFTOVER_Y is the leftover size in the Y dimension; y_dimension % VEC_SIZE_Y
+ *
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination matrix in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void transpose(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
+ uint y_offs = max((int)(get_global_id(1) * VEC_SIZE_Y - (VEC_SIZE_Y - VEC_SIZE_LEFTOVER_Y) % VEC_SIZE_Y), 0);
+ uint z_offs = get_global_id(2);
+
+ // Compute addresses
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y + z_offs * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y + z_offs * dst_stride_z;
+
+ // Load the NxM block at (x, y)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u0 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)src_addr);
+#if VEC_SIZE_Y > 1
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u1 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + src_stride_y));
+#endif /* VEC_SIZE_Y > 1 */
+#if VEC_SIZE_Y > 2
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u2 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif /* VEC_SIZE_Y > 2 */
+#if VEC_SIZE_Y > 3
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u3 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif /* VEC_SIZE_Y > 3 */
+#if VEC_SIZE_Y > 4
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u4 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u5 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u6 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u7 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
+#endif /* VEC_SIZE_Y > 4 */
+#if VEC_SIZE_Y > 8
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u8 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u9 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 9 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u10 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 10 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u11 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 11 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u12 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 12 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u13 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 13 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u14 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 14 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u15 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 15 * src_stride_y));
+#endif /* VEC_SIZE_Y > 8 */
+
+ //Create transposed vectors
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t0 = TRANSPOSED_U(s0);
+#if VEC_SIZE_X > 1
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t1 = TRANSPOSED_U(s1);
+#endif /* VEC_SIZE_X > 1 */
+#if VEC_SIZE_X > 2
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t2 = TRANSPOSED_U(s2);
+#endif /* VEC_SIZE_X > 2 */
+#if VEC_SIZE_X > 3
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t3 = TRANSPOSED_U(s3);
+#endif /* VEC_SIZE_X > 3 */
+#if VEC_SIZE_X > 4
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t4 = TRANSPOSED_U(s4);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t5 = TRANSPOSED_U(s5);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t6 = TRANSPOSED_U(s6);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t7 = TRANSPOSED_U(s7);
+#endif /* VEC_SIZE_X > 4 */
+#if VEC_SIZE_X > 8
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t8 = TRANSPOSED_U(s8);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t9 = TRANSPOSED_U(s9);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ tA = TRANSPOSED_U(sA);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ tB = TRANSPOSED_U(sB);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ tC = TRANSPOSED_U(sC);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ tD = TRANSPOSED_U(sD);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ tE = TRANSPOSED_U(sE);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ tF = TRANSPOSED_U(sF);
+#endif /* VEC_SIZE_X > 8 */
+
+ // Store the block at (y, x)
+ REPEAT_VAR_INIT_TO_CONST(VEC_SIZE_X, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+ STORE_BLOCK_BOUNDARY_AWARE(VEC_SIZE_X, VEC_SIZE_Y, DATA_TYPE, t, (__global uchar *)dst_addr, dst_stride_y, zout, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_Y, VEC_SIZE_LEFTOVER_X != 0
+ && get_global_id(0) == 0,
+ VEC_SIZE_LEFTOVER_Y != 0 && get_global_id(1) == 0);
+}
+
+#endif // defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y)
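Each work-item above transposes one VEC_SIZE_X x VEC_SIZE_Y block and mirrors its origin from (x, y) to (y, x) in the destination. The same block strategy in scalar C, with the boundary backtracking omitted for brevity (a hypothetical helper assuming a dense row-major matrix):

    /* Scalar analogue of one work-item: copy a bx-by-by block so that
     * element (row, col) of src lands at (col, row) of dst. dst is the
     * transposed shape, i.e. src_h elements wide. */
    static void transpose_block(const float *src, float *dst,
                                int src_w, int src_h,
                                int x0, int y0, int bx, int by)
    {
        for (int y = 0; y < by; ++y)
            for (int x = 0; x < bx; ++x)
                dst[(x0 + x) * src_h + (y0 + y)] = src[(y0 + y) * src_w + (x0 + x)];
    }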
diff --git a/src/core/CL/cl_kernels/common/unpooling_layer.cl b/src/core/CL/cl_kernels/common/unpooling_layer.cl
new file mode 100644
index 0000000000..6662dc9360
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/unpooling_layer.cl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+/** Performs the max unpooling function with a pool size equal to 2.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32
+ * @note The width of the output tensor must be passed using -DWIDTH_DST e.g. -DWIDTH_DST=24
+ * @note The height of the output tensor must be passed using -DHEIGHT_DST e.g. -DHEIGHT_DST=54
+ * @note The depth of the output tensor must be passed using -DDEPTH_DST e.g. -DDEPTH_DST=32
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the output tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
+ * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
+ * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void max_unpooling_layer_2(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ TENSOR3D_DECLARATION(indices))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(output);
+ Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+
+ unsigned int index = *((__global unsigned int *)indices.ptr);
+ DATA_TYPE value = *((__global DATA_TYPE *)input.ptr);
+
+ *((__global DATA_TYPE *)tensor3D_index2ptr(&output, WIDTH_DST, HEIGHT_DST, DEPTH_DST, index)) = value;
+}
\ No newline at end of file
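Max unpooling is a pure scatter driven by the indices recorded during pooling. A minimal C sketch follows; zero-filling the untouched outputs first is an assumption made here for self-containment, since the kernel itself only scatters:

    /* Minimal sketch of the scatter above: each input value is written to
     * the flat output position recorded by the pooling indices tensor. */
    static void max_unpool_ref(const float *input, const unsigned int *indices,
                               float *output, int n_in, int n_out)
    {
        for (int i = 0; i < n_out; ++i)
            output[i] = 0.0f; /* assumed zero fill of non-maximum positions */
        for (int i = 0; i < n_in; ++i)
            output[indices[i]] = input[i];
    }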
diff --git a/src/core/CL/cl_kernels/comparisons.cl b/src/core/CL/cl_kernels/comparisons.cl
deleted file mode 100644
index 408846144d..0000000000
--- a/src/core/CL/cl_kernels/comparisons.cl
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#define EQUAL(x, y) ((x) == (y))
-#define NOTEQUAL(x, y) ((x) != (y))
-#define GREATER(x, y) ((x) > (y))
-#define GREATEREQUAL(x, y) ((x) >= (y))
-#define LESS(x, y) ((x) < (y))
-#define LESSEQUAL(x, y) ((x) <= (y))
-
-#define DEFINE_KERNEL_STR(name) compare_##name
-#define DEFINE_KERNEL(name) DEFINE_KERNEL_STR(name)
-
-#define DEFINE_KERNEL_QUANTIZED_STR(name) compare_##name##_quantized
-#define DEFINE_KERNEL_QUANTIZED(name) DEFINE_KERNEL_QUANTIZED_STR(name)
-
-#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME)
-/** This function compares two tensors.
- *
- * @attention The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention The comparison operation should be given as a preprocessor argument using -DOP=operation. e.g. -DOP=LESS
- *
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All non-quantized data types.
- * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr
- * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void DEFINE_KERNEL(OP_NAME)(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load values
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_a = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1.ptr);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_b = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2.ptr);
-
- // Calculate and store result
- VSTORE(VEC_SIZE)
- (CONVERT(OP(in_a, in_b), VEC_DATA_TYPE(uchar, VEC_SIZE)), 0, (__global uchar *)out.ptr);
-}
-#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) */
-
-#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2)
-/** This function compares two quantized tensors.
- *
- * @note The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
- * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
- * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
- * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
- * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
- *
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All quantized data types.
- * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr
- * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void DEFINE_KERNEL_QUANTIZED(OP_NAME)(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- int16 in_a = CONVERT(vload16(0, (__global DATA_TYPE *)in1.ptr), int16);
- int16 in_b = CONVERT(vload16(0, (__global DATA_TYPE *)in2.ptr), int16);
-
- in_a = in_a - (int16)((int)OFFSET_IN1);
- in_b = in_b - (int16)((int)OFFSET_IN2);
-
- const float16 in1f32 = convert_float16(in_a) * (float16)((float)SCALE_IN1);
- const float16 in2f32 = convert_float16(in_b) * (float16)((float)SCALE_IN2);
- const int16 res = OP(in1f32, in2f32);
-
- // Store result
- vstore16(convert_uchar16(res), 0, (__global uchar *)out.ptr);
-}
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) */
\ No newline at end of file
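The removed quantized path dequantizes each operand with its own (offset, scale) pair and compares in float; OpenCL vector relational operators return -1 (all bits set) for true, so the converted uchar output is 255. A scalar C equivalent of one lane (a hypothetical helper, with OP = LESS chosen for illustration):

    /* Scalar sketch of one lane of the removed quantized compare:
     * dequantize both operands, compare in float, and report true as
     * 255 to mirror the kernel's all-bits-set convention. */
    static unsigned char compare_less_q8(unsigned char a, unsigned char b,
                                         int offset_in1, float scale_in1,
                                         int offset_in2, float scale_in2)
    {
        const float af = ((int)a - offset_in1) * scale_in1;
        const float bf = ((int)b - offset_in2) * scale_in2;
        return (af < bf) ? 255 : 0;
    }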
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
deleted file mode 100644
index d2e65408dc..0000000000
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ /dev/null
@@ -1,415 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_QUANT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset, float in_scale, float out_scale)
-{
- const VEC_FLOAT in_f32 = (CONVERT(input, VEC_FLOAT) - (VEC_FLOAT)((float)in_offset)) * (VEC_FLOAT)((float)in_scale);
- const VEC_FLOAT out_f32 = in_f32 / ((VEC_FLOAT)(float)out_scale) + ((VEC_FLOAT)((float)out_offset));
- const VEC_QUANT res_q8 = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT), VEC_QUANT);
- return res_q8;
-}
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
-
-#if defined(DATA_TYPE)
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-
-#if defined(DEPTH) && defined(ELEMENT_SIZE)
-#if defined(INPUT1_WIDTH)
-
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#define SEQ VEC_OFFS(int, VEC_SIZE)
-
-/** This kernel concatenates two input tensors into the output tensor along the first dimension
- *
- * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
- * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
- * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
- *
- * @param[in] src1_ptr Pointer to the source tensor. Supported data types: All.
- * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src1_stride_w Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src1_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
- * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src2_stride_w Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src2_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void concatenate_width_x2(
- TENSOR4D_DECLARATION(src1),
- TENSOR4D_DECLARATION(src2),
- TENSOR4D_DECLARATION(dst))
-{
- // Calculate input indices
- const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- const int y = get_global_id(1);
- const int z = get_global_id(2) % (int)DEPTH;
- const int w = get_global_id(2) / (int)DEPTH;
- const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE);
- const int x2 = max(x - (int)INPUT1_WIDTH, 0);
-
- // Calculate inputs and output addresses
- const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w;
- const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
- const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
-
- VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr);
- VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr);
-
-#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
- src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
- src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
- const VEC_INT x_coords = SEQ + (VEC_INT)(x);
-
- // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values.
- SELECT_TYPE cond = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH) && ((VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE);
- src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond);
- src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond);
-
- cond = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE);
- const VEC_TYPE values0 = select(src2_values, src1_values, cond);
-
- STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-
-#if defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH)
-/** This kernel concatenates four input tensors into the output tensor along the first dimension
- *
- * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
- * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
- * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
- * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8
- * @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8
- *
- * @param[in] src1_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src1_stride_w Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src1_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
- * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src2_stride_w Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src2_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] src3_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
- * @param[in] src3_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src3_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src3_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src3_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src3_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src3_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src3_stride_w Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src3_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src3_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] src4_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
- * @param[in] src4_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src4_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src4_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src4_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src4_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src4_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src4_stride_w Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src4_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src4_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void concatenate_width_x4(
- TENSOR4D_DECLARATION(src1),
- TENSOR4D_DECLARATION(src2),
- TENSOR4D_DECLARATION(src3),
- TENSOR4D_DECLARATION(src4),
- TENSOR4D_DECLARATION(dst))
-{
- // Calculate input indices
- const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- const int y = get_global_id(1);
- const int z = get_global_id(2) % (int)DEPTH;
- const int w = get_global_id(2) / (int)DEPTH;
-
- const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE);
- const int x2 = min(max(x - (int)INPUT1_WIDTH, 0), (int)INPUT2_WIDTH - (int)VEC_SIZE);
- const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, 0), (int)INPUT3_WIDTH - (int)VEC_SIZE);
- const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, 0);
-
- // Calculate inputs and output addresses
- const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w;
- const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
- const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
- const __global uchar *src3_addr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * sizeof(DATA_TYPE) + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w;
- const __global uchar *src4_addr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * sizeof(DATA_TYPE) + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w;
-
- VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr);
- VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr);
- VEC_TYPE src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src3_addr);
- VEC_TYPE src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src4_addr);
-
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4)
- src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
- src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
- src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT);
- src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, SCALE_IN4, SCALE_OUT);
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */
-
- const VEC_INT x_coords = SEQ + (VEC_INT)(x);
-
- SELECT_TYPE cond_in2 = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE);
- SELECT_TYPE cond_in3 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH - VEC_SIZE)), SELECT_TYPE);
- SELECT_TYPE cond_in4 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH - VEC_SIZE)), SELECT_TYPE);
-
- // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values.
- src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2);
- src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2);
- // Rotate src2/3_values, if values0 is a combination of src2_values and src3_values.
- src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3);
- src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3);
- // Rotate src3/4_values, if values0 is a combination of src3_values and src4_values.
- src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4);
- src4_values = select(src4_values, ROTATE(src4_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4);
-
- cond_in2 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE);
- cond_in3 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH), SELECT_TYPE);
- cond_in4 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), SELECT_TYPE);
-
- VEC_TYPE values0 = select(src2_values, src1_values, cond_in2);
- values0 = select(src3_values, values0, cond_in3);
- values0 = select(src4_values, values0, cond_in4);
-
- STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif /* defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) */
-#endif /* defined(INPUT1_WIDTH) */
-#endif /* defined(DEPTH) && defined(ELEMENT_SIZE) */
-
-#if defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
-/** This kernel concatenates the input tensor into the output tensor along the first dimension
- *
- * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
- * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
- * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-
-__kernel void concatenate_width(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst))
-{
- // Calculate input indices
- const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- const int y = get_global_id(1);
- const int z = get_global_id(2) % (int)DEPTH;
- const int w = get_global_id(2) / (int)DEPTH;
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + z * src_stride_z + w * src_stride_w;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + w * dst_stride_w;
-
- VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
-
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
- const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
- STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
- STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
-}
-
-#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)*/
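
On the host side these kernels are specialized entirely through build options named in the @note lines above. A minimal C sketch of assembling such an option string for concatenate_width (the helper name and all numeric values are illustrative, not the library's API):

    /* Example -D options for concatenate_width: float data, vectors of 16
     * elements, input width 130 (leftover 130 % 16 = 2), writing at output
     * column 64, tensor depth 24. */
    static const char *concat_width_build_opts(void)
    {
        return "-DDATA_TYPE=float -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=2 "
               "-DWIDTH_OFFSET=64 -DDEPTH=24";
    }
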
-
-#if defined(VEC_SIZE_LEFTOVER)
-
-#if defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE)
-/** This kernel concatenates the input tensor into the output tensor along the second dimension
- *
- * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
- * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
- * @note Supported vector sizes are 2, 4, 8 and 16.
- * @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128
- * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-
-__kernel void concatenate_height(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst))
-{
- const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + (get_global_id(2) % DEPTH) * src_stride_z + (get_global_id(2) / DEPTH) * src_stride_w;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + (get_global_id(2) % DEPTH) * dst_stride_z + (get_global_id(2) / DEPTH) * dst_stride_w;
-
- VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
-
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
- const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
- STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
- STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
-}
-
-#endif /* defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) */
-
-/** This kernel concatenates the input tensor into the output tensor along the third dimension
- *
- * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
- * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] offset The offset to the first valid element of the output tensor in bytes
- */
-__kernel void concatenate(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- int offset)
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
-
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
- source_values0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
-
- STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + offset, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif /* defined(VEC_SIZE_LEFTOVER) */
-#endif /* defined(DATA_TYPE) */
-#endif /* defined(VEC_SIZE) */
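
When the concatenated inputs carry different quantization parameters, each loaded vector is rebased onto the output's scale/offset before the store. A scalar C sketch of that step, assuming the conventional dequantize-then-quantize formula (the function name is illustrative; the requantize helper defined earlier in this file is the authoritative version):

    #include <math.h>

    /* Rebase one QASYMM8 value from (scale_in, offset_in) onto
     * (scale_out, offset_out), saturating to [0, 255]. */
    static unsigned char requantize_ref(unsigned char v, float offset_in,
                                        float offset_out, float scale_in,
                                        float scale_out)
    {
        const float dequantized = ((float)v - offset_in) * scale_in;
        float q = roundf(dequantized / scale_out + offset_out);
        if (q < 0.f)   q = 0.f;
        if (q > 255.f) q = 255.f;
        return (unsigned char)q;
    }
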
diff --git a/src/core/CL/cl_kernels/convert_fc_weights.cl b/src/core/CL/cl_kernels/convert_fc_weights.cl
deleted file mode 100644
index a451c0213b..0000000000
--- a/src/core/CL/cl_kernels/convert_fc_weights.cl
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2)
-/** Perform a NCHW -> NHWC or NHWC -> NCHW conversion for Fully Connected 2D weights.
- *
- * For NCHW -> NHWC, FACTOR_1 will be equal to the product of the first two dimensions of FullyConnectedLayer's input and FACTOR_2 will represent the number of channels of that tensor.
- * For NHWC -> NCHW, FACTOR_1 and FACTOR_2 will hold the same values, but swapped.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Original input tensor width*height and depth should be given as a preprocessor argument using -DFACTOR_1=size and -DFACTOR_2=size for NCHW and vice versa for NHWC. e.g. -DFACTOR_1=256 and -DFACTOR_2=128
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: All.
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convert_fc_weights(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_x + (get_global_id(1) % FACTOR_1 * FACTOR_2 + get_global_id(1) / FACTOR_1) * dst_stride_y;
-
- *((__global DATA_TYPE *)dst_addr) = *((__global DATA_TYPE *)src.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2)
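
The destination row computed by the kernel above is the fixed permutation row' = (row % FACTOR_1) * FACTOR_2 + row / FACTOR_1. As a tiny C sketch (the function name is illustrative):

    /* Destination row for one source row of the FC-weights
     * NCHW <-> NHWC conversion. */
    static int fc_weights_dst_row(int src_row, int factor_1, int factor_2)
    {
        return (src_row % factor_1) * factor_2 + src_row / factor_1;
    }

For example, with FACTOR_1 = 4 and FACTOR_2 = 3, source rows 0..11 map to destination rows 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11.
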
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
deleted file mode 100644
index cfd1f12328..0000000000
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(NUM_GROUPS)
-/** This kernel reshapes the tensor's lowest three dimensions into a single column
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note The number of groups should be given as a preprocessor argument using -DNUM_GROUPS=number. e.g. -DNUM_GROUPS=2
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] bias_ptr Pointer to the bias tensor. Supported data types: F16/F32, for quantized types this must be nullptr
- * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes)
- * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the bias tensor
- * @param[in] width The width of the input tensor
- * @param[in] height The height of the input tensor
- * @param[in] depth The depth of the input tensor
- * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- */
-__kernel void reshape_to_columns(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(bias),
-#endif /* HAS_BIAS */
- uint width, uint height, uint depth, uint total_filters, uint dst_stride_z)
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- bool is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1));
-
- __global uchar *tmp_src_ptr = src.ptr;
- __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y + get_global_id(1) * width * dst_stride_y + get_global_id(2) * width * height * dst_stride_y;
-#ifdef HAS_BIAS
- __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes;
-#endif /* HAS_BIAS */
-
- if(is_last_thread)
- {
- for(uint g = 0; g < NUM_GROUPS; ++g)
- {
- __global uchar *curr_group_dst = tmp_dst_ptr;
-
- for(uint i = 0; i < total_filters / NUM_GROUPS; ++i)
- {
- *((__global DATA_TYPE *)curr_group_dst) = *((__global DATA_TYPE *)tmp_src_ptr);
-
-#ifdef HAS_BIAS
- *((__global DATA_TYPE *)(curr_group_dst + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr));
- tmp_bias_ptr += bias_stride_x;
-#endif /* HAS_BIAS */
- tmp_src_ptr += depth * src_stride_z;
- curr_group_dst += dst_stride_x;
- }
-
- tmp_dst_ptr += dst_stride_z;
- }
- }
- else
- {
- for(uint g = 0; g < NUM_GROUPS; ++g)
- {
- __global uchar *curr_group_dst = tmp_dst_ptr;
-
- for(uint i = 0; i < total_filters / NUM_GROUPS; ++i)
- {
- *((__global DATA_TYPE *)curr_group_dst) = *((__global DATA_TYPE *)tmp_src_ptr);
- tmp_src_ptr += depth * src_stride_z;
- curr_group_dst += dst_stride_x;
- }
-
- tmp_dst_ptr += dst_stride_z;
- }
- }
-}
-#endif // defined(DATA_TYPE) && defined(NUM_GROUPS)
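
Stripped of strides, reshape_to_columns lays each filter of each group out as one column whose rows enumerate the filter's (x, y, z) elements. A scalar C sketch under that reading, with bias handling omitted (the array layout and function name are assumptions for illustration):

    /* src: [total_filters][depth][height][width] weights, flattened.
     * dst: [num_groups][width*height*depth][filters_per_group], bias row omitted. */
    static void reshape_to_columns_ref(const float *src, float *dst,
                                       int width, int height, int depth,
                                       int total_filters, int num_groups)
    {
        const int fpg  = total_filters / num_groups; /* filters per group */
        const int rows = width * height * depth;
        for (int f = 0; f < total_filters; ++f)
            for (int z = 0; z < depth; ++z)
                for (int y = 0; y < height; ++y)
                    for (int x = 0; x < width; ++x)
                    {
                        const int row = x + y * width + z * width * height;
                        const int g   = f / fpg;
                        dst[(g * rows + row) * fpg + (f % fpg)] =
                            src[((f * depth + z) * height + y) * width + x];
                    }
    }
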
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl
deleted file mode 100644
index 9c90969827..0000000000
--- a/src/core/CL/cl_kernels/copy_tensor.cl
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
-/** Performs a copy of input tensor to the output tensor.
- *
- * @note The following variables must be passed at compile time:
- * -# -DDATA_TYPE : Input and output datatypes.
- * -# -DVEC_SIZE : The number of elements processed in X dimension
- * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
- *
- * @param[in] in_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p in_ptr
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] out_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void copy_tensor(
- TENSOR3D_DECLARATION(in),
- TENSOR3D_DECLARATION(out))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Boundary-aware access:
- // If there's leftover in width (VEC_SIZE_LEFTOVER > 0):
- // Shift all accesses other than the first to avoid accessing out of bounds
- const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)VEC_SIZE_LEFTOVER, 0) % VEC_SIZE;
- in.ptr -= shift * in.stride_x;
- out.ptr -= shift * out.stride_x;
-
- // Load data
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
-
- // Boundary-aware store
- STORE_VECTOR_SELECT(data, DATA_TYPE, (__global DATA_TYPE *)out.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) \ No newline at end of file
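
The shift above moves every access except the first back by VEC_SIZE - VEC_SIZE_LEFTOVER, so the final partial vector is re-read from in-bounds memory instead of running past the row. The shift computation alone, as a C sketch (the function name is illustrative):

    /* For work-item gid processing VEC_SIZE elements of a row whose
     * width % VEC_SIZE == leftover, return how many elements to shift the
     * access back so it stays in bounds (0 for gid 0 or when leftover is 0). */
    static int boundary_shift(int gid, int vec_size, int leftover)
    {
        const int start = gid * vec_size - leftover;
        return (start < 0 ? 0 : start) % vec_size;
    }
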
diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl
deleted file mode 100644
index b1d5e61476..0000000000
--- a/src/core/CL/cl_kernels/deconvolution_layer.cl
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function applies upsampling to an input image.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: All.
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void deconvolution_upsample(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- // Store result
- *((__global DATA_TYPE *)dst.ptr) = *((__global DATA_TYPE *)src.ptr);
-}
-
-#if defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
-/** This kernel reshapes the deconvolution output tensor before returning the result of the Deconvolution. The deconvolution output tensor
- * is the result of a @ref CLGEMM operation between the deconvolution input and the deconvolution filter
- *
- * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g., -DDATA_TYPE=F32
- * @note The width of the filter should be given as a preprocessor argument using -DFILTER_WIDTH=width, e.g., -DFILTER_WIDTH=2
- * @note The height of the filter should be given as a preprocessor argument using -DFILTER_HEIGHT=height, e.g., -DFILTER_HEIGHT=2
- * @note The width of the input should be given as a preprocessor argument using -DSRC_WIDTH=width, e.g., -DSRC_WIDTH=10
- * @note The height of the input should be given as a preprocessor argument using -DSRC_HEIGHT=height, e.g., -DSRC_HEIGHT=10
- * @note The output data layout is NHWC if the preprocessor argument NUM_FILTERS is defined, NCHW if NUM_FILTERS is not defined
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] bias_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32/S32
- * @param[in] bias_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void deconvolution_reshape(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
-#if defined(ADD_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(ADD_BIAS)
-)
-{
-#define FILTER_AREA ((FILTER_WIDTH) * (FILTER_HEIGHT))
-
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
- const DATA_TYPE data = *(__global DATA_TYPE *)src.ptr;
-
- // Store result
- const int x_in = get_global_id(0);
- const int y_in = get_global_id(1);
- const int z_in = get_global_id(2);
-
-#if defined(NUM_FILTERS)
- const int bias_index = x_in / (FILTER_AREA);
- const int z_out = bias_index + (NUM_FILTERS) * (z_in / (SRC_HEIGHT));
- const int x_out = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH);
- const int y_out = (FILTER_HEIGHT) * (z_in % (SRC_HEIGHT)) + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH));
-#else // defined(NUM_FILTERS)
- const int x_out = x_in / (FILTER_AREA);
- const int y_out = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH);
- const int z_out = (FILTER_HEIGHT) * z_in + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH));
- const int bias_index = x_out;
-#endif // defined(NUM_FILTERS)
-
-#if defined(ADD_BIAS)
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
- const DATA_TYPE bias_val = *(__global DATA_TYPE *)vector_offset(&bias, bias_index);
- *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data + bias_val;
-#else // defined(ADD_BIAS)
- *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data;
-#endif // defined(ADD_BIAS)
-
-#undef FILTER_AREA
-}
-#endif // defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
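
Stripped of strides and bias, the index arithmetic in the NCHW (#else) branch above is the scatter below; a C sketch (the struct and function names are illustrative):

    typedef struct { int x, y, z; } coord3;

    /* Destination coordinates for one GEMM output element in the NCHW
     * branch of deconvolution_reshape. */
    static coord3 deconv_reshape_nchw(int x_in, int y_in, int z_in,
                                      int filter_w, int filter_h)
    {
        const int area = filter_w * filter_h;
        coord3 out;
        out.x = x_in / area;                                 /* x_out */
        out.y = x_in % filter_w + y_in * filter_w;           /* y_out */
        out.z = filter_h * z_in + (x_in % area) / filter_w;  /* z_out */
        return out;
    }
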
diff --git a/src/core/CL/cl_kernels/depth_to_space.cl b/src/core/CL/cl_kernels/depth_to_space.cl
deleted file mode 100644
index f301e64d66..0000000000
--- a/src/core/CL/cl_kernels/depth_to_space.cl
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
-/** Depth to space transformation. (NCHW)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void depth_to_space_nchw(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
- const int x = get_global_id(0);
- const int y = get_global_id(1);
- const int z = get_global_id(2) % r;
-
- const int out_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
- const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr);
-}
-/** Depth to space transformation. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void depth_to_space_nhwc(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
- const int x = get_global_id(1);
- const int y = get_global_id(2);
- const int z = get_global_id(0) % r;
-
- const int out_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
- const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file
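
Because both kernels are pure index permutations, a complete scalar reference is short. A C sketch of the NCHW variant for a single batch, assuming dense [c][h][w] storage (the name and layout are illustrative):

    /* Depth-to-space, NCHW, one batch: src has C channels of H*W elements;
     * dst has C/(B*B) channels of (H*B)*(W*B), where B is the block shape. */
    static void depth_to_space_nchw_ref(const float *src, float *dst,
                                        int C, int H, int W, int B)
    {
        const int r = C / (B * B);
        for (int c = 0; c < C; ++c)
            for (int y = 0; y < H; ++y)
                for (int x = 0; x < W; ++x)
                {
                    const int z     = c % r;
                    const int out_x = x * B + (c / r) % B;
                    const int out_y = y * B + (c / r) / B;
                    dst[(z * H * B + out_y) * W * B + out_x] =
                        src[(c * H + y) * W + x];
                }
    }
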
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
deleted file mode 100644
index 22a38e7094..0000000000
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ /dev/null
@@ -1,1781 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#include "activation_float_helpers.h"
-
-/** Get the pointer position at a certain offset in the x and y directions.
- *
- * @param[in] ptr Pointer to the starting position of the buffer
- * @param[in] x Relative X position
- * @param[in] y Relative Y position
- * @param[in] stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] stride_y Stride of the source tensor in Y dimension (in bytes)
- *
- * @return a __global uchar pointer at the requested offset
- */
-inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
-{
- return ptr + x * stride_x + y * stride_y;
-}
-
-#if(DILATION_X == 1 && DILATION_Y == 1)
-
-#define CONVOLUTION1x3_2X1_STRIDE1(acc, src0, weights_row0) \
- ({ \
- acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
- })
-
-#define CONVOLUTION1x3_4X1_STRIDE1(acc, src0, weights_row0) \
- ({ \
- acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
- acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
- acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
- acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
- acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
- acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
- acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
- })
-
-#define CONVOLUTION1x3_2X1_STRIDE2(acc, src0, src1, weights_row0) \
- ({ \
- acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
- })
-
-#define CONVOLUTION1x3_4X1_STRIDE2(acc, src0, src1, weights_row0) \
- ({ \
- acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
- acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
- acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
- acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
- acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
- acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
- acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
- })
-
-#else /* DILATION_X==1 && DILATION_Y==1 */
-
-#define CONVOLUTION1x3_2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
- ({ \
- acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
- })
-
-#define CONVOLUTION1x3_2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
- ({ \
- acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
- })
-
-#define CONVOLUTION1x3_4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
- ({ \
- acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
- acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2); \
- acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2); \
- acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2); \
- acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3); \
- acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3); \
- acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3); \
- })
-
-#define CONVOLUTION1x3_4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
- ({ \
- acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
- acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2); \
- acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2); \
- acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2); \
- acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3); \
- acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3); \
- acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3); \
- })
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
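
Each macro above unrolls a 1x3 row convolution for two or four output pixels, with the stride deciding how far apart consecutive output windows start. The same pattern in scalar C, as a sketch (the function name is illustrative):

    #include <math.h> /* fmaf */

    /* Scalar analogue of the CONVOLUTION1x3_*X1_STRIDE* macros (dilation 1):
     * n outputs of a 1x3 row convolution, consecutive windows `stride` apart.
     * acc must hold the running accumulators (e.g. zeros or bias) on entry. */
    static void conv1x3_row(const float *src, const float w[3],
                            float *acc, int n, int stride)
    {
        for (int i = 0; i < n; ++i)
        {
            const float *p = src + i * stride;
            acc[i] = fmaf(p[0], w[0], acc[i]);
            acc[i] = fmaf(p[1], w[1], acc[i]);
            acc[i] = fmaf(p[2], w[2], acc[i]);
        }
    }
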
-
-#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
-#if defined(CONV_STRIDE_X)
-
-#if CONV_STRIDE_X == 1
-#define convolution1x3 convolution1x3_stride_1
-#elif CONV_STRIDE_X == 2
-#define convolution1x3 convolution1x3_stride_2
-#elif CONV_STRIDE_X == 3
-#define convolution1x3 convolution1x3_stride_3
-#else /* CONV_STRIDE_X */
-#error "Stride not supported"
-#endif /* CONV_STRIDE_X */
-
-/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
- *
- * @param[in] left_pixel Pointer to the left pixel.
- * @param[in] left_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff Weight of the right pixel
- *
- * @return a float2 containing 2 convolved values.
- */
-inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
- const float left_coeff,
- const float middle_coeff,
- const float right_coeff)
-{
-#if(DILATION_X == 1 && DILATION_Y == 1)
- float4 temp = vload4(0, (__global float *)left_pixel);
-
- float2 left = CONVERT(temp.s01, float2);
- float2 middle = CONVERT(temp.s12, float2);
- float2 right = CONVERT(temp.s23, float2);
- return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
-#else /* DILATION_X==1 && DILATION_Y==1 */
- return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
- + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
- + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-}
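
The stride-1 path above loads four contiguous floats once and reuses the overlapping slices s01/s12/s23 as the left, middle and right taps for two outputs. The scalar equivalent, as a sketch (the function name is illustrative):

    /* Two stride-1 outputs from one four-element window, mirroring the
     * vload4 + .s01/.s12/.s23 slicing. */
    static void conv1x3_stride1_x2(const float t[4], float l, float m, float r,
                                   float out[2])
    {
        out[0] = t[0] * l + t[1] * m + t[2] * r;
        out[1] = t[1] * l + t[2] * m + t[3] * r;
    }
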
-
-/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
- *
- * @param[in] left_pixel Pointer to the left pixel.
- * @param[in] left_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff Weight of the right pixel
- *
- * @return a float2 containing 2 convolved values.
- */
-inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
- const float left_coeff,
- const float middle_coeff,
- const float right_coeff)
-{
-#if(DILATION_X == 1 && DILATION_Y == 1)
- float4 temp0 = vload4(0, (__global float *)left_pixel);
- float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
-
- float2 left = CONVERT(temp0.s02, float2);
- float2 middle = CONVERT(temp0.s13, float2);
- float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
-
- return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
-#else /* DILATION_X==1 && DILATION_Y==1 */
- __global float *left_pixel_float = (__global float *)left_pixel;
-
- return vload4(0, left_pixel_float).s02 * (float2)left_coeff
- + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
- + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-}
-
-/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
- *
- * @param[in] left_pixel Pointer to the left pixel.
- * @param[in] left_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff Weight of the right pixel
- *
- * @return a float2 containing 2 convolved values.
- */
-inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
- const float left_coeff,
- const float middle_coeff,
- const float right_coeff)
-{
-#if(DILATION_X == 1 && DILATION_Y == 1)
- float4 temp0 = vload4(0, (__global float *)left_pixel);
- float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
-
- float2 left = CONVERT(temp0.s03, float2);
- float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
- float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
-
- return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
-#else /* DILATION_X==1 && DILATION_Y==1 */
- __global float *left_pixel_float = (__global float *)left_pixel;
-
- return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
- + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
- + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-}
-
-/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
- *
- * Convolution matrix layout:
- *
- * [ mat0, mat1, mat2 ]\n
- * [ mat3, mat4, mat5 ]\n
- * [ mat6, mat7, mat8 ]\n
- *
- * @param[in] src A pointer to source Image structure
- * @param[in] mat0 Coefficient from the convolution matrix
- * @param[in] mat1 Coefficient from the convolution matrix
- * @param[in] mat2 Coefficient from the convolution matrix
- * @param[in] mat3 Coefficient from the convolution matrix
- * @param[in] mat4 Coefficient from the convolution matrix
- * @param[in] mat5 Coefficient from the convolution matrix
- * @param[in] mat6 Coefficient from the convolution matrix
- * @param[in] mat7 Coefficient from the convolution matrix
- * @param[in] mat8 Coefficient from the convolution matrix
- *
- * @return a float2 containing 2 convolved values.
- */
-inline float2 convolution3x3(
- __global const uchar *src,
- unsigned int src_stride_y,
- const float mat0, const float mat1, const float mat2,
- const float mat3, const float mat4, const float mat5,
- const float mat6, const float mat7, const float mat8)
-{
- float2 pixels;
-
- pixels = convolution1x3((src + 0 * DILATION_Y * src_stride_y), mat0, mat1, mat2);
- pixels += convolution1x3((src + 1 * DILATION_Y * src_stride_y), mat3, mat4, mat5);
- pixels += convolution1x3((src + 2 * DILATION_Y * src_stride_y), mat6, mat7, mat8);
-
- return pixels;
-}
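-
-/* Equivalently, for output column x the three calls above accumulate
- *
- *   pixels(x) = sum_{j=0..2} sum_{i=0..2} mat[3*j + i] * src(x + i * DILATION_X, j * DILATION_Y)
- *
- * with the row offsets j * DILATION_Y applied here and the column offsets
- * i * DILATION_X applied inside convolution1x3.
- */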
-
-/** This OpenCL kernel computes the depthwise convolution 3x3
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-
- float2 pixels = 0.0f;
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
- // Load relevant input and weights data (accounts for the depth multiplier when indexing the input; OFM = IFM * DEPTH_MULTIPLIER)
-
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
-
- __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
- (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
-
- // Load the weights
- float3 weights_values0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float3 weights_values1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float3 weights_values2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
-
- pixels = convolution3x3(src_addr, src_stride_y,
- weights_values0.s0, weights_values0.s1, weights_values0.s2,
- weights_values1.s0, weights_values1.s1, weights_values1.s2,
- weights_values2.s0, weights_values2.s1, weights_values2.s2);
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float bias = *((__global float *)(vector_offset(&biases, channel)));
-
- pixels += (float2)bias;
-#endif //defined(HAS_BIAS)
-
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr);
-}
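-
-/* The src_addr expression above implements the depth-multiplier channel mapping:
- * output channel `channel` reads input channel channel / DEPTH_MULTIPLIER. With
- * get_global_id(2) = batch * DST_CHANNELS + channel and
- * SRC_CHANNELS = DST_CHANNELS / DEPTH_MULTIPLIER, the z-offset simplifies to
- *
- *   (batch * SRC_CHANNELS + channel / DEPTH_MULTIPLIER) * src_step_z
- *
- * i.e. the linearized index of the corresponding input plane.
- */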
-#endif //defined(CONV_STRIDE_X)
-
-#if(DILATION_X > 1 || DILATION_Y > 1)
-
-/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32
- *
- * @param[in] src_addr Pointer to the starting position of where to perform the convolution
- * @param[in] stride_x_bytes Stride of the source tensor in X dimension (in bytes)
- * @param[in] stride_y_bytes Stride of the source tensor in Y dimension (in bytes)
- * @param[in] y_offset Offset in rows from @p src_addr at which to start the convolution
- * @param[in] weights_addr Pointer from where to get weights
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- */
-inline float2 convolution_3x3_dilation_stridex1_stridey1_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
- const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
-{
- // Load the weights
- float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
-
- float2 pixels0 = 0.0f;
-
- float2 src00_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
- float2 src00_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
- float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
-
- float2 src10_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
- float2 src10_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
- float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
-
- float2 src20_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
- float2 src20_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
- float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
-
- CONVOLUTION1x3_2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
- CONVOLUTION1x3_2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
- CONVOLUTION1x3_2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
-
- return pixels0;
-}
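-
-/* ptr_offset is assumed (from its definition earlier in this file) to resolve to
- * src_addr + x * stride_x_bytes + y * stride_y_bytes, so each vload2 above reads the
- * two consecutive samples needed for one dilated tap position across both outputs.
- */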
-
-/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32
- *
- * @param[in] src_addr Pointer to the starting position of where to perform the convolution
- * @param[in] stride_x_bytes Stride of the source tensor in X dimension (in bytes)
- * @param[in] stride_y_bytes Stride of the source tensor in Y dimension (in bytes)
- * @param[in] y_offset Offset in rows from @p src_addr at which to start the convolution
- * @param[in] weights_addr Pointer from where to get weights
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- */
-inline float2 convolution_3x3_dilation_stridex2_stridey2_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
- const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
-{
- // Load the weights
- float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
-
- float2 pixels0 = 0.0f;
-
- float3 src00_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
- float3 src00_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
- float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
-
- float3 src10_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
- float3 src10_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
- float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
-
- float3 src20_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
- float3 src20_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
- float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
-
- CONVOLUTION1x3_2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
- CONVOLUTION1x3_2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
- CONVOLUTION1x3_2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
-
- return pixels0;
-}
-
-#endif /* (DILATION_X > 1 || DILATION_Y > 1) */
-
-/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
- * stride_x and stride_y are equal to 1
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_stridex1_stridey1_f32(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-
- float2 pixels0 = 0.0f;
- float2 pixels1 = 0.0f;
- float2 pixels2 = 0.0f;
- float2 pixels3 = 0.0f;
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
- // Load relevant input and weights data (accounts for the depth multiplier when indexing the input; OFM = IFM * DEPTH_MULTIPLIER)
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
- __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
- (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
-
-#if(DILATION_X == 1 && DILATION_Y == 1)
- // Load the weights
- float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
-
- // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
- float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
- float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
- float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
- float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
- float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
- float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
-
- CONVOLUTION1x3_2X1_STRIDE1(pixels0, src00, weights_row0);
- CONVOLUTION1x3_2X1_STRIDE1(pixels0, src10, weights_row1);
- CONVOLUTION1x3_2X1_STRIDE1(pixels0, src20, weights_row2);
- CONVOLUTION1x3_2X1_STRIDE1(pixels1, src10, weights_row0);
- CONVOLUTION1x3_2X1_STRIDE1(pixels1, src20, weights_row1);
- CONVOLUTION1x3_2X1_STRIDE1(pixels1, src30, weights_row2);
- CONVOLUTION1x3_2X1_STRIDE1(pixels2, src20, weights_row0);
- CONVOLUTION1x3_2X1_STRIDE1(pixels2, src30, weights_row1);
- CONVOLUTION1x3_2X1_STRIDE1(pixels2, src40, weights_row2);
- CONVOLUTION1x3_2X1_STRIDE1(pixels3, src30, weights_row0);
- CONVOLUTION1x3_2X1_STRIDE1(pixels3, src40, weights_row1);
- CONVOLUTION1x3_2X1_STRIDE1(pixels3, src50, weights_row2);
-
-#else /* DILATION_X==1 && DILATION_Y==1 */
-
- //3x3 Convolution of elements starting in 0th row
- pixels0 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 1st row
- pixels1 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 1, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 2nd row
- pixels2 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 3rd row
- pixels3 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 3, weights_addr, weights_stride_y);
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float bias = *((__global float *)(vector_offset(&biases, channel)));
-
- pixels0 += (float2)bias;
- pixels1 += (float2)bias;
- pixels2 += (float2)bias;
- pixels3 += (float2)bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
-}
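-
-/* Row-count check for the 4x2 output tile above: with stride_y = 1 and a 3-tap
- * vertical window, producing 4 output rows needs (4 - 1) * 1 + 3 = 6 input rows,
- * matching the six row loads (Row0..Row5) in the non-dilated path.
- */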
-
-/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
- * stride_x and stride_y are equal to 2
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_stridex2_stridey2_f32(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-
- float2 pixels0 = 0.0f;
- float2 pixels1 = 0.0f;
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
- // Load relevant input and weights data (accounts for the depth multiplier when indexing the input; OFM = IFM * DEPTH_MULTIPLIER)
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
- __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
- (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
-
-#if(DILATION_X == 1 && DILATION_Y == 1)
-
- // Load the weights
- float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
-
- // Note: Since each work-item computes 2x2 elements, we need to load 5 rows from the input tensor
- float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
- float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
- float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
- float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
- float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
- float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
- float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
- float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
- float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
- float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
-
- CONVOLUTION1x3_2X1_STRIDE2(pixels0, src00, src01, weights_row0);
- CONVOLUTION1x3_2X1_STRIDE2(pixels0, src10, src11, weights_row1);
- CONVOLUTION1x3_2X1_STRIDE2(pixels0, src20, src21, weights_row2);
- CONVOLUTION1x3_2X1_STRIDE2(pixels1, src20, src21, weights_row0);
- CONVOLUTION1x3_2X1_STRIDE2(pixels1, src30, src31, weights_row1);
- CONVOLUTION1x3_2X1_STRIDE2(pixels1, src40, src41, weights_row2);
-
-#else /* DILATION_X==1 && DILATION_Y==1 */
-
- //3x3 Convolution of elements starting in 0th row
- pixels0 = convolution_3x3_dilation_stridex2_stridey2_f32(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 2nd row
- pixels1 = convolution_3x3_dilation_stridex2_stridey2_f32(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float bias = *((__global float *)(vector_offset(&biases, channel)));
-
- pixels0 += (float2)bias;
- pixels1 += (float2)bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
-}
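-
-/* Row-count check for the 2x2 output tile above: with stride_y = 2 and a 3-tap
- * vertical window, producing 2 output rows needs (2 - 1) * 2 + 3 = 5 input rows,
- * matching the five row loads (Row0..Row4) in the non-dilated path.
- */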
-
-#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
-
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
-#if defined(CONV_STRIDE_X)
-#if CONV_STRIDE_X == 1
-#define convolution1x3_f16 convolution1x3_stride_1_f16
-#elif CONV_STRIDE_X == 2
-#define convolution1x3_f16 convolution1x3_stride_2_f16
-#elif CONV_STRIDE_X == 3
-#define convolution1x3_f16 convolution1x3_stride_3_f16
-#else /* CONV_STRIDE_X */
-#error "Stride not supported"
-#endif /* CONV_STRIDE_X */
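-
-/* The convolution1x3_f16 alias above is resolved at compile time: building with,
- * e.g., -DCONV_STRIDE_X=2 expands every convolution1x3_f16 call in this section to
- * convolution1x3_stride_2_f16, and any other stride value fails the build via #error.
- */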
-
-#if(DILATION_X > 1 || DILATION_Y > 1)
-
-/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F16
- *
- * @param[in] src_addr Pointer to the starting position of where to perform the convolution
- * @param[in] stride_x_bytes Stride of the source tensor in X dimension (in bytes)
- * @param[in] stride_y_bytes Stride of the source tensor in Y dimension (in bytes)
- * @param[in] y_offset Offset in rows from @p src_addr at which to start the convolution
- * @param[in] weights_addr Pointer from where to get weights
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- */
-inline half4 convolution_3x3_dilation_stridex1_stridey1_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
- const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
-{
- // Load the weights
- half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
- half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
- half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
-
- half4 pixels0 = 0.0f;
-
- half4 src00_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
- half4 src00_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
- half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
-
- half4 src10_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
- half4 src10_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
- half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
-
- half4 src20_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
- half4 src20_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
- half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
-
- CONVOLUTION1x3_4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
- CONVOLUTION1x3_4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
- CONVOLUTION1x3_4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
-
- return pixels0;
-}
-
-/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16
- *
- * @param[in] src_addr Pointer to the starting position of where to perform the convolution
- * @param[in] stride_x_bytes Stride of the source tensor in X dimension (in bytes)
- * @param[in] stride_y_bytes Stride of the source tensor in Y dimension (in bytes)
- * @param[in] y_offset Offset in rows from @p src_addr at which to start the convolution
- * @param[in] weights_addr Pointer from where to get weights
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- */
-inline half4 convolution_3x3_dilation_stridex2_stridey2_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
- const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
-{
- // Load the weights
- half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
- half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
- half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
-
- half4 pixels0 = 0.0f;
-
- half8 src00_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
- half8 src00_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
- half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
-
- half8 src10_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
- half8 src10_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
- half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
-
- half8 src20_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
- half8 src20_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
- half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
-
- CONVOLUTION1x3_4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
- CONVOLUTION1x3_4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
- CONVOLUTION1x3_4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
-
- return pixels0;
-}
-
-#endif // (DILATION_X > 1 || DILATION_Y > 1)
-
-/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
- *
- * @param[in] left_pixel Pointer to the left pixel.
- * @param[in] left_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff Weight of the right pixel
- *
- * @return a half4 containing 4 convolved values.
- */
-inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
- const half left_coeff,
- const half middle_coeff,
- const half right_coeff)
-{
-#if(DILATION_X == 1 && DILATION_Y == 1)
-
- half8 temp = vload8(0, (__global half *)left_pixel);
-
- half4 left = CONVERT(temp.s0123, half4);
- half4 middle = CONVERT(temp.s1234, half4);
- half4 right = CONVERT(temp.s2345, half4);
-
- return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
-#else /* DILATION_X==1 && DILATION_Y==1 */
- return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
- + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
- + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-}
-
-/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
- *
- * @param[in] left_pixel Pointer to the left pixel.
- * @param[in] left_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff Weight of the right pixel
- *
- * @return a half4 containing 4 convolved values.
- */
-inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
- const half left_coeff,
- const half middle_coeff,
- const half right_coeff)
-{
-#if(DILATION_X == 1 && DILATION_Y == 1)
-
- half8 temp0 = vload8(0, (__global half *)left_pixel);
- half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
-
- half4 left = CONVERT(temp0.s0246, half4);
- half4 middle = CONVERT(temp0.s1357, half4);
- half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
-
- return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
-#else /* DILATION_X==1 && DILATION_Y==1 */
-
- __global half *left_pixel_half = (__global half *)left_pixel;
-
- return (half4)(*left_pixel_half, *(left_pixel_half + 2), *(left_pixel_half + 4), *(left_pixel_half + 6)) * (half4)left_coeff
- + (half4)(*(left_pixel_half + DILATION_X), *(left_pixel_half + DILATION_X + 2), *(left_pixel_half + DILATION_X + 4), *(left_pixel_half + DILATION_X + 6)) * (half4)middle_coeff
- + (half4)(*(left_pixel_half + DILATION_X * 2), *(left_pixel_half + DILATION_X * 2 + 2), *(left_pixel_half + DILATION_X * 2 + 4), *(left_pixel_half + DILATION_X * 2 + 6)) * (half4)right_coeff;
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-}
-
-/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
- *
- * @param[in] left_pixel Pointer to the left pixel.
- * @param[in] left_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff Weight of the right pixel
- *
- * @return a half4 containing 4 convolved values.
- */
-inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
- const half left_coeff,
- const half middle_coeff,
- const half right_coeff)
-{
-#if(DILATION_X == 1 && DILATION_Y == 1)
-
- half16 temp0 = vload16(0, (__global half *)left_pixel);
-
- half4 left = CONVERT(temp0.s0369, half4);
- half4 middle = CONVERT(temp0.s147A, half4);
- half4 right = CONVERT(temp0.s258B, half4);
-
- return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
-#else /* DILATION_X==1 && DILATION_Y==1 */
-
- __global half *left_pixel_half = (__global half *)left_pixel;
-
- return (half4)(*left_pixel_half, *(left_pixel_half + 3), *(left_pixel_half + 6), *(left_pixel_half + 9)) * (half4)left_coeff
- + (half4)(*(left_pixel_half + DILATION_X), *(left_pixel_half + DILATION_X + 3), *(left_pixel_half + DILATION_X + 6), *(left_pixel_half + DILATION_X + 9)) * (half4)middle_coeff
- + (half4)(*(left_pixel_half + DILATION_X * 2), *(left_pixel_half + DILATION_X * 2 + 3), *(left_pixel_half + DILATION_X * 2 + 6), *(left_pixel_half + DILATION_X * 2 + 9)) * (half4)right_coeff;
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-}
-
-/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
- *
- * Convolution matrix layout:
- *
- * [ mat0, mat1, mat2 ]\n
- * [ mat3, mat4, mat5 ]\n
- * [ mat6, mat7, mat8 ]\n
- *
- * @param[in] src A pointer to the source data
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] mat0 Coefficient from the convolution matrix
- * @param[in] mat1 Coefficient from the convolution matrix
- * @param[in] mat2 Coefficient from the convolution matrix
- * @param[in] mat3 Coefficient from the convolution matrix
- * @param[in] mat4 Coefficient from the convolution matrix
- * @param[in] mat5 Coefficient from the convolution matrix
- * @param[in] mat6 Coefficient from the convolution matrix
- * @param[in] mat7 Coefficient from the convolution matrix
- * @param[in] mat8 Coefficient from the convolution matrix
- *
- * @return a half4 containing 4 convolved values.
- */
-inline half4 convolution3x3_f16(
- __global uchar *src, uint src_stride_y,
- const half mat0, const half mat1, const half mat2,
- const half mat3, const half mat4, const half mat5,
- const half mat6, const half mat7, const half mat8)
-{
- half4 pixels;
-
- pixels = convolution1x3_f16(src, mat0, mat1, mat2);
- pixels += convolution1x3_f16(src + DILATION_Y * src_stride_y, mat3, mat4, mat5);
- pixels += convolution1x3_f16(src + DILATION_Y * 2 * src_stride_y, mat6, mat7, mat8);
-
- return pixels;
-}
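-
-/* Note: convolution3x3_f16 spaces its three 1x3 row convolutions by
- * DILATION_Y * src_stride_y, so the same helper serves both the dilated builds and
- * the default DILATION_Y == 1 builds of the F16 depthwise kernels below.
- */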
-
-#if defined(DEPTH_MULTIPLIER)
-
-/** This OpenCL kernel computes the depthwise convolution 3x3
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_f16(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-#endif //defined(HAS_BIAS)
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
- // Load relevant input and weights data (accounts for the depth multiplier when indexing the input; OFM = IFM * DEPTH_MULTIPLIER)
- __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
- (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
-
- uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
- half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
- half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
- half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
-
- half4 pixels = convolution3x3_f16(src_addr, src_stride_y, weights_values0.s0, weights_values0.s1, weights_values0.s2,
- weights_values1.s0, weights_values1.s1, weights_values1.s2,
- weights_values2.s0, weights_values2.s1, weights_values2.s2);
-#if defined(HAS_BIAS)
- pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
-#endif //defined(HAS_BIAS)
-
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global half *)dst.ptr);
-}
-#endif // defined(DEPTH_MULTIPLIER)
-#endif // defined(CONV_STRIDE_X)
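-
-/* For reference, a plausible (illustrative, not prescriptive) set of build options
- * for the F16 depthwise kernels in this section:
- *
- *   -DARM_COMPUTE_OPENCL_FP16_ENABLED -DIS_F16 -DDATA_TYPE=half -DVEC_SIZE=4
- *   -DDST_CHANNELS=32 -DDEPTH_MULTIPLIER=1 -DCONV_STRIDE_X=1 -DDILATION_X=1
- *   -DDILATION_Y=1 -DACTIVATION_TYPE=relu -DA_VAL=0.0f -DB_VAL=0.0f
- *
- * Adding -DHAS_BIAS also enables the optional biases vector kernel arguments.
- */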
-
-/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
- * when both stride_x and stride_y are equal to 1
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_stridex1_stridey1_f16(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- half bias = *((__global half *)(vector_offset(&biases, channel)));
-#endif /* defined(HAS_BIAS) */
-
- half4 pixels0 = 0.0f;
- half4 pixels1 = 0.0f;
- half4 pixels2 = 0.0f;
- half4 pixels3 = 0.0f;
-
- // Load relevant input and weights data (accounts for the depth multiplier when indexing the input; OFM = IFM * DEPTH_MULTIPLIER)
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
- __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
- (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
-
-#if(DILATION_X == 1 && DILATION_Y == 1)
- // Load the weights
- half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
- half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
- half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
-
- // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
- half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
- half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
- half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
- half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
- half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
- half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
-
- CONVOLUTION1x3_4X1_STRIDE1(pixels0, src00, weights_row0);
- CONVOLUTION1x3_4X1_STRIDE1(pixels0, src10, weights_row1);
- CONVOLUTION1x3_4X1_STRIDE1(pixels0, src20, weights_row2);
- CONVOLUTION1x3_4X1_STRIDE1(pixels1, src10, weights_row0);
- CONVOLUTION1x3_4X1_STRIDE1(pixels1, src20, weights_row1);
- CONVOLUTION1x3_4X1_STRIDE1(pixels1, src30, weights_row2);
- CONVOLUTION1x3_4X1_STRIDE1(pixels2, src20, weights_row0);
- CONVOLUTION1x3_4X1_STRIDE1(pixels2, src30, weights_row1);
- CONVOLUTION1x3_4X1_STRIDE1(pixels2, src40, weights_row2);
- CONVOLUTION1x3_4X1_STRIDE1(pixels3, src30, weights_row0);
- CONVOLUTION1x3_4X1_STRIDE1(pixels3, src40, weights_row1);
- CONVOLUTION1x3_4X1_STRIDE1(pixels3, src50, weights_row2);
-
-#else /* DILATION_X==1 && DILATION_Y==1 */
-
- //3x3 Convolution of elements starting in 0th row
- pixels0 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 1st row
- pixels1 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 1, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 2nd row
- pixels2 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 3rd row
- pixels3 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 3, weights_addr, weights_stride_y);
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-
-#ifdef HAS_BIAS
- pixels0 += (half4)bias;
- pixels1 += (half4)bias;
- pixels2 += (half4)bias;
- pixels3 += (half4)bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
-}
-
-/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
- * when both stride_x and stride_y are equal to 2
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_stridex2_stridey2_f16(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- half bias = *((__global half *)(vector_offset(&biases, channel)));
-#endif /* defined(HAS_BIAS) */
-
- half4 pixels0 = 0.0f;
- half4 pixels1 = 0.0f;
-
- // Load relevant input and weights data (accounts for the depth multiplier when indexing the input; OFM = IFM * DEPTH_MULTIPLIER)
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
- __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
- (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
-
-#if(DILATION_X == 1 && DILATION_Y == 1)
-
- // Load the weights
- half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
- half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
- half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
-
- // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
- half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
- half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
- half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
- half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
- half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
- half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
- half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
- half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
- half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
- half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
-
- CONVOLUTION1x3_4X1_STRIDE2(pixels0, src00, src01, weights_row0);
- CONVOLUTION1x3_4X1_STRIDE2(pixels0, src10, src11, weights_row1);
- CONVOLUTION1x3_4X1_STRIDE2(pixels0, src20, src21, weights_row2);
- CONVOLUTION1x3_4X1_STRIDE2(pixels1, src20, src21, weights_row0);
- CONVOLUTION1x3_4X1_STRIDE2(pixels1, src30, src31, weights_row1);
- CONVOLUTION1x3_4X1_STRIDE2(pixels1, src40, src41, weights_row2);
-
-#else /* DILATION_X==1 && DILATION_Y==1 */
- //3x3 Convolution of elements starting in 0th row
- pixels0 = convolution_3x3_dilation_stridex2_stridey2_f16(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 2nd row
- pixels1 = convolution_3x3_dilation_stridex2_stridey2_f16(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-
-#ifdef HAS_BIAS
- pixels0 += (half4)bias;
- pixels1 += (half4)bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
-}
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
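The kernels in this family take no runtime configuration: every data type, stride, multiplier and activation is baked in through the -D build options listed in the @note blocks. As a host-side illustration (a minimal sketch; the helper name and option values are hypothetical, not taken from the library):

    /* Hypothetical helper: build a depthwise-convolution program with the
     * compile-time options the kernel documentation above requires. */
    #include <CL/cl.h>

    cl_int build_dwc_program(cl_program program, cl_device_id device)
    {
        const char *build_opts =
            "-DDATA_TYPE=half -DIS_F16 -DVEC_SIZE=4 "
            "-DDEPTH_MULTIPLIER=1 -DDST_CHANNELS=32 "
            "-DDILATION_X=1 -DDILATION_Y=1 "
            "-DACTIVATION_TYPE=relu -DA_VAL=0.0f -DB_VAL=0.0f";
        return clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
    }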
-
-#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER)
-/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
- * @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)
- * @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)
- * @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)
- * @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)
- * @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)
- * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
- * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
- * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)
- * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER (e.g. -DVEC_SIZE_LEFTOVER=3). It is defined as the remainder of the input's first dimension divided by N0
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void dwc_MxN_native_fp_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif // defined(HAS_BIAS)
-)
-{
- int x_offs = max((int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0) * sizeof(DATA_TYPE);
-
- int x = get_global_id(0); // channels
- int y = get_global_id(1); // spatial coordinate x
-#if defined(DST_DEPTH)
- int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
- int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
-
- __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs;
-
- __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z;
-
- __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;
-
-#if defined(HAS_BIAS)
- __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;
-#endif // defined(HAS_BIAS)
-
-#if defined(DST_DEPTH)
- s_addr += b * src_stride_w;
- d_addr += b * dst_stride_w;
-#endif // defined(DST_DEPTH)
-
- for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
- {
- // Each work-item computes N0x1x1 elements
- VEC_DATA_TYPE(DATA_TYPE, N0)
- res0 = 0;
-
- int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
- int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
-
- for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
- {
- if(y_coord >= 0 && y_coord < SRC_DIM2)
- {
- int x_coord_tmp = x_coord;
-
- for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
- {
- if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
- {
- int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;
- int w_offset = xk * weights_stride_y + yk * weights_stride_z;
-
- // Load input and weights values
- VEC_DATA_TYPE(DATA_TYPE, N0)
- i = VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset));
- VEC_DATA_TYPE(DATA_TYPE, N0)
- w = VLOAD(N0)(0, (__global DATA_TYPE *)(w_addr + w_offset));
-
-#if GPU_ARCH == GPU_ARCH_MIDGARD
- res0 += i * w;
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
- res0 = fma(i, w, res0);
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
- }
- x_coord_tmp += DILATION_X;
- }
- }
- y_coord += DILATION_Y;
- }
-
-#if defined(HAS_BIAS)
- res0 += VLOAD(N0)(0, (__global DATA_TYPE *)(b_addr));
-#endif // defined(HAS_BIAS)
-
- res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, res0, A_VAL, B_VAL);
-
- STORE_VECTOR_SELECT(res, DATA_TYPE, d_addr, N0, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-
- w_addr += sizeof(DATA_TYPE);
- d_addr += sizeof(DATA_TYPE);
-#if defined(HAS_BIAS)
- b_addr += sizeof(DATA_TYPE);
-#endif // defined(HAS_BIAS)
- }
-}
-#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER)
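Stripped of vectorization and byte-offset arithmetic, the loop nest above computes a standard NHWC depthwise convolution. A plain-C reference sketch of the same arithmetic (layouts and names are illustrative; the activation step is omitted):

    /* Reference NHWC depthwise convolution for one batch (sketch).
     * src: [h_in][w_in][c], w: [kh][kw][c*dm], dst: [h_out][w_out][c*dm] */
    void dwc_nhwc_ref(const float *src, const float *w, const float *bias, float *dst,
                      int h_in, int w_in, int c, int kh, int kw, int dm,
                      int stride_y, int stride_x, int pad_top, int pad_left,
                      int dil_y, int dil_x, int h_out, int w_out)
    {
        for (int oy = 0; oy < h_out; ++oy)
        for (int ox = 0; ox < w_out; ++ox)
        for (int ic = 0; ic < c; ++ic)
        for (int d = 0; d < dm; ++d)
        {
            float acc = bias ? bias[ic * dm + d] : 0.0f;
            for (int ky = 0; ky < kh; ++ky)
            for (int kx = 0; kx < kw; ++kx)
            {
                int iy = oy * stride_y - pad_top + ky * dil_y;
                int ix = ox * stride_x - pad_left + kx * dil_x;
                if (iy >= 0 && iy < h_in && ix >= 0 && ix < w_in)
                    acc += src[(iy * w_in + ix) * c + ic]
                         * w[(ky * kw + kx) * c * dm + ic * dm + d];
            }
            dst[(oy * w_out + ox) * c * dm + ic * dm + d] = acc;
        }
    }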
-
-#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
-
-#if DATA_TYPE != float && DATA_TYPE != half
-#error "Unsupported data type"
-#endif // DATA_TYPE != float && DATA_TYPE != half
-
-#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-
-#define FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond) \
- ({ \
- basename##0 = select(basename##0, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s0)); \
- basename##1 = select(basename##1, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s1)); \
- basename##2 = select(basename##2, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s2)); \
- })
-
-#define FILL_ZERO_OUT_OF_BOUND_4(data_type, vec_size, basename, cond) \
- ({ \
- FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond); \
- basename##3 = select(basename##3, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s3)); \
- })
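These select() macros are the second half of a clamp-then-mask border scheme used by the kernels below: every source coordinate is clamped into the valid range so the load itself is always safe, a per-lane flag records whether clamping actually moved the coordinate, and FILL_ZERO_OUT_OF_BOUND_3/4 then zeroes the lanes that really belonged to the padded region. A scalar sketch of the idea (illustrative helper, not from the library):

    /* Scalar sketch of clamp-then-mask border handling. */
    float load_with_zero_pad(__global const float *src, int coord, int dim)
    {
        int safe = clamp(coord, 0, dim - 1); /* the load itself is always in bounds  */
        float v  = src[safe];
        return (safe != coord) ? 0.0f : v;   /* zero values that were really padding */
    }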
-
-#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
-
-/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
- * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
- * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=2)
- * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
- * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note If biases are present, -DHAS_BIAS must be passed at compile time
- * @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32)
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif /* defined(HAS_BIAS) */
-)
-{
- int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
- int y = get_global_id(1); // spatial coordinate x
-#if defined(DST_DEPTH)
- int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
- int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
-
- __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;
-
-#if defined(DST_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;
-#else /* defined(DST_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;
-#endif /* defined(DST_DEPTH) */
-
- int3 src_coord_y = (int3)(y * CONV_STRIDE_X - CONV_PAD_LEFT) + (int3)(0, DILATION_X, 2 * DILATION_X);
- int3 src_coord_z = (int3)(z * CONV_STRIDE_Y - CONV_PAD_TOP) + (int3)(0, DILATION_Y, 2 * DILATION_Y);
-
- int3 src_offset_y = clamp(src_coord_y, (int3)0, (int3)(SRC_DIM_1 - 1));
- int3 src_offset_z = clamp(src_coord_z, (int3)0, (int3)(SRC_DIM_2 - 1));
-
- // Use these vectors to check whether the unclamped load would have been out of bounds
- src_coord_y = (src_offset_y != src_coord_y);
- src_coord_z = (src_offset_z != src_coord_z);
-
- src_offset_y *= (int3)src_stride_y;
- src_offset_z *= (int3)src_stride_z;
-
- // We compute VEC_SIZEx1x1 [C,W,H] elements
- VEC_FLOAT acc0 = 0;
-
- // Load weights
- VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z));
- VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z));
- VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z));
-
- // Load input values
- // z == 0
- VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0));
- VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1));
- VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2));
-
- FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s0);
-
- acc0 = fma(values0, w0, acc0);
- acc0 = fma(values1, w1, acc0);
- acc0 = fma(values2, w2, acc0);
-
- // z == 1
- values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));
- values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));
- values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));
-
- FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s1);
-
- acc0 = fma(values0, w3, acc0);
- acc0 = fma(values1, w4, acc0);
- acc0 = fma(values2, w5, acc0);
-
- // z == 2
- values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));
- values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));
- values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));
-
- FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s2);
-
- acc0 = fma(values0, w6, acc0);
- acc0 = fma(values1, w7, acc0);
- acc0 = fma(values2, w8, acc0);
-
-#if defined(HAS_BIAS)
- __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;
- VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr);
- acc0 += bias_values;
-#endif // defined(HAS_BIAS)
-
-#if defined(DST_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
-#else /* defined(DST_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z;
-#endif /* defined(DST_DEPTH) */
-
- acc0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL);
- STORE_VECTOR_SELECT(acc, DATA_TYPE, dst_addr, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
-}
-#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
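As a worked example of the x_offset expression at the top of the kernel: with VEC_SIZE = 4 and 10 channels, PARTIAL_STORE_N0 = 10 % 4 = 2, so work-item 0 gets x_offset 0 and STORE_VECTOR_SELECT makes it store only the 2 leftover elements, while work-items 1 and 2 evaluate max(4 - 2, 0) = 2 and max(8 - 2, 0) = 6 and perform full 4-element stores covering channels 2-5 and 6-9; no access ever falls past the end of the tensor.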
-
-#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
-/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (e.g. -DNUM_ROWS_PROCESSED=2)
- * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (e.g. -DNUM_PLANES_PROCESSED=2)
- * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
- * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The size of the output's second dimension must be passed at compile time using -DDST_DIM_1 (e.g. -DDST_DIM_1=64)
- * @note The size of the output's third dimension must be passed at compile time using -DDST_DIM_2 (e.g. -DDST_DIM_2=32)
- * @note If biases are present, -DHAS_BIAS must be passed at compile time
- * @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32)
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_nhwc_stride1(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif /* defined(HAS_BIAS) */
-)
-{
- int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
- int y = get_global_id(1); // spatial coordinate x
-#if defined(DST_DEPTH)
- int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
- int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
-
- __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;
-
-#if defined(DST_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;
-#else /* defined(DST_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;
-#endif /* defined(DST_DEPTH) */
-
- int4 src_coord_y = (int4)(y * NUM_ROWS_PROCESSED - CONV_PAD_LEFT) + V_OFFS4(int);
- int4 src_coord_z = (int4)(z * NUM_PLANES_PROCESSED - CONV_PAD_TOP) + V_OFFS4(int);
-
- int4 src_offset_y = clamp(src_coord_y, (int4)0, (int4)(SRC_DIM_1 - 1));
- int4 src_offset_z = clamp(src_coord_z, (int4)0, (int4)(SRC_DIM_2 - 1));
-
- // Use these vectors to check whether the unclamped load would have been out of bounds
- src_coord_y = (src_offset_y != src_coord_y);
- src_coord_z = (src_offset_z != src_coord_z);
-
- src_offset_y *= (int4)src_stride_y;
- src_offset_z *= (int4)src_stride_z;
-
- // We compute VEC_SIZEx2x2 [C,W,H] elements
- VEC_FLOAT acc0 = 0;
- VEC_FLOAT acc1 = 0;
- VEC_FLOAT acc2 = 0;
- VEC_FLOAT acc3 = 0;
-
- // Load weights
- VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z));
- VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z));
- VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z));
-
- // Load input values
- // z == 0
- VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0));
- VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1));
- VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2));
- VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s3));
-
- FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s0);
-
- acc0 = fma(values0, w0, acc0);
- acc0 = fma(values1, w1, acc0);
- acc0 = fma(values2, w2, acc0);
- acc1 = fma(values1, w0, acc1);
- acc1 = fma(values2, w1, acc1);
- acc1 = fma(values3, w2, acc1);
-
- // z == 1
- values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));
- values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));
- values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));
- values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s3));
-
- FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s1);
-
- acc0 = fma(values0, w3, acc0);
- acc0 = fma(values1, w4, acc0);
- acc0 = fma(values2, w5, acc0);
- acc1 = fma(values1, w3, acc1);
- acc1 = fma(values2, w4, acc1);
- acc1 = fma(values3, w5, acc1);
-
- acc2 = fma(values0, w0, acc2);
- acc2 = fma(values1, w1, acc2);
- acc2 = fma(values2, w2, acc2);
- acc3 = fma(values1, w0, acc3);
- acc3 = fma(values2, w1, acc3);
- acc3 = fma(values3, w2, acc3);
-
- // z == 2
- values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));
- values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));
- values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));
- values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s3));
-
- FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s2);
-
- acc0 = fma(values0, w6, acc0);
- acc0 = fma(values1, w7, acc0);
- acc0 = fma(values2, w8, acc0);
- acc1 = fma(values1, w6, acc1);
- acc1 = fma(values2, w7, acc1);
- acc1 = fma(values3, w8, acc1);
-
- acc2 = fma(values0, w3, acc2);
- acc2 = fma(values1, w4, acc2);
- acc2 = fma(values2, w5, acc2);
- acc3 = fma(values1, w3, acc3);
- acc3 = fma(values2, w4, acc3);
- acc3 = fma(values3, w5, acc3);
-
- // z == 3
- values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s0));
- values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s1));
- values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s2));
- values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s3));
-
- FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s3);
-
- acc2 = fma(values0, w6, acc2);
- acc2 = fma(values1, w7, acc2);
- acc2 = fma(values2, w8, acc2);
- acc3 = fma(values1, w6, acc3);
- acc3 = fma(values2, w7, acc3);
- acc3 = fma(values3, w8, acc3);
-
-#if defined(HAS_BIAS)
- __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;
-
- VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr);
-
- acc0 += bias_values;
- acc1 += bias_values;
- acc2 += bias_values;
- acc3 += bias_values;
-#endif // defined(HAS_BIAS)
-
- int2 dst_offset_y = min((int2)(y * NUM_ROWS_PROCESSED) + V_OFFS2(int), (int2)(DST_DIM_1 - 1)) * (int2)dst_stride_y;
- int dst_coord_z = z * NUM_PLANES_PROCESSED;
-
-#if defined(DST_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z + b * dst_stride_w;
-#else // defined(DST_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z;
-#endif // defined(DST_DEPTH)
-
- /* Store vectors in reverse order along Y. The Y offsets are clamped so that they are always in bounds.
-  * If only the first address is valid, the second address is clamped back onto it and the same thread writes the same location twice.
-  * Since the last vector written is always the valid one for that location, it overwrites the earlier, incorrect values.
-  */
- values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc1, A_VAL, B_VAL);
- STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
-
- values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL);
- STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
-
-#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
- if((dst_coord_z + 1) < DST_DIM_2)
-#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
- {
- values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc3, A_VAL, B_VAL);
- STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
-
- values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc2, A_VAL, B_VAL);
- STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
- }
-}
-
-#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
-#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
deleted file mode 100644
index 000dce1590..0000000000
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ /dev/null
@@ -1,961 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers_asymm.h"
-
-#ifndef VEC_SIZE
-#if defined(N0)
-#define VEC_SIZE N0
-#else /* defined(N0) */
-#define VEC_SIZE 8
-#endif /* defined(N0) */
-#endif /* VEC_SIZE */
-
-#if defined(ACTIVATION_TYPE) && defined(CONST_0)
-#include "activation_layer_quant.cl"
-#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QUANT(ACTIVATION_TYPE, x)
-#else /* defined(ACTIVATION_TYPE) && defined(CONST_0) */
-#define ACTIVATION_FUNC(x) (x)
-#endif /* defined(ACTIVATION_TYPE) && defined(CONST_0) */
-
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_SHORT VEC_DATA_TYPE(short, VEC_SIZE)
-
-#if defined(DATA_TYPE) && defined(WEIGHTS_TYPE)
-
-#define VEC_TYPE(size) VEC_DATA_TYPE(DATA_TYPE, size)
-
-#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
-
-#if defined(WEIGHTS_PROMOTED_TYPE)
-#define VEC_WEIGHTS_PROMOTED_TYPE(size) VEC_DATA_TYPE(WEIGHTS_PROMOTED_TYPE, size)
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), val);
-#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
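Where the cl_arm_integer_dot_product_int8 extension is available, ARM_DOT collapses four 8-bit multiplies plus the accumulation into a single operation. Its arithmetic is equivalent to the following reference (a sketch of the semantics, not the vendor implementation):

    /* Reference semantics of ARM_DOT(x, y, val) for uchar4 operands (sketch). */
    int arm_dot_ref(uchar4 x, uchar4 y, int val)
    {
        return val + (int)x.s0 * y.s0 + (int)x.s1 * y.s1
                   + (int)x.s2 * y.s2 + (int)x.s3 * y.s3;
    }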
-
-#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
-
-#if CONV_STRIDE_X > 3
-#error "Stride X not supported"
-#endif /* CONV_STRIDE_X > 3 */
-
-#if !defined(IS_DOT8)
-
-#if DILATION_X == 1
-
-#if CONV_STRIDE_X == 1
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- int8 temp0 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \
- int2 temp1 = CONVERT(vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))), int2); \
- \
- left = CONVERT(temp0.s01234567, int8); \
- middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \
- right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \
- })
-#elif CONV_STRIDE_X == 2
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \
- int temp1 = CONVERT(*((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int); \
- \
- left = CONVERT(temp0.s02468ace, int8); \
- middle = CONVERT(temp0.s13579bdf, int8); \
- right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \
- })
-#else /* CONV_STRIDE_X */
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \
- int8 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \
- \
- left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
- middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \
- right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \
- })
-#endif /* CONV_STRIDE_X */
-
-#else /* DILATION_X == 1 */
-
-#if CONV_STRIDE_X == 1
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- left = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \
- middle = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int8); \
- right = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int8); \
- })
-#elif CONV_STRIDE_X == 2
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \
- left = CONVERT(temp0.s02468ace, int8); \
- \
- temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \
- middle = CONVERT(temp0.s02468ace, int8); \
- \
- temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \
- right = CONVERT(temp0.s02468ace, int8); \
- })
-#else /* CONV_STRIDE_X */
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \
- int8 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \
- left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
- \
- temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \
- temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))), int8); \
- middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
- \
- temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \
- temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))), int8); \
- right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
- })
-
-#endif /* CONV_STRIDE_X */
-#endif /* DILATION_X==1 */
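To see what these macros produce, take the CONV_STRIDE_X == 2, DILATION_X == 1 variant: a single vload16 plus one scalar load covers all 8 output positions. The even lanes (.s02468ace, input columns 0, 2, ..., 14) become the left taps, the odd lanes (.s13579bdf, columns 1, 3, ..., 15) the middle taps, and the right taps (columns 2, 4, ..., 16) are reassembled from the even lanes shifted by one position plus the extra scalar element.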
-
-/** This function computes the quantized 3x3 depthwise convolution for NCHW data layout.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32
- * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)
- * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
- * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32
- * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)
- * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-
-__kernel void dwc_3x3_native_quantized8_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
- VECTOR_DECLARATION(output_multipliers),
- VECTOR_DECLARATION(output_shifts)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z;
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);
- Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
-
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- int bias_value = *((__global int *)(vector_offset(&biases, channel)));
-#endif //defined(HAS_BIAS)
-
- // Load relevant input and weights data (accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
- src_addr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
-
- VEC_DATA_TYPE(WEIGHTS_TYPE, 3)
- w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y));
- VEC_DATA_TYPE(WEIGHTS_TYPE, 3)
- w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y));
- VEC_DATA_TYPE(WEIGHTS_TYPE, 3)
- w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y));
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, channel));
- const int output_shift = *((__global int *)vector_offset(&output_shifts, channel));
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
- int8 values0 = 0;
- int8 sum0 = 0;
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- int8 values1 = 0;
- int8 sum1 = 0;
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-
- // Row0
- int8 left, middle, right;
- GET_VALUES(src_addr + 0 * src_stride_y, left, middle, right);
- values0 += left * (int8)(w0.s0);
- values0 += middle * (int8)(w0.s1);
- values0 += right * (int8)(w0.s2);
-
-#if WEIGHTS_OFFSET != 0
- sum0 += left + middle + right;
-#endif /* WEIGHTS_OFFSET != 0 */
-
- // Row1
- GET_VALUES(src_addr + DILATION_Y * src_stride_y, left, middle, right);
- values0 += left * (int8)(w1.s0);
- values0 += middle * (int8)(w1.s1);
- values0 += right * (int8)(w1.s2);
-
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += left * (int8)(w0.s0);
- values1 += middle * (int8)(w0.s1);
- values1 += right * (int8)(w0.s2);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-
-#if WEIGHTS_OFFSET != 0
- int8 tmp = left + middle + right;
- sum0 += tmp;
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- sum1 += tmp;
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-#endif /* WEIGHTS_OFFSET != 0 */
-
- // Row2
- GET_VALUES(src_addr + 2 * DILATION_Y * src_stride_y, left, middle, right);
- values0 += left * (int8)(w2.s0);
- values0 += middle * (int8)(w2.s1);
- values0 += right * (int8)(w2.s2);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += left * (int8)(w1.s0);
- values1 += middle * (int8)(w1.s1);
- values1 += right * (int8)(w1.s2);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-
-#if WEIGHTS_OFFSET != 0
- tmp = left + middle + right;
- sum0 += tmp;
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- sum1 += tmp;
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
-#endif /* WEIGHTS_OFFSET != 0 */
-
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- // Row3
- GET_VALUES(src_addr + 3 * src_stride_y, left, middle, right);
- values1 += left * (int8)(w2.s0);
- values1 += middle * (int8)(w2.s1);
- values1 += right * (int8)(w2.s2);
-
-#if WEIGHTS_OFFSET != 0
- sum1 += left + middle + right;
-#endif /* WEIGHTS_OFFSET != 0 */
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-
-#if defined(HAS_BIAS)
- values0 += (int8)(bias_value);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += (int8)(bias_value);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-#endif //defined(HAS_BIAS)
-
-#if WEIGHTS_OFFSET != 0
- values0 += sum0 * (int8)(WEIGHTS_OFFSET);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += sum1 * (int8)(WEIGHTS_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
-#endif /* WEIGHTS_OFFSET != 0 */
-
-#if INPUT_OFFSET != 0
- VEC_WEIGHTS_PROMOTED_TYPE(3)
- tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3));
-
- WEIGHTS_PROMOTED_TYPE sum_weights = tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
- values0 += sum_weights * (int8)(INPUT_OFFSET);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += sum_weights * (int8)(INPUT_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
-#endif /* INPUT_OFFSET != 0 */
-
-#if K_OFFSET != 0
- values0 += (int8)(K_OFFSET);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += (int8)(K_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-#endif /* K_OFFSET != 0 */
-
-#if defined(REAL_MULTIPLIER)
-
- values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
-
-#else // defined(REAL_MULTIPLIER)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
- int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
- values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-#if OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#else // OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#endif // OUTPUT_SHIFT < 0
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
-#endif // defined(REAL_MULTIPLIER)
-
- values0 += (int8)OUTPUT_OFFSET;
- VEC_TYPE(8)
- res0 = CONVERT_SAT(values0, VEC_TYPE(8));
-
- vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
-#if defined(REAL_MULTIPLIER)
-
- values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
-
-#else // defined(REAL_MULTIPLIER)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
- int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
- values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-#if OUTPUT_SHIFT < 0
- values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#else // OUTPUT_SHIFT < 0
- values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#endif // OUTPUT_SHIFT < 0
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
-#endif // defined(REAL_MULTIPLIER)
-
- values1 += (int8)OUTPUT_OFFSET;
- VEC_TYPE(8)
- res1 = CONVERT_SAT(values1, VEC_TYPE(8));
-
- vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-}
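The offset corrections in the kernel above follow from expanding the quantized product. Writing q_i and q_w for the raw 8-bit input and weight values and N = 9 for the 3x3 filter (and assuming INPUT_OFFSET and WEIGHTS_OFFSET carry the sign convention the build options use), the required accumulator is:

    acc = sum((q_i + INPUT_OFFSET) * (q_w + WEIGHTS_OFFSET))
        = sum(q_i * q_w)                        /* values0 / values1     */
        + WEIGHTS_OFFSET * sum(q_i)             /* the sum0 / sum1 terms */
        + INPUT_OFFSET   * sum(q_w)             /* the sum_weights term  */
        + N * INPUT_OFFSET * WEIGHTS_OFFSET     /* K_OFFSET              */

The result is then rescaled either by REAL_MULTIPLIER or by the fixed-point OUTPUT_MULTIPLIER/OUTPUT_SHIFT pair, offset by OUTPUT_OFFSET, and saturated back to 8 bits.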
-
-#else // !defined(IS_DOT8)
-
-#if DILATION_X == 1
-#if CONV_STRIDE_X == 1
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- VEC_TYPE(8) \
- temp0 = vload8(0, (__global DATA_TYPE *)(first_value)); \
- VEC_TYPE(2) \
- temp1 = vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))); \
- \
- left = temp0.s01234567; \
- middle = (VEC_TYPE(8))(temp0.s1234, temp0.s567, temp1.s0); \
- right = (VEC_TYPE(8))(temp0.s2345, temp0.s67, temp1.s01); \
- })
-#elif CONV_STRIDE_X == 2
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- VEC_TYPE(16) \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \
- DATA_TYPE temp1 = *((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \
- \
- left = temp0.s02468ace; \
- middle = temp0.s13579bdf; \
- right = (VEC_TYPE(8))(temp0.s2468, temp0.sace, temp1); \
- })
-#else /* CONV_STRIDE_X */
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- VEC_TYPE(16) \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \
- VEC_TYPE(8) \
- temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \
- \
- left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \
- middle = (VEC_TYPE(8))(temp0.s147a, temp0.sd, temp1.s036); \
- right = (VEC_TYPE(8))(temp0.s258b, temp0.se, temp1.s147); \
- })
-#endif /* CONV_STRIDE_X */
-#else /*DILATION_X==1*/
-
-#if CONV_STRIDE_X == 1
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- left = vload8(0, (__global DATA_TYPE *)(first_value)); \
- middle = vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \
- right = vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \
- })
-#elif CONV_STRIDE_X == 2
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- VEC_TYPE(16) \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \
- left = temp0.s02468ace; \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \
- middle = temp0.s02468ace; \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \
- right = temp0.s02468ace; \
- })
-#else /* CONV_STRIDE_X */
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- VEC_TYPE(16) \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \
- VEC_TYPE(8) \
- temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \
- left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \
- \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \
- temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))); \
- middle = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \
- \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \
- temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))); \
- right = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \
- })
-
-#endif /* CONV_STRIDE_X */
-#endif /*DILATION_X==1*/
-/** This function computes the quantized 3x3 depthwise convolution using the dot product instructions when the data layout is NCHW.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32
- * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)
- * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
- * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32
- * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)
- * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void dwc_3x3_native_quantized8_dot8_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
- VECTOR_DECLARATION(output_multipliers),
- VECTOR_DECLARATION(output_shifts)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z;
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);
- Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
-
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- const int bias_value = *((__global int *)(vector_offset(&biases, channel)));
-#endif //defined(HAS_BIAS)
-
- // Load the relevant input and weights data (accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
- src_addr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
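- // Illustration of the indexing above (values assumed): with DEPTH_MULTIPLIER == 2
- // and DST_CHANNELS == 8, output channel 5 reads input channel 5 / 2 == 2, so the
- // (channel - channel / DEPTH_MULTIPLIER) term rewinds 3 z-steps; the batch term
- // undoes the (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) extra
- // planes counted per batch when get_global_id(2) advanced in output-channel units.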
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
-
- VEC_TYPE(3)
- w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y));
- VEC_TYPE(3)
- w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y));
- VEC_TYPE(3)
- w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y));
-
- const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0));
- const int output_shift = *((__global int *)vector_offset(&output_shifts, 0));
-
- VEC_TYPE(8)
- left0, middle0, right0;
- VEC_TYPE(8)
- left1, middle1, right1;
- VEC_TYPE(8)
- left2, middle2, right2;
-
- int8 values0 = 0;
- int8 sum0 = 0;
-
- GET_VALUES(src_addr + 0 * src_stride_y, left0, middle0, right0);
- GET_VALUES(src_addr + DILATION_Y * src_stride_y, left1, middle1, right1);
- GET_VALUES(src_addr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2);
-
-#if WEIGHTS_OFFSET != 0
- sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);
- sum0 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
- sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
-#endif /* WEIGHTS_OFFSET != 0 */
-
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- // If conv_stride_y equals 1, we compute two output rows
-
- VEC_TYPE(8)
- left3, middle3, right3;
- int8 values1 = 0;
- int8 sum1 = 0;
-
- GET_VALUES(src_addr + 3 * src_stride_y, left3, middle3, right3);
-
-#if WEIGHTS_OFFSET != 0
- sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
- sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
- sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);
-#endif /* WEIGHTS_OFFSET != 0 */
-#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
-
- ARM_DOT((VEC_TYPE(4))(left0.s0, middle0.s0, right0.s0, left1.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);
- ARM_DOT((VEC_TYPE(4))(middle1.s0, right1.s0, left2.s0, middle2.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);
- values0.s0 += right2.s0 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s1, middle0.s1, right0.s1, left1.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1);
- ARM_DOT((VEC_TYPE(4))(middle1.s1, right1.s1, left2.s1, middle2.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1);
- values0.s1 += right2.s1 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s2, middle0.s2, right0.s2, left1.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2);
- ARM_DOT((VEC_TYPE(4))(middle1.s2, right1.s2, left2.s2, middle2.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2);
- values0.s2 += right2.s2 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s3, middle0.s3, right0.s3, left1.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3);
- ARM_DOT((VEC_TYPE(4))(middle1.s3, right1.s3, left2.s3, middle2.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3);
- values0.s3 += right2.s3 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s4, middle0.s4, right0.s4, left1.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4);
- ARM_DOT((VEC_TYPE(4))(middle1.s4, right1.s4, left2.s4, middle2.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4);
- values0.s4 += right2.s4 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s5, middle0.s5, right0.s5, left1.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5);
- ARM_DOT((VEC_TYPE(4))(middle1.s5, right1.s5, left2.s5, middle2.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5);
- values0.s5 += right2.s5 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s6, middle0.s6, right0.s6, left1.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6);
- ARM_DOT((VEC_TYPE(4))(middle1.s6, right1.s6, left2.s6, middle2.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6);
- values0.s6 += right2.s6 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s7, middle0.s7, right0.s7, left1.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7);
- ARM_DOT((VEC_TYPE(4))(middle1.s7, right1.s7, left2.s7, middle2.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);
- values0.s7 += right2.s7 * w2.s2;
-
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- ARM_DOT((VEC_TYPE(4))(left1.s0, middle1.s0, right1.s0, left2.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);
- ARM_DOT((VEC_TYPE(4))(middle2.s0, right2.s0, left3.s0, middle3.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);
- values1.s0 += right3.s0 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s1, middle1.s1, right1.s1, left2.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1);
- ARM_DOT((VEC_TYPE(4))(middle2.s1, right2.s1, left3.s1, middle3.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1);
- values1.s1 += right3.s1 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s2, middle1.s2, right1.s2, left2.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2);
- ARM_DOT((VEC_TYPE(4))(middle2.s2, right2.s2, left3.s2, middle3.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2);
- values1.s2 += right3.s2 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s3, middle1.s3, right1.s3, left2.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3);
- ARM_DOT((VEC_TYPE(4))(middle2.s3, right2.s3, left3.s3, middle3.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3);
- values1.s3 += right3.s3 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s4, middle1.s4, right1.s4, left2.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4);
- ARM_DOT((VEC_TYPE(4))(middle2.s4, right2.s4, left3.s4, middle3.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4);
- values1.s4 += right3.s4 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s5, middle1.s5, right1.s5, left2.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5);
- ARM_DOT((VEC_TYPE(4))(middle2.s5, right2.s5, left3.s5, middle3.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5);
- values1.s5 += right3.s5 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s6, middle1.s6, right1.s6, left2.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6);
- ARM_DOT((VEC_TYPE(4))(middle2.s6, right2.s6, left3.s6, middle3.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6);
- values1.s6 += right3.s6 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s7, middle1.s7, right1.s7, left2.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);
- ARM_DOT((VEC_TYPE(4))(middle2.s7, right2.s7, left3.s7, middle3.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);
- values1.s7 += right3.s7 * w2.s2;
-#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
-
-#if defined(HAS_BIAS)
- values0 += (int8)(bias_value);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += (int8)(bias_value);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
-#endif //defined(HAS_BIAS)
-
-#if WEIGHTS_OFFSET != 0
- values0 += sum0 * (int8)(WEIGHTS_OFFSET);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += sum1 * (int8)(WEIGHTS_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
-#endif /* WEIGHTS_OFFSET != 0 */
-
-#if INPUT_OFFSET != 0
- WEIGHTS_PROMOTED_TYPE sum_weights = 0;
- VEC_WEIGHTS_PROMOTED_TYPE(3)
- tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3));
- sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
- values0 += sum_weights * (int8)(INPUT_OFFSET);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += sum_weights * (int8)(INPUT_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
-#endif /* INPUT_OFFSET != 0 */
-
-#if K_OFFSET != 0
- values0 += (int8)(K_OFFSET);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += (int8)(K_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
-#endif /* K_OFFSET != 0 */
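-
- // The offset corrections above complete the asymmetric-quantization expansion
- // of the accumulator: for each output element
- //   sum((x + INPUT_OFFSET) * (w + WEIGHTS_OFFSET))
- //     = sum(x * w)               (values0/values1, from the dot products)
- //     + WEIGHTS_OFFSET * sum(x)  (sum0/sum1 terms)
- //     + INPUT_OFFSET * sum(w)    (sum_weights term)
- //     + K_OFFSET                 (constant cross term, presumably
- //                                 9 * INPUT_OFFSET * WEIGHTS_OFFSET for 3x3)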
-
-#if defined(REAL_MULTIPLIER)
-
- values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
-
-#else // defined(REAL_MULTIPLIER)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
- int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
- values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-#if OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#else // OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#endif // OUTPUT_SHIFT < 0
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
-#endif // defined(REAL_MULTIPLIER)
-
- values0 += (int8)OUTPUT_OFFSET;
- VEC_TYPE(8)
- res0 = CONVERT_SAT(values0, VEC_TYPE(8));
-
- vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
-
-#if defined(REAL_MULTIPLIER)
-
- values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
-
-#else // defined(REAL_MULTIPLIER)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
- int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
- values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-#if OUTPUT_SHIFT < 0
- values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#else // OUTPUT_SHIFT < 0
- values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#endif // OUTPUT_SHIFT < 0
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
-#endif // defined(REAL_MULTIPLIER)
-
- values1 += (int8)OUTPUT_OFFSET;
- VEC_TYPE(8)
- res1 = CONVERT_SAT(values1, VEC_TYPE(8));
-
- vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
-}
-
-#endif // !defined(IS_DOT8)
-
-#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */
-
-#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
-
-#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)
-
-#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)) * CONVERT(y, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)), VEC_INT)
-
-#if WEIGHTS_OFFSET != 0
-#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \
- ({ \
- sum += CONVERT(x, VEC_INT); \
- MULTIPLY_ADD(x, y, acc); \
- })
-#else /* WEIGHTS_OFFSET != 0 */
-#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)
-#endif /* WEIGHTS_OFFSET != 0 */
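-
-/* MULTIPLY_ADD_ACCUMULATE fuses the row sum needed for the WEIGHTS_OFFSET
- * correction into the multiply-accumulate so the input is read only once.
- * Conceptually, per lane, when WEIGHTS_OFFSET != 0 it performs
- *
- *   sum += (int)x;
- *   acc += (int)x * (int)y;
- *
- * and collapses to the plain multiply-add otherwise.
- */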
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1) \
- ({ \
- ARM_DOT((VEC_TYPE(4))(val0, val1, val2, val3), w0.s0123, acc); \
- ARM_DOT((VEC_TYPE(4))(val4, val5, val6, val7), w0.s4567, acc); \
- acc += val8 * w1; \
- })
-
-#define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \
- ({ \
- sum = val0; \
- ARM_DOT((VEC_TYPE(4))(val1, val2, val3, val4), (VEC_TYPE(4))1, sum); \
- ARM_DOT((VEC_TYPE(4))(val5, val6, val7, val8), (VEC_TYPE(4))1, sum); \
- })
-
-#define DOT_PRODUCT_REDUCTION_WEIGHTS(sum, w0, w1) \
- ({ \
- sum = w1; \
- ARM_DOT(w0.s0123, (VEC_TYPE(4))1, sum); \
- ARM_DOT(w0.s4567, (VEC_TYPE(4))1, sum); \
- })
-
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
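-
-/* A scalar sketch of the dot-product reductions above (illustrative only): the
- * nine window values are summed with one move and two ARM_DOT instructions
- * against a vector of ones.
- *
- *   int sum = val0;
- *   sum += val1 + val2 + val3 + val4; // ARM_DOT((v1,v2,v3,v4), (1,1,1,1), sum)
- *   sum += val5 + val6 + val7 + val8; // ARM_DOT((v5,v6,v7,v8), (1,1,1,1), sum)
- */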
-
-#endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
-
-#endif // defined(WEIGHTS_PROMOTED_TYPE)
-
-#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
-
-#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER)
-/** This function computes the quantized depthwise convolution for the NHWC data layout.
- *
- * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
- * @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)
- * @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)
- * @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)
- * @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)
- * @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)
- * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
- * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
- * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)
- * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32
- * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)
- * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
- * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32
- * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)
- * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void dwc_MxN_native_quantized8_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
- VECTOR_DECLARATION(output_multipliers),
- VECTOR_DECLARATION(output_shifts)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif // defined(HAS_BIAS)
-)
-{
- int x_offs = max((int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0);
- int y = get_global_id(1); // spatial coordinate x
-#if defined(DST_DEPTH)
- int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
- int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
-
- __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE);
-
- __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z;
-
- __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * sizeof(WEIGHTS_TYPE) * (int)DEPTH_MULTIPLIER;
-
-#if defined(HAS_BIAS)
- __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;
-#endif // defined(HAS_BIAS)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- __global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;
- __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
-#if defined(DST_DEPTH)
- s_addr += b * src_stride_w;
- d_addr += b * dst_stride_w;
-#endif // defined(DST_DEPTH)
-
-#if DEPTH_MULTIPLIER > 1
- for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
- {
-#endif // DEPTH_MULTIPLIER > 1
- // Each work-item computes N0x1x1 elements
- VEC_INT res = 0;
-
- int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
- int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
-
- for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
- {
- if(y_coord >= 0 && y_coord < SRC_DIM2)
- {
- int x_coord_tmp = x_coord;
-
- for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
- {
- if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
- {
- int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;
- int w_offset = xk * weights_stride_y + yk * weights_stride_z;
-
- // Load input and weights values
- VEC_INT i = CONVERT(VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset)), VEC_INT);
- VEC_INT w = CONVERT(VLOAD(N0)(0, (__global WEIGHTS_TYPE *)(w_addr + w_offset)), VEC_INT);
-
- res += (i + (VEC_INT)INPUT_OFFSET) * (w + (VEC_INT)WEIGHTS_OFFSET);
- }
- x_coord_tmp += DILATION_X;
- }
- }
- y_coord += DILATION_Y;
- }
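-
- // Unlike the NCHW kernels above, the offsets are folded directly into the
- // multiply-accumulate: each tap contributes (x + INPUT_OFFSET) * (w + WEIGHTS_OFFSET)
- // in 32-bit. e.g. with INPUT_OFFSET == -128, WEIGHTS_OFFSET == 0, x == 130 and
- // w == 3, a tap adds (130 - 128) * (3 + 0) == 6 to the accumulator.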
-
-#if defined(HAS_BIAS)
- VEC_INT bias = VLOAD(N0)(0, (__global int *)(b_addr));
- res += bias;
-#endif // defined(HAS_BIAS)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- VEC_INT output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr));
- VEC_INT output_shift = VLOAD(N0)(0, (__global int *)(out_shift_addr));
-
- VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, output_multiplier, output_shift, N0);
- VEC_INT res_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0);
- res = select(res_shift_lt0, res_shift_gt0, (VEC_INT)(output_shift) >= 0);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-#if OUTPUT_SHIFT < 0
- res = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
-#else // OUTPUT_SHIFT < 0
- res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
-#endif // OUTPUT_SHIFT < 0
-#endif // defined(PER_CHANNEL_QUANTIZATION)
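-
- // Requantization: the 32-bit accumulator is scaled to the output quantization by
- // an integer multiplier plus shift pair encoding the real rescale factor
- // (typically input_scale * weights_scale / output_scale); the GREATER/LESS_THAN_ONE
- // variants handle rescale factors above and below 1.0 respectively.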
-
- res += (VEC_INT)OUTPUT_OFFSET;
-
- VEC_TYPE(VEC_SIZE)
- res0 = CONVERT_SAT(res, VEC_TYPE(VEC_SIZE));
- res0 = ACTIVATION_FUNC(res0);
-
- STORE_VECTOR_SELECT(res, DATA_TYPE, d_addr, N0, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-
-#if DEPTH_MULTIPLIER > 1
- w_addr += sizeof(WEIGHTS_TYPE);
- d_addr += sizeof(DATA_TYPE);
-#if defined(PER_CHANNEL_QUANTIZATION)
- out_mul_addr += sizeof(int);
- out_shift_addr += sizeof(int);
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-#if defined(HAS_BIAS)
- b_addr += sizeof(int);
-#endif // defined(HAS_BIAS)
- }
-#endif // DEPTH_MULTIPLIER > 1
-}
-#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER)
-#endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE)
diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl
deleted file mode 100644
index 127f67d940..0000000000
--- a/src/core/CL/cl_kernels/dequantization_layer.cl
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
-
-/** This performs the dequantization of 8-bit integers to floating point.
- *
- * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
- * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Quantization scale of input tensor is passed in with -DSCALE=scale.
- * @note Quantization offset of input tensor is passed in with -DOFFSET=offset.
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void dequantization_layer(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(LAST_ACCESSED_X)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
- output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
-
- // Load data
- VEC_DATA_TYPE(int, VEC_SIZE)
- val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
-
- // Create scale and offset vectors
- const VEC_DATA_TYPE(float, VEC_SIZE)
- vscale = SCALE;
-
- const VEC_DATA_TYPE(int, VEC_SIZE)
- voffset = OFFSET;
-
- // Dequantize
- VEC_DATA_TYPE(float, VEC_SIZE)
- res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE));
-
- // Store result
- VSTORE(VEC_SIZE)
- (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
-#else // !defined(LAST_ACCESSED_X)
- *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
-#endif // defined(LAST_ACCESSED_X)
-}
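-
-// Worked example of the affine dequantization above: with SCALE == 0.05f and
-// OFFSET == 128, a stored QASYMM8 value of 140 maps to 0.05f * (140 - 128) == 0.6f.
-// The scalar path under #else computes the same value one element at a time.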
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
-/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW)
- *
- * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
- * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] scale Pointer to buffer with the per channel quantized scales
- */
-__kernel void dequantization_layer_per_channel_nchw(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- __global float *scale)
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(LAST_ACCESSED_X)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
- output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
-
- // Load data
- VEC_DATA_TYPE(int, VEC_SIZE)
- val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
-
- // Create scale vectors
- const VEC_DATA_TYPE(float, VEC_SIZE)
- vscale = scale[get_global_id(2)];
-
- // Dequantize
- VEC_DATA_TYPE(float, VEC_SIZE)
- res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
-
- // Store result
- VSTORE(VEC_SIZE)
- (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
-#else // !defined(LAST_ACCESSED_X)
- *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]);
-#endif // defined(LAST_ACCESSED_X)
-}
-/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC)
- *
- * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
- * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] scale Pointer to buffer with the per channel quantized scales
- */
-__kernel void dequantization_layer_per_channel_nhwc(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- __global float *scale)
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(LAST_ACCESSED_X)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
- output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
- scale -= max(xi - (int)LAST_ACCESSED_X, 0);
-
- // Load data
- VEC_DATA_TYPE(int, VEC_SIZE)
- val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
-
- // Create scale vectors
- const VEC_DATA_TYPE(float, VEC_SIZE)
- vscale = VLOAD(VEC_SIZE)(0, &scale[xi]);
-
- // Dequantize
- VEC_DATA_TYPE(float, VEC_SIZE)
- res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
-
- // Store result
- VSTORE(VEC_SIZE)
- (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
-#else // !defined(LAST_ACCESSED_X)
- *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]);
-#endif // defined(LAST_ACCESSED_X)
-}
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
diff --git a/src/core/CL/cl_kernels/direct_convolution.cl b/src/core/CL/cl_kernels/direct_convolution.cl
deleted file mode 100644
index c5444cd7cc..0000000000
--- a/src/core/CL/cl_kernels/direct_convolution.cl
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "activation_float_helpers.h"
-#include "helpers.h"
-#include "helpers_asymm.h"
-#include "tile_helpers.h"
-
-//! @cond Doxygen_Suppress
-/** OpenCL kernel to compute the direct convolution.
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=half)
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
- * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
- * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
- * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
- * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
- * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
- * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
- * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
- * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
- * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
- * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
- * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
- * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
- * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
- * @note The zero value must be passed at compile time using -DZERO_VALUE (e.g. -DZERO_VALUE=0)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 1, 2, 3, 4, 5, ... n
- * - N0 = 2, 3, 4, 8, 16
- * - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
- *
- * @note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time:
- * - -DIS_QUANTIZED
- * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234
- * - The destination quantization shift e.g. -DDST_SHIFT=4
- * - The destination offset e.g. -DDST_OFFSET=4
- * - The source offset e.g. -DSRC_OFFSET=4
- * - The weights offset e.g. -DWEI_OFFSET=4
- * - The quantized zero value e.g. -DZERO_VALUE=4
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32/QASYMM8
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
- * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
- * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED)
- * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
- * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- */
-//! @endcond
-__kernel void direct_convolution_nhwc(
- TENSOR4D(src, SRC_TENSOR_TYPE),
- TENSOR4D(dst, DST_TENSOR_TYPE),
- TENSOR4D(wei, WEI_TENSOR_TYPE)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bia)
-#endif // defined(HAS_BIAS)
-)
-{
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _IWEI_WIDTH WEI_WIDTH
-#define _IWEI_HEIGHT WEI_HEIGHT
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _ISRC_CHANNELS SRC_CHANNELS
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _IDST_CHANNELS DST_CHANNELS
-#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
-
- // If quantized, the output tile has to be quantized first before being stored to global memory
-#if defined(IS_QUANTIZED)
-#define _IOUTPUT_TILE cq
-#else // defined(IS_QUANTIZED)
-#define _IOUTPUT_TILE c
-#endif // defined(IS_QUANTIZED)
-
- const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
- const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
- // .v = access the whole vector (OpenCL vector)
- // .s[x] = access the vector element at position x (scalar access)
- TILE(int, M0, 1, xi);
- TILE(int, M0, 1, yi);
-
- // Convert the linear index to coordinate
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- xi[i].v = ((mout + i) % _IDST_WIDTH) * STRIDE_X;
- yi[i].v = ((mout + i) / _IDST_WIDTH) * STRIDE_Y;
- xi[i].v -= PAD_LEFT;
- yi[i].v -= PAD_TOP;
- })
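-
- // Example of the conversion above (values assumed): with _IDST_WIDTH == 4,
- // STRIDE_X == STRIDE_Y == 2 and PAD_LEFT == PAD_TOP == 1, the linear index
- // mout + i == 5 maps to output (x, y) == (5 % 4, 5 / 4) == (1, 1), so the
- // top-left input tap is xi == 1 * 2 - 1 == 1, yi == 1 * 2 - 1 == 1.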
-
- // Initialize the accumulators
- TILE(ACC_DATA_TYPE, M0, N0, c);
-
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- c[i].v = 0;
- })
-
- for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
- {
- int ck = 0;
- int xk = i % _IWEI_WIDTH;
- int yk = i / _IWEI_WIDTH;
-
- int k = 0;
- for(; k <= (_ISRC_CHANNELS - K0); k += K0)
- {
- TILE(SRC_DATA_TYPE, M0, K0, a);
- TILE(WEI_DATA_TYPE, N0, K0, b);
-
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- a[i].v = ZERO_VALUE;
- })
-
- // Load tile from the src tensor
- T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a);
-
- // Load tile from the weights tensor
- T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
-
- // Compute the matrix multiplication between two tiles
- T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
-
- // Apply the offset correction (correction usually needed for asymmetric quantized computation)
- // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
- T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
-
- ck += K0;
- }
-
- // We deliberately use SRC_CHANNELS rather than _ISRC_CHANNELS
- // This #if directive should be removed in case of dynamic tensor support
-#if((SRC_CHANNELS % K0) != 0)
- // Left-over accumulations
- for(; k < _ISRC_CHANNELS; ++k)
- {
- TILE(SRC_DATA_TYPE, M0, 1, a);
- TILE(WEI_DATA_TYPE, N0, 1, b);
-
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- a[i].v = ZERO_VALUE;
- })
-
- // Load tile from the src tensor
- T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a);
-
- // Load tile from the weights tensor
- // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
- T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
-
- // Compute the matrix multiplication between two tiles
- T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
-
- // Apply the offset correction (operation usually needed for asymmetric quantized computation)
- // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
- T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
-
- ++ck;
- }
-#endif // ((SRC_CHANNELS % K0) != 0)
- }
-
- // Offset correction required for the quantized asymmetric computation
- // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
- T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
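-
- // The constant term completes the same quantization expansion used by the
- // kernels above: sum((x + SRC_OFFSET) * (w + WEI_OFFSET)) needs the cross term
- // SRC_OFFSET * WEI_OFFSET once per accumulated tap, i.e.
- // _IWEI_WIDTH * _IWEI_HEIGHT * _ISRC_CHANNELS times in total; it folds to zero
- // when either offset is zero.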
-
-#if defined(HAS_BIAS)
- TILE(BIA_DATA_TYPE, 1, N0, bias0);
-
- T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);
-
- // c = c + bias[broadcasted]
- T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
-
-#endif // HAS_BIAS
-
- TILE(uint, M0, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
- dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
- })
-
- bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
-
-#if defined(IS_QUANTIZED)
-
- TILE(DST_DATA_TYPE, M0, N0, cq);
-
- // Quantize the tile
- T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
-#endif // defined(IS_QUANTIZED)
-
- // Apply activation
- T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE);
-
- // _IOUTPUT_TILE: c = fp32/fp16, cq=qasymm8
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
-
-#undef _IWEI_WIDTH
-#undef _IWEI_HEIGHT
-#undef _ISRC_WIDTH
-#undef _ISRC_HEIGHT
-#undef _ISRC_CHANNELS
-#undef _IDST_WIDTH
-#undef _IDST_HEIGHT
-#undef _IDST_CHANNELS
-#undef _IY_MULTIPLIER
-} \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/direct_convolution1x1.cl
deleted file mode 100644
index 8ab2d1d4ea..0000000000
--- a/src/core/CL/cl_kernels/direct_convolution1x1.cl
+++ /dev/null
@@ -1,316 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#undef CONVERT_SAT
-
-#define ADD_OP(a, b) ((a) + (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define CONVERT_SAT(a, b) ((a))
-
-#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if STRIDE_X == 3
-#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size
-#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)
-#elif STRIDE_X == 2
-#define INPUT_PIXEL(data_size) extract_input_stride2
-#elif STRIDE_X == 1
-#define INPUT_PIXEL(data_size) extract_input_stride1
-#else /* STRIDE_X not equal to 1, 2 or 3 */
-#error "Only strides 1, 2 and 3 are supported"
-#endif /* STRIDE_X == 3 */
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 1.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)
-{
- return vload8(0, input_pixel);
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 2.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp = vload16(0, input_pixel);
- return temp.s02468ace;
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 and 32-bit data size.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)
-{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- temp1 = vload4(0, input_pixel);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- temp2 = vload4(0, input_pixel + 6);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- temp3 = vload4(0, input_pixel + 12);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- temp4 = vload4(0, input_pixel + 18);
- return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 and 16-bit data size.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)
-{
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp1 = vload8(0, input_pixel);
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp2 = vload8(0, input_pixel + 8);
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp3 = vload8(0, input_pixel + 16);
- return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 and 8-bit data size.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp1 = vload16(0, input_pixel);
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp2 = vload16(0, input_pixel + 12);
- return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
-}
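The three stride-3 variants above differ only in how many elements each vload must fetch for a given data size; all are equivalent to this scalar reference (not part of the original file):

    inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_ref(__global const DATA_TYPE *p)
    {
        VEC_DATA_TYPE(DATA_TYPE, 8) out;
        out.s0 = p[0];  out.s1 = p[3];  out.s2 = p[6];  out.s3 = p[9];
        out.s4 = p[12]; out.s5 = p[15]; out.s6 = p[18]; out.s7 = p[21];
        return out;
    }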
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
- * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed at compile time.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution1x1(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-#endif /* defined(HAS_BIAS) */
-
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
- values = 0;
-
- const uint z_index = get_global_id(2);
-
- weights.ptr += z_index * weights_stride_w;
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
- DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
- values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel));
- src.ptr += src_stride_z;
- weights.ptr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))));
-#endif /* defined(HAS_BIAS) */
-
- vstore8(CONVERT_SAT(values, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
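The kernel above is specialised entirely at build time; a hypothetical host-side option string following the @note lines (values illustrative only; DATA_TYPE_PROMOTED is also consumed by the kernel body):

    const char *build_opts =
        "-DDATA_TYPE=float -DDATA_TYPE_PROMOTED=float "
        "-DDATA_SIZE=32 -DSTRIDE_X=1 -DWEIGHTS_DEPTH=64 -DHAS_BIAS";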
-
-#if defined(WEIGHTS_DEPTH)
-
-#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \
- ({ \
- acc.s0 = mad(src.s0, weight_value, acc.s0); \
- acc.s1 = mad(src.s1, weight_value, acc.s1); \
- acc.s2 = mad(src.s2, weight_value, acc.s2); \
- acc.s3 = mad(src.s3, weight_value, acc.s3); \
- })
-
-/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32
- *
- * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used, -DHAS_BIAS must be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution1x1_f32_bifrost(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- // Get the kernel index
- const int kernel_index = get_global_id(2);
-
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- float4 acc0 = 0.0f;
- float4 acc1 = 0.0f;
- float4 acc2 = 0.0f;
- float4 acc3 = 0.0f;
-
- __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
- {
- // Load the weights
- float weight = *((__global float *)weights_addr);
-
- // Load values from rows 0 to 3 of the input tensor
- float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
- float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
- float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
- float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
-
- CONVOLUTION1x1_BIFROST(acc0, src0, weight);
- CONVOLUTION1x1_BIFROST(acc1, src1, weight);
- CONVOLUTION1x1_BIFROST(acc2, src2, weight);
- CONVOLUTION1x1_BIFROST(acc3, src3, weight);
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
-
- acc0.s0 += bias;
- acc0.s1 += bias;
- acc0.s2 += bias;
- acc0.s3 += bias;
- acc1.s0 += bias;
- acc1.s1 += bias;
- acc1.s2 += bias;
- acc1.s3 += bias;
- acc2.s0 += bias;
- acc2.s1 += bias;
- acc2.s2 += bias;
- acc2.s3 += bias;
- acc3.s0 += bias;
- acc3.s1 += bias;
- acc3.s2 += bias;
- acc3.s3 += bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
- vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
- vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
-}
-#endif // defined(WEIGHTS_DEPTH)
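In scalar terms, each work-item of the Bifrost 1x1 kernel above produces a 4x4 output tile; a sketch with hypothetical plain arrays (src[d][row][col], one weight per input channel), not the library's padded tensor layout:

    for(int r = 0; r < 4; ++r)          // output rows, one per accN
        for(int c = 0; c < 4; ++c)      // output columns, the four float4 lanes
        {
            float sum = 0.0f;
            for(int d = 0; d < WEIGHTS_DEPTH; ++d)
                sum += src[d][y + r][x + c] * w[d];
            dst[y + r][x + c] = sum;    // bias is added on top under -DHAS_BIAS
        }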
diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/direct_convolution3x3.cl
deleted file mode 100644
index 811df053c4..0000000000
--- a/src/core/CL/cl_kernels/direct_convolution3x3.cl
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#undef CONVERT_SAT
-
-#define ADD_OP(a, b) ((a) + (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define CONVERT_SAT(a, b) ((a))
-
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X == 2 */
-
-#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 3) \
- weights_values0 = vload3(0, weights_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- src0 = vload8(0, src_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- src1 = vload2(0, src_row_ptr + 8); \
- \
- acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
- })
-
-#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 3) \
- weights_values0 = vload3(0, weights_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 16) \
- src0 = vload16(0, src_row_ptr); \
- DATA_TYPE src1 = *(src_row_ptr + 16); \
- \
- acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
- })
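Both macro variants implement the same 8-wide, 3-tap row convolution and differ only in the gather pattern; the scalar equivalent (a sketch, with acc, src_row and w as plain arrays):

    for(int k = 0; k < 8; ++k)          // 8 horizontally adjacent outputs
        for(int j = 0; j < 3; ++j)      // 3 filter taps
            acc[k] += src_row[k * STRIDE_X + j] * w[j];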
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note This OpenCL kernel works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution3x3(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
- values0 = 0;
-
- __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- const int kernel_index = get_global_id(2);
- weights_addr += kernel_index * weights_stride_w;
-
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- values0 = ADD_OP(values0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))));
-#endif /* defined(HAS_BIAS) */
-
- vstore8(CONVERT_SAT(values0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if defined(WEIGHTS_DEPTH)
-
-#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \
- ({ \
- acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0); \
- acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1); \
- acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2); \
- acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3); \
- acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0); \
- acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1); \
- acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2); \
- acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3); \
- acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1); \
- acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2); \
- acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3); \
- })
-
-/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32
- *
- * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used, -DHAS_BIAS must be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution3x3_f32_bifrost(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- // Get the kernel index
- const int kernel_index = get_global_id(2);
-
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- float4 values0 = 0;
- float4 values1 = 0;
- float4 values2 = 0;
-
- __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- // Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor
-
- for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
- {
- // Load the weights
- float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
- float4 src0;
- float2 src1;
-
- // Load values from row0 of input tensor
- src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4);
-
- CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row0);
-
- // Load values from row1 of input tensor
- src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row1);
- CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row0);
-
- // Load values from row2 of input tensor
- src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row2);
- CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row1);
- CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row0);
-
- // Load values from row3 of input tensor
- src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row2);
- CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row1);
-
- // Load values from row4 of input tensor
- src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row2);
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
-
- values0 += (float4)bias;
- values1 += (float4)bias;
- values2 += (float4)bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
- vstore4(values2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
-}
-#endif // defined(WEIGHTS_DEPTH)
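The row rotation in the loop above loads each input row once and reuses it for up to three output rows, each against a different weights row; the 4x3 tile it accumulates is equivalent to this sketch (hypothetical plain arrays):

    for(int r = 0; r < 3; ++r)          // output rows: values0..values2
        for(int ky = 0; ky < 3; ++ky)   // input row r + ky is the one shared across outputs
            for(int kx = 0; kx < 3; ++kx)
                for(int c = 0; c < 4; ++c)
                    values[r][c] += src[r + ky][c + kx] * w[ky][kx];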
diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl
deleted file mode 100644
index 59d668f0bf..0000000000
--- a/src/core/CL/cl_kernels/direct_convolution5x5.cl
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#undef CONVERT_SAT
-
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X == 2 */
-
-#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- weights_values0 = vload4(0, weights_row_ptr); \
- DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- src0 = vload8(0, src_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- src1 = vload4(0, src_row_ptr + 8); \
- \
- acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
- })
-
-#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- weights_values0 = vload4(0, weights_row_ptr); \
- DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
- VEC_DATA_TYPE(DATA_TYPE, 16) \
- src0 = vload16(0, src_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- src1 = vload4(0, src_row_ptr + 16); \
- acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
- \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
- })
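As a sanity check on the gather widths in these macros: an N0-wide row convolution touches (N0 - 1) * stride + kernel_w input elements; a small helper illustrating the arithmetic (not library code):

    inline int row_window_elems(int n0, int stride_x, int kernel_w)
    {
        // (8 - 1) * 2 + 5 = 19 for CONVOLUTION1x5_STRIDE2, which is why it fetches
        // vload16 + vload4 (20 elements) and simply ignores the last one.
        return (n0 - 1) * stride_x + kernel_w;
    }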
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution5x5(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- values0 = 0;
-
- __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- const int kernel_index = get_global_id(2);
- weights_addr += kernel_index * weights_stride_w;
-
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index)));
-#endif /* defined(HAS_BIAS) */
-
- vstore8(values0, 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if defined(WEIGHTS_DEPTH)
-
-#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \
- ({ \
- acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0); \
- acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1); \
- acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2); \
- acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3); \
- acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0); \
- acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1); \
- acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2); \
- acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3); \
- acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0); \
- acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1); \
- acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2); \
- acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3); \
- acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0); \
- acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1); \
- acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2); \
- acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3); \
- acc.s0 = mad(src0.s4, weights_row01, acc.s0); \
- acc.s1 = mad(src0.s5, weights_row01, acc.s1); \
- acc.s2 = mad(src0.s6, weights_row01, acc.s2); \
- acc.s3 = mad(src0.s7, weights_row01, acc.s3); \
- })
-
-/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32
- *
- * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution5x5_f32_bifrost(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- // Get the kernel index
- const int kernel_index = get_global_id(2);
-
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- float4 values0 = 0.0f;
- float4 values1 = 0.0f;
-
- __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
-
- for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
- {
- // Load the weights from row0 and row1
- float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4);
- float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4);
- float8 src0;
-
- // Load values from row0 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y));
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
-
- // Load values from row1 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y));
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
-
- // Load values from row2 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y));
-
- // Load weights from row2
- weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y));
- weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);
-
- // Load values from row3 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y));
-
- // Load weights from row3
- weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y));
- weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
-
- // Load values from row4 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y));
-
- // Load weights from row4
- weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y));
- weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4);
-
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);
-
- // Load values from row5 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y));
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index)));
-
- values0 += bias;
- values1 += bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
-}
-#endif // defined(WEIGHTS_DEPTH)
diff --git a/src/core/CL/cl_kernels/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_quantized.cl
deleted file mode 100644
index b80d4f587e..0000000000
--- a/src/core/CL/cl_kernels/direct_convolution_quantized.cl
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-#undef CONVERT_SAT_STR
-#undef CONVERT_SAT
-
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
-
-#define CONVERT_SAT_STR(x, type) (convert_##type##8_sat((x)))
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
-
-#if KERNEL_SIZE == 9
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2
-#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X */
-
-#define CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \
- int weights_value1 = convert_int(*(weights_row_ptr + 8)); \
- int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
- acc += (src0.lo + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s1234, src0.s5678) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s2345, src0.s6789) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s3456, src0.s789A) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s4567, src0.s89AB) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s5678, src0.s9ABC) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s6789, src0.sABCD) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s789A, src0.sBCDE) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s89AB, src0.sCDEF) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \
- })
-
-#define CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \
- int weights_value1 = convert_int(*(weights_row_ptr + 8)); \
- int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
- int8 src1 = convert_int8(vload8(0, src_row_ptr + 16)); \
- acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s468A, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s579B, src0.sDF, src1.s13) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s68AC, src0.sE, src1.s024) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s79BD, src0.sF, src1.s135) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s8ACE, src1.s0246) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \
- })
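Every accumulate line above follows the asymmetric-quantization identity: the offsets (typically the negated zero-points) are added back before the widened integer multiply. One tap in isolation (a scalar sketch; INPUT_OFFSET and WEIGHTS_OFFSET are the build-time defines documented further down):

    inline int acc_tap(int acc, int q_src, int q_w)
    {
        return acc + (q_src + INPUT_OFFSET) * (q_w + WEIGHTS_OFFSET);
    }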
-
-#elif KERNEL_SIZE == 5
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2
-#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X */
-
-#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \
- int weights_value1 = convert_int(*(weights_row_ptr + 4)); \
- int8 src0 = convert_int8(vload8(0, src_row_ptr)); \
- int4 src1 = convert_int4(vload4(0, src_row_ptr + 8)); \
- acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s345, src0.s67, src1.s012) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s45, src0.s67, src1.s0123) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \
- })
-
-#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \
- int weights_value1 = convert_int(*(weights_row_ptr + 4)); \
- int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
- int4 src1 = convert_int4(vload4(0, src_row_ptr + 16)); \
- acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \
- })
-
-#elif KERNEL_SIZE == 3
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2
-#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X */
-
-#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \
- int8 src0 = convert_int8(vload8(0, src_row_ptr)); \
- int2 src1 = convert_int2(vload2(0, src_row_ptr + 8)); \
- acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
- })
-
-#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \
- int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
- int src1 = convert_int(*(src_row_ptr + 16)); \
- acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
- acc += ((int8)(src0.s2468, src0.sACE, src1) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
- })
-
-#elif KERNEL_SIZE == 1
-
-#if STRIDE_X == 3
-#define INPUT_VALUE extract_input_stride3
-#elif STRIDE_X == 2
-#define INPUT_VALUE extract_input_stride2
-#elif STRIDE_X == 1
-#define INPUT_VALUE extract_input_stride1
-
-#else /* STRIDE_X not equal to 1, 2 or 3 */
-#error "Only support strides 1, 2 and 3"
-#endif /* STRIDE_X */
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 1.
- *
- * @param[in] input_value Pointer to the first value.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_value)
-{
- return vload8(0, input_value);
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 2.
- *
- * @param[in] input_value Pointer to the first value.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_value)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp = vload16(0, input_value);
- return temp.s02468ace;
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 and 8-bit data size.
- *
- * @param[in] input_value Pointer to the first value.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3(__global const DATA_TYPE *input_value)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp1 = vload16(0, input_value);
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp2 = vload16(0, input_value + 12);
- return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
-}
-
-#else /* KERNEL_SIZE not equal to 1, 3, 5 or 9 */
-#error "Only kernel sizes 1, 3, 5 and 9 are supported"
-#endif /* KERNEL_SIZE */
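Since this one file covers all four kernel sizes through the compile-time switch above, the host picks a specialisation purely via build options; a hypothetical example reusing the placeholder values from the @notes below (WEIGHTS_DEPTH illustrative):

    const char *build_opts =
        "-DDATA_TYPE=uchar -DKERNEL_SIZE=3 -DSTRIDE_X=1 -DWEIGHTS_DEPTH=32 "
        "-DINPUT_OFFSET=3 -DWEIGHTS_OFFSET=3 -DOUTPUT_OFFSET=3 "
        "-DOUTPUT_MULTIPLIER=1234 -DOUTPUT_SHIFT=4 -DHAS_BIAS";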
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
- * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
- * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. -DINPUT_OFFSET=3
- * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3
- * @note The destination offset quantization parameter must be passed at compile time using -DOUTPUT_OFFSET e.g. -DOUTPUT_OFFSET=3
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: S32
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution_quantized(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- int8 values0 = 0;
-
- __global DATA_TYPE *weights_addr = (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 0, 0);
- __global DATA_TYPE *src_addr = (__global DATA_TYPE *)offset(&src, 0, 0);
-
- const int kernel_index = get_global_id(2);
- weights_addr += kernel_index * weights_stride_w;
-
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
-#if KERNEL_SIZE == 9
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 5 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 6 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 7 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 8 * weights_stride_y));
-#elif KERNEL_SIZE == 5
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
-#elif KERNEL_SIZE == 3
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
-#elif KERNEL_SIZE == 1
- int weight = convert_int(*(__global DATA_TYPE *)weights_addr);
- int8 input_value = convert_int8(INPUT_VALUE((__global DATA_TYPE *)src_addr));
- values0 += (input_value + INPUT_OFFSET) * ((int8)weight + WEIGHTS_OFFSET);
-#endif /* (KERNEL_SIZE == 1) || (KERNEL_SIZE == 3) || (KERNEL_SIZE == 5) || (KERNEL_SIZE == 9) */
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
- __global int *bias_addr = ((__global int *)(vector_offset(&biases, kernel_index)));
- values0 += (int8)(*bias_addr);
-#endif /* defined(HAS_BIAS) */
-
-#if OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#else // OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#endif // OUTPUT_SHIFT < 0
- values0 = values0 + OUTPUT_OFFSET;
-
- vstore8(CONVERT_SAT(values0, DATA_TYPE), 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
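
The ASYMM_MULT_BY_QUANT_MULTIPLIER_* helpers used above implement the usual
gemmlowp-style fixed-point rescale of the int32 accumulator (the OUTPUT_SHIFT < 0
branch additionally pre-shifts the accumulator left before the multiply). Below
is a scalar C sketch of the OUTPUT_SHIFT >= 0 path; the function names are
hypothetical, and the INT32_MIN * INT32_MIN overflow guard of the real helpers
is omitted for brevity:

    #include <stdint.h>

    /* Multiply a and b, keeping the rounded, doubled high 32 bits of the product. */
    static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        int64_t ab    = (int64_t)a * (int64_t)b;
        int64_t nudge = ab >= 0 ? (1LL << 30) : 1 - (1LL << 30);
        return (int32_t)((ab + nudge) >> 31);
    }

    /* Arithmetic right shift with round-to-nearest, ties away from zero. */
    static int32_t rounding_right_shift(int32_t x, int shift)
    {
        int32_t mask      = (1 << shift) - 1;
        int32_t remainder = x & mask;
        int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> shift) + (remainder > threshold ? 1 : 0);
    }

    /* One lane of values0 -> quantized output, before the final CONVERT_SAT/vstore8. */
    static int32_t requantize(int32_t acc, int32_t multiplier, int shift, int32_t offset)
    {
        return rounding_right_shift(rounding_doubling_high_mul(acc, multiplier), shift) + offset;
    }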
diff --git a/src/core/CL/cl_kernels/elementwise_operation.cl b/src/core/CL/cl_kernels/elementwise_operation.cl
deleted file mode 100644
index c8250045dc..0000000000
--- a/src/core/CL/cl_kernels/elementwise_operation.cl
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT)
-
-/** List of all the operations supported by this kernel.
- * @note ADD and SUB operations, when executed on integers, support saturation */
-#ifdef SATURATE
-#define ADD(x, y) add_sat((x), (y))
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define ADD(x, y) (x) + (y)
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
-
-#define MAX(x, y) max(x, y)
-#define MIN(x, y) min(x, y)
-#define SQUARED_DIFF(x, y) (x - y) * (x - y)
-#define POWER(x, y) pow(x, y)
-
-#if VEC_SIZE_OUT == 1
-#define PRELU(x, y) (x > 0 ? x : x * y)
-#else // VEC_SIZE_OUT == 1
-#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_OUT)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))))
-#endif // VEC_SIZE_OUT == 1
-
-#if defined(S32)
-#define DIV(x, y) CONVERT(floor(CONVERT(x, VEC_DATA_TYPE(float, VEC_SIZE_OUT)) / CONVERT(y, VEC_DATA_TYPE(float, VEC_SIZE_OUT))), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))
-#else /* S32 */
-#define DIV(x, y) (x / y)
-#endif /* S32 */
-
-#define AND(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))1))
-#define OR(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))1))
-
-#define OP_FUN_NAME_STR(op) elementwise_operation_##op
-#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
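-
-/* Example (added for illustration): building with -DOP=ADD -DSATURATE makes
- * OP_FUN_NAME(OP) expand to the kernel name elementwise_operation_ADD, and
- * OP(in_a, in_b) in the kernel body below expand to add_sat(in_a, in_b). */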
-
-#if defined(ACTIVATION_TYPE)
-#include "activation_float_helpers.h"
-#endif // defined(ACTIVATION_TYPE)
-
-/** This function executes an element-wise operation between two tensors.
- *
- * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT.
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER, e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE_OUT
- * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
- * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
- *
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
- * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
- * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void OP_FUN_NAME(OP)(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
-#if VEC_SIZE_IN1 == 1
- uint in1_x_offs = 0;
-#else // VEC_SIZE_IN1 == 1
- uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0);
-#endif // VEC_SIZE_IN1 == 1
-#if VEC_SIZE_IN2 == 1
- uint in2_x_offs = 0;
-#else // VEC_SIZE_IN2 == 1
- uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0);
-#endif // VEC_SIZE_IN2 == 1
- uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
-
- // Get pixels pointer
- __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE_IN1) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z;
- __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE_IN2) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z;
- __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
-
- // Load values
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
- in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr)), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT));
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
- in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr)), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT));
-
- // Calculate and store result
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
- res0 = OP(in_a, in_b);
-#if defined(ACTIVATION_TYPE)
- res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE_OUT, res0, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) */
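
Everything in the kernel above is resolved at compile time through the -D
options listed in its @note block. A hypothetical set of build options (the
values are chosen purely for illustration) and what they produce:

    /* Hypothetical build options for an element-wise saturating add: */
    const char *build_opts =
        "-DOP=ADD -DSATURATE "
        "-DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short "
        "-DVEC_SIZE_IN1=16 -DVEC_SIZE_IN2=16 -DVEC_SIZE_OUT=16 -DVEC_SIZE_LEFTOVER=0";
    /* OP_FUN_NAME(OP) then names the kernel elementwise_operation_ADD, and
     * OP(in_a, in_b) in its body expands to add_sat(in_a, in_b). */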
diff --git a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
deleted file mode 100644
index a08c3b2d47..0000000000
--- a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#define SUB(x, y) (x - y)
-#define ADD(x, y) (x + y)
-#define MAX(x, y) max((x), (y))
-#define MIN(x, y) min((x), (y))
-#define SQUARED_DIFF(x, y) (x - y) * (x - y)
-#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_OUT)0), SELECT_VEC_DATA_TYPE(float, VEC_SIZE_OUT))))
-#define DIV(x, y) (x / y)
-
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-
-#define OP_FUN_NAME_STR(op) elementwise_operation_##op##_quantized
-#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
-
-#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT)
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT)
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
-
-/** This function executes an element-wise operation between two tensors.
- *
- * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT.
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER, e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE_OUT
- * @note In case of broadcasting along the X dimension the proper preprocessor argument should be passed depending on the input (e.g. -DIS_IN1_X_BROADCASTING, -DIS_IN2_X_BROADCASTING)
- * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, e.g. -DOFFSET_IN1=10
- * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, e.g. -DOFFSET_IN2=10
- * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, e.g. -DOFFSET_OUT=10
- * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, e.g. -DSCALE_IN1=10
- * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, e.g. -DSCALE_IN2=10
- * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, e.g. -DSCALE_OUT=10
- * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
- * @note For QSYMM16 operations OFFSET_IN1, OFFSET_IN2 and OFFSET_OUT must be set to zero
- * @note The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar
- *
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8/QSYMM16
- * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr
- * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p in1_ptr
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void OP_FUN_NAME(OP)(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
-#if VEC_SIZE_IN1 == 1
- uint in1_x_offs = 0;
-#else // VEC_SIZE_IN1 == 1
- uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0);
-#endif // VEC_SIZE_IN1 == 1
-#if VEC_SIZE_IN2 == 1
- uint in2_x_offs = 0;
-#else // VEC_SIZE_IN2 == 1
- uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0);
-#endif // VEC_SIZE_IN2 == 1
- uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
-
- // Get pixels pointer
- __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z;
- __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z;
- __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
-
- VEC_INT in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT);
- VEC_INT in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_OUT *)in2_addr)), VEC_INT);
-
- in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
- in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
-
- const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
- const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
- const VEC_FLOAT qresf32 = OP(in1f32, in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
- const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
-
- // Store result
- STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) */
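
The body above is the standard dequantize-compute-requantize pattern. A scalar
C equivalent for OP = ADD on QASYMM8 (uchar) data; the function name and the
0..255 saturation bounds are assumptions for that specific case:

    #include <math.h>
    #include <stdint.h>

    static uint8_t quantized_add(uint8_t a, uint8_t b,
                                 int off1, float scale1,   /* OFFSET_IN1 / SCALE_IN1 */
                                 int off2, float scale2,   /* OFFSET_IN2 / SCALE_IN2 */
                                 int off_out, float scale_out)
    {
        float fa = ((int)a - off1) * scale1;        /* in_a -> in1f32        */
        float fb = ((int)b - off2) * scale2;        /* in_b -> in2f32        */
        float fr = (fa + fb) / scale_out + off_out; /* OP(...), then qresf32 */
        long  q  = lrintf(fr);                      /* CONVERT_DOWN with rte */
        if (q < 0)   q = 0;                         /* CONVERT_SAT to uchar  */
        if (q > 255) q = 255;
        return (uint8_t)q;
    }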
diff --git a/src/core/CL/cl_kernels/elementwise_unary.cl b/src/core/CL/cl_kernels/elementwise_unary.cl
deleted file mode 100644
index d2d9d97d33..0000000000
--- a/src/core/CL/cl_kernels/elementwise_unary.cl
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-#if defined(DATA_TYPE) && defined(OPERATION)
-
-// Calculate exponential
-#define exp_op(input) exp(input)
-// Calculate reverse square root
-#define rsqrt_op(input) rsqrt(input)
-// Calculate negative
-#define neg_op(input) (-input)
-// Calculate sine
-#define sin_op(input) sin(input)
-// Calculate abs for floating point values
-#define fabs_op(input) fabs(input)
-// Calculate natural_log
-#define natural_log_op(input) log(input)
-// Calculate round (cannot use the built-in round function as it rounds halfway cases away from zero).
-#if defined(VEC_SIZE)
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, VEC_DATA_TYPE(int, VEC_SIZE), rte), VEC_TYPE)
-#define logical_not_op(input) CONVERT(CONVERT(!input, VEC_TYPE) & ((VEC_TYPE)0x1), VEC_TYPE)
-#else // defined(VEC_SIZE)
-#define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, int, rte), DATA_TYPE)
-#define logical_not_op(input) ((!input) & 0x1)
-#endif // defined(VEC_SIZE)
-
-/** Applies an element-wise unary operator to a tensor.
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: F16/F32.
- * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in_step_z in_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: F16/F32.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- */
-__kernel void elementwise_unary(
- TENSOR3D_DECLARATION(in),
- TENSOR3D_DECLARATION(out))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- in.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * in_stride_x;
- out.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * out_stride_x;
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
-
- VSTORE(VEC_SIZE)
- (OPERATION(data), 0, (__global DATA_TYPE *)out.ptr);
-#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
- *((__global DATA_TYPE *)(out.ptr)) = (DATA_TYPE)(OPERATION(*((__global DATA_TYPE *)in.ptr)));
-#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
-}
-#endif // defined(DATA_TYPE) && defined(OPERATION)
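
The VEC_SIZE/LAST_ACCESSED_X branch above avoids a scalar tail loop by sliding
the last vector back so that it ends exactly at the buffer edge; the overlapped
elements are simply recomputed, which is harmless for a pure element-wise
operator. In index form (a sketch, assuming LAST_ACCESSED_X is the last valid
starting x for a full vector; the helper name is hypothetical):

    /* Clamp a work item's starting index so a full vector access stays in bounds. */
    static int clamped_start(int gid_x, int vec_size, int last_accessed_x)
    {
        int xi   = gid_x * vec_size;       /* first element this work item wants */
        int over = xi - last_accessed_x;   /* distance past the last valid start */
        return over > 0 ? xi - over : xi;  /* slide back; overlap is recomputed  */
    }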
diff --git a/src/core/CL/cl_kernels/fft.cl b/src/core/CL/cl_kernels/fft.cl
deleted file mode 100644
index 51763a620a..0000000000
--- a/src/core/CL/cl_kernels/fft.cl
+++ /dev/null
@@ -1,1880 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE)
-/** Calculates and applies the twiddle factor to a given input.
- *
- * @param[in] phi The angle.
- * @param[in,out] input The input on which the factor should be applied.
- */
-#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input) \
- { \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- w, tmp; \
- w.x = cos(phi); \
- w.y = sin(phi); \
- tmp.x = (w.x * input.x) - (w.y * input.y); \
- tmp.y = (w.x * input.y) + (w.y * input.x); \
- input = tmp; \
- }
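-
-/* Worked example (added for illustration): the twiddle factor is
- * w = e^(i * phi) = (cos(phi), sin(phi)) and the block above is a complex
- * multiply, i.e. a rotation of `input` by phi. For phi = pi/2 and
- * input = (1, 0): w = (0, 1) and tmp = (0*1 - 1*0, 0*0 + 1*1) = (0, 1). */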
-
-/** Computes radix-2 butterfly unit.
- *
- * @param[in,out] c0 Complex input 0.
- * @param[in,out] c1 Complex input 1.
- */
-#define DFT_2(c0, c1) \
- { \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- v0; \
- v0 = c0; \
- c0 = v0 + c1; \
- c1 = v0 - c1; \
- }
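-
-/* Worked example (added for illustration): DFT_2 is the 2-point butterfly
- * (sum, difference): for c0 = (1, 0) and c1 = (0, 1) it yields c0 = (1, 1)
- * and c1 = (1, -1). */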
-
-// radix-3 butterfly unit factors
-#define SQRT3DIV2 0.86602540378443f
-
-/** Computes radix-3 butterfly unit.
- *
- * @param[in,out] c0 Complex input 0.
- * @param[in,out] c1 Complex input 1.
- * @param[in,out] c2 Complex input 2.
- */
-#define DFT_3(c0, c1, c2) \
- { \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- v0 = c1 + c2; \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- v1 = c1 - c2; \
- c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \
- c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \
- c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \
- c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \
- c0 = c0 + v0; \
- }
-
-/** Computes radix-4 butterfly unit.
- *
- * @param[in,out] c0 Complex input 0.
- * @param[in,out] c1 Complex input 1.
- * @param[in,out] c2 Complex input 2.
- * @param[in,out] c3 Complex input 3.
- */
-#define DFT_4(c0, c1, c2, c3) \
- { \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- v0, v1, v2, v3; \
- v0 = c0 + c2; \
- v1 = c1 + c3; \
- v2 = c0 - c2; \
- v3.x = c1.y - c3.y; \
- v3.y = c3.x - c1.x; \
- c0 = v0 + v1; \
- c2 = v0 - v1; \
- c1 = v2 + v3; \
- c3 = v2 - v3; \
- }
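-
-/* Note (added for illustration): v3 is -i * (c1 - c3); writing d = c1 - c3 =
- * (a, b), -i * (a + ib) = b - ia = (b, -a), matching v3.x = c1.y - c3.y and
- * v3.y = c3.x - c1.x. The radix-4 stage therefore uses only additions,
- * subtractions and component swaps, with no real multiplications. */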
-
-// radix-5 butterfly unit factors
-#define W5_A (DATA_TYPE)0.30901699437494f
-#define W5_B (DATA_TYPE)0.95105651629515f
-#define W5_C (DATA_TYPE)0.80901699437494f
-#define W5_D (DATA_TYPE)0.58778525229247f
-
-/** Computes radix-5 butterfly unit.
- *
- * @param[in,out] c0 Complex input 0.
- * @param[in,out] c1 Complex input 1.
- * @param[in,out] c2 Complex input 2.
- * @param[in,out] c3 Complex input 3.
- * @param[in,out] c4 Complex input 4.
- */
-#define DFT_5(c0, c1, c2, c3, c4) \
- { \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- v0, v1, v2, v3, v4; \
- v0 = c0; \
- v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \
- v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \
- v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \
- v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \
- c0 = v0 + c1 + c2 + c3 + c4; \
- c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \
- c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v3.y, -v3.x); \
- c3 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v3.y, v3.x); \
- c4 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \
- }
-
-// radix-7 butterfly unit factors
-#define W7_A (DATA_TYPE)0.62348980185873f
-#define W7_B (DATA_TYPE)0.78183148246802f
-#define W7_C (DATA_TYPE)0.22252093395631f
-#define W7_D (DATA_TYPE)0.97492791218182f
-#define W7_E (DATA_TYPE)0.90096886790241f
-#define W7_F (DATA_TYPE)0.43388373911755f
-
-/** Computes radix-7 butterfly unit.
- *
- * @param[in,out] c0 Complex input 0.
- * @param[in,out] c1 Complex input 1.
- * @param[in,out] c2 Complex input 2.
- * @param[in,out] c3 Complex input 3.
- * @param[in,out] c4 Complex input 4.
- * @param[in,out] c5 Complex input 5.
- * @param[in,out] c6 Complex input 6.
- */
-#define DFT_7(c0, c1, c2, c3, c4, c5, c6) \
- { \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- v0, v1, v2, v3, v4, v5, v6; \
- v0 = c0; \
- v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \
- v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \
- v3 = W7_E * (c1 + c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \
- v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \
- v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \
- v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \
- c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6; \
- c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \
- c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v5.y, -v5.x); \
- c3 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v6.y, -v6.x); \
- c4 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v6.y, v6.x); \
- c5 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v5.y, v5.x); \
- c6 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \
- }
-
-/** Computes radix-8 butterfly unit.
- *
- * @param[in,out] c0 Complex input 0.
- * @param[in,out] c1 Complex input 1.
- * @param[in,out] c2 Complex input 2.
- * @param[in,out] c3 Complex input 3.
- * @param[in,out] c4 Complex input 4.
- * @param[in,out] c5 Complex input 5.
- * @param[in,out] c6 Complex input 6.
- * @param[in,out] c7 Complex input 7.
- */
-#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \
- { \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- v0, v1, v2, v3, v4, v5, v6, v7; \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- s0, s1, s2, s3, s4, s5, s6, s7; \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- t0, t1, t2; \
- v0 = c0 + c4; \
- v1 = c1 + c5; \
- v2 = c2 + c6; \
- v3 = c3 + c7; \
- v4 = c0 - c4; \
- v5 = c1 - c5; \
- v6 = c2 - c6; \
- v7 = c3 - c7; \
- s0 = v0 + v2; \
- s1 = v1 + v3; \
- s2 = v0 - v2; \
- s3 = v1 - v3; \
- s4.x = v4.x - v6.y; \
- s4.y = v4.y + v6.x; \
- s5.x = v5.x - v7.y; \
- s5.y = v5.y + v7.x; \
- s6.x = v4.x + v6.y; \
- s6.y = v4.y - v6.x; \
- s7.x = v5.x + v7.y; \
- s7.y = v5.y - v7.x; \
- t0.x = -s3.y; \
- t0.y = s3.x; \
- t1.x = M_SQRT1_2_F * (s5.x - s5.y); \
- t1.y = M_SQRT1_2_F * (s5.x + s5.y); \
- t2.x = -M_SQRT1_2_F * (s7.x + s7.y); \
- t2.y = M_SQRT1_2_F * (s7.x - s7.y); \
- c0 = s0 + s1; \
- c1 = s6 - t2; \
- c2 = s2 - t0; \
- c3 = s4 - t1; \
- c4 = s0 - s1; \
- c5 = s6 + t2; \
- c6 = s2 + t0; \
- c7 = s4 + t1; \
- }
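-
-/* Note (added for illustration): t1 and t2 are s5 and s7 rotated by 45 and
- * 135 degrees, i.e. t1 = s5 * (1 + i) / sqrt(2) and t2 = s7 * (-1 + i) / sqrt(2),
- * so the radix-8 stage only adds the M_SQRT1_2_F scaling on top of the
- * add/sub/swap pattern of the smaller radices. */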
-
-/** Computes the first stage of a radix-2 DFT on axis 0.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
- */
-__kernel void fft_radix_2_first_stage_axis_0(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- // Load two complex input values
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data = vload4(0, (__global DATA_TYPE *)input.ptr);
-
- // Compute DFT N = 2
- DFT_2(data.s01, data.s23);
-
- // Store two complex output values
- vstore4(data, 0, (__global DATA_TYPE *)output.ptr);
-}
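-
-/* Note (added for illustration): complex values are stored interleaved as
- * (re, im) pairs of DATA_TYPE, so the vload4 above fetches two adjacent
- * complex points along x; the NDRange on axis 0 is therefore expected to
- * cover N/2 work items for an N-point first stage (host-side sizing,
- * inferred, not shown in this file). */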
-
-/** Computes the first stage of a radix-2 DFT on axis 1.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
- */
-__kernel void fft_radix_2_first_stage_axis_1(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- // Load two complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data1 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
-
- // Compute DFT N = 2
- DFT_2(data1, data2);
-
- // Store two complex output values
- vstore2(data1, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
-}
-
-/** Computes the first stage of a radix-3 DFT on axis 0.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_stride_z                       (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
- */
-__kernel void fft_radix_3_first_stage_axis_0(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- // Load three complex input values
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data0 = vload4(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2, 0, 0));
-
- // Compute DFT N = 3
- DFT_3(data0.s01, data0.s23, data1.s01);
-
- // Store three complex output values
- vstore4(data0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2, 0, 0));
-}
-
-/** Computes the first stage of a radix-3 DFT on axis 1.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_z                       (Optional) Stride of the destination tensor in Z dimension (in bytes)
- */
-__kernel void fft_radix_3_first_stage_axis_1(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- // Load three complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
-
- // Compute DFT N = 3
- DFT_3(data0, data1, data2);
-
- // Store three complex output values
- vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
- vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
-}
-
-/** Computes the first stage of a radix-4 DFT on axis 0.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
- */
-__kernel void fft_radix_4_first_stage_axis_0(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- // Load four complex input values
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data = vload8(0, (__global DATA_TYPE *)input.ptr);
-
- // Compute DFT N = 4
- DFT_4(data.s01, data.s23, data.s45, data.s67);
-
- // Store four complex output values
- vstore8(data, 0, (__global DATA_TYPE *)output.ptr);
-}
-
-/** Computes the first stage of a radix-4 DFT on axis 1.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
- */
-__kernel void fft_radix_4_first_stage_axis_1(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- // Load four complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
-
- // Compute DFT N = 4
- DFT_4(data0, data1, data2, data3);
-
- // Store four complex output values
- vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
- vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
- vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
-}
-
-/** Computes the first stage of a radix-5 DFT on axis 0.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
- */
-__kernel void fft_radix_5_first_stage_axis_0(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- // Load five complex input values
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data0 = vload8(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0));
-
- // Compute DFT N = 5
- DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01);
-
- // Store five complex output values
- vstore8(data0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0));
-}
-
-/** Computes the first stage of a radix-5 DFT on axis 1.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
- */
-__kernel void fft_radix_5_first_stage_axis_1(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- // Load five complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
-
- // Compute DFT N = 5
- DFT_5(data0, data1, data2, data3, data4);
-
- // Store five complex output values
- vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
- vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
- vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
- vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
-}
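
On axis 1 the five inputs sit on consecutive rows rather than consecutive columns, so the kernel issues five strided vload2 calls instead of one vload8 plus a vload2. The addressing that tensor3D_offset performs reduces to byte-stride arithmetic; a minimal sketch of that layout rule, with field names that are assumptions mirroring the Tensor3D helper struct:

#include <stddef.h>

/* Element (x, y, z) of a padded 3D tensor is located purely by byte strides. */
typedef struct
{
    unsigned char *ptr;      /* address of the first element   */
    size_t         stride_x; /* bytes between adjacent columns */
    size_t         stride_y; /* bytes between adjacent rows    */
    size_t         stride_z; /* bytes between adjacent slices  */
} Tensor3DRef;

static inline void *element_at(const Tensor3DRef *t, int x, int y, int z)
{
    return t->ptr + (size_t)x * t->stride_x
                  + (size_t)y * t->stride_y
                  + (size_t)z * t->stride_z;
}
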
-
-/** Computes the first stage of a radix-7 DFT on axis 0.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- */
-__kernel void fft_radix_7_first_stage_axis_0(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- // Load seven complex input values
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data0 = vload8(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data1 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6, 0, 0));
-
- // Compute DFT N = 7
- DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01);
-
- // Store seven complex output values
- vstore8(data0, 0, (__global DATA_TYPE *)output.ptr);
- vstore4(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0));
- vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6, 0, 0));
-}
-
-/** Computes the first stage of a radix-7 DFT on axis 1.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- */
-__kernel void fft_radix_7_first_stage_axis_1(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- // Load seven complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
-
- // Compute DFT N = 7
- DFT_7(data0, data1, data2, data3, data4, data5, data6);
-
- // Store seven complex output values
- vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
- vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
- vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
- vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
- vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0));
- vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0));
-}
-
-/** Computes the first stage of a radix-8 DFT on axis 0.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- */
-__kernel void fft_radix_8_first_stage_axis_0(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- // Load eight complex input values
- VEC_DATA_TYPE(DATA_TYPE, 16)
- data = vload16(0, (__global DATA_TYPE *)input.ptr);
-
- // Compute DFT N = 8
- DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF);
-
- // Store eight complex output values
- vstore16(data, 0, (__global DATA_TYPE *)output.ptr);
-}
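
The single vload16 above works because complex values are stored interleaved, real part first: element i occupies floats 2*i and 2*i + 1, so sixteen consecutive floats are exactly the eight complex inputs of the radix-8 butterfly (data.s01 is the first complex value, data.s23 the second, and so on). A small host-side sketch of reading that layout, using a hypothetical cfloat type:

/* Complex values are interleaved: element i is (data[2*i], data[2*i + 1]). */
typedef struct { float re, im; } cfloat;

static cfloat complex_at(const float *interleaved, int i)
{
    cfloat c = { interleaved[2 * i], interleaved[2 * i + 1] };
    return c;
}
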
-
-/** Computes the first stage of a radix-8 DFT on axis 1.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- */
-__kernel void fft_radix_8_first_stage_axis_1(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- // Load eight complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7, 0));
-
- // Compute DFT N = 8
- DFT_8(data0, data1, data2, data3, data4, data5, data6, data7);
-
- // Store eight complex output values
- vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
- vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
- vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
- vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
- vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0));
- vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0));
- vstore2(data7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7, 0));
-}
-
-/** Computes a stage of a radix-2 FFT on axis 0.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- * @param[in] Nx The butterfly span: the product of the radices of all previous stages
- * @param[in] Ni Nx * Ny, where Ny is the radix of the current stage.
- * @param[in] exp_const Exponent constant
- */
-__kernel void fft_radix_2_axis_0(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
- ,
- uint Nx, uint Ni, float exp_const)
-{
- // Each work-item computes a single radix-2 butterfly
- uint kx = get_global_id(0);
-
- // Compute nx
- uint nx = kx % Nx;
-
- // Compute n index
- uint n = nx + (kx / Nx) * Ni;
-
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
- output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
-#endif /* IN_PLACE */
-
- // Load two complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
-
- // Compute phi
- DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
-
- // Multiply by twiddle factor
- TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
-
- // Compute DFT N = 2
- DFT_2(c0, c1);
-
- // Store two complex output values
- vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
-}
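
Every per-stage kernel from here on repeats this pattern: work-item kx owns one butterfly, the index arithmetic nx = kx % Nx and n = nx + (kx / Nx) * Ni places its first input, and the twiddle angle grows linearly in nx. The following host-side C sketch reproduces the whole radix-2 stage step under the assumption that exp_const carries the per-stage angle (presumably -2*pi / Ni for a forward transform; the host-side convention is not visible in this file):

#include <math.h>

typedef struct { float re, im; } cfloat;

/* One radix-2 butterfly, as performed by work-item kx of a stage with
 * butterfly span Nx and group size Ni; data is the interleaved buffer. */
static void radix2_butterfly(unsigned kx, unsigned Nx, unsigned Ni,
                             float exp_const, cfloat *data)
{
    unsigned nx  = kx % Nx;             /* position inside the span */
    unsigned n   = nx + (kx / Nx) * Ni; /* index of the first input */
    float    phi = (float)nx * exp_const;

    cfloat c0 = data[n];
    cfloat c1 = data[n + Nx];

    /* Twiddle: c1 *= exp(i * phi), a complex multiply by (cos phi, sin phi). */
    float  w_re = cosf(phi);
    float  w_im = sinf(phi);
    cfloat t    = { c1.re * w_re - c1.im * w_im,
                    c1.re * w_im + c1.im * w_re };

    /* 2-point DFT. */
    data[n]      = (cfloat){ c0.re + t.re, c0.im + t.im };
    data[n + Nx] = (cfloat){ c0.re - t.re, c0.im - t.im };
}
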
-
-/** Computes a stage of a radix-2 FFT on axis 1.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- * @param[in] Nx The butterfly span: the product of the radices of all previous stages
- * @param[in] Ni Nx * Ny, where Ny is the radix of the current stage.
- * @param[in] exp_const Exponent constant
- */
-__kernel void fft_radix_2_axis_1(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
- ,
- uint Nx, uint Ni, float exp_const)
-{
- // Each work-item computes a single radix-2 butterfly
- uint kx = get_global_id(1);
-
- // Compute nx
- uint nx = kx % Nx;
-
- // Compute n index
- uint n = nx + (kx / Nx) * Ni;
-
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
- output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
-#endif /* IN_PLACE */
-
- // Load two complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
-
- // Compute phi
- DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
-
- // Multiply by twiddle factor
- TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
-
- // Compute DFT N = 2
- DFT_2(c0, c1);
-
- // Store two complex output values
- vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
-}
-
-/** Computes a stage of a radix-3 FFT on axis 0.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- * @param[in] Nx The butterfly span: the product of the radices of all previous stages
- * @param[in] Ni Nx * Ny, where Ny is the radix of the current stage.
- * @param[in] exp_const Exponent constant
- */
-__kernel void fft_radix_3_axis_0(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
- ,
- uint Nx, uint Ni, float exp_const)
-{
- // Each work-item computes a single radix-3 butterfly
- uint kx = get_global_id(0);
-
- // Compute nx
- uint nx = kx % Nx;
-
- // Compute n index
- uint n = nx + (kx / Nx) * Ni;
-
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
- output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
-#endif /* IN_PLACE */
-
- // Load three complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
-
- // Compute phi
- DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
-
- // Multiply by twiddle factor
- TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
- TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
-
- // Compute DFT N = 3
- DFT_3(c0, c1, c2);
-
- // Store three complex output values
- vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
- vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
-}
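
For reference, the 3-point DFT that DFT_3 computes can be written with the primitive cube root of unity w = exp(-2*pi*i / 3): y0 = x0 + x1 + x2, y1 = x0 + w*x1 + w^2*x2, y2 = x0 + w^2*x1 + w*x2. A direct, unoptimised C sketch (the macro earlier in the file may factor this differently):

#include <complex.h>

/* Direct 3-point DFT with w = exp(-2*pi*i / 3). */
static void dft3_reference(float complex x[3])
{
    const float complex w  = cexpf(-I * 2.0f * 3.14159265f / 3.0f);
    const float complex w2 = w * w;

    float complex y0 = x[0] + x[1] + x[2];
    float complex y1 = x[0] + w * x[1] + w2 * x[2];
    float complex y2 = x[0] + w2 * x[1] + w * x[2];

    x[0] = y0;
    x[1] = y1;
    x[2] = y2;
}
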
-
-/** Computes a stage of a radix-3 FFT on axis 1.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- * @param[in] Nx The butterfly span: the product of the radices of all previous stages
- * @param[in] Ni Nx * Ny, where Ny is the radix of the current stage.
- * @param[in] exp_const Exponent constant
- */
-__kernel void fft_radix_3_axis_1(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
- ,
- uint Nx, uint Ni, float exp_const)
-{
- // Each work-item computes a single radix-3 butterfly
- uint kx = get_global_id(1);
-
- // Compute nx
- uint nx = kx % Nx;
-
- // Compute n index
- uint n = nx + (kx / Nx) * Ni;
-
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
- output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
-#endif /* IN_PLACE */
-
- // Load three complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
-
- // Compute phi
- DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
-
- // Multiply by twiddle factor
- TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
- TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
-
- // Compute DFT N = 3
- DFT_3(c0, c1, c2);
-
- // Store three complex output values
- vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
- vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
-}
-
-/** Computes a stage of a radix-4 FFT on axis 0.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- * @param[in] Nx The butterfly span: the product of the radices of all previous stages
- * @param[in] Ni Nx * Ny, where Ny is the radix of the current stage.
- * @param[in] exp_const Exponent constant
- */
-__kernel void fft_radix_4_axis_0(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
- ,
- uint Nx, uint Ni, float exp_const)
-{
- // Each work-item computes a single radix-4 butterfly
- uint kx = get_global_id(0);
-
- // Compute nx
- uint nx = kx % Nx;
-
- // Compute n index
- uint n = nx + (kx / Nx) * Ni;
-
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
- output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
-#endif /* IN_PLACE */
-
- // Load four complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
-
- // Compute phi
- DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
-
- // Multiply by twiddle factor
- TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
- TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
- TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
-
- // Compute DFT N = 4
- DFT_4(c0, c1, c2, c3);
-
- // Store four complex output values
- vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
- vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
- vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
-}
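
A 4-point DFT needs no general multiplications: it splits into radix-2 butterflies plus one rotation by -i, which is just a swap and a sign flip. The sketch below shows one common factorisation of this; the DFT_4 macro defined earlier in the file may arrange it differently:

typedef struct { float re, im; } cfloat;

/* 4-point DFT via radix-2 butterflies; the only "twiddle" is d * (-i). */
static void dft4_reference(cfloat *c0, cfloat *c1, cfloat *c2, cfloat *c3)
{
    cfloat a = { c0->re + c2->re, c0->im + c2->im }; /* even sum        */
    cfloat b = { c0->re - c2->re, c0->im - c2->im }; /* even difference */
    cfloat c = { c1->re + c3->re, c1->im + c3->im }; /* odd sum         */
    cfloat d = { c1->re - c3->re, c1->im - c3->im }; /* odd difference  */
    cfloat e = { d.im, -d.re };                      /* d * (-i)        */

    *c0 = (cfloat){ a.re + c.re, a.im + c.im };
    *c1 = (cfloat){ b.re + e.re, b.im + e.im };
    *c2 = (cfloat){ a.re - c.re, a.im - c.im };
    *c3 = (cfloat){ b.re - e.re, b.im - e.im };
}
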
-
-/** Computes a stage of a radix-4 FFT on axis 1.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- * @param[in] Nx The butterfly span: the product of the radices of all previous stages
- * @param[in] Ni Nx * Ny, where Ny is the radix of the current stage.
- * @param[in] exp_const Exponent constant
- */
-__kernel void fft_radix_4_axis_1(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
- ,
- uint Nx, uint Ni, float exp_const)
-{
- // Each work-item computes a single radix-4 butterfly
- uint kx = get_global_id(1);
-
- // Compute nx
- uint nx = kx % Nx;
-
- // Compute n index
- uint n = nx + (kx / Nx) * Ni;
-
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
- output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
-#endif /* IN_PLACE */
-
- // Load four complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
-
- // Compute phi
- DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
-
- // Multiply by twiddle factor
- TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
- TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
- TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
-
- // Compute DFT N = 4
- DFT_4(c0, c1, c2, c3);
-
- // Store four complex output values
- vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
- vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
- vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
-}
-
-/** Computes a stage of a radix-5 FFT on axis 0.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- * @param[in] Nx The butterfly span: the product of the radices of all previous stages
- * @param[in] Ni Nx * Ny, where Ny is the radix of the current stage.
- * @param[in] exp_const Exponent constant
- */
-__kernel void fft_radix_5_axis_0(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
- ,
- uint Nx, uint Ni, float exp_const)
-{
- // Each work-item computes a single radix-5 butterfly
- uint kx = get_global_id(0);
-
- // Compute nx
- uint nx = kx % Nx;
-
- // Compute n index
- uint n = nx + (kx / Nx) * Ni;
-
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
- output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
-#endif /* IN_PLACE */
-
- // Load five complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
-
- // Compute phi
- DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
-
- // Multiply by twiddle factor
- TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
- TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
- TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
- TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
-
- // Compute DFT N = 5
- DFT_5(c0, c1, c2, c3, c4);
-
- // Store five complex output values
- vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
- vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
- vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
- vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
-}
-
-/** Computes a stage of a radix-5 FFT on axis 1.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- * @param[in] Nx The butterfly span: the product of the radices of all previous stages
- * @param[in] Ni Nx * Ny, where Ny is the radix of the current stage.
- * @param[in] exp_const Exponent constant
- */
-__kernel void fft_radix_5_axis_1(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
- ,
- uint Nx, uint Ni, float exp_const)
-{
- // Each work-item computes a single radix-5 butterfly
- uint kx = get_global_id(1);
-
- // Compute nx
- uint nx = kx % Nx;
-
- // Compute n index
- uint n = nx + (kx / Nx) * Ni;
-
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
- output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
-#endif /* IN_PLACE */
-
- // Load five complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
-
- // Compute phi
- DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
-
- // Multiply by twiddle factor
- TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
- TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
- TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
- TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
-
- // Compute DFT N = 5
- DFT_5(c0, c1, c2, c3, c4);
-
- // Store five complex output values
- vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
- vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
- vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
- vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
-}
-
-/** Computes a stage of a radix-7 FFT on axis 0.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- * @param[in] Nx The butterfly span: the product of the radices of all previous stages
- * @param[in] Ni Nx * Ny, where Ny is the radix of the current stage.
- * @param[in] exp_const Exponent constant
- */
-__kernel void fft_radix_7_axis_0(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
- ,
- uint Nx, uint Ni, float exp_const)
-{
- // Each work-item computes a single radix-7 butterfly
- uint kx = get_global_id(0);
-
- // Compute nx
- uint nx = kx % Nx;
-
- // Compute n index
- uint n = nx + (kx / Nx) * Ni;
-
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
- output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
-#endif /* IN_PLACE */
-
- // Load seven complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0));
-
- // Compute phi
- DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
-
- // Multiply by twiddle factor
- TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
- TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
- TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
- TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
- TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
- TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
-
- // Compute DFT N = 7
- DFT_7(c0, c1, c2, c3, c4, c5, c6);
-
- // Store seven complex output values
- vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
- vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
- vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
- vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
- vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0));
- vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0));
-}
-
-/** Computes a stage of a radix-7 FFT on axis 1.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- * @param[in] Nx The butterfly span: the product of the radices of all previous stages
- * @param[in] Ni Nx * Ny, where Ny is the radix of the current stage.
- * @param[in] exp_const Exponent constant
- */
-__kernel void fft_radix_7_axis_1(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
- ,
- uint Nx, uint Ni, float exp_const)
-{
- // Each work-item computes a single radix-7 butterfly
- uint kx = get_global_id(1);
-
- // Compute nx
- uint nx = kx % Nx;
-
- // Compute n index
- uint n = nx + (kx / Nx) * Ni;
-
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
- output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
-#endif /* IN_PLACE */
-
- // Load seven complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0));
-
- // Compute phi
- DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
-
- // Multiply by twiddle factor
- TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
- TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
- TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
- TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
- TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
- TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
-
- // Compute DFT N = 7
- DFT_7(c0, c1, c2, c3, c4, c5, c6);
-
- // Store seven complex output values
- vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
- vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
- vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
- vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
- vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0));
- vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0));
-}
-
-/** Computes a stage of a radix-8 FFT on axis 0.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- * @param[in] Nx The butterfly span: the product of the radices of all previous stages
- * @param[in] Ni Nx * Ny, where Ny is the radix of the current stage.
- * @param[in] exp_const Exponent constant
- */
-__kernel void fft_radix_8_axis_0(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
- ,
- uint Nx, uint Ni, float exp_const)
-{
- // Each work-item computes a single radix-8 butterfly
- uint kx = get_global_id(0);
-
- // Compute nx
- uint nx = kx % Nx;
-
- // Compute n index
- uint n = nx + (kx / Nx) * Ni;
-
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
- output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
-#endif /* IN_PLACE */
-
- // Load eight complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 7 * Nx, 0, 0));
-
- // Compute phi
- DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
-
- // Multiply by twiddle factor
- TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
- TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
- TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
- TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
- TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
- TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
- TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
-
- // Compute DFT N = 8
- DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
-
- // Store eight complex output values
- vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
- vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
- vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
- vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
- vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0));
- vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0));
- vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 7 * Nx, 0, 0));
-}
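
Taken together, the three scalar arguments describe one mixed-radix stage: Nx counts the points already combined by earlier stages, Ni = Nx * radix is the span this stage covers, and phi = nx * exp_const is the twiddle angle applied before the radix-8 DFT. A minimal host-side sketch of how a caller might derive them (the struct, function name and sign convention are illustrative assumptions, not the library's API):

    typedef struct { unsigned Nx, Ni; float exp_const; } FftStageArgs;

    /* For a decomposition N = r_0 * r_1 * ... * r_{s-1}, stage j spans
     * Nx = r_0 * ... * r_{j-1} butterflies and covers Ni = Nx * r_j points. */
    static FftStageArgs fft_stage_args(const unsigned *radices, unsigned stage, int forward)
    {
        unsigned Nx = 1;
        for (unsigned j = 0; j < stage; ++j)
        {
            Nx *= radices[j];
        }
        FftStageArgs args;
        args.Nx = Nx;
        args.Ni = Nx * radices[stage];
        /* Assumed convention: a forward transform uses a negative exponent,
         * so phi = nx * exp_const rotates by -2*pi*nx / Ni. */
        args.exp_const = (forward ? -2.0f : 2.0f) * 3.14159265358979f / (float)args.Ni;
        return args;
    }
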
-
-/** Computes a stage of a radix-8 FFT on axis 1.
- *
- * @note In order to perform the FFT function "in-place", the preprocessor argument -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
- * @param[in]     input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]     input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]     input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]     input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]     input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]     input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]     input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[out]    output_ptr                           (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]     output_stride_x                      (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]     output_stride_y                      (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]     output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- * @param[in]     Nx                                   The butterfly span: the product of the radix orders of all previous FFT stages
- * @param[in]     Ni                                   Nx * Ny, where Ny is the radix order of the current stage
- * @param[in]     exp_const                            Exponent constant
- */
-__kernel void fft_radix_8_axis_1(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
- ,
- uint Nx, uint Ni, float exp_const)
-{
- // Each work-item computes a single radix-8
- uint kx = get_global_id(1);
-
- // Compute nx
- uint nx = kx % Nx;
-
- // Compute n index
- uint n = nx + (kx / Nx) * Ni;
-
- // Get tensor pointers
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
- output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
-#endif /* IN_PLACE */
-
- // Load eight complex input values
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7 * Nx, 0));
-
- // Compute phi
- DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
-
- // Multiply by twiddle factor
- TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
- TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
- TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
- TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
- TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
- TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
- TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
-
- // Compute DFT N = 8
- DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
-
- // Store eight complex output values
- vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
- vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
- vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
- vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
- vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
- vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0));
- vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0));
- vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7 * Nx, 0));
-}
-#endif // defined(DATA_TYPE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/fft_digit_reverse.cl b/src/core/CL/cl_kernels/fft_digit_reverse.cl
deleted file mode 100644
index de566212c6..0000000000
--- a/src/core/CL/cl_kernels/fft_digit_reverse.cl
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE)
-/** Computes the digit reverse stage on axis X
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] idx_ptr Pointer to the index tensor. Supported data types: U32
- * @param[in] idx_stride_x Stride of the index tensor in X dimension (in bytes)
- * @param[in] idx_step_x idx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] idx_offset_first_element_in_bytes The offset of the first element in the index tensor
- */
-__kernel void fft_digit_reverse_axis_0(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- VECTOR_DECLARATION(idx))
-{
- // Get tensor pointers
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
- Vector idx = CONVERT_TO_VECTOR_STRUCT(idx);
-
- const unsigned int iidx = *((__global uint *)(idx.ptr));
-
- // Load data
-#if VEC_SIZE == 1
- DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
-#elif VEC_SIZE == 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
-#else // VEC_SIZE == 1
-#error "vec_size of 1 and 2 are supported"
-#endif // VEC_SIZE == 1
-
- // Create result
-#if VEC_SIZE == 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- res = { data, 0 };
-#elif VEC_SIZE == 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- res = data;
-#else // VEC_SIZE == 1
-#error "vec_size of 1 and 2 are supported"
-#endif // VEC_SIZE == 1
-
- // Store result
-#if defined(CONJ)
- vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(res.s0, -res.s1), 0, (__global DATA_TYPE *)dst.ptr);
-#else // defined(CONJ)
- vstore2(res, 0, (__global DATA_TYPE *)dst.ptr);
-#endif // defined(CONJ)
-}
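
The idx vector consumed above holds one precomputed source coordinate per output position. A plausible generator for a mixed-radix decomposition, shown as an illustrative C sketch rather than the library's actual routine, peels the digits of n least-significant first and reassembles them in reverse order:

    /* Mixed-radix digit reversal: read n's digits in the positional system
     * given by the stage radices and emit them most-significant first. */
    static unsigned digit_reverse(unsigned n, const unsigned *radices, unsigned num_stages)
    {
        unsigned reversed = 0;
        for (unsigned j = 0; j < num_stages; ++j)
        {
            reversed = reversed * radices[j] + (n % radices[j]); /* take lowest digit */
            n /= radices[j];
        }
        return reversed;
    }
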
-
-/** Computes the digit reverse stage on axis Y
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] idx_ptr Pointer to the index tensor. Supported data types: U32
- * @param[in] idx_stride_x Stride of the index tensor in X dimension (in bytes)
- * @param[in] idx_step_x idx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] idx_offset_first_element_in_bytes The offset of the first element in the index tensor
- */
-__kernel void fft_digit_reverse_axis_1(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- VECTOR_DECLARATION(idx))
-{
- // Get tensor pointers
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
- Vector idx = CONVERT_TO_VECTOR_STRUCT_NO_STEP(idx);
-
- const unsigned int iidx = *((__global uint *)vector_offset(&idx, (int)(get_global_id(1))));
-
- // Load data
-#if VEC_SIZE == 1
- DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
-#elif VEC_SIZE == 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
-#else // VEC_SIZE == 1
-#error "Only vec_size of 1 and 2 are supported"
-#endif // VEC_SIZE == 1
-
- // Create result
-#if VEC_SIZE == 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- res = { data, 0 };
-#elif VEC_SIZE == 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- res = data;
-#else // VEC_SIZE == 1
-#error "vec_size of 1 and 2 are supported"
-#endif // VEC_SIZE == 1
-
- // Store result
-#if defined(CONJ)
- vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(res.s0, -res.s1), 0, (__global DATA_TYPE *)dst.ptr);
-#else // defined(CONJ)
- vstore2(res, 0, (__global DATA_TYPE *)dst.ptr);
-#endif // defined(CONJ)
-}
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/fft_scale.cl b/src/core/CL/cl_kernels/fft_scale.cl
deleted file mode 100644
index 57e25ef504..0000000000
--- a/src/core/CL/cl_kernels/fft_scale.cl
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE)
-/** Computes the FFT scale stage
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x (Optional) dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y (Optional) dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z (Optional) dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- * @param[in] scale Scale to apply to the complex value
- */
-__kernel void fft_scale_conj(
- TENSOR3D_DECLARATION(src)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(dst)
-#endif /* not IN_PLACE */
- ,
- float scale)
-{
- // Get tensor pointers
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-#if defined(IN_PLACE)
- Tensor3D dst = src;
-#else /* IN_PLACE */
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-#endif /* IN_PLACE */
-
- // Store result
-#if VEC_SIZE == 1
- *((__global DATA_TYPE *)dst.ptr) = (*(__global DATA_TYPE *)src.ptr) / (DATA_TYPE)scale;
-#elif VEC_SIZE == 2
- // Load data
- VEC_DATA_TYPE(DATA_TYPE, 2)
- data = vload2(0, (__global DATA_TYPE *)src.ptr);
- data /= (DATA_TYPE)scale;
-#if defined(CONJ)
- vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(data.s0, -data.s1), 0, (__global DATA_TYPE *)dst.ptr);
-#else // defined(CONJ)
- vstore2(data, 0, (__global DATA_TYPE *)dst.ptr);
-#endif // defined(CONJ)
-#else // VEC_SIZE == 1
-#error "Only vec_size of 1 and 2 are supported"
-#endif // VEC_SIZE == 1
-}
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE)
\ No newline at end of file
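
Per complex element the kernel above divides by scale and, when CONJ is defined, negates the imaginary part; a caller normalising an inverse FFT would typically pass scale = N, though that choice is the caller's. A scalar C reference of the per-element contract (the type and function names here are illustrative):

    typedef struct { float re, im; } complex_f32;

    /* What fft_scale_conj computes for VEC_SIZE == 2 (one complex value). */
    static complex_f32 scale_and_conjugate(complex_f32 c, float scale, int conjugate)
    {
        complex_f32 out;
        out.re = c.re / scale;
        out.im = (conjugate ? -c.im : c.im) / scale;
        return out;
    }
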
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
deleted file mode 100644
index 5775d899e8..0000000000
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Fill N pixels of the padding edge of a single channel image by replicating the closest valid pixel.
- *
- * @attention The DATA_TYPE needs to be passed at compile time.
- * e.g. -DDATA_TYPE=int
- *
- * @attention The border sizes for top, bottom, left and right need to be passed at compile time.
- * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
- *
- * @param[in,out] buf_ptr Pointer to the source image. Supported data types: All
- * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] buf_stride_z Stride between images if batching images (in bytes)
- * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] width Width of the valid region of the image
- * @param[in] height Height of the valid region of the image
- * @param[in] start_pos XY coordinate indicating the start point of the valid region
- */
-__kernel void fill_image_borders_replicate(
- TENSOR3D_DECLARATION(buf),
- uint width,
- uint height,
- int2 start_pos)
-{
- Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
-
- // Update pointer to point to the starting point of the valid region
- buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
-
- const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
- const int gid0 = get_global_id(0);
- const int gidH = gid0 - total_width;
- const int gidW = gid0 - BORDER_SIZE_LEFT;
-
- if(gidH >= 0)
- {
- // Handle left border
- DATA_TYPE left_val = *(__global DATA_TYPE *)offset(&buf, 0, gidH);
- for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
- {
- *(__global DATA_TYPE *)offset(&buf, i, gidH) = left_val;
- }
- // Handle right border
- DATA_TYPE right_val = *(__global DATA_TYPE *)offset(&buf, width - 1, gidH);
- for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
- {
- *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = right_val;
- }
- }
- else
- {
- // Get value for corners
- int val_idx = gidW;
- if(gidW < 0 || gidW > (width - 1))
- {
- val_idx = gidW < 0 ? 0 : width - 1;
- }
-
- // Handle top border
- DATA_TYPE top_val = *(__global DATA_TYPE *)offset(&buf, val_idx, 0);
- for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
- {
- *(__global DATA_TYPE *)offset(&buf, gidW, i) = top_val;
- }
- // Handle bottom border
- DATA_TYPE bottom_val = *(__global DATA_TYPE *)offset(&buf, val_idx, height - 1);
- for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
- {
- *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = bottom_val;
- }
- }
-}
-
-/** Fill N pixels of the padding edge of a single channel image with a constant value.
- *
- * @attention The DATA_TYPE needs to be passed at compile time.
- * e.g. -DDATA_TYPE=int
- *
- * @attention The border sizes for top, bottom, left and right need to be passed at compile time.
- * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
- *
- * @param[out] buf_ptr Pointer to the source image. Supported data types: All
- * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] buf_stride_z Stride between images if batching images (in bytes)
- * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] width Width of the valid region of the image
- * @param[in] height Height of the valid region of the image
- * @param[in] start_pos XY coordinate indicating the start point of the valid region
- * @param[in] constant_value Constant value to use to fill the edges
- */
-__kernel void fill_image_borders_constant(
- TENSOR3D_DECLARATION(buf),
- uint width,
- uint height,
- int2 start_pos,
- DATA_TYPE constant_value)
-{
- Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
-
- // Update pointer to point to the starting point of the valid region
- buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
-
- const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
- const int gid0 = get_global_id(0);
- const int gidH = gid0 - total_width;
- const int gidW = gid0 - BORDER_SIZE_LEFT;
-
- if(gidH >= 0)
- {
- // Handle left border
- for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
- {
- *(__global DATA_TYPE *)offset(&buf, i, gidH) = constant_value;
- }
- // Handle right border
- for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
- {
- *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = constant_value;
- }
- }
- else
- {
- // Handle top border
- for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
- {
- *(__global DATA_TYPE *)offset(&buf, gidW, i) = constant_value;
- }
- // Handle bottom border
- for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
- {
- *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = constant_value;
- }
- }
-}
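
Both border kernels fold the whole border into one 1-D id range: ids in [0, total_width) each fill one column of the top and bottom borders, and ids in [total_width, total_width + height) each fill one row of the left and right borders. Under that reading, the X global work size would be total_width + height; a hedged host-side sketch (this helper is an assumption drawn from the index arithmetic above, not the library's launch code):

    #include <stddef.h>

    /* Assumed X global work size for the border-fill kernels: one work-item
     * per bordered column plus one per image row. */
    static size_t border_fill_gws_x(size_t border_left, size_t width,
                                    size_t border_right, size_t height)
    {
        const size_t total_width = border_left + width + border_right;
        return total_width + height;
    }
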
diff --git a/src/core/CL/cl_kernels/gather.cl b/src/core/CL/cl_kernels/gather.cl
deleted file mode 100644
index 41f439cb47..0000000000
--- a/src/core/CL/cl_kernels/gather.cl
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(AXIS)
-
-/** Performs the Gather operation along the chosen axis
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
- * @attention Output tensor depth should be given as a preprocessor argument using -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
- * @attention Input tensor depth should be given as a preprocessor argument using -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
- *
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  input_stride_z                         Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                           input_stride_z * number of elements along Z processed per work item (in bytes)
- * @param[in]  input_stride_w                         Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per work item (in bytes)
- * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source tensor
- * @param[in] indices_ptr Pointer to the indices vector. Supported data types: S32/U32.
- * @param[in] indices_stride_x Stride of the indices vector in X dimension (in bytes)
- * @param[in]  indices_step_x                         indices_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the indices vector
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per work item (in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per work item (in bytes)
- * @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination tensor
- */
-__kernel void gather(
- TENSOR4D_DECLARATION(input),
- VECTOR_DECLARATION(indices),
- TENSOR4D_DECLARATION(output))
-{
- const int px = get_global_id(0);
- const int py = get_global_id(1);
- const int pz = get_global_id(2) % OUTPUT_DIM_Z;
- const int pw = get_global_id(2) / OUTPUT_DIM_Z;
-
- const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
- const Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
- Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
-
-#if AXIS == 0
- const uint index = *(__global const uint *)vector_offset(&indices, px);
- __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
-#elif AXIS == 1
- const uint index = *(__global const uint *)vector_offset(&indices, py);
- __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
-#elif AXIS == 2
- const uint index = *(__global const uint *)vector_offset(&indices, pz);
- __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
-#elif AXIS == 3
- const uint index = *(__global const uint *)vector_offset(&indices, pw);
- __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
-#endif //AXIS
-
- *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
-}
-
-#endif // defined(DATA_TYPE) && defined(AXIS)
\ No newline at end of file
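
Stripped of the OpenCL plumbing, the kernel is a plain indexed lookup along one axis. A scalar C model of the AXIS == 1 case, with dense row-major arrays standing in for the strided tensors (illustrative only, not the library's code):

    /* out[w][z][y][x] = in[w][z][indices[y]][x]; Y_in and Y_out may differ. */
    static void gather_axis1(const float *in, const unsigned *indices, float *out,
                             unsigned X, unsigned Y_in, unsigned Y_out,
                             unsigned Z, unsigned W)
    {
        for (unsigned w = 0; w < W; ++w)
            for (unsigned z = 0; z < Z; ++z)
                for (unsigned y = 0; y < Y_out; ++y)
                    for (unsigned x = 0; x < X; ++x)
                        out[((w * Z + z) * Y_out + y) * X + x] =
                            in[((w * Z + z) * Y_in + indices[y]) * X + x];
    }
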
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
deleted file mode 100644
index 10435d376f..0000000000
--- a/src/core/CL/cl_kernels/gemm.cl
+++ /dev/null
@@ -1,4386 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
-#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
-#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
-#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
-#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
-#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
-#define CONCAT_INC(K0) INC##K0
-#define INC(K0) CONCAT_INC(K0)
-
-#if(SRC_WIDTH % K0)
-#define BOUNDARY_CONDITION_X(x, a) \
- ({ \
- a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \
- })
-#else // (SRC_WIDTH % K0)
-#define BOUNDARY_CONDITION_X(x, a) \
- ({})
-#endif // (SRC_WIDTH % K0)
-
-#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
- ({ \
- if(y * M0 + M0 >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0) \
- { \
- if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
- { \
- LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
- } \
- else \
- { \
- LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
- } \
- } \
- else \
- { \
- if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
- { \
- LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
- } \
- else \
- { \
- LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
- } \
- } \
- })
-
-/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
- * the output matrix unrolling the values.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
- * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
- * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
- * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
- * @note Only the following values for M0, K0 and V0 are supported:
- * M0: 2,3,4,5,6,7,8
- * K0: 2,3,4,8,16
- * V0: greater than 0
- * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
- *
- * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
- * @param[out] dst_ptr                               Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in]  cross_plane_pad                       (Optional) Bottom paddings in units of elements (only if defined REINTERPRET_INPUT_AS_3D)
- */
-__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
- )
-{
- // Block size
-#define BLOCK_SIZE ((M0) * (K0))
-
- // Output offset X
-#if defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (K0)
-#else // defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (BLOCK_SIZE)
-#endif // defined(INTERLEAVE)
-
- // Output step X
-#if defined(INTERLEAVE)
-#define OUTPUT_STEP_X (K0) * (V0)
-#else // Do not interleave
-#define OUTPUT_STEP_X (K0)
-#endif // defined(INTERLEAVE)
-
- // Compute source and destination addresses
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // ------------------ Compute input/output addresses ---------------------------
-
- // Compute the input address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
-
- // Compute the output address
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
- (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
-
- // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src_stride_z by DEPTH_GEMM3D
-
- input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
-
- // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- input_ptr += z * (uint)src_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- output_ptr += z * (uint)dst_stride_z;
-
- // ---------------------------Load input values --------------------------------
- // Load values from the LHS matrix
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
-
- LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
-
- // ---------------------------Store output values ------------------------------
- REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
- STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
-
-#undef BLOCK_SIZE
-#undef OUTPUT_OFFSET_X
-#undef OUTPUT_STEP_X
-}
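
Abstracting away the vector loads and stores, the non-interleaved layout this kernel produces can be modelled with plain index arithmetic: output row y / V0 holds V0 vertical M0xK0 blocks side by side, each unrolled row-major. A scalar sketch under simplifying assumptions (dense arrays, M and K exact multiples of the block sizes, no INTERLEAVE, no 3D reinterpretation):

    /* Scalar model of gemm_reshape_lhs_matrix_nt under the assumptions above. */
    static void reshape_lhs_nt(const float *src, float *dst,
                               unsigned M, unsigned K,
                               unsigned M0, unsigned K0, unsigned V0)
    {
        const unsigned block = M0 * K0;
        const unsigned dst_w = (K / K0) * block * V0; /* elements per output row */
        for (unsigned y = 0; y < M / M0; ++y)         /* block row in src */
            for (unsigned x = 0; x < K / K0; ++x)     /* block col in src */
                for (unsigned m = 0; m < M0; ++m)
                    for (unsigned k = 0; k < K0; ++k)
                    {
                        const unsigned row = y / V0;
                        const unsigned col = x * block * V0 + (y % V0) * block + m * K0 + k;
                        dst[row * dst_w + col] = src[(y * M0 + m) * K + (x * K0 + k)];
                    }
    }
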
-
-#if M0 == 2
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, M0) \
- res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
- VSTORE(M0) \
- (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- })
-#elif M0 == 3 // M0 == 3
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, M0) \
- res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
- VSTORE(M0) \
- (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- })
-#elif M0 == 4 // M0 == 4
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, M0) \
- res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
- VSTORE(M0) \
- (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- })
-#elif M0 == 5 // M0 == 5
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
- DATA_TYPE res1 = a4.s##i; \
- VSTORE(4) \
- (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
- })
-#elif M0 == 6 // M0 == 6
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
- VSTORE(4) \
- (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- VSTORE(2) \
- (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
- })
-#elif M0 == 7 // M0 == 7
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
- VEC_DATA_TYPE(DATA_TYPE, 3) \
- res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
- VSTORE(4) \
- (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- VSTORE(3) \
- (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
- })
-#elif M0 == 8 // M0 == 8
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, M0) \
- res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
- VSTORE(M0) \
- (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- })
-#else // M0 not supported
-#error "M0 value not supported"
-#endif // M0 conditions
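
One detail worth calling out in the macros above is the 0x##i token paste: the single-character column suffix (0-9 or A-F) is glued onto 0x, forming a hexadecimal literal, which is why the same macro body can address all sixteen columns of a K0 == 16 block. A two-assert C demonstration of the trick:

    #include <assert.h>

    #define COL_INDEX(i) (0x##i) /* paste the suffix onto 0x to form a hex literal */

    int main(void)
    {
        assert(COL_INDEX(3) == 3);
        assert(COL_INDEX(A) == 10); /* column 'A' is the eleventh column */
        return 0;
    }
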
-
-/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
- * the output matrix unrolling the values.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
- * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
- * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
- * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
- * @note Only the following values for M0, K0 and V0 are supported:
- * M0: 2,3,4,5,6,7,8
- * K0: 2,3,4,8,16
- * V0: greater than 0
- * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
- *
- * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
- * @param[out] dst_ptr                               Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in]  cross_plane_pad                       (Optional) Bottom paddings in units of elements (only if defined REINTERPRET_INPUT_AS_3D)
- */
-__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
- )
-{
- // Block size
-#define BLOCK_SIZE ((M0) * (K0))
-
- // Output offset X
-#if defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (M0)
-#else // defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (BLOCK_SIZE)
-#endif // defined(INTERLEAVE)
-
- // Output step X
-#if defined(INTERLEAVE)
-#define OUTPUT_STEP_X (M0) * (V0)
-#else // Do not interleave
-#define OUTPUT_STEP_X (M0)
-#endif // defined(INTERLEAVE)
-
- // Compute source and destination addresses
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // ------------------ Compute input/output addresses ---------------------------
-
- // Compute the input address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
-
- // Compute the output address
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
- (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
-
- // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src_stride_z by DEPTH_GEMM3D
-
- input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
-
- // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- input_ptr += z * (uint)src_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- output_ptr += z * (uint)dst_stride_z;
-
- // ---------------------------Load input values --------------------------------
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
-
- LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
-
- // ---------------------------Transpose and store block -----------------------
-
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
-#if K0 > 2
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
-#endif // K0 > 2
-#if K0 > 3
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
-#endif // K0 > 3
-#if K0 > 4
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
-#endif // K0 > 4
-#if K0 > 8
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
-#endif // K0 > 8
-
-#undef BLOCK_SIZE
-#undef OUTPUT_OFFSET_X
-#undef OUTPUT_STEP_X
-}
-#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
-
-#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
-/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
- * the output matrix unrolling the values.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
- * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
- * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
- * @note Only the following values for K0, N0 and H0 are supported:
- * N0: 2,3,4,8,16
- * K0: 1,2,3,4,8,16
- * H0: greater than 0
- *
- * @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
- * @param[out] dst_ptr                               Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- // Block size
-#define BLOCK_SIZE ((K0) * (N0))
-
- // Output offset X
-#if defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (N0)
-#else // defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (BLOCK_SIZE)
-#endif // defined(INTERLEAVE)
-
- // Output step X
-#if defined(INTERLEAVE)
-#define OUTPUT_STEP_X (N0) * (H0)
-#else // Do not interleave
-#define OUTPUT_STEP_X (N0)
-#endif // defined(INTERLEAVE)
-
- // Compute source and destination addresses
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // ------------------ Compute input/output addresses ---------------------------
-
- // Compute the input address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
-
- // Compute the output address
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((
- x / (uint)H0)
- * (uint)dst_stride_y)
- + z * (uint)dst_stride_z;
-
- // ---------------------------Load input values --------------------------------
-
- REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); // Create variables: a0 = 0, a1 = 0, ..., a(K0-1) = 0
-
- // Load values from the RHS matrix
- a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
-#if K0 > 1
- if(y * (uint)K0 + 1 < SRC_HEIGHT)
- {
- a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
- }
-#endif // K0 > 1
-#if K0 > 2
- if(y * (uint)K0 + 2 < SRC_HEIGHT)
- {
- a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
- }
-#endif // K0 > 2
-#if K0 > 3
- if(y * (uint)K0 + 3 < SRC_HEIGHT)
- {
- a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
- }
-#endif // K0 > 3
-#if K0 > 4
- if(y * (uint)K0 + 4 < SRC_HEIGHT)
- {
- a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
- }
- if(y * (uint)K0 + 5 < SRC_HEIGHT)
- {
- a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
- }
- if(y * (uint)K0 + 6 < SRC_HEIGHT)
- {
- a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
- }
- if(y * (uint)K0 + 7 < SRC_HEIGHT)
- {
- a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
- }
-#endif // K0 > 4
-#if K0 > 8
- if(y * (uint)K0 + 8 < SRC_HEIGHT)
- {
- a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
- }
- if(y * (uint)K0 + 9 < SRC_HEIGHT)
- {
- a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
- }
- if(y * (uint)K0 + 10 < SRC_HEIGHT)
- {
- aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
- }
- if(y * (uint)K0 + 11 < SRC_HEIGHT)
- {
- aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
- }
- if(y * (uint)K0 + 12 < SRC_HEIGHT)
- {
- aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
- }
- if(y * (uint)K0 + 13 < SRC_HEIGHT)
- {
- aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
- }
- if(y * (uint)K0 + 14 < SRC_HEIGHT)
- {
- aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
- }
- if(y * (uint)K0 + 15 < SRC_HEIGHT)
- {
- aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
- }
-#endif // K0 > 8
-
- // ---------------------------Store output values ------------------------------
- REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
- STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
-
-#undef BLOCK_SIZE
-#undef OUTPUT_OFFSET_X
-#undef OUTPUT_STEP_X
-}
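
The RHS reshape mirrors the LHS one with the block roles swapped: output row x / H0 holds H0 horizontal K0xN0 blocks, each unrolled row-major, and any block rows that the SRC_HEIGHT guards above skip simply keep their zero initialisation. The same kind of scalar sketch (dense arrays, exact multiples of the block sizes, no INTERLEAVE):

    /* Scalar model of gemm_reshape_rhs_matrix_nt under the assumptions above. */
    static void reshape_rhs_nt(const float *src, float *dst,
                               unsigned K, unsigned N,
                               unsigned K0, unsigned N0, unsigned H0)
    {
        const unsigned block = K0 * N0;
        const unsigned dst_w = (K / K0) * block * H0; /* elements per output row */
        for (unsigned y = 0; y < K / K0; ++y)         /* block row in src */
            for (unsigned x = 0; x < N / N0; ++x)     /* block col in src */
                for (unsigned k = 0; k < K0; ++k)
                    for (unsigned n = 0; n < N0; ++n)
                    {
                        const unsigned row = x / H0;
                        const unsigned col = y * block * H0 + (x % H0) * block + k * N0 + n;
                        dst[row * dst_w + col] = src[(y * K0 + k) * N + (x * N0 + n)];
                    }
    }
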
-
-#if defined(TRANSPOSE)
-/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in
- * the output matrix unrolling the values.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
- * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
- * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
- * @note The option -DTRANSPOSE must be passed at compile time.
- * @note Only the following values for K0, N0 and H0 are supported:
- * N0: 2,3,4,8,16
- * K0: 2,3,4,8,16
- * H0: greater than 0
- *
- * @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- // Block size
-#define BLOCK_SIZE ((K0) * (N0))
-
- // Output offset X
-#if defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (K0)
-#else // defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (BLOCK_SIZE)
-#endif // defined(INTERLEAVE)
-
- // Output step X
-#if defined(INTERLEAVE)
-#define OUTPUT_STEP_X ((K0) * (H0))
-#else // Do not interleave
-#define OUTPUT_STEP_X (K0)
-#endif // defined(INTERLEAVE)
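-
- // Worked example, assuming K0 = 4, N0 = 4 and H0 = 2 (values derived from the macros above):
- // - BLOCK_SIZE is 16 elements and each block contributes N0 transposed rows of K0 elements.
- // - With -DINTERLEAVE: OUTPUT_OFFSET_X = 4 and OUTPUT_STEP_X = 8, so the rows of the two
- //   blocks sharing an output row alternate every K0 elements.
- // - Without -DINTERLEAVE: OUTPUT_OFFSET_X = 16 and OUTPUT_STEP_X = 4, so each block is
- //   stored contiguously before the next one starts.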
-
- // Compute source and destination addresses
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // ------------------ Compute input/output addresses ---------------------------
-
- // Compute the input address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
-
- // Compute the output address
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /
- (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
-
- // ---------------------------Load input values --------------------------------
- REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
-
- // Load values from the RHS matrix
- a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
- if(y * (uint)K0 + 1 < SRC_HEIGHT)
- {
- a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
- }
-#if K0 > 2
- if(y * (uint)K0 + 2 < SRC_HEIGHT)
- {
- a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
- }
-#endif // K0 > 2
-#if K0 > 3
- if(y * (uint)K0 + 3 < SRC_HEIGHT)
- {
- a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
- }
-#endif // K0 > 3
-#if K0 > 4
- if(y * (uint)K0 + 4 < SRC_HEIGHT)
- {
- a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
- }
- if(y * (uint)K0 + 5 < SRC_HEIGHT)
- {
- a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
- }
- if(y * (uint)K0 + 6 < SRC_HEIGHT)
- {
- a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
- }
- if(y * (uint)K0 + 7 < SRC_HEIGHT)
- {
- a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
- }
-#endif // K0 > 4
-#if K0 > 8
- if(y * (uint)K0 + 8 < SRC_HEIGHT)
- {
- a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
- }
- if(y * (uint)K0 + 9 < SRC_HEIGHT)
- {
- a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
- }
- if(y * (uint)K0 + 10 < SRC_HEIGHT)
- {
- aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
- }
- if(y * (uint)K0 + 11 < SRC_HEIGHT)
- {
- aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
- }
- if(y * (uint)K0 + 12 < SRC_HEIGHT)
- {
- aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
- }
- if(y * (uint)K0 + 13 < SRC_HEIGHT)
- {
- aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
- }
- if(y * (uint)K0 + 14 < SRC_HEIGHT)
- {
- aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
- }
- if(y * (uint)K0 + 15 < SRC_HEIGHT)
- {
- aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
- }
-#endif // K0 > 8
-
- // ---------------------------Transpose the block ------------------------------
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;
-
-#if K0 == 2
- // This part computes the following transpositions:
- // 2x2 -> 2x2
- // 2x4 -> 4x2
- // 2x8 -> 8x2
- // 2x16 -> 16x2
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
-#endif // N0 > 8
-
-#elif K0 == 3 // K0 == 3
- // This part computes the following transpositions:
- // 3x2 -> 2x3
- // 3x4 -> 4x3
- // 3x8 -> 8x3
- // 3x16 -> 16x3
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
-#endif // N0 > 8
-
-#elif K0 == 4 // K0 == 4
- // This part computes the following transpositions:
- // 4x2 -> 2x4
- // 4x4 -> 4x4
- // 4x8 -> 8x4
- // 4x16 -> 16x4
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);
-#endif // N0 > 8
-
-#elif K0 == 8 // K0 == 8
- // This part computes the following transpositions:
- // 8x2 -> 2x8
- // 8x4 -> 4x8
- // 8x8 -> 8x8
- // 8x16 -> 16x8
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);
-#endif // N0 > 8
-
-#elif K0 == 16 // K0 == 16
-
- // This part computes the following transpositions:
- // 16x2 -> 2x16
- // 16x4 -> 4x16
- // 16x8 -> 8x16
- // 16x16 -> 16x16
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,
- a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,
- a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,
- a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,
- a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,
- a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,
- a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,
- a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,
- a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
- a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
- a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
- a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
- a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
- a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
- a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
- a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
- a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
-#endif // N0 > 8
-
-#else // K0 not supported
-#error "K0 value not supported"
-#endif // K0 conditions
-
- // ---------------------------Store the output values ------------------------------
- REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
- STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
-
-#undef BLOCK_SIZE
-#undef OUTPUT_OFFSET_X
-#undef OUTPUT_STEP_X
-}
-#endif // defined(TRANSPOSE)
-#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
-
-#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
-
-#define CONCAT(a, b) a##b
-
-#define ARM_DOT1(a, b, c) \
- ({ \
- c = fma(a, b, c); \
- })
-#define ARM_DOT2(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- })
-#define ARM_DOT3(a, b, c) \
- ({ \
- ARM_DOT2(a, b, c); \
- c = fma((a.s2), (b.s2), c); \
- })
-#define ARM_DOT4(a, b, c) \
- ({ \
- ARM_DOT3(a, b, c); \
- c = fma((a.s3), (b.s3), c); \
- })
-#define ARM_DOT8(a, b, c) \
- ({ \
- ARM_DOT4((a.lo), (b.lo), c); \
- ARM_DOT4((a.hi), (b.hi), c); \
- })
-#define ARM_DOT16(a, b, c) \
- ({ \
- ARM_DOT8((a.lo), (b.lo), c); \
- ARM_DOT8((a.hi), (b.hi), c); \
- })
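-
-// For reference, ARM_DOT4(a, b, c) above expands to four fused multiply-adds:
-//   c = fma(a.s0, b.s0, c); c = fma(a.s1, b.s1, c); c = fma(a.s2, b.s2, c); c = fma(a.s3, b.s3, c);
-// i.e. c += dot(a, b) accumulated one lane at a time; ARM_DOT8 and ARM_DOT16 recurse on the
-// .lo/.hi halves of their operands.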
-
-#if N0 == 2
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- })
-#elif N0 == 3 // N0 == 3
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- })
-#elif N0 == 4 // N0 == 4
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##3), (c.s3)); \
- })
-#elif N0 == 8 // N0 == 8
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##3), (c.s3)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##4), (c.s4)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##5), (c.s5)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##6), (c.s6)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##7), (c.s7)); \
- })
-#elif N0 == 16 // N0 == 16
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##3), (c.s3)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##4), (c.s4)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##5), (c.s5)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##6), (c.s6)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##7), (c.s7)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##8), (c.s8)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##9), (c.s9)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##A), (c.sA)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##B), (c.sB)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##C), (c.sC)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##D), (c.sD)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##E), (c.sE)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##F), (c.sF)); \
- })
-#else // N0 not supported
-#error "N0 value not supported"
-#endif // N0 conditions
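-
-// In short, ARM_DOT_K0XN0(k0, a, b, c) accumulates the dot product of the k0-element LHS row
-// vector a with each of the N0 reshaped RHS rows b0..b(N0-1), one result lane per RHS row;
-// e.g. with N0 == 2 it performs c.s0 += dot(a, b0) and c.s1 += dot(a, b1).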
-
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
- * The LHS matrix is NOT reshaped
- * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
- *
- * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
- * @note The number of columns of the LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
- * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
- * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
- * - N0 = 2, 3, 4, 8, 16
- * - K0 = 2, 3, 4, 8, 16
- * - H0 >= 1
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
- *
- * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
- * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
- * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
- * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
- * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
- * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
- * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
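-
- // Addressing example, assuming K0 = 4 and H0 = 2 (values derived from the macros above):
- // - With -DRHS_INTERLEAVE: the two blocks sharing an output row alternate every K0 elements,
- //   so RHS_OFFSET_X = 4 selects the block and RHS_STEP_X = 8 skips the interleaved
- //   neighbour between consecutive row reads.
- // - Without -DRHS_INTERLEAVE: each K0xN0 block is contiguous, so RHS_OFFSET_X is a whole
- //   block (K0 * N0 = 16 elements) and RHS_STEP_X = 4.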
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
- // Compute RHS reshaped matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS reshaped matrix
- LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
-
- // Accumulate
- ARM_DOT_K0XN0(K0, a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(K0, a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(K0, a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(K0, a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(K0, a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(K0, a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(K0, a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(K0, a7, b, c7);
-#endif // M0 > 7
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
- }
-
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS reshaped matrix
- LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
-
- // Accumulate
- ARM_DOT_K0XN0(1, a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(1, a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(1, a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(1, a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(1, a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(1, a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(1, a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(1, a7, b, c7);
-#endif // M0 > 7
-
- lhs_offset += sizeof(DATA_TYPE);
- rhs_offset += sizeof(DATA_TYPE);
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
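-
- // Note: with -DBROADCAST_BIAS each work-item loads a single N0-wide slice of the bias row
- // and adds it to every one of its M0 output rows; otherwise a full M0xN0 tile of the bias
- // matrix is loaded and added element-wise.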
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
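-
-/** Reference sketch, for reading purposes only: a minimal, unoptimized kernel computing the
- * same product as gemm_mm_reshaped_only_rhs_t above, on plain row-major F32 buffers without
- * the RHS reshape, bias, activation or 3D reinterpretation. The kernel name and parameters
- * are illustrative and are not part of the library.
- */
-__kernel void gemm_mm_naive_example(__global const float *lhs,
-                                    __global const float *rhs,
-                                    __global float       *dst,
-                                    const uint            M,
-                                    const uint            N,
-                                    const uint            K,
-                                    const float           alpha)
-{
-    const uint x = get_global_id(0); // Output column
-    const uint y = get_global_id(1); // Output row
-
-    if(x >= N || y >= M)
-    {
-        return;
-    }
-
-    float acc = 0.0f;
-
-    // Plain dot product along K. The kernel above computes the same sums, but K0 elements at
-    // a time with fma and a blocked RHS layout for better memory access patterns.
-    for(uint k = 0; k < K; ++k)
-    {
-        acc = fma(lhs[y * K + k], rhs[k * N + x], acc);
-    }
-
-    dst[y * N + x] = alpha * acc;
-}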
-
-#if defined(OPENCL_IMAGE_SUPPORT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image
- * The LHS matrix is NOT reshaped
- * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
- *
- * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
- * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
- * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
- * Since a 3D image cannot be created from a buffer, the third dimension may be collapsed into the second dimension, so RHS_HEIGHT
- * can differ from the value returned by get_image_height(rhs_img).
- * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
- * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
- * - N0 = 4, 8, 16
- * - K0 = 4, 8, 16
- * - H0 >= 1
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
- *
- * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32
- * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
- * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
- * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
- __read_only image2d_t rhs_img,
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
-
-#define LEFTOVER_K (K % K0)
-
- // Block size
-#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X (PIXEL_UNIT * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (PIXEL_UNIT)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
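-
- // Note: image reads return whole pixels (e.g. four DATA_TYPE elements per pixel for an RGBA
- // float image), so PIXEL_UNIT converts the K0 element count into pixels and every X
- // coordinate on rhs_img below is expressed in pixel units rather than elements.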
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
- const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
-#else // defined(MATRIX_B_DEPTH)
- const uint z_rhs = get_global_id(2);
-#endif // defined(MATRIX_B_DEPTH)
-
- // Compute RHS matrix coordinates
- uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
- const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS matrix stored in a cl_image
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
- LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
- // Accumulate
- ARM_DOT_K0XN0(K0, a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(K0, a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(K0, a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(K0, a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(K0, a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(K0, a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(K0, a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(K0, a7, b, c7);
-#endif // M0 > 7
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
- }
-
-#if LEFTOVER_K != 0
- // Note: We cannot read out-of-bound elements from the RHS matrix because
- // the RHS width is always a multiple of K0. This is not true for the LHS matrix
-
- union UNION_VEC_TYPE
- {
- DATA_TYPE s[K0];
- VEC_DATA_TYPE(DATA_TYPE, K0)
- v;
- };
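-
- // The union lets the leftover loop below fill the K0-wide LHS vectors one scalar element at
- // a time (through .s[k]), while the zero-initialized tail lanes keep the full-K0
- // ARM_DOT_K0XN0 accumulation exact when K is not a multiple of K0.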
-
- union UNION_VEC_TYPE a0 = {.v = 0 };
-#if M0 > 1
- union UNION_VEC_TYPE a1 = {.v = 0 };
-#endif // M0 > 1
-#if M0 > 2
- union UNION_VEC_TYPE a2 = {.v = 0 };
-#endif // M0 > 2
-#if M0 > 3
- union UNION_VEC_TYPE a3 = {.v = 0 };
-#endif // M0 > 3
-#if M0 > 4
- union UNION_VEC_TYPE a4 = {.v = 0 };
-#endif // M0 > 4
-#if M0 > 5
- union UNION_VEC_TYPE a5 = {.v = 0 };
-#endif // M0 > 5
-#if M0 > 6
- union UNION_VEC_TYPE a6 = {.v = 0 };
-#endif // M0 > 6
-#if M0 > 7
- union UNION_VEC_TYPE a7 = {.v = 0 };
-#endif // M0 > 7
-
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
-
- // Load from RHS matrix
- LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
- // Load from LHS matrix
- for(int k = 0; k < LEFTOVER_K; ++k)
- {
- a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
-#if M0 > 1
- a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
-#endif // M0 > 1
-#if M0 > 2
- a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
-#endif // M0 > 2
-#if M0 > 3
- a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
-#endif // M0 > 3
-#if M0 > 4
- a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
-#endif // M0 > 4
-#if M0 > 5
- a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
-#endif // M0 > 5
-#if M0 > 6
- a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
-#endif // M0 > 6
-#if M0 > 7
- a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
-#endif // M0 > 7
-
- lhs_offset += sizeof(DATA_TYPE);
- }
-
- // Accumulate
- ARM_DOT_K0XN0(K0, a0.v, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(K0, a1.v, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(K0, a2.v, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(K0, a3.v, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(K0, a4.v, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(K0, a5.v, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(K0, a6.v, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(K0, a7.v, b, c7);
-#endif // M0 > 7
-
-#endif // LEFTOVER_K != 0
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,... zout(M0-1)=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef LEFTOVER_K
-#undef PIXEL_UNIT
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
-
-#define VFMA(a, b, c) \
- ({ \
- c = fma(a, b, c); \
- })
-
-#if M0 == 1
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- })
-#elif M0 == 2 // M0 == 2
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- })
-#elif M0 == 3 // M0 == 3
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- })
-#elif M0 == 4 // M0 == 4
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- })
-#elif M0 == 5 // M0 == 5
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- })
-#elif M0 == 6 // M0 == 6
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- })
-#elif M0 == 7 // M0 == 7
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- })
-#elif M0 == 8 // M0 == 8
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
- })
-#else // M0 not supported
-#error "M0 not supported"
-#endif // M0 not supported
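-
-// In short, VFMA_M0xN0(i, a, b, c) performs a rank-1 update of the M0xN0 accumulator tile:
-// element i of each of the M0 LHS row vectors is broadcast across an N0-wide vector and
-// multiply-accumulated with the RHS row b into the corresponding accumulator row of c.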
-
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
- * The LHS matrix is NOT reshaped
- * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
- *
- * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
- * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
- * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
- * - N0 = 2, 3, 4, 8, 16
- * - K0 = 2, 3, 4, 8, 16
- * - H0 >= 1
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
- *
- * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
- * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
- * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
- * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
- * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
- * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
- * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (N0)
-#define RHS_STEP_X ((N0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (N0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
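-
- // Note: unlike the transposed variant above, the reshaped RHS blocks here keep their
- // original row order, so each row is N0 elements wide and RHS_OFFSET_X/RHS_STEP_X are
- // expressed in multiples of N0 instead of K0.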
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
- // Compute RHS reshaped matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zeroF=0;
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-
- // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(0, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(1, a, b0, c);
-#if K0 > 2
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(2, a, b0, c);
-#endif // K0 > 2
-#if K0 > 3
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(3, a, b0, c);
-#endif // K0 > 3
-#if K0 > 4
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(4, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(5, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(6, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(7, a, b0, c);
-#endif // K0 > 4
-#if K0 > 8
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(8, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(9, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(A, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(B, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(C, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(D, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(E, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(F, a, b0, c);
-#endif // K0 > 8
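- // Note: the single-letter arguments A..F passed to VFMA_M0xN0 above are hex
- // digits (10..15); the macro token-pastes them into the .sA...sF component
- // selectors to pick the k-th element of each LHS row.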
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
- }
-
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
-#if M0 > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
-#endif // M0 > 1
-#if M0 > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
-#endif // M0 > 2
-#if M0 > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
-#endif // M0 > 3
-#if M0 > 4
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
-#endif // M0 > 4
-#if M0 > 5
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
-#endif // M0 > 5
-#if M0 > 6
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
-#endif // M0 > 6
-#if M0 > 7
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
-#endif // M0 > 7
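- // (The scalar LHS element loaded above is implicitly broadcast to a 2-vector;
- // only the .s0 lane is consumed by VFMA_M0xN0 below.)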
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(0, a, b0, c);
-
- lhs_offset += sizeof(DATA_TYPE);
- rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
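- // (cond_y is 'y == 0' because COMPUTE_M0_START_ROW shifts every row block so
- // that the first block absorbs the PARTIAL_STORE_M0 leftover rows; cond_x
- // flags the right-most column block, which may hold only PARTIAL_STORE_N0
- // valid columns.)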
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef RHS_STEP_LOOP
-}
-
-#if defined(OPENCL_IMAGE_SUPPORT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
- * The LHS matrix is NOT reshaped
- * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
- *
- * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
- * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
- * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 - * Since a 3D image cannot be created from a buffer, the third dimension may have been collapsed into the second, so RHS_HEIGHT
 - * can differ from the value returned by get_image_height(rhs_img).
- * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
- * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
- * - N0 = 4, 8, 16
- * - K0 = 4, 8, 16
- * - H0 >= 1
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is applied after the bias addition
- * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
- *
- * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32
- * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
- * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
- * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
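-// Example build options for this kernel (illustrative values only, any valid
-// configuration from the notes above works):
-//   -DDATA_TYPE=float -DM=64 -DN=32 -DK=128 -DM0=4 -DN0=4 -DK0=4 -DH0=2
-//   -DRHS_HEIGHT=32 -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0 -DOPENCL_IMAGE_SUPPORT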
-__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
- __read_only image2d_t rhs_img,
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
-
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (PIXEL_UNIT)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
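- // Worked example (illustrative, assuming an RGBA image where one pixel holds
- // 4 elements): with N0=4, PIXEL_UNIT=1, so with H0=2 and -DRHS_INTERLEAVE the
- // per-K step is RHS_STEP_X=2 pixels; without it, RHS_STEP_X=1 pixel.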
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- const uint z_rhs = (z % MATRIX_B_DEPTH);
-#else // defined(MATRIX_B_DEPTH)
- const uint z_rhs = z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Compute RHS matrix coordinates
- uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
- const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
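- // Worked example (illustrative): with H0=2 and RHS_HEIGHT=32, the block at
- // x=5 reads from x_rhs = (5 % 2) * RHS_OFFSET_X and y_rhs = 5 / 2 + z_rhs * 32.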
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-
- // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(0, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(1, a, b0, c);
-#if K0 > 2
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(2, a, b0, c);
-#endif // K0 > 2
-#if K0 > 3
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(3, a, b0, c);
-#endif // K0 > 3
-#if K0 > 4
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(4, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(5, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(6, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(7, a, b0, c);
-#endif // K0 > 4
-#if K0 > 8
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(8, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(9, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(A, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(B, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(C, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(D, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(E, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(F, a, b0, c);
-#endif // K0 > 8
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
- }
-
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
-#if M0 > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
-#endif // M0 > 1
-#if M0 > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
-#endif // M0 > 2
-#if M0 > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
-#endif // M0 > 3
-#if M0 > 4
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
-#endif // M0 > 4
-#if M0 > 5
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
-#endif // M0 > 5
-#if M0 > 6
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
-#endif // M0 > 6
-#if M0 > 7
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
-#endif // M0 > 7
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
-
- VFMA_M0xN0(0, a, b0, c);
-
- lhs_offset += sizeof(DATA_TYPE);
- x_rhs += RHS_STEP_X;
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef RHS_STEP_LOOP
-#undef PIXEL_UNIT
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
-
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
-
-#if defined(MIXED_PRECISION)
-#if K0 == 2
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- })
-#elif K0 == 3 // K0 == 3
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- c += a.s2 * b.s2; \
- })
-#elif K0 == 4 // K0 == 4
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- c += a.s2 * b.s2; \
- c += a.s3 * b.s3; \
- })
-#elif K0 == 8 // K0 == 8
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- c += a.s2 * b.s2; \
- c += a.s3 * b.s3; \
- c += a.s4 * b.s4; \
- c += a.s5 * b.s5; \
- c += a.s6 * b.s6; \
- c += a.s7 * b.s7; \
- })
-#elif K0 == 16 // K0 == 16
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- c += a.s2 * b.s2; \
- c += a.s3 * b.s3; \
- c += a.s4 * b.s4; \
- c += a.s5 * b.s5; \
- c += a.s6 * b.s6; \
- c += a.s7 * b.s7; \
- c += a.s8 * b.s8; \
- c += a.s9 * b.s9; \
- c += a.sA * b.sA; \
- c += a.sB * b.sB; \
- c += a.sC * b.sC; \
- c += a.sD * b.sD; \
- c += a.sE * b.sE; \
- c += a.sF * b.sF; \
- })
-#else // K0 not supported
-#error "K0 value not supported"
-#endif // K0 conditions
-#else // defined(MIXED_PRECISION)
-#if K0 == 2
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- })
-#elif K0 == 3 // K0 == 3
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- c = fma(a.s2, b.s2, c); \
- })
-#elif K0 == 4 // K0 == 4
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- c = fma(a.s2, b.s2, c); \
- c = fma(a.s3, b.s3, c); \
- })
-#elif K0 == 8 // K0 == 8
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- c = fma(a.s2, b.s2, c); \
- c = fma(a.s3, b.s3, c); \
- c = fma(a.s4, b.s4, c); \
- c = fma(a.s5, b.s5, c); \
- c = fma(a.s6, b.s6, c); \
- c = fma(a.s7, b.s7, c); \
- })
-#elif K0 == 16 // K0 == 16
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- c = fma(a.s2, b.s2, c); \
- c = fma(a.s3, b.s3, c); \
- c = fma(a.s4, b.s4, c); \
- c = fma(a.s5, b.s5, c); \
- c = fma(a.s6, b.s6, c); \
- c = fma(a.s7, b.s7, c); \
- c = fma(a.s8, b.s8, c); \
- c = fma(a.s9, b.s9, c); \
- c = fma(a.sA, b.sA, c); \
- c = fma(a.sB, b.sB, c); \
- c = fma(a.sC, b.sC, c); \
- c = fma(a.sD, b.sD, c); \
- c = fma(a.sE, b.sE, c); \
- c = fma(a.sF, b.sF, c); \
- })
-#else // K0 not supported
-#error "K0 value not supported"
-#endif // K0 conditions
-#endif // defined(MIXED_PRECISION)
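-// Example expansion (assuming K0=4 and no -DMIXED_PRECISION): ARM_DOT_K0(a0, b0, c0.s0)
-// becomes four sequential statements of the form c0.s0 = fma(a0.sK, b0.sK, c0.s0),
-// i.e. a length-4 dot product accumulated into one scalar.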
-
-#if defined(ARM_DOT_K0XN0)
-#undef ARM_DOT_K0XN0
-#endif // defined(ARM_DOT_K0XN0)
-
-#if N0 == 2
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- })
-#elif N0 == 3 // N0 == 3
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- ARM_DOT_K0((a), (b##2), (c.s2)); \
- })
-#elif N0 == 4 // N0 == 4
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- ARM_DOT_K0((a), (b##2), (c.s2)); \
- ARM_DOT_K0((a), (b##3), (c.s3)); \
- })
-#elif N0 == 8 // N0 == 8
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- ARM_DOT_K0((a), (b##2), (c.s2)); \
- ARM_DOT_K0((a), (b##3), (c.s3)); \
- ARM_DOT_K0((a), (b##4), (c.s4)); \
- ARM_DOT_K0((a), (b##5), (c.s5)); \
- ARM_DOT_K0((a), (b##6), (c.s6)); \
- ARM_DOT_K0((a), (b##7), (c.s7)); \
- })
-#elif N0 == 16 // N0 == 16
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- ARM_DOT_K0((a), (b##2), (c.s2)); \
- ARM_DOT_K0((a), (b##3), (c.s3)); \
- ARM_DOT_K0((a), (b##4), (c.s4)); \
- ARM_DOT_K0((a), (b##5), (c.s5)); \
- ARM_DOT_K0((a), (b##6), (c.s6)); \
- ARM_DOT_K0((a), (b##7), (c.s7)); \
- ARM_DOT_K0((a), (b##8), (c.s8)); \
- ARM_DOT_K0((a), (b##9), (c.s9)); \
- ARM_DOT_K0((a), (b##A), (c.sA)); \
- ARM_DOT_K0((a), (b##B), (c.sB)); \
- ARM_DOT_K0((a), (b##C), (c.sC)); \
- ARM_DOT_K0((a), (b##D), (c.sD)); \
- ARM_DOT_K0((a), (b##E), (c.sE)); \
- ARM_DOT_K0((a), (b##F), (c.sF)); \
- })
-#else // N0 not supported
-#error "N0 value not supported"
-#endif // N0 conditions
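-// Example (assuming N0=4): ARM_DOT_K0XN0(a0, b, c0) dots the LHS row a0 against
-// the four RHS rows b0..b3, writing one scalar per accumulator component c0.s0..c0.s3.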
-
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 - * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed
 - * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
- * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
- * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
- * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
- * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 - * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 2, 3, 4, 5, 6, 7, 8
- * - N0 = 2, 3, 4, 8, 16
- * - K0 = 2, 3, 4, 8, 16
- * - V0 >= 1
- * - H0 >= 1
- *
 - * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
 - * The activation function is applied after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
- *
- * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
- * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
- * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
- * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
- * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
- * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
- * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
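-// Example build options for this kernel (illustrative values only):
-//   -DDATA_TYPE=half -DDATA_TYPE_ACCUMULATOR=float -DMIXED_PRECISION
-//   -DM=64 -DN=32 -DK=128 -DM0=4 -DN0=4 -DK0=4 -DV0=2 -DH0=2
-//   -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0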
-__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint k,
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (K0)
-#define LHS_STEP_X ((K0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (K0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(LHS_INTERLEAVE)
-
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
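- // Worked example (illustrative): with M0=4, K0=4, V0=2, H0=2 and both
- // -DLHS_INTERLEAVE and -DRHS_INTERLEAVE, LHS_OFFSET_X=4 / LHS_STEP_X=8 and
- // RHS_OFFSET_X=4 / RHS_STEP_X=8: consecutive K0 slices of a block alternate
- // with their V0/H0 neighbours instead of being stored back to back.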
-
-#if defined(DUMMY_WORK_ITEMS)
- if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
- (get_global_id(2) * lhs_stride_z);
-
- // Compute RHS matrix address
- __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_addr += get_global_id(2) * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
- for(int i = 0; i < k; i += K0)
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
-
- // Accumulate
- ARM_DOT_K0XN0(a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(a7, b, c7);
-#endif // M0 > 7
-
- lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
- rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += get_global_id(2) * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
-#if defined(MIXED_PRECISION)
- CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
- ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
-#else // defined(MIXED_PRECISION)
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-#endif // defined(MIXED_PRECISION)
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
-#if defined(MIXED_PRECISION)
- CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
- ADD_BLOCK(M0, c, bias_hp);
-#else // defined(MIXED_PRECISION)
- ADD_BLOCK(M0, c, bias);
-#endif // defined(MIXED_PRECISION)
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
-#if defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
-#else // defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(MIXED_PRECISION)
-#endif // defined(ACTIVATION_TYPE)
-
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
- // Store output block
-#if defined(MIXED_PRECISION)
- CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-#else // defined(MIXED_PRECISION)
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-#endif // defined(MIXED_PRECISION)
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-
-#if defined(OPENCL_IMAGE_SUPPORT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
 - * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed
 - * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed
- *
- * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
- * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
- * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
- * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 - * Since a 3D image cannot be created from a buffer, the third dimension may have been collapsed into the second, so RHS_HEIGHT
 - * can differ from the value returned by get_image_height(rhs_img).
- * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
- * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 - * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 2, 3, 4, 5, 6, 7, 8
- * - N0 = 4, 8, 16
- * - K0 = 4, 8, 16
- * - V0 >= 1
- * - H0 >= 1
- *
 - * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
 - * The activation function is applied after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
- *
- * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32
- * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
- * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
- * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
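-// Example build options (illustrative): -DDATA_TYPE=float -DDATA_TYPE_ACCUMULATOR=float
-//   -DM=64 -DN=32 -DK=128 -DM0=4 -DN0=4 -DK0=4 -DV0=2 -DH0=2
-//   -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0 -DRHS_HEIGHT=32 -DOPENCL_IMAGE_SUPPORT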
-__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
- __read_only image2d_t rhs_img,
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint k,
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
-
- // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (K0)
-#define LHS_STEP_X ((K0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (K0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(LHS_INTERLEAVE)
-
- // Block size
-#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X (PIXEL_UNIT * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (PIXEL_UNIT)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
-#if defined(DUMMY_WORK_ITEMS)
- if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
- (get_global_id(2) * lhs_stride_z);
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
-#else // defined(MATRIX_B_DEPTH)
- const uint z_rhs = get_global_id(2);
-#endif // defined(MATRIX_B_DEPTH)
-
- // Compute RHS matrix coordinates
- uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
- const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
- for(int i = 0; i < K; i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
-
- // Load values from RHS matrix stored in a cl_image
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
- LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
- // Accumulate
- ARM_DOT_K0XN0(a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(a7, b, c7);
-#endif // M0 > 7
-
- lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
-
- x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += get_global_id(2) * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
-#if defined(MIXED_PRECISION)
- CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
- ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
-#else // defined(MIXED_PRECISION)
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-#endif // defined(MIXED_PRECISION)
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
-#if defined(MIXED_PRECISION)
- CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
- ADD_BLOCK(M0, c, bias_hp);
-#else // defined(MIXED_PRECISION)
- ADD_BLOCK(M0, c, bias);
-#endif // defined(MIXED_PRECISION)
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
-#if defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
-#else // defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(MIXED_PRECISION)
-#endif // defined(ACTIVATION_TYPE)
-
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
- // Store output block
-#if defined(MIXED_PRECISION)
- CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-#else // defined(MIXED_PRECISION)
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-#endif // defined(MIXED_PRECISION)
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef PIXEL_UNIT
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
-
-#if defined(LHS_TRANSPOSE)
-
-#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
-
-#if defined(MIXED_PRECISION)
-
-#if(GPU_ARCH == GPU_ARCH_MIDGARD)
-#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
-#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
-
-#else // defined(MIXED_PRECISION)
-
-#if(GPU_ARCH == GPU_ARCH_MIDGARD)
-#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
-#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
-
-#endif // defined(MIXED_PRECISION)
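-// Example (assuming N0=4 and a non-Midgard GPU_ARCH): ARM_VFMA(4, a, b, c) maps to a
-// single 4-wide fma (with converts to DATA_TYPE_ACCUMULATOR when -DMIXED_PRECISION is
-// set); on Midgard it falls back to the equivalent multiply-add c += a * b.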
-
-#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
- })
-#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
- })
-#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
- })
-#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
- })
-#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
- })
-
-// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
-// a is the column-vector (transposed)
-// b is the row-vector (not transposed)
-// C is the output matrix
-// Lower case is a vector (a, b)
-// Upper case is a matrix (C)
-#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
-
-#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
- })
-#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
- })
-#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
- })
-#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
- })
-#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
- })
-#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
- })
-
-// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
-// The dimensions for this matrix multiplications are defined through M0, N0 and K0
-// The dimensions supported are:
-// M0: 1, 2, 3, 4, 8
-// N0: 1, 2, 3, 4, 8, 16
-// K0: 1, 2, 3, 4, 8, 16
-// This macro calls the vector-by-matrix macro K0 times
-// A, B and C are matrices
-#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
- CONCAT(ARM_MM_T_NT_M0xN0x, K0) \
- (M0, N0, TYPE, A, B, C)
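-// Example expansion (assuming M0=4, N0=4, K0=4): ARM_MM_T_NT(4, 4, 4, float, a, b, c)
-// selects ARM_MM_T_NT_M0xN0x4, i.e. four rank-1 updates: for each k in 0..3 the
-// components of the column vector a##k scale the row vector b##k and accumulate
-// into the accumulator rows c0..c3.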
-
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 - * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must be transposed
 - * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must NOT be transposed
- *
- * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
- * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
- * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
- * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 - * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 2, 3, 4, 8
- * - N0 = 2, 3, 4, 8, 16
- * - K0 = 2, 3, 4, 8, 16
- * - V0 >= 1
- * - H0 >= 1
- *
 - * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
 - * The activation function is applied after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
- *
- * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
- * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
- * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
- * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
- * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
- * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
- * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint k,
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (M0)
-#define LHS_STEP_X ((M0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (M0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(LHS_INTERLEAVE)
-
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (N0)
-#define RHS_STEP_X ((N0) * (H0))
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (N0)
-#endif // defined(RHS_INTERLEAVE)
-
- const uint x = get_global_id(0);
- const uint y = get_global_id(1);
- const uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
-
- // Compute RHS matrix address
- __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_addr += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
- __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
- __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
-
- for(int i = 0; i < k; i += K0)
- {
- VEC_DATA_TYPE(DATA_TYPE, M0)
- a0;
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
-#if K0 > 1
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 1
-
-#if K0 > 2
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 2
-
-#if K0 > 3
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 3
-
-#if K0 > 4
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 4
-
-#if K0 > 8
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 8
-
-#ifndef LHS_INTERLEAVE
- lhs += (M0 * K0 * (V0 - 1));
-#endif // LHS_INTERLEAVE
-
-#ifndef RHS_INTERLEAVE
- rhs += (N0 * K0 * (H0 - 1));
-#endif // RHS_INTERLEAVE
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
-#if defined(MIXED_PRECISION)
- CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
- ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
-#else // defined(MIXED_PRECISION)
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-#endif // defined(MIXED_PRECISION)
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-#if defined(MIXED_PRECISION)
- CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
- ADD_BLOCK(M0, c, bias_hp);
-#else // defined(MIXED_PRECISION)
- ADD_BLOCK(M0, c, bias);
-#endif // defined(MIXED_PRECISION)
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
-#if defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
-#else // defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(MIXED_PRECISION)
-#endif // defined(ACTIVATION_TYPE)
-
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
- // Store output block
-#if defined(MIXED_PRECISION)
- CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-#else // defined(MIXED_PRECISION)
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-#endif // defined(MIXED_PRECISION)
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-
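As a quick orientation for the long list of compile-time options documented above, the following host-side sketch shows how they might be assembled. The -D names are taken from the kernel documentation; the numeric values, and the cl_program/cl_device_id variables, are illustrative assumptions only and must match how the LHS/RHS tensors were actually reshaped:

/* Hedged host-side sketch (illustrative values; assumes an existing
 * cl_program "program" and cl_device_id "device"). */
const char *build_opts =
    "-DDATA_TYPE=float -DDATA_TYPE_ACCUMULATOR=float "
    "-DM=52 -DN=90 -DK=24 "
    "-DM0=4 -DN0=8 -DK0=4 -DV0=2 -DH0=2 "
    "-DLHS_TRANSPOSE -DLHS_INTERLEAVE -DRHS_INTERLEAVE "
    "-DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=2"; // 52 % 4 = 0, 90 % 8 = 2
cl_int err = clBuildProgram(program, 1, &device, build_opts, NULL, NULL);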
-#if defined(OPENCL_IMAGE_SUPPORT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
- * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must be transposed
- * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must NOT be transposed
- *
- * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
- * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
- * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
- * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
- * Since a 3D image cannot be created from a buffer, the third dimension may be collapsed into the second, so RHS_HEIGHT
- * can differ from the value returned by get_image_height(rhs_img).
- * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
- * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
- * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 2, 3, 4, 8
- * - N0 = 4, 8, 16
- * - K0 = 4, 8, 16
- * - V0 >= 1
- * - H0 >= 1
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of columns of the LHS matrix NOT reshaped
- *
- * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32
- * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
- * @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
- * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
- __read_only image2d_t rhs_img,
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint k,
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
-
- // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (M0)
-#define LHS_STEP_X ((M0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (M0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(LHS_INTERLEAVE)
-
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (PIXEL_UNIT)
-#endif // defined(RHS_INTERLEAVE)
-
- const uint x = get_global_id(0);
- const uint y = get_global_id(1);
- const uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- const uint z_rhs = (z % MATRIX_B_DEPTH);
-#else // defined(MATRIX_B_DEPTH)
- const uint z_rhs = z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Compute RHS matrix coordinates
- uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
- const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
- __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
-
- for(int i = 0; i < K; i += K0)
- {
- VEC_DATA_TYPE(DATA_TYPE, M0)
- a0;
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
-#if K0 > 1
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 1
-
-#if K0 > 2
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 2
-
-#if K0 > 3
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 3
-
-#if K0 > 4
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 4
-
-#if K0 > 8
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 8
-
-#ifndef LHS_INTERLEAVE
- lhs += (M0 * K0 * (V0 - 1));
-#endif // LHS_INTERLEAVE
-
- x_rhs += K0 * RHS_STEP_X;
-#ifndef RHS_INTERLEAVE
- x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
-#endif // RHS_INTERLEAVE
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
-#if defined(MIXED_PRECISION)
- CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
- ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
-#else // defined(MIXED_PRECISION)
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-#endif // defined(MIXED_PRECISION)
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-#if defined(MIXED_PRECISION)
- CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
- ADD_BLOCK(M0, c, bias_hp);
-#else // defined(MIXED_PRECISION)
- ADD_BLOCK(M0, c, bias);
-#endif // defined(MIXED_PRECISION)
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
-#if defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
-#else // defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(MIXED_PRECISION)
-#endif // defined(ACTIVATION_TYPE)
-
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
- // Store output block
-#if defined(MIXED_PRECISION)
- CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-#else // defined(MIXED_PRECISION)
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-#endif // defined(MIXED_PRECISION)
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef PIXEL_UNIT
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
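The PIXEL_UNIT bookkeeping in the texture variant follows from how OpenCL images pack data: each RGBA texel of a CL_FLOAT image carries four floats, so an N0-wide row of the reshaped RHS spans N0 / 4 texels on one image row. A hedged sketch of the idea (variable names illustrative; sampler-less reads with integer coordinates are available from OpenCL 1.2 onwards):

// One texel read returns four packed floats.
float4 texel = read_imagef(rhs_img, (int2)(x_rhs, y_rhs));
// For N0 = 8 (PIXEL_UNIT = 2), two adjacent texels form one float8 row:
float8 b0 = (float8)(read_imagef(rhs_img, (int2)(x_rhs + 0, y_rhs)),
                     read_imagef(rhs_img, (int2)(x_rhs + 1, y_rhs)));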
-
-#endif // defined(LHS_TRANSPOSE)
-
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)
-
-#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
-
-#define VFMA(a, b, c) \
- ({ \
- c = fma(a, b, c); \
- })
-
-#if M0 == 1
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- })
-#elif M0 == 2 // M0 == 2
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- })
-#elif M0 == 3 // M0 == 3
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- })
-#elif M0 == 4 // M0 == 4
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- })
-#elif M0 == 5 // M0 == 5
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- })
-#elif M0 == 6 // M0 == 6
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- })
-#elif M0 == 7 // M0 == 7
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- })
-#elif M0 == 8 // M0 == 8
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
- })
-#else // M0 not supported
-#error "M0 not supported"
-#endif // M0 not supported
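For reference, a hedged hand expansion of the RHS_VFMA_M0xN0 ladder above: element i of every LHS row is broadcast across the N0-wide RHS row and accumulated with a fused multiply-add. Illustrative only, for M0 = 2, N0 = 4 and float data:

// RHS_VFMA_M0xN0(0, a, b0, c) with M0 = 2, N0 = 4, DATA_TYPE = float:
c0 = fma((float4)(a0.s0), b0, c0); // LHS row 0, element 0, times RHS row b0
c1 = fma((float4)(a1.s0), b0, c1); // LHS row 1, element 0, times RHS row b0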
-
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
- * The LHS matrix is NOT reshaped
- * The RHS matrix is NOT reshaped
- *
- * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
- * @note The number of columns of the LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
- * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
- * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)
- * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
- * - N0 = 2, 3, 4, 8, 16
- * - K0 = 2, 3, 4, 8, 16
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of columns of the LHS matrix
- *
- * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
- * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
- * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
- * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)
- * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)
- * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)
- * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
- // Compute RHS matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
-
- RHS_VFMA_M0xN0(0, a, b0, c);
- RHS_VFMA_M0xN0(1, a, b1, c);
-#if K0 > 2
- RHS_VFMA_M0xN0(2, a, b2, c);
-#endif // K0 > 2
-#if K0 > 3
- RHS_VFMA_M0xN0(3, a, b3, c);
-#endif // K0 > 3
-#if K0 > 4
- RHS_VFMA_M0xN0(4, a, b4, c);
- RHS_VFMA_M0xN0(5, a, b5, c);
- RHS_VFMA_M0xN0(6, a, b6, c);
- RHS_VFMA_M0xN0(7, a, b7, c);
-#endif // K0 > 4
-#if K0 > 8
- RHS_VFMA_M0xN0(8, a, b8, c);
- RHS_VFMA_M0xN0(9, a, b9, c);
- RHS_VFMA_M0xN0(A, a, bA, c);
- RHS_VFMA_M0xN0(B, a, bB, c);
- RHS_VFMA_M0xN0(C, a, bC, c);
- RHS_VFMA_M0xN0(D, a, bD, c);
- RHS_VFMA_M0xN0(E, a, bE, c);
- RHS_VFMA_M0xN0(F, a, bF, c);
-#endif // K0 > 8
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- rhs_offset += K0 * rhs_stride_y;
- }
-
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
-#if M0 > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
-#endif // M0 > 1
-#if M0 > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
-#endif // M0 > 2
-#if M0 > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
-#endif // M0 > 3
-#if M0 > 4
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
-#endif // M0 > 4
-#if M0 > 5
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
-#endif // M0 > 5
-#if M0 > 6
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
-#endif // M0 > 6
-#if M0 > 7
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
-#endif // M0 > 7
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
- RHS_VFMA_M0xN0(0, a, b, c);
-
- lhs_offset += sizeof(DATA_TYPE);
- rhs_offset += rhs_stride_y;
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
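The loop structure of gemm_mm_native is the classic blocked-K pattern: full K0-wide steps followed by a scalar tail for the K % K0 leftovers. A hedged scalar restatement for a single output element (variable names lhs, rhs, row, col, acc are illustrative assumptions, with row-major layouts):

// Scalar restatement of the blocked-K iteration (illustrative only).
float acc = 0.0f;
int i = 0;
for (; i <= (K - K0); i += K0)     // full K0-wide steps
    for (int j = 0; j < K0; ++j)
        acc = fma(lhs[row * K + i + j], rhs[(i + j) * N + col], acc);
for (; i < K; ++i)                 // leftover columns when K % K0 != 0
    acc = fma(lhs[row * K + i], rhs[i * N + col], acc);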
-
-#if defined(BETA)
-/** This OpenCL kernel performs the in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta.
- *
- * @note The beta value needs to be passed at compile time using -DBETA
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- // Load values from A x B
- float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
-
- // Load values from Matrix C
- float4 c = vload4(0, (__global float *)src.ptr);
-
- // Computes alpha * axb + beta * c
- float4 out = alpha_ab + (float4)BETA * c;
-
- // Store final result in axb matrix
- vstore4(out, 0, (__global float *)dst.ptr);
-}
-
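Each gemm_ma_f32 work-item therefore applies out = alpha_ab + BETA * c to four consecutive floats of the destination in place. A hedged scalar equivalent (dst and src pointers are illustrative assumptions):

// Scalar equivalent of one gemm_ma_f32 work-item (illustrative only).
// dst already holds alpha * (A x B); src holds the matrix C.
for (int j = 0; j < 4; ++j)
    dst[j] = dst[j] + BETA * src[j];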
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-/** This OpenCL kernel performs the in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta.
- *
- * @note The beta value needs to be passed at compile time using -DBETA
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- // Load values from A x B
- half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
-
- // Load values from Matrix C
- half8 c = vload8(0, (__global half *)src.ptr);
-
- // Computes alpha * axb + beta * c
- half8 out = alpha_ab + (half8)BETA * c;
-
- // Store final result in axb matrix
- vstore8(out, 0, (__global half *)dst.ptr);
-}
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-#endif // defined(BETA)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h
index 3bbd243ff5..4bef02314f 100644
--- a/src/core/CL/cl_kernels/gemm_helpers.h
+++ b/src/core/CL/cl_kernels/gemm_helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,17 +31,17 @@
 * @param[in] offset The offset within the vector. Offset can only be of the same size as the OpenCL vector (2,3,4,8,16)
* @param[in] n0 The number of consecutive columns to access. n0 + offset must be <= 16
* @param[in] x Vector to access
- * @{
+ *
*/
#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
-#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
+#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
// offset == 0
-#define scalar_access_0_1(x) ((x).s0)
-#define scalar_access_0_2(x) ((x).s01)
-#define scalar_access_0_3(x) ((x).s012)
-#define scalar_access_0_4(x) ((x).s0123)
-#define scalar_access_0_8(x) ((x).s01234567)
+#define scalar_access_0_1(x) ((x).s0)
+#define scalar_access_0_2(x) ((x).s01)
+#define scalar_access_0_3(x) ((x).s012)
+#define scalar_access_0_4(x) ((x).s0123)
+#define scalar_access_0_8(x) ((x).s01234567)
#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)
// offset == 1
@@ -100,8 +100,7 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
- ({})
+#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) ({})
#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
@@ -186,8 +185,10 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
-#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_TENSOR
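The LOAD_TENSOR_STR / LOAD_TENSOR pair reformatted above is the standard two-level token-pasting trick: the extra macro level forces M0 to be expanded before pasting, so callers may pass a macro rather than a literal. A minimal, self-contained illustration of the pattern (PASTE names are hypothetical, not from this header):

#define PASTE_STR(a, b) a##b        // pastes its arguments verbatim
#define PASTE(a, b) PASTE_STR(a, b) // expands the arguments first, then pastes
#define M0 2
// PASTE(ROW_, M0) expands to ROW_2, while PASTE_STR(ROW_, M0) would
// produce the unwanted token ROW_M0.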
/** Load 2D tensor (consecutive rows and columns) with Z offset.
@@ -202,8 +203,7 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
- ({})
+#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) ({})
#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
@@ -279,8 +279,11 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group LOAD_TENSOR_M0XN0
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
* @name LOAD_ROW_n
@@ -394,10 +397,323 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
-#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_BLOCK
+/** Partially load the 0 to (n-1)th rows of the given variables
+ * @name LOAD_ROW_PARTIAL_n
+ * Within each row, load the lower @p LOAD_N0 elements of vectors of width @p N0
+ *
+ * @note in case @p LOAD_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
+ *
+ * @param[in] N0 The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] LOAD_N0 The **lower** size of the vectors to load. Supported: 1-16 and <= @p N0
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
+
+#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
+
+#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
+
+#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
+
+#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
+
+#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
+
+#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
+
+#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
+
+#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
+
+#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
+
+#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
+
+#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
+
+#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
+
+#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
+
+#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
+
+#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
+/** @} */ // end of group LOAD_ROW_PARTIAL_n
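To see what a partial row load amounts to, here is a hedged expansion of LOAD_ROW_PARTIAL_2(4, 3, float, c, ptr, offset, stride_y, zin), assuming VLOAD_PARTIAL(N0, LOAD_N0) narrows the load to the lower LOAD_N0 lanes:

// Only the lower 3 lanes of each float4 row vector are filled:
c0.s012 = vload3(0, (__global float *)(ptr + offset + 0 * stride_y + zin0));
c1.s012 = vload3(0, (__global float *)(ptr + offset + 1 * stride_y + zin1));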
+
+/** Partially load a block of the given size LOAD_M0xLOAD_N0
+ * @name LOAD_BLOCK_PARTIAL
+ *
+ * @note The vector width @p N0 is also required for correct partial loading behaviour.
+ * @note in case @p LOAD_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
+ *
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for LOAD_M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for LOAD_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] LOAD_M0 The number of rows to load. Supported: 1-16
+ * @param[in] LOAD_N0   The number of elements of each vector to actually load. Supported: 1-16 and <= @p N0
+ * @param[in] N0 The size of each vector. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+/** Load a block that can be partial in both x and y dimensions
+ *
+ * @note In case @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
+ *
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of rows to load, for non-partial blocks. Supported: 1-16
+ * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
+ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
+ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
+ * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
+ */
+#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
+ { \
+ LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ }
+/** Load a block that can only be partial in x but not y.
+ *
+ * @note In case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
+ *
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of rows to load, for non-partial blocks. Supported: 1-16
+ * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
+ * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
+ */
+#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, \
+ PARTIAL_COND_X) \
+ if (!(PARTIAL_COND_X)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ }
+/** Load a block that can only be partial in y but not x.
+ *
+ * @note In case @p N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
+ *
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0               The number of rows to load, for non-partial blocks. Supported: 1-16
+ * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
+ * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
+ */
+#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_COND_Y) \
+ if (!(PARTIAL_COND_Y)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ }
+/** @} */ // end of group LOAD_BLOCK_PARTIAL
+/** Boundary-aware GeMM block load
+ * @name LOAD_BLOCK_BOUNDARY_AWARE
+ * This macro assumes the following schemes to achieve boundary-awareness:
+ * - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
+ * - Non-overlapping (normal) load from rhs tensor. This implies rhs can have paddings.
+ * - Overlapping load in Y axis from bias tensor. This implies bias has no padding along y dim.
+ * The macro then ensures that the src tensor can be loaded without any paddings in both the x and y dimensions.
+ *
+ * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
+ * blocks **at the end**.
+ * Say the src tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
+ * "boundary blocks" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and their various parameters:
+ *
+ * *--x--> x == 0 x == 1
+ * | |<------------------------------N-------------------------->|
+ * y |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
+ * | -------------#############################################################
+ * * | | |...............................|...........................|
+ * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
+ * | | |...............................|...........................|
+ * M --#############################################################
+ * | | | |...........................|
+ * y == 1 | M0 | Non-boundary block |....Boundary block in x....|
+ * | | | |...........................|
+ * |------------#############################################################
+ *
+ * Then @p PARTIAL_STORE_M0 = M % M0 and @p PARTIAL_STORE_N0 = N % N0
+ *
+ * @note In case @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
+ *
+ * It automatically detects if a given M,N,M0,N0 combination can yield partial blocks in either the X or Y dimension,
+ * and selects the corresponding load methods such that the boundary detection logic is only added when needed.
+ *
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * The macro will result in a declaration of @p M0 vectors of size @p N0 with data
+ * type @p DATA_TYPE containing values partially loaded from the specified
+ * address in memory. The remaining (N0 - PARTIAL_STORE_N0) elements will be
+ * filled with zeros.
+ *
+ * @param[in] M0 The number of rows to load, for non-partial blocks. Supported: 1-16
+ * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
+ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
+ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
+ * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
+ * @{
+ */
+#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+// Case1: No partial blocks in either x or y
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+
+#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
+// Case2: Partial blocks in y
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
+ LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
+
+#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
+// Case3: Partial blocks in x
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
+ LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
+
+#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+// Case4: Partial blocks in both x and y
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
+ LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
+
+#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+/** @} */ // end of group LOAD_BLOCK_BOUNDARY_AWARE
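+// Illustrative usage (a sketch under assumed definitions, not the one true form):
+// a kernel tiling an MxN tensor into M0xN0 blocks, built with
+// -DPARTIAL_STORE_M0=M%M0 and -DPARTIAL_STORE_N0=N%N0, could derive the partial
+// conditions from its global ids. Since partial blocks sit at the beginning in y
+// and at the end in x, one plausible choice is:
+//
+//   const bool cond_y = (get_global_id(1) == 0);             // first tile in y
+//   const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);  // last tile in x
+//   LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, float, a, src_addr, 0, src_stride_y, zin,
+//                             PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);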
+
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
* @name LOAD_TEXTURE2D_ROW_n
*
@@ -493,8 +809,10 @@
* @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
* @{
*/
-#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
-#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
+#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+ LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
+#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+ LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
/** @} */ // end of group LOAD_TEXTURE2D
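+// Illustrative usage (a sketch; variable names are hypothetical): read a 2x4
+// float block from an OpenCL image object, stepping one pixel down per row:
+//
+//   LOAD_TEXTURE2D(2, 4, float, a, lhs_img, x_coord, y_coord, 0, 1);
+//   // yields the row vectors a0 and a1, sampled at (x_coord, y_coord) and
+//   // (x_coord, y_coord + 1) respectively.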
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) passing the Y index for each row to be loaded.
@@ -513,7 +831,7 @@
#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##0; \
- if(Y_MASK##0 != 0) \
+ if (Y_MASK##0 != 0) \
BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
else \
BASENAME##0 = 0;
@@ -522,7 +840,7 @@
LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##1; \
- if(Y_MASK##1 != 0) \
+ if (Y_MASK##1 != 0) \
BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
else \
BASENAME##1 = 0;
@@ -531,7 +849,7 @@
LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##2; \
- if(Y_MASK##2 != 0) \
+ if (Y_MASK##2 != 0) \
BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
else \
BASENAME##2 = 0;
@@ -540,7 +858,7 @@
LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##3; \
- if(Y_MASK##3 != 0) \
+ if (Y_MASK##3 != 0) \
BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
else \
BASENAME##3 = 0;
@@ -549,7 +867,7 @@
LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##4; \
- if(Y_MASK##4 != 0) \
+ if (Y_MASK##4 != 0) \
BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
else \
BASENAME##4 = 0;
@@ -558,7 +876,7 @@
LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##5; \
- if(Y_MASK##5 != 0) \
+ if (Y_MASK##5 != 0) \
BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
else \
BASENAME##5 = 0;
@@ -567,7 +885,7 @@
LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##6; \
- if(Y_MASK##6 != 0) \
+ if (Y_MASK##6 != 0) \
BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
else \
BASENAME##6 = 0;
@@ -576,7 +894,7 @@
LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##7; \
- if(Y_MASK##7 != 0) \
+ if (Y_MASK##7 != 0) \
BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
else \
BASENAME##7 = 0;
@@ -585,7 +903,7 @@
LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##8; \
- if(Y_MASK##8 != 0) \
+ if (Y_MASK##8 != 0) \
BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
else \
BASENAME##8 = 0;
@@ -594,7 +912,7 @@
LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##9; \
- if(Y_MASK##9 != 0) \
+ if (Y_MASK##9 != 0) \
BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
else \
BASENAME##9 = 0;
@@ -603,7 +921,7 @@
LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##A; \
- if(Y_MASK##A != 0) \
+ if (Y_MASK##A != 0) \
BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
else \
BASENAME##A = 0;
@@ -612,7 +930,7 @@
LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##B; \
- if(Y_MASK##B != 0) \
+ if (Y_MASK##B != 0) \
BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
else \
BASENAME##B = 0;
@@ -621,7 +939,7 @@
LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##C; \
- if(Y_MASK##C != 0) \
+ if (Y_MASK##C != 0) \
BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
else \
BASENAME##C = 0;
@@ -630,7 +948,7 @@
LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##D; \
- if(Y_MASK##D != 0) \
+ if (Y_MASK##D != 0) \
BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
else \
BASENAME##D = 0;
@@ -639,7 +957,7 @@
LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##E; \
- if(Y_MASK##E != 0) \
+ if (Y_MASK##E != 0) \
BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
else \
BASENAME##E = 0;
@@ -648,10 +966,11 @@
LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##F; \
- if(Y_MASK##F != 0) \
+ if (Y_MASK##F != 0) \
BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
else \
BASENAME##F = 0;
+/** @} */ // end of group LOAD_ROW_INDIRECT_n
/** Load blocks (consecutive rows and columns) with Y offset.
* @name LOAD_BLOCK_INDIRECT
@@ -673,8 +992,11 @@
* @param[in] Y_MASK The y-axis mask vector. If 0, forces BASENAMEn to 0
* @{
*/
-#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
-#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
+#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
+#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
+/** @} */ // end of group LOAD_BLOCK_INDIRECT
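+// Illustrative usage (a sketch; values are hypothetical): with M0=2, each row
+// load is guarded by the matching component of the mask variables, so row 1
+// below is zero-filled instead of being read:
+//
+//   int y0 = 0, y1 = 3;            // per-row y indices
+//   int y_mask0 = 1, y_mask1 = 0;  // load row 0, force row 1 to zero
+//   LOAD_BLOCK_INDIRECT(2, 4, float, a, src_addr, 0, src_stride_y, y, y_mask);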
/** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
* @name LOAD_ELEMENT_n
@@ -784,8 +1106,10 @@
* @param[in] STRIDE_Y The stride in y-axis direction
* @{
*/
-#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
-#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
/** @} */ // end of group LOAD_SCALAR_AS_VECTOR
/** Basic macros to calculate Z offset values from Z0 to Zn-1
@@ -883,8 +1207,10 @@
* @param[in] STRIDE_Y The stride value in y-axis direction
* @{
*/
-#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
-#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+ CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+ CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
/** @} */ // end of group CALCULATE_Z_OFFSET
/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
@@ -895,8 +1221,7 @@
* @param[in] SCALE The scale factor
* @{
*/
-#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
- BASENAME##0 *= (DATA_TYPE)SCALE;
+#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE;
#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
@@ -971,7 +1296,7 @@
* @{
*/
#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
-#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
+#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
/** @} */ // end of group SCALE_BLOCK
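+// Illustrative usage (a sketch): scale a block of four accumulators by a
+// compile-time ALPHA, as the GEMM kernels in this backend do:
+//
+//   SCALE_BLOCK(4, float, c, ALPHA);  // c0 *= (float)ALPHA; ... c3 *= (float)ALPHA;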
/** Create a new vector containing the values at the given index for a set of given vectors
@@ -983,8 +1308,7 @@
* @param[in] TYPE The data type of the destination vectors
* @{
*/
-#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
- TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
+#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 2) \
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
@@ -993,13 +1317,20 @@
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 4) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
-#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
- VEC_DATA_TYPE(TYPE, 8) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
-#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
- VEC_DATA_TYPE(TYPE, 16) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
+#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 8) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
+ (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
+#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
+ (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, \
+ (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, \
+ (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
/** @} */ // end of group COLUMN_VECTORn
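+// Illustrative expansion (a sketch): COLUMN_VECTOR4 gathers element IDX_COL from
+// four row vectors x0..x3 into a single column vector, e.g. for IDX_COL=0:
+//
+//   COLUMN_VECTOR4(0, c, x, float);
+//   // ...declares: float4 c0 = (float4)((x0).s0, (x1).s0, (x2).s0, (x3).s0);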
/** Create a new vector containing the values at the given index. Utility macros for transposing a column-vector
@@ -1011,8 +1342,7 @@
* @param[in] TYPE The data type of the destination vectors
* @{
*/
-#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
- TYPE BASENAME##IDX_COL = (TYPE)((X##0));
+#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0));
#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 2) \
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
@@ -1025,10 +1355,11 @@
#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 8) \
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
-#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
- VEC_DATA_TYPE(TYPE, 16) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
-/** @} */ // end of group COLUMN_VECTORn
+#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \
+ (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
+/** @} */ // end of group COLUMN_VECTOR_SCALARn
/** Create transposed vectors of the given vectors
* @name TRANSPOSE_K0Xn
@@ -1039,8 +1370,7 @@
* @param[in] TYPE The data type of the transposed vectors
* @{
*/
-#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
- COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
+#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \
COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
@@ -1113,8 +1443,7 @@
* @param[in] BIAS The basename of the added variables
* @{
*/
-#define ADD_ROW_1(BASENAME, BIAS) \
- BASENAME##0 += BIAS##0;
+#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0;
#define ADD_ROW_2(BASENAME, BIAS) \
ADD_ROW_1(BASENAME, BIAS) \
@@ -1189,7 +1518,7 @@
* @{
*/
#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
-#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
+#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK
/** Broadcast (add single value) to each element of the destination variables
@@ -1199,8 +1528,7 @@
* @param[in] BIAS The variable containing the value to add
* @{
*/
-#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
- BASENAME##0 += BIAS;
+#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS;
#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
@@ -1261,6 +1589,7 @@
#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
BASENAME##F += BIAS;
+/** @} */ // end of group ADD_ROW_BROADCAST_n
/** Broadcast (add a value) to each element of the destination block (BASENAME)
* @name ADD_BLOCK_BROADCAST
@@ -1273,7 +1602,7 @@
* @{
*/
#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
-#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
+#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK_BROADCAST
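+// Illustrative usage (a sketch): add one broadcast bias vector to every row of a
+// four-row accumulator block, as done for broadcast bias addition in GEMM:
+//
+//   ADD_BLOCK_BROADCAST(4, c, bias0);  // c0 += bias0; ... c3 += bias0;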
/** Apply activation to the given variables
@@ -1363,8 +1692,10 @@
* @param[in] B_VAL Additional value required by the activation
* @{
*/
-#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
-#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
/** @} */ // end of group ACTIVATION_BLOCK
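+// Illustrative usage (a sketch): apply the fused activation to four accumulator
+// vectors; A_VAL and B_VAL are compile-time constants needed by some activations:
+//
+//   ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);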
/** Apply convert_<data_type> to the given variables
@@ -1374,6 +1705,7 @@
* @param[in] DATA_TYPE The data type of the vectors
* @param[in] BASENAME_SRC The basename of the source variables
* @param[in] BASENAME_DST The basename of the destination variables
+ * @{
*/
#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
VEC_DATA_TYPE(DATA_TYPE, N) \
@@ -1465,7 +1797,10 @@
* @param[in] DATA_TYPE The data type of the vectors
* @param[in] BASENAME_SRC The basename of the source variables
* @param[in] BASENAME_DST The basename of the destination variables
+ * @{
*/
-#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
-#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
-/** @} */ // end of group CONVERT_BLOCK
\ No newline at end of file
+#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+/** @} */ // end of group CONVERT_BLOCK
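+// Illustrative usage (a sketch; destination basename is hypothetical): convert a
+// block of four float8 accumulators c0..c3 into half-precision vectors c_h0..c_h3:
+//
+//   CONVERT_BLOCK(4, 8, half, c, c_h);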
diff --git a/src/core/CL/cl_kernels/gemm_v1.cl b/src/core/CL/cl_kernels/gemm_v1.cl
deleted file mode 100644
index a136a1b96b..0000000000
--- a/src/core/CL/cl_kernels/gemm_v1.cl
+++ /dev/null
@@ -1,3243 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-#if defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) && defined(IN1_DIM_X)
-/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
- *
- * @note The number of rows of destination matrix must be passed at compile time using -DM
- * @note The number of columns of the destination matrix must be passed at compile time using -DN
- * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
- * @note The number of columns of the reshaped rhs matrix must be passed at compile time using -DIN1_DIM_X
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha's value needs to be passed at compile time using -DALPHA
- * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int x = get_global_id(0) / H0;
- int y = get_global_id(1) / V0;
- int z = get_global_id(2);
-
- // Offset
- const int offset_row_a = (get_global_id(1) % V0) * 4;
- const int offset_row_b = (get_global_id(0) % H0) * 4;
-
- // src_addr_a = address of matrix A
- // src_addr_b = address of matrix B
- int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
- int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src1_addr_in_bytes += z * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
- __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
-
- // Compute end row address for matrix B
- __global float *src_end_addr_b = src_addr_b + IN1_DIM_X;
-
- src_addr_a += offset_row_a;
- src_addr_b += offset_row_b;
-
- // Reset accumulators
- float4 c0 = 0.0f;
- float4 c1 = 0.0f;
- float4 c2 = 0.0f;
- float4 c3 = 0.0f;
-
- for(; src_addr_b <= (src_end_addr_b - (int)(8 * H0)); src_addr_a += 8 * V0, src_addr_b += 8 * H0)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, src_addr_a);
- float4 b0 = vload4(0, src_addr_b);
-
- c0 += (float4)a0.s0 * b0;
- c1 += (float4)a0.s1 * b0;
- c2 += (float4)a0.s2 * b0;
- c3 += (float4)a0.s3 * b0;
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 4 * V0);
- b0 = vload4(0, src_addr_b + 4 * H0);
-
- c0 += (float4)a0.s0 * b0;
- c1 += (float4)a0.s1 * b0;
- c2 += (float4)a0.s2 * b0;
- c3 += (float4)a0.s3 * b0;
- }
-
- for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 4 * H0)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, src_addr_a);
- float4 b0 = vload4(0, src_addr_b);
-
- c0 += (float4)a0.s0 * b0;
- c1 += (float4)a0.s1 * b0;
- c2 += (float4)a0.s2 * b0;
- c3 += (float4)a0.s3 * b0;
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(4, float, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
-
- LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, float, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(4, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
-
- LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(4, float, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(4, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store 4x4 block
- const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(4, 4, float, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-
-/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
- *
- * @note The number of rows of destination matrix must be passed at compile time using -DM
- * @note The number of columns of the destination matrix must be passed at compile time using -DN
- * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha's value needs to be passed at compile time using -DALPHA
- * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int x = get_global_id(0) / H0;
- int y = get_global_id(1) / V0;
- int z = get_global_id(2);
-
- // Offset
- const int offset_row_a = (get_global_id(1) % V0) * 4;
- const int offset_row_b = (get_global_id(0) % H0) * 4;
-
- // src_addr_a = address of matrix A
- // src_addr_b = address of matrix B
- int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
- int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src1_addr_in_bytes += z * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
- __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
-
- src_addr_a += offset_row_a;
- src_addr_b += offset_row_b;
-
- // Reset accumulators
- float4 c0 = 0.0f;
- float4 c1 = 0.0f;
- float4 c2 = 0.0f;
- float4 c3 = 0.0f;
-
- int i = 0;
- for(; i <= (int)(K - 4); i += 4)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, src_addr_a);
- float4 b0 = vload4(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 4 * H0;
-
- c0.s0 = fma(a0.s0, b0.s0, c0.s0);
- c0.s1 = fma(a0.s0, b0.s1, c0.s1);
- c0.s2 = fma(a0.s0, b0.s2, c0.s2);
- c0.s3 = fma(a0.s0, b0.s3, c0.s3);
-
- c1.s0 = fma(a0.s1, b0.s0, c1.s0);
- c1.s1 = fma(a0.s1, b0.s1, c1.s1);
- c1.s2 = fma(a0.s1, b0.s2, c1.s2);
- c1.s3 = fma(a0.s1, b0.s3, c1.s3);
-
- c2.s0 = fma(a0.s2, b0.s0, c2.s0);
- c2.s1 = fma(a0.s2, b0.s1, c2.s1);
- c2.s2 = fma(a0.s2, b0.s2, c2.s2);
- c2.s3 = fma(a0.s2, b0.s3, c2.s3);
-
- c3.s0 = fma(a0.s3, b0.s0, c3.s0);
- c3.s1 = fma(a0.s3, b0.s1, c3.s1);
- c3.s2 = fma(a0.s3, b0.s2, c3.s2);
- c3.s3 = fma(a0.s3, b0.s3, c3.s3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a);
- b0 = vload4(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 4 * H0;
-
- c0.s0 = fma(a0.s0, b0.s0, c0.s0);
- c0.s1 = fma(a0.s0, b0.s1, c0.s1);
- c0.s2 = fma(a0.s0, b0.s2, c0.s2);
- c0.s3 = fma(a0.s0, b0.s3, c0.s3);
-
- c1.s0 = fma(a0.s1, b0.s0, c1.s0);
- c1.s1 = fma(a0.s1, b0.s1, c1.s1);
- c1.s2 = fma(a0.s1, b0.s2, c1.s2);
- c1.s3 = fma(a0.s1, b0.s3, c1.s3);
-
- c2.s0 = fma(a0.s2, b0.s0, c2.s0);
- c2.s1 = fma(a0.s2, b0.s1, c2.s1);
- c2.s2 = fma(a0.s2, b0.s2, c2.s2);
- c2.s3 = fma(a0.s2, b0.s3, c2.s3);
-
- c3.s0 = fma(a0.s3, b0.s0, c3.s0);
- c3.s1 = fma(a0.s3, b0.s1, c3.s1);
- c3.s2 = fma(a0.s3, b0.s2, c3.s2);
- c3.s3 = fma(a0.s3, b0.s3, c3.s3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a);
- b0 = vload4(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 4 * H0;
-
- c0.s0 = fma(a0.s0, b0.s0, c0.s0);
- c0.s1 = fma(a0.s0, b0.s1, c0.s1);
- c0.s2 = fma(a0.s0, b0.s2, c0.s2);
- c0.s3 = fma(a0.s0, b0.s3, c0.s3);
-
- c1.s0 = fma(a0.s1, b0.s0, c1.s0);
- c1.s1 = fma(a0.s1, b0.s1, c1.s1);
- c1.s2 = fma(a0.s1, b0.s2, c1.s2);
- c1.s3 = fma(a0.s1, b0.s3, c1.s3);
-
- c2.s0 = fma(a0.s2, b0.s0, c2.s0);
- c2.s1 = fma(a0.s2, b0.s1, c2.s1);
- c2.s2 = fma(a0.s2, b0.s2, c2.s2);
- c2.s3 = fma(a0.s2, b0.s3, c2.s3);
-
- c3.s0 = fma(a0.s3, b0.s0, c3.s0);
- c3.s1 = fma(a0.s3, b0.s1, c3.s1);
- c3.s2 = fma(a0.s3, b0.s2, c3.s2);
- c3.s3 = fma(a0.s3, b0.s3, c3.s3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a);
- b0 = vload4(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 4 * H0;
-
- c0.s0 = fma(a0.s0, b0.s0, c0.s0);
- c0.s1 = fma(a0.s0, b0.s1, c0.s1);
- c0.s2 = fma(a0.s0, b0.s2, c0.s2);
- c0.s3 = fma(a0.s0, b0.s3, c0.s3);
-
- c1.s0 = fma(a0.s1, b0.s0, c1.s0);
- c1.s1 = fma(a0.s1, b0.s1, c1.s1);
- c1.s2 = fma(a0.s1, b0.s2, c1.s2);
- c1.s3 = fma(a0.s1, b0.s3, c1.s3);
-
- c2.s0 = fma(a0.s2, b0.s0, c2.s0);
- c2.s1 = fma(a0.s2, b0.s1, c2.s1);
- c2.s2 = fma(a0.s2, b0.s2, c2.s2);
- c2.s3 = fma(a0.s2, b0.s3, c2.s3);
-
- c3.s0 = fma(a0.s3, b0.s0, c3.s0);
- c3.s1 = fma(a0.s3, b0.s1, c3.s1);
- c3.s2 = fma(a0.s3, b0.s2, c3.s2);
- c3.s3 = fma(a0.s3, b0.s3, c3.s3);
- }
-
- for(; i < (int)K; ++i)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, src_addr_a);
- float4 b0 = vload4(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 4 * H0;
-
- c0.s0 = fma(a0.s0, b0.s0, c0.s0);
- c0.s1 = fma(a0.s0, b0.s1, c0.s1);
- c0.s2 = fma(a0.s0, b0.s2, c0.s2);
- c0.s3 = fma(a0.s0, b0.s3, c0.s3);
-
- c1.s0 = fma(a0.s1, b0.s0, c1.s0);
- c1.s1 = fma(a0.s1, b0.s1, c1.s1);
- c1.s2 = fma(a0.s1, b0.s2, c1.s2);
- c1.s3 = fma(a0.s1, b0.s3, c1.s3);
-
- c2.s0 = fma(a0.s2, b0.s0, c2.s0);
- c2.s1 = fma(a0.s2, b0.s1, c2.s1);
- c2.s2 = fma(a0.s2, b0.s2, c2.s2);
- c2.s3 = fma(a0.s2, b0.s3, c2.s3);
-
- c3.s0 = fma(a0.s3, b0.s0, c3.s0);
- c3.s1 = fma(a0.s3, b0.s1, c3.s1);
- c3.s2 = fma(a0.s3, b0.s2, c3.s2);
- c3.s3 = fma(a0.s3, b0.s3, c3.s3);
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated by dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(4, float, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
-
- LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, float, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(4, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
-
- LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(4, float, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(4, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store 4x4 block
- const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(4, 4, float, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
- *
- * @note The number of rows of destination matrix must be passed at compile time using -DM
- * @note The number of columns of the destination matrix must be passed at compile time using -DN
- * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
- * @note The number of columns of the reshaped rhs matrix must be passed at compile time using -DIN1_DIM_X
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha's value needs to be passed at compile time using -DALPHA
- * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note In case matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data types: same as @p src0_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
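- // A minimal sketch of the build options this kernel expects; the values below are
- // hypothetical and only illustrate the format (the real values are chosen by the
- // host-side kernel configuration):
- //   -DM=64 -DN=128 -DK=64 -DIN1_DIM_X=1024 -DH0=2 -DV0=2
- //   -DPARTIAL_STORE_M0=1 -DPARTIAL_STORE_N0=1
-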
- int x = get_global_id(0) / H0;
- int y = get_global_id(1) / V0;
- int z = get_global_id(2);
-
- // Offset
- const int offset_row_a = (get_global_id(1) % V0) * 4;
- const int offset_row_b = (get_global_id(0) % H0) * 8;
-
- // src_addr_a = address of matrix A
- // src_addr_b = address of matrix B
- int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
- int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src1_addr_in_bytes += z * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
- __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
-
- // Compute end row address for matrix B
- __global half *src_end_addr_b = src_addr_b + IN1_DIM_X;
-
- src_addr_a += offset_row_a;
- src_addr_b += offset_row_b;
-
- // Reset accumulators
- half8 c0 = 0.0f;
- half8 c1 = 0.0f;
- half8 c2 = 0.0f;
- half8 c3 = 0.0f;
-
- for(; src_addr_b <= (src_end_addr_b - (int)(16 * H0)); src_addr_a += 8 * V0, src_addr_b += 16 * H0)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, src_addr_a);
- half8 b0 = vload8(0, src_addr_b);
-
- c0 += (half8)a0.s0 * b0;
- c1 += (half8)a0.s1 * b0;
- c2 += (half8)a0.s2 * b0;
- c3 += (half8)a0.s3 * b0;
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 4 * V0);
- b0 = vload8(0, src_addr_b + 8 * H0);
-
- c0 += (half8)a0.s0 * b0;
- c1 += (half8)a0.s1 * b0;
- c2 += (half8)a0.s2 * b0;
- c3 += (half8)a0.s3 * b0;
- }
-
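- // Tail loop: the main loop above consumes two 8-element rows of the reshaped
- // matrix B per iteration (a step of 16 * H0 halfs), so when IN1_DIM_X is not a
- // multiple of that step the remaining rows are processed here one at a time.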
- for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 8 * H0)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, src_addr_a);
- half8 b0 = vload8(0, src_addr_b);
-
- c0 += (half8)a0.s0 * b0;
- c1 += (half8)a0.s1 * b0;
- c2 += (half8)a0.s2 * b0;
- c3 += (half8)a0.s3 * b0;
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated by dividing the output row index (get_global_id(1) * 4) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(4, half, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
-
- LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, half, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(4, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
-
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
-
- LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(4, half, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(4, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store 4x8 block
- const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-
-/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in 32-bit floating point variables.
- *
- * @note The number of rows of the destination matrix must be passed at compile time using -DM
- * @note The number of columns of the destination matrix must be passed at compile time using -DN
- * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
- * @note The number of columns of the reshaped rhs matrix must be passed at compile time using -DIN1_DIM_X
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note In case matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data types: same as @p src0_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int x = get_global_id(0) / H0;
- int y = get_global_id(1) / V0;
- int z = get_global_id(2);
-
- // Offset
- const int offset_row_a = (get_global_id(1) % V0) * 4;
- const int offset_row_b = (get_global_id(0) % H0) * 8;
-
- // src_addr_a = address of matrix A
- // src_addr_b = address of matrix B
- int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
- int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src1_addr_in_bytes += z * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
- __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
-
- // Compute end row address for matrix B
- __global half *src_end_addr_b = src_addr_b + IN1_DIM_X;
-
- src_addr_a += offset_row_a;
- src_addr_b += offset_row_b;
-
- // Reset accumulators
- float8 c0 = 0.0f;
- float8 c1 = 0.0f;
- float8 c2 = 0.0f;
- float8 c3 = 0.0f;
-
- for(; src_addr_b <= (src_end_addr_b - (int)(16 * H0)); src_addr_a += 8 * V0, src_addr_b += 16 * H0)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = convert_float4(vload4(0, src_addr_a));
- float8 b0 = convert_float8(vload8(0, src_addr_b));
-
- c0 += (float8)a0.s0 * b0;
- c1 += (float8)a0.s1 * b0;
- c2 += (float8)a0.s2 * b0;
- c3 += (float8)a0.s3 * b0;
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = convert_float4(vload4(0, src_addr_a + 4 * V0));
- b0 = convert_float8(vload8(0, src_addr_b + 8 * H0));
-
- c0 += (float8)a0.s0 * b0;
- c1 += (float8)a0.s1 * b0;
- c2 += (float8)a0.s2 * b0;
- c3 += (float8)a0.s3 * b0;
- }
-
- for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 8 * H0)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = convert_float4(vload4(0, src_addr_a));
- float8 b0 = convert_float8(vload8(0, src_addr_b));
-
- c0 += (float8)a0.s0 * b0;
- c1 += (float8)a0.s1 * b0;
- c2 += (float8)a0.s2 * b0;
- c3 += (float8)a0.s3 * b0;
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated by dividing the output row index (get_global_id(1) * 4) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(4, float, c, ALPHA);
-#endif // defined(ALPHA)
-
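- // Add beta*bias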
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
-
- LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
- float8 bias_f0 = convert_float8(bias0);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, float, bias_f, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(4, c, bias_f0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
-
- LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
- float8 bias_f0 = convert_float8(bias0);
- float8 bias_f1 = convert_float8(bias1);
- float8 bias_f2 = convert_float8(bias2);
- float8 bias_f3 = convert_float8(bias3);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(4, float, bias_f, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(4, c, bias_f);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
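- // The accumulation ran in fp32 for accuracy (this is the _acc32 variant of the
- // kernel); convert the accumulators back to fp16 before the optional activation
- // and the final store.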
- half8 c_h0 = convert_half8(c0);
- half8 c_h1 = convert_half8(c1);
- half8 c_h2 = convert_half8(c2);
- half8 c_h3 = convert_half8(c3);
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c_h, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store 4x8 block
- const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c_h, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-
-/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
- *
- * @note The number of rows of the destination matrix must be passed at compile time using -DM
- * @note The number of columns of the destination matrix must be passed at compile time using -DN
- * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note In case matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data types: same as @p src0_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int x = get_global_id(0) / H0;
- int y = get_global_id(1) / V0;
- int z = get_global_id(2);
-
- // Offset
- const int offset_row_a = (get_global_id(1) % V0) * 4;
- const int offset_row_b = (get_global_id(0) % H0) * 8;
-
- // src_addr_a = address of matrix A
- // src_addr_b = address of matrix B
- int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
- int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src1_addr_in_bytes += z * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
- __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
-
- src_addr_a += offset_row_a;
- src_addr_b += offset_row_b;
-
- // Reset accumulators
- half8 c0 = 0.0f;
- half8 c1 = 0.0f;
- half8 c2 = 0.0f;
- half8 c3 = 0.0f;
-
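- // The K loop below is unrolled by a factor of 4. When V0 == 1, the 4-element
- // groups of the interleaved matrix A belonging to consecutive K iterations are
- // adjacent in memory, so a single 8-element load covers two K iterations;
- // otherwise A is reloaded 4 halfs at a time with a stride of 4 * V0.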
- int i = 0;
- for(; i <= (int)(K - 4); i += 4)
- {
-#if V0 == 1
- // Load values from matrix A (interleaved) and matrix B (transposed)
- half8 a0 = vload8(0, src_addr_a);
- half8 b0 = vload8(0, src_addr_b);
-
- src_addr_a += 8 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
-
- // Load values from matrix B (transposed)
- b0 = vload8(0, src_addr_b);
-
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s4, b0, c0);
- c1 = fma((half8)a0.s5, b0, c1);
- c2 = fma((half8)a0.s6, b0, c2);
- c3 = fma((half8)a0.s7, b0, c3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload8(0, src_addr_a);
- b0 = vload8(0, src_addr_b);
-
- src_addr_a += 8 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
-
- // Load values from matrix B (transposed)
- b0 = vload8(0, src_addr_b);
-
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s4, b0, c0);
- c1 = fma((half8)a0.s5, b0, c1);
- c2 = fma((half8)a0.s6, b0, c2);
- c3 = fma((half8)a0.s7, b0, c3);
-#else // V0 == 1
- // Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, src_addr_a);
- half8 b0 = vload8(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a);
- b0 = vload8(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a);
- b0 = vload8(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a);
- b0 = vload8(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
-#endif // V0 == 1
- }
-
- for(; i < (int)K; ++i)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, src_addr_a);
- half8 b0 = vload8(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated by dividing the output row index (get_global_id(1) * 4) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(4, half, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
-
- LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, half, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(4, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
-
- LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(4, half, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(4, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store 4x8 block
- const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-
-#endif // defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) && defined(IN1_DIM_X)
-
-#if defined(N) && defined(K) && defined(M0) && defined(N0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-#if defined(DATA_TYPE)
-#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, N0)
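-// For example, with -DDATA_TYPE=float and -DN0=4, VECTOR_TYPE expands to float4.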
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
- *
- * @note This OpenCL kernel works with floating point data types (F16/F32)
- * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0
- * @note The number of columns of matrix A and the number of columns of matrix B must be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data types: same as @p src0_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint src_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int idx = get_global_id(0) * N0;
-
- // Compute starting address for matrix A and Matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Update address for the matrix A
- src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
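- // COMPUTE_M0_START_ROW maps the work-item id to the first row of its M0-row
- // block. When M is not a multiple of M0, every block except the first is shifted
- // up so that the last block ends exactly at the last row, leaving the first
- // block (get_global_id(1) == 0) as the only one that stores a partial number of
- // rows (PARTIAL_STORE_M0).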
-
- // Update address for the matrix B
- src_addr.s1 += idx * sizeof(DATA_TYPE);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zin) is calculated by dividing the row index by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zin = min(DEPTH_GEMM3D - 1, zin);
-
- // Add offset due to the cross plane paddings
- zin *= (src_cross_plane_pad * src0_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src0_stride_z by DEPTH_GEMM3D
- src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- int end_row_vec_a = src_addr.s0 + (K * sizeof(DATA_TYPE));
-
- VECTOR_TYPE acc0 = 0.0f;
-#if M0 > 1
- VECTOR_TYPE acc1 = 0.0f;
-#endif // M0 > 1
-#if M0 > 2
- VECTOR_TYPE acc2 = 0.0f;
-#endif // M0 > 2
-#if M0 > 3
- VECTOR_TYPE acc3 = 0.0f;
-#endif // M0 > 3
-
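- // Main K loop, unrolled by 2: each iteration consumes two columns of matrix A
- // and the two matching rows of matrix B, hence the (2 * sizeof(DATA_TYPE),
- // 2 * src1_stride_y) address step. The loop that follows handles an odd K.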
- for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- LOAD_BLOCK(M0, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- VECTOR_TYPE b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
- VECTOR_TYPE b1 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));
-
- // Accumulate
- acc0 += b0 * (VECTOR_TYPE)a0.s0;
- acc0 += b1 * (VECTOR_TYPE)a0.s1;
-#if M0 > 1
- acc1 += b0 * (VECTOR_TYPE)a1.s0;
- acc1 += b1 * (VECTOR_TYPE)a1.s1;
-#endif // M0 > 1
-#if M0 > 2
- acc2 += b0 * (VECTOR_TYPE)a2.s0;
- acc2 += b1 * (VECTOR_TYPE)a2.s1;
-#endif // M0 > 2
-#if M0 > 3
- acc3 += b0 * (VECTOR_TYPE)a3.s0;
- acc3 += b1 * (VECTOR_TYPE)a3.s1;
-#endif // M0 > 3
- }
-
- for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if M0 > 1
- DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // M0 > 1
-#if M0 > 2
- DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // M0 > 2
-#if M0 > 3
- DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // M0 > 3
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- VECTOR_TYPE b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
-
- // Accumulate
- acc0 += b0 * (VECTOR_TYPE)a0;
-#if M0 > 1
- acc1 += b0 * (VECTOR_TYPE)a1;
-#endif // M0 > 1
-#if M0 > 2
- acc2 += b0 * (VECTOR_TYPE)a2;
-#endif // M0 > 2
-#if M0 > 3
- acc3 += b0 * (VECTOR_TYPE)a3;
-#endif // M0 > 3
- }
-
- int z = get_global_id(2);
-
- // Compute dst address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated by dividing the row index by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (dst_cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, acc, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, acc, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src2_stride_y) + z * src2_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(M0, acc, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
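- // Because COMPUTE_M0_START_ROW shifts every block except the first up to a full
- // M0 boundary, only the block at get_global_id(1) == 0 can store fewer than M0
- // rows, hence the cond_y condition below.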
- // Store output block
- const bool cond_y = get_global_id(1) == 0;
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-#endif // defined(DATA_TYPE)
-
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
- *
- * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * @note This kernel processes a fixed number of elements along x: -DN0=4.
- * @note The number of columns of matrix A and the number of columns of matrix B must be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case matrix B has 3 dimensions and matrix A has more than 3, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in order to avoid out-of-bounds reads
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data types: same as @p src0_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint src_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int idx = get_global_id(0) * N0;
-
- // Compute starting address for matrix A and matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Update address for matrix A
- src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
-
- // Update address for matrix B
- src_addr.s1 += idx * sizeof(float);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zin) is calculated by dividing the row index by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zin = min(DEPTH_GEMM3D - 1, zin);
-
- // Add offset due to the cross plane paddings
- zin *= (src_cross_plane_pad * src0_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src0_stride_z by DEPTH_GEMM3D
- src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Initialize accumulators
- float4 acc0 = 0.0f;
-
-#if M0 > 1
- float4 acc1 = 0.0f;
-#endif // M0 > 1
-
-#if M0 > 2
- float4 acc2 = 0.0f;
-#endif // M0 > 2
-
-#if M0 > 3
- float4 acc3 = 0.0f;
-#endif // M0 > 3
-
- // A and B src indices get incremented at the same time.
- int i = 0;
- for(; i <= ((int)K - 4); i += 4)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A and matrix B
- LOAD_BLOCK(M0, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A and matrix B
- float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
- acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
- acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);
- acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);
-
-#if M0 > 1
-
- acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);
- acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);
- acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);
- acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);
-
-#endif // M0 > 1
-#if M0 > 2
-
- acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);
- acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);
- acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);
- acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);
-
-#endif // M0 > 2
-#if M0 > 3
-
- acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);
- acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);
- acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);
- acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);
-#endif // M0 > 3
-
- // Load values from matrix A and matrix B
- b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);
- acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);
- acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);
- acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);
-
-#if M0 > 1
-
- acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);
- acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);
- acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);
- acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);
-
-#endif // M0 > 1
-#if M0 > 2
-
- acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);
- acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);
- acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);
- acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);
-
-#endif // M0 > 2
-#if M0 > 3
-
- acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);
- acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);
- acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);
- acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);
-#endif // M0 > 3
-
- // Load values from matrix A and matrix B
- b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);
- acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);
- acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);
- acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);
-
-#if M0 > 1
-
- acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);
- acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);
- acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);
- acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);
-
-#endif // M0 > 1
-#if M0 > 2
-
- acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);
- acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);
- acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);
- acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);
-
-#endif // M0 > 2
-#if M0 > 3
-
- acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);
- acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);
- acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);
- acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);
-#endif // M0 > 3
-
- // Load values from matrix A and matrix B
- b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);
- acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);
- acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);
- acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);
-
-#if M0 > 1
-
- acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);
- acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);
- acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);
- acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);
-
-#endif // M0 > 1
-#if M0 > 2
-
- acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);
- acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);
- acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);
- acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);
-
-#endif // M0 > 2
-#if M0 > 3
-
- acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);
- acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);
- acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);
- acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);
-#endif // M0 > 3
-
- src_addr.s0 += 4 * sizeof(float);
- }
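-
- // Note: each iteration of the unrolled loop above performs four rank-1 updates, one per K step:
- // acc<m>.s<n> += a<m>.s<k> * b0.s<n> for m in [0, M0) and n in [0, 4), so after the loop
- // acc(m, n) holds the partial sum over k of A(m, k) * B(k, n) for the K values consumed so far.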
-
- for(; i < (int)K; ++i)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if M0 > 1
- float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // M0 > 1
-#if M0 > 2
- float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // M0 > 2
-#if M0 > 3
- float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // M0 > 3
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0, b0.s0, acc0.s0);
- acc0.s1 = fma(a0, b0.s1, acc0.s1);
- acc0.s2 = fma(a0, b0.s2, acc0.s2);
- acc0.s3 = fma(a0, b0.s3, acc0.s3);
-#if M0 > 1
- acc1.s0 = fma(a1, b0.s0, acc1.s0);
- acc1.s1 = fma(a1, b0.s1, acc1.s1);
- acc1.s2 = fma(a1, b0.s2, acc1.s2);
- acc1.s3 = fma(a1, b0.s3, acc1.s3);
-#endif // M0 > 1
-#if M0 > 2
- acc2.s0 = fma(a2, b0.s0, acc2.s0);
- acc2.s1 = fma(a2, b0.s1, acc2.s1);
- acc2.s2 = fma(a2, b0.s2, acc2.s2);
- acc2.s3 = fma(a2, b0.s3, acc2.s3);
-#endif // M0 > 2
-#if M0 > 3
- acc3.s0 = fma(a3, b0.s0, acc3.s0);
- acc3.s1 = fma(a3, b0.s1, acc3.s1);
- acc3.s2 = fma(a3, b0.s2, acc3.s2);
- acc3.s3 = fma(a3, b0.s3, acc3.s3);
-#endif // M0 > 3
-
- src_addr.s0 += sizeof(float);
- }
-
- int z = get_global_id(2);
-
- // Compute dst address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
- PARTIAL_STORE_M0)
- * dst_stride_y);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (dst_cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, float, acc, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
-
- LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, float, bias, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, acc, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
- PARTIAL_STORE_M0)
- * src2_stride_y)
- + z * src2_stride_z;
-
- LOAD_BLOCK(M0, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, float, bias, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias
- ADD_BLOCK(M0, acc, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store the output block
- const bool cond_y = get_global_id(1) == 0;
- const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
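- // Illustrative boundary example: with N = 10 and 4 output columns per work-item,
- // PARTIAL_STORE_N0 = 10 % 4 = 2; work-items 0 and 1 store full float4 rows, while work-item 2
- // (columns 8..11) trips cond_x and stores only the 2 in-bounds columns. cond_y marks the first
- // row tile, which handles the partial M edge when M is not a multiple of M0.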
- STORE_BLOCK_BOUNDARY_AWARE(M0, 4, float, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
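-
-// Illustrative host-side build options for the kernel above (example values only; the real
-// options are assembled by the library's host code from the tensor shapes):
-//   -DM0=4 -DN0=4 -DK=64 -DN=130 -DPARTIAL_STORE_M0=2 -DPARTIAL_STORE_N0=2 -DALPHA=0.5f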
-
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
- *
- * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
- * This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * @note This kernel processes a fixed number of elements along x: -DN0=2.
- * @note The number of columns of matrix A and the number of columns of matrix B must be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr                                Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint src_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Processes N0 = 2 columns per work-item: C is kept in a vect2, A is read with vector loads and B with pairs of vload2 (TODO: fix for M0 > 1)
- int idx = get_global_id(0) * N0;
-
- // Compute starting address for matrix A and Matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Update address for the matrix A
- src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
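- // (COMPUTE_M0_START_ROW, from gemm_helpers.h, shifts every row tile after the first back so that
- // only the first tile is partial. Illustrative example: M=5, M0=4, PARTIAL_STORE_M0=1 -> tile
- // y=0 starts at row 0 and stores 1 row, tile y=1 starts at row 1 and stores the full 4 rows.)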
-
- // Update address for the matrix B
- src_addr.s1 += idx * sizeof(float);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zin = min(DEPTH_GEMM3D - 1, zin);
-
- // Add offset due to the cross plane paddings
- zin *= (src_cross_plane_pad * src0_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src0_stride_z by DEPTH_GEMM3D
- src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
- src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Initialize accumulators
- float2 acc0 = 0.0f;
-#if M0 > 1
- float2 acc1 = 0.0f;
-#endif // M0 > 1
-#if M0 > 2
- float2 acc2 = 0.0f;
-#endif // M0 > 2
-#if M0 > 3
- float2 acc3 = 0.0f;
-#endif // M0 > 3
-
- // A and B src indices get incremented at the same time.
- int i = 0;
- for(; i <= ((int)K - 8); i += 8)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
- acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);
- acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);
- acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);
- acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);
- acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);
- acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);
- acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);
-
- acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
- acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);
- acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);
- acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);
- acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);
- acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);
- acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);
- acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);
-
-#if M0 > 1
-#if defined(REINTERPRET_INPUT_AS_3D)
- a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#else // defined(REINTERPRET_INPUT_AS_3D)
- a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // defined(REINTERPRET_INPUT_AS_3D)
- acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);
- acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);
- acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);
- acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);
- acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);
- acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);
- acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);
- acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);
-
- acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);
- acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);
- acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);
- acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);
- acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);
- acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);
- acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);
- acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);
-#endif // M0 > 1
-#if M0 > 2
-#if defined(REINTERPRET_INPUT_AS_3D)
- a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#else // defined(REINTERPRET_INPUT_AS_3D)
- a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // defined(REINTERPRET_INPUT_AS_3D)
- acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);
- acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);
- acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);
- acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);
- acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);
- acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);
- acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);
- acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);
-
- acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);
- acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);
- acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);
- acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);
- acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);
- acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);
- acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);
- acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);
-#endif // M0 > 2
-#if M0 > 3
-#if defined(REINTERPRET_INPUT_AS_3D)
- a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#else // defined(REINTERPRET_INPUT_AS_3D)
- a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // defined(REINTERPRET_INPUT_AS_3D)
- acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);
- acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);
- acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);
- acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);
- acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);
- acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);
- acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);
- acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);
-
- acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);
- acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);
- acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);
- acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);
- acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);
- acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);
- acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);
- acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);
-#endif // M0 > 3
-
- src_addr.s0 += sizeof(float) * 8;
- }
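- // Each iteration above consumes 8 K steps at once: one float8 row chunk of A is multiplied
- // against 8 consecutive two-wide rows of B, so acc<m>.s0 and acc<m>.s1 accumulate the partial
- // dot products of A row m with B columns 0 and 1 of this work-item's N0=2 column slice.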
- // Left-over loop: process the remaining K values one float at a time
- for(; i < (int)K; ++i)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if M0 > 1
- float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // M0 > 1
-#if M0 > 2
- float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // M0 > 2
-#if M0 > 3
- float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // M0 > 3
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0, b0.s0, acc0.s0);
- acc0.s1 = fma(a0, b0.s1, acc0.s1);
-#if M0 > 1
- acc1.s0 = fma(a1, b0.s0, acc1.s0);
- acc1.s1 = fma(a1, b0.s1, acc1.s1);
-#endif // M0 > 1
-#if M0 > 2
- acc2.s0 = fma(a2, b0.s0, acc2.s0);
- acc2.s1 = fma(a2, b0.s1, acc2.s1);
-#endif // M0 > 2
-#if M0 > 3
- acc3.s0 = fma(a3, b0.s0, acc3.s0);
- acc3.s1 = fma(a3, b0.s1, acc3.s1);
-#endif // M0 > 3
-
- src_addr.s0 += sizeof(float);
- }
-
- int z = get_global_id(2);
-
- // Compute dst address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
- PARTIAL_STORE_M0)
- * dst_stride_y);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (dst_cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, float, acc, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));
-
- LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, float, bias, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, acc, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
- PARTIAL_STORE_M0)
- * src2_stride_y)
- + z * src2_stride_z;
-
- LOAD_BLOCK(M0, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, float, bias, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias
- ADD_BLOCK(M0, acc, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store the output block
- const bool cond_y = get_global_id(1) == 0;
- const bool cond_x = ((get_global_id(0) + 1) * 2 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(M0, 2, float, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
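-
-// Illustrative build options for this N0=2 variant (example values only), matching the "number of
-// matrix B columns less than or equal to 1000" case described in the kernel documentation:
-//   -DM0=4 -DN0=2 -DK=64 -DN=999 -DPARTIAL_STORE_M0=1 -DPARTIAL_STORE_N0=1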
-
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
- *
- * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in 32-bit floating point variables.
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * @note This kernel processes a fixed number of elements along x: -DN0=8.
- * @note The number of columns of matrix A and the number of columns of matrix B must be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr                                Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint src_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int idx = get_global_id(0) * N0;
-
- // Compute starting address for matrix A and Matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Update address for the matrix A
- src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
-
- // Update address for the matrix B
- src_addr.s1 += idx * sizeof(half);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zin = min(DEPTH_GEMM3D - 1, zin);
-
- // Add offset due to the cross plane paddings
- zin *= (src_cross_plane_pad * src0_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src0_stride_z by DEPTH_GEMM3D
- src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
- src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- float8 acc0 = 0.0f;
-#if M0 > 1
- float8 acc1 = 0.0f;
-#endif // M0 > 1
-#if M0 > 2
- float8 acc2 = 0.0f;
-#endif // M0 > 2
-#if M0 > 3
- float8 acc3 = 0.0f;
-#endif // M0 > 3
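- // Note: accumulating in float8 rather than half8 trades extra convert_float8/convert_half8
- // work for precision; a half has an 11-bit significand, so long reductions over K can lose
- // low-order bits that the 24-bit float significand preserves. The plain f16 variant below is
- // the faster, lower-precision alternative.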
-
- int i = 0;
- for(; i <= ((int)K - 4); i += 4)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- LOAD_BLOCK(M0, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
- src_addr.s1 += src1_stride_y;
-
- // Accumulate
- acc0 = fma(b0, (float8)a0.s0, acc0);
-#if M0 > 1
- acc1 = fma(b0, (float8)a1.s0, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (float8)a2.s0, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (float8)a3.s0, acc3);
-#endif // M0 > 3
-
- b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
- src_addr.s1 += src1_stride_y;
- acc0 = fma(b0, (float8)a0.s1, acc0);
-#if M0 > 1
- acc1 = fma(b0, (float8)a1.s1, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (float8)a2.s1, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (float8)a3.s1, acc3);
-#endif // M0 > 3
-
- b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
- src_addr.s1 += src1_stride_y;
- acc0 = fma(b0, (float8)a0.s2, acc0);
-#if M0 > 1
- acc1 = fma(b0, (float8)a1.s2, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (float8)a2.s2, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (float8)a3.s2, acc3);
-#endif // M0 > 3
-
- b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
- src_addr.s1 += src1_stride_y;
- acc0 = fma(b0, (float8)a0.s3, acc0);
-#if M0 > 1
- acc1 = fma(b0, (float8)a1.s3, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (float8)a2.s3, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (float8)a3.s3, acc3);
-#endif // M0 > 3
-
- src_addr.s0 += 4 * sizeof(half);
- }
-
- for(; i < (int)K; ++i)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if M0 > 1
- half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // M0 > 1
-#if M0 > 2
- half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // M0 > 2
-#if M0 > 3
- half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // M0 > 3
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
-
- src_addr += (int2)(sizeof(half), src1_stride_y);
-
- // Accumulate
- acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;
-#if M0 > 1
- acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;
-#endif // M0 > 3
- }
-
- int z = get_global_id(2);
-
- // Compute dst address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (dst_cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, float, acc, ALPHA);
-#endif // defined(ALPHA)
-
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
-
- LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
- float8 bias_f0 = convert_float8(bias0);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, float, bias_f, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, acc, bias_f0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
- PARTIAL_STORE_M0)
- * src2_stride_y)
- + z * src2_stride_z;
-
- LOAD_BLOCK(M0, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
- float8 bias_f0 = convert_float8(bias0);
-#if M0 > 1
- float8 bias_f1 = convert_float8(bias1);
-#endif // M0 > 1
-#if M0 > 2
- float8 bias_f2 = convert_float8(bias2);
-#endif // M0 > 2
-#if M0 > 3
- float8 bias_f3 = convert_float8(bias3);
-#endif // M0 > 3
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, float, bias_f, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias
- ADD_BLOCK(M0, acc, bias_f);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- half8 acc_h0 = convert_half8(acc0);
-#if M0 > 1
- half8 acc_h1 = convert_half8(acc1);
-#endif // M0 > 1
-#if M0 > 2
- half8 acc_h2 = convert_half8(acc2);
-#endif // M0 > 2
-#if M0 > 3
- half8 acc_h3 = convert_half8(acc3);
-#endif // M0 > 3
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, half, VEC_SIZE, acc_h, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store the output block
- const bool cond_y = get_global_id(1) == 0;
- const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(M0, 8, half, acc_h, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
- *
- * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * @note This kernel processes a fixed number of elements along x: -DN0=8.
- * @note The number of columns of matrix A and the number of columns of matrix B must be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr                                Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint src_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int idx = get_global_id(0) * N0;
-
- // Compute starting address for matrix A and Matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Update address for the matrix A
- src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
-
- // Update address for the matrix B
- src_addr.s1 += idx * sizeof(half);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zin = min(DEPTH_GEMM3D - 1, zin);
-
- // Add offset due to the cross plane paddings
- zin *= (src_cross_plane_pad * src0_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src0_stride_z by DEPTH_GEMM3D
- src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
- src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- half8 acc0 = 0.0h;
-#if M0 > 1
- half8 acc1 = 0.0h;
-#endif // M0 > 1
-#if M0 > 2
- half8 acc2 = 0.0h;
-#endif // M0 > 2
-#if M0 > 3
- half8 acc3 = 0.0h;
-#endif // M0 > 3
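- // Unlike gemm_mm_floating_point_f16_bifrost_acc32 above, this variant accumulates directly in
- // half8: no per-iteration float conversions, at the cost of fp16 rounding in the K reduction.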
-
- int i = 0;
- for(; i <= ((int)K - 4); i += 4)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- LOAD_BLOCK(M0, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Accumulate
- acc0 = fma(b0, (half8)a0.s0, acc0);
-#if M0 > 1
- acc1 = fma(b0, (half8)a1.s0, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (half8)a2.s0, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (half8)a3.s0, acc3);
-#endif // M0 > 3
-
- b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- acc0 = fma(b0, (half8)a0.s1, acc0);
-#if M0 > 1
- acc1 = fma(b0, (half8)a1.s1, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (half8)a2.s1, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (half8)a3.s1, acc3);
-#endif // M0 > 3
-
- b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- acc0 = fma(b0, (half8)a0.s2, acc0);
-#if M0 > 1
- acc1 = fma(b0, (half8)a1.s2, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (half8)a2.s2, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (half8)a3.s2, acc3);
-#endif // M0 > 3
-
- b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- acc0 = fma(b0, (half8)a0.s3, acc0);
-#if M0 > 1
- acc1 = fma(b0, (half8)a1.s3, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (half8)a2.s3, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (half8)a3.s3, acc3);
-#endif // M0 > 3
-
- src_addr.s0 += 4 * sizeof(half);
- }
-
- for(; i < (int)K; ++i)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if M0 > 1
- half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // M0 > 1
-#if M0 > 2
- half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // M0 > 2
-#if M0 > 3
- half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // M0 > 3
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
-
- src_addr += (int2)(sizeof(half), src1_stride_y);
-
- // Accumulate
- acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;
-#if M0 > 1
- acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;
-#endif // M0 > 3
- }
-
- int z = get_global_id(2);
-
- // Compute dst address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (dst_cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, half, acc, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
-
- LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, half, bias, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, acc, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
- PARTIAL_STORE_M0)
- * src2_stride_y)
- + z * src2_stride_z;
-
- LOAD_BLOCK(M0, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, half, bias, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias
- ADD_BLOCK(M0, acc, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, half, VEC_SIZE, acc, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store the output block
- const bool cond_y = get_global_id(1) == 0;
- const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(M0, 8, half, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-
-#endif // defined(N) && defined(K) && defined(M0) && defined(N0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
deleted file mode 100644
index d3eba89e76..0000000000
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ /dev/null
@@ -1,2316 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "gemm_helpers.h"
-#include "helpers_asymm.h"
-#include "repeat.h"
-
-#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
-#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-
-#define ARM_DOT1(a, b, c) \
- ({ \
- ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \
- })
-#define ARM_DOT2(a, b, c) \
- ({ \
- ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \
- })
-#define ARM_DOT3(a, b, c) \
- ({ \
- ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \
- })
-#define ARM_DOT4(a, b, c) \
- ({ \
- ARM_DOT(a, b, c); \
- })
-#define ARM_DOT8(a, b, c) \
- ({ \
- ARM_DOT4((a.lo), (b.lo), c); \
- ARM_DOT4((a.hi), (b.hi), c); \
- })
-#define ARM_DOT16(a, b, c) \
- ({ \
- ARM_DOT8((a.lo), (b.lo), c); \
- ARM_DOT8((a.hi), (b.hi), c); \
- })
-
-#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-
-/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16] without using the dot8 instruction. */
-#define ARM_DOT1(a, b, c) \
- ({ \
- c += (ACC_DATA_TYPE)a * b; \
- })
-#define ARM_DOT2(a, b, c) \
- ({ \
- c += (ACC_DATA_TYPE)a.s0 * b.s0; \
- c += (ACC_DATA_TYPE)a.s1 * b.s1; \
- })
-#define ARM_DOT3(a, b, c) \
- ({ \
- ARM_DOT2(a, b, c); \
- c += (ACC_DATA_TYPE)a.s2 * b.s2; \
- })
-#define ARM_DOT4(a, b, c) \
- ({ \
- ARM_DOT3(a, b, c); \
- c += (ACC_DATA_TYPE)a.s3 * b.s3; \
- })
-#define ARM_DOT8(a, b, c) \
- ({ \
- ARM_DOT4((a.lo), (b.lo), c); \
- ARM_DOT4((a.hi), (b.hi), c); \
- })
-#define ARM_DOT16(a, b, c) \
- ({ \
- ARM_DOT8((a.lo), (b.lo), c); \
- ARM_DOT8((a.hi), (b.hi), c); \
- })
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
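Both branches compute the same quantity: the dot8 path maps it onto the cl_arm_integer_dot_product instructions, while the fallback expands into scalar multiply-accumulates. A scalar C model of what ARM_DOTn accumulates, assuming the typical -DDATA_TYPE=uchar -DACC_DATA_TYPE=uint build:

#include <stdint.h>

/* c += dot(a, b) over k0 elements, each product widened to the accumulator type. */
static uint32_t arm_dot_k0_model(const uint8_t *a, const uint8_t *b, int k0, uint32_t c)
{
    for (int k = 0; k < k0; ++k)
        c += (uint32_t)a[k] * (uint32_t)b[k]; /* ARM_DOT1 applied k0 times */
    return c;
}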
-
-/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */
-#define ARM_DOT_K0X1(k0, a, b, c) \
- ({ \
- ARM_DOT_K0(k0, (a), (b##0), (c)); \
- })
-#define ARM_DOT_K0X2(k0, a, b, c) \
- ({ \
- ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
- ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
- })
-#define ARM_DOT_K0X3(k0, a, b, c) \
- ({ \
- ARM_DOT_K0X2(k0, a, b, c); \
- ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
- })
-#define ARM_DOT_K0X4(k0, a, b, c) \
- ({ \
- ARM_DOT_K0X3(k0, a, b, c); \
- ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
- })
-#define ARM_DOT_K0X8(k0, a, b, c) \
- ({ \
- ARM_DOT_K0X4(k0, a, b, c); \
- ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
- ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
- ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
- ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
- })
-#define ARM_DOT_K0X16(k0, a, b, c) \
- ({ \
- ARM_DOT_K0X8(k0, a, b, c); \
- ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
- ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
- ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
- ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
- ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
- ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
- ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
- ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
- })
-
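ARM_DOT_K0XN0 above broadcasts a single K0-wide LHS vector against N0 RHS vectors, with each dot product landing in one lane of an N0-wide accumulator. A scalar C sketch with the lane loops written out (the macros unroll them):

#include <stdint.h>

/* One LHS vector "a" against n0 RHS vectors "b"; one accumulator lane each. */
static void dot_k0xn0_model(const uint8_t *a, const uint8_t b[][16],
                            uint32_t *c, int n0, int k0)
{
    for (int n = 0; n < n0; ++n)     /* ARM_DOT_K0Xn unrolls this loop */
        for (int k = 0; k < k0; ++k) /* ARM_DOTk unrolls this loop     */
            c[n] += (uint32_t)a[k] * (uint32_t)b[n][k];
}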
-/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
-#define ARM_MM_K0XN0X1(n0, k0, a, b, c) \
- ({ \
- ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
- })
-#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \
- ({ \
- ARM_MM_K0XN0X1(n0, k0, a, b, c); \
- ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
- })
-#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \
- ({ \
- ARM_MM_K0XN0X2(n0, k0, a, b, c); \
- ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
- })
-#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \
- ({ \
- ARM_MM_K0XN0X3(n0, k0, a, b, c); \
- ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
- })
-#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \
- ({ \
- ARM_MM_K0XN0X4(n0, k0, a, b, c); \
- ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
- })
-#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \
- ({ \
- ARM_MM_K0XN0X5(n0, k0, a, b, c); \
- ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
- })
-#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \
- ({ \
- ARM_MM_K0XN0X6(n0, k0, a, b, c); \
- ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
- })
-#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \
- ({ \
- ARM_MM_K0XN0X7(n0, k0, a, b, c); \
- ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
- })
-
-#define ARM_DOT_K0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b), (c)); \
- })
-
-#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT_K0X, n0) \
- (k0, (a), b, (c)); \
- })
-
-#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
- ({ \
- CONCAT(ARM_MM_K0XN0X, m0) \
- (n0, k0, a, b, c); \
- })
-
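Composed together, the macro layers above compute one M0xN0 output tile per K0 step. Because the RHS block is stored transposed, both operands are read as K0-contiguous rows. A scalar C model of ARM_MM_K0XN0XM0:

#include <stdint.h>

static void mm_k0xn0xm0_model(const uint8_t a[][16], const uint8_t b[][16],
                              uint32_t c[][16], int m0, int n0, int k0)
{
    for (int m = 0; m < m0; ++m)         /* ARM_MM_K0XN0Xm unrolls this */
        for (int n = 0; n < n0; ++n)     /* ARM_DOT_K0Xn unrolls this   */
            for (int k = 0; k < k0; ++k) /* ARM_DOTk unrolls this       */
                c[m][n] += (uint32_t)a[m][k] * (uint32_t)b[n][k];
}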
-/** Specialized macros to perform a broadcast multiplication between one vector "a" (with K0 lanes) and K0 vectors "b" of size N0, accumulating into an N0-wide vector "c" */
-#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) \
- ({ \
- c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; \
- })
-#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \
- ({ \
- c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \
- c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \
- })
-#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \
- ({ \
- ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \
- c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \
- })
-#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \
- ({ \
- ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \
- c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \
- })
-#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \
- ({ \
- ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \
- c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \
- c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \
- c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \
- c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \
- })
-#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \
- ({ \
- ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \
- c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \
- c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \
- c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \
- c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \
- c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \
- c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \
- c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \
- c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \
- })
-/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
-#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \
- ({ \
- ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); \
- })
-#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \
- ({ \
- ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \
- ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \
- })
-#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \
- ({ \
- ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \
- ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \
- })
-#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \
- ({ \
- ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \
- ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \
- })
-#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \
- ({ \
- ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \
- ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \
- })
-#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \
- ({ \
- ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \
- ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \
- })
-#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \
- ({ \
- ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \
- ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \
- })
-#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \
- ({ \
- ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \
- ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \
- })
-#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \
- ({ \
- CONCAT(ARM_MUL_N0X, k0) \
- (VECTOR_ACC_TYPE, (a), b, (c)); \
- })
-#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \
- ({ \
- CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \
- (VECTOR_ACC_TYPE, k0, a, b, c); \
- })
-
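The native variant differs from the dot-product form in its loop order: the RHS block is not transposed, so for each k it multiplies one N0-wide RHS row by the k-th lane of every LHS row (a vector multiply-accumulate rather than a dot product). A scalar C model of ARM_MM_NATIVE_N0XK0XM0:

#include <stdint.h>

static void mm_native_model(const uint8_t a[][16], const uint8_t b[][16],
                            uint32_t c[][16], int m0, int n0, int k0)
{
    for (int m = 0; m < m0; ++m)         /* ARM_MM_NATIVE_N0XK0Xm unrolls this */
        for (int k = 0; k < k0; ++k)     /* ARM_MUL_N0XK0 unrolls this         */
            for (int n = 0; n < n0; ++n) /* one N0-wide vector MLA             */
                c[m][n] += (uint32_t)b[k][n] * (uint32_t)a[m][k];
}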
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM8/QASYMM8_SIGNED data type.
- * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed
- * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed
- *
- * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
- * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
- * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
- * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
- * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
- * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
- * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 2, 3, 4, 5, 6, 7, 8
- * - N0 = 2, 3, 4, 8, 16
- * - K0 = 2, 3, 4, 8, 16
- * - V0 >= 1
- * - H0 >= 1
- *
- * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
- *
- * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8/QASYMM8_SIGNED
- * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
- * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
- * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
- * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
- * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
- * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: S32
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
- IMAGE_DECLARATION(dst),
- uint k,
- uint lhs_stride_z,
- uint rhs_stride_z,
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (K0)
-#define LHS_STEP_X ((K0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (K0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(LHS_INTERLEAVE)
-
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- __global DATA_TYPE *lhs_addr = (__global DATA_TYPE *)(lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z));
-
- // Compute RHS matrix address
- __global DATA_TYPE *rhs_addr = (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y);
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_addr += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
-
- for(int i = 0; i < k; i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs);
-
- // Partial matrix multiplication M0,N0,K0
- ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
-
- // Update address
- lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
- rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Convert and store output block
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
- // Store output block
- REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp);
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-
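As the notes above spell out, every shape and tiling parameter of these kernels is baked in at build time. A hypothetical host-side option string for the kernel above (values are illustrative, not the library's tuned defaults; the PARTIAL_STORE_* values are the M%M0 and N%N0 remainders):

/* Example only: M=52, N=90 give 52%4==0 and 90%8==2. */
static const char kGemmlowpReshapedOpts[] =
    "-DDATA_TYPE=uchar -DACC_DATA_TYPE=uint "
    "-DM=52 -DN=90 -DM0=4 -DN0=8 -DK0=4 -DV0=2 -DH0=2 "
    "-DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=2";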
-#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
- * The LHS matrix is NOT reshaped
- * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
- *
- * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
- * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
- * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
- * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
- * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
- * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
- * - N0 = 2, 3, 4, 8, 16
- * - K0 = 2, 3, 4, 8, 16
- * - H0 >= 1
- *
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
- *
- * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: QASYMM8/QASYMM8_SIGNED
- * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
- * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
- * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
- * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
- * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
- * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: S32
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
- IMAGE_DECLARATION(dst),
- uint lhs_stride_z,
- uint rhs_stride_z,
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
- // Compute RHS matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
-
- // Partial matrix multiplication M0,N0,K0
- ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
-
- lhs_offset += K0;
- rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
- }
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS reshaped matrix
- LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
-
- ARM_MM_K0XN0XM0(M0, N0, 1, a, b, c);
- lhs_offset += 1;
- rhs_offset += 1;
- }
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Convert and store output block
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
- // Store output block
- REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp);
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-
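The accumulation loop above is the standard blocked-K pattern: full K0-wide steps while at least K0 columns remain, followed by a scalar tail so K need not be a multiple of K0. Its shape, reduced to a C skeleton (the comments stand in for the LOAD_BLOCK and ARM_MM_K0XN0XM0 steps):

static void k_loop_shape(int K, int K0)
{
    int i = 0;
    for (; i <= K - K0; i += K0)
    {
        /* load M0xK0 LHS and N0xK0 RHS blocks, accumulate K0 partial products */
    }
    for (; i < K; ++i)
    {
        /* leftover: the same step with K0 == 1 */
    }
}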
-#if defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices with fused output stage using fixed-point arithmetic.
- * The LHS matrix is NOT reshaped
- * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
- *
- * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
- * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
- * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
- * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
- * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
- * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
- * - N0 = 2, 3, 4, 8, 16
- * - K0 = 2, 3, 4, 8, 16
- * - H0 >= 1
- *
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
- *
- * @note The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
- * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
- * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
- * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
- * These values can be used to implement "rectified linear unit" activation functions
- * @note In case of per-channel quantization of matrix B, -DPER_CHANNEL_QUANTIZATION must be passed at compile time.
- *
- * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: QASYMM8/QASYMM8_SIGNED
- * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
- * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
- * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
- * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
- * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
- * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: S32
- * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
- * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: S32
- * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: S32
- * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
- * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
- * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes)
- * @param[in] result_multipliers_step_x (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
- * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
- * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes)
- * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector
- */
-__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
- IMAGE_DECLARATION(dst),
- uint lhs_stride_z,
- uint rhs_stride_z,
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-#if defined(A_OFFSET)
- ,
- IMAGE_DECLARATION(sum_col)
-#endif // defined(A_OFFSET)
-#if defined(B_OFFSET)
- ,
- IMAGE_DECLARATION(sum_row)
-#endif // defined(B_OFFSET)
-#if defined(ADD_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif // defined(ADD_BIAS)
-#if defined(PER_CHANNEL_QUANTIZATION)
- ,
- VECTOR_DECLARATION(result_multipliers),
- VECTOR_DECLARATION(result_shifts)
-#endif // defined(PER_CHANNEL_QUANTIZATION)
- )
-{
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
- // Compute RHS matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
-
- // Partial matrix multiplication M0,N0,K0
- ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
-
- lhs_offset += K0;
- rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
- }
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS reshaped matrix
- LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
-
- ARM_MM_K0XN0XM0(M0, N0, 1, a, b, c);
- lhs_offset += 1;
- rhs_offset += 1;
- }
- // The final output is requantized to DATA_TYPE, hence the sizeof(DATA_TYPE) in the destination address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Convert result of matrix multiplication to S32
- REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_int);
-
- // Offset contribution: c += (A_OFFSET * sum_col) + (B_OFFSET * sum_row) + K_OFFSET;
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(int, N0), offset_s32_, K_OFFSET);
-
-#if defined(A_OFFSET)
- // Compute the offset contribution due to A_OFFSET
- __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
-
-#if defined(SUM_COL_HAS_BATCHES)
- sum_col_addr += z * sum_col_stride_y;
-#endif // defined(SUM_COL_HAS_BATCHES)
- VEC_DATA_TYPE(int, N0)
- a_offset_s32 = VLOAD(N0)(0, (__global int *)sum_col_addr);
- a_offset_s32 *= (VEC_DATA_TYPE(int, N0))A_OFFSET;
-
- REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, a_offset_s32);
-#endif // defined(A_OFFSET)
-
-#if defined(B_OFFSET)
- // Compute the offset contribution due to B_OFFSET
- // Note: The sum_row tensor is generated through CLGEMMLowpMatrixAReductionKernel which
- * does not introduce paddings. For this reason it is safe to access the tensor in this manner
- // without considering that the coordinate "y" could come from an input 3D tensor
- __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + (COMPUTE_M0_START_ROW(y, (uint)M0, PARTIAL_STORE_M0)) * sizeof(int) + z * sum_row_stride_y;
-
- LOAD_SCALAR_AS_VECTOR(M0, N0, int, b_offset_s32_, sum_row_addr, 0, sum_row_stride_x);
-
- REPEAT_MLA_VAR_WITH_CONST_VEC(M0, offset_s32_, b_offset_s32_, (VEC_DATA_TYPE(int, N0))B_OFFSET);
-#endif // defined(B_OFFSET)
-
-#if defined(ADD_BIAS)
- // Add bias
- __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
-
- VEC_DATA_TYPE(int, N0)
- bias_values = VLOAD(N0)(0, (__global int *)bias_addr);
- REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, bias_values);
-#endif // defined(ADD_BIAS)
-
- REPEAT_ADD_TWO_VARS(M0, c_int, offset_s32_);
-
- // Multiply by result_mult_int and shift
-#if defined(PER_CHANNEL_QUANTIZATION)
- __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
- __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
-
- VEC_DATA_TYPE(int, N0)
- res_mul = VLOAD(N0)(0, (__global int *)result_multipliers_addr);
- VEC_DATA_TYPE(int, N0)
- res_shift = VLOAD(N0)(0, (__global int *)result_shifts_addr);
-
- REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(M0, N0, c_int, res_mul, res_shift);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-
-#if RESULT_SHIFT < 0
- REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER, RESULT_SHIFT);
-#else // RESULT_SHIFT >= 0
- REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER, RESULT_SHIFT);
-#endif // RESULT_SHIFT < 0
-
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
- // Add the offset terms to GEMM's result
- REPEAT_ADD_CONST_TO_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, RESULT_OFFSET);
-
-#if defined(MIN_BOUND)
- REPEAT_MAX_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
- REPEAT_MIN_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MAX_BOUND);
-#endif // defined(MAX_BOUND)
-
- // Convert and store output block
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
- // Store output block
- REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c_int, c_lp);
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-#endif // defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
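The fused output stage above follows the gemmlowp fixed-point recipe: accumulate the offset/bias contributions in int32, multiply by a Q0.31 multiplier, shift right with rounding, add RESULT_OFFSET, clamp, and saturate to the output type. A scalar C sketch of the RESULT_SHIFT >= 0 path; it ignores the saturation and negative-rounding corner cases the real ASYMM macros handle:

#include <stdint.h>

static uint8_t requantize_model(int32_t acc, int32_t multiplier, int shift,
                                int32_t offset, int32_t lo, int32_t hi)
{
    int64_t prod = (int64_t)acc * multiplier;          /* Q0.31 fixed-point mul     */
    int32_t v = (int32_t)((prod + (1LL << 30)) >> 31); /* rounding doubling-high mul */
    if (shift > 0)
        v = (v + (1 << (shift - 1))) >> shift;         /* rounding shift right      */
    v += offset;                                       /* RESULT_OFFSET             */
    if (v < lo) v = lo;                                /* MIN_BOUND                 */
    if (v > hi) v = hi;                                /* MAX_BOUND                 */
    return (uint8_t)v;                                 /* saturating convert        */
}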
-
-#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
- * The LHS matrix is NOT reshaped
- * The RHS matrix is NOT reshaped
- *
- * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
- * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
- * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
- * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
- * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2)
- * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e., -DK0=2)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
- * - N0 = 2, 3, 4, 8, 16
- * - K0 = 2, 3, 4, 8, 16
- *
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
- *
- * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: QASYMM8
- * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
- * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
- * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
- * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)
- * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)
- * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: S32
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
- IMAGE_DECLARATION(dst),
- uint lhs_stride_z,
- uint rhs_stride_z,
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
- // Compute RHS matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
- REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
-
- int i = 0;
-
- for(; i <= (K - K0); i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
-
- // Partial matrix multiplication M0,N0,K0
-#if(GPU_ARCH == GPU_ARCH_MIDGARD)
- ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c);
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
- // Transpose the values from RHS matrix
- TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE);
-
- ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
-
- // Update the offset
- lhs_offset += K0;
- rhs_offset += K0 * rhs_stride_y;
- }
-
- // Left-over for loop
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
-
- // Partial matrix multiplication M0,N0,1
-#if(GPU_ARCH == GPU_ARCH_MIDGARD)
- ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c);
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
- // Transpose the values from RHS matrix
- TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE);
-
- ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
-
- // Update the offset
- lhs_offset += 1;
- rhs_offset += rhs_stride_y;
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,... zout(M0-1)=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
- // Convert and store output block
- REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res); // resN = CONVERT(cN, VEC_DATA_TYPE(int, N0));
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-
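On non-Midgard GPUs the native kernel above transposes each loaded K0xN0 RHS block so it can reuse the dot-product tile macros; on Midgard it keeps the vector-MLA form. What TRANSPOSE_K0XN0 amounts to, as a scalar C sketch:

#include <stdint.h>

static void transpose_k0xn0_model(const uint8_t b[][16], uint8_t bt[][16],
                                  int k0, int n0)
{
    for (int n = 0; n < n0; ++n)
        for (int k = 0; k < k0; ++k)
            bt[n][k] = b[k][n]; /* rows become K0-contiguous for ARM_DOT */
}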
-#if defined(COLS_A)
-/** OpenCL kernel used to compute the sum of all the entries in each row of Matrix A.
- * It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time.
- *
- * @note This stage is needed to handle the offset of matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- *
- * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
- * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
- * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
- * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: S32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
- sum_row_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0;
- ACC_DATA_TYPE sum_row = 0;
-
- __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
-
- int i = 0;
-
- // This for loop performs 16 accumulations
- for(; i <= ((int)COLS_A - 16); i += 16)
- {
- const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i);
-
- sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.sCDEF,
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
- }
-
- // This for loop performs the leftover accumulations
- for(; i < COLS_A; ++i)
- {
- sum_row += (ACC_DATA_TYPE)matrix_a[i];
- }
-
- sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3;
-
-#if defined(SCALAR)
- sum_row *= (int)SCALAR;
-#endif // defined(SCALAR)
- *((__global int *)dst.ptr) = (int)sum_row;
-}
-
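These reduction kernels exist because the quantized product can be expanded so the zero-point terms separate out, per the gemmlowp documentation linked above. With the library's sign convention folded into A_OFFSET and B_OFFSET, one illustrative expansion of an output element is:

  sum_k (a[m][k] + A_OFFSET) * (b[k][n] + B_OFFSET)
      =  sum_k a[m][k] * b[k][n]        (the raw int32 GEMM)
      +  A_OFFSET * sum_k b[k][n]       (column sums of B -> sum_col)
      +  B_OFFSET * sum_k a[m][k]       (row sums of A    -> sum_row)
      +  K * A_OFFSET * B_OFFSET        (the compile-time K_OFFSET)

which is exactly the "c += (A_OFFSET * sum_col) + (B_OFFSET * sum_row) + K_OFFSET" contribution applied in the fused output stage kernel earlier.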
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-/** OpenCL kernel used to compute the sum of all the entries in each row of Matrix A using the arm dot product instruction.
- * It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time.
- *
- * @note This stage is needed to handle the offset of matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- *
- * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
- * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
- * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
- * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: S32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- ACC_DATA_TYPE sum_row = 0;
-
- __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
-
- int i = 0;
-
- // This for loop performs 32 accumulations per iteration
- for(; i <= ((int)COLS_A - 32); i += 32)
- {
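- // arm_dot(v, (1, 1, 1, 1)) == v.s0 + v.s1 + v.s2 + v.s3, so the eight arm_dot calls
- // in this body accumulate all 32 values loaded per iteration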
- VEC_DATA_TYPE(DATA_TYPE, 16)
- a0 = vload16(0, matrix_a + i);
-
- sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
- sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
- sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
- sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
-
- a0 = vload16(1, matrix_a + i);
-
- sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
- sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
- sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
- sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
- }
-
- // This for loop performs the leftover accumulations
- for(; i < COLS_A; ++i)
- {
- sum_row += (ACC_DATA_TYPE)matrix_a[i];
- }
-
-#if defined(SCALAR)
- sum_row *= (int)SCALAR;
-#endif // defined(SCALAR)
- *((__global int *)dst.ptr) = (int)sum_row;
-}
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#endif // defined(COLS_A)
-
-#if defined(COLS_B) && defined(ROWS_B) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
-/** OpenCL kernel used to compute the vector of sums of all the entries in each column of Matrix B.
- * It is also possible to multiply each reduced column by a scalar value, if SCALAR is passed at compile time.
- *
- * @note This stage is needed to handle the offset of matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- *
- * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
- * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
- * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
- * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e. -DSCALAR=3)
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: S32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Compute source and destination addresses
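- // Rewind x for the leftover vector so the VEC_SIZE-wide accesses stay in bounds.
- // e.g. (illustrative) with VEC_SIZE = 16 and VEC_SIZE_LEFTOVER = 3, the work-item with
- // get_global_id(0) == 1 gets x_offs = max(16 - 13, 0) = 3 and overlaps the previous vector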
- const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- const uint y = get_global_id(1);
-
- __global const DATA_TYPE *matrix_b = (__global const DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + y * src_step_y + y * src_stride_z);
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + y * dst_stride_y;
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
- sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))0;
-
- int i = 0;
- // This for loop accumulates 4 rows per iteration
- for(; i <= ((int)ROWS_B - 4); i += 4)
- {
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- b0 = VLOAD(VEC_SIZE)(0, matrix_b + 0 * src_stride_y);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- b1 = VLOAD(VEC_SIZE)(0, matrix_b + 1 * src_stride_y);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- b2 = VLOAD(VEC_SIZE)(0, matrix_b + 2 * src_stride_y);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- b3 = VLOAD(VEC_SIZE)(0, matrix_b + 3 * src_stride_y);
-
- sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b3,
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-
- matrix_b += 4 * src_stride_y;
- }
-
- // This for loop performs the leftover accumulations
- for(; i < (int)ROWS_B; ++i)
- {
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- b0 = VLOAD(VEC_SIZE)(0, matrix_b);
-
- sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-
- matrix_b += src_stride_y;
- }
-
-#if defined(SCALAR)
- sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))SCALAR;
-#endif // defined(SCALAR)
- VEC_DATA_TYPE(int, VEC_SIZE)
- res0 = CONVERT(sum_col_32, VEC_DATA_TYPE(int, VEC_SIZE));
-
- STORE_VECTOR_SELECT(res, int, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif // defined(COLS_B) && defined(ROWS_B) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
-
-#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
-
-#if defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
-
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-
-/* Helper function used to calculate the offset contribution after matrix multiplication.
- *
- * This function takes a final int32 accumulator value (the output of matrix multiplication),
- * and calculates the offset contribution of matrix A and matrix B.
- *
- * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
- * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
- * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
- * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
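- *
- * For example (illustrative values): with a_offset = 1, b_offset = 6 and k = 200 columns,
- * K_OFFSET = 1 * 6 * 200 = 1200 and each output entry receives
- * sum_col[k] * A_OFFSET + sum_row[i] * B_OFFSET + 1200 on top of the raw int32 accumulator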
- *
- * @param[in] x max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0)
- * @param[in] y get_global_id(1)
- * @param[in] z get_global_id(2)
- * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
- * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
- * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
- * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
- */
-inline VEC_INT offset_contribution(
- int x,
- int y,
- int z
-#if defined(A_OFFSET)
- ,
- IMAGE_DECLARATION(sum_col)
-#endif // defined(A_OFFSET)
-#if defined(B_OFFSET)
- ,
- IMAGE_DECLARATION(sum_row)
-#endif // defined(B_OFFSET)
-#if defined(ADD_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif // defined(ADD_BIAS)
-)
-{
- VEC_INT a_offset_s32 = (VEC_INT)0;
- VEC_INT b_offset_s32 = (VEC_INT)0;
-
- int batch_id = z;
-#if defined(DEPTH_INPUT3D)
- batch_id /= (int)DEPTH_INPUT3D;
-#endif // defined(DEPTH_INPUT3D)
-
-#if defined(A_OFFSET)
- // Compute the offset contribution due to A_OFFSET
- __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
-
-#if defined(SUM_COL_HAS_BATCHES)
- a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
-#else // defined(SUM_COL_HAS_BATCHES)
- a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)sum_col_addr);
-#endif // defined(SUM_COL_HAS_BATCHES)
-
- a_offset_s32 *= (VEC_INT)A_OFFSET;
-#endif // defined(A_OFFSET)
-
-#if defined(B_OFFSET)
- // Compute the offset contribution due to B_OFFSET
- __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
-
-#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
- b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
-#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
- b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
-#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
- b_offset_s32 *= (VEC_INT)B_OFFSET;
-#endif // defined(B_OFFSET)
-
-#if defined(ADD_BIAS)
- // Add bias
- __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
-
- VEC_INT biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
- b_offset_s32 += (VEC_INT)biases_values;
-#endif // defined(ADD_BIAS)
-
- return (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32;
-}
-
-/* OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is performed in-place
- *
- * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
- * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
- * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
- * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
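- *
- * An illustrative set of build options for this kernel (values are examples only):
- * -DK_OFFSET=1200 -DA_OFFSET=1 -DB_OFFSET=6 -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=0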
- *
- * The final result is:
- *
- * mm_result[i][k] = mm_result[i][k] +
- * (sum_col[k] * A_OFFSET) +
- * (sum_row[i] * B_OFFSET) +
- * (K_OFFSET)
- *
- * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
- * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
- * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
- * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
- * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
- */
-__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
-#if defined(A_OFFSET)
- ,
- IMAGE_DECLARATION(sum_col)
-#endif // defined(A_OFFSET)
-#if defined(B_OFFSET)
- ,
- IMAGE_DECLARATION(sum_row)
-#endif // defined(B_OFFSET)
-#if defined(ADD_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif // defined(ADD_BIAS)
- )
-{
- const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- const int y = get_global_id(1);
- const int z = get_global_id(2);
-
- // Compute offset contribution
- VEC_INT offset_term_s32 = offset_contribution(
- x, y, z
-#if defined(A_OFFSET)
- ,
- sum_col_ptr,
- sum_col_stride_x,
- sum_col_step_x,
- sum_col_stride_y,
- sum_col_step_y,
- sum_col_offset_first_element_in_bytes
-#endif // defined(A_OFFSET)
-#if defined(B_OFFSET)
- ,
- sum_row_ptr,
- sum_row_stride_x,
- sum_row_step_x,
- sum_row_stride_y,
- sum_row_step_y,
- sum_row_offset_first_element_in_bytes
-#endif // defined(B_OFFSET)
-#if defined(ADD_BIAS)
- ,
- biases_ptr,
- biases_stride_x,
- biases_step_x,
- biases_offset_first_element_in_bytes
-#endif // defined(ADD_BIAS)
- );
-
- __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
-
- VEC_INT in_s32_0 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);
-
- // Add the offset terms to GEMM's result
- in_s32_0 += offset_term_s32;
-
- // Store the result with the offset contribution
- STORE_VECTOR_SELECT(in_s32_, int, mm_result_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-
-#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && defined(OUTPUT_DATA_TYPE)
-/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and quantize down to uint8.
- *
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
- *
- *
- * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
- * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
- * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
- * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
- *
- * The result before the output stage is:
- *
- * mm_result[i][k] = mm_result[i][k] +
- * (sum_col[k] * A_OFFSET) +
- * (sum_row[i] * B_OFFSET) +
- * (K_OFFSET)
- *
- * This result is quantized down to uint8/int8 using the output stage. The output stage performs the following operations:
- *
- * -# Add offset terms to final result
- * -# Multiply each entry of result by result_mult_int
- * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
- * -# Shift the int32 accumulator by result_shift
- * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
- * -# Clamp the resulting int32 values:
- * - to the [0..255] range and cast to QASYMM8.
- * - to the [-128..127] range and cast to QASYMM8_SIGNED.
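- *
- * For example (illustrative values): with RESULT_OFFSET = 2, RESULT_MULTIPLIER = 3 and
- * RESULT_SHIFT = 4, an accumulator value of 100 becomes ((100 + 2) * 3) >> 4 = 19 before
- * the final clamp and cast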
- *
- * @attention The offset, scalar scale factor and number of bits to shift right of the output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
- *
- * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
- * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
- * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
- * These values can be used to implement "rectified linear unit" activation functions
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- *
- * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
- * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
- * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
- * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
- * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
- * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes)
- * @param[in] result_multipliers_step_x (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
- * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
- * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes)
- * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector
- */
-__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
-#if defined(A_OFFSET)
- ,
- IMAGE_DECLARATION(sum_col)
-#endif // defined(A_OFFSET)
-#if defined(B_OFFSET)
- ,
- IMAGE_DECLARATION(sum_row)
-#endif // defined(B_OFFSET)
- ,
-#if defined(ADD_BIAS)
- VECTOR_DECLARATION(biases),
-#endif // defined(ADD_BIAS)
- TENSOR3D_DECLARATION(dst)
-#if defined(PER_CHANNEL_QUANTIZATION)
- ,
- VECTOR_DECLARATION(result_multipliers),
- VECTOR_DECLARATION(result_shifts)
-#endif // defined(PER_CHANNEL_QUANTIZATION)
- )
-{
- const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- const int y = get_global_id(1);
- const int z = get_global_id(2);
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
-
- // Compute offset contribution
- VEC_INT offset_term_s32 = offset_contribution(
- x, y, z
-#if defined(A_OFFSET)
- ,
- sum_col_ptr,
- sum_col_stride_x,
- sum_col_step_x,
- sum_col_stride_y,
- sum_col_step_y,
- sum_col_offset_first_element_in_bytes
-#endif // defined(A_OFFSET)
-#if defined(B_OFFSET)
- ,
- sum_row_ptr,
- sum_row_stride_x,
- sum_row_step_x,
- sum_row_stride_y,
- sum_row_step_y,
- sum_row_offset_first_element_in_bytes
-#endif // defined(B_OFFSET)
-#if defined(ADD_BIAS)
- ,
- biases_ptr,
- biases_stride_x,
- biases_step_x,
- biases_offset_first_element_in_bytes
-#endif // defined(ADD_BIAS)
- );
-
- __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
-
- VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);
-
- // Add the offset terms to GEMM's result
- in_s32 += offset_term_s32;
-
- // -------------- OUTPUT STAGE
-
- // Add the output offset
- in_s32 += (VEC_INT)RESULT_OFFSET;
-
- // Multiply by the result multiplier and shift
-#if defined(PER_CHANNEL_QUANTIZATION)
- __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
- __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
- VEC_INT result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr);
- VEC_INT result_shifts_values = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr);
-
- in_s32 *= result_multipliers_values;
- in_s32 >>= result_shifts_values;
-#else // defined(PER_CHANNEL_QUANTIZATION)
- in_s32 *= RESULT_MULTIPLIER;
-
- in_s32 >>= RESULT_SHIFT;
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
- VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
- res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
-
-#if defined(MIN_BOUND)
- res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
- res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
-#endif // defined(MAX_BOUND)
-
- // Store the result
- STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-
-/* OpenCL kernel used to add the offset contribution after matrix multiplication and quantize down to uint8 using a fixed-point output stage.
- *
- * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
- *
- *
- * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
- * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
- * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
- * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
- *
- * The result before the output stage is:
- *
- * mm_result[i][k] = mm_result[i][k] +
- * (sum_col[k] * A_OFFSET) +
- * (sum_row[i] * B_OFFSET) +
- * (K_OFFSET)
- *
- * This result is quantized down to uint8/int8 using the output stage. The output stage performs the following operations:
- *
- * -# Compute the fixed point multiplication of each entry of the input by result_fixedpoint_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Round to nearest division by a power-of-two using result_shift
- * -# Add offset to each result
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values:
- * - to the [0..255] range and cast to QASYMM8.
- * - to the [-128..127] range and cast to QASYMM8_SIGNED.
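- *
- * For example (illustrative values): a real scale of 0.75 can be encoded as
- * RESULT_MULTIPLIER = round(0.75 * 2^31) with RESULT_SHIFT = 0, so the fixed point stage
- * approximates in * 0.75 before RESULT_OFFSET is added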
- *
- * @attention The offset, scalar scale factor and number of bits to shift right of the output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
- *
- * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
- * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
- * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
- * These values can be used to implement "rectified linear unit" activation functions
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- *
- * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
- * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
- * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
- * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
- * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
- * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes)
- * @param[in] result_multipliers_step_x (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
- * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
- * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes)
- * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector
- */
-__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
-#if defined(A_OFFSET)
- ,
- IMAGE_DECLARATION(sum_col)
-#endif // defined(A_OFFSET)
-#if defined(B_OFFSET)
- ,
- IMAGE_DECLARATION(sum_row)
-#endif // defined(B_OFFSET)
- ,
-#if defined(ADD_BIAS)
- VECTOR_DECLARATION(biases),
-#endif // defined(ADD_BIAS)
- TENSOR3D_DECLARATION(dst)
-#if defined(PER_CHANNEL_QUANTIZATION)
- ,
- VECTOR_DECLARATION(result_multipliers),
- VECTOR_DECLARATION(result_shifts)
-#endif // defined(PER_CHANNEL_QUANTIZATION)
- )
-{
- const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- const int y = get_global_id(1);
- const int z = get_global_id(2);
-
- // Compute offset contribution
- VEC_INT offset_term_s32 = offset_contribution(
- x, y, z
-#if defined(A_OFFSET)
- ,
- sum_col_ptr,
- sum_col_stride_x,
- sum_col_step_x,
- sum_col_stride_y,
- sum_col_step_y,
- sum_col_offset_first_element_in_bytes
-#endif // defined(A_OFFSET)
-#if defined(B_OFFSET)
- ,
- sum_row_ptr,
- sum_row_stride_x,
- sum_row_step_x,
- sum_row_stride_y,
- sum_row_step_y,
- sum_row_offset_first_element_in_bytes
-#endif // defined(B_OFFSET)
-#if defined(ADD_BIAS)
- ,
- biases_ptr,
- biases_stride_x,
- biases_step_x,
- biases_offset_first_element_in_bytes
-#endif // defined(ADD_BIAS)
- );
-
- __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
-
- VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);
-
- // Add the offset terms to GEMM's result
- in_s32 += offset_term_s32;
-
- // -------------- OUTPUT STAGE
-
- // Multiply by the result multiplier and shift
-#if defined(PER_CHANNEL_QUANTIZATION)
- __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
- __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
- VEC_INT result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr);
- VEC_INT result_shifts_values = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr);
-
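- // A negative per-channel shift selects the greater-than-one multiplier path (left shift),
- // a non-negative shift the less-than-one path (rounding right shift); select() picks the
- // correct result per channel from the sign of result_shifts_values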
- VEC_INT in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
- VEC_INT in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
- in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-
-#if RESULT_SHIFT < 0
- in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
-#else // RESULT_SHIFT >= 0
- in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
-#endif // RESULT_SHIFT < 0
-
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
- // Add the output offset
- in_s32 += (VEC_INT)RESULT_OFFSET;
-
- VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
- res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
-
-#if defined(MIN_BOUND)
- res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
- res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
-#endif // defined(MAX_BOUND)
-
- // Store the result
- STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && defined(OUTPUT_DATA_TYPE)
-
-#undef VEC_INT
-
-#endif // defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
-
-#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
-/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Add offset terms to final result
- * -# Multiply each entry of result by result_mult_int
- * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
- * -# Shift the int32 accumulator by result_shift
- * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
- * -# Clamp the resulting int32 values:
- * - to the [0..255] range and cast to QASYMM8.
- * - to the [-128..127] range and cast to QASYMM8_SIGNED.
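- *
- * For example (illustrative values): with RESULT_OFFSET = 2, RESULT_MULT_INT = 3 and
- * RESULT_SHIFT = 4, an accumulator value of 1500 becomes ((1500 + 2) * 3) >> 4 = 281,
- * which then saturates to 255 on a QASYMM8 output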
- *
- * @attention The offset, scalar scale factor and number of bits to shift right of the output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
- *
- * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
- * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
- * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
- * These values can be used to implement "rectified linear unit" activation functions
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
-#if defined(ADD_BIAS)
- VECTOR_DECLARATION(biases),
-#endif // defined(ADD_BIAS)
- TENSOR3D_DECLARATION(dst))
-{
- // Compute source and destination addresses
- int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- int y = get_global_id(1);
- int z = get_global_id(2);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
-
- VEC_DATA_TYPE(int, VEC_SIZE)
- input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
-
-#if defined(ADD_BIAS)
- // Add bias
- __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
-
- VEC_DATA_TYPE(int, VEC_SIZE)
- biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
- input_values += biases_values;
-#endif // defined(ADD_BIAS)
-
- // Add the output offset
- input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET;
-
- // Multiply by result_mult_int and shift
- input_values *= RESULT_MULT_INT;
-
-#if RESULT_SHIFT < 0
- input_values >>= -RESULT_SHIFT;
-#else // RESULT_SHIFT >= 0
- input_values >>= RESULT_SHIFT;
-#endif // RESULT_SHIFT < 0
-
- VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
- res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
-
-#if defined(MIN_BOUND)
- res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
- res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
-#endif // defined(MAX_BOUND)
-
- // Store the result
- STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
-
-#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
-/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute the fixed point multiplication of each entry of the input by result_fixedpoint_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Round to nearest division by a power-of-two using result_shift
- * -# Add offset to each result
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values:
- * - to the [0..255] range and cast to QASYMM8.
- * - to the [-128..127] range and cast to QASYMM8_SIGNED.
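- *
- * For example (illustrative values): with RESULT_FIXEDPOINT_MULTIPLIER encoding a scale of
- * 0.5 and RESULT_SHIFT = 0, an accumulator value of 100 maps to 50, and
- * RESULT_OFFSET_AFTER_SHIFT = 10 then yields 60 before the final clamp and cast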
- *
- * @attention The offset, scalar scale factor and number of bits to shift right of the output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
- *
- * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
- * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
- * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
- * These values can be used to implement "rectified linear unit" activation functions
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
-#if defined(ADD_BIAS)
- VECTOR_DECLARATION(biases),
-#endif // defined(ADD_BIAS)
- TENSOR3D_DECLARATION(dst))
-{
- // Compute source and destination addresses
- int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- int y = get_global_id(1);
- int z = get_global_id(2);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
-
- VEC_DATA_TYPE(int, VEC_SIZE)
- input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
-
-#if defined(ADD_BIAS)
- // Add bias
- __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
-
- VEC_DATA_TYPE(int, VEC_SIZE)
- biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
- input_values += biases_values;
-#endif // defined(ADD_BIAS)
-
- // Multiply by result_fixedpoint_multiplier and shift
-#if RESULT_SHIFT < 0
- input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
-#else // RESULT_SHIFT >= 0
- input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
-#endif // RESULT_SHIFT < 0
-
- // Add the output offset (applied after the shift)
- input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT;
-
- VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
- res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
-
-#if defined(MIN_BOUND)
- res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
- res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
-#endif // defined(MAX_BOUND)
-
- // Store the result
- STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
-
-#if defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
-
-/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
- *
- * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QSYMM16 value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute the fixed point multiplication of each entry of the input by result_fixedpoint_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Round to nearest division by a power-of-two using result_shift
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the [-32768..32767] range and cast to QSYMM16.
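- *
- * For example (illustrative values): with RESULT_FIXEDPOINT_MULTIPLIER encoding a scale of
- * 0.5 and RESULT_SHIFT = 0, an accumulator value of 70000 maps to 35000, which then
- * saturates to 32767 on the QSYMM16 output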
- *
- * @attention The scalar scale factor and number of bits to shift right of the output tensor must be passed at compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
- *
- * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
- * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
- * These values can be used to implement "rectified linear unit" activation functions
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QSYMM16
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src),
-#if defined(ADD_BIAS)
- VECTOR_DECLARATION(biases),
-#endif // defined(ADD_BIAS)
- TENSOR3D_DECLARATION(dst))
-{
- // Compute source and destination addresses
- int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- int y = get_global_id(1);
- int z = get_global_id(2);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(short) + y * dst_stride_y + z * dst_stride_z;
-
- VEC_DATA_TYPE(int, VEC_SIZE)
- input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
-
-#if defined(ADD_BIAS)
- // Add bias
- __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
-
- VEC_DATA_TYPE(int, VEC_SIZE)
- biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
- input_values += biases_values;
-#endif // defined(ADD_BIAS)
-
- // Multiply by result_fixedpoint_multiplier and shift
-#if RESULT_SHIFT < 0
- input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
-#else // RESULT_SHIFT >= 0
- input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
-#endif // RESULT_SHIFT < 0
-
- VEC_DATA_TYPE(short, VEC_SIZE)
- res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(short, VEC_SIZE));
-
-#if defined(MIN_BOUND)
- res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
- res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND);
-#endif // defined(MAX_BOUND)
-
- // Store the result
- STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
-
-#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
-/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Multiply each entry of the input by the real multiplier REAL_MULTIPLIER
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Requantize
- * -# Add offset to each result
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values:
- * - to the [0..255] range and cast to QASYMM8.
- * - to the [-128..127] range and cast to QASYMM8_SIGNED.
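- *
- * For example (illustrative values): with REAL_MULTIPLIER = 0.0025f and OUTPUT_OFFSET = 10,
- * an accumulator value of 4000 becomes round(4000 * 0.0025 + 10) = 20 before the final
- * clamp and cast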
- *
- * @attention The offset and scalar scale factor must be passed at compile time using -DOUTPUT_OFFSET and -DREAL_MULTIPLIER
- *
- * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
- * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
- * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
- * These values can be used to implement "rectified linear unit" activation functions
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of dividing the input's first dimension by VEC_SIZE
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
-#if defined(ADD_BIAS)
- VECTOR_DECLARATION(biases),
-#endif // defined(ADD_BIAS)
-#if defined(DST_HEIGHT)
- TENSOR4D_DECLARATION(dst))
-#else // defined(DST_HEIGHT)
- TENSOR3D_DECLARATION(dst))
-#endif // defined(DST_HEIGHT)
-{
- // Compute source and destination addresses
- int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- int y = get_global_id(1);
- int z = get_global_id(2);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
-
- VEC_DATA_TYPE(int, VEC_SIZE)
- input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
-
-#if defined(ADD_BIAS)
- // Add bias
- __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
-
- VEC_DATA_TYPE(int, VEC_SIZE)
- biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
- input_values += (VEC_DATA_TYPE(int, VEC_SIZE))biases_values;
-#endif // defined(ADD_BIAS)
-
- // Convert to float
- VEC_DATA_TYPE(float, VEC_SIZE)
- input_values_f = CONVERT(input_values, VEC_DATA_TYPE(float, VEC_SIZE));
- input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
-
- VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
- res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
-
-#if defined(MIN_BOUND)
- res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
- res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
-#endif // defined(MAX_BOUND)
-
- // Store the result
- STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
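Likewise, the float output stage reduces to a single multiply, round, offset-add and saturating cast per element; a minimal plain-C sketch, assuming no bias and no MIN/MAX bounds (the helper name is hypothetical):

    #include <math.h>
    #include <stdint.h>

    /* QASYMM8 path; for QASYMM8_SIGNED clamp to [-128, 127] and return int8_t. */
    static uint8_t requantize_down_float(int32_t acc, float real_multiplier, int output_offset)
    {
        float v = roundf((float)acc * real_multiplier + (float)output_offset);
        if (v < 0.f)   v = 0.f;   /* saturate low  */
        if (v > 255.f) v = 255.f; /* saturate high */
        return (uint8_t)v;
    }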
diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/gemv.cl
deleted file mode 100644
index aaa83975f8..0000000000
--- a/src/core/CL/cl_kernels/gemv.cl
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
-/** This kernel applies the dot product to each plane of the input tensor and the corresponding column of the reshaped weight tensor.
- *
- * @note Data type and source width and height should be given as preprocessor arguments using -DDATA_TYPE=type, -DSRC_WIDTH=width and -DSRC_HEIGHT=height, e.g. -DDATA_TYPE=float
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VECTOR_DECLARATION(dst))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
- int y = get_global_id(1) * 4;
- int z = get_global_id(2);
-
- __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
- __global uchar *input_ptr = src.ptr;
-
- DATA_TYPE acc0 = (DATA_TYPE)0;
- DATA_TYPE acc1 = (DATA_TYPE)0;
- DATA_TYPE acc2 = (DATA_TYPE)0;
- DATA_TYPE acc3 = (DATA_TYPE)0;
-
- // This kernel handles 4 rows per thread so that it can reuse the weights
- for(int i = 0; i < SRC_WIDTH; i += 4)
- {
- VEC_DATA_TYPE(DATA_TYPE, 4)
- weights = vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x));
-
- int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
-
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp0 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp1 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp2 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp3 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3));
-
- acc0 += dot(weights, tmp0);
- acc1 += dot(weights, tmp1);
- acc2 += dot(weights, tmp2);
- acc3 += dot(weights, tmp3);
- }
-
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
-
- int rows_left = SRC_HEIGHT - (y + 4);
-
- // This check handles the last few rows when SRC_HEIGHT is not divisible by four
- if(rows_left >= 0)
- {
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out = (VEC_DATA_TYPE(DATA_TYPE, 4))(acc0, acc1, acc2, acc3);
- vstore4(out, 0, (__global DATA_TYPE *)output_ptr);
- }
- else
- {
- switch(rows_left)
- {
- case -1: // three rows left; one is padding
- *((__global DATA_TYPE *)(output_ptr + 2 * dst_stride_x)) = acc2;
- case -2: // two rows left; two are padding
- *((__global DATA_TYPE *)(output_ptr + 1 * dst_stride_x)) = acc1;
- case -3: // one row left; three are padding
- *((__global DATA_TYPE *)(output_ptr + 0 * dst_stride_x)) = acc0;
- break;
- }
- }
-}
-
-/** This kernel applies the dot product to each plane of the input tensor and the corresponding column of the reshaped weight tensor.
- *
- * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: S32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] input_offset Input's quantization offset
- * @param[in] weights_offset Weights' quantization offset
- */
-__kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(weights),
- VECTOR_DECLARATION(dst),
- const int input_offset,
- const int weights_offset)
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
- int y = get_global_id(1) * 4;
- int z = get_global_id(2);
-
- __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
- __global uchar *input_ptr = src.ptr;
-
- int acc0 = 0;
- int acc1 = 0;
- int acc2 = 0;
- int acc3 = 0;
-
- // This kernel handles 4 rows per thread so that it can reuse the weights
- for(int i = 0; i < SRC_WIDTH; i += 4)
- {
- int4 w = convert_int4(vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x))) + (int4)weights_offset;
-
- int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
-
- int4 tmp0 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0))) + (int4)input_offset;
- int4 tmp1 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1))) + (int4)input_offset;
- int4 tmp2 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2))) + (int4)input_offset;
- int4 tmp3 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3))) + (int4)input_offset;
-
- // Accumulate
- acc0 += tmp0.s0 * w.s0 + tmp0.s1 * w.s1 + tmp0.s2 * w.s2 + tmp0.s3 * w.s3;
- acc1 += tmp1.s0 * w.s0 + tmp1.s1 * w.s1 + tmp1.s2 * w.s2 + tmp1.s3 * w.s3;
- acc2 += tmp2.s0 * w.s0 + tmp2.s1 * w.s1 + tmp2.s2 * w.s2 + tmp2.s3 * w.s3;
- acc3 += tmp3.s0 * w.s0 + tmp3.s1 * w.s1 + tmp3.s2 * w.s2 + tmp3.s3 * w.s3;
- }
-
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
-
- int rows_left = SRC_HEIGHT - (y + 4);
-
- // This check handles the last few rows when SRC_HEIGHT is not divisible by four
- if(rows_left >= 0)
- {
- vstore4((int4)(acc0, acc1, acc2, acc3), 0, (__global int *)output_ptr);
- }
- else
- {
- switch(rows_left)
- {
- case -1: // three rows left; one is padding
- *((__global int *)(output_ptr + 2 * dst_stride_x)) = acc2;
- case -2: // two rows left; two are padding
- *((__global int *)(output_ptr + 1 * dst_stride_x)) = acc1;
- case -3: // one row left; three are padding
- *((__global int *)(output_ptr + 0 * dst_stride_x)) = acc0;
- break;
- }
- }
-}
-#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */
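The deleted gemm_mv kernel computes one matrix-vector product per z-plane, with each work item producing four consecutive output rows so the loaded weights can be reused; its overall semantics can be sketched in plain C as follows (single-threaded reference, hypothetical helper name):

    /* dst[z*height + y] = dot(src[z][y][0..width), weights[z][0..width)) */
    static void gemm_mv_reference(const float *src, const float *weights, float *dst,
                                  int width, int height, int planes)
    {
        for (int z = 0; z < planes; ++z)
            for (int y = 0; y < height; ++y)
            {
                float acc = 0.f;
                for (int x = 0; x < width; ++x)
                    acc += src[(z * height + y) * width + x] * weights[z * width + x];
                dst[z * height + y] = acc;
            }
    }

The quantized variant follows the same structure, but widens both operands to int32 and adds the input and weights quantization offsets before accumulating.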
diff --git a/src/core/CL/cl_kernels/generate_proposals.cl b/src/core/CL/cl_kernels/generate_proposals.cl
deleted file mode 100644
index e8306c55a8..0000000000
--- a/src/core/CL/cl_kernels/generate_proposals.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Generate all the regions of interest based on the image size and the anchors passed in. For each element (x,y) of the
- * grid, it will generate NUM_ANCHORS rois, given by shifting the grid position to match the anchor.
- *
- * @attention The following variables must be passed at compile time:
- * -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32
- * -# -DHEIGHT= Height of the feature map on which this kernel is applied
- * -# -DWIDTH= Width of the feature map on which this kernel is applied
- * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois per each pixel
- * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE)
- * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi
- *
- * @param[in] anchors_ptr Pointer to the anchors tensor. Supported data types: F16/F32
- * @param[in] anchors_stride_x Stride of the anchors tensor in X dimension (in bytes)
- * @param[in] anchors_step_x anchors_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] anchors_stride_y Stride of the anchors tensor in Y dimension (in bytes)
- * @param[in] anchors_step_y anchors_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] anchors_stride_z Stride of the anchors tensor in Z dimension (in bytes)
- * @param[in] anchors_step_z anchors_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] anchors_offset_first_element_in_bytes The offset of the first element in the anchors tensor
- * @param[out] rois_ptr Pointer to the rois. Supported data types: same as @p anchors_ptr
- * @param[in] rois_stride_x Stride of the rois in X dimension (in bytes)
- * @param[in] rois_step_x rois_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rois_stride_y Stride of the rois in Y dimension (in bytes)
- * @param[in] rois_step_y rois_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rois_stride_z Stride of the rois in Z dimension (in bytes)
- * @param[in] rois_step_z rois_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the rois
- */
-#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
-__kernel void generate_proposals_compute_all_anchors(
- VECTOR_DECLARATION(anchors),
- VECTOR_DECLARATION(rois))
-{
- Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
- Vector rois = CONVERT_TO_VECTOR_STRUCT(rois);
-
- const size_t idx = get_global_id(0);
- // Find the index of the anchor
- const size_t anchor_idx = idx % NUM_ANCHORS;
-
- // Find which shift this thread is using
- const size_t shift_idx = idx / NUM_ANCHORS;
-
- // Compute the shift in the X and Y directions (the shift depends exclusively on the thread index)
- const DATA_TYPE
- shift_x = (DATA_TYPE)(shift_idx % WIDTH) * STRIDE;
- const DATA_TYPE
- shift_y = (DATA_TYPE)(shift_idx / WIDTH) * STRIDE;
-
- const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
- shift = (VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
-
- // Read the given anchor
- const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
- anchor = vload4(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS));
-
- // Apply the shift to the anchor
- const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
- shifted_anchor = anchor + shift;
-
- vstore4(shifted_anchor, 0, (__global DATA_TYPE *)rois.ptr);
-}
-#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
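The core of this kernel is the decomposition of the flat work-item id into an anchor index and a grid shift; the same mapping in plain C (hypothetical helper, for illustration only):

    /* idx ranges over WIDTH * HEIGHT * NUM_ANCHORS work items. */
    static void decompose_roi_index(int idx, int num_anchors, int width, float stride,
                                    int *anchor_idx, float *shift_x, float *shift_y)
    {
        *anchor_idx   = idx % num_anchors; /* which anchor within the grid cell */
        int shift_idx = idx / num_anchors; /* flat grid-cell index */
        *shift_x = (float)(shift_idx % width) * stride;
        *shift_y = (float)(shift_idx / width) * stride;
        /* The ROI is then anchor + (shift_x, shift_y, shift_x, shift_y). */
    }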
diff --git a/src/core/CL/cl_kernels/generate_proposals_quantized.cl b/src/core/CL/cl_kernels/generate_proposals_quantized.cl
deleted file mode 100644
index 04264197f4..0000000000
--- a/src/core/CL/cl_kernels/generate_proposals_quantized.cl
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-/** Generate all the regions of interest based on the image size and the anchors passed in. For each element (x,y) of the
- * grid, it will generate NUM_ANCHORS rois, given by shifting the grid position to match the anchor.
- *
- * @attention The following variables must be passed at compile time:
- * -# -DDATA_TYPE= Tensor data type. Supported data types: QASYMM8
- * -# -DHEIGHT= Height of the feature map on which this kernel is applied
- * -# -DWIDTH= Width of the feature map on which this kernel is applied
- * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois per each pixel
- * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE)
- * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi
- *
- * @param[in] anchors_ptr Pointer to the anchors tensor. Supported data types: QASYMM8
- * @param[in] anchors_stride_x Stride of the anchors tensor in X dimension (in bytes)
- * @param[in] anchors_step_x anchors_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] anchors_stride_y Stride of the anchors tensor in Y dimension (in bytes)
- * @param[in] anchors_step_y anchors_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] anchors_stride_z Stride of the anchors tensor in Z dimension (in bytes)
- * @param[in] anchors_step_z anchors_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] anchors_offset_first_element_in_bytes The offset of the first element in the anchors tensor
- * @param[out] rois_ptr Pointer to the rois. Supported data types: same as @p anchors_ptr
- * @param[in] rois_stride_x Stride of the rois in X dimension (in bytes)
- * @param[in] rois_step_x rois_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rois_stride_y Stride of the rois in Y dimension (in bytes)
- * @param[in] rois_step_y rois_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rois_stride_z Stride of the rois in Z dimension (in bytes)
- * @param[in] rois_step_z rois_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the rois
- */
-#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS) && defined(OFFSET) && defined(SCALE)
-__kernel void generate_proposals_compute_all_anchors_quantized(
- VECTOR_DECLARATION(anchors),
- VECTOR_DECLARATION(rois))
-{
- Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
- Vector rois = CONVERT_TO_VECTOR_STRUCT(rois);
-
- const size_t idx = get_global_id(0);
- // Find the index of the anchor
- const size_t anchor_idx = idx % NUM_ANCHORS;
-
- // Find which shift this thread is using
- const size_t shift_idx = idx / NUM_ANCHORS;
-
- // Compute the shift in the X and Y directions (the shift depends exclusively on the thread index)
- const float shift_x = (float)(shift_idx % WIDTH) * STRIDE;
- const float shift_y = (float)(shift_idx / WIDTH) * STRIDE;
-
- VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
- shift = (VEC_DATA_TYPE(float, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
-
- // Read the given anchor
- VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
- anchor = DEQUANTIZE(VLOAD(NUM_ROI_FIELDS)(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS)), OFFSET, SCALE, DATA_TYPE, NUM_ROI_FIELDS);
-
- // Apply the shift to the anchor
- VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
- shifted_anchor = anchor + shift;
-
- VSTORE(NUM_ROI_FIELDS)
- (QUANTIZE(shifted_anchor, OFFSET, SCALE, DATA_TYPE, NUM_ROI_FIELDS), 0, (__global DATA_TYPE *)rois.ptr);
-}
-#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS) && defined(OFFSET) && defined(SCALE)
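The quantized variant performs the same shift, but dequantizes each anchor before adding the shift and requantizes the result; a scalar sketch of that round trip, assuming the usual affine mapping real = SCALE * (quantized - OFFSET) (hypothetical helper names):

    #include <math.h>
    #include <stdint.h>

    static float dequantize_qasymm8(uint8_t q, int offset, float scale)
    {
        return scale * (float)((int)q - offset);
    }

    static uint8_t quantize_qasymm8(float v, int offset, float scale)
    {
        float q = roundf(v / scale) + (float)offset;
        return (uint8_t)(q < 0.f ? 0.f : (q > 255.f ? 255.f : q));
    }

    /* Per ROI field: quantize_qasymm8(dequantize_qasymm8(anchor) + shift) */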
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 6cd76373d2..6e05a513ec 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_HELPER_H
-#define ARM_COMPUTE_HELPER_H
+#ifndef ACL_SRC_CORE_CL_CL_KERNELS_HELPERS_H
+#define ACL_SRC_CORE_CL_CL_KERNELS_HELPERS_H
#include "load_store_utility.h"
@@ -44,6 +44,7 @@
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
+#define GPU_ARCH_VALHALL 0x300
/** Concatenate two inputs.
*
@@ -80,11 +81,11 @@
* @return The reversed vector
* @{
*/
-#define REV1(x) ((x))
-#define REV2(x) ((x).s10)
-#define REV3(x) ((x).s210)
-#define REV4(x) ((x).s3210)
-#define REV8(x) ((x).s76543210)
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)
/** @} */ // end of group REVn
@@ -98,7 +99,7 @@
* @{
*/
#define REVERSE_STR(x, s) REV##s((x))
-#define REVERSE(x, s) REVERSE_STR(x, s)
+#define REVERSE(x, s) REVERSE_STR(x, s)
/** @} */ // end of group REVERSE
/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
@@ -137,16 +138,16 @@
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))
-#define ROT16_0(x) ((x))
-#define ROT16_1(x) ((x).sF0123456789ABCDE)
-#define ROT16_2(x) ((x).sEF0123456789ABCD)
-#define ROT16_3(x) ((x).sDEF0123456789ABC)
-#define ROT16_4(x) ((x).sCDEF0123456789AB)
-#define ROT16_5(x) ((x).sBCDEF0123456789A)
-#define ROT16_6(x) ((x).sABCDEF0123456789)
-#define ROT16_7(x) ((x).s9ABCDEF012345678)
-#define ROT16_8(x) ((x).s89ABCDEF01234567)
-#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
@@ -167,7 +168,7 @@
* @{
*/
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
-#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
/** @} */ // end of group ROTATE
/** Creates a vector of size n filled with offset values corresponding to the location of each element.
@@ -178,11 +179,11 @@
* @return The vector filled with offset values
* @{
*/
-#define V_OFFS1(dt) (dt##1)(0)
-#define V_OFFS2(dt) (dt##2)(0, 1)
-#define V_OFFS3(dt) (dt##3)(0, 1, 2)
-#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
-#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS1(dt) (dt##1)(0)
+#define V_OFFS2(dt) (dt##2)(0, 1)
+#define V_OFFS3(dt) (dt##3)(0, 1, 2)
+#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
/** @} */ // end of group V_OFFSn
@@ -196,14 +197,216 @@
* @{
*/
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
-#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
/** @} */ // end of group VEC_OFFS
#define VLOAD_STR(size) vload##size
-#define VLOAD(size) VLOAD_STR(size)
+#define VLOAD(size) VLOAD_STR(size)
-#define PIXEL_UNIT4 1
-#define PIXEL_UNIT8 2
+/** Extended partial vload that correctly handles scalar values as well.
+ * Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of load ops
+ * @name VLOAD_PARTIAL
+ *
+ * @note With this macro, the passed data can be both a vector and a scalar
+ * @note @p load_size needs to be <= @p size
+ * eg 1: Valid
+ * VLOAD_PARTIAL(16, 15) ...;
+ * eg 2: Invalid
+ * VLOAD_PARTIAL(4, 7) ...;
+ *
+ * @param[in] size The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
+ * @param[in] load_size The number of lower elements to load. Supported values: 1-16, but has to be <= @p size
+ * @{
+ */
+#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
+#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
+
+#define NO_LOAD(data, offs, ptr) \
+ { \
+ }
+
+// Size == 1 (scalar)
+#define vload_partial_1_0 NO_LOAD
+#define vload_partial_1_1 vload1
+#define vload_partial_1_2 NO_LOAD
+#define vload_partial_1_3 NO_LOAD
+#define vload_partial_1_4 NO_LOAD
+#define vload_partial_1_5 NO_LOAD
+#define vload_partial_1_6 NO_LOAD
+#define vload_partial_1_7 NO_LOAD
+#define vload_partial_1_8 NO_LOAD
+#define vload_partial_1_9 NO_LOAD
+#define vload_partial_1_10 NO_LOAD
+#define vload_partial_1_11 NO_LOAD
+#define vload_partial_1_12 NO_LOAD
+#define vload_partial_1_13 NO_LOAD
+#define vload_partial_1_14 NO_LOAD
+#define vload_partial_1_15 NO_LOAD
+#define vload_partial_1_16 NO_LOAD
+// Size == 2
+#define vload_partial_2_0 NO_LOAD
+#define vload_partial_2_1 vload_partial_1
+#define vload_partial_2_2 vload_partial_2
+#define vload_partial_2_3 NO_LOAD
+#define vload_partial_2_4 NO_LOAD
+#define vload_partial_2_5 NO_LOAD
+#define vload_partial_2_6 NO_LOAD
+#define vload_partial_2_7 NO_LOAD
+#define vload_partial_2_8 NO_LOAD
+#define vload_partial_2_9 NO_LOAD
+#define vload_partial_2_10 NO_LOAD
+#define vload_partial_2_11 NO_LOAD
+#define vload_partial_2_12 NO_LOAD
+#define vload_partial_2_13 NO_LOAD
+#define vload_partial_2_14 NO_LOAD
+#define vload_partial_2_15 NO_LOAD
+#define vload_partial_2_16 NO_LOAD
+// Size == 3
+#define vload_partial_3_0 NO_LOAD
+#define vload_partial_3_1 vload_partial_1
+#define vload_partial_3_2 vload_partial_2
+#define vload_partial_3_3 vload_partial_3
+#define vload_partial_3_4 NO_LOAD
+#define vload_partial_3_5 NO_LOAD
+#define vload_partial_3_6 NO_LOAD
+#define vload_partial_3_7 NO_LOAD
+#define vload_partial_3_8 NO_LOAD
+#define vload_partial_3_9 NO_LOAD
+#define vload_partial_3_10 NO_LOAD
+#define vload_partial_3_11 NO_LOAD
+#define vload_partial_3_12 NO_LOAD
+#define vload_partial_3_13 NO_LOAD
+#define vload_partial_3_14 NO_LOAD
+#define vload_partial_3_15 NO_LOAD
+#define vload_partial_3_16 NO_LOAD
+// Size == 4
+#define vload_partial_4_0 NO_LOAD
+#define vload_partial_4_1 vload_partial_1
+#define vload_partial_4_2 vload_partial_2
+#define vload_partial_4_3 vload_partial_3
+#define vload_partial_4_4 vload_partial_4
+#define vload_partial_4_5 NO_LOAD
+#define vload_partial_4_6 NO_LOAD
+#define vload_partial_4_7 NO_LOAD
+#define vload_partial_4_8 NO_LOAD
+#define vload_partial_4_9 NO_LOAD
+#define vload_partial_4_10 NO_LOAD
+#define vload_partial_4_11 NO_LOAD
+#define vload_partial_4_12 NO_LOAD
+#define vload_partial_4_13 NO_LOAD
+#define vload_partial_4_14 NO_LOAD
+#define vload_partial_4_15 NO_LOAD
+#define vload_partial_4_16 NO_LOAD
+// Size == 8
+#define vload_partial_8_0 NO_LOAD
+#define vload_partial_8_1 vload_partial_1
+#define vload_partial_8_2 vload_partial_2
+#define vload_partial_8_3 vload_partial_3
+#define vload_partial_8_4 vload_partial_4
+#define vload_partial_8_5 vload_partial_5
+#define vload_partial_8_6 vload_partial_6
+#define vload_partial_8_7 vload_partial_7
+#define vload_partial_8_8 vload_partial_8
+#define vload_partial_8_9 NO_LOAD
+#define vload_partial_8_10 NO_LOAD
+#define vload_partial_8_11 NO_LOAD
+#define vload_partial_8_12 NO_LOAD
+#define vload_partial_8_13 NO_LOAD
+#define vload_partial_8_14 NO_LOAD
+#define vload_partial_8_15 NO_LOAD
+#define vload_partial_8_16 NO_LOAD
+// Size == 16
+#define vload_partial_16_0 NO_LOAD
+#define vload_partial_16_1 vload_partial_1
+#define vload_partial_16_2 vload_partial_2
+#define vload_partial_16_3 vload_partial_3
+#define vload_partial_16_4 vload_partial_4
+#define vload_partial_16_5 vload_partial_5
+#define vload_partial_16_6 vload_partial_6
+#define vload_partial_16_7 vload_partial_7
+#define vload_partial_16_8 vload_partial_8
+#define vload_partial_16_9 vload_partial_9
+#define vload_partial_16_10 vload_partial_10
+#define vload_partial_16_11 vload_partial_11
+#define vload_partial_16_12 vload_partial_12
+#define vload_partial_16_13 vload_partial_13
+#define vload_partial_16_14 vload_partial_14
+#define vload_partial_16_15 vload_partial_15
+#define vload_partial_16_16 vload_partial_16
+
+/** Partial vload. Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vload ops
+ * @name vload_partial_n
+ *
+ * @note @p DATA needs to be a vector not a scalar
+ * @note n needs to be <= the vector width of the input variable @p DATA
+ * eg 1: Valid
+ * vload_partial_15(var:float16, 0, 0xabcd);
+ * eg 2: Invalid
+ * vload_partial_7(var:float4, 0, 0xabcd);
+ *
+ * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vload is invoked, thus there's no performance penalty.
+ *
+ * @param[in] DATA The name of the variable where to load the values
+ * @param[in] OFFSET Offset in n
+ * @param[in] PTR The base pointer
+ * @{
+ */
+#define vload_partial_1(DATA, OFFSET, PTR) DATA.s0 = vload1(OFFSET, PTR);
+
+#define vload_partial_2(DATA, OFFSET, PTR) DATA.s01 = vload2(OFFSET, PTR);
+
+#define vload_partial_3(DATA, OFFSET, PTR) DATA.s012 = vload3(OFFSET, PTR);
+
+#define vload_partial_4(DATA, OFFSET, PTR) DATA.s0123 = vload4(OFFSET, PTR);
+
+#define vload_partial_5(DATA, OFFSET, PTR) \
+ vload_partial_4(DATA.s0123, OFFSET, PTR); \
+ DATA.s4 = vload1(OFFSET, PTR + 4);
+
+#define vload_partial_6(DATA, OFFSET, PTR) \
+ vload_partial_4(DATA.s0123, OFFSET, PTR); \
+ vload_partial_2(DATA.s45, OFFSET, PTR + 4);
+
+#define vload_partial_7(DATA, OFFSET, PTR) \
+ vload_partial_4(DATA.s0123, OFFSET, PTR); \
+ vload_partial_3(DATA.s456, OFFSET, PTR + 4);
+
+#define vload_partial_8(DATA, OFFSET, PTR) DATA.s01234567 = vload8(OFFSET, PTR);
+
+#define vload_partial_9(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ DATA.s8 = vload1(OFFSET, PTR + 8);
+
+#define vload_partial_10(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ vload_partial_2(DATA.s89, OFFSET, PTR + 8);
+
+#define vload_partial_11(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
+
+#define vload_partial_12(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
+// For vload_partial_{13,14,15}, an 8-wide subvector is passed, because vectors of size 5, 6 and 7 are not supported
+#define vload_partial_13(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
+
+#define vload_partial_14(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
+
+#define vload_partial_15(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
+
+#define vload_partial_16(DATA, OFFSET, PTR) DATA = vload16(OFFSET, PTR);
+/** @} */ // end of group vload_partial_n
+/** @} */ // end of group VLOAD_PARTIAL
+
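As a usage sketch, VLOAD_PARTIAL pairs with compile-time constants such as VEC_SIZE_LEFTOVER to read a trailing, possibly-partial vector without touching memory past the buffer (illustrative only; LEFTOVER and src_addr are assumed to be a -D define and a valid __global pointer):

    float16 data = (float16)0.0f;
    VLOAD_PARTIAL(16, LEFTOVER)
    (data, 0, (__global float *)src_addr); // loads only the lower LEFTOVER elements

When LEFTOVER is 1, 2, 3, 4, 8 or 16, this expands to a single vloadN, so the full-vector case pays no penalty.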
+#define PIXEL_UNIT4 1
+#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4
/** Utility macro to convert a vector size in pixel unit.
@@ -216,17 +419,45 @@
* @{
*/
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
-#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
+#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
-#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
-#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
+#define read_image2d_floatx2(img, x_coord, y_coord) \
+ (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
+#define read_image2d_floatx4(img, x_coord, y_coord) \
+ (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), \
+ read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
-#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
-#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
+#define read_image2d_halfx2(img, x_coord, y_coord) \
+ (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
+#define read_image2d_halfx4(img, x_coord, y_coord) \
+ (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), \
+ read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
+#define write_image2d_floatx2(img, x_coord, y_coord, values) \
+ (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_floatx4(img, x_coord, y_coord, values) \
+ (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), \
+ write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
+ write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
+#define write_image2d_halfx2(img, x_coord, y_coord, values) \
+ (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_halfx4(img, x_coord, y_coord, values) \
+ (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), \
+ write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
+ write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
/** Utility macro to read a 2D OpenCL image object.
@@ -243,24 +474,44 @@
* @{
*/
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
-#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
+#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
+/** @} */
+
+/** Utility macro to write a 2D OpenCL image object.
+ *
+ * @note Coordinates are not normalized
+ *
+ * @param[in] data_type Data type
+ * @param[in] n0 Number of pixels to write. Only 1, 2 and 4 are supported
+ * @param[in] img OpenCL image object
+ * @param[in] x_coord The x coordinate for the top-left pixel
+ * @param[in] y_coord The y coordinate for the top-left pixel
+ * @param[in] values Values to write
+ *
+ * @{
+ */
+#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) \
+ write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
+#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) \
+ WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
+/** @} */
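A usage sketch of the read/write pair (illustrative; img_in, img_out, x and y are assumed to be read-only/write-only image2d_t objects and valid un-normalized coordinates):

    // Read 4 pixels (a float16) and write them back one row below.
    float16 vals = READ_IMAGE2D(float, 4, img_in, x, y);
    WRITE_IMAGE2D(float, 4, img_out, x, y + 1, vals);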
#define VSTORE_STR(size) vstore##size
-#define VSTORE(size) VSTORE_STR(size)
+#define VSTORE(size) VSTORE_STR(size)
-#define float1 float
-#define half1 half
-#define char1 char
-#define uchar1 uchar
-#define short1 short
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
#define ushort1 ushort
-#define int1 int
-#define uint1 uint
-#define long1 long
-#define ulong1 ulong
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
#define double1 double
-#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
/** Extended partial vstore that correctly handles scalar values as well.
@@ -279,23 +530,23 @@
* @{
*/
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
-#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
+#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
#define NO_STORE(data, offs, ptr) \
{ \
}
// Size == 1 (scalar)
-#define vstore_partial_1_0 NO_STORE
-#define vstore_partial_1_1 vstore1
-#define vstore_partial_1_2 NO_STORE
-#define vstore_partial_1_3 NO_STORE
-#define vstore_partial_1_4 NO_STORE
-#define vstore_partial_1_5 NO_STORE
-#define vstore_partial_1_6 NO_STORE
-#define vstore_partial_1_7 NO_STORE
-#define vstore_partial_1_8 NO_STORE
-#define vstore_partial_1_9 NO_STORE
+#define vstore_partial_1_0 NO_STORE
+#define vstore_partial_1_1 vstore1
+#define vstore_partial_1_2 NO_STORE
+#define vstore_partial_1_3 NO_STORE
+#define vstore_partial_1_4 NO_STORE
+#define vstore_partial_1_5 NO_STORE
+#define vstore_partial_1_6 NO_STORE
+#define vstore_partial_1_7 NO_STORE
+#define vstore_partial_1_8 NO_STORE
+#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
@@ -304,16 +555,16 @@
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
// Size == 2
-#define vstore_partial_2_0 NO_STORE
-#define vstore_partial_2_1 vstore_partial_1
-#define vstore_partial_2_2 vstore_partial_2
-#define vstore_partial_2_3 NO_STORE
-#define vstore_partial_2_4 NO_STORE
-#define vstore_partial_2_5 NO_STORE
-#define vstore_partial_2_6 NO_STORE
-#define vstore_partial_2_7 NO_STORE
-#define vstore_partial_2_8 NO_STORE
-#define vstore_partial_2_9 NO_STORE
+#define vstore_partial_2_0 NO_STORE
+#define vstore_partial_2_1 vstore_partial_1
+#define vstore_partial_2_2 vstore_partial_2
+#define vstore_partial_2_3 NO_STORE
+#define vstore_partial_2_4 NO_STORE
+#define vstore_partial_2_5 NO_STORE
+#define vstore_partial_2_6 NO_STORE
+#define vstore_partial_2_7 NO_STORE
+#define vstore_partial_2_8 NO_STORE
+#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
@@ -322,16 +573,16 @@
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
// Size == 3
-#define vstore_partial_3_0 NO_STORE
-#define vstore_partial_3_1 vstore_partial_1
-#define vstore_partial_3_2 vstore_partial_2
-#define vstore_partial_3_3 vstore_partial_3
-#define vstore_partial_3_4 NO_STORE
-#define vstore_partial_3_5 NO_STORE
-#define vstore_partial_3_6 NO_STORE
-#define vstore_partial_3_7 NO_STORE
-#define vstore_partial_3_8 NO_STORE
-#define vstore_partial_3_9 NO_STORE
+#define vstore_partial_3_0 NO_STORE
+#define vstore_partial_3_1 vstore_partial_1
+#define vstore_partial_3_2 vstore_partial_2
+#define vstore_partial_3_3 vstore_partial_3
+#define vstore_partial_3_4 NO_STORE
+#define vstore_partial_3_5 NO_STORE
+#define vstore_partial_3_6 NO_STORE
+#define vstore_partial_3_7 NO_STORE
+#define vstore_partial_3_8 NO_STORE
+#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
@@ -340,16 +591,16 @@
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
// Size == 4
-#define vstore_partial_4_0 NO_STORE
-#define vstore_partial_4_1 vstore_partial_1
-#define vstore_partial_4_2 vstore_partial_2
-#define vstore_partial_4_3 vstore_partial_3
-#define vstore_partial_4_4 vstore_partial_4
-#define vstore_partial_4_5 NO_STORE
-#define vstore_partial_4_6 NO_STORE
-#define vstore_partial_4_7 NO_STORE
-#define vstore_partial_4_8 NO_STORE
-#define vstore_partial_4_9 NO_STORE
+#define vstore_partial_4_0 NO_STORE
+#define vstore_partial_4_1 vstore_partial_1
+#define vstore_partial_4_2 vstore_partial_2
+#define vstore_partial_4_3 vstore_partial_3
+#define vstore_partial_4_4 vstore_partial_4
+#define vstore_partial_4_5 NO_STORE
+#define vstore_partial_4_6 NO_STORE
+#define vstore_partial_4_7 NO_STORE
+#define vstore_partial_4_8 NO_STORE
+#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
@@ -358,16 +609,16 @@
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
// Size == 8
-#define vstore_partial_8_0 NO_STORE
-#define vstore_partial_8_1 vstore_partial_1
-#define vstore_partial_8_2 vstore_partial_2
-#define vstore_partial_8_3 vstore_partial_3
-#define vstore_partial_8_4 vstore_partial_4
-#define vstore_partial_8_5 vstore_partial_5
-#define vstore_partial_8_6 vstore_partial_6
-#define vstore_partial_8_7 vstore_partial_7
-#define vstore_partial_8_8 vstore_partial_8
-#define vstore_partial_8_9 NO_STORE
+#define vstore_partial_8_0 NO_STORE
+#define vstore_partial_8_1 vstore_partial_1
+#define vstore_partial_8_2 vstore_partial_2
+#define vstore_partial_8_3 vstore_partial_3
+#define vstore_partial_8_4 vstore_partial_4
+#define vstore_partial_8_5 vstore_partial_5
+#define vstore_partial_8_6 vstore_partial_6
+#define vstore_partial_8_7 vstore_partial_7
+#define vstore_partial_8_8 vstore_partial_8
+#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
@@ -376,16 +627,16 @@
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
// Size == 16
-#define vstore_partial_16_0 NO_STORE
-#define vstore_partial_16_1 vstore_partial_1
-#define vstore_partial_16_2 vstore_partial_2
-#define vstore_partial_16_3 vstore_partial_3
-#define vstore_partial_16_4 vstore_partial_4
-#define vstore_partial_16_5 vstore_partial_5
-#define vstore_partial_16_6 vstore_partial_6
-#define vstore_partial_16_7 vstore_partial_7
-#define vstore_partial_16_8 vstore_partial_8
-#define vstore_partial_16_9 vstore_partial_9
+#define vstore_partial_16_0 NO_STORE
+#define vstore_partial_16_1 vstore_partial_1
+#define vstore_partial_16_2 vstore_partial_2
+#define vstore_partial_16_3 vstore_partial_3
+#define vstore_partial_16_4 vstore_partial_4
+#define vstore_partial_16_5 vstore_partial_5
+#define vstore_partial_16_6 vstore_partial_6
+#define vstore_partial_16_7 vstore_partial_7
+#define vstore_partial_16_8 vstore_partial_8
+#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
@@ -411,17 +662,13 @@
* @param[in] PTR The base pointer
* @{
*/
-#define vstore_partial_1(DATA, OFFSET, PTR) \
- vstore1(DATA.s0, OFFSET, PTR);
+#define vstore_partial_1(DATA, OFFSET, PTR) vstore1(DATA.s0, OFFSET, PTR);
-#define vstore_partial_2(DATA, OFFSET, PTR) \
- vstore2(DATA.s01, OFFSET, PTR);
+#define vstore_partial_2(DATA, OFFSET, PTR) vstore2(DATA.s01, OFFSET, PTR);
-#define vstore_partial_3(DATA, OFFSET, PTR) \
- vstore3(DATA.s012, OFFSET, PTR);
+#define vstore_partial_3(DATA, OFFSET, PTR) vstore3(DATA.s012, OFFSET, PTR);
-#define vstore_partial_4(DATA, OFFSET, PTR) \
- vstore4(DATA.s0123, OFFSET, PTR);
+#define vstore_partial_4(DATA, OFFSET, PTR) vstore4(DATA.s0123, OFFSET, PTR);
#define vstore_partial_5(DATA, OFFSET, PTR) \
vstore_partial_4(DATA.s0123, OFFSET, PTR); \
@@ -435,8 +682,7 @@
vstore_partial_4(DATA.s0123, OFFSET, PTR); \
vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
-#define vstore_partial_8(DATA, OFFSET, PTR) \
- vstore8(DATA.s01234567, OFFSET, PTR);
+#define vstore_partial_8(DATA, OFFSET, PTR) vstore8(DATA.s01234567, OFFSET, PTR);
#define vstore_partial_9(DATA, OFFSET, PTR) \
vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
@@ -466,162 +712,156 @@
vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
-#define vstore_partial_16(DATA, OFFSET, PTR) \
- vstore16(DATA, OFFSET, PTR);
+#define vstore_partial_16(DATA, OFFSET, PTR) vstore16(DATA, OFFSET, PTR);
/** @} */ // end of group vstore_partial_n
/** @} */ // end of group VSTORE_PARTIAL
// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
// without _sat to overcome this issue
-#define convert_float_sat convert_float
-#define convert_float1_sat convert_float
-#define convert_float2_sat convert_float2
-#define convert_float3_sat convert_float3
-#define convert_float4_sat convert_float4
-#define convert_float8_sat convert_float8
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
-#define convert_half_sat convert_float
-#define convert_half1_sat convert_half
-#define convert_half2_sat convert_half2
-#define convert_half3_sat convert_half3
-#define convert_half4_sat convert_half4
-#define convert_half8_sat convert_half8
-#define convert_half16_sat convert_half16
-
-#define convert_float1 convert_float
-#define convert_half1 convert_half
-#define convert_char1 convert_char
-#define convert_uchar1 convert_uchar
-#define convert_short1 convert_short
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
-#define convert_int1 convert_int
-#define convert_uint1 convert_uint
-#define convert_long1 convert_long
-#define convert_ulong1 convert_ulong
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
#define convert_double1 convert_double
-#define convert_char1_sat convert_char_sat
-#define convert_uchar1_sat convert_uchar_sat
-#define convert_uchar2_sat convert_uchar2_sat
-#define convert_uchar3_sat convert_uchar3_sat
-#define convert_uchar4_sat convert_uchar4_sat
-#define convert_uchar8_sat convert_uchar8_sat
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_uchar2_sat convert_uchar2_sat
+#define convert_uchar3_sat convert_uchar3_sat
+#define convert_uchar4_sat convert_uchar4_sat
+#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
-#define convert_short1_sat convert_short_sat
+#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
-#define convert_int1_sat convert_int_sat
-#define convert_uint1_sat convert_uint_sat
-#define convert_long1_sat convert_long_sat
-#define convert_ulong1_sat convert_ulong_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
#define VEC_DATA_TYPE_STR(type, size) type##size
-#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
#define CONVERT_STR(x, type) (convert_##type((x)))
-#define CONVERT(x, type) CONVERT_STR(x, type)
+#define CONVERT(x, type) CONVERT_STR(x, type)
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
-#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
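These token-pasting helpers let a data type and vector width supplied via -D defines be composed at preprocessing time; for example, assuming -DDATA_TYPE=uchar and -DVEC_SIZE=8, the following expands to convert_uchar8_sat (illustrative only):

    VEC_DATA_TYPE(int, VEC_SIZE) acc = (VEC_DATA_TYPE(int, VEC_SIZE))300; // int8, out of uchar range
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    out = CONVERT_SAT(acc, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));           // every lane saturates to 255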
-#define select_vec_dt_uchar(size) uchar##size
-#define select_vec_dt_char(size) char##size
+#define select_vec_dt_uchar(size) uchar##size
+#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
-#define select_vec_dt_short(size) short##size
-#define select_vec_dt_half(size) short##size
-#define select_vec_dt_uint(size) uint##size
-#define select_vec_dt_int(size) int##size
-#define select_vec_dt_float(size) int##size
-#define select_vec_dt_ulong(size) ulong##size
-#define select_vec_dt_long(size) long##size
+#define select_vec_dt_short(size) short##size
+#define select_vec_dt_half(size) short##size
+#define select_vec_dt_uint(size) uint##size
+#define select_vec_dt_int(size) int##size
+#define select_vec_dt_float(size) int##size
+#define select_vec_dt_ulong(size) ulong##size
+#define select_vec_dt_long(size) long##size
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
-#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
-#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
+#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
+#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
-#define signed_int_vec_dt_uchar(size) char##size
-#define signed_int_vec_dt_char(size) char##size
+#define signed_int_vec_dt_uchar(size) char##size
+#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
-#define signed_int_vec_dt_short(size) short##size
-#define signed_int_vec_dt_half(size) short##size
-#define signed_int_vec_dt_uint(size) int##size
-#define signed_int_vec_dt_int(size) int##size
-#define signed_int_vec_dt_float(size) int##size
-#define signed_int_vec_dt_ulong(size) long##size
-#define signed_int_vec_dt_long(size) long##size
+#define signed_int_vec_dt_short(size) short##size
+#define signed_int_vec_dt_half(size) short##size
+#define signed_int_vec_dt_uint(size) int##size
+#define signed_int_vec_dt_int(size) int##size
+#define signed_int_vec_dt_float(size) int##size
+#define signed_int_vec_dt_ulong(size) long##size
+#define signed_int_vec_dt_long(size) long##size
#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
-#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
-#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
-
-#define sum_reduce_1(x) (x)
-#define sum_reduce_2(x) ((x).s0) + ((x).s1)
-#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
-#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
-#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
+#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
+#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
+
+#define sum_reduce_1(x) (x)
+#define sum_reduce_2(x) ((x).s0) + ((x).s1)
+#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
+#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
+#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
-#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
+#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
-#define prod_reduce_1(x) (x)
-#define prod_reduce_2(x) ((x).s0) * ((x).s1)
-#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
-#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
-#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
+#define prod_reduce_1(x) (x)
+#define prod_reduce_2(x) ((x).s0) * ((x).s1)
+#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
+#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
+#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
-#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
+#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
-#define max_reduce_1(x) (x)
-#define max_reduce_2(x) max(((x).s0), ((x).s1))
-#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
-#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
-#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
+#define max_reduce_1(x) (x)
+#define max_reduce_2(x) max(((x).s0), ((x).s1))
+#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
+#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
+#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
-#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
-
-#define VECTOR_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_offset_first_element_in_bytes
-
-#define IMAGE_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_offset_first_element_in_bytes
-
-#define TENSOR3D_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_stride_z, \
- uint name##_step_z, \
- uint name##_offset_first_element_in_bytes
-
-#define TENSOR4D_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_stride_z, \
- uint name##_step_z, \
- uint name##_stride_w, \
- uint name##_step_w, \
- uint name##_offset_first_element_in_bytes
+#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
+
+#define min_reduce_1(x) (x)
+#define min_reduce_2(x) min(((x).s0), ((x).s1))
+#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2))
+#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23))
+#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567))
+#define min_reduce_16(x) min(min_reduce_8((x).s01234567), min_reduce_8((x).s89ABCDEF))
+
+#define MIN_REDUCE_STR(x, size) min_reduce_##size(x)
+#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size)
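// A quick sketch of how these reduce helpers expand (illustrative
// values, assuming a float4 `v`): SUM_REDUCE(v, 4) resolves to
// sum_reduce_4(v), i.e. (v.s0 + v.s1) + (v.s2 + v.s3).
//
//   float4 v = (float4)(1.0f, 2.0f, 3.0f, 4.0f);
//   float s = SUM_REDUCE(v, 4);  // 10.0f
//   float p = PROD_REDUCE(v, 4); // 24.0f
//   float m = MIN_REDUCE(v, 4);  // 1.0f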
+
+#define VECTOR_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_stride_z, uint name##_step_z, uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR5D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, uint name##_stride_v, \
+ uint name##_step_v, uint name##_offset_first_element_in_bytes
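// Illustrative sketch: these *_DECLARATION macros paste a flat OpenCL
// parameter list, so a hypothetical kernel taking a 3D source and
// destination would be declared as
//
//   __kernel void copy_example(TENSOR3D_DECLARATION(src),
//                              TENSOR3D_DECLARATION(dst))
//
// which expands to `__global uchar *src_ptr, uint src_stride_x,
// uint src_step_x, ..., uint src_offset_first_element_in_bytes` and
// likewise for `dst`.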
#define CONVERT_TO_VECTOR_STRUCT(name) \
update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
@@ -629,38 +869,47 @@
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
-#define CONVERT_TO_IMAGE_STRUCT(name) \
- update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y)
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z)
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0, name##_stride_z, name##_step_z)
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z)
-#define CONVERT_TO_TENSOR3D_STRUCT(name) \
- update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z)
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
- update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0, name##_stride_z, 0)
-#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
- update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y, name##_stride_z, name##_step_z, name##_stride_w, \
+ name##_step_w, mod_size)
-#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
- update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name) \
+ update_tensor4D_workitem_no_step_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_stride_y, name##_stride_z, name##_stride_w)
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
- tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z)
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
+ tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
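// Illustrative sketch of the usual pattern in a kernel body, assuming
// `src` was declared with TENSOR3D_DECLARATION(src): the CONVERT_TO_*
// macros forward the pasted parameters to the update_*_workitem_ptr
// helpers below, which return a struct whose `ptr` already points at
// this work-item's first element.
//
//   Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
//   DATA_TYPE v  = *(__global DATA_TYPE *)src.ptr;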
/** Structure to hold Vector information */
typedef struct Vector
@@ -709,10 +958,10 @@ typedef struct Tensor4D
*
 * @return A vector object
*/
-inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+inline Vector
+update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
- Vector vector =
- {
+ Vector vector = {
.ptr = ptr,
.offset_first_element_in_bytes = offset_first_element_in_bytes,
.stride_x = stride_x,
@@ -732,15 +981,13 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_
*
* @return An image object
*/
-inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+inline Image update_image_workitem_ptr(
+ __global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
- Image img =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y
- };
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
return img;
}
@@ -758,16 +1005,21 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el
*
 * @return An image object
*/
-inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z)
{
- Image img =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y
- };
- img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+ get_global_id(2) * step_z;
return img;
}
@@ -784,17 +1036,22 @@ inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint o
*
* @return A 3D tensor object
*/
-inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z)
{
- Tensor3D tensor =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z
- };
- tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+ Tensor3D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z};
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+ get_global_id(2) * step_z;
return tensor;
}
@@ -811,34 +1068,58 @@ inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_fi
*
* @return A 3D tensor object
*/
-inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z)
{
- Tensor3D tensor =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z
- };
+ Tensor3D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z};
return tensor;
}
-inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
- uint step_w,
- uint mod_size)
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z,
+ uint stride_w,
+ uint step_w,
+ uint mod_size)
{
- Tensor4D tensor =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z,
- .stride_w = stride_w
- };
+ Tensor4D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w};
+
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+ (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+ return tensor;
+}
- tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+inline Tensor4D update_tensor4D_workitem_no_step_ptr(
+ __global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint stride_y, uint stride_z, uint stride_w)
+{
+ Tensor4D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w};
+
+ tensor.ptr += tensor.offset_first_element_in_bytes;
return tensor;
}
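// Illustrative numeric check of the mod_size decomposition above: a 4D
// tensor is launched on a 3D NDRange, so get_global_id(2) carries both
// the z and w coordinates. Assuming mod_size == 8 (the extent of the z
// dimension) and get_global_id(2) == 19:
//
//   z = 19 % 8 = 3  ->  contributes 3 * step_z to the pointer
//   w = 19 / 8 = 2  ->  contributes 2 * step_w to the pointer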
@@ -910,7 +1191,8 @@ inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint wid
const uint x = index;
- return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
+ tensor->offset_first_element_in_bytes;
}
-#endif // _HELPER_H
+#endif // ACL_SRC_CORE_CL_CL_KERNELS_HELPERS_H
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index 562c5d3236..166260a3c0 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -34,7 +34,7 @@
* @return The converted vector
*/
#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
/** Quantize a floating-point scalar value to 8-bit asymmetric
*
@@ -84,14 +84,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return quantized values
*/
-#define QUANTIZE_IMPL(type, size) \
- inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
- { \
- VEC_DATA_TYPE(float, size) \
- out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
- VEC_DATA_TYPE(type, size) \
- res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \
- return res; \
+#define QUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(type, size) \
+ quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+ { \
+ VEC_DATA_TYPE(float, size) \
+ out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
+ VEC_DATA_TYPE(type, size) \
+ res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \
+ return res; \
}
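// Illustrative numeric check of the affine mapping above, assuming a
// QASYMM8 tensor with scale = 0.1f and offset = 128:
//
//   quantize:   0.53f / 0.1f + 128 = 133.3f -> round-to-even -> 133
//   dequantize: (133 - 128) * 0.1f          -> ~0.5f
//
// so QUANTIZE(0.53f, 128, 0.1f, uchar, 1) yields 133 once
// QUANTIZE_IMPL(uchar, 1) has been instantiated (as it is further down).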
/** Dequantize a vector of values to floating-point
@@ -101,10 +102,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return dequantized values in floating point
*/
-#define DEQUANTIZE_IMPL(type, size) \
- inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
- { \
- return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
+#define DEQUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(float, size) \
+ dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+ { \
+ return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
}
/** Correctly-rounded-to-nearest division by a power-of-two.
@@ -113,18 +115,17 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Correctly-rounded-to-nearest division by a power-of-two.
*/
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
- { \
- const VEC_DATA_TYPE(int, size) \
- zero = (VEC_DATA_TYPE(int, size))0; \
- const VEC_DATA_TYPE(int, size) \
- one = (VEC_DATA_TYPE(int, size))1; \
- VEC_DATA_TYPE(int, size) \
- mask = (one << exponent) - one; \
- VEC_DATA_TYPE(int, size) \
- threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \
- return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+ { \
+ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \
+ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \
+ VEC_DATA_TYPE(int, size) \
+ mask = (one << exponent) - one; \
+ VEC_DATA_TYPE(int, size) \
+ threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \
+ return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \
}
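// Illustrative numeric check of the rounding above (round to nearest,
// ties away from zero):
//
//   x = 5,  exponent = 1: mask = 1, threshold = 0; (5 >> 1) = 2 and
//                         (5 & 1) > 0, so the result is 2 + 1 = 3.
//   x = -5, exponent = 1: threshold = 1; (-5 >> 1) = -3 (arithmetic
//                         shift) and (-5 & 1) > 1 is false, so -3.
//
// matching round(+-5 / 2) = +-3 with ties rounded away from zero.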
/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
@@ -167,27 +168,29 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Result in fixed-point format Q0.
*/
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
- { \
- const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
- const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
- const int k_fractional_bits = 31; \
- VEC_DATA_TYPE(int, size) \
- x = a + (1 << (k_fractional_bits - 3)); \
- VEC_DATA_TYPE(int, size) \
- x2 = ASYMM_MULT(x, x, size); \
- VEC_DATA_TYPE(int, size) \
- x3 = ASYMM_MULT(x2, x, size); \
- VEC_DATA_TYPE(int, size) \
- x4 = ASYMM_MULT(x2, x2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
- return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+ const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+ const int k_fractional_bits = 31; \
+ VEC_DATA_TYPE(int, size) \
+ x = a + (1 << (k_fractional_bits - 3)); \
+ VEC_DATA_TYPE(int, size) \
+ x2 = ASYMM_MULT(x, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x3 = ASYMM_MULT(x2, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4 = ASYMM_MULT(x2, x2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+ return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
}
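// A sketch of the math behind the constants above: with x = a + 1/8
// the macro returns constant_term * (1 + x + x^2/2 + x^3/6 + x^4/24),
// i.e. a 4th-order Taylor expansion of exp() whose argument stays
// small on (-1/4, 0]. In Q0.31 fixed point:
//
//   constant_term     = 1895147668 ~ round(2^31 * exp(-1.0 / 8.0))
//   constant_1_over_3 =  715827883 = round(2^31 / 3.0)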
/** Each bit of the result is set to the corresponding bit of either then_val or
@@ -198,10 +201,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
 * @returns Result containing bits from @p then_val or from @p else_val, depending on whether the corresponding bit in @p if_mask is set.
*/
-#define ASYMM_SELECT_USING_MASK_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
- { \
- return (if_mask & then_val) ^ (~if_mask & else_val); \
+#define ASYMM_SELECT_USING_MASK_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size( \
+ VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+ { \
+ return (if_mask & then_val) ^ (~if_mask & else_val); \
}
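// Illustrative check of the XOR select above: (m & t) ^ (~m & e) takes
// each bit from t where m is set and from e elsewhere, so
//
//   m == ~0 : (~0 & t) ^ (0 & e) == t   (all-ones mask -> then_val)
//   m ==  0 : (0 & t) ^ (~0 & e) == e   (all-zeros mask -> else_val)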
/** For each element of input vector, the corresponding bits of the result item are set
@@ -234,18 +238,19 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \
}
-#define EXP_BARREL_SHIFTER_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
- { \
- if(k_integer_bits > exponent) \
- { \
- const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
- return ASYMM_SELECT_USING_MASK( \
- ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
- ASYMM_MULT(result, fp_multiplier, size), result, size); \
- } \
- \
- return result; \
+#define EXP_BARREL_SHIFTER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+ int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+ { \
+ if (k_integer_bits > exponent) \
+ { \
+ const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+ ASYMM_MULT(result, fp_multiplier, size), result, size); \
+ } \
+ \
+ return result; \
}
/** Calculates \f$ exp(x) \f$ for x < 0.
@@ -254,39 +259,40 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Result in fixed-point format Q0.
*/
-#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
- { \
- const int k_fractional_bits = 31 - k_integer_bits; \
- VEC_DATA_TYPE(int, size) \
- k_one_quarter = 1 << (k_fractional_bits - 2); \
- VEC_DATA_TYPE(int, size) \
- mask = k_one_quarter - 1; \
- VEC_DATA_TYPE(int, size) \
- a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
- VEC_DATA_TYPE(int, size) \
- a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
- VEC_DATA_TYPE(int, size) \
- result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
- VEC_DATA_TYPE(int, size) \
- remainder = a_mod_quarter_minus_one_quarter - a; \
- \
- result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
- \
- if(k_integer_bits > 5) \
- { \
- const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
- result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
- } \
- \
- const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
- return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+ { \
+ const int k_fractional_bits = 31 - k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ k_one_quarter = 1 << (k_fractional_bits - 2); \
+ VEC_DATA_TYPE(int, size) \
+ mask = k_one_quarter - 1; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, \
+ size); \
+ VEC_DATA_TYPE(int, size) \
+ remainder = a_mod_quarter_minus_one_quarter - a; \
+ \
+ result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+ \
+ if (k_integer_bits > 5) \
+ { \
+ const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
+ result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
}
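// A sketch of where the barrel-shifter constants above come from: the
// remainder is handled by the Taylor-series helper, while each bit of
// the integer part contributes a factor exp(-2^exponent). Every
// fp_multiplier is that factor in Q0.31, e.g.
//
//   1672461947 ~ round(2^31 * exp(-0.25))
//   1302514674 ~ round(2^31 * exp(-0.5))
//    790015084 ~ round(2^31 * exp(-1.0))
//          242 ~ round(2^31 * exp(-16.0))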
/** Calculates the product of an integer value by a power of two, with either a positive exponent
@@ -297,26 +303,27 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Arithmetic left or right shift.
*/
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
- { \
- if(exponent < 0) \
- { \
- return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
- } \
- \
- const VEC_DATA_TYPE(int, size) min = INT_MIN; \
- const VEC_DATA_TYPE(int, size) max = INT_MAX; \
- int threshold = ((1 << (31 - exponent)) - 1); \
- VEC_DATA_TYPE(int, size) \
- positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
- VEC_DATA_TYPE(int, size) \
- negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
- VEC_DATA_TYPE(int, size) \
- result = x << exponent; \
- result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
- result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
- return result; \
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ if (exponent < 0) \
+ { \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+ const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+ int threshold = ((1 << (31 - exponent)) - 1); \
+ VEC_DATA_TYPE(int, size) \
+ positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ result = x << exponent; \
+ result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+ result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+ return result; \
}
/** Calculates (a+b)/2, rounded to the nearest integer.
@@ -326,20 +333,21 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return (a+b)/2, rounded to the nearest integer.
*/
-#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
- { \
- VEC_DATA_TYPE(long, size) \
- a64 = convert_long##size(a); \
- VEC_DATA_TYPE(long, size) \
- b64 = convert_long##size(b); \
- VEC_DATA_TYPE(long, size) \
- sum = a64 + b64; \
- const VEC_DATA_TYPE(long, size) one = 1; \
- const VEC_DATA_TYPE(long, size) minus_one = -1; \
- VEC_DATA_TYPE(long, size) \
- sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \
- return convert_int##size((sum + sign) / 2); \
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(long, size) \
+ a64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ sum = a64 + b64; \
+ const VEC_DATA_TYPE(long, size) one = 1; \
+ const VEC_DATA_TYPE(long, size) minus_one = -1; \
+ VEC_DATA_TYPE(long, size) \
+ sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \
+ return convert_int##size((sum + sign) / 2); \
}
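// Illustrative numeric check: widening to long prevents a + b from
// overflowing, and the sign-dependent bias rounds ties away from zero:
//
//   a = 3,  b = 4  : sum = 7,  sign = +1, (7 + 1) / 2  = 4
//   a = -3, b = -4 : sum = -7, sign = -1, (-7 - 1) / 2 = -4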
/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
@@ -354,12 +362,12 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
VEC_DATA_TYPE(int, size) \
- half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
+ half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
VEC_DATA_TYPE(int, size) \
x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
- for(int i = 0; i < 3; i++) \
+ for (int i = 0; i < 3; i++) \
{ \
VEC_DATA_TYPE(int, size) \
half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
@@ -378,48 +386,57 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Rescaled value.
*/
-#define ASYMM_RESCALE_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
- { \
- int exponent = src_integer_bits - dst_integer_bits; \
- return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
+#define ASYMM_RESCALE_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+ { \
+ int exponent = src_integer_bits - dst_integer_bits; \
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
}
-#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
-#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
-#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
-#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
-#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
+#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
-#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
-#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
+ asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
-#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) \
+ exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
-#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
-#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
-#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
-
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
- { \
- const int left_shift = shift > 0 ? shift : 0; \
- const int right_shift = shift > 0 ? 0 : -shift; \
- return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \
+#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) \
+ asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
+ ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
+
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
+ { \
+ const int left_shift = shift > 0 ? shift : 0; \
+ const int right_shift = shift > 0 ? 0 : -shift; \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \
}
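// Illustrative numeric check, assuming qmul = 1 << 30 (0.5 in Q0.31)
// and shift = -1, so left_shift = 0 and right_shift = 1:
//
//   ASYMM_MULT(100, 1 << 30, 1)             -> 50  (100 * 0.5)
//   ASYMM_ROUNDING_DIVIDE_BY_POW2(50, 1, 1) -> 25
//
// i.e. the (qmul, shift) pair encodes an effective real multiplier of
// 0.5 * 2^-1 = 0.25.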
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
+ multiply_by_quantized_multiplier##size(input, qmul, shift)
QUANTIZE_IMPL(uchar, 1)
QUANTIZE_IMPL(char, 1)
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
deleted file mode 100644
index a1467a0b36..0000000000
--- a/src/core/CL/cl_kernels/im2col.cl
+++ /dev/null
@@ -1,1360 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
-
-#if ELEMENT_SIZE == 1
-#define COND_DATA_TYPE char
-#elif ELEMENT_SIZE == 2
-#define COND_DATA_TYPE short
-#elif ELEMENT_SIZE == 4
-#define COND_DATA_TYPE int
-#else // ELEMENT_SIZE
-#error "Element size not support"
-#endif // ELEMENT_SIZE
-
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
-/** This opencl kernel performs im2col when the kernel size is 1x1, the stride_x = 1 and the data layout is NCHW
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
- * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col1x1_stridex1_nchw(
- TENSOR3D_DECLARATION(src),
-#if defined(NUM_GROUPS)
- TENSOR3D_DECLARATION(dst),
-#else // defined(NUM_GROUPS)
- IMAGE_DECLARATION(dst),
-#endif // defined(NUM_GROUPS)
- uint src_stride_w,
- uint dst_stride_w)
-{
- const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor
- const uint yc = get_global_id(1); // y coordinate in the convolved tensor
- const uint ch = get_global_id(2) % SRC_DEPTH; // input feature map
- const uint batch = get_global_id(2) / SRC_DEPTH; // batch size
-
- // Clamp xc
- // The strategy clamps at "xc" as it will be a valid value for sure
- uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3);
-
- // Check which values are valid
- const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
-
- xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0));
-
- // Calculate input indices
- const uint xi = xc;
- const uint yi = yc * STRIDE_Y;
-
- // Calculate output indices
-
-#if defined(NUM_GROUPS)
- const uint xo = ch % (SRC_DEPTH / NUM_GROUPS);
- const uint zo = ch / (SRC_DEPTH / NUM_GROUPS);
-#else // defined(NUM_GROUPS)
- const uint xo = ch;
-#endif // defined(NUM_GROUPS)
- const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
-#if defined(NUM_GROUPS)
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w;
-#else // defined(NUM_GROUPS)
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
-#endif // defined(NUM_GROUPS)
-
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data = vload4(0, (__global DATA_TYPE *)input_ptr);
-
- // If out-of-bound, overwrite with the first element
- data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);
-
- *(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;
- *(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;
- *(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;
- *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;
-
-#ifdef HAS_BIAS
-#if defined(NUM_GROUPS)
- if(xo == (SRC_DEPTH / NUM_GROUPS - 1))
-#else // defined(NUM_GROUPS)
- if(ch == (SRC_DEPTH - 1))
-#endif // defined(NUM_GROUPS)
- {
- *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
- *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
- *((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;
- *((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;
- }
-#endif // HAS_BIAS
-}
-#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
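// A brief sketch of the layout the removed NCHW im2col kernels
// produced: each row of the output matrix gathers the receptive field
// of one convolution position. For a 3x3 kernel over an input with
// SRC_DEPTH = 2, a row holds 3 * 3 * 2 = 18 values (plus a trailing 1
// when -DHAS_BIAS is set), and there is one row per (x, y) position of
// the convolved output (yo = xc + yc * CONVOLVED_WIDTH in the code
// above).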
-
-#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
-#if defined(DILATION_X) && defined(DILATION_Y)
-/** This opencl kernel performs a generic im2col implementation when the data layout is NCHW
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
- * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
- * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col_generic_nchw(
- TENSOR3D_DECLARATION(src),
-#if defined(NUM_GROUPS)
- TENSOR3D_DECLARATION(dst),
-#else // defined(NUM_GROUPS)
- IMAGE_DECLARATION(dst),
-#endif // defined(NUM_GROUPS)
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
- const int batch = get_global_id(2) / SRC_DEPTH; // batch size
-
- // Calculate input indices
- const int xi = xc * STRIDE_X - PAD_LEFT;
- const int yi = yc * STRIDE_Y - PAD_TOP;
-
- // Calculate output indices
-#if defined(NUM_GROUPS)
- const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
- const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
-#else // defined(NUM_GROUPS)
- const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
-#endif // defined(NUM_GROUPS)
- const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
-
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
-#if defined(NUM_GROUPS)
- __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
-#else // defined(NUM_GROUPS)
- __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
-#endif // defined(NUM_GROUPS)
-
- // Linearize convolution elements
- for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
- {
- int y = yi + yk * DILATION_Y;
- for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
- {
- int x = xi + xk * DILATION_X;
-#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
- *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
-#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
- if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
- {
- *output_ptr = PAD_VALUE;
- }
- else
- {
- *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
- }
-#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
- }
- }
-
-#ifdef HAS_BIAS
-#if defined(NUM_GROUPS)
- if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
-#else // defined(NUM_GROUPS)
- if(ch == (SRC_DEPTH - 1))
-#endif // defined(NUM_GROUPS)
- {
- *output_ptr = 1.0f;
- }
-#endif // HAS_BIAS
-}
-#endif // defined(DILATION_X) && defined(DILATION_Y)
-
-/** This opencl kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
- * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
- * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col3x3_nchw(
- TENSOR3D_DECLARATION(src),
-#if defined(NUM_GROUPS)
- TENSOR3D_DECLARATION(dst),
-#else // defined(NUM_GROUPS)
- IMAGE_DECLARATION(dst),
-#endif // defined(NUM_GROUPS)
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
- const int batch = get_global_id(2) / SRC_DEPTH; // batch index
-
- // Calculate input indices
- const int xi = xc * STRIDE_X - PAD_LEFT;
- const int yi = yc * STRIDE_Y - PAD_TOP;
-
- // Calculate output indices
-#if defined(NUM_GROUPS)
- const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 9; // 3x3
- const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
-#else // defined(NUM_GROUPS)
- const int xo = ch * 9; // 3x3
-#endif // defined(NUM_GROUPS)
- const int yo = xc + yc * CONVOLVED_WIDTH; // Row index in the output (im2col) matrix
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
-#if defined(NUM_GROUPS)
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
-#else // defined(NUM_GROUPS)
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
-#endif // defined(NUM_GROUPS)
-
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
-
-#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
- // Put PAD_VALUE if the value is out-of-bounds
- int3 x = (int3)xi + (int3)(0, 1, 2);
- int3 y = (int3)yi + (int3)(0, 1, 2);
-
- VEC_DATA_TYPE(COND_DATA_TYPE, 3)
- cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
- VEC_DATA_TYPE(COND_DATA_TYPE, 3)
- cond1 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
- VEC_DATA_TYPE(COND_DATA_TYPE, 3)
- cond2 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
-
- row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);
- row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1);
- row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2);
-#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);
- *((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
-
-#ifdef HAS_BIAS
-#if defined(NUM_GROUPS)
- if((xo / 9) == (SRC_DEPTH / NUM_GROUPS - 1))
-#else // defined(NUM_GROUPS)
- if(ch == (SRC_DEPTH - 1))
-#endif // defined(NUM_GROUPS)
- {
- *((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
- }
-#endif // HAS_BIAS
-}
-
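-/*
- * Editorial sketch (not part of this file): a plain-C reference of the 3x3
- * NCHW mapping implemented above, assuming a single batch, no grouping and
- * no bias. Each work-item above copies one 3x3 patch of one input channel
- * into 9 consecutive elements of output row yo = xc + yc * CONVOLVED_WIDTH,
- * starting at column xo = ch * 9.
- */
-void im2col3x3_nchw_ref(const float *src, float *dst, int src_w, int src_h,
-                        int depth, int conv_w, int conv_h, int stride,
-                        int pad, float pad_value)
-{
-    for(int ch = 0; ch < depth; ++ch)
-        for(int yc = 0; yc < conv_h; ++yc)
-            for(int xc = 0; xc < conv_w; ++xc)
-                for(int k = 0; k < 9; ++k)
-                {
-                    const int xi        = xc * stride - pad + (k % 3);
-                    const int yi        = yc * stride - pad + (k / 3);
-                    const int in_bounds = xi >= 0 && xi < src_w && yi >= 0 && yi < src_h;
-                    dst[(xc + yc * conv_w) * (depth * 9) + ch * 9 + k] =
-                        in_bounds ? src[(ch * src_h + yi) * src_w + xi] : pad_value;
-                }
-}
-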
-/** This OpenCL kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
- * @note The width of the output tensor after the matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
- * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
- * @note The value to store for out-of-bounds loads must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note If biases are added to the convolution, -DHAS_BIAS must be passed so that a 1 is appended to each output row.
- * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col5x5_nchw(
- TENSOR3D_DECLARATION(src),
-#if defined(NUM_GROUPS)
- TENSOR3D_DECLARATION(dst),
-#else // defined(NUM_GROUPS)
- IMAGE_DECLARATION(dst),
-#endif // defined(NUM_GROUPS)
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
- const int batch = get_global_id(2) / SRC_DEPTH; // batch index
-
- // Calculate input indices
- const int xi = xc * STRIDE_X - PAD_LEFT;
- const int yi = yc * STRIDE_Y - PAD_TOP;
-
- // Calculate output indices
-#if defined(NUM_GROUPS)
- const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 25; // 5x5
- const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
-#else // defined(NUM_GROUPS)
- const int xo = ch * 25; // 5x5
-#endif // defined(NUM_GROUPS)
- const int yo = xc + yc * CONVOLVED_WIDTH; // Row index in the output (im2col) matrix
-
-#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
- // Put PAD_VALUE if the value is out-of-bounds
- int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);
- int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);
- int x1 = xi + 4;
- int y1 = yi + 4;
-
- // Check if we could have out-of-bounds elements in the x direction
- VEC_DATA_TYPE(COND_DATA_TYPE, 4)
- x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
- VEC_DATA_TYPE(COND_DATA_TYPE, 4)
- y0_condition = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
- COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);
- COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);
-#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
-#if defined(NUM_GROUPS)
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
-#else // defined(NUM_GROUPS)
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
-#endif // defined(NUM_GROUPS)
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 4)
- row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
- DATA_TYPE
- row01 = *((__global DATA_TYPE *)input_ptr + 4);
-
- input_ptr += src_stride_y;
-
- VEC_DATA_TYPE(DATA_TYPE, 4)
- row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
- DATA_TYPE
- row11 = *((__global DATA_TYPE *)input_ptr + 4);
-
-#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
- VEC_DATA_TYPE(COND_DATA_TYPE, 4)
- cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;
- VEC_DATA_TYPE(COND_DATA_TYPE, 4)
- cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;
- COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);
- COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);
-
- // Replace with PAD_VALUE if the value is not valid
- row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
- row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
- row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
- row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
-#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
- row10.s012),
- 0, (__global DATA_TYPE *)output_ptr);
- vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
-
- input_ptr += src_stride_y;
- output_ptr += 10 * dst_stride_x;
- }
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 4)
- row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
- DATA_TYPE
- row01 = *((__global DATA_TYPE *)input_ptr + 4);
-
- input_ptr += src_stride_y;
-
- VEC_DATA_TYPE(DATA_TYPE, 4)
- row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
- DATA_TYPE
- row11 = *((__global DATA_TYPE *)input_ptr + 4);
-
-#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
- VEC_DATA_TYPE(COND_DATA_TYPE, 4)
- cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;
- VEC_DATA_TYPE(COND_DATA_TYPE, 4)
- cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;
- COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);
- COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);
-
- // Replace with PAD_VALUE if the value is not valid
- row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
- row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
- row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
- row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
-#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
- row10.s012),
- 0, (__global DATA_TYPE *)output_ptr);
- vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
-
- input_ptr += src_stride_y;
- output_ptr += 10 * dst_stride_x;
- }
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 4)
- row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
- DATA_TYPE
- row01 = *((__global DATA_TYPE *)input_ptr + 4);
-
- input_ptr += src_stride_y;
-
-#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
- VEC_DATA_TYPE(COND_DATA_TYPE, 4)
- cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;
- COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);
-
- // Replace with PAD_VALUE if the value is not valid
- row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
- row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
-#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
-
- vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);
- *((__global DATA_TYPE *)output_ptr + 4) = row01;
-
- output_ptr += 5 * dst_stride_x;
- }
-
-#ifdef HAS_BIAS
-#if defined(NUM_GROUPS)
- if((xo / 25) == (SRC_DEPTH / NUM_GROUPS - 1))
-#else // defined(NUM_GROUPS)
- if(ch == (SRC_DEPTH - 1))
-#endif // defined(NUM_GROUPS)
- {
- *((__global DATA_TYPE *)output_ptr) = 1.0f;
- }
-#endif // HAS_BIAS
-}
-#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
-
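-/*
- * Hedged host-side sketch: these kernels are specialized entirely through -D
- * build options. The option string below is illustrative only (the real
- * values are derived from the tensor shapes and padding at configure time,
- * and COND_DATA_TYPE is assumed here to be int for F32 data); the function
- * name and the `program`/`device` handles are hypothetical.
- */
-#include <CL/cl.h>
-
-cl_int build_im2col5x5_f32(cl_program program, cl_device_id device)
-{
-    const char *opts = "-DDATA_TYPE=float -DCOND_DATA_TYPE=int "
-                       "-DSRC_WIDTH=64 -DSRC_HEIGHT=64 -DSRC_DEPTH=16 "
-                       "-DCONVOLVED_WIDTH=62 -DSTRIDE_X=1 -DSTRIDE_Y=1 "
-                       "-DPAD_LEFT=1 -DPAD_RIGHT=1 -DPAD_TOP=1 -DPAD_BOTTOM=1 "
-                       "-DPAD_VALUE=0.0f";
-    return clBuildProgram(program, 1, &device, opts, NULL, NULL);
-}
-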
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
-/** This OpenCL kernel performs im2col when the kernel size is 11x11, there is no padding, and the data layout is NCHW
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width of the output tensor after the matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note If biases are added to the convolution, -DHAS_BIAS must be passed so that a 1 is appended to each output row.
- * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col11x11_padx0_pady0_nchw(
- TENSOR3D_DECLARATION(src),
-#if defined(NUM_GROUPS)
- TENSOR3D_DECLARATION(dst),
-#else // defined(NUM_GROUPS)
- IMAGE_DECLARATION(dst),
-#endif // defined(NUM_GROUPS)
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
- const int batch = get_global_id(2) / SRC_DEPTH; // batch index
-
- // Calculate input indices
- const int xi = xc * STRIDE_X;
- const int yi = yc * STRIDE_Y;
-
- // Calculate output indices
-#if defined(NUM_GROUPS)
- const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11
- const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
-#else // defined(NUM_GROUPS)
- const int xo = ch * 121; // 11x11
-#endif // defined(NUM_GROUPS)
- const int yo = xc + yc * CONVOLVED_WIDTH; // Row index in the output (im2col) matrix
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
-#if defined(NUM_GROUPS)
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
-#else // defined(NUM_GROUPS)
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
-#endif // defined(NUM_GROUPS)
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 8)
- row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
- vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
-
- input_ptr += src_stride_y;
- output_ptr += 11 * dst_stride_x;
- }
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 8)
- row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
- vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
-
- input_ptr += src_stride_y;
- output_ptr += 11 * dst_stride_x;
- }
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 8)
- row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
- vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
-
- input_ptr += src_stride_y;
- output_ptr += 11 * dst_stride_x;
- }
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 8)
- row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
- vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
-
- input_ptr += src_stride_y;
- output_ptr += 11 * dst_stride_x;
- }
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 8)
- row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
- vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
-
- input_ptr += src_stride_y;
- output_ptr += 11 * dst_stride_x;
- }
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 8)
- row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
- vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
-
- input_ptr += src_stride_y;
- output_ptr += 11 * dst_stride_x;
- }
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 8)
- row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
- vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
-
- input_ptr += src_stride_y;
- output_ptr += 11 * dst_stride_x;
- }
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 8)
- row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
- vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
-
- input_ptr += src_stride_y;
- output_ptr += 11 * dst_stride_x;
- }
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 8)
- row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
- vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
-
- input_ptr += src_stride_y;
- output_ptr += 11 * dst_stride_x;
- }
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 8)
- row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
- vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
-
- input_ptr += src_stride_y;
- output_ptr += 11 * dst_stride_x;
- }
-
- {
- VEC_DATA_TYPE(DATA_TYPE, 8)
- row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
-
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
- vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
-
- output_ptr += 11 * dst_stride_x;
- }
-
-#ifdef HAS_BIAS
-#if defined(NUM_GROUPS)
- if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1))
-#else // defined(NUM_GROUPS)
- if(ch == (SRC_DEPTH - 1))
-#endif // defined(NUM_GROUPS)
- {
- *((__global DATA_TYPE *)output_ptr) = 1.0f;
- }
-#endif // HAS_BIAS
-}
-#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
-
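-/*
- * Editorial sketch of how CONVOLVED_WIDTH relates to the other defines
- * (standard convolution output-size arithmetic; with dilation, the effective
- * kernel extent is dilation * (k - 1) + 1). For the padding-free 11x11
- * kernel above, pad_l = pad_r = 0 and dilation_x = 1.
- */
-static inline int convolved_width(int src_w, int kernel_w, int pad_l, int pad_r,
-                                  int stride_x, int dilation_x)
-{
-    const int effective_k = dilation_x * (kernel_w - 1) + 1;
-    return (src_w + pad_l + pad_r - effective_k) / stride_x + 1;
-}
-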
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
-/** This OpenCL kernel performs im2col when the kernel size is greater than 1x1, there is no padding, and the data layout is NCHW
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
- * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
- * @note The kernel width and height must be passed at compile time using -DKERNEL_WIDTH and -DKERNEL_HEIGHT: e.g. -DKERNEL_WIDTH=3 and -DKERNEL_HEIGHT=3
- * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note If biases are added to the convolution, -DHAS_BIAS must be passed so that a 1 is appended to each output row.
- * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col_generic_padx0_pady0_nchw(
- TENSOR3D_DECLARATION(src),
-#if defined(NUM_GROUPS)
- TENSOR3D_DECLARATION(dst),
-#else // defined(NUM_GROUPS)
- IMAGE_DECLARATION(dst),
-#endif // defined(NUM_GROUPS)
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
- const int batch = get_global_id(2) / SRC_DEPTH; // batch index
-
- // Calculate input indices
- const int xi = xc * STRIDE_X;
- const int yi = yc * STRIDE_Y;
-
- // Calculate output indices
-#if defined(NUM_GROUPS)
- const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
- const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
-#else // defined(NUM_GROUPS)
- const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
-#endif // defined(NUM_GROUPS)
- const int yo = xc + yc * CONVOLVED_WIDTH; // Row index in the output (im2col) matrix
-
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
-#if defined(NUM_GROUPS)
- __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
-#else // defined(NUM_GROUPS)
- __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
-#endif // defined(NUM_GROUPS)
-
- // Linearize convolution elements
- for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
- {
- int last_x = 0;
- for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)
- {
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
- VSTORE(VECTOR_SIZE)
- (row, 0, output_ptr);
- last_x = x;
- }
- // Copy the remainder of the row using VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).
- // output_ptr has already been advanced past the last full vector by the loop above, and
- // last_x + VECTOR_SIZE is the first element of the remainder.
-#if WIDTH_MOD_VECTOR_SIZE == 1
- *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
-#elif WIDTH_MOD_VECTOR_SIZE > 1
- VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
- row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
- VSTORE(WIDTH_MOD_VECTOR_SIZE)
- (row, 0, output_ptr);
-#endif /* WIDTH_MOD_VECTOR_SIZE */
- output_ptr += WIDTH_MOD_VECTOR_SIZE;
- } /* End of loop over KERNEL_HEIGHT */
-
-#ifdef HAS_BIAS
-#if defined(NUM_GROUPS)
- if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
-#else // defined(NUM_GROUPS)
- if(ch == (SRC_DEPTH - 1))
-#endif // defined(NUM_GROUPS)
- {
- *output_ptr = 1.0f;
- }
-#endif // HAS_BIAS
-}
-#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
-
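-/*
- * Hedged sketch of the vector/remainder split used by the kernel above: each
- * row of KERNEL_WIDTH elements is copied as full VECTOR_SIZE chunks followed
- * by a tail of WIDTH_MOD_VECTOR_SIZE = KERNEL_WIDTH % VECTOR_SIZE elements
- * (the relation between the two defines is assumed from their names).
- */
-void copy_row_vectorized(const float *in, float *out, int kernel_w, int vec)
-{
-    int x = 0;
-    for(; x + vec <= kernel_w; x += vec) /* full VECTOR_SIZE chunks */
-    {
-        for(int v = 0; v < vec; ++v)
-        {
-            out[x + v] = in[x + v];
-        }
-    }
-    for(; x < kernel_w; ++x) /* WIDTH_MOD_VECTOR_SIZE tail */
-    {
-        out[x] = in[x];
-    }
-}
-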
-#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
-
-#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE)
-
-/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid padding in the channel dimension
- * @name IM2COL1X9_NHWC_STORE
- *
- * @note To use this macro for a 3x3 block, @p ROW has to be 0
- *
- * @param[in] VECTOR_SIZE The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16
- * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p VECTOR_SIZE
- * @param[in] DATA_TYPE Data type of @p DATA
- * @param[in] SRC_DEPTH Input channel size / depth
- * @param[in] DATA Value variable base name
- * @param[in] ROW The row number to store. Supported: 0-8
- * @param[in] OUTPUT_PTR Output pointer
- * @{
- */
-#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
-#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- const bool at_channel_boundary = get_global_id(0) == 0; \
- if(at_channel_boundary) \
- { \
- IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- } \
- else \
- { \
- IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- }
-#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
-#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)
-#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
-
-#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- VSTORE(VECTOR_SIZE) \
- (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
-
-#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
-/** @}*/
-
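-/*
- * Editorial sketch of the boundary-vector trick the NHWC kernels below rely
- * on: only work-item 0 stores BOUNDARY_VECTOR_SIZE elements; every other
- * work-item is shifted back by VECTOR_SIZE - BOUNDARY_VECTOR_SIZE so that its
- * full VECTOR_SIZE store still lands inside the channel dimension.
- */
-static inline int channel_start(int gid, int vector_size, int boundary_vector_size)
-{
-    const int shift_amount = vector_size - boundary_vector_size;
-    const int ch           = gid * vector_size - shift_amount;
-    return ch > 0 ? ch : 0; /* work-item 0 starts at channel 0 */
-}
-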
-/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
- *
- * @note This kernel computes VECTOR_SIZE elements
- * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
- * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width of the output tensor after the matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note If biases are added to the convolution, -DHAS_BIAS must be passed so that a 1 is appended to each output row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col3x3_nhwc(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
- const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
- const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
- const int yo = get_global_id(1);
- const int batch = get_global_id(2); // batch index
-
- // Calculate input indices
- const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
- const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
-
- int yi_coord = 0;
- int3 offset = 0;
-
- // Clamp xi
- int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
-#if PAD_LEFT != 0 || PAD_RIGHT != 0
-#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
- xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
-#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
- // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor
- xi_offset *= (int3)src_stride_y;
-
- // Out-of-bound condition for X
- int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
-
- // yi == 0
- // Clamp yi
- // yi_coord is cast to unsigned int so that a single min() operation can clamp it
- // A "-1" 32-bit signed value converted to unsigned gives 4294967295
- // This is a trick: values loaded in the padding areas always come from the last row (SRC_HEIGHT - 1)
- // because the negative yi_coord wraps around, but they are immediately overwritten with PAD_VALUE,
- // since the wrap-around also causes y_cond (the y padding condition) to be satisfied
- yi_coord = yi - (int)PAD_TOP;
-
- // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
-
- // Compute offset
- offset = xi_offset + (yi_coord * (int)src_stride_z);
-
- // Load input values
- VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
- VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
- VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
- // Replace invalid values with PAD_VALUE
- int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
- values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
- values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
- values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
- // yi == 1
- // Clamp yi_coord (it can be negative if PAD_TOP > 1)
- yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
-
- // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
-
- // Compute offset
- offset = xi_offset + (yi_coord * (int)src_stride_z);
-
- // Load input values
- VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
- VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
- VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
- // Replace invalid values with PAD_VALUE
- y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));
- values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
- values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
- values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
- // yi == 2
- // Clamp yi_coord
- yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
-
- // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
-
- // Compute offset
- offset = xi_offset + (yi_coord * (int)src_stride_z);
-
- // Load input values
- VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
- VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
- VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
- // Replace invalid values with PAD_VALUE
- y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));
- values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
- values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
- values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
- // Store in a boundary-aware way to avoid padding
- IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr)
-
-#ifdef HAS_BIAS
- // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
- // added at the end of the channel, while the boundary vec is at the beginning of the channel.
- // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
- // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
- // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
- if((ch + VECTOR_SIZE) >= SRC_DEPTH)
- {
- *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
- }
-#endif // HAS_BIAS
-}
-
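-/*
- * Hedged plain-C illustration of the unsigned-min clamp used above: casting a
- * negative row index to unsigned makes it very large, so a single min() maps
- * both underflow and overflow to SRC_HEIGHT - 1; the padding condition then
- * replaces the loaded value with PAD_VALUE.
- */
-static inline int clamp_row(int yi_coord, int src_height)
-{
-    const unsigned int clamped = (unsigned int)yi_coord < (unsigned int)src_height
-                                     ? (unsigned int)yi_coord
-                                     : (unsigned int)(src_height - 1);
-    return (int)clamped;
-}
-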
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-#define IM2COL1x9(i) \
- ({ \
- yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
- \
- offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
- offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
- \
- VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
- VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
- VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
- VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
- VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
- VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
- VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
- VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
- VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
- \
- int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \
- values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \
- values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \
- values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \
- values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \
- values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \
- values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \
- values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \
- values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \
- values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1))); \
- \
- IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
- })
-#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-#define IM2COL1x9(i) \
- ({ \
- yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
- \
- offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
- offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
- \
- VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
- VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
- VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
- VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
- VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
- VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
- VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
- VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
- VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
- \
- IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
- })
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
-/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
- *
- * @note This kernel computes VECTOR_SIZE elements
- * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
- * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width of the output tensor after the matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note If biases are added to the convolution, -DHAS_BIAS must be passed so that a 1 is appended to each output row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col9x9_nhwc(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
- const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
- const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
- const int yo = get_global_id(1);
- const int batch = get_global_id(2); // batch index
-
- // Calculate input indices
- const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
- const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
-
- int yi_coord = 0;
- int8 offset0 = 0;
- int offset1 = 0;
-
- // Clamp xi
- int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
- int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);
-
-#if PAD_LEFT != 0 || PAD_RIGHT != 0
-#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
- xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
- xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
-#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
- xi_offset0 *= (int8)src_stride_y;
- xi_offset1 *= (int)src_stride_y;
-
- // Out-of-bound condition for X
- int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);
- int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
-
- IM2COL1x9(0);
- IM2COL1x9(1);
- IM2COL1x9(2);
- IM2COL1x9(3);
- IM2COL1x9(4);
- IM2COL1x9(5);
- IM2COL1x9(6);
- IM2COL1x9(7);
- IM2COL1x9(8);
-
-#ifdef HAS_BIAS
- // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
- // added at the end of the channel, while the boundary vec is at the beginning of the channel.
- // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
- // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
- // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
- if((ch + VECTOR_SIZE) >= SRC_DEPTH)
- {
- *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
- }
-#endif // HAS_BIAS
-}
-
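-/*
- * Editorial note on the NHWC layout produced above: the channel index is
- * innermost, so patch element k (k in [0, KERNEL_WIDTH * KERNEL_HEIGHT)) of
- * channel ch lands at output column k * SRC_DEPTH + ch, matching the
- * (k + ROW * 9) * SRC_DEPTH offsets in IM2COL1X9_NHWC_STORE.
- */
-static inline int im2col_nhwc_column(int k, int ch, int src_depth)
-{
-    return k * src_depth + ch;
-}
-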
-/** This OpenCL kernel performs a generic im2col implementation when the data layout is NHWC
- *
- * @note This kernel computes VECTOR_SIZE elements
- * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
- * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
- * @note The width of the output tensor after the matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel width and height, and the number of input channels must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
- * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
- * @note The value to store for out-of-bounds loads must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
- * @note If biases are added to the convolution, -DHAS_BIAS must be passed so that a 1 is appended to each output row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col_generic_nhwc(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
- const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
- const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
- const int yo = get_global_id(1);
- const int batch = get_global_id(2); // batch index
-
- // Calculate input indices
- const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
- const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
-
- int i = 0;
- for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
- {
- // Clamp yi_coord
- int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
- yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
-
- // Out-of-bound condition for Y
- int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
-
- for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
- {
- // Clamp xi_coord
- int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
- xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
-
- // Out-of-bound condition for X
- int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
-
- int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);
-
- VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
-
-#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
- // Replace with PAD_VALUE if the value is out-of-bound
- values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition)));
-#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
-
- // Store in a boundary-aware way to avoid padding
-#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
- const bool at_channel_boundary = get_global_id(0) == 0;
- if(at_channel_boundary)
- {
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)
- (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
- }
- else // at_channel_boundary
-#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
- {
- VSTORE(VECTOR_SIZE)
- (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
- }
- i++;
- }
- }
-
-#ifdef HAS_BIAS
- // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
- // added at the end of the channel, while the boundary vec is at the beginning of the channel.
- // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
- // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
- // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
- if((ch + VECTOR_SIZE) >= SRC_DEPTH)
- {
- *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
- }
-#endif // HAS_BIAS
-}
-#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
-#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
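As a cross-check of the addressing in the NHWC im2col kernel deleted above, here is a hypothetical scalar C reference of the same output layout (one row per convolution window, taps stored tap-major with the channels of each tap contiguous, matching the i * SRC_DEPTH + ch offset). Dilation is omitted for brevity and all names are illustrative, not the library's API.

#include <stddef.h>

/* Hypothetical scalar NHWC im2col reference; out-of-bound taps read pad_value. */
static void im2col_nhwc_ref(const float *src, float *dst,
                            int src_w, int src_h, int channels,
                            int k_w, int k_h, int stride, int pad,
                            float pad_value, int conv_w, int conv_h)
{
    for (int yo = 0; yo < conv_w * conv_h; ++yo)
    {
        const int xi = (yo % conv_w) * stride - pad;
        const int yi = (yo / conv_w) * stride - pad;
        float *out = dst + (size_t)yo * k_w * k_h * channels;
        for (int yk = 0; yk < k_h; ++yk)
        {
            for (int xk = 0; xk < k_w; ++xk)
            {
                for (int c = 0; c < channels; ++c)
                {
                    const int x = xi + xk;
                    const int y = yi + yk;
                    const int inside = (x >= 0) && (x < src_w) && (y >= 0) && (y < src_h);
                    /* Tap-major write order: i = yk * k_w + xk, channel c within the tap */
                    *out++ = inside ? src[((size_t)y * src_w + x) * channels + c] : pad_value;
                }
            }
        }
    }
}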
diff --git a/src/core/CL/cl_kernels/instance_normalization.cl b/src/core/CL/cl_kernels/instance_normalization.cl
deleted file mode 100644
index adfbebd67d..0000000000
--- a/src/core/CL/cl_kernels/instance_normalization.cl
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
-/** This function computes the mean and variance of each plane of the input tensor and provides it as output.
- *
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
- * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr                            Pointer to the destination tensor storing the per-plane mean and variance. Supported data types: F32
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- */
-__kernel void compute_mean_var(
- TENSOR4D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
-
-#if defined(NHWC)
- const int ch = get_global_id(0); // Current channel
- const int batch = get_global_id(1); // Current batch
- const int elements_plane = DIM_Y * DIM_Z;
- INTERNAL_DATA_TYPE part_sum = 0.f;
- INTERNAL_DATA_TYPE part_sum_sq = 0.f;
- const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
-
- for(int i_w = 0; i_w < DIM_Y; ++i_w)
- {
- for(int i_h = 0; i_h < DIM_Z; ++i_h)
- {
- INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE) * ((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
- part_sum += data;
- part_sum_sq += data * data;
- }
- }
-
- INTERNAL_DATA_TYPE mean = (part_sum / elements_plane);
- INTERNAL_DATA_TYPE var = (part_sum_sq / elements_plane) - (mean * mean);
- __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
- *output_address0 = mean;
- __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
- *output_address1 = var;
-#else // !defined(NHWC)
- const int ch = get_global_id(2) % DIM_Z; // Current channel
- const int batch = get_global_id(2) / DIM_Z; // Current batch
- const int elements_plane = DIM_X * DIM_Y;
-
- VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
- part_sum = 0.f;
- VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
- part_sum_sq = 0.f;
- // Calculate partial sum
- for(int y = 0; y < DIM_Y; ++y)
- {
- int x = 0;
- for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
- {
- // Load data
- VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
- data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));
- part_sum += data;
- part_sum_sq += data * data;
- }
- // Left-overs loop
- for(; x < DIM_X; ++x)
- {
- INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)));
- part_sum.s0 += data;
- part_sum_sq.s0 += data * data;
- }
- }
- // Perform reduction
-#if VEC_SIZE > 8
- part_sum.s01234567 += part_sum.s89abcdef;
- part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
-#endif // VEC_SIZE > 8
-#if VEC_SIZE > 4
- part_sum.s0123 += part_sum.s4567;
- part_sum_sq.s0123 += part_sum_sq.s4567;
-#endif // VEC_SIZE > 4
-#if VEC_SIZE > 2
- part_sum.s01 += part_sum.s23;
- part_sum_sq.s01 += part_sum_sq.s23;
-#endif // VEC_SIZE > 2
- part_sum.s0 += part_sum.s1;
- part_sum_sq.s0 += part_sum_sq.s1;
-
- INTERNAL_DATA_TYPE sum = (INTERNAL_DATA_TYPE)part_sum.s0;
- INTERNAL_DATA_TYPE sum_sq = (INTERNAL_DATA_TYPE)part_sum_sq.s0;
-
- const INTERNAL_DATA_TYPE mean = (sum / elements_plane);
- const INTERNAL_DATA_TYPE var = (sum_sq / elements_plane) - (mean * mean);
-
- __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
- *output_address0 = mean;
- __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
- *output_address1 = var;
-
-#endif // defined(NHWC)
-}
-#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */
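In equation form, the single pass above accumulates the raw sums and derives the per-plane statistics as

\[ \mu = \frac{1}{N}\sum_{i=1}^{N} x_i, \qquad \sigma^2 = \frac{1}{N}\sum_{i=1}^{N} x_i^2 - \mu^2, \]

with N = DIM_Y * DIM_Z in the NHWC path and N = DIM_X * DIM_Y in the NCHW path. The E[x^2] - E[x]^2 identity needs only one traversal of the data, at the cost of being less numerically robust than a two-pass or Welford scheme when the mean is large relative to the variance.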
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
-/** This function applies instance normalization to each plane of the input tensor, using the per-channel, per-batch mean and variance previously computed by compute_mean_var.
- *
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
- * @attention The scale scalar value applied to the normalized tensor should be passed using the -DGAMMA=value compile flag, e.g. -DGAMMA=1.3
- * @attention The offset scalar value applied to the normalized tensor should be passed using the -DBETA=value compile flag, e.g. -DBETA=2.4
- * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
- * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the first source tensor
- * @param[in]  mean_var_ptr                          Pointer to the precomputed mean/variance tensor produced by compute_mean_var. Supported data types: F32
- * @param[in]  mean_var_stride_x                     Stride of the mean/variance tensor in X dimension (in bytes)
- * @param[in]  mean_var_step_x                       mean_var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  mean_var_stride_y                     Stride of the mean/variance tensor in Y dimension (in bytes)
- * @param[in]  mean_var_step_y                       mean_var_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  mean_var_stride_z                     Stride of the mean/variance tensor in Z dimension (in bytes)
- * @param[in]  mean_var_step_z                       mean_var_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  mean_var_offset_first_element_in_bytes The offset of the first element in the mean/variance tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- */
-__kernel void instance_normalization(
- TENSOR4D_DECLARATION(input),
- TENSOR3D_DECLARATION(mean_var)
-#ifndef IN_PLACE
- ,
- TENSOR4D_DECLARATION(output)
-#endif /* IN_PLACE */
-)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor3D mean_var = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(mean_var);
-#ifndef IN_PLACE
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-#endif /* IN_PLACE */
-
-#if defined(NHWC)
- const int ch = get_global_id(0); // Current channel
- const int batch = get_global_id(2); // Current batch
-#else /* defined(NHWC) */
- const int ch = get_global_id(2) % DIM_Z; // Current channel
- const int batch = get_global_id(2) / DIM_Z; // Current batch
-#endif /* defined(NHWC) */
-
- const __global INTERNAL_DATA_TYPE *mean_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 0, batch);
- const __global INTERNAL_DATA_TYPE *var_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 1, batch);
- const INTERNAL_DATA_TYPE mean = (INTERNAL_DATA_TYPE) * mean_ptr;
- const INTERNAL_DATA_TYPE var = (INTERNAL_DATA_TYPE) * var_ptr;
- const INTERNAL_DATA_TYPE multip = GAMMA / sqrt(var + EPSILON);
- const INTERNAL_DATA_TYPE beta = (INTERNAL_DATA_TYPE)BETA;
-
-#if defined(NHWC)
- const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
-#ifndef IN_PLACE
- const int out_offset = output_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
-#endif /* IN_PLACE */
-
- for(int i_w = 0; i_w < DIM_Y; ++i_w)
- {
- for(int i_h = 0; i_h < DIM_Z; ++i_h)
- {
- __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
-#ifdef IN_PLACE
- __global DATA_TYPE *output_address = input_address;
-#else /* !IN_PLACE */
- __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
-#endif /* IN_PLACE */
- *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
- }
- }
-#else // !defined(NHWC)
- for(int y = 0; y < DIM_Y; ++y)
- {
- int x = 0;
- for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
- {
- __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
-#ifdef IN_PLACE
- __global DATA_TYPE *output_address = input_address;
-#else /* !IN_PLACE */
- __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
-#endif /* IN_PLACE */
-
- VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
- data = CONVERT(VLOAD(VEC_SIZE)(0, input_address), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));
-
- VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
- res = (data - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
- VSTORE(VEC_SIZE)
- (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, output_address);
- }
- // Left-overs loop
- for(; x < DIM_X; ++x)
- {
- __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
-#ifdef IN_PLACE
- __global DATA_TYPE *output_address = input_address;
-#else /* !IN_PLACE */
- __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
-#endif /* IN_PLACE */
- *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
- }
- }
-#endif // defined(NHWC)
-}
-#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */
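For reference, the two kernels in this deleted file together implement the standard instance-normalization transform

\[ y = \gamma\,\frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta, \]

with the mean and variance computed per channel and per batch item. The kernel pre-folds the factor gamma / sqrt(var + epsilon) into the single scalar multip, so the inner loops reduce to one multiply and one add per element.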
diff --git a/src/core/CL/cl_kernels/load_store_utility.h b/src/core/CL/cl_kernels/load_store_utility.h
index 56b1538c6f..4daf0adc89 100644
--- a/src/core/CL/cl_kernels/load_store_utility.h
+++ b/src/core/CL/cl_kernels/load_store_utility.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -223,8 +223,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group STORE_BLOCK
/** Convert and store a block of the given size M0xN0
@@ -245,8 +247,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group CONVERT_STORE_BLOCK
/** Partially store the 0 to (n-1)th rows of the given variables
@@ -365,8 +369,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** Store a block that can be partial in both x and y dimensions
*
* @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
@@ -388,22 +394,23 @@
* @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
*/
-#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
- { \
- STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
- } \
- else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
- { \
- STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
- } \
- else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
- { \
- STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
- } \
- else \
- { \
- STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
+ { \
+ STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+ } \
+ else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
+ { \
+ STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+ } \
+ else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
+ { \
+ STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
}
/** Store a block that can only be partial in x but not y.
*
@@ -425,7 +432,7 @@
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
*/
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
- if(!(PARTIAL_COND_X)) \
+ if (!(PARTIAL_COND_X)) \
{ \
STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
} \
@@ -453,7 +460,7 @@
* @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
*/
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
- if(!(PARTIAL_COND_Y)) \
+ if (!(PARTIAL_COND_Y)) \
{ \
STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
} \
@@ -463,8 +470,6 @@
}
/** @} */ // end of group STORE_BLOCK_PARTIAL
-#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-
/** Boundary-aware GEMM block store
* @name STORE_BLOCK_BOUNDARY_AWARE
* This macro assumes the following schemes to achieve boundary-awareness:
@@ -516,32 +521,37 @@
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
* @{
*/
+#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case4: Partial blocks in both x and y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
+ STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X)
#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
#endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE
-#if defined(PARTIAL_STORE_M0)
/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
* @name COMPUTE_M0_START_ROW
* If there're any partial blocks in y dimension, they are placed at the beginning of the rows.
@@ -558,16 +568,16 @@
* @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
* @{
*/
+#if defined(PARTIAL_STORE_M0)
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else // defined(PARTIAL_STORE_M0)
-#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
- ((uint)(y * M0))
+#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) ((uint)(y * M0))
#endif // defined(PARTIAL_STORE_M0)
/** @} */ // end of group COMPUTE_M0_START_ROW
/** Store a vector that can only be partial in x.
- *
+ * @name STORE_VECTOR_SELECT
* @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
*
* The data to store is expected to end in a 0.
@@ -583,4 +593,4 @@
*/
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
-/** @} */ // end of group STORE_VECTOR_SELECT
\ No newline at end of file
+/** @} */ // end of group STORE_VECTOR_SELECT
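A minimal sketch of how these store macros are typically driven, assuming helpers.h is included and VEC_SIZE / VEC_SIZE_LEFTOVER are supplied at build time, with the partial vector placed first as in the boundary-aware kernels above; the kernel and its names are illustrative, not the library's API.

// Hypothetical 1D copy kernel showing the boundary-aware store idiom.
__kernel void copy_1d(__global const DATA_TYPE *src, __global DATA_TYPE *dst)
{
    // Shift every vector except the first back, so the partial vector sits at x == 0
    const int x = max((int)(get_global_id(0) * VEC_SIZE) -
                      (int)((VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);

    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    out0 = VLOAD(VEC_SIZE)(0, src + x);

    // First work-item stores only VEC_SIZE_LEFTOVER elements; all others store VEC_SIZE
    STORE_VECTOR_SELECT(out, DATA_TYPE, dst + x, VEC_SIZE, VEC_SIZE_LEFTOVER,
                        VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}

With this placement the leftover store at x == 0 and the first full store at x == VEC_SIZE_LEFTOVER never overlap, which is what lets the kernel avoid tensor padding altogether.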
diff --git a/src/core/CL/cl_kernels/mean_stddev_normalization.cl b/src/core/CL/cl_kernels/mean_stddev_normalization.cl
deleted file mode 100644
index 76be629934..0000000000
--- a/src/core/CL/cl_kernels/mean_stddev_normalization.cl
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2019, 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(WIDTH)
-/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension.
- *
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Width of the input tensor should be passed using the -DWIDTH compile flag, e.g. -DWIDTH=16
- * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- */
-__kernel void mean_stddev_normalization(
- IMAGE_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- IMAGE_DECLARATION(output)
-#endif /* IN_PLACE */
-)
-{
- // Get pixels pointer
- Image in = CONVERT_TO_IMAGE_STRUCT(input);
-#ifdef IN_PLACE
- Image out = in;
-#else /* IN_PLACE */
- Image out = CONVERT_TO_IMAGE_STRUCT(output);
-#endif /* IN_PLACE */
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- sum = 0.f;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- sum_sq = 0.f;
- // Calculate partial sum
- int i = 0;
- for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
- {
- // Load data
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&in, i, 0));
-
- sum += data;
- sum_sq += data * data;
- }
- // Perform reduction
- sum = SUM_REDUCE(sum, VEC_SIZE);
- sum_sq = SUM_REDUCE(sum_sq, VEC_SIZE);
-
-#if VEC_SIZE > 1
-#define sum sum.s0
-#define sum_sq sum_sq.s0
-#endif // VEC_SIZE > 1
-
- // Left-overs loop
- for(; i < WIDTH; ++i)
- {
- DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0));
-
- sum += data;
- sum_sq += data * data;
- }
-
- DATA_TYPE mean = sum / WIDTH;
- DATA_TYPE var = (sum_sq / WIDTH) - (mean * mean);
- DATA_TYPE stddev_inv = 1.f / sqrt(var + EPSILON);
-
- i = 0;
- for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
- {
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&in, i, 0));
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res = (data - mean) * stddev_inv;
- VSTORE(VEC_SIZE)
- (res, 0, (__global DATA_TYPE *)offset(&out, i, 0));
- }
- for(; i < WIDTH; ++i)
- {
- DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0));
-
- *((__global DATA_TYPE *)offset(&out, i, 0)) = (data - mean) * stddev_inv;
- }
-}
-#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(WIDTH) */
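A quick numeric check of the kernel above: for a row x = (1, 2, 3, 4), sum = 10 and sum_sq = 30, so mean = 2.5, var = 30/4 - 2.5^2 = 1.25 and stddev_inv = 1/sqrt(1.25) ≈ 0.8944 (ignoring EPSILON), giving the normalized row ≈ (-1.342, -0.447, 0.447, 1.342).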
diff --git a/src/core/CL/cl_kernels/memset.cl b/src/core/CL/cl_kernels/memset.cl
deleted file mode 100644
index bb46a49f84..0000000000
--- a/src/core/CL/cl_kernels/memset.cl
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants
-
-/** Fill the tensor's planes with a given value
- * @attention The following variables must be passed at compile time:
- * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * -# -DCONSTANT_VALUE = The value used to fill the tensor's planes
- * -# -DVEC_SIZE = Vector size
- * -# -DLAST_ACCESSED_X = The last valid element index on the X axis; work-items that would write past it shift their access back to stay in bounds
- *
- * @param[out] tensor_ptr                            Pointer to the tensor to fill. Supported data types: All
- * @param[in] tensor_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] tensor_step_x tensor_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] tensor_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] tensor_step_y tensor_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the source image
- * @note The fill value is not a kernel argument: it is baked into the kernel at compile time through -DCONSTANT_VALUE
- */
-__kernel void memset(
- TENSOR3D_DECLARATION(tensor))
-{
- Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor);
-
-#if defined(VEC_SIZE)
-
-#if defined(LAST_ACCESSED_X)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x;
-#endif // defined(LAST_ACCESSED_X)
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = (DATA_TYPE)(CONSTANT_VALUE);
-
- VSTORE(VEC_SIZE)
- (data, 0, (__global DATA_TYPE *)tensor.ptr);
-#else // !defined(VEC_SIZE)
- *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE);
-#endif // defined(VEC_SIZE)
-}
-
-#endif // Check for compile time constants
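A sketch of the matching host-side build step for such a compile-time-configured kernel; the helper name and option values (float tensor of width 16, zero fill, 4-wide stores, hence a last valid vector start index of 16 - 4 = 12) are illustrative assumptions, not taken from the library.

#include <CL/cl.h>

/* Hypothetical host-side configuration: the fill value, data type and vector
 * size are passed as build options rather than as kernel arguments. */
static cl_int build_memset_program(cl_program program, cl_device_id device)
{
    const char *opts = "-DDATA_TYPE=float -DCONSTANT_VALUE=0.0f"
                       " -DVEC_SIZE=4 -DLAST_ACCESSED_X=12";
    return clBuildProgram(program, 1, &device, opts, NULL, NULL);
}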
diff --git a/src/core/CL/cl_kernels/minmax_layer.cl b/src/core/CL/cl_kernels/minmax_layer.cl
deleted file mode 100644
index 655696f9a1..0000000000
--- a/src/core/CL/cl_kernels/minmax_layer.cl
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(WIDTH) && defined(HEIGHT) && defined(DEPTH)
-/** This function identifies the min and maximum value of an input 3D tensor.
- *
- * @note The width, height and depth of the input tensor must be provided at compile time using -DWIDTH, -DHEIGHT and -DDEPTH (e.g. -DWIDTH=320, -DHEIGHT=240, -DDEPTH=3)
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                               Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32.
- * @param[in] dst_stride_x Stride of the min/max vector in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the min/max vector
- */
-__kernel void minmax_layer(
- TENSOR3D_DECLARATION(src),
- VECTOR_DECLARATION(dst))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Vector dst = CONVERT_TO_VECTOR_STRUCT(dst);
-
- float4 min_value = (float4)FLT_MAX;
- float4 max_value = (float4) - FLT_MAX;
- float2 min_max_value = (float2)(FLT_MAX, -FLT_MAX);
-
- for(int z = 0; z < DEPTH; ++z)
- {
- for(int y = 0; y < HEIGHT; ++y)
- {
- int x = 0;
- __global float *src_addr = (__global float *)(src.ptr + y * src_stride_y + z * src_stride_z);
-
- for(; x <= (int)(WIDTH - 8); x += 8)
- {
-                float8 value = vload8(0, src_addr + x);
-
- min_value = select(value.s0123, min_value, min_value < value.s0123);
- min_value = select(value.s4567, min_value, min_value < value.s4567);
-
- max_value = select(value.s0123, max_value, max_value > value.s0123);
- max_value = select(value.s4567, max_value, max_value > value.s4567);
- }
-
- for(; x < WIDTH; ++x)
- {
- float value = *(src_addr + x);
-
- min_max_value.s0 = min(min_max_value.s0, value);
- min_max_value.s1 = max(min_max_value.s1, value);
- }
- }
- }
-
- // Perform min/max reduction
- min_value.s01 = min(min_value.s01, min_value.s23);
- min_value.s0 = min(min_value.s0, min_value.s1);
- max_value.s01 = max(max_value.s01, max_value.s23);
- max_value.s0 = max(max_value.s0, max_value.s1);
-
- min_max_value.s0 = min(min_max_value.s0, min_value.s0);
- min_max_value.s1 = max(min_max_value.s1, max_value.s0);
-
-    // Guard against a degenerate (zero-width) range when all input values are equal
-    if(min_max_value.s0 == min_max_value.s1)
- {
- min_max_value.s0 = 0.0f;
- min_max_value.s1 = 1.0f;
- }
-
- // Store min and max
- vstore2(min_max_value, 0, (__global float *)dst.ptr);
-}
-#endif // defined(WIDTH) && defined(HEIGHT) && defined(DEPTH)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/batch_to_space.cl b/src/core/CL/cl_kernels/nchw/batch_to_space.cl
new file mode 100644
index 0000000000..d83e81347e
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/batch_to_space.cl
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in the 23.08 release
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[in]  block_shape_ptr                       Pointer to the block shape tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nchw(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ VECTOR_DECLARATION(block_shape),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+
+ const int block_x = *((__global int *)vector_offset(&block, 0));
+ const int block_y = *((__global int *)vector_offset(&block, 1));
+
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ const int in_batch = batch_id + ((x % block_x) + (y % block_y) * block_x) * BATCH_SIZE;
+ const int in_x = x / block_x;
+ const int in_y = y / block_y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, in_batch));
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+/** Batch to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The left and top crop offsets must be passed at compile time using -DCROP_LEFT and -DCROP_TOP. e.g. -DCROP_LEFT=0 -DCROP_TOP=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_static_nchw(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int block_x = BLOCK_SHAPE_X;
+ const int block_y = BLOCK_SHAPE_Y;
+
+ const int x = get_global_id(0) + CROP_LEFT;
+ const int y = get_global_id(1) + CROP_TOP;
+ const int z = get_global_id(2);
+
+ const int in_batch = batch_id + ((x % block_x) + (y % block_y) * block_x) * BATCH_SIZE;
+ const int in_x = x / block_x;
+ const int in_y = y / block_y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, in_batch));
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
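To make the index arithmetic above concrete: with BLOCK_SHAPE_X = BLOCK_SHAPE_Y = 2 and an output batch count BATCH_SIZE = N, output element (x, y) = (3, 5) of batch_id 0 has x % 2 = 1 and y % 2 = 1, so it is read from input batch 0 + (1 + 1 * 2) * N = 3N at spatial position (in_x, in_y) = (3 / 2, 5 / 2) = (1, 2).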
diff --git a/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl
new file mode 100644
index 0000000000..2d466661b3
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) rsqrt((a))
+#define SQCVT_SAT(a) (a)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+
+/** Apply batch normalization.
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+ TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+ VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+ float epsilon)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D out = in;
+#else /* IN_PLACE */
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+#ifndef USE_DEFAULT_BETA
+ Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+#endif /* USE_DEFAULT_GAMMA */
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ denominator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ numerator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ x_bar = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res = 0;
+
+ const int current_slice = get_global_id(2);
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+ denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
+ denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+ // Calculate x bar and store results
+ numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
+ numerator = SUB_OP(data, numerator);
+ x_bar = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
+
+ res = MUL_OP(gamma_vec, x_bar);
+#else /* USE_DEFAULT_GAMMA */
+ // gamma is equal to 1, no need to perform multiplications
+ res = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
+ // beta is not zero, hence we need to perform the addition
+ res = ADD_OP(res, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+ res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
+
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) */
\ No newline at end of file
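Because the mean, variance, gamma and beta are all fixed at inference time, the batch-normalization transform applied above collapses to a per-channel affine map,

\[ y = \gamma\,\frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta = a\,x + b, \qquad a = \gamma\,\mathrm{rsqrt}(\sigma^2 + \epsilon), \quad b = \beta - a\,\mu, \]

which is why the kernel computes x_bar with a single rsqrt per slice and skips the multiply or add entirely when USE_DEFAULT_GAMMA or USE_DEFAULT_BETA is defined.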
diff --git a/src/core/CL/cl_kernels/nchw/channel_shuffle.cl b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl
new file mode 100644
index 0000000000..84396e122f
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+
+// Check valid VEC_SIZES
+#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+
+#define DIV_MOD_UINT(x, y, div_res, mod_res) \
+ ({ \
+ div_res = (uint)((x)/(y)); \
+ uint r = div_res * (y); \
+ mod_res = (x)-r; \
+ })
+
+/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * K is equal to num_channels / num_groups.
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
+{
+ uint curr_channel = 0; // channel id of input
+ uint batch_id = 0; // batch id
+ uint group_id = 0; // group id
+ uint channel_id = 0; // channel id within the group
+
+ // Compute curr_channel and batch_id
+ DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
+
+ // Compute group_id and channel_id
+ DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
+
+ const uint x = get_global_id(0) * VEC_SIZE;
+ const uint y = get_global_id(1) * 2;
+ const uint z = channel_id * NUM_GROUPS + group_id;
+
+ // Load the Nx2 block
+ const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+
+ // Store blocks
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+ VSTORE(VEC_SIZE)
+ (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
+ VSTORE(VEC_SIZE)
+ (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
+}
+
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
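The index arithmetic above is the standard channel-shuffle permutation: with NUM_GROUPS groups of K channels each, source channel group_id * K + channel_id lands on destination channel channel_id * NUM_GROUPS + group_id. A host-side sketch of the same mapping (plain C, illustrative only):

    /* Destination channel of a channel shuffle with num_groups groups of k
     * channels each; mirrors z = channel_id * NUM_GROUPS + group_id above. */
    static unsigned int shuffled_channel(unsigned int c, unsigned int num_groups,
                                         unsigned int k)
    {
        const unsigned int group_id   = c / k; /* group the channel belongs to */
        const unsigned int channel_id = c % k; /* position inside that group   */
        return channel_id * num_groups + group_id;
    }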
diff --git a/src/core/CL/cl_kernels/nchw/depth_to_space.cl b/src/core/CL/cl_kernels/nchw/depth_to_space.cl
new file mode 100644
index 0000000000..57183393d2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Depth to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nchw(
+ TENSOR3D_DECLARATION(input),
+ const int batch_id,
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % r;
+
+ const int out_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
+ const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
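The output coordinates above spread r = CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE) output channels over BLOCK_SHAPE x BLOCK_SHAPE spatial blocks. A scalar sketch of the same mapping, mirroring the kernel's index math (illustrative C, not part of the patch):

    /* Map an input element (x, y, z_in) of one batch to its depth-to-space
     * output coordinates, following the kernel above. */
    static void depth_to_space_coords(int x, int y, int z_in, int block,
                                      int channel_size,
                                      int *out_x, int *out_y, int *out_z)
    {
        const int r = channel_size / (block * block);
        *out_z = z_in % r;                       /* output channel          */
        *out_x = x * block + (z_in / r) % block; /* column inside the block */
        *out_y = y * block + (z_in / r) / block; /* row inside the block    */
    }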
diff --git a/src/core/CL/cl_kernels/nchw/dequantization_layer.cl b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl
new file mode 100644
index 0000000000..e0203f7408
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+/** This kernel performs per-channel dequantization of 8-bit signed integers to floating point. (NCHW)
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale Pointer to buffer with the per channel quantized scales
+ */
+__kernel void dequantization_layer_per_channel_nchw(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ __global float *scale)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+ // Load data
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+ // Create scale vectors
+ const VEC_DATA_TYPE(float, VEC_SIZE)
+ vscale = scale[get_global_id(2)];
+
+ // Dequantize
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else // !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]);
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
\ No newline at end of file
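Both branches above evaluate the same per-channel dequantization, result = scale[channel] * (float)q; the LAST_ACCESSED_X path merely vectorizes it and shifts the access window so the last vector load stays in bounds. A scalar reference (illustrative C):

    /* Per-channel dequantization of a QSYMM8_PER_CHANNEL value: one float
     * scale per channel and no zero-point term, as the scheme is symmetric. */
    static float dequantize_qsymm8_per_channel(signed char q, const float *scale,
                                               int channel)
    {
        return scale[channel] * (float)q;
    }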
diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution.cl b/src/core/CL/cl_kernels/nchw/direct_convolution.cl
new file mode 100644
index 0000000000..866f62da95
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution.cl
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "helpers_asymm.h"
+
+/** This kernel performs a direct convolution to convolve the lowest three dimensions.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed at compile time.
+ * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
+ * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
+ * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. -DINPUT_OFFSET=3
+ * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ const int id0 = get_global_id(0);
+ const int id1 = get_global_id(1);
+ const int id2 = get_global_id(2);
+
+ const int x_coords = (id0 * STRIDE_X) - PAD_LEFT;
+ const int y_coords = (id1 * STRIDE_Y) - PAD_TOP;
+
+ const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
+
+ __global uchar *src_addr = (__global uchar *)(src_ptr + src_offset_first_element_in_bytes);
+ __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + id2 * weights_stride_w);
+ __global uchar *dst_addr = (__global uchar *)dst_ptr + dst_offset_first_element_in_bytes + x_offs + id1 * dst_stride_y + id2 * dst_stride_z;
+
+#ifdef IS_QUANTIZED
+ int acc_value = 0;
+#else /* IS_QUANTIZED */
+ DATA_TYPE acc_value = 0;
+#endif /* IS_QUANTIZED */
+ for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+ {
+ for(int y = 0; y < WEI_HEIGHT; ++y)
+ {
+ for(int x = 0; x < WEI_WIDTH; ++x)
+ {
+ const int idx_x = (x_coords + x);
+ const int idx_y = (y_coords + y);
+ if((idx_x >= 0 && idx_x < SRC_WIDTH) && (idx_y >= 0 && idx_y < SRC_HEIGHT))
+ {
+ const int weight_offset = x + (WEI_HEIGHT * y);
+ const int input_offset = idx_x + SRC_WIDTH * idx_y;
+#ifdef IS_QUANTIZED
+ int weight = convert_int(*((__global DATA_TYPE *)weights_addr + weight_offset));
+ int input = convert_int(*((__global DATA_TYPE *)src_addr + input_offset));
+ acc_value += (input + INPUT_OFFSET) * (weight + WEIGHTS_OFFSET);
+#else /* IS_QUANTIZED */
+ DATA_TYPE weight = *((__global DATA_TYPE *)weights_addr + weight_offset);
+ DATA_TYPE input = *((__global DATA_TYPE *)src_addr + input_offset);
+ acc_value += input * weight;
+#endif /* IS_QUANTIZED */
+ }
+ }
+ }
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#ifdef IS_QUANTIZED
+ int bias = *((__global int *)(vector_offset(&biases, id2)));
+#else /* IS_QUANTIZED */
+ DATA_TYPE bias = *((__global DATA_TYPE *)(vector_offset(&biases, id2)));
+#endif /* IS_QUANTIZED */
+ acc_value += bias;
+
+#endif /* defined(HAS_BIAS) */
+
+#ifdef IS_QUANTIZED
+
+#if OUTPUT_SHIFT < 0
+ acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1);
+#else // OUTPUT_SHIFT < 0
+ acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1);
+#endif // OUTPUT_SHIFT < 0
+ acc_value = acc_value + OUTPUT_OFFSET;
+#endif /* IS_QUANTIZED */
+
+ *(__global DATA_TYPE *)dst_addr = CONVERT_SAT(acc_value, DATA_TYPE);
+}
\ No newline at end of file
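Every -D option listed in the kernel documentation above is an ordinary OpenCL program build option. A hedged host-side sketch of compiling this kernel (the option values are examples only, `ctx`, `dev` and `source` are assumed to be set up elsewhere, and only standard OpenCL API calls are used):

    #include <CL/cl.h>

    /* Build the direct-convolution program with its compile-time configuration. */
    static cl_int build_direct_conv(cl_context ctx, cl_device_id dev,
                                    const char *source, cl_program *prog)
    {
        cl_int err;
        *prog = clCreateProgramWithSource(ctx, 1, &source, NULL, &err);
        if (err != CL_SUCCESS)
            return err;
        const char *opts =
            "-DDATA_TYPE=float -DDATA_SIZE=32 -DSTRIDE_X=1 -DSTRIDE_Y=1 "
            "-DPAD_LEFT=0 -DPAD_TOP=0 -DWEI_WIDTH=3 -DWEI_HEIGHT=3 "
            "-DSRC_WIDTH=224 -DSRC_HEIGHT=224 -DWEIGHTS_DEPTH=64 "
            "-DVEC_SIZE=1 -DVEC_SIZE_LEFTOVER=0 -DHAS_BIAS";
        return clBuildProgram(*prog, 1, &dev, opts, NULL, NULL);
    }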
diff --git a/src/core/CL/cl_kernels/nchw/im2col.cl b/src/core/CL/cl_kernels/nchw/im2col.cl
new file mode 100644
index 0000000000..fddf918c63
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/im2col.cl
@@ -0,0 +1,863 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not support"
+#endif // ELEMENT_SIZE
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+/** This OpenCL kernel performs im2col when the kernel size is 1x1, stride_x is 1 and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed to append a 1 to each row of the final matrix.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col1x1_stridex1_nchw(
+ TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+ TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
+ IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor
+ const uint yc = get_global_id(1); // y coordinate in the convolved tensor
+ const uint ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const uint batch = get_global_id(2) / SRC_DEPTH; // batch size
+
+ // Clamp xc
+ // The strategy clamps to "xc" itself, as it is guaranteed to be a valid value
+ uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3);
+
+ // Check which values are valid
+ const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
+
+ xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0));
+
+ // Calculate input indices
+ const uint xi = xc;
+ const uint yi = yc * STRIDE_Y;
+
+ // Calculate output indices
+
+#if defined(NUM_GROUPS)
+ const uint xo = ch % (SRC_DEPTH / NUM_GROUPS);
+ const uint zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+ const uint xo = ch;
+#endif // defined(NUM_GROUPS)
+ const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w;
+#else // defined(NUM_GROUPS)
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ data = vload4(0, (__global DATA_TYPE *)input_ptr);
+
+ // If out-of-bounds, overwrite with the first element
+ data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);
+
+ *(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;
+ *(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;
+ *(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;
+ *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+ if(xo == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
+ if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+ {
+ *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
+ *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
+ *((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;
+ *((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+#if defined(DILATION_X) && defined(DILATION_Y)
+/** This OpenCL kernel performs a generic im2col implementation when the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed to append a 1 to each row of the final matrix.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_nchw(
+ TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+ TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
+ IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
+
+ // Calculate input indices
+ const int xi = xc * STRIDE_X - PAD_LEFT;
+ const int yi = yc * STRIDE_Y - PAD_TOP;
+
+ // Calculate output indices
+#if defined(NUM_GROUPS)
+ const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
+ const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+ const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+#endif // defined(NUM_GROUPS)
+ const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+ __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
+#else // defined(NUM_GROUPS)
+ __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+#endif // defined(NUM_GROUPS)
+
+ // Linearize convolution elements
+ for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+ {
+ int y = yi + yk * DILATION_Y;
+ for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
+ {
+ int x = xi + xk * DILATION_X;
+#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+ *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+ if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
+ {
+ *output_ptr = PAD_VALUE;
+ }
+ else
+ {
+ *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+ }
+#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+ }
+ }
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+ if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
+ if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+ {
+ *output_ptr = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(DILATION_X) && defined(DILATION_Y)
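+// Note: the generic kernel above is the textbook im2col linearization, one
+// work-item per (xc, yc, ch) cell of the output matrix. A single-threaded
+// reference for one cell and one w x h input plane, mirroring the loop
+// structure (illustrative C sketch, not part of the kernel):
+//
+//     const int xi = xc * stride_x - pad_left;
+//     const int yi = yc * stride_y - pad_top;
+//     for(int yk = 0; yk < kh; ++yk)
+//     {
+//         const int y = yi + yk * dil_y;
+//         for(int xk = 0; xk < kw; ++xk, ++out)
+//         {
+//             const int x = xi + xk * dil_x;
+//             *out = (x < 0 || x >= w || y < 0 || y >= h) ? pad_value : src[y * w + x];
+//         }
+//     }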
+
+/** This OpenCL kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed to append a 1 to each row of the final matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col3x3_nchw(
+ TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+ TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
+ IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
+
+ // Calculate input indices
+ const int xi = xc * STRIDE_X - PAD_LEFT;
+ const int yi = yc * STRIDE_Y - PAD_TOP;
+
+ // Calculate output indices
+#if defined(NUM_GROUPS)
+ const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 9; // 3x3
+ const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+ const int xo = ch * 9; // 3x3
+#endif // defined(NUM_GROUPS)
+ const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
+#else // defined(NUM_GROUPS)
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+ // Put PAD_VALUE if the value is out-of-bounds
+ int3 x = (int3)xi + (int3)(0, 1, 2);
+ int3 y = (int3)yi + (int3)(0, 1, 2);
+
+ VEC_DATA_TYPE(COND_DATA_TYPE, 3)
+ cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+ VEC_DATA_TYPE(COND_DATA_TYPE, 3)
+ cond1 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+ VEC_DATA_TYPE(COND_DATA_TYPE, 3)
+ cond2 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+
+ row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);
+ row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1);
+ row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);
+ *((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+ if((xo / 9) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
+ if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+ {
+ *((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+
+/** This OpenCL kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed to append a 1 to each row of the final matrix.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col5x5_nchw(
+ TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+ TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
+ IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
+
+ // Calculate input indices
+ const int xi = xc * STRIDE_X - PAD_LEFT;
+ const int yi = yc * STRIDE_Y - PAD_TOP;
+
+ // Calculate output indices
+#if defined(NUM_GROUPS)
+ const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 25; // 5x5
+ const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+ const int xo = ch * 25; // 5x5
+#endif // defined(NUM_GROUPS)
+ const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+ // Put PAD_VALUE if the value is out-of-bounds
+ int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);
+ int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);
+ int x1 = xi + 4;
+ int y1 = yi + 4;
+
+ // Check if we could have out-of-bounds elements in the x and y directions
+ VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+ x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
+ VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+ y0_condition = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
+ COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);
+ COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
+#else // defined(NUM_GROUPS)
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
+ DATA_TYPE
+ row01 = *((__global DATA_TYPE *)input_ptr + 4);
+
+ input_ptr += src_stride_y;
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
+ DATA_TYPE
+ row11 = *((__global DATA_TYPE *)input_ptr + 4);
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+ VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+ cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;
+ VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+ cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;
+ COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);
+ COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);
+
+ // Replace with 0 if the value is not valid
+ row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
+ row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
+ row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
+ row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
+ row10.s012),
+ 0, (__global DATA_TYPE *)output_ptr);
+ vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+ input_ptr += src_stride_y;
+ output_ptr += 10 * dst_stride_x;
+ }
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
+ DATA_TYPE
+ row01 = *((__global DATA_TYPE *)input_ptr + 4);
+
+ input_ptr += src_stride_y;
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
+ DATA_TYPE
+ row11 = *((__global DATA_TYPE *)input_ptr + 4);
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+ VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+ cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;
+ VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+ cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;
+ COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);
+ COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);
+
+ // Replace with 0 if the value is not valid
+ row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
+ row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
+ row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
+ row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
+ row10.s012),
+ 0, (__global DATA_TYPE *)output_ptr);
+ vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+ input_ptr += src_stride_y;
+ output_ptr += 10 * dst_stride_x;
+ }
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
+ DATA_TYPE
+ row01 = *((__global DATA_TYPE *)input_ptr + 4);
+
+ input_ptr += src_stride_y;
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+ VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+ cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;
+ COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);
+
+ // Replace with 0 if the value is not valid
+ row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
+ row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+ vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);
+ *((__global DATA_TYPE *)output_ptr + 4) = row01;
+
+ output_ptr += 5 * dst_stride_x;
+ }
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+ if((xo / 25) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
+ if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+ {
+ *((__global DATA_TYPE *)output_ptr) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+/** This OpenCL kernel performs im2col when the kernel size is 11x11, there is no padding and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed to append a 1 to each row of the final matrix.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col11x11_padx0_pady0_nchw(
+ TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+ TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
+ IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
+
+ // Calculate input indices
+ const int xi = xc * STRIDE_X;
+ const int yi = yc * STRIDE_Y;
+
+ // Calculate output indices
+#if defined(NUM_GROUPS)
+ const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11
+ const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+ const int xo = ch * 121; // 11x11
+#endif // defined(NUM_GROUPS)
+ const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
+#else // defined(NUM_GROUPS)
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+ vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+ input_ptr += src_stride_y;
+ output_ptr += 11 * src_stride_x;
+ }
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+ vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+ input_ptr += src_stride_y;
+ output_ptr += 11 * src_stride_x;
+ }
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+ vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+ input_ptr += src_stride_y;
+ output_ptr += 11 * src_stride_x;
+ }
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+ vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+ input_ptr += src_stride_y;
+ output_ptr += 11 * src_stride_x;
+ }
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+ vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+ input_ptr += src_stride_y;
+ output_ptr += 11 * src_stride_x;
+ }
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+ vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+ input_ptr += src_stride_y;
+ output_ptr += 11 * src_stride_x;
+ }
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+ vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+ input_ptr += src_stride_y;
+ output_ptr += 11 * src_stride_x;
+ }
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+ vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+ input_ptr += src_stride_y;
+ output_ptr += 11 * src_stride_x;
+ }
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+ vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+ input_ptr += src_stride_y;
+ output_ptr += 11 * src_stride_x;
+ }
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+ vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+ input_ptr += src_stride_y;
+ output_ptr += 11 * src_stride_x;
+ }
+
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+ vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+ output_ptr += 11 * src_stride_x;
+ }
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+ if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
+ if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+ {
+ *((__global DATA_TYPE *)output_ptr) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
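+
+/* Host-side sketch (illustrative, assumed host code rather than part of this
+ * file): the kernels above are specialized entirely through build options, so
+ * a host program could assemble them along these lines; the variable names
+ * below are placeholders.
+ *
+ *   char opts[512];
+ *   snprintf(opts, sizeof(opts),
+ *            "-DDATA_TYPE=float -DELEMENT_SIZE=4 -DSRC_DEPTH=%d "
+ *            "-DSTRIDE_X=%d -DSTRIDE_Y=%d -DCONVOLVED_WIDTH=%d",
+ *            src_depth, stride_x, stride_y, convolved_width);
+ *   clBuildProgram(program, 1, &device, opts, NULL, NULL);
+ */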
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+/** This OpenCL kernel performs im2col when the kernel size is greater than 1x1, there is no padding and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
+ * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note If biases are added to the convolution, -DHAS_BIAS must be passed so that a 1 is appended to each row of the output matrix.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_padx0_pady0_nchw(
+ TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+ TENSOR3D_DECLARATION(dst),
+#else // defined(NUM_GROUPS)
+ IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
+
+ // Calculate input indices
+ const int xi = xc * STRIDE_X;
+ const int yi = yc * STRIDE_Y;
+
+ // Calculate output indices
+#if defined(NUM_GROUPS)
+ const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
+ const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else // defined(NUM_GROUPS)
+ const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+#endif // defined(NUM_GROUPS)
+ const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+ __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
+#else // defined(NUM_GROUPS)
+ __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+#endif // defined(NUM_GROUPS)
+
+ // Linearize convolution elements
+ for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
+ {
+ int last_x = 0;
+ for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+ VSTORE(VECTOR_SIZE)
+ (row, 0, output_ptr);
+ last_x = x;
+ }
+ // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).
+    // Note that output_ptr has already been advanced by VECTOR_SIZE on every iteration of the loop above, so it points at the remainder; last_x + VECTOR_SIZE gives the matching input offset.
+#if WIDTH_MOD_VECTOR_SIZE == 1
+ *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
+#elif WIDTH_MOD_VECTOR_SIZE > 1
+ VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
+ row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
+ VSTORE(WIDTH_MOD_VECTOR_SIZE)
+ (row, 0, output_ptr);
+#endif /* WIDTH_MOD_VECTOR_SIZE */
+ output_ptr += WIDTH_MOD_VECTOR_SIZE;
+ } /* End of loop over KERNEL_HEIGHT */
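+
+    // Worked example (illustrative): with KERNEL_WIDTH=5 and VECTOR_SIZE=4,
+    // WIDTH_MOD_VECTOR_SIZE is 1, so each row performs one vload4/vstore4 for
+    // x = xi..xi+3 followed by a single-element copy at x = last_x + VECTOR_SIZE.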
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+ if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
+ if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+ {
+ *output_ptr = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/normalization_layer.cl b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
new file mode 100644
index 0000000000..deada49db5
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define MUL_OP(x, y) ((x) * (y))
+#define ADD_OP(x, y) ((x) + (y))
+#define DIV_OP(x, y) ((x) / (y))
+#define POW_OP(x, y) pow((x), (y))
+#define SQCVT_SAT(a) (a)
+
+#if defined(NUM_SLICES)
+/** Apply cross-map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=half
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
+
+ const int current_slice = get_global_id(2);
+ const int left_slice = max(-(int)RADIUS, -current_slice);
+ const int right_slice = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice);
+
+ for(int i = left_slice; i <= right_slice; i++)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i));
+ acc = ADD_OP(acc, MUL_OP(values, values));
+ }
+
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
+
+ VSTORE(VEC_SIZE)
+ (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(NUM_SLICES) */
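+
+/* For reference, the kernel above implements cross-map local response
+ * normalization:
+ *
+ *   out(x, y, z) = in(x, y, z) / (KAPPA + COEFF * sum_i in(x, y, z + i)^2)^BETA
+ *
+ * where i ranges over [-RADIUS, RADIUS] clamped to the valid slices. The
+ * in-map kernel below applies the same formula, summing over neighbouring
+ * x positions (and rows, when IN_MAP_2D is defined) instead of slices.
+ */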
+
+#if defined(WIDTH_SIZE)
+/** Apply in-map normalization when tensors are in the NCHW data layout format.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=half
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
+ * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, i.e. x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = 0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = SQCVT_SAT(KAPPA);
+
+ const int left_pos = -(int)RADIUS;
+ const int right_pos = (int)RADIUS;
+
+#if defined(IN_MAP_2D)
+ const int current_row = get_global_id(1);
+ const int first_row = max(-(int)RADIUS, -current_row);
+ const int last_row = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row);
+#endif /* defined(IN_MAP_2D) */
+
+#if defined(IN_MAP_2D)
+ for(int j = first_row; j <= last_row; ++j)
+ {
+#endif /* defined(IN_MAP_2D) */
+ for(int i = left_pos; i <= right_pos; ++i)
+ {
+#if defined(IN_MAP_2D)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0));
+#else /* defined(IN_MAP_2D) */
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0));
+#endif /* defined(IN_MAP_2D) */
+ acc = ADD_OP(acc, MUL_OP(values, values));
+ }
+#if defined(IN_MAP_2D)
+ }
+#endif /* defined(IN_MAP_2D) */
+
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
+
+ VSTORE(VEC_SIZE)
+ (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(WIDTH_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl
new file mode 100644
index 0000000000..23a0de76f7
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector std = CONVERT_TO_VECTOR_STRUCT(std);
+
+ const uint current_slice = get_global_id(2) % NUM_CHANNELS;
+
+ const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)));
+ const DATA_TYPE curr_std = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)));
+
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+ TYPE res = (data - curr_mean) / curr_std;
+
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl
new file mode 100644
index 0000000000..0f02ef6184
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define OFFSET_FLT ((float)OFFSET)
+#define SCALE_FLT ((float)SCALE)
+
+#if defined(NUM_CHANNELS)
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=uchar
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
+ * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector std = CONVERT_TO_VECTOR_STRUCT(std);
+
+ const uint current_slice = get_global_id(2) % NUM_CHANNELS;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ curr_mean_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))));
+ curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ curr_std_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))));
+ curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), VEC_DATA_TYPE(float, VEC_SIZE));
+ data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT;
+
+ // Perform normalization
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
+
+ const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
+ VSTORE(VEC_SIZE)
+ (res_u8, 0, (__global DATA_TYPE *)dst.ptr);
+}
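+
+/* Worked example with hypothetical quantization parameters OFFSET=128 and
+ * SCALE=0.5: an input value q=140 dequantizes to (140 - 128) * 0.5 = 6.0;
+ * with mean q=132 (-> 2.0) and std q=136 (-> 4.0), the normalized value is
+ * (6.0 - 2.0) / 4.0 = 1.0, which requantizes to round(1.0 / 0.5) + 128 = 130.
+ */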
+
+#endif // defined(NUM_CHANNELS)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer.cl b/src/core/CL/cl_kernels/nchw/pooling_layer.cl
new file mode 100644
index 0000000000..15ad116289
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/pooling_layer.cl
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#if defined(QUANTIZED)
+#define POOL_OP(x, y) (max((x), (y)))
+#else // defined(QUANTIZED)
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif // defined(QUANTIZED)
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(FP_MIXED_PRECISION) || defined(QUANTIZED)
+#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
+#else /* defined(FP_MIXED_PRECISION) || defined(QUANTIZED)*/
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
+#endif /* defined(FP_MIXED_PRECISION) || defined(QUANTIZED)*/
+
+ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = get_global_id(0) * stride_x - pad_x;
+ int start_y = get_global_id(1) * stride_y - pad_y;
+ const int end_x = min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+ start_x = max(0, start_x);
+ start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+ return ((end_y - start_y) * (end_x - start_x));
+}
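+
+// Worked example (illustrative): for a 3x3 average pool with stride 2 and
+// pad 1, the work-item at output (0, 0) gets start_x = start_y = -1 and
+// end_x = end_y = 2. With EXCLUDE_PADDING the start is clamped to 0, so the
+// scale is 2 * 2 = 4 valid elements rather than the full 3 * 3 = 9.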
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+/** Performs a pooling function of pool size equal to N (NCHW)
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32/QASYMM8;
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32/QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_MxN_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ int id0 = get_global_id(0);
+ int id1 = get_global_id(1);
+ int id2 = get_global_id(2);
+
+ int x_coords = (id0 * STRIDE_X) - PAD_X;
+ int y_coords = (id1 * STRIDE_Y) - PAD_Y;
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + y_coords * (int)src_stride_y + id2 * src_stride_z;
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+ vdata = INITIAL_VALUE;
+ ACC_DATA_TYPE sdata = INITIAL_VALUE;
+
+ const int end_x = min((int)POOL_SIZE_X, (int)(SRC_WIDTH - x_coords));
+ const int end_y = min((int)POOL_SIZE_Y, (int)(SRC_HEIGHT - y_coords));
+
+ // Load data
+ for(int y = 0; y < end_y; ++y)
+ {
+ if((y_coords + y) >= 0)
+ {
+ int x = 0;
+ for(; x <= (end_x - 8); x += 8)
+ {
+ int8 src_x = (int8)(x_coords + x) + VEC_OFFS(int, 8);
+#if defined(POOL_AVG) || defined(POOL_L2)
+ SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+ cond_x = CONVERT(src_x < 0, SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, 8));
+ src_x = clamp(src_x, (int8)0, (int8)(SRC_WIDTH - 1));
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+ data0 = select(VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)(src_addr + src_x.s0 * sizeof(DATA_TYPE) + y * src_stride_y)), (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))0, REVERSE(cond_x, 8));
+#else // defined(POOL_AVG) || defined(POOL_L2)
+ src_x = clamp(src_x, 0, SRC_WIDTH - 1);
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+ data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)(src_addr + src_x.s0 * sizeof(DATA_TYPE) + y * src_stride_y));
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif /* defined(POOL_L2) */
+
+ vdata = POOL_OP(vdata, data0);
+ }
+
+ // Leftover
+ for(; x < end_x; ++x)
+ {
+ int src_x = x_coords + x;
+#if defined(POOL_AVG) || defined(POOL_L2)
+ SELECT_DATA_TYPE(ACC_DATA_TYPE)
+ cond_x = (src_x < 0);
+ src_x = clamp(src_x, 0, SRC_WIDTH - 1);
+ ACC_DATA_TYPE data0 = select((ACC_DATA_TYPE)(*((__global DATA_TYPE *)(src_addr + src_x * sizeof(DATA_TYPE) + y * src_stride_y))), (ACC_DATA_TYPE)0, cond_x);
+#else // defined(POOL_AVG) || defined(POOL_L2)
+ src_x = clamp(src_x, 0, SRC_WIDTH - 1);
+ ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)(src_addr + src_x * sizeof(DATA_TYPE) + y * src_stride_y)));
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif /* defined(POOL_L2) */
+
+ sdata = POOL_OP(sdata, data0);
+ }
+ }
+ }
+
+ // Reduce result
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+ reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+ reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+ ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
+ res = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ // Divide by pool region in case of average pooling
+ res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(QUANTIZED)
+
+ DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+
+ const float result_f32 = convert_float(result_q8);
+ const float input_offset = (float)OFFSET_IN1;
+ const float input_scale = (float)SCALE_IN1;
+ const float scale_out = (float)SCALE_OUT;
+ const float offset_out = (float)OFFSET_OUT;
+ const float in_f32 = (result_f32 - input_offset) * input_scale;
+ const float out_f32 = in_f32 / scale_out + offset_out;
+ result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);
+
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+ *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 * sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = result_q8;
+
+#else // defined(QUANTIZED)
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 * sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = (DATA_TYPE)res;
+#endif // defined(QUANTIZED)
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16/F32
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
+ * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nchw_indices(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(indices))
+{
+ int id0 = get_global_id(0);
+ int id1 = get_global_id(1);
+ int id2 = get_global_id(2);
+
+ int2 x_coords = clamp((int2)((id0 * STRIDE_X) - PAD_X), (int2)0, (int2)(SRC_WIDTH - 1));
+ int2 y_coords = clamp((int2)((id1 * STRIDE_Y) - PAD_Y) + VEC_OFFS(int, 2), (int2)0, (int2)(SRC_HEIGHT - 1));
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + id2 * src_stride_z;
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = VLOAD(2)(0, (__global DATA_TYPE *)(src_addr + x_coords.s0 * sizeof(DATA_TYPE) + y_coords.s0 * (int)src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = VLOAD(2)(0, (__global DATA_TYPE *)(src_addr + x_coords.s1 * sizeof(DATA_TYPE) + y_coords.s1 * (int)src_stride_y));
+
+ // Perform calculations
+ DATA_TYPE data0_max = POOL_OP(data0.s0, data0.s1);
+ DATA_TYPE data1_max = POOL_OP(data1.s0, data1.s1);
+ DATA_TYPE res = POOL_OP(data0_max, data1_max);
+ // Store result
+ *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 * sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = res;
+
+#if defined(SRC_BATCH)
+
+ uint offset_top = (x_coords.s0 + y_coords.s0 * SRC_WIDTH + id2 * (SRC_WIDTH * SRC_HEIGHT)) % SRC_BATCH;
+ uint offset_bottom = offset_top + SRC_WIDTH;
+
+ uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
+ uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
+ uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
+
+ *(__global uint *)(indices_ptr + indices_offset_first_element_in_bytes + id0 * sizeof(uint) + id1 * indices_stride_y + id2 * indices_stride_z) = index;
+
+#endif // defined(SRC_BATCH)
+} \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/prior_box_layer.cl b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl
new file mode 100644
index 0000000000..7524ba7b4a
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3)
+
+/** Compute prior boxes and clip (NCHW)
+ *
+ * @param[out] out Pointer to the destination image structure. Supported data types: F32
+ * @param[in] idx Index to write to
+ * @param[in] center_x Center value of the x axis
+ * @param[in] center_y Center value of the y axis
+ * @param[in] box_width Prior box width
+ * @param[in] box_height Prior box height
+ *
+ */
+inline void calculate_xy_min_max_nchw(Image *out, int idx, float center_x, float center_y, float box_width, float box_height)
+{
+ float xmin = (center_x - box_width / 2.f) / WIDTH;
+ float ymin = (center_y - box_height / 2.f) / HEIGHT;
+ float xmax = (center_x + box_width / 2.f) / WIDTH;
+ float ymax = (center_y + box_height / 2.f) / HEIGHT;
+
+#if defined(CLIP)
+ xmin = clamp(xmin, 0.f, 1.f);
+ ymin = clamp(ymin, 0.f, 1.f);
+ xmax = clamp(xmax, 0.f, 1.f);
+ ymax = clamp(ymax, 0.f, 1.f);
+#endif // defined(CLIP)
+
+ // Store result
+ vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(xmin, ymin, xmax, ymax), 0, ((__global DATA_TYPE *)offset(out, idx + 0, 0)));
+}
+
+/** Compute prior boxes (NCHW)
+ *
+ * @param[out] out Pointer to the destination image structure. Supported data types: F32
+ * @param[in] max The maximum values
+ * @param[in] aspect_ratios The aspect ratio values
+ * @param[in] max_size The maximum values size
+ * @param[in] aspect_ratios_size The aspect ratio values size
+ * @param[in] min_size Prior box min size
+ * @param[in] min_idx Index of the min vector
+ * @param[in] idx Index to write to
+ *
+ * @return The updated index
+ */
+inline int calculate_min_nchw(Image *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx)
+{
+ const float center_x = ((float)(get_global_id(0) % LAYER_WIDTH) + OFFSET) * STEP_X;
+ const float center_y = ((float)(get_global_id(0) / LAYER_WIDTH) + OFFSET) * STEP_Y;
+
+ float box_width = min_size;
+ float box_height = min_size;
+ calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
+ idx += 4;
+
+ if(max_size > 0)
+ {
+ box_width = sqrt(min_size * max[min_idx]);
+ box_height = box_width;
+ calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
+ idx += 4;
+ }
+ for(unsigned int i = 0; i < aspect_ratios_size; ++i)
+ {
+ if(fabs(aspect_ratios[i] - 1.f) < 1e-6f)
+ {
+ continue;
+ }
+ box_width = min_size * sqrt(aspect_ratios[i]);
+ box_height = min_size * rsqrt(aspect_ratios[i]);
+
+ calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
+ idx += 4;
+ }
+
+ return idx;
+}
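+
+// Worked example (illustrative): with min_size = 30 and an aspect ratio of 2,
+// the box is 30 * sqrt(2) ~= 42.43 wide and 30 / sqrt(2) ~= 21.21 high, so the
+// box area stays min_size^2 while the requested aspect ratio is applied.
+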
+/** Calculate prior boxes with NCHW format.
+ *
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] min The minimum values
+ * @param[in] max The maximum values
+ * @param[in] aspect_ratios The aspect ratio values
+ * @param[in] min_size The minimum values size
+ * @param[in] max_size The maximum values size
+ * @param[in] aspect_ratios_size The aspect ratio values size
+ */
+__kernel void prior_box_layer_nchw(IMAGE_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size,
+ unsigned int aspect_ratios_size)
+{
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ int idx = 0;
+ for(unsigned int i = 0; i < min_size; ++i)
+ {
+ idx = calculate_min_nchw(&out, max, aspect_ratios, max_size, aspect_ratios_size, min[i], i, idx);
+ }
+
+ // Store variances
+ for(int i = 0; i < (NUM_PRIORS * 4); i += 4)
+ {
+ vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(VARIANCE_0, VARIANCE_1, VARIANCE_2, VARIANCE_3), 0, ((__global DATA_TYPE *)offset(&out, i, 1)));
+ }
+}
+#endif /* defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) */
diff --git a/src/core/CL/cl_kernels/nchw/reorg_layer.cl b/src/core/CL/cl_kernels/nchw/reorg_layer.cl
new file mode 100644
index 0000000000..f66b17c1a6
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/reorg_layer.cl
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
+
+#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi) \
+ ({ \
+ int offset = zo / (int)SRC_DEPTH; \
+ xi = xo * (int)STRIDE + offset % (int)STRIDE; \
+ yi = yo * (int)STRIDE + offset / (int)STRIDE; \
+ zi = zo % SRC_DEPTH; \
+ })
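+
+// Worked example (illustrative): with SRC_DEPTH=4 and STRIDE=2, the output
+// element (xo, yo, zo) = (1, 1, 9) gives offset = 9 / 4 = 2, so it reads the
+// input element (xi, yi, zi) = (1 * 2 + 2 % 2, 1 * 2 + 2 / 2, 9 % 4) = (2, 3, 1).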
+
+/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
+ * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reorg_layer_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ int xo = get_global_id(0);
+ int yo = get_global_id(1);
+ int zo = get_global_id(2);
+ int xi, yi, zi;
+
+ CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
+
+ int src_offset = xi * sizeof(DATA_TYPE) + yi * src_stride_y + zi * src_stride_z;
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
+}
+#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/scale.cl b/src/core/CL/cl_kernels/nchw/scale.cl
new file mode 100644
index 0000000000..2b4d6be9fb
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/scale.cl
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_nearest(const float2 coord, const float2 scale)
+{
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ const float4 new_x = in_x_coords * (float4)(scale.s0);
+ const float4 new_y = (float4)(coord.s1 * scale.s1);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
+ const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear(const float2 coord, const float2 scale)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ const float4 new_x = in_x_coords * (float4)(scale.s0);
+ const float4 new_y = (float4)(coord.s1 * scale.s1);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+ const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+ const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Performs an affine transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8 or S16.
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void scale_nearest_neighbour_nchw(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out))
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+
+ float8 transformed = transform_nearest((float2)(x * VEC_SIZE, y), (float2)(SCALE_X, SCALE_Y));
+#ifdef ALIGN_CORNERS
+ transformed = round(transformed);
+#endif // ALIGN_CORNERS
+
+ TILE(SELECT_DATA_TYPE(DATA_TYPE), 1, 4, cond);
+ cond[0].v = CONVERT(((transformed.even < 0) || (transformed.even >= (int)SRC_WIDTH)) || ((transformed.odd < 0) || (transformed.odd >= (int)SRC_HEIGHT)), SELECT_VEC_DATA_TYPE(DATA_TYPE, 4));
+
+ TILE(int, 1, 4, in_x);
+ TILE(int, 1, 4, in_y);
+ in_x[0].v = convert_int4(clamp(transformed.even, 0.f, SRC_WIDTH - 1.f));
+ in_y[0].v = convert_int4(clamp(transformed.odd, 0.f, SRC_HEIGHT - 1.f));
+
+ TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
+ LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
+ {
+ out_vals[0].s[i] = select(*((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * in_stride_y)), (DATA_TYPE)CONSTANT_VALUE, cond[0].s[i]);
+ })
+
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_step_x + y * out_stride_y;
+
+ if(x == get_global_size(0) - 1)
+ {
+#if VEC_SIZE == 1
+ VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+ (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else // VEC_SIZE == 1
+ VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+ (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+ }
+ else
+ {
+#if VEC_SIZE == 1
+ VSTORE(VEC_SIZE)
+ (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else // VEC_SIZE == 1
+ VSTORE(VEC_SIZE)
+ (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+ }
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void scale_bilinear_nchw(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out))
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+
+ TILE(float, 1, 8, trans_coords);
+ TILE(float, 1, 8, floor_coords);
+ TILE(int, 1, 16, in_x);
+ TILE(int, 1, 16, in_y);
+
+ trans_coords[0].v = transform_bilinear((float2)(x * VEC_SIZE, y), (float2)(SCALE_X, SCALE_Y));
+ floor_coords[0].v = floor(trans_coords[0].v);
+
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, 4,
+ {
+ in_x[0].s[i * 4 + j] = floor_coords[0].s[i * 2 + 0] + (j % 2);
+ in_y[0].s[i * 4 + j] = floor_coords[0].s[i * 2 + 1] + (j > 1);
+ })
+ })
+
+#if defined(BORDER_MODE_CONSTANT)
+ TILE(SELECT_DATA_TYPE(DATA_TYPE), 1, 16, cond);
+ cond[0].v = CONVERT(((in_x[0].v < 0) || (in_x[0].v >= (int)SRC_WIDTH)) || ((in_y[0].v < 0) || (in_y[0].v >= (int)SRC_HEIGHT)), SELECT_VEC_DATA_TYPE(DATA_TYPE, 16));
+#endif // defined(BORDER_MODE_CONSTANT)
+
+ in_x[0].v = clamp(in_x[0].v, 0, (int16)((int)SRC_WIDTH - 1));
+ in_y[0].v = clamp(in_y[0].v, 0, (int16)((int)SRC_HEIGHT - 1));
+
+ TILE(DATA_TYPE, 1, 16, in_vals);
+
+ // Loads the values from the input image
+#if defined(BORDER_MODE_CONSTANT)
+ LOOP_UNROLLING(int, i, 0, 1, 16,
+ {
+ in_vals[0].s[i] = select(*((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * (int)in_stride_y)), (DATA_TYPE)CONSTANT_VALUE, cond[0].s[i]);
+ })
+#else // defined(BORDER_MODE_CONSTANT)
+ LOOP_UNROLLING(int, i, 0, 1, 16,
+ {
+ in_vals[0].s[i] = *((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * (int)in_stride_y));
+ })
+#endif // defined(BORDER_MODE_CONSTANT)
+
+ TILE(float, 1, 8, a);
+ TILE(float, 1, 8, b);
+
+ a[0].v = trans_coords[0].v - floor_coords[0].v;
+ b[0].v = ((float8)(1.f)) - a[0].v;
+
+#if defined(OFFSET) && defined(SCALE)
+ TILE(float, 1, 16, in_vals_f32);
+ TILE(float, 1, 4, out_vals_f32);
+
+ in_vals_f32[0].v = convert_float16(convert_int16(in_vals[0].v) - (int16)OFFSET) * (float16)SCALE;
+
+ // Bilinear interpolation: (in0 * b0 * b1) + (in1 * a0 * b1) + (in2 * b0 * a1) + (in3 * a0 * a1)
+ // (in4 * b2 * b3) + (in5 * a2 * b3) + (in6 * b2 * a3) + (in7 * a2 * a3)
+ // (in8 * b4 * b5) + (in9 * a4 * b5) + (in10 * b4 * a5) + (in11 * a4 * a5)
+ // (in12 * b6 * b7) + (in13 * a6 * b7) + (in14 * b6 * a7) + (in15 * a6 * a7)
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ out_vals_f32[0].s[i] = (in_vals_f32[0].s[i * 4 + 0] * b[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 1] * a[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 2] * b[0].s[i * 2] * a[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 3] * a[0].s[i * 2] * a[0].s[i * 2 + 1]);
+ })
+
+ TILE(DATA_TYPE, 1, 4, out_vals_4);
+ TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
+
+ out_vals_4[0].v = CONVERT_SAT(convert_int4_sat_rtp(out_vals_f32[0].v / (float)SCALE) + OFFSET, VEC_DATA_TYPE(DATA_TYPE, 4));
+
+ LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
+ {
+ out_vals[0].s[i] = out_vals_4[0].s[i];
+ })
+#else // defined(OFFSET) && defined(SCALE)
+
+ TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
+
+ // Bilinear interpolation: (in0 * b0 * b1) + (in1 * a0 * b1) + (in2 * b0 * a1) + (in3 * a0 * a1)
+ // (in4 * b2 * b3) + (in5 * a2 * b3) + (in6 * b2 * a3) + (in7 * a2 * a3)
+ // (in8 * b4 * b5) + (in9 * a4 * b5) + (in10 * b4 * a5) + (in11 * a4 * a5)
+ // (in12 * b6 * b7) + (in13 * a6 * b7) + (in14 * b6 * a7) + (in15 * a6 * a7)
+ LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
+ {
+ out_vals[0].s[i] = (in_vals[0].s[i * 4 + 0] * b[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 1] * a[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 2] * b[0].s[i * 2] * a[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 3] * a[0].s[i * 2] * a[0].s[i * 2 + 1]);
+ })
+#endif // defined(OFFSET) && defined(SCALE)
+
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_step_x + y * out_stride_y;
+
+ if(x == get_global_size(0) - 1)
+ {
+#if VEC_SIZE == 1
+ VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+ (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else // VEC_SIZE == 1
+ VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+ (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+ }
+ else
+ {
+#if VEC_SIZE == 1
+ VSTORE(VEC_SIZE)
+ (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else // VEC_SIZE == 1
+ VSTORE(VEC_SIZE)
+ (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+ }
+}
\ No newline at end of file
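
The two sampling policies differ only in the half-pixel offsets applied before and after scaling. A stand-alone C sketch (values are made up) of the SAMPLING_POLICY_CENTER coordinate math and the neighbour weights used by scale_bilinear_nchw:

    /* Illustrative only: centre-aligned output-to-input mapping and the
     * bilinear blend weights for a single output x coordinate. */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const float scale_x = 2.0f; /* input_width / output_width */
        const int   x_out   = 3;

        const float x_in = (x_out + 0.5f) * scale_x - 0.5f; /* transform_bilinear, CENTER */
        const float x0   = floorf(x_in);                    /* left neighbour */
        const float a    = x_in - x0;                       /* right-neighbour weight */
        const float b    = 1.0f - a;                        /* left-neighbour weight */

        printf("x_in=%.2f neighbours=(%g, %g) weights=(b=%.2f, a=%.2f)\n",
               x_in, x0, x0 + 1, b, a);
        /* In 2D the kernel blends in00*bx*by + in01*ax*by + in10*bx*ay + in11*ax*ay,
         * which is the in0..in3 expression in scale_bilinear_nchw. */
        return 0;
    }
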
diff --git a/src/core/CL/cl_kernels/nchw/space_to_batch.cl b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
new file mode 100644
index 0000000000..91520213e8
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32
+ * @param[in] paddings_stride_x Stride of the paddings tensor in X dimension (in bytes)
+ * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] paddings_stride_y Stride of the paddings tensor in Y dimension (in bytes)
+ * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] paddings_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nchw(
+ TENSOR4D_DECLARATION(input),
+ IMAGE_DECLARATION(paddings),
+ VECTOR_DECLARATION(block_shape),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int pad_left_x = *((__global int *)offset(&pad, 0, 0));
+ const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
+ const int pad_left_y = *((__global int *)offset(&pad, 0, 1));
+ const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
+
+ int block_x = *((__global int *)vector_offset(&block, 0));
+ int block_y = *((__global int *)vector_offset(&block, 1));
+
+ const int out_x = get_global_id(0);
+ const int out_y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+ if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
+ {
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - pad_left_x;
+ const int in_y = pos_y - pad_left_y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
+ }
+}
+
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nchw(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ int block_x = BLOCK_SHAPE_X;
+ int block_y = BLOCK_SHAPE_Y;
+
+ const int out_x = get_global_id(0);
+ const int out_y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+ if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
+ {
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - PAD_LEFT_X;
+ const int in_y = pos_y - PAD_LEFT_Y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
+ }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
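
The batch decomposition both kernels share is easiest to see numerically: output batch b reads from input batch b % BATCH_IN, and b / BATCH_IN selects a cell of the block grid. A throwaway C sketch with made-up sizes:

    /* Illustrative only: the space-to-batch index mapping for 2 input batches
     * and a 2x2 block shape, mirroring the pos_x/pos_y arithmetic above. */
    #include <stdio.h>

    int main(void)
    {
        const int batch_in = 2, block_x = 2, block_y = 2;
        const int out_x = 0, out_y = 0; /* first element of each output batch */

        for (int batch_id = 0; batch_id < batch_in * block_x * block_y; ++batch_id)
        {
            const int cell  = batch_id / batch_in; /* cell of the block grid */
            const int w     = batch_id % batch_in; /* source batch */
            const int pos_x = out_x * block_x + cell % block_x;
            const int pos_y = out_y * block_y + cell / block_x;
            printf("out batch %d <- in batch %d at padded position (%d, %d)\n",
                   batch_id, w, pos_x, pos_y);
        }
        return 0;
    }
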
diff --git a/src/core/CL/cl_kernels/nchw/space_to_depth.cl b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
new file mode 100644
index 0000000000..8097f65942
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Space to depth transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The output tensor channel size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=8
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_depth_nchw(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % r;
+
+ const int in_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
+ const int in_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, batch_id));
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
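
The channel split performed by the kernel can be checked with a few lines of C. Following the arithmetic above, CHANNEL_SIZE is read here as the output channel count; the sizes are illustrative:

    /* Illustrative only: space-to-depth channel/tile decomposition for a 2x2
     * block; r recovers the input channel count. */
    #include <stdio.h>

    #define BLOCK_SHAPE  2
    #define CHANNEL_SIZE 8 /* output channels = input channels * BLOCK_SHAPE^2 */

    int main(void)
    {
        const int r = CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE); /* = 2 input channels */
        for (int z_out = 0; z_out < CHANNEL_SIZE; ++z_out)
        {
            const int z_in = z_out % r; /* source channel */
            const int cell = z_out / r; /* cell inside the block */
            printf("out z=%d <- in z=%d, tile offset (%d, %d)\n",
                   z_out, z_in, cell % BLOCK_SHAPE, cell / BLOCK_SHAPE);
        }
        return 0;
    }
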
diff --git a/src/core/CL/cl_kernels/nchw/upsample_layer.cl b/src/core/CL/cl_kernels/nchw/upsample_layer.cl
new file mode 100644
index 0000000000..723c491165
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/upsample_layer.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function applies upsample on an input image. (NCHW)
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: All
+ * -# -DVEC_SIZE_IN = Input vector size
+ * -# -DVEC_SIZE_OUT = Output vector size
+ * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
+ * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: All
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN);
+ const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+ src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
+ dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);
+
+ vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
+ vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
+#else // !(defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT))
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+}
\ No newline at end of file
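
The fast path loads eight elements, stores each twice horizontally and writes the resulting row to two consecutive output rows, i.e. a plain 2x nearest-neighbour upsample. A scalar C equivalent (sizes are illustrative):

    /* Illustrative only: scalar equivalent of upsample_layer_nchw, duplicating
     * every input element into a 2x2 block of the output. */
    #include <stdio.h>

    int main(void)
    {
        enum { W = 4, H = 2 };
        const float in[H][W] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 } };
        float out[2 * H][2 * W];

        for (int y = 0; y < H; ++y)
            for (int x = 0; x < W; ++x)
                for (int dy = 0; dy < 2; ++dy)
                    for (int dx = 0; dx < 2; ++dx)
                        out[2 * y + dy][2 * x + dx] = in[y][x];

        for (int y = 0; y < 2 * H; ++y, puts(""))
            for (int x = 0; x < 2 * W; ++x)
                printf("%g ", out[y][x]);
        return 0;
    }
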
diff --git a/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl
new file mode 100644
index 0000000000..85eff9e6d9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(SRC_DIM_Z)
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
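+    // The rows below compute U = G . w . G^T, i.e. the F(2x2, 3x3) Winograd
+    // filter transform, whose transform matrix is
+    //     G = [ 1.0,  0.0, 0.0 ]
+    //         [ 0.5,  0.5, 0.5 ]
+    //         [ 0.5, -0.5, 0.5 ]
+    //         [ 0.0,  0.0, 1.0 ]
+    // For the 3x1/1x3 variants only the 1D transform along the filter's
+    // single dimension is applied.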
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0 = 0.0f;
+ out0.s0 = (w0.s0);
+ out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
+ out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
+ out0.s3 = (w0.s2);
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out1 = 0.0f;
+ out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
+ out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
+ out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
+ out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out2 = 0.0f;
+ out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
+ out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
+ out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
+ out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out3 = 0.0f;
+ out3.s0 = (w2.s0);
+ out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
+ out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
+ out3.s3 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int z = get_global_id(2);
+ int x0 = z / SRC_DIM_Z; // idx filter
+ int y0 = z % SRC_DIM_Z; // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ // 16 channels for 3x3 kernels
+ // 4 channels for 3x1 or 1x3 kernels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
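+    // The rows below compute U = G . w . G^T, i.e. the F(4x4, 3x3) Winograd
+    // filter transform, whose transform matrix is
+    //     G = [  1/4,     0,    0 ]
+    //         [ -1/6,  -1/6, -1/6 ]
+    //         [ -1/6,   1/6, -1/6 ]
+    //         [ 1/24,  1/12,  1/6 ]
+    //         [ 1/24, -1/12,  1/6 ]
+    //         [    0,     0,    1 ]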
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+ out0.s0 = (w0.s0) / 16.f;
+ out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
+ out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
+ out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+ out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+ out0.s5 = (w0.s2) / 4.f;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+ out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
+ out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+ out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+ out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+ out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+ out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+ out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
+ out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+ out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+ out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+ out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+ out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+ out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+ out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+ // Row 4
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+ out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+ out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+ // Row 5
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+ out5.s0 = (w2.s0) / 4.f;
+ out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
+ out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
+ out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+ out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+ out5.s5 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int z = get_global_id(2);
+ int x0 = z / SRC_DIM_Z; // idx filter
+ int y0 = z % SRC_DIM_Z; // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ // 36 channels for 3x3 kernels
+ // 6 channels for 3x1 or 1x3 kernels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x5/5x1/1x5 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_5x5_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+ // Transform the input tile
+
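+    // The rows below compute U = G . w . G^T, i.e. the F(4x4, 5x5) Winograd
+    // filter transform, whose transform matrix is
+    //     G = [      1,       0,      0,       0,       0 ]
+    //         [   -2/9,    -2/9,   -2/9,    -2/9,    -2/9 ]
+    //         [   -2/9,     2/9,   -2/9,     2/9,    -2/9 ]
+    //         [   1/90,    2/90,   4/90,    8/90,   16/90 ]
+    //         [   1/90,   -2/90,   4/90,   -8/90,   16/90 ]
+    //         [ 16/180,   8/180,  4/180,   2/180,   1/180 ]
+    //         [ 16/180,  -8/180,  4/180,  -2/180,   1/180 ]
+    //         [      0,       0,      0,       0,       1 ]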
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+ out0.s0 = w00.s0;
+ out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
+ out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
+ out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
+ out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
+ out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
+ out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
+ out0.s7 = w01;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+ out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
+ out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+ out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+ out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+ out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+ out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+ out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+ out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+ out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
+ out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+ out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+ out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+ out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+ out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+ out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+ out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+ out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+ out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+ out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+ out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+ out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+ out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+ out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+ out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
+
+ // Row 4
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+ out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+ out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+ out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+ out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+ out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+ out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+ out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+ out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
+
+ // Row 5
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+ out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
+ out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+ out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+ out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+ out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+ out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+ out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+ out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
+
+ // Row 6
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out6 = 0.0f;
+ out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
+ out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+ out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+ out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+ out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+ out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+ out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+ out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
+
+ // Row 7
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out7 = 0.0f;
+ out7.s0 = w40.s0;
+ out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
+ out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
+ out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
+ out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
+ out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
+ out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
+ out7.s7 = w41;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int z = get_global_id(2);
+ int x0 = z / SRC_DIM_Z; // idx filter
+ int y0 = z % SRC_DIM_Z; // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+ *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+ *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+ *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+ *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+ *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+ *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+ *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+ *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+ *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+ *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+ *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+ *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+ *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+ *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+ *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+ *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+ *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+ *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+ *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+ *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+#endif // defined(SRC_DIM_Z)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_2x2_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_5x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_5x5_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x2_1x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_2x2_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x5_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_5x5_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl
new file mode 100644
index 0000000000..8c382183c3
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl
@@ -0,0 +1,1346 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \
+ comm_fact.s2 = 2.5f * tmp.s3; \
+ comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
+ comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \
+ comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
+ \
+ out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
+ out.s1 = comm_fact.s0 + comm_fact.s1; \
+ out.s2 = comm_fact.s0 - comm_fact.s1; \
+ out.s3 = comm_fact.s3 + comm_fact.s4; \
+ out.s4 = comm_fact.s4 - comm_fact.s3; \
+ out.s5 = comm_fact.s5 + comm_fact.s6; \
+ out.s6 = comm_fact.s5 - comm_fact.s6; \
+ out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
+ })
+
+#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + tmp.s5; \
+ comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \
+ comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \
+ out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 - 14.0f * tmp.s4 + tmp.s6; \
+ out.s1 = comm_fact.s0 - comm_fact.s1; \
+ out.s2 = comm_fact.s0 + comm_fact.s1; \
+ out.s3 = comm_fact.s2 - comm_fact.s3; \
+ out.s4 = comm_fact.s2 + comm_fact.s3; \
+ out.s5 = comm_fact.s4 - comm_fact.s5; \
+ out.s6 = comm_fact.s4 + comm_fact.s5; \
+ out.s7 = -36.0f * tmp.s1 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
+ })
+
+#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = get_global_id(2) % SRC_DEPTH;
+ const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute input address
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp0 = in_row0;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ tmp0 -= in_row2;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE out00 = tmp0.s0 - tmp0.s2;
+ DATA_TYPE out01 = tmp0.s1 + tmp0.s2;
+ DATA_TYPE out02 = tmp0.s2 - tmp0.s1;
+ DATA_TYPE out03 = tmp0.s1 - tmp0.s3;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp1 = in_row1 + in_row2;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp2 = in_row2 - in_row1;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp3 = in_row1 - in_row3;
+
+ DATA_TYPE out10 = tmp1.s0 - tmp1.s2;
+ DATA_TYPE out11 = tmp1.s1 + tmp1.s2;
+ DATA_TYPE out12 = tmp1.s2 - tmp1.s1;
+ DATA_TYPE out13 = tmp1.s1 - tmp1.s3;
+
+ DATA_TYPE out20 = tmp2.s0 - tmp2.s2;
+ DATA_TYPE out21 = tmp2.s1 + tmp2.s2;
+ DATA_TYPE out22 = tmp2.s2 - tmp2.s1;
+ DATA_TYPE out23 = tmp2.s1 - tmp2.s3;
+
+ DATA_TYPE out30 = tmp3.s0 - tmp3.s2;
+ DATA_TYPE out31 = tmp3.s1 + tmp3.s2;
+ DATA_TYPE out32 = tmp3.s2 - tmp3.s1;
+ DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;
+ *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;
+ *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;
+ *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;
+ *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;
+ *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;
+ *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
+ *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
+ *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
+ *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
+ *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
+ *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is multiple of 2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = (get_global_id(2) * 2) % SRC_DEPTH;
+ const int b = (get_global_id(2) * 2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2) * 2;
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute input address
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ src_addr += src_stride_z;
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp0 = in_row0;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp4 = in_row4;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ tmp0 -= in_row2;
+ tmp4 -= in_row6;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp1 = in_row1 + in_row2;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp2 = in_row2 - in_row1;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp3 = in_row1 - in_row3;
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp5 = in_row5 + in_row6;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp6 = in_row6 - in_row5;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp7 = in_row5 - in_row7;
+
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
+
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
+
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+ vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
+ vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
+ vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
+ vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
+ vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
+ vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
+ vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
+ vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
+ vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
+ vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
+ vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
+ vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
+ vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
+ vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
+ vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = get_global_id(2) % SRC_DEPTH;
+ const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute input address
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ // Row0
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
+#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ // Row0
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE out0 = 0.0f;
+ DATA_TYPE out1 = 0.0f;
+ DATA_TYPE out2 = 0.0f;
+ DATA_TYPE out3 = 0.0f;
+ DATA_TYPE out4 = 0.0f;
+ DATA_TYPE out5 = 0.0f;
+
+ // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
+ out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
+ out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;
+ out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;
+ out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;
+ out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;
+ out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;
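+
+ // The coefficients above are products of entries of the standard Winograd
+ // F(4,3) input-transform matrix
+ //      | 4   0  -5   0   1   0 |
+ //      | 0  -4  -4   1   1   0 |
+ // BT = | 0   4  -4  -1   1   0 |
+ //      | 0  -2  -1   2   1   0 |
+ //      | 0   2  -1  -2   1   0 |
+ //      | 0   4   0  -5   0   1 |
+ // applied along both axes (out = BT * d * B): e.g. 16 = 4 * 4 and -20 = 4 * (-5).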
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ // Row4
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+
+ // k0..k5 hold the 1D input transform of source row 4; they are shared by the output channels of rows 0, 1, 2, 3 and 4
+ DATA_TYPE k0 = d41.s0;
+ DATA_TYPE k1 = d41.s0;
+ DATA_TYPE k2 = d41.s0;
+ DATA_TYPE k3 = d41.s0;
+ DATA_TYPE k4 = d41.s0;
+ DATA_TYPE k5 = 0.0f;
+
+ k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
+ k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
+ k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
+ k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
+ k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
+ k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
+
+ out0 += k0;
+ out1 += k1;
+ out2 += k2;
+ out3 += k3;
+ out4 += k4;
+ out5 += k5;
+
+ // Row2
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+
+ out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
+ out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
+ out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;
+ out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;
+ out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;
+ out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
+#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Compute destination address
+#if defined(SRC_DEPTH)
+ __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
+#else /* defined(SRC_DEPTH) */
+ __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+#endif /* defined(SRC_DEPTH) */
+
+ uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
+
+ *(dst_addr) = out0;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out1;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out2;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out3;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out4;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out5;
+ dst_addr += dst_plane_stride;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ DATA_TYPE out6 = k0;
+ DATA_TYPE out7 = k1;
+ DATA_TYPE out8 = k2;
+ DATA_TYPE out9 = k3;
+ DATA_TYPE out10 = k4;
+ DATA_TYPE out11 = k5;
+ DATA_TYPE out12 = k0;
+ DATA_TYPE out13 = k1;
+ DATA_TYPE out14 = k2;
+ DATA_TYPE out15 = k3;
+ DATA_TYPE out16 = k4;
+ DATA_TYPE out17 = k5;
+ DATA_TYPE out18 = k0;
+ DATA_TYPE out19 = k1;
+ DATA_TYPE out20 = k2;
+ DATA_TYPE out21 = k3;
+ DATA_TYPE out22 = k4;
+ DATA_TYPE out23 = k5;
+ DATA_TYPE out24 = k0;
+ DATA_TYPE out25 = k1;
+ DATA_TYPE out26 = k2;
+ DATA_TYPE out27 = k3;
+ DATA_TYPE out28 = k4;
+ DATA_TYPE out29 = k5;
+
+ // Row1
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+
+ // Row3
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+
+ // Compute common parts for the channels between [6, 29]
+ // Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
+ // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
+ DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
+ DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
+ DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
+ DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
+ DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
+ DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
+ DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
+ DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
+ DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
+ DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
+ DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
+ DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
+
+ // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
+ // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
+ DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
+ DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
+ DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
+ DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
+ DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
+ DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
+ DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0
+ DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
+ DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
+ DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)
+ DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
+ DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
+
+ out6 += part0 - part1;
+ out12 += part0 + part1;
+ out7 += part2 + part3 + part4 + part5;
+ out8 += part2 - part3 + part4 - part5;
+ out13 += part2 + part3 - part4 - part5;
+ out14 += part2 - part3 - part4 + part5;
+ out9 += part6 + part7 + part8 + part9;
+ out10 += part6 - part7 + part8 - part9;
+ out15 += part6 - part7 - part8 + part9;
+ out16 += part6 + part7 - part8 - part9;
+ out11 += part10 + part11;
+ out17 += part10 - part11;
+
+ out18 += part13 - part12;
+ out24 += part13 + part12;
+ out19 += part14 + part15 + part16 + part17;
+ out20 += part14 - part15 + part16 - part17;
+ out25 += part14 - part15 - part16 + part17;
+ out26 += part14 + part15 - part16 - part17;
+ out21 += part18 + part19 + part20 + part21;
+ out22 += part18 - part19 + part20 - part21;
+ out27 += part18 - part19 - part20 + part21;
+ out28 += part18 + part19 - part20 - part21;
+ out23 += part22 + part23;
+ out29 += part22 - part23;
+
+ *(dst_addr) = out6;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out7;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out8;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out9;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out10;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out11;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out12;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out13;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out14;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out15;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out16;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out17;
+ dst_addr += dst_plane_stride;
+
+ *(dst_addr) = out18;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out19;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out20;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out21;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out22;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out23;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out24;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out25;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out26;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out27;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out28;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out29;
+ dst_addr += dst_plane_stride;
+
+ // Row5
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+
+ // Channels [30, 35]
+ out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
+
+ *(dst_addr) = out0;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out1;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out2;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out3;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out4;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out5;
+ dst_addr += dst_plane_stride;
+#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
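+
+// A minimal host-side sketch, assuming plain OpenCL C API usage (the option
+// values here are illustrative, not prescribed), of how the compile-time
+// defines documented above could be supplied for this kernel:
+//
+//   const char *opts = "-DDATA_TYPE=float -DNUM_TILES_X=16 "
+//                      "-DPAD_LEFT=1 -DPAD_TOP=1 "
+//                      "-DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4 -DSRC_DEPTH=32";
+//   cl_int err = clBuildProgram(program, 1, &device, opts, NULL, NULL);
+//   cl_kernel kernel = clCreateKernel(program, "winograd_input_transform_4x4_3x3_stepz1_nchw", &err);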
+
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5, the output tile is 4x4/4x1 or 1x4 and the data layout is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = get_global_id(2) % SRC_DEPTH;
+ const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute input address
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+ // Load input tile
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Calculate common factors for intermediate tensor
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ tmp0 = in_row0;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact0 = 0.0f;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
+ tmp0 += -in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
+
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
+
+ comm_fact0 = (DATA_TYPE)2.5f * in_row3;
+ comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.0f * in_row5;
+
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
+
+ comm_fact1 = (DATA_TYPE)2.0f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
+ comm_fact2 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
+
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
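+
+ // tmp0..tmp7 are the rows of BT * d for the 8-point 1D transform shared by
+ // F(4,5), e.g. row 0 is [1, 0, -5.25, 0, 5.25, 0, -1, 0] and row 1 is
+ // [0, 1, 1, -4.25, -4.25, 1, 1, 0]. The OUTPUT_ROW_4x4_5x5 helper (defined
+ // earlier in this file) applies the matching 1D transform along the other
+ // axis to complete out = BT * d * B.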
+
+ // Calculate output rows (reuse comm_fact0 vector)
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0;
+
+ OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1, out2, out3, out4, out5, out6, out7;
+
+ OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Store values across the channels
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
+ *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
+ *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
+ *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
+ *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
+ *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
+ *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+ *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+ *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+ *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+ *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+ *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+ *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+ *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+ *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+ *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+ *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+ *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+ *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+ *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+ *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+ *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+ *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+ *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+ *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+ *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+ *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+ *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+ *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+ *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+ *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+ *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+ *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+ *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+ *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+ *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+ *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+ *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+ *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+ *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+ *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+ *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+ *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+ *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+ *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+ *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+ *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+ *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+ *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+ *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+ *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+ *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+ *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+ *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+ *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+ *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+ *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+ *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+ *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+ *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
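+
+// A hedged dispatch sketch (the variable names and values are illustrative
+// assumptions): these NCHW input-transform kernels map one work item to one
+// input tile per channel, with batches folded into the Z dimension when
+// -DSRC_DEPTH is defined:
+//
+//   size_t gws[3] = { num_tiles_x, num_tiles_y, src_depth * num_batches };
+//   clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gws, NULL, 0, NULL, NULL);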
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
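+
+// The 1D wrappers in this section forward to the corresponding 2D kernels; the
+// 3x1 path is selected purely by the compile-time defines. An illustrative
+// (assumed, not prescribed) set of options for this wrapper:
+//   "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL -DOUTPUT_TILE_W=2 -DOUTPUT_TILE_H=1 "
+//   "-DNUM_TILES_X=16 -DPAD_LEFT=1 -DPAD_TOP=0 -DDATA_TYPE=float"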
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is a multiple of 2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 5x1, the output tile is 4x1 and the data layout is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
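+
+// As with the horizontal wrappers above, an illustrative (assumed, not
+// prescribed) set of options for this vertical 1x3 wrapper:
+//   "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL -DOUTPUT_TILE_W=1 -DOUTPUT_TILE_H=2 "
+//   "-DNUM_TILES_X=16 -DPAD_LEFT=0 -DPAD_TOP=1 -DDATA_TYPE=float"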
+
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is a multiple of 2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl
new file mode 100644
index 0000000000..861ed50651
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl
@@ -0,0 +1,1082 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x2_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Each thread stores a 2x2/2x1 or 1x2 tile according to the filter size
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+ // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Compute the 2x1 or 1x2 output tile
+ // out00 = d00 + d01 + d02
+ // out01 = d01 - d02 - d03
+
+ float out00 = d00 + d01 + d02;
+ float out01 = d01 - d02 - d03;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+ // Compute the 2x2 output tile
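+ // k0..k3 hold column sums/differences shared by the four outputs so each is computed only once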
+ float k0 = d01 + d11 + d21;
+ float k1 = d02 + d12 + d22;
+ float k2 = d11 - d21 - d31;
+ float k3 = d12 - d22 - d32;
+
+ // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
+ // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
+ // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
+ // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
+
+ float out00 = d10;
+ float out01 = -d13;
+ float out10 = d10;
+ float out11 = -d13;
+
+ out00 += d00 + d20 + k0 + k1;
+ out01 += k0 - k1 - (d03 + d23);
+ out10 += -d20 - d30 + k2 + k3;
+ out11 += k2 - k3 + d23 + d33;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
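+ // get_global_id(1) enumerates the output tiles in raster order: decompose it into the
+ // tile column/row and scale by the output tile size to get the top-left output coordinates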
+ int y_in = get_global_id(1);
+ int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+ int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+
+ out00 += (float)b;
+ out01 += (float)b;
+#endif // defined(HAS_BIAS)
+
+ // Get output address
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ // Store the output tile
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 2)
+ out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
+ (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(HAS_BIAS)
+ // Add bias
+ out10 += (float)b;
+ out11 += (float)b;
+#endif // defined(HAS_BIAS)
+ vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
+ (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Each thread stores a 4x4/4x1 or 1x4 tile
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+ // Load the values across the channels to compose the 6x6 or 6x1 tile
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Compute out00, out01, out02 and out03
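+ // Note: this is the 1D Winograd F(4, 3) output transform out = A^T * d, with
+ //
+ //        | 1  1  1  1  1  0 |
+ //        | 0  1 -1  2 -2  0 |
+ // A^T  = | 0  1  1  4  4  0 |
+ //        | 0  1 -1  8 -8  1 |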
+ float out00 = d00 + d01 + d02 + d03 + d04;
+ float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
+ float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
+ float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+ DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+ DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+ DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+ DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+ DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+ DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+ DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+ DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+ DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+ DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+
+ DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+ DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+ DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+ DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+ DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+ DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+
+ // Compute out00, out01, out02 and out03
+ float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+ float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+ float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+ float out03 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+
+ float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
+ float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
+
+ out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
+ out01 += k1 - d02 - d12 - d22 - d32 - d42;
+ out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
+ out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
+
+ // Compute out10, out11, out12 and out13
+ float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+
+ k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
+ k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
+
+ out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
+ out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
+ out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
+ out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
+
+ // Compute out20, out21, out22 and out23
+ float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+
+ k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
+ k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
+
+ out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
+ out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
+ out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
+ out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
+
+ // Compute out30, out31, out32 and out33
+ float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+
+ k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
+ k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
+
+ out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
+ out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
+ out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
+ out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ int y_in = get_global_id(1);
+ int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+ int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+
+ out00 += (float)b;
+ out01 += (float)b;
+ out02 += (float)b;
+ out03 += (float)b;
+#endif // defined(HAS_BIAS)
+
+ // Get output address
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ // Store the output tile
+ const VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ vstore4(out0_dt, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(HAS_BIAS)
+ // Add bias
+ out10 += (float)b;
+ out11 += (float)b;
+ out12 += (float)b;
+ out13 += (float)b;
+
+ out20 += (float)b;
+ out21 += (float)b;
+ out22 += (float)b;
+ out23 += (float)b;
+
+ out30 += (float)b;
+ out31 += (float)b;
+ out32 += (float)b;
+ out33 += (float)b;
+#endif // defined(HAS_BIAS)
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0,
+ (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0,
+ (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0,
+ (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+
+#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
+ ({ \
+ comm_fact.s0 = d1 + d2; \
+ comm_fact.s1 = d3 + d4; \
+ comm_fact.s2 = d5 + d6; \
+ \
+ col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
+ col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
+ \
+ comm_fact.s0 = d1 - d2; \
+ comm_fact.s1 = d3 - d4; \
+ comm_fact.s2 = d5 - d6; \
+ \
+ col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
+ col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
+ })
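+// Note: COMPUTE_TMP_COL applies one column of the 8-point 1D Winograd output transform
+// out = A^T * d used by the F(4x4, 5x5) kernel below; the same weights are applied
+// along the rows afterwards to complete the 2D transform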
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_5x5_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Each thread stores a 4x4/4x1 or 1x4 tile
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute output address
+ int y_in = get_global_id(1);
+ int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+ int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ // Load the values across the channels to compose the input tile
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Compute out00, out01, out02 and out03
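+ // Note: this is the 1D output transform out = A^T * d of the 8-point F(4, 5) scheme:
+ //
+ //        | 1  1  1  1  1  8  8  0 |
+ //        | 0  1 -1  2 -2  4 -4  0 |
+ // A^T  = | 0  1  1  4  4  2  2  0 |
+ //        | 0  1 -1  8 -8  1 -1  1 |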
+ float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
+ float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
+ float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
+ float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+
+ out00 += (float)b;
+ out01 += (float)b;
+ out02 += (float)b;
+ out03 += (float)b;
+#endif // defined(HAS_BIAS)
+
+ // Store the output tile
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL,
+ B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4));
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+ DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+ DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+ DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+ DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+ DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+ DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+ DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+ DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+ DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+
+ DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+ DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+ DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+ DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+ DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
+ DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
+ DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
+ DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
+
+ DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
+ DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
+ DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
+ DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
+ DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
+ DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
+ DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
+ DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
+
+ DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
+ DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
+ DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
+ DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
+ DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
+ DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
+ DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
+ DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
+
+ DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
+ DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
+ DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
+ DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
+ DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
+ DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
+ DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
+ DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
+
+ // Compute the 4x8 intermediate tensor (eight transformed columns of four values each)
+ VEC_DATA_TYPE(float, 4)
+ comm_fact0, comm_fact1, comm_fact2;
+ VEC_DATA_TYPE(float, 4)
+ tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+
+ COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
+
+ // Compute the 4x4 output tile
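+ // This is the row-direction pass of the same 8-point transform applied by
+ // COMPUTE_TMP_COL, expressed vector-wise on the four elements of each column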
+ comm_fact0 = tmp_col1 + tmp_col2;
+ comm_fact1 = tmp_col3 + tmp_col4;
+ comm_fact2 = tmp_col5 + tmp_col6;
+
+ VEC_DATA_TYPE(float, 4)
+ out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
+ VEC_DATA_TYPE(float, 4)
+ out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
+
+ comm_fact0 = tmp_col1 - tmp_col2;
+ comm_fact1 = tmp_col3 - tmp_col4;
+ comm_fact2 = tmp_col5 - tmp_col6;
+
+ VEC_DATA_TYPE(float, 4)
+ out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
+ VEC_DATA_TYPE(float, 4)
+ out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+
+ out_col0 += (VEC_DATA_TYPE(float, 4))b;
+ out_col1 += (VEC_DATA_TYPE(float, 4))b;
+ out_col2 += (VEC_DATA_TYPE(float, 4))b;
+ out_col3 += (VEC_DATA_TYPE(float, 4))b;
+#endif // defined(HAS_BIAS)
+
+ // Store the output tile
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ winograd_output_transform_2x2_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ winograd_output_transform_4x4_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_5x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ winograd_output_transform_4x4_5x5_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x2_1x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ winograd_output_transform_2x2_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ winograd_output_transform_4x4_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x5_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ winograd_output_transform_4x4_5x5_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/nhwc/batch_to_space.cl b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl
new file mode 100644
index 0000000000..b910a753a6
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_stride_y Stride of the block shape tensor in Y dimension (in bytes)
+ * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nhwc(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ VECTOR_DECLARATION(block_shape),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+
+ const int block_x = *((__global int *)vector_offset(&block, 0));
+ const int block_y = *((__global int *)vector_offset(&block, 1));
+
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0);
+
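+ // Invert the space-to-batch mapping: output element (x, y) in batch batch_id comes from
+ // input batch batch_id + ((x % block_x) + (y % block_y) * block_x) * BATCH_SIZE at
+ // spatial position (x / block_x, y / block_y)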
+ const int in_batch = batch_id + ((x % block_x) + (y % block_y) * (block_x)) * BATCH_SIZE;
+ const int in_x = x / block_x;
+ const int in_y = y / block_y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, in_batch));
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+/** Batch to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The crop left and crop top offsets must be passed at compile time using -DCROP_LEFT and -DCROP_TOP. e.g. -DCROP_LEFT=0 -DCROP_TOP=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_static_nhwc(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+
+ const int block_x = BLOCK_SHAPE_X;
+ const int block_y = BLOCK_SHAPE_Y;
+
+ const int x = get_global_id(1) + CROP_LEFT;
+ const int y = get_global_id(2) + CROP_TOP;
+ const int z = get_global_id(0);
+
+ const int in_batch = batch_id + ((x % block_x) + (y % block_y) * (block_x)) * BATCH_SIZE;
+ const int in_x = x / block_x;
+ const int in_y = y / block_y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, in_batch));
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
diff --git a/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl
new file mode 100644
index 0000000000..cb2da1bd99
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) rsqrt((a))
+#define SQCVT_SAT(a) (a)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+
+/** Apply batch normalization on tensors with NHWC format.
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+ TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+ VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+ float epsilon)
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
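+ // Leftover handling: when VEC_SIZE_LEFTOVER != 0, every work-item except the
+ // first is shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements, so
+ // work-item 0 covers the partial block at the start of the row and the
+ // STORE_VECTOR_SELECT at the end of the kernel stores only its valid lanes.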
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+ __global uchar *output_addr = input_ptr;
+#else /* IN_PLACE */
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+ __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+ __global uchar *var_addr = var_ptr + var_offset_first_element_in_bytes + x_offs;
+#ifndef USE_DEFAULT_BETA
+ __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
+#endif /* USE_DEFAULT_GAMMA */
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ denominator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ numerator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ x_bar = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res0 = 0;
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+ denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
+ denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+ // Calculate x bar and store results
+ numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
+ numerator = SUB_OP(data, numerator);
+ x_bar = MUL_OP(numerator, denominator);
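+ // Per element this is the normalisation step x_bar = (x - mean) * rsqrt(var + epsilon);
+ // the full transform out = gamma * x_bar + beta is completed below.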
+
+#ifndef USE_DEFAULT_GAMMA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
+
+ res0 = MUL_OP(gamma_vec, x_bar);
+#else /* USE_DEFAULT_GAMMA */
+ // gamma is equal to 1, no need to perform multiplications
+ res0 = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
+ // beta is not zero, hence we need to perform the addition
+ res0 = ADD_OP(res0, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+ res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
+
+ STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) */ \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl
new file mode 100644
index 0000000000..233beb3aa9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+
+// Check valid VEC_SIZES
+#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+
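+// DIV_MOD_UINT(x, y, d, m) computes d = x / y and m = x % y with a float
+// reciprocal instead of an integer division (typically cheaper on GPUs); this
+// assumes the quotient is exactly recovered after truncation, which holds for
+// the tensor extents used here. E.g. DIV_MOD_UINT(7, 3, d, m) gives d = 2, m = 1.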
+#define DIV_MOD_UINT(x, y, div_res, mod_res) \
+ ({ \
+ div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
+ uint r = div_res * (y); \
+ mod_res = (x)-r; \
+ })
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
+
+/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The third dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The first dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_X=num. e.g. -DSRC_DIM_X=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * K is equal to num_channels / num_groups.
+ * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, i.e. x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
+{
+ // Offset computation
+ const uint curr_out_channel = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); // output feature map
+
+ uint z = 0;
+ uint batch_id = 0;
+ // Compute curr_channel and batch_id
+ DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, z);
+
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ curr_out_channels = (VEC_DATA_TYPE(uint, VEC_SIZE))(curr_out_channel) + VEC_OFFS(uint, VEC_SIZE);
+
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ in_channels = (curr_out_channels * (VEC_DATA_TYPE(uint, VEC_SIZE))(K)) % (VEC_DATA_TYPE(uint, VEC_SIZE))(SRC_DIM_X) + (curr_out_channels / (VEC_DATA_TYPE(uint, VEC_SIZE))(NUM_GROUPS));
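+ // This implements the shuffle permutation in_channel = (out_channel * K) % num_channels
+ // + out_channel / NUM_GROUPS. E.g. with NUM_GROUPS = 2 and SRC_DIM_X = 4 channels
+ // (so K = 2), output channels {0, 1, 2, 3} read input channels {0, 2, 1, 3}.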
+
+ // Load the values
+ const __global DATA_TYPE *input_ptr = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + z * src_stride_z + batch_id * src_stride_w);
+
+#if VEC_SIZE == 1
+ DATA_TYPE out0 = *(input_ptr + in_channels);
+#elif VEC_SIZE == 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1)
+ };
+#elif VEC_SIZE == 3
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1),
+ *(input_ptr + in_channels.s2)
+ };
+#elif VEC_SIZE == 4
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1),
+ *(input_ptr + in_channels.s2),
+ *(input_ptr + in_channels.s3)
+ };
+#elif VEC_SIZE == 8
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1),
+ *(input_ptr + in_channels.s2),
+ *(input_ptr + in_channels.s3),
+ *(input_ptr + in_channels.s4),
+ *(input_ptr + in_channels.s5),
+ *(input_ptr + in_channels.s6),
+ *(input_ptr + in_channels.s7)
+ };
+#elif VEC_SIZE == 16
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1),
+ *(input_ptr + in_channels.s2),
+ *(input_ptr + in_channels.s3),
+ *(input_ptr + in_channels.s4),
+ *(input_ptr + in_channels.s5),
+ *(input_ptr + in_channels.s6),
+ *(input_ptr + in_channels.s7),
+ *(input_ptr + in_channels.s8),
+ *(input_ptr + in_channels.s9),
+ *(input_ptr + in_channels.sa),
+ *(input_ptr + in_channels.sb),
+ *(input_ptr + in_channels.sc),
+ *(input_ptr + in_channels.sd),
+ *(input_ptr + in_channels.se),
+ *(input_ptr + in_channels.sf)
+ };
+#endif // VEC_SIZE == 1
+
+ __global uchar *output_ptr = dst_ptr + curr_out_channel * sizeof(DATA_TYPE) + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+ STORE_VECTOR_SELECT(out, DATA_TYPE, output_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/depth_to_space.cl b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl
new file mode 100644
index 0000000000..84f8aa7263
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Depth to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nhwc(
+ TENSOR3D_DECLARATION(input),
+ const int batch_id,
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0) % r;
+
+ const int out_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
+ const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
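+ // Worked example (assuming BLOCK_SHAPE = 2 and CHANNEL_SIZE = 8, so r = 2):
+ // global id 5 along the channel axis maps to output channel z = 5 % 2 = 1 at
+ // (out_x, out_y) = (2 * x + (5 / 2) % 2, 2 * y + (5 / 2) / 2) = (2 * x, 2 * y + 1).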
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
diff --git a/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl
new file mode 100644
index 0000000000..238d3a7921
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC)
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale Pointer to buffer with the per channel quantized scales
+ */
+__kernel void dequantization_layer_per_channel_nhwc(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ __global float *scale)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+ scale -= max(xi - (int)LAST_ACCESSED_X, 0);
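+ // If the final vector would read past LAST_ACCESSED_X, the input, output and
+ // scale pointers are all shifted back by the same element count, so the
+ // (overlapping) vector access stays in bounds without a scalar tail loop.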
+
+ // Load data
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+ // Create scale vectors
+ const VEC_DATA_TYPE(float, VEC_SIZE)
+ vscale = VLOAD(VEC_SIZE)(0, &scale[xi]);
+
+ // Dequantize
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else // !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]);
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
new file mode 100644
index 0000000000..81ceeb8846
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "helpers_asymm.h"
+#include "tile_helpers.h"
+
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the direct convolution.
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED
+ * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The zero value must be passed at compile time using -DZERO_VALUE (e.g. -DZERO_VALUE=0)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, and 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
+ *
+ *@note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time:
+ * - -DIS_QUANTIZED
+ * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234
+ * - The destination quantization shift e.g. -DDST_SHIFT=4
+ * - The destination offset e.g. -DDST_OFFSET=4
+ * - The source offset e.g. -DSRC_OFFSET=4
+ * - The weights offset e.g. -DWEI_OFFSET=4
+ * - The quantized zero value e.g. -DZERO_VALUE=4
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32/QASYMM8
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights matrix
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED)
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void direct_convolution_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IDST_CHANNELS DST_CHANNELS
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+ // If quantized, the output tile has to be quantized first before being stored to global memory
+#if defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE cq
+#else // defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE c
+#endif // defined(IS_QUANTIZED)
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ // .v = access the whole vector (OpenCL vector)
+ // .s[x] = access the vector element at position x (scalar access)
+ TILE(int, 1, M0, xi);
+ TILE(int, 1, M0, yi);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xi[0].s[i] = ((mout + i) % _IDST_WIDTH) * STRIDE_X;
+ yi[0].s[i] = ((mout + i) / _IDST_WIDTH) * STRIDE_Y;
+ xi[0].s[i] -= PAD_LEFT;
+ yi[0].s[i] -= PAD_TOP;
+ })
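+ // Example of the unravelling above (assuming DST_WIDTH = 96): for mout + i = 100,
+ // the output coordinate is (x, y) = (100 % 96, 100 / 96) = (4, 1), and the
+ // top-left input sample sits at (4 * STRIDE_X - PAD_LEFT, 1 * STRIDE_Y - PAD_TOP).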
+
+ // Initialize the accumulators
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+ for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
+ {
+ int xk = i % _IWEI_WIDTH;
+ int yk = i / _IWEI_WIDTH;
+
+ TILE(int, 1, M0, my);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ int x_s = xi[0].s[i] + xk;
+ int y_s = yi[0].s[i] + yk;
+ my[0].s[i] = x_s + y_s *_ISRC_WIDTH;
+ my[0].s[i] = my[0].s[i] + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
+ my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
+ my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
+ my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
+ my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
+ })
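+ // Rows whose source coordinate falls outside [0, SRC_WIDTH) x [0, SRC_HEIGHT)
+ // keep my = -1; the indirect loads below skip those rows, leaving them at the
+ // ZERO_VALUE they are initialised to, which realises the implicit zero (or
+ // offset) padding without branching in the inner loop.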
+
+ int ck = 0;
+ for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
+ {
+ TILE(SRC_DATA_TYPE, M0, K0, a);
+ TILE(WEI_DATA_TYPE, N0, K0, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+ // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
+ }
+
+ // This #if directive should be removed in case of dynamic tensor support
+#if defined(LEFTOVER_LOOP)
+ // Left-over accumulations
+ for(; ck < _ISRC_CHANNELS; ++ck)
+ {
+ TILE(SRC_DATA_TYPE, M0, 1, a);
+ TILE(WEI_DATA_TYPE, N0, 1, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+
+ // Apply the offset correction (operation usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
+ }
+#endif // defined(LEFTOVER_LOOP)
+ }
+
+ // Offset correction required for the quantized asymmetric computation
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+
+#endif // HAS_BIAS
+
+#if defined(IS_QUANTIZED)
+
+ TILE(DST_DATA_TYPE, M0, N0, cq);
+
+ // Quantize the tile
+ T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+#endif // defined(IS_QUANTIZED)
+
+ // Apply activation
+ T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE);
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
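+ // Out-of-range rows are clamped to the last valid plane index; together with
+ // the reverse-order store below, any duplicated row is overwritten by the
+ // valid one.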
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ // _IOUTPUT_TILE: c = fp32/fp16, cq=qasymm8
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
+
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_WIDTH
+#undef _ISRC_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IDST_CHANNELS
+#undef _IY_MULTIPLIER
+}
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
new file mode 100644
index 0000000000..807b990e82
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+#include "tile_helpers.h"
+
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the direct convolution 3d.
+ *
+ * @note Data layout supported: NDHWC
+ * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED
+ * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The convolution padding (left, top and front) must be passed at compile time using -DPAD_LEFT, -DPAD_TOP and -DPAD_FRONT (e.g. -DPAD_LEFT=2, -DPAD_TOP=2, -DPAD_FRONT=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y and -DSTRIDE_Z (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2, -DSTRIDE_Z=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH, -DWEI_HEIGHT and -DWEI_DEPTH (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9, -DWEI_DEPTH=9)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT and -DSRC_DEPTH (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64, -DSRC_DEPTH=32)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH, -DDST_HEIGHT and -DDST_DEPTH (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64, -DDST_DEPTH=32)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The zero value must be passed at compile time using -DZERO_VALUE (e.g. -DZERO_VALUE=0)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, .... n
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time:
+ * - -DIS_QUANTIZED
+ * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234
+ * - The destination quantization shift e.g. -DDST_SHIFT=4
+ * - The destination offset e.g. -DDST_OFFSET=4
+ * - The source offset e.g. -DSRC_OFFSET=4
+ * - The weights offset e.g. -DWEI_OFFSET=4
+ * - The quantized zero value e.g. -DZERO_VALUE=4
+ *
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time along with its tensor type by using -DBIA_DATA_TYPE (e.g. -DBIA_DATA_TYPE=int).
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights matrix
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void direct_convolution3d_ndhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ TENSOR4D(wei, BUFFER)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _IWEI_DEPTH WEI_DEPTH
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _ISRC_DEPTH SRC_DEPTH
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IDST_DEPTH DST_DEPTH
+#define _IDST_CHANNELS DST_CHANNELS
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT * _IWEI_DEPTH)
+
+ // If quantized, the output tile has to be quantized first before being stored to global memory
+#if defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE cq
+#else // defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE c
+#endif // defined(IS_QUANTIZED)
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT x DEPTH
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ TILE(int, M0, 1, xi);
+ TILE(int, M0, 1, yi);
+ TILE(int, M0, 1, zi);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xi[i].v = ((mout + i) % _IDST_WIDTH) * STRIDE_X;
+ yi[i].v = (((mout + i) / _IDST_WIDTH) % _IDST_HEIGHT) * STRIDE_Y;
+ zi[i].v = (((mout + i) / (_IDST_WIDTH * _IDST_HEIGHT)) % _IDST_DEPTH) * STRIDE_Z;
+
+ xi[i].v -= PAD_LEFT;
+ yi[i].v -= PAD_TOP;
+ zi[i].v -= PAD_FRONT;
+ })
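+ // The linear index mout is unravelled as x = mout % W, y = (mout / W) % H and
+ // z = (mout / (W * H)) % D over the destination volume, then scaled by the
+ // strides and shifted by the padding to get the first input sample per row.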
+
+ // Initialize the accumulators
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = (ACC_DATA_TYPE)0;
+ })
+
+ for(int i = 0; i < _IY_MULTIPLIER; ++i)
+ {
+ int ck = 0;
+ int xk = i % _IWEI_WIDTH;
+ int yk = (i / _IWEI_WIDTH) % _IWEI_HEIGHT;
+ int zk = i / (_IWEI_WIDTH * _IWEI_HEIGHT);
+
+ int k = 0;
+ for(; k <= (_ISRC_CHANNELS - K0); k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NDHWC_INDIRECT(DATA_TYPE, M0, K0, BUFFER, src, bout, zk, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, _ISRC_DEPTH, src_stride_y, xi, yi, zi, a);
+
+ // Load tile from the weights tensor
+ const int b_offs = k + (xk * _ISRC_CHANNELS) + (yk * _ISRC_CHANNELS * _IWEI_WIDTH) + (zk * _ISRC_CHANNELS * _IWEI_WIDTH * _IWEI_HEIGHT);
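+ // b_offs flattens (input channel k, tap xk, yk, zk) onto the weights Y axis:
+ // consecutive wei_stride_y steps move through input channels first, then
+ // kernel width, height and depth; the output channel is addressed along X.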
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ if((cout + i) < _IDST_CHANNELS)
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ b[i].s[j] = *(__global DATA_TYPE *)(wei_ptr + wei_offset_first_element_in_bytes + (cout + i) * sizeof(DATA_TYPE) + j * wei_stride_y + b_offs * wei_stride_y);
+ })
+ }
+ })
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+ // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
+
+ ck += K0;
+ }
+
+#if((_ISRC_CHANNELS % K0) != 0)
+ // Left-over accumulations
+ for(; k < _ISRC_CHANNELS; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NDHWC_INDIRECT(DATA_TYPE, M0, 1, BUFFER, src, bout, zk, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, _ISRC_DEPTH, src_stride_y, xi, yi, zi, a);
+
+ // Load tile from the weights tensor
+ const int b_offs = k + (xk * _ISRC_CHANNELS) + (yk * _ISRC_CHANNELS * _IWEI_WIDTH) + (zk * _ISRC_CHANNELS * _IWEI_WIDTH * _IWEI_HEIGHT);
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ if((cout + i) < _IDST_CHANNELS)
+ {
+ b[i].v = *(__global DATA_TYPE *)(wei_ptr + wei_offset_first_element_in_bytes + (cout + i) * sizeof(DATA_TYPE) + b_offs * wei_stride_y);
+ }
+ })
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+
+ // Apply the offset correction (operation usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
+
+ ++ck;
+ }
+#endif // ((_ISRC_CHANNELS % K0) != 0)
+ }
+
+ // Offset correction required for the quantized asymmetric computation
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _IWEI_DEPTH * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ if((cout + N0) <= _IDST_CHANNELS)
+ {
+ bias0[0].v = VLOAD(N0)(0, (__global BIA_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout * sizeof(BIA_DATA_TYPE)));
+ }
+ else
+ {
+ VLOAD_PARTIAL(N0, PARTIAL_N0)
+ (bias0[0].v, 0, (__global BIA_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout * sizeof(BIA_DATA_TYPE)));
+ }
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+
+#endif // HAS_BIAS
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH *_IDST_HEIGHT * _IDST_DEPTH) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH *_IDST_HEIGHT * _IDST_DEPTH);
+ })
+
+#if defined(IS_QUANTIZED)
+ TILE(DATA_TYPE, M0, N0, cq);
+
+ // Quantize the tile
+ T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+#endif // defined(IS_QUANTIZED)
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_N0, BUFFER, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
+} \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
new file mode 100644
index 0000000000..dcbae220b6
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+// *INDENT-OFF*
+// clang-format off
+#if defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the depthwise convolution for floating-point data types (F32/F16)
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The number of M0 rows (width) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The size of the partial store block in the first dimension must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note Only the following configurations of M0 and N0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, ..., n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only)
+ * - N0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
+ * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
+ * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void dwc_native_fp_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Only the weight tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function arguments.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (if M0 != 1, the tiles overlap by 1 element on the X dimension)
+#define _IN0_A N0_A // Cols tile A. It can be either 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
+#define _IM0_B _IWEI_WIDTH // Rows tile B
+#define _IN0_B N0 // Cols tile B
+#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
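+ // Note (illustrative): the boundary check is skippable only for the 1x1 / no-padding / M0 == 1
+ // case, where every (x, y) tap is guaranteed to fall inside the source tensor and the
+ // out-of-bounds handling in the tile loads would be dead code.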
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, M0, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
+
+ int xi = xo * STRIDE_X;
+ int yi = yo * STRIDE_Y;
+ xi -= PAD_LEFT;
+ yi -= PAD_TOP;
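+ // Note (illustrative): (xi, yi) is the top-left input tap for output position (xo, yo);
+ // e.g. with STRIDE_X = 2 and PAD_LEFT = 1, xo = 3 maps to xi = 3 * 2 - 1 = 5.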
+
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ // Reset accumulators
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+#if _IWEI_HEIGHT <= 5
+ LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+#else // _IWEI_HEIGHT <= 5
+ for(int yk = 0; yk < _IWEI_HEIGHT; ++yk)
+#endif // _IWEI_HEIGHT <= 5
+ {
+ TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+
+ LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+ {
+ a[i].v = 0;
+ })
+
+ // Load tile from the src tensor (TILE A)
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+
+ TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
+
+ // Load tile from the weights tensor (TILE B)
+ T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b);
+
+ // Optimized path for STRIDE_X == 1
+ // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ c[m0].v += a[xk + m0].v * b[xk].v;
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ c[m0].v = fma(a[xk + m0].v, b[xk].v, c[m0].v);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+ })
+ })
+ }
+#if _IWEI_HEIGHT <= 5
+ )
+#endif // _IWEI_HEIGHT <= 5
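+ // Note (illustrative): with STRIDE_X == 1 and DILATION_X == 1, the M0 outputs of this
+ // work-item read overlapping input windows, which is why tile A holds
+ // M0_A = WEI_WIDTH + (M0 - 1) rows and output m0 consumes a[m0 .. m0 + WEI_WIDTH - 1] above;
+ // e.g. WEI_WIDTH = 3 and M0 = 2 need 4 row loads instead of 6.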
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+#endif // HAS_BIAS
+
+ T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ if(x_cond)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE_PARTIAL(N0, PARTIAL_N0)
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE(N0)
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+}
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+// *INDENT-ON*
+// clang-format on
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
new file mode 100644
index 0000000000..2d255e5b61
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+#include "tile_helpers.h"
+
+// *INDENT-OFF*
+// clang-format off
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_##A_DATA_TYPE##_##B_DATA_TYPE
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_char (0)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_uchar_uchar (0)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_uchar_char (128)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_uchar (-128)
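+// Note (illustrative): the +/-128 corrections convert the weights between the signed (char) and
+// unsigned (uchar) 8-bit quantized domains when the src and weights data types differ, so a
+// single accumulation path can be used; the correction is folded back out of WEI_OFFSET after
+// the dot product (see WEI_OFFSET - WEI_OFFSET_CORRECTION below).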
+
+#define T_LOAD_MULTIPLIERS_SHIFT_PER_TENSOR() \
+ ({})
+
+#define T_LOAD_MULTIPLIERS_SHIFT_PER_CHANNEL() \
+ TILE(DST_MULTIPLIERS_DATA_TYPE, 1, N0, multipliers); \
+ TILE(DST_SHIFTS_DATA_TYPE, 1, N0, shifts); \
+ T_LOAD(DST_MULTIPLIERS_DATA_TYPE, 1, N0, BUFFER, dst_multipliers, cout, 0, 0, 0, multipliers); \
+ T_LOAD(DST_SHIFTS_DATA_TYPE, 1, N0, BUFFER, dst_shifts, cout, 0, 0, 0, shifts);
+
+#define T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE)
+#define T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_##QUANTIZATION_TYPE()
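+// Note (illustrative): T_LOAD_MULTIPLIERS_SHIFT expands to nothing for PER_TENSOR quantization,
+// where the compile-time DST_MULTIPLIER and DST_SHIFT are used, while for PER_CHANNEL it loads
+// one requantization multiplier and shift per output channel from dst_multipliers/dst_shifts.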
+
+#if defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the depthwise convolution for quantized data types
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=int8)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=int8)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=int8)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=int)
+ * @note The number of M0 rows (width) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The size of the partial store block in the first dimension must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The activation type must be passed at compile time using -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=relu)
+ * @note The A and B variables required by some activation functions must be passed at compile time using -DA_VAL= and -DB_VAL= respectively
+ * @note The quantization offset used for both the per-tensor and per-channel quantization must be passed at compile time using -DDST_OFFSET (e.g., -DDST_OFFSET=3)
+ * @note The quantization shift for the per-tensor quantization must be passed at compile time using -DDST_SHIFT (e.g., -DDST_SHIFT=1)
+ * @note The quantization multiplier for the per-tensor quantization must be passed at compile time using -DDST_MULTIPLIER (e.g., -DDST_MULTIPLIER=121432)
+ * @note Only the following configurations of M0 and N0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, ..., n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only)
+ * - N0 = 2, 3, 4, 8, 16
+ * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
+ * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] dst_multipliers_ptr Pointer to the destination multipliers tensor for the per-channel quantization. Supported data type: S32
+ * @param[in] dst_multipliers_stride_x Stride of the destination multipliers tensor in X dimension (in bytes)
+ * @param[in] dst_multipliers_step_x dst_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_multipliers_offset_first_element_in_bytes The offset of the first element in the destination multipliers tensor
+ * @param[in] dst_shifts_ptr Pointer to the destination shifts tensor for the per-channel quantization. Supported data type: S32
+ * @param[in] dst_shifts_stride_x Stride of the destination shifts tensor in X dimension (in bytes)
+ * @param[in] dst_shifts_step_x dst_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_shifts_offset_first_element_in_bytes The offset of the first element in the destination shifts tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ */
+//! @endcond
+__kernel void dwc_native_quantized_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE),
+ VECTOR_DECLARATION(dst_multipliers),
+ VECTOR_DECLARATION(dst_shifts)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Only the weight tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function arguments.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (if M0 != 1, the tiles overlap by 1 element on the X dimension)
+#define _IN0_A N0_A // Cols tile A. It can be either 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
+#define _IM0_B _IWEI_WIDTH // Rows tile B
+#define _IN0_B N0 // Cols tile B
+#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, M0, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
+
+ int xi = xo * STRIDE_X;
+ int yi = yo * STRIDE_Y;
+ xi -= PAD_LEFT;
+ yi -= PAD_TOP;
+
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ // Reset accumulators
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+#if _IWEI_HEIGHT <= 5
+ LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+#else // _IWEI_HEIGHT <= 5
+ for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+#endif // _IWEI_HEIGHT <= 5
+ {
+ TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+
+ LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor (TILE A)
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+
+ TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
+
+ // Load tile from the weights tensor (TILE B)
+ T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b);
+
+ // Optimized path for STRIDE_X == 1
+ // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+#if _IWEI_WIDTH <= 16
+#define DOT_DATA_TYPE SRC_DATA_TYPE
+#define WEI_OFFSET_CORRECTION (CALCULATE_WEIGHTS_OFFSET_CORRECTION(SRC_DATA_TYPE, WEI_DATA_TYPE))
+
+ // Optimized path for the dot instruction
+ TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0);
+ TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0);
+ ACC_DATA_TYPE offset_a = 0;
+ ACC_DATA_TYPE offset_b = 0;
+
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+ x0[0].s[xk] = a[xk + m0].s[n0];
+ y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION;
+ })
+ DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]);
+ REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a);
+ REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
+ c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
+#else // _IWEI_WIDTH <= 16
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+ c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
+ })
+#endif // _IWEI_WIDTH <= 16
+ })
+ })
+ }
+#if _IWEI_HEIGHT <= 5
+ )
+#endif // _IWEI_HEIGHT <= 5
+
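+ // Note (illustrative): per row of _IWEI_WIDTH taps, the dot-product path above accumulates
+ // sum(a * (b + corr)) + sum(a) * (WEI_OFFSET - corr) + sum(b + corr) * SRC_OFFSET, which equals
+ // the full expansion of sum((a + SRC_OFFSET) * (b + WEI_OFFSET)) except for the constant term
+ // _IWEI_WIDTH * SRC_OFFSET * (WEI_OFFSET - corr); that term is added once below for all
+ // _IWEI_HEIGHT rows.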
+#if _IWEI_WIDTH <= 16
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c);
+#endif // _IWEI_WIDTH <= 16
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ // Load bias
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+#endif // HAS_BIAS
+
+ T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE);
+
+ // Quantize the tile
+ TILE(DST_DATA_TYPE, M0, N0, cq);
+ T_QUANTIZE8(ACC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, multipliers, shifts, cq);
+
+ // Perform activation
+ T_ACTIVATION_QUANTIZED(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, DST_OFFSET, A_VAL, B_VAL, cq, cq);
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ if(x_cond)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE_PARTIAL(N0, PARTIAL_N0)
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE(N0)
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+}
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+// *INDENT-ON*
+// clang-format on
diff --git a/src/core/CL/cl_kernels/nhwc/im2col.cl b/src/core/CL/cl_kernels/nhwc/im2col.cl
new file mode 100644
index 0000000000..a23e943fab
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/im2col.cl
@@ -0,0 +1,526 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+#define COND_N SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+
+#if defined(IM2COL_3X3) || defined(IM2COL_9X9)
+/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension
+ * @name IM2COL1X9_NHWC_STORE
+ *
+ * @note To use this macro for a 3x3 block, @p ROW has to be 0
+ *
+ * @param[in] VECTOR_SIZE The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16
+ * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p VECTOR_SIZE
+ * @param[in] DATA_TYPE Data type of @p DATA
+ * @param[in] SRC_DEPTH Input channel size / depth
+ * @param[in] DATA Value variable base name
+ * @param[in] ROW The row number to store. Supported: 0-8
+ * @param[in] OUTPUT_PTR Output pointer
+ * @{
+ */
+#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ const bool at_channel_boundary = get_global_id(0) == 0; \
+ if(at_channel_boundary) \
+ { \
+ IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ } \
+ else \
+ { \
+ IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ }
+#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)
+#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+
+#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
+
+#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
+/** @}*/
+#endif // defined(IM2COL_3X3) || defined(IM2COL_9X9)
+
+#if defined(IM2COL_3X3)
+/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The input channel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The strides along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1, -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed so that a 1 is appended to each row of the final matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col3x3_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+ const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+ const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch size
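+ // Note (illustrative): when BOUNDARY_VECTOR_SIZE < VECTOR_SIZE, work-item 0 handles the
+ // leftover vector at the start of the channel dimension and every other work-item is shifted
+ // back by shift_amount so its full VECTOR_SIZE access stays in bounds; e.g. SRC_DEPTH = 10,
+ // VECTOR_SIZE = 4 gives BOUNDARY_VECTOR_SIZE = 2 and ch = 0 (partial store), 2, 6.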
+
+ // Calculate input indices
+ const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int yi_coord = 0;
+ int3 offset = 0;
+
+ // Clamp xi
+ int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
+#if PAD_LEFT != 0 || PAD_RIGHT != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+ xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
+#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
+ // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor
+ xi_offset *= (int3)src_stride_y;
+
+ // Out-of-bound condition for X
+ int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
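+ // Note (illustrative): the pattern used below is clamp-then-select: coordinates are clamped so
+ // that every load touches a valid address, while x_cond/y_cond record which lanes were truly
+ // out of bounds so select() can replace them with PAD_VALUE after the load.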
+
+ // yi == 0
+ // Clamp yi
+ // yi_coord is cast to unsigned int in order to use just a min() operation
+ // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
+ // This is a trick so that the values loaded in the padding areas are always from the last row (SRC_HEIGHT - 1),
+ // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around
+ // also causes y_cond (y padding condition) to be satisfied
+ yi_coord = yi - (int)PAD_TOP;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+ values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+ values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // yi == 1
+ // Clamp yi_coord (it can be negative if PAD_TOP > 1)
+ yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with zeros
+ y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+ values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+ values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+ values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // yi == 2
+ // Clamp yi_coord
+ yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+ values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+ values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+ values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // Store in a boundary-aware way to avoid padding
+ IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr)
+
+#ifdef HAS_BIAS
+ // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+ // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+ // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+ // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+ {
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(IM2COL_3X3)
+
+#if defined(IM2COL_9X9)
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i) \
+ ({ \
+ yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
+ \
+ offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
+ offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
+ \
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
+ \
+ int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \
+ values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \
+ values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \
+ values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \
+ values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \
+ values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \
+ values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \
+ values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \
+ values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1))); \
+ \
+ IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
+ })
+#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i) \
+ ({ \
+ yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
+ \
+ offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
+ offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
+ \
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
+ \
+ IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
+ })
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The input channel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The strides along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1, -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed so that a 1 is appended to each row of the final matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col9x9_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+ const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+ const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch size
+
+ // Calculate input indices
+ const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int yi_coord = 0;
+ int8 offset0 = 0;
+ int offset1 = 0;
+
+ // Clamp xi
+ int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
+ int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);
+
+#if PAD_LEFT != 0 || PAD_RIGHT != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+ xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
+ xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
+#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
+ xi_offset0 *= (int8)src_stride_y;
+ xi_offset1 *= (int)src_stride_y;
+
+ // Out-of-bound condition for X
+ int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);
+ int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+ IM2COL1x9(0);
+ IM2COL1x9(1);
+ IM2COL1x9(2);
+ IM2COL1x9(3);
+ IM2COL1x9(4);
+ IM2COL1x9(5);
+ IM2COL1x9(6);
+ IM2COL1x9(7);
+ IM2COL1x9(8);
+
+#ifdef HAS_BIAS
+ // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+ // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+ // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+ // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+ {
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(IM2COL_9X9)
+
+#if defined(IM2COL_GENERIC)
+/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed so that a 1 is appended to each row of the final matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+ const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+ const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch size
+
+ // Calculate input indices
+ const int xi = (yo % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (yo / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output address
+ const int stride_x = ch * sizeof(DATA_TYPE);
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + stride_x + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + stride_x + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int i = 0;
+ for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+ {
+ // Clamp yi_coord
+ int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
+ yi_coord = clamp(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
+
+ // Out-of-bound condition for Y
+ int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
+
+ for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
+ {
+ // Clamp xi_coord
+ int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
+ xi_coord = clamp(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
+
+ // Out-of-bound condition for X
+ int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+ int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);
+
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+ // Replace with PAD_VALUE if the value is out-of-bound
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition)));
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+ // Store in a boundary-aware way to avoid padding
+#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
+ const bool at_channel_boundary = get_global_id(0) == 0;
+ if(at_channel_boundary)
+ {
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+ }
+ else // at_channel_boundary
+#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
+ {
+ VSTORE(VECTOR_SIZE)
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+ }
+ i++;
+ }
+ }
+
+#ifdef HAS_BIAS
+ // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+ // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+ // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+ // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+ {
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(IM2COL_GENERIC) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl
new file mode 100644
index 0000000000..aa719bfef0
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(INDIRECT_CONVOLUTION_ADDRESS_PRECALCULATION)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the indirect convolution 2d indirect buffer.
+ *
+ * @note This kernel only works when the batch size is 1
+ *
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The kernel width must be passed at compile time using -DWEI_CONV_WIDTH (e.g. -DWEI_CONV_WIDTH=9)
+ * @note The spatial dimensions of the source tensor used by conv2d must be passed at compile time using -DSRC_CONV_WIDTH and -DSRC_CONV_HEIGHT (e.g. -DSRC_CONV_WIDTH=96, -DSRC_CONV_HEIGHT=64)
+ * @note The width dimension of the destination tensor produced by conv2d must be passed at compile time using -DDST_CONV_WIDTH (e.g. -DDST_CONV_WIDTH=96)
+ * @note The tensor type ("BUFFER" only) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=int, since the indirect buffer holds INT32 offsets)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, and 8
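+ * @note As an illustrative sketch only (example values, not defaults), the host might build this kernel with:
+ *       "-DPAD_LEFT=1 -DPAD_TOP=1 -DSTRIDE_X=1 -DSTRIDE_Y=1 -DWEI_CONV_WIDTH=3 -DSRC_CONV_WIDTH=64 -DSRC_CONV_HEIGHT=64
+ *       -DDST_CONV_WIDTH=64 -DDST_TENSOR_TYPE=BUFFER -DDST_DATA_TYPE=int -DM0=4"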
+ *
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: INT32
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void indirect_convolution_address_precalculation(
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE))
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Note: WIDTH = M0 x KernelWidth x KernelHeight
+
+ // m index
+ const int mi = x % M0;
+ // Kernel index
+ const int ki = x / M0;
+ // Kernel width coordinate
+ const int xk = ki % WEI_CONV_WIDTH;
+ // kernel height coordinate
+ const int yk = ki / WEI_CONV_WIDTH;
+
+ TILE(DST_DATA_TYPE, 1, 1, xi);
+ TILE(DST_DATA_TYPE, 1, 1, yi);
+ TILE(DST_DATA_TYPE, 1, 1, my);
+
+ const int mout = y * M0;
+
+ xi[0].s[0] = ((mout + mi) % DST_CONV_WIDTH) * STRIDE_X;
+ yi[0].s[0] = ((mout + mi) / DST_CONV_WIDTH) * STRIDE_Y;
+ xi[0].s[0] -= PAD_LEFT;
+ yi[0].s[0] -= PAD_TOP;
+
+ const int x_s = xi[0].s[0] + xk;
+ const int y_s = yi[0].s[0] + yk;
+ my[0].s[0] = x_s + y_s * SRC_CONV_WIDTH;
+ my[0].s[0] = my[0].s[0] + z * (int)(SRC_CONV_WIDTH * SRC_CONV_HEIGHT);
+ my[0].s[0] = select(-1, my[0].s[0], x_s >= 0);
+ my[0].s[0] = select(-1, my[0].s[0], x_s < SRC_CONV_WIDTH);
+ my[0].s[0] = select(-1, my[0].s[0], y_s >= 0);
+ my[0].s[0] = select(-1, my[0].s[0], y_s < SRC_CONV_HEIGHT);
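+ // Note: an offset of -1 marks an out-of-bound position. The consumer kernel zero-initializes its
+ // input tiles before the indirect load, so such positions are presumably treated as zero padding.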
+
+ VSTORE(1)
+ (my[0].s[0], 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DST_DATA_TYPE) + y * dst_stride_y + z * dst_stride_z));
+}
+#endif // defined(INDIRECT_CONVOLUTION_ADDRESS_PRECALCULATION)
+
+#if defined(INDIRECT_CONVOLUTION_NHWC)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the indirect convolution.
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The vector length used for loading the values from the indirect buffer should be passed at compile time using -DIND_BUFF_VEC_SIZE (e.g. -DIND_BUFF_VEC_SIZE=4)
+ * @note The activation function to fuse and corresponding A and B values should be passed at compile time using -DACTIVATION_TYPE, -DA_VAL, and -DB_VAL
+ * (e.g. -DACTIVATION_TYPE=lu_brelu, -DA_VAL=3.0, and -DB_VAL=1.0)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, and 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
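+ * @note As an illustrative sketch only (example values, not defaults), a plausible build options string is:
+ *       "-DWEI_WIDTH=3 -DWEI_HEIGHT=3 -DDST_WIDTH=64 -DDST_HEIGHT=64 -DSRC_CHANNELS=32 -DSRC_TENSOR_TYPE=BUFFER
+ *       -DWEI_TENSOR_TYPE=BUFFER -DDST_TENSOR_TYPE=BUFFER -DSRC_DATA_TYPE=float -DWEI_DATA_TYPE=float -DDST_DATA_TYPE=float
+ *       -DM0=4 -DN0=4 -DK0=4 -DPARTIAL_N0=0 -DIND_BUFF_VEC_SIZE=4 -DACTIVATION_TYPE=relu -DA_VAL=0.0 -DB_VAL=0.0"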
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] off_img (Not supported) Read only cl_image object for the indirect buffer tensor. Included when OFF_TENSOR_TYPE=IMAGE
+ * @param[in] off_ptr Pointer to the indirect buffer tensor. Supported data type: INT32
+ * @param[in] off_stride_y Stride of the indirect buffer tensor in Y dimension (in bytes)
+ * @param[in] off_stride_z Stride of the indirect buffer tensor in Z dimension (in bytes)
+ * @param[in] off_stride_w Stride of the indirect buffer tensor in W dimension (in bytes)
+ * @param[in] off_c The size of the channels dimension of the indirect buffer tensor
+ * @param[in] off_w The size of the width dimension of the indirect buffer tensor
+ * @param[in] off_h The size of the height dimension of the indirect buffer tensor
+ * @param[in] off_n The size of the batches dimension of the indirect buffer tensor
+ * @param[in] off_offset_first_element_in_bytes The offset of the first element in the indirect buffer tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void indirect_convolution_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_RO_T(off, OFF_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ off_offset_first_element_in_bytes += get_global_id(1) * off_stride_y;
+ off_offset_first_element_in_bytes += bout * off_stride_z;
+
+ // Initialize the accumulators
+ TILE(DST_DATA_TYPE, M0, N0, c);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+ for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
+ {
+ TILE(int, 1, IND_BUFF_VEC_SIZE, my);
+ T_LOAD(int, 1, IND_BUFF_VEC_SIZE, OFF_TENSOR_TYPE, off, i * M0, 0, 1, 0, my);
+
+ int ck = 0;
+ for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
+ {
+ TILE(SRC_DATA_TYPE, M0, K0, a);
+ TILE(WEI_DATA_TYPE, N0, K0, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.0;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+ }
+
+ // This #if directive should be removed in case of dynamic tensor support
+#if defined(LEFTOVER_LOOP)
+ // Left-over accumulations
+ for(; ck < _ISRC_CHANNELS; ++ck)
+ {
+ TILE(SRC_DATA_TYPE, M0, 1, a);
+ TILE(WEI_DATA_TYPE, N0, 1, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.0;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, DST_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+ }
+#endif // defined(LEFTOVER_LOOP)
+ }
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, c, bias0, c);
+
+#endif // HAS_BIAS
+
+ // Apply activation
+ T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+
+ const bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, c, dst_indirect_y);
+
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IY_MULTIPLIER
+}
+#endif // defined(INDIRECT_CONVOLUTION_NHWC)
diff --git a/src/core/CL/cl_kernels/nhwc/normalization_layer.cl b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl
new file mode 100644
index 0000000000..7e35e161c8
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define MUL_OP(x, y) ((x) * (y))
+#define ADD_OP(x, y) ((x) + (y))
+#define DIV_OP(x, y) ((x) / (y))
+#define POW_OP(x, y) pow((x), (y))
+#define SQCVT_SAT(a) (a)
+
+#if defined(WIDTH_SIZE)
+/** Apply cross-map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The size of the first dimension (the number of channels for NHWC) should be given as a preprocessor argument using -DWIDTH_SIZE=size. e.g. -DWIDTH_SIZE=192
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
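+ * @note As an illustrative example (assumed values, not defaults), the build options could look like:
+ *       "-DDATA_TYPE=float -DVEC_SIZE=4 -DVEC_SIZE_LEFTOVER=0 -DRADIUS=2 -DWIDTH_SIZE=192 -DCOEFF=0.0001f -DBETA=0.75f -DKAPPA=1.f"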
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_cross_map_nhwc(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Offset computation
+ const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+
+ // Address computation
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = 0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = SQCVT_SAT(KAPPA);
+
+ const int left_slice = max((int)0, (int)x_offs - (int)RADIUS);
+ const int right_slice = min((int)WIDTH_SIZE - 1, (int)x_offs + (int)RADIUS);
+
+ for(int i = left_slice; i <= right_slice; ++i)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * sizeof(DATA_TYPE)));
+ acc = ADD_OP(acc, MUL_OP(values, values));
+ }
+
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x_offs * sizeof(DATA_TYPE))), normalized);
+
+ STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(WIDTH_SIZE)
+
+#if defined(NUM_SLICES) && defined(DIM1_SIZE)
+/** Apply in-map normalization when tensors are in the NHWC data layout format.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note The size of the second dimension (dim1) should be given as a preprocessor argument using -DDIM1_SIZE=size. e.g. -DDIM1_SIZE=64
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
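+ * @note As an illustrative example (assumed values, not defaults): "-DDATA_TYPE=float -DVEC_SIZE=4 -DVEC_SIZE_LEFTOVER=0
+ *       -DRADIUS=2 -DDIM1_SIZE=64 -DNUM_SLICES=64 -DCOEFF=0.0001f -DBETA=0.75f -DKAPPA=1.f", adding -DIN_MAP_2D to
+ *       normalize over a 2D region instead of a single row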
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Offset computation
+ const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ const int current_cols = get_global_id(1);
+ const int current_rows = get_global_id(2);
+
+ // Address computation
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE);
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + current_cols * output_stride_y + current_rows * output_stride_z;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = 0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = SQCVT_SAT(KAPPA);
+
+ const int first_col = max(0, current_cols - (int)RADIUS);
+ const int last_col = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS);
+
+#if defined(IN_MAP_2D)
+ const int first_row = max(0, current_rows - (int)RADIUS);
+ const int last_row = min((int)NUM_SLICES - 1, current_rows + (int)RADIUS);
+#endif /* defined(IN_MAP_2D) */
+
+#if defined(IN_MAP_2D)
+ for(int j = first_row; j <= last_row; ++j)
+ {
+#else // defined(IN_MAP_2D)
+ const int j = current_rows;
+#endif /* defined(IN_MAP_2D) */
+ for(int i = first_col; i <= last_col; ++i)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * input_stride_y + j * input_stride_z));
+ acc = ADD_OP(acc, MUL_OP(values, values));
+ }
+#if defined(IN_MAP_2D)
+ }
+#endif /* defined(IN_MAP_2D) */
+
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * input_stride_y + current_rows * input_stride_z)), normalized);
+
+ STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(NUM_SLICES) && defined(DIM1_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl
new file mode 100644
index 0000000000..86c33499e2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
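+ * @note As an illustrative example (assumed values, not defaults), the host could build this kernel with:
+ *       "-DDATA_TYPE=half -DVEC_SIZE=8 -DVEC_SIZE_LEFTOVER=0"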
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+ __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+ __global uchar *std_addr = std_ptr + std_offset_first_element_in_bytes + x_offs;
+
+ const TYPE curr_mean = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
+ const TYPE curr_std = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_addr);
+
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+ TYPE res0 = (data - curr_mean) / curr_std;
+
+ STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl
new file mode 100644
index 0000000000..7bc3c15a63
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define OFFSET_FLT ((float)OFFSET)
+#define SCALE_FLT ((float)SCALE)
+
+/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
+ * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
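+ * @note As an illustrative example (assumed values, not defaults): "-DDATA_TYPE=uchar -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=0
+ *       -DOFFSET=128 -DSCALE=0.0078125f"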
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+ __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+ __global uchar *std_addr = std_ptr + std_offset_first_element_in_bytes + x_offs;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ curr_mean_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr), VEC_DATA_TYPE(float, VEC_SIZE));
+ curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ curr_std_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_addr), VEC_DATA_TYPE(float, VEC_SIZE));
+ curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr), VEC_DATA_TYPE(float, VEC_SIZE));
+ data_flt = round(data_flt - OFFSET_FLT) * (SCALE_FLT);
+
+ // Perform normalization
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
+
+ const TYPE res0 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
+ STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl
new file mode 100644
index 0000000000..4e5481d1db
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h" // Needed for GET_SPATIAL_IDX()
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+
+/** Performs a 3D pooling layer of size MxNxD. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X, -DPOOL_SIZE_Y, and -DPOOL_SIZE_Z. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4, -DPOOL_SIZE_Z=2
+ * @note Input tensor width, height and depth must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT, and -DSRC_DEPTH
+ * @note Output tensor height, channels, depth, and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS, -DDST_DEPTH, and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y and -DSTRIDE_Z which are the steps of the window along the x, y and z directions
+ * @note Pool pads must be passed at compile time using -DPAD_X, -DPAD_Y, -DPAD_Z
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
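+ * @note As an illustrative sketch only (example values, not defaults), a plausible build options string is:
+ *       "-DDATA_TYPE=float -DACC_DATA_TYPE=float -DPOOL_AVG -DPOOL_SIZE_X=3 -DPOOL_SIZE_Y=3 -DPOOL_SIZE_Z=3
+ *       -DSRC_WIDTH=32 -DSRC_HEIGHT=32 -DSRC_DEPTH=8 -DDST_CHANNELS=16 -DDST_HEIGHT=16 -DDST_DEPTH=4 -DDST_BATCH_SIZE=1
+ *       -DSTRIDE_X=2 -DSTRIDE_Y=2 -DSTRIDE_Z=2 -DPAD_X=1 -DPAD_Y=1 -DPAD_Z=1 -DVEC_SIZE=4 -DVEC_SIZE_LEFTOVER=0 -DINITIAL_VALUE=0"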
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_stride_v Stride of the source tensor in V dimension (in bytes)
+ * @param[in] input_step_v input_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_stride_v Stride of the destination tensor in V dimension (in bytes)
+ * @param[in] output_step_v output_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_3d_layer_MxN_ndhwc(
+ TENSOR5D_DECLARATION(input),
+ TENSOR5D_DECLARATION(output))
+{
+ // Note: If C is not a multiple of VEC_SIZE, all work-items except the first are shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements, so that get_global_id(0) == 0 computes the leftover elements
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
+
+ // The depth size dimension and the batch size dimension are collapsed over the height dimension
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
+ int idx_out_d = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) % DST_DEPTH;
+ int idx_out_n = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) / DST_DEPTH;
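+ // For example (illustrative values only): with DST_HEIGHT=4 and DST_DEPTH=2, the collapsed index 11
+ // decomposes to idx_out_h = 11 % 4 = 3, idx_out_d = (11 / 4) % 2 = 0 and idx_out_n = (11 / 4) / 2 = 1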
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_v;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_d *
+ output_stride_w + idx_out_n * output_stride_v;
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = INITIAL_VALUE;
+
+ int idx_in_w = idx_out_w * STRIDE_X - (int)PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - (int)PAD_Y;
+ int idx_in_d = idx_out_d * STRIDE_Z - (int)PAD_Z;
+
+ // The start of width to consider in calculation should exclude padding
+ int pool_x_s = max((int)0, -idx_in_w);
+ // Assuming symmetric padding (left padding == right padding == PAD_X), the window end is either the pool width or whatever remains from the current position up to (src width + right padding)
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH + PAD_X - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT + PAD_Y - idx_in_h);
+ int pool_z_s = max((int)0, -idx_in_d);
+ int pool_z_e = min((int)POOL_SIZE_Z, (int)SRC_DEPTH + PAD_Z - idx_in_d);
+
+ // The filter size with all padding in all directions considered.
+ int filter_size = pool_z_e * pool_y_e * pool_x_e;
+
+ // The end of the window to consider in the calculation should exclude the padding in each direction
+ pool_x_e = min(pool_x_e, SRC_WIDTH - idx_in_w);
+ pool_y_e = min(pool_y_e, SRC_HEIGHT - idx_in_h);
+ pool_z_e = min(pool_z_e, SRC_DEPTH - idx_in_d);
+
+#if defined(EXCLUDE_PADDING)
+ filter_size = (pool_z_e - pool_z_s) * (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+#endif // defined(EXCLUDE_PADDING)
+
+#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && POOL_SIZE_Z == SRC_DEPTH && PAD_X == 0 && PAD_Y == 0 && PAD_Z == 0
+ // Global pooling path
+ for(int z = 0; z < POOL_SIZE_Z; ++z)
+ {
+ int depth_offset_src = (z + idx_in_d) * input_stride_w;
+ for(int y = 0; y < POOL_SIZE_Y; ++y)
+ {
+ int height_offset_src = (y + idx_in_h) * input_stride_z;
+#pragma unroll 8
+ for(int x = 0; x < POOL_SIZE_X; ++x)
+ {
+ int width_offset_src = (x + idx_in_w) * input_stride_y;
+#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && POOL_SIZE_Z == SRC_DEPTH && PAD_X == 0 && PAD_Y == 0 && PAD_Z == 0
+ for(int z = pool_z_s; z < pool_z_e; ++z)
+ {
+ int depth_offset_src = (z + idx_in_d) * input_stride_w;
+ for(int y = pool_y_s; y < pool_y_e; ++y)
+ {
+ int height_offset_src = (y + idx_in_h) * input_stride_z;
+#pragma unroll 8
+ for(int x = pool_x_s; x < pool_x_e; ++x)
+ {
+ int width_offset_src = (x + idx_in_w) * input_stride_y;
+#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && POOL_SIZE_Z == SRC_DEPTH && PAD_X == 0 && PAD_Y == 0 && PAD_Z == 0
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
+#if defined(FP_MIXED_PRECISION)
+ // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
+ data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + width_offset_src + height_offset_src + depth_offset_src)),
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + width_offset_src + height_offset_src + depth_offset_src));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif // defined(POOL_L2)
+ res0 = POOL_OP(res0, data0);
+ }
+ }
+ }
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+
+ // Store result
+#if defined(QUANTIZED)
+ STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#elif defined(FP_MIXED_PRECISION)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else // defined(FP_MIXED_PRECISION)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl
new file mode 100644
index 0000000000..abf0db9d07
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h" // Needed for GET_SPATIAL_IDX()
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \
+ { \
+ const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \
+ const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \
+ res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
+ }
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
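+// REQUANTIZE dequantizes the accumulator to float and quantizes it back with the output quantization info.
+// For example (illustrative values only): with in_offset=0, in_scale=0.5, out_offset=10 and out_scale=0.25,
+// an input value of 8 dequantizes to (8 - 0) * 0.5f = 4.0f and requantizes to 4.0f / 0.25f + 10 = 26.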
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+/** Performs a 3D pooling layer of size MxNxD. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=uchar. Supported data types are QASYMM8/QASYMM8_SIGNED
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X, -DPOOL_SIZE_Y, and -DPOOL_SIZE_Z. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4, -DPOOL_SIZE_Z=2
+ * @note Input tensor width, height and depth must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT, and -DSRC_DEPTH
+ * @note Output tensor height, channels, depth, and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS, -DDST_DEPTH, and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y and -DSTRIDE_Z which are the steps of the window along the x, y and z directions
+ * @note Pool pads must be passed at compile time using -DPAD_X, -DPAD_Y, -DPAD_Z
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
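+ * @note As an illustrative sketch only (example values, not defaults), the quantized variant could be built with the same
+ *       shape/stride/pad options as the float variant plus e.g. "-DDATA_TYPE=uchar -DACC_DATA_TYPE=int -DINITIAL_VALUE=0
+ *       -DOFFSET_IN1=128 -DOFFSET_OUT=128 -DSCALE_IN1=0.05f -DSCALE_OUT=0.05f"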
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED, QASYMM8
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_stride_v Stride of the source tensor in V dimension (in bytes)
+ * @param[in] input_step_v input_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_stride_v Stride of the destination tensor in V dimension (in bytes)
+ * @param[in] output_step_v output_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_3d_layer_MxN_ndhwc_quantized(
+ TENSOR5D_DECLARATION(input),
+ TENSOR5D_DECLARATION(output))
+{
+    // Note: If C is not a multiple of VEC_SIZE, we shift back by VEC_SIZE_LEFTOVER elements so that the work-item with get_global_id(0) == 0 computes the leftover elements
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
+
+ // The depth size dimension and the batch size dimension are collapsed over the height dimension
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
+ int idx_out_d = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) % DST_DEPTH;
+ int idx_out_n = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) / DST_DEPTH;
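+    // Illustrative example (values assumed): with DST_HEIGHT = 8 and DST_DEPTH = 4,
+    // a global id of 37 along dimension 2 gives idx_out_h = 37 % 8 = 5,
+    // idx_out_d = (37 / 8) % 4 = 0 and idx_out_n = (37 / 8) / 4 = 1.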
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_v;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_d *
+ output_stride_w + idx_out_n * output_stride_v;
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = INITIAL_VALUE;
+
+ int idx_in_w = idx_out_w * STRIDE_X - (int)PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - (int)PAD_Y;
+ int idx_in_d = idx_out_d * STRIDE_Z - (int)PAD_Z;
+
+ // The start of width to consider in calculation should exclude padding
+ int pool_x_s = max((int)0, -idx_in_w);
+    // Assuming symmetric padding (left padding = right padding = PAD_X), the filter end is either the pool width or what remains from the current position to (src width + pad right)
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH + PAD_X - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT + PAD_Y - idx_in_h);
+ int pool_z_s = max((int)0, -idx_in_d);
+ int pool_z_e = min((int)POOL_SIZE_Z, (int)SRC_DEPTH + PAD_Z - idx_in_d);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ int filter_size = 0;
+#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ int filter_size = pool_z_e * pool_y_e * pool_x_e;
+#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING)
+
+ // The end of width to consider in calculation should exclude PAD_X
+ pool_x_e = min(pool_x_e, SRC_WIDTH - idx_in_w);
+ pool_y_e = min(pool_y_e, SRC_HEIGHT - idx_in_h);
+ pool_z_e = min(pool_z_e, SRC_DEPTH - idx_in_d);
+
+ for(int z = pool_z_s; z < pool_z_e; ++z)
+ {
+ int depth_offset_src = (z + idx_in_d) * input_stride_w;
+ for(int y = pool_y_s; y < pool_y_e; ++y)
+ {
+ int height_offset_src = (y + idx_in_h) * input_stride_z;
+#pragma unroll 8
+ for(int x = pool_x_s; x < pool_x_e; ++x)
+ {
+ int width_offset_src = (x + idx_in_w) * input_stride_y;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data;
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + width_offset_src + height_offset_src + depth_offset_src));
+ data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+
+ res0 = POOL_OP(res0, data0);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ filter_size++;
+#endif // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ }
+ }
+ }
+
+#if defined(POOL_AVG)
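+    // Round-to-nearest integer division (explanatory note): adding filter_size / 2 to the
+    // accumulated sum before dividing rounds the average to the nearest integer
+    // (e.g., with illustrative values sum = 7 and filter_size = 4: (7 + 2) / 4 = 2).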
+ res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size;
+#endif // defined(POOL_AVG)
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+ REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+ STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
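As a rough, hypothetical illustration (not part of this patch) of how the compile-time definitions documented above reach a kernel like this one, the host passes them as OpenCL build options when compiling the .cl source. The option values below are placeholders for a plausible configuration, not taken from this change:

    #include <CL/cl.h>

    /* Hypothetical build options for the quantized 3D pooling kernel above;
     * in practice the values are computed by the host-side kernel configuration. */
    static const char *kBuildOpts =
        "-DPOOL_AVG -DEXCLUDE_PADDING "
        "-DDATA_TYPE=char -DACC_DATA_TYPE=int -DINITIAL_VALUE=0 "
        "-DPOOL_SIZE_X=3 -DPOOL_SIZE_Y=3 -DPOOL_SIZE_Z=2 "
        "-DSRC_WIDTH=16 -DSRC_HEIGHT=16 -DSRC_DEPTH=8 "
        "-DDST_CHANNELS=32 -DDST_HEIGHT=7 -DDST_DEPTH=4 -DDST_BATCH_SIZE=1 "
        "-DSTRIDE_X=2 -DSTRIDE_Y=2 -DSTRIDE_Z=2 -DPAD_X=0 -DPAD_Y=0 -DPAD_Z=0 "
        "-DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=0";

    /* Compiles the OpenCL program holding the kernel source for the given device. */
    static cl_int build_pooling_program(cl_program program, cl_device_id device)
    {
        return clBuildProgram(program, 1, &device, kBuildOpts, NULL, NULL);
    }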
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl
new file mode 100644
index 0000000000..5b59ff5088
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+/** Performs a pooling layer of size MxN. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_MxN_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+    // Note: If C is not a multiple of VEC_SIZE, we shift back by VEC_SIZE_LEFTOVER elements so that the work-item with get_global_id(0) == 0 computes the leftover elements
+    // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
+#if DST_BATCH_SIZE != 1
+ // If batch size != 1, the batch size dimension is collapsed over the height dimension
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
+ int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
+#else //DST_BATCH_SIZE != 1
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0);
+ int idx_out_n = 0;
+#endif // DST_BATCH_SIZE != 1
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
+ output_stride_w;
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = INITIAL_VALUE;
+
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+
+#if defined(EXCLUDE_PADDING)
+ int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+#else // defined(EXCLUDE_PADDING)
+ int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+#endif // defined(EXCLUDE_PADDING)
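+    // e.g. (illustrative): a 3x3 window whose leftmost column falls in the left
+    // padding contributes filter_size = 2 * 3 = 6 when EXCLUDE_PADDING is defined,
+    // and filter_size = 9 otherwise.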
+
+#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ // Global pooling path
+ for(int y = 0; y < POOL_SIZE_Y; ++y)
+ {
+#pragma unroll 8
+ for(int x = 0; x < POOL_SIZE_X; ++x)
+ {
+#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ for(int y = pool_y_s; y < pool_y_e; ++y)
+ {
+#pragma unroll 8
+ for(int x = pool_x_s; x < pool_x_e; ++x)
+ {
+#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
+#if defined(FP_MIXED_PRECISION)
+ // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
+ data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif // defined(POOL_L2)
+ res0 = POOL_OP(res0, data0);
+ }
+ }
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
+
+ // Store result
+#if defined(FP_MIXED_PRECISION)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else // defined(FP_MIXED_PRECISION)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+
+/** Performs a pooling layer of size 2x2. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] indices_ptr (Optional) Pointer to the indices tensor. Supported data types: U32
+ * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes)
+ * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2x2_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output)
+#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+ ,
+ TENSOR4D_DECLARATION(indices)
+#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+)
+{
+    // Note: If C is not a multiple of VEC_SIZE, we shift back by VEC_SIZE_LEFTOVER elements so that the work-item with get_global_id(0) == 0 computes the leftover elements
+    // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int idx_out_w = get_global_id(1);
+#if DST_BATCH_SIZE != 1
+ // If batch size != 1, the batch size dimension is collapsed over the height dimension
+ int idx_out_h = get_global_id(2) % DST_HEIGHT;
+ int idx_out_n = get_global_id(2) / DST_HEIGHT;
+#else // DST_BATCH_SIZE != 1
+    int idx_out_h = get_global_id(2);
+    int idx_out_n = 0;
+#endif // DST_BATCH_SIZE != 1
+
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
+ output_stride_w;
+
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
+
+ int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
+
+ int x0 = pool_x_s + idx_in_w;
+ int y0 = pool_y_s + idx_in_h;
+ int x1 = pool_x_e - 1 + idx_in_w;
+ int y1 = pool_y_e - 1 + idx_in_h;
+
+ REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
+
+#if defined(FP_MIXED_PRECISION)
+ // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
+ data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
+ data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
+ data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
+ data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if !defined(POOL_MAX)
+ if(filter_size != 4)
+ {
+ SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
+ SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
+ SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
+ SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);
+
+ // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
+ data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
+ data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s));
+ data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e));
+ data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e));
+ }
+#endif // !defined(POOL_MAX)
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+ data1 *= data1;
+ data2 *= data2;
+ data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = data0;
+ res0 = POOL_OP(res0, data1);
+ res0 = POOL_OP(res0, data2);
+ res0 = POOL_OP(res0, data3);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#if defined(EXCLUDE_PADDING)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#else // !defined(EXCLUDE_PADDING)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4;
+#endif // defined(EXCLUDE_PADDING)
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
+
+ // Store result
+#if defined(FP_MIXED_PRECISION)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else // defined(FP_MIXED_PRECISION)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+
+ // This part is used to return the index of the maximum value
+    // Note: DST_CHANNELS and DST_BATCH_SIZE apply to both the input and output tensors
+
+    // Note: the batch dimension does not contribute to the index offset
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ base_index = (uint)idx_out_c;
+
+ base_index += VEC_OFFS(uint, VEC_SIZE);
+
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
+
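+    // Pairwise arg-max reduction (explanatory note): the first two selects keep, per
+    // vector lane, the index of the larger value within (data0, data1) and within
+    // (data2, data3); the last select keeps the index of the overall maximum.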
+ index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
+ index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
+ index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
+
+ __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n *
+ indices_stride_w;
+
+ // Store result
+ STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
+#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl
new file mode 100644
index 0000000000..46268a4a88
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(INITIAL_VALUE)
+#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \
+ { \
+ const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \
+ const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \
+ res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_TYPE(VEC_SIZE)); \
+ }
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+/** Performs a pooling layer of size MxN. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=uchar. Supported data types are QASYMM8/QASYMM8_SIGNED
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=int
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ * @note If the output has to be requantized, -DOFFSET_IN1, -DOFFSET_OUT, -DSCALE_IN1 and -DSCALE_OUT must be passed at compile time
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_quantized_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+    // Note: If C is not a multiple of VEC_SIZE, we shift back by VEC_SIZE_LEFTOVER elements so that the work-item with get_global_id(0) == 0 computes the leftover elements
+    // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int offset_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
+ int idx_out_w = get_global_id(1);
+#if DST_BATCH_SIZE != 1
+ // If batch size != 1, the batch size dimension is collapsed over the height dimension
+ int idx_out_h = get_global_id(2) % DST_HEIGHT;
+ int idx_out_n = get_global_id(2) / DST_HEIGHT;
+#else //DST_BATCH_SIZE != 1
+ int idx_out_h = get_global_id(2);
+ int idx_out_n = 0;
+#endif // DST_BATCH_SIZE != 1
+
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + offset_c + idx_out_n * input_stride_w;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + offset_c + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * output_stride_w;
+
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ int filter_size = 0;
+#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING)
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = INITIAL_VALUE;
+
+ for(int y = pool_y_s; y < pool_y_e; ++y)
+ {
+ for(int x = pool_x_s; x < pool_x_e; ++x)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data;
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
+ data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+
+ res0 = POOL_OP(res0, data0);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ filter_size++;
+#endif // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ }
+ }
+
+#if defined(POOL_AVG)
+ res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size;
+#endif // defined(POOL_AVG)
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+ REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+ // Store result
+ STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
+}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/reorg_layer.cl b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl
new file mode 100644
index 0000000000..a340b0b8a2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
+
+#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi) \
+ ({ \
+ int offset = zo / (int)SRC_DEPTH; \
+ xi = xo * (int)STRIDE + offset % (int)STRIDE; \
+ yi = yo * (int)STRIDE + offset / (int)STRIDE; \
+ zi = zo % SRC_DEPTH; \
+ })
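+
+// Illustrative example (values assumed): with STRIDE = 2 and SRC_DEPTH = 4, the
+// output coordinate (xo, yo, zo) = (1, 1, 5) gives offset = 5 / 4 = 1, hence
+// xi = 1 * 2 + (1 % 2) = 3, yi = 1 * 2 + (1 / 2) = 2 and zi = 5 % 4 = 1.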
+
+/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
+ * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reorg_layer_nhwc(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ int xo = get_global_id(1);
+ int yo = get_global_id(2);
+ int zo = get_global_id(0);
+ int xi, yi, zi;
+
+ CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
+
+ int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
+}
+#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
new file mode 100644
index 0000000000..e071b0f192
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(SCALE_NEAREST_NEIGHBOUR)
+//! @cond Doxygen_Suppress
+/** Performs scale on a tensor by interpolating with the NEAREST NEIGHBOUR method. (NHWC)
+ *
+ * @note The sampling policy to use is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The border value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
+ * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
+ * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale_x The scale value to apply on the source width
+ * @param[in] scale_y The scale value to apply on the source height
+ */
+//! @endcond
+__kernel void scale_nearest_neighbour_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ const float scale_x,
+ const float scale_y)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
+
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ float xi_f = (xo * scale_x);
+ float yi_f = (yo * scale_y);
+#elif SAMPLING_POLICY_CENTER
+ float xi_f = ((xo + 0.5f) * scale_x);
+ float yi_f = ((yo + 0.5f) * scale_y);
+#else // SAMPLING_POLICY
+#error("Unsupported sampling policy");
+#endif // SAMPLING_POLICY
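+    // For example (illustrative numbers): with scale_x = 2.0f, output column xo = 3
+    // samples source column 6 under SAMPLING_POLICY_TOP_LEFT and column 7 under
+    // SAMPLING_POLICY_CENTER ((3 + 0.5f) * 2.0f = 7.0f), before the optional
+    // ALIGN_CORNERS rounding and the clamp below.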
+
+#ifdef ALIGN_CORNERS
+ xi_f = round(xi_f);
+ yi_f = round(yi_f);
+#endif // ALIGN_CORNERS
+
+ const int xi0 = clamp((int)xi_f, 0, (int)src_w - 1);
+ const int yi0 = clamp((int)yi_f, 0, (int)src_h - 1);
+
+ TILE(SRC_DATA_TYPE, 1, N0, in00);
+
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
+
+ TILE(uint, 1, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, in00, dst_indirect_y);
+}
+#endif /* SCALE_NEAREST_NEIGHBOUR */
+
+#if defined(SCALE_BILINEAR)
+//! @cond Doxygen_Suppress
+/** Performs scale on a tensor by interpolating with the BILINEAR method. (NHWC)
+ *
+ * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
+ * @note The sampling policy to use is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The border value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
+ * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
+ * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
+ *
+ * @note In case of QASYMM8, the following extra information must be passed at compile time:
+ * - The source offset e.g. -DOFFSET=4
+ * - The source scale e.g. -DSCALE=4
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale_x The scale value to apply on the source width
+ * @param[in] scale_y The scale value to apply on the source height
+ */
+//! @endcond
+__kernel void scale_bilinear_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ const float scale_x,
+ const float scale_y)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
+
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ float xi_f = (xo * scale_x);
+ float yi_f = (yo * scale_y);
+#elif SAMPLING_POLICY_CENTER
+ float xi_f = ((xo + 0.5f) * scale_x - 0.5f);
+ float yi_f = ((yo + 0.5f) * scale_y - 0.5f);
+#else // SAMPLING_POLICY
+#error("Unsupported sampling policy");
+#endif // SAMPLING_POLICY
+
+ const int xi = (int)floor(xi_f);
+ const int yi = (int)floor(yi_f);
+
+ TILE(SRC_DATA_TYPE, 1, N0, in00);
+ TILE(SRC_DATA_TYPE, 1, N0, in01);
+ TILE(SRC_DATA_TYPE, 1, N0, in10);
+ TILE(SRC_DATA_TYPE, 1, N0, in11);
+
+ // Initialize the tiles to CONSTANT_VALUE
+ in00[0].v = CONSTANT_VALUE;
+ in01[0].v = CONSTANT_VALUE;
+ in10[0].v = CONSTANT_VALUE;
+ in11[0].v = CONSTANT_VALUE;
+
+#ifndef BORDER_MODE_REPLICATE
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, src_w, src_h, 1, 1, true, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, src_w, src_h, 1, 1, true, in01);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, src_w, src_h, 1, 1, true, in10);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, src_w, src_h, 1, 1, true, in11);
+#else // BORDER_MODE_REPLICATE
+ const int xi0 = clamp(xi, 0, (int)src_w - 1);
+ const int yi0 = clamp(yi, 0, (int)src_h - 1);
+ const int xi1 = clamp(xi + 1, 0, (int)src_w - 1);
+ const int yi1 = clamp(yi + 1, 0, (int)src_h - 1);
+
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, src_w, src_h, 1, 1, false, in01);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, src_w, src_h, 1, 1, false, in10);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, src_w, src_h, 1, 1, false, in11);
+#endif // BORDER_MODE_REPLICATE
+
+ TILE(DST_DATA_TYPE, 1, N0, out);
+
+#if defined(IS_FLOATING_POINT)
+ const SRC_DATA_TYPE a = (SRC_DATA_TYPE)(xi_f - (float)xi);
+ const SRC_DATA_TYPE b = (SRC_DATA_TYPE)(1.f - a);
+ const SRC_DATA_TYPE a1 = (SRC_DATA_TYPE)(yi_f - (float)yi);
+ const SRC_DATA_TYPE b1 = (SRC_DATA_TYPE)(1.f - a1);
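+    // (a, a1) are the fractional distances of the sample point from (xi, yi) along
+    // x and y; the four bilinear weights b * b1, a * b1, b * a1 and a * a1 sum to 1.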
+
+ // Calculate the output
+ out[0].v = ((in00[0].v * b * b1) + (in01[0].v * a * b1) + (in10[0].v * b * a1) + (in11[0].v * a * a1));
+#else // defined(IS_FLOATING_POINT)
+
+ const float a = (xi_f - (float)xi);
+ const float b = (1.f - a);
+ const float a1 = (yi_f - (float)yi);
+ const float b1 = (1.f - a1);
+
+ out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) +
+ (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) +
+ (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) +
+ (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1),
+ VEC_DATA_TYPE(DST_DATA_TYPE, N0));
+#endif // defined(IS_FLOATING_POINT)
+
+ TILE(uint, 1, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y);
+}
+#endif /* SCALE_BILINEAR */
diff --git a/src/core/CL/cl_kernels/nhwc/space_to_batch.cl b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl
new file mode 100644
index 0000000000..695bd4c217
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32
+ * @param[in] paddings_stride_x Stride of the paddings tensor in X dimension (in bytes)
+ * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] paddings_stride_y Stride of the paddings tensor in Y dimension (in bytes)
+ * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] paddings_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nhwc(
+ TENSOR4D_DECLARATION(input),
+ IMAGE_DECLARATION(paddings),
+ VECTOR_DECLARATION(block_shape),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int pad_left_x = *((__global int *)offset(&pad, 0, 0));
+ const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
+ const int pad_left_y = *((__global int *)offset(&pad, 0, 1));
+ const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
+
+ int block_x = *((__global int *)vector_offset(&block, 0));
+ int block_y = *((__global int *)vector_offset(&block, 1));
+
+ const int out_x = get_global_id(1);
+ const int out_y = get_global_id(2);
+ const int z = get_global_id(0);
+
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
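+ // Illustrative example (values assumed): with BATCH_IN = 2, block_x = block_y = 2 and
+ // batch_id = 5, batch_id / BATCH_IN = 2, giving block offset (2 % 2, 2 / 2) = (0, 1),
+ // i.e. pos_x = out_x * 2 and pos_y = out_y * 2 + 1.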
+
+ if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
+ {
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - pad_left_x;
+ const int in_y = pos_y - pad_left_y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+ }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nhwc(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ int block_x = BLOCK_SHAPE_X;
+ int block_y = BLOCK_SHAPE_Y;
+
+ const int out_x = get_global_id(1);
+ const int out_y = get_global_id(2);
+ const int z = get_global_id(0);
+
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
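+ // Example (values assumed): with PAD_LEFT_X = 1 and WIDTH_IN = 4, only pos_x in [1, 5)
+ // maps back to a valid input column (in_x = pos_x - 1); positions outside that window lie
+ // in the padded region and are skipped by the check below.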
+ if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
+ {
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - PAD_LEFT_X;
+ const int in_y = pos_y - PAD_LEFT_Y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+ }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
diff --git a/src/core/CL/cl_kernels/nhwc/space_to_depth.cl b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl
new file mode 100644
index 0000000000..10aac6d5fb
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Space to depth transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor channel size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_depth_nhwc(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0) % r;
+
+ const int in_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
+ const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
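+ // Worked example (illustrative values): with CHANNEL_SIZE = 8 and BLOCK_SHAPE = 2, r = 2;
+ // for get_global_id(0) = 5, z = 1 and 5 / r = 2, so the block offset is (2 % 2, 2 / 2) = (0, 1)
+ // and the element is read from input position (x * 2, y * 2 + 1).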
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id));
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
diff --git a/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl
new file mode 100644
index 0000000000..1393537283
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+#include "tile_helpers.h"
+
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the transposed convolution.
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED
+ * @note The transposed convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The transposed convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
+ * @note The tensor type (currently only "BUFFER" is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type (currently only "BUFFER" is supported) of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type (currently only "BUFFER" is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The data type of the bias tensor must be passed at compile time using -DBIA_DATA_TYPE (e.g. -DBIA_DATA_TYPE=float)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note If bias exists, the compile time argument -DHAS_BIAS should be passed
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16
+ *
+ * @note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time:
+ * - -DIS_QUANTIZED
+ * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234
+ * - The destination quantization shift e.g. -DDST_SHIFT=4
+ * - The destination offset e.g. -DDST_OFFSET=4
+ * - The source offset e.g. -DSRC_OFFSET=4
+ * - The weights offset e.g. -DWEI_OFFSET=4
+ * - The quantized zero value e.g. -DZERO_VALUE=4
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels (IFM) dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels (OFM) dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels (IFM) dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches (OFM) dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr (if F32/F16)
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void transposed_convolution_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IDST_CHANNELS DST_CHANNELS
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+#if defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE cq
+#else // defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE c
+#endif // defined(IS_QUANTIZED)
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ // .v = access the whole vector (OpenCL vector)
+ // .s[x] = access the vector element at position x (scalar access)
+ TILE(int, 1, M0, xi);
+ TILE(int, 1, M0, yi);
+ TILE(int, 1, M0, xu);
+ TILE(int, 1, M0, yu);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xu[0].s[i] = ((mout + i) % _IDST_WIDTH) - PAD_LEFT;
+ yu[0].s[i] = ((mout + i) / _IDST_WIDTH) - PAD_TOP;
+ xi[0].s[i] = ceil(xu[0].s[i] / (float)STRIDE_X);
+ yi[0].s[i] = ceil(yu[0].s[i] / (float)STRIDE_Y);
+ })
+
+ // Initialize the accumulators
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+ // Flipped indices
+ const int x_start = _IWEI_WIDTH - (xi[0].s[0] * STRIDE_X - xu[0].s[0]) - 1;
+ const int y_start = _IWEI_HEIGHT - (yi[0].s[0] * STRIDE_Y - yu[0].s[0]) - 1;
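+ // Worked example (illustrative values): with STRIDE_X = 2, _IWEI_WIDTH = 3 and xu = 1,
+ // xi = ceil(1 / 2.0f) = 1 and x_start = 3 - (1 * 2 - 1) - 1 = 1, so the inner loop below
+ // visits the flipped weight column xk = 1 only (the next step, xk = -1, exits the loop).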
+
+ for(int yk = y_start, yi_step = 0; yk >= 0; yk -= STRIDE_Y, ++yi_step)
+ {
+ for(int xk = x_start, xi_step = 0; xk >= 0; xk -= STRIDE_X, ++xi_step)
+ {
+ const int weights_y = cout * _IY_MULTIPLIER + yk * _IWEI_WIDTH + xk;
+
+ TILE(int, 1, M0, my);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ int x_s = xi[0].s[i] + xi_step;
+ int y_s = yi[0].s[i] + yi_step;
 my[0].s[i] = x_s + y_s * _ISRC_WIDTH;
+ my[0].s[i] = my[0].s[i] + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
+ my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
+ my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
+ my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
+ my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
+ })
+
+ int ck = 0;
+ for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
+ {
+ TILE(SRC_DATA_TYPE, M0, K0, a);
+ TILE(WEI_DATA_TYPE, N0, K0, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, weights_y, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+#if defined(IS_QUANTIZED)
+ // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
+#endif // defined(IS_QUANTIZED)
+ }
+
+ // This #if directive should be removed in case of dynamic tensor support
+#if defined(LEFTOVER_LOOP)
+ // Left-over accumulations
+ for(; ck < _ISRC_CHANNELS; ++ck)
+ {
+ TILE(SRC_DATA_TYPE, M0, 1, a);
+ TILE(WEI_DATA_TYPE, N0, 1, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, 1, BUFFER, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, weights_y, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+
+#if defined(IS_QUANTIZED)
+ // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
+#endif // defined(IS_QUANTIZED)
+ }
+#endif // defined(LEFTOVER_LOOP)
+ }
+ }
+
+#if defined(IS_QUANTIZED)
+ const int total_pixels = floor((1 + y_start / (float)STRIDE_Y)) * floor(1 + x_start / (float)STRIDE_X);
+
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (total_pixels * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
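+ // Example (values assumed): with x_start = y_start = 3 and STRIDE_X = STRIDE_Y = 2, each loop
+ // above runs twice (xk/yk = 3, then 1), so total_pixels = floor(1 + 3 / 2.0f) * floor(1 + 3 / 2.0f) = 4,
+ // i.e. the number of (xk, yk) taps whose SRC_OFFSET * WEI_OFFSET contribution is folded in here.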
+#endif // defined(IS_QUANTIZED)
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+
+#endif // HAS_BIAS
+
+#if defined(IS_QUANTIZED)
+
+ TILE(DST_DATA_TYPE, M0, N0, cq);
+
+ // Quantize the tile
+ T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+#endif // defined(IS_QUANTIZED)
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
+
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_WIDTH
+#undef _ISRC_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IDST_CHANNELS
+#undef _IY_MULTIPLIER
+}
diff --git a/src/core/CL/cl_kernels/nhwc/upsample_layer.cl b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl
new file mode 100644
index 0000000000..74b9674a88
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function applies upsampling to an input image. (NHWC)
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: All
+ * -# -DVEC_SIZE_IN = Input vector size
+ * -# -DVEC_SIZE_OUT = Output vector size
+ * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this might need to step back a bit)
+ * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this might need to step back a bit)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: All
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nhwc(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN);
+ const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+ src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
+ dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
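+ // Example (illustrative values): with VEC_SIZE_IN = 16 and LAST_ACCESSED_X_IN = 48, the
+ // work-item with get_global_id(0) = 4 would start at x = 64; it is shifted back by 16
+ // elements so the vector accesses below stay in bounds, overlapping the previous work-item.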
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)src.ptr);
+
+ vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0));
+ vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
+ vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1));
+ vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1));
+#else // !(defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT))
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)) = *((__global DATA_TYPE *)src.ptr);
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl
new file mode 100644
index 0000000000..45fbc1b641
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl
@@ -0,0 +1,1107 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define OUTPUT_ROW_2x2_7x7(out, tmp) \
+ ({ \
+ out.s0 = -tmp.s0 / 36.f; \
+ out.s1 = (tmp.s0 - tmp.s1 + tmp.s2 - tmp.s3 + tmp.s4 - tmp.s5 + tmp.s6) / 48.f; \
+ out.s2 = (tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3 + tmp.s4 + tmp.s5 + tmp.s6) / 48.f; \
+ out.s3 = (-tmp.s0 + 2.f * tmp.s1 - 4.f * tmp.s2 + 8.f * tmp.s3 - 16.f * tmp.s4 + 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \
+ out.s4 = (-tmp.s0 - 2.f * tmp.s1 - 4.f * tmp.s2 - 8.f * tmp.s3 - 16.f * tmp.s4 - 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \
+ out.s5 = (tmp.s0 - 3.f * tmp.s1 + 9.f * tmp.s2 - 27.f * tmp.s3 + 81.f * tmp.s4 - 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
+ out.s6 = (tmp.s0 + 3.f * tmp.s1 + 9.f * tmp.s2 + 27.f * tmp.s3 + 81.f * tmp.s4 + 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
+ out.s7 = tmp.s6; \
+ })
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NHWC and the output tile is 4x4/4x1/1x4
+ *
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_4x4_3x3_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
+
+ // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 2 * src_stride_y));
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ // Row 0
+ DATA_TYPE out00, out01, out02, out03, out04, out05;
+ out00 = (w00) / 16.f;
+ out01 = (-w00 - w01 - w02) / 24.f;
+ out02 = (-w00 + w01 - w02) / 24.f;
+ out03 = (w00 + 2.f * w01 + 4.f * w02) / 96.f;
+ out04 = (w00 - 2.f * w01 + 4.f * w02) / 96.f;
+ out05 = (w02) / 4.f;
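+ // Equivalent matrix view (derived from the expressions above): (out00, ..., out05) equals
+ // G * (w00, w01, w02)^T, where the rows of G are [1/16, 0, 0], [-1/24, -1/24, -1/24],
+ // [-1/24, 1/24, -1/24], [1/96, 1/48, 1/24], [1/96, -1/48, 1/24] and [0, 0, 1/4].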
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ DATA_TYPE out10, out11, out12, out13, out14, out15;
+ out10 = (-w00 - w10 - w20) / 24.f;
+ out11 = (w00 + w10 + w20 + w01 + w11 + w21 + w02 + w12 + w22) / 36.f;
+ out12 = (w00 + w10 + w20 - w01 - w11 - w21 + w02 + w12 + w22) / 36.f;
+ out13 = (-w00 - w10 - w20 + 2.f * (-w01 - w11 - w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
+ out14 = (-w00 - w10 - w20 + 2.f * (w01 + w11 + w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
+ out15 = (-w02 - w12 - w22) / 6.f;
+
+ // Row 2
+ DATA_TYPE out20, out21, out22, out23, out24, out25;
+ out20 = (-w00 + w10 - w20) / 24.f;
+ out21 = (w00 - w10 + w20 + w01 - w11 + w21 + w02 - w12 + w22) / 36.f;
+ out22 = (w00 - w10 + w20 - w01 + w11 - w21 + w02 - w12 + w22) / 36.f;
+ out23 = (-w00 + w10 - w20 + 2.f * (-w01 + w11 - w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
+ out24 = (-w00 + w10 - w20 + 2.f * (w01 - w11 + w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
+ out25 = (-w02 + w12 - w22) / 6.f;
+
+ // Row 3
+ DATA_TYPE out30, out31, out32, out33, out34, out35;
+ out30 = (w00 + 2.f * w10 + 4.f * w20) / 96.f;
+ out31 = (-w00 - 2.f * w10 - 4.f * w20 - w01 - 2.f * w11 - 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
+ out32 = (-w00 - 2.f * w10 - 4.f * w20 + w01 + 2.f * w11 + 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
+ out33 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (w01 + 2.f * w11 + 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
+ out34 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (-w01 - 2.f * w11 - 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
+ out35 = (w02 + 2.f * w12 + 4.f * w22) / 24.f;
+
+ // Row 4
+ DATA_TYPE out40, out41, out42, out43, out44, out45;
+ out40 = (w00 - 2.f * w10 + 4.f * w20) / 96.f;
+ out41 = (-w00 + 2.f * w10 - 4.f * w20 - w01 + 2.f * w11 - 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
+ out42 = (-w00 + 2.f * w10 - 4.f * w20 + w01 - 2.f * w11 + 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
+ out43 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (w01 - 2.f * w11 + 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
+ out44 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (-w01 + 2.f * w11 - 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
+ out45 = (w02 - 2.f * w12 + 4.f * w22) / 24.f;
+
+ // Row 5
+ DATA_TYPE out50, out51, out52, out53, out54, out55;
+ out50 = (w20) / 4.f;
+ out51 = (-w20 - w21 - w22) / 6.f;
+ out52 = (-w20 + w21 - w22) / 6.f;
+ out53 = (w20 + 2.f * w21 + 4.f * w22) / 24.f;
+ out54 = (w20 - 2.f * w21 + 4.f * w22) / 24.f;
+ out55 = (w22);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int x0 = get_global_id(2); // idx filter
+ int y0 = get_global_id(0); // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ // 36 channels for 3x3 kernels
+ // 6 channels for 3x1 or 1x3 kernels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out00;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out01;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out02;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out03;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out04;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out05;
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out10;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out11;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out12;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out13;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out14;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out15;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out20;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out21;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out22;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out23;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out24;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out25;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out30;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out31;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out32;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out33;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out34;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out35;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out40;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out41;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out42;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out43;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out44;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out45;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out50;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out51;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out52;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out53;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out54;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out55;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NHWC and the output tile is 4x4/4x1 or 1x4
+ *
+ * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_4x4_5x5_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Load the values from the input tensor
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Load the values from the input tensor
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+ out0.s0 = w00;
+ out0.s1 = -2.f * (w00 + w01 + w02 + w03 + w04) / 9.f;
+ out0.s2 = -2.f * (w00 - w01 + w02 - w03 + w04) / 9.f;
+ out0.s3 = (w00 + 2.f * w01 + 4.f * w02 + 8.f * w03 + 16.f * w04) / 90.f;
+ out0.s4 = (w00 - 2.f * w01 + 4.f * w02 - 8.f * w03 + 16.f * w04) / 90.f;
+ out0.s5 = (16.f * w00 + 8.f * w01 + 4.f * w02 + 2.f * w03 + w04) / 180.f;
+ out0.s6 = (16.f * w00 - 8.f * w01 + 4.f * w02 - 2.f * w03 + w04) / 180.f;
+ out0.s7 = w04;
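+ // Sanity check (illustrative): for the impulse filter w00 = 1, w01..w04 = 0, this row
+ // evaluates to (1, -2/9, -2/9, 1/90, 1/90, 16/180, 16/180, 0), the first column of the
+ // F(4x4, 5x5) filter-transform matrix G.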
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+ out1.s0 = -2.f * (w00 + w10 + w20 + w30 + w40) / 9.f;
+ out1.s1 = 4.f * ((w00 + w10 + w20 + w30 + w40) + (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) + (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
+ out1.s2 = 4.f * ((w00 + w10 + w20 + w30 + w40) - (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) - (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
+ out1.s3 = -((w00 + w10 + w20 + w30 + w40) + 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
+ (w04 + w14 + w24 + w34 + w44)) / 405.f;
+ out1.s4 = -((w00 + w10 + w20 + w30 + w40) - 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
+ (w04 + w14 + w24 + w34 + w44)) / 405.f;
+ out1.s5 = -(16.f * (w00 + w10 + w20 + w30 + w40) + 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 2.f * (w03 + w13 + w23 + w33 + w43) +
+ (w04 + w14 + w24 + w34 + w44)) / 810.f;
+ out1.s6 = -(16.f * (w00 + w10 + w20 + w30 + w40) - 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 2.f * (w03 + w13 + w23 + w33 + w43) +
+ (w04 + w14 + w24 + w34 + w44)) / 810.f;
+ out1.s7 = -2.f * (w04 + w14 + w24 + w34 + w44) / 9.f;
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+ out2.s0 = -2.f * (w00 - w10 + w20 - w30 + w40) / 9.f;
+ out2.s1 = 4.f * ((w00 - w10 + w20 - w30 + w40) + (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) + (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
+ out2.s2 = 4.f * ((w00 - w10 + w20 - w30 + w40) - (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) - (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
+ out2.s3 = -((w00 - w10 + w20 - w30 + w40) + 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
+ (w04 - w14 + w24 - w34 + w44)) / 405.f;
+ out2.s4 = -((w00 - w10 + w20 - w30 + w40) - 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
+ (w04 - w14 + w24 - w34 + w44)) / 405.f;
+ out2.s5 = -(16.f * (w00 - w10 + w20 - w30 + w40) + 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 2.f * (w03 - w13 + w23 - w33 + w43) +
+ (w04 - w14 + w24 - w34 + w44)) / 810.f;
+ out2.s6 = -(16.f * (w00 - w10 + w20 - w30 + w40) - 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 2.f * (w03 - w13 + w23 - w33 + w43) +
+ (w04 - w14 + w24 - w34 + w44)) / 810.f;
+ out2.s7 = -2.f * (w04 - w14 + w24 - w34 + w44) / 9.f;
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+ out3.s0 = (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) / 90.f;
+ out3.s1 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) +
+ (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
+ out3.s2 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) -
+ (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
+ out3.s3 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 8.f
+ * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
+ out3.s4 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 8.f
+ * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
+ out3.s5 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
+ (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
+ out3.s6 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
+ (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
+ out3.s7 = (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44) / 90.f;
+
+ // Row 4
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+ out4.s0 = (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) / 90.f;
+ out4.s1 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) +
+ (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
+ out4.s2 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) -
+ (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
+ out4.s3 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 8.f
+ * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
+ out4.s4 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 8.f
+ * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
+ out4.s5 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
+ (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
+ out4.s6 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
+ (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
+ out4.s7 = (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44) / 90.f;
+
+ // Row 5
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+ out5.s0 = (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) / 180.f;
+ out5.s1 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) +
+ (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
+ out5.s2 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) -
+ (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
+ out5.s3 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 8.f
+ * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
+ out5.s4 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 8.f
+ * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
+ out5.s5 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
+ (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
+ out5.s6 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
+ (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
+ out5.s7 = (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44) / 180.f;
+
+ // Row 6
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out6 = 0.0f;
+ out6.s0 = (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) / 180.f;
+ out6.s1 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) +
+ (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
+ out6.s2 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) -
+ (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
+ out6.s3 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 8.f
+ * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
+ out6.s4 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 8.f
+ * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
+ out6.s5 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
+ (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
+ out6.s6 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
+ (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
+ out6.s7 = (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44) / 180.f;
+
+ // Row 7
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out7 = 0.0f;
+ out7.s0 = w40;
+ out7.s1 = -2.f * (w40 + w41 + w42 + w43 + w44) / 9.f;
+ out7.s2 = -2.f * (w40 - w41 + w42 - w43 + w44) / 9.f;
+ out7.s3 = (w40 + 2.f * w41 + 4.f * w42 + 8.f * w43 + 16.f * w44) / 90.f;
+ out7.s4 = (w40 - 2.f * w41 + 4.f * w42 - 8.f * w43 + 16.f * w44) / 90.f;
+ out7.s5 = (16.f * w40 + 8.f * w41 + 4.f * w42 + 2.f * w43 + w44) / 180.f;
+ out7.s6 = (16.f * w40 - 8.f * w41 + 4.f * w42 - 2.f * w43 + w44) / 180.f;
+ out7.s7 = w44;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int x0 = get_global_id(2); // idx filter
+ int y0 = get_global_id(0); // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
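+    // Filters are indexed along X and channels along Y; each of the 8 (1D cases) or 64 (2D case) transformed coefficients is written to its own Z-plane below.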
+
+ // Store the values across the channels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+ *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+ *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+ *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+ *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+ *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+ *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+ *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+ *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+ *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+ *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+ *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+ *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+ *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+ *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+ *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+ *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+ *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+ *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+ *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+ *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC)
+
+/** This OpenCL kernel performs the Winograd filter transform 7x7, 7x1 or 1x7 when the data layout is NHWC and the output tile is 2x2, 2x1 or 1x2
+ *
+ * @note If this kernel is used to perform Winograd filter transform 7x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x7, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_2x2_7x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Load the values from the input tensor
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Load the values from the input tensor
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+ DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w15 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w16 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w25 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w26 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w35 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w36 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w45 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w46 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w50 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w51 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w52 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w53 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w54 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w55 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w56 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w60 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w61 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w62 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w63 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w64 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w65 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w66 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 6 * src_stride_y));
+
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ tmp = 0.0f;
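+    // tmp holds one row of the intermediate G * W product; OUTPUT_ROW_2x2_7x7 applies the second (column) pass to turn it into a row of the transformed filter.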
+
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+
+ out0.s0 = -w00 / 36.0f;
+ out0.s1 = (w00 - w01 + w02 - w03 + w04 - w05 + w06) / 48.f;
+ out0.s2 = (w00 + w01 + w02 + w03 + w04 + w05 + w06) / 48.f;
+ out0.s3 = (-w00 + 2.f * w01 - 4.f * w02 + 8.f * w03 - 16.f * w04 + 32.f * w05 - 64.f * w06) / 120.f;
+ out0.s4 = (-w00 - 2.f * w01 - 4.f * w02 - 8.f * w03 - 16.f * w04 - 32.f * w05 - 64.f * w06) / 120.f;
+ out0.s5 = (w00 - 3.f * w01 + 9.f * w02 - 27.f * w03 + 81.f * w04 - 243.f * w05 + 729.f * w06) / 720.f;
+ out0.s6 = (w00 + 3.f * w01 + 9.f * w02 + 27.f * w03 + 81.f * w04 + 243.f * w05 + 729.f * w06) / 720.f;
+ out0.s7 = w06;
+
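+    // Scale the whole of row 0 by -1/36 with a single vector-wide divide.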
+    out0 /= (VEC_DATA_TYPE(DATA_TYPE, 8))(-36.f);
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+
+ tmp.s0 = (w00 - w10 + w20 - w30 + w40 - w50 + w60) / 48.f;
+ tmp.s1 = (w01 - w11 + w21 - w31 + w41 - w51 + w61) / 48.f;
+ tmp.s2 = (w02 - w12 + w22 - w32 + w42 - w52 + w62) / 48.f;
+ tmp.s3 = (w03 - w13 + w23 - w33 + w43 - w53 + w63) / 48.f;
+ tmp.s4 = (w04 - w14 + w24 - w34 + w44 - w54 + w64) / 48.f;
+ tmp.s5 = (w05 - w15 + w25 - w35 + w45 - w55 + w65) / 48.f;
+ tmp.s6 = (w06 - w16 + w26 - w36 + w46 - w56 + w66) / 48.f;
+
+ OUTPUT_ROW_2x2_7x7(out1, tmp);
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+
+ tmp.s0 = (w00 + w10 + w20 + w30 + w40 + w50 + w60) / 48.f;
+ tmp.s1 = (w01 + w11 + w21 + w31 + w41 + w51 + w61) / 48.f;
+ tmp.s2 = (w02 + w12 + w22 + w32 + w42 + w52 + w62) / 48.f;
+ tmp.s3 = (w03 + w13 + w23 + w33 + w43 + w53 + w63) / 48.f;
+ tmp.s4 = (w04 + w14 + w24 + w34 + w44 + w54 + w64) / 48.f;
+ tmp.s5 = (w05 + w15 + w25 + w35 + w45 + w55 + w65) / 48.f;
+ tmp.s6 = (w06 + w16 + w26 + w36 + w46 + w56 + w66) / 48.f;
+
+ OUTPUT_ROW_2x2_7x7(out2, tmp);
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+
+ tmp.s0 = (-w00 + 2.f * w10 - 4.f * w20 + 8.f * w30 - 16.f * w40 + 32.f * w50 - 64.f * w60) / 120.f;
+ tmp.s1 = (-w01 + 2.f * w11 - 4.f * w21 + 8.f * w31 - 16.f * w41 + 32.f * w51 - 64.f * w61) / 120.f;
+ tmp.s2 = (-w02 + 2.f * w12 - 4.f * w22 + 8.f * w32 - 16.f * w42 + 32.f * w52 - 64.f * w62) / 120.f;
+ tmp.s3 = (-w03 + 2.f * w13 - 4.f * w23 + 8.f * w33 - 16.f * w43 + 32.f * w53 - 64.f * w63) / 120.f;
+ tmp.s4 = (-w04 + 2.f * w14 - 4.f * w24 + 8.f * w34 - 16.f * w44 + 32.f * w54 - 64.f * w64) / 120.f;
+ tmp.s5 = (-w05 + 2.f * w15 - 4.f * w25 + 8.f * w35 - 16.f * w45 + 32.f * w55 - 64.f * w65) / 120.f;
+ tmp.s6 = (-w06 + 2.f * w16 - 4.f * w26 + 8.f * w36 - 16.f * w46 + 32.f * w56 - 64.f * w66) / 120.f;
+
+ OUTPUT_ROW_2x2_7x7(out3, tmp);
+
+ // Row 4
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+
+ tmp.s0 = (-w00 - 2.f * w10 - 4.f * w20 - 8.f * w30 - 16.f * w40 - 32.f * w50 - 64.f * w60) / 120.f;
+ tmp.s1 = (-w01 - 2.f * w11 - 4.f * w21 - 8.f * w31 - 16.f * w41 - 32.f * w51 - 64.f * w61) / 120.f;
+ tmp.s2 = (-w02 - 2.f * w12 - 4.f * w22 - 8.f * w32 - 16.f * w42 - 32.f * w52 - 64.f * w62) / 120.f;
+ tmp.s3 = (-w03 - 2.f * w13 - 4.f * w23 - 8.f * w33 - 16.f * w43 - 32.f * w53 - 64.f * w63) / 120.f;
+ tmp.s4 = (-w04 - 2.f * w14 - 4.f * w24 - 8.f * w34 - 16.f * w44 - 32.f * w54 - 64.f * w64) / 120.f;
+ tmp.s5 = (-w05 - 2.f * w15 - 4.f * w25 - 8.f * w35 - 16.f * w45 - 32.f * w55 - 64.f * w65) / 120.f;
+ tmp.s6 = (-w06 - 2.f * w16 - 4.f * w26 - 8.f * w36 - 16.f * w46 - 32.f * w56 - 64.f * w66) / 120.f;
+
+ OUTPUT_ROW_2x2_7x7(out4, tmp);
+
+ // Row 5
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+
+ tmp.s0 = (w00 - 3.f * w10 + 9.f * w20 - 27.f * w30 + 81.f * w40 - 243.f * w50 + 729.f * w60) / 720.f;
+ tmp.s1 = (w01 - 3.f * w11 + 9.f * w21 - 27.f * w31 + 81.f * w41 - 243.f * w51 + 729.f * w61) / 720.f;
+ tmp.s2 = (w02 - 3.f * w12 + 9.f * w22 - 27.f * w32 + 81.f * w42 - 243.f * w52 + 729.f * w62) / 720.f;
+ tmp.s3 = (w03 - 3.f * w13 + 9.f * w23 - 27.f * w33 + 81.f * w43 - 243.f * w53 + 729.f * w63) / 720.f;
+ tmp.s4 = (w04 - 3.f * w14 + 9.f * w24 - 27.f * w34 + 81.f * w44 - 243.f * w54 + 729.f * w64) / 720.f;
+ tmp.s5 = (w05 - 3.f * w15 + 9.f * w25 - 27.f * w35 + 81.f * w45 - 243.f * w55 + 729.f * w65) / 720.f;
+ tmp.s6 = (w06 - 3.f * w16 + 9.f * w26 - 27.f * w36 + 81.f * w46 - 243.f * w56 + 729.f * w66) / 720.f;
+
+ OUTPUT_ROW_2x2_7x7(out5, tmp);
+
+ // Row 6
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out6 = 0.0f;
+
+ tmp.s0 = (w00 + 3.f * w10 + 9.f * w20 + 27.f * w30 + 81.f * w40 + 243.f * w50 + 729.f * w60) / 720.f;
+ tmp.s1 = (w01 + 3.f * w11 + 9.f * w21 + 27.f * w31 + 81.f * w41 + 243.f * w51 + 729.f * w61) / 720.f;
+ tmp.s2 = (w02 + 3.f * w12 + 9.f * w22 + 27.f * w32 + 81.f * w42 + 243.f * w52 + 729.f * w62) / 720.f;
+ tmp.s3 = (w03 + 3.f * w13 + 9.f * w23 + 27.f * w33 + 81.f * w43 + 243.f * w53 + 729.f * w63) / 720.f;
+ tmp.s4 = (w04 + 3.f * w14 + 9.f * w24 + 27.f * w34 + 81.f * w44 + 243.f * w54 + 729.f * w64) / 720.f;
+ tmp.s5 = (w05 + 3.f * w15 + 9.f * w25 + 27.f * w35 + 81.f * w45 + 243.f * w55 + 729.f * w65) / 720.f;
+ tmp.s6 = (w06 + 3.f * w16 + 9.f * w26 + 27.f * w36 + 81.f * w46 + 243.f * w56 + 729.f * w66) / 720.f;
+
+ OUTPUT_ROW_2x2_7x7(out6, tmp);
+
+ // Row 7
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out7 = 0.0f;
+
+ tmp.s0 = w60;
+ tmp.s1 = w61;
+ tmp.s2 = w62;
+ tmp.s3 = w63;
+ tmp.s4 = w64;
+ tmp.s5 = w65;
+ tmp.s6 = w66;
+
+ OUTPUT_ROW_2x2_7x7(out7, tmp);
+
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int x0 = get_global_id(2); // idx filter
+ int y0 = get_global_id(0); // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+ *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+ *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+ *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+ *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+ *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+ *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+ *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+ *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+ *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+ *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+ *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+ *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+ *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+ *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+ *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+ *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+ *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+ *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+ *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+ *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1
+ *
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_4x1_3x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
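+    // Thin wrapper: with -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL defined, the generic 3x3 kernel body reduces to the 1D 3x1 path, so all arguments are forwarded unchanged.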
+ winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NHWC and the output tile is 4x1
+ *
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_4x1_5x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ winograd_filter_transform_4x4_5x5_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 7x1 when the data layout is NHWC and the output tile is 2x1
+ *
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_2x1_7x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC)
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+#if defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NHWC and the output tile is 1x4
+ *
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x3_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NHWC and the output tile is 1x4
+ *
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x5_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ winograd_filter_transform_4x4_5x5_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 1x7 when the data layout is NHWC and the output tile is 1x2
+ *
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_1x2_1x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC)
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
new file mode 100644
index 0000000000..7341336b92
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
@@ -0,0 +1,1050 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = tmp.s2 - (DATA_TYPE)4.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = tmp.s1 - (DATA_TYPE)4.25f * tmp.s3 + tmp.s5; \
+ comm_fact.s2 = (DATA_TYPE)2.5f * tmp.s3; \
+ comm_fact.s3 = (DATA_TYPE)0.5f * tmp.s1 + (DATA_TYPE)2.f * tmp.s5 - comm_fact.s2; \
+ comm_fact.s4 = (DATA_TYPE)0.25f * tmp.s2 - (DATA_TYPE)1.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = (DATA_TYPE)4.f * tmp.s2 + tmp.s6 - (DATA_TYPE)5.f * tmp.s4; \
+ comm_fact.s6 = (DATA_TYPE)2.f * tmp.s1 + (DATA_TYPE)0.5f * tmp.s5 - comm_fact.s2; \
+ \
+ out.s0 = tmp.s0 - tmp.s6 + (DATA_TYPE)5.25f * tmp.s4 - (DATA_TYPE)5.25f * tmp.s2; \
+ out.s1 = comm_fact.s0 + comm_fact.s1; \
+ out.s2 = comm_fact.s0 - comm_fact.s1; \
+ out.s3 = comm_fact.s3 + comm_fact.s4; \
+ out.s4 = comm_fact.s4 - comm_fact.s3; \
+ out.s5 = comm_fact.s5 + comm_fact.s6; \
+ out.s6 = comm_fact.s5 - comm_fact.s6; \
+ out.s7 = tmp.s7 - tmp.s1 + (DATA_TYPE)5.25f * tmp.s3 - (DATA_TYPE)5.25f * tmp.s5; \
+ })
+
+#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
+    ({ \
+        comm_fact.s0 = (DATA_TYPE)36.0f * tmp.s2 - (DATA_TYPE)13.0f * tmp.s4 + tmp.s6; \
+        comm_fact.s1 = (DATA_TYPE)36.0f * tmp.s1 - (DATA_TYPE)13.0f * tmp.s3 + tmp.s5; \
+        comm_fact.s2 = (DATA_TYPE)9.0f * tmp.s2 - (DATA_TYPE)10.0f * tmp.s4 + tmp.s6; \
+        comm_fact.s3 = (DATA_TYPE)18.0f * tmp.s1 - (DATA_TYPE)20.0f * tmp.s3 + (DATA_TYPE)2.0f * tmp.s5; \
+        comm_fact.s4 = (DATA_TYPE)4.0f * tmp.s2 - (DATA_TYPE)5.0f * tmp.s4 + tmp.s6; \
+        comm_fact.s5 = (DATA_TYPE)12.0f * tmp.s1 - (DATA_TYPE)15.0f * tmp.s3 + (DATA_TYPE)3.0f * tmp.s5; \
+        out.s0 = -(DATA_TYPE)36.0f * tmp.s0 + (DATA_TYPE)49.0f * tmp.s2 - (DATA_TYPE)14.0f * tmp.s4 + tmp.s6; \
+        out.s1 = comm_fact.s0 - comm_fact.s1; \
+        out.s2 = comm_fact.s0 + comm_fact.s1; \
+        out.s3 = comm_fact.s2 - comm_fact.s3; \
+        out.s4 = comm_fact.s2 + comm_fact.s3; \
+        out.s5 = comm_fact.s4 - comm_fact.s5; \
+        out.s6 = comm_fact.s4 + comm_fact.s5; \
+        out.s7 = -(DATA_TYPE)36.0f * tmp.s1 + (DATA_TYPE)49.0f * tmp.s3 - (DATA_TYPE)14.0f * tmp.s5 + tmp.s7; \
+    })
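+// Each OUTPUT_ROW_* helper above computes one 8-element transform row from an 8-element input row, hoisting subexpressions shared between output elements into comm_fact.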
+
+#if defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+
+#if defined(NHWC)
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size is 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+ int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+ int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+ x -= PAD_LEFT;
+ y -= PAD_TOP;
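+    // (x, y) is now the top-left input coordinate of this tile, shifted back by the convolution padding.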
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 6, N0, in);
+ TILE(DATA_TYPE, 6, N0, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ in[i].v = 0;
+ })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 1, 6, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 6, 1, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+ TILE(DATA_TYPE, 6, N0, com);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ in[i].v *= (DATA_TYPE)4.0f;
+ })
+
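+    // Single-pass 1D input transform for F(4, 3). With the x4 pre-scale applied above,
+    // the com/out expressions below evaluate 4 * (B^T x), where B^T is the usual
+    // F(4, 3) input-transform matrix:
+    //   [ 4  0 -5  0  1  0 ]
+    //   [ 0 -4 -4  1  1  0 ]
+    //   [ 0  4 -4 -1  1  0 ]
+    //   [ 0 -2 -1  2  1  0 ]
+    //   [ 0  2 -1 -2  1  0 ]
+    //   [ 0  4  0 -5  0  1 ]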
+ com[0].v = in[2].v - (DATA_TYPE)4.f * in[0].v;
+ com[1].v = in[3].v - (DATA_TYPE)4.f * in[1].v;
+ com[2].v = in[4].v - (DATA_TYPE)4.f * in[2].v;
+ com[3].v = in[5].v - (DATA_TYPE)4.f * in[3].v;
+ com[4].v = in[3].v - in[1].v;
+ com[4].v = com[4].v + com[4].v;
+ com[5].v = in[4].v - in[2].v;
+
+ out[0].v = com[2].v - com[0].v;
+ out[1].v = com[2].v + com[1].v;
+ out[2].v = com[2].v - com[1].v;
+ out[3].v = com[5].v + com[4].v;
+ out[4].v = com[5].v - com[4].v;
+ out[5].v = com[3].v - com[1].v;
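+
+ // Together with the 4x pre-scale applied to "in" above, out[0..5] implement
+ // the canonical F(4x4, 3x3) input transform, i.e. out = 4 * B^T * d, with
+ // B^T = [ 4  0 -5  0  1  0 ]
+ //       [ 0 -4 -4  1  1  0 ]
+ //       [ 0  4 -4 -1  1  0 ]
+ //       [ 0 -2 -1  2  1  0 ]
+ //       [ 0  2 -1 -2  1  0 ]
+ //       [ 0  4  0 -5  0  1 ]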
+
+ TILE(uint, 6, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 6;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 36, N0, in);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the tile from a NHWC tensor
+ T_LOAD_NHWC(DATA_TYPE, 6, 6, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+ TILE(DATA_TYPE, 6, N0, com);
+ TILE(DATA_TYPE, 36, N0, tmp);
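+
+ // The 2D transform B^T * d * B is applied separably: the first loop below
+ // transforms the six columns of "in" into "tmp", and the second loop applies
+ // the same 1D transform along the rows of "tmp" to produce the 36 outputs.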
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
+ com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
+ com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
+ com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
+ com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v;
+ com[4].v = com[4].v + com[4].v;
+ com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v;
+ tmp[i + 0 * 6].v = com[2].v - com[0].v;
+ tmp[i + 1 * 6].v = com[2].v + com[1].v;
+ tmp[i + 2 * 6].v = com[2].v - com[1].v;
+ tmp[i + 3 * 6].v = com[5].v + com[4].v;
+ tmp[i + 4 * 6].v = com[5].v - com[4].v;
+ tmp[i + 5 * 6].v = com[3].v - com[1].v;
+ })
+
+ TILE(DATA_TYPE, 36, N0, out);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ com[0].v = tmp[i * 6 + 2].v - (DATA_TYPE)4.0f * tmp[i * 6 + 0].v;
+ com[1].v = tmp[i * 6 + 3].v - (DATA_TYPE)4.0f * tmp[i * 6 + 1].v;
+ com[2].v = tmp[i * 6 + 4].v - (DATA_TYPE)4.0f * tmp[i * 6 + 2].v;
+ com[3].v = tmp[i * 6 + 5].v - (DATA_TYPE)4.0f * tmp[i * 6 + 3].v;
+ com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
+ com[4].v = com[4].v + com[4].v;
+ com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
+ out[i * 6 + 0].v = com[2].v - com[0].v;
+ out[i * 6 + 1].v = com[2].v + com[1].v;
+ out[i * 6 + 2].v = com[2].v - com[1].v;
+ out[i * 6 + 3].v = com[5].v + com[4].v;
+ out[i * 6 + 4].v = com[5].v - com[4].v;
+ out[i * 6 + 5].v = com[3].v - com[1].v;
+ })
+
+ // Compute destination address
+ TILE(uint, 36, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 36;
+ })
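+
+ // Destination layout: per batch, dst is viewed as 36 blocks of
+ // _INUM_TILES_X * _INUM_TILES_Y rows (one block per transformed value, with
+ // channels along X); tile "mout" writes its i-th value to row "mout" of
+ // block i.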
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
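+// A minimal sketch of host-side build options for this kernel; the values are
+// illustrative only, chosen to satisfy the compile-time notes above:
+//   -DDATA_TYPE=half -DPAD_LEFT=2 -DPAD_TOP=2 -DSRC_WIDTH=96 -DSRC_HEIGHT=64
+//   -DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4 -DNUM_TILES_X=24 -DN0=1
+//   -DWINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC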
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+ int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+ int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+ x -= PAD_LEFT;
+ y -= PAD_TOP;
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 8, 1, in);
+ TILE(DATA_TYPE, 8, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 1, 8, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 8, 1, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+ TILE(DATA_TYPE, 1, 8, com);
+
+ com[0].s[0] = in[2].v - (DATA_TYPE)4.25f * in[4].v + in[6].v;
+ com[0].s[1] = in[1].v - (DATA_TYPE)4.25f * in[3].v + in[5].v;
+ com[0].s[2] = (DATA_TYPE)0.5f * in[1].v - (DATA_TYPE)2.5f * in[3].v + (DATA_TYPE)2.0f * in[5].v;
+ com[0].s[3] = (DATA_TYPE)0.25f * in[2].v - (DATA_TYPE)1.25f * in[4].v + in[6].v;
+ com[0].s[4] = (DATA_TYPE)4.0f * in[2].v - (DATA_TYPE)5.0f * in[4].v + in[6].v;
+ com[0].s[5] = (DATA_TYPE)2.0f * in[1].v - (DATA_TYPE)2.5f * in[3].v + (DATA_TYPE)0.5f * in[5].v;
+ out[0].s[0] = in[0].v - (DATA_TYPE)5.25f * in[2].v + (DATA_TYPE)5.25f * in[4].v - in[6].v;
+ out[1].s[0] = com[0].s[0] + com[0].s[1];
+ out[2].s[0] = com[0].s[0] - com[0].s[1];
+ out[3].s[0] = com[0].s[3] + com[0].s[2];
+ out[4].s[0] = com[0].s[3] - com[0].s[2];
+ out[5].s[0] = com[0].s[4] + com[0].s[5];
+ out[6].s[0] = com[0].s[4] - com[0].s[5];
+ out[7].s[0] = -in[1].v + (DATA_TYPE)5.25f * in[3].v - (DATA_TYPE)5.25f * in[5].v + in[7].v;
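+
+ // out[0..7] apply the canonical 8-point Winograd input transform B^T (the
+ // same matrix is shared by F(4x4, 5x5) and F(6x6, 3x3)):
+ // B^T = [ 1     0    -5.25  0     5.25  0    -1    0 ]
+ //       [ 0     1     1    -4.25 -4.25  1     1    0 ]
+ //       [ 0    -1     1     4.25 -4.25 -1     1    0 ]
+ //       [ 0     0.5   0.25 -2.5  -1.25  2     1    0 ]
+ //       [ 0    -0.5   0.25  2.5  -1.25 -2     1    0 ]
+ //       [ 0     2     4    -2.5  -5     0.5   1    0 ]
+ //       [ 0    -2     4     2.5  -5    -0.5   1    0 ]
+ //       [ 0    -1     0     5.25  0    -5.25  0    1 ]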
+
+ TILE(uint, 8, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 64, 1, in);
+ TILE(DATA_TYPE, 64, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the tile from a NHWC tensor
+ T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+ TILE(DATA_TYPE, 8, 8, com);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
+ com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x
+ com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
+ com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
+ com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
+ com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
+ com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
+ })
+
+ TILE(DATA_TYPE, 8, 8, tmp);
+ tmp[0].v = com[6].v;
+ tmp[1].v = com[0].v + com[1].v;
+ tmp[2].v = com[0].v - com[1].v;
+ tmp[3].v = com[2].v + com[3].v;
+ tmp[4].v = com[2].v - com[3].v;
+ tmp[5].v = com[4].v + com[5].v;
+ tmp[6].v = com[4].v - com[5].v;
+ tmp[7].v = com[7].v;
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[0] = tmp[i].s[2] - (DATA_TYPE)4.25f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[1] = tmp[i].s[1] - (DATA_TYPE)4.25f * tmp[i].s[3] + tmp[i].s[5];
+ com[0].s[2] = (DATA_TYPE)0.5f * tmp[i].s[1] - (DATA_TYPE)2.5f * tmp[i].s[3] + (DATA_TYPE)2.0f * tmp[i].s[5];
+ com[0].s[3] = (DATA_TYPE)0.25f * tmp[i].s[2] - (DATA_TYPE)1.25f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[4] = (DATA_TYPE)4.0f * tmp[i].s[2] - (DATA_TYPE)5.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[5] = (DATA_TYPE)2.0f * tmp[i].s[1] - (DATA_TYPE)2.5f * tmp[i].s[3] + (DATA_TYPE)0.5f * tmp[i].s[5];
+ out[i * 8 + 0].s[0] = tmp[i].s[0] - (DATA_TYPE)5.25f * tmp[i].s[2] + (DATA_TYPE)5.25f * tmp[i].s[4] - tmp[i].s[6];
+ out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
+ out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
+ out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
+ out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
+ out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
+ out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
+ out[i * 8 + 7].s[0] = -tmp[i].s[1] + (DATA_TYPE)5.25f * tmp[i].s[3] - (DATA_TYPE)5.25f * tmp[i].s[5] + tmp[i].s[7];
+ })
+
+ TILE(uint, 64, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1 or 1x7 and the output tile is 2x2/2x1 or 1x2 when the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+ int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+ int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+ x -= PAD_LEFT;
+ y -= PAD_TOP;
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 8, 1, in);
+ TILE(DATA_TYPE, 8, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v *= (DATA_TYPE)-36.0f;
+ })
+
+ TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };
+
+ com[0].s[0] = (DATA_TYPE)36.0f * in[2].v - (DATA_TYPE)13.0f * in[4].v + in[6].v;
+ com[0].s[1] = (DATA_TYPE)36.0f * in[1].v - (DATA_TYPE)13.0f * in[3].v + (DATA_TYPE)1.0f * in[5].v;
+ com[0].s[2] = (DATA_TYPE)9.0f * in[2].v - (DATA_TYPE)10.0f * in[4].v + in[6].v;
+ com[0].s[3] = (DATA_TYPE)18.0f * in[1].v - (DATA_TYPE)20.0f * in[3].v + (DATA_TYPE)2.0f * in[5].v;
+ com[0].s[4] = (DATA_TYPE)4.0f * in[2].v - (DATA_TYPE)5.0f * in[4].v + in[6].v;
+ com[0].s[5] = (DATA_TYPE)12.0f * in[1].v - (DATA_TYPE)15.0f * in[3].v + (DATA_TYPE)3.0f * in[5].v;
+ out[0].s[0] = (DATA_TYPE)-36.0f * in[0].v + (DATA_TYPE)49.0f * in[2].v - (DATA_TYPE)14.0f * in[4].v + in[6].v;
+ out[1].s[0] = com[0].s[0] - com[0].s[1];
+ out[2].s[0] = com[0].s[0] + com[0].s[1];
+ out[3].s[0] = com[0].s[2] - com[0].s[3];
+ out[4].s[0] = com[0].s[2] + com[0].s[3];
+ out[5].s[0] = com[0].s[4] - com[0].s[5];
+ out[6].s[0] = com[0].s[4] + com[0].s[5];
+ out[7].s[0] = -(DATA_TYPE)36.0f * in[1].v + (DATA_TYPE)0.0f * in[2].v + (DATA_TYPE)49.0f * in[3].v - (DATA_TYPE)14.0f * in[5].v + in[7].v;
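+
+ // The -36 pre-scale applied to "in" above and the integer coefficients here
+ // implement the 8-point input transform for F(2x2, 7x7) sampled at the
+ // points {0, -1, 1, -2, 2, -3, 3, inf}, with the transform's normalisation
+ // folded into the pre-scale.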
+
+ TILE(uint, 8, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 64, 1, in);
+ TILE(DATA_TYPE, 64, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the tile from a NHWC tensor
+ T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+ TILE(DATA_TYPE, 8, 8, com);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
+ com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
+ com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
+ com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
+ com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
+ })
+
+ TILE(DATA_TYPE, 8, 8, tmp);
+ tmp[0].v = com[6].v;
+ tmp[1].v = com[0].v - com[1].v;
+ tmp[2].v = com[0].v + com[1].v;
+ tmp[3].v = com[2].v - com[3].v;
+ tmp[4].v = com[2].v + com[3].v;
+ tmp[5].v = com[4].v - com[5].v;
+ tmp[6].v = com[4].v + com[5].v;
+ tmp[7].v = com[7].v;
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[0] = (DATA_TYPE)36.0f * tmp[i].s[2] - (DATA_TYPE)13.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[1] = (DATA_TYPE)36.0f * tmp[i].s[1] - (DATA_TYPE)13.0f * tmp[i].s[3] + (DATA_TYPE)1.0f * tmp[i].s[5];
+ com[0].s[2] = (DATA_TYPE)9.0f * tmp[i].s[2] - (DATA_TYPE)10.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[3] = (DATA_TYPE)18.0f * tmp[i].s[1] - (DATA_TYPE)20.0f * tmp[i].s[3] + (DATA_TYPE)2.0f * tmp[i].s[5];
+ com[0].s[4] = (DATA_TYPE)4.0f * tmp[i].s[2] - (DATA_TYPE)5.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[5] = (DATA_TYPE)12.0f * tmp[i].s[1] - (DATA_TYPE)15.0f * tmp[i].s[3] + (DATA_TYPE)3.0f * tmp[i].s[5];
+ out[i * 8 + 0].s[0] = (DATA_TYPE)-36.0f * tmp[i].s[0] + (DATA_TYPE)49.0f * tmp[i].s[2] - (DATA_TYPE)14.0f * tmp[i].s[4] + tmp[i].s[6];
+ out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
+ out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
+ out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
+ out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
+ out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
+ out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
+ out[i * 8 + 7].s[0] = -(DATA_TYPE)36.0f * tmp[i].s[1] + (DATA_TYPE)0.0f * tmp[i].s[2] + (DATA_TYPE)49.0f * tmp[i].s[3] - (DATA_TYPE)14.0f * tmp[i].s[5] + tmp[i].s[7];
+ })
+
+ TILE(uint, 64, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
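+// This wrapper forwards all arguments to winograd_input_transform_4x4_3x3_stepz1_nhwc;
+// building it with -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL selects that kernel's
+// 1D (6-point) code path, which yields the 4x1 output tile / 3x1 filter case.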
+__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
+#endif // defined(NHWC)
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
new file mode 100644
index 0000000000..9eb995fbb2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
@@ -0,0 +1,1109 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] _ISRC_HEIGHT The source tensor's height
+ * @param[in] _IDST_WIDTH The destination tensor's width
+ * @param[in] _IDST_HEIGHT The destination tensor's height
+ */
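+// A minimal sketch of host-side build options for this kernel; the values are
+// illustrative only, chosen to satisfy the compile-time notes above:
+//   -DDATA_TYPE=float -DNUM_TILES_X=16 -DOUTPUT_TILE_W=2 -DOUTPUT_TILE_H=2
+//   -DVEC_SIZE=2 -DN0=1 -DACTIVATION_TYPE=relu -DA_VAL=0.0f -DB_VAL=0.0f
+//   -DWINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC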
+__kernel void winograd_output_transform_2x2_7x7_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int _ISRC_HEIGHT,
+ const int _IDST_WIDTH,
+ const int _IDST_HEIGHT)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ TILE(DATA_TYPE, 8, N0, in);
+ TILE(DATA_TYPE, 2, N0, out);
+ TILE(uint, 8, 1, src_indirect_y);
+
+ // Calculate the indirect Y for the source tensor
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 8 channels to compose the 8x1 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // Compute out0 and out1
+ out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
+ out[1].v = -in[1].v + in[2].v - (DATA_TYPE)2.0f * in[3].v + (DATA_TYPE)2.0f * in[4].v - (DATA_TYPE)3.0f * in[5].v + (DATA_TYPE)3.0f * in[6].v + in[7].v;
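+
+ // out = A^T * in for the 1D F(2, 7) output transform, with
+ // A^T = [ 1  1 1  1 1  1 1 0 ]
+ //       [ 0 -1 1 -2 2 -3 3 1 ]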
+
+#if defined(HAS_BIAS)
+ // Add bias
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 2, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+ T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 2, 1, dst_indirect_y);
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, yk, 0, 1, 2,
+ {
+ int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
+ dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
+ })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, xk, 0, 1, 2,
+ {
+ int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
+ dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
+ })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
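+
+ // min() clamps out-of-range tile coordinates to the last valid row/column,
+ // so every lane has a legal destination address; the duplicated boundary
+ // addresses are then resolved by the store order (see below).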
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 64, N0, in);
+ TILE(DATA_TYPE, 4, N0, out);
+ TILE(DATA_TYPE, 16, N0, tmp);
+ TILE(uint, 64, 1, src_indirect_y);
+
+ // Calculate the indirect Y for the source tensor
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 64 channels to compose the 8x8 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
+ tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - (DATA_TYPE)2 * in[24 + i].v + (DATA_TYPE)2 * in[32 + i].v - (DATA_TYPE)3 * in[40 + i].v + (DATA_TYPE)3 * in[48 + i].v + in[56 + i].v;
+ })
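+
+ // Column pass of the separable output transform: tmp[2i] and tmp[2i + 1]
+ // hold the two A^T rows applied down column i of the 8x8 tile; the loop
+ // below applies the same two rows along the rows of tmp (reading elements
+ // with stride 2) to form the 2x2 output tile.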
+
+ // Compute the 2x2 output tile
+ LOOP_UNROLLING(int, i, 0, 1, 2,
+ {
+ out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
+ out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - (DATA_TYPE)2 * tmp[6 + i].v + (DATA_TYPE)2 * tmp[8 + i].v - (DATA_TYPE)3 * tmp[10 + i].v + (DATA_TYPE)3 * tmp[12 + i].v + tmp[14 + i].v;
+ })
+
+#if defined(HAS_BIAS)
+ // Add bias
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+ T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 4, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, yk, 0, 1, 2,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, 2,
+ {
+ int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
+ int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
+ dst_indirect_y[xk + yk * 2].v = x_c + y_c * _IDST_WIDTH;
+ dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+ })
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_4x4_3x3_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 6, N0, in);
+ TILE(DATA_TYPE, 4, N0, out);
+ TILE(uint, 6, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 6 channels to compose the 6x1 or 1x6 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // Compute out00, out01, out02 and out03
+ out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
+ out[1].v = in[1].v - in[2].v + (DATA_TYPE)2.0f * in[3].v - (DATA_TYPE)2.0f * in[4].v;
+ out[2].v = in[1].v + in[2].v + (DATA_TYPE)4.0f * in[3].v + (DATA_TYPE)4.0f * in[4].v;
+ out[3].v = in[1].v - in[2].v + (DATA_TYPE)8.0f * in[3].v - (DATA_TYPE)8.0f * in[4].v + in[5].v;
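+
+ // Equivalently, out = A^T * in with the F(4,3) output transform; written out
+ // for reference:
+ //
+ // A^T = | 1  1  1  1  1  0 |
+ //       | 0  1 -1  2 -2  0 |
+ //       | 0  1  1  4  4  0 |
+ //       | 0  1 -1  8 -8  1 |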
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 4, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
+ dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
+ dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Calculate the indirect Y for the source tensor
+ TILE(DATA_TYPE, 36, N0, in);
+ TILE(DATA_TYPE, 4, N0, tmp);
+ TILE(uint, 36, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 36 channels to compose the 6x6 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ tmp[0].v = in[6 + i].v + in[12 + i].v;
+ tmp[1].v = in[6 + i].v - in[12 + i].v;
+ tmp[2].v = in[18 + i].v + in[24 + i].v;
+ tmp[3].v = in[18 + i].v - in[24 + i].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ in[i].v = in[i].v + tmp[0].v + tmp[2].v;
+ in[6 + i].v = tmp[3].v + tmp[1].v;
+ in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
+ in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
+ })
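+
+ // Column pass of out = A^T * in * A: the same four A^T rows as in the 1D case
+ // reduce the 6x6 tile to a 4x6 intermediate, written back into in[0..23]
+ // (the in[30 + i] term folds the last source row into the fourth output row).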
+
+ // Compute the output tile
+ TILE(DATA_TYPE, 16, N0, out);
+
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;
+ tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;
+ tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;
+ tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
+ out[4 * i + 1].v = tmp[3].v + tmp[1].v;
+ out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
+ out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
+ })
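+
+ // Row pass: each row of the 4x6 intermediate is reduced with the same
+ // coefficients, completing out = A^T * in * A and yielding the 4x4 tile.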
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 16, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
+ dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+ })
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 5x5, 5x1 or 1x5 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_4x4_5x5_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ TILE(DATA_TYPE, 8, N0, in);
+ TILE(DATA_TYPE, 4, N0, out);
+ TILE(DATA_TYPE, 4, N0, tmp);
+ TILE(uint, 8, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+ // "in" contains 1x8 or 8x1 tile here
+ T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // A^T * in; in this 1D case the output is a single row/column
+ tmp[0].v = in[1].v - in[2].v;
+ tmp[1].v = (DATA_TYPE)2.0f * (in[3].v - in[4].v);
+ tmp[2].v = (DATA_TYPE)2.0f * (in[5].v + in[6].v);
+ tmp[3].v = in[3].v + in[4].v;
+ out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + (DATA_TYPE)4.0f * tmp[2].v;
+ out[1].v = tmp[0].v + tmp[1].v + (DATA_TYPE)4.0f * (in[5].v - in[6].v);
+ out[2].v = in[1].v + in[2].v + (DATA_TYPE)4.0f * tmp[3].v + tmp[2].v;
+ out[3].v = tmp[0].v + (DATA_TYPE)4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
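+
+ // Equivalently, out = A^T * in with the F(4,5) output transform; written out
+ // for reference:
+ //
+ // A^T = | 1  1  1  1  1  8  8  0 |
+ //       | 0  1 -1  2 -2  4 -4  0 |
+ //       | 0  1  1  4  4  2  2  0 |
+ //       | 0  1 -1  8 -8  1 -1  1 |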
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 4, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
+ dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
+ dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Calculate the indirect Y for the source tensor
+ TILE(DATA_TYPE, 64, N0, in);
+ TILE(DATA_TYPE, 6, N0, tmp);
+ TILE(uint, 64, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // "in" here is 8x8 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // A^T * in
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ tmp[0].v = in[8 + i].v + in[16 + i].v;
+ tmp[1].v = in[8 + i].v - in[16 + i].v;
+ tmp[2].v = in[24 + i].v + in[32 + i].v;
+ tmp[3].v = in[24 + i].v - in[32 + i].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ tmp[4].v = in[40 + i].v + in[48 + i].v;
+ tmp[4].v = tmp[4].v + tmp[4].v;
+ tmp[5].v = in[40 + i].v - in[48 + i].v;
+
+ // 4x8 matrix as a result
+ in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
+ in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
+ in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
+ in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
+ })
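+
+ // Column pass of out = A^T * in * A: the four A^T rows above reduce the 8x8
+ // tile to a 4x8 intermediate, written back into in[0..31] (the in[56 + i]
+ // term folds the last source row into the fourth output row).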
+
+ // Compute the output tile
+ TILE(DATA_TYPE, 16, N0, out);
+
+ // in * A, with in = A^T * in as above
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
+ tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
+ tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
+ tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
+ tmp[4].v = tmp[4].v + tmp[4].v;
+ tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
+
+ // 4x4 tile
+ out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
+ out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
+ out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
+ out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
+ })
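+
+ // Row pass: the same reduction applied along each row of the 4x8 intermediate
+ // completes out = A^T * in * A and yields the 4x4 output tile.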
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 16, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
+ dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+ })
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_2x1_7x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
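+
+// Note: with WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL (or _VERTICAL) defined, the
+// shared winograd_output_transform_2x2_7x7_nhwc body above takes its 1D branch,
+// so this wrapper only re-exports that kernel under the expected name.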
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_4x1_3x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_4x1_5x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_1x2_1x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_1x4_1x3_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_1x4_1x5_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/nonmax.cl b/src/core/CL/cl_kernels/nonmax.cl
deleted file mode 100644
index ab13131807..0000000000
--- a/src/core/CL/cl_kernels/nonmax.cl
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function performs Non maxima suppression over a 3x3 window on a given image.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8/F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_max_suppression(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- vc = vload8(0, (__global DATA_TYPE *)src.ptr);
-
- if(all(vc == (DATA_TYPE)0))
- {
- vstore8(0, 0, (__global DATA_TYPE *)dst.ptr);
-
- return;
- }
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, -1));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out = select((DATA_TYPE)0, vc, (vc >= nc.s01234567) && (vc >= nc.s12345678) && (vc >= nc.s23456789));
-
- nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, 0));
- out = select((DATA_TYPE)0, out, (vc >= nc.s01234567) && (vc > nc.s23456789));
-
- nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, +1));
- out = select((DATA_TYPE)0, out, (vc > nc.s01234567) && (vc > nc.s12345678) && (vc > nc.s23456789));
-
- vstore8(out, 0, (__global DATA_TYPE *)dst.ptr);
-}
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
deleted file mode 100644
index 4569208824..0000000000
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "tile_helpers.h"
-
-#define MUL_OP(x, y) ((x) * (y))
-#define ADD_OP(x, y) ((x) + (y))
-#define DIV_OP(x, y) ((x) / (y))
-#define POW_OP(x, y) pow((x), (y))
-#define SQCVT_SAT(a) (a)
-
-#if defined(NUM_SLICES)
-/** Apply cross-map normalization.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
- * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
- * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
- * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
-
- const int current_slice = get_global_id(2);
- const int left_slice = max(-(int)RADIUS, -current_slice);
- const int right_slice = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice);
-
- for(int i = left_slice; i <= right_slice; i++)
- {
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i));
- acc = ADD_OP(acc, MUL_OP(values, values));
- }
-
- acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized = POW_OP(acc, beta_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
-
- VSTORE(VEC_SIZE)
- (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
-}
-#endif /* defined(NUM_SLICES) */
-
-#if defined(WIDTH_SIZE)
-/** Apply cross-map normalization.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
- * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
- * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
- * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void normalization_layer_cross_map_nhwc(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Offset computation
- const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
-
- // Address computation
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- acc = 0;
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- coeff_v = SQCVT_SAT(COEFF);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_v = SQCVT_SAT(BETA);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- kappa_v = SQCVT_SAT(KAPPA);
-
- const int left_slice = max((int)0, (int)x_offs - (int)RADIUS);
- const int right_slice = min((int)WIDTH_SIZE - 1, (int)x_offs + (int)RADIUS);
-
- for(int i = left_slice; i <= right_slice; ++i)
- {
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * sizeof(DATA_TYPE)));
- acc = ADD_OP(acc, MUL_OP(values, values));
- }
-
- acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized = POW_OP(acc, beta_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x_offs * sizeof(DATA_TYPE))), normalized);
-
- STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-
-/** Apply in-map normalization when tensors are in the NCHW data layout format.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
- * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
- * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
- * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, i.e. x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- acc = 0;
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- coeff_v = SQCVT_SAT(COEFF);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_v = SQCVT_SAT(BETA);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- kappa_v = SQCVT_SAT(KAPPA);
-
- const int current_col = get_global_id(0) << 2;
- const int left_pos = max(-(int)RADIUS, -3 - current_col);
- const int right_pos = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col);
-
-#if defined(IN_MAP_2D)
- const int current_row = get_global_id(1);
- const int first_row = max(-(int)RADIUS, -current_row);
- const int last_row = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row);
-#endif /* defined(IN_MAP_2D) */
-
-#if defined(IN_MAP_2D)
- for(int j = first_row; j <= last_row; ++j)
- {
-#endif /* defined(IN_MAP_2D) */
- for(int i = left_pos; i <= right_pos; ++i)
- {
-#if defined(IN_MAP_2D)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0));
-#else /* defined(IN_MAP_2D) */
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0));
-#endif /* defined(IN_MAP_2D) */
- acc = ADD_OP(acc, MUL_OP(values, values));
- }
-#if defined(IN_MAP_2D)
- }
-#endif /* defined(IN_MAP_2D) */
-
- acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized = POW_OP(acc, beta_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
-
- VSTORE(VEC_SIZE)
- (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
-}
-#endif // defined(WIDTH_SIZE)
-
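Both normalization kernels are specialised entirely at build time through the -D options listed in their documentation. A minimal host-side sketch using the plain OpenCL C API (option values are illustrative only; 'source' is assumed to hold the .cl text):

    #include <CL/cl.h>
    #include <stdio.h>

    /* Build the normalization program with the documented -D options.
     * Context/device creation is omitted; the values shown are examples. */
    static cl_program build_norm_program(cl_context ctx, cl_device_id dev, const char *source)
    {
        cl_int err = CL_SUCCESS;
        cl_program prog = clCreateProgramWithSource(ctx, 1, &source, NULL, &err);
        const char *opts = "-DDATA_TYPE=float -DVEC_SIZE=4 -DVEC_SIZE_LEFTOVER=0 "
                           "-DRADIUS=2 -DWIDTH_SIZE=224 "
                           "-DCOEFF=0.0001f -DBETA=0.75f -DKAPPA=1.f";
        if(err == CL_SUCCESS)
        {
            err = clBuildProgram(prog, 1, &dev, opts, NULL, NULL);
        }
        if(err != CL_SUCCESS)
        {
            fprintf(stderr, "normalization kernel build failed: %d\n", err);
        }
        return prog;
    }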
-#if defined(NUM_SLICES) && defined(DIM1_SIZE)
-/** Apply in-map normalization when tensors are in the NHWC data layout format.
- *
- * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
- * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
- * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
- * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
- * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER. It is defined as: x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Offset computation
- const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
- const int current_cols = get_global_id(1);
- const int current_rows = get_global_id(2);
-
- // Address computation
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE);
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + current_cols * output_stride_y + current_rows * output_stride_z;
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- acc = 0;
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- coeff_v = SQCVT_SAT(COEFF);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_v = SQCVT_SAT(BETA);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- kappa_v = SQCVT_SAT(KAPPA);
-
- const int first_col = max(0, current_cols - (int)RADIUS);
- const int last_col = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS);
-
-#if defined(IN_MAP_2D)
- const int first_row = max(0, current_rows - (int)RADIUS);
- const int last_row = min((int)NUM_SLICES - 1, current_rows + (int)RADIUS);
-#endif /* defined(IN_MAP_2D) */
-
-#if defined(IN_MAP_2D)
- for(int j = first_row; j <= last_row; ++j)
- {
-#else // defined(IN_MAP_2D)
- const int j = current_rows;
-#endif /* defined(IN_MAP_2D) */
- for(int i = first_col; i <= last_col; ++i)
- {
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * input_stride_y + j * input_stride_z));
- acc = ADD_OP(acc, MUL_OP(values, values));
- }
-#if defined(IN_MAP_2D)
- }
-#endif /* defined(IN_MAP_2D) */
-
- acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized = POW_OP(acc, beta_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * output_stride_y + current_rows * output_stride_z)), normalized);
-
- STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif // defined(NUM_SLICES) && defined(DIM1_SIZE)
\ No newline at end of file
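The NHWC kernels above derive their X offset with GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER), which shifts the last partial vector back so every load of VEC_SIZE elements stays in bounds. A scalar C sketch of that indexing (assumed macro semantics, shown for illustration only):

    /* Leftover-aware X offset: each work-item handles vec_size elements; when
     * the width is not a multiple of vec_size, work-item 0 clamps to 0 and the
     * remaining ones shift back by (vec_size - leftover) % vec_size, so the
     * partial tail is always covered by a full, in-bounds vector. */
    static int spatial_idx_x(int global_id, int vec_size, int leftover)
    {
        const int offs = global_id * vec_size - (vec_size - leftover) % vec_size;
        return offs < 0 ? 0 : offs;
    }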
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl
deleted file mode 100644
index 0a098356b4..0000000000
--- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(VEC_SIZE)
-
-#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-
-/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
- *
- * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
- * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
- *
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
- * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
- * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the std tensor
- */
-__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(std))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector std = CONVERT_TO_VECTOR_STRUCT(std);
-
- const uint current_slice = get_global_id(2) % NUM_CHANNELS;
-
- const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)));
- const DATA_TYPE curr_std = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)));
-
- TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
- TYPE res = (data - curr_mean) / curr_std;
-
- VSTORE(VEC_SIZE)
- (res, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
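The NCHW kernel above applies a per-channel mean/std normalisation to every spatial element. A scalar C sketch of the same computation for one image (hypothetical reference helper, not part of the library):

    /* normalize_planar_yuv reference for an NCHW plane: each value in channel
     * c is shifted by mean[c] and divided by std[c]. 'hw' is height * width. */
    static void normalize_planar_yuv_ref(const float *in, float *out,
                                         int hw, int channels,
                                         const float *mean, const float *std)
    {
        for(int c = 0; c < channels; ++c)
        {
            for(int i = 0; i < hw; ++i)
            {
                out[c * hw + i] = (in[c * hw + i] - mean[c]) / std[c];
            }
        }
    }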
-/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
- *
- * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- *
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
- * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
- * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the std tensor
- */
-__kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(std))
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
- __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
- __global uchar *std_addr = std_ptr + std_offset_first_element_in_bytes + x_offs;
-
- const TYPE curr_mean = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
- const TYPE curr_std = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_addr);
-
- TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
- TYPE res0 = (data - curr_mean) / curr_std;
-
- STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
deleted file mode 100644
index d660fffb58..0000000000
--- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
-
-#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#define OFFSET_FLT ((float)OFFSET)
-#define SCALE_FLT ((float)SCALE)
-
-#if defined(NUM_CHANNELS)
-
-/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
- *
- * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
- * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
- * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
- * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
- *
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
- * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
- * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the std tensor
- */
-__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(std))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector std = CONVERT_TO_VECTOR_STRUCT(std);
-
- const uint current_slice = get_global_id(2) % NUM_CHANNELS;
-
- VEC_DATA_TYPE(float, VEC_SIZE)
- curr_mean_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))));
- curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
-
- VEC_DATA_TYPE(float, VEC_SIZE)
- curr_std_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))));
- curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
-
- VEC_DATA_TYPE(float, VEC_SIZE)
- data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), VEC_DATA_TYPE(float, VEC_SIZE));
- data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT;
-
- // Perform normalization
- VEC_DATA_TYPE(float, VEC_SIZE)
- res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
-
- const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
- VSTORE(VEC_SIZE)
- (res_u8, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-#endif // defined(NUM_CHANNELS)
-
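The quantized variant dequantizes with (value - OFFSET) * SCALE, normalises in float, then requantizes with rounding and saturation. A scalar C sketch for the QASYMM8 case (unsigned 8-bit range assumed; hypothetical helper, not part of the library):

    #include <math.h>

    /* Quantized normalize_planar_yuv reference: dequantize input, mean and
     * std, normalise in float, then requantize with rounding and saturation
     * to the QASYMM8 range [0, 255]. */
    static unsigned char normalize_planar_yuv_q8_ref(unsigned char v, unsigned char m,
                                                     unsigned char s, float offset, float scale)
    {
        const float v_f = ((float)v - offset) * scale;
        const float m_f = ((float)m - offset) * scale;
        const float s_f = ((float)s - offset) * scale;
        const float res = (v_f - m_f) / s_f;
        float q = roundf(res / scale) + offset;
        if(q < 0.f)   q = 0.f;
        if(q > 255.f) q = 255.f;
        return (unsigned char)q;
    }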
-/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
- *
- * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
- * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
- * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- *
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
- * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
- * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the std tensor
- */
-__kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(std))
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
- __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
- __global uchar *std_addr = std_ptr + std_offset_first_element_in_bytes + x_offs;
-
- VEC_DATA_TYPE(float, VEC_SIZE)
- curr_mean_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr), VEC_DATA_TYPE(float, VEC_SIZE));
- curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
-
- VEC_DATA_TYPE(float, VEC_SIZE)
- curr_std_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_addr), VEC_DATA_TYPE(float, VEC_SIZE));
- curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
-
- VEC_DATA_TYPE(float, VEC_SIZE)
- data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr), VEC_DATA_TYPE(float, VEC_SIZE));
-    data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT;
-
- // Perform normalization
- VEC_DATA_TYPE(float, VEC_SIZE)
- res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
-
- const TYPE res0 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
- STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
diff --git a/src/core/CL/cl_kernels/pad_layer.cl b/src/core/CL/cl_kernels/pad_layer.cl
deleted file mode 100644
index 903e924a2f..0000000000
--- a/src/core/CL/cl_kernels/pad_layer.cl
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) && defined(PAD_X_BEFORE_REMAINDER) && defined(VEC_SIZE_LEFTOVER_WRITE)
-
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_SELECT SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#define OFFSETS VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VEC_SIZE)
-#define SCALAR_COND(x) CONVERT((VEC_SELECT)x == (VEC_SELECT)1, VEC_SELECT)
-
-#if defined(CONST_VAL) && defined(VEC_SIZE_LEFTOVER_READ)
-/** Perform a pad operation when PaddingMode is CONSTANT
- *
- * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
- * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
- * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5
- * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224
- * @note In case pad left is more than the vector size, the number of threads to skip along the X axis must be passed using the
- * -DTHREADS_TO_SKIP_BEFORE compile flag, e.g. -DTHREADS_TO_SKIP_BEFORE=1. This is defined as (PAD_X_BEFORE / VEC_SIZE)
- * @note In case pad left is more than the vector size, the thread from which to skip along the X axis for pad right must be passed using the
- *       -DTHREADS_TO_SKIP_AFTER compile flag, e.g. -DTHREADS_TO_SKIP_AFTER=1. This is defined as ((SRC_WIDTH + PAD_X_BEFORE) / VEC_SIZE)
- * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time:
- * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
- * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
- * @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time:
- * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3)
- * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
- * @note If pad also needs to be added to the batch of the tensor, the following compile flags must be passed at compile time:
- * -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g. -DPAD_W_BEFORE=3)
- * -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4)
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: All
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination image in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] batch (Optional) Batch index if 4D pad must be applied
- */
-__kernel void pad_layer_constant(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
-#if defined(PAD_W_BEFORE)
- ,
- uint batch
-#endif // defined(PAD_W_BEFORE)
- )
-{
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- int x = get_global_id(0);
- int y = get_global_id(1);
- int z = get_global_id(2);
-
- // If true, write only padding values; no reads performed
- uint cond = 0;
-#if defined(THREADS_TO_SKIP_BEFORE)
- cond |= x < THREADS_TO_SKIP_BEFORE || x > THREADS_TO_SKIP_AFTER;
-#endif // defined(THREADS_TO_SKIP_BEFORE)
-#if defined(PAD_Y_BEFORE)
- cond |= y < PAD_Y_BEFORE || y >= (SRC_HEIGHT + PAD_Y_BEFORE);
-#endif // defined(PAD_Y_BEFORE)
-#if defined(PAD_Z_BEFORE)
- cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE);
-#endif // defined(PAD_Z_BEFORE)
-#if defined(PAD_W_BEFORE)
- cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE);
-#endif // defined(PAD_W_BEFORE)
-
- if(cond)
- {
- VEC_TYPE const_vals0 = (VEC_TYPE)CONST_VAL;
- STORE_VECTOR_SELECT(const_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1));
- }
- else
- {
- // Calculate input's coordinates based on output's
- int w = 0;
-#if defined(THREADS_TO_SKIP_BEFORE)
- x -= THREADS_TO_SKIP_BEFORE;
-#endif // defined(THREADS_TO_SKIP_BEFORE)
-#if defined(PAD_Y_BEFORE)
- y -= PAD_Y_BEFORE;
-#endif // defined(PAD_Y_BEFORE)
-#if defined(PAD_Z_BEFORE)
- z -= PAD_Z_BEFORE;
-#endif // defined(PAD_Z_BEFORE)
-#if defined(PAD_W_BEFORE)
- w -= PAD_W_BEFORE * SRC_DEPTH;
-#endif // defined(PAD_W_BEFORE)
- x *= VEC_SIZE;
- x -= PAD_X_BEFORE_REMAINDER;
-
- // Check for out of bound reads and clamp X coordinate
- uint cond_left = x < 0;
- uint cond_right = (x + VEC_SIZE) > SRC_WIDTH;
- x = clamp(x, 0, (SRC_WIDTH - VEC_SIZE));
-
- // Calculate input's address
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * src_stride_x + y * src_stride_y + z * src_stride_z + w * (int)src_stride_z;
-
- // Read values and rotate them properly if they would have been across paddings
- VEC_TYPE src_vals0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
- src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, PAD_X_BEFORE_REMAINDER), SCALAR_COND(cond_left));
- src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, VEC_SIZE_LEFTOVER_READ), SCALAR_COND(cond_right));
-
- // Check what values would be padding and replace them with the constant value
- VEC_INT xs_out = (VEC_INT)(get_global_id(0) * VEC_SIZE) + VEC_OFFS(int, VEC_SIZE);
- VEC_INT conds = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE);
- src_vals0 = select(src_vals0, (VEC_TYPE)CONST_VAL, CONVERT(conds, VEC_SELECT));
-
- // Store values in bounds
- STORE_VECTOR_SELECT(src_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1));
- }
-}
-#endif // defined(CONST_VAL) && defined(VEC_SIZE_LEFTOVER_READ)
-
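The documentation above defines the two X-axis skip counts from PAD_X_BEFORE, SRC_WIDTH and VEC_SIZE. A small host-side C sketch that derives them and formats the matching build options (illustrative helper, not part of the library):

    #include <stdio.h>

    /* Derive the skip defines exactly as documented: threads fully inside the
     * left pad are PAD_X_BEFORE / VEC_SIZE, and the first thread past the
     * input is (SRC_WIDTH + PAD_X_BEFORE) / VEC_SIZE. */
    static void pad_skip_options(char *buf, size_t len,
                                 int pad_x_before, int src_width, int vec_size)
    {
        snprintf(buf, len, "-DTHREADS_TO_SKIP_BEFORE=%d -DTHREADS_TO_SKIP_AFTER=%d",
                 pad_x_before / vec_size, (src_width + pad_x_before) / vec_size);
    }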
-#if defined(IS_REFLECT) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
-
-#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n)
-#define SYMM_REFL_LEFT(x, n0, n1) select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0)
-#define SYMM_REFL_RIGHT(x, n0, n1) select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0)
-
-/** Perform a pad operation when PaddingMode is SYMMETRIC
- *
- * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
- * @note Constant value must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
- * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5
- * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224
- * @note Number of values to the left when operating across left padding must be passed using the -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5
- * @note Number of values to the left when operating across right padding must be passed using the -DPAD_X_AFTER_REMAINDER compile flag, e.g. -DPAD_X_AFTER_REMAINDER=6
- * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g. -DPAD_X_BEFORE_REMAINDER_REFL=6
- * @note To rearrange the vectors properly, (PAD_X_AFTER_REMAINDER - 1) must be passed using the -DPAD_X_AFTER_REMAINDER_REFL compile flag, e.g. -DPAD_X_AFTER_REMAINDER_REFL=5
- * @note For the region after the X padding, the starting point to read backward from must be passed using the -DAFTER_PAD_FACT_X compile flag, e.g. -DAFTER_PAD_FACT_X=253
- * @note If padding mode is REFLECT, the -DIS_REFLECT compile flag must be set to 1, else it must be set to 0
- * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time:
- * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
- * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
- * @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time:
- * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3)
- * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
- * @note If the starting point to read backward from is less than the output's last element accessed in the X, the following compile flags must be passed at compile time to avoid negative offsets:
- * -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3)
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: All
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination image in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- // Get current thread position
- const int x = get_global_id(0);
- const int y = get_global_id(1);
- const int z = get_global_id(2);
-
- // Define conditions based on the thread X position w.r.t. pad left and right
- const int x_out_first = x * VEC_SIZE;
- const int x_out_last = x_out_first + VEC_SIZE;
- const int is_before_pad_left = (x_out_last <= PAD_X_BEFORE);
- const int is_across_pad_left = (x_out_first < PAD_X_BEFORE) && (x_out_last > PAD_X_BEFORE);
- const int is_inside_input = (x_out_first >= PAD_X_BEFORE) && (x_out_last <= (SRC_WIDTH + PAD_X_BEFORE));
- const int is_across_pad_right = (x_out_first < (SRC_WIDTH + PAD_X_BEFORE)) && (x_out_last > (SRC_WIDTH + PAD_X_BEFORE));
- const int is_after_pad_right = (x_out_first >= (SRC_WIDTH + PAD_X_BEFORE));
-
- // Calculate base pointers
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes;
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- // Calculate input tensor's offset based on the defined conditions
- int x_offset = 0;
- x_offset = select(x_offset, PAD_X_BEFORE - x_out_last + IS_REFLECT, is_before_pad_left);
- x_offset = select(x_offset, x_out_first - PAD_X_BEFORE, is_inside_input);
- x_offset = select(x_offset, SRC_WIDTH - VEC_SIZE, is_across_pad_right);
- x_offset = select(x_offset, AFTER_PAD_FACT_X - x_out_last, is_after_pad_right);
-
-#if defined(AFTER_PAD_REM)
- int neg_offs = x_offset < 0;
- x_offset = max(x_offset, 0);
-#endif // defined(AFTER_PAD_REM)
-
- // Load input values from the computed offset
- int y_in = y;
- int z_in = z;
-#if defined(PAD_Y_BEFORE)
- y_in = select(y - PAD_Y_BEFORE, PAD_Y_BEFORE - y + IS_REFLECT - 1, y < PAD_Y_BEFORE);
- y_in = select(y_in, 2 * SRC_HEIGHT + PAD_Y_BEFORE - y - IS_REFLECT - 1, y >= (SRC_HEIGHT + PAD_Y_BEFORE));
-#endif // defined(PAD_Y_BEFORE)
-#if defined(PAD_Z_BEFORE)
- z_in = select(z - PAD_Z_BEFORE, PAD_Z_BEFORE - z + IS_REFLECT - 1, z < PAD_Z_BEFORE);
- z_in = select(z_in, 2 * SRC_DEPTH + PAD_Z_BEFORE - z - IS_REFLECT - 1, z >= (SRC_DEPTH + PAD_Z_BEFORE));
-#endif // defined(PAD_Z_BEFORE)
-
- src_addr += x_offset * src_stride_x + y_in * src_step_y + z_in * src_step_z;
-
-#if SRC_WIDTH == 1
- VSTORE(VEC_SIZE)
- ((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr);
-#else // SRC_WIDTH == 1
-
- VEC_TYPE src_vals0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
-
- // Choose rearrangement policy based on the defined conditions
- src_vals0 = select(src_vals0, SYMM_REFL_LEFT(src_vals0, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL), SCALAR_COND(is_across_pad_left));
- src_vals0 = select(src_vals0, SYMM_REFL_RIGHT(src_vals0, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL), SCALAR_COND(is_across_pad_right));
- src_vals0 = select(src_vals0, REVERSE(src_vals0, VEC_SIZE), SCALAR_COND((is_before_pad_left || is_after_pad_right)));
-#if defined(AFTER_PAD_REM)
- src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs));
-#endif // defined(AFTER_PAD_REM)
-
- // Store values in bounds
- STORE_VECTOR_SELECT(src_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1));
-#endif // SRC_WIDTH == 1
-}
-#endif // defined(IS_REFLECT) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) && defined(PAD_X_BEFORE_REMAINDER) && defined(VEC_SIZE_LEFTOVER_WRITE)
diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/permute.cl
deleted file mode 100644
index db9e7ecc25..0000000000
--- a/src/core/CL/cl_kernels/permute.cl
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
-/**Perform a permute operation on an input tensor of Shape DCHW.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- * @attention Permutation vector is passed as a preprocessor argument using -DP1, -DP2, -DP3 and -DP4, e.g. -DP1=2, -DP2=1, -DP3=0 and -DP4=3.
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: All
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void permute(TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- int out_index[4] = { 0 };
- int in_index[4] = { 0 };
-
- in_index[0] = get_global_id(0); // W
- in_index[1] = get_global_id(1); // H
- in_index[2] = get_global_id(2) % DEPTH_IN; // C
- in_index[3] = get_global_id(2) / DEPTH_IN; // B
-
- out_index[0] = in_index[P1];
- out_index[1] = in_index[P2];
- out_index[2] = in_index[P3];
- out_index[3] = in_index[P4];
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], out_index[3])) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
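The permute kernel is a pure index shuffle: the output coordinate on axis n is the input coordinate on axis Pn. A scalar C sketch of the mapping (hypothetical helper, not part of the library):

    /* Map an input WHCB index through the permutation vector {P1, P2, P3, P4},
     * as the kernel above does with out_index[n] = in_index[Pn]. */
    static void permute_index(const int in_index[4], const int p[4], int out_index[4])
    {
        for(int n = 0; n < 4; ++n)
        {
            out_index[n] = in_index[p[n]];
        }
    }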
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
deleted file mode 100644
index 0016775893..0000000000
--- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifdef SATURATE
-#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
-#else /* SATURATE */
-#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
-#endif /* SATURATE */
-#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
-
-#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT)
-
-#if defined(ACTIVATION_TYPE)
-#include "activation_float_helpers.h"
-#endif // defined(ACTIVATION_TYPE)
-
-#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT)
-#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
-
-/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
- *
- * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
- * @attention The data type of the intermediate result of the multiplication should be passed as well using -DACC_DATA_TYPE.
- * e.g. If one of inputs is S16 -DACC_DATA_TYPE=int should be passed else -DACC_DATA_TYPE=short.
- * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
- *
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source image in Z dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source image in Z dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] scale Float scaling factor. Supported data types: F32
- */
-__kernel void pixelwise_mul_float(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out),
- const float scale)
-{
- // Get pixels pointer
- size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
- size_t y = get_global_id(1);
- size_t z = get_global_id(2);
-
- __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
- __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
- __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
-
- // Load data
- VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr)), VEC_ACC_TYPE);
- VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr)), VEC_ACC_TYPE);
-
- // Perform multiplication
-#ifdef DATA_TYPE_FLOAT
- VEC_OUT_TYPE res0 = CONVERT(in1_data * in2_data * (ACC_DATA_TYPE)scale, VEC_OUT_TYPE);
-#else /* DATA_TYPE_FLOAT */
- VEC_OUT_TYPE res0 = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((CONVERT(in1_data * in2_data, VEC_FLOAT) * scale), VEC_ACC_TYPE, ROUND), VEC_OUT_TYPE, ROUND);
-#endif /* DATA_TYPE_FLOAT */
-
-#if defined(ACTIVATION_TYPE)
- res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE_OUT, res0, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */
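For integer inputs the float-scale path above widens to ACC_DATA_TYPE, multiplies, applies the float scale, then rounds and (with -DSATURATE) saturates back to the output type. A scalar C sketch for the S16 case with saturation (the actual rounding mode comes from -DROUND; round-to-nearest shown; hypothetical helper):

    #include <math.h>

    /* pixelwise_mul reference for S16 inputs with float scale: multiply in a
     * wider type, scale, round, then saturate to the S16 range. */
    static short pixelwise_mul_s16_ref(short a, short b, float scale)
    {
        float r = rintf((float)((int)a * (int)b) * scale);
        if(r < -32768.f) r = -32768.f;
        if(r > 32767.f)  r = 32767.f;
        return (short)r;
    }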
-
-#if defined(DATA_TYPE)
-
-/** Performs a pixelwise multiplication of complex float values
- *
- * @param[in] in1_ptr Pointer to the source image. Supported data types: F16/F32
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source image in Z dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source image in Z dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void pixelwise_mul_complex(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load data
- VEC_DATA_TYPE(DATA_TYPE, 2)
- vin1 = vload2(0, (__global DATA_TYPE *)in1.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- vin2 = vload2(0, (__global DATA_TYPE *)in2.ptr);
-
- // Perform complex multiplication
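-    // (a + ib) * (c + id) = (ac - bd) + i(ad + bc), where .x holds the real part and .y the imaginary part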
- VEC_DATA_TYPE(DATA_TYPE, 2)
-    res = { vin1.x * vin2.x - vin1.y * vin2.y, vin1.x * vin2.y + vin2.x * vin1.y };
-
-#if defined(ACTIVATION_TYPE)
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE_OUT, res, A_VAL, B_VAL), 0, (__global DATA_TYPE *)out.ptr);
-#else // defined(ACTIVATION_TYPE)
- // Store result
- vstore2(res, 0, (__global DATA_TYPE *)out.ptr);
-#endif // defined(ACTIVATION_TYPE)
-}
-
-#endif // defined(DATA_TYPE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
deleted file mode 100644
index ac5cabcb8c..0000000000
--- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(SATURATE)
-#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
-#else // SATURATE
-#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
-#endif // SATURATE
-#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
-
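-// The integer scale is applied as an arithmetic right shift of the product, i.e. a division by 2^scale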
-#define MUL_OP(x, y, scale, type, size) CONVERT_OP_INT((x) * (y) >> scale, type, size)
-
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-
-#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT)
-
-#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT)
-#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
-
-/** Performs a pixelwise multiplication with integer scale of integer inputs.
- *
- * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- *            e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=short -DDATA_TYPE_OUT=short
- * @attention The data type of the intermediate result of the multiplication must also be passed using -DACC_DATA_TYPE.
- *            e.g. if one of the inputs is S16, -DACC_DATA_TYPE=int should be passed; otherwise -DACC_DATA_TYPE=short.
- *
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/S16
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source image in Z dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source image in Z dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  scale                             Integer scaling factor, applied as a right shift of the product. Supported data types: S32.
- */
-__kernel void pixelwise_mul_int(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out),
- const uint scale)
-{
- size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
- size_t y = get_global_id(1);
- size_t z = get_global_id(2);
-
- __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
- __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
- __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
-
- // Load data
- VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr), VEC_ACC_TYPE);
- VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr), VEC_ACC_TYPE);
- // Perform multiplication and store result
- VEC_OUT_TYPE out_data0 = MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, VEC_SIZE_OUT);
- STORE_VECTOR_SELECT(out_data, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */
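-
-// Illustrative build options for pixelwise_mul_int above (in practice these are assembled on the host side):
-// a U8 x S16 -> S16 multiply with saturation and a vector size of 16 could be compiled with
-//   -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=short -DACC_DATA_TYPE=int -DDATA_TYPE_OUT=short
-//   -DVEC_SIZE_IN1=16 -DVEC_SIZE_IN2=16 -DVEC_SIZE_OUT=16 -DVEC_SIZE_LEFTOVER=0 -DSATURATE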
-
-#if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT)
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT)
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
-
-/** Performs a pixelwise multiplication with float scale of quantized inputs.
- *
- * @note The quantization offset of the first operand must be passed at compile time only if asymmetric using -DOFFSET_IN1, e.g. -DOFFSET_IN1=10
- * @note The quantization offset of the second operand must be passed at compile time only if asymmetric using -DOFFSET_IN2, e.g. -DOFFSET_IN2=10
- * @note The quantization offset of the output must be passed at compile time only if asymmetric using -DOFFSET_OUT, e.g. -DOFFSET_OUT=10
- * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, e.g. -DSCALE_IN1=10
- * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, e.g. -DSCALE_IN2=10
- * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, e.g. -DSCALE_OUT=10
- * @note To perform a saturating operation, -DSATURATE has to be passed to the compiler; otherwise a wrapping policy will be used.
- * @attention The output data type must be passed at compile time using -DDATA_TYPE_OUT, e.g. -DDATA_TYPE_OUT=uchar
- * @attention The vector size must be passed as a preprocessor argument using -DVEC_SIZE_OUT=size, e.g. -DVEC_SIZE_OUT=16
- *
- * @param[in] in1_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source image in Z dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source image in Z dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] scale Float scaling factor. Supported data types: F32
- */
-__kernel void pixelwise_mul_quantized(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out),
- const float scale)
-{
- size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
- size_t y = get_global_id(1);
- size_t z = get_global_id(2);
-
- __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
- __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
- __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
-
- // Load data
- VEC_INT in_a = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT);
- VEC_INT in_b = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_OUT *)in2_addr)), VEC_INT);
-
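-    // Overall requantization performed below, computed in F32:
-    //   q_out = ((q_in1 - OFFSET_IN1) * SCALE_IN1) * ((q_in2 - OFFSET_IN2) * SCALE_IN2) * scale / SCALE_OUT + OFFSET_OUT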
- // Dequantize
-#if defined(OFFSET_IN1)
- in_a -= (VEC_INT)((int)OFFSET_IN1);
-#endif // defined(OFFSET_IN1)
-#if defined(OFFSET_IN2)
- in_b -= (VEC_INT)((int)OFFSET_IN2);
-#endif // defined(OFFSET_IN2)
- const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
- const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
-
-#if defined(OFFSET_OUT)
- const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
-#else // defined(OFFSET_OUT)
- const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT);
-#endif // defined(OFFSET_OUT)
- const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
-
- // Store result
- STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif /* defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT) */
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
deleted file mode 100644
index 8944c9b1ac..0000000000
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ /dev/null
@@ -1,971 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "repeat.h"
-#include "tile_helpers.h"
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-#define POOL_OP(x, y) ((x) + (y))
-#else /* defined(POOL_AVG) || defined(POOL_L2) */
-#define POOL_OP(x, y) (fmax((x), (y)))
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
-#define POW2_OP(x, vec_size) ((x) * (x))
-#else /* defined(POOL_L2) */
-#define POW2_OP(x, vec_size) (x)
-#endif /* defined(POOL_L2) */
-
-#define DIV_OP(x, y) (x * (1.f / y))
-#define SQRT_OP(x) sqrt((x))
-
-#if STRIDE_X == 1
-#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output)
-#elif STRIDE_X == 3 /* STRIDE_X == 2 */
-#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output)
-#endif /* STRIDE_X == 3 */
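-
-// Note that POOLING3x3 is only defined for STRIDE_X equal to 1, 2 or 3, which is why the
-// optimized 3x3 kernel further below is guarded by #if defined(POOLING3x3)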
-
-#if defined(FP_MIXED_PRECISION)
-#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
-#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \
- CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
-#else /* defined(FP_MIXED_PRECISION) */
-#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
-#endif /* defined(FP_MIXED_PRECISION) */
-
-#define POOLING3x3_STRIDE1(res, input, output) \
- ({ \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
- data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
- data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
- data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \
- data00 = POW2_OP(data00, 4); \
- data01 = POW2_OP(data01, 2); \
- data10 = POW2_OP(data10, 4); \
- data11 = POW2_OP(data11, 2); \
- data20 = POW2_OP(data20, 4); \
- data21 = POW2_OP(data21, 2); \
- \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01212323); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01212323); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01212323); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \
- \
- values00 = POOL_OP(values00, values10); \
- values01 = POOL_OP(values01, values11); \
- values00 = POOL_OP(values00, values20); \
- values01 = POOL_OP(values01, values21); \
- \
- res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
- res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \
- })
-
-#define POOLING3x3_STRIDE2(res, input, output) \
- ({ \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
- ACC_DATA_TYPE data01 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
- ACC_DATA_TYPE data11 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
- ACC_DATA_TYPE data21 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8)); \
- data00 = POW2_OP(data00, 8); \
- data01 = POW2_OP(data01, 1); \
- data10 = POW2_OP(data10, 8); \
- data11 = POW2_OP(data11, 1); \
- data20 = POW2_OP(data20, 8); \
- data21 = POW2_OP(data21, 1); \
- \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01223445); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s667, data01); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01223445); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data10.s667, data11); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01223445); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data20.s667, data21); \
- \
- values00 = POOL_OP(values00, values10); \
- values01 = POOL_OP(values01, values11); \
- values00 = POOL_OP(values00, values20); \
- values01 = POOL_OP(values01, values21); \
- \
- res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
- res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \
- })
-
-#define POOLING3x3_STRIDE3(res, input, output) \
- ({ \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
- data00 = POW2_OP(data00, 8); \
- data01 = POW2_OP(data01, 4); \
- data10 = POW2_OP(data10, 8); \
- data11 = POW2_OP(data11, 4); \
- data20 = POW2_OP(data20, 8); \
- data21 = POW2_OP(data21, 4); \
- \
- data00 = POOL_OP(data00, data10); \
- data01 = POOL_OP(data01, data11); \
- data00 = POOL_OP(data00, data20); \
- data01 = POOL_OP(data01, data21); \
- \
- res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s147, data01.s2)); \
- res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s25, data01.s03)); \
- })
-
-ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x = get_global_id(0) * stride_x - pad_x;
- int start_y = get_global_id(1) * stride_y - pad_y;
- const int end_x = min(start_x + pool_size_x, upper_bound_w);
- const int end_y = min(start_y + pool_size_y, upper_bound_h);
-#if defined(EXCLUDE_PADDING)
- start_x = max(0, start_x);
- start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
- return ((end_y - start_y) * (end_x - start_x));
-}
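-
-// For example, a 3x3 average pool with stride 1 and pad 1 at the top-left corner gives
-// start_x = start_y = -1 and end_x = end_y = 2: with -DEXCLUDE_PADDING the divisor is
-// (2 - 0) * (2 - 0) = 4, otherwise (2 - (-1)) * (2 - (-1)) = 9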
-
-/** Performs a pooling function of pool size equal to 2.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_2(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- // Load data
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
- data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
- data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 = POW2_OP(data0, 2);
- data1 = POW2_OP(data1, 2);
-#endif /* defined(POOL_L2) */
-
- // Perform calculations
- data0 = POOL_OP(data0, data1);
- ACC_DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- // Divide by pool region in case of average or l2 pooling
- res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
-}
-
-/** Performs a pooling function of pool size equal to 3
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_3(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- // Load data
- VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
- data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
- VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
- data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
- VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
- data2 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 = POW2_OP(data0, 3);
- data1 = POW2_OP(data1, 3);
- data2 = POW2_OP(data2, 3);
-#endif /* defined(POOL_L2) */
-
- // Perform calculations
- data0 = POOL_OP(data0, data1);
- data0 = POOL_OP(data0, data2);
- ACC_DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-    // Divide by pool region in case of average or L2 pooling
- res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
-}
-
-#if defined(POOLING3x3)
-
-#define CONVERT_OP(data_type) convert_##data_type##4
-#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
-
-VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
-calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
- int start_y = get_global_id(1) * stride_y - pad_y;
- const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w);
- const int end_y = min(start_y + pool_size, upper_bound_h);
-#if defined(EXCLUDE_PADDING)
- start_x = max((int4)0, start_x);
- start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
- return (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(ACC_DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
-}
-
-/** Performs an optimized pooling function of pool size equal to 3 when stride_x is less than or equal to 3
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_optimized_3(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
- res;
-
- // Perform pooling 3x3 for 4 output elements
- POOLING3x3(res, input, output);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-    // Divide by pool region in case of average or L2 pooling
- res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- vstore4(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)output.ptr);
-}
-#endif // defined(POOLING3x3)
-
-#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-
-/** Performs a pooling function of pool size equal to N (NCHW)
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG must be provided otherwise max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_MxN_nchw(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
- vdata = INITIAL_VALUE;
- ACC_DATA_TYPE sdata = INITIAL_VALUE;
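-
-    // INITIAL_VALUE is expected to be the identity of POOL_OP, e.g. 0 for average pooling
-    // and the data type's lowest value for max pooling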
-
- // Load data
- for(int y = 0; y < POOL_SIZE_Y; y++)
- {
- int x = 0;
- for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
- {
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
- data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
-#endif /* defined(POOL_L2) */
- vdata = POOL_OP(vdata, data0);
- }
-
- // Leftover
- for(; x < (int)POOL_SIZE_X; ++x)
- {
- ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
-#endif /* defined(POOL_L2) */
- sdata = POOL_OP(sdata, data0);
- }
- }
-
- // Reduce result
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
- reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
- reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
- ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
- res = POOL_OP(res, sdata);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-    // Divide by pool region in case of average or L2 pooling
- res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
-}
-#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
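-/* Computes, for the current pooling window, the index of its first element on the top and bottom
- * rows as if the tensor had no padding; used by the kernels below to return max-value indices. */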
-inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom)
-{
- const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
- const int pad_vert = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM;
-
- const int x = get_global_id(0) * STRIDE_X;
- const int y = get_global_id(1) * STRIDE_Y;
- const int z = get_global_id(2);
-
- //x axis: width, y axis: height, z axis: component
- const uint padded_offset = input->offset_first_element_in_bytes
- + x * input->stride_x
- + y * input->stride_y
- + z * input->stride_z;
-
- const uint offset_base = padded_offset
- - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */
- - PAD_TENSOR_TOP * input->stride_y /* top padding */
- - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */
- - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
-
-#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT)
- *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT));
-#else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
- *offset_top = (uint)(offset_base / sizeof(DATA_TYPE));
-#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
-
- *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
-
- return;
-}
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
-/** Performs a MAX pooling of pool size equal to 2, and records the max value indices for NCHW.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F32
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Tensor padding values must be passed at compile time using -DPAD_TENSOR_LEFT, -DPAD_TENSOR_RIGHT, -DPAD_TENSOR_TOP and -DPAD_TENSOR_BOTTOM
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
- * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
- * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
- * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
- * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
- */
-__kernel void pooling_layer_2_nchw_indices_fp32(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- TENSOR3D_DECLARATION(indices))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
- Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
-
- // Load data
- float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0));
- float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
-
- // Perform calculations
- float data0_max = POOL_OP(data0.s0, data0.s1);
- float data1_max = POOL_OP(data1.s0, data1.s1);
- float res = POOL_OP(data0_max, data1_max);
- // Store result
- *(__global float *)output.ptr = res;
-
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
- uint offset_top = 0;
- uint offset_bottom = 0;
-
- offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
-
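-    // Select the flat unpadded index of the max within each row, then across the two rows;
-    // on equality (isgreaterequal) the left/top element wins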
- uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
- uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
- uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
-
- *(__global uint *)indices.ptr = index;
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-}
-
-/** Performs a MAX pooling of pool size equal to 2, and records the max value indices for NCHW.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Tensor padding values must be passed at compile time using -DPAD_TENSOR_LEFT, -DPAD_TENSOR_RIGHT, -DPAD_TENSOR_TOP and -DPAD_TENSOR_BOTTOM
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
- * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
- * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
- * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
- * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
- */
-__kernel void pooling_layer_2_nchw_indices_fp16(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- TENSOR3D_DECLARATION(indices))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
- Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
-
- // Load data
- half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0));
- half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0));
-
- // Perform calculations
- half data0_max = POOL_OP(data0.s0, data0.s1);
- half data1_max = POOL_OP(data1.s0, data1.s1);
- half res = POOL_OP(data0_max, data1_max);
- // Store result
- *(__global half *)output.ptr = res;
-
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
- uint offset_top = 0;
- uint offset_bottom = 0;
-
- offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
-
- uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
- uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
- uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
-
- *(__global uint *)indices.ptr = index;
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-}
-
-#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
-
-#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-/** Performs a pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
- * -# max, -DPOOL_MAX must be passed at compile time
- * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
- * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
- *
- * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
- * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
- * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
- * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
- * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
- * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
- * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER, e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_MxN_nhwc(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
-    // Note: If C is not a multiple of VEC_SIZE, we shift back by VEC_SIZE_LEFTOVER elements so that the leftover elements are computed by get_global_id(0) == 0
-    // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller vector size. This operation is performed on the host side
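-    // For example, with 10 channels, VEC_SIZE=4 and VEC_SIZE_LEFTOVER=2, the three work-items
-    // along the channel axis cover channels [0,2), [2,6) and [6,10)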
- int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
- int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
-#if DST_BATCH_SIZE != 1
- // If batch size != 1, the batch size dimension is collapsed over the height dimension
- int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
- int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
-#else //DST_BATCH_SIZE != 1
- int idx_out_h = GET_SPATIAL_IDX(2, 1, 0);
- int idx_out_n = 0;
-#endif // DST_BATCH_SIZE != 1
-
- __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
-
- __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
- output_stride_w;
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
- res0 = INITIAL_VALUE;
-
- int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
- int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
-
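-    // Clamp the pooling window to the valid input region; e.g. with PAD_X=1 and idx_out_w=0,
-    // idx_in_w = -1 and pool_x_s = 1, so the padded column is skipped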
- int pool_x_s = max((int)0, -idx_in_w);
- int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
- int pool_y_s = max((int)0, -idx_in_h);
- int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
-
-#if defined(EXCLUDE_PADDING)
- int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
-#else // defined(EXCLUDE_PADDING)
- int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
-#endif // defined(EXCLUDE_PADDING)
-
- for(int y = pool_y_s; y < pool_y_e; ++y)
- {
- for(int x = pool_x_s; x < pool_x_e; ++x)
- {
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
- data0;
-#if defined(FP_MIXED_PRECISION)
- // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
- data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-#else // defined(FP_MIXED_PRECISION)
- data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
-#endif // defined(FP_MIXED_PRECISION)
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
-#endif // defined(POOL_L2)
- res0 = POOL_OP(res0, data0);
- }
- }
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
-#endif // defined(POOL_AVG) || defined(POOL_L2)
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res0 = SQRT_OP(res0);
-#endif // defined(POOL_L2)
-
- // Store result
-#if defined(FP_MIXED_PRECISION)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
- STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#else // defined(FP_MIXED_PRECISION)
- STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#endif // defined(FP_MIXED_PRECISION)
-}
-#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
-
-/** Performs a pooling layer with a pool size of 2. This OpenCL kernel can perform the following pooling types:
- * -# max, -DPOOL_MAX must be passed at compile time
- * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
- * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
- * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
- *
- * @note Data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
- * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
- * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
- * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
- * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
- * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] indices_ptr (Optional) Pointer to the indices tensor. Supported data types: U32
- * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes)
- * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes)
- * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes)
- * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes)
- * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
- */
-__kernel void pooling_layer_2x2_nhwc(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output)
-#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
- ,
- TENSOR4D_DECLARATION(indices)
-#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
-)
-{
- // Note: If C is not a multiple of VEC_SIZE, the accesses are shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements and the work-item with get_global_id(0) == 0 computes only the VEC_SIZE_LEFTOVER leftover elements
- // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller vector size. This operation is performed on the host side
- int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- int idx_out_w = get_global_id(1);
-#if DST_BATCH_SIZE != 1
- // If batch size != 1, the batch size dimension is collapsed over the height dimension
- int idx_out_h = get_global_id(2) % DST_HEIGHT;
- int idx_out_n = get_global_id(2) / DST_HEIGHT;
-#else // DST_BATCH_SIZE != 1
- int idx_out_h = get_global_id(2);
- int idx_out_n = 0;
-#endif // DST_BATCH_SIZE != 1
-
- int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
- int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
-
- __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
-
- __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * output_stride_w;
-
- int pool_x_s = max((int)0, -idx_in_w);
- int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
- int pool_y_s = max((int)0, -idx_in_h);
- int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
-
- int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
-
- int x0 = pool_x_s + idx_in_w;
- int y0 = pool_y_s + idx_in_h;
- int x1 = pool_x_e - 1 + idx_in_w;
- int y1 = pool_y_e - 1 + idx_in_h;
-
- REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
-
-#if defined(FP_MIXED_PRECISION)
- // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE differs from DATA_TYPE
- data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
- data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
- data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
- data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-#else // defined(FP_MIXED_PRECISION)
- data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
- data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
- data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
- data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
-#endif // defined(FP_MIXED_PRECISION)
-
-#if !defined(POOL_MAX)
- if(filter_size != 4)
- {
- SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
- SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
- SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
- SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);
-
- // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
- data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
- data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s));
- data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e));
- data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e));
- }
-#endif // !defined(POOL_MAX)
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
- data1 *= data1;
- data2 *= data2;
- data3 *= data3;
-#endif /* defined(POOL_L2) */
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
- res0 = data0;
- res0 = POOL_OP(res0, data1);
- res0 = POOL_OP(res0, data2);
- res0 = POOL_OP(res0, data3);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-#if defined(EXCLUDE_PADDING)
- res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
-#else // !defined(EXCLUDE_PADDING)
- res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4;
-#endif // defined(EXCLUDE_PADDING)
-#endif // defined(POOL_AVG) || defined(POOL_L2)
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res0 = SQRT_OP(res0);
-#endif // defined(POOL_L2)
-
- // Store result
-#if defined(FP_MIXED_PRECISION)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
- STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#else // defined(FP_MIXED_PRECISION)
- STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#endif // defined(FP_MIXED_PRECISION)
-
-#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
-
- // This part is used to return the index of the maximum value
- // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for both the input and output tensors, as they match for pooling
-
- // Note: The batch dimension does not contribute to the index offset
- VEC_DATA_TYPE(uint, VEC_SIZE)
- base_index = (uint)idx_out_c;
-
- base_index += VEC_OFFS(uint, VEC_SIZE);
-
- VEC_DATA_TYPE(uint, VEC_SIZE)
- index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
- VEC_DATA_TYPE(uint, VEC_SIZE)
- index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
- VEC_DATA_TYPE(uint, VEC_SIZE)
- index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
- VEC_DATA_TYPE(uint, VEC_SIZE)
- index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
-
- index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
- index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
- index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
-
- __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n * indices_stride_w;
-
- // Store result
- STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
-#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
-}
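-
-/* Illustration (sketch, not part of the kernel; values chosen for the
- * example): the extracted max index is the linear NHWC offset of the winning
- * element, with the batch dimension excluded:
- *
- *   index = c + x * DST_CHANNELS + y * DST_CHANNELS * SRC_WIDTH
- *
- * e.g. with DST_CHANNELS = 32 and SRC_WIDTH = 8, the element at
- * (c = 5, x = 3, y = 2) maps to 5 + 3 * 32 + 2 * 32 * 8 = 613.
- */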
-#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
deleted file mode 100644
index d8cef2b4e6..0000000000
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(INITIAL_VALUE)
-#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
-#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE)
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \
- { \
- const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \
- const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \
- res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_TYPE(VEC_SIZE)); \
- }
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
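-
-/* Illustration (sketch; values chosen for the example): REQUANTIZE above
- * dequantizes with the input quantization info and quantizes again with the
- * output one:
- *
- *   real  = (q_in - in_offset) * in_scale
- *   q_out = round(real / out_scale + out_offset)
- *
- * e.g. q_in = 140, in_offset = 128, in_scale = 0.05f gives real = 0.6f;
- * with out_scale = 0.1f, out_offset = 10 the result is round(6 + 10) = 16,
- * saturated to the DATA_TYPE range by CONVERT_SAT.
- */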
-
-#if defined(POOL_AVG)
-#define POOL_OP(x, y) ((x) + (y))
-#else /* defined(POOL_AVG) */
-#define POOL_OP(x, y) (max((x), (y)))
-#endif /* defined(POOL_AVG) */
-
-#define DIV_OP(x, y) (x * (1.f / y))
-
-#if defined(POOL_L2)
-#error "L2 pooling is not supported"
-#endif /* defined(POOL_L2) */
-
-int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x = get_global_id(0) * stride_x - pad_x;
- int start_y = get_global_id(1) * stride_y - pad_y;
- const int end_x = min(start_x + pool_size_x, upper_bound_w);
- const int end_y = min(start_y + pool_size_y, upper_bound_h);
-#if defined(EXCLUDE_PADDING)
- start_x = max(0, start_x);
- start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
- return ((end_y - start_y) * (end_x - start_x));
-}
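-
-/* Illustration (sketch; values chosen for the example): with a 3x3 pool,
- * stride 1, pad 1 and -DEXCLUDE_PADDING, the work-item at (0, 0) gets
- *
- *   start_x = 0 * 1 - 1 = -1  ->  clamped to 0
- *   end_x   = min(-1 + 3, upper_bound_w) = 2
- *
- * and likewise in y, so the average divides by (2 - 0) * (2 - 0) = 4 rather
- * than 9: only the in-bounds samples contribute.
- */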
-
-/** Performs a pooling function of pool size equal to N (NCHW)
- *
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG must be provided otherwise max pooling will be performed.
- * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- * @note Input data type must be passed at compile time using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void pooling_layer_MxN_quantized_nchw(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- int8 vdata = INITIAL_VALUE;
- int sdata = INITIAL_VALUE;
-
- // Load data
- for(int y = 0; y < POOL_SIZE_Y; y++)
- {
- int x = 0;
- for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
- {
- VEC_TYPE(8)
- data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
- int8 data0 = convert_int8(data);
- vdata = POOL_OP(vdata, data0);
- }
-
- // Leftover
- for(; x < (int)POOL_SIZE_X; ++x)
- {
- DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
- int data0 = convert_int(data);
- sdata = POOL_OP(sdata, data0);
- }
- }
-
- // Reduce result
- int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
- int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
- int res = POOL_OP(reduce2.s0, reduce2.s1);
- res = POOL_OP(res, sdata);
-
-#if defined(POOL_AVG)
- res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
-#endif /* defined(POOL_AVG) */
-
- DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE);
-
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
-
- const float result_f32 = convert_float(result_q8);
- const float input_offset = (float)OFFSET_IN1;
- const float input_scale = (float)SCALE_IN1;
- const float scale_out = (float)SCALE_OUT;
- const float offset_out = (float)OFFSET_OUT;
- const float in_f32 = (result_f32 - input_offset) * input_scale;
- const float out_f32 = in_f32 / scale_out + offset_out;
- result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);
-
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
-
- *(__global DATA_TYPE *)output.ptr = result_q8;
-}
-
-#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
-/** Performs a pooling layer with a pool size of MxN. This OpenCL kernel can perform the following pooling types:
- * -# max, -DPOOL_MAX must be passed at compile time
- * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
- *
- * @note Data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=uchar. Supported data types are QASYMM8/QASYMM8_SIGNED
- * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=int
- * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
- * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
- * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
- * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- * @note If the output has to be requantized, -DOFFSET_IN1, -DOFFSET_OUT, -DSCALE_IN1 and -DSCALE_OUT must be passed at compile time
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void pooling_layer_MxN_quantized_nhwc(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
- // Note: If C is not a multiple of VEC_SIZE, the accesses are shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements and the work-item with get_global_id(0) == 0 computes only the VEC_SIZE_LEFTOVER leftover elements
- // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller vector size. This operation is performed on the host side
- int offset_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
- int idx_out_w = get_global_id(1);
-#if DST_BATCH_SIZE != 1
- // If batch size != 1, the batch size dimension is collapsed over the height dimension
- int idx_out_h = get_global_id(2) % DST_HEIGHT;
- int idx_out_n = get_global_id(2) / DST_HEIGHT;
-#else // DST_BATCH_SIZE != 1
- int idx_out_h = get_global_id(2);
- int idx_out_n = 0;
-#endif // DST_BATCH_SIZE != 1
-
- int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
- int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
-
- __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + offset_c + idx_out_n * input_stride_w;
-
- __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + offset_c + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * output_stride_w;
-
- int pool_x_s = max((int)0, -idx_in_w);
- int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
- int pool_y_s = max((int)0, -idx_in_h);
- int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
-
-#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
- int filter_size = 0;
-#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
- int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
-#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING)
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
- res0 = INITIAL_VALUE;
-
- for(int y = pool_y_s; y < pool_y_e; ++y)
- {
- for(int x = pool_x_s; x < pool_x_e; ++x)
- {
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data;
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
- data0;
-
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
- data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-
- res0 = POOL_OP(res0, data0);
-
-#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
- filter_size++;
-#endif // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
- }
- }
-
-#if defined(POOL_AVG)
- res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size;
-#endif // defined(POOL_AVG)
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
- REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0);
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
-
- // Store result
- STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
-}
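-
-/* Illustration (sketch): the average above uses rounded integer division,
- * (res + n / 2) / n, rather than truncation; e.g. a window sum of 7 over 4
- * samples yields (7 + 2) / 4 = 2 instead of 7 / 4 = 1.
- */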
-#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
-#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/prior_box_layer.cl b/src/core/CL/cl_kernels/prior_box_layer.cl
deleted file mode 100644
index de10decdec..0000000000
--- a/src/core/CL/cl_kernels/prior_box_layer.cl
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3)
-
-/** Compute prior boxes and clip (NCHW)
- *
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] idx Index to write to
- * @param[in] center_x Center value of the x axis
- * @param[in] center_y Center value of the y axis
- * @param[in] box_width Prior box width
- * @param[in] box_height Prior box height
- *
- */
-inline void calculate_xy_min_max_nchw(Image *out, int idx, float center_x, float center_y, float box_width, float box_height)
-{
- float xmin = (center_x - box_width / 2.f) / WIDTH;
- float ymin = (center_y - box_height / 2.f) / HEIGHT;
- float xmax = (center_x + box_width / 2.f) / WIDTH;
- float ymax = (center_y + box_height / 2.f) / HEIGHT;
-
-#if defined(CLIP)
- xmin = clamp(xmin, 0.f, 1.f);
- ymin = clamp(ymin, 0.f, 1.f);
- xmax = clamp(xmax, 0.f, 1.f);
- ymax = clamp(ymax, 0.f, 1.f);
-#endif // defined(CLIP)
-
- // Store result
- vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(xmin, ymin, xmax, ymax), 0, ((__global DATA_TYPE *)offset(out, idx + 0, 0)));
-}
-
-/** Compute prior boxes (NCHW)
- *
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] min_size Prior box min size
- * @param[in] min_idx Index of the min vector
- * @param[in] idx Index to write to
- *
- * @return The updated index
- */
-inline int calculate_min_nchw(Image *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx)
-{
- const float center_x = ((float)(get_global_id(0) % LAYER_WIDTH) + OFFSET) * STEP_X;
- const float center_y = ((float)(get_global_id(0) / LAYER_WIDTH) + OFFSET) * STEP_Y;
-
- float box_width = min_size;
- float box_height = min_size;
- calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
- idx += 4;
-
- if(max_size > 0)
- {
- box_width = sqrt(min_size * max[min_idx]);
- box_height = box_width;
- calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
- idx += 4;
- }
- for(unsigned int i = 0; i < aspect_ratios_size; ++i)
- {
- if(fabs(aspect_ratios[i] - 1.f) < 1e-6f)
- {
- continue;
- }
- box_width = min_size * sqrt(aspect_ratios[i]);
- box_height = min_size * rsqrt(aspect_ratios[i]);
-
- calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
- idx += 4;
- }
-
- return idx;
-}
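-
-/* Illustration (sketch; values chosen for the example): for an aspect ratio
- * ar, the box above has width min_size * sqrt(ar) and height
- * min_size / sqrt(ar), which keeps the box area at min_size^2; e.g.
- * min_size = 30 and ar = 2 give a box of roughly 42.43 x 21.21.
- */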
-/** Calculate prior boxes with NCHW format.
- *
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] min The minimum values
- * @param[in] max The maximum values
- * @param[in] aspect_ratios The aspect ratio values
- * @param[in] min_size The number of minimum values
- * @param[in] max_size The number of maximum values
- * @param[in] aspect_ratios_size The number of aspect ratio values
- */
-__kernel void prior_box_layer_nchw(IMAGE_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size,
- unsigned int aspect_ratios_size)
-{
- Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
- int idx = 0;
- for(unsigned int i = 0; i < min_size; ++i)
- {
- idx = calculate_min_nchw(&out, max, aspect_ratios, max_size, aspect_ratios_size, min[i], i, idx);
- }
-
- // Store variances
- for(int i = 0; i < (NUM_PRIORS * 4); i += 4)
- {
- vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(VARIANCE_0, VARIANCE_1, VARIANCE_2, VARIANCE_3), 0, ((__global DATA_TYPE *)offset(&out, i, 1)));
- }
-}
-#endif /* defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) */
diff --git a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl b/src/core/CL/cl_kernels/qlstm_layer_normalization.cl
deleted file mode 100644
index 24cb111772..0000000000
--- a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-#if VEC_SIZE == 2
-#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 2)
-#define PERFORM_REDUCTION_IMPL(type) \
- inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 2) sum) \
- { \
- sum.s0 += sum.s1; \
- return sum.s0; \
- }
-#elif VEC_SIZE == 4
-#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 4)
-#define PERFORM_REDUCTION_IMPL(type) \
- inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 4) sum) \
- { \
- sum.s01 += sum.s23; \
- sum.s0 += sum.s1; \
- return sum.s0; \
- }
-#elif VEC_SIZE == 8
-#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 8)
-#define PERFORM_REDUCTION_IMPL(type) \
- inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 8) sum) \
- { \
- sum.s0123 += sum.s4567; \
- sum.s01 += sum.s23; \
- sum.s0 += sum.s1; \
- return sum.s0; \
- }
-#else /* VEC_SIZE DEFAULT */
-#define VEC_SIZE 16
-#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 16)
-#define PERFORM_REDUCTION_IMPL(type) \
- inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 16) sum) \
- { \
- sum.s01234567 += sum.s89abcdef; \
- sum.s0123 += sum.s4567; \
- sum.s01 += sum.s23; \
- sum.s0 += sum.s1; \
- return sum.s0; \
- }
-#endif /* VEC_SIZE END */
-
-#define PERFORM_REDUCTION_STR(input, type) perform_reduction_##type(input)
-#define PERFORM_REDUCTION(input, type) PERFORM_REDUCTION_STR(input, type)
-
-PERFORM_REDUCTION_IMPL(int)
-PERFORM_REDUCTION_IMPL(long)
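-
-/* Illustration (sketch): the reductions above repeatedly fold the upper half
- * of the vector onto the lower half; for VEC_SIZE = 4 and sum = (1, 2, 3, 4):
- *
- *   sum.s01 += sum.s23;  // (4, 6, 3, 4)
- *   sum.s0  += sum.s1;   // 10
- */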
-
-/** Compute quantized multiplier and shift for the inverse square root of input.
- * Using 3-bit fixed point and 5 iterations of the Newton-Raphson method.
- *
- * @param[in] in Input to use
- * @param[in] reverse_shift -1 to reverse the shift direction
- *
- * @return:
- * .s0 Quantized multiplier for inverse square root
- * .s1 Shift for inverse square root
- *
- */
-inline int2 get_invsqrt_quantized_multiplier_exp(int in, int reverse_shift)
-{
- int2 stddev_inv;
- int stddev_inv_multiplier = INT_MAX;
- int stddev_inv_shift = 0;
- int input = in;
- if(input <= 1)
- {
- stddev_inv.s0 = stddev_inv_multiplier;
- stddev_inv.s1 = stddev_inv_shift;
- return stddev_inv;
- }
-
- stddev_inv_shift = 11;
- while(input >= (1 << 29))
- {
- input /= 4;
- ++stddev_inv_shift;
- }
-
- const unsigned int max_left_shift_bits = clz(input) - 1;
- const unsigned int max_left_shift_bits_pairs = max_left_shift_bits / 2;
- const unsigned int left_shift_bit_pairs = max_left_shift_bits_pairs - 1;
- stddev_inv_shift -= left_shift_bit_pairs;
- input <<= 2 * left_shift_bit_pairs;
-
- typedef int FixedPointRawType;
- const unsigned int fixedpoint_position = 3;
- const unsigned int fixedpoint_int_position = sizeof(FixedPointRawType) * 8 - 1 - fixedpoint_position;
- typedef FixedPointRawType FixedPoint3;
- typedef FixedPointRawType FixedPoint0;
-
- const FixedPoint3 fixedpoint_input = (input >> 1);
- const FixedPoint3 fixedpoint_half_input = ASYMM_ROUNDING_DIVIDE_BY_POW2(fixedpoint_input, 1, 1);
- const FixedPoint3 fixedpoint_half_three = (0x1 << fixedpoint_int_position) + (0x1 << (fixedpoint_int_position - 1));
- FixedPoint3 x = 0x1 << fixedpoint_int_position;
-
- const int num_iteration = 5;
- for(int i = 0; i < num_iteration; i++)
- {
- int x3 = ASYMM_RESCALE(ASYMM_MULT(ASYMM_MULT(x, x, 1), x, 1), 9, fixedpoint_position, 1);
- x = ASYMM_RESCALE(ASYMM_MULT(fixedpoint_half_three, x, 1) - ASYMM_MULT(fixedpoint_half_input, x3, 1), 6, fixedpoint_position, 1);
- }
- const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250;
- x = ASYMM_MULT(fixedpoint_half_sqrt_2, x, 1);
- stddev_inv_multiplier = x;
- if(stddev_inv_shift < 0)
- {
- stddev_inv_multiplier <<= -stddev_inv_shift;
- stddev_inv_shift = 0;
- }
- stddev_inv_shift *= reverse_shift;
-
- stddev_inv.s0 = stddev_inv_multiplier;
- stddev_inv.s1 = stddev_inv_shift;
- return stddev_inv;
-}
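-
-/* Illustration (sketch): the loop above is the Newton-Raphson update for
- * 1 / sqrt(a),
- *
- *   x_{n+1} = x_n * (3 - a * x_n^2) / 2 = 1.5 * x_n - 0.5 * a * x_n^3,
- *
- * which is why it combines fixedpoint_half_three (1.5 in the fixed-point
- * format used here) with fixedpoint_half_input (a / 2) and the cube term x3.
- */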
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
-/** This function implements QLSTM layer normalization.
- *
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Width of the input tensor should be passed using the -DWIDTH compile flag, e.g. -DWIDTH=16
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QSYMM16
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] weight_ptr Pointer to the weight tensor. Supported data type: same as @p input_ptr
- * @param[in] weight_stride_x Stride of the weight tensor in X dimension (in bytes)
- * @param[in] weight_step_x weight_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weight_offset_first_element_in_bytes The offset of the first element in the weight tensor
- * @param[in] bias_ptr Pointer to the bias tensor. Supported data type: S32
- * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes)
- * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void qlstm_layer_normalization(
- IMAGE_DECLARATION(input),
- VECTOR_DECLARATION(weight),
- VECTOR_DECLARATION(bias),
- IMAGE_DECLARATION(output))
-{
- // Get pixels pointer
- Image input = CONVERT_TO_IMAGE_STRUCT(input);
- Vector weight = CONVERT_TO_VECTOR_STRUCT(weight);
- Vector bias = CONVERT_TO_VECTOR_STRUCT(bias);
- Image output = CONVERT_TO_IMAGE_STRUCT(output);
-
- VEC_DATA_TYPE(int, VEC_SIZE)
- sum = 0;
- VEC_DATA_TYPE(long, VEC_SIZE)
- sum_sq = 0;
- // Calculate partial sum
- int i = 0;
- for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
- {
- // Load data
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0));
-
- sum += CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE));
- sum_sq += CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE)) * CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE));
- }
- // Perform reduction
- sum.s0 = PERFORM_REDUCTION(sum, int);
- sum_sq.s0 = PERFORM_REDUCTION(sum_sq, long);
-
- // Left-overs loop
- for(; i < WIDTH; ++i)
- {
- DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0));
-
- sum.s0 += CONVERT(data, int);
- sum_sq.s0 += CONVERT(data, long) * CONVERT(data, long);
- }
-
- int temp = 0x100000 / WIDTH;
- int mean = (int)(sum.s0 * 1024 / WIDTH);
- int var2 = ((sum_sq.s0 * (long)temp) - ((long)mean * (long)mean)) / 0x100000;
- int2 stddev_inv = get_invsqrt_quantized_multiplier_exp(var2, -1);
-
- i = 0;
- for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
- {
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0));
- VEC_DATA_TYPE(int, VEC_SIZE)
- res = CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE)) * 1024 - mean;
- res = multiply_by_quantized_multiplier(res, stddev_inv.s0, stddev_inv.s1);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- w = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)vector_offset(&weight, i));
- res = res * CONVERT(w, VEC_DATA_TYPE(int, VEC_SIZE));
- res = res + VLOAD(VEC_SIZE)(0, (__global int *)vector_offset(&bias, i));
- // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024;
- res = (res + 512) >> 10;
- res = multiply_by_quantized_multiplier(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12);
-#if defined(MIN_BOUND)
- res = max(res, (VEC_DATA_TYPE(int, VEC_SIZE))MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
- res = min(res, (VEC_DATA_TYPE(int, VEC_SIZE))MAX_BOUND);
-#endif // defined(MAX_BOUND)
- VSTORE(VEC_SIZE)
- (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)offset(&output, i, 0));
- }
- for(; i < WIDTH; ++i)
- {
- DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0));
- int res = (int)data * 1024 - mean;
- res = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, stddev_inv.s0, stddev_inv.s1, 1);
- DATA_TYPE w = *((__global DATA_TYPE *)vector_offset(&weight, i));
- res = res * (int)w;
- int b = *((__global int *)vector_offset(&bias, i));
- res = res + b;
- // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024;
- res = (res + 512) >> 10;
- res = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12, 1);
-#if defined(MIN_BOUND)
- res = max(res, MIN_BOUND);
-#endif // defined(MIN_BOUND)
-#if defined(MAX_BOUND)
- res = min(res, MAX_BOUND);
-#endif // defined(MAX_BOUND)
- *((__global DATA_TYPE *)offset(&output, i, 0)) = (DATA_TYPE)res;
- }
-}
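-
-/* Illustration (sketch): the kernel above normalizes in Q10 fixed point:
- * inputs are scaled by 1024 = 2^10, so mean = sum * 1024 / WIDTH, and the
- * variance term divides by 0x100000 = 2^20 to undo the squared scale. The
- * final (res + 512) >> 10 is a rounded division by 1024 back to the integer
- * domain.
- */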
-#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */ \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/quantization_layer.cl b/src/core/CL/cl_kernels/quantization_layer.cl
deleted file mode 100644
index 3538dae5f0..0000000000
--- a/src/core/CL/cl_kernels/quantization_layer.cl
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x)))
-#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size)
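-
-/* Illustration (sketch): _rte rounds to nearest, ties to even, so
- * convert_int_rte(2.5f) == 2 while convert_int_rte(3.5f) == 4; this avoids
- * the upward bias that round-half-up would introduce when quantizing.
- */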
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL)
-
-/** This performs the quantization of floating point inputs or 8-bit quantized integers to 8-bit integers.
- *
- * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE_IN=type. e.g. -DDATA_TYPE_IN=half
- * @note Output data type should be given as a preprocessor argument using -DDATA_TYPE_OUT=type. e.g. -DDATA_TYPE_OUT=uchar
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Quantization scale should be given as a preprocessor argument using -DSCALE=scale. e.g. -DSCALE=0.125
- * @note Quantization offset should be given as a preprocessor argument using -DOFFSET=offset. e.g. -DOFFSET=125
- * @note Minimum value for quantized type should be given as a preprocessor argument using -DMIN_QUANT_VAL=value. e.g. -DMIN_QUANT_VAL=0
- * @note Maximum value for quantized type should be given as a preprocessor argument using -DMAX_QUANT_VAL=value. e.g. -DMAX_QUANT_VAL=255
- * @note If the input data type is a floating point type (F16 or F32), the preprocessor argument -DIS_FLOAT should be given
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void quantization_layer(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
- output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
-
- // Load data
-#if defined(IS_FLOAT)
- // Load data
- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
- val_float = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
-
- // Create scale and offset vectors
- const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = SCALE;
- const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET;
-#else // defined(IS_FLOAT)
- // Load data
- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
- val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
-
- const VEC_DATA_TYPE(float, VEC_SIZE)
- val_float = CONVERT(val, VEC_DATA_TYPE(float, VEC_SIZE));
-
- // Create scale and offset vectors
- const VEC_DATA_TYPE(float, VEC_SIZE) vscale = SCALE;
- const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET;
-#endif // defined(IS_FLOAT)
-
- // Quantize
- VEC_DATA_TYPE(int, VEC_SIZE)
- res = CLAMP(CONVERT_RTE_VEC(val_float / vscale, int, VEC_SIZE) + voffset, MIN_QUANT_VAL, MAX_QUANT_VAL);
-
- // Store result
- VSTORE(VEC_SIZE)
- (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
-#else //!defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
- *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(CONVERT_RTE(((float) * (__global DATA_TYPE_IN *)input.ptr) / ((float)SCALE), int) + (int)OFFSET, MIN_QUANT_VAL, MAX_QUANT_VAL);
-#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
-}
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL)
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
deleted file mode 100644
index 9f2c6e23b5..0000000000
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ /dev/null
@@ -1,460 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "helpers_asymm.h"
-
-#if defined(FLOAT_DATA_TYPE)
-#define ISGREATER(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isgreater(x, y))
-#define ISLESS(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isless(x, y))
-#define ISGREATER_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isgreater(x, y))
-#define ISLESS_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isless(x, y))
-#else // !FLOAT_DATA_TYPE
-#if defined(WIDTH)
-#define ISGREATER(x, y) (x > y) ? 1 : 0
-#define ISLESS(x, y) (x < y) ? 1 : 0
-#define ISGREATER_SCALAR ISGREATER
-#define ISLESS_SCALAR ISLESS
-#else // !defined(WIDTH)
-#define ISGREATER(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x > y)
-#define ISLESS(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x < y)
-#endif // defined(WIDTH)
-#endif // defined(FLOAT_DATA_TYPE)
-
-#if defined(WIDTH)
-#if defined(OPERATION)
-
-#define sum(in0, in1, size) (in0 + SUM_REDUCE(in1, size))
-#define square_sum(in0, in1, size) (in0 + SUM_REDUCE((in1 * in1), size))
-#define product(in0, in1, size) (in0 * PROD_REDUCE(in1, size))
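-
-/* Illustration (sketch): -DOPERATION selects one of the macros above by
- * name; with -DOPERATION=square_sum the accumulation in the kernel below
- * expands to
- *
- *   res = (res + SUM_REDUCE((vals * vals), VEC_SIZE));
- *
- * i.e. each vector is squared element-wise, horizontally reduced and added
- * to the running scalar.
- */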
-
-/** This kernel performs parallel reduction given an operation on x-axis.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
- * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
- * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
- * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] output_ptr Pointer to the destination tensor. Supported data types: same as @p input
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reduction_operation_x(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- int y = get_global_id(1);
- int z = get_global_id(2);
-
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + y * input_stride_y + z * input_stride_z;
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + y * output_stride_y + z * output_stride_z;
-
-#if defined(PROD)
- DATA_TYPE res = (DATA_TYPE)1;
-#else // defined(PROD)
- DATA_TYPE res = (DATA_TYPE)0;
-#endif // defined(PROD)
-
- int x = 0;
-
- for(; x <= (WIDTH - VEC_SIZE); x += VEC_SIZE)
- {
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
- res = OPERATION(res, vals, VEC_SIZE);
- }
-
-#if(WIDTH % VEC_SIZE)
- _Pragma("unroll") for(; x < WIDTH; ++x)
- {
- DATA_TYPE val = *((__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
- res = OPERATION(res, val, 1);
- }
-#endif // (WIDTH % VEC_SIZE)
-
-#if defined(MEAN)
- res /= WIDTH;
-#endif // defined(MEAN)
- *((__global DATA_TYPE *)output_addr) = res;
-}
-#endif // defined(OPERATION)
-/** This kernel performs a reduction on the x-axis (non-parallel).
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
- * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8/QASYMM8_SIGNED for operation MEAN
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination vector. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reduction_operation_non_parallel_x(
- VECTOR_DECLARATION(input),
- VECTOR_DECLARATION(output))
-{
- Vector input = CONVERT_TO_VECTOR_STRUCT(input);
- Vector output = CONVERT_TO_VECTOR_STRUCT(output);
-
- DATA_TYPE_PROMOTED res = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, 0)), DATA_TYPE_PROMOTED);
-
- // Convert input into F32 in order to perform quantized multiplication
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- float res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
- for(unsigned int x = 1; x < WIDTH; ++x)
- {
- DATA_TYPE_PROMOTED in = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, x)), DATA_TYPE_PROMOTED);
-#if defined(MIN)
- res = select(res, in, ISLESS_SCALAR(in, res));
-#elif defined(MAX)
- res = select(res, in, ISGREATER_SCALAR(in, res));
-#elif defined(PROD)
-#if defined(OFFSET) && defined(SCALE)
- res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
-#else // !(defined(OFFSET) && defined(SCALE))
- res *= in;
-#endif // defined(OFFSET) && defined(SCALE)
-#else // defined(SUM)
- res += in;
-#endif // defined(MAX) || defined(MIN) || defined(PROD)
- }
-
- // Store result
-#if defined(MEAN)
- res /= WIDTH;
-#endif // defined(MEAN)
-
- // Subtract the offsets in case of quantized SUM
-#if defined(SUM) && defined(OFFSET) && defined(SCALE)
- res -= (WIDTH - 1) * OFFSET;
-#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
-
- // Re-quantize
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
- *((__global DATA_TYPE *)output.ptr) = CONVERT_SAT(res, DATA_TYPE);
-}
-#endif // defined(WIDTH)
-
-#if defined(HEIGHT)
-/** This kernel performs a reduction on the y-axis.
- *
- * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
- * @note The vector size must be passed at compile time using -DVEC_SIZE e.g. -DVEC_SIZE=16
- * @note The leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER e.g. -DVEC_SIZE_LEFTOVER=3
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reduction_operation_y(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output))
-{
- int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- int y = get_global_id(1);
-
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y;
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y;
-
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
- res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
-
- // Convert input into F32 in order to perform quantized multiplication
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- VEC_DATA_TYPE(float, VEC_SIZE)
- res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
-#if defined(SUM_SQUARE)
- res *= res;
-#endif // defined(SUM_SQUARE)
-
- for(unsigned int y = 1; y < HEIGHT; ++y)
- {
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
- in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
-#if defined(MIN)
- res = select(res, in, ISLESS(in, res));
-#elif defined(MAX)
- res = select(res, in, ISGREATER(in, res));
-#else // !(defined(MAX) || defined(MIN))
-#if defined(SUM_SQUARE)
- in *= in;
-#endif // defined(SUM_SQUARE)
-#if defined(PROD)
-
-#if defined(OFFSET) && defined(SCALE)
- res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#else // !(defined(OFFSET) && defined(SCALE))
- res *= in;
-#endif // defined(OFFSET) && defined(SCALE)
-
-#else // !defined(PROD)
- res += in;
-#endif // defined(PROD)
-#endif // defined(MAX) || defined(MIN)
- }
-
-#if defined(MEAN)
- res /= HEIGHT;
-#endif // defined(MEAN)
-
- // Subtract the offsets in case of quantized SUM
-#if defined(SUM) && defined(OFFSET) && defined(SCALE)
- res -= (HEIGHT - 1) * OFFSET;
-#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
-
- // Re-quantize
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
- // Store result
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
- STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif // defined(HEIGHT)
-
-#if defined(DEPTH)
-/** This kernel performs a reduction on the z-axis.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
- * @note The vector size must be passed at compile time using -DVEC_SIZE e.g. -DVEC_SIZE=16
- * @note The leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER e.g. -DVEC_SIZE_LEFTOVER=3
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reduction_operation_z(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- int y = get_global_id(1);
- int z = get_global_id(2);
-
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z;
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z;
-
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
- res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
-
- // Convert input into F32 in order to perform quantized multiplication
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- VEC_DATA_TYPE(float, VEC_SIZE)
- res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
-#if defined(SUM_SQUARE)
- res *= res;
-#endif // defined(SUM_SQUARE)
-
- for(unsigned int z = 1; z < DEPTH; ++z)
- {
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
- in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
-
-#if defined(MIN)
- res = select(res, in, ISLESS(in, res));
-#elif defined(MAX)
- res = select(res, in, ISGREATER(in, res));
-#else // !(defined(MAX) || defined(MIN))
-#if defined(SUM_SQUARE)
- in *= in;
-#endif // defined(SUM_SQUARE)
-#if defined(PROD)
-
-#if defined(OFFSET) && defined(SCALE)
- res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#else // !(defined(OFFSET) && defined(SCALE))
- res *= in;
-#endif // defined(OFFSET) && defined(SCALE)
-
-#else // !defined(PROD)
- res += in;
-#endif // defined(PROD)
-#endif // defined(MAX) || defined(MIN)
- }
-
-#if defined(MEAN)
- res /= DEPTH;
-#endif // defined(MEAN)
-
- // Subtract the offsets in case of quantized SUM
-#if defined(SUM) && defined(OFFSET) && defined(SCALE)
- res -= (DEPTH - 1) * OFFSET;
-#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
-
- // Re-quantize
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
- // Store result
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
-
- STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif /* defined(DEPTH) */
-
-#if defined(BATCH) && defined(DEPTH)
-/** This kernel performs a reduction on the w-axis.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
- * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
- * @note The vector size must be passed at compile time using -DVEC_SIZE e.g. -DVEC_SIZE=16
- * @note The leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER e.g. -DVEC_SIZE_LEFTOVER=3
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reduction_operation_w(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
- int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- int y = get_global_id(1);
- int z = get_global_id(2);
-
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + (z % DEPTH) * input_stride_z + (z / DEPTH) * input_stride_w;
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + (z % DEPTH) * output_stride_z + (z / DEPTH) * output_stride_w;
-
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
- res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
-
- // Convert input into F32 in order to perform quantized multiplication
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- VEC_DATA_TYPE(float, VEC_SIZE)
- res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
-#if defined(SUM_SQUARE)
- res *= res;
-#endif // defined(SUM_SQUARE)
-
- for(unsigned int w = 1; w < BATCH; ++w)
- {
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
- in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + w * input_stride_w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
-
-#if defined(MIN)
- res = select(res, in, ISLESS(in, res));
-#elif defined(MAX)
- res = select(res, in, ISGREATER(in, res));
-#else // !(defined(MAX) || defined(MIN))
-#if defined(SUM_SQUARE)
- in *= in;
-#endif // defined(SUM_SQUARE)
-#if defined(PROD)
-
-#if defined(OFFSET) && defined(SCALE)
- res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#else // !(defined(OFFSET) && defined(SCALE))
- res *= in;
-#endif // defined(OFFSET) && defined(SCALE)
-
-#else // !defined(PROD)
- res += in;
-#endif // defined(PROD)
-#endif // defined(MAX) || defined(MIN)
- }
-
-#if defined(MEAN)
- res /= BATCH;
-#endif // defined(MEAN)
-
- // Subtract the offsets in case of quantized SUM
-#if defined(SUM) && defined(OFFSET) && defined(SCALE)
- res -= (BATCH - 1) * OFFSET;
-#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
-
- // Re-quantize
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
- // Store result
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
- STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif /* defined(BATCH) && defined(DEPTH) */
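
The four kernels deleted above share one pattern: accumulate along a single axis in the promoted data type, divide by the axis length when MEAN is defined, and correct quantized sums for the accumulated offsets. The following is a minimal host-side C sketch of the x-axis variant, assuming float data; the function and parameter names are illustrative, not library API.

    #include <stddef.h>

    /* Reference model of reduction_operation_x (sum variant). */
    static float reduce_sum_x(const float *row, size_t width, size_t vec_size)
    {
        float acc = 0.0f;
        size_t x = 0;
        /* Vectorised main loop: the kernel folds VEC_SIZE lanes per iteration. */
        for (; x + vec_size <= width; x += vec_size)
        {
            for (size_t lane = 0; lane < vec_size; ++lane)
            {
                acc += row[x + lane];
            }
        }
        /* Scalar tail for the WIDTH % VEC_SIZE leftover elements. */
        for (; x < width; ++x)
        {
            acc += row[x];
        }
        /* The MEAN variant divides by width here. The quantized SUM variant
         * subtracts (width - 1) * OFFSET: summing width offset-carrying values
         * accumulates width offsets, but the result must carry exactly one. */
        return acc;
    }
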
diff --git a/src/core/CL/cl_kernels/remap.cl b/src/core/CL/cl_kernels/remap.cl
deleted file mode 100644
index 8ea4e84e96..0000000000
--- a/src/core/CL/cl_kernels/remap.cl
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Copyright (c) 2017, 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-/** Performs a remapping of an input image to an output image, given two remapping images, using nearest-neighbour interpolation.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- * out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- */
-__kernel void remap_nearest_neighbour_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- IMAGE_DECLARATION(mapx),
- IMAGE_DECLARATION(mapy),
- const float width,
- const float height)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
- Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
-
- float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
- float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
- float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
- mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
-
- vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
-}
-
-/** Performs a remapping of an input image to an output image, given two remapping images, using bilinear interpolation.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- * out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- */
-__kernel void remap_bilinear_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- IMAGE_DECLARATION(mapx),
- IMAGE_DECLARATION(mapy),
- const float width,
- const float height)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
- Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
-
- float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
- float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
- float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
- mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
-
- vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
-}
-
-/** Performs a remapping of an input image to an output image, given two remapping images, using nearest-neighbour interpolation.
- * Also applies constant border value, "border_val", if "CONSTANT_BORDER" is set.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- * out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- */
-
-#if defined(DEPTH_OUT)
-
-__kernel void remap_nearest_neighbour_nhwc(
- TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out),
- TENSOR4D_DECLARATION(mapx),
- TENSOR4D_DECLARATION(mapy),
- const float width,
- const float height
-#ifdef CONSTANT_BORDER
- ,
- const DATA_TYPE border_val
-#endif // CONSTANT_BORDER
-)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
- Tensor4D mapx = CONVERT_TO_TENSOR4D_STRUCT(mapx, DEPTH_OUT);
- Tensor4D mapy = CONVERT_TO_TENSOR4D_STRUCT(mapy, DEPTH_OUT);
-
- float mapx_coord = (float) * (__global float *)mapx.ptr;
- float mapy_coord = (float) * (__global float *)mapy.ptr;
-
-#ifdef CONSTANT_BORDER
- if(mapx_coord < 0 || mapx_coord > width - 1 || mapy_coord < 0 || mapy_coord > height - 1)
- {
- *((__global DATA_TYPE *)out.ptr) = border_val;
- return;
- }
-#else // CONSTANT_BORDER
- mapx_coord = clamp(mapx_coord, 0.0f, width - 1);
- mapy_coord = clamp(mapy_coord, 0.0f, height - 1);
-#endif // CONSTANT_BORDER
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(mapx_coord), convert_int(mapy_coord), (get_global_id(2) / DEPTH_OUT)));
-}
-
-/** Performs a remapping of an input image to an output image, given two remapping images, using bilinear interpolation.
- * Also applies constant border value, "border_val", if "CONSTANT_BORDER" is set.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- * out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- */
-__kernel void remap_bilinear_nhwc(
- TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out),
- TENSOR4D_DECLARATION(mapx),
- TENSOR4D_DECLARATION(mapy),
- const float width,
- const float height
-#ifdef CONSTANT_BORDER
- ,
- const DATA_TYPE border_val
-#endif // CONSTANT_BORDER
-)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
- Tensor4D mapx = CONVERT_TO_TENSOR4D_STRUCT(mapx, DEPTH_OUT);
- Tensor4D mapy = CONVERT_TO_TENSOR4D_STRUCT(mapy, DEPTH_OUT);
-
- float mapx_coord = (float) * (__global float *)mapx.ptr;
- float mapy_coord = (float) * (__global float *)mapy.ptr;
-
-#ifdef CONSTANT_BORDER
- if(mapx_coord < 0 || mapx_coord > width - 1 || mapy_coord < 0 || mapy_coord > height - 1)
- {
- *((__global DATA_TYPE *)out.ptr) = border_val;
- return;
- }
-#endif // CONSTANT_BORDER
-
- const float new_xf = floor(mapx_coord);
- const float new_yf = floor(mapy_coord);
- const float clamped_x = clamp(new_xf, 0.0f, width - 1);
- const float clamped_x1 = clamp(new_xf + 1, 0.0f, width - 1);
- const float clamped_y = clamp(new_yf, 0.0f, height - 1);
- const float clamped_y1 = clamp(new_yf + 1, 0.0f, height - 1);
-
- float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
-
- const float a = mapx_coord - new_xf;
- const float b = 1.f - a;
- const float a1 = mapy_coord - new_yf;
- const float b1 = 1.f - a1;
- const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1));
-
- *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
-}
-
-#endif // DEPTH_OUT
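
For reference, the bilinear path above evaluates fr = s00*b*b1 + s10*a*b1 + s01*b*a1 + s11*a*a1, where a and a1 are the fractional parts of the map coordinates and b = 1 - a, b1 = 1 - a1. A self-contained C sketch with clamp-to-edge borders; clamp_read is an illustrative helper, not a library function.

    #include <math.h>

    /* Read a U8 pixel with the coordinate clamped into the image. */
    static float clamp_read(const unsigned char *img, int w, int h, int x, int y)
    {
        x = x < 0 ? 0 : (x > w - 1 ? w - 1 : x); /* clamp-to-edge border */
        y = y < 0 ? 0 : (y > h - 1 ? h - 1 : y);
        return (float)img[y * w + x];
    }

    /* Sample the image bilinearly at the mapped coordinate (mx, my). */
    static float bilinear_sample(const unsigned char *img, int w, int h, float mx, float my)
    {
        const int   x0 = (int)floorf(mx);
        const int   y0 = (int)floorf(my);
        const float a  = mx - (float)x0; /* horizontal weight */
        const float a1 = my - (float)y0; /* vertical weight   */
        const float b  = 1.0f - a;
        const float b1 = 1.0f - a1;
        return clamp_read(img, w, h, x0, y0) * b * b1
             + clamp_read(img, w, h, x0 + 1, y0) * a * b1
             + clamp_read(img, w, h, x0, y0 + 1) * b * a1
             + clamp_read(img, w, h, x0 + 1, y0 + 1) * a * a1;
    }
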
diff --git a/src/core/CL/cl_kernels/reorg_layer.cl b/src/core/CL/cl_kernels/reorg_layer.cl
deleted file mode 100644
index 29344de37a..0000000000
--- a/src/core/CL/cl_kernels/reorg_layer.cl
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
-
-#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi) \
- ({ \
- int offset = zo / (int)SRC_DEPTH; \
- xi = xo * (int)STRIDE + offset % (int)STRIDE; \
- yi = yo * (int)STRIDE + offset / (int)STRIDE; \
- zi = zo % SRC_DEPTH; \
- })
-
-/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NCHW
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
- * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reorg_layer_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- int xo = get_global_id(0);
- int yo = get_global_id(1);
- int zo = get_global_id(2);
- int xi, yi, zi;
-
- CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
-
- int src_offset = xi * sizeof(DATA_TYPE) + yi * src_stride_y + zi * src_stride_z;
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
-}
-
-/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
- * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reorg_layer_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- int xo = get_global_id(1);
- int yo = get_global_id(2);
- int zo = get_global_id(0);
- int xi, yi, zi;
-
- CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
-
- int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
-}
-#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
\ No newline at end of file
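
The CALCULATE_SRC_COORDINATES macro above carries the whole algorithm: each block of SRC_DEPTH output channels selects one cell of a STRIDE x STRIDE grid in the source plane. A direct C transcription of the macro, for reference:

    /* Map an output element (xo, yo, zo) to its source element (xi, yi, zi). */
    static void reorg_src_coords(int xo, int yo, int zo,
                                 int src_depth, int stride,
                                 int *xi, int *yi, int *zi)
    {
        const int offset = zo / src_depth;   /* which cell of the stride x stride grid */
        *xi = xo * stride + offset % stride; /* spread the cell across x ... */
        *yi = yo * stride + offset / stride; /* ... and y                    */
        *zi = zo % src_depth;                /* original channel             */
    }
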
diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h
index bed94a7b3b..cb2f4b0319 100644
--- a/src/core/CL/cl_kernels/repeat.h
+++ b/src/core/CL/cl_kernels/repeat.h
@@ -75,7 +75,9 @@
P_X##_DEF(F, P_A, P_B, P_C); \
REPEAT_3_15(P_X, P_A, P_B, P_C)
-#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \
+ REPEAT_3_##P_NUM(P_OP, P_A, P_B, \
+ P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
// Repeat macros with 4 param, excluding the implicit ID param
@@ -126,52 +128,59 @@
P_X##_DEF(F, P_A, P_B, P_C, P_D); \
REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)
-#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \
+ REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, \
+ P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)
// Macro for initializing N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
// Macro for initializing N variables by converting the data type. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
+#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)
// Macro for initializing N variables by converting the data type with saturation. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
-#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
+#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \
+ REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
// Macro for adding a constant to N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
+#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)
// Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables (VAR_A). Generates N statements that define VAR_A##N = RHS_ACCESSOR_DEF(...)
-#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
+#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)
// Macro for adding a vector to N-variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
-#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
+#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
// Macro for adding two N-variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
-#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
+#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
// Macro for performing Max between a constant and N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
+#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)
// Macro for performing Min between a constant and N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
+#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
// Macro for performing per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables.
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
@@ -182,6 +191,7 @@
VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \
})
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
#endif // ARM_COMPUTE_REPEAT_H
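
The rewrapped macros above change only line breaks; the expansions are identical. For readers new to this header, REPEAT_3_N(N, OP, A, B, C) pastes OP##_DEF once per ID from 0 to N-1, so, as a sketch of the intended expansion:

    /* REPEAT_VAR_INIT_TO_CONST(3, int, acc, 0) expands, one statement per ID, to: */
    int acc0 = 0;
    int acc1 = 0;
    int acc2 = 0;
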
diff --git a/src/core/CL/cl_kernels/reshape_layer.cl b/src/core/CL/cl_kernels/reshape_layer.cl
deleted file mode 100644
index 2d6a7edade..0000000000
--- a/src/core/CL/cl_kernels/reshape_layer.cl
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Perform tensor reshape
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] input_shape Input spatial shape
- * @param[in] output_shape Output spatial shape
- */
-__kernel void reshape_layer(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- int2 input_shape,
- int2 output_shape)
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
-
- int3 id = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
-
- // Linearize index
- int linear_idx = id.x + id.y * input_shape.x + id.z * input_shape.x * input_shape.y;
-
- // Translate to output
- int3 out_id;
- out_id.x = linear_idx % output_shape.x;
- out_id.y = (linear_idx / output_shape.x) % output_shape.y;
- out_id.z = linear_idx / (output_shape.x * output_shape.y);
-
- // Store result
- *((__global DATA_TYPE *)tensor3D_offset(&out, out_id.x, out_id.y, out_id.z)) = *((__global DATA_TYPE *)in.ptr);
-}
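
The deleted kernel's index arithmetic generalises to any pair of shapes with equal volume: linearise the input coordinate against the input shape, then re-split the linear index against the output shape. A reference C sketch, with illustrative parameter names:

    /* Translate an input coordinate (x, y, z) to its output coordinate. */
    static void reshape_coords(int x, int y, int z,
                               int in_w, int in_h,   /* input_shape  */
                               int out_w, int out_h, /* output_shape */
                               int *ox, int *oy, int *oz)
    {
        const int linear = x + y * in_w + z * in_w * in_h;
        *ox = linear % out_w;
        *oy = (linear / out_w) % out_h;
        *oz = linear / (out_w * out_h);
    }
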
diff --git a/src/core/CL/cl_kernels/reverse.cl b/src/core/CL/cl_kernels/reverse.cl
deleted file mode 100644
index 10ffe84aeb..0000000000
--- a/src/core/CL/cl_kernels/reverse.cl
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
-
-#if NUM_REVERSE_DIMS > 4
-#error("Reversing more than 4 dimensions is not currently supported")
-#endif /* NUM_REVERSE_DIMS > 4 */
-
-/** Performs reverse along the specified axis.
- *
- * @note The data type must be given as a preprocessor argument using -DDATA_TYPE=num. e.g. -DDATA_TYPE=uint
- * @note The number of dimensions to reverse must be given as a preprocessor argument using -DNUM_REVERSE_DIMS=num, e.g. -DNUM_REVERSE_DIMS=3
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] axis_ptr Pointer to the source vector. Supported data types: U32
- * @param[in] axis_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] axis_step_x axis_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] axis_offset_first_element_in_bytes The offset of the first element in the axis vector
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] width Width of the source tensor
- * @param[in] height Height of the source tensor
- * @param[in] depth Depth of the source tensor
- * @param[in] batches Number of batches in the source tensor
- */
-__kernel void reverse(TENSOR4D_DECLARATION(src),
- VECTOR_DECLARATION(axis),
- TENSOR4D_DECLARATION(dst),
- const uint width,
- const uint height,
- const uint depth,
- const uint batches)
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, depth);
- Vector axis = CONVERT_TO_VECTOR_STRUCT_NO_STEP(axis);
- Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, depth);
-
- const uint x_in = get_global_id(0);
- const uint y_in = get_global_id(1);
- const uint z_in = get_global_id(2) % depth;
- const uint w_in = get_global_id(2) / depth;
-
- const uint4 dims = (uint4)(0, 1, 2, 3);
- int4 to_reverse = (int4)(0, 0, 0, 0);
-#if NUM_REVERSE_DIMS == 1
- const uint index = *((__global uint *)axis.ptr);
- to_reverse = (uint4)index == dims;
-#elif NUM_REVERSE_DIMS == 2
- const uint2 indices = vload2(0, (__global uint *)axis.ptr);
- to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims);
-#elif NUM_REVERSE_DIMS == 3
- const uint2 indices01 = vload2(0, (__global uint *)axis.ptr);
- const uint index2 = *((__global uint *)axis.ptr + 2);
- to_reverse = ((uint4)indices01.s0 == dims) || ((uint4)indices01.s1 == dims) || ((uint4)index2 == dims);
-#else /* NUM_REVERSE_DIMS == 3 */
- const uint4 indices = vload4(0, (__global uint *)axis.ptr);
- to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims) || ((uint4)indices.s3 == dims);
-#endif /* NUM_REVERSE_DIMS == 1 */
- const uint x_out = to_reverse.s0 ? width - x_in - 1 : x_in;
- const uint y_out = to_reverse.s1 ? height - y_in - 1 : y_in;
- const uint z_out = to_reverse.s2 ? depth - z_in - 1 : z_in;
- const uint w_out = to_reverse.s3 ? batches - w_in - 1 : w_in;
-
- *((__global DATA_TYPE *)tensor4D_offset(&dst, x_out, y_out, z_out, w_out)) = *((__global DATA_TYPE *)src.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
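
The per-axis arithmetic above reduces to flipping a coordinate on every axis named in the axis vector and passing the rest through. A minimal host-side C sketch of that mapping, with illustrative sizes (reverse_coord is a hypothetical helper, not part of the deleted kernel):

    #include <stdio.h>

    /* An output coordinate is (size - 1 - in) on reversed axes, unchanged otherwise. */
    static unsigned reverse_coord(unsigned in, unsigned size, int reversed)
    {
        return reversed ? size - 1u - in : in;
    }

    int main(void)
    {
        const unsigned width = 4, height = 3;
        /* Reverse axis 0 (x) only, as with NUM_REVERSE_DIMS == 1 and axis[0] == 0 */
        for (unsigned y = 0; y < height; ++y)
            for (unsigned x = 0; x < width; ++x)
                printf("(%u,%u) -> (%u,%u)\n", x, y,
                       reverse_coord(x, width, 1), reverse_coord(y, height, 0));
        return 0;
    }
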
diff --git a/src/core/CL/cl_kernels/roi_align_layer.cl b/src/core/CL/cl_kernels/roi_align_layer.cl
deleted file mode 100644
index e0b98e68c9..0000000000
--- a/src/core/CL/cl_kernels/roi_align_layer.cl
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-// This specifies the value by which to shift the result of roi_dims / pooled_dims before taking the ceiling.
-// It is close to machine epsilon (for a floating-point system, x and x + EPS are the same number).
-#define EPS_GRID 0.00001f
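
To see why EPS_GRID is subtracted before the ceiling is taken: a bin size that should be exactly 1 can land at 1.000001 after floating-point division, and a plain ceil() would then double the sampling grid. A minimal C sketch with illustrative values:

    #include <math.h>
    #include <stdio.h>

    #define EPS_GRID 0.00001f

    int main(void)
    {
        const float bin_size = 7.0f / 6.999993f;      /* ~1.000001, ideally exactly 1 */
        printf("plain ceil:   %g\n", ceilf(bin_size));            /* 2 */
        printf("guarded ceil: %g\n", ceilf(bin_size - EPS_GRID)); /* 1 */
        return 0;
    }
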
-
-#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) // Check for compile time constants
-
-/** Performs a roi align on a single output pixel.
- *
- * @param[in] input Pointer to input Tensor3D struct.
- * @param[in] region_start_x Start x index projected onto the input tensor.
- * @param[in] bin_size_x Size of each pooling bin along x, projected onto the input tensor.
- * @param[in] grid_size_x Number of sampling points per bin along x.
- * @param[in] region_end_x End x index projected onto the input tensor.
- * @param[in] region_start_y Start y index projected onto the input tensor.
- * @param[in] bin_size_y Size of each pooling bin along y, projected onto the input tensor.
- * @param[in] grid_size_y Number of sampling points per bin along y.
- * @param[in] region_end_y End y index projected onto the input tensor.
- * @param[in] pz z index of the input tensor.
- *
- * @return An average pooled value from the region specified in the input tensor.
- */
-inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
- float bin_size_x,
- float grid_size_x,
- float region_end_x,
- float region_start_y,
- float bin_size_y,
- float grid_size_y,
- float region_end_y,
- int pz)
-{
- // Iterate through the pooling region
- float sum = 0;
- for(int iy = 0; iy < grid_size_y; ++iy)
- {
- for(int ix = 0; ix < grid_size_x; ++ix)
- {
- // Align the window in the middle of every bin
- const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
- const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
-
- // Interpolation in the unit square
- const int y_low = (int)y;
- const int x_low = (int)x;
- const int y_high = y_low + 1;
- const int x_high = x_low + 1;
-
- const float ly = y - y_low;
- const float lx = x - x_low;
- const float hy = 1.f - ly;
- const float hx = 1.f - lx;
-
- const float w1 = hy * hx;
- const float w2 = hy * lx;
- const float w3 = ly * hx;
- const float w4 = ly * lx;
-#if defined(NHWC)
- const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
- const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
- const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
- const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
-#else // !defined(NHWC)
- const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
- const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
- const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
- const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
-#endif // defined(NHWC)
- sum += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- }
-
- return (DATA_TYPE)(sum / (grid_size_x * grid_size_y));
-}
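
The four weights above are the standard bilinear-interpolation weights in the unit square: they always sum to 1 and weight each neighbouring pixel by its proximity to (x, y). A minimal C sketch with an illustrative sample point:

    #include <stdio.h>

    int main(void)
    {
        const float x = 2.25f, y = 3.75f;      /* illustrative sample point */
        const float lx = x - (int)x, ly = y - (int)y;
        const float hx = 1.f - lx, hy = 1.f - ly;
        /* Weights for (x_low,y_low), (x_high,y_low), (x_low,y_high), (x_high,y_high) */
        printf("w1=%g w2=%g w3=%g w4=%g sum=%g\n",
               hy * hx, hy * lx, ly * hx, ly * lx,
               hy * hx + hy * lx + ly * hx + ly * lx); /* sum is always 1 */
        return 0;
    }
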
-
-/** Performs a roi align function.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
- * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
- * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
- * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
- * @note Spatial scale must be passed using -DSPATIAL_SCALE;
- * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined, each ROI
- * will have a default sampling ratio of roi_dims/pooling_dims
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16, F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the pooled region of the source tensor as specified by the ROI
- * @param[in] rois_ptr Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr
- * @param[in] rois_stride_x Stride of the ROIs tensor in X dimension (in bytes)
- * @param[in] rois_step_x Step of the ROIs tensor in X dimension (in bytes)
- * @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes)
- * @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes)
- * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void roi_align_layer(
- TENSOR3D_DECLARATION(input),
- IMAGE_DECLARATION(rois),
- TENSOR3D_DECLARATION(output),
- unsigned int input_stride_w, unsigned int output_stride_w)
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- Image rois = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
-
-#if defined(NHWC)
- const int px = get_global_id(1);
- const int py = get_global_id(2);
- const int pw = get_global_id(0);
-#else // !defined(NHWC)
- const int px = get_global_id(0);
- const int py = get_global_id(1);
- const int pw = get_global_id(2);
-#endif // defined(NHWC)
-
- // Load roi parameters
- // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
- const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
- const VEC_DATA_TYPE(DATA_TYPE, 4)
- roi = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
- const float2 roi_anchor = convert_float2(roi.s01) * convert_float(SPATIAL_SCALE);
- const float2 roi_dims = fmax(convert_float2(roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
-
- // Calculate pooled region start and end
- const float2 spatial_indx = (float2)(px, py);
- const float2 pooled_dims = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
- const float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
-
- const float2 bin_size = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
- float2 region_start = spatial_indx * bin_size + roi_anchor;
- float2 region_end = (spatial_indx + 1) * bin_size + roi_anchor;
-
- region_start = clamp(region_start, 0, max_spatial_dims);
- region_end = clamp(region_end, 0, max_spatial_dims);
-
-#if defined(SAMPLING_RATIO)
- const float2 roi_bin_grid = SAMPLING_RATIO;
-#else // !defined(SAMPLING_RATIO)
- // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
- const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
-#endif // defined(SAMPLING_RATIO)
-
- // Move input and output pointer across the fourth dimension
- input.ptr += roi_batch * input_stride_w;
- output.ptr += pw * output_stride_w;
- for(int pz = 0; pz < MAX_DIM_Z; ++pz)
- {
-#if defined(NHWC)
- __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
-#else // !defined(NHWC)
- __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
-#endif // defined(NHWC)
- *_output_ptr = (DATA_TYPE)roi_align_1x1(&input,
- region_start.x,
- bin_size.x,
- roi_bin_grid.x,
- region_end.x,
- region_start.y,
- bin_size.y,
- roi_bin_grid.y,
- region_end.y, pz);
- }
-}
-#endif // Check for compile time constants
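
All the compile-time constants the guard above checks for were injected as OpenCL build options. A hedged host-side sketch of how such a build might look (the option values and the helper name are hypothetical, not taken from the library's kernel-configuration code):

    #include <CL/cl.h>

    /* Assumes 'program' was created from this kernel's source and 'device' is valid. */
    static cl_int build_roi_align(cl_program program, cl_device_id device)
    {
        const char *opts =
            "-DDATA_TYPE=float -DPOOLED_DIM_X=7 -DPOOLED_DIM_Y=7 "
            "-DMAX_DIM_X=64 -DMAX_DIM_Y=64 -DMAX_DIM_Z=256 "
            "-DSPATIAL_SCALE=0.0625f -DNHWC";
        return clBuildProgram(program, 1, &device, opts, NULL, NULL);
    }
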
diff --git a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl b/src/core/CL/cl_kernels/roi_align_layer_quantized.cl
deleted file mode 100644
index d5c9a0d9bf..0000000000
--- a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-// This specifies the value by which to shift the result of roi_dims / pooled_dims before taking the ceiling.
-// It is close to machine epsilon (for a floating-point system, x and x + EPS are the same number).
-#define EPS_GRID 0.00001f
-
-#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) && defined(OFFSET_IN) && defined(OFFSET_OUT) && defined(SCALE_IN) && defined(SCALE_OUT) && defined(OFFSET_ROIS) && defined(SCALE_ROIS) // Check for compile time constants
-
-/** Performs a roi align on a single output pixel.
- *
- * @param[in] input Pointer to input Tensor3D struct.
- * @param[in] region_start_x Start x index projected onto the input tensor.
- * @param[in] bin_size_x Size of each pooling bin along x, projected onto the input tensor.
- * @param[in] grid_size_x Number of sampling points per bin along x.
- * @param[in] region_end_x End x index projected onto the input tensor.
- * @param[in] region_start_y Start y index projected onto the input tensor.
- * @param[in] bin_size_y Size of each pooling bin along y, projected onto the input tensor.
- * @param[in] grid_size_y Number of sampling points per bin along y.
- * @param[in] region_end_y End y index projected onto the input tensor.
- * @param[in] pz z index of the input tensor.
- *
- * @return An average pooled value from the region specified in the input tensor.
- */
-inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
- float bin_size_x,
- float grid_size_x,
- float region_end_x,
- float region_start_y,
- float bin_size_y,
- float grid_size_y,
- float region_end_y,
- int pz)
-{
- // Iterate through the pooling region
- float sum = 0;
- for(int iy = 0; iy < grid_size_y; ++iy)
- {
- for(int ix = 0; ix < grid_size_x; ++ix)
- {
- // Align the window in the middle of every bin
- const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
- const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
-
- // Interpolation in the unit square
- const int y_low = (int)y;
- const int x_low = (int)x;
- const int y_high = y_low + 1;
- const int x_high = x_low + 1;
-
- const float ly = y - y_low;
- const float lx = x - x_low;
- const float hy = 1.f - ly;
- const float hx = 1.f - lx;
-
- const float w1 = hy * hx;
- const float w2 = hy * lx;
- const float w3 = ly * hx;
- const float w4 = ly * lx;
-#if defined(NHWC)
- const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
- const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
- const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
- const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
-#else // !defined(NHWC)
- const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
- const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
- const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
- const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
-#endif // defined(NHWC)
-
- const float data1_f32 = DEQUANTIZE(data1, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
- const float data2_f32 = DEQUANTIZE(data2, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
- const float data3_f32 = DEQUANTIZE(data3, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
- const float data4_f32 = DEQUANTIZE(data4, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
- sum += w1 * data1_f32 + w2 * data2_f32 + w3 * data3_f32 + w4 * data4_f32;
- }
- }
-
- const float res_f32 = sum / (grid_size_x * grid_size_y);
- return QUANTIZE(res_f32, OFFSET_OUT, SCALE_OUT, DATA_TYPE, 1);
-}
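
DEQUANTIZE and QUANTIZE implement the usual asymmetric affine map: real = scale * (q - offset) on the way in, and q = round(real / scale) + offset, saturated, on the way out. A minimal C sketch of that arithmetic (the helper names are illustrative, not the library's macros):

    #include <math.h>
    #include <stdio.h>

    static float dequantize_u8(unsigned char q, int offset, float scale)
    {
        return scale * (float)((int)q - offset);
    }

    static unsigned char quantize_u8(float x, int offset, float scale)
    {
        int q = (int)lroundf(x / scale) + offset;
        if (q < 0)   q = 0;     /* saturate to the U8 range */
        if (q > 255) q = 255;
        return (unsigned char)q;
    }

    int main(void)
    {
        const float scale = 0.05f; const int offset = 10;
        const float real = dequantize_u8(74, offset, scale);  /* 0.05 * (74 - 10) = 3.2 */
        printf("dequantized=%g requantized=%u\n", real, quantize_u8(real, offset, scale));
        return 0;
    }
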
-
-/** Performs a roi align function.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=uchar
- * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
- * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
- * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
- * @note Spatial scale must be passed using -DSPATIAL_SCALE;
- * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined, each ROI
- * will have a default sampling ratio of roi_dims/pooling_dims
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the pooled region of the source tensor as specified by the ROI
- * @param[in] rois_ptr Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }.
- * Supported data types: QASYMM16 with 0.125f scale and 0 offset
- * @param[in] rois_stride_x Stride of the ROIs tensor in X dimension (in bytes)
- * @param[in] rois_step_x Step of the ROIs tensor in X dimension (in bytes)
- * @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes)
- * @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes)
- * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void roi_align_layer_quantized(
- TENSOR3D_DECLARATION(input),
- IMAGE_DECLARATION(rois),
- TENSOR3D_DECLARATION(output),
- unsigned int input_stride_w, unsigned int output_stride_w)
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- Image rois = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
-
-#if defined(NHWC)
- const int px = get_global_id(1);
- const int py = get_global_id(2);
- const int pw = get_global_id(0);
-#else // !defined(NHWC)
- const int px = get_global_id(0);
- const int py = get_global_id(1);
- const int pw = get_global_id(2);
-#endif // defined(NHWC)
-
- // Load roi parameters
- // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
- const ushort roi_batch = *((__global ushort *)offset(&rois, 0, pw));
- float4 roi = DEQUANTIZE(vload4(0, (__global ushort *)offset(&rois, 1, pw)), OFFSET_ROIS, SCALE_ROIS, ushort, 4);
- float2 roi_anchor = roi.s01 * convert_float(SPATIAL_SCALE);
- float2 roi_dims = fmax((roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
-
- // Calculate pooled region start and end
- float2 spatial_indx = (float2)(px, py);
- float2 pooled_dims = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
- float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
-
- float2 bin_size = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
- float2 region_start = spatial_indx * bin_size + roi_anchor;
- float2 region_end = (spatial_indx + 1) * bin_size + roi_anchor;
-
- region_start = clamp(region_start, 0, max_spatial_dims);
- region_end = clamp(region_end, 0, max_spatial_dims);
-
-#if defined(SAMPLING_RATIO)
- float2 roi_bin_grid = SAMPLING_RATIO;
-#else // !defined(SAMPLING_RATIO)
- // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
- float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
-#endif // defined(SAMPLING_RATIO)
-
- // Move input and output pointer across the fourth dimension
- input.ptr += roi_batch * input_stride_w;
- output.ptr += pw * output_stride_w;
- for(int pz = 0; pz < MAX_DIM_Z; ++pz)
- {
-#if defined(NHWC)
- __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
-#else // !defined(NHWC)
- __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
-#endif // defined(NHWC)
- *_output_ptr = (DATA_TYPE)roi_align_1x1(&input,
- region_start.x,
- bin_size.x,
- roi_bin_grid.x,
- region_end.x,
- region_start.y,
- bin_size.y,
- roi_bin_grid.y,
- region_end.y, pz);
- }
-}
-#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
deleted file mode 100644
index d4c27e6cf6..0000000000
--- a/src/core/CL/cl_kernels/scale.cl
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_nearest(const float2 coord, const float2 scale)
-{
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
- const float4 new_x = in_x_coords * (float4)(scale.s0);
- const float4 new_y = (float4)(coord.s1 * scale.s1);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
- const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
- const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
-
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_bilinear(const float2 coord, const float2 scale)
-{
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float4 new_x = in_x_coords * (float4)(scale.s0);
- const float4 new_y = (float4)(coord.s1 * scale.s1);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
- const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
- const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
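
The two sampling policies differ only in where a destination pixel samples the source: TOP_LEFT maps output index i to i * scale, while CENTER keeps pixel centres aligned via (i + 0.5) * scale - 0.5. A minimal C sketch comparing the two with an illustrative scale factor:

    #include <stdio.h>

    int main(void)
    {
        const float scale = 2.0f; /* e.g. downscaling by half */
        for (int out = 0; out < 3; ++out)
        {
            const float top_left = out * scale;
            const float center   = (out + 0.5f) * scale - 0.5f;
            printf("out=%d top_left=%.2f center=%.2f\n", out, top_left, center);
        }
        return 0;
    }
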
-
-/** Performs an affine transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8 or S16.
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_nearest_neighbour_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(scale_x, scale_y);
- float8 transformed = transform_nearest(get_current_coords(), r);
-#ifdef ALIGN_CORNERS
- transformed = round(transformed);
-#endif // ALIGN_CORNERS
- const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE);
- vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
-}
-
-/** Performs an affine transformation on an image interpolating with the BILINEAR method.
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_bilinear_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(scale_x, scale_y);
- const float8 tc = transform_bilinear(get_current_coords(), r);
- vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
-}
-
-#if defined(DEPTH_OUT)
-/** Performs scale on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel tensors of the supported data types (U8/S16/F16/F32). (NHWC)
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size, e.g. -DDEPTH_OUT=16
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes)
- * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
- * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_nearest_neighbour_nhwc(
- TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
-
-#ifdef SAMPLING_POLICY_TOP_LEFT
- float new_x = get_global_id(1) * scale_x;
- float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
-#elif SAMPLING_POLICY_CENTER
- float new_x = (get_global_id(1) + 0.5f) * scale_x;
- float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-#ifdef ALIGN_CORNERS
- new_x = round(new_x);
- new_y = round(new_y);
-#endif /* ALIGN_CORNERS */
- const float clamped_x = clamp(new_x, 0.0f, input_width - 1);
- const float clamped_y = clamp(new_y, 0.0f, input_height - 1);
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT)));
-}
-
-/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
- * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size, e.g. -DDEPTH_OUT=16
- * @note The value to be used at the edges of the image should be given as a preprocessor argument using -DCONSTANT_VALUE=value.
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes)
- * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
- * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- *
- */
-__kernel void scale_bilinear_nhwc(
- TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
-
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float new_x = get_global_id(1) * scale_x;
- const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
-#elif SAMPLING_POLICY_CENTER
- const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
- const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-
- const float new_xf = floor(new_x);
- const float new_yf = floor(new_y);
- const float clamped_x = clamp(new_xf, 0.0f, input_width - 1);
- const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1);
- const float clamped_y = clamp(new_yf, 0.0f, input_height - 1);
- const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
-
-#ifndef BORDER_MODE_REPLICATE
- const bool check_x = (0.f <= new_xf && new_xf < input_width);
- const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
- const bool check_y = (0.f <= new_yf && new_yf < input_height);
- const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
- const float ins_0 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x && check_y);
- const float ins_1 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y);
- const float ins_2 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x && check_y1);
- const float ins_3 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y1);
- float4 ins = (float4)(ins_0, ins_1, ins_2, ins_3);
-#else /* BORDER_MODE_REPLICATE */
- float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
-#endif /* BORDER_MODE_REPLICATE */
-
- const float a = new_x - new_xf;
- const float b = 1.f - a;
- const float a1 = new_y - new_yf;
- const float b1 = 1.f - a1;
- const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1));
-
- *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
-}
-#endif /* defined(DEPTH_OUT) */ \ No newline at end of file
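
In the constant-border path above, coordinates are clamped for the memory access while the in-bounds tests use the unclamped values, so out-of-image taps read CONSTANT_VALUE instead of a replicated edge pixel. A minimal single-tap C sketch of that selection (the helper name and 2x2 image are illustrative):

    #include <stdio.h>

    /* One tap of the constant-border bilinear read: the coordinate is clamped
     * for the memory access, but the in-bounds test uses the unclamped value. */
    static float read_tap(const float *img, int width, int height,
                          float xf, float yf, float constant_value)
    {
        const int cx = xf < 0 ? 0 : (xf > width - 1 ? width - 1 : (int)xf);
        const int cy = yf < 0 ? 0 : (yf > height - 1 ? height - 1 : (int)yf);
        const int in_bounds = (0.f <= xf && xf < width) && (0.f <= yf && yf < height);
        return in_bounds ? img[cy * width + cx] : constant_value;
    }

    int main(void)
    {
        const float img[4] = { 1.f, 2.f, 3.f, 4.f }; /* 2x2 image */
        printf("inside=%g outside=%g\n",
               read_tap(img, 2, 2, 1.f, 0.f, -1.f),   /* reads pixel (1,0) = 2 */
               read_tap(img, 2, 2, -1.f, 0.f, -1.f)); /* out of bounds -> -1 */
        return 0;
    }
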
diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/scale_quantized.cl
deleted file mode 100644
index 010e4ed57a..0000000000
--- a/src/core/CL/cl_kernels/scale_quantized.cl
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-#include "warp_helpers_quantized.h"
-
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
-{
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float4 new_x = in_x_coords * (float4)(scale.s0);
- const float4 new_y = (float4)(coord.s1 * scale.s1);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
- const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
- const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
-
-/** Performs an affine transformation on an image interpolating with the BILINEAR method.
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Scale value for QASYMM8 data type to be used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
- * @note Offset value for QASYMM8 data type to be used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_bilinear_quantized_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(scale_x, scale_y);
- const float8 tc = transform_bilinear_quantized(get_current_coords_quantized(), r);
- vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
-}
-
-#if defined(DEPTH_OUT)
-/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Scale value for QASYMM8 data type to be used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
- * @note Offset value for QASYMM8 data type to be used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
- * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
- * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size, e.g. -DDEPTH_OUT=16
- * @note The value to be used at the edges of the image should be given as a preprocessor argument using -DCONSTANT_VALUE=value.
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes)
- * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
- * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_bilinear_quantized_nhwc(
- TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
-
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float new_x = get_global_id(1) * scale_x;
- const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
-#elif SAMPLING_POLICY_CENTER
- const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
- const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-
- const float new_xf = floor(new_x);
- const float new_yf = floor(new_y);
- const float clamped_x = clamp(new_xf, 0.0f, input_width - 1);
- const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1);
- const float clamped_y = clamp(new_yf, 0.0f, input_height - 1);
- const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
-
-#ifndef BORDER_MODE_REPLICATE
- const bool check_x = (0.f <= new_xf && new_xf < input_width);
- const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
- const bool check_y = (0.f <= new_yf && new_yf < input_height);
- const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
- const int ins_0 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x && check_y);
- const int ins_1 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y);
- const int ins_2 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x && check_y1);
- const int ins_3 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y1);
- int4 ins = (int4)(ins_0, ins_1, ins_2, ins_3);
-#else /* BORDER_MODE_REPLICATE */
- int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
-#endif /* BORDER_MODE_REPLICATE */
-
- const float a = new_x - new_xf;
- const float b = 1.f - a;
- const float a1 = new_y - new_yf;
- const float b1 = 1.f - a1;
- const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE;
-
- const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1));
-
- DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE);
-
- *((__global DATA_TYPE *)out.ptr) = res;
-}
-#endif /* defined(DEPTH_OUT) */ \ No newline at end of file
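
The final store requantizes the interpolated value with convert_int_sat_rtp, i.e. a saturating round toward positive infinity. A minimal C sketch of the equivalent scalar arithmetic (the SCALE and OFFSET values are illustrative):

    #include <math.h>
    #include <stdio.h>

    /* Round toward positive infinity with U8 saturation, mirroring
     * convert_int_sat_rtp + CONVERT_SAT for an illustrative SCALE/OFFSET. */
    static unsigned char requantize_rtp(float fr, float scale, int offset)
    {
        int q = (int)ceilf(fr / scale) + offset;
        if (q < 0)   q = 0;
        if (q > 255) q = 255;
        return (unsigned char)q;
    }

    int main(void)
    {
        printf("%u\n", requantize_rtp(3.21f, 0.05f, 10)); /* ceil(64.2) = 65, +10 -> 75 */
        return 0;
    }
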
diff --git a/src/core/CL/cl_kernels/slice_ops.cl b/src/core/CL/cl_kernels/slice_ops.cl
deleted file mode 100644
index dc3ffd91c1..0000000000
--- a/src/core/CL/cl_kernels/slice_ops.cl
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Perform a strided slice operation on a given input.
- *
- * @attention Supported tensor rank: up to 4
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input and output tensor depths should be given as preprocessor arguments using -DSRC_DEPTH=size and -DDST_DEPTH=size
- * @attention Absolute start coordinates for each dimension should be given as preprocessor arguments -DSTART_index=value e.g. -DSTART_0=2
- * @attention Strides for each dimension should be given as preprocessor arguments -DSTRIDE_index=value e.g. -DSTRIDE_1=1
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void strided_slice(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
- Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
-
- int offset = 0;
-
- // Offset X
-#if defined(SHRINK_0)
- input.ptr += (int)START_0 * input_stride_x;
-#elif defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
- // Check if the access along the width would go out of bounds
- // If it does, shift the access vector so that the elements accessed stay within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- offset = (int)START_0 + min(xi, (int)LAST_ACCESSED_X);
- input.ptr += offset * input_stride_x;
- output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
-#elif defined(START_0) && defined(STRIDE_0)
- offset = (int)START_0 + (int)get_global_id(0) * (int)STRIDE_0;
- input.ptr += offset * input_stride_x;
-#endif // defined(START_0) && defined(STRIDE_0)
-
- // Offset Y
-#if defined(SHRINK_1)
- input.ptr += (int)START_1 * input_stride_y;
-#elif defined(START_1) && defined(STRIDE_1)
-#if defined(SHRINK_0)
- offset = (int)START_1 + (int)get_global_id(0) * (int)STRIDE_1;
-#else // defined(SHRINK_0)
- offset = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1;
-#endif // defined(SHRINK_0)
- input.ptr += offset * input_stride_y;
-#endif // defined(START_1) && defined(STRIDE_1)
-
- // Offset Z
-#if defined(SHRINK_2)
- input.ptr += (int)START_2 * input_stride_z;
-#elif defined(START_2) && defined(STRIDE_2)
-
-#if defined(SHRINK_1) && defined(SHRINK_0)
- offset = (int)START_2 + (int)get_global_id(0) * (int)STRIDE_2;
-#elif defined(SHRINK_1) || defined(SHRINK_0)
- offset = (int)START_2 + (int)get_global_id(1) * (int)STRIDE_2;
-#else // defined(SHRINK_1) && defined(SHRINK_0)
- offset = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2;
-#endif // defined(SHRINK_1) && defined(SHRINK_0)
-
- input.ptr += offset * input_stride_z;
-#endif // defined(START_2) && defined(STRIDE_2)
-
- // Offset W (batch)
-#if defined(SHRINK_3)
- input.ptr += (int)START_3 * input_stride_w;
-#elif defined(START_3) && defined(STRIDE_3)
-#if defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
- offset = (int)START_3 + (int)get_global_id(0) * (int)STRIDE_3;
-#elif !defined(SHRINK_2) && !defined(SHRINK_1) && !defined(SHRINK_0)
- offset = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3;
-#elif(defined(SHRINK_0) && defined(SHRINK_1)) || (defined(SHRINK_1) && defined(SHRINK_2)) || (defined(SHRINK_0) && defined(SHRINK_2))
- offset = (int)START_3 + (int)get_global_id(1) * (int)STRIDE_3;
-#else // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
- offset = (int)START_3 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_3;
-#endif // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
- input.ptr += offset * input_stride_w;
-#endif // defined(START_3) && defined(STRIDE_3)
-
- // Store result
-#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input.ptr));
-
- VSTORE(VEC_SIZE)
- (val, 0, (__global DATA_TYPE *)(output.ptr));
-#else // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
- *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
-#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
-}
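For reference, the per-dimension indexing the strided_slice kernel above implements reduces to start + coordinate * stride for every non-shrunk dimension, with shrunk dimensions pinned at their start index; on X, the LAST_ACCESSED_X guard additionally clamps the final vector load and compensates the output pointer so the last (possibly overlapping) vector stays in bounds. A minimal host-side C sketch of the core offset arithmetic, assuming a 4D tensor described by byte strides (the helper name is hypothetical, not library API):

    #include <stddef.h>

    /* Sketch only: maps one output coordinate to its source byte offset,
     * mirroring the start + coord * stride pattern of the kernel above.
     * The vector clamping on X is omitted for clarity. */
    static size_t strided_slice_src_offset(const int start[4], const int stride[4],
                                           const int shrink[4], const int out_coord[4],
                                           const size_t src_stride_bytes[4])
    {
        size_t offset = 0;
        for (int d = 0; d < 4; ++d)
        {
            /* Shrunk dimensions contribute a fixed start index. */
            const int src_coord = shrink[d] ? start[d] : start[d] + out_coord[d] * stride[d];
            offset += (size_t)src_coord * src_stride_bytes[d];
        }
        return offset;
    }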
diff --git a/src/core/CL/cl_kernels/sobel_filter.cl b/src/core/CL/cl_kernels/sobel_filter.cl
deleted file mode 100644
index 7983734fc4..0000000000
--- a/src/core/CL/cl_kernels/sobel_filter.cl
+++ /dev/null
@@ -1,541 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/***********************************************/
-/* Begin implementation of Sobel3x3 filter */
-/***********************************************/
-
-/** This OpenCL kernel computes a Sobel3x3 filter.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void sobel3x3(
- IMAGE_DECLARATION(src)
-#ifdef GRAD_X
- ,
- IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- ,
- IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
- // Output pixels
-#ifdef GRAD_X
- short8 gx = (short8)0;
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- short8 gy = (short8)0;
-#endif /* GRAD_Y */
-
- // Row0
- uchar16 temp = vload16(0, offset(&src, -1, -1));
- short8 left = convert_short8(temp.s01234567);
- short8 middle = convert_short8(temp.s12345678);
- short8 right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
- gx += left * (short8)(-1);
- gx += right * (short8)(+1);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- gy += left * (short8)(-1);
- gy += middle * (short8)(-2);
- gy += right * (short8)(-1);
-#endif /* GRAD_Y */
-
- // Row1
- temp = vload16(0, offset(&src, -1, 0));
- left = convert_short8(temp.s01234567);
- right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
- gx += left * (short8)(-2);
- gx += right * (short8)(+2);
-#endif /* GRAD_X */
-
- // Row2
- temp = vload16(0, offset(&src, -1, 1));
- left = convert_short8(temp.s01234567);
- middle = convert_short8(temp.s12345678);
- right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
- gx += left * (short8)(-1);
- gx += right * (short8)(+1);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- gy += left * (short8)(+1);
- gy += middle * (short8)(+2);
- gy += right * (short8)(+1);
-#endif /* GRAD_Y */
-
- // Store results
-#ifdef GRAD_X
- vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/**********************************************/
-/* End implementation of Sobel3x3 filter */
-/**********************************************/
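As a cross-check of the weights accumulated row by row in sobel3x3 above, here is a scalar C reference for one interior pixel; the image accessor is a local test array, not library code:

    /* Scalar reference of the 3x3 Sobel gradients (sketch; x and y must
     * be interior coordinates of the test image). */
    enum { TEST_W = 8, TEST_H = 8 };
    static unsigned char img[TEST_H][TEST_W]; /* U8 source, as in the kernel */

    static void sobel3x3_ref(int x, int y, short *gx, short *gy)
    {
        /* Gx = {-1,0,+1; -2,0,+2; -1,0,+1}, Gy = {-1,-2,-1; 0,0,0; +1,+2,+1},
         * matching the per-row accumulation in the kernel above. */
        *gx = (short)(-img[y - 1][x - 1] + img[y - 1][x + 1]
                      - 2 * img[y][x - 1] + 2 * img[y][x + 1]
                      - img[y + 1][x - 1] + img[y + 1][x + 1]);
        *gy = (short)(-img[y - 1][x - 1] - 2 * img[y - 1][x] - img[y - 1][x + 1]
                      + img[y + 1][x - 1] + 2 * img[y + 1][x] + img[y + 1][x + 1]);
    }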
-
-/***********************************************/
-/* Begin implementation of Sobel5x5 filter */
-/***********************************************/
-
-/** Compute a 1D horizontal Sobel filter 1x5 for 8 bytes, assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
- *
- * @param[in] src Pointer to source image.
- * @param[in] left1_coeff_gx  Weight of the leftmost pixel for gx
- * @param[in] left2_coeff_gx  Weight of the left pixel for gx
- * @param[in] middle_coeff_gx Weight of the middle pixel for gx
- * @param[in] right1_coeff_gx Weight of the right pixel for gx
- * @param[in] right2_coeff_gx Weight of the rightmost pixel for gx
- * @param[in] left1_coeff_gy  Weight of the leftmost pixel for gy
- * @param[in] left2_coeff_gy  Weight of the left pixel for gy
- * @param[in] middle_coeff_gy Weight of the middle pixel for gy
- * @param[in] right1_coeff_gy Weight of the right pixel for gy
- * @param[in] right2_coeff_gy Weight of the rightmost pixel for gy
- *
- * @return a short16 containing short8 gx and short8 gy values.
- */
-short16 sobel1x5(
- Image *src,
- const short left1_coeff_gx,
- const short left2_coeff_gx,
- const short middle_coeff_gx,
- const short right1_coeff_gx,
- const short right2_coeff_gx,
- const short left1_coeff_gy,
- const short left2_coeff_gy,
- const short middle_coeff_gy,
- const short right1_coeff_gy,
- const short right2_coeff_gy)
-{
- uchar16 temp = vload16(0, offset(src, -2, 0));
- short8 gx = 0;
- short8 gy = 0;
- short8 val;
-
- val = convert_short8(temp.s01234567);
- gx += val * (short8)left1_coeff_gx;
- gy += val * (short8)left1_coeff_gy;
-
- val = convert_short8(temp.s12345678);
- gx += val * (short8)left2_coeff_gx;
- gy += val * (short8)left2_coeff_gy;
-
- val = convert_short8(temp.s23456789);
- gx += val * (short8)middle_coeff_gx;
- gy += val * (short8)middle_coeff_gy;
-
- val = convert_short8(temp.s3456789a);
- gx += val * (short8)right1_coeff_gx;
- gy += val * (short8)right1_coeff_gy;
-
- val = convert_short8(temp.s456789ab);
- gx += val * (short8)right2_coeff_gx;
- gy += val * (short8)right2_coeff_gy;
-
- return (short16)(gx, gy);
-}
-
-/** Compute a 1D vertical Sobel filter 5x1 for 8 bytes, assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
- *
- * @param[in] src Pointer to source image.
- * @param[in] up1_coeff     Weight of the topmost pixel
- * @param[in] up2_coeff     Weight of the upper pixel
- * @param[in] middle_coeff  Weight of the middle pixel
- * @param[in] down1_coeff   Weight of the lower pixel
- * @param[in] down2_coeff   Weight of the bottommost pixel
- *
- * @return a short8 containing 8 convolved values.
- */
-short8 sobel5x1(
- Image *src,
- const short up1_coeff,
- const short up2_coeff,
- const short middle_coeff,
- const short down1_coeff,
- const short down2_coeff)
-{
- short8 val;
- short8 out = (short8)0;
-
- val = vload8(0, (__global short *)offset(src, 0, -2));
- out += val * (short8)up1_coeff;
-
- val = vload8(0, (__global short *)offset(src, 0, -1));
- out += val * (short8)up2_coeff;
-
- val = vload8(0, (__global short *)offset(src, 0, 0));
- out += val * (short8)middle_coeff;
-
- val = vload8(0, (__global short *)offset(src, 0, 1));
- out += val * (short8)down1_coeff;
-
- val = vload8(0, (__global short *)offset(src, 0, 2));
- out += val * (short8)down2_coeff;
-
- return (short8)(out);
-}
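The two helpers above are the halves of a separable 5x5 filter: sobel1x5 applies the horizontal vectors ([-1,-2,0,2,1] for gx and [1,4,6,4,1] for gy) and sobel5x1 then applies the complementary vertical vectors, because each 5x5 Sobel matrix is the outer product of a 1D derivative vector and a 1D smoothing vector. A small standalone C check of that decomposition for Gx (sketch, not library code):

    #include <stdio.h>

    int main(void)
    {
        const int smooth[5] = { 1, 4, 6, 4, 1 };   /* vertical pass for Gx (sobel5x1) */
        const int deriv[5]  = { -1, -2, 0, 2, 1 }; /* horizontal pass for Gx (sobel1x5) */

        /* The outer product reconstructs the full 5x5 Gx weight matrix. */
        for (int r = 0; r < 5; ++r)
        {
            for (int c = 0; c < 5; ++c)
                printf("%4d", smooth[r] * deriv[c]);
            printf("\n");
        }
        return 0;
    }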
-
-/** Apply a 1x5 Sobel matrix to a single channel U8 input image and output two temporary channel S16 images.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in]  src_ptr                               Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr                            Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void sobel_separable1x5(
- IMAGE_DECLARATION(src)
-#ifdef GRAD_X
- ,
- IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- ,
- IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
- // Output pixels
- short16 gx_gy = sobel1x5(&src,
- -1, -2, 0, 2, 1,
- 1, 4, 6, 4, 1);
-
- // Store result in dst
-#ifdef GRAD_X
- vstore8(gx_gy.s01234567, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- vstore8(gx_gy.s89ABCDEF, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/** Apply a 5x1 convolution matrix to two single channel S16 input temporary images
- * and output two single channel S16 images.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in]  src_x_ptr                             Pointer to the source image. Supported data types: S16
- * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] src_y_ptr Pointer to the source image. Supported data types: S16
- * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  dummy                                 Dummy parameter to ease conditional inclusion
- */
-__kernel void sobel_separable5x1(
-#ifdef GRAD_X
- IMAGE_DECLARATION(src_x),
- IMAGE_DECLARATION(dst_gx),
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- IMAGE_DECLARATION(src_y),
- IMAGE_DECLARATION(dst_gy),
-#endif /* GRAD_Y */
- int dummy)
-{
-#ifdef GRAD_X
- Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
-#ifdef GRAD_X
- short8 gx = sobel5x1(&src_x,
- 1, 4, 6, 4, 1);
- vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- short8 gy = sobel5x1(&src_y,
- -1, -2, 0, 2, 1);
- vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/**********************************************/
-/* End implementation of Sobel5x5 filter */
-/**********************************************/
-
-/***********************************************/
-/* Begin implementation of Sobel7x7 filter */
-/***********************************************/
-
-/* Sobel 1x7 horizontal X / 7x1 vertical Y coefficients */
-#define X0 -1
-#define X1 -4
-#define X2 -5
-#define X3 0
-#define X4 5
-#define X5 4
-#define X6 1
-
-/* Sobel 1x7 vertical X / 7x1 horizontal Y coefficients */
-#define Y0 1
-#define Y1 6
-#define Y2 15
-#define Y3 20
-#define Y4 15
-#define Y5 6
-#define Y6 1
-
-/* Calculates single horizontal iteration. */
-#define SOBEL1x1_HOR(src, gx, gy, idx) \
- { \
- int8 val = convert_int8(vload8(0, offset(src, idx - 3, 0))); \
- gx += val * X##idx; \
- gy += val * Y##idx; \
- }
-
-/* Calculates single vertical iteration. */
-#define SOBEL1x1_VERT(src, g, direction, idx) \
- { \
- int8 val = vload8(0, (__global int *)offset(src, 0, idx - 3)); \
- g += val * (int8)direction##idx; \
- }
-
-/* Calculates a 1x7 horizontal iteration. */
-#define SOBEL1x7(ptr, gx, gy) \
- SOBEL1x1_HOR(ptr, gx, gy, 0) \
- SOBEL1x1_HOR(ptr, gx, gy, 1) \
- SOBEL1x1_HOR(ptr, gx, gy, 2) \
- SOBEL1x1_HOR(ptr, gx, gy, 3) \
- SOBEL1x1_HOR(ptr, gx, gy, 4) \
- SOBEL1x1_HOR(ptr, gx, gy, 5) \
- SOBEL1x1_HOR(ptr, gx, gy, 6)
-
-/* Calculates a 7x1 vertical iteration. */
-#define SOBEL7x1(ptr, g, direction) \
- SOBEL1x1_VERT(ptr, g, direction, 0) \
- SOBEL1x1_VERT(ptr, g, direction, 1) \
- SOBEL1x1_VERT(ptr, g, direction, 2) \
- SOBEL1x1_VERT(ptr, g, direction, 3) \
- SOBEL1x1_VERT(ptr, g, direction, 4) \
- SOBEL1x1_VERT(ptr, g, direction, 5) \
- SOBEL1x1_VERT(ptr, g, direction, 6)
-
-/** Apply a 1x7 Sobel matrix to a single channel U8 input image and output two temporary channel S32 images and leave the borders undefined.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S32
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S32
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void sobel_separable1x7(
- IMAGE_DECLARATION(src)
-#ifdef GRAD_X
- ,
- IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- ,
- IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
- int8 gx = (int8)0;
- int8 gy = (int8)0;
-
- SOBEL1x7(&src, gx, gy);
-
- // Store result in dst
-#ifdef GRAD_X
- vstore8(gx, 0, ((__global int *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- vstore8(gy, 0, ((__global int *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/** Apply a 7x1 convolution matrix to two single channel S32 input temporary images and output two single channel S32 images and leave the borders undefined.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_x_ptr Pointer to the source image. Supported data types: S32
- * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr                            Pointer to the destination image. Supported data types: S32
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] src_y_ptr Pointer to the source image. Supported data types: S32
- * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gy_ptr                            Pointer to the destination image. Supported data types: S32
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  dummy                                 Dummy parameter to ease conditional inclusion
- */
-__kernel void sobel_separable7x1(
-#ifdef GRAD_X
- IMAGE_DECLARATION(src_x),
- IMAGE_DECLARATION(dst_gx),
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- IMAGE_DECLARATION(src_y),
- IMAGE_DECLARATION(dst_gy),
-#endif /* GRAD_Y */
- int dummy)
-{
-#ifdef GRAD_X
- Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
- // Output pixels
-#ifdef GRAD_X
- int8 gx = 0;
- SOBEL7x1(&src_x, gx, Y);
- vstore8(gx, 0, (__global int *)dst_gx.ptr);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- int8 gy = 0;
- SOBEL7x1(&src_y, gy, X);
- vstore8(gy, 0, (__global int *)dst_gy.ptr);
-#endif /* GRAD_Y */
-}
-
-/**********************************************/
-/* End implementation of Sobel7x7 filter */
-/**********************************************/
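Note that the coefficient arguments in sobel_separable7x1 above are deliberately crossed: the horizontal pass already applied the X (derivative) vector toward gx and the Y (smoothing) vector toward gy, so the vertical pass completes Gx with the Y vector and Gy with the X vector. A small C check of the resulting 7x7 Gx matrix (sketch, not library code):

    #include <stdio.h>

    int main(void)
    {
        const int xv[7] = { -1, -4, -5, 0, 5, 4, 1 }; /* X0..X6: 1D derivative */
        const int yv[7] = { 1, 6, 15, 20, 15, 6, 1 }; /* Y0..Y6: 1D smoothing  */

        /* Gx = outer(Y, X): smoothing down the rows, derivative across columns. */
        for (int r = 0; r < 7; ++r)
        {
            for (int c = 0; c < 7; ++c)
                printf("%5d", yv[r] * xv[c]);
            printf("\n");
        }
        return 0;
    }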
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
deleted file mode 100644
index 4d2d89dd73..0000000000
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ /dev/null
@@ -1,531 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)
-
-/** Divides all the values of the input tensor by the sum calculated by the softmax_layer_max_shift_exp_sum kernels.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void softmax_layer_norm(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(sum),
- TENSOR3D_DECLARATION(dst))
-{
- const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) * sizeof(DATA_TYPE);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
-
- // Load max value of 1D logits vector (row)
- DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
-
-#if defined(LOG_SOFTMAX)
- sum_val = log(sum_val);
- data0 -= sum_val;
-#else // defined(LOG_SOFTMAX)
- data0 /= sum_val;
-#endif // defined(LOG_SOFTMAX)
-
- STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
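The normalization performed by softmax_layer_norm above is, per row, a division of each shifted exponential by the row sum, or a subtraction of log(sum) in the log-softmax variant. A scalar C sketch of the same step (float variant; names are illustrative):

    #include <math.h>

    /* Sketch: normalizes one row in place, given the per-row sum produced
     * by the max_shift_exp_sum kernels. */
    static void softmax_norm_row(float *row, int width, float sum, int log_softmax)
    {
        const float log_sum = logf(sum);
        for (int i = 0; i < width; ++i)
            row[i] = log_softmax ? row[i] - log_sum : row[i] / sum;
    }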
-
-#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
-
-/* Number of workitems in dimension 0. */
-#if !defined(GRID_SIZE)
-#define GRID_SIZE 1
-#endif /* !defined(GRID_SIZE) */
-
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-
-/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
- * then takes the exponent of each element and sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
- * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
- * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
- * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- */
-__kernel void softmax_layer_max_shift_exp_sum_serial(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
-{
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
-
-#ifdef BETA
- // Initialize beta
- VEC_TYPE beta = (VEC_TYPE)BETA;
-#endif /* BETA */
-
- // Initialize local maximum
- VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
-
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
- SELECT_TYPE widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data, widx));
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
- {
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- max_val_vec = max(data, max_val_vec);
- }
-
- // Perform max reduction
- DATA_TYPE max_val = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
- *((__global DATA_TYPE *)maxo.ptr) = max_val;
-
- /* Second section */
-
- // Set sum vector
- VEC_TYPE sum1D = 0;
-
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)dst_addr);
- data = exp(data);
- data = select(0, data, widx);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- data = select(0, data, widx);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)dst_addr);
-#endif /* LOG_SOFTMAX */
- sum1D += data;
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-
- // Shift values, exp and sum
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
- {
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
- data = exp(data);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
- }
-
- // Perform sum reduction
- *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
-}
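Stripped of the vectorization and leftover handling, the serial kernel above is the classic numerically stable softmax front end: find the row maximum, exponentiate the shifted values, and accumulate their sum. A scalar C reference of the same per-row computation (sketch; BETA scaling and LOG_SOFTMAX are omitted):

    #include <math.h>

    /* Returns the row sum of exp(src - max) and writes the exponentials
     * to dst; *max_out receives the row maximum. */
    static float shift_exp_sum_row(const float *src, float *dst, int width, float *max_out)
    {
        float max_val = src[0];
        for (int i = 1; i < width; ++i)
            if (src[i] > max_val)
                max_val = src[i];

        float sum = 0.0f;
        for (int i = 0; i < width; ++i)
        {
            dst[i] = expf(src[i] - max_val); /* shift by the max for stability */
            sum += dst[i];
        }
        *max_out = max_val;
        return sum;
    }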
-
-/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
- * then takes the exponent of each element and sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
- * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
- * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
- * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- */
-__kernel void softmax_layer_max_shift_exp_sum_parallel(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
-{
- const uint lid = get_local_id(0);
- const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) * sizeof(DATA_TYPE);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
-
-#ifdef BETA
- // Initialize beta
- VEC_TYPE beta = (VEC_TYPE)BETA;
-#endif /* BETA */
-
- // Define one temporary vector per work-item.
- __local VEC_TYPE tmp_local[GRID_SIZE];
- __local DATA_TYPE max_local;
-
- VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
-
- // Number of iterations per work-item.
- const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
- // Calculate max of row
- uint i = 0;
- for(; i < width; ++i)
- {
- VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
-    // How many work-items are needed to complete the computation.
- int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
- {
- VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- SELECT_TYPE widx;
- if(lid == 0)
- {
-        // Handle the case where SRC_WIDTH is not a multiple of VECTOR_SIZE
- VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data_max, widx));
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = max_val_vec;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]);
- max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* Second section */
-
- // Set sum vector
- VEC_TYPE sum1D = 0;
- DATA_TYPE max_val = max_local;
-
- // Shift values, exp and sum
- for(i = 0; i < width; ++i)
- {
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data = exp(data);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
- boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
- {
-        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data = exp(data);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
- }
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(lid == 0)
- {
-        // Handle the leftover elements when SRC_WIDTH is not a multiple of VECTOR_SIZE
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- data = exp(data);
- data = select(0, data, widx);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- data = select(0, data, widx);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = sum1D;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] += tmp_local[lid + 128];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] += tmp_local[lid + 64];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] += tmp_local[lid + 32];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] += tmp_local[lid + 16];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] += tmp_local[lid + 8];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] += tmp_local[lid + 4];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] += tmp_local[lid + 2];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
- // Perform sum reduction
- *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
- }
-}
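The cascade of GRID_SIZE checks above is a halving tree reduction in local memory: at each barrier-separated step the lower half of the work-items folds in the upper half, so GRID_SIZE partial results combine in log2(GRID_SIZE) steps. A sequential C sketch of the same folding order (assumes n is a power of two with n >= 2):

    /* Sketch: folds n partial sums exactly as the kernel's local-memory
     * reduction does, halving the active range each step. */
    static float tree_reduce_sum(float *vals, int n)
    {
        for (int active = n / 2; active > 1; active /= 2)
            for (int lid = 0; lid < active; ++lid)
                vals[lid] += vals[lid + active]; /* one reduction step */
        return vals[0] + vals[1]; /* final pair, as in the lid == 0 branch */
    }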
-
-#endif // defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
-#endif // defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
deleted file mode 100644
index 4d5006d804..0000000000
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ /dev/null
@@ -1,530 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN)
-
-#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
-
-/** Divides all the values of the input tensor by the sum calculated by the softmax_layer_shift_exp_sum kernel.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
- * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
- * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
- * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
- * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: S32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void softmax_layer_norm_quantized(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(sum),
- TENSOR3D_DECLARATION(dst))
-{
- const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
-
- // Load max value of 1D logits vector (row)
- int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));
-
-    // It would be better to calculate this in the previous layer and pass it here as a parameter
- uint sum_val_u = convert_uint(sum_val);
- int headroom_plus_one = clz(sum_val_u);
- int num_bits_over_unit = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
- int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
- VEC_INT shifted_sum_minus_one = shifted_sum_minus_one_1;
- VEC_INT shifted_scale = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, VECTOR_SIZE);
-
-    // This was already calculated in the previous layer; it should be stored in a temporary output and reused
- VEC_INT data_diff = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr);
- VEC_INT data_diff_mult = data_diff;
-#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
- if(INPUT_BETA_MULTIPLIER > 1)
- {
- data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
- }
-#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
-
- VEC_INT data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data = ASYMM_MULT(shifted_scale, data, VECTOR_SIZE);
- data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, VECTOR_SIZE);
-#ifdef QASYMM8_SIGNED
- data += (VEC_INT)(MIN_VALUE);
-#endif /* QASYMM8_SIGNED */
- data = select(MIN_VALUE, data, data_diff >= (VEC_INT)(DIFF_MIN));
- VEC_BASE data0 = CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));
-
- STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
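The fixed-point bookkeeping in softmax_layer_norm_quantized above normalizes the exp-sum into a known range before taking its reciprocal: the leading-zero count shifts the sum so its top bit is set, the implicit integer one is subtracted to obtain the fraction fed to the reciprocal helper, and the excess EXP_ACCUMULATION_INT_BITS are divided out at the end. A C sketch of just that normalization step (the reciprocal itself is left to the library's ASYMM helpers):

    #include <stdint.h>

    /* Portable count-leading-zeros for the sketch (clz() in OpenCL C). */
    static int clz_u32(uint32_t x)
    {
        int n = 0;
        for (uint32_t m = 0x80000000u; m != 0 && (x & m) == 0; m >>= 1)
            ++n;
        return n;
    }

    /* Mirrors the kernel: scales a non-zero sum so its top bit is set,
     * strips the implicit leading one, and reports how many bits remain
     * to be divided out. exp_accum_int_bits stands for the value passed
     * via -DEXP_ACCUMULATION_INT_BITS. */
    static void normalize_sum(uint32_t sum, int exp_accum_int_bits,
                              int32_t *shifted_sum_minus_one, int *num_bits_over_unit)
    {
        const int headroom_plus_one = clz_u32(sum);
        *num_bits_over_unit = exp_accum_int_bits - headroom_plus_one;
        *shifted_sum_minus_one = (int32_t)((sum << headroom_plus_one) - (1u << 31));
    }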
-
-#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)
-
-/* Number of workitems in dimension 0. */
-#if !defined(GRID_SIZE)
-#define GRID_SIZE 1
-#endif /* !defined(GRID_SIZE) */
-
-#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
-
-VEC_INT mult_by_quantized_multiplier(VEC_INT data)
-{
-#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
- if(INPUT_BETA_MULTIPLIER > 1)
- {
- return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
- }
-#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
- return data;
-}
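-
-/* In effect (a sketch, assuming ASYMM_MULT(a, b) computes roughly
- * round(a * b / 2^31)) the helper applies the quantized beta to the logit
- * differences:
- *
- *   data_diff_mult = round((data_diff << INPUT_BETA_LEFT_SHIFT) * INPUT_BETA_MULTIPLIER / 2^31)
- *
- * When the two defines are absent, beta is 1.0 and the differences pass
- * through unchanged. */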
-
-/** Computes the maximum value of each row, shifts the values of the input tensor by this maximum,
- * then computes the exponent of each element and sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
- * @note The minimum value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE, -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
- * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
- * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
- * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- */
-__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
-{
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
-
- VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE);
-
- // Calculate max of row
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
- VEC_INT widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select(vec_min_val, data, CONVERT(widx, VEC_BASE)));
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
- {
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- max_val_vec = max(data, max_val_vec);
- }
-
- // Perform max reduction
- DATA_TYPE max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
- *((__global DATA_TYPE *)maxo.ptr) = max_local;
-
- // Second part
-
- // Load max value of 1D logits vector (row)
- int max_val = convert_int(max_local);
-
- // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
- VEC_INT sum1D = 0;
-
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data_diff, 0, (__global int *)dst_addr);
- data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
- sum1D += select(0, data_fp, widx);
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-
- // Shift values, exp and sum
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
- {
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
- VSTORE(VECTOR_SIZE)
- (data_diff, 0, (__global int *)(dst_addr + i * sizeof(int)));
- sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
- }
-
- // Perform sum reduction
- *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
-}
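-
-/* How the leftover prologue above stays in bounds, on hypothetical values
- * SRC_WIDTH = 18, VECTOR_SIZE = 16, VECTOR_SIZE_LEFTOVER = 2: the first
- * 16-wide load covers elements [0, 16), but widx keeps only lanes 0..1 and
- * masks the rest with MIN_VALUE (for the max) or 0 (for the sum); the main
- * loops then start at i = 2, so every following 16-wide VLOAD ends exactly
- * at element 18 and never reads past the row. */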
-
-/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
- * then computes the exponent of each element and sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
- * @note The minimum value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
- * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
- * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
- * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- */
-__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
-{
- const uint lid = get_local_id(0);
- const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
-
- // Define one temporary vector per work-item.
- __local VEC_INT tmp_local[GRID_SIZE];
- __local DATA_TYPE max_local;
-
- VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
- VEC_BASE max_val_vec = vec_min_val;
-
- // Number of iterations per work-item.
- const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
- // Calculate max of row
- uint i = 0;
- for(; i < width; ++i)
- {
- VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
-    // How many work-items are needed to complete the computation.
- int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
- {
- VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_INT widx;
- if(lid == 0)
- {
-        // Handle the leftover head of the row when SRC_WIDTH is not a multiple of VECTOR_SIZE
- VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select(vec_min_val, data_max, CONVERT(widx, VEC_BASE)));
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = CONVERT(max_val_vec, VEC_INT);
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- max_val_vec = max(CONVERT((tmp_local[lid + 1]), VEC_BASE), CONVERT((tmp_local[lid]), VEC_BASE));
- max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* Second section */
-
- // Set sum vector
- VEC_INT sum1D = 0;
- int max_val = convert_int(max_local);
-
- // Shift values, exp and sum
- for(i = 0; i < width; ++i)
- {
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
- VSTORE(VECTOR_SIZE)
- (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
- sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
- boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
- {
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
- VSTORE(VECTOR_SIZE)
- (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
- sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
- }
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(lid == 0)
- {
-        // Handle the leftover head of the row when SRC_WIDTH is not a multiple of VECTOR_SIZE: back up by VECTOR_SIZE_LEFTOVER elements so the load stays inside the row, and mask out the lanes already covered by the main loop
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data_diff, 0, (__global int *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(int)));
-        data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); // masked lanes must contribute 0 to the sum, as in the serial kernel
- data_fp = select(0, data_fp, widx);
- sum1D = sum1D + data_fp;
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = sum1D;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] += tmp_local[lid + 128];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] += tmp_local[lid + 64];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] += tmp_local[lid + 32];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] += tmp_local[lid + 16];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] += tmp_local[lid + 8];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] += tmp_local[lid + 4];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] += tmp_local[lid + 2];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
- // Perform sum reduction
- *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
- }
-}
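-
-/* The unrolled ladders above are a standard local-memory tree reduction; a
- * compact equivalent (a sketch, assuming GRID_SIZE is a power of two no
- * larger than 256) would be:
- *
- *     for(uint stride = GRID_SIZE / 2; stride > 1; stride >>= 1)
- *     {
- *         if(lid < stride)
- *         {
- *             tmp_local[lid] += tmp_local[lid + stride];
- *         }
- *         barrier(CLK_LOCAL_MEM_FENCE);
- *     }
- *
- * with work-item 0 combining the final pair. The unrolled form simply lets
- * the compiler drop the untaken branches, GRID_SIZE being a compile-time
- * constant. */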
-#endif // #if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)
-#endif /* defined(DATA_TYPE) && defined(DIFF_MIN) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(MIN_VALUE) */
diff --git a/src/core/CL/cl_kernels/space_to_batch.cl b/src/core/CL/cl_kernels/space_to_batch.cl
deleted file mode 100644
index cb11786ac4..0000000000
--- a/src/core/CL/cl_kernels/space_to_batch.cl
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
-/** Calculate the space to batch conversion.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32
- * @param[in] paddings_stride_x Stride of the paddings tensor in X dimension (in bytes)
- * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] paddings_stride_y Stride of the paddings tensor in Y dimension (in bytes)
- * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] paddings_offset_first_element_in_bytes The offset of the first element in the second source image
- * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
- * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
- * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
- * @param[in] batch_id The output tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void space_to_batch_nchw(
- TENSOR4D_DECLARATION(input),
- IMAGE_DECLARATION(paddings),
- VECTOR_DECLARATION(block_shape),
- const int batch_id,
- TENSOR3D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
- Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- const int pad_left_x = *((__global int *)offset(&pad, 0, 0));
- const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
- const int pad_left_y = *((__global int *)offset(&pad, 0, 1));
- const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
-
- int block_x = *((__global int *)vector_offset(&block, 0));
- int block_y = *((__global int *)vector_offset(&block, 1));
-
- const int out_x = get_global_id(0);
- const int out_y = get_global_id(1);
- const int z = get_global_id(2);
-
- const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
- const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
-
- if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
- {
- const int w = batch_id % BATCH_IN;
- const int in_x = pos_x - pad_left_x;
- const int in_y = pos_y - pad_left_y;
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
- }
-}
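-/* Worked example with hypothetical values block_x = block_y = 2 and
- * BATCH_IN = 1: for batch_id = 3, batch_id / BATCH_IN = 3, so the x phase is
- * 3 % 2 = 1 and the y phase is 3 / 2 = 1; output element (out_x, out_y) of
- * batch 3 therefore reads input position (2 * out_x + 1, 2 * out_y + 1),
- * shifted back by the left paddings. */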
-/** Calculate the space to batch conversion. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32
- * @param[in] paddings_stride_x Stride of the paddings tensor in X dimension (in bytes)
- * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] paddings_stride_y Stride of the paddings tensor in Y dimension (in bytes)
- * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] paddings_offset_first_element_in_bytes The offset of the first element in the second source image
- * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
- * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
- * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
- * @param[in] batch_id The output tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void space_to_batch_nhwc(
- TENSOR4D_DECLARATION(input),
- IMAGE_DECLARATION(paddings),
- VECTOR_DECLARATION(block_shape),
- const int batch_id,
- TENSOR3D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
- Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- const int pad_left_x = *((__global int *)offset(&pad, 0, 0));
- const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
- const int pad_left_y = *((__global int *)offset(&pad, 0, 1));
- const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
-
- int block_x = *((__global int *)vector_offset(&block, 0));
- int block_y = *((__global int *)vector_offset(&block, 1));
-
- const int out_x = get_global_id(1);
- const int out_y = get_global_id(2);
- const int z = get_global_id(0);
-
- const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
- const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
-
- if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
- {
- const int w = batch_id % BATCH_IN;
- const int in_x = pos_x - pad_left_x;
- const int in_y = pos_y - pad_left_y;
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
- }
-}
-#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
-
-#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
-/** Calculate the space to batch conversion.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
- * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
- * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
- * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
- * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
- * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in] batch_id The output tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void space_to_batch_static_nchw(
- TENSOR4D_DECLARATION(input),
- const int batch_id,
- TENSOR3D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- int block_x = BLOCK_SHAPE_X;
- int block_y = BLOCK_SHAPE_Y;
-
- const int out_x = get_global_id(0);
- const int out_y = get_global_id(1);
- const int z = get_global_id(2);
-
- const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
- const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
-
- if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
- {
- const int w = batch_id % BATCH_IN;
- const int in_x = pos_x - PAD_LEFT_X;
- const int in_y = pos_y - PAD_LEFT_Y;
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
- }
-}
-/** Calculate the space to batch conversion. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
- * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
- * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
- * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
- * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
- * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in] batch_id The output tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void space_to_batch_static_nhwc(
- TENSOR4D_DECLARATION(input),
- const int batch_id,
- TENSOR3D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- int block_x = BLOCK_SHAPE_X;
- int block_y = BLOCK_SHAPE_Y;
-
- const int out_x = get_global_id(1);
- const int out_y = get_global_id(2);
- const int z = get_global_id(0);
-
- const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
- const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
-
- if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
- {
- const int w = batch_id % BATCH_IN;
- const int in_x = pos_x - PAD_LEFT_X;
- const int in_y = pos_y - PAD_LEFT_Y;
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
- }
-}
-#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
diff --git a/src/core/CL/cl_kernels/space_to_depth.cl b/src/core/CL/cl_kernels/space_to_depth.cl
deleted file mode 100644
index 1217a37345..0000000000
--- a/src/core/CL/cl_kernels/space_to_depth.cl
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
-/** Space to depth transformation. (NCHW)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The output tensor channel size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=8
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void space_to_depth_nchw(
- TENSOR4D_DECLARATION(input),
- const int batch_id,
- TENSOR3D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
- const int x = get_global_id(0);
- const int y = get_global_id(1);
- const int z = get_global_id(2) % r;
-
- const int in_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
- const int in_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, batch_id));
-}
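-/* Worked example with hypothetical values CHANNEL_SIZE = 8 and
- * BLOCK_SHAPE = 2, so r = 2: output channel c in [0, 8) maps to input
- * channel z = c % 2, while phase = c / 2 in [0, 4) selects one of the 2x2
- * block positions, giving in_x = x * 2 + phase % 2 and
- * in_y = y * 2 + phase / 2. */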
-/** Space to depth transformation. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The output tensor channel size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=8
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void space_to_depth_nhwc(
- TENSOR4D_DECLARATION(input),
- const int batch_id,
- TENSOR3D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
- const int x = get_global_id(1);
- const int y = get_global_id(2);
- const int z = get_global_id(0) % r;
-
- const int in_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
- const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id));
-}
-#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/stack_layer.cl b/src/core/CL/cl_kernels/stack_layer.cl
deleted file mode 100644
index 438e858df2..0000000000
--- a/src/core/CL/cl_kernels/stack_layer.cl
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3)
-
-#if AXIS == 0
-#define X_DST (idx_input)
-#define Y_DST (x_src)
-#define Z_DST (y_src)
-#define W_DST (z_src)
-#define K_DST (w_src)
-#elif AXIS == 1 // AXIS == 1
-#define X_DST (x_src)
-#define Y_DST (idx_input)
-#define Z_DST (y_src)
-#define W_DST (z_src)
-#define K_DST (w_src)
-#elif AXIS == 2 // AXIS == 2
-#define X_DST (x_src)
-#define Y_DST (y_src)
-#define Z_DST (idx_input)
-#define W_DST (z_src)
-#define K_DST (w_src)
-#elif AXIS == 3 // AXIS == 3
-#define X_DST (x_src)
-#define Y_DST (y_src)
-#define Z_DST (z_src)
-#define W_DST (idx_input)
-#define K_DST (w_src)
-#elif AXIS == 4 // AXIS == 4
-#define X_DST (x_src)
-#define Y_DST (y_src)
-#define Z_DST (z_src)
-#define W_DST (w_src)
-#define K_DST (idx_input)
-#else // AXIS not supported
-#error "Not supported axis"
-#endif // AXIS == 0
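-
-/* For example, with -DAXIS=1 the source coordinate (x_src, y_src, z_src,
- * w_src) of input number idx_input is written to destination coordinate
- * (x_src, idx_input, y_src, z_src, w_src): the new stack dimension is
- * spliced in at position 1 and the higher dimensions shift up by one. */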
-
-/** OpenCL kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension
- *
- * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
- * @note The dimension to stack the tensors along has to be passed at compile time using -DAXIS. i.e. -DAXIS=1
- * @note Dimension 2 of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=112)
- * @note Dimension 3 of the output tensor must be passed at compile time using -DDST_DIM3 (e.g. -DDST_DIM3=112)
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] idx_input Index of the input tensor in the list of tensors to stack
- */
-__kernel void stack_layer(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
- unsigned int idx_input)
-{
- uint x_src = get_global_id(0);
- uint y_src = get_global_id(1);
- uint z_src = (get_global_id(2) % SRC_DIM2);
- uint w_src = (get_global_id(2) / SRC_DIM2);
-
- __global DATA_TYPE *src = (__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_src * sizeof(DATA_TYPE) + y_src * src_stride_y + z_src * src_stride_z + w_src * src_stride_w);
-
- __global DATA_TYPE *dst = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + X_DST * sizeof(DATA_TYPE) + Y_DST * dst_stride_y + Z_DST * dst_stride_z + W_DST * dst_stride_w + K_DST *
- dst_stride_w * (uint)DST_DIM3);
-
- *dst = *src;
-}
-
-#undef X_DST
-#undef Y_DST
-#undef Z_DST
-#undef W_DST
-#undef K_DST
-#endif // defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3)
diff --git a/src/core/CL/cl_kernels/tile.cl b/src/core/CL/cl_kernels/tile.cl
deleted file mode 100644
index 79da7fe6b9..0000000000
--- a/src/core/CL/cl_kernels/tile.cl
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
-/** Perform a tile operation on an input tensor.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size can optionally be given as a preprocessor argument using -DVEC_SIZE=size (together with -DOFFSET). e.g. -DVEC_SIZE=16
- * @note The input tensor dimensions must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT, -DSRC_DEPTH and -DSRC_BATCHES, and the output depth using -DDST_DEPTH.
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: All
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void tile(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
- Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
- Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
-
- // For all coordinates but x, each tile copies from the input
- const int y = get_global_id(1);
- const int z = get_global_id(2) % DST_DEPTH;
- const int batch = get_global_id(2) / DST_DEPTH;
-
-#if defined(VEC_SIZE) && defined(OFFSET)
-    // If we are loading/storing multiple elements at a time, we must not
-    // exceed the input boundaries. The last threads of each tile need to
-    // backtrack by OFFSET elements; those offsets accumulate across the
-    // previous tiles
- const int id = (int)(get_global_id(0));
- int x = id * VEC_SIZE;
-
- // Shift x based on the previous offsets
- const int tile_number = x / SRC_WIDTH;
- x -= (tile_number) * OFFSET;
- int x_input = x % SRC_WIDTH;
-
- // Shift x based on being the last tile
- const int last_tile = (int)(x_input + VEC_SIZE > SRC_WIDTH);
- x -= last_tile * OFFSET;
- x_input = x % SRC_WIDTH;
- output.ptr -= (tile_number + last_tile) * OFFSET * output_stride_x;
-
- // Update the input pointer
- input.ptr = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
-
- // Copy the data
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
-
- VSTORE(VEC_SIZE)
- (data, 0, (__global DATA_TYPE *)output.ptr);
-#else // !defined(VEC_SIZE) || !defined(OFFSET)
- const int x = get_global_id(0);
-
- // Update the input pointer
- input.ptr = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
-
- *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
-#endif // defined(VEC_SIZE) && defined(OFFSET)
-}
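-
-/* Worked example with hypothetical values SRC_WIDTH = 10, VEC_SIZE = 4 and,
- * as the backtracking logic implies, OFFSET = VEC_SIZE - SRC_WIDTH % VEC_SIZE = 2:
- * the work-item with id = 2 starts at x = 8, detects that 8 + 4 crosses the
- * tile boundary, backs up to x = 6 and loads elements 6..9, while the store
- * pointer is moved back by the same 2 elements so the overlap rewrites
- * values that were already written. */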
-#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index f2d2f26cf2..8129606277 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,14 +21,50 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#ifndef ACL_SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
+#define ACL_SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
// *INDENT-OFF*
// clang-format off
+#define TILE_VECTOR_SIZE1 1
+#define TILE_VECTOR_SIZE2 2
+#define TILE_VECTOR_SIZE3 3
+#define TILE_VECTOR_SIZE4 4
+#define TILE_VECTOR_SIZE5 8
+#define TILE_VECTOR_SIZE6 8
+#define TILE_VECTOR_SIZE7 8
+#define TILE_VECTOR_SIZE8 8
+#define TILE_VECTOR_SIZE9 16
+#define TILE_VECTOR_SIZE10 16
+#define TILE_VECTOR_SIZE11 16
+#define TILE_VECTOR_SIZE12 16
+#define TILE_VECTOR_SIZE13 16
+#define TILE_VECTOR_SIZE14 16
+#define TILE_VECTOR_SIZE15 16
+#define TILE_VECTOR_SIZE16 16
+
+#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1
+#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2
+#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3
+#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4
+#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8
+#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8
+#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8
+#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8
+#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16
+
/** Tile object
* A tile object is a 2D memory block and can be accessed using the following syntax:
 * -# a[m0].v = access the vector at row "m0" (OpenCL vector)
- * -# a[m0].s[x] = access the scalar element at row "m0" and column "n0" (scalar access)
+ * -# dst[m0].s[n0] = access the scalar element at row "m0" and column "n0" (scalar access)
*
* @param[in] DATA_TYPE Data type of the tile
* @param[in] H Number of tile rows
@@ -38,8 +74,8 @@
#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)
#define TILE_STR(DATA_TYPE, H, W, BASENAME) \
union { \
- DATA_TYPE s[W]; \
- DATA_TYPE##W v; \
+ DATA_TYPE s[TILE_VECTOR_SIZE##W]; \
+ TILE_VECTOR_TYPE##W(DATA_TYPE) v; \
} BASENAME[H]
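Illustrative usage (a sketch, not part of the patch; the tile name acc is hypothetical): declaring a 4x2 float tile and using the two access forms described above.
    TILE(float, 4, 2, acc);           // union { float s[2]; float2 v; } acc[4]
    LOOP_UNROLLING(int, _m, 0, 1, 4,
    {
        acc[_m].v = (float2)0.0f;     // vector access to row _m
    })
    acc[0].s[1] = 1.0f;               // scalar access to row 0, column 1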
#define TENSOR4D_IMAGE(name) \
@@ -70,6 +106,90 @@
#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
#define TENSOR4D(name, type) TENSOR4D_STR(name, type)
+#define TENSOR4D_T_IMAGE(name) \
+ __read_only image2d_t name##_img, \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_stride_w, \
+ uint name##_c, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_T_BUFFER(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_stride_w, \
+ uint name##_c, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)
+
+/** Legacy tensor 4D arguments
+ *
+ * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
+ * @param[in] type Tensor type (BUFFER or IMAGE)
+ */
+#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
+
+#define TENSOR4D_RO_T_IMAGE(name) \
+ __read_only image2d_t name##_img, \
+ TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)
+
+/** Read-Only (RO) tensor 4D.
+ *
+ * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
+ * @param[in] type Tensor type (BUFFER or IMAGE)
+ */
+#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)
+
+#define TENSOR4D_WO_T_IMAGE(name) \
+ __write_only image2d_t name##_img, \
+ TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)
+
+/** Write-Only (WO) tensor 4D.
+ *
+ * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
+ * @param[in] type Tensor type (BUFFER or IMAGE)
+ */
+#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)
+
+#define TENSOR3D_T_IMAGE(name) \
+ __read_only image2d_t name##_img, \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_T_BUFFER(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)
+#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)
+
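Illustrative usage (a sketch, not part of the patch; the kernel and tensor names are hypothetical): each TENSOR*_T macro expands to the full run-time argument list, so the components are reached through the name prefix.
    __kernel void example_copy(
        TENSOR4D_T(src, BUFFER),   // expands to src_ptr, src_stride_y/z/w, src_c/_w/_h/_n, src_offset_first_element_in_bytes
        TENSOR4D_T(dst, BUFFER))
    {
        // e.g. src_ptr, src_stride_y and dst_w are now ordinary kernel arguments
    }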
#if !defined(UNROLL_WITH_PRAGMA)
#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)
@@ -235,51 +355,128 @@
*
* @note Performs: c += dot(a, b)
*
- * @param[in] DST_DATA_TYPE Accumulator data type
- * @param[in] K0 Number of accumulations
- * @param[in] a OpenCL vector a
- * @param[in] b OpenCL vector b
- * @param[in] c Scalar variable c
+ * @param[in] A_DATA_TYPE A (lhs) data type
+ * @param[in] B_DATA_TYPE B (rhs) data type
+ * @param[in] C_DATA_TYPE C (accumulator) data type
+ * @param[in] K0 Number of accumulations
+ * @param[in] a OpenCL vector a
+ * @param[in] b OpenCL vector b
+ * @param[in] c Scalar variable c
*/
-#define DOT_PRODUCT_INTEGER8(DST_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(DST_DATA_TYPE, K0, a, b, c)
-#define DOT_PRODUCT_INTEGER8_STR(DST_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(DST_DATA_TYPE, a, b, c)
-#define DOT_PRODUCT1_INTEGER8(DST_DATA_TYPE, a, b, c) \
+#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
+#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
+#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
({ \
- c += (DST_DATA_TYPE)a * (DST_DATA_TYPE)b; \
+ c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b); \
})
-#define DOT_PRODUCT2_INTEGER8(DST_DATA_TYPE, a, b, c) \
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
+#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
+#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
+#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
+#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
+#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
+#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
+#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
+#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
+#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
+#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
({ \
- c += (DST_DATA_TYPE)a.s0 * (DST_DATA_TYPE)b.s0; \
- c += (DST_DATA_TYPE)a.s1 * (DST_DATA_TYPE)b.s1; \
+ c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
+ c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
})
-#define DOT_PRODUCT3_INTEGER8(DST_DATA_TYPE, a, b, c) \
+#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
({ \
- DOT_PRODUCT2_INTEGER8(DST_DATA_TYPE, a, b, c); \
- c += (DST_DATA_TYPE)a.s2 * (DST_DATA_TYPE)b.s2; \
+ DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c); \
+ c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
})
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#define DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, x, y, val) val = arm_dot_acc((x), (y), (val));
-#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#define DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, x, y, val) val += arm_dot((x), (y));
-#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#define DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, x, y, val) \
+#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val) \
({ \
- val += (DST_DATA_TYPE)x.s0 * (DST_DATA_TYPE)y.s0; \
- val += (DST_DATA_TYPE)x.s1 * (DST_DATA_TYPE)y.s1; \
- val += (DST_DATA_TYPE)x.s2 * (DST_DATA_TYPE)y.s2; \
- val += (DST_DATA_TYPE)x.s3 * (DST_DATA_TYPE)y.s3; \
+ val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \
+ val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \
+ val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \
+ val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \
})
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#define DOT_PRODUCT8_INTEGER8(DST_DATA_TYPE, a, b, c) \
- ({ \
- DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, (a.lo), (b.lo), c); \
- DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, (a.hi), (b.hi), c); \
+#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
+ DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c); \
})
-#define DOT_PRODUCT16_INTEGER8(DST_DATA_TYPE, a, b, c) \
- ({ \
- DOT_PRODUCT8_INTEGER8(DST_DATA_TYPE, (a.lo), (b.lo), c); \
- DOT_PRODUCT8_INTEGER8(DST_DATA_TYPE, (a.hi), (b.hi), c); \
+#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
+ DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c); \
+ })
+#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
+ DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c); \
+ })
+#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
+ })
+#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c); \
+ })
+#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c); \
+ })
+#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c); \
+ })
+#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \
})
+#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \
+ DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sC), ((b).sC), c); \
+ })
+#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \
+ DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sCD), ((b).sCD), c); \
+ })
+#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \
+ DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sCDE), ((b).sCDE), c); \
+ })
+#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
+ })
+
+/** Integer 8-bit reduction (sum of vector lanes)
+ *
+ * @note Performs: c += sum(a), implemented as a dot product between a and a vector of ones
+ *
+ * @param[in] A_DATA_TYPE A (lhs) data type
+ * @param[in] B_DATA_TYPE B (rhs) data type
+ * @param[in] C_DATA_TYPE C (accumulator) data type
+ * @param[in] K0 Number of accumulations
+ * @param[in] a OpenCL vector a
+ * @param[in] c Scalar variable c
+ */
+#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
+#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)
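Illustrative usage (a sketch, not part of the patch): a 4-lane dot product and a 4-lane sum accumulated into int variables.
    char4 a = (char4)(1, 2, 3, 4);
    char4 b = (char4)(4, 3, 2, 1);
    int   c = 0;
    DOT_PRODUCT_INTEGER8(char, char, int, 4, a, b, c); // c += 1*4 + 2*3 + 3*2 + 4*1 = 20
    int   s = 0;
    REDUCE_INTEGER8(char, char, int, 4, a, s);         // s += 1 + 2 + 3 + 4 = 10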
/** Load a vector from global memory (tensor)
*
@@ -296,9 +493,28 @@
#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
VLOAD(WIDTH) \
- (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y)*STRIDE_Y))
+ (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
+/** Store a vector in global memory (tensor)
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] WIDTH Number of dst columns
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] Y Starting Y position
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[in] VALUES Values to store in memory
+ */
+#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
+#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
+#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
+ VSTORE(WIDTH) \
+ (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
+#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
+
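Illustrative usage (a sketch, not part of the patch; src, dst, x0 and y0 are hypothetical, with the tensors declared through the TENSOR*_T helpers above): copying four contiguous floats between two BUFFER tensors.
    float4 vals = V_LOAD(float, 4, BUFFER, src, x0, y0, src_stride_y);
    V_STORE(float, 4, BUFFER, dst, x0, y0, dst_stride_y, vals);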
/** Load a tile from global memory (tensor)
*
* @param[in] DATA_TYPE Data type
@@ -323,6 +539,100 @@
}) \
})
+/** Store a VECTOR variable (e.g. int4, int8, char2 etc.) to a specified column in the TILE object
+ *
+ * @param[in] VECTOR Vector variable to store
+ * @param[in, out] TILE Tile variable to store to
+ * @param[in] WIDTH Width of the vector variable, also height of the tile (e.g. 2 if char2)
+ * @param[in] COLUMN Column index of the tile
+ */
+#define COPY_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, WIDTH, COLUMN) COPY_VECTOR_TO_TILE_COLUMN_STR(VECTOR, TILE, WIDTH, COLUMN)
+#define COPY_VECTOR_TO_TILE_COLUMN_STR(VECTOR, TILE, WIDTH, COLUMN) COPY_##WIDTH##_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN)
+#define COPY_1_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN) \
+ ({ \
+ TILE[0].s[COLUMN] = VECTOR; \
+ })
+
+#define COPY_2_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN) \
+ ({ \
+ TILE[0].s[COLUMN] = VECTOR.s0; \
+ TILE[1].s[COLUMN] = VECTOR.s1; \
+ })
+
+#define COPY_3_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN) \
+ ({ \
+ TILE[0].s[COLUMN] = VECTOR.s0; \
+ TILE[1].s[COLUMN] = VECTOR.s1; \
+ TILE[2].s[COLUMN] = VECTOR.s2; \
+ })
+
+#define COPY_4_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN) \
+ ({ \
+ TILE[0].s[COLUMN] = VECTOR.s0; \
+ TILE[1].s[COLUMN] = VECTOR.s1; \
+ TILE[2].s[COLUMN] = VECTOR.s2; \
+ TILE[3].s[COLUMN] = VECTOR.s3; \
+ })
+
+#define COPY_8_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN) \
+ ({ \
+ TILE[0].s[COLUMN] = VECTOR.s0; \
+ TILE[1].s[COLUMN] = VECTOR.s1; \
+ TILE[2].s[COLUMN] = VECTOR.s2; \
+ TILE[3].s[COLUMN] = VECTOR.s3; \
+ TILE[4].s[COLUMN] = VECTOR.s4; \
+ TILE[5].s[COLUMN] = VECTOR.s5; \
+ TILE[6].s[COLUMN] = VECTOR.s6; \
+ TILE[7].s[COLUMN] = VECTOR.s7; \
+ })
+
+#define COPY_16_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN) \
+ ({ \
+ TILE[0].s[COLUMN] = VECTOR.s0; \
+ TILE[1].s[COLUMN] = VECTOR.s1; \
+ TILE[2].s[COLUMN] = VECTOR.s2; \
+ TILE[3].s[COLUMN] = VECTOR.s3; \
+ TILE[4].s[COLUMN] = VECTOR.s4; \
+ TILE[5].s[COLUMN] = VECTOR.s5; \
+ TILE[6].s[COLUMN] = VECTOR.s6; \
+ TILE[7].s[COLUMN] = VECTOR.s7; \
+ TILE[8].s[COLUMN] = VECTOR.s8; \
+ TILE[9].s[COLUMN] = VECTOR.s9; \
+ TILE[10].s[COLUMN] = VECTOR.sA; \
+ TILE[11].s[COLUMN] = VECTOR.sB; \
+ TILE[12].s[COLUMN] = VECTOR.sC; \
+ TILE[13].s[COLUMN] = VECTOR.sD; \
+ TILE[14].s[COLUMN] = VECTOR.sE; \
+ TILE[15].s[COLUMN] = VECTOR.sF; \
+ })
+
+/** Load SRC_HEIGHT x SRC_WIDTH elements from global memory (tensor), and store them in a SRC_WIDTH x SRC_HEIGHT tile
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] SRC_HEIGHT Number of source rows, or number of columns of the output tile
+ * @param[in] SRC_WIDTH Number of source columns, or number of tile rows
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] Y Starting Y position
+ * @param[in] YI_MULTIPLIER Parameter used to multiply the internal row increment (_i).
+ * It should be 1 in most cases, but it is useful when the rows to load are spaced by a multiple of STRIDE_Y
+ * (e.g. when loading the weights of a convolution layer).
+ * In this case the address calculation is performed as: (Y + _i * YI_MULTIPLIER) * STRIDE_Y
+ * @param[in] STRIDE_Y Stride Y (in bytes) used to load each row.
+ * @param[out] dst Output tile
+ */
+#define T_LOAD_TRANSPOSED(DATA_TYPE, SRC_HEIGHT, SRC_WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, SRC_HEIGHT, \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, SRC_WIDTH) \
+ tmp = V_LOAD(DATA_TYPE, SRC_WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
+ COPY_VECTOR_TO_TILE_COLUMN(tmp, dst, SRC_WIDTH, _i); \
+ }) \
+ })
+
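Illustrative usage (a sketch, not part of the patch; src, x0 and y0 are hypothetical): loading a 2x4 block and receiving it transposed as a 4x2 tile.
    TILE(float, 4, 2, w0);  // SRC_WIDTH rows x SRC_HEIGHT columns
    T_LOAD_TRANSPOSED(float, 2, 4, BUFFER, src, x0, y0, 1, src_stride_y, w0);
    // w0[r].s[c] now holds the source element at column (x0 + r), row (y0 + c)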
/** Load a tile from global memory (tensor) using an indirect Y index tile
*
* @param[in] DATA_TYPE Data type
@@ -344,6 +654,42 @@
}) \
})
+/** Load a tile from global memory (tensor) using an indirect Y index tile and conditionally use a different length for the load
+ *
+ * @note If WIDTH1_CONDITION is true, the load will use the WIDTH1 length
+ * @note The vectors are loaded in reverse order so the invalid rows are overwritten by the valid ones
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] HEIGHT Number of dst rows
+ * @param[in] WIDTH0 Load width to use if WIDTH1_CONDITION = false
+ * @param[in] WIDTH1 Load width to use if WIDTH1_CONDITION = true
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] STRIDE_Y Stride Y (in bytes) used to load each row.
+ * @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 load
+ * @param[out] dst Output tile
+ * @param[in] indirect_y Indirect Y index tile
+ */
+#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y) \
+ ({ \
+ if(WIDTH1_CONDITION) \
+ { \
+ LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+ { \
+ VLOAD_PARTIAL(WIDTH0, WIDTH1) \
+ (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
+ }) \
+ } \
+ else \
+ { \
+ LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+ { \
+ dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
+ }) \
+ } \
+ })
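Illustrative usage (a sketch, not part of the patch; src, x0 and src_w are hypothetical): gathering four rows through an indirect index tile, loading only 5 of the 8 lanes when the block crosses the right edge.
    TILE(int, 4, 1, indirect_y);
    TILE(float, 4, 8, in);
    LOOP_UNROLLING(int, _i, 0, 1, 4,
    {
        indirect_y[_i].s[0] = 2 * _i; // every other row
    })
    T_LOAD_INDIRECT_WIDTH_SELECT(float, 4, 8, 5, BUFFER, src, x0, src_stride_y, (x0 + 8 > (int)src_w), in, indirect_y);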
/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout
*
* @param[in] DATA_TYPE Data type
@@ -379,6 +725,53 @@
}) \
})
+/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout with dilation for the X and Y increments
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] TILE_HEIGHT Number of elements to load from Y (height) dimension
+ * @param[in] TILE_WIDTH Number of elements to load from X (width) dimension
+ * @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Only BUFFER is currently supported
+ * In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] B Starting batch index
+ * @param[in] Y Starting Y index
+ * @param[in] X Starting X index
+ * @param[in] C Starting C index
+ * @param[in] TENSOR_WIDTH Number of elements in the X (width) dimension of the tensor, used for the boundary check
+ * @param[in] TENSOR_HEIGHT Number of elements in the Y (height) dimension of the tensor, used for the boundary check
+ * @param[in] DILATION_X Dilation for the X increment
+ * @param[in] DILATION_Y Dilation for the Y increment
+ * @param[in] BOUNDARY_CHECK Boundary check flag. If true, it checks for any out-of-bound reads
+ * @param[out] dst Output tile
+ */
+#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
+ { \
+ LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
+ { \
+ int _src_y = (X) + _xk * (DILATION_X); \
+ int _src_z = ((Y) + _yk * (DILATION_Y)); \
+ int _src_w = (B); \
+ bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
+ if(!(BOUNDARY_CHECK)) \
+ { \
+ dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
+ (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
+ } \
+ else \
+ { \
+ if(_src_valid_y) \
+ { \
+ dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
+ (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
+ } \
+ } \
+ }) \
+ }) \
+ })
+
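Illustrative usage (a sketch, not part of the patch; src, b0, y0 and x0 are hypothetical): gathering a 3x3 window with dilation 2 into a 9-row tile of 4 channels, zero-filling first because out-of-bound taps are skipped rather than zeroed.
    TILE(half, 9, 4, in);
    LOOP_UNROLLING(int, _i, 0, 1, 9,
    {
        in[_i].v = 0;
    })
    T_LOAD_NHWC_WITH_DILATION(half, 3, 3, 4, BUFFER, src, b0, y0, x0, 0, src_w, src_h, 2, 2, true, in);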
/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout using indirect X and Y coordinates
*
* @param[in] DATA_TYPE Data type
@@ -391,8 +784,8 @@
* @param[in] Y Starting Y index
* @param[in] X Starting X index
* @param[in] C Starting C index
- * @param[in] TENSOR_HEIGHT Number of elements to load from Y (height) dimension
* @param[in] TENSOR_WIDTH Number of elements to load from X (width) dimension
+ * @param[in] TENSOR_HEIGHT Number of elements to load from Y (height) dimension
* @param[in] STRIDE_Y Stride Y (in bytes)
* @param[out] xi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate
* @param[out] yi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate
@@ -412,6 +805,79 @@
}) \
})
+/** Load a tile from global memory (tensor) using an indirect buffer for the Y coordinates
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] TILE_AREA Total number of elements to load: (number of elements along Y) * (number of elements along X)
+ * @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * When TENSOR_TYPE=IMAGE, the if condition for the out-of-bound check can be skipped
+ * In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] C Starting C index
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[in] yi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate.
+ * 16 is the maximum indirect buffer size.
+ * @param[out] dst Output tile
+ */
+#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
+#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
+#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
+ { \
+ if(yi[0].s[_i] >= 0) \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
+ } \
+ }) \
+ })
+
+#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
+ }) \
+ })
+
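Illustrative usage (a sketch, not part of the patch; src and c0 are hypothetical): a gather of 9 positions of 4 channels each, where negative entries in yi mark rows to skip (BUFFER path).
    TILE(int, 1, 9, yi);     // one row of indirect row indices (up to 16)
    TILE(float, 9, 4, in);
    // ... yi filled with row indices, negative for out-of-bound entries ...
    T_LOAD2D_INDIRECT(float, 9, 4, BUFFER, src, c0, src_stride_y, yi, in);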
+/** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] TILE_AREA Total number of elements to load: (number of elements along Y) * (number of elements along X)
+ * @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Only BUFFER is currently supported
+ * In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] B Starting batch index
+ * @param[in] Z Starting Z index
+ * @param[in] Y Starting Y index
+ * @param[in] X Starting X index
+ * @param[in] C Starting C index
+ * @param[in] TENSOR_WIDTH Number of elements to load from X (width) dimension
+ * @param[in] TENSOR_HEIGHT Number of elements to load from Y (height) dimension
+ * @param[in] TENSOR_DEPTH Number of elements to load from Z (depth) dimension
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[in] xi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate
+ * @param[in] yi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate
+ * @param[in] zi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Z coordinate
+ * @param[out] dst Output tile
+ */
+#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
+ { \
+ int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * (TENSOR_WIDTH * TENSOR_HEIGHT); \
+ _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH); \
+ int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT) \
+ && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH)); \
+ if(_src_valid_y != 0) \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
+ } \
+ }) \
+ })
+
/** Store a tile to global memory (tensor) using an indirect Y index tile and conditionally use a different length for the store
*
* @note If WIDTH1_CONDITION is true, the store will use the WIDTH1 length for the store
@@ -437,7 +903,7 @@
LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
{ \
VSTORE_PARTIAL(WIDTH0, WIDTH1) \
- (src[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
+ (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
}) \
} \
else \
@@ -445,7 +911,7 @@
LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
{ \
VSTORE(WIDTH0) \
- (src[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
+ (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
}) \
} \
})
@@ -479,40 +945,160 @@
dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \
}) \
}) \
- }); \
+ }) \
+ })
+
+/** 8-bit quantization with fixed-point scale
+ *
+ * @param[in] SRC_DATA_TYPE SRC data type
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] QUANTIZATION_TYPE Quantization type (PER_TENSOR or PER_CHANNEL)
+ * @param[in] M0 Number of src/dst rows
+ * @param[in] N0 Number of src/dst columns
+ * @param[in] DST_OFFSET Quantization offset used for both the per-tensor and per-channel quantization
+ * @param[in] DST_SHIFT Quantization shift for the per-tensor quantization
+ * @param[in] DST_MULTIPLIER Quantization multiplier for the per-tensor quantization
+ * @param[in] src Input tile
+ * @param[in] dst_multipliers Output multipliers tile for the per-channel quantization
+ * @param[in] dst_shifts Output shift tile for the per-channel quantization
+ * @param[out] dst Output tile
+ */
+#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
+#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
+
+/** 8-bit per-tensor quantization with fixed-point scale
+ *
+ * @param[in] SRC_DATA_TYPE SRC data type
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of src/dst rows
+ * @param[in] N0 Number of src/dst columns
+ * @param[in] DST_OFFSET Quantization offset
+ * @param[in] DST_SHIFT Quantization shift for the per-tensor quantization
+ * @param[in] DST_MULTIPLIER Quantization multiplier for the per-tensor quantization
+ * @param[in] src Input tile
+ * @param[in] dst_multipliers (unused)
+ * @param[in] dst_shifts (unused)
+ * @param[out] dst Output tile
+ */
+#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n0, 0, 1, N0, \
+ { \
+ SRC_DATA_TYPE _tmp = 0; \
+ SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
+ _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
+ SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
+ long a_64 = (long)(_src); \
+ long b_64 = (long)(DST_MULTIPLIER); \
+ long ab_64 = a_64 * b_64; \
+ long mask1 = 1 << 30; \
+ long mask2 = 1 - (1 << 30); \
+ long is_positive_or_zero = ab_64 >= 0; \
+ long nudge = select(mask2, mask1, is_positive_or_zero); \
+ SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
+ _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
+ if(DST_SHIFT >= 0) \
+ { \
+ long mask = ((((int)1) << DST_SHIFT) - (long)1); \
+ long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
+ _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
+ } \
+ _tmp += DST_OFFSET; \
+ dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
+ }) \
+ }) \
+ })
+
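For intuition, the body above follows gemmlowp-style requantization: a rounding-doubling high multiply by a Q0.31 fixed-point multiplier, then a rounding right shift. A scalar sketch of the same arithmetic (not part of the patch; the INT_MIN overflow guard from the macro is omitted for brevity):
    int requantize_scalar(int x, int multiplier, int shift, int offset)
    {
        if(shift < 0) x <<= -shift;                    // pre-scale when the effective scale is > 1
        long ab    = (long)x * (long)multiplier;
        long nudge = (ab >= 0) ? (1L << 30) : 1 - (1L << 30);
        int  high  = (int)((ab + nudge) / (1L << 31)); // rounding-doubling high multiply
        if(shift >= 0)                                 // rounding right shift (round half away from zero)
        {
            long mask      = (1L << shift) - 1;
            long threshold = (mask >> 1) + ((high < 0) ? 1 : 0);
            high           = ((high & mask) > threshold) ? (high >> shift) + 1 : (high >> shift);
        }
        return high + offset;                          // caller saturates to the 8-bit range
    }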
+/** 8-bit per-channel quantization with fixed-point scale
+ *
+ * @param[in] SRC_DATA_TYPE SRC data type
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of src/dst rows
+ * @param[in] N0 Number of src/dst columns
+ * @param[in] DST_OFFSET Quantization offset
+ * @param[in] DST_SHIFT (unused)
+ * @param[in] DST_MULTIPLIER (unused)
+ * @param[in] src Input tile
+ * @param[in] dst_multipliers Output multipliers tile for the per-channel quantization
+ * @param[in] dst_shifts Output shift tile for the per-channel quantization
+ * @param[out] dst Output tile
+ */
+#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n0, 0, 1, N0, \
+ { \
+ SRC_DATA_TYPE _tmp = 0; \
+ SRC_DATA_TYPE _tmp2 = 0; \
+ SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
+ SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
+ SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
+ _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
+ SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
+ long a_64 = (long)(_src); \
+ long b_64 = (long)(_dst_multiplier); \
+ long ab_64 = a_64 * b_64; \
+ long mask1 = 1 << 30; \
+ long mask2 = 1 - (1 << 30); \
+ long is_positive_or_zero = ab_64 >= 0; \
+ long nudge = select(mask2, mask1, is_positive_or_zero); \
+ SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
+ _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
+ long mask = ((((int)1) << _dst_shift) - (int)1); \
+ long threshold = (mask >> 1) + any(_tmp); \
+ _tmp2 = _tmp >> _dst_shift; \
+ _tmp2 += select(0, 1, (_tmp & mask) > threshold); \
+ _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \
+ _tmp += DST_OFFSET; \
+ dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
+ }) \
+ }) \
})
-/** Quantized the tile (ASYMMETRIC) with fixed-point scale
+/** Quantize the tile to 8 bits with a fixed-point scale (asymmetric quantization)
*
* @param[in] SRC_DATA_TYPE SRC data type
* @param[in] DST_DATA_TYPE DST data type
* @param[in] M0 Number of src/dst rows
* @param[in] N0 Number of src/dst columns
- * @param[in] DST_OFFSET Quantization offset
- * @param[in] DST_SHIFT Quantization shift
- * @param[in] DST_MULTIPLIER Quantization multiplier
+ * @param[in] DST_OFFSET Quantization offset used for both the per-tensor and per-channel quantization
+ * @param[in] DST_SHIFT Quantization shift for the per-tensor quantization
+ * @param[in] DST_MULTIPLIER Quantization multiplier for the per-tensor quantization
* @param[in] src Input tile
* @param[out] dst Output tile
*/
-#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \
- ({ \
- LOOP_UNROLLING(int, _m0, 0, 1, M0, \
- { \
- LOOP_UNROLLING(int, _n0, 0, 1, N0, \
- { \
- SRC_DATA_TYPE _tmp = 0; \
- if(DST_SHIFT < 0) \
- { \
- _tmp = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(src[_m0].s[_n0], DST_MULTIPLIER, DST_SHIFT, 1); \
- } \
- else \
- { \
- _tmp = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(src[_m0].s[_n0], DST_MULTIPLIER, DST_SHIFT, 1); \
- } \
- _tmp += DST_OFFSET; \
- dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
- }) \
- }) \
+#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n0, 0, 1, N0, \
+ { \
+ SRC_DATA_TYPE _tmp = 0; \
+ SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
+ _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
+ SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
+ long a_64 = (long)(_src); \
+ long b_64 = (long)(DST_MULTIPLIER); \
+ long ab_64 = a_64 * b_64; \
+ long mask1 = 1 << 30; \
+ long mask2 = 1 - (1 << 30); \
+ long is_positive_or_zero = ab_64 >= 0; \
+ long nudge = select(mask2, mask1, is_positive_or_zero); \
+ SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
+ _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
+ if(DST_SHIFT >= 0) \
+ { \
+ long mask = ((((int)1) << DST_SHIFT) - (int)1); \
+ long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
+ _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
+ } \
+ _tmp += DST_OFFSET; \
+ dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
+ }) \
+ }) \
})
/** Conditional rowset (memset by row)
@@ -537,7 +1123,7 @@
}) \
})
-/** Element-wise activation
+/** Element-wise activation for floating point types
*
* @note Performs: activation(LHS) = DST
*
@@ -558,6 +1144,68 @@
}) \
})
+
+// NOTE: A_VAL and B_VAL must be quantized with the same quantization info as x
+// RELU Activation
+#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_POINT, x))
+// Bounded RELU Activation
+#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_POINT, x)))
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+// Hard Swish Activation
+#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))
+// Identity Activation
+#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (x)
+
+#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x)
+#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x)
+
+#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
+#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
+#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
+#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))
+
+/** Element-wise activation for quantized types
+ *
+ * @note Performs: activation(LHS) = DST
+ *
+ * @param[in] DATA_TYPE SRC/DST data type
+ * @param[in] M0 Number of SRC/DST rows
+ * @param[in] N0 Number of SRC/DST columns
+ * @param[in] ACTIVATION_TYPE Activation type
+ * @param[in] ZERO_POINT The zero value to consider in the computation
+ * @param[in] A_VAL Quantized A value used for the activation (e.g. tanh_op, brelu, ...)
+ * @param[in] B_VAL Quantized B value used for the activation (e.g. tanh_op, brelu, ...)
+ * @param[in] src SRC tile
+ * @param[out] dst DST tile
+ */
+#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, src, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_POINT, A_VAL, B_VAL, src[_m0].v); \
+ }) \
+ })
+
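Illustrative usage (a sketch, not part of the patch; the zero point 128 and bound 200 are hypothetical quantized values): clamping a uchar tile in place with a bounded ReLU.
    TILE(uchar, 4, 8, t0);
    // ... t0 filled by a previous tile load ...
    T_ACTIVATION_QUANTIZED(uchar, 4, 8, brelu, 128, 200, 0, t0, t0); // t0 = clamp(t0, 128, 200)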
+/** Element-wise addition between two tiles
+ *
+ * @note Performs: LHS + RHS = DST
+ *
+ * @param[in] DATA_TYPE LHS/RHS/DST data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs Constant RHS tile
+ * @param[out] dst DST tile
+ */
+#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
+ }) \
+ })
+
/** Element-wise addition with a constant value
*
* @note Performs: LHS + constant = DST
@@ -573,30 +1221,125 @@
({ \
LOOP_UNROLLING(int, _m0, 0, 1, M0, \
{ \
- LOOP_UNROLLING(int, _n0, 0, 1, N0, \
- { \
- dst[_m0].s[_n0] = lhs[_m0].s[_n0] + rhs_constant; \
- }) \
+ dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant; \
+ }) \
+ })
+
+#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+/** Element-wise scale with a constant value
+ *
+ * @note Performs: LHS * constant = DST
+ *
+ * @param[in] DATA_TYPE LHS/RHS/DST data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs_constant Constant value
+ * @param[out] dst DST tile
+ */
+#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \
}) \
})
-/** Element-wise addition with RHS broadcasted (RHS has the X dimension only)
+/** Element-wise operation with RHS broadcasted (RHS has the X dimension only)
*
- * @note Performs: LHS + RHS[broadcasted] = DST
+ * @note Performs: LHS OP RHS[broadcasted] = DST
* @note Both tiles must have same data type
*
- * @param[in] DATA_TYPE LHS/RHS/DST data type
- * @param[in] M0 Number of LHS rows
- * @param[in] N0 Number of LHS columns
- * @param[in] lhs LHS tile
- * @param[in] rhs RHS tile
- * @param[out] dst DST tile
+ * @param[in] T_ELWISE_OP Elementwise operator to perform
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs RHS tile
+ * @param[out] dst DST tile
*/
-#define T_ADD_BROADCAST_X(DATA_TYPE, M0, N0, lhs, rhs, dst) \
+#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
({ \
LOOP_UNROLLING(int, _m0, 0, 1, M0, \
{ \
- dst[_m0].v = lhs[_m0].v + rhs[0].v; \
+ dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
+ }) \
+ })
+
+/** Element-wise operation with LHS broadcasted (LHS has the X dimension only)
+ *
+ * @note Performs: LHS[broadcasted] OP RHS = DST
+ * @note Both tiles must have same data type
+ *
+ * @param[in] T_ELWISE_OP Elementwise operator to perform
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of RHS rows
+ * @param[in] N0 Number of RHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs RHS tile
+ * @param[out] dst DST tile
+ */
+#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
+ }) \
+ })
+
+#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+/** Element-wise operation between two tiles (LHS and RHS)
+ *
+ * @note Performs: LHS OP RHS = DST
+ * @note Both tiles must have same data type
+ *
+ * @param[in] T_ELWISE_OP Elementwise operator to perform
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs RHS tile
+ * @param[out] dst DST tile
+ */
+#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
+ }) \
+ })
+
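Illustrative usage (a sketch, not part of the patch; all tile names are hypothetical): adding two 4x8 float tiles, then broadcasting a single-row bias across every row.
    TILE(float, 4, 8, a0);
    TILE(float, 4, 8, b0);
    TILE(float, 4, 8, c0);
    TILE(float, 1, 8, bias);
    // ... a0, b0 and bias filled by previous tile loads ...
    T_ELTWISE_ADD(float, 4, 8, a0, b0, c0);                // c0[m].v = a0[m].v + b0[m].v
    T_ELTWISE_BROADCAST_ADD_X(float, 4, 8, c0, bias, c0);  // c0[m].v += bias[0].v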
+/** Floor operation on a tile
+ *
+ * @note Performs: floor(SRC) = DST
+ * @note Both tiles must have same data type
+ *
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of SRC rows
+ * @param[in] N0 Number of SRC columns
+ * @param[in] src LHS tile
+ * @param[out] dst DST tile
+ */
+#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
}) \
})
@@ -615,15 +1358,72 @@
* @param[in] lhs LHS tile
* @param[in] rhs RHS tile
* @param[in, out] dst DST tile
+ *
+ * @note For Int8/UInt8 multiplications, only T_MMUL_NT_T is provided because the rows of
+ * the Lhs and Rhs tensors must be multiplied together to make use of the dot product extension.
+ * Adding the other layouts would require on-the-fly transposition of the tile elements
+ * and is therefore not favored.
*/
#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
-#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
-#define T_MMUL_NT_T_float_float_float(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
-#define T_MMUL_NT_T_half_half_half(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
-#define T_MMUL_NT_T_char_char_int(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
-#define T_MMUL_NT_T_uchar_uchar_uint(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
-#define T_MMUL_NT_T_uchar_uchar_int(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
-#define T_MMUL_NT_T_FLOAT(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+ { \
+ LOOP_UNROLLING(int, _m, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n, 0, 1, N0, \
+ { \
+ LOOP_UNROLLING(int, _k, 0, 1, K0, \
+ { \
+ dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
+ }) \
+ }) \
+ }) \
+ }
+
+#define T_MMUL_NT_NT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+ { \
+ LOOP_UNROLLING(int, _m, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _k, 0, 1, K0, \
+ { \
+ dst[_m].v = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (rhs[_k].v), dst[_m].v); \
+ }) \
+ }) \
+ }
+
+#define T_MMUL_T_NT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_NT_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_NT_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_NT_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_NT_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+ { \
+ LOOP_UNROLLING(int, _m, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n, 0, 1, N0, \
+ { \
+ LOOP_UNROLLING(int, _k, 0, 1, K0, \
+ { \
+ dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_k].s[_m]), (DST_DATA_TYPE)(rhs[_k].s[_n]), dst[_m].s[_n]); \
+ }) \
+ }) \
+ }) \
+ }
+
+#define T_MMUL_T_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
{ \
LOOP_UNROLLING(int, _m, 0, 1, M0, \
{ \
@@ -631,21 +1431,21 @@
{ \
LOOP_UNROLLING(int, _k, 0, 1, K0, \
{ \
- dst[_m].s[_n] = fma((lhs[_m].s[_k]), (rhs[_n].s[_k]), dst[_m].s[_n]); \
+ dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_k].s[_m]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
}) \
}) \
}) \
}
-#define T_MMUL_NT_T_INTEGER8(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
- ({ \
- LOOP_UNROLLING(int, _m, 0, 1, M0, \
- { \
- LOOP_UNROLLING(int, _n, 0, 1, N0, \
- { \
- DOT_PRODUCT_INTEGER8(DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
- }) \
- }) \
- })
-
-// clang-format on
-// *INDENT-ON* \ No newline at end of file
+
+#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n, 0, 1, N0, \
+ { \
+ DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
+ }) \
+ }) \
+ })
+
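Illustrative usage (a sketch, not part of the patch; tile names are hypothetical): accumulating a 4x8 fp32 block over K0 = 4 with a non-transposed lhs and a transposed rhs.
    TILE(float, 4, 4, lhs);  // M0 x K0
    TILE(float, 8, 4, rhs);  // N0 x K0 (each rhs row holds K0 values)
    TILE(float, 4, 8, acc);  // M0 x N0, assumed zero-initialised
    // ... lhs and rhs filled by previous tile loads ...
    T_MMUL(float, float, float, 4, 8, 4, NT, T, lhs, rhs, acc);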
+#endif /* ACL_SRC_CORE_CL_CL_KERNELS_TILE_HELPERS */
diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/transpose.cl
deleted file mode 100644
index 82db2908b5..0000000000
--- a/src/core/CL/cl_kernels/transpose.cl
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#define PARTIAL_STORE_M0 VEC_SIZE_LEFTOVER_X
-#define PARTIAL_STORE_N0 VEC_SIZE_LEFTOVER_Y
-
-#include "helpers.h"
-#include "repeat.h"
-
-#if defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y)
-
-#if VEC_SIZE_X == 1
-#if VEC_SIZE_Y == 1
-#define TRANSPOSED_U(val) \
- { \
- u0 \
- }
-#elif VEC_SIZE_Y == 2
-#define TRANSPOSED_U(val) \
- { \
- u0, u1 \
- }
-#elif VEC_SIZE_Y == 3
-#define TRANSPOSED_U(val) \
- { \
- u0, u1, u2 \
- }
-#elif VEC_SIZE_Y == 4
-#define TRANSPOSED_U(val) \
- { \
- u0, u1, u2, u3 \
- }
-#elif VEC_SIZE_Y == 8
-#define TRANSPOSED_U(val) \
- { \
- u0, u1, u2, u3, u4, u5, u6, u7 \
- }
-#elif VEC_SIZE_Y == 16
-#define TRANSPOSED_U(val) \
- { \
- u0, u1, u2, u3, u4, u5, u6, u7, \
- u8, u9, u10, u11, u12, u13, u14, u15 \
- }
-#endif /* switch VEC_SIZE_Y */
-#else // VEC_SIZE_X == 1
-#if VEC_SIZE_Y == 1
-#define TRANSPOSED_U(val) \
- { \
- u0.val \
- }
-#elif VEC_SIZE_Y == 2
-#define TRANSPOSED_U(val) \
- { \
- u0.val, u1.val \
- }
-#elif VEC_SIZE_Y == 3
-#define TRANSPOSED_U(val) \
- { \
- u0.val, u1.val, u2.val \
- }
-#elif VEC_SIZE_Y == 4
-#define TRANSPOSED_U(val) \
- { \
- u0.val, u1.val, u2.val, u3.val \
- }
-#elif VEC_SIZE_Y == 8
-#define TRANSPOSED_U(val) \
- { \
- u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val \
- }
-#elif VEC_SIZE_Y == 16
-#define TRANSPOSED_U(val) \
- { \
- u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val, \
- u8.val, u9.val, u10.val, u11.val, u12.val, u13.val, u14.val, u15.val \
- }
-#endif /* switch VEC_SIZE_Y */
-#endif // VEC_SIZE_X == 1
-
-#if DATA_TYPE_IN_BYTES == 4
-#define DATA_TYPE uint
-#elif DATA_TYPE_IN_BYTES == 2
-#define DATA_TYPE ushort
-#elif DATA_TYPE_IN_BYTES == 1
-#define DATA_TYPE uchar
-#else /* switch DATA_TYPE_IN_BYTES */
-#error DATA_TYPE_IN_BYTES not supported for transpose
-#endif /* switch DATA_TYPE_IN_BYTES */
-
-/** This OpenCL kernel computes the matrix transposition of the input matrix
- *
- * @note The number of bytes of the data type needs to be passed at compile time using -DDATA_TYPE_IN_BYTES. DATA_TYPE_IN_BYTES can be:
- * -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices
- * -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices
- * -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices
- * -# -DVEC_SIZE_X is the number of elements processed in X dimension
- * -# -DVEC_SIZE_LEFTOVER_X is the leftover size in the X dimension; x_dimension % VEC_SIZE_X
- * -# -DVEC_SIZE_Y is the number of elements processed in Y dimension
- * -# -DVEC_SIZE_LEFTOVER_Y is the leftover size in the Y dimension; y_dimension % VEC_SIZE_Y
- *
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void transpose(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
- uint y_offs = max((int)(get_global_id(1) * VEC_SIZE_Y - (VEC_SIZE_Y - VEC_SIZE_LEFTOVER_Y) % VEC_SIZE_Y), 0);
-
- // Compute addresses
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y;
-
- // Load the NxM block at (x, y)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u0 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)src_addr);
-#if VEC_SIZE_Y > 1
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u1 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + src_stride_y));
-#endif /* VEC_SIZE_Y > 1 */
-#if VEC_SIZE_Y > 2
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u2 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-#endif /* VEC_SIZE_Y > 2 */
-#if VEC_SIZE_Y > 3
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u3 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-#endif /* VEC_SIZE_Y > 3 */
-#if VEC_SIZE_Y > 4
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u4 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u5 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u6 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u7 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
-#endif /* VEC_SIZE_Y > 4 */
-#if VEC_SIZE_Y > 8
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u8 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u9 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 9 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u10 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 10 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u11 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 11 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u12 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 12 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u13 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 13 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u14 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 14 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u15 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 15 * src_stride_y));
-#endif /* VEC_SIZE_Y > 8 */
-
- // Create transposed vectors
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t0 = TRANSPOSED_U(s0);
-#if VEC_SIZE_X > 1
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t1 = TRANSPOSED_U(s1);
-#endif /* VEC_SIZE_X > 1 */
-#if VEC_SIZE_X > 2
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t2 = TRANSPOSED_U(s2);
-#endif /* VEC_SIZE_X > 2 */
-#if VEC_SIZE_X > 3
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t3 = TRANSPOSED_U(s3);
-#endif /* VEC_SIZE_X > 3 */
-#if VEC_SIZE_X > 4
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t4 = TRANSPOSED_U(s4);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t5 = TRANSPOSED_U(s5);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t6 = TRANSPOSED_U(s6);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t7 = TRANSPOSED_U(s7);
-#endif /* VEC_SIZE_X > 4 */
-#if VEC_SIZE_X > 8
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t8 = TRANSPOSED_U(s8);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t9 = TRANSPOSED_U(s9);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- tA = TRANSPOSED_U(sA);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- tB = TRANSPOSED_U(sB);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- tC = TRANSPOSED_U(sC);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- tD = TRANSPOSED_U(sD);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- tE = TRANSPOSED_U(sE);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- tF = TRANSPOSED_U(sF);
-#endif /* VEC_SIZE_X > 8 */
-
- // Store the block at (y, x)
- REPEAT_VAR_INIT_TO_CONST(VEC_SIZE_X, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
- STORE_BLOCK_BOUNDARY_AWARE(VEC_SIZE_X, VEC_SIZE_Y, DATA_TYPE, t, (__global uchar *)dst_addr, dst_stride_y, zout, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_Y, VEC_SIZE_LEFTOVER_X != 0
- && get_global_id(0) == 0,
- VEC_SIZE_LEFTOVER_Y != 0 && get_global_id(1) == 0);
-}
-
-#endif // defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y)
\ No newline at end of file
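The kernel deleted above loads a VEC_SIZE_X x VEC_SIZE_Y block, regroups it column-wise through TRANSPOSED_U, and stores it at the swapped (y, x) coordinates. Stripped of the vectorisation and the boundary handling, the mapping is simply dst(y, x) = src(x, y); a scalar C sketch with element strides instead of byte strides (hypothetical helper):

/* Transpose a height x width matrix; dst is width x height. */
static void transpose_ref(const float *src, float *dst, int width, int height)
{
    for (int y = 0; y < height; ++y)
        for (int x = 0; x < width; ++x)
            dst[x * height + y] = src[y * width + x];
}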
diff --git a/src/core/CL/cl_kernels/unpooling_layer.cl b/src/core/CL/cl_kernels/unpooling_layer.cl
deleted file mode 100644
index 457e9bf8f1..0000000000
--- a/src/core/CL/cl_kernels/unpooling_layer.cl
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-/** Performs max unpooling with a pool size equal to 2.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32
- * @note The width of the output tensor must be passed using -DWIDTH_DST e.g. -DWIDTH_DST=24
- * @note The height of the output tensor must be passed using -DHEIGHT_DST e.g. -DHEIGHT_DST=54
- * @note The depth of the output tensor must be passed using -DDEPTH_DST e.g. -DDEPTH_DST=32
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the output tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
- * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
- * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
- * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
- * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
- * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
- */
-__kernel void max_unpooling_layer_2(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- TENSOR3D_DECLARATION(indices))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(output);
- Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
-
- unsigned int index = *((__global unsigned int *)indices.ptr);
- DATA_TYPE value = *((__global DATA_TYPE *)input.ptr);
-
- *((__global DATA_TYPE *)tensor3D_index2ptr(&output, WIDTH_DST, HEIGHT_DST, DEPTH_DST, index)) = value;
-}
\ No newline at end of file
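The kernel deleted above is a scatter: each pooled value is written back to the location recorded during max pooling, with tensor3D_index2ptr decoding the stored linear index into a WIDTH_DST x HEIGHT_DST x DEPTH_DST volume. A scalar sketch of the same operation, assuming a zero-initialised output and linear indices (hypothetical helper):

static void max_unpool_ref(const float *input, const unsigned int *indices,
                           float *output, int n_in, int n_out)
{
    for (int i = 0; i < n_out; ++i)
        output[i] = 0.0f;              /* non-maxima positions stay zero */
    for (int i = 0; i < n_in; ++i)
        output[indices[i]] = input[i]; /* scatter each value to its recorded index */
}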
diff --git a/src/core/CL/cl_kernels/upsample_layer.cl b/src/core/CL/cl_kernels/upsample_layer.cl
deleted file mode 100644
index d0cc0f24b7..0000000000
--- a/src/core/CL/cl_kernels/upsample_layer.cl
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function applies upsampling to an input image. (NCHW)
- *
- * @attention The following variables must be passed at compile time:
- * -# -DDATA_TYPE = Tensor data type. Supported data types: All
- * -# -DVEC_SIZE_IN = Input vector size
- * -# -DVEC_SIZE_OUT = Output vector size
- * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to access it may need to step back to stay in bounds)
- * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to access it may need to step back to stay in bounds)
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: All
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void upsample_layer_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
- // Check if the access on the width gets out of bounds
- // If it does, shift the access vector so that it accesses elements within bounds
- const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN);
- const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
- src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
- dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data = vload8(0, (__global DATA_TYPE *)src.ptr);
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);
-
- vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
- vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
-#else // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
- *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
- *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
-#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
-}
-
-/** This function applies upsampling to an input image. (NHWC)
- *
- * @attention The following variables must be passed at compile time:
- * -# -DDATA_TYPE = Tensor data type. Supported data types: All
- * -# -DVEC_SIZE_IN = Input vector size
- * -# -DVEC_SIZE_OUT = Output vector size
- * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to access it may need to step back to stay in bounds)
- * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to access it may need to step back to stay in bounds)
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: All
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void upsample_layer_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
- // Check if the access on the width gets out of bounds
- // If it does, shift the access vector so that it accesses elements within bounds
- const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN);
- const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
- src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
- dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- data = vload16(0, (__global DATA_TYPE *)src.ptr);
-
- vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0));
- vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
- vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1));
- vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1));
-#else // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
- *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
- *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
- *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)) = *((__global DATA_TYPE *)src.ptr);
- *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)) = *((__global DATA_TYPE *)src.ptr);
-#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
-}
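Both kernels deleted above perform a 2x nearest-neighbour upsample: the NCHW path duplicates each element along X inside one row store and along Y via a second vstore16, while the NHWC path copies a whole pixel vector into its 2x2 output neighbourhood. A scalar sketch of the element mapping, assuming element strides (hypothetical helper):

/* 2x nearest-neighbour upsample: src is height x width, dst is 2*height x 2*width. */
static void upsample2x_ref(const float *src, float *dst, int width, int height)
{
    for (int y = 0; y < height; ++y)
        for (int x = 0; x < width; ++x)
        {
            const float v = src[y * width + x];
            float      *d = &dst[(2 * y) * (2 * width) + (2 * x)];
            d[0]             = v; /* top-left     */
            d[1]             = v; /* top-right    */
            d[2 * width]     = v; /* bottom-left  */
            d[2 * width + 1] = v; /* bottom-right */
        }
}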
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
index 005861ddfa..6595bd1981 100644
--- a/src/core/CL/cl_kernels/warp_helpers.h
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -31,11 +31,13 @@
* @param[in] border_size Border size of the image
*
*/
-inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
+inline const float8
+clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
{
const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
- return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+ return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3,
+ clamped_y.s3);
}
/** Clamps the given coordinates to the borders.
@@ -63,12 +65,6 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int
*((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
}
-/** Returns the current thread coordinates. */
-inline const float2 get_current_coords()
-{
- return (float2)(get_global_id(0) * 4, get_global_id(1));
-}
-
/** Given a texel coordinates this function will return the following array of coordinates:
* [ P, right neighbour, below neighbour, below right neighbour ]
*
@@ -80,7 +76,8 @@ inline const float2 get_current_coords()
*/
inline const float8 get_neighbour_coords(const float2 coord)
{
- return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+ return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1,
+ /*br*/ coord.s0 + 1, coord.s1 + 1);
}
/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
@@ -91,37 +88,38 @@ inline const float8 get_neighbour_coords(const float2 coord)
* @param[in] height Height of the image
* @param[in] border_size Border size
*/
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size)
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(
+ const Image *in, const float8 coords, const float width, const float height, const float border_size)
{
    // If any of the 4 texels is outside the image's boundaries, the border value (REPLICATE or CONSTANT) is used for that texel.
// Sets the 4x4 coordinates for each of the four input texels
const float8 fc = floor(coords);
- const float16 c1 = (float16)(
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
- const float16 c2 = (float16)(
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
+ const float16 c1 =
+ (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
+ clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
+ const float16 c2 =
+ (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
+ clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
// Loads the values from the input image
const float16 t = (float16)(
- /* tl, tr, bl, br */
- * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
- *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
- *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
- *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
- *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
- *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
- *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
- *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
- const float8 a = coords - fc;
- const float8 b = ((float8)(1.f)) - a;
- const float4 fr = (float4)(
- ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
- ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
- ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
- ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+ /* tl, tr, bl, br */
+ *((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+ *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+ *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+ *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+ *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+ *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+ *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+ *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+ const float8 a = coords - fc;
+ const float8 b = ((float8)(1.f)) - a;
+ const float4 fr =
+ (float4)(((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+ ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+ ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+ ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
}
@@ -132,7 +130,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const
* @param[in] width Width of the image
* @param[in] height Height of the image
*/
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+inline const VEC_DATA_TYPE(DATA_TYPE, 4)
+ bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
{
return bilinear_interpolate_with_border(in, coords, width, height, 1);
}
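The reflowed lines above leave the interpolation math untouched: with a = coords - floor(coords) and b = 1 - a, each lane of fr is the standard bilinear blend of its four clamped texels. A scalar restatement of that weighting (hypothetical helper):

/* Bilinear blend of the four neighbours of a point whose fractional
 * offsets inside the texel are (ax, ay). */
static float bilinear_ref(float tl, float tr, float bl, float br, float ax, float ay)
{
    const float bx = 1.0f - ax;
    const float by = 1.0f - ay;
    return tl * bx * by + tr * ax * by + bl * bx * ay + br * ax * ay;
}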
diff --git a/src/core/CL/cl_kernels/warp_helpers_quantized.h b/src/core/CL/cl_kernels/warp_helpers_quantized.h
deleted file mode 100644
index b10890aff0..0000000000
--- a/src/core/CL/cl_kernels/warp_helpers_quantized.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-/** Clamps the given coordinates to the borders according to the border size.
- *
- * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- * @param[in] border_size Border size of the image
- *
- */
-inline const float8 clamp_to_border_with_size_quantized(float8 coords, const float width, const float height, const float border_size)
-{
- const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
- const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
- return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
-}
-
-/** Clamps the given coordinates to the borders.
- *
- * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- *
- */
-inline const float8 clamp_to_border_quantized(float8 coords, const float width, const float height)
-{
- return clamp_to_border_with_size_quantized(coords, width, height, 1);
-}
-
-/** Given a texel coordinates this function will return the following array of coordinates:
- * [ P, right neighbour, below neighbour, below right neighbour ]
- *
- * @note No checks to see if the coordinates are out of the image are done here.
- *
- * @param[in] coord Input coordinates
- *
- * @return vector of 8 floats with the coordinates, even positions are x and odd y.
- */
-inline const float8 get_neighbour_coords_quantized(const float2 coord)
-{
- return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
-}
-
-/** Returns the current thread coordinates. */
-inline const float2 get_current_coords_quantized()
-{
- return (float2)(get_global_id(0) * 4, get_global_id(1));
-}
-
-/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
- *
- * @param[in] in Pointer to the source image.
- * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- * @param[in] border_size Border size
- * @param[in] scale Scale value
- * @param[in] offset_qasymm Offset value
- */
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border_quantized(const Image *in, const float8 coords, const float width, const float height, const float border_size,
- const float scale, const int offset_qasymm)
-{
- // If any of the 4 texels is outside the image's boundaries, the border value (REPLICATE or CONSTANT) is used for that texel.
-
- // Sets the 4x4 coordinates for each of the four input texels
- const float8 fc = floor(coords);
- const float16 c1 = (float16)(
- clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s0, fc.s1)), width, height, border_size),
- clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s2, fc.s3)), width, height, border_size));
- const float16 c2 = (float16)(
- clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s4, fc.s5)), width, height, border_size),
- clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s6, fc.s7)), width, height, border_size));
-
- // Loads the values from the input image
- const int16 t = (int16)(
- /* tl, tr, bl, br */
- * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
- *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
- *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
- *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
- *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
- *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
- *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
- *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
-
- const float16 inf32 = convert_float16(t - (int16)offset_qasymm) * (float16)scale;
-
- const float8 a = coords - fc;
- const float8 b = ((float8)(1.f)) - a;
- const float4 fr = (float4)(
- ((inf32.s0 * b.s0 * b.s1) + (inf32.s1 * a.s0 * b.s1) + (inf32.s2 * b.s0 * a.s1) + (inf32.s3 * a.s0 * a.s1)),
- ((inf32.s4 * b.s2 * b.s3) + (inf32.s5 * a.s2 * b.s3) + (inf32.s6 * b.s2 * a.s3) + (inf32.s7 * a.s2 * a.s3)),
- ((inf32.s8 * b.s4 * b.s5) + (inf32.s9 * a.s4 * b.s5) + (inf32.sa * b.s4 * a.s5) + (inf32.sb * a.s4 * a.s5)),
- ((inf32.sc * b.s6 * b.s7) + (inf32.sd * a.s6 * b.s7) + (inf32.se * b.s6 * a.s7) + (inf32.sf * a.s6 * a.s7)));
-
- const VEC_DATA_TYPE(DATA_TYPE, 4) res = CONVERT_SAT(convert_int4_sat_rtp(fr / scale) + offset_qasymm, VEC_DATA_TYPE(DATA_TYPE, 4));
-
- return res;
-}
-
-/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
- *
- * @param[in] in Pointer to the source image.
- * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- * @param[in] scale Scale value
- * @param[in] offset_qasymm Offset value
- */
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_quantized(const Image *in, const float8 coords, const float width, const float height, const float scale, const int offset_qasymm)
-{
- return bilinear_interpolate_with_border_quantized(in, coords, width, height, 1, scale, offset_qasymm);
-}
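The deleted helper above follows the usual dequantize-interpolate-requantize round trip: texels are shifted by offset_qasymm and scaled to float, blended exactly like the float helper, then mapped back with round-towards-positive-infinity (convert_int4_sat_rtp) and saturation. A scalar QASYMM8 sketch of that round trip, assuming a uchar value range (hypothetical helper):

#include <math.h>

static unsigned char bilinear_q8_ref(unsigned char tl, unsigned char tr,
                                     unsigned char bl, unsigned char br,
                                     float ax, float ay, float scale, int offset)
{
    const float bx = 1.0f - ax;
    const float by = 1.0f - ay;
    /* Dequantize each texel as (q - offset) * scale and blend in float. */
    const float f = ((tl - offset) * bx * by + (tr - offset) * ax * by +
                     (bl - offset) * bx * ay + (br - offset) * ax * ay) * scale;
    /* Requantize: ceil rounding (matches _rtp) plus saturation to [0, 255]. */
    int q = (int)ceilf(f / scale) + offset;
    if (q < 0)   q = 0;
    if (q > 255) q = 255;
    return (unsigned char)q;
}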
diff --git a/src/core/CL/cl_kernels/winograd_filter_transform.cl b/src/core/CL/cl_kernels/winograd_filter_transform.cl
deleted file mode 100644
index 5c3bb8aa9b..0000000000
--- a/src/core/CL/cl_kernels/winograd_filter_transform.cl
+++ /dev/null
@@ -1,1952 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(SRC_DIM_Z)
-
-#define OUTPUT_ROW_2x2_7x7(out, tmp) \
- ({ \
- out.s0 = -tmp.s0 / 36.f; \
- out.s1 = (tmp.s0 - tmp.s1 + tmp.s2 - tmp.s3 + tmp.s4 - tmp.s5 + tmp.s6) / 48.f; \
- out.s2 = (tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3 + tmp.s4 + tmp.s5 + tmp.s6) / 48.f; \
- out.s3 = (-tmp.s0 + 2.f * tmp.s1 - 4.f * tmp.s2 + 8.f * tmp.s3 - 16.f * tmp.s4 + 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \
- out.s4 = (-tmp.s0 - 2.f * tmp.s1 - 4.f * tmp.s2 - 8.f * tmp.s3 - 16.f * tmp.s4 - 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \
- out.s5 = (tmp.s0 - 3.f * tmp.s1 + 9.f * tmp.s2 - 27.f * tmp.s3 + 81.f * tmp.s4 - 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
- out.s6 = (tmp.s0 + 3.f * tmp.s1 + 9.f * tmp.s2 + 27.f * tmp.s3 + 81.f * tmp.s4 + 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
- out.s7 = tmp.s6; \
- })
-
-/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_2x2_3x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
- // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
-#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-
- // Row 0
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out0 = 0.0f;
- out0.s0 = (w0.s0);
- out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
- out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
- out0.s3 = (w0.s2);
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Row 1
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out1 = 0.0f;
- out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
- out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
- out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
- out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
-
- // Row 2
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out2 = 0.0f;
- out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
- out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
- out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
- out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
-
- // Row 3
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out3 = 0.0f;
- out3.s0 = (w2.s0);
- out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
- out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
- out3.s3 = (w2.s2);
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- int z = get_global_id(2);
- int x0 = z / SRC_DIM_Z; // idx filter
- int y0 = z % SRC_DIM_Z; // idx channel
-
- // Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
-
- // Store the values across the channels
- // 16 channels for 3x3 kernels
- // 4 channels for 3x1 or 1x3 kernels
- *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out1.s0;
- *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out1.s1;
- *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s2;
- *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s3;
- *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out2.s0;
- *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out2.s1;
- *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
- *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
- *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
- *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
- *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
- *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
-
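The out0..out3 rows computed above are the Winograd filter transform U = G g G^T for F(2x2, 3x3), with G = [1 0 0; 1/2 1/2 1/2; 1/2 -1/2 1/2; 0 0 1] (the 3x1/1x3 variants apply only one side of it, and the 4x4 kernel below is the same construction with a 6x3 G). A plain-C sketch of the full 3x3 case (hypothetical helper):

/* Winograd F(2x2, 3x3) filter transform: u = G * g * G^T for a 3x3 filter g. */
static void winograd_2x2_3x3_filter_ref(const float g[3][3], float u[4][4])
{
    static const float G[4][3] = {
        {1.0f, 0.0f, 0.0f},
        {0.5f, 0.5f, 0.5f},
        {0.5f, -0.5f, 0.5f},
        {0.0f, 0.0f, 1.0f},
    };
    float t[4][3]; /* t = G * g */
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 3; ++j)
        {
            t[i][j] = 0.0f;
            for (int k = 0; k < 3; ++k)
                t[i][j] += G[i][k] * g[k][j];
        }
    for (int i = 0; i < 4; ++i) /* u = t * G^T */
        for (int j = 0; j < 4; ++j)
        {
            u[i][j] = 0.0f;
            for (int k = 0; k < 3; ++k)
                u[i][j] += t[i][k] * G[j][k];
        }
}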
-/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x4_3x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
- // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
-#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-
- // Row 0
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0 = 0.0f;
- out0.s0 = (w0.s0) / 16.f;
- out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
- out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
- out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
- out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
- out0.s5 = (w0.s2) / 4.f;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Row 1
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out1 = 0.0f;
- out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
- out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
- out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
- out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
- out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
- out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
-
- // Row 2
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out2 = 0.0f;
- out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
- out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
- out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
- out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
- out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
- out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
-
- // Row 3
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out3 = 0.0f;
- out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
- out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
-
- // Row 4
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out4 = 0.0f;
- out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
- out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
-
- // Row 5
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out5 = 0.0f;
- out5.s0 = (w2.s0) / 4.f;
- out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
- out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
- out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
- out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
- out5.s5 = (w2.s2);
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- int z = get_global_id(2);
- int x0 = z / SRC_DIM_Z; // idx filter
- int y0 = z % SRC_DIM_Z; // idx channel
-
- // Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
-
- // Store the values across the channels
- // 36 channels for 3x3 kernels
- // 6 channels for 3x1 or 1x3 kernels
- *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
- *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
- *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s0;
- *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s1;
- *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s2;
- *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s3;
- *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
- *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
- *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
- *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
- *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
- *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
- *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
- *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
- *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
- *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
- *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
- *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
- *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
- *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
- *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
- *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
- *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
- *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
- *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
- *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
- *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
- *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
- *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
- *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
- *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
- *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
-
-/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NHWC and the output tile is 4x4/4x1/1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x4_3x3_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
- const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
-
- // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
- DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
-#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 0 * src_stride_y));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 1 * src_stride_y));
- DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 2 * src_stride_y));
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
- DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
- DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
- DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
- DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
- DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- // Row 0
- DATA_TYPE out00, out01, out02, out03, out04, out05;
- out00 = (w00) / 16.f;
- out01 = (-w00 - w01 - w02) / 24.f;
- out02 = (-w00 + w01 - w02) / 24.f;
- out03 = (w00 + 2.f * w01 + 4.f * w02) / 96.f;
- out04 = (w00 - 2.f * w01 + 4.f * w02) / 96.f;
- out05 = (w02) / 4.f;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Row 1
- DATA_TYPE out10, out11, out12, out13, out14, out15;
- out10 = (-w00 - w10 - w20) / 24.f;
- out11 = (w00 + w10 + w20 + w01 + w11 + w21 + w02 + w12 + w22) / 36.f;
- out12 = (w00 + w10 + w20 - w01 - w11 - w21 + w02 + w12 + w22) / 36.f;
- out13 = (-w00 - w10 - w20 + 2.f * (-w01 - w11 - w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
- out14 = (-w00 - w10 - w20 + 2.f * (w01 + w11 + w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
- out15 = (-w02 - w12 - w22) / 6.f;
-
- // Row 2
- DATA_TYPE out20, out21, out22, out23, out24, out25;
- out20 = (-w00 + w10 - w20) / 24.f;
- out21 = (w00 - w10 + w20 + w01 - w11 + w21 + w02 - w12 + w22) / 36.f;
- out22 = (w00 - w10 + w20 - w01 + w11 - w21 + w02 - w12 + w22) / 36.f;
- out23 = (-w00 + w10 - w20 + 2.f * (-w01 + w11 - w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
- out24 = (-w00 + w10 - w20 + 2.f * (w01 - w11 + w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
- out25 = (-w02 + w12 - w22) / 6.f;
-
- // Row 3
- DATA_TYPE out30, out31, out32, out33, out34, out35;
- out30 = (w00 + 2.f * w10 + 4.f * w20) / 96.f;
- out31 = (-w00 - 2.f * w10 - 4.f * w20 - w01 - 2.f * w11 - 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
- out32 = (-w00 - 2.f * w10 - 4.f * w20 + w01 + 2.f * w11 + 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
- out33 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (w01 + 2.f * w11 + 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
- out34 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (-w01 - 2.f * w11 - 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
- out35 = (w02 + 2.f * w12 + 4.f * w22) / 24.f;
-
- // Row 4
- DATA_TYPE out40, out41, out42, out43, out44, out45;
- out40 = (w00 - 2.f * w10 + 4.f * w20) / 96.f;
- out41 = (-w00 + 2.f * w10 - 4.f * w20 - w01 + 2.f * w11 - 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
- out42 = (-w00 + 2.f * w10 - 4.f * w20 + w01 - 2.f * w11 + 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
- out43 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (w01 - 2.f * w11 + 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
- out44 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (-w01 + 2.f * w11 - 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
- out45 = (w02 - 2.f * w12 + 4.f * w22) / 24.f;
-
- // Row 5
- DATA_TYPE out50, out51, out52, out53, out54, out55;
- out50 = (w20) / 4.f;
- out51 = (-w20 - w21 - w22) / 6.f;
- out52 = (-w20 + w21 - w22) / 6.f;
- out53 = (w20 + 2.f * w21 + 4.f * w22) / 24.f;
- out54 = (w20 - 2.f * w21 + 4.f * w22) / 24.f;
- out55 = (w22);
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- int x0 = get_global_id(2); // idx filter
- int y0 = get_global_id(0); // idx channel
-
- // Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
-
- // Store the values across the channels
- // 36 channels for 3x3 kernels
- // 6 channels for 3x1 or 1x3 kernels
- *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out00;
- *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out01;
- *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out02;
- *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out03;
- *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out04;
- *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out05;
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out10;
- *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out11;
- *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out12;
- *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out13;
- *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out14;
- *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out15;
- *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out20;
- *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out21;
- *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out22;
- *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out23;
- *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out24;
- *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out25;
- *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out30;
- *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out31;
- *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out32;
- *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out33;
- *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out34;
- *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out35;
- *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out40;
- *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out41;
- *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out42;
- *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out43;
- *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out44;
- *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out45;
- *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out50;
- *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out51;
- *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out52;
- *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out53;
- *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out54;
- *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out55;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
-
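-/* Output layout shared by the filter-transform kernels: the transformed
- * coefficients are scattered across the Z dimension of dst, with X indexing
- * the filter (OFM) and Y indexing the input channel (IFM). For the 3x3 case,
- * coefficient (i, j) of the 6x6 transformed tile of filter f and channel c is
- * therefore written at (illustrative address computation only):
- *
- *   dst_ptr + dst_offset_first_element_in_bytes
- *           + f * sizeof(DATA_TYPE)       // X: filter index
- *           + c * dst_stride_y            // Y: channel index
- *           + (i * 6 + j) * dst_stride_z  // Z: one plane per coefficient
- */
-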
-/** This OpenCL kernel performs Winograd filter transform 5x5/5x1/1x5 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                          dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x4_5x5_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
- // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
-#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
-#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
- DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
- DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-
- // Transform the input tile
-
- // Row 0
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0 = 0.0f;
- out0.s0 = w00.s0;
- out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
- out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
- out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
- out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
- out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
- out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
- out0.s7 = w01;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Row 1
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out1 = 0.0f;
- out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
- out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
- out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
- out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
- out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
- out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
- out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
- out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
-
- // Row 2
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out2 = 0.0f;
- out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
- out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
- out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
- out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
- out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
- out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
- out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
- out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
-
- // Row 3
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out3 = 0.0f;
- out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
- out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
- out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
- out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
- out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
- out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
- out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
- out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
-
- // Row 4
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out4 = 0.0f;
- out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
- out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
- out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
- out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
- out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
- out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
- out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
- out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
-
- // Row 5
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out5 = 0.0f;
- out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
- out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
- out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
- out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
- out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
- out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
- out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
- out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
-
- // Row 6
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out6 = 0.0f;
- out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
- out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
- out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
- out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
- out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
- out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
- out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
- out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
-
- // Row 7
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out7 = 0.0f;
- out7.s0 = w40.s0;
- out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
- out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
- out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
- out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
- out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
- out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
- out7.s7 = w41;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- int z = get_global_id(2);
- int x0 = z / SRC_DIM_Z; // idx filter
- int y0 = z % SRC_DIM_Z; // idx channel
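- // e.g. with -DSRC_DIM_Z=64, the work-item at z == 130 transforms filter x0 = 2, channel y0 = 2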
-
- // Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
-
- // Store the values across the channels
- *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
- *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
- *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
- *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
- *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
- *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
- *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
- *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
- *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
- *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
- *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
- *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
- *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
- *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
- *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
- *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
- *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
- *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
- *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
- *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
- *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
- *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
- *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
- *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
- *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
- *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
- *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
- *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
- *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
- *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
- *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
- *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
- *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
- *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
- *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
- *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
- *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
- *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
- *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
- *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
- *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
- *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
- *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
- *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
- *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
- *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
- *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
- *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
- *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
- *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
- *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
- *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
- *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
- *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
- *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
- *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
- *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
- *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
- *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
- *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
-
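-/* Like the 3x3 kernels, the 5x5 filter-transform kernels evaluate
- * W' = G * w * G^T, here for Winograd F(4x4, 5x5): each outN.sM above is one
- * entry of the 8x8 result W'. The 8x5 matrix G implied by the scalar
- * coefficients (written out for reference only):
- *
- *   G = [    1,       0,      0,      0,      0    ]
- *       [  -2/9,    -2/9,   -2/9,   -2/9,   -2/9   ]
- *       [  -2/9,     2/9,   -2/9,    2/9,   -2/9   ]
- *       [  1/90,    2/90,   4/90,   8/90,  16/90   ]
- *       [  1/90,   -2/90,   4/90,  -8/90,  16/90   ]
- *       [ 16/180,  8/180,  4/180,  2/180,  1/180   ]
- *       [ 16/180, -8/180,  4/180, -2/180,  1/180   ]
- *       [    0,       0,      0,      0,      1    ]
- *
- * e.g. the 1/8100 denominators above come from (1/90) * (1/90) and the
- * 1/32400 ones from (1/180) * (1/180).
- */
-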
-/** This OpenCL kernel performs Winograd filter transform 5x5/5x1/1x5 when the data layout is NHWC and the output tile is 4x4/4x1/1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                          dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x4_5x5_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
- const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
-
-#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Load the values from the input tensor
- DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
- DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
- DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
- DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
-#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Load the values from the input tensor
- DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
- DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
- DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
- DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
- DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
- DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
- DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
- DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
- DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
- DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
- DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
- DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
- DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
- DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
- DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
- DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
- DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
- DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
- DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
- DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
- DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- // Row 0
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0 = 0.0f;
- out0.s0 = w00;
- out0.s1 = -2.f * (w00 + w01 + w02 + w03 + w04) / 9.f;
- out0.s2 = -2.f * (w00 - w01 + w02 - w03 + w04) / 9.f;
- out0.s3 = (w00 + 2.f * w01 + 4.f * w02 + 8.f * w03 + 16.f * w04) / 90.f;
- out0.s4 = (w00 - 2.f * w01 + 4.f * w02 - 8.f * w03 + 16.f * w04) / 90.f;
- out0.s5 = (16.f * w00 + 8.f * w01 + 4.f * w02 + 2.f * w03 + w04) / 180.f;
- out0.s6 = (16.f * w00 - 8.f * w01 + 4.f * w02 - 2.f * w03 + w04) / 180.f;
- out0.s7 = w04;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Row 1
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out1 = 0.0f;
- out1.s0 = -2.f * (w00 + w10 + w20 + w30 + w40) / 9.f;
- out1.s1 = 4.f * ((w00 + w10 + w20 + w30 + w40) + (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) + (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
- out1.s2 = 4.f * ((w00 + w10 + w20 + w30 + w40) - (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) - (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
- out1.s3 = -((w00 + w10 + w20 + w30 + w40) + 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
- (w04 + w14 + w24 + w34 + w44)) / 405.f;
- out1.s4 = -((w00 + w10 + w20 + w30 + w40) - 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
- (w04 + w14 + w24 + w34 + w44)) / 405.f;
- out1.s5 = -(16.f * (w00 + w10 + w20 + w30 + w40) + 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 2.f * (w03 + w13 + w23 + w33 + w43) +
- (w04 + w14 + w24 + w34 + w44)) / 810.f;
- out1.s6 = -(16.f * (w00 + w10 + w20 + w30 + w40) - 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 2.f * (w03 + w13 + w23 + w33 + w43) +
- (w04 + w14 + w24 + w34 + w44)) / 810.f;
- out1.s7 = -2.f * (w04 + w14 + w24 + w34 + w44) / 9.f;
-
- // Row 2
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out2 = 0.0f;
- out2.s0 = -2.f * (w00 - w10 + w20 - w30 + w40) / 9.f;
- out2.s1 = 4.f * ((w00 - w10 + w20 - w30 + w40) + (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) + (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
- out2.s2 = 4.f * ((w00 - w10 + w20 - w30 + w40) - (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) - (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
- out2.s3 = -((w00 - w10 + w20 - w30 + w40) + 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
- (w04 - w14 + w24 - w34 + w44)) / 405.f;
- out2.s4 = -((w00 - w10 + w20 - w30 + w40) - 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
- (w04 - w14 + w24 - w34 + w44)) / 405.f;
- out2.s5 = -(16.f * (w00 - w10 + w20 - w30 + w40) + 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 2.f * (w03 - w13 + w23 - w33 + w43) +
- (w04 - w14 + w24 - w34 + w44)) / 810.f;
- out2.s6 = -(16.f * (w00 - w10 + w20 - w30 + w40) - 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 2.f * (w03 - w13 + w23 - w33 + w43) +
- (w04 - w14 + w24 - w34 + w44)) / 810.f;
- out2.s7 = -2.f * (w04 - w14 + w24 - w34 + w44) / 9.f;
-
- // Row 3
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out3 = 0.0f;
- out3.s0 = (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) / 90.f;
- out3.s1 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) +
- (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
- out3.s2 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) -
- (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
- out3.s3 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 8.f
- * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
- out3.s4 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 8.f
- * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
- out3.s5 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
- (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
- out3.s6 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
- (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
- out3.s7 = (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44) / 90.f;
-
- // Row 4
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out4 = 0.0f;
- out4.s0 = (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) / 90.f;
- out4.s1 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) +
- (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
- out4.s2 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) -
- (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
- out4.s3 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 8.f
- * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
- out4.s4 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 8.f
- * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
- out4.s5 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
- (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
- out4.s6 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
- (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
- out4.s7 = (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44) / 90.f;
-
- // Row 5
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out5 = 0.0f;
- out5.s0 = (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) / 180.f;
- out5.s1 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) +
- (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
- out5.s2 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) -
- (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
- out5.s3 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 8.f
- * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
- out5.s4 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 8.f
- * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
- out5.s5 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
- (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
- out5.s6 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
- (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
- out5.s7 = (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44) / 180.f;
-
- // Row 6
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out6 = 0.0f;
- out6.s0 = (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) / 180.f;
- out6.s1 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) +
- (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
- out6.s2 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) -
- (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
- out6.s3 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 8.f
- * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
- out6.s4 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 8.f
- * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
- out6.s5 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
- (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
- out6.s6 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
- (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
- out6.s7 = (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44) / 180.f;
-
- // Row 7
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out7 = 0.0f;
- out7.s0 = w40;
- out7.s1 = -2.f * (w40 + w41 + w42 + w43 + w44) / 9.f;
- out7.s2 = -2.f * (w40 - w41 + w42 - w43 + w44) / 9.f;
- out7.s3 = (w40 + 2.f * w41 + 4.f * w42 + 8.f * w43 + 16.f * w44) / 90.f;
- out7.s4 = (w40 - 2.f * w41 + 4.f * w42 - 8.f * w43 + 16.f * w44) / 90.f;
- out7.s5 = (16.f * w40 + 8.f * w41 + 4.f * w42 + 2.f * w43 + w44) / 180.f;
- out7.s6 = (16.f * w40 - 8.f * w41 + 4.f * w42 - 2.f * w43 + w44) / 180.f;
- out7.s7 = w44;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- int x0 = get_global_id(2); // idx filter
- int y0 = get_global_id(0); // idx channel
-
- // Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
-
- // Store the values across the channels
- *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
- *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
- *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
- *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
- *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
- *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
- *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
- *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
- *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
- *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
- *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
- *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
- *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
- *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
- *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
- *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
- *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
- *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
- *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
- *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
- *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
- *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
- *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
- *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
- *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
- *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
- *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
- *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
- *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
- *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
- *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
- *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
- *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
- *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
- *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
- *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
- *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
- *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
- *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
- *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
- *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
- *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
- *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
- *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
- *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
- *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
- *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
- *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
- *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
- *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
- *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
- *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
- *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
- *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
- *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
- *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
- *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
- *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
- *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
- *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
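-
-/* For the 5x5 kernels the transformed tile is 8x8, so the full transform
- * writes 64 z-planes per (filter, channel) pair (8 planes for the 5x1/1x5
- * variants); the addressing scheme is otherwise the same as in the 3x3 case,
- * with plane i * 8 + j holding coefficient (i, j).
- */
-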
-/** This OpenCL kernel performs Winograd filter transform 7x7/7x1/1x7 when the data layout is NHWC and the output tile is 2x2/2x1/1x2
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note If this kernel is used to perform Winograd filter transform 7x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x7, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_2x2_7x7_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
- const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
-
-#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Load the values from the input tensor
- DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
- DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
- DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
- DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
- DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
- DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
-#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Load the values from the input tensor
- DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
- DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
- DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
- DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
- DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
- DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
- DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
- DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
- DATA_TYPE w15 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 5 * src_stride_y));
- DATA_TYPE w16 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 6 * src_stride_y));
-
- DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
- DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
- DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
- DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
- DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
- DATA_TYPE w25 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 5 * src_stride_y));
- DATA_TYPE w26 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 6 * src_stride_y));
-
- DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
- DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
- DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
- DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
- DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
- DATA_TYPE w35 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 5 * src_stride_y));
- DATA_TYPE w36 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 6 * src_stride_y));
-
- DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
- DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
- DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
- DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
- DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
- DATA_TYPE w45 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 5 * src_stride_y));
- DATA_TYPE w46 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 6 * src_stride_y));
-
- DATA_TYPE w50 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 0 * src_stride_y));
- DATA_TYPE w51 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 1 * src_stride_y));
- DATA_TYPE w52 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 2 * src_stride_y));
- DATA_TYPE w53 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 3 * src_stride_y));
- DATA_TYPE w54 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 4 * src_stride_y));
- DATA_TYPE w55 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 5 * src_stride_y));
- DATA_TYPE w56 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 6 * src_stride_y));
-
- DATA_TYPE w60 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 0 * src_stride_y));
- DATA_TYPE w61 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 1 * src_stride_y));
- DATA_TYPE w62 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 2 * src_stride_y));
- DATA_TYPE w63 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 3 * src_stride_y));
- DATA_TYPE w64 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 4 * src_stride_y));
- DATA_TYPE w65 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 5 * src_stride_y));
- DATA_TYPE w66 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 6 * src_stride_y));
-
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- tmp = 0.0f;
-
- // Row 0
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0 = 0.0f;
-
- out0.s0 = -w00 / 36.0f;
- out0.s1 = (w00 - w01 + w02 - w03 + w04 - w05 + w06) / 48.f;
- out0.s2 = (w00 + w01 + w02 + w03 + w04 + w05 + w06) / 48.f;
- out0.s3 = (-w00 + 2.f * w01 - 4.f * w02 + 8.f * w03 - 16.f * w04 + 32.f * w05 - 64.f * w06) / 120.f;
- out0.s4 = (-w00 - 2.f * w01 - 4.f * w02 - 8.f * w03 - 16.f * w04 - 32.f * w05 - 64.f * w06) / 120.f;
- out0.s5 = (w00 - 3.f * w01 + 9.f * w02 - 27.f * w03 + 81.f * w04 - 243.f * w05 + 729.f * w06) / 720.f;
- out0.s6 = (w00 + 3.f * w01 + 9.f * w02 + 27.f * w03 + 81.f * w04 + 243.f * w05 + 729.f * w06) / 720.f;
- out0.s7 = w06;
-
- out0 /= (VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.f; // "(VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.f" is -36.f cast to the vector type, not a subtraction
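-
- // The components of out0 above are the 7-tap filter evaluated (up to a fixed
- // per-node scale) at the Winograd interpolation nodes: s0 -> 0, s1/s2 -> -1/+1,
- // s3/s4 -> -2/+2, s5/s6 -> -3/+3, and s7 -> the node at infinity (the leading tap).
- // The divisors 36/48/120/720 and the trailing division by -36 are scale factors
- // that the matching input and output transforms compensate for.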
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- // Row 1
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out1 = 0.0f;
-
- tmp.s0 = (w00 - w10 + w20 - w30 + w40 - w50 + w60) / 48.f;
- tmp.s1 = (w01 - w11 + w21 - w31 + w41 - w51 + w61) / 48.f;
- tmp.s2 = (w02 - w12 + w22 - w32 + w42 - w52 + w62) / 48.f;
- tmp.s3 = (w03 - w13 + w23 - w33 + w43 - w53 + w63) / 48.f;
- tmp.s4 = (w04 - w14 + w24 - w34 + w44 - w54 + w64) / 48.f;
- tmp.s5 = (w05 - w15 + w25 - w35 + w45 - w55 + w65) / 48.f;
- tmp.s6 = (w06 - w16 + w26 - w36 + w46 - w56 + w66) / 48.f;
-
- OUTPUT_ROW_2x2_7x7(out1, tmp);
-
- // Row 2
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out2 = 0.0f;
-
- tmp.s0 = (w00 + w10 + w20 + w30 + w40 + w50 + w60) / 48.f;
- tmp.s1 = (w01 + w11 + w21 + w31 + w41 + w51 + w61) / 48.f;
- tmp.s2 = (w02 + w12 + w22 + w32 + w42 + w52 + w62) / 48.f;
- tmp.s3 = (w03 + w13 + w23 + w33 + w43 + w53 + w63) / 48.f;
- tmp.s4 = (w04 + w14 + w24 + w34 + w44 + w54 + w64) / 48.f;
- tmp.s5 = (w05 + w15 + w25 + w35 + w45 + w55 + w65) / 48.f;
- tmp.s6 = (w06 + w16 + w26 + w36 + w46 + w56 + w66) / 48.f;
-
- OUTPUT_ROW_2x2_7x7(out2, tmp);
-
- // Row 3
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out3 = 0.0f;
-
- tmp.s0 = (-w00 + 2.f * w10 - 4.f * w20 + 8.f * w30 - 16.f * w40 + 32.f * w50 - 64.f * w60) / 120.f;
- tmp.s1 = (-w01 + 2.f * w11 - 4.f * w21 + 8.f * w31 - 16.f * w41 + 32.f * w51 - 64.f * w61) / 120.f;
- tmp.s2 = (-w02 + 2.f * w12 - 4.f * w22 + 8.f * w32 - 16.f * w42 + 32.f * w52 - 64.f * w62) / 120.f;
- tmp.s3 = (-w03 + 2.f * w13 - 4.f * w23 + 8.f * w33 - 16.f * w43 + 32.f * w53 - 64.f * w63) / 120.f;
- tmp.s4 = (-w04 + 2.f * w14 - 4.f * w24 + 8.f * w34 - 16.f * w44 + 32.f * w54 - 64.f * w64) / 120.f;
- tmp.s5 = (-w05 + 2.f * w15 - 4.f * w25 + 8.f * w35 - 16.f * w45 + 32.f * w55 - 64.f * w65) / 120.f;
- tmp.s6 = (-w06 + 2.f * w16 - 4.f * w26 + 8.f * w36 - 16.f * w46 + 32.f * w56 - 64.f * w66) / 120.f;
-
- OUTPUT_ROW_2x2_7x7(out3, tmp);
-
- // Row 4
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out4 = 0.0f;
-
- tmp.s0 = (-w00 - 2.f * w10 - 4.f * w20 - 8.f * w30 - 16.f * w40 - 32.f * w50 - 64.f * w60) / 120.f;
- tmp.s1 = (-w01 - 2.f * w11 - 4.f * w21 - 8.f * w31 - 16.f * w41 - 32.f * w51 - 64.f * w61) / 120.f;
- tmp.s2 = (-w02 - 2.f * w12 - 4.f * w22 - 8.f * w32 - 16.f * w42 - 32.f * w52 - 64.f * w62) / 120.f;
- tmp.s3 = (-w03 - 2.f * w13 - 4.f * w23 - 8.f * w33 - 16.f * w43 - 32.f * w53 - 64.f * w63) / 120.f;
- tmp.s4 = (-w04 - 2.f * w14 - 4.f * w24 - 8.f * w34 - 16.f * w44 - 32.f * w54 - 64.f * w64) / 120.f;
- tmp.s5 = (-w05 - 2.f * w15 - 4.f * w25 - 8.f * w35 - 16.f * w45 - 32.f * w55 - 64.f * w65) / 120.f;
- tmp.s6 = (-w06 - 2.f * w16 - 4.f * w26 - 8.f * w36 - 16.f * w46 - 32.f * w56 - 64.f * w66) / 120.f;
-
- OUTPUT_ROW_2x2_7x7(out4, tmp);
-
- // Row 5
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out5 = 0.0f;
-
- tmp.s0 = (w00 - 3.f * w10 + 9.f * w20 - 27.f * w30 + 81.f * w40 - 243.f * w50 + 729.f * w60) / 720.f;
- tmp.s1 = (w01 - 3.f * w11 + 9.f * w21 - 27.f * w31 + 81.f * w41 - 243.f * w51 + 729.f * w61) / 720.f;
- tmp.s2 = (w02 - 3.f * w12 + 9.f * w22 - 27.f * w32 + 81.f * w42 - 243.f * w52 + 729.f * w62) / 720.f;
- tmp.s3 = (w03 - 3.f * w13 + 9.f * w23 - 27.f * w33 + 81.f * w43 - 243.f * w53 + 729.f * w63) / 720.f;
- tmp.s4 = (w04 - 3.f * w14 + 9.f * w24 - 27.f * w34 + 81.f * w44 - 243.f * w54 + 729.f * w64) / 720.f;
- tmp.s5 = (w05 - 3.f * w15 + 9.f * w25 - 27.f * w35 + 81.f * w45 - 243.f * w55 + 729.f * w65) / 720.f;
- tmp.s6 = (w06 - 3.f * w16 + 9.f * w26 - 27.f * w36 + 81.f * w46 - 243.f * w56 + 729.f * w66) / 720.f;
-
- OUTPUT_ROW_2x2_7x7(out5, tmp);
-
- // Row 6
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out6 = 0.0f;
-
- tmp.s0 = (w00 + 3.f * w10 + 9.f * w20 + 27.f * w30 + 81.f * w40 + 243.f * w50 + 729.f * w60) / 720.f;
- tmp.s1 = (w01 + 3.f * w11 + 9.f * w21 + 27.f * w31 + 81.f * w41 + 243.f * w51 + 729.f * w61) / 720.f;
- tmp.s2 = (w02 + 3.f * w12 + 9.f * w22 + 27.f * w32 + 81.f * w42 + 243.f * w52 + 729.f * w62) / 720.f;
- tmp.s3 = (w03 + 3.f * w13 + 9.f * w23 + 27.f * w33 + 81.f * w43 + 243.f * w53 + 729.f * w63) / 720.f;
- tmp.s4 = (w04 + 3.f * w14 + 9.f * w24 + 27.f * w34 + 81.f * w44 + 243.f * w54 + 729.f * w64) / 720.f;
- tmp.s5 = (w05 + 3.f * w15 + 9.f * w25 + 27.f * w35 + 81.f * w45 + 243.f * w55 + 729.f * w65) / 720.f;
- tmp.s6 = (w06 + 3.f * w16 + 9.f * w26 + 27.f * w36 + 81.f * w46 + 243.f * w56 + 729.f * w66) / 720.f;
-
- OUTPUT_ROW_2x2_7x7(out6, tmp);
-
- // Row 7
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out7 = 0.0f;
-
- tmp.s0 = w60;
- tmp.s1 = w61;
- tmp.s2 = w62;
- tmp.s3 = w63;
- tmp.s4 = w64;
- tmp.s5 = w65;
- tmp.s6 = w66;
-
- OUTPUT_ROW_2x2_7x7(out7, tmp);
-
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- int x0 = get_global_id(2); // idx filter
- int y0 = get_global_id(0); // idx channel
-
- // Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
-
- // Store the values across the channels
- *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
- *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
- *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
- *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
- *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
- *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
- *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
- *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
- *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
- *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
- *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
- *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
- *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
- *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
- *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
- *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
- *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
- *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
- *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
- *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
- *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
- *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
- *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
- *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
- *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
- *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
- *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
- *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
- *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
- *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
- *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
- *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
- *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
- *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
- *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
- *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
- *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
- *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
- *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
- *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
- *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
- *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
- *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
- *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
- *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
- *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
- *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
- *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
- *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
- *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
- *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
- *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
- *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
- *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
- *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
- *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
- *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
- *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
- *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
- *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
-#endif // defined(SRC_DIM_Z)
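-
-/* For reference, the row transform used by winograd_filter_transform_2x2_7x7_nhwc
- * can be restated in plain scalar C, which is handy when unit-testing a host-side
- * reference against the kernel. This is only a sketch; the function name and the
- * use of float are illustrative, not part of the library:
- *
- * static void winograd_ft_row_2x2_7x7(const float w[7], float out[8])
- * {
- *     out[0] = -w[0] / 36.0f;
- *     out[1] = (w[0] - w[1] + w[2] - w[3] + w[4] - w[5] + w[6]) / 48.0f;
- *     out[2] = (w[0] + w[1] + w[2] + w[3] + w[4] + w[5] + w[6]) / 48.0f;
- *     out[3] = (-w[0] + 2.0f * w[1] - 4.0f * w[2] + 8.0f * w[3] - 16.0f * w[4] + 32.0f * w[5] - 64.0f * w[6]) / 120.0f;
- *     out[4] = (-w[0] - 2.0f * w[1] - 4.0f * w[2] - 8.0f * w[3] - 16.0f * w[4] - 32.0f * w[5] - 64.0f * w[6]) / 120.0f;
- *     out[5] = (w[0] - 3.0f * w[1] + 9.0f * w[2] - 27.0f * w[3] + 81.0f * w[4] - 243.0f * w[5] + 729.0f * w[6]) / 720.0f;
- *     out[6] = (w[0] + 3.0f * w[1] + 9.0f * w[2] + 27.0f * w[3] + 81.0f * w[4] + 243.0f * w[5] + 729.0f * w[6]) / 720.0f;
- *     out[7] = w[6];
- *
- *     // Apply the common -36 scale, as the kernel does for every row
- *     for (int i = 0; i < 8; ++i)
- *     {
- *         out[i] /= -36.0f;
- *     }
- * }
- */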
-
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_2x1_3x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_2x2_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
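-
-/* This wrapper and the ones that follow are thin forwarding kernels: the 1D
- * direction is fixed at compile time, so the host only needs to pass the right
- * build options when compiling this file. A minimal sketch using the standard
- * OpenCL host API (the program and device variables are assumed to exist):
- *
- * const char *options = "-DDATA_TYPE=float -DSRC_DIM_Z=64 -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL";
- * cl_int err = clBuildProgram(program, 1, &device, options, NULL, NULL);
- * cl_kernel kernel = clCreateKernel(program, "winograd_filter_transform_2x1_3x1_nchw", &err);
- */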
-
-/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x1_3x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x1_5x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_5x5_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x1_3x1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NHWC and the output tile is 4x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x1_5x1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_5x5_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 7x1 when the data layout is NHWC and the output tile is 2x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_2x1_7x1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
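-
-/* Launching any of these filter-transform kernels follows the usual host pattern:
- * bind the tensor arguments with clSetKernelArg, then enqueue roughly one
- * work-item per (input channel, output filter) pair. A rough sketch; the exact
- * global size is chosen by the library, so the geometry below is an assumption:
- *
- * size_t gws[3] = { num_ifm, 1, num_ofm };
- * cl_int err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gws, NULL, 0, NULL, NULL);
- */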
-
-#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x2_1x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_2x2_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x4_1x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x4_1x5_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_5x5_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NHWC and the output tile is 1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x4_1x3_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NHWC and the output tile is 1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x4_1x5_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_5x5_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 1x7 when the data layout is NHWC and the output tile is 1x2
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x2_1x7_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl
deleted file mode 100644
index fbb5e95196..0000000000
--- a/src/core/CL/cl_kernels/winograd_input_transform.cl
+++ /dev/null
@@ -1,2233 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "tile_helpers.h"
-
-#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
- ({ \
- comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \
- comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \
- comm_fact.s2 = 2.5f * tmp.s3; \
- comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
- comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \
- comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \
- comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
- \
- out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
- out.s1 = comm_fact.s0 + comm_fact.s1; \
- out.s2 = comm_fact.s0 - comm_fact.s1; \
- out.s3 = comm_fact.s3 + comm_fact.s4; \
- out.s4 = comm_fact.s4 - comm_fact.s3; \
- out.s5 = comm_fact.s5 + comm_fact.s6; \
- out.s6 = comm_fact.s5 - comm_fact.s6; \
- out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
- })
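-
-// OUTPUT_ROW_4x4_5x5 computes one row of the 8-point input transform. comm_fact
-// hoists the even-tap/odd-tap subexpressions shared between each pair of symmetric
-// interpolation nodes, so e.g. out.s1 and out.s2 are comm_fact.s0 -/+ comm_fact.s1.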
-
-#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
- ({ \
- comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \
- comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \
- comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \
- comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \
- comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \
- comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \
- out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \
- out.s1 = comm_fact.s0 - comm_fact.s1; \
- out.s2 = comm_fact.s0 + comm_fact.s1; \
- out.s3 = comm_fact.s2 - comm_fact.s3; \
- out.s4 = comm_fact.s2 + comm_fact.s3; \
- out.s5 = comm_fact.s4 - comm_fact.s5; \
- out.s6 = comm_fact.s4 + comm_fact.s5; \
- out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
- })
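-
-// OUTPUT_ROW_2x2_7x7 follows the same scheme for the 7x7 case, with nodes
-// {0, -1, 1, -2, 2, -3, 3, inf}: e.g. the out.s0 coefficients {-36, 49, -14, 1} on the
-// even taps are those of (x^2 - 1)(x^2 - 4)(x^2 - 9), the Lagrange numerator for node 0.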
-
-#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
-/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The left and top padding must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-#if defined(SRC_DEPTH)
- const int z = get_global_id(2) % SRC_DEPTH;
- const int b = get_global_id(2) / SRC_DEPTH;
-#else /* defined(SRC_DEPTH) */
- const int z = get_global_id(2);
-#endif /* defined(SRC_DEPTH) */
-
- // Compute input address
-#if defined(SRC_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-#endif /* defined(SRC_DEPTH) */
-
- src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
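-
- // The shift above moves the read pointer up and to the left so that the patch
- // loaded below also covers the PAD_LEFT/PAD_TOP border; the runtime is expected
- // to have allocated enough tensor padding for these accesses to stay in bounds.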
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
-#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp0 = in_row0;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- tmp0 -= in_row2;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
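- // 1D Winograd F(2, 3) input transform, out0x = B^T * tmp0, with
- //     B^T = | 1  0 -1  0 |
- //           | 0  1  1  0 |
- //           | 0 -1  1  0 |
- //           | 0  1  0 -1 |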
- DATA_TYPE out00 = tmp0.s0 - tmp0.s2;
- DATA_TYPE out01 = tmp0.s1 + tmp0.s2;
- DATA_TYPE out02 = tmp0.s2 - tmp0.s1;
- DATA_TYPE out03 = tmp0.s1 - tmp0.s3;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp1 = in_row1 + in_row2;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp2 = in_row2 - in_row1;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp3 = in_row1 - in_row3;
-
- DATA_TYPE out10 = tmp1.s0 - tmp1.s2;
- DATA_TYPE out11 = tmp1.s1 + tmp1.s2;
- DATA_TYPE out12 = tmp1.s2 - tmp1.s1;
- DATA_TYPE out13 = tmp1.s1 - tmp1.s3;
-
- DATA_TYPE out20 = tmp2.s0 - tmp2.s2;
- DATA_TYPE out21 = tmp2.s1 + tmp2.s2;
- DATA_TYPE out22 = tmp2.s2 - tmp2.s1;
- DATA_TYPE out23 = tmp2.s1 - tmp2.s3;
-
- DATA_TYPE out30 = tmp3.s0 - tmp3.s2;
- DATA_TYPE out31 = tmp3.s1 + tmp3.s2;
- DATA_TYPE out32 = tmp3.s2 - tmp3.s1;
- DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-#if defined(SRC_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
-#endif /* defined(SRC_DEPTH) */
-
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01;
- *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02;
- *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;
- *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;
- *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;
- *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;
- *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;
- *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;
- *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
- *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
- *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
- *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
- *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
- *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is a multiple of 2
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The left and top padding must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-#if defined(SRC_DEPTH)
- const int z = (get_global_id(2) * 2) % SRC_DEPTH;
- const int b = (get_global_id(2) * 2) / SRC_DEPTH;
-#else /* defined(SRC_DEPTH) */
- const int z = get_global_id(2) * 2;
-#endif /* defined(SRC_DEPTH) */
-
- // Compute input address
-#if defined(SRC_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-#endif /* defined(SRC_DEPTH) */
- src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
-#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- src_addr += src_stride_z;
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
-#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp0 = in_row0;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp4 = in_row4;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- tmp0 -= in_row2;
- tmp4 -= in_row6;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
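- // Lane .s0 of each out vector holds the result for plane z, lane .s1 the result for
- // plane z + 1: this variant transforms two input planes per work-item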
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp1 = in_row1 + in_row2;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp2 = in_row2 - in_row1;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp3 = in_row1 - in_row3;
-
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp5 = in_row5 + in_row6;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp6 = in_row6 - in_row5;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp7 = in_row5 - in_row7;
-
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
-
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
-
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-#if defined(SRC_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
-#endif /* defined(SRC_DEPTH) */
-
- vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
- vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
- vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
- vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
- vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
- vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
- vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
- vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
- vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
- vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
- vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
- vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
- vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
- vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
- vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
-/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size is 3x3/3x1 or 1x3 and the data layout is NCHW
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The left and top padding must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-#if defined(SRC_DEPTH)
- const int z = get_global_id(2) % SRC_DEPTH;
- const int b = get_global_id(2) / SRC_DEPTH;
-#else /* defined(SRC_DEPTH) */
- const int z = get_global_id(2);
-#endif /* defined(SRC_DEPTH) */
-
- // Compute input address
-#if defined(SRC_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-#endif /* defined(SRC_DEPTH) */
-
- src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- // Row0
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
-#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- // Row0
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE out0 = 0.0f;
- DATA_TYPE out1 = 0.0f;
- DATA_TYPE out2 = 0.0f;
- DATA_TYPE out3 = 0.0f;
- DATA_TYPE out4 = 0.0f;
- DATA_TYPE out5 = 0.0f;
-
- // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
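- // The weights below are 4x the rows of the F(4x4, 3x3) input-transform matrix B^T
- // (interpolation points 0, +/-1, +/-2); in the 2D path, input rows 2 and 4 are folded
- // in further down with the remaining weights -5 and +1:
- //     B^T = | 4  0 -5  0  1  0 |
- //           | 0 -4 -4  1  1  0 |
- //           | 0  4 -4 -1  1  0 |
- //           | 0 -2 -1  2  1  0 |
- //           | 0  2 -1 -2  1  0 |
- //           | 0  4  0 -5  0  1 |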
- out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
- out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;
- out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;
- out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;
- out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;
- out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- // Row4
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
-
- // k0..k5 hold the x-direction transform of input row 4; rows 0-4 of the transform along y
- // all weight input row 4 by 1, so these terms are reused unscaled across tile rows 0 to 4
- DATA_TYPE k0 = d41.s0;
- DATA_TYPE k1 = d41.s0;
- DATA_TYPE k2 = d41.s0;
- DATA_TYPE k3 = d41.s0;
- DATA_TYPE k4 = d41.s0;
- DATA_TYPE k5 = 0.0f;
-
- k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
- k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
- k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
- k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
- k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
- k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
-
- out0 += k0;
- out1 += k1;
- out2 += k2;
- out3 += k3;
- out4 += k4;
- out5 += k5;
-
- // Row2
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-
- out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
- out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
- out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;
- out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;
- out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;
- out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
-#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- // Compute destination address
-#if defined(SRC_DEPTH)
- __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
-#else /* defined(SRC_DEPTH) */
- __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
-#endif /* defined(SRC_DEPTH) */
-
- uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
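- // Each transformed value is written to its own z-plane of the destination;
- // advance by one plane (in elements) between stores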
-
- *(dst_addr) = out0;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out1;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out2;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out3;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out4;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out5;
- dst_addr += dst_plane_stride;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- DATA_TYPE out6 = k0;
- DATA_TYPE out7 = k1;
- DATA_TYPE out8 = k2;
- DATA_TYPE out9 = k3;
- DATA_TYPE out10 = k4;
- DATA_TYPE out11 = k5;
- DATA_TYPE out12 = k0;
- DATA_TYPE out13 = k1;
- DATA_TYPE out14 = k2;
- DATA_TYPE out15 = k3;
- DATA_TYPE out16 = k4;
- DATA_TYPE out17 = k5;
- DATA_TYPE out18 = k0;
- DATA_TYPE out19 = k1;
- DATA_TYPE out20 = k2;
- DATA_TYPE out21 = k3;
- DATA_TYPE out22 = k4;
- DATA_TYPE out23 = k5;
- DATA_TYPE out24 = k0;
- DATA_TYPE out25 = k1;
- DATA_TYPE out26 = k2;
- DATA_TYPE out27 = k3;
- DATA_TYPE out28 = k4;
- DATA_TYPE out29 = k5;
-
- // Row1
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
-
- // Row3
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-
- // Compute common parts for the channels between [6, 29]
- // Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
- // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
- DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
- DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
- DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
- DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
- DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
- DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
- DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
- DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
- DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
- DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
- DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
- DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
-
- // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
- // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
- DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
- DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
- DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
- DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
- DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
- DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
- DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0
- DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
- DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
- DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)
- DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
- DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
-
- out6 += part0 - part1;
- out12 += part0 + part1;
- out7 += part2 + part3 + part4 + part5;
- out8 += part2 - part3 + part4 - part5;
- out13 += part2 + part3 - part4 - part5;
- out14 += part2 - part3 - part4 + part5;
- out9 += part6 + part7 + part8 + part9;
- out10 += part6 - part7 + part8 - part9;
- out15 += part6 - part7 - part8 + part9;
- out16 += part6 + part7 - part8 - part9;
- out11 += part10 + part11;
- out17 += part10 - part11;
-
- out18 += part13 - part12;
- out24 += part13 + part12;
- out19 += part14 + part15 + part16 + part17;
- out20 += part14 - part15 + part16 - part17;
- out25 += part14 - part15 - part16 + part17;
- out26 += part14 + part15 - part16 - part17;
- out21 += part18 + part19 + part20 + part21;
- out22 += part18 - part19 + part20 - part21;
- out27 += part18 - part19 - part20 + part21;
- out28 += part18 + part19 - part20 - part21;
- out23 += part22 + part23;
- out29 += part22 - part23;
-
- *(dst_addr) = out6;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out7;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out8;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out9;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out10;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out11;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out12;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out13;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out14;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out15;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out16;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out17;
- dst_addr += dst_plane_stride;
-
- *(dst_addr) = out18;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out19;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out20;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out21;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out22;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out23;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out24;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out25;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out26;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out27;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out28;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out29;
- dst_addr += dst_plane_stride;
-
- // Row5
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
-
- // Channels [30, 35]
- out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
- out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
- out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
- out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
- out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
- out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
-
- *(dst_addr) = out0;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out1;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out2;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out3;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out4;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out5;
- dst_addr += dst_plane_stride;
-#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5, the output tile is 4x4/4x1 or 1x4 and the data layout is NCHW
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The left and top padding must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-#if defined(SRC_DEPTH)
- const int z = get_global_id(2) % SRC_DEPTH;
- const int b = get_global_id(2) / SRC_DEPTH;
-#else /* defined(SRC_DEPTH) */
- const int z = get_global_id(2);
-#endif /* defined(SRC_DEPTH) */
-
- // Compute input address
-#if defined(SRC_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-#endif /* defined(SRC_DEPTH) */
- src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
-
- // Load input tile
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
-#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- // Calculate common factors for intermediate tensor
- VEC_DATA_TYPE(DATA_TYPE, 8)
- tmp0 = in_row0;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact0 = 0.0f;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
- tmp0 += -in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
-
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
-
- comm_fact0 = (DATA_TYPE)2.5f * in_row3;
- comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.0f * in_row5;
-
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
-
- comm_fact1 = (DATA_TYPE)2.0f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
- comm_fact2 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
-
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
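- // tmp0..tmp7 now hold the 8-point Winograd input transform B^T * d applied along y;
- // OUTPUT_ROW_4x4_5x5 below applies the same transform along x to complete B^T * d * B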
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- // Calculate output rows (reuse comm_fact0 vector)
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0;
-
- OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out1, out2, out3, out4, out5, out6, out7;
-
- OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- // Store values across the channels
-#if defined(SRC_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
-#endif /* defined(SRC_DEPTH) */
-
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
- *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
- *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
- *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
- *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
- *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
- *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
- *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
- *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
- *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
- *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
- *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
- *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
- *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
- *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
- *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
- *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
- *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
- *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
- *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
- *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
- *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
- *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
- *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
- *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
- *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
- *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
- *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
- *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
- *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
- *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
- *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
- *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
- *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
- *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
- *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
- *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
- *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
- *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
- *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
- *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
- *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
- *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
- *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
- *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
- *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
- *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
- *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
- *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
- *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
- *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
- *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
- *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
- *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
- *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
- *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
- *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
- *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
- *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
- *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
- *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
- *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
-#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size is 3x3, 3x1 or 1x3 and the data layout is NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- const int cout = GET_SPATIAL_IDX(0, 1, 0); // channel index (C dimension of src and dst)
- const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function arguments instead.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
- int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
- int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
- x -= PAD_LEFT;
- y -= PAD_TOP;
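- // (x, y) is the top-left of this tile's input patch and may lie inside the convolution
- // border; T_LOAD_NHWC skips out-of-bounds elements, which keep the zero they were
- // initialized with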
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 6, 1, in);
- TILE(DATA_TYPE, 6, 1, out);
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- in[i].v = 0;
- })
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
- TILE(DATA_TYPE, 6, 1, com);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- in[i].v *= 4.0f;
- })
-
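- // Six-point F(4, 3) input transform of the pre-scaled tile, out = 4 * (B^T * d):
- // 4d0-5d2+d4, -4d1-4d2+d3+d4, 4d1-4d2-d3+d4, -2d1-d2+2d3+d4, 2d1-d2-2d3+d4, 4d1-5d3+d5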
- com[0].v = in[2].v - 4.f * in[0].v;
- com[1].v = in[3].v - 4.f * in[1].v;
- com[2].v = in[4].v - 4.f * in[2].v;
- com[3].v = in[5].v - 4.f * in[3].v;
- com[4].v = in[3].v - in[1].v;
- com[4].v = com[4].v + com[4].v;
- com[5].v = in[4].v - in[2].v;
-
- out[0].v = com[2].v - com[0].v;
- out[1].v = com[2].v + com[1].v;
- out[2].v = com[2].v - com[1].v;
- out[3].v = com[5].v + com[4].v;
- out[4].v = com[5].v - com[4].v;
- out[5].v = com[3].v - com[1].v;
-
- TILE(uint, 6, 1, dst_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
- dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 6;
- })
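- // Destination row for transform channel i: all tiles of channel 0 come first, then
- // channel 1, and so on; each batch advances by 6 channels' worth of rows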
-
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 36, 1, in);
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 36,
- {
- in[i].v = 0;
- })
-
- // Load the tile from an NHWC tensor
- T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-
- TILE(DATA_TYPE, 6, 1, com);
- TILE(DATA_TYPE, 36, 1, tmp);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
- com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
- com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
- com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
- com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v;
- com[4].v = com[4].v + com[4].v;
- com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v;
- tmp[i + 0 * 6].v = com[2].v - com[0].v;
- tmp[i + 1 * 6].v = com[2].v + com[1].v;
- tmp[i + 2 * 6].v = com[2].v - com[1].v;
- tmp[i + 3 * 6].v = com[5].v + com[4].v;
- tmp[i + 4 * 6].v = com[5].v - com[4].v;
- tmp[i + 5 * 6].v = com[3].v - com[1].v;
- })
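- // tmp now holds the transform applied along y (B^T * d) of the 6x6 input tile;
- // the loop below applies it along x to produce out = B^T * d * B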
-
- TILE(DATA_TYPE, 36, 1, out);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- com[0].v = tmp[i * 6 + 2].v - 4.f * tmp[i * 6 + 0].v;
- com[1].v = tmp[i * 6 + 3].v - 4.f * tmp[i * 6 + 1].v;
- com[2].v = tmp[i * 6 + 4].v - 4.f * tmp[i * 6 + 2].v;
- com[3].v = tmp[i * 6 + 5].v - 4.f * tmp[i * 6 + 3].v;
- com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
- com[4].v = com[4].v + com[4].v;
- com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
- out[i * 6 + 0].v = com[2].v - com[0].v;
- out[i * 6 + 1].v = com[2].v + com[1].v;
- out[i * 6 + 2].v = com[2].v - com[1].v;
- out[i * 6 + 3].v = com[5].v + com[4].v;
- out[i * 6 + 4].v = com[5].v - com[4].v;
- out[i * 6 + 5].v = com[3].v - com[1].v;
- })
-
- // Compute destination address
- TILE(uint, 36, 1, dst_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 36,
- {
- dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
- dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 36;
- })
-
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5, the output tile is 4x4/4x1 or 1x4 and the data layout is NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- const int cout = GET_SPATIAL_IDX(0, 1, 0); // channel index (C dimension of src and dst)
- const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function arguments instead.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
- int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
- int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
- x -= PAD_LEFT;
- y -= PAD_TOP;
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 8, 1, in);
- TILE(DATA_TYPE, 8, 1, out);
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- in[i].v = 0;
- })
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
- TILE(DATA_TYPE, 1, 8, com);
-
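- // 1D 8-point input transform, out = B^T * in: rows 0 and 7 are computed directly,
- // rows 1-6 as sums/differences of the common factors in com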
- com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v;
- com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v;
- com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v;
- com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v;
- com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
- com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v;
- out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v;
- out[1].s[0] = com[0].s[0] + com[0].s[1];
- out[2].s[0] = com[0].s[0] - com[0].s[1];
- out[3].s[0] = com[0].s[3] + com[0].s[2];
- out[4].s[0] = com[0].s[3] - com[0].s[2];
- out[5].s[0] = com[0].s[4] + com[0].s[5];
- out[6].s[0] = com[0].s[4] - com[0].s[5];
- out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v;
-
- TILE(uint, 8, 1, dst_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
- dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
- })
-
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 64, 1, in);
- TILE(DATA_TYPE, 64, 1, out);
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- in[i].v = 0;
- })
-
- // Load the tile from an NHWC tensor
- T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-
- TILE(DATA_TYPE, 8, 8, com);
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
- com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x
- com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
- com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
- com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
- com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
- com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
- com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
- })
-
- TILE(DATA_TYPE, 8, 8, tmp);
- tmp[0].v = com[6].v;
- tmp[1].v = com[0].v + com[1].v;
- tmp[2].v = com[0].v - com[1].v;
- tmp[3].v = com[2].v + com[3].v;
- tmp[4].v = com[2].v - com[3].v;
- tmp[5].v = com[4].v + com[5].v;
- tmp[6].v = com[4].v - com[5].v;
- tmp[7].v = com[7].v;
-
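- // The loop below applies the same 1D transform along the other axis, completing
- // the 2D input transform out = B^T * d * B: the first pass produced the partially
- // transformed tile in tmp, and each row of tmp is now transformed in turn.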
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- com[0].s[0] = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6];
- com[0].s[1] = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5];
- com[0].s[2] = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
- com[0].s[3] = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6];
- com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
- com[0].s[5] = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5];
- out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6];
- out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
- out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
- out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
- out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
- out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
- out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
- out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7];
- })
-
- TILE(uint, 64, 1, dst_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
- dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
- })
-
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
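-
-// Note: the kernels in this family are specialized entirely through build options.
-// An illustrative (not normative) options string for the 4x4/5x5 NHWC transform
-// above could be:
-//
-//   -DDATA_TYPE=half -DSRC_WIDTH=96 -DSRC_HEIGHT=64 -DNUM_TILES_X=24 -DNUM_TILES_Y=16
-//   -DPAD_LEFT=2 -DPAD_TOP=2 -DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4 -DNHWC
-//
-// passed to the OpenCL program build (e.g. via clBuildProgram); the actual values
-// are derived from the convolution configuration at run time.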
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/2x1/1x2 when the data layout is NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
- const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
- // All the tensor dimensions are passed at compile time.
- // If dynamic tensor shapes were supported, these dimensions would need to be passed as kernel arguments instead.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
- int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
- int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
- x -= PAD_LEFT;
- y -= PAD_TOP;
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 8, 1, in);
- TILE(DATA_TYPE, 8, 1, out);
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- in[i].v = 0;
- })
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- in[i].v *= (DATA_TYPE)-36.0f;
- })
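-
- // Note: the 1D path pre-scales the input tile by -36.0f; any constant factor
- // introduced in the input transform must be compensated elsewhere in the Winograd
- // pipeline (filter and/or output transform) for the convolution result to stay correct.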
-
- TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };
-
- com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v;
- com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v;
- com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v;
- com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v;
- com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
- com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v;
- out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v - 14.0f * in[4].v + in[6].v;
- out[1].s[0] = com[0].s[0] - com[0].s[1];
- out[2].s[0] = com[0].s[0] + com[0].s[1];
- out[3].s[0] = com[0].s[2] - com[0].s[3];
- out[4].s[0] = com[0].s[2] + com[0].s[3];
- out[5].s[0] = com[0].s[4] - com[0].s[5];
- out[6].s[0] = com[0].s[4] + com[0].s[5];
- out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v;
-
- TILE(uint, 8, 1, dst_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
- dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
- })
-
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 64, 1, in);
- TILE(DATA_TYPE, 64, 1, out);
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- in[i].v = 0;
- })
-
- // Load the tile from a NHWC tensor
- T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-
- TILE(DATA_TYPE, 8, 8, com);
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
- com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
- com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
- com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
- com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
- com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
- com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
- com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
- })
-
- TILE(DATA_TYPE, 8, 8, tmp);
- tmp[0].v = com[6].v;
- tmp[1].v = com[0].v - com[1].v;
- tmp[2].v = com[0].v + com[1].v;
- tmp[3].v = com[2].v - com[3].v;
- tmp[4].v = com[2].v + com[3].v;
- tmp[5].v = com[4].v - com[5].v;
- tmp[6].v = com[4].v + com[5].v;
- tmp[7].v = com[7].v;
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- com[0].s[0] = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6];
- com[0].s[1] = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5];
- com[0].s[2] = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6];
- com[0].s[3] = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
- com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
- com[0].s[5] = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5];
- out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] - 14.0f * tmp[i].s[4] + tmp[i].s[6];
- out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
- out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
- out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
- out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
- out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
- out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
- out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7];
- })
-
- TILE(uint, 64, 1, dst_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
- dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
- })
-
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note Since this kernel performs the horizontal (3x1) Winograd input transform, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes);
-}
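-
-// The 1D *_nhwc variants (this one and those below) are thin wrappers: each forwards
-// its arguments unchanged to the matching 2D kernel above, and the
-// WINOGRAD_INPUT_TRANSFORM_HORIZONTAL or WINOGRAD_INPUT_TRANSFORM_VERTICAL build
-// option selects the row-wise (1x8) or column-wise (8x1) load-and-transform path inside it.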
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note Since this kernel performs the horizontal (5x1) Winograd input transform, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note Since this kernel performs the horizontal (7x1) Winograd input transform, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note Since this kernel performs the vertical (1x3) Winograd input transform, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note Since this kernel performs the vertical (1x5) Winograd input transform, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note Since this kernel performs the vertical (1x7) Winograd input transform, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes);
-}
-#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
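-
-// As with the NHWC variants, the 1D NCHW kernels here are wrappers forwarding to the
-// corresponding 2D *_nchw implementation; OUTPUT_TILE_W/OUTPUT_TILE_H together with
-// WINOGRAD_INPUT_TRANSFORM_HORIZONTAL or WINOGRAD_INPUT_TRANSFORM_VERTICAL make that
-// implementation read and transform a single row or column instead of a full 2D tile.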
-
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is a multiple of 2
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is a multiple of 2
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl
deleted file mode 100644
index 6a3e6d3346..0000000000
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ /dev/null
@@ -1,2063 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "activation_float_helpers.h"
-#include "helpers.h"
-#include "tile_helpers.h"
-
-#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
-#if defined(VEC_SIZE) && VEC_SIZE == 2
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_2x2_3x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- // Each thread stores a 2x2/2x1 or 1x2 tile according to the filter size
-#if defined(SRC_DEPTH)
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-#else /* defined(SRC_DEPTH) */
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
-#endif /* defined(SRC_DEPTH) */
-
- // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
- DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
- DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
- DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
- DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- // Compute the 2x1 or 1x2 output tile
- // out00 = d00 + d01 + d02
- // out01 = d01 - d02 - d03
-
- float out00 = d00 + d01 + d02;
- float out01 = d01 - d02 - d03;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
- DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
- DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
- DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
-
- DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
- DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
- DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
- DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
-
- DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
- DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
- DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
- DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
-
- // Compute the 2x2 output tile
- float k0 = d01 + d11 + d21;
- float k1 = d02 + d12 + d22;
- float k2 = d11 - d21 - d31;
- float k3 = d12 - d22 - d32;
-
- // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
- // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
- // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
- // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
-
- float out00 = d10;
- float out01 = -d13;
- float out10 = d10;
- float out11 = -d13;
-
- out00 += d00 + d20 + k0 + k1;
- out01 += k0 - k1 - (d03 + d23);
- out10 += -d20 - d30 + k2 + k3;
- out11 += k2 - k3 + d23 + d33;
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- int y_in = get_global_id(1);
- int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
- int z_out = get_global_id(0);
-#if defined(SRC_DEPTH)
- int batch = get_global_id(2) / SRC_DEPTH;
-#endif /* defined(SRC_DEPTH) */
-
-#if defined(HAS_BIAS)
- // Add bias
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
- float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
-
- out00 += (float)b;
- out01 += (float)b;
-#endif // defined(HAS_BIAS)
-
- // Get output address
-#if defined(SRC_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
-#endif /* defined(SRC_DEPTH) */
-
- // Store the output tile
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- const VEC_DATA_TYPE(DATA_TYPE, 2)
- out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
- (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-#if defined(HAS_BIAS)
- // Add bias
- out10 += (float)b;
- out11 += (float)b;
-#endif // defined(HAS_BIAS)
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
- (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
-#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
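The k0..k3 factoring in the kernel above is the separable form of the F(2x2, 3x3) output transform Y = A^T * M * A, with A^T = {{1, 1, 1, 0}, {0, 1, -1, -1}}. A minimal host-side C reference of the same arithmetic (an illustrative sketch with a hypothetical name, not library code):

/* Reference F(2x2, 3x3) output transform: d is the 4x4 per-tile GEMM
 * result, y the 2x2 spatial output tile. Rows are reduced first (A^T * d),
 * then columns (t * A), matching the commented out00..out11 formulas. */
static void winograd_f2x2_3x3_output_ref(const float d[4][4], float y[2][2])
{
    float t[2][4]; /* t = A^T * d */
    for (int j = 0; j < 4; ++j)
    {
        t[0][j] = d[0][j] + d[1][j] + d[2][j];
        t[1][j] = d[1][j] - d[2][j] - d[3][j];
    }
    for (int i = 0; i < 2; ++i) /* y = t * A */
    {
        y[i][0] = t[i][0] + t[i][1] + t[i][2];
        y[i][1] = t[i][1] - t[i][2] - t[i][3];
    }
}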
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dst_size Size of the destination tensor, minus the last padding
- */
-__kernel void winograd_output_transform_2x2_7x7_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-
- const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
- const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
- int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- TILE(DATA_TYPE, 8, N0, in);
- TILE(DATA_TYPE, 2, N0, out);
- TILE(uint, 8, 1, src_indirect_y);
-
- // Calculate the indirect Y for the source tensor
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
- })
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- in[i].v = 0;
- })
-
- // Load the values across the 8 channels to compose the 8x1 tile
- T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
- // Compute out00 and out01
- out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
- out[1].v = -in[1].v + in[2].v - 2.0f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v;
-
-#if defined(HAS_BIAS)
- // Add bias
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);
-#endif // defined(HAS_BIAS)
-
- T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 2, 1, dst_indirect_y);
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, yk, 0, 1, 2,
- {
- int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
- dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
- })
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, xk, 0, 1, 2,
- {
- int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
- dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
- })
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 64, N0, in);
- TILE(DATA_TYPE, 4, N0, out);
- TILE(DATA_TYPE, 16, N0, tmp);
- TILE(uint, 64, 1, src_indirect_y);
-
- // Calculate the indirect Y for the source tensor
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
- })
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- in[i].v = 0;
- })
-
- // Load the values across the 64 channels to compose the 8x8 tile
- T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
- tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v;
- })
-
- // Compute the 2x2 output tile
- LOOP_UNROLLING(int, i, 0, 1, 2,
- {
- out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
- out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v;
- })
-
-#if defined(HAS_BIAS)
- // Add bias
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
-#endif // defined(HAS_BIAS)
-
- T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 4, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
- LOOP_UNROLLING(int, yk, 0, 1, 2,
- {
- LOOP_UNROLLING(int, xk, 0, 1, 2,
- {
- int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
- int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
- dst_indirect_y[xk + yk * 2].v = x_c + y_c * _IDST_WIDTH;
- dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
- })
- })
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
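The out[0]/out[1] dot products in the 1D branch above are the two rows of the F(2, 7) output-transform matrix A^T = {{1, 1, 1, 1, 1, 1, 1, 0}, {0, -1, 1, -2, 2, -3, 3, 1}}; the 2D branch applies the same reduction first down the columns (tmp) and then across the rows (out). A scalar C sketch of the 1D transform (illustrative only, hypothetical name):

/* 1D F(2, 7) output transform: two output pixels from the eight
 * per-tile GEMM results m[0..7]. Same coefficients as out[0]/out[1]. */
static void winograd_f2_7_output_ref(const float m[8], float out[2])
{
    out[0] = m[0] + m[1] + m[2] + m[3] + m[4] + m[5] + m[6];
    out[1] = -m[1] + m[2] - 2.0f * m[3] + 2.0f * m[4]
             - 3.0f * m[5] + 3.0f * m[6] + m[7];
}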
-#endif // defined(VEC_SIZE) && VEC_SIZE == 2
-
-#if defined(VEC_SIZE) && VEC_SIZE == 4
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x4_3x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- // Each thread stores a 4x4/4x1 or 1x4 tile
-#if defined(SRC_DEPTH)
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-#else /* defined(SRC_DEPTH) */
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
-#endif /* defined(SRC_DEPTH) */
-
- // Load the values across the channels to compose the 6x6 or 6x1 tile
- DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
- DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
- DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
- DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
- DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
- DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- // Compute out00, out01, out02 and out03
- float out00 = d00 + d01 + d02 + d03 + d04;
- float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
- float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
- float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
- DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
- DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
- DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
- DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
- DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
-
- DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
- DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
- DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
- DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
- DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
- DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
-
- DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
- DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
- DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
- DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
- DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
- DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
-
- DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
- DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
- DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
- DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
- DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
- DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
-
- DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
- DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
- DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
- DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
- DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
- DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
-
- // Compute out00, out01, out02 and out03
- float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
- float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
- float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
- float out03 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
-
- float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
- float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
-
- out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
- out01 += k1 - d02 - d12 - d22 - d32 - d42;
- out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
- out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
-
- // Compute out10, out11, out12 and out13
- float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
- float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
- float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
- float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
-
- k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
- k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
-
- out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
- out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
- out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
- out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
-
- // Compute out20, out21, out22 and out23
- float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
- float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
- float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
- float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
-
- k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
- k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
-
- out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
- out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
- out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
- out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
-
- // Compute out30, out31, out32 and out33
- float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
- float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
- float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
- float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
-
- k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
- k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
-
- out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
- out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
- out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
- out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- int y_in = get_global_id(1);
- int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
- int z_out = get_global_id(0);
-#if defined(SRC_DEPTH)
- int batch = get_global_id(2) / SRC_DEPTH;
-#endif /* defined(SRC_DEPTH) */
-
-#if defined(HAS_BIAS)
- // Add bias
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
- float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
-
- out00 += (float)b;
- out01 += (float)b;
- out02 += (float)b;
- out03 += (float)b;
-#endif // defined(HAS_BIAS)
-
- // Get output address
-#if defined(SRC_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
-#endif /* defined(SRC_DEPTH) */
-
- // Store the output tile
- const VEC_DATA_TYPE(DATA_TYPE, 4)
- out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4));
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
- *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
- *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- vstore4(out0_dt, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-#if defined(HAS_BIAS)
- // Add bias
- out10 += (float)b;
- out11 += (float)b;
- out12 += (float)b;
- out13 += (float)b;
-
- out20 += (float)b;
- out21 += (float)b;
- out22 += (float)b;
- out23 += (float)b;
-
- out30 += (float)b;
- out31 += (float)b;
- out32 += (float)b;
- out33 += (float)b;
-#endif // defined(HAS_BIAS)
- vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0,
- (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
- vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0,
- (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
- vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0,
- (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
-#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
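The out00..out03 expressions in the 1D branch above are the four rows of the F(4, 3) output transform applied to the six per-tile values d00..d05; the 2D branch applies the same 6-to-4 reduction first down the columns and then across the rows. A compact C reference (a sketch under that reading, not library code):

/* 1D F(4, 3) output transform: four output pixels from six per-tile
 * GEMM results. Same coefficients as out00..out03 above. */
static void winograd_f4_3_output_ref(const float d[6], float out[4])
{
    out[0] = d[0] + d[1] + d[2] + d[3] + d[4];
    out[1] = d[1] - d[2] + 2.0f * (d[3] - d[4]);
    out[2] = d[1] + d[2] + 4.0f * (d[3] + d[4]);
    out[3] = d[1] - d[2] + 8.0f * (d[3] - d[4]) + d[5];
}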
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dst_size Size of the destination tensor, minus the last padding
- */
-__kernel void winograd_output_transform_4x4_3x3_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
- const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 6, N0, in);
- TILE(DATA_TYPE, 4, N0, out);
- TILE(uint, 6, 1, src_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- src_indirect_y[i].v = mout + i * SRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
- })
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- in[i].v = 0;
- })
-
- // Load the values across the 6 channels to compose the 6x1 or 1x6 tile
- T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
- // Compute out00, out01, out02 and out03
- out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
- out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;
- out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;
- out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;
-
-#if defined(HAS_BIAS)
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- // c = c + bias[broadcasted]
- T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
-#endif // defined(HAS_BIAS)
-
- int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
- T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 4, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, yk, 0, 1, 4,
- {
- int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
- dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
- dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, xk, 0, 1, 4,
- {
- int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
- dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
- dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- // Calculate the indirect Y for the source tensor
- TILE(DATA_TYPE, 36, N0, in);
- TILE(DATA_TYPE, 4, N0, tmp);
- TILE(uint, 36, 1, src_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 36,
- {
- src_indirect_y[i].v = mout + i * SRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
- })
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 36,
- {
- in[i].v = 0;
- })
-
- // Load the values across the 36 channels to compose the 6x6 tile
- T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- tmp[0].v = in[6 + i].v + in[12 + i].v;
- tmp[1].v = in[6 + i].v - in[12 + i].v;
- tmp[2].v = in[18 + i].v + in[24 + i].v;
- tmp[3].v = in[18 + i].v - in[24 + i].v;
- tmp[3].v = tmp[3].v + tmp[3].v;
- in[i].v = in[i].v + tmp[0].v + tmp[2].v;
- in[6 + i].v = tmp[3].v + tmp[1].v;
- in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
- in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
- })
-
- // Compute the output tile
- TILE(DATA_TYPE, 16, N0, out);
-
- LOOP_UNROLLING(int, i, 0, 1, 4,
- {
- tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;
- tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;
- tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;
- tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;
- tmp[3].v = tmp[3].v + tmp[3].v;
- out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
- out[4 * i + 1].v = tmp[3].v + tmp[1].v;
- out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
- out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
- })
-
-#if defined(HAS_BIAS)
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- // c = c + bias[broadcasted]
- T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
-#endif // defined(HAS_BIAS)
-
- int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
- T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 16, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
- LOOP_UNROLLING(int, yk, 0, 1, 4,
- {
- LOOP_UNROLLING(int, xk, 0, 1, 4,
- {
- int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
- int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
- dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
- dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
- })
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
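The dst_indirect_y computation above clamps border tiles rather than masking them: coordinates past the output edge are pinned to the last valid row/column, so out-of-range lanes alias valid addresses, and the reverse-order store lets the valid value land last. A plain C sketch of the index math (hypothetical helper name, illustrative only):

/* Compute the clamped linear Y-index for one output element, mirroring
 * the dst_indirect_y math above: out-of-range coordinates collapse onto
 * the last valid row/column instead of being masked out. */
static int winograd_dst_index(int x_out, int xk, int y_out, int yk,
                              int dst_w, int dst_h, int batch)
{
    const int x_c = (x_out + xk < dst_w) ? (x_out + xk) : (dst_w - 1);
    const int y_c = (y_out + yk < dst_h) ? (y_out + yk) : (dst_h - 1);
    return x_c + y_c * dst_w + batch * dst_w * dst_h;
}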
-
-#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
- ({ \
- comm_fact.s0 = d1 + d2; \
- comm_fact.s1 = d3 + d4; \
- comm_fact.s2 = d5 + d6; \
- \
- col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
- col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
- \
- comm_fact.s0 = d1 - d2; \
- comm_fact.s1 = d3 - d4; \
- comm_fact.s2 = d5 - d6; \
- \
- col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
- col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
- })
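COMPUTE_TMP_COL evaluates one column of the 8-to-4 F(4, 5) output transform, sharing the three pairwise sums and differences (comm_fact) between the even and odd rows of A^T. Expanded into plain C for a single column (a reference sketch with a hypothetical name):

/* COMPUTE_TMP_COL unrolled for one column: col = A^T * d for the
 * F(4, 5) output transform, with the pairwise sums/differences of
 * (d1,d2), (d3,d4), (d5,d6) computed once and reused across rows. */
static void compute_tmp_col_ref(float col[4], const float d[8])
{
    const float s0 = d[1] + d[2], s1 = d[3] + d[4], s2 = d[5] + d[6];
    col[0] = d[0] + s0 + s1 + 8.0f * s2;
    col[2] = s0 + 4.0f * s1 + 2.0f * s2;

    const float t0 = d[1] - d[2], t1 = d[3] - d[4], t2 = d[5] - d[6];
    col[1] = t0 + 2.0f * t1 + 4.0f * t2;
    col[3] = t0 + 8.0f * t1 + t2 + d[7];
}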
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x4_5x5_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- // Each thread stores a 4x4/4x1 or 1x4 tile
-#if defined(SRC_DEPTH)
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-#else /* defined(SRC_DEPTH) */
-
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
-#endif /* defined(SRC_DEPTH) */
-
- // Compute output address
- int y_in = get_global_id(1);
- int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
- int z_out = get_global_id(0);
-#if defined(SRC_DEPTH)
- int batch = get_global_id(2) / SRC_DEPTH;
-#endif /* defined(SRC_DEPTH) */
-
-#if defined(SRC_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
-#else /* defined(SRC_DEPTH) */
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
-#endif /* defined(SRC_DEPTH) */
-
- // Load the values across the channels to compose the input tile
- DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
- DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
- DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
- DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
- DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
- DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
- DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
- DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- // Compute out00, out01, out02 and out03
- float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
- float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
- float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
- float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
-
-#if defined(HAS_BIAS)
- // Add bias
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
- float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
-
- out00 += (float)b;
- out01 += (float)b;
- out02 += (float)b;
- out03 += (float)b;
-#endif // defined(HAS_BIAS)
-
- // Store the output tile
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL,
- B_VAL),
- VEC_DATA_TYPE(DATA_TYPE, 4));
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
- *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
- *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)),
- 0, (__global DATA_TYPE *)(dst_addr));
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
- DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
- DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
- DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
- DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
- DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
- DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
- DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
-
- DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
- DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
- DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
- DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
- DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
- DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
- DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
- DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
-
- DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
- DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
- DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
- DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
- DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
- DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
- DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
- DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
-
- DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
- DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
- DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
- DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
- DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
- DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
- DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
- DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
-
- DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
- DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
- DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
- DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
- DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
- DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
- DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
- DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
-
- DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
- DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
- DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
- DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
- DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
- DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
- DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
- DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
-
- DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
- DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
- DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
- DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
- DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
- DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
- DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
- DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
-
- // Compute the 8x4 intermediate tensor
- VEC_DATA_TYPE(float, 4)
- comm_fact0, comm_fact1, comm_fact2;
- VEC_DATA_TYPE(float, 4)
- tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
-
- COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
- COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
- COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
- COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
- COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
- COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
- COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
- COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
-
- // Compute the 4x4 output tile
- comm_fact0 = tmp_col1 + tmp_col2;
- comm_fact1 = tmp_col3 + tmp_col4;
- comm_fact2 = tmp_col5 + tmp_col6;
-
- VEC_DATA_TYPE(float, 4)
- out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
- VEC_DATA_TYPE(float, 4)
- out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
-
- comm_fact0 = tmp_col1 - tmp_col2;
- comm_fact1 = tmp_col3 - tmp_col4;
- comm_fact2 = tmp_col5 - tmp_col6;
-
- VEC_DATA_TYPE(float, 4)
- out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
- VEC_DATA_TYPE(float, 4)
- out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
-
-#if defined(HAS_BIAS)
- // Add bias
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
- float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
-
- out_col0 += (VEC_DATA_TYPE(float, 4))b;
- out_col1 += (VEC_DATA_TYPE(float, 4))b;
- out_col2 += (VEC_DATA_TYPE(float, 4))b;
- out_col3 += (VEC_DATA_TYPE(float, 4))b;
-#endif // defined(HAS_BIAS)
-
- // Store the output tile
- vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL),
- VEC_DATA_TYPE(DATA_TYPE, 4)),
- 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
- vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL),
- VEC_DATA_TYPE(DATA_TYPE, 4)),
- 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
- vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL),
- VEC_DATA_TYPE(DATA_TYPE, 4)),
- 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
- vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL),
- VEC_DATA_TYPE(DATA_TYPE, 4)),
- 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
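Putting the pieces together, the 2D branch above is the separable transform Y = A^T * M * A on the 8x8 tile: COMPUTE_TMP_COL reduces each of the eight columns to four values (tmp_col0..tmp_col7), then the same eight-input reduction runs across those columns to produce out_col0..out_col3. A self-contained C sketch of that structure (names are hypothetical; it mirrors, not replaces, the kernel code):

/* One A^T * d reduction (8 inputs -> 4 outputs) for F(4, 5),
 * same coefficients as COMPUTE_TMP_COL. */
static void at_8to4(float out[4], const float d[8])
{
    const float s0 = d[1] + d[2], s1 = d[3] + d[4], s2 = d[5] + d[6];
    const float t0 = d[1] - d[2], t1 = d[3] - d[4], t2 = d[5] - d[6];
    out[0] = d[0] + s0 + s1 + 8.0f * s2;
    out[1] = t0 + 2.0f * t1 + 4.0f * t2;
    out[2] = s0 + 4.0f * s1 + 2.0f * s2;
    out[3] = t0 + 8.0f * t1 + t2 + d[7];
}

/* 2D F(4x4, 5x5) output transform: reduce the 8x8 GEMM tile d
 * column-wise with A^T, then row-wise with the same A^T. */
static void winograd_f4x4_5x5_output_ref(const float d[8][8], float y[4][4])
{
    float tmp[8][4]; /* tmp[c] = A^T * (column c of d) */
    for (int c = 0; c < 8; ++c)
    {
        float colv[8];
        for (int r = 0; r < 8; ++r)
            colv[r] = d[r][c];
        at_8to4(tmp[c], colv);
    }
    for (int r = 0; r < 4; ++r) /* then A^T across the rows of tmp */
    {
        float rowv[8], outv[4];
        for (int c = 0; c < 8; ++c)
            rowv[c] = tmp[c][r];
        at_8to4(outv, rowv);
        for (int c = 0; c < 4; ++c)
            y[r][c] = outv[c];
    }
}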
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
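-// Illustrative build options (hypothetical values, not taken from the library):
-// compiling this kernel for a 24x32 F32 output with 4x4 tiles could pass
-//   -DNUM_TILES_X=6 -DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4 -DSRC_HEIGHT=48
-//   -DDST_WIDTH=24 -DDST_HEIGHT=32 -DDATA_TYPE=float -DN0=2
-// where NUM_TILES_X = ceil(24 / 4) = 6 and SRC_HEIGHT is assumed to be the
-// number of Winograd tiles per batch (6 * 8 = 48), consistent with the
-// indirect-Y addressing in the kernel body.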
-__kernel void winograd_output_transform_4x4_5x5_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
- const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- TILE(DATA_TYPE, 8, N0, in);
- TILE(DATA_TYPE, 4, N0, out);
- TILE(DATA_TYPE, 4, N0, tmp);
- TILE(uint, 8, 1, src_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- src_indirect_y[i].v = mout + i * SRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
- })
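-
- // Winograd-domain rows are laid out as 8 blocks of SRC_HEIGHT rows per batch,
- // so row i of the tile handled by this work-item sits at mout + i * SRC_HEIGHT.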
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- in[i].v = 0;
- })
-
- // "in" contains 1x8 or 8x1 tile here
- T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
- // A^T * in; in this degenerate 1-D case "out" consists of a single column/row
- tmp[0].v = in[1].v - in[2].v;
- tmp[1].v = 2.0f * (in[3].v - in[4].v);
- tmp[2].v = 2.0f * (in[5].v + in[6].v);
- tmp[3].v = in[3].v + in[4].v;
- out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v;
- out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v);
- out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v;
- out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
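-
- // Expanded, the four expressions above apply the 4x8 output-transform matrix
- // (read directly off the arithmetic; shown here for reference):
- //        | 1  1  1  1  1  8  8  0 |
- //  A^T = | 0  1 -1  2 -2  4 -4  0 |
- //        | 0  1  1  4  4  2  2  0 |
- //        | 0  1 -1  8 -8  1 -1  1 |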
-
-#if defined(HAS_BIAS)
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- // c = c + bias[broadcasted]
- T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
-#endif // HAS_BIAS
-
- int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
- T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 4, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, yk, 0, 1, 4,
- {
- int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
- dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
- dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, xk, 0, 1, 4,
- {
- int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
- dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
- dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- // Calculate the indirect Y for the source tensor
- TILE(DATA_TYPE, 64, N0, in);
- TILE(DATA_TYPE, 6, N0, tmp);
- TILE(uint, 64, 1, src_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- src_indirect_y[i].v = mout + i * SRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
- })
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- in[i].v = 0;
- })
-
- // "in" here is 8x8 tile
- T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
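-
- // The 8x8 tile is stored row-major in "in": element (row r, column c) lives
- // at in[r * 8 + c], which is why the column pass below reads in[8 + i],
- // in[16 + i], ... for a fixed column i.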
-
- // A^T * in
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- tmp[0].v = in[8 + i].v + in[16 + i].v;
- tmp[1].v = in[8 + i].v - in[16 + i].v;
- tmp[2].v = in[24 + i].v + in[32 + i].v;
- tmp[3].v = in[24 + i].v - in[32 + i].v;
- tmp[3].v = tmp[3].v + tmp[3].v;
- tmp[4].v = in[40 + i].v + in[48 + i].v;
- tmp[4].v = tmp[4].v + tmp[4].v;
- tmp[5].v = in[40 + i].v - in[48 + i].v;
-
- // 4x8 matrix as a result
- in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
- in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
- in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
- in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
- })
-
- // Compute the output tile
- TILE(DATA_TYPE, 16, N0, out);
-
- // in * A, with in = A^T * in as above
- LOOP_UNROLLING(int, i, 0, 1, 4,
- {
- tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
- tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
- tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
- tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
- tmp[3].v = tmp[3].v + tmp[3].v;
- tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
- tmp[4].v = tmp[4].v + tmp[4].v;
- tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
-
- // 4x4 tile
- out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
- out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
- out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
- out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
- })
-
-#if defined(HAS_BIAS)
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- // c = c + bias[broadcasted]
- T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
-#endif // HAS_BIAS
-
- int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
- T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 16, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
- LOOP_UNROLLING(int, yk, 0, 1, 4,
- {
- LOOP_UNROLLING(int, xk, 0, 1, 4,
- {
- int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
- int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
- dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
- dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
- })
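-
- // x_c and y_c are clamped to the last valid column/row, so border tiles map
- // their out-of-range elements onto valid destination offsets rather than
- // writing out of bounds.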
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 4
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
-#if defined(VEC_SIZE) && VEC_SIZE == 2
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_2x1_3x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- winograd_output_transform_2x2_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
- ,
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
- );
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_2x1_7x1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_2x2_7x7_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 2
-
-#if defined(VEC_SIZE) && VEC_SIZE == 4
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_3x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- winograd_output_transform_4x4_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
- ,
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
- );
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_5x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- winograd_output_transform_4x4_5x5_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
- ,
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
- );
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_3x1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_4x4_3x3_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_5x1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_4x4_5x5_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 4
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-#if defined(VEC_SIZE) && VEC_SIZE == 2
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x2_1x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- winograd_output_transform_2x2_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
- ,
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
- );
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x2_1x7_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_2x2_7x7_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 2
-
-#if defined(VEC_SIZE) && VEC_SIZE == 4
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- winograd_output_transform_4x4_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
- ,
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
- );
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x5_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- winograd_output_transform_4x4_5x5_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
- ,
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
- );
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x3_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_4x4_3x3_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x5_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_4x4_5x5_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 4
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
index 909972482f..5b72354abe 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,37 +29,36 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::S64);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+ "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
- }
- if(prev_output != nullptr && prev_output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(prev_output, 1, DataType::U32, DataType::S32);
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(prev_output, output);
- }
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64,
+ DataType::U64);
}
return Status{};
@@ -67,59 +66,70 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_outp
} // namespace
CLArgMinMaxLayerKernel::CLArgMinMaxLayerKernel()
- : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::ARG_IDX_MAX)
+ : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::ARG_IDX_MAX)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, prev_output, output, axis, op);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
}
-void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape{ input->info()->tensor_shape() };
+ TensorShape output_shape{input->info()->tensor_shape()};
output_shape.set(axis, 1);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(DataType::S32).reset_padding().set_is_resizable(true));
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(DataType::S32)
+ .reset_padding()
+ .set_is_resizable(true));
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, op));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
- auto padding_info = get_padding_info({ input, prev_output, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
- _prev_output = prev_output;
_output = output;
_reduction_axis = axis;
_op = op;
// Set build options
- const auto vector_size = (axis == 0) ? 16U : adjust_vec_size(16U, input->info()->dimension(0));
+ const auto adjusted_vector_size = adjust_vec_size(16U, input->info()->dimension(0));
+ const auto vector_size = (adjusted_vector_size == 3U && axis == 0U)
+ ? 2U
+ : adjusted_vector_size; // the OpenCL kernel only supports vector sizes 2, 4, 8 and 16.
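+ // e.g. (illustrative) a tensor width of exactly 3 can yield an adjusted
+ // vector size of 3, which the kernel cannot handle, so it is clamped to 2.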
CLBuildOptions build_opts;
- build_opts.add_option_if(_prev_output != nullptr, "-DPREV_OUTPUT");
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % vector_size));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vector_size));
build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN");
build_opts.add_option("-DDATA_TYPE_OUTPUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.add_option("-DCOND_DATA_TYPE=" + get_cl_select_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DUNROLL_WITH_PRAGMA=1");
// Create kernel
- cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
std::string kernel_axis_name;
- switch(axis)
+ switch (axis)
{
case 0:
- {
- const ICLTensor *input_for_width = prev_output != nullptr ? _prev_output : _input;
- build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input_for_width->info()->dimension(0)));
-
+ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
kernel_axis_name = "x";
- lws_hint = create_lws_hint_parallel_implementations(input_for_width->info()->dimension(0), vector_size);
- }
- break;
+ break;
case 1:
build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
kernel_axis_name = "y";
@@ -139,15 +149,18 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
_kernel = create_kernel(compile_context, "arg_min_max_" + kernel_axis_name, build_opts.options());
// Configure kernel window
- Window win = calculate_max_window((prev_output != nullptr) ? (*prev_output->info()) : (*input->info()), Steps(vector_size));
- ICLKernel::configure_internal(win, lws_hint);
+ Window win = calculate_max_window(*input->info(), Steps(vector_size));
+ ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
return Status{};
}
@@ -156,43 +169,36 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- switch(_reduction_axis)
+ switch (_reduction_axis)
{
case 0:
{
// Set out window
Window out_window(window);
+ Window in_window(window);
out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+ in_window.set(Window::DimX,
+ Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ in_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1u));
// Get first input and output slices
- Window in_slice = window.first_slice_window_2D();
+ Window in_slice = in_window.first_slice_window_2D();
Window out_slice = out_window.first_slice_window_2D();
-
- // Reshape window
- const unsigned int num_tensors = _prev_output != nullptr ? 3 : 2;
-
- // Set local sums buffer
- unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size();
- _kernel.setArg(num_arguments_per_2D_tensor() * num_tensors, local_res_size, nullptr);
do
{
unsigned int idx = 0;
add_2D_tensor_argument(idx, _input, in_slice);
- if(_prev_output != nullptr)
- {
- add_2D_tensor_argument(idx, _prev_output, in_slice);
- }
add_2D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ } while (in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
}
break;
case 1:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+ Window window_in{window};
+ window_in.set(Window::DimY,
+ Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
Window in_slice = window_in.first_slice_window_2D();
Window out_slice = window.first_slice_window_2D();
@@ -202,15 +208,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_2D_tensor_argument(idx, _input, in_slice);
add_2D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
}
break;
case 2:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+ Window window_in{window};
+ window_in.set(Window::DimZ,
+ Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
Window in_slice = window_in.first_slice_window_3D();
Window out_slice = window.first_slice_window_3D();
@@ -220,14 +226,13 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _input, in_slice);
add_3D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
}
break;
case 3:
{
// Get first input and output slices
- Window window_in{ window };
+ Window window_in{window};
window_in.set(3, Window::Dimension(0, 1, 1));
Window in_slice = window_in.first_slice_window_4D();
Window out_slice = window.first_slice_window_4D();
@@ -238,8 +243,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_4D_tensor_argument(idx, _input, in_slice);
add_4D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+ } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
}
break;
default:
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
index 929677f905..fb3b41b0de 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -56,48 +57,46 @@ public:
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
- * @param[in] prev_output Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
- * Has to be nullptr for the first iteration
- * @param[out] output Destination tensor. Data types supported: U32/S32
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
- * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+ * @param[out] output Destination tensor. Data types supported: U32/S32
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axes: 0,1,2,3
+ * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
*/
- void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op);
+ void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
- * @param[in] prev_output Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
- * Has to be nullptr for the first iteration
* @param[out] output Destination tensor. Data types supported: U32/S32
* Output will have the same number of dimensions as input.
* @param[in] axis Axis along which to reduce. Supported reduction axes: 0,1,2,3
* @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op);
/** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel.
*
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
- * @param[in] prev_output Destination tensor info of the previous iterations. Data types supported: U32/S32
- * Has to be nullptr for the first iteration
- * @param[in] output Destination tensor info. Data types supported: U32/S32
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
- * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
+ * @param[in] output Destination tensor info. Data types supported: U32/S32
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axes: 0,1,2,3
+ * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
const ICLTensor *_input;
- const ICLTensor *_prev_output;
ICLTensor *_output;
unsigned int _reduction_axis;
ReductionOperation _op;
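
For context, a minimal caller-side sketch of the slimmed-down interface: validate against tensor infos, configure once, enqueue once. The tensor names and scheduler setup are assumptions for illustration, not code from this patch.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"

using namespace arm_compute;

// Reduce along axis 0, writing the index of the maximum into dst (S32).
void enqueue_arg_max(CLTensor &src, CLTensor &dst)
{
    constexpr unsigned int axis = 0;
    // Single-pass validation: no prev_output info is needed any more.
    ARM_COMPUTE_ERROR_THROW_ON(
        CLArgMinMaxLayerKernel::validate(src.info(), dst.info(), axis, ReductionOperation::ARG_IDX_MAX));

    CLArgMinMaxLayerKernel kernel;
    kernel.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, axis,
                     ReductionOperation::ARG_IDX_MAX);
    // One enqueue covers the whole reduction; run() slides the input window
    // internally, as shown in the .cpp diff above.
    kernel.run(kernel.window(), CLScheduler::get().queue());
}
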
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 44bdc6f587..c88a852a44 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,49 +29,58 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
using namespace arm_compute;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
- if(beta != nullptr)
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+ if (beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
}
- if(gamma != nullptr)
+ if (gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
}
- if(act_info.enabled())
+ if (act_info.enabled())
{
ActivationLayerInfo::ActivationFunction act = act_info.activation();
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationFunction::RELU &&
+ act != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ act != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -83,14 +92,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output)
{
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0));
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->element_size(), input->dimension(0));
// Configure kernel window
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
bool window_changed = false;
- if(output != nullptr)
+ if (output != nullptr)
{
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
window_changed = update_window_and_padding(win, input_access, output_access);
@@ -101,29 +111,50 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input,
window_changed = update_window_and_padding(win, input_access);
}
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
- : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false)
+ : _input(nullptr),
+ _output(nullptr),
+ _mean(nullptr),
+ _var(nullptr),
+ _beta(nullptr),
+ _gamma(nullptr),
+ _epsilon(0),
+ _run_in_place(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
}
-void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
- const ICLTensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
- auto padding_info = get_padding_info({ input, output, mean, var, beta, gamma });
+ auto padding_info = get_padding_info({input, output, mean, var, beta, gamma});
_input = input;
_output = output;
_mean = mean;
@@ -138,13 +169,15 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
(gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
- unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
+ unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
@@ -153,29 +186,33 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA");
// Create kernel
- _kernel = create_kernel(compile_context, "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel =
+ create_kernel(compile_context,
+ "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Set kernel static arguments
unsigned int include_output = (!_run_in_place) ? 1 : 0;
- unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
- if(_beta != nullptr)
+ unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() +
+ 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
+ if (_beta != nullptr)
{
idx += num_arguments_per_1D_tensor(); // Skip beta parameter
}
- if(_gamma != nullptr)
+ if (_gamma != nullptr)
{
idx += num_arguments_per_1D_tensor(); // Skip gamma parameter
}
_kernel.setArg<cl_float>(idx++, _epsilon);
- if(output != nullptr)
+ if (output != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
}
// Configure kernel window
- if(input->info()->data_layout() == DataLayout::NHWC)
+ if (input->info()->data_layout() == DataLayout::NHWC)
{
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win);
@@ -201,18 +238,23 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
_config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
}
-Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
- if(input->data_layout() != DataLayout::NHWC)
+ if (input->data_layout() != DataLayout::NHWC)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
+ .first);
}
return Status{};
@@ -232,11 +274,11 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor();
add_1D_tensor_argument(idx, _mean, vector_slice);
add_1D_tensor_argument(idx, _var, vector_slice);
- if(_beta != nullptr)
+ if (_beta != nullptr)
{
add_1D_tensor_argument(idx, _beta, vector_slice);
}
- if(_gamma != nullptr)
+ if (_gamma != nullptr)
{
add_1D_tensor_argument(idx, _gamma, vector_slice);
}
@@ -245,11 +287,10 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
{
idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
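
To make the vectorisation arithmetic above concrete, a small worked example under assumed tensor properties (the numbers are illustrative only):

// Assumed: F16 input, element_size() == 2, dimension(0) == 20.
// 16 / 2 = 8 candidate lanes; adjust_vec_size(8, 20) keeps 8 because the
// row is wide enough, so the kernel is built with:
//   -DVEC_SIZE=8
//   -DVEC_SIZE_LEFTOVER=20 % 8 = 4   // tail elements handled in-kernel
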
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
index 743f4a9594..1a88d2a8c5 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#ifndef ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
#define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -63,7 +65,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Set the input and output tensors.
*
@@ -81,8 +89,15 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr,
- const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
@@ -98,10 +113,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
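
A hypothetical validation call, sketching how the defaulted parameters compose with a fused activation; the tensor infos are assumed to exist and the activation bound is illustrative:

#include "arm_compute/function_info/ActivationLayerInfo.h"

#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"

using namespace arm_compute;

// In-place batch normalization (output == nullptr) with a fused bounded
// ReLU capped at 6; beta and gamma fall back to their defaults (0 and 1).
bool can_fuse_bn_relu6(const ITensorInfo &in, const ITensorInfo &mean, const ITensorInfo &var)
{
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);
    return bool(CLBatchNormalizationLayerKernel::validate(&in, nullptr, &mean, &var,
                                                          nullptr, nullptr, 0.001f, act));
}
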
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
index da41feb7b8..c640b5a8d6 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,10 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -52,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output)
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
@@ -64,14 +71,12 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape_x * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape_y * input->tensor_shape()[idx_height]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] != input->tensor_shape()[idx_channel]);
+ const TensorShape expected_output_shape = compute_batch_to_space_shape(
+ input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
+ const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -80,9 +85,9 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
}
} // namespace
-CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel()
- : _input(nullptr), _block_shape(nullptr), _output(nullptr)
+CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() : _input(nullptr), _block_shape(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
@@ -90,11 +95,14 @@ void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTenso
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
}
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, block_shape, output });
+ auto padding_info = get_padding_info({input, block_shape, output});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info()));
@@ -102,66 +110,83 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex
_block_shape = block_shape;
_output = output;
- const int idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(input->info()->dimension(3)));
- build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
+ _kernel = create_kernel(compile_context,
+ "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
+ Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info);
}
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_batch_to_space_shape(input->info(), block_shape_x, block_shape_y);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+ const TensorShape output_shape = compute_batch_to_space_shape(
+ input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
_input = input;
_output = output;
- const int idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
- build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(input->info()->dimension(3)));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
- build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ build_opts.add_option("-DCROP_LEFT=" + support::cpp11::to_string(crop_info.left));
+ build_opts.add_option("-DCROP_TOP=" + support::cpp11::to_string(crop_info.top));
+ _kernel = create_kernel(
+ compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
+ Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
}
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
return Status{};
}
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output)
+Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info));
return Status{};
}
@@ -170,32 +195,31 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice_in = window.first_slice_window_3D();
- Window slice_out = window.first_slice_window_4D();
+ Window slice_out = window.first_slice_window_3D();
+ Window slice_in = window.first_slice_window_4D();
Window vector_slice = window.first_slice_window_1D();
vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_out.set(3, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
int batch_id = 0;
do
{
unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _input, slice_in);
add_argument(idx, batch_id);
- if(_block_shape != nullptr)
+ if (_block_shape != nullptr)
{
add_1D_tensor_argument(idx, _block_shape, vector_slice);
}
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_in, lws_hint());
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_in));
+ } while (window.slide_window_slice_3D(slice_out));
}
} // namespace arm_compute
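
The crop-aware shape check above can be made concrete with a worked example (the shapes are assumptions for illustration):

// NHWC input of shape [C=8, W=4, H=4, N=8], block_shape_x = block_shape_y = 2
// and CropInfo{left=1, right=0, top=1, bottom=0}:
//   output W = 4 * 2 - (1 + 0) = 7
//   output H = 4 * 2 - (1 + 0) = 7
//   output N = 8 / (2 * 2)     = 2
// so validate_arguments_static() expects an output shape of [8, 7, 7, 2].
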
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
index 131a43e59c..b9d3e66fe2 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -52,6 +53,8 @@ public:
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in the 23.08 release
*/
void configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
/** Initialise the kernel's inputs and output.
@@ -60,16 +63,26 @@ public:
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in the 23.08 release
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output);
/** Initialise the kernel's inputs and output (Static block shape).
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output);
+ void configure(const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info);
/** Initialise the kernel's inputs and output (Static block shape).
*
* @param[in] compile_context The compile context to be used.
@@ -77,8 +90,14 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -86,6 +105,8 @@ public:
* @param[in] output Tensor output. Data types supported: same as @p input
*
* @return a status
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in the 23.08 release
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel (Static block shape).
@@ -94,10 +115,15 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[in] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
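
With the dynamic block-shape path deprecated, a hypothetical caller targets the static overload; the sketch below assumes pre-allocated tensors and illustrative block sizes:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"

using namespace arm_compute;

void configure_batch_to_space(CLTensor &src, CLTensor &dst)
{
    const CropInfo crop{}; // zero crop on every edge
    ARM_COMPUTE_ERROR_THROW_ON(
        CLBatchToSpaceLayerKernel::validate(src.info(), 2, 2, dst.info(), crop));

    CLBatchToSpaceLayerKernel kernel;
    kernel.configure(CLKernelLibrary::get().get_compile_context(), &src, 2, 2, &dst, crop);
}
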
diff --git a/src/core/CL/kernels/CLBitwiseKernel.cpp b/src/core/CL/kernels/CLBitwiseKernel.cpp
index b1f7c00fac..de3fb43de8 100644
--- a/src/core/CL/kernels/CLBitwiseKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,23 +27,30 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
-CLBitwiseKernel::CLBitwiseKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+CLBitwiseKernel::CLBitwiseKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op)
+void CLBitwiseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ BitwiseOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
- if(op != BitwiseOperation::NOT)
+ if (op != BitwiseOperation::NOT)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input2);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
@@ -53,7 +60,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I
// Output auto initialization if not yet initialized
auto_init_if_empty(*(output->info()), *(input1->info()));
- auto padding_info = get_padding_info({ input1, input2, output });
+ auto padding_info = get_padding_info({input1, input2, output});
// Configure kernel window
const unsigned int vec_size_x = adjust_vec_size(16 / output->info()->element_size(), output->info()->dimension(0));
@@ -65,7 +72,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I
// Create kernel
std::string kernel_name = "";
- switch(op)
+ switch (op)
{
case BitwiseOperation::AND:
kernel_name = "bitwise_and";
@@ -104,13 +111,12 @@ void CLBitwiseKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_2D_tensor_argument(idx, _input1, slice);
- if(_input2 != nullptr)
+ if (_input2 != nullptr)
{
add_2D_tensor_argument(idx, _input2, slice);
}
add_2D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
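
As configure() above shows, BitwiseOperation::NOT is the single unary case and accepts a null second input; a hypothetical pair of calls (tensors assumed to be allocated U8 CLTensors):

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/core/CL/kernels/CLBitwiseKernel.h"

using namespace arm_compute;

void configure_bitwise(CLTensor &src, CLTensor &dst, CLTensor &src_a, CLTensor &src_b, CLTensor &dst_ab)
{
    CLBitwiseKernel not_kernel;
    // Unary NOT: input2 may legitimately be nullptr.
    not_kernel.configure(CLKernelLibrary::get().get_compile_context(), &src, nullptr, &dst,
                         BitwiseOperation::NOT);

    CLBitwiseKernel and_kernel;
    // Binary operations require both U8 inputs.
    and_kernel.configure(CLKernelLibrary::get().get_compile_context(), &src_a, &src_b, &dst_ab,
                         BitwiseOperation::AND);
}
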
diff --git a/src/core/CL/kernels/CLBitwiseKernel.h b/src/core/CL/kernels/CLBitwiseKernel.h
index c5a999643d..2c74955ae4 100644
--- a/src/core/CL/kernels/CLBitwiseKernel.h
+++ b/src/core/CL/kernels/CLBitwiseKernel.h
@@ -59,7 +59,11 @@ public:
* @param[out] output Destination tensor. Data types supported: U8.
* @param[in] op Bitwise operation to perform. Supported: AND, OR, NOT, XOR.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ BitwiseOperation op);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index 1bf0dc7445..f32c518e29 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,8 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -39,7 +41,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status validate_arguments(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes);
@@ -52,7 +57,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
const bool is_qasymm16 = boxes->data_type() == DataType::QASYMM16;
- if(is_qasymm16)
+ if (is_qasymm16)
{
const UniformQuantizationInfo boxes_qinfo = boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f);
@@ -64,12 +69,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas);
}
- if(pred_boxes->total_size() > 0)
+ if (pred_boxes->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, boxes);
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
- if(is_qasymm16)
+ if (is_qasymm16)
{
const UniformQuantizationInfo pred_boxes_qinfo = pred_boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes_qinfo.scale != 0.125f);
@@ -82,21 +87,31 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
}
} // namespace
-CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel()
- : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
+CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
}
-void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
- auto padding_info = get_padding_info({ boxes, pred_boxes, deltas });
- auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info()));
+ auto padding_info = get_padding_info({boxes, pred_boxes, deltas});
+ auto_init_if_empty(*pred_boxes->info(), deltas->info()
+ ->clone()
+ ->set_data_type(boxes->info()->data_type())
+ .set_quantization_info(boxes->info()->quantization_info()));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
@@ -126,7 +141,7 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con
build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale()));
build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1");
- if(is_quantized)
+ if (is_quantized)
{
build_opts.add_option("-DDATA_TYPE_DELTAS=" + get_cl_type_from_data_type(deltas->info()->data_type()));
const UniformQuantizationInfo boxes_qinfo = boxes->info()->quantization_info().uniform();
@@ -146,12 +161,15 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con
// Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor
const unsigned int num_elems_processed_per_iteration = 4;
- Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
return Status{};
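
To illustrate the fixed-point contract enforced in validate_arguments(), a sketch of tensor infos that satisfy the QASYMM16 path; the box count, delta quantization info and image dimensions are assumptions:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"

using namespace arm_compute;

constexpr unsigned int num_boxes = 128;
// QASYMM16 boxes must use scale 1/8 and zero offset, i.e. coordinates in
// fixed point with 3 fractional bits, as checked above.
const QuantizationInfo box_qinfo(0.125f, 0);
const TensorInfo boxes(TensorShape(4U, num_boxes), 1, DataType::QASYMM16, box_qinfo);
const TensorInfo deltas(TensorShape(4U, num_boxes), 1, DataType::QASYMM8, QuantizationInfo(0.02f, 128));
const TensorInfo pred_boxes(TensorShape(4U, num_boxes), 1, DataType::QASYMM16, box_qinfo);
const Status s = CLBoundingBoxTransformKernel::validate(&boxes, &pred_boxes, &deltas,
                                                        BoundingBoxTransformInfo(800.f, 600.f, 1.f));
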
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
index 08f350e86a..9a1bb49bb9 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
@@ -58,7 +58,10 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*
*/
- void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -71,7 +74,11 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
*
@@ -85,7 +92,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index 8a6b58002c..ec58bf9e7a 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,15 +47,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
- const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+ const unsigned int channels =
+ input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ num_groups == channels,
+ "Channel shuffling with same number of groups as number of channels would be inefficient");
// There cannot be more groups than channels
ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0,
+ "The number of channels must be a multiple of the number of groups");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -67,11 +75,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
auto_init_if_empty(*output, *input->clone());
const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
- if(is_nhwc)
+ if (is_nhwc)
{
- unsigned int num_elems_processed_per_iteration_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x));
- Window win_collapsed = win.collapse(win, Window::DimZ);
+ unsigned int num_elems_processed_per_iteration_x =
+ adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x));
+ Window win_collapsed = win.collapse(win, Window::DimZ);
return std::make_pair(Status{}, win_collapsed);
}
else
@@ -80,23 +89,27 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
constexpr unsigned int num_elems_processed_per_iteration_y = 2;
// Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ Window win = calculate_max_window(
+ *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
const bool window_changed = update_window_and_padding(win, input_access, output_access);
Window win_collapsed = win.collapse(win, Window::DimZ);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win_collapsed);
}
}
} // namespace
-CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
@@ -104,23 +117,27 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o
configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
}
-void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
- const DataLayout data_layout = input->info()->data_layout();
- const bool is_nhwc = data_layout == DataLayout::NHWC;
- const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
- unsigned int vec_size_x = 0;
- unsigned int vec_size_x_leftovers = 0;
- if(is_nhwc)
+ const DataLayout data_layout = input->info()->data_layout();
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
+ const unsigned int channels =
+ input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ unsigned int vec_size_x = 0;
+ unsigned int vec_size_x_leftovers = 0;
+ if (is_nhwc)
{
- vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
}
else
@@ -166,13 +183,14 @@ void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_cont
_config_id += support::cpp11::to_string(output->info()->dimension(1));
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(2));
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
}
-Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+Status
+CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
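
Note: the shuffle guarded by the checks above is a transpose of a [num_groups, channels / num_groups] view of the channel axis, which is why the channel count must divide evenly into the groups. A rough standalone sketch of the index mapping (plain C++, not the CL kernel itself; channel_shuffle_order is an illustrative name):

#include <vector>

// View the channel axis as [groups, K] with K = channels / groups, then
// transpose to [K, groups]; order[c] is the source channel that lands at
// output channel c.
std::vector<int> channel_shuffle_order(int channels, int groups)
{
    const int k = channels / groups; // validate_arguments() guarantees divisibility
    std::vector<int> order(channels);
    for (int c = 0; c < channels; ++c)
        order[c] = (c % groups) * k + c / groups;
    return order;
}

For channels = 6, groups = 2 this yields {0, 3, 1, 4, 2, 5}, i.e. the two groups interleaved.
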
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
index 31c007f17e..43c939ebd8 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
@@ -60,7 +60,10 @@ public:
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups);
/** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
*
* @param[in] input Input tensor info. Data types supported: All.
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
deleted file mode 100644
index 5f52945efb..0000000000
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLCol2ImKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, true, num_groups));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW");
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW));
-
- constexpr unsigned int num_elems_read_per_iteration = 8;
-
- // Configure window
- Window win = calculate_max_window(*input, Steps(num_elems_read_per_iteration));
-
- // Update window and padding just for the input tensor as we cannot access out-of-bounds elements in the output one
- AccessWindowHorizontal input_access(input, 0, num_elems_read_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLCol2ImKernel::CLCol2ImKernel()
- : _input(nullptr), _output(nullptr), _convolved_dims()
-{
-}
-
-void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, convolved_dims, num_groups);
-}
-
-void CLCol2ImKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), convolved_dims, num_groups));
-
- _input = input;
- _output = output;
- _convolved_dims = convolved_dims;
-
- const DataType data_type = input->info()->data_type();
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
- build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.width));
- build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
-
- _kernel = create_kernel(compile_context, "col2im", build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), _convolved_dims, num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = "col2im_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(num_groups);
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims, num_groups));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), convolved_dims, num_groups).first);
- return Status{};
-}
-
-void CLCol2ImKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- bool is_collapsed = false;
- bool is_collapsed_out = false;
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info()->tensor_shape());
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &is_collapsed);
- Window collapsed_out = out_window.collapse_if_possible(out_window, 3, &is_collapsed_out);
-
- ARM_COMPUTE_ERROR_ON(is_collapsed != is_collapsed_out);
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_out = collapsed_out.first_slice_window_4D();
- do
- {
- // Set inputs
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLCol2ImKernel.h b/src/core/CL/kernels/CLCol2ImKernel.h
deleted file mode 100644
index 710e048bca..0000000000
--- a/src/core/CL/kernels/CLCol2ImKernel.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOL2IMKERNEL_H
-#define ARM_COMPUTE_CLCOL2IMKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the col2im reshaping kernel.
- *
- * Rearranges each matrix column into image blocks. It's the inverse operation of @ref CLIm2ColKernel.
- *
- * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
- *
- * @f[
- * \left( \begin{array}{ccccccccc}
- * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccc}
- * a0 & a1 & a2 \\
- * a3 & a4 & a5 \\
- * a6 & a7 & a8 \\
- * \end{array} \right)
- * @f]
- */
-class CLCol2ImKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLCol2ImKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLCol2ImKernel(const CLCol2ImKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLCol2ImKernel &operator=(const CLCol2ImKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLCol2ImKernel(CLCol2ImKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLCol2ImKernel &operator=(CLCol2ImKernel &&) = default;
- /** Default destructor */
- ~CLCol2ImKernel() = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
- * @param[in] convolved_dims Output convolved dimensions.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
- */
- void configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
- /** Set the input and output of the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
- * @param[in] convolved_dims Output convolved dimensions.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref CLCol2ImKernel
- *
- * @param[in] input The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
- * @param[in] convolved_dims Output convolved dimensions.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
- const ICLTensor *_input;
- ICLTensor *_output;
- Size2D _convolved_dims;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCOL2IMKERNEL_H */
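
Note: the header removed above documents col2im as the inverse of im2col, turning each matrix column back into an image block. As a rough illustration of the NCHW rearrangement for num_groups = 1 (col2im_sketch and its matrix layout are assumptions for exposition, not the deleted kernel):

#include <cstddef>
#include <vector>

// gemm_out is a (conv_w * conv_h) x ofm matrix stored row-major: each
// column holds one feature map's values over all spatial positions.
// col2im scatters column m into the 2D plane of output feature map m.
std::vector<float> col2im_sketch(const std::vector<float> &gemm_out,
                                 std::size_t ofm, std::size_t conv_w, std::size_t conv_h)
{
    std::vector<float> image(ofm * conv_h * conv_w);
    for (std::size_t m = 0; m < ofm; ++m)
        for (std::size_t y = 0; y < conv_h; ++y)
            for (std::size_t x = 0; x < conv_w; ++x)
                image[(m * conv_h + y) * conv_w + x] =
                    gemm_out[(y * conv_w + x) * ofm + m];
    return image;
}

With ofm = 1 and conv_w = conv_h = 3 this reduces to the 9-element example in the deleted doxygen block.
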
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
index d0b29e2ba8..a0f9aca54a 100644
--- a/src/core/CL/kernels/CLComparisonKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -37,22 +40,16 @@ namespace arm_compute
namespace
{
// Create supported comparisons map
-const std::map<ComparisonOperation, std::string> supported_comparison_ops =
-{
- { ComparisonOperation::Equal, "EQUAL" },
- { ComparisonOperation::NotEqual, "NOTEQUAL" },
- { ComparisonOperation::Greater, "GREATER" },
- { ComparisonOperation::GreaterEqual, "GREATEREQUAL" },
- { ComparisonOperation::Less, "LESS" },
- { ComparisonOperation::LessEqual, "LESSEQUAL" },
+const std::map<ComparisonOperation, std::string> supported_comparison_ops = {
+ {ComparisonOperation::Equal, "EQUAL"}, {ComparisonOperation::NotEqual, "NOTEQUAL"},
+ {ComparisonOperation::Greater, "GREATER"}, {ComparisonOperation::GreaterEqual, "GREATEREQUAL"},
+ {ComparisonOperation::Less, "LESS"}, {ComparisonOperation::LessEqual, "LESSEQUAL"},
};
-int calculate_num_elems_processed_per_iteration(const ITensorInfo &input)
-{
- return 16 / input.element_size();
-}
-
-Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation)
+Status validate_arguments(const ITensorInfo &input1,
+ const ITensorInfo &input2,
+ const ITensorInfo &output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
ARM_COMPUTE_RETURN_ERROR_ON(input1.data_type() == DataType::UNKNOWN);
@@ -63,7 +60,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured output
- if(output.total_size() > 0)
+ if (output.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
@@ -75,40 +72,37 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
{
- const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
- const unsigned int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(input1);
+ const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input1.element_size(), output.dimension(0));
// Auto initialize output if not initialized
auto_init_if_empty(output, out_shape, 1, DataType::U8, QuantizationInfo());
- Window win = calculate_max_window(out_shape, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(input1);
- Window win_input2 = win.broadcast_if_dimension_le_one(input2);
-
- AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+ Window win = calculate_max_window(out_shape, Steps(num_elems_processed_per_iteration));
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(Status{}, win);
}
} // namespace
-CLComparisonKernel::CLComparisonKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+CLComparisonKernel::CLComparisonKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
}
-void CLComparisonKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), operation));
@@ -124,17 +118,29 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons
const std::string &operation_name = supported_comparison_ops.at(operation);
std::string kernel_name = "compare_" + lower_string(operation_name);
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input1->info()->element_size(), output->info()->dimension(0));
+
// Set kernel build options
std::set<std::string> build_opts;
build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info())));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.emplace(
+ "-DVEC_SIZE_IN1=" + //
+ support::cpp11::to_string(input1->info()->dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+ build_opts.emplace(
+ "-DVEC_SIZE_IN2=" + //
+ support::cpp11::to_string(input2->info()->dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
build_opts.emplace("-DOP=" + operation_name);
build_opts.emplace("-DOP_NAME=" + lower_string(operation_name));
- if(is_data_type_quantized(input1->info()->data_type()))
+ if (is_data_type_quantized(input1->info()->data_type()))
{
const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
+ build_opts.emplace("-DIS_QUANTIZED");
build_opts.emplace("-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset));
build_opts.emplace("-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset));
build_opts.emplace("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
@@ -158,12 +164,16 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons
_config_id += lower_string(string_from_data_layout(input1->info()->data_layout()));
}
-Status CLComparisonKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparisonKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, operation));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
return Status{};
}
@@ -179,17 +189,18 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
bool can_collapse = true;
const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
{
can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
{
can_collapse = (in_shape1[d] == in_shape2[d]);
}
}
bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -210,16 +221,7 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
-BorderSize CLComparisonKernel::border_size() const
-{
- const int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(*_input1->info());
-
- const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize{ 0, border, 0, 0 };
-}
} // namespace arm_compute
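
Note: the rewritten configure() above replaces the old border/padding machinery with leftover handling, and an input that broadcasts along x (dimension(0) == 1) gets a per-input vector size of 1. A hedged sketch of how those -D defines come together (shapes are made up; adjust_vec_size is approximated here by halving):

#include <cstdio>
#include <string>

int main()
{
    const unsigned int out_dim0 = 17, in1_dim0 = 17, in2_dim0 = 1; // in2 broadcasts along x
    const unsigned int element_size = 1;                           // U8 inputs

    unsigned int vec_size = 16 / element_size;
    while (vec_size > out_dim0 && vec_size > 1) // simplified adjust_vec_size()
        vec_size /= 2;

    const std::string opts = " -DVEC_SIZE=" + std::to_string(vec_size) +
                             " -DVEC_SIZE_LEFTOVER=" + std::to_string(out_dim0 % vec_size) +
                             " -DVEC_SIZE_IN1=" + std::to_string(in1_dim0 == 1 ? 1u : vec_size) +
                             " -DVEC_SIZE_IN2=" + std::to_string(in2_dim0 == 1 ? 1u : vec_size);
    std::printf("%s\n", opts.c_str()); // -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=1 -DVEC_SIZE_IN1=16 -DVEC_SIZE_IN2=1
}
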
diff --git a/src/core/CL/kernels/CLComparisonKernel.h b/src/core/CL/kernels/CLComparisonKernel.h
index 0b94190183..2fb4ba06b6 100644
--- a/src/core/CL/kernels/CLComparisonKernel.h
+++ b/src/core/CL/kernels/CLComparisonKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLCOMPARISONKERNEL_H
-#define ARM_COMPUTE_CLCOMPARISONKERNEL_H
+#ifndef ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H
+#define ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -64,7 +65,11 @@ public:
* @param[out] output Destination tensor. Data types supported: U8.
* @param[in] operation Comparison operation to use.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation);
/** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel
*
* @param[in] input1 Source tensor. Data types supported: All.
@@ -74,11 +79,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
private:
const ICLTensor *_input1; /**< Source tensor 1 */
@@ -86,4 +93,4 @@ private:
ICLTensor *_output; /**< Destination tensor */
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCOMPARISONKERNEL_H */
+#endif // ACL_SRC_CORE_CL_KERNELS_CLCOMPARISONKERNEL_H
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index eb420d8842..f8ecc4c098 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,9 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -36,9 +38,11 @@ namespace arm_compute
CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel()
: _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
const PadStrideInfo &info)
{
ARM_COMPUTE_UNUSED(info);
@@ -58,7 +62,7 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
- for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
}
@@ -66,20 +70,21 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co
return Status{};
}
-void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
- const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
}
-void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PadStrideInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -117,7 +122,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
const int out_end_y = _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
const int out_step_y = _info.stride().second;
- switch(_data_layout)
+ switch (_data_layout)
{
case DataLayout::NCHW:
{
@@ -135,8 +140,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
add_3D_tensor_argument(idx, _input, slice_in);
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
+ } while (collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
break;
}
case DataLayout::NHWC:
@@ -154,8 +158,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
add_3D_tensor_argument(idx, _input, slice_in);
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
- }
- while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
break;
}
default:
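
Note: beneath the formatting changes, run() still scatters each input pixel to a stride-spaced position in the output; the holes in between keep the fill value, so the convolution that follows sees an upsampled image. A rough sketch of that scatter with padding left out (upsample2d_sketch is illustrative only):

#include <vector>

std::vector<float> upsample2d_sketch(const std::vector<float> &in,
                                     int w, int h, int stride_x, int stride_y)
{
    const int out_w = w * stride_x;
    const int out_h = h * stride_y;
    std::vector<float> out(out_w * out_h, 0.0f); // the holes stay zero
    for (int y = 0; y < h; ++y)
        for (int x = 0; x < w; ++x)
            out[(y * stride_y) * out_w + (x * stride_x)] = in[y * w + x];
    return out;
}
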
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
index e0d1322341..762989a836 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
@@ -62,7 +62,10 @@ public:
* @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PadStrideInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
*
* @param[in] input Source tensor info. Data types supported: All.
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
index ca7e9d4b23..b33e0a8b6f 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,9 +27,10 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -38,7 +39,11 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
@@ -53,19 +58,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_w) != deconv_info.stride().first);
ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_h) != deconv_info.stride().second);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32);
- if(!is_qasymm)
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S32);
+ if (!is_qasymm)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_info, weights_info);
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * weights_info->dimension(idx_b));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) *
+ weights_info->dimension(idx_b));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != input_info->dimension(idx_w));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != input_info->dimension(idx_h));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) != input_info->dimension(idx_b));
- if(bias != nullptr)
+ if (bias != nullptr)
{
- if(is_qasymm)
+ if (is_qasymm)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -76,19 +83,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights_info->dimension(idx_b));
}
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+ auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+ weights_info->dimension(idx_w), weights_info->dimension(idx_h),
+ stride_info);
- const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
}
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input,
+ ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -97,11 +111,17 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+ auto out_dims =
+ deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+ weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
- const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout).set_quantization_info(input->quantization_info()));
+ auto_init_if_empty(*output, input->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_layout(data_layout)
+ .set_quantization_info(input->quantization_info()));
Window win = calculate_max_window(*input);
@@ -109,28 +129,37 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
}
} // namespace
-CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel()
- : _add_bias(false),
- _bias(nullptr)
+CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() : _add_bias(false), _bias(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, input_info, weights_info, deconv_info);
}
-void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info,
- const ITensorInfo *weights_info,
- const PadStrideInfo &deconv_info)
+void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), input_info, weights_info, deconv_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr),
+ output->info(), input_info, weights_info, deconv_info));
- auto padding_info = get_padding_info({ input, bias, output });
+ auto padding_info = get_padding_info({input, bias, output});
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
+ auto win_config =
+ validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
const DataLayout data_layout = input_info->data_layout();
@@ -177,7 +206,11 @@ void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compi
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, input_info, weights_info, deconv_info));
@@ -193,7 +226,7 @@ void CLDeconvolutionReshapeOutputKernel::run(const Window &window, cl::CommandQu
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, collapsed);
add_3D_tensor_argument(idx, _output, collapsed);
- if(_add_bias)
+ if (_add_bias)
{
add_1D_tensor_argument(idx, _bias, collapsed);
}
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
index ce354fa86f..8f436b07e3 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
@@ -67,7 +67,12 @@ public:
* @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+ void configure(const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
/** Initialise the kernel's source and destination.
*
* @param[in] compile_context The compile context to be used.
@@ -79,8 +84,13 @@ public:
* @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
- const PadStrideInfo &deconv_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionReshapeOutputKernel.
*
@@ -93,7 +103,12 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
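
Note: both validate paths above recompute the target shape via deconvolution_output_dimensions(). Since this kernel only supports zero padding with stride equal to the weight size, the usual transposed-convolution relation collapses nicely; a small check under those assumptions (formula assumed from the documented constraints, not quoted from the library):

#include <cassert>

// out = (in - 1) * stride + kernel, valid for zero padding. With
// kernel == stride, as this kernel requires, that is simply in * stride.
constexpr unsigned int deconv_out_dim(unsigned int in, unsigned int kernel, unsigned int stride)
{
    return (in - 1) * stride + kernel;
}

int main()
{
    assert(deconv_out_dim(4, 2, 2) == 8); // 4x4 input, 2x2 kernel, stride 2 -> 8x8
}
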
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
index 8946f2a713..cdf19ab2e1 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -48,12 +50,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+ (block_shape * input->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+ (block_shape * input->tensor_shape()[idx_height]));
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -62,9 +66,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
@@ -72,14 +76,18 @@ void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+ TensorShape output_shape =
+ compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
@@ -96,7 +104,9 @@ void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(input->info()->dimension(idx_channel)));
build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
@@ -135,7 +145,6 @@ void CLDepthToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
enqueue(queue, *this, slice_in, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_in));
+ } while (window.slide_window_slice_3D(slice_in));
}
} // namespace arm_compute
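
Note: compute_depth_to_space_shape() used above multiplies width and height by block_shape and divides channels by block_shape squared, which the validation enforces. A rough sketch of the element mapping for NCHW (the intra-block sub-channel ordering is an assumption here; the CL kernel may order it differently):

#include <vector>

std::vector<float> depth_to_space_sketch(const std::vector<float> &in,
                                         int w, int h, int c, int block)
{
    const int out_w = w * block;
    const int out_h = h * block;
    const int out_c = c / (block * block); // validated to divide evenly
    std::vector<float> out(out_w * out_h * out_c);
    for (int z = 0; z < c; ++z)
        for (int y = 0; y < h; ++y)
            for (int x = 0; x < w; ++x)
            {
                const int oz = z % out_c;           // destination channel
                const int bx = (z / out_c) % block; // intra-block x offset
                const int by = (z / out_c) / block; // intra-block y offset
                out[(oz * out_h + y * block + by) * out_w + (x * block + bx)] =
                    in[(z * h + y) * w + x];
            }
    return out;
}
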
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
index 1f7f77b569..cef70c4dda 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -61,7 +62,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
deleted file mode 100644
index dda70d2231..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ /dev/null
@@ -1,432 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D dilation,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
- "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 3 || weights->dimension(1) != 3);
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
-
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
-
- const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
-
- if(biases != nullptr)
- {
- if(is_qasymm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON((biases->dimension(0) != weights->dimension(2)) && (weights->dimension(2) != 1 || biases->dimension(0) != weights->dimension(3)));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- if(is_qasymm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-
- if(is_data_type_quantized_per_channel(weights->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != output_multipliers->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != output_shifts->dimension(0));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- }
-
- if(output->total_size() != 0)
- {
- const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, info);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, std::string &kernel_name, const Size2D dilation)
-{
- // Output auto initialization if not yet initialized
- const ConvolutionInfo info
- {
- conv_info, depth_multiplier, ActivationLayerInfo(), dilation
- };
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, info);
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
-
- const unsigned int conv_stride_x = conv_info.stride().first;
- const unsigned int conv_stride_y = conv_info.stride().second;
- const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
-
- // Configure kernel window
- unsigned int num_elems_read_per_iteration_x = 0;
- unsigned int num_elems_read_per_iteration_y = 0;
- unsigned int num_elems_written_per_iteration_x = 0;
- unsigned int num_elems_written_per_iteration_y = 0;
-
- if(input->data_type() == DataType::F16)
- {
- kernel_name = "depthwise_convolution_3x3_f16";
- num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
- num_elems_written_per_iteration_y = 1;
- num_elems_read_per_iteration_y = 3;
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 8;
- break;
- case 2:
- num_elems_read_per_iteration_x = 9;
- break;
- case 3:
- num_elems_read_per_iteration_x = 16;
- break;
- default:
- num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
- break;
- }
- if(conv_stride_x == 1 && conv_stride_y == 1)
- {
- kernel_name = "depthwise_convolution_3x3_stridex1_stridey1_f16";
- num_elems_read_per_iteration_x = 8;
- num_elems_written_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 6;
- num_elems_written_per_iteration_y = 4;
- }
- else if(conv_stride_x == 2 && conv_stride_y == 2)
- {
- kernel_name = "depthwise_convolution_3x3_stridex2_stridey2_f16";
- num_elems_read_per_iteration_x = 10;
- num_elems_written_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_y = 2;
- }
- }
- else if(input->data_type() == DataType::F32)
- {
- if(conv_stride_x == 1 && conv_stride_y == 1)
- {
- kernel_name = "depthwise_convolution_3x3_stridex1_stridey1_f32";
- num_elems_read_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 6;
- num_elems_written_per_iteration_x = 2;
- num_elems_written_per_iteration_y = 4;
- }
- else if(conv_stride_x == 2 && conv_stride_y == 2)
- {
- kernel_name = "depthwise_convolution_3x3_stridex2_stridey2_f32";
- num_elems_read_per_iteration_x = 6;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_x = 2;
- num_elems_written_per_iteration_y = 2;
- }
- else
- {
- kernel_name = "depthwise_convolution_3x3";
- num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
- num_elems_written_per_iteration_y = 1;
- num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
- num_elems_read_per_iteration_y = 3;
- }
- }
- else
- {
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_data_type_quantized_per_channel(weights->data_type());
-
- kernel_name = is_qasymm ? "dwc_3x3_native_quantized8" : "depthwise_convolution_3x3";
- kernel_name += (is_qasymm && is_dot8_supported ? "_dot8" : "");
- kernel_name += (is_qasymm ? "_nchw" : "");
-
- num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
- num_elems_written_per_iteration_y = (is_qasymm && conv_stride_y == 1 && dilation.y() == 1) ? 2 : 1;
- num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x + (conv_stride_x > 1 ? 1 : 0);
- num_elems_read_per_iteration_y = num_elems_written_per_iteration_y + 2;
- }
- // The OpenCL routine convolution1x3 does loadn(addr), loadn(addr + dilation_x) and loadn(addr + 2 * dilation_x) on the input.
- // convolution1x3 itself is invoked three times, with addr, (addr + dilation_y) and (addr + 2 * dilation_y).
- // Hence we must add 2 * dilation.x()/y() to the number of elements read per thread along those axes.
- num_elems_read_per_iteration_x += 2 * dilation.x();
- num_elems_read_per_iteration_y += 2 * dilation.y();
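- // For example, in the generic F32 branch above: num_elems_written_per_iteration_x = 8 / 4 = 2, so with
- // conv_stride_x = 3 we read 3 + (2 - 1) * 3 = 6 elements, and a dilation.x() of 2 raises that to
- // 6 + 2 * 2 = 10 elements read per thread along x.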
-
- // Create window and update padding
- Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
- AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(),
- num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
- conv_stride_x, conv_stride_y);
- AccessWindowStatic weights_access(weights, 0, 0, 3, 3);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
-
- bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLDepthwiseConvolutionLayer3x3NCHWKernel::CLDepthwiseConvolutionLayer3x3NCHWKernel()
- : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_y(1), _output_multipliers(), _output_shifts(), _is_quantized(false), _conv_stride_x(0), _conv_pad_top(0), _conv_pad_left(0)
-{
-}
-
-BorderSize CLDepthwiseConvolutionLayer3x3NCHWKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts);
-}
-
-void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- conv_info, depth_multiplier, act_info, dilation,
- (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
- (output_shifts != nullptr) ? output_shifts->info() : nullptr));
-
- _input = input;
- _output = output;
- _weights = weights;
- _biases = biases;
- _conv_stride_x = conv_info.stride().first;
- _conv_stride_y = conv_info.stride().second;
- _conv_pad_left = conv_info.pad_left();
- _conv_pad_top = conv_info.pad_top();
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-
- // Configure kernel window
- std::string kernel_name;
-
- auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, kernel_name, dilation);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- _border_size = BorderSize(input->info()->padding());
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(_output->info()->tensor_shape().z()));
- build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
- build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
- build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
-
- if(_is_quantized)
- {
- const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
-
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel;
- build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
- build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset));
- build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset));
- build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * iq_info.offset * wq_info.offset));
- build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
- build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8");
-
- // Compute the non-per-channel multiplier and shift anyway, to keep the OpenCL kernel simpler
- float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
- build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-
- if(act_info.enabled())
- {
- int a_val{};
- int b_val{};
- std::tie(b_val, a_val) = get_quantized_activation_min_max(act_info, input->info()->data_type(), oq_info);
-
- const int o1 = oq_info.offset;
-
- build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
- build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
- build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
-
- const float s1 = iq_info.scale;
- build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
- build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
- }
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type()));
- build_opts.add_option("-DWEIGHTS_PROMOTED_TYPE=" + get_cl_promoted_type_from_data_type(weights->info()->data_type()));
- }
- else
- {
- build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
- build_opts.add_option_if(act_info.enabled(), "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(win_config.second.x().step()));
- }
-
- build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DIS_F16");
- build_opts.add_option_if(input->info()->data_type() == DataType::F32, "-DIS_F32");
-
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Status CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info,
- const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- std::string kernel_name;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(),
- conv_info, depth_multiplier, kernel_name, dilation)
- .first);
-
- return Status{};
-}
-
-void CLDepthwiseConvolutionLayer3x3NCHWKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- // Create input window and adjust
- Window collapsed_in = collapsed;
- collapsed_in.adjust(Window::DimX, -_conv_pad_left, true);
- collapsed_in.adjust(Window::DimY, -_conv_pad_top, true);
- collapsed_in.set_dimension_step(Window::DimX, collapsed_in.x().step() * _conv_stride_x);
- collapsed_in.set_dimension_step(Window::DimY, collapsed_in.y().step() * _conv_stride_y);
-
- Window slice_in = collapsed_in.first_slice_window_3D();
- Window slice_out = collapsed.first_slice_window_3D();
- Window slice_weights = window.first_slice_window_3D();
- slice_weights.set_dimension_step(Window::DimX, 0);
- slice_weights.set_dimension_step(Window::DimY, 0);
-
- unsigned int idx = 3 * num_arguments_per_3D_tensor();
-
- // Set output multipliers in case of quantized data type
- if(_is_quantized)
- {
- Window slice;
- slice.use_tensor_dimensions(_output_multipliers->info()->tensor_shape());
- add_1D_tensor_argument(idx, _output_multipliers, slice);
- add_1D_tensor_argument(idx, _output_shifts, slice);
- }
-
- // Set biases
- if(_biases != nullptr)
- {
- Window slice_biases;
- slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
- add_1D_tensor_argument(idx, _biases, slice_biases);
- }
-
- do
- {
- idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- add_3D_tensor_argument(idx, _weights, slice_weights);
-
- enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice_out) && collapsed_in.slide_window_slice_3D(slice_in));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
deleted file mode 100644
index c4e475f6f2..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor when the data layout is NCHW.
- */
-class CLDepthwiseConvolutionLayer3x3NCHWKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLDepthwiseConvolutionLayer3x3NCHWKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayer3x3NCHWKernel(const CLDepthwiseConvolutionLayer3x3NCHWKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayer3x3NCHWKernel &operator=(const CLDepthwiseConvolutionLayer3x3NCHWKernel &) = delete;
- /** Default move constructor. */
- CLDepthwiseConvolutionLayer3x3NCHWKernel(CLDepthwiseConvolutionLayer3x3NCHWKernel &&) = default;
- /** Default move assignment operator */
- CLDepthwiseConvolutionLayer3x3NCHWKernel &operator=(CLDepthwiseConvolutionLayer3x3NCHWKernel &&) = default;
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
- */
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NCHWKernel
- *
- * @param[in] input Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. A 3D tensor with dimensions [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor info for quantized computations. In case of per-channel quantization,
- * the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
-
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- BorderSize _border_size;
- const ICLTensor *_input;
- ICLTensor *_output;
- const ICLTensor *_weights;
- const ICLTensor *_biases;
- unsigned int _conv_stride_y;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _is_quantized;
-
- unsigned int _conv_stride_x;
- unsigned int _conv_pad_top;
- unsigned int _conv_pad_left;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H */
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
deleted file mode 100644
index 91a2f5745a..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
-{
- ARM_COMPUTE_UNUSED(act_info);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1);
-
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1);
- ARM_COMPUTE_RETURN_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 4);
-
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
-
- const size_t weights_width = 3;
- const size_t weights_height = 3;
-
- const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
-
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
- *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), info);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(1) != weights_width) || (weights->dimension(2) != weights_height));
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[0]);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
-{
- ARM_COMPUTE_UNUSED(weights, bias);
- ARM_COMPUTE_UNUSED(depth_multiplier);
-
- const bool is_stride_1_dilation_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1) && dilation.x() == 1 && dilation.y() == 1);
- unsigned int num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
-
- Window win{};
- Status err{};
-
- unsigned int num_elems_accessed_per_iteration = adjust_vec_size(4 / input->element_size(), input->dimension(0));
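- // i.e. 4 bytes' worth of channels per iteration (4 / 4 = 1 element for F32, 4 / 2 = 2 for F16),
- // clamped by adjust_vec_size() to the channel count in dimension 0.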
- win = calculate_max_window(*output, Steps(num_elems_accessed_per_iteration, num_rows_processed_per_iteration));
-
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLDepthwiseConvolutionLayer3x3NHWCKernel::CLDepthwiseConvolutionLayer3x3NHWCKernel()
- : _input(), _output(), _weights(), _biases(), _num_planes_processed_per_iteration(1)
-{
-}
-
-void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-}
-
-void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- conv_info, depth_multiplier, act_info, dilation));
-
- auto padding_info = get_padding_info({ input, weights, biases, output });
-
- auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
- conv_info, depth_multiplier, dilation);
-
- const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
- const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
-
- _input = input;
- _output = output;
- _weights = weights;
- _biases = biases;
- _num_planes_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
-
- unsigned int num_elems_accessed_per_iteration = adjust_vec_size(4 / input->info()->element_size(), input->info()->dimension(0));
- unsigned int num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_accessed_per_iteration));
- build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
- build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2)));
- build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_accessed_per_iteration));
- build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
- build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
- "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)))));
- build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
-
- if(is_stride_1_dilation_1)
- {
- build_opts.add_option("-DNUM_ROWS_PROCESSED=" + support::cpp11::to_string(num_rows_processed_per_iteration));
- build_opts.add_option("-DNUM_PLANES_PROCESSED=" + support::cpp11::to_string(_num_planes_processed_per_iteration));
- build_opts.add_option("-DDST_DIM_1=" + support::cpp11::to_string(_output->info()->dimension(1)));
- build_opts.add_option("-DDST_DIM_2=" + support::cpp11::to_string(_output->info()->dimension(2)));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string((input->info()->dimension(1) + conv_info.pad_left() + conv_info.pad_right()) % num_rows_processed_per_iteration));
- }
- else
- {
- build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
- build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
- }
-
- // Create kernel
- std::string kernel_name;
- kernel_name = std::string("depthwise_convolution_3x3_nhwc");
- kernel_name += (is_stride_1_dilation_1 ? "_stride1" : "");
-
- ICLKernel::configure_internal(win_config.second);
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += string_from_data_type(input->info()->data_type());
-}
-
-Status CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(),
- biases != nullptr ? biases->clone().get() : nullptr,
- output->clone().get(), conv_info, depth_multiplier, dilation)
- .first);
- return Status{};
-}
-
-void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- const size_t total_batches = _input->info()->tensor_shape().total_size_upper(3);
-
- Window win = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)) * total_batches, 1));
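- // e.g. dimension(2) = 5, 2 planes per iteration and 3 batches give a Z range of ceil(5 / 2) * 3 = 9 steps.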
-
- unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor();
-
- if(_biases != nullptr)
- {
- Window win_biases;
- win_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
- win_biases.set_dimension_step(Window::DimX, window.x().step());
- add_1D_tensor_argument(idx, _biases, win_biases);
- }
-
- Window slice = win.first_slice_window_4D();
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice);
- add_4D_tensor_argument(idx, _output, slice);
- add_3D_tensor_argument(idx, _weights, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(win.slide_window_slice_4D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
deleted file mode 100644
index ee47d98807..0000000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor when the data layout is NHWC.
- */
-class CLDepthwiseConvolutionLayer3x3NHWCKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLDepthwiseConvolutionLayer3x3NHWCKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayer3x3NHWCKernel(const CLDepthwiseConvolutionLayer3x3NHWCKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayer3x3NHWCKernel &operator=(const CLDepthwiseConvolutionLayer3x3NHWCKernel &) = delete;
- /** Default move constructor. */
- CLDepthwiseConvolutionLayer3x3NHWCKernel(CLDepthwiseConvolutionLayer3x3NHWCKernel &&) = default;
- /** Default move assignment operator */
- CLDepthwiseConvolutionLayer3x3NHWCKernel &operator=(CLDepthwiseConvolutionLayer3x3NHWCKernel &&) = default;
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in] input Source tensor. DataType supported: F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
- * Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- */
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. DataType supported: F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
- * Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
- *
- * @param[in] input Source tensor info. DataType supported: F16/F32.
- * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, 3, 3].
- * Data type supported: Same as @p input.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- const ICLTensor *_weights;
- const ICLTensor *_biases;
-
- unsigned int _num_planes_processed_per_iteration;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H */
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index fcfa7f878d..b95abe795f 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,47 +28,94 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLUtils.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
ARM_COMPUTE_UNUSED(dwc_info);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ bool in_place = false;
+ if (output == nullptr || output == input)
+ {
+ in_place = true;
+ output = input;
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1 && dwc_weights_info.n0 != 1);
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1);
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().second < 1);
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_input_to_cl_image == true));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) &&
+ (export_to_cl_image(weights) == false),
+ "Weights cannot be exported to cl_image!");
+ ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON((conv_info.dilation.x() < 1) || (conv_info.dilation.y() < 1));
const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_UNUSED(idx_c);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * depth_multiplier));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * conv_info.depth_multiplier));
+
+ // In place restrictions
+ if (in_place)
+ {
+ const int weights_width_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const int weights_height_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U ||
+ weights->tensor_shape()[weights_height_idx] != 1U);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier != 1U);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride() != std::make_pair(1U, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size2D(1U, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ conv_info.pad_stride_info
+ .has_padding()); // Note that in principle padding can be supported with in_place but we choose not to support it
+ }
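+ // In short, in-place execution is only accepted for a 1x1, stride-1, dilation-1, unpadded pass with
+ // depth_multiplier == 1, i.e. when every output element overwrites exactly its own input element.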
- const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+ const ConvolutionInfo info{conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(),
+ conv_info.dilation};
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
+
+ if (conv_info.depth_multiplier > 1 && dwc_info.n0 > 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0);
+ }
const bool is_quantized = is_data_type_quantized(input->data_type());
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -78,7 +125,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
}
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
@@ -86,7 +133,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0));
@@ -104,22 +151,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
}
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
- if(is_data_type_quantized(input->data_type()))
+ if (is_data_type_quantized(input->data_type()))
{
const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
+ const UniformQuantizationInfo oq_info =
+ (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
int output_multiplier = 0;
int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
}
return Status{};
@@ -134,111 +183,194 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel
_depth_multiplier(1),
_output_multipliers(nullptr),
_output_shifts(nullptr),
+ _export_input_to_cl_image(false),
+ _export_weights_to_cl_image(false),
_is_quantized(false)
{
+ _type = CLKernelType::DEPTHWISE;
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers,
+ const ICLTensor *output_shifts)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info,
+ output_multipliers, output_shifts);
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers,
+ const ICLTensor *output_shifts)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
- (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+ if (output == nullptr)
+ {
+ // In-place
+ output = input;
+ }
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_info,
+ conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
+ (output_shifts != nullptr) ? output_shifts->info() : nullptr));
+
+ auto padding_info = get_padding_info({input, output});
+
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
+ *(input->info()), *(weights->info()), conv_info);
+ auto_init_if_empty(*(output->info()), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_quantization_info(output->info()->quantization_info()));
+
+ _input = input;
+ _output = output;
+ _weights = weights;
+ _biases = biases;
+ _depth_multiplier = conv_info.depth_multiplier;
+ _output_multipliers = output_multipliers;
+ _output_shifts = output_shifts;
+ _export_input_to_cl_image = dwc_info.export_input_to_cl_image;
+ _export_weights_to_cl_image = dwc_info.export_weights_to_cl_image;
+ _is_quantized = is_data_type_quantized(input->info()->data_type());
+
+ const unsigned int n0 = adjust_vec_size(dwc_info.n0, output->info()->dimension(0));
+ const unsigned int m0 = std::min(dwc_info.m0, (unsigned int)output->info()->dimension(1));
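+ // n0 is the vector width along dst dimension 0 (the channels, in NHWC) and m0 the tile size along
+ // dst dimension 1; both are clamped above so the tile never exceeds the tensor.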
+ std::string kernel_name = "";
- auto padding_info = get_padding_info({ input, output });
+ CLBuildOptions build_opts;
- const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), info);
- auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
+ // Update the padding for the input/weights tensor if we can export to cl_image
+ if (_export_input_to_cl_image)
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(input->info());
+ }
- _input = input;
- _output = output;
- _weights = weights;
- _biases = biases;
- _depth_multiplier = depth_multiplier;
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _is_quantized = is_data_type_quantized(input->info()->data_type());
+ if (_export_weights_to_cl_image)
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weights->info());
+ }
- const unsigned int n0 = adjust_vec_size(dwc_weights_info.n0, input->info()->dimension(0));
+ // The conditions under which -cl-fast-relaxed-math causes accuracy issues are tracked in COMPMID-5324
+ const GPUTarget gpu_target = get_target();
+ const auto act_function = conv_info.act_info.activation();
+ const auto dst_data_type = _output->info()->data_type();
- CLBuildOptions build_opts;
- build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
- build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1, "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(_output->info()->dimension(2))));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(dwc_info.activation_info.activation())));
- build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
+ if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+ act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+ (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
+ {
+ // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations.
+ // To avoid enabling -cl-finite-math-only, we only pass -cl-unsafe-math-optimizations.
+ build_opts.add_option("-cl-unsafe-math-optimizations");
+ }
+ else
+ {
+ build_opts.add_option("-cl-fast-relaxed-math");
+ }
+
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function)));
+ build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(conv_info.depth_multiplier));
+ build_opts.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER");
+ // Note: SRC_DATA_TYPE must be the same data type as WEI_DATA_TYPE. In the quantized case, the
+ // activations and the weights could in principle use different data types. However, since the
+ // implementation only works when both share the same data type, the offsets are adjusted to account for this.
+ build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
+ build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
+ build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
+ build_opts.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(1)));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(2)));
+ build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(1)));
+ build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(2)));
+ build_opts.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(_weights->info()->dimension(1)));
+ build_opts.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(_weights->info()->dimension(2)));
+ build_opts.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(_weights->info()->data_type()));
+ build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_stride_info.pad_top()));
+ build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_stride_info.pad_left()));
+ build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.pad_stride_info.stride().first));
+ build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.pad_stride_info.stride().second));
+ build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(conv_info.dilation.x()));
+ build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(conv_info.dilation.y()));
build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DSRC_DIM1=" + support::cpp11::to_string(_input->info()->dimension(1)));
- build_opts.add_option("-DSRC_DIM2=" + support::cpp11::to_string(_input->info()->dimension(2)));
- build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(weights->info()->dimension(1)));
- build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(weights->info()->dimension(2)));
- build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
- build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(_input->info()->dimension(0) % n0));
-
- std::string kernel_name = (_is_quantized) ? "dwc_MxN_native_quantized8_nhwc" : "dwc_MxN_native_fp_nhwc";
-
- if(_is_quantized)
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1));
+ build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1",
+ "-DN0_A=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0));
+ build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION");
+
+ // Force unrolling with a pragma when any of the following values exceeds the maximum manual-unroll count
+ set_unroll_with_pragma(build_opts, {static_cast<int>(_weights->info()->dimension(1) + m0 - 1),
+ static_cast<int>(_weights->info()->dimension(1)),
+ static_cast<int>(_weights->info()->dimension(2))});
+
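As a worked example of the values checked here, assuming a 9x9 filter and m0 = 4; the unroll threshold and define name below are assumptions, not taken from this diff:

// With weights dimension(1) == 9 (width), dimension(2) == 9 (height) and m0 == 4:
//   M0_A           = 9 + 4 - 1 = 12
//   checked values = {12, 9, 9}
// If any value exceeded the manual-unroll limit (assumed to be 128), the helper
// would add a define (assumed: -DUNROLL_WITH_PRAGMA) so the .cl code falls back
// from hand-expanded loops to #pragma unroll.
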
+ if (biases != nullptr)
{
- const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
+ build_opts.add_option(std::string("-DHAS_BIAS"));
+ build_opts.add_option(
+ std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type())));
+ }
- build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset));
- build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset));
- build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
- build_opts.add_option_if(is_data_type_quantized_per_channel(weights->info()->data_type()), "-DPER_CHANNEL_QUANTIZATION");
+ if (_is_quantized)
+ {
+ kernel_name = "dwc_native_quantized_nhwc";
+ const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
+ const UniformQuantizationInfo wqinfo = weights->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
- // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
- float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+ PixelValue zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
+ int zero_value_s32;
+ zero_value.get(zero_value_s32);
+
+ float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
int output_multiplier = 0;
int output_shift = 0;
quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
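A minimal sketch of the fixed-point decomposition performed by calculate_quantized_multiplier(), under the assumed convention multiplier ~= q_mult * 2^(-31) * 2^(-shift); function and variable names are illustrative:

#include <cmath>
#include <cstdint>

// Decompose 'multiplier' so that multiplier ~= q_mult * 2^(-31) * 2^(-shift).
static void decompose(float multiplier, int32_t *q_mult, int *shift)
{
    int exponent = 0;
    const double q = std::frexp(multiplier, &exponent); // multiplier = q * 2^exponent, q in [0.5, 1)
    *shift = -exponent;
    int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
    if (q_fixed == (1ll << 31)) // q rounded up to exactly 1.0
    {
        q_fixed /= 2;
        --(*shift);
    }
    *q_mult = static_cast<int32_t>(q_fixed);
}
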
- build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-
- if(dwc_info.activation_info.enabled())
- {
- int a_val{};
- int b_val{};
- std::tie(b_val, a_val) = get_quantized_activation_min_max(dwc_info.activation_info, input->info()->data_type(), oq_info);
-
- const int o1 = oq_info.offset;
-
- build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
- build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
- build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
-
- const float s1 = iq_info.scale;
- build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
- build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
- }
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type()));
+ build_opts.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_opts.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+ build_opts.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+ build_opts.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+ build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+ build_opts.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
+ build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" +
+ get_cl_type_from_data_type(_output_multipliers->info()->data_type()));
+ build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" +
+ get_cl_type_from_data_type(_output_shifts->info()->data_type()));
+ build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL,
+ "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR");
+ // Note: We expect the input and output tensors to always adopt a per-tensor quantization approach
+ int a_val{};
+ int b_val{};
+ std::tie(b_val, a_val) =
+ get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo);
+
+ build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + support::cpp11::to_string(a_val));
+ build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + support::cpp11::to_string(b_val));
}
else
{
- build_opts.add_option_if(dwc_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(dwc_info.activation_info.a()));
- build_opts.add_option_if(dwc_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(dwc_info.activation_info.b()));
+ kernel_name = "dwc_native_fp_nhwc";
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option_if(conv_info.act_info.enabled(),
+ "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a()));
+ build_opts.add_option_if(conv_info.act_info.enabled(),
+ "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b()));
}
- Window win = calculate_max_window(*(output->info()), Steps(n0));
+ Window win = calculate_max_window(*(output->info()), Steps(n0, m0));
ICLKernel::configure_internal(win);
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -263,11 +395,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
_config_id += string_from_data_type(input->info()->data_type());
}
-Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts));
return Status{};
}
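A hypothetical usage sketch of the validate-before-configure idiom this static function enables; the surrounding tensor and info objects are assumed to exist:

const Status status = CLDepthwiseConvolutionLayerNativeKernel::validate(
    input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr,
    output->info(), dwc_info, conv_info);
if (!status)
{
    // Fall back to another depthwise implementation rather than configuring this kernel.
}
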
@@ -278,37 +416,61 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
// Collapse window
Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
- Window slice_in = window.first_slice_window_4D();
- Window slice_out = window_collapsed.first_slice_window_4D();
- if(_depth_multiplier != 1)
- {
- ARM_COMPUTE_ERROR_ON(slice_out.x().step() != 1);
- slice_out.set(Window::DimX, Window::Dimension(0, _input->info()->tensor_shape()[0], 1));
- }
+ Window slice = window_collapsed.first_slice_window_4D();
- unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor();
+ cl::Image2D input_cl_image;
+ cl::Image2D weights_cl_image;
- // Set output multipliers in case of quantized data type
- if(_is_quantized)
+ if (_export_input_to_cl_image || _export_weights_to_cl_image)
{
- add_1D_tensor_argument(idx, _output_multipliers, slice_in);
- add_1D_tensor_argument(idx, _output_shifts, slice_in);
+ // Export cl_buffer to cl_image
+ if (_export_input_to_cl_image)
+ {
+ const size_t image_w = _input->info()->dimension(0) / 4;
+ const size_t image_h =
+ _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
+ const TensorShape shape2d(image_w, image_h);
+ const size_t image_row_pitch = _input->info()->strides_in_bytes()[1];
+ input_cl_image =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d,
+ _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ }
+
+ if (_export_weights_to_cl_image)
+ {
+ const size_t image_w = _weights->info()->dimension(0) / 4;
+ const size_t image_h =
+ _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
+ const TensorShape shape2d(image_w, image_h);
+ const size_t image_row_pitch = _weights->info()->strides_in_bytes()[1];
+ weights_cl_image =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d,
+ _weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ }
}
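A worked instance of the buffer-to-image mapping above, assuming an NHWC input of shape [C, W, H, N] = [64, 16, 16, 1]:

#include <cstddef>

// Each image texel packs 4 elements, so:
constexpr size_t image_w = 64 / 4;       // 16 texels per row (C / 4)
constexpr size_t image_h = 16 * 16 * 1;  // 256 rows (W * H * N collapsed)
// image_row_pitch is strides_in_bytes()[1], the byte step between consecutive W
// positions; it must satisfy the device's image pitch alignment for the view to be valid.
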
- if(_biases != nullptr)
+ unsigned int idx = 0;
+ if (_export_input_to_cl_image)
{
- add_1D_tensor_argument(idx, _biases, slice_in);
+ _kernel.setArg(idx++, input_cl_image);
}
-
- do
+ add_4d_tensor_nhwc_argument(idx, _input);
+ add_4d_tensor_nhwc_argument(idx, _output);
+ if (_export_weights_to_cl_image)
+ {
+ _kernel.setArg(idx++, weights_cl_image);
+ }
+ add_4d_tensor_nhwc_argument(idx, _weights);
+ if (_is_quantized)
+ {
+ add_1D_tensor_argument(idx, _output_multipliers, slice);
+ add_1D_tensor_argument(idx, _output_shifts, slice);
+ }
+ if (_biases != nullptr)
{
- idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- add_3D_tensor_argument(idx, _weights, slice_out);
- enqueue(queue, *this, slice_out, lws_hint());
+ add_1D_tensor_argument(idx, _biases, slice);
}
- while(window_collapsed.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+ enqueue(queue, *this, slice, lws_hint());
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
index 325f4e7067..d34a662966 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,10 @@
#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/function_info/ConvolutionInfo.h"
+
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
@@ -47,85 +48,84 @@ public:
CLDepthwiseConvolutionLayerNativeKernel(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
/** Allow instances of this class to be moved */
CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
+
/** Initialize the function's source, destination and parameters
*
+ * @param[in] compile_context The compile context to be used.
* @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
* @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M].
* Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
* @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
* Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
+ * @param[out] output Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: Same as @p input.
* @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] conv_info Convolution info (padding, stride, dilation, ...)
* @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
* the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
* @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
* the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ *
+ * @note: In-place is only supported when
+ * * data layout: NHWC
+ * * filter: 1x1
+ * * @p depth_multiplier: 1
+ * * strides: 1
+ * * dilation: 1
+ * * no padding
+ * * no change of data layout after configure
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
- /** Initialize the function's source, destination and parameters
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers = nullptr,
+ const ICLTensor *output_shifts = nullptr);
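A hypothetical configuration sketch for the in-place path documented above (valid only under the restrictions in the note; all objects are assumed to exist):

CLDepthwiseConvolutionLayerNativeKernel kernel;
kernel.configure(CLKernelLibrary::get().get_compile_context(),
                 &src,      // read and written in place
                 &weights,  // 1x1 filter, depth_multiplier 1, unit stride/dilation, no padding
                 nullptr,   // no biases
                 nullptr,   // nullptr output selects the in-place path (output = input)
                 dwc_info, conv_info);
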
+
+ /** Initialize the function's source, destination and parameters
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
- * @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers = nullptr,
+ const ICLTensor *output_shifts = nullptr);
+
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
- * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, N, M].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
- * @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
+ * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
- const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers = nullptr,
+ const ITensorInfo *output_shifts = nullptr);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
- const ICLTensor *_input;
- const ICLTensor *_weights;
- const ICLTensor *_biases;
- ICLTensor *_output;
- unsigned int _depth_multiplier;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _is_quantized;
+ const ICLTensor *_input{};
+ const ICLTensor *_weights{};
+ const ICLTensor *_biases{};
+ ICLTensor *_output{};
+ unsigned int _depth_multiplier{0};
+ const ICLTensor *_output_multipliers{};
+ const ICLTensor *_output_shifts{};
+ bool _export_input_to_cl_image{false};
+ bool _export_weights_to_cl_image{true};
+ bool _is_quantized{false};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
index ff04708b5b..3d8f875ef7 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,9 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -35,17 +38,20 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -55,7 +61,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *output,
+ ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_UNUSED(idx, config);
@@ -67,20 +76,27 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-CLFFTDigitReverseKernel::CLFFTDigitReverseKernel()
- : _input(nullptr), _output(nullptr), _idx(nullptr)
+CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() : _input(nullptr), _output(nullptr), _idx(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, idx, config);
}
-void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
- auto padding_info = get_padding_info({ input, output, idx });
+ auto padding_info = get_padding_info({input, output, idx});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
_input = input;
@@ -111,10 +127,14 @@ void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context,
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
return Status{};
}
@@ -134,7 +154,6 @@ void CLFFTDigitReverseKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _output, slice);
add_1D_tensor_argument(idx, _idx, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
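Conceptually, the digit-reverse pass is a permutation through the precomputed U32 index tensor. A per-element sketch under one common convention (dst, src, idx and n are illustrative; the inverse scatter convention is equivalent):

for (size_t i = 0; i < n; ++i)
{
    dst[i] = src[idx[i]]; // gather the mixed-radix digit-reversed element along the configured axis
}
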
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.h b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
index e5583a4c22..fdd1bcc3d3 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.h
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
#define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
// Forward declarations
@@ -56,7 +56,8 @@ public:
* @param[in] idx Digit reverse index tensor. Data type supported: U32
* @param[in] config Kernel configuration.
*/
- void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+ void
+ configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -65,7 +66,11 @@ public:
* @param[in] idx Digit reverse index tensor. Data type supported: U32
* @param[in] config Kernel configuration.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
@@ -75,7 +80,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
index 779bf43922..3729e6b77d 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,8 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -45,11 +47,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] % config.radix);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -58,9 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
{
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output, *input);
}
@@ -75,9 +78,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-CLFFTRadixStageKernel::CLFFTRadixStageKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTRadixStageKernel::CLFFTRadixStageKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
@@ -85,11 +88,15 @@ void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
+void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTRadixStageKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
- auto padding_info = get_padding_info({ input, output });
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -108,11 +115,12 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set static arguments if not the first stage
- if(!config.is_first_stage)
+ if (!config.is_first_stage)
{
const unsigned int Ni = config.Nx * config.radix;
const float exp_const = (-2.0 * M_PI) / static_cast<float>(Ni);
- unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ unsigned int idx =
+ (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
_kernel.setArg<cl_uint>(idx++, config.Nx);
_kernel.setArg<cl_uint>(idx++, Ni);
_kernel.setArg<cl_float>(idx, exp_const);
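A worked example of these static arguments: with Nx = 8 and radix = 4, Ni = 32 and exp_const = -2*pi/32 ~= -0.19635, so the stage's k-th twiddle factor is exp(i * k * exp_const). A sketch with an illustrative helper:

#include <cmath>
#include <complex>

// k-th twiddle factor for this stage (illustrative helper, not library code)
static std::complex<float> twiddle(unsigned int k, float exp_const)
{
    return {std::cos(k * exp_const), std::sin(k * exp_const)};
}
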
@@ -134,21 +142,22 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+Status CLFFTRadixStageKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const FFTRadixStageKernelInfo &config)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (run_in_place) ? nullptr : output->clone().get(),
- config)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config)
+ .first);
return Status{};
}
std::set<unsigned int> CLFFTRadixStageKernel::supported_radix()
{
- return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+ return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
}
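For illustration, a transform length is supported only if it factors entirely into these radixes. A greedy decomposition sketch (the helper is illustrative; the library may decompose differently):

#include <set>
#include <vector>

static std::vector<unsigned int> decompose(unsigned int n, const std::set<unsigned int> &radixes)
{
    std::vector<unsigned int> stages;
    auto it = radixes.rbegin(); // try the largest radix first
    while (n > 1 && it != radixes.rend())
    {
        if (n % *it == 0)
        {
            stages.push_back(*it);
            n /= *it;
        }
        else
        {
            ++it;
        }
    }
    return (n == 1) ? stages : std::vector<unsigned int>{}; // empty: unsupported length
}

// decompose(320, {2, 3, 4, 5, 7, 8}) -> {8, 8, 5}
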
void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -163,12 +172,11 @@ void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.h b/src/core/CL/kernels/CLFFTRadixStageKernel.h
index 9bb310db83..de80bfced3 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.h
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
#define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
#include <set>
namespace arm_compute
@@ -69,7 +69,10 @@ public:
* @param[out] output Destination tensor. Can be nullptr. Data type supported: same as @p input
* @param[in] config FFT descriptor metadata.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTRadixStageKernelInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp
index c80f774c6a..be6e16b074 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.cpp
+++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,9 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -41,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -52,9 +55,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
}
} // namespace
-CLFFTScaleKernel::CLFFTScaleKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTScaleKernel::CLFFTScaleKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
@@ -62,11 +65,14 @@ void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTS
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
+void CLFFTScaleKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTScaleKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -75,20 +81,22 @@ void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTen
// Create kernel
CLBuildOptions build_opts;
build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() : input->info()->num_channels()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels()
+ : input->info()->num_channels()));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option_if(config.conjugate, "-DCONJ");
std::string kernel_name = "fft_scale_conj";
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set static arguments
- unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ unsigned int idx =
+ (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
_kernel.setArg<cl_float>(idx, config.scale);
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
- if(output != nullptr)
+ if (output != nullptr)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
@@ -127,12 +135,11 @@ void CLFFTScaleKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
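The scalar factor and the CONJ define fit the standard inverse-transform identity ifft(x) = conj(fft(conj(x))) / N. A self-contained sketch using a naive DFT as a stand-in for the FFT pipeline (all names illustrative):

#include <cmath>
#include <complex>
#include <vector>

// Naive forward DFT, stand-in for the FFT stages (illustrative only).
static std::vector<std::complex<float>> dft(const std::vector<std::complex<float>> &x)
{
    const float pi = 3.14159265358979f;
    std::vector<std::complex<float>> out(x.size());
    for (size_t k = 0; k < x.size(); ++k)
        for (size_t j = 0; j < x.size(); ++j)
            out[k] += x[j] * std::polar(1.0f, -2.0f * pi * k * j / x.size());
    return out;
}

// ifft(x) = conj(dft(conj(x))) / N: the conjugations map to the CONJ define and
// the 1/N factor to config.scale.
static std::vector<std::complex<float>> ifft(std::vector<std::complex<float>> x)
{
    for (auto &v : x)
        v = std::conj(v);
    x = dft(x);
    const float scale = 1.0f / static_cast<float>(x.size());
    for (auto &v : x)
        v = std::conj(v) * scale;
    return x;
}
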
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.h b/src/core/CL/kernels/CLFFTScaleKernel.h
index cc518be193..b995282e02 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.h
+++ b/src/core/CL/kernels/CLFFTScaleKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H
#define ARM_COMPUTE_CLFFTSCALEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
// Forward declarations
@@ -63,7 +63,10 @@ public:
* @param[out] output Destination tensor. Data type supported: same as @p input
* @param[in] config Kernel configuration
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTScaleKernelInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 840ed0ca2f..86bb502da3 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,16 +29,18 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
#include "support/StringSupport.h"
namespace arm_compute
{
-CLFillBorderKernel::CLFillBorderKernel()
- : ICLKernel(), _tensor(nullptr)
+CLFillBorderKernel::CLFillBorderKernel() : ICLKernel(), _tensor(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
bool CLFillBorderKernel::is_parallelisable() const
@@ -54,27 +56,38 @@ void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue
ICLKernel::add_argument<T>(idx, static_cast<T>(value));
}
-void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
configure(CLKernelLibrary::get().get_compile_context(), tensor, border_size, border_mode, constant_border_value);
}
-void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
_tensor = tensor;
configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value);
}
-void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1);
- auto padding_info = get_padding_info({ tensor });
+ auto padding_info = get_padding_info({tensor});
border_size.limit(tensor->padding());
// If there is no border: early exit
- if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
+ if (border_size.empty() || border_mode == BorderMode::UNDEFINED)
{
return;
}
@@ -96,25 +109,22 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Create static kernel arguments
- const unsigned int valid_width = tensor->valid_region().shape[0];
- const unsigned int valid_height = tensor->valid_region().shape[1];
- const cl_int2 valid_region_coords =
- {
- {
- static_cast<cl_int>(tensor->valid_region().anchor[0]),
- static_cast<cl_int>(tensor->valid_region().anchor[1]),
- }
- };
- const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+ const unsigned int valid_width = tensor->valid_region().shape[0];
+ const unsigned int valid_height = tensor->valid_region().shape[1];
+ const cl_int2 valid_region_coords = {{
+ static_cast<cl_int>(tensor->valid_region().anchor[0]),
+ static_cast<cl_int>(tensor->valid_region().anchor[1]),
+ }};
+ const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
// Set static kernel arguments
unsigned int idx = num_arguments_per_3D_tensor(); // Skip the tensor parameters
ICLKernel::add_argument<cl_uint>(idx, valid_width);
ICLKernel::add_argument<cl_uint>(idx, valid_height);
ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
- if(BorderMode::CONSTANT == border_mode)
+ if (BorderMode::CONSTANT == border_mode)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -173,12 +183,13 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen
void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
{
// Border mode undefined or border width == 0
- if(_kernel() == nullptr)
+ if (_kernel() == nullptr)
{
return;
}
- const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ const auto tensor =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
@@ -191,14 +202,13 @@ void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::
unsigned int idx = 0;
add_3D_tensor_argument(idx, tensor, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
{
// Border mode undefined or border width == 0
- if(_kernel() == nullptr)
+ if (_kernel() == nullptr)
{
return;
}
@@ -214,7 +224,6 @@ void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
unsigned int idx = 0;
add_3D_tensor_argument(idx, _tensor, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
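For context, the border semantics being configured: CONSTANT fills everything outside the valid region with constant_border_value, REPLICATE clamps reads to the nearest valid element, and UNDEFINED makes configure() exit early so the kernel writes nothing. A conceptual clamp helper (illustrative, not the OpenCL code):

#include <algorithm>

// REPLICATE: out-of-region coordinates map to the nearest element of the valid region.
static int clamp_to_valid(int coord, int valid_start, int valid_end /* exclusive */)
{
    return std::max(valid_start, std::min(coord, valid_end - 1));
}
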
diff --git a/src/core/CL/kernels/CLFillBorderKernel.h b/src/core/CL/kernels/CLFillBorderKernel.h
index 7951f48171..5782143cf9 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.h
+++ b/src/core/CL/kernels/CLFillBorderKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -57,7 +58,11 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Initialise the kernel's input, output and border mode.
*
* @param[in,out] tensor Tensor to process Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
@@ -65,7 +70,10 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Initialise the kernel's input, output and border mode.
*
* @param[in] compile_context The compile context to be used.
@@ -74,7 +82,11 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Function to set the constant value on fill border kernel depending on type.
*
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
index 2116239080..7da0679ae4 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,20 +29,27 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status validate_arguments(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -53,43 +60,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1);
- if(fbn_type == FuseBatchNormalizationType::CONVOLUTION)
+ if (fbn_type == FuseBatchNormalizationType::CONVOLUTION)
{
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0));
}
else
{
- const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t channel_idx =
+ get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0));
}
// Validate bias
- if(input_bias != nullptr)
+ if (input_bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias);
}
// Validate beta
- if(bn_beta != nullptr)
+ if (bn_beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta);
}
// Validate gamma
- if(bn_gamma != nullptr)
+ if (bn_gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma);
}
// Validate output weights
- if(fused_weights != nullptr && fused_weights->total_size() != 0)
+ if (fused_weights != nullptr && fused_weights->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights);
}
// Validate output bias
- if(fused_bias != nullptr && fused_bias->total_size() != 0)
+ if (fused_bias != nullptr && fused_bias->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias);
@@ -100,27 +108,52 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
} // namespace
CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel()
- : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
- _run_in_place_weights(false), _run_in_place_bias(false)
+ : _input_weights(nullptr),
+ _input_bias(nullptr),
+ _bn_mean(nullptr),
+ _bn_var(nullptr),
+ _bn_gamma(nullptr),
+ _bn_beta(nullptr),
+ _fused_weights(nullptr),
+ _fused_bias(nullptr),
+ _epsilon(),
+ _run_in_place_weights(false),
+ _run_in_place_bias(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
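
The pair of overloads reflowed here follows the library-wide forwarding idiom: the legacy, context-free configure() stays a one-liner that resolves the default compile context, so a single implementation carries both entry points. A generic sketch (ExampleKernel is hypothetical):

    class ExampleKernel
    {
    public:
        void configure(const ICLTensor *src, ICLTensor *dst)
        {
            // Legacy entry point: forward to the CLCompileContext overload.
            configure(CLKernelLibrary::get().get_compile_context(), src, dst);
        }
        void configure(const CLCompileContext &compile_context, const ICLTensor *src, ICLTensor *dst);
    };
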
-void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
- auto padding_info = get_padding_info({ input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma });
+ auto padding_info =
+ get_padding_info({input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma});
_input_weights = input_weights;
_input_bias = input_bias;
@@ -133,28 +166,28 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
_epsilon = epsilon;
_run_in_place_weights = (fused_weights == nullptr) || (fused_weights == input_weights);
- _run_in_place_bias = (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
+ _run_in_place_bias =
+ (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
// Auto initialize outputs
- if(_fused_weights != nullptr)
+ if (_fused_weights != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone());
}
- if(_fused_bias != nullptr)
+ if (_fused_bias != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
}
// Validate arguments
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(),
- (fused_weights != nullptr) ? fused_weights->info() : nullptr,
- (fused_bias != nullptr) ? fused_bias->info() : nullptr,
- (input_bias != nullptr) ? input_bias->info() : nullptr,
- (bn_beta != nullptr) ? bn_beta->info() : nullptr,
- (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
- epsilon, fbn_type));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input_weights->info(), bn_mean->info(), bn_var->info(),
+ (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+ (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr,
+ (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, epsilon,
+ fbn_type));
// Configure kernel window
Window win = calculate_max_window(*input_weights->info());
@@ -163,7 +196,8 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input_weights->info()->data_type()));
- build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
+ build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION,
+ "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
build_opts.add_option_if(_input_weights->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
build_opts.add_option_if(_run_in_place_weights, "-DIN_PLACE_W");
@@ -178,12 +212,19 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
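
The build-option lines rewrapped above assemble the -D macros baked into the OpenCL program at compile time. A hedged, standalone sketch of the same CLBuildOptions mechanics with placeholder values (the helper name is ours, not the library's):

    CLBuildOptions make_example_options()
    {
        CLBuildOptions opts;
        opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(DataType::F32));
        opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(0.001f));
        opts.add_option_if(true /* e.g. data layout is NHWC */, "-DNHWC");
        return opts; // opts.options() is what create_kernel() ultimately consumes
    }
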
-Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
return Status{};
}
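
The static validate() above is the usual dry-run entry point: callers probe a configuration with bare ITensorInfo objects before any CL resources exist. A hedged sketch with made-up shapes that satisfy validate_arguments() (the OFM of the conv weights, dimension 3, matches the length of the BN statistics vectors):

    const TensorInfo weights(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32);
    const TensorInfo mean(TensorShape(32U), 1, DataType::F32);
    const TensorInfo var(TensorShape(32U), 1, DataType::F32);
    TensorInfo       fused_w(weights); // same shape/type as the input weights
    TensorInfo       fused_b(mean);    // same shape/type as the BN mean

    const Status st = CLFuseBatchNormalizationKernel::validate(&weights, &mean, &var, &fused_w, &fused_b);
    if (st.error_code() != ErrorCode::OK)
    {
        // Reject the configuration; st.error_description() carries the reason.
    }
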
@@ -200,25 +241,25 @@ void CLFuseBatchNormalizationKernel::run(const arm_compute::Window &window, cl::
// Add kernel arguments
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input_weights, slice_3d);
- if(_input_bias != nullptr)
+ if (_input_bias != nullptr)
{
add_1D_tensor_argument(idx, _input_bias, slice_1d);
}
add_1D_tensor_argument(idx, _bn_mean, slice_1d);
add_1D_tensor_argument(idx, _bn_var, slice_1d);
- if(!_run_in_place_weights)
+ if (!_run_in_place_weights)
{
add_3D_tensor_argument(idx, _fused_weights, slice_3d);
}
- if(!_run_in_place_bias)
+ if (!_run_in_place_bias)
{
add_1D_tensor_argument(idx, _fused_bias, slice_1d);
}
- if(_bn_beta != nullptr)
+ if (_bn_beta != nullptr)
{
add_1D_tensor_argument(idx, _bn_beta, slice_1d);
}
- if(_bn_gamma != nullptr)
+ if (_bn_gamma != nullptr)
{
add_1D_tensor_argument(idx, _bn_gamma, slice_1d);
}
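
A short sketch of the argument-packing contract behind the run() hunk above: idx is a cursor that each add_*_tensor_argument() helper advances by the number of underlying cl_kernel arguments (cf. num_arguments_per_2D_tensor() in the deleted kernels below), and optional tensors are appended only when the matching -D option compiled the corresponding parameters into the OpenCL source. Names come from the diff; the surrounding run() override is elided:

    unsigned int idx = 0;                                   // argument cursor
    add_3D_tensor_argument(idx, _input_weights, slice_3d);  // buffer + strides/offsets; idx advances
    if (_bn_beta != nullptr)                                // appended only when the program
    {                                                       // was built with the beta path
        add_1D_tensor_argument(idx, _bn_beta, slice_1d);
    }
    enqueue(queue, *this, slice_3d, lws_hint());            // launch over the current slice
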
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
index 78b1e74cab..76ec7a759f 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
@@ -62,9 +62,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
*/
- void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Set the source, destination of the kernel
*
* @param[in] compile_context The compile context to be used.
@@ -81,9 +88,17 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -101,10 +116,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
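
Taken together, the reflowed declarations leave call sites untouched. A hedged usage sketch (tensor creation and allocation elided; conv_weights and friends are hypothetical, pre-configured CLTensor objects):

    CLFuseBatchNormalizationKernel fuse;
    // Trailing parameters left at their defaults: no bias/beta/gamma inputs,
    // epsilon = 0.001f, fbn_type = FuseBatchNormalizationType::CONVOLUTION.
    fuse.configure(&conv_weights, &bn_mean, &bn_var, &fused_weights, &fused_bias);
    CLScheduler::get().enqueue(fuse);

Passing a null fused_weights (or fused_weights == input_weights) instead selects the in-place path tracked by _run_in_place_weights in the .cpp hunk above.
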
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
deleted file mode 100644
index 9215fd602d..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
+++ /dev/null
@@ -1,334 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- if(input0->data_type() == DataType::QASYMM8)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
- const int m = gemm_info.m();
- const int n = gemm_info.n();
- const int k = gemm_info.k();
-
- ARM_COMPUTE_UNUSED(m);
- ARM_COMPUTE_UNUSED(n);
- ARM_COMPUTE_UNUSED(k);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k));
- if(gemm_info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
- }
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- Window win{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_output_as_3d = false;
- }
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- // RHS matrix still needs padding on the X
- AccessWindowStatic input1_access(input1, 0, 0,
- ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
- input1->dimension(1));
-
- window_changed = update_window_and_padding(win, input1_access); // window used by the execute_window_loop
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyNativeKernel::CLGEMMLowpMatrixMultiplyNativeKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-
- // We still need padding on the X dimension for the RHS matrix
- auto padding_info = get_padding_info({ input0, output });
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
- // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
- // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1);
- // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
- const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
- const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
-
- // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
- // NOTE: This might have implications on heuristics and performance
- const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- std::string kernel_name("gemmlowp_mm_native");
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
- _config_id += "_";
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k());
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.k0);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- output->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
-        // The batch stride (strides_in_bytes()[3]) of matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
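
One detail worth calling out from the deleted validate_arguments(): the block-size guard encodes "power of two or 3" with a bit trick. A hedged standalone restatement (the helper name is ours, not the library's):

    // Mirrors ((k0 & (k0 - 1)) && k0 != 3) plus the separate k0 > 16 check:
    // k0 & (k0 - 1) == 0 holds exactly for powers of two, and 3 is whitelisted.
    constexpr bool k0_passes(unsigned int k0)
    {
        return ((k0 & (k0 - 1)) == 0 || k0 == 3) && k0 <= 16;
    }
    // The documented set is {2,3,4,8,16}; strictly, 0 and 1 also satisfy the bit
    // test, so callers were relied upon not to request those block sizes.
    static_assert(k0_passes(8) && k0_passes(3) && !k0_passes(6) && !k0_passes(32), "k0 guard");
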
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
deleted file mode 100644
index 125f0c6948..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */
-class CLGEMMLowpMatrixMultiplyNativeKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyNativeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyNativeKernel(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyNativeKernel &operator=(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyNativeKernel(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyNativeKernel &operator=(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyNativeKernel
- *
- * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor info for the RHS matrix. Data type supported: same as @p input0
- * @param[in] output Output tensor info. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
- bool _use_dummy_work_items;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H*/
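
The deleted header's copy/move block is a pattern repeated across these kernel classes; a minimal sketch of the rationale (class name hypothetical):

    class PointerHoldingKernel
    {
    public:
        PointerHoldingKernel() = default;
        // Copying is deleted because the class stores raw ICLTensor pointers and a
        // copy could outlive, or double-drive, the tensors it points at.
        PointerHoldingKernel(const PointerHoldingKernel &)            = delete;
        PointerHoldingKernel &operator=(const PointerHoldingKernel &) = delete;
        // Moving merely transfers the pointers, so the defaults are safe.
        PointerHoldingKernel(PointerHoldingKernel &&)            = default;
        PointerHoldingKernel &operator=(PointerHoldingKernel &&) = default;

    private:
        const ICLTensor *_input{nullptr};
    };
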
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
deleted file mode 100644
index 848f272e50..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
- ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
-
- const int m = gemm_info.m();
- const int n = gemm_info.n();
- const int k = gemm_info.k();
-
- TensorShape tensor_shape0{ input0->tensor_shape() };
- tensor_shape0.set(0, k);
- tensor_shape0.set(1, m);
-
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
- const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
-
- TensorInfo tmp_info(*output);
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
- Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- return std::make_pair(Status{}, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info);
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
- _k = gemm_info.k();
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- auto padding_info = get_padding_info({ input0, input1, output });
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1);
-
- const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
- const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
- build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
- build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-
- std::string kernel_name("gemmlowp_mm_reshaped_");
- kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
- kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
- _config_id += "_";
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k());
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.v0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.h0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.interleave);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.interleave);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- output->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
-        // The batch stride (strides_in_bytes()[3]) of matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4;
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
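
The config_id built in the deleted configure() is effectively a tuner cache key: the kernel name plus every shape and block parameter, '_'-separated, so the LWS tuner can store one local work size per distinct configuration. A hedged, self-contained restatement (function name and parameter list are ours):

    #include <string>
    #include "support/StringSupport.h"

    std::string make_gemmlowp_config_id(unsigned int out_h, unsigned int out_w, unsigned int k,
                                        unsigned int batches, unsigned int m0, unsigned int n0, unsigned int k0)
    {
        std::string id = "gemmlowp_mm_reshaped_lhs_nt_rhs_t"; // non-transposed LHS, transposed RHS
        for (unsigned int v : {out_h, out_w, k, batches, m0, n0, k0})
        {
            id += "_" + support::cpp11::to_string(v);
        }
        return id;
    }
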
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
deleted file mode 100644
index 06a73f173d..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped
- *
- * @note The input matrices @p input0 and @p input1 must be reshaped through:
- * - @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel
- * - @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel
- */
-class CLGEMMLowpMatrixMultiplyReshapedKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyReshapedKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyReshapedKernel(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyReshapedKernel(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedKernel
- *
- * @param[in] input0 Input tensor info containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] input1 Input tensor info containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] output Output tensor info. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 2,3,4,8,16
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_output_as_3d;
- unsigned int _k;
- bool _use_dummy_work_items;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H*/
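
For reference, block descriptors that satisfy the constraints listed in this deleted doxygen (LHS non-transposed, RHS transposed, shared k0). A hedged sketch; the concrete values are illustrative, not tuned:

    GEMMLHSMatrixInfo lhs_info;
    lhs_info.m0         = 4;            // allowed: 2,3,4,5,6,7,8
    lhs_info.k0         = 8;            // allowed: 2,3,4,8,16
    lhs_info.v0         = 2;            // vertical blocks interleaved on reshape
    lhs_info.transpose  = false;        // required by this kernel
    lhs_info.interleave = true;

    GEMMRHSMatrixInfo rhs_info;
    rhs_info.n0         = 4;            // allowed: 2,3,4,8,16
    rhs_info.k0         = lhs_info.k0;  // must match lhs_info.k0
    rhs_info.h0         = 2;            // horizontal blocks interleaved on reshape
    rhs_info.transpose  = true;         // required by this kernel
    rhs_info.interleave = true;
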
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
deleted file mode 100644
index 37c11000db..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ /dev/null
@@ -1,573 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info,
- const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- if(input0->data_type() == DataType::QASYMM8)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
-
- const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
- const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
- const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
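The k0/n0 checks above rely on a standard bit trick: v & (v - 1) clears the lowest set bit, so the result is zero exactly when v is a power of two. Combined with the v == 3 escape hatch and the v <= 16 cap, the expression accepts the documented set {2,3,4,8,16} (the raw expression alone would also let 1 through; m0 has its own 1..8 range check). A minimal standalone sketch, with a hypothetical helper name:

    #include <cassert>

    // Accepts exactly {2,3,4,8,16}: power of two (v & (v-1) == 0) or exactly 3,
    // capped at 16, with the lower bound of 2 made explicit.
    static bool is_supported_block_size(unsigned int v)
    {
        const bool pow2_or_3 = ((v & (v - 1)) == 0) || (v == 3);
        return v >= 2 && v <= 16 && pow2_or_3;
    }

    int main()
    {
        assert(is_supported_block_size(3) && is_supported_block_size(16));
        assert(!is_supported_block_size(5) && !is_supported_block_size(32));
    }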
-
- const int m = gemm_info.m;
- const int n = gemm_info.n;
- const int k = gemm_info.k;
-
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
- if(gemm_info.reinterpret_input_as_3d)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
- const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info);
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(expected_output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- if(output_stage.type == GEMMLowpOutputStageType::NONE)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
- }
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0));
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
- "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
-
- // Checks performed if the output stage needs to be fused
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(gemm_info.a_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_output_shape[0]);
- }
-
- // If b_offset == 0, vector_sum_row can be a nullptr
- if(gemm_info.b_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
- // Check if mm result is a 3D reinterpretation
- const bool reinterpret_as_3d = expected_output_shape.num_dimensions() > 1 && expected_output_shape.y() != vector_sum_row->tensor_shape().x();
-
- // Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_output_shape[1] * expected_output_shape[2]));
- ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_output_shape[1]);
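A concrete illustration of the two checks above (shapes chosen here for the example, not taken from the source): with an output of shape [x, y, z, batches] = [8, 4, 5, 2] that is reinterpreted as 3D, vector_sum_row must provide one row sum per entry of the collapsed y*z plane; without the reinterpretation it only needs one per y.

    // Hypothetical shapes: expected_output_shape = [8, 4, 5, 2]
    const unsigned int y = 4, z = 5;
    const unsigned int rows_3d = y * z; // 20 row sums required when reinterpret_as_3d
    const unsigned int rows_2d = y;     // 4 row sums otherwise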
-
- if(expected_output_shape.num_dimensions() > 1)
- {
- const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
- TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
- vector_sum_row_shape.collapse_from(1);
- TensorShape collapsed_output_shape(expected_output_shape);
- collapsed_output_shape.collapse_from(output_batch_idx);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_output_shape[output_batch_idx],
- "vector_sum_row must have the same number of batches of output tensor");
-
- if(gemm_info.a_offset != 0)
- {
- TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
- vector_sum_col_shape.collapse_from(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
- }
- }
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type());
- }
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
-
- if(output_multipliers != nullptr && output_shifts != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(output_stage.is_quantized_per_channel)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_shifts->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_multipliers->dimension(0));
- }
- }
- }
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMKernelInfo &gemm_info,
- ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias,
- ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed)
-{
- const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
-
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0);
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_output_as_3d = false;
- }
-
- // Output tensor auto initialization if not yet initialized
- const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info);
- if(output_stage.type != GEMMLowpOutputStageType::NONE)
- {
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(output_stage.output_data_type));
- }
- else
- {
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(DataType::S32));
- }
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
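The collapse above merges the y and z dimensions so the 2D execution window covers the whole 3D output. Assuming TensorShape::collapse(n, first) merges n dimensions starting at index first (consistent with the comment above), a small sketch:

    // [x, y, z, w] = [8, 4, 5, 2]  ->  collapse(2, 1)  ->  [8, 20, 2]
    TensorShape tmp_shape(8U, 4U, 5U, 2U);
    tmp_shape.collapse(2U, 1U); // y and z merge into one dimension of 4 * 5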
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0;
- num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- if(gemm_info.a_offset != 0)
- {
- AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
- window_changed = update_window_and_padding(win_out, vector_sum_col_access) || window_changed;
- }
- // No access window needed for vector_sum_row
- ARM_COMPUTE_UNUSED(vector_sum_row);
-
- if(bias != nullptr)
- {
- AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
- window_changed = update_window_and_padding(win_out, bias_access) || window_changed;
- }
-
- if(output_multipliers != nullptr && output_stage.is_quantized_per_channel)
- {
- AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x);
- AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
- window_changed = update_window_and_padding(win_out, output_multipliers_access, output_shifts_access) || window_changed;
- }
- }
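The operand order in the window updates above matters: || short-circuits, so update_window_and_padding() sits on the left of the expression to guarantee it is evaluated for every access window even after an earlier update has already flagged a change. A minimal sketch with a hypothetical do_update() helper:

    bool changed = false;
    changed = changed || do_update(a); // do_update is skipped once 'changed' is true
    changed = do_update(b) || changed; // do_update always runs; the form used above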
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel()
- : _input0(nullptr),
- _input1(nullptr),
- _output(nullptr),
- _vector_sum_col(nullptr),
- _vector_sum_row(nullptr),
- _bias(nullptr),
- _output_multipliers(nullptr),
- _output_shifts(nullptr),
- _slide_matrix_b(true),
- _reinterpret_input_as_3d(false),
- _reinterpret_output_as_3d(false),
- _use_dummy_work_items(false),
- _is_quantized_per_channel(false),
- _fuse_output_stage(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info,
- const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts);
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
- const GEMMKernelInfo &gemm_info,
- const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(),
- input1->info(),
- output->info(),
- gemm_info,
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- output_multipliers != nullptr ? output_multipliers->info() : nullptr,
- output_shifts != nullptr ? output_shifts->info() : nullptr));
-
- auto padding_info = get_padding_info({ input0, input1, output, vector_sum_row });
- const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
- const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
- const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
- const int32_t a_offset = gemm_info.a_offset;
- const int32_t b_offset = gemm_info.b_offset;
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _bias = bias;
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0);
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
- _is_quantized_per_channel = output_stage.is_quantized_per_channel;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
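Matrix B is slid along the batch dimension only when it has at least as many dimensions as matrix A. In the convolution-as-GEMM case the reshaped weights stay 2D while the input is batched, so every batch reuses the same B (illustrative shapes, not from the source):

    // A (input0): [K, M, batches] = [64, 49, 8]      -> 3 dimensions
    // B (input1): [reshaped cols, rows] = [1024, 16] -> 2 dimensions (shared weights)
    const bool slide_matrix_b = 2 >= 3; // false: one copy of B serves all batches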
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(),
- input1->info(),
- output->info(),
- gemm_info,
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- output_multipliers != nullptr ? output_multipliers->info() : nullptr,
- output_shifts != nullptr ? output_shifts->info() : nullptr,
- num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
- // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
- // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
-
- // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
- // NOTE: This might have implications on heuristics and performance
- const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
- // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
- const unsigned int partial_store_m0 = internal_m % internal_m0;
- const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
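Worked numbers for the block-size adjustments above (values chosen for illustration; assumes <algorithm> for std::min):

    // M = 25 rows with heuristic m0 = 4; N = 30 columns with n0 = 8.
    const unsigned int internal_m0      = std::min(25u, 4u); // 4, never larger than M
    const unsigned int partial_store_m0 = 25u % internal_m0; // 1 leftover row at the bottom
    const unsigned int partial_store_n0 = 30u % 8u;          // 6 leftover columns on the right

The kernel then emits partial stores for the edge blocks instead of requiring the output to be padded.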
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
-
- std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_");
- kernel_name += rhs_info.transpose ? "t" : "nt";
-
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- kernel_name += "_fused_output_stage_fixedpoint";
- _fuse_output_stage = true;
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0 && vector_sum_col != nullptr)
- {
- build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
- build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
- }
- // If b_offset == 0, vector_sum_row can be a nullptr
- build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * input0->info()->dimension(0)));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
- build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
- build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
- build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
-
- const int min = output_stage.gemmlowp_min_bound;
- const int max = output_stage.gemmlowp_max_bound;
-
- PixelValue min_val{};
- PixelValue max_val{};
- std::tie(min_val, max_val) = get_min_max(output->info()->data_type());
- build_opts.add_option_if(min != min_val.get<int32_t>(), "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if(max != max_val.get<int32_t>(), "-DMAX_BOUND=" + support::cpp11::to_string(max));
- }
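MIN_BOUND/MAX_BOUND are defined only when the requested clamp is tighter than the natural range of the output data type, so the clamp folds away entirely in the common case. For example, with a QASYMM8 output whose full range is [0, 255]:

    // bounds (0, 255): neither -DMIN_BOUND nor -DMAX_BOUND is added
    // bounds (0, 127): only -DMAX_BOUND=127 is added; the lower clamp is free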
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
- _config_id += "_";
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.h0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.interleave);
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info,
- const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- output->clone().get(),
- gemm_info,
- vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
- bias != nullptr ? bias->clone().get() : nullptr,
- output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
- output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // The stride along the batch dimension (index 3) of matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- // Set window for vector_sum_col
- Window win_vector_sum_col = slice;
- win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Set window for vector_sum_row
- Window win_vector_sum_row = slice;
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window biases_slice = slice;
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A has more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- idx++;
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- idx++;
- }
-
- if(_fuse_output_stage)
- {
- add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
- add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
- add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
- add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice);
- add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice);
- }
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
deleted file mode 100644
index e79f6dfe05..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the RHS matrix (input1) has been reshaped
- *
- * @note The input matrix input1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel
- * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported
- */
-class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
- * Only the following values are supported for LHS info:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * Only the following values are supported for RHS info:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
- * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
- * @param[in] bias (Optional) Biases tensor. Only shared biases are supported; it can be a nullptr if the addition of biases is not required.
- * Biases are a 1D tensor with dimensions [OFM]. Data type supported: S32.
- * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32.
- * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32.
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
- const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
- * Only the following values are supported for LHS info:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * Only the following values are supported for RHS info:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
- * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
- * @param[in] bias (Optional) Biases tensor. Only shared biases are supported; it can be a nullptr if the addition of biases is not required.
- * Biases are a 1D tensor with dimensions [OFM]. Data type supported: S32.
- * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32.
- * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
- const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
- *
- * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor info for the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[in] output Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
- * Only the following values are supported for LHS info:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * Only the following values are supported for RHS info:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] vector_sum_col (Optional) Input row-vector info of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
- * @param[in] vector_sum_row (Optional) Input row-vector info of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
- * @param[in] bias (Optional) Biases tensor info. Only shared biases are supported; it can be a nullptr if the addition of biases is not required.
- * Biases are a 1D tensor with dimensions [OFM]. Data type supported: S32.
- * @param[in] output_multipliers (Optional) Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32.
- * @param[in] output_shifts (Optional) Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, const ITensorInfo *vector_sum_col = nullptr,
- const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, const ITensorInfo *output_multipliers = nullptr,
- const ITensorInfo *output_shifts = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- const ICLTensor *_vector_sum_col;
- const ICLTensor *_vector_sum_row;
- const ICLTensor *_bias;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
- bool _use_dummy_work_items;
- bool _is_quantized_per_channel;
- bool _fuse_output_stage;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H */ \ No newline at end of file
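For reference, the typical call pattern for a kernel like the one above follows the library's validate-then-configure idiom (a sketch; the tensor names are hypothetical):

    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel kernel;
    // Fail fast on unsupported shapes/types before touching any resources:
    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(
        lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info));
    kernel.configure(&lhs, &rhs_reshaped, &dst, gemm_info);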
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
deleted file mode 100644
index e621323c5f..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- int32_t a_offset, int32_t b_offset)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
- }
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
- }
-
- // If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
-
- // Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
- ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
-
- TensorShape output_shape = mm_result->tensor_shape();
- if(output_shape.num_dimensions() > 1)
- {
- const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
- TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
- vector_sum_row_shape.collapse_from(1);
- output_shape.collapse_from(output_batch_idx);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
- "mm_result tensor must have the same number of batches of output tensor");
-
- if(a_offset != 0)
- {
- TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
- vector_sum_col_shape.collapse_from(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
- }
- }
- }
-
- return Status{};
-}
-} // namespace
-
-CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
- : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr)
-{
-}
-
-void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
- int32_t b_offset)
-{
- configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, k, a_offset, b_offset);
-}
-
-void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row,
- const ICLTensor *bias,
- int32_t k, int32_t a_offset,
- int32_t b_offset)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- a_offset, b_offset)); // NOLINT
-
- auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias });
-
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _mm_result = mm_result;
- _bias = bias;
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = vector_sum_row != nullptr
- && mm_result->info()->num_dimensions() > 1
- && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0));
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration));
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
- build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
- }
- // If b_offset == 0, vector_sum_row can be a nullptr
- build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
- build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
- build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- std::string kernel_name("gemmlowp_offset_contribution");
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name + "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- int32_t a_offset, int32_t b_offset)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
- return Status{};
-}
-
-void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Set window for vector_sum_col
- Window win_vector_sum_col = slice;
- win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Set window for vector_sum_row
- Window win_vector_sum_row = slice;
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window biases_slice = slice;
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _mm_result, slice);
- add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
- add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
- add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
deleted file mode 100644
index f8705595a0..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * The final result is:
- *
- * mm_result[i][k] = mm_result[i][k] +
- * (vector_sum_col[k] * a_offset) +
- * (vector_sum_row[i] * b_offset) +
- * (a_offset * b_offset * k)
- *
- */
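A scalar reference of the in-place update documented above can make the formula concrete (a sketch, not the OpenCL kernel; 'depth' is the K of the GEMM, i.e. the number of matrix A columns):

    #include <cstdint>

    void offset_contribution_ref(int32_t *mm_result, int rows, int cols,
                                 const int32_t *vector_sum_col, // one sum per output column
                                 const int32_t *vector_sum_row, // one sum per output row
                                 int32_t a_offset, int32_t b_offset, int32_t depth)
    {
        for(int i = 0; i < rows; ++i)
        {
            for(int k = 0; k < cols; ++k)
            {
                mm_result[i * cols + k] += vector_sum_col[k] * a_offset
                                           + vector_sum_row[i] * b_offset
                                           + a_offset * b_offset * depth;
            }
        }
    }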
-class CLGEMMLowpOffsetContributionKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpOffsetContributionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases are supported; it can be a nullptr if the addition of biases is not required.
- * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in] k Number of matrix A columns, which equals the number of matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- */
- void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, int32_t b_offset);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases are supported; it can be a nullptr if the addition of biases is not required.
- * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in] k Number of matrix A columns, which equals the number of matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
- int32_t b_offset);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel
- *
- * @param[in] mm_result Input tensor info containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor info. Only shared biases are supported; it can be a nullptr if the addition of biases is not required.
- * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_vector_sum_col;
- const ICLTensor *_vector_sum_row;
- ICLTensor *_mm_result;
- const ICLTensor *_bias;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
deleted file mode 100644
index 8ed83ed52c..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
- int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(output_stage.is_quantized_per_channel)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0));
- }
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
- }
-
- // If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
-
- // Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
- ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
-
- TensorShape output_shape = mm_result->tensor_shape();
- if(output_shape.num_dimensions() > 1)
- {
- const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
- TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
- vector_sum_row_shape.collapse_from(1);
- output_shape.collapse_from(output_batch_idx);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
-                                            "mm_result tensor must have the same number of batches as the output tensor");
-
- if(a_offset != 0)
- {
- TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
- vector_sum_col_shape.collapse_from(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                                "vector_sum_col tensor must have the same number of batches as vector_sum_row or the number of batches must be set to 1");
- }
- }
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
- // Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type());
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect");
-
- return Status{};
-}
-} // namespace
-
-CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutputStageKernel()
- : _mm_result(nullptr),
- _vector_sum_col(nullptr),
- _vector_sum_row(nullptr),
- _bias(nullptr),
- _output(nullptr),
- _output_multipliers(nullptr),
- _output_shifts(nullptr),
- _is_quantized_per_channel(false)
-{
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
- int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, output, k, a_offset, b_offset, output_stage, output_multipliers, output_shifts);
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row,
- const ICLTensor *bias, ICLTensor *output,
- int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output, output_multipliers, output_shifts);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
- bias != nullptr ? bias->info() : nullptr,
- output->info(),
- a_offset, b_offset, output_stage,
- output_multipliers->info(), output_shifts->info())); // NOLINT
-
- auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, output, output_multipliers, output_shifts });
-
- const int min = output_stage.gemmlowp_min_bound;
- const int max = output_stage.gemmlowp_max_bound;
-
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _mm_result = mm_result;
- _bias = bias;
- _output = output;
- _output_multipliers = output_multipliers;
- _output_shifts = output_shifts;
- _is_quantized_per_channel = output_stage.is_quantized_per_channel;
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = vector_sum_row != nullptr
- && mm_result->info()->num_dimensions() > 1
- && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
-
- // Auto initialize the output
- auto_init_if_empty(*output->info(), mm_result->info()->clone()->set_data_type(output_stage.output_data_type));
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0));
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration));
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
- build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
- }
- // If b_offset == 0, vector_sum_row can be a nullptr
- build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
- build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
- build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
- build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
- build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
- build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
- build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
-
- PixelValue min_val{};
- PixelValue max_val{};
- std::tie(min_val, max_val) = get_min_max(output->info()->data_type());
- build_opts.add_option_if((min > min_val.get<int32_t>()), "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < max_val.get<int32_t>()), "-DMAX_BOUND=" + support::cpp11::to_string(max));
-
- std::string kernel_name("gemmlowp_offset_contribution");
- kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type);
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name + "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *output, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage, output_multipliers, output_shifts));
- return Status{};
-}
-
-void CLGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Set window for vector_sum_col
- Window win_vector_sum_col = slice;
- win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Set window for vector_sum_row
- Window win_vector_sum_row = slice;
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window biases_slice = slice;
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _mm_result, slice);
- add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col);
- add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row);
- add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice);
- add_3D_tensor_argument(idx, _output, slice);
- add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice);
- add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
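
For illustration only, and not part of the removed file: a minimal standalone sketch of how configure() above derives the VEC_SIZE and VEC_SIZE_LEFTOVER build options, assuming adjust_vec_size() simply clamps the preferred vector width to the tensor's first dimension (adjust_vec_size_sketch is a hypothetical stand-in, not the library helper).

#include <algorithm>
#include <cstdio>

// Stand-in for the library helper: clamp the preferred vector width to the
// tensor width so a vector load never runs past the row. (Assumption: the
// real adjust_vec_size() may apply further rules.)
static unsigned int adjust_vec_size_sketch(unsigned int preferred, unsigned int dim0)
{
    return std::min(preferred, dim0);
}

int main()
{
    const unsigned int dim0     = 18;                              // hypothetical tensor width
    const unsigned int vec_size = adjust_vec_size_sketch(4, dim0); // 4 elements per iteration
    const unsigned int leftover = dim0 % vec_size;                 // 2 tail elements
    // Mirrors the options the kernel is compiled with: -DVEC_SIZE=4 -DVEC_SIZE_LEFTOVER=2
    std::printf("-DVEC_SIZE=%u -DVEC_SIZE_LEFTOVER=%u\n", vec_size, leftover);
    return 0;
}
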
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
deleted file mode 100644
index 15f54d17a5..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
- * of matrix A and matrix B, and performs the output stage defined by the output_stage argument.
- *
- * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
- */
-class CLGEMMLowpOffsetContributionOutputStageKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpOffsetContributionOutputStageKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpOffsetContributionOutputStageKernel(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpOffsetContributionOutputStageKernel &operator=(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpOffsetContributionOutputStageKernel(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in]  k                  Number of matrix A columns or matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info
- * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- */
- void configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k, int32_t a_offset, int32_t b_offset,
- const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in]  k                  Number of matrix A columns or matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info
- * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
- int32_t k,
- int32_t a_offset, int32_t b_offset,
- const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionOutputStageKernel
- *
- * @param[in]  mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info
- * @param[in] output_multipliers Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- * @param[in] output_shifts Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset,
- int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_mm_result;
- const ICLTensor *_vector_sum_col;
- const ICLTensor *_vector_sum_row;
- const ICLTensor *_bias;
- ICLTensor *_output;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _is_quantized_per_channel;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */
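
For illustration only, and not part of the removed header: a scalar sketch of the offset-contribution arithmetic the class documentation above describes, for one output element at column x and row y; the quantize-down output stage is then applied to this value. All names are illustrative.

#include <cstdint>

// Scalar model of the offset contribution for one int32 accumulator at
// output column x and row y. sum_col is vector_sum_col[x] (column sums of
// matrix B), sum_row is vector_sum_row[y] (row sums of matrix A), and k is
// the depth of the matrix multiplication; a_offset * b_offset * k is the
// constant the kernel folds in at compile time as K_OFFSET.
static int32_t offset_contribution(int32_t mm_result, int32_t sum_col, int32_t sum_row,
                                   int32_t bias, int32_t a_offset, int32_t b_offset, int32_t k)
{
    return mm_result
           + a_offset * sum_col      // contribution of the matrix B column sums
           + b_offset * sum_row      // contribution of the matrix A row sums
           + a_offset * b_offset * k // K_OFFSET
           + bias;                   // only when ADD_BIAS is defined
}
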
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
deleted file mode 100644
index 5d827189e2..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
-
- // Check biases if exist
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-Status CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- const GEMMLowpOutputStageInfo *info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info));
-
- return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- const GEMMLowpOutputStageInfo *info)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info));
-
- auto padding_info = get_padding_info({ input, bias, output });
-
-    // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type));
-
- _input = input;
- _bias = bias;
- _output = output;
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
-
- // Set the arguments to pass at compile time
- auto min = info->gemmlowp_min_bound;
- auto max = info->gemmlowp_max_bound;
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
- build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset));
- build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift));
- build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
- "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
- "-DMAX_BOUND=" + support::cpp11::to_string(max));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- // Create kernel
- const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- auto win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Create input window
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Setup bias slice
- unsigned int idx1 = num_arguments_per_3D_tensor();
- if(_bias != nullptr)
- {
- Window biases_slice(slice);
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
- add_1D_tensor_argument(idx1, _bias, biases_slice);
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx1, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h
deleted file mode 100644
index 8653102cd8..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Round to nearest division by a power-of-two using result_shift
- * -# Add offset to each result
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16.
- */
-class CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output          Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16.
- * @param[in] info Output stage info. Used to pass the quantized output data type
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in]  output          Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16.
- * @param[in] info Output stage info. Used to pass the quantized output data type
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H */
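
For illustration only, and not part of the removed header: a scalar sketch of the fixed-point requantization steps listed in the class documentation above, following the common gemmlowp convention (saturating rounding doubling high multiply, then rounding right shift); that convention, and adding the bias to the accumulator before scaling, are assumptions rather than a restatement of the kernel source.

#include <algorithm>
#include <cstdint>

// Fixed-point multiply: round((2 * a * b) / 2^32), saturated.
static int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if(a == INT32_MIN && b == INT32_MIN)
    {
        return INT32_MAX; // the only overflowing case
    }
    const int64_t prod  = 2 * static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = prod >= 0 ? (1ll << 30) : (1 - (1ll << 30));
    return static_cast<int32_t>((prod + nudge) >> 31);
}

// Round-to-nearest division by 2^exponent (ties rounded away from zero).
static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
    const int32_t mask      = (int32_t(1) << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// One element of the QASYMM8 path; QASYMM8_SIGNED/QSYMM16 differ only in bounds.
static uint8_t quantize_down_fixedpoint(int32_t acc, int32_t bias, int32_t multiplier,
                                        int shift, int32_t offset_after_shift,
                                        int32_t min_bound, int32_t max_bound)
{
    acc = acc + bias;                                      // ADD_BIAS (assumed applied first)
    acc = sat_rounding_doubling_high_mul(acc, multiplier); // RESULT_FIXEDPOINT_MULTIPLIER
    acc = rounding_divide_by_pow2(acc, shift);             // RESULT_SHIFT
    acc = acc + offset_after_shift;                        // RESULT_OFFSET_AFTER_SHIFT
    return static_cast<uint8_t>(std::min(max_bound, std::max(min_bound, acc)));
}
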
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
deleted file mode 100644
index adbbb1f5ac..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- const GEMMLowpOutputStageInfo *info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED));
- ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)));
- ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))
- || info->gemmlowp_min_bound > info->gemmlowp_max_bound);
-
- // Check biases if exist
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-class Coordinates;
-CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-Status CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- const GEMMLowpOutputStageInfo *info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info));
-
- return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- const GEMMLowpOutputStageInfo *info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info);
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- const GEMMLowpOutputStageInfo *info)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info));
-
- auto padding_info = get_padding_info({ input, bias, output });
-
-    // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type));
-
- _input = input;
- _bias = bias;
- _output = output;
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
-
- auto min = info->gemmlowp_min_bound;
- auto max = info->gemmlowp_max_bound;
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
- build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier));
- build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset));
- build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- // Create kernel
- _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- // Create input window
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- // Setup bias slice
- unsigned int idx1 = num_arguments_per_3D_tensor();
- if(_bias != nullptr)
- {
- Window biases_slice(slice);
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
- add_1D_tensor_argument(idx1, _bias, biases_slice);
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx1, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
deleted file mode 100644
index 0a8d5e1942..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Multiply each entry of the input by the floating-point scale factor gemmlowp_real_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Requantize
- * -# Add offset to each result
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values:
- * - to the [0..255] range and cast to QASYMM8.
- * - to the [-128..127] range and cast to QASYMM8_SIGNED.
- */
-class CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] info Output stage info. Used to pass the quantized output data type
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] info Output stage info. Used to pass the quantized output data type
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in]  output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] info Output stage info. Used to pass the quantized output data type
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H */
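
For illustration only, and not part of the removed header: a scalar sketch of the float requantization path described in the class documentation above; round-to-nearest and adding the bias to the accumulator first are assumptions.

#include <algorithm>
#include <cmath>
#include <cstdint>

// One element of the QASYMM8 path: scale the int32 accumulator by a real
// multiplier, add the output offset, then clamp and narrow.
static uint8_t quantize_down_float(int32_t acc, int32_t bias, float real_multiplier,
                                   int32_t output_offset, int32_t min_bound, int32_t max_bound)
{
    acc += bias;                                                                         // ADD_BIAS
    const int32_t scaled = static_cast<int32_t>(std::lround(acc * real_multiplier));     // REAL_MULTIPLIER
    const int32_t out    = scaled + output_offset;                                       // OUTPUT_OFFSET
    return static_cast<uint8_t>(std::min(max_bound, std::max(min_bound, out)));
}
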
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
deleted file mode 100644
index 7af4d16780..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED));
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
- || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
-
- // Check biases if exist
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != output_stage->output_data_type, "Mismatching output data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-CLGEMMLowpQuantizeDownInt32ScaleKernel::CLGEMMLowpQuantizeDownInt32ScaleKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-Status CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, output_stage));
-
- return Status{};
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, output_stage);
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- const GEMMLowpOutputStageInfo *output_stage)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
- (bias != nullptr) ? bias->info() : nullptr,
- output->info(),
- output_stage));
-
- auto padding_info = get_padding_info({ input, bias, output });
-
-    // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_stage->output_data_type));
-
- _input = input;
- _bias = bias;
- _output = output;
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
-
- // Set the arguments to pass at compile time
- auto min = output_stage->gemmlowp_min_bound;
- auto max = output_stage->gemmlowp_max_bound;
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
- build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset));
- build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier));
- build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift));
- build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
- "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
- "-DMAX_BOUND=" + support::cpp11::to_string(max));
- build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-
- // Create kernel
- _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- unsigned int idx1 = num_arguments_per_3D_tensor();
- if(_bias != nullptr)
- {
- Window biases_slice(slice);
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
- add_1D_tensor_argument(idx1, _bias, biases_slice);
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx1, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
deleted file mode 100644
index abdf33ea43..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Add offset terms to final result
- * -# Multiply each entry of result by result_mult_int
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Shift the int32 accumulator by result_shift
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values:
- *    - to the [0..255] range and cast to QASYMM8.
- *    - to the [-128..127] range and cast to QASYMM8_SIGNED.
- *
- */
-class CLGEMMLowpQuantizeDownInt32ScaleKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpQuantizeDownInt32ScaleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleKernel(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleKernel(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output       Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] output_stage GEMMLowp output stage metadata.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases are supported; it can be a nullptr if the bias addition is not required.
- * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] output_stage GEMMLowp output stage metadata.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases are supported; it can be a nullptr if the bias addition is not required.
- * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] output_stage GEMMLowp output stage metadata.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */
\ No newline at end of file
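
The output stage documented in the deleted header reads most clearly as scalar arithmetic. Below is a minimal host-side C++ sketch of those steps; the function and parameter names are illustrative (they are not the GEMMLowpOutputStageInfo field names), and the exact evaluation order inside the OpenCL kernel is defined by its source, not by this sketch.

    #include <algorithm>
    #include <cstdint>

    // Illustrative scalar model of the quantize-down stage (QASYMM8 variant):
    // add bias and offset, multiply, shift down, clamp, then saturate-cast.
    std::uint8_t quantize_down_scale(std::int32_t acc,    // int32 GEMMLowp accumulator
                                     std::int32_t bias,   // optional bias term, 0 if absent
                                     std::int32_t offset, // result offset term
                                     std::int32_t mult,   // multiplier (result_mult_int)
                                     std::int32_t shift,  // result_shift
                                     std::int32_t lo,     // min bound
                                     std::int32_t hi)     // max bound
    {
        std::int32_t v = (acc + bias + offset) * mult;
        v >>= shift;                       // arithmetic shift assumed
        v = std::max(lo, std::min(hi, v)); // clamp to the requested bounds
        v = std::max(0, std::min(255, v)); // saturate to the [0..255] QASYMM8 range
        return static_cast<std::uint8_t>(v);
    }
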
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
deleted file mode 100644
index 3d23aa7f34..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
-
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
- }
- return Status{};
-}
-
-Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
-
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
- }
- return Status{};
-}
-} // namespace
-
-ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel()
- : _input(), _output()
-{
-}
-
-void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), mtx_a, vector_sum_row, info);
-}
-
-void CLGEMMLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*vector_sum_row->info(), TensorShape(mtx_a->info()->dimension(1)), 1, DataType::S32);
-
- auto padding_info = get_padding_info({ mtx_a, vector_sum_row });
-
- _input = mtx_a;
- _output = vector_sum_row;
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->info()->data_type()));
- build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
-
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
-
- std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : "");
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- // This kernel does not need padding
- Window win = calculate_max_window(*vector_sum_row->info(), Steps());
- ICLKernel::configure_internal(win);
-
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += support::cpp11::to_string(_input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(_input->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(_input->info()->dimension(2));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
- Window slice_in = collapsed.first_slice_window_2D();
- Window slice_out = collapsed.first_slice_window_2D();
-
- // Set up the input slice. Its dimensions are increased within the OpenCL kernel.
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_2D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_2D(slice_out));
-}
-
-void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), mtx_b, vector_sum_col, info);
-}
-
-void CLGEMMLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
-
- _input = mtx_b;
- _output = vector_sum_col;
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*_output->info(), TensorShape(mtx_b->info()->dimension(0)), 1, DataType::S32);
-
- auto padding_info = get_padding_info({ mtx_b, vector_sum_col });
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->info()->dimension(0));
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->info()->dimension(0) % num_elems_processed_per_iteration));
- build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(0)));
- build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(1)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->info()->data_type()));
- build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
-
- // Create kernel
- _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY);
-
- Window slice_out = collapsed.first_slice_window_2D();
- Window slice_in = slice_out;
-
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_2D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_2D(slice_out));
-}
-} // namespace arm_compute
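
Before the next deleted file, it is worth spelling out what these two reductions compute; the validation functions above only encode it indirectly (the output length must equal the number of rows of A, respectively columns of B). A host-side C++ reference, assuming row-major storage and illustrative names:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Matrix A reduction: vector_sum_row[r] = scalar * sum over c of A[r][c]
    std::vector<std::int32_t> sum_rows(const std::vector<std::uint8_t> &a,
                                       std::size_t rows, std::size_t cols,
                                       std::int32_t scalar = 1) // applied only when mul_by_scalar is set
    {
        std::vector<std::int32_t> out(rows, 0);
        for (std::size_t r = 0; r < rows; ++r)
            for (std::size_t c = 0; c < cols; ++c)
                out[r] += a[r * cols + c];
        for (auto &v : out)
            v *= scalar;
        return out;
    }

    // Matrix B reduction: vector_sum_col[c] = scalar * sum over r of B[r][c]
    std::vector<std::int32_t> sum_cols(const std::vector<std::uint8_t> &b,
                                       std::size_t rows, std::size_t cols,
                                       std::int32_t scalar = 1)
    {
        std::vector<std::int32_t> out(cols, 0);
        for (std::size_t r = 0; r < rows; ++r)
            for (std::size_t c = 0; c < cols; ++c)
                out[c] += b[r * cols + c];
        for (auto &v : out)
            v *= scalar;
        return out;
    }
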
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h b/src/core/CL/kernels/CLGEMMLowpReductionKernel.h
deleted file mode 100644
index 237d8099b7..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-struct GEMMLowpReductionKernelInfo;
-
-/** Common interface for all OpenCL reduction kernels */
-class ICLGEMMLowpReductionKernel : public ICLKernel
-{
-public:
- /** Constructor */
- ICLGEMMLowpReductionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete;
- /** Allow instances of this class to be moved */
- ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default;
- /** Allow instances of this class to be moved */
- ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
- * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- virtual void configure(const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
- * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
-
-protected:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-
-/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
- *
- * @note This stage is needed to handle the offset terms of the matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
- * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
- * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixAReductionKernel
- *
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
- * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
- *
- * @note This stage is needed to handle the offset terms of the matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixBReductionKernel
- *
- * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H */
diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
index cbd540d80b..904bb07282 100644
--- a/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/src/core/CL/kernels/CLGatherKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLGatherKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -34,20 +36,22 @@ namespace arm_compute
{
namespace
{
-inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+inline Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
- ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() + indices->num_dimensions() - 1) > 4);
+
ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
}
@@ -56,12 +60,14 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
// Output auto initialization if not yet initialized
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis);
auto_init_if_empty((*output), output_shape, 1, input->data_type());
// Create window
@@ -72,9 +78,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
-CLGatherKernel::CLGatherKernel()
- : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+CLGatherKernel::CLGatherKernel() : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
@@ -82,10 +88,14 @@ void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices,
configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
}
-void CLGatherKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+void CLGatherKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
- auto padding_info = get_padding_info({ input, output, indices });
+ auto padding_info = get_padding_info({input, output, indices});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), indices->info(), output->info(), axis));
// Configure kernel window
@@ -99,10 +109,12 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC
// Set build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DINDICES_DIMS=" + support::cpp11::to_string(indices->info()->num_dimensions()));
build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis));
+ build_opts.add_option("-DINDEX_LIMIT=" + support::cpp11::to_string(input->info()->tensor_shape()[_axis]));
// Create kernel
_kernel = create_kernel(compile_context, "gather", build_opts.options());
@@ -110,10 +122,12 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+Status
+CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
return Status{};
}
@@ -125,7 +139,7 @@ void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue)
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
unsigned int idx = 0;
add_4D_tensor_argument(idx, _input, window_collapsed);
- add_1D_tensor_argument(idx, _indices, window_collapsed);
+ add_4D_tensor_argument(idx, _indices, window_collapsed);
add_4D_tensor_argument(idx, _output, window_collapsed);
enqueue(queue, *this, window_collapsed, lws_hint());
}
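
The functional change in this file is that the indices tensor may now be multi-dimensional: the old rank checks on input and indices are replaced by a combined-rank check, the indices are bound as a 4D kernel argument, and the kernel additionally receives INDEX_LIMIT (the size of the gathered axis), presumably so it can guard against out-of-range indices. The output shape then follows the usual gather rule; a sketch of that rule follows (an illustration of what compute_gather_shape appears to do, not the library routine itself):

    #include <cstddef>
    #include <vector>

    // The axis dimension of the input is replaced by all dimensions of the
    // indices tensor, so the output rank is input rank + indices rank - 1
    // (which the new validation requires to be at most 4).
    std::vector<std::size_t> gather_shape(const std::vector<std::size_t> &input,
                                          const std::vector<std::size_t> &indices,
                                          std::size_t axis)
    {
        const auto ax = static_cast<std::ptrdiff_t>(axis);
        std::vector<std::size_t> out(input.begin(), input.begin() + ax);
        out.insert(out.end(), indices.begin(), indices.end());
        out.insert(out.end(), input.begin() + ax + 1, input.end());
        return out;
    }
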
diff --git a/src/core/CL/kernels/CLGatherKernel.h b/src/core/CL/kernels/CLGatherKernel.h
index 8f472a4696..db4b49d2f5 100644
--- a/src/core/CL/kernels/CLGatherKernel.h
+++ b/src/core/CL/kernels/CLGatherKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLGATHERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -63,7 +64,11 @@ public:
* @param[out] output Destination tensor. Data type supported: Same as @p input
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
*
@@ -74,7 +79,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
index 459ed035b1..b9ff72b928 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,8 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -46,7 +48,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
- if(all_anchors->total_size() > 0)
+ if (all_anchors->total_size() > 0)
{
size_t feature_height = info.feat_height();
size_t feature_width = info.feat_width();
@@ -56,7 +58,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
- if(is_data_type_quantized(anchors->data_type()))
+ if (is_data_type_quantized(anchors->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors);
}
@@ -65,20 +67,25 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
}
} // namespace
-CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel()
- : _anchors(nullptr), _all_anchors(nullptr)
+CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() : _anchors(nullptr), _all_anchors(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info);
}
-void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(anchors, all_anchors);
- auto padding_info = get_padding_info({ anchors, all_anchors });
+ auto padding_info = get_padding_info({anchors, all_anchors});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(anchors->info(), all_anchors->info(), info));
// Metadata
@@ -89,7 +96,8 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
// Initialize the output if empty
const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
- auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
+ auto_init_if_empty(*all_anchors->info(),
+ TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
// Set instance variables
_anchors = anchors;
@@ -106,7 +114,7 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DNUM_ANCHORS=" + support::cpp11::to_string(num_anchors));
build_opts.add_option("-DNUM_ROI_FIELDS=" + support::cpp11::to_string(info.values_per_roi()));
- if(is_quantized)
+ if (is_quantized)
{
const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform();
build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
@@ -114,8 +122,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
}
// Create kernel
- const std::string kernel_name = (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+ const std::string kernel_name =
+ (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// The tensor all_anchors can be interpreted as an array of structs (each structs has values_per_roi fields).
// This means we don't need to pad on the X dimension, as we know in advance how many fields
@@ -125,7 +134,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors,
+ const ITensorInfo *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
return Status{};
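
For orientation, the anchor generation performed here is the standard one for GenerateProposals: every base anchor is replicated at each feature-map cell, shifted by the cell coordinates scaled by the feature stride, filling the (values_per_roi, width * height * num_anchors) output initialised above. A host-side sketch under those assumptions (values_per_roi taken as 4); the OpenCL kernel's iteration order and its derivation of the stride from ComputeAnchorsInfo may differ:

    #include <cstddef>
    #include <vector>

    // anchors holds num_anchors boxes as (x1, y1, x2, y2); the result holds one
    // shifted copy of every anchor per feature-map cell, 4 values per box.
    std::vector<float> compute_all_anchors(const std::vector<float> &anchors,
                                           int num_anchors, int feat_w, int feat_h,
                                           float stride)
    {
        std::vector<float> out;
        out.reserve(static_cast<std::size_t>(4) * num_anchors * feat_w * feat_h);
        for (int y = 0; y < feat_h; ++y)
            for (int x = 0; x < feat_w; ++x)
                for (int a = 0; a < num_anchors; ++a)
                {
                    const float sx = x * stride;
                    const float sy = y * stride;
                    out.push_back(anchors[4 * a + 0] + sx);
                    out.push_back(anchors[4 * a + 1] + sy);
                    out.push_back(anchors[4 * a + 2] + sx);
                    out.push_back(anchors[4 * a + 3] + sy);
                }
        return out;
    }
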
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
index d26795ac7d..e08f281d6c 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
@@ -62,7 +62,10 @@ public:
* @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel
*
@@ -81,5 +84,5 @@ private:
const ICLTensor *_anchors;
ICLTensor *_all_anchors;
};
-} // arm_compute
+} // namespace arm_compute
#endif // ARM_COMPUTE_CLGENERATEPROSPOSALSLAYERKERNEL_H
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
deleted file mode 100644
index 44012690e7..0000000000
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ /dev/null
@@ -1,427 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLIm2ColKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-#include <tuple>
-#include <utility>
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-struct Im2ColConfiguration
-{
- std::string kernel_name{};
- std::set<std::string> build_options{};
- unsigned int num_elems_processed_per_iteration{};
- bool is_padding_required_nchw{};
-};
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_groups)
-{
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1);
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) % num_groups) != 0);
-
- // Since no implicit padding is added, check that the total input spatial dimensions (including convolution padding) are big enough for the kernel dimensions
- const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
- const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
- ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
-
- if(output->total_size() > 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialization if not yet initialized
- TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups);
-
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(expected_output_shape));
-
- const DataLayout data_layout = input->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int input_width = input->dimension(width_idx);
- const unsigned int input_height = input->dimension(height_idx);
-
- // Configure the execute window based on the selected optimal OpenCL kernel
- bool window_changed = false;
- Window win;
-
- if(data_layout == DataLayout::NHWC)
- {
- win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- }
- else
- {
- if(is_padding_required_nchw)
- {
- const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
- win = calculate_max_window(*input,
- Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
- AccessWindowStatic input_access(input,
- -border.left,
- -border.top,
- ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration),
- input_height + border.bottom);
- window_changed = window_changed || update_window_and_padding(win, input_access);
- }
- else
- {
- // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so
- // update_window_and_padding() can be skipped
- win = calculate_max_window(*input, Steps());
- }
- }
-
- // Set the Z dimension's step to the size of the whole dimension so that the window cannot be split across Z
- win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups)
-{
- const DataLayout data_layout = input->data_layout();
- const DataType data_type = input->data_type();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const unsigned int input_width = input->dimension(width_idx);
- const unsigned int input_height = input->dimension(height_idx);
- const unsigned int input_channel = input->dimension(channel_idx);
-
- const std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
-
- // Im2Col configuration
- std::string kernel_name = "im2col_generic_";
- CLBuildOptions build_opts;
- unsigned int num_elems_processed_per_iteration = 1;
- bool is_padding_required_nchw = false;
- const UniformQuantizationInfo qinfo = input->quantization_info().uniform();
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->element_size()));
- build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
- build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
- build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(convolved_dims.first));
- build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(convolved_dims.second));
- build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
- build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
- build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
- build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
- build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
- build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
- build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0");
- build_opts.add_option_if(has_bias, "-DHAS_BIAS");
-
- if(data_layout == DataLayout::NHWC)
- {
- num_elems_processed_per_iteration = std::min(2U, input_channel);
- is_padding_required_nchw = false;
-
- // Only the 3x3 and 9x9 cases are optimized for NHWC
- if(kernel_dims == Size2D(3U, 3U))
- {
- kernel_name = "im2col3x3_";
- }
- else if(kernel_dims == Size2D(9U, 9U))
- {
- kernel_name = "im2col9x9_";
- }
-
- // Get the boundary vector size (the first/last vector, which may be partial).
- // If input_channel is a multiple of num_elems_processed_per_iteration, the boundary vector size is the (full) vector size;
- // otherwise, it is the (partial) remainder vector size.
- const unsigned int vec_size = num_elems_processed_per_iteration;
- const unsigned int partial_vec_size = input_channel % vec_size;
- const unsigned int boundary_vec_size = vec_size - ((vec_size - partial_vec_size) % vec_size);
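- // Worked example (illustrative): with vec_size = 2 and input_channel = 7,
- // partial_vec_size = 1 and boundary_vec_size = 2 - ((2 - 1) % 2) = 1 (partial remainder vector);
- // with input_channel = 8, partial_vec_size = 0 and boundary_vec_size = 2 - ((2 - 0) % 2) = 2 (full vector).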
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vec_size));
- build_opts.add_option("-DBOUNDARY_VECTOR_SIZE=" + support::cpp11::to_string(boundary_vec_size));
- }
- else
- {
- if(dilation == Size2D(1U, 1U))
- {
- const bool squared_im2col = kernel_dims.width == kernel_dims.height;
- if(squared_im2col)
- {
- // Check if we can run an optimized im2col for NCHW
- switch(kernel_dims.width)
- {
- case 1:
- // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
- if(conv_info.stride().first == 1 && !conv_info.has_padding())
- {
- kernel_name = "im2col1x1_stridex1_";
- num_elems_processed_per_iteration = 4;
- is_padding_required_nchw = true;
- }
- break;
- case 3:
- kernel_name = "im2col3x3_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = true;
- break;
- case 5:
- kernel_name = "im2col5x5_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = true;
- break;
- case 11:
- // Optimized im2col11x11 if pad_x = pad_y = 0
- if(!conv_info.has_padding())
- {
- kernel_name = "im2col11x11_padx0_pady0_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = true;
- }
- break;
- default:
- kernel_name = "im2col_generic_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = false;
- break;
- }
- }
- else if(kernel_dims.width > 1 && !conv_info.has_padding())
- {
- kernel_name = "im2col_generic_padx0_pady0_";
- num_elems_processed_per_iteration = 1;
- is_padding_required_nchw = false;
-
- // Optimized im2col is performed using one or more vector operations with the specified vector size
- // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
- // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
- // Using a vector size of 4 is always safe, since OpenCL also provides vectors of size 2 and 3 to cover the remainder.
- // Using a vector size of 8, however, might be faster.
- // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
- // is used instead.)
- const size_t vector_size = std::min(static_cast<size_t>(4), kernel_dims.width);
- const size_t width_mod_vector_size = kernel_dims.width % vector_size;
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
- }
- }
- }
-
- // Append the data layout to the kernel_name
- kernel_name += lower_string(string_from_data_layout(data_layout));
-
- Im2ColConfiguration im2col_config;
- im2col_config.kernel_name = kernel_name;
- im2col_config.build_options = build_opts.options();
- im2col_config.num_elems_processed_per_iteration = num_elems_processed_per_iteration;
- im2col_config.is_padding_required_nchw = is_padding_required_nchw;
-
- return im2col_config;
-}
-} // namespace
-
-CLIm2ColKernel::CLIm2ColKernel()
- : _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
-{
-}
-
-void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_groups)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, kernel_dims, conv_info, has_bias, dilation, num_groups);
-}
-
-void CLIm2ColKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
- const Size2D &dilation,
- unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
-
- auto padding_info = get_padding_info({ input, output });
- _data_layout = input->info()->data_layout();
-
- const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int input_width = input->info()->dimension(width_idx);
- const unsigned int input_height = input->info()->dimension(height_idx);
-
- // Select and configure the optimal OpenCL kernel to run.
- // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration
- // and the padding requirement flag
- Im2ColConfiguration im2col_config = configure_opencl_kernel(input->info(), kernel_dims, conv_info, has_bias, dilation, num_groups);
-
- // Create kernel
- _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options);
-
- _input = input;
- _output = output;
- _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
- _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration;
- _kernel_dims = kernel_dims; // Only needed by the Tuner
- _conv_info = conv_info; // Only needed by the Tuner
- _num_groups = num_groups;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
- im2col_config.is_padding_required_nchw, num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = im2col_config.kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(input->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(num_groups);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(_data_layout));
-
- ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info));
-}
-
-Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups));
- Im2ColConfiguration im2col_config = configure_opencl_kernel(input, kernel_dims, conv_info, has_bias, dilation, num_groups);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
- im2col_config.is_padding_required_nchw, num_groups)
- .first);
- return Status{};
-}
-
-void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- // Get initial windows
- // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- window_collapsed.set_dimension_step(Window::DimZ, 1);
-
- Window window_output;
- window_output.use_tensor_dimensions(_output->info()->tensor_shape());
-
- const Window first_slice_3d = window_collapsed.first_slice_window_3D();
-
- Window slice = first_slice_3d;
- Window slice_in = first_slice_3d;
- Window slice_out = window_output.first_slice_window_2D();
-
- if(_data_layout == DataLayout::NHWC)
- {
- const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3);
- const int num_batches = tmp_win[3].end();
-
- slice.set(1, Window::Dimension(0, static_cast<int>(_output->info()->tensor_shape()[1]), 1));
- slice.set(2, Window::Dimension(0, static_cast<int>(num_batches), 1));
- }
- else
- {
- slice.set(0, Window::Dimension(0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration));
- slice.set(1, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
- // Note: in the NCHW case the 3rd dimension is already set by collapsing the input window
- }
-
- // Setup input slice
- // The dimensions of the input are increased within the OpenCL kernel
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Setup output slice
- // The dimensions of the output are increased within the OpenCL kernel
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)]));
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- if(_num_groups == 1)
- {
- add_2D_tensor_argument(idx, _output, slice_out);
- }
- else
- {
- add_3D_tensor_argument(idx, _output, slice_out);
- }
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
-}
-} // namespace arm_compute
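
The deleted kernel implemented the rearrangement documented in the header that is removed next. As a reference for that documentation, here is a naive single-image NCHW im2col in C++, restricted to no padding, unit dilation and a single group; the OpenCL kernels above were vectorised and padded variants of this same transform:

    #include <cstddef>
    #include <vector>

    // src is one CHW image; the result stores one patch per output position,
    // each patch laid out contiguously (c * kh * kw values), matching the
    // row-per-patch example in the kernel's header documentation.
    std::vector<float> im2col(const std::vector<float> &src,
                              std::size_t c, std::size_t h, std::size_t w,
                              std::size_t kh, std::size_t kw, std::size_t stride)
    {
        const std::size_t out_h = (h - kh) / stride + 1;
        const std::size_t out_w = (w - kw) / stride + 1;
        std::vector<float> dst;
        dst.reserve(c * kh * kw * out_h * out_w);
        for (std::size_t oy = 0; oy < out_h; ++oy)
            for (std::size_t ox = 0; ox < out_w; ++ox) // one patch per output position
                for (std::size_t ch = 0; ch < c; ++ch)
                    for (std::size_t ky = 0; ky < kh; ++ky)
                        for (std::size_t kx = 0; kx < kw; ++kx)
                            dst.push_back(src[(ch * h + oy * stride + ky) * w +
                                              ox * stride + kx]);
        return dst;
    }
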
diff --git a/src/core/CL/kernels/CLIm2ColKernel.h b/src/core/CL/kernels/CLIm2ColKernel.h
deleted file mode 100644
index 2920c7d138..0000000000
--- a/src/core/CL/kernels/CLIm2ColKernel.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLIM2COLKERNEL_H
-#define ARM_COMPUTE_CLIM2COLKERNEL_H
-
-#include "arm_compute/core/Size2D.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the im2col reshape kernel.
- *
- * Rearranges image blocks into columns: each convolution patch is stripped out into a single column,
- * turning the convolution into a plain matrix multiplication.
- *
- * For example, taking the image below and assuming 3x3 image blocks with a stride of 1, we have:
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * =
- * \left( \begin{array}{ccccccccc}
- * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
- * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
- * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
- * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- */
-class CLIm2ColKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLIm2ColKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLIm2ColKernel(const CLIm2ColKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLIm2ColKernel &operator=(const CLIm2ColKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLIm2ColKernel(CLIm2ColKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLIm2ColKernel &operator=(CLIm2ColKernel &&) = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
- * while every dimension above represents a batch. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] has_bias In case biases are provided, expands the matrix with 1.
- * This is valid only for non-quantized inputs.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution.
- * Number of groups other than 1 is only supported for NCHW data layout.
- * The number of channels must be a multiple of the number of groups.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
- unsigned int num_groups = 1);
- /** Set the input and output of the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
- * while every dimension above represents a batch. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] has_bias In case biases are provided, expands the matrix with 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
- const Size2D &dilation = Size2D(1U, 1U),
- unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
- * while every dimension above represents a batch. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] has_bias In case biases are provided, expands the matrix with 1.
- * This is valid only for non-quantized inputs.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution.
- * Number of groups other than 1 is only supported for NCHW data layout.
- * The number of channels must be a multiple of the number of groups.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
- unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
- const ICLTensor *_input;
- ICLTensor *_output;
- DataLayout _data_layout;
- std::pair<unsigned int, unsigned int> _convolved_dims;
- unsigned int _num_elems_processed_per_iteration;
- Size2D _kernel_dims;
- PadStrideInfo _conv_info;
- unsigned int _num_groups;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLIM2COLKERNEL_H */
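For reference, the transform the two deleted files above implemented: the doxygen matrix shows im2col for a 4x4 image with 3x3 patches at stride 1. A minimal CPU-side sketch in plain C++ (hypothetical names, no ACL types; with has_bias each patch would additionally carry a trailing 1):

#include <vector>

// Plain C++ sketch of im2col (stride 1, no padding, single channel).
// Each extracted kh x kw patch becomes one column of the output matrix,
// reproducing the 4x4 / 3x3 example in the deleted doxygen comment.
std::vector<std::vector<float>> im2col_sketch(const std::vector<float> &img,
                                              int w, int h, int kw, int kh)
{
    std::vector<std::vector<float>> cols;
    for (int y = 0; y + kh <= h; ++y)
    {
        for (int x = 0; x + kw <= w; ++x)
        {
            std::vector<float> col;
            for (int ky = 0; ky < kh; ++ky)
                for (int kx = 0; kx < kw; ++kx)
                    col.push_back(img[(y + ky) * w + (x + kx)]);
            cols.push_back(col);
        }
    }
    return cols;
}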
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
index 323579dc3c..b13eb16556 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,8 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -38,17 +40,20 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.epsilon == 0.f, "Epsilon must be different than 0");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
}
return Status{};
@@ -58,26 +63,30 @@ Status validate_arguments_meanvar(const ITensorInfo *input, const ITensorInfo *o
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
}
return Status{};
}
} // namespace
-CLComputeMeanVariance::CLComputeMeanVariance()
- : _input(nullptr), _output(nullptr)
+CLComputeMeanVariance::CLComputeMeanVariance() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision)
+void CLComputeMeanVariance::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ bool use_mixed_precision)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output == nullptr ? input : output;
@@ -86,7 +95,8 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I
const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
CLBuildOptions build_opts;
- build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.add_option("-DINTERNAL_DATA_TYPE=" +
+ (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
@@ -106,7 +116,7 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I
const TensorShape out_shape(input_channel, 2u, input_batches);
// Output auto initialization if not yet initialized
- if(use_mixed_precision)
+ if (use_mixed_precision)
{
auto_init_if_empty(*_output->info(), out_shape, 1, DataType::F32);
}
@@ -132,7 +142,7 @@ void CLComputeMeanVariance::run(const Window &window, cl::CommandQueue &queue)
Window collapsed_window = window.collapse(window, Window::DimZ);
// We will process the planes together
- if(_input->info()->data_layout() == DataLayout::NCHW)
+ if (_input->info()->data_layout() == DataLayout::NCHW)
{
collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -152,12 +162,17 @@ void CLComputeMeanVariance::run(const Window &window, cl::CommandQueue &queue)
CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel()
: _input(nullptr), _output(nullptr), _mean(nullptr), _run_in_place(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info)
+void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *mean_var,
+ ICLTensor *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output == nullptr ? input : output;
@@ -169,7 +184,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision
+ ? "float"
+ : get_cl_type_from_data_type(input->info()->data_type())));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
@@ -185,7 +202,7 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(1));
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
}
@@ -194,7 +211,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
return Status{};
@@ -208,7 +227,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu
Window collapsed_window = window.collapse(window, Window::DimZ);
// We will process the planes together
- if(_input->info()->data_layout() == DataLayout::NCHW)
+ if (_input->info()->data_layout() == DataLayout::NCHW)
{
collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -223,7 +242,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu
add_4D_tensor_argument(idx, _input, collapsed_window);
add_3D_tensor_argument(idx, _mean, collapsed_window);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_4D_tensor_argument(idx, _output, collapsed_window);
}
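For reference, the math the CLComputeMeanVariance / CLInstanceNormalizationLayerKernel pair implements, sketched in plain C++ over a single channel plane. gamma, beta and epsilon stand in for the kernel-info fields; this is the formula only, not the OpenCL code:

#include <cmath>
#include <vector>

// Instance normalization of one (channel, batch) plane:
// y = gamma * (x - mean) / sqrt(var + epsilon) + beta
void instance_norm_plane(std::vector<float> &plane, float gamma, float beta, float epsilon)
{
    float sum = 0.f, sq_sum = 0.f;
    for (float v : plane)
    {
        sum += v;
        sq_sum += v * v;
    }
    const float n    = static_cast<float>(plane.size());
    const float mean = sum / n;
    const float var  = sq_sum / n - mean * mean;
    for (float &v : plane)
    {
        v = gamma * (v - mean) / std::sqrt(var + epsilon) + beta;
    }
}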
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
index 2f9014a651..9f436da7f6 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
#define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
// Forward declarations
@@ -59,7 +59,11 @@ public:
* @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
* @param[in] info Kernel meta-data descriptor
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *mean_var,
+ ICLTensor *output,
+ const InstanceNormalizationLayerKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
*
@@ -69,7 +73,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -106,7 +111,8 @@ public:
* @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
* @param[in] use_mixed_precision Use mixed precision in case of FP16 execution
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision);
+ void
+ configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision);
/** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
*
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index c688951d57..9ed9d7c5b0 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,11 +29,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -42,7 +43,8 @@ namespace
{
constexpr int max_input_tensor_dim = 3;
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_UNUSED(epsilon);
@@ -52,14 +54,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions,
+ "Actual normalization axis greater than max number of dimensions");
// Reduce shape on axis
TensorShape sum_shape = input->tensor_shape();
sum_shape.set(actual_axis, 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -74,18 +77,25 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel()
: _input(nullptr), _sum(nullptr), _output(nullptr), _actual_axis(0), _epsilon(1e-12)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(
+ const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
{
configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, axis, epsilon);
}
-void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *sum,
+ ICLTensor *output,
+ int axis,
+ float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
- auto padding_info = get_padding_info({ input, sum, output });
+ auto padding_info = get_padding_info({input, sum, output});
_input = input;
_sum = sum;
@@ -93,8 +103,9 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
_actual_axis = wrap_around(axis, max_input_tensor_dim);
_epsilon = epsilon;
- const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
- const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+ const unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
// Set build options
CLBuildOptions build_opts;
@@ -105,7 +116,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
// Create kernel
std::string kernel_name;
unsigned int idx = 0;
- switch(_actual_axis)
+ switch (_actual_axis)
{
case 0:
kernel_name = "l2_normalize_x";
@@ -125,7 +136,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set epsilon argument
- if(input->info()->data_type() == DataType::F32)
+ if (input->info()->data_type() == DataType::F32)
{
_kernel.setArg<cl_float>(idx, _epsilon);
}
@@ -144,7 +155,8 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status CLL2NormalizeLayerKernel::validate(
+ const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
return Status{};
@@ -157,7 +169,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
Window window_sum(window);
- switch(_actual_axis)
+ switch (_actual_axis)
{
case 0:
{
@@ -171,8 +183,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_2D_tensor_argument(idx, _sum, sum_slice);
add_2D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
}
break;
case 1:
@@ -187,8 +198,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_2D_tensor_argument(idx, _sum, sum_slice);
add_2D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
}
break;
case 2:
@@ -203,8 +213,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_3D_tensor_argument(idx, _sum, sum_slice);
add_3D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+ } while (window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
}
break;
default:
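The kernel above consumes a pre-reduced sum-of-squares tensor plus an epsilon lower bound; along the x axis the net effect is equivalent to the following plain C++ sketch (a sketch of the math under that reading, not the CL implementation):

#include <algorithm>
#include <cmath>
#include <vector>

// L2-normalize one row: divide by sqrt(max(sum of squares, epsilon)),
// where epsilon is the lower bound guarding against division by zero.
void l2_normalize_row(std::vector<float> &row, float epsilon)
{
    float sq_sum = 0.f;
    for (float v : row)
    {
        sq_sum += v * v;
    }
    const float norm = std::sqrt(std::max(sq_sum, epsilon));
    for (float &v : row)
    {
        v /= norm;
    }
}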
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
index edc0585217..5c9ab94ce5 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -70,7 +71,12 @@ public:
* @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis: 2
* @param[in] epsilon Lower bound value for the normalization.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *sum,
+ ICLTensor *output,
+ int axis,
+ float epsilon);
/** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel.
*
@@ -84,7 +90,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
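The vec_size_x / vec_size_x_leftovers pattern that recurs in these kernels picks an OpenCL vector width no wider than the innermost dimension and leaves the remainder to a scalar tail in the kernel. A sketch of the arithmetic; the halving policy is an assumption, only the modulo mirrors the code above:

// Hypothetical sketch of the vector-size adjustment.
// preferred is typically max_cl_vector_width / element_size, e.g. 16 / 4 = 4 for F32.
unsigned adjust_vec_size_sketch(unsigned preferred, unsigned dim_x)
{
    unsigned v = preferred;
    while (v > 1 && v > dim_x) // assumed policy: halve until the vector fits
    {
        v /= 2;
    }
    return v;
}
// leftovers = dim_x % vec_size, e.g. dim_x = 10, vec_size = 4 -> 2 scalar
// elements handled by the kernel's leftover path.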
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
index ab68e0f68d..e560f1de4a 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,8 +29,9 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,26 +43,31 @@ using namespace misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices);
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- PoolingType pool_type = pool_info.pool_type;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ PoolingType pool_type = pool_info.pool_type;
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_size_x = pool_info.pool_size.width;
- const int pool_size_y = pool_info.pool_size.height;
+ const int pool_size_x = pool_info.pool_size.width;
+ const int pool_size_y = pool_info.pool_size.height;
const Size2D pool_size(pool_size_x, pool_size_y);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+ "Pooling indices only supported for MAX pooling method");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -71,16 +77,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel()
- : _input(nullptr), _output(nullptr), _indices(nullptr)
+CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() : _input(nullptr), _output(nullptr), _indices(nullptr)
{
+ _type = CLKernelType::POOL;
}
-void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info()));
- auto padding_info = get_padding_info({ input, indices, output });
+ auto padding_info = get_padding_info({input, indices, output});
_input = input;
_output = output;
@@ -118,7 +128,10 @@ void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
@@ -139,7 +152,6 @@ void CLMaxUnpoolingLayerKernel::run(const Window &window, cl::CommandQueue &queu
add_3D_tensor_argument(idx, _output, slice);
add_3D_tensor_argument(idx, _indices, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
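The validation above restricts indices to 2x2 MAX pooling; the unpooling itself is a scatter of each pooled value back to the position its index records, sketched here in plain C++ (hypothetical helper, not the CL kernel):

#include <cstddef>
#include <vector>

// Scatter pooled values to the flat indices recorded by the forward
// max pool; every other output element remains zero.
std::vector<float> max_unpool_sketch(const std::vector<float>    &pooled,
                                     const std::vector<unsigned> &indices,
                                     std::size_t                  output_size)
{
    std::vector<float> out(output_size, 0.f);
    for (std::size_t i = 0; i < pooled.size(); ++i)
    {
        out[indices[i]] = pooled[i];
    }
    return out;
}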
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
index 45481d0507..eb18a46784 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
@@ -59,7 +59,11 @@ public:
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayerKernel
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -72,7 +76,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info);
// Inherited methods overridden
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
index 9f98b67582..8632bdf623 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,9 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -47,7 +50,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -59,6 +62,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
CLMeanStdDevNormalizationKernel::CLMeanStdDevNormalizationKernel()
: _input(nullptr), _output(nullptr), _run_in_place(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *output, float epsilon)
@@ -66,15 +70,19 @@ void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
}
-void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
+void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
_run_in_place = (output == nullptr) || (output == input);
- ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+ ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(
+ input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output->info(), *input->info());
}
@@ -82,7 +90,8 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_
_input = input;
_output = output;
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
// Set build options
CLBuildOptions build_opts;
@@ -90,6 +99,7 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DMEANSTDNORM_HALF");
build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
// Create kernel
@@ -130,7 +140,6 @@ void CLMeanStdDevNormalizationKernel::run(const Window &window, cl::CommandQueue
add_2D_tensor_argument_if((!_run_in_place), idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
} // namespace arm_compute
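The build options above (-DEPSILON, -DWIDTH, and the new -DMEANSTDNORM_HALF for F16) drive a per-row normalization; its math, sketched in plain C++ with epsilon guarding zero variance:

#include <cmath>
#include <vector>

// Normalize one row to zero mean and unit standard deviation.
void mean_stddev_normalize_sketch(std::vector<float> &row, float epsilon)
{
    float sum = 0.f, sq_sum = 0.f;
    for (float v : row)
    {
        sum += v;
        sq_sum += v * v;
    }
    const float n          = static_cast<float>(row.size());
    const float mean       = sum / n;
    const float var        = sq_sum / n - mean * mean;
    const float inv_stddev = 1.f / std::sqrt(var + epsilon);
    for (float &v : row)
    {
        v = (v - mean) * inv_stddev;
    }
}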
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
index a1ba2b905e..e02a3c58a3 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
@@ -66,7 +66,10 @@ public:
* @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
* @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output = nullptr,
+ float epsilon = 1e-8f);
/** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel
*
* @param[in] input Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
deleted file mode 100644
index ac8770467e..0000000000
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLMinMaxLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <climits>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
-
- if(output->tensor_shape().total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- TensorShape output_shape = compute_min_max_shape(input);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- TensorShape output_shape = compute_min_max_shape(input);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, output_shape, 1, input->data_type());
-
- const unsigned int num_elems_processed_per_iteration = 1;
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowStatic output_access(output, 0, 0, 2, output->dimension(1));
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_tuple(err, win);
-}
-} // namespace
-
-CLMinMaxLayerKernel::CLMinMaxLayerKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- std::set<std::string> build_opts;
- build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.emplace("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
- build_opts.emplace("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
-
- // Create kernel
- _kernel = create_kernel(compile_context, "minmax_layer", build_opts);
-
- auto win_config = validate_and_configure_window(input->info(), output->info());
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure_internal(std::get<1>(win_config));
-}
-
-Status CLMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
-
- return Status{};
-}
-
-void CLMinMaxLayerKernel::reset(cl::CommandQueue &queue)
-{
- _output->map(queue, true);
-
- Window window_output;
- window_output.use_tensor_dimensions(_output->info()->tensor_shape());
- window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator output(_output, window_output);
-
- // Reset output
- execute_window_loop(window_output, [&](const Coordinates &)
- {
- auto *ptr = reinterpret_cast<float *>(output.ptr());
- ptr[0] = std::numeric_limits<float>::max();
- ptr[1] = std::numeric_limits<float>::min();
- },
- output);
-
- _output->unmap(queue);
-}
-
-void CLMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
- Window slice = window_collapsed.first_slice_window_3D();
- slice.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice.set(Window::DimY, Window::Dimension(0, 1, 1));
- slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- do
- {
- Window output_slice = slice.shift_dimensions(2);
-
- unsigned int idx = 0;
- // Set inputs
- add_3D_tensor_argument(idx, _input, slice);
- add_1D_tensor_argument(idx, _output, output_slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
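For context on the deletion above: the kernel reduced each 3D input (one per batch) to a {min, max} pair in a [2, batches] output, with reset() seeding the pair before the reduction. A plain C++ sketch; note that reset() seeded the running maximum with std::numeric_limits<float>::min(), the smallest positive normal float, whereas lowest() is the usual identity for a maximum over possibly negative data:

#include <limits>
#include <vector>

// Min/max reduction over one batch, matching the [2, batches] output layout.
void minmax_batch_sketch(const std::vector<float> &batch, float &mn, float &mx)
{
    mn = std::numeric_limits<float>::max();    // identity for min
    mx = std::numeric_limits<float>::lowest(); // identity for max
    for (float v : batch)
    {
        mn = v < mn ? v : mn;
        mx = v > mx ? v : mx;
    }
}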
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.h b/src/core/CL/kernels/CLMinMaxLayerKernel.h
deleted file mode 100644
index aa2ff3f375..0000000000
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
-#define ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to perform min max search on a 3D tensor.
- */
-class CLMinMaxLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLMinMaxLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMinMaxLayerKernel(const CLMinMaxLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMinMaxLayerKernel &operator=(const CLMinMaxLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLMinMaxLayerKernel(CLMinMaxLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLMinMaxLayerKernel &operator=(CLMinMaxLayerKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data types supported: F32.
- * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
- * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data types supported: F32.
- * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
- * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLMinMaxLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: F32.
- * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
- * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- /** Resets global minimum and maximum
- *
- * @param[in,out] queue Command queue on which to map and unmap the min_max tensor
- */
- void reset(cl::CommandQueue &queue);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMINMAXLAYERKERNEL_H */
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 9242505315..b636c485e7 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,7 +29,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -51,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -61,7 +64,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
@@ -69,20 +73,33 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
bool window_changed = false;
Window win;
const DataLayout data_layout = input->data_layout();
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
- const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info);
- const bool is_norm_accross_width = norm_idx == 0;
+ const unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+ const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info);
+ const bool is_norm_across_width = norm_idx == 0;
- const unsigned int border_width = is_norm_accross_width ? vec_size_x - 1 : 0;
- const BorderSize border_size = BorderSize(0, border_width);
+ const unsigned int norm_radius = norm_info.norm_size() / 2;
+ // Border / padding calculation:
+ // For NCHW no border handling is implemented in the kernel in the x axis.
+ // This means the x axis is fully padded depending on vec_size_x and norm_size.
+ // E.g. for input x dimension = 3, norm_size = 3 (radius = 1), vec_size_x = 2 ('#' is an element, 'p' is padding):
+ // In : |p|#|#|#|p|p|
+ // Out: |#|#|#|p|
+ // The output has 1 element of right padding because of vec_size_x.
+ // The input has 1 element of left padding because radius = 1.
+ // The input has 2 elements of right padding because radius = 1 AND because of the extra output padding.
+ const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0;
+ const unsigned int border_width_right =
+ is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0;
+ const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left);
win = calculate_max_window(*input, Steps(vec_size_x));
// We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
// Reads can occur within the valid region of the input
- if(is_norm_accross_width)
+ if (is_norm_across_width)
{
AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
window_changed = window_changed || update_window_and_padding(win, input_access);
@@ -99,13 +116,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
else
{
unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
- if(norm_info.is_cross_map())
+ if (norm_info.is_cross_map())
{
vec_size_x = 1;
}
win = calculate_max_window(*input, Steps(vec_size_x));
}
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -113,6 +131,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
CLNormalizationLayerKernel::CLNormalizationLayerKernel()
: _input(nullptr), _output(nullptr), _border_size(0), _is_norm_across_width(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
BorderSize CLNormalizationLayerKernel::border_size() const
@@ -125,10 +144,13 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
}
-void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
@@ -138,21 +160,34 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
_input = input;
_output = output;
- const DataLayout data_layout = input->info()->data_layout();
- unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
- int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
- if(norm_info.is_cross_map() && data_layout == DataLayout::NHWC)
+ const DataLayout data_layout = input->info()->data_layout();
+ unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+ if (norm_info.is_cross_map() && data_layout == DataLayout::NHWC)
{
vec_size_x = 1;
vec_size_x_leftovers = 0;
}
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- const unsigned int norm_idx = get_normalization_dimension_index(data_layout, norm_info);
- _is_norm_across_width = norm_idx == 0;
- const unsigned int border_width = _is_norm_across_width ? vec_size_x - 1 : 0;
- _border_size = BorderSize(0, border_width);
+ const unsigned int norm_idx = get_normalization_dimension_index(data_layout, norm_info);
+ _is_norm_across_width = norm_idx == 0;
+ const unsigned int norm_radius = norm_info.norm_size() / 2;
+ // Border / padding calculation:
+ // For NCHW no border handling is implemented in the kernel in the x axis.
+ // This means the x axis is fully padded depending on vec_size_x and norm_size.
+ // E.g. for input x dimension = 3, norm_size = 3 (radius = 1), vec_size_x = 2 ('#' is an element, 'p' is padding):
+ // In : |p|#|#|#|p|p|
+ // Out: |#|#|#|p|
+ // The output has 1 element of right padding because of vec_size_x.
+ // The input has 1 element of left padding because radius = 1.
+ // The input has 2 elements of right padding because radius = 1 AND the extra output padding.
+ const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0;
+ const unsigned int border_width_right =
+ _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0;
+ _border_size = BorderSize(0, border_width_right, 0, border_width_left);
}
const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
@@ -168,12 +203,14 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
- build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC, "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()),
+ "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC,
+ "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1)));
// Create kernel
std::string kernel_name;
- if(norm_info.is_in_map())
+ if (norm_info.is_in_map())
{
kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout));
}
@@ -197,16 +234,19 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
_config_id += support::cpp11::to_string(input->info()->dimension(0));
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(1));
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
}
-Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
+Status CLNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
return Status{};
}
@@ -226,7 +266,6 @@ void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &que
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
+ } while (window_collapsed.slide_window_slice_3D(slice));
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
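For the border arithmetic documented in the comment above, here is a minimal standalone C++ sketch. It uses the hypothetical sizes from that comment (width 3, norm_size 3, vec_size_x 2), not any real configuration, and mirrors the expressions added in configure():

    // Sketch of the NCHW left/right border computation (values from the
    // worked example in the diff's comment).
    #include <iostream>

    int main()
    {
        const unsigned int input_width = 3; // x dimension of the input tensor
        const unsigned int norm_size   = 3; // normalization window -> radius 1
        const unsigned int vec_size_x  = 2; // vector width chosen for the kernel

        const unsigned int norm_radius = norm_size / 2;
        // Left border only needs to cover the normalization radius.
        const unsigned int border_left = norm_radius;
        // Right border covers the radius plus the round-up of the width
        // to a multiple of vec_size_x.
        const unsigned int border_right =
            norm_radius + (vec_size_x - input_width % vec_size_x);

        std::cout << "left=" << border_left << " right=" << border_right << '\n';
        // Prints: left=1 right=2, matching the |p|#|#|#|p|p| diagram.
        return 0;
    }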
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.h b/src/core/CL/kernels/CLNormalizationLayerKernel.h
index 739a2ae9f1..5517ba6904 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.h
@@ -63,7 +63,10 @@ public:
* Data layouts supported: same as @p input.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ NormalizationLayerInfo norm_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -77,7 +80,7 @@ public:
static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info);
// Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
+ void run(const Window &window, cl::CommandQueue &queue) override;
BorderSize border_size() const override;
private:
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
index cf2511adec..59352a8fb7 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,32 +29,37 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors");
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0));
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -75,7 +80,8 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input,
bool window_changed = update_window_and_padding(win, input_access, output_access);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -83,14 +89,22 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input,
CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel()
: _input(nullptr), _output(nullptr), _mean(nullptr), _std(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
}
-void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std);
@@ -99,7 +113,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -109,9 +123,10 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
const DataLayout data_layout = input->info()->data_layout();
// Get number of elements to process per iterations
- const unsigned int num_elems_processed_per_iteration = (data_layout == DataLayout::NHWC) ? adjust_vec_size(16 / input->info()->element_size(),
- input->info()->dimension(0)) :
- (16 / input->info()->element_size());
+ const unsigned int num_elems_processed_per_iteration =
+ (data_layout == DataLayout::NHWC)
+ ? adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0))
+ : (16 / input->info()->element_size());
const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const DataType dt = input->info()->data_type();
@@ -119,11 +134,12 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
CLBuildOptions build_opts;
build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
- build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)));
+ build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)));
build_opts.add_option(("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(channel_idx))));
std::string kernel_name = "normalize_planar_yuv_layer_";
- if(is_data_type_quantized(dt))
+ if (is_data_type_quantized(dt))
{
const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
build_opts.add_option(("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)));
@@ -136,7 +152,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure kernel window
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win);
@@ -162,12 +178,16 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
_config_id += support::cpp11::to_string(input->info()->dimension(2));
}
-Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *std)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std));
- if(input->data_layout() == DataLayout::NCHW)
+ if (input->data_layout() == DataLayout::NCHW)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first);
}
return Status{};
}
@@ -193,7 +213,6 @@ void CLNormalizePlanarYUVLayerKernel::run(const Window &window, cl::CommandQueue
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
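As a rough illustration of how the NHWC path above picks the number of elements processed per iteration: the halving loop below is an assumption about adjust_vec_size() (the real helper lives in arm_compute/core/utils/helpers/AdjustVecSize.h and may differ in detail); only the 16-byte-per-iteration budget comes from the diff itself.

    // Approximate sketch of the vector-size selection for NHWC vs NCHW.
    #include <cstddef>
    #include <iostream>

    static unsigned int approx_adjust_vec_size(unsigned int vec_size, std::size_t dim0)
    {
        while (vec_size > dim0 && vec_size > 1)
        {
            vec_size /= 2; // shrink until the vector fits the dimension
        }
        return vec_size;
    }

    int main()
    {
        const std::size_t  element_size = 2;                 // e.g. F16
        const std::size_t  width        = 5;                 // dimension(0)
        const unsigned int budget       = 16 / element_size; // 8 elements per 16 bytes

        // NHWC clamps to the width; NCHW keeps the full budget.
        std::cout << "NHWC: " << approx_adjust_vec_size(budget, width) << '\n'; // 4
        std::cout << "NCHW: " << budget << '\n';                                // 8
        return 0;
    }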
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
index 6db4433e78..341b404e3d 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
@@ -67,7 +67,11 @@ public:
* @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
* Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel
*
* @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
@@ -79,7 +83,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp
index 2f54b390d5..0ac285038e 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -34,25 +36,29 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_UNUSED(constant_value);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON((padding.size() < 1) || (padding.size() > input->num_dimensions()));
- if(mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
+ if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
{
ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3);
const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
- for(size_t i = 0; i < padding.size(); ++i)
+ for (size_t i = 0; i < padding.size(); ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect));
ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect));
}
}
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
@@ -64,40 +70,51 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLPadLayerKernel::CLPadLayerKernel()
- : _input(nullptr), _output(nullptr), _4d_enabled(false)
+CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _4d_enabled(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(
+ const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
}
-void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding)));
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding)));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, constant_value, mode));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
_4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3);
// Set build options
- const DataType &data_type = input->info()->data_type();
- const unsigned int input_width = input->info()->dimension(0);
- const unsigned int input_height = input->info()->dimension(1);
- const unsigned int input_depth = input->info()->dimension(2);
- const unsigned int pad_x_before = padding.at(0).first;
- const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
- const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
- const unsigned int vec_size = adjust_vec_size(std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))), input_width);
- const unsigned int pad_right_start = input_width + pad_x_before;
- const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
- const unsigned int vec_size_leftover_write = vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0));
+ const DataType &data_type = input->info()->data_type();
+ const unsigned int input_width = input->info()->dimension(0);
+ const unsigned int input_height = input->info()->dimension(1);
+ const unsigned int input_depth = input->info()->dimension(2);
+ const unsigned int pad_x_before = padding.at(0).first;
+ const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
+ const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
+ const unsigned int vec_size = adjust_vec_size(
+ std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))),
+ input_width);
+ const unsigned int pad_right_start = input_width + pad_x_before;
+ const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
+ const unsigned int vec_size_leftover_write =
+ vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0));
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -106,12 +123,12 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + support::cpp11::to_string(pad_x_before_remainder));
build_opts.add_option("-DVEC_SIZE_LEFTOVER_WRITE=" + support::cpp11::to_string(vec_size_leftover_write));
- if(padding.size() > 1)
+ if (padding.size() > 1)
{
build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before));
build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
- if(padding.size() > 2)
+ if (padding.size() > 2)
{
build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before));
build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth));
@@ -119,23 +136,25 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
}
std::string kernel_name = "pad_layer_";
- switch(mode)
+ switch (mode)
{
case PaddingMode::CONSTANT:
{
kernel_name += "constant";
- const unsigned int vec_size_leftover_read = vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start);
+ const unsigned int vec_size_leftover_read =
+ vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start);
build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type));
build_opts.add_option("-DVEC_SIZE_LEFTOVER_READ=" + support::cpp11::to_string(vec_size_leftover_read));
- if(pad_x_before >= vec_size)
+ if (pad_x_before >= vec_size)
{
build_opts.add_option("-DTHREADS_TO_SKIP_BEFORE=" + support::cpp11::to_string(pad_x_before / vec_size));
- build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + support::cpp11::to_string(pad_right_start / vec_size));
+ build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" +
+ support::cpp11::to_string(pad_right_start / vec_size));
}
- if(_4d_enabled)
+ if (_4d_enabled)
{
build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first));
build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
@@ -152,14 +171,17 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
const unsigned int pad_x_after_remainder = pad_right_start % vec_size;
const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect;
- const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
+ const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect));
build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + support::cpp11::to_string(pad_x_after_remainder));
- build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
- build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
+ build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
+ build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x));
- build_opts.add_option_if(after_pad_fact_x < output_last_x, "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
+ build_opts.add_option_if(after_pad_fact_x < output_last_x,
+ "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
break;
}
@@ -177,7 +199,11 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status CLPadLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode));
return Status{};
@@ -195,13 +221,12 @@ void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
- if(_4d_enabled)
+ if (_4d_enabled)
{
add_argument<unsigned int>(idx, batch++);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
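The VEC_SIZE_LEFTOVER_WRITE expression in the hunk above determines how many valid elements the final, possibly partial, vector store covers. A minimal sketch with assumed sizes, reimplementing ceil_to_multiple() locally for self-containment:

    // Sketch of vec_size_leftover_write = vec_size - (ceil(out_w) - out_w).
    #include <iostream>

    static unsigned int ceil_to_multiple(unsigned int value, unsigned int multiple)
    {
        return ((value + multiple - 1) / multiple) * multiple;
    }

    int main()
    {
        const unsigned int output_width = 10; // assumed padded output width
        const unsigned int vec_size     = 4;  // assumed vector width

        const unsigned int leftover_write =
            vec_size - (ceil_to_multiple(output_width, vec_size) - output_width);

        std::cout << leftover_write << '\n'; // 2: the last store writes 2 elements
        return 0;
    }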
diff --git a/src/core/CL/kernels/CLPadLayerKernel.h b/src/core/CL/kernels/CLPadLayerKernel.h
index 90af337f94..dca121b6a1 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.h
+++ b/src/core/CL/kernels/CLPadLayerKernel.h
@@ -56,7 +56,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Set the input and output tensor.
*
* @param[in] compile_context The compile context to be used.
@@ -68,8 +72,12 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(),
- PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel
*
* @param[in] input Source tensor info. Data types supported: All.
@@ -80,7 +88,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index 7b9caf0063..7dcdf1de6f 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,10 +30,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +42,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status validate_arguments(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
@@ -51,10 +54,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
// Check variances
const int var_size = info.variances().size();
- if(var_size > 1)
+ if (var_size > 1)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
- for(int i = 0; i < var_size; ++i)
+ for (int i = 0; i < var_size; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0");
}
@@ -62,17 +65,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0");
- if(!info.max_sizes().empty())
+ if (!info.max_sizes().empty())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(),
+ "Max and min sizes dimensions should match");
}
- for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+ for (unsigned int i = 0; i < info.max_sizes().size(); ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i],
+ "Max size should be greater than min size");
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
}
@@ -80,7 +85,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info, int num_priors)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ ITensorInfo *output,
+ const PriorBoxLayerInfo &info,
+ int num_priors)
{
ARM_COMPUTE_UNUSED(input2);
// Output tensor auto initialization if not yet initialized
@@ -88,10 +97,11 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
auto_init_if_empty(*output, output_shape, 1, input1->data_type());
const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, output_access);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -99,15 +109,28 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
CLPriorBoxLayerKernel::CLPriorBoxLayerKernel()
: _input1(nullptr), _input2(nullptr), _output(nullptr), _info(), _num_priors(), _min(), _max(), _aspect_ratios()
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info, min, max, aspect_ratios);
}
-void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min,
- cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
@@ -134,7 +157,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
int img_width = info.img_size().x;
int img_height = info.img_size().y;
- if(img_width == 0 || img_height == 0)
+ if (img_width == 0 || img_height == 0)
{
img_width = input2->info()->dimension(width_idx);
img_height = input2->info()->dimension(height_idx);
@@ -142,7 +165,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
float step_x = info.steps()[0];
float step_y = info.steps()[0];
- if(step_x == 0.f || step_y == 0.f)
+ if (step_x == 0.f || step_y == 0.f)
{
step_x = static_cast<float>(img_width) / layer_width;
step_y = static_cast<float>(img_height) / layer_height;
@@ -161,18 +184,20 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(info.offset()));
build_opts.add_option_if(info.clip(), "-DIN_PLACE");
- if(info.variances().size() > 1)
+ if (info.variances().size() > 1)
{
- for(unsigned int i = 0; i < info.variances().size(); ++i)
+ for (unsigned int i = 0; i < info.variances().size(); ++i)
{
- build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(i)));
+ build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(info.variances().at(i)));
}
}
else
{
- for(unsigned int i = 0; i < 4; ++i)
+ for (unsigned int i = 0; i < 4; ++i)
{
- build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(0)));
+ build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(info.variances().at(0)));
}
}
@@ -193,13 +218,17 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
ICLKernel::configure_internal(win_config.second);
}
-Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info, num_priors)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(),
+ output->clone().get(), info, num_priors)
+ .first);
return Status{};
}
@@ -210,8 +239,9 @@ void CLPriorBoxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
queue.enqueueWriteBuffer(*_min, CL_TRUE, 0, _info.min_sizes().size() * sizeof(float), _info.min_sizes().data());
- queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), _info.aspect_ratios().data());
- if(!_info.max_sizes().empty())
+ queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float),
+ _info.aspect_ratios().data());
+ if (!_info.max_sizes().empty())
{
queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data());
}
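The num_priors expression used by validate() and validate_and_configure_window() above drives the window step of 4 * num_priors, since each prior box contributes 4 coordinates. A short sketch with assumed example sizes:

    // Sketch of the num_priors / window-step arithmetic (assumed values).
    #include <iostream>
    #include <vector>

    int main()
    {
        const std::vector<float> aspect_ratios = {1.f, 2.f, 0.5f}; // assumed
        const std::vector<float> min_sizes     = {30.f};           // assumed
        const std::vector<float> max_sizes     = {60.f};           // assumed

        const int num_priors = static_cast<int>(
            aspect_ratios.size() * min_sizes.size() + max_sizes.size());

        // Each prior box is 4 coordinates, hence the step of 4 * num_priors.
        std::cout << "num_priors=" << num_priors          // 4
                  << " step=" << 4 * num_priors << '\n';  // 16
        return 0;
    }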
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.h b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
index 6c369a7a4e..a50e0c5ff5 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.h
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
@@ -57,7 +57,13 @@ public:
* @param[in] max Maximum prior box values
* @param[in] aspect_ratios Aspect ratio values
*/
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios);
+ void configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -69,8 +75,14 @@ public:
* @param[in] max Maximum prior box values
* @param[in] aspect_ratios Aspect ratio values
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max,
- cl::Buffer *aspect_ratios);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios);
/** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel
*
* @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
@@ -80,14 +92,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
- const ICLTensor *_input1;
- const ICLTensor *_input2;
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
ICLTensor *_output;
PriorBoxLayerInfo _info;
int _num_priors;
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
index f68520dee6..731fcb8e04 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,8 +22,12 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -47,14 +51,19 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
const uint32_t temp_num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
/* If width is less than step, then make step same as width to avoid global size being step instead of actual width. */
/* Or we should fix in arm_compute::enqueue() or arm_compute::calculate_max_window(). */
- const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) ? input->dimension(0) : temp_num_elems_processed_per_iteration;
+ const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration)
+ ? input->dimension(0)
+ : temp_num_elems_processed_per_iteration;
// This kernel doesn't need padding
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
return std::make_pair(Status{}, win);
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weight, bias, output);
@@ -70,7 +79,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias);
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -82,12 +91,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
CLQLSTMLayerNormalizationKernel::CLQLSTMLayerNormalizationKernel()
: _input(nullptr), _weight(nullptr), _bias(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output);
- auto padding_info = get_padding_info({ input, weight, bias, output });
+ auto padding_info = get_padding_info({input, weight, bias, output});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), weight->info(), bias->info()));
@@ -101,7 +115,8 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
int32_t output_multiplier{};
int32_t output_shift{};
const UniformQuantizationInfo quan_info = _weight->info()->quantization_info().uniform();
- const Status status = quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
+ const Status status =
+ quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
output_shift *= -1;
// Set build options
@@ -111,8 +126,12 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
- build_opts.add_option("-DMIN_BOUND=" + support::cpp11::to_string(std::get<0>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
- build_opts.add_option("-DMAX_BOUND=" + support::cpp11::to_string(std::get<1>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+ build_opts.add_option("-DMIN_BOUND=" +
+ support::cpp11::to_string(std::get<0>(
+ quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+ build_opts.add_option("-DMAX_BOUND=" +
+ support::cpp11::to_string(std::get<1>(
+ quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
// Create kernel
_kernel = create_kernel(compile_context, "qlstm_layer_normalization", build_opts.options());
@@ -132,12 +151,18 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, weight, bias);
}
-Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, weight, bias));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
@@ -168,7 +193,6 @@ void CLQLSTMLayerNormalizationKernel::run(const Window &window, cl::CommandQueue
add_2D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
} // namespace arm_compute
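The step clamping in validate_and_configure_window() above, restated as a tiny sketch: when the row is narrower than the vector budget, the step is clamped to the row width so the global size does not overshoot. Values are assumed.

    // Sketch of num_elems_processed_per_iteration clamping (assumed values).
    #include <cstdint>
    #include <iostream>

    int main()
    {
        const uint32_t max_cl_vector_width = 16; // assumed vector budget in bytes
        const uint32_t element_size        = 2;  // QSYMM16
        const uint32_t width               = 5;  // dimension(0), narrower than budget

        const uint32_t budget = max_cl_vector_width / element_size; // 8
        const uint32_t step   = (width < budget) ? width : budget;  // clamped to 5

        std::cout << "step=" << step << '\n';
        return 0;
    }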
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
index 31085c37ba..ba912e1d2d 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
@@ -63,7 +63,11 @@ public:
* @param[in] weight Weight tensor. Data types supported: Same as @p input.
* @param[in] bias Bias tensor. Data types supported: S32.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias);
/** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel
*
* @param[in] input Source tensor info with 2 dimensions. Data types supported: QSYMM16.
@@ -73,7 +77,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index 9894c731fe..c97910ef79 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,8 +29,9 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,24 +43,29 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info),
+ output->tensor_shape());
}
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16);
@@ -79,14 +85,22 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITe
CLROIAlignLayerKernel::CLROIAlignLayerKernel()
: _input(nullptr), _output(nullptr), _rois(nullptr), _pool_info(0, 0, 0.f)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
@@ -96,7 +110,7 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
output->info()->set_data_layout(input->info()->data_layout());
- auto padding_info = get_padding_info({ input, rois, output });
+ auto padding_info = get_padding_info({input, rois, output});
_input = input;
_output = output;
@@ -110,16 +124,23 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH))));
- build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
- build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
+ build_opts.add_option("-DMAX_DIM_X=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::WIDTH))));
+ build_opts.add_option("-DMAX_DIM_Y=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
+ build_opts.add_option("-DMAX_DIM_Z=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale()));
build_opts.add_option_if(input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
- build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
+ build_opts.add_option_if(pool_info.sampling_ratio() > 0,
+ "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
- if(is_qasymm)
+ if (is_qasymm)
{
const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
const UniformQuantizationInfo roisq_info = rois->info()->quantization_info().uniform();
@@ -143,7 +164,10 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
return Status{};
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.h b/src/core/CL/kernels/CLROIAlignLayerKernel.h
index 5284a5913f..2e84e5d303 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.h
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.h
@@ -61,7 +61,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -77,7 +78,11 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -93,7 +98,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue);
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index 7a843d65a2..1b2c414a49 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,8 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,9 +46,13 @@ namespace arm_compute
CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
: _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
@@ -59,10 +65,11 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) ||
+ (output->dimension(1) != pool_info.pooled_height()));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
}
@@ -70,20 +77,30 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor
return Status{};
}
-void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
- ARM_COMPUTE_ERROR_THROW_ON(CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info));
- auto padding_info = get_padding_info({ input, rois, output });
+ auto padding_info = get_padding_info({input, rois, output});
// Output auto initialization if not yet initialized
- TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1));
- auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(), output->info()->quantization_info());
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2),
+ rois->info()->dimension(1));
+ auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(),
+ output->info()->quantization_info());
// Set instance variables
_input = input;
@@ -105,11 +122,12 @@ void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context,
build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
build_opts.add_option("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale()));
- if(is_qasymm)
+ if (is_qasymm)
{
// Determine quantization info scale, offset
UniformQuantizationInfo uqinfo = UniformQuantizationInfo();
- uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform());
+ uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(),
+ _output->info()->quantization_info().uniform());
build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(uqinfo.offset));
build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(uqinfo.scale));
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
index 7b7b457632..80bfb63092 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
@@ -59,7 +59,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -74,7 +75,11 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -92,7 +97,10 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
private:
const ICLTensor *_input;
diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp
index 85f79988c9..622f6210b9 100644
--- a/src/core/CL/kernels/CLRangeKernel.cpp
+++ b/src/core/CL/kernels/CLRangeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -40,11 +43,8 @@ constexpr unsigned int vector_size_byte_opencl = 16;
Status validate_arguments(const ITensorInfo *output, const float start, const float end, const float step)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
- 1,
- DataType::U8, DataType::S8, DataType::QASYMM8,
- DataType::U16, DataType::S16,
- DataType::U32, DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
@@ -54,20 +54,24 @@ Status validate_arguments(const ITensorInfo *output, const float start, const fl
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), "start value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), "end value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), "step value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()),
+ "start value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()),
+ "end value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()),
+ "step value is outside the range of the data type");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->num_dimensions() != 1, "Output has to be a 1-D tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step),
+ "Output tensor size is incorrect");
return Status{};
}
} // namespace
-CLRangeKernel::CLRangeKernel()
- : _start(0), _end(1), _step(1), _output(nullptr)
+CLRangeKernel::CLRangeKernel() : _start(0), _end(1), _step(1), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLRangeKernel::configure(ICLTensor *output, const float start, const float end, const float step)
@@ -75,16 +79,18 @@ void CLRangeKernel::configure(ICLTensor *output, const float start, const float
configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
}
-void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+void CLRangeKernel::configure(
+ const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(output->info(), start, end, step));
// Configure kernel window
- unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0));
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0));
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- auto padding_info = get_padding_info({ output });
+ auto padding_info = get_padding_info({output});
_start = start;
_end = end;
@@ -97,10 +103,11 @@ void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DSTART=" + support::cpp11::to_string(start));
build_opts.add_option("-DSTEP=" + support::cpp11::to_string(step));
- if(is_data_type_quantized_asymmetric(output->info()->data_type()))
+ if (is_data_type_quantized_asymmetric(output->info()->data_type()))
{
const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(qinfo.offset));
@@ -135,4 +142,4 @@ void CLRangeKernel::run(const Window &window, cl::CommandQueue &queue)
enqueue(queue, *this, window, lws_hint());
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
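The window setup above sizes each work-item's vector from a 16-byte budget (vector_size_byte_opencl / element_size) and clamps it to the tensor width with adjust_vec_size, while VEC_SIZE_LEFTOVER tells the kernel how many elements remain in the final partial vector. A standalone sketch of that arithmetic (not the ACL helper itself, which may special-case some widths):

    // Shrink the requested vector width to a power of two that fits dim0.
    unsigned int shrink_vec_size(unsigned int vec_size, unsigned int dim0)
    {
        while (vec_size > dim0)
        {
            vec_size >>= 1;
        }
        return vec_size;
    }

    // Example: an F32 output (element size 4) with 10 elements:
    //   requested = 16 / 4 = 4, vec_size = 4, leftover = 10 % 4 = 2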
diff --git a/src/core/CL/kernels/CLRangeKernel.h b/src/core/CL/kernels/CLRangeKernel.h
index 1b94a099ed..65251a11e5 100644
--- a/src/core/CL/kernels/CLRangeKernel.h
+++ b/src/core/CL/kernels/CLRangeKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLRANGEKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 133a35f513..c8665f8fbd 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,14 +28,15 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -46,23 +47,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- if(input->num_channels() == 1)
+ if (input->num_channels() == 1)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
}
else
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(axis == 0);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8,
+ "Not supported reduction operation for QASYMM8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) && (input->data_type() != DataType::QASYMM8)
- && (input->data_type() != DataType::QASYMM8_SIGNED));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), "Not supported reduction operation, use CLArgMinMaxLayer");
+ ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) &&
+ (input->data_type() != DataType::QASYMM8) &&
+ (input->data_type() != DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN),
+ "Not supported reduction operation, use CLArgMinMaxLayer");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -75,35 +81,45 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
CLReductionOperationKernel::CLReductionOperationKernel()
: _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLReductionOperationKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
}
-void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLReductionOperationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
_reduction_axis = axis;
_op = op;
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true));
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true);
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true));
// Set build options
CLBuildOptions build_opts;
DataType data_type = input->info()->data_type();
std::string data_type_promoted{};
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
data_type_promoted = "int";
}
@@ -128,10 +144,14 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN");
build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX");
- build_opts.add_option_if(is_data_type_quantized(data_type), "-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
- build_opts.add_option_if(is_data_type_quantized(data_type), "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
-
- switch(op)
+ build_opts.add_option_if(is_data_type_quantized(data_type),
+ "-DOFFSET=" +
+ support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
+ build_opts.add_option_if(
+ is_data_type_quantized(data_type),
+ "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
+
+ switch (op)
{
case ReductionOperation::SUM_SQUARE:
build_opts.add_option(("-DOPERATION=square_sum"));
@@ -141,7 +161,10 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
build_opts.add_option(("-DOPERATION=sum"));
break;
case ReductionOperation::MIN:
+ build_opts.add_option(("-DOPERATION=min_"));
+ break;
case ReductionOperation::MAX:
+ build_opts.add_option(("-DOPERATION=max_"));
break;
case ReductionOperation::PROD:
build_opts.add_option(("-DOPERATION=product"));
@@ -154,7 +177,7 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
std::string kernel_axis_name;
const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
- switch(axis)
+ switch (axis)
{
case 0:
{
@@ -181,14 +204,19 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
_kernel = create_kernel(compile_context, "reduction_operation_" + kernel_axis_name, build_opts.options());
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(vec_size));
- win.set(Window::DimX, Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step()));
+ TensorShape actual_input_shape = input->info()->tensor_shape();
+ actual_input_shape[0] = width;
+
+ Window win = calculate_max_window(actual_input_shape, Steps(vec_size));
ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLReductionOperationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
return Status{};
@@ -200,18 +228,19 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
- switch(_reduction_axis)
+ switch (_reduction_axis)
{
case 0:
{
// We use parallel reduction only in non quantized types
- if(is_serial_op)
+ if (is_serial_op)
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ Window window_in{window};
+ window_in.set(Window::DimX,
+ Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
- Window out_window{ window };
+ Window out_window{window};
out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
Window in_slice = window_in.first_slice_window_1D();
@@ -223,8 +252,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
add_1D_tensor_argument(idx, _input, in_slice);
add_1D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+ } while (window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
}
else
{
@@ -245,56 +273,92 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
break;
case 1:
{
- // Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
- Window in_slice = window_in.first_slice_window_2D();
- Window out_slice = window.first_slice_window_2D();
+ bool has_collapsed = true;
+ Window actual_window = window.collapse_if_possible(window, 2, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ actual_window = actual_window.shift_dimensions(1, Window::DimY);
+
+ const ITensorInfo *input_info = _input->info();
+ const Strides &input_strides = input_info->strides_in_bytes();
+
+ const ITensorInfo *output_info = _output->info();
+ const Strides &output_strides = output_info->strides_in_bytes();
+
+ unsigned int idx = 0;
+
+ _kernel.setArg(idx++, _input->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+ _kernel.setArg(idx++, _output->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, output_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+ enqueue(queue, *this, actual_window);
}
break;
case 2:
{
- // Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
- Window in_slice = window_in.first_slice_window_3D();
- Window out_slice = window.first_slice_window_3D();
+ bool has_collapsed = true;
+ Window actual_window = window.collapse_if_possible(window, 3, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_3D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ actual_window = actual_window.shift_dimensions(1, Window::DimZ);
+
+ const ITensorInfo *input_info = _input->info();
+ const Strides &input_strides = input_info->strides_in_bytes();
+
+ const ITensorInfo *output_info = _output->info();
+ const Strides &output_strides = output_info->strides_in_bytes();
+
+ unsigned int idx = 0;
+
+ _kernel.setArg(idx++, _input->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[3]);
+ _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+ _kernel.setArg(idx++, _output->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, output_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, output_strides[3]);
+ _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+ enqueue(queue, *this, actual_window);
}
break;
case 3:
{
- // Get first input and output slices
- Window window_in{ window };
- window_in.set(3, Window::Dimension(0, 1, 1));
- Window in_slice = window_in.first_slice_window_4D();
- Window out_slice = window.first_slice_window_4D();
+ bool has_collapsed = true;
+ Window actual_window = window.shift_dimensions(1, Window::DimW);
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, in_slice);
- add_4D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+ actual_window = actual_window.collapse_if_possible(actual_window, 2, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+ const ITensorInfo *input_info = _input->info();
+ const Strides &input_strides = input_info->strides_in_bytes();
+
+ const ITensorInfo *output_info = _output->info();
+ const Strides &output_strides = output_info->strides_in_bytes();
+
+ unsigned int idx = 0;
+
+ _kernel.setArg(idx++, _input->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[3]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[4]);
+ _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+ _kernel.setArg(idx++, _output->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, output_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, output_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, output_strides[4]);
+ _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+ enqueue(queue, *this, actual_window);
}
break;
default:
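For axes 1-3, run() no longer loops over window slices re-binding full tensor descriptors; it collapses the window once and passes the buffers plus only the byte strides and first-element offsets each kernel actually needs, letting the device compute element addresses itself. The addressing this enables looks roughly like the following (illustrative names; the real arithmetic lives in the reduction kernel sources):

    #include <cstddef>
    #include <cstdint>

    // With a base pointer, the offset of the first element and the row
    // stride (all in bytes), each work-item can locate its own data:
    const std::uint8_t *element_address(const std::uint8_t *base,
                                        std::size_t offset_first_element, // bytes to element (0, 0)
                                        std::size_t stride_y,             // bytes between rows
                                        std::size_t x, std::size_t y,
                                        std::size_t element_size)
    {
        return base + offset_first_element + y * stride_y + x * element_size;
    }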
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.h b/src/core/CL/kernels/CLReductionOperationKernel.h
index b456378746..2f94b2add3 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.h
+++ b/src/core/CL/kernels/CLReductionOperationKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -67,7 +68,11 @@ public:
* @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
* @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op);
/** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
*
@@ -79,7 +84,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
deleted file mode 100644
index 6edd744db7..0000000000
--- a/src/core/CL/kernels/CLRemapKernel.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLRemapKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-
-namespace arm_compute
-{
-CLRemapKernel::CLRemapKernel()
- : _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr), _data_layout(DataLayout::NCHW)
-{
-}
-
-BorderSize CLRemapKernel::border_size() const
-{
- return _data_layout == DataLayout::NCHW ? BorderSize(1) : BorderSize(0);
-}
-
-template <class T>
-void CLRemapKernel::set_constant_border(unsigned int idx, const PixelValue &constant_border_value)
-{
- T value;
- constant_border_value.get(value);
- ICLKernel::add_argument<T>(idx, static_cast<T>(value));
-}
-
-Status CLRemapKernel::validate(const ITensorInfo *input, const ITensorInfo *map_x, const ITensorInfo *map_y, ITensorInfo *output, RemapInfo info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, map_x, map_y, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.policy == InterpolationPolicy::AREA, "Area interpolation is not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.border_mode != BorderMode::CONSTANT && info.border_mode != BorderMode::UNDEFINED, "Border mode not supported");
- return Status{};
-}
-
-void CLRemapKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, RemapInfo info)
-{
- CLRemapKernel::validate(input->info(), map_x->info(), map_y->info(), output->info(), info);
-
- _input = input;
- _output = output;
- _map_x = map_x;
- _map_y = map_y;
- _data_layout = input->info()->data_layout();
-
- const bool is_nhwc = _data_layout == DataLayout::NHWC;
- const bool is_constant_border = info.border_mode == BorderMode::CONSTANT;
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(is_constant_border, "-DCONSTANT_BORDER");
-
- const std::string interpolation_name = lower_string(string_from_interpolation_policy(info.policy));
- const std::string kernel_name = "remap_" + interpolation_name + "_" + lower_string(string_from_data_layout(_data_layout));
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- const unsigned int num_elems_processed_per_iteration = is_nhwc ? 1 : 4;
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int input_height = input->info()->dimension(idx_height);
- const int input_width = input->info()->dimension(idx_width);
-
- // Configure window
- Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
-
- // Update padding in NCHW case
- if(_data_layout == DataLayout::NCHW)
- {
- const int total_right = ceil_to_multiple(input_width, num_elems_processed_per_iteration);
- const int access_right = total_right + (((total_right - input_width) == 0) ? border_size().right : 0);
- AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input_height + border_size().bottom);
-
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
- }
-
- ICLKernel::configure_internal(win);
-
- // Set static arguments
- unsigned int idx = 4 * (is_nhwc ? num_arguments_per_4D_tensor() : num_arguments_per_2D_tensor());
- _kernel.setArg<cl_float>(idx++, input_width);
- _kernel.setArg<cl_float>(idx++, input_height);
- if(is_nhwc && is_constant_border)
- {
- set_constant_border<uint8_t>(idx, info.constant_border_value);
- }
-}
-
-void CLRemapKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- {
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _output, slice);
- add_2D_tensor_argument(idx, _map_x, slice);
- add_2D_tensor_argument(idx, _map_y, slice);
- enqueue(queue, *this, slice, lws_hint());
-
- }
- while(window.slide_window_slice_2D(slice));
- break;
- }
- case DataLayout::NHWC:
- {
- Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_4D();
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice);
- add_4D_tensor_argument(idx, _output, slice);
- add_4D_tensor_argument(idx, _map_x, slice);
- add_4D_tensor_argument(idx, _map_y, slice);
- enqueue(queue, *this, slice, lws_hint());
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Invalid Data layout");
- }
-}
-} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLRemapKernel.h b/src/core/CL/kernels/CLRemapKernel.h
deleted file mode 100644
index 1e3a4ad13f..0000000000
--- a/src/core/CL/kernels/CLRemapKernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREMAPKERNEL_H
-#define ARM_COMPUTE_CLREMAPKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a remap on a tensor */
-class CLRemapKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLRemapKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLRemapKernel(const CLRemapKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLRemapKernel &operator=(const CLRemapKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLRemapKernel(CLRemapKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLRemapKernel &operator=(CLRemapKernel &&) = default;
- /** Initialize the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] map_x Map for X coordinates. Data types supported: F32.
- * @param[in] map_y Map for Y coordinates. Data types supported: F32.
- * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
- * @param[in] info RemapInfo struct:
- * - policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
- * - border_mode Border mode to use on the input tensor. Only CONSTANT and UNDEFINED are supported.
- * - constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, RemapInfo info);
- /** Validate the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] map_x Map for X coordinates. Data types supported: F32.
- * @param[in] map_y Map for Y coordinates. Data types supported: F32.
- * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
- * @param[in] info RemapInfo struct:
- * - policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
- * - border_mode Border mode to use on the input tensor. Only CONSTANT and UNDEFINED are supported.
- * - constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *map_x, const ITensorInfo *map_y, ITensorInfo *output, RemapInfo info);
- /** Function to set the constant value on fill border kernel depending on type.
- *
- * @param[in] idx Index of the kernel argument to set.
- * @param[in] constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
- */
- template <class T>
- void set_constant_border(unsigned int idx, const PixelValue &constant_border_value);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- const ICLTensor *_map_x;
- const ICLTensor *_map_y;
- DataLayout _data_layout;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREMAPKERNEL_H */
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp
index c6c7824188..9fd21943e8 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,8 +28,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -50,13 +52,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0,
+ "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0,
+ "The height of the input tensor must be a multiple of stride");
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+ const TensorInfo tensor_info_output =
+ output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -65,9 +70,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLReorgLayerKernel::CLReorgLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLReorgLayerKernel::CLReorgLayerKernel() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t stride)
@@ -75,17 +80,22 @@ void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, in
configure(CLKernelLibrary::get().get_compile_context(), input, output, stride);
}
-void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride)
+void CLReorgLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t stride)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
- std::string kernel_name = std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
- const size_t idx_channel = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ std::string kernel_name =
+ std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
+ const size_t idx_channel =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
// Create kernel
CLBuildOptions build_opts;
@@ -96,7 +106,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons
// Configure window
 // auto initialize the output tensor if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
Window win = calculate_max_window(*output->info(), Steps());
@@ -117,7 +129,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, int32_t stride)
+Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input,
+ const arm_compute::ITensorInfo *output,
+ int32_t stride)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
@@ -137,7 +151,6 @@ void CLReorgLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
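The width/height-multiple-of-stride checks above exist because reorg folds every stride x stride spatial tile into the channel dimension; a fractional tile would have nowhere to go. A sketch of the resulting shape, which compute_reorg_output_shape should agree with up to data-layout handling:

    #include <cassert>

    struct Shape3D { unsigned int w, h, c; };

    // Each stride x stride spatial tile becomes stride^2 extra channels.
    Shape3D reorg_output_shape(Shape3D in, unsigned int stride)
    {
        assert(stride > 0 && in.w % stride == 0 && in.h % stride == 0);
        return {in.w / stride, in.h / stride, in.c * stride * stride};
    }

    // e.g. {26, 26, 64} with stride 2 -> {13, 13, 256}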
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.h b/src/core/CL/kernels/CLReorgLayerKernel.h
index 455a6170c6..f335071e9f 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.h
+++ b/src/core/CL/kernels/CLReorgLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLREORGLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp
index b3c9bcafd1..00241b161b 100644
--- a/src/core/CL/kernels/CLReverseKernel.cpp
+++ b/src/core/CL/kernels/CLReverseKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,9 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -37,17 +40,21 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
{
+ ARM_COMPUTE_UNUSED(use_inverted_axis);
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4,
+ "Current implementation only supports up to 4 dimensions.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -58,20 +65,27 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLReverseKernel::CLReverseKernel()
- : _input(nullptr), _output(nullptr), _axis(nullptr)
+CLReverseKernel::CLReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverseKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis,
+ bool use_inverted_axis)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, axis);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, use_inverted_axis);
}
-void CLReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis,
+ bool use_inverted_axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
- auto padding_info = get_padding_info({ input, output, axis });
+ auto padding_info = get_padding_info({input, output, axis});
_input = input;
_output = output;
@@ -80,12 +94,14 @@ void CLReverseKernel::configure(const CLCompileContext &compile_context, const I
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info(), use_inverted_axis));
// Set kernel build options
CLBuildOptions build_opts;
build_opts.add_option("-DNUM_REVERSE_DIMS=" + support::cpp11::to_string(axis->info()->dimension(0)));
build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
+ build_opts.add_option("-DRANK=" + support::cpp11::to_string(input->info()->num_dimensions()));
+ build_opts.add_option_if(use_inverted_axis, "-DUSE_INVERTED_AXIS");
// Create kernel
_kernel = create_kernel(compile_context, "reverse", build_opts.options());
@@ -113,9 +129,12 @@ void CLReverseKernel::configure(const CLCompileContext &compile_context, const I
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+Status CLReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ bool use_inverted_axis)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis));
return Status{};
}
@@ -135,7 +154,6 @@ void CLReverseKernel::run(const Window &window, cl::CommandQueue &queue)
add_1D_tensor_argument(idx, _axis, axis_slice);
add_4D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_4D(slice));
+ } while (collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLReverseKernel.h b/src/core/CL/kernels/CLReverseKernel.h
index 4a21e4f802..a630aec15a 100644
--- a/src/core/CL/kernels/CLReverseKernel.h
+++ b/src/core/CL/kernels/CLReverseKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLREVERSEKERNEL_H
-#define ARM_COMPUTE_CLREVERSEKERNEL_H
+#ifndef ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H
+#define ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H
#include "src/core/CL/ICLKernel.h"
@@ -48,29 +48,43 @@ public:
~CLReverseKernel() = default;
/** Initialise the kernel's inputs and output
*
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in] use_inverted_axis Reverse ACL axis indices convention, i.e. acl.dim(0) = tensor_rank - 1
+ *
+ * @note The value of each axis should be in the range [-rank, rank)
+ * @note If there are duplicate values in the axis tensor, the subsequent values are ignored, e.g. an axis array of [2, 2] has the same effect as [2].
+ *
+ * @deprecated Support for U32 in axis tensor will be removed in 24.02 release
+ *
*/
- void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis);
/** Initialise the kernel's inputs and output
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in] use_inverted_axis Reverse ACL axis indices convention, i.e. acl.dim(0) = tensor_rank - 1
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis,
+ bool use_inverted_axis);
/** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel
*
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] output Output tensor info. Data type supported: Same as @p input
- * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] input Input tensor info. Data types supported: All.
+ * @param[in] output Output tensor info. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in] use_inverted_axis Reverse ACL axis indices convention, i.e. acl.dim(0) = tensor_rank - 1
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -81,4 +95,4 @@ public:
const ICLTensor *_axis;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREVERSEKERNEL_H */
+#endif // ACL_SRC_CORE_CL_KERNELS_CLREVERSEKERNEL_H
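The new use_inverted_axis flag bridges two indexing conventions: ACL numbers dimensions from the innermost (fastest-changing) one, while NumPy-style frameworks number them from the outermost, and negative values wrap into [-rank, rank). The normalization can be sketched as follows (an illustrative helper, not ACL API):

    #include <cassert>

    // Normalize a user-supplied axis to an ACL dimension index.
    int to_acl_dim(int axis, int rank, bool use_inverted_axis)
    {
        assert(axis >= -rank && axis < rank);
        if (axis < 0)
        {
            axis += rank; // wrap negative indices
        }
        return use_inverted_axis ? rank - 1 - axis // outermost-first convention
                                 : axis;           // ACL's innermost-first convention
    }

    // e.g. rank 4: to_acl_dim(-1, 4, true) == 0, i.e. acl.dim(0) == tensor_rank - 1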
diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp
index f8e63ddc43..703c64d8d3 100644
--- a/src/core/CL/kernels/CLSelectKernel.cpp
+++ b/src/core/CL/kernels/CLSelectKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,10 +29,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -50,9 +51,11 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen
const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
- ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank &&
+ ((c->tensor_shape().num_dimensions() > 1) ||
+ (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
@@ -62,12 +65,16 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen
}
} // namespace
-CLSelectKernel::CLSelectKernel()
- : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+CLSelectKernel::CLSelectKernel() : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLSelectKernel::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+void CLSelectKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(c->info(), x->info(), y->info(), output->info()));
@@ -78,7 +85,7 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
_output = output;
_has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
- auto padding_info = get_padding_info({ c, x, y, output });
+ auto padding_info = get_padding_info({c, x, y, output});
const unsigned int vec_size_x = adjust_vec_size(16 / x->info()->element_size(), x->info()->dimension(0));
const int vec_size_x_leftovers = output->info()->dimension(0) % vec_size_x;
@@ -90,14 +97,14 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
// Create kernel
std::string kernel_name = "select";
- if(_has_same_rank)
+ if (_has_same_rank)
{
kernel_name += "_same_rank";
}
else
{
const bool is_input_rank_greater_than_two = x->info()->tensor_shape().num_dimensions() > 2;
- if(is_input_rank_greater_than_two)
+ if (is_input_rank_greater_than_two)
{
const size_t width = x->info()->tensor_shape().x();
const size_t height = x->info()->tensor_shape().y();
@@ -126,7 +133,8 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+Status
+CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(c, x, y, output));
return Status{};
@@ -140,7 +148,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = collapsed.first_slice_window_3D();
- if(!_has_same_rank)
+ if (!_has_same_rank)
{
Window vector_slice = window.first_slice_window_1D();
vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
@@ -151,7 +159,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
do
{
unsigned int idx = _has_same_rank ? 0 : num_arguments_per_1D_tensor();
- if(_has_same_rank)
+ if (_has_same_rank)
{
add_3D_tensor_argument(idx, _c, slice);
}
@@ -160,7 +168,6 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
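
The configure() hunk above sizes vec_size_x as 16 bytes over the element size, clamps it against the output width with adjust_vec_size() (now included from arm_compute/core/utils/helpers/AdjustVecSize.h), and hands the remainder to the kernel as a leftover count. A minimal stand-alone sketch of that arithmetic, assuming the clamp halves a power-of-two width until it fits; pick_vec_size is a hypothetical stand-in, not the library helper:

#include <cstddef>

// Shrink the vector width until it no longer exceeds the row width, so a
// single work-item never reads past the end of the row.
unsigned int pick_vec_size(unsigned int max_vec, std::size_t dim_x)
{
    unsigned int v = max_vec;
    while (v > dim_x && v > 1)
    {
        v >>= 1; // keep it a power of two: 16 -> 8 -> 4 -> 2 -> 1
    }
    return v;
}

int main()
{
    const std::size_t  width     = 10;                            // output dim 0
    const unsigned int vec_size  = pick_vec_size(16 / 4, width);  // 16 bytes, F32
    const int          leftovers = static_cast<int>(width % vec_size);
    return (vec_size == 4 && leftovers == 2) ? 0 : 1;
}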
diff --git a/src/core/CL/kernels/CLSelectKernel.h b/src/core/CL/kernels/CLSelectKernel.h
index b8c10cd7cf..c4256fd743 100644
--- a/src/core/CL/kernels/CLSelectKernel.h
+++ b/src/core/CL/kernels/CLSelectKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLSELECTKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -60,7 +61,11 @@ public:
* @param[in] y Second input tensor. Data types supported: Same as @p x
* @param[out] output Output tensor. Data types supported: Same as @p x.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel
*
* @param[in] c Condition input tensor. Data types supported: U8.
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index 57f7af488b..f4c0839ad2 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -37,19 +39,22 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *block_info,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2});
ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2});
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
@@ -60,7 +65,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -69,9 +78,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right);
+ TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input, block_shape_x, block_shape_y, padding_left, padding_right);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -84,18 +94,27 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
CLSpaceToBatchLayerKernel::CLSpaceToBatchLayerKernel()
: _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output);
}
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
- auto padding_info = get_padding_info({ input, block_shape, paddings, output });
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+ auto padding_info = get_padding_info({input, block_shape, paddings, output});
_input = input;
_block_shape = block_shape;
@@ -109,14 +128,17 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch)));
- _kernel = create_kernel(compile_context, "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -124,22 +146,34 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left,
+ padding_right, output);
}
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left,
- const Size2D &padding_right,
- ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left,
+ padding_right, output->info()));
_input = input;
_output = output;
@@ -151,7 +185,8 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
@@ -164,22 +199,32 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DPAD_RIGHT_X=" + support::cpp11::to_string(padding_right.x()));
build_opts.add_option("-DPAD_LEFT_Y=" + support::cpp11::to_string(padding_left.y()));
build_opts.add_option("-DPAD_RIGHT_Y=" + support::cpp11::to_string(padding_right.y()));
- _kernel = create_kernel(compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(
+ compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
}
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
return Status{};
}
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -216,7 +261,6 @@ void CLSpaceToBatchLayerKernel::run(const Window &window, cl::CommandQueue &queu
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
} // namespace arm_compute
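
validate_arguments_static() above checks a configured output against compute_space_to_batch_shape(). A hedged sketch of that shape rule, with a plain struct standing in for TensorShape/Size2D: the padded spatial dimensions divide by the block size while the batch multiplies by block_x * block_y.

#include <cassert>

struct Shape { int w, h, c, n; };

Shape space_to_batch_shape(Shape in, int block_x, int block_y,
                           int pad_l_x, int pad_r_x, int pad_l_y, int pad_r_y)
{
    Shape out = in;
    out.w = (in.w + pad_l_x + pad_r_x) / block_x; // spatial dims shrink...
    out.h = (in.h + pad_l_y + pad_r_y) / block_y;
    out.n = in.n * block_x * block_y;             // ...and batch grows to match
    return out;
}

int main()
{
    // A 4x4x8 NCHW-ish tensor with 2x2 blocks and no padding: [2, 2, 8, 4].
    const Shape out = space_to_batch_shape({4, 4, 8, 1}, 2, 2, 0, 0, 0, 0);
    assert(out.w == 2 && out.h == 2 && out.n == 4);
    return 0;
}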
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
index 4817cfeef2..f9dce9db47 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -63,7 +64,11 @@ public:
* @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output);
/** Initialise the kernel's input and output. (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -73,7 +78,12 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
+ void configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Initialise the kernel's input and output. (Static block shape and paddings)
*
* @param[in] compile_context The compile context to be used.
@@ -84,8 +94,13 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -95,7 +110,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -107,7 +125,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
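
Several run()/run_op() hunks in this patch reformat the same enqueue loop: collapse the window, then `do { add tensor arguments; enqueue; } while (slide_window_slice_3D(...))`. A toy stand-in for that control flow; Slice and slide() are hypothetical placeholders for Window and its slide method, not ACL types:

#include <cstdio>

struct Slice { int z; };

// Stand-in for Window::slide_window_slice_3D(): advance to the next Z slice,
// returning false once the window is exhausted.
bool slide(Slice &s, int depth)
{
    return ++s.z < depth;
}

int main()
{
    const int depth = 3;
    Slice slice{0};
    do
    {
        // In the real kernels this is add_3D_tensor_argument() + enqueue().
        std::printf("enqueue slice z=%d\n", slice.z);
    } while (slide(slice, depth));
    return 0;
}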
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
index 4e5b417ec6..25662b5c62 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -63,9 +65,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
@@ -73,10 +75,13 @@ void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
TensorShape output_shape = compute_space_to_depth_shape(input->info(), block_shape);
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
@@ -92,11 +97,14 @@ void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_channel)));
build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -134,7 +142,6 @@ void CLSpaceToDepthLayerKernel::run(const Window &window, cl::CommandQueue &queu
enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
} // namespace arm_compute
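
Both space-to-* kernels, like most files in this patch, feed tensor geometry to the OpenCL compiler through -D defines gathered in CLBuildOptions. A simplified sketch of what add_option()/add_option_if() accumulate before the result reaches program build time; an illustrative class only, the real one carries more members and helpers:

#include <set>
#include <sstream>
#include <string>

class BuildOptions
{
public:
    void add_option(std::string opt) { _opts.insert(std::move(opt)); }
    void add_option_if(bool cond, std::string opt)
    {
        if (cond)
            add_option(std::move(opt));
    }
    std::string str() const
    {
        std::ostringstream ss;
        for (const auto &o : _opts)
            ss << o << ' '; // becomes the clBuildProgram options string
        return ss.str();
    }

private:
    std::set<std::string> _opts; // set semantics de-duplicate repeated options
};

int main()
{
    BuildOptions opts;
    opts.add_option("-DDATA_TYPE=uint");
    opts.add_option_if(true, "-DBLOCK_SHAPE=2");
    return opts.str().empty() ? 1 : 0;
}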
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
index bb1ac5f9a6..d0932919e0 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -61,7 +62,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
index 9bdcc8dc3f..23e26716e7 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.cpp
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,10 +30,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +42,11 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -51,9 +55,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ compute_stack_shape(*input, axis, num_tensors));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
@@ -61,7 +66,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
@@ -73,17 +79,23 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi
}
} // namespace
-CLStackLayerKernel::CLStackLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLStackLayerKernel::CLStackLayerKernel() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
-void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(
+ const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
{
configure(CLKernelLibrary::get().get_compile_context(), input, axis, idx_input, num_tensors, output);
}
-void CLStackLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
@@ -111,10 +123,15 @@ void CLStackLayerKernel::configure(const CLCompileContext &compile_context, cons
_kernel.setArg<cl_uint>(idx, idx_input);
}
-Status CLStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status CLStackLayerKernel::validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
return Status{};
}
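
The output checked by validate_arguments() above comes from compute_stack_shape(): the input shape with a new dimension of size num_tensors inserted at position axis. A hedged sketch with std::vector in place of TensorShape:

#include <cassert>
#include <cstddef>
#include <vector>

std::vector<std::size_t> stack_shape(std::vector<std::size_t> in,
                                     std::size_t axis, std::size_t num_tensors)
{
    assert(axis <= in.size());
    in.insert(in.begin() + axis, num_tensors); // new axis holds the stack index
    return in;
}

int main()
{
    // Stacking 8 tensors of shape [4, 3] along axis 1 yields [4, 8, 3].
    const auto out = stack_shape({4, 3}, 1, 8);
    return (out == std::vector<std::size_t>{4, 8, 3}) ? 0 : 1;
}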
diff --git a/src/core/CL/kernels/CLStackLayerKernel.h b/src/core/CL/kernels/CLStackLayerKernel.h
index 2865127a90..d3c17f529c 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.h
+++ b/src/core/CL/kernels/CLStackLayerKernel.h
@@ -26,6 +26,7 @@
#define ARM_COMPUTE_CLSTACKLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -60,7 +61,8 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input.
*
*/
- void configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+ void configure(
+ const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
/** Initialise the kernel's inputs and output
*
* @note Supported input tensor rank: up to 4
@@ -74,7 +76,12 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input.
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel
*
* @note Supported input tensor rank: up to 4
@@ -88,7 +95,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index dd51df9363..20cd835069 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,10 +22,13 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLStridedSliceKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/bit_ops.h"
@@ -36,9 +39,14 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -47,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
- {
- return i == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; }));
// Get expected output shape
- const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
// Checks output if configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
@@ -70,28 +75,38 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
}
} // namespace
-void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+CLStridedSliceKernel::CLStridedSliceKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void CLStridedSliceKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, output });
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ auto padding_info = get_padding_info({input, output});
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
const TensorShape &input_shape = input->tensor_shape();
Coordinates starts_abs;
Coordinates ends_abs;
Coordinates final_strides;
- std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
- input_shape,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ std::tie(starts_abs, ends_abs, final_strides) =
+ arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
// Configure kernel window
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
Window win = calculate_max_window(*output, Steps());
@@ -102,29 +117,31 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
// Update window if needed
- if(multi_access_x)
+ if (multi_access_x)
{
Window &updated_window = win;
updated_window.set(Window::DimX,
- Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
+ Window::Dimension(updated_window.x().start(),
+ ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
}
ICLKernel::configure_internal(win);
// Create build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
- build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i]));
- build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i]));
+ build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(starts_abs[i]));
+ build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(final_strides[i]));
build_opts.add_option_if(is_shrink, "-DSHRINK_" + support::cpp11::to_string(i));
}
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+ std::max<int>(output_width_x - vec_size_x, 0)));
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if_else(input_shape.num_dimensions() > 2,
- "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()),
- "-DSRC_DEPTH=1");
build_opts.add_option_if_else(output->num_dimensions() > 2,
"-DDST_DEPTH=" + support::cpp11::to_string(output->tensor_shape().z()),
"-DDST_DEPTH=1");
@@ -136,7 +153,7 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
_config_id = "strided_slice";
_config_id += "_";
_config_id += lower_string(string_from_data_type(input->data_type()));
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
_config_id += "_";
_config_id += support::cpp11::to_string(input->dimension(i));
@@ -150,11 +167,17 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSliceKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
return Status{};
}
@@ -164,8 +187,9 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = window_collapsed.first_slice_window_4D();
@@ -176,7 +200,6 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl
add_4D_tensor_argument(idx, src, slice);
add_4D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_4D(slice));
+ } while (window_collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
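
The begin/end/shrink masks threaded through configure() and validate() above are plain bitmasks: bit i toggles special handling of dimension i, which is what the is_bit_set() call from bit_ops.h tests per dimension. A hedged sketch of the begin_mask fallback for positive strides; resolve_starts is illustrative, and the full rule in tensor_transform.h also covers negative strides and clamping:

#include <cassert>
#include <cstdint>
#include <vector>

bool is_bit_set(int32_t mask, unsigned int i)
{
    return (mask >> i) & 1;
}

std::vector<int> resolve_starts(const std::vector<int> &starts,
                                const std::vector<int> &dims, int32_t begin_mask)
{
    std::vector<int> out(dims.size(), 0);
    for (unsigned int i = 0; i < dims.size(); ++i)
    {
        // Masked dims ignore the user value and start at 0 (stride > 0 case).
        out[i] = is_bit_set(begin_mask, i) ? 0 : starts[i];
    }
    return out;
}

int main()
{
    const auto s = resolve_starts({2, 5}, {8, 8}, /*begin_mask=*/0b10);
    assert(s[0] == 2 && s[1] == 0); // bit 1 set -> dim 1 start forced to 0
    return 0;
}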
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h
index 599cf34c39..1cf5bcacec 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.h
+++ b/src/core/CL/kernels/CLStridedSliceKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
#include <cstdint>
@@ -35,6 +36,9 @@ namespace arm_compute
class CLStridedSliceKernel : public ICLKernel
{
public:
+ /** Default constructor */
+ CLStridedSliceKernel();
+
/** Configure kernel
*
* @note Supported tensor rank: up to 4
@@ -50,9 +54,15 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
*
@@ -68,9 +78,14 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index c0c3d2e2ee..fa996c4008 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,8 +22,11 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLTileKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -38,15 +41,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
- {
- return e == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; }));
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -54,9 +55,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLTileKernel::CLTileKernel()
- : _input(nullptr), _output(nullptr)
+CLTileKernel::CLTileKernel() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
@@ -64,7 +65,10 @@ void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Mu
configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
}
-void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+void CLTileKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -78,11 +82,13 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
_input = input;
_output = output;
- const DataType data_type = input->info()->data_type();
- const int vec_size_x = 16 / input->info()->element_size();
- const int input_width_x = input->info()->tensor_shape().x();
- const unsigned int offset = ceil_to_multiple(input_width_x, vec_size_x) - input_width_x;
- const bool multi_access_x = (input_width_x / vec_size_x > 0);
+ const DataType data_type = input->info()->data_type();
+ const int vec_size_x = 16 / input->info()->element_size();
+ const int input_width_x = input->info()->tensor_shape().x();
+ const unsigned int input_width_ceil = ceil_to_multiple(input_width_x, vec_size_x);
+ const unsigned int input_width_tiles = input_width_ceil / vec_size_x;
+ const unsigned int offset = input_width_ceil - input_width_x;
+ const bool multi_access_x = (input_width_x / vec_size_x > 0);
// Create kernel
CLBuildOptions build_opts;
@@ -94,20 +100,20 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(2)));
build_opts.add_option_if(multi_access_x, "-DOFFSET=" + support::cpp11::to_string(offset));
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(multi_access_x, "-DSRC_WIDTH_TILES=" + support::cpp11::to_string(input_width_tiles));
_kernel = create_kernel(compile_context, "tile", build_opts.options());
// Configure window without padding
Window win = calculate_max_window(*output->info());
- if(multi_access_x)
+ if (multi_access_x)
{
// If multi-access is enabled, no thread should cross the tile boundaries. This means we need
// as many threads as those to cover a single tile times multiples[0]. Note that if threads
// do not cross the boundaries of the tiles, they won't cross the boundaries of the last tile, and
// we don't need to pad the output
const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0];
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), size_win_x, vec_size_x));
+ win.set(Window::DimX, Window::Dimension(win.x().start(), size_win_x, vec_size_x));
}
ICLKernel::configure_internal(win);
@@ -116,7 +122,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
_config_id = "tile";
_config_id += "_";
_config_id += lower_string(string_from_data_type(input->info()->data_type()));
- for(unsigned int i = 0; i < multiples.size(); ++i)
+ for (unsigned int i = 0; i < multiples.size(); ++i)
{
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(i));
@@ -145,7 +151,6 @@ void CLTileKernel::run(const Window &window, cl::CommandQueue &queue)
add_4D_tensor_argument(idx, _input, slice);
add_4D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_4D(slice));
+ } while (collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
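
The renamed locals in the tile configure() hunk make the window arithmetic explicit: the row width is rounded up to the vector width with ceil_to_multiple(), and the difference is exported as -DOFFSET so the final vector access can shift back inside the row rather than requiring output padding. A small worked sketch of those numbers:

#include <cassert>

unsigned int ceil_to_multiple(unsigned int value, unsigned int divisor)
{
    return ((value + divisor - 1) / divisor) * divisor;
}

int main()
{
    const unsigned int vec_size_x  = 16 / 4; // 16 bytes per access, F32 input
    const unsigned int input_width = 10;
    const unsigned int width_ceil  = ceil_to_multiple(input_width, vec_size_x);
    const unsigned int offset      = width_ceil - input_width;
    assert(width_ceil == 12 && offset == 2); // last access re-reads 2 elements
    return 0;
}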
diff --git a/src/core/CL/kernels/CLTileKernel.h b/src/core/CL/kernels/CLTileKernel.h
index 41752ca90b..c3486aecef 100644
--- a/src/core/CL/kernels/CLTileKernel.h
+++ b/src/core/CL/kernels/CLTileKernel.h
@@ -64,7 +64,10 @@ public:
* @param[out] output Destination tensor. Same as @p input
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples);
/** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel
*
* @param[in] input Source tensor info. Data type supported: All.
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
deleted file mode 100644
index d55c548b99..0000000000
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1);
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0);
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3]));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
- }
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-CLWeightsReshapeKernel::CLWeightsReshapeKernel()
- : _input(nullptr), _biases(nullptr), _output(nullptr)
-{
-}
-
-void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, biases, output, num_groups);
-}
-
-void CLWeightsReshapeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_weights_reshaped_shape(*input->info(), (biases != nullptr), num_groups)));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
- (biases != nullptr) ? biases->info() : nullptr,
- output->info(), num_groups));
-
- auto padding_info = get_padding_info({ input, biases, output });
-
- const DataType data_type = input->info()->data_type();
-
- _biases = biases;
- _output = output;
- _input = input;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(data_type)));
- build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
- build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS");
-
- // Create kernel
- _kernel = create_kernel(compile_context, "reshape_to_columns", build_opts.options());
-
- // Configure window
- Window win = calculate_max_window(*input->info(), Steps());
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status CLWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, biases, output, num_groups));
- return Status{};
-}
-
-void CLWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info()->tensor_shape());
-
- Window in_slice = window.first_slice_window_3D();
- Window out_slice = out_window.first_slice_window_2D();
-
- Window biases_window;
- Window biases_slice;
-
- unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
- idx += (_biases != nullptr) ? num_arguments_per_1D_tensor() : 0;
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(2));
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(3));
- _kernel.setArg<cl_uint>(idx++, _output->info()->strides_in_bytes().z());
-
- if(_biases != nullptr)
- {
- biases_window.use_tensor_dimensions(_biases->info()->tensor_shape());
- biases_slice = biases_window.first_slice_window_1D();
- }
-
- do
- {
- // Set arguments
- unsigned idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- if(_biases != nullptr)
- {
- add_1D_tensor_argument(idx, _biases, biases_slice);
- ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice));
- }
-
- // Run kernel
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
-}
-} // namespace arm_compute
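
For reference, the kernel deleted here implemented the classic im2col companion step: linearize each [kernel_x, kernel_y, IFM] filter into one row so that convolution reduces to a matrix multiply. A hedged host-side sketch of that rearrangement; the element ordering and the reshape_weights helper are illustrative assumptions, not the removed OpenCL kernel's exact layout:

#include <cassert>
#include <cstddef>
#include <vector>

// Weights assumed flattened with OFM innermost; return a matrix with `ofm`
// rows of length kx * ky * ifm, one row per output filter.
std::vector<std::vector<float>> reshape_weights(const std::vector<float> &w,
                                                std::size_t kx, std::size_t ky,
                                                std::size_t ifm, std::size_t ofm)
{
    std::vector<std::vector<float>> rows(ofm);
    for (std::size_t o = 0; o < ofm; ++o)
    {
        rows[o].reserve(kx * ky * ifm);
        for (std::size_t c = 0; c < ifm; ++c)
            for (std::size_t y = 0; y < ky; ++y)
                for (std::size_t x = 0; x < kx; ++x)
                    rows[o].push_back(w[((c * ky + y) * kx + x) * ofm + o]);
    }
    return rows;
}

int main()
{
    const std::vector<float> w(3 * 3 * 2 * 4, 1.0f); // 3x3 kernel, 2 IFM, 4 OFM
    const auto rows = reshape_weights(w, 3, 3, 2, 4);
    assert(rows.size() == 4 && rows[0].size() == 18); // matches the 18-wide row
    return 0;
}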
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.h b/src/core/CL/kernels/CLWeightsReshapeKernel.h
deleted file mode 100644
index 402a60472b..0000000000
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H
-#define ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-/** OpenCL kernel to perform reshaping on the weights used by convolution and locally connected layer
- *
- * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
- * In combination with the @ref CLIm2ColKernel can transform a convolution to a matrix multiplication.
- *
- * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
- * @f[
- * \left( \begin{array}{ccc}
- * a000 & a001 & a002 \\
- * a010 & a011 & a012 \\
- * a020 & a021 & a022 \\
- * \end{array} \right)
- * \left( \begin{array}{ccc}
- * a100 & a101 & a102 \\
- * a110 & a111 & a112 \\
- * a120 & a121 & a122 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
- * \end{array} \right)
- * @f]
- */
-class CLWeightsReshapeKernel : public ICLKernel
-{
-public:
- /** Constructor.*/
- CLWeightsReshapeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWeightsReshapeKernel(const CLWeightsReshapeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWeightsReshapeKernel &operator=(const CLWeightsReshapeKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWeightsReshapeKernel(CLWeightsReshapeKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWeightsReshapeKernel &operator=(CLWeightsReshapeKernel &&) = default;
- /** Default destructor */
- ~CLWeightsReshapeKernel() = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All
- * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[out] output The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
- * Data types supported: Same as @p input
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- * Number of groups greater than one is only supported for NCHW data layout, and the number of weights must be a multiple of it.
- */
- void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
- /** Set the input and output of the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All
- * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[out] output The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
- * Data types supported: Same as @p input
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- * Number of groups greater than one is only supported for NCHW data layout, and the number of weights must be a multiple of it.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWeightsReshapeKernel
- *
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All
- * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[in] output The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
- * Data types supported: Same as @p input
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- * Number of groups greater than one is only supported for NCHW data layout, and the number of weights must be a multiple of it.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_biases;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H */
\ No newline at end of file
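
The class documentation above describes the reshape as linearizing each 3D kernel into a single row. A minimal standalone sketch of that rearrangement on a plain float array — the helper name and the flat x-fastest layout are assumptions for illustration, not the library's API:

#include <cstddef>
#include <vector>

// Flatten OFM kernels of shape [kx, ky, ifm] (stored x-fastest) into a matrix
// with one linearized kernel per row, mirroring the 3x3-depth-2 example above.
std::vector<std::vector<float>> reshape_weights_to_rows(const std::vector<float> &w,
                                                        size_t kx, size_t ky, size_t ifm, size_t ofm)
{
    std::vector<std::vector<float>> rows(ofm, std::vector<float>(kx * ky * ifm));
    for (size_t o = 0; o < ofm; ++o)
    {
        for (size_t c = 0; c < ifm; ++c)
        {
            for (size_t y = 0; y < ky; ++y)
            {
                for (size_t x = 0; x < kx; ++x)
                {
                    // Source offset within the 4D weights, destination offset within the row
                    const size_t src = x + kx * (y + ky * (c + ifm * o));
                    rows[o][x + kx * (y + ky * c)] = w[src];
                }
            }
        }
    }
    return rows;
}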
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
index edcb9cb1ba..ef0518ed3d 100644
--- a/src/core/CPP/CPPTypes.cpp
+++ b/src/core/CPP/CPPTypes.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,10 @@
#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/Error.h"
+
#include "src/common/cpuinfo/CpuInfo.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
namespace arm_compute
{
@@ -36,8 +39,13 @@ struct CPUInfo::Impl
unsigned int L2_cache_size = 262144;
};
-CPUInfo::CPUInfo()
- : _impl(std::make_unique<Impl>())
+CPUInfo &CPUInfo::get()
+{
+ static CPUInfo _cpuinfo;
+ return _cpuinfo;
+}
+
+CPUInfo::CPUInfo() : _impl(std::make_unique<Impl>())
{
_impl->info = cpuinfo::CpuInfo::build();
}
@@ -49,11 +57,6 @@ unsigned int CPUInfo::get_cpu_num() const
return _impl->info.num_cpus();
}
-bool CPUInfo::has_sve() const
-{
- return _impl->info.has_sve();
-}
-
bool CPUInfo::has_fp16() const
{
return _impl->info.has_fp16();
@@ -64,11 +67,51 @@ bool CPUInfo::has_bf16() const
return _impl->info.has_bf16();
}
+bool CPUInfo::has_svebf16() const
+{
+ return _impl->info.has_svebf16();
+}
+
bool CPUInfo::has_dotprod() const
{
return _impl->info.has_dotprod();
}
+bool CPUInfo::has_svef32mm() const
+{
+ return _impl->info.has_svef32mm();
+}
+
+bool CPUInfo::has_i8mm() const
+{
+ return _impl->info.has_i8mm();
+}
+
+bool CPUInfo::has_svei8mm() const
+{
+ return _impl->info.has_svei8mm();
+}
+
+bool CPUInfo::has_sve() const
+{
+ return _impl->info.has_sve();
+}
+
+bool CPUInfo::has_sve2() const
+{
+ return _impl->info.has_sve2();
+}
+
+bool CPUInfo::has_sme() const
+{
+ return _impl->info.has_sme();
+}
+
+bool CPUInfo::has_sme2() const
+{
+ return _impl->info.has_sme2();
+}
+
CPUModel CPUInfo::get_cpu_model() const
{
return _impl->info.cpu_model();
@@ -79,6 +122,11 @@ CPUModel CPUInfo::get_cpu_model(unsigned int cpuid) const
return _impl->info.cpu_model(cpuid);
}
+cpuinfo::CpuIsaInfo CPUInfo::get_isa() const
+{
+ return _impl->info.isa();
+}
+
unsigned int CPUInfo::get_L1_cache_size() const
{
return _impl->L1_cache_size;
@@ -88,4 +136,29 @@ unsigned int CPUInfo::get_L2_cache_size() const
{
return _impl->L2_cache_size;
}
+
+unsigned long CPUInfo::get_sme2_vector_length() const
+{
+#ifdef ARM_COMPUTE_ENABLE_SME2
+ return arm_gemm::utils::sme::get_vector_length<int8_t>();
+#else // ARM_COMPUTE_ENABLE_SME2
+ return 0;
+#endif // ARM_COMPUTE_ENABLE_SME2
+}
+bool CPUInfo::cpu_has_little_mid_big() const
+{
+#if defined(__ANDROID__)
+ return _impl->info.has_little_mid_big();
+#else /* defined(__ANDROID__) */
+ return false;
+#endif /* defined(__ANDROID__) */
+}
+unsigned int CPUInfo::get_cpu_num_excluding_little() const
+{
+#if defined(__ANDROID__)
+ return _impl->info.not_little_num_cpus();
+#else /* defined(__ANDROID__) */
+ return get_cpu_num();
+#endif /* defined(__ANDROID__) */
+}
} // namespace arm_compute
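
The new CPUInfo::get() accessor added above is a Meyers singleton: a function-local static whose initialization happens exactly once and, since C++11, thread-safely. A minimal sketch of the pattern using a hypothetical Config class rather than the library's type:

class Config
{
public:
    // Function-local static: constructed on first use, thread-safe in C++11.
    static Config &get()
    {
        static Config instance;
        return instance;
    }
    Config(const Config &)            = delete;
    Config &operator=(const Config &) = delete;

    bool feature_enabled() const { return _enabled; }

private:
    Config() : _enabled(true) {}
    bool _enabled;
};

// Callers query it the same way the library now does with
// CPUInfo::get().has_sve2() and friends:
//   bool on = Config::get().feature_enabled();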
diff --git a/src/core/CPP/ICPPSimpleKernel.cpp b/src/core/CPP/ICPPSimpleKernel.cpp
deleted file mode 100644
index 9e4df5ec8a..0000000000
--- a/src/core/CPP/ICPPSimpleKernel.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CPP/ICPPSimpleKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace
-{
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int num_elems_processed_per_iteration,
- bool border_undefined, const arm_compute::BorderSize &border_size)
-{
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration), border_undefined, border_size);
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->valid_region(), border_undefined, border_size);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-ICPPSimpleKernel::ICPPSimpleKernel()
- : _input{ nullptr }, _output{ nullptr }
-{
-}
-
-void ICPPSimpleKernel::configure(const ITensor *input, ITensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size)
-{
- _input = input;
- _output = output;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration, border_undefined, border_size);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICPPKernel::configure(win_config.second);
-}
-
-Status ICPPSimpleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_elems_processed_per_iteration,
- bool border_undefined, const arm_compute::BorderSize &border_size)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration, border_undefined, border_size).first);
- return Status{};
-}
-
-} // namespace arm_compute
diff --git a/src/core/CPP/Validate.h b/src/core/CPP/Validate.h
index 9e95f72c3f..fe253508cf 100644
--- a/src/core/CPP/Validate.h
+++ b/src/core/CPP/Validate.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#ifndef ARM_COMPUTE_CPP_VALIDATE_H
#define ARM_COMPUTE_CPP_VALIDATE_H
+#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/Validate.h"
namespace arm_compute
@@ -37,15 +38,19 @@ namespace arm_compute
*
* @return Status
*/
-inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info)
+inline Status
+error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info)
{
+ bool fp16_kernels_enabled = false;
+#if defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS)
+ fp16_kernels_enabled = true;
+#endif /* defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS) */
+
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_info->data_type() == DataType::F16,
- function, file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- return Status {};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+ (tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled), function,
+ file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above");
+ return Status{};
}
/** Return an error if the data type of the passed tensor info is BFLOAT16 and BFLOAT16 support is not compiled in.
@@ -57,15 +62,19 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi
*
* @return Status
*/
-inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info)
+inline Status
+error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info)
{
+ bool bf16_kernels_enabled = false;
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+ bf16_kernels_enabled = true;
+#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
+
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
-#if !(defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16))
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_info->data_type() == DataType::BFLOAT16,
- function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above");
-#endif /* !(defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)) */
- return Status {};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+ (tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled),
+ function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above");
+ return Status{};
}
/** Return an error if the data type of the passed tensor is FP16 and FP16 support is not compiled in.
@@ -77,8 +86,8 @@ inline Status error_on_unsupported_cpu_bf16(const char *function, const char *fi
*
* @return Status
*/
-inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
- const ITensor *tensor)
+inline Status
+error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensor *tensor)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(function, file, line, tensor->info()));
@@ -94,8 +103,8 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi
*
* @return Status
*/
-inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
- const ITensor *tensor)
+inline Status
+error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensor *tensor)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(function, file, line, tensor->info()));
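
The rewritten checks above replace a compile-time-only guard with a combined gate: a data type is accepted only when the corresponding kernels were compiled in and the running CPU reports the feature. A sketch restating that condition for FP16 behind a helper name of our own (cpu_fp16_usable is not a library function):

#include "arm_compute/core/CPP/CPPTypes.h"

// F16 tensors are usable only if FP16 kernels were built AND the CPU has FP16.
inline bool cpu_fp16_usable()
{
    bool fp16_kernels_enabled = false;
#if defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS)
    fp16_kernels_enabled = true;
#endif
    return fp16_kernels_enabled && arm_compute::CPUInfo::get().has_fp16();
}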
diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
index fb1754247c..02686eb4f6 100644
--- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
+++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
#include "arm_compute/core/Helpers.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include <algorithm>
@@ -34,7 +35,11 @@ namespace arm_compute
namespace
{
template <typename T>
-std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &scores_in, std::vector<int> inds, const BoxNMSLimitInfo &info, int class_id)
+std::vector<int> SoftNMS(const ITensor *proposals,
+ std::vector<std::vector<T>> &scores_in,
+ std::vector<int> inds,
+ const BoxNMSLimitInfo &info,
+ int class_id)
{
std::vector<int> keep;
const int proposals_width = proposals->info()->dimension(1);
@@ -45,7 +50,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
std::vector<T> y2(proposals_width);
std::vector<T> areas(proposals_width);
- for(int i = 0; i < proposals_width; ++i)
+ for (int i = 0; i < proposals_width; ++i)
{
x1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
y1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
@@ -56,13 +61,13 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
// Note: Soft NMS scores have already been initialized with input scores
- while(!inds.empty())
+ while (!inds.empty())
{
// Find proposal with max score among remaining proposals
int max_pos = 0;
- for(unsigned int i = 1; i < inds.size(); ++i)
+ for (unsigned int i = 1; i < inds.size(); ++i)
{
- if(scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
+ if (scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
{
max_pos = i;
}
@@ -75,7 +80,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
inds.erase(inds.begin());
std::vector<int> sorted_indices_temp;
- for(auto idx : inds)
+ for (auto idx : inds)
{
const auto xx1 = std::max(x1[idx], x1[element]);
const auto yy1 = std::max(y1[idx], y1[element]);
@@ -89,7 +94,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
// Update scores based on computed IoU, overlap threshold and NMS method
T weight;
- switch(info.soft_nms_method())
+ switch (info.soft_nms_method())
{
case NMSType::LINEAR:
weight = (ovr > info.nms()) ? (1.f - ovr) : 1.f;
@@ -106,7 +111,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
// Discard boxes with new scores below min threshold and update pending indices
scores_in[class_id][idx] *= weight;
- if(scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
+ if (scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
{
sorted_indices_temp.push_back(idx);
}
@@ -118,7 +123,10 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
}
template <typename T>
-std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> sorted_indices, const BoxNMSLimitInfo &info, int class_id)
+std::vector<int> NonMaximaSuppression(const ITensor *proposals,
+ std::vector<int> sorted_indices,
+ const BoxNMSLimitInfo &info,
+ int class_id)
{
std::vector<int> keep;
@@ -130,7 +138,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
std::vector<T> y2(proposals_width);
std::vector<T> areas(proposals_width);
- for(int i = 0; i < proposals_width; ++i)
+ for (int i = 0; i < proposals_width; ++i)
{
x1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
y1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
@@ -139,7 +147,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0);
}
- while(!sorted_indices.empty())
+ while (!sorted_indices.empty())
{
int i = sorted_indices.at(0);
keep.push_back(i);
@@ -148,7 +156,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
std::vector<int> new_indices;
sorted_indices_temp.erase(sorted_indices_temp.begin());
- for(unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
+ for (unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
{
const float xx1 = std::max(x1[sorted_indices_temp.at(j)], x1[i]);
const float yy1 = std::max(y1[sorted_indices_temp.at(j)], y1[i]);
@@ -163,8 +171,9 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
const float ctr_y = yy1 + (h / 2);
// If suppress_size is specified, filter the boxes based on their size and position
- const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && ctr_x < info.im_width() && ctr_y < info.im_height());
- if(ovr <= info.nms() && keep_size)
+ const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() &&
+ ctr_x < info.im_width() && ctr_y < info.im_height());
+ if (ovr <= info.nms() && keep_size)
{
new_indices.push_back(j);
}
@@ -172,7 +181,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
const unsigned int new_indices_size = new_indices.size();
std::vector<int> new_sorted_indices(new_indices_size);
- for(unsigned int i = 0; i < new_indices_size; ++i)
+ for (unsigned int i = 0; i < new_indices_size; ++i)
{
new_sorted_indices[i] = sorted_indices[new_indices[i] + 1];
}
@@ -184,7 +193,15 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
} // namespace
CPPBoxWithNonMaximaSuppressionLimitKernel::CPPBoxWithNonMaximaSuppressionLimitKernel()
- : _scores_in(nullptr), _boxes_in(nullptr), _batch_splits_in(nullptr), _scores_out(nullptr), _boxes_out(nullptr), _classes(nullptr), _batch_splits_out(nullptr), _keeps(nullptr), _keeps_size(nullptr),
+ : _scores_in(nullptr),
+ _boxes_in(nullptr),
+ _batch_splits_in(nullptr),
+ _scores_out(nullptr),
+ _boxes_out(nullptr),
+ _classes(nullptr),
+ _batch_splits_out(nullptr),
+ _keeps(nullptr),
+ _keeps_size(nullptr),
_info()
{
}
@@ -197,7 +214,7 @@ bool CPPBoxWithNonMaximaSuppressionLimitKernel::is_parallelisable() const
template <typename T>
void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
{
- const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
+ const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
const int num_classes = _scores_in->info()->dimension(0);
const int scores_count = _scores_in->info()->dimension(1);
std::vector<int> total_keep_per_batch(batch_size);
@@ -205,53 +222,48 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
int total_keep_count = 0;
std::vector<std::vector<T>> in_scores(num_classes, std::vector<T>(scores_count));
- for(int i = 0; i < scores_count; ++i)
+ for (int i = 0; i < scores_count; ++i)
{
- for(int j = 0; j < num_classes; ++j)
+ for (int j = 0; j < num_classes; ++j)
{
in_scores[j][i] = *reinterpret_cast<const T *>(_scores_in->ptr_to_element(Coordinates(j, i)));
}
}
- int offset = 0;
int cur_start_idx = 0;
- for(int b = 0; b < batch_size; ++b)
+ for (int b = 0; b < batch_size; ++b)
{
- const int num_boxes = _batch_splits_in == nullptr ? 1 : static_cast<int>(*reinterpret_cast<T *>(_batch_splits_in->ptr_to_element(Coordinates(b))));
// Skip the first (background) class when there is more than one class.
const int j_start = (num_classes == 1 ? 0 : 1);
- for(int j = j_start; j < num_classes; ++j)
+ for (int j = j_start; j < num_classes; ++j)
{
std::vector<T> cur_scores(scores_count);
std::vector<int> inds;
- for(int i = 0; i < scores_count; ++i)
+ for (int i = 0; i < scores_count; ++i)
{
const T score = in_scores[j][i];
cur_scores[i] = score;
- if(score > _info.score_thresh())
+ if (score > _info.score_thresh())
{
inds.push_back(i);
}
}
- if(_info.soft_nms_enabled())
+ if (_info.soft_nms_enabled())
{
keeps[j] = SoftNMS(_boxes_in, in_scores, inds, _info, j);
}
else
{
std::sort(inds.data(), inds.data() + inds.size(),
- [&cur_scores](int lhs, int rhs)
- {
- return cur_scores[lhs] > cur_scores[rhs];
- });
+ [&cur_scores](int lhs, int rhs) { return cur_scores[lhs] > cur_scores[rhs]; });
keeps[j] = NonMaximaSuppression<T>(_boxes_in, inds, _info, j);
}
total_keep_count += keeps[j].size();
}
- if(_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
+ if (_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
{
// merge all scores (represented by indices) together and sort
auto get_all_scores_sorted = [&in_scores, &keeps, total_keep_count]()
@@ -259,10 +271,10 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
std::vector<T> ret(total_keep_count);
int ret_idx = 0;
- for(unsigned int i = 1; i < keeps.size(); ++i)
+ for (unsigned int i = 1; i < keeps.size(); ++i)
{
auto &cur_keep = keeps[i];
- for(auto &ckv : cur_keep)
+ for (auto &ckv : cur_keep)
{
ret[ret_idx++] = in_scores[i][ckv];
}
@@ -275,13 +287,13 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
auto all_scores_sorted = get_all_scores_sorted();
const T image_thresh = all_scores_sorted[all_scores_sorted.size() - _info.detections_per_im()];
- for(int j = 1; j < num_classes; ++j)
+ for (int j = 1; j < num_classes; ++j)
{
auto &cur_keep = keeps[j];
std::vector<int> new_keeps_j;
- for(auto &k : cur_keep)
+ for (auto &k : cur_keep)
{
- if(in_scores[j][k] >= image_thresh)
+ if (in_scores[j][k] >= image_thresh)
{
new_keeps_j.push_back(k);
}
@@ -295,59 +307,78 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
// Write results
int cur_out_idx = 0;
- for(int j = j_start; j < num_classes; ++j)
+ for (int j = j_start; j < num_classes; ++j)
{
- auto &cur_keep = keeps[j];
- auto cur_out_scores = reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
- auto cur_out_classes = reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
- const int box_column = (cur_start_idx + cur_out_idx) * 4;
-
- for(unsigned int k = 0; k < cur_keep.size(); ++k)
+ auto &cur_keep = keeps[j];
+ auto cur_out_scores =
+ reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+ auto cur_out_classes =
+ reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+ const int box_column = (cur_start_idx + cur_out_idx) * 4;
+
+ for (unsigned int k = 0; k < cur_keep.size(); ++k)
{
- cur_out_scores[k] = in_scores[j][cur_keep[k]];
- cur_out_classes[k] = static_cast<T>(j);
- auto cur_out_box_row0 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
- auto cur_out_box_row1 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
- auto cur_out_box_row2 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
- auto cur_out_box_row3 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
- *cur_out_box_row0 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
- *cur_out_box_row1 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
- *cur_out_box_row2 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
- *cur_out_box_row3 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
+ cur_out_scores[k] = in_scores[j][cur_keep[k]];
+ cur_out_classes[k] = static_cast<T>(j);
+ auto cur_out_box_row0 =
+ reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
+ auto cur_out_box_row1 =
+ reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
+ auto cur_out_box_row2 =
+ reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
+ auto cur_out_box_row3 =
+ reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
+ *cur_out_box_row0 =
+ *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
+ *cur_out_box_row1 =
+ *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
+ *cur_out_box_row2 =
+ *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
+ *cur_out_box_row3 =
+ *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
}
cur_out_idx += cur_keep.size();
}
- if(_keeps != nullptr)
+ if (_keeps != nullptr)
{
cur_out_idx = 0;
- for(int j = 0; j < num_classes; ++j)
+ for (int j = 0; j < num_classes; ++j)
{
- for(unsigned int i = 0; i < keeps[j].size(); ++i)
+ for (unsigned int i = 0; i < keeps[j].size(); ++i)
{
- *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = static_cast<T>(keeps[j].at(i));
+ *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) =
+ static_cast<T>(keeps[j].at(i));
}
- *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = keeps[j].size();
+ *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) =
+ keeps[j].size();
cur_out_idx += keeps[j].size();
}
}
- offset += num_boxes;
cur_start_idx += total_keep_count;
}
- if(_batch_splits_out != nullptr)
+ if (_batch_splits_out != nullptr)
{
- for(int b = 0; b < batch_size; ++b)
+ for (int b = 0; b < batch_size; ++b)
{
*reinterpret_cast<float *>(_batch_splits_out->ptr_to_element(Coordinates(b))) = total_keep_per_batch[b];
}
}
}
-void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
- ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in,
+ const ITensor *boxes_in,
+ const ITensor *batch_splits_in,
+ ITensor *scores_out,
+ ITensor *boxes_out,
+ ITensor *classes,
+ ITensor *batch_splits_out,
+ ITensor *keeps,
+ ITensor *keeps_size,
+ const BoxNMSLimitInfo info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::F16, DataType::F32);
@@ -355,25 +386,28 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_
const unsigned int num_classes = scores_in->info()->dimension(0);
ARM_COMPUTE_UNUSED(num_classes);
- ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), "First dimension of input boxes must be of size 4*num_classes");
- ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), "Input scores and input boxes must have the same number of rows");
+ ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0),
+ "First dimension of input boxes must be of size 4*num_classes");
+ ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1),
+ "Input scores and input boxes must have the same number of rows");
ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != boxes_out->info()->dimension(1));
ARM_COMPUTE_ERROR_ON(boxes_out->info()->dimension(0) != 4);
ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != classes->info()->dimension(0));
- if(keeps != nullptr)
+ if (keeps != nullptr)
{
- ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, "keeps_size cannot be nullptr if keeps has to be provided as output");
+ ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr,
+ "keeps_size cannot be nullptr if keeps has to be provided as output");
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, keeps);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keeps_size, 1, DataType::U32);
ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != keeps->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(num_classes != keeps_size->info()->dimension(0));
}
- if(batch_splits_in != nullptr)
+ if (batch_splits_in != nullptr)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_in);
}
- if(batch_splits_out != nullptr)
+ if (batch_splits_out != nullptr)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_out);
}
@@ -402,7 +436,7 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run(const Window &window, const
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
- switch(_scores_in->info()->data_type())
+ switch (_scores_in->info()->data_type())
{
case DataType::F32:
run_nmslimit<float>();
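
For context on the LINEAR branch visible in the SoftNMS hunk above: Soft-NMS (Bodla et al., 2017) decays the scores of overlapping boxes instead of discarding them outright. A standalone sketch of the two common weighting schemes; the sigma parameter name is illustrative, not the library's accessor:

#include <cmath>

// Down-weight a candidate box given its IoU with the current top box.
// Linear matches the branch shown in the hunk; Gaussian is shown for context.
float soft_nms_weight(float iou, float iou_threshold, bool linear, float sigma = 0.5f)
{
    if (linear)
    {
        // Only boxes above the IoU threshold are attenuated
        return (iou > iou_threshold) ? (1.0f - iou) : 1.0f;
    }
    return std::exp(-(iou * iou) / sigma); // smooth Gaussian decay
}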
diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
index c1187ff2b3..1224ec14a7 100644
--- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
@@ -35,15 +35,22 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, unsigned int max_output_size,
- const float score_threshold, const float iou_threshold)
+Status validate_arguments(const ITensorInfo *bboxes,
+ const ITensorInfo *scores,
+ const ITensorInfo *output_indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float iou_threshold)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, output_indices);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_indices, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1, "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2,
+ "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1,
+ "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1,
+ "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->dimension(0) == 0, "Indices tensor must be bigger than 0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0");
@@ -55,15 +62,26 @@ Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores,
} // namespace
CPPNonMaximumSuppressionKernel::CPPNonMaximumSuppressionKernel()
- : _input_bboxes(nullptr), _input_scores(nullptr), _output_indices(nullptr), _max_output_size(0), _score_threshold(0.f), _iou_threshold(0.f), _num_boxes(0)
+ : _input_bboxes(nullptr),
+ _input_scores(nullptr),
+ _output_indices(nullptr),
+ _max_output_size(0),
+ _score_threshold(0.f),
+ _iou_threshold(0.f),
+ _num_boxes(0)
{
}
-void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices,
- unsigned int max_output_size, const float score_threshold, const float iou_threshold)
+void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes,
+ const ITensor *input_scores,
+ ITensor *output_indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float iou_threshold)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_bboxes, input_scores, output_indices);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), max_output_size, score_threshold, iou_threshold));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(),
+ max_output_size, score_threshold, iou_threshold));
auto_init_if_empty(*output_indices->info(), TensorShape(max_output_size), 1, DataType::U8, QuantizationInfo());
@@ -82,10 +100,15 @@ void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, cons
ICPPKernel::configure(win);
}
-Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices,
- unsigned int max_output_size, const float score_threshold, const float iou_threshold)
+Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes,
+ const ITensorInfo *scores,
+ const ITensorInfo *output_indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float iou_threshold)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold));
return Status{};
}
@@ -99,10 +122,10 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
// Auxiliary tensors
std::vector<int> indices_above_thd;
std::vector<float> scores_above_thd;
- for(unsigned int i = 0; i < _num_boxes; ++i)
+ for (unsigned int i = 0; i < _num_boxes; ++i)
{
const float score_i = *(reinterpret_cast<float *>(_input_scores->ptr_to_element(Coordinates(i))));
- if(score_i >= _score_threshold)
+ if (score_i >= _score_threshold)
{
scores_above_thd.emplace_back(score_i);
indices_above_thd.emplace_back(i);
@@ -114,12 +137,9 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
std::vector<unsigned int> sorted_indices;
sorted_indices.resize(num_above_thd);
std::iota(sorted_indices.data(), sorted_indices.data() + num_above_thd, 0);
- std::sort(std::begin(sorted_indices),
- std::end(sorted_indices),
+ std::sort(std::begin(sorted_indices), std::end(sorted_indices),
[&](unsigned int first, unsigned int second)
- {
- return scores_above_thd[first] > scores_above_thd[second];
- });
+ { return scores_above_thd[first] > scores_above_thd[second]; });
// The number of outputs is the minimum between max_output_size and the number of scores above the threshold
const unsigned int num_output = std::min(_max_output_size, num_above_thd);
@@ -127,19 +147,20 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
std::vector<bool> visited(num_above_thd, false);
// Keep only boxes with small IoU
- for(unsigned int i = 0; i < num_above_thd; ++i)
+ for (unsigned int i = 0; i < num_above_thd; ++i)
{
// Check if the output is full
- if(output_idx >= num_output)
+ if (output_idx >= num_output)
{
break;
}
// If it has not been visited yet, add it to the output and advance the output index
- if(!visited[sorted_indices[i]])
+ if (!visited[sorted_indices[i]])
{
- *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = indices_above_thd[sorted_indices[i]];
- visited[sorted_indices[i]] = true;
+ *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) =
+ indices_above_thd[sorted_indices[i]];
+ visited[sorted_indices[i]] = true;
++output_idx;
}
else
@@ -148,28 +169,36 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
}
// Once an element has been added to the output, check whether later ones overlap with it and can be skipped
- for(unsigned int j = i + 1; j < num_above_thd; ++j)
+ for (unsigned int j = i + 1; j < num_above_thd; ++j)
{
- if(!visited[sorted_indices[j]])
+ if (!visited[sorted_indices[j]])
{
// Calculate IoU
const unsigned int i_index = indices_above_thd[sorted_indices[i]];
const unsigned int j_index = indices_above_thd[sorted_indices[j]];
// Box-corner format: xmin, ymin, xmax, ymax
- const auto box_i_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index))));
- const auto box_i_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index))));
- const auto box_i_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, i_index))));
- const auto box_i_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, i_index))));
-
- const auto box_j_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, j_index))));
- const auto box_j_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, j_index))));
- const auto box_j_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, j_index))));
- const auto box_j_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, j_index))));
+ const auto box_i_xmin =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index))));
+ const auto box_i_ymin =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index))));
+ const auto box_i_xmax =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, i_index))));
+ const auto box_i_ymax =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, i_index))));
+
+ const auto box_j_xmin =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, j_index))));
+ const auto box_j_ymin =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, j_index))));
+ const auto box_j_xmax =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, j_index))));
+ const auto box_j_ymax =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, j_index))));
const float area_i = (box_i_xmax - box_i_xmin) * (box_i_ymax - box_i_ymin);
const float area_j = (box_j_xmax - box_j_xmin) * (box_j_ymax - box_j_ymin);
float overlap;
- if(area_i <= 0 || area_j <= 0)
+ if (area_i <= 0 || area_j <= 0)
{
overlap = 0.0f;
}
@@ -179,11 +208,12 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
const auto x_min_intersection = std::max<float>(box_i_xmin, box_j_xmin);
const auto y_max_intersection = std::min<float>(box_i_ymax, box_j_ymax);
const auto x_max_intersection = std::min<float>(box_i_xmax, box_j_xmax);
- const auto area_intersection = std::max<float>(y_max_intersection - y_min_intersection, 0.0f) * std::max<float>(x_max_intersection - x_min_intersection, 0.0f);
- overlap = area_intersection / (area_i + area_j - area_intersection);
+ const auto area_intersection = std::max<float>(y_max_intersection - y_min_intersection, 0.0f) *
+ std::max<float>(x_max_intersection - x_min_intersection, 0.0f);
+ overlap = area_intersection / (area_i + area_j - area_intersection);
}
- if(overlap > _iou_threshold)
+ if (overlap > _iou_threshold)
{
visited[sorted_indices[j]] = true;
}
@@ -192,7 +222,7 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
}
// The output may fill up before the output indices tensor does;
// pad the remaining (invalid) entries with -1
- for(; output_idx < _max_output_size; ++output_idx)
+ for (; output_idx < _max_output_size; ++output_idx)
{
*(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = -1;
}
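
The reflowed IoU computation in this kernel follows the usual corner-format formula, including the zero-area guard visible in the hunk. A self-contained sketch with a hypothetical Box struct:

#include <algorithm>

struct Box { float xmin, ymin, xmax, ymax; }; // box-corner format, as in the kernel

// Intersection-over-union with the same degenerate-box guard the kernel applies.
float iou(const Box &a, const Box &b)
{
    const float area_a = (a.xmax - a.xmin) * (a.ymax - a.ymin);
    const float area_b = (b.xmax - b.xmin) * (b.ymax - b.ymin);
    if (area_a <= 0.f || area_b <= 0.f)
    {
        return 0.f; // zero or negative area: treat as no overlap
    }
    const float iw    = std::max(std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin), 0.f);
    const float ih    = std::max(std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin), 0.f);
    const float inter = iw * ih;
    return inter / (area_a + area_b - inter);
}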
diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp
index 054c7bf05a..e68090d82b 100644
--- a/src/core/CPP/kernels/CPPPermuteKernel.cpp
+++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
// Validate configured output
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -65,7 +66,7 @@ void CPPPermuteKernel::run_permute(const Window &window)
// Create output window
Window window_out(window);
const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
- for(size_t d = 0; d <= _perm.num_dimensions(); ++d)
+ for (size_t d = 0; d <= _perm.num_dimensions(); ++d)
{
window_out.set(d, zero_window);
}
@@ -74,28 +75,32 @@ void CPPPermuteKernel::run_permute(const Window &window)
Iterator in(_input, window);
Iterator out(_output, window_out);
- if(_input->info()->num_dimensions() <= 3)
+ if (_input->info()->num_dimensions() <= 3)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2];
- *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2];
+ *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+ },
+ in, out);
}
- else if(_input->info()->num_dimensions() >= 4)
+ else if (_input->info()->num_dimensions() >= 4)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_strides[3];
- *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] +
+ id[3] * perm_strides[3];
+ *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+ },
+ in, out);
}
}
-CPPPermuteKernel::CPPPermuteKernel()
- : _func(), _input(nullptr), _output(nullptr), _perm()
+CPPPermuteKernel::CPPPermuteKernel() : _func(), _input(nullptr), _output(nullptr), _perm()
{
}
@@ -113,7 +118,7 @@ void CPPPermuteKernel::configure(const ITensor *input, ITensor *output, const Pe
_output = output;
_perm = perm;
- switch(input->info()->element_size())
+ switch (input->info()->element_size())
{
case 1:
_func = &CPPPermuteKernel::run_permute<uint8_t>;
@@ -152,7 +157,7 @@ void CPPPermuteKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- if(_func != nullptr)
+ if (_func != nullptr)
{
(this->*_func)(window);
}
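
The reformatted lambdas above compute the destination offset as a dot product of the source coordinate with permuted strides. A sketch of that indexing in isolation (element strides are assumed here for simplicity; the kernel itself works with byte strides via out.ptr() + idx):

#include <array>
#include <cstddef>

// Destination offset for one element: each source coordinate id[d] is scaled
// by the stride of the dimension it maps to under the permutation.
size_t permuted_offset(const std::array<size_t, 4> &id,
                       const std::array<size_t, 4> &perm_strides)
{
    return id[0] * perm_strides[0] + id[1] * perm_strides[1] +
           id[2] * perm_strides[2] + id[3] * perm_strides[3];
}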
diff --git a/src/core/CPP/kernels/CPPTopKVKernel.cpp b/src/core/CPP/kernels/CPPTopKVKernel.cpp
index d2b54e412e..6ffb68e770 100644
--- a/src/core/CPP/kernels/CPPTopKVKernel.cpp
+++ b/src/core/CPP/kernels/CPPTopKVKernel.cpp
@@ -34,32 +34,34 @@ namespace arm_compute
{
namespace
{
-template <typename T,
- typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0>
+template <typename T, typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0>
inline bool greater_than(T a, T b)
{
const T epsilon = std::numeric_limits<T>::epsilon();
return (a - b > epsilon);
}
-template < typename T,
- typename std::enable_if < !utils::traits::is_floating_point<T>::value, int >::type = 0 >
+template <typename T, typename std::enable_if<!utils::traits::is_floating_point<T>::value, int>::type = 0>
inline bool greater_than(T a, T b)
{
return (a > b);
}
-Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status validate_arguments(const ITensorInfo *predictions,
+ const ITensorInfo *targets,
+ ITensorInfo *output,
+ const unsigned int k)
{
ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(targets, 1, DataType::U32);
ARM_COMPUTE_RETURN_ERROR_ON(predictions->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(targets->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(targets->dimension(0) != predictions->dimension(1));
// Validate configured output
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), targets->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
@@ -72,22 +74,23 @@ Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *tar
template <typename T>
void CPPTopKVKernel::run_topkv()
{
- for(unsigned int i = 0; i < _batch_size; ++i)
+ for (unsigned int i = 0; i < _batch_size; ++i)
{
- const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{ i }));
- const auto predicted_value = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ target_class_id, i }));
+ const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{i}));
+ const auto predicted_value =
+ *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{target_class_id, i}));
// The variable rank indicates how many values there are before the target_class_id
unsigned int rank = 0;
- for(unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j)
+ for (unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j)
{
- const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ j, i }));
- if(greater_than(current_prediction, predicted_value))
+ const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{j, i}));
+ if (greater_than(current_prediction, predicted_value))
{
rank++;
}
}
- *(_output->ptr_to_element(Coordinates{ i })) = static_cast<uint8_t>(rank < _k);
+ *(_output->ptr_to_element(Coordinates{i})) = static_cast<uint8_t>(rank < _k);
}
}
@@ -96,7 +99,10 @@ CPPTopKVKernel::CPPTopKVKernel()
{
}
-void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k)
+void CPPTopKVKernel::configure(const ITensor *predictions,
+ const ITensor *targets,
+ ITensor *output,
+ const unsigned int k)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(predictions, targets, output);
@@ -115,7 +121,10 @@ void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *target
ICPPKernel::configure(Window()); // Default 1 iteration window
}
-Status CPPTopKVKernel::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status CPPTopKVKernel::validate(const ITensorInfo *predictions,
+ const ITensorInfo *targets,
+ ITensorInfo *output,
+ const unsigned int k)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(predictions, targets, output, k));
return Status{};
@@ -129,7 +138,7 @@ bool CPPTopKVKernel::is_parallelisable() const
void CPPTopKVKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(window, info);
- switch(_predictions->info()->data_type())
+ switch (_predictions->info()->data_type())
{
case DataType::F32:
run_topkv<float>();
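
run_topkv() asks whether the target class would survive a top-k cut: it counts predictions strictly greater than the target's score and succeeds while that rank stays below k. A sketch for floating-point types, mirroring the epsilon-based greater_than() overload above (function name assumed for illustration):

#include <limits>

// True if fewer than k predictions strictly exceed the target's score.
template <typename T>
bool in_top_k(const T *predictions, unsigned int num_classes, unsigned int target, unsigned int k)
{
    const T pivot   = predictions[target];
    const T epsilon = std::numeric_limits<T>::epsilon();
    unsigned int rank = 0;
    for (unsigned int j = 0; j < num_classes && rank < k; ++j)
    {
        if (predictions[j] - pivot > epsilon) // epsilon comparison, as in greater_than()
        {
            ++rank;
        }
    }
    return rank < k;
}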
diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
index 7ef83fb2c4..b1efe32446 100644
--- a/src/core/CPP/kernels/CPPUpsampleKernel.cpp
+++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
#include "arm_compute/core/Helpers.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include <cstddef>
@@ -31,8 +32,7 @@
namespace arm_compute
{
-CPPUpsampleKernel::CPPUpsampleKernel()
- : _input(nullptr), _output(nullptr), _info()
+CPPUpsampleKernel::CPPUpsampleKernel() : _input(nullptr), _output(nullptr), _info()
{
}
@@ -82,7 +82,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
const size_t element_size = _input->info()->element_size();
// The fill value is normally 0, but for quantized types '0' corresponds to the offset
- switch(_output->info()->data_type())
+ switch (_output->info()->data_type())
{
case DataType::QASYMM8:
{
@@ -102,7 +102,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
// Create window
Window window_out(window);
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
window_out.set(Window::DimX, Window::Dimension(start_width, end_width, stride_width));
window_out.set(Window::DimY, Window::Dimension(start_height, end_height, stride_height));
@@ -117,10 +117,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
Iterator in(_input, window);
Iterator out(_output, window_out);
- execute_window_loop(window, [&](const Coordinates &)
- {
- memcpy(out.ptr(), in.ptr(), element_size);
- },
- in, out);
+ execute_window_loop(
+ window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
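
Regarding the "fill value" comment in the upsample hunk above: for asymmetric quantized tensors, the real value 0.0f encodes to the zero-point offset, not to the raw value 0, which is why the kernel fills with the offset for QASYMM8. A sketch that makes this concrete (function name assumed for illustration):

#include <algorithm>
#include <cmath>
#include <cstdint>

// QASYMM8 quantization: q = round(value / scale) + offset, clamped to [0, 255].
uint8_t quantize_qasymm8(float value, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(value / scale)) + offset;
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255));
}

// quantize_qasymm8(0.0f, scale, offset) == offset for any in-range offset,
// so "zero" padding must write the offset, not 0.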
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index 5c8d45c987..679a93f9af 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp
@@ -36,9 +36,10 @@ Status arm_compute::create_error(ErrorCode error_code, std::string msg)
return Status(error_code, msg);
}
-Status arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg)
+Status
+arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg)
{
- std::array<char, 512> out{ 0 };
+ std::array<char, 512> out{0};
snprintf(out.data(), out.size(), "in %s %s:%d: %s", func, file, line, msg);
return Status(error_code, std::string(out.data()));
}
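
The reflowed create_error_msg() formats the error location into a fixed 512-byte buffer; snprintf truncates rather than overflows on long messages. A sketch of the same pattern as a free function:

#include <array>
#include <cstdio>
#include <string>

// Format "in func file:line: msg" into a bounded buffer, then copy to a string.
std::string format_error(const char *func, const char *file, int line, const char *msg)
{
    std::array<char, 512> out{0};
    std::snprintf(out.data(), out.size(), "in %s %s:%d: %s", func, file, line, msg);
    return std::string(out.data());
}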
diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp
index 14264cb883..5904e1a06f 100644
--- a/src/core/GPUTarget.cpp
+++ b/src/core/GPUTarget.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/GPUTarget.h"
+
#include "arm_compute/core/Log.h"
#include <map>
@@ -29,60 +30,113 @@
namespace
{
+
+arm_compute::GPUTarget get_fifth_gen_target(const std::string &version)
+{
+ if (version.find("G720") != std::string::npos)
+ {
+ return arm_compute::GPUTarget::G720;
+ }
+ else if (version.find("G620") != std::string::npos)
+ {
+ return arm_compute::GPUTarget::G620;
+ }
+ else
+ {
+ return arm_compute::GPUTarget::UNKNOWN;
+ }
+}
+
arm_compute::GPUTarget get_valhall_target(const std::string &version)
{
- if(version.find("G77") != std::string::npos)
+ if (version.find("G77") != std::string::npos)
{
return arm_compute::GPUTarget::G77;
}
- if(version.find("G78") != std::string::npos)
+ else if (version.find("G57") != std::string::npos)
+ {
+ return arm_compute::GPUTarget::G57;
+ }
+ if (version.find("G68") != std::string::npos)
+ {
+ return arm_compute::GPUTarget::G68;
+ }
+ if (version.find("G78AE") != std::string::npos)
+ {
+ return arm_compute::GPUTarget::G78AE;
+ }
+ if (version.find("G78") != std::string::npos)
{
return arm_compute::GPUTarget::G78;
}
- else if(version.find("TODX") != std::string::npos)
+ else if (version.find("G710") != std::string::npos)
{
- return arm_compute::GPUTarget::TODX;
+ return arm_compute::GPUTarget::G710;
+ }
+ else if (version.find("G610") != std::string::npos)
+ {
+ return arm_compute::GPUTarget::G610;
+ }
+ else if (version.find("G510") != std::string::npos)
+ {
+ return arm_compute::GPUTarget::G510;
+ }
+ else if (version.find("G310") != std::string::npos)
+ {
+ return arm_compute::GPUTarget::G310;
+ }
+ else if (version.find("G715") != std::string::npos)
+ {
+ return arm_compute::GPUTarget::G715;
+ }
+ else if (version.find("G615") != std::string::npos)
+ {
+ return arm_compute::GPUTarget::G615;
}
else
{
- return arm_compute::GPUTarget::VALHALL;
+ return arm_compute::GPUTarget::UNKNOWN;
}
}
arm_compute::GPUTarget get_bifrost_target(const std::string &version)
{
- if(version.find("G71") != std::string::npos)
+ if (version.find("G71") != std::string::npos)
{
return arm_compute::GPUTarget::G71;
}
- else if(version.find("G72") != std::string::npos)
+ else if (version.find("G72") != std::string::npos)
{
return arm_compute::GPUTarget::G72;
}
- else if(version.find("G51BIG") != std::string::npos)
+ else if (version.find("G51BIG") != std::string::npos)
{
return arm_compute::GPUTarget::G51BIG;
}
- else if(version.find("G51LIT") != std::string::npos)
+ else if (version.find("G51LIT") != std::string::npos)
{
return arm_compute::GPUTarget::G51LIT;
}
- else if(version.find("G51") != std::string::npos)
+ else if (version.find("G51") != std::string::npos)
{
return arm_compute::GPUTarget::G51;
}
- else if(version.find("G52LIT") != std::string::npos)
+ else if (version.find("G52LIT") != std::string::npos)
{
return arm_compute::GPUTarget::G52LIT;
}
- else if(version.find("G52") != std::string::npos)
+ else if (version.find("G52") != std::string::npos)
{
return arm_compute::GPUTarget::G52;
}
- else if(version.find("G76") != std::string::npos)
+ else if (version.find("G76") != std::string::npos)
{
return arm_compute::GPUTarget::G76;
}
+ else if (version.find("G31") != std::string::npos)
+ {
+ return arm_compute::GPUTarget::G31;
+ }
else
{
return arm_compute::GPUTarget::UNKNOWN;
@@ -91,15 +145,15 @@ arm_compute::GPUTarget get_bifrost_target(const std::string &version)
arm_compute::GPUTarget get_midgard_target(const std::string &version)
{
- if(version.find("T600") != std::string::npos)
+ if (version.find("T600") != std::string::npos)
{
return arm_compute::GPUTarget::T600;
}
- else if(version.find("T700") != std::string::npos)
+ else if (version.find("T700") != std::string::npos)
{
return arm_compute::GPUTarget::T700;
}
- else if(version.find("T800") != std::string::npos)
+ else if (version.find("T800") != std::string::npos)
{
return arm_compute::GPUTarget::T800;
}
@@ -114,26 +168,19 @@ namespace arm_compute
{
const std::string &string_from_target(GPUTarget target)
{
- static std::map<GPUTarget, const std::string> gpu_target_map =
- {
- { GPUTarget::MIDGARD, "midgard" },
- { GPUTarget::BIFROST, "bifrost" },
- { GPUTarget::VALHALL, "valhall" },
- { GPUTarget::T600, "t600" },
- { GPUTarget::T700, "t700" },
- { GPUTarget::T800, "t800" },
- { GPUTarget::G71, "g71" },
- { GPUTarget::G72, "g72" },
- { GPUTarget::G51, "g51" },
- { GPUTarget::G51BIG, "g51big" },
- { GPUTarget::G51LIT, "g51lit" },
- { GPUTarget::G52, "g52" },
- { GPUTarget::G52LIT, "g52lit" },
- { GPUTarget::G76, "g76" },
- { GPUTarget::G77, "g77" },
- { GPUTarget::G78, "g78" },
- { GPUTarget::TODX, "todx" }
- };
+ static std::map<GPUTarget, const std::string> gpu_target_map = {
+ {GPUTarget::MIDGARD, "midgard"}, {GPUTarget::BIFROST, "bifrost"}, {GPUTarget::VALHALL, "valhall"},
+ {GPUTarget::FIFTHGEN, "fifthgen"},
+
+ {GPUTarget::T600, "t600"}, {GPUTarget::T700, "t700"}, {GPUTarget::T800, "t800"},
+ {GPUTarget::G71, "g71"}, {GPUTarget::G72, "g72"}, {GPUTarget::G51, "g51"},
+ {GPUTarget::G51BIG, "g51big"}, {GPUTarget::G51LIT, "g51lit"}, {GPUTarget::G31, "g31"},
+ {GPUTarget::G76, "g76"}, {GPUTarget::G52, "g52"}, {GPUTarget::G52LIT, "g52lit"},
+ {GPUTarget::G77, "g77"}, {GPUTarget::G57, "g57"}, {GPUTarget::G78, "g78"},
+ {GPUTarget::G68, "g68"}, {GPUTarget::G78AE, "g78ae"}, {GPUTarget::G710, "g710"},
+ {GPUTarget::G610, "g610"}, {GPUTarget::G510, "g510"}, {GPUTarget::G310, "g310"},
+ {GPUTarget::G715, "g715"}, {GPUTarget::G615, "g615"}, {GPUTarget::G720, "g720"},
+ {GPUTarget::G620, "g620"}};
return gpu_target_map[target];
}
@@ -144,7 +191,7 @@ GPUTarget get_target_from_name(const std::string &device_name)
std::smatch name_parts;
const bool found_mali = std::regex_search(device_name, name_parts, mali_regex);
- if(!found_mali)
+ if (!found_mali)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Arm® Mali™ GPU. Target is set to default.");
return GPUTarget::MIDGARD;
@@ -158,16 +205,27 @@ GPUTarget get_target_from_name(const std::string &device_name)
// Work-out gpu target
GPUTarget gpu_target;
- if(target == 'G' || is_future_gpu)
+ if (target == 'G' || is_future_gpu)
{
- // Check for Bifrost or Valhall
- gpu_target = get_bifrost_target(version);
- if(gpu_target == GPUTarget::UNKNOWN)
+        // Check for 5th Gen, Valhall or Bifrost, in that order
+ gpu_target = get_fifth_gen_target(version);
+ if (gpu_target == GPUTarget::UNKNOWN)
{
gpu_target = get_valhall_target(version);
}
+
+ if (gpu_target == GPUTarget::UNKNOWN)
+ {
+ gpu_target = get_bifrost_target(version);
+ }
+
+ // Default GPUTarget
+ if (gpu_target == GPUTarget::UNKNOWN)
+ {
+ gpu_target = GPUTarget::VALHALL;
+ }
}
- else if(target == 'T')
+ else if (target == 'T')
{
gpu_target = get_midgard_target(version);
}
@@ -177,7 +235,7 @@ GPUTarget get_target_from_name(const std::string &device_name)
}
// Report in case of unknown target
- if(gpu_target == GPUTarget::UNKNOWN)
+ if (gpu_target == GPUTarget::UNKNOWN)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Arm® Mali™ Mali GPU unknown. Target is set to the default one. (BIFROST)");
return GPUTarget::BIFROST;
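
The device-name lookup now walks the product families from newest to oldest: 5th Gen first, then Valhall, then Bifrost, with unrecognised 'G' parts defaulting to VALHALL ('T' parts still resolve through the Midgard table, and anything else falls back to BIFROST as logged above). A sketch of that cascade with stand-in matchers; the real per-family tables live in this file:

    #include <iostream>
    #include <string>

    // Illustrative stand-ins for get_fifth_gen_target()/get_valhall_target()/get_bifrost_target().
    std::string try_fifth_gen(const std::string &v) { return v.find("G720") != std::string::npos ? "G720" : ""; }
    std::string try_valhall(const std::string &v)   { return v.find("G77")  != std::string::npos ? "G77"  : ""; }
    std::string try_bifrost(const std::string &v)   { return v.find("G76")  != std::string::npos ? "G76"  : ""; }

    std::string resolve_g_part(const std::string &version)
    {
        std::string t = try_fifth_gen(version); // newest family first
        if (t.empty())
            t = try_valhall(version);
        if (t.empty())
            t = try_bifrost(version);
        if (t.empty())
            t = "VALHALL (default)"; // unknown 'G' parts now default to Valhall
        return t;
    }

    int main()
    {
        std::cout << resolve_g_part("Mali-G720") << "\n"; // G720
        std::cout << resolve_g_part("Mali-G99") << "\n";  // VALHALL (default)
    }
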
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index e692cc1e7c..c801b097b5 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,8 +25,11 @@
namespace arm_compute
{
-ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape,
- InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined)
+ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info,
+ const TensorShape &dst_shape,
+ InterpolationPolicy interpolate_policy,
+ SamplingPolicy sampling_policy,
+ bool border_undefined)
{
const DataLayout data_layout = src_info.data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -49,9 +52,9 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens
auto valid_end_out_y = std::min<int>(std::ceil(valid_end_in_y * scale_y), dst_shape[idx_height]);
    // Handle valid points in the case of bi-linear interpolation
- if(border_undefined)
+ if (border_undefined)
{
- switch(interpolate_policy)
+ switch (interpolate_policy)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
@@ -90,7 +93,7 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens
}
// Setup output valid region
- ValidRegion valid_region{ Coordinates(), dst_shape, dst_shape.num_dimensions() };
+ ValidRegion valid_region{Coordinates(), dst_shape, dst_shape.num_dimensions()};
valid_region.anchor.set(idx_width, std::max(0, valid_start_out_x));
valid_region.anchor.set(idx_height, std::max(0, valid_start_out_y));
@@ -100,4 +103,21 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens
return valid_region;
}
-} // namespace arm_compute \ No newline at end of file
+
+const std::map<DataLayout, std::vector<DataLayoutDimension>> &get_layout_map()
+{
+ constexpr DataLayoutDimension W = DataLayoutDimension::WIDTH;
+ constexpr DataLayoutDimension H = DataLayoutDimension::HEIGHT;
+ constexpr DataLayoutDimension C = DataLayoutDimension::CHANNEL;
+ constexpr DataLayoutDimension D = DataLayoutDimension::DEPTH;
+ constexpr DataLayoutDimension N = DataLayoutDimension::BATCHES;
+
+ static const std::map<DataLayout, std::vector<DataLayoutDimension>> layout_map = {
+ {DataLayout::NDHWC, {C, W, H, D, N}},
+ {DataLayout::NCDHW, {W, H, D, C, N}},
+ {DataLayout::NHWC, {C, W, H, N}},
+ {DataLayout::NCHW, {W, H, C, N}}};
+
+ return layout_map;
+}
+} // namespace arm_compute
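
The new get_layout_map() encodes, for each layout, the dimension order from the fastest-moving index upwards: in NHWC the channel dimension sits at index 0, in NCHW the width does. A simplified sketch of how such a map is queried (local types, not the library's own):

    #include <cstddef>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    int main()
    {
        enum class Dim { W, H, C, D, N };
        // Same convention as get_layout_map(): index 0 is the innermost dimension.
        const std::map<std::string, std::vector<Dim>> layout_map = {
            {"NHWC", {Dim::C, Dim::W, Dim::H, Dim::N}},
            {"NCHW", {Dim::W, Dim::H, Dim::C, Dim::N}},
        };

        // Locate the CHANNEL dimension within NHWC: expected at index 0.
        const auto &dims = layout_map.at("NHWC");
        for (std::size_t i = 0; i < dims.size(); ++i)
        {
            if (dims[i] == Dim::C)
                std::cout << "CHANNEL index in NHWC: " << i << "\n"; // 0
        }
    }
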
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
index 832801255f..923c5f8a85 100644
--- a/src/core/IAccessWindow.cpp
+++ b/src/core/IAccessWindow.cpp
@@ -29,14 +29,18 @@
using namespace arm_compute;
-ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, const ValidRegion &input_valid_region) const
+ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window,
+ const ValidRegion &input_valid_region) const
{
return compute_valid_region(window, input_valid_region, false, BorderSize(0));
}
-ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return input_valid_region;
}
@@ -45,7 +49,7 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va
Coordinates old_anchor(anchor);
TensorShape &shape = input_valid_region.shape;
- if(!border_undefined)
+ if (!border_undefined)
{
border_size = BorderSize(0);
}
@@ -56,7 +60,7 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va
// Additionally the valid region is shifted by the offset that is used by
// the kernel to write back output values.
anchor.set(0, std::max<int>(window.x().start() * _scale_x, anchor[0] + border_size.left) + _x);
- if(_info->num_dimensions() > 1)
+ if (_info->num_dimensions() > 1)
{
anchor.set(1, std::max<int>(window.y().start() * _scale_y, anchor[1] + border_size.top) + _y);
}
@@ -69,15 +73,19 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va
// old size is first converted into end points to compared against the
// execution window. Afterwards the new end points are converted back into
// a size of the region.
- shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right, (window.x().end() - window.x().step()) * _scale_x + _width) - anchor[0]);
- if(_info->num_dimensions() > 1)
+ shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right,
+ (window.x().end() - window.x().step()) * _scale_x + _width) -
+ anchor[0]);
+ if (_info->num_dimensions() > 1)
{
- shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom, (window.y().end() - window.y().step()) * _scale_y + _height) - anchor[1]);
+ shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom,
+ (window.y().end() - window.y().step()) * _scale_y + _height) -
+ anchor[1]);
}
// For higher dimensions use the intersection of the window size and the
// valid region of the input
- for(size_t d = 2; d < _info->num_dimensions(); ++d)
+ for (size_t d = 2; d < _info->num_dimensions(); ++d)
{
anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
@@ -86,9 +94,12 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va
return input_valid_region;
}
-void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined, const BorderSize &border_size)
+void AccessWindowRectangle::set_valid_region(const Window &window,
+ const ValidRegion &input_valid_region,
+ bool border_undefined,
+ const BorderSize &border_size)
{
- if(_info != nullptr)
+ if (_info != nullptr)
{
_info->set_valid_region(compute_valid_region(window, input_valid_region, border_undefined, border_size));
}
@@ -97,17 +108,16 @@ void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRe
bool AccessWindowRectangle::update_window_if_needed(Window &window) const
{
// Only update the window size if we can't use padding
- if(_info == nullptr || _info->is_resizable())
+ if (_info == nullptr || _info->is_resizable())
{
return false;
}
- PaddingSize needed = get_needed_padding(window);
+ PaddingSize needed = get_needed_padding(window);
PaddingSize available = _info->padding();
- if(needed.top <= available.top && needed.right <= available.right
- && needed.bottom <= available.bottom
- && needed.left <= available.left)
+ if (needed.top <= available.top && needed.right <= available.right && needed.bottom <= available.bottom &&
+ needed.left <= available.left)
{
return false;
}
@@ -124,12 +134,12 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
const int max_y = (window.y().end() - window.y().step()) * _scale_y + _y + _height;
// Adjust window start for Y dimension
- if(min_y < 0)
+ if (min_y < 0)
{
// Calculate rows available above the tensor
const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
- if(min_y < front_pad_y_available)
+ if (min_y < front_pad_y_available)
{
// Not enough padding available, need to shrink the window
int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y;
@@ -144,18 +154,19 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
}
// Adjust window end for Y dimension
- if(max_y > static_cast<int>(shape[1]))
+ if (max_y > static_cast<int>(shape[1]))
{
const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
// Calculate rows available below the tensor
const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
- if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
+ if (static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
{
// Not enough padding available, need to shrink the window
- int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height;
- end = std::max<int>(window.y().start(), end / _scale_y);
+ int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) +
+ window.y().step() * _scale_y - _y - _height;
+ end = std::max<int>(window.y().start(), end / _scale_y);
window.set(1, Window::Dimension(window.y().start(), end, window.y().step()));
window_modified = true;
@@ -170,11 +181,14 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
// Adjust window start for X dimension
- if(min_x < 0)
+ if (min_x < 0)
{
- const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+ const int front_pad_x_available =
+ -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1],
+ stride_y - shape[0] * strides[0]) /
+ static_cast<int>(strides[0]);
- if(min_x < front_pad_x_available)
+ if (min_x < front_pad_x_available)
{
// Not enough padding available, need to shrink the window
int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x;
@@ -189,15 +203,16 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
}
// Adjust window end for X dimension
- if(max_x > static_cast<int>(shape[0]))
+ if (max_x > static_cast<int>(shape[0]))
{
const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
- if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
+ if (static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
{
// Not enough padding available, need to shrink the window
- int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width;
- end = std::max<int>(window.x().start(), end / _scale_x);
+ int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) +
+ window.x().step() * _scale_x - _x - _width;
+ end = std::max<int>(window.x().start(), end / _scale_x);
window.set(0, Window::Dimension(window.x().start(), end, window.x().step()));
window_modified = true;
@@ -212,15 +227,15 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
bool AccessWindowRectangle::update_padding_if_needed(const Window &window)
{
// Only update the padding if the tensor allows it
- if(_info == nullptr || !_info->is_resizable())
+ if (_info == nullptr || !_info->is_resizable())
{
return false;
}
// Update strides in tensor info
- return _info->extend_padding( get_needed_padding(window));
+ return _info->extend_padding(get_needed_padding(window));
}
-PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window)const
+PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window) const
{
ARM_COMPUTE_ERROR_ON(_scale_x == 0);
ARM_COMPUTE_ERROR_ON(_scale_y == 0);
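
update_window_if_needed() above leaves the window alone only when the padding the access pattern needs fits inside the tensor's existing border on all four sides; otherwise it starts shrinking the window dimension by dimension. A toy version of that all-sides check, using a simple stand-in for PaddingSize:

    #include <iostream>

    struct Padding { int top, right, bottom, left; };

    // Mirrors the early-return above: true when the available border covers
    // the needed one on every side, so the window can stay as it is.
    bool padding_sufficient(const Padding &needed, const Padding &available)
    {
        return needed.top <= available.top && needed.right <= available.right &&
               needed.bottom <= available.bottom && needed.left <= available.left;
    }

    int main()
    {
        const Padding needed{1, 2, 1, 2};
        const Padding available{1, 1, 1, 2};
        // false: the right side is one element short, so the window must shrink
        std::cout << std::boolalpha << padding_sufficient(needed, available) << "\n";
    }
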
diff --git a/src/core/IKernel.cpp b/src/core/IKernel.cpp
index 31f1ec7a3f..fb7e095091 100644
--- a/src/core/IKernel.cpp
+++ b/src/core/IKernel.cpp
@@ -30,8 +30,7 @@ const Window &IKernel::window() const
return _window;
}
-IKernel::IKernel()
- : _window()
+IKernel::IKernel() : _window()
{
// Create an empty window to make sure the children classes set the window values themselves
_window.set(Window::DimX, Window::Dimension(0, 0, 1));
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index e263596333..4dc8ea959b 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
#include <cstring>
@@ -34,7 +35,7 @@ namespace arm_compute
{
void ITensor::copy_from(const ITensor &src)
{
- if(&src == this)
+ if (&src == this)
{
return;
}
@@ -46,7 +47,7 @@ void ITensor::copy_from(const ITensor &src)
ARM_COMPUTE_ERROR_ON(src_info->num_channels() != dst_info->num_channels());
ARM_COMPUTE_ERROR_ON(src_info->element_size() != dst_info->element_size());
- for(size_t d = 0; d < src_info->num_dimensions(); d++)
+ for (size_t d = 0; d < src_info->num_dimensions(); d++)
{
ARM_COMPUTE_ERROR_ON(src_info->dimension(d) > dst_info->dimension(d));
}
@@ -65,11 +66,7 @@ void ITensor::copy_from(const ITensor &src)
const size_t line_size = src_info->element_size() * src_info->dimension(0);
execute_window_loop(
- win_src, [&](const Coordinates &)
- {
- memcpy(dst_it.ptr(), src_it.ptr(), line_size);
- },
- src_it, dst_it);
+ win_src, [&](const Coordinates &) { memcpy(dst_it.ptr(), src_it.ptr(), line_size); }, src_it, dst_it);
}
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
@@ -86,10 +83,10 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
stream_status.copyfmt(s);
// Set precision
- if(is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default))
+ if (is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default))
{
int precision = io_fmt.precision;
- if(io_fmt.precision_type == IOFormatInfo::PrecisionType::Full)
+ if (io_fmt.precision_type == IOFormatInfo::PrecisionType::Full)
{
precision = std::numeric_limits<float>().max_digits10;
}
@@ -100,7 +97,7 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
size_t print_width = 0;
size_t print_height = 0;
int start_offset = 0;
- switch(io_fmt.print_region)
+ switch (io_fmt.print_region)
{
case IOFormatInfo::PrintRegion::NoPadding:
print_width = this->info()->dimension(0);
@@ -110,13 +107,14 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
case IOFormatInfo::PrintRegion::ValidRegion:
print_width = this->info()->valid_region().shape.x();
print_height = this->info()->valid_region().shape.y();
- start_offset = this->info()->offset_element_in_bytes(Coordinates(this->info()->valid_region().anchor.x(),
- this->info()->valid_region().anchor.y()));
+ start_offset = this->info()->offset_element_in_bytes(
+ Coordinates(this->info()->valid_region().anchor.x(), this->info()->valid_region().anchor.y()));
break;
case IOFormatInfo::PrintRegion::Full:
print_width = padding.left + this->info()->dimension(0) + padding.right;
print_height = padding.top + this->info()->dimension(1) + padding.bottom;
- start_offset = static_cast<int>(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] - padding.left * strides[0];
+ start_offset = static_cast<int>(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] -
+ padding.left * strides[0];
break;
default:
break;
@@ -128,16 +126,17 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
const uint8_t *ptr = this->buffer() + start_offset;
// Start printing
- for(size_t i = 0; i < slices2D; ++i)
+ for (size_t i = 0; i < slices2D; ++i)
{
// Find max_width of elements in slice to align columns
int max_element_width = 0;
- if(io_fmt.align_columns)
+ if (io_fmt.align_columns)
{
size_t offset = i * strides[2];
- for(size_t h = 0; h < print_height; ++h)
+ for (size_t h = 0; h < print_height; ++h)
{
- max_element_width = std::max<int>(max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width));
+ max_element_width = std::max<int>(
+ max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width));
offset += strides[1];
}
}
@@ -145,7 +144,7 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
// Print slice
{
size_t offset = i * strides[2];
- for(size_t h = 0; h < print_height; ++h)
+ for (size_t h = 0; h < print_height; ++h)
{
print_consecutive_elements(s, dt, ptr + offset, print_width, max_element_width, io_fmt.element_delim);
offset += strides[1];
@@ -169,4 +168,9 @@ void ITensor::mark_as_unused() const
{
_is_used = false;
}
-} // namespace arm_compute \ No newline at end of file
+
+void ITensor::mark_as_used() const
+{
+ _is_used = true;
+}
+} // namespace arm_compute
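
mark_as_used() is the new counterpart to mark_as_unused(): it flips the _is_used flag back so the tensor is treated as live again. A minimal usage sketch, assuming a standard ACL build (the init/allocate calls follow the library's documented Tensor workflow):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor t;
        t.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
        t.allocator()->allocate();

        t.mark_as_unused(); // hint: the buffer contents are no longer needed
        t.mark_as_used();   // revive the tensor before reusing it
        return t.is_used() ? 0 : 1;
    }
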
diff --git a/src/core/ITensorPack.cpp b/src/core/ITensorPack.cpp
index 90f9a45039..0f8b0824f8 100644
--- a/src/core/ITensorPack.cpp
+++ b/src/core/ITensorPack.cpp
@@ -27,10 +27,9 @@
namespace arm_compute
{
-ITensorPack::ITensorPack(std::initializer_list<PackElement> l)
- : _pack()
+ITensorPack::ITensorPack(std::initializer_list<PackElement> l) : _pack()
{
- for(auto &e : l)
+ for (auto &e : l)
{
_pack[e.id] = e;
}
@@ -54,7 +53,7 @@ void ITensorPack::add_const_tensor(int id, const ITensor *tensor)
const ITensor *ITensorPack::get_const_tensor(int id) const
{
auto it = _pack.find(id);
- if(it != _pack.end())
+ if (it != _pack.end())
{
return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
}
@@ -81,4 +80,4 @@ bool ITensorPack::empty() const
{
return _pack.empty();
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
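
ITensorPack is the small id-to-tensor map through which kernels receive their operands; note how get_const_tensor() above falls back to the mutable slot when no const tensor was registered under an id. A brief usage sketch; ACL_SRC_0/ACL_DST are the conventional ids from the experimental Types header, and the surrounding setup is illustrative:

    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        // Register operands under integer ids; kernels look them up by the same ids.
        ITensorPack pack{{ACL_SRC_0, &src}, {ACL_DST, &dst}};

        const ITensor *in  = pack.get_const_tensor(ACL_SRC_0); // const lookup
        ITensor       *out = pack.get_tensor(ACL_DST);         // mutable lookup
        return (in != nullptr && out != nullptr) ? 0 : 1;
    }
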
diff --git a/src/core/NEON/INESimpleKernel.h b/src/core/NEON/INESimpleKernel.h
deleted file mode 100644
index 2986e7b5c9..0000000000
--- a/src/core/NEON/INESimpleKernel.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_INESIMPLEKERNEL_H
-#define ARM_COMPUTE_INESIMPLEKERNEL_H
-
-#include "arm_compute/core/CPP/ICPPSimpleKernel.h"
-
-namespace arm_compute
-{
-/** Interface for simple CPU kernels having 1 tensor input and 1 tensor output */
-using INESimpleKernel = ICPPSimpleKernel;
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_INESIMPLEKERNEL_H */
diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h
index 9b92a865d0..b93e64a0ef 100644
--- a/src/core/NEON/NEAsymm.h
+++ b/src/core/NEON/NEAsymm.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,11 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEASYMM_H
-#define ARM_COMPUTE_NEASYMM_H
+#ifndef ACL_SRC_CORE_NEON_NEASYMM_H
+#define ACL_SRC_CORE_NEON_NEASYMM_H
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -52,7 +53,8 @@ using qasymm8x16_signed_t = int8x16_t; /**< 8 bit quantized signed asymmetric
*
* @return A 16-component vector in QASYMM8 format, saturated to fit
*/
-uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
+template <RoundingPolicy round_policy = RoundingPolicy::TO_ZERO>
+qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
/** Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector
*
@@ -64,7 +66,8 @@ uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
*
* @return A 16-component vector in QASYMM8_SIGNED format, saturated to fit
*/
-int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
+template <RoundingPolicy round_policy = RoundingPolicy::TO_ZERO>
+qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
/** Performs final quantization step on 16 elements
*
@@ -88,7 +91,7 @@ inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
{
const static int32x4_t zero_s32 = vdupq_n_s32(0);
- if(result_shift < 0)
+ if (result_shift < 0)
{
in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
@@ -128,18 +131,13 @@ inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to U8
uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_u8 = vmaxq_u8(out_u8, min_u8);
out_u8 = vminq_u8(out_u8, max_u8);
@@ -168,7 +166,7 @@ inline int8x16_t finalize_quantization(int32x4x4_t &in_s32,
int8x16_t max_s8,
bool is_bounded_relu)
{
- if(result_shift < 0)
+ if (result_shift < 0)
{
in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
@@ -202,18 +200,13 @@ inline int8x16_t finalize_quantization(int32x4x4_t &in_s32,
in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to S8
int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s8 = vmaxq_s8(out_s8, min_s8);
out_s8 = vminq_s8(out_s8, max_s8);
@@ -245,8 +238,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
const static int32x4_t one_s32 = vdupq_n_s32(1);
// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
- int32x4x4_t res_shift_gt0 =
- {
+ int32x4x4_t res_shift_gt0 = {
vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]),
vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]),
vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]),
@@ -258,8 +250,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]);
res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]);
- int32x4x4_t res_shift_lt0 =
- {
+ int32x4x4_t res_shift_lt0 = {
vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))),
vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))),
vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))),
@@ -271,8 +262,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]);
// Select result depending on shift value
- const uint32x4x4_t mask_lt0 =
- {
+ const uint32x4x4_t mask_lt0 = {
#ifdef __aarch64__
vcltzq_s32(result_shift.val[0]),
vcltzq_s32(result_shift.val[1]),
@@ -298,18 +288,13 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to S8
int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s8 = vmaxq_s8(out_s8, min_s8);
out_s8 = vminq_s8(out_s8, max_s8);
@@ -330,15 +315,20 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
*
* @return Quantized value
*/
-inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
- int32_t result_shift, int32_t result_offset_after_shift_s32,
- uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu)
+inline uint8_t finalize_quantization(int32_t in_value,
+ int result_fixedpoint_multiplier,
+ int32_t result_shift,
+ int32_t result_offset_after_shift_s32,
+ uint8_t min_u8,
+ uint8_t max_u8,
+ bool is_bounded_relu)
{
int32x4_t in_s32 = vdupq_n_s32(in_value);
- if(result_shift < 0)
+ if (result_shift < 0)
{
- in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
+ in_value = vgetq_lane_s32(
+ vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
}
else
{
@@ -353,7 +343,7 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul
// Bound the result
uint8_t out_u8 = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8)));
}
@@ -373,15 +363,20 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul
*
* @return Quantized value
*/
-inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
- int32_t result_shift, int32_t result_offset_after_shift_s32,
- int8_t min_s8, int8_t max_s8, bool is_bounded_relu)
+inline int8_t finalize_quantization(int32_t in_value,
+ int result_fixedpoint_multiplier,
+ int32_t result_shift,
+ int32_t result_offset_after_shift_s32,
+ int8_t min_s8,
+ int8_t max_s8,
+ bool is_bounded_relu)
{
int32x4_t in_s32 = vdupq_n_s32(in_value);
- if(result_shift < 0)
+ if (result_shift < 0)
{
- in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
+ in_value = vgetq_lane_s32(
+ vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
}
else
{
@@ -397,7 +392,7 @@ inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mult
// Bound the result
int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8)));
}
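
Both scalar finalize_quantization() overloads (and their vector counterparts above) implement the usual fixed-point requantization: a rounding-doubling high multiply by a Q0.31 multiplier, a rounding right shift, the output offset, then saturation to the 8-bit range. A plain-integer model of the positive-shift path; it is illustrative only and not claimed to be bit-exact with the vqrdmulh/rounding_divide_by_pow2 pair:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // Assumes shift >= 1; negative shifts take the pre-multiply branch above.
    int8_t requantize(int32_t in, int32_t mult, int32_t shift, int32_t offset)
    {
        // Rounding-doubling high multiply: (2*in*mult + 2^31) >> 32
        const int64_t prod = static_cast<int64_t>(in) * mult * 2 + (1LL << 31);
        const int32_t high = static_cast<int32_t>(prod >> 32);
        // Rounding right shift, then add the output zero-point
        const int32_t out = ((high + (1 << (shift - 1))) >> shift) + offset;
        return static_cast<int8_t>(std::max(-128, std::min(127, out)));
    }

    int main()
    {
        // mult = 1 << 30 encodes 0.5 in Q0.31: round(1000 * 0.5) >> 3, plus offset 10 -> 73
        std::cout << static_cast<int>(requantize(1000, 1 << 30, 3, 10)) << "\n";
    }
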
@@ -414,17 +409,16 @@ inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mult
*/
inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x2_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale),
- }
- };
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x2_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)),
+ vscale),
+ }};
return vdequantized_input;
}
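
Every vdequantize() variant in this file is the vector form of the same affine mapping, real = scale * (q - offset), applied lane by lane after widening the 8-bit values to 32-bit. A scalar sketch for one lane, with made-up parameters:

    #include <cstdint>
    #include <iostream>

    int main()
    {
        const float   scale  = 0.1f;
        const int32_t offset = 128;
        const uint8_t q      = 200;

        // Same per-lane computation as vdequantize(): widen, subtract offset, scale.
        const float real = scale * (static_cast<int32_t>(q) - offset);
        std::cout << real << "\n"; // 7.2
    }
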
@@ -437,17 +431,14 @@ inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationI
*/
inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x2_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
- }
- };
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x2_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
+ }};
return vdequantized_input;
}
@@ -460,19 +451,24 @@ inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationIn
*/
inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- }
- };
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+ vscale),
+ }};
return vdequantized_input;
}
@@ -485,19 +481,16 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantization
*/
inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- }
- };
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ }};
return vdequantized_input;
}
@@ -511,17 +504,22 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationI
*/
inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset)
{
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- }
- };
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+ vscale),
+ }};
return vdequantized_input;
}
@@ -535,17 +533,14 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offs
*/
inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset)
{
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- }
- };
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ }};
return vdequantized_input;
}
@@ -558,15 +553,12 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offse
*/
inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale)
{
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
- }
- };
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
+ }};
return vdequantized_input;
}
@@ -579,16 +571,13 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale
*/
inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale)
{
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
- }
- };
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
+ }};
return vdequantized_input;
}
@@ -605,18 +594,15 @@ inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInf
const int offset = qi.offset;
const float32x4_t voffset = vdupq_n_f32(offset);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#endif //__aarch64__
- }
- };
+ }};
return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
}
@@ -633,18 +619,15 @@ inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizat
const int offset = qi.offset;
const float32x4_t voffset = vdupq_n_f32(offset);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#endif //__aarch64__
- }
- };
+ }};
return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
}
@@ -652,22 +635,19 @@ inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int3
{
const int32x4_t voffset = vdupq_n_s32(offset);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
- vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
- vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
- vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
+ vaddq_s32(vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+ vaddq_s32(vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+ vaddq_s32(vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+ vaddq_s32(vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
#else //__aarch64__
- vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
- vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
- vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
- vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
#endif //__aarch64__
- }
- };
+ }};
return rf;
}
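
Note the behavioural change in vquantize_internal() on AArch64: vcvtaq_s32_f32 (round to nearest, ties away from zero) is replaced by vcvtnq_s32_f32 (round to nearest, ties to even), so values exactly halfway between integers can now quantize one code lower. A scalar illustration of the two tie-breaking rules:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const float v = 2.5f;
        // Ties away from zero (old vcvtaq behaviour): 2.5 -> 3
        std::printf("away: %d\n", static_cast<int>(std::lround(v)));
        // Ties to even (new vcvtnq behaviour): 2.5 -> 2
        // std::nearbyint honours the default FE_TONEAREST mode, which breaks ties to even.
        std::printf("even: %d\n", static_cast<int>(std::nearbyint(v)));
    }
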
@@ -713,8 +693,9 @@ inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQua
auto rf = vquantize_internal(qv, qi.scale, qi.offset);
const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
- return { pa, pb };
+ return {pa, pb};
}
+
} // namespace arm_compute
#include "src/core/NEON/NEAsymm.inl"
-#endif // ARM_COMPUTE_NEASYMM_H
+#endif // ACL_SRC_CORE_NEON_NEASYMM_H
diff --git a/src/core/NEON/NEAsymm.inl b/src/core/NEON/NEAsymm.inl
index 6ee1a336b8..fd62fd4654 100644
--- a/src/core/NEON/NEAsymm.inl
+++ b/src/core/NEON/NEAsymm.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+
+#include "arm_compute/core/Rounding.h"
+
namespace arm_compute
{
+template <RoundingPolicy round_policy>
inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo)
{
// Convert uint8 vectors to uint16 vectors
@@ -46,16 +50,43 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v
C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
// Convert float32 vectors to uint32 vectors
+#if __aarch64__
+ if (round_policy == RoundingPolicy::TO_NEAREST_EVEN)
+ {
+ A_u32x4 = vcvtnq_u32_f32(A_f32x4);
+ B_u32x4 = vcvtnq_u32_f32(B_f32x4);
+ C_u32x4 = vcvtnq_u32_f32(C_f32x4);
+ D_u32x4 = vcvtnq_u32_f32(D_f32x4);
+ }
+ else if (round_policy == RoundingPolicy::TO_NEAREST_UP)
+ {
+ A_u32x4 = vcvtaq_u32_f32(A_f32x4);
+ B_u32x4 = vcvtaq_u32_f32(B_f32x4);
+ C_u32x4 = vcvtaq_u32_f32(C_f32x4);
+ D_u32x4 = vcvtaq_u32_f32(D_f32x4);
+ }
+ else
+ {
+ A_u32x4 = vcvtq_u32_f32(A_f32x4);
+ B_u32x4 = vcvtq_u32_f32(B_f32x4);
+ C_u32x4 = vcvtq_u32_f32(C_f32x4);
+ D_u32x4 = vcvtq_u32_f32(D_f32x4);
+ }
+#else // #if __aarch64__
+    // Rounding-mode selection is only supported on AArch64; fall back to the truncating conversion
A_u32x4 = vcvtq_u32_f32(A_f32x4);
B_u32x4 = vcvtq_u32_f32(B_f32x4);
C_u32x4 = vcvtq_u32_f32(C_f32x4);
D_u32x4 = vcvtq_u32_f32(D_f32x4);
+#endif // #if __aarch64__
// Convert uint32 vectors to uint16 vectors (with saturation)
vd_low_u16x8 = vcombine_u16(vqmovn_u32(A_u32x4), vqmovn_u32(B_u32x4));
vd_high_u16x8 = vcombine_u16(vqmovn_u32(C_u32x4), vqmovn_u32(D_u32x4));
// convert uint16 vectors to uint8 vectors (with saturation)
return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8));
}
+
+template <RoundingPolicy round_policy>
inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo)
{
// Convert uint8 vectors to int16 vectors
@@ -78,11 +109,36 @@ inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x
B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
- // Convert float32 vectors to int32 vectors
+#if __aarch64__
+ if (round_policy == RoundingPolicy::TO_NEAREST_EVEN)
+ {
+ A_s32x4 = vcvtnq_s32_f32(A_f32x4);
+ B_s32x4 = vcvtnq_s32_f32(B_f32x4);
+ C_s32x4 = vcvtnq_s32_f32(C_f32x4);
+ D_s32x4 = vcvtnq_s32_f32(D_f32x4);
+ }
+ else if (round_policy == RoundingPolicy::TO_NEAREST_UP)
+ {
+ A_s32x4 = vcvtaq_s32_f32(A_f32x4);
+ B_s32x4 = vcvtaq_s32_f32(B_f32x4);
+ C_s32x4 = vcvtaq_s32_f32(C_f32x4);
+ D_s32x4 = vcvtaq_s32_f32(D_f32x4);
+ }
+ else
+ {
+ A_s32x4 = vcvtq_s32_f32(A_f32x4);
+ B_s32x4 = vcvtq_s32_f32(B_f32x4);
+ C_s32x4 = vcvtq_s32_f32(C_f32x4);
+ D_s32x4 = vcvtq_s32_f32(D_f32x4);
+ }
+#else // #if __aarch64__
+    // Rounding-mode selection is only supported on AArch64; fall back to the truncating conversion
A_s32x4 = vcvtq_s32_f32(A_f32x4);
B_s32x4 = vcvtq_s32_f32(B_f32x4);
C_s32x4 = vcvtq_s32_f32(C_f32x4);
D_s32x4 = vcvtq_s32_f32(D_f32x4);
+#endif // #if __aarch64__
+
// Convert int32 vectors to int16 vectors (with saturation)
vd_low_s16x8 = vcombine_s16(vqmovn_s32(A_s32x4), vqmovn_s32(B_s32x4));
vd_high_s16x8 = vcombine_s16(vqmovn_s32(C_s32x4), vqmovn_s32(D_s32x4));
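
With the new template parameter, callers of vmlaq_qasymm8()/vmlaq_qasymm8_signed() choose the float-to-integer rounding at compile time; the default TO_ZERO keeps the old truncating behaviour, and on 32-bit Arm the truncating conversion is used regardless. A hedged usage sketch (assumes an in-tree AArch64 NEON build; the include path is the internal one used throughout this diff):

    #include "src/core/NEON/NEAsymm.h"

    #include <arm_neon.h>

    using namespace arm_compute;

    void example(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo)
    {
        // Default policy: truncate toward zero (previous behaviour).
        const qasymm8x16_t a = vmlaq_qasymm8(vd, vs, vo);
        // Explicit round-to-nearest-even, honoured only on AArch64.
        const qasymm8x16_t b = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_EVEN>(vd, vs, vo);
        (void)a;
        (void)b;
    }
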
diff --git a/src/core/NEON/NEFixedPoint.inl b/src/core/NEON/NEFixedPoint.inl
index 8bff9c4a8e..fb403b6d26 100644
--- a/src/core/NEON/NEFixedPoint.inl
+++ b/src/core/NEON/NEFixedPoint.inl
@@ -30,13 +30,7 @@ namespace arm_compute
inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
{
- float32x4x2_t res =
- {
- {
- vmaxq_f32(a.val[0], b.val[0]),
- vmaxq_f32(a.val[1], b.val[1])
- }
- };
+ float32x4x2_t res = {{vmaxq_f32(a.val[0], b.val[0]), vmaxq_f32(a.val[1], b.val[1])}};
return res;
}
#endif /* DOXYGEN_SKIP_THIS */
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index ea15f4eddd..8675eec93f 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEKERNELS_H
-#define ARM_COMPUTE_NEKERNELS_H
+#ifndef ACL_SRC_CORE_NEON_NEKERNELS_H
+#define ACL_SRC_CORE_NEON_NEKERNELS_H
#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
#include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
@@ -33,7 +33,6 @@
#include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
#include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
#include "src/core/NEON/kernels/NECol2ImKernel.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
#include "src/core/NEON/kernels/NECropKernel.h"
#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
@@ -41,45 +40,28 @@
#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
#include "src/core/NEON/kernels/NEGatherKernel.h"
#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
-#include "src/core/NEON/kernels/NEIm2ColKernel.h"
#include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
#include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
#include "src/core/NEON/kernels/NELogicalKernel.h"
-#include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
-#include "src/core/NEON/kernels/NEMinMaxLayerKernel.h"
#include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
#include "src/core/NEON/kernels/NEPadLayerKernel.h"
#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
-#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h"
-#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
#include "src/core/NEON/kernels/NERangeKernel.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "src/core/NEON/kernels/NERemapKernel.h"
+#include "src/core/NEON/kernels/NEReorderKernel.h"
#include "src/core/NEON/kernels/NEReorgLayerKernel.h"
#include "src/core/NEON/kernels/NEReverseKernel.h"
+#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h"
+#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
#include "src/core/NEON/kernels/NESelectKernel.h"
#include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
#include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
#include "src/core/NEON/kernels/NEStackLayerKernel.h"
#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
#include "src/core/NEON/kernels/NETileKernel.h"
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
-#include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
-#endif /* ARM_COMPUTE_NEKERNELS_H */
+#endif // ACL_SRC_CORE_NEON_NEKERNELS_H
diff --git a/src/core/NEON/NEMath.h b/src/core/NEON/NEMath.h
index 13484c9c15..9e81c38ad8 100644
--- a/src/core/NEON/NEMath.h
+++ b/src/core/NEON/NEMath.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -94,6 +94,14 @@ float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &c
*/
float32x4_t vexpq_f32(float32x4_t x);
+/** Calculate error function
+ *
+ * @param[in] x Input vector in F32 format.
+ *
+ * @return The calculated erf.
+ */
+float32x4_t verfq_f32(float32x4_t x);
+
/** Calculate logarithm
*
* @param[in] x Input vector value in F32 format.
@@ -239,6 +247,14 @@ float32x4_t vsinq_f32(float32x4_t val);
*/
float32x2_t vsin_f32(float32x2_t val);
+/** Reduce a vector to be a scalar by accumulating all lanes in the vector
+ *
+ * @param[in] v Vector to be reduced.
+ *
+ * @return The accumulated sum of all lanes.
+ */
+float vreduce(const float32x4_t &v);
+
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Calculate hyperbolic tangent.
*
@@ -300,6 +316,14 @@ float16x8_t vinvsqrtq_f16(float16x8_t x);
*/
float16x8_t vexpq_f16(float16x8_t x);
+/** Calculate error function
+ *
+ * @param[in] x Input vector in F16 format.
+ *
+ * @return The calculated erf.
+ */
+float16x8_t verfq_f16(float16x8_t x);
+
/** Calculate n power of a number.
*
* pow(x,n) = e^(n*log(x))
@@ -319,6 +343,13 @@ float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
*/
float16x8_t vsinq_f16(float16x8_t val);
+/** Reduce a vector to be a scalar by accumulating all lanes in the vector
+ *
+ * @param[in] v Vector to be reduced.
+ *
+ * @return The accumulated sum of all lanes.
+ */
+float16_t vreduce(const float16x8_t &v);
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
#include "src/core/NEON/NEMath.inl"
diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl
index 5ac62badcc..a5aba0bf23 100644
--- a/src/core/NEON/NEMath.inl
+++ b/src/core/NEON/NEMath.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,6 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+
+#include "src/core/utils/Math.h"
#include "support/ToolchainSupport.h"
#include <cmath>
@@ -28,35 +30,17 @@
namespace arm_compute
{
-/** Exponent polynomial coefficients */
-const std::array<float32x4_t, 8> exp_tab =
-{
- {
- vdupq_n_f32(1.f),
- vdupq_n_f32(0.0416598916054f),
- vdupq_n_f32(0.500000596046f),
- vdupq_n_f32(0.0014122662833f),
- vdupq_n_f32(1.00000011921f),
- vdupq_n_f32(0.00833693705499f),
- vdupq_n_f32(0.166665703058f),
- vdupq_n_f32(0.000195780929062f),
- }
-};
-
/** Logarithm polynomial coefficients */
-const std::array<float32x4_t, 8> log_tab =
-{
- {
- vdupq_n_f32(-2.29561495781f),
- vdupq_n_f32(-2.47071170807f),
- vdupq_n_f32(-5.68692588806f),
- vdupq_n_f32(-0.165253549814f),
- vdupq_n_f32(5.17591238022f),
- vdupq_n_f32(0.844007015228f),
- vdupq_n_f32(4.58445882797f),
- vdupq_n_f32(0.0141278216615f),
- }
-};
+const std::array<float32x4_t, 8> log_tab = {{
+ vdupq_n_f32(-2.29561495781f),
+ vdupq_n_f32(-2.47071170807f),
+ vdupq_n_f32(-5.68692588806f),
+ vdupq_n_f32(-0.165253549814f),
+ vdupq_n_f32(5.17591238022f),
+ vdupq_n_f32(0.844007015228f),
+ vdupq_n_f32(4.58445882797f),
+ vdupq_n_f32(0.0141278216615f),
+}};
/** Sin polynomial coefficients */
constexpr float te_sin_coeff2 = 0.166666666666f; // 1/(2*3)
@@ -65,6 +49,15 @@ constexpr float te_sin_coeff4 = 0.023809523810f; // 1/(6*7)
constexpr float te_sin_coeff5 = 0.013888888889f; // 1/(8*9)
#ifndef DOXYGEN_SKIP_THIS
+inline float32x4_t prefer_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
+{
+#if __ARM_FEATURE_FMA
+ return vfmaq_f32(a, b, c);
+#else // __ARM_FEATURE_FMA
+ return vmlaq_f32(a, b, c);
+#endif // __ARM_FEATURE_FMA
+}
+
inline float32x4_t vfloorq_f32(float32x4_t val)
{
static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
@@ -85,14 +78,32 @@ inline float32x4_t vroundq_rte_f32(float32x4_t val)
static const int32x4_t CONST_1_INT = vdupq_n_s32(1);
const float32x4_t floor_val = vfloorq_f32(val);
const float32x4_t diff = vsubq_f32(val, floor_val);
+ const float32x4_t fp32_upper_limit =
+ vreinterpretq_f32_u32(vdupq_n_u32(0x4B000000)); // 0x4B000000 = (23U + 127U) << 23U
/*
- * Select the floor value when (diff<0.5 || (diff==0.5 && floor_val%2==0).
- * This condition is checked by vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT) ,vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT) , vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT))))
+     * 1. Select the floor value when (diff<0.5 || (diff==0.5 && floor_val%2==0)).
+ * This condition is checked by vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT) ,vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT) , vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT))))
+ *
+     * 2. If the input value (val) is outside the signed int32 range, simply use the input value as the rounded value,
+     *    because converting to int32 would saturate in this case.
+     *    If the input float value is >= 2^23, the rounded value is exactly equal to the input value:
+     *    in IEEE single-precision floating-point representation the fraction part is 23 bits wide, so once the
+     *    exponent reaches 23 there are no bits left to represent anything after the decimal point, and rounding
+     *    therefore has no effect.
+     *    Threshold upper limit with format |S|E(8 bits)|Fraction(23 bits)| = (23 + 127) << 23 (assuming positive
+     *    sign); 127 is added because it is the exponent bias, i.e. a stored exponent of 127 represents 2^0.
*/
- return vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))),
- floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
+ float32x4_t rounded_val = vbslq_f32(
+ vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT),
+ vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT),
+ vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))),
+ floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
+
+ float32x4_t result = vbslq_f32(vcgeq_f32(vabsq_f32(val), fp32_upper_limit), val, rounded_val);
+
+ return result;
#endif // __aarch64__
}
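A scalar sketch of the same round-to-nearest-even logic makes the two cases above easy to verify (illustrative C++, not part of the patch; round_rte_scalar is a hypothetical name):

    #include <cmath>
    #include <cstdio>

    // Scalar model of vroundq_rte_f32: round half to even, and pass the input
    // through unchanged once |x| >= 2^23, where FP32 has no fractional bits left.
    float round_rte_scalar(float x)
    {
        const float fp32_upper_limit = 8388608.0f; // 2^23
        if (std::fabs(x) >= fp32_upper_limit)
        {
            return x;
        }
        const float floor_val = std::floor(x);
        const float diff      = x - floor_val;
        const bool  floor_odd = (static_cast<long long>(floor_val) & 1) != 0;
        // Select the floor value when diff < 0.5, or when diff == 0.5 and floor_val is even.
        return (diff < 0.5f || (diff == 0.5f && !floor_odd)) ? floor_val : floor_val + 1.0f;
    }

    int main()
    {
        // Ties go to the even neighbour: 0.5 -> 0, 1.5 -> 2, 2.5 -> 2.
        std::printf("%g %g %g\n", round_rte_scalar(0.5f), round_rte_scalar(1.5f), round_rte_scalar(2.5f));
    }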
@@ -108,8 +119,8 @@ inline float32x2_t vinvsqrt_f32(float32x2_t x)
inline float32x4_t vinvsqrtq_f32(float32x4_t x)
{
float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
return sqrt_reciprocal;
}
@@ -142,30 +153,140 @@ inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t
return res;
}
+static const uint32_t exp_f32_coeff[] = {
+ 0x3f7ffff6, // x^1: 0x1.ffffecp-1f
+ 0x3efffedb, // x^2: 0x1.fffdb6p-2f
+ 0x3e2aaf33, // x^3: 0x1.555e66p-3f
+ 0x3d2b9f17, // x^4: 0x1.573e2ep-5f
+ 0x3c072010, // x^5: 0x1.0e4020p-7f
+};
+
inline float32x4_t vexpq_f32(float32x4_t x)
{
- static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
- static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
- static const float32x4_t CONST_INF = vdupq_n_f32(std::numeric_limits<float>::infinity());
- static const float32x4_t CONST_MAX_INPUT = vdupq_n_f32(88.7f);
- static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
- static const int32x4_t CONST_NEGATIVE_126 = vdupq_n_s32(-126);
-
- // Perform range reduction [-log(2),log(2)]
- int32x4_t m = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2));
- float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
-
- // Polynomial Approximation
- float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
-
- // Reconstruct
- poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23)));
- poly = vbslq_f32(vcltq_s32(m, CONST_NEGATIVE_126), CONST_0, poly); // Handle underflow
- poly = vbslq_f32(vcgtq_f32(x, CONST_MAX_INPUT), CONST_INF, poly); // Handle overflow
+ const auto c1 = vreinterpretq_f32_u32(vdupq_n_u32(exp_f32_coeff[0]));
+ const auto c2 = vreinterpretq_f32_u32(vdupq_n_u32(exp_f32_coeff[1]));
+ const auto c3 = vreinterpretq_f32_u32(vdupq_n_u32(exp_f32_coeff[2]));
+ const auto c4 = vreinterpretq_f32_u32(vdupq_n_u32(exp_f32_coeff[3]));
+ const auto c5 = vreinterpretq_f32_u32(vdupq_n_u32(exp_f32_coeff[4]));
+
+ const auto shift = vreinterpretq_f32_u32(vdupq_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
+ const auto inv_ln2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
+ const auto neg_ln2_hi =
+ vreinterpretq_f32_u32(vdupq_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f
+ const auto neg_ln2_lo =
+ vreinterpretq_f32_u32(vdupq_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
+
+ const auto inf = vdupq_n_f32(std::numeric_limits<float>::infinity());
+ const auto max_input = vdupq_n_f32(88.37f); // Approximately ln(2^127.5)
+ const auto zero = vdupq_n_f32(0.f);
+ const auto min_input = vdupq_n_f32(-86.64f); // Approximately ln(2^-125)
+
+ // Range reduction:
+ // e^x = 2^n * e^r
+ // where:
+ // n = floor(x / ln(2))
+ // r = x - n * ln(2)
+ //
+    // By adding 2^23 + 127 (shift) to x / ln(2):
+    //   * As the FP32 fraction part has only 23 bits, the addition of 2^23 + 127 forces the decimal part
+    //     of x / ln(2) out of the result. The integer part of x / ln(2) (i.e. n) + 127 will occupy
+    //     the whole fraction part of z in FP32 format.
+    //     Subtracting 2^23 + 127 (shift) from z will result in the integer part of x / ln(2)
+    //     (i.e. n) because the decimal part has been pushed out and lost.
+    //   * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent
+    //     in FP32 format. Left shifting z by 23 bits will result in 2^n.
+ const auto z = prefer_vfmaq_f32(shift, x, inv_ln2);
+ const auto n = z - shift;
+ const auto scale = vreinterpretq_f32_u32(vreinterpretq_u32_f32(z) << 23); // 2^n
+
+ // The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32.
+    // This outperforms a longer Taylor series (3-4 more terms) both in terms of accuracy and performance.
+ const auto r_hi = prefer_vfmaq_f32(x, n, neg_ln2_hi);
+ const auto r = prefer_vfmaq_f32(r_hi, n, neg_ln2_lo);
+
+ // Compute the truncated Taylor series of e^r.
+ // poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5)
+ const auto r2 = r * r;
+
+ const auto p1 = c1 * r;
+ const auto p23 = prefer_vfmaq_f32(c2, c3, r);
+ const auto p45 = prefer_vfmaq_f32(c4, c5, r);
+ const auto p2345 = prefer_vfmaq_f32(p23, p45, r2);
+ const auto p12345 = prefer_vfmaq_f32(p1, p2345, r2);
+
+ auto poly = prefer_vfmaq_f32(scale, p12345, scale);
+
+ // Handle underflow and overflow.
+ poly = vbslq_f32(vcltq_f32(x, min_input), zero, poly);
+ poly = vbslq_f32(vcgtq_f32(x, max_input), inf, poly);
return poly;
}
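The bit manipulation above is easier to follow in scalar form. A minimal sketch with the same constants and steps (assumes C++17 hex float literals; exp_shift_trick is an illustrative name, and overflow/underflow clamping is omitted):

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    // Scalar model of the vexpq_f32 range reduction and polynomial.
    float exp_shift_trick(float x)
    {
        const float shift      = 0x1.0000fep23f;   // 2^23 + 127
        const float inv_ln2    = 0x1.715476p+0f;   // 1 / ln(2)
        const float neg_ln2_hi = -0x1.62e400p-1f;  // high bits of -ln(2)
        const float neg_ln2_lo = -0x1.7f7d1cp-20f; // low bits of -ln(2)

        // After the fma, the low mantissa bits of z hold n + 127, a ready-made FP32 exponent field.
        const float z = std::fma(x, inv_ln2, shift);
        const float n = z - shift;

        uint32_t z_bits;
        std::memcpy(&z_bits, &z, sizeof(z_bits));
        const uint32_t scale_bits = z_bits << 23; // move n + 127 into the exponent field
        float scale;                              // scale == 2^n
        std::memcpy(&scale, &scale_bits, sizeof(scale));

        // r = x - n * ln(2), computed in two steps for accuracy beyond FP32.
        const float r = std::fma(n, neg_ln2_lo, std::fma(n, neg_ln2_hi, x));

        // Degree-5 polynomial for e^r on the reduced range, same coefficients as above.
        const float c1 = 0x1.ffffecp-1f, c2 = 0x1.fffdb6p-2f, c3 = 0x1.555e66p-3f;
        const float c4 = 0x1.573e2ep-5f, c5 = 0x1.0e4020p-7f;
        const float p  = c1 + r * (c2 + r * (c3 + r * (c4 + r * c5)));
        return scale + scale * (r * p); // scale * (1 + c1*r + ... + c5*r^5)
    }

The vector version additionally selects 0 or infinity for inputs outside [-86.64, 88.37].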
+#ifdef __aarch64__
+inline float32x4_t verfq_f32(float32x4_t x)
+{
+ const float32x4_t max_value = vdupq_n_f32(3.9375); // 4 - 8/128
+ const float32x4_t shift = vdupq_n_f32(65536); // 2^16
+ const float32x4_t third = vdupq_n_f32(0.3333333333); // 1/3
+ const float32x4_t one = vdupq_n_f32(1.f);
+ const uint32x4_t max_index = vdupq_n_u32(512);
+ const uint32x4_t sign_mask = vdupq_n_u32(0x7fffffff);
+
+ const float32x4_t x_abs = vabsq_f32(x);
+
+    // erf(x) for x in [0, 3.9375] is approximated as follows:
+ //
+ // erf(x) = erf(r) + scale(r) * d * (1 - r * d - 1/3 * d^2)
+ //
+ // where:
+ // r = floor(x * 128) / 128
+ // d = x - r
+ //
+ // erf(r) and scale(r) are stored in a 513-entry lookup table.
+    // The LUT covers the range from 0 to 4 with a step of 1/128.
+ //
+ // Special cases:
+ // erf(x) = 1 for x > 3.9375
+ // erf(x) = -1 for x < -3.9375
+
+ // Find the LUT indices by rounding the input value to the step of 1/128.
+ //
+    // `shift` is used to push out the 16 LSBs of the input value. Only 7 bits in the fraction part
+    // of the input value are preserved.
+ const float32x4_t z = x_abs + shift;
+ const float32x4_t r = z - shift;
+
+ uint32x4_t index = vreinterpretq_u32_f32(z) - vreinterpretq_u32_f32(shift);
+ index = vminq_u32(index, max_index);
+
+ // Lookup erf(r) and scale(r).
+ const float64_t entry_0 = *reinterpret_cast<const float64_t *>(&erf_f32_lut[index[0]]);
+ const float64_t entry_1 = *reinterpret_cast<const float64_t *>(&erf_f32_lut[index[1]]);
+ const float64_t entry_2 = *reinterpret_cast<const float64_t *>(&erf_f32_lut[index[2]]);
+ const float64_t entry_3 = *reinterpret_cast<const float64_t *>(&erf_f32_lut[index[3]]);
+
+ const float32x4_t entry_01 = vreinterpretq_f32_f64(float64x2_t{entry_0, entry_1});
+ const float32x4_t entry_23 = vreinterpretq_f32_f64(float64x2_t{entry_2, entry_3});
+
+ const float32x4_t erf_r = vuzp1q_f32(entry_01, entry_23);
+ const float32x4_t scale_r = vuzp2q_f32(entry_01, entry_23);
+
+ // Approximate erf(x) = erf(r) + scale(r) * d * (1 - r * d - 1/3 * d^2).
+ const float32x4_t d = x_abs - r;
+ const float32x4_t d2 = d * d;
+
+ const float32x4_t t0 = vfmaq_f32(r, third, d); // t0 = r + 1/3 * d.
+ const float32x4_t t1 = vfmsq_f32(d, d2, t0); // t1 = d - d2 * t0 = d * (1 - r * d - 1/3 * d^2).
+ const float32x4_t erf_x = vfmaq_f32(erf_r, scale_r, t1);
+
+ const float32x4_t clamped = vbslq_f32(x_abs > max_value, one, erf_x);
+ const float32x4_t result = vbslq_f32(sign_mask, clamped, x);
+
+ return result;
+}
+#endif // #ifdef __aarch64__
+
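The index computation above uses the same add-a-large-constant trick as the rounding code: adding 2^16 keeps only 7 fraction bits of |x| (a step of 1/128), and the low mantissa bits of the biased value are then exactly the table index. A scalar sketch (illustrative; erf_lut_index is a hypothetical name):

    #include <cstdint>
    #include <cstring>

    // Scalar model of the LUT-index computation in verfq_f32.
    uint32_t erf_lut_index(float x_abs)
    {
        const float shift = 65536.0f;      // 2^16
        const float z     = x_abs + shift; // rounds x_abs onto a 1/128 grid

        uint32_t z_bits, shift_bits;
        std::memcpy(&z_bits, &z, sizeof(z_bits));
        std::memcpy(&shift_bits, &shift, sizeof(shift_bits));

        const uint32_t index = z_bits - shift_bits; // == round(x_abs * 128) for x_abs in [0, 4)
        return index < 512u ? index : 512u;         // clamp to the last of the 513 entries
    }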
inline float32x4_t vlogq_f32(float32x4_t x)
{
static const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
@@ -193,12 +314,14 @@ inline float32x4_t vtanhq_f32(float32x4_t val)
static const float32x4_t CONST_THR = vdupq_n_f32(5.e-3);
static const float32x4_t CONST_1_3 = vdupq_n_f32(0.3333333f);
- float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
+ float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
// x * (1 - x^2/3) if |x| < 5.e-3 or (exp2x - 1) / (exp2x + 1) otherwise
- float32x4_t exp2x = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x));
- float32x4_t num = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x));
- float32x4_t den = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num));
- float32x4_t tanh = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den));
+ float32x4_t exp2x =
+ vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x));
+ float32x4_t num =
+ vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x));
+ float32x4_t den = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num));
+ float32x4_t tanh = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den));
return tanh;
}
@@ -364,30 +487,23 @@ inline float32x4x4_t convert_to_float32x4x4(const int8x16_t &in)
inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
{
- out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
- vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
- out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
- vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
- out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
- vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
+ out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])), vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
+ out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])), vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
+ out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])), vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
}
inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out)
{
- const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
- vqmovn_u32(vcvtq_u32_f32(in.val[1])));
- const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
- vqmovn_u32(vcvtq_u32_f32(in.val[3])));
- out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
+ const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), vqmovn_u32(vcvtq_u32_f32(in.val[1])));
+ const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])), vqmovn_u32(vcvtq_u32_f32(in.val[3])));
+ out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
}
inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out)
{
- const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])),
- vqmovn_s32(vcvtq_s32_f32(in.val[1])));
- const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])),
- vqmovn_s32(vcvtq_s32_f32(in.val[3])));
- out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
+ const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), vqmovn_s32(vcvtq_s32_f32(in.val[1])));
+ const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), vqmovn_s32(vcvtq_s32_f32(in.val[3])));
+ out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
}
template <>
@@ -418,6 +534,18 @@ inline float32x4x4_t convert_int_to_float<float32x4x4_t, int8x16_t>(const int8x1
return convert_int8x16_to_float32x4x4(in);
}
+inline float vreduce(const float32x4_t &v)
+{
+ const float32x2_t v0 = vget_high_f32(v);
+ const float32x2_t v1 = vget_low_f32(v);
+ const float32x2_t v_out = vadd_f32(v0, v1);
+
+ const float a = vget_lane_f32(v_out, 0);
+ const float b = vget_lane_f32(v_out, 1);
+
+ return a + b;
+}
+
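On AArch64 the same horizontal sum is available as a single instruction; the pairwise version above also works on AArch32. A usage sketch (illustrative):

    #include <arm_neon.h>

    float sum_lanes(float32x4_t v)
    {
    #ifdef __aarch64__
        return vaddvq_f32(v); // single across-vector add
    #else
        const float32x2_t p = vadd_f32(vget_high_f32(v), vget_low_f32(v));
        return vget_lane_f32(p, 0) + vget_lane_f32(p, 1);
    #endif
    }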
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Exponent polynomial coefficients */
/** Logarithm polynomial coefficients */
@@ -448,8 +576,8 @@ inline float16x4_t vinvsqrt_f16(float16x4_t x)
inline float16x8_t vinvsqrtq_f16(float16x8_t x)
{
float16x8_t sqrt_reciprocal = vrsqrteq_f16(x);
- sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
- sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
return sqrt_reciprocal;
}
@@ -469,19 +597,44 @@ inline float16x8_t vinvq_f16(float16x8_t x)
return recip;
}
-inline float16x8_t vtanhq_f16(float16x8_t val)
+inline float16x4_t vtanh_rational_approx_f16(float16x4_t x16)
{
- const float16x8_t CONST_1 = vdupq_n_f16(1.f);
- const float16x8_t CONST_2 = vdupq_n_f16(2.f);
- const float16x8_t CONST_MIN_TANH = vdupq_n_f16(-10.f);
- const float16x8_t CONST_MAX_TANH = vdupq_n_f16(10.f);
+    // Calculate the rational-approximation part of tanh on a half-register of F16 values using F32 arithmetic.
+    // Note: overflow is not handled here; the result must be clamped at |x| = 4.508 (done in vtanhq_f16).
+ const float32x4_t x = vcvt_f32_f16(x16);
- const float16x8_t x = vminq_f16(vmaxq_f16(val, CONST_MIN_TANH), CONST_MAX_TANH);
- const float16x8_t exp2x = vexpq_f16(vmulq_f16(CONST_2, x));
- const float16x8_t num = vsubq_f16(exp2x, CONST_1);
- const float16x8_t den = vaddq_f16(exp2x, CONST_1);
- const float16x8_t tanh = vmulq_f16(num, vinvq_f16(den));
- return tanh;
+ const float32x4_t ONE = vdupq_n_f32(1.0f);
+ const float32x4_t C1 = vdupq_n_f32(0.43760237f);
+ const float32x4_t C2 = vdupq_n_f32(0.104402f);
+ const float32x4_t C3 = vdupq_n_f32(0.013442706f);
+ const float32x4_t C4 = vdupq_n_f32(0.00073561433f);
+
+ const float32x4_t x2 = vmulq_f32(x, x);
+
+ // Denominator polynomial 1 + C1*x^2 + C3*x^4
+ float32x4_t denom = vfmaq_f32(C1, C3, x2);
+ denom = vfmaq_f32(ONE, x2, denom);
+
+ // Numerator polynomial x*(1 + C2*x^2 + C4*x^4)
+ float32x4_t numer = vfmaq_f32(C2, C4, x2);
+ numer = vfmaq_f32(ONE, x2, numer);
+ numer = vmulq_f32(numer, x);
+
+ return vcvt_f16_f32(vdivq_f32(numer, denom));
+}
+
+inline float16x8_t vtanhq_f16(float16x8_t x)
+{
+ // Split into high/low and use rational approximation on both parts exactly
+ const float16x8_t tanh =
+ vcombine_f16(vtanh_rational_approx_f16(vget_low_f16(x)), vtanh_rational_approx_f16(vget_high_f16(x)));
+
+    // tanh(x) == sign(x) to F16 precision for |x| >= 4.508, so use sign(x) beyond this limit
+ const float16x8_t ONE = vdupq_n_f16(1.0f);
+ const float16x8_t MAX_X = vdupq_n_f16(4.508f);
+ const auto at_limit = vcageq_f16(x, MAX_X); // |x| >= 4.508
+ const float16x8_t sign_x = vbslq_f16(vclezq_f16(x), -ONE, ONE);
+ return vbslq_f16(at_limit, sign_x, tanh);
}
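The approximation above is the rational function tanh(x) ~= x * (1 + C2*x^2 + C4*x^4) / (1 + C1*x^2 + C3*x^4), clamped to +/-1 past |x| = 4.508. A scalar sketch with the same coefficients (illustrative name, not part of the patch):

    #include <cmath>

    // Scalar model of vtanh_rational_approx_f16 / vtanhq_f16.
    float tanh_rational(float x)
    {
        const float C1 = 0.43760237f, C2 = 0.104402f, C3 = 0.013442706f, C4 = 0.00073561433f;
        if (std::fabs(x) >= 4.508f)
        {
            return x > 0.0f ? 1.0f : -1.0f; // tanh(x) == sign(x) to F16 precision
        }
        const float x2    = x * x;
        const float numer = x * (1.0f + x2 * (C2 + x2 * C4));
        const float denom = 1.0f + x2 * (C1 + x2 * C3);
        return numer / denom;
    }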
inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array<float16x8_t, 8> &coeffs)
@@ -505,6 +658,17 @@ inline float16x8_t vexpq_f16(float16x8_t x)
return res;
}
+#ifdef __aarch64__
+inline float16x8_t verfq_f16(float16x8_t x)
+{
+ const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
+ const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x));
+
+ const float16x8_t res = vcombine_f16(vcvt_f16_f32(verfq_f32(x_low)), vcvt_f16_f32(verfq_f32(x_high)));
+ return res;
+}
+#endif // #ifdef __aarch64__
+
inline float16x8_t vlogq_f16(float16x8_t x)
{
const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
@@ -550,6 +714,19 @@ inline float16x4_t vsin_f16(float16x4_t val)
return vcvt_f16_f32(vcombine_f32(res_low, res_high));
}
+inline float16_t vreduce(const float16x8_t &v)
+{
+ const float16x4_t v0 = vget_high_f16(v);
+ const float16x4_t v1 = vget_low_f16(v);
+ const float16x4_t v_out = vadd_f16(v0, v1);
+
+ const float16_t a = vget_lane_f16(v_out, 0);
+ const float16_t b = vget_lane_f16(v_out, 1);
+ const float16_t c = vget_lane_f16(v_out, 2);
+ const float16_t d = vget_lane_f16(v_out, 3);
+
+ return a + b + c + d;
+}
#endif /* DOXYGEN_SKIP_THIS */
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
diff --git a/src/core/NEON/NESymm.h b/src/core/NEON/NESymm.h
index e6644577a1..ec246efc8c 100644
--- a/src/core/NEON/NESymm.h
+++ b/src/core/NEON/NESymm.h
@@ -25,7 +25,9 @@
#define ARM_COMPUTE_NESYMM_H
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -49,13 +51,10 @@ using qsymm16x8x2_t = int16x8x2_t; /**< 16 bit quantized symmetric vector with 1
* @return Quantized values
*/
template <bool is_bounded_relu>
-int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
- int result_fixedpoint_multiplier,
- int32_t result_shift,
- int16x8_t min_s16,
- int16x8_t max_s16)
+int16x8_t finalize_quantization_int16(
+ int32x4x2_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int16x8_t min_s16, int16x8_t max_s16)
{
- if(result_shift < 0)
+ if (result_shift < 0)
{
in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << -result_shift));
in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << -result_shift));
@@ -76,7 +75,7 @@ int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
// Convert S32 to S16
int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s16 = vmaxq_s16(out_s16, min_s16);
out_s16 = vminq_s16(out_s16, max_s16);
@@ -98,13 +97,14 @@ int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
* @return Quantized values
*/
template <bool is_bounded_relu>
-inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier,
- int32_t result_shift, int16_t min_s16, int16_t max_s16)
+inline int16_t finalize_quantization_int16(
+ int32_t in_value, int result_fixedpoint_multiplier, int32_t result_shift, int16_t min_s16, int16_t max_s16)
{
- if(result_shift < 0)
+ if (result_shift < 0)
{
- const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) * static_cast<int64_t>(result_fixedpoint_multiplier);
- in_value = static_cast<int32_t>((in_64 + (1 << 30)) >> 31);
+ const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) *
+ static_cast<int64_t>(result_fixedpoint_multiplier);
+ in_value = static_cast<int32_t>((in_64 + (1 << 30)) >> 31);
}
else
{
@@ -117,7 +117,7 @@ inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoi
// Bound the result
int16_t out_s16 = static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, in_value)));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s16 = static_cast<int16_t>(std::max(min_s16, std::min(max_s16, out_s16)));
}
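For reference, the multiply-and-shift pair above is the standard requantization step: the value is scaled by a Q0.31 fixed-point multiplier via a rounding high multiply, then rounding-shifted right. A scalar sketch of the non-negative-shift path (names are illustrative):

    #include <algorithm>
    #include <cstdint>

    // Scalar model: result ~= round(in * multiplier / 2^31 / 2^shift), saturated to int16.
    int16_t requantize_s16(int32_t in, int32_t multiplier, int32_t shift)
    {
        const int64_t prod    = static_cast<int64_t>(in) * static_cast<int64_t>(multiplier);
        const int32_t high    = static_cast<int32_t>((prod + (int64_t{1} << 30)) >> 31);       // rounding high mul
        const int32_t rounded = shift > 0 ? ((high + (1 << (shift - 1))) >> shift) : high;     // rounding shift
        return static_cast<int16_t>(std::max(-32768, std::min(32767, rounded)));
    }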
@@ -134,14 +134,9 @@ inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoi
*/
inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale)
{
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x2_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)
- }
- };
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x2_t vdequantized_input = {{vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)}};
return vdequantized_input;
}
@@ -156,18 +151,13 @@ inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale)
{
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x2_t rf =
- {
- {
+ const int32x4x2_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
+ vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
#else //__aarch64__
- vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
+ vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
#endif //__aarch64__
- }
- };
+ }};
return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
}
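A scalar round trip shows what the vquantize_int16 / vdequantize_int16 pair computes for symmetric 16-bit quantization (minimal sketch; names are illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // q = round(x / scale) saturated to int16 (vcvtnq + vqmovn); x' = q * scale.
    int16_t quantize_qsymm16(float x, float scale)
    {
        const float q = std::nearbyintf(x / scale); // round to nearest, ties to even by default
        return static_cast<int16_t>(std::max(-32768.0f, std::min(32767.0f, q)));
    }

    float dequantize_qsymm16(int16_t q, float scale)
    {
        return static_cast<float>(q) * scale; // reintroduces at most scale/2 of error
    }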
@@ -180,17 +170,14 @@ inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale)
*/
inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
- }
- };
+ const float scale = qi.scale;
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
+ }};
return vdequantized_input;
}
@@ -206,24 +193,20 @@ inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQua
const float scale = qi.scale;
ARM_COMPUTE_ERROR_ON(scale == 0.f);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+ vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+ vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+ vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+ vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
#else //__aarch64__
- vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+ vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+ vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+ vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+ vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
#endif //__aarch64__
- }
- };
- const qsymm16x8x2_t res =
- {
+ }};
+ const qsymm16x8x2_t res = {
vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])),
vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])),
};
diff --git a/src/core/NEON/SVEAsymm.h b/src/core/NEON/SVEAsymm.h
index 4b0ecd9eea..a448cde475 100644
--- a/src/core/NEON/SVEAsymm.h
+++ b/src/core/NEON/SVEAsymm.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,9 @@
#ifndef ARM_COMPUTE_SVEASYMM_H
#define ARM_COMPUTE_SVEASYMM_H
-#if defined(__ARM_FEATURE_SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -67,18 +68,21 @@ svint8_t svmla_qasymm8_signed_z(svbool_t pg, svint8_t vd, svfloat32_t vs, svfloa
*/
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svuint8_t &qv, float scale, int32_t offset)
{
- const auto voffset = svdup_n_s32(offset);
- const auto vscale = svdup_n_f32(scale);
- const svfloat32x4_t vdequantized_input =
- {
- { {
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)), vscale),
- }
- }
- };
+ const auto voffset = svdup_n_s32(offset);
+ const auto vscale = svdup_n_f32(scale);
+ const svfloat32x4_t vdequantized_input = svcreate4_f32(
+ svmul_f32_z(pg,
+ svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)),
+ vscale),
+ svmul_f32_z(pg,
+ svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)),
+ vscale),
+ svmul_f32_z(pg,
+ svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)),
+ vscale),
+ svmul_f32_z(pg,
+ svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)),
+ vscale));
return vdequantized_input;
}
@@ -106,18 +110,14 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svuint8_t &qv, const Unif
*/
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale, int32_t offset)
{
- const auto voffset = svdup_n_s32(offset);
- const auto vscale = svdup_n_f32(scale);
- const svfloat32x4_t vdequantized_input =
- {
- { {
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale),
- }
- }
- };
+ const auto voffset = svdup_n_s32(offset);
+ const auto vscale = svdup_n_f32(scale);
+ const svfloat32x4_t vdequantized_input = svcreate4_f32(
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale));
+
return vdequantized_input;
}
@@ -145,15 +145,11 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const Unifo
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svfloat32x4_t vscale)
{
const svfloat32x4_t vdequantized_input =
- {
- { {
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3)),
- }
- }
- };
+ svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3)));
+
return vdequantized_input;
}
@@ -168,15 +164,10 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale
{
const auto vscale = svdup_n_f32(scale);
const svfloat32x4_t vdequantized_input =
- {
- { {
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale),
- }
- }
- };
+ svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale));
return vdequantized_input;
}
@@ -258,5 +249,5 @@ inline svuint16x2_t svquantize_qasymm16_z(svbool_t pg, const svfloat32x4_t qv, c
}
} // namespace arm_compute
#include "src/core/NEON/SVEAsymm.inl"
-#endif /* defined(__ARM_FEATURE_SVE2) */
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
#endif // ARM_COMPUTE_NEASYMM_H
diff --git a/src/core/NEON/SVEAsymm.inl b/src/core/NEON/SVEAsymm.inl
index edf5733c36..e85cacd721 100644
--- a/src/core/NEON/SVEAsymm.inl
+++ b/src/core/NEON/SVEAsymm.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
namespace arm_compute
{
-#if defined(__ARM_FEATURE_SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
inline svuint8_t svmla_qasymm8_z(svbool_t pg, svuint8_t vd, svfloat32_t vs, svfloat32_t vo)
{
// Convert uint8 vectors to uint16 vectors
@@ -101,5 +101,5 @@ inline svint8_t svmla_qasymm8_signed_z(svbool_t pg, svint8_t vd, svfloat32_t vs,
const auto res = svqxtnt_s16(svqxtnb_s16(vd_low_s16), vd_high_s16);
return res;
}
-#endif /* (__ARM_FEATURE_SVE2) */
+#endif /* (ARM_COMPUTE_ENABLE_SVE2) */
} // namespace arm_compute
diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h
index dde75e8088..49ed9df720 100644
--- a/src/core/NEON/SVEMath.h
+++ b/src/core/NEON/SVEMath.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,13 +21,14 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_SVEMATH_H
-#define ARM_COMPUTE_SVEMATH_H
+#ifndef ACL_SRC_CORE_NEON_SVEMATH_H
+#define ACL_SRC_CORE_NEON_SVEMATH_H
-#if defined(ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "src/core/NEON/wrapper/intrinsics/svcvt.h"
#include "src/core/NEON/wrapper/intrinsics/svdup_n.h"
#include "src/core/NEON/wrapper/intrinsics/svreinterpret.h"
+
#include <arm_sve.h>
#include <array>
@@ -95,6 +96,19 @@ svfloat16_t svtanh_f16_z(svbool_t pg, svfloat16_t val);
*/
svfloat16_t svexp_f16_z(svbool_t pg, svfloat16_t x);
+#ifdef ARM_COMPUTE_ENABLE_SVE2
+
+/** Calculate exponential
+ *
+ * @param[in] pg Input predicate.
+ * @param[in] x Input vector value in F16 format.
+ *
+ * @return The calculated exponent.
+ */
+svfloat16_t svexp_f16_z_sve2(svbool_t pg, svfloat16_t x);
+
+#endif // ARM_COMPUTE_ENABLE_SVE2
+
/** Calculate reciprocal.
*
* @param[in] pg Input predicate.
@@ -113,6 +127,19 @@ svfloat16_t svinv_f16_z(svbool_t pg, svfloat16_t x);
*/
svfloat16_t svlog_f16_z(svbool_t pg, svfloat16_t x);
+#ifdef ARM_COMPUTE_ENABLE_SVE2
+
+/** Calculate logarithm
+ *
+ * @param[in] pg Input predicate.
+ * @param[in] x  Input vector value in F16 format.
+ *
+ * @return The calculated logarithm.
+ */
+svfloat16_t svlog_f16_z_sve2(svbool_t pg, svfloat16_t x);
+
+#endif // ARM_COMPUTE_ENABLE_SVE2
+
/** Calculate inverse square root.
*
* @param[in] pg Input predicate.
@@ -147,6 +174,19 @@ svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val);
*/
svfloat16_t svsin_f16_z(svbool_t pg, svfloat16_t val);
+#ifdef ARM_COMPUTE_ENABLE_SVE2
+
+/** Calculate sine.
+ *
+ * @param[in] pg Input predicate.
+ * @param[in] val Input vector value in radians, F16 format.
+ *
+ * @return The calculated sine.
+ */
+svfloat16_t svsin_f16_z_sve2(svbool_t pg, svfloat16_t val);
+
+#endif // ARM_COMPUTE_ENABLE_SVE2
+
/** Calculate n power of a number.
*
* pow(x,n) = e^(n*log(x))
@@ -171,6 +211,22 @@ svfloat32_t svpow_f32_z(svbool_t pg, svfloat32_t a, svfloat32_t b);
*/
svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b);
+#ifdef ARM_COMPUTE_ENABLE_SVE2
+
+/** Calculate n power of a number.
+ *
+ * pow(x,n) = e^(n*log(x))
+ *
+ * @param[in] pg Input predicate.
+ * @param[in] a Input vector value in F16 format.
+ * @param[in] b Powers to raise the input to.
+ *
+ * @return The calculated power.
+ */
+svfloat16_t svpow_f16_z_sve2(svbool_t pg, svfloat16_t a, svfloat16_t b);
+
+#endif // ARM_COMPUTE_ENABLE_SVE2
+
/** Convert and pack four 32-bit float vectors into an 8-bit integer vector
*
* @param[in] in_0 The first float vector
@@ -181,9 +237,12 @@ svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b);
* @return The converted integer vector
*/
template <typename int_vec_type>
-int_vec_type convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3);
+int_vec_type convert_float_to_int(const svfloat32_t &in_0,
+ const svfloat32_t &in_1,
+ const svfloat32_t &in_2,
+ const svfloat32_t &in_3);
} // namespace arm_compute
#include "src/core/NEON/SVEMath.inl"
-#endif /* defined(ENABLE_SVE) */
-#endif /* ARM_COMPUTE_SVEMATH_H */ \ No newline at end of file
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+#endif // ACL_SRC_CORE_NEON_SVEMATH_H
diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl
index 7625e5be34..fdf94f0859 100644
--- a/src/core/NEON/SVEMath.inl
+++ b/src/core/NEON/SVEMath.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,14 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+
+#ifndef ACL_SRC_CORE_NEON_SVEMATH_INL
+#define ACL_SRC_CORE_NEON_SVEMATH_INL
+
#include <cmath>
#include <limits>
-#if defined(__ARM_FEATURE_SVE) && defined(ENABLE_SVE)
+#if defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE)
#ifndef M_PI
#define M_PI (3.14159265358979323846)
@@ -32,8 +36,16 @@
namespace arm_compute
{
-inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t coeff_1, svfloat32_t coeff_2, svfloat32_t coeff_3,
- svfloat32_t coeff_4, svfloat32_t coeff_5, svfloat32_t coeff_6, svfloat32_t coeff_7, svfloat32_t coeff_8)
+inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg,
+ svfloat32_t x,
+ svfloat32_t coeff_1,
+ svfloat32_t coeff_2,
+ svfloat32_t coeff_3,
+ svfloat32_t coeff_4,
+ svfloat32_t coeff_5,
+ svfloat32_t coeff_6,
+ svfloat32_t coeff_7,
+ svfloat32_t coeff_8)
{
const auto A = svmla_f32_z(pg, coeff_1, coeff_5, x);
const auto B = svmla_f32_z(pg, coeff_3, coeff_7, x);
@@ -45,8 +57,16 @@ inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t c
return res;
}
-inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, svfloat16_t x, svfloat16_t coeff_1, svfloat16_t coeff_2, svfloat16_t coeff_3,
- svfloat16_t coeff_4, svfloat16_t coeff_5, svfloat16_t coeff_6, svfloat16_t coeff_7, svfloat16_t coeff_8)
+inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg,
+ svfloat16_t x,
+ svfloat16_t coeff_1,
+ svfloat16_t coeff_2,
+ svfloat16_t coeff_3,
+ svfloat16_t coeff_4,
+ svfloat16_t coeff_5,
+ svfloat16_t coeff_6,
+ svfloat16_t coeff_7,
+ svfloat16_t coeff_8)
{
const auto A = svmla_f16_z(pg, coeff_1, coeff_5, x);
const auto B = svmla_f16_z(pg, coeff_3, coeff_7, x);
@@ -74,67 +94,104 @@ inline svfloat32_t svinv_f32_z(svbool_t pg, svfloat32_t x)
return recip;
}
+static const uint32_t svexp_f32_coeff[] = {
+ 0x3f7ffff6, // x^1: 0x1.ffffecp-1f
+ 0x3efffedb, // x^2: 0x1.fffdb6p-2f
+ 0x3e2aaf33, // x^3: 0x1.555e66p-3f
+ 0x3d2b9f17, // x^4: 0x1.573e2ep-5f
+ 0x3c072010, // x^5: 0x1.0e4020p-7f
+};
+
inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x)
{
- const auto CONST_LN2 = svdup_n_f32(0.6931471805f); // ln(2)
- const auto CONST_INV_LN2 = svdup_n_f32(1.4426950408f); // 1/ln(2)
- const auto CONST_INF = svdup_n_f32(std::numeric_limits<float>::infinity());
- const auto CONST_MAX_INPUT = svdup_n_f32(88.7f);
- const auto CONST_0 = svdup_n_f32(0.f);
- const auto CONST_NEGATIVE_126 = svdup_n_s32(-126);
-
- /** Exponent polynomial coefficients */
- const svfloat32_t exp_tab_1 = svdup_n_f32(1.f);
- const svfloat32_t exp_tab_2 = svdup_n_f32(0.0416598916054f);
- const svfloat32_t exp_tab_3 = svdup_n_f32(0.500000596046f);
- const svfloat32_t exp_tab_4 = svdup_n_f32(0.0014122662833f);
- const svfloat32_t exp_tab_5 = svdup_n_f32(1.00000011921f);
- const svfloat32_t exp_tab_6 = svdup_n_f32(0.00833693705499f);
- const svfloat32_t exp_tab_7 = svdup_n_f32(0.166665703058f);
- const svfloat32_t exp_tab_8 = svdup_n_f32(0.000195780929062f);
-
- // Perform range reduction [-log(2),log(2)]
- auto m = svcvt_s32_f32_z(pg, svmul_f32_z(pg, x, CONST_INV_LN2));
- auto val = svmls_f32_z(pg, x, svcvt_f32_s32_z(pg, m), CONST_LN2);
-
- // Polynomial Approximation
- auto poly = svtaylor_poly_f32_z(pg, val, exp_tab_1, exp_tab_2, exp_tab_3, exp_tab_4, exp_tab_5, exp_tab_6, exp_tab_7, exp_tab_8);
+ const auto c1 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[0]));
+ const auto c2 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[1]));
+ const auto c3 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[2]));
+ const auto c4 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[3]));
+ const auto c5 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[4]));
+
+ const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
+ const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
+ const auto neg_ln2_hi =
+ svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f
+ const auto neg_ln2_lo =
+ svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
+
+ const auto inf = svdup_n_f32(std::numeric_limits<float>::infinity());
+ const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5)
+ const auto zero = svdup_n_f32(0.f);
+ const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125)
+
+ // Range reduction:
+ // e^x = 2^n * e^r
+ // where:
+ // n = floor(x / ln(2))
+ // r = x - n * ln(2)
+ //
+    // By adding 2^23 + 127 (shift) to x / ln(2):
+    //   * As the FP32 fraction part has only 23 bits, the addition of 2^23 + 127 forces the decimal part
+    //     of x / ln(2) out of the result. The integer part of x / ln(2) (i.e. n) + 127 will occupy
+    //     the whole fraction part of z in FP32 format.
+    //     Subtracting 2^23 + 127 (shift) from z will result in the integer part of x / ln(2)
+    //     (i.e. n) because the decimal part has been pushed out and lost.
+    //   * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent
+    //     in FP32 format. Left shifting z by 23 bits will result in 2^n.
+ const auto z = svmla_f32_z(pg, shift, x, inv_ln2);
+ const auto n = svsub_f32_z(pg, z, shift);
+ const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n
+
+ // The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32.
+    // This outperforms a longer Taylor series (3-4 more terms) both in terms of accuracy and performance.
+ const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi);
+ const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo);
+
+ // Compute the truncated Taylor series of e^r.
+ // poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5)
+ const auto r2 = svmul_f32_z(pg, r, r);
+
+ const auto p1 = svmul_f32_z(pg, c1, r);
+ const auto p23 = svmla_f32_z(pg, c2, c3, r);
+ const auto p45 = svmla_f32_z(pg, c4, c5, r);
+ const auto p2345 = svmla_f32_z(pg, p23, p45, r2);
+ const auto p12345 = svmla_f32_z(pg, p1, p2345, r2);
+
+ auto poly = svmla_f32_z(pg, scale, p12345, scale);
+
+ // Handle underflow and overflow.
+ poly = svsel_f32(svcmplt_f32(pg, x, min_input), zero, poly);
+ poly = svsel_f32(svcmpgt_f32(pg, x, max_input), inf, poly);
- // Reconstruct
- poly = svreinterpret_f32_s32(svqadd_s32(svreinterpret_s32_f32(poly), svlsl_n_s32_z(pg, m, 23)));
+ return poly;
+}
- // Handle underflow
- svbool_t ltpg = svcmplt_s32(pg, m, CONST_NEGATIVE_126);
- poly = svsel_f32(ltpg, CONST_0, poly);
+inline svfloat16_t svexp_f16_z(svbool_t pg, svfloat16_t x)
+{
+ auto bottom = svcvt_f32_z(pg, x);
+ auto pg_top = svptrue_b16();
+ auto top = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(x))));
- // Handle overflow
- svbool_t gtpg = svcmpgt_f32(pg, x, CONST_MAX_INPUT);
- poly = svsel_f32(gtpg, CONST_INF, poly);
+ bottom = svexp_f32_z(pg, bottom);
+ top = svexp_f32_z(pg_top, top);
- return poly;
+ return svtrn1(svcvt_f16_z(pg, bottom), svcvt_f16_z(pg_top, top));
}
-inline svfloat16_t svexp_f16_z(svbool_t pg, svfloat16_t x)
+#ifdef ARM_COMPUTE_ENABLE_SVE2
+
+inline svfloat16_t svexp_f16_z_sve2(svbool_t pg, svfloat16_t x)
{
auto bottom = svcvt_f32_z(pg, x);
-#if defined(__ARM_FEATURE_SVE2)
auto top = svcvtlt_f32_x(pg, x);
auto pg_top = pg;
-#else /* defined(__ARM_FEATURE_SVE2) */
- auto pg_top = svptrue_b16();
- auto top = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(x))));
-#endif /* defined(__ARM_FEATURE_SVE2) */
bottom = svexp_f32_z(pg, bottom);
top = svexp_f32_z(pg_top, top);
-#if defined(__ARM_FEATURE_SVE2)
return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top);
-#else /* defined(__ARM_FEATURE_SVE2) */
- return svtrn1(svcvt_f16_z(pg, bottom), svcvt_f16_z(pg_top, top));
-#endif /* defined(__ARM_FEATURE_SVE2) */
}
+#endif // ARM_COMPUTE_ENABLE_SVE2
+
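Both F16 variants follow the same scheme: split the half-precision lanes into even ("bottom") and odd ("top") sets, widen each to F32, evaluate there, and re-interleave. The baseline path does this with svcvt/svrevh/svtrn1; the SVE2 path uses svcvtlt/svcvtnt. A conceptual scalar model (illustrative, with F16 lanes stood in for by floats):

    #include <cmath>

    // Conceptual model of the even/odd lane split used by svexp_f16_z and friends.
    void exp_f16_lanes(const float h[8], float out[8])
    {
        float bottom[4], top[4];
        for (int i = 0; i < 4; ++i)
        {
            bottom[i] = h[2 * i];     // even lanes
            top[i]    = h[2 * i + 1]; // odd lanes
        }
        for (int i = 0; i < 4; ++i)
        {
            bottom[i] = std::exp(bottom[i]); // evaluated at F32 precision
            top[i]    = std::exp(top[i]);
        }
        for (int i = 0; i < 4; ++i) // re-interleave (svtrn1 / svcvtnt in the real code)
        {
            out[2 * i]     = bottom[i];
            out[2 * i + 1] = top[i];
        }
    }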
inline svfloat32_t svtanh_f32_z(svbool_t pg, svfloat32_t val)
{
const svfloat32_t CONST_1 = svdup_n_f32(1.f);
@@ -185,7 +242,8 @@ inline svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x)
auto val = svreinterpret_f32_s32(svsub_s32_z(pg, svreinterpret_s32_f32(x), svlsl_n_s32_z(pg, m, 23)));
// Polynomial Approximation
- auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6, log_tab_7, log_tab_8);
+ auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6,
+ log_tab_7, log_tab_8);
// Reconstruct
poly = svmla_f32_z(pg, poly, svcvt_f32_s32_z(pg, m), CONST_LN2);
@@ -196,24 +254,31 @@ inline svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x)
inline svfloat16_t svlog_f16_z(svbool_t pg, svfloat16_t x)
{
auto bottom = svcvt_f32_z(pg, x);
-#if defined(__ARM_FEATURE_SVE2)
- auto top = svcvtlt_f32_x(pg, x);
- auto pg_top = pg;
-#else /* defined(__ARM_FEATURE_SVE2) */
auto pg_top = svptrue_b16();
auto top = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(x))));
-#endif /* defined(__ARM_FEATURE_SVE2) */
bottom = svlog_f32_z(pg, bottom);
top = svlog_f32_z(pg_top, top);
-#if defined(__ARM_FEATURE_SVE2)
- return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top);
-#else /* defined(__ARM_FEATURE_SVE2) */
return svtrn1(svcvt_f16_z(pg, bottom), svcvt_f16_z(pg_top, top));
-#endif /* defined(__ARM_FEATURE_SVE2) */
}
+#ifdef ARM_COMPUTE_ENABLE_SVE2
+
+inline svfloat16_t svlog_f16_z_sve2(svbool_t pg, svfloat16_t x)
+{
+ auto bottom = svcvt_f32_z(pg, x);
+ auto top = svcvtlt_f32_x(pg, x);
+ auto pg_top = pg;
+
+ bottom = svlog_f32_z(pg, bottom);
+ top = svlog_f32_z(pg_top, top);
+
+ return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top);
+}
+
+#endif // ARM_COMPUTE_ENABLE_SVE2
+
inline svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val)
{
using ScalarType = float;
@@ -231,7 +296,8 @@ inline svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val)
//Find positive or negative
const auto c_v = svabs_z(pg, wrapper::svcvt_z<int32_t>(pg, svmul_z(pg, val, ipi_v)));
const auto sign_v = svcmple(pg, val, wrapper::svdup_n(ScalarType(0)));
- const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret<IntType>(c_v), wrapper::svdup_n(IntType(1))), wrapper::svdup_n(IntType(0)));
+ const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret<IntType>(c_v), wrapper::svdup_n(IntType(1))),
+ wrapper::svdup_n(IntType(0)));
auto neg_v = sveor_z(pg, odd_v, sign_v);
@@ -269,24 +335,31 @@ inline svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val)
inline svfloat16_t svsin_f16_z(svbool_t pg, svfloat16_t val)
{
auto bottom = svcvt_f32_z(pg, val);
-#if defined(__ARM_FEATURE_SVE2)
- auto top = svcvtlt_f32_x(pg, val);
- auto pg_top = pg;
-#else /* defined(__ARM_FEATURE_SVE2) */
auto pg_top = svptrue_b16();
auto top = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(val))));
-#endif /* defined(__ARM_FEATURE_SVE2) */
bottom = svsin_f32_z(pg, bottom);
top = svsin_f32_z(pg_top, top);
-#if defined(__ARM_FEATURE_SVE2)
- return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top);
-#else /* defined(__ARM_FEATURE_SVE2) */
return svtrn1(svcvt_f16_z(pg, bottom), svcvt_f16_z(pg_top, top));
-#endif /* defined(__ARM_FEATURE_SVE2) */
}
+#ifdef ARM_COMPUTE_ENABLE_SVE2
+
+inline svfloat16_t svsin_f16_z_sve2(svbool_t pg, svfloat16_t val)
+{
+ auto bottom = svcvt_f32_z(pg, val);
+ auto top = svcvtlt_f32_x(pg, val);
+ auto pg_top = pg;
+
+ bottom = svsin_f32_z(pg, bottom);
+ top = svsin_f32_z(pg_top, top);
+
+ return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top);
+}
+
+#endif // ARM_COMPUTE_ENABLE_SVE2
+
inline svfloat32_t svpow_f32_z(svbool_t pg, svfloat32_t a, svfloat32_t b)
{
return svexp_f32_z(pg, svmul_z(pg, b, svlog_f32_z(pg, a)));
@@ -297,29 +370,41 @@ inline svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b)
auto a_bottom = svcvt_f32_z(pg, a);
auto b_bottom = svcvt_f32_z(pg, b);
-#if defined(__ARM_FEATURE_SVE2)
- auto pg_top = pg;
- auto a_top = svcvtlt_f32_x(pg, a);
- auto b_top = svcvtlt_f32_x(pg, b);
-#else /* defined(__ARM_FEATURE_SVE2) */
auto pg_top = svptrue_b16();
auto a_top = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(a))));
auto b_top = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(b))));
-#endif /* defined(__ARM_FEATURE_SVE2) */
auto res_bottom = svpow_f32_z(pg, a_bottom, b_bottom);
auto res_top = svpow_f32_z(pg_top, a_top, b_top);
-#if defined(__ARM_FEATURE_SVE2)
- return svcvtnt_f16_m(svcvt_f16_z(pg, res_bottom), pg_top, res_top);
-#else /* defined(__ARM_FEATURE_SVE2) */
return svtrn1(svcvt_f16_z(pg, res_bottom), svcvt_f16_z(pg_top, res_top));
-#endif /* defined(__ARM_FEATURE_SVE2) */
}
-#if defined(__ARM_FEATURE_SVE2)
+#ifdef ARM_COMPUTE_ENABLE_SVE2
+
+inline svfloat16_t svpow_f16_z_sve2(svbool_t pg, svfloat16_t a, svfloat16_t b)
+{
+ auto a_bottom = svcvt_f32_z(pg, a);
+ auto b_bottom = svcvt_f32_z(pg, b);
+
+ auto pg_top = pg;
+ auto a_top = svcvtlt_f32_x(pg, a);
+ auto b_top = svcvtlt_f32_x(pg, b);
+
+ auto res_bottom = svpow_f32_z(pg, a_bottom, b_bottom);
+ auto res_top = svpow_f32_z(pg_top, a_top, b_top);
+
+ return svcvtnt_f16_m(svcvt_f16_z(pg, res_bottom), pg_top, res_top);
+}
+
+#endif // ARM_COMPUTE_ENABLE_SVE2
+
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
template <>
-inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3)
+inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0,
+ const svfloat32_t &in_1,
+ const svfloat32_t &in_2,
+ const svfloat32_t &in_3)
{
svuint8_t out;
const auto all_true_pg = svptrue_b32();
@@ -353,7 +438,10 @@ inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const
}
template <>
-inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3)
+inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0,
+ const svfloat32_t &in_1,
+ const svfloat32_t &in_2,
+ const svfloat32_t &in_3)
{
svint8_t out;
const auto all_true_pg = svptrue_b32();
@@ -385,7 +473,9 @@ inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0, const sv
return out;
}
-#endif /* defined(__ARM_FEATURE_SVE2) */
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
} // namespace arm_compute
-#endif /* defined(ENABLE_SVE) */
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+
+#endif // ACL_SRC_CORE_NEON_SVEMATH_INL
diff --git a/src/core/NEON/SVESymm.h b/src/core/NEON/SVESymm.h
index 30e1e172a3..288d45d979 100644
--- a/src/core/NEON/SVESymm.h
+++ b/src/core/NEON/SVESymm.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,8 +26,9 @@
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#if defined(__ARM_FEATURE_SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -44,13 +45,8 @@ inline svfloat32x2_t svdequantize_qsymm16_z(svbool_t pg, const svint16_t &qv, fl
{
const auto vscale = svdup_n_f32(scale);
const svfloat32x2_t vdequantized_input =
- {
- { {
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale)
- }
- }
- };
+ svcreate2_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale));
return vdequantized_input;
}
@@ -86,15 +82,10 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint16x2_t qv, const Uni
const float scale = qi.scale;
const auto vscale = svdup_n_f32(scale);
const svfloat32x4_t vdequantized_input =
- {
- { {
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale),
- }
- }
- };
+ svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale));
return vdequantized_input;
}
@@ -123,5 +114,5 @@ inline svint16x2_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x4_t qv, con
}
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE2) */
-#endif // ARM_COMPUTE_NESYMM_H \ No newline at end of file
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+#endif // ARM_COMPUTE_NESYMM_H
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 92000bb2f6..717fd11485 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,18 +28,17 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
+#include "src/core/NEON/kernels/batchnormalization/impl/list.h"
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/NEON/kernels/batchnormalization/impl/list.h"
-#include "src/core/common/Registrars.h"
-
#include <map>
namespace arm_compute
@@ -48,11 +47,19 @@ namespace
{
struct BatchNormalizationSelectorData
{
- DataType dt;
+ DataType dt;
+ const CPUInfo &ci;
};
using BatchNormalizationSelectorPtr = std::add_pointer<bool(const BatchNormalizationSelectorData &data)>::type;
-using BatchNormalizationKernelPtr = std::add_pointer<void(ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const ITensor *,
- float, ActivationLayerInfo &, const Window &)>::type;
+using BatchNormalizationKernelPtr = std::add_pointer<void(ITensor *,
+ ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ float,
+ ActivationLayerInfo &,
+ const Window &)>::type;
struct BatchNormalizationKernel
{
@@ -61,41 +68,32 @@ struct BatchNormalizationKernel
BatchNormalizationKernelPtr ukernel;
};
-static const BatchNormalizationKernel available_kernels[] =
-{
-#if defined(ENABLE_SVE)
- {
- "fp16_sve_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)
- },
- {
- "f32_sve_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)
- },
-#endif /* !defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
+static const BatchNormalizationKernel available_kernels[] = {
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+ {"sve_fp16_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
+ REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)},
+ {"sve_fp32_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
+ REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)},
+#endif /* !defined(ARM_COMPUTE_ENABLE_SVE) */
+#if defined(ARM_COMPUTE_ENABLE_NEON)
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "fp16_neon_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)
- },
+ {"neon_fp16_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)},
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- {
- "f32_neon_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)
- },
-#endif /* !defined(ENABLE_NEON) */
+ {"neon_fp32_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)},
+#endif /* !defined(ARM_COMPUTE_ENABLE_NEON) */
};
const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -103,25 +101,31 @@ const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelec
return nullptr;
}
-Status
-validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_UNUSED(epsilon);
- const auto *uk = get_implementation(BatchNormalizationSelectorData{ input->data_type() });
+ const auto *uk = get_implementation(BatchNormalizationSelectorData{input->data_type(), CPUInfo::get()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- if(act_info.enabled())
+ if (act_info.enabled())
{
ActivationLayerInfo::ActivationFunction act = act_info.activation();
- ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU &&
+ act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ act !=
+ ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
}
- if(nullptr != output)
+ if (nullptr != output)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -130,139 +134,32 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
- if(beta != nullptr)
+ if (beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
}
- if(gamma != nullptr)
+ if (gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
return Status{};
}
} //namespace
-template <typename T, bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win_to_use = window;
- win_to_use.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(_input, win_to_use);
- Iterator output(_output, win_to_use);
-
- F activation_functor(_act_info);
-
- // Hold information about the current feature map we are iterating.
- // Only compute denominator and constants once per feature map.
- int slice = -1;
-
- const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
- T mean = static_cast<T>(0);
- T var = static_cast<T>(0);
- T gamma = static_cast<T>(1);
- T beta = static_cast<T>(0);
- T denominator = static_cast<T>(0);
-
- auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- auto var_vec = wrapper::vdup_n(var, ExactTagType{});
- auto gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
- auto beta_vec = wrapper::vdup_n(beta, ExactTagType{});
- auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{});
- const auto epsilon_vec = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
- execute_window_loop(win_to_use, [&](const Coordinates & id)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- if(slice != id.z())
- {
- mean = input_mean[id.z()];
- var = input_var[id.z()];
- mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- var_vec = wrapper::vdup_n(var, ExactTagType{});
- if(input_gamma != nullptr)
- {
- gamma = input_gamma[id.z()];
- gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
- }
- if(input_beta != nullptr)
- {
- beta = input_beta[id.z()];
- beta_vec = wrapper::vdup_n(beta, ExactTagType{});
- }
-
- // Calculate denominator
- denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
- denominator = wrapper::vgetlane(denominator_vec, 0);
- slice = id.z();
- }
-
- // Perform core calculations using vector operations
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Calculate x bar
- const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
- const auto x_bar = wrapper::vmul(numerator, denominator_vec);
- auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if(fused_activation)
- {
- activation_functor(res);
- }
-
- // Store results
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const T numerator = input_ptr[x] - mean;
- const T x_bar = numerator * denominator;
- T res = beta + x_bar * gamma;
-
- // Perform fused activation
- if(fused_activation)
- {
- activation_functor(res);
- }
-
- // Store results
- *(output_ptr + x) = res;
- }
- },
- input, output);
-}
-
void NEBatchNormalizationLayerKernel::configure_non_fused()
{
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>;
+ _func = REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused);
break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, false, detail::dummy<float, 4>>;
+ _func = REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused);
break;
default:
ARM_COMPUTE_ERROR("Element size not supported");
@@ -273,29 +170,28 @@ void NEBatchNormalizationLayerKernel::configure_non_fused()
void NEBatchNormalizationLayerKernel::configure_fused()
{
// NCHW Fused Batched Normalization with activation functions : FP32
- static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
- {
- { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>> }
- };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- // NCHW Fused Batched Normalization with activation functions : FP16
- static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw =
- {
- { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>> }
- };
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw = {
+ {ActivationLayerInfo::ActivationFunction::RELU,
+ REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused_relu)},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused_brelu)},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused_lubrelu)}};
- switch(_input->info()->data_type())
+ // NCHW Fused Batched Normalization with activation functions : FP16
+ static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw = {
+ {ActivationLayerInfo::ActivationFunction::RELU,
+ REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused_relu)},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused_brelu)},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused_lubrelu)}};
+
+ switch (_input->info()->data_type())
{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
_func = bn_fused_map_f16_nchw[_act_info.activation()];
break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
_func = bn_fused_map_f32_nchw[_act_info.activation()];
break;
@@ -306,22 +202,32 @@ void NEBatchNormalizationLayerKernel::configure_fused()
}
NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info()
+ : _func(nullptr),
+ _input(nullptr),
+ _output(nullptr),
+ _mean(nullptr),
+ _var(nullptr),
+ _gamma(nullptr),
+ _beta(nullptr),
+ _epsilon(),
+ _act_info()
{
}
-void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
- const ITensor *mean, const ITensor *var,
- const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void NEBatchNormalizationLayerKernel::configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
- mean->info(), var->info(),
- (beta != nullptr) ? beta->info() : nullptr,
- (gamma != nullptr) ? gamma->info() : nullptr,
- epsilon, act_info));
+ mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
+ (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
_input = input;
_output = input;
@@ -333,16 +239,16 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
_act_info = act_info;
const bool run_in_place = (output == nullptr) || (output == input);
- if(!run_in_place)
+ if (!run_in_place)
{
_output = output;
}
// Configure activation function to run
const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW;
- if(is_nchw)
+ if (is_nchw)
{
- if(_act_info.enabled())
+ if (_act_info.enabled())
{
configure_fused();
}
@@ -356,17 +262,21 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
Window win = calculate_max_window(*input->info(), Steps());
INEKernel::configure(win);
- if(output != nullptr)
+ if (output != nullptr)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
}
}
-Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
@@ -381,13 +291,14 @@ void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_ERROR_ON(_func == nullptr && _input->info()->data_layout() == DataLayout::NCHW);
const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW;
- if(is_nchw)
+ if (is_nchw)
{
- (this->*_func)(window);
+ (*_func)(window, _input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info);
}
else
{
- const auto *uk = get_implementation(BatchNormalizationSelectorData{ _input->info()->data_type() });
+ const auto *uk =
+ get_implementation(BatchNormalizationSelectorData{_input->info()->data_type(), CPUInfo::get()});
uk->ukernel(_input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info, window);
}
}
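For reference, every ukernel registered in the table above computes the same batch normalization, optionally fused with a ReLU-family activation. A plain scalar sketch of that math (hypothetical helper, not the library's kernel):

#include <cmath>
#include <cstddef>

// One channel (feature map) at a time: mean/var/beta/gamma are per-channel
// scalars, matching the NCHW path this commit moves out into cpu:: ukernels.
void batch_norm_ref(const float *in, float *out, std::size_t n,
                    float mean, float var, float beta, float gamma, float epsilon)
{
    const float denominator = 1.0f / std::sqrt(var + epsilon);
    for (std::size_t i = 0; i < n; ++i)
    {
        const float x_bar = (in[i] - mean) * denominator; // normalize
        out[i]            = beta + x_bar * gamma;         // scale and shift
        // A fused activation (RELU/BOUNDED_RELU/LU_BOUNDED_RELU) would clamp out[i] here.
    }
}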
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index 9312073ce8..679ade0fae 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NEBATCHNORMALIZATIONLAYERKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NEBATCHNORMALIZATIONLAYERKERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "src/core/NEON/INEKernel.h"
@@ -67,7 +69,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta = nullptr,
+ const ITensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel
*
@@ -84,10 +92,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -98,31 +110,19 @@ private:
/** Configure execution function in case of fused activation **/
void configure_fused();
- /** Template function to run batch normalization on fp32
- *
- * @tparam T Specialization data type
- * @tparam fused_activation Boolean that flags if its a fused activation or not
- * @tparam F Activation function functor to run
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T, bool fused_activation, typename F>
- void batch_normalization_nchw(const Window &window);
- /** Template function to run batch normalization on fp32 on tensors with NHWC format
- *
- * @tparam T Specialization data type
- * @tparam fused_activation Boolean that flags if its a fused activation or not
- * @tparam F Activation function functor to run
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T, bool fused_activation, typename F>
- void batch_normalization_nhwc(const Window &window);
/** Common signature for all the batch normalization functions
*
* @param[in] window Region on which to execute the kernel.
*/
- using BatchNormFunctionPtr = void (NEBatchNormalizationLayerKernel::*)(const Window &window);
+ using BatchNormFunctionPtr = void (*)(const Window &window,
+ ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info);
private:
BatchNormFunctionPtr _func;
@@ -136,4 +136,4 @@ private:
ActivationLayerInfo _act_info;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NEBATCHNORMALIZATIONLAYERKERNEL_H
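The header change completes the refactor from a member-function pointer (bound to the kernel object) to a free-function pointer that receives all tensors explicitly, so the NCHW and NHWC paths can share one selection table. A reduced sketch of that registry pattern, with hypothetical names:

#include <cstddef>

struct SelectorData
{
    int  data_type;
    bool has_sve;
};

struct KernelEntry
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    void (*ukernel)(); // stands in for the full tensor/window signature
};

// First match wins, so more specialised entries (e.g. SVE) are listed first.
const KernelEntry *select(const KernelEntry *table, std::size_t n, const SelectorData &d)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        if (table[i].is_selected(d))
        {
            return &table[i];
        }
    }
    return nullptr;
}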
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
index 10207b9cf6..f299bb94a4 100644
--- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,9 +25,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -45,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -53,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output)
+Status validate_arguments_static(const ITensorInfo *input,
+ int block_shape_x,
+ int block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
@@ -64,16 +70,15 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape_x * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape_y * input->tensor_shape()[idx_height]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] != input->tensor_shape()[idx_channel]);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ const TensorShape expected_output_shape = compute_batch_to_space_shape(
+ input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
+ const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output);
}
return Status{};
@@ -81,7 +86,13 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
} // namespace
NEBatchToSpaceLayerKernel::NEBatchToSpaceLayerKernel()
- : _input(nullptr), _block_shape(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _block_shape_x(), _block_shape_y()
+ : _input(nullptr),
+ _block_shape(nullptr),
+ _output(nullptr),
+ _data_layout(DataLayout::UNKNOWN),
+ _block_shape_x(),
+ _block_shape_y(),
+ _crop_info()
{
}
@@ -96,42 +107,51 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, const ITensor *b
_data_layout = input->info()->data_layout();
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
+ Window win = calculate_max_window(*output->info(), Steps());
ICPPKernel::configure(win);
}
-void NEBatchToSpaceLayerKernel::configure(const ITensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ITensor *output)
+void NEBatchToSpaceLayerKernel::configure(
+ const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_batch_to_space_shape(input->info(), block_shape_x, block_shape_y);
- // Output auto inizialitation if not yet initialized
+ const TensorShape output_shape = compute_batch_to_space_shape(
+ input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
+ // Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
_input = input;
_output = output;
_block_shape_x = block_shape_x;
_block_shape_y = block_shape_y;
_data_layout = input->info()->data_layout();
+ _crop_info = crop_info;
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
+ Window win = calculate_max_window(*output->info(), Steps());
ICPPKernel::configure(win);
}
-Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
return Status{};
}
-Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output)
+Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info));
return Status{};
}
@@ -141,73 +161,76 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- if(_block_shape != nullptr)
+ if (_block_shape != nullptr)
{
// Retrieve the block shapes dynamically
_block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
_block_shape_y = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(1)));
}
- const int batch_size = _input->info()->dimension(3);
- const int r = (batch_size / (_block_shape_x * _block_shape_y));
- const int element_size = _input->info()->element_size();
-
- Window slice_in = window.first_slice_window_3D();
- Window slice_out = window.first_slice_window_4D();
+ const int batch_size = _output->info()->dimension(3);
+ const int element_size = _output->info()->element_size();
- // The slice_out slice does not move
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_out.set(3, Window::Dimension(0, 0, 0));
+ Window slice_out = window.first_slice_window_3D();
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
do
{
- Iterator in(_input, slice_in);
- execute_window_loop(slice_in, [&](const Coordinates & id)
- {
-
- const int x = id.x();
- const int y = id.y();
- const int z = id.z();
-
- const int w = batch_id % r;
- const int out_x = x * _block_shape_x + (batch_id / r) % _block_shape_x;
- const int out_y = y * _block_shape_y + (batch_id / r) / _block_shape_x;
- Coordinates output_coords{ out_x, out_y, z, w };
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
+ Iterator out(_output, slice_out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const int x = id.x();
+ const int y = id.y();
+ const int z = id.z();
+ // Translate x, y to uncropped version
+ const int x_c = x + _crop_info.left;
+ const int y_c = y + _crop_info.top;
+
+ const int in_batch =
+ batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
+ const int in_x = x_c / _block_shape_x;
+ const int in_y = y_c / _block_shape_y;
+ Coordinates input_coords{in_x, in_y, z, in_batch};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_in));
+ } while (window.slide_window_slice_3D(slice_out));
}
else
{
+ // For NHWC we can perform a block copy on the Channel (first) dimension. Thus we do not need to iterate over this dimension
+ slice_out.set(0U, Window::Dimension(0U, 1U, 1U));
do
{
- Iterator in(_input, slice_in);
- execute_window_loop(slice_in, [&](const Coordinates & id)
- {
-
- const int z = id.x();
- const int x = id.y();
- const int y = id.z();
-
- const int w = batch_id % r;
- const int out_x = x * _block_shape_x + (batch_id / r) % _block_shape_x;
- const int out_y = y * _block_shape_y + (batch_id / r) / _block_shape_x;
- Coordinates output_coords{ z, out_x, out_y, w };
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
+ Iterator out(_output, slice_out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const int x = id.y();
+ const int y = id.z();
+
+ // Translate x, y to uncropped version
+ const int x_c = x + _crop_info.left;
+ const int y_c = y + _crop_info.top;
+
+ const int in_batch =
+ batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
+ const int in_x = x_c / _block_shape_x;
+ const int in_y = y_c / _block_shape_y;
+ Coordinates input_coords{0, in_x, in_y, in_batch};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords),
+ element_size * _input->info()->dimension(0));
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_in));
+ } while (window.slide_window_slice_3D(slice_out));
}
}
} // namespace arm_compute
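The rewritten loops iterate over the output and gather from the input, which turns cropping into a plain coordinate shift and removes the old r = batch / (bx * by) bookkeeping. A scalar sketch of the index arithmetic used above (same variable roles, hypothetical helper):

// Map one output element (x, y, batch_id) back to its input coordinates.
// out_batches is the output batch count, i.e. in_batches / (bx * by).
struct InCoord
{
    int x, y, batch;
};

InCoord batch_to_space_src(int x, int y, int batch_id, int out_batches,
                           int bx, int by, int crop_left, int crop_top)
{
    const int x_c      = x + crop_left; // undo the crop applied to the output
    const int y_c      = y + crop_top;
    const int in_batch = batch_id + ((x_c % bx) + (y_c % by) * bx) * out_batches;
    return InCoord{x_c / bx, y_c / by, in_batch};
}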
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
index 26e8224922..d98ac621b0 100644
--- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#ifndef ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H
#define ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H
+#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -55,6 +57,8 @@ public:
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
*/
void configure(const ITensor *input, const ITensor *block_shape, ITensor *output);
/** Initialise the kernel's inputs and output (Static block shape).
@@ -63,8 +67,13 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const ITensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ITensor *output);
+ void configure(const ITensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ITensor *output,
+ const CropInfo &crop_info = CropInfo{});
/** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -72,6 +81,8 @@ public:
* @param[in] output Tensor output. Data types supported: same as @p input
*
* @return a status
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel (Static block shape).
@@ -80,10 +91,15 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[in] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info = CropInfo{});
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -93,9 +109,9 @@ private:
const ITensor *_block_shape; /**< Block shape tensor */
ITensor *_output; /**< Destination tensor */
DataLayout _data_layout; /**< Data layout to be used at run-time */
-
- int32_t _block_shape_x;
- int32_t _block_shape_y;
+ int32_t _block_shape_x;
+ int32_t _block_shape_y;
+ CropInfo _crop_info; /**< Information related to cropping performed on output after the operation */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H */
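A usage sketch of the static-block-shape path with cropping; only the left and top fields are read by the kernel loop above (tensor allocation elided, values illustrative):

NEBatchToSpaceLayerKernel kernel;
CropInfo crop{};
crop.left = 1; // drop one column from the left of the rearranged output
crop.top  = 1; // and one row from the top
kernel.configure(&input, /* block_shape_x */ 2, /* block_shape_y */ 2, &output, crop);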
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index 677c5cddcc..a59bbd233b 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -55,8 +56,7 @@ inline void bitwise_and(const T *__restrict input1, const T *__restrict input2,
}
} // namespace
-NEBitwiseAndKernel::NEBitwiseAndKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseAndKernel::NEBitwiseAndKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
@@ -86,8 +86,7 @@ void NEBitwiseAndKernel::configure(const ITensor *input1, const ITensor *input2,
Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win,
- AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
output_access);
@@ -103,9 +102,7 @@ void NEBitwiseAndKernel::run(const Window &window, const ThreadInfo &info)
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr());
- },
- input1, input2, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+ input2, output);
}
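The NOT, OR and XOR kernels below follow the identical load/operate/store shape; only the intrinsic differs. A self-contained sketch of all four 16-byte primitives:

#include <arm_neon.h>
#include <cstdint>

// Each helper processes exactly 16 bytes, matching the kernels' per-iteration step.
void and16(const uint8_t *a, const uint8_t *b, uint8_t *out) { vst1q_u8(out, vandq_u8(vld1q_u8(a), vld1q_u8(b))); }
void or16 (const uint8_t *a, const uint8_t *b, uint8_t *out) { vst1q_u8(out, vorrq_u8(vld1q_u8(a), vld1q_u8(b))); }
void xor16(const uint8_t *a, const uint8_t *b, uint8_t *out) { vst1q_u8(out, veorq_u8(vld1q_u8(a), vld1q_u8(b))); }
void not16(const uint8_t *a, uint8_t *out)                   { vst1q_u8(out, vmvnq_u8(vld1q_u8(a))); }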
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index 19b1af690a..ecd181a7af 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -50,8 +51,7 @@ inline void bitwise_not_U8_U8(const uint8_t *__restrict input, uint8_t *__restri
}
} // namespace
-NEBitwiseNotKernel::NEBitwiseNotKernel()
- : _input(nullptr), _output(nullptr)
+NEBitwiseNotKernel::NEBitwiseNotKernel() : _input(nullptr), _output(nullptr)
{
}
@@ -77,7 +77,8 @@ void NEBitwiseNotKernel::configure(const ITensor *input, ITensor *output)
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access);
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
INEKernel::configure(win);
}
@@ -90,9 +91,6 @@ void NEBitwiseNotKernel::run(const Window &window, const ThreadInfo &info)
Iterator input(_input, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_not_U8_U8(input.ptr(), output.ptr());
- },
- input, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_not_U8_U8(input.ptr(), output.ptr()); }, input, output);
}
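The lambda reflow above is purely cosmetic; the iteration contract is unchanged. For readers unfamiliar with it, a simplified sketch of the Window/Iterator pattern these kernels share (ACL API, names as in the sources above):

// window: the iteration space computed at configure() time, with the X step
// set to the number of elements processed per call (16 bytes here).
Iterator in(input, window);
Iterator out(output, window);
execute_window_loop(
    window,
    [&](const Coordinates &)
    {
        // in.ptr()/out.ptr() are advanced by the loop between invocations.
        bitwise_not_U8_U8(in.ptr(), out.ptr());
    },
    in, out);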
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index 08094fbfcf..4c906134aa 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,7 +43,8 @@ class Coordinates;
namespace
{
-inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+inline void
+bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
{
const uint8x16_t val1 = vld1q_u8(input1);
const uint8x16_t val2 = vld1q_u8(input2);
@@ -51,8 +53,7 @@ inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t
}
} // namespace
-NEBitwiseOrKernel::NEBitwiseOrKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseOrKernel::NEBitwiseOrKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
@@ -82,8 +83,7 @@ void NEBitwiseOrKernel::configure(const ITensor *input1, const ITensor *input2,
Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win,
- AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
output_access);
@@ -99,9 +99,7 @@ void NEBitwiseOrKernel::run(const Window &window, const ThreadInfo &info)
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
- },
- input1, input2, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+ input2, output);
}
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index fc5b38b64f..dbbed2483c 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,7 +43,8 @@ class Coordinates;
namespace
{
-inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+inline void
+bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
{
const uint8x16_t val1 = vld1q_u8(input1);
const uint8x16_t val2 = vld1q_u8(input2);
@@ -51,8 +53,7 @@ inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t
}
} // namespace
-NEBitwiseXorKernel::NEBitwiseXorKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseXorKernel::NEBitwiseXorKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
@@ -82,7 +83,8 @@ void NEBitwiseXorKernel::configure(const ITensor *input1, const ITensor *input2,
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
- AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access);
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
INEKernel::configure(win);
}
@@ -96,9 +98,7 @@ void NEBitwiseXorKernel::run(const Window &window, const ThreadInfo &info)
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
- },
- input1, input2, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+ input2, output);
}
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
index 1e0a1742f6..cb869838e2 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,9 +27,12 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/boundingboxtransform/list.h"
#include <arm_neon.h>
@@ -37,7 +40,63 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+struct BoundingBoxTransformSelectorData
+{
+ DataType dt;
+};
+
+using BoundingBoxTransformSelctorPtr = std::add_pointer<bool(const BoundingBoxTransformSelectorData &data)>::type;
+using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)>::type;
+
+struct BoundingBoxTransformKernel
+{
+ const char *name;
+ const BoundingBoxTransformSelctorPtr is_selected;
+ BoundingBoxTransformUKernelPtr ukernel;
+};
+
+static const BoundingBoxTransformKernel available_kernels[] = {
+ {"fp32_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)},
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {"fp16_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)},
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+ {"qu16_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::QASYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)},
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformSelectorData &data)
+{
+ for (const auto &uk : available_kernels)
+ {
+ if (uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
+Status validate_arguments(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(boxes);
@@ -50,7 +109,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(info.scale() <= 0);
- if(boxes->data_type() == DataType::QASYMM16)
+ if (boxes->data_type() == DataType::QASYMM16)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(deltas, 1, DataType::QASYMM8);
const UniformQuantizationInfo deltas_qinfo = deltas->quantization_info().uniform();
@@ -62,12 +121,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas);
}
- if(pred_boxes->total_size() > 0)
+ if (pred_boxes->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, deltas);
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
- if(pred_boxes->data_type() == DataType::QASYMM16)
+ if (pred_boxes->data_type() == DataType::QASYMM16)
{
const UniformQuantizationInfo pred_qinfo = pred_boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(pred_qinfo.scale != 0.125f);
@@ -84,13 +143,19 @@ NEBoundingBoxTransformKernel::NEBoundingBoxTransformKernel()
{
}
-void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info)
+void NEBoundingBoxTransformKernel::configure(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
// Configure kernel window
- auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info()));
+ auto_init_if_empty(*pred_boxes->info(), deltas->info()
+ ->clone()
+ ->set_data_type(boxes->info()->data_type())
+ .set_quantization_info(boxes->info()->quantization_info()));
// Set instance variables
_boxes = boxes;
@@ -106,151 +171,24 @@ void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred
INEKernel::configure(win);
}
-Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
return Status{};
}
-template <>
-void NEBoundingBoxTransformKernel::internal_run<uint16_t>(const Window &window)
-{
- const size_t num_classes = _deltas->info()->tensor_shape()[0] >> 2;
- const size_t deltas_width = _deltas->info()->tensor_shape()[0];
- const int img_h = std::floor(_bbinfo.img_height() / _bbinfo.scale() + 0.5f);
- const int img_w = std::floor(_bbinfo.img_width() / _bbinfo.scale() + 0.5f);
-
- const auto scale_after = (_bbinfo.apply_scale() ? _bbinfo.scale() : 1.f);
- const auto scale_before = _bbinfo.scale();
- const auto offset = (_bbinfo.correct_transform_coords() ? 1.f : 0.f);
-
- auto pred_ptr = reinterpret_cast<uint16_t *>(_pred_boxes->buffer() + _pred_boxes->info()->offset_first_element_in_bytes());
- auto delta_ptr = reinterpret_cast<uint8_t *>(_deltas->buffer() + _deltas->info()->offset_first_element_in_bytes());
-
- const auto boxes_qinfo = _boxes->info()->quantization_info().uniform();
- const auto deltas_qinfo = _deltas->info()->quantization_info().uniform();
- const auto pred_qinfo = _pred_boxes->info()->quantization_info().uniform();
-
- Iterator box_it(_boxes, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr());
- const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo);
- const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo);
- const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo);
- const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo);
- const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f;
- const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f;
- const float ctr_x = (b0 / scale_before) + 0.5f * width;
- const float ctr_y = (b1 / scale_before) + 0.5f * height;
- for(size_t j = 0; j < num_classes; ++j)
- {
- // Extract deltas
- const size_t delta_id = id.y() * deltas_width + 4u * j;
- const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / _bbinfo.weights()[0];
- const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / _bbinfo.weights()[1];
- float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / _bbinfo.weights()[2];
- float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / _bbinfo.weights()[3];
- // Clip dw and dh
- dw = std::min(dw, _bbinfo.bbox_xform_clip());
- dh = std::min(dh, _bbinfo.bbox_xform_clip());
- // Determine the predictions
- const float pred_ctr_x = dx * width + ctr_x;
- const float pred_ctr_y = dy * height + ctr_y;
- const float pred_w = std::exp(dw) * width;
- const float pred_h = std::exp(dh) * height;
- // Store the prediction into the output tensor
- pred_ptr[delta_id] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo);
- pred_ptr[delta_id + 1] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo);
- pred_ptr[delta_id + 2] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), pred_qinfo);
- pred_ptr[delta_id + 3] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), pred_qinfo);
- }
- },
- box_it);
-}
-
-template <typename T>
-void NEBoundingBoxTransformKernel::internal_run(const Window &window)
-{
- const size_t num_classes = _deltas->info()->tensor_shape()[0] >> 2;
- const size_t deltas_width = _deltas->info()->tensor_shape()[0];
- const int img_h = std::floor(_bbinfo.img_height() / _bbinfo.scale() + 0.5f);
- const int img_w = std::floor(_bbinfo.img_width() / _bbinfo.scale() + 0.5f);
-
- const auto scale_after = (_bbinfo.apply_scale() ? T(_bbinfo.scale()) : T(1));
- const auto scale_before = T(_bbinfo.scale());
- ARM_COMPUTE_ERROR_ON(scale_before <= 0);
- const auto offset = (_bbinfo.correct_transform_coords() ? T(1.f) : T(0.f));
-
- auto pred_ptr = reinterpret_cast<T *>(_pred_boxes->buffer() + _pred_boxes->info()->offset_first_element_in_bytes());
- auto delta_ptr = reinterpret_cast<T *>(_deltas->buffer() + _deltas->info()->offset_first_element_in_bytes());
-
- Iterator box_it(_boxes, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto ptr = reinterpret_cast<T *>(box_it.ptr());
- const auto b0 = *ptr;
- const auto b1 = *(ptr + 1);
- const auto b2 = *(ptr + 2);
- const auto b3 = *(ptr + 3);
- const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f);
- const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f);
- const T ctr_x = (b0 / scale_before) + T(0.5f) * width;
- const T ctr_y = (b1 / scale_before) + T(0.5f) * height;
- for(size_t j = 0; j < num_classes; ++j)
- {
- // Extract deltas
- const size_t delta_id = id.y() * deltas_width + 4u * j;
- const T dx = delta_ptr[delta_id] / T(_bbinfo.weights()[0]);
- const T dy = delta_ptr[delta_id + 1] / T(_bbinfo.weights()[1]);
- T dw = delta_ptr[delta_id + 2] / T(_bbinfo.weights()[2]);
- T dh = delta_ptr[delta_id + 3] / T(_bbinfo.weights()[3]);
- // Clip dw and dh
- dw = std::min(dw, T(_bbinfo.bbox_xform_clip()));
- dh = std::min(dh, T(_bbinfo.bbox_xform_clip()));
- // Determine the predictions
- const T pred_ctr_x = dx * width + ctr_x;
- const T pred_ctr_y = dy * height + ctr_y;
- const T pred_w = std::exp(dw) * width;
- const T pred_h = std::exp(dh) * height;
- // Store the prediction into the output tensor
- pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
- pred_ptr[delta_id + 1] = scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
- pred_ptr[delta_id + 2] = scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
- pred_ptr[delta_id + 3] = scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
- }
- },
- box_it);
-}
-
void NEBoundingBoxTransformKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_boxes->info()->data_type())
- {
- case DataType::F32:
- {
- internal_run<float>(window);
- break;
- }
- case DataType::QASYMM16:
- {
- internal_run<uint16_t>(window);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- internal_run<float16_t>(window);
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- default:
- {
- ARM_COMPUTE_ERROR("Data type not supported");
- }
- }
+
+ const auto *uk = get_implementation(BoundingBoxTransformSelectorData{_boxes->info()->data_type()});
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_boxes, _pred_boxes, _deltas, _bbinfo, window);
}
} // namespace arm_compute
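The templated internal_run paths removed above move into per-type ukernels selected at run time; the box decoding itself is unchanged. A scalar sketch of the float path (hypothetical helper):

#include <algorithm>
#include <cmath>

// box = (x1, y1, x2, y2); delta = (dx, dy, dw, dh), already divided by the
// per-coordinate weights as in the removed code.
void decode_box(const float box[4], const float delta[4], float out[4],
                float scale_before, float scale_after, float offset,
                float bbox_xform_clip, float img_w, float img_h)
{
    const float width  = box[2] / scale_before - box[0] / scale_before + 1.f;
    const float height = box[3] / scale_before - box[1] / scale_before + 1.f;
    const float ctr_x  = box[0] / scale_before + 0.5f * width;
    const float ctr_y  = box[1] / scale_before + 0.5f * height;

    const float dw = std::min(delta[2], bbox_xform_clip); // clip exponent growth
    const float dh = std::min(delta[3], bbox_xform_clip);

    const float pred_ctr_x = delta[0] * width + ctr_x;
    const float pred_ctr_y = delta[1] * height + ctr_y;
    const float pred_w     = std::exp(dw) * width;
    const float pred_h     = std::exp(dh) * height;

    const auto clampf = [](float v, float lo, float hi) { return std::min(std::max(v, lo), hi); };
    out[0] = scale_after * clampf(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f);
    out[1] = scale_after * clampf(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f);
    out[2] = scale_after * clampf(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f);
    out[3] = scale_after * clampf(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f);
}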
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
index c080ce6a5c..3915994feb 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,7 +63,8 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*
*/
- void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
+ void
+ configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
*
@@ -77,15 +78,15 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
- template <typename T>
- void internal_run(const Window &window);
-
const ITensor *_boxes;
ITensor *_pred_boxes;
const ITensor *_deltas;
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
index 008ad7c9f4..3b53b7055f 100644
--- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,15 +45,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC);
- const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+ const unsigned int channels =
+ input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ num_groups == channels,
+        "Channel shuffling with the same number of groups as channels would be inefficient");
ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); // There cannot be more groups than channels
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0,
+ "The number of channels must be a multiple of the number of groups");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -68,24 +73,26 @@ void channel_shuffle_nhwc(const ITensor *input, ITensor *output, unsigned int nu
const size_t element_size = input->info()->element_size();
const unsigned int K = input->info()->dimension(channel_idx) / num_groups;
- const float rK = 1.f / K;
+ const double rK = 1.0 / K;
Iterator in(input, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Shuffle channel
- const unsigned int curr_channel = id.x();
- const unsigned int group_id = curr_channel * rK;
- const unsigned int r = group_id * K;
- const unsigned int channel_id = curr_channel - r;
-
- // Calculate output coordinates
- Coordinates out_coords = id;
- out_coords.set(Window::DimX, channel_id * num_groups + group_id);
- std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords));
- },
- in);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ // Shuffle channel
+ const unsigned int curr_channel = id.x();
+ const unsigned int group_id = curr_channel * rK;
+ const unsigned int r = group_id * K;
+ const unsigned int channel_id = curr_channel - r;
+
+ // Calculate output coordinates
+ Coordinates out_coords = id;
+ out_coords.set(Window::DimX, channel_id * num_groups + group_id);
+ std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords));
+ },
+ in);
}
void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int num_groups, const Window &window)
{
@@ -103,38 +110,39 @@ void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int nu
const size_t row_size = input->info()->dimension(width_idx) * input->info()->element_size();
const unsigned int K = input->info()->dimension(channel_idx) / num_groups;
- const float rK = 1.f / K;
+ const double rK = 1.0 / K;
Iterator in(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- // Shuffle channel
- const unsigned int curr_channel = id.z();
- const unsigned int group_id = curr_channel * rK;
- const unsigned int r = group_id * K;
- const unsigned int channel_id = curr_channel - r;
-
- // Calculate output coordinates
- Coordinates out_coords = id;
- out_coords.set(Window::DimZ, channel_id * num_groups + group_id);
- const uint8_t *input_ptr = in.ptr();
- uint8_t *output_ptr = output->ptr_to_element(out_coords);
-
- // Copy plane
- for(unsigned int y = 0; y < height; ++y)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- std::copy_n(input_ptr, row_size, output_ptr);
- input_ptr += input_stride_y;
- output_ptr += output_stride_y;
- }
- },
- in);
+ // Shuffle channel
+ const unsigned int curr_channel = id.z();
+ const unsigned int group_id = curr_channel * rK;
+ const unsigned int r = group_id * K;
+ const unsigned int channel_id = curr_channel - r;
+
+ // Calculate output coordinates
+ Coordinates out_coords = id;
+ out_coords.set(Window::DimZ, channel_id * num_groups + group_id);
+ const uint8_t *input_ptr = in.ptr();
+ uint8_t *output_ptr = output->ptr_to_element(out_coords);
+
+ // Copy plane
+ for (unsigned int y = 0; y < height; ++y)
+ {
+ std::copy_n(input_ptr, row_size, output_ptr);
+ input_ptr += input_stride_y;
+ output_ptr += output_stride_y;
+ }
+ },
+ in);
}
} // namespace
-NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel()
- : _input(nullptr), _output(nullptr), _num_groups()
+NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr), _num_groups()
{
}
@@ -158,7 +166,8 @@ void NEChannelShuffleLayerKernel::configure(const ITensor *input, ITensor *outpu
INEKernel::configure(win);
}
-Status NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+Status
+NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
return Status{};
@@ -170,7 +179,7 @@ void NEChannelShuffleLayerKernel::run(const Window &window, const ThreadInfo &in
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- switch(_input->info()->data_layout())
+ switch (_input->info()->data_layout())
{
case DataLayout::NHWC:
channel_shuffle_nhwc(_input, _output, _num_groups, window);
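Editor's note: both loops above (NHWC and NCHW) implement the same channel permutation, computed via the reciprocal rK = 1/K. A minimal standalone sketch of the index mapping, assuming only that channels is a multiple of num_groups:

    #include <cassert>

    // Channels are viewed as a num_groups x K matrix and transposed:
    // input channel c = g * K + k lands on output channel k * num_groups + g.
    unsigned int shuffled_channel(unsigned int c, unsigned int num_groups, unsigned int channels)
    {
        assert(channels % num_groups == 0);
        const unsigned int K = channels / num_groups;
        const unsigned int g = c / K; // group the channel belongs to
        const unsigned int k = c % K; // position within its group
        return k * num_groups + g;
    }
    // e.g. channels = 6, num_groups = 2 maps {0,1,2,3,4,5} to {0,2,4,1,3,5}.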
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
deleted file mode 100644
index 4ba02f1542..0000000000
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NECol2ImKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
-using namespace misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
-{
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- // Validate configured output
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, false));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &convolved_dims)
-{
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, false)));
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps());
-
- // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
-
- return std::make_pair(Status{}, win);
-}
-} // namespace
-
-template <typename T>
-void NECol2ImKernel::run_col2im(const Window &window)
-{
- const int output_stride_x = _output->info()->strides_in_bytes().x();
- const int output_stride_y = _output->info()->strides_in_bytes().y();
- const int output_stride_z = _output->info()->strides_in_bytes().z();
-
- Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Create iterators
- Iterator in(_input, window);
- Iterator out(_output, window_out);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int hidx = id.y();
- const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + (hidx % _convolved_dims.width) * output_stride_x;
-
- *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
- },
- in, out);
-}
-
-NECol2ImKernel::NECol2ImKernel()
- : _func(), _input(nullptr), _output(nullptr), _convolved_dims()
-{
-}
-
-void NECol2ImKernel::configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), convolved_dims));
-
- _input = input;
- _output = output;
- _convolved_dims = convolved_dims;
-
- switch(input->info()->element_size())
- {
- case 1:
- _func = &NECol2ImKernel::run_col2im<uint8_t>;
- break;
- case 2:
- _func = &NECol2ImKernel::run_col2im<uint16_t>;
- break;
- case 4:
- _func = &NECol2ImKernel::run_col2im<uint32_t>;
- break;
- default:
- ARM_COMPUTE_ERROR("Element size not supported");
- break;
- }
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), convolved_dims);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-}
-
-Status NECol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), convolved_dims).first);
- return Status{};
-}
-
-void NECol2ImKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- (this->*_func)(window);
-}
diff --git a/src/core/NEON/kernels/NECol2ImKernel.h b/src/core/NEON/kernels/NECol2ImKernel.h
index 397bf5ab17..bc6652fd30 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.h
+++ b/src/core/NEON/kernels/NECol2ImKernel.h
@@ -24,17 +24,17 @@
#ifndef ARM_COMPUTE_NECOL2IMKERNEL_H
#define ARM_COMPUTE_NECOL2IMKERNEL_H
-#include "src/core/NEON/INEKernel.h"
-
#include "arm_compute/core/Size2D.h"
+#include "src/core/NEON/INEKernel.h"
+
namespace arm_compute
{
class ITensor;
/** Kernel to perform col2im reshaping.
*
- * Rearranges each matrix column into image blocks. It's the inverse operation of @ref NEIm2ColKernel.
+ * Rearranges each matrix column into image blocks. It's the inverse operation of @ref cpu::kernels::CpuIm2ColKernel.
*
* For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
*
diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
deleted file mode 100644
index 1f2170f42a..0000000000
--- a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-
- // Validate output if initialized
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- // Output auto inizialitation if not yet initialized
- {
- const bool is_input_signed = input->data_type() == DataType::QASYMM8_SIGNED;
- const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
- const UniformQuantizationInfo qinfo = input->quantization_info().uniform();
- const int offset_correction = is_input_signed ? -128 : 128;
- const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
-
- auto_init_if_empty(*output, input->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo));
- }
-
- return std::make_pair(Status{}, calculate_max_window(*output));
-}
-} // namespace
-
-NEConvertQuantizedSignednessKernel::NEConvertQuantizedSignednessKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void NEConvertQuantizedSignednessKernel::configure(const ITensor *input, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- std::pair<Status, Window> win_config = validate_and_configure_window(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-}
-
-Status NEConvertQuantizedSignednessKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
- return Status{};
-}
-
-void NEConvertQuantizedSignednessKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(_input, win_collapsed);
- Iterator output(_output, win_collapsed);
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- const uint8_t mask = 128;
- const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{});
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const uint8_t in = *(reinterpret_cast<const uint8_t *>(input_ptr + x));
- *(output_ptr + x) = in ^ mask;
- }
- },
- input, output);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
deleted file mode 100644
index 67d5ca246e..0000000000
--- a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H
-#define ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Kernel to convert asymmetric signed to asymmetric signed and vice-versa */
-class NEConvertQuantizedSignednessKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEConvertQuantizedSignednessKernel";
- }
- /** Default constructor */
- NEConvertQuantizedSignednessKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- NEConvertQuantizedSignednessKernel(const NEConvertQuantizedSignednessKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- NEConvertQuantizedSignednessKernel &operator=(const NEConvertQuantizedSignednessKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEConvertQuantizedSignednessKernel(NEConvertQuantizedSignednessKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEConvertQuantizedSignednessKernel &operator=(NEConvertQuantizedSignednessKernel &&) = default;
- /** Default destructor */
- ~NEConvertQuantizedSignednessKernel() = default;
- /** Initialize the kernel's input, output.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data types supported: opposite of @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEConvertQuantizedSignednessKernel
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor. Data types supported: opposite of @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H */
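Editor's note: the deleted kernel's per-element work reduces to a single XOR. A minimal scalar sketch; the vector path applied the same mask 16 lanes at a time with wrapper::veor, and the conversion only preserves the represented real values because the output quantization offset is shifted by +/-128, as the removed validate_and_configure_window() did.

    #include <cstdint>

    // Flipping the sign bit converts a QASYMM8 value to QASYMM8_SIGNED and
    // back (scalar form of wrapper::veor(vin, vmask) above).
    uint8_t flip_signedness(uint8_t v)
    {
        return v ^ 0x80;
    }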
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
index fabbd64305..60271fbc74 100644
--- a/src/core/NEON/kernels/NECropKernel.cpp
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,163 +26,110 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/utils/helpers/bit_ops.h"
+#include "src/cpu/kernels/crop/list.h"
namespace arm_compute
{
namespace
{
-template <typename T>
-inline float32x4_t load_as_f32(T *ptr)
-{
- ARM_COMPUTE_UNUSED(ptr);
- ARM_COMPUTE_ERROR("Type not supported.");
-}
-
-template <>
-inline float32x4_t load_as_f32(float *ptr)
-{
- return wrapper::vloadq(ptr);
-}
-
-template <>
-inline float32x4_t load_as_f32(int32_t *ptr)
-{
- return vcvtq_f32_s32(wrapper::vloadq(ptr));
-}
-
-template <>
-inline float32x4_t load_as_f32(uint32_t *ptr)
+struct CropSelectorData
{
- return vcvtq_f32_u32(wrapper::vloadq(ptr));
-}
+ DataType dt;
+};
-template <>
-inline float32x4_t load_as_f32(int16_t *ptr)
-{
- return vcvtq_f32_s32(vmovl_s16(wrapper::vload(ptr)));
-}
+using CropSelectorPtr = std::add_pointer<bool(const CropSelectorData &data)>::type;
+using CropUKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type;
-template <>
-inline float32x4_t load_as_f32(uint16_t *ptr)
+struct CropUKernel
{
- return vcvtq_f32_u32(vmovl_u16(wrapper::vload(ptr)));
-}
+ const char *name;
+ const CropSelectorPtr is_selected;
+ CropUKernelPtr ukernel;
+};
-template <>
-inline float32x4_t load_as_f32(uint8_t *ptr)
-{
- return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr)))));
-}
+static const CropUKernel available_kernels[] = {
+ {"fp16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)},
+ {"f32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)},
+ {"u8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)},
+ {"u16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)},
+ {"u32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)},
+ {"s8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)},
+ {"s16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)},
+ {"s32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)},
+};
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float32x4_t load_as_f32(float16_t *ptr)
-{
- return vcvt_f32_f16(wrapper::vload(ptr));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <typename T>
-inline void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const CropUKernel *get_implementation(const CropSelectorData &data)
{
- // Reverse elements if width flipped.
- if(is_width_flipped)
+ for (const auto &uk : available_kernels)
{
- // Collapse first dimension if possible.
- if(input_has_single_channel)
+ if (uk.is_selected(data))
{
- int32_t x = output_width_start;
- Coordinates negative_offset(input_offset);
- negative_offset.set(1, negative_offset[1] - window_step_x + 1);
- for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
- {
- auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset)));
-
- in = wrapper::vrev64(in);
- in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
-
- wrapper::vstore(output_ptr + x, in);
- }
- input_offset[1] = negative_offset[1] + window_step_x - 1;
- for(; x < output_width_limit; ++x, --input_offset[1])
- {
- *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- }
- }
- else
- {
- for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
- {
- input_offset.set(0, 0);
- int32_t c = 0;
- for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x)
- {
- auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in);
- }
- for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
- {
- *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- }
- }
- }
- }
- else
- {
- // Use memcpy if the elements don't need converting to float.
- if(std::is_same<T, float>::value)
- {
- memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)),
- reinterpret_cast<const void *>(input->ptr_to_element(input_offset)),
- (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size());
- }
- else
- {
- int32_t x = 0;
- int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
- float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
- for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
- {
- auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- wrapper::vstore(output_start_ptr + x, in);
- }
- for(; x < limit; ++x, ++input_offset[0])
- {
- *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
- }
+ return &uk;
}
}
+
+ return nullptr;
}
-inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit)
+inline void out_of_bounds_crop_window(const ITensor *output,
+ float *output_ptr,
+ float extrapolation_value,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit)
{
- auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
- int32_t x = 0;
- int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
- float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
- for(; x <= limit - window_step_x; x += window_step_x)
+ auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
+ int32_t x = 0;
+ int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+ float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
+ for (; x <= limit - window_step_x; x += window_step_x)
{
wrapper::vstore(output_start_ptr + x, in);
}
- for(; x < limit; ++x)
+ for (; x < limit; ++x)
{
*(output_start_ptr + x) = extrapolation_value;
}
}
-inline void execute_window(const ITensor *input, const ITensor *output, Coordinates input_offset, float extrapolation_value,
- const std::array<uint32_t, 2> &rows_out_of_bounds, const std::array<uint32_t, 2> &cols_out_of_bounds, NECropKernel::InBoundsCropFunction *in_bounds_crop_function,
- bool is_height_flipped, bool has_cols_in_bounds, bool has_cols_out_of_bounds_before, bool has_cols_out_of_bounds_after, bool input_has_single_channel, bool is_width_flipped)
+inline void execute_window(const ITensor *input,
+ const ITensor *output,
+ Coordinates input_offset,
+ float extrapolation_value,
+ const std::array<uint32_t, 2> &rows_out_of_bounds,
+ const std::array<uint32_t, 2> &cols_out_of_bounds,
+ NECropKernel::InBoundsCropFunction *in_bounds_crop_function,
+ bool is_height_flipped,
+ bool has_cols_in_bounds,
+ bool has_cols_out_of_bounds_before,
+ bool has_cols_out_of_bounds_after,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
// Output is always float.
const int window_step_x = 16 / sizeof(float);
@@ -203,46 +150,66 @@ inline void execute_window(const ITensor *input, const ITensor *output, Coordina
// |------------------------------|
// Fill all output rows that have no elements that are within the input bounds with the extrapolation value.
// First for the rows before the in bounds rows.
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[0] * output->info()->dimension(1));
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0,
+ rows_out_of_bounds[0] * output->info()->dimension(1));
output_ptr += rows_out_of_bounds[0] * output->info()->dimension(1) * output->info()->dimension(0);
// Iterate through each row that has any elements within the input bounds.
- for(uint32_t row = rows_out_of_bounds[0]; static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
- ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
+ for (uint32_t row = rows_out_of_bounds[0];
+ static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
+ ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
{
// Fill all elements in the row that are out of bounds with the extrapolation value.
// First for the elements before the in bounds elements.
- if(has_cols_out_of_bounds_before)
+ if (has_cols_out_of_bounds_before)
{
out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, cols_out_of_bounds[0]);
}
// Copy all elements within the input bounds from the input tensor.
- if(has_cols_in_bounds)
+ if (has_cols_in_bounds)
{
(*in_bounds_crop_function)(input, output, output_ptr, input_offset, window_step_x, cols_out_of_bounds[0],
- output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, is_width_flipped);
+ output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel,
+ is_width_flipped);
}
// Fill all elements after the in bounds elements with the extrapolation value.
- if(has_cols_out_of_bounds_after)
+ if (has_cols_out_of_bounds_after)
{
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1));
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x,
+ output->info()->dimension(1) - cols_out_of_bounds[1],
+ output->info()->dimension(1));
}
output_ptr += output->info()->dimension(1) * output->info()->dimension(0);
}
// Fill all rows after the in bounds elements with the extrapolation value.
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[1] * output->info()->dimension(1));
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0,
+ rows_out_of_bounds[1] * output->info()->dimension(1));
}
} // namespace
NECropKernel::NECropKernel()
- : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds(),
- _in_bounds_crop_function(nullptr)
+ : _input(nullptr),
+ _crop_boxes(nullptr),
+ _box_ind(nullptr),
+ _output(nullptr),
+ _start(),
+ _end(),
+ _crop_box_ind(0),
+ _extrapolation_value(0),
+ _rows_out_of_bounds(),
+ _cols_out_of_bounds()
{
}
-void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind, float extrapolation_value)
+void NECropKernel::configure(const ITensor *input,
+ const ITensor *crop_boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ uint32_t crop_box_ind,
+ float extrapolation_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), crop_box_ind, extrapolation_value));
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(),
+ crop_box_ind, extrapolation_value));
_input = input;
_crop_boxes = crop_boxes;
@@ -250,49 +217,29 @@ void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, co
_output = output;
_crop_box_ind = crop_box_ind;
_extrapolation_value = extrapolation_value;
-
- switch(input->info()->data_type())
- {
- case DataType::F32:
- _in_bounds_crop_function = &in_bounds_crop_window<float>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _in_bounds_crop_function = &in_bounds_crop_window<float16_t>;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::U32:
- _in_bounds_crop_function = &in_bounds_crop_window<uint32_t>;
- break;
- case DataType::S32:
- _in_bounds_crop_function = &in_bounds_crop_window<int32_t>;
- break;
- case DataType::U16:
- _in_bounds_crop_function = &in_bounds_crop_window<uint16_t>;
- break;
- case DataType::S16:
- _in_bounds_crop_function = &in_bounds_crop_window<int16_t>;
- break;
- case DataType::U8:
- _in_bounds_crop_function = &in_bounds_crop_window<uint8_t>;
- break;
- default:
- ARM_COMPUTE_ERROR("Datatype not supported");
- }
}
-Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value)
+Status NECropKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *crop_boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ uint32_t crop_box_ind,
+ float extrapolation_value)
{
ARM_COMPUTE_UNUSED(extrapolation_value);
+ const auto *uk = get_implementation(CropSelectorData{input->data_type()});
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16,
+ DataType::F16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[0] != 4);
ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] <= crop_box_ind);
ARM_COMPUTE_RETURN_ERROR_ON(box_ind->tensor_shape()[0] <= crop_box_ind);
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -313,48 +260,53 @@ void NECropKernel::configure_output_shape()
    // The normalized coordinates are scaled to retrieve the floating point image coordinates, which are rounded to integers.
_start = Coordinates(std::floor(x0 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
std::floor(y0 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
- _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
- std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
- const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, abs(_end[1] - _start[1]) + 1);
+ _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
+ std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
+ const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1,
+ abs(_end[1] - _start[1]) + 1);
_output->info()->set_tensor_shape(out_shape);
bool is_width_flipped = _end[0] < _start[0];
bool is_height_flipped = _end[1] < _start[1];
- if(is_height_flipped)
+ if (is_height_flipped)
{
- _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
+ _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
_rows_out_of_bounds[1] = _end[1] < 0 ? std::min(static_cast<uint32_t>(-_end[1]),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
}
else
{
_rows_out_of_bounds[0] = _start[1] < 0 ? std::min(static_cast<uint32_t>(-_start[1]),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
- _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
+ _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
}
- if(is_width_flipped)
+ if (is_width_flipped)
{
- _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
+ _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
_cols_out_of_bounds[1] = _end[0] < 0 ? std::min(static_cast<uint32_t>(-_end[0]),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
}
else
{
_cols_out_of_bounds[0] = _start[0] < 0 ? std::min(static_cast<uint32_t>(-_start[0]),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
- _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
+ _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
}
INEKernel::configure(calculate_max_window(*_output->info()));
@@ -369,11 +321,18 @@ void NECropKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON(_input->info()->has_padding());
ARM_COMPUTE_ERROR_ON(_output->info()->has_padding());
+ const auto *uk = get_implementation(CropSelectorData{_input->info()->data_type()});
+
uint32_t batch_index = *(reinterpret_cast<int32_t *>(_box_ind->ptr_to_element(Coordinates(_crop_box_ind))));
- Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
- _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
- execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, _in_bounds_crop_function, _end[1] < _start[1],
- _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0,
+ Coordinates input_offset(
+ 0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
+ _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
+ execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds,
+ uk->ukernel,
+                   _end[1] < _start[1],
+                   _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1),
+                   _cols_out_of_bounds[0] > 0,
+                   _cols_out_of_bounds[1] > 0,
_start[0] <= _end[0], _end[0] < _start[0]);
}
} // namespace arm_compute
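Editor's note: the selector pattern introduced here recurs across this commit (bounding box transform above, crop here). A self-contained sketch under illustrative names (SelectorData and UKernelEntry stand in for CropSelectorData and CropUKernel); the real tables use REGISTER_* macros that degrade to nullptr when a backend is compiled out, which is why callers also test uk->ukernel against nullptr.

    // Illustrative stand-ins, not the library's types.
    enum class DataType { F16, F32, U8 };

    struct SelectorData { DataType dt; };
    using UKernelFn = void (*)(const void *src, void *dst);

    struct UKernelEntry
    {
        const char *name;
        bool (*is_selected)(const SelectorData &);
        UKernelFn ukernel; // may be nullptr if the backend was compiled out
    };

    static const UKernelEntry available_kernels[] = {
        {"f32_neon", [](const SelectorData &d) { return d.dt == DataType::F32; }, nullptr},
        {"u8_neon", [](const SelectorData &d) { return d.dt == DataType::U8; }, nullptr},
    };

    // First predicate that matches wins; table order sets the priority.
    const UKernelEntry *get_implementation(const SelectorData &data)
    {
        for (const auto &uk : available_kernels)
        {
            if (uk.is_selected(data))
            {
                return &uk;
            }
        }
        return nullptr;
    }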
diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h
index 742215e22b..da4a1b26e5 100644
--- a/src/core/NEON/kernels/NECropKernel.h
+++ b/src/core/NEON/kernels/NECropKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#define ARM_COMPUTE_NEON_CROP_KERNEL_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -67,7 +67,12 @@ public:
* @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0.
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
*/
- void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
+ void configure(const ITensor *input,
+ const ITensor *crop_boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ uint32_t crop_box_ind = 0,
+ float extrapolation_value = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
*
@@ -82,7 +87,12 @@ public:
* @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0.
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *crop_boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ uint32_t crop_box_ind = 0,
+ float extrapolation_value = 0);
/** Configure output tensor's shape as this can only be determined at runtime. */
void configure_output_shape();
@@ -91,7 +101,8 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
/** Function to use for in bounds crop for the particular tensor types passed to configure() */
- using InBoundsCropFunction = void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool);
+ using InBoundsCropFunction =
+ void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool);
private:
const ITensor *_input;
@@ -107,8 +118,6 @@ private:
std::array<uint32_t, 2> _rows_out_of_bounds;
/** The number of columns out of bounds at the start and end of output. */
std::array<uint32_t, 2> _cols_out_of_bounds;
-
- NECropKernel::InBoundsCropFunction *_in_bounds_crop_function;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEON_CROP_KERNEL_H */
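Editor's note: the header keeps a plain function type (InBoundsCropFunction) and forms pointers to it with '*', while the new .cpp spells the pointer type directly with std::add_pointer. A two-line sketch showing the two spellings denote the same type:

    #include <type_traits>

    using CropFn    = void(const float *, float *);                         // function type
    using CropFnPtr = std::add_pointer<void(const float *, float *)>::type; // pointer type

    static_assert(std::is_same<CropFn *, CropFnPtr>::value, "identical pointer types");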
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index 6dcc85ec2e..e0eb5cf202 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,20 +23,19 @@
*/
#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
+#include "arm_compute/core/CoreTypes.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/depth_to_space/list.h"
-#include <arm_neon.h>
#include <cstdint>
-using namespace arm_compute::misc::shape_calculator;
-
namespace arm_compute
{
namespace
@@ -52,12 +51,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+ (block_shape * input->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+ (block_shape * input->tensor_shape()[idx_height]));
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -67,14 +68,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
} // namespace
NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN)
+ : _input(nullptr),
+ _output(nullptr),
+ _block_shape(),
+ _data_layout(DataLayout::UNKNOWN),
+ _split_dimension(Window::DimY)
{
}
void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+ TensorShape output_shape = misc::shape_calculator::compute_depth_to_space_shape(
+ input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -86,9 +92,31 @@ void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output,
_block_shape = block_shape;
_data_layout = input->info()->data_layout();
+ constexpr size_t dim_b = 3;
+ const auto dim_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const auto dim_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const auto dim_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_ERROR_ON(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES) != dim_b);
+
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
+ Steps steps;
+ steps.set(dim_h, block_shape);
+ steps.set(dim_w, block_shape);
+ steps.set(dim_c, output->info()->dimension(dim_c));
+
+ Window win = calculate_max_window(*output->info(), steps);
ICPPKernel::configure(win);
+
+ const auto num_batches = input->info()->tensor_shape().total_size_upper(dim_b);
+ if (num_batches > 1)
+ {
+ _split_dimension = dim_b;
+ }
+ else
+ {
+ _split_dimension = dim_h;
+ }
}
Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
@@ -98,66 +126,80 @@ Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITens
return Status{};
}
+size_t NEDepthToSpaceLayerKernel::get_split_dimension() const
+{
+ return _split_dimension;
+}
+
void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
- const int depth_size = _input->info()->dimension(idx_channel);
- const int r = (depth_size / (_block_shape * _block_shape));
- const int element_size = _input->info()->element_size();
+ const auto *input_info = _input->info();
+ const auto *output_info = _output->info();
+
+ const auto element_size = input_info->element_size();
+ const auto &input_strides = input_info->strides_in_bytes();
+ const auto &output_strides = output_info->strides_in_bytes();
+
+ const auto &input_shape = input_info->tensor_shape();
- Window slice_out = window.first_slice_window_3D();
+ const uintptr_t k_input_strides[] = {input_strides[0], input_strides[1], input_strides[2], input_strides[3]};
+ const uintptr_t k_output_strides[] = {output_strides[0], output_strides[1], output_strides[2], output_strides[3]};
- // The slice_out slice does not move
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ const uint8_t *k_input_ptr = _input->buffer();
+ uint8_t *k_output_ptr = //
+ _output->buffer() + //
+ window[3].start() * output_strides[3] + //
+ window[2].start() * output_strides[2] + //
+ window[1].start() * output_strides[1] + //
+ window[0].start() * output_strides[0];
- // Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
- Window slice_in = window.first_slice_window_2D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(slice_in, [&](const Coordinates & id)
- {
- const int x = id.x();
- const int y = id.y();
-
- const int z = id.z() % r;
- const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
- Coordinates output_coords{ out_x, out_y, z, id[3] };
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- }
- while(window.slide_window_slice_2D(slice_in));
+ ARM_COMPUTE_ERROR_ON_MSG(window[2].start() != 0 || window[2].end() != window[2].step(),
+                                 "The window cannot be split in the channel dimension");
+
+ const uintptr_t k_input_shape[] = {
+ window.num_iterations(0), //
+ window.num_iterations(1), //
+            input_shape[2], // The window cannot be split in the channel dimension.
+ window.num_iterations(3) //
+ };
+
+ k_input_ptr += window[3].start() * input_strides[3] + //
+ window[2].start() * _block_shape * _block_shape * input_strides[2] + //
+ (window[1].start() / _block_shape) * input_strides[1] + //
+ (window[0].start() / _block_shape) * input_strides[0];
+
+ cpu::depth_to_space_nchw_any( //
+ k_input_ptr, k_output_ptr, //
+ k_input_shape, k_input_strides, k_output_strides, //
+ element_size, _block_shape);
}
else
{
- Window slice_in = window.first_slice_window_3D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(slice_in, [&](const Coordinates & id)
- {
- const int x = id.y();
- const int y = id.z();
-
- const int z = id.x() % r;
- const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
- Coordinates output_coords{ z, out_x, out_y, id[3] };
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- }
- while(window.slide_window_slice_3D(slice_in));
+ ARM_COMPUTE_ERROR_ON_MSG(window[0].start() != 0 || window[0].end() != window[0].step(),
+                                 "The window cannot be split in the channel dimension");
+
+ const uintptr_t k_input_shape[] = {
+            input_shape[0], // The window cannot be split in the channel dimension.
+ window.num_iterations(1), //
+ window.num_iterations(2), //
+ window.num_iterations(3) //
+ };
+
+ k_input_ptr += window[3].start() * input_strides[3] + //
+ (window[2].start() / _block_shape) * input_strides[2] + //
+ (window[1].start() / _block_shape) * input_strides[1] + //
+ window[0].start() * _block_shape * _block_shape * input_strides[0];
+
+ cpu::depth_to_space_nhwc_any( //
+ k_input_ptr, k_output_ptr, //
+ k_input_shape, k_input_strides, k_output_strides, //
+ element_size, _block_shape);
}
}
} // namespace arm_compute
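Editor's note: a reference sketch of the NCHW coordinate mapping the removed per-element loop implemented; cpu::depth_to_space_nchw_any applies the same mapping over contiguous rows, and the NHWC path permutes the same indices with the channel in dimension 0.

    #include <cstdint>

    struct Coord
    {
        int32_t x, y, z;
    };

    // With r = C / (block * block), NCHW input element (x, y, z) moves to
    // the output coordinates below.
    Coord depth_to_space_nchw(int32_t x, int32_t y, int32_t z, int32_t block, int32_t channels)
    {
        const int32_t r = channels / (block * block);
        return {
            x * block + (z / r) % block, // output column
            y * block + (z / r) / block, // output row
            z % r                        // output channel
        };
    }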
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
index 7e18dd88b8..ca431ec5fe 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
-#define ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H
#include "src/core/NEON/INEKernel.h"
@@ -68,14 +68,18 @@ public:
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+ /** Get the dimension the scheduler should use to split. */
+ size_t get_split_dimension() const;
+
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
- const ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
- int32_t _block_shape; /**< Block shape */
- DataLayout _data_layout; /**< Data layout of the operation */
+ const ITensor *_input; /**< Source tensor */
+ ITensor *_output; /**< Destination tensor */
+ int32_t _block_shape; /**< Block shape */
+ DataLayout _data_layout; /**< Data layout of the operation */
+ size_t _split_dimension; /**< The dimension the scheduler should use to split the workload. */
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H
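
The new get_split_dimension() accessor exists so that the function layer can ask the kernel which axis is safe to parallelise instead of hard-coding Window::DimY. A hedged sketch of a call site (the scheduler call below follows the usual ACL pattern; treat the exact overload as an assumption):

#include "arm_compute/runtime/NEON/NEScheduler.h"

// Dispatch the kernel, splitting the window along the axis the kernel
// itself reports; IScheduler::Hints is constructible from a dimension
// index, so the reported value can be forwarded directly.
void run_depth_to_space(arm_compute::NEDepthToSpaceLayerKernel &kernel)
{
    arm_compute::NEScheduler::get().schedule(&kernel, kernel.get_split_dimension());
}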
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
index 261437f07d..a5969cd497 100644
--- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -37,16 +38,19 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -56,7 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *output,
+ ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_UNUSED(idx, config);
@@ -68,12 +75,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-NEFFTDigitReverseKernel::NEFFTDigitReverseKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr)
+NEFFTDigitReverseKernel::NEFFTDigitReverseKernel() : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr)
{
}
-void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config)
+void NEFFTDigitReverseKernel::configure(const ITensor *input,
+ ITensor *output,
+ const ITensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
@@ -91,11 +100,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
- if(axis == 0)
+ if (axis == 0)
{
- if(is_input_complex)
+ if (is_input_complex)
{
- if(is_conj)
+ if (is_conj)
{
_func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<true, true>;
}
@@ -109,11 +118,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c
_func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<false, false>;
}
}
- else if(axis == 1)
+ else if (axis == 1)
{
- if(is_input_complex)
+ if (is_input_complex)
{
- if(is_conj)
+ if (is_conj)
{
_func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1<true, true>;
}
@@ -133,10 +142,14 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c
}
}
-Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
return Status{};
}
@@ -159,38 +172,40 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0(const Window &window)
std::vector<float> buffer_row_out(2 * N);
std::vector<float> buffer_row_in(2 * N);
- execute_window_loop(slice, [&](const Coordinates &)
- {
- if(is_input_complex)
+ execute_window_loop(
+ slice,
+ [&](const Coordinates &)
{
- // Load
- memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float));
-
- // Shuffle
- for(size_t x = 0; x < 2 * N; x += 2)
+ if (is_input_complex)
{
- size_t idx = buffer_idx[x / 2];
- buffer_row_out[x] = buffer_row_in[2 * idx];
- buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]);
- }
- }
- else
- {
- // Load
- memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float));
+ // Load
+ memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float));
- // Shuffle
- for(size_t x = 0; x < N; ++x)
+ // Shuffle
+ for (size_t x = 0; x < 2 * N; x += 2)
+ {
+ size_t idx = buffer_idx[x / 2];
+ buffer_row_out[x] = buffer_row_in[2 * idx];
+ buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]);
+ }
+ }
+ else
{
- size_t idx = buffer_idx[x];
- buffer_row_out[2 * x] = buffer_row_in[idx];
+ // Load
+ memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float));
+
+ // Shuffle
+ for (size_t x = 0; x < N; ++x)
+ {
+ size_t idx = buffer_idx[x];
+ buffer_row_out[2 * x] = buffer_row_in[idx];
+ }
}
- }
- // Copy back
- memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float));
- },
- in, out);
+ // Copy back
+ memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float));
+ },
+ in, out);
}
template <bool is_input_complex, bool is_conj>
@@ -215,39 +230,41 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1(const Window &window)
const size_t stride_z = _input->info()->strides_in_bytes()[2];
const size_t stride_w = _input->info()->strides_in_bytes()[3];
- execute_window_loop(slice, [&](const Coordinates & id)
- {
- auto *out_ptr = reinterpret_cast<float *>(out.ptr());
- auto *in_ptr = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w);
- const size_t y_shuffled = buffer_idx[id.y()];
-
- if(is_input_complex)
+ execute_window_loop(
+ slice,
+ [&](const Coordinates &id)
{
- // Shuffle the entire row into the output
- memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float));
+ auto *out_ptr = reinterpret_cast<float *>(out.ptr());
+ auto *in_ptr = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w);
+ const size_t y_shuffled = buffer_idx[id.y()];
- // Conjugate if necessary
- if(is_conj)
+ if (is_input_complex)
{
- for(size_t x = 0; x < 2 * Nx; x += 2)
+ // Shuffle the entire row into the output
+ memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float));
+
+ // Conjugate if necessary
+ if (is_conj)
{
- out_ptr[x + 1] = -out_ptr[x + 1];
+ for (size_t x = 0; x < 2 * Nx; x += 2)
+ {
+ out_ptr[x + 1] = -out_ptr[x + 1];
+ }
}
}
- }
- else
- {
- // Shuffle the entire row into the buffer
- memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float));
-
- // Copy the buffer to the output, with a zero imaginary part
- for(size_t x = 0; x < 2 * Nx; x += 2)
+ else
{
- out_ptr[x] = buffer_row[x / 2];
+ // Shuffle the entire row into the buffer
+ memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float));
+
+ // Copy the buffer to the output, with a zero imaginary part
+ for (size_t x = 0; x < 2 * Nx; x += 2)
+ {
+ out_ptr[x] = buffer_row[x / 2];
+ }
}
- }
- },
- out);
+ },
+ out);
}
void NEFFTDigitReverseKernel::run(const Window &window, const ThreadInfo &info)
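
The lambdas above only apply a precomputed permutation: the idx tensor holds the digit-reversed positions for whatever mixed-radix decomposition the FFT planner picked. For a pure radix-2 length this reduces to plain bit reversal, sketched here as a hypothetical helper (not the library's index generator):

#include <cstddef>
#include <vector>

// Bit-reversal permutation for N a power of two: index i maps to the
// integer whose binary digits are those of i in reverse order.
std::vector<unsigned int> bit_reverse_indices(std::size_t N)
{
    unsigned int bits = 0;
    while ((std::size_t(1) << bits) < N)
    {
        ++bits;
    }
    std::vector<unsigned int> idx(N);
    for (std::size_t i = 0; i < N; ++i)
    {
        unsigned int r = 0;
        for (unsigned int b = 0; b < bits; ++b)
        {
            r |= static_cast<unsigned int>((i >> b) & 1u) << (bits - 1u - b); // mirror bit b
        }
        idx[i] = r;
    }
    return idx;
}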
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
index f436c364b2..ecf85ebc98 100644
--- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -70,7 +71,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
index 44c841f626..4b58a7b9ac 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
@@ -28,10 +28,11 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/traits.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "support/ToolchainSupport.h"
#include <arm_neon.h>
@@ -70,7 +71,7 @@ float32x2_t c_mul_neon(float32x2_t a, float32x2_t b)
{
using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
- const float32x2_t mask = { -1.0, 1.0 };
+ const float32x2_t mask = {-1.0, 1.0};
const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
@@ -88,7 +89,7 @@ float32x2_t c_mul_neon_img(float32x2_t a, float img_constant)
const float a_r = wrapper::vgetlane(a, 0);
const float a_i = wrapper::vgetlane(a, 1);
- const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant });
+ const auto out = wrapper::vmul(float32x2_t{-a_i, a_r}, float32x2_t{img_constant, img_constant});
return out;
}
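
Both helpers above are textbook complex arithmetic on {real, imag} lane pairs; the mask and lane shuffles are a branch-free NEON encoding of the same formulas. Scalar equivalents for reference (illustrative only):

// c_mul_neon: full complex product a * b.
void c_mul_ref(const float a[2], const float b[2], float out[2])
{
    out[0] = a[0] * b[0] - a[1] * b[1]; // real part
    out[1] = a[0] * b[1] + a[1] * b[0]; // imaginary part
}

// c_mul_neon_img: product of a with the purely imaginary constant i*img.
void c_mul_img_ref(const float a[2], float img, float out[2])
{
    out[0] = -a[1] * img;
    out[1] = a[0] * img;
}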
@@ -100,7 +101,8 @@ float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_
return wrapper::vadd(t2, e);
}
-float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
+float32x2_t reduce_sum_7(
+ float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
{
const auto t0 = wrapper::vadd(x1, x2);
const auto t1 = wrapper::vadd(x3, x4);
@@ -111,7 +113,14 @@ float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32
return wrapper::vadd(t00, t01);
}
-float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8)
+float32x2_t reduce_sum_8(float32x2_t x1,
+ float32x2_t x2,
+ float32x2_t x3,
+ float32x2_t x4,
+ float32x2_t x5,
+ float32x2_t x6,
+ float32x2_t x7,
+ float32x2_t x8)
{
const auto t0 = wrapper::vadd(x1, x2);
const auto t1 = wrapper::vadd(x3, x4);
@@ -141,15 +150,21 @@ void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w,
x = wrapper::vadd(a, b);
x = wrapper::vadd(x, c);
- const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c));
- const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 }, wrapper::vsub(b, c));
+ const auto v1 = wrapper::vmul(float32x2_t{0.5f, 0.5}, wrapper::vadd(b, c));
+ const auto v2 = c_mul_neon(float32x2_t{0.f, -kSqrt3Div2}, wrapper::vsub(b, c));
y = z = wrapper::vsub(a, v1);
y = wrapper::vadd(y, v2);
z = wrapper::vsub(z, v2);
}
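
With the shared sub-expressions v1 and v2, fft_3 is the standard DFT-3 butterfly. Writing the already-twiddled inputs as a, b, c, the outputs are

    X_0 = a + b + c, \qquad X_{1,2} = a - \tfrac{1}{2}(b + c) \mp i\,\tfrac{\sqrt{3}}{2}\,(b - c)

which is exactly x, y and z above, with v1 = (b + c)/2 and v2 = -i(sqrt(3)/2)(b - c).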
-void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3)
+void fft_4(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ const float32x2_t &w,
+ const float32x2_t &w2,
+ const float32x2_t &w3)
{
float32x2_t a = x1;
float32x2_t b = c_mul_neon(w, x2);
@@ -173,7 +188,15 @@ void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, c
x4 = wrapper::vadd(x41, x42);
}
-void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4)
+void fft_5(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ float32x2_t &x5,
+ const float32x2_t &w,
+ const float32x2_t &w2,
+ const float32x2_t &w3,
+ const float32x2_t &w4)
{
const auto a = x1;
const auto b = c_mul_neon(w, x2);
@@ -181,25 +204,25 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
const auto d = c_mul_neon(w3, x4);
const auto e = c_mul_neon(w4, x5);
- const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, b);
- const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, b);
- const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, b);
- const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, b);
+ const auto b0 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, b);
+ const auto b1 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, b);
+ const auto b2 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, b);
+ const auto b3 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, b);
- const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c);
- const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c);
- const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c);
- const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c);
+ const auto c0 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, c);
+ const auto c1 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, c);
+ const auto c2 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, c);
+ const auto c3 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, c);
- const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d);
- const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d);
- const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d);
- const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d);
+ const auto d0 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, d);
+ const auto d1 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, d);
+ const auto d2 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, d);
+ const auto d3 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, d);
- const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e);
- const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e);
- const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e);
- const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e);
+ const auto e0 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, e);
+ const auto e1 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, e);
+ const auto e2 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, e);
+ const auto e3 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, e);
x1 = reduce_sum_5(a, b, c, d, e);
x2 = reduce_sum_5(a, b0, c0, d0, e0);
@@ -208,9 +231,19 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
x5 = reduce_sum_5(a, b3, c3, d3, e3);
}
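
The kW5_* coefficients (and likewise kW7_* in fft_7 below) are simply the roots of unity of the corresponding prime-size DFT written out, since

    X_k = \sum_{n=0}^{4} x_n\, e^{-2\pi i n k / 5}, \qquad k = 0, \dots, 4.

Assuming the usual naming, kW5_0 = cos(2*pi/5), kW5_1 = sin(2*pi/5), kW5_2 = cos(pi/5) and kW5_3 = sin(pi/5), so every twiddle e^{-2\pi i n k/5} is one of the sign patterns of those two pairs that appear in the table above.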
-void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3,
+void fft_7(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ float32x2_t &x5,
+ float32x2_t &x6,
+ float32x2_t &x7,
+ const float32x2_t &w,
+ const float32x2_t &w2,
+ const float32x2_t &w3,
const float32x2_t &w4,
- const float32x2_t &w5, const float32x2_t &w6)
+ const float32x2_t &w5,
+ const float32x2_t &w6)
{
const auto a = x1;
const auto b = c_mul_neon(w, x2);
@@ -220,47 +253,47 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
const auto f = c_mul_neon(w5, x6);
const auto g = c_mul_neon(w6, x7);
- const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, b);
- const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, b);
- const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, b);
- const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, b);
- const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, b);
- const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, b);
-
- const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c);
- const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c);
- const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c);
- const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c);
- const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c);
- const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c);
-
- const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d);
- const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d);
- const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d);
- const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d);
- const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d);
- const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d);
-
- const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e);
- const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e);
- const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e);
- const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e);
- const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e);
- const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e);
-
- const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f);
- const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f);
- const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f);
- const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f);
- const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f);
- const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f);
-
- const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g);
- const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g);
- const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g);
- const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g);
- const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g);
- const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g);
+ const auto b0 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, b);
+ const auto b1 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, b);
+ const auto b2 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, b);
+ const auto b3 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, b);
+ const auto b4 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, b);
+ const auto b5 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, c);
+ const auto c1 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, c);
+ const auto c2 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, c);
+ const auto c3 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, c);
+ const auto c4 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, c);
+ const auto c5 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, d);
+ const auto d1 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, d);
+ const auto d2 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, d);
+ const auto d3 = c_mul_neon(float32x2_t{-kW7_2, +kW7_3}, d);
+ const auto d4 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, d);
+ const auto d5 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, e);
+ const auto e1 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, e);
+ const auto e2 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, e);
+ const auto e3 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, e);
+ const auto e4 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, e);
+ const auto e5 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, e);
+
+ const auto f0 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, f);
+ const auto f1 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, f);
+ const auto f2 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, f);
+ const auto f3 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, f);
+ const auto f4 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, f);
+ const auto f5 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, f);
+
+ const auto g0 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, g);
+ const auto g1 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, g);
+ const auto g2 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, g);
+ const auto g3 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, g);
+ const auto g4 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, g);
+ const auto g5 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, g);
x1 = reduce_sum_7(a, b, c, d, e, f, g);
x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
@@ -271,9 +304,20 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
}
-void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2,
+void fft_8(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ float32x2_t &x5,
+ float32x2_t &x6,
+ float32x2_t &x7,
+ float32x2_t &x8,
+ const float32x2_t &w,
+ const float32x2_t &w2,
const float32x2_t &w3,
- const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6,
+ const float32x2_t &w4,
+ const float32x2_t &w5,
+ const float32x2_t &w6,
const float32x2_t &w7)
{
const auto a = x1;
@@ -285,61 +329,61 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
const auto g = c_mul_neon(w6, x7);
const auto h = c_mul_neon(w7, x8);
- const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, b);
- const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b);
- const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, b);
- const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b);
- const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, b);
- const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b);
- const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, b);
-
- const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c);
- const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c);
- const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c);
- const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c);
- const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c);
- const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c);
- const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c);
-
- const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d);
- const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d);
- const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d);
- const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d);
- const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d);
- const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d);
- const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d);
-
- const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e);
- const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e);
- const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e);
- const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e);
- const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e);
- const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e);
- const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e);
-
- const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f);
- const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f);
- const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f);
- const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f);
- const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f);
- const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f);
- const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f);
-
- const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g);
- const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g);
- const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g);
- const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g);
- const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g);
- const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g);
- const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g);
-
- const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h);
- const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h);
- const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h);
- const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h);
- const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h);
- const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h);
- const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h);
+ const auto b0 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, b);
+ const auto b1 = c_mul_neon(float32x2_t{0, -1}, b);
+ const auto b2 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, b);
+ const auto b3 = c_mul_neon(float32x2_t{-1, 0}, b);
+ const auto b4 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, b);
+ const auto b5 = c_mul_neon(float32x2_t{0, 1}, b);
+ const auto b6 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{0, -1}, c);
+ const auto c1 = c_mul_neon(float32x2_t{-1, 0}, c);
+ const auto c2 = c_mul_neon(float32x2_t{0, 1}, c);
+ const auto c3 = c_mul_neon(float32x2_t{1, 0}, c);
+ const auto c4 = c_mul_neon(float32x2_t{0, -1}, c);
+ const auto c5 = c_mul_neon(float32x2_t{-1, 0}, c);
+ const auto c6 = c_mul_neon(float32x2_t{0, 1}, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, d);
+ const auto d1 = c_mul_neon(float32x2_t{0, 1}, d);
+ const auto d2 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, d);
+ const auto d3 = c_mul_neon(float32x2_t{-1, 0}, d);
+ const auto d4 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, d);
+ const auto d5 = c_mul_neon(float32x2_t{0, -1}, d);
+ const auto d6 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{-1, 0}, e);
+ const auto e1 = c_mul_neon(float32x2_t{1, 0}, e);
+ const auto e2 = c_mul_neon(float32x2_t{-1, 0}, e);
+ const auto e3 = c_mul_neon(float32x2_t{1, 0}, e);
+ const auto e4 = c_mul_neon(float32x2_t{-1, 0}, e);
+ const auto e5 = c_mul_neon(float32x2_t{1, 0}, e);
+ const auto e6 = c_mul_neon(float32x2_t{-1, 0}, e);
+
+ const auto f0 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, f);
+ const auto f1 = c_mul_neon(float32x2_t{0, -1}, f);
+ const auto f2 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, f);
+ const auto f3 = c_mul_neon(float32x2_t{-1, 0}, f);
+ const auto f4 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, f);
+ const auto f5 = c_mul_neon(float32x2_t{0, 1}, f);
+ const auto f6 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, f);
+
+ const auto g0 = c_mul_neon(float32x2_t{0, 1}, g);
+ const auto g1 = c_mul_neon(float32x2_t{-1, 0}, g);
+ const auto g2 = c_mul_neon(float32x2_t{0, -1}, g);
+ const auto g3 = c_mul_neon(float32x2_t{1, 0}, g);
+ const auto g4 = c_mul_neon(float32x2_t{0, 1}, g);
+ const auto g5 = c_mul_neon(float32x2_t{-1, 0}, g);
+ const auto g6 = c_mul_neon(float32x2_t{0, -1}, g);
+
+ const auto h0 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, h);
+ const auto h1 = c_mul_neon(float32x2_t{0, 1}, h);
+ const auto h2 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, h);
+ const auto h3 = c_mul_neon(float32x2_t{-1, 0}, h);
+ const auto h4 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, h);
+ const auto h5 = c_mul_neon(float32x2_t{0, -1}, h);
+ const auto h6 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, h);
x1 = reduce_sum_8(a, b, c, d, e, f, g, h);
x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
@@ -352,18 +396,19 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
}
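
The fft_8 table is even more regular: the 8th roots of unity have real and imaginary parts drawn only from {0, ±1, ±sqrt(2)/2}, namely

    e^{-2\pi i k/8} \in \{\,1,\ \tfrac{\sqrt{2}}{2}(1 - i),\ -i,\ -\tfrac{\sqrt{2}}{2}(1 + i),\ -1,\ -\tfrac{\sqrt{2}}{2}(1 - i),\ i,\ \tfrac{\sqrt{2}}{2}(1 + i)\,\},

which is why every factor above is built from kSqrt2Div2, 0 and ±1.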
template <bool first_stage>
-void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_2_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- auto a = float32x2_t{ 0, 0 };
- auto b = float32x2_t{ 0, 0 };
+ auto a = float32x2_t{0, 0};
+ auto b = float32x2_t{0, 0};
// Load inputs
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
a = wrapper::vgetlow(ab);
@@ -379,7 +424,7 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
fft_2(a, b, w);
// Write outputs
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
}
@@ -394,12 +439,20 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_2_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -418,20 +471,21 @@ void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_3_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
// Load inputs
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- if(first_stage)
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
a = wrapper::vgetlow(ab);
@@ -447,7 +501,7 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
// Base-case prime transform
fft_3(a, b, c, w, w2);
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
}
@@ -462,14 +516,22 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_3_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -489,21 +551,22 @@ void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_4_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
const auto w3 = c_mul_neon(w2, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- if(first_stage)
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -524,7 +587,7 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
// Base-case prime transform
fft_4(a, b, c, d, w, w2, w3);
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -542,15 +605,23 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_4_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
const auto w3 = c_mul_neon(w2, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -572,25 +643,26 @@ void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_5_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
const float32x2_t w4 = c_mul_neon(w3, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- float32x2_t e = { 0, 0 };
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ float32x2_t e = {0, 0};
// Load inputs
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -613,7 +685,7 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
fft_5(a, b, c, d, e, w, w2, w3, w4);
// Store outputs
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -632,16 +704,24 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_5_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
const float32x2_t w4 = c_mul_neon(w3, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -666,10 +746,11 @@ void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_7_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -677,18 +758,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w5 = c_mul_neon(w4, w);
const float32x2_t w6 = c_mul_neon(w5, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- float32x2_t e = { 0, 0 };
- float32x2_t f = { 0, 0 };
- float32x2_t g = { 0, 0 };
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ float32x2_t e = {0, 0};
+ float32x2_t f = {0, 0};
+ float32x2_t g = {0, 0};
// Load inputs
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -715,7 +796,7 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
// Base-case prime transform
fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -737,10 +818,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_7_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -748,7 +837,7 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w5 = c_mul_neon(w4, w);
const float32x2_t w6 = c_mul_neon(w5, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -777,10 +866,11 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_8_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -789,20 +879,20 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w6 = c_mul_neon(w5, w);
const float32x2_t w7 = c_mul_neon(w6, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
// Load inputs
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- float32x2_t e = { 0, 0 };
- float32x2_t f = { 0, 0 };
- float32x2_t g = { 0, 0 };
- float32x2_t h = { 0, 0 };
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ float32x2_t e = {0, 0};
+ float32x2_t f = {0, 0};
+ float32x2_t g = {0, 0};
+ float32x2_t h = {0, 0};
// Base-case prime transform
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -834,7 +924,7 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
// Store outputs
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -858,10 +948,18 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_8_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -870,7 +968,7 @@ void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w6 = c_mul_neon(w5, w);
const float32x2_t w7 = c_mul_neon(w6, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -908,7 +1006,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_UNUSED(config);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -917,11 +1015,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
{
ARM_COMPUTE_UNUSED(config);
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output, *input);
}
@@ -942,7 +1041,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis0(const FFTRadixStageKernelInfo
// FFT table axis 0: [radix, first_stage]
static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0;
- if(fft_table_axis0.empty())
+ if (fft_table_axis0.empty())
{
fft_table_axis0[2][false] = &fft_radix_2_axes_0<false>;
fft_table_axis0[3][false] = &fft_radix_3_axes_0<false>;
@@ -967,7 +1066,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis1(const FFTRadixStageKernelInfo
// FFT table axis 1: [radix, first_stage]
static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1;
- if(fft_table_axis1.empty())
+ if (fft_table_axis1.empty())
{
fft_table_axis1[2] = &fft_radix_2_axes_1;
fft_table_axis1[3] = &fft_radix_3_axes_1;
@@ -985,12 +1084,13 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
// Output auto initialization if not yet initialized
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output->info(), *input->info()->clone());
}
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
_input = input;
_output = (output == nullptr) ? input : output;
@@ -998,7 +1098,7 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT
_axis = config.axis;
_radix = config.radix;
- switch(config.axis)
+ switch (config.axis)
{
case 0:
set_radix_stage_axis0(config);
@@ -1012,26 +1112,28 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT
}
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config);
+ auto win_config =
+ validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
-Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+Status NEFFTRadixStageKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const FFTRadixStageKernelInfo &config)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (run_in_place) ? nullptr : output->clone().get(),
- config)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config)
+ .first);
return Status{};
}
std::set<unsigned int> NEFFTRadixStageKernel::supported_radix()
{
- return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+ return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
}
void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
@@ -1049,28 +1151,32 @@ void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
// Precompute FFT constants
const unsigned int NxRadix = _radix * _Nx;
const float alpha = 2.0f * kPi / float(NxRadix);
- const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+ const float32x2_t w_m{cosf(alpha), -sinf(alpha)};
- if(_axis == 0)
+ if (_axis == 0)
{
const unsigned int N = _input->info()->dimension(0);
- execute_window_loop(input_window, [&](const Coordinates &)
- {
- _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N);
- },
- in, out);
+ execute_window_loop(
+ input_window,
+ [&](const Coordinates &) {
+ _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m,
+ N);
+ },
+ in, out);
}
else
{
const unsigned int N = _input->info()->dimension(0);
const unsigned int M = _input->info()->dimension(1);
- execute_window_loop(input_window, [&](const Coordinates &)
- {
- _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N, M,
- _input->info()->padding().right + _input->info()->padding().left,
- _output->info()->padding().right + _output->info()->padding().left);
- },
- in, out);
+ execute_window_loop(
+ input_window,
+ [&](const Coordinates &)
+ {
+ _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N,
+ M, _input->info()->padding().right + _input->info()->padding().left,
+ _output->info()->padding().right + _output->info()->padding().left);
+ },
+ in, out);
}
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
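
For orientation: each stage processes butterflies in groups of NxRadix = _radix * _Nx points, and the twiddle w starts at 1 + 0i and is rotated by w_m = e^{-i*alpha}, with alpha = 2*pi/NxRadix, once per column j (the update presumably sits at the end of the j-loop, outside the hunks shown). A scalar sketch of that recurrence, assuming the {real, imag} layout used throughout:

#include <cmath>

struct Complex2
{
    float re, im;
};

// One step of the twiddle recurrence: w *= w_m, with w_m = e^{-i*alpha}.
Complex2 next_twiddle(Complex2 w, float alpha)
{
    const Complex2 w_m{std::cos(alpha), -std::sin(alpha)};
    return Complex2{w.re * w_m.re - w.im * w_m.im, w.re * w_m.im + w.im * w_m.re};
}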
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.h b/src/core/NEON/kernels/NEFFTRadixStageKernel.h
index 2291a1068c..54f32efa23 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.h
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/NEON/INEKernel.h"
#include <arm_neon.h>
@@ -92,8 +93,17 @@ private:
void set_radix_stage_axis0(const FFTRadixStageKernelInfo &config);
void set_radix_stage_axis1(const FFTRadixStageKernelInfo &config);
- using FFTFunctionPointerAxis0 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>;
- using FFTFunctionPointerAxis1 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int, unsigned int, unsigned int, unsigned int)>;
+ using FFTFunctionPointerAxis0 =
+ std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>;
+ using FFTFunctionPointerAxis1 = std::function<void(float *,
+ float *,
+ unsigned int,
+ unsigned int,
+ const float32x2_t &,
+ unsigned int,
+ unsigned int,
+ unsigned int,
+ unsigned int)>;
FFTFunctionPointerAxis0 _func_0;
FFTFunctionPointerAxis1 _func_1;
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
index 5ec330bebc..9fe561fc59 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
@@ -28,9 +28,10 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -41,8 +42,8 @@ namespace
void scale_complex(float *c_in, float *c_out, bool is_conjugate, float scale)
{
const auto a = wrapper::vload(c_in);
- auto b = wrapper::vdiv(a, float32x2_t{ scale, scale });
- if(is_conjugate)
+ auto b = wrapper::vdiv(a, float32x2_t{scale, scale});
+ if (is_conjugate)
{
const float img_part = wrapper::vgetlane(b, 1);
b = wrapper::vsetlane(-img_part, b, 1);
@@ -56,7 +57,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -71,7 +72,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// Configure kernel window
Window win = calculate_max_window(*input, Steps());
- if(output != nullptr)
+ if (output != nullptr)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
@@ -126,10 +127,10 @@ void NEFFTScaleKernel::run(const Window &window, const ThreadInfo &info)
Iterator in(_input, input_window);
Iterator out(_run_in_place ? _input : _output, input_window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale);
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ { scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale); },
+ in, out);
}
} // namespace arm_compute
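
scale_complex is the tail of an inverse FFT: every {real, imag} sample is divided by the scale factor (typically the transform length) and, when the inverse is realised via the conjugation trick, the imaginary part is negated. Scalar equivalent for reference:

// out = in / scale, optionally conjugated afterwards.
void scale_complex_ref(const float in[2], float out[2], bool is_conjugate, float scale)
{
    out[0] = in[0] / scale;
    out[1] = (is_conjugate ? -in[1] : in[1]) / scale;
}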
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.h b/src/core/NEON/kernels/NEFFTScaleKernel.h
index 24a19f98ba..608cf5ea34 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.h
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_NEFFTSCALEKERNEL_H
#define ARM_COMPUTE_NEFFTSCALEKERNEL_H
-#include "src/core/NEON/INEKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/NEON/INEKernel.h"
+
namespace arm_compute
{
// Forward declarations
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 1c7c1f9763..00b0c0ae8d 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -30,14 +30,19 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
namespace arm_compute
{
namespace
{
-inline void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
+inline void fill_constant_value_single_channel_special(ITensor *tensor,
+ const Window &window,
+ unsigned int right,
+ unsigned int bottom,
+ const PixelValue &constant_border_value)
{
float border_value;
constant_border_value.get(border_value);
@@ -52,39 +57,43 @@ inline void fill_constant_value_single_channel_special(ITensor *tensor, const Wi
Iterator vertical_it(tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates &)
- {
- const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset());
+ execute_window_loop(
+ vertical,
+ [&](const Coordinates &)
+ {
+ const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset());
- // Fill left and right borders
- *(row_start - 1) = border_value;
- std::fill_n(row_start + width, right, border_value);
- },
- vertical_it);
+ // Fill left and right borders
+ *(row_start - 1) = border_value;
+ std::fill_n(row_start + width, right, border_value);
+ },
+ vertical_it);
// Top and bottom border
Iterator plane_it(tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + plane_it.offset();
- // Top border
- const auto row_start = reinterpret_cast<float *>(base_addr - stridey);
- // Fill top rows including left/right borders
- std::fill_n(row_start - 1, 1 + width + right, border_value);
-
- // Bottom border
- const unsigned low_border_size = height + bottom;
- for(unsigned int i = height; i < low_border_size; ++i)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey);
-
- // Fill bottom rows including left/right borders
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ const auto row_start = reinterpret_cast<float *>(base_addr - stridey);
+ // Fill top rows including left/right borders
std::fill_n(row_start - 1, 1 + width + right, border_value);
- }
- },
- plane_it);
+
+ // Bottom border
+ const unsigned low_border_size = height + bottom;
+ for (unsigned int i = height; i < low_border_size; ++i)
+ {
+ const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey);
+
+ // Fill bottom rows including left/right borders
+ std::fill_n(row_start - 1, 1 + width + right, border_value);
+ }
+ },
+ plane_it);
}
} // namespace
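
Stripped of iterators and byte strides, the fast path above does two passes: left/right columns of every valid row, then whole top/bottom rows with the corners included. A minimal sketch (plane points at the first valid element; a one-element border is assumed on the left and top, with right/bottom possibly wider, matching the special case's preconditions):

// Constant border fill for one F32 plane, left = top = 1.
void fill_constant_border_ref(float *plane, int width, int height, int stride, int right, int bottom, float value)
{
    for (int y = 0; y < height; ++y)
    {
        plane[y * stride - 1] = value; // left border
        for (int r = 0; r < right; ++r)
        {
            plane[y * stride + width + r] = value; // right border
        }
    }
    for (int x = -1; x < width + right; ++x)
    {
        plane[-stride + x] = value; // top row, corners included
        for (int b = 0; b < bottom; ++b)
        {
            plane[(height + b) * stride + x] = value; // bottom rows
        }
    }
}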
@@ -93,14 +102,20 @@ NEFillBorderKernel::NEFillBorderKernel()
{
}
-void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorderKernel::configure(ITensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
_tensor = tensor;
configure(tensor->info(), border_size, border_mode, constant_border_value);
}
-void NEFillBorderKernel::configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorderKernel::configure(ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
// Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
@@ -124,7 +139,7 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
// If there is no border: early exit
- if(_border_size.empty())
+ if (_border_size.empty())
{
return;
}
@@ -132,13 +147,14 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_mode)
+ switch (_mode)
{
case BorderMode::CONSTANT:
{
- if(_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32)
+ if (_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32)
{
- fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
+ fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom,
+ _constant_border_value);
}
else
{
@@ -176,46 +192,56 @@ void NEFillBorderKernel::fill_replicate_single_channel(const Window &window)
Iterator vertical_it(_tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + vertical_it.offset();
- // Fill left and right borders
- for(unsigned int i = 0; i < _border_size.left; ++i)
+ execute_window_loop(
+ vertical,
+ [&](const Coordinates &)
{
- std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(), element_size);
- }
+ uint8_t *base_addr = start_valid_region + vertical_it.offset();
+ // Fill left and right borders
+ for (unsigned int i = 0; i < _border_size.left; ++i)
+ {
+ std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(),
+ element_size);
+ }
- for(unsigned int i = 0; i < _border_size.right; ++i)
- {
- std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, element_size);
- }
- },
- vertical_it);
+ for (unsigned int i = 0; i < _border_size.right; ++i)
+ {
+ std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size,
+ element_size);
+ }
+ },
+ vertical_it);
// Top and bottom border
Iterator plane_it(_tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + plane_it.offset();
- // Top border
- for(int i = -_border_size.top; i < 0; ++i)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- // Copy top rows including left/right borders
- std::memcpy(base_addr + i * static_cast<int>(_tensor->info()->strides_in_bytes()[1]) - _border_size.left * element_size,
- base_addr - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
- }
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ for (int i = -_border_size.top; i < 0; ++i)
+ {
+ // Copy top rows including left/right borders
+ std::memcpy(base_addr + i * static_cast<int>(_tensor->info()->strides_in_bytes()[1]) -
+ _border_size.left * element_size,
+ base_addr - _border_size.left * element_size,
+ (_border_size.left + width + _border_size.right) * element_size);
+ }
- // Bottom border
- for(unsigned int i = height; i < height + _border_size.bottom; ++i)
- {
- // Copy bottom rows including left/right borders
- std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
- base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
- }
- },
- plane_it);
+ // Bottom border
+ for (unsigned int i = height; i < height + _border_size.bottom; ++i)
+ {
+ // Copy bottom rows including left/right borders
+ std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
+ base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] -
+ _border_size.left * element_size,
+ (_border_size.left + width + _border_size.right) * element_size);
+ }
+ },
+ plane_it);
}
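Replicate mode, by contrast, copies the outermost valid pixels outward instead of writing a constant. A reduced single-row sketch of the idea (float elements assumed for brevity; the kernel itself memcpy's element_size bytes, so it works for any type):

// Hypothetical single-row replicate: extend the first/last valid elements of
// 'row' into 'left' and 'right' border slots on either side.
void replicate_row_borders(float *row, unsigned int width, unsigned int left, unsigned int right)
{
    for (unsigned int i = 1; i <= left; ++i)
    {
        *(row - i) = row[0]; // leftmost valid pixel
    }
    for (unsigned int i = 0; i < right; ++i)
    {
        row[width + i] = row[width - 1]; // rightmost valid pixel
    }
}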
void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window)
@@ -232,50 +258,57 @@ void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window
Iterator vertical_it(_tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + vertical_it.offset();
- // Fill left and right borders
- for(unsigned int i = 0; i < _border_size.left; ++i)
+ execute_window_loop(
+ vertical,
+ [&](const Coordinates &)
{
- std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value, element_size);
- }
+ uint8_t *base_addr = start_valid_region + vertical_it.offset();
+ // Fill left and right borders
+ for (unsigned int i = 0; i < _border_size.left; ++i)
+ {
+ std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value,
+ element_size);
+ }
- for(unsigned int i = 0; i < _border_size.right; ++i)
- {
- std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size);
- }
- },
- vertical_it);
+ for (unsigned int i = 0; i < _border_size.right; ++i)
+ {
+ std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size);
+ }
+ },
+ vertical_it);
// Top and bottom border
Iterator plane_it(_tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + plane_it.offset();
- // Top border
- for(int i = -_border_size.top; i < 0; ++i)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- // Fill top rows including left/right borders
- for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ for (int i = -_border_size.top; i < 0; ++i)
{
- std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+ // Fill top rows including left/right borders
+ for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ {
+ std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size,
+ &_constant_border_value, element_size);
+ }
}
- }
- // Bottom border
- const unsigned low_border_size = height + _border_size.bottom;
- for(unsigned int i = height; i < low_border_size; ++i)
- {
- // Fill bottom rows including left/right borders
- for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ // Bottom border
+ const unsigned low_border_size = height + _border_size.bottom;
+ for (unsigned int i = height; i < low_border_size; ++i)
{
- std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+ // Fill bottom rows including left/right borders
+ for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ {
+ std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size,
+ &_constant_border_value, element_size);
+ }
}
- }
- },
- plane_it);
+ },
+ plane_it);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.h b/src/core/NEON/kernels/NEFillBorderKernel.h
index 2c851583ed..aaad108bfa 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.h
+++ b/src/core/NEON/kernels/NEFillBorderKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -64,7 +65,10 @@ public:
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*
*/
- void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ITensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Initialise the function.
*
* @note This kernel fills the borders within the XY-planes.
@@ -75,7 +79,10 @@ public:
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*
*/
- void configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
index 0d3244c409..cbe5136fb1 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,10 +29,14 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/fuse_batch_normalization/list.h"
#include <map>
@@ -40,10 +44,121 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+struct FuseBatchNormalizeSelectorData
+{
+ DataType dt;
+ DataLayout dl;
+ FuseBatchNormalizationType fbn_type;
+ cpuinfo::CpuIsaInfo isa;
+};
+
+using FBNSelectorPtr = std::add_pointer<bool(const FuseBatchNormalizeSelectorData &data)>::type;
+using FBNUKernelPtr = std::add_pointer<void(const ITensor *,
+ const ITensor *,
+ ITensor *,
+ ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ float,
+ const Window &)>::type;
+
+struct FBNUKernel
+{
+ const char *name;
+ const FBNSelectorPtr is_selected;
+ FBNUKernelPtr ukernel;
+};
+
+static const FBNUKernel available_kernels[] = {
+ {"fused_batch_normalization_conv_NHWC_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)},
+ {"fused_batch_normalization_conv_NCHW_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)},
+ {"fused_batch_normalization_dwc_NHWC_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16)},
+ {"fused_batch_normalization_dwc_NCHW_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16)},
+ {"fused_batch_normalization_conv_NHWC_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NHWC &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)},
+ {"fused_batch_normalization_conv_NCHW_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NCHW &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)},
+ {"fused_batch_normalization_dwc_NHWC_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NHWC &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32)},
+ {"fused_batch_normalization_dwc_NCHW_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NCHW &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32)}};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return The matching micro-kernel, or nullptr if no kernel fits the selection data
+ */
+const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data)
+{
+ for (const auto &uk : available_kernels)
+ {
+ if (uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
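For reference, this selector is driven from configure() later in the same diff; condensed, the call site looks like the sketch below (all identifiers are the ones introduced above):

// Pick the micro-kernel matching the weights' data type/layout, the fuse type
// and the runtime ISA, then cache the function pointer for run().
const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{
    input_weights->info()->data_type(), input_weights->info()->data_layout(),
    fbn_type, CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
_func = uk->ukernel; // later invoked as (*_func)(..., epsilon, window)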
+
+Status validate_arguments(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -54,43 +169,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1);
- if(fbn_type == FuseBatchNormalizationType::CONVOLUTION)
+ if (fbn_type == FuseBatchNormalizationType::CONVOLUTION)
{
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0));
}
else
{
- const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t channel_idx =
+ get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0));
}
// Validate bias
- if(input_bias != nullptr)
+ if (input_bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias);
}
// Validate beta
- if(bn_beta != nullptr)
+ if (bn_beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta);
}
// Validate gamma
- if(bn_gamma != nullptr)
+ if (bn_gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma);
}
// Validate output weights
- if(fused_weights != nullptr && fused_weights->total_size() != 0)
+ if (fused_weights != nullptr && fused_weights->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights);
}
// Validate output bias
- if(fused_bias != nullptr && fused_bias->total_size() != 0)
+ if (fused_bias != nullptr && fused_bias->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias);
@@ -99,330 +215,34 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
return Status{};
}
-template <typename VectorType>
-void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
-{
- using ScalarType = typename VectorType::scalar_type;
- const int size = 16 / conv_weights->info()->element_size();
- using ExactTagType = typename VectorType::tag_type;
-
- const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == conv_weights);
- const bool run_in_place_bias = (fused_bias == nullptr) || (conv_bias != nullptr && fused_bias == conv_bias);
-
- // Set build options
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = size;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Iterator conv_w_in(conv_weights, win);
- Iterator conv_w_out(run_in_place_weights ? conv_weights : fused_weights, win);
-
- const auto conv_bias_in = (conv_bias != nullptr ? reinterpret_cast<ScalarType *>(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
- auto conv_bias_out = (run_in_place_bias ? conv_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
-
- const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
- auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
- auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
- auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{});
- auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
- auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
- const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{});
-
- auto mean = ScalarType(0.0);
- auto var = ScalarType(0.0);
- auto gamma = ScalarType(1.0);
- auto beta = ScalarType(0.0);
- auto conv_bias_in_scalar = ScalarType(0.0);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- var = input_var[id[3]];
- if(input_gamma != nullptr)
- {
- gamma = input_gamma[id[3]];
- }
-
- if((id[0] == 0) && (id[1] == 0) && (id[2] == 0))
- {
- if(input_beta != nullptr)
- {
- beta = input_beta[id[3]];
- beta_vec = wrapper::vdup_n(beta, ExactTagType{});
- }
-
- // Construct vectors
- mean = input_mean[id[3]];
- mean_vec = wrapper::vdup_n(mean, ExactTagType{});
-
- if(conv_bias_in != nullptr)
- {
- conv_bias_in_scalar = conv_bias_in[id[3]];
- }
- auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
- conv_bias_out[id[3]] = (conv_bias_tmp_scalar * gamma) + beta;
- }
-
- int x = window_start_x;
- auto conv_w_in_ptr = reinterpret_cast<const ScalarType *>(conv_w_in.ptr());
- auto conv_w_out_ptr = reinterpret_cast<ScalarType *>(conv_w_out.ptr());
- var_vec = wrapper::vdup_n(var, ExactTagType{});
- gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
- rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto wn = wrapper::vloadq(conv_w_in_ptr + x);
- wn = wrapper::vmul(wn, rvar_vec);
- wn = wrapper::vmul(wn, gamma_vec);
-
- // Store results
- wrapper::vstore(conv_w_out_ptr + x, wn);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
- }
- },
- conv_w_in, conv_w_out);
-}
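Behind both the deleted templates and their replacement micro-kernels is the standard batch-norm folding; a scalar sketch of the per-channel arithmetic, using the same defaults the code applies when an input is absent (gamma = 1, beta = bias = mean = 0):

#include <cmath>

// Fold batch normalization into convolution parameters for one channel:
//   w' = w * gamma / sqrt(var + epsilon)
//   b' = (b - mean) / sqrt(var + epsilon) * gamma + beta
inline void fuse_bn_channel(float &weight, float &bias,
                            float mean, float var, float gamma, float beta, float epsilon)
{
    const float inv_std = 1.0f / std::sqrt(var + epsilon);
    bias   = (bias - mean) * inv_std * gamma + beta;
    weight = weight * inv_std * gamma;
}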
-
-template <typename VectorType>
-void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
-{
- using ScalarType = typename VectorType::scalar_type;
- const int size = 16 / dwc_weights->info()->element_size();
- using ExactTagType = typename VectorType::tag_type;
-
- const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == dwc_weights);
- const bool run_in_place_bias = (fused_bias == nullptr) || (dwc_bias != nullptr && fused_bias == dwc_bias);
-
- // Set build options
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = size;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Iterator dwc_w_in(dwc_weights, win);
- Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win);
-
- const auto dwc_bias_in = (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
- auto dwc_bias_out = (run_in_place_bias ? dwc_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
-
- const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
- auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
- auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
- auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{});
- auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
- auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
- auto dwc_bias_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
- const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{});
-
- auto gamma = ScalarType(1.0);
- auto beta = ScalarType(0.0);
- auto dwc_bias_in_scalar = ScalarType(0);
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- var_vec = wrapper::vloadq(input_var + x);
- if(input_gamma != nullptr)
- {
- gamma_vec = wrapper::vloadq(input_gamma + x);
- }
-
- if((id[2] == 0) && (id[1] == 0))
- {
- mean_vec = wrapper::vloadq(input_mean + x);
-
- // Construct vectors
- if(input_beta != nullptr)
- {
- beta_vec = wrapper::vloadq(input_beta + x);
- }
-
- if(dwc_bias_in != nullptr)
- {
- dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x);
- }
-
- auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec), wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)));
- dwc_bias_tmp_vec = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec);
- wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec);
- }
-
- auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
- auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
-
- auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
- rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
- wn = wrapper::vmul(wn, rvar_vec);
- wn = wrapper::vmul(wn, gamma_vec);
-
- // Store results
- wrapper::vstore(dwc_w_out_ptr + x, wn);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto var = input_var[x];
- if(input_gamma != nullptr)
- {
- gamma = input_gamma[x];
- }
-
- if(id[2] == 0 && id[1] == 0)
- {
- auto mean = input_mean[x];
- if(input_beta != nullptr)
- {
- beta = input_beta[x];
- }
- if(dwc_bias_in != nullptr)
- {
- dwc_bias_in_scalar = dwc_bias_in[x];
- }
-
- auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
- dwc_bias_out[x] = (dwc_bias_tmp_scalar * gamma) + beta;
- }
-
- const auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
- auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
-
- *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
- }
- },
- dwc_w_in, dwc_w_out);
-}
-
-template <typename VectorType>
-void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
-{
- using ScalarType = typename VectorType::scalar_type;
- const int size = 16 / dwc_weights->info()->element_size();
- using ExactTagType = typename VectorType::tag_type;
-
- const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == dwc_weights);
- const bool run_in_place_bias = (fused_bias == nullptr) || (dwc_bias != nullptr && fused_bias == dwc_bias);
-
- // Set build options
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = size;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Iterator dwc_w_in(dwc_weights, win);
- Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win);
-
- const auto dwc_bias_in = (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
- auto dwc_bias_out = (run_in_place_bias ? dwc_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
-
- const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
- auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
- auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
- auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{});
- auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
- auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
- const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{});
-
- auto mean = ScalarType(0.0);
- auto var = ScalarType(0.0);
- auto gamma = ScalarType(1.0);
- auto beta = ScalarType(0.0);
- auto dwc_bias_in_scalar = ScalarType(0.0);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- var = input_var[id[2]];
- if(input_gamma != nullptr)
- {
- gamma = input_gamma[id[2]];
- }
-
- if(id[1] == 0)
- {
- mean = input_mean[id[2]];
-
- // Construct vectors
- mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- if(input_beta != nullptr)
- {
- beta = input_beta[id[2]];
- beta_vec = wrapper::vdup_n(beta, ExactTagType{});
- }
-
- if(dwc_bias_in != nullptr)
- {
- dwc_bias_in_scalar = dwc_bias_in[id[2]];
- }
-
- auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
- dwc_bias_out[id[2]] = (dwc_bias_tmp_scalar * gamma) + beta;
- }
-
- int x = window_start_x;
- auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
- auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
- var_vec = wrapper::vdup_n(var, ExactTagType{});
- gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
- rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
- wn = wrapper::vmul(wn, rvar_vec);
- wn = wrapper::vmul(wn, gamma_vec);
-
- // Store results
- wrapper::vstore(dwc_w_out_ptr + x, wn);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
- }
- },
- dwc_w_in, dwc_w_out);
-}
-
} // namespace
NEFuseBatchNormalizationKernel::NEFuseBatchNormalizationKernel()
- : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
- _run_in_place_weights(false), _run_in_place_bias(false), _func(nullptr)
+ : _input_weights(nullptr),
+ _input_bias(nullptr),
+ _bn_mean(nullptr),
+ _bn_var(nullptr),
+ _bn_gamma(nullptr),
+ _bn_beta(nullptr),
+ _fused_weights(nullptr),
+ _fused_bias(nullptr),
+ _epsilon(),
+ _run_in_place_weights(false),
+ _run_in_place_bias(false),
+ _func(nullptr)
{
}
-void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var,
- ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -440,65 +260,49 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con
_run_in_place_bias = (fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
// Auto initialize outputs
- if(_fused_weights != nullptr)
+ if (_fused_weights != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone());
}
- if(_fused_bias != nullptr)
+ if (_fused_bias != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
}
// Validate arguments
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(),
- (fused_weights != nullptr) ? fused_weights->info() : nullptr,
- (fused_bias != nullptr) ? fused_bias->info() : nullptr,
- (input_bias != nullptr) ? input_bias->info() : nullptr,
- (bn_beta != nullptr) ? bn_beta->info() : nullptr,
- (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
- epsilon, fbn_type));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input_weights->info(), bn_mean->info(), bn_var->info(),
+ (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+ (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr,
+ (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, epsilon,
+ fbn_type));
+
+ const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{
+ input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa()});
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+ ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
+ _func = uk->ukernel;
// Configure kernel window
Window win = calculate_max_window(*input_weights->info());
INEKernel::configure(win);
-
- // Configure function
- static std::map<std::string, FuseBatchNormFunction *> map_function =
- {
- { "fused_batch_normalization_conv_NHWC_F32", &fused_batch_normalization_conv<wrapper::traits::neon_vector<float, 4>> },
- { "fused_batch_normalization_conv_NCHW_F32", &fused_batch_normalization_conv<wrapper::traits::neon_vector<float, 4>> },
- { "fused_batch_normalization_dwc_NHWC_F32", &fused_batch_normalization_dwc_nhwc<wrapper::traits::neon_vector<float, 4>> },
- { "fused_batch_normalization_dwc_NCHW_F32", &fused_batch_normalization_dwc_nchw<wrapper::traits::neon_vector<float, 4>> },
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- { "fused_batch_normalization_conv_NHWC_F16", &fused_batch_normalization_conv<wrapper::traits::neon_vector<float16_t, 8>> },
- { "fused_batch_normalization_conv_NCHW_F16", &fused_batch_normalization_conv<wrapper::traits::neon_vector<float16_t, 8>> },
- { "fused_batch_normalization_dwc_NHWC_F16", &fused_batch_normalization_dwc_nhwc<wrapper::traits::neon_vector<float16_t, 8>> },
- { "fused_batch_normalization_dwc_NCHW_F16", &fused_batch_normalization_dwc_nchw<wrapper::traits::neon_vector<float16_t, 8>> },
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- };
-
- std::string function_to_call("fused_batch_normalization_");
- function_to_call += fbn_type == FuseBatchNormalizationType::CONVOLUTION ? "conv_" : "dwc_";
- function_to_call += string_from_data_layout(_input_weights->info()->data_layout());
- function_to_call += "_";
- function_to_call += string_from_data_type(_input_weights->info()->data_type());
-
- auto it = map_function.find(function_to_call);
-
- if(it != map_function.end())
- {
- _func = it->second;
- }
}
-Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
return Status{};
}
@@ -507,6 +311,9 @@ void NEFuseBatchNormalizationKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, window);
+
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon,
+ window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
index ee767b01c8..f23280d55a 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
@@ -66,9 +66,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
*/
- void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias = nullptr,
+ const ITensor *bn_beta = nullptr,
+ const ITensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalizationKernel
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -86,10 +93,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -107,8 +120,16 @@ private:
bool _run_in_place_weights;
bool _run_in_place_bias;
- using FuseBatchNormFunction = void(const ITensor *input_weights, const ITensor *input_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window);
+ using FuseBatchNormFunction = void(const ITensor *input_weights,
+ const ITensor *input_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window);
FuseBatchNormFunction *_func;
};
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
deleted file mode 100644
index 9011680c9b..0000000000
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- if(output->total_size() != 0)
- {
- TensorShape output_shape = input->tensor_shape();
- output_shape.set(0, input->dimension(0) * 4);
- output_shape.set(1, std::ceil(input->dimension(1) / 4.0f));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel()
- : _func(nullptr)
-{
-}
-
-void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info())));
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- switch(input->info()->element_size())
- {
- case 1:
- _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint8_t>;
- break;
- case 2:
- _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint16_t>;
- break;
- case 4:
- _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint32_t>;
- break;
- default:
- ARM_COMPUTE_ERROR_ON("Element size not supported");
- break;
- }
-
- Window win = calculate_max_window(*input->info(), Steps(1, 4));
-
- INEKernel::configure(win);
-}
-
-Status NEGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-
- return Status{};
-}
-
-template <typename ScalarType>
-void NEGEMMInterleave4x4Kernel::gemm_interleave4x4(const ITensor *input, ITensor *output, const Window &window)
-{
- const size_t window_start_x = window.x().start();
- const size_t window_end_x = window.x().end();
-
- const size_t in_height = input->info()->dimension(1);
- const size_t in_stride = input->info()->strides_in_bytes()[1];
-
- const size_t partial_y = in_height % 4;
-
- // Set window for the input tensor
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Set window for the output tensor
- Window win_out(window);
- win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_out.scale(Window::DimY, 0.25f);
-
- Iterator in(input, win);
- Iterator out(output, win_out);
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- if(id.y() + 4 <= static_cast<int>(in_height))
- {
- for(size_t x = window_start_x; x < window_end_x; ++x)
- {
- const ScalarType data[4] =
- {
- *(reinterpret_cast<const ScalarType *>(in.ptr() + 0 * in_stride) + x),
- *(reinterpret_cast<const ScalarType *>(in.ptr() + 1 * in_stride) + x),
- *(reinterpret_cast<const ScalarType *>(in.ptr() + 2 * in_stride) + x),
- *(reinterpret_cast<const ScalarType *>(in.ptr() + 3 * in_stride) + x),
- };
- std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType));
- }
- }
- else
- {
- for(size_t x = window_start_x; x < window_end_x; ++x)
- {
- ScalarType data[4] = { 0, 0, 0, 0 };
-
- for(size_t y = 0; y < partial_y; ++y)
- {
- data[y] = *(reinterpret_cast<const ScalarType *>(in.ptr() + y * in_stride) + x);
- }
-
- std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType));
- }
- }
- },
- in, out);
-}
-
-void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
- /*
- * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
- * |a00 a01 a02 a03|
- * |a10 a11 a12 a13|
- * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 |
- * |a30 a31 a32 a33|
- *
- * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
- */
- (this->*_func)(_input, _output, window);
-}
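As a plain reference for the interleave the deleted kernel performed (consistent with validate_arguments() above: dimension 0 grows by 4x, dimension 1 shrinks to ceil(height / 4), and partial 4-row blocks are zero-padded):

#include <cstddef>
#include <vector>

// Reference 4x4 interleave: input element (x, y) lands at
// column x * 4 + (y % 4), row y / 4 of the output.
std::vector<float> interleave4x4_ref(const std::vector<float> &in, std::size_t width, std::size_t height)
{
    const std::size_t out_rows = (height + 3) / 4;      // ceil(height / 4)
    std::vector<float> out(out_rows * width * 4, 0.0f); // zero-pads partial blocks
    for (std::size_t y = 0; y < height; ++y)
    {
        for (std::size_t x = 0; x < width; ++x)
        {
            out[(y / 4) * (width * 4) + x * 4 + (y % 4)] = in[y * width + x];
        }
    }
    return out;
}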
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
deleted file mode 100644
index e592d5ef6e..0000000000
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H
-#define ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H
-
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to interleave the elements of a matrix
- *
- * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccccccccccc}
- * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
- * \end{array} \right)
- * @f]
- *
- * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
- */
-class NEGEMMInterleave4x4Kernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMInterleave4x4Kernel";
- }
- /** Constructor */
- NEGEMMInterleave4x4Kernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMInterleave4x4Kernel(const NEGEMMInterleave4x4Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMInterleave4x4Kernel &operator=(const NEGEMMInterleave4x4Kernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMInterleave4x4Kernel(NEGEMMInterleave4x4Kernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMInterleave4x4Kernel &operator=(NEGEMMInterleave4x4Kernel &&) = default;
- /** Default destructor */
- ~NEGEMMInterleave4x4Kernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel
- *
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run gemm interleave 4x4
- *
- * @tparam ScalarType Scalar datatype
- *
- * @param[in] input Input tensor. Data types supported: uint32_t, uint16_t and uint8_t
- * @param[out] output Output tensor. Data types supported: uint32_t, uint16_t and uint8_t
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename ScalarType>
- void gemm_interleave4x4(const ITensor *input, ITensor *output, const Window &window);
-
- /** Common signature for all the specialised gemm interleave 4x4 functions
- *
- * @param[in] input Input tensor. Data types supported: uint32_t, uint16_t and uint8_t
- * @param[out] output Output tensor. Data types supported: uint32_t, uint16_t and uint8_t
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- using GEMMInterleaveFunctionFuncPtr = void (NEGEMMInterleave4x4Kernel::*)(const ITensor *input, ITensor *output, const Window &window);
-
- GEMMInterleaveFunctionFuncPtr _func;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H*/
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
deleted file mode 100644
index b95bdd4ca5..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ /dev/null
@@ -1,1052 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-namespace
-{
-void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
-{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- if(id.x() > width_b)
- {
- return;
- }
-
-        // Note: Since the inputs are all positive, we can use uint32_t
- // Accumulators for the block 0
- uint32x4x4_t c0 =
- {
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
- };
-
- auto vec_a = reinterpret_cast<const uint8_t *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const uint8_t *>(inb.ptr());
- auto vec_a_end_addr = vec_a + width_a;
-
- // This for loop performs 8 accumulations
- for(; vec_a <= (vec_a_end_addr - 8);)
- {
- const uint8x8_t a00_u8 = vld1_u8(vec_a);
- const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b);
- const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b);
- const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b);
- const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b);
- const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b);
- const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b);
- const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b);
- const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b);
-
- // Convert a00_u8 to uint16_t and get the lower part
- const uint16x4x2_t a00_u16 =
- {
- {
- vget_low_u16(vmovl_u8(a00_u8)),
- vget_high_u16(vmovl_u8(a00_u8))
- }
- };
-
- const uint16x4x4_t b00_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
- }
- };
-
- const uint16x4x4_t b10_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))
- }
- };
-
- const uint16x4x4_t b20_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))
- }
- };
-
- const uint16x4x4_t b30_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))
- }
- };
-
- const uint16x4x4_t b40_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))
- }
- };
-
- const uint16x4x4_t b50_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))
- }
- };
-
- const uint16x4x4_t b60_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))
- }
- };
-
- const uint16x4x4_t b70_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))
- }
- };
-
- // Accumulate 0:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0);
-
- // Accumulate 1:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1);
-
- // Accumulate 2:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2);
-
- // Accumulate 3:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3);
-
- // Accumulate 4:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0);
-
- // Accumulate 5:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1);
-
- // Accumulate 6:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2);
-
- // Accumulate 7:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3);
-
- vec_a += 8;
- matrix_b += 8 * stride_b;
- }
-
- // This for loop performs the left-over accumulations
- for(; vec_a < vec_a_end_addr;)
- {
- const uint8x8_t a00_u8 = vld1_dup_u8(vec_a);
- const uint8x16_t b00_u8 = vld1q_u8(matrix_b);
-
- const uint16x4x4_t b00_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
- }
- };
-
- // Convert a00_u8 to uint16_t and get the lower part
- const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
-
- // Accumulate 0:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
-
- vec_a += 1;
- matrix_b += stride_b;
- }
-
- auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
- if(id.x() < (width_out - 16))
- {
- vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
- vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
- vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
- vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
- }
- else
- {
- auto left_over = width_out - id.x();
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
- {
- *(vec_out + k * 4 + j) = c0.val[k][j];
- }
- }
- }
- },
- ina, inb, out);
-}
-
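Both vector-by-matrix paths above (u8 and the s8 variant that follows) are built from a single idiom: widen the 8-bit data to 16 bits with `vmovl`, then use `vmlal_lane` to multiply sixteen widened B values by one lane of the widened A vector while accumulating into four 32-bit vectors. A self-contained sketch of one such step (hypothetical helper name, mirroring the "Accumulate 0" blocks):

```cpp
#include <arm_neon.h>

// Illustrative sketch: add a_value * b_row into four uint32x4_t accumulators,
// i.e. one "Accumulate" step of vector_matrix_multiply_u8 above.
static inline void accumulate_row_u8(uint32x4_t acc[4], uint8x16_t b_row, uint8_t a_value)
{
    // Broadcast the A value as a 16-bit lane vector so vmlal_lane can select it.
    const uint16x4_t a_u16 = vdup_n_u16(static_cast<uint16_t>(a_value));

    // Widen the sixteen B values to two uint16x8_t halves.
    const uint16x8_t b_lo = vmovl_u8(vget_low_u8(b_row));
    const uint16x8_t b_hi = vmovl_u8(vget_high_u8(b_row));

    // Widening multiply-accumulate: u16 * u16 -> u32, added into the accumulators.
    acc[0] = vmlal_lane_u16(acc[0], vget_low_u16(b_lo), a_u16, 0);
    acc[1] = vmlal_lane_u16(acc[1], vget_high_u16(b_lo), a_u16, 0);
    acc[2] = vmlal_lane_u16(acc[2], vget_low_u16(b_hi), a_u16, 0);
    acc[3] = vmlal_lane_u16(acc[3], vget_high_u16(b_hi), a_u16, 0);
}
```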
-void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
-{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- if(id.x() > width_b)
- {
- return;
- }
-
- // Accumulators for the block 0
- int32x4x4_t c0 =
- {
- {
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
- }
- };
-
- auto vec_a = reinterpret_cast<const int8_t *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const int8_t *>(inb.ptr());
- auto vec_a_end_addr = vec_a + width_a;
-
- // This for loop performs 8 accumulations
- for(; vec_a <= (vec_a_end_addr - 8);)
- {
- const int8x8_t a00_s8 = vld1_s8(vec_a);
- const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b);
- const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b);
- const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b);
- const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b);
- const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b);
- const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b);
- const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b);
- const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b);
-
- // Convert a00_s8 to int16_t and split it into lower and higher halves
- const int16x4x2_t a00_s16 =
- {
- {
- vget_low_s16(vmovl_s8(a00_s8)),
- vget_high_s16(vmovl_s8(a00_s8))
- }
- };
-
- const int16x4x4_t b00_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
- }
- };
-
- const int16x4x4_t b10_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))
- }
- };
-
- const int16x4x4_t b20_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))
- }
- };
-
- const int16x4x4_t b30_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))
- }
- };
-
- const int16x4x4_t b40_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))
- }
- };
-
- const int16x4x4_t b50_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))
- }
- };
-
- const int16x4x4_t b60_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))
- }
- };
-
- const int16x4x4_t b70_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))
- }
- };
-
- // Accumulate 0:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0);
-
- // Accumulate 1:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1);
-
- // Accumulate 2:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2);
-
- // Accumulate 3:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3);
-
- // Accumulate 4:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0);
-
- // Accumulate 5:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1);
-
- // Accumulate 6:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2);
-
- // Accumulate 7:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3);
-
- vec_a += 8;
- matrix_b += 8 * stride_b;
- }
-
- // This for loop performs the left-over accumulations
- for(; vec_a < vec_a_end_addr;)
- {
- const int8x8_t a00_s8 = vld1_dup_s8(vec_a);
- const int8x16_t b00_s8 = vld1q_s8(matrix_b);
-
- const int16x4x4_t b00_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
- }
- };
-
- // Convert a00_s8 to int16_t and get the lower part
- const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
-
- // Accumulate 0:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
-
- vec_a += 1;
- matrix_b += stride_b;
- }
-
- auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
- if(id.x() < (width_out - 16))
- {
- vst1q_s32(vec_out + 0, c0.val[0]);
- vst1q_s32(vec_out + 4, c0.val[1]);
- vst1q_s32(vec_out + 8, c0.val[2]);
- vst1q_s32(vec_out + 12, c0.val[3]);
- }
- else
- {
- auto left_over = width_out - id.x();
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
- {
- *(vec_out + k * 4 + j) = c0.val[k][j];
- }
- }
- }
- },
- ina, inb, out);
-}
-
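Both vector paths finish with the same left-over store: when fewer than 16 results remain in the output row, lanes are written one at a time through direct vector subscripting (`c0.val[k][j]`), a GCC/Clang extension. A portable sketch of the equivalent tail store (hypothetical helper name, assuming `0 <= remaining <= 16`):

```cpp
#include <arm_neon.h>
#include <cstdint>
#include <cstring>

// Illustrative sketch: spill the four accumulators to a stack buffer and copy
// only the valid prefix of the 16 computed results to the output row.
static inline void store_tail_s32(int32_t *dst, const int32x4x4_t &c, int remaining)
{
    int32_t tmp[16];
    vst1q_s32(tmp + 0, c.val[0]);
    vst1q_s32(tmp + 4, c.val[1]);
    vst1q_s32(tmp + 8, c.val[2]);
    vst1q_s32(tmp + 12, c.val[3]);
    std::memcpy(dst, tmp, static_cast<size_t>(remaining) * sizeof(int32_t));
}
```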
-void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
-{
- const auto width_out = static_cast<int>(out_info.dimension(0));
- const auto height_out = static_cast<int>(out_info.dimension(1));
- const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size();
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const uint8_t *mtx_a0 = ina.ptr();
- const uint8_t *mtx_b0 = inb.ptr();
-
- // Note: Since the inputs are all non-negative, we can accumulate in uint32_t
- // Accumulators for the block 0
- uint32x4x4_t c0 =
- {
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
- };
-
- // Accumulators for the block 1
- uint32x4x4_t c1 =
- {
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
- };
-
- // Accumulators for the block 2
- uint32x4x4_t c2 =
- {
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
- };
-
- // Accumulators for the block 3
- uint32x4x4_t c3 =
- {
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
- };
-
- for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
- {
- const uint8x8_t a00_u8 = vld1_u8(mtx_a0);
- const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);
-
- // Convert a00_u8 to uint16_t and get the lower part
- const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
-
- // Convert b00_u8 to uint16_t
- const uint16x4x4_t b00_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
- }
- };
-
- // 4x4 block 0
- c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
-
- // 4x4 block 1
- c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1);
- c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1);
- c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1);
- c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1);
-
- // 4x4 block 2
- c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2);
- c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2);
- c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2);
- c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2);
-
- // 4x4 block 3
- c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3);
- c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3);
- c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3);
- c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3);
- }
-
- auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
-
- if(id.y() < height_out && id.x() < (width_out - 16))
- {
- vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0]));
- vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1]));
- vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2]));
- vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3]));
- if(id.y() + 1 < height_out)
- {
- vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0]));
- vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1]));
- vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2]));
- vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3]));
- if(id.y() + 2 < height_out)
- {
- vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0]));
- vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1]));
- vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2]));
- vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3]));
- if(id.y() + 3 < height_out)
- {
- vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0]));
- vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1]));
- vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2]));
- vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3]));
- }
- }
- }
- }
- else
- {
- const auto left_over_value = width_out - id.x();
- auto left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
- {
- *(mtx_out + k * 4 + j) = c0.val[k][j];
- }
- }
- if(id.y() + 1 < height_out)
- {
- left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
- {
- *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
- }
- }
- if(id.y() + 2 < height_out)
- {
- left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
- {
- *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
- }
- }
- if(id.y() + 3 < height_out)
- {
- left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
- {
- *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
- }
- }
- }
- }
- }
- }
- },
- ina, inb, out);
-}
-
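The matrix paths compute a 4x16 output tile per window step: each inner iteration reads four A values (one per output row) and sixteen B values, then performs four rank-1 updates, one per output row, selected by lanes 0..3 of the widened A vector. A scalar reference of one tile makes the tiling explicit (illustrative only):

```cpp
#include <cstdint>

// Scalar reference for one 4x16 tile of the reshaped lowp GEMM: at each step
// the outer product of 4 A values and 16 B values is added to the int32 tile.
static void tile_4x16_reference(const uint8_t *a, const uint8_t *b, int32_t acc[4][16], int steps)
{
    for(int s = 0; s < steps; ++s, a += 4, b += 16)
    {
        for(int row = 0; row < 4; ++row)
        {
            for(int col = 0; col < 16; ++col)
            {
                acc[row][col] += static_cast<int32_t>(a[row]) * static_cast<int32_t>(b[col]);
            }
        }
    }
}
```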
-void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
-{
- const auto width_out = static_cast<int>(out_info.dimension(0));
- const auto height_out = static_cast<int>(out_info.dimension(1));
- const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size();
- // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW respectively
- // The reshaping of the matrices makes the implementation cache friendly and avoids the data re-arrangements needed to compute 16x4 elements per iteration
- // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
- auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
-
- // Note: With signed inputs, the accumulation is performed in int32_t
- // Accumulators for the block 0
- int32x4x4_t c0 =
- {
- {
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
- }
- };
-
- // Accumulators for the block 1
- int32x4x4_t c1 =
- {
- {
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
- }
- };
-
- // Accumulators for the block 2
- int32x4x4_t c2 =
- {
- {
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
- }
- };
-
- // Accumulators for the block 3
- int32x4x4_t c3 =
- {
- {
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
- }
- };
-
- for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
- {
- const int8x8_t a00_s8 = vld1_s8(mtx_a0);
- const int8x16_t b00_s8 = vld1q_s8(mtx_b0);
-
- // Convert a00_s8 to int16_t and get the lower part
- const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
-
- // Convert b00_s8 to int16_t
- const int16x4x4_t b00_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
- }
- };
-
- // 4x4 block 0
- c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
-
- // 4x4 block 1
- c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1);
- c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1);
- c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1);
- c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1);
-
- // 4x4 block 2
- c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2);
- c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2);
- c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2);
- c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2);
-
- // 4x4 block 3
- c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3);
- c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3);
- c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3);
- c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3);
- }
- auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
- if(id.y() < height_out && id.x() < (width_out - 16))
- {
- vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]);
- vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]);
- vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]);
- vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]);
- if(id.y() + 1 < height_out)
- {
- vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]);
- vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]);
- vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]);
- vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]);
- if(id.y() + 2 < height_out)
- {
- vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]);
- vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]);
- vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]);
- vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]);
- if(id.y() + 3 < height_out)
- {
- vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);
- vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]);
- vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]);
- vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]);
- }
- }
- }
- }
- else if(id.y() < height_out)
- {
- const auto left_over_value = width_out - id.x();
- auto left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
- {
- *(mtx_out + k * 4 + j) = c0.val[k][j];
- }
- }
- if(id.y() + 1 < height_out)
- {
- left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
- {
- *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
- }
- }
- if(id.y() + 2 < height_out)
- {
- left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
- {
- *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
- }
- }
- if(id.y() + 3 < height_out)
- {
- left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
- {
- *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
- }
- }
- }
- }
- }
- }
-
- },
- ina, inb, out);
-}
-} // namespace
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-
- TensorShape in0_shape = input0->tensor_shape();
- TensorShape in1_shape = input1->tensor_shape();
- TensorShape out_shape = output->tensor_shape();
-
- // Check vector-by-matrix case
- if(out_shape[1] == 1)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows");
- }
- else
- {
- in0_shape.collapse(2);
- in1_shape.collapse(2);
- out_shape.collapse(2);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches as the input0 tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches as input0 or its number of batches must be 1");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16");
- }
-
- return Status{};
-}
-} // namespace
-
-NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
-{
-}
-
-void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
-
- TensorShape in1_shape = input1->info()->tensor_shape();
- in1_shape.collapse(2);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _slide_matrix_b = in1_shape[2] != 1;
-
- constexpr unsigned int num_elems_processed_per_iteration_x = 16;
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-
- Window win;
-
- // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
- if((output->info()->dimension(1) == 1))
- {
- // Configure kernel window
- win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
- }
- else
- {
- win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- }
-
- INEKernel::configure(win);
-}
-
-Status NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
-
- return Status{};
-}
-
-void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
- if((_output->info()->dimension(1) == 1))
- {
- const auto width_matrix_a = static_cast<int>(_input0->info()->dimension(0));
- const auto width_matrix_b = static_cast<int>(_input1->info()->dimension(0));
- const auto width_out = static_cast<int>(_output->info()->dimension(0));
- const auto in_b_stride = static_cast<int>(_input1->info()->strides_in_bytes()[1] / data_size_from_type(_input1->info()->data_type()));
-
- // The implementation computes 16 elements per iteration
- const int window_start_x = 16 * info.thread_id;
- const int window_step_x = 16 * info.num_threads;
- // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
- const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
-
- Window win_out(window);
- win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
- win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Window win_a(window);
- win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Window win_b;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(_input1->info()->num_dimensions() >= 3)
- {
- win_b = window;
- }
- win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
- win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Iterator ina(_input0, win_a);
- Iterator inb(_input1, win_b);
- Iterator out(_output, win_out);
-
- switch(_input0->info()->data_type())
- {
- case DataType::S8:
- case DataType::QASYMM8_SIGNED:
- {
- vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
- break;
- }
- case DataType::U8:
- case DataType::QASYMM8:
- {
- vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
- }
- }
- else
- {
- const size_t in_b_stride = _input1->info()->strides_in_bytes()[1];
- const int width_b = _input1->info()->dimension(0);
-
- // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4, as the interleaved input matrix A has 4 times fewer rows than the output matrix
- Window win_a(window);
- win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1));
-
- // Set step_x and step_y for matrix B. Scale the X range by a factor of 16, as the transposed input matrix B has 16 times fewer columns than the output matrix
- Window win_b;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(_slide_matrix_b)
- {
- win_b = window;
- }
- win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride));
- win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- // The step x and step y for the output matrix have already been set in configure()
- Iterator ina(_input0, win_a);
- Iterator inb(_input1, win_b);
- Iterator out(_output, window);
-
- switch(_input0->info()->data_type())
- {
- case DataType::S8:
- case DataType::QASYMM8_SIGNED:
- {
- matrix_multiply_s8(ina, inb, out, width_b, *_output->info(), window);
- break;
- }
- case DataType::U8:
- case DataType::QASYMM8:
- {
- matrix_multiply_u8(ina, inb, out, width_b, *_output->info(), window);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
- }
- }
-}
-} // namespace arm_compute
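In run() above, the vector path splits the width of matrix B across threads in interleaved 16-element strips: each thread starts at 16 * thread_id and steps by 16 * num_threads, with the end rounded up so the range is step-aligned. A small sketch of that arithmetic, with a local stand-in for the library's ceil_to_multiple and assuming the start lies within the B width:

```cpp
// Illustrative stand-in for arm_compute::ceil_to_multiple.
static int ceil_to_multiple_i(int value, int divisor)
{
    return ((value + divisor - 1) / divisor) * divisor;
}

// Interleaved strip partitioning used by the vector path (hypothetical helper).
static void strip_range(int width_b, int thread_id, int num_threads, int &start, int &step, int &end)
{
    start = 16 * thread_id;                                    // first strip owned by this thread
    step  = 16 * num_threads;                                  // distance between this thread's strips
    end   = ceil_to_multiple_i(width_b - start, step) + start; // step-aligned end of the range
}
```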
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
deleted file mode 100644
index acfb79edeb..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to multiply matrices
- *
- * @note @ref NEGEMMLowpMatrixMultiplyKernel is the low precision matrix product kernel
- * This kernel performs the following computation:
- *
- * -# Convert a values from int8 to int32
- * -# Convert b values from int8 to int32
- * -# Compute the int32 matrix product of the resulting a * b and store the result as int32
- *
- */
-class NEGEMMLowpMatrixMultiplyKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpMatrixMultiplyKernel";
- }
- /** Constructor */
- NEGEMMLowpMatrixMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpMatrixMultiplyKernel(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpMatrixMultiplyKernel &operator=(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixMultiplyKernel(NEGEMMLowpMatrixMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixMultiplyKernel &operator=(NEGEMMLowpMatrixMultiplyKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpMatrixMultiplyKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two
- * kernels change the layout of the original matrices to be more cache-friendly.
- *
- * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- */
- void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyKernel
- *
- * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[in] output Output tensor info to store the result of matrix multiplication. Data type supported: S32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input0;
- const ITensor *_input1;
- ITensor *_output;
- bool _slide_matrix_b;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H*/
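For reference, a minimal sketch of how the removed kernel's static validate() was driven, using only the interface declared above; the shapes, data types and helper name are illustrative:

```cpp
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"

using namespace arm_compute;

// Hypothetical helper: checks whether an interleaved A, a transposed-1xW B
// (whose width must be a multiple of 16) and an S32 output form a valid
// configuration for the kernel declared above.
Status check_lowp_mm_config(const TensorShape &a_shape, const TensorShape &b_shape, const TensorShape &out_shape)
{
    const TensorInfo a(a_shape, 1, DataType::QASYMM8);
    const TensorInfo b(b_shape, 1, DataType::QASYMM8);
    const TensorInfo out(out_shape, 1, DataType::S32);
    return NEGEMMLowpMatrixMultiplyKernel::validate(&a, &b, &out);
}
```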
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
deleted file mode 100644
index 867beca0ac..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
- int32_t a_offset, int32_t b_offset)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
- }
-
- // If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
-
- // Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
- ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
-
- TensorShape output_shape = mm_result->tensor_shape();
- if(output_shape.num_dimensions() > 1)
- {
- const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
- TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
- vector_sum_row_shape.collapse_from(1);
- output_shape.collapse_from(output_batch_idx);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
- "mm_result tensor must have the same number of batches as the output tensor");
-
- if(a_offset != 0)
- {
- TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
- vector_sum_col_shape.collapse_from(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches as vector_sum_row or its number of batches must be 1");
- }
- }
- }
-
- return Status{};
-}
-
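run_offset_contribution below applies the standard quantized-GEMM decomposition: expanding the product of offset-shifted operands over the GEMM depth K separates the raw int32 accumulation from three correction terms, which are exactly what the specialized branches add back (a hedged summary consistent with the formula in the kernel's class documentation, with o_a = a_offset and o_b = b_offset):

```latex
\sum_{j=0}^{K-1}\left(a_{ij}+o_a\right)\left(b_{jk}+o_b\right)
  = \underbrace{\sum_{j} a_{ij}\,b_{jk}}_{\mathrm{mm\_result}[i][k]}
  + o_a \underbrace{\sum_{j} b_{jk}}_{\mathrm{vector\_sum\_col}[k]}
  + o_b \underbrace{\sum_{j} a_{ij}}_{\mathrm{vector\_sum\_row}[i]}
  + \underbrace{K\,o_a\,o_b}_{\mathrm{k\_offset}}
```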
-void run_offset_contribution(const Window &window,
- ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row,
- int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d)
-{
- Window collapsed_window = window.collapse_if_possible(window, Window::DimZ);
- collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
- const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16;
-
- Iterator mm_result_it(mm_result, collapsed_window);
-
- if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true
- {
- // Set window for vector_sum_col
- Window win_vector_sum_col(collapsed_window);
- win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Set window for vector_sum_row
- Window win_vector_sum_row(collapsed_window);
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
- Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
-
- const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
-
- // Offset in case vector_sum_col is batched
- const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
-
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
- auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
-
- // Compute the leftover term due to b_offset.
- int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input);
- b_offset_term_s32 *= b_offset;
-
- const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Compute the leftover term due to a_offset.
- int32x4x4_t a_offset_term_s32 =
- {
- {
- vld1q_s32(vector_sum_col_ptr + x + 0),
- vld1q_s32(vector_sum_col_ptr + x + 4),
- vld1q_s32(vector_sum_col_ptr + x + 8),
- vld1q_s32(vector_sum_col_ptr + x + 12)
- }
- };
-
- a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
- a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
- a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
- a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
-
- // Add a_offset_term_s32 and b_offset_term_s32
- int32x4x4_t offset_term_s32 =
- {
- {
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset)
- }
- };
-
- offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec));
- offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec));
- offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec));
- offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec));
-
- int32x4x4_t in_s32 =
- {
- {
- vld1q_s32(mm_result_ptr + x + 0),
- vld1q_s32(mm_result_ptr + x + 4),
- vld1q_s32(mm_result_ptr + x + 8),
- vld1q_s32(mm_result_ptr + x + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
-
- // Store the result with the offset contribution
- vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
- vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
- vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
- vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Compute the leftover term due to a_offset.
- int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
-
- a_offset_term_s32 *= a_offset;
-
- // Add the offset terms to GEMM's result
- // Store the result with the offset contribution
- mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32;
- }
- },
- vector_sum_col_it, vector_sum_row_it, mm_result_it);
- }
- else if((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
-
- // Set window for vector_sum_row
- Window win_vector_sum_row(collapsed_window);
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
-
- const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
-
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
-
- // Compute the leftover term due to b_offset.
- int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input);
- b_offset_term_s32 *= b_offset;
-
- const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- int32x4x4_t in_s32 =
- {
- {
- vld1q_s32(mm_result_ptr + x + 0),
- vld1q_s32(mm_result_ptr + x + 4),
- vld1q_s32(mm_result_ptr + x + 8),
- vld1q_s32(mm_result_ptr + x + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec);
-
- // Store the result with the offset contribution
- vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
- vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
- vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
- vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Add the offset terms to GEMM's result
- // Store the result with the offset contribution
- mm_result_ptr[x] += b_offset_term_s32;
- }
- },
- vector_sum_row_it, mm_result_it);
- }
- else if((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false
- {
- // Set window for vector_sum_col
- Window win_vector_sum_col(collapsed_window);
- win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
-
- // Offset in case vector_sum_col is batched
- const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
-
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
- auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Compute the leftover term due to a_offset.
- int32x4x4_t a_offset_term_s32 =
- {
- {
- vld1q_s32(vector_sum_col_ptr + x + 0),
- vld1q_s32(vector_sum_col_ptr + x + 4),
- vld1q_s32(vector_sum_col_ptr + x + 8),
- vld1q_s32(vector_sum_col_ptr + x + 12)
- }
- };
-
- a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
- a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
- a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
- a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
-
- int32x4x4_t in_s32 =
- {
- {
- vld1q_s32(mm_result_ptr + x + 0),
- vld1q_s32(mm_result_ptr + x + 4),
- vld1q_s32(mm_result_ptr + x + 8),
- vld1q_s32(mm_result_ptr + x + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
-
- // Store the result with the offset contribution
- vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
- vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
- vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
- vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Compute the leftover term due to a_offset.
- const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
-
- // Add the offset terms to GEMM's result
- // Store the result with the offset contribution
- mm_result_ptr[x] += a_offset_term_s32 * a_offset;
- }
- },
- vector_sum_col_it, mm_result_it);
- }
- else // false, false
- {
- // No offset contribution from matrix A and matrix B
- return;
- }
-}
-} // namespace
-
-NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel()
- : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true)
-{
-}
-
-void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
- a_offset, b_offset)); // NOLINT
-
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _mm_result = mm_result;
- _a_offset = a_offset;
- _b_offset = b_offset;
- _k_offset = a_offset * b_offset * k;
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- // Check if vector_sum_col_shape should be slid or not
- // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- _slide_vector_sum_col = vector_sum_col->info()->tensor_shape().num_dimensions() > 1;
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*mm_result->info(), Steps());
- INEKernel::configure(win);
-}
-
-Status NEGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
- int32_t a_offset, int32_t b_offset)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
-
- return Status{};
-}
-
-void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = _vector_sum_row != nullptr
- && _mm_result->info()->num_dimensions() > 1
- && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x();
-
- run_offset_contribution(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
deleted file mode 100644
index f71929fe9e..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * The final result is:
- *
- * mm_result[i][k] = mm_result[i][k] +
- * (vector_sum_col[k] * a_offset) +
- * (vector_sum_row[i] * b_offset) +
- * (a_offset * b_offset * k)
- *
- */
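A scalar reference of the documented in-place update (illustrative sketch; rows x cols is the shape of mm_result and k the GEMM depth):

```cpp
#include <cstdint>

// Hypothetical reference implementation of the formula above. Either sum
// vector may be null when its corresponding offset is zero, matching the
// nullptr semantics documented for configure()/validate().
static void offset_contribution_reference(int32_t *mm_result, const int32_t *vector_sum_col,
                                          const int32_t *vector_sum_row, int rows, int cols,
                                          int32_t k, int32_t a_offset, int32_t b_offset)
{
    const int32_t k_offset = a_offset * b_offset * k;
    for(int i = 0; i < rows; ++i)
    {
        for(int c = 0; c < cols; ++c)
        {
            mm_result[i * cols + c] += (vector_sum_col ? vector_sum_col[c] * a_offset : 0)
                                     + (vector_sum_row ? vector_sum_row[i] * b_offset : 0)
                                     + k_offset;
        }
    }
}
```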
-class NEGEMMLowpOffsetContributionKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpOffsetContributionKernel";
- }
- /** Constructor */
- NEGEMMLowpOffsetContributionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpOffsetContributionKernel(const NEGEMMLowpOffsetContributionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpOffsetContributionKernel &operator=(const NEGEMMLowpOffsetContributionKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpOffsetContributionKernel(NEGEMMLowpOffsetContributionKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpOffsetContributionKernel &operator=(NEGEMMLowpOffsetContributionKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpOffsetContributionKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in, out] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] k Number of matrix A columns or matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- */
- void configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionKernel
- *
- * @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_vector_sum_col;
- const ITensor *_vector_sum_row;
- ITensor *_mm_result;
- int32_t _a_offset;
- int32_t _b_offset;
- int32_t _k_offset;
- bool _slide_vector_sum_col;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
deleted file mode 100644
index dfed7f0bb8..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ /dev/null
@@ -1,959 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <map>
-
-namespace arm_compute
-{
-namespace
-{
-inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x)
-{
- return
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12)
- }
- };
-}
-
-inline int32x4x4_t load(const int32_t *ptr, int32_t x)
-{
- return
- {
- {
- vld1q_s32(ptr + x + 0),
- vld1q_s32(ptr + x + 4),
- vld1q_s32(ptr + x + 8),
- vld1q_s32(ptr + x + 12)
- }
- };
-}
-
-inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b)
-{
- return
- {
- {
- vaddq_s32(a.val[0], b),
- vaddq_s32(a.val[1], b),
- vaddq_s32(a.val[2], b),
- vaddq_s32(a.val[3], b)
- }
- };
-}
-
-inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b)
-{
- return
- {
- {
- vaddq_s32(a.val[0], b.val[0]),
- vaddq_s32(a.val[1], b.val[1]),
- vaddq_s32(a.val[2], b.val[2]),
- vaddq_s32(a.val[3], b.val[3])
- }
- };
-}
-
-inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar)
-{
- return
- {
- {
- vmulq_n_s32(a.val[0], mul_scalar),
- vmulq_n_s32(a.val[1], mul_scalar),
- vmulq_n_s32(a.val[2], mul_scalar),
- vmulq_n_s32(a.val[3], mul_scalar)
- }
- };
-}
-
-inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multiplier)
-{
- return
- {
- {
- vmulq_s32(a.val[0], vld1q_s32(multiplier)),
- vmulq_s32(a.val[1], vld1q_s32(multiplier + 4)),
- vmulq_s32(a.val[2], vld1q_s32(multiplier + 8)),
- vmulq_s32(a.val[3], vld1q_s32(multiplier + 12))
- }
- };
-}
-
-inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x)
-{
- int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x);
-
- a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
- a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
- a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
- a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
- return a_offset_term_s32;
-}
-
-inline int32x4_t get_b_offset(const int32_t *vector_sum_row_ptr, int32_t b_offset)
-{
- int32x4_t b_offset_term_s32 = vld1q_dup_s32(vector_sum_row_ptr);
- b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset);
- return b_offset_term_s32;
-}
-
-inline int32x4x4_t get_k_offset(int32_t k_offset)
-{
- return
- {
- {
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset)
- }
- };
-}
-
-inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
-{
- const static int32x4_t zero_s32 = vdupq_n_s32(0);
-
- // Shift final result (negative value shift right)
- in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
- in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
- in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
- in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
-
- // Saturate negative values
- in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
- in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
- in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
- in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
-
- // Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
-
- // Convert S16 to U8
- uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
-
- if(is_bounded_relu)
- {
- out_u8 = vmaxq_u8(out_u8, min_u8);
- out_u8 = vminq_u8(out_u8, max_u8);
- }
-
- return out_u8;
-}
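// [Editor's note] A minimal scalar sketch of the vectorised routine above, added for
// clarity and not part of the original diff. `shift` is assumed to be the positive
// right-shift amount (the vector path passes it to vshlq_s32 already negated).
#include <algorithm>
#include <cstdint>

inline uint8_t finalize_quantization_fp_scalar(int32_t in, int32_t shift, uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu)
{
    in >>= shift;                           // shift the scaled accumulator down
    in = std::max<int32_t>(in, 0);          // saturate negative values to zero
    in = std::min<int32_t>(in, 255);        // saturate to the unsigned 8-bit range
    uint8_t out = static_cast<uint8_t>(in);
    if(is_bounded_relu)
    {
        out = std::max(out, min_u8);        // lower bound of the bounded ReLU
        out = std::min(out, max_u8);        // upper bound of the bounded ReLU
    }
    return out;
}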
-
-inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
-{
- const static int32x4_t zero_s32 = vdupq_n_s32(0);
-
- // Shift final result (negative value shift right)
- in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
- in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
- in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
- in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
-
- // Saturate negative values
- in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
- in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
- in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
- in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
-
- // Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
-
- // Convert S16 to S8
- int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
-
- if(is_bounded_relu)
- {
- out_s8 = vmaxq_s8(out_s8, min_s8);
- out_s8 = vminq_s8(out_s8, max_s8);
- }
-
- return out_s8;
-}
-
-inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
-{
- const static int32x4_t zero_s32 = vdupq_n_s32(0);
-
- // Shift final result (negative value shift right)
- in_s32.val[0] = vshlq_s32(in_s32.val[0], vnegq_s32(result_shift_s32.val[0]));
- in_s32.val[1] = vshlq_s32(in_s32.val[1], vnegq_s32(result_shift_s32.val[1]));
- in_s32.val[2] = vshlq_s32(in_s32.val[2], vnegq_s32(result_shift_s32.val[2]));
- in_s32.val[3] = vshlq_s32(in_s32.val[3], vnegq_s32(result_shift_s32.val[3]));
-
- // Saturate negative values
- in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
- in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
- in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
- in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
-
- // Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
-
- // Convert S16 to S8
- int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
-
- if(is_bounded_relu)
- {
- out_s8 = vmaxq_s8(out_s8, min_s8);
- out_s8 = vminq_s8(out_s8, max_s8);
- }
-
- return out_s8;
-}
-
-template <typename T>
-struct VectorTyper
-{
- using stype = T;
- using vtype = typename wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128>;
-};
-
-inline Window get_win_vector_sum(const Window &window)
-{
- Window win_vector_sum(window);
- win_vector_sum.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum.set(Window::DimZ, Window::Dimension(0, 0, 0));
- return win_vector_sum;
-}
-
-inline Iterator get_vector_sum_col_it(const Window &window, const ITensor *vector_sum_col)
-{
- Iterator vector_sum_col_it(vector_sum_col, get_win_vector_sum(window));
- return vector_sum_col_it;
-}
-
-inline Iterator get_vector_sum_row_it(const Window &window, const ITensor *vector_sum_row)
-{
- Window win_vector_sum_row = get_win_vector_sum(window);
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
- return vector_sum_row_it;
-}
-
-inline Iterator get_bias_it(const Window &window, const ITensor *bias)
-{
- Window win_bias(window);
- win_bias.set(Window::DimY, Window::Dimension(0, 1, 1));
- win_bias.set(Window::DimZ, Window::Dimension(0, 1, 1));
- Iterator bias_it(bias, win_bias);
- return bias_it;
-}
-
-template <typename VT>
-inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
- const int32x4_t result_offset_s32, const int32x4_t result_shift_s32,
- typename VT::vtype min_vec, typename VT::vtype max_vec,
- int32_t a_offset, int32_t b_offset, int32_t k_offset,
- int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound,
- int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point)
-{
- int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
- if(!is_fixed_point)
- {
- // Combine quantization offset with other offsets.
- offset_term_s32 = add_s32(offset_term_s32, result_offset_s32);
- }
- if(has_a_offset && has_b_offset)
- {
- offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset));
- }
- if(has_b_offset)
- {
- offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset));
- }
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
-
- if(has_a_offset)
- {
- in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
- }
- if(has_bias)
- {
- in_s32 = add_s32(in_s32, load(bias_ptr, x));
- }
- if(!is_fixed_point || has_b_offset)
- {
- in_s32 = add_s32(in_s32, offset_term_s32);
- }
- if(!is_fixed_point)
- {
- in_s32 = mul_s32(in_s32, multiplier);
- }
-
- if(is_fixed_point)
- {
- wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
- finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu));
- }
- else
- {
- wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
- finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu));
- }
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
-
- if(has_a_offset)
- {
- in_value += (*(vector_sum_col_ptr + x) * a_offset);
- }
- if(has_bias)
- {
- in_value += *(bias_ptr + x);
- }
-
- if(is_fixed_point)
- {
- // Finalize and store the result
- *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset,
- static_cast<typename VT::stype>(min_bound),
- static_cast<typename VT::stype>(max_bound), is_bounded_relu);
- }
- else
- {
- // Finalize quantization
- in_value = (in_value * multiplier) >> shift;
-
- // Bound and store the result
- if(is_bounded_relu)
- {
- in_value = static_cast<typename VT::stype>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
- }
- *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = static_cast<typename VT::stype>(std::max<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()),
- std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value)));
- }
- }
-}
-
-inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
- const int32_t *result_multipliers, const int32_t *result_shifts,
- const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8,
- int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound,
- int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point)
-{
- int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
- if(!is_fixed_point)
- {
- // Combine quantization offset with other offsets.
- offset_term_s32 = add_s32(offset_term_s32, result_offset);
- }
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
-
- if(has_a_offset)
- {
- in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
- }
- if(has_bias)
- {
- in_s32 = add_s32(in_s32, load(bias_ptr, x));
- }
- if(!is_fixed_point)
- {
- in_s32 = add_s32(in_s32, offset_term_s32);
- in_s32 = mul_s32(in_s32, result_multipliers + x);
- }
-
- if(is_fixed_point)
- {
- vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu));
- }
- else
- {
- vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu));
- }
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
-
- if(has_a_offset)
- {
- in_value += (*(vector_sum_col_ptr + x) * a_offset);
- }
- if(has_bias)
- {
- in_value += *(bias_ptr + x);
- }
-
- if(is_fixed_point)
- {
- // Finalize and store the result
- *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu);
- }
- else
- {
- // Finalize quantization
- in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]);
-
- // Bound and store the result
- if(is_bounded_relu)
- {
- in_value = static_cast<int8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
- }
- *(out_it.ptr() + x) = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
- }
- }
-}
-
-template <typename T>
-void run_offset_contribution_output_stage(const Window &window,
- const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
- int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col,
- GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point)
-{
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
- using Typer = VectorTyper<T>;
-
- const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
- const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
-
- const int32_t multiplier = output_stage.gemmlowp_multiplier;
- const int32_t shift = output_stage.gemmlowp_shift;
- const int32_t offset = output_stage.gemmlowp_offset;
- const int32_t min_bound = output_stage.gemmlowp_min_bound;
- const int32_t max_bound = output_stage.gemmlowp_max_bound;
-
- const int32x4_t result_offset_s32 = vdupq_n_s32(offset);
- const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? shift : -shift);
- const auto min_vec = wrapper::vdup_n(static_cast<T>(min_bound), ExactTagType{});
- const auto max_vec = wrapper::vdup_n(static_cast<T>(max_bound), ExactTagType{});
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Window collapsed_window = win.collapse_if_possible(win, Window::DimZ);
-
- Iterator mm_result_it(mm_result, win);
- Iterator out_it(output, win);
-
- if((a_offset != 0) && (b_offset != 0))
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
- ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
-
- Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
- Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row);
-
- const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
-
- // Offset in case vector_sum_col is batched
- const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
-
- if(bias != nullptr)
- {
- Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
- const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
- + id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()),
- mm_result_it,
- out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
- }
- else
- {
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
- const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
- + id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
- }
- }
- else if((a_offset == 0) && (b_offset != 0))
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
-
- Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row);
-
- const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
-
- if(bias != nullptr)
- {
- Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
- + id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
- out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point);
- },
- vector_sum_row_it, bias_it, mm_result_it, out_it);
- }
- else
- {
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
- + id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, is_fixed_point);
- },
- vector_sum_row_it, mm_result_it, out_it);
- }
- }
- else if((a_offset != 0) && (b_offset == 0))
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
-
- Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
-
- // Offset in case vector_sum_col is batched
- const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
-
- if(bias != nullptr)
- {
- Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
- run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
- out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, bias_it, mm_result_it, out_it);
- }
- else
- {
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
- run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, mm_result_it, out_it);
- }
- }
- else
- {
- if(bias != nullptr)
- {
- Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates &)
- {
- run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point);
- },
- bias_it, mm_result_it, out_it);
- }
- else
- {
- execute_window_loop(collapsed_window, [&](const Coordinates &)
- {
- run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, is_fixed_point);
- },
- mm_result_it, out_it);
- }
- return;
- }
-}
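// [Editor's note] Summary of the dispatch above (annotation, not part of the original diff):
//   a_offset != 0 && b_offset != 0 -> needs vector_sum_col and vector_sum_row, plus the k_offset term
//   a_offset == 0 && b_offset != 0 -> needs vector_sum_row only
//   a_offset != 0 && b_offset == 0 -> needs vector_sum_col only
//   a_offset == 0 && b_offset == 0 -> pure output stage; the bias remains optional in every case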
-
-void run_offset_contribution_output_stage_symm(const Window &window,
- const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
- int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col,
- GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point)
-{
- ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset);
-
- const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
-
- const int32_t offset = output_stage.gemmlowp_offset;
- const int32_t min_bound = output_stage.gemmlowp_min_bound;
- const int32_t max_bound = output_stage.gemmlowp_max_bound;
-
- const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data();
- const int32_t *result_shifts = output_stage.gemmlowp_shifts.data();
- const int32x4_t result_offset_s32 = vdupq_n_s32(offset);
- const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(min_bound));
- const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(max_bound));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Window collapsed_window = win.collapse_if_possible(win, Window::DimZ);
-
- Iterator mm_result_it(mm_result, win);
- Iterator out_it(output, win);
-
- if(a_offset != 0)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
-
- Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
-
- // Offset in case vector_sum_col is batched
- const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
-
- if(bias != nullptr)
- {
- Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
- run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, bias_it, mm_result_it, out_it);
- }
- else
- {
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
- run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, mm_result_it, out_it);
- }
- }
- else
- {
- if(bias != nullptr)
- {
- Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates &)
- {
- run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point);
- },
- bias_it, mm_result_it, out_it);
- }
- else
- {
- execute_window_loop(collapsed_window, [&](const Coordinates &)
- {
- run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point);
- },
- mm_result_it, out_it);
- }
- return;
- }
-}
-
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
- int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
- if(output->data_type() != DataType::QASYMM8)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
- }
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
- }
-
- // If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
-
- // Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
- ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
-
- TensorShape output_shape = output->tensor_shape();
- if(output_shape.num_dimensions() > 1)
- {
- const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
-
- TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
- vector_sum_row_shape.collapse_from(1);
- output_shape.collapse_from(output_batch_idx);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
- "mm_result tensor must have the same number of batches of output tensor");
-
- if(a_offset != 0)
- {
- TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
- vector_sum_col_shape.collapse_from(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
- }
- }
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *output)
-{
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, mm_result->clone()->set_data_type(DataType::QASYMM8));
-
- // Configure kernel window
- Window win = calculate_max_window(*mm_result, Steps());
-
- // Note: This kernel performs 16 elements per iteration.
- // However, since we use a left-over for loop, we cannot have any read or write out of memory
- // For this reason num_elems_processed_per_iteration is 1 and so update_window_and_padding() can be skipped
-
- return std::make_pair(Status{}, win);
-}
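// [Editor's note] The vector/left-over pattern the comment above refers to, in generic form
// (annotation, not part of the original diff):
//
//   int x = window_start_x;
//   for(; x <= (window_end_x - window_step_x); x += window_step_x) { /* 16 elements via NEON */ }
//   for(; x < window_end_x; ++x)                                   { /* 1 element, scalar   */ }
//
// Since the scalar tail never touches memory past window_end_x, no tensor padding is needed
// and update_window_and_padding() can be skipped.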
-} // namespace
-
-NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageKernel()
- : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true),
- _output_stage(GEMMLowpOutputStageInfo())
-
-{
-}
-
-void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_result, const ITensor *vector_sum_col,
- const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
- int32_t k, int32_t a_offset, int32_t b_offset,
- GEMMLowpOutputStageInfo output_stage)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
- bias != nullptr ? bias->info() : nullptr, // NOLINT
- output->info(), a_offset, b_offset, output_stage)); // NOLINT
-
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _bias = bias;
- _mm_result = mm_result;
- _output = output;
- _a_offset = a_offset;
- _b_offset = b_offset;
- _k_offset = a_offset * b_offset * k;
- _output_stage = output_stage;
-
- // If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
- {
- // Check if vector_sum_col_shape should be slid or not
- // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape has more than 1
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- _slide_vector_sum_col = vector_sum_col->info()->tensor_shape().num_dimensions() > 1;
- }
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(mm_result->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-}
-
-Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
- const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
- int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), output->clone().get()).first);
- return Status{};
-}
-
-void NEGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(_output->info()->data_type());
- int32_t type_min_int = type_min.get<int32_t>();
- int32_t type_max_int = type_max.get<int32_t>();
-
- const bool reinterpret_as_3d = _vector_sum_row != nullptr
- && _mm_result->info()->num_dimensions() > 1
- && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x();
-
- const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int);
-
- // Check if we need to perform fixed point requantization
- const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
-
- // Check if the output is signed (QASYMM8_SIGNED)
- const bool is_signed = _output->info()->data_type() == DataType::QASYMM8_SIGNED;
-
- // Check if symmetric per-channel execution
- const bool is_symm = _output_stage.is_quantized_per_channel;
-
- if(is_symm)
- {
- run_offset_contribution_output_stage_symm(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
- reinterpret_as_3d, is_bounded_relu, is_fixed_point);
- }
- else
- {
- if(is_signed)
- {
- run_offset_contribution_output_stage<int8_t>(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
- reinterpret_as_3d, is_bounded_relu, is_fixed_point);
- }
- else
- {
- run_offset_contribution_output_stage<uint8_t>(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
- reinterpret_as_3d, is_bounded_relu, is_fixed_point);
- }
- }
-}
-
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
deleted file mode 100644
index 6908f37aad..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel used to add the offset contribution and perform the output stage after @ref NEGEMMLowpMatrixMultiplyKernel.
- *
- * The computation is performed in-place.
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8.
- * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8.
- *
- * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is:
- *
- * ((mm_result'[i][k] + result_offset) * result_mult_int) >> result_shift
- *
- * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is:
- *
- * (FixedPointMul(mm_result'[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * and mm_result'[i][k] = mm_result[i][k] +
- * (vector_sum_col[k] * a_offset) +
- * (vector_sum_row[i] * b_offset) +
- * (a_offset * b_offset * k)
- */
-
-class NEGEMMLowpOffsetContributionOutputStageKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpOffsetContributionOutputStageKernel";
- }
- /** Constructor */
- NEGEMMLowpOffsetContributionOutputStageKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpOffsetContributionOutputStageKernel(const NEGEMMLowpOffsetContributionOutputStageKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpOffsetContributionOutputStageKernel &operator=(const NEGEMMLowpOffsetContributionOutputStageKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpOffsetContributionOutputStageKernel(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpOffsetContributionOutputStageKernel &operator=(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpOffsetContributionOutputStageKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases are supported and it can be a nullptr if the addition of biases is not required.
- * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[out] output Output tensor containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] k Number of matrix A columns or matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
- */
- void configure(const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k, int32_t a_offset, int32_t b_offset,
- GEMMLowpOutputStageInfo output_stage);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionOutputStageKernel
- *
- * @param[in] mm_result Input tensor info containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
- * @param[in] vector_sum_col Tensor info for the input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Tensor info for the input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor info. Only shared biases are supported and it can be a nullptr if the addition of biases is not required.
- * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in] output Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset,
- int32_t b_offset,
- GEMMLowpOutputStageInfo output_stage);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Tensors and parameters set by configure() and used by run() */
- const ITensor *_vector_sum_col;
- const ITensor *_vector_sum_row;
- const ITensor *_bias;
- const ITensor *_mm_result;
- ITensor *_output;
- int32_t _a_offset;
- int32_t _b_offset;
- int32_t _k_offset;
- bool _slide_vector_sum_col;
- GEMMLowpOutputStageInfo _output_stage;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */
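// [Editor's usage sketch, not part of the original diff] A minimal, illustrative call to the
// static validate() of the kernel removed above. Shapes, offsets and output-stage parameters
// are assumptions chosen so that every check in validate_arguments() passes.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"

arm_compute::Status validate_example()
{
    using namespace arm_compute;
    const TensorInfo mm_result(TensorShape(8U, 4U), 1, DataType::S32);  // N = 8, M = 4
    const TensorInfo vector_sum_col(TensorShape(8U), 1, DataType::S32); // per-column sums of B
    const TensorInfo vector_sum_row(TensorShape(4U), 1, DataType::S32); // per-row sums of A
    const TensorInfo output(TensorShape(8U, 4U), 1, DataType::QASYMM8);

    GEMMLowpOutputStageInfo stage{};
    stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    stage.gemmlowp_multiplier = 1073741824; // roughly 0.5 in Q0.31 fixed point
    stage.gemmlowp_shift      = 1;
    stage.gemmlowp_offset     = 10;
    stage.gemmlowp_min_bound  = 0;
    stage.gemmlowp_max_bound  = 255;

    return NEGEMMLowpOffsetContributionOutputStageKernel::validate(
        &mm_result, &vector_sum_col, &vector_sum_row, /*bias=*/nullptr, &output,
        /*a_offset=*/2, /*b_offset=*/3, stage);
}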
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
deleted file mode 100644
index 84365ba25b..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ /dev/null
@@ -1,320 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
-
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
- || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
-
- // Check biases if exist
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- if(output->data_type() != output_stage->output_data_type && (output_stage->output_data_type == DataType::QASYMM8 || output_stage->output_data_type == DataType::QASYMM8_SIGNED))
- {
- ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types");
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-
-inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
-{
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
-
- // Multiply by result_mult_int
- in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int);
- in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int);
- in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int);
- in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int);
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value,
- typename wrapper::traits::neon_vector<T, 16>::type>::type
- convert_to_8bit(const int16x8x2_t in_s16)
-{
- return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1]));
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value,
- typename wrapper::traits::neon_vector<T, 16>::type>::type
- convert_to_8bit(const int16x8x2_t in_s16)
-{
- return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1]));
-}
-
-template <typename T>
-inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector<T, 16>::type min,
- typename wrapper::traits::neon_vector<T, 16>::type max)
-{
- // Shift final result (negative value shift right)
- in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
- in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
- in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
- in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
-
- // Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
-
- // Convert S16 to S8 or U8
- typename wrapper::traits::neon_vector<T, 16>::type out = convert_to_8bit<T>(in_s16);
-
- out = wrapper::vmax(out, min);
- out = wrapper::vmin(out, max);
-
- return out;
-}
-
-class Coordinates;
-
-template <typename T>
-void NEGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window)
-{
- using VectorType = typename wrapper::traits::neon_vector<T, 16>::type;
-
- const int32x4_t result_offset_s32 = vdupq_n_s32(_output_stage->gemmlowp_offset);
- const int32x4_t result_shift_s32 = vdupq_n_s32(-_output_stage->gemmlowp_shift);
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- const int clamp_min = (_is_bounded_relu) ? _output_stage->gemmlowp_min_bound : std::numeric_limits<T>::lowest();
- const int clamp_max = (_is_bounded_relu) ? _output_stage->gemmlowp_max_bound : std::numeric_limits<T>::max();
-
- VectorType min = wrapper::vdup_n(static_cast<T>(clamp_min), wrapper::traits::vector_128_tag{});
- VectorType max = wrapper::vdup_n(static_cast<T>(clamp_max), wrapper::traits::vector_128_tag{});
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(_input, win);
- Iterator out(_output, win);
-
- if(_bias != nullptr)
- {
- Window win_biases;
- win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Iterator bias(_bias, win_biases);
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- int32x4x4_t in_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- const int32x4x4_t bias_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 12)
- }
- };
-
- // Add the bias to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
-
- // Add the offset terms to GEMM's result and multiply by result_mult_int
- scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
-
- wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int bias_value = *(reinterpret_cast<const int *>(bias.ptr()) + x);
- int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
-
- // Quantize
- in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift;
-
- // Store the result
- *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
- }
- },
- in, bias, out);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- int32x4x4_t in_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- // Add the offset terms to GEMM's result and multiply by result_mult_int
- scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
-
- wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
-
- // Quantize
- in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift;
-
- // Store the result
- *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
- }
- },
- in, out);
- }
-}
-
-NEGEMMLowpQuantizeDownInt32ScaleKernel::NEGEMMLowpQuantizeDownInt32ScaleKernel()
- : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _output_stage(nullptr), _is_bounded_relu(false)
-{
-}
-
-void NEGEMMLowpQuantizeDownInt32ScaleKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo *output_stage)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, output_stage);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_stage->output_data_type));
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
- (bias != nullptr) ? bias->info() : nullptr,
- output->info(),
- output_stage));
-
- _input = input;
- _bias = bias;
- _output = output;
- _output_stage = output_stage;
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
-
- INEKernel::configure(win);
-
- // Check if we need to clamp the result using min and max
- _is_bounded_relu = ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound)
- && !(_output_stage->gemmlowp_min_bound == std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
- && _output_stage->gemmlowp_max_bound == std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))));
- if(_output_stage->output_data_type == DataType::QASYMM8)
- {
- _func = &NEGEMMLowpQuantizeDownInt32ScaleKernel::run<uint8_t>;
- }
- else if(_output_stage->output_data_type == DataType::QASYMM8_SIGNED)
- {
- _func = &NEGEMMLowpQuantizeDownInt32ScaleKernel::run<int8_t>;
- }
- else
- {
- ARM_COMPUTE_ERROR("Data type not supported");
- }
-}
-
-Status NEGEMMLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, output_stage));
-
- return Status{};
-}
-
-void NEGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- (this->*_func)(window);
-}
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
deleted file mode 100644
index 021ff8e2e0..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Add offset terms to final result
- * -# Multiply each entry of result by result_mult_int
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Shift the int32 accumulator by result_shift
- * -# Clamp the value between the specified min and max bounds
- * -# Saturate the resulting int32 values:
- *    - to the [0..255] range and cast to QASYMM8.
- *    - to the [-128..127] range and cast to QASYMM8_SIGNED.
- *
- */
-class NEGEMMLowpQuantizeDownInt32ScaleKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpQuantizeDownInt32ScaleKernel";
- }
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ScaleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ScaleKernel(const NEGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const NEGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ScaleKernel(NEGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ScaleKernel &operator=(NEGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpQuantizeDownInt32ScaleKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in]  output_stage GEMMLowp output stage metadata.
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo *output_stage);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ScaleKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] output_stage GEMMLowp output stage metadata.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run the NEGEMMLowpQuantizeDownInt32ScaleKernel
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T>
- void run(const Window &window);
-
- /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ScaleKernel functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ScaleKernel::*)(const Window &window);
-
- QuantizeDownFunctionPtr _func;
- const ITensor *_input;
- const ITensor *_bias;
- ITensor *_output;
- const GEMMLowpOutputStageInfo *_output_stage;
- bool _is_bounded_relu;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
deleted file mode 100644
index aa54b80436..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/NESymm.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(min > max);
-
- // Check biases if exist
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QSYMM16);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, input);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QSYMM16));
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps());
-
- // NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel doesn't need padding so update_window_and_padding() can be skipped
-
- return std::make_pair(Status{}, win);
-}
-} // namespace
-
-class Coordinates;
-
-template <bool is_bounded_relu>
-void NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run(const Window &window)
-{
- const int16x8_t min_s16 = vdupq_n_s16(static_cast<int16_t>(_min));
- const int16x8_t max_s16 = vdupq_n_s16(static_cast<int16_t>(_max));
-
- ARM_COMPUTE_UNUSED(min_s16);
- ARM_COMPUTE_UNUSED(max_s16);
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(_input, win_collapsed);
- Iterator out(_output, win_collapsed);
- if(_bias != nullptr)
- {
- Window win_biases;
- win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Iterator bias(_bias, win_biases);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 8 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- int32x4x2_t in_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)
- }
- };
-
- const int32x4x2_t bias_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 4)
- }
- };
-
- // Add the bias to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
-
- vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias.ptr()) + x);
- int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Add bias
- in_value += bias_value;
- // Finalize and store the result
- *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
- static_cast<int16_t>(_max));
- }
- },
- in, out, bias);
- }
- else
- {
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 8 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- int32x4x2_t in_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)
- }
- };
-
- vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
- // Finalize and store the result
- *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
- static_cast<int16_t>(_max));
- }
- },
- in, out);
- }
-}
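// [editorial sketch -- not part of the deleted file above]
// Scalar meaning of finalize_quantization_int16() as used in the loops above, assuming
// gemmlowp's reference fixed-point semantics: a saturating rounding doubling high multiply
// followed by a rounding arithmetic shift right. Helper names are illustrative.
#include <algorithm>
#include <cstdint>
#include <limits>

static int32_t srdh_mul(int32_t a, int32_t b)
{
    // Saturating rounding doubling high multiply: high 32 bits of 2*a*b, round-to-nearest
    if(a == std::numeric_limits<int32_t>::min() && b == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max(); // the one overflowing case
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = (ab >= 0) ? (1ll << 30) : (1 - (1ll << 30));
    return static_cast<int32_t>((ab + nudge) / (1ll << 31));
}

static int32_t rounding_shift_right(int32_t x, int exponent)
{
    // Rounding divide by a power of two, ties rounded away from zero
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

static int16_t finalize_int16_ref(int32_t v, int32_t multiplier, int shift, int16_t vmin, int16_t vmax, bool bounded_relu)
{
    v = rounding_shift_right(srdh_mul(v, multiplier), shift);
    if(bounded_relu)
    {
        v = std::max<int32_t>(vmin, std::min<int32_t>(vmax, v));
    }
    v = std::max<int32_t>(-32768, std::min<int32_t>(32767, v)); // saturate to int16
    return static_cast<int16_t>(v);
}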
-
-NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel()
- : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _min(0), _max(0)
-{
-}
-
-void NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
- int min, int max)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max));
-
- _input = input;
- _bias = bias;
- _output = output;
- _result_fixedpoint_multiplier = result_fixedpoint_multiplier;
- _result_shift = result_shift;
- _min = min;
- _max = max;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-
- // Check if we need to clamp the result using min and max
- const bool is_bounded_relu = !(min <= -32768 && max >= 32767);
- _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run<false>;
-}
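// [editorial sketch -- not part of the deleted file above]
// configure() above selects a template instantiation once, so the hot loop carries no
// per-element "should I clamp?" branch. The dispatch pattern in miniature:
#include <algorithm>
#include <cstdint>

class DispatchSketch
{
public:
    void configure(bool bounded_relu)
    {
        _fn = bounded_relu ? &DispatchSketch::body<true> : &DispatchSketch::body<false>;
    }
    void run(int32_t x)
    {
        (this->*_fn)(x);
    }

private:
    template <bool clamp>
    void body(int32_t x)
    {
        if(clamp) // compile-time constant per instantiation; the dead branch is eliminated
        {
            x = std::max(x, 0);
        }
        _last = x; // placeholder for the real per-element work
    }

    using Fn = void (DispatchSketch::*)(int32_t);
    Fn      _fn{ nullptr };
    int32_t _last{ 0 };
};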
-
-Status NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
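// [editorial sketch -- not part of the deleted file above]
// validate() runs the window configuration on cloned TensorInfo objects: the clones absorb
// any auto-initialization side effects, so a validation query can never mutate the caller's
// tensor metadata. The same trick in isolation (hypothetical wrapper name):
Status validate_without_side_effects(const ITensorInfo *input, const ITensorInfo *output)
{
    auto in_clone  = input->clone();  // deep copies of the metadata only, no buffers involved
    auto out_clone = output->clone();
    return validate_and_configure_window(in_clone.get(), out_clone.get()).first;
}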
-
-void NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- (this->*_func)(window);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
deleted file mode 100644
index b01b204a6f..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Perform a rounding division by a power-of-two using result_shift
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16.
- *
- */
-class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel";
- }
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QSYMM16
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied by each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Integer value used to perform a rounding division by a power-of-two after the fixed point multiplication
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
- *
- * @param[in] input Input tensor info. Data type supported: S32
- * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor info with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor info. Data type supported: QSYMM16
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run the NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool is_bounded_relu>
- void run(const Window &window);
-
- /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)(const Window &window);
-
- QuantizeDownFunctionPtr _func;
- const ITensor *_input;
- const ITensor *_bias;
- ITensor *_output;
- int _result_fixedpoint_multiplier;
- int _result_shift;
- int _min;
- int _max;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H */
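// [editorial sketch -- not part of the deleted file above]
// Hypothetical direct use of the kernel declared above. The Q0.31 multiplier / shift pair
// would normally be derived offline from the float requantization scale; the constants and
// the surrounding setup here are assumptions for illustration, not taken from this file.
#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

void quantize_down_int16_sketch(arm_compute::ITensor *acc_s32, arm_compute::ITensor *dst_qsymm16)
{
    using Kernel = arm_compute::NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel;

    const int multiplier = 1 << 30; // example: 0.5 in Q0.31 -- assumed precomputed
    const int shift      = 1;      // one extra rounding shift right

    ARM_COMPUTE_ERROR_THROW_ON(Kernel::validate(acc_s32->info(), nullptr, dst_qsymm16->info()));

    Kernel kernel;
    kernel.configure(acc_s32, nullptr /* no bias */, dst_qsymm16, multiplier, shift);
    arm_compute::NEScheduler::get().schedule(&kernel, arm_compute::Window::DimY);
}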
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
deleted file mode 100644
index 9ed85e62aa..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(min > max);
-
- // Check biases if exist
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, input);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8_SIGNED));
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps());
-
- // NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel doesn't need padding so update_window_and_padding() can be skipped
-
- return std::make_pair(Status{}, win);
-}
-} // namespace
-
-template <bool is_bounded_relu>
-void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window &window)
-{
- const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);
- const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(_min));
- const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(_max));
-
- ARM_COMPUTE_UNUSED(min_s8, max_s8);
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(_input, win_collapsed);
- Iterator out(_output, win_collapsed);
- if(_bias != nullptr)
- {
- Window win_biases;
- win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Iterator bias(_bias, win_biases);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- int32x4x4_t in_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- const int32x4x4_t bias_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 12)
- }
- };
-
- // Add the bias to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
-
- vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
- finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias.ptr()) + x);
- int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Add bias
- in_value += bias_value;
- // Finalize and store the result
- *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
- static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
- }
- },
- in, out, bias);
- }
- else
- {
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- int32x4x4_t in_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
- finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Finalize and store the result
- *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
- static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
- }
- },
- in, out);
- }
-}
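// [editorial sketch -- not part of the deleted file above]
// Per-element meaning of the QASYMM8_SIGNED path above, reusing the srdh_mul() and
// rounding_shift_right() helpers sketched after the int16 kernel. Compared to that kernel,
// an offset is added after the shift and saturation targets the int8 range. A non-negative
// result_shift is assumed here.
static int8_t finalize_int8_ref(int32_t v, int32_t multiplier, int shift, int32_t offset_after_shift,
                                int8_t vmin, int8_t vmax, bool bounded_relu)
{
    v = rounding_shift_right(srdh_mul(v, multiplier), shift) + offset_after_shift;
    if(bounded_relu)
    {
        v = std::max<int32_t>(vmin, std::min<int32_t>(vmax, v));
    }
    v = std::max<int32_t>(-128, std::min<int32_t>(127, v)); // saturate to int8
    return static_cast<int8_t>(v);
}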
-
-NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel()
- : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0), _min(0), _max(0)
-{
-}
-
-void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift, int min, int max)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max));
-
- _input = input;
- _bias = bias;
- _output = output;
- _result_fixedpoint_multiplier = result_fixedpoint_multiplier;
- _result_shift = result_shift;
- _result_offset_after_shift = result_offset_after_shift;
- _min = min;
- _max = max;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-
- // Check if we need to clamp the result using min and max
- const bool is_bounded_relu = !(min <= -128 && max >= 127);
- _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run<false>;
-}
-
-Status NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- (this->*_func)(window);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
deleted file mode 100644
index 9e7dc2f599..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Perform a rounding division by a power-of-two using result_shift
- * -# Add offset to each result
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED.
- *
- */
-class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel";
- }
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied by each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Integer value used to perform a rounding division by a power-of-two after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run the NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool is_bounded_relu>
- void run(const Window &window);
-
- /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)(const Window &window);
-
- QuantizeDownFunctionPtr _func;
- const ITensor *_input;
- const ITensor *_bias;
- ITensor *_output;
- int _result_fixedpoint_multiplier;
- int _result_shift;
- int _result_offset_after_shift;
- int _min;
- int _max;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
deleted file mode 100644
index 83ca6f944d..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(min > max);
-
- // Check biases if exist
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, input);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps());
-
- // NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel doesn't need padding so update_window_and_padding() can be skipped
-
- return std::make_pair(Status{}, win);
-}
-} // namespace
-
-class Coordinates;
-
-template <bool is_bounded_relu>
-void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window &window)
-{
- const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);
- const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(_min));
- const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(_max));
-
- ARM_COMPUTE_UNUSED(min_u8);
- ARM_COMPUTE_UNUSED(max_u8);
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(_input, win_collapsed);
- Iterator out(_output, win_collapsed);
- if(_bias != nullptr)
- {
- Window win_biases;
- win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Iterator bias(_bias, win_biases);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- int32x4x4_t in_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- const int32x4x4_t bias_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 12)
- }
- };
-
- // Add the bias to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
-
- vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias.ptr()) + x);
- int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Add bias
- in_value += bias_value;
- // Finalize and store the result
- *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu);
- }
- },
- in, out, bias);
- }
- else
- {
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- int32x4x4_t in_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Finalize and store the result
- *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu);
- }
- },
- in, out);
- }
-}
-
-NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel()
- : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0), _min(0), _max(0)
-{
-}
-
-void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift, int min, int max)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max));
-
- _input = input;
- _bias = bias;
- _output = output;
- _result_fixedpoint_multiplier = result_fixedpoint_multiplier;
- _result_shift = result_shift;
- _result_offset_after_shift = result_offset_after_shift;
- _min = min;
- _max = max;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-
- // Check if we need to clamp the result using min and max
- const bool is_bounded_relu = !(min <= 0 && max >= 255);
- _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run<false>;
-}
-
-Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- (this->*_func)(window);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
deleted file mode 100644
index def0573967..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Perform a rounding division by a power-of-two using result_shift
- * -# Add offset to each result
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- *
- */
-class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel";
- }
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied by each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Integer value used to perform a rounding division by a power-of-two after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run the NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool is_bounded_relu>
- void run(const Window &window);
-
- /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)(const Window &window);
-
- QuantizeDownFunctionPtr _func;
- const ITensor *_input;
- const ITensor *_bias;
- ITensor *_output;
- int _result_fixedpoint_multiplier;
- int _result_shift;
- int _result_offset_after_shift;
- int _min;
- int _max;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
deleted file mode 100644
index dfbfbd6fab..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ /dev/null
@@ -1,382 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
-
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
- }
- return Status{};
-}
-Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
-
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
- }
- return Status{};
-}
-} // namespace
-
-INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
- : _input(), _output(), _k(0), _scalar(0), _mul_by_scalar(false)
-{
-}
-
-void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
- ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
- _input = mtx_a;
- _output = vector_sum_row;
- _k = info.k;
- _scalar = info.scalar;
- _mul_by_scalar = info.mul_by_scalar;
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(1)), 1, DataType::S32);
-
- Window win = calculate_max_window(*_output->info(), Steps(1));
-
- INEKernel::configure(win);
-}
-
-Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
- return Status{};
-}
-
-template <typename T>
-void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window)
-{
- // Intermediate and final accumulator types
- using TIAcc = wrapper::traits::promote_t<T>;
- using TAcc = wrapper::traits::promote_t<TIAcc>;
-
- Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
-
- Window win_input(collapsed_window);
- win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Iterator in(_input, win_input);
- Iterator out(_output, collapsed_window);
-
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
- TAcc sum_row = 0;
-
- const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
-
-#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
-#endif /* __arm__ */
-
- int i = 0;
- // This for loop performs 16 accumulations
- for(; i <= (_k - 16); i += 16)
- {
- const auto a0_d8 = wrapper::vloadq(matrix_a + i);
-
- // Partial accumulations in U16
- const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
-
- // Accumulate to U32
- vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
- }
-
- // This for loop performs the leftover accumulations
- for(; i < _k; ++i)
- {
- sum_row += static_cast<TAcc>(matrix_a[i]);
- }
-
-#if defined(__aarch64__)
- // Reduction operation available on 64 bit architectures only
- sum_row += wrapper::vaddv(vsum_row);
-#else // __aarch64__
- auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
- tmp = wrapper::vpadd(tmp, tmp);
-
- sum_row += wrapper::vgetlane(tmp, 0);
-#endif // __aarch64__
-
- // Multiply by scalar if necessary
- if(_mul_by_scalar)
- {
- sum_row *= _scalar;
- }
-
- *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
- },
- in, out);
-}
-
-void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- switch(_input->info()->data_type())
- {
- case DataType::QASYMM8:
- run_internal<uint8_t>(window);
- break;
- case DataType::QASYMM8_SIGNED:
- case DataType::QSYMM8:
- case DataType::QSYMM8_PER_CHANNEL:
- run_internal<int8_t>(window);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type");
- }
-}
-
-void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
- ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
-
- _input = mtx_b;
- _output = vector_sum_col;
- _k = info.k;
- _scalar = info.scalar;
- _mul_by_scalar = info.mul_by_scalar;
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(0)), 1, DataType::S32);
-
- // Configure kernel window
- Window win = calculate_max_window_horizontal(*_output->info(), Steps(num_elems_processed_per_iteration));
- INEKernel::configure(win);
-}
-
-Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
-
- return Status{};
-}
-
-template <typename T>
-void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info)
-{
- // Intermediate and final accumulator types
- using TIAcc = wrapper::traits::promote_t<T>;
- using TAcc = wrapper::traits::promote_t<TIAcc>;
-
- Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
- const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});
-
- const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));
- const auto in_b_stride = static_cast<int>(_input->info()->strides_in_bytes()[1]);
-
- // The implementation computes 16 elements per iteration
- const int window_start_x = 16 * info.thread_id;
- const int window_step_x = 16 * info.num_threads;
- // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
- const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
-
- Window win_out(collapsed_window);
- win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
-
- Window win_in(win_out);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Iterator inb(_input, win_in);
- Iterator out(_output, win_out);
-
- execute_window_loop(win_out, [&](const Coordinates & id)
- {
- if(id.x() > width_matrix_b)
- {
- return;
- }
-
-        // Note: the 8-bit input values are accumulated in the promoted (wider) integer type, first 16-bit then 32-bit
- typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
- {
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
- };
-
- const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);
-
-#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
-#endif /* __arm__ */
-
- int i = 0;
- // This for loop performs 4 accumulations
- for(; i <= (_k - 4); i += 4)
- {
- const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
- const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
- const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
- const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
-
-#if __arm__
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
-#endif /* __arm__ */
-
- // Partial accumulation in 16bit
- typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
- {
- wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
- };
-
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
-
- // Accumulate to 32bit
- sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
- sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
- sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
- sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
-
- matrix_b += 4 * in_b_stride;
- }
-
-        // This for loop performs the leftover accumulations
- for(; i < _k; ++i)
- {
- const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
-
- // Convert S8 to S16
- const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
- {
- wrapper::vmovl(wrapper::vgetlow(b0_b8)),
- wrapper::vmovl(wrapper::vgethigh(b0_b8))
- };
-
- // Accumulate to 32bit
- sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
- sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
- sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
- sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
-
- matrix_b += in_b_stride;
- }
-
- // Multiply by scalar if necessary
- if(_mul_by_scalar)
- {
- sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
- sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
- sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
- sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
- }
-
- auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
- if(id.x() + 16 < width_matrix_b)
- {
- wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
- wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
- wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
- wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
- }
- else
- {
- auto left_over = width_matrix_b - id.x();
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
- {
- *(vector_sum_col + k * 4 + j) = sum_col[k][j];
- }
- }
- }
- },
- inb, out);
-}
-
-void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- switch(_input->info()->data_type())
- {
- case DataType::QASYMM8:
- run_internal<uint8_t>(window, info);
- break;
- case DataType::QASYMM8_SIGNED:
- case DataType::QSYMM8:
- case DataType::QSYMM8_PER_CHANNEL:
- run_internal<int8_t>(window, info);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type");
- }
-}
-} // namespace arm_compute
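
The two kernels deleted above compute plain integer reductions: NEGEMMLowpMatrixAReductionKernel produces one S32 sum per row of matrix A, and NEGEMMLowpMatrixBReductionKernel one S32 sum per column of matrix B, each optionally multiplied by a scalar. A minimal scalar reference of the same arithmetic, assuming row-major storage; the function names are illustrative only:

#include <cstdint>
#include <vector>

// One S32 sum per row of an M x K matrix (what the matrix A reduction produces).
std::vector<int32_t> reduce_rows(const std::vector<int8_t> &a, int m, int k, int32_t scalar, bool mul_by_scalar)
{
    std::vector<int32_t> sums(m, 0);
    for(int r = 0; r < m; ++r)
    {
        for(int c = 0; c < k; ++c)
        {
            sums[r] += a[r * k + c];
        }
        if(mul_by_scalar)
        {
            sums[r] *= scalar;
        }
    }
    return sums;
}

// One S32 sum per column of a K x N matrix (what the matrix B reduction produces).
std::vector<int32_t> reduce_columns(const std::vector<int8_t> &b, int k, int n, int32_t scalar, bool mul_by_scalar)
{
    std::vector<int32_t> sums(n, 0);
    for(int r = 0; r < k; ++r)
    {
        for(int c = 0; c < n; ++c)
        {
            sums[c] += b[r * n + c];
        }
    }
    if(mul_by_scalar)
    {
        for(int32_t &s : sums)
        {
            s *= scalar;
        }
    }
    return sums;
}

The vectorized code above implements the same sums, first accumulating 8-bit lanes into 16-bit partials and then widening to 32 bits so the intermediate values cannot overflow.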
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h
deleted file mode 100644
index 9be618d656..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPREDUCTIONKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPREDUCTIONKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-struct GEMMLowpReductionKernelInfo;
-
-/** Common interface for all reduction kernels */
-class INEGEMMLowpReductionKernel : public INEKernel
-{
-public:
- /** Constructor */
- INEGEMMLowpReductionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- INEGEMMLowpReductionKernel(const INEGEMMLowpReductionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- INEGEMMLowpReductionKernel &operator=(const INEGEMMLowpReductionKernel &) = delete;
- /** Allow instances of this class to be moved */
- INEGEMMLowpReductionKernel(INEGEMMLowpReductionKernel &&) = default;
- /** Allow instances of this class to be moved */
- INEGEMMLowpReductionKernel &operator=(INEGEMMLowpReductionKernel &&) = default;
- /** Default destructor */
- virtual ~INEGEMMLowpReductionKernel() = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- *                          - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- virtual void configure(const ITensor *input, ITensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
-
-protected:
- const ITensor *_input;
- ITensor *_output;
- int32_t _k;
- int32_t _scalar;
- bool _mul_by_scalar;
-};
-
-/** Kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
- *
- * @note This stage is needed to handle the offset of matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class NEGEMMLowpMatrixAReductionKernel : public INEGEMMLowpReductionKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpMatrixAReductionKernel";
- }
- /** Default constructor */
- NEGEMMLowpMatrixAReductionKernel() = default;
- /** Prevent instances of this class from being copied */
- NEGEMMLowpMatrixAReductionKernel(const NEGEMMLowpMatrixAReductionKernel &) = delete;
- /** Prevent instances of this class from being copied */
- NEGEMMLowpMatrixAReductionKernel &operator=(const NEGEMMLowpMatrixAReductionKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixAReductionKernel(NEGEMMLowpMatrixAReductionKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixAReductionKernel &operator=(NEGEMMLowpMatrixAReductionKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpMatrixAReductionKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k (num_mtx_a_cols) Number of matrix A columns
- * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4
- *                        - scalar Scalar value to multiply each reduced row by.
- *                        - mul_by_scalar True if each reduced row must be multiplied by a scalar value.
- */
- void configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel
- *
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k (num_mtx_a_cols) Number of matrix A columns
- * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4
- *                        - scalar Scalar value to multiply each reduced row by.
- *                        - mul_by_scalar True if each reduced row must be multiplied by a scalar value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Execution of the reduction kernel specialized on the input type
- *
- * @param[in] window Execution window
- */
- template <typename T>
- void run_internal(const Window &window);
-};
-
-/** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
- *
- * @note This stage is needed to handle the offset of matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class NEGEMMLowpMatrixBReductionKernel : public INEGEMMLowpReductionKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpMatrixBReductionKernel";
- }
- /** Default constructor */
- NEGEMMLowpMatrixBReductionKernel() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpMatrixBReductionKernel(const NEGEMMLowpMatrixBReductionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpMatrixBReductionKernel &operator=(const NEGEMMLowpMatrixBReductionKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixBReductionKernel(NEGEMMLowpMatrixBReductionKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixBReductionKernel &operator=(NEGEMMLowpMatrixBReductionKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpMatrixBReductionKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in]  mtx_b          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k (num_mtx_b_rows) Number of matrix B rows.
- * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW.
- *                        - scalar Scalar value to multiply each reduced column by.
- *                        - mul_by_scalar True if each reduced column must be multiplied by a scalar value.
- */
- void configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel
- *
- * @param[in]  mtx_b          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k (num_mtx_b_rows) Number of matrix B rows.
- * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW.
- *                        - scalar Scalar value to multiply each reduced column by.
- *                        - mul_by_scalar True if each reduced column must be multiplied by a scalar value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Execution of the reduction kernel specialized on the input type
- *
- * @param[in] window Execution window
- * @param[in] info Thread-related information
- */
- template <typename T>
- void run_internal(const Window &window, const ThreadInfo &info);
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWPREDUCTIONKERNEL_H */
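
The @note in the header above points at gemmlowp's offset handling, which is why these reduction kernels exist. With zero points a_offset and b_offset, each S32 element of the quantized matrix product decomposes as

    sum_k (A[r][k] - a_offset) * (B[k][c] - b_offset)
        = sum_k A[r][k] * B[k][c]
          - b_offset * row_sum_A[r]
          - a_offset * col_sum_B[c]
          + K * a_offset * b_offset

so the raw 8-bit matrix product only needs to be corrected with the per-row sums of A (vector_sum_row), the per-column sums of B (vector_sum_col) and a constant term; those sums are exactly what the two kernels in this file computed for the offset-contribution stage that follows.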
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
deleted file mode 100644
index 6a2802a991..0000000000
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float beta)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_UNUSED(beta);
-
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-
-void matrix_addition_f32(const ITensor *input, ITensor *output, const Window &window, float beta)
-{
- const float32x4_t beta_f32 = vdupq_n_f32(beta);
-
- Iterator in(input, window);
- Iterator out(output, window);
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
- const auto out_ptr = reinterpret_cast<float *>(out.ptr());
-
- float32x4x4_t alpha_ab = vld4q_f32(out_ptr);
- const float32x4x4_t c = vld4q_f32(in_ptr);
-
- // Multiply matrix C by its weight and accumulate
- alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
- alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
- alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
- alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
-
- vst4q_f32(out_ptr, alpha_ab);
- },
- in, out);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &window, float beta)
-{
- const float16x8_t beta_f16 = vdupq_n_f16(beta);
-
- Iterator in(input, window);
- Iterator out(output, window);
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
-
- float16x8x2_t alpha_ab = vld2q_f16(out_ptr);
- const float16x8x2_t c = vld2q_f16(in_ptr);
- // Multiply matrix C by its weight and accumulate
- alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
- alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
-
- vst2q_f16(out_ptr + 0, alpha_ab);
- },
- in, out);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-} // namespace
-
-NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel()
- : INESimpleKernel(), _func(nullptr), _beta(0.0f)
-{
-}
-
-void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), beta));
-
- switch(input->info()->data_type())
- {
- case DataType::F32:
- _func = &matrix_addition_f32;
- break;
- case DataType::F16:
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- _func = &matrix_addition_f16;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
-
- // Configure kernel window
- INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
-
- _beta = beta;
-}
-
-Status NEGEMMMatrixAdditionKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, beta));
- ARM_COMPUTE_RETURN_ON_ERROR(INESimpleKernel::validate(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration));
- return Status{};
-}
-
-void NEGEMMMatrixAdditionKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-
- if(_beta != 0.0f)
- {
- (*_func)(_input, _output, window, _beta);
- }
-}
-} // namespace arm_compute
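
In scalar terms the kernel deleted above performs an in-place accumulation, OUT = OUT + beta * IN, over two equally shaped matrices, and run() skips the work entirely when beta == 0. A minimal reference sketch of the same arithmetic over a flat buffer; the name is illustrative, not library API:

#include <cstddef>

// Accumulate the beta-weighted matrix C (in) into the GEMM output (out), element by element.
void matrix_addition_ref(const float *in, float *out, std::size_t count, float beta)
{
    if(beta == 0.0f)
    {
        return; // mirrors run(): nothing to do when matrix C has zero weight
    }
    for(std::size_t i = 0; i < count; ++i)
    {
        out[i] += beta * in[i];
    }
}

The F32 path above vectorizes this with vld4q_f32/vmlaq_f32/vst4q_f32 over 16 floats per iteration, and the F16 path with vld2q_f16/vmulq_f16/vaddq_f16 over 16 half-precision values.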
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
deleted file mode 100644
index c896cabc6a..0000000000
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H
-#define ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H
-
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
- *
- * @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size
- *
- * @note This stage is used to finalize the GEMM result and it is computed if and only if beta != 0.0. In case this kernel is used for finalizing GEMM result, we have:
- * - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref NEGEMMMatrixMultiplyKernel
- * - MTX_1 = C
- */
-class NEGEMMMatrixAdditionKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMMatrixAdditionKernel";
- }
- /** Constructor */
- NEGEMMMatrixAdditionKernel();
- /** Prevent instances of this class from being copied */
- NEGEMMMatrixAdditionKernel(const NEGEMMMatrixAdditionKernel &) = delete;
- /** Prevent instances of this class from being copied */
- NEGEMMMatrixAdditionKernel &operator=(const NEGEMMMatrixAdditionKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMMatrixAdditionKernel(NEGEMMMatrixAdditionKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMMatrixAdditionKernel &operator=(NEGEMMMatrixAdditionKernel &&) = default;
- /** Default destructor */
- ~NEGEMMMatrixAdditionKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @note The input and output tensor must have the same dimensions
- *
- * @param[in] input Input tensor (Matrix C). Data types supported: F16/F32
- * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input.
- * @param[in] beta Weight of matrix C
- */
- void configure(const ITensor *input, ITensor *output, float beta);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixAdditionKernel.
- *
- * @note The input and output tensor must have the same dimensions
- *
- * @param[in] input Input tensor info (Matrix C). Data types supported: F16/F32
- * @param[in] output Output tensor info. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input.
- * @param[in] beta Weight of matrix C
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the matrix addition functions
- *
- * @param[in] input An input tensor. Data types supported: F16/F32
- * @param[out] output The output tensor. Data type supported: same as @p input
- * @param[in] window Region on which to execute the kernel.
- * @param[in] beta Weight of matrix C
- */
- using MatrixAdditionFunction = void(const ITensor *input, ITensor *output, const Window &window, float beta);
- /** Matrix addition function to use for the particular tensor types passed to configure() */
- MatrixAdditionFunction *_func;
- float _beta;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H */
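
The header above stores the selected routine in a MatrixAdditionFunction pointer: configure() picks a free function per data type, and run() calls it through that pointer only when beta is non-zero. A self-contained sketch of this dispatch pattern, simplified to raw buffers instead of ITensor/Window; all names are illustrative:

#include <cassert>
#include <cstddef>

namespace sketch
{
// Common signature for all addition routines, analogous to MatrixAdditionFunction.
using AdditionFn = void(const float *in, float *out, std::size_t count, float beta);

void addition_f32(const float *in, float *out, std::size_t count, float beta)
{
    for(std::size_t i = 0; i < count; ++i)
    {
        out[i] += beta * in[i];
    }
}

struct MatrixAdditionKernel
{
    AdditionFn *func = nullptr;
    float       beta = 0.0f;

    void configure(float beta_to_use)
    {
        // A real configure() switches on the tensor data type (F32 vs F16) to pick the routine
        func = &addition_f32;
        beta = beta_to_use;
    }

    void run(const float *in, float *out, std::size_t count) const
    {
        assert(func != nullptr); // the kernel must be configured before it is run
        if(beta != 0.0f)
        {
            func(in, out, count, beta);
        }
    }
};
} // namespace sketch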
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
deleted file mode 100644
index b4a3bb5e77..0000000000
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ /dev/null
@@ -1,1170 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void vector_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
-{
- const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
- const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / input1->info()->element_size());
- const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
-
- // The implementation computes 32 elements per iteration
- const int window_start_x = 32 * info.thread_id;
- const int window_step_x = 32 * info.num_threads;
- const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
- ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x");
-
- Window win_out(window);
- win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Window win_a(window);
- win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Window win_b;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(input1->info()->num_dimensions() >= 3)
- {
- win_b = window;
- }
- win_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Iterator ina(input0, win_a);
- Iterator inb(input1, win_b);
- Iterator out(output, win_out);
-
- const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));
-
- const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
-
- execute_window_loop(win_out, [&](const Coordinates &)
- {
- int x = window_start_x;
-        // The loop deliberately stops at x < (window_end_x - window_step_x) rather than <=:
-        // window_end_x, computed above, is rounded up, so a final full-step iteration could write out of bounds.
- for(; x < (window_end_x - window_step_x); x += window_step_x)
- {
- if(x > width_matrix_b)
- {
- return;
- }
-
- auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
-
- float16x8_t acc0 = vdupq_n_f16(0.f);
- float16x8_t acc1 = vdupq_n_f16(0.f);
- float16x8_t acc2 = vdupq_n_f16(0.f);
- float16x8_t acc3 = vdupq_n_f16(0.f);
-
- auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
- const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 4);)
- {
- const float16x4_t a0l = vld1_f16(vec_a);
-
- float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
- float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
- float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
- float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
- float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
- float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
- float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
- float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
-
- acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
- acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
- acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
- acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
- acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
- acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
- acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
- acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));
-
- matrix_b += 2 * in_b_stride;
-
- b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
- b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
- b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
- b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
- b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
- b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
- b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
- b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
-
- acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
- acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
- acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
- acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
- acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
- acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
- acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
- acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));
-
- vec_a += 4;
- matrix_b += 2 * in_b_stride;
- }
-
- for(; vec_a < vec_a_end_addr; ++vec_a)
- {
- const float16_t a0 = *vec_a;
- const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
- const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
- const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
- const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
-
- acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
- acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
- acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
- acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));
-
- matrix_b += in_b_stride;
- }
-
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc0 = vmulq_f16(acc0, alpha_f16);
- acc1 = vmulq_f16(acc1, alpha_f16);
- acc2 = vmulq_f16(acc2, alpha_f16);
- acc3 = vmulq_f16(acc3, alpha_f16);
- }
-
- auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
-
- vst1q_f16(vec_out + 0, acc0);
- vst1q_f16(vec_out + 8, acc1);
- vst1q_f16(vec_out + 16, acc2);
- vst1q_f16(vec_out + 24, acc3);
- }
-
- for(; x < window_end_x; ++x)
- {
- if(x > width_matrix_b)
- {
- return;
- }
-
- auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
-
- float16x4_t vacc = vdup_n_f16(0.f);
-
- auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
- const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
- {
- const float16x4_t a0l = vld1_f16(vec_a);
-
- const float16x4_t b_col =
- {
- *(matrix_b + 0 * in_b_stride),
- *(matrix_b + 1 * in_b_stride),
- *(matrix_b + 2 * in_b_stride),
- *(matrix_b + 3 * in_b_stride),
- };
-
- vacc = vadd_f16(vacc, vmul_f16(a0l, b_col));
-
- matrix_b += 4 * in_b_stride;
- }
-
- float16_t acc = vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3);
-
- for(; vec_a < vec_a_end_addr; ++vec_a)
- {
- const float16_t a0 = *vec_a;
- const float16_t b00 = *matrix_b;
-
- acc += b00 * a0;
-
- matrix_b += in_b_stride;
- }
-
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc *= static_cast<float16_t>(alpha);
- }
-
- auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
-
- *(vec_out) = acc;
- }
- },
- ina, inb, out);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
-{
- const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
- const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
- const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
-
- // The implementation computes 16 elements per iteration
- const int window_start_x = 16 * info.thread_id;
- const int window_step_x = 16 * info.num_threads;
- // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
- const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
-
- Window win_out(window);
- win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Window win_a(window);
- win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Window win_b;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(input1->info()->num_dimensions() >= 3)
- {
- win_b = window;
- }
- win_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Iterator ina(input0, win_a);
- Iterator inb(input1, win_b);
- Iterator out(output, win_out);
-
- const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));
-
- const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
-
- execute_window_loop(win_out, [&](const Coordinates &)
- {
- int x = window_start_x;
-        // The loop deliberately stops at x < (window_end_x - window_step_x) rather than <=:
-        // window_end_x, computed above, is rounded up, so a final full-step iteration could write out of bounds.
- for(; x < (window_end_x - window_step_x); x += window_step_x)
- {
- if(x > width_matrix_b)
- {
- return;
- }
-
- float32x4_t acc0 = vdupq_n_f32(0.f);
- float32x4_t acc1 = vdupq_n_f32(0.f);
- float32x4_t acc2 = vdupq_n_f32(0.f);
- float32x4_t acc3 = vdupq_n_f32(0.f);
-
- auto vec_a = reinterpret_cast<const float *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
-
-#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
-#endif /* __arm__ */
-
- auto vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 4);)
- {
- float32x2_t a0l = vld1_f32(vec_a);
-
- float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
- float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
- float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
- float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
-
- float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
- float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
- float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
- float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
-
-#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
-#endif /* __arm__ */
-
- acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
- acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
- acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
- acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
-
- acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
- acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
- acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
- acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
-
- vec_a += 2;
- matrix_b += 2 * in_b_stride;
-
- a0l = vld1_f32(vec_a);
-
- b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
- b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
- b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
- b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
-
- b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
- b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
- b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
- b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
-
- acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
- acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
- acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
- acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
-
- acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
- acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
- acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
- acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
-
- vec_a += 2;
- matrix_b += 2 * in_b_stride;
- }
-
- for(; vec_a < vec_a_end_addr; ++vec_a)
- {
- const float a0 = *vec_a;
-
- const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
- const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
- const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
- const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
-
- acc0 = vmlaq_n_f32(acc0, b00, a0);
- acc1 = vmlaq_n_f32(acc1, b01, a0);
- acc2 = vmlaq_n_f32(acc2, b02, a0);
- acc3 = vmlaq_n_f32(acc3, b03, a0);
-
- matrix_b += in_b_stride;
- }
-
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc0 = vmulq_f32(acc0, alpha_f32);
- acc1 = vmulq_f32(acc1, alpha_f32);
- acc2 = vmulq_f32(acc2, alpha_f32);
- acc3 = vmulq_f32(acc3, alpha_f32);
- }
-
- const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
-
- vst1q_f32(vec_out + 0, acc0);
- vst1q_f32(vec_out + 4, acc1);
- vst1q_f32(vec_out + 8, acc2);
- vst1q_f32(vec_out + 12, acc3);
- }
-
- // Left-over loop
- for(; x < window_end_x; ++x)
- {
- if(x > width_matrix_b)
- {
- return;
- }
-
- float32x4_t vacc = vdupq_n_f32(0.f);
-
- auto vec_a = reinterpret_cast<const float *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
-
-#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
-#endif /* __arm__ */
-
- auto vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
- {
- const float32x4_t a0l = vld1q_f32(vec_a);
-
- const float32x4_t b_col =
- {
- *(matrix_b + 0 * in_b_stride),
- *(matrix_b + 1 * in_b_stride),
- *(matrix_b + 2 * in_b_stride),
- *(matrix_b + 3 * in_b_stride),
- };
-
-#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
-#endif /* __arm__ */
-
- vacc = vmlaq_f32(vacc, b_col, a0l);
-
- matrix_b += 4 * in_b_stride;
- }
-
- float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + vgetq_lane_f32(vacc, 3);
-
- for(; vec_a < vec_a_end_addr; ++vec_a)
- {
- const float a0 = *vec_a;
-
- const float b00 = *matrix_b;
-
- acc += b00 * a0;
-
- matrix_b += in_b_stride;
- }
-
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc *= alpha;
- }
-
- const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
-
- *vec_out = acc;
- }
- },
- ina, inb, out);
-}
-
-void matrix_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
-{
- const int out_width = static_cast<int>(output->info()->dimension(0));
- const int out_height = static_cast<int>(output->info()->dimension(1));
- const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
- const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
- const size_t out_stride2 = out_stride1 * 2;
- const size_t out_stride3 = out_stride1 * 3;
- const int num_elems_matrix_b_x = input1->info()->dimension(0);
-
-    // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the output matrix
- Window win_a(window);
- win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
-
- Window win_b;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(input1->info()->num_dimensions() >= 3)
- {
- win_b = window;
- }
-    // Set step_x and step_y for matrix B. Scale the X range by a factor of 4 as the transposed input matrix B has 4 times fewer columns than the output matrix
-    // The step along the x direction is twice in_b_stride because each iteration computes 2 blocks of size 4x4
- win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 2 * in_b_stride));
- win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Iterator ina(input0, win_a);
- Iterator inb(input1, win_b);
- Iterator out(output, window);
-
- const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));
-
- const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
-
-    // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW respectively
-    // Reshaping the matrices gives a cache-friendly implementation and avoids the data re-arrangements needed to compute 16x4 elements per iteration
-    // All the values needed to compute a single 4x4 block are read from consecutive memory positions
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
- auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
- auto mtx_b1 = mtx_b0 + in_b_stride;
-
- float32x4_t acc00 = vdupq_n_f32(0.f);
- float32x4_t acc10 = vdupq_n_f32(0.f);
- float32x4_t acc20 = vdupq_n_f32(0.f);
- float32x4_t acc30 = vdupq_n_f32(0.f);
-
- float32x4_t acc01 = vdupq_n_f32(0.f);
- float32x4_t acc11 = vdupq_n_f32(0.f);
- float32x4_t acc21 = vdupq_n_f32(0.f);
- float32x4_t acc31 = vdupq_n_f32(0.f);
-
-#if __arm__
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif /* __arm__ */
-
- auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
- for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
- {
- float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
- float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
- float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
- float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
-
- float32x4_t b00 = vld1q_f32(mtx_b0);
- float32x4_t b10 = vld1q_f32(mtx_b1);
- float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
- float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
-
-#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif /* __arm__ */
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
- float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
- float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
- float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
-
- a0 = vld1q_dup_f32(mtx_a0 + 0);
- a1 = vld1q_dup_f32(mtx_a0 + 1);
- a2 = vld1q_dup_f32(mtx_a0 + 2);
- a3 = vld1q_dup_f32(mtx_a0 + 3);
-
- b00 = vld1q_f32(mtx_b0);
- b10 = vld1q_f32(mtx_b1);
- b01 = vld1q_f32(mtx_b0 + 4);
- b11 = vld1q_f32(mtx_b1 + 4);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- a4 = vld1q_dup_f32(mtx_a0 + 4);
- a5 = vld1q_dup_f32(mtx_a0 + 5);
- a6 = vld1q_dup_f32(mtx_a0 + 6);
- a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
-
- a0 = vld1q_dup_f32(mtx_a0 + 0);
- a1 = vld1q_dup_f32(mtx_a0 + 1);
- a2 = vld1q_dup_f32(mtx_a0 + 2);
- a3 = vld1q_dup_f32(mtx_a0 + 3);
- b00 = vld1q_f32(mtx_b0);
- b10 = vld1q_f32(mtx_b1);
- b01 = vld1q_f32(mtx_b0 + 4);
- b11 = vld1q_f32(mtx_b1 + 4);
-
-#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif /* __arm__ */
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- a4 = vld1q_dup_f32(mtx_a0 + 4);
- a5 = vld1q_dup_f32(mtx_a0 + 5);
- a6 = vld1q_dup_f32(mtx_a0 + 6);
- a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
-
- a0 = vld1q_dup_f32(mtx_a0 + 0);
- a1 = vld1q_dup_f32(mtx_a0 + 1);
- a2 = vld1q_dup_f32(mtx_a0 + 2);
- a3 = vld1q_dup_f32(mtx_a0 + 3);
- b00 = vld1q_f32(mtx_b0);
- b10 = vld1q_f32(mtx_b1);
- b01 = vld1q_f32(mtx_b0 + 4);
- b11 = vld1q_f32(mtx_b1 + 4);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- a4 = vld1q_dup_f32(mtx_a0 + 4);
- a5 = vld1q_dup_f32(mtx_a0 + 5);
- a6 = vld1q_dup_f32(mtx_a0 + 6);
- a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
- }
-
- for(; mtx_b0 < mtx_b0_end_addr;)
- {
- float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
- float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
- float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
- float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
- float32x4_t b00 = vld1q_f32(mtx_b0);
- float32x4_t b10 = vld1q_f32(mtx_b1);
-
-#if __arm__
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif /* __arm__ */
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- mtx_a0 += 4;
- mtx_b0 += 4;
- mtx_b1 += 4;
- }
-
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc00 = vmulq_f32(acc00, alpha_f32);
- acc10 = vmulq_f32(acc10, alpha_f32);
- acc20 = vmulq_f32(acc20, alpha_f32);
- acc30 = vmulq_f32(acc30, alpha_f32);
- acc01 = vmulq_f32(acc01, alpha_f32);
- acc11 = vmulq_f32(acc11, alpha_f32);
- acc21 = vmulq_f32(acc21, alpha_f32);
- acc31 = vmulq_f32(acc31, alpha_f32);
- }
-
- const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
- const auto mtx_out1 = mtx_out0 + 4;
-
- if(id.x() < (out_width - 8))
- {
- vst1q_f32(mtx_out0, acc00);
- vst1q_f32(mtx_out1, acc01);
- if(id.y() + 1 < out_height)
- {
- vst1q_f32(mtx_out0 + out_stride1, acc10);
- vst1q_f32(mtx_out1 + out_stride1, acc11);
- if(id.y() + 2 < out_height)
- {
- vst1q_f32(mtx_out0 + out_stride2, acc20);
- vst1q_f32(mtx_out1 + out_stride2, acc21);
- if(id.y() + 3 < out_height)
- {
- vst1q_f32(mtx_out0 + out_stride3, acc30);
- vst1q_f32(mtx_out1 + out_stride3, acc31);
- }
- }
- }
- }
- else if(id.x() < (out_width - 4))
- {
- vst1q_f32(mtx_out0, acc00);
- if(id.y() + 1 < out_height)
- {
- vst1q_f32(mtx_out0 + out_stride1, acc10);
- if(id.y() + 2 < out_height)
- {
- vst1q_f32(mtx_out0 + out_stride2, acc20);
- if(id.y() + 3 < out_height)
- {
- vst1q_f32(mtx_out0 + out_stride3, acc30);
- }
- }
- }
- // Left-over columns
- const int columns_left = out_width - id.x() - 4;
- for(auto x = 0; x < columns_left; ++x)
- {
- *(mtx_out1 + x) = acc01[x];
- if(id.y() + 1 < out_height)
- {
- *(mtx_out1 + x + out_stride1) = acc11[x];
- if(id.y() + 2 < out_height)
- {
- *(mtx_out1 + x + out_stride2) = acc21[x];
- if(id.y() + 3 < out_height)
- {
- *(mtx_out1 + x + out_stride3) = acc31[x];
- }
- }
- }
- }
- }
- else
- {
- // Left-over columns
- const int columns_left = out_width - id.x();
- for(int x = 0; x < columns_left; ++x)
- {
- *(mtx_out0 + x) = acc00[x];
- if(id.y() + 1 < out_height)
- {
- *(mtx_out0 + x + out_stride1) = acc10[x];
- if(id.y() + 2 < out_height)
- {
- *(mtx_out0 + x + out_stride2) = acc20[x];
- if(id.y() + 3 < out_height)
- {
- *(mtx_out0 + x + out_stride3) = acc30[x];
- }
- }
- }
- }
- }
- },
- ina, inb, out);
-}
-
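For orientation, the heavily unrolled NEON loop above reduces to a simple scalar model: each step broadcasts four consecutive values of the interleaved A matrix (one per output row) and multiply-accumulates them against one 4-wide chunk each from the two transposed B panels, filling columns 0-3 and 4-7 of the 4x8 output block. A minimal standalone sketch, with hypothetical names that are not part of the library:

// Scalar model of one step of the 4x8 F32 micro-kernel above.
// a[r] : broadcast A value for output row r (vld1q_dup_f32)
// b0   : 4-wide chunk loaded from mtx_b0 (columns 0-3)
// b1   : 4-wide chunk loaded from mtx_b1 (columns 4-7)
// acc[r][c] accumulates out(row r, col c).
void accumulate_4x8_block_step(const float a[4], const float b0[4], const float b1[4], float acc[4][8])
{
    for(int r = 0; r < 4; ++r)
    {
        for(int c = 0; c < 4; ++c)
        {
            acc[r][c]     += b0[c] * a[r]; // "4x4 block 0": vmlaq_f32(accR0, b00, aR)
            acc[r][c + 4] += b1[c] * a[r]; // "4x4 block 1": vmlaq_f32(accR1, b10, aR)
        }
    }
}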
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
-{
- const int out_width = static_cast<int>(output->info()->dimension(0));
- const int out_height = static_cast<int>(output->info()->dimension(1));
- const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
- const size_t out_stride = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
- const int num_elems_matrix_b_x = input1->info()->dimension(0);
-
- // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the output matrix
- Window win_a(window);
- win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
-
- Window win_b;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions while matrix A has more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(input1->info()->num_dimensions() >= 3)
- {
- win_b = window;
- }
- // Set step_x and step_y for matrix B. Scale the X range by a factor of 8 as the transposed input matrix B has 8 times fewer columns than the output matrix
- win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride));
- win_b.set(Window::DimY, Window::Dimension(0, 1, 0));
-
- Iterator ina(input0, win_a);
- Iterator inb(input1, win_b);
- Iterator out(output, window);
-
- const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));
-
- const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto *mtx_a0 = reinterpret_cast<const float16_t *>(ina.ptr());
- const auto *mtx_b0 = reinterpret_cast<const float16_t *>(inb.ptr());
- auto *mtx_out = reinterpret_cast<float16_t *>(out.ptr());
- float16x8x4_t c =
- {
- {
- vdupq_n_f16(0.f),
- vdupq_n_f16(0.f),
- vdupq_n_f16(0.f),
- vdupq_n_f16(0.f)
- }
- };
-
- /*
- This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
- |a00 a01 a02 a03 | a04 a05 a06 a07|
- |a10 a11 a12 a13 | a14 a15 a16 a17|
- |a20 a21 a22 a23 | a24 a25 a26 a27| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | a40 a50 a60 a70 | ...
- |a30 a31 a32 a33 | a34 a35 a36 a37| | a04 a14 a24 a34 || a05 a15 a25 a35 || a06 a16 a26 a36 || a07 a17 a27 a37 | a44 a54 a64 a74 | ...
- |a40 a41 a42 a43 | a44 a45 a46 a47|
- |a50 a51 a52 a53 | a54 a55 a56 a57|
- |a60 a61 a62 a63 | a64 a65 a66 a67|
- |a70 a71 a72 a73 | a74 a75 a76 a77|
-
- After this operation, the output matrix will have the following shape: [ width * 4, ceil(height / 4) ]
-
- B Matrix has been transposed as shown below
-
- |b00 b01 b02 b03 b04 b05 b06 b07|
- |b10 b11 b12 b13 b14 b15 b16 b17|
- |b20 b21 b22 b23 b24 b25 b26 b27|
- |b30 b31 b32 b33 b34 b35 b36 b37|
- ------------------->
-
- |b00 b01 b02 b03 b04 b05 b06 b07||b10 b11 b12 b13 b14 b15 b16 b17||b20 b21 b22 b23 b24 b25 b26 b27||b30 b31 b32 b33 b34 b35 b36 b37|
-
- c.val[0][0] = a00*b00 + a01*b10 + a02*b20 + a03*b30
- c.val[0][1] = a00*b01 + a01*b11 + a02*b21 + a03*b31
-
- The output tensor's XY-plane must have the following shape: [ width * 8, height / 8 ]. All other dimensions must have the same size.
- */
- const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
-
- for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
- {
- const float16x8_t p00 = vld1q_f16(mtx_a0);
- const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);
-
- const float16x8_t q00 = vld1q_f16(mtx_b0);
- const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
- const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
- const float16x8_t q06 = vld1q_f16(mtx_b0 + 24);
-
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3)));
-
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7)));
-
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3)));
-
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));
-
- mtx_a0 += 16;
- mtx_b0 += 32;
- }
-
- for(; mtx_b0 < mtx_b0_end_addr;)
- {
- const float16x4_t p00 = vld1_f16(mtx_a0);
- const float16x8_t q00 = vld1q_f16(mtx_b0);
-
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3)));
-
- mtx_a0 += 4;
- mtx_b0 += 8;
- }
-
- if(multiply_alpha)
- {
- c.val[0] = vmulq_f16(c.val[0], alpha_f16);
- c.val[1] = vmulq_f16(c.val[1], alpha_f16);
- c.val[2] = vmulq_f16(c.val[2], alpha_f16);
- c.val[3] = vmulq_f16(c.val[3], alpha_f16);
- }
-
- if(id.x() < (out_width - 8))
- {
- vst1q_f16(mtx_out, c.val[0]);
- if(id.y() + 1 < out_height)
- {
- vst1q_f16(mtx_out + 1 * out_stride, c.val[1]);
- if(id.y() + 2 < out_height)
- {
- vst1q_f16(mtx_out + 2 * out_stride, c.val[2]);
- if(id.y() + 3 < out_height)
- {
- vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
- }
- }
- }
- }
- else
- {
- // Left-over columns
- const int columns_left = out_width - id.x();
- for(int x = 0; x < columns_left; ++x)
- {
- *(mtx_out + x) = c.val[0][x];
- if(id.y() + 1 < out_height)
- {
- *(mtx_out + x + 1 * out_stride) = c.val[1][x];
- if(id.y() + 2 < out_height)
- {
- *(mtx_out + x + 2 * out_stride) = c.val[2][x];
- if(id.y() + 3 < out_height)
- {
- *(mtx_out + x + 3 * out_stride) = c.val[3][x];
- }
- }
- }
- }
- }
- },
- ina, inb, out);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
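The FP16 path follows the same pattern with 8-wide rows: each vmulq_n_f16 broadcasts one lane of the interleaved A vector across an 8-element chunk of the transposed B row. A scalar sketch of one such step (standalone, hypothetical helper; float is used for readability since float16_t requires FP16 support):

// Scalar model of one FP16 step above: lane r of the interleaved A vector
// (p00/p02) scales an 8-wide B chunk (q00..q06) into output row r's
// accumulator c.val[r].
void accumulate_f16_step(const float a_col[4], const float b_row[8], float acc[4][8])
{
    for(int r = 0; r < 4; ++r)     // c.val[r] = vaddq_f16(c.val[r], ...
    {
        for(int c = 0; c < 8; ++c) // ... vmulq_n_f16(q, vgetq_lane_f16(p, r)))
        {
            acc[r][c] += b_row[c] * a_col[r];
        }
    }
}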
-
-inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
-
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
-
- if(!is_interleaved)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
- }
- else
- {
- const int m = reshape_info.m();
- const int n = reshape_info.n();
- const int k = reshape_info.k();
- const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
- const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
-
- /* Interleave */
- TensorShape tensor_shape0{ input0->tensor_shape() };
- tensor_shape0.set(0, k);
- tensor_shape0.set(1, m);
-
- const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
- const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
-
- if(n != 0) /* Transpose */
- {
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
- }
-
- if(output->total_size() != 0)
- {
- if(n != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
- }
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
- }
-
- return Status{};
-}
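A quick numeric check of the interleaved branch, assuming the standard reshape rules documented elsewhere in the library (interleave4x4: [ width * 4, ceil(height / 4) ]; transpose1xW for F32: W = 16 / element_size = 4):

// Worked example of the shapes validate_arguments() expects for
// m = 8, k = 4, n = 12, F32 data, both mult factors = 1 (sketch).
// input0 (A), logically (k, m) = (4, 8), already interleaved:
//   [ 4 * 4, ceil(8 / 4) ]  = [16, 2]
// input1 (B), logically (n, k) = (12, 4), already transposed 1x4:
//   [ 4 * 4, ceil(12 / 4) ] = [16, 3]
// output (C): [ n, m ] = [12, 8]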
-} // namespace
-
-NEGEMMMatrixMultiplyKernel::NEGEMMMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _alpha(1.0f)
-{
-}
-
-void NEGEMMMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- // Output tensor auto initialization if not yet initialized
- TensorShape tensor_shape{ input0->info()->tensor_shape() };
- tensor_shape.set(0, is_interleaved ? reshape_info.n() : input1->info()->dimension(0));
- tensor_shape.set(1, is_interleaved ? reshape_info.m() : input0->info()->dimension(1));
-
- auto_init_if_empty(*output->info(), input0->info()->clone()->set_tensor_shape(tensor_shape));
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), alpha, is_interleaved, reshape_info));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _alpha = alpha;
-
- // Configure kernel window
- Window win{};
-
- // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
- if((output->info()->dimension(1) == 1))
- {
- const unsigned int num_elems_processed_per_iteration_x = (input0->info()->data_type() == DataType::F32) ? 16 : 32;
-
- win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
- }
- else
- {
- constexpr unsigned int num_elems_processed_per_iteration_x = 8;
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-
- win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- }
-
- INEKernel::configure(win);
-}
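For the matrix-matrix path the window therefore advances 8 columns and 4 rows per iteration, i.e. exactly one 4x8 output block per kernel step. A rough model of the resulting iteration count, assuming calculate_max_window() rounds each dimension up to a multiple of its step:

// Sketch: number of 4x8 block executions for a given output size.
int num_block_iterations(int out_width, int out_height)
{
    const int step_x = 8; // num_elems_processed_per_iteration_x
    const int step_y = 4; // num_elems_processed_per_iteration_y
    return ((out_width + step_x - 1) / step_x) * ((out_height + step_y - 1) / step_y);
}
// e.g. a 12x8 output -> 2 * 2 = 4 executions, with the partial blocks at
// the right edge handled by the leftover-columns stores above.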
-
-Status NEGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved,
- const GEMMReshapeInfo &reshape_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, alpha, is_interleaved, reshape_info));
-
- return Status{};
-}
-
-void NEGEMMMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
- const bool is_output_vector = (_output->info()->dimension(1) == 1);
- switch(_input0->info()->data_type())
- {
- case DataType::F32:
- {
- is_output_vector ? vector_matrix_multiply_f32(_input0, _input1, _output, window, info, _alpha) :
- matrix_matrix_multiply_f32(_input0, _input1, _output, window, _alpha);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- is_output_vector ? vector_matrix_multiply_f16(_input0, _input1, _output, window, info, _alpha) :
- matrix_matrix_multiply_f16(_input0, _input1, _output, window, _alpha);
- break;
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- {
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- }
-}
-} // namespace arm_compute
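For context, this is how the kernel was typically driven before this patch removed it (sketch; GEMMReshapeInfo and NEScheduler are existing library types, while the tensor names and M/N/K values are placeholders):

// Pre-removal usage sketch: inputs must already be reshaped by the
// interleave/transpose kernels when is_interleaved is true.
NEGEMMMatrixMultiplyKernel mm_kernel;
GEMMReshapeInfo            reshape_info(M, N, K); // M, N, K: GEMM dimensions
mm_kernel.configure(&interleaved_a, &transposed_b, &output, /*alpha=*/1.0f,
                    /*is_interleaved=*/true, reshape_info);
NEScheduler::get().schedule(&mm_kernel, Window::DimY); // split work across rows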
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
deleted file mode 100644
index 3bc162a1b4..0000000000
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to multiply two input matrices "A" and "B". All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication
- *
- * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices and reshaped respectively with @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
- * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 is a matrix. The implementation also assumes that neither tensor has been reshaped
- *
- */
-class NEGEMMMatrixMultiplyKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMMatrixMultiplyKernel";
- }
- /** Constructor */
- NEGEMMMatrixMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMMatrixMultiplyKernel(const NEGEMMMatrixMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMMatrixMultiplyKernel &operator=(const NEGEMMMatrixMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMMatrixMultiplyKernel(NEGEMMMatrixMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMMatrixMultiplyKernel &operator=(NEGEMMMatrixMultiplyKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
- * These two kernels change the layout of the original matrices to be more cache-friendly.
- *
- * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
- * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
- * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
- * @param[in] alpha Weight of the matrix product
- * @param[in] is_interleaved (Optional) True if input0 and input1 have been reshaped respectively using @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
- * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved = true, this object must contain the information needed to understand how matrix A and matrix B have been reshaped
- */
- void configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixMultiplyKernel
- *
- * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
- * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
- * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
- * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
- * @param[in] alpha Weight of the matrix product
- * @param[in] is_interleaved (Optional) True if input0 and input1 have been reshaped respectively using @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
- * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved = true, this object must contain the information needed to understand how matrix A and matrix B have been reshaped
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input0;
- const ITensor *_input1;
- ITensor *_output;
- float _alpha;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H*/
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
deleted file mode 100644
index 20b0cabd1f..0000000000
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace
-{
-TensorShape get_output_shape(const ITensorInfo *input)
-{
- TensorShape output_shape{ input->tensor_shape() };
- const size_t transpose_w = 16 / input->element_size();
- output_shape.set(0, input->dimension(1) * transpose_w);
- output_shape.set(1, static_cast<size_t>(std::ceil((input->dimension(0) / static_cast<float>(transpose_w)))));
- return output_shape;
-}
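A quick numeric check of the rule above (sketch):

// For an F32 input of width 31, height 7: transpose_w = 16 / 4 = 4, so
//   output dim0 = 7 * 4           = 28
//   output dim1 = ceil(31 / 4.0f) = 8
// Each 1x4 row chunk becomes one slot along the output's X axis; the
// ragged tail (31 % 4 = 3 elements) is padded with zeros by run().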
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), get_output_shape(input->info()), 1, input->info()->data_type());
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- const size_t vector_size = 16 / input->info()->element_size();
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(vector_size));
-
- INEKernel::configure(win);
-}
-
-Status NEGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-
- return Status{};
-}
-
-void NEGEMMTranspose1xWKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-
- /*
- * The following is an example of how the transposition1xW works when the input data type is F32
- *
- * |a00 a01 a02 a03|
- * |a10 a11 a12 a13|
- * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
- * |a30 a31 a32 a33|
- *
- * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
- */
-
- // Set window for output tensor. Set the X and Y dimensions to 0 in order to allow multi-threading and future batched matrix multiplications
- Window win_out(window);
- win_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Iterator in(_input, window);
- Iterator out(_output, win_out);
-
- const size_t in_width = _input->info()->dimension(0);
- const size_t element_size = _input->info()->element_size();
- const size_t out_stride = _output->info()->strides_in_bytes()[1];
- const size_t vector_size = 16 / element_size;
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const uint8_t *in_ptr = in.ptr();
- uint8_t *const out_ptr = out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride;
-
- for(size_t k = 0; k < vector_size; ++k)
- {
- // If the input width is not a multiple of W, we fill the remainder with 0s
- if((id.x() + k) >= in_width)
- {
- std::memset(out_ptr + k * element_size, 0, element_size);
- }
- else
- {
- std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size);
- }
- }
- },
- in, out);
-}
-} // namespace arm_compute
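The copy loop above is equivalent to this standalone scalar model (hypothetical helper; zero-padding of the ragged tail is omitted for brevity):

// Scalar model of the 1xW transposition: element (x, y) of the input lands
// in output row x / W, at offset y * W + (x % W) within that row.
void transpose_1xW_model(const float *in, int width, int height, float *out, int W)
{
    const int out_row_len = height * W; // output width in elements
    for(int y = 0; y < height; ++y)
    {
        for(int x = 0; x < width; ++x)
        {
            out[(x / W) * out_row_len + y * W + (x % W)] = in[y * width + x];
        }
    }
}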
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
deleted file mode 100644
index 7ca71cf414..0000000000
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H
-#define ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H
-
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
- *
- * The following is an example of how the transposition1xW works when the input data type is F32
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccccccccccc}
- * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- *
- * The following is an example of how the transposition1xW works when the input data type is F16
- *
- * @f[
- * \left( \begin{array}{cccccccc}
- * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\
- * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\
- * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\
- * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc}
- * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\
- * \end{array} \right)
- * @f]
- *
- * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
- *
- */
-class NEGEMMTranspose1xWKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMTranspose1xWKernel";
- }
- /** Constructor */
- NEGEMMTranspose1xWKernel() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMTranspose1xWKernel(const NEGEMMTranspose1xWKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMTranspose1xWKernel &operator=(const NEGEMMTranspose1xWKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMTranspose1xWKernel(NEGEMMTranspose1xWKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMTranspose1xWKernel &operator=(NEGEMMTranspose1xWKernel &&) = default;
- /** Default destructor */
- ~NEGEMMTranspose1xWKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: same as @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel
- *
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] output Output tensor info. Data type supported: same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H */
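Pre-removal usage followed the usual simple-kernel pattern (sketch with placeholder tensors; the reshape responsibility moves elsewhere with this patch):

NEGEMMTranspose1xWKernel transpose_kernel;
transpose_kernel.configure(&src, &transposed_dst); // dst auto-initialized if empty
NEScheduler::get().schedule(&transpose_kernel, Window::DimY);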
diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp
index 7090da8015..f1d457d399 100644
--- a/src/core/NEON/kernels/NEGatherKernel.cpp
+++ b/src/core/NEON/kernels/NEGatherKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,10 +27,10 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -38,40 +38,27 @@ namespace arm_compute
{
namespace
{
-/** Validate the indices
- *
- * Validate that indices are not negative
- *
- * @param[in] indices Indices tensor info.
- */
-template <typename U>
-void validate_indices(const ITensor *indices)
-{
- for(size_t i = 0; i < indices->info()->tensor_shape()[0]; ++i)
- {
- ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(i)))) < 0);
- }
-}
-
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
- ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- if(axis < 0)
+ if (axis < 0)
{
axis += input->num_dimensions();
}
ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 >
+ Coordinates::num_max_dimensions);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), axis);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
}
@@ -82,53 +69,70 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices,
} // namespace
NEGatherKernel::NEGatherKernel()
- : _input{}, _indices{}, _axis{}, _output{}, _func{}
+ : _input{}, _indices{}, _axis{}, _output{}, _func{}, _src_it_strides{}, _idx_it_strides{}
{
}
-template <typename U>
-inline void NEGatherKernel::gather_0_axis(const Window &window, const ThreadInfo &info)
+template <typename TIndex>
+void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
- // Validate that the indices are not negative
- validate_indices<U>(_indices);
+ auto dst_win = window;
- Iterator output_it(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- Coordinates gather_id(id);
+ const auto src_info = _input->info();
+ const auto idx_info = _indices->info();
+ const auto dst_info = _output->info();
- auto new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0]))));
- gather_id.set(0, new_index);
+ const auto num_dims = dst_info->num_dimensions();
+ const auto chunk_stride = src_info->strides_in_bytes()[_axis];
- std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), output_it.ptr());
- },
- output_it);
-}
+ const auto window_start_x = window.x().start();
+ const auto window_end_x = window.x().end();
+ auto window_size_x = src_info->element_size();
-template <typename U>
-void NEGatherKernel::gather_n_axis(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
+ const auto idx_limit = static_cast<TIndex>(src_info->tensor_shape()[_axis]);
- // Validate that the indices are not negative
- validate_indices<U>(_indices);
+ if (_axis != 0)
+ {
+ dst_win.set(0, Window::Dimension(window_start_x, window_start_x + 1, 1));
+ window_size_x *= window_end_x - window_start_x;
+ }
- Window output_window{ window };
- output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+ // Compute source and index tensors window based on the output window.
+ auto src_win = dst_win;
+ Window idx_win;
- Iterator output_it(_output, output_window);
- execute_window_loop(output_window, [&](const Coordinates & id)
+ for (size_t i = 0; i < idx_info->num_dimensions(); ++i)
{
- Coordinates gather_id(id);
+ src_win.set(_axis + i, Window::Dimension(0, 1, 1));
+ idx_win.set(_axis + i, window[_axis + i]);
+ }
- auto new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis]))));
- gather_id.set(_axis, new_index);
+ // Use the custom strides to access all three tensors using the same loop.
+ Iterator src_it(num_dims, _src_it_strides, _input->buffer(), src_info->offset_first_element_in_bytes(), src_win);
+ Iterator idx_it(num_dims, _idx_it_strides, _indices->buffer(), idx_info->offset_first_element_in_bytes(), idx_win);
+ Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(),
+ dst_info->offset_first_element_in_bytes(), dst_win);
- std::copy_n(_input->ptr_to_element(gather_id), _input->info()->dimension(0) * _output->info()->element_size(), output_it.ptr());
- },
- output_it);
+ execute_window_loop(
+ dst_win,
+ [&](const Coordinates &)
+ {
+ const auto idx = *reinterpret_cast<const TIndex *>(idx_it.ptr());
+
+ if (idx >= 0 && idx < idx_limit)
+ {
+ const auto src_ptr = src_it.ptr() + idx * chunk_stride;
+
+ std::copy_n(src_ptr, window_size_x, dst_it.ptr());
+ }
+ else
+ {
+ std::fill_n(dst_it.ptr(), window_size_x, 0);
+ }
+ },
+ src_it, idx_it, dst_it);
}
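Note the behavioural change the new common path introduces: indices are bounds-checked per element, and anything outside [0, shape[axis]) now yields zeros rather than failing validation as the old validate_indices() did. A scalar model for a 1-D source (sketch, hypothetical helper):

// Per-element semantics of the copy above, reduced to one dimension.
void gather_model(const float *src, int src_len, const int32_t *idx, int n, float *dst)
{
    for(int i = 0; i < n; ++i)
    {
        dst[i] = (idx[i] >= 0 && idx[i] < src_len) ? src[idx[i]] : 0.0f; // out of range -> 0
    }
}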
void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
@@ -141,53 +145,64 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe
_output = output;
_axis = axis;
- if(_axis < 0)
+ if (_axis < 0)
{
_axis += input->info()->num_dimensions();
}
ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions()));
- if(0 == _axis)
+ switch (_indices->info()->data_type())
{
- switch(_indices->info()->data_type())
- {
- case DataType::U32:
- _func = &NEGatherKernel::gather_0_axis<uint32_t>;
- break;
- case DataType::S32:
- _func = &NEGatherKernel::gather_0_axis<int32_t>;
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
- }
- else
- {
- switch(_indices->info()->data_type())
- {
- case DataType::U32:
- _func = &NEGatherKernel::gather_n_axis<uint32_t>;
- break;
- case DataType::S32:
- _func = &NEGatherKernel::gather_n_axis<int32_t>;
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
+ case DataType::U32:
+ _func = &NEGatherKernel::gather_common<uint32_t>;
+ break;
+ case DataType::S32:
+ _func = &NEGatherKernel::gather_common<int32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
}
+
// Output auto initialization if not yet initialized
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
// Create window
Window win = calculate_max_window(*output->info(), Steps());
INEKernel::configure(win);
+
+ // Create input and indices strides that have the same number of dimensions as the output tensor.
+ // These will be used to iterate lock-step through all tensors (input, indices and output).
+ size_t dim_no = 0;
+
+ const auto input_info = input->info();
+ const auto &input_strides = input_info->strides_in_bytes();
+
+ const auto indices_info = indices->info();
+ const auto &indices_strides = indices_info->strides_in_bytes();
+ const auto indices_num_dims = indices_info->num_dimensions();
+
+ for (; dim_no < static_cast<size_t>(_axis); ++dim_no)
+ {
+ _src_it_strides[dim_no] = input_strides[dim_no];
+ }
+
+ for (; dim_no < static_cast<size_t>(_axis) + indices_num_dims; ++dim_no)
+ {
+ _idx_it_strides[dim_no] = indices_strides[dim_no - _axis];
+ }
+
+ for (; dim_no < Coordinates::num_max_dimensions; ++dim_no)
+ {
+ _src_it_strides[dim_no] = input_strides[dim_no - indices_num_dims + 1];
+ }
}
-Status NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+Status
+NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
return Status{};
diff --git a/src/core/NEON/kernels/NEGatherKernel.h b/src/core/NEON/kernels/NEGatherKernel.h
index 0711f8190b..b8c069f99e 100644
--- a/src/core/NEON/kernels/NEGatherKernel.h
+++ b/src/core/NEON/kernels/NEGatherKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,7 @@
#define ARM_COMPUTE_NEGATHERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -61,17 +62,17 @@ public:
/** Initialise the kernel's inputs and outputs
*
* @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All
- * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following type: U32/S32. Each value Must be in range [0, input.shape[@p axis])
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @note 2D or 3D indices are only supported for the axis 1.
* @param[out] output Destination tensor. Data type supported: Same as @p input
- * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0.
+ *
*/
void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGatherKernel
+
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: All
- * @param[in] indices Indices tensor info. Supported tensor rank: up to 1. Must be one of the following type: U32/S32. Each value Must be in range [0, input.shape[@p axis])
- * @param[in] output Destination tensor info. Data type supported: Same as @p input
- * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
+ * Similar to @ref NEGatherKernel::configure()
*
* @return a status
*/
@@ -81,25 +82,8 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- /** Implementation of the gather operation for 0 axis.
- *
- * For gather on the 0 axis an element by element copy is performed.
- *
- * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window())
- * @param[in] info Info about executing thread and CPU.
- */
- template <typename U>
- void gather_0_axis(const Window &window, const ThreadInfo &info);
-
- /** Implementation of the gather operation.
- *
- * For 1<=axis a row-wise copy is taking place.
- *
- * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window())
- * @param[in] info Info about executing thread and CPU.
- */
- template <typename U>
- void gather_n_axis(const Window &window, const ThreadInfo &info);
+ template <typename TIndex>
+ void gather_common(const Window &window, const ThreadInfo &info);
using kernel_ptr = void (NEGatherKernel::*)(const Window &window, const ThreadInfo &info);
@@ -108,6 +92,9 @@ private:
int _axis;
ITensor *_output;
kernel_ptr _func;
+
+ Strides _src_it_strides;
+ Strides _idx_it_strides;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEGATHERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
index 56aed0ca25..549319e49f 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,9 +27,12 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/genproposals/list.h"
#include <arm_neon.h>
@@ -37,6 +40,53 @@ namespace arm_compute
{
namespace
{
+struct ComputeAllAnchorsData
+{
+ DataType dt;
+};
+
+using ComputeAllAnchorsSelectorPtr = std::add_pointer<bool(const ComputeAllAnchorsData &data)>::type;
+using ComputeAllAnchorsUKernelPtr = std::add_pointer<void(
+ const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type;
+
+struct ComputeAllAnchorsKernel
+{
+ const char *name;
+ const ComputeAllAnchorsSelectorPtr is_selected;
+ ComputeAllAnchorsUKernelPtr ukernel;
+};
+
+static const ComputeAllAnchorsKernel available_kernels[] = {
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+ {"neon_qu16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::QSYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)},
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {"neon_fp16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)},
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {"neon_fp32_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)},
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const ComputeAllAnchorsKernel *get_implementation(const ComputeAllAnchorsData &data)
+{
+ for (const auto &uk : available_kernels)
+ {
+ if (uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
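This table-plus-predicate selector replaces the data-type switch that run() used to carry (deleted further down). The pattern in isolation, as a minimal standalone sketch:

#include <cstdio>

// Ordered table of {predicate, implementation}; the first match wins, so
// table order encodes priority (e.g. the F16 entry before the F32 fallback).
struct Kernel
{
    bool (*is_selected)(int bits);
    void (*ukernel)();
};

void run_f16() { std::puts("fp16 path"); }
void run_f32() { std::puts("fp32 path"); }

static const Kernel table[] = {
    { [](int bits) { return bits == 16; }, run_f16 },
    { [](int bits) { return bits == 32; }, run_f32 },
};

const Kernel *pick(int bits)
{
    for(const auto &k : table)
    {
        if(k.is_selected(bits))
        {
            return &k;
        }
    }
    return nullptr; // caller must handle "no implementation"
}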
+
Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(anchors, all_anchors);
@@ -44,7 +94,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
- if(all_anchors->total_size() > 0)
+ if (all_anchors->total_size() > 0)
{
const size_t feature_height = info.feat_height();
const size_t feature_width = info.feat_width();
@@ -54,7 +104,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
- if(is_data_type_quantized(anchors->data_type()))
+ if (is_data_type_quantized(anchors->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors);
}
@@ -82,7 +132,8 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a
// Initialize the output if empty
const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
- auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
+ auto_init_if_empty(*all_anchors->info(),
+ TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
// Set instance variables
_anchors = anchors;
@@ -94,106 +145,23 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a
INEKernel::configure(win);
}
-Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors,
+ const ITensorInfo *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
return Status{};
}
-template <>
-void NEComputeAllAnchorsKernel::internal_run<int16_t>(const Window &window)
-{
- Iterator all_anchors_it(_all_anchors, window);
- Iterator anchors_it(_all_anchors, window);
-
- const size_t num_anchors = _anchors->info()->dimension(1);
- const float stride = 1.f / _anchors_info.spatial_scale();
- const size_t feat_width = _anchors_info.feat_width();
-
- const UniformQuantizationInfo qinfo = _anchors->info()->quantization_info().uniform();
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const size_t anchor_offset = id.y() % num_anchors;
-
- const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr());
- const auto anchor_ptr = reinterpret_cast<int16_t *>(_anchors->ptr_to_element(Coordinates(0, anchor_offset)));
-
- const size_t shift_idy = id.y() / num_anchors;
- const float shiftx = (shift_idy % feat_width) * stride;
- const float shifty = (shift_idy / feat_width) * stride;
-
- const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx;
- const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty;
- const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx;
- const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty;
-
- *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale);
- *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale);
- *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale);
- *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale);
- },
- all_anchors_it);
-}
-
-template <typename T>
-void NEComputeAllAnchorsKernel::internal_run(const Window &window)
-{
- Iterator all_anchors_it(_all_anchors, window);
- Iterator anchors_it(_all_anchors, window);
-
- const size_t num_anchors = _anchors->info()->dimension(1);
- const T stride = 1.f / _anchors_info.spatial_scale();
- const size_t feat_width = _anchors_info.feat_width();
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const size_t anchor_offset = id.y() % num_anchors;
-
- const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
- const auto anchor_ptr = reinterpret_cast<T *>(_anchors->ptr_to_element(Coordinates(0, anchor_offset)));
-
- const size_t shift_idy = id.y() / num_anchors;
- const T shiftx = (shift_idy % feat_width) * stride;
- const T shifty = (shift_idy / feat_width) * stride;
-
- *out_anchor_ptr = *anchor_ptr + shiftx;
- *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
- *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
- *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
- },
- all_anchors_it);
-}
-
void NEComputeAllAnchorsKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_anchors->info()->data_type())
- {
- case DataType::QSYMM16:
- {
- internal_run<int16_t>(window);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- internal_run<float16_t>(window);
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- {
- internal_run<float>(window);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Data type not supported");
- }
- }
+ const auto *uk = get_implementation(ComputeAllAnchorsData{_anchors->info()->data_type()});
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_anchors, _all_anchors, _anchors_info, window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
index f6d39e50a7..30699eee01 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,12 +74,9 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- template <typename T>
- void internal_run(const Window &window);
-
const ITensor *_anchors;
ITensor *_all_anchors;
ComputeAnchorsInfo _anchors_info;
};
-} // arm_compute
+} // namespace arm_compute
#endif // ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
deleted file mode 100644
index a28a77a4fb..0000000000
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ /dev/null
@@ -1,460 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEIm2ColKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <cstring>
-#include <tuple>
-
-using namespace arm_compute;
-using namespace misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias);
- ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one is not supported on Neon");
-
- // Since no implicit padding is added, check that the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions
- const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
- const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
- ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
-
- if(output->total_size() > 0)
- {
- TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false)));
-
- const DataLayout data_layout = input->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input->dimension(width_idx), input->dimension(height_idx),
- kernel_dims.width, kernel_dims.height,
- conv_info, dilation);
-
- Window win = calculate_max_window(*input, Steps());
- win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1));
- win.set(height_idx, Window::Dimension(0, convolved_dims.second, 1));
- win.set(channel_idx, Window::Dimension(0, 1, 1));
-
- // The NEIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
-
- return std::make_pair(Status{}, win);
-}
-
-template <typename T, bool has_pads>
-inline void linearize_volume_nchw(const uint8_t *const in_ptr,
- T *out_ptr,
- bool has_bias,
- int top_left_x,
- int top_left_y,
- int kernel_width,
- int kernel_height,
- int kernel_depth,
- int input_w,
- int input_h,
- int input_stride_x,
- int input_stride_y,
- int input_stride_z,
- int pad_value,
- int dilation_x,
- int dilation_y)
-{
- const int kernel_size2 = kernel_width * kernel_height;
- const int x_e = top_left_x + kernel_width * dilation_x;
- const int y_e = top_left_y + kernel_height * dilation_y;
-
- // Linearize volume
- int d = 0;
-    // This for loop linearizes a volume three slices at a time. This allows:
-    // 1) reducing the iterations of the outer for loop "d"
-    // 2) an optimized im2col for the first convolution layer, which usually has 3 IFMs
- for(; d <= (kernel_depth - 3); d += 3)
- {
- for(int y = top_left_y; y < y_e; y += dilation_y)
- {
- if((y < 0 || y >= input_h) && has_pads)
- {
-                // All the values will be the quantization offset (zero when not quantized)
- for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
- {
- *(out_ptr + 0 * kernel_size2) = pad_value;
- *(out_ptr + 1 * kernel_size2) = pad_value;
- *(out_ptr + 2 * kernel_size2) = pad_value;
- }
- }
- else
- {
- for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
- {
- if((x < 0 || x >= input_w) && has_pads)
- {
- *(out_ptr + 0 * kernel_size2) = pad_value;
- *(out_ptr + 1 * kernel_size2) = pad_value;
- *(out_ptr + 2 * kernel_size2) = pad_value;
- }
- else
- {
- *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
- *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
- *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
- }
- }
- }
- }
- out_ptr += 2 * kernel_size2;
- }
-
- // Left over
- for(; d < kernel_depth; d++)
- {
- for(int y = top_left_y; y < y_e; y += dilation_y)
- {
- if((y < 0 || y >= input_h) && has_pads)
- {
-                // All the values will be the quantization offset (zero when not quantized)
- memset(static_cast<void *>(out_ptr), pad_value, kernel_width * sizeof(T));
- out_ptr += kernel_width;
- }
- else
- {
- for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
- {
- if((x < 0 || x >= input_w) && has_pads)
- {
- *out_ptr = pad_value;
- }
- else
- {
- *out_ptr = *(reinterpret_cast<const T *>(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
- }
- }
- }
- }
- }
-
- // Append 1 if the convolution layer has biases
- if(has_bias)
- {
- *out_ptr = static_cast<T>(1);
- }
-}
-
-template <typename T, bool has_pads>
-inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
- T *out_ptr,
- bool has_bias,
- int start_x,
- int start_y,
- int kernel_width,
- int kernel_height,
- int input_w,
- int input_h,
- int input_c,
- int input_stride_y,
- int input_stride_z,
- int pad_value,
- int dilation_x,
- int dilation_y)
-{
- const int end_x = start_x + kernel_width * dilation_x;
- const int end_y = start_y + kernel_height * dilation_y;
- const int pad_quant = kernel_width * input_c;
- const int element_size = static_cast<int>(sizeof(T));
- if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == input_c * element_size))
- {
- for(int y = start_y; y < end_y; y += dilation_y)
- {
-            // Optimized path: no dilation and no boundary pixels
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
- out_ptr += input_c * kernel_width;
- }
- }
- else
- {
- for(int y = start_y; y < end_y; y += dilation_y)
- {
- if(y < 0 || y >= input_h)
- {
- memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size);
- out_ptr += pad_quant;
- }
- else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size)
- {
- for(int x = start_x; x < end_x; x += dilation_x)
- {
- if(x < 0 || x >= input_w)
- {
- memset(static_cast<void *>(out_ptr), pad_value, input_c * element_size);
- out_ptr += input_c;
- }
- else
- {
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * element_size);
- out_ptr += input_c;
- }
- }
- }
- else
- {
-                // Optimized path: no dilation and no boundary pixels
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
- out_ptr += input_c * kernel_width;
- }
- }
- }
- // Append 1 if the convolution layer has biases
- if(has_bias)
- {
- *out_ptr = static_cast<T>(1);
- }
-}
-} // namespace
-
-template <typename T, bool has_pads, bool is_nchw>
-void NEIm2ColKernel::run_im2col(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-
- const int input_w = _input->info()->dimension(width_idx);
- const int input_h = _input->info()->dimension(height_idx);
- const int input_c = _input->info()->dimension(channel_idx);
- const int input_stride_x = _input->info()->strides_in_bytes().x();
- const int input_stride_y = _input->info()->strides_in_bytes().y();
- const int input_stride_z = _input->info()->strides_in_bytes().z();
- const int pad_left = _conv_info.pad_left();
- const int pad_top = _conv_info.pad_top();
- const int stride_x = _conv_info.stride().first;
- const int stride_y = _conv_info.stride().second;
- const int pad_value = is_data_type_quantized(_input->info()->data_type()) ? _input->info()->quantization_info().uniform().offset : 0;
-
- Window window_in_out(window);
- // The first three dimensions of the input and output are increased by the inner loops
- window_in_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Create iterators
- Iterator in(_input, window_in_out);
- Iterator out(_output, window_in_out);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int start_w = id[width_idx] * stride_x - pad_left;
- const int start_h = id[height_idx] * stride_y - pad_top;
-
- // Get pointers
- const uint8_t *const input_ptr = in.ptr();
- auto output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * _output->info()->strides_in_bytes().y());
-
- // Linearize volume
- if(is_nchw)
- {
- linearize_volume_nchw<T, has_pads>(input_ptr,
- output_ptr,
- _has_bias,
- start_w,
- start_h,
- _kernel_width,
- _kernel_height,
- input_c,
- input_w,
- input_h,
- input_stride_x,
- input_stride_y,
- input_stride_z,
- pad_value,
- _dilation.x(),
- _dilation.y());
- }
- else
- {
- linearize_volume_nhwc<T, has_pads>(input_ptr,
- output_ptr,
- _has_bias,
- start_w,
- start_h,
- _kernel_width,
- _kernel_height,
- input_w,
- input_h,
- input_c,
- input_stride_y,
- input_stride_z,
- pad_value,
- _dilation.x(),
- _dilation.y());
- }
- },
- in, out);
-}
-
-NEIm2ColKernel::NEIm2ColKernel()
- : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U), _data_layout(DataLayout::UNKNOWN)
-{
-}
-
-void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
- ARM_COMPUTE_UNUSED(num_groups);
-
- _data_layout = input->info()->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- _input = input;
- _output = output;
- _conv_info = conv_info;
- _kernel_width = kernel_dims.width;
- _kernel_height = kernel_dims.height;
- _dilation = dilation;
- _convolved_dims = scaled_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx),
- _kernel_width, _kernel_height,
- _conv_info, _dilation);
- _has_bias = has_bias;
-
- if(_data_layout == DataLayout::NCHW)
- {
- switch(_input->info()->data_type())
- {
- case DataType::F32:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float, false, true> : &NEIm2ColKernel::run_im2col<float, true, true>;
- break;
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
- case DataType::BFLOAT16:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<bfloat16, false, true> : &NEIm2ColKernel::run_im2col<bfloat16, true, true>;
- break;
-#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float16_t, false, true> : &NEIm2ColKernel::run_im2col<float16_t, true, true>;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::QASYMM8_SIGNED:
- case DataType::QASYMM8:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<qasymm8_t, false, true> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, true>;
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- }
- else
- {
- switch(_input->info()->data_type())
- {
- case DataType::F32:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float, false, false> : &NEIm2ColKernel::run_im2col<float, true, false>;
- break;
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
- case DataType::BFLOAT16:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<bfloat16, false, false> : &NEIm2ColKernel::run_im2col<bfloat16, true, false>;
- break;
-#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float16_t, false, false> : &NEIm2ColKernel::run_im2col<float16_t, true, false>;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::QASYMM8:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<uint8_t, false, false> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, false>;
- break;
- case DataType::QASYMM8_SIGNED:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<int8_t, false, false> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, false>;
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- }
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-}
-
-Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation).first);
- return Status{};
-}
-
-void NEIm2ColKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- (this->*_func)(window);
-}
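
For readers tracking this removal: the deleted file's logic, not the algorithm, is what disappears here (like the other kernels in this patch, the im2col transform presumably lives on in the cpu:: backend). As a reference, a minimal scalar sketch of the NCHW path, with hypothetical names and without dilation or the 3-slice unrolling of linearize_volume_nchw:

#include <cstddef>
#include <vector>

// Scalar reference for the NCHW im2col transform: each kernel-sized patch is
// linearized into one slot of the output, out-of-bounds taps are filled with
// pad_value, and a trailing 1 is appended per patch when has_bias is set.
template <typename T>
std::vector<T> im2col_nchw(const std::vector<T> &in, int c, int h, int w,
                           int kh, int kw, int stride, int pad, T pad_value, bool has_bias)
{
    const int out_h   = (h + 2 * pad - kh) / stride + 1;
    const int out_w   = (w + 2 * pad - kw) / stride + 1;
    const int col_len = c * kh * kw + (has_bias ? 1 : 0);
    std::vector<T> out(static_cast<size_t>(col_len) * out_h * out_w);

    for (int oy = 0; oy < out_h; ++oy)
    {
        for (int ox = 0; ox < out_w; ++ox)
        {
            T *col = &out[static_cast<size_t>(oy * out_w + ox) * col_len];
            for (int d = 0; d < c; ++d)
            {
                for (int ky = 0; ky < kh; ++ky)
                {
                    for (int kx = 0; kx < kw; ++kx)
                    {
                        const int  y      = oy * stride - pad + ky;
                        const int  x      = ox * stride - pad + kx;
                        const bool inside = (y >= 0 && y < h && x >= 0 && x < w);
                        *col++ = inside ? in[(d * h + y) * w + x] : pad_value;
                    }
                }
            }
            if (has_bias)
            {
                *col = static_cast<T>(1); // matches the kernel's bias handling
            }
        }
    }
    return out;
}
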
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.h b/src/core/NEON/kernels/NEIm2ColKernel.h
deleted file mode 100644
index 6c1c631d82..0000000000
--- a/src/core/NEON/kernels/NEIm2ColKernel.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEIM2COLKERNEL_H
-#define ARM_COMPUTE_NEIM2COLKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-class Size2D;
-
-/** Interface for the im2col reshape kernel.
- *
- * Rearranges image blocks into columns: each convolution block is stripped out into a single column,
- * which turns the convolution into a plain matrix multiplication.
- *
- * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have:
- * For example, taking the 4x4 image below and assuming 3x3 image blocks with a stride of 1, we have:
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
- * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
- * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
- * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- */
-class NEIm2ColKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEIm2ColKernel";
- }
- /** Default constructor */
- NEIm2ColKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEIm2ColKernel(const NEIm2ColKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEIm2ColKernel &operator=(const NEIm2ColKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEIm2ColKernel(NEIm2ColKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEIm2ColKernel &operator=(NEIm2ColKernel &&) = default;
- /** Default destructor */
- ~NEIm2ColKernel() = default;
-
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- *                         while every optional dimension from 4 and above represents a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
- * Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
- * @param[out] output The output tensor. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in]  has_bias    In case biases are provided, appends a 1 to each linearized patch.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
- */
- void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- *                         while every optional dimension from 4 and above represents a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
- * Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
- * @param[in] output The output tensor. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in]  has_bias    In case biases are provided, appends a 1 to each linearized patch.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run im2col
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T, bool has_pads, bool is_nchw>
- void run_im2col(const Window &window);
-
- /** Common signature for all the specialised im2col functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using Im2ColFunctionPtr = void (NEIm2ColKernel::*)(const Window &window);
-
- Im2ColFunctionPtr _func;
- const ITensor *_input;
- ITensor *_output;
- std::pair<unsigned int, unsigned int> _convolved_dims;
- PadStrideInfo _conv_info;
- unsigned int _kernel_width;
- unsigned int _kernel_height;
- bool _has_bias;
- Size2D _dilation;
- DataLayout _data_layout;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEIM2COLKERNEL_H */
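
The header's matrix example generalizes as follows: each kh x kw x IFM patch is linearized into one row, so for the 4x4 input above with 3x3 blocks and stride 1 the result X has out_w * out_h = 4 rows and kh * kw * IFM = 9 columns (plus a trailing 1 per row when has_bias is set). Reshaping the filter weights to a 9 x OFM matrix W then performs the whole convolution as a single GEMM, Y = X * W with Y of shape 4 x OFM, which is the reason the reshape exists.
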
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index d33431a8d2..0a1780f6ee 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,11 +31,14 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/instancenorm/list.h"
#include <arm_neon.h>
@@ -43,137 +46,52 @@ namespace arm_compute
{
namespace
{
-template <typename InputType, typename AccType = InputType>
-void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs)
-{
- result = wrapper::vadd(result, inputs);
- result_square = wrapper::vadd(result_square, wrapper::vmul(inputs, inputs));
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline void vector_float_sum(float32x4_t &result, float32x4_t &result_square, const float16x8_t &inputs)
+struct InstanceNormSelectorData
{
- vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgetlow(inputs)));
- vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgethigh(inputs)));
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <typename InputType, typename AccType = InputType>
-InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
+ DataType dt;
+};
+
+using InstanceNormSelectorPtr = std::add_pointer<bool(const InstanceNormSelectorData &data)>::type;
+using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input,
+ ITensor *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision,
+ const Window &window)>::type;
+
+struct InstanceNormKernel
{
- return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
-}
-
+ const char *name;
+    const InstanceNormSelectorPtr is_selected;
+ InstanceNormUKernelPtr ukernel;
+};
+
+static const InstanceNormKernel available_kernels[] = {
+ {"fp32_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float16x8_t vector_float_norm(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta)
-{
- const auto input_low = wrapper::vcvt<float>(wrapper::vgetlow(inputs));
- const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs));
- const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm(input_low, vec_mean, vec_multip, vec_beta));
- const auto result_high = wrapper::vcvt<float16_t>(vector_float_norm(input_high, vec_mean, vec_multip, vec_beta));
- float16x8_t result = wrapper::vcombine(result_low, result_high);
-
- return result;
-}
+ {"fp16_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+};
-template <typename T, typename AccType = T>
-void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel if one is found, otherwise nullptr
+ */
+const InstanceNormKernel *get_implementation(const InstanceNormSelectorData &data)
{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- // Clear X/Y dimensions on execution window as we handle the planes manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- win.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- constexpr int window_step_x = 16 / sizeof(T);
- const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
-
- Iterator input_it(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ for (const auto &uk : available_kernels)
{
- Window win_plane = window;
- win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
- win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
-
- Iterator input_plane_it(input, win_plane);
- Iterator output_plane_it(output, win_plane);
-
- auto sum_h_w = static_cast<AccType>(0.f);
- auto sum_squares_h_w = static_cast<AccType>(0.f);
-
- execute_window_loop(win_plane, [&](const Coordinates &)
+ if (uk.is_selected(data))
{
- const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
-
- auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
- auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
-
- // Compute S elements per iteration
- int x = window.x().start();
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- auto vec_input_val = wrapper::vloadq(input_ptr + x);
- vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
- }
-
- auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
- auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
-
- vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
- vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
-
- sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
- sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto value = static_cast<AccType>(*(input_ptr + x));
- sum_h_w += value;
- sum_squares_h_w += value * value;
- }
- },
- input_plane_it, output_plane_it);
-
- const auto mean_h_w = sum_h_w / elements_plane;
- const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
-
- const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
- const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
- const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
- const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
-
- execute_window_loop(win_plane, [&](const Coordinates &)
- {
- auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
- auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
-
- // Compute S elements per iteration
- int x = window.x().start();
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- const auto vec_val = wrapper::vloadq(input_ptr + x);
- const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
- wrapper::vstore(output_ptr + x, normalized_vec);
- }
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto val = static_cast<AccType>(*(input_ptr + x));
- *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta);
- }
- },
- input_plane_it, output_plane_it);
- },
- input_it);
+ return &uk;
+ }
+ }
+ return nullptr;
}
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
@@ -184,14 +102,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, "NHWC data layout is not supported by the kernel directly");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC,
+ "NHWC data layout is not supported by the kernel directly");
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
}
return Status{};
}
@@ -210,11 +130,13 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
} // namespace
NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(1), _beta(0), _epsilon(1e-12)
+ : _input(nullptr), _output(nullptr), _gamma(1), _beta(0), _epsilon(1e-12)
{
}
-void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const InstanceNormalizationLayerKernelInfo &info)
+void NEInstanceNormalizationLayerKernel::configure(ITensor *input,
+ ITensor *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
@@ -227,28 +149,6 @@ void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *outp
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), _gamma, _beta, _epsilon));
- if(_input->info()->data_type() == DataType::F32)
- {
- _func = &instance_normalization_nchw<float>;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else if(_input->info()->data_type() == DataType::F16)
- {
- if(_use_mixed_precision)
- {
- _func = &instance_normalization_nchw<float16_t, float>;
- }
- else
- {
- _func = &instance_normalization_nchw<float16_t>;
- }
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else
- {
- ARM_COMPUTE_ERROR("Unsupported data type");
- }
-
// Configure kernel window
auto win_config = validate_and_configure_window(_input->info(), _output->info());
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
@@ -256,10 +156,13 @@ void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *outp
INEKernel::configure(std::get<1>(win_config));
}
-Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info.gamma, info.beta, info.epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+ input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
return Status{};
}
@@ -268,6 +171,10 @@ void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadI
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- (*_func)(_input, _output, _gamma, _beta, _epsilon, window);
+
+ const auto *uk = get_implementation(InstanceNormSelectorData{_input->info()->data_type()});
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_input, _output, _gamma, _beta, _epsilon, _use_mixed_precision, window);
}
} // namespace arm_compute
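
The dispatch table above replaces the in-file templates with a name/predicate/function-pointer triple, but the per-plane math is unchanged. As a reference, a scalar sketch (hypothetical helper) of what each neon_*_instancenorm micro-kernel computes per (channel, batch) plane, mirroring the removed instance_normalization_nchw:

#include <cmath>
#include <cstddef>

// Normalize one contiguous H*W plane with its own mean and variance, then
// scale by gamma and shift by beta; var is computed as E[x^2] - E[x]^2,
// exactly as the removed implementation did.
void instance_norm_plane(const float *in, float *out, size_t n,
                         float gamma, float beta, float epsilon)
{
    float sum = 0.f, sum_sq = 0.f;
    for (size_t i = 0; i < n; ++i)
    {
        sum    += in[i];
        sum_sq += in[i] * in[i];
    }
    const float mean   = sum / n;
    const float var    = sum_sq / n - mean * mean;
    const float multip = gamma / std::sqrt(var + epsilon);
    for (size_t i = 0; i < n; ++i)
    {
        out[i] = (in[i] - mean) * multip + beta;
    }
}
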
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
index 96c0119719..024ccd9ef2 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,7 +68,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -82,15 +83,15 @@ private:
* @param[in] beta The offset scalar value applied to the normalized tensor. Defaults to 0.0
* @param[in] epsilon Lower bound value for the normalization. Defaults to 1e-12
*/
- using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+ using NormalizationFunction =
+ void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
- NormalizationFunction *_func;
- ITensor *_input;
- ITensor *_output;
- float _gamma;
- float _beta;
- float _epsilon;
- bool _use_mixed_precision{ true };
+ ITensor *_input;
+ ITensor *_output;
+ float _gamma;
+ float _beta;
+ float _epsilon;
+ bool _use_mixed_precision{true};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */
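
Note that _use_mixed_precision survives the refactor and is now forwarded to the micro-kernel instead of selecting a template instantiation. What it controls, in essence, is the accumulator width: FP16 planes are summed in FP32 to avoid round-off over large H*W. A minimal illustration (Half is a stand-in for a half-precision type such as float16_t):

#include <cstddef>

// Mixed-precision accumulation: the accumulator stays FP32 even when the
// tensor data is FP16, which is what _use_mixed_precision selects.
template <typename Half>
float sum_mixed_precision(const Half *data, size_t n)
{
    float acc = 0.f; // FP32 accumulator avoids FP16 round-off over long sums
    for (size_t i = 0; i < n; ++i)
    {
        acc += static_cast<float>(data[i]); // widen each element first
    }
    return acc;
}
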
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index 9bda82d416..eea57a17d3 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,11 +30,14 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
+
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/cpu/kernels/l2normlayer/list.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cmath>
@@ -44,93 +47,68 @@ namespace
{
constexpr int max_input_tensor_dim = 3;
-template <typename T, int S>
-void l2_normalize_X(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
+struct L2NormalizeLayerSelectorData
{
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ DataType dt;
+ unsigned int actual_axis;
+ cpuinfo::CpuIsaInfo isa;
+};
- const int window_step_x = 16 / data_size_from_type(in->info()->data_type());
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
+using L2NormalizeLayerKernelSelectorPtr = std::add_pointer<bool(const L2NormalizeLayerSelectorData &data)>::type;
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+using L2NormalizeLayerPtr = std::add_pointer<void(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)>::type;
- Iterator input_it(in, win_collapsed);
- Iterator sum_it(sum, win_collapsed);
- Iterator output_it(out, win_collapsed);
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
+struct L2NormalizeLayerKernel
+{
+ const char *name;
+    const L2NormalizeLayerKernelSelectorPtr is_selected;
+ L2NormalizeLayerPtr ukernel;
+};
+
+static const L2NormalizeLayerKernel available_kernels[] = {
+ {"fp32_neon_l2normalize_x",
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x)},
+ {"fp32_neon_l2normalize_yz",
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz)},
{
- const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-
- const T sum_value = *reinterpret_cast<const T *>(sum_it.ptr());
- const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)));
- const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{});
-
- // Compute elements over vector steps
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- out_ptr[x] = in_ptr[x] * norm_value;
- }
+ "fp16_neon_l2normalize_x",
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_x),
},
- input_it, sum_it, output_it);
-}
+ {
+ "fp16_neon_l2normalize_yz",
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_yz),
+ },
+};
-template <typename T, int S>
-void l2_normalize_YZ(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel if one is found, otherwise nullptr
+ */
+const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorData &data)
{
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
- const int window_step_x = 16 / data_size_from_type(in->info()->data_type());
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Window window_sum(win);
- window_sum.set(axis, Window::Dimension(0, 0, 0));
-
- Iterator input_it(in, win);
- Iterator sum_it(sum, window_sum);
- Iterator output_it(out, win);
-
- const auto vec_eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
-
- execute_window_loop(win, [&](const Coordinates &)
+ for (const auto &uk : available_kernels)
{
- const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
- const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-
- // Compute elements over vector steps
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ if (uk.is_selected(data))
{
- const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps));
- wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+ return &uk;
}
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon)));
- out_ptr[x] = in_ptr[x] * norm_value;
- }
- },
- input_it, sum_it, output_it);
+ }
+ return nullptr;
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_UNUSED(epsilon);
@@ -139,14 +117,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions,
+ "Actual normalization axis greater than max number of dimensions");
// Reduce shape on axis
TensorShape sum_shape = input->tensor_shape();
sum_shape.set(actual_axis, 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -175,7 +154,8 @@ NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel()
{
}
-void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon)
+void NEL2NormalizeLayerKernel::configure(
+ const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
@@ -193,10 +173,12 @@ void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *su
INEKernel::configure(std::get<1>(win_config));
}
-Status NEL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status NEL2NormalizeLayerKernel::validate(
+ const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
return Status{};
}
@@ -207,23 +189,16 @@ void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- if(_actual_axis > 2)
+ if (_actual_axis > 2)
{
ARM_COMPUTE_ERROR("Unsupported normalization axis");
}
- switch(_input->info()->data_type())
- {
- case DataType::F32:
- (_actual_axis == Window::DimX) ? l2_normalize_X<float, 4>(_input, _sum, _output, _epsilon, window) : l2_normalize_YZ<float, 4>(_input, _sum, _output, _epsilon, window, _actual_axis);
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- (_actual_axis == Window::DimX) ? l2_normalize_X<float16_t, 8>(_input, _sum, _output, _epsilon, window) : l2_normalize_YZ<float16_t, 8>(_input, _sum, _output, _epsilon, window, _actual_axis);
- break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
+ const auto *uk = get_implementation(
+ L2NormalizeLayerSelectorData{_output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa()});
+ ARM_COMPUTE_ERROR_ON(uk == nullptr);
+ ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
+
+ uk->ukernel(_input, _sum, _output, _epsilon, window, _actual_axis);
}
} // namespace arm_compute
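
As with the instance-normalization change, the selector keys on data type, and here also on the normalization axis and FP16 ISA support. The per-element math is unchanged; a scalar sketch (hypothetical helper) of the X-axis case, where a single precomputed sum of squares covers the whole row:

#include <algorithm>
#include <cmath>
#include <cstddef>

// Scale every element of a row by 1 / sqrt(max(sum_sq, epsilon)), as the
// removed l2_normalize_X did with a broadcast vector multiply.
void l2_normalize_row(const float *in, float sum_sq, float *out, size_t n, float epsilon)
{
    const float norm = 1.f / std::sqrt(std::max(sum_sq, epsilon));
    for (size_t i = 0; i < n; ++i)
    {
        out[i] = in[i] * norm;
    }
}
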
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
index af3ad3403e..3524e66a21 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
@@ -74,7 +74,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NELogicalKernel.cpp b/src/core/NEON/kernels/NELogicalKernel.cpp
index e1c24da777..6be6284528 100644
--- a/src/core/NEON/kernels/NELogicalKernel.cpp
+++ b/src/core/NEON/kernels/NELogicalKernel.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+
#include "src/common/utils/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -41,17 +42,16 @@ static const uint8x8_t c0_x8 = vdup_n_u8(0);
static const uint8x16_t c0_x16 = vdupq_n_u8(0);
static const uint8x8_t c1_x8 = vdup_n_u8(1);
static const uint8x16_t c1_x16 = vdupq_n_u8(1);
-static const int step = 16;
-static const int half_step = step / 2;
+static const uint32_t step = 16;
+static const uint32_t half_step = step / 2;
-void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int len)
+void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uint32_t len)
{
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src0);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- ARM_COMPUTE_ASSERT(len >= 0);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
src0 += step;
@@ -59,7 +59,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, in
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
src0 += half_step;
@@ -67,7 +67,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, in
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src0) && (*src1);
++src0;
@@ -76,31 +76,30 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, in
}
}
-void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_t *dst, int len)
+void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_t *dst, uint32_t len)
{
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- ARM_COMPUTE_ASSERT(len >= 0);
const auto broadcast_val_clamped_s = std::min<uint8_t>(broadcast_val, 1);
const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
src += step;
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
src += half_step;
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src) && broadcast_val_clamped_s;
++src;
@@ -108,14 +107,13 @@ void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8
}
}
-void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int len)
+void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uint32_t len)
{
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src0);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- ARM_COMPUTE_ASSERT(len >= 0);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
src0 += step;
@@ -123,7 +121,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
src0 += half_step;
@@ -131,7 +129,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src0) || (*src1);
++src0;
@@ -140,31 +138,30 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int
}
}
-void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_t *dst, int len)
+void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_t *dst, uint32_t len)
{
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- ARM_COMPUTE_ASSERT(len >= 0);
const auto broadcast_val_clamped_s = std::min<uint8_t>(broadcast_val, 1);
const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
src += step;
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
src += half_step;
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src) || broadcast_val_clamped_s;
++src;
@@ -172,27 +169,26 @@ void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_
}
}
-void neon_logical_not(const uint8_t *src, uint8_t *dst, int len)
+void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len)
{
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- ARM_COMPUTE_ASSERT(len >= 0);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vbslq_u8(vceqq_u8(vld1q_u8(src), c0_x16), c1_x16, c0_x16));
src += step;
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vbsl_u8(vceq_u8(vld1_u8(src), c0_x8), c1_x8, c0_x8));
src += half_step;
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = !(*src);
++src;
@@ -202,18 +198,15 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, int len)
void run_unary(const Window &window, const ITensor *src, ITensor *dst)
{
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
- const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());
+ const auto len = window.x().end() - window.x().start();
Iterator in(src, win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- neon_logical_not(in.ptr(), out.ptr(), len);
- },
- in, out);
+ execute_window_loop(
+ win, [&](const Coordinates &) { neon_logical_not(in.ptr(), out.ptr(), len); }, in, out);
}
void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, ITensor *dst, LogicalOperation op)
@@ -221,16 +214,17 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1,
Window src0_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
Window src1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
- const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());
+ const auto len = window.x().end() - window.x().start();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
- using LogicalBroadcastUKernelPtr = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, int)>::type;
- LogicalBroadcastUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast;
+ using LogicalBroadcastUKernelPtr = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, uint32_t)>::type;
+ LogicalBroadcastUKernelPtr logical_func =
+ op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast;
const bool is_broadcast_input_1 = src1_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_1 ? src1_win : src0_win;
@@ -243,17 +237,18 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1,
Iterator non_broadcast_in(non_broadcast_tensor, non_broadcast_win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const uint8_t broadcast_value = *broadcast_in.ptr();
- logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len);
-
- },
- broadcast_in, non_broadcast_in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const uint8_t broadcast_value = *broadcast_in.ptr();
+ logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len);
+ },
+ broadcast_in, non_broadcast_in, out);
}
else
{
- using LogicalUKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, int)>::type;
+ using LogicalUKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, uint32_t)>::type;
LogicalUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or : &neon_logical_and;
src0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -262,11 +257,8 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1,
Iterator in0(src0, src0_win);
Iterator in1(src1, src1_win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- logical_func(in0.ptr(), in1.ptr(), out.ptr(), len);
- },
- in0, in1, out);
+ execute_window_loop(
+ win, [&](const Coordinates &) { logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); }, in0, in1, out);
}
}
} // namespace
@@ -275,7 +267,10 @@ const char *NELogicalKernel::name() const
return "NELogicalKernel";
}
-void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op)
+void NELogicalKernel::configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ ITensorInfo *output,
+ LogicalOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, output);
ARM_COMPUTE_ERROR_THROW_ON(validate(input1, input2, output, op));
@@ -284,7 +279,7 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in
Window win = calculate_max_window(*input1, Steps());
TensorShape out_shape = input1->tensor_shape();
- if(op != LogicalOperation::Not)
+ if (op != LogicalOperation::Not)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input2);
out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
@@ -297,13 +292,16 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in
set_data_type_if_unknown(*output, input1->data_type());
}
-Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op)
+Status NELogicalKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ LogicalOperation op)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON(op == LogicalOperation::Unknown);
TensorShape out_shape = input1->tensor_shape();
- if(op != LogicalOperation::Not)
+ if (op != LogicalOperation::Not)
{
out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
@@ -311,7 +309,7 @@ Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *i
}
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
@@ -331,7 +329,7 @@ void NELogicalKernel::run_op(ITensorPack &tensors, const Window &window, const T
const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_op == LogicalOperation::Not)
+ if (_op == LogicalOperation::Not)
{
run_unary(window, src0, dst);
}
diff --git a/src/core/NEON/kernels/NELogicalKernel.h b/src/core/NEON/kernels/NELogicalKernel.h
index caf69cf45d..477a59d826 100644
--- a/src/core/NEON/kernels/NELogicalKernel.h
+++ b/src/core/NEON/kernels/NELogicalKernel.h
@@ -58,10 +58,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op);
+ static Status
+ validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
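Note: the reflowed validate() signature above is semantically unchanged. For reference, a caller-side sketch of the validate-before-configure convention this kernel follows (tensor-info setup elided; a, b, out are hypothetical U8 ITensorInfo objects):

const Status s = NELogicalKernel::validate(&a, &b, &out, LogicalOperation::And);
// configure() runs the same checks but throws via ARM_COMPUTE_ERROR_THROW_ON;
// calling validate() first keeps the failure recoverable.
NELogicalKernel kernel;
kernel.configure(&a, &b, &out, LogicalOperation::And);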
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
deleted file mode 100644
index 761fa15238..0000000000
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-using namespace misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices);
-
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- PoolingType pool_type = pool_info.pool_type;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_size_x = pool_info.pool_size.width;
- const int pool_size_y = pool_info.pool_size.height;
- const Size2D pool_size(pool_size_x, pool_size_y);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-NEMaxUnpoolingLayerKernel::NEMaxUnpoolingLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr)
-{
-}
-
-void NEMaxUnpoolingLayerKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info()));
-
- _input = input;
- _output = output;
- _indices = indices;
-
- switch(input->info()->data_type())
- {
- case DataType::F32:
- _func = &NEMaxUnpoolingLayerKernel::unpooling2<float>;
- break;
- case DataType::QASYMM8:
- _func = &NEMaxUnpoolingLayerKernel::unpooling2<uint8_t>;
- break;
- case DataType::QASYMM8_SIGNED:
- _func = &NEMaxUnpoolingLayerKernel::unpooling2<int8_t>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = &NEMaxUnpoolingLayerKernel::unpooling2<float16_t>;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- break;
- }
- const TensorShape output_shape = compute_unpool_shape(*input->info(), pool_info);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
- auto window = calculate_max_window(*input->info(), Steps());
- INEKernel::configure(window);
-}
-template <typename T>
-void NEMaxUnpoolingLayerKernel::unpooling2(const Window &window)
-{
- Iterator input(_input, window);
- Iterator indices(_indices, window);
- auto out_ptr = reinterpret_cast<T *>(_output->buffer());
- const int out_stride_w = static_cast<int>(_output->info()->strides_in_bytes()[3]);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto vindices = reinterpret_cast<uint32_t *>(indices.ptr());
- auto vinput = reinterpret_cast<T *>(input.ptr());
- out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
- },
- input, indices);
-}
-
-Status NEMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
- return Status{};
-}
-
-void NEMaxUnpoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
- // Run function
- (this->*_func)(window);
-}
-} // namespace arm_compute
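Note: the heart of the deleted unpooling2<T> is a scatter — each input element is written to the output at the flat offset its U32 index records (plus the batch stride, folded in via id[3] above). A scalar sketch for one batch, assuming a pre-initialized output buffer:

#include <cstddef>
#include <cstdint>

// Scatter one batch of max-unpooling: out must already be zero-initialized.
template <typename T>
void max_unpool_scatter(const T *in, const uint32_t *indices, T *out, size_t n)
{
    for (size_t i = 0; i < n; ++i)
    {
        out[indices[i]] = in[i];
    }
}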
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
deleted file mode 100644
index ecc116e585..0000000000
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H
-#define ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the pooling layer kernel */
-class NEMaxUnpoolingLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEMaxUnpoolingLayerKernel";
- }
- /** Default constructor */
- NEMaxUnpoolingLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMaxUnpoolingLayerKernel(const NEMaxUnpoolingLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMaxUnpoolingLayerKernel &operator=(const NEMaxUnpoolingLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEMaxUnpoolingLayerKernel(NEMaxUnpoolingLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEMaxUnpoolingLayerKernel &operator=(NEMaxUnpoolingLayerKernel &&) = default;
- /** Default destructor */
- ~NEMaxUnpoolingLayerKernel() = default;
- /** Set the input and output tensors.
- *
- * @note Output shape must be equal to the shape of the original input to pool.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] indices Tensor containing the offset to store the input elements in the output tensor.
- * @ref NEPoolingLayer with indices should precede this function in order to
- * properly reconstruct the output tensor.
- * The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- */
- void configure(const ITensor *input, const ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info);
- /** Static function to check if given info will lead to a valid configuration of @ref NEMaxUnpoolingLayerKernel
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] output Destination tensor info. Data types supported: Same as @p input.
- * @param[in] indices Tensor info of the indices of the maximal values. Data type supported: U32.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- */
- template <typename T>
- void unpooling2(const Window &window_input);
-
- using UnpoolingFunction = void (NEMaxUnpoolingLayerKernel::*)(const Window &window);
-
-private:
- UnpoolingFunction _func;
- const ITensor *_input;
- ITensor *_output;
- const ITensor *_indices;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H */
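Note: the removed header documented the intended call sequence — run NEPoolingLayer with indices first, then feed those indices here. A hypothetical caller sketch under that contract (tensor allocation elided; input, indices, output, pool_info are placeholders, with MAX pooling and a 2x2 pool size as the validation above required):

NEMaxUnpoolingLayerKernel unpool;
ARM_COMPUTE_ERROR_THROW_ON(
    NEMaxUnpoolingLayerKernel::validate(input.info(), indices.info(), output.info(), pool_info));
unpool.configure(&input, &indices, &output, pool_info); // indices produced by the pooling pass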
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index d1c7d4eb91..451031d696 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,26 +28,74 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/meanstddevnorm/list.h"
namespace arm_compute
{
namespace
{
+struct MeanStdDevNormSelectorData
+{
+ DataType dt;
+};
+
+using MeanStdDevNormSelectorPtr = std::add_pointer<bool(const MeanStdDevNormSelectorData &data)>::type;
+using MeanStdDevNormUKernelPtr =
+ std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type;
+
+struct MeanStdDevNormKernel
+{
+ const char *name;
+ const MeanStdDevNormSelectorPtr is_selected;
+ MeanStdDevNormUKernelPtr ukernel;
+};
+
+static const std::vector<MeanStdDevNormKernel> available_kernels = {
+ {"fp32_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)},
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {"fp16_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)},
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {"qasymm8_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)},
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const MeanStdDevNormKernel *get_implementation(const MeanStdDevNormSelectorData &data)
+{
+ for (const auto &uk : available_kernels)
+ {
+ if (uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Input tensor cannot have more than 2 dimensions");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -57,7 +105,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
{
- if(output != nullptr)
+ if (output != nullptr)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output auto initialization if not yet initialized
@@ -72,80 +120,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-template <typename ScalarType, int size>
-void NEMeanStdDevNormalizationKernel::mean_stddev_normalization(const Window &window)
-{
- using ExactTagType = typename wrapper::traits::neon_vector<ScalarType, size>::tag_type;
-
- // Set build options
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = size;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Iterator input(_input, win);
- Iterator output(_output, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
- auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
- auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto data = wrapper::vloadq(in_ptr + x);
- sum_vec = wrapper::vadd(sum_vec, data);
- sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
- }
-
- auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
- auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
- for(int i = 0; i < size / 4; ++i)
- {
- sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res);
- sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
- }
-
- auto sum = wrapper::vgetlane(sum_carry_res, 0);
- auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- ScalarType data = *(in_ptr + x);
- sum += data;
- sum_sq += data * data;
- }
-
- ScalarType mean = sum / _input->info()->dimension(0);
- ScalarType var = (sum_sq / _input->info()->dimension(0)) - (mean * mean);
- ScalarType stddev_inv = 1.f / sqrt(var + _epsilon);
-
- auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
- for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto data = wrapper::vloadq(in_ptr + x);
- auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
- // Store results
- wrapper::vstore(out_ptr + x, res);
- }
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
- }
- },
- input, output);
-}
-
-NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel()
- : _input(nullptr), _output(nullptr), _epsilon(1e-8f), _func(nullptr)
+NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() : _input(nullptr), _output(nullptr), _epsilon(1e-8f)
{
}
@@ -153,7 +128,8 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output,
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+ ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(
+ input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
_input = input;
_output = (output == nullptr) ? input : output;
@@ -163,29 +139,14 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output,
auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICPPKernel::configure(win_config.second);
-
- // Configure function to run based on different data types
- const DataType data_type = input->info()->data_type();
- switch(data_type)
- {
- case DataType::F32:
- _func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float, 4>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float16_t, 8>;
- break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- default:
- ARM_COMPUTE_ERROR("Not Supported");
- break;
- }
}
Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr)
+ .first);
return Status{};
}
@@ -194,8 +155,10 @@ void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (this->*_func)(window);
+ const auto *uk = get_implementation(MeanStdDevNormSelectorData{_output->info()->data_type()});
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_input, _output, _epsilon, window);
}
} // namespace arm_compute
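Note: the deleted member template and the micro-kernels that replace it compute the same per-row transform. A scalar reference of that math, under a hypothetical helper name (n is the row length, i.e. dimension 0):

#include <cmath>
#include <cstddef>

// out[i] = (in[i] - mean) / sqrt(var + epsilon), mean/var taken over the row.
void mean_stddev_norm_ref(const float *in, float *out, size_t n, float epsilon)
{
    float sum = 0.f, sum_sq = 0.f;
    for (size_t i = 0; i < n; ++i)
    {
        sum += in[i];
        sum_sq += in[i] * in[i];
    }
    const float mean       = sum / n;
    const float var        = sum_sq / n - mean * mean;
    const float stddev_inv = 1.f / std::sqrt(var + epsilon);
    for (size_t i = 0; i < n; ++i)
    {
        out[i] = (in[i] - mean) * stddev_inv;
    }
}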
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
index 59d073ada5..844f0efdc2 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,8 +91,6 @@ private:
float _epsilon;
using MeanStdDevNormFunction = void (NEMeanStdDevNormalizationKernel::*)(const Window &window);
-
- MeanStdDevNormFunction _func;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
deleted file mode 100644
index 5ea8947fa0..0000000000
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEMinMaxLayerKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <climits>
-#include <cstddef>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
-
- if(output->tensor_shape().total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- TensorShape output_shape = compute_min_max_shape(input);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- TensorShape output_shape = compute_min_max_shape(input);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, output_shape, 1, input->data_type());
-
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, 2);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_tuple(err, win);
-}
-} // namespace
-
-NEMinMaxLayerKernel::NEMinMaxLayerKernel()
- : _input(nullptr), _output(nullptr), _mtx()
-{
-}
-
-void NEMinMaxLayerKernel::configure(const ITensor *input, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- auto win_config = validate_and_configure_window(input->info(), output->info());
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- INEKernel::configure(std::get<1>(win_config));
-}
-
-Status NEMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
-
- return Status{};
-}
-
-void NEMinMaxLayerKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const int x_start = window.x().start();
- const int x_end = window.x().end();
-
- Window window_output;
- window_output.use_tensor_dimensions(_output->info()->tensor_shape());
- window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Handle X dimension manually to split into two loops
- // First one will use vector operations, second one processes the left over pixels
- Window window_input(window);
- window_input.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_input.set(3, Window::Dimension(0, 1, 1));
-
- Iterator input(_input, window_input);
- Iterator output(_output, window_output);
-
- execute_window_loop(window_output, [&](const Coordinates & id_batch)
- {
- float32x2_t carry_min = vdup_n_f32(std::numeric_limits<float>::max());
- float32x2_t carry_max = vdup_n_f32(std::numeric_limits<float>::lowest());
-
- float carry_min_scalar = std::numeric_limits<float>::max();
- float carry_max_scalar = std::numeric_limits<float>::lowest();
-
- execute_window_loop(window_input, [&](const Coordinates &)
- {
- int x = x_start;
- const auto in_ptr = reinterpret_cast<const float *>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
-
- // Vector loop
- for(; x <= x_end - 8; x += 8)
- {
- const float32x4x2_t pixels = vld2q_f32(in_ptr + x);
- const float32x4_t tmp_min1 = vminq_f32(pixels.val[0], pixels.val[1]);
- const float32x4_t tmp_max1 = vmaxq_f32(pixels.val[0], pixels.val[1]);
- const float32x2_t tmp_min2 = vmin_f32(vget_high_f32(tmp_min1), vget_low_f32(tmp_min1));
- const float32x2_t tmp_max2 = vmax_f32(vget_high_f32(tmp_max1), vget_low_f32(tmp_max1));
- carry_min = vmin_f32(tmp_min2, carry_min);
- carry_max = vmax_f32(tmp_max2, carry_max);
- }
-
- // Process leftover pixels
- for(; x < x_end; ++x)
- {
- const float pixel = in_ptr[x];
- carry_min_scalar = std::min(pixel, carry_min_scalar);
- carry_max_scalar = std::max(pixel, carry_max_scalar);
- }
- },
- input);
-
- // Reduce result
- carry_min = vpmin_f32(carry_min, carry_min);
- carry_max = vpmax_f32(carry_max, carry_max);
- carry_min = vpmin_f32(carry_min, carry_min);
- carry_max = vpmax_f32(carry_max, carry_max);
-
- // Extract max/min values
- const float min_i = std::min(vget_lane_f32(carry_min, 0), carry_min_scalar);
- const float max_i = std::max(vget_lane_f32(carry_max, 0), carry_max_scalar);
-
- auto out_ptr = reinterpret_cast<float *>(output.ptr());
-
- // Perform reduction of local min/max values
- update_min_max(out_ptr, min_i, max_i);
- },
- output);
-}
-
-void NEMinMaxLayerKernel::reset()
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
- float32x2_t reset_values = vdup_n_f32(0.0f);
- reset_values = vset_lane_f32(std::numeric_limits<float>::max(), reset_values, 0);
- reset_values = vset_lane_f32(std::numeric_limits<float>::lowest(), reset_values, 1);
-
- Window window_output;
- window_output.use_tensor_dimensions(_output->info()->tensor_shape());
- window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator output(_output, window_output);
-
- execute_window_loop(window_output, [&](const Coordinates &)
- {
- vst1_f32(reinterpret_cast<float *>(output.ptr()), reset_values);
- },
- output);
-}
-
-void NEMinMaxLayerKernel::update_min_max(float *out_ptr, float min, float max)
-{
- arm_compute::lock_guard<Mutex> lock(_mtx);
-
- const float32x2_t old_min = vld1_dup_f32(out_ptr);
- const float32x2_t old_max = vld1_dup_f32(out_ptr + 1);
- const float32x2_t new_min = vmin_f32(vdup_n_f32(min), old_min);
- const float32x2_t new_max = vmax_f32(vdup_n_f32(max), old_max);
-
- vst1_f32(out_ptr, vzip_f32(new_min, new_max).val[0]);
-}
-} // namespace arm_compute
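Note: the deleted run() relied on NEON pairwise folds to collapse the per-lane carries. The idiom, isolated — vpmin_f32 pairs adjacent lanes, so applying it to (v, v) leaves the reduction in lane 0:

#include <arm_neon.h>

// Reduce a two-lane vector to its minimum: lane0 = min(v[0], v[1]).
inline float reduce_min_f32x2(float32x2_t v)
{
    v = vpmin_f32(v, v);
    return vget_lane_f32(v, 0);
}

// The max side is symmetric with vpmax_f32.
inline float reduce_max_f32x2(float32x2_t v)
{
    v = vpmax_f32(v, v);
    return vget_lane_f32(v, 0);
}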
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.h b/src/core/NEON/kernels/NEMinMaxLayerKernel.h
deleted file mode 100644
index b4852ad9f2..0000000000
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEMINMAXLAYERKERNEL_H
-#define ARM_COMPUTE_NEMINMAXLAYERKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform min max search on a 3D tensor. */
-class NEMinMaxLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEMinMaxLayerKernel";
- }
- /** Default constructor */
- NEMinMaxLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMinMaxLayerKernel(const NEMinMaxLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMinMaxLayerKernel &operator=(const NEMinMaxLayerKernel &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEMinMaxLayerKernel(NEMinMaxLayerKernel &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEMinMaxLayerKernel &operator=(NEMinMaxLayerKernel &&) = delete;
- /** Default destructor */
- ~NEMinMaxLayerKernel() = default;
-
- /** Initialise the kernel's input and outputs.
- *
- * @note output[0] = minimum
- * @note output[1] = maximum
- *
- * @param[in] input Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data type supported: F32.
- * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum value for each 3D input tensor.
- * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLMinMaxLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: F32.
- * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
- * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
- /** Resets global minimum and maximum. */
- void reset();
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- void update_min_max(float *out_ptr, float min, float max);
- const ITensor *_input;
- ITensor *_output;
- arm_compute::Mutex _mtx;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEMINMAXLAYERKERNEL_H */
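Note: per the removed header's contract, the output packs the result as output[0] = minimum and output[1] = maximum for each batch, and reset() must seed those slots before a run. A hypothetical caller sketch (tensor setup elided):

NEMinMaxLayerKernel minmax;
minmax.configure(&input, &output); // output shape: [2, batches, ...], F32
minmax.reset();                    // seeds each pair with {float max, float lowest}
// After the scheduler runs the kernel: output[0] = minimum, output[1] = maximum per batch.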
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 49a045382d..8399c6c49d 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,19 +29,26 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/NormalizationHelpers.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/norm_layer/generic/neon/impl.h"
+#include "src/cpu/kernels/norm_layer/generic/neon/list.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
@@ -52,7 +59,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squ
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -69,7 +76,10 @@ NENormalizationLayerKernel::NENormalizationLayerKernel()
{
}
-void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
+void NENormalizationLayerKernel::configure(const ITensor *input,
+ const ITensor *input_squared,
+ ITensor *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output);
// Output tensor auto initialization if not yet initialized
@@ -84,79 +94,78 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
_input_squared = input_squared;
_output = output;
_norm_info = norm_info;
-
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
case DataType::F32:
{
- switch(norm_idx)
+ switch (norm_idx)
{
case 0:
{
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
- _func = &NENormalizationLayerKernel::normalize_float<float, 4, 0, true>;
+ _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_0_2D);
}
else
{
- _func = &NENormalizationLayerKernel::normalize_float<float, 4, 0, false>;
+ _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_0);
}
break;
}
case 1:
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
- _func = &NENormalizationLayerKernel::normalize_float<float, 4, 1, true>;
+ _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_1_2D);
}
else
{
- _func = &NENormalizationLayerKernel::normalize_float<float, 4, 1, false>;
+ _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_1);
}
break;
case 2:
- _func = &NENormalizationLayerKernel::normalize_float<float, 4, 2, false>;
+ _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_2);
break;
default:
break;
}
break;
}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef ARM_COMPUTE_ENABLE_FP16
case DataType::F16:
{
- switch(norm_idx)
+ switch (norm_idx)
{
case 0:
{
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
- _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 0, true>;
+ _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_0_2D);
}
else
{
- _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 0, false>;
+ _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_0);
}
break;
}
case 1:
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
- _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 1, true>;
+ _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_1_2D);
}
else
{
- _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 1, false>;
+ _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_1);
}
break;
case 2:
- _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 2, false>;
+ _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_2);
break;
default:
break;
}
break;
}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("NOT SUPPORTED!");
}
@@ -166,115 +175,10 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
INEKernel::configure(win);
}
-template <typename T, unsigned int S, unsigned int dim, bool do_2D_norm>
-void NENormalizationLayerKernel::normalize_float(const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const int window_step_x = S;
-
- Iterator input(_input, win);
- Iterator input_squared(_input_squared, win);
- Iterator output(_output, win);
-
- const int dim_y = _input->info()->data_layout() == DataLayout::NCHW ? 1 : 2;
- const int radius = _norm_info.norm_size() / 2;
- const int input_squared_stride_x = _input_squared->info()->strides_in_bytes()[0];
- const int input_squared_stride_slice = _input_squared->info()->strides_in_bytes()[dim];
- const int input_squared_stride_row = _input_squared->info()->strides_in_bytes()[dim_y];
-
- const int max_right = _input->info()->dimension(dim) - 1;
- const int max_bottom = _input->info()->dimension(dim_y) - 1;
-
- const auto coeff_vec = wrapper::vdup_n(static_cast<T>(_norm_info.scale_coeff()), ExactTagType{});
- const auto beta_vec = wrapper::vdup_n(static_cast<T>(_norm_info.beta()), ExactTagType{});
- const auto kappa_vec = wrapper::vdup_n(static_cast<T>(_norm_info.kappa()), ExactTagType{});
-
- auto sequential_normalization = [&](const int x, const Coordinates & id, const int current_row, const int first_row, const int last_row, const T * input_ptr, const uint8_t *input_squared_start_ptr,
- T * output_ptr)
- {
- const int current_slice = dim == 0 ? x : id[dim];
- const int first_slice = std::max(current_slice - radius, 0);
- const int last_slice = std::min(current_slice + radius, max_right);
-
- const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x;
- // Accumulate 2D In-Map values
- auto accu = static_cast<T>(0.f);
- for(int j = first_row; j <= last_row; ++j)
- {
- // Compute row displacement
- const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
- for(int i = first_slice; i <= last_slice; ++i)
- {
- accu += *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice);
- }
- }
-
- // Normalize
- const auto normalized = std::pow(accu * static_cast<T>(_norm_info.scale_coeff()) + static_cast<T>(_norm_info.kappa()), _norm_info.beta());
- const auto normalized_pixel = (*(input_ptr + x)) / normalized;
- *(output_ptr + x) = normalized_pixel;
- };
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
-
- int x = window_start_x;
- // Compute serially starting elements for the case x dimension is width
- for(; x < radius && x < window_end_x && dim == 0; ++x)
- {
- sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr);
- }
-
- // Compute vectorized
- for(; x <= window_end_x - window_step_x - radius; x += window_step_x)
- {
- const int current_slice = dim == 0 ? x : id[dim];
- const int first_slice = std::max(current_slice - radius, 0);
- const int last_slice = std::min(current_slice + radius, max_right);
-
- const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x;
- // Accumulate 2D In-Map values
- auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- for(int j = first_row; j <= last_row; ++j)
- {
- // Compute row displacement
- const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
- for(int i = first_slice; i <= last_slice; ++i)
- {
- accu = wrapper::vadd(accu, wrapper::vloadq(reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice)));
- }
- }
-
- // Normalize
- const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
- const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized));
- wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr);
- }
- },
- input, input_squared, output);
-}
-
-Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info)
+Status NENormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info));
@@ -289,6 +193,6 @@ void NENormalizationLayerKernel::run(const Window &window, const ThreadInfo &inf
ARM_COMPUTE_ERROR_ON(_func == nullptr);
// Run function
- (this->*_func)(window);
+ (*_func)(window, _input, _input_squared, _output, _norm_info);
}
} // namespace arm_compute
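Note: the specialised functions the switch above now registers implement the same normalization the deleted normalize_float<> spelled out — accumulate squared neighbours into accu, then scale. The per-element math, as a scalar sketch:

#include <cmath>

// normalized_pixel = in / (kappa + coeff * accu)^beta
inline float normalize_one(float in, float accu, float coeff, float kappa, float beta)
{
    return in / std::pow(accu * coeff + kappa, beta);
}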
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h
index 53a06b9ed9..5ba4c3edca 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NENORMALIZATIONLAYERKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NENORMALIZATIONLAYERKERNEL_H
#include "src/core/NEON/INEKernel.h"
@@ -60,7 +60,8 @@ public:
* @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
- void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
+ void
+ configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
/** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -72,30 +73,21 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ NormalizationLayerInfo norm_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
- /** Function to perform normalization depending on the given template
- * dimension. The second template parameter specifies whether the
- * normalization has to be 1D or 2D.
- *
- * @note Only supported normalizations are:
- * - 1D over X or Z
- * - 2D over X and Y
- *
- * @param[in] window Region on which to execute the kernel.
- */
- template <typename T, unsigned int S, unsigned int dim, bool do_2D_norm>
- void normalize_float(const Window &window);
-
/** Common signature for all the specialised normalization functions
*
* @param[in] window Region on which to execute the kernel.
*/
- using NormalizationFunction = void (NENormalizationLayerKernel::*)(const Window &window);
+ using NormalizationFunction = void (*)(
+ const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo);
private:
NormalizationFunction _func;
@@ -105,4 +97,4 @@ private:
NormalizationLayerInfo _norm_info;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NENORMALIZATIONLAYERKERNEL_H
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp
index 3e2c57a18c..c9bcbc9127 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,26 +28,31 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &paddings, const PaddingMode mode)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &paddings,
+ const PaddingMode mode)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(mode != PaddingMode::CONSTANT, "Only constant padding mode is supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(paddings.size() > 4, "Padding list bigger than 4 dimensions");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings);
- const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape);
+ const TensorShape expected_output_shape =
+ arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings);
+ const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -58,30 +63,34 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
template <typename T>
void NEPadLayerKernel::run_pad_constant(const Window &window)
{
- Window output_window{ window };
+ Window output_window{window};
output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
const size_t element_size = _input->info()->element_size();
Iterator output_it(_output, output_window);
- execute_window_loop(output_window, [&](const Coordinates & id)
- {
- Coordinates idin{ id };
- for(size_t dim = _padding.size() - 1; dim > 0; --dim)
+ execute_window_loop(
+ output_window,
+ [&](const Coordinates &id)
{
- idin[dim] -= _padding[dim].first;
- if(idin[dim] < 0 || static_cast<int>(_input->info()->dimension(dim)) - 1 < idin[dim])
+ Coordinates idin{id};
+ for (size_t dim = _padding.size() - 1; dim > 0; --dim)
{
- std::fill_n(reinterpret_cast<T *>(output_it.ptr()), _output->info()->dimension(0), _constant_value.get<T>());
- return;
+ idin[dim] -= _padding[dim].first;
+ if (idin[dim] < 0 || static_cast<int>(_input->info()->dimension(dim)) - 1 < idin[dim])
+ {
+ std::fill_n(reinterpret_cast<T *>(output_it.ptr()), _output->info()->dimension(0),
+ _constant_value.get<T>());
+ return;
+ }
}
- }
- T *input_it_ptr = reinterpret_cast<T *>(_input->ptr_to_element(idin));
- T *output_it_ptr = reinterpret_cast<T *>(output_it.ptr());
- std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get<T>());
- memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size);
- std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second, _constant_value.get<T>());
- },
- output_it);
+ T *input_it_ptr = reinterpret_cast<T *>(_input->ptr_to_element(idin));
+ T *output_it_ptr = reinterpret_cast<T *>(output_it.ptr());
+ std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get<T>());
+ memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size);
+ std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second,
+ _constant_value.get<T>());
+ },
+ output_it);
}
void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window)
@@ -92,7 +101,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
const size_t end_plane = window.z().end();
size_t start_plane_input = start_plane;
- if(_padding.size() > 2)
+ if (_padding.size() > 2)
{
start_plane_input = (start_plane < _padding[2].first) ? 0 : start_plane - _padding[2].first;
}
@@ -105,18 +114,20 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
const size_t jump_to_next_row_input = _input->info()->dimension(0);
const size_t jump_to_next_row_output = _padding[0].first + _padding[0].second;
- uint8_t *output_row_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size;
- const uint8_t *input_it_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size;
- const auto pad_value = _constant_value.get<uint8_t>();
+ uint8_t *output_row_ptr =
+ _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size;
+ const uint8_t *input_it_ptr =
+ _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size;
+ const auto pad_value = _constant_value.get<uint8_t>();
- for(size_t z_i = start_plane; z_i < end_plane; ++z_i)
+ for (size_t z_i = start_plane; z_i < end_plane; ++z_i)
{
- if(_padding.size() > 2 && z_i < _padding[2].first)
+ if (_padding.size() > 2 && z_i < _padding[2].first)
{
memset(output_row_ptr, pad_value, output_plane_size);
output_row_ptr += output_plane_size;
}
- else if(_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1))
+ else if (_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1))
{
memset(output_row_ptr, pad_value, output_plane_size);
output_row_ptr += output_plane_size;
@@ -127,7 +138,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
output_row_ptr += pad_y_elems_top;
size_t y_i = _input->info()->dimension(1);
// Basic loop unrolling
- for(; y_i > 3; y_i -= 4)
+ for (; y_i > 3; y_i -= 4)
{
memset(output_row_ptr, pad_value, _padding[0].first);
output_row_ptr += _padding[0].first;
@@ -160,7 +171,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
memset(output_row_ptr, pad_value, _padding[0].second);
output_row_ptr += _padding[0].second;
}
- for(; y_i > 0; --y_i)
+ for (; y_i > 0; --y_i)
{
memset(output_row_ptr, pad_value, _padding[0].first);
output_row_ptr += _padding[0].first;
@@ -183,12 +194,17 @@ NEPadLayerKernel::NEPadLayerKernel()
{
}
-void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+void NEPadLayerKernel::configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Auto-init
- const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding);
- const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape);
+ const TensorShape expected_output_shape =
+ arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding);
+ const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape);
auto_init_if_empty(*output->info(), expected_output_info);
// Perform validation step
@@ -200,14 +216,14 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL
_constant_value = constant_value;
_mode = mode;
- if(_mode == PaddingMode::CONSTANT)
+ if (_mode == PaddingMode::CONSTANT)
{
- switch(_input->info()->element_size())
+ switch (_input->info()->element_size())
{
case 1:
- if(_input->info()->num_dimensions() == 3 && // Is 3D
- padding.size() <= 3 && // Has 3D padding
- !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding
+ if (_input->info()->num_dimensions() == 3 && // Is 3D
+ padding.size() <= 3 && // Has 3D padding
+ !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding
{
_func = &NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad;
}
@@ -240,7 +256,11 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL
ICPPKernel::configure(win);
}
-Status NEPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+Status NEPadLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_UNUSED(constant_value);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, mode));
@@ -253,9 +273,18 @@ void NEPadLayerKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- if(_func != nullptr)
+ if (_func != nullptr)
{
(this->*_func)(window);
}
}
+
+size_t NEPadLayerKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+ ARM_COMPUTE_UNUSED(platform);
+
+ return ICPPKernel::default_mws;
+}
+
} // namespace arm_compute
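
The new get_mws() hook returns ICPPKernel::default_mws, so scheduler behaviour is unchanged for now; the override only gives the pad kernel a place to tune its minimum per-thread workload later. Below is a standalone sketch of what a tuned heuristic could look like. The stand-in default value, the thread threshold and the 1536-element figure are all invented for illustration and are not taken from the library.

// Standalone sketch of a tuned minimum-workload-size heuristic in the
// spirit of get_mws(). The real override also receives a CPUInfo, which
// this sketch drops; all numbers here are invented.
#include <cstddef>
#include <cstdio>

constexpr std::size_t default_mws = 1; // stand-in for ICPPKernel::default_mws

std::size_t get_mws(std::size_t thread_count)
{
    // Demand a larger per-thread workload at high thread counts so a
    // scheduler does not split the execution window into tiny slices.
    return (thread_count > 8) ? std::size_t{1536} : default_mws;
}

int main()
{
    std::printf("mws for 4 threads: %zu, for 16 threads: %zu\n", get_mws(4), get_mws(16));
    return 0;
}
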
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.h b/src/core/NEON/kernels/NEPadLayerKernel.h
index 00cda7dc22..d432887d2c 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.h
+++ b/src/core/NEON/kernels/NEPadLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#ifndef ARM_COMPUTE_NEPADLAYERKERNEL_H
#define ARM_COMPUTE_NEPADLAYERKERNEL_H
+#include "arm_compute/core/PixelValue.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -61,7 +63,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT.
* Only CONSTANT padding mode is currently supported
*/
- void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer.
*
* @param[in] input Source tensor info. Data types supported: All.
@@ -74,11 +80,24 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+     * @return[out] small_network_mws Minimum workload size for requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
private:
/** Template function to run the padding function with constant padding
*
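
The constant-pad fast path in the .cpp hunks above interleaves border memsets with the payload rows and unrolls the row loop four ways before a scalar tail (for (; y_i > 3; y_i -= 4) followed by for (; y_i > 0; --y_i)). A standalone sketch of that unroll-plus-remainder shape follows; the memcpy payload step and every size are invented, since the hunks only show the border memsets.

// Sketch of the 4x unroll-plus-remainder row-padding pattern used by
// run_pad_constant_uint8_3Dinput_3Dpad; the copy step and all sizes
// below are invented for illustration.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

void pad_rows(uint8_t *dst, const uint8_t *src, size_t rows, size_t row_bytes,
              size_t pad_left, size_t pad_right, uint8_t pad_value)
{
    const auto one_row = [&]()
    {
        std::memset(dst, pad_value, pad_left); // left border
        dst += pad_left;
        std::memcpy(dst, src, row_bytes); // payload row
        dst += row_bytes;
        src += row_bytes;
        std::memset(dst, pad_value, pad_right); // right border
        dst += pad_right;
    };

    size_t y = rows;
    for (; y > 3; y -= 4) // unrolled body: four rows per iteration
    {
        one_row();
        one_row();
        one_row();
        one_row();
    }
    for (; y > 0; --y) // scalar tail, mirroring the kernel's second loop
    {
        one_row();
    }
}

int main()
{
    const uint8_t src[12]    = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; // 6 rows x 2 bytes
    uint8_t       dst[6 * 4] = {};
    pad_rows(dst, src, 6, 2, 1, 1, 0xFF);
    for (uint8_t b : dst)
    {
        std::printf("%02x ", b); // ff 01 02 ff ff 03 04 ff ...
    }
    std::printf("\n");
    return 0;
}
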
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
index 3d89933377..15e933e66e 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -36,7 +37,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status validate_arguments(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
@@ -45,10 +49,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
// Check variances
const int var_size = info.variances().size();
- if(var_size > 1)
+ if (var_size > 1)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
- for(int i = 0; i < var_size; ++i)
+ for (int i = 0; i < var_size; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0");
}
@@ -56,17 +60,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0");
- if(!info.max_sizes().empty())
+ if (!info.max_sizes().empty())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(),
+ "Max and min sizes dimensions should match");
}
- for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+ for (unsigned int i = 0; i < info.max_sizes().size(); ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i],
+ "Max size should be greater than min size");
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
@@ -76,21 +82,26 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
}
} // namespace
-NEPriorBoxLayerKernel::NEPriorBoxLayerKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
+NEPriorBoxLayerKernel::NEPriorBoxLayerKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
{
}
-void NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width,
- const int height)
+void NEPriorBoxLayerKernel::store_coordinates(float *out,
+ const int offset,
+ const float center_x,
+ const float center_y,
+ const float box_width,
+ const float box_height,
+ const int width,
+ const int height)
{
float xmin = (center_x - box_width / 2.f) / width;
float ymin = (center_y - box_height / 2.f) / height;
float xmax = (center_x + box_width / 2.f) / width;
float ymax = (center_y + box_height / 2.f) / height;
- float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
- if(_info.clip())
+ float32x4_t vec_elements = {xmin, ymin, xmax, ymax};
+ if (_info.clip())
{
static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
@@ -112,7 +123,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
int img_width = _info.img_size().x;
int img_height = _info.img_size().y;
- if(img_width == 0 || img_height == 0)
+ if (img_width == 0 || img_height == 0)
{
img_width = _input2->info()->dimension(width_idx);
img_height = _input2->info()->dimension(height_idx);
@@ -120,7 +131,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
float step_x = _info.steps()[0];
float step_y = _info.steps()[1];
- if(step_x == 0.f || step_y == 0.f)
+ if (step_x == 0.f || step_y == 0.f)
{
step_x = static_cast<float>(img_width) / layer_width;
step_y = static_cast<float>(img_height) / layer_height;
@@ -130,74 +141,80 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
Iterator output(_output, slice);
- execute_window_loop(slice, [&](const Coordinates & id)
- {
- float center_x = 0;
- float center_y = 0;
- int idx = id.x() / (4 * num_priors);
- center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
- center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
-
- float box_width;
- float box_height;
- int offset = 0;
-
- auto out = reinterpret_cast<float *>(output.ptr());
- for(unsigned int i = 0; i < _info.min_sizes().size(); ++i)
+ execute_window_loop(
+ slice,
+ [&](const Coordinates &id)
{
- const float min_size = _info.min_sizes().at(i);
- box_width = min_size;
- box_height = min_size;
- store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
- offset += 4;
-
- if(!_info.max_sizes().empty())
+ float center_x = 0;
+ float center_y = 0;
+ int idx = id.x() / (4 * num_priors);
+ center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
+ center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
+
+ float box_width;
+ float box_height;
+ int offset = 0;
+
+ auto out = reinterpret_cast<float *>(output.ptr());
+ for (unsigned int i = 0; i < _info.min_sizes().size(); ++i)
{
- const float max_size = _info.max_sizes().at(i);
- box_width = std::sqrt(min_size * max_size);
- box_height = box_width;
-
+ const float min_size = _info.min_sizes().at(i);
+ box_width = min_size;
+ box_height = min_size;
store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
offset += 4;
- }
- // rest of priors
- for(auto ar : _info.aspect_ratios())
- {
- if(fabs(ar - 1.) < 1e-6)
+ if (!_info.max_sizes().empty())
{
- continue;
+ const float max_size = _info.max_sizes().at(i);
+ box_width = std::sqrt(min_size * max_size);
+ box_height = box_width;
+
+ store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ offset += 4;
}
- box_width = min_size * sqrt(ar);
- box_height = min_size / sqrt(ar);
+ // rest of priors
+ for (auto ar : _info.aspect_ratios())
+ {
+ if (fabs(ar - 1.) < 1e-6)
+ {
+ continue;
+ }
- store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
- offset += 4;
+ box_width = min_size * sqrt(ar);
+ box_height = min_size / sqrt(ar);
+
+ store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ offset += 4;
+ }
}
- }
- // set the variance
- out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
- float32x4_t var;
- if(_info.variances().size() == 1)
- {
- var = vdupq_n_f32(_info.variances().at(0));
- }
- else
- {
- const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
- var = vars;
- }
- for(int i = 0; i < num_priors; ++i)
- {
- vst1q_f32(out + 4 * i, var);
- }
- },
- output);
+ // set the variance
+ out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
+ float32x4_t var;
+ if (_info.variances().size() == 1)
+ {
+ var = vdupq_n_f32(_info.variances().at(0));
+ }
+ else
+ {
+ const float32x4_t vars = {_info.variances().at(0), _info.variances().at(1), _info.variances().at(2),
+ _info.variances().at(3)};
+ var = vars;
+ }
+ for (int i = 0; i < num_priors; ++i)
+ {
+ vst1q_f32(out + 4 * i, var);
+ }
+ },
+ output);
}
-void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
+void NEPriorBoxLayerKernel::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
@@ -215,7 +232,10 @@ void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *inpu
INEKernel::configure(win);
}
-Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
@@ -231,4 +251,4 @@ void NEPriorBoxLayerKernel::run(const Window &window, const ThreadInfo &info)
// Run function
calculate_prior_boxes(window);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
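
The reflowed lambda computes one box centre per output cell from the flattened index, then emits a box per min size, a sqrt(min_size * max_size) box when max sizes are present, and a box per non-unit aspect ratio. A standalone numeric sketch of that arithmetic, using an invented 8x8 feature map over a 64x64 image:

// Numeric sketch of the prior-box arithmetic in the loop above; the
// 8x8 layer, 64x64 image, sizes and aspect ratio are all invented.
#include <cmath>
#include <cstdio>

int main()
{
    const int   layer_width = 8;
    const float img_width = 64.f, img_height = 64.f;
    const float step_x = img_width / layer_width, step_y = img_height / layer_width;
    const float offset = 0.5f, min_size = 16.f, max_size = 32.f, ar = 2.f;

    const int   idx      = 9; // flattened cell index, as in the loop above
    const float center_x = (idx % layer_width + offset) * step_x; // 12
    const float center_y = (idx / layer_width + offset) * step_y; // 12

    // The three box shapes emitted per cell: min_size, sqrt(min*max),
    // and the aspect-ratio variant (ar != 1).
    const float w_sqrt = std::sqrt(min_size * max_size); // ~22.63
    const float w_ar   = min_size * std::sqrt(ar);       // ~22.63
    const float h_ar   = min_size / std::sqrt(ar);       // ~11.31

    // store_coordinates() then normalises each corner by the image size
    // (and clips to [0,1] when info.clip() is set), with one NEON vector.
    const float xmin = (center_x - min_size / 2.f) / img_width; // 0.0625
    std::printf("center (%g,%g) sqrt=%g ar=%gx%g xmin=%g\n",
                center_x, center_y, w_sqrt, w_ar, h_ar, xmin);
    return 0;
}
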
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
index 430a47f9f8..460f80e085 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
@@ -67,7 +67,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -84,7 +87,14 @@ private:
* @param[in] width Input width.
* @param[in] height Input height.
*/
- void store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, const int height);
+ void store_coordinates(float *out,
+ const int offset,
+ const float center_x,
+ const float center_y,
+ const float box_width,
+ const float box_height,
+ const int width,
+ const int height);
/** Function to calculate prior boxes.
*
* @param[in] window Input region on which to execute the kernel.
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
index a88b193b31..8e1ed3a2a5 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,17 +26,17 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/NESymm.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/NESymm.h"
#include <map>
@@ -72,8 +72,8 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4
const int64_t b_3 = vgetlane(b_high, 1);
int64x2x2_t result;
- const int64x2_t result_0{ a_0 * b_0, a_1 * b_1 };
- const int64x2_t result_1{ a_2 * b_2, a_3 * b_3 };
+ const int64x2_t result_0{a_0 * b_0, a_1 * b_1};
+ const int64x2_t result_1{a_2 * b_2, a_3 * b_3};
result.val[0] = vadd(vmovl(vgetlow(bias)), result_0);
result.val[1] = vadd(vmovl(vgethigh(bias)), result_1);
@@ -81,15 +81,17 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4
}
} // namespace
-void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *output, const ITensor *weight, const ITensor *bias)
+void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input,
+ ITensor *output,
+ const ITensor *weight,
+ const ITensor *bias)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output);
ARM_COMPUTE_ERROR_ON(input == output);
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), weight->info(), bias->info()));
- static const std::map<DataType, ComputeFuncType> fn_map =
- {
- { DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16) },
+ static const std::map<DataType, ComputeFuncType> fn_map = {
+ {DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16)},
};
_input = input;
@@ -102,10 +104,10 @@ void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *o
_output->info()->set_quantization_info(compute_output_qinfo());
const UniformQuantizationInfo wq_info = _weight->info()->quantization_info().uniform();
- const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift);
+ const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift);
_output_shift *= -1;
- if(!bool(s))
+ if (!bool(s))
{
_output_multiplier = 0;
_output_shift = 0;
@@ -134,7 +136,10 @@ Window NEQLSTMLayerNormalizationKernel::configure_window(ITensor *target)
return window;
}
-Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_UNUSED(output, bias, weight, input);
@@ -151,7 +156,7 @@ Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().x() != weight->tensor_shape().x());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -182,11 +187,11 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16(
using AccType = int64_t;
using InputDataType = int16_t;
- AccType sum{ 0 };
- AccType sum_sq{ 0 };
+ AccType sum{0};
+ AccType sum_sq{0};
int32_t x = _window_start_x;
- for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
+ for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
{
using namespace wrapper;
const int16x8_t val = vloadq(input_ptr + x);
@@ -200,6 +205,7 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16(
sum_sq += static_cast<AccType>(vaddv(vmul(val_low, val_low)));
sum_sq += static_cast<AccType>(vaddv(vmul(val_high, val_high)));
#else // __aarch64__
+
// only AArch64 supports vaddv
const int64x2_t pair_sum_low = vpaddl(val_low);
const int64x2_t pair_sum_high = vpaddl(val_high);
@@ -215,7 +221,7 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16(
#endif // __aarch64__
}
- for(; x < _window_end_x; ++x)
+ for (; x < _window_end_x; ++x)
{
const InputDataType val = input_ptr[x];
sum += static_cast<AccType>(val);
@@ -229,7 +235,9 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i
int16_t *output_ptr,
const int16_t *weight_ptr,
const int32_t *bias_ptr,
- int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift)
+ int32_t mean,
+ int32_t inv_std_mul,
+ int32_t inv_std_shift)
{
using OutputDataType = int16_t;
@@ -237,7 +245,7 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i
const int32x4_t mean_vec = vdup_n(mean, wrapper::traits::vector_128_tag{});
int32_t x = _window_start_x;
- for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
+ for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
{
const int16x8_t val = vloadq(input_ptr + x);
int32x4x2_t shifted;
@@ -266,16 +274,18 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i
vstore(output_ptr + x + 4, vqmovn(out_val.val[1]));
}
- for(; x < _window_end_x; ++x)
+ for (; x < _window_end_x; ++x)
{
- const auto val = static_cast<int32_t>(input_ptr[x]);
- const int32_t shifted = (val << 10) - mean;
- const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift);
- const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x];
+ const auto val = static_cast<int32_t>(input_ptr[x]);
+ const int32_t shifted = (val << 10) - mean;
+ const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift);
+ const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x];
const auto reverse_shifted = static_cast<int32_t>((weighted + 512) >> 10);
- int32_t out_val = quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12);
- out_val = utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min());
- output_ptr[x] = static_cast<OutputDataType>(out_val);
+ int32_t out_val =
+ quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12);
+ out_val =
+ utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min());
+ output_ptr[x] = static_cast<OutputDataType>(out_val);
}
}
@@ -286,35 +296,38 @@ void NEQLSTMLayerNormalizationKernel::compute_qsymm16()
using BiasDataType = int32_t;
using AccType = int64_t;
- Iterator input_iterator{ _input, _inout_window };
- Iterator output_iterator{ _output, _inout_window };
- Iterator weight_iterator{ _weight, _weight_window };
- Iterator bias_iterator{ _bias, _weight_window };
+ Iterator input_iterator{_input, _inout_window};
+ Iterator output_iterator{_output, _inout_window};
+ Iterator weight_iterator{_weight, _weight_window};
+ Iterator bias_iterator{_bias, _weight_window};
const auto weight_ptr = reinterpret_cast<const InputDataType *>(weight_iterator.ptr());
const auto bias_ptr = reinterpret_cast<const BiasDataType *>(bias_iterator.ptr());
const uint32_t column_size = _input->info()->tensor_shape()[0];
- execute_window_loop(_inout_window, [ &, this](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const InputDataType *>(input_iterator.ptr());
- auto out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr());
-
- AccType sum{ 0 };
- AccType sum_sq{ 0 };
- std::tie(sum, sum_sq) = sum_qsymm16(in_ptr);
-
- AccType mean{ 0 };
- AccType variance{ 0 };
- std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size);
-
- int32_t stddev_invsqrt_mul{};
- int32_t stddev_invsqrt_shift{};
- quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul, stddev_invsqrt_shift);
-
- normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift);
- },
- input_iterator, output_iterator);
+ execute_window_loop(
+ _inout_window,
+ [&, this](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const InputDataType *>(input_iterator.ptr());
+ auto out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr());
+
+ AccType sum{0};
+ AccType sum_sq{0};
+ std::tie(sum, sum_sq) = sum_qsymm16(in_ptr);
+
+ AccType mean{0};
+ AccType variance{0};
+ std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size);
+
+ int32_t stddev_invsqrt_mul{};
+ int32_t stddev_invsqrt_shift{};
+ quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul,
+ stddev_invsqrt_shift);
+
+ normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift);
+ },
+ input_iterator, output_iterator);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
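
sum_qsymm16() accumulates a 64-bit sum and sum-of-squares per row (vaddv reductions on AArch64, pairwise adds otherwise), and compute_mean_variance() turns that pair into row statistics. The scalar sketch below uses the standard E[x^2] - E[x]^2 identity on invented int16 samples and omits the kernel's fixed-point scaling.

// Scalar sketch of the (sum, sum_sq) -> (mean, variance) reduction the
// kernel performs per row; the int16 samples are invented.
#include <cstdint>
#include <cstdio>

int main()
{
    const int16_t row[4] = {100, -50, 25, 75};
    int64_t       sum = 0, sum_sq = 0; // 64-bit accumulators, as in the kernel
    for (int16_t v : row)
    {
        sum += v;
        sum_sq += static_cast<int64_t>(v) * v;
    }
    const int64_t n        = sizeof(row) / sizeof(row[0]);
    const int64_t mean     = sum / n;                  // E[x]
    const int64_t variance = sum_sq / n - mean * mean; // E[x^2] - E[x]^2
    std::printf("sum=%lld sum_sq=%lld mean=%lld variance=%lld\n",
                (long long)sum, (long long)sum_sq, (long long)mean, (long long)variance);
    return 0;
}

Relatedly, the (weighted + 512) >> 10 step in normalize_qasymm16() is a round-to-nearest shift: val << 10 puts the intermediate on a 2^10 fixed-point scale, and 512 is half of that scale.
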
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
index a3ff6e988f..af5b6a0315 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H
#include "src/core/NEON/INEKernel.h"
+
#include <functional>
namespace arm_compute
@@ -69,34 +70,26 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
// constants
- static constexpr uint32_t max_input_dimension{ 2 }; /**< The maximum input dimension supported */
- static constexpr uint32_t max_weight_dimension{ 1 }; /**< The maximum weight dimension supported */
- static constexpr uint32_t max_bias_dimension{ 1 }; /**< The maximum bias dimension supported */
- static constexpr uint32_t vector_size_byte{ 16 }; /**< Computation vector size in byte */
+ static constexpr uint32_t max_input_dimension{2}; /**< The maximum input dimension supported */
+ static constexpr uint32_t max_weight_dimension{1}; /**< The maximum weight dimension supported */
+ static constexpr uint32_t max_bias_dimension{1}; /**< The maximum bias dimension supported */
+ static constexpr uint32_t vector_size_byte{16}; /**< Computation vector size in byte */
using ComputeFuncType = std::function<void(NEQLSTMLayerNormalizationKernel &)>;
ComputeFuncType _fn{}; /**< Function pointer to computation function */
- const ITensor *_input
- {
- nullptr
- }; /**< Input tensor */
- const ITensor *_weight
- {
- nullptr
- }; /**< Weight tensor */
- const ITensor *_bias
- {
- nullptr
- }; /**< Bias tensor */
- ITensor *_output{ nullptr }; /**< Output tensor */
+ const ITensor *_input{nullptr}; /**< Input tensor */
+ const ITensor *_weight{nullptr}; /**< Weight tensor */
+ const ITensor *_bias{nullptr}; /**< Bias tensor */
+ ITensor *_output{nullptr}; /**< Output tensor */
int32_t _output_multiplier{}; /**< Multiplier for output values */
int32_t _output_shift{}; /**< Shift value for output values */
@@ -138,7 +131,9 @@ private:
int16_t *output_ptr,
const int16_t *weight_ptr,
const int32_t *bias_ptr,
- int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift);
+ int32_t mean,
+ int32_t inv_std_mul,
+ int32_t inv_std_shift);
/** Function to compute output quantization information */
QuantizationInfo compute_output_qinfo();
};
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
index ece7e40e31..486cd6d331 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,12 +26,15 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/misc/Utility.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/roialign/list.h"
#include <arm_neon.h>
@@ -41,24 +44,82 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+struct ROIAlignSelectorData
+{
+ DataType dt;
+};
+
+using ROIAlignSelectorPtr = std::add_pointer<bool(const ROIAlignSelectorData &data)>::type;
+using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)>::type;
+
+struct ROIAlignKernel
+{
+ const char *name;
+    const ROIAlignSelectorPtr is_selected;
+ ROIAlignUKernelPtr ukernel;
+};
+
+static const ROIAlignKernel available_kernels[] = {
+ {"fp32_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)},
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ {"fp16_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)},
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+ {"qu8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)},
+ {"qs8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)},
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data)
+{
+ for (const auto &uk : available_kernels)
+ {
+ if (uk.is_selected(data))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info),
+ output->tensor_shape());
}
- if(input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
+ if (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16);
@@ -80,13 +141,17 @@ NEROIAlignLayerKernel::NEROIAlignLayerKernel()
{
}
-void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIAlignLayerKernel::configure(const ITensor *input,
+ const ITensor *rois,
+ ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
// Output auto inizialitation if not yet initialized
const TensorShape output_shape = compute_roi_align_shape(*input->info(), *rois->info(), pool_info);
- auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
output->info()->set_data_layout(input->info()->data_layout());
// Configure kernel window
@@ -104,334 +169,28 @@ void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois,
INEKernel::configure(window);
}
-Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIAlignLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
return Status{};
}
-/** Average pooling over an aligned window */
-template <typename input_data_type>
-inline input_data_type roi_align_1x1(const ITensor *input,
- unsigned int roi_batch,
- float region_start_x,
- float bin_size_x,
- int grid_size_x,
- float region_end_x,
- float region_start_y,
- float bin_size_y,
- int grid_size_y,
- float region_end_y,
- int pz)
-{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
- {
- return input_data_type(0);
- }
- else
- {
- const DataLayout data_layout = input->info()->data_layout();
- float avg = 0;
- // Iterate through the aligned pooling region
- for(int iy = 0; iy < grid_size_y; ++iy)
- {
- for(int ix = 0; ix < grid_size_x; ++ix)
- {
- // Align the window in the middle of every bin
- float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
- float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x);
-
- // Interpolation in the [0,0] [0,1] [1,0] [1,1] square
- const int y_low = y;
- const int x_low = x;
- const int y_high = y_low + 1;
- const int x_high = x_low + 1;
-
- const float ly = y - y_low;
- const float lx = x - x_low;
- const float hy = 1. - ly;
- const float hx = 1. - lx;
-
- const float w1 = hy * hx;
- const float w2 = hy * lx;
- const float w3 = ly * hx;
- const float w4 = ly * lx;
- if(data_layout == DataLayout::NCHW)
- {
- const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch)));
- const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch)));
- const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch)));
- const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch)));
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- else
- {
- const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch)));
- const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch)));
- const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch)));
- const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch)));
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- }
- }
-
- avg /= grid_size_x * grid_size_y;
- return input_data_type(avg);
- }
-}
-
-/** Average pooling over an aligned window */
-template <typename input_data_type>
-inline input_data_type roi_align_1x1_qasymm8(const ITensor *input,
- unsigned int roi_batch,
- float region_start_x,
- float bin_size_x,
- int grid_size_x,
- float region_end_x,
- float region_start_y,
- float bin_size_y,
- int grid_size_y,
- float region_end_y,
- int pz,
- const QuantizationInfo &out_qinfo)
-{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
- {
- return input_data_type(out_qinfo.uniform().offset);
- }
- else
- {
- float avg = 0;
- const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform();
- const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type());
- const DataLayout data_layout = input->info()->data_layout();
-
- // Iterate through the aligned pooling region
- for(int iy = 0; iy < grid_size_y; ++iy)
- {
- for(int ix = 0; ix < grid_size_x; ++ix)
- {
- // Align the window in the middle of every bin
- float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
- float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x);
-
- // Interpolation in the [0,0] [0,1] [1,0] [1,1] square
- const int y_low = y;
- const int x_low = x;
- const int y_high = y_low + 1;
- const int x_high = x_low + 1;
-
- const float ly = y - y_low;
- const float lx = x - x_low;
- const float hy = 1. - ly;
- const float hx = 1. - lx;
-
- const float w1 = hy * hx;
- const float w2 = hy * lx;
- const float w3 = ly * hx;
- const float w4 = ly * lx;
-
- if(data_layout == DataLayout::NCHW)
- {
- if(is_qasymm_signed)
- {
- float data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
- float data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
- float data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
- float data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- else
- {
- float data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
- float data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
- float data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
- float data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- }
- else
- {
- if(is_qasymm_signed)
- {
- const auto data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
- const auto data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
- const auto data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
- const auto data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- else
- {
- const auto data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
- const auto data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
- const auto data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
- const auto data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
- avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
- }
- }
- }
- }
-
- avg /= grid_size_x * grid_size_y;
-
- input_data_type res = 0;
- if(is_qasymm_signed)
- {
- res = quantize_qasymm8_signed(avg, out_qinfo);
- }
- else
- {
- res = quantize_qasymm8(avg, out_qinfo);
- }
- return res;
- }
-}
-
-inline float compute_region_coordinate(int p, float bin_size, float roi_anchor, float max_value)
-{
- const float region_start = p * bin_size + roi_anchor;
- return utility::clamp(region_start, 0.0f, max_value);
-}
-
void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info)
{
const DataLayout data_layout = _input->info()->data_layout();
- if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC)
{
- switch(_input->info()->data_type())
- {
- case DataType::QASYMM8:
- {
- NEROIAlignLayerKernel::internal_run<uint8_t, uint16_t>(window, info);
- break;
- }
- case DataType::QASYMM8_SIGNED:
- {
- NEROIAlignLayerKernel::internal_run<int8_t, uint16_t>(window, info);
- break;
- }
- case DataType::F32:
- {
- NEROIAlignLayerKernel::internal_run<float>(window, info);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- NEROIAlignLayerKernel::internal_run<float16_t>(window, info);
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- default:
- {
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
- }
- }
+ const auto *uk = get_implementation(ROIAlignSelectorData{_input->info()->data_type()});
+ ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ uk->ukernel(_input, _output, _rois, _pool_info, window, info);
}
else
{
ARM_COMPUTE_ERROR("Invalid layout");
}
}
-
-template <typename input_data_type, typename roi_data_type>
-void NEROIAlignLayerKernel::internal_run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const DataLayout data_layout = _input->info()->data_layout();
- const size_t values_per_roi = _rois->info()->dimension(0);
-
- const int roi_list_start = window.x().start();
- const int roi_list_end = window.x().end();
-
- const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- const int input_width = _input->info()->dimension(idx_width);
- const int input_height = _input->info()->dimension(idx_height);
- const int input_chanels = _input->info()->dimension(idx_depth);
- const int pooled_w = _pool_info.pooled_width();
- const int pooled_h = _pool_info.pooled_height();
-
- const DataType data_type = _input->info()->data_type();
- const bool is_qasymm = is_data_type_quantized_asymmetric(data_type);
-
- const auto *rois_ptr = reinterpret_cast<const roi_data_type *>(_rois->buffer());
- const QuantizationInfo &rois_qinfo = _rois->info()->quantization_info();
- for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
- {
- const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
-
- roi_data_type qx1 = rois_ptr[values_per_roi * roi_indx + 1];
- roi_data_type qy1 = rois_ptr[values_per_roi * roi_indx + 2];
- roi_data_type qx2 = rois_ptr[values_per_roi * roi_indx + 3];
- roi_data_type qy2 = rois_ptr[values_per_roi * roi_indx + 4];
- float x1(qx1);
- float x2(qx2);
- float y1(qy1);
- float y2(qy2);
- if(is_qasymm)
- {
- x1 = dequantize_qasymm16(qx1, rois_qinfo);
- x2 = dequantize_qasymm16(qx2, rois_qinfo);
- y1 = dequantize_qasymm16(qy1, rois_qinfo);
- y2 = dequantize_qasymm16(qy2, rois_qinfo);
- }
- const float roi_anchor_x = x1 * _pool_info.spatial_scale();
- const float roi_anchor_y = y1 * _pool_info.spatial_scale();
- const float roi_dims_x = std::max((x2 - x1) * _pool_info.spatial_scale(), 1.0f);
- const float roi_dims_y = std::max((y2 - y1) * _pool_info.spatial_scale(), 1.0f);
- float bin_size_x = roi_dims_x / _pool_info.pooled_width();
- float bin_size_y = roi_dims_y / _pool_info.pooled_height();
-
- // Iterate through all feature maps
- for(int ch = 0; ch < input_chanels; ++ch)
- {
- // Iterate through all output pixels
- for(int py = 0; py < pooled_h; ++py)
- {
- for(int px = 0; px < pooled_w; ++px)
- {
- const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width);
- const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height);
- const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width);
- const float region_end_y = compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height);
- const int roi_bin_grid_x = (_pool_info.sampling_ratio() > 0) ? _pool_info.sampling_ratio() : int(ceil(bin_size_x));
- const int roi_bin_grid_y = (_pool_info.sampling_ratio() > 0) ? _pool_info.sampling_ratio() : int(ceil(bin_size_y));
- input_data_type out_val(0);
- if(is_qasymm)
- {
- out_val = roi_align_1x1_qasymm8<input_data_type>(
- _input, roi_batch, region_start_x, bin_size_x,
- roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
- roi_bin_grid_y, region_end_y, ch, _output->info()->quantization_info());
- }
- else
- {
- out_val = roi_align_1x1<input_data_type>(
- _input, roi_batch, region_start_x, bin_size_x,
- roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
- roi_bin_grid_y, region_end_y, ch);
- }
-
- if(data_layout == DataLayout::NCHW)
- {
- auto out_ptr = reinterpret_cast<input_data_type *>(_output->ptr_to_element(Coordinates(px, py, ch, roi_indx)));
- *out_ptr = out_val;
- }
- else
- {
- auto out_ptr = reinterpret_cast<input_data_type *>(_output->ptr_to_element(Coordinates(ch, px, py, roi_indx)));
- *out_ptr = out_val;
- }
- }
- }
- }
- }
-}
} // namespace arm_compute
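
This rewrite retires the per-type switch in run() in favour of the selector-table pattern: available_kernels is an array of {name, predicate, function} entries scanned first-match-wins, with the implementations moved out to src/cpu/kernels/roialign. A self-contained sketch of the pattern, with invented types and two stand-in kernels:

// Minimal sketch of the selector-table dispatch adopted above; the
// types and the two kernels are invented stand-ins, not the library's.
#include <cstdio>

enum class DataType
{
    F32,
    F16
};
struct SelectorData
{
    DataType dt;
};

using SelectorPtr = bool (*)(const SelectorData &);
using UKernelPtr  = void (*)();

struct Kernel
{
    const char *name;
    SelectorPtr is_selected;
    UKernelPtr  ukernel;
};

void run_fp32() { std::puts("fp32 path"); }
void run_fp16() { std::puts("fp16 path"); }

static const Kernel available_kernels[] = {
    {"fp32", [](const SelectorData &d) { return d.dt == DataType::F32; }, run_fp32},
    {"fp16", [](const SelectorData &d) { return d.dt == DataType::F16; }, run_fp16},
};

const Kernel *get_implementation(const SelectorData &data)
{
    for (const auto &uk : available_kernels)
    {
        if (uk.is_selected(data))
        {
            return &uk; // first match wins
        }
    }
    return nullptr; // caller must handle "no kernel" explicitly
}

int main()
{
    const Kernel *uk = get_implementation({DataType::F16});
    if (uk != nullptr)
    {
        uk->ukernel();
    }
    return 0;
}

A new data type then becomes one table entry, and the FP16/quantized conditional compilation stays in the table rather than in the dispatch logic.
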
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
index fa31a879b7..9cc538b429 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.h
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -83,15 +83,15 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
- template <typename input_data_type, typename roi_data_type = input_data_type>
- void internal_run(const Window &window, const ThreadInfo &info);
-
const ITensor *_input;
ITensor *_output;
const ITensor *_rois;
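
All of these signature reflows keep the library's two-phase convention intact: a static validate() that inspects descriptors only, and a configure() that asserts the same checks before binding tensors and computing the execution window. A schematic sketch of the calling pattern; the kernel and its integer "descriptors" are invented stand-ins for the real ITensorInfo-based signatures.

// Schematic of the static-validate-then-configure convention; MyKernel
// and its int descriptors are invented stand-ins.
#include <cstdio>

struct Status
{
    bool ok;
    explicit operator bool() const { return ok; }
};

class MyKernel
{
public:
    // Callable before any tensor is allocated: descriptors only.
    static Status validate(int input_dims, int output_dims)
    {
        return Status{input_dims > 0 && input_dims == output_dims};
    }

    void configure(int input_dims, int output_dims)
    {
        // Real kernels ERROR_THROW_ON a failed validate(), then set
        // members and compute the execution window.
        _configured = static_cast<bool>(validate(input_dims, output_dims));
    }

    bool configured() const { return _configured; }

private:
    bool _configured = false;
};

int main()
{
    if (MyKernel::validate(3, 3)) // query path: no allocation, no side effects
    {
        MyKernel k;
        k.configure(3, 3);
        std::printf("configured: %d\n", k.configured());
    }
    return 0;
}
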
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index 400e8291d6..1a3810fb56 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -22,9 +22,11 @@
* SOFTWARE.
*/
#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -36,7 +38,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, rois);
@@ -47,10 +52,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) ||
+ (output->dimension(1) != pool_info.pooled_height()));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
}
@@ -73,19 +79,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con
* @param[in] roi_indx Index of image of coordinate in output Tensor to store value
*/
template <typename T>
-void template_eval(const ITensor *input, const ITensor *output, int region_start_x, int region_start_y,
- int region_end_x, int region_end_y, int fm, int px, int py, int roi_batch, int roi_indx)
+void template_eval(const ITensor *input,
+ const ITensor *output,
+ int region_start_x,
+ int region_start_y,
+ int region_end_x,
+ int region_end_y,
+ int fm,
+ int px,
+ int py,
+ int roi_batch,
+ int roi_indx)
{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
{
*reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = 0;
}
else
{
T curr_max = std::numeric_limits<T>::lowest(); // Min value of typename T
- for(int j = region_start_y; j < region_end_y; ++j)
+ for (int j = region_start_y; j < region_end_y; ++j)
{
- for(int i = region_start_x; i < region_end_x; ++i)
+ for (int i = region_start_x; i < region_end_x; ++i)
{
const auto val = *reinterpret_cast<const T *>(input->ptr_to_element(Coordinates(i, j, fm, roi_batch)));
curr_max = std::max(val, curr_max);
@@ -93,11 +108,13 @@ void template_eval(const ITensor *input, const ITensor *output, int region_start
}
// if quantized datatype, requantize then store in output tensor
- if(is_data_type_quantized(input->info()->data_type()))
+ if (is_data_type_quantized(input->info()->data_type()))
{
             // convert qasymm to new output quantization scale and offset
- UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform());
- *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = quantize_qasymm8(curr_max, uqinfo);
+ UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(
+ input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform());
+ *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) =
+ quantize_qasymm8(curr_max, uqinfo);
}
else
{
@@ -112,13 +129,19 @@ NEROIPoolingLayerKernel::NEROIPoolingLayerKernel()
{
}
-Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
return Status{};
}
-void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayerKernel::configure(const ITensor *input,
+ const ITensor *rois,
+ const ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
@@ -126,12 +149,15 @@ void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *roi
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
// Output auto initialization if not yet initialized
- TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1));
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2),
+ rois->info()->dimension(1));
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), output->info()->quantization_info());
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ output->info()->quantization_info());
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) ||
+ (output->info()->dimension(1) != pool_info.pooled_height()));
// Set instance variables
_input = input;
@@ -167,7 +193,7 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
const auto *rois_ptr = reinterpret_cast<const uint16_t *>(_rois->buffer());
const auto data_type = _input->info()->data_type();
- for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
+ for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
{
const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
const auto x1 = rois_ptr[values_per_roi * roi_indx + 1];
@@ -182,30 +208,35 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
const int roi_height = std::max(support::cpp11::round((y2 - y1) * spatial_scale), 1.f);
// Iterate through all feature maps
- for(int fm = 0; fm < fms; ++fm)
+ for (int fm = 0; fm < fms; ++fm)
{
// Iterate through all output pixels
- for(int py = 0; py < pooled_h; ++py)
+ for (int py = 0; py < pooled_h; ++py)
{
- for(int px = 0; px < pooled_w; ++px)
+ for (int px = 0; px < pooled_w; ++px)
{
auto region_start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
- auto region_end_x = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
- auto region_start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
- auto region_end_y = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
+ auto region_end_x =
+ static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
+ auto region_start_y =
+ static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
+ auto region_end_y =
+ static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
region_start_x = std::min(std::max(region_start_x + roi_anchor_x, 0), width);
region_end_x = std::min(std::max(region_end_x + roi_anchor_x, 0), width);
region_start_y = std::min(std::max(region_start_y + roi_anchor_y, 0), height);
region_end_y = std::min(std::max(region_end_y + roi_anchor_y, 0), height);
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
- template_eval<float>(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx);
+ template_eval<float>(_input, _output, region_start_x, region_start_y, region_end_x,
+ region_end_y, fm, px, py, roi_batch, roi_indx);
break;
case DataType::QASYMM8:
- template_eval<qasymm8_t>(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx);
+ template_eval<qasymm8_t>(_input, _output, region_start_x, region_start_y, region_end_x,
+ region_end_y, fm, px, py, roi_batch, roi_indx);
break;
default:
ARM_COMPUTE_ERROR("DataType not Supported");
@@ -216,4 +247,4 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
}
}
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
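
The reformatted loop maps each pooled cell (px, py) to a half-open input region via floor((px / pooled_w) * roi_width), shifts it by the ROI anchor and clamps it to the feature map before max-pooling it. A standalone numeric sketch of the bin boundaries, with invented ROI geometry:

// Numeric sketch of the bin-boundary arithmetic above: a 5-wide ROI
// max-pooled into pooled_w = 2 bins. All values are invented.
#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    const int pooled_w = 2, roi_width = 5, roi_anchor_x = 3, width = 16;
    for (int px = 0; px < pooled_w; ++px)
    {
        int start = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
        int end   = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
        // Shift into image coordinates and clamp, as the kernel does.
        start = std::min(std::max(start + roi_anchor_x, 0), width);
        end   = std::min(std::max(end + roi_anchor_x, 0), width);
        std::printf("bin %d covers columns [%d, %d)\n", px, start, end); // [3,5) then [5,8)
    }
    return 0;
}
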
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
index e7a7e90eef..81f6006ea2 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
@@ -63,7 +63,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois tensor.
*/
- void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -82,7 +83,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
private:
const ITensor *_input;
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
index 0395e0bd34..87b7b76b72 100644
--- a/src/core/NEON/kernels/NERangeKernel.cpp
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,91 +27,99 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
-#include "arm_compute/core/Utils.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/range/list.h"
namespace arm_compute
{
namespace
{
-template <typename T>
-void range_function(ITensor *output, float start, float step, const Window &window)
+struct RangeSelectorData
{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::tag_type;
-
- const auto step_vec = wrapper::vdup_n(static_cast<T>(step), ExactTagType{});
- const auto start_vec = wrapper::vdup_n(static_cast<T>(start), ExactTagType{});
- auto id_vec = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ DataType dt;
+};
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const int window_step_x = 16 / sizeof(T);
+using RangeSelectorPtr = std::add_pointer<bool(const RangeSelectorData &data)>::type;
+using RangeUKernelPtr = std::add_pointer<void(ITensor *, float, float, const Window &)>::type;
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator output_it(output, win);
-
- execute_window_loop(win, [&](const Coordinates &)
+struct RangeUKernel
+{
+ const char *name;
+ const RangeSelectorPtr is_selected;
+ RangeUKernelPtr ukernel;
+};
+
+static const RangeUKernel available_kernels[] = {
+ {"fp16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)},
+ {"f32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)},
+ {"u8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)},
+ {"u16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)},
+ {"u32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)},
+ {"s8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)},
+ {"s16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)},
+ {"s32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)},
+};
+
+/** Micro-kernel selector
+ *
+ * @param[in] data Selection data passed to help pick the appropriate micro-kernel
+ *
+ * @return A matching micro-kernel else nullptr
+ */
+const RangeUKernel *get_implementation(const RangeSelectorData &data)
+{
+ for (const auto &uk : available_kernels)
{
- int x = window_start_x;
- const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ if (uk.is_selected(data))
{
- for(int count = 0; count < window_step_x; ++count)
- {
- id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
- }
-
- // start + step * id
- const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
- wrapper::vstore(out_ptr + x, res_vec);
+ return &uk;
}
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto res = start + x * step;
- *(out_ptr + x) = res;
- }
-
- },
- output_it);
+ }
+ return nullptr;
}
Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output,
- 1,
- DataType::U8, DataType::S8,
- DataType::U16, DataType::S16,
- DataType::U32, DataType::S32,
- DataType::F16, DataType::F32);
+ const auto *uk = get_implementation(RangeSelectorData{output.data_type()});
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()),
+ "start value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()),
+ "end value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()),
+ "step value is outside the range of the data type");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step),
+ "Output tensor size is incorrect");
return Status{};
}
} // namespace
-NERangeKernel::NERangeKernel()
- : _func(nullptr), _start(0), _end(1), _step(1), _output(nullptr)
+NERangeKernel::NERangeKernel() : _start(0), _end(1), _step(1), _output(nullptr)
{
}
@@ -122,7 +130,8 @@ void NERangeKernel::configure(ITensor *output, float start, float end, float ste
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step));
// Auto initialize output if not initialized
- auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1, output->info()->data_type(), output->info()->quantization_info());
+ auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1,
+ output->info()->data_type(), output->info()->quantization_info());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -131,38 +140,6 @@ void NERangeKernel::configure(ITensor *output, float start, float end, float ste
_end = end;
_step = step;
_output = output;
- switch(_output->info()->data_type())
- {
- case DataType::U8:
- _func = &range_function<uint8_t>;
- break;
- case DataType::U16:
- _func = &range_function<uint16_t>;
- break;
- case DataType::U32:
- _func = &range_function<uint32_t>;
- break;
- case DataType::S8:
- _func = &range_function<int8_t>;
- break;
- case DataType::S16:
- _func = &range_function<int16_t>;
- break;
- case DataType::S32:
- _func = &range_function<int32_t>;
- break;
- case DataType::F32:
- _func = &range_function<float>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = &range_function<float16_t>;
- break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- break;
- }
INEKernel::configure(win);
}
@@ -181,8 +158,9 @@ void NERangeKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
+    const auto *uk = get_implementation(RangeSelectorData{_output->info()->data_type()});
+    ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- (*_func)(_output, _start, _step, window);
+ uk->ukernel(_output, _start, _step, window);
}
} // namespace arm_compute
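Aside: NERangeKernel now follows the micro-kernel registry pattern used across the CPU backend, replacing the per-type switch that configure() used to carry. A minimal sketch of the pattern (simplified, illustrative names):

#include <cstdio>

enum class DataType { U8, F32 };

struct SelectorData { DataType dt; };

using SelectorPtr = bool (*)(const SelectorData &);
using UKernelPtr  = void (*)(float start, float step);

struct UKernel
{
    const char *name;
    SelectorPtr is_selected; // predicate deciding if this entry applies
    UKernelPtr  ukernel;     // implementation to dispatch to
};

void f32_range(float start, float step) { std::printf("f32: %g %g\n", start, step); }
void u8_range(float start, float step)  { std::printf("u8:  %g %g\n", start, step); }

// Static table scanned in order; the first matching entry wins.
static const UKernel available_kernels[] = {
    {"f32_range", [](const SelectorData &d) { return d.dt == DataType::F32; }, f32_range},
    {"u8_range",  [](const SelectorData &d) { return d.dt == DataType::U8; },  u8_range},
};

const UKernel *get_implementation(const SelectorData &data)
{
    for (const auto &uk : available_kernels)
    {
        if (uk.is_selected(data))
        {
            return &uk;
        }
    }
    return nullptr; // callers must handle "no kernel available"
}

int main()
{
    const auto *uk = get_implementation(SelectorData{DataType::F32});
    if (uk != nullptr)
    {
        uk->ukernel(0.0f, 1.0f); // dispatch through the selected entry
    }
    return 0;
}

In the patch above, validate_arguments() rejects any data type for which get_implementation() returns no usable entry, so run() can resolve and dispatch the kernel safely at execution time.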
diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h
index 7c42ef11dc..fa555c2c2e 100644
--- a/src/core/NEON/kernels/NERangeKernel.h
+++ b/src/core/NEON/kernels/NERangeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NERANGEKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -80,11 +81,10 @@ public:
private:
using RangeFunction = void(ITensor *output, float start, float step, const Window &window);
- RangeFunction *_func; /**< Range function to be called */
- float _start; /**< Start of sequence */
- float _end; /**< End of sequence */
- float _step; /**< Increment/step value */
- ITensor *_output; /**< Destination tensor */
+ float _start; /**< Start of sequence */
+ float _end; /**< End of sequence */
+ float _step; /**< Increment/step value */
+ ITensor *_output; /**< Destination tensor */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NERANGEKERNEL_H */
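For reference, the contract of the per-type range micro-kernels that replace the removed range_function template is simply out[i] = start + i * step over the window's x extent. A scalar sketch, assuming a plain output buffer (the removed NEON path computed the same values with vdup/vmla over 128-bit lanes plus a scalar loop for the left-over tail):

#include <cstddef>
#include <vector>

// Scalar reference for the range kernels: out[i] = start + i * step.
template <typename T>
void range_reference(std::vector<T> &out, float start, float step)
{
    for (std::size_t i = 0; i < out.size(); ++i)
    {
        out[i] = static_cast<T>(start + static_cast<float>(i) * step);
    }
}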
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 2d6db764f4..5380e6ccce 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,1603 +28,223 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/NEON/NEMath.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "support/SaturateCast.h"
-
+#include "src/core/NEON/INEKernel.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include <arm_neon.h>
+#include "src/cpu/kernels/reduction_layer/generic/neon/list.h"
namespace arm_compute
{
-namespace
-{
-// Helper function that calls vqmovun/vqmvn, vcombine and vstore, allows templating of RedOpYZW_quantized
-template <typename T>
-void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0)
-{
- if(std::is_same<T, uint8_t>::value)
- {
- auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2));
- wrapper::vstore(output.ptr() + offset, res);
- }
- else
- {
- auto res = wrapper::vcombine(wrapper::vqmovn(t1), wrapper::vqmovn(t2));
- wrapper::vstore(reinterpret_cast<int8_t *>(output.ptr() + offset), res);
- }
-}
-
-template <typename T>
-uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
-{
- uint32x4_t mask{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
- {
- mask = wrapper::vcgt(b, a);
- }
- else
- {
- mask = wrapper::vclt(b, a);
- }
-
- uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 };
- if(axis != 0)
- {
- vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
- }
- uint32x4x4_t res = { { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } };
-
- return res;
-}
-
-template <typename T>
-uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
-{
- uint32x4x4_t mask{ { 0 } };
- uint8x16_t mask_u8{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
- {
- mask_u8 = wrapper::vcgt(b, a);
- }
- else
- {
- mask_u8 = wrapper::vclt(b, a);
- }
- auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
- auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
- mask.val[0] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
- mask.val[1] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
- mask.val[2] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
- mask.val[3] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
-
- uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
- { idx + 4, idx + 5, idx + 6, idx + 7 },
- { idx + 8, idx + 9, idx + 10, idx + 11 },
- { idx + 12, idx + 13, idx + 14, idx + 15 }
- }
- };
- if(axis != 0)
- {
- vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
- vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
- vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
- vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
- }
- uint32x4x4_t res =
- {
- {
- vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]),
- vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
- vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]),
- vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])
- }
- };
-
- return res;
-}
-
-// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
-template <typename T>
-inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
- typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type
- calculate_min(T in)
-{
- auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
- return wrapper::vpmin(pmin, pmin);
-}
-
-// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
-template <typename T>
-inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
- typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type
- calculate_min(T in)
-{
- auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
- pmin = wrapper::vpmin(pmin, pmin);
- pmin = wrapper::vpmin(pmin, pmin);
- return wrapper::vpmin(pmin, pmin);
-}
-
-// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
-template <typename T>
-inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
- typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type
- calculate_max(T in)
-{
- auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
- return wrapper::vpmax(pmax, pmax);
-}
-
-// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
-template <typename T>
-inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
- typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type
- calculate_max(T in)
-{
- auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
- pmax = wrapper::vpmax(pmax, pmax);
- pmax = wrapper::vpmax(pmax, pmax);
- return wrapper::vpmax(pmax, pmax);
-}
-
-template <typename T>
-uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
-{
- uint32x4_t res_idx_mask{ 0 };
- uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
-
- if(op == ReductionOperation::ARG_IDX_MIN)
- {
- auto pmin = calculate_min(vec_res_value);
- auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
- res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
- }
- else
- {
- auto pmax = calculate_max(vec_res_value);
- auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
- res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
- }
-
- res_idx_mask = wrapper::vadd(res_idx_mask, mask_ones);
- auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask), wrapper::vgetlow(res_idx_mask));
- pmin = wrapper::vpmin(pmin, pmin);
- uint32_t res = wrapper::vgetlane(pmin, 0);
-
- return (res - 0xFFFFFFFF);
-}
-
-template <typename T>
-uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
-{
- uint32x4x4_t res_idx_mask{ { 0 } };
- uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
- uint8x16_t mask_u8{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
- {
- auto pmin = calculate_min(vec_res_value);
- mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
- }
- else
- {
- auto pmax = calculate_max(vec_res_value);
- mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
- }
-
- // Widen vectors
- auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
- auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
- auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
- auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
- auto wide_u32_3 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
- auto wide_u32_4 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
- res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
- res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
- res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3);
- res_idx_mask.val[3] = wrapper::vand(vec_res_idx.val[3], wide_u32_4);
- res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
- res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
- res_idx_mask.val[2] = wrapper::vadd(res_idx_mask.val[2], mask_ones);
- res_idx_mask.val[3] = wrapper::vadd(res_idx_mask.val[3], mask_ones);
-
- uint32_t res = 0xFFFFFFFF;
- int iter = 0;
- do
- {
- auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
- pmin = wrapper::vpmin(pmin, pmin);
- res = std::min(wrapper::vgetlane(pmin, 0), res);
- iter++;
- }
- while(iter < 4);
-
- return (res - 0xFFFFFFFF);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
-{
- uint32x4x2_t mask{ 0 };
- uint16x8_t mask_u16{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
- {
- mask_u16 = wrapper::vcgt(b, a);
- }
- else
- {
- mask_u16 = wrapper::vclt(b, a);
- }
- mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16));
- mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16));
- uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
- { idx + 4, idx + 5, idx + 6, idx + 7 }
- }
- };
- if(axis != 0)
- {
- vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
- vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
- }
- uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
- wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]),
- 0, 0
- };
-
- return res;
-}
-// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
-inline float16x4_t calculate_min(float16x8_t in)
-{
- auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
- pmin = wrapper::vpmin(pmin, pmin);
- return wrapper::vpmin(pmin, pmin);
-}
-// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
-inline float16x4_t calculate_max(float16x8_t in)
+void NEReductionOperationKernel::reduce_op()
{
- auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
- pmax = wrapper::vpmax(pmax, pmax);
- return wrapper::vpmax(pmax, pmax);
-}
+ const bool is_complex = (_input->info()->num_channels() == 2);
-template <>
-uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op)
-{
- uint32x4x2_t res_idx_mask{ 0 };
- uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
- uint16x8_t mask_u16;
- if(op == ReductionOperation::ARG_IDX_MIN)
- {
- auto pmin = calculate_min(vec_res_value);
- mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
- }
- else
+ if (is_complex)
{
- auto pmax = calculate_max(vec_res_value);
- mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
- }
-
- // Widen vectors
- auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
- auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
- res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
- res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
- res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
- res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
-
- uint32_t res = 0xFFFFFFFF;
- int iter = 0;
- do
- {
- auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
- pmin = wrapper::vpmin(pmin, pmin);
- res = std::min(wrapper::vgetlane(pmin, 0), res);
- iter++;
- }
- while(iter < 2);
-
- return (res - 0xFFFFFFFF);
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <class F>
-class Reducer
-{
-public:
- static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
- {
- // Set out window
- Window out_window(window);
- out_window.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- f(window, out_window, input, output, op);
- }
- static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
- {
- // Set in window
- Window in_window(window);
- Window out_window(window);
-
- in_window.set(Window::DimY, Window::Dimension(0, 1, 1));
- out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), output->info()->dimension(1)));
-
- f(in_window, out_window, input, output, 1, op);
- }
- static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
- {
- // Set in window
- Window in_window(window);
- Window out_window(window);
-
- in_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
- out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), output->info()->dimension(2)));
-
- f(in_window, out_window, input, output, 2, op);
- }
- static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
- {
- // Set in/out window
- Window in_window(window);
- Window out_window(window);
-
- in_window.set(3, Window::Dimension(0, 1, 1));
- out_window.set(3, Window::Dimension(0, 1, 1));
-
- f(in_window, out_window, input, output, 3, op);
- }
-};
-
-template <typename T, int S>
-struct RedOpX
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
- {
- const size_t input_dim_0 = in->info()->dimension(0);
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(in_window.x().start());
- const auto window_end_x = static_cast<int>(in_window.x().end());
-
- Window in_win_no_pad = in_window;
- in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(in, in_win_no_pad);
- Iterator output(out, out_window);
-
- execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+ switch (_reduction_axis)
{
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
-
- auto init_res_value = static_cast<T>(0.f);
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- init_res_value = static_cast<T>(*input_ptr);
- break;
- }
- case ReductionOperation::PROD:
- {
- init_res_value = static_cast<T>(1.f);
- break;
- }
- default:
- break;
- }
- auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
- uint32x4x4_t vec_res_idx{ { 0 } };
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vec_elements = wrapper::vloadq(input_ptr + x);
- switch(op)
+ case 2:
+ switch (_input->info()->data_type())
{
- case ReductionOperation::SUM_SQUARE:
- vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
- break;
- case ReductionOperation::MEAN_SUM:
- case ReductionOperation::SUM:
- vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
- break;
- case ReductionOperation::PROD:
- vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
- break;
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
- break;
- }
- case ReductionOperation::MIN:
+ case DataType::F32:
{
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ switch (_op)
+ {
+ case ReductionOperation::SUM:
+ _func = REGISTER_FP32_NEON(cpu::reduce_RedOpYZW_complex_reduceZ_float32_4_2_SUM);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
break;
}
- case ReductionOperation::MAX:
+ default:
{
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ ARM_COMPUTE_ERROR("Not supported");
break;
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
}
+ break;
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
}
+ }
+ return;
+ }
- switch(op)
+ switch (_reduction_axis)
+ {
+ case 0:
+ {
+ switch (_input->info()->data_type())
{
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- case ReductionOperation::SUM_SQUARE:
+ case DataType::QASYMM8:
{
-#ifdef ARM_COMPUTE_DEBUG_ENABLED
- auto res = static_cast<T>(0.f);
- for(int i = 0; i < S; ++i)
- {
- res += wrapper::vgetlane(vec_res_value, i);
- }
-#else // ARM_COMPUTE_DEBUG_ENABLED
- auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
- for(int i = 0; i < S / 4; ++i)
- {
- carry_res = wrapper::vpadd(carry_res, carry_res);
- }
- auto res = wrapper::vgetlane(carry_res, 0);
-#endif // ARM_COMPUTE_DEBUG_ENABLED
- if(op == ReductionOperation::SUM_SQUARE)
- {
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res += (*(input_ptr + x)) * (*(input_ptr + x));
- }
- }
- else
- {
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res += *(input_ptr + x);
- }
- }
-
- if(op == ReductionOperation::MEAN_SUM)
- {
- res /= input_dim_0;
- }
-
- *(reinterpret_cast<T *>(output.ptr())) = res;
+ _func = REGISTER_QASYMM8_NEON(cpu::reduce_RedOpX_reduceX_qasymm8);
break;
}
- case ReductionOperation::PROD:
+ case DataType::QASYMM8_SIGNED:
{
- auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
- T res = 1;
- for(int i = 0; i < S / 2; ++i)
- {
- res *= wrapper::vgetlane(carry_res, i);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res *= *(input_ptr + x);
- }
-
- *(reinterpret_cast<T *>(output.ptr())) = res;
+ _func = REGISTER_QASYMM8_SIGNED_NEON(cpu::reduce_RedOpX_reduceX_qasymm8_signed);
break;
}
- case ReductionOperation::ARG_IDX_MIN:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
{
- auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- if(*(input_ptr + x) < res)
- {
- idx = x;
- res = *(input_ptr + x);
- }
- }
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ _func = REGISTER_FP16_NEON(cpu::reduce_RedOpX_reduceX_float16_8);
break;
}
- case ReductionOperation::ARG_IDX_MAX:
+#endif // ARM_COMPUTE_ENABLE_FP16
+ case DataType::F32:
{
- auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- if(*(input_ptr + x) > res)
- {
- idx = x;
- res = *(input_ptr + x);
- }
- }
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ _func = REGISTER_FP32_NEON(cpu::reduce_RedOpX_reduceX_float32_4);
break;
}
- case ReductionOperation::MIN:
+ case DataType::S32:
{
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
+ _func = REGISTER_INTEGER_NEON(cpu::reduce_RedOpX_reduceX_S32_4);
break;
}
- case ReductionOperation::MAX:
+ default:
{
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
+ ARM_COMPUTE_ERROR("Not supported");
break;
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
}
- },
- input, output);
- }
-};
-
-template <typename T>
-struct RedOpX_quantized
-{
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
- {
- using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
-
- const TensorInfo in_info = *(in->info());
- const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
-
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(in_window.x().start());
- const auto window_end_x = static_cast<int>(in_window.x().end());
-
- Window in_win_no_pad = in_window;
- in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(in, in_win_no_pad);
- Iterator output(out, out_window);
-
- execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+ break;
+ }
+ case 1:
{
- const auto input_ptr = reinterpret_cast<T *>(input.ptr());
-
- auto vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
- auto vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
- auto vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
- auto vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
-
- auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
- auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
- auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
- auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
-
- typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = { 0 };
-
- if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX)
+ switch (_input->info()->data_type())
{
- vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
- }
-
- uint32x4x4_t vec_res_idx{ { 0 } };
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vec_elements = wrapper::vloadq(input_ptr + x);
- switch(op)
- {
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- {
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
- vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
- vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
- vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
- break;
- }
- case ReductionOperation::PROD:
- {
- const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
- const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale);
-
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
- auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
- auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
- auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
-
- //de-quantize vec_elements
- temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
- temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
- temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
- temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
-
- vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
- vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
- vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
- vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
- break;
- }
- case ReductionOperation::MIN:
- {
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
- case ReductionOperation::MAX:
- {
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
-
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MIN:
+ case DataType::QASYMM8:
{
- auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- if(*(input_ptr + x) < res)
- {
- idx = x;
- res = *(input_ptr + x);
- }
- }
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ _func = REGISTER_QASYMM8_NEON(cpu::reduce_RedOpYZW_reduceY_qasymm8);
break;
}
- case ReductionOperation::ARG_IDX_MAX:
+ case DataType::QASYMM8_SIGNED:
{
- auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- if(*(input_ptr + x) > res)
- {
- idx = x;
- res = *(input_ptr + x);
- }
- }
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ _func = REGISTER_QASYMM8_SIGNED_NEON(cpu::reduce_RedOpYZW_reduceY_qasymm8_signed);
break;
}
- case ReductionOperation::MIN:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
{
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
+ _func = REGISTER_FP16_NEON(cpu::reduce_RedOpYZW_reduceY_float16_8);
break;
}
- case ReductionOperation::MAX:
+#endif // ARM_COMPUTE_ENABLE_FP16
+ case DataType::F32:
{
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
+ _func = REGISTER_FP32_NEON(cpu::reduce_RedOpYZW_reduceY_float32_4);
break;
}
- case ReductionOperation::PROD:
+ case DataType::S32:
{
- auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
- carry_res = wrapper::vmul(carry_res, vec_res_value3_f);
- carry_res = wrapper::vmul(carry_res, vec_res_value4_f);
-
- float res = wrapper::vgetlane(carry_res, 0);
- res *= wrapper::vgetlane(carry_res, 1);
- res *= wrapper::vgetlane(carry_res, 2);
- res *= wrapper::vgetlane(carry_res, 3);
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- //de-quantize input
- if(std::is_same<T, uint8_t>::value)
- {
- res *= dequantize_qasymm8(*(input_ptr + x), iq_info);
- }
- else
- {
- res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info);
- }
- }
-
- //re-quantize result
- if(std::is_same<T, uint8_t>::value)
- {
- res = quantize_qasymm8(res, iq_info);
- }
- else
- {
- res = quantize_qasymm8_signed(res, iq_info);
- }
-
- *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res);
+ _func = REGISTER_INTEGER_NEON(cpu::reduce_RedOpYZW_reduceY_S32_4);
break;
}
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
+ default:
{
- auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
- carry_res = wrapper::vadd(carry_res, vec_res_value3);
- carry_res = wrapper::vadd(carry_res, vec_res_value4);
-
- auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
- carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
- auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res += *(input_ptr + x);
- }
-
- if(op == ReductionOperation::MEAN_SUM)
- {
- res /= static_cast<int32_t>(in_info.dimension(0));
- }
- else
- {
- // Subtract accumulated offsets
- res -= (in_info.dimension(0) - 1) * iq_info.offset;
- }
- *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
+ ARM_COMPUTE_ERROR("Not supported");
break;
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
}
- },
- input, output);
- }
-};
-
-template <typename T, int S>
-struct RedOpYZW
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
- using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
-
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
- {
- const TensorInfo in_info = *(in->info());
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x_tmp = static_cast<int>(in_window.x().start());
- const auto window_end_x_tmp = static_cast<int>(in_window.x().end());
- // As it split over x-axis, need to set the correct spiltted window start and end.
- const auto window_start_x = static_cast<int>(0);
- const auto window_end_x = static_cast<int>(in_window.shape().x());
-
- Window in_win_no_pad = in_window;
- in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
- Window out_win_no_pad = out_window;
- out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
-
- Iterator input(in, in_win_no_pad);
- Iterator output(out, out_win_no_pad);
-
- execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+ break;
+ }
+ case 2:
{
- const auto input_ptr = reinterpret_cast<T *>(input.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ switch (_input->info()->data_type())
{
- neon_vector vec_res_value = { 0 };
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- vec_res_value = wrapper::vloadq(input_ptr + x);
- break;
- }
- case ReductionOperation::PROD:
- {
- vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
- break;
- }
- default:
- {
- vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- break;
- }
- }
- uint32x4x4_t vec_res_idx{ { 0 } };
-
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
- {
- const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
- const auto vec_elements = wrapper::vloadq(in_ptr);
- switch(op)
- {
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
- break;
- case ReductionOperation::SUM_SQUARE:
- vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
- break;
- case ReductionOperation::PROD:
- vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
- break;
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
- break;
- }
- case ReductionOperation::MIN:
- {
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
- case ReductionOperation::MAX:
- {
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
-
- if(op == ReductionOperation::MEAN_SUM)
- {
- auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
- vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv);
- }
-
- if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
- {
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- if(std::is_same<T, float16_t>::value)
- {
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]);
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- }
- else
+ case DataType::QASYMM8:
{
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value);
+ _func = REGISTER_QASYMM8_NEON(cpu::reduce_RedOpYZW_reduceZ_qasymm8);
+ break;
}
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto res_value = 0.f;
- switch(op)
+ case DataType::QASYMM8_SIGNED:
{
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- res_value = *(input_ptr + x);
- break;
- }
- case ReductionOperation::PROD:
- {
- res_value = static_cast<T>(1.f);
- break;
- }
- default:
- {
- res_value = static_cast<T>(0.f);
- break;
- }
+ _func = REGISTER_QASYMM8_SIGNED_NEON(cpu::reduce_RedOpYZW_reduceZ_qasymm8_signed);
+ break;
}
-
- uint32_t res_idx = 0;
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
{
- const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
-
- switch(op)
- {
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- res_value += *in_ptr;
- break;
- case ReductionOperation::SUM_SQUARE:
- res_value += *in_ptr * *in_ptr;
- break;
- case ReductionOperation::PROD:
- res_value *= *in_ptr;
- break;
- case ReductionOperation::ARG_IDX_MIN:
- {
- if(*in_ptr < res_value)
- {
- res_value = *in_ptr;
- res_idx = dim;
- }
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- if(*in_ptr > res_value)
- {
- res_value = *in_ptr;
- res_idx = dim;
- }
- break;
- }
- case ReductionOperation::MIN:
- {
- res_value = *in_ptr < res_value ? *in_ptr : res_value;
- break;
- }
- case ReductionOperation::MAX:
- {
- res_value = *in_ptr > res_value ? *in_ptr : res_value;
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
+ _func = REGISTER_FP16_NEON(cpu::reduce_RedOpYZW_reduceZ_float16_8);
+ break;
}
-
- if(op == ReductionOperation::MEAN_SUM)
+#endif // ARM_COMPUTE_ENABLE_FP16
+ case DataType::F32:
{
- res_value /= in_info.dimension(axis);
+ _func = REGISTER_FP32_NEON(cpu::reduce_RedOpYZW_reduceZ_float32_4);
+ break;
}
-
- if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+ case DataType::S32:
{
- *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx;
+ _func = REGISTER_INTEGER_NEON(cpu::reduce_RedOpYZW_reduceZ_S32_4);
+ break;
}
- else
+ default:
{
- *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value;
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
}
}
- },
- input, output);
- }
-};
-
-template <typename T, int S, int axis, ReductionOperation op>
-struct RedOpYZW_complex
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
- using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
-
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation)
- {
- ARM_COMPUTE_ERROR_ON(axis != 2);
- ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM);
-
- const TensorInfo in_info = *(in->info());
- const size_t stride_z = in_info.strides_in_bytes()[axis];
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x_tmp = static_cast<int>(in_window.x().start());
- const auto window_end_x_tmp = static_cast<int>(in_window.x().end());
- // As it split over x-axis, need to set the correct spiltted window start and end.
- const auto window_start_x = static_cast<int>(0);
- const auto window_end_x = static_cast<int>(in_window.shape().x());
-
- Window in_win_no_pad = in_window;
- in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
- Window out_win_no_pad = out_window;
- out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
-
- Iterator input(in, in_win_no_pad);
- Iterator output(out, out_win_no_pad);
-
- execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+ break;
+ }
+ case 3:
{
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ switch (_input->info()->data_type())
{
- neon_vector vec_res_value_0 = { 0 };
- neon_vector vec_res_value_1 = { 0 };
-
- vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-
- T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
- {
- T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
- T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim);
-
- const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
- const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);
-
- vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
- vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
- }
-
- wrapper::vstore(out_ptr, vec_res_value_0);
- wrapper::vstore(out_ptr + 4, vec_res_value_1);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto res_value_0 = 0.f;
- auto res_value_1 = 0.f;
-
- T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ case DataType::QASYMM8:
{
- T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
- res_value_0 += *in_ptr;
- res_value_1 += *(in_ptr + 1);
+ _func = REGISTER_QASYMM8_NEON(cpu::reduce_RedOpYZW_reduceW_qasymm8);
+ break;
}
- *out_ptr = res_value_0;
- *(out_ptr + 1) = res_value_1;
- }
- },
- input, output);
- }
-};
-
-template <typename T>
-struct RedOpYZW_quantized
-{
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
- {
- const TensorInfo in_info = *(in->info());
- const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
- using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
-
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x_tmp = static_cast<int>(in_window.x().start());
- const auto window_end_x_tmp = static_cast<int>(in_window.x().end());
- // As it split over x-axis, need to set the correct spiltted window start and end.
- const auto window_start_x = static_cast<int>(0);
- const auto window_end_x = static_cast<int>(in_window.shape().x());
-
- Window in_win_no_pad = in_window;
- in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
- Window out_win_no_pad = out_window;
- out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
-
- Iterator input(in, in_win_no_pad);
- Iterator output(out, out_win_no_pad);
-
- using vector_type = typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type;
- using vector_type_f = typename wrapper::traits::neon_vector<float, 4>::type;
-
- vector_type vec_res_value1{};
- vector_type vec_res_value2{};
- vector_type vec_res_value3{};
- vector_type vec_res_value4{};
-
- vector_type_f vec_res_value1_f{};
- vector_type_f vec_res_value2_f{};
- vector_type_f vec_res_value3_f{};
- vector_type_f vec_res_value4_f{};
-
- execute_window_loop(in_win_no_pad, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<T *>(input.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- uint32x4x4_t vec_res_idx{ { 0 } };
- vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
-
- vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
-
- auto vec_res_value = wrapper::vloadq(input_ptr + x);
-
- for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
+ case DataType::QASYMM8_SIGNED:
{
- const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
- const auto vec_elements = wrapper::vloadq(in_ptr);
- switch(op)
- {
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- {
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
- vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
- vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
- vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
- break;
- }
- case ReductionOperation::PROD:
- {
- const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
- const auto scale32x4f_4 = wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});
-
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
- auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
- auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
- auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
-
- //de-quantize vec_elements
- temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4);
- temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4);
- temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4);
- temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4);
-
- vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f);
- vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f);
- vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f);
- vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f);
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
- break;
- }
- case ReductionOperation::MIN:
- {
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
- case ReductionOperation::MAX:
- {
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
+ _func = REGISTER_QASYMM8_SIGNED_NEON(cpu::reduce_RedOpYZW_reduceW_qasymm8_signed);
+ break;
}
-
- switch(op)
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
{
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::ARG_IDX_MAX:
- {
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]);
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12, vec_res_idx.val[3]);
- break;
- }
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value);
- break;
- }
- case ReductionOperation::SUM:
- {
- // Subtract offsets
- auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
-
- auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1);
- auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2);
- auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3);
- auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4);
-
- vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
- vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
- vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
- vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);
-
- const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
- const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));
-
- combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
- break;
- }
- case ReductionOperation::MEAN_SUM:
- {
- const auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<float>(in_info.dimension(axis)), wrapper::traits::vector_128_tag{}));
- vec_res_value1_f = wrapper::vmul(wrapper::vcvt<float>(vec_res_value1), vec_width_inv);
- vec_res_value2_f = wrapper::vmul(wrapper::vcvt<float>(vec_res_value2), vec_width_inv);
- vec_res_value3_f = wrapper::vmul(wrapper::vcvt<float>(vec_res_value3), vec_width_inv);
- vec_res_value4_f = wrapper::vmul(wrapper::vcvt<float>(vec_res_value4), vec_width_inv);
-
- vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
- vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
- vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
- vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
-
- const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
- const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
- auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
- break;
- }
- case ReductionOperation::PROD:
- {
- const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
- const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));
-
- //re-quantize
- vec_res_value1_f = wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
- vec_res_value2_f = wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
- vec_res_value3_f = wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
- vec_res_value4_f = wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
-
- vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
- vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
- vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
- vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
-
- const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
- const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
- auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
+ _func = REGISTER_FP16_NEON(cpu::reduce_RedOpYZW_reduceW_float16_8);
+ break;
}
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- float res_value = 0.f;
- switch(op)
+#endif // ARM_COMPUTE_ENABLE_FP16
+ case DataType::F32:
{
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- res_value = *(input_ptr + x);
- break;
- }
- case ReductionOperation::PROD:
- {
- res_value = static_cast<T>(1.0f);
- break;
- }
- default:
- {
- res_value = static_cast<T>(0.0f);
- break;
- }
+ _func = REGISTER_FP32_NEON(cpu::reduce_RedOpYZW_reduceW_float32_4);
+ break;
}
- uint32_t res_idx = 0;
-
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ case DataType::S32:
{
- const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
- switch(op)
- {
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- {
- res_value += *in_ptr;
- break;
- }
- case ReductionOperation::SUM_SQUARE:
- {
- res_value += *in_ptr * *in_ptr;
- break;
- }
- case ReductionOperation::PROD:
- {
- //de-quantize input
- if(std::is_same<T, uint8_t>::value)
- {
- res_value *= dequantize_qasymm8(*in_ptr, iq_info);
- }
- else
- {
- res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info);
- }
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- {
- if(*in_ptr < res_value)
- {
- res_value = *in_ptr;
- res_idx = dim;
- }
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- if(*in_ptr > res_value)
- {
- res_value = *in_ptr;
- res_idx = dim;
- }
- break;
- }
- case ReductionOperation::MIN:
- {
- res_value = *in_ptr < res_value ? *in_ptr : res_value;
- break;
- }
- case ReductionOperation::MAX:
- {
- res_value = *in_ptr > res_value ? *in_ptr : res_value;
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
+ _func = REGISTER_INTEGER_NEON(cpu::reduce_RedOpYZW_reduceW_S32_4);
+ break;
}
-
- switch(op)
+ default:
{
- case ReductionOperation::MEAN_SUM:
- {
- int32_t res = static_cast<int32_t>(res_value);
- res /= static_cast<int32_t>(in_info.dimension(axis));
- *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
- break;
- }
- case ReductionOperation::SUM:
- {
- // Subtract accumulated offsets
- res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
- *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
- break;
- }
- case ReductionOperation::PROD:
- {
- //re-quantize result
- T res = 0;
- if(std::is_same<T, uint8_t>::value)
- {
- res = quantize_qasymm8(res_value, iq_info);
- }
- else
- {
- res = quantize_qasymm8_signed(res_value, iq_info);
- }
- *(reinterpret_cast<T *>(output.ptr() + x)) = res;
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::ARG_IDX_MAX:
- {
- *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx;
- break;
- }
- default:
- *(reinterpret_cast<T *>(output.ptr() + x)) = res_value;
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
}
}
- },
- input, output);
- }
-};
-
-void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
-{
- const bool is_complex = (input->info()->num_channels() == 2);
-
- if(is_complex)
- {
- switch(axis)
- {
- case 2:
- switch(input->info()->data_type())
- {
- case DataType::F32:
- switch(op)
- {
- case ReductionOperation::SUM:
- return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op);
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
+ break;
}
- return;
- }
-
- switch(axis)
- {
- case 0:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op);
- case DataType::QASYMM8_SIGNED:
- return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
- case DataType::S32:
- return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), op);
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 1:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
- case DataType::QASYMM8_SIGNED:
- return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
- case DataType::S32:
- return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, RedOpYZW<int32_t, 4>(), op);
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 2:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
- case DataType::QASYMM8_SIGNED:
- return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<int8_t>(), op);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
- case DataType::S32:
- return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, RedOpYZW<int32_t, 4>(), op);
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 3:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
- case DataType::QASYMM8_SIGNED:
- return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output, RedOpYZW_quantized<int8_t>(), op);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
- case DataType::S32:
- return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, RedOpYZW<int32_t, 4>(), op);
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
default:
+ {
ARM_COMPUTE_ERROR("Unsupported reduction axis");
+ break;
+ }
}
}
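// Illustrative sketch (not part of this patch): the refactor replaces the
// removed per-call switch with a lookup done once at configure time, so
// run() becomes a single indirect call through _func. The helper name below
// is hypothetical; the REGISTER_*_NEON macros above are the real mechanism.
//
// using ReductionFunction = void (*)(const Window &, const ITensor *, ITensor *, ReductionOperation);
//
// ReductionFunction pick_reduce_w(DataType dt)
// {
//     switch (dt)
//     {
//         case DataType::F32:
//             return &cpu::reduce_RedOpYZW_reduceW_float32_4; // same handler registered above
//         default:
//             ARM_COMPUTE_ERROR("Not supported");
//             return nullptr; // unreachable
//     }
// }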
@@ -1635,9 +256,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- if(input->num_channels() == 1)
+ if (input->num_channels() == 1)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::S32, DataType::F16, DataType::F32);
}
else
{
@@ -1646,16 +268,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON(axis != 2);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
- if(!is_arg_min_max)
+ if (!is_arg_min_max)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
}
else
@@ -1663,21 +285,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
}
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
- const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
+ const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
}
return Status{};
}
-} // namespace
NEReductionOperationKernel::NEReductionOperationKernel()
- : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE)
+ : _func(nullptr), _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE)
{
}
-void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
+void NEReductionOperationKernel::configure(const ITensor *input,
+ ITensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -1693,14 +318,25 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output
INEKernel::configure(win);
// Calculate output shape and set if empty
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
// Output auto initialization if not yet initialized
const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
DataType output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
+ // Determine the reduction function
+ NEReductionOperationKernel::reduce_op();
}
-Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status NEReductionOperationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
@@ -1713,6 +349,6 @@ void NEReductionOperationKernel::run(const Window &window, const ThreadInfo &inf
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- reduce_op(window, _input, _output, _reduction_axis, _op);
+ (*_func)(window, _input, _output, _op);
}
} // namespace arm_compute
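// Illustrative usage sketch (not part of this patch; tensor creation and
// allocation elided). validate() performs the same checks configure() throws
// on, and the scheduler splits the work along the hinted dimension:
//
// NEReductionOperationKernel kernel;
// ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperationKernel::validate(
//     src.info(), dst.info(), /* axis */ 0, ReductionOperation::SUM));
// kernel.configure(&src, &dst, /* axis */ 0, ReductionOperation::SUM);
// NEScheduler::get().schedule(&kernel, Window::DimY);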
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h
index 08e654fd21..407e5de6d6 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.h
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H
-#define ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NEREDUCTIONOPERATIONKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NEREDUCTIONOPERATIONKERNEL_H
#include "src/core/NEON/INEKernel.h"
@@ -77,16 +77,27 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+private:
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
+    /** Common signature for all the specialized Reduction functions
+     *
+     * @param[in]  window Region on which to execute the kernel.
+     * @param[in]  in     Input tensor to reduce.
+     * @param[out] out    Output tensor.
+     * @param[in]  op     Reduction operation to perform.
+     */
+ using ReductionFunction = void (*)(const Window &window, const ITensor *in, ITensor *out, ReductionOperation op);
-private:
+    /** Populate _func with the reduction handler matching the configured axis and data type
+     */
+ void reduce_op();
+
+ ReductionFunction _func;
const ITensor *_input;
ITensor *_output;
unsigned int _reduction_axis;
ReductionOperation _op;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NEREDUCTIONOPERATIONKERNEL_H
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
deleted file mode 100644
index a1ba29e4c4..0000000000
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ /dev/null
@@ -1,326 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NERemapKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute::scale_helpers;
-
-namespace arm_compute
-{
-class Coordinates;
-
-namespace
-{
-inline int32_t num_out_of_tensor(const float *mapx_ptr, const float *mapy_ptr, const int32x4_t &width_1, const int32x4_t &height_1)
-{
- const int32x4_t mapx_s32 = vcvtq_s32_f32(vld1q_f32(mapx_ptr));
- const int32x4_t mapy_s32 = vcvtq_s32_f32(vld1q_f32(mapy_ptr));
-
- const int32x4_t outbx_s32 = vminq_s32(vmaxq_s32(vminq_s32(vsubq_s32(width_1, mapx_s32), mapx_s32), vdupq_n_s32(-1)), vdupq_n_s32(0)); // Contains -1 if out of border in x, 0 otherwise
- const int32x4_t outby_s32 = vminq_s32(vmaxq_s32(vminq_s32(vsubq_s32(height_1, mapy_s32), mapy_s32), vdupq_n_s32(-1)), vdupq_n_s32(0)); // Contains -1 if out of border in y, 0 otherwise
-
- const int32x4_t out_of_tensor_v = vminq_s32(outbx_s32, outby_s32);
-#if defined(__aarch64__)
- // only AArch64 supports vaddv
- return vaddvq_s32(out_of_tensor_v);
-#else // __aarch64__
- return vgetq_lane_s32(out_of_tensor_v, 0) + vgetq_lane_s32(out_of_tensor_v, 1) + vgetq_lane_s32(out_of_tensor_v, 2) + vgetq_lane_s32(out_of_tensor_v, 3);
-#endif // __aarch64__
-}
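// Scalar sketch of the lane trick above (illustrative, not part of this
// patch): each lane evaluates to -1 when its coordinate falls outside
// [0, dim - 1] and to 0 otherwise, so the horizontal sum over 8 lanes lies
// in [-8, 0], with -8 meaning all 8 pixels are out of the plane:
//
// int out_flag(int c, int dim_minus_1)
// {
//     return std::min(std::max(std::min(dim_minus_1 - c, c), -1), 0);
// }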
-
-inline void serial_remap_nearest_interpolation(const uint8_t *in_ptr, const float *mapx_ptr, const float *mapy_ptr, uint8_t *out_ptr,
- int32_t width_val, int32_t height_val, int32_t in_stride_val, uint8_t constant_border_value)
-{
- const auto x_s32 = static_cast<int32_t>(*mapx_ptr);
- const auto y_s32 = static_cast<int32_t>(*mapy_ptr);
- if(x_s32 < 0 || y_s32 < 0 || x_s32 >= width_val || y_s32 >= height_val)
- {
- *(out_ptr) = constant_border_value;
- }
- else
- {
- *(out_ptr) = in_ptr[x_s32 + y_s32 * in_stride_val];
- }
-}
-
-inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const int32x4_t &stride)
-{
- const int32x4_t mapx_s32 = vcvtq_s32_f32(vld1q_f32(mapx_ptr));
- const int32x4_t mapy_s32 = vcvtq_s32_f32(vld1q_f32(mapy_ptr));
- return vmlaq_s32(mapx_s32, mapy_s32, stride);
-}
-
-inline uint8_t pixel_bilinear_c1_clamp(const uint8_t *pixel_ptr, int32_t stride, int32_t width, int32_t height, float x, float y, uint8_t constant_border_value)
-{
- x = std::max(-1.f, std::min(x, static_cast<float>(width)));
- y = std::max(-1.f, std::min(y, static_cast<float>(height)));
-
- const int32_t xi = static_cast<int32_t>(std::floor(x));
- const int32_t yi = static_cast<int32_t>(std::floor(y));
-
- const float dx = x - static_cast<float>(xi);
- const float dy = y - static_cast<float>(yi);
-
- // Calculating the address won't trigger a segfault in case the value is outside the tensor
- // The ternary operator resolves the values in both conditions
- const uint8_t *a00 = (xi < 0 || xi >= width || yi < 0 || yi >= height) ? &constant_border_value : (pixel_ptr + xi + yi * stride);
- const uint8_t *a01 = (xi + 1 >= width || yi < 0 || yi >= height) ? &constant_border_value : (pixel_ptr + xi + 1 + yi * stride);
- const uint8_t *a10 = (xi < 0 || xi >= width || yi + 1 >= height) ? &constant_border_value : (pixel_ptr + xi + yi * stride + stride);
- const uint8_t *a11 = (xi + 1 >= width || yi + 1 >= height) ? &constant_border_value : (pixel_ptr + xi + 1 + yi * stride + stride);
-
- const float dx1 = 1.0f - dx;
- const float dy1 = 1.0f - dy;
- const float w1 = dx1 * dy1;
- const float w2 = dx * dy1;
- const float w3 = dx1 * dy;
- const float w4 = dx * dy;
-
- return static_cast<uint8_t>((*a00) * w1 + (*a01) * w2 + (*a10) * w3 + (*a11) * w4);
-}
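// Worked example for the bilinear weights above (illustrative): for
// (x, y) = (1.25, 2.5), dx = 0.25 and dy = 0.5, giving w1 = 0.375,
// w2 = 0.125, w3 = 0.375, w4 = 0.125. The four weights always sum to 1,
// and any tap that lands outside the plane reads constant_border_value.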
-} // namespace
-
-NERemapKernel::NERemapKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr), _border_mode(BorderMode::UNDEFINED), _constant_border_value(0)
-{
-}
-
-void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
-
- _input = input;
- _output = output;
- _map_x = map_x;
- _map_y = map_y;
- _border_mode = border_mode;
- _constant_border_value = constant_border_value;
-
- switch(policy)
- {
- case InterpolationPolicy::NEAREST_NEIGHBOR:
- {
- _func = &NERemapKernel::remap_nearest;
- break;
- }
- case InterpolationPolicy::BILINEAR:
- {
- _func = &NERemapKernel::remap_bilinear;
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported interpolation mode");
- break;
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
- INEKernel::configure(win);
-}
-
-void NERemapKernel::remap_nearest(const Window &window)
-{
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int32_t window_step_x = 8;
-
- // Don't increment in X direction for the output, mapx, mapy tensors
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(_input, win_in);
- Iterator out(_output, win);
- Iterator mapx(_map_x, win);
- Iterator mapy(_map_y, win);
-
- const int32_t width_val = static_cast<int32_t>(_input->info()->dimension(0));
- const int32_t height_val = static_cast<int32_t>(_input->info()->dimension(1));
- const int32_t in_stride_val = static_cast<int32_t>(_input->info()->strides_in_bytes()[1]);
- const int32x4_t width_1 = vdupq_n_s32(width_val - 1);
- const int32x4_t height_1 = vdupq_n_s32(height_val - 1);
- const int32x4_t in_stride = vdupq_n_s32(in_stride_val);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
- auto mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
- const uint8_t *in_ptr = in.ptr();
- uint8_t *out_ptr = out.ptr();
- int32_t x = window_start_x;
- for(; x < window_end_x - window_step_x; x += window_step_x, mapx_ptr += window_step_x, mapy_ptr += window_step_x, out_ptr += window_step_x)
- {
- const int32_t out_of_tensor0 = num_out_of_tensor(mapx_ptr, mapy_ptr + 0, width_1, height_1);
- const int32_t out_of_tensor1 = num_out_of_tensor(mapx_ptr + 4, mapy_ptr + 4, width_1, height_1);
- const int32_t out_of_tensor = out_of_tensor0 + out_of_tensor1;
-
- if(out_of_tensor == -8)
- {
- // All elements are out of xy plane
- uint8x8_t tmp = vdup_n_u8(_constant_border_value);
- vst1_u8(out_ptr, tmp);
- }
- else if(out_of_tensor < 0)
- {
- // Some elements are out of xy plane
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr, mapy_ptr, out_ptr, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 1, mapy_ptr + 1, out_ptr + 1, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 2, mapy_ptr + 2, out_ptr + 2, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 3, mapy_ptr + 3, out_ptr + 3, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 4, mapy_ptr + 4, out_ptr + 4, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 5, mapy_ptr + 5, out_ptr + 5, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 6, mapy_ptr + 6, out_ptr + 6, width_val, height_val, in_stride_val, _constant_border_value);
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 7, mapy_ptr + 7, out_ptr + 7, width_val, height_val, in_stride_val, _constant_border_value);
- }
- else
- {
- // All elements are in xy plane
- uint8x8_t tmp = vdup_n_u8(0);
- const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr, mapy_ptr, in_stride);
- const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, in_stride);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp, 0);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp, 1);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp, 2);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp, 3);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp, 4);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp, 5);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp, 6);
- tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp, 7);
- vst1_u8(out_ptr, tmp);
- }
- }
- for(; x < window_end_x; ++x, ++mapx_ptr, ++mapy_ptr, ++out_ptr)
- {
- serial_remap_nearest_interpolation(in_ptr, mapx_ptr, mapy_ptr, out_ptr, width_val, height_val, in_stride_val, _constant_border_value);
- }
- },
- in, out, mapx, mapy);
-}
-
-void NERemapKernel::remap_bilinear(const Window &window)
-{
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int32_t window_step_x = 8;
-
- // Don't increment in X direction for the output, mapx, mapy tensors
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(_input, win_in);
- Iterator out(_output, win);
- Iterator mapx(_map_x, win);
- Iterator mapy(_map_y, win);
-
- const int32_t width_val = static_cast<int32_t>(_input->info()->dimension(0));
- const int32_t height_val = static_cast<int32_t>(_input->info()->dimension(1));
- const int32x4_t width_2 = vdupq_n_s32(width_val - 2);
- const int32x4_t height_2 = vdupq_n_s32(height_val - 2);
- const int32_t in_stride_val = static_cast<int32_t>(_input->info()->strides_in_bytes()[1]);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
- auto mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
- const uint8_t *in_ptr = in.ptr();
- uint8_t *out_ptr = out.ptr();
- int32_t x = window_start_x;
- for(; x < window_end_x - window_step_x; x += window_step_x, mapx_ptr += window_step_x, mapy_ptr += window_step_x, out_ptr += window_step_x)
- {
- const int32_t out_of_tensor0 = num_out_of_tensor(mapx_ptr, mapy_ptr + 0, width_2, height_2);
- const int32_t out_of_tensor1 = num_out_of_tensor(mapx_ptr + 4, mapy_ptr + 4, width_2, height_2);
- const int32_t out_of_tensor = out_of_tensor0 + out_of_tensor1;
-
- if(out_of_tensor < 0)
- {
- // Elements are out of xy plane
- *(out_ptr) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value);
- *(out_ptr + 1) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[1], mapy_ptr[1], _constant_border_value);
- *(out_ptr + 2) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[2], mapy_ptr[2], _constant_border_value);
- *(out_ptr + 3) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[3], mapy_ptr[3], _constant_border_value);
- *(out_ptr + 4) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[4], mapy_ptr[4], _constant_border_value);
- *(out_ptr + 5) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[5], mapy_ptr[5], _constant_border_value);
- *(out_ptr + 6) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[6], mapy_ptr[6], _constant_border_value);
- *(out_ptr + 7) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[7], mapy_ptr[7], _constant_border_value);
- }
- else
- {
- // All elements are in xy plane
- uint8x8_t tmp = vdup_n_u8(0);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value), tmp, 0);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[1], mapy_ptr[1], _constant_border_value), tmp, 1);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[2], mapy_ptr[2], _constant_border_value), tmp, 2);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[3], mapy_ptr[3], _constant_border_value), tmp, 3);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[4], mapy_ptr[4], _constant_border_value), tmp, 4);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[5], mapy_ptr[5], _constant_border_value), tmp, 5);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[6], mapy_ptr[6], _constant_border_value), tmp, 6);
- tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[7], mapy_ptr[7], _constant_border_value), tmp, 7);
- vst1_u8(out_ptr, tmp);
- }
- }
- for(; x < window_end_x; ++x, ++mapx_ptr, ++mapy_ptr, ++out_ptr)
- {
- *(out_ptr) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value);
- }
- },
- in, out, mapx, mapy);
-}
-
-void NERemapKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (this->*_func)(window);
-}
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NERemapKernel.h b/src/core/NEON/kernels/NERemapKernel.h
deleted file mode 100644
index 33e929805a..0000000000
--- a/src/core/NEON/kernels/NERemapKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREMAPKERNEL_H
-#define ARM_COMPUTE_NEREMAPKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform a remap on a tensor */
-class NERemapKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NERemapKernel";
- }
- /** Default constructor */
- NERemapKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERemapKernel(const NERemapKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERemapKernel &operator=(const NERemapKernel &) = delete;
- /** Allow instances of this class to be moved */
- NERemapKernel(NERemapKernel &&) = default;
- /** Allow instances of this class to be moved */
- NERemapKernel &operator=(NERemapKernel &&) = default;
- /** Default destructor */
- ~NERemapKernel() = default;
-
- /** Initialize the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[in] map_x Map for X coordinates. Data type supported: F32.
- * @param[in] map_y Map for Y coordinates. Data type supported: F32.
- * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Border mode to use on the input tensor.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. Defaults to 0.
- */
- void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** function to perform nearest interpolation on the given window */
- void remap_nearest(const Window &window);
- /** function to perform bilinear interpolation on the given window */
- void remap_bilinear(const Window &window);
- /** Remap function to use for the particular interpolation type passed to configure() */
- void (NERemapKernel::*_func)(const Window &window);
-
- const ITensor *_input; /**< Input image */
- ITensor *_output; /**< Output image */
- const ITensor *_map_x; /**< Input remap x coordinates */
- const ITensor *_map_y; /**< Input remap y coordinates */
- BorderMode _border_mode; /**< Border mode */
- uint8_t _constant_border_value; /**< Border value to use */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEReorderKernel.cpp b/src/core/NEON/kernels/NEReorderKernel.cpp
new file mode 100644
index 0000000000..fe8882f59f
--- /dev/null
+++ b/src/core/NEON/kernels/NEReorderKernel.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__)
+
+#include "src/core/NEON/kernels/NEReorderKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Scheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/arm_gemm/transform.hpp"
+
+namespace arm_compute
+{
+
+void NEReorderKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ switch (_input->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ const int ksize_rows_elements = _xmax * _ksize;
+ const int jump_rows = ksize_rows_elements * window.x().start();
+ const int k_start = window.x().start() * _ksize;
+ const int k_end = std::min(window.x().end() * _ksize, _kmax);
+ const int stride = _kmax;
+ if (k_start < k_end)
+ {
+ switch (_output_wf)
+ {
+ case WeightFormat::OHWIo4:
+ {
+ switch (_output->info()->data_type())
+ {
+ case DataType::F32:
+ arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(
+ reinterpret_cast<float *>(_output->buffer()) + jump_rows,
+ reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ break;
+ case DataType::BFLOAT16:
+ arm_gemm::Transform<4, 4, true, arm_gemm::VLType::None>(
+ reinterpret_cast<bfloat16 *>(_output->buffer()) + jump_rows,
+ reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type!");
+ }
+ break;
+ }
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+ case WeightFormat::OHWIo8:
+ {
+ switch (_output->info()->data_type())
+ {
+ case DataType::F32:
+ arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(
+ reinterpret_cast<float *>(_output->buffer()) + jump_rows,
+ reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ break;
+ case DataType::BFLOAT16:
+ arm_gemm::Transform<2, 4, true, arm_gemm::VLType::SVE>(
+ reinterpret_cast<bfloat16 *>(_output->buffer()) + jump_rows,
+ reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type!");
+ }
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_SVE */
+ default:
+ {
+                        ARM_COMPUTE_ERROR("Unsupported weight format!");
+ break;
+ }
+ }
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type!");
+ }
+}
+
+NEReorderKernel::NEReorderKernel()
+ : _input(nullptr),
+ _output(nullptr),
+ _ksize(0),
+ _kmax(0),
+ _xmax(0),
+ _input_wf(WeightFormat::ANY),
+ _output_wf(WeightFormat::ANY)
+{
+}
+
+void NEReorderKernel::configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
+{
+ ARM_COMPUTE_LOG_PARAMS(input, output, input_wf, output_wf);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), input_wf, output_wf));
+
+ // Set variables
+ _input = input;
+ _output = output;
+ _input_wf = input_wf;
+ _output_wf = output_wf;
+
+ // Setting parameters for transform
+ auto dims = input->info()->num_dimensions();
+ switch (dims)
+ {
+ case 2:
+ {
+ _xmax = input->info()->dimension(0); // Number of columns in input matrix
+ _kmax = input->info()->dimension(1); // Number of rows in input matrix
+ break;
+ }
+ case 4:
+ {
+ _xmax = input->info()->dimension(2); // Number of columns in input matrix
+ _kmax = input->info()->dimension(3); // Number of rows in input matrix
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Only 2 or 4 dimensions supported.");
+ }
+ }
+
+ // Configure kernel window
+ // Window size is set by rows / _ksize
+ Window win;
+ int window_size = 0;
+ switch (_output_wf)
+ {
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+ case WeightFormat::OHWIo8:
+ {
+ _ksize = 8;
+ window_size = _kmax / _ksize;
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_SVE */
+ case WeightFormat::OHWIo4:
+ {
+ _ksize = 4;
+ window_size = _kmax / _ksize;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported weight format.");
+ break;
+ }
+ }
+ if (_kmax % _ksize != 0)
+ {
+ window_size += 1;
+ }
+
+ win.set(Window::DimX, Window::Dimension(0, window_size, 1));
+
+ INEKernel::configure(win);
+}
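// Note (illustrative): the window sizing above is a ceiling division. With
// _kmax = 10 rows and _ksize = 4, window_size = 10 / 4 = 2 plus 1 for the
// remainder, i.e. ceil(10 / 4) = 3 blocks; equivalently:
//
// window_size = (_kmax + _ksize - 1) / _ksize;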
+
+Status NEReorderKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
+ if (output->tensor_shape().total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != DataType::F32 && output->data_type() != DataType::BFLOAT16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ // Only input WeightFormat OHWI supported
+ ARM_COMPUTE_RETURN_ERROR_ON(input_wf != arm_compute::WeightFormat::OHWI);
+ int input_x_dim;
+ int input_k_dim;
+ int output_x_dim;
+ int output_k_dim;
+ auto dims = output->num_dimensions();
+ switch (dims)
+ {
+ case 2:
+ {
+ input_x_dim = input->dimension(0); // Number of columns in input matrix
+ input_k_dim = input->dimension(1); // Number of rows in input matrix
+ output_x_dim = output->dimension(0); // Number of columns in output matrix
+ output_k_dim = output->dimension(1); // Number of rows in output matrix
+ break;
+ }
+ case 4:
+ {
+ input_x_dim = input->dimension(2); // Number of columns in input matrix
+ input_k_dim = input->dimension(3); // Number of rows in input matrix
+ output_x_dim = output->dimension(2); // Number of columns in output matrix
+ output_k_dim = output->dimension(3); // Number of rows in output matrix
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Only 2 or 4 dimensions supported.");
+ }
+ }
+
+ int ksize = 0;
+ switch (output_wf)
+ {
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+ case WeightFormat::OHWIo8:
+ {
+ if (Scheduler::get().cpu_info().has_sve() && arm_gemm::utils::get_vector_length<float>() == 8)
+ {
+ ksize = 8;
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unsupported weight format.");
+ }
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_SVE */
+ case WeightFormat::OHWIo4:
+ {
+ ksize = 4;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unsupported weight format.");
+ break;
+ }
+ }
+
+    // Output k_dim must match the input's k_dim rounded up to a multiple of ksize
+    int32_t rnd_up_input_kdim = arm_compute::ceil_to_multiple<int32_t, int32_t>(input_k_dim, ksize);
+    ARM_COMPUTE_RETURN_ERROR_ON(rnd_up_input_kdim != output_k_dim);
+    // Output x_dim must match the input's x_dim
+ ARM_COMPUTE_RETURN_ERROR_ON(input_x_dim != output_x_dim);
+ }
+ return Status{};
+}
+
+} // namespace arm_compute
+
+#endif // defined(__aarch64__)
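// Illustrative usage sketch (not part of this patch; tensor setup elided):
// reorder a 2D F32 weight matrix from OHWI into the 4-row-blocked OHWIo4
// layout. Per validate() above, the output's row count must already be
// rounded up to a multiple of 4:
//
// NEReorderKernel reorder;
// ARM_COMPUTE_ERROR_THROW_ON(NEReorderKernel::validate(
//     src.info(), dst.info(), WeightFormat::OHWI, WeightFormat::OHWIo4));
// reorder.configure(&src, &dst, WeightFormat::OHWI, WeightFormat::OHWIo4);
// NEScheduler::get().schedule(&reorder, Window::DimX);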
diff --git a/src/core/NEON/kernels/NEReorderKernel.h b/src/core/NEON/kernels/NEReorderKernel.h
new file mode 100644
index 0000000000..4528b25245
--- /dev/null
+++ b/src/core/NEON/kernels/NEReorderKernel.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__)
+
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL
+#define ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+
+/** Interface kernel to reorder a tensor into a blocked format. */
+class NEReorderKernel : public INEKernel
+{
+public:
+ const char *name() const override
+ {
+ return "NEReorderKernel";
+ }
+
+ /** Default constructor */
+ NEReorderKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReorderKernel(const NEReorderKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReorderKernel &operator=(const NEReorderKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEReorderKernel(NEReorderKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEReorderKernel &operator=(NEReorderKernel &&) = default;
+ /** Default destructor */
+ ~NEReorderKernel() = default;
+
+ /** Initialise the kernel's input and outputs.
+ *
+ * @param[in] input Source tensor with 2 or 4 dimensions. Data types supported: F32.
+     * @param[out] output    Destination tensor. Data type supported: same as @p input. Shape same as @p input except last dimension, which needs to be a multiple of blocking parameter _ksize.
+ * @param[in] input_wf WeightFormat of input.
+ * @param[in] output_wf WeightFormat of output.
+ */
+ void configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref NEReorderKernel
+ *
+ * @param[in] input Source tensor with 2 or 4 dimensions. Data types supported: F32.
+     * @param[in]  output    Destination tensor. Data type supported: same as @p input. Shape same as @p input except last dimension, which needs to be a multiple of blocking parameter _ksize.
+ * @param[in] input_wf WeightFormat of input.
+ * @param[in] output_wf WeightFormat of output.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+ /*****************************************************************************/
+
+private:
+ const ITensor *_input{nullptr}; // Input tensor
+ ITensor *_output{nullptr}; // Output tensor
+ int32_t _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call
+ int32_t _kmax{0}; // Rows in input tensor
+ int32_t _xmax{0}; // Columns in input tensor
+ WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor
+ WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor
+};
+
+} // namespace arm_compute
+#endif /* ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL */
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index a7b830c066..227570405c 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -28,8 +28,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -50,13 +51,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0,
+ "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0,
+ "The height of the input tensor must be a multiple of stride");
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+ const TensorInfo tensor_info_output =
+ output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -65,8 +69,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-NEReorgLayerKernel::NEReorgLayerKernel()
- : _input(nullptr), _output(nullptr), _stride(1)
+NEReorgLayerKernel::NEReorgLayerKernel() : _input(nullptr), _output(nullptr), _stride(1)
{
}
@@ -121,23 +124,26 @@ void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
Iterator out(_output, collapsed_window);
// Perform reorg
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- // Get spatial coords and channels
- const unsigned int w = id[idx_w];
- const unsigned int h = id[idx_h];
- const unsigned int c = id[idx_c];
-
- // Calculate mapping
- const unsigned int offset = c / out_c;
- Coordinates map_coords = id;
- map_coords.set(idx_w, w * stride + offset % stride);
- map_coords.set(idx_h, h * stride + offset / stride);
- map_coords.set(idx_c, c % out_c);
-
- // Perform mapping
- std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), _input->info()->element_size());
- },
- out);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ // Get spatial coords and channels
+ const unsigned int w = id[idx_w];
+ const unsigned int h = id[idx_h];
+ const unsigned int c = id[idx_c];
+
+ // Calculate mapping
+ const unsigned int offset = c / out_c;
+ Coordinates map_coords = id;
+ map_coords.set(idx_w, w * stride + offset % stride);
+ map_coords.set(idx_h, h * stride + offset / stride);
+ map_coords.set(idx_c, c % out_c);
+
+ // Perform mapping
+ std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords),
+ _input->info()->element_size());
+ },
+ out);
}
} // namespace arm_compute
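// Worked example of the mapping above (illustrative, assuming out_c is the
// input channel count): with stride = 2, out_c = 4 and output element
// (w, h, c) = (3, 5, 9), offset = 9 / 4 = 2, so the value is copied from
// input coordinates (3 * 2 + 2 % 2, 5 * 2 + 2 / 2, 9 % 4) = (6, 11, 1).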
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.h b/src/core/NEON/kernels/NEReorgLayerKernel.h
index 38a7d9f196..6e67eb364e 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.h
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEREORGLAYERKERNEL_H
-#define ARM_COMPUTE_NEREORGLAYERKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NEREORGLAYERKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NEREORGLAYERKERNEL_H
#include "src/core/NEON/INEKernel.h"
@@ -60,7 +60,7 @@ public:
*/
void configure(const ITensor *input, ITensor *output, int32_t stride);
- /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuReshapeKernel
+ /** Static function to check if given info will lead to a valid configuration
*
* @param[in] input Source tensor info. Data type supported: All
* @param[in] output Destination tensor info. Data type supported: Same as @p input
@@ -80,4 +80,4 @@ private:
int32_t _stride;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREORGLAYERKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NEREORGLAYERKERNEL_H
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index 758433f89f..b3710555df 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,25 +26,30 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
{
+ ARM_COMPUTE_UNUSED(use_inverted_axis);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis);
    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4,
+ "Current implementation only supports up to 4 dimensions.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -55,42 +60,67 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-NEReverseKernel::NEReverseKernel()
- : _input(nullptr), _output(nullptr), _axis(nullptr)
+NEReverseKernel::NEReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false)
{
}
-void NEReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *axis)
+void NEReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *axis, bool use_inverted_axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
- _input = input;
- _output = output;
- _axis = axis;
+ _input = input;
+ _output = output;
+ _axis = axis;
+ _use_inverted_axis = use_inverted_axis;
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info(), use_inverted_axis));
// Configure kernel window
INEKernel::configure(calculate_max_window(*output->info()));
}
-Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+Status NEReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ bool use_inverted_axis)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis));
return Status{};
}
template <typename T>
-void run_reverse(const Window &window, const ITensor *input, const ITensor *axis, ITensor *output)
+void run_reverse(
+ const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis)
{
- int axis_bit = 0;
- for(unsigned int i = 0; i < axis->info()->dimension(0); ++i)
+ unsigned int axis_bit = 0;
+ const int rank = input->info()->num_dimensions();
+
+ for (unsigned int i = 0; i < axis->info()->dimension(0); ++i)
{
- const int axis_i = *(reinterpret_cast<const int *>(axis->buffer()) + i);
+ int axis_i = *(reinterpret_cast<const int *>(axis->buffer()) + i);
+
+        // The values of the axis tensor must be within [-rank, rank-1].
+ if ((axis_i < -rank) || (axis_i >= rank))
+ {
+            ARM_COMPUTE_ERROR("The values of the axis tensor must be within [-rank, rank-1].");
+ }
+
+        // Negative axis values count from the back, i.e. targeted axis(i) = rank + axis(i)
+ if (axis_i < 0)
+ {
+ axis_i = rank + axis_i;
+ }
+
+        // Convert to ACL's reversed axis indices convention, i.e. inverted_axis = (tensor_rank - 1) - axis
+ if (use_inverted_axis)
+ {
+ axis_i = (rank - 1) - axis_i;
+ }
+
axis_bit |= 1 << axis_i;
}
@@ -103,43 +133,47 @@ void run_reverse(const Window &window, const ITensor *input, const ITensor *axis
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input_it(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()) + x);
-
- // Reverse 0 axis
- if(axis_bit & 0x1)
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- in = wrapper::vrev64(in);
- in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()) + x);
+
+ // Reverse 0 axis
+ if (axis_bit & 0x1)
+ {
+ in = wrapper::vrev64(in);
+ in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ }
+
+ const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x;
+ const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+ const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+ const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+
+ auto out_ptr =
+ reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
+ wrapper::vstore(out_ptr, in);
}
- const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x;
- const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
- const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
- const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
-
- auto out_ptr = reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
- wrapper::vstore(out_ptr, in);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto in = *(reinterpret_cast<T *>(input_it.ptr()) + x);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto in = *(reinterpret_cast<T *>(input_it.ptr()) + x);
- const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x;
- const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
- const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
- const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+ const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x;
+ const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+ const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+ const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
- *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = in;
- }
- },
- input_it);
+ *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) =
+ in;
+ }
+ },
+ input_it);
}
void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
@@ -148,16 +182,16 @@ void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_input->info()->element_size())
+ switch (_input->info()->element_size())
{
case 4:
- run_reverse<uint32_t>(window, _input, _axis, _output);
+ run_reverse<uint32_t>(window, _input, _axis, _output, _use_inverted_axis);
break;
case 2:
- run_reverse<uint16_t>(window, _input, _axis, _output);
+ run_reverse<uint16_t>(window, _input, _axis, _output, _use_inverted_axis);
break;
case 1:
- run_reverse<uint8_t>(window, _input, _axis, _output);
+ run_reverse<uint8_t>(window, _input, _axis, _output, _use_inverted_axis);
break;
default:
ARM_COMPUTE_ERROR("Element size not supported");
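The new axis handling above folds three steps into one pass: range-check each requested axis, normalise negative values, optionally translate framework axis order into ACL dimension order, and record the result as a bit in axis_bit. A minimal standalone sketch of just that logic (plain C++; illustrative names, not the ACL API):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    uint32_t make_axis_bitmask(const std::vector<int> &axes, int rank, bool use_inverted_axis)
    {
        uint32_t axis_bit = 0;
        for (int axis : axes)
        {
            assert(axis >= -rank && axis < rank); // mirrors the kernel's range check
            if (axis < 0)
            {
                axis = rank + axis; // negative axes count from the back
            }
            if (use_inverted_axis)
            {
                axis = (rank - 1) - axis; // framework index -> ACL dimension index
            }
            axis_bit |= 1u << axis;
        }
        return axis_bit;
    }

    int main()
    {
        // Reversing axes {-1, 0} of a rank-4 tensor sets bits 3 and 0: mask == 0b1001.
        return make_axis_bitmask({-1, 0}, 4, false) == 0x9u ? 0 : 1;
    }

The mask is then consulted per dimension when computing mirrored offsets, exactly as in the main loop above: bit 0 flips x, bit 1 flips y, and so on.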
diff --git a/src/core/NEON/kernels/NEReverseKernel.h b/src/core/NEON/kernels/NEReverseKernel.h
index 07b547a327..92261887f4 100644
--- a/src/core/NEON/kernels/NEReverseKernel.h
+++ b/src/core/NEON/kernels/NEReverseKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEREVERSEKERNEL_H
-#define ARM_COMPUTE_NEREVERSEKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NEREVERSEKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NEREVERSEKERNEL_H
#include "src/core/NEON/INEKernel.h"
@@ -52,21 +52,24 @@ public:
~NEReverseKernel() = default;
/** Initialise the kernel's inputs and output
*
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] input Input tensor. Data types supported: All
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+     * @param[in]  use_inverted_axis True if axis indices follow the reversed ACL convention, i.e. acl.dim(0) = tensor_rank - 1
*/
- void configure(const ITensor *input, ITensor *output, const ITensor *axis);
+ void configure(const ITensor *input, ITensor *output, const ITensor *axis, bool use_inverted_axis);
/** Static function to check if given info will lead to a valid configuration of @ref NEReverseKernel
*
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] output Output tensor info. Data type supported: Same as @p input
- * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] input Input tensor info. Data types supported: All
+ * @param[in] output Output tensor info. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+     * @param[in]  use_inverted_axis True if axis indices follow the reversed ACL convention, i.e. acl.dim(0) = tensor_rank - 1
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -75,6 +78,7 @@ private:
const ITensor *_input;
ITensor *_output;
const ITensor *_axis;
+ bool _use_inverted_axis;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREVERSEKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NEREVERSEKERNEL_H
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index 7c988e9fab..7789b828ea 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,10 +29,13 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/select/list.h"
#include <arm_neon.h>
#include <map>
@@ -42,125 +45,91 @@ namespace arm_compute
{
namespace
{
-template <typename ScalarType, typename VectorType>
-void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *))
-{
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator condition(cond, win);
- Iterator input1(in1, win);
- Iterator input2(in2, win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- int x = window_start_x;
- for(; x <= limit; x += window_step_x)
- {
- const auto c = (*condition_conversion)(condition_ptr + x);
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
- }
- for(; x < window_end_x; ++x)
- {
- const auto c = *(condition_ptr + x);
- const auto a = *(input1_ptr + x);
- const auto b = *(input2_ptr + x);
- *(output_ptr + x) = static_cast<bool>(c) ? a : b;
- }
- },
- condition, input1, input2, output);
-}
-
-template <typename ScalarType, typename VectorType>
-void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+struct SelectKernelSelectorData
{
- const auto window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
+ DataType dt;
+ bool is_same_rank;
+};
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
- {
- static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
- });
-}
+using SelectorPtr = std::add_pointer<bool(const SelectKernelSelectorData &data)>::type;
+using KernelPtr =
+ std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
-template <typename ScalarType, typename VectorType>
-void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+struct SelectKernelSelector
{
- const auto window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
- {
- static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
- });
-}
-
-template <typename ScalarType, typename VectorType>
-void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- const auto window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
- {
- static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
- });
-}
-
-template <typename ScalarType>
-void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+ const char *name;
+ const SelectorPtr is_selected;
+ KernelPtr ukernel;
+};
+
+static const SelectKernelSelector available_kernels[] = {
+ {"neon_s8_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)},
+ {"neon_s16_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)},
+ {"neon_s32_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)},
+ {"neon_u8_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)},
+ {"neon_u16_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)},
+ {"neon_u32_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)},
+ {"neon_s8_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)},
+ {"neon_s16_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)},
+ {"neon_s32_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)},
+ {"neon_u8_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)},
+ {"neon_u16_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)},
+ {"neon_u32_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)},
+ {"neon_f16_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)},
+ {"neon_f16_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)},
+ {"neon_f32_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)},
+ {"neon_f32_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)},
+};
+
+const SelectKernelSelector *get_implementation(const SelectKernelSelectorData &data)
{
- ARM_COMPUTE_UNUSED(window);
-
- auto output_ptr = reinterpret_cast<ScalarType *>(out->buffer());
- const auto condition_ptr = reinterpret_cast<const uint8_t *>(cond->buffer());
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(in1->buffer());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(in2->buffer());
-
- const int outer_size = cond->info()->total_size() / cond->info()->element_size();
- const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size;
- int offset = 0;
- const int step = 16 / in1->info()->element_size();
-
- for(int i = 0; i < outer_size; ++i)
+ for (const auto &uk : available_kernels)
{
- int x = offset;
- const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr;
- for(; x <= offset + inner_size - step; x += step)
- {
- wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x));
- }
- if(x <= offset + inner_size - (step / 2))
+ if (uk.is_selected(data))
{
- wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x));
- x += step / 2;
+ return &uk;
}
- for(; x < offset + inner_size; ++x)
- {
- *(output_ptr + x) = *(input_ptr + x);
- }
- offset += inner_size;
}
+ return nullptr;
}
+
} // namespace
NESelectKernel::NESelectKernel()
- : _function(nullptr), _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+    : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
{
}
@@ -178,56 +147,12 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor
_output = output;
_has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
- std::string function_to_call("op_");
- function_to_call += string_from_data_type(x->info()->data_type());
-
- static std::map<std::string, SelectFunction *> map_function;
-
- if(_has_same_rank)
- {
- map_function =
- {
- { "op_S8", &select_op_8<int8_t, uint8x16_t> },
- { "op_S16", &select_op_16<int16_t, uint16x8_t> },
- { "op_S32", &select_op_32<int32_t, uint32x4_t> },
- { "op_U8", &select_op_8<uint8_t, uint8x16_t> },
- { "op_U16", &select_op_16<uint16_t, uint16x8_t> },
- { "op_U32", &select_op_32<uint32_t, uint32x4_t> },
- { "op_F32", &select_op_32<float, uint32x4_t> }
- };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- map_function["op_F16"] = &select_op_16<float16_t, uint16x8_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
- else
- {
- map_function =
- {
- { "op_S8", &select_op_not_same_rank<int8_t> },
- { "op_S16", &select_op_not_same_rank<int16_t> },
- { "op_S32", &select_op_not_same_rank<int32_t> },
- { "op_U8", &select_op_not_same_rank<uint8_t> },
- { "op_U16", &select_op_not_same_rank<uint16_t> },
- { "op_U32", &select_op_not_same_rank<uint32_t> },
- { "op_F32", &select_op_not_same_rank<float> }
- };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- map_function["op_F16"] = &select_op_not_same_rank<float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
-
- auto it = map_function.find(function_to_call);
-
- if(it != map_function.end())
- {
- _function = it->second;
- }
-
Window win = calculate_max_window(*x->info());
INEKernel::configure(win);
}
-Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+Status
+NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x);
@@ -238,9 +163,11 @@ Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, cons
const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
- ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank &&
+ ((c->tensor_shape().num_dimensions() > 1) ||
+ (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
@@ -254,7 +181,12 @@ void NESelectKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_function == nullptr);
- _function(_c, _x, _y, _output, window);
+ ARM_COMPUTE_ERROR_ON(_output == nullptr);
+ ARM_COMPUTE_ERROR_ON(_output->info() == nullptr);
+
+ const auto *uk = get_implementation(SelectKernelSelectorData{_output->info()->data_type(), _has_same_rank});
+ ARM_COMPUTE_ERROR_ON(uk == nullptr);
+ ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
+ uk->ukernel(_c, _x, _y, _output, window);
}
} // namespace arm_compute
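The table above replaces the old string-keyed std::map that was rebuilt in configure(): kernel selection is now a linear scan of a static array in which the first entry whose predicate accepts the runtime data wins. A minimal sketch of the same dispatch pattern (plain C++; the DataType values, entry names and kernels here are made-up stand-ins):

    #include <cstdio>

    enum class DataType
    {
        U8,
        F32
    };

    struct SelectorData
    {
        DataType dt;
        bool     is_same_rank;
    };

    struct Entry
    {
        const char *name;
        bool (*is_selected)(const SelectorData &);
        void (*ukernel)();
    };

    void u8_same_rank_kernel() { std::puts("u8, same rank"); }
    void f32_kernel()          { std::puts("f32"); }

    // Scanned top to bottom; the first accepting predicate wins.
    static const Entry available_kernels[] = {
        {"u8_same_rank", [](const SelectorData &d) { return d.dt == DataType::U8 && d.is_same_rank; },
         u8_same_rank_kernel},
        {"f32", [](const SelectorData &d) { return d.dt == DataType::F32; }, f32_kernel},
    };

    const Entry *get_implementation(const SelectorData &data)
    {
        for (const auto &uk : available_kernels)
        {
            if (uk.is_selected(data))
            {
                return &uk;
            }
        }
        return nullptr;
    }

    int main()
    {
        const Entry *uk = get_implementation(SelectorData{DataType::F32, false});
        if (uk != nullptr)
        {
            uk->ukernel(); // prints "f32"
        }
        return 0;
    }

In the real kernel the REGISTER_*_NEON macros can additionally compile an entry's function pointer down to nullptr when support for that data type is disabled, which is why run() above checks both that an entry was found and that its ukernel pointer is non-null before dispatching.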
diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h
index f7142feff8..4fec42b536 100644
--- a/src/core/NEON/kernels/NESelectKernel.h
+++ b/src/core/NEON/kernels/NESelectKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NESELECTKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -82,22 +83,11 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- /** Common signature for all the specialised select functions
- *
- * @param[in] c Condition input tensor. Data types supported: U8.
- * @param[in] x First input tensor. Data types supported: All.
- * @param[in] y Second input tensor. Data types supported: Same as @p x
- * @param[in] output Output tensor. Data types supported: Same as @p x.
- */
- using SelectFunction = void(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window);
-
- /** Select function to use for the particular tensor types passed to configure() */
- SelectFunction *_function;
- const ITensor *_c; /**< Condition tensor */
- const ITensor *_x; /**< Source tensor 1 */
- const ITensor *_y; /**< Source tensor 2 */
- ITensor *_output; /**< Destination tensor */
- bool _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
+ const ITensor *_c; /**< Condition tensor */
+ const ITensor *_x; /**< Source tensor 1 */
+ const ITensor *_y; /**< Source tensor 2 */
+ ITensor *_output; /**< Destination tensor */
+ bool _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NESELECTKERNEL_H */
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
index 673eace3c1..da023aeb96 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -41,19 +42,22 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *block_info,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2});
ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2});
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
@@ -64,7 +68,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -73,9 +81,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right);
+ TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input, block_shape_x, block_shape_y, padding_left, padding_right);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -86,14 +95,25 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
} // namespace
NESpaceToBatchLayerKernel::NESpaceToBatchLayerKernel()
- : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _padding_left(), _block_shape_x(), _block_shape_y()
+ : _input(nullptr),
+ _block_shape(nullptr),
+ _paddings(nullptr),
+ _output(nullptr),
+ _data_layout(DataLayout::UNKNOWN),
+ _padding_left(),
+ _block_shape_x(),
+ _block_shape_y()
{
}
-void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+void NESpaceToBatchLayerKernel::configure(const ITensor *input,
+ const ITensor *block_shape,
+ const ITensor *paddings,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
_input = input;
_block_shape = block_shape;
@@ -106,15 +126,22 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *b
ICPPKernel::configure(win);
}
-void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ITensor *output)
+void NESpaceToBatchLayerKernel::configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left,
+ padding_right, output->info()));
_input = input;
_output = output;
@@ -128,15 +155,23 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_
INEKernel::configure(win);
}
-Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
return Status{};
}
-Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -146,17 +181,17 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- if(_block_shape != nullptr)
+ if (_block_shape != nullptr)
{
// Retrieve the block shapes dynamically
_block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
_block_shape_y = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(1)));
}
- if(_paddings != nullptr)
+ if (_paddings != nullptr)
{
- const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 0, 0 }));
- const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 1, 0 }));
+ const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({0, 0}));
+ const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({1, 0}));
_padding_left = Size2D(pad_left_x, pad_left_y);
}
const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
@@ -173,57 +208,61 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t out_x = id.x();
- const size_t out_y = id.y();
- const size_t z = id.z();
- const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
- const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
- if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
{
- const int w = batch_id % batch_size;
- const int in_x = pos_x - _padding_left.x();
- const int in_y = pos_y - _padding_left.y();
- Coordinates input_coords{ in_x, in_y, z, w };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- }
- },
- out);
+ const size_t out_x = id.x();
+ const size_t out_y = id.y();
+ const size_t z = id.z();
+ const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+ const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+ if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height &&
+ pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ {
+ const int w = batch_id % batch_size;
+ const int in_x = pos_x - _padding_left.x();
+ const int in_y = pos_y - _padding_left.y();
+ Coordinates input_coords{in_x, in_y, z, w};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ }
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
else
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t out_x = id.y();
- const size_t out_y = id.z();
- const size_t z = id.x();
- const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
- const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
- if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
{
- const int w = batch_id % batch_size;
- const int in_x = pos_x - _padding_left.x();
- const int in_y = pos_y - _padding_left.y();
- Coordinates input_coords{ z, in_x, in_y, w };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- }
- },
- out);
+ const size_t out_x = id.y();
+ const size_t out_y = id.z();
+ const size_t z = id.x();
+ const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+ const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+ if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height &&
+ pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ {
+ const int w = batch_id % batch_size;
+ const int in_x = pos_x - _padding_left.x();
+ const int in_y = pos_y - _padding_left.y();
+ Coordinates input_coords{z, in_x, in_y, w};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ }
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
}
} // namespace arm_compute
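The NCHW branch above recovers, for every output element, the position it would occupy in the padded input and copies only when that position lands inside the real (unpadded) extent. A minimal numeric sketch of the same index arithmetic (plain C++, toy values, not the ACL API):

    #include <cstdio>

    int main()
    {
        const int block_x = 2, block_y = 2; // block shape
        const int batch_size = 1;           // input batch count
        const int pad_left_x = 1, pad_left_y = 1;
        const int width = 4, height = 4;    // unpadded input extent

        const int batch_id = 3;             // output batch being written
        const int out_x = 1, out_y = 1;     // output spatial coordinate

        // Same arithmetic as the NCHW loop above.
        const int pos_x = out_x * block_x + (batch_id / batch_size) % block_x;
        const int pos_y = out_y * block_y + (batch_id / batch_size) / block_x;

        if (pos_y >= pad_left_y && pos_y < pad_left_y + height && pos_x >= pad_left_x && pos_x < pad_left_x + width)
        {
            // Prints: reads input (x=2, y=2, batch=0)
            std::printf("reads input (x=%d, y=%d, batch=%d)\n", pos_x - pad_left_x, pos_y - pad_left_y,
                        batch_id % batch_size);
        }
        else
        {
            std::puts("inside the padding region: nothing is copied");
        }
        return 0;
    }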
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
index 44b8cbb514..6292c07136 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -69,7 +70,12 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
+ void configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -79,7 +85,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -91,7 +100,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
index 7687c50c40..b49c5ee344 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -50,7 +51,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -115,43 +116,45 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t channel_id = id.z();
- const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{ in_x, in_y, z, batch_id };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const size_t channel_id = id.z();
+ const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
+ const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
+ const int z = channel_id % channel_size;
+ Coordinates input_coords{in_x, in_y, z, batch_id};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
else
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t channel_id = id.x();
- const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{ z, in_x, in_y, batch_id };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const size_t channel_id = id.x();
+ const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
+ const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
+ const int z = channel_id % channel_size;
+ Coordinates input_coords{z, in_x, in_y, batch_id};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
}
} // namespace arm_compute
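As with space-to-batch above, the kernel walks the output and gathers from the input; here the block offset is encoded in the output channel index. A minimal numeric sketch of the NCHW mapping (plain C++, toy values, not the ACL API):

    #include <cstdio>

    int main()
    {
        const int block        = 2; // block shape
        const int channel_size = 3; // input channel count
        const int out_x = 1, out_y = 2;
        const int channel_id = 7;   // output channel, in [0, channel_size * block * block)

        // Same arithmetic as the NCHW loop above.
        const int in_x = out_x * block + (channel_id / channel_size) % block;
        const int in_y = out_y * block + (channel_id / channel_size) / block;
        const int in_c = channel_id % channel_size;

        // Prints: output (x=1, y=2, c=7) reads input (x=2, y=5, c=1)
        std::printf("output (x=%d, y=%d, c=%d) reads input (x=%d, y=%d, c=%d)\n", out_x, out_y, channel_id,
                    in_x, in_y, in_c);
        return 0;
    }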
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
index 953b68a401..7d147c5b94 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index 93080e2ac7..225e4fcfd2 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,14 +25,15 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/Utils.h"
#include "src/core/helpers/WindowHelpers.h"
namespace arm_compute
@@ -41,7 +42,12 @@ using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ uint32_t axis,
+ uint32_t idx_input,
+ uint32_t num_tensors,
+ uint32_t rank,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
// Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
@@ -49,10 +55,12 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
ARM_COMPUTE_RETURN_ERROR_ON(idx_input >= num_tensors);
ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != rank);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ compute_stack_shape(*input, axis, num_tensors));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
@@ -60,83 +68,162 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+inline Coordinates
+shift_from_axis_and_replace_coordinate(const Coordinates &id, uint32_t axis, uint32_t idx_input, uint32_t num_dims)
{
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
+ Coordinates id_out = id;
+ for (uint32_t i = num_dims; i > axis; --i)
+ {
+ id_out.set(i, id[i - 1]);
+ }
+ id_out.set(axis, idx_input);
+ return id_out;
+}
+
+void elementwise_stack(const std::vector<ITensor *> &input, ITensor *output, uint32_t axis, const Window &window)
+{
+ Window window_out;
+ window_out.use_tensor_dimensions(output->info()->tensor_shape());
- // Configure kernel window
- Window win = calculate_max_window(*input);
+ const int32_t num_tensors = input.size();
+ const size_t element_size = input[0]->info()->element_size();
+ const uint32_t num_dims = static_cast<uint32_t>(input[0]->info()->num_dimensions());
- return std::make_pair(Status{}, win);
+ for (int32_t idx_input = 0; idx_input < num_tensors; ++idx_input)
+ {
+ Iterator input_it(input[idx_input], window);
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ Coordinates id_out = shift_from_axis_and_replace_coordinate(id, axis, idx_input, num_dims);
+ std::memcpy(output->ptr_to_element(id_out), input_it.ptr(), element_size);
+ },
+ input_it);
+ }
}
-inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input)
+void memcpy_stack(const std::vector<ITensor *> &input, ITensor *output, uint32_t axis, const Window &window)
{
- constexpr int max_out_coord = 5; // Input shape is max a 4D shape, output is max 5D
- Coordinates id_out = id;
- for(unsigned int i = max_out_coord - 1; i > axis; --i)
+ const int32_t element_size = input[0]->info()->element_size();
+ const int32_t chunk_size = input[0]->info()->tensor_shape().total_size_lower(axis) * element_size;
+ const int32_t num_tensors = input.size();
+ const int32_t out_chunk_step = chunk_size * num_tensors;
+
+ const int32_t start_x = window.x().start();
+ const int32_t end_x = window.x().end();
+ const int32_t start_y = window.y().start();
+ const int32_t end_y = window.y().end();
+
+ uint8_t *out_ptr_base = output->buffer() + output->info()->offset_first_element_in_bytes() + start_x * chunk_size;
+
+ for (int32_t x = start_x; x < end_x; ++x)
{
- id_out.set(i, id[i - 1]);
+ const uint8_t *in_ptr =
+ input[x]->buffer() + input[x]->info()->offset_first_element_in_bytes() + start_y * chunk_size;
+ uint8_t *out_ptr = out_ptr_base + start_y * out_chunk_step;
+
+ for (int32_t y = start_y; y < end_y; ++y)
+ {
+ std::memcpy(out_ptr, in_ptr, chunk_size);
+
+ in_ptr += chunk_size;
+ out_ptr += out_chunk_step;
+ }
+
+ out_ptr_base += chunk_size;
}
- id_out.set(axis, idx_input);
- return id_out;
}
+
} // namespace
-NEStackLayerKernel::NEStackLayerKernel()
- : _input(nullptr), _output(nullptr), _axis(), _idx_input()
+NEStackLayerKernel::NEStackLayerKernel() : _input(), _output(nullptr), _axis(), _split_dimension(Window::DimY)
{
}
-void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output)
+void NEStackLayerKernel::configure(const std::vector<ITensor *> &input, uint32_t axis, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- _input = input;
- _output = output;
- _axis = axis;
- _idx_input = idx_input;
+ const int32_t num_tensors = input.size();
+ ARM_COMPUTE_ERROR_ON(num_tensors == 0);
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), axis, num_tensors, output->info());
+ const uint32_t rank = input[0]->info()->num_dimensions();
+ ARM_COMPUTE_UNUSED(rank);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
+ for (int32_t i = 0; i < num_tensors; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input[i]);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input[i]->info(), axis, i, num_tensors, rank, output->info()));
+ }
+
+ auto_init_if_empty(*output->info(), input[0]->info()->clone()->set_tensor_shape(
+ compute_stack_shape(*input[0]->info(), axis, num_tensors)));
+
+ _input = input;
+ _output = output;
+ _axis = axis;
}
-Status NEStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status NEStackLayerKernel::validate(const std::vector<ITensorInfo *> &input, uint32_t axis, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+ const int32_t num_tensors = input.size();
+ const size_t rank = input[0]->num_dimensions();
+
+ for (int32_t i = 0; i < num_tensors; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input[i]);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input[i], axis, i, num_tensors, rank, output));
+ }
+
return Status{};
}
+void NEStackLayerKernel::prepare()
+{
+    // The window is calculated at runtime, since padding may have been added after configure()
+ const ITensorInfo *input_info = _input[0]->info();
+ const int32_t num_dims = input_info->num_dimensions();
+ const int32_t num_tensors = _input.size();
+
+ // Check if there are any paddings in the input tensors
+ bool has_padding = false;
+ for (const ITensor *in : _input)
+ {
+ if (has_holes(*in->info(), num_dims - 1))
+ {
+ has_padding = true;
+ break;
+ }
+ }
+
+ has_padding = has_padding || has_holes(*_output->info(), num_dims);
+
+ Window win;
+ if (!has_padding)
+ {
+ _stack_fn = memcpy_stack;
+
+ // 2D execution window (X,Y): [Num_tensors, Dimensions >= axis]
+ win.set(Window::DimX, Window::Dimension(0, num_tensors, 1));
+ win.set(Window::DimY, Window::Dimension(0, input_info->tensor_shape().total_size_upper(_axis), 1));
+ }
+ else
+ {
+ _stack_fn = elementwise_stack;
+ win = calculate_max_window(*input_info);
+ }
+
+ INEKernel::configure(win);
+}
+
void NEStackLayerKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- Window window_out;
- window_out.use_tensor_dimensions(_output->info()->tensor_shape());
-
- Iterator input(_input, window);
- Iterator output(_output, window_out);
-
- const int stride_x = _output->info()->strides_in_bytes()[0];
- const int stride_y = _output->info()->num_dimensions() >= 1 ? _output->info()->strides_in_bytes()[1] : 0;
- const int stride_z = _output->info()->num_dimensions() >= 2 ? _output->info()->strides_in_bytes()[2] : 0;
- const int stride_w = _output->info()->num_dimensions() >= 3 ? _output->info()->strides_in_bytes()[3] : 0;
- const int stride_k = _output->info()->num_dimensions() >= 4 ? _output->info()->strides_in_bytes()[4] : 0;
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
- const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
- std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
- },
- input);
+ _stack_fn(_input, _output, _axis, window);
}
} // namespace arm_compute
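The new memcpy_stack fast path relies on a simple layout fact: when neither the inputs nor the output have padding, everything below the stack axis is one contiguous chunk, so stacking degenerates to interleaving one chunk per input tensor. A minimal sketch with flat arrays standing in for tensors (plain C++, not the ACL API):

    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main()
    {
        // Two flat "tensors" of six floats each. The contiguous chunk below the
        // stack axis holds three elements.
        const std::vector<float>         in0    = {0, 1, 2, 3, 4, 5};
        const std::vector<float>         in1    = {10, 11, 12, 13, 14, 15};
        const std::vector<const float *> inputs = {in0.data(), in1.data()};

        const size_t chunk_elems = 3;
        const size_t num_chunks  = in0.size() / chunk_elems;
        const size_t num_tensors = inputs.size();

        std::vector<float> out(num_tensors * in0.size());
        for (size_t x = 0; x < num_tensors; ++x) // window X: tensor index
        {
            for (size_t y = 0; y < num_chunks; ++y) // window Y: chunk index
            {
                std::memcpy(out.data() + (y * num_tensors + x) * chunk_elems, inputs[x] + y * chunk_elems,
                            chunk_elems * sizeof(float));
            }
        }

        // Prints: 0 1 2 10 11 12 3 4 5 13 14 15
        for (float v : out)
        {
            std::printf("%g ", v);
        }
        std::printf("\n");
        return 0;
    }

The 2D execution window set up in prepare() (X = tensor index, Y = chunk index) maps directly onto these two loops, which is what lets the fast path split its workload across threads.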
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h
index 9b36518e4d..02ee776ea4 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.h
+++ b/src/core/NEON/kernels/NEStackLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,12 +22,16 @@
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NESTACKLAYERKERNEL_H
-#define ARM_COMPUTE_NESTACKLAYERKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NESTACKLAYERKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NESTACKLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
+#include <cstdint>
+#include <functional>
+
namespace arm_compute
{
class ITensor;
@@ -56,38 +60,48 @@ public:
*
* @note Supported input tensor rank: up to 4
*
- * @param[in] input Input tensor. Data types supported: All
- * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
- * @param[in] idx_input Index of the input tensor in the list of tensors to stack.
- * All tensors in the list must have the same shape
- * @param[in] num_tensors Number of tensors to stack
- * @param[out] output Output tensor. Data types supported: Same as @p input.
+ * @param[in] input Input tensors. Data types supported: All
+ * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
*
*/
- void configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
+ void configure(const std::vector<ITensor *> &input, uint32_t axis, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel
*
* @note Supported input tensor rank: up to 4
*
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
- * @param[in] idx_input Index of the input tensor in the list of tensors to stack
- * All tensors in the list must have the same shape
- * @param[in] num_tensors Number of tensors to stack
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
+ * @param[in] input Input tensor infos. Data types supported: All
+ * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
+ static Status validate(const std::vector<ITensorInfo *> &input, uint32_t axis, const ITensorInfo *output);
+
+ /** Prepare the stack kernel for execution (executed only once),
+ * choosing the window and the algorithm.
+ */
+ void prepare();
// Inherited methods overridden
void run(const Window &window, const ThreadInfo &info) override;
+ /** Get the dimension along which to split the kernel workload
+ *
+ * @return the split dimension
+ */
+ uint32_t get_split_dimension() const
+ {
+ return _split_dimension;
+ }
+
private:
- const ITensor *_input;
- ITensor *_output;
- unsigned int _axis;
- unsigned int _idx_input;
+ std::vector<ITensor *> _input;
+ ITensor *_output;
+ uint32_t _axis;
+ uint32_t _split_dimension;
+
+ std::function<void(const std::vector<ITensor *> &, ITensor *, uint32_t, const Window &)> _stack_fn{};
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NESTACKLAYERKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NESTACKLAYERKERNEL_H
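
A minimal sketch of driving the refactored kernel directly, assuming the tensors have already been initialised and allocated in the usual way (this is an internal kernel; applications would normally go through the NEStackLayer function instead):

    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/core/NEON/kernels/NEStackLayerKernel.h"

    #include <cstdint>
    #include <vector>

    using namespace arm_compute;

    void stack_two(Tensor &in0, Tensor &in1, Tensor &out, uint32_t axis)
    {
        std::vector<ITensor *> inputs{&in0, &in1};

        NEStackLayerKernel kernel;
        kernel.configure(inputs, axis, &out);
        kernel.prepare(); // executed once, before the first run

        // Split the workload along the dimension the kernel reports as splittable.
        NEScheduler::get().schedule(&kernel, kernel.get_split_dimension());
    }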
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
index 1d71339257..efff51be9d 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,9 +26,10 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -38,9 +39,14 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -49,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
- {
- return i == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; }));
// Get expected output shape
- const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
// Checks output if configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
@@ -71,14 +74,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
// Output tensor auto initialization if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
// Create window
@@ -86,81 +93,51 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
return std::make_pair(Status{}, win);
}
-
-void strided_slice_generic(const ITensor *input, ITensor *output,
- const Coordinates &starts, const BiStrides &strides, int32_t shrink_axis_mask,
- const Window &window)
-{
- Iterator output_it(output, window);
- const size_t width_size = input->info()->element_size();
-
- const bool is_shrink_w = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 0);
- const bool is_shrink_h = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 1);
- const bool is_shrink_c = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 2);
- const bool is_shrink_n = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 3);
-
- unsigned int index = 0;
- const int idx_w = is_shrink_w ? 0 : index++;
- const int idx_h = is_shrink_h ? 0 : index++;
- const int idx_c = is_shrink_c ? 0 : index++;
- const int idx_n = is_shrink_n ? 0 : index;
-
- BiStrides shrinked_strides;
- shrinked_strides.set(0, is_shrink_w ? 0 : strides[0]);
- shrinked_strides.set(1, is_shrink_h ? 0 : strides[1]);
- shrinked_strides.set(2, is_shrink_c ? 0 : strides[2]);
- shrinked_strides.set(3, is_shrink_n ? 0 : strides[3]);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int w_coord = starts[0] + (id[idx_w] * shrinked_strides[0]);
- const int h_coord = starts[1] + (id[idx_h] * shrinked_strides[1]);
- const int c_coord = starts[2] + (id[idx_c] * shrinked_strides[2]);
- const int n_coord = starts[3] + (id[idx_n] * shrinked_strides[3]);
-
- Coordinates in_coords(w_coord, h_coord, c_coord, n_coord);
- std::copy_n(input->ptr_to_element(in_coords), width_size, output_it.ptr());
- },
- output_it);
-}
} // namespace
-NEStridedSliceKernel::NEStridedSliceKernel()
- : _starts_abs(), _final_strides(), _shrink_mask()
+NEStridedSliceKernel::NEStridedSliceKernel() : _starts_abs(), _final_strides(), _shrink_mask()
{
}
-void NEStridedSliceKernel::configure(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void NEStridedSliceKernel::configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
-
- _shrink_mask = shrink_axis_mask;
-
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ _shrink_mask = shrink_axis_mask;
const TensorShape &input_shape = input->tensor_shape();
-
- Coordinates ends_abs;
- std::tie(_starts_abs, ends_abs, _final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
- input_shape,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
-
+ Coordinates ends_abs;
+ std::tie(_starts_abs, ends_abs, _final_strides) =
+ arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
// Configure kernel window
- auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ auto win_config =
+ validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
-Status NEStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status NEStridedSliceKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
- starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), starts, ends,
+ strides, begin_mask, end_mask, shrink_axis_mask)
+ .first);
return Status{};
}
@@ -171,9 +148,70 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- // Dispatch kernel
- strided_slice_generic(tensors.get_const_tensor(TensorType::ACL_SRC_0),
- tensors.get_tensor(TensorType::ACL_DST),
- _starts_abs, _final_strides, _shrink_mask, window);
+ const ITensor *input = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const ITensor *output = tensors.get_tensor(TensorType::ACL_DST);
+
+ size_t width_size = input->info()->element_size();
+
+ const bool is_shrink_x = arm_compute::helpers::bit_ops::is_bit_set(_shrink_mask, 0);
+ const bool is_shrink_y = arm_compute::helpers::bit_ops::is_bit_set(_shrink_mask, 1);
+ const bool is_shrink_z = arm_compute::helpers::bit_ops::is_bit_set(_shrink_mask, 2);
+ const bool is_shrink_w = arm_compute::helpers::bit_ops::is_bit_set(_shrink_mask, 3);
+
+ unsigned int index = 0;
+ const int idx_x = is_shrink_x ? 0 : index++;
+ const int idx_y = is_shrink_y ? 0 : index++;
+ const int idx_z = is_shrink_z ? 0 : index++;
+ const int idx_w = is_shrink_w ? 0 : index;
+
+ BiStrides shrinked_strides;
+ shrinked_strides.set(0, is_shrink_x ? 0 : _final_strides[0]);
+ shrinked_strides.set(1, is_shrink_y ? 0 : _final_strides[1]);
+ shrinked_strides.set(2, is_shrink_z ? 0 : _final_strides[2]);
+ shrinked_strides.set(3, is_shrink_w ? 0 : _final_strides[3]);
+
+ Window win = window;
+
+ size_t length_x = win.shape()[0];
+
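+ // Fast path: when the innermost dimension has unit stride and is not shrunk,
+ // the X run is contiguous, so collapse the window's X dimension and copy all
+ // length_x elements per iteration with a single std::copy_n below.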
+ if (_final_strides[0] == 1 && !is_shrink_x)
+ {
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ width_size = width_size * length_x;
+ }
+
+ Iterator output_it(output, win);
+
+ const int start_0 = _starts_abs[0];
+ const int start_1 = _starts_abs[1];
+ const int start_2 = _starts_abs[2];
+ const int start_3 = _starts_abs[3];
+
+ const int shrinked_stride_0 = shrinked_strides[0];
+ const int shrinked_stride_1 = shrinked_strides[1];
+ const int shrinked_stride_2 = shrinked_strides[2];
+ const int shrinked_stride_3 = shrinked_strides[3];
+
+ const int byte_increment_0 = static_cast<int>(input->info()->strides_in_bytes()[0]);
+ const int byte_increment_1 = static_cast<int>(input->info()->strides_in_bytes()[1]);
+ const int byte_increment_2 = static_cast<int>(input->info()->strides_in_bytes()[2]);
+ const int byte_increment_3 = static_cast<int>(input->info()->strides_in_bytes()[3]);
+
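+ // Base pointer of the input; the loop below accumulates the per-dimension byte offsets onto it.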
+ uint8_t *input_base = input->ptr_to_element(Coordinates(0, 0, 0, 0));
+ uint8_t *cur_ptr;
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ cur_ptr = input_base;
+ cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0;
+ cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1;
+ cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2;
+ cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3;
+
+ std::copy_n(cur_ptr, width_size, output_it.ptr());
+ },
+ output_it);
}
} // namespace arm_compute
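
For intuition about the rewritten run_op: every output coordinate id maps to an input byte offset of the sum over d of (starts[d] + id[idx_d] * stride[d]) * byte_increment[d], and when the innermost stride is 1 the whole X run is one contiguous copy. A standalone sketch of the 1-D case under those assumptions (a hypothetical helper, not the kernel itself):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // 1-D strided slice: out[i] = in[start + i * stride] for i in [0, len), assuming
    // the (start, stride, len) triple stays within the input. stride == 1 degenerates
    // to one memcpy, which is the fast path the kernel takes by collapsing DimX.
    void strided_slice_1d(const uint8_t *in, uint8_t *out,
                          std::size_t elem_size, int start, int stride, std::size_t len)
    {
        if (stride == 1)
        {
            std::memcpy(out, in + start * elem_size, len * elem_size);
            return;
        }
        for (std::size_t i = 0; i < len; ++i)
            std::memcpy(out + i * elem_size,
                        in + (start + static_cast<int>(i) * stride) * elem_size, elem_size);
    }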
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.h b/src/core/NEON/kernels/NEStridedSliceKernel.h
index 9ce517417d..a475f09a17 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.h
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
#include <cstdint>
@@ -68,9 +69,14 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ void configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
/** Static function to check if given info will lead to a valid configuration of @ref NEStridedSliceKernel
*
@@ -86,9 +92,14 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
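
The three masks are plain bit fields over the input dimensions. As a worked example of the encoding, a hedged sketch validating the slice src[1:5:2, :, 2] on a 3-D tensor (the tensor infos are assumed to be set up elsewhere):

    #include "arm_compute/core/Types.h"
    #include "src/core/NEON/kernels/NEStridedSliceKernel.h"

    #include <cstdint>

    using namespace arm_compute;

    // begin_mask bit 1: starts[1] is ignored, dimension 1 begins at 0.
    // end_mask bit 1: ends[1] is ignored, dimension 1 runs to its full extent.
    // shrink_axis_mask bit 2: dimension 2 is a single index, dropped from the output.
    Status validate_example_slice(const ITensorInfo *src, const ITensorInfo *dst)
    {
        const Coordinates starts(1, 0, 2);
        const Coordinates ends(5, 0, 3); // ends[2] = starts[2] + 1 for the shrunk axis
        const BiStrides   strides(2, 1, 1);
        const int32_t begin_mask = 1 << 1, end_mask = 1 << 1, shrink_axis_mask = 1 << 2;
        return NEStridedSliceKernel::validate(src, dst, starts, ends, strides,
                                              begin_mask, end_mask, shrink_axis_mask);
    }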
diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp
index 94256dc12d..577ce5b69e 100644
--- a/src/core/NEON/kernels/NETileKernel.cpp
+++ b/src/core/NEON/kernels/NETileKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,15 +44,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
- {
- return e == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; }));
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -59,8 +58,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-NETileKernel::NETileKernel()
- : _input(nullptr), _output(nullptr)
+NETileKernel::NETileKernel() : _input(nullptr), _output(nullptr)
{
}
@@ -95,8 +93,9 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- Window output_window{ window };
- output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), _input->info()->dimension(0)));
+ Window output_window{window};
+ output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(),
+ _input->info()->dimension(0)));
Window out_slice = output_window.first_slice_window_1D();
const auto src_shape = _input->info()->tensor_shape();
@@ -104,17 +103,19 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info)
{
Iterator output_it(_output, out_slice);
- execute_window_loop(out_slice, [&](const Coordinates & id)
- {
- const size_t x = id.x();
- const size_t y = id.y();
- const size_t z = id.z();
- const size_t w = id[3];
- Coordinates input_coords{ x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3] };
- memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), _input->info()->dimension(0) * _input->info()->element_size());
- },
- output_it);
- }
- while(output_window.slide_window_slice_1D(out_slice));
+ execute_window_loop(
+ out_slice,
+ [&](const Coordinates &id)
+ {
+ const size_t x = id.x();
+ const size_t y = id.y();
+ const size_t z = id.z();
+ const size_t w = id[3];
+ Coordinates input_coords{x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3]};
+ memcpy(output_it.ptr(), _input->ptr_to_element(input_coords),
+ _input->info()->dimension(0) * _input->info()->element_size());
+ },
+ output_it);
+ } while (output_window.slide_window_slice_1D(out_slice));
}
} // namespace arm_compute
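
The tile loop above maps each output coordinate back into the source by taking it modulo the source shape, one dimension at a time. A standalone 1-D sketch of that wrap-around (hypothetical helper, not the kernel code):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // 1-D tiling: repeating a source of src_len elements `multiple` times means
    // out[i] = src[i % src_len] -- the same per-dimension wrap-around the kernel uses.
    void tile_1d(const uint8_t *src, uint8_t *dst,
                 std::size_t elem_size, std::size_t src_len, std::size_t multiple)
    {
        const std::size_t out_len = src_len * multiple;
        for (std::size_t i = 0; i < out_len; ++i)
            std::memcpy(dst + i * elem_size, src + (i % src_len) * elem_size, elem_size);
    }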
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
deleted file mode 100644
index 9bef9c30d9..0000000000
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace
-{
-TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
-{
- TensorShape output_shape{ input->tensor_shape() };
-
- output_shape.collapse(3);
- const size_t tmp_dim = output_shape[0];
- output_shape.set(0, output_shape[1]);
- output_shape.set(1, tmp_dim + (has_bias ? 1 : 0));
-
- return output_shape;
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3]));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
- }
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, biases != nullptr));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input)
-{
- Window window = calculate_max_window(*input, Steps());
- window.set(Window::DimX, Window::Dimension(0, input->dimension(0), input->dimension(0)));
- window.set(Window::DimY, Window::Dimension(0, input->dimension(1), input->dimension(1)));
- window.set(Window::DimZ, Window::Dimension(0, input->dimension(2), input->dimension(2)));
-
- // The NEConvolutionLayerWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
-
- return std::make_pair(Status{}, window);
-}
-} // namespace
-
-NEWeightsReshapeKernel::NEWeightsReshapeKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(get_output_shape(input->info(), (bias != nullptr))));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
- (bias != nullptr) ? bias->info() : nullptr,
- output->info()));
-
- _input = input;
- _bias = bias;
- _output = output;
-
- // Configure kernel
- auto win_config = validate_and_configure_window(input->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-}
-
-Status NEWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, biases, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get()).first);
-
- return Status{};
-}
-
-void NEWeightsReshapeKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const unsigned int kernel_size_x = _input->info()->dimension(0);
- const unsigned int kernel_size_y = _input->info()->dimension(1);
- const unsigned int kernel_depth = _input->info()->dimension(2);
- const unsigned int input_stride_x = _input->info()->strides_in_bytes().x();
- const unsigned int input_stride_y = _input->info()->strides_in_bytes().y();
- const unsigned int input_stride_z = _input->info()->strides_in_bytes().z();
- const unsigned int output_stride_y = _output->info()->strides_in_bytes().y();
-
- // Create iterators
- Iterator in(_input, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Get column index
- const int kernel_idx = id[3];
- const int kernel_idz = id[4];
-
- // Setup pointers
- const uint8_t *tmp_input_ptr = in.ptr();
- uint8_t *tmp_output_ptr = _output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
- const uint8_t *curr_input_row_ptr = tmp_input_ptr;
- const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
-
- // Linearize volume
- for(unsigned int d = 0; d < kernel_depth; ++d)
- {
- for(unsigned int j = 0; j < kernel_size_y; ++j)
- {
- for(unsigned int i = 0; i < kernel_size_x; ++i)
- {
- std::memcpy(tmp_output_ptr, tmp_input_ptr, _input->info()->element_size());
- tmp_input_ptr += input_stride_x;
- tmp_output_ptr += output_stride_y;
- }
- curr_input_row_ptr += input_stride_y;
- tmp_input_ptr = curr_input_row_ptr;
- }
- curr_input_depth_ptr += input_stride_z;
- curr_input_row_ptr = curr_input_depth_ptr;
- tmp_input_ptr = curr_input_depth_ptr;
- }
-
- // Add bias
- if(_bias != nullptr)
- {
- std::memcpy(tmp_output_ptr, _bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), _input->info()->element_size());
- }
- },
- in);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.h b/src/core/NEON/kernels/NEWeightsReshapeKernel.h
deleted file mode 100644
index 76eca9fe86..0000000000
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H
-#define ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform reshaping on the weights used by convolution and locally connected layer
- *
- * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
- * In combination with the @ref NEIm2ColKernel can transform a convolution to a matrix multiplication.
- *
- * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
- * @f[
- * \left( \begin{array}{ccc}
- * a000 & a001 & a002 \\
- * a010 & a011 & a012 \\
- * a020 & a021 & a022 \\
- * \end{array} \right)
- * \left( \begin{array}{ccc}
- * a100 & a101 & a102 \\
- * a110 & a111 & a112 \\
- * a120 & a121 & a122 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
- * \end{array} \right)
- * @f]
- */
-class NEWeightsReshapeKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEWeightsReshapeKernel";
- }
- /** Constructor.*/
- NEWeightsReshapeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWeightsReshapeKernel(const NEWeightsReshapeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWeightsReshapeKernel &operator=(const NEWeightsReshapeKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEWeightsReshapeKernel(NEWeightsReshapeKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEWeightsReshapeKernel &operator=(NEWeightsReshapeKernel &&) = default;
- /** Default destructor */
- ~NEWeightsReshapeKernel() = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared.
- * Data types supported: All
- * @param[in] bias The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[out] output The output tensor. Data types supported: Same as @p input
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel
- *
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared.
- * Data types supported: All
- * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[in] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- const ITensor *_bias;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
deleted file mode 100644
index be34980663..0000000000
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
+++ /dev/null
@@ -1,548 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/kernels/convolution/common/utils.hpp"
-#include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-//Batched Gemms
-
-namespace
-{
-inline bool is_kernel_size_supported(DataType data_type, Size2D size)
-{
- const std::array<Size2D, 8> f32_support = { { Size2D(1, 3), Size2D(3, 1), Size2D(5, 5), Size2D(3, 3), Size2D(1, 5), Size2D(5, 1), Size2D(7, 1), Size2D(1, 7) } };
- const std::array<Size2D, 8> f16_support = { { Size2D(3, 3) } };
-
- switch(data_type)
- {
- case DataType::F16:
- return std::end(f16_support) != std::find(std::begin(f16_support), std::end(f16_support), size);
- case DataType::F32:
- return std::end(f32_support) != std::find(std::begin(f32_support), std::end(f32_support), size);
- default:
- return false;
- }
-}
-
-Status validate_arguments_winograd_weight_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-
- const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const auto input_width = input->dimension(idx_width);
- const auto input_height = input->dimension(idx_height);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(input_width, input_height)),
- "Only 1x3, 3x1, 1x5, 5x1, 7x1, 1x7, 3x3 and 5x5 kernels are supported");
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- const Size2D &output_tile = winograd_info.output_tile_size;
- const std::array<Size2D, 8> supported_tile_sizes = { { Size2D(2U, 2U), Size2D(4U, 4U), Size2D(1U, 6U), Size2D(6U, 1U), Size2D(4, 1), Size2D(1, 4), Size2D(2, 1), Size2D(1, 2) } };
- ARM_COMPUTE_RETURN_ERROR_ON(std::end(supported_tile_sizes) == std::find(std::begin(supported_tile_sizes), std::end(supported_tile_sizes), output_tile));
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_filter_transform_shape(*input, winograd_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window_winograd_weight_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_filter_transform_shape(*input, winograd_info)));
- const Window win = calculate_max_window(*input, Steps(), true /* skip border*/);
- return std::make_pair(Status{}, win);
-}
-
-Status validate_arguments_winograd_input_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- const Size2D &kernel_dims = winograd_info.kernel_size;
- const PadStrideInfo &conv_info = winograd_info.convolution_info;
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(kernel_dims.width, kernel_dims.height)),
- "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported");
-
- // Validate configured output
- if(output->total_size() != 0)
- {
- const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window_winograd_input_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
- return std::make_pair(Status{}, calculate_max_window(*input, Steps(), true));
-}
-
-Status validate_arguments_winograd_output_trans(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- const PadStrideInfo &conv_info = winograd_info.convolution_info;
- const Size2D kernel_dims = winograd_info.kernel_size;
-
- // Number of tiles along the X and Y direction
- const unsigned int num_tiles_x = std::ceil((winograd_info.input_dimensions.x() - (kernel_dims.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast<float>
- (winograd_info.output_tile_size.width));
- const unsigned int num_tiles_y = std::ceil((winograd_info.input_dimensions.y() - (kernel_dims.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast<float>
- (winograd_info.output_tile_size.height));
- const Size2D num_tiles = Size2D(num_tiles_x, num_tiles_y);
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != num_tiles.area());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(kernel_dims.width, kernel_dims.height)),
- "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported");
-
- const std::array<unsigned int, 3> supported_gemm_sizes = { { 8U, 16U, 36U } };
- ARM_COMPUTE_RETURN_ERROR_ON(std::end(supported_gemm_sizes) == std::find(std::begin(supported_gemm_sizes), std::end(supported_gemm_sizes), input->dimension(2)));
- ARM_COMPUTE_UNUSED(kernel_dims);
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != size_t(1));
- }
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_output_transform_shape(*input, winograd_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window_winograd_output_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_output_transform_shape(*input, winograd_info)));
-
- return std::make_pair(Status{}, calculate_max_window(*input, Steps(), true));
-}
-} // namespace
-
-Status INEWinogradLayerTransformWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *weights)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- const DataLayout data_layout = input->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(weights->dimension(width_idx), weights->dimension(height_idx))),
- "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported");
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
- return Status{};
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-unsigned int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_weight_storage_size(int num_output_channels, int num_input_channels) const
-{
- const KernelShape shape(num_output_channels, KernelRows, KernelCols, num_input_channels);
- return static_cast<unsigned int>(
- // WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T
- WinogradConv::get_kernel_storage_size(num_input_channels, num_output_channels) / sizeof(T));
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformWeightsKernel()
- : _transform(nullptr), _weights_hwio(nullptr), _output(nullptr), _matrix_stride(0), _num_output_channels(0), _num_input_channels(0)
-{
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(int num_output_channels, int num_input_channels) const
-{
- return WinogradConv::get_kernel_matrix_stride(num_input_channels, num_output_channels);
-}
-
-#ifndef DOXYGEN_SKIP_THIS
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
- const ITensor *weights_hwio,
- ITensor *output,
- const int matrix_stride, /** Stride across matrices in the output. */
- const int num_output_channels, /** Number of filters. */
- const int num_input_channels) /** Number of channels in each filter. */
-{
- _weights_hwio = weights_hwio;
- _output = output;
- _matrix_stride = matrix_stride;
- _num_output_channels = num_output_channels;
- _num_input_channels = num_input_channels;
- _transform = std::make_unique<WeightsTransform>(num_output_channels, num_input_channels);
-
- Window win;
- auto win_last = _transform->get_window();
- win.set(Window::DimX, Window::Dimension(0, win_last, 1));
- INEKernel::configure(win);
-}
-#endif /* DOXYGEN_SKIP_THIS */
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- const size_t fst = window.x().start();
- const size_t lst = window.x().end();
- _transform->set_weight_tensor(_weights_hwio->buffer());
- const int matrix_row_stride = roundup(_num_output_channels, WinogradConv::N_BLOCK);
- _transform->set_output_matrices(_output->buffer(), _matrix_stride, matrix_row_stride);
- _transform->set_working_space(_output->buffer());
-
- _transform->run(fst, lst);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-bool NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
-{
- return false;
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-Status NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *input, const ITensorInfo *output,
- const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_weight_trans(input, output, winograd_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_weight_trans(input->clone().get(), output->clone().get(), winograd_info).first);
- return Status{};
-}
-
-template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>;
-template class NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>;
-template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>;
-template class NEWinogradLayerTransformWeightsKernel<float, 1, 6, 1, 3>;
-template class NEWinogradLayerTransformWeightsKernel<float, 6, 1, 3, 1>;
-
-template class NEWinogradLayerTransformWeightsKernel<float, 1, 4, 1, 5>;
-template class NEWinogradLayerTransformWeightsKernel<float, 4, 1, 5, 1>;
-template class NEWinogradLayerTransformWeightsKernel<float, 1, 2, 1, 7>;
-template class NEWinogradLayerTransformWeightsKernel<float, 2, 1, 7, 1>;
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class NEWinogradLayerTransformWeightsKernel<__fp16, 4, 4, 3, 3>;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-// Input transform
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-unsigned int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_input_storage_size(
- int num_batches, /* Number of batches in the input tensor. */
- int num_channels, /* Number of feature maps in the input tensor. */
- int num_rows, /* Number of rows in each feature map. */
- int num_cols, /* Number of columns in each feature map. */
- bool same_padding /* Use "SAME" padding, otherwise use "VALID". */
-) const
-{
- // Construct shapes for the input and kernel tensors.
- const Tensor4DShape input_shape(num_batches, num_rows, num_cols, num_channels);
- const KernelShape kern_shape(1, KernelRows, KernelCols, num_channels);
- // Return the size, converted into units of TIn
- return static_cast<unsigned int>(WinogradConv::get_input_storage_size(num_batches, num_rows, num_cols, num_channels, same_padding) / sizeof(T));
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-unsigned int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_working_space_size(unsigned int num_threads) const
-{
- return _transform->get_working_space_size(num_threads) / sizeof(T);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
- int num_batches, /* Number of batches in the input tensor. */
- int num_channels, /* Number of feature maps in the input tensor. */
- int num_rows, /* Number of rows in each feature map. */
- int num_cols, /* Number of columns in each feature map. */
- bool same_padding /* Use "SAME" padding, otherwise use "VALID". */) const
-{
- return WinogradConv::get_input_matrix_stride(num_batches, num_rows, num_cols, num_channels, same_padding);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformInputKernel()
- : _transform(nullptr), _input_nhwc(nullptr), _num_batches(0), _num_rows(0), _num_cols(0), _num_channels(0), _padding(), _output(nullptr), _matrix_stride(0), _padding_top(), _padding_left(),
- _padding_right(), _padding_bottom(), _workspace(nullptr)
-{
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
- const ITensor *input_nhwc,
- const int num_batches, /* Number of batches in input tensor. */
- const int num_rows, /* Number of rows in input tensor. */
- const int num_cols, /* Number of columns in input tensor. */
- const int num_channels, /* Number of channels in input tensor. */
- const PaddingType padding, /* Padding type. */
- ITensor *output, /* Base of output matrices. */
- const int matrix_stride, /* Stride between output matrices. */
- ITensor *workspace)
-{
- _input_nhwc = input_nhwc;
- _num_batches = num_batches;
- _num_rows = num_rows;
- _num_cols = num_cols;
- _num_channels = num_channels;
- _padding = padding;
- _output = output;
- _matrix_stride = matrix_stride;
- _workspace = workspace;
-
- _padding_top = (padding == PADDING_SAME) ? (KernelRows - 1) / 2 : 0;
- _padding_left = (padding == PADDING_SAME) ? (KernelCols - 1) / 2 : 0;
- _padding_bottom = (padding == PADDING_SAME) ? iceildiv(KernelRows - 1, 2) : 0;
- _padding_right = (padding == PADDING_SAME) ? iceildiv(KernelCols - 1, 2) : 0;
-
- _transform = std::make_unique<InputTransform>(
- KernelRows,
- KernelCols,
- num_batches,
- num_rows,
- num_cols,
- num_channels,
- _padding_top, /**< Padding to apply to the top of the image. */
- _padding_left, /**< Padding to apply to the left of the image. */
- _padding_bottom, /**< Padding to apply to the bottom of the image. */
- _padding_right /**< Padding to apply to the right of the image. */
- );
-
- Window win;
- auto win_last = _transform->get_window();
- win.set(Window::DimX, Window::Dimension(0, win_last, 1));
- INEKernel::configure(win);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_workspace);
-
- const int element_size_in_bytes = _input_nhwc->info()->element_size();
- const int input_col_stride = _input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes;
- const int input_row_stride = _input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes;
- const int input_batch_stride = _input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes;
- const auto input_nhwc_ptr = reinterpret_cast<const T *>(_input_nhwc->buffer() + _input_nhwc->info()->offset_first_element_in_bytes());
- auto output_ptr = reinterpret_cast<T *>(_output->buffer() + _output->info()->offset_first_element_in_bytes());
- ARM_COMPUTE_ERROR_ON_NULLPTR(output_ptr);
-
- _transform->set_input_tensor(input_nhwc_ptr, input_batch_stride, input_row_stride, input_col_stride);
- _transform->set_output_matrices(output_ptr, _matrix_stride, _num_channels);
-
- _transform->set_working_space(_workspace->buffer());
-
- // The code below cannot be moved to configure because biases hasn't been allocated at that point
- const size_t fst = window.x().start();
- const size_t lst = window.x().end();
- _transform->run(fst, lst, info.thread_id);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-Status NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_input_trans(input, output, winograd_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_input_trans(input->clone().get(), output->clone().get(), winograd_info).first);
-
- return Status{};
-}
-
-template class NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>;
-template class NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>;
-template class NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>;
-template class NEWinogradLayerTransformInputKernel<float, 1, 6, 1, 3>;
-template class NEWinogradLayerTransformInputKernel<float, 6, 1, 3, 1>;
-
-template class NEWinogradLayerTransformInputKernel<float, 1, 4, 1, 5>;
-template class NEWinogradLayerTransformInputKernel<float, 4, 1, 5, 1>;
-template class NEWinogradLayerTransformInputKernel<float, 1, 2, 1, 7>;
-template class NEWinogradLayerTransformInputKernel<float, 2, 1, 7, 1>;
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class NEWinogradLayerTransformInputKernel<__fp16, 4, 4, 3, 3>;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-// Output transform
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-unsigned int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_storage_size(
- int num_batches, /* Number of batches in the output tensor. */
- int num_rows, /* Number of rows in each feature map of the input tensor. */
- int num_cols, /* Number of columns in each feature map of the input tensor. */
- int num_output_channels /* Number of feature maps in the output tensor. */
-) const
-{
- // Construct shapes for the input and kernel tensors.
- const Tensor4DShape input_shape(num_batches, num_rows, num_cols, 1);
- const KernelShape kern_shape(num_output_channels, KernelRows, KernelCols, 1);
- // Return the size, converted into units of TOut
- return static_cast<unsigned int>(
- WinogradConv::get_output_storage_size(num_batches, num_rows, num_cols, num_output_channels) / sizeof(T));
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformOutputKernel()
- : _transform(nullptr), _biases(nullptr), _transformed_output(nullptr), _workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output_nhwc(nullptr), _num_batches(0), _num_rows(0),
- _num_cols(0), _num_channels(0)
-{
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-unsigned int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_working_space_size(unsigned int num_threads) const
-{
- return _transform->get_working_space_size(num_threads) / sizeof(T);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
- int num_batches, /* Number of batches in the output tensor. */
- int num_rows, /* Number of rows in each feature map of the input tensor. */
- int num_cols, /* Number of columns in each feature map of the input tensor. */
- int num_output_channels /* Number of feature maps in the output tensor. */
-) const
-{
- return WinogradConv::get_output_matrix_stride(num_batches, num_rows, num_cols, num_output_channels);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-std::pair<unsigned int, unsigned int> NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_shape(
- int num_rows, /* Number of rows in each feature map of the input tensor. */
- int num_cols, /* Number of columns in each feature map of the input tensor. */
- bool padding_same) const
-{
- return WinogradConv::get_output_shape(std::make_pair<unsigned int, unsigned int>(num_rows, num_cols), padding_same);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
- const ITensor *biases,
- const ITensor *transformed_output,
- const int matrix_stride,
- ITensor *output_nhwc,
- const int num_batches,
- const int num_rows,
- const int num_cols,
- const int num_channels,
- ITensor *workspace,
- const arm_gemm::Activation &activation)
-{
- _biases = biases;
- _workspace = workspace;
- _transformed_output = transformed_output;
- _matrix_stride = matrix_stride;
- _matrix_row_stride = roundup(num_channels, WinogradConv::N_BLOCK);
- _output_nhwc = output_nhwc;
- _num_batches = num_batches;
- _num_rows = num_rows;
- _num_cols = num_cols;
- _num_channels = num_channels;
- // We don't have the biases buffer at this stage as it hasn't been allocated; we pass in nullptr. OutputTransform is only used here to compute the window.
- _transform = std::make_unique<OutputTransform>(num_batches, num_rows, num_cols, num_channels, activation);
- Window win;
- auto win_last = _transform->get_window();
- win.set(Window::DimX, Window::Dimension(0, win_last, 1));
-
- INEKernel::configure(win);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_workspace);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_transformed_output);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_output_nhwc);
-
- const int out_batch_stride = _output_nhwc->info()->strides_in_bytes()[3] / sizeof(T);
- const int out_row_stride = _output_nhwc->info()->strides_in_bytes()[2] / sizeof(T);
- const int out_col_stride = _output_nhwc->info()->strides_in_bytes()[1] / sizeof(T);
-
- _transform->set_input_matrices(_transformed_output->buffer(), _matrix_stride, _matrix_row_stride);
- _transform->set_bias((_biases ? reinterpret_cast<T *>(_biases->buffer() + _biases->info()->offset_first_element_in_bytes()) : nullptr));
- _transform->set_output_tensor(_output_nhwc->buffer() + _output_nhwc->info()->offset_first_element_in_bytes(), out_batch_stride, out_row_stride, out_col_stride);
- _transform->set_working_space(_workspace->buffer());
- // The code below cannot be moved to configure because the biases buffer hasn't been allocated at that point
- const size_t fst = window.x().start();
- const size_t lst = window.x().end();
- _transform->run(fst, lst, info.thread_id);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-Status NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_output_trans(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_output_trans(input->clone().get(), output->clone().get(), winograd_info).first);
-
- return Status{};
-}
-
-template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>;
-template class NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>;
-template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>;
-template class NEWinogradLayerTransformOutputKernel<float, 1, 6, 1, 3>;
-template class NEWinogradLayerTransformOutputKernel<float, 6, 1, 3, 1>;
-
-template class NEWinogradLayerTransformOutputKernel<float, 1, 4, 1, 5>;
-template class NEWinogradLayerTransformOutputKernel<float, 4, 1, 5, 1>;
-template class NEWinogradLayerTransformOutputKernel<float, 1, 2, 1, 7>;
-template class NEWinogradLayerTransformOutputKernel<float, 2, 1, 7, 1>;
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class NEWinogradLayerTransformOutputKernel<__fp16, 4, 4, 3, 3>;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
deleted file mode 100644
index 75d257de4b..0000000000
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
+++ /dev/null
@@ -1,597 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/NEON/kernels/convolution/common/convolution.hpp"
-#include "src/core/NEON/kernels/convolution/common/tensor.hpp"
-
-#include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to perform Winograd input transform. */
-class INEWinogradLayerTransformInputKernel : public INEKernel
-{
-public:
- /** Get the working space required to perform the transformation.
- *
- * Note that the working space is only required while performing the
- * transformation; hence it can be reused whenever the transformation is
- * not running.
- *
- * @param num_threads The greatest number of threads that will be used to execute the transform.
- * @return Size of working space required in bytes.
- */
- virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0;
-
- /** Determine how much memory (in units of TIn) to allocate for the
- * transformed input.
- *
- * @param[in] num_batches Number of batches in the input tensor.
- * @param[in] num_channels Number of feature maps in the input tensor.
- * @param[in] num_rows Number of rows in each feature map.
- * @param[in] num_cols Number of columns in each feature map.
- * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
- *
- * @return Storage size (in units of TIn) required.
- */
- virtual unsigned int get_input_storage_size(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0;
-
- /** Gets the stride between matrices in the input workspace
- *
- * @param[in] num_batches Number of batches in the input tensor.
- * @param[in] num_channels Number of feature maps in the input tensor.
- * @param[in] num_rows Number of rows in each feature map.
- * @param[in] num_cols Number of columns in each feature map.
- * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
- *
- * @return Stride expressed in bytes.
- */
- virtual int get_matrix_stride(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0;
-
- /** Configure the input transform kernel.
- *
- * @param[in] input_nhwc Input tensor in NHWC data layout format.
- * @param[in] num_batches Number of batches in input tensor.
- * @param[in] num_rows Number of rows in input tensor.
- * @param[in] num_cols Number of columns in input tensor.
- * @param[in] num_channels Number of channels in input tensor.
- * @param[in] padding Padding type.
- * @param[out] output Base of output matrices.
- * @param[in] matrix_stride Stride between output matrices.
- * @param[in] workspace Tensor to be used as the working space during the computation.
- */
- virtual void configure(const ITensor *input_nhwc, const int num_batches, const int num_rows, const int num_cols, const int num_channels,
- const PaddingType padding, ITensor *output, const int matrix_stride, ITensor *workspace) = 0;
-
- /** Destructor */
- virtual ~INEWinogradLayerTransformInputKernel()
- {
- }
-};
-
-/** Kernel to perform Winograd input transform. */
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-class NEWinogradLayerTransformInputKernel : public INEWinogradLayerTransformInputKernel
-{
-public:
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradLayerTransformInputKernel(const NEWinogradLayerTransformInputKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradLayerTransformInputKernel &operator=(const NEWinogradLayerTransformInputKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEWinogradLayerTransformInputKernel(NEWinogradLayerTransformInputKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEWinogradLayerTransformInputKernel &operator=(NEWinogradLayerTransformInputKernel &&) = default;
- /** Default destructor */
- ~NEWinogradLayerTransformInputKernel() = default;
-
- /** Determine how much memory (in units of TIn) to allocate for the
- * transformed input.
- *
- * @param[in] num_batches Number of batches in the input tensor.
- * @param[in] num_channels Number of feature maps in the input tensor.
- * @param[in] num_rows Number of rows in each feature map.
- * @param[in] num_cols Number of columns in each feature map.
- * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
- *
- * @return Storage size (in units of TIn) required.
- */
- unsigned int get_input_storage_size(
- int num_batches,
- int num_channels,
- int num_rows,
- int num_cols,
- bool same_padding) const override;
-
- /** Get the working space required to perform the transformation.
- *
- * Note that the working space is only required while performing the
- * transformation; hence it can be reused whenever the transformation is
- * not running.
- *
- * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
- *
- * @return Size of working space required in bytes.
- */
- unsigned int get_working_space_size(unsigned int num_threads) const override;
-
- /** Gets the stride between matrices in the input workspace
- *
- * @param[in] num_batches Number of batches in the input tensor.
- * @param[in] num_channels Number of feature maps in the input tensor.
- * @param[in] num_rows Number of rows in each feature map.
- * @param[in] num_cols Number of columns in each feature map.
- * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
- *
- * @return Stride expressed in bytes.
- */
- int get_matrix_stride(
- int num_batches,
- int num_channels,
- int num_rows,
- int num_cols,
- bool same_padding) const override;
-
- /** Default constructor */
- NEWinogradLayerTransformInputKernel();
-
- const char *name() const override
- {
- return "NEWinogradLayerTransformInputKernel";
- }
-
- /** Configure the input transform kernel.
- *
- * @param[in] input_nhwc Input tensor. Data types supported: F16/F32. Layout supported NHWC.
- * @param[in] num_batches Number of batches in input tensor.
- * @param[in] num_rows Number of rows in input tensor.
- * @param[in] num_cols Number of columns in input tensor.
- * @param[in] num_channels Number of channels in input tensor.
- * @param[in] padding Padding type.
- * @param[out] output Base of output matrices.
- * @param[in] matrix_stride Stride between output matrices.
- * @param[in] workspace Tensor to be used as the working space during the computation.
- */
- void configure(
- const ITensor *input_nhwc,
- const int num_batches,
- const int num_rows,
- const int num_cols,
- const int num_channels,
- const PaddingType padding,
- ITensor *output,
- const int matrix_stride,
- ITensor *workspace) override;
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
- /** Winograd base kernel */
- using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
- /** Winograd convolution kernel */
- using WinogradConv = typename WinogradBase::template Convolution<T, T>;
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformInputKernel
- *
- * @param[in] input First tensor input info. Data types supported: F16/F32.
- * @param[in] output Output tensor info. Data types supported: same as @p input.
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
-private:
- using InputTransform = typename WinogradBase::template InputTransform<T, T>;
-
- std::unique_ptr<InputTransform> _transform{ nullptr };
- const ITensor *_input_nhwc;
- int _num_batches; /**< Number of batches in input tensor. */
- int _num_rows; /**< Number of rows in input tensor. */
- int _num_cols; /**< Number of columns in input tensor. */
- int _num_channels; /**< Number of channels in input tensor. */
- PaddingType _padding; /**< Padding type. */
- ITensor *_output; /**< Base of output matrices. */
- int _matrix_stride; /**< Stride between output matrices. */
- int _padding_top; /**< Padding to apply to the top of the image. */
- int _padding_left; /**< Padding to apply to the left of the image. */
- int _padding_right; /**< Padding to apply to the right of the image. */
- int _padding_bottom; /**< Padding to apply to the bottom of the image. */
- ITensor *_workspace;
-};
-
-/** Interface for the kernel to perform Winograd output transform. */
-class INEWinogradLayerTransformOutputKernel : public INEKernel
-{
-public:
- /** Get the working space required to perform the transformation.
- *
- * Note that the working space is only required while performing the
- * transformation; hence it can be reused whenever the transformation is
- * not running.
- *
- * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
- *
- * @return Size of working space required in bytes.
- */
- virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0;
-
- /** Determine how much memory (in units of TOut) to allocate for the
- * (Winograd domain) output.
- *
- * @param[in] num_batches Number of batches in the output tensor.
- * @param[in] num_rows Number of rows in each feature map of the input tensor.
- * @param[in] num_cols Number of columns in each feature map of the input tensor.
- * @param[in] num_output_channels Number of feature maps in the output tensor.
- *
- * @return Storage size (in units of TOut) required.
- */
- virtual unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0;
-
- /** Gets the stride between matrices in the output workspace
- *
- * @param[in] num_batches Number of batches in the output tensor.
- * @param[in] num_rows Number of rows in each feature map of the input tensor.
- * @param[in] num_cols Number of columns in each feature map of the input tensor.
- * @param[in] num_output_channels Number of feature maps in the output tensor.
- *
- * @return Stride expressed in bytes.
- */
- virtual int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0;
-
- /** Get the output shape of a convolution.
- *
- * @param[in] num_rows Number of rows in each feature map of the input tensor.
- * @param[in] num_cols Number of columns in each feature map of the input tensor.
- * @param[in] padding_same True if padding is SAME, false otherwise
- *
- * @return Shape of the output tensor
- */
- virtual std::pair<unsigned int, unsigned int> get_output_shape(
- int num_rows, /* Number of rows in each feature map of the input tensor. */
- int num_cols, /* Number of columns in each feature map of the input tensor. */
- bool padding_same /* True if padding is SAME, false otherwise */
- ) const = 0;
-
- /** Configure the output transform kernel.
- *
- * @param[in] biases Pointer to the biases tensor.
- * @param[in] transformed_output Pointer to working space for the output tensor in the Winograd domain.
- * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride()
- * @param[out] output_nhwc Pointer to the output tensor, in NHWC data layout, in the spatial domain.
- * @param[in] num_batches Number of batches in the input tensor.
- * @param[in] num_rows Number of rows in output tensor.
- * @param[in] num_cols Number of columns in output tensor.
- * @param[in] num_channels Number of feature maps in the output tensor.
- * @param[in] workspace Tensor to be used as the working space during the computation.
- * @param[in] activation Activation to be used
- */
- virtual void configure(
- const ITensor *biases,
- const ITensor *transformed_output,
- const int matrix_stride,
- ITensor *output_nhwc,
- const int num_batches,
- const int num_rows,
- const int num_cols,
- const int num_channels,
- ITensor *workspace,
- const arm_gemm::Activation &activation) = 0;
-
- virtual ~INEWinogradLayerTransformOutputKernel()
- {
- }
-};
-
-/** Kernel to perform Winograd output transform. */
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-class NEWinogradLayerTransformOutputKernel : public INEWinogradLayerTransformOutputKernel
-{
-public:
- const char *name() const override
- {
- return "NEWinogradLayerTransformOutputKernel";
- }
- /** Constructor */
- NEWinogradLayerTransformOutputKernel();
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradLayerTransformOutputKernel(const NEWinogradLayerTransformOutputKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradLayerTransformOutputKernel &operator=(const NEWinogradLayerTransformOutputKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEWinogradLayerTransformOutputKernel(NEWinogradLayerTransformOutputKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEWinogradLayerTransformOutputKernel &operator=(NEWinogradLayerTransformOutputKernel &&) = default;
- /** Default destructor */
- ~NEWinogradLayerTransformOutputKernel() = default;
-
- // Inherited methods overridden:
- /** Determine how much memory (in units of TOut) to allocate for the
- * (Winograd domain) output.
- *
- * @param[in] num_batches Number of batches in the output tensor.
- * @param[in] num_rows Number of rows in each feature map of the input tensor.
- * @param[in] num_cols Number of columns in each feature map of the input tensor.
- * @param[in] num_output_channels Number of feature maps in the output tensor.
- *
- * @return Storage size (in units of TOut) required.
- */
- unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const override;
-
- /** Gets the stride between matrices in the output workspace
- *
- * @param[in] num_batches Number of batches in the output tensor.
- * @param[in] num_rows Number of rows in each feature map of the input tensor.
- * @param[in] num_cols Number of columns in each feature map of the input tensor.
- * @param[in] num_output_channels Number of feature maps in the output tensor.
- *
- * @return Stride expressed in bytes.
- */
- int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const override;
- /** Get the output shape of a convolution.
- *
- * @param[in] num_rows Number of rows in each feature map of the input tensor.
- * @param[in] num_cols Number of columns in each feature map of the input tensor.
- * @param[in] padding_same True if padding is SAME, false otherwise
- *
- * @return Shape of the output tensor
- */
- std::pair<unsigned int, unsigned int> get_output_shape(
- int num_rows, /* Number of rows in each feature map of the input tensor. */
- int num_cols, /* Number of columns in each feature map of the input tensor. */
- bool padding_same) const override;
-
- /** Get the working space required to perform the transformation.
- *
- * Note that the working space is only required while performing the
- * transformation; hence it can be reused whenever the transformation is
- * not running.
- *
- * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
- *
- * @return Size of working space required in bytes.
- */
- unsigned int get_working_space_size(unsigned int num_threads) const override;
-
- /** Configure the output transform kernel.
- *
- * @param[in] biases Pointer to the biases tensor.
- * @param[in] transformed_output Pointer to working space for the output tensor in the Winograd domain.
- * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride()
- * @param[out] output_nhwc Pointer to a tensor with NHWC data layout, in the spatial domain.
- * @param[in] num_batches Number of batches in the input tensor.
- * @param[in] num_rows Number of rows in output tensor.
- * @param[in] num_cols Number of columns in output tensor.
- * @param[in] num_channels Number of feature maps in the output tensor.
- * @param[in] workspace Tensor to be used as the working space during the computation.
- * @param[in] activation Activation to be used
- */
- void configure(
- const ITensor *biases,
- const ITensor *transformed_output,
- const int matrix_stride,
- ITensor *output_nhwc,
- const int num_batches,
- const int num_rows,
- const int num_cols,
- const int num_channels,
- ITensor *workspace,
- const arm_gemm::Activation &activation) override;
-
- void run(const Window &window, const ThreadInfo &info) override;
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformOutputKernel
- *
- * @param[in] input Source tensor info with shape [C, N, 16, batches] or [C, N, 36, batches]. Data types supported: F16/F32.
- * @param[in] bias Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
- * @param[in] output Destination tensor info with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. Data type supported: same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
-private:
- using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
- using WinogradConv = typename WinogradBase::template Convolution<T, T>;
- using OutputTransform = typename WinogradBase::template OutputTransform<T, T>;
-
- std::unique_ptr<OutputTransform> _transform{ nullptr };
- const ITensor *_biases;
- const ITensor *_transformed_output;
- ITensor *_workspace;
- int _matrix_stride;
- int _matrix_row_stride;
- ITensor *_output_nhwc;
- int _num_batches;
- int _num_rows;
- int _num_cols;
- int _num_channels;
-};
-
-/** Interface for the kernel to perform Winograd weights transform. */
-class INEWinogradLayerTransformWeightsKernel : public INEKernel
-{
-public:
- /** Allow instances of this class to be copied */
- INEWinogradLayerTransformWeightsKernel(const INEWinogradLayerTransformWeightsKernel &) = default;
- /** Allow instances of this class to be copied */
- INEWinogradLayerTransformWeightsKernel &operator=(const INEWinogradLayerTransformWeightsKernel &) = default;
- /** Allow instances of this class to be moved */
- INEWinogradLayerTransformWeightsKernel(INEWinogradLayerTransformWeightsKernel &&) = default;
- /** Allow instances of this class to be moved */
- INEWinogradLayerTransformWeightsKernel &operator=(INEWinogradLayerTransformWeightsKernel &&) = default;
-
- INEWinogradLayerTransformWeightsKernel()
- {
- }
- virtual ~INEWinogradLayerTransformWeightsKernel()
- {
- }
- /** Determine how much memory (in units of T) to allocate for the
- * transformed weights.
- *
- * @param[in] num_output_channels Number of output feature maps.
- * @param[in] num_input_channels Number of input feature maps.
- *
- * @return Storage size (in units of T) required.
- */
- virtual unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const = 0;
- /** Gets the stride between matrices in the kernel workspace
- *
- * @param[in] num_output_channels Number of output feature maps.
- * @param[in] num_input_channels Number of input feature maps.
- *
- * @return Stride expressed in bytes.
- */
- virtual int get_matrix_stride(int num_output_channels, int num_input_channels) const = 0;
-
- /** Configure the weights transform kernel.
- *
- * @param[in] weights_hwio Pointer to the weights tensor
- * @param[out] output Pointer to working space for the output tensor in the Winograd domain.
- * @param[in] matrix_stride Stride across matrices in the output workspace.
- * @param[in] num_output_channels Number of filters.
- * @param[in] num_input_channels Number of channels in each filter.
- */
- virtual void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0;
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformWeightsKernel
- *
- * @param[in] input First tensor input info. Data types supported: F16/F32.
- * @param[in] weights Weights tensor info. Data types supported: same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights);
-};
-
-/** Kernel to perform Winograd weights transform. */
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformWeightsKernel
-{
-public:
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradLayerTransformWeightsKernel(const NEWinogradLayerTransformWeightsKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradLayerTransformWeightsKernel &operator=(const NEWinogradLayerTransformWeightsKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEWinogradLayerTransformWeightsKernel(NEWinogradLayerTransformWeightsKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEWinogradLayerTransformWeightsKernel &operator=(NEWinogradLayerTransformWeightsKernel &&) = default;
- /** Default destructor */
- ~NEWinogradLayerTransformWeightsKernel() = default;
-
- /** Default constructor. */
- NEWinogradLayerTransformWeightsKernel();
- const char *name() const override
- {
- return "NEWinogradLayerTransformWeightsKernel";
- }
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformWeightsKernel
- *
- * @param[in] input Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout).
- * kernel_x must be 3 and equal to kernel_y. Data types supported: F16/F32.
- * @param[in] output Destination tensor info. The output is a 3D tensor with dimensions [OFM, IFM, 16] or [OFM, IFM, 36]. Data type supported: same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
- // Inherited methods overridden:
-
-#ifndef DOXYGEN_SKIP_THIS
- /** Configure the weights transform kernel.
- *
- * @param[in] weights_hwio Pointer to the weights tensor
- * @param[out] output Pointer to working space for the output tensor in the Winograd domain.
- * @param[in] matrix_stride Stride across matrices in the output workspace.
- * @param[in] num_output_channels Number of filters.
- * @param[in] num_input_channels Number of channels in each filter.
- */
- void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override;
-#endif /* DOXYGEN_SKIP_THIS */
-
- /** Determine how much memory (in units of T) to allocate for the
- * transformed weights.
- *
- * @param[in] num_output_channels Number of output feature maps.
- * @param[in] num_input_channels Number of input feature maps.
- *
- * @return Storage size (in units of T) required.
- */
- unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const override;
-
- /** Gets the stride between matrices in the kernel workspace
- *
- * @param[in] num_output_channels Number of output feature maps.
- * @param[in] num_input_channels Number of input feature maps.
- *
- * @return Stride expressed in bytes.
- */
- int get_matrix_stride(int num_output_channels, int num_input_channels) const override;
- void run(const Window &window, const ThreadInfo &info) override;
- bool is_parallelisable() const override;
-
-private:
- using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
- using WinogradConv = typename WinogradBase::template Convolution<T, T>;
- using WeightsTransform = typename WinogradBase::template WeightsTransform<T, T>;
-
- std::unique_ptr<WeightsTransform> _transform{ nullptr };
- const ITensor *_weights_hwio;
- ITensor *_output;
- int _matrix_stride;
- int _num_output_channels;
- int _num_input_channels;
-};
-
- /** Configuration of the kernels used to perform Winograd. */
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-class NEWinogradLayerConfiguration
-{
-public:
- /** Winograd base kernel */
- using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
- /** Winograd convolution kernel */
- using WinogradConv = typename WinogradBase::template Convolution<TIn, TOut>;
-
- using TransformInputKernel = NEWinogradLayerTransformInputKernel<TIn, OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
- using TransformWeightsKernel = NEWinogradLayerTransformWeightsKernel<TIn, OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
- using TransformOutputKernel = NEWinogradLayerTransformOutputKernel<TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
-};
-
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H*/
diff --git a/src/core/NEON/kernels/arm_conv/addressing.cpp b/src/core/NEON/kernels/arm_conv/addressing.cpp
new file mode 100644
index 0000000000..2460398880
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/addressing.cpp
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "addressing.hpp"
+#include "utils.hpp"
+#include <algorithm>
+#include <cstring>
+
+namespace arm_conv {
+namespace addressing {
+
+void fill_pointer_array(
+ size_t element_size,
+ void **dest_raw, const unsigned int array_rows, const unsigned int array_cols,
+ void *base_ptr_raw, size_t ld_row, size_t ld_col,
+ void *pad_buffer_raw,
+ const unsigned int pad_top, const unsigned int valid_rows,
+ const unsigned int pad_left, const unsigned int valid_cols
+)
+{
+ auto dest = reinterpret_cast<char **>(dest_raw);
+ auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
+ auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);
+ ld_row *= element_size;
+ ld_col *= element_size;
+
+ const auto last_valid_row = std::min(pad_top + valid_rows, array_rows);
+ const auto last_valid_col = std::min(pad_left + valid_cols, array_cols);
+
+ unsigned int i = 0;
+ for (; i < pad_top; i++)
+ {
+ for (unsigned int j = 0; j < array_cols; j++)
+ {
+ *(dest++) = pad_buffer;
+ }
+ }
+ for (; i < last_valid_row; i++)
+ {
+ unsigned int j = 0;
+ auto colptr = base_ptr;
+ base_ptr += ld_row;
+
+ for (; j < pad_left; j++)
+ {
+ *(dest++) = pad_buffer;
+ }
+ for (; j < last_valid_col; j++)
+ {
+ *(dest++) = colptr;
+ colptr += ld_col;
+ }
+ for (; j < array_cols; j++)
+ {
+ *(dest++) = pad_buffer;
+ }
+ }
+ for (; i < array_rows; i++)
+ {
+ for (unsigned int j = 0; j < array_cols; j++)
+ {
+ *(dest++) = pad_buffer;
+ }
+ }
+}
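+
+/* Worked illustration (not part of the library): with array_rows = array_cols
+ * = 3, pad_top = pad_left = 1 and valid_rows = valid_cols = 2, the loops
+ * above produce (strides already scaled to bytes):
+ *
+ *   pad_buffer   pad_buffer          pad_buffer
+ *   pad_buffer   base_ptr            base_ptr + ld_col
+ *   pad_buffer   base_ptr + ld_row   base_ptr + ld_row + ld_col
+ */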
+
+
+void fill_pointer_array_generic_kernel(
+ const size_t element_size,
+ void **dest_raw,
+ const unsigned int output_rows, const unsigned int output_cols,
+ const unsigned int kernel_rows, const unsigned int kernel_cols,
+ const unsigned int stride_rows, const unsigned int stride_cols,
+ void *base_ptr_raw, size_t ld_row, size_t ld_col,
+ void *pad_buffer_raw,
+ const unsigned int pad_top, const unsigned int valid_rows,
+ const unsigned int pad_left, const unsigned int valid_cols
+)
+{
+ auto dest = reinterpret_cast<char **>(dest_raw);
+ auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
+ auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);
+ ld_row *= element_size;
+ ld_col *= element_size;
+
+ const auto last_valid_row = pad_top + valid_rows;
+ const auto last_valid_col = pad_left + valid_cols;
+ const auto point_stride = output_rows * output_cols;
+
+ // Iterate over the output points, after every point increment the pointer
+ // into the address array.
+ for (unsigned int oi = 0; oi < output_rows; oi++)
+ {
+ for (unsigned int oj = 0; oj < output_cols; oj++)
+ {
+ auto point_dest = dest;
+ dest++;
+
+ // Iterate over kernel points and fill in the pointer array.
+ unsigned int ki = 0, ii = oi*stride_rows;
+ for (; ii < pad_top && ki < kernel_rows; ii++, ki++)
+ {
+ // Fill with padding
+ for (unsigned int j = 0; j < kernel_cols; j++)
+ {
+ *point_dest = pad_buffer;
+ point_dest += point_stride;
+ }
+ }
+ for (; ii < last_valid_row && ki < kernel_rows; ii++, ki++)
+ {
+ unsigned int kj = 0, ij = oj*stride_cols;
+ for (; ij < pad_left && kj < kernel_cols; ij++, kj++)
+ {
+ // Padding
+ *point_dest = pad_buffer;
+ point_dest += point_stride;
+ }
+ for (; ij < last_valid_col && kj < kernel_cols; ij++, kj++)
+ {
+ *point_dest = base_ptr + (ii - pad_top)*ld_row + (ij - pad_left)*ld_col;
+ point_dest += point_stride;
+ }
+ for (; kj < kernel_cols; kj++)
+ {
+ // Padding
+ *point_dest = pad_buffer;
+ point_dest += point_stride;
+ }
+ }
+ for (; ki < kernel_rows; ki++)
+ {
+ // Fill with padding
+ for (unsigned int j = 0; j < kernel_cols; j++)
+ {
+ *point_dest = pad_buffer;
+ point_dest += point_stride;
+ }
+ }
+ }
+ }
+}
+
+/* Patch array constructor
+ *
+ * Some depthwise kernels require an NCHW-ordered patch of input. Here we
+ * construct such a patch, and fill in an array of pointers to the rows of the
+ * patch.
+ */
+void fill_nchw_patch_array(
+ size_t element_size,
+ const void **dest_row_pointers_raw, // Array of pointers to each row of the patch
+ void *dest_patch_raw, // Pointer to space which can be used to construct the patch
+ const unsigned int patch_rows, unsigned int patch_cols, // Patch size
+ const void *src_ptr_raw, size_t ld_row, size_t ld_col, // Source tensor
+ const void *pad_row, // Pointer to a row of padding values
+ const unsigned int pad_top, const unsigned int valid_rows,
+ const unsigned int pad_left, const unsigned int valid_cols
+)
+{
+ // Convert into more useful types
+ auto row_pointers = reinterpret_cast<const char **>(dest_row_pointers_raw);
+ auto dest_patch = reinterpret_cast<char *>(dest_patch_raw);
+ auto src = reinterpret_cast<const char *>(src_ptr_raw);
+ ld_row *= element_size;
+ ld_col *= element_size;
+
+ // Round up the patch columns to be a full quad
+ patch_cols = arm_gemm::roundup<unsigned int>(patch_cols, 16 / element_size);
+
+ const auto last_valid_row = std::min(pad_top + valid_rows, patch_rows);
+ const auto last_valid_col = std::min(pad_left + valid_cols, patch_cols);
+
+ // Construct the patch and row pointer array together
+ unsigned int i = 0;
+ for (; i < pad_top; i++)
+ {
+ // Insert pointers into the padding row
+ *(row_pointers++) = reinterpret_cast<const char *>(pad_row);
+ }
+ for (; i < last_valid_row; i++)
+ {
+ // Get a copy of the pointer for this row
+ auto colptr = src;
+ src += ld_row;
+
+ // If the input is already in NCHW format (ld_col == element_size) AND
+ // there is no padding, then we just use a pointer to the source tensor;
+ // otherwise we need to construct a patch and provide a pointer to it.
+ if (ld_col == element_size && pad_left == 0 && last_valid_col == patch_cols)
+ {
+ *(row_pointers++) = colptr;
+ }
+ else
+ {
+ auto patch_col = dest_patch;
+ *(row_pointers++) = dest_patch;
+ dest_patch += element_size * patch_cols; // Move the patch pointer on
+
+ // Construct the patch; fill the entirety with padding and then copy in
+ // the valid elements.
+ memcpy(patch_col, pad_row, element_size * patch_cols);
+ patch_col += pad_left * element_size; // Move over the left padding
+
+ if (ld_col == element_size)
+ {
+ // If the input is NCHW then copy across as many columns as we can.
+ memcpy(patch_col, colptr, (last_valid_col - pad_left) * element_size);
+ }
+ else
+ {
+ // If the input is NHWC then copy columns across in turn.
+ for (auto j = pad_left; j < last_valid_col; j++)
+ {
+ memcpy(patch_col, colptr, element_size); // Copy the valid element
+ patch_col += element_size; // Progress the patch destination
+ colptr += ld_col; // Progress the patch source
+ }
+ }
+ }
+ }
+ for (; i < patch_rows; i++)
+ {
+ // Insert pointers into the padding row
+ *(row_pointers++) = reinterpret_cast<const char *>(pad_row);
+ }
+}
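+
+/* Illustrative note: when the source is already NCHW (ld_col == element_size
+ * after scaling) and a row needs no left/right padding, its row pointer
+ * aliases the source tensor directly and dest_patch is left untouched; any
+ * padding, or an NHWC source, causes that row to be materialised in
+ * dest_patch instead.
+ */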
+
+
+/* Patch array constructor (generic kernels)
+ *
+ * Construct an array of pointers; one pointer for each output row for each
+ * kernel point. Pointers should point at a whole number of QUADS containing an
+ * input point for each output point. If the kernel column stride is 1 and the
+ * data is NCHW then the input tensor might be addressed directly, otherwise a
+ * new patch sample might need to be constructed.
+ */
+void fill_patch_array_generic_kernel(
+ size_t element_size,
+ const void **dest_pointers_raw, // Pointers: one per output row per kernel point
+ void *patch_raw, // Pointer to space which can be used to construct the patch
+ const unsigned int output_rows, const unsigned int output_cols,
+ const unsigned int kernel_rows, const unsigned int kernel_cols,
+ const unsigned int stride_rows, const unsigned int stride_cols,
+ const void *src_ptr_raw, size_t ld_row, size_t ld_col, // Source tensor
+ const void *pad_row, // Pointer to a row of padding values
+ const unsigned int pad_top, const unsigned int valid_rows,
+ const unsigned int pad_left, const unsigned int valid_cols
+)
+{
+ auto dest = reinterpret_cast<const char **>(dest_pointers_raw);
+ auto patch = reinterpret_cast<char *>(patch_raw);
+ auto src_ptr = reinterpret_cast<const char *>(src_ptr_raw);
+ ld_row *= element_size;
+ ld_col *= element_size;
+
+ // Round up the patch columns to a multiple of quad-length
+ const auto patch_cols = arm_gemm::roundup<unsigned int>(output_cols, 16 / element_size);
+
+ const auto input_rows = kernel_rows + (output_rows - 1) * stride_rows;
+ const auto last_valid_row = std::min(pad_top + valid_rows, input_rows);
+
+ const auto input_cols = kernel_cols + (output_cols - 1) * stride_cols;
+ const auto last_valid_col = std::min(pad_left + valid_cols, input_cols);
+
+ for (auto ki = 0u; ki < kernel_rows; ki++)
+ {
+ for (auto kj = 0u; kj < kernel_cols; kj++)
+ {
+ auto oi = 0u, ii = ki;
+ for (; oi < output_rows && ii < pad_top; oi++, ii += stride_rows)
+ {
+ // Insert a pointer to the padding row
+ *(dest++) = reinterpret_cast<const char *>(pad_row);
+ }
+ for (; oi < output_rows && ii < last_valid_row; oi++, ii += stride_rows)
+ {
+ auto rowptr = src_ptr + (ii - pad_top) * ld_row;
+
+ // Construct a sample of the input here
+ auto patch_pos = patch;
+ *(dest++) = patch;
+ patch += patch_cols * element_size;
+
+ // Fill with padding
+ memcpy(patch_pos, pad_row, patch_cols * element_size);
+
+ // Fill in the valid elements
+ auto oj = 0u, ij = kj;
+ for (; oj < patch_cols && ij < pad_left; oj++, ij += stride_cols)
+ {
+ // Do nothing for padding
+ patch_pos += element_size;
+ }
+ for (; oj < patch_cols && ij < last_valid_col; oj++, ij += stride_cols)
+ {
+ // Copy from the source tensor
+ memcpy(patch_pos, rowptr + (ij - pad_left)*ld_col, element_size);
+ patch_pos += element_size;
+ }
+ // No action required for right-hand padding
+ }
+ for (; oi < output_rows; oi++)
+ {
+ *(dest++) = reinterpret_cast<const char *>(pad_row);
+ }
+ }
+ }
+}
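+
+/* Illustrative note: the loops above emit exactly
+ * kernel_rows * kernel_cols * output_rows pointers, ordered by kernel row,
+ * then kernel column, then output row; each pointer addresses either pad_row
+ * or a freshly constructed patch row of patch_cols elements.
+ */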
+
+} // namespace addressing
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/addressing.hpp b/src/core/NEON/kernels/arm_conv/addressing.hpp
new file mode 100644
index 0000000000..35715a3764
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/addressing.hpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+/* arm_conv kernels share a lot of similarities in how they address input and
+ * output tensors. Consequently, this file contains common approaches to
+ * preparing these tensor descriptions. Generic (i.e., untyped) methods are
+ * contained within the `arm_conv::addressing` namespace, and typed wrappers
+ * are provided within an anonymous namespace within `arm_conv`. The various
+ * methods are described below.
+ */
+
+#include <cstddef>
+
+namespace arm_conv {
+namespace addressing {
+
+/* Pointer array
+ * -------------
+ *
+ * Constructs an array of pointers which point to a `array_rows` x `array_cols`
+ * chunk of a tensor. The array of pointers will be written into `dest`.
+ *
+ * `base_ptr` should point at the first VALID element of the chunk of tensor
+ * (i.e., if there's one padded row, and one padded column, then `base_ptr`
+ * should point at the element which will be at position (1, 1) in the array).
+ * `ld_row` and `ld_col` are given in elements (the implementation scales them
+ * by `element_size`) and describe the strides over rows and columns
+ * (respectively) of the NHWC-ordered tensor. `pad_buffer` should point
+ * at a suitably sized (and initialised) area of memory which can be addressed
+ * by elements of the array which represent padding.
+ *
+ * `pad_top` and `pad_left` describe the padding on the top and left of the
+ * array, respectively, and `valid_rows` and `valid_cols` describe the number
+ * of rows and columns between the element pointed to by `base_ptr` and the
+ * edge of the image (that is `valid_rows` may be greater than `array_rows` and
+ * likewise for the columns).
+ */
+void fill_pointer_array(
+ size_t element_size,
+ void **dest, unsigned int array_rows, unsigned int array_cols,
+ void *base_ptr, size_t ld_row, size_t ld_col,
+ void *pad_buffer,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+);
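+
+/* A minimal usage sketch, assuming a float tensor and hypothetical
+ * `first_valid_element`, `ld_row`, `ld_col` and `pad_buffer` values
+ * (illustrative only, not part of the library):
+ *
+ *   float *ptrs[3 * 3];
+ *   fill_pointer_array(
+ *     sizeof(float), reinterpret_cast<void **>(ptrs), 3, 3,
+ *     first_valid_element, ld_row, ld_col,
+ *     pad_buffer,
+ *     1, 2,   // pad_top, valid_rows
+ *     1, 2);  // pad_left, valid_cols
+ */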
+
+/* Interleaved multi-point pointer array
+ * -------------------------------------
+ *
+ * For each point in a `output_rows` x `output_cols` array, constructs
+ * `kernel_rows` x `kernel_cols` array of pointers. The pointers are
+ * interleaved thusly:
+ *
+ * for ki in kernel_rows:
+ * for kj in kernel_cols:
+ * for oi in output_rows:
+ * for oj in output_cols:
+ * get pointer for point (oi*stride_rows + ki, oj*stride_cols + kj)
+ *
+ * Other arguments are as for `fill_pointer_array`.
+ *
+ * The name reflects that this is the form of addressing mode used by "generic"
+ * depthwise and pooling kernels.
+ */
+void fill_pointer_array_generic_kernel(
+ size_t element_size,
+ void **dest,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ void *base_ptr, size_t ld_row, size_t ld_col,
+ void *pad_buffer,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+);
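+
+/* Equivalently (illustrative): the pointer for kernel point (ki, kj) and
+ * output point (oi, oj) is written to
+ *
+ *   dest[((ki*kernel_cols + kj)*output_rows + oi)*output_cols + oj]
+ */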
+
+/* NCHW-patch addressed by row
+ * ---------------------------
+ *
+ * Construct an array of pointers, each of which points at a row of an
+ * NCHW-ordered patch of a tensor. Memory addressed by the pointers may be
+ * outside of the original tensor, and should therefore not be written to
+ * (modifications will be lost).
+ *
+ * `dest_row_pointers` should point at a `patch_rows` list of pointers; each of
+ * which will point at a 1 x `patch_cols` NCHW-ordered sample of the source
+ * tensor.
+ *
+ * `dest_patch` should point to a `element_size * patch_rows * patch_cols` area
+ * of memory which can be written to by this function to form samples of the
+ * source tensor.
+ *
+ * `src_ptr` should point at the first VALID element of the chunk of tensor
+ * (i.e., if there's one padded row, and one padded column, then `src_ptr`
+ * should point at the element which will be at position (1, 1) in the array).
+ * `ld_row` and `ld_col` are given in elements (the implementation scales them
+ * by `element_size`) and describe the strides over rows and columns
+ * (respectively) of the NHWC-ordered tensor. If `ld_col` is 1 (i.e., the
+ * scaled stride equals `element_size`) then copies from the source tensor
+ * will be elided and source
+ * data may be addressed directly.
+ *
+ * `pad_row` should point to a `patch_cols` array of (appropriately
+ * initialised) padding values.
+ *
+ * Other arguments are as for `fill_pointer_array`.
+ */
+void fill_nchw_patch_array(
+ size_t element_size,
+ const void **dest_row_pointers, // Array of pointers to each row of the patch
+ void *dest_patch, // Pointer to space which can be used to construct the patch
+ unsigned int patch_rows, unsigned int patch_cols, // Patch size
+ const void *src_ptr, size_t ld_row, size_t ld_col, // Source tensor
+ const void *pad_row, // Pointer to a row of padding values
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+);
+
+void fill_patch_array_generic_kernel(
+ size_t element_size,
+ const void **dest_pointers, // Pointers: one per output row per kernel point
+ void *dest_patch, // Pointer to space which can be used to construct the patch
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ const void *src_ptr, size_t ld_row, size_t ld_col, // Source tensor
+ const void *pad_row, // Pointer to a row of padding values
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+);
+
+} // namespace addressing
+
+namespace {
+
+/* Pointer array
+ * -------------
+ *
+ * See `addressing::fill_pointer_array`. No copies are made by this method,
+ * memory pointed to by the pointer array is contained within the base tensor
+ * and the padding buffer.
+ */
+template <typename T>
+inline void fill_pointer_array(
+ T **dest, unsigned int array_rows, unsigned int array_cols,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ T *pad_buffer,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+)
+{
+ addressing::fill_pointer_array(
+ sizeof(T), (void **) dest, array_rows, array_cols,
+ (void *) base_ptr, ld_row, ld_col,
+ (void *) pad_buffer,
+ pad_top, valid_rows,
+ pad_left, valid_cols
+ );
+}
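+
+/* A typed usage sketch (hypothetical buffers and strides, for illustration
+ * only):
+ *
+ *   float *window[4 * 4];
+ *   fill_pointer_array<float>(
+ *     window, 4, 4,
+ *     input_ptr, input_ld_row, input_ld_col,  // strides in elements
+ *     pad_buffer,
+ *     pad_top, valid_rows, pad_left, valid_cols);
+ */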
+
+
+/* Interleaved multi-point pointer array
+ * -------------------------------------
+ *
+ * See `addressing::fill_pointer_array_generic_kernel`. No copies are made by
+ * this method, memory pointed to by the pointer array is contained within the
+ * base tensor and the padding buffer.
+ */
+template <typename T>
+inline void fill_pointer_array_generic_kernel(
+ T **dest,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ T *pad_buffer,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+)
+{
+ addressing::fill_pointer_array_generic_kernel(
+ sizeof(T),
+ (void **) dest,
+ output_rows, output_cols,
+ kernel_rows, kernel_cols,
+ stride_rows, stride_cols,
+ (void *) base_ptr, ld_row, ld_col,
+ (void *) pad_buffer,
+ pad_top, valid_rows,
+ pad_left, valid_cols
+ );
+}
+
+template <typename T>
+inline void fill_nchw_patch_array(
+ const T **dest_row_pointers, // Array of pointers to each row of the patch
+ T *dest_patch, // Pointer to space which can be used to construct the patch
+ unsigned int patch_rows, unsigned int patch_cols, // Patch size
+ const T *src_ptr, size_t ld_row, size_t ld_col, // Source tensor
+ const T *pad_row, // Pointer to a row of padding values
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+)
+{
+ addressing::fill_nchw_patch_array(
+ sizeof(T),
+ reinterpret_cast<const void **>(dest_row_pointers),
+ reinterpret_cast<void *>(dest_patch),
+ patch_rows, patch_cols,
+ reinterpret_cast<const void *>(src_ptr), ld_row, ld_col,
+ reinterpret_cast<const void *>(pad_row),
+ pad_top, valid_rows,
+ pad_left, valid_cols
+ );
+}
+
+template <typename T>
+inline void fill_patch_array_generic_kernel(
+ const T **dest_pointers, // Pointers: one per output row per kernel point
+ T *dest_patch, // Pointer to space which can be used to construct the patch
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ const T *src_ptr, size_t ld_row, size_t ld_col, // Source tensor
+ const T *pad_row, // Pointer to a row of padding values
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+)
+{
+ addressing::fill_patch_array_generic_kernel(
+ sizeof(T),
+ reinterpret_cast<const void **>(dest_pointers),
+ reinterpret_cast<void *>(dest_patch),
+ output_rows, output_cols,
+ kernel_rows, kernel_cols,
+ stride_rows, stride_cols,
+ reinterpret_cast<const void *>(src_ptr), ld_row, ld_col,
+ reinterpret_cast<const void *>(pad_row),
+ pad_top, valid_rows,
+ pad_left, valid_cols
+ );
+}
+
+} // namespace {anonymous}
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
new file mode 100644
index 0000000000..95ece8cdc8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise.hpp"
+#include "utils.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+template <typename T> struct DefaultTAccum { using Type = T; };
+template <> struct DefaultTAccum<int8_t> { using Type = int32_t; };
+template <> struct DefaultTAccum<uint8_t> { using Type = int32_t; };
+
+template <typename T> struct DefaultOutputStage { using Type = Nothing; };
+template <> struct DefaultOutputStage<int8_t> { using Type = arm_gemm::Requantize32; };
+template <> struct DefaultOutputStage<uint8_t> { using Type = arm_gemm::Requantize32; };
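+
+// Illustrative resolution of these traits (for example):
+//   DefaultTAccum<float>::Type == float
+//   DefaultTAccum<uint8_t>::Type == int32_t
+//   DefaultOutputStage<int8_t>::Type == arm_gemm::Requantize32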
+
+class IDepthfirstStrategy
+{
+ public:
+ virtual ~IDepthfirstStrategy() = default;
+
+ virtual unsigned int get_input_rows() const = 0;
+ virtual unsigned int get_input_cols() const = 0;
+
+ virtual unsigned int get_output_rows() const = 0;
+ virtual unsigned int get_output_cols() const = 0;
+};
+
+
+template <typename T>
+struct TensorSpec
+{
+ T base;
+ size_t ld_row, ld_col;
+
+ TensorSpec(T ptr, size_t ld_row, size_t ld_col)
+ : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
+};
+
+
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
+{
+ protected:
+ using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
+
+ // The strategy which we're applying to solve the depthwise convolution.
+ std::unique_ptr<const IDepthfirstStrategy> m_strat;
+
+ /* Compute the amount of working space required for a single thread. */
+ virtual size_t get_working_size_per_thread() const = 0;
+
+ /* Initialise the working space for a thread. */
+ virtual void initialise_working_space(void *) const = 0;
+
+ /* Compute a portion of the output tensor with padding. */
+ virtual void compute_tile_padded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const = 0;
+
+ /* Compute a portion of the work with only top/bottom padding.
+ *
+ * The default implementation of this repeatedly calls into the padded tile
+ * variant.
+ */
+ virtual void compute_row_padded_tile_row(
+ const DepthwiseArgs &args,
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int output_channel_start, const unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const
+ {
+ for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
+ {
+ this->compute_tile_padded(
+ args,
+ output_i, output_j, output_channel_start, output_channel_end,
+ input, output, parameters, working_space
+ );
+ }
+ }
+
+ /* Compute a portion of the output tensor with no padding.
+ *
+ * The default implementation of this repeatedly calls into the padded
+ * variant.
+ */
+ virtual void compute_tiles_unpadded(
+ const DepthwiseArgs &args,
+ unsigned int start_output_i, unsigned int start_output_j,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const
+ {
+ for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
+ {
+ unsigned int row_start_output_j = start_output_j;
+ for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
+ {
+ this->compute_tile_padded(
+ args,
+ start_output_i, row_start_output_j,
+ output_channel_start, output_channel_end,
+ input, output, parameters, working_space
+ );
+ row_start_output_j += m_strat->get_output_cols();
+ }
+ start_output_i += m_strat->get_output_rows();
+ }
+ }
+
+ void execute_internal(
+ const DepthwiseArgs &args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads
+ ) const override
+ {
+ // Get and initialise the working space for this thread.
+ void *thread_working_space =
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
+ this->initialise_working_space(thread_working_space);
+
+ // Construct convenient representations of the input/output tensors.
+ TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
+ TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);
+
+ const auto n_output_channels = args.input_channels * args.channel_multiplier;
+
+    // By default we parallelize over the rows, but if there's only 1 row we
+    // try to parallelize over the batches instead.
+ auto thread_id_for_rows = thread_id;
+ auto n_threads_for_rows = n_threads;
+ auto thread_id_for_batches = 0;
+ auto n_threads_for_batches = 1;
+ if (args.output_rows == 1) {
+ thread_id_for_rows = 0;
+ n_threads_for_rows = 1;
+ thread_id_for_batches = thread_id;
+ n_threads_for_batches = n_threads;
+ }
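+    // For example (illustrative): with four threads and several output rows,
+    // thread 2 handles row-tiles 2, 6, 10, ... within every batch, while with
+    // a single output row the roles swap and thread 2 instead handles batches
+    // 2, 6, 10, ...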
+
+    // Advance the pointers to the first batch processed by this thread.
+ input_tensor.base += ld_input_batch*thread_id_for_batches;
+ output_tensor.base += ld_output_batch*thread_id_for_batches;
+ for (unsigned int batch = thread_id_for_batches;
+ batch < args.n_batches;
+ batch += n_threads_for_batches)
+ {
+ // Iterate over rows of the output tensor; we stripe over the tiles.
+ for (unsigned int start_output_i = thread_id_for_rows * m_strat->get_output_rows();
+ start_output_i < args.output_rows;
+ start_output_i += n_threads_for_rows * m_strat->get_output_rows())
+ {
+        // Determine what padding (if any) is required on the top/bottom of
+        // this row of the convolution.
+ const auto end_output_i = start_output_i + m_strat->get_output_rows();
+ const bool pad_output_bottom = args.output_rows < end_output_i;
+
+ const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
+ const bool pad_input_top = start_input_i < 0;
+ const int end_input_i = start_input_i + m_strat->get_input_rows();
+ const bool pad_input_bottom = static_cast<int>(args.input_rows) < end_input_i;
+ // We only need to account for input padding if direct padding is not supported.
+ const bool pad_row = ((pad_input_top || pad_input_bottom) && !this->supports_direct_padding())
+ || pad_output_bottom;
+
+        // Iterate over the columns of the output tensor; at each step we try
+        // to consume as much of the unpadded region as possible, hence the
+        // slightly unusual loop structure.
+ unsigned int start_output_j = 0;
+ while (start_output_j < args.output_cols)
+ {
+ const int start_in_j = start_output_j * args.stride_cols - args.padding.left;
+ const bool pad_input_left = start_in_j < 0;
+
+ // Determine if we can process a number of unpadded tiles in one go.
+ int n_unpadded_tiles = 0;
+ if ((!pad_input_left) || this->supports_direct_padding())
+ {
+ // Determine the maximum number of tiles we could handle.
+ n_unpadded_tiles = (args.output_cols - start_output_j) / m_strat->get_output_cols();
+
+ // Handle padding on the right hand edge
+ const int tile_stride = m_strat->get_output_cols() * args.stride_cols;
+ int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
+ int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;
+
+ while (n_unpadded_tiles > 0 &&
+ (static_cast<int>(args.output_cols) < end_output_j ||
+ static_cast<int>(args.input_cols) < end_input_j))
+ {
+ n_unpadded_tiles--;
+ end_output_j -= m_strat->get_output_cols();
+ end_input_j -= tile_stride;
+ }
+ }
+
+ // Process unpadded tiles, if possible, otherwise process a padded tile.
+ if (n_unpadded_tiles)
+ {
+ if (!pad_row)
+ {
+ // Completely unpadded execution
+ this->compute_tiles_unpadded(
+ args,
+ start_output_i, start_output_j,
+ 1, n_unpadded_tiles, // Compute a row of unpadded tiles
+ 0, n_output_channels, // Compute all channels
+ input_tensor, output_tensor, parameters, thread_working_space
+ );
+ }
+ else
+ {
+ // Top/bottom padding only
+ this->compute_row_padded_tile_row(
+ args,
+ start_output_i, start_output_j, n_unpadded_tiles,
+ 0, n_output_channels, // Compute all channels
+ input_tensor, output_tensor, parameters, thread_working_space
+ );
+ }
+ start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
+ }
+ else
+ {
+ this->compute_tile_padded(
+ args,
+ start_output_i, start_output_j,
+ 0, n_output_channels, // Compute all channels
+ input_tensor, output_tensor, parameters, thread_working_space
+ );
+ start_output_j += m_strat->get_output_cols();
+ }
+ }
+ }
+
+      // Advance the pointers to the next batch processed by this thread.
+ input_tensor.base += ld_input_batch*n_threads_for_batches;
+ output_tensor.base += ld_output_batch*n_threads_for_batches;
+ }
+ }
+
+ public:
+ DepthfirstDriver(IDepthfirstStrategy *strategy, const DepthwiseArgs &args)
+ : Parent(args), m_strat(strategy)
+ {
+ }
+
+ size_t get_working_size(unsigned int n_threads) const override final
+ {
+ return n_threads * this->get_working_size_per_thread();
+ }
+
+ virtual bool supports_direct_padding() const
+ {
+ return false;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
new file mode 100644
index 0000000000..2950d5e957
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthwise_common.hpp"
+
+#include "utils.hpp"
+
+using arm_gemm::iceildiv;
+
+namespace arm_conv {
+namespace depthwise {
+
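+/* Compute the view of the problem seen by the d-th slice of a convolution
+ * decomposed over its dilation factor. Returns a tuple of (reduced output
+ * size, reduced input size, input start offset, before-padding,
+ * after-padding) for that slice. Illustrative example: out_size=10,
+ * dilation_factor=2, d=1 gives a slice of iceildiv(10 - 1, 2) = 5 output
+ * points.
+ */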
+std::tuple<size_t, size_t, size_t, size_t, size_t>
+get_reduced_view_for_dilation(size_t out_size, size_t in_size, const size_t d,
+ const size_t dilation_factor,
+ const size_t kernel_size, const size_t stride,
+ const size_t orig_pad_before) {
+ // Get the valid output range
+ out_size = iceildiv(out_size - d, dilation_factor);
+
+ // Compute the start offset and the amount of padding which applies to this
+ // portion of the work.
+ size_t start_pos = d * stride, pad_before = 0;
+ if (start_pos < orig_pad_before) {
+ pad_before = iceildiv(orig_pad_before - start_pos, dilation_factor);
+ }
+ start_pos += pad_before * dilation_factor - orig_pad_before;
+
+ // Hence compute the valid input range
+ in_size = start_pos < in_size
+ ? iceildiv(in_size - start_pos, dilation_factor)
+ : 0;
+
+ // Finally, compute the "after" padding
+ const size_t reqd_input = (out_size - 1) * stride + kernel_size;
+ size_t pad_after = 0;
+ if (reqd_input > (pad_before + in_size)) {
+ pad_after = reqd_input - (pad_before + in_size);
+ }
+
+ return std::make_tuple(out_size, in_size, start_pos, pad_before, pad_after);
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
new file mode 100644
index 0000000000..7b00c9a7af
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
@@ -0,0 +1,700 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
+#include "depthwise_strategies_common.hpp"
+#include "working_space.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+#include <limits>
+
+namespace arm_conv {
+namespace depthwise {
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
+ typename OutputStage>
+class DepthwiseDepthfirstStrategyCommon
+ : public DepthfirstStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ protected:
+ unsigned int m_output_rows, m_output_cols;
+ unsigned int m_kernel_rows, m_kernel_cols;
+ unsigned int m_stride_rows, m_stride_cols;
+
+ public:
+ DepthwiseDepthfirstStrategyCommon(
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows=1, unsigned int stride_cols=1
+ ) : m_output_rows(output_rows), m_output_cols(output_cols),
+ m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
+ m_stride_rows(stride_rows), m_stride_cols(stride_cols)
+ {
+ }
+
+ DepthwiseDepthfirstStrategyCommon(unsigned int output_size, unsigned int kernel_size, unsigned int stride=1)
+ : DepthwiseDepthfirstStrategyCommon(output_size, output_size, kernel_size, kernel_size, stride, stride)
+ {
+ }
+
+ virtual ~DepthwiseDepthfirstStrategyCommon() {}
+
+ unsigned int get_output_rows() const override { return m_output_rows; }
+ unsigned int get_output_cols() const override { return m_output_cols; }
+
+ unsigned int get_kernel_rows() const override { return m_kernel_rows; }
+ unsigned int get_kernel_cols() const override { return m_kernel_cols; }
+
+ unsigned int get_stride_rows() const override { return m_stride_rows; }
+ unsigned int get_stride_cols() const override { return m_stride_cols; }
+};
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+
+ public:
+ using Parent::Parent;
+
+ typedef void (*IndirectKernelType)(
+ const TInput *const *input_ptrs,
+ TOutput *const *output_ptrs,
+ const void *params,
+ unsigned int n_channels,
+ const TAccum activation_min,
+ const TAccum activation_max
+ );
+ virtual IndirectKernelType get_indirect_kernel(void) const = 0;
+
+ typedef void (*DirectKernelType)(
+ const unsigned int n_tile_rows, const unsigned int n_tile_cols,
+ const TInput *inptr_base, int64_t ld_input_row, int64_t ld_input_col,
+ TOutput *outptr_base, int64_t ld_output_row, int64_t ld_output_col,
+ const void *params, unsigned int n_channels,
+ const TAccum activation_min,
+ const TAccum activation_max
+ );
+ virtual DirectKernelType get_direct_kernel(void) const = 0;
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthwiseDepthfirstStrategy<TInput, TWeight, TOutput, int32_t>
+: public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
+
+ protected:
+ interleaves::PackingArguments get_packing_args(void) const
+ {
+ return interleaves::PackingArguments(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ false, sizeof(int32_t), this->uses_premultiply(), // Don't pack the bias
+ this->get_vl_type(), sizeof(int32_t), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ }
+
+ public:
+ using Parent::Parent;
+
+ typedef void (*KernelType)(
+ unsigned int, // n_channels,
+ const TInput *const *, // inptrs
+ const TWeight *, // weights
+ const int32_t *, // bias,
+ const arm_gemm::Requantize32 &,
+ const int32_t *, const int32_t *, // requant_muls and requant_shifts
+ TOutput *const * // outptrs
+ );
+ virtual KernelType get_kernel() const = 0;
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(get_packing_args(), args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const arm_gemm::Requantize32 &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleaves::pack_parameters_generic(
+ get_packing_args(), args, buffer, biases, weights, ld_weight_col, ld_weight_row);
+ }
+};
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+class DepthwiseDepthfirstCommon : public DepthfirstDriver<TInput, TWeight, TOutput>
+{
+ using StratType = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ OutputStage m_os;
+
+ protected:
+ inline OutputStage &get_output_stage(void) { return m_os; }
+ inline const OutputStage &get_output_stage(void) const { return m_os; }
+
+ bool uses_intermediate_array() const
+ {
+ return this->m_args.channel_multiplier != 1 && this->uses_premultiply();
+ }
+
+ virtual void fill_inptr_array(const DepthwiseArgs &args,
+ const TensorSpec<const TInput *> &input,
+ const TInput **inptr_array, TInput *input_buffer,
+ const unsigned int input_i, const unsigned int input_j,
+ const unsigned int input_pad_top, const unsigned int input_pad_left) const = 0;
+
+ void initialise_inptr_array(const DepthwiseArgs &args,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TInput **inptr_array, TInput *input_buffer, TInput *intermediate_buffer,
+ const unsigned int input_i, const unsigned int input_j,
+ const unsigned int input_pad_top, const unsigned int input_pad_left,
+ Tile<TInput> &multiplied_input
+ ) const
+ {
+ // Compute the input pointer array
+ const auto input_channel_start = output_channel_start / args.channel_multiplier;
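+    // (e.g. with channel_multiplier == 4, output channels [8, 12) are
+    // produced from input channels starting at 8 / 4 == 2)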
+
+ const auto last_valid_row = std::min(input_pad_top + args.input_rows - input_i, this->m_strat->get_input_rows());
+ const auto last_valid_col = std::min(input_pad_left + args.input_cols - input_j, this->m_strat->get_input_cols());
+
+ const auto tile_rows = last_valid_row - input_pad_top;
+ const auto tile_cols = last_valid_col - input_pad_left;
+
+ const auto tile_channels = output_channel_end - output_channel_start;
+
+ TensorSpec<const TInput *> tile_tensor(0, 0, 0);
+ if (this->uses_intermediate_array()) {
+ multiplied_input = Tile<TInput>(intermediate_buffer, tile_rows, tile_cols, tile_channels);
+ multiplied_input.load_from(input.base, input.ld_row, input.ld_col,
+ args.input_rows, args.input_cols,
+ input_i, input_j, args.channel_multiplier);
+
+ tile_tensor = TensorSpec<const TInput *>(
+ multiplied_input.array,
+ tile_cols * tile_channels, tile_channels
+ );
+ } else {
+ tile_tensor = TensorSpec<const TInput *>(
+ input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
+ input.ld_row, input.ld_col
+ );
+ }
+
+ fill_inptr_array(args,
+ tile_tensor,
+ inptr_array, input_buffer,
+ input_i, input_j,
+ input_pad_top,
+ input_pad_left
+ );
+ }
+
+ public:
+ DepthwiseDepthfirstCommon(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os)
+ : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os)
+ {
+ }
+
+ DepthwiseDepthfirstCommon(DepthwiseDepthfirstCommon &) = delete;
+ DepthwiseDepthfirstCommon &operator=(DepthwiseDepthfirstCommon &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ return reinterpret_cast<const StratType *>(this->m_strat.get())->
+ get_storage_size(this->m_args);
+ }
+
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ reinterpret_cast<const StratType *>(this->m_strat.get())->
+ pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
+ }
+};
+
+namespace depthwise_depthfirst {
+
+/* Workspace Element for an array of input pointers as consumed by the
+ * specialised depthwise kernels.
+ */
+template <typename T>
+class InputArrayElement
+{
+ public:
+ struct Workspace
+ {
+ const T **inptr_array;
+ };
+
+ template <class OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof(T **) * args.strategy->get_input_rows() * args.strategy->get_input_cols();
+ }
+
+ template <class WorkspaceType, class OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ ws->inptr_array = reinterpret_cast<const T**>(buffer);
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+template <typename TAccum, typename OutputStage, bool IsDot=false>
+struct WorkspaceFinalElement
+{
+ using Element = ActivationsElement<TAccum, OutputStage>;
+};
+
+template <>
+struct WorkspaceFinalElement<int32_t, arm_gemm::Requantize32, false>
+{
+ using Element = RequantizationParametersElement;
+};
+
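+/* Marshal arguments into the strategy's kernel. The primary template targets
+ * the indirect/direct kernels with activation clamps; the Requantize32
+ * specialisation below targets the quantized kernel signature instead and
+ * reports that no direct (tile-row) kernel is available.
+ */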
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+struct Invoke
+{
+ constexpr static bool supports_direct_kernel = true;
+
+ template <typename Strat, typename Workspace>
+ static inline void indirect(const Strat *strat, const Workspace *ws, const OutputStage &, const void *params, const TAccum *, unsigned int n_channels)
+ {
+ strat->get_indirect_kernel()(
+ ws->inptr_array,
+ ws->outptr_array,
+ params, n_channels,
+ ws->activation_min, ws->activation_max
+ );
+ }
+
+ template <typename Strat, typename Workspace>
+ static void direct(
+ const Strat *strat, const Workspace *ws, const OutputStage &,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col,
+ TOutput *outptr, size_t ld_out_row, size_t ld_out_col,
+ const void *params, unsigned int n_channels
+ )
+ {
+ strat->get_direct_kernel()(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_in_row, ld_in_col,
+ outptr, ld_out_row, ld_out_col,
+ params, n_channels, ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+struct Invoke<TInput, TWeight, TOutput, TAccum, arm_gemm::Requantize32>
+{
+ constexpr static bool supports_direct_kernel = false;
+
+ template <typename Strat, typename Workspace>
+ static inline void indirect(const Strat *strat, const Workspace *ws, const arm_gemm::Requantize32 &qp, const void *params, const TAccum *, unsigned int n_channels)
+ {
+ strat->get_kernel()(
+ n_channels, ws->inptr_array,
+ reinterpret_cast<const TWeight *>(params), ws->bias,
+ qp, ws->requant_muls, ws->requant_shifts,
+ ws->outptr_array
+ );
+ }
+
+ template <typename Strat, typename Workspace>
+ static inline void direct(
+ const Strat *, const Workspace *, const arm_gemm::Requantize32 &,
+ unsigned int, unsigned int, // n_tile_rows, n_tile_cols
+ const TInput *, size_t, size_t, // Input pointer, row stride, column stride
+ TOutput *, size_t, size_t, // Output pointer, row stride, column stride
+ const void *, unsigned int // Parameters, number of channels
+ )
+ {
+ // Do nothing - this should never be reached because entry to it is guarded
+ // by an `if` on a `constexpr static bool`.
+ }
+};
+
+namespace
+{
+
+template <typename OutputStage>
+inline void stash_bias(OutputStage &, const void *) {}
+
+template <>
+inline void stash_bias(arm_gemm::Requantize32 &qp, const void *bias) __attribute__ ((unused));
+
+template <>
+inline void stash_bias(arm_gemm::Requantize32 &qp, const void *bias)
+{
+ qp.bias = reinterpret_cast<const int32_t *>(bias);
+}
+
+} // namespace {anonymous}
+
+} // namespace depthwise_depthfirst
+
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirst
+: public DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using StratType = DepthwiseDepthfirstStrategy<TInput, TWeight, TOutput, TAccum>;
+ using Parent = DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ using WorkspaceManager = Workspace<
+ OutputArrayElement<TOutput>,
+ depthwise_depthfirst::InputArrayElement<TInput>,
+ InputBufferElement<TInput>,
+ IntermediateBufferElement<TInput>,
+ typename depthwise_depthfirst::WorkspaceFinalElement<TAccum, OutputStage>::Element
+ >;
+ using WorkingSpace = typename WorkspaceManager::WorkspaceType;
+
+  // We keep a copy of the bias pointer; the output stage is held by the parent.
+ const TAccum *m_bias;
+
+ public:
+ DepthwiseDepthfirst(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
+ : Parent(strat, args, os), m_bias(nullptr)
+ {
+ }
+
+ DepthwiseDepthfirst(DepthwiseDepthfirst &) = delete;
+ DepthwiseDepthfirst &operator=(DepthwiseDepthfirst &) = delete;
+
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ reinterpret_cast<const StratType *>(this->m_strat.get())->pack_parameters(
+ this->m_args, buffer, biases, this->get_output_stage(),
+ weights, ld_weight_col, ld_weight_row
+ );
+ m_bias = reinterpret_cast<const TAccum *>(biases);
+ depthwise_depthfirst::stash_bias(this->get_output_stage(), biases);
+ }
+
+ size_t get_working_size_per_thread() const override
+ {
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::get_sizeof_workspace(
+ WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage())
+ );
+ }
+
+ void initialise_working_space(void *buffer) const override
+ {
+ DepthwiseArgs args(this->m_args);
+ WorkspaceManager::initialise(
+ buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage())
+ );
+ }
+
+  bool supports_direct_padding() const override
+ {
+ using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
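+    // Direct padding is only reported when a direct kernel exists and the
+    // input is staged through the intermediate (pre-multiplied) buffer; see
+    // compute_tiles_unpadded for where this is exploited.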
+ return Invoker::supports_direct_kernel && this->uses_intermediate_array();
+ }
+
+ protected:
+
+ void fill_inptr_array(const DepthwiseArgs &args,
+ const TensorSpec<const TInput *> &input,
+ const TInput **inptr_array, TInput *input_buffer,
+ const unsigned int input_i, const unsigned int input_j,
+ const unsigned int input_pad_top, const unsigned int input_pad_left) const override
+ {
+ fill_pointer_array<const TInput>(
+ inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
+ input.base,
+ input.ld_row, input.ld_col,
+ input_buffer,
+ input_pad_top, args.input_rows - input_i,
+ input_pad_left, args.input_cols - input_j
+ );
+ }
+
+ void compute_tile_padded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ // Get the working space
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+
+ // Compute the input pointer array
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
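+    // (e.g. with stride 1 and padding.top == 1, output_i == 0 gives ii == -1:
+    // one row of top padding, with the input starting at row 0)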
+
+ Tile<TInput> multiplied_input;
+ this->initialise_inptr_array(args, output_channel_start, output_channel_end, input,
+ ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+ input_i, input_j, input_pad_top, input_pad_left, multiplied_input);
+
+ // Compute the output pointer array
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Execute the kernel
+ depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>::indirect(
+ reinterpret_cast<const StratType *>(this->m_strat.get()),
+ ws, this->get_output_stage(), parameters, m_bias, output_channel_end - output_channel_start
+ );
+ }
+
+ void compute_row_padded_tile_row(
+ const DepthwiseArgs &args,
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int output_channel_start, const unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const override
+ {
+ using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space);
+ const auto strat = reinterpret_cast<const StratType *>(this->m_strat.get());
+ const auto os = this->get_output_stage();
+
+ // Compute top and bottom padding; hence fill in the initial pointer arrays.
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+ auto input_j = output_j * args.stride_cols - args.padding.left;
+
+    // The number of valid input rows is the smaller of the tile's input rows
+    // which aren't top padding and the number of input rows remaining in the
+    // tensor.
+ const auto valid_input_rows = std::min(strat->get_input_rows() - input_pad_top, args.input_rows - input_i);
+ const auto valid_output_rows = std::min(strat->get_output_rows(), args.output_rows - output_i);
+
+ const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols;
+ const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
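+    // Moving one tile to the right advances each output pointer by
+    // get_output_cols() columns and each input pointer by stride_cols times
+    // as many; the point strides above express this in elements.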
+
+ Tile<TInput> multiplied_input;
+ this->initialise_inptr_array(args, output_channel_start, output_channel_end, input,
+ ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+ input_i, input_j, input_pad_top, 0, multiplied_input);
+
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ for (; n_tile_cols; n_tile_cols--)
+ {
+ // Execute the kernel
+ Invoker::indirect(
+ strat, ws, os, parameters, m_bias, output_channel_end - output_channel_start
+ );
+
+ // Update all unpadded pointers
+ if (this->uses_intermediate_array()) {
+ input_j += input_point_stride / input.ld_col;
+ multiplied_input.load_from(input.base,
+ input.ld_row, input.ld_col,
+ args.input_rows, args.input_cols,
+ input_i, input_j, args.channel_multiplier);
+ } else {
+ {
+ auto ptr = ws->inptr_array + strat->get_input_cols() * input_pad_top;
+ for (auto n = input_pad_top; n < (valid_input_rows + input_pad_top); n++)
+ {
+ for (auto m = 0u; m < strat->get_input_cols(); m++)
+ {
+ *(ptr++) += input_point_stride;
+ }
+ }
+ }
+ }
+
+ {
+ auto ptr = ws->outptr_array;
+ for (auto n = 0u; n < valid_output_rows * strat->get_output_cols(); n++)
+ {
+ *(ptr++) += output_point_stride;
+ }
+ }
+ }
+ }
+
+ void compute_tiles_unpadded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, const unsigned int output_j,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+ const auto strat = reinterpret_cast<const StratType *>(this->m_strat.get());
+ const auto os = this->get_output_stage();
+
+ if (Invoker::supports_direct_kernel)
+ {
+ PaddingValues tile_padding = {
+ args.kernel_cols / 2,
+ args.kernel_rows / 2,
+ args.kernel_cols / 2,
+ args.kernel_rows / 2
+ };
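+      // NB (assumption): the direct path sizes the staged input tile below
+      // using a symmetric halo of kernel/2 on each side.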
+
+ // If the direct kernel is supported, then use it.
+ // Compute the base pointers we'll use in the tile.
+ auto outptr = output.base + output_channel_start + output_i * output.ld_row + output_j * output.ld_col;
+ const int start_input_i = output_i * args.stride_rows - args.padding.top;
+ const int start_input_j = output_j * args.stride_cols - args.padding.left;
+ auto inptr = input.base + output_channel_start + start_input_i * input.ld_row + start_input_j * input.ld_col;
+
+ auto ld_row = input.ld_row;
+ auto ld_col = input.ld_col;
+
+ const auto tile_rows = this->m_strat->get_output_rows() * args.stride_rows * n_tile_rows + tile_padding.top + tile_padding.bottom;
+ const auto tile_cols = this->m_strat->get_output_cols() * args.stride_cols * n_tile_cols + tile_padding.left + tile_padding.right;
+ const auto tile_channels = output_channel_end - output_channel_start;
+
+ Tile<TInput> multiplied_input;
+ if (this->uses_intermediate_array()) {
+ multiplied_input = Tile<TInput>(ws->intermediate_buffer, tile_rows, tile_cols, tile_channels);
+ multiplied_input.load_from(input.base,
+ input.ld_row, input.ld_col,
+ args.input_rows, args.input_cols,
+ start_input_i, start_input_j, args.channel_multiplier);
+
+ ld_row = tile_cols * tile_channels;
+ ld_col = tile_channels;
+ inptr = multiplied_input.array;
+ }
+
+ // Execute the kernel
+ Invoker::direct(
+ strat, ws, os,
+ n_tile_rows, n_tile_cols,
+ inptr, ld_row, ld_col,
+ outptr, output.ld_row, output.ld_col,
+ parameters, output_channel_end - output_channel_start
+ );
+ }
+ else
+ {
+ // Otherwise, we repeatedly call the padded kernel but use our knowledge
+ // of the tensor structure to avoid recomputing the pointer array.
+
+ const auto n_input_pointers = this->m_strat->get_input_rows() * this->m_strat->get_input_cols();
+ const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols;
+ const auto n_output_pointers = this->m_strat->get_output_rows() * this->m_strat->get_output_cols();
+ const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
+
+ // For each tile row, initialise the input and output pointer arrays. For
+ // each subsequent tile we simply update the pointers.
+ for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
+ {
+ const int input_i = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ int input_j = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
+
+ Tile<TInput> multiplied_input;
+ this->initialise_inptr_array(args, output_channel_start, output_channel_end, input,
+ ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+ input_i, input_j, 0, 0, multiplied_input);
+
+ // Compute the output pointer array
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows,
+ 0, args.output_cols
+ );
+
+ for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
+ {
+ // Invoke the indirect kernel for this tile
+ depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>::indirect(
+ strat, ws, os, parameters, m_bias, output_channel_end - output_channel_start
+ );
+
+ // Progress the pointers
+ if (this->uses_intermediate_array()) {
+ input_j += input_point_stride / input.ld_col;
+ multiplied_input.load_from(input.base,
+ input.ld_row, input.ld_col,
+ args.input_rows, args.input_cols, input_i, input_j, args.channel_multiplier);
+ } else {
+ for (auto i = 0u; i < n_input_pointers; i++)
+ {
+ ws->inptr_array[i] += input_point_stride;
+ }
+ }
+
+ for (auto i = 0u; i < n_output_pointers; i++)
+ {
+ ws->outptr_array[i] += output_point_stride;
+ }
+ }
+
+ output_i += this->m_strat->get_output_rows();
+ }
+ }
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
new file mode 100644
index 0000000000..e2d05560a1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise_depthfirst.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
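+/* Select the signature of the generic depthwise kernel: the quantized
+ * (int32_t accumulator) specialisation takes an arm_gemm::Requantize32
+ * reference in place of the activation clamps.
+ */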
+template <typename TInput, typename TOutput, typename TAccum>
+struct GenericDepthfirstKernelStrategyFunctionType
+{
+ using KernelType = std::function<void(const TInput *const *const, TOutput *const *const, const void *, const void *, const unsigned int, const unsigned int, const TAccum, const TAccum)>;
+};
+
+template <typename TInput, typename TOutput>
+struct GenericDepthfirstKernelStrategyFunctionType<TInput, TOutput, int32_t>
+{
+ using KernelType = std::function<void(const TInput *const *const, TOutput *const *const, const void *, const arm_gemm::Requantize32 &, unsigned int, unsigned int)>;
+};
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+class GenericDepthfirstKernelStrategy
+{
+ unsigned int m_n_output_points;
+ arm_gemm::VLType m_vl_type;
+ unsigned int m_accumulator_depth_vl;
+
+ public:
+ GenericDepthfirstKernelStrategy(unsigned int n_output_points, arm_gemm::VLType vl_type, unsigned int accumulator_depth_vl=1)
+ : m_n_output_points(n_output_points), m_vl_type(vl_type), m_accumulator_depth_vl(accumulator_depth_vl)
+ {
+ }
+
+ virtual ~GenericDepthfirstKernelStrategy() = default;
+
+ virtual arm_gemm::VLType get_vl_type() const { return m_vl_type; }
+ virtual unsigned int get_accumulator_depth_vl() const { return m_accumulator_depth_vl; }
+ virtual unsigned int get_n_output_points() const { return m_n_output_points; }
+
+ using KernelType = typename GenericDepthfirstKernelStrategyFunctionType<TInput, TOutput, TAccum>::KernelType;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class GenericDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ protected:
+ using KernelStrategyType = GenericDepthfirstKernelStrategy<TInput, TWeight, TOutput, TAccum>;
+ std::unique_ptr<KernelStrategyType> m_strategy;
+
+ public:
+ GenericDepthfirstStrategy(
+ KernelStrategyType *strat, unsigned int n_output_rows, unsigned int n_output_cols,
+ const DepthwiseArgs &args
+ )
+ : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>(
+ n_output_rows, n_output_cols,
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols
+ ),
+ m_strategy(strat)
+ {
+ }
+
+ GenericDepthfirstStrategy(GenericDepthfirstStrategy &) = delete;
+  GenericDepthfirstStrategy &operator=(GenericDepthfirstStrategy &) = delete;
+
+ arm_gemm::VLType get_vl_type(void) const override { return m_strategy->get_vl_type(); }
+ unsigned int get_accumulator_depth_vl(void) const override { return m_strategy->get_accumulator_depth_vl(); }
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ false, sizeof(TAccum), this->uses_premultiply(), // Don't pack the bias
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ return interleaves::get_storage_size_generic(packing_args, args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ false, sizeof(TAccum), this->uses_premultiply(), // Don't pack the bias
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ interleaves::pack_parameters_generic(
+ packing_args, args, buffer, biases, weights, ld_weight_col, ld_weight_row);
+ }
+
+ const typename KernelStrategyType::KernelType get_kernel() const { return m_strategy->get_kernel(); }
+};
+
+// Use a templated function to marshal arguments when executing the kernel.
+template <typename OutputStage> struct DepthwiseDepthfirstGenericKernelCall;
+
+template <>
+struct DepthwiseDepthfirstGenericKernelCall<Nothing>
+{
+ template <typename StratType, typename WorkspaceType, typename TAccum>
+ static void execute(
+ const StratType *strat, const WorkspaceType *ws, const Nothing &,
+ const TAccum *bias, const void *params,
+ const unsigned int n_kernel_points, const unsigned int n_output_channels
+ )
+ {
+ strat->get_kernel()(
+ ws->inptr_array,
+ ws->outptr_array,
+ params, bias,
+ n_kernel_points, n_output_channels,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+template <>
+struct DepthwiseDepthfirstGenericKernelCall<arm_gemm::Requantize32>
+{
+ template <typename StratType, typename WorkspaceType>
+ static void execute(
+ const StratType *strat, const WorkspaceType *ws, const arm_gemm::Requantize32 &qp,
+ const int32_t *, const void *params,
+ const unsigned int n_kernel_points, const unsigned int n_output_channels
+ )
+ {
+ strat->get_kernel()(
+ ws->inptr_array,
+ ws->outptr_array,
+ params, qp,
+ n_kernel_points, n_output_channels
+ );
+ }
+};
+
+
+/* Workspace Element for an array of input pointers as consumed by the
+ * "Generic" depthwise kernels.
+ */
+template <typename T>
+class GenericInputArrayElement
+{
+ public:
+ struct Workspace
+ {
+ const T **inptr_array;
+ };
+
+ template <class OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
+ return sizeof(T **) * args.strategy->get_output_rows() * args.strategy->get_output_cols() * kernel_points;
+ }
+
+ template <class WorkspaceType, class OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ ws->inptr_array = reinterpret_cast<const T**>(buffer);
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using StratType = GenericDepthfirstStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ using Parent = DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ using WorkspaceManager = Workspace<
+ OutputArrayElement<TOutput>,
+ GenericInputArrayElement<TInput>,
+ InputBufferElement<TInput>,
+ IntermediateBufferElement<TInput>,
+ ActivationsElement<TAccum, OutputStage>
+ >;
+ using WorkingSpace = typename WorkspaceManager::WorkspaceType;
+ const TAccum *m_bias = nullptr;
+
+ public:
+ DepthwiseDepthfirstGeneric(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os={})
+ : Parent(strat, args, os)
+ {
+ }
+
+ DepthwiseDepthfirstGeneric(DepthwiseDepthfirstGeneric &) = delete;
+ DepthwiseDepthfirstGeneric &operator=(DepthwiseDepthfirstGeneric &) = delete;
+
+ void pack_parameters(
+ void *buffer, const void *biases,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) override
+ {
+ Parent::pack_parameters(buffer, biases, weights, ld_weight_col, ld_weight_row);
+ m_bias = reinterpret_cast<const TAccum *>(biases); // Get a copy of the biases
+ depthwise_depthfirst::stash_bias(this->get_output_stage(), m_bias);
+ }
+
+ size_t get_working_size_per_thread() const override
+ {
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage()));
+ }
+
+ void initialise_working_space(void *buffer) const override
+ {
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage()));
+ }
+
+ protected:
+ void fill_inptr_array(const DepthwiseArgs &args,
+ const TensorSpec<const TInput *> &input,
+ const TInput **inptr_array, TInput *input_buffer,
+ const unsigned int input_i, const unsigned int input_j,
+ const unsigned int input_pad_top, const unsigned int input_pad_left) const override
+ {
+ fill_pointer_array_generic_kernel<const TInput>(
+ inptr_array,
+ this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols,
+ input.base,
+ input.ld_row, input.ld_col,
+ input_buffer,
+ input_pad_top, args.input_rows - input_i,
+ input_pad_left, args.input_cols - input_j
+ );
+ }
+
+ void compute_tile_padded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, unsigned int output_j,
+ unsigned int channel_start, unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ // Get the working space
+ WorkingSpace *ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ Tile<TInput> multiplied_input;
+ this->initialise_inptr_array(args, channel_start, channel_end, input,
+ ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+ input_i, input_j, input_pad_top, input_pad_left, multiplied_input);
+
+ // Compute the output pointer array
+ fill_pointer_array<TOutput>(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Execute the kernel
+ DepthwiseDepthfirstGenericKernelCall<OutputStage>::execute(
+ reinterpret_cast<const StratType *>(this->m_strat.get()), ws,
+ this->get_output_stage(), m_bias, parameters,
+ args.kernel_rows * args.kernel_cols,
+ channel_end - channel_start
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
new file mode 100644
index 0000000000..b93caa2aaa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise_depthfirst.hpp"
+#include "interleaves/generic_quantized_dot_product.hpp"
+
+#include <limits>
+
+namespace arm_conv {
+namespace depthwise {
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+class DepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>;
+
+ protected:
+ virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
+ {
+ return interleaves::PackingArguments(
+ args.kernel_rows, args.kernel_cols, sizeof(TWeight),
+ true, sizeof(TAccum), this->uses_premultiply(),
+ this->get_vl_type(),
+ sizeof(TAccum), 1,
+ [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
+ {
+ if (pos < args.kernel_rows * args.kernel_cols)
+ {
+ y = pos % args.kernel_cols;
+ x = pos / args.kernel_cols;
+ return true;
+ }
+ return false;
+ }
+ );
+ }
+
+ bool uses_premultiply() const override {
+ return false;
+ }
+
+ public:
+ using Parent::Parent;
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
+ }
+
+ void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const Nothing &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
+ {
+ interleaves::pack_parameters_generic(
+ this->get_packing_args(args), args,
+ buffer, biases, weights, ld_weight_col, ld_weight_row
+ );
+ }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const void *, // Ravelled bias, weights, and quantization parameters
+ unsigned int, // # output channels
+ TAccum, TAccum // Min and max activation clamps
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t> : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
+
+ public:
+ using Parent::Parent;
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::quantized::get_storage_size(args, this->get_vl_type(), this->get_accumulator_depth_vl());
+ }
+
+ void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
+ {
+ interleaves::quantized::pack_parameters<TWeight>(
+ buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row,
+ args, qp, this->get_vl_type(), this->get_accumulator_depth_vl()
+ );
+ }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const void *, // Ravelled bias, weights, and quantization parameters
+ unsigned int, // # output channels
+ const arm_gemm::Requantize32 &
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+class GenericDepthfirstMultiplierKernelStrategy
+{
+ const arm_gemm::VLType m_vl_type;
+ const unsigned int m_output_rows, m_output_cols;
+
+ public:
+ GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
+ : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
+ {
+ }
+
+ virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;
+
+ arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
+ unsigned int get_output_rows(void) const { return m_output_rows; }
+ unsigned int get_output_cols(void) const { return m_output_cols; }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const TWeight *, // Ravelled weight parameters
+ const TAccum *, // Bias,
+ unsigned int, unsigned int, // Number of kernel points, number of output channels
+ TAccum, TAccum // Activation minimum and maximum
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+class GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, int32_t>
+{
+ const arm_gemm::VLType m_vl_type;
+ const unsigned int m_output_rows, m_output_cols;
+
+ public:
+ GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
+ : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
+ {
+ }
+
+ virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;
+
+ arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
+ unsigned int get_output_rows(void) const { return m_output_rows; }
+ unsigned int get_output_cols(void) const { return m_output_cols; }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const TWeight *, // Ravelled weight parameters
+ const int32_t *, // Bias,
+ unsigned int, unsigned int, // Number of kernel points, number of output channels
+ const int32_t *, const int32_t *, const int32_t *, // Per-channel left-shifts, multipliers, right-shifts (need to account for start channel)
+ const arm_gemm::Requantize32 &
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class GenericDepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using KernelStrategyType = GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, TAccum>;
+ std::unique_ptr<KernelStrategyType> m_kern;
+
+ protected:
+ virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
+ {
+ return interleaves::PackingArguments(
+ args.kernel_rows, args.kernel_cols, sizeof(TWeight),
+ false, sizeof(TAccum), this->uses_premultiply(),
+ this->get_vl_type(),
+ sizeof(TAccum), 1,
+ [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
+ {
+ if (pos < args.kernel_rows * args.kernel_cols)
+ {
+ y = pos % args.kernel_cols;
+ x = pos / args.kernel_cols;
+ return true;
+ }
+ return false;
+ }
+ );
+ }
+
+ bool uses_premultiply() const override {
+ return false;
+ }
+
+ public:
+ GenericDepthfirstMultiplierStrategy(KernelStrategyType *kern, const DepthwiseArgs &args)
+ : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>(
+ kern->get_output_rows(), kern->get_output_cols(),
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols
+ ),
+ m_kern(kern)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type(void) const override { return m_kern->get_vl_type(); }
+ typename KernelStrategyType::KernelType get_kernel(void) const { return m_kern->get_kernel(); }
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
+ }
+
+ void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
+ {
+ interleaves::pack_parameters_generic(
+ this->get_packing_args(args), args,
+ buffer, biases, weights, ld_weight_col, ld_weight_row
+ );
+ }
+};
+
+// Specialise elements of the wrapper based on the type of kernel.
+namespace depthfirst_multiplier {
+
+/* Working space element which contains a pointer for each row of input, a row
+ * of padding, and a space which can be used to construct an NCHW-ordered patch
+ * of input.
+ */
+template <typename T, bool IsGeneric=false, typename OutputStage=Nothing>
+class InputPatchElement
+{
+ public:
+ struct Workspace
+ {
+ constexpr static bool InputPatchIsGeneric = IsGeneric;
+ const T **input_rows;
+ T *input_padding;
+ T *input_patch;
+ };
+
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof_input_rows(args) + sizeof_input_padding(args) + sizeof_input_patch(args);
+ }
+
+ template <class WorkspaceType>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ auto buffer_bytes = reinterpret_cast<char *>(buffer);
+
+ ws->input_rows = reinterpret_cast<const T **>(buffer_bytes);
+ buffer_bytes += sizeof_input_rows(args);
+
+ ws->input_padding = reinterpret_cast<T*>(buffer_bytes);
+ buffer_bytes += sizeof_input_padding(args);
+
+ ws->input_patch = reinterpret_cast<T*>(buffer_bytes);
+ buffer_bytes += sizeof_input_patch(args);
+
+ // Initialise the padding
+ memset(ws->input_padding,
+ get_input_buffer_fill_value(args.output_stage),
+ sizeof_input_padding(args));
+
+ return buffer_bytes;
+ }
+
+ protected:
+ static size_t sizeof_input_rows(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ if (IsGeneric)
+ {
+ return sizeof(T *) * args.strategy->get_output_rows() * args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
+ }
+ else
+ {
+ return sizeof(T *) * args.strategy->get_input_rows();
+ }
+ }
+
+ static size_t sizeof_input_padding(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ // Round up the number of columns to a whole number of QUADS
+ auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
+ return sizeof(T) * input_cols;
+ }
+
+ static size_t sizeof_input_patch(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ if (IsGeneric)
+ {
+ // Round up the number of columns to a whole number of QUADS
+ auto output_cols = arm_gemm::roundup<size_t>(args.strategy->get_output_cols(), 16 / sizeof(T));
+ const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
+ return sizeof(T) * kernel_points * args.strategy->get_output_rows() * output_cols;
+ }
+ else
+ {
+ // Round up the number of columns to a whole number of QUADS
+ auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
+ return sizeof(T) * args.strategy->get_input_rows() * input_cols;
+ }
+ }
+};
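+
+// Worked sizing example (illustrative, assuming 64-bit pointers): for a
+// non-generic fp32 strategy with 6 input rows and 6 input columns,
+//   sizeof_input_rows    = sizeof(float *) * 6              = 48 bytes
+//   sizeof_input_padding = sizeof(float) * roundup(6, 16/4) = 32 bytes
+//   sizeof_input_patch   = sizeof(float) * 6 * 8            = 192 bytes
+// so get_element_size() returns 272 bytes, and initialise() carves the three
+// regions out of the buffer in exactly that order.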
+
+template <bool IsGeneric, typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+struct StrategyType
+{
+ using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const OutputStage &, const unsigned int,
+ const void *parameters, const void *
+ )
+ {
+ strat->get_kernel()(
+ ws->input_rows,
+ ws->outptr_array,
+ parameters, args.channel_multiplier,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+struct StrategyType<true, TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const OutputStage &, const unsigned int start_output_channel,
+ const void *parameters, const void *bias
+ )
+ {
+ strat->get_kernel()(
+ ws->input_rows, ws->outptr_array,
+ reinterpret_cast<const TWeight *>(parameters),
+ bias == nullptr ? nullptr : reinterpret_cast<const TAccum *>(bias) + start_output_channel,
+ strat->get_kernel_rows() * strat->get_kernel_cols(),
+ args.channel_multiplier,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+struct StrategyType<false, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const arm_gemm::Requantize32 &qp, const unsigned int,
+ const void *parameters, const void *
+ )
+ {
+ strat->get_kernel()(
+ ws->input_rows,
+ ws->outptr_array,
+ parameters, args.channel_multiplier,
+ qp
+ );
+ }
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+struct StrategyType<true, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const arm_gemm::Requantize32 &qp, const unsigned int start_output_channel,
+ const void *parameters, const void *
+ )
+ {
+ auto get_ptr = [start_output_channel] (const int32_t *ptr) -> const int32_t *
+ {
+ return ptr == nullptr ? nullptr : ptr + start_output_channel;
+ };
+
+ strat->get_kernel()(
+ ws->input_rows, ws->outptr_array,
+ reinterpret_cast<const TWeight *>(parameters),
+ get_ptr(qp.bias),
+ strat->get_kernel_rows() * strat->get_kernel_cols(),
+ args.channel_multiplier,
+ get_ptr(qp.per_channel_left_shifts),
+ get_ptr(qp.per_channel_muls),
+ get_ptr(qp.per_channel_right_shifts),
+ qp
+ );
+ }
+};
+
+template <bool IsGeneric> struct PrepareInputSample;
+
+template <> struct PrepareInputSample<false>
+{
+ template <typename WorkspaceType, typename StrategyType, typename T>
+ static void execute(
+ const DepthwiseArgs &, WorkspaceType *ws, const StrategyType *strat,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ const unsigned int input_pad_top, const unsigned int valid_rows,
+ const unsigned int input_pad_left, const unsigned int valid_cols
+ )
+ {
+ fill_nchw_patch_array(
+ ws->input_rows, ws->input_patch, strat->get_input_rows(), strat->get_input_cols(),
+ base_ptr, ld_row, ld_col,
+ ws->input_padding,
+ input_pad_top, valid_rows,
+ input_pad_left, valid_cols
+ );
+ }
+};
+
+template <> struct PrepareInputSample<true>
+{
+ template <typename WorkspaceType, typename StrategyType, typename T>
+ static void execute(
+ const DepthwiseArgs &args, WorkspaceType *ws, const StrategyType *strat,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ const unsigned int input_pad_top, const unsigned int valid_rows,
+ const unsigned int input_pad_left, const unsigned int valid_cols
+ )
+ {
+ fill_patch_array_generic_kernel(
+ ws->input_rows, ws->input_patch,
+ strat->get_output_rows(), strat->get_output_cols(),
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols,
+ base_ptr, ld_row, ld_col,
+ ws->input_padding,
+ input_pad_top, valid_rows,
+ input_pad_left, valid_cols
+ );
+ }
+};
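+
+// In short: the non-generic path fills one NCHW patch shaped for the
+// strategy's fixed input window, while the generic path fills one patch per
+// kernel point (kernel_rows * kernel_cols of them), matching the pointer
+// array and patch sizes computed by InputPatchElement above.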
+
+} // namespace depthfirst_multiplier
+
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ bool is_generic=false,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirstMultiplier : public DepthfirstDriver<TInput, TWeight, TOutput>
+{
+ protected:
+ using StratType = typename depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
+ using WorkspaceManager = Workspace<
+ OutputArrayElement<TOutput>,
+ depthfirst_multiplier::InputPatchElement<TInput, is_generic, OutputStage>,
+ ActivationsElement<TOutput, OutputStage>
+ >;
+ using WorkingSpace = typename WorkspaceManager::WorkspaceType;
+
+ OutputStage m_os; // Copy of the output parameters
+ const void *m_bias = nullptr; // Copy of the bias (should we need it)
+
+ bool uses_premultiply() const override {
+ return false;
+ }
+
+ public:
+ DepthwiseDepthfirstMultiplier(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
+ : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os)
+ {
+ }
+
+ DepthwiseDepthfirstMultiplier(DepthwiseDepthfirstMultiplier &) = delete;
+ DepthwiseDepthfirstMultiplier &operator=(DepthwiseDepthfirstMultiplier &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ return reinterpret_cast<const StratType *>(this->m_strat.get())
+ ->get_storage_size(this->m_args);
+ }
+
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ reinterpret_cast<const StratType *>(this->m_strat.get())
+ ->pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
+ m_bias = biases;
+ depthwise_depthfirst::stash_bias(m_os, biases);
+ }
+
+ size_t get_working_size_per_thread() const override
+ {
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
+ }
+
+ void initialise_working_space(void *buffer) const override
+ {
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
+ }
+
+ void compute_tile_padded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ // Get the working space
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ // Compute the output pointer array. We'll update this array after every
+ // invocation of the kernel.
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Compute the parameter stride
+ DepthwiseArgs single_iter(args);
+ single_iter.input_channels = 1;
+ const size_t parameter_stride = reinterpret_cast<const StratType *>(this->m_strat.get())
+ ->get_storage_size(single_iter);
+
+ for (; output_channel_start < output_channel_end;
+ output_channel_start += args.channel_multiplier)
+ {
+ // Compute the input pointer array
+ const auto input_channel = output_channel_start / args.channel_multiplier;
+
+ // Construct the input patch
+ depthfirst_multiplier::PrepareInputSample<is_generic>::execute(
+ args, ws, this->m_strat.get(),
+ input.base + input_channel + input_i*input.ld_row + input_j*input.ld_col, input.ld_row, input.ld_col,
+ input_pad_top, args.input_rows - input_i,
+ input_pad_left, args.input_cols - input_j
+ );
+
+ // Execute the kernel
+ depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::execute(
+ args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start,
+ parameters, m_bias
+ );
+
+ // Update the output pointers
+ for (unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++)
+ {
+ ws->outptr_array[n] += args.channel_multiplier;
+ }
+
+ // Progress the parameters
+ parameters = reinterpret_cast<const char *>(parameters) + parameter_stride;
+ }
+ }
+};
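+
+// Worked example (illustrative): with channel_multiplier == 4 and output
+// channels [0, 16), compute_tile_padded() above invokes the kernel four
+// times, once per input channel, producing output channels {0..3}, {4..7},
+// {8..11} and {12..15}; after each call the output pointers advance by the
+// channel multiplier and the packed parameters advance by the storage size
+// of a single input channel.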
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
new file mode 100644
index 0000000000..8fef6f8ae0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+// This can only be built if the target/compiler supports FP16 arguments.
+#if defined(__ARM_FP16_ARGS)
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#include "kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+#if defined(__aarch64__)
+#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ bool prefer_premultiply(const DepthwiseArgs &args)
+ {
+ if ((args.stride_rows != args.stride_cols) || (args.kernel_rows != args.kernel_cols))
+ {
+ return false;
+ }
+
+ unsigned int threshold;
+
+ if (args.stride_rows == 1 && args.kernel_rows == 3)
+ {
+ threshold = 30;
+ }
+ else if (args.stride_rows == 1 && args.kernel_rows == 5)
+ {
+ threshold = 31;
+ }
+ else if (args.stride_rows == 2 && args.kernel_rows == 3)
+ {
+ threshold = 11;
+ }
+ else if (args.stride_rows == 2 && args.kernel_rows == 5)
+ {
+ threshold = 19;
+ }
+ else
+ {
+ return false;
+ }
+
+ return args.channel_multiplier <= threshold;
+ }
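+
+ // Worked example (illustrative): a 3x3 stride-1 fp16 depthwise convolution
+ // with channel multiplier 8 gives 8 <= 30, so premultiplication is
+ // preferred; with multiplier 64 the threshold is exceeded and the
+ // multiplier-specific kernels are used instead.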
+
+ template <class Strategy>
+ unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ if (args.channel_multiplier > 1 && !prefer_premultiply(args))
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
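+
+ // Worked example (illustrative): a 112x112 output over 64 channels with no
+ // multiplier, using a 4x4-tile fp16 strategy and 128-bit vectors (8 fp16
+ // lanes), scores roundup(112,4) * roundup(112,4) * iceildiv(64,8)
+ // = 112 * 112 * 8 = 100352, a relative cost proxy rather than actual cycles.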
+
+ template <class Strategy>
+ unsigned int planar_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ args.output_cols *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
+ unsigned int multiplier_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ return prefer_premultiply(args) ? std::numeric_limits<unsigned int>::max() : 0;
+ }
+
+ unsigned int not_preferred(const DepthwiseArgs &, const Nothing &) __attribute__ ((unused));
+ unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+#endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__)
+} // namespace
+
+static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = {
+#if defined(__aarch64__)
+#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_generic_output3x3_mla_depthfirst",
+ constraint(cpu_has_fp16),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto kern = new a64_fp16_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<__fp16>(kern, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint(cpu_has_fp16, has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto kern = new a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<__fp16>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<__fp16, __fp16, __fp16, __fp16, true>(strat, args);
+ },
+ },
+#endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+template <>
+const DepthwiseImplementation<__fp16> *depthwise_implementation_list()
+{
+ return depthwise_fp16_methods;
+}
+
+template UniqueDepthwiseCommon<__fp16> depthwise(const DepthwiseArgs &, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<__fp16>(const DepthwiseArgs &, const Nothing &);
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
new file mode 100644
index 0000000000..760328f3ba
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
@@ -0,0 +1,539 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#include "interleaves/list.hpp"
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp"
+#include "kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp"
+
+#include "kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp"
+#include "kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp"
+#include "kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp"
+#include "kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp"
+
+#include "kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+#if defined(__aarch64__)
+ bool prefer_premultiply(const DepthwiseArgs &args)
+ {
+ if ((args.stride_rows != args.stride_cols) || (args.kernel_rows != args.kernel_cols))
+ {
+ return false;
+ }
+
+ unsigned int threshold;
+
+ if (args.stride_rows == 1 && args.kernel_rows == 3)
+ {
+ threshold = 18;
+ }
+ else if (args.stride_rows == 1 && args.kernel_rows == 5)
+ {
+ threshold = 5;
+ }
+ else if (args.stride_rows == 2 && args.kernel_rows == 3)
+ {
+ threshold = 5;
+ }
+ else if (args.stride_rows == 2 && args.kernel_rows == 5)
+ {
+ threshold = 12;
+ }
+ else
+ {
+ return false;
+ }
+
+ return args.channel_multiplier <= threshold;
+ }
+
+ template <class Strategy>
+ unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ if (args.channel_multiplier > 1 && !prefer_premultiply(args))
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
+ template <class Strategy>
+ unsigned int planar_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ args.output_cols *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
+ template <class Strategy>
+ unsigned int fast_mode_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ ) * 2 / 3;
+ }
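+
+ // Note: this is 2/3 of the corresponding cycle_estimate, i.e. kernels
+ // taking the fast (reduced-precision accumulation) path are assumed to be
+ // roughly 1.5x faster than their plain fp32 counterparts.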
+
+ unsigned int multiplier_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ return prefer_premultiply(args) ? std::numeric_limits<unsigned int>::max() : 0;
+ }
+
+ unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+
+ bool fast_mode_enabled(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+ bool fast_mode_enabled(const DepthwiseArgs &args, const void *)
+ {
+ return args.fast_mode;
+ }
+#endif // defined(__aarch64__)
+} // namespace
+
+static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za",
+ constraint(fast_mode_enabled,
+ cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za",
+ constraint(fast_mode_enabled,
+ cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za",
+ constraint(fast_mode_enabled,
+ cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za",
+ constraint(fast_mode_enabled,
+ cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32_planar_3x3_s1_4rows_mla_za",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_planar_3x3_s1_4rows_mla_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ [] (const DepthwiseArgs &args, const Nothing &os) -> unsigned int {
+ // Heuristic: don't prefer this kernel unless the input plane is greater
+ // than the number of channels.
+ if (args.input_rows * args.input_cols < args.input_channels)
+ return UINT32_MAX;
+
+ return planar_cycle_estimate<sme2_fp32_planar_3x3_s1_4rows_mla_za>(args, os);
+ },
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_planar_3x3_s1_4rows_mla_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32_planar_3x3_s2_4rows_mla_za",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ planar_cycle_estimate<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_planar_3x3_s2_4rows_mla_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32_planar_5x5_s1_4rows_mla_za",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_planar_5x5_s1_4rows_mla_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_planar_5x5_s1_4rows_mla_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32_planar_5x5_s2_4rows_mla_za",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_planar_5x5_s2_4rows_mla_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_planar_5x5_s2_4rows_mla_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>),
+ cycle_estimate<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>),
+ cycle_estimate<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>),
+ cycle_estimate<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>),
+ cycle_estimate<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_generic_output3x3_mla_depthfirst",
+ constraint(cpu_has_sve),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto kern = new sve_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
+ constraint(is_supported<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
+ cpu_has_sve, has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
+ constraint(is_supported<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
+ cpu_has_sve, has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint(cpu_has_sve, has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto kern = new sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_generic_output3x3_mla_depthfirst",
+ nullptr,
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto kern = new a64_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
+ constraint(is_supported<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
+ has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
+ constraint(is_supported<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
+ has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint(has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto kern = new a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
+ },
+ },
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+template <>
+const DepthwiseImplementation<float> *depthwise_implementation_list()
+{
+ return depthwise_fp32_methods;
+}
+
+template UniqueDepthwiseCommon<float> depthwise(const DepthwiseArgs &, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<float>(const DepthwiseArgs &, const Nothing &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
new file mode 100644
index 0000000000..82821af1e6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <string>
+#include <vector>
+
+using arm_gemm::Nothing;
+
+namespace arm_conv {
+namespace depthwise {
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+struct DepthwiseImplementation
+{
+ const DepthwiseMethod method;
+ const char *name;
+ std::function<bool(const DepthwiseArgs &, const OutputStage &)> is_supported;
+ std::function<uint64_t(const DepthwiseArgs &, const OutputStage &)> cycle_estimate;
+ std::function<DepthwiseCommon<TInput, TWeight, TOutput> *(const DepthwiseArgs &, const OutputStage &)> initialise;
+
+ bool get_is_supported(const DepthwiseArgs &args, const OutputStage &os) const
+ {
+ return (is_supported == nullptr) ? true : is_supported(args, os);
+ }
+
+ uint64_t get_cycle_estimate(const DepthwiseArgs &args, const OutputStage &os) const
+ {
+ return (cycle_estimate == nullptr) ? 0 : cycle_estimate(args, os);
+ }
+
+ DepthwiseCommon<TInput, TWeight, TOutput> *get_instance(const DepthwiseArgs &args, const OutputStage &os) const
+ {
+ auto impl = initialise(args, os);
+ impl->set_name(std::string(name));
+ return impl;
+ }
+};
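+
+// Each depthwise_*.cpp file above defines a table of these entries,
+// schematically:
+//
+//   { DepthwiseMethod::DEPTHFIRST, "kernel_name",
+//     constraint(...),           // is_supported
+//     cycle_estimate<Strategy>,  // cycle_estimate
+//     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * { ... } },
+//
+// with a DepthwiseMethod::DEFAULT entry acting as the end-of-list sentinel.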
+
+/** Get the list of implementations available for a given type combination.
+ *
+ * The returned table is terminated by an entry whose method is
+ * DepthwiseMethod::DEFAULT; it is specialised for each supported type
+ * combination in the corresponding depthwise_*.cpp file.
+ *
+ * \relates DepthwiseImplementation
+ */
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *depthwise_implementation_list();
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+bool find_implementation(
+ const DepthwiseArgs &args,
+ const OutputStage &os,
+ const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> * &selected
+)
+{
+ selected = nullptr;
+ uint64_t best_cycle_estimate = UINT64_MAX;
+
+ const auto *impl = depthwise_implementation_list<TInput, TWeight, TOutput, OutputStage>();
+ for (; impl->method != DepthwiseMethod::DEFAULT; impl++)
+ {
+ const bool has_cfg = (args.config != nullptr);
+ const auto &cfg = args.config;
+
+ if (
+ !impl->get_is_supported(args, os) || // Problem is unsupported
+ (has_cfg && cfg->method != DepthwiseMethod::DEFAULT && cfg->method != impl->method) || // Method does not match the configuration
+ (has_cfg && cfg->filter != "" && !std::strstr(impl->name, cfg->filter.c_str())) // Name does not match the filter string
+ )
+ {
+ continue;
+ }
+
+ const auto cycle_estimate = impl->get_cycle_estimate(args, os);
+
+ if (cycle_estimate == 0)
+ {
+ selected = impl;
+ break;
+ }
+
+ if (selected == nullptr || cycle_estimate < best_cycle_estimate)
+ {
+ selected = impl;
+ best_cycle_estimate = cycle_estimate;
+ }
+ }
+
+ return (selected != nullptr);
+}
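+
+// Selection sketch (illustrative): given three supported candidates with
+// cycle estimates {40, 0, 25}, the loop above returns the second candidate
+// immediately, since an estimate of 0 (including a null cycle_estimate
+// function) marks an implementation as unconditionally preferred; otherwise
+// the lowest estimate, here 25, would win.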
+
+template <typename TInput, typename TWeight, typename TOutput, class OutputStage>
+std::vector<KernelDescription> get_compatible_kernels(const DepthwiseArgs &args, const OutputStage &os)
+{
+ std::vector<KernelDescription> kerns;
+
+ // Find the default implementation so we can flag it accordingly
+ const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *default_impl;
+ find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, default_impl);
+
+ for (auto impl = depthwise_implementation_list<TInput, TWeight, TOutput, OutputStage>();
+ impl->method != DepthwiseMethod::DEFAULT; impl++)
+ {
+ if (!impl->get_is_supported(args, os))
+ {
+ continue;
+ }
+
+ kerns.emplace_back(
+ impl->method, impl->name, impl == default_impl,
+ impl->get_cycle_estimate(args, os)
+ );
+ }
+
+ return kerns;
+}
+
+template <typename TInput, typename TWeight, typename TOutput, class OutputStage>
+UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &args, const OutputStage &os)
+{
+ const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *impl = nullptr;
+ const bool success = find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, impl);
+ return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(success ? impl->get_instance(args, os) : nullptr);
+}
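+
+// Typical use, as a sketch: assuming the float kernel list provided
+// elsewhere in this patch is linked in and `args` has been populated by
+// the caller,
+//
+//   auto dwc = depthwise<float, float, float>(args, Nothing());
+//   if (dwc != nullptr)
+//   {
+//     // pack parameters and working space, then execute
+//   }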
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
new file mode 100644
index 0000000000..15064aeedc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Utilities for constructing functions which constrain which kernels are
+ * selected for a given depthwise problem.
+ *
+ * It is expected that this will be included in the files which list the
+ * available kernels. To avoid multiple definitions, an anonymous namespace is
+ * used.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "depthwise.hpp"
+
+namespace arm_conv
+{
+namespace depthwise
+{
+namespace
+{
+
+template <class OutputStage>
+using ConstraintFn = std::function<bool(const DepthwiseArgs &, const OutputStage &)>;
+
+using GenericConstraintFn = std::function<bool(const DepthwiseArgs &, const void *)>;
+
+GenericConstraintFn make_constraint(const GenericConstraintFn &f) __attribute__ ((unused));
+GenericConstraintFn make_constraint(const GenericConstraintFn &f)
+{
+ return f;
+}
+
+template <typename ... Fs>
+GenericConstraintFn make_constraint(const GenericConstraintFn &f, Fs ... fs)
+{
+ return [f, fs...] (const DepthwiseArgs &args, const void *os) -> bool {
+ return f(args, os) && make_constraint(fs...)(args, os);
+ };
+}
+
+template <typename OutputStage=Nothing, typename ... Fs>
+ConstraintFn<OutputStage> constraint(Fs ... fs)
+{
+ return [fs...] (const DepthwiseArgs &args, const OutputStage &os) -> bool {
+ return make_constraint(fs...)(args, &os);
+ };
+}
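+
+// For example, the kernel lists in this patch build entries such as
+//
+//   constraint<Requantize32>(cpu_has_dot_product, has_no_channel_multiplier)
+//
+// which accepts a problem only if every listed predicate holds; the output
+// stage is forwarded to each predicate through a type-erased void pointer.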
+
+// Some useful constraints
+template <class Strategy>
+bool is_supported(const DepthwiseArgs &args, const void *)
+{
+ return ((args.kernel_rows == Strategy::kernel_rows) &&
+ (args.kernel_cols == Strategy::kernel_cols) &&
+ (args.stride_rows == Strategy::stride_rows) &&
+ (args.stride_cols == Strategy::stride_cols));
+}
+
+bool cpu_has_dot_product(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_dot_product(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_dotprod();
+}
+
+bool cpu_has_sme(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_sme(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_sme();
+}
+
+bool cpu_has_sme2(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_sme2(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_sme2();
+}
+
+bool cpu_has_sve(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_sve(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_sve();
+}
+
+bool cpu_has_sve2(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_sve2(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_sve2();
+}
+
+bool cpu_has_fp16(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_fp16(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_fp16();
+}
+
+bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *)
+{
+ return args.channel_multiplier == 1;
+}
+
+bool has_channel_multiplier(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool has_channel_multiplier(const DepthwiseArgs &args, const void *)
+{
+ return args.channel_multiplier > 1;
+}
+
+// Planar kernels require a "priming" step before the main processing loop.
+// The kernels can prime with left padding or input data, but not with right
+// padding - which would be needed in some extreme cases, such as a 5x5
+// kernel applied to an input of width 1 with padding of 2. Such cases are
+// rare enough, and can be handled by other kernels anyway, so this
+// constraint simply filters them out.
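+// (For example, kernel_cols = 5, input_cols = 1 and padding.left = 2 gives
+// (1 + 2) < (5 - 1), so exactly this extreme case is rejected.)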
+bool no_prime_right_pad(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool no_prime_right_pad(const DepthwiseArgs &args, const void *)
+{
+ return (args.input_cols + args.padding.left) >= (args.kernel_cols - 1);
+}
+
+bool qp_has_no_left_shift(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+bool qp_has_no_left_shift(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return qp->per_channel_requant ?
+ (qp->per_channel_left_shifts == nullptr) :
+ (qp->per_layer_left_shift == 0);
+}
+
+bool qp_zero_a_offset(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+bool qp_zero_a_offset(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return qp->a_offset == 0;
+}
+
+template <typename T> bool qp_skip_clamp(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+template <typename T> bool qp_skip_clamp(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return (qp->minval == std::numeric_limits<T>::min() &&
+ qp->maxval == std::numeric_limits<T>::max());
+}
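+
+// e.g. qp_skip_clamp<uint8_t> holds when minval == 0 and maxval == 255:
+// the clamp spans the whole representable range and can be elided.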
+
+} // namespace
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
new file mode 100644
index 0000000000..c3daaf04fe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthfirst_driver.hpp"
+#include "interleaves/generic.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+template <typename OutputStage>
+class IPlanarStrategy
+{
+ public:
+ virtual ~IPlanarStrategy() = default;
+ virtual unsigned int get_output_rows(void) const = 0;
+ virtual arm_gemm::VLType get_vl_type(void) const = 0;
+
+ virtual size_t get_storage_size(const DepthwiseArgs &) const = 0;
+ virtual void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const = 0;
+};
+
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
+ typename OutputStage>
+struct PlanarKernelType;
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
+{
+ typedef void (*Type)(
+ const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *, const TAccum *,
+ TOutput **, const size_t *, const size_t *, unsigned int output_cols,
+ unsigned int start_channels, unsigned int valid_channels,
+ TAccum act_min, TAccum act_max
+ );
+
+ template <typename WorkspaceType>
+ static inline void execute(
+ const Type fn,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const TAccum *bias,
+ TOutput **outptrs, const size_t *outlds, const size_t *outvllds, unsigned int output_cols,
+ unsigned int start_channel, unsigned int valid_channels,
+ const Nothing &, const WorkspaceType *ws
+ )
+ {
+ fn(
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows,
+ pad_left, valid_input_cols,
+ weights, bias,
+ outptrs, outlds, outvllds, output_cols,
+ start_channel, valid_channels,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ typedef void (*Type)(
+ const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *,
+ TOutput **, const size_t *, const size_t *, unsigned int output_cols,
+ unsigned int start_channel, unsigned int valid_channels,
+ const arm_gemm::Requantize32 &
+ );
+
+ template <typename WorkspaceType>
+ static inline void execute(
+ const Type fn,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const int32_t *,
+ TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
+ unsigned int first_channel, unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp, const WorkspaceType *
+ )
+ {
+ fn(
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows,
+ pad_left, valid_input_cols,
+ weights,
+ outptrs, outlds, outldvls, output_cols,
+ first_channel, valid_channels,
+ qp
+ );
+ }
+};
+
+
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TOutput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class PlanarStrategy : public IPlanarStrategy<OutputStage>
+{
+ unsigned int m_kernel_rows, m_kernel_cols;
+ unsigned int m_stride_rows, m_stride_cols;
+ unsigned int m_output_rows;
+ arm_gemm::VLType m_vl_type;
+
+ protected:
+ virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
+ {
+ // Get the kernel point to pack at the given index; return false to
+ // indicate that this index (and all greater indices) is out of range.
+ if (m_kernel_rows * m_kernel_cols <= index)
+ return false;
+
+ y = index % m_kernel_cols;
+ x = index / m_kernel_cols;
+ return true;
+ }
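+
+ // e.g. for a 3x3 kernel this sweeps the weights in row-major order:
+ // index 4 maps to x = 1 (the row) and y = 1 (the column).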
+
+ virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const
+ {
+ return interleaves::PackingArguments(
+ m_kernel_rows, m_kernel_cols, sizeof(TWeight),
+ false, sizeof(TAccum), true, // Don't pack the bias
+ m_vl_type, sizeof(TAccum), 1, // Accumulator depth of 1 (TODO)
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ }
+
+ public:
+ PlanarStrategy(
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ unsigned int output_rows,
+ arm_gemm::VLType vl_type
+ ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
+ m_stride_rows(stride_rows), m_stride_cols(stride_cols),
+ m_output_rows(output_rows), m_vl_type(vl_type)
+ {
+ }
+
+ unsigned int get_output_rows(void) const override { return m_output_rows; }
+ arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; }
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleaves::pack_parameters_generic(
+ this->get_kernel_packing_arguments(), args,
+ buffer, biases, weights, ld_weight_col, ld_weight_row
+ );
+ }
+
+ using KernelType = typename PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+namespace {
+
+template <typename T>
+struct OutputRowPtrsElement
+{
+ struct Workspace
+ {
+ T **output_row_ptrs;
+ size_t *output_ld_cols;
+ size_t *output_ld_vls; // Stride between vectors of channels
+ T *output_padding_buffer;
+ };
+
+ template <typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
+ {
+ // We need one pointer and two strides (column stride and vector stride)
+ // for each row of output, plus an additional blob of memory into which
+ // padded stores can be directed.
+ return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) +
+ get_vector_length<char>(args.strategy->get_vl_type());
+ }
+
+ template <typename WorkspaceType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer,
+ const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
+ {
+ const auto n_rows = args.strategy->get_output_rows();
+ ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
+ ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
+ ws->output_ld_vls = ws->output_ld_cols + n_rows;
+ ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
+ return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());
+ }
+};
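+
+// For example, with four output rows and a 16-byte vector length, the
+// element above lays out, in order: 4 output-row pointers, 4 column
+// strides, 4 vector strides, then a 16-byte padding buffer.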
+
+} // namespace {anonymous}
+
+
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TOutput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
+{
+ using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
+ using StrategyType = IPlanarStrategy<OutputStage>;
+ using WorkspaceManager = Workspace<
+ OutputRowPtrsElement<TOutput>,
+ ActivationsElement<TAccum, OutputStage>
+ >;
+ using WorkspaceType = typename WorkspaceManager::WorkspaceType;
+
+ std::unique_ptr<StrategyType> m_strat;
+ const TAccum *m_bias;
+ OutputStage m_os;
+
+ public:
+ DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
+ : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os)
+ {
+ }
+
+ DepthwisePlanar(DepthwisePlanar &) = delete;
+ DepthwisePlanar &operator=(DepthwisePlanar &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ return m_strat->get_storage_size(this->m_args);
+ }
+
+ void pack_parameters(
+ void *buffer, const void *biases,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) override
+ {
+ m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);
+ this->m_bias = reinterpret_cast<const TAccum *>(biases);
+ depthwise_depthfirst::stash_bias(this->m_os, biases);
+ }
+
+ size_t get_working_size(unsigned int n_threads) const override
+ {
+ return this->get_working_size_per_thread() * n_threads;
+ }
+
+ protected:
+ /* Compute the amount of working space required for a single thread. */
+ virtual size_t get_working_size_per_thread(void) const
+ {
+ return WorkspaceManager::get_sizeof_workspace(
+ WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os));
+ }
+
+ /* Initialise the working space for a thread. */
+ virtual void initialise_working_space(void *buffer) const
+ {
+ WorkspaceManager::initialise(
+ buffer,
+ WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
+ );
+ }
+
+ /* Execute the kernel for a given chunk of work. */
+ virtual void execute_kernel(
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const TAccum *bias,
+ TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl,
+ unsigned int valid_output_rows, unsigned int valid_output_cols,
+ unsigned int first_channel, unsigned int valid_channels,
+ WorkspaceType *ws
+ ) const
+ {
+ // Initialise the output pointers
+ for (auto i = 0u; i < m_strat->get_output_rows(); i++)
+ {
+ // Point at the output tensor for all valid rows; otherwise point at the
+ // padding buffer.
+ ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
+ ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;
+ ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
+ outptr += ld_out_row;
+ }
+
+ // Execute the kernel
+ PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>(
+ reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(),
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows, pad_left, valid_input_cols,
+ weights, bias,
+ ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
+ valid_output_cols, first_channel, valid_channels,
+ this->m_os, ws
+ );
+ }
+
+ void execute_internal(
+ const DepthwiseArgs &args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads
+ ) const override
+ {
+ // Get and initialise the working space for this thread.
+ void *thread_working_space =
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
+ this->initialise_working_space(thread_working_space);
+ auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);
+
+ const auto n_output_channels = args.input_channels * args.channel_multiplier;
+ const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());
+
+ // Get typed pointers
+ auto input_batch = reinterpret_cast<const TInput *>(input);
+ auto output_batch = reinterpret_cast<TOutput *>(output);
+ auto weights = reinterpret_cast<const TWeight *>(parameters);
+
+ // Iterate over batches
+ for (auto batches = args.n_batches; batches; batches--)
+ {
+ // NOTE: Other loop orderings are possible and it would be worth
+ // investigating them.
+
+ // Within a batch, stripe threads across rows.
+ for (auto start_output_i = thread_id * m_strat->get_output_rows();
+ start_output_i < args.output_rows;
+ start_output_i += n_threads * m_strat->get_output_rows())
+ {
+ // Determine what (if any) padding is required on the top/bottom of
+ // this row of the convolution.
+ const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
+ const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
+ const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
+ const unsigned int valid_input_rows = input_i > args.input_rows ? 0 : args.input_rows - input_i;
+ const unsigned int valid_output_rows = args.output_rows - start_output_i;
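+ // (e.g. with stride_rows = 1 and padding.top = 1, the first output row
+ // gives start_input_i = -1, hence input_pad_top = 1 and input_i = 0.)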
+
+ auto inptr_row = input_batch + input_i*ld_input_row;
+ auto outptr_row = output_batch + start_output_i * ld_output_row;
+
+ // Execute the kernel
+ this->execute_kernel(
+ inptr_row, ld_input_row, ld_input_col, vl,
+ input_pad_top, valid_input_rows, args.padding.left, args.input_cols,
+ weights, this->m_bias,
+ outptr_row, ld_output_row, ld_output_col, vl,
+ valid_output_rows, args.output_cols,
+ 0 /* first channel */, n_output_channels,
+ ws
+ );
+ }
+
+ // Update the input and output pointers to account for batch
+ input_batch += ld_input_batch;
+ output_batch += ld_output_batch;
+ }
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
new file mode 100644
index 0000000000..6ecdc36bf0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp"
+#include "kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+#if defined(__aarch64__)
+bool qp_weights_are_symmetric(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return qp->b_offset == 0;
+}
+
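+// A cycle "estimate" used to mark an implementation as a last resort: the
+// maximal cost means it is selected only when nothing cheaper is viable.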
+uint64_t not_preferred(const DepthwiseArgs &, const Requantize32 &)
+{
+ return std::numeric_limits<uint64_t>::max();
+}
+#endif // defined(__aarch64__)
+} // namespace
+
+static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depthwise_s8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_s8q_planar_3x3_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_s8q_planar_3x3_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sme2_s8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_s8q_planar_3x3_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_s8q_planar_3x3_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sme2_s8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_s8q_planar_5x5_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_s8q_planar_5x5_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sme2_s8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_s8q_planar_5x5_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_s8q_planar_5x5_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sme2_s8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<int8_t>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ qp_weights_are_symmetric,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_sve2),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_sve2),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_weights_are_symmetric,
+ qp_has_no_left_shift,
+ cpu_has_dot_product),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_dot_product),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_generic_output3x3_mla_depthfirst",
+ nullptr,
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto kernel = new a64_s8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<int8_t>(kernel, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_dot_product),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_dot_product),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint<Requantize32>(has_channel_multiplier),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto kern = new a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<int8_t>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, true>(strat, args, qp);
+ },
+ },
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+template <>
+const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> *depthwise_implementation_list()
+{
+ return depthwise_s8q_methods;
+}
+
+template UniqueDepthwiseCommon<int8_t, int8_t, int8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, int8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp
new file mode 100644
index 0000000000..37892b6963
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthwise_strategies_common.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+unsigned int DepthfirstStrategyUntyped::get_input_rows() const
+{
+ return this->get_kernel_rows() + (this->get_output_rows() - 1) * this->get_stride_rows();
+}
+
+unsigned int DepthfirstStrategyUntyped::get_input_cols() const
+{
+ return this->get_kernel_cols() + (this->get_output_cols() - 1) * this->get_stride_cols();
+}
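+
+// e.g. a 3x3 kernel at stride 1 producing a 2x2 output tile reads a
+// (3 + (2 - 1) * 1) x (3 + (2 - 1) * 1) = 4x4 patch of input.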
+
+unsigned int DepthfirstStrategyUntyped::get_n_input_points() const { return this->get_input_rows() * this->get_input_cols(); }
+unsigned int DepthfirstStrategyUntyped::get_n_output_points() const { return this->get_output_rows() * this->get_output_cols(); }
+unsigned int DepthfirstStrategyUntyped::get_n_kernel_points() const { return this->get_kernel_rows() * this->get_kernel_cols(); }
+
+bool DepthfirstStrategyUntyped::uses_premultiply() const { return true; }
+
+unsigned int DepthfirstStrategyUntyped::get_accumulator_depth_vl() const { return 1; }
+
+bool DepthfirstStrategyUntyped::get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
+{
+ // Get the kernel point to pack at the given index; return false to
+ // indicate that this index, and all greater indices, is out of range.
+ if (index < (this->get_kernel_cols() * this->get_kernel_rows()))
+ {
+ y = index % this->get_kernel_cols();
+ x = index / this->get_kernel_cols();
+ return true;
+ }
+ return false;
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
new file mode 100644
index 0000000000..19cf26dd2f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+#include "interleaves/generic.hpp"
+#include "depthfirst_driver.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+class DepthfirstStrategyUntyped : public IDepthfirstStrategy
+{
+ public:
+ virtual arm_gemm::VLType get_vl_type() const = 0;
+
+ virtual unsigned int get_kernel_rows() const = 0;
+ virtual unsigned int get_kernel_cols() const = 0;
+
+ virtual unsigned int get_stride_rows() const = 0;
+ virtual unsigned int get_stride_cols() const = 0;
+
+ virtual unsigned int get_input_rows() const override;
+ virtual unsigned int get_input_cols() const override;
+
+ virtual unsigned int get_n_input_points() const;
+ virtual unsigned int get_n_output_points() const;
+ virtual unsigned int get_n_kernel_points() const;
+
+ virtual bool uses_premultiply() const;
+
+ // Get the number of VLs used in the accumulator; this defaults to 1.
+ virtual unsigned int get_accumulator_depth_vl() const;
+
+ // Get the order in which to pack the weights; this defaults to a
+ // row-major sweep over the weight tensor.
+ virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const;
+};
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+class DepthfirstStrategy : public DepthfirstStrategyUntyped
+{
+ public:
+ virtual size_t get_storage_size(const DepthwiseArgs &args) const
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ true, sizeof(TAccum), this->uses_premultiply(),
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ return interleaves::get_storage_size_generic(packing_args, args);
+ }
+
+ virtual void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ true, sizeof(TAccum), this->uses_premultiply(),
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ interleaves::pack_parameters_generic(
+ packing_args, args, buffer, biases, weights, ld_weight_col, ld_weight_row);
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
new file mode 100644
index 0000000000..236930ee26
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp"
+#include "kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+
+#include "kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+
+#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+
+#endif // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+#if defined(__aarch64__)
+uint64_t not_preferred(const DepthwiseArgs &, const Requantize32 &)
+{
+ return std::numeric_limits<uint64_t>::max();
+}
+#endif // defined(__aarch64__)
+} // namespace
+
+static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8q_planar_3x3_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8q_planar_3x3_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sme2_u8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8q_planar_3x3_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8q_planar_3x3_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sme2_u8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8q_planar_5x5_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8q_planar_5x5_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sme2_u8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8q_planar_5x5_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8q_planar_5x5_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sme2_u8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_sve2),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_sve2),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ cpu_has_dot_product,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_zero_a_offset,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_zero_a_offset,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_zero_a_offset,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_generic_output3x3_mla_depthfirst",
+ nullptr,
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto kernel = new a64_u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<uint8_t>(kernel, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ cpu_has_dot_product,
+ has_channel_multiplier,
+ qp_has_no_left_shift),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ cpu_has_dot_product,
+ has_channel_multiplier,
+ qp_has_no_left_shift),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint<Requantize32>(has_channel_multiplier),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto kern = new a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<uint8_t>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, true>(strat, args, qp);
+ },
+ },
+
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
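+// Expose the list above to the generic kernel-selection machinery declared in
+// depthwise_implementation.hpp.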
+template <>
+const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> *depthwise_implementation_list()
+{
+ return depthwise_u8q_methods;
+}
+
+template UniqueDepthwiseCommon<uint8_t, uint8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
new file mode 100644
index 0000000000..a888958b76
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp"
+#include "kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+#if defined(__aarch64__)
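+// Cycle-estimate hook for kernels which should only be chosen as a last
+// resort: reporting the maximum possible cost keeps them at the bottom of
+// the ordering.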
+uint64_t not_preferred(const DepthwiseArgs &, const Requantize32 &)
+{
+ return std::numeric_limits<uint64_t>::max();
+}
+#endif // defined(__aarch64__)
+} // namespace
+
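+// Each entry pairs a method and kernel name with an availability constraint,
+// an optional cycle-estimate hook (nullptr selects the default estimate) and
+// a factory which constructs the strategy and wraps it in a DepthwiseCommon.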
+static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> depthwise_u8s8u8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_generic_output3x3_mla_depthfirst",
+ nullptr,
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto kernel = new a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<uint8_t, int8_t>(kernel, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint<Requantize32>(has_channel_multiplier),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto kern = new a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<uint8_t, int8_t>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, int8_t, uint8_t, int32_t, true>(strat, args, qp);
+ },
+ },
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+template <>
+const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> *depthwise_implementation_list()
+{
+  return depthwise_u8s8u8q_methods;
+}
+
+template UniqueDepthwiseCommon<uint8_t, int8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, int8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
new file mode 100644
index 0000000000..3de4bdc1fb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_a64_s8q_3x3_dot
+{
+ static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_a64_s8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+  // We store seven vectors for every vector's-worth of int32 channels.
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels * args.channel_multiplier,
+ get_vector_length<int32_t>(arm_gemm::VLType::None)), 4lu
+ );
+ return n * 7 * get_vector_length<int8_t>(arm_gemm::VLType::None);
+}
+
+void interleave_a64_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
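+  // Hand-written AArch64 assembly: interleave the three kernel rows into
+  // dot-product order, accumulate the per-channel weight sums used to fold
+  // the input zero-point into the bias, and store bias, weights and
+  // requantisation parameters for each block of four channels.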
+ __asm__ __volatile__(
+ "cmp %x[ld_weight_col], XZR\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "movi v16.4s, #0x9\n"
+ "movi v31.16b, #0x0\n"
+ "mov x21, #0x3\n"
+ "mul x21, %x[ld_weight_col], x21\n"
+ "add x20, %x[qp], %[offsetof_input_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_weights_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x21, NE\n"
+ "lsr x21, %x[n_channels], #0x2\n"
+ "movi v28.16b, #0x1\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
+ "add x25, %x[weights], %x[ld_weight_row]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_mul]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x24, x25, %x[ld_weight_row]\n"
+ "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "mov x22, #0x0\n"
+ "cbz x21, 4f\n"
+ "1:" // Loop
+ "movi v25.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q25, [%x[bias], x22]\n"
+ "2:" // Loop: Skip bias load
+ "ldr s19, [%x[weights], #0x0]\n"
+ "ldr s16, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 v17.16b, v16.16b, v31.16b\n"
+ "movi v21.4s, #0x0\n"
+ "ldr s16, [%x[weights], x23]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v20.16b, v16.16b, v17.16b\n"
+ "ldr s17, [x25, %x[ld_weight_col]]\n"
+ "ldr s16, [x25, x23]\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v31.16b\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s19, [x24, %x[ld_weight_col]]\n"
+ ".inst 0x4e949795 // sdot v21.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x24, x23]\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ ".inst 0x4e929795 // sdot v21.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ ".inst 0x4e909795 // sdot v21.4s, v28.16b, v16.16b\n"
+ "add %x[weights], %x[weights], #0x4\n"
+ "add x25, x25, #0x4\n"
+ "mls v25.4s, v21.4s, v30.4s\n"
+ "add x24, x24, #0x4\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ldr q27, [%x[rq_mul_perchannel], x22]\n"
+ "ldr q26, [%x[rq_shift_perchannel], x22]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "subs x21, x21, #0x1\n"
+ "str q27, [%x[outptr], #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "str q26, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "bgt 1b\n"
+ "tst %x[n_channels], #0x3\n"
+ "beq 13f\n"
+ "4:" // Oddments
+ "movi v25.4s, #0x0\n"
+ "cbz %x[bias], 7f\n"
+ "add %x[bias], %x[bias], x22\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v25.d }[0], [%x[bias]], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v25.s }[2], [%x[bias]], #0x4\n"
+ "b 6f\n"
+ "5:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v25.s }[0], [%x[bias]], #0x4\n"
+ "6:" // Oddments: Load bias: Bit 1: End
+ "7:" // Oddments: Skip bias load
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v17.h }[0], [%x[weights]]\n"
+ "ld1 { v24.h }[0], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.h }[0], [x21]\n"
+ "ld1 { v16.h }[0], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.h }[0], [x21]\n"
+ "ld1 { v18.h }[0], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.h }[0], [x24]\n"
+ "ld1 { v22.h }[0], [x21]\n"
+ "add %x[weights], %x[weights], #0x2\n"
+ "add x25, x25, #0x2\n"
+ "ld1 { v21.h }[0], [x20]\n"
+ "add x24, x24, #0x2\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v17.b }[2], [%x[weights]]\n"
+ "ld1 { v24.b }[2], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x21]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 9f\n"
+ "8:" // Oddments: Load weights: Bit 1: Unset
+ "ld1 { v17.b }[0], [%x[weights]]\n"
+ "ld1 { v24.b }[0], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.b }[0], [x21]\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v22.b }[0], [x21]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "9:" // Oddments: Load weights: Bit 1: End
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v31.16b\n"
+ "zip1 v20.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v24.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e949793 // sdot v19.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v23.16b, v21.16b\n"
+ ".inst 0x4e929793 // sdot v19.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v22.16b, v31.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ ".inst 0x4e909793 // sdot v19.4s, v28.16b, v16.16b\n"
+ "mls v25.4s, v19.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 12f\n"
+ "add x21, %x[rq_mul_perchannel], x22\n"
+ "add x20, %x[rq_shift_perchannel], x22\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v27.d }[0], [x21], #0x8\n"
+ "ld1 { v26.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "ld1 { v26.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
+ "12:" // Oddments: Quantisation parameters: Store
+ "str q27, [%x[outptr], #0x0]\n"
+ "str q26, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "13:" // End
+ : [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
new file mode 100644
index 0000000000..19264c9fce
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_a64_u8q_3x3_dot
+{
+ static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_a64_u8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+  // We store seven vectors for every vector's-worth of int32 channels.
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels * args.channel_multiplier,
+ get_vector_length<int32_t>(arm_gemm::VLType::None)), 4lu
+ );
+ return n * 7 * get_vector_length<uint8_t>(arm_gemm::VLType::None);
+}
+
+void interleave_a64_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
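+  // As in the s8q variant: interleave the kernel rows into dot-product order
+  // and fold the per-channel weight sums into the bias, here with unsigned
+  // (udot) arithmetic.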
+ __asm__ __volatile__(
+ "cmp %x[ld_weight_col], XZR\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "movi v16.4s, #0x9\n"
+ "movi v31.16b, #0x0\n"
+ "mov x21, #0x3\n"
+ "mul x21, %x[ld_weight_col], x21\n"
+ "add x20, %x[qp], %[offsetof_input_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_weights_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x21, NE\n"
+ "lsr x21, %x[n_channels], #0x2\n"
+ "movi v28.16b, #0x1\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
+ "add x25, %x[weights], %x[ld_weight_row]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_mul]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x24, x25, %x[ld_weight_row]\n"
+ "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "mov x22, #0x0\n"
+ "cbz x21, 4f\n"
+ "1:" // Loop
+ "movi v25.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q25, [%x[bias], x22]\n"
+ "2:" // Loop: Skip bias load
+ "ldr s19, [%x[weights], #0x0]\n"
+ "ldr s16, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 v17.16b, v16.16b, v31.16b\n"
+ "movi v21.4s, #0x0\n"
+ "ldr s16, [%x[weights], x23]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v20.16b, v16.16b, v17.16b\n"
+ "ldr s17, [x25, %x[ld_weight_col]]\n"
+ "ldr s16, [x25, x23]\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v31.16b\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s19, [x24, %x[ld_weight_col]]\n"
+ ".inst 0x6e949795 // udot v21.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x24, x23]\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ ".inst 0x6e929795 // udot v21.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ ".inst 0x6e909795 // udot v21.4s, v28.16b, v16.16b\n"
+ "add %x[weights], %x[weights], #0x4\n"
+ "add x25, x25, #0x4\n"
+ "mls v25.4s, v21.4s, v30.4s\n"
+ "add x24, x24, #0x4\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ldr q27, [%x[rq_mul_perchannel], x22]\n"
+ "ldr q26, [%x[rq_shift_perchannel], x22]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "subs x21, x21, #0x1\n"
+ "str q27, [%x[outptr], #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "str q26, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "bgt 1b\n"
+ "tst %x[n_channels], #0x3\n"
+ "beq 13f\n"
+ "4:" // Oddments
+ "movi v25.4s, #0x0\n"
+ "cbz %x[bias], 7f\n"
+ "add %x[bias], %x[bias], x22\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v25.d }[0], [%x[bias]], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v25.s }[2], [%x[bias]], #0x4\n"
+ "b 6f\n"
+ "5:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v25.s }[0], [%x[bias]], #0x4\n"
+ "6:" // Oddments: Load bias: Bit 1: End
+ "7:" // Oddments: Skip bias load
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v17.h }[0], [%x[weights]]\n"
+ "ld1 { v24.h }[0], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.h }[0], [x21]\n"
+ "ld1 { v16.h }[0], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.h }[0], [x21]\n"
+ "ld1 { v18.h }[0], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.h }[0], [x24]\n"
+ "ld1 { v22.h }[0], [x21]\n"
+ "add %x[weights], %x[weights], #0x2\n"
+ "add x25, x25, #0x2\n"
+ "ld1 { v21.h }[0], [x20]\n"
+ "add x24, x24, #0x2\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v17.b }[2], [%x[weights]]\n"
+ "ld1 { v24.b }[2], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x21]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 9f\n"
+ "8:" // Oddments: Load weights: Bit 1: Unset
+ "ld1 { v17.b }[0], [%x[weights]]\n"
+ "ld1 { v24.b }[0], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.b }[0], [x21]\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v22.b }[0], [x21]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "9:" // Oddments: Load weights: Bit 1: End
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v31.16b\n"
+ "zip1 v20.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v24.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e949793 // udot v19.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v23.16b, v21.16b\n"
+ ".inst 0x6e929793 // udot v19.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v22.16b, v31.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ ".inst 0x6e909793 // udot v19.4s, v28.16b, v16.16b\n"
+ "mls v25.4s, v19.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 12f\n"
+ "add x21, %x[rq_mul_perchannel], x22\n"
+ "add x20, %x[rq_shift_perchannel], x22\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v27.d }[0], [x21], #0x8\n"
+ "ld1 { v26.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "ld1 { v26.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
+ "12:" // Oddments: Quantisation parameters: Store
+ "str q27, [%x[outptr], #0x0]\n"
+ "str q26, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "13:" // End
+ : [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp
new file mode 100644
index 0000000000..dc505a013d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "generic.hpp"
+
+#include <functional>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+
+PackingArguments::PackingArguments(
+ unsigned int kernel_rows, unsigned int kernel_cols, size_t weight_element_size,
+ bool include_bias, size_t bias_element_size, bool premultiply,
+ arm_gemm::VLType vl_type, size_t accumulator_element_size, unsigned int accumulator_depth_vl,
+ std::function<bool(unsigned int, unsigned int &, unsigned int &)> get_weight_pos
+) : kernel_rows(kernel_rows), kernel_cols(kernel_cols), weight_element_size(weight_element_size),
+ include_bias(include_bias), bias_element_size(bias_element_size), premultiply(premultiply),
+ vl_type(vl_type), accumulator_element_size(accumulator_element_size), accumulator_depth_vl(accumulator_depth_vl),
+ get_weight_pos(get_weight_pos)
+{
+}
+
+size_t get_storage_size_generic(const PackingArguments &packing_args, const DepthwiseArgs &args)
+{
+ // If the channel multiplier is greater than one, then we treat this as a
+ // repeated packing of `channel_multiplier`-sized problems.
+ if (args.channel_multiplier > 1 && !packing_args.premultiply)
+ {
+ DepthwiseArgs args_per_input_channel(args);
+ args_per_input_channel.input_channels = args.channel_multiplier;
+ args_per_input_channel.channel_multiplier = 1;
+
+ return args.input_channels * get_storage_size_generic(packing_args, args_per_input_channel);
+ }
+
+ const unsigned int vl =
+ packing_args.accumulator_depth_vl *
+ arm_gemm::utils::get_vector_length<uint8_t>(packing_args.vl_type) / packing_args.accumulator_element_size;
+ const unsigned int n_packs = arm_gemm::iceildiv(args.input_channels * args.channel_multiplier, vl);
+ const auto pack_size = (packing_args.include_bias ? packing_args.bias_element_size : 0) +
+ packing_args.kernel_points() * packing_args.weight_element_size;
+ return n_packs * pack_size * vl;
+}
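+
+// Illustrative sizing under assumed values (not taken from the source): with
+// 16 input channels, channel_multiplier == 1, a 16-byte vector, 4-byte
+// accumulators and accumulator_depth_vl == 1, get_storage_size_generic gives
+// vl = 16 / 4 = 4, n_packs = ceildiv(16, 4) = 4, and hence
+// 4 * pack_size * 4 bytes, where pack_size is the optional bias element plus
+// kernel_points() weight elements.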
+
+void pack_parameters_generic(
+ const PackingArguments &packing_args,
+ const DepthwiseArgs &args,
+ void *buffer_raw,
+ const void *biases_raw,
+ const void *weights_raw,
+ size_t ld_weight_col,
+ size_t ld_weight_row
+)
+{
+ // Cast the pointers to byte sizes
+ auto *buffer = static_cast<uint8_t *>(buffer_raw);
+ auto *biases = static_cast<const uint8_t *>(biases_raw);
+ auto *weights = static_cast<const uint8_t *>(weights_raw);
+
+ // If the channel multiplier is greater than one, then we treat this as a
+ // repeated packing of `channel_multiplier`-sized problems.
+ if (args.channel_multiplier > 1 && !packing_args.premultiply)
+ {
+ // Get a modified copy of the depthwise arguments
+ DepthwiseArgs args_per_input_channel(args);
+ args_per_input_channel.input_channels = args.channel_multiplier;
+ args_per_input_channel.channel_multiplier = 1;
+
+ // Resolve the strides here
+ ld_weight_col = ld_weight_col ? ld_weight_col : args.input_channels * args.channel_multiplier;
+ ld_weight_row = ld_weight_row ? ld_weight_row : ld_weight_col * packing_args.kernel_cols;
+
+ auto per_input_channel_size = get_storage_size_generic(packing_args, args_per_input_channel);
+
+ for (unsigned int c = 0; c < args.input_channels; c++)
+ {
+ pack_parameters_generic(
+ packing_args, args_per_input_channel, buffer, biases, weights, ld_weight_col, ld_weight_row);
+
+ // Update the pointers
+ buffer += per_input_channel_size;
+ biases += (biases == nullptr) ? 0 : packing_args.bias_element_size * args.channel_multiplier;
+ weights += packing_args.weight_element_size * args.channel_multiplier;
+ }
+ return;
+ }
+
+ auto input_channels = args.input_channels * args.channel_multiplier;
+
+ // Finalise the weight strides
+ ld_weight_col = (ld_weight_col == 0) ? input_channels : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? packing_args.kernel_cols * ld_weight_col : ld_weight_row;
+
+ const unsigned int vl =
+ packing_args.accumulator_depth_vl *
+ arm_gemm::utils::get_vector_length<uint8_t>(packing_args.vl_type) / packing_args.accumulator_element_size;
+
+ for (unsigned int n = 0; n < input_channels; n += vl)
+ {
+ const unsigned int todo = std::min(vl, input_channels - n);
+
+ if (packing_args.include_bias)
+ {
+ if (biases != nullptr)
+ {
+ memcpy(buffer, biases, todo * packing_args.bias_element_size);
+ biases += todo * packing_args.bias_element_size;
+ }
+ else
+ {
+ memset(buffer, 0, vl * packing_args.bias_element_size);
+ }
+
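+      // The buffer always advances by a whole vector's worth of biases; the
+      // memset branch zeroes the full vector, while the memcpy branch writes
+      // only the `todo` live entries.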
+ buffer += vl * packing_args.bias_element_size;
+ }
+
+ // Copy each of the weights in turn
+ unsigned int kx, ky;
+ for (int kindex = 0; packing_args.get_weight_pos(kindex, kx, ky); kindex++)
+ {
+ const auto src_ptr = weights + (kx*ld_weight_row + ky*ld_weight_col + n) * packing_args.weight_element_size;
+ memcpy(buffer, src_ptr, todo * packing_args.weight_element_size);
+ buffer += vl * packing_args.weight_element_size;
+ }
+ }
+}
+
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
new file mode 100644
index 0000000000..1842f10150
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+#include "depthwise.hpp"
+
+#include <functional>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+
+struct PackingArguments
+{
+ const unsigned int kernel_rows;
+ const unsigned int kernel_cols;
+ const size_t weight_element_size;
+ const bool include_bias;
+ const size_t bias_element_size;
+ const bool premultiply;
+ arm_gemm::VLType vl_type;
+ const size_t accumulator_element_size;
+ const unsigned int accumulator_depth_vl;
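+  // Maps a linear index to a kernel (row, column) position; returns false
+  // once the index is exhausted.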
+ std::function<bool(unsigned int, unsigned int &, unsigned int &)> get_weight_pos;
+
+ unsigned int kernel_points(void) const { return kernel_cols * kernel_rows; }
+
+ PackingArguments(
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ size_t weight_element_size,
+ bool include_bias,
+ size_t bias_element_size,
+ bool premultiply,
+ arm_gemm::VLType vl_type,
+ size_t accumulator_element_size,
+ unsigned int accumulator_depth_vl,
+ std::function<bool(unsigned int, unsigned int &, unsigned int &)> get_weight_pos
+ );
+};
+
+size_t get_storage_size_generic(
+ const PackingArguments &packing_args,
+ const DepthwiseArgs &args
+);
+
+void pack_parameters_generic(
+ const PackingArguments &packing_args,
+ const DepthwiseArgs &args,
+ void *buffer_raw,
+ const void *biases_raw,
+ const void *weights_raw,
+ size_t ld_weight_col,
+ size_t ld_weight_row
+);
+
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp
new file mode 100644
index 0000000000..a6389054d1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "generic_quantized_dot_product.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+namespace quantized {
+
+size_t get_storage_size(
+ const DepthwiseArgs &args,
+ const arm_gemm::VLType vl_type,
+ const unsigned int accumulator_depth_vl
+)
+{
+  // We produce VL<int32_t> channels at a time; for each of these blocks of
+  // channels we store a vector of biases, the interleaved weights (see
+  // pack_parameters below) and the requantisation parameters.
+ const unsigned int iter_length = accumulator_depth_vl * arm_gemm::utils::get_vector_length<int32_t>(vl_type);
+ const unsigned int n_iters = args.input_channels * arm_gemm::iceildiv(args.channel_multiplier, iter_length);
+
+ // Compute the cost of storing the weights
+ const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(args.kernel_cols, 4u);
+
+ return n_iters * iter_length * (
+ sizeof(int32_t) + // Bias
+ 4 * n_dots_per_kernel_row * args.kernel_rows * sizeof(int8_t) + // Weights
+ 2 * sizeof(int32_t) // Requantisation parameters
+ );
+}
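+
+// Illustrative example: for a 3x3 kernel, n_dots_per_kernel_row =
+// ceildiv(3, 4) = 1, so get_storage_size charges each channel lane
+// 4 (bias) + 4 * 1 * 3 (zero-padded weight dot-groups) + 8 (requantisation
+// multiplier and shift) = 24 bytes.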
+
+template <typename T>
+void pack_parameters(
+ void *_buffer, const int32_t *biases,
+ const T *weights, size_t ld_weight_col, size_t ld_weight_row,
+ const DepthwiseArgs &args,
+ const arm_gemm::Requantize32 &qp,
+ const arm_gemm::VLType vl_type,
+ const unsigned int accumulator_depth_vl
+)
+{
+ auto buffer = static_cast<uint8_t *>(_buffer);
+ auto requant_muls = qp.per_channel_muls;
+ auto requant_shifts = qp.per_channel_right_shifts;
+
+ const unsigned int iter_length = accumulator_depth_vl * arm_gemm::utils::get_vector_length<int32_t>(vl_type);
+ const unsigned int n_iters_per_input_channel = arm_gemm::iceildiv(args.channel_multiplier, iter_length);
+ const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(args.kernel_cols, 4u);
+
+ const size_t iter_stride = iter_length * (
+ sizeof(int32_t) + // Bias
+ 4 * n_dots_per_kernel_row * args.kernel_rows * sizeof(T) + // Weights
+ 2 * sizeof(int32_t) // Requantisation parameters
+ );
+
+ ld_weight_col = (ld_weight_col == 0) ? args.input_channels * args.channel_multiplier : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? args.kernel_cols * ld_weight_col : ld_weight_row;
+
+ for (unsigned int input_channel = 0; input_channel < args.input_channels; input_channel++)
+ {
+ auto buffer_input_channel = buffer + input_channel * n_iters_per_input_channel * iter_stride;
+ auto weights_input_channel = weights + input_channel * args.channel_multiplier;
+
+ for (unsigned int iter = 0; iter < n_iters_per_input_channel; iter++)
+ {
+      // Get a pointer to the start of this portion of the buffer, then derive
+      // pointers to the bias, weight and requantisation portions of this
+      // frame.
+ auto buffer_base = buffer_input_channel + iter_stride * iter;
+ auto buffer_biases = reinterpret_cast<int32_t *>(buffer_base);
+ auto buffer_weights = buffer_base + sizeof(int32_t) * iter_length;
+ auto buffer_requant_mul = reinterpret_cast<int32_t *>(
+ buffer_weights + args.kernel_rows * n_dots_per_kernel_row * 4 * iter_length);
+ auto buffer_requant_shift = buffer_requant_mul + iter_length;
+ auto weights_base = weights_input_channel + iter * iter_length;
+
+      // Work through the data for this iteration on a channel-by-channel
+      // basis.
+ const auto this_iter_length = std::min<unsigned int>(
+ iter_length, args.channel_multiplier - iter * iter_length
+ );
+ for (unsigned int i = 0; i < this_iter_length; i++)
+ {
+ auto weights_channel = weights_base + i;
+
+        // Read the bias value; we modify it as we read the weights.
+ auto bias_value = biases == nullptr ? 0 : *(biases++);
+ int32_t elements_sum = 0;
+
+ // Read through the kernel; for each row, marshal together as many dot
+ // product terms as are required.
+ for (unsigned int ki = 0; ki < args.kernel_rows; ki++)
+ {
+ auto buffer_row = buffer_weights + i*4 + ki * 4 * n_dots_per_kernel_row * iter_length;
+ auto weights_row = weights_channel + ki * ld_weight_row;
+
+ unsigned int kj = 0;
+ for (; kj < args.kernel_cols; kj++)
+ {
+            // Determine the element to which we're writing
+ const auto dot = kj / 4;
+ const auto elem = kj % 4;
+
+ // Copy the value; include in the sum
+ const auto val = weights_row[kj * ld_weight_col];
+ buffer_row[dot * 4 * iter_length + elem] = val;
+ elements_sum += val;
+ }
+ for (; kj < 4 * n_dots_per_kernel_row; kj++)
+ {
+ const auto dot = kj / 4;
+ const auto elem = kj % 4;
+ buffer_row[dot * 4 * iter_length + elem] = 0;
+ }
+
+ buffer_row += 4 * n_dots_per_kernel_row * iter_length;
+ }
+
+ // Write back the bias and offset values
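+          // (these are the input-independent terms of
+          //  sum((w - b_offset) * (x - a_offset)): -a_offset * sum(w) plus
+          //  n_kernel_points * a_offset * b_offset)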
+ *(buffer_biases++) =
+ bias_value - qp.a_offset * elements_sum +
+ args.kernel_rows * args.kernel_cols * qp.a_offset * qp.b_offset;
+
+ // Write out the requantisation parameters
+ *(buffer_requant_mul++) = qp.per_channel_requant ? *(requant_muls++) : qp.per_layer_mul;
+ *(buffer_requant_shift++) = qp.per_channel_requant ? *(requant_shifts++) : qp.per_layer_right_shift;
+ }
+ }
+ }
+}
+
+template void pack_parameters(void *, const int32_t *, const int8_t *, size_t, size_t, const DepthwiseArgs &, const arm_gemm::Requantize32 &, arm_gemm::VLType, unsigned int);
+template void pack_parameters(void *, const int32_t *, const uint8_t *, size_t, size_t, const DepthwiseArgs &, const arm_gemm::Requantize32 &, arm_gemm::VLType, unsigned int);
+
+} // namespace quantized
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp
new file mode 100644
index 0000000000..779d67d3f4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "generic.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+namespace quantized {
+
+size_t get_storage_size(
+ const DepthwiseArgs &args,
+ arm_gemm::VLType vl_type,
+ unsigned int accumulator_depth_vl=1
+);
+
+template <typename T>
+void pack_parameters(
+ void *buffer, const int32_t *biases,
+ const T *weights, size_t ld_weight_col, size_t ld_weight_row,
+ const DepthwiseArgs &args,
+ const arm_gemm::Requantize32 &qp,
+ arm_gemm::VLType vl_type,
+ unsigned int accumulator_depth_vl
+);
+
+} // namespace quantized
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
new file mode 100644
index 0000000000..76f38eb335
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+struct interleave_sve_u8q_3x3_dot
+{
+ static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+struct interleave_sve_s8q_3x3_dot
+{
+ static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+
+struct interleave_a64_u8q_3x3_dot
+{
+ static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+struct interleave_a64_s8q_3x3_dot
+{
+ static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
new file mode 100644
index 0000000000..5d7b54f235
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_sve_s8q_3x3_dot
+{
+ static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_sve_s8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+  // We store seven vectors for every vector's-worth of int32 channels.
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels * args.channel_multiplier,
+ get_vector_length<int32_t>(arm_gemm::VLType::SVE)), 4lu
+ );
+ return n * 7 * get_vector_length<int8_t>(arm_gemm::VLType::SVE);
+}
+
+void interleave_sve_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
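+  // SVE variant of the dot-product interleave: predication (whilelt/cntp)
+  // handles the channel tail within the main loop, so no separate oddment
+  // path is needed as in the A64 versions.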
+ __asm__ __volatile__(
+ "cmp %x[ld_weight_col], XZR\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "mov z16.s, #0x9\n"
+ "mov z28.b, #0x0\n"
+ "mov x20, #0x3\n"
+ "ptrue p2.b\n"
+ "mul x20, %x[ld_weight_col], x20\n"
+ "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
+ "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x20, NE\n"
+ "mov z25.b, #0x1\n"
+ "mul z26.s, p2/M, z26.s, z27.s\n"
+ "add x24, %x[weights], %x[ld_weight_row]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+ "add x23, x24, %x[ld_weight_row]\n"
+ "add x22, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "whilelt p1.s, XZR, %x[n_channels]\n"
+ "mov x21, #0x0\n"
+ "mul z26.s, p2/M, z26.s, z16.s\n"
+ "pfalse p8.b\n"
+ "cbz %x[bias], 1f\n"
+ "ptrue p8.s\n"
+ "1:" // No bias
+ "2:" // Loop
+ "cntp x20, p2, p1.s\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z18.b }, p0/Z, [%x[weights]]\n"
+ "ld1b { z17.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], x22]\n"
+ "zip1 z20.b, z18.b, z16.b\n"
+ "zip1 z19.b, z17.b, z28.b\n"
+ "ld1b { z18.b }, p0/Z, [x24]\n"
+ "ld1b { z17.b }, p0/Z, [x24, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x24, x22]\n"
+ "zip1 z22.b, z20.b, z19.b\n"
+ "zip1 z21.b, z18.b, z16.b\n"
+ "zip1 z19.b, z17.b, z28.b\n"
+ "mov z20.s, #0x0\n"
+ "ld1b { z18.b }, p0/Z, [x23]\n"
+ "ld1b { z17.b }, p0/Z, [x23, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x23, x22]\n"
+ "sdot z20.s, z25.b, z22.b\n"
+ "zip1 z19.b, z21.b, z19.b\n"
+ "sdot z20.s, z25.b, z19.b\n"
+ "zip1 z18.b, z18.b, z16.b\n"
+ "zip1 z16.b, z17.b, z28.b\n"
+ "and p0.b, p2/Z, p8.b, p1.b\n"
+ "ld1w { z17.s }, p0/Z, [%x[bias], x21, LSL #2]\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "sdot z20.s, z25.b, z16.b\n"
+ "mls z17.s, p2/M, z20.s, z27.s\n"
+ "add %x[weights], %x[weights], x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add z17.s, z17.s, z26.s\n"
+ "st1w { z17.s }, p2, [%x[outptr]]\n"
+ "st1b { z22.b }, p2, [%x[outptr], #1, MUL VL]\n"
+ "st1b { z19.b }, p2, [%x[outptr], #2, MUL VL]\n"
+ "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #4\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ld1w { z24.s }, p1/Z, [%x[rq_mul_perchannel], x21, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [%x[rq_shift_perchannel], x21, LSL #2]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "incw x21\n"
+ "whilelt p1.s, x21, %x[n_channels]\n"
+ "st1w { z24.s }, p2, [%x[outptr]]\n"
+ "st1w { z23.s }, p2, [%x[outptr], #1, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #2\n"
+ "b.any 2b\n"
+ : [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "p0", "p1", "p2", "p8", "x20", "x21", "x22", "x23", "x24", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28"
+ );
+}
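+
+// Informal reading of the assembly above (a sketch, not authoritative): for
+// each int32-vector of channels it stores one vector of accumulator
+// corrections followed by three vectors of byte-interleaved 3x3 weights,
+// then, when per-channel requantisation is in use, the per-channel
+// multipliers and right shifts. The stored correction appears to be
+//   bias[c] - a_offset * sum_k(w_k[c]) + 9 * a_offset * b_offset
+// i.e. the input-independent terms of
+//   sum_k (w_k[c] - b_offset) * (x_k - a_offset)
+// over the nine taps; the x-dependent terms are applied at convolution time.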
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
new file mode 100644
index 0000000000..c3da81448b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_sve_u8q_3x3_dot
+{
+ static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_sve_u8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+ // We store 7 vectors for every vector-of-int32s' worth of channels.
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels * args.channel_multiplier,
+ get_vector_length<int32_t>(arm_gemm::VLType::SVE)), 4lu
+ );
+ return n * 7 * get_vector_length<uint8_t>(arm_gemm::VLType::SVE);
+}
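+
+// This matches the s8q variant above: get_vector_length<uint8_t> and
+// get_vector_length<int8_t> both count byte lanes, so the packed size is the
+// same for a given vector length.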
+
+void interleave_sve_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ "cmp %x[ld_weight_col], XZR\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "mov z16.s, #0x9\n"
+ "mov z28.b, #0x0\n"
+ "mov x20, #0x3\n"
+ "ptrue p2.b\n"
+ "mul x20, %x[ld_weight_col], x20\n"
+ "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
+ "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x20, NE\n"
+ "mov z25.b, #0x1\n"
+ "mul z26.s, p2/M, z26.s, z27.s\n"
+ "add x24, %x[weights], %x[ld_weight_row]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+ "add x23, x24, %x[ld_weight_row]\n"
+ "add x22, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "whilelt p1.s, XZR, %x[n_channels]\n"
+ "mov x21, #0x0\n"
+ "mul z26.s, p2/M, z26.s, z16.s\n"
+ "pfalse p8.b\n"
+ "cbz %x[bias], 1f\n"
+ "ptrue p8.s\n"
+ "1:" // No bias
+ "2:" // Loop
+ "cntp x20, p2, p1.s\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z18.b }, p0/Z, [%x[weights]]\n"
+ "ld1b { z17.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], x22]\n"
+ "zip1 z20.b, z18.b, z16.b\n"
+ "zip1 z19.b, z17.b, z28.b\n"
+ "ld1b { z18.b }, p0/Z, [x24]\n"
+ "ld1b { z17.b }, p0/Z, [x24, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x24, x22]\n"
+ "zip1 z22.b, z20.b, z19.b\n"
+ "zip1 z21.b, z18.b, z16.b\n"
+ "zip1 z19.b, z17.b, z28.b\n"
+ "mov z20.s, #0x0\n"
+ "ld1b { z18.b }, p0/Z, [x23]\n"
+ "ld1b { z17.b }, p0/Z, [x23, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x23, x22]\n"
+ "udot z20.s, z25.b, z22.b\n"
+ "zip1 z19.b, z21.b, z19.b\n"
+ "udot z20.s, z25.b, z19.b\n"
+ "zip1 z18.b, z18.b, z16.b\n"
+ "zip1 z16.b, z17.b, z28.b\n"
+ "and p0.b, p2/Z, p8.b, p1.b\n"
+ "ld1w { z17.s }, p0/Z, [%x[bias], x21, LSL #2]\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "udot z20.s, z25.b, z16.b\n"
+ "mls z17.s, p2/M, z20.s, z27.s\n"
+ "add %x[weights], %x[weights], x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add z17.s, z17.s, z26.s\n"
+ "st1w { z17.s }, p2, [%x[outptr]]\n"
+ "st1b { z22.b }, p2, [%x[outptr], #1, MUL VL]\n"
+ "st1b { z19.b }, p2, [%x[outptr], #2, MUL VL]\n"
+ "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #4\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ld1w { z24.s }, p1/Z, [%x[rq_mul_perchannel], x21, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [%x[rq_shift_perchannel], x21, LSL #2]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "incw x21\n"
+ "whilelt p1.s, x21, %x[n_channels]\n"
+ "st1w { z24.s }, p2, [%x[outptr]]\n"
+ "st1w { z23.s }, p2, [%x[outptr], #1, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #2\n"
+ "b.any 2b\n"
+ : [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "p0", "p1", "p2", "p8", "x20", "x21", "x22", "x23", "x24", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28"
+ );
+}
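+
+// This routine mirrors interleave_sve_s8q_3x3_dot::pack_parameters above;
+// the only substantive difference is summing the unsigned weights with UDOT
+// rather than SDOT when forming the accumulator correction.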
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6beaba841f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
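+
+// Two entry points are declared, following the convention used by these
+// depthfirst kernels: the "indirect" variant consumes a pre-gathered array
+// of input pointers, while the "direct" variant walks a dense NHWC tensor
+// itself from a base pointer plus row and column strides.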
+
+class a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
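+
+// For orientation (a derived note, not part of the interface): a 3x3
+// stride-1 kernel producing a 2x2 output tile reads a 4x4 input patch, i.e.
+// 16 input points per tile, which is why the matching indirect
+// implementation carries a 16-entry input pointer table.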
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..d8ca3d7437
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
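+ // Rough structure of the assembly below (an informal reading): an outer
+ // tile loop walks (tile_i, tile_j) row-major and derives input/output
+ // addresses from the row/column strides; an inner channel loop processes
+ // eight fp16 channels per iteration with FMLA against the nine 3x3
+ // weights and clamps against the activation min/max; a bit-tested tail
+ // then handles the remaining n_channels % 8 lanes.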
+ __asm__ __volatile__(
+ "mov x23, #0x0\n"
+ "mov x22, #0x0\n"
+ "1:" // Tile loop
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x2\n"
+ "mov x26, #0x2\n"
+ "str x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x23, x25\n" // offset = tile_i * ld_input_row
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x23, x24\n" // offset = tile_i * ld_output_row
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x22, x15, x21\n" // offset += tile_j * ld_input_col
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x15, x15, #0x1\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x22, x14, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x3\n"
+ "add x11, x15, x15\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x13, x13, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x9, x13, x25, LSL #1\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x28, x9, x25, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x27, x28, x25, LSL #1\n"
+ "add x26, x11, x15\n"
+ "add x25, x12, x24, LSL #1\n"
+ "lsl x14, x14, #0x1\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "add x10, x10, #0xa0\n"
+ "ldr q9, [x9, x15]\n"
+ "ld1 { v10.8h }, [x13]\n"
+ "ldr q11, [x13, x26]\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q13, [x28, x15]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "add x23, x23, #0x10\n"
+ "cmp x23, x22, LSL #4\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ld1 { v18.8h }, [x27]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ld1 { v17.8h }, [x9]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "ldr q4, [x10, #0x50]\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x28]\n"
+ "ldr q1, [x10, #0x20]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "ldr q0, [x10, #0x10]\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "ldr q2, [x10, #0x30]\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
+ "ldr q13, [x28, x15]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x27, x15]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "ldr q11, [x13, x26]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "ldr q9, [x9, x15]\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "ld1 { v10.8h }, [x13]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "ldr q8, [x10, #0x90]\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "add x27, x27, #0x10\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "st1 { v24.8h }, [x12]\n"
+ "add x10, x10, #0xa0\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q21, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ld1 { v18.8h }, [x27]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ld1 { v17.8h }, [x9]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x28]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x27, x15]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "st1 { v24.8h }, [x12]\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q21, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 57f\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "add x24, x9, x15\n"
+ "add x23, x13, XZR\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "add x22, x13, x26\n"
+ "add x21, x9, x11\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "add x20, x28, x15\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[6], [x24]\n"
+ "ld1 { v10.h }[6], [x23]\n"
+ "ld1 { v11.h }[6], [x22]\n"
+ "ld1 { v12.h }[6], [x21]\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[4], [x24]\n"
+ "ld1 { v10.h }[4], [x23]\n"
+ "ld1 { v11.h }[4], [x22]\n"
+ "ld1 { v12.h }[4], [x21]\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x24], #0x4\n"
+ "ldr s10, [x23], #0x4\n"
+ "ldr s11, [x22], #0x4\n"
+ "ldr s12, [x21], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x24]\n"
+ "ld1 { v10.h }[2], [x23]\n"
+ "ld1 { v11.h }[2], [x22]\n"
+ "ld1 { v12.h }[2], [x21]\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x24, #0x0]\n"
+ "ldr h10, [x23, #0x0]\n"
+ "ldr h11, [x22, #0x0]\n"
+ "ldr h12, [x21, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
+ "mov v28.16b, v25.16b\n fmla v28.8h, v4.8h, v9.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v3.8h, v9.8h\n"
+ "add x20, x27, XZR\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v30.8h, v6.8h, v9.8h\n"
+ "fmla v28.8h, v7.8h, v13.8h\n"
+ "add x20, x27, x26\n"
+ "fmla v29.8h, v6.8h, v13.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x13, x15\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "add x20, x13, x11\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v9.8h\n"
+ "add x20, x28, x11\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: End
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x20, x9, XZR\n"
+ "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "add x20, x9, x26\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "add x20, x28, XZR\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
+ "fmla v28.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v3.8h, v9.8h\n"
+ "add x20, x28, x26\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
+ "fmla v29.8h, v8.8h, v10.8h\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "add x20, x27, x15\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "fmla v30.8h, v7.8h, v11.8h\n"
+ "fmla v31.8h, v6.8h, v11.8h\n"
+ "add x20, x27, x11\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v31.8h, v7.8h, v12.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v26.8h\n"
+ "fmin v29.8h, v29.8h, v26.8h\n"
+ "fmin v30.8h, v30.8h, v26.8h\n"
+ "fmin v31.8h, v31.8h, v26.8h\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.d }[0], [x21], x14\n"
+ "st1 { v30.d }[0], [x20], x14\n"
+ "add x12, x12, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[2], [x21], x14\n"
+ "st1 { v30.s }[2], [x20], x14\n"
+ "add x12, x12, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[6], [x21], x14\n"
+ "st1 { v30.h }[6], [x20], x14\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[4], [x21], x14\n"
+ "st1 { v30.h }[4], [x20], x14\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[0], [x21], x14\n"
+ "st1 { v30.s }[0], [x20], x14\n"
+ "add x12, x12, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[2], [x21], x14\n"
+ "st1 { v30.h }[2], [x20], x14\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[0], [x21], x14\n"
+ "st1 { v30.h }[0], [x20], x14\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "56:" // Tile loop: Oddments: Store: Bit 2: End
+ "57:" // Tile loop: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x22, x22, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x22, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x22, x22, XZR, LT\n"
+ "cmp x23, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..c9a554e9ad
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,697 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[16];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+ }
+ };
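+
+// The constructor above reorders the caller's 16 input pointers (the 4x4
+// receptive field of a 2x2 output tile) into the order the assembly consumes
+// them; notably inptrs[0] takes input_ptrs[5], the (1, 1) point that feeds
+// all four accumulators, so it can be loaded before the loop body.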
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x16, #0x10\n" // cntb _, ALL, #1
+ "lsr x15, %x[n_channels], #0x3\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "mov x28, #0x0\n"
+ "sub x27, XZR, x16\n"
+ "cbz x15, 3f\n"
+ "ldr q25, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x16, x15, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr q25, [x14, #0x0]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "ldr x22, [x13, #0x58]\n"
+ "ldr x21, [x13, #0x60]\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x22, x28]\n"
+ "ldr x26, [x13, #0x70]\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr x25, [x13, #0x78]\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ldr q19, [x21, x28]\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x16]\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x26, x28]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x25, x28]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "ldr q11, [x22, x16]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "ldr q9, [x24, x16]\n"
+ "ldr q10, [x23, x16]\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "ldr q12, [x21, x16]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "add x16, x16, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "cmp x16, x15, LSL #4\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "add x28, x28, #0x10\n"
+ "str q24, [x12, x27]\n"
+ "add x14, x14, #0xa0\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x21, [x13, #0x50]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "ldr x23, [x13, #0x60]\n"
+ "ldr x22, [x13, #0x68]\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ldr q17, [x21, x28]\n"
+ "ldr x21, [x13, #0x70]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ldr q19, [x23, x28]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x22, x28]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x21, x28]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "add x28, x28, #0x10\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "str q24, [x12, x27]\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 56f\n"
+ "ldr q25, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "ldr x24, [x13, #0x0]\n"
+ "ldr x23, [x13, #0x8]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ "ldr x22, [x13, #0x10]\n"
+ "ldr x21, [x13, #0x18]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ "ldr x20, [x13, #0x20]\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[6], [x24], #0x2\n"
+ "ld1 { v10.h }[6], [x23], #0x2\n"
+ "ld1 { v11.h }[6], [x22], #0x2\n"
+ "ld1 { v12.h }[6], [x21], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[4], [x24], #0x2\n"
+ "ld1 { v10.h }[4], [x23], #0x2\n"
+ "ld1 { v11.h }[4], [x22], #0x2\n"
+ "ld1 { v12.h }[4], [x21], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x22], #0x2\n"
+ "ld1 { v12.h }[2], [x21], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x22], #0x2\n"
+ "ld1 { v12.h }[0], [x21], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
+ "mov v28.16b, v25.16b\n fmla v28.8h, v4.8h, v9.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v3.8h, v9.8h\n"
+ "ldr x20, [x13, #0x28]\n"
+ "add x20, x20, x28\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (3, 0): Bit 2: End
+ "fmla v30.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x13, #0x30]\n"
+ "fmla v28.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x28\n"
+ "fmla v29.8h, v6.8h, v13.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr x20, [x13, #0x38]\n"
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (0, 1): Bit 2: End
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (0, 2): Bit 2: End
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v9.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (2, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (2, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (2, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (2, 2): Bit 2: End
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x28\n"
+ "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (1, 0): Bit 2: End
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x13, #0x60]\n"
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (2, 0): Bit 2: End
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v28.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v3.8h, v9.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (2, 3): Bit 2: End
+ "ldr x20, [x13, #0x70]\n"
+ "fmla v29.8h, v8.8h, v10.8h\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v30.8h, v7.8h, v11.8h\n"
+ "fmla v31.8h, v6.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (3, 2): Bit 2: End
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v31.8h, v7.8h, v12.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v26.8h\n"
+ "fmin v29.8h, v29.8h, v26.8h\n"
+ "fmin v30.8h, v30.8h, v26.8h\n"
+ "fmin v31.8h, v31.8h, v26.8h\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "st1 { v28.h }[6], [x12], #0x2\n"
+ "st1 { v29.h }[6], [x11], #0x2\n"
+ "st1 { v30.h }[6], [x10], #0x2\n"
+ "st1 { v31.h }[6], [x9], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "st1 { v28.h }[4], [x12], #0x2\n"
+ "st1 { v29.h }[4], [x11], #0x2\n"
+ "st1 { v30.h }[4], [x10], #0x2\n"
+ "st1 { v31.h }[4], [x9], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "st1 { v28.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x11], #0x2\n"
+ "st1 { v30.h }[2], [x10], #0x2\n"
+ "st1 { v31.h }[2], [x9], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "st1 { v28.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x11], #0x2\n"
+ "st1 { v30.h }[0], [x10], #0x2\n"
+ "st1 { v31.h }[0], [x9], #0x2\n"
+ "55:" // Oddments: Store: Bit 2: End
+ "56:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
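
The "Oddments" ladders above all follow a single pattern: after the 8-lane main loop, the residual channel count (0-7) is consumed bitwise, with tbz dispatching on bits 2, 1 and 0 of n_channels to move four, two and one fp16 lanes respectively. A minimal standalone C++ sketch of the same decomposition, assuming the usual arm_neon.h fp16 support (the helper name and staging buffer are illustrative, not part of this patch):

    #include <arm_neon.h>
    #include <cstring>

    // Load an n-element tail (n in [0, 8)) into a zero-padded fp16 vector.
    // Bit 2 of n moves four halves, bit 1 two, bit 0 one -- the same
    // decomposition as the tbz/ld1 ladders in the kernel above.
    static float16x8_t load_tail_f16(const __fp16 *src, unsigned int n)
    {
        __fp16 buf[8] = {}; // unused lanes stay zero
        unsigned int i = 0;
        if (n & 4) { std::memcpy(buf + i, src + i, 4 * sizeof(__fp16)); i += 4; }
        if (n & 2) { std::memcpy(buf + i, src + i, 2 * sizeof(__fp16)); i += 2; }
        if (n & 1) { buf[i] = src[i]; }
        return vld1q_f16(buf);
    }

The assembly avoids the staging buffer by loading straight into vector lanes (ld1 { v9.h }[k]), which is why every bit combination needs its own branch target: NEON lane indices must be immediates.
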
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6bbd3508cb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
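
The header above is a thin registration shim: compile-time tile geometry (3x3 kernel, stride 1, 3x3 output block) plus pointers to the two assembly entry points, between which the depthwise framework chooses per workload. A hedged usage sketch of the direct entry point via the strategy class, assuming Parent::DirectKernelType matches the free-function signature declared above and noting that this particular constructor ignores its CPUInfo pointer (the wrapper and its argument values are illustrative, not library API):

    // Run a single 3x3 output tile. Strides are in elements (the kernel
    // scales by sizeof(__fp16) itself); packed_params points at the
    // weight/bias blob the kernel expects; the min/max below are the fp16
    // finite extremes, i.e. effectively no activation clamping.
    void run_one_tile(const __fp16 *in, int64_t ld_in_row, int64_t ld_in_col,
                      __fp16 *out, int64_t ld_out_row, int64_t ld_out_col,
                      const void *packed_params, unsigned int n_channels)
    {
        arm_conv::depthwise::a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst strat(nullptr);
        auto kernel = strat.get_direct_kernel();
        kernel(1, 1, in, ld_in_row, ld_in_col,
               out, ld_out_row, ld_out_col,
               packed_params, n_channels,
               static_cast<__fp16>(-65504.0f), static_cast<__fp16>(65504.0f));
    }
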
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..4e64a2bf2b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1158 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x24, #0x0\n"
+ "mov x23, #0x0\n"
+ "1:" // Tile loop
+ "str x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x3\n"
+ "mov x26, #0x3\n"
+ "str x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x24, x25\n" // offset = tile_i * ld_input_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x24, x22\n" // offset = tile_i * ld_output_row
+ "mov x24, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x23, x8, x21\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x8, x8, #0x1\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x23, x17, x20\n" // offset += tile_j * ld_output_col
+ "lsl x17, x17, #0x1\n"
+ "lsr x23, %x[n_channels], #0x3\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x16, x16, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x13, x16, x25, LSL #1\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x12, x13, x25, LSL #1\n"
+ "add x11, x8, x8\n"
+ "add x15, x15, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x10, x12, x25, LSL #1\n"
+ "add x9, x11, x8\n"
+ "add x28, x15, x22, LSL #1\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x27, x10, x25, LSL #1\n"
+ "add x26, x9, x8\n"
+ "add x25, x28, x22, LSL #1\n"
+ "add x22, x17, x17\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x24\n"
+ "cbz x23, 4f\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x24, x23, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldr q9, [x12, x11]\n"
+ "ld1 { v10.8h }, [x16]\n"
+ "ldr q11, [x16, x26]\n"
+ "ld1 { v12.8h }, [x27]\n"
+ "ldr q13, [x13, x11]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "add x24, x24, #0x10\n"
+ "cmp x24, x23, LSL #4\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v1.8h, v9.8h\n"
+ "ldr q31, [x14, #0x0]\n"
+ "fmla v29.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "fmla v20.8h, v0.8h, v18.8h\n"
+ "fmla v26.8h, v4.8h, v18.8h\n"
+ "fmla v25.8h, v3.8h, v18.8h\n"
+ "fmla v22.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x13]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ld1 { v18.8h }, [x10]\n"
+ "fmla v24.8h, v4.8h, v23.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v29.8h, v8.8h, v23.8h\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v25.8h, v5.8h, v23.8h\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.8h, v0.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v18.8h\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v4.8h, v17.8h\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v19.8h\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "fmla v24.8h, v6.8h, v17.8h\n"
+ "fmla v21.8h, v5.8h, v19.8h\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v8.8h, v17.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.8h, v8.8h, v17.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
+ "add x16, x16, #0x10\n"
+ "ld1 { v10.8h }, [x16]\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "ldr q4, [x14, #0x50]\n"
+ "fmla v26.8h, v7.8h, v17.8h\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.8h, v7.8h, v19.8h\n"
+ "add x12, x12, #0x10\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v20.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v0.8h, v18.8h\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v21.8h, v2.8h, v17.8h\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v25.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v28.8h, v6.8h, v18.8h\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "ldr q3, [x14, #0x40]\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "ldr q11, [x16, x26]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "ldr q7, [x14, #0x80]\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "ldr q13, [x13, x11]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "add x27, x27, #0x10\n"
+ "ld1 { v12.8h }, [x27]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "add x14, x14, #0xa0\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "st1 { v28.8h }, [x15]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q27, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "st1 { v26.8h }, [x28]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "fmla v20.8h, v0.8h, v18.8h\n"
+ "fmla v26.8h, v4.8h, v18.8h\n"
+ "fmla v25.8h, v3.8h, v18.8h\n"
+ "fmla v22.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x13]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ld1 { v18.8h }, [x10]\n"
+ "fmla v24.8h, v4.8h, v23.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v29.8h, v8.8h, v23.8h\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v25.8h, v5.8h, v23.8h\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.8h, v0.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v18.8h\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v4.8h, v17.8h\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v19.8h\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "fmla v24.8h, v6.8h, v17.8h\n"
+ "fmla v21.8h, v5.8h, v19.8h\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v8.8h, v17.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.8h, v8.8h, v17.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v26.8h, v7.8h, v17.8h\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.8h, v7.8h, v19.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmla v20.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v0.8h, v18.8h\n"
+ "add x12, x12, #0x10\n"
+ "fmla v21.8h, v2.8h, v17.8h\n"
+ "fmla v25.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmla v28.8h, v6.8h, v18.8h\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "add x27, x27, #0x10\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "st1 { v28.8h }, [x15]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q27, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "st1 { v26.8h }, [x28]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 93f\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "add x24, x12, x11\n"
+ "add x23, x16, XZR\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x22, x16, x26\n"
+ "add x21, x27, XZR\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x20, x13, x11\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[6], [x24]\n"
+ "ld1 { v10.h }[6], [x23]\n"
+ "ld1 { v11.h }[6], [x22]\n"
+ "ld1 { v12.h }[6], [x21]\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[4], [x24]\n"
+ "ld1 { v10.h }[4], [x23]\n"
+ "ld1 { v11.h }[4], [x22]\n"
+ "ld1 { v12.h }[4], [x21]\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x24], #0x4\n"
+ "ldr s10, [x23], #0x4\n"
+ "ldr s11, [x22], #0x4\n"
+ "ldr s12, [x21], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x24]\n"
+ "ld1 { v10.h }[2], [x23]\n"
+ "ld1 { v11.h }[2], [x22]\n"
+ "ld1 { v12.h }[2], [x21]\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x24, #0x0]\n"
+ "ldr h10, [x23, #0x0]\n"
+ "ldr h11, [x22, #0x0]\n"
+ "ldr h12, [x21, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: End
+ "mov v23.16b, v31.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v6.8h, v9.8h\n"
+ "add x20, x27, x26\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v7.8h, v9.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "fmla v23.8h, v5.8h, v13.8h\n"
+ "fmla v24.8h, v4.8h, v13.8h\n"
+ "fmla v25.8h, v3.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
+ "fmla v31.8h, v8.8h, v12.8h\n"
+ "add x20, x12, x8\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "add x20, x16, x8\n"
+ "fmla v26.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
+ "fmla v23.8h, v1.8h, v13.8h\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "add x20, x16, x9\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: End
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "add x20, x12, x9\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
+ "fmla v24.8h, v8.8h, v10.8h\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "add x20, x13, XZR\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v10.8h\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
+ "fmla v23.8h, v3.8h, v11.8h\n"
+ "fmla v26.8h, v0.8h, v11.8h\n"
+ "add x20, x13, x26\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
+ "fmla v25.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "add x20, x10, XZR\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v26.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "add x20, x10, x11\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "add x20, x10, x26\n"
+ "fmla v28.8h, v6.8h, v10.8h\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v3.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "add x20, x27, x8\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "add x20, x13, x8\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: End
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "add x20, x13, x9\n"
+ "fmla v26.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 58f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 60f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 60f\n"
+ "58:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v24.8h, v5.8h, v11.8h\n"
+ "fmla v25.8h, v4.8h, v11.8h\n"
+ "add x20, x27, x9\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 62f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 64f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 64f\n"
+ "62:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
+ "fmla v30.8h, v8.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "add x20, x10, x8\n"
+ "tbz %x[n_channels], #2, 66f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 68f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 68f\n"
+ "66:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "fmla v26.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "add x20, x16, x11\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 70f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 72f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 72f\n"
+ "70:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 71f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "72:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "add x20, x10, x9\n"
+ "fmla v25.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 74f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 73f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 76f\n"
+ "73:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 76f\n"
+ "74:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 75f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 76f\n"
+ "75:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "76:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "fmla v28.8h, v7.8h, v13.8h\n"
+ "add x20, x12, XZR\n"
+ "fmla v30.8h, v5.8h, v13.8h\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 78f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 77f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 80f\n"
+ "77:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 80f\n"
+ "78:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 79f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 80f\n"
+ "79:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "80:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "fmla v26.8h, v3.8h, v12.8h\n"
+ "add x20, x12, x26\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 82f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 81f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 84f\n"
+ "81:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 84f\n"
+ "82:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 83f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 84f\n"
+ "83:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "84:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v11.8h\n"
+ "add x20, x27, x11\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 86f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 85f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 88f\n"
+ "85:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 88f\n"
+ "86:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 87f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 88f\n"
+ "87:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "88:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
+ "fmla v29.8h, v8.8h, v13.8h\n"
+ "fmla v30.8h, v7.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmla v31.8h, v6.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "tbz %x[n_channels], #2, 90f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.d }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.d }[0], [x21], x17\n"
+ "add x15, x15, #0x8\n"
+ "st1 { v29.d }[0], [x20], x17\n"
+ "add x28, x28, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v24.d }[0], [x22], x17\n"
+ "st1 { v27.d }[0], [x21], x17\n"
+ "st1 { v30.d }[0], [x20], x17\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "st1 { v28.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 89f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[2], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[2], [x21], x17\n"
+ "add x15, x15, #0x4\n"
+ "st1 { v29.s }[2], [x20], x17\n"
+ "add x28, x28, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v24.s }[2], [x22], x17\n"
+ "st1 { v27.s }[2], [x21], x17\n"
+ "st1 { v30.s }[2], [x20], x17\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[6], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[6], [x21], x17\n"
+ "st1 { v29.h }[6], [x20], x17\n"
+ "st1 { v24.h }[6], [x22], x17\n"
+ "st1 { v27.h }[6], [x21], x17\n"
+ "st1 { v30.h }[6], [x20], x17\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "st1 { v28.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 92f\n"
+ "89:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 92f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[4], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[4], [x21], x17\n"
+ "st1 { v29.h }[4], [x20], x17\n"
+ "st1 { v24.h }[4], [x22], x17\n"
+ "st1 { v27.h }[4], [x21], x17\n"
+ "st1 { v30.h }[4], [x20], x17\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "st1 { v28.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 92f\n"
+ "90:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 91f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[0], [x21], x17\n"
+ "add x15, x15, #0x4\n"
+ "st1 { v29.s }[0], [x20], x17\n"
+ "add x28, x28, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v24.s }[0], [x22], x17\n"
+ "st1 { v27.s }[0], [x21], x17\n"
+ "st1 { v30.s }[0], [x20], x17\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "st1 { v28.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[2], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[2], [x21], x17\n"
+ "st1 { v29.h }[2], [x20], x17\n"
+ "st1 { v24.h }[2], [x22], x17\n"
+ "st1 { v27.h }[2], [x21], x17\n"
+ "st1 { v30.h }[2], [x20], x17\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "st1 { v28.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 92f\n"
+ "91:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[0], [x21], x17\n"
+ "st1 { v29.h }[0], [x20], x17\n"
+ "st1 { v24.h }[0], [x22], x17\n"
+ "st1 { v27.h }[0], [x21], x17\n"
+ "st1 { v30.h }[0], [x20], x17\n"
+ "st1 { v25.h }[0], [x22]\n"
+ "st1 { v28.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "92:" // Tile loop: Oddments: Store: Bit 2: End
+ "93:" // Tile loop: End
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x23, x23, #0x1\n"
+ "add x21, x24, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x23, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x24, x24, x21, LT\n"
+ "csel x23, x23, XZR, LT\n"
+ "cmp x24, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
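
The tile-walk epilogue at label 93 above is easier to read in C: tile_i and tile_j live in the Args struct so the assembly can reload them on every iteration, and the csel pair implements a carry from the column index into the row index. An illustrative restatement (process_tile stands in for the body at labels 1-92 and is not a real symbol); like the assembly, it runs the body at least once:

    #include <cstdint>

    static void process_tile(uint64_t /*tile_i*/, uint64_t /*tile_j*/)
    {
        // stand-in for the per-tile work at labels 1..92
    }

    static void walk_tiles(uint64_t n_tile_rows, uint64_t n_tile_cols)
    {
        uint64_t tile_i = 0, tile_j = 0;
        do
        {
            process_tile(tile_i, tile_j);
            if (++tile_j == n_tile_cols) // cmp x23, x20 / csel x23, ..., XZR
            {
                tile_j = 0;
                ++tile_i;                // csel x24, x24, x21, LT
            }
        } while (tile_i < n_tile_rows);  // cmp x24, x20 / blt 1b
    }
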
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..72e68482c6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1291 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "mov x7, #0x10\n" // cntb _, ALL, #1
+ "lsr x8, %x[n_channels], #0x3\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x14, #0x0\n"
+ "sub x13, XZR, x7\n"
+ "cbz x8, 3f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x7, x8, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr q10, [x20, x14]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x15, #0x30]\n"
+ "ldr x23, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x26, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v6.8h, v17.8h\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "ldr q18, [x23, x14]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
+ "ldr q31, [x16, #0x0]\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "fmla v28.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla v26.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr x21, [x15, #0x80]\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.8h, v1.8h, v18.8h\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v26.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "fmla v27.8h, v7.8h, v19.8h\n"
+ "fmla v22.8h, v1.8h, v19.8h\n"
+ "ldr q19, [x23, x14]\n"
+ "fmla v23.8h, v3.8h, v16.8h\n"
+ "ldr x24, [x15, #0xa8]\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v25.8h, v7.8h, v19.8h\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v29.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.8h, v5.8h, v20.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v23.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla v26.8h, v8.8h, v19.8h\n"
+ "fmla v24.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v28.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v28.8h, v5.8h, v17.8h\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v2.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr x20, [x15, #0x20]\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x24, x14]\n"
+ "fmla v29.8h, v2.8h, v17.8h\n"
+ "fmla v26.8h, v7.8h, v18.8h\n"
+ "fmla v25.8h, v6.8h, v18.8h\n"
+ "fmla v23.8h, v4.8h, v18.8h\n"
+ "fmla v21.8h, v3.8h, v18.8h\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v22.8h, v4.8h, v16.8h\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v27.8h, v0.8h, v17.8h\n"
+ "ldr q17, [x21, x14]\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.8h, v0.8h, v18.8h\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v22.8h, v2.8h, v17.8h\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v22.8h, v6.8h, v16.8h\n"
+ "ldr q13, [x20, x7]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "ldr x24, [x17, #0x20]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x7]\n"
+ "ldr q10, [x20, x7]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x7]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "ldr q12, [x20, x7]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q28, [x9, x13]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "str q27, [x28, x13]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "str q26, [x27, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x7, x7, #0x10\n"
+ "str q25, [x24, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "cmp x7, x8, LSL #4\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "add x14, x14, #0x10\n"
+ "str q24, [x23, x13]\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "str q23, [x22, x13]\n"
+ "add x16, x16, #0xa0\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
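+ // Channel tail: the last full vector of eight channels; identical arithmetic, but no next-iteration preload.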
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v7.8h, v9.8h\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x23, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v6.8h, v17.8h\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v28.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v26.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.8h, v1.8h, v18.8h\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla v26.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "fmla v27.8h, v7.8h, v19.8h\n"
+ "fmla v22.8h, v1.8h, v19.8h\n"
+ "ldr q19, [x22, x14]\n"
+ "fmla v23.8h, v3.8h, v16.8h\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v25.8h, v7.8h, v19.8h\n"
+ "ldr x22, [x15, #0xc0]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v29.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.8h, v5.8h, v20.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v23.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v26.8h, v8.8h, v19.8h\n"
+ "fmla v24.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v28.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v28.8h, v5.8h, v17.8h\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v2.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v29.8h, v2.8h, v17.8h\n"
+ "fmla v26.8h, v7.8h, v18.8h\n"
+ "fmla v25.8h, v6.8h, v18.8h\n"
+ "fmla v23.8h, v4.8h, v18.8h\n"
+ "fmla v21.8h, v3.8h, v18.8h\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v22.8h, v4.8h, v16.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v17.8h\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v2.8h, v17.8h\n"
+ "ldr x20, [x17, #0x20]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "str q28, [x9, x13]\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "ldr x23, [x17, #0x28]\n"
+ "fmla v22.8h, v6.8h, v16.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "str q27, [x28, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q26, [x27, x13]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q25, [x20, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "str q24, [x23, x13]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "str q23, [x22, x13]\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
+ "3:" // Oddments
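+ // Tail: handle the remaining n_channels % 8 lanes with element-wise loads and stores, steered by tbz on each bit of the count.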
+ "tst %x[n_channels], #0x7\n"
+ "beq 92f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x13, x14\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "ldr x24, [x15, #0x0]\n"
+ "ldr x23, [x15, #0x8]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "ldr x22, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x18]\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "ldr x20, [x15, #0x20]\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[6], [x24], #0x2\n"
+ "ld1 { v10.h }[6], [x23], #0x2\n"
+ "ld1 { v11.h }[6], [x22], #0x2\n"
+ "ld1 { v12.h }[6], [x21], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[4], [x24], #0x2\n"
+ "ld1 { v10.h }[4], [x23], #0x2\n"
+ "ld1 { v11.h }[4], [x22], #0x2\n"
+ "ld1 { v12.h }[4], [x21], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x22], #0x2\n"
+ "ld1 { v12.h }[2], [x21], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x22], #0x2\n"
+ "ld1 { v12.h }[0], [x21], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: End
+ "mov v23.16b, v31.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "add x20, x20, x14\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v7.8h, v9.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "fmla v23.8h, v5.8h, v13.8h\n"
+ "fmla v24.8h, v4.8h, v13.8h\n"
+ "fmla v25.8h, v3.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (4, 4): Bit 2: End
+ "ldr x20, [x15, #0x30]\n"
+ "fmla v31.8h, v8.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (2, 1): Bit 2: End
+ "ldr x20, [x15, #0x38]\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v26.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (0, 1): Bit 2: End
+ "ldr x20, [x15, #0x40]\n"
+ "fmla v23.8h, v1.8h, v13.8h\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (0, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (0, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (0, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (0, 3): Bit 2: End
+ "ldr x20, [x15, #0x48]\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (2, 3): Bit 2: End
+ "ldr x20, [x15, #0x50]\n"
+ "fmla v24.8h, v8.8h, v10.8h\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v10.8h\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (1, 0): Bit 2: End
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v23.8h, v3.8h, v11.8h\n"
+ "fmla v26.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (1, 4): Bit 2: End
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v25.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (3, 0): Bit 2: End
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v26.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (3, 2): Bit 2: End
+ "ldr x20, [x15, #0x70]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v28.8h, v6.8h, v10.8h\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v3.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (3, 4): Bit 2: End
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (4, 1): Bit 2: End
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Load input (1, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Load input (1, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (1, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "55:" // Oddments: Load input (1, 1): Bit 2: End
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v26.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 57f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 59f\n"
+ "56:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 59f\n"
+ "57:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "59:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x15, #0x90]\n"
+ "fmla v24.8h, v5.8h, v11.8h\n"
+ "fmla v25.8h, v4.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 61f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 63f\n"
+ "60:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 63f\n"
+ "61:" // Oddments: Load input (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "63:" // Oddments: Load input (4, 3): Bit 2: End
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v30.8h, v8.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 65f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 67f\n"
+ "64:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 67f\n"
+ "65:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "67:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla v26.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 69f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 71f\n"
+ "68:" // Oddments: Load input (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 71f\n"
+ "69:" // Oddments: Load input (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 70f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 71f\n"
+ "70:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "71:" // Oddments: Load input (0, 2): Bit 2: End
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v25.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 73f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 72f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 75f\n"
+ "72:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 75f\n"
+ "73:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 74f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 75f\n"
+ "74:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "75:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "fmla v28.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v30.8h, v5.8h, v13.8h\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 77f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 76f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 79f\n"
+ "76:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 79f\n"
+ "77:" // Oddments: Load input (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 78f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 79f\n"
+ "78:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "79:" // Oddments: Load input (2, 0): Bit 2: End
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "fmla v26.8h, v3.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 81f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 80f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 83f\n"
+ "80:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 83f\n"
+ "81:" // Oddments: Load input (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 82f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 83f\n"
+ "82:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "83:" // Oddments: Load input (2, 4): Bit 2: End
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 85f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 84f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 87f\n"
+ "84:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 87f\n"
+ "85:" // Oddments: Load input (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 86f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 87f\n"
+ "86:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "87:" // Oddments: Load input (4, 2): Bit 2: End
+ "fmla v29.8h, v8.8h, v13.8h\n"
+ "fmla v30.8h, v7.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmla v31.8h, v6.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "tbz %x[n_channels], #2, 89f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 88f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x4\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[6], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[6], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 91f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[4], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[4], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 90f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x4\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[0], [x23]\n"
+ "st1 { v25.h }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "91:" // Oddments: Store: Bit 2: End
+ "92:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..04fb532937
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
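+// Raw kernel entry points: the indirect variant reads its input through an
+// array of row pointers, while the direct variant walks a dense NHWC tile
+// using the supplied row/column strides; both clamp their results to
+// [activation_min, activation_max].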
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..a1e1dd0e99
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1736 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
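+ // Gather every kernel argument into a single struct so the inline assembly
+ // below can reach each field through an offsetof()-derived immediate.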
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
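+ // Per-tile prologue: convert (tile_i, tile_j) into byte offsets into the input and output tensors, then derive the per-row pointers.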
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x23, #0x4\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x27, x22\n" // offset = tile_i * ld_output_row
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x26, x4, x21\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x4, x4, #0x1\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x26, x5, x20\n" // offset += tile_j * ld_output_col
+ "lsl x5, x5, #0x1\n"
+ "add x17, x4, x4\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x7, x7, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x15, x7, x24, LSL #1\n"
+ "mul x20, x20, x23\n" // offset *= output_tile_size
+ "add x14, x15, x24, LSL #1\n"
+ "add x8, x8, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "lsr x13, %x[n_channels], #0x3\n"
+ "add x12, x14, x24, LSL #1\n"
+ "add x11, x17, x4\n"
+ "add x10, x8, x22, LSL #1\n"
+ "add x9, x12, x24, LSL #1\n"
+ "add x28, x11, x4\n"
+ "add x27, x10, x22, LSL #1\n"
+ "add x23, x5, x5\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x26, x9, x24, LSL #1\n"
+ "add x25, x28, x4\n"
+ "add x24, x27, x22, LSL #1\n"
+ "add x22, x23, x5\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x6\n"
+ "cbz x13, 4f\n"
+ "ldr q14, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x6, x13, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldr q9, [x14, x17]\n"
+ "ld1 { v10.8h }, [x7]\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q12, [x14, x11]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
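+ // Sixteen 8-lane fp16 accumulators, one per pixel of the 4x4 output tile, are live across this loop body.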
+ "mov v26.16b, v14.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v14.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x13, LSL #4\n"
+ "mov v16.16b, v14.16b\n fmla v16.8h, v3.8h, v9.8h\n"
+ "mov v22.16b, v14.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v7.8h, v9.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v6.8h, v9.8h\n"
+ "mov v31.16b, v14.16b\n fmla v31.8h, v5.8h, v9.8h\n"
+ "mov v20.16b, v14.16b\n fmla v20.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x12, x17]\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ld1 { v30.8h }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q27, [x26, x25]\n"
+ "fmla v16.8h, v4.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v23.8h, v1.8h, v12.8h\n"
+ "mov v21.16b, v14.16b\n fmla v21.8h, v6.8h, v30.8h\n"
+ "ldr q10, [x12, x11]\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v8.8h, v12.8h\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "mov v24.16b, v14.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x7, x4]\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v8.8h, v27.8h\n"
+ "ldr q12, [x7, x28]\n"
+ "fmla v16.8h, v6.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v23.8h, v3.8h, v9.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "ldr q14, [x16, #0x0]\n"
+ "fmla v31.8h, v8.8h, v9.8h\n"
+ "fmla v20.8h, v5.8h, v9.8h\n"
+ "fmla v21.8h, v2.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x15]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v25.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x15, x25]\n"
+ "fmla v17.8h, v2.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x9]\n"
+ "fmla v16.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v23.8h, v4.8h, v10.8h\n"
+ "fmla v19.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v18.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x15, x17]\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v20.8h, v6.8h, v12.8h\n"
+ "fmla v21.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x9, x25]\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v28.8h, v3.8h, v9.8h\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x15, x11]\n"
+ "fmla v25.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v3.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "fmla v19.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q9, [x26, x4]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "ldr q10, [x14, x4]\n"
+ "fmla v25.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x14, x28]\n"
+ "fmla v21.8h, v7.8h, v9.8h\n"
+ "fmla v27.8h, v6.8h, v9.8h\n"
+ "ldr q12, [x26, x28]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "fmla v28.8h, v7.8h, v10.8h\n"
+ "fmla v25.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x7, x17]\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "ldr q9, [x12, x4]\n"
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v4.8h, v11.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "ldr q12, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v31.8h, v7.8h, v9.8h\n"
+ "fmla v26.8h, v6.8h, v9.8h\n"
+ "fmla v20.8h, v4.8h, v9.8h\n"
+ "fmla v22.8h, v3.8h, v9.8h\n"
+ "fmla v21.8h, v1.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x12, x28]\n"
+ "fmla v28.8h, v2.8h, v10.8h\n"
+ "fmla v25.8h, v1.8h, v10.8h\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x14]\n"
+ "fmla v18.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "fmla v31.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v0.8h, v10.8h\n"
+ "fmla v16.8h, v8.8h, v9.8h\n"
+ "fmla v24.8h, v7.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v19.8h, v4.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "ldr q11, [x9, x17]\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "fmla v17.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x17]\n"
+ "fmla v28.8h, v6.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x12]\n"
+ "fmla v27.8h, v4.8h, v11.8h\n"
+ "fmla v18.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v19.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v21.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x26, x17]\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmla v18.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x9, x11]\n"
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x26, x11]\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x15, x4]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "add x26, x26, #0x10\n"
+ "fmla v19.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x15, x28]\n"
+ "fmla v27.8h, v8.8h, v12.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmla v18.8h, v7.8h, v12.8h\n"
+ "fmla v30.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x9, x4]\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v25.8h, v3.8h, v10.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x9, x28]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v17.8h, v5.8h, v11.8h\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "add x9, x9, #0x10\n"
+ "fmla v16.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v20.8h, v7.8h, v12.8h\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmla v21.8h, v4.8h, v12.8h\n"
+ "fmla v27.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x14, x11]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v23.8h, v8.8h, v10.8h\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v19.8h, v7.8h, v10.8h\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v18.8h, v5.8h, v10.8h\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x7]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "add x16, x16, #0xa0\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "st1 { v28.8h }, [x8]\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q25, [x8, x5]\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "str q17, [x8, x23]\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "fmin v24.8h, v24.8h, v15.8h\n"
+ "str q29, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v20.8h, v20.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v15.8h\n"
+ "st1 { v31.8h }, [x10]\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "str q26, [x10, x5]\n"
+ "fmin v21.8h, v21.8h, v15.8h\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "str q16, [x10, x23]\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "str q24, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v20.8h }, [x27]\n"
+ "str q22, [x27, x5]\n"
+ "str q23, [x27, x23]\n"
+ "str q19, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v21.8h }, [x24]\n"
+ "str q27, [x24, x5]\n"
+ "str q18, [x24, x23]\n"
+ "str q30, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
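+ // Channel tail: the final full vector of eight channels; the bias register v14 is reused as the sixteenth accumulator since no later iteration needs it.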
+ "mov v16.16b, v14.16b\n fmla v16.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v31.16b, v14.16b\n fmla v31.8h, v3.8h, v9.8h\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "fmla v16.8h, v5.8h, v12.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v6.8h, v9.8h\n"
+ "mov v28.16b, v14.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v2.8h, v9.8h\n"
+ "ldr q24, [x12, x17]\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "ld1 { v21.8h }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q20, [x26, x25]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v18.8h, v1.8h, v12.8h\n"
+ "mov v26.16b, v14.16b\n fmla v26.8h, v6.8h, v21.8h\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v16.8h, v7.8h, v24.8h\n"
+ "fmla v17.8h, v8.8h, v12.8h\n"
+ "fmla v19.8h, v7.8h, v12.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "mov v11.16b, v14.16b\n fmla v11.8h, v3.8h, v12.8h\n"
+ "mov v10.16b, v14.16b\n fmla v10.8h, v0.8h, v12.8h\n"
+ "ldr q22, [x7, x4]\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v8.8h, v20.8h\n"
+ "ldr q21, [x7, x28]\n"
+ "fmla v31.8h, v6.8h, v24.8h\n"
+ "fmla v30.8h, v4.8h, v24.8h\n"
+ "fmla v18.8h, v3.8h, v24.8h\n"
+ "mov v12.16b, v14.16b\n fmla v12.8h, v1.8h, v24.8h\n"
+ "fmla v14.8h, v0.8h, v24.8h\n"
+ "fmla v28.8h, v8.8h, v24.8h\n"
+ "fmla v27.8h, v5.8h, v24.8h\n"
+ "fmla v26.8h, v2.8h, v24.8h\n"
+ "ld1 { v24.8h }, [x15]\n"
+ "fmla v16.8h, v8.8h, v9.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v17.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x15, x25]\n"
+ "fmla v19.8h, v2.8h, v21.8h\n"
+ "fmla v29.8h, v1.8h, v21.8h\n"
+ "ld1 { v20.8h }, [x9]\n"
+ "fmla v31.8h, v7.8h, v9.8h\n"
+ "fmla v11.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v5.8h, v9.8h\n"
+ "fmla v18.8h, v4.8h, v9.8h\n"
+ "fmla v10.8h, v3.8h, v9.8h\n"
+ "fmla v12.8h, v2.8h, v9.8h\n"
+ "fmla v14.8h, v1.8h, v9.8h\n"
+ "fmla v25.8h, v0.8h, v9.8h\n"
+ "ldr q21, [x15, x17]\n"
+ "fmla v28.8h, v0.8h, v24.8h\n"
+ "fmla v27.8h, v6.8h, v20.8h\n"
+ "fmla v26.8h, v3.8h, v20.8h\n"
+ "ldr q20, [x9, x25]\n"
+ "fmla v16.8h, v1.8h, v21.8h\n"
+ "fmla v23.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v5.8h, v22.8h\n"
+ "fmla v11.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x15, x11]\n"
+ "fmla v17.8h, v4.8h, v21.8h\n"
+ "fmla v19.8h, v3.8h, v21.8h\n"
+ "fmla v31.8h, v0.8h, v21.8h\n"
+ "fmla v10.8h, v8.8h, v20.8h\n"
+ "fmla v25.8h, v5.8h, v20.8h\n"
+ "ldr q20, [x26, x4]\n"
+ "fmla v28.8h, v2.8h, v21.8h\n"
+ "fmla v16.8h, v2.8h, v22.8h\n"
+ "fmla v23.8h, v5.8h, v21.8h\n"
+ "ldr q21, [x14, x4]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v19.8h, v4.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v11.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x14, x28]\n"
+ "fmla v26.8h, v7.8h, v20.8h\n"
+ "fmla v12.8h, v6.8h, v20.8h\n"
+ "ldr q20, [x26, x28]\n"
+ "fmla v28.8h, v4.8h, v21.8h\n"
+ "fmla v16.8h, v3.8h, v21.8h\n"
+ "fmla v27.8h, v1.8h, v21.8h\n"
+ "fmla v30.8h, v0.8h, v21.8h\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
+ "fmla v17.8h, v6.8h, v21.8h\n"
+ "ldr q21, [x7, x17]\n"
+ "fmla v14.8h, v8.8h, v20.8h\n"
+ "fmla v25.8h, v7.8h, v20.8h\n"
+ "ldr q20, [x12, x4]\n"
+ "fmla v19.8h, v8.8h, v22.8h\n"
+ "fmla v29.8h, v7.8h, v22.8h\n"
+ "fmla v31.8h, v5.8h, v22.8h\n"
+ "fmla v11.8h, v4.8h, v22.8h\n"
+ "fmla v18.8h, v2.8h, v22.8h\n"
+ "fmla v10.8h, v1.8h, v22.8h\n"
+ "ldr q22, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v28.8h, v7.8h, v20.8h\n"
+ "fmla v16.8h, v6.8h, v20.8h\n"
+ "fmla v27.8h, v4.8h, v20.8h\n"
+ "fmla v30.8h, v3.8h, v20.8h\n"
+ "fmla v26.8h, v1.8h, v20.8h\n"
+ "fmla v12.8h, v0.8h, v20.8h\n"
+ "ldr q20, [x12, x28]\n"
+ "fmla v23.8h, v2.8h, v21.8h\n"
+ "fmla v17.8h, v1.8h, v21.8h\n"
+ "fmla v19.8h, v0.8h, v21.8h\n"
+ "ld1 { v21.8h }, [x14]\n"
+ "fmla v14.8h, v2.8h, v20.8h\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "fmla v28.8h, v3.8h, v21.8h\n"
+ "fmla v27.8h, v0.8h, v21.8h\n"
+ "fmla v31.8h, v8.8h, v20.8h\n"
+ "fmla v11.8h, v7.8h, v20.8h\n"
+ "fmla v18.8h, v5.8h, v20.8h\n"
+ "fmla v10.8h, v4.8h, v20.8h\n"
+ "fmla v25.8h, v1.8h, v20.8h\n"
+ "ldr q24, [x9, x17]\n"
+ "fmla v17.8h, v2.8h, v22.8h\n"
+ "fmla v19.8h, v1.8h, v22.8h\n"
+ "ldr q20, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v23.8h, v6.8h, v21.8h\n"
+ "ld1 { v21.8h }, [x12]\n"
+ "fmla v12.8h, v4.8h, v24.8h\n"
+ "fmla v14.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v8.8h, v20.8h\n"
+ "fmla v11.8h, v5.8h, v20.8h\n"
+ "fmla v10.8h, v2.8h, v20.8h\n"
+ "ldr q20, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.8h, v6.8h, v21.8h\n"
+ "fmla v27.8h, v3.8h, v21.8h\n"
+ "fmla v26.8h, v0.8h, v21.8h\n"
+ "ldr q22, [x26, x17]\n"
+ "fmla v25.8h, v2.8h, v20.8h\n"
+ "fmla v12.8h, v7.8h, v22.8h\n"
+ "fmla v14.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v8.8h, v24.8h\n"
+ "fmla v30.8h, v7.8h, v24.8h\n"
+ "fmla v18.8h, v6.8h, v24.8h\n"
+ "fmla v26.8h, v5.8h, v24.8h\n"
+ "ldr q21, [x9, x11]\n"
+ "fmla v10.8h, v5.8h, v20.8h\n"
+ "fmla v12.8h, v5.8h, v21.8h\n"
+ "fmla v14.8h, v4.8h, v21.8h\n"
+ "fmla v25.8h, v3.8h, v21.8h\n"
+ "fmla v11.8h, v8.8h, v20.8h\n"
+ "ldr q20, [x26, x11]\n"
+ "fmla v26.8h, v8.8h, v22.8h\n"
+ "ldr q9, [x15, x4]\n"
+ "fmla v30.8h, v8.8h, v21.8h\n"
+ "fmla v18.8h, v7.8h, v21.8h\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.8h, v6.8h, v21.8h\n"
+ "ldr q21, [x15, x28]\n"
+ "fmla v12.8h, v8.8h, v20.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmla v14.8h, v7.8h, v20.8h\n"
+ "fmla v25.8h, v6.8h, v20.8h\n"
+ "ldr q24, [x9, x4]\n"
+ "fmla v23.8h, v4.8h, v9.8h\n"
+ "fmla v17.8h, v3.8h, v9.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "fmla v16.8h, v0.8h, v9.8h\n"
+ "ldr q0, [x9, x28]\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v19.8h, v5.8h, v21.8h\n"
+ "fmla v29.8h, v4.8h, v21.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "add x9, x9, #0x10\n"
+ "fmla v31.8h, v2.8h, v21.8h\n"
+ "fmla v11.8h, v1.8h, v21.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmla v27.8h, v7.8h, v24.8h\n"
+ "fmla v30.8h, v6.8h, v24.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v26.8h, v4.8h, v24.8h\n"
+ "fmla v12.8h, v3.8h, v24.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v18.8h, v8.8h, v0.8h\n"
+ "fmla v10.8h, v7.8h, v0.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmla v14.8h, v5.8h, v0.8h\n"
+ "fmla v25.8h, v4.8h, v0.8h\n"
+ "fmax v11.8h, v11.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v12.8h, v12.8h, v13.8h\n"
+ "fmax v14.8h, v14.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "st1 { v23.8h }, [x8]\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q17, [x8, x5]\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "str q19, [x8, x23]\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v11.8h, v11.8h, v15.8h\n"
+ "str q29, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "st1 { v28.8h }, [x10]\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v10.8h, v10.8h, v15.8h\n"
+ "str q16, [x10, x5]\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "fmin v12.8h, v12.8h, v15.8h\n"
+ "str q31, [x10, x23]\n"
+ "fmin v14.8h, v14.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "str q11, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v27.8h }, [x27]\n"
+ "str q30, [x27, x5]\n"
+ "str q18, [x27, x23]\n"
+ "str q10, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v26.8h }, [x24]\n"
+ "str q12, [x24, x5]\n"
+ "str q14, [x24, x23]\n"
+ "str q25, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 141f\n"
+ "ldr q14, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "add x23, x14, x17\n"
+ "add x22, x7, XZR\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x21, x7, x25\n"
+ "add x20, x14, x11\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d9, [x23], #0x8\n"
+ "ldr d10, [x22], #0x8\n"
+ "ldr d11, [x21], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v9.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x22], #0x4\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[6], [x23]\n"
+ "ld1 { v10.h }[6], [x22]\n"
+ "ld1 { v11.h }[6], [x21]\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[4], [x23]\n"
+ "ld1 { v10.h }[4], [x22]\n"
+ "ld1 { v11.h }[4], [x21]\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x23], #0x4\n"
+ "ldr s10, [x22], #0x4\n"
+ "ldr s11, [x21], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x23]\n"
+ "ld1 { v10.h }[2], [x22]\n"
+ "ld1 { v11.h }[2], [x21]\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x23, #0x0]\n"
+ "ldr h10, [x22, #0x0]\n"
+ "ldr h11, [x21, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: End
+ "mov v16.16b, v14.16b\n fmla v16.8h, v8.8h, v9.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "add x20, x26, XZR\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v6.8h, v9.8h\n"
+ "mov v21.16b, v14.16b\n fmla v21.8h, v4.8h, v9.8h\n"
+ "mov v22.16b, v14.16b\n fmla v22.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v26.16b, v14.16b\n fmla v26.8h, v0.8h, v9.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v2.8h, v11.8h\n"
+ "mov v20.16b, v14.16b\n fmla v20.8h, v5.8h, v9.8h\n"
+ "mov v24.16b, v14.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "fmla v17.8h, v8.8h, v12.8h\n"
+ "fmla v18.8h, v7.8h, v12.8h\n"
+ "fmla v19.8h, v6.8h, v12.8h\n"
+ "fmla v21.8h, v5.8h, v12.8h\n"
+ "fmla v22.8h, v4.8h, v12.8h\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "fmla v26.8h, v1.8h, v12.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: End
+ "mov v28.16b, v14.16b\n fmla v28.8h, v6.8h, v10.8h\n"
+ "add x20, x26, x25\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: End
+ "mov v31.16b, v14.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x12, x17\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "fmla v20.8h, v8.8h, v9.8h\n"
+ "fmla v21.8h, v7.8h, v9.8h\n"
+ "add x20, x7, x4\n"
+ "fmla v22.8h, v6.8h, v9.8h\n"
+ "fmla v24.8h, v5.8h, v9.8h\n"
+ "fmla v25.8h, v4.8h, v9.8h\n"
+ "fmla v26.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
+ "fmla v16.8h, v1.8h, v12.8h\n"
+ "fmla v17.8h, v0.8h, v12.8h\n"
+ "add x20, x7, x28\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: End
+ "fmla v18.8h, v2.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "add x20, x12, x11\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v22.8h, v7.8h, v10.8h\n"
+ "add x20, x15, XZR\n"
+ "fmla v23.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
+ "fmla v16.8h, v3.8h, v9.8h\n"
+ "fmla v20.8h, v0.8h, v9.8h\n"
+ "add x20, x15, x25\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: End
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v2.8h, v12.8h\n"
+ "add x20, x9, XZR\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x15, x17\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: End
+ "fmla v16.8h, v5.8h, v10.8h\n"
+ "fmla v17.8h, v4.8h, v10.8h\n"
+ "add x20, x9, x25\n"
+ "fmla v18.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v2.8h, v10.8h\n"
+ "fmla v21.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: End
+ "fmla v27.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "add x20, x15, x11\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v17.8h, v5.8h, v12.8h\n"
+ "fmla v18.8h, v4.8h, v12.8h\n"
+ "add x20, x26, x4\n"
+ "fmla v19.8h, v3.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v22.8h, v1.8h, v12.8h\n"
+ "fmla v23.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 58f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 60f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 60f\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: End
+ "fmla v28.8h, v7.8h, v11.8h\n"
+ "fmla v29.8h, v6.8h, v11.8h\n"
+ "add x20, x14, x4\n"
+ "tbz %x[n_channels], #2, 62f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 64f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 64f\n"
+ "62:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
+ "fmla v16.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "add x20, x26, x28\n"
+ "fmla v20.8h, v4.8h, v10.8h\n"
+ "fmla v21.8h, v3.8h, v10.8h\n"
+ "fmla v24.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 66f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 68f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 68f\n"
+ "66:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: End
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "add x20, x14, x28\n"
+ "tbz %x[n_channels], #2, 70f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 72f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 72f\n"
+ "70:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 71f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "72:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v19.8h, v7.8h, v12.8h\n"
+ "add x20, x7, x17\n"
+ "fmla v22.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v27.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 74f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 73f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 76f\n"
+ "73:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 76f\n"
+ "74:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 75f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 76f\n"
+ "75:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "76:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
+ "fmla v16.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "add x20, x12, x4\n"
+ "fmla v18.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 78f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 77f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 80f\n"
+ "77:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 80f\n"
+ "78:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 79f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 80f\n"
+ "79:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "80:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "fmla v20.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "add x20, x7, x11\n"
+ "fmla v24.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 82f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 81f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 84f\n"
+ "81:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 84f\n"
+ "82:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 83f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 84f\n"
+ "83:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "84:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: End
+ "fmla v17.8h, v2.8h, v12.8h\n"
+ "fmla v18.8h, v1.8h, v12.8h\n"
+ "add x20, x14, XZR\n"
+ "fmla v19.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 86f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 85f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 88f\n"
+ "85:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 88f\n"
+ "86:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 87f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 88f\n"
+ "87:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "88:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
+ "fmla v16.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "add x20, x12, x28\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 90f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 89f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 92f\n"
+ "89:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 92f\n"
+ "90:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 91f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 92f\n"
+ "91:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "92:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "add x20, x14, x25\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "fmla v27.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 94f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 93f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 96f\n"
+ "93:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 96f\n"
+ "94:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 95f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 96f\n"
+ "95:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "96:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: End
+ "fmla v19.8h, v8.8h, v12.8h\n"
+ "fmla v23.8h, v5.8h, v12.8h\n"
+ "add x20, x12, XZR\n"
+ "fmla v27.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 98f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 97f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 100f\n"
+ "97:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 100f\n"
+ "98:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 99f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 100f\n"
+ "99:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "100:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v20.8h, v6.8h, v10.8h\n"
+ "fmla v24.8h, v3.8h, v10.8h\n"
+ "add x20, x9, x17\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 102f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 101f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 104f\n"
+ "101:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 104f\n"
+ "102:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 103f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 104f\n"
+ "103:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "104:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
+ "fmla v24.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v11.8h\n"
+ "add x20, x12, x25\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v11.8h\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 106f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 105f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 108f\n"
+ "105:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 108f\n"
+ "106:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 107f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 108f\n"
+ "107:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "108:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: End
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v12.8h\n"
+ "add x20, x26, x17\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 110f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 109f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 112f\n"
+ "109:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 112f\n"
+ "110:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 111f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 112f\n"
+ "111:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "112:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: End
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x20, x9, x11\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 114f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 113f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 116f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 116f\n"
+ "113:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 116f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 116f\n"
+ "114:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 115f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 116f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 116f\n"
+ "115:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "116:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v26.8h, v7.8h, v11.8h\n"
+ "add x20, x26, x11\n"
+ "fmla v27.8h, v6.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 118f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 117f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 120f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 120f\n"
+ "117:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 120f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 120f\n"
+ "118:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 119f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 120f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 120f\n"
+ "119:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "120:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: End
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "add x20, x15, x4\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 122f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 121f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 124f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 124f\n"
+ "121:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 124f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 124f\n"
+ "122:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 123f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 124f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 124f\n"
+ "123:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "124:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: End
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v3.8h, v10.8h\n"
+ "add x20, x15, x28\n"
+ "fmla v20.8h, v1.8h, v10.8h\n"
+ "fmla v21.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 126f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 125f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 128f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 128f\n"
+ "125:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 128f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 128f\n"
+ "126:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 127f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 128f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 128f\n"
+ "127:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "128:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v19.8h, v4.8h, v11.8h\n"
+ "add x20, x9, x4\n"
+ "fmla v22.8h, v2.8h, v11.8h\n"
+ "fmla v23.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 130f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 129f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 132f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 132f\n"
+ "129:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 132f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 132f\n"
+ "130:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 131f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 132f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 132f\n"
+ "131:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "132:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
+ "fmla v24.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "add x20, x9, x28\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 134f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 133f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 136f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 136f\n"
+ "133:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 136f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 136f\n"
+ "134:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 135f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 136f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 136f\n"
+ "135:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "136:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "fmin v20.8h, v20.8h, v15.8h\n"
+ "fmin v21.8h, v21.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v24.8h, v24.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "tbz %x[n_channels], #2, 138f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.d }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.d }[0], [x22], x5\n"
+ "st1 { v24.d }[0], [x21], x5\n"
+ "add x8, x8, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v28.d }[0], [x20], x5\n"
+ "add x27, x27, #0x8\n"
+ "add x24, x24, #0x8\n"
+ "st1 { v17.d }[0], [x23], x5\n"
+ "st1 { v21.d }[0], [x22], x5\n"
+ "st1 { v25.d }[0], [x21], x5\n"
+ "st1 { v29.d }[0], [x20], x5\n"
+ "st1 { v18.d }[0], [x23], x5\n"
+ "st1 { v22.d }[0], [x22], x5\n"
+ "st1 { v26.d }[0], [x21], x5\n"
+ "st1 { v30.d }[0], [x20], x5\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "st1 { v23.d }[0], [x22]\n"
+ "st1 { v27.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 137f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[2], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[2], [x22], x5\n"
+ "st1 { v24.s }[2], [x21], x5\n"
+ "add x8, x8, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v28.s }[2], [x20], x5\n"
+ "add x27, x27, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v17.s }[2], [x23], x5\n"
+ "st1 { v21.s }[2], [x22], x5\n"
+ "st1 { v25.s }[2], [x21], x5\n"
+ "st1 { v29.s }[2], [x20], x5\n"
+ "st1 { v18.s }[2], [x23], x5\n"
+ "st1 { v22.s }[2], [x22], x5\n"
+ "st1 { v26.s }[2], [x21], x5\n"
+ "st1 { v30.s }[2], [x20], x5\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 140f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[6], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[6], [x22], x5\n"
+ "st1 { v24.h }[6], [x21], x5\n"
+ "st1 { v28.h }[6], [x20], x5\n"
+ "st1 { v17.h }[6], [x23], x5\n"
+ "st1 { v21.h }[6], [x22], x5\n"
+ "st1 { v25.h }[6], [x21], x5\n"
+ "st1 { v29.h }[6], [x20], x5\n"
+ "st1 { v18.h }[6], [x23], x5\n"
+ "st1 { v22.h }[6], [x22], x5\n"
+ "st1 { v26.h }[6], [x21], x5\n"
+ "st1 { v30.h }[6], [x20], x5\n"
+ "st1 { v19.h }[6], [x23]\n"
+ "st1 { v23.h }[6], [x22]\n"
+ "st1 { v27.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 140f\n"
+ "137:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 140f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[4], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[4], [x22], x5\n"
+ "st1 { v24.h }[4], [x21], x5\n"
+ "st1 { v28.h }[4], [x20], x5\n"
+ "st1 { v17.h }[4], [x23], x5\n"
+ "st1 { v21.h }[4], [x22], x5\n"
+ "st1 { v25.h }[4], [x21], x5\n"
+ "st1 { v29.h }[4], [x20], x5\n"
+ "st1 { v18.h }[4], [x23], x5\n"
+ "st1 { v22.h }[4], [x22], x5\n"
+ "st1 { v26.h }[4], [x21], x5\n"
+ "st1 { v30.h }[4], [x20], x5\n"
+ "st1 { v19.h }[4], [x23]\n"
+ "st1 { v23.h }[4], [x22]\n"
+ "st1 { v27.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 140f\n"
+ "138:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 139f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[0], [x22], x5\n"
+ "st1 { v24.s }[0], [x21], x5\n"
+ "add x8, x8, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v28.s }[0], [x20], x5\n"
+ "add x27, x27, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v17.s }[0], [x23], x5\n"
+ "st1 { v21.s }[0], [x22], x5\n"
+ "st1 { v25.s }[0], [x21], x5\n"
+ "st1 { v29.s }[0], [x20], x5\n"
+ "st1 { v18.s }[0], [x23], x5\n"
+ "st1 { v22.s }[0], [x22], x5\n"
+ "st1 { v26.s }[0], [x21], x5\n"
+ "st1 { v30.s }[0], [x20], x5\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "st1 { v27.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 140f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[2], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[2], [x22], x5\n"
+ "st1 { v24.h }[2], [x21], x5\n"
+ "st1 { v28.h }[2], [x20], x5\n"
+ "st1 { v17.h }[2], [x23], x5\n"
+ "st1 { v21.h }[2], [x22], x5\n"
+ "st1 { v25.h }[2], [x21], x5\n"
+ "st1 { v29.h }[2], [x20], x5\n"
+ "st1 { v18.h }[2], [x23], x5\n"
+ "st1 { v22.h }[2], [x22], x5\n"
+ "st1 { v26.h }[2], [x21], x5\n"
+ "st1 { v30.h }[2], [x20], x5\n"
+ "st1 { v19.h }[2], [x23]\n"
+ "st1 { v23.h }[2], [x22]\n"
+ "st1 { v27.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 140f\n"
+ "139:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[0], [x22], x5\n"
+ "st1 { v24.h }[0], [x21], x5\n"
+ "st1 { v28.h }[0], [x20], x5\n"
+ "st1 { v17.h }[0], [x23], x5\n"
+ "st1 { v21.h }[0], [x22], x5\n"
+ "st1 { v25.h }[0], [x21], x5\n"
+ "st1 { v29.h }[0], [x20], x5\n"
+ "st1 { v18.h }[0], [x23], x5\n"
+ "st1 { v22.h }[0], [x22], x5\n"
+ "st1 { v26.h }[0], [x21], x5\n"
+ "st1 { v30.h }[0], [x20], x5\n"
+ "st1 { v19.h }[0], [x23]\n"
+ "st1 { v23.h }[0], [x22]\n"
+ "st1 { v27.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "140:" // Tile loop: Oddments: Store: Bit 2: End
+ "141:" // Tile loop: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..96feeeeece
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,2007 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
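+ // The 36 pointers cover the 6x6 input window needed for a 4x4 output
+ // of a 3x3 stride-1 filter; they are re-ordered here into the sequence
+ // in which the assembly below consumes them.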
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
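+ // In the assembly below, x7 holds the number of full 8-lane fp16 blocks
+ // (n_channels >> 3), x15 the current input byte offset and x14 the
+ // output byte offset; v13 and v14 are the broadcast activation minimum
+ // and maximum used for the final fmax/fmin clamp.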
+ __asm__ __volatile__(
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "lsr x7, %x[n_channels], #0x3\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "sub x14, XZR, x6\n"
+ "cbz x7, 3f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "cmp x6, x7, LSL #4\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "add x17, x17, #0xa0\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr q10, [x20, x15]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x15]\n"
+ "ldr q12, [x20, x15]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v23.16b, v30.16b\n fmla v23.8h, v4.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v8.8h, v9.8h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v1.8h, v9.8h\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v0.8h, v9.8h\n"
+ "mov v16.16b, v30.16b\n fmla v16.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "mov v15.16b, v30.16b\n fmla v15.8h, v6.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v12.8h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x21, [x16, #0x58]\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v10.16b, v30.16b\n fmla v10.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x23, x15]\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "fmla v16.8h, v8.8h, v12.8h\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v15.8h, v7.8h, v12.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v6.8h, v22.8h\n"
+ "ldr q22, [x20, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v23.8h, v7.8h, v9.8h\n"
+ "fmla v10.8h, v6.8h, v12.8h\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v3.8h, v12.8h\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v24.16b, v30.16b\n fmla v24.8h, v8.8h, v18.8h\n"
+ "ldr q12, [x26, x15]\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmla v20.8h, v3.8h, v9.8h\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "ldr q30, [x17, #0x0]\n"
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "fmla v31.8h, v5.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x25, x15]\n"
+ "fmla v17.8h, v1.8h, v11.8h\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v16.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x21, x15]\n"
+ "fmla v15.8h, v2.8h, v12.8h\n"
+ "ldr x21, [x16, #0x98]\n"
+ "fmla v23.8h, v8.8h, v22.8h\n"
+ "fmla v10.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.8h, v7.8h, v22.8h\n"
+ "fmla v21.8h, v6.8h, v22.8h\n"
+ "fmla v28.8h, v5.8h, v22.8h\n"
+ "fmla v20.8h, v4.8h, v22.8h\n"
+ "fmla v19.8h, v3.8h, v22.8h\n"
+ "fmla v26.8h, v2.8h, v22.8h\n"
+ "fmla v18.8h, v1.8h, v22.8h\n"
+ "fmla v24.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.8h, v3.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "ldr q9, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v16.8h, v4.8h, v22.8h\n"
+ "fmla v15.8h, v3.8h, v22.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v10.8h, v5.8h, v11.8h\n"
+ "fmla v21.8h, v2.8h, v11.8h\n"
+ "ldr q12, [x22, x15]\n"
+ "fmla v25.8h, v0.8h, v22.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.8h, v8.8h, v9.8h\n"
+ "fmla v24.8h, v5.8h, v9.8h\n"
+ "ldr q11, [x20, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v27.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v16.8h, v5.8h, v12.8h\n"
+ "fmla v15.8h, v4.8h, v12.8h\n"
+ "fmla v23.8h, v2.8h, v12.8h\n"
+ "fmla v10.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v21.8h, v0.8h, v12.8h\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "ldr q12, [x27, x15]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla v17.8h, v7.8h, v22.8h\n"
+ "fmla v16.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v4.8h, v22.8h\n"
+ "fmla v23.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v28.8h, v0.8h, v22.8h\n"
+ "ldr q11, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v15.8h, v8.8h, v9.8h\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v24.8h, v7.8h, v12.8h\n"
+ "ldr q12, [x25, x15]\n"
+ "fmla v19.8h, v1.8h, v9.8h\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v10.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v5.8h, v9.8h\n"
+ "fmla v21.8h, v4.8h, v9.8h\n"
+ "fmla v20.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "fmla v15.8h, v0.8h, v11.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v27.8h, v7.8h, v12.8h\n"
+ "ldr x25, [x16, #0xf8]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v26.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "fmla v19.8h, v4.8h, v11.8h\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v18.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v2.8h, v9.8h\n"
+ "fmla v15.8h, v1.8h, v9.8h\n"
+ "fmla v10.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x20, x15]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v17.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "ldr x22, [x16, #0x110]\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q12, [x28, x15]\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "ldr x21, [x16, #0x118]\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "fmla v26.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "fmla v10.8h, v8.8h, v9.8h\n"
+ "fmla v21.8h, v5.8h, v9.8h\n"
+ "ldr q11, [x27, x15]\n"
+ "fmla v27.8h, v6.8h, v22.8h\n"
+ "fmla v31.8h, v3.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v20.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v2.8h, v11.8h\n"
+ "fmla v26.8h, v7.8h, v22.8h\n"
+ "fmla v18.8h, v6.8h, v22.8h\n"
+ "fmla v31.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x24, x15]\n"
+ "fmla v29.8h, v8.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "fmla v20.8h, v7.8h, v12.8h\n"
+ "fmla v19.8h, v6.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v18.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x20, x15]\n"
+ "ldp x20, x24, [x16, #0x0]\n"
+ "ldr q9, [x20, x6]\n"
+ "fmla v21.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x25, x15]\n"
+ "fmla v17.8h, v4.8h, v22.8h\n"
+ "fmla v16.8h, v3.8h, v22.8h\n"
+ "fmla v15.8h, v5.8h, v12.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v10.8h, v4.8h, v12.8h\n"
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v18.8h, v7.8h, v11.8h\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "fmax v15.8h, v15.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v22.8h\n"
+ "fmla v23.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "ldr q2, [x17, #0x30]\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "ldr q6, [x17, #0x70]\n"
+ "fmla v20.8h, v8.8h, v22.8h\n"
+ "ldr q8, [x17, #0x90]\n"
+ "fmla v19.8h, v7.8h, v22.8h\n"
+ "ldr q7, [x17, #0x80]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "str q17, [x12, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmin v15.8h, v15.8h, v14.8h\n"
+ "fmin v10.8h, v10.8h, v14.8h\n"
+ "str q16, [x11, x14]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "str q15, [x10, x14]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "str q10, [x9, x14]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v26.8h, v3.8h, v11.8h\n"
+ "ldr q3, [x17, #0x40]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmla v18.8h, v5.8h, v22.8h\n"
+ "ldr q5, [x17, #0x60]\n"
+ "fmla v24.8h, v4.8h, v22.8h\n"
+ "ldr q10, [x24, x6]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q27, [x23, x14]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "str q23, [x22, x14]\n"
+ "ldr x25, [x8, #0x40]\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "str q25, [x21, x14]\n"
+ "ldr x23, [x8, #0x48]\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "str q21, [x20, x14]\n"
+ "ldr x22, [x8, #0x50]\n"
+ "ldr x24, [x8, #0x58]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x6]\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "ldr q12, [x20, x6]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "str q31, [x25, x14]\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "str q28, [x23, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "str q20, [x22, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "str q19, [x24, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x7, LSL #4\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q29, [x23, x14]\n"
+ "add x17, x17, #0xa0\n"
+ "str q26, [x22, x14]\n"
+ "str q18, [x21, x14]\n"
+ "str q24, [x20, x14]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v30.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v8.8h, v9.8h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v15.16b, v30.16b\n fmla v15.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v0.8h, v9.8h\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x21, [x16, #0x48]\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v6.8h, v9.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v2.8h, v9.8h\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q16, [x23, x15]\n"
+ "fmla v15.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v19.8h, v1.8h, v12.8h\n"
+ "fmla v20.8h, v8.8h, v12.8h\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v21.8h, v7.8h, v12.8h\n"
+ "mov v10.16b, v30.16b\n fmla v10.8h, v6.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v31.8h, v7.8h, v24.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v9.16b, v30.16b\n fmla v9.8h, v3.8h, v12.8h\n"
+ "mov v11.16b, v30.16b\n fmla v11.8h, v0.8h, v12.8h\n"
+ "ldr q23, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v12.16b, v30.16b\n fmla v12.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x26, x15]\n"
+ "fmla v15.8h, v6.8h, v24.8h\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla v29.8h, v4.8h, v24.8h\n"
+ "fmla v19.8h, v3.8h, v24.8h\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v1.8h, v24.8h\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v0.8h, v24.8h\n"
+ "fmla v18.8h, v8.8h, v24.8h\n"
+ "fmla v27.8h, v5.8h, v24.8h\n"
+ "fmla v10.8h, v2.8h, v24.8h\n"
+ "ldr q24, [x25, x15]\n"
+ "fmla v17.8h, v1.8h, v23.8h\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v20.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x20, x15]\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v31.8h, v8.8h, v22.8h\n"
+ "fmla v28.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v15.8h, v7.8h, v22.8h\n"
+ "fmla v9.8h, v6.8h, v22.8h\n"
+ "fmla v29.8h, v5.8h, v22.8h\n"
+ "fmla v19.8h, v4.8h, v22.8h\n"
+ "fmla v11.8h, v3.8h, v22.8h\n"
+ "fmla v26.8h, v2.8h, v22.8h\n"
+ "fmla v25.8h, v1.8h, v22.8h\n"
+ "fmla v12.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.8h, v3.8h, v24.8h\n"
+ "fmla v18.8h, v0.8h, v24.8h\n"
+ "fmla v27.8h, v6.8h, v16.8h\n"
+ "fmla v10.8h, v3.8h, v16.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v20.8h, v4.8h, v22.8h\n"
+ "fmla v21.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v28.8h, v5.8h, v23.8h\n"
+ "fmla v9.8h, v2.8h, v23.8h\n"
+ "ldr q23, [x22, x15]\n"
+ "fmla v15.8h, v0.8h, v22.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v11.8h, v8.8h, v16.8h\n"
+ "fmla v12.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x21, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v18.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ "fmla v20.8h, v5.8h, v23.8h\n"
+ "fmla v21.8h, v4.8h, v23.8h\n"
+ "fmla v31.8h, v2.8h, v23.8h\n"
+ "fmla v28.8h, v3.8h, v23.8h\n"
+ "fmla v15.8h, v1.8h, v23.8h\n"
+ "fmla v9.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x20, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v10.8h, v7.8h, v16.8h\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x27, x15]\n"
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v17.8h, v7.8h, v22.8h\n"
+ "fmla v20.8h, v6.8h, v22.8h\n"
+ "fmla v18.8h, v4.8h, v22.8h\n"
+ "fmla v31.8h, v3.8h, v22.8h\n"
+ "fmla v27.8h, v1.8h, v22.8h\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v21.8h, v8.8h, v23.8h\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "fmla v12.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v11.8h, v1.8h, v23.8h\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v28.8h, v7.8h, v23.8h\n"
+ "fmla v15.8h, v5.8h, v23.8h\n"
+ "fmla v9.8h, v4.8h, v23.8h\n"
+ "fmla v19.8h, v2.8h, v23.8h\n"
+ "ldr q23, [x24, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v17.8h, v2.8h, v22.8h\n"
+ "fmla v20.8h, v1.8h, v22.8h\n"
+ "fmla v21.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v18.8h, v7.8h, v16.8h\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla v31.8h, v6.8h, v16.8h\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v29.8h, v3.8h, v16.8h\n"
+ "fmla v10.8h, v1.8h, v16.8h\n"
+ "fmla v26.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x22, x15]\n"
+ "fmla v11.8h, v4.8h, v16.8h\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v28.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x21, x15]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla v17.8h, v6.8h, v22.8h\n"
+ "fmla v18.8h, v3.8h, v22.8h\n"
+ "fmla v27.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x20, x15]\n"
+ "fmla v15.8h, v8.8h, v16.8h\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla v9.8h, v7.8h, v16.8h\n"
+ "fmla v19.8h, v5.8h, v16.8h\n"
+ "fmla v12.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x28, x15]\n"
+ "fmla v11.8h, v2.8h, v23.8h\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v10.8h, v0.8h, v22.8h\n"
+ "fmla v26.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v3.8h, v16.8h\n"
+ "fmla v28.8h, v8.8h, v23.8h\n"
+ "fmla v9.8h, v5.8h, v23.8h\n"
+ "ldr q23, [x27, x15]\n"
+ "fmla v18.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v3.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v29.8h, v7.8h, v16.8h\n"
+ "fmla v19.8h, v6.8h, v16.8h\n"
+ "fmla v10.8h, v5.8h, v16.8h\n"
+ "fmla v11.8h, v5.8h, v23.8h\n"
+ "fmla v12.8h, v2.8h, v23.8h\n"
+ "fmla v26.8h, v7.8h, v22.8h\n"
+ "fmla v25.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v10.8h, v8.8h, v22.8h\n"
+ "ldr q30, [x23, x15]\n"
+ "fmla v29.8h, v8.8h, v16.8h\n"
+ "fmla v19.8h, v7.8h, v16.8h\n"
+ "fmla v11.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v5.8h, v16.8h\n"
+ "fmla v25.8h, v4.8h, v16.8h\n"
+ "fmla v12.8h, v3.8h, v16.8h\n"
+ "ldr q24, [x22, x15]\n"
+ "fmla v9.8h, v8.8h, v23.8h\n"
+ "ldr q16, [x24, x15]\n"
+ "fmla v17.8h, v4.8h, v30.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v20.8h, v3.8h, v30.8h\n"
+ "fmla v21.8h, v5.8h, v24.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v24.8h\n"
+ "fmla v26.8h, v8.8h, v16.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmla v25.8h, v7.8h, v16.8h\n"
+ "fmla v12.8h, v6.8h, v16.8h\n"
+ "ldr q23, [x21, x15]\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v18.8h, v1.8h, v30.8h\n"
+ "fmla v31.8h, v0.8h, v30.8h\n"
+ "ldr q16, [x20, x15]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmla v15.8h, v2.8h, v24.8h\n"
+ "fmla v9.8h, v1.8h, v24.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "str q17, [x12, x14]\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v29.8h, v6.8h, v23.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q20, [x11, x14]\n"
+ "fmla v19.8h, v8.8h, v16.8h\n"
+ "fmla v11.8h, v7.8h, v16.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "str q21, [x10, x14]\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "str q28, [x9, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmax v15.8h, v15.8h, v13.8h\n"
+ "fmax v9.8h, v9.8h, v13.8h\n"
+ "ldr x22, [x8, #0x28]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v10.8h, v4.8h, v23.8h\n"
+ "fmla v26.8h, v3.8h, v23.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmla v25.8h, v5.8h, v16.8h\n"
+ "fmla v12.8h, v4.8h, v16.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "str q18, [x23, x14]\n"
+ "fmin v15.8h, v15.8h, v14.8h\n"
+ "fmin v9.8h, v9.8h, v14.8h\n"
+ "str q31, [x22, x14]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "str q15, [x21, x14]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v11.8h, v11.8h, v13.8h\n"
+ "str q9, [x20, x14]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q27, [x23, x14]\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmin v11.8h, v11.8h, v14.8h\n"
+ "str q29, [x22, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "str q19, [x21, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v12.8h, v12.8h, v13.8h\n"
+ "str q11, [x20, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v10.8h, v10.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q10, [x23, x14]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v12.8h, v12.8h, v14.8h\n"
+ "str q26, [x22, x14]\n"
+ "add x15, x15, #0x10\n"
+ "str q25, [x21, x14]\n"
+ "str q12, [x20, x14]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 140f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "mov x14, x15\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.d }[0], [x23], #0x8\n"
+ "ld1 { v10.d }[0], [x22], #0x8\n"
+ "ld1 { v11.d }[0], [x21], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x22], #0x4\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[6], [x23], #0x2\n"
+ "ld1 { v10.h }[6], [x22], #0x2\n"
+ "ld1 { v11.h }[6], [x21], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[4], [x23], #0x2\n"
+ "ld1 { v10.h }[4], [x22], #0x2\n"
+ "ld1 { v11.h }[4], [x21], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x23], #0x4\n"
+ "ld1 { v10.s }[0], [x22], #0x4\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x23], #0x2\n"
+ "ld1 { v10.h }[2], [x22], #0x2\n"
+ "ld1 { v11.h }[2], [x21], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x23], #0x2\n"
+ "ld1 { v10.h }[0], [x22], #0x2\n"
+ "ld1 { v11.h }[0], [x21], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: End
+ "mov v16.16b, v30.16b\n fmla v16.8h, v8.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "ldr x20, [x16, #0x20]\n"
+ "add x20, x20, x15\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v6.8h, v9.8h\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v4.8h, v9.8h\n"
+ "mov v22.16b, v30.16b\n fmla v22.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v0.8h, v9.8h\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v2.8h, v11.8h\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v5.8h, v9.8h\n"
+ "mov v24.16b, v30.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "fmla v17.8h, v8.8h, v12.8h\n"
+ "fmla v18.8h, v7.8h, v12.8h\n"
+ "fmla v19.8h, v6.8h, v12.8h\n"
+ "fmla v21.8h, v5.8h, v12.8h\n"
+ "fmla v22.8h, v4.8h, v12.8h\n"
+ "mov v23.16b, v30.16b\n fmla v23.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "fmla v26.8h, v1.8h, v12.8h\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (5, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (5, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (5, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (5, 0): Bit 2: End
+ "ldr x20, [x16, #0x28]\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v6.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (5, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (5, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (5, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (5, 5): Bit 2: End
+ "ldr x20, [x16, #0x30]\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (3, 2): Bit 2: End
+ "ldr x20, [x16, #0x38]\n"
+ "fmla v20.8h, v8.8h, v9.8h\n"
+ "fmla v21.8h, v7.8h, v9.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v22.8h, v6.8h, v9.8h\n"
+ "fmla v24.8h, v5.8h, v9.8h\n"
+ "fmla v25.8h, v4.8h, v9.8h\n"
+ "fmla v26.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v0.8h, v9.8h\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (0, 1): Bit 2: End
+ "ldr x20, [x16, #0x40]\n"
+ "fmla v16.8h, v1.8h, v12.8h\n"
+ "fmla v17.8h, v0.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (0, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (0, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (0, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (0, 4): Bit 2: End
+ "ldr x20, [x16, #0x48]\n"
+ "fmla v18.8h, v2.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr x20, [x16, #0x50]\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v22.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v23.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (1, 0): Bit 2: End
+ "ldr x20, [x16, #0x58]\n"
+ "fmla v16.8h, v3.8h, v9.8h\n"
+ "fmla v20.8h, v0.8h, v9.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (1, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (1, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (1, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (1, 5): Bit 2: End
+ "ldr x20, [x16, #0x60]\n"
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v2.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (4, 0): Bit 2: End
+ "ldr x20, [x16, #0x68]\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (1, 2): Bit 2: End
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v16.8h, v5.8h, v10.8h\n"
+ "fmla v17.8h, v4.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v18.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v2.8h, v10.8h\n"
+ "fmla v21.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (4, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (4, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (4, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (4, 5): Bit 2: End
+ "ldr x20, [x16, #0x78]\n"
+ "fmla v27.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "55:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v17.8h, v5.8h, v12.8h\n"
+ "fmla v18.8h, v4.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v19.8h, v3.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v22.8h, v1.8h, v12.8h\n"
+ "fmla v23.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 57f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 59f\n"
+ "56:" // Oddments: Load input (5, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 59f\n"
+ "57:" // Oddments: Load input (5, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (5, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "59:" // Oddments: Load input (5, 1): Bit 2: End
+ "ldr x20, [x16, #0x88]\n"
+ "fmla v28.8h, v7.8h, v11.8h\n"
+ "fmla v29.8h, v6.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 61f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 63f\n"
+ "60:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 63f\n"
+ "61:" // Oddments: Load input (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "63:" // Oddments: Load input (2, 1): Bit 2: End
+ "ldr x20, [x16, #0x90]\n"
+ "fmla v16.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v20.8h, v4.8h, v10.8h\n"
+ "fmla v21.8h, v3.8h, v10.8h\n"
+ "fmla v24.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 65f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 67f\n"
+ "64:" // Oddments: Load input (5, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 67f\n"
+ "65:" // Oddments: Load input (5, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (5, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "67:" // Oddments: Load input (5, 4): Bit 2: End
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 69f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 71f\n"
+ "68:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 71f\n"
+ "69:" // Oddments: Load input (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 70f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 71f\n"
+ "70:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "71:" // Oddments: Load input (2, 4): Bit 2: End
+ "ldr x20, [x16, #0xa0]\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v19.8h, v7.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v22.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v27.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 73f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 72f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 75f\n"
+ "72:" // Oddments: Load input (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 75f\n"
+ "73:" // Oddments: Load input (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 74f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 75f\n"
+ "74:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "75:" // Oddments: Load input (0, 2): Bit 2: End
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla v16.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v18.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 77f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 76f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 79f\n"
+ "76:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 79f\n"
+ "77:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 78f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 79f\n"
+ "78:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "79:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v20.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v24.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 81f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 80f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 83f\n"
+ "80:" // Oddments: Load input (0, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 83f\n"
+ "81:" // Oddments: Load input (0, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 82f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 83f\n"
+ "82:" // Oddments: Load input (0, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "83:" // Oddments: Load input (0, 3): Bit 2: End
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla v17.8h, v2.8h, v12.8h\n"
+ "fmla v18.8h, v1.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v19.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 85f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 84f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 87f\n"
+ "84:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 87f\n"
+ "85:" // Oddments: Load input (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 86f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 87f\n"
+ "86:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "87:" // Oddments: Load input (2, 0): Bit 2: End
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla v16.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 89f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 88f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 91f\n"
+ "88:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 91f\n"
+ "89:" // Oddments: Load input (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 90f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 91f\n"
+ "90:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "91:" // Oddments: Load input (3, 4): Bit 2: End
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "fmla v27.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 93f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 92f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 95f\n"
+ "92:" // Oddments: Load input (2, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 95f\n"
+ "93:" // Oddments: Load input (2, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 94f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 95f\n"
+ "94:" // Oddments: Load input (2, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "95:" // Oddments: Load input (2, 5): Bit 2: End
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v19.8h, v8.8h, v12.8h\n"
+ "fmla v23.8h, v5.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v27.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 97f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 96f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 99f\n"
+ "96:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 99f\n"
+ "97:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 98f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 99f\n"
+ "98:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "99:" // Oddments: Load input (3, 0): Bit 2: End
+ "ldr x20, [x16, #0xd8]\n"
+ "fmla v20.8h, v6.8h, v10.8h\n"
+ "fmla v24.8h, v3.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 101f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 100f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 103f\n"
+ "100:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 103f\n"
+ "101:" // Oddments: Load input (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 102f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 103f\n"
+ "102:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "103:" // Oddments: Load input (4, 2): Bit 2: End
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla v24.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v11.8h\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 105f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 104f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 107f\n"
+ "104:" // Oddments: Load input (3, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 107f\n"
+ "105:" // Oddments: Load input (3, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 106f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 107f\n"
+ "106:" // Oddments: Load input (3, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "107:" // Oddments: Load input (3, 5): Bit 2: End
+ "ldr x20, [x16, #0xe8]\n"
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 109f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 108f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 111f\n"
+ "108:" // Oddments: Load input (5, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 111f\n"
+ "109:" // Oddments: Load input (5, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 110f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 111f\n"
+ "110:" // Oddments: Load input (5, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "111:" // Oddments: Load input (5, 2): Bit 2: End
+ "ldr x20, [x16, #0xf0]\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 113f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 112f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 115f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 115f\n"
+ "112:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 115f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 115f\n"
+ "113:" // Oddments: Load input (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 114f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 115f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 115f\n"
+ "114:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "115:" // Oddments: Load input (4, 3): Bit 2: End
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v26.8h, v7.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v27.8h, v6.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 117f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 116f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 119f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 119f\n"
+ "116:" // Oddments: Load input (5, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 119f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 119f\n"
+ "117:" // Oddments: Load input (5, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 118f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 119f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 119f\n"
+ "118:" // Oddments: Load input (5, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "119:" // Oddments: Load input (5, 3): Bit 2: End
+ "ldr x20, [x16, #0x100]\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 121f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 120f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 123f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 123f\n"
+ "120:" // Oddments: Load input (1, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 123f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 123f\n"
+ "121:" // Oddments: Load input (1, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 122f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 123f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 123f\n"
+ "122:" // Oddments: Load input (1, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "123:" // Oddments: Load input (1, 1): Bit 2: End
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v3.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v20.8h, v1.8h, v10.8h\n"
+ "fmla v21.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 125f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 124f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 127f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 127f\n"
+ "124:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 127f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 127f\n"
+ "125:" // Oddments: Load input (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 126f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 127f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 127f\n"
+ "126:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "127:" // Oddments: Load input (1, 4): Bit 2: End
+ "ldr x20, [x16, #0x110]\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v19.8h, v4.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v22.8h, v2.8h, v11.8h\n"
+ "fmla v23.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 129f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 128f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 131f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 131f\n"
+ "128:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 131f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 131f\n"
+ "129:" // Oddments: Load input (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 130f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 131f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 131f\n"
+ "130:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "131:" // Oddments: Load input (4, 1): Bit 2: End
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v24.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 133f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 132f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 135f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 135f\n"
+ "132:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 135f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 135f\n"
+ "133:" // Oddments: Load input (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 134f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 135f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 135f\n"
+ "134:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "135:" // Oddments: Load input (4, 4): Bit 2: End
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "tbz %x[n_channels], #2, 137f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 136f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x4\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 139f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.h }[6], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.h }[6], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.h }[6], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.h }[6], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.h }[6], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.h }[6], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.h }[6], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.h }[6], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 139f\n"
+ "136:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 139f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.h }[4], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.h }[4], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.h }[4], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.h }[4], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.h }[4], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.h }[4], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.h }[4], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.h }[4], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 139f\n"
+ "137:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 138f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x4\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 139f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.h }[2], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.h }[2], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.h }[2], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.h }[2], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.h }[2], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.h }[2], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.h }[2], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 139f\n"
+ "138:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.h }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.h }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.h }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.h }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.h }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.h }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.h }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.h }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "139:" // Oddments: Store: Bit 2: End
+ "140:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..8ad6a37fea
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
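+// Two entry points are generated: the "indirect" variant gathers its input
+// points through an array of row pointers, while the "direct" variant walks
+// a dense NHWC tensor using the supplied row/column strides.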
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
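+ // VLType::None: fixed-length 128-bit NEON vectors, i.e. eight fp16 lanes
+ // per register (no SVE).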
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
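+ // A 3x3 kernel at stride 2 producing a 2x2 output tile reads a
+ // (2 - 1) * 2 + 3 = 5x5 input patch per channel.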
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..8954999990
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,895 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
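+  // Generated assembly tile loop: v0-v8 hold the nine 3x3 kernel taps, v31
+  // the accumulator initialiser (bias), and v26/v27 the activation min/max
+  // used for the final fmax/fmin clamp.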
+ __asm__ __volatile__(
+ "mov x23, #0x0\n"
+ "mov x27, #0x0\n"
+ "1:" // Tile loop
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
+ "mov x25, #0x2\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x23, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x27, x6, x22\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x6, x6, #0x1\n"
+ "mul x20, x23, x21\n" // offset = tile_i * ld_output_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x16, x8, x24, LSL #1\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x27, x7, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x3\n"
+ "add x14, x16, x24, LSL #1\n"
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "add x13, x6, x6\n"
+ "add x12, x14, x24, LSL #1\n"
+ "add x11, x13, x6\n"
+ "add x17, x17, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x10, x12, x24, LSL #1\n"
+ "add x9, x11, x6\n"
+ "add x28, x17, x21, LSL #1\n"
+ "lsl x7, x7, #0x1\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q31, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldr q9, [x14, x13]\n"
+ "ld1 { v10.8h }, [x8]\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q12, [x8, x11]\n"
+ "ldr q13, [x8, x9]\n"
+ "ld1 { v14.8h }, [x16]\n"
+ "ldr q15, [x16, x6]\n"
+ "ldr q16, [x8, x13]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
+ "add x23, x23, #0x10\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x8]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q21, [x16, x9]\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ld1 { v20.8h }, [x12]\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.8h, v4.8h, v15.8h\n"
+ "ld1 { v25.8h }, [x14]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x12, x6]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "ldr q18, [x14, x6]\n"
+ "fmla v28.8h, v5.8h, v21.8h\n"
+ "ldr q24, [x14, x11]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "ldr q31, [x15, #0x0]\n"
+ "cmp x23, x22, LSL #4\n"
+ "fmla v29.8h, v5.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x11]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v23.8h, v3.8h, v20.8h\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
+ "ldr q21, [x10, x6]\n"
+ "fmla v23.8h, v0.8h, v25.8h\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v22.8h, v1.8h, v24.8h\n"
+ "add x21, x21, #0x10\n"
+ "fmla v23.8h, v4.8h, v19.8h\n"
+ "ldr q20, [x14, x9]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v22.8h, v5.8h, v16.8h\n"
+ "ldr q19, [x10, x11]\n"
+ "fmla v29.8h, v6.8h, v25.8h\n"
+ "ld1 { v17.8h }, [x10]\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v22.8h, v2.8h, v20.8h\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v29.8h, v7.8h, v18.8h\n"
+ "ldr q16, [x12, x13]\n"
+ "fmla v23.8h, v6.8h, v17.8h\n"
+ "ldr q18, [x10, x13]\n"
+ "fmla v22.8h, v3.8h, v16.8h\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
+ "ldr q13, [x8, x9]\n"
+ "fmla v22.8h, v7.8h, v19.8h\n"
+ "ld1 { v14.8h }, [x16]\n"
+ "fmla v28.8h, v7.8h, v24.8h\n"
+ "ldr q12, [x8, x11]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x8, x13]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "fmla v28.8h, v8.8h, v20.8h\n"
+ "ldr q17, [x10, x9]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v17.8h\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q15, [x16, x6]\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "fmax v23.8h, v23.8h, v26.8h\n"
+ "fmax v22.8h, v22.8h, v26.8h\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x13]\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v23.8h, v23.8h, v27.8h\n"
+ "fmin v22.8h, v22.8h, v27.8h\n"
+ "add x12, x12, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v29.8h }, [x17]\n"
+ "add x15, x15, #0xa0\n"
+ "str q28, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "st1 { v23.8h }, [x28]\n"
+ "str q22, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x16, x9]\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ld1 { v19.8h }, [x12]\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.8h, v4.8h, v15.8h\n"
+ "ld1 { v25.8h }, [x14]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "ldr q18, [x12, x6]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "ldr q24, [x14, x6]\n"
+ "fmla v28.8h, v5.8h, v20.8h\n"
+ "ldr q23, [x14, x11]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v5.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x11]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v21.8h, v4.8h, v17.8h\n"
+ "ldr q20, [x10, x6]\n"
+ "fmla v22.8h, v0.8h, v25.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v22.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x14, x9]\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "ldr q18, [x10, x11]\n"
+ "fmla v29.8h, v6.8h, v25.8h\n"
+ "ld1 { v17.8h }, [x10]\n"
+ "fmla v22.8h, v1.8h, v24.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "fmla v29.8h, v7.8h, v24.8h\n"
+ "ldr q16, [x12, x13]\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmla v22.8h, v6.8h, v17.8h\n"
+ "ldr q17, [x10, x13]\n"
+ "fmla v21.8h, v3.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmla v22.8h, v7.8h, v20.8h\n"
+ "fmla v21.8h, v7.8h, v18.8h\n"
+ "st1 { v29.8h }, [x17]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.8h, v7.8h, v23.8h\n"
+ "fmla v22.8h, v5.8h, v16.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x10, x9]\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmla v22.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "fmax v22.8h, v22.8h, v26.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmax v21.8h, v21.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "str q28, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "fmin v22.8h, v22.8h, v27.8h\n"
+ "fmin v21.8h, v21.8h, v27.8h\n"
+ "st1 { v22.8h }, [x28]\n"
+ "str q21, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
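+    // Oddments: the low three bits of n_channels select 4-, 2- and
+    // 1-element partial loads and stores for the remaining channel lanes.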
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 81f\n"
+ "ldr q31, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "add x27, x14, x13\n"
+ "add x26, x8, XZR\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x25, x8, x6\n"
+ "add x24, x8, x11\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x23, x8, x9\n"
+ "add x22, x16, XZR\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "add x21, x16, x6\n"
+ "add x20, x8, x13\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d11, [x25], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "ldr d15, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v9.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[6], [x27]\n"
+ "ld1 { v10.h }[6], [x26]\n"
+ "ld1 { v11.h }[6], [x25]\n"
+ "ld1 { v12.h }[6], [x24]\n"
+ "ld1 { v13.h }[6], [x23]\n"
+ "ld1 { v14.h }[6], [x22]\n"
+ "ld1 { v15.h }[6], [x21]\n"
+ "ld1 { v16.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[4], [x27]\n"
+ "ld1 { v10.h }[4], [x26]\n"
+ "ld1 { v11.h }[4], [x25]\n"
+ "ld1 { v12.h }[4], [x24]\n"
+ "ld1 { v13.h }[4], [x23]\n"
+ "ld1 { v14.h }[4], [x22]\n"
+ "ld1 { v15.h }[4], [x21]\n"
+ "ld1 { v16.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x27], #0x4\n"
+ "ldr s10, [x26], #0x4\n"
+ "ldr s11, [x25], #0x4\n"
+ "ldr s12, [x24], #0x4\n"
+ "ldr s13, [x23], #0x4\n"
+ "ldr s14, [x22], #0x4\n"
+ "ldr s15, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x27]\n"
+ "ld1 { v10.h }[2], [x26]\n"
+ "ld1 { v11.h }[2], [x25]\n"
+ "ld1 { v12.h }[2], [x24]\n"
+ "ld1 { v13.h }[2], [x23]\n"
+ "ld1 { v14.h }[2], [x22]\n"
+ "ld1 { v15.h }[2], [x21]\n"
+ "ld1 { v16.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x27, #0x0]\n"
+ "ldr h10, [x26, #0x0]\n"
+ "ldr h11, [x25, #0x0]\n"
+ "ldr h12, [x24, #0x0]\n"
+ "ldr h13, [x23, #0x0]\n"
+ "ldr h14, [x22, #0x0]\n"
+ "ldr h15, [x21, #0x0]\n"
+ "ldr h16, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "add x20, x16, x11\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v14.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v15.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v0.8h, v16.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "add x20, x16, x9\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "add x20, x16, x13\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: End
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "add x20, x12, XZR\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v30.8h, v3.8h, v14.8h\n"
+ "add x20, x14, XZR\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s15, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h15, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "fmla v30.8h, v0.8h, v15.8h\n"
+ "add x20, x12, x6\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "add x20, x14, x6\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v16.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v16.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v16.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h16, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
+ "fmla v28.8h, v7.8h, v16.8h\n"
+ "fmla v30.8h, v1.8h, v16.8h\n"
+ "add x20, x12, x11\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "add x20, x14, x11\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "add x20, x12, x9\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
+ "fmla v31.8h, v5.8h, v14.8h\n"
+ "add x20, x10, XZR\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v15.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v15.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s15, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v15.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h15, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
+ "fmla v30.8h, v6.8h, v15.8h\n"
+ "add x20, x14, x9\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "add x20, x10, x6\n"
+ "tbz %x[n_channels], #2, 58f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 60f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 60f\n"
+ "58:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
+ "fmla v30.8h, v7.8h, v13.8h\n"
+ "add x20, x12, x13\n"
+ "tbz %x[n_channels], #2, 62f\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v16.h }[6], [x20]\n"
+ "b 64f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v16.h }[4], [x20]\n"
+ "b 64f\n"
+ "62:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v16.h }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h16, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "fmla v30.8h, v5.8h, v16.8h\n"
+ "fmla v31.8h, v3.8h, v16.8h\n"
+ "add x20, x10, x11\n"
+ "tbz %x[n_channels], #2, 66f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 68f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 68f\n"
+ "66:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
+ "fmla v31.8h, v7.8h, v14.8h\n"
+ "add x20, x10, x13\n"
+ "tbz %x[n_channels], #2, 70f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v15.h }[6], [x20]\n"
+ "b 72f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v15.h }[4], [x20]\n"
+ "b 72f\n"
+ "70:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 71f\n"
+ "ldr s15, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v15.h }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h15, [x20, #0x0]\n"
+ "72:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
+ "fmla v30.8h, v8.8h, v15.8h\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "add x20, x10, x9\n"
+ "tbz %x[n_channels], #2, 74f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 73f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 76f\n"
+ "73:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 76f\n"
+ "74:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 75f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 76f\n"
+ "75:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "76:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v30.8h, v30.8h, v26.8h\n"
+ "fmax v31.8h, v31.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v27.8h\n"
+ "tbz %x[n_channels], #2, 78f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.d }[0], [x21], x7\n"
+ "st1 { v30.d }[0], [x20], x7\n"
+ "add x17, x17, #0x8\n"
+ "add x28, x28, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 77f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[2], [x21], x7\n"
+ "st1 { v30.s }[2], [x20], x7\n"
+ "add x17, x17, #0x4\n"
+ "add x28, x28, #0x4\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[6], [x21], x7\n"
+ "st1 { v30.h }[6], [x20], x7\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 80f\n"
+ "77:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 80f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[4], [x21], x7\n"
+ "st1 { v30.h }[4], [x20], x7\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 80f\n"
+ "78:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 79f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[0], [x21], x7\n"
+ "st1 { v30.s }[0], [x20], x7\n"
+ "add x17, x17, #0x4\n"
+ "add x28, x28, #0x4\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[2], [x21], x7\n"
+ "st1 { v30.h }[2], [x20], x7\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 80f\n"
+ "79:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[0], [x21], x7\n"
+ "st1 { v30.h }[0], [x20], x7\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "80:" // Tile loop: Oddments: Store: Bit 2: End
+ "81:" // Tile loop: End
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x27, x27, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x27, x27, XZR, LT\n"
+ "cmp x23, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..6ae0b30afd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,897 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
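+      // Map the 25 pointers of the 5x5 receptive field into the order the
+      // assembly consumes them; the centre point (index 12) is read first.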
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
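+  // As in the direct variant, v0-v8 are the kernel taps, v31 the accumulator
+  // initialiser and v26/v27 the activation clamps; input addresses come from
+  // the reordered inptrs table (x13) and results are written through the four
+  // output pointers loaded from outptrs.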
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x25, #0x10\n" // cntb _, ALL, #1
+ "lsr x24, %x[n_channels], #0x3\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "mov x28, #0x0\n"
+ "sub x22, XZR, x25\n"
+ "cbz x24, 3f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "cmp x25, x24, LSL #4\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "add x23, x23, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x28]\n"
+ "ldr q14, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "ldr q15, [x21, x28]\n"
+ "ldr q16, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v31.16b\n fmla v24.8h, v8.8h, v9.8h\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v6.8h, v9.8h\n"
+ "ldr x21, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q19, [x21, x28]\n"
+ "fmla v23.8h, v2.8h, v13.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v24.8h, v3.8h, v14.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.8h, v4.8h, v15.8h\n"
+ "fmla v23.8h, v4.8h, v19.8h\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q22, [x20, x28]\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v23.8h, v5.8h, v20.8h\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q21, [x20, x28]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v2.8h, v9.8h\n"
+ "mov v19.16b, v31.16b\n fmla v19.8h, v0.8h, v9.8h\n"
+ "ldr q31, [x23, #0x0]\n"
+ "fmla v24.8h, v5.8h, v18.8h\n"
+ "fmla v23.8h, v3.8h, v18.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v19.8h, v4.8h, v16.8h\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v20.8h, v0.8h, v22.8h\n"
+ "ldr q0, [x23, #0x10]\n"
+ "fmla v19.8h, v1.8h, v21.8h\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v20.8h, v4.8h, v18.8h\n"
+ "fmla v19.8h, v5.8h, v16.8h\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v24.8h, v6.8h, v22.8h\n"
+ "fmla v20.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q1, [x23, #0x20]\n"
+ "fmla v19.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v7.8h, v17.8h\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmax v24.8h, v24.8h, v26.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v19.8h, v3.8h, v17.8h\n"
+ "fmax v23.8h, v23.8h, v26.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "fmla v20.8h, v5.8h, v17.8h\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ "fmin v24.8h, v24.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v19.8h, v7.8h, v16.8h\n"
+ "fmin v23.8h, v23.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "fmla v19.8h, v6.8h, v16.8h\n"
+ "fmla v20.8h, v8.8h, v16.8h\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmax v20.8h, v20.8h, v26.8h\n"
+ "fmin v20.8h, v20.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v19.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x23, #0x90]\n"
+ "fmax v19.8h, v19.8h, v26.8h\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x25]\n"
+ "fmin v19.8h, v19.8h, v27.8h\n"
+ "add x28, x28, #0x10\n"
+ "ldr q10, [x20, x25]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "str q24, [x12, x22]\n"
+ "add x23, x23, #0xa0\n"
+ "ldr q11, [x21, x25]\n"
+ "ldr q12, [x20, x25]\n"
+ "str q23, [x11, x22]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x25]\n"
+ "str q20, [x10, x22]\n"
+ "ldr q14, [x20, x25]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "str q19, [x9, x22]\n"
+ "ldr q15, [x21, x25]\n"
+ "ldr q16, [x20, x25]\n"
+ "add x25, x25, #0x10\n"
+ "cmp x25, x24, LSL #4\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v25.16b, v31.16b\n fmla v25.8h, v8.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v6.8h, v9.8h\n"
+ "ldr x21, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "fmla v24.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "fmla v24.8h, v2.8h, v13.8h\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v25.8h, v3.8h, v14.8h\n"
+ "fmla v24.8h, v0.8h, v16.8h\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v25.8h, v4.8h, v15.8h\n"
+ "fmla v24.8h, v4.8h, v18.8h\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q23, [x20, x28]\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v5.8h, v20.8h\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q22, [x20, x28]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v2.8h, v9.8h\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v0.8h, v9.8h\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v24.8h, v3.8h, v19.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v20.8h, v4.8h, v16.8h\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v0.8h, v23.8h\n"
+ "fmla v20.8h, v1.8h, v22.8h\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v21.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v20.8h, v5.8h, v16.8h\n"
+ "fmla v25.8h, v6.8h, v23.8h\n"
+ "ldr x20, [x13, #0x90]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v1.8h, v17.8h\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.8h, v2.8h, v19.8h\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v20.8h, v3.8h, v18.8h\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.8h, v7.8h, v22.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v20.8h, v6.8h, v17.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "fmla v20.8h, v8.8h, v16.8h\n"
+ "fmax v25.8h, v25.8h, v26.8h\n"
+ "add x22, x22, #0x10\n"
+ "fmax v24.8h, v24.8h, v26.8h\n"
+ "fmax v21.8h, v21.8h, v26.8h\n"
+ "add x28, x28, #0x10\n"
+ "fmax v20.8h, v20.8h, v26.8h\n"
+ "fmin v25.8h, v25.8h, v27.8h\n"
+ "str q25, [x12, x22]\n"
+ "fmin v24.8h, v24.8h, v27.8h\n"
+ "fmin v21.8h, v21.8h, v27.8h\n"
+ "str q24, [x11, x22]\n"
+ "fmin v20.8h, v20.8h, v27.8h\n"
+ "str q21, [x10, x22]\n"
+ "str q20, [x9, x22]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 80f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "ldr x27, [x13, #0x0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ "add x27, x27, x28\n"
+ "add x26, x26, x28\n"
+ "ldr x25, [x13, #0x10]\n"
+ "ldr x24, [x13, #0x18]\n"
+ "add x25, x25, x28\n"
+ "add x24, x24, x28\n"
+ "ldr x23, [x13, #0x20]\n"
+ "ldr x22, [x13, #0x28]\n"
+ "add x23, x23, x28\n"
+ "add x22, x22, x28\n"
+ "ldr x21, [x13, #0x30]\n"
+ "ldr x20, [x13, #0x38]\n"
+ "add x21, x21, x28\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.d }[0], [x27], #0x8\n"
+ "ld1 { v10.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v12.d }[0], [x24], #0x8\n"
+ "ld1 { v13.d }[0], [x23], #0x8\n"
+ "ld1 { v14.d }[0], [x22], #0x8\n"
+ "ld1 { v15.d }[0], [x21], #0x8\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[6], [x27], #0x2\n"
+ "ld1 { v10.h }[6], [x26], #0x2\n"
+ "ld1 { v11.h }[6], [x25], #0x2\n"
+ "ld1 { v12.h }[6], [x24], #0x2\n"
+ "ld1 { v13.h }[6], [x23], #0x2\n"
+ "ld1 { v14.h }[6], [x22], #0x2\n"
+ "ld1 { v15.h }[6], [x21], #0x2\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[4], [x27], #0x2\n"
+ "ld1 { v10.h }[4], [x26], #0x2\n"
+ "ld1 { v11.h }[4], [x25], #0x2\n"
+ "ld1 { v12.h }[4], [x24], #0x2\n"
+ "ld1 { v13.h }[4], [x23], #0x2\n"
+ "ld1 { v14.h }[4], [x22], #0x2\n"
+ "ld1 { v15.h }[4], [x21], #0x2\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x27], #0x4\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "ld1 { v14.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x27], #0x2\n"
+ "ld1 { v10.h }[2], [x26], #0x2\n"
+ "ld1 { v11.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x24], #0x2\n"
+ "ld1 { v13.h }[2], [x23], #0x2\n"
+ "ld1 { v14.h }[2], [x22], #0x2\n"
+ "ld1 { v15.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x27], #0x2\n"
+ "ld1 { v10.h }[0], [x26], #0x2\n"
+ "ld1 { v11.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x24], #0x2\n"
+ "ld1 { v13.h }[0], [x23], #0x2\n"
+ "ld1 { v14.h }[0], [x22], #0x2\n"
+ "ld1 { v15.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr x20, [x13, #0x40]\n"
+ "add x20, x20, x28\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v14.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v15.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v0.8h, v16.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (1, 4): Bit 2: End
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (1, 2): Bit 2: End
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (3, 0): Bit 2: End
+ "ldr x20, [x13, #0x60]\n"
+ "fmla v30.8h, v3.8h, v14.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (2, 0): Bit 2: End
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "fmla v30.8h, v0.8h, v15.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr x20, [x13, #0x70]\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (2, 1): Bit 2: End
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v28.8h, v7.8h, v16.8h\n"
+ "fmla v30.8h, v1.8h, v16.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr x20, [x13, #0x80]\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (2, 3): Bit 2: End
+ "ldr x20, [x13, #0x88]\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (3, 4): Bit 2: End
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v31.8h, v5.8h, v14.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v15.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v15.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (4, 0): Bit 2: End
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v30.8h, v6.8h, v15.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Load input (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "55:" // Oddments: Load input (2, 4): Bit 2: End
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 57f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 59f\n"
+ "56:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 59f\n"
+ "57:" // Oddments: Load input (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "59:" // Oddments: Load input (4, 1): Bit 2: End
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v30.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 61f\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "b 63f\n"
+ "60:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "b 63f\n"
+ "61:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "63:" // Oddments: Load input (3, 2): Bit 2: End
+ "ldr x20, [x13, #0xb0]\n"
+ "fmla v30.8h, v5.8h, v16.8h\n"
+ "fmla v31.8h, v3.8h, v16.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 65f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 67f\n"
+ "64:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 67f\n"
+ "65:" // Oddments: Load input (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "67:" // Oddments: Load input (4, 3): Bit 2: End
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v31.8h, v7.8h, v14.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 69f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v15.h }[6], [x20], #0x2\n"
+ "b 71f\n"
+ "68:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v15.h }[4], [x20], #0x2\n"
+ "b 71f\n"
+ "69:" // Oddments: Load input (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 70f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "b 71f\n"
+ "70:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "71:" // Oddments: Load input (4, 2): Bit 2: End
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v30.8h, v8.8h, v15.8h\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 73f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 72f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 75f\n"
+ "72:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 75f\n"
+ "73:" // Oddments: Load input (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 74f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 75f\n"
+ "74:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "75:" // Oddments: Load input (4, 4): Bit 2: End
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v30.8h, v30.8h, v26.8h\n"
+ "fmax v31.8h, v31.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v27.8h\n"
+ "tbz %x[n_channels], #2, 77f\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #1, 76f\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "st1 { v28.h }[6], [x12], #0x2\n"
+ "st1 { v29.h }[6], [x11], #0x2\n"
+ "st1 { v30.h }[6], [x10], #0x2\n"
+ "st1 { v31.h }[6], [x9], #0x2\n"
+ "b 79f\n"
+ "76:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 79f\n"
+ "st1 { v28.h }[4], [x12], #0x2\n"
+ "st1 { v29.h }[4], [x11], #0x2\n"
+ "st1 { v30.h }[4], [x10], #0x2\n"
+ "st1 { v31.h }[4], [x9], #0x2\n"
+ "b 79f\n"
+ "77:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 78f\n"
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "st1 { v28.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x11], #0x2\n"
+ "st1 { v30.h }[2], [x10], #0x2\n"
+ "st1 { v31.h }[2], [x9], #0x2\n"
+ "b 79f\n"
+ "78:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "st1 { v28.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x11], #0x2\n"
+ "st1 { v30.h }[0], [x10], #0x2\n"
+ "st1 { v31.h }[0], [x9], #0x2\n"
+ "79:" // Oddments: Store: Bit 2: End
+ "80:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
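The "Oddments" blocks that close this kernel handle the final n_channels % 8 lanes by testing the low bits of n_channels: bit 2 selects a 64-bit (4 x fp16) access, bit 1 a 32-bit (2 x fp16) access, and bit 0 a single halfword, with each access post-indexing the pointer so the next, narrower one continues where the previous stopped. A minimal C++ sketch of the same decomposition, with illustrative names that are not part of the generated source:

    // Tail load of (n_channels % 8) fp16 lanes, mirroring the tbz/ld1 pattern:
    // widest power-of-two chunk first, pointer advanced after each chunk.
    static inline void load_tail_fp16(__fp16 *lanes, const __fp16 *src, unsigned int n_channels)
    {
        unsigned int i = 0;
        if (n_channels & 4) for (int k = 0; k < 4; k++) lanes[i++] = *src++; // "Bit 2" path: ld1 { v.d }[0]
        if (n_channels & 2) for (int k = 0; k < 2; k++) lanes[i++] = *src++; // "Bit 1" path: ld1 { v.s }[...]
        if (n_channels & 1) lanes[i] = *src;                                 // "Bit 0" path: ld1 { v.h }[...]
    }

The store sequence at labels 76 to 79 applies the same decomposition when writing the four clamped accumulators (v28 to v31) back out.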
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1d1d491c28
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
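For a stride-1 5x5 kernel producing a 2x2 output tile, every tile reads an (output_rows + kernel_rows - 1) x (output_cols + kernel_cols - 1) = 6x6 input patch, which is why the indirect implementation below carries a 36-entry array of input pointers. A small constexpr sketch of that relationship (the helper name is illustrative, not part of the library):

    // Input patch extent required per output tile of a depth-first kernel.
    constexpr unsigned int input_extent(unsigned int out, unsigned int kern, unsigned int stride = 1)
    {
        return (out - 1) * stride + kern; // (2 - 1) * 1 + 5 = 6
    }
    static_assert(input_extent(2, 5) == 6, "a 2x2 output tile of a 5x5 stride-1 kernel reads a 6x6 patch");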
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..cecaf79704
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1387 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
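+ // The tile loop below keeps (tile_i, tile_j) in x27/x26, derives per-tile
+ // input/output base addresses from the row and column strides, runs the
+ // unrolled 8-lane fp16 channel loop (label 2) and channel tail (label 3),
+ // then handles the n_channels % 8 remainder in the "Oddments" path (label 4).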
+ __asm__ __volatile__(
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x23, #0x2\n"
+ "mov x25, #0x2\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x26, x2, x22\n" // offset += tile_j * ld_input_col
+ "ldr x3, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x2, x2, #0x1\n"
+ "mul x20, x27, x21\n" // offset = tile_i * ld_output_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x6, x2, x2\n"
+ "mul x22, x22, x23\n" // offset *= kernel_stride * output_size
+ "add x4, x4, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x7, x4, x24, LSL #1\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x26, x3, x20\n" // offset += tile_j * ld_output_col
+ "add x17, x7, x24, LSL #1\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "lsr x22, %x[n_channels], #0x3\n"
+ "add x16, x17, x24, LSL #1\n"
+ "add x15, x6, x2\n"
+ "add x14, x16, x24, LSL #1\n"
+ "add x13, x15, x2\n"
+ "add x5, x5, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x12, x14, x24, LSL #1\n"
+ "add x11, x13, x2\n"
+ "add x10, x5, x21, LSL #1\n"
+ "lsl x3, x3, #0x1\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q25, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x8, x8, #0x60\n"
+ "ld1 { v5.8h }, [x4]\n"
+ "ldr q6, [x4, x2]\n"
+ "ld1 { v7.8h }, [x7]\n"
+ "ldr q8, [x7, x2]\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q13, [x7, x6]\n"
+ "ldr q11, [x4, x15]\n"
+ "ldr q12, [x4, x13]\n"
+ "ldr q10, [x7, x11]\n"
+ "ld1 { v14.8h }, [x17]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v5.8h\n"
+ "ldr q23, [x7, x15]\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v6.8h\n"
+ "add x23, x23, #0x10\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "mov v28.16b, v25.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x140]\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "add x7, x7, #0x10\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "ldr q1, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q18, [x4, x11]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "add x4, x4, #0x10\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v23.8h\n"
+ "ldr q17, [x8, #0x20]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "add x21, x21, #0x10\n"
+ "fmla v29.8h, v3.8h, v23.8h\n"
+ "fmla v28.8h, v3.8h, v21.8h\n"
+ "ldr q16, [x8, #0x30]\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v31.8h, v4.8h, v18.8h\n"
+ "ldr q0, [x17, x15]\n"
+ "fmla v29.8h, v4.8h, v21.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q20, [x8, #0x40]\n"
+ "fmla v30.8h, v19.8h, v7.8h\n"
+ "ld1 { v7.8h }, [x7]\n"
+ "fmla v31.8h, v19.8h, v8.8h\n"
+ "fmla v29.8h, v19.8h, v14.8h\n"
+ "fmla v28.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "ldr q26, [x17, x11]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v2.8h\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v30.8h, v17.8h, v13.8h\n"
+ "ldr q1, [x17, x13]\n"
+ "fmla v31.8h, v17.8h, v23.8h\n"
+ "add x17, x17, #0x10\n"
+ "fmla v29.8h, v17.8h, v2.8h\n"
+ "fmla v28.8h, v17.8h, v0.8h\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v30.8h, v16.8h, v23.8h\n"
+ "ld1 { v24.8h }, [x16]\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "fmla v29.8h, v16.8h, v0.8h\n"
+ "fmla v28.8h, v16.8h, v1.8h\n"
+ "ldr q16, [x8, #0x80]\n"
+ "fmla v30.8h, v20.8h, v21.8h\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v31.8h, v20.8h, v10.8h\n"
+ "ldr q22, [x16, x6]\n"
+ "fmla v29.8h, v20.8h, v1.8h\n"
+ "fmla v28.8h, v20.8h, v26.8h\n"
+ "ldr q21, [x8, #0x90]\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "ldr q5, [x16, x11]\n"
+ "fmla v31.8h, v19.8h, v6.8h\n"
+ "fmla v29.8h, v19.8h, v24.8h\n"
+ "fmla v28.8h, v19.8h, v23.8h\n"
+ "ldr q11, [x8, #0xa0]\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v31.8h, v18.8h, v2.8h\n"
+ "fmla v29.8h, v18.8h, v23.8h\n"
+ "fmla v28.8h, v18.8h, v22.8h\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.8h, v17.8h, v22.8h\n"
+ "fmla v28.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v30.8h, v16.8h, v0.8h\n"
+ "ld1 { v0.8h }, [x14]\n"
+ "fmla v31.8h, v16.8h, v1.8h\n"
+ "fmla v29.8h, v16.8h, v20.8h\n"
+ "fmla v28.8h, v16.8h, v19.8h\n"
+ "ldr q16, [x8, #0xd0]\n"
+ "fmla v30.8h, v21.8h, v1.8h\n"
+ "ldr q4, [x14, x2]\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "ldr q12, [x14, x13]\n"
+ "fmla v29.8h, v21.8h, v19.8h\n"
+ "fmla v28.8h, v21.8h, v5.8h\n"
+ "ldr q13, [x8, #0xe0]\n"
+ "fmla v30.8h, v11.8h, v24.8h\n"
+ "ldr q6, [x14, x6]\n"
+ "fmla v31.8h, v11.8h, v23.8h\n"
+ "fmla v29.8h, v11.8h, v0.8h\n"
+ "fmla v28.8h, v11.8h, v4.8h\n"
+ "ldr q24, [x8, #0xf0]\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "ldr q26, [x14, x15]\n"
+ "fmla v31.8h, v18.8h, v22.8h\n"
+ "fmla v29.8h, v18.8h, v4.8h\n"
+ "fmla v28.8h, v18.8h, v6.8h\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v30.8h, v17.8h, v22.8h\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v31.8h, v17.8h, v20.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmla v29.8h, v17.8h, v6.8h\n"
+ "fmla v28.8h, v17.8h, v26.8h\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v30.8h, v16.8h, v20.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v31.8h, v16.8h, v19.8h\n"
+ "fmla v29.8h, v16.8h, v26.8h\n"
+ "fmla v28.8h, v16.8h, v12.8h\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v30.8h, v13.8h, v19.8h\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v31.8h, v13.8h, v5.8h\n"
+ "ld1 { v14.8h }, [x17]\n"
+ "fmla v29.8h, v13.8h, v12.8h\n"
+ "fmla v28.8h, v13.8h, v22.8h\n"
+ "ldr q19, [x8, #0x130]\n"
+ "fmla v30.8h, v24.8h, v0.8h\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v31.8h, v24.8h, v4.8h\n"
+ "fmla v29.8h, v24.8h, v18.8h\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v28.8h, v24.8h, v17.8h\n"
+ "ldr q0, [x8, #0x150]\n"
+ "fmla v30.8h, v23.8h, v4.8h\n"
+ "ldr q13, [x7, x6]\n"
+ "fmla v31.8h, v23.8h, v6.8h\n"
+ "fmla v29.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v28.8h, v23.8h, v16.8h\n"
+ "ldr q1, [x8, #0x160]\n"
+ "fmla v30.8h, v21.8h, v6.8h\n"
+ "ld1 { v5.8h }, [x4]\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v29.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v28.8h, v21.8h, v18.8h\n"
+ "ldr q2, [x8, #0x170]\n"
+ "fmla v30.8h, v20.8h, v26.8h\n"
+ "ldr q6, [x4, x2]\n"
+ "fmla v31.8h, v20.8h, v12.8h\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.8h, v20.8h, v18.8h\n"
+ "ldr q11, [x4, x15]\n"
+ "fmla v28.8h, v20.8h, v17.8h\n"
+ "ldr q3, [x8, #0x180]\n"
+ "fmla v30.8h, v19.8h, v12.8h\n"
+ "ldr q8, [x7, x2]\n"
+ "fmla v31.8h, v19.8h, v22.8h\n"
+ "ldr q10, [x7, x11]\n"
+ "fmla v29.8h, v19.8h, v17.8h\n"
+ "ldr q12, [x4, x13]\n"
+ "fmla v28.8h, v19.8h, v16.8h\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q4, [x8, #0x190]\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "add x8, x8, #0x1a0\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "st1 { v30.8h }, [x5]\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "str q31, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v29.8h }, [x10]\n"
+ "str q28, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q22, [x7, x15]\n"
+ "mov v5.16b, v25.16b\n fmla v5.8h, v0.8h, v6.8h\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v5.8h, v1.8h, v9.8h\n"
+ "add x7, x7, #0x10\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "ldr q18, [x8, #0x10]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q16, [x4, x11]\n"
+ "fmla v5.8h, v2.8h, v11.8h\n"
+ "add x4, x4, #0x10\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v22.8h\n"
+ "ldr q17, [x8, #0x20]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v5.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v21.8h\n"
+ "ldr q20, [x8, #0x30]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v5.8h, v4.8h, v16.8h\n"
+ "ldr q28, [x17, x15]\n"
+ "fmla v30.8h, v4.8h, v21.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "ldr q16, [x8, #0x40]\n"
+ "fmla v31.8h, v19.8h, v7.8h\n"
+ "fmla v5.8h, v19.8h, v8.8h\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "fmla v29.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v31.8h, v18.8h, v8.8h\n"
+ "ldr q1, [x17, x11]\n"
+ "fmla v5.8h, v18.8h, v13.8h\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "fmla v29.8h, v18.8h, v2.8h\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v31.8h, v17.8h, v13.8h\n"
+ "ldr q26, [x17, x13]\n"
+ "fmla v5.8h, v17.8h, v22.8h\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "fmla v29.8h, v17.8h, v28.8h\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ld1 { v25.8h }, [x16]\n"
+ "fmla v5.8h, v20.8h, v21.8h\n"
+ "fmla v30.8h, v20.8h, v28.8h\n"
+ "fmla v29.8h, v20.8h, v26.8h\n"
+ "ldr q24, [x8, #0x80]\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v5.8h, v16.8h, v10.8h\n"
+ "ldr q0, [x16, x6]\n"
+ "fmla v30.8h, v16.8h, v26.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q22, [x8, #0x90]\n"
+ "fmla v31.8h, v19.8h, v14.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v5.8h, v19.8h, v6.8h\n"
+ "fmla v30.8h, v19.8h, v25.8h\n"
+ "fmla v29.8h, v19.8h, v23.8h\n"
+ "ldr q21, [x8, #0xa0]\n"
+ "fmla v31.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v5.8h, v18.8h, v2.8h\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "fmla v29.8h, v18.8h, v0.8h\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v31.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v5.8h, v17.8h, v28.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v30.8h, v17.8h, v0.8h\n"
+ "fmla v29.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v31.8h, v24.8h, v28.8h\n"
+ "ld1 { v7.8h }, [x14]\n"
+ "fmla v5.8h, v24.8h, v26.8h\n"
+ "fmla v30.8h, v24.8h, v20.8h\n"
+ "fmla v29.8h, v24.8h, v19.8h\n"
+ "ldr q2, [x8, #0xd0]\n"
+ "fmla v31.8h, v22.8h, v26.8h\n"
+ "ldr q28, [x14, x2]\n"
+ "fmla v5.8h, v22.8h, v1.8h\n"
+ "ldr q13, [x14, x13]\n"
+ "fmla v30.8h, v22.8h, v19.8h\n"
+ "fmla v29.8h, v22.8h, v16.8h\n"
+ "ldr q14, [x8, #0xe0]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "ldr q26, [x14, x6]\n"
+ "fmla v5.8h, v21.8h, v23.8h\n"
+ "fmla v30.8h, v21.8h, v7.8h\n"
+ "fmla v29.8h, v21.8h, v28.8h\n"
+ "ldr q25, [x8, #0xf0]\n"
+ "fmla v31.8h, v18.8h, v23.8h\n"
+ "ldr q24, [x14, x15]\n"
+ "fmla v5.8h, v18.8h, v0.8h\n"
+ "fmla v30.8h, v18.8h, v28.8h\n"
+ "fmla v29.8h, v18.8h, v26.8h\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v5.8h, v17.8h, v20.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmla v30.8h, v17.8h, v26.8h\n"
+ "fmla v29.8h, v17.8h, v24.8h\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v31.8h, v2.8h, v20.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v5.8h, v2.8h, v19.8h\n"
+ "fmla v30.8h, v2.8h, v24.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v31.8h, v14.8h, v19.8h\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v5.8h, v14.8h, v16.8h\n"
+ "fmla v30.8h, v14.8h, v13.8h\n"
+ "fmla v29.8h, v14.8h, v22.8h\n"
+ "ldr q19, [x8, #0x130]\n"
+ "add x8, x8, #0x140\n"
+ "fmla v31.8h, v25.8h, v7.8h\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v5.8h, v25.8h, v28.8h\n"
+ "fmla v30.8h, v25.8h, v18.8h\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v29.8h, v25.8h, v17.8h\n"
+ "fmla v31.8h, v23.8h, v28.8h\n"
+ "fmla v5.8h, v23.8h, v26.8h\n"
+ "fmla v30.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v5.8h, v21.8h, v24.8h\n"
+ "fmla v30.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.8h, v20.8h, v24.8h\n"
+ "fmla v5.8h, v20.8h, v13.8h\n"
+ "fmla v30.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
+ "fmla v31.8h, v19.8h, v13.8h\n"
+ "fmla v5.8h, v19.8h, v22.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmla v30.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "fmax v5.8h, v5.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v5.8h, v5.8h, v15.8h\n"
+ "st1 { v31.8h }, [x5]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q5, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v30.8h }, [x10]\n"
+ "str q29, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 117f\n"
+ "ldr q25, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "add x9, x4, XZR\n"
+ "add x28, x4, x2\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "add x27, x7, XZR\n"
+ "add x26, x7, x2\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x25, x4, x6\n"
+ "add x24, x7, x6\n"
+ "add x23, x4, x15\n"
+ "add x22, x4, x13\n"
+ "add x21, x7, x11\n"
+ "add x20, x17, XZR\n"
+ "add x8, x8, #0x60\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d5, [x9], #0x8\n"
+ "ldr d6, [x28], #0x8\n"
+ "ldr d7, [x27], #0x8\n"
+ "ldr d8, [x26], #0x8\n"
+ "ldr d9, [x25], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d11, [x23], #0x8\n"
+ "ldr d12, [x22], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v5.s }[2], [x9], #0x4\n"
+ "ld1 { v6.s }[2], [x28], #0x4\n"
+ "ld1 { v7.s }[2], [x27], #0x4\n"
+ "ld1 { v8.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.h }[6], [x9]\n"
+ "ld1 { v6.h }[6], [x28]\n"
+ "ld1 { v7.h }[6], [x27]\n"
+ "ld1 { v8.h }[6], [x26]\n"
+ "ld1 { v9.h }[6], [x25]\n"
+ "ld1 { v13.h }[6], [x24]\n"
+ "ld1 { v11.h }[6], [x23]\n"
+ "ld1 { v12.h }[6], [x22]\n"
+ "ld1 { v10.h }[6], [x21]\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.h }[4], [x9]\n"
+ "ld1 { v6.h }[4], [x28]\n"
+ "ld1 { v7.h }[4], [x27]\n"
+ "ld1 { v8.h }[4], [x26]\n"
+ "ld1 { v9.h }[4], [x25]\n"
+ "ld1 { v13.h }[4], [x24]\n"
+ "ld1 { v11.h }[4], [x23]\n"
+ "ld1 { v12.h }[4], [x22]\n"
+ "ld1 { v10.h }[4], [x21]\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s5, [x9], #0x4\n"
+ "ldr s6, [x28], #0x4\n"
+ "ldr s7, [x27], #0x4\n"
+ "ldr s8, [x26], #0x4\n"
+ "ldr s9, [x25], #0x4\n"
+ "ldr s13, [x24], #0x4\n"
+ "ldr s11, [x23], #0x4\n"
+ "ldr s12, [x22], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.h }[2], [x9]\n"
+ "ld1 { v6.h }[2], [x28]\n"
+ "ld1 { v7.h }[2], [x27]\n"
+ "ld1 { v8.h }[2], [x26]\n"
+ "ld1 { v9.h }[2], [x25]\n"
+ "ld1 { v13.h }[2], [x24]\n"
+ "ld1 { v11.h }[2], [x23]\n"
+ "ld1 { v12.h }[2], [x22]\n"
+ "ld1 { v10.h }[2], [x21]\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h5, [x9, #0x0]\n"
+ "ldr h6, [x28, #0x0]\n"
+ "ldr h7, [x27, #0x0]\n"
+ "ldr h8, [x26, #0x0]\n"
+ "ldr h9, [x25, #0x0]\n"
+ "ldr h13, [x24, #0x0]\n"
+ "ldr h11, [x23, #0x0]\n"
+ "ldr h12, [x22, #0x0]\n"
+ "ldr h10, [x21, #0x0]\n"
+ "ldr h14, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: End
+ "mov v28.16b, v25.16b\n fmla v28.8h, v0.8h, v5.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v6.8h\n"
+ "add x20, x7, x15\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v6.8h\n"
+ "fmla v29.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v5.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v5.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s5, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v5.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h5, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x7, x13\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v5.8h\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v6.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v6.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s6, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v6.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h6, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "add x20, x4, x11\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v29.8h, v4.8h, v9.8h\n"
+ "fmla v30.8h, v4.8h, v6.8h\n"
+ "add x20, x17, x2\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v7.8h\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.8h, v0.8h, v8.8h\n"
+ "fmla v30.8h, v0.8h, v14.8h\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.8h, v0.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v8.8h\n"
+ "add x20, x17, x6\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "add x20, x17, x15\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v5.8h\n"
+ "add x20, x17, x13\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v3.8h, v9.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v6.8h\n"
+ "add x20, x17, x11\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d8, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v8.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v8.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s8, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v8.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h8, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v0.8h, v14.8h\n"
+ "add x20, x16, XZR\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v5.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v5.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s5, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v5.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h5, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v30.8h, v0.8h, v5.8h\n"
+ "add x20, x16, x2\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v6.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v6.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s6, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v6.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h6, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.8h, v0.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "add x20, x16, x6\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "add x20, x16, x15\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v2.8h, v10.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v9.8h\n"
+ "add x20, x16, x13\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 58f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 60f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 60f\n"
+ "58:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "add x20, x16, x11\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 62f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 64f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 64f\n"
+ "62:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.8h, v4.8h, v14.8h\n"
+ "fmla v28.8h, v0.8h, v5.8h\n"
+ "add x20, x14, XZR\n"
+ "fmla v29.8h, v0.8h, v6.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 66f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 68f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 68f\n"
+ "66:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
+ "fmla v30.8h, v0.8h, v9.8h\n"
+ "add x20, x14, x2\n"
+ "tbz %x[n_channels], #2, 70f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 72f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 72f\n"
+ "70:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 71f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "72:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.8h, v0.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v6.8h\n"
+ "add x20, x14, x6\n"
+ "fmla v29.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 74f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 73f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v5.h }[6], [x20]\n"
+ "b 76f\n"
+ "73:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v5.h }[4], [x20]\n"
+ "b 76f\n"
+ "74:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 75f\n"
+ "ldr s5, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v5.h }[2], [x20]\n"
+ "b 76f\n"
+ "75:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h5, [x20, #0x0]\n"
+ "76:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v10.8h\n"
+ "add x20, x14, x15\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 78f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 77f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v6.h }[6], [x20]\n"
+ "b 80f\n"
+ "77:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v6.h }[4], [x20]\n"
+ "b 80f\n"
+ "78:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 79f\n"
+ "ldr s6, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v6.h }[2], [x20]\n"
+ "b 80f\n"
+ "79:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h6, [x20, #0x0]\n"
+ "80:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.8h, v2.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x14, x13\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 82f\n"
+ "ldr d8, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 81f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v8.h }[6], [x20]\n"
+ "b 84f\n"
+ "81:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v8.h }[4], [x20]\n"
+ "b 84f\n"
+ "82:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 83f\n"
+ "ldr s8, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v8.h }[2], [x20]\n"
+ "b 84f\n"
+ "83:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h8, [x20, #0x0]\n"
+ "84:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.8h, v3.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "add x20, x14, x11\n"
+ "fmla v29.8h, v4.8h, v14.8h\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 86f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 85f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 88f\n"
+ "85:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 88f\n"
+ "86:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 87f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 88f\n"
+ "87:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "88:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v9.8h\n"
+ "add x20, x12, XZR\n"
+ "fmla v29.8h, v0.8h, v13.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 90f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 89f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 92f\n"
+ "89:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 92f\n"
+ "90:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 91f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 92f\n"
+ "91:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "92:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: End
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "add x20, x12, x2\n"
+ "tbz %x[n_channels], #2, 94f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 93f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 96f\n"
+ "93:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 96f\n"
+ "94:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 95f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 96f\n"
+ "95:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "96:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.8h, v0.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "add x20, x12, x6\n"
+ "fmla v29.8h, v1.8h, v5.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 98f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 97f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 100f\n"
+ "97:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 100f\n"
+ "98:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 99f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 100f\n"
+ "99:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "100:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "add x20, x12, x15\n"
+ "fmla v29.8h, v2.8h, v6.8h\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 102f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 101f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 104f\n"
+ "101:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 104f\n"
+ "102:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 103f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 104f\n"
+ "103:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "104:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "add x20, x12, x13\n"
+ "fmla v29.8h, v3.8h, v8.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 106f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 105f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 108f\n"
+ "105:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 108f\n"
+ "106:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 107f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 108f\n"
+ "107:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "108:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "add x20, x12, x11\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 110f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 109f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 112f\n"
+ "109:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 112f\n"
+ "110:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 111f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 112f\n"
+ "111:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "112:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: End
+ "fmla v31.8h, v4.8h, v9.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "tbz %x[n_channels], #2, 114f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.d }[0], [x21], x3\n"
+ "st1 { v30.d }[0], [x20], x3\n"
+ "add x5, x5, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 113f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[2], [x21], x3\n"
+ "st1 { v30.s }[2], [x20], x3\n"
+ "add x5, x5, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 116f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[6], [x21], x3\n"
+ "st1 { v30.h }[6], [x20], x3\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 116f\n"
+ "113:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 116f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[4], [x21], x3\n"
+ "st1 { v30.h }[4], [x20], x3\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 116f\n"
+ "114:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 115f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[0], [x21], x3\n"
+ "st1 { v30.s }[0], [x20], x3\n"
+ "add x5, x5, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 116f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[2], [x21], x3\n"
+ "st1 { v30.h }[2], [x20], x3\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 116f\n"
+ "115:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[0], [x21], x3\n"
+ "st1 { v30.h }[0], [x20], x3\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "116:" // Tile loop: Oddments: Store: Bit 2: End
+ "117:" // Tile loop: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..4913340c4c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1427 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
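+  // Args packs everything the assembly consumes behind a single pointer: the
+  // output pointers, the packed parameters, the activation clamps, and the 36
+  // input-patch pointers (a 6x6 receptive field feeds a 5x5 kernel producing
+  // 2x2 outputs at stride 1).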
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
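+      // The first 14 entries are permuted rather than copied in order (the
+      // assembly primes its registers from them with paired ldp loads); the
+      // remaining 22 are copied straight through.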
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
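+  // Structure of the assembly: the main loop handles eight fp16 channels per
+  // iteration (n_channels >> 3 blocks); the "Oddments" section mops up
+  // n_channels % 8 by testing bits 2, 1 and 0 and loading/storing four-, two-
+  // and one-halfword fragments.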
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x17, #0x10\n" // cntb _, ALL, #1
+ "lsr x9, %x[n_channels], #0x3\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "ldp x12, x11, [x21, #0x10]\n"
+ "mov x10, #0x0\n"
+ "sub x28, XZR, x17\n"
+ "cbz x9, 3f\n"
+ "ldr q26, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x17, x9, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x16, x16, #0x60\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q5, [x21, x10]\n"
+ "ldr q6, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x10]\n"
+ "ldr q8, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q9, [x21, x10]\n"
+ "ldr q13, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr q11, [x21, x10]\n"
+ "ldr q12, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x10]\n"
+ "ldr q14, [x20, x10]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v5.8h\n"
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v6.8h\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q24, [x20, x10]\n"
+ "mov v28.16b, v26.16b\n fmla v28.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q23, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x140]\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr q22, [x20, x10]\n"
+ "fmla v28.8h, v1.8h, v8.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "ldr q21, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q17, [x20, x10]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v24.8h\n"
+ "ldr q16, [x16, #0x20]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "ldr q5, [x20, x10]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v3.8h, v22.8h\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x21, [x15, #0x80]\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "ldr q19, [x22, x10]\n"
+ "fmla v31.8h, v4.8h, v17.8h\n"
+ "ldr q2, [x20, x10]\n"
+ "fmla v28.8h, v4.8h, v22.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "ldr q18, [x16, #0x40]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v30.8h, v23.8h, v7.8h\n"
+ "fmla v31.8h, v23.8h, v8.8h\n"
+ "ldr x23, [x15, #0x90]\n"
+ "ldr x26, [x15, #0x98]\n"
+ "fmla v28.8h, v23.8h, v14.8h\n"
+ "fmla v29.8h, v23.8h, v5.8h\n"
+ "ldr q1, [x16, #0x50]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla v30.8h, v21.8h, v8.8h\n"
+ "ldr q25, [x20, x10]\n"
+ "fmla v31.8h, v21.8h, v13.8h\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v28.8h, v21.8h, v5.8h\n"
+ "fmla v29.8h, v21.8h, v19.8h\n"
+ "ldr q17, [x16, #0x60]\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.8h, v16.8h, v13.8h\n"
+ "ldr q8, [x21, x10]\n"
+ "fmla v31.8h, v16.8h, v24.8h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v28.8h, v16.8h, v19.8h\n"
+ "fmla v29.8h, v16.8h, v2.8h\n"
+ "ldr q16, [x16, #0x70]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v30.8h, v20.8h, v24.8h\n"
+ "ldr q24, [x23, x10]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ldr x27, [x15, #0xc8]\n"
+ "fmla v28.8h, v20.8h, v2.8h\n"
+ "fmla v29.8h, v20.8h, v8.8h\n"
+ "ldr q23, [x16, #0x80]\n"
+ "ldr x23, [x15, #0xd0]\n"
+ "fmla v30.8h, v18.8h, v22.8h\n"
+ "ldr q22, [x26, x10]\n"
+ "fmla v31.8h, v18.8h, v10.8h\n"
+ "ldr q21, [x22, x10]\n"
+ "fmla v28.8h, v18.8h, v8.8h\n"
+ "fmla v29.8h, v18.8h, v25.8h\n"
+ "ldr q20, [x16, #0x90]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v30.8h, v1.8h, v14.8h\n"
+ "ldr q0, [x20, x10]\n"
+ "fmla v31.8h, v1.8h, v5.8h\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v28.8h, v1.8h, v24.8h\n"
+ "fmla v29.8h, v1.8h, v22.8h\n"
+ "ldr q6, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v30.8h, v17.8h, v5.8h\n"
+ "ldr q1, [x25, x10]\n"
+ "fmla v31.8h, v17.8h, v19.8h\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v28.8h, v17.8h, v22.8h\n"
+ "fmla v29.8h, v17.8h, v21.8h\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.8h, v16.8h, v19.8h\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v31.8h, v16.8h, v2.8h\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v28.8h, v16.8h, v21.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v30.8h, v23.8h, v2.8h\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.8h, v23.8h, v8.8h\n"
+ "ldr x21, [x15, #0x100]\n"
+ "fmla v28.8h, v23.8h, v1.8h\n"
+ "fmla v29.8h, v23.8h, v19.8h\n"
+ "ldr q13, [x16, #0xd0]\n"
+ "fmla v30.8h, v20.8h, v8.8h\n"
+ "ldr q2, [x27, x10]\n"
+ "fmla v31.8h, v20.8h, v25.8h\n"
+ "ldr q10, [x20, x10]\n"
+ "fmla v28.8h, v20.8h, v19.8h\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "ldr q9, [x16, #0xe0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v30.8h, v6.8h, v24.8h\n"
+ "ldr q5, [x23, x10]\n"
+ "fmla v31.8h, v6.8h, v22.8h\n"
+ "ldr x23, [x15, #0x110]\n"
+ "fmla v28.8h, v6.8h, v16.8h\n"
+ "fmla v29.8h, v6.8h, v2.8h\n"
+ "ldr q24, [x16, #0xf0]\n"
+ "fmla v30.8h, v18.8h, v22.8h\n"
+ "ldr q25, [x22, x10]\n"
+ "fmla v31.8h, v18.8h, v21.8h\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v28.8h, v18.8h, v2.8h\n"
+ "fmla v29.8h, v18.8h, v5.8h\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v30.8h, v17.8h, v21.8h\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v31.8h, v17.8h, v1.8h\n"
+ "fmla v28.8h, v17.8h, v5.8h\n"
+ "fmla v29.8h, v17.8h, v25.8h\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v30.8h, v13.8h, v1.8h\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v31.8h, v13.8h, v19.8h\n"
+ "fmla v28.8h, v13.8h, v25.8h\n"
+ "fmla v29.8h, v13.8h, v10.8h\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v30.8h, v9.8h, v19.8h\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v31.8h, v9.8h, v0.8h\n"
+ "fmla v28.8h, v9.8h, v10.8h\n"
+ "fmla v29.8h, v9.8h, v22.8h\n"
+ "ldr q19, [x16, #0x130]\n"
+ "fmla v30.8h, v24.8h, v16.8h\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.8h, v24.8h, v2.8h\n"
+ "fmla v28.8h, v24.8h, v18.8h\n"
+ "ldr q18, [x20, x10]\n"
+ "fmla v29.8h, v24.8h, v17.8h\n"
+ "ldr q0, [x16, #0x150]\n"
+ "fmla v30.8h, v23.8h, v2.8h\n"
+ "fmla v31.8h, v23.8h, v5.8h\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "fmla v28.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x23, x10]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
+ "ldr q1, [x16, #0x160]\n"
+ "fmla v30.8h, v21.8h, v5.8h\n"
+ "ldr q5, [x21, x17]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "fmla v28.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x22, x10]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
+ "ldr q2, [x16, #0x170]\n"
+ "fmla v30.8h, v20.8h, v25.8h\n"
+ "ldr q6, [x20, x17]\n"
+ "fmla v31.8h, v20.8h, v10.8h\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x17]\n"
+ "fmla v28.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
+ "ldr q3, [x16, #0x180]\n"
+ "fmla v30.8h, v19.8h, v10.8h\n"
+ "ldr q8, [x20, x17]\n"
+ "fmla v31.8h, v19.8h, v22.8h\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x17]\n"
+ "fmla v28.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "ldr q9, [x21, x17]\n"
+ "ldr q4, [x16, #0x190]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "ldr q11, [x21, x17]\n"
+ "ldr q12, [x20, x17]\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x17]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "ldr q14, [x20, x17]\n"
+ "add x17, x17, #0x10\n"
+ "cmp x17, x9, LSL #4\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "add x10, x10, #0x10\n"
+ "str q30, [x14, x28]\n"
+ "add x16, x16, #0x1a0\n"
+ "str q31, [x13, x28]\n"
+ "str q28, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "mov v5.16b, v26.16b\n fmla v5.8h, v0.8h, v6.8h\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q22, [x20, x10]\n"
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x20, x10]\n"
+ "fmla v5.8h, v1.8h, v9.8h\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "ldr q18, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v5.8h, v2.8h, v11.8h\n"
+ "ldr x23, [x15, #0x70]\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v22.8h\n"
+ "ldr q17, [x16, #0x20]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x20, x10]\n"
+ "fmla v5.8h, v3.8h, v12.8h\n"
+ "ldr x22, [x15, #0x80]\n"
+ "fmla v30.8h, v3.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v21.8h\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x23, x10]\n"
+ "fmla v5.8h, v4.8h, v16.8h\n"
+ "ldr q28, [x21, x10]\n"
+ "fmla v30.8h, v4.8h, v21.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "ldr q16, [x16, #0x40]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "fmla v31.8h, v19.8h, v7.8h\n"
+ "fmla v5.8h, v19.8h, v8.8h\n"
+ "ldr x27, [x15, #0x98]\n"
+ "ldr x26, [x15, #0xa0]\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "fmla v29.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x16, #0x50]\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v31.8h, v18.8h, v8.8h\n"
+ "ldr q1, [x20, x10]\n"
+ "fmla v5.8h, v18.8h, v13.8h\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "fmla v29.8h, v18.8h, v2.8h\n"
+ "ldr q18, [x16, #0x60]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.8h, v17.8h, v13.8h\n"
+ "ldr q26, [x22, x10]\n"
+ "fmla v5.8h, v17.8h, v22.8h\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "fmla v29.8h, v17.8h, v28.8h\n"
+ "ldr q17, [x16, #0x70]\n"
+ "ldr x22, [x15, #0xc8]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ldr q25, [x21, x10]\n"
+ "fmla v5.8h, v20.8h, v21.8h\n"
+ "ldr x21, [x15, #0xd0]\n"
+ "fmla v30.8h, v20.8h, v28.8h\n"
+ "fmla v29.8h, v20.8h, v26.8h\n"
+ "ldr q24, [x16, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "ldr q23, [x27, x10]\n"
+ "fmla v5.8h, v16.8h, v10.8h\n"
+ "ldr q0, [x26, x10]\n"
+ "fmla v30.8h, v16.8h, v26.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q22, [x16, #0x90]\n"
+ "ldr x27, [x15, #0xd8]\n"
+ "fmla v31.8h, v19.8h, v14.8h\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v5.8h, v19.8h, v6.8h\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v30.8h, v19.8h, v25.8h\n"
+ "fmla v29.8h, v19.8h, v23.8h\n"
+ "ldr q21, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v31.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x25, x10]\n"
+ "fmla v5.8h, v18.8h, v2.8h\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "fmla v29.8h, v18.8h, v0.8h\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "fmla v31.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v5.8h, v17.8h, v28.8h\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v30.8h, v17.8h, v0.8h\n"
+ "fmla v29.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v31.8h, v24.8h, v28.8h\n"
+ "ldr q7, [x23, x10]\n"
+ "fmla v5.8h, v24.8h, v26.8h\n"
+ "ldr x23, [x15, #0x100]\n"
+ "fmla v30.8h, v24.8h, v20.8h\n"
+ "fmla v29.8h, v24.8h, v19.8h\n"
+ "ldr q3, [x16, #0xd0]\n"
+ "fmla v31.8h, v22.8h, v26.8h\n"
+ "ldr q28, [x22, x10]\n"
+ "fmla v5.8h, v22.8h, v1.8h\n"
+ "ldr q13, [x20, x10]\n"
+ "fmla v30.8h, v22.8h, v19.8h\n"
+ "fmla v29.8h, v22.8h, v16.8h\n"
+ "ldr q11, [x16, #0xe0]\n"
+ "ldr x22, [x15, #0x108]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "ldr q26, [x21, x10]\n"
+ "fmla v5.8h, v21.8h, v23.8h\n"
+ "ldr x21, [x15, #0x110]\n"
+ "fmla v30.8h, v21.8h, v7.8h\n"
+ "fmla v29.8h, v21.8h, v28.8h\n"
+ "ldr q25, [x16, #0xf0]\n"
+ "fmla v31.8h, v18.8h, v23.8h\n"
+ "ldr q24, [x27, x10]\n"
+ "fmla v5.8h, v18.8h, v0.8h\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v30.8h, v18.8h, v28.8h\n"
+ "fmla v29.8h, v18.8h, v26.8h\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v5.8h, v17.8h, v20.8h\n"
+ "fmla v30.8h, v17.8h, v26.8h\n"
+ "fmla v29.8h, v17.8h, v24.8h\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v31.8h, v3.8h, v20.8h\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v5.8h, v3.8h, v19.8h\n"
+ "fmla v30.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v31.8h, v11.8h, v19.8h\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v5.8h, v11.8h, v16.8h\n"
+ "fmla v30.8h, v11.8h, v13.8h\n"
+ "fmla v29.8h, v11.8h, v22.8h\n"
+ "ldr q19, [x16, #0x130]\n"
+ "add x16, x16, #0x140\n"
+ "fmla v31.8h, v25.8h, v7.8h\n"
+ "ldr q16, [x23, x10]\n"
+ "fmla v5.8h, v25.8h, v28.8h\n"
+ "fmla v30.8h, v25.8h, v18.8h\n"
+ "ldr q18, [x22, x10]\n"
+ "fmla v29.8h, v25.8h, v17.8h\n"
+ "fmla v31.8h, v23.8h, v28.8h\n"
+ "fmla v5.8h, v23.8h, v26.8h\n"
+ "fmla v30.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x21, x10]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v5.8h, v21.8h, v24.8h\n"
+ "fmla v30.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmla v31.8h, v20.8h, v24.8h\n"
+ "fmla v5.8h, v20.8h, v13.8h\n"
+ "fmla v30.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
+ "fmla v31.8h, v19.8h, v13.8h\n"
+ "fmla v5.8h, v19.8h, v22.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmla v30.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "fmax v5.8h, v5.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v5.8h, v5.8h, v15.8h\n"
+ "str q31, [x14, x28]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q5, [x13, x28]\n"
+ "str q30, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 116f\n"
+ "ldr q26, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x20, x10\n"
+ "add x14, x14, x20\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x11, x11, x20\n"
+ "ldr x9, [x15, #0x0]\n"
+ "ldr x28, [x15, #0x8]\n"
+ "add x9, x9, x10\n"
+ "add x28, x28, x10\n"
+ "ldr x27, [x15, #0x10]\n"
+ "ldr x26, [x15, #0x18]\n"
+ "add x27, x27, x10\n"
+ "add x26, x26, x10\n"
+ "ldr x25, [x15, #0x20]\n"
+ "ldr x24, [x15, #0x28]\n"
+ "add x25, x25, x10\n"
+ "add x24, x24, x10\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "add x23, x23, x10\n"
+ "add x22, x22, x10\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "add x21, x21, x10\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x60\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v5.d }[0], [x9], #0x8\n"
+ "ld1 { v6.d }[0], [x28], #0x8\n"
+ "ld1 { v7.d }[0], [x27], #0x8\n"
+ "ld1 { v8.d }[0], [x26], #0x8\n"
+ "ld1 { v9.d }[0], [x25], #0x8\n"
+ "ld1 { v13.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v5.s }[2], [x9], #0x4\n"
+ "ld1 { v6.s }[2], [x28], #0x4\n"
+ "ld1 { v7.s }[2], [x27], #0x4\n"
+ "ld1 { v8.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.h }[6], [x9], #0x2\n"
+ "ld1 { v6.h }[6], [x28], #0x2\n"
+ "ld1 { v7.h }[6], [x27], #0x2\n"
+ "ld1 { v8.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x25], #0x2\n"
+ "ld1 { v13.h }[6], [x24], #0x2\n"
+ "ld1 { v11.h }[6], [x23], #0x2\n"
+ "ld1 { v12.h }[6], [x22], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.h }[4], [x9], #0x2\n"
+ "ld1 { v6.h }[4], [x28], #0x2\n"
+ "ld1 { v7.h }[4], [x27], #0x2\n"
+ "ld1 { v8.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x25], #0x2\n"
+ "ld1 { v13.h }[4], [x24], #0x2\n"
+ "ld1 { v11.h }[4], [x23], #0x2\n"
+ "ld1 { v12.h }[4], [x22], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v5.s }[0], [x9], #0x4\n"
+ "ld1 { v6.s }[0], [x28], #0x4\n"
+ "ld1 { v7.s }[0], [x27], #0x4\n"
+ "ld1 { v8.s }[0], [x26], #0x4\n"
+ "ld1 { v9.s }[0], [x25], #0x4\n"
+ "ld1 { v13.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.h }[2], [x9], #0x2\n"
+ "ld1 { v6.h }[2], [x28], #0x2\n"
+ "ld1 { v7.h }[2], [x27], #0x2\n"
+ "ld1 { v8.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x25], #0x2\n"
+ "ld1 { v13.h }[2], [x24], #0x2\n"
+ "ld1 { v11.h }[2], [x23], #0x2\n"
+ "ld1 { v12.h }[2], [x22], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v5.h }[0], [x9], #0x2\n"
+ "ld1 { v6.h }[0], [x28], #0x2\n"
+ "ld1 { v7.h }[0], [x27], #0x2\n"
+ "ld1 { v8.h }[0], [x26], #0x2\n"
+ "ld1 { v9.h }[0], [x25], #0x2\n"
+ "ld1 { v13.h }[0], [x24], #0x2\n"
+ "ld1 { v11.h }[0], [x23], #0x2\n"
+ "ld1 { v12.h }[0], [x22], #0x2\n"
+ "ld1 { v10.h }[0], [x21], #0x2\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: End
+ "mov v28.16b, v26.16b\n fmla v28.8h, v0.8h, v5.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v6.8h\n"
+ "ldr x20, [x15, #0x50]\n"
+ "add x20, x20, x10\n"
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v6.8h\n"
+ "fmla v29.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v5.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v5.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v5.8h\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v6.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v6.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (1, 4): Bit 2: End
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (0, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (0, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (0, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (0, 5): Bit 2: End
+ "ldr q0, [x16, #0x0]\n"
+ "fmla v29.8h, v4.8h, v9.8h\n"
+ "fmla v30.8h, v4.8h, v6.8h\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v7.8h\n"
+ "add x20, x20, x10\n"
+ "fmla v29.8h, v0.8h, v8.8h\n"
+ "fmla v30.8h, v0.8h, v14.8h\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (2, 1): Bit 2: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "fmla v31.8h, v0.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v8.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (2, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (2, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (2, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (2, 2): Bit 2: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (2, 3): Bit 2: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v5.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v3.8h, v9.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (2, 4): Bit 2: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v6.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v8.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (2, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v8.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (2, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (2, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (2, 5): Bit 2: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x90]\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v0.8h, v14.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v5.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v5.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (3, 0): Bit 2: End
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v30.8h, v0.8h, v5.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v6.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v6.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla v31.8h, v0.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (3, 2): Bit 2: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v2.8h, v10.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "55:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v9.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 57f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 59f\n"
+ "56:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 59f\n"
+ "57:" // Oddments: Load input (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "59:" // Oddments: Load input (3, 4): Bit 2: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 61f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 63f\n"
+ "60:" // Oddments: Load input (3, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 63f\n"
+ "61:" // Oddments: Load input (3, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (3, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "63:" // Oddments: Load input (3, 5): Bit 2: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v31.8h, v4.8h, v14.8h\n"
+ "fmla v28.8h, v0.8h, v5.8h\n"
+ "fmla v29.8h, v0.8h, v6.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 65f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 67f\n"
+ "64:" // Oddments: Load input (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 67f\n"
+ "65:" // Oddments: Load input (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "67:" // Oddments: Load input (4, 0): Bit 2: End
+ "ldr x20, [x15, #0xc8]\n"
+ "fmla v30.8h, v0.8h, v9.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 69f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 71f\n"
+ "68:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 71f\n"
+ "69:" // Oddments: Load input (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 70f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 71f\n"
+ "70:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "71:" // Oddments: Load input (4, 1): Bit 2: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xd0]\n"
+ "fmla v31.8h, v0.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v6.8h\n"
+ "fmla v29.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 73f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 72f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v5.h }[6], [x20], #0x2\n"
+ "b 75f\n"
+ "72:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v5.h }[4], [x20], #0x2\n"
+ "b 75f\n"
+ "73:" // Oddments: Load input (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 74f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "b 75f\n"
+ "74:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "75:" // Oddments: Load input (4, 2): Bit 2: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "fmla v31.8h, v1.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 77f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 76f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v6.h }[6], [x20], #0x2\n"
+ "b 79f\n"
+ "76:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v6.h }[4], [x20], #0x2\n"
+ "b 79f\n"
+ "77:" // Oddments: Load input (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 78f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "b 79f\n"
+ "78:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "79:" // Oddments: Load input (4, 3): Bit 2: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v31.8h, v2.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 81f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 80f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v8.h }[6], [x20], #0x2\n"
+ "b 83f\n"
+ "80:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v8.h }[4], [x20], #0x2\n"
+ "b 83f\n"
+ "81:" // Oddments: Load input (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 82f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "b 83f\n"
+ "82:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "83:" // Oddments: Load input (4, 4): Bit 2: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xe8]\n"
+ "fmla v31.8h, v3.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v14.8h\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 85f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 84f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 87f\n"
+ "84:" // Oddments: Load input (4, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 87f\n"
+ "85:" // Oddments: Load input (4, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 86f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 87f\n"
+ "86:" // Oddments: Load input (4, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "87:" // Oddments: Load input (4, 5): Bit 2: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xf0]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v13.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 89f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 88f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 91f\n"
+ "88:" // Oddments: Load input (5, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 91f\n"
+ "89:" // Oddments: Load input (5, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 90f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 91f\n"
+ "90:" // Oddments: Load input (5, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "91:" // Oddments: Load input (5, 0): Bit 2: End
+ "ldr x20, [x15, #0xf8]\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 93f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 92f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 95f\n"
+ "92:" // Oddments: Load input (5, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 95f\n"
+ "93:" // Oddments: Load input (5, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 94f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 95f\n"
+ "94:" // Oddments: Load input (5, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "95:" // Oddments: Load input (5, 1): Bit 2: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x100]\n"
+ "fmla v31.8h, v0.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v5.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 97f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 96f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 99f\n"
+ "96:" // Oddments: Load input (5, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 99f\n"
+ "97:" // Oddments: Load input (5, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 98f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 99f\n"
+ "98:" // Oddments: Load input (5, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "99:" // Oddments: Load input (5, 2): Bit 2: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v6.8h\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 101f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 100f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 103f\n"
+ "100:" // Oddments: Load input (5, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 103f\n"
+ "101:" // Oddments: Load input (5, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 102f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 103f\n"
+ "102:" // Oddments: Load input (5, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "103:" // Oddments: Load input (5, 3): Bit 2: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x110]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v8.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 105f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 104f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 107f\n"
+ "104:" // Oddments: Load input (5, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 107f\n"
+ "105:" // Oddments: Load input (5, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 106f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 107f\n"
+ "106:" // Oddments: Load input (5, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "107:" // Oddments: Load input (5, 4): Bit 2: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 109f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 108f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 111f\n"
+ "108:" // Oddments: Load input (5, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 111f\n"
+ "109:" // Oddments: Load input (5, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 110f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 111f\n"
+ "110:" // Oddments: Load input (5, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "111:" // Oddments: Load input (5, 5): Bit 2: End
+ "fmla v31.8h, v4.8h, v9.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "tbz %x[n_channels], #2, 113f\n"
+ "st1 { v28.d }[0], [x14], #0x8\n"
+ "st1 { v29.d }[0], [x13], #0x8\n"
+ "st1 { v30.d }[0], [x12], #0x8\n"
+ "st1 { v31.d }[0], [x11], #0x8\n"
+ "tbz %x[n_channels], #1, 112f\n"
+ "st1 { v28.s }[2], [x14], #0x4\n"
+ "st1 { v29.s }[2], [x13], #0x4\n"
+ "st1 { v30.s }[2], [x12], #0x4\n"
+ "st1 { v31.s }[2], [x11], #0x4\n"
+ "tbz %x[n_channels], #0, 115f\n"
+ "st1 { v28.h }[6], [x14], #0x2\n"
+ "st1 { v29.h }[6], [x13], #0x2\n"
+ "st1 { v30.h }[6], [x12], #0x2\n"
+ "st1 { v31.h }[6], [x11], #0x2\n"
+ "b 115f\n"
+ "112:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 115f\n"
+ "st1 { v28.h }[4], [x14], #0x2\n"
+ "st1 { v29.h }[4], [x13], #0x2\n"
+ "st1 { v30.h }[4], [x12], #0x2\n"
+ "st1 { v31.h }[4], [x11], #0x2\n"
+ "b 115f\n"
+ "113:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 114f\n"
+ "st1 { v28.s }[0], [x14], #0x4\n"
+ "st1 { v29.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v31.s }[0], [x11], #0x4\n"
+ "tbz %x[n_channels], #0, 115f\n"
+ "st1 { v28.h }[2], [x14], #0x2\n"
+ "st1 { v29.h }[2], [x13], #0x2\n"
+ "st1 { v30.h }[2], [x12], #0x2\n"
+ "st1 { v31.h }[2], [x11], #0x2\n"
+ "b 115f\n"
+ "114:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "st1 { v28.h }[0], [x14], #0x2\n"
+ "st1 { v29.h }[0], [x13], #0x2\n"
+ "st1 { v30.h }[0], [x12], #0x2\n"
+ "st1 { v31.h }[0], [x11], #0x2\n"
+ "115:" // Oddments: Store: Bit 2: End
+ "116:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..b7608af721
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
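+// Thin strategy wrapper: reports nine output points per call and
+// VLType::None (fixed-width NEON rather than SVE), and exposes the raw
+// implementation above through get_kernel().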
+class a64_fp16_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ KernelType kernel = a64_fp16_nhwc_generic_output9_mla_depthfirst_impl;
+
+ public:
+ a64_fp16_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<__fp16, __fp16, __fp16, __fp16>(9, arm_gemm::VLType::None) {}
+
+ KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..08f40b785f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ const void *bias,
+ const unsigned int n_points,
+ const unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ const __fp16 minmax_vals[2] = { activation_min, activation_max };
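+  // The assembly broadcasts these two halfwords with ld1r: v2 receives the
+  // minimum and v1 the maximum, so the activation clamp applies lane-wise
+  // across all eight channels.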
+
+ __asm__ __volatile__(
+ "ld1r { v2.8h }, [%x[minmax_vals]]\n"
+ "lsr x9, %x[n_channels], #0x3\n"
+ "add x20, %x[minmax_vals], #0x2\n"
+ "ld1r { v1.8h }, [x20]\n"
+ "mov x11, #0x0\n"
+ "cbz x9, 5f\n"
+ "1:" // Channel loop
+ "movi v23.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q23, [%x[bias], x11]\n"
+ "2:" // Channel loop: Load bias: Done
+ "ldr q0, [%x[params], #0x0]\n"
+ "mov x26, %x[inptrs]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "subs x25, %x[n_points], #0x1\n"
+ "ldr q14, [x21, x11]\n"
+ "ldr q15, [x20, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q16, [x21, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr q17, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr q18, [x21, x11]\n"
+ "ldr q19, [x20, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q20, [x21, x11]\n"
+ "add %x[params], %x[params], #0x10\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x20, x24, [x26], #0x10\n"
+ "ldp x23, x22, [x26], #0x10\n"
+ "subs x25, x25, #0x1\n"
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "ldr q14, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "ldr q15, [x24, x11]\n"
+ "ldr q16, [x23, x11]\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "ldr q17, [x22, x11]\n"
+ "ldr q18, [x21, x11]\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "ldr q19, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "ldr q20, [x21, x11]\n"
+ "add %x[params], %x[params], #0x10\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "fmax v23.8h, v23.8h, v2.8h\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v2.8h\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "fmax v25.8h, v25.8h, v2.8h\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "fmax v26.8h, v26.8h, v2.8h\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v2.8h\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmax v28.8h, v28.8h, v2.8h\n"
+ "fmax v29.8h, v29.8h, v2.8h\n"
+ "fmax v30.8h, v30.8h, v2.8h\n"
+ "fmax v31.8h, v31.8h, v2.8h\n"
+ "fmin v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v1.8h\n"
+ "str q23, [x28, x11]\n"
+ "fmin v25.8h, v25.8h, v1.8h\n"
+ "fmin v26.8h, v26.8h, v1.8h\n"
+ "str q24, [x27, x11]\n"
+ "fmin v27.8h, v27.8h, v1.8h\n"
+ "fmin v28.8h, v28.8h, v1.8h\n"
+ "str q25, [x26, x11]\n"
+ "fmin v29.8h, v29.8h, v1.8h\n"
+ "fmin v30.8h, v30.8h, v1.8h\n"
+ "str q26, [x25, x11]\n"
+ "fmin v31.8h, v31.8h, v1.8h\n"
+ "str q27, [x24, x11]\n"
+ "str q28, [x23, x11]\n"
+ "str q29, [x22, x11]\n"
+ "str q30, [x21, x11]\n"
+ "str q31, [x20, x11]\n"
+ "add x11, x11, #0x10\n"
+ "cmp x11, x9, LSL #4\n"
+ "blt 1b\n"
+ "5:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 25f\n"
+ "movi v23.16b, #0x0\n"
+ "cbz %x[bias], 10f\n"
+ "add x20, %x[bias], x11\n"
+ "tbz %x[n_channels], #2, 7f\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v23.h }[6], [x20], #0x2\n"
+ "b 9f\n"
+ "6:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v23.h }[4], [x20], #0x2\n"
+ "b 9f\n"
+ "7:" // Oddments: Load bias: Bit 2: Unset
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "9:" // Oddments: Load bias: Bit 2: End
+ "10:" // Oddments: Load bias: Done
+ "ldr q0, [%x[params], #0x0]\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "add %x[params], %x[params], #0x10\n"
+ "tbz %x[n_channels], #2, 12f\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v15.h }[6], [x28], #0x2\n"
+ "ld1 { v16.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x26], #0x2\n"
+ "ld1 { v18.h }[6], [x25], #0x2\n"
+ "ld1 { v19.h }[6], [x24], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "ld1 { v21.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
+ "b 14f\n"
+ "11:" // Oddments: Load: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v15.h }[4], [x28], #0x2\n"
+ "ld1 { v16.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x26], #0x2\n"
+ "ld1 { v18.h }[4], [x25], #0x2\n"
+ "ld1 { v19.h }[4], [x24], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "ld1 { v21.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
+ "b 14f\n"
+ "12:" // Oddments: Load: Bit 2: Unset
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v15.h }[2], [x28], #0x2\n"
+ "ld1 { v16.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x26], #0x2\n"
+ "ld1 { v18.h }[2], [x25], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
+ "b 14f\n"
+ "13:" // Oddments: Load: Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "14:" // Oddments: Load: Bit 2: End
+ "subs x20, %x[n_points], #0x1\n"
+ "ble 20f\n"
+ "15:" // Oddments: Planar loop
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "ldr x21, [x10], #0x8\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "add x9, x9, x11\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "add x28, x28, x11\n"
+ "add x27, x27, x11\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "add %x[params], %x[params], #0x10\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v15.h }[6], [x28], #0x2\n"
+ "ld1 { v16.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x26], #0x2\n"
+ "ld1 { v18.h }[6], [x25], #0x2\n"
+ "ld1 { v19.h }[6], [x24], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "ld1 { v21.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Planar loop: Load: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v15.h }[4], [x28], #0x2\n"
+ "ld1 { v16.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x26], #0x2\n"
+ "ld1 { v18.h }[4], [x25], #0x2\n"
+ "ld1 { v19.h }[4], [x24], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "ld1 { v21.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Planar loop: Load: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v15.h }[2], [x28], #0x2\n"
+ "ld1 { v16.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x26], #0x2\n"
+ "ld1 { v18.h }[2], [x25], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Planar loop: Load: Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "19:" // Oddments: Planar loop: Load: Bit 2: End
+ "subs x20, x20, #0x1\n"
+ "bgt 15b\n"
+ "20:" // Oddments: Planar tail
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "fmax v23.8h, v23.8h, v2.8h\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v2.8h\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "fmax v25.8h, v25.8h, v2.8h\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "fmax v26.8h, v26.8h, v2.8h\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v2.8h\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "fmax v28.8h, v28.8h, v2.8h\n"
+ "fmax v29.8h, v29.8h, v2.8h\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "fmax v30.8h, v30.8h, v2.8h\n"
+ "fmax v31.8h, v31.8h, v2.8h\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "fmin v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v1.8h\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "fmin v25.8h, v25.8h, v1.8h\n"
+ "fmin v26.8h, v26.8h, v1.8h\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "fmin v27.8h, v27.8h, v1.8h\n"
+ "fmin v28.8h, v28.8h, v1.8h\n"
+ "fmin v29.8h, v29.8h, v1.8h\n"
+ "fmin v30.8h, v30.8h, v1.8h\n"
+ "fmin v31.8h, v31.8h, v1.8h\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "st1 { v23.d }[0], [x28], #0x8\n"
+ "st1 { v24.d }[0], [x27], #0x8\n"
+ "st1 { v25.d }[0], [x26], #0x8\n"
+ "st1 { v26.d }[0], [x25], #0x8\n"
+ "st1 { v27.d }[0], [x24], #0x8\n"
+ "st1 { v28.d }[0], [x23], #0x8\n"
+ "st1 { v29.d }[0], [x22], #0x8\n"
+ "st1 { v30.d }[0], [x21], #0x8\n"
+ "st1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "st1 { v23.s }[2], [x28], #0x4\n"
+ "st1 { v24.s }[2], [x27], #0x4\n"
+ "st1 { v25.s }[2], [x26], #0x4\n"
+ "st1 { v26.s }[2], [x25], #0x4\n"
+ "st1 { v27.s }[2], [x24], #0x4\n"
+ "st1 { v28.s }[2], [x23], #0x4\n"
+ "st1 { v29.s }[2], [x22], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
+ "st1 { v31.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "st1 { v23.h }[6], [x28], #0x2\n"
+ "st1 { v24.h }[6], [x27], #0x2\n"
+ "st1 { v25.h }[6], [x26], #0x2\n"
+ "st1 { v26.h }[6], [x25], #0x2\n"
+ "st1 { v27.h }[6], [x24], #0x2\n"
+ "st1 { v28.h }[6], [x23], #0x2\n"
+ "st1 { v29.h }[6], [x22], #0x2\n"
+ "st1 { v30.h }[6], [x21], #0x2\n"
+ "st1 { v31.h }[6], [x20], #0x2\n"
+ "b 24f\n"
+ "21:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "st1 { v23.h }[4], [x28], #0x2\n"
+ "st1 { v24.h }[4], [x27], #0x2\n"
+ "st1 { v25.h }[4], [x26], #0x2\n"
+ "st1 { v26.h }[4], [x25], #0x2\n"
+ "st1 { v27.h }[4], [x24], #0x2\n"
+ "st1 { v28.h }[4], [x23], #0x2\n"
+ "st1 { v29.h }[4], [x22], #0x2\n"
+ "st1 { v30.h }[4], [x21], #0x2\n"
+ "st1 { v31.h }[4], [x20], #0x2\n"
+ "b 24f\n"
+ "22:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "st1 { v23.s }[0], [x28], #0x4\n"
+ "st1 { v24.s }[0], [x27], #0x4\n"
+ "st1 { v25.s }[0], [x26], #0x4\n"
+ "st1 { v26.s }[0], [x25], #0x4\n"
+ "st1 { v27.s }[0], [x24], #0x4\n"
+ "st1 { v28.s }[0], [x23], #0x4\n"
+ "st1 { v29.s }[0], [x22], #0x4\n"
+ "st1 { v30.s }[0], [x21], #0x4\n"
+ "st1 { v31.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "st1 { v23.h }[2], [x28], #0x2\n"
+ "st1 { v24.h }[2], [x27], #0x2\n"
+ "st1 { v25.h }[2], [x26], #0x2\n"
+ "st1 { v26.h }[2], [x25], #0x2\n"
+ "st1 { v27.h }[2], [x24], #0x2\n"
+ "st1 { v28.h }[2], [x23], #0x2\n"
+ "st1 { v29.h }[2], [x22], #0x2\n"
+ "st1 { v30.h }[2], [x21], #0x2\n"
+ "st1 { v31.h }[2], [x20], #0x2\n"
+ "b 24f\n"
+ "23:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
+ "24:" // Oddments: Store: Bit 2: End
+ "25:" // End
+ : [params] "+&r" (params)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..3646c18b04
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
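+// Kernel entry point: input/output pointer arrays, weights, bias, the number of
+// kernel points, the number of output channels, and the activation clamp bounds
+// (parameter names follow the definition in the accompanying generic.cpp).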
+void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const __fp16 *, const __fp16 *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
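+// Strategy wrapper registering the kernel above with the generic depthfirst
+// multiplier framework; the (2, 8) constructor arguments match the 2x8 output
+// block in the kernel name, with no vector-length dependence (VLType::None).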
+struct a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<__fp16, __fp16, __fp16, __fp16>;
+ a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)
+ {
+ }
+ Parent::KernelType kernel = a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..cee3fb59c5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1044 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const __fp16 *weights,
+ const __fp16 *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ const __fp16 minmax_vals[2] = { activation_min, activation_max };
+
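+  // v8/v7 receive the broadcast activation min/max; x10 is the current output
+  // channel index, shifted left by one to form a byte offset when addressing.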
+ __asm__ __volatile__(
+ "ld1r { v8.8h }, [%x[minmax_vals]]\n"
+ "lsr x11, %x[n_output_channels], #0x3\n"
+ "add x20, %x[minmax_vals], #0x2\n"
+ "ld1r { v7.8h }, [x20]\n"
+ "mov x10, #0x0\n"
+ "cbz x11, 8f\n"
+ "1:" // Output channel loop
+ "movi v31.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x10, #0x1\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "ldr q6, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q1, [x21, #0x0]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x23, 6f\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "beq 4f\n"
+ "3:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q1, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q5, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 3b\n"
+ "4:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 5f\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "lsl x28, x10, #0x1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "b 7f\n"
+ "5:" // Output channel loop: Odd tail
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "ldp x20, x9, [x22], #0x10\n"
+ "lsl x28, x10, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q1, [%x[weights], #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmla v16.8h, v1.8h, v2.h[0]\n"
+ "fmla v17.8h, v1.8h, v2.h[1]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "fmla v18.8h, v1.8h, v2.h[2]\n"
+ "fmla v19.8h, v1.8h, v2.h[3]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "fmla v20.8h, v1.8h, v2.h[4]\n"
+ "fmla v21.8h, v1.8h, v2.h[5]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "fmla v22.8h, v1.8h, v2.h[6]\n"
+ "fmla v23.8h, v1.8h, v2.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmla v24.8h, v1.8h, v0.h[0]\n"
+ "fmla v25.8h, v1.8h, v0.h[1]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmla v26.8h, v1.8h, v0.h[2]\n"
+ "fmla v27.8h, v1.8h, v0.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmla v28.8h, v1.8h, v0.h[4]\n"
+ "fmla v29.8h, v1.8h, v0.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmla v30.8h, v1.8h, v0.h[6]\n"
+ "fmla v31.8h, v1.8h, v0.h[7]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "lsl x28, x10, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "7:" // Output channel loop: Done
+ "add x10, x10, #0x8\n"
+ "cmp x10, x11, LSL #3\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x7\n"
+ "beq 23f\n"
+ "8:" // Output channel oddments
+ "movi v31.16b, #0x0\n"
+ "cbz %x[bias], 13f\n"
+ "add x20, %x[bias], x10, LSL #1\n"
+ "tbz %x[n_output_channels], #2, 10f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #1, 9f\n"
+ "ld1 { v31.s }[2], [x20], #0x4\n"
+ "tbz %x[n_output_channels], #0, 12f\n"
+ "ld1 { v31.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Output channel oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 12f\n"
+ "ld1 { v31.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Output channel oddments: Load bias: Bit 2: Unset
+ "tbz %x[n_output_channels], #1, 11f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz %x[n_output_channels], #0, 12f\n"
+ "ld1 { v31.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Output channel oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "ld1 { v31.h }[0], [x20]\n"
+ "12:" // Output channel oddments: Load bias: Bit 2: End
+ "13:" // Output channel oddments: Load bias: Done
+ "ldr q6, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q1, [x21, #0x0]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x23, 17f\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "beq 15f\n"
+ "14:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q1, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q5, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 14b\n"
+ "15:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 16f\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "b 18f\n"
+ "16:" // Output channel oddments: Odd tail
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q2, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [%x[weights], #0x0]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmla v16.8h, v0.8h, v2.h[0]\n"
+ "fmla v17.8h, v0.8h, v2.h[1]\n"
+ "fmla v18.8h, v0.8h, v2.h[2]\n"
+ "fmla v19.8h, v0.8h, v2.h[3]\n"
+ "fmla v20.8h, v0.8h, v2.h[4]\n"
+ "fmla v21.8h, v0.8h, v2.h[5]\n"
+ "fmla v22.8h, v0.8h, v2.h[6]\n"
+ "fmla v23.8h, v0.8h, v2.h[7]\n"
+ "fmla v24.8h, v0.8h, v1.h[0]\n"
+ "fmla v25.8h, v0.8h, v1.h[1]\n"
+ "fmla v26.8h, v0.8h, v1.h[2]\n"
+ "fmla v27.8h, v0.8h, v1.h[3]\n"
+ "fmla v28.8h, v0.8h, v1.h[4]\n"
+ "fmla v29.8h, v0.8h, v1.h[5]\n"
+ "fmla v30.8h, v0.8h, v1.h[6]\n"
+ "fmla v31.8h, v0.8h, v1.h[7]\n"
+ "b 18f\n"
+ "17:" // Output channel oddments: Single kernel point
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "18:" // Output channel oddments: Done
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "tbz %x[n_output_channels], #2, 20f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.d }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.d }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.d }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.d }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v24.d }[0], [x27]\n"
+ "st1 { v25.d }[0], [x26]\n"
+ "st1 { v26.d }[0], [x25]\n"
+ "st1 { v27.d }[0], [x24]\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_output_channels], #1, 19f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.s }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "add x10, x10, #0x2\n"
+ "st1 { v24.s }[2], [x27]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_output_channels], #0, 22f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[6], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.h }[6], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[6], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[6], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[6], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[6], [x27]\n"
+ "st1 { v25.h }[6], [x26]\n"
+ "st1 { v26.h }[6], [x25]\n"
+ "st1 { v27.h }[6], [x24]\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 22f\n"
+ "19:" // Output channel oddments: Done: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 22f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[4], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.h }[4], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[4], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[4], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[4], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[4], [x27]\n"
+ "st1 { v25.h }[4], [x26]\n"
+ "st1 { v26.h }[4], [x25]\n"
+ "st1 { v27.h }[4], [x24]\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 22f\n"
+ "20:" // Output channel oddments: Done: Store: Bit 2: Unset
+ "tbz %x[n_output_channels], #1, 21f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.s }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.s }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.s }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.s }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "add x10, x10, #0x2\n"
+ "st1 { v24.s }[0], [x27]\n"
+ "st1 { v25.s }[0], [x26]\n"
+ "st1 { v26.s }[0], [x25]\n"
+ "st1 { v27.s }[0], [x24]\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 22f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.h }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[2], [x27]\n"
+ "st1 { v25.h }[2], [x26]\n"
+ "st1 { v26.h }[2], [x25]\n"
+ "st1 { v27.h }[2], [x24]\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Output channel oddments: Done: Store: Bit 2: Unset: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "22:" // Output channel oddments: Done: Store: Bit 2: End
+ "23:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..5d3db974f0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
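+// Two entry points: the indirect variant gathers its input through an array of
+// row pointers, while the direct variant walks a dense tile using the given
+// input/output row and column strides.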
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
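+// Strategy exposing both kernels to the depthfirst framework; the constexpr
+// members below encode the 3x3 kernel, unit stride, and 2x2 output tile.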
+class a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..fd8686c15e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
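+  // Collect the arguments into a single struct so the inline assembly can read
+  // and update them through the %[offsetof_args_*] operands used below.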
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x23, #0x0\n"
+ "mov x22, #0x0\n"
+ "1:" // Tile loop
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x2\n"
+ "mov x26, #0x2\n"
+ "str x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x23, x25\n" // offset = tile_i * ld_input_row
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x23, x24\n" // offset = tile_i * ld_output_row
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x22, x15, x21\n" // offset += tile_j * ld_input_col
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x15, x15, #0x2\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x22, x14, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x2\n"
+ "add x11, x15, x15\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x13, x13, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x9, x13, x25, LSL #2\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x28, x9, x25, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x27, x28, x25, LSL #2\n"
+ "add x26, x11, x15\n"
+ "add x25, x12, x24, LSL #2\n"
+ "lsl x14, x14, #0x2\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "add x10, x10, #0xa0\n"
+ "ldr q9, [x9, x15]\n"
+ "ld1 { v10.4s }, [x13]\n"
+ "ldr q11, [x13, x26]\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q13, [x28, x15]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "add x23, x23, #0x10\n"
+ "cmp x23, x22, LSL #4\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ld1 { v18.4s }, [x27]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ld1 { v17.4s }, [x9]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
+ "ldr q4, [x10, #0x50]\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x28]\n"
+ "ldr q1, [x10, #0x20]\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
+ "ldr q0, [x10, #0x10]\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "ldr q2, [x10, #0x30]\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
+ "ldr q13, [x28, x15]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x27, x15]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "ldr q11, [x13, x26]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "ldr q9, [x9, x15]\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "ld1 { v10.4s }, [x13]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
+ "ldr q8, [x10, #0x90]\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "add x27, x27, #0x10\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "st1 { v24.4s }, [x12]\n"
+ "add x10, x10, #0xa0\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "str q23, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q21, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ld1 { v18.4s }, [x27]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ld1 { v17.4s }, [x9]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x28]\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x27, x15]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "st1 { v24.4s }, [x12]\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "str q23, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q21, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 31f\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "add x24, x9, x15\n"
+ "add x23, x13, XZR\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "add x22, x13, x26\n"
+ "add x21, x9, x11\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "add x20, x28, x15\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x24]\n"
+ "ld1 { v10.s }[2], [x23]\n"
+ "ld1 { v11.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x21]\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+ "ldr s9, [x24, #0x0]\n"
+ "ldr s10, [x23, #0x0]\n"
+ "ldr s11, [x22, #0x0]\n"
+ "ldr s12, [x21, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+ "mov v28.16b, v25.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "add x20, x27, XZR\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v30.4s, v6.4s, v9.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x27, x26\n"
+ "fmla v29.4s, v6.4s, v13.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x13, x15\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "add x20, x13, x11\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "add x20, x28, x11\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x9, XZR\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "add x20, x9, x26\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "add x20, x28, XZR\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v28.4s, v6.4s, v9.4s\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "add x20, x28, x26\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v29.4s, v8.4s, v10.4s\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "add x20, x27, x15\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v30.4s, v7.4s, v11.4s\n"
+ "fmla v31.4s, v6.4s, v11.4s\n"
+ "add x20, x27, x11\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v31.4s, v7.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v26.4s\n"
+ "fmin v29.4s, v29.4s, v26.4s\n"
+ "fmin v30.4s, v30.4s, v26.4s\n"
+ "fmin v31.4s, v31.4s, v26.4s\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.d }[0], [x21], x14\n"
+ "st1 { v30.d }[0], [x20], x14\n"
+ "add x12, x12, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[2], [x21], x14\n"
+ "st1 { v30.s }[2], [x20], x14\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[0], [x21], x14\n"
+ "st1 { v30.s }[0], [x20], x14\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "30:" // Tile loop: Oddments: Store: Bit 1: End
+ "31:" // Tile loop: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x22, x22, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x22, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x22, x22, XZR, LT\n"
+ "cmp x23, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
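The inline asm above computes one 2x2 output tile of the 3x3 stride-1 depthwise convolution, four channels per iteration, with the activation clamp fused in as fmax/fmin against the broadcast v27/v26 pair; as read from the q25/q0..q8 loads, the params block is consumed as one bias vector followed by the nine row-major weight vectors per group of four channels. A scalar model of one channel of one tile, with strides in elements rather than bytes and the weights assumed pre-unpacked (names and layout simplifications are illustrative):

#include <algorithm>

// One channel of one 2x2 output tile; ld_* strides are in floats, not bytes.
void tile_2x2_reference(const float *inptr, long ld_in_row, long ld_in_col,
                        float *outptr, long ld_out_row, long ld_out_col,
                        float bias, const float weights[9], // row-major 3x3
                        float act_min, float act_max)
{
    for (int oi = 0; oi < 2; oi++)
    {
        for (int oj = 0; oj < 2; oj++)
        {
            float acc = bias; // v25 seeds each accumulator before the fmlas
            for (int ki = 0; ki < 3; ki++)
                for (int kj = 0; kj < 3; kj++)
                    acc += weights[ki * 3 + kj] * inptr[(oi + ki) * ld_in_row + (oj + kj) * ld_in_col];
            // fmax against the min, then fmin against the max, as in the asm
            outptr[oi * ld_out_row + oj * ld_out_col] = std::min(std::max(acc, act_min), act_max);
        }
    }
}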
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..7dedfd972a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[16];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x16, #0x10\n" // cntb _, ALL, #1
+ "lsr x15, %x[n_channels], #0x2\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "mov x28, #0x0\n"
+ "sub x27, XZR, x16\n"
+ "cbz x15, 3f\n"
+ "ldr q25, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x16, x15, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr q25, [x14, #0x0]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "ldr x22, [x13, #0x58]\n"
+ "ldr x21, [x13, #0x60]\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x22, x28]\n"
+ "ldr x26, [x13, #0x70]\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr x25, [x13, #0x78]\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ldr q19, [x21, x28]\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x16]\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x26, x28]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x25, x28]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "ldr q11, [x22, x16]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "ldr q9, [x24, x16]\n"
+ "ldr q10, [x23, x16]\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "ldr q12, [x21, x16]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "add x16, x16, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "cmp x16, x15, LSL #4\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "add x28, x28, #0x10\n"
+ "str q24, [x12, x27]\n"
+ "add x14, x14, #0xa0\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x21, [x13, #0x50]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "ldr x23, [x13, #0x60]\n"
+ "ldr x22, [x13, #0x68]\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ldr q17, [x21, x28]\n"
+ "ldr x21, [x13, #0x70]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ldr q19, [x23, x28]\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x22, x28]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x21, x28]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "add x28, x28, #0x10\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "str q24, [x12, x27]\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 30f\n"
+ "ldr q25, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "ldr x24, [x13, #0x0]\n"
+ "ldr x23, [x13, #0x8]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ "ldr x22, [x13, #0x10]\n"
+ "ldr x21, [x13, #0x18]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ "ldr x20, [x13, #0x20]\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+ "mov v28.16b, v25.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "ldr x20, [x13, #0x28]\n"
+ "add x20, x20, x28\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v30.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x13, #0x30]\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x28\n"
+ "fmla v29.4s, v6.4s, v13.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr x20, [x13, #0x38]\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Load input (0, 1): Bit 1: End
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "13:" // Oddments: Load input (0, 2): Bit 1: End
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (2, 2): Bit 1: End
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x28\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "17:" // Oddments: Load input (1, 0): Bit 1: End
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (1, 3): Bit 1: End
+ "ldr x20, [x13, #0x60]\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "21:" // Oddments: Load input (2, 0): Bit 1: End
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v28.4s, v6.4s, v9.4s\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "23:" // Oddments: Load input (2, 3): Bit 1: End
+ "ldr x20, [x13, #0x70]\n"
+ "fmla v29.4s, v8.4s, v10.4s\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (3, 1): Bit 1: End
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v30.4s, v7.4s, v11.4s\n"
+ "fmla v31.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "27:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v31.4s, v7.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v26.4s\n"
+ "fmin v29.4s, v29.4s, v26.4s\n"
+ "fmin v30.4s, v30.4s, v26.4s\n"
+ "fmin v31.4s, v31.4s, v26.4s\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Store: Bit 1: Unset
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
+ "29:" // Oddments: Store: Bit 1: End
+ "30:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
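The indirect variant replaces the row/column stride arithmetic with a pointer table: the Args constructor above reorders input_ptrs into the load order the asm wants, and the (row, col) labels on the oddment loads indicate that input_ptrs is the row-major 4x4 input patch, with outptrs the row-major 2x2 output tile. A scalar model under that reading, with per-channel bias and weights again assumed pre-unpacked (illustrative only):

#include <algorithm>

void indirect_reference(const float *const *input_ptrs, // [16], row-major 4x4 patch
                        float *const *outptrs,          // [4], row-major 2x2 tile
                        const float *bias, const float (*weights)[9],
                        unsigned int n_channels, float act_min, float act_max)
{
    for (unsigned int c = 0; c < n_channels; c++)
    {
        for (int oi = 0; oi < 2; oi++)
        {
            for (int oj = 0; oj < 2; oj++)
            {
                float acc = bias[c];
                for (int ki = 0; ki < 3; ki++)
                    for (int kj = 0; kj < 3; kj++)
                        acc += weights[c][ki * 3 + kj] * input_ptrs[(oi + ki) * 4 + (oj + kj)][c];
                outptrs[oi * 2 + oj][c] = std::min(std::max(acc, act_min), act_max);
            }
        }
    }
}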
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..c2d86615e3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
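The direct kernels keep their position in the image inside params_struct: tile_i and tile_j are stored back at the top of the tile loop and re-read at the closing label (label 31 in the output2x2 direct kernel above), where an add/cmp/csel sequence advances them in row-major order. The same bookkeeping in plain C++, with the loop body standing in for one tile (a sketch, not the library's API):

#include <cstdint>

// Row-major tile scan matching the add/cmp/csel sequence at the tile-loop end;
// like the asm, it always visits tile (0, 0) before testing the bounds.
void scan_tiles(uint64_t n_tile_rows, uint64_t n_tile_cols)
{
    uint64_t tile_i = 0, tile_j = 0;
    do
    {
        // ... compute one output tile at (tile_i, tile_j) ...
        tile_j++;
        if (tile_j >= n_tile_cols) // the csel pair: wrap the column, step the row
        {
            tile_j = 0;
            tile_i++;
        }
    } while (tile_i < n_tile_rows); // blt 1b
}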
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..9bfcd9cd3c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,828 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x24, #0x0\n"
+ "mov x23, #0x0\n"
+ "1:" // Tile loop
+ "str x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x3\n"
+ "mov x26, #0x3\n"
+ "str x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x24, x25\n" // offset = tile_i * ld_input_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x24, x22\n" // offset = tile_i * ld_output_row
+ "mov x24, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x23, x8, x21\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x8, x8, #0x2\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x23, x17, x20\n" // offset += tile_j * ld_output_col
+ "lsl x17, x17, #0x2\n"
+ "lsr x23, %x[n_channels], #0x2\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x16, x16, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x13, x16, x25, LSL #2\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x12, x13, x25, LSL #2\n"
+ "add x11, x8, x8\n"
+ "add x15, x15, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x10, x12, x25, LSL #2\n"
+ "add x9, x11, x8\n"
+ "add x28, x15, x22, LSL #2\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x27, x10, x25, LSL #2\n"
+ "add x26, x9, x8\n"
+ "add x25, x28, x22, LSL #2\n"
+ "add x22, x17, x17\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x24\n"
+ "cbz x23, 4f\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x24, x23, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldr q9, [x12, x11]\n"
+ "ld1 { v10.4s }, [x16]\n"
+ "ldr q11, [x16, x26]\n"
+ "ld1 { v12.4s }, [x27]\n"
+ "ldr q13, [x13, x11]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "add x24, x24, #0x10\n"
+ "cmp x24, x23, LSL #4\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v1.4s, v9.4s\n"
+ "ldr q31, [x14, #0x0]\n"
+ "fmla v29.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "fmla v20.4s, v0.4s, v18.4s\n"
+ "fmla v26.4s, v4.4s, v18.4s\n"
+ "fmla v25.4s, v3.4s, v18.4s\n"
+ "fmla v22.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x13]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ld1 { v18.4s }, [x10]\n"
+ "fmla v24.4s, v4.4s, v23.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v29.4s, v8.4s, v23.4s\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v25.4s, v5.4s, v23.4s\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.4s, v0.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v18.4s\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v4.4s, v17.4s\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v19.4s\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "fmla v24.4s, v6.4s, v17.4s\n"
+ "fmla v21.4s, v5.4s, v19.4s\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v8.4s, v17.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.4s, v8.4s, v17.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
+ "add x16, x16, #0x10\n"
+ "ld1 { v10.4s }, [x16]\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "ldr q4, [x14, #0x50]\n"
+ "fmla v26.4s, v7.4s, v17.4s\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.4s, v7.4s, v19.4s\n"
+ "add x12, x12, #0x10\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v20.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v0.4s, v18.4s\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v21.4s, v2.4s, v17.4s\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v25.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v28.4s, v6.4s, v18.4s\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "ldr q3, [x14, #0x40]\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "ldr q11, [x16, x26]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "ldr q7, [x14, #0x80]\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "ldr q13, [x13, x11]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "add x27, x27, #0x10\n"
+ "ld1 { v12.4s }, [x27]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "add x14, x14, #0xa0\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "st1 { v28.4s }, [x15]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q27, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "st1 { v26.4s }, [x28]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "fmla v20.4s, v0.4s, v18.4s\n"
+ "fmla v26.4s, v4.4s, v18.4s\n"
+ "fmla v25.4s, v3.4s, v18.4s\n"
+ "fmla v22.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x13]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ld1 { v18.4s }, [x10]\n"
+ "fmla v24.4s, v4.4s, v23.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v29.4s, v8.4s, v23.4s\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v25.4s, v5.4s, v23.4s\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.4s, v0.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v18.4s\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v4.4s, v17.4s\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v19.4s\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "fmla v24.4s, v6.4s, v17.4s\n"
+ "fmla v21.4s, v5.4s, v19.4s\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v8.4s, v17.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.4s, v8.4s, v17.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v26.4s, v7.4s, v17.4s\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.4s, v7.4s, v19.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmla v20.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v0.4s, v18.4s\n"
+ "add x12, x12, #0x10\n"
+ "fmla v21.4s, v2.4s, v17.4s\n"
+ "fmla v25.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmla v28.4s, v6.4s, v18.4s\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "add x27, x27, #0x10\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "st1 { v28.4s }, [x15]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q27, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "st1 { v26.4s }, [x28]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 49f\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "add x24, x12, x11\n"
+ "add x23, x16, XZR\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x22, x16, x26\n"
+ "add x21, x27, XZR\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x20, x13, x11\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x24]\n"
+ "ld1 { v10.s }[2], [x23]\n"
+ "ld1 { v11.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x21]\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+ "ldr s9, [x24, #0x0]\n"
+ "ldr s10, [x23, #0x0]\n"
+ "ldr s11, [x22, #0x0]\n"
+ "ldr s12, [x21, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+ "mov v23.16b, v31.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "add x20, x27, x26\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v24.4s, v4.4s, v13.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "add x20, x12, x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "add x20, x16, x8\n"
+ "fmla v26.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v23.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "add x20, x16, x9\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "add x20, x12, x9\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v24.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "add x20, x13, XZR\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v23.4s, v3.4s, v11.4s\n"
+ "fmla v26.4s, v0.4s, v11.4s\n"
+ "add x20, x13, x26\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v25.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "add x20, x10, XZR\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v26.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "add x20, x10, x11\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "add x20, x10, x26\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x27, x8\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "add x20, x13, x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "add x20, x13, x9\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
+ "add x20, x27, x9\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v30.4s, v8.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "add x20, x10, x8\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v26.4s, v7.4s, v12.4s\n"
+ "fmla v27.4s, v6.4s, v12.4s\n"
+ "add x20, x16, x11\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "add x20, x10, x9\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v27.4s, v8.4s, v13.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x12, XZR\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v3.4s, v12.4s\n"
+ "add x20, x12, x26\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "add x20, x27, x11\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v29.4s, v8.4s, v13.4s\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.d }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.d }[0], [x21], x17\n"
+ "add x15, x15, #0x8\n"
+ "st1 { v29.d }[0], [x20], x17\n"
+ "add x28, x28, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v24.d }[0], [x22], x17\n"
+ "st1 { v27.d }[0], [x21], x17\n"
+ "st1 { v30.d }[0], [x20], x17\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "st1 { v28.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[2], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[2], [x21], x17\n"
+ "st1 { v29.s }[2], [x20], x17\n"
+ "st1 { v24.s }[2], [x22], x17\n"
+ "st1 { v27.s }[2], [x21], x17\n"
+ "st1 { v30.s }[2], [x20], x17\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[0], [x21], x17\n"
+ "st1 { v29.s }[0], [x20], x17\n"
+ "st1 { v24.s }[0], [x22], x17\n"
+ "st1 { v27.s }[0], [x21], x17\n"
+ "st1 { v30.s }[0], [x20], x17\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "st1 { v28.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "48:" // Tile loop: Oddments: Store: Bit 1: End
+ "49:" // Tile loop: End
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x23, x23, #0x1\n"
+ "add x21, x24, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x23, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x24, x24, x21, LT\n"
+ "csel x23, x23, XZR, LT\n"
+ "cmp x24, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
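
Note on the kernel above: the "Oddments" blocks are the channel tail of the tiled direct kernel. The main loop consumes four channels per iteration, and the leftover n_channels % 4 lanes are handled by testing bit 1 and then bit 0 of n_channels with tbz, loading two floats (ldr d, post-incrementing by 8) and then one more into lane 2 (ld1 { .s }[2]); the stores at labels 47/48 are predicated the same way, so lanes that were never loaded are never written back. The branchless csel pair at the end of the tile loop advances tile_j, wrapping it to zero and bumping tile_i once a row of output tiles is complete. Below is a minimal scalar sketch of the tail-load predication only — the names (load_tail, dst, src) are hypothetical, and it assumes it is only called when n_channels % 4 is nonzero, as the assembly guarantees:

// Sketch only: scalar analogue of the tbz-predicated tail loads above.
#include <cstring>

static inline void load_tail(float dst[4], const float *src, unsigned int n_channels)
{
    // Only the low two bits matter: the vector body already consumed
    // n_channels & ~3 channels, and this path runs only when the tail is nonzero.
    if (n_channels & 2)                            // tbz n_channels, #1, <single-load label>
    {
        std::memcpy(dst, src, 2 * sizeof(float));  // ldr d<n>, [x20], #0x8
        if (n_channels & 1)                        // tbz n_channels, #0, <end label>
            dst[2] = src[2];                       // ld1 { v<n>.s }[2], [x20]
    }
    else
    {
        dst[0] = src[0];                           // ldr s<n>, [x20, #0x0]
    }
}

Lane 3 (and lane 2 when bit 0 is clear) is simply left stale; that is safe because the matching store sequence tests the same bits before writing.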
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..972f7eb535
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,905 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "mov x7, #0x10\n" // cntb _, ALL, #1
+ "lsr x8, %x[n_channels], #0x2\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x14, #0x0\n"
+ "sub x13, XZR, x7\n"
+ "cbz x8, 3f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x7, x8, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr q10, [x20, x14]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x15, #0x30]\n"
+ "ldr x23, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x26, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v6.4s, v17.4s\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "ldr q18, [x23, x14]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
+ "ldr q31, [x16, #0x0]\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "ldr x23, [x15, #0x68]\n"
+ "fmla v28.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla v26.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr x21, [x15, #0x80]\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.4s, v1.4s, v18.4s\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v26.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "fmla v27.4s, v7.4s, v19.4s\n"
+ "fmla v22.4s, v1.4s, v19.4s\n"
+ "ldr q19, [x23, x14]\n"
+ "fmla v23.4s, v3.4s, v16.4s\n"
+ "ldr x24, [x15, #0xa8]\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v25.4s, v7.4s, v19.4s\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v29.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.4s, v5.4s, v20.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v23.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla v26.4s, v8.4s, v19.4s\n"
+ "fmla v24.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v28.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v28.4s, v5.4s, v17.4s\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v2.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr x20, [x15, #0x20]\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x24, x14]\n"
+ "fmla v29.4s, v2.4s, v17.4s\n"
+ "fmla v26.4s, v7.4s, v18.4s\n"
+ "fmla v25.4s, v6.4s, v18.4s\n"
+ "fmla v23.4s, v4.4s, v18.4s\n"
+ "fmla v21.4s, v3.4s, v18.4s\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v22.4s, v4.4s, v16.4s\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v27.4s, v0.4s, v17.4s\n"
+ "ldr q17, [x21, x14]\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.4s, v0.4s, v18.4s\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v22.4s, v2.4s, v17.4s\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v22.4s, v6.4s, v16.4s\n"
+ "ldr q13, [x20, x7]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x7]\n"
+ "ldr q10, [x20, x7]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x7]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "ldr q12, [x20, x7]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q28, [x9, x13]\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "str q27, [x28, x13]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "str q26, [x27, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x7, x7, #0x10\n"
+ "str q25, [x24, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "cmp x7, x8, LSL #4\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "add x14, x14, #0x10\n"
+ "str q24, [x23, x13]\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "str q23, [x22, x13]\n"
+ "add x16, x16, #0xa0\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v7.4s, v9.4s\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "ldr x20, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x23, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v6.4s, v17.4s\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v28.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v26.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.4s, v1.4s, v18.4s\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla v26.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "fmla v27.4s, v7.4s, v19.4s\n"
+ "fmla v22.4s, v1.4s, v19.4s\n"
+ "ldr q19, [x22, x14]\n"
+ "fmla v23.4s, v3.4s, v16.4s\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v25.4s, v7.4s, v19.4s\n"
+ "ldr x22, [x15, #0xc0]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v29.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.4s, v5.4s, v20.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v23.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v26.4s, v8.4s, v19.4s\n"
+ "fmla v24.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v28.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v28.4s, v5.4s, v17.4s\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v2.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v29.4s, v2.4s, v17.4s\n"
+ "fmla v26.4s, v7.4s, v18.4s\n"
+ "fmla v25.4s, v6.4s, v18.4s\n"
+ "fmla v23.4s, v4.4s, v18.4s\n"
+ "fmla v21.4s, v3.4s, v18.4s\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v22.4s, v4.4s, v16.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v17.4s\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v2.4s, v17.4s\n"
+ "ldr x20, [x17, #0x20]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "str q28, [x9, x13]\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "ldr x23, [x17, #0x28]\n"
+ "fmla v22.4s, v6.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "str q27, [x28, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "str q26, [x27, x13]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q25, [x20, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "str q24, [x23, x13]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "str q23, [x22, x13]\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 48f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x13, x14\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "ldr x24, [x15, #0x0]\n"
+ "ldr x23, [x15, #0x8]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "ldr x22, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x18]\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "ldr x20, [x15, #0x20]\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+ "mov v23.16b, v31.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x15, #0x28]\n"
+ "add x20, x20, x14\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v24.4s, v4.4s, v13.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load input (4, 4): Bit 1: End
+ "ldr x20, [x15, #0x30]\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (2, 1): Bit 1: End
+ "ldr x20, [x15, #0x38]\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v26.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Load input (0, 1): Bit 1: End
+ "ldr x20, [x15, #0x40]\n"
+ "fmla v23.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "13:" // Oddments: Load input (0, 3): Bit 1: End
+ "ldr x20, [x15, #0x48]\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (2, 3): Bit 1: End
+ "ldr x20, [x15, #0x50]\n"
+ "fmla v24.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "17:" // Oddments: Load input (1, 0): Bit 1: End
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v23.4s, v3.4s, v11.4s\n"
+ "fmla v26.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (1, 4): Bit 1: End
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v25.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "21:" // Oddments: Load input (3, 0): Bit 1: End
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v26.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "23:" // Oddments: Load input (3, 2): Bit 1: End
+ "ldr x20, [x15, #0x70]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (3, 4): Bit 1: End
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "27:" // Oddments: Load input (4, 1): Bit 1: End
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (1, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "29:" // Oddments: Load input (1, 1): Bit 1: End
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "31:" // Oddments: Load input (1, 3): Bit 1: End
+ "ldr x20, [x15, #0x90]\n"
+ "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "33:" // Oddments: Load input (4, 3): Bit 1: End
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v30.4s, v8.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "35:" // Oddments: Load input (3, 1): Bit 1: End
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla v26.4s, v7.4s, v12.4s\n"
+ "fmla v27.4s, v6.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "37:" // Oddments: Load input (0, 2): Bit 1: End
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "39:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla v27.4s, v8.4s, v13.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "41:" // Oddments: Load input (2, 0): Bit 1: End
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "43:" // Oddments: Load input (2, 4): Bit 1: End
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "45:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v29.4s, v8.4s, v13.4s\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Store: Bit 1: Unset
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "47:" // Oddments: Store: Bit 1: End
+ "48:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
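
Note on the kernel above: this indirect variant replaces the strided address arithmetic of the tiled kernel with an array of 25 input pointers. For a 3x3 output computed by a 3x3 kernel at stride 1, every output pixel reads from a 5x5 input patch, and the Args constructor reorders input_ptrs into the order in which the assembly visits them; edge and padded tiles can then be handled by the caller substituting pointers (typically into a zero pad buffer). A simplified per-channel reference of the computation follows — the names are assumed, and the parameter layout is flattened for clarity (the kernel itself packs bias and the nine weights in groups of four channels: q31 then q0-q8 per 0xa0-byte block of params):

// Simplified reference, assuming input_ptrs is the row-major 5x5 patch
// (25 pointers) and params holds a bias followed by 9 row-major weights.
#include <algorithm>

static void depthwise_3x3_s1_out3x3_ref(
    const float *const *input_ptrs,  // assumed: row-major 5x5 input patch
    float *const *outptrs,           // assumed: row-major 3x3 output tile
    const float *params,             // assumed: bias, then w[0..8]
    unsigned int c,                  // channel to compute
    float act_min, float act_max)
{
    const float bias = params[0];
    const float *w = params + 1;
    for (unsigned int oi = 0; oi < 3; oi++)
    {
        for (unsigned int oj = 0; oj < 3; oj++)
        {
            float acc = bias;        // mov v<acc>.16b, v31.16b
            for (unsigned int ki = 0; ki < 3; ki++)
                for (unsigned int kj = 0; kj < 3; kj++)
                    acc += w[ki * 3 + kj] * input_ptrs[(oi + ki) * 5 + (oj + kj)][c]; // fmla
            acc = std::max(acc, act_min);  // fmax v<acc>.4s, v<acc>.4s, v15.4s
            acc = std::min(acc, act_max);  // fmin v<acc>.4s, v<acc>.4s, v14.4s
            outptrs[oi * 3 + oj][c] = acc;
        }
    }
}

The assembly computes all nine accumulators concurrently across 28 vector registers, which is why the operand schedule above looks scrambled relative to this nested-loop view.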
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..8a198c1818
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
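
Note on the header above: the strategy class ties the two entry points to the geometry the depthwise framework needs at compile time — a 3x3 kernel at stride 1 producing a 4x4 output tile — with VLType::None marking it as a fixed-width NEON (rather than SVE) implementation. The input footprint follows from the standard convolution formula, as this assumed-name sketch checks:

// Sketch with assumed names; the formula is the standard convolution footprint.
#include <cstdio>

constexpr unsigned int input_extent(unsigned int out, unsigned int kernel, unsigned int stride)
{
    return (out - 1) * stride + kernel;
}

int main()
{
    static_assert(input_extent(4, 3, 1) == 6, "a 4x4 output tile reads a 6x6 input patch");
    static_assert(input_extent(3, 3, 1) == 5, "a 3x3 output tile reads a 5x5 input patch");
    std::printf("indirect kernel patch: %u pointers\n",
                input_extent(4, 3, 1) * input_extent(4, 3, 1));
    return 0;
}

The 5x5 case matches the 25-entry inptrs array in the output3x3 indirect kernel earlier in this diff; the 4x4 strategy's indirect variant correspondingly consumes a 6x6 pointer grid.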
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..3adf8b0d9f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1232 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x23, #0x4\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x27, x22\n" // offset = tile_i * ld_output_row
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x26, x4, x21\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x4, x4, #0x2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x26, x5, x20\n" // offset += tile_j * ld_output_col
+ "lsl x5, x5, #0x2\n"
+ "add x17, x4, x4\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x7, x7, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x15, x7, x24, LSL #2\n"
+ "mul x20, x20, x23\n" // offset *= output_tile_size
+ "add x14, x15, x24, LSL #2\n"
+ "add x8, x8, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "lsr x13, %x[n_channels], #0x2\n"
+ "add x12, x14, x24, LSL #2\n"
+ "add x11, x17, x4\n"
+ "add x10, x8, x22, LSL #2\n"
+ "add x9, x12, x24, LSL #2\n"
+ "add x28, x11, x4\n"
+ "add x27, x10, x22, LSL #2\n"
+ "add x23, x5, x5\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x26, x9, x24, LSL #2\n"
+ "add x25, x28, x4\n"
+ "add x24, x27, x22, LSL #2\n"
+ "add x22, x23, x5\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x6\n"
+ "cbz x13, 4f\n"
+ "ldr q14, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x6, x13, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldr q9, [x14, x17]\n"
+ "ld1 { v10.4s }, [x7]\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q12, [x14, x11]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v26.16b, v14.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v14.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x13, LSL #4\n"
+ "mov v16.16b, v14.16b\n fmla v16.4s, v3.4s, v9.4s\n"
+ "mov v22.16b, v14.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v7.4s, v9.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v6.4s, v9.4s\n"
+ "mov v31.16b, v14.16b\n fmla v31.4s, v5.4s, v9.4s\n"
+ "mov v20.16b, v14.16b\n fmla v20.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x12, x17]\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ld1 { v30.4s }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q27, [x26, x25]\n"
+ "fmla v16.4s, v4.4s, v12.4s\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v23.4s, v1.4s, v12.4s\n"
+ "mov v21.16b, v14.16b\n fmla v21.4s, v6.4s, v30.4s\n"
+ "ldr q10, [x12, x11]\n"
+ "fmla v26.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v8.4s, v12.4s\n"
+ "fmla v17.4s, v7.4s, v12.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "mov v24.16b, v14.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x7, x4]\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v8.4s, v27.4s\n"
+ "ldr q12, [x7, x28]\n"
+ "fmla v16.4s, v6.4s, v9.4s\n"
+ "fmla v22.4s, v4.4s, v9.4s\n"
+ "fmla v23.4s, v3.4s, v9.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "ldr q14, [x16, #0x0]\n"
+ "fmla v31.4s, v8.4s, v9.4s\n"
+ "fmla v20.4s, v5.4s, v9.4s\n"
+ "fmla v21.4s, v2.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x15]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x15, x25]\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ld1 { v12.4s }, [x9]\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v24.4s, v6.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v10.4s\n"
+ "fmla v23.4s, v4.4s, v10.4s\n"
+ "fmla v19.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v2.4s, v10.4s\n"
+ "fmla v18.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x15, x17]\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v20.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x9, x25]\n"
+ "fmla v26.4s, v1.4s, v10.4s\n"
+ "fmla v28.4s, v3.4s, v9.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x15, x11]\n"
+ "fmla v25.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr q9, [x26, x4]\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "fmla v26.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v10.4s\n"
+ "ldr q10, [x14, x4]\n"
+ "fmla v25.4s, v5.4s, v11.4s\n"
+ "fmla v17.4s, v4.4s, v11.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "fmla v24.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x14, x28]\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "fmla v27.4s, v6.4s, v9.4s\n"
+ "ldr q12, [x26, x28]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v26.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "fmla v28.4s, v7.4s, v10.4s\n"
+ "fmla v25.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x7, x17]\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "ldr q9, [x12, x4]\n"
+ "fmla v17.4s, v8.4s, v11.4s\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "fmla v16.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "ldr q12, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v31.4s, v7.4s, v9.4s\n"
+ "fmla v26.4s, v6.4s, v9.4s\n"
+ "fmla v20.4s, v4.4s, v9.4s\n"
+ "fmla v22.4s, v3.4s, v9.4s\n"
+ "fmla v21.4s, v1.4s, v9.4s\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x12, x28]\n"
+ "fmla v28.4s, v2.4s, v10.4s\n"
+ "fmla v25.4s, v1.4s, v10.4s\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x14]\n"
+ "fmla v18.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v8.4s, v9.4s\n"
+ "fmla v24.4s, v7.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v9.4s\n"
+ "fmla v19.4s, v4.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "ldr q11, [x9, x17]\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v17.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x17]\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x12]\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v18.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v19.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x26, x17]\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmla v18.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v8.4s, v11.4s\n"
+ "fmla v22.4s, v7.4s, v11.4s\n"
+ "fmla v23.4s, v6.4s, v11.4s\n"
+ "fmla v21.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x9, x11]\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v11.4s\n"
+ "fmla v18.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "fmla v24.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x26, x11]\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x15, x4]\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "add x26, x26, #0x10\n"
+ "fmla v19.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x15, x28]\n"
+ "fmla v27.4s, v8.4s, v12.4s\n"
+ "add x15, x15, #0x10\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v30.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x9, x4]\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v25.4s, v3.4s, v10.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v26.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x9, x28]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v17.4s, v5.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "add x9, x9, #0x10\n"
+ "fmla v16.4s, v2.4s, v11.4s\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v20.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmla v21.4s, v4.4s, v12.4s\n"
+ "fmla v27.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x14, x11]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v23.4s, v8.4s, v10.4s\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v19.4s, v7.4s, v10.4s\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v18.4s, v5.4s, v10.4s\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x7]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "add x16, x16, #0xa0\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "st1 { v28.4s }, [x8]\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q25, [x8, x5]\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "str q17, [x8, x23]\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v24.4s, v24.4s, v15.4s\n"
+ "str q29, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v22.4s, v22.4s, v15.4s\n"
+ "st1 { v31.4s }, [x10]\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "str q26, [x10, x5]\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "str q16, [x10, x23]\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "str q24, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v20.4s }, [x27]\n"
+ "str q22, [x27, x5]\n"
+ "str q23, [x27, x23]\n"
+ "str q19, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v21.4s }, [x24]\n"
+ "str q27, [x24, x5]\n"
+ "str q18, [x24, x23]\n"
+ "str q30, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v16.16b, v14.16b\n fmla v16.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v31.16b, v14.16b\n fmla v31.4s, v3.4s, v9.4s\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "fmla v16.4s, v5.4s, v12.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v6.4s, v9.4s\n"
+ "mov v28.16b, v14.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v2.4s, v9.4s\n"
+ "ldr q24, [x12, x17]\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "ld1 { v21.4s }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q20, [x26, x25]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "mov v26.16b, v14.16b\n fmla v26.4s, v6.4s, v21.4s\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v16.4s, v7.4s, v24.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "mov v11.16b, v14.16b\n fmla v11.4s, v3.4s, v12.4s\n"
+ "mov v10.16b, v14.16b\n fmla v10.4s, v0.4s, v12.4s\n"
+ "ldr q22, [x7, x4]\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v8.4s, v20.4s\n"
+ "ldr q21, [x7, x28]\n"
+ "fmla v31.4s, v6.4s, v24.4s\n"
+ "fmla v30.4s, v4.4s, v24.4s\n"
+ "fmla v18.4s, v3.4s, v24.4s\n"
+ "mov v12.16b, v14.16b\n fmla v12.4s, v1.4s, v24.4s\n"
+ "fmla v14.4s, v0.4s, v24.4s\n"
+ "fmla v28.4s, v8.4s, v24.4s\n"
+ "fmla v27.4s, v5.4s, v24.4s\n"
+ "fmla v26.4s, v2.4s, v24.4s\n"
+ "ld1 { v24.4s }, [x15]\n"
+ "fmla v16.4s, v8.4s, v9.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v17.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x15, x25]\n"
+ "fmla v19.4s, v2.4s, v21.4s\n"
+ "fmla v29.4s, v1.4s, v21.4s\n"
+ "ld1 { v20.4s }, [x9]\n"
+ "fmla v31.4s, v7.4s, v9.4s\n"
+ "fmla v11.4s, v6.4s, v9.4s\n"
+ "fmla v30.4s, v5.4s, v9.4s\n"
+ "fmla v18.4s, v4.4s, v9.4s\n"
+ "fmla v10.4s, v3.4s, v9.4s\n"
+ "fmla v12.4s, v2.4s, v9.4s\n"
+ "fmla v14.4s, v1.4s, v9.4s\n"
+ "fmla v25.4s, v0.4s, v9.4s\n"
+ "ldr q21, [x15, x17]\n"
+ "fmla v28.4s, v0.4s, v24.4s\n"
+ "fmla v27.4s, v6.4s, v20.4s\n"
+ "fmla v26.4s, v3.4s, v20.4s\n"
+ "ldr q20, [x9, x25]\n"
+ "fmla v16.4s, v1.4s, v21.4s\n"
+ "fmla v23.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v5.4s, v22.4s\n"
+ "fmla v11.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x15, x11]\n"
+ "fmla v17.4s, v4.4s, v21.4s\n"
+ "fmla v19.4s, v3.4s, v21.4s\n"
+ "fmla v31.4s, v0.4s, v21.4s\n"
+ "fmla v10.4s, v8.4s, v20.4s\n"
+ "fmla v25.4s, v5.4s, v20.4s\n"
+ "ldr q20, [x26, x4]\n"
+ "fmla v28.4s, v2.4s, v21.4s\n"
+ "fmla v16.4s, v2.4s, v22.4s\n"
+ "fmla v23.4s, v5.4s, v21.4s\n"
+ "ldr q21, [x14, x4]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v19.4s, v4.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v11.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x14, x28]\n"
+ "fmla v26.4s, v7.4s, v20.4s\n"
+ "fmla v12.4s, v6.4s, v20.4s\n"
+ "ldr q20, [x26, x28]\n"
+ "fmla v28.4s, v4.4s, v21.4s\n"
+ "fmla v16.4s, v3.4s, v21.4s\n"
+ "fmla v27.4s, v1.4s, v21.4s\n"
+ "fmla v30.4s, v0.4s, v21.4s\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
+ "fmla v17.4s, v6.4s, v21.4s\n"
+ "ldr q21, [x7, x17]\n"
+ "fmla v14.4s, v8.4s, v20.4s\n"
+ "fmla v25.4s, v7.4s, v20.4s\n"
+ "ldr q20, [x12, x4]\n"
+ "fmla v19.4s, v8.4s, v22.4s\n"
+ "fmla v29.4s, v7.4s, v22.4s\n"
+ "fmla v31.4s, v5.4s, v22.4s\n"
+ "fmla v11.4s, v4.4s, v22.4s\n"
+ "fmla v18.4s, v2.4s, v22.4s\n"
+ "fmla v10.4s, v1.4s, v22.4s\n"
+ "ldr q22, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v28.4s, v7.4s, v20.4s\n"
+ "fmla v16.4s, v6.4s, v20.4s\n"
+ "fmla v27.4s, v4.4s, v20.4s\n"
+ "fmla v30.4s, v3.4s, v20.4s\n"
+ "fmla v26.4s, v1.4s, v20.4s\n"
+ "fmla v12.4s, v0.4s, v20.4s\n"
+ "ldr q20, [x12, x28]\n"
+ "fmla v23.4s, v2.4s, v21.4s\n"
+ "fmla v17.4s, v1.4s, v21.4s\n"
+ "fmla v19.4s, v0.4s, v21.4s\n"
+ "ld1 { v21.4s }, [x14]\n"
+ "fmla v14.4s, v2.4s, v20.4s\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "fmla v28.4s, v3.4s, v21.4s\n"
+ "fmla v27.4s, v0.4s, v21.4s\n"
+ "fmla v31.4s, v8.4s, v20.4s\n"
+ "fmla v11.4s, v7.4s, v20.4s\n"
+ "fmla v18.4s, v5.4s, v20.4s\n"
+ "fmla v10.4s, v4.4s, v20.4s\n"
+ "fmla v25.4s, v1.4s, v20.4s\n"
+ "ldr q24, [x9, x17]\n"
+ "fmla v17.4s, v2.4s, v22.4s\n"
+ "fmla v19.4s, v1.4s, v22.4s\n"
+ "ldr q20, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v23.4s, v6.4s, v21.4s\n"
+ "ld1 { v21.4s }, [x12]\n"
+ "fmla v12.4s, v4.4s, v24.4s\n"
+ "fmla v14.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v8.4s, v20.4s\n"
+ "fmla v11.4s, v5.4s, v20.4s\n"
+ "fmla v10.4s, v2.4s, v20.4s\n"
+ "ldr q20, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.4s, v6.4s, v21.4s\n"
+ "fmla v27.4s, v3.4s, v21.4s\n"
+ "fmla v26.4s, v0.4s, v21.4s\n"
+ "ldr q22, [x26, x17]\n"
+ "fmla v25.4s, v2.4s, v20.4s\n"
+ "fmla v12.4s, v7.4s, v22.4s\n"
+ "fmla v14.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v8.4s, v24.4s\n"
+ "fmla v30.4s, v7.4s, v24.4s\n"
+ "fmla v18.4s, v6.4s, v24.4s\n"
+ "fmla v26.4s, v5.4s, v24.4s\n"
+ "ldr q21, [x9, x11]\n"
+ "fmla v10.4s, v5.4s, v20.4s\n"
+ "fmla v12.4s, v5.4s, v21.4s\n"
+ "fmla v14.4s, v4.4s, v21.4s\n"
+ "fmla v25.4s, v3.4s, v21.4s\n"
+ "fmla v11.4s, v8.4s, v20.4s\n"
+ "ldr q20, [x26, x11]\n"
+ "fmla v26.4s, v8.4s, v22.4s\n"
+ "ldr q9, [x15, x4]\n"
+ "fmla v30.4s, v8.4s, v21.4s\n"
+ "fmla v18.4s, v7.4s, v21.4s\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.4s, v6.4s, v21.4s\n"
+ "ldr q21, [x15, x28]\n"
+ "fmla v12.4s, v8.4s, v20.4s\n"
+ "add x15, x15, #0x10\n"
+ "fmla v14.4s, v7.4s, v20.4s\n"
+ "fmla v25.4s, v6.4s, v20.4s\n"
+ "ldr q24, [x9, x4]\n"
+ "fmla v23.4s, v4.4s, v9.4s\n"
+ "fmla v17.4s, v3.4s, v9.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v9.4s\n"
+ "fmla v16.4s, v0.4s, v9.4s\n"
+ "ldr q0, [x9, x28]\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v19.4s, v5.4s, v21.4s\n"
+ "fmla v29.4s, v4.4s, v21.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "add x9, x9, #0x10\n"
+ "fmla v31.4s, v2.4s, v21.4s\n"
+ "fmla v11.4s, v1.4s, v21.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmla v27.4s, v7.4s, v24.4s\n"
+ "fmla v30.4s, v6.4s, v24.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v26.4s, v4.4s, v24.4s\n"
+ "fmla v12.4s, v3.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v18.4s, v8.4s, v0.4s\n"
+ "fmla v10.4s, v7.4s, v0.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmla v14.4s, v5.4s, v0.4s\n"
+ "fmla v25.4s, v4.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v12.4s, v12.4s, v13.4s\n"
+ "fmax v14.4s, v14.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "st1 { v23.4s }, [x8]\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q17, [x8, x5]\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "str q19, [x8, x23]\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v11.4s, v11.4s, v15.4s\n"
+ "str q29, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "st1 { v28.4s }, [x10]\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v10.4s, v10.4s, v15.4s\n"
+ "str q16, [x10, x5]\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "fmin v12.4s, v12.4s, v15.4s\n"
+ "str q31, [x10, x23]\n"
+ "fmin v14.4s, v14.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "str q11, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v27.4s }, [x27]\n"
+ "str q30, [x27, x5]\n"
+ "str q18, [x27, x23]\n"
+ "str q10, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v26.4s }, [x24]\n"
+ "str q12, [x24, x5]\n"
+ "str q14, [x24, x23]\n"
+ "str q25, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 73f\n"
+ "ldr q14, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "add x23, x14, x17\n"
+ "add x22, x7, XZR\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x21, x7, x25\n"
+ "add x20, x14, x11\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x23], #0x8\n"
+ "ldr d10, [x22], #0x8\n"
+ "ldr d11, [x21], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x23]\n"
+ "ld1 { v10.s }[2], [x22]\n"
+ "ld1 { v11.s }[2], [x21]\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+ "ldr s9, [x23, #0x0]\n"
+ "ldr s10, [x22, #0x0]\n"
+ "ldr s11, [x21, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+ "mov v16.16b, v14.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "add x20, x26, XZR\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v21.16b, v14.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v22.16b, v14.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v14.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "mov v20.16b, v14.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v14.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v5.4s, v12.4s\n"
+ "fmla v22.4s, v4.4s, v12.4s\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+ "mov v28.16b, v14.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "add x20, x26, x25\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+ "mov v31.16b, v14.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x12, x17\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v20.4s, v8.4s, v9.4s\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "add x20, x7, x4\n"
+ "fmla v22.4s, v6.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "fmla v25.4s, v4.4s, v9.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v0.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v16.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v0.4s, v12.4s\n"
+ "add x20, x7, x28\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: End
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "add x20, x12, x11\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v22.4s, v7.4s, v10.4s\n"
+ "add x20, x15, XZR\n"
+ "fmla v23.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v16.4s, v3.4s, v9.4s\n"
+ "fmla v20.4s, v0.4s, v9.4s\n"
+ "add x20, x15, x25\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: End
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "add x20, x9, XZR\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x15, x17\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+ "fmla v16.4s, v5.4s, v10.4s\n"
+ "fmla v17.4s, v4.4s, v10.4s\n"
+ "add x20, x9, x25\n"
+ "fmla v18.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v2.4s, v10.4s\n"
+ "fmla v21.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+ "fmla v27.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x15, x11\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v17.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "add x20, x26, x4\n"
+ "fmla v19.4s, v3.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v12.4s\n"
+ "fmla v23.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+ "fmla v28.4s, v7.4s, v11.4s\n"
+ "fmla v29.4s, v6.4s, v11.4s\n"
+ "add x20, x14, x4\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "add x20, x26, x28\n"
+ "fmla v20.4s, v4.4s, v10.4s\n"
+ "fmla v21.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "add x20, x14, x28\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "add x20, x7, x17\n"
+ "fmla v22.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v16.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "add x20, x12, x4\n"
+ "fmla v18.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v20.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "add x20, x7, x11\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "add x20, x14, XZR\n"
+ "fmla v19.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v16.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "add x20, x12, x28\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "add x20, x14, x25\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 50f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 50f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "50:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "add x20, x12, XZR\n"
+ "fmla v27.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v20.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v3.4s, v10.4s\n"
+ "add x20, x9, x17\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 54f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 54f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "54:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v24.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v11.4s\n"
+ "add x20, x12, x25\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v12.4s\n"
+ "add x20, x26, x17\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 58f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 58f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x9, x11\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v11.4s\n"
+ "add x20, x26, x11\n"
+ "fmla v27.4s, v6.4s, v11.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 62f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 62f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "62:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "add x20, x15, x4\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "add x20, x15, x28\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 66f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 66f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "66:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "add x20, x9, x4\n"
+ "fmla v22.4s, v2.4s, v11.4s\n"
+ "fmla v23.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "add x20, x9, x28\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 70f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 70f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "70:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "fmin v22.4s, v22.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v24.4s, v24.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "tbz %x[n_channels], #1, 71f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.d }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.d }[0], [x22], x5\n"
+ "st1 { v24.d }[0], [x21], x5\n"
+ "add x8, x8, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v28.d }[0], [x20], x5\n"
+ "add x27, x27, #0x8\n"
+ "add x24, x24, #0x8\n"
+ "st1 { v17.d }[0], [x23], x5\n"
+ "st1 { v21.d }[0], [x22], x5\n"
+ "st1 { v25.d }[0], [x21], x5\n"
+ "st1 { v29.d }[0], [x20], x5\n"
+ "st1 { v18.d }[0], [x23], x5\n"
+ "st1 { v22.d }[0], [x22], x5\n"
+ "st1 { v26.d }[0], [x21], x5\n"
+ "st1 { v30.d }[0], [x20], x5\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "st1 { v23.d }[0], [x22]\n"
+ "st1 { v27.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[2], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[2], [x22], x5\n"
+ "st1 { v24.s }[2], [x21], x5\n"
+ "st1 { v28.s }[2], [x20], x5\n"
+ "st1 { v17.s }[2], [x23], x5\n"
+ "st1 { v21.s }[2], [x22], x5\n"
+ "st1 { v25.s }[2], [x21], x5\n"
+ "st1 { v29.s }[2], [x20], x5\n"
+ "st1 { v18.s }[2], [x23], x5\n"
+ "st1 { v22.s }[2], [x22], x5\n"
+ "st1 { v26.s }[2], [x21], x5\n"
+ "st1 { v30.s }[2], [x20], x5\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[0], [x22], x5\n"
+ "st1 { v24.s }[0], [x21], x5\n"
+ "st1 { v28.s }[0], [x20], x5\n"
+ "st1 { v17.s }[0], [x23], x5\n"
+ "st1 { v21.s }[0], [x22], x5\n"
+ "st1 { v25.s }[0], [x21], x5\n"
+ "st1 { v29.s }[0], [x20], x5\n"
+ "st1 { v18.s }[0], [x23], x5\n"
+ "st1 { v22.s }[0], [x22], x5\n"
+ "st1 { v26.s }[0], [x21], x5\n"
+ "st1 { v30.s }[0], [x20], x5\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "st1 { v27.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "72:" // Tile loop: Oddments: Store: Bit 1: End
+ "73:" // Tile loop: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..76045f30d6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "lsr x7, %x[n_channels], #0x2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "sub x14, XZR, x6\n"
+ "cbz x7, 3f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "cmp x6, x7, LSL #4\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "add x17, x17, #0xa0\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr q10, [x20, x15]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x15]\n"
+ "ldr q12, [x20, x15]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v23.16b, v30.16b\n fmla v23.4s, v4.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v8.4s, v9.4s\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v1.4s, v9.4s\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v0.4s, v9.4s\n"
+ "mov v16.16b, v30.16b\n fmla v16.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "mov v15.16b, v30.16b\n fmla v15.4s, v6.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x21, [x16, #0x58]\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v10.16b, v30.16b\n fmla v10.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x23, x15]\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v20.4s, v1.4s, v12.4s\n"
+ "fmla v16.4s, v8.4s, v12.4s\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v15.4s, v7.4s, v12.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v6.4s, v22.4s\n"
+ "ldr q22, [x20, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v23.4s, v7.4s, v9.4s\n"
+ "fmla v10.4s, v6.4s, v12.4s\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v3.4s, v12.4s\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v24.16b, v30.16b\n fmla v24.4s, v8.4s, v18.4s\n"
+ "ldr q12, [x26, x15]\n"
+ "fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v28.4s, v4.4s, v9.4s\n"
+ "fmla v20.4s, v3.4s, v9.4s\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "ldr q30, [x17, #0x0]\n"
+ "fmla v27.4s, v8.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x25, x15]\n"
+ "fmla v17.4s, v1.4s, v11.4s\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v16.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x21, x15]\n"
+ "fmla v15.4s, v2.4s, v12.4s\n"
+ "ldr x21, [x16, #0x98]\n"
+ "fmla v23.4s, v8.4s, v22.4s\n"
+ "fmla v10.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.4s, v7.4s, v22.4s\n"
+ "fmla v21.4s, v6.4s, v22.4s\n"
+ "fmla v28.4s, v5.4s, v22.4s\n"
+ "fmla v20.4s, v4.4s, v22.4s\n"
+ "fmla v19.4s, v3.4s, v22.4s\n"
+ "fmla v26.4s, v2.4s, v22.4s\n"
+ "fmla v18.4s, v1.4s, v22.4s\n"
+ "fmla v24.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.4s, v3.4s, v9.4s\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "ldr q9, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v16.4s, v4.4s, v22.4s\n"
+ "fmla v15.4s, v3.4s, v22.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v10.4s, v5.4s, v11.4s\n"
+ "fmla v21.4s, v2.4s, v11.4s\n"
+ "ldr q12, [x22, x15]\n"
+ "fmla v25.4s, v0.4s, v22.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.4s, v8.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "ldr q11, [x20, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v27.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v16.4s, v5.4s, v12.4s\n"
+ "fmla v15.4s, v4.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "fmla v10.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "fmla v21.4s, v0.4s, v12.4s\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "ldr q12, [x27, x15]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla v17.4s, v7.4s, v22.4s\n"
+ "fmla v16.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v4.4s, v22.4s\n"
+ "fmla v23.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v28.4s, v0.4s, v22.4s\n"
+ "ldr q11, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v15.4s, v8.4s, v9.4s\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "ldr q12, [x25, x15]\n"
+ "fmla v19.4s, v1.4s, v9.4s\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v10.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v5.4s, v9.4s\n"
+ "fmla v21.4s, v4.4s, v9.4s\n"
+ "fmla v20.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v17.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "fmla v15.4s, v0.4s, v11.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v27.4s, v7.4s, v12.4s\n"
+ "ldr x25, [x16, #0xf8]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v26.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v2.4s, v9.4s\n"
+ "fmla v15.4s, v1.4s, v9.4s\n"
+ "fmla v10.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x20, x15]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v17.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "ldr x22, [x16, #0x110]\n"
+ "fmla v21.4s, v7.4s, v11.4s\n"
+ "fmla v20.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q12, [x28, x15]\n"
+ "fmla v19.4s, v2.4s, v9.4s\n"
+ "ldr x21, [x16, #0x118]\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "fmla v26.4s, v4.4s, v12.4s\n"
+ "fmla v18.4s, v3.4s, v12.4s\n"
+ "fmla v10.4s, v8.4s, v9.4s\n"
+ "fmla v21.4s, v5.4s, v9.4s\n"
+ "ldr q11, [x27, x15]\n"
+ "fmla v27.4s, v6.4s, v22.4s\n"
+ "fmla v31.4s, v3.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v20.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "fmla v19.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v2.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v22.4s\n"
+ "fmla v18.4s, v6.4s, v22.4s\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x24, x15]\n"
+ "fmla v29.4s, v8.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v28.4s, v8.4s, v12.4s\n"
+ "fmla v20.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x20, x15]\n"
+ "ldp x20, x24, [x16, #0x0]\n"
+ "ldr q9, [x20, x6]\n"
+ "fmla v21.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x25, x15]\n"
+ "fmla v17.4s, v4.4s, v22.4s\n"
+ "fmla v16.4s, v3.4s, v22.4s\n"
+ "fmla v15.4s, v5.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v10.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v8.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v18.4s, v7.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "fmax v15.4s, v15.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v22.4s\n"
+ "fmla v23.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "ldr q2, [x17, #0x30]\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "fmla v28.4s, v6.4s, v11.4s\n"
+ "ldr q6, [x17, #0x70]\n"
+ "fmla v20.4s, v8.4s, v22.4s\n"
+ "ldr q8, [x17, #0x90]\n"
+ "fmla v19.4s, v7.4s, v22.4s\n"
+ "ldr q7, [x17, #0x80]\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "str q17, [x12, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmin v15.4s, v15.4s, v14.4s\n"
+ "fmin v10.4s, v10.4s, v14.4s\n"
+ "str q16, [x11, x14]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "str q15, [x10, x14]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "str q10, [x9, x14]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v26.4s, v3.4s, v11.4s\n"
+ "ldr q3, [x17, #0x40]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmla v18.4s, v5.4s, v22.4s\n"
+ "ldr q5, [x17, #0x60]\n"
+ "fmla v24.4s, v4.4s, v22.4s\n"
+ "ldr q10, [x24, x6]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "str q27, [x23, x14]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "str q23, [x22, x14]\n"
+ "ldr x25, [x8, #0x40]\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "str q25, [x21, x14]\n"
+ "ldr x23, [x8, #0x48]\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "str q21, [x20, x14]\n"
+ "ldr x22, [x8, #0x50]\n"
+ "ldr x24, [x8, #0x58]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x6]\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "ldr q12, [x20, x6]\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "str q31, [x25, x14]\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "str q28, [x23, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "str q20, [x22, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "str q19, [x24, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x7, LSL #4\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "add x15, x15, #0x10\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q29, [x23, x14]\n"
+ "add x17, x17, #0xa0\n"
+ "str q26, [x22, x14]\n"
+ "str q18, [x21, x14]\n"
+ "str q24, [x20, x14]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v30.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v8.4s, v9.4s\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v15.16b, v30.16b\n fmla v15.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v0.4s, v9.4s\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x21, [x16, #0x48]\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v6.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v12.4s\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v2.4s, v9.4s\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q16, [x23, x15]\n"
+ "fmla v15.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v19.4s, v1.4s, v12.4s\n"
+ "fmla v20.4s, v8.4s, v12.4s\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v21.4s, v7.4s, v12.4s\n"
+ "mov v10.16b, v30.16b\n fmla v10.4s, v6.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v31.4s, v7.4s, v24.4s\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v9.16b, v30.16b\n fmla v9.4s, v3.4s, v12.4s\n"
+ "mov v11.16b, v30.16b\n fmla v11.4s, v0.4s, v12.4s\n"
+ "ldr q23, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v12.16b, v30.16b\n fmla v12.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x26, x15]\n"
+ "fmla v15.4s, v6.4s, v24.4s\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla v29.4s, v4.4s, v24.4s\n"
+ "fmla v19.4s, v3.4s, v24.4s\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v1.4s, v24.4s\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v0.4s, v24.4s\n"
+ "fmla v18.4s, v8.4s, v24.4s\n"
+ "fmla v27.4s, v5.4s, v24.4s\n"
+ "fmla v10.4s, v2.4s, v24.4s\n"
+ "ldr q24, [x25, x15]\n"
+ "fmla v17.4s, v1.4s, v23.4s\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v20.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x20, x15]\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v31.4s, v8.4s, v22.4s\n"
+ "fmla v28.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v15.4s, v7.4s, v22.4s\n"
+ "fmla v9.4s, v6.4s, v22.4s\n"
+ "fmla v29.4s, v5.4s, v22.4s\n"
+ "fmla v19.4s, v4.4s, v22.4s\n"
+ "fmla v11.4s, v3.4s, v22.4s\n"
+ "fmla v26.4s, v2.4s, v22.4s\n"
+ "fmla v25.4s, v1.4s, v22.4s\n"
+ "fmla v12.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.4s, v3.4s, v24.4s\n"
+ "fmla v18.4s, v0.4s, v24.4s\n"
+ "fmla v27.4s, v6.4s, v16.4s\n"
+ "fmla v10.4s, v3.4s, v16.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v20.4s, v4.4s, v22.4s\n"
+ "fmla v21.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v28.4s, v5.4s, v23.4s\n"
+ "fmla v9.4s, v2.4s, v23.4s\n"
+ "ldr q23, [x22, x15]\n"
+ "fmla v15.4s, v0.4s, v22.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v11.4s, v8.4s, v16.4s\n"
+ "fmla v12.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x21, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v18.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ "fmla v20.4s, v5.4s, v23.4s\n"
+ "fmla v21.4s, v4.4s, v23.4s\n"
+ "fmla v31.4s, v2.4s, v23.4s\n"
+ "fmla v28.4s, v3.4s, v23.4s\n"
+ "fmla v15.4s, v1.4s, v23.4s\n"
+ "fmla v9.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x20, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v10.4s, v7.4s, v16.4s\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x27, x15]\n"
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v17.4s, v7.4s, v22.4s\n"
+ "fmla v20.4s, v6.4s, v22.4s\n"
+ "fmla v18.4s, v4.4s, v22.4s\n"
+ "fmla v31.4s, v3.4s, v22.4s\n"
+ "fmla v27.4s, v1.4s, v22.4s\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v21.4s, v8.4s, v23.4s\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "fmla v12.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v11.4s, v1.4s, v23.4s\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v28.4s, v7.4s, v23.4s\n"
+ "fmla v15.4s, v5.4s, v23.4s\n"
+ "fmla v9.4s, v4.4s, v23.4s\n"
+ "fmla v19.4s, v2.4s, v23.4s\n"
+ "ldr q23, [x24, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v17.4s, v2.4s, v22.4s\n"
+ "fmla v20.4s, v1.4s, v22.4s\n"
+ "fmla v21.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v18.4s, v7.4s, v16.4s\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla v31.4s, v6.4s, v16.4s\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v29.4s, v3.4s, v16.4s\n"
+ "fmla v10.4s, v1.4s, v16.4s\n"
+ "fmla v26.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x22, x15]\n"
+ "fmla v11.4s, v4.4s, v16.4s\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v28.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x21, x15]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla v17.4s, v6.4s, v22.4s\n"
+ "fmla v18.4s, v3.4s, v22.4s\n"
+ "fmla v27.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x20, x15]\n"
+ "fmla v15.4s, v8.4s, v16.4s\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla v9.4s, v7.4s, v16.4s\n"
+ "fmla v19.4s, v5.4s, v16.4s\n"
+ "fmla v12.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x28, x15]\n"
+ "fmla v11.4s, v2.4s, v23.4s\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v10.4s, v0.4s, v22.4s\n"
+ "fmla v26.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v3.4s, v16.4s\n"
+ "fmla v28.4s, v8.4s, v23.4s\n"
+ "fmla v9.4s, v5.4s, v23.4s\n"
+ "ldr q23, [x27, x15]\n"
+ "fmla v18.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v3.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v29.4s, v7.4s, v16.4s\n"
+ "fmla v19.4s, v6.4s, v16.4s\n"
+ "fmla v10.4s, v5.4s, v16.4s\n"
+ "fmla v11.4s, v5.4s, v23.4s\n"
+ "fmla v12.4s, v2.4s, v23.4s\n"
+ "fmla v26.4s, v7.4s, v22.4s\n"
+ "fmla v25.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v10.4s, v8.4s, v22.4s\n"
+ "ldr q30, [x23, x15]\n"
+ "fmla v29.4s, v8.4s, v16.4s\n"
+ "fmla v19.4s, v7.4s, v16.4s\n"
+ "fmla v11.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v5.4s, v16.4s\n"
+ "fmla v25.4s, v4.4s, v16.4s\n"
+ "fmla v12.4s, v3.4s, v16.4s\n"
+ "ldr q24, [x22, x15]\n"
+ "fmla v9.4s, v8.4s, v23.4s\n"
+ "ldr q16, [x24, x15]\n"
+ "fmla v17.4s, v4.4s, v30.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v20.4s, v3.4s, v30.4s\n"
+ "fmla v21.4s, v5.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v24.4s\n"
+ "fmla v26.4s, v8.4s, v16.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmla v25.4s, v7.4s, v16.4s\n"
+ "fmla v12.4s, v6.4s, v16.4s\n"
+ "ldr q23, [x21, x15]\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v18.4s, v1.4s, v30.4s\n"
+ "fmla v31.4s, v0.4s, v30.4s\n"
+ "ldr q16, [x20, x15]\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmla v15.4s, v2.4s, v24.4s\n"
+ "fmla v9.4s, v1.4s, v24.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "str q17, [x12, x14]\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v29.4s, v6.4s, v23.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q20, [x11, x14]\n"
+ "fmla v19.4s, v8.4s, v16.4s\n"
+ "fmla v11.4s, v7.4s, v16.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "str q21, [x10, x14]\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "str q28, [x9, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmax v15.4s, v15.4s, v13.4s\n"
+ "fmax v9.4s, v9.4s, v13.4s\n"
+ "ldr x22, [x8, #0x28]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v10.4s, v4.4s, v23.4s\n"
+ "fmla v26.4s, v3.4s, v23.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmla v25.4s, v5.4s, v16.4s\n"
+ "fmla v12.4s, v4.4s, v16.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "str q18, [x23, x14]\n"
+ "fmin v15.4s, v15.4s, v14.4s\n"
+ "fmin v9.4s, v9.4s, v14.4s\n"
+ "str q31, [x22, x14]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "str q15, [x21, x14]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v11.4s, v11.4s, v13.4s\n"
+ "str q9, [x20, x14]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q27, [x23, x14]\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmin v11.4s, v11.4s, v14.4s\n"
+ "str q29, [x22, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "str q19, [x21, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v12.4s, v12.4s, v13.4s\n"
+ "str q11, [x20, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v10.4s, v10.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q10, [x23, x14]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v12.4s, v12.4s, v14.4s\n"
+ "str q26, [x22, x14]\n"
+ "add x15, x15, #0x10\n"
+ "str q25, [x21, x14]\n"
+ "str q12, [x20, x14]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 72f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "mov x14, x15\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x23], #0x8\n"
+ "ld1 { v10.d }[0], [x22], #0x8\n"
+ "ld1 { v11.d }[0], [x21], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x22], #0x4\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+ "ld1 { v9.s }[0], [x23], #0x4\n"
+ "ld1 { v10.s }[0], [x22], #0x4\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+ "mov v16.16b, v30.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "ldr x20, [x16, #0x20]\n"
+ "add x20, x20, x15\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v22.16b, v30.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v30.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v5.4s, v12.4s\n"
+ "fmla v22.4s, v4.4s, v12.4s\n"
+ "mov v23.16b, v30.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (5, 0): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load input (5, 0): Bit 1: End
+ "ldr x20, [x16, #0x28]\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (5, 5): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (5, 5): Bit 1: End
+ "ldr x20, [x16, #0x30]\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Load input (3, 2): Bit 1: End
+ "ldr x20, [x16, #0x38]\n"
+ "fmla v20.4s, v8.4s, v9.4s\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v22.4s, v6.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "fmla v25.4s, v4.4s, v9.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v0.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "13:" // Oddments: Load input (0, 1): Bit 1: End
+ "ldr x20, [x16, #0x40]\n"
+ "fmla v16.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v0.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (0, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (0, 4): Bit 1: End
+ "ldr x20, [x16, #0x48]\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "17:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr x20, [x16, #0x50]\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v22.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v23.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (1, 0): Bit 1: End
+ "ldr x20, [x16, #0x58]\n"
+ "fmla v16.4s, v3.4s, v9.4s\n"
+ "fmla v20.4s, v0.4s, v9.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (1, 5): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "21:" // Oddments: Load input (1, 5): Bit 1: End
+ "ldr x20, [x16, #0x60]\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "23:" // Oddments: Load input (4, 0): Bit 1: End
+ "ldr x20, [x16, #0x68]\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (1, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (1, 2): Bit 1: End
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v16.4s, v5.4s, v10.4s\n"
+ "fmla v17.4s, v4.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v18.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v2.4s, v10.4s\n"
+ "fmla v21.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 5): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "27:" // Oddments: Load input (4, 5): Bit 1: End
+ "ldr x20, [x16, #0x78]\n"
+ "fmla v27.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "29:" // Oddments: Load input (1, 3): Bit 1: End
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v17.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v19.4s, v3.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v12.4s\n"
+ "fmla v23.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (5, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "31:" // Oddments: Load input (5, 1): Bit 1: End
+ "ldr x20, [x16, #0x88]\n"
+ "fmla v28.4s, v7.4s, v11.4s\n"
+ "fmla v29.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "33:" // Oddments: Load input (2, 1): Bit 1: End
+ "ldr x20, [x16, #0x90]\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v20.4s, v4.4s, v10.4s\n"
+ "fmla v21.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (5, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "35:" // Oddments: Load input (5, 4): Bit 1: End
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "37:" // Oddments: Load input (2, 4): Bit 1: End
+ "ldr x20, [x16, #0xa0]\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v22.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "39:" // Oddments: Load input (0, 2): Bit 1: End
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla v16.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v18.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "41:" // Oddments: Load input (3, 1): Bit 1: End
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v20.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (0, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "43:" // Oddments: Load input (0, 3): Bit 1: End
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v19.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "45:" // Oddments: Load input (2, 0): Bit 1: End
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla v16.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "47:" // Oddments: Load input (3, 4): Bit 1: End
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 49f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 49f\n"
+ "48:" // Oddments: Load input (2, 5): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "49:" // Oddments: Load input (2, 5): Bit 1: End
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v27.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "51:" // Oddments: Load input (3, 0): Bit 1: End
+ "ldr x20, [x16, #0xd8]\n"
+ "fmla v20.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 53f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 53f\n"
+ "52:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "53:" // Oddments: Load input (4, 2): Bit 1: End
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla v24.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (3, 5): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "55:" // Oddments: Load input (3, 5): Bit 1: End
+ "ldr x20, [x16, #0xe8]\n"
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 57f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 57f\n"
+ "56:" // Oddments: Load input (5, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "57:" // Oddments: Load input (5, 2): Bit 1: End
+ "ldr x20, [x16, #0xf0]\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "59:" // Oddments: Load input (4, 3): Bit 1: End
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v27.4s, v6.4s, v11.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 61f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 61f\n"
+ "60:" // Oddments: Load input (5, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "61:" // Oddments: Load input (5, 3): Bit 1: End
+ "ldr x20, [x16, #0x100]\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (1, 1): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "63:" // Oddments: Load input (1, 1): Bit 1: End
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 65f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 65f\n"
+ "64:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "65:" // Oddments: Load input (1, 4): Bit 1: End
+ "ldr x20, [x16, #0x110]\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v22.4s, v2.4s, v11.4s\n"
+ "fmla v23.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "67:" // Oddments: Load input (4, 1): Bit 1: End
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 69f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 69f\n"
+ "68:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "69:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 70f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Store: Bit 1: Unset
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "71:" // Oddments: Store: Bit 1: End
+ "72:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
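
[Editor's note] The "Oddments" blocks above are the tail path for the 1-3 channels left over once the main loop has consumed n_channels in groups of four 32-bit lanes: bit 1 of n_channels gates a 64-bit (two-lane) ld1/st1 and bit 0 gates a final single-lane access, so the kernel never issues an out-of-bounds load or store. A minimal C++ sketch of the same predication pattern follows; the helper name is ours, not part of the patch.

    #include <cstddef>

    // Hedged sketch of the tbz-driven tail copy used by the "Oddments" paths:
    // bit 1 of n selects a two-element chunk, bit 0 a final single element.
    static void copy_tail(float *dst, const float *src, unsigned int n)
    {
        size_t i = 0;
        if (n & 2)   // mirrors: tbz %x[n_channels], #1, <skip>
        {
            dst[i]     = src[i];
            dst[i + 1] = src[i + 1];
            i += 2;
        }
        if (n & 1)   // mirrors: tbz %x[n_channels], #0, <skip>
        {
            dst[i] = src[i];
        }
    }
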
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f727efea80
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
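
[Editor's note] This header wires the two assembly entry points into the depthwise framework through a strategy class whose geometry is fixed at compile time: a 3x3 kernel at stride 2 producing a 2x2 output tile, which implies a 5x5 input window per tile ((2 - 1) * 2 + 3 = 5, matching the 25 input pointers the indirect variant below remaps). A hedged usage sketch, assuming the header is on the include path and that CPUInfo is visible to the caller as it is to the header:

    #include "a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"

    using Strategy = arm_conv::depthwise::a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst;

    void example(const CPUInfo *ci)
    {
        // Geometry is constexpr, so a dispatcher can reason about it statically.
        static_assert(Strategy::kernel_rows == 3 && Strategy::stride_rows == 2 &&
                      Strategy::output_rows == 2, "fixed 3x3/s2/2x2 geometry");

        Strategy strategy(ci); // the CPUInfo argument is accepted but unused here
        auto direct_fn   = strategy.get_direct_kernel();   // tile-loop entry point
        auto indirect_fn = strategy.get_indirect_kernel(); // pointer-array entry point
        (void) direct_fn;
        (void) indirect_fn;
    }
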
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..5ab61fad4c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,615 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x23, #0x0\n"
+ "mov x27, #0x0\n"
+ "1:" // Tile loop
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
+ "mov x25, #0x2\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x23, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x27, x6, x22\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x6, x6, #0x2\n"
+ "mul x20, x23, x21\n" // offset = tile_i * ld_output_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x16, x8, x24, LSL #2\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x27, x7, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x2\n"
+ "add x14, x16, x24, LSL #2\n"
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "add x13, x6, x6\n"
+ "add x12, x14, x24, LSL #2\n"
+ "add x11, x13, x6\n"
+ "add x17, x17, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x10, x12, x24, LSL #2\n"
+ "add x9, x11, x6\n"
+ "add x28, x17, x21, LSL #2\n"
+ "lsl x7, x7, #0x2\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q31, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldr q9, [x14, x13]\n"
+ "ld1 { v10.4s }, [x8]\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q12, [x8, x11]\n"
+ "ldr q13, [x8, x9]\n"
+ "ld1 { v14.4s }, [x16]\n"
+ "ldr q15, [x16, x6]\n"
+ "ldr q16, [x8, x13]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
+ "add x23, x23, #0x10\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x8]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q21, [x16, x9]\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ld1 { v20.4s }, [x12]\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.4s, v4.4s, v15.4s\n"
+ "ld1 { v25.4s }, [x14]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x12, x6]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "ldr q18, [x14, x6]\n"
+ "fmla v28.4s, v5.4s, v21.4s\n"
+ "ldr q24, [x14, x11]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "ldr q31, [x15, #0x0]\n"
+ "cmp x23, x22, LSL #4\n"
+ "fmla v29.4s, v5.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x11]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v23.4s, v3.4s, v20.4s\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
+ "ldr q21, [x10, x6]\n"
+ "fmla v23.4s, v0.4s, v25.4s\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v22.4s, v1.4s, v24.4s\n"
+ "add x21, x21, #0x10\n"
+ "fmla v23.4s, v4.4s, v19.4s\n"
+ "ldr q20, [x14, x9]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v22.4s, v5.4s, v16.4s\n"
+ "ldr q19, [x10, x11]\n"
+ "fmla v29.4s, v6.4s, v25.4s\n"
+ "ld1 { v17.4s }, [x10]\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v22.4s, v2.4s, v20.4s\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v29.4s, v7.4s, v18.4s\n"
+ "ldr q16, [x12, x13]\n"
+ "fmla v23.4s, v6.4s, v17.4s\n"
+ "ldr q18, [x10, x13]\n"
+ "fmla v22.4s, v3.4s, v16.4s\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
+ "ldr q13, [x8, x9]\n"
+ "fmla v22.4s, v7.4s, v19.4s\n"
+ "ld1 { v14.4s }, [x16]\n"
+ "fmla v28.4s, v7.4s, v24.4s\n"
+ "ldr q12, [x8, x11]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x8, x13]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "fmla v28.4s, v8.4s, v20.4s\n"
+ "ldr q17, [x10, x9]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v17.4s\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q15, [x16, x6]\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x13]\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v23.4s, v23.4s, v27.4s\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "add x12, x12, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v29.4s }, [x17]\n"
+ "add x15, x15, #0xa0\n"
+ "str q28, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "st1 { v23.4s }, [x28]\n"
+ "str q22, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x16, x9]\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ld1 { v19.4s }, [x12]\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.4s, v4.4s, v15.4s\n"
+ "ld1 { v25.4s }, [x14]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "ldr q18, [x12, x6]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "ldr q24, [x14, x6]\n"
+ "fmla v28.4s, v5.4s, v20.4s\n"
+ "ldr q23, [x14, x11]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v5.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x11]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v21.4s, v4.4s, v17.4s\n"
+ "ldr q20, [x10, x6]\n"
+ "fmla v22.4s, v0.4s, v25.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v22.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x14, x9]\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "ldr q18, [x10, x11]\n"
+ "fmla v29.4s, v6.4s, v25.4s\n"
+ "ld1 { v17.4s }, [x10]\n"
+ "fmla v22.4s, v1.4s, v24.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "fmla v29.4s, v7.4s, v24.4s\n"
+ "ldr q16, [x12, x13]\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmla v22.4s, v6.4s, v17.4s\n"
+ "ldr q17, [x10, x13]\n"
+ "fmla v21.4s, v3.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmla v22.4s, v7.4s, v20.4s\n"
+ "fmla v21.4s, v7.4s, v18.4s\n"
+ "st1 { v29.4s }, [x17]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.4s, v7.4s, v23.4s\n"
+ "fmla v22.4s, v5.4s, v16.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x10, x9]\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmla v22.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "str q28, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "st1 { v22.4s }, [x28]\n"
+ "str q21, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 43f\n"
+ "ldr q31, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "add x27, x14, x13\n"
+ "add x26, x8, XZR\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x25, x8, x6\n"
+ "add x24, x8, x11\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x23, x8, x9\n"
+ "add x22, x16, XZR\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "add x21, x16, x6\n"
+ "add x20, x8, x13\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d11, [x25], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "ldr d15, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x27]\n"
+ "ld1 { v10.s }[2], [x26]\n"
+ "ld1 { v11.s }[2], [x25]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x21]\n"
+ "ld1 { v16.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+ "ldr s9, [x27, #0x0]\n"
+ "ldr s10, [x26, #0x0]\n"
+ "ldr s11, [x25, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s14, [x22, #0x0]\n"
+ "ldr s15, [x21, #0x0]\n"
+ "ldr s16, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "add x20, x16, x11\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "add x20, x16, x9\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "add x20, x16, x13\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "add x20, x12, XZR\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s14, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v30.4s, v3.4s, v14.4s\n"
+ "add x20, x14, XZR\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s15, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "fmla v30.4s, v0.4s, v15.4s\n"
+ "add x20, x12, x6\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "add x20, x14, x6\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v16.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s16, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v28.4s, v7.4s, v16.4s\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
+ "add x20, x12, x11\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "add x20, x14, x11\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "add x20, x12, x9\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s14, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v31.4s, v5.4s, v14.4s\n"
+ "add x20, x10, XZR\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr s15, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v30.4s, v6.4s, v15.4s\n"
+ "add x20, x14, x9\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "add x20, x10, x6\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "add x20, x12, x13\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v16.s }[2], [x20]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s16, [x20, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "fmla v31.4s, v3.4s, v16.4s\n"
+ "add x20, x10, x11\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s14, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v31.4s, v7.4s, v14.4s\n"
+ "add x20, x10, x13\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s15, [x20, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v30.4s, v8.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "add x20, x10, x9\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v30.4s, v30.4s, v26.4s\n"
+ "fmax v31.4s, v31.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v27.4s\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.d }[0], [x21], x7\n"
+ "st1 { v30.d }[0], [x20], x7\n"
+ "add x17, x17, #0x8\n"
+ "add x28, x28, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[2], [x21], x7\n"
+ "st1 { v30.s }[2], [x20], x7\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[0], [x21], x7\n"
+ "st1 { v30.s }[0], [x20], x7\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "42:" // Tile loop: Oddments: Store: Bit 1: End
+ "43:" // Tile loop: End
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x27, x27, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x27, x27, XZR, LT\n"
+ "cmp x23, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
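
[Editor's note] The direct variant above walks the output tiles itself, and its per-tile base addresses follow the arithmetic spelled out in the asm comments: the input offset is (tile_i * ld_input_row + tile_j * ld_input_col) scaled by 4 (stride 2 times output size 2), the output offset is (tile_i * ld_output_row + tile_j * ld_output_col) scaled by 2 (the output tile size), and both are then scaled to bytes by sizeof(float) via "LSL #2". A scalar restatement, with helper names that are ours rather than the patch's:

    #include <cstdint>

    // Hedged sketch: the tile-base computation the asm comments describe.
    // C pointer arithmetic supplies the sizeof(float) scaling that the asm
    // performs with "LSL #2".
    static inline const float *tile_inptr(const float *inptr,
                                          uint64_t tile_i, uint64_t tile_j,
                                          uint64_t ld_input_row, uint64_t ld_input_col)
    {
        // offset = tile_i * ld_input_row; offset += tile_j * ld_input_col;
        // offset *= kernel_stride * output_size (2 * 2 = 4)
        return inptr + (tile_i * ld_input_row + tile_j * ld_input_col) * 4;
    }

    static inline float *tile_outptr(float *outptr,
                                     uint64_t tile_i, uint64_t tile_j,
                                     uint64_t ld_output_row, uint64_t ld_output_col)
    {
        // offset = tile_i * ld_output_row; offset += tile_j * ld_output_col;
        // offset *= output_tile_size (2)
        return outptr + (tile_i * ld_output_row + tile_j * ld_output_col) * 2;
    }
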
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..24fe255dfb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,629 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x25, #0x10\n" // cntb _, ALL, #1
+ "lsr x24, %x[n_channels], #0x2\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "mov x28, #0x0\n"
+ "sub x22, XZR, x25\n"
+ "cbz x24, 3f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "cmp x25, x24, LSL #4\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "add x23, x23, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x28]\n"
+ "ldr q14, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "ldr q15, [x21, x28]\n"
+ "ldr q16, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v31.16b\n fmla v24.4s, v8.4s, v9.4s\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v6.4s, v9.4s\n"
+ "ldr x21, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q19, [x21, x28]\n"
+ "fmla v23.4s, v2.4s, v13.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v24.4s, v3.4s, v14.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.4s, v4.4s, v15.4s\n"
+ "fmla v23.4s, v4.4s, v19.4s\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q22, [x20, x28]\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v23.4s, v5.4s, v20.4s\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q21, [x20, x28]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v2.4s, v9.4s\n"
+ "mov v19.16b, v31.16b\n fmla v19.4s, v0.4s, v9.4s\n"
+ "ldr q31, [x23, #0x0]\n"
+ "fmla v24.4s, v5.4s, v18.4s\n"
+ "fmla v23.4s, v3.4s, v18.4s\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v19.4s, v4.4s, v16.4s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v20.4s, v0.4s, v22.4s\n"
+ "ldr q0, [x23, #0x10]\n"
+ "fmla v19.4s, v1.4s, v21.4s\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v20.4s, v4.4s, v18.4s\n"
+ "fmla v19.4s, v5.4s, v16.4s\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v24.4s, v6.4s, v22.4s\n"
+ "fmla v20.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q1, [x23, #0x20]\n"
+ "fmla v19.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v7.4s, v17.4s\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v19.4s, v3.4s, v17.4s\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "fmla v20.4s, v5.4s, v17.4s\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v19.4s, v7.4s, v16.4s\n"
+ "fmin v23.4s, v23.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "fmla v19.4s, v6.4s, v16.4s\n"
+ "fmla v20.4s, v8.4s, v16.4s\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v19.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x23, #0x90]\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x25]\n"
+ "fmin v19.4s, v19.4s, v27.4s\n"
+ "add x28, x28, #0x10\n"
+ "ldr q10, [x20, x25]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "str q24, [x12, x22]\n"
+ "add x23, x23, #0xa0\n"
+ "ldr q11, [x21, x25]\n"
+ "ldr q12, [x20, x25]\n"
+ "str q23, [x11, x22]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x25]\n"
+ "str q20, [x10, x22]\n"
+ "ldr q14, [x20, x25]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "str q19, [x9, x22]\n"
+ "ldr q15, [x21, x25]\n"
+ "ldr q16, [x20, x25]\n"
+ "add x25, x25, #0x10\n"
+ "cmp x25, x24, LSL #4\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v25.16b, v31.16b\n fmla v25.4s, v8.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v6.4s, v9.4s\n"
+ "ldr x21, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v25.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x21, x28]\n"
+ "fmla v24.4s, v2.4s, v13.4s\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v25.4s, v3.4s, v14.4s\n"
+ "fmla v24.4s, v0.4s, v16.4s\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v25.4s, v4.4s, v15.4s\n"
+ "fmla v24.4s, v4.4s, v18.4s\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q23, [x20, x28]\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v5.4s, v20.4s\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q22, [x20, x28]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v2.4s, v9.4s\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v0.4s, v9.4s\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v24.4s, v3.4s, v19.4s\n"
+ "ldr q16, [x21, x28]\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v20.4s, v4.4s, v16.4s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v0.4s, v23.4s\n"
+ "fmla v20.4s, v1.4s, v22.4s\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v21.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v20.4s, v5.4s, v16.4s\n"
+ "fmla v25.4s, v6.4s, v23.4s\n"
+ "ldr x20, [x13, #0x90]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v1.4s, v17.4s\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.4s, v2.4s, v19.4s\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v20.4s, v3.4s, v18.4s\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.4s, v7.4s, v22.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v20.4s, v6.4s, v17.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "fmla v20.4s, v8.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "add x22, x22, #0x10\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "add x28, x28, #0x10\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
+ "str q25, [x12, x22]\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "str q24, [x11, x22]\n"
+ "fmin v20.4s, v20.4s, v27.4s\n"
+ "str q21, [x10, x22]\n"
+ "str q20, [x9, x22]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 42f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "ldr x27, [x13, #0x0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ "add x27, x27, x28\n"
+ "add x26, x26, x28\n"
+ "ldr x25, [x13, #0x10]\n"
+ "ldr x24, [x13, #0x18]\n"
+ "add x25, x25, x28\n"
+ "add x24, x24, x28\n"
+ "ldr x23, [x13, #0x20]\n"
+ "ldr x22, [x13, #0x28]\n"
+ "add x23, x23, x28\n"
+ "add x22, x22, x28\n"
+ "ldr x21, [x13, #0x30]\n"
+ "ldr x20, [x13, #0x38]\n"
+ "add x21, x21, x28\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x27], #0x8\n"
+ "ld1 { v10.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v12.d }[0], [x24], #0x8\n"
+ "ld1 { v13.d }[0], [x23], #0x8\n"
+ "ld1 { v14.d }[0], [x22], #0x8\n"
+ "ld1 { v15.d }[0], [x21], #0x8\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x27], #0x4\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "ld1 { v14.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr x20, [x13, #0x40]\n"
+ "add x20, x20, x28\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load input (1, 3): Bit 1: End
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (1, 4): Bit 1: End
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (1, 2): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Load input (1, 2): Bit 1: End
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "13:" // Oddments: Load input (3, 0): Bit 1: End
+ "ldr x20, [x13, #0x60]\n"
+ "fmla v30.4s, v3.4s, v14.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (2, 0): Bit 1: End
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "fmla v30.4s, v0.4s, v15.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "17:" // Oddments: Load input (3, 1): Bit 1: End
+ "ldr x20, [x13, #0x70]\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (2, 1): Bit 1: End
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v28.4s, v7.4s, v16.4s\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "21:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr x20, [x13, #0x80]\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "23:" // Oddments: Load input (2, 3): Bit 1: End
+ "ldr x20, [x13, #0x88]\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (3, 4): Bit 1: End
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v31.4s, v5.4s, v14.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "27:" // Oddments: Load input (4, 0): Bit 1: End
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v30.4s, v6.4s, v15.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "29:" // Oddments: Load input (2, 4): Bit 1: End
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "31:" // Oddments: Load input (4, 1): Bit 1: End
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "33:" // Oddments: Load input (3, 2): Bit 1: End
+ "ldr x20, [x13, #0xb0]\n"
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "fmla v31.4s, v3.4s, v16.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "35:" // Oddments: Load input (4, 3): Bit 1: End
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v31.4s, v7.4s, v14.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "37:" // Oddments: Load input (4, 2): Bit 1: End
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v30.4s, v8.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "39:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v30.4s, v30.4s, v26.4s\n"
+ "fmax v31.4s, v31.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v27.4s\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Store: Bit 1: Unset
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
+ "41:" // Oddments: Store: Bit 1: End
+ "42:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..de8a1e4514
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
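+// Note on the two entry points above (a descriptive reading of the
+// signatures, not a statement of the library's contract): the "indirect"
+// variant fetches each input point through its own pointer in input_ptrs,
+// which lets the caller redirect out-of-bounds points, while the "direct"
+// variant walks a dense input using the row/column strides and tile counts.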
+
+class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
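+// A minimal usage sketch (illustrative only; `cpu_info` is a hypothetical
+// CPUInfo instance and the surrounding dispatch plumbing is elided):
+//
+//   a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst strategy(&cpu_info);
+//   // Each 2x2 output tile reads a (2 + 5 - 1) x (2 + 5 - 1) = 6x6 input
+//   // patch, which is why the direct kernel addresses six input rows.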
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..3426fbc3f9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,991 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
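+ // Worked example of the tile addressing used below (values illustrative):
+ // with ld_input_row = 112 and ld_input_col = 4 elements, tile (tile_i,
+ // tile_j) starts at offset = 2 * (tile_i * 112 + tile_j * 4) floats, i.e.
+ // consecutive tiles advance by two input rows/columns, as expected for a
+ // stride-1 kernel producing 2x2 output tiles.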
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
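+ // Structure of the assembly below, inferred from its labels: label 1
+ // iterates over output tiles; labels 2/3 are the vectorised channel loop and
+ // tail, each iteration handling four fp32 channels (lsr x22, n_channels,
+ // #2); labels 4..61 process the 1-3 leftover channels, testing bits 1 and 0
+ // of n_channels (tbz) to choose between 2-float and 1-float accesses.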
+ __asm__ __volatile__(
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x23, #0x2\n"
+ "mov x25, #0x2\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x26, x2, x22\n" // offset += tile_j * ld_input_col
+ "ldr x3, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x2, x2, #0x2\n"
+ "mul x20, x27, x21\n" // offset = tile_i * ld_output_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x6, x2, x2\n"
+ "mul x22, x22, x23\n" // offset *= kernel_stride * output_size
+ "add x4, x4, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x7, x4, x24, LSL #2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x26, x3, x20\n" // offset += tile_j * ld_output_col
+ "add x17, x7, x24, LSL #2\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "lsr x22, %x[n_channels], #0x2\n"
+ "add x16, x17, x24, LSL #2\n"
+ "add x15, x6, x2\n"
+ "add x14, x16, x24, LSL #2\n"
+ "add x13, x15, x2\n"
+ "add x5, x5, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x12, x14, x24, LSL #2\n"
+ "add x11, x13, x2\n"
+ "add x10, x5, x21, LSL #2\n"
+ "lsl x3, x3, #0x2\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q25, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x8, x8, #0x60\n"
+ "ld1 { v5.4s }, [x4]\n"
+ "ldr q6, [x4, x2]\n"
+ "ld1 { v7.4s }, [x7]\n"
+ "ldr q8, [x7, x2]\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q13, [x7, x6]\n"
+ "ldr q11, [x4, x15]\n"
+ "ldr q12, [x4, x13]\n"
+ "ldr q10, [x7, x11]\n"
+ "ld1 { v14.4s }, [x17]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v5.4s\n"
+ "ldr q23, [x7, x15]\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v6.4s\n"
+ "add x23, x23, #0x10\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+ "mov v28.16b, v25.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x140]\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "add x7, x7, #0x10\n"
+ "fmla v29.4s, v1.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "ldr q1, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q18, [x4, x11]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "add x4, x4, #0x10\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v23.4s\n"
+ "ldr q17, [x8, #0x20]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "add x21, x21, #0x10\n"
+ "fmla v29.4s, v3.4s, v23.4s\n"
+ "fmla v28.4s, v3.4s, v21.4s\n"
+ "ldr q16, [x8, #0x30]\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v31.4s, v4.4s, v18.4s\n"
+ "ldr q0, [x17, x15]\n"
+ "fmla v29.4s, v4.4s, v21.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q20, [x8, #0x40]\n"
+ "fmla v30.4s, v19.4s, v7.4s\n"
+ "ld1 { v7.4s }, [x7]\n"
+ "fmla v31.4s, v19.4s, v8.4s\n"
+ "fmla v29.4s, v19.4s, v14.4s\n"
+ "fmla v28.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "ldr q26, [x17, x11]\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v2.4s\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v30.4s, v17.4s, v13.4s\n"
+ "ldr q1, [x17, x13]\n"
+ "fmla v31.4s, v17.4s, v23.4s\n"
+ "add x17, x17, #0x10\n"
+ "fmla v29.4s, v17.4s, v2.4s\n"
+ "fmla v28.4s, v17.4s, v0.4s\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v30.4s, v16.4s, v23.4s\n"
+ "ld1 { v24.4s }, [x16]\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "fmla v29.4s, v16.4s, v0.4s\n"
+ "fmla v28.4s, v16.4s, v1.4s\n"
+ "ldr q16, [x8, #0x80]\n"
+ "fmla v30.4s, v20.4s, v21.4s\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v31.4s, v20.4s, v10.4s\n"
+ "ldr q22, [x16, x6]\n"
+ "fmla v29.4s, v20.4s, v1.4s\n"
+ "fmla v28.4s, v20.4s, v26.4s\n"
+ "ldr q21, [x8, #0x90]\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "ldr q5, [x16, x11]\n"
+ "fmla v31.4s, v19.4s, v6.4s\n"
+ "fmla v29.4s, v19.4s, v24.4s\n"
+ "fmla v28.4s, v19.4s, v23.4s\n"
+ "ldr q11, [x8, #0xa0]\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v31.4s, v18.4s, v2.4s\n"
+ "fmla v29.4s, v18.4s, v23.4s\n"
+ "fmla v28.4s, v18.4s, v22.4s\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.4s, v17.4s, v22.4s\n"
+ "fmla v28.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v30.4s, v16.4s, v0.4s\n"
+ "ld1 { v0.4s }, [x14]\n"
+ "fmla v31.4s, v16.4s, v1.4s\n"
+ "fmla v29.4s, v16.4s, v20.4s\n"
+ "fmla v28.4s, v16.4s, v19.4s\n"
+ "ldr q16, [x8, #0xd0]\n"
+ "fmla v30.4s, v21.4s, v1.4s\n"
+ "ldr q4, [x14, x2]\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "ldr q12, [x14, x13]\n"
+ "fmla v29.4s, v21.4s, v19.4s\n"
+ "fmla v28.4s, v21.4s, v5.4s\n"
+ "ldr q13, [x8, #0xe0]\n"
+ "fmla v30.4s, v11.4s, v24.4s\n"
+ "ldr q6, [x14, x6]\n"
+ "fmla v31.4s, v11.4s, v23.4s\n"
+ "fmla v29.4s, v11.4s, v0.4s\n"
+ "fmla v28.4s, v11.4s, v4.4s\n"
+ "ldr q24, [x8, #0xf0]\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "ldr q26, [x14, x15]\n"
+ "fmla v31.4s, v18.4s, v22.4s\n"
+ "fmla v29.4s, v18.4s, v4.4s\n"
+ "fmla v28.4s, v18.4s, v6.4s\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v30.4s, v17.4s, v22.4s\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v31.4s, v17.4s, v20.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmla v29.4s, v17.4s, v6.4s\n"
+ "fmla v28.4s, v17.4s, v26.4s\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v30.4s, v16.4s, v20.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v31.4s, v16.4s, v19.4s\n"
+ "fmla v29.4s, v16.4s, v26.4s\n"
+ "fmla v28.4s, v16.4s, v12.4s\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v30.4s, v13.4s, v19.4s\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v31.4s, v13.4s, v5.4s\n"
+ "ld1 { v14.4s }, [x17]\n"
+ "fmla v29.4s, v13.4s, v12.4s\n"
+ "fmla v28.4s, v13.4s, v22.4s\n"
+ "ldr q19, [x8, #0x130]\n"
+ "fmla v30.4s, v24.4s, v0.4s\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v31.4s, v24.4s, v4.4s\n"
+ "fmla v29.4s, v24.4s, v18.4s\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v28.4s, v24.4s, v17.4s\n"
+ "ldr q0, [x8, #0x150]\n"
+ "fmla v30.4s, v23.4s, v4.4s\n"
+ "ldr q13, [x7, x6]\n"
+ "fmla v31.4s, v23.4s, v6.4s\n"
+ "fmla v29.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v28.4s, v23.4s, v16.4s\n"
+ "ldr q1, [x8, #0x160]\n"
+ "fmla v30.4s, v21.4s, v6.4s\n"
+ "ld1 { v5.4s }, [x4]\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v29.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v28.4s, v21.4s, v18.4s\n"
+ "ldr q2, [x8, #0x170]\n"
+ "fmla v30.4s, v20.4s, v26.4s\n"
+ "ldr q6, [x4, x2]\n"
+ "fmla v31.4s, v20.4s, v12.4s\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v20.4s, v18.4s\n"
+ "ldr q11, [x4, x15]\n"
+ "fmla v28.4s, v20.4s, v17.4s\n"
+ "ldr q3, [x8, #0x180]\n"
+ "fmla v30.4s, v19.4s, v12.4s\n"
+ "ldr q8, [x7, x2]\n"
+ "fmla v31.4s, v19.4s, v22.4s\n"
+ "ldr q10, [x7, x11]\n"
+ "fmla v29.4s, v19.4s, v17.4s\n"
+ "ldr q12, [x4, x13]\n"
+ "fmla v28.4s, v19.4s, v16.4s\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q4, [x8, #0x190]\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "add x8, x8, #0x1a0\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "st1 { v30.4s }, [x5]\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "str q31, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v29.4s }, [x10]\n"
+ "str q28, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr q22, [x7, x15]\n"
+ "mov v5.16b, v25.16b\n fmla v5.4s, v0.4s, v6.4s\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v5.4s, v1.4s, v9.4s\n"
+ "add x7, x7, #0x10\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "ldr q18, [x8, #0x10]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q16, [x4, x11]\n"
+ "fmla v5.4s, v2.4s, v11.4s\n"
+ "add x4, x4, #0x10\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v22.4s\n"
+ "ldr q17, [x8, #0x20]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v5.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v21.4s\n"
+ "ldr q20, [x8, #0x30]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v5.4s, v4.4s, v16.4s\n"
+ "ldr q28, [x17, x15]\n"
+ "fmla v30.4s, v4.4s, v21.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "ldr q16, [x8, #0x40]\n"
+ "fmla v31.4s, v19.4s, v7.4s\n"
+ "fmla v5.4s, v19.4s, v8.4s\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "fmla v29.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v31.4s, v18.4s, v8.4s\n"
+ "ldr q1, [x17, x11]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "fmla v29.4s, v18.4s, v2.4s\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v31.4s, v17.4s, v13.4s\n"
+ "ldr q26, [x17, x13]\n"
+ "fmla v5.4s, v17.4s, v22.4s\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "fmla v29.4s, v17.4s, v28.4s\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ld1 { v25.4s }, [x16]\n"
+ "fmla v5.4s, v20.4s, v21.4s\n"
+ "fmla v30.4s, v20.4s, v28.4s\n"
+ "fmla v29.4s, v20.4s, v26.4s\n"
+ "ldr q24, [x8, #0x80]\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v5.4s, v16.4s, v10.4s\n"
+ "ldr q0, [x16, x6]\n"
+ "fmla v30.4s, v16.4s, v26.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q22, [x8, #0x90]\n"
+ "fmla v31.4s, v19.4s, v14.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v5.4s, v19.4s, v6.4s\n"
+ "fmla v30.4s, v19.4s, v25.4s\n"
+ "fmla v29.4s, v19.4s, v23.4s\n"
+ "ldr q21, [x8, #0xa0]\n"
+ "fmla v31.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "fmla v29.4s, v18.4s, v0.4s\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v31.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v5.4s, v17.4s, v28.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v30.4s, v17.4s, v0.4s\n"
+ "fmla v29.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v31.4s, v24.4s, v28.4s\n"
+ "ld1 { v7.4s }, [x14]\n"
+ "fmla v5.4s, v24.4s, v26.4s\n"
+ "fmla v30.4s, v24.4s, v20.4s\n"
+ "fmla v29.4s, v24.4s, v19.4s\n"
+ "ldr q2, [x8, #0xd0]\n"
+ "fmla v31.4s, v22.4s, v26.4s\n"
+ "ldr q28, [x14, x2]\n"
+ "fmla v5.4s, v22.4s, v1.4s\n"
+ "ldr q13, [x14, x13]\n"
+ "fmla v30.4s, v22.4s, v19.4s\n"
+ "fmla v29.4s, v22.4s, v16.4s\n"
+ "ldr q14, [x8, #0xe0]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "ldr q26, [x14, x6]\n"
+ "fmla v5.4s, v21.4s, v23.4s\n"
+ "fmla v30.4s, v21.4s, v7.4s\n"
+ "fmla v29.4s, v21.4s, v28.4s\n"
+ "ldr q25, [x8, #0xf0]\n"
+ "fmla v31.4s, v18.4s, v23.4s\n"
+ "ldr q24, [x14, x15]\n"
+ "fmla v5.4s, v18.4s, v0.4s\n"
+ "fmla v30.4s, v18.4s, v28.4s\n"
+ "fmla v29.4s, v18.4s, v26.4s\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v5.4s, v17.4s, v20.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmla v30.4s, v17.4s, v26.4s\n"
+ "fmla v29.4s, v17.4s, v24.4s\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v31.4s, v2.4s, v20.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v5.4s, v2.4s, v19.4s\n"
+ "fmla v30.4s, v2.4s, v24.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v31.4s, v14.4s, v19.4s\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v5.4s, v14.4s, v16.4s\n"
+ "fmla v30.4s, v14.4s, v13.4s\n"
+ "fmla v29.4s, v14.4s, v22.4s\n"
+ "ldr q19, [x8, #0x130]\n"
+ "add x8, x8, #0x140\n"
+ "fmla v31.4s, v25.4s, v7.4s\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v5.4s, v25.4s, v28.4s\n"
+ "fmla v30.4s, v25.4s, v18.4s\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v29.4s, v25.4s, v17.4s\n"
+ "fmla v31.4s, v23.4s, v28.4s\n"
+ "fmla v5.4s, v23.4s, v26.4s\n"
+ "fmla v30.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v5.4s, v21.4s, v24.4s\n"
+ "fmla v30.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.4s, v20.4s, v24.4s\n"
+ "fmla v5.4s, v20.4s, v13.4s\n"
+ "fmla v30.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
+ "fmla v31.4s, v19.4s, v13.4s\n"
+ "fmla v5.4s, v19.4s, v22.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmla v30.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "fmax v5.4s, v5.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v5.4s, v5.4s, v15.4s\n"
+ "st1 { v31.4s }, [x5]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q5, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v30.4s }, [x10]\n"
+ "str q29, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 61f\n"
+ "ldr q25, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "add x9, x4, XZR\n"
+ "add x28, x4, x2\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "add x27, x7, XZR\n"
+ "add x26, x7, x2\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x25, x4, x6\n"
+ "add x24, x7, x6\n"
+ "add x23, x4, x15\n"
+ "add x22, x4, x13\n"
+ "add x21, x7, x11\n"
+ "add x20, x17, XZR\n"
+ "add x8, x8, #0x60\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d5, [x9], #0x8\n"
+ "ldr d6, [x28], #0x8\n"
+ "ldr d7, [x27], #0x8\n"
+ "ldr d8, [x26], #0x8\n"
+ "ldr d9, [x25], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d11, [x23], #0x8\n"
+ "ldr d12, [x22], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v5.s }[2], [x9]\n"
+ "ld1 { v6.s }[2], [x28]\n"
+ "ld1 { v7.s }[2], [x27]\n"
+ "ld1 { v8.s }[2], [x26]\n"
+ "ld1 { v9.s }[2], [x25]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v11.s }[2], [x23]\n"
+ "ld1 { v12.s }[2], [x22]\n"
+ "ld1 { v10.s }[2], [x21]\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+ "ldr s5, [x9, #0x0]\n"
+ "ldr s6, [x28, #0x0]\n"
+ "ldr s7, [x27, #0x0]\n"
+ "ldr s8, [x26, #0x0]\n"
+ "ldr s9, [x25, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s11, [x23, #0x0]\n"
+ "ldr s12, [x22, #0x0]\n"
+ "ldr s10, [x21, #0x0]\n"
+ "ldr s14, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+ "mov v28.16b, v25.16b\n fmla v28.4s, v0.4s, v5.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v6.4s\n"
+ "add x20, x7, x15\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v6.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s5, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v31.4s, v2.4s, v5.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x7, x13\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v5.4s\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v6.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s6, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v31.4s, v3.4s, v6.4s\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "add x20, x4, x11\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v29.4s, v4.4s, v9.4s\n"
+ "fmla v30.4s, v4.4s, v6.4s\n"
+ "add x20, x17, x2\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v7.4s\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.4s, v0.4s, v8.4s\n"
+ "fmla v30.4s, v0.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v8.4s\n"
+ "add x20, x17, x6\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v1.4s, v11.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "add x20, x17, x15\n"
+ "fmla v29.4s, v2.4s, v5.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "fmla v28.4s, v3.4s, v5.4s\n"
+ "add x20, x17, x13\n"
+ "fmla v29.4s, v3.4s, v6.4s\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v6.4s\n"
+ "add x20, x17, x11\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d8, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v8.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+ "ldr s8, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.4s, v4.4s, v8.4s\n"
+ "fmla v28.4s, v0.4s, v14.4s\n"
+ "add x20, x16, XZR\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s5, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v30.4s, v0.4s, v5.4s\n"
+ "add x20, x16, x2\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v6.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s6, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.4s, v0.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "add x20, x16, x6\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "add x20, x16, x15\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v9.4s\n"
+ "add x20, x16, x13\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "add x20, x16, x11\n"
+ "fmla v29.4s, v4.4s, v8.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+ "ldr s14, [x20, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.4s, v4.4s, v14.4s\n"
+ "fmla v28.4s, v0.4s, v5.4s\n"
+ "add x20, x14, XZR\n"
+ "fmla v29.4s, v0.4s, v6.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v30.4s, v0.4s, v9.4s\n"
+ "add x20, x14, x2\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.4s, v0.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v6.4s\n"
+ "add x20, x14, x6\n"
+ "fmla v29.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v13.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s5, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v5.4s\n"
+ "fmla v28.4s, v2.4s, v10.4s\n"
+ "add x20, x14, x15\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v5.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v6.s }[2], [x20]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s6, [x20, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.4s, v2.4s, v6.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x14, x13\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v6.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr d8, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v8.s }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s8, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.4s, v3.4s, v8.4s\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "add x20, x14, x11\n"
+ "fmla v29.4s, v4.4s, v14.4s\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v9.4s\n"
+ "add x20, x12, XZR\n"
+ "fmla v29.4s, v0.4s, v13.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "add x20, x12, x2\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 50f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 50f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "50:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.4s, v0.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "add x20, x12, x6\n"
+ "fmla v29.4s, v1.4s, v5.4s\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v5.4s\n"
+ "add x20, x12, x15\n"
+ "fmla v29.4s, v2.4s, v6.4s\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 54f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 54f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "54:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v6.4s\n"
+ "add x20, x12, x13\n"
+ "fmla v29.4s, v3.4s, v8.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v8.4s\n"
+ "add x20, x12, x11\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 58f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 58f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+ "fmla v31.4s, v4.4s, v9.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "tbz %x[n_channels], #1, 59f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.d }[0], [x21], x3\n"
+ "st1 { v30.d }[0], [x20], x3\n"
+ "add x5, x5, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[2], [x21], x3\n"
+ "st1 { v30.s }[2], [x20], x3\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[0], [x21], x3\n"
+ "st1 { v30.s }[0], [x20], x3\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "60:" // Tile loop: Oddments: Store: Bit 1: End
+ "61:" // Tile loop: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..32939eb6dc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1043 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
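+ // The first 14 entries of inptrs are permuted from input_ptrs (e.g.
+ // inptrs[2] takes input_ptrs[6]) so that the points consumed by the
+ // main-loop preamble sit at the front of the table; entries 14..35 keep
+ // their original order.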
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
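+ // Unlike the direct variant, the loops below compute no strided addresses:
+ // each input point is reached through its own pointer, loaded pairwise from
+ // the inptrs table (ldp x21, x20, [x15, ...]) and indexed by the running
+ // channel byte offset in x10.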
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x17, #0x10\n" // cntb _, ALL, #1
+ "lsr x9, %x[n_channels], #0x2\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "ldp x12, x11, [x21, #0x10]\n"
+ "mov x10, #0x0\n"
+ "sub x28, XZR, x17\n"
+ "cbz x9, 3f\n"
+ "ldr q26, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x17, x9, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x16, x16, #0x60\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q5, [x21, x10]\n"
+ "ldr q6, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x10]\n"
+ "ldr q8, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q9, [x21, x10]\n"
+ "ldr q13, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr q11, [x21, x10]\n"
+ "ldr q12, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x10]\n"
+ "ldr q14, [x20, x10]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v5.4s\n"
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v6.4s\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q24, [x20, x10]\n"
+ "mov v28.16b, v26.16b\n fmla v28.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q23, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x140]\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr q22, [x20, x10]\n"
+ "fmla v28.4s, v1.4s, v8.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "ldr q21, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q17, [x20, x10]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v24.4s\n"
+ "ldr q16, [x16, #0x20]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "ldr q5, [x20, x10]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v3.4s, v22.4s\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x21, [x15, #0x80]\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "ldr q19, [x22, x10]\n"
+ "fmla v31.4s, v4.4s, v17.4s\n"
+ "ldr q2, [x20, x10]\n"
+ "fmla v28.4s, v4.4s, v22.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "ldr q18, [x16, #0x40]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v30.4s, v23.4s, v7.4s\n"
+ "fmla v31.4s, v23.4s, v8.4s\n"
+ "ldr x23, [x15, #0x90]\n"
+ "ldr x26, [x15, #0x98]\n"
+ "fmla v28.4s, v23.4s, v14.4s\n"
+ "fmla v29.4s, v23.4s, v5.4s\n"
+ "ldr q1, [x16, #0x50]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla v30.4s, v21.4s, v8.4s\n"
+ "ldr q25, [x20, x10]\n"
+ "fmla v31.4s, v21.4s, v13.4s\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v28.4s, v21.4s, v5.4s\n"
+ "fmla v29.4s, v21.4s, v19.4s\n"
+ "ldr q17, [x16, #0x60]\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.4s, v16.4s, v13.4s\n"
+ "ldr q8, [x21, x10]\n"
+ "fmla v31.4s, v16.4s, v24.4s\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v28.4s, v16.4s, v19.4s\n"
+ "fmla v29.4s, v16.4s, v2.4s\n"
+ "ldr q16, [x16, #0x70]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v30.4s, v20.4s, v24.4s\n"
+ "ldr q24, [x23, x10]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ldr x27, [x15, #0xc8]\n"
+ "fmla v28.4s, v20.4s, v2.4s\n"
+ "fmla v29.4s, v20.4s, v8.4s\n"
+ "ldr q23, [x16, #0x80]\n"
+ "ldr x23, [x15, #0xd0]\n"
+ "fmla v30.4s, v18.4s, v22.4s\n"
+ "ldr q22, [x26, x10]\n"
+ "fmla v31.4s, v18.4s, v10.4s\n"
+ "ldr q21, [x22, x10]\n"
+ "fmla v28.4s, v18.4s, v8.4s\n"
+ "fmla v29.4s, v18.4s, v25.4s\n"
+ "ldr q20, [x16, #0x90]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v30.4s, v1.4s, v14.4s\n"
+ "ldr q0, [x20, x10]\n"
+ "fmla v31.4s, v1.4s, v5.4s\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v28.4s, v1.4s, v24.4s\n"
+ "fmla v29.4s, v1.4s, v22.4s\n"
+ "ldr q6, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v30.4s, v17.4s, v5.4s\n"
+ "ldr q1, [x25, x10]\n"
+ "fmla v31.4s, v17.4s, v19.4s\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v28.4s, v17.4s, v22.4s\n"
+ "fmla v29.4s, v17.4s, v21.4s\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.4s, v16.4s, v19.4s\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v31.4s, v16.4s, v2.4s\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v28.4s, v16.4s, v21.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v30.4s, v23.4s, v2.4s\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.4s, v23.4s, v8.4s\n"
+ "ldr x21, [x15, #0x100]\n"
+ "fmla v28.4s, v23.4s, v1.4s\n"
+ "fmla v29.4s, v23.4s, v19.4s\n"
+ "ldr q13, [x16, #0xd0]\n"
+ "fmla v30.4s, v20.4s, v8.4s\n"
+ "ldr q2, [x27, x10]\n"
+ "fmla v31.4s, v20.4s, v25.4s\n"
+ "ldr q10, [x20, x10]\n"
+ "fmla v28.4s, v20.4s, v19.4s\n"
+ "fmla v29.4s, v20.4s, v0.4s\n"
+ "ldr q9, [x16, #0xe0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v30.4s, v6.4s, v24.4s\n"
+ "ldr q5, [x23, x10]\n"
+ "fmla v31.4s, v6.4s, v22.4s\n"
+ "ldr x23, [x15, #0x110]\n"
+ "fmla v28.4s, v6.4s, v16.4s\n"
+ "fmla v29.4s, v6.4s, v2.4s\n"
+ "ldr q24, [x16, #0xf0]\n"
+ "fmla v30.4s, v18.4s, v22.4s\n"
+ "ldr q25, [x22, x10]\n"
+ "fmla v31.4s, v18.4s, v21.4s\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v28.4s, v18.4s, v2.4s\n"
+ "fmla v29.4s, v18.4s, v5.4s\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v30.4s, v17.4s, v21.4s\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v31.4s, v17.4s, v1.4s\n"
+ "fmla v28.4s, v17.4s, v5.4s\n"
+ "fmla v29.4s, v17.4s, v25.4s\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v30.4s, v13.4s, v1.4s\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v31.4s, v13.4s, v19.4s\n"
+ "fmla v28.4s, v13.4s, v25.4s\n"
+ "fmla v29.4s, v13.4s, v10.4s\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v30.4s, v9.4s, v19.4s\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v31.4s, v9.4s, v0.4s\n"
+ "fmla v28.4s, v9.4s, v10.4s\n"
+ "fmla v29.4s, v9.4s, v22.4s\n"
+ "ldr q19, [x16, #0x130]\n"
+ "fmla v30.4s, v24.4s, v16.4s\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.4s, v24.4s, v2.4s\n"
+ "fmla v28.4s, v24.4s, v18.4s\n"
+ "ldr q18, [x20, x10]\n"
+ "fmla v29.4s, v24.4s, v17.4s\n"
+ "ldr q0, [x16, #0x150]\n"
+ "fmla v30.4s, v23.4s, v2.4s\n"
+ "fmla v31.4s, v23.4s, v5.4s\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "fmla v28.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x23, x10]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
+ "ldr q1, [x16, #0x160]\n"
+ "fmla v30.4s, v21.4s, v5.4s\n"
+ "ldr q5, [x21, x17]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "fmla v28.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x22, x10]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
+ "ldr q2, [x16, #0x170]\n"
+ "fmla v30.4s, v20.4s, v25.4s\n"
+ "ldr q6, [x20, x17]\n"
+ "fmla v31.4s, v20.4s, v10.4s\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x17]\n"
+ "fmla v28.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
+ "ldr q3, [x16, #0x180]\n"
+ "fmla v30.4s, v19.4s, v10.4s\n"
+ "ldr q8, [x20, x17]\n"
+ "fmla v31.4s, v19.4s, v22.4s\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x17]\n"
+ "fmla v28.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "ldr q9, [x21, x17]\n"
+ "ldr q4, [x16, #0x190]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "ldr q11, [x21, x17]\n"
+ "ldr q12, [x20, x17]\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x17]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "ldr q14, [x20, x17]\n"
+ "add x17, x17, #0x10\n"
+ "cmp x17, x9, LSL #4\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "add x10, x10, #0x10\n"
+ "str q30, [x14, x28]\n"
+ "add x16, x16, #0x1a0\n"
+ "str q31, [x13, x28]\n"
+ "str q28, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "mov v5.16b, v26.16b\n fmla v5.4s, v0.4s, v6.4s\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q22, [x20, x10]\n"
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x20, x10]\n"
+ "fmla v5.4s, v1.4s, v9.4s\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "ldr q18, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v5.4s, v2.4s, v11.4s\n"
+ "ldr x23, [x15, #0x70]\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v22.4s\n"
+ "ldr q17, [x16, #0x20]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x20, x10]\n"
+ "fmla v5.4s, v3.4s, v12.4s\n"
+ "ldr x22, [x15, #0x80]\n"
+ "fmla v30.4s, v3.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v21.4s\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x23, x10]\n"
+ "fmla v5.4s, v4.4s, v16.4s\n"
+ "ldr q28, [x21, x10]\n"
+ "fmla v30.4s, v4.4s, v21.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "ldr q16, [x16, #0x40]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "fmla v31.4s, v19.4s, v7.4s\n"
+ "fmla v5.4s, v19.4s, v8.4s\n"
+ "ldr x27, [x15, #0x98]\n"
+ "ldr x26, [x15, #0xa0]\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "fmla v29.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x16, #0x50]\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v31.4s, v18.4s, v8.4s\n"
+ "ldr q1, [x20, x10]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "fmla v29.4s, v18.4s, v2.4s\n"
+ "ldr q18, [x16, #0x60]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.4s, v17.4s, v13.4s\n"
+ "ldr q26, [x22, x10]\n"
+ "fmla v5.4s, v17.4s, v22.4s\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "fmla v29.4s, v17.4s, v28.4s\n"
+ "ldr q17, [x16, #0x70]\n"
+ "ldr x22, [x15, #0xc8]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ldr q25, [x21, x10]\n"
+ "fmla v5.4s, v20.4s, v21.4s\n"
+ "ldr x21, [x15, #0xd0]\n"
+ "fmla v30.4s, v20.4s, v28.4s\n"
+ "fmla v29.4s, v20.4s, v26.4s\n"
+ "ldr q24, [x16, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "ldr q23, [x27, x10]\n"
+ "fmla v5.4s, v16.4s, v10.4s\n"
+ "ldr q0, [x26, x10]\n"
+ "fmla v30.4s, v16.4s, v26.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q22, [x16, #0x90]\n"
+ "ldr x27, [x15, #0xd8]\n"
+ "fmla v31.4s, v19.4s, v14.4s\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v5.4s, v19.4s, v6.4s\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v30.4s, v19.4s, v25.4s\n"
+ "fmla v29.4s, v19.4s, v23.4s\n"
+ "ldr q21, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v31.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x25, x10]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "fmla v29.4s, v18.4s, v0.4s\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "fmla v31.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v5.4s, v17.4s, v28.4s\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v30.4s, v17.4s, v0.4s\n"
+ "fmla v29.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v31.4s, v24.4s, v28.4s\n"
+ "ldr q7, [x23, x10]\n"
+ "fmla v5.4s, v24.4s, v26.4s\n"
+ "ldr x23, [x15, #0x100]\n"
+ "fmla v30.4s, v24.4s, v20.4s\n"
+ "fmla v29.4s, v24.4s, v19.4s\n"
+ "ldr q3, [x16, #0xd0]\n"
+ "fmla v31.4s, v22.4s, v26.4s\n"
+ "ldr q28, [x22, x10]\n"
+ "fmla v5.4s, v22.4s, v1.4s\n"
+ "ldr q13, [x20, x10]\n"
+ "fmla v30.4s, v22.4s, v19.4s\n"
+ "fmla v29.4s, v22.4s, v16.4s\n"
+ "ldr q11, [x16, #0xe0]\n"
+ "ldr x22, [x15, #0x108]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "ldr q26, [x21, x10]\n"
+ "fmla v5.4s, v21.4s, v23.4s\n"
+ "ldr x21, [x15, #0x110]\n"
+ "fmla v30.4s, v21.4s, v7.4s\n"
+ "fmla v29.4s, v21.4s, v28.4s\n"
+ "ldr q25, [x16, #0xf0]\n"
+ "fmla v31.4s, v18.4s, v23.4s\n"
+ "ldr q24, [x27, x10]\n"
+ "fmla v5.4s, v18.4s, v0.4s\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v30.4s, v18.4s, v28.4s\n"
+ "fmla v29.4s, v18.4s, v26.4s\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v5.4s, v17.4s, v20.4s\n"
+ "fmla v30.4s, v17.4s, v26.4s\n"
+ "fmla v29.4s, v17.4s, v24.4s\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v31.4s, v3.4s, v20.4s\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v5.4s, v3.4s, v19.4s\n"
+ "fmla v30.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v31.4s, v11.4s, v19.4s\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v5.4s, v11.4s, v16.4s\n"
+ "fmla v30.4s, v11.4s, v13.4s\n"
+ "fmla v29.4s, v11.4s, v22.4s\n"
+ "ldr q19, [x16, #0x130]\n"
+ "add x16, x16, #0x140\n"
+ "fmla v31.4s, v25.4s, v7.4s\n"
+ "ldr q16, [x23, x10]\n"
+ "fmla v5.4s, v25.4s, v28.4s\n"
+ "fmla v30.4s, v25.4s, v18.4s\n"
+ "ldr q18, [x22, x10]\n"
+ "fmla v29.4s, v25.4s, v17.4s\n"
+ "fmla v31.4s, v23.4s, v28.4s\n"
+ "fmla v5.4s, v23.4s, v26.4s\n"
+ "fmla v30.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x21, x10]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v5.4s, v21.4s, v24.4s\n"
+ "fmla v30.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmla v31.4s, v20.4s, v24.4s\n"
+ "fmla v5.4s, v20.4s, v13.4s\n"
+ "fmla v30.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
+ "fmla v31.4s, v19.4s, v13.4s\n"
+ "fmla v5.4s, v19.4s, v22.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmla v30.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "fmax v5.4s, v5.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v5.4s, v5.4s, v15.4s\n"
+ "str q31, [x14, x28]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q5, [x13, x28]\n"
+ "str q30, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 60f\n"
+ "ldr q26, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x20, x10\n"
+ "add x14, x14, x20\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x11, x11, x20\n"
+ "ldr x9, [x15, #0x0]\n"
+ "ldr x28, [x15, #0x8]\n"
+ "add x9, x9, x10\n"
+ "add x28, x28, x10\n"
+ "ldr x27, [x15, #0x10]\n"
+ "ldr x26, [x15, #0x18]\n"
+ "add x27, x27, x10\n"
+ "add x26, x26, x10\n"
+ "ldr x25, [x15, #0x20]\n"
+ "ldr x24, [x15, #0x28]\n"
+ "add x25, x25, x10\n"
+ "add x24, x24, x10\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "add x23, x23, x10\n"
+ "add x22, x22, x10\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "add x21, x21, x10\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x60\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v5.d }[0], [x9], #0x8\n"
+ "ld1 { v6.d }[0], [x28], #0x8\n"
+ "ld1 { v7.d }[0], [x27], #0x8\n"
+ "ld1 { v8.d }[0], [x26], #0x8\n"
+ "ld1 { v9.d }[0], [x25], #0x8\n"
+ "ld1 { v13.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v5.s }[2], [x9], #0x4\n"
+ "ld1 { v6.s }[2], [x28], #0x4\n"
+ "ld1 { v7.s }[2], [x27], #0x4\n"
+ "ld1 { v8.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+ "ld1 { v5.s }[0], [x9], #0x4\n"
+ "ld1 { v6.s }[0], [x28], #0x4\n"
+ "ld1 { v7.s }[0], [x27], #0x4\n"
+ "ld1 { v8.s }[0], [x26], #0x4\n"
+ "ld1 { v9.s }[0], [x25], #0x4\n"
+ "ld1 { v13.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+ "mov v28.16b, v26.16b\n fmla v28.4s, v0.4s, v5.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v6.4s\n"
+ "ldr x20, [x15, #0x50]\n"
+ "add x20, x20, x10\n"
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v6.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load input (1, 3): Bit 1: End
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.4s, v2.4s, v5.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v5.4s\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (1, 4): Bit 1: End
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v31.4s, v3.4s, v6.4s\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 5): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Load input (0, 5): Bit 1: End
+ "ldr q0, [x16, #0x0]\n"
+ "fmla v29.4s, v4.4s, v9.4s\n"
+ "fmla v30.4s, v4.4s, v6.4s\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v7.4s\n"
+ "add x20, x20, x10\n"
+ "fmla v29.4s, v0.4s, v8.4s\n"
+ "fmla v30.4s, v0.4s, v14.4s\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "13:" // Oddments: Load input (2, 1): Bit 1: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "fmla v31.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v8.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 2): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (2, 2): Bit 1: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v5.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "17:" // Oddments: Load input (2, 3): Bit 1: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "fmla v28.4s, v3.4s, v5.4s\n"
+ "fmla v29.4s, v3.4s, v6.4s\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (2, 4): Bit 1: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v6.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (2, 5): Bit 1: Unset
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "21:" // Oddments: Load input (2, 5): Bit 1: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x90]\n"
+ "fmla v31.4s, v4.4s, v8.4s\n"
+ "fmla v28.4s, v0.4s, v14.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "23:" // Oddments: Load input (3, 0): Bit 1: End
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v30.4s, v0.4s, v5.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (3, 1): Bit 1: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla v31.4s, v0.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "27:" // Oddments: Load input (3, 2): Bit 1: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "29:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v9.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "31:" // Oddments: Load input (3, 4): Bit 1: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "fmla v29.4s, v4.4s, v8.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (3, 5): Bit 1: Unset
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "33:" // Oddments: Load input (3, 5): Bit 1: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v31.4s, v4.4s, v14.4s\n"
+ "fmla v28.4s, v0.4s, v5.4s\n"
+ "fmla v29.4s, v0.4s, v6.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "35:" // Oddments: Load input (4, 0): Bit 1: End
+ "ldr x20, [x15, #0xc8]\n"
+ "fmla v30.4s, v0.4s, v9.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "37:" // Oddments: Load input (4, 1): Bit 1: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xd0]\n"
+ "fmla v31.4s, v0.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v6.4s\n"
+ "fmla v29.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v13.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "39:" // Oddments: Load input (4, 2): Bit 1: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "fmla v31.4s, v1.4s, v5.4s\n"
+ "fmla v28.4s, v2.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v5.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "41:" // Oddments: Load input (4, 3): Bit 1: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v31.4s, v2.4s, v6.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v6.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "43:" // Oddments: Load input (4, 4): Bit 1: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xe8]\n"
+ "fmla v31.4s, v3.4s, v8.4s\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v14.4s\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (4, 5): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "45:" // Oddments: Load input (4, 5): Bit 1: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xf0]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v13.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (5, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "47:" // Oddments: Load input (5, 0): Bit 1: End
+ "ldr x20, [x15, #0xf8]\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 49f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 49f\n"
+ "48:" // Oddments: Load input (5, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "49:" // Oddments: Load input (5, 1): Bit 1: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x100]\n"
+ "fmla v31.4s, v0.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v5.4s\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (5, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "51:" // Oddments: Load input (5, 2): Bit 1: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v5.4s\n"
+ "fmla v29.4s, v2.4s, v6.4s\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 53f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 53f\n"
+ "52:" // Oddments: Load input (5, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "53:" // Oddments: Load input (5, 3): Bit 1: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x110]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v6.4s\n"
+ "fmla v29.4s, v3.4s, v8.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (5, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "55:" // Oddments: Load input (5, 4): Bit 1: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v8.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 57f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 57f\n"
+ "56:" // Oddments: Load input (5, 5): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "57:" // Oddments: Load input (5, 5): Bit 1: End
+ "fmla v31.4s, v4.4s, v9.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "tbz %x[n_channels], #1, 58f\n"
+ "st1 { v28.d }[0], [x14], #0x8\n"
+ "st1 { v29.d }[0], [x13], #0x8\n"
+ "st1 { v30.d }[0], [x12], #0x8\n"
+ "st1 { v31.d }[0], [x11], #0x8\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "st1 { v28.s }[2], [x14], #0x4\n"
+ "st1 { v29.s }[2], [x13], #0x4\n"
+ "st1 { v30.s }[2], [x12], #0x4\n"
+ "st1 { v31.s }[2], [x11], #0x4\n"
+ "b 59f\n"
+ "58:" // Oddments: Store: Bit 1: Unset
+ "st1 { v28.s }[0], [x14], #0x4\n"
+ "st1 { v29.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v31.s }[0], [x11], #0x4\n"
+ "59:" // Oddments: Store: Bit 1: End
+ "60:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
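Note: the "Oddments" paths above implement the channel-tail handling shared by these kernels: channels are consumed four at a time in the vector loop, and the remaining one to three channels are selected by testing the low bits of n_channels. A minimal scalar sketch of that dispatch, with a plain copy standing in for the real load/compute/store (the helper name is illustrative, not part of the patch):

    #include <cstring>

    // Tail dispatch mirroring the tbz/ld1/st1 sequences above: bit 1 of
    // n_channels selects a two-float (double-word) transfer, bit 0 a
    // single-float transfer.
    static void handle_channel_tail(const float *in, float *out, unsigned int n_channels)
    {
        unsigned int done = (n_channels >> 2) << 2;  // covered by the 4-wide vector loop
        if (n_channels & 0x2)                        // tbz %x[n_channels], #1, ...
        {
            std::memcpy(out + done, in + done, 2 * sizeof(float));
            done += 2;
        }
        if (n_channels & 0x1)                        // tbz %x[n_channels], #0, ...
        {
            out[done] = in[done];
        }
    }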
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..8a8060770c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const inptrs, float *const *const outptrs, const void *params, const void *bias, const unsigned int n_points, const unsigned int n_channels, const float activation_min, const float activation_max);
+
+class a64_fp32_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<float, float, float, float>
+{
+ KernelType kernel = a64_fp32_nhwc_generic_output9_mla_depthfirst_impl;
+
+ public:
+ a64_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<float, float, float, float>(9, arm_gemm::VLType::None) {}
+
+ KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..a2f577784f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const void *bias,
+ const unsigned int n_points,
+ const unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v2.4s }, [%x[minmax_vals]]\n"
+ "lsr x9, %x[n_channels], #0x2\n"
+ "add x20, %x[minmax_vals], #0x4\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "mov x11, #0x0\n"
+ "cbz x9, 5f\n"
+ "1:" // Channel loop
+ "movi v23.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q23, [%x[bias], x11]\n"
+ "2:" // Channel loop: Load bias: Done
+ "ldr q0, [%x[params], #0x0]\n"
+ "mov x26, %x[inptrs]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "subs x25, %x[n_points], #0x1\n"
+ "ldr q14, [x21, x11]\n"
+ "ldr q15, [x20, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q16, [x21, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr q17, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr q18, [x21, x11]\n"
+ "ldr q19, [x20, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q20, [x21, x11]\n"
+ "add %x[params], %x[params], #0x10\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x20, x24, [x26], #0x10\n"
+ "ldp x23, x22, [x26], #0x10\n"
+ "subs x25, x25, #0x1\n"
+ "fmla v23.4s, v14.4s, v0.4s\n"
+ "ldr q14, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "fmla v24.4s, v15.4s, v0.4s\n"
+ "fmla v25.4s, v16.4s, v0.4s\n"
+ "ldr q15, [x24, x11]\n"
+ "ldr q16, [x23, x11]\n"
+ "fmla v26.4s, v17.4s, v0.4s\n"
+ "fmla v27.4s, v18.4s, v0.4s\n"
+ "ldr q17, [x22, x11]\n"
+ "ldr q18, [x21, x11]\n"
+ "fmla v28.4s, v19.4s, v0.4s\n"
+ "fmla v29.4s, v20.4s, v0.4s\n"
+ "ldr q19, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "fmla v30.4s, v21.4s, v0.4s\n"
+ "fmla v31.4s, v22.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "ldr q20, [x21, x11]\n"
+ "add %x[params], %x[params], #0x10\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "fmla v23.4s, v14.4s, v0.4s\n"
+ "fmla v24.4s, v15.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v2.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "fmla v25.4s, v16.4s, v0.4s\n"
+ "fmla v26.4s, v17.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v2.4s\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "fmla v27.4s, v18.4s, v0.4s\n"
+ "fmla v28.4s, v19.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "fmla v29.4s, v20.4s, v0.4s\n"
+ "fmla v30.4s, v21.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v2.4s\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmla v31.4s, v22.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmax v28.4s, v28.4s, v2.4s\n"
+ "fmax v29.4s, v29.4s, v2.4s\n"
+ "fmax v30.4s, v30.4s, v2.4s\n"
+ "fmax v31.4s, v31.4s, v2.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "str q23, [x28, x11]\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "str q24, [x27, x11]\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "str q25, [x26, x11]\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "str q26, [x25, x11]\n"
+ "fmin v31.4s, v31.4s, v1.4s\n"
+ "str q27, [x24, x11]\n"
+ "str q28, [x23, x11]\n"
+ "str q29, [x22, x11]\n"
+ "str q30, [x21, x11]\n"
+ "str q31, [x20, x11]\n"
+ "add x11, x11, #0x10\n"
+ "cmp x11, x9, LSL #4\n"
+ "blt 1b\n"
+ "5:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 17f\n"
+ "movi v23.16b, #0x0\n"
+ "cbz %x[bias], 8f\n"
+ "add x20, %x[bias], x11\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load bias: Bit 1: End
+ "8:" // Oddments: Load bias: Done
+ "ldr q0, [%x[params], #0x0]\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "add %x[params], %x[params], #0x10\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
+ "b 10f\n"
+ "9:" // Oddments: Load: Bit 1: Unset
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
+ "10:" // Oddments: Load: Bit 1: End
+ "subs x20, %x[n_points], #0x1\n"
+ "ble 14f\n"
+ "11:" // Oddments: Planar loop
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "fmla v23.4s, v14.4s, v0.4s\n"
+ "fmla v24.4s, v15.4s, v0.4s\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "fmla v25.4s, v16.4s, v0.4s\n"
+ "fmla v26.4s, v17.4s, v0.4s\n"
+ "ldr x21, [x10], #0x8\n"
+ "fmla v27.4s, v18.4s, v0.4s\n"
+ "fmla v28.4s, v19.4s, v0.4s\n"
+ "add x9, x9, x11\n"
+ "fmla v29.4s, v20.4s, v0.4s\n"
+ "fmla v30.4s, v21.4s, v0.4s\n"
+ "add x28, x28, x11\n"
+ "add x27, x27, x11\n"
+ "fmla v31.4s, v22.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "add %x[params], %x[params], #0x10\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Planar loop: Load: Bit 1: Unset
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
+ "13:" // Oddments: Planar loop: Load: Bit 1: End
+ "subs x20, x20, #0x1\n"
+ "bgt 11b\n"
+ "14:" // Oddments: Planar tail
+ "fmla v23.4s, v14.4s, v0.4s\n"
+ "fmla v24.4s, v15.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v2.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "fmla v25.4s, v16.4s, v0.4s\n"
+ "fmla v26.4s, v17.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v2.4s\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "fmla v27.4s, v18.4s, v0.4s\n"
+ "fmla v28.4s, v19.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "fmla v29.4s, v20.4s, v0.4s\n"
+ "fmla v30.4s, v21.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v2.4s\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmla v31.4s, v22.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "fmax v28.4s, v28.4s, v2.4s\n"
+ "fmax v29.4s, v29.4s, v2.4s\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "fmax v30.4s, v30.4s, v2.4s\n"
+ "fmax v31.4s, v31.4s, v2.4s\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "fmin v31.4s, v31.4s, v1.4s\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "st1 { v23.d }[0], [x28], #0x8\n"
+ "st1 { v24.d }[0], [x27], #0x8\n"
+ "st1 { v25.d }[0], [x26], #0x8\n"
+ "st1 { v26.d }[0], [x25], #0x8\n"
+ "st1 { v27.d }[0], [x24], #0x8\n"
+ "st1 { v28.d }[0], [x23], #0x8\n"
+ "st1 { v29.d }[0], [x22], #0x8\n"
+ "st1 { v30.d }[0], [x21], #0x8\n"
+ "st1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "st1 { v23.s }[2], [x28], #0x4\n"
+ "st1 { v24.s }[2], [x27], #0x4\n"
+ "st1 { v25.s }[2], [x26], #0x4\n"
+ "st1 { v26.s }[2], [x25], #0x4\n"
+ "st1 { v27.s }[2], [x24], #0x4\n"
+ "st1 { v28.s }[2], [x23], #0x4\n"
+ "st1 { v29.s }[2], [x22], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
+ "st1 { v31.s }[2], [x20], #0x4\n"
+ "b 16f\n"
+ "15:" // Oddments: Store: Bit 1: Unset
+ "st1 { v23.s }[0], [x28], #0x4\n"
+ "st1 { v24.s }[0], [x27], #0x4\n"
+ "st1 { v25.s }[0], [x26], #0x4\n"
+ "st1 { v26.s }[0], [x25], #0x4\n"
+ "st1 { v27.s }[0], [x24], #0x4\n"
+ "st1 { v28.s }[0], [x23], #0x4\n"
+ "st1 { v29.s }[0], [x22], #0x4\n"
+ "st1 { v30.s }[0], [x21], #0x4\n"
+ "st1 { v31.s }[0], [x20], #0x4\n"
+ "16:" // Oddments: Store: Bit 1: End
+ "17:" // End
+ : [params] "+&r" (params)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
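For reference, a scalar sketch of what the generic output9 kernel above computes, per my reading of the assembly. The real params buffer is packed in four-channel blocks per filter point; the w[p * n_channels + c] layout below is an illustrative simplification:

    #include <algorithm>

    // Each of the 9 output points accumulates an optional per-channel bias
    // plus inptrs[p*9 + o][c] * weight over the n_points filter points, then
    // clamps the result to [activation_min, activation_max].
    static void generic_output9_mla_reference(
        const float *const *inptrs, float *const *outptrs,
        const float *w, const float *bias,
        unsigned int n_points, unsigned int n_channels,
        float activation_min, float activation_max)
    {
        for (unsigned int o = 0; o < 9; o++)
        {
            for (unsigned int c = 0; c < n_channels; c++)
            {
                float acc = (bias != nullptr) ? bias[c] : 0.0f;
                for (unsigned int p = 0; p < n_points; p++)
                {
                    acc += inptrs[p * 9 + o][c] * w[p * n_channels + c];
                }
                outptrs[o][c] = std::min(std::max(acc, activation_min), activation_max);
            }
        }
    }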
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6c07fa645c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const inptrs, float *const *const outptrs, const void *params, const unsigned int n_output_channels, const float activation_min, const float activation_max);
+
+struct a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
+{
+ using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(3, 3, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
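The constants in this strategy pin down the kernel's geometry: a 3x3 output tile (the first two Parent arguments, matching the output3x3 name), a 3x3 filter, and stride 2, which together imply a 7x7 input patch. A sketch of the standard relation:

    // Input extent implied by the constants above:
    // input = (output - 1) * stride + kernel.
    constexpr unsigned int output_rows = 3, output_cols = 3;
    constexpr unsigned int kernel_rows = 3, kernel_cols = 3;
    constexpr unsigned int stride_rows = 2, stride_cols = 2;
    constexpr unsigned int input_rows = (output_rows - 1) * stride_rows + kernel_rows; // 7
    constexpr unsigned int input_cols = (output_cols - 1) * stride_cols + kernel_cols; // 7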
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..9cafd23fb8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v27.4s }, [%x[clamps]]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "lsr x22, %x[channel_multiplier], #0x2\n"
+ "add x20, %x[clamps], #0x4\n"
+ "ldr q0, [x21, #0x0]\n"
+ "ldr q1, [x21, #0x10]\n"
+ "mov x21, #0x0\n"
+ "mov x14, #0x0\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q7, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "ldr q10, [x20, #0x0]\n"
+ "ldr q11, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "ldr q12, [x20, #0x0]\n"
+ "ldr q13, [x20, #0x10]\n"
+ "ldp x13, x12, [%x[outptrs], #0x0]\n"
+ "ldp x11, x10, [%x[outptrs], #0x10]\n"
+ "ldp x9, x28, [%x[outptrs], #0x20]\n"
+ "ldp x27, x26, [%x[outptrs], #0x30]\n"
+ "ldr x25, [%x[outptrs], #0x40]\n"
+ "cbz x22, 3f\n"
+ "ldr q14, [%x[params], #0x0]\n"
+ "ldr q31, [%x[params], #0x10]\n"
+ "subs x22, x22, #0x1\n"
+ "mov v15.16b, v14.16b\n"
+ "ldr q30, [%x[params], #0x20]\n"
+ "ldr q29, [%x[params], #0x30]\n"
+ "mov v16.16b, v14.16b\n"
+ "mov v17.16b, v14.16b\n"
+ "mov v18.16b, v14.16b\n"
+ "mov v19.16b, v14.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "mov v20.16b, v14.16b\n"
+ "mov v21.16b, v14.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "beq 2f\n"
+ "1:" // Output channel complete vector loop
+ "fmla v14.4s, v31.4s, v0.s[0]\n"
+ "fmla v15.4s, v31.4s, v0.s[2]\n"
+ "subs x22, x22, #0x1\n"
+ "add x21, x21, #0x4\n"
+ "fmla v16.4s, v31.4s, v1.s[0]\n"
+ "fmla v17.4s, v31.4s, v4.s[0]\n"
+ "fmla v18.4s, v31.4s, v4.s[2]\n"
+ "fmla v19.4s, v31.4s, v5.s[0]\n"
+ "fmla v20.4s, v31.4s, v8.s[0]\n"
+ "fmla v21.4s, v31.4s, v8.s[2]\n"
+ "fmla v22.4s, v31.4s, v9.s[0]\n"
+ "ldr q25, [%x[params], #0x0]\n"
+ "fmla v14.4s, v30.4s, v0.s[1]\n"
+ "fmla v15.4s, v30.4s, v0.s[3]\n"
+ "fmla v16.4s, v30.4s, v1.s[1]\n"
+ "fmla v17.4s, v30.4s, v4.s[1]\n"
+ "fmla v18.4s, v30.4s, v4.s[3]\n"
+ "fmla v19.4s, v30.4s, v5.s[1]\n"
+ "fmla v20.4s, v30.4s, v8.s[1]\n"
+ "fmla v21.4s, v30.4s, v8.s[3]\n"
+ "fmla v22.4s, v30.4s, v9.s[1]\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "fmla v14.4s, v29.4s, v0.s[2]\n"
+ "fmla v15.4s, v29.4s, v1.s[0]\n"
+ "fmla v16.4s, v29.4s, v1.s[2]\n"
+ "fmla v17.4s, v29.4s, v4.s[2]\n"
+ "fmla v18.4s, v29.4s, v5.s[0]\n"
+ "fmla v19.4s, v29.4s, v5.s[2]\n"
+ "fmla v20.4s, v29.4s, v8.s[2]\n"
+ "fmla v21.4s, v29.4s, v9.s[0]\n"
+ "fmla v22.4s, v29.4s, v9.s[2]\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x40]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x50]\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
+ "ldr q31, [%x[params], #0x70]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
+ "ldr q30, [%x[params], #0x80]\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
+ "str q14, [x13, x14]\n"
+ "ldr q14, [%x[params], #0x60]\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
+ "ldr q29, [%x[params], #0x90]\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "add %x[params], %x[params], #0xa0\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
+ "str q15, [x12, x14]\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
+ "str q16, [x11, x14]\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
+ "str q17, [x10, x14]\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "str q18, [x9, x14]\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "str q19, [x28, x14]\n"
+ "mov v15.16b, v14.16b\n"
+ "str q20, [x27, x14]\n"
+ "mov v16.16b, v14.16b\n"
+ "mov v17.16b, v14.16b\n"
+ "str q21, [x26, x14]\n"
+ "mov v18.16b, v14.16b\n"
+ "mov v19.16b, v14.16b\n"
+ "str q22, [x25, x14]\n"
+ "mov v20.16b, v14.16b\n"
+ "mov v21.16b, v14.16b\n"
+ "add x14, x14, #0x10\n"
+ "mov v22.16b, v14.16b\n"
+ "bgt 1b\n"
+ "2:" // Output channel complete vector tail
+ "fmla v14.4s, v31.4s, v0.s[0]\n"
+ "fmla v15.4s, v31.4s, v0.s[2]\n"
+ "fmla v16.4s, v31.4s, v1.s[0]\n"
+ "fmla v17.4s, v31.4s, v4.s[0]\n"
+ "fmla v18.4s, v31.4s, v4.s[2]\n"
+ "fmla v19.4s, v31.4s, v5.s[0]\n"
+ "fmla v20.4s, v31.4s, v8.s[0]\n"
+ "fmla v21.4s, v31.4s, v8.s[2]\n"
+ "fmla v22.4s, v31.4s, v9.s[0]\n"
+ "ldr q25, [%x[params], #0x0]\n"
+ "fmla v14.4s, v30.4s, v0.s[1]\n"
+ "fmla v15.4s, v30.4s, v0.s[3]\n"
+ "fmla v16.4s, v30.4s, v1.s[1]\n"
+ "fmla v17.4s, v30.4s, v4.s[1]\n"
+ "fmla v18.4s, v30.4s, v4.s[3]\n"
+ "fmla v19.4s, v30.4s, v5.s[1]\n"
+ "fmla v20.4s, v30.4s, v8.s[1]\n"
+ "fmla v21.4s, v30.4s, v8.s[3]\n"
+ "fmla v22.4s, v30.4s, v9.s[1]\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "fmla v14.4s, v29.4s, v0.s[2]\n"
+ "fmla v15.4s, v29.4s, v1.s[0]\n"
+ "fmla v16.4s, v29.4s, v1.s[2]\n"
+ "fmla v17.4s, v29.4s, v4.s[2]\n"
+ "fmla v18.4s, v29.4s, v5.s[0]\n"
+ "fmla v19.4s, v29.4s, v5.s[2]\n"
+ "fmla v20.4s, v29.4s, v8.s[2]\n"
+ "fmla v21.4s, v29.4s, v9.s[0]\n"
+ "fmla v22.4s, v29.4s, v9.s[2]\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x40]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
+ "str q14, [x13, x14]\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
+ "str q15, [x12, x14]\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
+ "str q16, [x11, x14]\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "str q17, [x10, x14]\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "str q18, [x9, x14]\n"
+ "str q19, [x28, x14]\n"
+ "str q20, [x27, x14]\n"
+ "str q21, [x26, x14]\n"
+ "str q22, [x25, x14]\n"
+ "add x14, x14, #0x10\n"
+ "3:" // Output channel oddments
+ "tst %x[channel_multiplier], #0x3\n"
+ "beq 6f\n"
+ "ldr q14, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "mov v15.16b, v14.16b\n"
+ "mov v16.16b, v14.16b\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "ldr q23, [%x[params], #0x30]\n"
+ "mov v17.16b, v14.16b\n"
+ "mov v18.16b, v14.16b\n"
+ "mov v19.16b, v14.16b\n"
+ "mov v20.16b, v14.16b\n"
+ "fmla v15.4s, v25.4s, v0.s[2]\n"
+ "mov v21.16b, v14.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "fmla v14.4s, v25.4s, v0.s[0]\n"
+ "fmla v16.4s, v25.4s, v1.s[0]\n"
+ "fmla v17.4s, v25.4s, v4.s[0]\n"
+ "fmla v18.4s, v25.4s, v4.s[2]\n"
+ "fmla v19.4s, v25.4s, v5.s[0]\n"
+ "fmla v20.4s, v25.4s, v8.s[0]\n"
+ "fmla v21.4s, v25.4s, v8.s[2]\n"
+ "fmla v22.4s, v25.4s, v9.s[0]\n"
+ "ldr q25, [%x[params], #0x40]\n"
+ "fmla v14.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v0.s[3]\n"
+ "fmla v16.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[3]\n"
+ "fmla v19.4s, v24.4s, v5.s[1]\n"
+ "fmla v20.4s, v24.4s, v8.s[1]\n"
+ "fmla v21.4s, v24.4s, v8.s[3]\n"
+ "fmla v22.4s, v24.4s, v9.s[1]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v14.4s, v23.4s, v0.s[2]\n"
+ "fmla v15.4s, v23.4s, v1.s[0]\n"
+ "fmla v16.4s, v23.4s, v1.s[2]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v5.s[0]\n"
+ "fmla v19.4s, v23.4s, v5.s[2]\n"
+ "fmla v20.4s, v23.4s, v8.s[2]\n"
+ "fmla v21.4s, v23.4s, v9.s[0]\n"
+ "fmla v22.4s, v23.4s, v9.s[2]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x70]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x80]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x90]\n"
+ "add %x[params], %x[params], #0xa0\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "tbz %x[channel_multiplier], #1, 4f\n"
+ "add x20, x13, x14\n"
+ "add x22, x12, x14\n"
+ "st1 { v14.d }[0], [x20]\n"
+ "add x21, x11, x14\n"
+ "add x20, x10, x14\n"
+ "st1 { v15.d }[0], [x22]\n"
+ "add x24, x9, x14\n"
+ "add x23, x28, x14\n"
+ "st1 { v16.d }[0], [x21]\n"
+ "add x22, x27, x14\n"
+ "add x21, x26, x14\n"
+ "st1 { v17.d }[0], [x20]\n"
+ "add x20, x25, x14\n"
+ "st1 { v18.d }[0], [x24]\n"
+ "add x14, x14, #0x8\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "st1 { v20.d }[0], [x22]\n"
+ "st1 { v21.d }[0], [x21]\n"
+ "st1 { v22.d }[0], [x20]\n"
+ "tbz %x[channel_multiplier], #0, 5f\n"
+ "add x20, x13, x14\n"
+ "add x22, x12, x14\n"
+ "st1 { v14.s }[2], [x20]\n"
+ "add x21, x11, x14\n"
+ "add x20, x10, x14\n"
+ "st1 { v15.s }[2], [x22]\n"
+ "add x24, x9, x14\n"
+ "add x23, x28, x14\n"
+ "st1 { v16.s }[2], [x21]\n"
+ "add x22, x27, x14\n"
+ "add x21, x26, x14\n"
+ "st1 { v17.s }[2], [x20]\n"
+ "add x20, x25, x14\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "st1 { v22.s }[2], [x20]\n"
+ "b 5f\n"
+ "4:" // Output channel oddments: Store: Bit 1: Unset
+ "add x20, x13, x14\n"
+ "add x22, x12, x14\n"
+ "st1 { v14.s }[0], [x20]\n"
+ "add x21, x11, x14\n"
+ "add x20, x10, x14\n"
+ "st1 { v15.s }[0], [x22]\n"
+ "add x24, x9, x14\n"
+ "add x23, x28, x14\n"
+ "st1 { v16.s }[0], [x21]\n"
+ "add x22, x27, x14\n"
+ "add x21, x26, x14\n"
+ "st1 { v17.s }[0], [x20]\n"
+ "add x20, x25, x14\n"
+ "st1 { v18.s }[0], [x24]\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "st1 { v20.s }[0], [x22]\n"
+ "st1 { v21.s }[0], [x21]\n"
+ "st1 { v22.s }[0], [x20]\n"
+ "5:" // Output channel oddments: Store: Bit 1: End
+ "6:" // End
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
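
The assembly above, like the other generated MLA depthwise kernels in this patch, follows one pattern: accumulators are seeded from the bias vector, a chain of fmla instructions multiply-accumulates each kernel-point weight vector against broadcast input lanes, and a trailing fmin/fmax pair applies the activation clamp before the stores. A minimal scalar sketch of that pattern follows; all names are purely illustrative (the real kernels operate on packed NEON q-registers, four output channels at a time).

// Illustrative scalar model of the generated MLA depthwise kernels.
// All names here are hypothetical; none of this is library API.
#include <algorithm>
#include <cstddef>

void depthwise_mla_reference(const float *weights,       // one weight per kernel point
                             const float *inputs,        // matching input sample per kernel point
                             float        bias,
                             std::size_t  kernel_points, // e.g. 9 for 3x3, 25 for 5x5
                             float        activation_min,
                             float        activation_max,
                             float       &out)
{
    float acc = bias; // accumulators start from the bias (the initial "mov vN.16b, v12.16b")
    for (std::size_t k = 0; k < kernel_points; ++k)
    {
        acc += weights[k] * inputs[k]; // the fmla chain
    }
    // the trailing fmin/fmax pair: clamp to the activation range
    out = std::max(activation_min, std::min(acc, activation_max));
}
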
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..9f514c78e7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
+{
+ using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
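
Each generated kernel is exposed through a small strategy struct like the one above: compile-time geometry (kernel size, strides, output tile) plus a pointer to the assembly entry point, so the depthwise framework can select a strategy at runtime. A minimal sketch of that idea, assuming nothing beyond the declaration above (these names are illustrative, not the library's actual interface):

// Hypothetical sketch of the strategy pattern used by these headers.
using KernelFn = void (*)(const float *const *, float *const *,
                          const void *, unsigned int, float, float);

struct DepthwiseStrategy
{
    unsigned int output_rows, output_cols; // output tile produced per call, e.g. 2x4 above
    unsigned int kernel_rows, kernel_cols; // e.g. 5x5 above
    KernelFn     kernel;                   // the *_impl assembly entry point
};

// Dispatch is then a plain indirect call through the selected strategy:
inline void run(const DepthwiseStrategy &s,
                const float *const *inptrs, float *const *outptrs,
                const void *params, unsigned int n_output_channels,
                float act_min, float act_max)
{
    s.kernel(inptrs, outptrs, params, n_output_channels, act_min, act_max);
}
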
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c9bb1f41da
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -0,0 +1,917 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v26.4s }, [%x[clamps]]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "lsr x22, %x[channel_multiplier], #0x2\n"
+ "add x20, %x[clamps], #0x4\n"
+ "ldr q0, [x21, #0x0]\n"
+ "ldr q1, [x21, #0x10]\n"
+ "mov x21, #0x0\n"
+ "mov x13, #0x0\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q7, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "ldr q10, [x20, #0x0]\n"
+ "ldr q11, [x20, #0x10]\n"
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
+ "ldp x28, x27, [%x[outptrs], #0x20]\n"
+ "ldp x26, x25, [%x[outptrs], #0x30]\n"
+ "cbz x22, 3f\n"
+ "ldr q12, [%x[params], #0x0]\n"
+ "ldr q31, [%x[params], #0x10]\n"
+ "subs x22, x22, #0x1\n"
+ "mov v13.16b, v12.16b\n"
+ "ldr q30, [%x[params], #0x20]\n"
+ "ldr q29, [%x[params], #0x30]\n"
+ "mov v14.16b, v12.16b\n"
+ "mov v15.16b, v12.16b\n"
+ "ldr q28, [%x[params], #0x40]\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "mov v16.16b, v12.16b\n"
+ "mov v17.16b, v12.16b\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v12.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ "beq 2f\n"
+ "1:" // Output channel complete vector loop
+ "fmla v12.4s, v31.4s, v0.s[0]\n"
+ "fmla v13.4s, v31.4s, v0.s[1]\n"
+ "subs x22, x22, #0x1\n"
+ "add x21, x21, #0x4\n"
+ "fmla v14.4s, v31.4s, v0.s[2]\n"
+ "fmla v15.4s, v31.4s, v0.s[3]\n"
+ "fmla v16.4s, v31.4s, v2.s[0]\n"
+ "fmla v17.4s, v31.4s, v2.s[1]\n"
+ "fmla v18.4s, v31.4s, v2.s[2]\n"
+ "fmla v19.4s, v31.4s, v2.s[3]\n"
+ "ldr q24, [%x[params], #0x0]\n"
+ "fmla v12.4s, v30.4s, v0.s[1]\n"
+ "fmla v13.4s, v30.4s, v0.s[2]\n"
+ "fmla v14.4s, v30.4s, v0.s[3]\n"
+ "fmla v15.4s, v30.4s, v1.s[0]\n"
+ "fmla v16.4s, v30.4s, v2.s[1]\n"
+ "fmla v17.4s, v30.4s, v2.s[2]\n"
+ "fmla v18.4s, v30.4s, v2.s[3]\n"
+ "fmla v19.4s, v30.4s, v3.s[0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v13.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v15.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v17.4s, v29.4s, v2.s[3]\n"
+ "fmla v18.4s, v29.4s, v3.s[0]\n"
+ "fmla v19.4s, v29.4s, v3.s[1]\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "fmla v12.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v14.4s, v28.4s, v1.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v16.4s, v28.4s, v2.s[3]\n"
+ "fmla v17.4s, v28.4s, v3.s[0]\n"
+ "fmla v18.4s, v28.4s, v3.s[1]\n"
+ "fmla v19.4s, v28.4s, v3.s[2]\n"
+ "ldr q21, [%x[params], #0x30]\n"
+ "fmla v12.4s, v27.4s, v1.s[0]\n"
+ "fmla v13.4s, v27.4s, v1.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[2]\n"
+ "fmla v15.4s, v27.4s, v1.s[3]\n"
+ "fmla v16.4s, v27.4s, v3.s[0]\n"
+ "fmla v17.4s, v27.4s, v3.s[1]\n"
+ "fmla v18.4s, v27.4s, v3.s[2]\n"
+ "fmla v19.4s, v27.4s, v3.s[3]\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0x80]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0x90]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x100]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x110]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x120]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x130]\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
+ "ldr q31, [%x[params], #0x150]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
+ "ldr q30, [%x[params], #0x160]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
+ "ldr q29, [%x[params], #0x170]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
+ "ldr q28, [%x[params], #0x180]\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "str q12, [x12, x13]\n"
+ "ldr q12, [%x[params], #0x140]\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
+ "ldr q27, [%x[params], #0x190]\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "add %x[params], %x[params], #0x1a0\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "str q13, [x11, x13]\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "str q14, [x10, x13]\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "str q15, [x9, x13]\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "str q16, [x28, x13]\n"
+ "str q17, [x27, x13]\n"
+ "mov v13.16b, v12.16b\n"
+ "mov v14.16b, v12.16b\n"
+ "str q18, [x26, x13]\n"
+ "mov v15.16b, v12.16b\n"
+ "mov v16.16b, v12.16b\n"
+ "str q19, [x25, x13]\n"
+ "mov v17.16b, v12.16b\n"
+ "mov v18.16b, v12.16b\n"
+ "add x13, x13, #0x10\n"
+ "mov v19.16b, v12.16b\n"
+ "bgt 1b\n"
+ "2:" // Output channel complete vector tail
+ "fmla v12.4s, v31.4s, v0.s[0]\n"
+ "fmla v13.4s, v31.4s, v0.s[1]\n"
+ "fmla v14.4s, v31.4s, v0.s[2]\n"
+ "fmla v15.4s, v31.4s, v0.s[3]\n"
+ "fmla v16.4s, v31.4s, v2.s[0]\n"
+ "fmla v17.4s, v31.4s, v2.s[1]\n"
+ "fmla v18.4s, v31.4s, v2.s[2]\n"
+ "fmla v19.4s, v31.4s, v2.s[3]\n"
+ "ldr q24, [%x[params], #0x0]\n"
+ "fmla v12.4s, v30.4s, v0.s[1]\n"
+ "fmla v13.4s, v30.4s, v0.s[2]\n"
+ "fmla v14.4s, v30.4s, v0.s[3]\n"
+ "fmla v15.4s, v30.4s, v1.s[0]\n"
+ "fmla v16.4s, v30.4s, v2.s[1]\n"
+ "fmla v17.4s, v30.4s, v2.s[2]\n"
+ "fmla v18.4s, v30.4s, v2.s[3]\n"
+ "fmla v19.4s, v30.4s, v3.s[0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v13.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v15.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v17.4s, v29.4s, v2.s[3]\n"
+ "fmla v18.4s, v29.4s, v3.s[0]\n"
+ "fmla v19.4s, v29.4s, v3.s[1]\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "fmla v12.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v14.4s, v28.4s, v1.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v16.4s, v28.4s, v2.s[3]\n"
+ "fmla v17.4s, v28.4s, v3.s[0]\n"
+ "fmla v18.4s, v28.4s, v3.s[1]\n"
+ "fmla v19.4s, v28.4s, v3.s[2]\n"
+ "ldr q21, [%x[params], #0x30]\n"
+ "fmla v12.4s, v27.4s, v1.s[0]\n"
+ "fmla v13.4s, v27.4s, v1.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[2]\n"
+ "fmla v15.4s, v27.4s, v1.s[3]\n"
+ "fmla v16.4s, v27.4s, v3.s[0]\n"
+ "fmla v17.4s, v27.4s, v3.s[1]\n"
+ "fmla v18.4s, v27.4s, v3.s[2]\n"
+ "fmla v19.4s, v27.4s, v3.s[3]\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0x80]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0x90]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x100]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x110]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x120]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x130]\n"
+ "add %x[params], %x[params], #0x140\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "str q12, [x12, x13]\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "str q13, [x11, x13]\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "str q14, [x10, x13]\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "str q15, [x9, x13]\n"
+ "str q16, [x28, x13]\n"
+ "str q17, [x27, x13]\n"
+ "str q18, [x26, x13]\n"
+ "str q19, [x25, x13]\n"
+ "add x13, x13, #0x10\n"
+ "3:" // Output channel oddments
+ "tst %x[channel_multiplier], #0x3\n"
+ "beq 6f\n"
+ "ldr q12, [%x[params], #0x0]\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "mov v13.16b, v12.16b\n"
+ "mov v14.16b, v12.16b\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v15.16b, v12.16b\n"
+ "mov v16.16b, v12.16b\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ "mov v17.16b, v12.16b\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v12.16b\n"
+ "fmla v12.4s, v24.4s, v0.s[0]\n"
+ "fmla v13.4s, v24.4s, v0.s[1]\n"
+ "fmla v14.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v0.s[3]\n"
+ "fmla v16.4s, v24.4s, v2.s[0]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v18.4s, v24.4s, v2.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "ldr q24, [%x[params], #0x60]\n"
+ "fmla v12.4s, v23.4s, v0.s[1]\n"
+ "fmla v13.4s, v23.4s, v0.s[2]\n"
+ "fmla v14.4s, v23.4s, v0.s[3]\n"
+ "fmla v15.4s, v23.4s, v1.s[0]\n"
+ "fmla v16.4s, v23.4s, v2.s[1]\n"
+ "fmla v17.4s, v23.4s, v2.s[2]\n"
+ "fmla v18.4s, v23.4s, v2.s[3]\n"
+ "fmla v19.4s, v23.4s, v3.s[0]\n"
+ "ldr q23, [%x[params], #0x70]\n"
+ "fmla v12.4s, v22.4s, v0.s[2]\n"
+ "fmla v13.4s, v22.4s, v0.s[3]\n"
+ "fmla v14.4s, v22.4s, v1.s[0]\n"
+ "fmla v15.4s, v22.4s, v1.s[1]\n"
+ "fmla v16.4s, v22.4s, v2.s[2]\n"
+ "fmla v17.4s, v22.4s, v2.s[3]\n"
+ "fmla v18.4s, v22.4s, v3.s[0]\n"
+ "fmla v19.4s, v22.4s, v3.s[1]\n"
+ "ldr q22, [%x[params], #0x80]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
+ "fmla v13.4s, v21.4s, v1.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v15.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "fmla v17.4s, v21.4s, v3.s[0]\n"
+ "fmla v18.4s, v21.4s, v3.s[1]\n"
+ "fmla v19.4s, v21.4s, v3.s[2]\n"
+ "ldr q21, [%x[params], #0x90]\n"
+ "fmla v12.4s, v20.4s, v1.s[0]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v14.4s, v20.4s, v1.s[2]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "fmla v16.4s, v20.4s, v3.s[0]\n"
+ "fmla v17.4s, v20.4s, v3.s[1]\n"
+ "fmla v18.4s, v20.4s, v3.s[2]\n"
+ "fmla v19.4s, v20.4s, v3.s[3]\n"
+ "ldr q20, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0x100]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0x110]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0x120]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0x130]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0x140]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0x150]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x160]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x170]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x180]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x190]\n"
+ "add %x[params], %x[params], #0x1a0\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "tbz %x[channel_multiplier], #1, 4f\n"
+ "add x20, x12, x13\n"
+ "add x21, x11, x13\n"
+ "st1 { v12.d }[0], [x20]\n"
+ "add x20, x10, x13\n"
+ "add x24, x9, x13\n"
+ "st1 { v13.d }[0], [x21]\n"
+ "add x23, x28, x13\n"
+ "add x22, x27, x13\n"
+ "st1 { v14.d }[0], [x20]\n"
+ "add x21, x26, x13\n"
+ "add x20, x25, x13\n"
+ "st1 { v15.d }[0], [x24]\n"
+ "st1 { v16.d }[0], [x23]\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v17.d }[0], [x22]\n"
+ "st1 { v18.d }[0], [x21]\n"
+ "st1 { v19.d }[0], [x20]\n"
+ "tbz %x[channel_multiplier], #0, 5f\n"
+ "add x20, x12, x13\n"
+ "add x21, x11, x13\n"
+ "st1 { v12.s }[2], [x20]\n"
+ "add x20, x10, x13\n"
+ "add x24, x9, x13\n"
+ "st1 { v13.s }[2], [x21]\n"
+ "add x23, x28, x13\n"
+ "add x22, x27, x13\n"
+ "st1 { v14.s }[2], [x20]\n"
+ "add x21, x26, x13\n"
+ "add x20, x25, x13\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "st1 { v19.s }[2], [x20]\n"
+ "b 5f\n"
+ "4:" // Output channel oddments: Store: Bit 1: Unset
+ "add x20, x12, x13\n"
+ "add x21, x11, x13\n"
+ "st1 { v12.s }[0], [x20]\n"
+ "add x20, x10, x13\n"
+ "add x24, x9, x13\n"
+ "st1 { v13.s }[0], [x21]\n"
+ "add x23, x28, x13\n"
+ "add x22, x27, x13\n"
+ "st1 { v14.s }[0], [x20]\n"
+ "add x21, x26, x13\n"
+ "add x20, x25, x13\n"
+ "st1 { v15.s }[0], [x24]\n"
+ "st1 { v16.s }[0], [x23]\n"
+ "st1 { v17.s }[0], [x22]\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "st1 { v19.s }[0], [x20]\n"
+ "5:" // Output channel oddments: Store: Bit 1: End
+ "6:" // End
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
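
Both generic.cpp files above handle the leftover output channels (the "oddments") identically: after the main four-wide loop, bit 1 of the remaining channel count selects a two-float store (the "st1 { v.d }[0]" forms, with the byte offset then advanced by #0x8) and bit 0 selects a single-float store (the "st1 { v.s }" forms). A scalar sketch of that tail, with illustrative names:

// Illustrative model of the oddments store: write only the first
// `remainder` (= n_output_channels % 4) lanes of a 4-wide accumulator.
#include <cstring>

void store_oddments(float *dst, const float acc[4], unsigned int remainder)
{
    unsigned int pos = 0;
    if (remainder & 2) // the "tbz ..., #1" branch: store lanes 0..1 as one d-lane
    {
        std::memcpy(dst, acc, 2 * sizeof(float));
        pos = 2;       // the "#0x8" offset bump between the two stores
    }
    if (remainder & 1) // the "tbz ..., #0" branch: store one more s-lane
    {
        dst[pos] = acc[pos];
    }
}
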
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..3bece73973
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+struct a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>
+{
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>;
+ a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)
+ {
+ }
+ Parent::KernelType kernel = a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..cc18dd4bb4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,850 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const float *weights,
+ const float *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v12.4s }, [%x[minmax_vals]]\n"
+ "lsr x11, %x[n_output_channels], #0x2\n"
+ "add x20, %x[minmax_vals], #0x4\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "mov x10, #0x0\n"
+ "cbz x11, 8f\n"
+ "1:" // Output channel loop
+ "movi v31.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x10, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "ldr q10, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x23, 6f\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q8, [x21, #0x0]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "beq 4f\n"
+ "3:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "ldr q8, [x21, #0x0]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr q9, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 3b\n"
+ "4:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 5f\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "lsl x28, x10, #0x2\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "b 7f\n"
+ "5:" // Output channel loop: Odd tail
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "ldp x20, x9, [x22], #0x10\n"
+ "lsl x28, x10, #0x2\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q2, [x9, #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q1, [%x[weights], #0x0]\n"
+ "ldr q0, [x9, #0x10]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmla v16.4s, v1.4s, v4.s[0]\n"
+ "fmla v17.4s, v1.4s, v4.s[1]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "fmla v18.4s, v1.4s, v4.s[2]\n"
+ "fmla v19.4s, v1.4s, v4.s[3]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "fmla v20.4s, v1.4s, v3.s[0]\n"
+ "fmla v21.4s, v1.4s, v3.s[1]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "fmla v22.4s, v1.4s, v3.s[2]\n"
+ "fmla v23.4s, v1.4s, v3.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v2.s[0]\n"
+ "fmla v25.4s, v1.4s, v2.s[1]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmla v26.4s, v1.4s, v2.s[2]\n"
+ "fmla v27.4s, v1.4s, v2.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v0.s[0]\n"
+ "fmla v29.4s, v1.4s, v0.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmla v30.4s, v1.4s, v0.s[2]\n"
+ "fmla v31.4s, v1.4s, v0.s[3]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "lsl x28, x10, #0x2\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "7:" // Output channel loop: Done
+ "add x10, x10, #0x4\n"
+ "cmp x10, x11, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 19f\n"
+ "8:" // Output channel oddments
+ "movi v31.16b, #0x0\n"
+ "cbz %x[bias], 11f\n"
+ "add x20, %x[bias], x10, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 9f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 10f\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Output channel oddments: Load bias: Bit 1: Unset
+ "ld1 { v31.s }[0], [x20]\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: End
+ "11:" // Output channel oddments: Load bias: Done
+ "ldr q10, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x23, 15f\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q8, [x21, #0x0]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "beq 13f\n"
+ "12:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "ldr q8, [x21, #0x0]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr q9, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 12b\n"
+ "13:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 14f\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "b 16f\n"
+ "14:" // Output channel oddments: Odd tail
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q3, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "ldr q0, [%x[weights], #0x0]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmla v16.4s, v0.4s, v4.s[0]\n"
+ "fmla v17.4s, v0.4s, v4.s[1]\n"
+ "fmla v18.4s, v0.4s, v4.s[2]\n"
+ "fmla v19.4s, v0.4s, v4.s[3]\n"
+ "fmla v20.4s, v0.4s, v3.s[0]\n"
+ "fmla v21.4s, v0.4s, v3.s[1]\n"
+ "fmla v22.4s, v0.4s, v3.s[2]\n"
+ "fmla v23.4s, v0.4s, v3.s[3]\n"
+ "fmla v24.4s, v0.4s, v2.s[0]\n"
+ "fmla v25.4s, v0.4s, v2.s[1]\n"
+ "fmla v26.4s, v0.4s, v2.s[2]\n"
+ "fmla v27.4s, v0.4s, v2.s[3]\n"
+ "fmla v28.4s, v0.4s, v1.s[0]\n"
+ "fmla v29.4s, v0.4s, v1.s[1]\n"
+ "fmla v30.4s, v0.4s, v1.s[2]\n"
+ "fmla v31.4s, v0.4s, v1.s[3]\n"
+ "b 16f\n"
+ "15:" // Output channel oddments: Single kernel point
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "16:" // Output channel oddments: Done
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "tbz %x[n_output_channels], #1, 17f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.d }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v17.d }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.d }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.d }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "add x10, x10, #0x2\n"
+ "st1 { v24.d }[0], [x27]\n"
+ "st1 { v25.d }[0], [x26]\n"
+ "st1 { v26.d }[0], [x25]\n"
+ "st1 { v27.d }[0], [x24]\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 18f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.s }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v24.s }[2], [x27]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.s }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v17.s }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.s }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.s }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v24.s }[0], [x27]\n"
+ "st1 { v25.s }[0], [x26]\n"
+ "st1 { v26.s }[0], [x25]\n"
+ "st1 { v27.s }[0], [x24]\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "18:" // Output channel oddments: Done: Store: Bit 1: End
+ "19:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
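+
+// A scalar sketch of the computation the assembly above performs, written out
+// for readability. The pointer layout is read off the instruction stream: two
+// input pointers per kernel point (eight fp32 activations each, covering the
+// sixteen output points held in v16-v31); packed_weight() is a hypothetical
+// accessor standing in for the interleaved weight buffer, and minmax_vals is
+// assumed to hold {min, max}. Kept out of the build as illustration only.
+#if 0
+static float packed_weight(const float *weights, unsigned int output_channel,
+                           unsigned int kernel_point); // hypothetical accessor
+
+static void reference_sketch(const float *weights, const float *bias,
+                             const float *const *inptrs, float *const *outptrs,
+                             unsigned int n_output_channels,
+                             unsigned int kernel_points,
+                             const float *minmax_vals)
+{
+  const float min_val = minmax_vals[0]; // assumed order: {min, max}
+  const float max_val = minmax_vals[1];
+  for (unsigned int co = 0; co < n_output_channels; co++)
+  {
+    for (unsigned int p = 0; p < 16; p++) // one accumulator per v16..v31
+    {
+      float acc = (bias != nullptr) ? bias[co] : 0.0f;
+      for (unsigned int k = 0; k < kernel_points; k++)
+      {
+        // Two input pointers per kernel point, eight activations each.
+        const float in = inptrs[2 * k + p / 8][p % 8];
+        acc += packed_weight(weights, co, k) * in;
+      }
+      acc = (acc < max_val) ? acc : max_val; // fmin against v11
+      acc = (acc > min_val) ? acc : min_val; // fmax against v12
+      outptrs[p][co] = acc;
+    }
+  }
+}
+#endif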
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..e51031ccdb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
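+// Kernel entry point, defined in the accompanying generic.cpp. Parameter
+// names are omitted here; in the definition they are, in order: n_channels,
+// inptrs (input pointer array), params (packed weights/bias buffer), an
+// unused pointer, qp (Requantize32 parameters), two more unused pointers,
+// and outptrs (output pointer array).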
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int, const int8_t *const *const, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *const);
+
+class a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_a64_s8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_a64_s8q_3x3_dot::pack_parameters(
+ args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
+};
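+
+// Intended use, sketched from the interface above: reserve
+// get_storage_size(args) bytes, interleave the weights and biases into that
+// buffer with pack_parameters(), then call the function returned by
+// get_kernel() on the prepared input/output pointer arrays together with the
+// Requantize32 parameters.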
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..916c8a4afe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1658 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
+{
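+ // Summary of the requantization scheme, as read from the instruction stream
+ // below: v12 is filled with bytes {1,1,1,0} per 32-bit lane (w20 =
+ // 0x00010101), so an sdot against it accumulates the sum of the three live
+ // input bytes feeding each output lane's dot product. That input sum,
+ // multiplied by the weight offset (v16 = b_offset), is subtracted from each
+ // accumulator with mls; the result is then scaled with a saturating rounding
+ // doubling multiply (sqrdmulh) plus the usual sign fix-up (and/sshr/sqadd)
+ // and a rounding right shift (srshl), both taken from the packed parameters,
+ // offset by v14 = c_offset, clamped to [v13 = minval, v11 = maxval], and
+ // narrowed to bytes with uzp1 before each four-byte store.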
+ __asm__ __volatile__(
+ "mov x20, #0x1\n"
+ "orr x20, x20, #0x100\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "orr x20, x20, #0x10000\n"
+ "lsr x11, %x[n_channels], #0x4\n"
+ "dup v12.4s, w20\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
+ "cbz x11, 3f\n"
+ "ldr q15, [x15, x28]\n"
+ "ldr q28, [x14, x28]\n"
+ "subs x11, x11, #0x1\n"
+ "ldr q30, [x13, x28]\n"
+ "ldr q8, [x12, x28]\n"
+ "zip2 v19.16b, v15.16b, v30.16b\n"
+ "zip1 v15.16b, v15.16b, v30.16b\n"
+ "ldr q26, [x10, x28]\n"
+ "ldr q0, [x9, x28]\n"
+ "zip1 v7.16b, v28.16b, v8.16b\n"
+ "zip2 v8.16b, v28.16b, v8.16b\n"
+ "ldr q29, [x26, x28]\n"
+ "ldr q10, [x21, x28]\n"
+ "zip2 v25.16b, v15.16b, v7.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip1 v7.16b, v19.16b, v8.16b\n"
+ "zip2 v8.16b, v19.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x30]\n"
+ "zip2 v21.16b, v26.16b, v29.16b\n"
+ "zip1 v26.16b, v26.16b, v29.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "zip1 v27.16b, v0.16b, v10.16b\n"
+ "zip2 v10.16b, v0.16b, v10.16b\n"
+ "ldr q17, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v23.16b, v26.16b, v27.16b\n"
+ "zip1 v26.16b, v26.16b, v27.16b\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "zip2 v28.16b, v22.16b, v9.16b\n"
+ "zip1 v22.16b, v22.16b, v9.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "zip1 v24.16b, v17.16b, v5.16b\n"
+ "zip2 v5.16b, v17.16b, v5.16b\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v3.16b, v21.16b, v10.16b\n"
+ "zip2 v10.16b, v21.16b, v10.16b\n"
+ "ldr q4, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "zip2 v17.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v4.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v19.16b, v22.16b, v24.16b\n"
+ "zip1 v22.16b, v22.16b, v24.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip2 v24.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v2.16b, v17.16b, v9.16b\n"
+ "zip2 v9.16b, v17.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4e9a9595 // sdot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e8f943f // sdot v31.4s, v1.16b, v15.16b\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4e969595 // sdot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9a943d // sdot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x4e9b9591 // sdot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9595 // sdot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9592 // sdot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96969f // sdot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f943e // sdot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e9a943c // sdot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x4e969592 // sdot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b969d // sdot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
+ "ldr q26, [%x[params], #0x10]\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x4e9b9595 // sdot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9592 // sdot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e96969e // sdot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v15.16b, v31.16b, v26.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v15.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v21.16b, v29.16b, v26.16b\n"
+ "and v17.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e979596 // sdot v22.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x4e939596 // sdot v22.4s, v12.16b, v19.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v6.16b, v22.16b\n .inst 0x4e989586 // sdot v6.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v30.16b, v26.16b\n"
+ ".inst 0x4e999596 // sdot v22.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v29.16b, v26.16b\n"
+ "mov v21.16b, v26.16b\n"
+ ".inst 0x4e9995fa // sdot v26.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e9795fd // sdot v29.4s, v15.16b, v23.16b\n"
+ ".inst 0x4e97965a // sdot v26.4s, v18.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x4e9995fe // sdot v30.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e9795f5 // sdot v21.4s, v15.16b, v23.16b\n"
+ ".inst 0x4e97959c // sdot v28.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e93965d // sdot v29.4s, v18.16b, v19.16b\n"
+ ".inst 0x4e93977a // sdot v26.4s, v27.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97965e // sdot v30.4s, v18.16b, v23.16b\n"
+ "ldr q4, [x9, x28]\n"
+ ".inst 0x4e939655 // sdot v21.4s, v18.16b, v19.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e93959c // sdot v28.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e98977d // sdot v29.4s, v27.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e93977e // sdot v30.4s, v27.16b, v19.16b\n"
+ ".inst 0x4e989775 // sdot v21.4s, v27.16b, v24.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "mov v17.16b, v28.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e99959c // sdot v28.4s, v12.16b, v25.16b\n"
+ "ldr q31, [x14, x28]\n"
+ "mls v30.4s, v28.4s, v16.4s\n"
+ "mls v29.4s, v6.4s, v16.4s\n"
+ "mls v21.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v30.16b, v20.16b\n"
+ "and v6.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v21.16b, v20.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0xa0]\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x90]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v21.4s, v21.4s, v20.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e839596 // sdot v22.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809596 // sdot v22.4s, v12.16b, v0.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x80]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "mov v18.16b, v22.16b\n .inst 0x4e829592 // sdot v18.4s, v12.16b, v2.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ ".inst 0x4e879596 // sdot v22.4s, v12.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v6.16b, v26.16b\n"
+ "str s21, [x22, x27]\n"
+ "mov v25.16b, v26.16b\n"
+ "mov v20.16b, v26.16b\n"
+ ".inst 0x4e8795fa // sdot v26.4s, v15.16b, v7.16b\n"
+ ".inst 0x4e8395f9 // sdot v25.4s, v15.16b, v3.16b\n"
+ ".inst 0x4e83979a // sdot v26.4s, v28.16b, v3.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4e8795e6 // sdot v6.4s, v15.16b, v7.16b\n"
+ ".inst 0x4e8395f4 // sdot v20.4s, v15.16b, v3.16b\n"
+ ".inst 0x4e839597 // sdot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809799 // sdot v25.4s, v28.16b, v0.16b\n"
+ ".inst 0x4e80971a // sdot v26.4s, v24.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e839786 // sdot v6.4s, v28.16b, v3.16b\n"
+ "ldr q19, [x26, x28]\n"
+ ".inst 0x4e809794 // sdot v20.4s, v28.16b, v0.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e809597 // sdot v23.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e829719 // sdot v25.4s, v24.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e809706 // sdot v6.4s, v24.16b, v0.16b\n"
+ ".inst 0x4e829714 // sdot v20.4s, v24.16b, v2.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+ "mov v17.16b, v23.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879597 // sdot v23.4s, v12.16b, v7.16b\n"
+ "ldr q21, [x13, x28]\n"
+ "mls v6.4s, v23.4s, v16.4s\n"
+ "mls v25.4s, v18.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v6.4s, v6.4s, v27.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q15, [%x[params], #0x120]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v6.16b, v1.16b\n"
+ "and v22.16b, v25.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "ldr q30, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v22.4s\n"
+ "ldr q27, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v6.4s, v6.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v6.4s, v6.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v6.4s, v6.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v6.4s, v6.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v0.4s, #0x0\n"
+ ".inst 0x4e8a9580 // sdot v0.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e859580 // sdot v0.4s, v12.16b, v5.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q28, [%x[params], #0xe0]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v22.16b, v0.16b\n .inst 0x4e899596 // sdot v22.4s, v12.16b, v9.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s6, [x24, x27]\n"
+ ".inst 0x4e889580 // sdot v0.4s, v12.16b, v8.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s25, [x23, x27]\n"
+ "mov v29.16b, v28.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v25.16b, v28.16b\n"
+ "mov v7.16b, v28.16b\n"
+ ".inst 0x4e88971c // sdot v28.4s, v24.16b, v8.16b\n"
+ ".inst 0x4e8a9719 // sdot v25.4s, v24.16b, v10.16b\n"
+ ".inst 0x4e8a97dc // sdot v28.4s, v30.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e88971d // sdot v29.4s, v24.16b, v8.16b\n"
+ ".inst 0x4e8a9707 // sdot v7.4s, v24.16b, v10.16b\n"
+ ".inst 0x4e8a9591 // sdot v17.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8597d9 // sdot v25.4s, v30.16b, v5.16b\n"
+ ".inst 0x4e85977c // sdot v28.4s, v27.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a97dd // sdot v29.4s, v30.16b, v10.16b\n"
+ "ldr q10, [x21, x28]\n"
+ ".inst 0x4e8597c7 // sdot v7.4s, v30.16b, v5.16b\n"
+ "mls v28.4s, v0.4s, v16.4s\n"
+ ".inst 0x4e859591 // sdot v17.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e899779 // sdot v25.4s, v27.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e85977d // sdot v29.4s, v27.16b, v5.16b\n"
+ ".inst 0x4e899767 // sdot v7.4s, v27.16b, v9.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v15.4s\n"
+ "mov v18.16b, v17.16b\n .inst 0x4e899592 // sdot v18.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889591 // sdot v17.4s, v12.16b, v8.16b\n"
+ "ldr q8, [x12, x28]\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v25.4s, v22.4s, v16.4s\n"
+ "mls v7.4s, v18.4s, v16.4s\n"
+ "and v17.16b, v28.16b, v23.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v15.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "ldr q15, [x15, x28]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "ldr q3, [x20, x28]\n"
+ "and v24.16b, v29.16b, v23.16b\n"
+ "and v20.16b, v25.16b, v23.16b\n"
+ "and v17.16b, v7.16b, v23.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q2, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "sqadd v29.4s, v29.4s, v24.4s\n"
+ "ldr q6, [%x[params], #0x160]\n"
+ "sqadd v25.4s, v25.4s, v20.4s\n"
+ "ldr q20, [%x[params], #0x170]\n"
+ "sqadd v7.4s, v7.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x150]\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v23.4s\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "srshl v7.4s, v7.4s, v23.4s\n"
+ "ldr q26, [x10, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v7.4s, v7.4s, v14.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v7.4s, v7.4s, v13.4s\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "smin v7.4s, v7.4s, v11.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s28, [x25, x27]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "zip2 v17.16b, v15.16b, v21.16b\n"
+ "zip1 v15.16b, v15.16b, v21.16b\n"
+ "zip1 v18.16b, v31.16b, v8.16b\n"
+ "zip2 v8.16b, v31.16b, v8.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s29, [x24, x27]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str s25, [x23, x27]\n"
+ "zip2 v25.16b, v15.16b, v18.16b\n"
+ "str s7, [x22, x27]\n"
+ "zip1 v15.16b, v15.16b, v18.16b\n"
+ "zip1 v7.16b, v17.16b, v8.16b\n"
+ "add x27, x27, #0x4\n"
+ "zip2 v8.16b, v17.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
+ "zip2 v29.16b, v26.16b, v19.16b\n"
+ "add %x[params], %x[params], #0x180\n"
+ "zip1 v26.16b, v26.16b, v19.16b\n"
+ "zip1 v28.16b, v4.16b, v10.16b\n"
+ "zip2 v10.16b, v4.16b, v10.16b\n"
+ "zip2 v24.16b, v22.16b, v2.16b\n"
+ "zip1 v22.16b, v22.16b, v2.16b\n"
+ "zip1 v21.16b, v3.16b, v5.16b\n"
+ "zip2 v5.16b, v3.16b, v5.16b\n"
+ "zip2 v18.16b, v27.16b, v23.16b\n"
+ "zip1 v27.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v30.16b, v9.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "zip2 v23.16b, v26.16b, v28.16b\n"
+ "zip1 v26.16b, v26.16b, v28.16b\n"
+ "zip1 v3.16b, v29.16b, v10.16b\n"
+ "zip2 v10.16b, v29.16b, v10.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v0.16b, v24.16b, v5.16b\n"
+ "zip2 v5.16b, v24.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v17.16b\n"
+ "zip1 v27.16b, v27.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "bgt 1b\n"
+ "2:" // Detached iteration
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4e9a9595 // sdot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e8f943f // sdot v31.4s, v1.16b, v15.16b\n"
+ "tst %x[n_channels], #0xf\n"
+ ".inst 0x4e969595 // sdot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9a943d // sdot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x4e9b9591 // sdot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9595 // sdot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9592 // sdot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96969f // sdot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f943e // sdot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e9a943c // sdot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x4e969592 // sdot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b969d // sdot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x4e9b9595 // sdot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9592 // sdot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e96969e // sdot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v27.16b, v31.16b, v4.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v27.4s\n"
+ "and v20.16b, v30.16b, v4.16b\n"
+ "and v18.16b, v29.16b, v4.16b\n"
+ "and v17.16b, v28.16b, v4.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "ldr q27, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x4e979581 // sdot v1.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x4e939581 // sdot v1.4s, v12.16b, v19.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v22.16b, v1.16b\n .inst 0x4e989596 // sdot v22.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v29.16b, v31.16b\n"
+ ".inst 0x4e999581 // sdot v1.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x4e9994df // sdot v31.4s, v6.16b, v25.16b\n"
+ ".inst 0x4e9794d5 // sdot v21.4s, v6.16b, v23.16b\n"
+ ".inst 0x4e97977f // sdot v31.4s, v27.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e9994dd // sdot v29.4s, v6.16b, v25.16b\n"
+ ".inst 0x4e9794d4 // sdot v20.4s, v6.16b, v23.16b\n"
+ ".inst 0x4e979592 // sdot v18.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
+ ".inst 0x4e93975f // sdot v31.4s, v26.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97977d // sdot v29.4s, v27.16b, v23.16b\n"
+ ".inst 0x4e939774 // sdot v20.4s, v27.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x4e939592 // sdot v18.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e989755 // sdot v21.4s, v26.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e93975d // sdot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x4e989754 // sdot v20.4s, v26.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v15.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e999592 // sdot v18.4s, v12.16b, v25.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v4.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v15.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v15.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v29.16b, v4.16b\n"
+ "and v18.16b, v21.16b, v4.16b\n"
+ "and v17.16b, v20.16b, v4.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v19.4s\n"
+ "ldr q26, [%x[params], #0xa0]\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "ldr q25, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x90]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4e839597 // sdot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809597 // sdot v23.4s, v12.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x80]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v22.16b, v23.16b\n .inst 0x4e829596 // sdot v22.4s, v12.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x24, x27]\n"
+ ".inst 0x4e879597 // sdot v23.4s, v12.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s21, [x23, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v4.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x4e87971f // sdot v31.4s, v24.16b, v7.16b\n"
+ ".inst 0x4e839704 // sdot v4.4s, v24.16b, v3.16b\n"
+ ".inst 0x4e83975f // sdot v31.4s, v26.16b, v3.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e879715 // sdot v21.4s, v24.16b, v7.16b\n"
+ ".inst 0x4e839714 // sdot v20.4s, v24.16b, v3.16b\n"
+ ".inst 0x4e839592 // sdot v18.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809744 // sdot v4.4s, v26.16b, v0.16b\n"
+ ".inst 0x4e80973f // sdot v31.4s, v25.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e839755 // sdot v21.4s, v26.16b, v3.16b\n"
+ ".inst 0x4e809754 // sdot v20.4s, v26.16b, v0.16b\n"
+ "mls v31.4s, v23.4s, v16.4s\n"
+ ".inst 0x4e809592 // sdot v18.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e829724 // sdot v4.4s, v25.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e809735 // sdot v21.4s, v25.16b, v0.16b\n"
+ ".inst 0x4e829734 // sdot v20.4s, v25.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v27.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879592 // sdot v18.4s, v12.16b, v7.16b\n"
+ "mls v21.4s, v18.4s, v16.4s\n"
+ "mls v4.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v21.16b, v1.16b\n"
+ "and v18.16b, v4.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sqadd v21.4s, v21.4s, v19.4s\n"
+ "ldr q29, [%x[params], #0x100]\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q27, [%x[params], #0xf0]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v4.4s, v4.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q26, [%x[params], #0x130]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v4.4s, v4.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v4.4s, v4.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v4.4s, v4.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x4e8a9599 // sdot v25.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e859599 // sdot v25.4s, v12.16b, v5.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q24, [%x[params], #0xe0]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v23.16b, v25.16b\n .inst 0x4e899597 // sdot v23.4s, v12.16b, v9.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s21, [x24, x27]\n"
+ ".inst 0x4e889599 // sdot v25.4s, v12.16b, v8.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s4, [x23, x27]\n"
+ "mov v22.16b, v24.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v21.16b, v24.16b\n"
+ "mov v20.16b, v24.16b\n"
+ ".inst 0x4e889778 // sdot v24.4s, v27.16b, v8.16b\n"
+ ".inst 0x4e8a9775 // sdot v21.4s, v27.16b, v10.16b\n"
+ ".inst 0x4e8a97b8 // sdot v24.4s, v29.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e889776 // sdot v22.4s, v27.16b, v8.16b\n"
+ ".inst 0x4e8a9774 // sdot v20.4s, v27.16b, v10.16b\n"
+ ".inst 0x4e8a9592 // sdot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8597b5 // sdot v21.4s, v29.16b, v5.16b\n"
+ ".inst 0x4e859798 // sdot v24.4s, v28.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a97b6 // sdot v22.4s, v29.16b, v10.16b\n"
+ ".inst 0x4e8597b4 // sdot v20.4s, v29.16b, v5.16b\n"
+ "mls v24.4s, v25.4s, v16.4s\n"
+ ".inst 0x4e859592 // sdot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e899795 // sdot v21.4s, v28.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e859796 // sdot v22.4s, v28.16b, v5.16b\n"
+ ".inst 0x4e899794 // sdot v20.4s, v28.16b, v9.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e899591 // sdot v17.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889592 // sdot v18.4s, v12.16b, v8.16b\n"
+ "mls v22.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v23.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v26.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "and v19.16b, v22.16b, v26.16b\n"
+ "and v18.16b, v21.16b, v26.16b\n"
+ "and v17.16b, v20.16b, v26.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "srshl v24.4s, v24.4s, v26.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "srshl v21.4s, v21.4s, v26.4s\n"
+ "srshl v20.4s, v20.4s, v26.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x27]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x24, x27]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s21, [x23, x27]\n"
+ "str s20, [x22, x27]\n"
+ "add x27, x27, #0x4\n"
+ "beq 35f\n"
+ "3:" // Oddments
+ "and x20, %x[n_channels], #0xf\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x21, x21, x28\n"
+ "tbz %x[n_channels], #3, 7f\n"
+ "ldr d15, [x15], #0x8\n"
+ "ldr d25, [x14], #0x8\n"
+ "ldr d7, [x13], #0x8\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d26, [x10], #0x8\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d3, [x26], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v15.s }[2], [x15], #0x4\n"
+ "ld1 { v25.s }[2], [x14], #0x4\n"
+ "ld1 { v7.s }[2], [x13], #0x4\n"
+ "ld1 { v8.s }[2], [x12], #0x4\n"
+ "ld1 { v26.s }[2], [x10], #0x4\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v3.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v15.h }[6], [x15], #0x2\n"
+ "ld1 { v25.h }[6], [x14], #0x2\n"
+ "ld1 { v7.h }[6], [x13], #0x2\n"
+ "ld1 { v8.h }[6], [x12], #0x2\n"
+ "ld1 { v26.h }[6], [x10], #0x2\n"
+ "ld1 { v23.h }[6], [x9], #0x2\n"
+ "ld1 { v3.h }[6], [x26], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[14], [x15], #0x1\n"
+ "ld1 { v25.b }[14], [x14], #0x1\n"
+ "ld1 { v7.b }[14], [x13], #0x1\n"
+ "ld1 { v8.b }[14], [x12], #0x1\n"
+ "ld1 { v26.b }[14], [x10], #0x1\n"
+ "ld1 { v23.b }[14], [x9], #0x1\n"
+ "ld1 { v3.b }[14], [x26], #0x1\n"
+ "ld1 { v10.b }[14], [x21], #0x1\n"
+ "b 11f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[12], [x15], #0x1\n"
+ "ld1 { v25.b }[12], [x14], #0x1\n"
+ "ld1 { v7.b }[12], [x13], #0x1\n"
+ "ld1 { v8.b }[12], [x12], #0x1\n"
+ "ld1 { v26.b }[12], [x10], #0x1\n"
+ "ld1 { v23.b }[12], [x9], #0x1\n"
+ "ld1 { v3.b }[12], [x26], #0x1\n"
+ "ld1 { v10.b }[12], [x21], #0x1\n"
+ "b 11f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v15.h }[4], [x15], #0x2\n"
+ "ld1 { v25.h }[4], [x14], #0x2\n"
+ "ld1 { v7.h }[4], [x13], #0x2\n"
+ "ld1 { v8.h }[4], [x12], #0x2\n"
+ "ld1 { v26.h }[4], [x10], #0x2\n"
+ "ld1 { v23.h }[4], [x9], #0x2\n"
+ "ld1 { v3.h }[4], [x26], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[10], [x15], #0x1\n"
+ "ld1 { v25.b }[10], [x14], #0x1\n"
+ "ld1 { v7.b }[10], [x13], #0x1\n"
+ "ld1 { v8.b }[10], [x12], #0x1\n"
+ "ld1 { v26.b }[10], [x10], #0x1\n"
+ "ld1 { v23.b }[10], [x9], #0x1\n"
+ "ld1 { v3.b }[10], [x26], #0x1\n"
+ "ld1 { v10.b }[10], [x21], #0x1\n"
+ "b 11f\n"
+ "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[8], [x15], #0x1\n"
+ "ld1 { v25.b }[8], [x14], #0x1\n"
+ "ld1 { v7.b }[8], [x13], #0x1\n"
+ "ld1 { v8.b }[8], [x12], #0x1\n"
+ "ld1 { v26.b }[8], [x10], #0x1\n"
+ "ld1 { v23.b }[8], [x9], #0x1\n"
+ "ld1 { v3.b }[8], [x26], #0x1\n"
+ "ld1 { v10.b }[8], [x21], #0x1\n"
+ "b 11f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 9f\n"
+ "ldr s15, [x15], #0x4\n"
+ "ldr s25, [x14], #0x4\n"
+ "ldr s7, [x13], #0x4\n"
+ "ldr s8, [x12], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr s23, [x9], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v15.h }[2], [x15], #0x2\n"
+ "ld1 { v25.h }[2], [x14], #0x2\n"
+ "ld1 { v7.h }[2], [x13], #0x2\n"
+ "ld1 { v8.h }[2], [x12], #0x2\n"
+ "ld1 { v26.h }[2], [x10], #0x2\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "ld1 { v3.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[6], [x15], #0x1\n"
+ "ld1 { v25.b }[6], [x14], #0x1\n"
+ "ld1 { v7.b }[6], [x13], #0x1\n"
+ "ld1 { v8.b }[6], [x12], #0x1\n"
+ "ld1 { v26.b }[6], [x10], #0x1\n"
+ "ld1 { v23.b }[6], [x9], #0x1\n"
+ "ld1 { v3.b }[6], [x26], #0x1\n"
+ "ld1 { v10.b }[6], [x21], #0x1\n"
+ "b 11f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[4], [x15], #0x1\n"
+ "ld1 { v25.b }[4], [x14], #0x1\n"
+ "ld1 { v7.b }[4], [x13], #0x1\n"
+ "ld1 { v8.b }[4], [x12], #0x1\n"
+ "ld1 { v26.b }[4], [x10], #0x1\n"
+ "ld1 { v23.b }[4], [x9], #0x1\n"
+ "ld1 { v3.b }[4], [x26], #0x1\n"
+ "ld1 { v10.b }[4], [x21], #0x1\n"
+ "b 11f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h15, [x15], #0x2\n"
+ "ldr h25, [x14], #0x2\n"
+ "ldr h7, [x13], #0x2\n"
+ "ldr h8, [x12], #0x2\n"
+ "ldr h26, [x10], #0x2\n"
+ "ldr h23, [x9], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "ldr h10, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[2], [x15], #0x1\n"
+ "ld1 { v25.b }[2], [x14], #0x1\n"
+ "ld1 { v7.b }[2], [x13], #0x1\n"
+ "ld1 { v8.b }[2], [x12], #0x1\n"
+ "ld1 { v26.b }[2], [x10], #0x1\n"
+ "ld1 { v23.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x26], #0x1\n"
+ "ld1 { v10.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b15, [x15], #0x1\n"
+ "ldr b25, [x14], #0x1\n"
+ "ldr b7, [x13], #0x1\n"
+ "ldr b8, [x12], #0x1\n"
+ "ldr b26, [x10], #0x1\n"
+ "ldr b23, [x9], #0x1\n"
+ "ldr b3, [x26], #0x1\n"
+ "ldr b10, [x21], #0x1\n"
+ "11:" // Oddments: Load (A): Bit 3: End
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x21, x21, x28\n"
+ "tbz %x[n_channels], #3, 15f\n"
+ "ldr d22, [x15], #0x8\n"
+ "ldr d19, [x14], #0x8\n"
+ "ldr d0, [x13], #0x8\n"
+ "ldr d5, [x12], #0x8\n"
+ "ldr d27, [x10], #0x8\n"
+ "ldr d24, [x9], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d9, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v22.s }[2], [x15], #0x4\n"
+ "ld1 { v19.s }[2], [x14], #0x4\n"
+ "ld1 { v0.s }[2], [x13], #0x4\n"
+ "ld1 { v5.s }[2], [x12], #0x4\n"
+ "ld1 { v27.s }[2], [x10], #0x4\n"
+ "ld1 { v24.s }[2], [x9], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v22.h }[6], [x15], #0x2\n"
+ "ld1 { v19.h }[6], [x14], #0x2\n"
+ "ld1 { v0.h }[6], [x13], #0x2\n"
+ "ld1 { v5.h }[6], [x12], #0x2\n"
+ "ld1 { v27.h }[6], [x10], #0x2\n"
+ "ld1 { v24.h }[6], [x9], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[14], [x15], #0x1\n"
+ "ld1 { v19.b }[14], [x14], #0x1\n"
+ "ld1 { v0.b }[14], [x13], #0x1\n"
+ "ld1 { v5.b }[14], [x12], #0x1\n"
+ "ld1 { v27.b }[14], [x10], #0x1\n"
+ "ld1 { v24.b }[14], [x9], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v9.b }[14], [x21], #0x1\n"
+ "b 19f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[12], [x15], #0x1\n"
+ "ld1 { v19.b }[12], [x14], #0x1\n"
+ "ld1 { v0.b }[12], [x13], #0x1\n"
+ "ld1 { v5.b }[12], [x12], #0x1\n"
+ "ld1 { v27.b }[12], [x10], #0x1\n"
+ "ld1 { v24.b }[12], [x9], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v9.b }[12], [x21], #0x1\n"
+ "b 19f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v22.h }[4], [x15], #0x2\n"
+ "ld1 { v19.h }[4], [x14], #0x2\n"
+ "ld1 { v0.h }[4], [x13], #0x2\n"
+ "ld1 { v5.h }[4], [x12], #0x2\n"
+ "ld1 { v27.h }[4], [x10], #0x2\n"
+ "ld1 { v24.h }[4], [x9], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[10], [x15], #0x1\n"
+ "ld1 { v19.b }[10], [x14], #0x1\n"
+ "ld1 { v0.b }[10], [x13], #0x1\n"
+ "ld1 { v5.b }[10], [x12], #0x1\n"
+ "ld1 { v27.b }[10], [x10], #0x1\n"
+ "ld1 { v24.b }[10], [x9], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v9.b }[10], [x21], #0x1\n"
+ "b 19f\n"
+ "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[8], [x15], #0x1\n"
+ "ld1 { v19.b }[8], [x14], #0x1\n"
+ "ld1 { v0.b }[8], [x13], #0x1\n"
+ "ld1 { v5.b }[8], [x12], #0x1\n"
+ "ld1 { v27.b }[8], [x10], #0x1\n"
+ "ld1 { v24.b }[8], [x9], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v9.b }[8], [x21], #0x1\n"
+ "b 19f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr s22, [x15], #0x4\n"
+ "ldr s19, [x14], #0x4\n"
+ "ldr s0, [x13], #0x4\n"
+ "ldr s5, [x12], #0x4\n"
+ "ldr s27, [x10], #0x4\n"
+ "ldr s24, [x9], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s9, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v22.h }[2], [x15], #0x2\n"
+ "ld1 { v19.h }[2], [x14], #0x2\n"
+ "ld1 { v0.h }[2], [x13], #0x2\n"
+ "ld1 { v5.h }[2], [x12], #0x2\n"
+ "ld1 { v27.h }[2], [x10], #0x2\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[6], [x15], #0x1\n"
+ "ld1 { v19.b }[6], [x14], #0x1\n"
+ "ld1 { v0.b }[6], [x13], #0x1\n"
+ "ld1 { v5.b }[6], [x12], #0x1\n"
+ "ld1 { v27.b }[6], [x10], #0x1\n"
+ "ld1 { v24.b }[6], [x9], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v9.b }[6], [x21], #0x1\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[4], [x15], #0x1\n"
+ "ld1 { v19.b }[4], [x14], #0x1\n"
+ "ld1 { v0.b }[4], [x13], #0x1\n"
+ "ld1 { v5.b }[4], [x12], #0x1\n"
+ "ld1 { v27.b }[4], [x10], #0x1\n"
+ "ld1 { v24.b }[4], [x9], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v9.b }[4], [x21], #0x1\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr h22, [x15], #0x2\n"
+ "ldr h19, [x14], #0x2\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h5, [x12], #0x2\n"
+ "ldr h27, [x10], #0x2\n"
+ "ldr h24, [x9], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h9, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[2], [x15], #0x1\n"
+ "ld1 { v19.b }[2], [x14], #0x1\n"
+ "ld1 { v0.b }[2], [x13], #0x1\n"
+ "ld1 { v5.b }[2], [x12], #0x1\n"
+ "ld1 { v27.b }[2], [x10], #0x1\n"
+ "ld1 { v24.b }[2], [x9], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v9.b }[2], [x21], #0x1\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b22, [x15], #0x1\n"
+ "ldr b19, [x14], #0x1\n"
+ "ldr b0, [x13], #0x1\n"
+ "ldr b5, [x12], #0x1\n"
+ "ldr b27, [x10], #0x1\n"
+ "ldr b24, [x9], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b9, [x21], #0x1\n"
+ "19:" // Oddments: Load (B): Bit 3: End
+ "ldr q20, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip2 v1.16b, v26.16b, v3.16b\n"
+ "zip1 v26.16b, v26.16b, v3.16b\n"
+ "ldr q4, [%x[params], #0x30]\n"
+ "zip1 v18.16b, v23.16b, v10.16b\n"
+ "zip2 v30.16b, v15.16b, v7.16b\n"
+ "cmp x20, #0x4\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "zip1 v29.16b, v25.16b, v8.16b\n"
+ "zip2 v8.16b, v25.16b, v8.16b\n"
+ "zip2 v10.16b, v23.16b, v10.16b\n"
+ "zip2 v23.16b, v26.16b, v18.16b\n"
+ "zip1 v26.16b, v26.16b, v18.16b\n"
+ "zip2 v28.16b, v22.16b, v0.16b\n"
+ "zip1 v22.16b, v22.16b, v0.16b\n"
+ "zip1 v21.16b, v19.16b, v5.16b\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e9a9591 // sdot v17.4s, v12.16b, v26.16b\n"
+ "zip2 v25.16b, v15.16b, v29.16b\n"
+ "zip1 v15.16b, v15.16b, v29.16b\n"
+ "zip1 v7.16b, v30.16b, v8.16b\n"
+ "zip2 v8.16b, v30.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v5.16b, v19.16b, v5.16b\n"
+ "zip2 v30.16b, v27.16b, v2.16b\n"
+ "zip1 v27.16b, v27.16b, v2.16b\n"
+ "zip1 v18.16b, v24.16b, v9.16b\n"
+ "zip2 v9.16b, v24.16b, v9.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v3.16b, v1.16b, v10.16b\n"
+ ".inst 0x4e969591 // sdot v17.4s, v12.16b, v22.16b\n"
+ "zip2 v10.16b, v1.16b, v10.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v18.16b\n"
+ "zip1 v27.16b, v27.16b, v18.16b\n"
+ "zip1 v2.16b, v30.16b, v9.16b\n"
+ "mov v18.16b, v17.16b\n .inst 0x4e9b9592 // sdot v18.4s, v12.16b, v27.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ ".inst 0x4e8f9591 // sdot v17.4s, v12.16b, v15.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e8f969f // sdot v31.4s, v20.16b, v15.16b\n"
+ ".inst 0x4e9a969d // sdot v29.4s, v20.16b, v26.16b\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "movi v1.4s, #0x0\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9581 // sdot v1.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96949f // sdot v31.4s, v4.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f969e // sdot v30.4s, v20.16b, v15.16b\n"
+ ".inst 0x4e9a969c // sdot v28.4s, v20.16b, v26.16b\n"
+ "mls v31.4s, v17.4s, v16.4s\n"
+ ".inst 0x4e969581 // sdot v1.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b949d // sdot v29.4s, v4.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mov v20.16b, v1.16b\n .inst 0x4e9b9594 // sdot v20.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9581 // sdot v1.4s, v12.16b, v15.16b\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ ".inst 0x4e96949e // sdot v30.4s, v4.16b, v22.16b\n"
+ ".inst 0x4e9b949c // sdot v28.4s, v4.16b, v27.16b\n"
+ "mls v30.4s, v1.4s, v16.4s\n"
+ "add %x[params], %x[params], #0x60\n"
+ "mls v28.4s, v20.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v18.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v18.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v26.16b, v28.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v26.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 20f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 23f\n"
+ "20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 21f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 22f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 22f\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+ "23:" // Oddments: Unroll 0: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q27, [%x[params], #0x10]\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x4e979581 // sdot v1.4s, v12.16b, v23.16b\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q4, [%x[params], #0x40]\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e99977f // sdot v31.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e939581 // sdot v1.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e97977d // sdot v29.4s, v27.16b, v23.16b\n"
+ "movi v20.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e97975f // sdot v31.4s, v26.16b, v23.16b\n"
+ "mov v18.16b, v1.16b\n .inst 0x4e989592 // sdot v18.4s, v12.16b, v24.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e999581 // sdot v1.4s, v12.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x4e99977e // sdot v30.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e97977c // sdot v28.4s, v27.16b, v23.16b\n"
+ ".inst 0x4e979594 // sdot v20.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e93975d // sdot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x4e9396df // sdot v31.4s, v22.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97975e // sdot v30.4s, v26.16b, v23.16b\n"
+ ".inst 0x4e93975c // sdot v28.4s, v26.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x4e939594 // sdot v20.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e9896dd // sdot v29.4s, v22.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e9396de // sdot v30.4s, v22.16b, v19.16b\n"
+ ".inst 0x4e9896dc // sdot v28.4s, v22.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "mov v17.16b, v20.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e999594 // sdot v20.4s, v12.16b, v25.16b\n"
+ "mls v30.4s, v20.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v28.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 24f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 27f\n"
+ "24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 25f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 26f\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+ "27:" // Oddments: Unroll 1: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x4e839598 // sdot v24.4s, v12.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e87973f // sdot v31.4s, v25.16b, v7.16b\n"
+ ".inst 0x4e809598 // sdot v24.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e83973d // sdot v29.4s, v25.16b, v3.16b\n"
+ "movi v19.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e8396ff // sdot v31.4s, v23.16b, v3.16b\n"
+ "mov v18.16b, v24.16b\n .inst 0x4e829592 // sdot v18.4s, v12.16b, v2.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e879598 // sdot v24.4s, v12.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e87973e // sdot v30.4s, v25.16b, v7.16b\n"
+ ".inst 0x4e83973c // sdot v28.4s, v25.16b, v3.16b\n"
+ ".inst 0x4e839593 // sdot v19.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e8096fd // sdot v29.4s, v23.16b, v0.16b\n"
+ ".inst 0x4e8096df // sdot v31.4s, v22.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e8396fe // sdot v30.4s, v23.16b, v3.16b\n"
+ ".inst 0x4e8096fc // sdot v28.4s, v23.16b, v0.16b\n"
+ "mls v31.4s, v24.4s, v16.4s\n"
+ ".inst 0x4e809593 // sdot v19.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e8296dd // sdot v29.4s, v22.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e8096de // sdot v30.4s, v22.16b, v0.16b\n"
+ ".inst 0x4e8296dc // sdot v28.4s, v22.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "mov v17.16b, v19.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879593 // sdot v19.4s, v12.16b, v7.16b\n"
+ "mls v30.4s, v19.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v28.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 28f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 31f\n"
+ "28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 30f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 30f\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+ "31:" // Oddments: Unroll 2: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e8a9596 // sdot v22.4s, v12.16b, v10.16b\n"
+ "ldr q21, [%x[params], #0x20]\n"
+ "ldr q19, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e8896ff // sdot v31.4s, v23.16b, v8.16b\n"
+ ".inst 0x4e859596 // sdot v22.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e8a96fd // sdot v29.4s, v23.16b, v10.16b\n"
+ "movi v18.4s, #0x0\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e8a96bf // sdot v31.4s, v21.16b, v10.16b\n"
+ "mov v17.16b, v22.16b\n .inst 0x4e899591 // sdot v17.4s, v12.16b, v9.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x4e889596 // sdot v22.4s, v12.16b, v8.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e8896fe // sdot v30.4s, v23.16b, v8.16b\n"
+ ".inst 0x4e8a96fc // sdot v28.4s, v23.16b, v10.16b\n"
+ ".inst 0x4e8a9592 // sdot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8596bd // sdot v29.4s, v21.16b, v5.16b\n"
+ ".inst 0x4e85967f // sdot v31.4s, v19.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a96be // sdot v30.4s, v21.16b, v10.16b\n"
+ ".inst 0x4e8596bc // sdot v28.4s, v21.16b, v5.16b\n"
+ "mls v31.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e859592 // sdot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e89967d // sdot v29.4s, v19.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e85967e // sdot v30.4s, v19.16b, v5.16b\n"
+ ".inst 0x4e89967c // sdot v28.4s, v19.16b, v9.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v20.4s\n"
+ "mov v7.16b, v18.16b\n .inst 0x4e899587 // sdot v7.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889592 // sdot v18.4s, v12.16b, v8.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v28.4s, v7.4s, v16.4s\n"
+ "and v16.16b, v31.16b, v26.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v20.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v17.16b, v29.16b, v26.16b\n"
+ "and v16.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 34f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 34f\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+ "35:" // End
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..874b18c145
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const int8_t *const *const,
+ const int8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ int8_t *const *const
+);
+
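+// Strategy wrapper for the assembly kernel above: a 3x3 depthwise convolution
+// at stride 1 producing a 2x2 output tile per invocation.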
+class a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
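+  // The Parent arguments appear to mirror the constants above:
+  // (output rows, output cols, kernel rows, kernel cols, stride rows, stride cols).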
+ a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..4626007afa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1166 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
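+  // Parameter block read directly by the assembly; field offsets are passed
+  // into the asm via the offsetof() operands at the bottom of this function.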
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[16];
+
+ Params(
+      uint64_t n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
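+      // Reorder the raw input pointers into the order the assembly walks them.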
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+    }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
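+  // Structure of the assembly: channels are processed eight at a time (the
+  // 'lsr x8, x7, #0x3' sets the loop count), a tail block finishes the last
+  // full group, and the oddment paths below label 3 handle the remaining
+  // 1-7 channels by testing bits 2..0 of the channel count.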
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v14.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "ssubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "ssubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "ssubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 64f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "ssubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "ssubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "ssubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ssubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ssubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ssubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
+ "st1 { v9.s }[0], [x11], #0x4\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "st1 { v9.h }[2], [x11], #0x2\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[6], [x11], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[4], [x11], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "st1 { v9.h }[0], [x11], #0x2\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[2], [x11], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[0], [x11], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+ "64:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
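
[Editor's note] The requantize-and-store tail that closes the kernel above (the sqrdmulh / and / sshr / sqadd / srshl sequence, followed by sqxtn narrowing, sqadd with the c_offset, smax/smin clamping and uzp1) is the standard fixed-point requantization used throughout these kernels: a saturating doubling multiply-high by a 32-bit multiplier, a rounding arithmetic right shift with a sign fix-up, re-biasing by the output zero point, and clamping to the quantized range. Below is a minimal scalar model of one lane, assuming the multiplier and (negated) shift are taken from requant_muls/requant_shifts as in the assembly; the helper name requantize is illustrative, not part of the library.

#include <algorithm>
#include <cstdint>

// Scalar model of the vectorized requantize tail above.
inline int8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                         int32_t c_offset, int32_t minval, int32_t maxval)
{
    // sqrdmulh: doubling multiply returning the rounded high 32 bits
    // (the INT32_MIN * INT32_MIN saturation case is omitted for brevity).
    int64_t prod = (int64_t)acc * (int64_t)mul;
    int32_t high = (int32_t)((prod + (1LL << 30)) >> 31);

    // and/sshr/sqadd + srshl: rounding right shift. The shift values are
    // stored negated for srshl; the fix-up subtracts 1 from negative
    // accumulators so half-way ties round away from zero.
    int32_t rsh = -shift;
    if (rsh > 0)
    {
        if (high < 0) high -= 1;                  // the and/sshr/sqadd correction
        high = (high + (1 << (rsh - 1))) >> rsh;  // srshl by a negative amount
    }

    // sqadd with c_offset, then smax/smin clamping before the byte store
    // (the intermediate int16 saturation of sqxtn is omitted here).
    int32_t out = high + c_offset;
    return (int8_t)std::min(std::max(out, minval), maxval);
}

The same idiom appears twice per kernel (main loop/tail and the "Oddments" epilogue); only the registers holding the multipliers and shifts differ.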
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..893260362a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const int8_t *const *const,
+ const int8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ int8_t *const *const
+);
+
+
+class a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
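
[Editor's note] The strategy constants above pin down how much input each output tile touches: a 2x2 output tile at stride 2 under a 3x3 kernel reads a 5x5 input window, which is why the Params struct in the generic.cpp that follows carries 25 input row pointers. A small sketch of that tile arithmetic, assuming the usual depthfirst window formula (the helper name is illustrative):

// Input extent covered by one output tile: (out - 1) * stride + kernel.
constexpr unsigned input_extent(unsigned out, unsigned stride, unsigned kernel)
{
    return (out - 1) * stride + kernel;
}

// 2x2 output, stride 2, 3x3 kernel -> a 5x5 input window, i.e. the 25
// input pointers held by the matching generic.cpp. The 5x5/s1 variant
// later in this diff gives (2-1)*1 + 5 = 6, hence its 36 pointers.
static_assert(input_extent(2, 2, 3) == 5, "3x3/s2 tile reads a 5x5 window");
static_assert(input_extent(2, 2, 3) * input_extent(2, 2, 3) == 25, "25 inptrs");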
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d98ab71cb8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "ssubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x12, x12, #0x20\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d25, [x27, x17]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "ssubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "tst x7, #0x7\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 88f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 66f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x7, #1, 70f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x7, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x7, #1, 78f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
+ "tbz x7, #2, 81f\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 82f\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 86f\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+ "88:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
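
[Editor's note] The "Oddments" blocks above handle the n_channels & 7 trailing channels without over-reading: bits 2, 1 and 0 of the remainder are tested with tbz, and each set bit issues one 4-, 2- or 1-byte lane access (ld1 {.s}/{.h}/{.b} on load, st1 on store). A hedged scalar sketch of that bit-decomposed tail, with an illustrative helper name:

#include <cstddef>
#include <cstdint>

// Copy the n & 7 trailing bytes using at most one 4-byte, one 2-byte and
// one 1-byte access, mirroring the tbz #2 / #1 / #0 branch structure.
inline void copy_oddments(std::int8_t *dst, const std::int8_t *src, std::size_t n)
{
    std::size_t rem = n & 7;
    if (rem & 4) { for (int i = 0; i < 4; i++) *dst++ = *src++; }  // ld1 { .s }[0]
    if (rem & 2) { for (int i = 0; i < 2; i++) *dst++ = *src++; }  // ld1 { .h }[...]
    if (rem & 1) { *dst = *src; }                                  // ld1 { .b }[...]
}

In the kernel the same decomposition also selects which vector lane receives each fragment, which is why the lane indices in the ld1/st1 instructions differ between the Bit 1/Bit 0 branches.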
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..ccab35ce57
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const int8_t *const *const,
+ const int8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ int8_t *const *const
+);
+
+
+
+class a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
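
[Editor's note] Each generic.cpp in this diff shares the same control-flow skeleton: lsr splits n_channels into groups of 8, label 1 is the full-width loop, label 2 ("Tail") is the final full iteration, and the "Oddments" path covers n_channels % 8. A minimal sketch of that structure, assuming process8/processN stand in for the vectorized bodies (placeholders, not library API):

// Control-flow skeleton of the kernels: full loop, tail, then oddments.
template <typename F8, typename FN>
void depthfirst_channel_loop(unsigned n_channels, F8 process8, FN processN)
{
    unsigned full = n_channels >> 3;          // lsr x2, x1, #0x3
    for (unsigned i = 0; i + 1 < full; i++)   // "1:" Loop (bgt 1b)
        process8();
    if (full)                                 // "2:" Tail
        process8();
    if (n_channels & 7)                       // "3:" Oddments (tst / beq)
        processN(n_channels & 7);
}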
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..b1648bae14
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2187 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
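+ // Marshal the kernel arguments into one struct so that the inline assembly
+ // below can reach every field through its %[offsetof_Params_...] operands.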
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[36];
+
+ Params(
+ uint64_t n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
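+ // Reorder the 36 input pointers (the 6x6 input patch required by a 5x5
+ // kernel producing a 2x2 output tile at stride 1) into the order in which
+ // the assembly consumes them.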
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
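+ // The kernel processes channels in blocks of eight (x2 = n_channels >> 3);
+ // the "Oddments" section handles any remaining 1-7 channels. Inputs and
+ // weights are offset-subtracted and widened (ssubl), accumulated into int32
+ // (smlal/smlal2), then requantized (sqrdmulh/srshl), offset, clamped and
+ // narrowed back to int8.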
+ __asm__ __volatile__(
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x2, x1, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
+ "cbz x2, 3f\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "subs x2, x2, #0x1\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
+ "beq 124f\n"
+ "add x6, x6, #0xc8\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x1, #2, 5f\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
+ "tbz x1, #1, 4f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x1, #1, 6f\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 9f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 8f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x1, #1, 10f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "tbz x1, #2, 13f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 12f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x1, #1, 14f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
+ "tbz x1, #2, 17f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 16f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x1, #1, 18f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 21f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 20f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x1, #1, 22f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d14, [x6, #0x28]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
+ "tbz x1, #2, 25f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 24f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x1, #1, 26f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d21, [x6, #0x30]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 29f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 28f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x1, #1, 30f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d9, [x6, #0x38]\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
+ "tbz x1, #2, 33f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 32f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x1, #1, 34f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d31, [x6, #0x40]\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
+ "tbz x1, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x1, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d16, [x6, #0x48]\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "tbz x1, #2, 41f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 40f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x1, #1, 42f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d21, [x6, #0x50]\n"
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 45f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 44f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x1, #1, 46f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 49f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 48f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x1, #1, 50f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d2, [x6, #0x58]\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "tbz x1, #2, 53f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 52f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x1, #1, 54f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d25, [x6, #0x60]\n"
+ "ssubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
+ "tbz x1, #2, 57f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 56f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x1, #1, 58f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d1, [x6, #0x68]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
+ "tbz x1, #2, 61f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 60f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x1, #1, 62f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[0], [x20]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d16, [x6, #0x70]\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x1, #2, 65f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 64f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x1, #1, 66f\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d17, [x6, #0x78]\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
+ "tbz x1, #2, 69f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 68f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x1, #1, 70f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "ssubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x1, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d29, [x6, #0x80]\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
+ "tbz x1, #2, 77f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 76f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x1, #1, 78f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d12, [x6, #0x88]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
+ "tbz x1, #2, 81f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 80f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x1, #1, 82f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d21, [x6, #0x90]\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
+ "tbz x1, #2, 85f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 84f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x1, #1, 86f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d8, [x6, #0x98]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "tbz x1, #2, 89f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 88f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x1, #1, 90f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d9, [x6, #0xa0]\n"
+ "ssubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
+ "tbz x1, #2, 93f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 92f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x1, #1, 94f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 97f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 96f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x1, #1, 98f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d12, [x6, #0xa8]\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
+ "tbz x1, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x1, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d28, [x6, #0xb0]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
+ "tbz x1, #2, 105f\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 104f\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x1, #1, 106f\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d30, [x6, #0xb8]\n"
+ "ssubl v2.8h, v2.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
+ "tbz x1, #2, 109f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 108f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x1, #1, 110f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d8, [x6, #0xc0]\n"
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "tbz x1, #2, 113f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 112f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x1, #1, 114f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
+ "tbz x1, #2, 117f\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
+ "tbz x1, #1, 116f\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x1, #1, 118f\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x1, #2, 121f\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
+ "tbz x1, #1, 120f\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x1, #1, 122f\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+ "124:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
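
Annotation (not part of the patch): the epilogue above (SQRDMULH, the AND/SSHR/SQADD fixup, SRSHL, SQXTN, the offset add and the SMAX/SMIN clamp) is the usual fixed-point requantisation sequence. A minimal scalar model follows; the helper names are invented, and note the vector code adds the output offset and clamps at 16-bit width after narrowing, which is equivalent for in-range values.

#include <algorithm>
#include <cstdint>

// SQRDMULH: saturating rounding doubling multiply returning the high half.
static int32_t sqrdmulh(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN)
        return INT32_MAX;  // the only case that saturates
    const int64_t p = (int64_t)a * (int64_t)b;
    return (int32_t)((2 * p + (1LL << 31)) >> 32);
}

// The AND/SSHR #31/SQADD fixup followed by SRSHL: a rounding right shift in
// which ties round away from zero for negative inputs (shift > 0 assumed).
// The AND works because the shift vector holds negative values, so its sign
// bit selects the accumulator's sign bit.
static int32_t rounding_shift_right(int32_t x, int shift)
{
    const int32_t fixup = (x < 0) ? -1 : 0;
    return (x + fixup + (1 << (shift - 1))) >> shift;
}

// One output value: multiply, shift, add the output offset, clamp, narrow.
static int8_t requantize(int32_t acc, int32_t mul, int shift,
                         int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = rounding_shift_right(sqrdmulh(acc, mul), shift) + c_offset;
    v = std::max(minval, std::min(maxval, v));
    return (int8_t)v;
}
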
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..9c92a9dd46
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
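+// Arguments: input row pointers, output row pointers, packed parameters,
+// requantisation info, number of kernel points, number of channels
+// (matching the named parameters of the definition in generic.cpp).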
+void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+class a64_s8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ KernelType kernel = a64_s8q_nhwc_generic_output9_mla_depthfirst_impl;
+
+ public:
+ a64_s8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<int8_t, int8_t, int8_t, int32_t>(9, arm_gemm::VLType::None) {}
+
+ KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
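
Annotation (not part of the patch): a strategy class such as the one above is consumed by fetching the kernel pointer and invoking it directly. A hypothetical sketch of a caller; everything except the class and its declared members is invented, and the pointer arrays and packed parameters are assumed to be prepared elsewhere.

#include "a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp"

// Run the 9-output-point generic depthwise kernel once.
static void run_once(const CPUInfo *ci,
                     const int8_t *const *inptrs, int8_t *const *outptrs,
                     const void *packed_params, const arm_gemm::Requantize32 &qp,
                     unsigned int n_points, unsigned int n_channels)
{
    arm_conv::depthwise::a64_s8q_nhwc_generic_output9_mla_depthfirst strat(ci);
    auto kernel = strat.get_kernel();
    kernel(inptrs, outptrs, packed_params, qp, n_points, n_channels);
}
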
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..77b7d231e0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ const arm_gemm::Requantize32& qp,
+ const unsigned int n_points,
+ const unsigned int n_channels
+)
+{
+ __asm__ __volatile__(
+ "lsr x9, %x[n_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v5.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "mov x11, #0x0\n"
+ "cbz x9, 6f\n"
+ "1:" // Channel loop
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q23, [%x[bias], x20]\n"
+ "2:" // Channel loop: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x25, %x[inptrs]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "subs x24, %x[n_points], #0x1\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s16, [x21, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s20, [x21, x11]\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x23, x22, [x25], #0x10\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldr s14, [x23, x11]\n"
+ "ldr s15, [x22, x11]\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s18, [x21, x11]\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "subs x24, x24, #0x1\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ldr s20, [x21, x11]\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 5f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q2, [%x[rq_mul_ptr], x20]\n"
+ "ldr q1, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 5f\n"
+ "ldr q3, [%x[rq_left_shift_ptr], x20]\n"
+ "5:" // Channel loop: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s23, [x28, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s24, [x27, x11]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s25, [x26, x11]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x11]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x11]\n"
+ "str s28, [x23, x11]\n"
+ "str s29, [x22, x11]\n"
+ "str s30, [x21, x11]\n"
+ "str s31, [x20, x11]\n"
+ "add x11, x11, #0x4\n"
+ "cmp x11, x9, LSL #2\n"
+ "blt 1b\n"
+ "6:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 24f\n"
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 9f\n"
+ "add x20, %x[bias], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
+ "b 8f\n"
+ "7:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "8:" // Oddments: Load bias: Bit 1: End
+ "9:" // Oddments: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "11:" // Oddments: Load: Bit 1: End
+ "subs x20, %x[n_points], #0x1\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "ble 15f\n"
+ "12:" // Oddments: Planar loop
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldr x21, [x10], #0x8\n"
+ "add x9, x9, x11\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "add x28, x28, x11\n"
+ "add x27, x27, x11\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 14f\n"
+ "13:" // Oddments: Planar loop: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "14:" // Oddments: Planar loop: Load: Bit 1: End
+ "subs x20, x20, #0x1\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 12b\n"
+ "15:" // Oddments: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 21f\n"
+ "add x22, %x[rq_mul_ptr], x11, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x11, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v2.d }[0], [x22], #0x8\n"
+ "ld1 { v1.d }[0], [x21], #0x8\n"
+ "cbz %x[rq_left_shift_ptr], 16f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v2.s }[2], [x22], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 17f\n"
+ "ld1 { v3.s }[2], [x20], #0x4\n"
+ "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+ "b 20f\n"
+ "18:" // Oddments: Load quantisation parameters: Bit 1: Unset
+ "ld1 { v2.s }[0], [x22], #0x4\n"
+ "ld1 { v1.s }[0], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 19f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+ "20:" // Oddments: Load quantisation parameters: Bit 1: End
+ "21:" // Oddments: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "b 23f\n"
+ "22:" // Oddments: Store: Bit 1: Unset
+ "st1 { v23.b }[0], [x28], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "23:" // Oddments: Store: Bit 1: End
+ "24:" // End
+ : [params] "+&r" (params)
+ : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
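
Annotation (not part of the patch): the cbz tests on rq_mul_ptr and rq_left_shift_ptr in the kernel above implement a fallback from per-channel to per-layer requantisation parameters. A scalar sketch of that selection; the struct and function are invented, while the Requantize32 fields are the ones named in the operand list.

#include <cstdint>
#include "arm_gemm.hpp"

struct ChannelQuant
{
    int32_t mul;
    int32_t left_shift;
    int32_t right_shift;
};

// Per-layer values (v2/v1/v3 in the prologue) are the defaults; per-channel
// arrays, when supplied, override them for channel c. The left shift is only
// overridden when a per-channel left-shift array is also present.
static ChannelQuant select_quant(const arm_gemm::Requantize32 &qp, unsigned int c)
{
    ChannelQuant q = {qp.per_layer_mul, qp.per_layer_left_shift, qp.per_layer_right_shift};
    if (qp.per_channel_muls != nullptr)
    {
        q.mul         = qp.per_channel_muls[c];
        q.right_shift = qp.per_channel_right_shifts[c];
        if (qp.per_channel_left_shifts != nullptr)
        {
            q.left_shift = qp.per_channel_left_shifts[c];
        }
    }
    return q;
}
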
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..14adf8880f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
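+// Arguments: input pointers, output pointers, packed parameters, number of
+// output channels, requantisation info (matching the definition in generic.cpp).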
+void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
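
Annotation (not part of the patch): the tile geometry fixed by the class above means a 2x4 output tile from a 3x3 kernel at stride 2 consumes a 5x9 input patch, by the usual relation input = (output - 1) * stride + kernel. In C++ terms:

// Input patch consumed by one output tile of this strategy.
constexpr unsigned int output_rows = 2, output_cols = 4;
constexpr unsigned int kernel_rows = 3, kernel_cols = 3;
constexpr unsigned int stride_rows = 2, stride_cols = 2;
constexpr unsigned int input_rows = (output_rows - 1) * stride_rows + kernel_rows; // 5
constexpr unsigned int input_cols = (output_cols - 1) * stride_cols + kernel_cols; // 9
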
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..be8fbfa0e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "ldr q11, [%x[params], #0x0]\n"
+ "ldr q5, [%x[params], #0x10]\n"
+ "movi v8.16b, #0x1\n"
+ "ushr v8.4s, v8.4s, #0x8\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "ldr q7, [%x[params], #0x30]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "mov v28.16b, v1.16b\n"
+ "mov v23.16b, v1.16b\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "mov v30.16b, v1.16b\n"
+ "mov v21.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "mov v20.16b, v2.16b\n"
+ "mov v29.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "mov v9.16b, v4.16b\n"
+ "mov v22.16b, v4.16b\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "mov v31.16b, v4.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x4\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x2\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x6\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x4\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "mov v27.16b, v0.16b\n"
+ "mov v19.16b, v0.16b\n"
+ "cmp %x[n_channels], #0x4\n"
+ "mov x9, #0x0\n"
+ "mov v18.16b, v0.16b\n"
+ "mov v26.16b, v3.16b\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "mov v17.16b, v3.16b\n"
+ "mov v16.16b, v3.16b\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x2\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x4\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "add %x[params], %x[params], #0x40\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x6\n"
+ "zip1 v1.4s, v1.4s, v23.4s\n"
+ "zip1 v28.4s, v28.4s, v30.4s\n"
+ "zip1 v2.4s, v2.4s, v20.4s\n"
+ "zip1 v21.4s, v21.4s, v29.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x2\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v4.4s, v4.4s, v22.4s\n"
+ "zip1 v9.4s, v9.4s, v31.4s\n"
+ "zip1 v0.4s, v0.4s, v19.4s\n"
+ "zip1 v27.4s, v27.4s, v18.4s\n"
+ "zip1 v1.4s, v1.4s, v28.4s\n"
+ "zip1 v2.4s, v2.4s, v21.4s\n"
+ ".inst 0x4f81e118 // sdot v24.4s, v8.16b, v1.4b[0]\n"
+ "zip1 v3.4s, v3.4s, v17.4s\n"
+ "zip1 v26.4s, v26.4s, v16.4s\n"
+ ".inst 0x4fa1e119 // sdot v25.4s, v8.16b, v1.4b[1]\n"
+ "zip1 v4.4s, v4.4s, v9.4s\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4fa1e916 // sdot v22.4s, v8.16b, v1.4b[3]\n"
+ "movi v19.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x4f82e115 // sdot v21.4s, v8.16b, v2.4b[0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x4fa2e113 // sdot v19.4s, v8.16b, v2.4b[1]\n"
+ "movi v18.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4f82e909 // sdot v9.4s, v8.16b, v2.4b[2]\n"
+ "movi v16.4s, #0x0\n"
+ "zip1 v0.4s, v0.4s, v27.4s\n"
+ ".inst 0x4fa2e90a // sdot v10.4s, v8.16b, v2.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v26.4s\n"
+ ".inst 0x4f84e114 // sdot v20.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e112 // sdot v18.4s, v8.16b, v4.4b[1]\n"
+ ".inst 0x4f84e911 // sdot v17.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e910 // sdot v16.4s, v8.16b, v4.4b[3]\n"
+ "movi v31.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ ".inst 0x4f80e11f // sdot v31.4s, v8.16b, v0.4b[0]\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x4fa0e11e // sdot v30.4s, v8.16b, v0.4b[1]\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x4f80e91a // sdot v26.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e91b // sdot v27.4s, v8.16b, v0.4b[3]\n"
+ ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e11d // sdot v29.4s, v8.16b, v3.4b[1]\n"
+ "add v24.4s, v24.4s, v21.4s\n"
+ "add v25.4s, v25.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "add v21.4s, v20.4s, v21.4s\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x4f83e914 // sdot v20.4s, v8.16b, v3.4b[2]\n"
+ "add v19.4s, v18.4s, v19.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4fa3e912 // sdot v18.4s, v8.16b, v3.4b[3]\n"
+ "add v17.4s, v17.4s, v9.4s\n"
+ "add v16.4s, v16.4s, v10.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v23.4s, v26.4s\n"
+ "add v27.4s, v22.4s, v27.4s\n"
+ "add v28.4s, v21.4s, v28.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v17.4s, v20.4s\n"
+ "add v31.4s, v16.4s, v18.4s\n"
+ "neg v12.4s, v12.4s\n"
+ "mul v24.4s, v24.4s, v12.4s\n"
+ "mul v25.4s, v25.4s, v12.4s\n"
+ "mul v26.4s, v26.4s, v12.4s\n"
+ "mul v27.4s, v27.4s, v12.4s\n"
+ "mul v28.4s, v28.4s, v12.4s\n"
+ "mul v29.4s, v29.4s, v12.4s\n"
+ "mul v30.4s, v30.4s, v12.4s\n"
+ "mul v31.4s, v31.4s, v12.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ "ldr q8, [%x[params], #0x0]\n"
+ "ldr q21, [%x[params], #0x10]\n"
+ ".inst 0x4f80e0b8 // sdot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0b9 // sdot v25.4s, v5.16b, v0.4b[1]\n"
+ "ldr q20, [%x[params], #0x20]\n"
+ ".inst 0x4f80e8ba // sdot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8bb // sdot v27.4s, v5.16b, v0.4b[3]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x4f81e0d8 // sdot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f81e8da // sdot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8db // sdot v27.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4f82e0bc // sdot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0bd // sdot v29.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x4f82e8be // sdot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8bf // sdot v31.4s, v5.16b, v2.4b[3]\n"
+ "ldr q5, [%x[params], #0x30]\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e0dd // sdot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ ".inst 0x4f83e8de // sdot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [%x[params], #0x40]\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ ".inst 0x4f84e0fc // sdot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e0fd // sdot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x4f84e8fe // sdot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e8ff // sdot v31.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [%x[params], #0x50]\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q21, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x4f80e0b8 // sdot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0b9 // sdot v25.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8ba // sdot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8bb // sdot v27.4s, v5.16b, v0.4b[3]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x27, x27, x28\n"
+ ".inst 0x4f81e0d8 // sdot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x4f81e8da // sdot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8db // sdot v27.4s, v6.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x4f82e0bc // sdot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0bd // sdot v29.4s, v5.16b, v2.4b[1]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x4f82e8be // sdot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8bf // sdot v31.4s, v5.16b, v2.4b[3]\n"
+ "add x20, x20, x28\n"
+ "add %x[params], %x[params], #0x20\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e0dd // sdot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x4f83e8de // sdot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x4f84e0fc // sdot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e0fd // sdot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x4f84e8fe // sdot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e8ff // sdot v31.4s, v7.16b, v4.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "blt 3f\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "beq 4f\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
+ "4:" // Tail: End
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
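The `sqrdmulh` / `and`+`sshr`+`sqadd` / `srshl` sequences that close both the main loop and the tail above implement the usual gemmlowp-style fixed-point requantization: a saturating rounding doubling high multiply by the per-layer multiplier, then a rounding right shift (the shift operand is stored negated, so `srshl` shifts right), with the `and`/`sshr`/`sqadd` fix-up nudging negative values so the shift rounds ties away from zero. A scalar C++ sketch of the same arithmetic, following the well-known gemmlowp reference; the function names are illustrative, not library API:

#include <cstdint>
#include <limits>

// Scalar model of SQRDMULH: saturating rounding doubling high multiply.
static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
    // The only overflowing case is INT32_MIN * INT32_MIN.
    if (a == std::numeric_limits<int32_t>::min() && b == a)
        return std::numeric_limits<int32_t>::max();
    const int64_t ab    = static_cast<int64_t>(a) * b;
    const int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
    return static_cast<int32_t>((ab + nudge) / (1ll << 31));
}

// Scalar model of the and/sshr/sqadd fix-up plus srshl: divide by 2^exponent,
// rounding to nearest with ties away from zero.
static int32_t rounding_divide_by_pot(int32_t x, int exponent)
{
    const int32_t mask      = static_cast<int32_t>((1ll << exponent) - 1);
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// One requantized output byte, as produced at the end of the loop above.
static int8_t requantize(int32_t acc, int32_t mul, int shift,
                         int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = rounding_divide_by_pot(rounding_doubling_high_mul(acc, mul), shift);
    v += c_offset;                 // "add" of the c_offset vector
    v = v > maxval ? maxval : v;   // "smin" against maxval
    v = v < minval ? minval : v;   // "smax" against minval
    return static_cast<int8_t>(v); // "uzp1" narrows 32-bit lanes to bytes
}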
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..62b033f48d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
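The strategy above only records geometry: a 5x5 kernel at stride 1 producing a 4x2 output tile per call (the `Parent(4, 2, ...)` arguments). From those constants the input patch each call consumes follows directly: (output_rows - 1) * stride + kernel_rows = (4 - 1) * 1 + 5 = 8 rows, and likewise (2 - 1) * 1 + 5 = 6 columns. A minimal sketch of that relationship, as a hypothetical helper rather than library API:

constexpr unsigned int input_extent(unsigned int output_extent,
                                    unsigned int stride,
                                    unsigned int kernel_extent)
{
    // Each output point advances by `stride`; the last one still needs a
    // full kernel window, hence the trailing `kernel_extent`.
    return (output_extent - 1) * stride + kernel_extent;
}

static_assert(input_extent(4, 1, 5) == 8, "rows read per 4x2 output tile");
static_assert(input_extent(2, 1, 5) == 6, "cols read per 4x2 output tile");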
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..17afc92e30
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,640 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "ldr q12, [%x[params], #0x0]\n"
+ "ldr q8, [%x[params], #0x10]\n"
+ "movi v30.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "ldr q10, [%x[params], #0x30]\n"
+ "movi v16.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr q11, [%x[params], #0x40]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "mov v26.16b, v3.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "mov v21.16b, v4.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "mov v27.16b, v2.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "zip1 v3.2d, v3.2d, v26.2d\n"
+ "zip1 v4.2d, v4.2d, v21.2d\n"
+ "ld1 { v5.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "mov v26.16b, v1.16b\n"
+ "mov v22.16b, v5.16b\n"
+ "ld1 { v6.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x38]\n"
+ "mov v19.16b, v6.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "ld1 { v7.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "mov v21.16b, v7.16b\n"
+ "zip1 v2.2d, v2.2d, v27.2d\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4f83e3d1 // sdot v17.4s, v30.16b, v3.4b[0]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4f83ebd0 // sdot v16.4s, v30.16b, v3.4b[2]\n"
+ ".inst 0x4f84e3d9 // sdot v25.4s, v30.16b, v4.4b[0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ ".inst 0x4f84ebd8 // sdot v24.4s, v30.16b, v4.4b[2]\n"
+ "mov v18.16b, v0.16b\n"
+ ".inst 0x4f82e3df // sdot v31.4s, v30.16b, v2.4b[0]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v28.4s, #0x1\n"
+ ".inst 0x4f82ebdd // sdot v29.4s, v30.16b, v2.4b[2]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "zip1 v1.2d, v1.2d, v26.2d\n"
+ ".inst 0x4fa3e391 // sdot v17.4s, v28.16b, v3.4b[1]\n"
+ "zip1 v5.2d, v5.2d, v22.2d\n"
+ "zip1 v6.2d, v6.2d, v19.2d\n"
+ ".inst 0x4fa3eb90 // sdot v16.4s, v28.16b, v3.4b[3]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "zip1 v7.2d, v7.2d, v21.2d\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4fa4eb98 // sdot v24.4s, v28.16b, v4.4b[3]\n"
+ ".inst 0x4f81e3d6 // sdot v22.4s, v30.16b, v1.4b[0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4f85e3da // sdot v26.4s, v30.16b, v5.4b[0]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "zip1 v0.2d, v0.2d, v18.2d\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4f85ebdb // sdot v27.4s, v30.16b, v5.4b[2]\n"
+ "mov x9, #0x0\n"
+ ".inst 0x4f86e3d4 // sdot v20.4s, v30.16b, v6.4b[0]\n"
+ ".inst 0x4f86ebd3 // sdot v19.4s, v30.16b, v6.4b[2]\n"
+ "add v17.4s, v17.4s, v25.4s\n"
+ "mov x28, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x4f87e3d2 // sdot v18.4s, v30.16b, v7.4b[0]\n"
+ ".inst 0x4f87ebd9 // sdot v25.4s, v30.16b, v7.4b[2]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ ".inst 0x4fa2e39f // sdot v31.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa2eb9d // sdot v29.4s, v28.16b, v2.4b[3]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x4f80e3d8 // sdot v24.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x4fa1e396 // sdot v22.4s, v28.16b, v1.4b[1]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ ".inst 0x4fa1eb95 // sdot v21.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa5e39a // sdot v26.4s, v28.16b, v5.4b[1]\n"
+ "add v31.4s, v31.4s, v17.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ ".inst 0x4fa5eb9b // sdot v27.4s, v28.16b, v5.4b[3]\n"
+ ".inst 0x4fa6e394 // sdot v20.4s, v28.16b, v6.4b[1]\n"
+ "add v29.4s, v29.4s, v16.4s\n"
+ "add %x[params], %x[params], #0x50\n"
+ ".inst 0x4fa6eb93 // sdot v19.4s, v28.16b, v6.4b[3]\n"
+ ".inst 0x4fa7e392 // sdot v18.4s, v28.16b, v7.4b[1]\n"
+ "add v22.4s, v22.4s, v31.4s\n"
+ ".inst 0x4fa7eb99 // sdot v25.4s, v28.16b, v7.4b[3]\n"
+ ".inst 0x4fa0e398 // sdot v24.4s, v28.16b, v0.4b[1]\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v20.4s, v26.4s, v20.4s\n"
+ "add v19.4s, v27.4s, v19.4s\n"
+ "add v18.4s, v18.4s, v17.4s\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x4fa0eb91 // sdot v17.4s, v28.16b, v0.4b[3]\n"
+ "add v16.4s, v25.4s, v16.4s\n"
+ "add v24.4s, v22.4s, v24.4s\n"
+ "add v25.4s, v21.4s, v17.4s\n"
+ "add v26.4s, v26.4s, v22.4s\n"
+ "add v27.4s, v27.4s, v21.4s\n"
+ "add v28.4s, v20.4s, v31.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v20.4s, v18.4s\n"
+ "add v31.4s, v19.4s, v16.4s\n"
+ "neg v23.4s, v23.4s\n"
+ "mul v24.4s, v24.4s, v23.4s\n"
+ "mul v25.4s, v25.4s, v23.4s\n"
+ "mul v26.4s, v26.4s, v23.4s\n"
+ "mul v27.4s, v27.4s, v23.4s\n"
+ "mul v28.4s, v28.4s, v23.4s\n"
+ "mul v29.4s, v29.4s, v23.4s\n"
+ "mul v30.4s, v30.4s, v23.4s\n"
+ "mul v31.4s, v31.4s, v23.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v27.4s, v27.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ "ldr q12, [%x[params], #0x60]\n"
+ "ldr q21, [%x[params], #0x70]\n"
+ ".inst 0x4f80e118 // sdot v24.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f80e919 // sdot v25.4s, v8.16b, v0.4b[2]\n"
+ "ldr q20, [%x[params], #0x80]\n"
+ ".inst 0x4f81e11a // sdot v26.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x4f81e91b // sdot v27.4s, v8.16b, v1.4b[2]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x4fa0e138 // sdot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa0e939 // sdot v25.4s, v9.16b, v0.4b[3]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4fa1e13a // sdot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e93b // sdot v27.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4f82e11c // sdot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x4f82e91d // sdot v29.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e13c // sdot v28.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
+ "ldr q16, [%x[params], #0x10]\n"
+ ".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e97b // sdot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x4f84e23c // sdot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea3d // sdot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23e // sdot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3f // sdot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e21c // sdot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea1d // sdot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e21e // sdot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1f // sdot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
+ ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x4f85e27c // sdot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea7d // sdot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x4f86e27e // sdot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x4f86ea7f // sdot v31.4s, v19.16b, v6.4b[2]\n"
+ "ldr q10, [%x[params], #0xb0]\n"
+ ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x4fa5e25c // sdot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea5d // sdot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x4fa6e25e // sdot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x4fa6ea5f // sdot v31.4s, v18.16b, v6.4b[3]\n"
+ "ldr q11, [%x[params], #0xc0]\n"
+ ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v12.4s\n"
+ ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+ ".inst 0x4f86e23c // sdot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x4f86ea3d // sdot v29.4s, v17.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v12.4s\n"
+ ".inst 0x4f87e23e // sdot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x4f87ea3f // sdot v31.4s, v17.16b, v7.4b[2]\n"
+ "ldr q8, [%x[params], #0x90]\n"
+ "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+ ".inst 0x4fa6e21c // sdot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x4fa6ea1d // sdot v29.4s, v16.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x4fa7e21e // sdot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x4fa7ea1f // sdot v31.4s, v16.16b, v7.4b[3]\n"
+ "ldr q9, [%x[params], #0xa0]\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0xd0\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v12.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v12.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v12.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q21, [%x[params], #0x60]\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ ".inst 0x4f80e118 // sdot v24.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f80e919 // sdot v25.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e11a // sdot v26.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x4f81e91b // sdot v27.4s, v8.16b, v1.4b[2]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x27, x27, x28\n"
+ ".inst 0x4fa0e138 // sdot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa0e939 // sdot v25.4s, v9.16b, v0.4b[3]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x4fa1e13a // sdot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e93b // sdot v27.4s, v9.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x4f82e11c // sdot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x4f82e91d // sdot v29.4s, v8.16b, v2.4b[2]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "add x20, x20, x28\n"
+ ".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e13c // sdot v28.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
+ "ldr q16, [%x[params], #0x10]\n"
+ ".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e97b // sdot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x4f84e23c // sdot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea3d // sdot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23e // sdot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3f // sdot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e21c // sdot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea1d // sdot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e21e // sdot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1f // sdot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x80\n"
+ ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x4f85e27c // sdot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea7d // sdot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x4f86e27e // sdot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x4f86ea7f // sdot v31.4s, v19.16b, v6.4b[2]\n"
+ ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x4fa5e25c // sdot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea5d // sdot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x4fa6e25e // sdot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x4fa6ea5f // sdot v31.4s, v18.16b, v6.4b[3]\n"
+ ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x4f86e23c // sdot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x4f86ea3d // sdot v29.4s, v17.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x4f87e23e // sdot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x4f87ea3f // sdot v31.4s, v17.16b, v7.4b[2]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x4fa6e21c // sdot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x4fa6ea1d // sdot v29.4s, v16.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x4fa7e21e // sdot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x4fa7ea1f // sdot v31.4s, v16.16b, v7.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "blt 3f\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "beq 4f\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
+ "4:" // Tail: End
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
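The dot-product kernel above opens by computing per-output sums of the input bytes with `sdot` against an all-ones vector (`movi v30.16b, #0x1`), then multiplying those sums by the negated weight zero point (`ld1r` of `b_offset`, `neg`, `mul`) and adding the bias row loaded from `params`. That folds the weight zero-point correction of the requantized convolution into the accumulator initialisation, so the inner loop can accumulate raw `sdot` products. In scalar form, a sketch with illustrative names:

#include <cstdint>

// Accumulator seeding used by the dot-product kernels. The convolution needs
//   sum_k x_k * (w_k - b_offset) = sum_k x_k * w_k  -  b_offset * sum_k x_k
// so seeding with  bias - b_offset * sum(x)  lets the loop accumulate plain x*w.
int32_t init_accumulator(const int8_t *x, unsigned int n_taps,
                         int32_t bias, int32_t b_offset)
{
    int32_t input_sum = 0;
    for (unsigned int k = 0; k < n_taps; k++)
        input_sum += x[k];               // the sdot against the ones vector
    return bias - b_offset * input_sum;  // the neg + mul + add in the prologue
}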
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..3f71c5fb64
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+struct a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<int8_t, int8_t, int8_t, int32_t>;
+ a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)
+ {
+ }
+ Parent::KernelType kernel = a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
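Unlike the dot-product kernels, the generic multiplier kernel declared above (its implementation follows in the next file) handles zero points up front: `ssubl` widens each int8 input and weight to int16 while subtracting `a_offset` and `b_offset` respectively, after which `smlal` accumulates exact 16x16 -> 32-bit products lane by lane. A scalar sketch of one accumulator under that scheme, again with illustrative names (the hardware performs the subtraction in 16-bit lanes):

#include <cstdint>

// One accumulator of the widening mla scheme: subtract the zero points during
// the int8 -> int16 widening (ssubl), then multiply-accumulate into 32 bits (smlal).
int32_t mla_accumulate(const int8_t *x, const int8_t *w, unsigned int n_taps,
                       int32_t bias, int32_t a_offset, int32_t b_offset)
{
    int32_t acc = bias;
    for (unsigned int k = 0; k < n_taps; k++)
    {
        const int32_t xv = static_cast<int32_t>(x[k]) - a_offset;
        const int32_t wv = static_cast<int32_t>(w[k]) - b_offset;
        acc += xv * wv;
    }
    return acc; // subsequently scaled by the per-layer or per-channel parameters
}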
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..b21ad484e5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1480 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const int8_t *weights,
+ const int32_t *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const int32_t *per_channel_left_shifts,
+ const int32_t *per_channel_muls,
+ const int32_t *per_channel_right_shifts,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "lsr x10, %x[n_output_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v9.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "mov x9, #0x0\n"
+ "cbz x10, 9f\n"
+ "1:" // Output channel loop
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 3f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q9, [%x[rq_mul_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 3f\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
+ "3:" // Output channel loop: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "beq 5f\n"
+ "4:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 4b\n"
+ "5:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 6f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "8:" // Output channel loop: Done
+ "add x9, x9, #0x4\n"
+ "cmp x9, x10, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 26f\n"
+ "9:" // Output channel oddments
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 12f\n"
+ "add x20, %x[bias], x9, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 10f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: Unset
+ "ld1 { v31.s }[0], [x20]\n"
+ "11:" // Output channel oddments: Load bias: Bit 1: End
+ "12:" // Output channel oddments: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 18f\n"
+ "add x22, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "cbz %x[rq_left_shift_ptr], 15f\n"
+ "tbz %x[n_output_channels], #1, 13f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 14f\n"
+ "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+ "b 18f\n"
+ "15:" // Output channel oddments: Load quantization parameters: No left shift
+ "tbz %x[n_output_channels], #1, 16f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "b 17f\n"
+ "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+ "18:" // Output channel oddments: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "beq 20f\n"
+ "19:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 19b\n"
+ "20:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 21f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "b 23f\n"
+ "21:" // Output channel oddments: Odd tail
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
+ "b 23f\n"
+ "22:" // Output channel oddments: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "23:" // Output channel oddments: Done
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_output_channels], #1, 24f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "add x9, x9, #0x2\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
+ "b 25f\n"
+ "24:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
+ "25:" // Output channel oddments: Done: Store: Bit 1: End
+ "26:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..3190cbfbf0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int, const int8_t *const *const, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *const);
+
+class a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
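+ // Storage for the interleaved weights/bias/requantisation blob built by pack_parameters().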
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_a64_s8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_a64_s8q_3x3_dot::pack_parameters(
+ args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..aad34c4c25
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1484 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
+{
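+ // Requantisation sketch (assumption, gemmlowp-style fixed point): per lane,
+ //   out = clamp(rounding_rshift(sqrdmulh(acc, mul), shift) + c_offset, minval, maxval)
+ // where acc comes from the sdot accumulations and mul/shift from the params blob.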
+ __asm__ __volatile__(
+ "lsr x15, %x[n_channels], #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "ldp x14, x13, [%x[inptrs], #0x0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
+ "cbz x15, 3f\n"
+ "ldr q11, [x14, x12]\n"
+ "ldr q20, [x13, x12]\n"
+ "subs x15, x15, #0x1\n"
+ "ldr q16, [x10, x12]\n"
+ "ldr q14, [x9, x12]\n"
+ "zip2 v19.16b, v11.16b, v16.16b\n"
+ "zip1 v11.16b, v11.16b, v16.16b\n"
+ "ldr q13, [x28, x12]\n"
+ "ldr q18, [x27, x12]\n"
+ "zip1 v17.16b, v20.16b, v14.16b\n"
+ "zip2 v14.16b, v20.16b, v14.16b\n"
+ "ldr q16, [x26, x12]\n"
+ "ldr q27, [x21, x12]\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "zip1 v3.16b, v19.16b, v14.16b\n"
+ "zip2 v14.16b, v19.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "zip2 v30.16b, v13.16b, v16.16b\n"
+ "zip1 v13.16b, v13.16b, v16.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q5, [x21, x12]\n"
+ "zip1 v16.16b, v18.16b, v27.16b\n"
+ "zip2 v27.16b, v18.16b, v27.16b\n"
+ "ldr q17, [x20, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v28.16b, v13.16b, v16.16b\n"
+ "zip1 v13.16b, v13.16b, v16.16b\n"
+ "ldr q16, [x21, x12]\n"
+ "ldr q7, [x20, x12]\n"
+ "zip2 v20.16b, v5.16b, v16.16b\n"
+ "zip1 v5.16b, v5.16b, v16.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q16, [x21, x12]\n"
+ "zip1 v22.16b, v17.16b, v7.16b\n"
+ "zip2 v7.16b, v17.16b, v7.16b\n"
+ "ldr q19, [x20, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v21.16b, v30.16b, v27.16b\n"
+ "zip2 v27.16b, v30.16b, v27.16b\n"
+ "ldr q30, [x21, x12]\n"
+ "ldr q1, [x20, x12]\n"
+ "zip2 v17.16b, v16.16b, v30.16b\n"
+ "zip1 v16.16b, v16.16b, v30.16b\n"
+ "zip1 v18.16b, v19.16b, v1.16b\n"
+ "zip2 v1.16b, v19.16b, v1.16b\n"
+ "ldp x14, x13, [%x[inptrs], #0x0]\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v29.16b, v5.16b, v22.16b\n"
+ "zip1 v5.16b, v5.16b, v22.16b\n"
+ "zip1 v0.16b, v20.16b, v7.16b\n"
+ "zip2 v7.16b, v20.16b, v7.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip2 v30.16b, v16.16b, v18.16b\n"
+ "zip1 v16.16b, v16.16b, v18.16b\n"
+ "zip1 v2.16b, v17.16b, v1.16b\n"
+ "zip2 v1.16b, v17.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v4.16b, v31.16b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n"
+ ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
+ "add x12, x12, #0x10\n"
+ ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n"
+ ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n"
+ ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n"
+ ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "ldr q5, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "ldr q13, [%x[params], #0x40]\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x50]\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s31, [x25, x11]\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s18, [x23, x11]\n"
+ "mov v26.16b, v24.16b\n"
+ "str s4, [x22, x11]\n"
+ "mov v25.16b, v24.16b\n"
+ "mov v23.16b, v24.16b\n"
+ ".inst 0x4e8a9618 // sdot v24.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c95b8 // sdot v24.4s, v13.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a961a // sdot v26.4s, v16.16b, v10.16b\n"
+ "ldr q10, [x13, x12]\n"
+ ".inst 0x4e9c9617 // sdot v23.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9d95b9 // sdot v25.4s, v13.16b, v29.16b\n"
+ ".inst 0x4e9d9638 // sdot v24.4s, v17.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c95ba // sdot v26.4s, v13.16b, v28.16b\n"
+ "ldr q20, [x27, x12]\n"
+ ".inst 0x4e9d95b7 // sdot v23.4s, v13.16b, v29.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v5.4s\n"
+ ".inst 0x4e9e9639 // sdot v25.4s, v17.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e9d963a // sdot v26.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9e9637 // sdot v23.4s, v17.16b, v30.16b\n"
+ "and v16.16b, v24.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v5.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v5.4s\n"
+ "ldr q19, [%x[params], #0xc0]\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "and v18.16b, v26.16b, v22.16b\n"
+ "and v17.16b, v25.16b, v22.16b\n"
+ "and v16.16b, v23.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0xa0]\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0xb0]\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x90]\n"
+ "add v24.4s, v24.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "srshl v23.4s, v23.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x11]\n"
+ "ldr q24, [%x[params], #0x80]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s25, [x23, x11]\n"
+ "str s23, [x22, x11]\n"
+ "mov v23.16b, v24.16b\n"
+ "mov v31.16b, v24.16b\n"
+ ".inst 0x4e95961f // sdot v31.4s, v16.16b, v21.16b\n"
+ "mov v13.16b, v24.16b\n"
+ ".inst 0x4e839618 // sdot v24.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e959658 // sdot v24.4s, v18.16b, v21.16b\n"
+ "add x11, x11, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e839617 // sdot v23.4s, v16.16b, v3.16b\n"
+ "ldr q3, [x10, x12]\n"
+ ".inst 0x4e95960d // sdot v13.4s, v16.16b, v21.16b\n"
+ ".inst 0x4e80965f // sdot v31.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e809638 // sdot v24.4s, v17.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e959657 // sdot v23.4s, v18.16b, v21.16b\n"
+ "ldr q4, [x26, x12]\n"
+ ".inst 0x4e80964d // sdot v13.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e82963f // sdot v31.4s, v17.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v24.4s, v24.4s, v19.4s\n"
+ ".inst 0x4e809637 // sdot v23.4s, v17.16b, v0.16b\n"
+ ".inst 0x4e82962d // sdot v13.4s, v17.16b, v2.16b\n"
+ "and v16.16b, v24.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v23.4s, v23.4s, v19.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v19.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v19.4s\n"
+ "ldr q19, [%x[params], #0x120]\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "and v18.16b, v23.16b, v22.16b\n"
+ "and v17.16b, v31.16b, v22.16b\n"
+ "and v16.16b, v13.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x100]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x110]\n"
+ "sqadd v13.4s, v13.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "add v24.4s, v24.4s, v15.4s\n"
+ "srshl v23.4s, v23.4s, v22.4s\n"
+ "srshl v31.4s, v31.4s, v22.4s\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0x130]\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v13.4s, v13.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v12.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v13.4s, v13.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v13.4s, v13.4s, v12.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s24, [x25, x11]\n"
+ "ldr q2, [%x[params], #0xe0]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s23, [x24, x11]\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str s31, [x23, x11]\n"
+ "mov v25.16b, v2.16b\n"
+ "str s13, [x22, x11]\n"
+ "mov v21.16b, v2.16b\n"
+ "mov v30.16b, v2.16b\n"
+ ".inst 0x4e8e9602 // sdot v2.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9615 // sdot v21.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e9b9642 // sdot v2.4s, v18.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e9619 // sdot v25.4s, v16.16b, v14.16b\n"
+ "ldr q14, [x9, x12]\n"
+ ".inst 0x4e9b961e // sdot v30.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e879655 // sdot v21.4s, v18.16b, v7.16b\n"
+ ".inst 0x4e879622 // sdot v2.4s, v17.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b9659 // sdot v25.4s, v18.16b, v27.16b\n"
+ "ldr q27, [x21, x12]\n"
+ ".inst 0x4e87965e // sdot v30.4s, v18.16b, v7.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v19.4s\n"
+ ".inst 0x4e819635 // sdot v21.4s, v17.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e879639 // sdot v25.4s, v17.16b, v7.16b\n"
+ ".inst 0x4e81963e // sdot v30.4s, v17.16b, v1.16b\n"
+ "and v16.16b, v2.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v25.4s, v25.4s, v19.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v19.4s\n"
+ "ldr q11, [x14, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q5, [x21, x12]\n"
+ "ldr q29, [x20, x12]\n"
+ "sqadd v2.4s, v2.4s, v16.4s\n"
+ "and v19.16b, v25.16b, v22.16b\n"
+ "and v17.16b, v21.16b, v22.16b\n"
+ "and v16.16b, v30.16b, v22.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q26, [x21, x12]\n"
+ "ldr q7, [x20, x12]\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v22.4s\n"
+ "sqadd v25.4s, v25.4s, v19.4s\n"
+ "ldr q9, [%x[params], #0x160]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x170]\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q24, [%x[params], #0x150]\n"
+ "add v2.4s, v2.4s, v15.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "ldr q13, [x28, x12]\n"
+ "smax v2.4s, v2.4s, v8.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q16, [x21, x12]\n"
+ "ldr q28, [x20, x12]\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "add v30.4s, v30.4s, v15.4s\n"
+ "smin v2.4s, v2.4s, v12.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x12]\n"
+ "ldr q1, [x20, x12]\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v21.4s, v21.4s, v8.4s\n"
+ "ldp x14, x13, [%x[inptrs], #0x0]\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str s2, [x25, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "zip2 v18.16b, v11.16b, v3.16b\n"
+ "zip1 v11.16b, v11.16b, v3.16b\n"
+ "zip1 v17.16b, v10.16b, v14.16b\n"
+ "zip2 v14.16b, v10.16b, v14.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x24, x11]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s21, [x23, x11]\n"
+ "str s30, [x22, x11]\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
+ "add x11, x11, #0x4\n"
+ "zip1 v3.16b, v18.16b, v14.16b\n"
+ "zip2 v14.16b, v18.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
+ "add %x[params], %x[params], #0x180\n"
+ "zip2 v22.16b, v13.16b, v4.16b\n"
+ "zip1 v13.16b, v13.16b, v4.16b\n"
+ "zip1 v2.16b, v20.16b, v27.16b\n"
+ "zip2 v27.16b, v20.16b, v27.16b\n"
+ "zip2 v19.16b, v5.16b, v26.16b\n"
+ "zip1 v5.16b, v5.16b, v26.16b\n"
+ "zip1 v18.16b, v29.16b, v7.16b\n"
+ "zip2 v7.16b, v29.16b, v7.16b\n"
+ "zip2 v4.16b, v16.16b, v23.16b\n"
+ "zip1 v16.16b, v16.16b, v23.16b\n"
+ "zip1 v17.16b, v28.16b, v1.16b\n"
+ "zip2 v1.16b, v28.16b, v1.16b\n"
+ "zip2 v28.16b, v13.16b, v2.16b\n"
+ "zip1 v13.16b, v13.16b, v2.16b\n"
+ "zip1 v21.16b, v22.16b, v27.16b\n"
+ "zip2 v27.16b, v22.16b, v27.16b\n"
+ "zip2 v29.16b, v5.16b, v18.16b\n"
+ "zip1 v5.16b, v5.16b, v18.16b\n"
+ "zip1 v0.16b, v19.16b, v7.16b\n"
+ "zip2 v7.16b, v19.16b, v7.16b\n"
+ "zip2 v30.16b, v16.16b, v17.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ "zip1 v2.16b, v4.16b, v1.16b\n"
+ "zip2 v1.16b, v4.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v4.16b, v31.16b\n"
+ "bgt 1b\n"
+ "2:" // Detached iteration
+ ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n"
+ ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
+ "tst %x[n_channels], #0xf\n"
+ ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n"
+ ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n"
+ "add x12, x12, #0x10\n"
+ ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n"
+ "ldr q19, [%x[params], #0x10]\n"
+ ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n"
+ ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n"
+ ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v19.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v20.16b, v26.16b, v19.16b\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "and v16.16b, v4.16b, v19.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v19.4s\n"
+ "sqadd v26.4s, v26.4s, v20.4s\n"
+ "ldr q5, [%x[params], #0x40]\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x50]\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
+ "srshl v4.4s, v4.4s, v19.4s\n"
+ "ldr q23, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s31, [x25, x11]\n"
+ "ldr q25, [%x[params], #0x20]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s18, [x23, x11]\n"
+ "mov v22.16b, v25.16b\n"
+ "str s4, [x22, x11]\n"
+ "mov v20.16b, v25.16b\n"
+ "mov v19.16b, v25.16b\n"
+ ".inst 0x4e8a9619 // sdot v25.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c94b9 // sdot v25.4s, v5.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a9616 // sdot v22.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9613 // sdot v19.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9d94b4 // sdot v20.4s, v5.16b, v29.16b\n"
+ ".inst 0x4e9d9639 // sdot v25.4s, v17.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c94b6 // sdot v22.4s, v5.16b, v28.16b\n"
+ ".inst 0x4e9d94b3 // sdot v19.4s, v5.16b, v29.16b\n"
+ "sqrdmulh v25.4s, v25.4s, v24.4s\n"
+ ".inst 0x4e9e9634 // sdot v20.4s, v17.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e9d9636 // sdot v22.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9e9633 // sdot v19.4s, v17.16b, v30.16b\n"
+ "and v16.16b, v25.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "ldr q24, [%x[params], #0xc0]\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v18.16b, v22.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "sqadd v22.4s, v22.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0xa0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0xb0]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x90]\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0xd0]\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "add v22.4s, v22.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x25, x11]\n"
+ "ldr q10, [%x[params], #0x80]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x24, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s20, [x23, x11]\n"
+ "str s19, [x22, x11]\n"
+ "mov v28.16b, v10.16b\n"
+ "mov v20.16b, v10.16b\n"
+ ".inst 0x4e959614 // sdot v20.4s, v16.16b, v21.16b\n"
+ "mov v19.16b, v10.16b\n"
+ ".inst 0x4e83960a // sdot v10.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e95964a // sdot v10.4s, v18.16b, v21.16b\n"
+ "add x11, x11, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e83961c // sdot v28.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e959613 // sdot v19.4s, v16.16b, v21.16b\n"
+ ".inst 0x4e809654 // sdot v20.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e80962a // sdot v10.4s, v17.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e95965c // sdot v28.4s, v18.16b, v21.16b\n"
+ ".inst 0x4e809653 // sdot v19.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e829634 // sdot v20.4s, v17.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v10.4s, v10.4s, v24.4s\n"
+ ".inst 0x4e80963c // sdot v28.4s, v17.16b, v0.16b\n"
+ ".inst 0x4e829633 // sdot v19.4s, v17.16b, v2.16b\n"
+ "and v16.16b, v10.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "ldr q24, [%x[params], #0x120]\n"
+ "sqadd v10.4s, v10.4s, v16.4s\n"
+ "and v18.16b, v28.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v10.4s, v10.4s, v23.4s\n"
+ "sqadd v28.4s, v28.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x100]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x110]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "add v10.4s, v10.4s, v15.4s\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v10.4s, v10.4s, v8.4s\n"
+ "add v28.4s, v28.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smin v10.4s, v10.4s, v12.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v28.4s, v28.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s10, [x25, x11]\n"
+ "ldr q22, [%x[params], #0xe0]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s28, [x24, x11]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s20, [x23, x11]\n"
+ "mov v21.16b, v22.16b\n"
+ "str s19, [x22, x11]\n"
+ "mov v20.16b, v22.16b\n"
+ "mov v19.16b, v22.16b\n"
+ ".inst 0x4e8e9616 // sdot v22.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e9b9656 // sdot v22.4s, v18.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e9615 // sdot v21.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9613 // sdot v19.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e879654 // sdot v20.4s, v18.16b, v7.16b\n"
+ ".inst 0x4e879636 // sdot v22.4s, v17.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b9655 // sdot v21.4s, v18.16b, v27.16b\n"
+ ".inst 0x4e879653 // sdot v19.4s, v18.16b, v7.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ ".inst 0x4e819634 // sdot v20.4s, v17.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e879635 // sdot v21.4s, v17.16b, v7.16b\n"
+ ".inst 0x4e819633 // sdot v19.4s, v17.16b, v1.16b\n"
+ "and v16.16b, v22.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v18.16b, v21.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "add v22.4s, v22.4s, v15.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v8.4s\n"
+ "smax v21.4s, v21.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x25, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s21, [x24, x11]\n"
+ "str s20, [x23, x11]\n"
+ "str s19, [x22, x11]\n"
+ "add x11, x11, #0x4\n"
+ "beq 35f\n"
+ "3:" // Oddments
+ "and x20, %x[n_channels], #0xf\n"
+ "add x14, x14, x12\n"
+ "add x13, x13, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
+ "add x27, x27, x12\n"
+ "add x26, x26, x12\n"
+ "add x21, x21, x12\n"
+ "tbz %x[n_channels], #3, 7f\n"
+ "ldr d11, [x14], #0x8\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d3, [x10], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d13, [x28], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d21, [x26], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v11.s }[2], [x14], #0x4\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v3.s }[2], [x10], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v13.s }[2], [x28], #0x4\n"
+ "ld1 { v28.s }[2], [x27], #0x4\n"
+ "ld1 { v21.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v11.h }[6], [x14], #0x2\n"
+ "ld1 { v10.h }[6], [x13], #0x2\n"
+ "ld1 { v3.h }[6], [x10], #0x2\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v13.h }[6], [x28], #0x2\n"
+ "ld1 { v28.h }[6], [x27], #0x2\n"
+ "ld1 { v21.h }[6], [x26], #0x2\n"
+ "ld1 { v27.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[14], [x14], #0x1\n"
+ "ld1 { v10.b }[14], [x13], #0x1\n"
+ "ld1 { v3.b }[14], [x10], #0x1\n"
+ "ld1 { v14.b }[14], [x9], #0x1\n"
+ "ld1 { v13.b }[14], [x28], #0x1\n"
+ "ld1 { v28.b }[14], [x27], #0x1\n"
+ "ld1 { v21.b }[14], [x26], #0x1\n"
+ "ld1 { v27.b }[14], [x21], #0x1\n"
+ "b 11f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[12], [x14], #0x1\n"
+ "ld1 { v10.b }[12], [x13], #0x1\n"
+ "ld1 { v3.b }[12], [x10], #0x1\n"
+ "ld1 { v14.b }[12], [x9], #0x1\n"
+ "ld1 { v13.b }[12], [x28], #0x1\n"
+ "ld1 { v28.b }[12], [x27], #0x1\n"
+ "ld1 { v21.b }[12], [x26], #0x1\n"
+ "ld1 { v27.b }[12], [x21], #0x1\n"
+ "b 11f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v11.h }[4], [x14], #0x2\n"
+ "ld1 { v10.h }[4], [x13], #0x2\n"
+ "ld1 { v3.h }[4], [x10], #0x2\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v13.h }[4], [x28], #0x2\n"
+ "ld1 { v28.h }[4], [x27], #0x2\n"
+ "ld1 { v21.h }[4], [x26], #0x2\n"
+ "ld1 { v27.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[10], [x14], #0x1\n"
+ "ld1 { v10.b }[10], [x13], #0x1\n"
+ "ld1 { v3.b }[10], [x10], #0x1\n"
+ "ld1 { v14.b }[10], [x9], #0x1\n"
+ "ld1 { v13.b }[10], [x28], #0x1\n"
+ "ld1 { v28.b }[10], [x27], #0x1\n"
+ "ld1 { v21.b }[10], [x26], #0x1\n"
+ "ld1 { v27.b }[10], [x21], #0x1\n"
+ "b 11f\n"
+ "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[8], [x14], #0x1\n"
+ "ld1 { v10.b }[8], [x13], #0x1\n"
+ "ld1 { v3.b }[8], [x10], #0x1\n"
+ "ld1 { v14.b }[8], [x9], #0x1\n"
+ "ld1 { v13.b }[8], [x28], #0x1\n"
+ "ld1 { v28.b }[8], [x27], #0x1\n"
+ "ld1 { v21.b }[8], [x26], #0x1\n"
+ "ld1 { v27.b }[8], [x21], #0x1\n"
+ "b 11f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 9f\n"
+ "ldr s11, [x14], #0x4\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s3, [x10], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s13, [x28], #0x4\n"
+ "ldr s28, [x27], #0x4\n"
+ "ldr s21, [x26], #0x4\n"
+ "ldr s27, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.h }[2], [x14], #0x2\n"
+ "ld1 { v10.h }[2], [x13], #0x2\n"
+ "ld1 { v3.h }[2], [x10], #0x2\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v13.h }[2], [x28], #0x2\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
+ "ld1 { v21.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[6], [x14], #0x1\n"
+ "ld1 { v10.b }[6], [x13], #0x1\n"
+ "ld1 { v3.b }[6], [x10], #0x1\n"
+ "ld1 { v14.b }[6], [x9], #0x1\n"
+ "ld1 { v13.b }[6], [x28], #0x1\n"
+ "ld1 { v28.b }[6], [x27], #0x1\n"
+ "ld1 { v21.b }[6], [x26], #0x1\n"
+ "ld1 { v27.b }[6], [x21], #0x1\n"
+ "b 11f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[4], [x14], #0x1\n"
+ "ld1 { v10.b }[4], [x13], #0x1\n"
+ "ld1 { v3.b }[4], [x10], #0x1\n"
+ "ld1 { v14.b }[4], [x9], #0x1\n"
+ "ld1 { v13.b }[4], [x28], #0x1\n"
+ "ld1 { v28.b }[4], [x27], #0x1\n"
+ "ld1 { v21.b }[4], [x26], #0x1\n"
+ "ld1 { v27.b }[4], [x21], #0x1\n"
+ "b 11f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h11, [x14], #0x2\n"
+ "ldr h10, [x13], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h13, [x28], #0x2\n"
+ "ldr h28, [x27], #0x2\n"
+ "ldr h21, [x26], #0x2\n"
+ "ldr h27, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[2], [x14], #0x1\n"
+ "ld1 { v10.b }[2], [x13], #0x1\n"
+ "ld1 { v3.b }[2], [x10], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v13.b }[2], [x28], #0x1\n"
+ "ld1 { v28.b }[2], [x27], #0x1\n"
+ "ld1 { v21.b }[2], [x26], #0x1\n"
+ "ld1 { v27.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b11, [x14], #0x1\n"
+ "ldr b10, [x13], #0x1\n"
+ "ldr b3, [x10], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b13, [x28], #0x1\n"
+ "ldr b28, [x27], #0x1\n"
+ "ldr b21, [x26], #0x1\n"
+ "ldr b27, [x21], #0x1\n"
+ "11:" // Oddments: Load (A): Bit 3: End
+ "ldp x14, x13, [%x[inptrs], #0x40]\n"
+ "ldp x10, x9, [%x[inptrs], #0x50]\n"
+ "add x14, x14, x12\n"
+ "add x13, x13, x12\n"
+ "ldp x28, x27, [%x[inptrs], #0x60]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
+ "add x27, x27, x12\n"
+ "add x26, x26, x12\n"
+ "add x21, x21, x12\n"
+ "tbz %x[n_channels], #3, 15f\n"
+ "ldr d5, [x14], #0x8\n"
+ "ldr d29, [x13], #0x8\n"
+ "ldr d0, [x10], #0x8\n"
+ "ldr d7, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d30, [x27], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d1, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v5.s }[2], [x14], #0x4\n"
+ "ld1 { v29.s }[2], [x13], #0x4\n"
+ "ld1 { v0.s }[2], [x10], #0x4\n"
+ "ld1 { v7.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x28], #0x4\n"
+ "ld1 { v30.s }[2], [x27], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v5.h }[6], [x14], #0x2\n"
+ "ld1 { v29.h }[6], [x13], #0x2\n"
+ "ld1 { v0.h }[6], [x10], #0x2\n"
+ "ld1 { v7.h }[6], [x9], #0x2\n"
+ "ld1 { v16.h }[6], [x28], #0x2\n"
+ "ld1 { v30.h }[6], [x27], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v1.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[14], [x14], #0x1\n"
+ "ld1 { v29.b }[14], [x13], #0x1\n"
+ "ld1 { v0.b }[14], [x10], #0x1\n"
+ "ld1 { v7.b }[14], [x9], #0x1\n"
+ "ld1 { v16.b }[14], [x28], #0x1\n"
+ "ld1 { v30.b }[14], [x27], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v1.b }[14], [x21], #0x1\n"
+ "b 19f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[12], [x14], #0x1\n"
+ "ld1 { v29.b }[12], [x13], #0x1\n"
+ "ld1 { v0.b }[12], [x10], #0x1\n"
+ "ld1 { v7.b }[12], [x9], #0x1\n"
+ "ld1 { v16.b }[12], [x28], #0x1\n"
+ "ld1 { v30.b }[12], [x27], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v1.b }[12], [x21], #0x1\n"
+ "b 19f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v5.h }[4], [x14], #0x2\n"
+ "ld1 { v29.h }[4], [x13], #0x2\n"
+ "ld1 { v0.h }[4], [x10], #0x2\n"
+ "ld1 { v7.h }[4], [x9], #0x2\n"
+ "ld1 { v16.h }[4], [x28], #0x2\n"
+ "ld1 { v30.h }[4], [x27], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v1.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[10], [x14], #0x1\n"
+ "ld1 { v29.b }[10], [x13], #0x1\n"
+ "ld1 { v0.b }[10], [x10], #0x1\n"
+ "ld1 { v7.b }[10], [x9], #0x1\n"
+ "ld1 { v16.b }[10], [x28], #0x1\n"
+ "ld1 { v30.b }[10], [x27], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v1.b }[10], [x21], #0x1\n"
+ "b 19f\n"
+ "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[8], [x14], #0x1\n"
+ "ld1 { v29.b }[8], [x13], #0x1\n"
+ "ld1 { v0.b }[8], [x10], #0x1\n"
+ "ld1 { v7.b }[8], [x9], #0x1\n"
+ "ld1 { v16.b }[8], [x28], #0x1\n"
+ "ld1 { v30.b }[8], [x27], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v1.b }[8], [x21], #0x1\n"
+ "b 19f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr s5, [x14], #0x4\n"
+ "ldr s29, [x13], #0x4\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s7, [x9], #0x4\n"
+ "ldr s16, [x28], #0x4\n"
+ "ldr s30, [x27], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s1, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v5.h }[2], [x14], #0x2\n"
+ "ld1 { v29.h }[2], [x13], #0x2\n"
+ "ld1 { v0.h }[2], [x10], #0x2\n"
+ "ld1 { v7.h }[2], [x9], #0x2\n"
+ "ld1 { v16.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[6], [x14], #0x1\n"
+ "ld1 { v29.b }[6], [x13], #0x1\n"
+ "ld1 { v0.b }[6], [x10], #0x1\n"
+ "ld1 { v7.b }[6], [x9], #0x1\n"
+ "ld1 { v16.b }[6], [x28], #0x1\n"
+ "ld1 { v30.b }[6], [x27], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v1.b }[6], [x21], #0x1\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[4], [x14], #0x1\n"
+ "ld1 { v29.b }[4], [x13], #0x1\n"
+ "ld1 { v0.b }[4], [x10], #0x1\n"
+ "ld1 { v7.b }[4], [x9], #0x1\n"
+ "ld1 { v16.b }[4], [x28], #0x1\n"
+ "ld1 { v30.b }[4], [x27], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v1.b }[4], [x21], #0x1\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr h5, [x14], #0x2\n"
+ "ldr h29, [x13], #0x2\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h7, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "ldr h30, [x27], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h1, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[2], [x14], #0x1\n"
+ "ld1 { v29.b }[2], [x13], #0x1\n"
+ "ld1 { v0.b }[2], [x10], #0x1\n"
+ "ld1 { v7.b }[2], [x9], #0x1\n"
+ "ld1 { v16.b }[2], [x28], #0x1\n"
+ "ld1 { v30.b }[2], [x27], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v1.b }[2], [x21], #0x1\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b5, [x14], #0x1\n"
+ "ldr b29, [x13], #0x1\n"
+ "ldr b0, [x10], #0x1\n"
+ "ldr b7, [x9], #0x1\n"
+ "ldr b16, [x28], #0x1\n"
+ "ldr b30, [x27], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b1, [x21], #0x1\n"
+ "19:" // Oddments: Load (B): Bit 3: End
+ "ldr q25, [%x[params], #0x10]\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "zip2 v18.16b, v11.16b, v3.16b\n"
+ "zip1 v11.16b, v11.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x30]\n"
+ "zip1 v17.16b, v10.16b, v14.16b\n"
+ "zip2 v14.16b, v10.16b, v14.16b\n"
+ "cmp x20, #0x4\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
+ "zip1 v3.16b, v18.16b, v14.16b\n"
+ "zip2 v14.16b, v18.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v22.16b, v13.16b, v21.16b\n"
+ "zip1 v13.16b, v13.16b, v21.16b\n"
+ "zip1 v21.16b, v28.16b, v27.16b\n"
+ "zip2 v27.16b, v28.16b, v27.16b\n"
+ "zip2 v20.16b, v5.16b, v0.16b\n"
+ "zip1 v5.16b, v5.16b, v0.16b\n"
+ "zip1 v19.16b, v29.16b, v7.16b\n"
+ "zip2 v7.16b, v29.16b, v7.16b\n"
+ "zip2 v18.16b, v16.16b, v2.16b\n"
+ "zip1 v16.16b, v16.16b, v2.16b\n"
+ "zip1 v17.16b, v30.16b, v1.16b\n"
+ "zip2 v1.16b, v30.16b, v1.16b\n"
+ "zip2 v28.16b, v13.16b, v21.16b\n"
+ "zip1 v13.16b, v13.16b, v21.16b\n"
+ "zip1 v21.16b, v22.16b, v27.16b\n"
+ "zip2 v27.16b, v22.16b, v27.16b\n"
+ "zip2 v29.16b, v5.16b, v19.16b\n"
+ "zip1 v5.16b, v5.16b, v19.16b\n"
+ "zip1 v0.16b, v20.16b, v7.16b\n"
+ "zip2 v7.16b, v20.16b, v7.16b\n"
+ "zip2 v30.16b, v16.16b, v17.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v1.16b\n"
+ "zip2 v1.16b, v18.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ ".inst 0x4e8d9732 // sdot v18.4s, v25.16b, v13.16b\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8b973f // sdot v31.4s, v25.16b, v11.16b\n"
+ ".inst 0x4e8d971f // sdot v31.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b973a // sdot v26.4s, v25.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x4e8d9724 // sdot v4.4s, v25.16b, v13.16b\n"
+ ".inst 0x4e859712 // sdot v18.4s, v24.16b, v5.16b\n"
+ ".inst 0x4e8596ff // sdot v31.4s, v23.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8d971a // sdot v26.4s, v24.16b, v13.16b\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e859704 // sdot v4.4s, v24.16b, v5.16b\n"
+ ".inst 0x4e9096f2 // sdot v18.4s, v23.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8596fa // sdot v26.4s, v23.16b, v5.16b\n"
+ ".inst 0x4e9096e4 // sdot v4.4s, v23.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "blt 20f\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
+ "b 23f\n"
+ "20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "tbz x20, #1, 21f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 22f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
+ "b 22f\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
+ "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+ "23:" // Oddments: Unroll 0: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x11, x11, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8a96ff // sdot v31.4s, v23.16b, v10.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e9c96f2 // sdot v18.4s, v23.16b, v28.16b\n"
+ ".inst 0x4e9c96df // sdot v31.4s, v22.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a96fa // sdot v26.4s, v23.16b, v10.16b\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e9c96e4 // sdot v4.4s, v23.16b, v28.16b\n"
+ ".inst 0x4e9d96d2 // sdot v18.4s, v22.16b, v29.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e9d961f // sdot v31.4s, v16.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c96da // sdot v26.4s, v22.16b, v28.16b\n"
+ ".inst 0x4e9d96c4 // sdot v4.4s, v22.16b, v29.16b\n"
+ ".inst 0x4e9e9612 // sdot v18.4s, v16.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e9d961a // sdot v26.4s, v16.16b, v29.16b\n"
+ ".inst 0x4e9e9604 // sdot v4.4s, v16.16b, v30.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "blt 24f\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
+ "b 27f\n"
+ "24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "tbz x20, #1, 25f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
+ "b 26f\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
+ "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+ "27:" // Oddments: Unroll 1: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x11, x11, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8396ff // sdot v31.4s, v23.16b, v3.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e9596f2 // sdot v18.4s, v23.16b, v21.16b\n"
+ ".inst 0x4e9596df // sdot v31.4s, v22.16b, v21.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e8396fa // sdot v26.4s, v23.16b, v3.16b\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e9596e4 // sdot v4.4s, v23.16b, v21.16b\n"
+ ".inst 0x4e8096d2 // sdot v18.4s, v22.16b, v0.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e80961f // sdot v31.4s, v16.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e9596da // sdot v26.4s, v22.16b, v21.16b\n"
+ ".inst 0x4e8096c4 // sdot v4.4s, v22.16b, v0.16b\n"
+ ".inst 0x4e829612 // sdot v18.4s, v16.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e80961a // sdot v26.4s, v16.16b, v0.16b\n"
+ ".inst 0x4e829604 // sdot v4.4s, v16.16b, v2.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "blt 28f\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
+ "b 31f\n"
+ "28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 30f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
+ "b 30f\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
+ "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+ "31:" // Oddments: Unroll 2: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x11, x11, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8e969f // sdot v31.4s, v20.16b, v14.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q22, [%x[params], #0x50]\n"
+ ".inst 0x4e9b9692 // sdot v18.4s, v20.16b, v27.16b\n"
+ ".inst 0x4e9b967f // sdot v31.4s, v19.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e969a // sdot v26.4s, v20.16b, v14.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e9b9684 // sdot v4.4s, v20.16b, v27.16b\n"
+ ".inst 0x4e879672 // sdot v18.4s, v19.16b, v7.16b\n"
+ ".inst 0x4e87961f // sdot v31.4s, v16.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b967a // sdot v26.4s, v19.16b, v27.16b\n"
+ ".inst 0x4e879664 // sdot v4.4s, v19.16b, v7.16b\n"
+ ".inst 0x4e819612 // sdot v18.4s, v16.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e87961a // sdot v26.4s, v16.16b, v7.16b\n"
+ ".inst 0x4e819604 // sdot v4.4s, v16.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v23.16b, v26.16b, v22.16b\n"
+ "and v17.16b, v18.16b, v22.16b\n"
+ "and v16.16b, v4.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v23.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v22.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "srshl v4.4s, v4.4s, v22.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 34f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
+ "b 34f\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
+ "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+ "35:" // End
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
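
The long rescale/store pattern that dominates the listing above repeats one fixed-point requantization idiom per output vector: sqrdmulh against the per-layer multiplier, an and/sshr/sqadd sign fixup followed by srshl for the rounding right shift, an add of the Requantize32 c_offset, an smax/smin clamp, and a pair of uzp1 narrowings before each 32-bit store. A minimal scalar sketch of that per-lane math, with illustrative names only (here `exponent` is the non-negative right-shift amount, i.e. the negation of the value held in the shift register):

#include <algorithm>
#include <cstdint>

// SQRDMULH: saturating rounding doubling multiply returning the high half.
static int32_t sqrdmulh(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN)
        return INT32_MAX; // the single saturating case
    return static_cast<int32_t>((static_cast<int64_t>(a) * b + (1LL << 30)) >> 31);
}

// Scalar model of the and/sshr/sqadd fixup plus srshl: divide by 2^exponent,
// rounding to nearest with ties away from zero (gemmlowp's RoundingDivideByPOT).
static int32_t rounding_divide_by_pot(int32_t x, int exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// One output lane of the sequence seen above, for the signed kernels.
static int8_t requantize_lane(int32_t acc, int32_t multiplier, int exponent,
                              int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = sqrdmulh(acc, multiplier);   // sqrdmulh
    v = rounding_divide_by_pot(v, exponent); // and/sshr/sqadd, then srshl
    v += c_offset;                           // add of Requantize32::c_offset
    v = std::max(v, minval);                 // smax (clamp low)
    v = std::min(v, maxval);                 // smin (clamp high)
    return static_cast<int8_t>(v);           // uzp1 narrowing and str s
}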
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..4026855617
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
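+// The unnamed arguments mirror the implementation in generic.cpp below:
+// (n_channels, inptrs, params, unused, qp, unused, unused, outptrs).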
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int, const uint8_t *const *const, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *const);
+
+class a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_a64_u8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_a64_u8q_3x3_dot::pack_parameters(
+ args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const uint8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
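
The constants wired into Parent(2, 2, 3, 3, 1, 1) above (a 2x2 output tile from a 3x3 kernel at stride 1) fix the geometry the implementation assumes: each tile reads a 4x4 input patch, which is why the generic.cpp below walks sixteen pointers out of inptrs. A one-line check of that arithmetic (illustrative helper, not library API):

// Rows (or cols) of input needed for `out` outputs with kernel `kern` at `stride`.
constexpr unsigned int input_extent(unsigned int out, unsigned int kern, unsigned int stride)
{
    return (out - 1) * stride + kern;
}
static_assert(input_extent(2, 3, 1) == 4, "a 2x2 tile from a 3x3/s1 kernel spans 4 input rows");
static_assert(input_extent(2, 3, 1) * input_extent(2, 3, 1) == 16, "hence the 16 input pointers");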
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..5a28daffbf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1658 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const uint8_t *const *const inptrs, const uint8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, uint8_t *const *const outptrs)
+{
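+    // Note: same structure as the signed kernel earlier in this patch. The
+    // udot instructions against packed weights accumulate three input bytes
+    // per lane; the extra udots against the 0x00010101 mask in v12 build the
+    // matching input sums, which "mls ..., v16.4s" multiplies by the weight
+    // offset (Requantize32::b_offset) and subtracts, the uint8 zero-point
+    // correction, before the fixed-point rescale and clamp.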
+ __asm__ __volatile__(
+ "mov x20, #0x1\n"
+ "orr x20, x20, #0x100\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "orr x20, x20, #0x10000\n"
+ "lsr x11, %x[n_channels], #0x4\n"
+ "dup v12.4s, w20\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
+ "cbz x11, 3f\n"
+ "ldr q15, [x15, x28]\n"
+ "ldr q28, [x14, x28]\n"
+ "subs x11, x11, #0x1\n"
+ "ldr q30, [x13, x28]\n"
+ "ldr q8, [x12, x28]\n"
+ "zip2 v19.16b, v15.16b, v30.16b\n"
+ "zip1 v15.16b, v15.16b, v30.16b\n"
+ "ldr q26, [x10, x28]\n"
+ "ldr q0, [x9, x28]\n"
+ "zip1 v7.16b, v28.16b, v8.16b\n"
+ "zip2 v8.16b, v28.16b, v8.16b\n"
+ "ldr q29, [x26, x28]\n"
+ "ldr q10, [x21, x28]\n"
+ "zip2 v25.16b, v15.16b, v7.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip1 v7.16b, v19.16b, v8.16b\n"
+ "zip2 v8.16b, v19.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x30]\n"
+ "zip2 v21.16b, v26.16b, v29.16b\n"
+ "zip1 v26.16b, v26.16b, v29.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "zip1 v27.16b, v0.16b, v10.16b\n"
+ "zip2 v10.16b, v0.16b, v10.16b\n"
+ "ldr q17, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v23.16b, v26.16b, v27.16b\n"
+ "zip1 v26.16b, v26.16b, v27.16b\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "zip2 v28.16b, v22.16b, v9.16b\n"
+ "zip1 v22.16b, v22.16b, v9.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "zip1 v24.16b, v17.16b, v5.16b\n"
+ "zip2 v5.16b, v17.16b, v5.16b\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v3.16b, v21.16b, v10.16b\n"
+ "zip2 v10.16b, v21.16b, v10.16b\n"
+ "ldr q4, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "zip2 v17.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v4.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v19.16b, v22.16b, v24.16b\n"
+ "zip1 v22.16b, v22.16b, v24.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip2 v24.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v2.16b, v17.16b, v9.16b\n"
+ "zip2 v9.16b, v17.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6e9a9595 // udot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e8f943f // udot v31.4s, v1.16b, v15.16b\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6e969595 // udot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9a943d // udot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x6e9b9591 // udot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9595 // udot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9592 // udot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96969f // udot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f943e // udot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e9a943c // udot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x6e969592 // udot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b969d // udot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
+ "ldr q26, [%x[params], #0x10]\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x6e9b9595 // udot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9592 // udot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x6e96969e // udot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x6e9b969c // udot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v15.16b, v31.16b, v26.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v15.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v21.16b, v29.16b, v26.16b\n"
+ "and v17.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e979596 // udot v22.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x6e939596 // udot v22.4s, v12.16b, v19.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v6.16b, v22.16b\n .inst 0x6e989586 // udot v6.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v30.16b, v26.16b\n"
+ ".inst 0x6e999596 // udot v22.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v29.16b, v26.16b\n"
+ "mov v21.16b, v26.16b\n"
+ ".inst 0x6e9995fa // udot v26.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e9795fd // udot v29.4s, v15.16b, v23.16b\n"
+ ".inst 0x6e97965a // udot v26.4s, v18.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x6e9995fe // udot v30.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e9795f5 // udot v21.4s, v15.16b, v23.16b\n"
+ ".inst 0x6e97959c // udot v28.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e93965d // udot v29.4s, v18.16b, v19.16b\n"
+ ".inst 0x6e93977a // udot v26.4s, v27.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97965e // udot v30.4s, v18.16b, v23.16b\n"
+ "ldr q4, [x9, x28]\n"
+ ".inst 0x6e939655 // udot v21.4s, v18.16b, v19.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e93959c // udot v28.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e98977d // udot v29.4s, v27.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e93977e // udot v30.4s, v27.16b, v19.16b\n"
+ ".inst 0x6e989775 // udot v21.4s, v27.16b, v24.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "mov v17.16b, v28.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e99959c // udot v28.4s, v12.16b, v25.16b\n"
+ "ldr q31, [x14, x28]\n"
+ "mls v30.4s, v28.4s, v16.4s\n"
+ "mls v29.4s, v6.4s, v16.4s\n"
+ "mls v21.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v30.16b, v20.16b\n"
+ "and v6.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v21.16b, v20.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0xa0]\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x90]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v21.4s, v21.4s, v20.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e839596 // udot v22.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809596 // udot v22.4s, v12.16b, v0.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x80]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "mov v18.16b, v22.16b\n .inst 0x6e829592 // udot v18.4s, v12.16b, v2.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ ".inst 0x6e879596 // udot v22.4s, v12.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v6.16b, v26.16b\n"
+ "str s21, [x22, x27]\n"
+ "mov v25.16b, v26.16b\n"
+ "mov v20.16b, v26.16b\n"
+ ".inst 0x6e8795fa // udot v26.4s, v15.16b, v7.16b\n"
+ ".inst 0x6e8395f9 // udot v25.4s, v15.16b, v3.16b\n"
+ ".inst 0x6e83979a // udot v26.4s, v28.16b, v3.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6e8795e6 // udot v6.4s, v15.16b, v7.16b\n"
+ ".inst 0x6e8395f4 // udot v20.4s, v15.16b, v3.16b\n"
+ ".inst 0x6e839597 // udot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809799 // udot v25.4s, v28.16b, v0.16b\n"
+ ".inst 0x6e80971a // udot v26.4s, v24.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e839786 // udot v6.4s, v28.16b, v3.16b\n"
+ "ldr q19, [x26, x28]\n"
+ ".inst 0x6e809794 // udot v20.4s, v28.16b, v0.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e809597 // udot v23.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e829719 // udot v25.4s, v24.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e809706 // udot v6.4s, v24.16b, v0.16b\n"
+ ".inst 0x6e829714 // udot v20.4s, v24.16b, v2.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+ "mov v17.16b, v23.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879597 // udot v23.4s, v12.16b, v7.16b\n"
+ "ldr q21, [x13, x28]\n"
+ "mls v6.4s, v23.4s, v16.4s\n"
+ "mls v25.4s, v18.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v6.4s, v6.4s, v27.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q15, [%x[params], #0x120]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v6.16b, v1.16b\n"
+ "and v22.16b, v25.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "ldr q30, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v22.4s\n"
+ "ldr q27, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v6.4s, v6.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v6.4s, v6.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v6.4s, v6.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v6.4s, v6.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v0.4s, #0x0\n"
+ ".inst 0x6e8a9580 // udot v0.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e859580 // udot v0.4s, v12.16b, v5.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q28, [%x[params], #0xe0]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v22.16b, v0.16b\n .inst 0x6e899596 // udot v22.4s, v12.16b, v9.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s6, [x24, x27]\n"
+ ".inst 0x6e889580 // udot v0.4s, v12.16b, v8.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s25, [x23, x27]\n"
+ "mov v29.16b, v28.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v25.16b, v28.16b\n"
+ "mov v7.16b, v28.16b\n"
+ ".inst 0x6e88971c // udot v28.4s, v24.16b, v8.16b\n"
+ ".inst 0x6e8a9719 // udot v25.4s, v24.16b, v10.16b\n"
+ ".inst 0x6e8a97dc // udot v28.4s, v30.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e88971d // udot v29.4s, v24.16b, v8.16b\n"
+ ".inst 0x6e8a9707 // udot v7.4s, v24.16b, v10.16b\n"
+ ".inst 0x6e8a9591 // udot v17.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8597d9 // udot v25.4s, v30.16b, v5.16b\n"
+ ".inst 0x6e85977c // udot v28.4s, v27.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a97dd // udot v29.4s, v30.16b, v10.16b\n"
+ "ldr q10, [x21, x28]\n"
+ ".inst 0x6e8597c7 // udot v7.4s, v30.16b, v5.16b\n"
+ "mls v28.4s, v0.4s, v16.4s\n"
+ ".inst 0x6e859591 // udot v17.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e899779 // udot v25.4s, v27.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e85977d // udot v29.4s, v27.16b, v5.16b\n"
+ ".inst 0x6e899767 // udot v7.4s, v27.16b, v9.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v15.4s\n"
+ "mov v18.16b, v17.16b\n .inst 0x6e899592 // udot v18.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889591 // udot v17.4s, v12.16b, v8.16b\n"
+ "ldr q8, [x12, x28]\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v25.4s, v22.4s, v16.4s\n"
+ "mls v7.4s, v18.4s, v16.4s\n"
+ "and v17.16b, v28.16b, v23.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v15.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "ldr q15, [x15, x28]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "ldr q3, [x20, x28]\n"
+ "and v24.16b, v29.16b, v23.16b\n"
+ "and v20.16b, v25.16b, v23.16b\n"
+ "and v17.16b, v7.16b, v23.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q2, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "sqadd v29.4s, v29.4s, v24.4s\n"
+ "ldr q6, [%x[params], #0x160]\n"
+ "sqadd v25.4s, v25.4s, v20.4s\n"
+ "ldr q20, [%x[params], #0x170]\n"
+ "sqadd v7.4s, v7.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x150]\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v23.4s\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "srshl v7.4s, v7.4s, v23.4s\n"
+ "ldr q26, [x10, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v7.4s, v7.4s, v14.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v7.4s, v7.4s, v13.4s\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "smin v7.4s, v7.4s, v11.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s28, [x25, x27]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "zip2 v17.16b, v15.16b, v21.16b\n"
+ "zip1 v15.16b, v15.16b, v21.16b\n"
+ "zip1 v18.16b, v31.16b, v8.16b\n"
+ "zip2 v8.16b, v31.16b, v8.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s29, [x24, x27]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str s25, [x23, x27]\n"
+ "zip2 v25.16b, v15.16b, v18.16b\n"
+ "str s7, [x22, x27]\n"
+ "zip1 v15.16b, v15.16b, v18.16b\n"
+ "zip1 v7.16b, v17.16b, v8.16b\n"
+ "add x27, x27, #0x4\n"
+ "zip2 v8.16b, v17.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
+ "zip2 v29.16b, v26.16b, v19.16b\n"
+ "add %x[params], %x[params], #0x180\n"
+ "zip1 v26.16b, v26.16b, v19.16b\n"
+ "zip1 v28.16b, v4.16b, v10.16b\n"
+ "zip2 v10.16b, v4.16b, v10.16b\n"
+ "zip2 v24.16b, v22.16b, v2.16b\n"
+ "zip1 v22.16b, v22.16b, v2.16b\n"
+ "zip1 v21.16b, v3.16b, v5.16b\n"
+ "zip2 v5.16b, v3.16b, v5.16b\n"
+ "zip2 v18.16b, v27.16b, v23.16b\n"
+ "zip1 v27.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v30.16b, v9.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "zip2 v23.16b, v26.16b, v28.16b\n"
+ "zip1 v26.16b, v26.16b, v28.16b\n"
+ "zip1 v3.16b, v29.16b, v10.16b\n"
+ "zip2 v10.16b, v29.16b, v10.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v0.16b, v24.16b, v5.16b\n"
+ "zip2 v5.16b, v24.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v17.16b\n"
+ "zip1 v27.16b, v27.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "bgt 1b\n"
+ "2:" // Detached iteration
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6e9a9595 // udot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e8f943f // udot v31.4s, v1.16b, v15.16b\n"
+ "tst %x[n_channels], #0xf\n"
+ ".inst 0x6e969595 // udot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9a943d // udot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x6e9b9591 // udot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9595 // udot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9592 // udot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96969f // udot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f943e // udot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e9a943c // udot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x6e969592 // udot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b969d // udot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x6e9b9595 // udot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9592 // udot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x6e96969e // udot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x6e9b969c // udot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v27.16b, v31.16b, v4.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v27.4s\n"
+ "and v20.16b, v30.16b, v4.16b\n"
+ "and v18.16b, v29.16b, v4.16b\n"
+ "and v17.16b, v28.16b, v4.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "ldr q27, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x6e979581 // udot v1.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x6e939581 // udot v1.4s, v12.16b, v19.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v22.16b, v1.16b\n .inst 0x6e989596 // udot v22.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v29.16b, v31.16b\n"
+ ".inst 0x6e999581 // udot v1.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x6e9994df // udot v31.4s, v6.16b, v25.16b\n"
+ ".inst 0x6e9794d5 // udot v21.4s, v6.16b, v23.16b\n"
+ ".inst 0x6e97977f // udot v31.4s, v27.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e9994dd // udot v29.4s, v6.16b, v25.16b\n"
+ ".inst 0x6e9794d4 // udot v20.4s, v6.16b, v23.16b\n"
+ ".inst 0x6e979592 // udot v18.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e939775 // udot v21.4s, v27.16b, v19.16b\n"
+ ".inst 0x6e93975f // udot v31.4s, v26.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97977d // udot v29.4s, v27.16b, v23.16b\n"
+ ".inst 0x6e939774 // udot v20.4s, v27.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x6e939592 // udot v18.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e989755 // udot v21.4s, v26.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e93975d // udot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x6e989754 // udot v20.4s, v26.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v15.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e999592 // udot v18.4s, v12.16b, v25.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v4.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v15.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v15.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v29.16b, v4.16b\n"
+ "and v18.16b, v21.16b, v4.16b\n"
+ "and v17.16b, v20.16b, v4.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v19.4s\n"
+ "ldr q26, [%x[params], #0xa0]\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "ldr q25, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x90]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6e839597 // udot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809597 // udot v23.4s, v12.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x80]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v22.16b, v23.16b\n .inst 0x6e829596 // udot v22.4s, v12.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x24, x27]\n"
+ ".inst 0x6e879597 // udot v23.4s, v12.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s21, [x23, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v4.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x6e87971f // udot v31.4s, v24.16b, v7.16b\n"
+ ".inst 0x6e839704 // udot v4.4s, v24.16b, v3.16b\n"
+ ".inst 0x6e83975f // udot v31.4s, v26.16b, v3.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e879715 // udot v21.4s, v24.16b, v7.16b\n"
+ ".inst 0x6e839714 // udot v20.4s, v24.16b, v3.16b\n"
+ ".inst 0x6e839592 // udot v18.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809744 // udot v4.4s, v26.16b, v0.16b\n"
+ ".inst 0x6e80973f // udot v31.4s, v25.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e839755 // udot v21.4s, v26.16b, v3.16b\n"
+ ".inst 0x6e809754 // udot v20.4s, v26.16b, v0.16b\n"
+ "mls v31.4s, v23.4s, v16.4s\n"
+ ".inst 0x6e809592 // udot v18.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e829724 // udot v4.4s, v25.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e809735 // udot v21.4s, v25.16b, v0.16b\n"
+ ".inst 0x6e829734 // udot v20.4s, v25.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v27.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879592 // udot v18.4s, v12.16b, v7.16b\n"
+ "mls v21.4s, v18.4s, v16.4s\n"
+ "mls v4.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v21.16b, v1.16b\n"
+ "and v18.16b, v4.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sqadd v21.4s, v21.4s, v19.4s\n"
+ "ldr q29, [%x[params], #0x100]\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q27, [%x[params], #0xf0]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v4.4s, v4.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q26, [%x[params], #0x130]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v4.4s, v4.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v4.4s, v4.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v4.4s, v4.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x6e8a9599 // udot v25.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e859599 // udot v25.4s, v12.16b, v5.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q24, [%x[params], #0xe0]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v23.16b, v25.16b\n .inst 0x6e899597 // udot v23.4s, v12.16b, v9.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s21, [x24, x27]\n"
+ ".inst 0x6e889599 // udot v25.4s, v12.16b, v8.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s4, [x23, x27]\n"
+ "mov v22.16b, v24.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v21.16b, v24.16b\n"
+ "mov v20.16b, v24.16b\n"
+ ".inst 0x6e889778 // udot v24.4s, v27.16b, v8.16b\n"
+ ".inst 0x6e8a9775 // udot v21.4s, v27.16b, v10.16b\n"
+ ".inst 0x6e8a97b8 // udot v24.4s, v29.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e889776 // udot v22.4s, v27.16b, v8.16b\n"
+ ".inst 0x6e8a9774 // udot v20.4s, v27.16b, v10.16b\n"
+ ".inst 0x6e8a9592 // udot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8597b5 // udot v21.4s, v29.16b, v5.16b\n"
+ ".inst 0x6e859798 // udot v24.4s, v28.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a97b6 // udot v22.4s, v29.16b, v10.16b\n"
+ ".inst 0x6e8597b4 // udot v20.4s, v29.16b, v5.16b\n"
+ "mls v24.4s, v25.4s, v16.4s\n"
+ ".inst 0x6e859592 // udot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e899795 // udot v21.4s, v28.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e859796 // udot v22.4s, v28.16b, v5.16b\n"
+ ".inst 0x6e899794 // udot v20.4s, v28.16b, v9.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e899591 // udot v17.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889592 // udot v18.4s, v12.16b, v8.16b\n"
+ "mls v22.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v23.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v26.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "and v19.16b, v22.16b, v26.16b\n"
+ "and v18.16b, v21.16b, v26.16b\n"
+ "and v17.16b, v20.16b, v26.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "srshl v24.4s, v24.4s, v26.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "srshl v21.4s, v21.4s, v26.4s\n"
+ "srshl v20.4s, v20.4s, v26.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x27]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x24, x27]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s21, [x23, x27]\n"
+ "str s20, [x22, x27]\n"
+ "add x27, x27, #0x4\n"
+ "beq 35f\n"
+ "3:" // Oddments
+ "and x20, %x[n_channels], #0xf\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x21, x21, x28\n"
+ "tbz %x[n_channels], #3, 7f\n"
+ "ldr d15, [x15], #0x8\n"
+ "ldr d25, [x14], #0x8\n"
+ "ldr d7, [x13], #0x8\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d26, [x10], #0x8\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d3, [x26], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v15.s }[2], [x15], #0x4\n"
+ "ld1 { v25.s }[2], [x14], #0x4\n"
+ "ld1 { v7.s }[2], [x13], #0x4\n"
+ "ld1 { v8.s }[2], [x12], #0x4\n"
+ "ld1 { v26.s }[2], [x10], #0x4\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v3.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v15.h }[6], [x15], #0x2\n"
+ "ld1 { v25.h }[6], [x14], #0x2\n"
+ "ld1 { v7.h }[6], [x13], #0x2\n"
+ "ld1 { v8.h }[6], [x12], #0x2\n"
+ "ld1 { v26.h }[6], [x10], #0x2\n"
+ "ld1 { v23.h }[6], [x9], #0x2\n"
+ "ld1 { v3.h }[6], [x26], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[14], [x15], #0x1\n"
+ "ld1 { v25.b }[14], [x14], #0x1\n"
+ "ld1 { v7.b }[14], [x13], #0x1\n"
+ "ld1 { v8.b }[14], [x12], #0x1\n"
+ "ld1 { v26.b }[14], [x10], #0x1\n"
+ "ld1 { v23.b }[14], [x9], #0x1\n"
+ "ld1 { v3.b }[14], [x26], #0x1\n"
+ "ld1 { v10.b }[14], [x21], #0x1\n"
+ "b 11f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[12], [x15], #0x1\n"
+ "ld1 { v25.b }[12], [x14], #0x1\n"
+ "ld1 { v7.b }[12], [x13], #0x1\n"
+ "ld1 { v8.b }[12], [x12], #0x1\n"
+ "ld1 { v26.b }[12], [x10], #0x1\n"
+ "ld1 { v23.b }[12], [x9], #0x1\n"
+ "ld1 { v3.b }[12], [x26], #0x1\n"
+ "ld1 { v10.b }[12], [x21], #0x1\n"
+ "b 11f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v15.h }[4], [x15], #0x2\n"
+ "ld1 { v25.h }[4], [x14], #0x2\n"
+ "ld1 { v7.h }[4], [x13], #0x2\n"
+ "ld1 { v8.h }[4], [x12], #0x2\n"
+ "ld1 { v26.h }[4], [x10], #0x2\n"
+ "ld1 { v23.h }[4], [x9], #0x2\n"
+ "ld1 { v3.h }[4], [x26], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[10], [x15], #0x1\n"
+ "ld1 { v25.b }[10], [x14], #0x1\n"
+ "ld1 { v7.b }[10], [x13], #0x1\n"
+ "ld1 { v8.b }[10], [x12], #0x1\n"
+ "ld1 { v26.b }[10], [x10], #0x1\n"
+ "ld1 { v23.b }[10], [x9], #0x1\n"
+ "ld1 { v3.b }[10], [x26], #0x1\n"
+ "ld1 { v10.b }[10], [x21], #0x1\n"
+ "b 11f\n"
+ "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[8], [x15], #0x1\n"
+ "ld1 { v25.b }[8], [x14], #0x1\n"
+ "ld1 { v7.b }[8], [x13], #0x1\n"
+ "ld1 { v8.b }[8], [x12], #0x1\n"
+ "ld1 { v26.b }[8], [x10], #0x1\n"
+ "ld1 { v23.b }[8], [x9], #0x1\n"
+ "ld1 { v3.b }[8], [x26], #0x1\n"
+ "ld1 { v10.b }[8], [x21], #0x1\n"
+ "b 11f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 9f\n"
+ "ldr s15, [x15], #0x4\n"
+ "ldr s25, [x14], #0x4\n"
+ "ldr s7, [x13], #0x4\n"
+ "ldr s8, [x12], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr s23, [x9], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v15.h }[2], [x15], #0x2\n"
+ "ld1 { v25.h }[2], [x14], #0x2\n"
+ "ld1 { v7.h }[2], [x13], #0x2\n"
+ "ld1 { v8.h }[2], [x12], #0x2\n"
+ "ld1 { v26.h }[2], [x10], #0x2\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "ld1 { v3.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[6], [x15], #0x1\n"
+ "ld1 { v25.b }[6], [x14], #0x1\n"
+ "ld1 { v7.b }[6], [x13], #0x1\n"
+ "ld1 { v8.b }[6], [x12], #0x1\n"
+ "ld1 { v26.b }[6], [x10], #0x1\n"
+ "ld1 { v23.b }[6], [x9], #0x1\n"
+ "ld1 { v3.b }[6], [x26], #0x1\n"
+ "ld1 { v10.b }[6], [x21], #0x1\n"
+ "b 11f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[4], [x15], #0x1\n"
+ "ld1 { v25.b }[4], [x14], #0x1\n"
+ "ld1 { v7.b }[4], [x13], #0x1\n"
+ "ld1 { v8.b }[4], [x12], #0x1\n"
+ "ld1 { v26.b }[4], [x10], #0x1\n"
+ "ld1 { v23.b }[4], [x9], #0x1\n"
+ "ld1 { v3.b }[4], [x26], #0x1\n"
+ "ld1 { v10.b }[4], [x21], #0x1\n"
+ "b 11f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h15, [x15], #0x2\n"
+ "ldr h25, [x14], #0x2\n"
+ "ldr h7, [x13], #0x2\n"
+ "ldr h8, [x12], #0x2\n"
+ "ldr h26, [x10], #0x2\n"
+ "ldr h23, [x9], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "ldr h10, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[2], [x15], #0x1\n"
+ "ld1 { v25.b }[2], [x14], #0x1\n"
+ "ld1 { v7.b }[2], [x13], #0x1\n"
+ "ld1 { v8.b }[2], [x12], #0x1\n"
+ "ld1 { v26.b }[2], [x10], #0x1\n"
+ "ld1 { v23.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x26], #0x1\n"
+ "ld1 { v10.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b15, [x15], #0x1\n"
+ "ldr b25, [x14], #0x1\n"
+ "ldr b7, [x13], #0x1\n"
+ "ldr b8, [x12], #0x1\n"
+ "ldr b26, [x10], #0x1\n"
+ "ldr b23, [x9], #0x1\n"
+ "ldr b3, [x26], #0x1\n"
+ "ldr b10, [x21], #0x1\n"
+ "11:" // Oddments: Load (A): Bit 3: End
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x21, x21, x28\n"
+ "tbz %x[n_channels], #3, 15f\n"
+ "ldr d22, [x15], #0x8\n"
+ "ldr d19, [x14], #0x8\n"
+ "ldr d0, [x13], #0x8\n"
+ "ldr d5, [x12], #0x8\n"
+ "ldr d27, [x10], #0x8\n"
+ "ldr d24, [x9], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d9, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v22.s }[2], [x15], #0x4\n"
+ "ld1 { v19.s }[2], [x14], #0x4\n"
+ "ld1 { v0.s }[2], [x13], #0x4\n"
+ "ld1 { v5.s }[2], [x12], #0x4\n"
+ "ld1 { v27.s }[2], [x10], #0x4\n"
+ "ld1 { v24.s }[2], [x9], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v22.h }[6], [x15], #0x2\n"
+ "ld1 { v19.h }[6], [x14], #0x2\n"
+ "ld1 { v0.h }[6], [x13], #0x2\n"
+ "ld1 { v5.h }[6], [x12], #0x2\n"
+ "ld1 { v27.h }[6], [x10], #0x2\n"
+ "ld1 { v24.h }[6], [x9], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[14], [x15], #0x1\n"
+ "ld1 { v19.b }[14], [x14], #0x1\n"
+ "ld1 { v0.b }[14], [x13], #0x1\n"
+ "ld1 { v5.b }[14], [x12], #0x1\n"
+ "ld1 { v27.b }[14], [x10], #0x1\n"
+ "ld1 { v24.b }[14], [x9], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v9.b }[14], [x21], #0x1\n"
+ "b 19f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[12], [x15], #0x1\n"
+ "ld1 { v19.b }[12], [x14], #0x1\n"
+ "ld1 { v0.b }[12], [x13], #0x1\n"
+ "ld1 { v5.b }[12], [x12], #0x1\n"
+ "ld1 { v27.b }[12], [x10], #0x1\n"
+ "ld1 { v24.b }[12], [x9], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v9.b }[12], [x21], #0x1\n"
+ "b 19f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v22.h }[4], [x15], #0x2\n"
+ "ld1 { v19.h }[4], [x14], #0x2\n"
+ "ld1 { v0.h }[4], [x13], #0x2\n"
+ "ld1 { v5.h }[4], [x12], #0x2\n"
+ "ld1 { v27.h }[4], [x10], #0x2\n"
+ "ld1 { v24.h }[4], [x9], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[10], [x15], #0x1\n"
+ "ld1 { v19.b }[10], [x14], #0x1\n"
+ "ld1 { v0.b }[10], [x13], #0x1\n"
+ "ld1 { v5.b }[10], [x12], #0x1\n"
+ "ld1 { v27.b }[10], [x10], #0x1\n"
+ "ld1 { v24.b }[10], [x9], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v9.b }[10], [x21], #0x1\n"
+ "b 19f\n"
+ "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[8], [x15], #0x1\n"
+ "ld1 { v19.b }[8], [x14], #0x1\n"
+ "ld1 { v0.b }[8], [x13], #0x1\n"
+ "ld1 { v5.b }[8], [x12], #0x1\n"
+ "ld1 { v27.b }[8], [x10], #0x1\n"
+ "ld1 { v24.b }[8], [x9], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v9.b }[8], [x21], #0x1\n"
+ "b 19f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr s22, [x15], #0x4\n"
+ "ldr s19, [x14], #0x4\n"
+ "ldr s0, [x13], #0x4\n"
+ "ldr s5, [x12], #0x4\n"
+ "ldr s27, [x10], #0x4\n"
+ "ldr s24, [x9], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s9, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v22.h }[2], [x15], #0x2\n"
+ "ld1 { v19.h }[2], [x14], #0x2\n"
+ "ld1 { v0.h }[2], [x13], #0x2\n"
+ "ld1 { v5.h }[2], [x12], #0x2\n"
+ "ld1 { v27.h }[2], [x10], #0x2\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[6], [x15], #0x1\n"
+ "ld1 { v19.b }[6], [x14], #0x1\n"
+ "ld1 { v0.b }[6], [x13], #0x1\n"
+ "ld1 { v5.b }[6], [x12], #0x1\n"
+ "ld1 { v27.b }[6], [x10], #0x1\n"
+ "ld1 { v24.b }[6], [x9], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v9.b }[6], [x21], #0x1\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[4], [x15], #0x1\n"
+ "ld1 { v19.b }[4], [x14], #0x1\n"
+ "ld1 { v0.b }[4], [x13], #0x1\n"
+ "ld1 { v5.b }[4], [x12], #0x1\n"
+ "ld1 { v27.b }[4], [x10], #0x1\n"
+ "ld1 { v24.b }[4], [x9], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v9.b }[4], [x21], #0x1\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr h22, [x15], #0x2\n"
+ "ldr h19, [x14], #0x2\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h5, [x12], #0x2\n"
+ "ldr h27, [x10], #0x2\n"
+ "ldr h24, [x9], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h9, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[2], [x15], #0x1\n"
+ "ld1 { v19.b }[2], [x14], #0x1\n"
+ "ld1 { v0.b }[2], [x13], #0x1\n"
+ "ld1 { v5.b }[2], [x12], #0x1\n"
+ "ld1 { v27.b }[2], [x10], #0x1\n"
+ "ld1 { v24.b }[2], [x9], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v9.b }[2], [x21], #0x1\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b22, [x15], #0x1\n"
+ "ldr b19, [x14], #0x1\n"
+ "ldr b0, [x13], #0x1\n"
+ "ldr b5, [x12], #0x1\n"
+ "ldr b27, [x10], #0x1\n"
+ "ldr b24, [x9], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b9, [x21], #0x1\n"
+ "19:" // Oddments: Load (B): Bit 3: End
+ "ldr q20, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip2 v1.16b, v26.16b, v3.16b\n"
+ "zip1 v26.16b, v26.16b, v3.16b\n"
+ "ldr q4, [%x[params], #0x30]\n"
+ "zip1 v18.16b, v23.16b, v10.16b\n"
+ "zip2 v30.16b, v15.16b, v7.16b\n"
+ "cmp x20, #0x4\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "zip1 v29.16b, v25.16b, v8.16b\n"
+ "zip2 v8.16b, v25.16b, v8.16b\n"
+ "zip2 v10.16b, v23.16b, v10.16b\n"
+ "zip2 v23.16b, v26.16b, v18.16b\n"
+ "zip1 v26.16b, v26.16b, v18.16b\n"
+ "zip2 v28.16b, v22.16b, v0.16b\n"
+ "zip1 v22.16b, v22.16b, v0.16b\n"
+ "zip1 v21.16b, v19.16b, v5.16b\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e9a9591 // udot v17.4s, v12.16b, v26.16b\n"
+ "zip2 v25.16b, v15.16b, v29.16b\n"
+ "zip1 v15.16b, v15.16b, v29.16b\n"
+ "zip1 v7.16b, v30.16b, v8.16b\n"
+ "zip2 v8.16b, v30.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v5.16b, v19.16b, v5.16b\n"
+ "zip2 v30.16b, v27.16b, v2.16b\n"
+ "zip1 v27.16b, v27.16b, v2.16b\n"
+ "zip1 v18.16b, v24.16b, v9.16b\n"
+ "zip2 v9.16b, v24.16b, v9.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v3.16b, v1.16b, v10.16b\n"
+ ".inst 0x6e969591 // udot v17.4s, v12.16b, v22.16b\n"
+ "zip2 v10.16b, v1.16b, v10.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v18.16b\n"
+ "zip1 v27.16b, v27.16b, v18.16b\n"
+ "zip1 v2.16b, v30.16b, v9.16b\n"
+ "mov v18.16b, v17.16b\n .inst 0x6e9b9592 // udot v18.4s, v12.16b, v27.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ ".inst 0x6e8f9591 // udot v17.4s, v12.16b, v15.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e8f969f // udot v31.4s, v20.16b, v15.16b\n"
+ ".inst 0x6e9a969d // udot v29.4s, v20.16b, v26.16b\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "movi v1.4s, #0x0\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9581 // udot v1.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96949f // udot v31.4s, v4.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f969e // udot v30.4s, v20.16b, v15.16b\n"
+ ".inst 0x6e9a969c // udot v28.4s, v20.16b, v26.16b\n"
+ "mls v31.4s, v17.4s, v16.4s\n"
+ ".inst 0x6e969581 // udot v1.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b949d // udot v29.4s, v4.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mov v20.16b, v1.16b\n .inst 0x6e9b9594 // udot v20.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9581 // udot v1.4s, v12.16b, v15.16b\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ ".inst 0x6e96949e // udot v30.4s, v4.16b, v22.16b\n"
+ ".inst 0x6e9b949c // udot v28.4s, v4.16b, v27.16b\n"
+ "mls v30.4s, v1.4s, v16.4s\n"
+ "add %x[params], %x[params], #0x60\n"
+ "mls v28.4s, v20.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v18.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v18.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v26.16b, v28.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v26.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 20f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 23f\n"
+ "20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 21f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 22f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 22f\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+ "23:" // Oddments: Unroll 0: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q27, [%x[params], #0x10]\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x6e979581 // udot v1.4s, v12.16b, v23.16b\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q4, [%x[params], #0x40]\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e99977f // udot v31.4s, v27.16b, v25.16b\n"
+ ".inst 0x6e939581 // udot v1.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e97977d // udot v29.4s, v27.16b, v23.16b\n"
+ "movi v20.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x6e97975f // udot v31.4s, v26.16b, v23.16b\n"
+ "mov v18.16b, v1.16b\n .inst 0x6e989592 // udot v18.4s, v12.16b, v24.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x6e999581 // udot v1.4s, v12.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x6e99977e // udot v30.4s, v27.16b, v25.16b\n"
+ ".inst 0x6e97977c // udot v28.4s, v27.16b, v23.16b\n"
+ ".inst 0x6e979594 // udot v20.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e93975d // udot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x6e9396df // udot v31.4s, v22.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97975e // udot v30.4s, v26.16b, v23.16b\n"
+ ".inst 0x6e93975c // udot v28.4s, v26.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x6e939594 // udot v20.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e9896dd // udot v29.4s, v22.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e9396de // udot v30.4s, v22.16b, v19.16b\n"
+ ".inst 0x6e9896dc // udot v28.4s, v22.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "mov v17.16b, v20.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e999594 // udot v20.4s, v12.16b, v25.16b\n"
+ "mls v30.4s, v20.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v28.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 24f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 27f\n"
+ "24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 25f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 26f\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+ "27:" // Oddments: Unroll 1: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x6e839598 // udot v24.4s, v12.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e87973f // udot v31.4s, v25.16b, v7.16b\n"
+ ".inst 0x6e809598 // udot v24.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e83973d // udot v29.4s, v25.16b, v3.16b\n"
+ "movi v19.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x6e8396ff // udot v31.4s, v23.16b, v3.16b\n"
+ "mov v18.16b, v24.16b\n .inst 0x6e829592 // udot v18.4s, v12.16b, v2.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x6e879598 // udot v24.4s, v12.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x6e87973e // udot v30.4s, v25.16b, v7.16b\n"
+ ".inst 0x6e83973c // udot v28.4s, v25.16b, v3.16b\n"
+ ".inst 0x6e839593 // udot v19.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e8096fd // udot v29.4s, v23.16b, v0.16b\n"
+ ".inst 0x6e8096df // udot v31.4s, v22.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e8396fe // udot v30.4s, v23.16b, v3.16b\n"
+ ".inst 0x6e8096fc // udot v28.4s, v23.16b, v0.16b\n"
+ "mls v31.4s, v24.4s, v16.4s\n"
+ ".inst 0x6e809593 // udot v19.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e8296dd // udot v29.4s, v22.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e8096de // udot v30.4s, v22.16b, v0.16b\n"
+ ".inst 0x6e8296dc // udot v28.4s, v22.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "mov v17.16b, v19.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879593 // udot v19.4s, v12.16b, v7.16b\n"
+ "mls v30.4s, v19.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v28.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 28f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 31f\n"
+ "28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 30f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 30f\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+ "31:" // Oddments: Unroll 2: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e8a9596 // udot v22.4s, v12.16b, v10.16b\n"
+ "ldr q21, [%x[params], #0x20]\n"
+ "ldr q19, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e8896ff // udot v31.4s, v23.16b, v8.16b\n"
+ ".inst 0x6e859596 // udot v22.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e8a96fd // udot v29.4s, v23.16b, v10.16b\n"
+ "movi v18.4s, #0x0\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x6e8a96bf // udot v31.4s, v21.16b, v10.16b\n"
+ "mov v17.16b, v22.16b\n .inst 0x6e899591 // udot v17.4s, v12.16b, v9.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x6e889596 // udot v22.4s, v12.16b, v8.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x6e8896fe // udot v30.4s, v23.16b, v8.16b\n"
+ ".inst 0x6e8a96fc // udot v28.4s, v23.16b, v10.16b\n"
+ ".inst 0x6e8a9592 // udot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8596bd // udot v29.4s, v21.16b, v5.16b\n"
+ ".inst 0x6e85967f // udot v31.4s, v19.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a96be // udot v30.4s, v21.16b, v10.16b\n"
+ ".inst 0x6e8596bc // udot v28.4s, v21.16b, v5.16b\n"
+ "mls v31.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e859592 // udot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e89967d // udot v29.4s, v19.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e85967e // udot v30.4s, v19.16b, v5.16b\n"
+ ".inst 0x6e89967c // udot v28.4s, v19.16b, v9.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v20.4s\n"
+ "mov v7.16b, v18.16b\n .inst 0x6e899587 // udot v7.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889592 // udot v18.4s, v12.16b, v8.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v28.4s, v7.4s, v16.4s\n"
+ "and v16.16b, v31.16b, v26.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v20.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v17.16b, v29.16b, v26.16b\n"
+ "and v16.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 34f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 34f\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+ "35:" // End
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..5ae0be1054
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const uint8_t *const *const,
+ const uint8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ uint8_t *const *const
+);
+
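+// Strategy wrapper for the generated 3x3 stride-1 u8 depthwise kernel that
+// computes a 2x2 output tile per call, requantizing with arm_gemm's
+// Requantize32 parameters.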
+class a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d5b55cb9c5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1166 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
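+// Unlike the udot-based kernel above, this variant widens inputs and weights
+// to 16 bits (usubl) and accumulates with smlal/smlal2 ("mla" in the kernel
+// name) before requantizing.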
+void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
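+ // Permute the caller's input-row pointers into the fixed order in which
+ // the generated assembly below consumes them.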
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v14.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 64f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
+ "st1 { v9.s }[0], [x11], #0x4\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "st1 { v9.h }[2], [x11], #0x2\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[6], [x11], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[4], [x11], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "st1 { v9.h }[0], [x11], #0x2\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[2], [x11], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[0], [x11], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+ "64:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..52280ebe70
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const uint8_t *const *const,
+ const uint8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ uint8_t *const *const
+);
+
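+// Strategy descriptor: binds the 3x3 kernel, stride-2, 2x2-output-tile
+// geometry to the assembly implementation declared above.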
+class a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
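+
+// Illustrative only: a caller holding a CPUInfo pointer could instantiate the
+// strategy and retrieve the kernel entry point directly, e.g.
+//   a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst strat(cpu_info);
+//   auto kernel_fn = strat.get_kernel();
+// In practice the depthwise framework performs this selection itself.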
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c4184622b0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
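+ // NHWC u8 depthwise 3x3, stride 2: the main loop below computes a 2x2 output
+ // tile over 8 channels per iteration; any remaining channels (n_channels % 8)
+ // are handled by the "Oddments" path in the assembly.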
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ uint64_t n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
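+ // The 2x2 output tile of a 3x3 stride-2 kernel covers a 5x5 input patch,
+ // hence 25 pointers. Reorder the raw pointers into the order the assembly
+ // consumes them; the centre element, which feeds all four outputs, comes
+ // first.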
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x12, x12, #0x20\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d25, [x27, x17]\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "tst x7, #0x7\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 88f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 66f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x7, #1, 70f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x7, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x7, #1, 78f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
+ "tbz x7, #2, 81f\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 82f\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 86f\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+ "88:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
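The requantization tail above (the sqrdmulh / and / sshr / sqadd / srshl / sqxtn2 run ending at label 87) is the standard fixed-point output stage: a saturating rounding doubling high multiply by the per-channel multiplier, a sign fixup so the following rounding right shift rounds ties away from zero, narrowing with saturation, addition of the output offset, and a clamp to [minval, maxval]. A scalar sketch of one accumulator lane follows; it is a reference model, not the shipped code, under the assumption (implied by the direct use of srshl) that requant_shifts stores non-positive shift amounts:

#include <algorithm>
#include <cstdint>

// Scalar model of one 32-bit lane of the vector requantization tail above.
static inline uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t neg_shift,
                                      int32_t c_offset, int32_t minval, int32_t maxval)
{
    // sqrdmulh: saturating rounding doubling multiply, keeping the high 32 bits.
    int32_t v;
    if (acc == INT32_MIN && mul == INT32_MIN)
    {
        v = INT32_MAX; // the one overflow case that sqrdmulh saturates
    }
    else
    {
        const int64_t prod = 2 * static_cast<int64_t>(acc) * static_cast<int64_t>(mul);
        v = static_cast<int32_t>((prod + (INT64_C(1) << 31)) >> 32);
    }

    // and/sshr/sqadd fixup: subtracts one from negative values (a no-op when
    // the shift is zero), turning srshl's round-half-up into
    // round-half-away-from-zero.
    const int32_t shift = -neg_shift; // srshl shifts right by negative amounts
    if (shift > 0)
    {
        if (v < 0)
        {
            v--;
        }
        v = (v + (1 << (shift - 1))) >> shift; // srshl: rounding arithmetic right shift
    }

    // The vector code narrows with sqxtn before adding c_offset; that
    // intermediate int16 saturation is subsumed by the final clamp here.
    v += c_offset;                             // sqadd of the c_offset splat
    v = std::min(std::max(v, minval), maxval); // smax/smin against minval/maxval
    return static_cast<uint8_t>(v);            // sqxtn/uzp1/st1 narrow to u8
}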
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..07f66fb482
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const uint8_t *const *const,
+ const uint8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ uint8_t *const *const
+);
+
+class a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
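This header pins the tile geometry: a 2x2 output tile for a 5x5 kernel at stride 1. That is also why the implementation file that follows keeps 36 input pointers: each tile reads a (2-1)*1+5 = 6 pixel edge, i.e. a 6x6 input patch. A one-liner making the arithmetic explicit (illustrative, not part of the patch):

// Input edge covered by an output tile: (out - 1) * stride + kernel.
constexpr unsigned int input_edge(unsigned int out, unsigned int kernel, unsigned int stride)
{
    return (out - 1) * stride + kernel;
}
static_assert(input_edge(2, 5, 1) * input_edge(2, 5, 1) == 36,
              "a 2x2 tile of a 5x5/s1 kernel reads a 6x6 patch");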
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..a3fa93df9c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2187 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+    inptrs[35] = inptrs_raw[35];
+    }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
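+  // Orientation notes for the assembly below (read off the code, not a spec):
+  // - x1 holds n_channels; x2 = n_channels >> 3 counts full 8-channel blocks.
+  // - "1:" (Loop) consumes 8 channels per iteration, accumulating a 2x2 output
+  //   tile in v7/v20/v24/v23 (low halves) and v15/v5/v22/v19 (high halves) and
+  //   storing 8 bytes to each of the four output pointers x17/x16/x15/x14.
+  // - "2:" (Tail) is the same computation for the final full block.
+  // - "3:" (Oddments) handles n_channels & 7: the tbz tests on bits 2/1/0 load
+  //   and store 4-, 2- and 1-byte fragments lane by lane.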
+ __asm__ __volatile__(
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x2, x1, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
+ "cbz x2, 3f\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "subs x2, x2, #0x1\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
+ "beq 124f\n"
+ "add x6, x6, #0xc8\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x1, #2, 5f\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
+ "tbz x1, #1, 4f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x1, #1, 6f\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 9f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 8f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x1, #1, 10f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "tbz x1, #2, 13f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 12f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x1, #1, 14f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
+ "tbz x1, #2, 17f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 16f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x1, #1, 18f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 21f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 20f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x1, #1, 22f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d14, [x6, #0x28]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
+ "tbz x1, #2, 25f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 24f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x1, #1, 26f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d21, [x6, #0x30]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 29f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 28f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x1, #1, 30f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d9, [x6, #0x38]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
+ "tbz x1, #2, 33f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 32f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x1, #1, 34f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d31, [x6, #0x40]\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
+ "tbz x1, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x1, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d16, [x6, #0x48]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "tbz x1, #2, 41f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 40f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x1, #1, 42f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d21, [x6, #0x50]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 45f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 44f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x1, #1, 46f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 49f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 48f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x1, #1, 50f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d2, [x6, #0x58]\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "tbz x1, #2, 53f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 52f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x1, #1, 54f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d25, [x6, #0x60]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
+ "tbz x1, #2, 57f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 56f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x1, #1, 58f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d1, [x6, #0x68]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
+ "tbz x1, #2, 61f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 60f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x1, #1, 62f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[0], [x20]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d16, [x6, #0x70]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x1, #2, 65f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 64f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x1, #1, 66f\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d17, [x6, #0x78]\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
+ "tbz x1, #2, 69f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 68f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x1, #1, 70f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x1, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d29, [x6, #0x80]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
+ "tbz x1, #2, 77f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 76f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x1, #1, 78f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d12, [x6, #0x88]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
+ "tbz x1, #2, 81f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 80f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x1, #1, 82f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d21, [x6, #0x90]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
+ "tbz x1, #2, 85f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 84f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x1, #1, 86f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d8, [x6, #0x98]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "tbz x1, #2, 89f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 88f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x1, #1, 90f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d9, [x6, #0xa0]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
+ "tbz x1, #2, 93f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 92f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x1, #1, 94f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 97f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 96f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x1, #1, 98f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d12, [x6, #0xa8]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
+ "tbz x1, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x1, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d28, [x6, #0xb0]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
+ "tbz x1, #2, 105f\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 104f\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x1, #1, 106f\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d30, [x6, #0xb8]\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
+ "tbz x1, #2, 109f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 108f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x1, #1, 110f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d8, [x6, #0xc0]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "tbz x1, #2, 113f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 112f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x1, #1, 114f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
+ "tbz x1, #2, 117f\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
+ "tbz x1, #1, 116f\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x1, #1, 118f\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x1, #2, 121f\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
+ "tbz x1, #1, 120f\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x1, #1, 122f\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+ "124:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..814efe006e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
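+// Generic fallback kernel: accumulates an arbitrary list of kernel points for
+// nine output pixels at a time, with no vector-length specialisation (VLType::None).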
+class a64_u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ KernelType kernel = a64_u8q_nhwc_generic_output9_mla_depthfirst_impl;
+
+ public:
+ a64_u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>(9, arm_gemm::VLType::None) {}
+
+ KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f7aa889b56
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ const arm_gemm::Requantize32& qp,
+ const unsigned int n_points,
+ const unsigned int n_channels
+)
+{
+ __asm__ __volatile__(
+ "lsr x9, %x[n_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v5.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "mov x11, #0x0\n"
+ "cbz x9, 6f\n"
+ "1:" // Channel loop
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q23, [%x[bias], x20]\n"
+ "2:" // Channel loop: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x25, %x[inptrs]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "subs x24, %x[n_points], #0x1\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s16, [x21, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s20, [x21, x11]\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x23, x22, [x25], #0x10\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldr s14, [x23, x11]\n"
+ "ldr s15, [x22, x11]\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s18, [x21, x11]\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "subs x24, x24, #0x1\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ldr s20, [x21, x11]\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 5f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q2, [%x[rq_mul_ptr], x20]\n"
+ "ldr q1, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 5f\n"
+ "ldr q3, [%x[rq_left_shift_ptr], x20]\n"
+ "5:" // Channel loop: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s23, [x28, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s24, [x27, x11]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s25, [x26, x11]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x11]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x11]\n"
+ "str s28, [x23, x11]\n"
+ "str s29, [x22, x11]\n"
+ "str s30, [x21, x11]\n"
+ "str s31, [x20, x11]\n"
+ "add x11, x11, #0x4\n"
+ "cmp x11, x9, LSL #2\n"
+ "blt 1b\n"
+ "6:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 24f\n"
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 9f\n"
+ "add x20, %x[bias], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
+ "b 8f\n"
+ "7:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "8:" // Oddments: Load bias: Bit 1: End
+ "9:" // Oddments: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "11:" // Oddments: Load: Bit 1: End
+ "subs x20, %x[n_points], #0x1\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "ble 15f\n"
+ "12:" // Oddments: Planar loop
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldr x21, [x10], #0x8\n"
+ "add x9, x9, x11\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "add x28, x28, x11\n"
+ "add x27, x27, x11\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 14f\n"
+ "13:" // Oddments: Planar loop: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "14:" // Oddments: Planar loop: Load: Bit 1: End
+ "subs x20, x20, #0x1\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 12b\n"
+ "15:" // Oddments: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 21f\n"
+ "add x22, %x[rq_mul_ptr], x11, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x11, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v2.d }[0], [x22], #0x8\n"
+ "ld1 { v1.d }[0], [x21], #0x8\n"
+ "cbz %x[rq_left_shift_ptr], 16f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v2.s }[2], [x22], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 17f\n"
+ "ld1 { v3.s }[2], [x20], #0x4\n"
+ "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+ "b 20f\n"
+ "18:" // Oddments: Load quantisation parameters: Bit 1: Unset
+ "ld1 { v2.s }[0], [x22], #0x4\n"
+ "ld1 { v1.s }[0], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 19f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+ "20:" // Oddments: Load quantisation parameters: Bit 1: End
+ "21:" // Oddments: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "b 23f\n"
+ "22:" // Oddments: Store: Bit 1: Unset
+ "st1 { v23.b }[0], [x28], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "23:" // Oddments: Store: Bit 1: End
+ "24:" // End
+ : [params] "+&r" (params)
+ : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
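The "Oddments" epilogue above stores the final 0-3 channels by testing individual bits of n_channels: bit 1 selects a two-byte store (st1 { v.h }[0]) and bit 0 a one-byte store, so at most two stores cover any remainder without writing past the end of the row. A minimal scalar sketch of that pattern, assuming the remaining bytes sit in the low lanes of the narrowed vector:

#include <cstdint>
#include <cstring>

// Store the last (n_channels & 3) bytes of a narrowed result, mirroring
// the tbz-driven stores above: a 2-byte store if bit 1 of n_channels is
// set, then a 1-byte store if bit 0 is set.
void store_oddments(uint8_t *out, const uint8_t lanes[4], uint64_t n_channels)
{
    unsigned int pos = 0;
    if (n_channels & 2)             // tbz %x[n_channels], #1
    {
        std::memcpy(out, lanes, 2); // st1 { v.h }[0]
        out += 2;
        pos = 2;
    }
    if (n_channels & 1)             // tbz %x[n_channels], #0
    {
        *out = lanes[pos];          // st1 { v.b }[pos]
    }
}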
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..76965606f7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
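The tile constants above fix how large an input patch each kernel invocation consumes; the relation is the usual one for strided convolution, input = (output - 1) * stride + kernel. A worked check for this strategy (by the same formula, the 5x5/s1 output4x2 strategy further below reads an 8x6 patch):

// Illustrative only: the standard strided-convolution patch arithmetic
// applied to the constants declared in the strategy above.
constexpr unsigned int output_rows = 2, output_cols = 4;
constexpr unsigned int kernel_rows = 3, kernel_cols = 3;
constexpr unsigned int stride_rows = 2, stride_cols = 2;

constexpr unsigned int input_rows = (output_rows - 1) * stride_rows + kernel_rows; // 5
constexpr unsigned int input_cols = (output_cols - 1) * stride_cols + kernel_cols; // 9

static_assert(input_rows == 5 && input_cols == 9, "a 2x4 tile reads a 5x9 patch");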
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d69f391514
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "ldr q11, [%x[params], #0x0]\n"
+ "ldr q5, [%x[params], #0x10]\n"
+ "movi v8.16b, #0x1\n"
+ "ushr v8.4s, v8.4s, #0x8\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "ldr q7, [%x[params], #0x30]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "mov v28.16b, v1.16b\n"
+ "mov v23.16b, v1.16b\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "mov v30.16b, v1.16b\n"
+ "mov v21.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "mov v20.16b, v2.16b\n"
+ "mov v29.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "mov v9.16b, v4.16b\n"
+ "mov v22.16b, v4.16b\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "mov v31.16b, v4.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x4\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x2\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x6\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x4\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "mov v27.16b, v0.16b\n"
+ "mov v19.16b, v0.16b\n"
+ "cmp %x[n_channels], #0x4\n"
+ "mov x9, #0x0\n"
+ "mov v18.16b, v0.16b\n"
+ "mov v26.16b, v3.16b\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "mov v17.16b, v3.16b\n"
+ "mov v16.16b, v3.16b\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x2\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x4\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "add %x[params], %x[params], #0x40\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x6\n"
+ "zip1 v1.4s, v1.4s, v23.4s\n"
+ "zip1 v28.4s, v28.4s, v30.4s\n"
+ "zip1 v2.4s, v2.4s, v20.4s\n"
+ "zip1 v21.4s, v21.4s, v29.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x2\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v4.4s, v4.4s, v22.4s\n"
+ "zip1 v9.4s, v9.4s, v31.4s\n"
+ "zip1 v0.4s, v0.4s, v19.4s\n"
+ "zip1 v27.4s, v27.4s, v18.4s\n"
+ "zip1 v1.4s, v1.4s, v28.4s\n"
+ "zip1 v2.4s, v2.4s, v21.4s\n"
+ ".inst 0x6f81e118 // udot v24.4s, v8.16b, v1.4b[0]\n"
+ "zip1 v3.4s, v3.4s, v17.4s\n"
+ "zip1 v26.4s, v26.4s, v16.4s\n"
+ ".inst 0x6fa1e119 // udot v25.4s, v8.16b, v1.4b[1]\n"
+ "zip1 v4.4s, v4.4s, v9.4s\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6fa1e916 // udot v22.4s, v8.16b, v1.4b[3]\n"
+ "movi v19.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x6f82e115 // udot v21.4s, v8.16b, v2.4b[0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x6fa2e113 // udot v19.4s, v8.16b, v2.4b[1]\n"
+ "movi v18.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6f82e909 // udot v9.4s, v8.16b, v2.4b[2]\n"
+ "movi v16.4s, #0x0\n"
+ "zip1 v0.4s, v0.4s, v27.4s\n"
+ ".inst 0x6fa2e90a // udot v10.4s, v8.16b, v2.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v26.4s\n"
+ ".inst 0x6f84e114 // udot v20.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e112 // udot v18.4s, v8.16b, v4.4b[1]\n"
+ ".inst 0x6f84e911 // udot v17.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e910 // udot v16.4s, v8.16b, v4.4b[3]\n"
+ "movi v31.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ ".inst 0x6f80e11f // udot v31.4s, v8.16b, v0.4b[0]\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x6fa0e11e // udot v30.4s, v8.16b, v0.4b[1]\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x6f80e91a // udot v26.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e91b // udot v27.4s, v8.16b, v0.4b[3]\n"
+ ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e11d // udot v29.4s, v8.16b, v3.4b[1]\n"
+ "add v24.4s, v24.4s, v21.4s\n"
+ "add v25.4s, v25.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "add v21.4s, v20.4s, v21.4s\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x6f83e914 // udot v20.4s, v8.16b, v3.4b[2]\n"
+ "add v19.4s, v18.4s, v19.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6fa3e912 // udot v18.4s, v8.16b, v3.4b[3]\n"
+ "add v17.4s, v17.4s, v9.4s\n"
+ "add v16.4s, v16.4s, v10.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v23.4s, v26.4s\n"
+ "add v27.4s, v22.4s, v27.4s\n"
+ "add v28.4s, v21.4s, v28.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v17.4s, v20.4s\n"
+ "add v31.4s, v16.4s, v18.4s\n"
+ "neg v12.4s, v12.4s\n"
+ "mul v24.4s, v24.4s, v12.4s\n"
+ "mul v25.4s, v25.4s, v12.4s\n"
+ "mul v26.4s, v26.4s, v12.4s\n"
+ "mul v27.4s, v27.4s, v12.4s\n"
+ "mul v28.4s, v28.4s, v12.4s\n"
+ "mul v29.4s, v29.4s, v12.4s\n"
+ "mul v30.4s, v30.4s, v12.4s\n"
+ "mul v31.4s, v31.4s, v12.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ "ldr q8, [%x[params], #0x0]\n"
+ "ldr q21, [%x[params], #0x10]\n"
+ ".inst 0x6f80e0b8 // udot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0b9 // udot v25.4s, v5.16b, v0.4b[1]\n"
+ "ldr q20, [%x[params], #0x20]\n"
+ ".inst 0x6f80e8ba // udot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8bb // udot v27.4s, v5.16b, v0.4b[3]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x6f81e0d8 // udot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6f81e8da // udot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8db // udot v27.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6f82e0bc // udot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0bd // udot v29.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x6f82e8be // udot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8bf // udot v31.4s, v5.16b, v2.4b[3]\n"
+ "ldr q5, [%x[params], #0x30]\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e0dd // udot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ ".inst 0x6f83e8de // udot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [%x[params], #0x40]\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ ".inst 0x6f84e0fc // udot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e0fd // udot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x6f84e8fe // udot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e8ff // udot v31.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [%x[params], #0x50]\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q21, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x6f80e0b8 // udot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0b9 // udot v25.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8ba // udot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8bb // udot v27.4s, v5.16b, v0.4b[3]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x27, x27, x28\n"
+ ".inst 0x6f81e0d8 // udot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x6f81e8da // udot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8db // udot v27.4s, v6.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x6f82e0bc // udot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0bd // udot v29.4s, v5.16b, v2.4b[1]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x6f82e8be // udot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8bf // udot v31.4s, v5.16b, v2.4b[3]\n"
+ "add x20, x20, x28\n"
+ "add %x[params], %x[params], #0x20\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e0dd // udot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x6f83e8de // udot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x6f84e0fc // udot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e0fd // udot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x6f84e8fe // udot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e8ff // udot v31.4s, v7.16b, v4.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "blt 3f\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "beq 4f\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
+ "4:" // Tail: End
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
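A recurring idiom in the listing above is UDOT against a vector of ones (built with movi v8.16b, #0x1) to sum groups of four input bytes; the sums are then multiplied by the negated b_offset (neg v12.4s, then mul) and folded into the bias, which is the -b_offset * sum(inputs) term of the zero-point correction for (x - a_offset) * (w - b_offset) products. A hedged intrinsics sketch of the trick, assuming a compiler targeting +dotprod (the real kernel uses the indexed .4b[i] forms on pre-shuffled inputs):

#include <arm_neon.h>
#include <cstdint>

// Sum each group of four input bytes with UDOT-by-ones, then scale the
// per-lane sums by -b_offset so they can be added straight into the bias.
int32x4_t neg_b_offset_correction(uint8x16_t input, int32_t b_offset)
{
    const uint8x16_t ones = vdupq_n_u8(1);
    uint32x4_t sums = vdotq_u32(vdupq_n_u32(0), input, ones);
    return vmulq_s32(vreinterpretq_s32_u32(sums), vdupq_n_s32(-b_offset));
}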
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..4485aaa735
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..61cec2b66d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,640 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "ldr q12, [%x[params], #0x0]\n"
+ "ldr q8, [%x[params], #0x10]\n"
+ "movi v30.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "ldr q10, [%x[params], #0x30]\n"
+ "movi v16.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr q11, [%x[params], #0x40]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "mov v26.16b, v3.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "mov v21.16b, v4.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "mov v27.16b, v2.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "zip1 v3.2d, v3.2d, v26.2d\n"
+ "zip1 v4.2d, v4.2d, v21.2d\n"
+ "ld1 { v5.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "mov v26.16b, v1.16b\n"
+ "mov v22.16b, v5.16b\n"
+ "ld1 { v6.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x38]\n"
+ "mov v19.16b, v6.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "ld1 { v7.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "mov v21.16b, v7.16b\n"
+ "zip1 v2.2d, v2.2d, v27.2d\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6f83e3d1 // udot v17.4s, v30.16b, v3.4b[0]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x6f83ebd0 // udot v16.4s, v30.16b, v3.4b[2]\n"
+ ".inst 0x6f84e3d9 // udot v25.4s, v30.16b, v4.4b[0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ ".inst 0x6f84ebd8 // udot v24.4s, v30.16b, v4.4b[2]\n"
+ "mov v18.16b, v0.16b\n"
+ ".inst 0x6f82e3df // udot v31.4s, v30.16b, v2.4b[0]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v28.4s, #0x1\n"
+ ".inst 0x6f82ebdd // udot v29.4s, v30.16b, v2.4b[2]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "zip1 v1.2d, v1.2d, v26.2d\n"
+ ".inst 0x6fa3e391 // udot v17.4s, v28.16b, v3.4b[1]\n"
+ "zip1 v5.2d, v5.2d, v22.2d\n"
+ "zip1 v6.2d, v6.2d, v19.2d\n"
+ ".inst 0x6fa3eb90 // udot v16.4s, v28.16b, v3.4b[3]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "zip1 v7.2d, v7.2d, v21.2d\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6fa4eb98 // udot v24.4s, v28.16b, v4.4b[3]\n"
+ ".inst 0x6f81e3d6 // udot v22.4s, v30.16b, v1.4b[0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6f85e3da // udot v26.4s, v30.16b, v5.4b[0]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "zip1 v0.2d, v0.2d, v18.2d\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6f85ebdb // udot v27.4s, v30.16b, v5.4b[2]\n"
+ "mov x9, #0x0\n"
+ ".inst 0x6f86e3d4 // udot v20.4s, v30.16b, v6.4b[0]\n"
+ ".inst 0x6f86ebd3 // udot v19.4s, v30.16b, v6.4b[2]\n"
+ "add v17.4s, v17.4s, v25.4s\n"
+ "mov x28, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x6f87e3d2 // udot v18.4s, v30.16b, v7.4b[0]\n"
+ ".inst 0x6f87ebd9 // udot v25.4s, v30.16b, v7.4b[2]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ ".inst 0x6fa2e39f // udot v31.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa2eb9d // udot v29.4s, v28.16b, v2.4b[3]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x6f80e3d8 // udot v24.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x6fa1e396 // udot v22.4s, v28.16b, v1.4b[1]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ ".inst 0x6fa1eb95 // udot v21.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa5e39a // udot v26.4s, v28.16b, v5.4b[1]\n"
+ "add v31.4s, v31.4s, v17.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ ".inst 0x6fa5eb9b // udot v27.4s, v28.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e394 // udot v20.4s, v28.16b, v6.4b[1]\n"
+ "add v29.4s, v29.4s, v16.4s\n"
+ "add %x[params], %x[params], #0x50\n"
+ ".inst 0x6fa6eb93 // udot v19.4s, v28.16b, v6.4b[3]\n"
+ ".inst 0x6fa7e392 // udot v18.4s, v28.16b, v7.4b[1]\n"
+ "add v22.4s, v22.4s, v31.4s\n"
+ ".inst 0x6fa7eb99 // udot v25.4s, v28.16b, v7.4b[3]\n"
+ ".inst 0x6fa0e398 // udot v24.4s, v28.16b, v0.4b[1]\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v20.4s, v26.4s, v20.4s\n"
+ "add v19.4s, v27.4s, v19.4s\n"
+ "add v18.4s, v18.4s, v17.4s\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x6fa0eb91 // udot v17.4s, v28.16b, v0.4b[3]\n"
+ "add v16.4s, v25.4s, v16.4s\n"
+ "add v24.4s, v22.4s, v24.4s\n"
+ "add v25.4s, v21.4s, v17.4s\n"
+ "add v26.4s, v26.4s, v22.4s\n"
+ "add v27.4s, v27.4s, v21.4s\n"
+ "add v28.4s, v20.4s, v31.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v20.4s, v18.4s\n"
+ "add v31.4s, v19.4s, v16.4s\n"
+ "neg v23.4s, v23.4s\n"
+ "mul v24.4s, v24.4s, v23.4s\n"
+ "mul v25.4s, v25.4s, v23.4s\n"
+ "mul v26.4s, v26.4s, v23.4s\n"
+ "mul v27.4s, v27.4s, v23.4s\n"
+ "mul v28.4s, v28.4s, v23.4s\n"
+ "mul v29.4s, v29.4s, v23.4s\n"
+ "mul v30.4s, v30.4s, v23.4s\n"
+ "mul v31.4s, v31.4s, v23.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v27.4s, v27.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ "ldr q12, [%x[params], #0x60]\n"
+ "ldr q21, [%x[params], #0x70]\n"
+ ".inst 0x6f80e118 // udot v24.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f80e919 // udot v25.4s, v8.16b, v0.4b[2]\n"
+ "ldr q20, [%x[params], #0x80]\n"
+ ".inst 0x6f81e11a // udot v26.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f81e91b // udot v27.4s, v8.16b, v1.4b[2]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x6fa0e138 // udot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e939 // udot v25.4s, v9.16b, v0.4b[3]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6fa1e13a // udot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e93b // udot v27.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6f82e11c // udot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f82e91d // udot v29.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e13c // udot v28.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
+ "ldr q16, [%x[params], #0x10]\n"
+ ".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e97b // udot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x6f84e23c // udot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea3d // udot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23e // udot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3f // udot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e21c // udot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea1d // udot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e21e // udot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1f // udot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
+ ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x6f85e27c // udot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea7d // udot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x6f86e27e // udot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea7f // udot v31.4s, v19.16b, v6.4b[2]\n"
+ "ldr q10, [%x[params], #0xb0]\n"
+ ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x6fa5e25c // udot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea5d // udot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e25e // udot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea5f // udot v31.4s, v18.16b, v6.4b[3]\n"
+ "ldr q11, [%x[params], #0xc0]\n"
+ ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v12.4s\n"
+ ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+ ".inst 0x6f86e23c // udot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea3d // udot v29.4s, v17.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v12.4s\n"
+ ".inst 0x6f87e23e // udot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x6f87ea3f // udot v31.4s, v17.16b, v7.4b[2]\n"
+ "ldr q8, [%x[params], #0x90]\n"
+ "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+ ".inst 0x6fa6e21c // udot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea1d // udot v29.4s, v16.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x6fa7e21e // udot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x6fa7ea1f // udot v31.4s, v16.16b, v7.4b[3]\n"
+ "ldr q9, [%x[params], #0xa0]\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0xd0\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v12.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v12.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v12.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q21, [%x[params], #0x60]\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ ".inst 0x6f80e118 // udot v24.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f80e919 // udot v25.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e11a // udot v26.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f81e91b // udot v27.4s, v8.16b, v1.4b[2]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x27, x27, x28\n"
+ ".inst 0x6fa0e138 // udot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e939 // udot v25.4s, v9.16b, v0.4b[3]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x6fa1e13a // udot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e93b // udot v27.4s, v9.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x6f82e11c // udot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f82e91d // udot v29.4s, v8.16b, v2.4b[2]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "add x20, x20, x28\n"
+ ".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e13c // udot v28.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
+ "ldr q16, [%x[params], #0x10]\n"
+ ".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e97b // udot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x6f84e23c // udot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea3d // udot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23e // udot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3f // udot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e21c // udot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea1d // udot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e21e // udot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1f // udot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x80\n"
+ ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x6f85e27c // udot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea7d // udot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x6f86e27e // udot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea7f // udot v31.4s, v19.16b, v6.4b[2]\n"
+ ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x6fa5e25c // udot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea5d // udot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e25e // udot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea5f // udot v31.4s, v18.16b, v6.4b[3]\n"
+ ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x6f86e23c // udot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea3d // udot v29.4s, v17.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x6f87e23e // udot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x6f87ea3f // udot v31.4s, v17.16b, v7.4b[2]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x6fa6e21c // udot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea1d // udot v29.4s, v16.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x6fa7e21e // udot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x6fa7ea1f // udot v31.4s, v16.16b, v7.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "blt 3f\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "beq 4f\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
+ "4:" // Tail: End
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
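The requantization sequence above (SQRDMULH by the multiplier, a sign fix-up via AND/SSHR/SQADD, SRSHL by the negative shift, add c_offset, then clamp) is the gemmlowp-style fixed-point rescale. A scalar sketch of what each vector lane computes, assuming standard rounding to nearest with ties away from zero:

#include <algorithm>
#include <cstdint>

// (a * b * 2) >> 32 with rounding: the scalar analogue of SQRDMULH.
int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
    const int64_t ab = (int64_t) a * (int64_t) b;
    const int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
    return (int32_t) ((ab + nudge) >> 31);
}

// Rounding divide by 2^exponent; the AND/SSHR/SQADD fix-up before SRSHL
// implements the same tie-breaking for negative values.
int32_t rounding_divide_by_pot(int32_t x, int32_t exponent)
{
    const int32_t mask = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// One accumulator lane through the tail of the kernel above.
uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t right_shift,
                        int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = rounding_divide_by_pot(rounding_doubling_high_mul(acc, mul), right_shift);
    v += c_offset;                                  // add the output offset
    return (uint8_t) std::clamp(v, minval, maxval); // the smin/smax clamp
}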
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1f2d211be2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const uint8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+struct a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)
+ {
+ }
+ Parent::KernelType kernel = a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
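The generic kernel declared above additionally takes per-channel multiplier and shift arrays; the implementation that follows tests them with CBZ and falls back to the per-layer values from Requantize32 when a pointer is null. A sketch of that selection as read from the listings (the helper itself is illustrative, not part of the library API):

#include <cstdint>

struct LaneQuantParams
{
    int32_t mul;
    int32_t left_shift;
    int32_t right_shift;
};

// Prefer per-channel requantization parameters when supplied, mirroring
// the "cbz %x[rq_mul_ptr]" / "cbz %x[rq_left_shift_ptr]" guards.
LaneQuantParams select_quant_params(unsigned int channel,
                                    const int32_t *per_channel_muls,
                                    const int32_t *per_channel_left_shifts,
                                    const int32_t *per_channel_right_shifts,
                                    int32_t per_layer_mul,
                                    int32_t per_layer_left_shift,
                                    int32_t per_layer_right_shift)
{
    LaneQuantParams p{per_layer_mul, per_layer_left_shift, per_layer_right_shift};
    if (per_channel_muls != nullptr)
    {
        p.mul         = per_channel_muls[channel];
        p.right_shift = per_channel_right_shifts[channel];
        if (per_channel_left_shifts != nullptr)
        {
            p.left_shift = per_channel_left_shifts[channel];
        }
    }
    return p;
}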
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..0770c126ec
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1480 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const uint8_t *weights,
+ const int32_t *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const int32_t *per_channel_left_shifts,
+ const int32_t *per_channel_muls,
+ const int32_t *per_channel_right_shifts,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "lsr x10, %x[n_output_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v9.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "mov x9, #0x0\n"
+ "cbz x10, 9f\n"
+ "1:" // Output channel loop
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 3f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q9, [%x[rq_mul_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 3f\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
+ "3:" // Output channel loop: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "beq 5f\n"
+ "4:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 4b\n"
+ "5:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 6f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "8:" // Output channel loop: Done
+ "add x9, x9, #0x4\n"
+ "cmp x9, x10, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 26f\n"
+ "9:" // Output channel oddments
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 12f\n"
+ "add x20, %x[bias], x9, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 10f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: Unset
+ "ld1 { v31.s }[0], [x20]\n"
+ "11:" // Output channel oddments: Load bias: Bit 1: End
+ "12:" // Output channel oddments: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 18f\n"
+ "add x22, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "cbz %x[rq_left_shift_ptr], 15f\n"
+ "tbz %x[n_output_channels], #1, 13f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 14f\n"
+ "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+ "b 18f\n"
+ "15:" // Output channel oddments: Load quantization parameters: No left shift
+ "tbz %x[n_output_channels], #1, 16f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "b 17f\n"
+ "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+ "18:" // Output channel oddments: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "beq 20f\n"
+ "19:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 19b\n"
+ "20:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 21f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "b 23f\n"
+ "21:" // Output channel oddments: Odd tail
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
+ "b 23f\n"
+ "22:" // Output channel oddments: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "23:" // Output channel oddments: Done
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_output_channels], #1, 24f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "add x9, x9, #0x2\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
+ "b 25f\n"
+ "24:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
+ "25:" // Output channel oddments: Done: Store: Bit 1: End
+ "26:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
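
Most of the listing above is the requantization tail: once the smlal accumulation finishes, each 32-bit lane is shifted left (sshl by v10), multiplied by the fixed-point multiplier (sqrdmulh by v9), rounded right-shifted (the and/sshr/sqadd fixup followed by srshl with v8 makes ties round away from zero), offset by c_offset (v11), clamped between minval (v15) and maxval (v14), and narrowed to a byte with uzp1. A scalar model of that per-lane math follows, written as an illustrative assumption rather than library code; the saturating edge case of sqrdmulh (a == b == INT32_MIN) is omitted for brevity.

#include <algorithm>
#include <cstdint>

// Models sqrdmulh: rounding doubling multiply returning the high 32 bits.
static int32_t sqrdmulh_model(int32_t a, int32_t b)
{
    return (int32_t) (((int64_t) a * b + (int64_t{1} << 30)) >> 31);
}

// Models the and/sshr/sqadd + srshl sequence: divide by 2^n rounding to
// nearest, with ties rounding away from zero.
static int32_t rounding_divide_by_pot(int32_t v, int n)
{
    if (n == 0) return v;
    const int64_t half = int64_t{1} << (n - 1);
    return (int32_t) (((int64_t) v + half - (v < 0 ? 1 : 0)) >> n);
}

static uint8_t requantize_lane(int32_t acc, int left_shift, int32_t mul,
                               int right_shift, int32_t c_offset,
                               int32_t minval, int32_t maxval)
{
    int32_t v = (int32_t) ((int64_t) acc << left_shift);  // sshl   vN, vN, v10
    v = sqrdmulh_model(v, mul);                           // sqrdmulh vN, vN, v9
    v = rounding_divide_by_pot(v, right_shift);           // and/sshr/sqadd, srshl vN, vN, v8
    v += c_offset;                                        // add    vN, vN, v11
    v = std::max(minval, std::min(maxval, v));            // smin v14 (maxval), smax v15 (minval)
    return (uint8_t) v;                                   // uzp1 narrowing, then str
}

The same parameters arrive either per layer (the ld1r loads at the top of the listing) or per channel (the reloads of v9/v8/v10 from rq_mul_ptr, rq_right_shift_ptr, and rq_left_shift_ptr at label 3).
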
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..db73c88187
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const uint8_t *const *const,
+ const uint8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ uint8_t *const *const
+);
+
+class a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
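
The tile geometry declared above fixes how much input the kernel in the next file must gather: a 2x2 output tile under a 3x3 window at stride 1 reads a 4x4 input patch, which is why the generic.cpp that follows packs exactly 16 input pointers into its Params struct. The arithmetic, as a self-checking sketch (constants taken from the header above):

constexpr unsigned output_rows = 2, output_cols = 2;
constexpr unsigned kernel_rows = 3, kernel_cols = 3;
constexpr unsigned stride_rows = 1, stride_cols = 1;

constexpr unsigned input_rows = (output_rows - 1) * stride_rows + kernel_rows; // 4
constexpr unsigned input_cols = (output_cols - 1) * stride_cols + kernel_cols; // 4

static_assert(input_rows * input_cols == 16,
              "one pointer per input row/column position: inptrs[16] in Params");
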
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d1872c90f8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1164 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
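+  // Kernel layout: "Loop" processes eight channels per iteration, "Tail"
+  // finishes the last full block of eight, and "Oddments" handles the
+  // remaining n_channels % 8 lanes with bit-tested narrow loads and stores.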
+ __asm__ __volatile__(
+ "ldr x16, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x15, x16, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v5.8h }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "mov x14, #0x0\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x13, #0x0\n"
+ "add x12, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x28, x27, [x22, #0x0]\n"
+ "ldp x26, x25, [x22, #0x10]\n"
+ "cbz x15, 3f\n"
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "subs x15, x15, #0x1\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d23, [x23, x14]\n"
+ "ldr d10, [x22, x14]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d11, [x21, x14]\n"
+ "ldr d13, [x20, x14]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr x20, [x12, #0x20]\n"
+ "ldr d27, [x20, x14]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr q20, [x10, #0x10]\n"
+ "ldr q26, [x9, #0x10]\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
+ "ldr x20, [x12, #0x38]\n"
+ "ldr d10, [x20, x14]\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr d15, [x20, x14]\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "ldr d23, [x20, x14]\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "ldr x20, [x12, #0x48]\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "ldr d11, [x20, x14]\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "ldr x21, [x12, #0x50]\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v0.4s, v21.4h, v29.4h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x24, [x12, #0x60]\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "smlal v28.4s, v10.4h, v7.4h\n"
+ "ldr x23, [x12, #0x68]\n"
+ "ldr x22, [x12, #0x70]\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "ldr d13, [x21, x14]\n"
+ "smlal2 v22.4s, v21.8h, v29.8h\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ldr x21, [x12, #0x78]\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal v6.4s, v15.4h, v4.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v9.4s, v10.8h, v7.8h\n"
+ "smlal v28.4s, v23.4h, v1.4h\n"
+ "add x11, x11, #0x48\n"
+ "subs x15, x15, #0x1\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "ldr d27, [x24, x14]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v2.4s, v15.8h, v4.8h\n"
+ "ldr d15, [x23, x14]\n"
+ "smlal v3.4s, v10.4h, v19.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v0.4s, v11.4h, v31.4h\n"
+ "smlal v6.4s, v11.4h, v8.4h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "smlal v28.4s, v11.4h, v4.4h\n"
+ "smlal2 v30.4s, v10.8h, v19.8h\n"
+ "ldr d10, [x22, x14]\n"
+ "smlal2 v22.4s, v11.8h, v31.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v2.4s, v11.8h, v8.8h\n"
+ "ldr d8, [x21, x14]\n"
+ "smlal v3.4s, v23.4h, v7.4h\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v13.4h, v19.4h\n"
+ "smlal v6.4s, v21.4h, v1.4h\n"
+ "add x14, x14, #0x8\n"
+ "smlal2 v9.4s, v11.8h, v4.8h\n"
+ "smlal v28.4s, v13.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v7.8h\n"
+ "smlal2 v22.4s, v13.8h, v19.8h\n"
+ "smlal2 v2.4s, v21.8h, v1.8h\n"
+ "smlal v3.4s, v11.4h, v16.4h\n"
+ "smlal v0.4s, v27.4h, v17.4h\n"
+ "smlal v6.4s, v15.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v17.8h\n"
+ "smlal v28.4s, v27.4h, v29.4h\n"
+ "sqrdmulh v28.4s, v28.4s, v24.4s\n"
+ "smlal2 v30.4s, v11.8h, v16.8h\n"
+ "smlal2 v22.4s, v27.8h, v17.8h\n"
+ "and v17.16b, v28.16b, v25.16b\n"
+ "smlal2 v2.4s, v15.8h, v31.8h\n"
+ "smlal v3.4s, v21.4h, v31.4h\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smlal v0.4s, v10.4h, v16.4h\n"
+ "smlal v6.4s, v10.4h, v29.4h\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "smlal2 v9.4s, v27.8h, v29.8h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v20.4s\n"
+ "smlal2 v22.4s, v10.8h, v16.8h\n"
+ "smlal2 v2.4s, v10.8h, v29.8h\n"
+ "and v23.16b, v9.16b, v26.16b\n"
+ "smlal v3.4s, v15.4h, v4.4h\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "sqrdmulh v3.4s, v3.4s, v24.4s\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v30.4s, v15.8h, v4.8h\n"
+ "sqrdmulh v0.4s, v0.4s, v24.4s\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "sqrdmulh v6.4s, v6.4s, v24.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v8.16b, v3.16b, v25.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "and v11.16b, v0.16b, v25.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+ "and v29.16b, v6.16b, v25.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v20.4s\n"
+ "sqadd v9.4s, v9.4s, v23.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v13.16b, v30.16b, v26.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v22.16b, v26.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v23.16b, v2.16b, v26.16b\n"
+ "sqadd v3.4s, v3.4s, v8.4s\n"
+ "sshr v13.4s, v13.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v11.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v29.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v25.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqadd v30.4s, v30.4s, v13.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v21.4s\n"
+ "srshl v6.4s, v6.4s, v25.4s\n"
+ "sqadd v2.4s, v2.4s, v23.4s\n"
+ "srshl v9.4s, v9.4s, v26.4s\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v26.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str d28, [x28, x13]\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "str d3, [x27, x13]\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d0, [x26, x13]\n"
+ "str d6, [x25, x13]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "add x13, x13, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "ldr d23, [x23, x14]\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d10, [x22, x14]\n"
+ "ldr d11, [x21, x14]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d13, [x20, x14]\n"
+ "ldr x20, [x12, #0x20]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d27, [x20, x14]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr q24, [x10, #0x10]\n"
+ "ldr q20, [x9, #0x10]\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
+ "ldr x20, [x12, #0x38]\n"
+ "ldr d15, [x20, x14]\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr d10, [x20, x14]\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "ldr d23, [x20, x14]\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "ldr x20, [x12, #0x48]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "ldr d11, [x20, x14]\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "ldr x24, [x12, #0x50]\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v0.4s, v21.4h, v29.4h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x23, [x12, #0x60]\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "smlal v28.4s, v15.4h, v7.4h\n"
+ "ldr x22, [x12, #0x68]\n"
+ "ldr x21, [x12, #0x70]\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "ldr d13, [x24, x14]\n"
+ "smlal2 v22.4s, v21.8h, v29.8h\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ldr x20, [x12, #0x78]\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal v6.4s, v10.4h, v4.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "tst x16, #0x7\n"
+ "smlal2 v9.4s, v15.8h, v7.8h\n"
+ "smlal v28.4s, v23.4h, v1.4h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "ldr d27, [x23, x14]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v2.4s, v10.8h, v4.8h\n"
+ "ldr d10, [x22, x14]\n"
+ "smlal v3.4s, v15.4h, v19.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v0.4s, v11.4h, v31.4h\n"
+ "smlal v6.4s, v11.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "smlal v28.4s, v11.4h, v4.4h\n"
+ "smlal2 v30.4s, v15.8h, v19.8h\n"
+ "ldr d15, [x21, x14]\n"
+ "smlal2 v22.4s, v11.8h, v31.8h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v2.4s, v11.8h, v8.8h\n"
+ "ldr d8, [x20, x14]\n"
+ "smlal v3.4s, v23.4h, v7.4h\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v13.4h, v19.4h\n"
+ "smlal v6.4s, v21.4h, v1.4h\n"
+ "add x14, x14, #0x8\n"
+ "smlal2 v9.4s, v11.8h, v4.8h\n"
+ "smlal v28.4s, v13.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v7.8h\n"
+ "smlal2 v22.4s, v13.8h, v19.8h\n"
+ "smlal2 v2.4s, v21.8h, v1.8h\n"
+ "smlal v3.4s, v11.4h, v16.4h\n"
+ "smlal v0.4s, v27.4h, v17.4h\n"
+ "smlal v6.4s, v10.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v17.8h\n"
+ "smlal v28.4s, v27.4h, v29.4h\n"
+ "sqrdmulh v28.4s, v28.4s, v26.4s\n"
+ "smlal2 v30.4s, v11.8h, v16.8h\n"
+ "smlal2 v22.4s, v27.8h, v17.8h\n"
+ "and v1.16b, v28.16b, v25.16b\n"
+ "smlal2 v2.4s, v10.8h, v31.8h\n"
+ "smlal v3.4s, v21.4h, v31.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v0.4s, v15.4h, v16.4h\n"
+ "smlal v6.4s, v15.4h, v29.4h\n"
+ "sqadd v28.4s, v28.4s, v1.4s\n"
+ "smlal2 v9.4s, v27.8h, v29.8h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v24.4s\n"
+ "smlal2 v22.4s, v15.8h, v16.8h\n"
+ "smlal2 v2.4s, v15.8h, v29.8h\n"
+ "and v27.16b, v9.16b, v20.16b\n"
+ "smlal v3.4s, v10.4h, v4.4h\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "sqrdmulh v3.4s, v3.4s, v26.4s\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v30.4s, v10.8h, v4.8h\n"
+ "sqrdmulh v0.4s, v0.4s, v26.4s\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "sqrdmulh v6.4s, v6.4s, v26.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v24.4s\n"
+ "and v4.16b, v0.16b, v25.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "and v17.16b, v6.16b, v25.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v24.4s\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v8.16b, v30.16b, v20.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v26.16b, v22.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v11.16b, v2.16b, v20.16b\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v4.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v17.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v25.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqadd v30.4s, v30.4s, v8.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v26.4s\n"
+ "srshl v6.4s, v6.4s, v25.4s\n"
+ "sqadd v2.4s, v2.4s, v11.4s\n"
+ "srshl v9.4s, v9.4s, v20.4s\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v20.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v20.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str d28, [x28, x13]\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "str d3, [x27, x13]\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d0, [x26, x13]\n"
+ "str d6, [x25, x13]\n"
+ "add x13, x13, #0x8\n"
+ "beq 64f\n"
+ "add x11, x11, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x16, #2, 5f\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x16, #1, 4f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x16, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x16, #1, 6f\n"
+ "ld1 { v28.d }[0], [x20], #0x8\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v28.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 7f\n"
+ "ld1 { v28.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldp x24, x23, [x12, #0x0]\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldp x22, x21, [x12, #0x10]\n"
+ "ldr x20, [x12, #0x20]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 9f\n"
+ "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v13.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 8f\n"
+ "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x22], #0x2\n"
+ "ld1 { v13.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v10.b }[6], [x23]\n"
+ "ld1 { v11.b }[6], [x22]\n"
+ "ld1 { v13.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v10.b }[4], [x23]\n"
+ "ld1 { v11.b }[4], [x22]\n"
+ "ld1 { v13.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x16, #1, 10f\n"
+ "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x22], #0x2\n"
+ "ld1 { v13.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v10.b }[2], [x23]\n"
+ "ld1 { v11.b }[2], [x22]\n"
+ "ld1 { v13.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v10.b }[0], [x23]\n"
+ "ld1 { v11.b }[0], [x22]\n"
+ "ld1 { v13.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "tbz x16, #2, 13f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 12f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x16, #1, 14f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[0], [x20]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v0.4s, v26.4h, v29.4h\n"
+ "smlal2 v22.4s, v26.8h, v29.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "tbz x16, #2, 17f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 16f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x16, #1, 18f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x12, #0x38]\n"
+ "smlal v6.4s, v23.4h, v4.4h\n"
+ "smlal2 v2.4s, v23.8h, v4.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 21f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 20f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x16, #1, 22f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "smlal v28.4s, v21.4h, v7.4h\n"
+ "smlal2 v9.4s, v21.8h, v7.8h\n"
+ "smlal v3.4s, v21.4h, v19.4h\n"
+ "smlal2 v30.4s, v21.8h, v19.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 25f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 24f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x16, #1, 26f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr x20, [x12, #0x48]\n"
+ "smlal v28.4s, v18.4h, v1.4h\n"
+ "smlal2 v9.4s, v18.8h, v1.8h\n"
+ "smlal v3.4s, v18.4h, v7.4h\n"
+ "smlal2 v30.4s, v18.8h, v7.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 29f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 28f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x16, #1, 30f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x12, #0x50]\n"
+ "smlal v28.4s, v15.4h, v4.4h\n"
+ "smlal2 v9.4s, v15.8h, v4.8h\n"
+ "smlal v3.4s, v15.4h, v16.4h\n"
+ "smlal2 v30.4s, v15.8h, v16.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v0.4s, v15.4h, v31.4h\n"
+ "smlal2 v22.4s, v15.8h, v31.8h\n"
+ "smlal v6.4s, v15.4h, v8.4h\n"
+ "smlal2 v2.4s, v15.8h, v8.8h\n"
+ "tbz x16, #2, 33f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 32f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x16, #1, 34f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v28.4s, v20.4h, v17.4h\n"
+ "smlal2 v9.4s, v20.8h, v17.8h\n"
+ "smlal v0.4s, v20.4h, v19.4h\n"
+ "smlal2 v22.4s, v20.8h, v19.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 37f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 36f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x16, #1, 38f\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[0], [x20]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v3.4s, v11.4h, v31.4h\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "smlal v6.4s, v11.4h, v1.4h\n"
+ "smlal2 v2.4s, v11.8h, v1.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 41f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 40f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x16, #1, 42f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x12, #0x68]\n"
+ "smlal v28.4s, v23.4h, v29.4h\n"
+ "smlal2 v9.4s, v23.8h, v29.8h\n"
+ "smlal v0.4s, v23.4h, v17.4h\n"
+ "smlal2 v22.4s, v23.8h, v17.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 45f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 44f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x16, #1, 46f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v3.4s, v20.4h, v4.4h\n"
+ "smlal2 v30.4s, v20.8h, v4.8h\n"
+ "smlal v6.4s, v20.4h, v31.4h\n"
+ "smlal2 v2.4s, v20.8h, v31.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 49f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 48f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x16, #1, 50f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "ldr x20, [x12, #0x78]\n"
+ "smlal v0.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "smlal v6.4s, v8.4h, v29.4h\n"
+ "smlal2 v2.4s, v8.8h, v29.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 53f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 52f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x16, #1, 54f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "tbz x16, #2, 57f\n"
+ "ld1 { v7.4s }, [x10], #0x10\n"
+ "ld1 { v23.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 56f\n"
+ "ld1 { v11.d }[0], [x10], #0x8\n"
+ "ld1 { v27.d }[0], [x9], #0x8\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v11.s }[2], [x10]\n"
+ "ld1 { v27.s }[2], [x9]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x16, #0, 59f\n"
+ "ld1 { v11.s }[0], [x10]\n"
+ "ld1 { v27.s }[0], [x9]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x16, #1, 58f\n"
+ "ld1 { v7.d }[0], [x10], #0x8\n"
+ "ld1 { v23.d }[0], [x9], #0x8\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v7.s }[2], [x10]\n"
+ "ld1 { v23.s }[2], [x9]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 59f\n"
+ "ld1 { v7.s }[0], [x10]\n"
+ "ld1 { v23.s }[0], [x9]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v28.4s, v28.4s, v7.4s\n"
+ "and v20.16b, v28.16b, v23.16b\n"
+ "add x28, x28, x13\n"
+ "add x27, x27, x13\n"
+ "sqrdmulh v9.4s, v9.4s, v11.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "add x26, x26, x13\n"
+ "add x25, x25, x13\n"
+ "and v4.16b, v9.16b, v27.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v7.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v7.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v7.4s\n"
+ "sqadd v28.4s, v28.4s, v20.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v19.16b, v3.16b, v23.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v11.4s\n"
+ "and v29.16b, v0.16b, v23.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v11.4s\n"
+ "and v26.16b, v6.16b, v23.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v11.4s\n"
+ "sqadd v9.4s, v9.4s, v4.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v27.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v8.16b, v22.16b, v27.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v13.16b, v2.16b, v27.16b\n"
+ "sqadd v3.4s, v3.4s, v19.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v29.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
+ "sshr v13.4s, v13.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "srshl v3.4s, v3.4s, v23.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v0.4s, v0.4s, v23.4s\n"
+ "sqadd v22.4s, v22.4s, v8.4s\n"
+ "srshl v6.4s, v6.4s, v23.4s\n"
+ "sqadd v2.4s, v2.4s, v13.4s\n"
+ "srshl v9.4s, v9.4s, v27.4s\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v27.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v27.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "tbz x16, #2, 61f\n"
+ "st1 { v28.s }[0], [x28], #0x4\n"
+ "st1 { v3.s }[0], [x27], #0x4\n"
+ "st1 { v0.s }[0], [x26], #0x4\n"
+ "st1 { v6.s }[0], [x25], #0x4\n"
+ "tbz x16, #1, 60f\n"
+ "st1 { v28.h }[2], [x28], #0x2\n"
+ "st1 { v3.h }[2], [x27], #0x2\n"
+ "st1 { v0.h }[2], [x26], #0x2\n"
+ "st1 { v6.h }[2], [x25], #0x2\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[6], [x28], #0x1\n"
+ "st1 { v3.b }[6], [x27], #0x1\n"
+ "st1 { v0.b }[6], [x26], #0x1\n"
+ "st1 { v6.b }[6], [x25], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[4], [x28], #0x1\n"
+ "st1 { v3.b }[4], [x27], #0x1\n"
+ "st1 { v0.b }[4], [x26], #0x1\n"
+ "st1 { v6.b }[4], [x25], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x16, #1, 62f\n"
+ "st1 { v28.h }[0], [x28], #0x2\n"
+ "st1 { v3.h }[0], [x27], #0x2\n"
+ "st1 { v0.h }[0], [x26], #0x2\n"
+ "st1 { v6.h }[0], [x25], #0x2\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[2], [x28], #0x1\n"
+ "st1 { v3.b }[2], [x27], #0x1\n"
+ "st1 { v0.b }[2], [x26], #0x1\n"
+ "st1 { v6.b }[2], [x25], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[0], [x28], #0x1\n"
+ "st1 { v3.b }[0], [x27], #0x1\n"
+ "st1 { v0.b }[0], [x26], #0x1\n"
+ "st1 { v6.b }[0], [x25], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+ "64:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
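
The requantization tail in the kernel above (sqrdmulh, the and/sshr/sqadd fixup, srshl, sqxtn, then the c_offset add and smax/smin clamp) is the usual fixed-point rescale from 32-bit accumulators back to uint8. A scalar model of that arithmetic, written purely for exposition, is sketched below: none of these helper names exist in the library, the real code applies the offset and clamp on the 16-bit packed form after sqxtn, and the per-channel shifts are assumed to be stored as negative values (right shifts), as the direct use of srshl suggests.

    #include <algorithm>
    #include <cstdint>

    // Scalar equivalent of AArch64 SQRDMULH: rounding-doubling high multiply.
    // Saturation is omitted; it only matters for a == b == INT32_MIN.
    static int32_t rdmulh(int32_t a, int32_t b)
    {
        const int64_t p = (int64_t)a * (int64_t)b;
        return (int32_t)((2 * p + (1LL << 31)) >> 32);
    }

    // Rounding right shift by a power of two, rounding halves away from zero.
    // The and/sshr/sqadd fixup nudges negative inputs down by one so that the
    // rounding SRSHL then matches this behaviour.
    static int32_t rshift_round(int32_t x, int s)
    {
        if (s <= 0) return x;
        const int32_t fixup = (x < 0) ? 1 : 0;     // the and/sshr/sqadd nudge
        return (x - fixup + (1 << (s - 1))) >> s;  // SRSHL's rounding shift
    }

    // One output element: accumulator -> multiply -> shift -> offset -> clamp.
    static uint8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                              int32_t c_offset, int32_t minval, int32_t maxval)
    {
        int32_t v = rdmulh(acc, mul);
        v = rshift_round(v, -shift);                // negate the stored shift
        v += c_offset;                              // sqadd of the c_offset splat
        v = std::min(std::max(v, minval), maxval);  // smax/smin clamp
        return (uint8_t)v;                          // sqxtn/uzp1 narrowing
    }
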
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..9b646bc4f6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const uint8_t *const *const,
+ const uint8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ uint8_t *const *const
+);
+
+class a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
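
Both generic.cpp kernels handle the channel remainder in their "Oddments" blocks by testing individual bits of the channel count with tbz: bit 2 selects a 4-byte ld1/st1 step, bit 1 a 2-byte step and bit 0 a single byte, so any remainder of 1 to 7 channels is covered without a scalar loop. An illustrative scalar equivalent (not library code):

    #include <cstdint>
    #include <cstring>

    // Move the n % 8 trailing bytes using at most one 4-, one 2- and one
    // 1-byte step, mirroring the tbz-selected narrow loads in "Oddments".
    static void copy_tail(uint8_t *dst, const uint8_t *src, unsigned int n)
    {
        unsigned int i = 0;
        if (n & 4) { std::memcpy(dst + i, src + i, 4); i += 4; } // tbz #2 path
        if (n & 2) { std::memcpy(dst + i, src + i, 2); i += 2; } // tbz #1 path
        if (n & 1) { dst[i] = src[i]; }                          // tbz #0 path
    }
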
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..6cb10a7bb2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1395 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+      uint64_t n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
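+      // Gather the 25 raw row pointers into the fixed order in which the
+      // assembly below walks params.inptrs.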
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
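+  // Kernel layout: "Loop" processes eight channels per iteration, "Tail"
+  // finishes the last full block of eight, and "Oddments" handles the
+  // remaining n_channels % 8 lanes with bit-tested narrow loads and stores.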
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v22.8h }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "mov x17, #0x0\n"
+ "ld1r { v5.8h }, [x20]\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "ldr d30, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q2, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d26, [x27, x17]\n"
+ "ldr d18, [x26, x17]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d10, [x25, x17]\n"
+ "ldr d27, [x24, x17]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr d17, [x23, x17]\n"
+ "ldr d19, [x22, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr d15, [x21, x17]\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q31, [x13, #0x0]\n"
+ "ldr q0, [x12, #0x0]\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr q29, [x13, #0x10]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "ldr x25, [x15, #0x60]\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal v20.4s, v18.4h, v24.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v16.4s, v26.4h, v23.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ldr d10, [x25, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x24, x17]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v24.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal2 v14.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal v20.4s, v10.4h, v12.4h\n"
+ "ldr x23, [x15, #0x50]\n"
+ "smlal v16.4s, v27.4h, v11.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ldr d19, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "ldr d17, [x20, x17]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal2 v1.4s, v10.8h, v12.8h\n"
+ "smlal2 v14.4s, v27.8h, v11.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal v20.4s, v18.4h, v23.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v16.4s, v26.4h, v7.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
+ "ldr d12, [x23, x17]\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v23.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "smlal2 v14.4s, v26.8h, v7.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v21.4s, v19.4h, v23.4h\n"
+ "smlal v20.4s, v17.4h, v11.4h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v16.4s, v15.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v12.4h, v7.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v4.4s, v19.8h, v23.8h\n"
+ "ldr d23, [x22, x17]\n"
+ "ldr d19, [x21, x17]\n"
+ "smlal2 v1.4s, v17.8h, v11.8h\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal2 v14.4s, v15.8h, v25.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal v21.4s, v18.4h, v7.4h\n"
+ "smlal v20.4s, v26.4h, v3.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal v16.4s, v28.4h, v24.4h\n"
+ "smlal2 v2.4s, v12.8h, v7.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal v8.4s, v10.4h, v3.4h\n"
+ "smlal2 v4.4s, v18.8h, v7.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v1.4s, v26.8h, v3.8h\n"
+ "smlal2 v14.4s, v28.8h, v24.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "add x14, x14, #0x48\n"
+ "smlal v21.4s, v12.4h, v24.4h\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
+ "add x17, x17, #0x8\n"
+ "subs x8, x8, #0x1\n"
+ "smlal v16.4s, v19.4h, v9.4h\n"
+ "smlal2 v2.4s, v10.8h, v3.8h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal v8.4s, v17.4h, v9.4h\n"
+ "smlal2 v4.4s, v12.8h, v24.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "smlal2 v14.4s, v19.8h, v9.8h\n"
+ "and v10.16b, v8.16b, v0.16b\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "smlal v16.4s, v11.4h, v3.4h\n"
+ "smlal2 v2.4s, v17.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v29.4s\n"
+ "smlal2 v4.4s, v27.8h, v9.8h\n"
+ "smlal2 v1.4s, v28.8h, v7.8h\n"
+ "and v12.16b, v2.16b, v25.16b\n"
+ "smlal2 v14.4s, v11.8h, v3.8h\n"
+ "smlal v21.4s, v15.4h, v30.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v31.4s\n"
+ "smlal v20.4s, v11.4h, v30.4h\n"
+ "smlal v16.4s, v18.4h, v30.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v31.4s\n"
+ "smlal2 v4.4s, v15.8h, v30.8h\n"
+ "smlal2 v1.4s, v11.8h, v30.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v31.4s\n"
+ "smlal2 v14.4s, v18.8h, v30.8h\n"
+ "sqadd v8.4s, v8.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v27.16b, v21.16b, v0.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v29.4s\n"
+ "and v24.16b, v20.16b, v0.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v19.16b, v16.16b, v0.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v29.4s\n"
+ "sqadd v2.4s, v2.4s, v12.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v4.16b, v25.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v17.16b, v1.16b, v25.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v15.16b, v14.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v24.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "sqadd v14.4s, v14.4s, v15.4s\n"
+ "srshl v2.4s, v2.4s, v25.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v25.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d8, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d16, [x28, x16]\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q2, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d30, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d26, [x27, x17]\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
+ "ldr d18, [x26, x17]\n"
+ "ldr d10, [x25, x17]\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "ldr d27, [x24, x17]\n"
+ "ldr d17, [x23, x17]\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr d19, [x22, x17]\n"
+ "ldr d15, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q0, [x13, #0x0]\n"
+ "ldr q31, [x12, #0x0]\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr q29, [x13, #0x10]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "ldr x25, [x15, #0x60]\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal v20.4s, v18.4h, v24.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v16.4s, v26.4h, v23.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ldr d10, [x25, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x24, x17]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v24.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal2 v14.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal v20.4s, v10.4h, v12.4h\n"
+ "ldr x23, [x15, #0x50]\n"
+ "smlal v16.4s, v27.4h, v11.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ldr d19, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "ldr d17, [x20, x17]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal2 v1.4s, v10.8h, v12.8h\n"
+ "smlal2 v14.4s, v27.8h, v11.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal v20.4s, v18.4h, v23.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v16.4s, v26.4h, v7.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
+ "ldr d12, [x23, x17]\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v23.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "smlal2 v14.4s, v26.8h, v7.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v21.4s, v19.4h, v23.4h\n"
+ "smlal v20.4s, v17.4h, v11.4h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v16.4s, v15.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v12.4h, v7.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v4.4s, v19.8h, v23.8h\n"
+ "ldr d23, [x22, x17]\n"
+ "ldr d19, [x21, x17]\n"
+ "smlal2 v1.4s, v17.8h, v11.8h\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal2 v14.4s, v15.8h, v25.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal v21.4s, v18.4h, v7.4h\n"
+ "smlal v20.4s, v26.4h, v3.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v16.4s, v28.4h, v24.4h\n"
+ "smlal2 v2.4s, v12.8h, v7.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "tst x7, #0x7\n"
+ "smlal v8.4s, v10.4h, v3.4h\n"
+ "smlal2 v4.4s, v18.8h, v7.8h\n"
+ "ldr d18, [x20, x17]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v1.4s, v26.8h, v3.8h\n"
+ "smlal2 v14.4s, v28.8h, v24.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "add x17, x17, #0x8\n"
+ "smlal v21.4s, v12.4h, v24.4h\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal v16.4s, v19.4h, v9.4h\n"
+ "smlal2 v2.4s, v10.8h, v3.8h\n"
+ "smlal v8.4s, v17.4h, v9.4h\n"
+ "smlal2 v4.4s, v12.8h, v24.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v0.4s\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "smlal2 v14.4s, v19.8h, v9.8h\n"
+ "and v23.16b, v8.16b, v31.16b\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "smlal v16.4s, v11.4h, v3.4h\n"
+ "smlal2 v2.4s, v17.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v29.4s\n"
+ "smlal2 v4.4s, v27.8h, v9.8h\n"
+ "smlal2 v1.4s, v28.8h, v7.8h\n"
+ "and v7.16b, v2.16b, v25.16b\n"
+ "smlal2 v14.4s, v11.8h, v3.8h\n"
+ "smlal v21.4s, v15.4h, v30.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v0.4s\n"
+ "smlal v20.4s, v11.4h, v30.4h\n"
+ "smlal v16.4s, v18.4h, v30.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v0.4s\n"
+ "smlal2 v4.4s, v15.8h, v30.8h\n"
+ "smlal2 v1.4s, v11.8h, v30.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v0.4s\n"
+ "smlal2 v14.4s, v18.8h, v30.8h\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v31.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v29.4s\n"
+ "and v24.16b, v20.16b, v31.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v19.16b, v16.16b, v31.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v29.4s\n"
+ "sqadd v2.4s, v2.4s, v7.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v4.16b, v25.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v17.16b, v1.16b, v25.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v15.16b, v14.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v24.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v31.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqadd v14.4s, v14.4s, v15.4s\n"
+ "srshl v2.4s, v2.4s, v25.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v25.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d8, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d16, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 88f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v8.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v2.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v2.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v2.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v8.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v8.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "ldr d30, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v26.s }[0], [x27], #0x4\n"
+ "ld1 { v18.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v27.s }[0], [x24], #0x4\n"
+ "ld1 { v17.s }[0], [x23], #0x4\n"
+ "ld1 { v19.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v26.h }[2], [x27], #0x2\n"
+ "ld1 { v18.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "ld1 { v27.h }[2], [x24], #0x2\n"
+ "ld1 { v17.h }[2], [x23], #0x2\n"
+ "ld1 { v19.h }[2], [x22], #0x2\n"
+ "ld1 { v15.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v26.b }[6], [x27]\n"
+ "ld1 { v18.b }[6], [x26]\n"
+ "ld1 { v10.b }[6], [x25]\n"
+ "ld1 { v27.b }[6], [x24]\n"
+ "ld1 { v17.b }[6], [x23]\n"
+ "ld1 { v19.b }[6], [x22]\n"
+ "ld1 { v15.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v26.b }[4], [x27]\n"
+ "ld1 { v18.b }[4], [x26]\n"
+ "ld1 { v10.b }[4], [x25]\n"
+ "ld1 { v27.b }[4], [x24]\n"
+ "ld1 { v17.b }[4], [x23]\n"
+ "ld1 { v19.b }[4], [x22]\n"
+ "ld1 { v15.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v26.h }[0], [x27], #0x2\n"
+ "ld1 { v18.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "ld1 { v27.h }[0], [x24], #0x2\n"
+ "ld1 { v17.h }[0], [x23], #0x2\n"
+ "ld1 { v19.h }[0], [x22], #0x2\n"
+ "ld1 { v15.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v26.b }[2], [x27]\n"
+ "ld1 { v18.b }[2], [x26]\n"
+ "ld1 { v10.b }[2], [x25]\n"
+ "ld1 { v27.b }[2], [x24]\n"
+ "ld1 { v17.b }[2], [x23]\n"
+ "ld1 { v19.b }[2], [x22]\n"
+ "ld1 { v15.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v26.b }[0], [x27]\n"
+ "ld1 { v18.b }[0], [x26]\n"
+ "ld1 { v10.b }[0], [x25]\n"
+ "ld1 { v27.b }[0], [x24]\n"
+ "ld1 { v17.b }[0], [x23]\n"
+ "ld1 { v19.b }[0], [x22]\n"
+ "ld1 { v15.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v31.4h, v23.4h\n"
+ "smlal2 v4.4s, v31.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v28.4h, v7.4h\n"
+ "smlal2 v4.4s, v28.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v8.4s, v27.4h, v7.4h\n"
+ "smlal2 v2.4s, v27.8h, v7.8h\n"
+ "smlal v21.4s, v27.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v24.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v0.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v0.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v0.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v0.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v0.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v0.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v0.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v20.4s, v0.4h, v24.4h\n"
+ "smlal2 v1.4s, v0.8h, v24.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v8.4s, v15.4h, v3.4h\n"
+ "smlal2 v2.4s, v15.8h, v3.8h\n"
+ "smlal v20.4s, v15.4h, v12.4h\n"
+ "smlal2 v1.4s, v15.8h, v12.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v0.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v0.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v0.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v0.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v0.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v0.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v0.b }[0], [x20]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v0.4h, v23.4h\n"
+ "smlal2 v1.4s, v0.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v8.4s, v6.4h, v9.4h\n"
+ "smlal2 v2.4s, v6.8h, v9.8h\n"
+ "smlal v20.4s, v6.4h, v11.4h\n"
+ "smlal2 v1.4s, v6.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v16.4s, v27.4h, v23.4h\n"
+ "smlal2 v14.4s, v27.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v10.4h, v9.4h\n"
+ "smlal2 v4.4s, v10.8h, v9.8h\n"
+ "smlal v16.4s, v10.4h, v11.4h\n"
+ "smlal2 v14.4s, v10.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v16.4s, v28.4h, v7.4h\n"
+ "smlal2 v14.4s, v28.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v15.4h, v3.4h\n"
+ "smlal2 v1.4s, v15.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v21.4s, v6.4h, v30.4h\n"
+ "smlal2 v4.4s, v6.8h, v30.8h\n"
+ "smlal v16.4s, v6.4h, v25.4h\n"
+ "smlal2 v14.4s, v6.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 66f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v12.4h, v7.4h\n"
+ "smlal2 v1.4s, v12.8h, v7.8h\n"
+ "smlal v16.4s, v12.4h, v24.4h\n"
+ "smlal2 v14.4s, v12.8h, v24.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x7, #1, 70f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v16.4s, v10.4h, v9.4h\n"
+ "smlal2 v14.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x7, #1, 74f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v20.4s, v15.4h, v30.4h\n"
+ "smlal2 v1.4s, v15.8h, v30.8h\n"
+ "smlal v16.4s, v15.4h, v3.4h\n"
+ "smlal2 v14.4s, v15.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x7, #1, 78f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v16.4s, v28.4h, v30.4h\n"
+ "smlal2 v14.4s, v28.8h, v30.8h\n"
+ "tbz x7, #2, 81f\n"
+ "ld1 { v19.4s }, [x13], #0x10\n"
+ "ld1 { v23.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v24.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v24.s }[2], [x12]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v24.s }[0], [x12]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 82f\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v23.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v23.s }[2], [x12]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v23.s }[0], [x12]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v8.4s, v8.4s, v19.4s\n"
+ "and v17.16b, v8.16b, v23.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v2.4s, v2.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v11.16b, v2.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v28.16b, v21.16b, v23.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v18.4s\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v18.4s\n"
+ "and v19.16b, v16.16b, v23.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sqadd v2.4s, v2.4s, v11.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v18.16b, v4.16b, v24.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v12.16b, v1.16b, v24.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v25.16b, v14.16b, v24.16b\n"
+ "sqadd v21.4s, v21.4s, v28.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v23.4s\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "sqadd v1.4s, v1.4s, v12.4s\n"
+ "srshl v16.4s, v16.4s, v23.4s\n"
+ "sqadd v14.4s, v14.4s, v25.4s\n"
+ "srshl v2.4s, v2.4s, v24.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v24.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v24.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v24.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v8.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v16.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v8.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v16.b }[6], [x28], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v8.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v16.b }[4], [x28], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 86f\n"
+ "st1 { v8.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v16.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v8.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v16.b }[2], [x28], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v8.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v16.b }[0], [x28], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+ "88:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..39601fd8e4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const uint8_t *const *const,
+ const uint8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ uint8_t *const *const
+);
+
+class a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
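+ // One call produces a 2x2 output tile from a 5x5 kernel at unit stride; the
+ // Parent(2, 2, 5, 5, 1, 1) arguments presumably mirror the constants above
+ // (output rows/cols, kernel rows/cols, stride rows/cols).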
+ a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..9316732632
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2185 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
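+ // Reorder the raw input-row pointers into the order in which the assembly
+ // below visits them; entries 13..35 are already in their natural order.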
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
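+ // Generated AArch64 kernel body: "lsr x3, x2, #0x3" counts the full 8-channel
+ // blocks handled by the main loop (label 1) and tail (label 2); label 3
+ // ("Oddments") handles the remaining n_channels % 8 lanes.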
+ __asm__ __volatile__(
+ "ldr x2, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x3, x2, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v2.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v25.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "mov x4, #0x0\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "mov x5, #0x0\n"
+ "add x6, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x16, x15, [x22, #0x0]\n"
+ "ldp x14, x13, [x22, #0x10]\n"
+ "cbz x3, 3f\n"
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
+ "subs x3, x3, #0x1\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ldr q13, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "ldp x27, x26, [x6, #0x10]\n"
+ "mov v7.16b, v13.16b\n"
+ "mov v14.16b, v24.16b\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "ldr d10, [x9, x4]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldr d16, [x28, x4]\n"
+ "ldr d23, [x27, x4]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr d30, [x26, x4]\n"
+ "ldr d4, [x25, x4]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr d28, [x24, x4]\n"
+ "ldr d31, [x23, x4]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr d1, [x22, x4]\n"
+ "ldr d9, [x21, x4]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "ldr d11, [x20, x4]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr d5, [x7, #0x28]\n"
+ "ldr d6, [x7, #0x30]\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "ldr d19, [x7, #0x38]\n"
+ "ldr d0, [x7, #0x40]\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "ldr d10, [x7, #0x48]\n"
+ "ldr d20, [x7, #0x50]\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "ldr x21, [x6, #0x50]\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "ldr d16, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "ldr d21, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ldr x22, [x6, #0x60]\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "ldr x21, [x6, #0x70]\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "ldr d4, [x22, x4]\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "ldr d15, [x20, x4]\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal v8.4s, v16.4h, v29.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "ldr x22, [x6, #0x80]\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "ldr d31, [x21, x4]\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v17.4s, v16.8h, v29.8h\n"
+ "ldr d29, [x20, x4]\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "smlal v27.4s, v16.4h, v18.4h\n"
+ "smlal v8.4s, v21.4h, v18.4h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v5.4h\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x21, [x6, #0x90]\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "ldr d1, [x22, x4]\n"
+ "smlal2 v22.4s, v16.8h, v18.8h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v17.4s, v21.8h, v18.8h\n"
+ "ldr d18, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v3.4h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "ldr x20, [x6, #0x98]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v24.4s, v23.8h, v5.8h\n"
+ "ldr d23, [x7, #0x58]\n"
+ "smlal v13.4s, v30.4h, v6.4h\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "smlal2 v14.4s, v4.8h, v3.8h\n"
+ "ldr d4, [x21, x4]\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "ldr x23, [x6, #0xa0]\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x4]\n"
+ "smlal v7.4s, v30.4h, v5.4h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v27.4s, v11.4h, v5.4h\n"
+ "smlal v8.4s, v15.4h, v5.4h\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "ldr x22, [x6, #0xa8]\n"
+ "smlal2 v24.4s, v30.8h, v6.8h\n"
+ "smlal v13.4s, v28.4h, v19.4h\n"
+ "ldr x21, [x6, #0xb0]\n"
+ "ldr x20, [x6, #0xb8]\n"
+ "smlal2 v14.4s, v30.8h, v5.8h\n"
+ "ldr d30, [x7, #0x60]\n"
+ "smlal2 v22.4s, v11.8h, v5.8h\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "smlal2 v17.4s, v15.8h, v5.8h\n"
+ "ldr d5, [x23, x4]\n"
+ "smlal v7.4s, v28.4h, v6.4h\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "smlal v27.4s, v15.4h, v6.4h\n"
+ "smlal v8.4s, v31.4h, v6.4h\n"
+ "ldr x12, [x6, #0xc0]\n"
+ "ldr x11, [x6, #0xc8]\n"
+ "smlal2 v24.4s, v28.8h, v19.8h\n"
+ "smlal v13.4s, v16.4h, v0.4h\n"
+ "ldr x10, [x6, #0xd0]\n"
+ "ldr x9, [x6, #0xd8]\n"
+ "smlal2 v14.4s, v28.8h, v6.8h\n"
+ "ldr d28, [x7, #0x68]\n"
+ "smlal2 v22.4s, v15.8h, v6.8h\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "ldr d6, [x22, x4]\n"
+ "smlal v7.4s, v16.4h, v19.4h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal v27.4s, v31.4h, v19.4h\n"
+ "smlal v8.4s, v29.4h, v19.4h\n"
+ "ldr x28, [x6, #0xe0]\n"
+ "ldr x27, [x6, #0xe8]\n"
+ "smlal2 v24.4s, v16.8h, v0.8h\n"
+ "smlal v13.4s, v21.4h, v10.4h\n"
+ "ldr x26, [x6, #0xf0]\n"
+ "ldr x25, [x6, #0xf8]\n"
+ "smlal2 v14.4s, v16.8h, v19.8h\n"
+ "ldr d16, [x7, #0x70]\n"
+ "smlal2 v22.4s, v31.8h, v19.8h\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "smlal2 v17.4s, v29.8h, v19.8h\n"
+ "ldr d19, [x21, x4]\n"
+ "smlal v7.4s, v21.4h, v0.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v27.4s, v29.4h, v0.4h\n"
+ "smlal v8.4s, v1.4h, v0.4h\n"
+ "ldr x24, [x6, #0x100]\n"
+ "ldr x23, [x6, #0x108]\n"
+ "smlal2 v24.4s, v21.8h, v10.8h\n"
+ "smlal v13.4s, v11.4h, v20.4h\n"
+ "ldr x22, [x6, #0x110]\n"
+ "ldr x21, [x6, #0x118]\n"
+ "smlal2 v14.4s, v21.8h, v0.8h\n"
+ "ldr d21, [x7, #0x78]\n"
+ "smlal2 v22.4s, v29.8h, v0.8h\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "smlal2 v17.4s, v1.8h, v0.8h\n"
+ "ldr d0, [x20, x4]\n"
+ "smlal v7.4s, v9.4h, v10.4h\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "smlal v27.4s, v1.4h, v10.4h\n"
+ "smlal v8.4s, v18.4h, v10.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "subs x3, x3, #0x1\n"
+ "smlal2 v24.4s, v11.8h, v20.8h\n"
+ "ldr d11, [x7, #0x80]\n"
+ "smlal v13.4s, v15.4h, v23.4h\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "smlal2 v14.4s, v9.8h, v10.8h\n"
+ "ldr d9, [x12, x4]\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal2 v17.4s, v18.8h, v10.8h\n"
+ "ldr d10, [x11, x4]\n"
+ "smlal v7.4s, v15.4h, v20.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v27.4s, v4.4h, v20.4h\n"
+ "smlal v8.4s, v3.4h, v20.4h\n"
+ "smlal2 v24.4s, v15.8h, v23.8h\n"
+ "smlal v13.4s, v31.4h, v30.4h\n"
+ "smlal2 v14.4s, v15.8h, v20.8h\n"
+ "ldr d15, [x7, #0x88]\n"
+ "smlal2 v22.4s, v4.8h, v20.8h\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "smlal2 v17.4s, v3.8h, v20.8h\n"
+ "ldr d20, [x10, x4]\n"
+ "smlal v7.4s, v31.4h, v23.4h\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "smlal v27.4s, v3.4h, v23.4h\n"
+ "smlal v8.4s, v5.4h, v23.4h\n"
+ "smlal2 v24.4s, v31.8h, v30.8h\n"
+ "smlal v13.4s, v29.4h, v28.4h\n"
+ "smlal2 v14.4s, v31.8h, v23.8h\n"
+ "ldr d31, [x7, #0x90]\n"
+ "smlal2 v22.4s, v3.8h, v23.8h\n"
+ "usubl v31.8h, v31.8b, v2.8b\n"
+ "smlal2 v17.4s, v5.8h, v23.8h\n"
+ "ldr d23, [x9, x4]\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v27.4s, v5.4h, v30.4h\n"
+ "smlal v8.4s, v6.4h, v30.4h\n"
+ "smlal2 v24.4s, v29.8h, v28.8h\n"
+ "smlal v13.4s, v1.4h, v16.4h\n"
+ "smlal2 v14.4s, v29.8h, v30.8h\n"
+ "ldr d29, [x7, #0x98]\n"
+ "smlal2 v22.4s, v5.8h, v30.8h\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "smlal2 v17.4s, v6.8h, v30.8h\n"
+ "ldr d30, [x28, x4]\n"
+ "smlal v7.4s, v1.4h, v28.4h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v27.4s, v6.4h, v28.4h\n"
+ "smlal v8.4s, v19.4h, v28.4h\n"
+ "smlal2 v24.4s, v1.8h, v16.8h\n"
+ "smlal v13.4s, v4.4h, v21.4h\n"
+ "smlal2 v14.4s, v1.8h, v28.8h\n"
+ "ldr d1, [x7, #0xa0]\n"
+ "smlal2 v22.4s, v6.8h, v28.8h\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "smlal2 v17.4s, v19.8h, v28.8h\n"
+ "ldr d28, [x27, x4]\n"
+ "smlal v7.4s, v18.4h, v16.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v27.4s, v19.4h, v16.4h\n"
+ "smlal v8.4s, v0.4h, v16.4h\n"
+ "smlal2 v24.4s, v4.8h, v21.8h\n"
+ "ldr d4, [x7, #0xa8]\n"
+ "smlal v13.4s, v3.4h, v11.4h\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "smlal2 v14.4s, v18.8h, v16.8h\n"
+ "ldr d18, [x26, x4]\n"
+ "smlal2 v22.4s, v19.8h, v16.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v17.4s, v0.8h, v16.8h\n"
+ "ldr d16, [x25, x4]\n"
+ "smlal v7.4s, v3.4h, v21.4h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v27.4s, v9.4h, v21.4h\n"
+ "smlal v8.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v3.8h, v11.8h\n"
+ "smlal v13.4s, v5.4h, v15.4h\n"
+ "smlal2 v14.4s, v3.8h, v21.8h\n"
+ "ldr d3, [x7, #0xb0]\n"
+ "smlal2 v22.4s, v9.8h, v21.8h\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "smlal2 v17.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x24, x4]\n"
+ "smlal v7.4s, v5.4h, v11.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v27.4s, v10.4h, v11.4h\n"
+ "smlal v8.4s, v20.4h, v11.4h\n"
+ "smlal2 v24.4s, v5.8h, v15.8h\n"
+ "smlal v13.4s, v6.4h, v31.4h\n"
+ "smlal2 v14.4s, v5.8h, v11.8h\n"
+ "ldr d5, [x7, #0xb8]\n"
+ "smlal2 v22.4s, v10.8h, v11.8h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "smlal2 v17.4s, v20.8h, v11.8h\n"
+ "ldr d11, [x23, x4]\n"
+ "smlal v7.4s, v6.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v27.4s, v20.4h, v15.4h\n"
+ "smlal v8.4s, v23.4h, v15.4h\n"
+ "smlal2 v24.4s, v6.8h, v31.8h\n"
+ "smlal v13.4s, v19.4h, v29.4h\n"
+ "smlal2 v14.4s, v6.8h, v15.8h\n"
+ "ldr d6, [x7, #0xc0]\n"
+ "smlal2 v22.4s, v20.8h, v15.8h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "smlal2 v17.4s, v23.8h, v15.8h\n"
+ "ldr d15, [x22, x4]\n"
+ "smlal v7.4s, v19.4h, v31.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v27.4s, v23.4h, v31.4h\n"
+ "smlal v8.4s, v30.4h, v31.4h\n"
+ "add x7, x7, #0xc8\n"
+ "smlal2 v24.4s, v19.8h, v29.8h\n"
+ "smlal v13.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v19.8h, v31.8h\n"
+ "ldr d19, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v31.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v31.8h\n"
+ "ldr q31, [x8, #0x0]\n"
+ "smlal v7.4s, v0.4h, v29.4h\n"
+ "add x4, x4, #0x8\n"
+ "smlal v27.4s, v30.4h, v29.4h\n"
+ "smlal v8.4s, v28.4h, v29.4h\n"
+ "smlal2 v24.4s, v9.8h, v1.8h\n"
+ "ldr q9, [x17, #0x0]\n"
+ "smlal v13.4s, v10.4h, v4.4h\n"
+ "smlal2 v14.4s, v0.8h, v29.8h\n"
+ "ldr q0, [x8, #0x10]\n"
+ "smlal2 v22.4s, v30.8h, v29.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v17.4s, v28.8h, v29.8h\n"
+ "ldr q29, [x17, #0x10]\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "add x17, x17, #0x20\n"
+ "smlal v27.4s, v18.4h, v1.4h\n"
+ "smlal v8.4s, v16.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v4.8h\n"
+ "smlal v13.4s, v20.4h, v3.4h\n"
+ "smlal2 v14.4s, v10.8h, v1.8h\n"
+ "smlal2 v22.4s, v18.8h, v1.8h\n"
+ "smlal2 v17.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v20.4h, v4.4h\n"
+ "smlal v27.4s, v16.4h, v4.4h\n"
+ "smlal v8.4s, v21.4h, v4.4h\n"
+ "smlal2 v24.4s, v20.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v5.4h\n"
+ "smlal2 v14.4s, v20.8h, v4.8h\n"
+ "smlal2 v22.4s, v16.8h, v4.8h\n"
+ "smlal2 v17.4s, v21.8h, v4.8h\n"
+ "smlal v7.4s, v23.4h, v3.4h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v11.4h, v3.4h\n"
+ "smlal2 v24.4s, v23.8h, v5.8h\n"
+ "smlal v13.4s, v30.4h, v6.4h\n"
+ "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+ "smlal2 v14.4s, v23.8h, v3.8h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "and v23.16b, v13.16b, v9.16b\n"
+ "smlal2 v17.4s, v11.8h, v3.8h\n"
+ "smlal v7.4s, v30.4h, v5.4h\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "smlal v27.4s, v11.4h, v5.4h\n"
+ "smlal v8.4s, v15.4h, v5.4h\n"
+ "sqadd v13.4s, v13.4s, v23.4s\n"
+ "smlal2 v24.4s, v30.8h, v6.8h\n"
+ "smlal2 v14.4s, v30.8h, v5.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v0.4s\n"
+ "smlal2 v22.4s, v11.8h, v5.8h\n"
+ "smlal2 v17.4s, v15.8h, v5.8h\n"
+ "and v10.16b, v24.16b, v29.16b\n"
+ "smlal v7.4s, v28.4h, v6.4h\n"
+ "smlal v27.4s, v15.4h, v6.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v31.4s\n"
+ "smlal v8.4s, v19.4h, v6.4h\n"
+ "smlal2 v14.4s, v28.8h, v6.8h\n"
+ "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+ "smlal2 v22.4s, v15.8h, v6.8h\n"
+ "smlal2 v17.4s, v19.8h, v6.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v28.16b, v7.16b, v9.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v0.4s\n"
+ "and v20.16b, v27.16b, v9.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v0.4s\n"
+ "and v23.16b, v8.16b, v9.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v0.4s\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v29.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v22.16b, v29.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v29.16b\n"
+ "sqadd v7.4s, v7.4s, v28.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v9.4s\n"
+ "srshl v7.4s, v7.4s, v9.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v30.4s\n"
+ "srshl v8.4s, v8.4s, v9.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "srshl v24.4s, v24.4s, v29.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v29.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v29.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v29.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x16, x5]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str d7, [x15, x5]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d27, [x14, x5]\n"
+ "str d8, [x13, x5]\n"
+ "ldr q13, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
+ "add x5, x5, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
+ "mov v7.16b, v13.16b\n"
+ "mov v14.16b, v24.16b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldp x27, x26, [x6, #0x10]\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "ldr d10, [x9, x4]\n"
+ "ldr d16, [x28, x4]\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "ldr d23, [x27, x4]\n"
+ "ldr d30, [x26, x4]\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr d4, [x25, x4]\n"
+ "ldr d28, [x24, x4]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr d31, [x23, x4]\n"
+ "ldr d1, [x22, x4]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "ldr d9, [x21, x4]\n"
+ "ldr d11, [x20, x4]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr d0, [x7, #0x28]\n"
+ "ldr d20, [x7, #0x30]\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "ldr d6, [x7, #0x38]\n"
+ "ldr d19, [x7, #0x40]\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "ldr d10, [x7, #0x48]\n"
+ "ldr d5, [x7, #0x50]\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "ldr x21, [x6, #0x50]\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "ldr d16, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "ldr d21, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ldr x22, [x6, #0x60]\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "ldr x21, [x6, #0x70]\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "ldr d4, [x22, x4]\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "ldr d15, [x20, x4]\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal v8.4s, v16.4h, v29.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "ldr x22, [x6, #0x80]\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "ldr d31, [x21, x4]\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v17.4s, v16.8h, v29.8h\n"
+ "ldr d29, [x20, x4]\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "smlal v27.4s, v16.4h, v18.4h\n"
+ "smlal v8.4s, v21.4h, v18.4h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v0.4h\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x21, [x6, #0x90]\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "ldr d1, [x22, x4]\n"
+ "smlal2 v22.4s, v16.8h, v18.8h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v17.4s, v21.8h, v18.8h\n"
+ "ldr d18, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v3.4h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "ldr x20, [x6, #0x98]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v24.4s, v23.8h, v0.8h\n"
+ "ldr d23, [x7, #0x58]\n"
+ "smlal v13.4s, v30.4h, v20.4h\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "smlal2 v14.4s, v4.8h, v3.8h\n"
+ "ldr d4, [x21, x4]\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "ldr x22, [x6, #0xa0]\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x4]\n"
+ "smlal v7.4s, v30.4h, v0.4h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v27.4s, v11.4h, v0.4h\n"
+ "smlal v8.4s, v15.4h, v0.4h\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "ldr x21, [x6, #0xa8]\n"
+ "smlal2 v24.4s, v30.8h, v20.8h\n"
+ "smlal v13.4s, v28.4h, v6.4h\n"
+ "ldr x20, [x6, #0xb0]\n"
+ "ldr x12, [x6, #0xb8]\n"
+ "smlal2 v14.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x7, #0x60]\n"
+ "smlal2 v22.4s, v11.8h, v0.8h\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "smlal2 v17.4s, v15.8h, v0.8h\n"
+ "ldr d0, [x22, x4]\n"
+ "smlal v7.4s, v28.4h, v20.4h\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "smlal v27.4s, v15.4h, v20.4h\n"
+ "smlal v8.4s, v31.4h, v20.4h\n"
+ "ldr x11, [x6, #0xc0]\n"
+ "ldr x10, [x6, #0xc8]\n"
+ "smlal2 v24.4s, v28.8h, v6.8h\n"
+ "smlal v13.4s, v16.4h, v19.4h\n"
+ "ldr x9, [x6, #0xd0]\n"
+ "ldr x28, [x6, #0xd8]\n"
+ "smlal2 v14.4s, v28.8h, v20.8h\n"
+ "ldr d28, [x7, #0x68]\n"
+ "smlal2 v22.4s, v15.8h, v20.8h\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "smlal2 v17.4s, v31.8h, v20.8h\n"
+ "ldr d20, [x21, x4]\n"
+ "smlal v7.4s, v16.4h, v6.4h\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "smlal v27.4s, v31.4h, v6.4h\n"
+ "smlal v8.4s, v29.4h, v6.4h\n"
+ "ldr x27, [x6, #0xe0]\n"
+ "ldr x26, [x6, #0xe8]\n"
+ "smlal2 v24.4s, v16.8h, v19.8h\n"
+ "smlal v13.4s, v21.4h, v10.4h\n"
+ "ldr x25, [x6, #0xf0]\n"
+ "ldr x24, [x6, #0xf8]\n"
+ "smlal2 v14.4s, v16.8h, v6.8h\n"
+ "ldr d16, [x7, #0x70]\n"
+ "smlal2 v22.4s, v31.8h, v6.8h\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "smlal2 v17.4s, v29.8h, v6.8h\n"
+ "ldr d6, [x20, x4]\n"
+ "smlal v7.4s, v21.4h, v19.4h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal v27.4s, v29.4h, v19.4h\n"
+ "smlal v8.4s, v1.4h, v19.4h\n"
+ "ldr x23, [x6, #0x100]\n"
+ "ldr x22, [x6, #0x108]\n"
+ "smlal2 v24.4s, v21.8h, v10.8h\n"
+ "smlal v13.4s, v11.4h, v5.4h\n"
+ "ldr x21, [x6, #0x110]\n"
+ "ldr x20, [x6, #0x118]\n"
+ "smlal2 v14.4s, v21.8h, v19.8h\n"
+ "ldr d21, [x7, #0x78]\n"
+ "smlal2 v22.4s, v29.8h, v19.8h\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "smlal2 v17.4s, v1.8h, v19.8h\n"
+ "ldr d19, [x12, x4]\n"
+ "smlal v7.4s, v9.4h, v10.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v27.4s, v1.4h, v10.4h\n"
+ "smlal v8.4s, v18.4h, v10.4h\n"
+ "tst x2, #0x7\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "ldr d11, [x7, #0x80]\n"
+ "smlal v13.4s, v15.4h, v23.4h\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "smlal2 v14.4s, v9.8h, v10.8h\n"
+ "ldr d9, [x11, x4]\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal2 v17.4s, v18.8h, v10.8h\n"
+ "ldr d10, [x10, x4]\n"
+ "smlal v7.4s, v15.4h, v5.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v27.4s, v4.4h, v5.4h\n"
+ "smlal v8.4s, v3.4h, v5.4h\n"
+ "smlal2 v24.4s, v15.8h, v23.8h\n"
+ "smlal v13.4s, v31.4h, v30.4h\n"
+ "smlal2 v14.4s, v15.8h, v5.8h\n"
+ "ldr d15, [x7, #0x88]\n"
+ "smlal2 v22.4s, v4.8h, v5.8h\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "smlal2 v17.4s, v3.8h, v5.8h\n"
+ "ldr d5, [x9, x4]\n"
+ "smlal v7.4s, v31.4h, v23.4h\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "smlal v27.4s, v3.4h, v23.4h\n"
+ "smlal v8.4s, v0.4h, v23.4h\n"
+ "smlal2 v24.4s, v31.8h, v30.8h\n"
+ "smlal v13.4s, v29.4h, v28.4h\n"
+ "smlal2 v14.4s, v31.8h, v23.8h\n"
+ "ldr d31, [x7, #0x90]\n"
+ "smlal2 v22.4s, v3.8h, v23.8h\n"
+ "usubl v31.8h, v31.8b, v2.8b\n"
+ "smlal2 v17.4s, v0.8h, v23.8h\n"
+ "ldr d23, [x28, x4]\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v27.4s, v0.4h, v30.4h\n"
+ "smlal v8.4s, v20.4h, v30.4h\n"
+ "smlal2 v24.4s, v29.8h, v28.8h\n"
+ "smlal v13.4s, v1.4h, v16.4h\n"
+ "smlal2 v14.4s, v29.8h, v30.8h\n"
+ "ldr d29, [x7, #0x98]\n"
+ "smlal2 v22.4s, v0.8h, v30.8h\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "smlal2 v17.4s, v20.8h, v30.8h\n"
+ "ldr d30, [x27, x4]\n"
+ "smlal v7.4s, v1.4h, v28.4h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v27.4s, v20.4h, v28.4h\n"
+ "smlal v8.4s, v6.4h, v28.4h\n"
+ "smlal2 v24.4s, v1.8h, v16.8h\n"
+ "smlal v13.4s, v4.4h, v21.4h\n"
+ "smlal2 v14.4s, v1.8h, v28.8h\n"
+ "ldr d1, [x7, #0xa0]\n"
+ "smlal2 v22.4s, v20.8h, v28.8h\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "smlal2 v17.4s, v6.8h, v28.8h\n"
+ "ldr d28, [x26, x4]\n"
+ "smlal v7.4s, v18.4h, v16.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v27.4s, v6.4h, v16.4h\n"
+ "smlal v8.4s, v19.4h, v16.4h\n"
+ "smlal2 v24.4s, v4.8h, v21.8h\n"
+ "ldr d4, [x7, #0xa8]\n"
+ "smlal v13.4s, v3.4h, v11.4h\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "smlal2 v14.4s, v18.8h, v16.8h\n"
+ "ldr d18, [x25, x4]\n"
+ "smlal2 v22.4s, v6.8h, v16.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v17.4s, v19.8h, v16.8h\n"
+ "ldr d16, [x24, x4]\n"
+ "smlal v7.4s, v3.4h, v21.4h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v27.4s, v9.4h, v21.4h\n"
+ "smlal v8.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v3.8h, v11.8h\n"
+ "smlal v13.4s, v0.4h, v15.4h\n"
+ "smlal2 v14.4s, v3.8h, v21.8h\n"
+ "ldr d3, [x7, #0xb0]\n"
+ "smlal2 v22.4s, v9.8h, v21.8h\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "smlal2 v17.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x23, x4]\n"
+ "smlal v7.4s, v0.4h, v11.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v27.4s, v10.4h, v11.4h\n"
+ "smlal v8.4s, v5.4h, v11.4h\n"
+ "smlal2 v24.4s, v0.8h, v15.8h\n"
+ "smlal v13.4s, v20.4h, v31.4h\n"
+ "smlal2 v14.4s, v0.8h, v11.8h\n"
+ "ldr d0, [x7, #0xb8]\n"
+ "smlal2 v22.4s, v10.8h, v11.8h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "smlal2 v17.4s, v5.8h, v11.8h\n"
+ "ldr d11, [x22, x4]\n"
+ "smlal v7.4s, v20.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v27.4s, v5.4h, v15.4h\n"
+ "smlal v8.4s, v23.4h, v15.4h\n"
+ "smlal2 v24.4s, v20.8h, v31.8h\n"
+ "smlal v13.4s, v6.4h, v29.4h\n"
+ "smlal2 v14.4s, v20.8h, v15.8h\n"
+ "ldr d20, [x7, #0xc0]\n"
+ "smlal2 v22.4s, v5.8h, v15.8h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal2 v17.4s, v23.8h, v15.8h\n"
+ "ldr d15, [x21, x4]\n"
+ "smlal v7.4s, v6.4h, v31.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v27.4s, v23.4h, v31.4h\n"
+ "smlal v8.4s, v30.4h, v31.4h\n"
+ "smlal2 v24.4s, v6.8h, v29.8h\n"
+ "smlal v13.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x4]\n"
+ "smlal2 v22.4s, v23.8h, v31.8h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v31.8h\n"
+ "ldr q31, [x8, #0x0]\n"
+ "smlal v7.4s, v19.4h, v29.4h\n"
+ "add x4, x4, #0x8\n"
+ "smlal v27.4s, v30.4h, v29.4h\n"
+ "smlal v8.4s, v28.4h, v29.4h\n"
+ "smlal2 v24.4s, v9.8h, v1.8h\n"
+ "ldr q9, [x17, #0x0]\n"
+ "smlal v13.4s, v10.4h, v4.4h\n"
+ "smlal2 v14.4s, v19.8h, v29.8h\n"
+ "ldr q19, [x8, #0x10]\n"
+ "smlal2 v22.4s, v30.8h, v29.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v17.4s, v28.8h, v29.8h\n"
+ "ldr q29, [x17, #0x10]\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "add x17, x17, #0x20\n"
+ "smlal v27.4s, v18.4h, v1.4h\n"
+ "smlal v8.4s, v16.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v4.8h\n"
+ "smlal v13.4s, v5.4h, v3.4h\n"
+ "smlal2 v14.4s, v10.8h, v1.8h\n"
+ "smlal2 v22.4s, v18.8h, v1.8h\n"
+ "smlal2 v17.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v5.4h, v4.4h\n"
+ "smlal v27.4s, v16.4h, v4.4h\n"
+ "smlal v8.4s, v21.4h, v4.4h\n"
+ "smlal2 v24.4s, v5.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v0.4h\n"
+ "smlal2 v14.4s, v5.8h, v4.8h\n"
+ "smlal2 v22.4s, v16.8h, v4.8h\n"
+ "smlal2 v17.4s, v21.8h, v4.8h\n"
+ "smlal v7.4s, v23.4h, v3.4h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v11.4h, v3.4h\n"
+ "smlal2 v24.4s, v23.8h, v0.8h\n"
+ "smlal v13.4s, v30.4h, v20.4h\n"
+ "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+ "smlal2 v14.4s, v23.8h, v3.8h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "and v21.16b, v13.16b, v9.16b\n"
+ "smlal2 v17.4s, v11.8h, v3.8h\n"
+ "smlal v7.4s, v30.4h, v0.4h\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "smlal v27.4s, v11.4h, v0.4h\n"
+ "smlal v8.4s, v15.4h, v0.4h\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
+ "smlal2 v24.4s, v30.8h, v20.8h\n"
+ "smlal2 v14.4s, v30.8h, v0.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v19.4s\n"
+ "smlal2 v22.4s, v11.8h, v0.8h\n"
+ "smlal2 v17.4s, v15.8h, v0.8h\n"
+ "and v16.16b, v24.16b, v29.16b\n"
+ "smlal v7.4s, v28.4h, v20.4h\n"
+ "smlal v27.4s, v15.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v31.4s\n"
+ "smlal v8.4s, v6.4h, v20.4h\n"
+ "smlal2 v14.4s, v28.8h, v20.8h\n"
+ "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+ "smlal2 v22.4s, v15.8h, v20.8h\n"
+ "smlal2 v17.4s, v6.8h, v20.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v9.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v19.4s\n"
+ "and v20.16b, v27.16b, v9.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
+ "and v3.16b, v8.16b, v9.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v19.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v29.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v19.16b, v22.16b, v29.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v30.16b, v17.16b, v29.16b\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v3.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v9.4s\n"
+ "srshl v7.4s, v7.4s, v9.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "srshl v8.4s, v8.4s, v9.4s\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "srshl v24.4s, v24.4s, v29.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v29.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v29.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v29.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x16, x5]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str d7, [x15, x5]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d27, [x14, x5]\n"
+ "str d8, [x13, x5]\n"
+ "add x5, x5, #0x8\n"
+ "beq 124f\n"
+ "add x7, x7, #0xc8\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x2, #2, 5f\n"
+ "ld1 { v13.4s }, [x20], #0x10\n"
+ "tbz x2, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x2, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x2, #1, 6f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 7f\n"
+ "ld1 { v13.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
+ "mov v7.16b, v13.16b\n"
+ "mov v14.16b, v24.16b\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldp x27, x26, [x6, #0x10]\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "add x9, x9, x4\n"
+ "add x28, x28, x4\n"
+ "add x27, x27, x4\n"
+ "add x26, x26, x4\n"
+ "add x25, x25, x4\n"
+ "add x24, x24, x4\n"
+ "add x23, x23, x4\n"
+ "add x22, x22, x4\n"
+ "add x21, x21, x4\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 9f\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v16.s }[0], [x28], #0x4\n"
+ "ld1 { v23.s }[0], [x27], #0x4\n"
+ "ld1 { v30.s }[0], [x26], #0x4\n"
+ "ld1 { v4.s }[0], [x25], #0x4\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v31.s }[0], [x23], #0x4\n"
+ "ld1 { v1.s }[0], [x22], #0x4\n"
+ "ld1 { v9.s }[0], [x21], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 8f\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "ld1 { v16.h }[2], [x28], #0x2\n"
+ "ld1 { v23.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "ld1 { v4.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v31.h }[2], [x23], #0x2\n"
+ "ld1 { v1.h }[2], [x22], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[6], [x9]\n"
+ "ld1 { v16.b }[6], [x28]\n"
+ "ld1 { v23.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x26]\n"
+ "ld1 { v4.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v31.b }[6], [x23]\n"
+ "ld1 { v1.b }[6], [x22]\n"
+ "ld1 { v9.b }[6], [x21]\n"
+ "ld1 { v11.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[4], [x9]\n"
+ "ld1 { v16.b }[4], [x28]\n"
+ "ld1 { v23.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x26]\n"
+ "ld1 { v4.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v31.b }[4], [x23]\n"
+ "ld1 { v1.b }[4], [x22]\n"
+ "ld1 { v9.b }[4], [x21]\n"
+ "ld1 { v11.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x2, #1, 10f\n"
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "ld1 { v16.h }[0], [x28], #0x2\n"
+ "ld1 { v23.h }[0], [x27], #0x2\n"
+ "ld1 { v30.h }[0], [x26], #0x2\n"
+ "ld1 { v4.h }[0], [x25], #0x2\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v31.h }[0], [x23], #0x2\n"
+ "ld1 { v1.h }[0], [x22], #0x2\n"
+ "ld1 { v9.h }[0], [x21], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[2], [x9]\n"
+ "ld1 { v16.b }[2], [x28]\n"
+ "ld1 { v23.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x26]\n"
+ "ld1 { v4.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v31.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x22]\n"
+ "ld1 { v9.b }[2], [x21]\n"
+ "ld1 { v11.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[0], [x9]\n"
+ "ld1 { v16.b }[0], [x28]\n"
+ "ld1 { v23.b }[0], [x27]\n"
+ "ld1 { v30.b }[0], [x26]\n"
+ "ld1 { v4.b }[0], [x25]\n"
+ "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v31.b }[0], [x23]\n"
+ "ld1 { v1.b }[0], [x22]\n"
+ "ld1 { v9.b }[0], [x21]\n"
+ "ld1 { v11.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "ldr x20, [x6, #0x50]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "add x20, x20, x4\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
+ "tbz x2, #2, 13f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 12f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x2, #1, 14f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal v8.4s, v5.4h, v29.4h\n"
+ "smlal2 v17.4s, v5.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "smlal v27.4s, v5.4h, v18.4h\n"
+ "smlal2 v22.4s, v5.8h, v18.8h\n"
+ "tbz x2, #2, 17f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 16f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x2, #1, 18f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x6, #0x60]\n"
+ "smlal v8.4s, v10.4h, v18.4h\n"
+ "smlal2 v17.4s, v10.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 21f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 20f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x2, #1, 22f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d6, [x7, #0x28]\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v7.4s, v15.4h, v3.4h\n"
+ "smlal2 v14.4s, v15.8h, v3.8h\n"
+ "smlal v27.4s, v10.4h, v3.4h\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v23.4h, v6.4h\n"
+ "smlal2 v24.4s, v23.8h, v6.8h\n"
+ "smlal v7.4s, v30.4h, v6.4h\n"
+ "smlal2 v14.4s, v30.8h, v6.8h\n"
+ "smlal v27.4s, v11.4h, v6.4h\n"
+ "smlal2 v22.4s, v11.8h, v6.8h\n"
+ "tbz x2, #2, 25f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 24f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x2, #1, 26f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d4, [x7, #0x30]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "ldr x20, [x6, #0x70]\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "smlal2 v17.4s, v20.8h, v6.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "smlal2 v24.4s, v30.8h, v4.8h\n"
+ "smlal v7.4s, v28.4h, v4.4h\n"
+ "smlal2 v14.4s, v28.8h, v4.8h\n"
+ "smlal v27.4s, v20.4h, v4.4h\n"
+ "smlal2 v22.4s, v20.8h, v4.8h\n"
+ "tbz x2, #2, 29f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 28f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x2, #1, 30f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d30, [x7, #0x38]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal2 v17.4s, v23.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v28.4h, v30.4h\n"
+ "smlal2 v24.4s, v28.8h, v30.8h\n"
+ "smlal v7.4s, v5.4h, v30.4h\n"
+ "smlal2 v14.4s, v5.8h, v30.8h\n"
+ "smlal v27.4s, v23.4h, v30.4h\n"
+ "smlal2 v22.4s, v23.8h, v30.8h\n"
+ "tbz x2, #2, 33f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 32f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x2, #1, 34f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[0], [x20]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d16, [x7, #0x40]\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "ldr x20, [x6, #0x80]\n"
+ "smlal v8.4s, v3.4h, v30.4h\n"
+ "smlal2 v17.4s, v3.8h, v30.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v5.4h, v16.4h\n"
+ "smlal2 v24.4s, v5.8h, v16.8h\n"
+ "smlal v7.4s, v10.4h, v16.4h\n"
+ "smlal2 v14.4s, v10.8h, v16.8h\n"
+ "smlal v27.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x2, #2, 37f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 36f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x2, #1, 38f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d1, [x7, #0x48]\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal v8.4s, v6.4h, v16.4h\n"
+ "smlal2 v17.4s, v6.8h, v16.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v10.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v9.8h, v1.8h\n"
+ "smlal v27.4s, v6.4h, v1.4h\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "tbz x2, #2, 41f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 40f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x2, #1, 42f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d28, [x7, #0x50]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "ldr x20, [x6, #0x90]\n"
+ "smlal v8.4s, v18.4h, v1.4h\n"
+ "smlal2 v17.4s, v18.8h, v1.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v11.4h, v28.4h\n"
+ "smlal2 v24.4s, v11.8h, v28.8h\n"
+ "smlal v7.4s, v20.4h, v28.4h\n"
+ "smlal2 v14.4s, v20.8h, v28.8h\n"
+ "tbz x2, #2, 45f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 44f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x2, #1, 46f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr x20, [x6, #0x98]\n"
+ "smlal v27.4s, v30.4h, v28.4h\n"
+ "smlal2 v22.4s, v30.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 49f\n"
+ "ld1 { v19.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 48f\n"
+ "ld1 { v19.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x2, #1, 50f\n"
+ "ld1 { v19.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d0, [x7, #0x58]\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "ldr x20, [x6, #0xa0]\n"
+ "smlal v8.4s, v19.4h, v28.4h\n"
+ "smlal2 v17.4s, v19.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v20.4h, v0.4h\n"
+ "smlal2 v24.4s, v20.8h, v0.8h\n"
+ "smlal v7.4s, v23.4h, v0.4h\n"
+ "smlal2 v14.4s, v23.8h, v0.8h\n"
+ "smlal v27.4s, v19.4h, v0.4h\n"
+ "smlal2 v22.4s, v19.8h, v0.8h\n"
+ "tbz x2, #2, 53f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 52f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x2, #1, 54f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d10, [x7, #0x60]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x20, [x6, #0xa8]\n"
+ "smlal v8.4s, v9.4h, v0.4h\n"
+ "smlal2 v17.4s, v9.8h, v0.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v23.4h, v10.4h\n"
+ "smlal2 v24.4s, v23.8h, v10.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v14.4s, v3.8h, v10.8h\n"
+ "smlal v27.4s, v9.4h, v10.4h\n"
+ "smlal2 v22.4s, v9.8h, v10.8h\n"
+ "tbz x2, #2, 57f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 56f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x2, #1, 58f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d28, [x7, #0x68]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "ldr x20, [x6, #0xb0]\n"
+ "smlal v8.4s, v20.4h, v10.4h\n"
+ "smlal2 v17.4s, v20.8h, v10.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v3.4h, v28.4h\n"
+ "smlal2 v24.4s, v3.8h, v28.8h\n"
+ "smlal v7.4s, v6.4h, v28.4h\n"
+ "smlal2 v14.4s, v6.8h, v28.8h\n"
+ "smlal v27.4s, v20.4h, v28.4h\n"
+ "smlal2 v22.4s, v20.8h, v28.8h\n"
+ "tbz x2, #2, 61f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 60f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x2, #1, 62f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d23, [x7, #0x70]\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xb8]\n"
+ "smlal v8.4s, v5.4h, v28.4h\n"
+ "smlal2 v17.4s, v5.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v6.4h, v23.4h\n"
+ "smlal2 v24.4s, v6.8h, v23.8h\n"
+ "smlal v7.4s, v18.4h, v23.4h\n"
+ "smlal2 v14.4s, v18.8h, v23.8h\n"
+ "smlal v27.4s, v5.4h, v23.4h\n"
+ "smlal2 v22.4s, v5.8h, v23.8h\n"
+ "tbz x2, #2, 65f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 64f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x2, #1, 66f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d4, [x7, #0x78]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "ldr x20, [x6, #0xc0]\n"
+ "smlal v8.4s, v29.4h, v23.4h\n"
+ "smlal2 v17.4s, v29.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "smlal2 v24.4s, v30.8h, v4.8h\n"
+ "smlal v7.4s, v19.4h, v4.4h\n"
+ "smlal2 v14.4s, v19.8h, v4.8h\n"
+ "tbz x2, #2, 69f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 68f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x2, #1, 70f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr x20, [x6, #0xc8]\n"
+ "smlal v27.4s, v18.4h, v4.4h\n"
+ "smlal2 v22.4s, v18.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 73f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 72f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x2, #1, 74f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d23, [x7, #0x80]\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xd0]\n"
+ "smlal v8.4s, v1.4h, v4.4h\n"
+ "smlal2 v17.4s, v1.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v19.4h, v23.4h\n"
+ "smlal2 v24.4s, v19.8h, v23.8h\n"
+ "smlal v7.4s, v9.4h, v23.4h\n"
+ "smlal2 v14.4s, v9.8h, v23.8h\n"
+ "smlal v27.4s, v1.4h, v23.4h\n"
+ "smlal2 v22.4s, v1.8h, v23.8h\n"
+ "tbz x2, #2, 77f\n"
+ "ld1 { v4.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 76f\n"
+ "ld1 { v4.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x2, #1, 78f\n"
+ "ld1 { v4.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d30, [x7, #0x88]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "ldr x20, [x6, #0xd8]\n"
+ "smlal v8.4s, v4.4h, v23.4h\n"
+ "smlal2 v17.4s, v4.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v9.4h, v30.4h\n"
+ "smlal2 v24.4s, v9.8h, v30.8h\n"
+ "smlal v7.4s, v20.4h, v30.4h\n"
+ "smlal2 v14.4s, v20.8h, v30.8h\n"
+ "smlal v27.4s, v4.4h, v30.4h\n"
+ "smlal2 v22.4s, v4.8h, v30.8h\n"
+ "tbz x2, #2, 81f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 80f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x2, #1, 82f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d3, [x7, #0x90]\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ldr x20, [x6, #0xe0]\n"
+ "smlal v8.4s, v21.4h, v30.4h\n"
+ "smlal2 v17.4s, v21.8h, v30.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v20.4h, v3.4h\n"
+ "smlal2 v24.4s, v20.8h, v3.8h\n"
+ "smlal v7.4s, v5.4h, v3.4h\n"
+ "smlal2 v14.4s, v5.8h, v3.8h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "tbz x2, #2, 85f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 84f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x2, #1, 86f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d19, [x7, #0x98]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "ldr x20, [x6, #0xe8]\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal2 v17.4s, v30.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v5.4h, v19.4h\n"
+ "smlal2 v24.4s, v5.8h, v19.8h\n"
+ "smlal v7.4s, v29.4h, v19.4h\n"
+ "smlal2 v14.4s, v29.8h, v19.8h\n"
+ "smlal v27.4s, v30.4h, v19.4h\n"
+ "smlal2 v22.4s, v30.8h, v19.8h\n"
+ "tbz x2, #2, 89f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 88f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x2, #1, 90f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d23, [x7, #0xa0]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xf0]\n"
+ "smlal v8.4s, v20.4h, v19.4h\n"
+ "smlal2 v17.4s, v20.8h, v19.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v18.4h, v23.4h\n"
+ "smlal2 v24.4s, v18.8h, v23.8h\n"
+ "smlal v7.4s, v1.4h, v23.4h\n"
+ "smlal2 v14.4s, v1.8h, v23.8h\n"
+ "tbz x2, #2, 93f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 92f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x2, #1, 94f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x6, #0xf8]\n"
+ "smlal v27.4s, v10.4h, v23.4h\n"
+ "smlal2 v22.4s, v10.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 97f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 96f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x2, #1, 98f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d5, [x7, #0xa8]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "ldr x20, [x6, #0x100]\n"
+ "smlal v8.4s, v18.4h, v23.4h\n"
+ "smlal2 v17.4s, v18.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v1.4h, v5.4h\n"
+ "smlal2 v24.4s, v1.8h, v5.8h\n"
+ "smlal v7.4s, v4.4h, v5.4h\n"
+ "smlal2 v14.4s, v4.8h, v5.8h\n"
+ "smlal v27.4s, v18.4h, v5.4h\n"
+ "smlal2 v22.4s, v18.8h, v5.8h\n"
+ "tbz x2, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x2, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d18, [x7, #0xb0]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "ldr x20, [x6, #0x108]\n"
+ "smlal v8.4s, v9.4h, v5.4h\n"
+ "smlal2 v17.4s, v9.8h, v5.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v21.4h, v18.4h\n"
+ "smlal2 v14.4s, v21.8h, v18.8h\n"
+ "smlal v27.4s, v9.4h, v18.4h\n"
+ "smlal2 v22.4s, v9.8h, v18.8h\n"
+ "tbz x2, #2, 105f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 104f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x2, #1, 106f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d11, [x7, #0xb8]\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "ldr x20, [x6, #0x110]\n"
+ "smlal v8.4s, v5.4h, v18.4h\n"
+ "smlal2 v17.4s, v5.8h, v18.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v21.4h, v11.4h\n"
+ "smlal2 v24.4s, v21.8h, v11.8h\n"
+ "smlal v7.4s, v30.4h, v11.4h\n"
+ "smlal2 v14.4s, v30.8h, v11.8h\n"
+ "smlal v27.4s, v5.4h, v11.4h\n"
+ "smlal2 v22.4s, v5.8h, v11.8h\n"
+ "tbz x2, #2, 109f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 108f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x2, #1, 110f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d16, [x7, #0xc0]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "ldr x20, [x6, #0x118]\n"
+ "smlal v8.4s, v18.4h, v11.4h\n"
+ "smlal2 v17.4s, v18.8h, v11.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v16.4h\n"
+ "smlal2 v24.4s, v30.8h, v16.8h\n"
+ "smlal v7.4s, v20.4h, v16.4h\n"
+ "smlal2 v14.4s, v20.8h, v16.8h\n"
+ "smlal v27.4s, v18.4h, v16.4h\n"
+ "smlal2 v22.4s, v18.8h, v16.8h\n"
+ "tbz x2, #2, 113f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 112f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x2, #1, 114f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v8.4s, v21.4h, v16.4h\n"
+ "smlal2 v17.4s, v21.8h, v16.8h\n"
+ "tbz x2, #2, 117f\n"
+ "ld1 { v16.4s }, [x8], #0x10\n"
+ "ld1 { v21.4s }, [x17], #0x10\n"
+ "tbz x2, #1, 116f\n"
+ "ld1 { v18.d }[0], [x8], #0x8\n"
+ "ld1 { v0.d }[0], [x17], #0x8\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v18.s }[2], [x8]\n"
+ "ld1 { v0.s }[2], [x17]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x2, #0, 119f\n"
+ "ld1 { v18.s }[0], [x8]\n"
+ "ld1 { v0.s }[0], [x17]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x2, #1, 118f\n"
+ "ld1 { v16.d }[0], [x8], #0x8\n"
+ "ld1 { v21.d }[0], [x17], #0x8\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v16.s }[2], [x8]\n"
+ "ld1 { v21.s }[2], [x17]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 119f\n"
+ "ld1 { v16.s }[0], [x8]\n"
+ "ld1 { v21.s }[0], [x17]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v13.4s, v13.4s, v16.4s\n"
+ "and v5.16b, v13.16b, v21.16b\n"
+ "add x16, x16, x5\n"
+ "add x15, x15, x5\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "add x14, x14, x5\n"
+ "add x13, x13, x5\n"
+ "and v2.16b, v24.16b, v0.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v16.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v16.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v16.4s\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v21.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "and v20.16b, v27.16b, v21.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v18.4s\n"
+ "and v31.16b, v8.16b, v21.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v0.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v11.16b, v22.16b, v0.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v10.16b, v17.16b, v0.16b\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v31.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v21.4s\n"
+ "srshl v7.4s, v7.4s, v21.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v11.4s\n"
+ "srshl v8.4s, v8.4s, v21.4s\n"
+ "sqadd v17.4s, v17.4s, v10.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v0.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "tbz x2, #2, 121f\n"
+ "st1 { v13.s }[0], [x16], #0x4\n"
+ "st1 { v7.s }[0], [x15], #0x4\n"
+ "st1 { v27.s }[0], [x14], #0x4\n"
+ "st1 { v8.s }[0], [x13], #0x4\n"
+ "tbz x2, #1, 120f\n"
+ "st1 { v13.h }[2], [x16], #0x2\n"
+ "st1 { v7.h }[2], [x15], #0x2\n"
+ "st1 { v27.h }[2], [x14], #0x2\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[6], [x16], #0x1\n"
+ "st1 { v7.b }[6], [x15], #0x1\n"
+ "st1 { v27.b }[6], [x14], #0x1\n"
+ "st1 { v8.b }[6], [x13], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[4], [x16], #0x1\n"
+ "st1 { v7.b }[4], [x15], #0x1\n"
+ "st1 { v27.b }[4], [x14], #0x1\n"
+ "st1 { v8.b }[4], [x13], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x2, #1, 122f\n"
+ "st1 { v13.h }[0], [x16], #0x2\n"
+ "st1 { v7.h }[0], [x15], #0x2\n"
+ "st1 { v27.h }[0], [x14], #0x2\n"
+ "st1 { v8.h }[0], [x13], #0x2\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[2], [x16], #0x1\n"
+ "st1 { v7.b }[2], [x15], #0x1\n"
+ "st1 { v27.b }[2], [x14], #0x1\n"
+ "st1 { v8.b }[2], [x13], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[0], [x16], #0x1\n"
+ "st1 { v7.b }[0], [x15], #0x1\n"
+ "st1 { v27.b }[0], [x14], #0x1\n"
+ "st1 { v8.b }[0], [x13], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+ "124:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
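
The requantize epilogue above (the runs of sqrdmulh, and/sshr/sqadd and srshl that precede each sqxtn) is the usual fixed-point output stage for these quantized kernels: a saturating rounding doubling high multiply by the per-channel multiplier, a sign fixup, then a rounding arithmetic shift right by the per-channel shift. A scalar model of one 32-bit lane, assuming `shift` holds the negative right-shift amounts as loaded from requant_shifts (a sketch for reference, not library code; saturation is elided):

    #include <cstdint>

    // Scalar sketch of the per-lane requantize sequence in the epilogue.
    // 'mul' comes from requant_muls, 'shift' from requant_shifts
    // (negative values mean "shift right"). Hypothetical helper.
    static inline int32_t requant_lane(int32_t acc, int32_t mul, int32_t shift)
    {
        // sqrdmulh: rounding doubling multiply, high half
        // (saturation only triggers for acc == mul == INT32_MIN).
        int32_t hi = static_cast<int32_t>(
            (static_cast<int64_t>(acc) * mul + (INT64_C(1) << 30)) >> 31);

        // and + sshr #31 + sqadd: subtract one when both 'hi' and 'shift'
        // are negative, so the rounding shift below does not bias upward.
        hi += (hi & shift) >> 31; // arithmetic shift: 0 or -1

        // srshl by a negative amount == rounding arithmetic shift right.
        const int rs = -shift;
        return rs > 0 ? static_cast<int32_t>(
                            (static_cast<int64_t>(hi) + (INT64_C(1) << (rs - 1))) >> rs)
                      : hi;
    }

After requantization the four accumulator rows are narrowed with sqxtn/sqxtn2, offset by the c_offset value, clamped to [minval, maxval] and stored. The "Oddments" blocks handle the n_channels % 8 leftover lanes by binary decomposition of the remainder: tbz tests bits 2, 1 and 0 of the count and transfers 4, 2 and 1 byte(s) respectively, which in scalar form amounts to (again a sketch, not library code):

    #include <cstdint>
    #include <cstring>

    // Sketch of the "Oddments" tail strategy: consume the 1..7 remaining
    // channels in 4-, 2- and 1-byte steps keyed off bits 2..0 of the count.
    static void load_oddments(uint8_t *dst, const uint8_t *src, unsigned rem)
    {
        if (rem & 4) { std::memcpy(dst, src, 4); dst += 4; src += 4; }
        if (rem & 2) { std::memcpy(dst, src, 2); dst += 2; src += 2; }
        if (rem & 1) { *dst = *src; }
    }
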
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1666c17ca0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const uint8_t *const *const,
+ const int8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ uint8_t *const *const
+);
+
+class a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
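
The strategy declared above fixes a 2x2 output tile for a 3x3 kernel at stride 1, which is why the implementation that follows carries sixteen input pointers: each tile reads a 4x4 input patch. A compile-time check of that geometry (a sketch, not library code):

    // Input extent covered by an output tile: (out - 1) * stride + kernel.
    constexpr unsigned input_extent(unsigned out, unsigned stride, unsigned kern)
    {
        return (out - 1) * stride + kern;
    }

    static_assert(input_extent(2, 1, 3) == 4, "2x2 output tile reads a 4x4 patch");
    static_assert(input_extent(2, 1, 3) * input_extent(2, 1, 3) == 16,
                  "hence the sixteen entries in Params::inptrs below");
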
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f1c1b2315c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1166 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v14.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 64f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
+ "st1 { v9.s }[0], [x11], #0x4\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "st1 { v9.h }[2], [x11], #0x2\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[6], [x11], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[4], [x11], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "st1 { v9.h }[0], [x11], #0x2\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[2], [x11], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[0], [x11], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+ "64:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..7c05b36f36
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+
+
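+// Kernel naming (matching the class constants and template arguments below):
+// a64 = AArch64, u8s8u8q = uint8 input / int8 weights / uint8 requantized output,
+// nhwc = channels-last layout, 3x3 kernel, stride 2, 2x2 output tile per call,
+// mla = plain multiply-accumulate, depthfirst = depth-first traversal strategy.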
+void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const uint8_t *const *const,
+ const int8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ uint8_t *const *const
+);
+
+class a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..e9db8e1322
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
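+    // 25 input pointers: a 2x2 output tile at stride 2 with a 3x3 kernel reads a
+    // 5x5 window of input points ((2 - 1) * 2 + 3 = 5 in each dimension).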
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
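+      // Reorder the raw input pointers from the caller's layout into the order
+      // in which the assembly below consumes them via sequential loads from inptrs.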
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
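+  // The struct layout is consumed directly by the inline assembly through
+  // offsetof(Params, ...) immediate operands, so the member order matters.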
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
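+  // The assembly is laid out as: label 1 is the main loop over full blocks of 8
+  // channels (x8 = n_channels >> 3), label 2 ("Tail") handles the final full block,
+  // and label 3 ("Oddments") handles the remaining n_channels % 8 channels with
+  // element-wise loads and stores selected by tbz bit tests on the channel count.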
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x12, x12, #0x20\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d25, [x27, x17]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "tst x7, #0x7\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 88f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 66f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x7, #1, 70f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x7, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x7, #1, 78f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
+ "tbz x7, #2, 81f\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 82f\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 86f\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+ "88:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
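
Note on the requantization tail above: it is the usual gemmlowp-style fixed-point sequence. sqrdmulh scales each int32 accumulator by a per-channel multiplier (saturating rounding doubling multiply, high half); the and/sshr/sqadd triple nudges negative lanes down by one so that ties in the following shift round away from zero; srshl applies a rounding right shift (the shift operand is stored negative); and sqadd/smax/smin add the c_offset and clamp before uzp1 narrows to bytes. A one-lane scalar sketch of that pattern, assuming shifts arrive as negative right-shift amounts as the srshl usage implies; the names are illustrative, not the library's API:

#include <algorithm>
#include <cstdint>

// Scalar model of one lane of the vector sequence above:
// SQRDMULH -> AND/SSHR/SQADD fixup -> SRSHL -> offset -> clamp.
static inline uint8_t requantize_lane(int32_t acc, int32_t multiplier,
                                      int32_t shift, // negative => right shift
                                      int32_t c_offset,
                                      int32_t minval, int32_t maxval)
{
    // SQRDMULH: saturating rounding doubling multiply returning the high half.
    int32_t x = (acc == INT32_MIN && multiplier == INT32_MIN)
                  ? INT32_MAX
                  : (int32_t)(((int64_t)acc * multiplier + (1LL << 30)) >> 31);

    const int rshift = -shift;
    if (rshift > 0)
    {
        // AND + SSHR #31 + SQADD: subtract 1 from negative lanes so the
        // rounding shift below rounds ties away from zero.
        if (x < 0) x -= 1;
        // SRSHL with a negative shift amount: rounding arithmetic right shift.
        x = (int32_t)(((int64_t)x + (1LL << (rshift - 1))) >> rshift);
    }

    // SQADD of the c_offset, then SMAX/SMIN clamp; UZP1 then narrows to bytes.
    // (The vector code adds the offset after narrowing to 16 bits; this sketch
    // ignores the intermediate sqxtn saturation.)
    x = std::min(std::max(x + c_offset, minval), maxval);
    return (uint8_t)x;
}
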
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..5d53b17e53
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const uint8_t *const *const,
+ const int8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ uint8_t *const *const
+);
+
+class a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
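
The class name encodes the kernel variant: u8s8u8q = uint8 activations, int8 weights, uint8 requantized output (visible in the impl signature above); nhwc layout; a 5x5 kernel at stride 1 computing a 2x2 output tile per step. A 2x2 tile at stride 1 under a 5x5 window consumes a 6x6 input patch, which is why the implementation below carries 36 input pointers. A quick compile-time check of that arithmetic (a sketch; the constant names are illustrative):

#include <cstdio>

int main()
{
    constexpr unsigned out_rows = 2, out_cols = 2; // output tile
    constexpr unsigned k_rows = 5, k_cols = 5;     // kernel
    constexpr unsigned stride = 1;
    // Rows/cols of input consumed by one tile: (out - 1) * stride + kernel.
    constexpr unsigned in_rows = (out_rows - 1) * stride + k_rows; // 6
    constexpr unsigned in_cols = (out_cols - 1) * stride + k_cols; // 6
    static_assert(in_rows * in_cols == 36, "matches Params::inptrs[36]");
    std::printf("%ux%u input patch, %u pointers\n", in_rows, in_cols, in_rows * in_cols);
    return 0;
}
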
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..df955206e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2187 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ unsigned long n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
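+ // One pointer per point of the 6x6 input patch (2x2 output tile at stride 1
+ // under a 5x5 kernel); reordered below into the kernel's access order.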
+ const uint8_t *inptrs[36];
+
+ Params(
+ unsigned long n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x2, x1, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
+ "cbz x2, 3f\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "subs x2, x2, #0x1\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
+ "beq 124f\n"
+ "add x6, x6, #0xc8\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x1, #2, 5f\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
+ "tbz x1, #1, 4f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x1, #1, 6f\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 9f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 8f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x1, #1, 10f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "tbz x1, #2, 13f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 12f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x1, #1, 14f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
+ "tbz x1, #2, 17f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 16f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x1, #1, 18f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 21f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 20f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x1, #1, 22f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d14, [x6, #0x28]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
+ "tbz x1, #2, 25f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 24f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x1, #1, 26f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d21, [x6, #0x30]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 29f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 28f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x1, #1, 30f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d9, [x6, #0x38]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
+ "tbz x1, #2, 33f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 32f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x1, #1, 34f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d31, [x6, #0x40]\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
+ "tbz x1, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x1, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d16, [x6, #0x48]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "tbz x1, #2, 41f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 40f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x1, #1, 42f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d21, [x6, #0x50]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 45f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 44f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x1, #1, 46f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 49f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 48f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x1, #1, 50f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d2, [x6, #0x58]\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "tbz x1, #2, 53f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 52f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x1, #1, 54f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d25, [x6, #0x60]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
+ "tbz x1, #2, 57f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 56f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x1, #1, 58f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d1, [x6, #0x68]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
+ "tbz x1, #2, 61f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 60f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x1, #1, 62f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[0], [x20]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d16, [x6, #0x70]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x1, #2, 65f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 64f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x1, #1, 66f\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d17, [x6, #0x78]\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
+ "tbz x1, #2, 69f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 68f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x1, #1, 70f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x1, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d29, [x6, #0x80]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
+ "tbz x1, #2, 77f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 76f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x1, #1, 78f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d12, [x6, #0x88]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
+ "tbz x1, #2, 81f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 80f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x1, #1, 82f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d21, [x6, #0x90]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
+ "tbz x1, #2, 85f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 84f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x1, #1, 86f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d8, [x6, #0x98]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "tbz x1, #2, 89f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 88f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x1, #1, 90f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d9, [x6, #0xa0]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
+ "tbz x1, #2, 93f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 92f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x1, #1, 94f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 97f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 96f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x1, #1, 98f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d12, [x6, #0xa8]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
+ "tbz x1, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x1, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d28, [x6, #0xb0]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
+ "tbz x1, #2, 105f\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 104f\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x1, #1, 106f\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d30, [x6, #0xb8]\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
+ "tbz x1, #2, 109f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 108f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x1, #1, 110f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d8, [x6, #0xc0]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "tbz x1, #2, 113f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 112f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x1, #1, 114f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
+ "tbz x1, #2, 117f\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
+ "tbz x1, #1, 116f\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x1, #1, 118f\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x1, #2, 121f\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
+ "tbz x1, #1, 120f\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x1, #1, 122f\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+ "124:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
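The requantization tail that dominates the kernel above (sqrdmulh, the and/sshr/sqadd rounding fixup, srshl, the output-offset add and the smax/smin clamp) is the standard fixed-point scheme applied lane-wise. A scalar model of it, as a sketch only; the helper names are illustrative and not part of this patch:

    #include <cstdint>

    // Sketch: scalar equivalent of the NEON requantization sequence above.
    static inline int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        // sqrdmulh: (2*a*b + 2^31) >> 32, saturating the one overflowing case
        if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;
        const int64_t p = (int64_t)a * (int64_t)b;
        return (int32_t)((p + (INT64_C(1) << 30)) >> 31);
    }

    static inline int32_t rounding_divide_by_pot(int32_t x, int exponent)
    {
        // srshl by a negative amount plus the and/sshr/sqadd sign fixup:
        // arithmetic right shift, rounding to nearest
        const int32_t mask      = ((int32_t)1 << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

    static inline uint8_t requantize(int32_t acc, int32_t mul, int right_shift,
                                     int32_t c_offset, int32_t minval, int32_t maxval)
    {
        int32_t v = rounding_doubling_high_mul(acc, mul);
        v = rounding_divide_by_pot(v, right_shift);
        v += c_offset;                    // sqadd with the c_offset vector
        v = v < minval ? minval : v;      // smax against the broadcast minval
        v = v > maxval ? maxval : v;      // smin against the broadcast maxval
        return (uint8_t)v;                // sqxtn/uzp1 narrowing
    }

The asm narrows to 16 bits before adding the offset and clamping; for values inside the representable range the two orderings agree.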
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..2c677d2f62
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+class a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+ KernelType kernel = a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl;
+
+ public:
+ a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>(9, arm_gemm::VLType::None) {}
+
+ KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
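The strategy header above follows the usual depthfirst pattern: the class only records the kernel's shape (nine output points, no vector length) and hands back the raw function pointer. A minimal, hypothetical use; cpu_info and the argument names here are assumptions, not taken from this patch:

    // Sketch: obtaining and invoking the kernel through its strategy class.
    arm_conv::depthwise::a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst strat(cpu_info);
    auto kernel = strat.get_kernel();
    kernel(inptrs, outptrs, params, qp, n_points, n_channels);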
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c2bec4cdab
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ const arm_gemm::Requantize32& qp,
+ const unsigned int n_points,
+ const unsigned int n_channels
+)
+{
+ __asm__ __volatile__(
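+    // Compute the four-channel block count, then broadcast-load the per-layer requantization parameters (clamp bounds, offsets, shift and multiplier).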
+ "lsr x9, %x[n_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v5.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "mov x11, #0x0\n"
+ "cbz x9, 6f\n"
+ "1:" // Channel loop
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q23, [%x[bias], x20]\n"
+ "2:" // Channel loop: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x25, %x[inptrs]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "subs x24, %x[n_points], #0x1\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s16, [x21, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s20, [x21, x11]\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x23, x22, [x25], #0x10\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldr s14, [x23, x11]\n"
+ "ldr s15, [x22, x11]\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s18, [x21, x11]\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "subs x24, x24, #0x1\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ldr s20, [x21, x11]\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 5f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q2, [%x[rq_mul_ptr], x20]\n"
+ "ldr q1, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 5f\n"
+ "ldr q3, [%x[rq_left_shift_ptr], x20]\n"
+ "5:" // Channel loop: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s23, [x28, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s24, [x27, x11]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s25, [x26, x11]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x11]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x11]\n"
+ "str s28, [x23, x11]\n"
+ "str s29, [x22, x11]\n"
+ "str s30, [x21, x11]\n"
+ "str s31, [x20, x11]\n"
+ "add x11, x11, #0x4\n"
+ "cmp x11, x9, LSL #2\n"
+ "blt 1b\n"
+ "6:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 24f\n"
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 9f\n"
+ "add x20, %x[bias], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
+ "b 8f\n"
+ "7:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "8:" // Oddments: Load bias: Bit 1: End
+ "9:" // Oddments: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "11:" // Oddments: Load: Bit 1: End
+ "subs x20, %x[n_points], #0x1\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "ble 15f\n"
+ "12:" // Oddments: Planar loop
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldr x21, [x10], #0x8\n"
+ "add x9, x9, x11\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "add x28, x28, x11\n"
+ "add x27, x27, x11\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 14f\n"
+ "13:" // Oddments: Planar loop: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "14:" // Oddments: Planar loop: Load: Bit 1: End
+ "subs x20, x20, #0x1\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 12b\n"
+ "15:" // Oddments: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 21f\n"
+ "add x22, %x[rq_mul_ptr], x11, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x11, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v2.d }[0], [x22], #0x8\n"
+ "ld1 { v1.d }[0], [x21], #0x8\n"
+ "cbz %x[rq_left_shift_ptr], 16f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v2.s }[2], [x22], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 17f\n"
+ "ld1 { v3.s }[2], [x20], #0x4\n"
+ "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+ "b 20f\n"
+ "18:" // Oddments: Load quantisation parameters: Bit 1: Unset
+ "ld1 { v2.s }[0], [x22], #0x4\n"
+ "ld1 { v1.s }[0], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 19f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+ "20:" // Oddments: Load quantisation parameters: Bit 1: End
+ "21:" // Oddments: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "b 23f\n"
+ "22:" // Oddments: Store: Bit 1: Unset
+ "st1 { v23.b }[0], [x28], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "23:" // Oddments: Store: Bit 1: End
+ "24:" // End
+ : [params] "+&r" (params)
+ : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
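For reference, the sshl / sqrdmulh / and + sshr + sqadd / srshl blocks in the kernel above are the usual fixed-point requantization idiom: a saturating doubling multiply-high against the per-layer (or per-channel) multiplier, then a sign fixup that makes the rounding right shift round ties away from zero, then the c_offset add and min/max clamp before uzp1 narrows to bytes. A minimal per-lane C++ sketch of that sequence, assuming gemmlowp-style rounding semantics (the function and parameter names here are illustrative, not part of the patch):

#include <cstdint>

// Per-lane sketch of the requantize sequence; right_shift is stored
// negated (<= 0), matching how srshl consumes it.
static inline int32_t requantize_lane(int32_t acc, int32_t left_shift,
                                      int32_t mul, int32_t right_shift,
                                      int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t x = (int32_t)((uint32_t)acc << left_shift);   // sshl (non-saturating)
    int64_t p = (int64_t)x * (int64_t)mul;                // sqrdmulh computes
    int64_t r = (p + (1LL << 30)) >> 31;                  // sat(round(x * mul / 2^31))
    if (r > INT32_MAX) r = INT32_MAX;
    if (r < INT32_MIN) r = INT32_MIN;
    int32_t y = (int32_t)r;
    y += (y & right_shift) >> 31;                         // and + sshr #31 + sqadd:
                                                          // subtract 1 iff y < 0 and shift != 0
    if (right_shift < 0)                                  // srshl by a negative amount is a
    {                                                     // rounding arithmetic right shift
        const int n = -right_shift;
        y = (int32_t)(((int64_t)y + (1LL << (n - 1))) >> n);
    }
    y += c_offset;                                        // add against the c_offset splat
    y = y < minval ? minval : y;                          // smax against minval
    y = y > maxval ? maxval : y;                          // smin against maxval
    return y;                                             // uzp1 then keeps the low byte
}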
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..b7ba363b43
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+struct a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+ a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)
+ {
+ }
+ Parent::KernelType kernel = a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
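The strategy header above is thin glue: the constructor's Parent(2, 8, arm_gemm::VLType::None) call advertises a fixed 2x8 output tile with no vector-length dependence, and get_kernel() hands the depthfirst driver the raw assembly entry point. A hypothetical sketch of the call the driver ends up making, built from the impl signature declared above (the actual driver lives elsewhere in arm_conv and may differ; every argument value here is a placeholder):

// Hypothetical usage; argument values are placeholders.
a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst strat(cpu_info);
auto kern = strat.get_kernel();
kern(inptrs, outptrs, weights, bias,                  // packed input/output pointers
     kernel_points, n_output_channels,                // geometry
     per_channel_left_shifts, per_channel_muls,       // per-channel requantize params
     per_channel_right_shifts, qp);                   // arm_gemm::Requantize32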
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..ed99f1f642
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1480 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const int8_t *weights,
+ const int32_t *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const int32_t *per_channel_left_shifts,
+ const int32_t *per_channel_muls,
+ const int32_t *per_channel_right_shifts,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "lsr x10, %x[n_output_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v9.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "mov x9, #0x0\n"
+ "cbz x10, 9f\n"
+ "1:" // Output channel loop
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 3f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q9, [%x[rq_mul_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 3f\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
+ "3:" // Output channel loop: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "beq 5f\n"
+ "4:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 4b\n"
+ "5:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 6f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "8:" // Output channel loop: Done
+ "add x9, x9, #0x4\n"
+ "cmp x9, x10, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 26f\n"
+ "9:" // Output channel oddments
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 12f\n"
+ "add x20, %x[bias], x9, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 10f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: Unset
+ "ld1 { v31.s }[0], [x20]\n"
+ "11:" // Output channel oddments: Load bias: Bit 1: End
+ "12:" // Output channel oddments: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 18f\n"
+ "add x22, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "cbz %x[rq_left_shift_ptr], 15f\n"
+ "tbz %x[n_output_channels], #1, 13f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 14f\n"
+ "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+ "b 18f\n"
+ "15:" // Output channel oddments: Load quantization parameters: No left shift
+ "tbz %x[n_output_channels], #1, 16f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "b 17f\n"
+ "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+ "18:" // Output channel oddments: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "beq 20f\n"
+ "19:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 19b\n"
+ "20:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 21f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "b 23f\n"
+ "21:" // Output channel oddments: Odd tail
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
+ "b 23f\n"
+ "22:" // Output channel oddments: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "23:" // Output channel oddments: Done
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_output_channels], #1, 24f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "add x9, x9, #0x2\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
+ "b 25f\n"
+ "24:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
+ "25:" // Output channel oddments: Done: Store: Bit 1: End
+ "26:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
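
Editor's note — the store tail ending above decomposes the leftover output-channel count into its bits (the "Bit 1: Set/Unset" labels), so each remaining lane is written exactly once without a per-element loop. A minimal scalar sketch of the same idea, with a hypothetical store_oddments helper (not library code):

    #include <cstdint>

    // n is the number of leftover bytes (0..3 for this tail). Testing bit 1
    // stores a pair of lanes, then bit 0 stores at most one more, mirroring
    // the "Bit 1" / "Bit 0" branch structure in the assembly above.
    static void store_oddments(std::uint8_t *out, const std::uint8_t *lanes, unsigned int n)
    {
        unsigned int i = 0;
        if (n & 2) { out[0] = lanes[0]; out[1] = lanes[1]; i = 2; }
        if (n & 1) { out[i] = lanes[i]; }
    }
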
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..2b6f70c089
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
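
Editor's note — for reference, what each channel of the kernels declared above computes: bias plus a 3x3 multiply-accumulate over a 4x4 input patch, clamped to the activation range. A scalar sketch under stated assumptions (element strides, unpacked weights; the real params blob streams bias then nine weights per channel block, and the kernels vectorise across channels):

    #include <algorithm>

    static void dwconv3x3_s1_out2x2_ref(
        const __fp16 *in, long ld_in_row, long ld_in_col, // 4x4 input patch
        const __fp16 w[9], __fp16 bias,
        __fp16 *out, long ld_out_row, long ld_out_col,    // 2x2 output tile
        __fp16 act_min, __fp16 act_max)
    {
        for (int oi = 0; oi < 2; oi++)
        {
            for (int oj = 0; oj < 2; oj++)
            {
                float acc = bias;  // the bias vector seeds each accumulator
                for (int ki = 0; ki < 3; ki++)
                    for (int kj = 0; kj < 3; kj++)
                        acc += (float) w[ki * 3 + kj]
                             * (float) in[(oi + ki) * ld_in_row + (oj + kj) * ld_in_col];
                acc = std::max((float) act_min, std::min((float) act_max, acc));  // fclamp
                out[oi * ld_out_row + oj * ld_out_col] = (__fp16) acc;
            }
        }
    }
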
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..2d558ade3f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x4, #0x0\n"
+ "mov x5, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "1:" // Tile loop
+ "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x2\n"
+ "str x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x4, x21\n" // offset = tile_i * ld_input_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x5, x6, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "add x17, x6, x6\n"
+ "add x7, x7, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x16, x7, x21, LSL #1\n"
+ "add x15, x17, x6\n"
+ "add x14, x16, x21, LSL #1\n"
+ "add x13, x14, x21, LSL #1\n"
+ "cbnz x5, 2f\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "lsl x12, %x[n_channels], #0x1\n"
+ "mov x21, #0x4\n"
+ "mul x21, x21, x6\n"
+ "add x11, x16, x6, LSL #1\n"
+ "add x10, x7, x15, LSL #1\n"
+ "add x9, x16, x17, LSL #1\n"
+ "sub x20, x24, x5\n"
+ "add x28, x14, x6, LSL #1\n"
+ "sub x20, x20, #0x1\n"
+ "add x27, x13, x15, LSL #1\n"
+ "and x20, x20, #0x3fffff\n"
+ "add x26, x7, x6, LSL #1\n"
+ "orr x12, x12, x20, LSL #22\n"
+ "add x25, x7, x17, LSL #1\n"
+ "orr x12, x12, x21, LSL #38\n"
+ "add x24, x14, x17, LSL #1\n"
+ "add x23, x16, x15, LSL #1\n"
+ "add x22, x14, x15, LSL #1\n"
+ "add x21, x13, x6, LSL #1\n"
+ "add x20, x13, x17, LSL #1\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac49ba // rprfm pldonce, x12, [x13]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x20, #0x2\n"
+ "ld1h { z18.h }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x24\n"
+ ".inst 0xa040a100 // ld1h { z0.h-z3.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ ".inst 0xa040a104 // ld1h { z4.h-z7.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "mul x22, x4, x26\n" // offset = tile_i * ld_output_row
+ "cmp x24, %x[n_channels]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x22, x5, x25, x22\n" // offset += tile_j * ld_output_col
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mov x21, #0x0\n"
+ "mul x22, x22, x20\n" // offset *= output_tile_size
+ "sub x20, XZR, x24\n"
+ "ld1h { z8.h }, p3/Z, [x8]\n"
+ "add x23, x23, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z9.h }, p2/Z, [x16, x6, LSL #1]\n"
+ "addvl x8, x8, #1\n"
+ "add x22, x23, x26, LSL #1\n"
+ "ld1h { z10.h }, p2/Z, [x7]\n"
+ "ld1h { z11.h }, p2/Z, [x7, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x14, x6, LSL #1]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "whilelt p1.h, x24, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "inch x24\n"
+ "ld1h { z18.h }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "mov p0.b, p2.b\n"
+ "inch x20\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x7, x6, LSL #1]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x7, x17, LSL #1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "addvl x16, x16, #1\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x14]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "addvl x14, x14, #1\n"
+ "ld1h { z13.h }, p1/Z, [x14, x6, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+ ".inst 0xa040a100 // ld1h { z0.h-z3.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "cmp x24, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "addvl x13, x13, #1\n"
+ "ld1h { z11.h }, p1/Z, [x7, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x16, x6, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p1/Z, [x7]\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ ".inst 0xa040a104 // ld1h { z4.h-z7.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "ld1h { z12.h }, p1/Z, [x16, x17, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z28.h }, p0, [x23]\n"
+ "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x5, x5, #0x1\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "add x20, x4, #0x1\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "cmp x5, x24\n"
+ "csel x4, x4, x20, LT\n"
+ "csel x5, x5, XZR, LT\n"
+ "cmp x4, x21\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x7, x6, LSL #1]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x7, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x14]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z28.h }, p0, [x23]\n"
+ "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
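
Editor's note — the "Tile loop" address arithmetic above (mul/madd, then "add ..., LSL #1") reduces to a few lines of integer math. A plain C++ restatement of those inline comments, with a hypothetical helper name; strides are in elements, and the final LSL #1 is the scale by sizeof(__fp16):

    #include <cstdint>

    static const __fp16 *tile_input_base(
        const __fp16 *inptr, std::uint64_t tile_i, std::uint64_t tile_j,
        std::uint64_t ld_input_row, std::uint64_t ld_input_col)
    {
        std::uint64_t offset = tile_i * ld_input_row;  // mul  x20, x4, x21
        offset += tile_j * ld_input_col;               // madd x20, x5, x6, x20
        offset *= 2;                                   // mov x22, #0x2: kernel_stride * output_size
        return inptr + offset;                         // add x7, x7, x20, LSL #1
    }
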
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..415e344832
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[16];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x13, [x16, #0x20]\n"
+ "cnth x12\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x11, x10, [x20, #0x0]\n"
+ "cmp x12, %x[n_channels]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x9, XZR, x12\n"
+ "ldp x28, x27, [x20, #0x10]\n"
+ "ld1h { z16.h }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ ".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ld1h { z8.h }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z28, z16\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z16\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x16, #0x28]\n"
+ "whilelt p1.h, x12, %x[n_channels]\n"
+ "movprfx z30, z16\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x21, [x16, #0x30]\n"
+ "ld1h { z16.h }, p3/Z, [x14]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "addvl x14, x14, #1\n"
+ "inch x9\n"
+ "ld1h { z9.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x48]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x13, [x16, #0x60]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ldr x22, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x13, [x16, #0x20]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z13.h }, p1/Z, [x13, x12, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "inch x15\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x24, x12, LSL #1]\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x26, x12, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p1/Z, [x25, x12, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x23, x12, LSL #1]\n"
+ "inch x12\n"
+ ".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "cmp x12, %x[n_channels]\n"
+ "ld1h { z8.h }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
+ "st1h { z28.h }, p0, [x11, x9, LSL #1]\n"
+ "st1h { z29.h }, p0, [x10, x9, LSL #1]\n"
+ "st1h { z30.h }, p0, [x28, x9, LSL #1]\n"
+ "st1h { z31.h }, p0, [x27, x9, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z28, z16\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z16\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x16, #0x28]\n"
+ "inch x9\n"
+ "movprfx z30, z16\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x21, [x16, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "ldr x20, [x16, #0x38]\n"
+ "ld1h { z9.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x48]\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x13, [x16, #0x60]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ldr x22, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
+ "st1h { z28.h }, p0, [x11, x9, LSL #1]\n"
+ "st1h { z29.h }, p0, [x10, x9, LSL #1]\n"
+ "st1h { z30.h }, p0, [x28, x9, LSL #1]\n"
+ "st1h { z31.h }, p0, [x27, x9, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
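
Editor's note — the indirect variant above takes one pointer per input-window element: sixteen, i.e. the 4x4 patch that a 2x2 output tile of a 3x3 stride-1 kernel reads. Its Args constructor then permutes them into the order the assembly consumes. A sketch of how a caller might build such a table for one dense tile (hypothetical helper; in the library these pointers come from the depthfirst driver and may instead point at a padding buffer):

    static void gather_input_ptrs(
        const __fp16 *base, long ld_row, long ld_col,
        const __fp16 *input_ptrs[16])
    {
        // Row-major 4x4 patch; the kernel's Args constructor reorders these
        // into its own access pattern (inptrs[0] = input_ptrs[5], ...).
        for (int i = 0; i < 4; i++)
            for (int j = 0; j < 4; j++)
                input_ptrs[i * 4 + j] = base + i * ld_row + j * ld_col;
    }
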
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f90fbc3906
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
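
Editor's note — both entry points declared above iterate channels in vector-length chunks using SVE predication (cnth yields the per-vector half-precision channel count, whilelt builds the tail predicate, inch advances the channel index), so no scalar remainder loop is needed. The control flow, restated as a C++ sketch with a hypothetical helper:

    #include <algorithm>

    // body(c, active) stands in for the predicated loads/FMLAs/stores of
    // 'active' channels starting at channel c.
    template <typename Body>
    static void for_each_channel_block(unsigned int n_channels, unsigned int vl, Body body)
    {
        for (unsigned int c = 0; c < n_channels; c += vl)   // inch / cmp / blt
            body(c, std::min(vl, n_channels - c));          // whilelt p2.h, c, n
    }
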
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..3a7d1cb0b4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x3\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "add x7, x4, x4\n"
+ "add x5, x5, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x8, x5, x21, LSL #1\n"
+ "add x17, x7, x4\n"
+ "add x16, x8, x21, LSL #1\n"
+ "add x15, x17, x4\n"
+ "add x14, x16, x21, LSL #1\n"
+ "add x13, x14, x21, LSL #1\n"
+ "cbnz x3, 2f\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "lsl x12, %x[n_channels], #0x1\n"
+ "mov x28, #0x6\n"
+ "mul x28, x28, x4\n"
+ "add x27, x16, x7, LSL #1\n"
+ "add x26, x5, x15, LSL #1\n"
+ "add x25, x8, x7, LSL #1\n"
+ "sub x20, x9, x3\n"
+ "add x24, x13, x15, LSL #1\n"
+ "sub x20, x20, #0x1\n"
+ "add x23, x16, x4, LSL #1\n"
+ "and x20, x20, #0x3fffff\n"
+ "add x22, x5, x4, LSL #1\n"
+ "orr x12, x12, x20, LSL #22\n"
+ "add x21, x5, x17, LSL #1\n"
+ "orr x12, x12, x28, LSL #38\n"
+ "add x20, x16, x17, LSL #1\n"
+ "add x11, x8, x15, LSL #1\n"
+ "add x10, x14, x7, LSL #1\n"
+ "add x9, x14, x15, LSL #1\n"
+ "add x28, x13, x4, LSL #1\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ "add x27, x8, x4, LSL #1\n"
+ ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ "add x26, x8, x17, LSL #1\n"
+ ".inst 0xf8ac49ba // rprfm pldonce, x12, [x13]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ "add x25, x13, x17, LSL #1\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ "add x24, x14, x4, LSL #1\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ "add x23, x5, x7, LSL #1\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ "add x22, x14, x17, LSL #1\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ "add x21, x16, x15, LSL #1\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "add x20, x13, x7, LSL #1\n"
+ ".inst 0xf8ac491a // rprfm pldonce, x12, [x8]\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x21, #0x3\n"
+ "ld1h { z18.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x26\n"
+ ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "mul x20, x2, x22\n" // offset = tile_i * ld_output_row
+ "cmp x26, %x[n_channels]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x20, x3, x27, x20\n" // offset += tile_j * ld_output_col
+ "add x24, x27, x27\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mul x20, x20, x21\n" // offset *= output_tile_size
+ "mov x21, #0x0\n"
+ "ld1h { z8.h }, p3/Z, [x6]\n"
+ "add x25, x25, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "sub x20, XZR, x26\n"
+ "ld1h { z9.h }, p2/Z, [x16, x7, LSL #1]\n"
+ "add x23, x25, x22, LSL #1\n"
+ "ld1h { z10.h }, p2/Z, [x5]\n"
+ "addvl x6, x6, #1\n"
+ "add x22, x23, x22, LSL #1\n"
+ "ld1h { z11.h }, p2/Z, [x5, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x13]\n"
+ "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x26, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "inch x26\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "inch x20\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z18.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x5, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x8]\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x8, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x14]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "addvl x8, x8, #1\n"
+ "ld1h { z12.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x5, x7, LSL #1]\n"
+ "addvl x5, x5, #1\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z10.h }, p1/Z, [x5]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x16]\n"
+ "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "addvl x16, x16, #1\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z9.h }, p1/Z, [x16, x7, LSL #1]\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "addvl x13, x13, #1\n"
+ "cmp x26, %x[n_channels]\n"
+ "ld1h { z11.h }, p1/Z, [x5, x15, LSL #1]\n"
+ "fmax z23.h, p3/M, z23.h, z17.h\n"
+ "ld1h { z12.h }, p1/Z, [x13]\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ ".inst 0xc170ca38 // fclamp { z24.h-z27.h }, z17.h, z16.h\n"
+ "ld1h { z13.h }, p1/Z, [x8, x7, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z26.h }, p0, [x23]\n"
+ "st1h { z27.h }, p0, [x23, x27, LSL #1]\n"
+ "st1h { z23.h }, p0, [x25]\n"
+ "st1h { z24.h }, p0, [x25, x27, LSL #1]\n"
+ "st1h { z25.h }, p0, [x25, x24, LSL #1]\n"
+ "addvl x25, x25, #1\n"
+ "st1h { z28.h }, p0, [x23, x24, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z29.h }, p0, [x22]\n"
+ "st1h { z30.h }, p0, [x22, x27, LSL #1]\n"
+ "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x3, x3, #0x1\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "add x20, x2, #0x1\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "cmp x3, x9\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "csel x2, x2, x20, LT\n"
+ "csel x3, x3, XZR, LT\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "cmp x2, x21\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x5, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x8]\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x8, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x14]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x5, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x16]\n"
+ "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmax z23.h, p3/M, z23.h, z17.h\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ ".inst 0xc170ca38 // fclamp { z24.h-z27.h }, z17.h, z16.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z26.h }, p0, [x23]\n"
+ "st1h { z27.h }, p0, [x23, x27, LSL #1]\n"
+ "st1h { z23.h }, p0, [x25]\n"
+ "st1h { z24.h }, p0, [x25, x27, LSL #1]\n"
+ "st1h { z25.h }, p0, [x25, x24, LSL #1]\n"
+ "st1h { z28.h }, p0, [x23, x24, LSL #1]\n"
+ "st1h { z29.h }, p0, [x22]\n"
+ "st1h { z30.h }, p0, [x22, x27, LSL #1]\n"
+ "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
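
Editor's note — one quirk of the 3x3-output kernel above: it produces nine accumulators, but the multi-register fclamp form used here covers four registers at a time, so z24-z27 and z28-z31 are clamped by two fclamp instructions while z23 gets an explicit fmax/fmin pair. All three spellings compute the same fused activation:

    static inline __fp16 activation(__fp16 x, __fp16 act_min, __fp16 act_max)
    {
        // fmax z, p/M, z, min ; fmin z, p/M, z, max — i.e. clamp(x, min, max)
        return x < act_min ? act_min : (x > act_max ? act_max : x);
    }
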
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..e85cb9e017
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldp x14, x13, [x16, #0x0]\n"
+ "ldp x12, x11, [x16, #0x10]\n"
+ "cnth x10\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1h { z17.h }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "ldr x9, [x16, #0x20]\n"
+ "cmp x10, %x[n_channels]\n"
+ ".inst 0xa040a220 // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "sub x27, XZR, x10\n"
+ ".inst 0xa040a224 // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z8.h }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "ld1h { z9.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z23, z17\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x16, #0x30]\n"
+ "inch x27\n"
+ "movprfx z25, z17\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x25, [x16, #0x38]\n"
+ "mov p1.b, p2.b\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ldr x24, [x16, #0x28]\n"
+ "whilelt p0.h, x10, %x[n_channels]\n"
+ "movprfx z29, z17\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x13, [x16, #0x48]\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "ldr x14, [x16, #0x40]\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ldr x12, [x16, #0x50]\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x11, [x16, #0x58]\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x9, [x16, #0x60]\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ldr x24, [x16, #0x68]\n"
+ "ld1h { z17.h }, p3/Z, [x17]\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
+ "ldr x26, [x16, #0x70]\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "ldr x25, [x16, #0x78]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ldr x14, [x16, #0x80]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "ldr x13, [x16, #0x88]\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ldr x12, [x16, #0x90]\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "ldr x23, [x28, #0x0]\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "ldr x22, [x28, #0x8]\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "ldr x9, [x16, #0xa0]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "ldr x11, [x16, #0x98]\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "ldr x21, [x28, #0x10]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xa8]\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xb8]\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "ldr x14, [x16, #0xc0]\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "ldr x20, [x28, #0x18]\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ldr x9, [x16, #0x20]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ldp x14, x13, [x16, #0x0]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmax z23.h, p3/M, z23.h, z18.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ldp x12, x11, [x16, #0x10]\n"
+ "inch x15\n"
+ ".inst 0xa040a220 // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ ".inst 0xc170ca58 // fclamp { z24.h-z27.h }, z18.h, z16.h\n"
+ "ld1h { z9.h }, p0/Z, [x14, x10, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z10.h }, p0/Z, [x13, x10, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ "ld1h { z11.h }, p0/Z, [x12, x10, LSL #1]\n"
+ "ld1h { z12.h }, p0/Z, [x11, x10, LSL #1]\n"
+ "st1h { z24.h }, p1, [x22, x27, LSL #1]\n"
+ "ldr x22, [x28, #0x28]\n"
+ "st1h { z25.h }, p1, [x21, x27, LSL #1]\n"
+ "ldr x21, [x28, #0x30]\n"
+ "ld1h { z13.h }, p0/Z, [x9, x10, LSL #1]\n"
+ "inch x10\n"
+ "st1h { z23.h }, p1, [x23, x27, LSL #1]\n"
+ "ldr x23, [x28, #0x20]\n"
+ ".inst 0xa040a224 // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "st1h { z26.h }, p1, [x20, x27, LSL #1]\n"
+ "ldr x20, [x28, #0x38]\n"
+ "cmp x10, %x[n_channels]\n"
+ ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
+ "ld1h { z8.h }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1h { z27.h }, p1, [x23, x27, LSL #1]\n"
+ "ldr x23, [x28, #0x40]\n"
+ "st1h { z28.h }, p1, [x22, x27, LSL #1]\n"
+ "st1h { z29.h }, p1, [x21, x27, LSL #1]\n"
+ "st1h { z30.h }, p1, [x20, x27, LSL #1]\n"
+ "st1h { z31.h }, p1, [x23, x27, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z23, z17\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x16, #0x30]\n"
+ "inch x27\n"
+ "movprfx z25, z17\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x25, [x16, #0x38]\n"
+ "mov p1.b, p2.b\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ldr x24, [x16, #0x28]\n"
+ "movprfx z29, z17\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x13, [x16, #0x48]\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "ldr x14, [x16, #0x40]\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ldr x12, [x16, #0x50]\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x11, [x16, #0x58]\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x9, [x16, #0x60]\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ldr x24, [x16, #0x68]\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
+ "ldr x26, [x16, #0x70]\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "ldr x25, [x16, #0x78]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ldr x14, [x16, #0x80]\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "ldr x13, [x16, #0x88]\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ldr x12, [x16, #0x90]\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "ldr x23, [x28, #0x0]\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "ldr x22, [x28, #0x8]\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "ldr x9, [x16, #0xa0]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "ldr x11, [x16, #0x98]\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "ldr x21, [x28, #0x10]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xa8]\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xb8]\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "ldr x14, [x16, #0xc0]\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "ldr x20, [x28, #0x18]\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmax z23.h, p3/M, z23.h, z18.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ ".inst 0xc170ca58 // fclamp { z24.h-z27.h }, z18.h, z16.h\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ "st1h { z24.h }, p1, [x22, x27, LSL #1]\n"
+ "ldr x22, [x28, #0x28]\n"
+ "st1h { z25.h }, p1, [x21, x27, LSL #1]\n"
+ "ldr x21, [x28, #0x30]\n"
+ "st1h { z26.h }, p1, [x20, x27, LSL #1]\n"
+ "ldr x20, [x28, #0x38]\n"
+ "st1h { z23.h }, p1, [x23, x27, LSL #1]\n"
+ "ldr x23, [x28, #0x20]\n"
+ ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
+ "st1h { z27.h }, p1, [x23, x27, LSL #1]\n"
+ "ldr x23, [x28, #0x40]\n"
+ "st1h { z28.h }, p1, [x22, x27, LSL #1]\n"
+ "st1h { z29.h }, p1, [x21, x27, LSL #1]\n"
+ "st1h { z30.h }, p1, [x20, x27, LSL #1]\n"
+ "st1h { z31.h }, p1, [x23, x27, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6b75d12295
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+);
+void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+);
+
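+// Two entry points are generated per kernel: the "indirect" variant consumes
+// a pre-gathered array of per-point input pointers, while the "direct"
+// variant walks a regular tile grid from a base pointer and row/column
+// strides. The strategy class below exposes both to the depthwise driver.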
+class sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..37a9febf47
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,672 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
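+ // Args carries the tile-grid dimensions, the input/output strides (held in
+ // elements; the assembly scales them by sizeof(__fp16) with LSL #1), and
+ // the tile_i/tile_j counters that the assembly persists across iterations
+ // of its tile loop.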
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
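+ // Label 1 is the tile loop: tile_i/tile_j are stored back into Args so the
+ // loop state survives the register-clobbering body. On the first column of
+ // each row (cbnz x2, 2f), the rprfm pldonce block prefetches all six input
+ // rows that feed a 4x4 output tile. Label 3 is the channel loop (bias in
+ // z15, filter taps in z0-z8) and label 4 the predicated channel tail;
+ // fclamp applies the activation bounds from z14 (min) and z13 (max).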
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x1, #0x0\n"
+ "mov x2, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "1:" // Tile loop
+ "str x1, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x4\n"
+ "str x2, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x1, x21\n" // offset = tile_i * ld_input_row
+ "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x2, x3, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "add x6, x3, x3\n"
+ "add x4, x4, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x7, x4, x21, LSL #1\n"
+ "add x8, x6, x3\n"
+ "add x17, x7, x21, LSL #1\n"
+ "add x16, x8, x3\n"
+ "add x15, x17, x21, LSL #1\n"
+ "add x14, x16, x3\n"
+ "add x13, x15, x21, LSL #1\n"
+ "add x12, x13, x21, LSL #1\n"
+ "cbnz x2, 2f\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "lsl x10, %x[n_channels], #0x1\n"
+ "mov x21, #0x8\n"
+ "mul x21, x21, x3\n"
+ "add x9, x17, x6, LSL #1\n"
+ "add x28, x4, x14, LSL #1\n"
+ "add x27, x17, x8, LSL #1\n"
+ "sub x20, x11, x2\n"
+ "add x26, x12, x14, LSL #1\n"
+ "sub x20, x20, #0x1\n"
+ "add x25, x15, x6, LSL #1\n"
+ "and x20, x20, #0x3fffff\n"
+ "add x24, x4, x3, LSL #1\n"
+ "orr x10, x10, x20, LSL #22\n"
+ "add x23, x4, x16, LSL #1\n"
+ "orr x10, x10, x21, LSL #38\n"
+ "add x22, x15, x8, LSL #1\n"
+ "add x21, x7, x14, LSL #1\n"
+ "add x20, x7, x6, LSL #1\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ "add x9, x13, x14, LSL #1\n"
+ ".inst 0xf8aa489a // rprfm pldonce, x10, [x4]\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ "add x28, x7, x8, LSL #1\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ "add x27, x12, x3, LSL #1\n"
+ ".inst 0xf8aa499a // rprfm pldonce, x10, [x12]\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ "add x26, x17, x3, LSL #1\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ "add x25, x12, x16, LSL #1\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ "add x24, x17, x16, LSL #1\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ "add x23, x4, x6, LSL #1\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ "add x22, x15, x3, LSL #1\n"
+ ".inst 0xf8aa48fa // rprfm pldonce, x10, [x7]\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ "add x21, x4, x8, LSL #1\n"
+ ".inst 0xf8aa49ba // rprfm pldonce, x10, [x13]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "add x20, x15, x16, LSL #1\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ "add x9, x17, x14, LSL #1\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ "add x28, x13, x6, LSL #1\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ "add x27, x15, x14, LSL #1\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ "add x26, x12, x6, LSL #1\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ "add x25, x13, x8, LSL #1\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ "add x24, x12, x8, LSL #1\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ "add x23, x7, x3, LSL #1\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ "add x22, x7, x16, LSL #1\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ "add x21, x13, x3, LSL #1\n"
+ ".inst 0xf8aa4a3a // rprfm pldonce, x10, [x17]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "add x20, x13, x16, LSL #1\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ ".inst 0xf8aa49fa // rprfm pldonce, x10, [x15]\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x21, #0x4\n"
+ "ld1h { z15.h }, p3/Z, [x5]\n"
+ "addvl x5, x5, #1\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x28\n"
+ ".inst 0xa040a0a0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x5]\n"
+ "addvl x5, x5, #4\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ ".inst 0xa040a0a4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x5]\n"
+ "addvl x5, x5, #4\n"
+ "mul x20, x1, x22\n" // offset = tile_i * ld_output_row
+ "cmp x28, %x[n_channels]\n"
+ "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x20, x2, x9, x20\n" // offset += tile_j * ld_output_col
+ "add x26, x9, x9\n"
+ "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mul x20, x20, x21\n" // offset *= output_tile_size
+ "add x25, x26, x9\n"
+ "ld1h { z8.h }, p3/Z, [x5]\n"
+ "add x27, x27, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "mov x21, #0x0\n"
+ "ld1h { z9.h }, p2/Z, [x17, x6, LSL #1]\n"
+ "add x24, x27, x22, LSL #1\n"
+ "sub x20, XZR, x28\n"
+ "ld1h { z10.h }, p2/Z, [x4]\n"
+ "add x23, x24, x22, LSL #1\n"
+ "ld1h { z11.h }, p2/Z, [x4, x14, LSL #1]\n"
+ "addvl x5, x5, #1\n"
+ "add x22, x23, x22, LSL #1\n"
+ "ld1h { z12.h }, p2/Z, [x17, x8, LSL #1]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
+ "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x28, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
+ "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "inch x28\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
+ "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
+ "inch x20\n"
+ "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
+ "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x15, x6, LSL #1]\n"
+ "fmla z16.h, p3/M, z0.h, z10.h\n"
+ "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x12]\n"
+ "fmla z22.h, p3/M, z4.h, z12.h\n"
+ "fmla z25.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "fmla z17.h, p3/M, z8.h, z12.h\n"
+ "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z21.h, p3/M, z7.h, z9.h\n"
+ "ld1h { z10.h }, p2/Z, [x15, x8, LSL #1]\n"
+ "fmla z18.h, p3/M, z7.h, z12.h\n"
+ "fmla z19.h, p3/M, z6.h, z12.h\n"
+ "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
+ "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x4, x3, LSL #1]\n"
+ "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x4, x16, LSL #1]\n"
+ "fmla z25.h, p3/M, z4.h, z9.h\n"
+ "fmla z26.h, p3/M, z3.h, z9.h\n"
+ "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
+ "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z15.h }, p3/Z, [x5]\n"
+ "addvl x5, x5, #1\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z24.h, p3/M, z5.h, z9.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z21.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x7]\n"
+ "fmla z16.h, p3/M, z1.h, z12.h\n"
+ "fmla z17.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x7, x14, LSL #1]\n"
+ "fmla z18.h, p3/M, z2.h, z11.h\n"
+ "fmla z19.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13]\n"
+ "fmla z22.h, p3/M, z7.h, z10.h\n"
+ "fmla z23.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z10.h\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x7, x6, LSL #1]\n"
+ "fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z16.h, p3/M, z3.h, z9.h\n"
+ "fmla z21.h, p3/M, z1.h, z10.h\n"
+ "fmla z19.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z2.h, z12.h\n"
+ "fmla z17.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x7, x8, LSL #1]\n"
+ "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z22.h, p3/M, z0.h, z10.h\n"
+ "fmla z27.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x3, LSL #1]\n"
+ "fmla z20.h, p3/M, z2.h, z10.h\n"
+ "fmla z21.h, p3/M, z2.h, z12.h\n"
+ "fmla z16.h, p3/M, z5.h, z10.h\n"
+ "fmla z17.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x17, x3, LSL #1]\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmla z19.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z1.h, z12.h\n"
+ "fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x17, x16, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z11.h\n"
+ "fmla z29.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x16, LSL #1]\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z21.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "fmla z16.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x4, x6, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x15, x3, LSL #1]\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z19.h, p3/M, z7.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x4, x8, LSL #1]\n"
+ "addvl x4, x4, #1\n"
+ "fmla z20.h, p3/M, z7.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z11.h\n"
+ "fmla z24.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x15, x16, LSL #1]\n"
+ "fmla z16.h, p3/M, z2.h, z10.h\n"
+ "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "fmla z18.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x17]\n"
+ "fmla z19.h, p3/M, z0.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z20.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z12.h\n"
+ "fmla z18.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x17, x14, LSL #1]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z16.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x15]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z25.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z9.h }, p1/Z, [x17, x6, LSL #1]\n"
+ "fmla z19.h, p3/M, z8.h, z12.h\n"
+ "fmla z23.h, p3/M, z5.h, z12.h\n"
+ "fmla z27.h, p3/M, z2.h, z12.h\n"
+ "fmla z20.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x15, x14, LSL #1]\n"
+ "addvl x15, x15, #1\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x6, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmla z27.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x8, LSL #1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, p3/M, z8.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x8, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z26.h, p3/M, z7.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x7, x16, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x7, x3, LSL #1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "fmla z31.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x3, LSL #1]\n"
+ "fmla z18.h, p3/M, z5.h, z11.h\n"
+ "fmla z19.h, p3/M, z4.h, z11.h\n"
+ "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmla z20.h, p3/M, z1.h, z10.h\n"
+ "fmla z21.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "fmla z22.h, p3/M, z2.h, z11.h\n"
+ "fmla z23.h, p3/M, z1.h, z11.h\n"
+ "cmp x28, %x[n_channels]\n"
+ "addvl x13, x13, #1\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z11.h }, p1/Z, [x4, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ ".inst 0xa040a0a0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x5]\n"
+ "addvl x5, x5, #4\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z12.h }, p1/Z, [x17, x8, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ ".inst 0xa040a0a4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x5]\n"
+ "addvl x5, x5, #4\n"
+ ".inst 0xc16dc9d0 // fclamp { z16.h-z19.h }, z14.h, z13.h\n"
+ ".inst 0xc16dc9d4 // fclamp { z20.h-z23.h }, z14.h, z13.h\n"
+ "ld1h { z10.h }, p1/Z, [x4]\n"
+ "ld1h { z8.h }, p3/Z, [x5]\n"
+ "addvl x5, x5, #1\n"
+ ".inst 0xc16dc9d8 // fclamp { z24.h-z27.h }, z14.h, z13.h\n"
+ ".inst 0xc16dc9dc // fclamp { z28.h-z31.h }, z14.h, z13.h\n"
+ "st1h { z16.h }, p0, [x27]\n"
+ "st1h { z17.h }, p0, [x27, x9, LSL #1]\n"
+ "st1h { z18.h }, p0, [x27, x26, LSL #1]\n"
+ "st1h { z19.h }, p0, [x27, x25, LSL #1]\n"
+ "addvl x27, x27, #1\n"
+ "st1h { z20.h }, p0, [x24]\n"
+ "st1h { z21.h }, p0, [x24, x9, LSL #1]\n"
+ "st1h { z22.h }, p0, [x24, x26, LSL #1]\n"
+ "st1h { z23.h }, p0, [x24, x25, LSL #1]\n"
+ "addvl x24, x24, #1\n"
+ "st1h { z24.h }, p0, [x23]\n"
+ "st1h { z25.h }, p0, [x23, x9, LSL #1]\n"
+ "st1h { z26.h }, p0, [x23, x26, LSL #1]\n"
+ "st1h { z27.h }, p0, [x23, x25, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z28.h }, p0, [x22]\n"
+ "st1h { z29.h }, p0, [x22, x9, LSL #1]\n"
+ "st1h { z30.h }, p0, [x22, x26, LSL #1]\n"
+ "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
+ "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
+ "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "ldr x1, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
+ "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
+ "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x2, x2, #0x1\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x15, x6, LSL #1]\n"
+ "add x20, x1, #0x1\n"
+ "fmla z16.h, p3/M, z0.h, z10.h\n"
+ "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x12]\n"
+ "cmp x2, x11\n"
+ "fmla z22.h, p3/M, z4.h, z12.h\n"
+ "fmla z25.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "csel x1, x1, x20, LT\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "fmla z17.h, p3/M, z8.h, z12.h\n"
+ "csel x2, x2, XZR, LT\n"
+ "cmp x1, x21\n"
+ "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z21.h, p3/M, z7.h, z9.h\n"
+ "ld1h { z10.h }, p2/Z, [x15, x8, LSL #1]\n"
+ "fmla z18.h, p3/M, z7.h, z12.h\n"
+ "fmla z19.h, p3/M, z6.h, z12.h\n"
+ "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
+ "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x4, x3, LSL #1]\n"
+ "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x4, x16, LSL #1]\n"
+ "fmla z25.h, p3/M, z4.h, z9.h\n"
+ "fmla z26.h, p3/M, z3.h, z9.h\n"
+ "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
+ "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z24.h, p3/M, z5.h, z9.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z21.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x7]\n"
+ "fmla z16.h, p3/M, z1.h, z12.h\n"
+ "fmla z17.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x7, x14, LSL #1]\n"
+ "fmla z18.h, p3/M, z2.h, z11.h\n"
+ "fmla z19.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13]\n"
+ "fmla z22.h, p3/M, z7.h, z10.h\n"
+ "fmla z23.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z10.h\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x7, x6, LSL #1]\n"
+ "fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z16.h, p3/M, z3.h, z9.h\n"
+ "fmla z21.h, p3/M, z1.h, z10.h\n"
+ "fmla z19.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z2.h, z12.h\n"
+ "fmla z17.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x7, x8, LSL #1]\n"
+ "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z22.h, p3/M, z0.h, z10.h\n"
+ "fmla z27.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x3, LSL #1]\n"
+ "fmla z20.h, p3/M, z2.h, z10.h\n"
+ "fmla z21.h, p3/M, z2.h, z12.h\n"
+ "fmla z16.h, p3/M, z5.h, z10.h\n"
+ "fmla z17.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x17, x3, LSL #1]\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmla z19.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z1.h, z12.h\n"
+ "fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x17, x16, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z11.h\n"
+ "fmla z29.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x16, LSL #1]\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z21.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "fmla z16.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x4, x6, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x15, x3, LSL #1]\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z19.h, p3/M, z7.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x4, x8, LSL #1]\n"
+ "fmla z20.h, p3/M, z7.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z11.h\n"
+ "fmla z24.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x15, x16, LSL #1]\n"
+ "fmla z16.h, p3/M, z2.h, z10.h\n"
+ "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "fmla z18.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x17]\n"
+ "fmla z19.h, p3/M, z0.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z20.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z12.h\n"
+ "fmla z18.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x17, x14, LSL #1]\n"
+ "fmla z16.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x15]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z25.h, p3/M, z7.h, z11.h\n"
+ "fmla z19.h, p3/M, z8.h, z12.h\n"
+ "fmla z23.h, p3/M, z5.h, z12.h\n"
+ "fmla z27.h, p3/M, z2.h, z12.h\n"
+ "fmla z20.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x15, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x6, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmla z27.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x8, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x8, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z26.h, p3/M, z7.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x7, x16, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x7, x3, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "fmla z31.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x3, LSL #1]\n"
+ "fmla z18.h, p3/M, z5.h, z11.h\n"
+ "fmla z19.h, p3/M, z4.h, z11.h\n"
+ "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmla z20.h, p3/M, z1.h, z10.h\n"
+ "fmla z21.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "fmla z22.h, p3/M, z2.h, z11.h\n"
+ "fmla z23.h, p3/M, z1.h, z11.h\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ ".inst 0xc16dc9d0 // fclamp { z16.h-z19.h }, z14.h, z13.h\n"
+ ".inst 0xc16dc9d4 // fclamp { z20.h-z23.h }, z14.h, z13.h\n"
+ ".inst 0xc16dc9d8 // fclamp { z24.h-z27.h }, z14.h, z13.h\n"
+ ".inst 0xc16dc9dc // fclamp { z28.h-z31.h }, z14.h, z13.h\n"
+ "st1h { z16.h }, p0, [x27]\n"
+ "st1h { z17.h }, p0, [x27, x9, LSL #1]\n"
+ "st1h { z18.h }, p0, [x27, x26, LSL #1]\n"
+ "st1h { z19.h }, p0, [x27, x25, LSL #1]\n"
+ "st1h { z20.h }, p0, [x24]\n"
+ "st1h { z21.h }, p0, [x24, x9, LSL #1]\n"
+ "st1h { z22.h }, p0, [x24, x26, LSL #1]\n"
+ "st1h { z23.h }, p0, [x24, x25, LSL #1]\n"
+ "st1h { z24.h }, p0, [x23]\n"
+ "st1h { z25.h }, p0, [x23, x9, LSL #1]\n"
+ "st1h { z26.h }, p0, [x23, x26, LSL #1]\n"
+ "st1h { z27.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z28.h }, p0, [x22]\n"
+ "st1h { z29.h }, p0, [x22, x9, LSL #1]\n"
+ "st1h { z30.h }, p0, [x22, x26, LSL #1]\n"
+ "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..2e6f1123a4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
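+ // As in the 3x3 output variant, Args exposes a fixed layout for offsetof()
+ // addressing from the assembly; here the 36 input pointers cover the 6x6
+ // input patch that feeds a 4x4 output tile at stride 1, again permuted
+ // into the order in which the loads consume them.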
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
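+ // Same structure as the 3x3 output variant: label 1 is the channel loop
+ // and label 2 the predicated tail. Here the bias vector is z14, the filter
+ // taps are z0-z8, and fclamp clamps against z15 (min) and z13 (max).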
+ __asm__ __volatile__(
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldp x14, x13, [x16, #0x0]\n"
+ "ldp x12, x11, [x16, #0x10]\n"
+ "cnth x10\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1h { z14.h }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "cmp x10, %x[n_channels]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ ".inst 0xa040a220 // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "sub x28, XZR, x10\n"
+ ".inst 0xa040a224 // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z8.h }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "ld1h { z9.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z21, z14\n fmla z21.h, p3/M, z4.h, z9.h\n"
+ "movprfx z16, z14\n fmla z16.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "inch x28\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z3.h, z9.h\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "ldr x26, [x16, #0x30]\n"
+ "mov p1.b, p2.b\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z0.h, z9.h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "movprfx z17, z14\n fmla z17.h, p3/M, z7.h, z9.h\n"
+ "whilelt p0.h, x10, %x[n_channels]\n"
+ "movprfx z18, z14\n fmla z18.h, p3/M, z6.h, z9.h\n"
+ "movprfx z20, z14\n fmla z20.h, p3/M, z5.h, z9.h\n"
+ "ldr x24, [x16, #0x38]\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x14, [x16, #0x40]\n"
+ "fmla z16.h, p3/M, z0.h, z10.h\n"
+ "movprfx z19, z14\n fmla z19.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ldr x13, [x16, #0x48]\n"
+ "fmla z22.h, p3/M, z4.h, z12.h\n"
+ "fmla z25.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x12, [x16, #0x50]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "fmla z17.h, p3/M, z8.h, z12.h\n"
+ "ldr x27, [x16, #0x60]\n"
+ "fmla z18.h, p3/M, z7.h, z12.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla z21.h, p3/M, z7.h, z9.h\n"
+ "fmla z19.h, p3/M, z6.h, z12.h\n"
+ "ldr x11, [x16, #0x58]\n"
+ "movprfx z23, z14\n fmla z23.h, p3/M, z3.h, z12.h\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x70]\n"
+ "movprfx z31, z14\n fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ldr x24, [x16, #0x78]\n"
+ "fmla z25.h, p3/M, z4.h, z9.h\n"
+ "fmla z26.h, p3/M, z3.h, z9.h\n"
+ "ldr x14, [x16, #0x80]\n"
+ "movprfx z29, z14\n fmla z29.h, p3/M, z1.h, z9.h\n"
+ "movprfx z30, z14\n fmla z30.h, p3/M, z0.h, z9.h\n"
+ "ldr x13, [x16, #0x88]\n"
+ "ld1h { z14.h }, p3/Z, [x17]\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z24.h, p3/M, z5.h, z9.h\n"
+ "ldr x23, [x9, #0x0]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z16.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ldr x12, [x16, #0x90]\n"
+ "fmla z17.h, p3/M, z0.h, z12.h\n"
+ "fmla z18.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "ldr x11, [x16, #0x98]\n"
+ "fmla z21.h, p3/M, z8.h, z10.h\n"
+ "fmla z19.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0xa0]\n"
+ "fmla z22.h, p3/M, z7.h, z10.h\n"
+ "fmla z23.h, p3/M, z6.h, z10.h\n"
+ "ldr x22, [x9, #0x8]\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "ldr x21, [x9, #0x10]\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z10.h\n"
+ "ldr x20, [x9, #0x18]\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z16.h, p3/M, z3.h, z9.h\n"
+ "fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "fmla z17.h, p3/M, z4.h, z10.h\n"
+ "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z1.h, z10.h\n"
+ "fmla z19.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z2.h, z12.h\n"
+ "fmla z22.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xb8]\n"
+ "fmla z27.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ldr x14, [x16, #0xc0]\n"
+ "fmla z16.h, p3/M, z5.h, z10.h\n"
+ "fmla z20.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x13, [x16, #0xc8]\n"
+ "fmla z17.h, p3/M, z5.h, z12.h\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmla z21.h, p3/M, z2.h, z12.h\n"
+ "fmla z19.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z1.h, z12.h\n"
+ "fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "ldr x11, [x16, #0xd8]\n"
+ "fmla z28.h, p3/M, z7.h, z11.h\n"
+ "fmla z29.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ldr x12, [x16, #0xd0]\n"
+ "fmla z16.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z6.h, z10.h\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z21.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z19.h, p3/M, z7.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xf0]\n"
+ "fmla z16.h, p3/M, z2.h, z10.h\n"
+ "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "fmla z18.h, p3/M, z0.h, z10.h\n"
+ "fmla z20.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla z21.h, p3/M, z6.h, z11.h\n"
+ "fmla z24.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z12.h\n"
+ "ldr x14, [x16, #0x100]\n"
+ "fmla z18.h, p3/M, z1.h, z12.h\n"
+ "fmla z19.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x13, [x16, #0x108]\n"
+ "fmla z16.h, p3/M, z6.h, z10.h\n"
+ "fmla z20.h, p3/M, z3.h, z10.h\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ldr x12, [x16, #0x110]\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z19.h, p3/M, z8.h, z12.h\n"
+ "ldr x11, [x16, #0x118]\n"
+ "fmla z27.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z20.h, p3/M, z6.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z23.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z25.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmla z24.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z5.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z7.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z11.h\n"
+ "fmla z29.h, p3/M, z5.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "ldp x14, x13, [x16, #0x0]\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmla z20.h, p3/M, z1.h, z10.h\n"
+ "fmla z21.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z18.h, p3/M, z5.h, z11.h\n"
+ "fmla z19.h, p3/M, z4.h, z11.h\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z9.h }, p0/Z, [x14, x10, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "fmla z31.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ldp x12, x11, [x16, #0x10]\n"
+ "fmla z22.h, p3/M, z2.h, z11.h\n"
+ "fmla z23.h, p3/M, z1.h, z11.h\n"
+ "inch x15\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ ".inst 0xc16dc9f0 // fclamp { z16.h-z19.h }, z15.h, z13.h\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z11.h }, p0/Z, [x12, x10, LSL #1]\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ ".inst 0xc16dc9f4 // fclamp { z20.h-z23.h }, z15.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p0/Z, [x11, x10, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p0/Z, [x13, x10, LSL #1]\n"
+ "inch x10\n"
+ "st1h { z16.h }, p1, [x23, x28, LSL #1]\n"
+ "ldr x23, [x9, #0x20]\n"
+ ".inst 0xa040a220 // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "st1h { z17.h }, p1, [x22, x28, LSL #1]\n"
+ "ldr x22, [x9, #0x28]\n"
+ ".inst 0xc16dc9f8 // fclamp { z24.h-z27.h }, z15.h, z13.h\n"
+ ".inst 0xa040a224 // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
+ "st1h { z18.h }, p1, [x21, x28, LSL #1]\n"
+ "ldr x21, [x9, #0x30]\n"
+ "addvl x17, x17, #4\n"
+ "cmp x10, %x[n_channels]\n"
+ "st1h { z19.h }, p1, [x20, x28, LSL #1]\n"
+ "ldr x20, [x9, #0x38]\n"
+ ".inst 0xc16dc9fc // fclamp { z28.h-z31.h }, z15.h, z13.h\n"
+ "ld1h { z8.h }, p3/Z, [x17]\n"
+ "st1h { z20.h }, p1, [x23, x28, LSL #1]\n"
+ "ldr x23, [x9, #0x40]\n"
+ "addvl x17, x17, #1\n"
+ "st1h { z21.h }, p1, [x22, x28, LSL #1]\n"
+ "ldr x22, [x9, #0x48]\n"
+ "st1h { z22.h }, p1, [x21, x28, LSL #1]\n"
+ "ldr x21, [x9, #0x50]\n"
+ "st1h { z23.h }, p1, [x20, x28, LSL #1]\n"
+ "ldr x20, [x9, #0x58]\n"
+ "st1h { z24.h }, p1, [x23, x28, LSL #1]\n"
+ "ldr x23, [x9, #0x60]\n"
+ "st1h { z25.h }, p1, [x22, x28, LSL #1]\n"
+ "ldr x22, [x9, #0x68]\n"
+ "st1h { z26.h }, p1, [x21, x28, LSL #1]\n"
+ "ldr x21, [x9, #0x70]\n"
+ "st1h { z27.h }, p1, [x20, x28, LSL #1]\n"
+ "ldr x20, [x9, #0x78]\n"
+ "st1h { z28.h }, p1, [x23, x28, LSL #1]\n"
+ "st1h { z29.h }, p1, [x22, x28, LSL #1]\n"
+ "st1h { z30.h }, p1, [x21, x28, LSL #1]\n"
+ "st1h { z31.h }, p1, [x20, x28, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z21, z14\n fmla z21.h, p3/M, z4.h, z9.h\n"
+ "movprfx z16, z14\n fmla z16.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "inch x28\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z3.h, z9.h\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "ldr x26, [x16, #0x30]\n"
+ "mov p1.b, p2.b\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z0.h, z9.h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "movprfx z17, z14\n fmla z17.h, p3/M, z7.h, z9.h\n"
+ "movprfx z18, z14\n fmla z18.h, p3/M, z6.h, z9.h\n"
+ "movprfx z20, z14\n fmla z20.h, p3/M, z5.h, z9.h\n"
+ "ldr x24, [x16, #0x38]\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x14, [x16, #0x40]\n"
+ "fmla z16.h, p3/M, z0.h, z10.h\n"
+ "movprfx z19, z14\n fmla z19.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ldr x13, [x16, #0x48]\n"
+ "fmla z22.h, p3/M, z4.h, z12.h\n"
+ "fmla z25.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x12, [x16, #0x50]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "fmla z17.h, p3/M, z8.h, z12.h\n"
+ "ldr x27, [x16, #0x60]\n"
+ "fmla z18.h, p3/M, z7.h, z12.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla z21.h, p3/M, z7.h, z9.h\n"
+ "fmla z19.h, p3/M, z6.h, z12.h\n"
+ "ldr x11, [x16, #0x58]\n"
+ "movprfx z23, z14\n fmla z23.h, p3/M, z3.h, z12.h\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x70]\n"
+ "movprfx z31, z14\n fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ldr x24, [x16, #0x78]\n"
+ "fmla z25.h, p3/M, z4.h, z9.h\n"
+ "fmla z26.h, p3/M, z3.h, z9.h\n"
+ "ldr x14, [x16, #0x80]\n"
+ "movprfx z29, z14\n fmla z29.h, p3/M, z1.h, z9.h\n"
+ "movprfx z30, z14\n fmla z30.h, p3/M, z0.h, z9.h\n"
+ "ldr x13, [x16, #0x88]\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z24.h, p3/M, z5.h, z9.h\n"
+ "ldr x23, [x9, #0x0]\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z16.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ldr x12, [x16, #0x90]\n"
+ "fmla z17.h, p3/M, z0.h, z12.h\n"
+ "fmla z18.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "ldr x11, [x16, #0x98]\n"
+ "fmla z21.h, p3/M, z8.h, z10.h\n"
+ "fmla z19.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0xa0]\n"
+ "fmla z22.h, p3/M, z7.h, z10.h\n"
+ "fmla z23.h, p3/M, z6.h, z10.h\n"
+ "ldr x22, [x9, #0x8]\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "ldr x21, [x9, #0x10]\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z10.h\n"
+ "ldr x20, [x9, #0x18]\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z16.h, p3/M, z3.h, z9.h\n"
+ "fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "fmla z17.h, p3/M, z4.h, z10.h\n"
+ "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z1.h, z10.h\n"
+ "fmla z19.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z2.h, z12.h\n"
+ "fmla z22.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xb8]\n"
+ "fmla z27.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ldr x14, [x16, #0xc0]\n"
+ "fmla z16.h, p3/M, z5.h, z10.h\n"
+ "fmla z20.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x13, [x16, #0xc8]\n"
+ "fmla z17.h, p3/M, z5.h, z12.h\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmla z21.h, p3/M, z2.h, z12.h\n"
+ "fmla z19.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z1.h, z12.h\n"
+ "fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "ldr x11, [x16, #0xd8]\n"
+ "fmla z28.h, p3/M, z7.h, z11.h\n"
+ "fmla z29.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ldr x12, [x16, #0xd0]\n"
+ "fmla z16.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z6.h, z10.h\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z21.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z19.h, p3/M, z7.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xf0]\n"
+ "fmla z16.h, p3/M, z2.h, z10.h\n"
+ "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "fmla z18.h, p3/M, z0.h, z10.h\n"
+ "fmla z20.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla z21.h, p3/M, z6.h, z11.h\n"
+ "fmla z24.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z12.h\n"
+ "ldr x14, [x16, #0x100]\n"
+ "fmla z18.h, p3/M, z1.h, z12.h\n"
+ "fmla z19.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x13, [x16, #0x108]\n"
+ "fmla z16.h, p3/M, z6.h, z10.h\n"
+ "fmla z20.h, p3/M, z3.h, z10.h\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ldr x12, [x16, #0x110]\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z19.h, p3/M, z8.h, z12.h\n"
+ "ldr x11, [x16, #0x118]\n"
+ "fmla z27.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z20.h, p3/M, z6.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z23.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z25.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmla z24.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z5.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z7.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z11.h\n"
+ "fmla z29.h, p3/M, z5.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmla z20.h, p3/M, z1.h, z10.h\n"
+ "fmla z21.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z18.h, p3/M, z5.h, z11.h\n"
+ "fmla z19.h, p3/M, z4.h, z11.h\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "fmla z31.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z2.h, z11.h\n"
+ "fmla z23.h, p3/M, z1.h, z11.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ ".inst 0xc16dc9f0 // fclamp { z16.h-z19.h }, z15.h, z13.h\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ ".inst 0xc16dc9f4 // fclamp { z20.h-z23.h }, z15.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "st1h { z16.h }, p1, [x23, x28, LSL #1]\n"
+ "ldr x23, [x9, #0x20]\n"
+ "st1h { z17.h }, p1, [x22, x28, LSL #1]\n"
+ "ldr x22, [x9, #0x28]\n"
+ "st1h { z18.h }, p1, [x21, x28, LSL #1]\n"
+ "ldr x21, [x9, #0x30]\n"
+ ".inst 0xc16dc9f8 // fclamp { z24.h-z27.h }, z15.h, z13.h\n"
+ "st1h { z19.h }, p1, [x20, x28, LSL #1]\n"
+ "ldr x20, [x9, #0x38]\n"
+ "st1h { z20.h }, p1, [x23, x28, LSL #1]\n"
+ "ldr x23, [x9, #0x40]\n"
+ ".inst 0xc16dc9fc // fclamp { z28.h-z31.h }, z15.h, z13.h\n"
+ "st1h { z21.h }, p1, [x22, x28, LSL #1]\n"
+ "ldr x22, [x9, #0x48]\n"
+ "st1h { z22.h }, p1, [x21, x28, LSL #1]\n"
+ "ldr x21, [x9, #0x50]\n"
+ "st1h { z23.h }, p1, [x20, x28, LSL #1]\n"
+ "ldr x20, [x9, #0x58]\n"
+ "st1h { z24.h }, p1, [x23, x28, LSL #1]\n"
+ "ldr x23, [x9, #0x60]\n"
+ "st1h { z25.h }, p1, [x22, x28, LSL #1]\n"
+ "ldr x22, [x9, #0x68]\n"
+ "st1h { z26.h }, p1, [x21, x28, LSL #1]\n"
+ "ldr x21, [x9, #0x70]\n"
+ "st1h { z27.h }, p1, [x20, x28, LSL #1]\n"
+ "ldr x20, [x9, #0x78]\n"
+ "st1h { z28.h }, p1, [x23, x28, LSL #1]\n"
+ "st1h { z29.h }, p1, [x22, x28, LSL #1]\n"
+ "st1h { z30.h }, p1, [x21, x28, LSL #1]\n"
+ "st1h { z31.h }, p1, [x20, x28, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
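The fclamp instructions in the loops above implement the fused activation: each group of four accumulator registers is clamped between the broadcast minimum and maximum (loaded once from Args::min and Args::max; z15/z13 in this kernel) before the stores. A minimal scalar sketch of that per-lane clamp, assuming only that the bounds are the activation limits passed into the kernel:

// Scalar model (illustrative, not part of the patch) of the per-lane clamp
// performed by "fclamp { zA.h-zD.h }, z_min.h, z_max.h" in the asm above.
static inline __fp16 clamp_activation(__fp16 v, __fp16 act_min, __fp16 act_max)
{
    return v < act_min ? act_min : (v > act_max ? act_max : v);
}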
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..27fcb2e6d2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
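For orientation, the getters above return plain function pointers to the two implementations declared at the top of this header, so a caller invokes them directly with the declared argument lists. A hedged usage sketch (the wrapper name is illustrative and assumes this header is on the include path):

// Illustrative wrapper around the strategy's direct kernel; the argument
// list mirrors the ..._direct_impl declaration above.
void run_direct_tile(const sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst &strat,
                     unsigned int n_tile_rows, unsigned int n_tile_cols,
                     const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col,
                     __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col,
                     const void *params, unsigned int n_channels,
                     __fp16 act_min, __fp16 act_max)
{
    auto kernel = strat.get_direct_kernel(); // == ..._direct_impl
    kernel(n_tile_rows, n_tile_cols, inptr, ld_input_row, ld_input_col,
           outptr, ld_output_row, ld_output_col, params, n_channels,
           act_min, act_max);
}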
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..066ce06aa6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x4\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "add x7, x4, x4\n"
+ "add x5, x5, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x8, x5, x21, LSL #1\n"
+ "add x17, x7, x4\n"
+ "add x16, x8, x21, LSL #1\n"
+ "add x15, x17, x4\n"
+ "add x14, x16, x21, LSL #1\n"
+ "add x13, x14, x21, LSL #1\n"
+ "cbnz x3, 2f\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "lsl x12, %x[n_channels], #0x1\n"
+ "mov x28, #0x8\n"
+ "mul x28, x28, x4\n"
+ "add x27, x16, x7, LSL #1\n"
+ "add x26, x5, x4, LSL #1\n"
+ "add x25, x5, x17, LSL #1\n"
+ "sub x20, x24, x3\n"
+ "add x24, x5, x15, LSL #1\n"
+ "sub x20, x20, #0x1\n"
+ "add x23, x8, x4, LSL #1\n"
+ "and x20, x20, #0x3fffff\n"
+ "add x22, x5, x7, LSL #1\n"
+ "orr x12, x12, x20, LSL #22\n"
+ "add x21, x8, x17, LSL #1\n"
+ "orr x12, x12, x28, LSL #38\n"
+ "add x20, x8, x15, LSL #1\n"
+ "add x11, x8, x7, LSL #1\n"
+ "add x10, x14, x4, LSL #1\n"
+ "add x9, x16, x4, LSL #1\n"
+ "add x28, x14, x17, LSL #1\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ "add x27, x16, x17, LSL #1\n"
+ ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ "add x26, x14, x15, LSL #1\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ "add x25, x16, x15, LSL #1\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ "add x24, x13, x4, LSL #1\n"
+ ".inst 0xf8ac491a // rprfm pldonce, x12, [x8]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ "add x23, x14, x7, LSL #1\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ "add x22, x13, x17, LSL #1\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ "add x21, x13, x7, LSL #1\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "add x20, x13, x15, LSL #1\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac49ba // rprfm pldonce, x12, [x13]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x20, #0x2\n"
+ "ld1h { z19.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x24\n"
+ ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "mul x22, x2, x26\n" // offset = tile_i * ld_output_row
+ "cmp x24, %x[n_channels]\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x22, x3, x25, x22\n" // offset += tile_j * ld_output_col
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mov x21, #0x0\n"
+ "mul x22, x22, x20\n" // offset *= output_tile_size
+ "sub x20, XZR, x24\n"
+ "ld1h { z8.h }, p3/Z, [x6]\n"
+ "add x23, x23, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z9.h }, p2/Z, [x16, x7, LSL #1]\n"
+ "addvl x6, x6, #1\n"
+ "add x22, x23, x26, LSL #1\n"
+ "ld1h { z10.h }, p2/Z, [x5]\n"
+ "ld1h { z11.h }, p2/Z, [x5, x4, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x5, x15, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x8]\n"
+ "ld1h { z15.h }, p2/Z, [x8, x4, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x5, x7, LSL #1]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "whilelt p1.h, x24, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z19.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "inch x24\n"
+ "mov p0.b, p2.b\n"
+ "addvl x5, x5, #1\n"
+ "inch x20\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x15, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x5]\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z28.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x14]\n"
+ "fmla z29.h, p3/M, z0.h, z16.h\n"
+ "fmla z28.h, p3/M, z4.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x16]\n"
+ "fmla z30.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z15.h\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "addvl x16, x16, #1\n"
+ "ld1h { z9.h }, p1/Z, [x16, x7, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z28.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x13]\n"
+ "fmla z29.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x5, x17, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z30.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p1/Z, [x5, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "cmp x24, %x[n_channels]\n"
+ "addvl x13, x13, #1\n"
+ "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "fmla z31.h, p3/M, z3.h, z16.h\n"
+ ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "ld1h { z16.h }, p1/Z, [x5, x7, LSL #1]\n"
+ "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "ld1h { z14.h }, p1/Z, [x8]\n"
+ "fmla z30.h, p3/M, z8.h, z15.h\n"
+ "fmla z31.h, p3/M, z6.h, z15.h\n"
+ ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "ld1h { z15.h }, p1/Z, [x8, x4, LSL #1]\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x5, x4, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
+ "st1h { z28.h }, p0, [x23]\n"
+ "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x3, x3, #0x1\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x15, LSL #1]\n"
+ "add x20, x2, #0x1\n"
+ "cmp x3, x24\n"
+ "csel x2, x2, x20, LT\n"
+ "csel x3, x3, XZR, LT\n"
+ "cmp x2, x21\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x14]\n"
+ "fmla z29.h, p3/M, z0.h, z16.h\n"
+ "fmla z28.h, p3/M, z4.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x16]\n"
+ "fmla z30.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z15.h\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x13]\n"
+ "fmla z29.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z1.h, z16.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "fmla z31.h, p3/M, z3.h, z16.h\n"
+ "fmla z30.h, p3/M, z8.h, z15.h\n"
+ "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "fmla z31.h, p3/M, z6.h, z15.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
+ "st1h { z28.h }, p0, [x23]\n"
+ "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
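The address arithmetic that the tile loop documents in its comments reduces to a few scalar lines. With kernel stride 2 and a 2x2 output tile, each tile index advances the input by kernel_stride * output_size = 4 elements; a hedged C++ restatement (helper name illustrative):

#include <cstdint>

// Restates the asm comments: "offset = tile_i * ld_input_row",
// "offset += tile_j * ld_input_col", "offset *= kernel_stride * output_size".
// The asm scales the element offset by sizeof(__fp16) via LSL #1; C++
// pointer arithmetic on __fp16* does that implicitly.
static const __fp16 *tile_inptr(const __fp16 *inptr,
                                uint64_t tile_i, uint64_t tile_j,
                                int64_t ld_input_row, int64_t ld_input_col)
{
    int64_t offset = int64_t(tile_i) * ld_input_row;
    offset += int64_t(tile_j) * ld_input_col;
    offset *= 4; // kernel_stride (2) * output_size (2)
    return inptr + offset;
}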
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..1bf3a84959
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "cnth x13\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x12, x11, [x20, #0x0]\n"
+ "ldp x10, x9, [x20, #0x10]\n"
+ "cmp x13, %x[n_channels]\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x28, XZR, x13\n"
+ "ld1h { z17.h }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ldp x27, x26, [x16, #0x0]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ ".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1h { z8.h }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ld1h { z9.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z15.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z17\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x27, [x16, #0x40]\n"
+ "whilelt p1.h, x13, %x[n_channels]\n"
+ "ldr x26, [x16, #0x48]\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z17.h }, p3/Z, [x14]\n"
+ "ldr x25, [x16, #0x50]\n"
+ "addvl x14, x14, #1\n"
+ "inch x28\n"
+ "ldr x24, [x16, #0x58]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x27, [x16, #0x80]\n"
+ "ldr x26, [x16, #0x88]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z28.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z16.h\n"
+ "ldr x24, [x16, #0x98]\n"
+ "ldr x25, [x16, #0x90]\n"
+ "fmla z30.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z4.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldr x23, [x16, #0xa0]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x22, [x16, #0xa8]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "fmla z30.h, p3/M, z0.h, z15.h\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0xc0]\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z1.h, z16.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x0]\n"
+ "inch x15\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1h { z9.h }, p1/Z, [x27, x13, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z10.h }, p1/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x24, x13, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z15.h\n"
+ "ld1h { z13.h }, p1/Z, [x23, x13, LSL #1]\n"
+ "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "ld1h { z14.h }, p1/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z16.h }, p1/Z, [x20, x13, LSL #1]\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "fmla z31.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p1/Z, [x21, x13, LSL #1]\n"
+ ".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x25, x13, LSL #1]\n"
+ "inch x13\n"
+ "cmp x13, %x[n_channels]\n"
+ "ld1h { z8.h }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ ".inst 0xc172ca7c // fclamp { z28.h-z31.h }, z19.h, z18.h\n"
+ "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z17\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x27, [x16, #0x40]\n"
+ "inch x28\n"
+ "ldr x26, [x16, #0x48]\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "mov p0.b, p2.b\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x24, [x16, #0x58]\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x27, [x16, #0x80]\n"
+ "ldr x26, [x16, #0x88]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z28.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z16.h\n"
+ "ldr x24, [x16, #0x98]\n"
+ "ldr x25, [x16, #0x90]\n"
+ "fmla z30.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z4.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldr x23, [x16, #0xa0]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x22, [x16, #0xa8]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "fmla z30.h, p3/M, z0.h, z15.h\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0xc0]\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z1.h, z16.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "fmla z31.h, p3/M, z3.h, z16.h\n"
+ "fmla z30.h, p3/M, z8.h, z15.h\n"
+ "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "fmla z31.h, p3/M, z6.h, z15.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ ".inst 0xc172ca7c // fclamp { z28.h-z31.h }, z19.h, z18.h\n"
+ "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
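The Args constructor above permutes the caller's 25 input pointers (the 5x5 patch a 3x3 stride-2 kernel needs for a 2x2 output tile) into the order the assembly consumes them. As a sanity model only, a scalar reference of the per-channel computation in the original, unpermuted input_ptrs order (all names illustrative):

// Scalar reference: 3x3 depthwise MLA at stride 2 over a 5x5 NHWC patch,
// one channel, 2x2 outputs, bias plus the min/max clamp (fclamp above).
// patch[row * 5 + col] points at the channel array for that input position.
static void reference_tile(const __fp16 *const patch[25],
                           const __fp16 weights[9], __fp16 bias,
                           __fp16 act_min, __fp16 act_max,
                           unsigned int channel, __fp16 out[4])
{
    for (int oi = 0; oi < 2; oi++)
    {
        for (int oj = 0; oj < 2; oj++)
        {
            __fp16 acc = bias;
            for (int ki = 0; ki < 3; ki++)
                for (int kj = 0; kj < 3; kj++)
                    acc += weights[ki * 3 + kj]
                         * patch[(2 * oi + ki) * 5 + (2 * oj + kj)][channel];
            acc = acc < act_min ? act_min : (acc > act_max ? act_max : acc);
            out[oi * 2 + oj] = acc;
        }
    }
}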
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..84263cb564
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
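The compile-time shape constants in these strategy headers fix how much input each output tile touches: (output - 1) * stride + kernel in each dimension. A small compile-time check of that arithmetic (helper name illustrative):

// Input extent implied by the strategy constants above.
constexpr unsigned int input_extent(unsigned int out, unsigned int stride, unsigned int k)
{
    return (out - 1) * stride + k;
}
static_assert(input_extent(2, 1, 5) == 6, "5x5 s1, 2x2 output: 6x6 input patch");
static_assert(input_extent(2, 2, 3) == 5, "3x3 s2, 2x2 output: 5x5 input patch");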
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..58b7824b98
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x2\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "add x7, x4, x4\n"
+ "add x5, x5, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x8, x5, x21, LSL #1\n"
+ "add x17, x7, x4\n"
+ "add x16, x8, x21, LSL #1\n"
+ "add x15, x17, x4\n"
+ "add x14, x16, x21, LSL #1\n"
+ "add x13, x15, x4\n"
+ "add x12, x14, x21, LSL #1\n"
+ "add x11, x12, x21, LSL #1\n"
+ "cbnz x3, 2f\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "lsl x10, %x[n_channels], #0x1\n"
+ "mov x21, #0x4\n"
+ "mul x21, x21, x4\n"
+ "add x9, x5, x4, LSL #1\n"
+ "add x28, x8, x4, LSL #1\n"
+ "add x27, x5, x7, LSL #1\n"
+ "sub x20, x25, x3\n"
+ "add x26, x8, x7, LSL #1\n"
+ "sub x20, x20, #0x1\n"
+ "add x25, x5, x17, LSL #1\n"
+ "and x20, x20, #0x3fffff\n"
+ "add x24, x5, x15, LSL #1\n"
+ "orr x10, x10, x20, LSL #22\n"
+ "add x23, x8, x13, LSL #1\n"
+ "orr x10, x10, x21, LSL #38\n"
+ "add x22, x8, x17, LSL #1\n"
+ "add x21, x8, x15, LSL #1\n"
+ "add x20, x5, x13, LSL #1\n"
+ ".inst 0xf8aa48ba // rprfm pldonce, x10, [x5]\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ "add x9, x16, x4, LSL #1\n"
+ ".inst 0xf8aa491a // rprfm pldonce, x10, [x8]\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ "add x28, x16, x7, LSL #1\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ "add x27, x16, x17, LSL #1\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ "add x26, x16, x15, LSL #1\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ "add x25, x16, x13, LSL #1\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ "add x24, x14, x4, LSL #1\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ "add x23, x14, x7, LSL #1\n"
+ ".inst 0xf8aa4a1a // rprfm pldonce, x10, [x16]\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ "add x22, x14, x17, LSL #1\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ "add x21, x14, x15, LSL #1\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "add x20, x14, x13, LSL #1\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ "add x9, x12, x4, LSL #1\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ "add x28, x12, x7, LSL #1\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ "add x27, x12, x17, LSL #1\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ "add x26, x12, x15, LSL #1\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ "add x25, x12, x13, LSL #1\n"
+ ".inst 0xf8aa49da // rprfm pldonce, x10, [x14]\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ "add x24, x11, x4, LSL #1\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ "add x23, x11, x7, LSL #1\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ "add x22, x11, x17, LSL #1\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ "add x21, x11, x15, LSL #1\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "add x20, x11, x13, LSL #1\n"
+ ".inst 0xf8aa499a // rprfm pldonce, x10, [x12]\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ ".inst 0xf8aa497a // rprfm pldonce, x10, [x11]\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x27, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x26, #0x2\n"
+ "cnth x25\n"
+ "ld1h { z18.h }, p3/Z, [x6]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "addvl x6, x6, #1\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "cmp x25, %x[n_channels]\n"
+ "mul x22, x2, x27\n" // offset = tile_i * ld_output_row
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mov x21, #0x0\n"
+ "madd x22, x3, x24, x22\n" // offset += tile_j * ld_output_col
+ "sub x20, XZR, x25\n"
+ "ld1h { z4.h }, p3/Z, [x6]\n"
+ "mul x22, x22, x26\n" // offset *= output_tile_size
+ "ld1h { z5.h }, p2/Z, [x5]\n"
+ "addvl x6, x6, #1\n"
+ "add x23, x23, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z6.h }, p2/Z, [x5, x4, LSL #1]\n"
+ "add x22, x23, x27, LSL #1\n"
+ "ld1h { z7.h }, p2/Z, [x8]\n"
+ "ld1h { z8.h }, p2/Z, [x8, x4, LSL #1]\n"
+ "ld1h { z9.h }, p2/Z, [x5, x7, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x5, x17, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x5, x15, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x8, x13, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x16]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z28, z18\n fmla z28.h, p3/M, z0.h, z5.h\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z5.h }, p2/Z, [x8, x17, LSL #1]\n"
+ "whilelt p1.h, x25, %x[n_channels]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z0.h, z7.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z0.h }, p3/Z, [x6]\n"
+ "inch x21\n"
+ "inch x25\n"
+ "mov p0.b, p2.b\n"
+ "inch x20\n"
+ "fmla z28.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x8, x15, LSL #1]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "fmla z30.h, p3/M, z1.h, z8.h\n"
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p3/Z, [x6, #1, MUL VL]\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x5, x13, LSL #1]\n"
+ "addvl x5, x5, #1\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z2.h, z13.h\n"
+ "fmla z31.h, p3/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p3/Z, [x6, #2, MUL VL]\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z30.h, p3/M, z3.h, z5.h\n"
+ "fmla z31.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p3/Z, [x6, #3, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x16, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z6.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x6, #4, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z7.h\n"
+ "ld1h { z7.h }, p1/Z, [x8]\n"
+ "fmla z29.h, p3/M, z0.h, z8.h\n"
+ "fmla z30.h, p3/M, z0.h, z14.h\n"
+ "fmla z31.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x6, #5, MUL VL]\n"
+ "fmla z28.h, p3/M, z1.h, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x16, x13, LSL #1]\n"
+ "fmla z29.h, p3/M, z1.h, z13.h\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x6, #6, MUL VL]\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "addvl x16, x16, #1\n"
+ "fmla z29.h, p3/M, z2.h, z5.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p3/Z, [x6, #7, MUL VL]\n"
+ "addvl x6, x6, #16\n"
+ "ld1h { z18.h }, p3/Z, [x6, #4, MUL VL]\n"
+ "fmla z28.h, p3/M, z3.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x14]\n"
+ "fmla z29.h, p3/M, z3.h, z6.h\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p3/Z, [x6, #-8, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p3/Z, [x6, #-7, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x14, x13, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "fmla z30.h, p3/M, z0.h, z5.h\n"
+ "fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p3/Z, [x6, #-6, MUL VL]\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "fmla z30.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p3/Z, [x6, #-5, MUL VL]\n"
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z29.h, p3/M, z2.h, z9.h\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p3/Z, [x6, #-4, MUL VL]\n"
+ "fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x12]\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p3/Z, [x6, #-3, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x4, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p3/Z, [x6, #-2, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z6.h\n"
+ "fmla z30.h, p3/M, z0.h, z9.h\n"
+ "fmla z31.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p3/Z, [x6, #-1, MUL VL]\n"
+ "fmla z28.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x12, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z1.h, z10.h\n"
+ "fmla z30.h, p3/M, z1.h, z13.h\n"
+ "fmla z31.h, p3/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p3/Z, [x6]\n"
+ "fmla z28.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x13, LSL #1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z2.h, z5.h\n"
+ "fmla z31.h, p3/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p3/Z, [x6, #1, MUL VL]\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z30.h, p3/M, z3.h, z6.h\n"
+ "fmla z31.h, p3/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p3/Z, [x6, #2, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x4, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z14.h\n"
+ "ld1h { z14.h }, p1/Z, [x16]\n"
+ "fmla z30.h, p3/M, z4.h, z8.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x6, #3, MUL VL]\n"
+ "addvl x6, x6, #5\n"
+ "fmla z28.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z13.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z0.h, z12.h\n"
+ "fmla z28.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p1/Z, [x8, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z1.h, z5.h\n"
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
+ "fmla z28.h, p3/M, z2.h, z5.h\n"
+ "ld1h { z5.h }, p1/Z, [x5]\n"
+ "fmla z29.h, p3/M, z2.h, z6.h\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x13, LSL #1]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "cmp x25, %x[n_channels]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z6.h }, p1/Z, [x5, x4, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z8.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x5, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "fmla z28.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p1/Z, [x8, x4, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p1/Z, [x8, x13, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x5, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x5, x7, LSL #1]\n"
+ "ld1h { z4.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z28.h }, p0, [x23]\n"
+ "st1h { z29.h }, p0, [x23, x24, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z28, z18\n fmla z28.h, p3/M, z0.h, z5.h\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z5.h }, p2/Z, [x8, x17, LSL #1]\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z0.h, z7.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z0.h }, p3/Z, [x6]\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "mov p0.b, p2.b\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x3, x3, #0x1\n"
+ "fmla z28.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x8, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "add x20, x2, #0x1\n"
+ "fmla z30.h, p3/M, z1.h, z8.h\n"
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p3/Z, [x6, #1, MUL VL]\n"
+ "cmp x3, x25\n"
+ "csel x2, x2, x20, LT\n"
+ "csel x3, x3, XZR, LT\n"
+ "cmp x2, x21\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x5, x13, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z2.h, z13.h\n"
+ "fmla z31.h, p3/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p3/Z, [x6, #2, MUL VL]\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z30.h, p3/M, z3.h, z5.h\n"
+ "fmla z31.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p3/Z, [x6, #3, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x16, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z6.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x6, #4, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z7.h\n"
+ "fmla z29.h, p3/M, z0.h, z8.h\n"
+ "fmla z30.h, p3/M, z0.h, z14.h\n"
+ "fmla z31.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x6, #5, MUL VL]\n"
+ "fmla z28.h, p3/M, z1.h, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x16, x13, LSL #1]\n"
+ "fmla z29.h, p3/M, z1.h, z13.h\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x6, #6, MUL VL]\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z5.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p3/Z, [x6, #7, MUL VL]\n"
+ "addvl x6, x6, #16\n"
+ "fmla z28.h, p3/M, z3.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x14]\n"
+ "fmla z29.h, p3/M, z3.h, z6.h\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p3/Z, [x6, #-8, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p3/Z, [x6, #-7, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x14, x13, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "fmla z30.h, p3/M, z0.h, z5.h\n"
+ "fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p3/Z, [x6, #-6, MUL VL]\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "fmla z30.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p3/Z, [x6, #-5, MUL VL]\n"
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z9.h\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p3/Z, [x6, #-4, MUL VL]\n"
+ "fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x12]\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p3/Z, [x6, #-3, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x4, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p3/Z, [x6, #-2, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z6.h\n"
+ "fmla z30.h, p3/M, z0.h, z9.h\n"
+ "fmla z31.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p3/Z, [x6, #-1, MUL VL]\n"
+ "fmla z28.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x12, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z1.h, z10.h\n"
+ "fmla z30.h, p3/M, z1.h, z13.h\n"
+ "fmla z31.h, p3/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p3/Z, [x6]\n"
+ "fmla z28.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x13, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z2.h, z5.h\n"
+ "fmla z31.h, p3/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p3/Z, [x6, #1, MUL VL]\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z30.h, p3/M, z3.h, z6.h\n"
+ "fmla z31.h, p3/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p3/Z, [x6, #2, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x4, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z14.h\n"
+ "fmla z30.h, p3/M, z4.h, z8.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x6, #3, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z13.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z0.h, z12.h\n"
+ "fmla z28.h, p3/M, z1.h, z13.h\n"
+ "fmla z29.h, p3/M, z1.h, z5.h\n"
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
+ "fmla z28.h, p3/M, z2.h, z5.h\n"
+ "fmla z29.h, p3/M, z2.h, z6.h\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x13, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z6.h\n"
+ "fmla z29.h, p3/M, z3.h, z8.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z8.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z9.h\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z28.h }, p0, [x23]\n"
+ "st1h { z29.h }, p0, [x23, x24, LSL #1]\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
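The tile-addressing comments in the assembly above reduce to ordinary strided
pointer arithmetic. A minimal C++ sketch, using illustrative names rather than
the library's API, of how the two output-row pointers (x23/x22 above) are
derived for a 2x2 output tile:

    #include <cstdint>

    // Hypothetical mirror of the fields the kernel reads from Args.
    struct TileArgs
    {
        uint64_t ld_output_row, ld_output_col;  // strides in elements
        __fp16  *outptr;                        // base of the output tensor
    };

    // offset  = tile_i * ld_output_row       (mul  x22, x2, x27)
    // offset += tile_j * ld_output_col       (madd x22, x3, x24, x22)
    // offset *= output_tile_size             (mul  x22, x22, x26; x26 == 2)
    // outptrs[0] += offset * sizeof(__fp16)  (add  x23, x23, x22, LSL #1)
    static inline void tile_outptrs(const TileArgs &args,
                                    uint64_t tile_i, uint64_t tile_j,
                                    __fp16 *&row0, __fp16 *&row1)
    {
        uint64_t offset = tile_i * args.ld_output_row;
        offset += tile_j * args.ld_output_col;
        offset *= 2;                           // 2x2 output tile
        row0 = args.outptr + offset;
        row1 = row0 + args.ld_output_row;      // add x22, x23, x27, LSL #1
    }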
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..313036876e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,537 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
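+ // 36 pointers = the 6x6 input patch a 5x5 kernel needs for a 2x2 output;
+ // the leading entries are permuted into the order the assembly reads them.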
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "whilelt p3.h, XZR, %x[n_channels]\n"
+ "ptrue p2.b\n"
+ "cnth x13\n"
+ "ldp x12, x11, [x20, #0x0]\n"
+ "ldp x10, x9, [x20, #0x10]\n"
+ "cmp x13, %x[n_channels]\n"
+ "ld1rh { z18.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "sub x28, XZR, x13\n"
+ "ldp x27, x26, [x16, #0x0]\n"
+ "ld1h { z17.h }, p2/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ld1rh { z16.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z5.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1h { z6.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x40]\n"
+ "ld1h { z4.h }, p2/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ld1h { z7.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ld1h { z9.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z11.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z12.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ld1h { z10.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ld1h { z14.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z28, z17\n fmla z28.h, p2/M, z0.h, z5.h\n"
+ "movprfx z29, z17\n fmla z29.h, p2/M, z0.h, z6.h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "whilelt p1.h, x13, %x[n_channels]\n"
+ "movprfx z30, z17\n fmla z30.h, p2/M, z0.h, z7.h\n"
+ "movprfx z31, z17\n fmla z31.h, p2/M, z0.h, z8.h\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ld1h { z0.h }, p2/Z, [x14]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "inch x28\n"
+ "mov p0.b, p3.b\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "fmla z29.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z30.h, p2/M, z1.h, z8.h\n"
+ "fmla z31.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #1, MUL VL]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ldr x27, [x16, #0x80]\n"
+ "fmla z28.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "ldr x26, [x16, #0x88]\n"
+ "fmla z30.h, p2/M, z2.h, z13.h\n"
+ "fmla z31.h, p2/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #2, MUL VL]\n"
+ "ldr x25, [x16, #0x90]\n"
+ "ldr x24, [x16, #0x98]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "ldr x23, [x16, #0xa0]\n"
+ "fmla z30.h, p2/M, z3.h, z5.h\n"
+ "fmla z31.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #3, MUL VL]\n"
+ "ldr x22, [x16, #0xa8]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "fmla z29.h, p2/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z4.h, z6.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #4, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z7.h\n"
+ "fmla z29.h, p2/M, z0.h, z8.h\n"
+ "fmla z30.h, p2/M, z0.h, z14.h\n"
+ "fmla z31.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #5, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z8.h\n"
+ "ld1h { z8.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xc8]\n"
+ "fmla z29.h, p2/M, z1.h, z13.h\n"
+ "fmla z30.h, p2/M, z1.h, z11.h\n"
+ "fmla z31.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #6, MUL VL]\n"
+ "fmla z28.h, p2/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0xc0]\n"
+ "fmla z29.h, p2/M, z2.h, z5.h\n"
+ "fmla z30.h, p2/M, z2.h, z12.h\n"
+ "fmla z31.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "ld1h { z17.h }, p2/Z, [x14, #4, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z5.h\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xd0]\n"
+ "fmla z29.h, p2/M, z3.h, z6.h\n"
+ "fmla z30.h, p2/M, z3.h, z9.h\n"
+ "fmla z31.h, p2/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #-8, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xd8]\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "ldr x23, [x16, #0xe0]\n"
+ "fmla z30.h, p2/M, z4.h, z13.h\n"
+ "fmla z31.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #-7, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z14.h\n"
+ "ld1h { z14.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla z29.h, p2/M, z0.h, z11.h\n"
+ "fmla z30.h, p2/M, z0.h, z5.h\n"
+ "fmla z31.h, p2/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #-6, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "ldr x22, [x16, #0xe8]\n"
+ "fmla z29.h, p2/M, z1.h, z12.h\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z31.h, p2/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #-5, MUL VL]\n"
+ "fmla z28.h, p2/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xf0]\n"
+ "fmla z29.h, p2/M, z2.h, z9.h\n"
+ "fmla z30.h, p2/M, z2.h, z10.h\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #-4, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0x100]\n"
+ "fmla z29.h, p2/M, z3.h, z13.h\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #-3, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x108]\n"
+ "fmla z29.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #-2, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x110]\n"
+ "fmla z29.h, p2/M, z0.h, z6.h\n"
+ "fmla z30.h, p2/M, z0.h, z9.h\n"
+ "fmla z31.h, p2/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #-1, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0x118]\n"
+ "fmla z29.h, p2/M, z1.h, z10.h\n"
+ "fmla z30.h, p2/M, z1.h, z13.h\n"
+ "fmla z31.h, p2/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p2/Z, [x14]\n"
+ "fmla z28.h, p2/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "fmla z30.h, p2/M, z2.h, z5.h\n"
+ "fmla z31.h, p2/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #1, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "fmla z30.h, p2/M, z3.h, z6.h\n"
+ "fmla z31.h, p2/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #2, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z4.h, z14.h\n"
+ "fmla z30.h, p2/M, z4.h, z8.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "fmla z28.h, p2/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z0.h, z13.h\n"
+ "fmla z30.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x0]\n"
+ "fmla z31.h, p2/M, z0.h, z12.h\n"
+ "fmla z28.h, p2/M, z1.h, z13.h\n"
+ "fmla z29.h, p2/M, z1.h, z5.h\n"
+ "fmla z30.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "fmla z31.h, p2/M, z1.h, z9.h\n"
+ "fmla z28.h, p2/M, z2.h, z5.h\n"
+ "ld1h { z5.h }, p1/Z, [x27, x13, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z6.h\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "inch x15\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "whilelt p3.h, x15, %x[n_channels]\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "fmla z28.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z6.h }, p1/Z, [x26, x13, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x40]\n"
+ "fmla z29.h, p2/M, z3.h, z8.h\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z7.h }, p1/Z, [x25, x13, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x22, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "fmla z28.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p1/Z, [x24, x13, LSL #1]\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z11.h }, p1/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x20, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z14.h }, p1/Z, [x26, x13, LSL #1]\n"
+ "inch x13\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "cmp x13, %x[n_channels]\n"
+ ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
+ "ld1h { z4.h }, p2/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z28, z17\n fmla z28.h, p2/M, z0.h, z5.h\n"
+ "movprfx z29, z17\n fmla z29.h, p2/M, z0.h, z6.h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "inch x28\n"
+ "movprfx z30, z17\n fmla z30.h, p2/M, z0.h, z7.h\n"
+ "movprfx z31, z17\n fmla z31.h, p2/M, z0.h, z8.h\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ld1h { z0.h }, p2/Z, [x14]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "mov p0.b, p3.b\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "fmla z29.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z30.h, p2/M, z1.h, z8.h\n"
+ "fmla z31.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #1, MUL VL]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ldr x27, [x16, #0x80]\n"
+ "fmla z28.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "ldr x26, [x16, #0x88]\n"
+ "fmla z30.h, p2/M, z2.h, z13.h\n"
+ "fmla z31.h, p2/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #2, MUL VL]\n"
+ "ldr x25, [x16, #0x90]\n"
+ "ldr x24, [x16, #0x98]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "ldr x23, [x16, #0xa0]\n"
+ "fmla z30.h, p2/M, z3.h, z5.h\n"
+ "fmla z31.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #3, MUL VL]\n"
+ "ldr x22, [x16, #0xa8]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "fmla z29.h, p2/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z4.h, z6.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #4, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z7.h\n"
+ "fmla z29.h, p2/M, z0.h, z8.h\n"
+ "fmla z30.h, p2/M, z0.h, z14.h\n"
+ "fmla z31.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #5, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z8.h\n"
+ "ld1h { z8.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xc8]\n"
+ "fmla z29.h, p2/M, z1.h, z13.h\n"
+ "fmla z30.h, p2/M, z1.h, z11.h\n"
+ "fmla z31.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #6, MUL VL]\n"
+ "fmla z28.h, p2/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0xc0]\n"
+ "fmla z29.h, p2/M, z2.h, z5.h\n"
+ "fmla z30.h, p2/M, z2.h, z12.h\n"
+ "fmla z31.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "fmla z28.h, p2/M, z3.h, z5.h\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xd0]\n"
+ "fmla z29.h, p2/M, z3.h, z6.h\n"
+ "fmla z30.h, p2/M, z3.h, z9.h\n"
+ "fmla z31.h, p2/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #-8, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xd8]\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "ldr x23, [x16, #0xe0]\n"
+ "fmla z30.h, p2/M, z4.h, z13.h\n"
+ "fmla z31.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #-7, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z14.h\n"
+ "ld1h { z14.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla z29.h, p2/M, z0.h, z11.h\n"
+ "fmla z30.h, p2/M, z0.h, z5.h\n"
+ "fmla z31.h, p2/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #-6, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "ldr x22, [x16, #0xe8]\n"
+ "fmla z29.h, p2/M, z1.h, z12.h\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z31.h, p2/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #-5, MUL VL]\n"
+ "fmla z28.h, p2/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xf0]\n"
+ "fmla z29.h, p2/M, z2.h, z9.h\n"
+ "fmla z30.h, p2/M, z2.h, z10.h\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #-4, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0x100]\n"
+ "fmla z29.h, p2/M, z3.h, z13.h\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #-3, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x108]\n"
+ "fmla z29.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #-2, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x110]\n"
+ "fmla z29.h, p2/M, z0.h, z6.h\n"
+ "fmla z30.h, p2/M, z0.h, z9.h\n"
+ "fmla z31.h, p2/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #-1, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0x118]\n"
+ "fmla z29.h, p2/M, z1.h, z10.h\n"
+ "fmla z30.h, p2/M, z1.h, z13.h\n"
+ "fmla z31.h, p2/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p2/Z, [x14]\n"
+ "fmla z28.h, p2/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "fmla z30.h, p2/M, z2.h, z5.h\n"
+ "fmla z31.h, p2/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #1, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "fmla z30.h, p2/M, z3.h, z6.h\n"
+ "fmla z31.h, p2/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #2, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z4.h, z14.h\n"
+ "fmla z30.h, p2/M, z4.h, z8.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #3, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z0.h, z13.h\n"
+ "fmla z30.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "fmla z31.h, p2/M, z0.h, z12.h\n"
+ "fmla z28.h, p2/M, z1.h, z13.h\n"
+ "fmla z29.h, p2/M, z1.h, z5.h\n"
+ "fmla z30.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "fmla z31.h, p2/M, z1.h, z9.h\n"
+ "fmla z28.h, p2/M, z2.h, z5.h\n"
+ "fmla z29.h, p2/M, z2.h, z6.h\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "fmla z28.h, p2/M, z3.h, z6.h\n"
+ "fmla z29.h, p2/M, z3.h, z8.h\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "fmla z28.h, p2/M, z4.h, z8.h\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z9.h\n"
+ ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
+ "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
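Per channel, the indirect variant above is the usual depthwise
multiply-accumulate, just fed through a pointer table instead of strides. A
scalar reference sketch, under the assumptions that input_ptrs holds the 6x6
input patch in row-major order (the Args constructor permutes the leading
entries into the order the assembly consumes) and that weights holds the 25
taps row-major:

    #include <algorithm>

    static void depthwise_5x5_2x2_ref(const __fp16 *const *input_ptrs, // 36 patch pointers
                                      __fp16 *const *outptrs,          // 4 tile outputs
                                      const __fp16 *weights,           // 25 taps (z0-z4 reloads)
                                      __fp16 bias,                     // z17 in the asm above
                                      __fp16 act_min, __fp16 act_max,  // fclamp bounds
                                      unsigned int c)                  // channel index
    {
        for (int oi = 0; oi < 2; oi++)
            for (int oj = 0; oj < 2; oj++)
            {
                __fp16 acc = bias;             // movprfx seeds z28-z31 from the bias
                for (int ki = 0; ki < 5; ki++)
                    for (int kj = 0; kj < 5; kj++)
                        acc += weights[ki * 5 + kj]
                             * input_ptrs[(oi + ki) * 6 + (oj + kj)][c];
                // fclamp { z28.h-z31.h } folds the activation into the store path
                outptrs[oi * 2 + oj][c] = std::min(act_max, std::max(act_min, acc));
            }
    }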
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..25d83f15c3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
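The output_rows/output_cols constants above fix a 2x2 output tile, so a caller
planning the direct kernel needs ceil-divided tile counts along each output
dimension. A one-liner sketch (names assumed, not the library's API):

    // n_tile_rows = ceil(output_height / 2), n_tile_cols = ceil(output_width / 2)
    static inline unsigned int n_tiles(unsigned int out_dim, unsigned int tile_dim)
    {
        return (out_dim + tile_dim - 1) / tile_dim;
    }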
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..96cfd5e497
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mov x4, #0x0\n"
+ "mov x5, #0x0\n"
+ "1:" // Tile loop
+ "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x2\n"
+ "str x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "mul x20, x4, x21\n" // offset = tile_i * ld_input_row
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "madd x20, x5, x6, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "add x7, x7, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x8, x7, x21, LSL #2\n"
+ "add x17, x8, x21, LSL #2\n"
+ "add x16, x6, x6\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x14, x17, x21, LSL #2\n"
+ "add x13, x16, x6\n"
+ "cbnz x5, 2f\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "sub x21, x20, x5\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x12, %x[n_channels], #0x2\n"
+ "mov x20, #0x8\n"
+ "and x21, x21, #0x3fffff\n"
+ "mul x20, x20, x6\n"
+ "orr x12, x12, x21, LSL #22\n"
+ "orr x12, x12, x20, LSL #38\n"
+ "add x11, x8, x6, LSL #2\n"
+ "add x10, x7, x13, LSL #2\n"
+ "add x9, x8, x16, LSL #2\n"
+ "add x28, x17, x6, LSL #2\n"
+ "add x27, x14, x13, LSL #2\n"
+ "add x26, x7, x6, LSL #2\n"
+ "add x25, x7, x16, LSL #2\n"
+ "add x24, x17, x16, LSL #2\n"
+ "add x23, x8, x13, LSL #2\n"
+ "add x22, x17, x13, LSL #2\n"
+ "add x21, x14, x6, LSL #2\n"
+ "add x20, x14, x16, LSL #2\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac491a // rprfm pldonce, x12, [x8]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac4a3a // rprfm pldonce, x12, [x17]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x4, x22\n" // offset = tile_i * ld_output_row
+ "mov x20, #0x2\n"
+ "ld1w { z22.s }, p3/Z, [x15]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "madd x21, x5, x25, x21\n" // offset += tile_j * ld_output_col
+ "addvl x15, x15, #1\n"
+ ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "mul x21, x21, x20\n" // offset *= output_tile_size
+ "cntw x23\n"
+ "ld1rw { z21.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "addvl x15, x15, #4\n"
+ "add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "addvl x15, x15, #4\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cmp x23, %x[n_channels]\n"
+ "add x22, x24, x22, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x15]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "ld1w { z9.s }, p2/Z, [x8, x6, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x7]\n"
+ "addvl x15, x15, #1\n"
+ "ld1w { z11.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x8, x16, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x17, x6, LSL #2]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z28, z22\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z22\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "whilelt p1.s, x23, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z30, z22\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x14]\n"
+ "incw x23\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x17, x16, LSL #2]\n"
+ "incw x20\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x7, x6, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z10.s }, p2/Z, [x7, x16, LSL #2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "ld1w { z22.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z9.s }, p2/Z, [x8]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x8, x13, LSL #2]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z28.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z18.s }, p2/Z, [x17]\n"
+ "fmla z30.s, p3/M, z0.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x17, x13, LSL #2]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z13.s }, p1/Z, [x17, x6, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x6, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x16, LSL #2]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ "cmp x23, %x[n_channels]\n"
+ ".inst 0xc1aecabc // fclamp { z28.s-z31.s }, z21.s, z14.s\n"
+ "addvl x14, x14, #1\n"
+ "ld1w { z9.s }, p1/Z, [x8, x6, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x7]\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "ld1w { z11.s }, p1/Z, [x7, x13, LSL #2]\n"
+ "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "ld1w { z12.s }, p1/Z, [x8, x16, LSL #2]\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+ "addvl x22, x22, #1\n"
+ "ld1w { z8.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z24, z22\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z25, z22\n fmla z25.s, p3/M, z3.s, z9.s\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x5, x5, #0x1\n"
+ "movprfx z26, z22\n fmla z26.s, p3/M, z1.s, z9.s\n"
+ "movprfx z27, z22\n fmla z27.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z17.s }, p2/Z, [x14]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x17, x16, LSL #2]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z25.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x7, x6, LSL #2]\n"
+ "cmp x5, x20\n"
+ "fmla z26.s, p3/M, z6.s, z17.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x16, LSL #2]\n"
+ "add x20, x4, #0x1\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z25.s, p3/M, z6.s, z13.s\n"
+ "csel x4, x4, x20, LT\n"
+ "mov p0.b, p2.b\n"
+ "fmla z26.s, p3/M, z4.s, z13.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x8]\n"
+ "csel x5, x5, XZR, LT\n"
+ "fmla z24.s, p3/M, z1.s, z18.s\n"
+ "fmla z25.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x8, x13, LSL #2]\n"
+ "cmp x4, x21\n"
+ "fmla z26.s, p3/M, z5.s, z20.s\n"
+ "fmla z27.s, p3/M, z4.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x17]\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "fmla z27.s, p3/M, z2.s, z19.s\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z25.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x17, x13, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmla z27.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x6, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x16, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "fmla z26.s, p3/M, z8.s, z16.s\n"
+ "fmla z27.s, p3/M, z7.s, z16.s\n"
+ ".inst 0xc1aecab8 // fclamp { z24.s-z27.s }, z21.s, z14.s\n"
+ "st1w { z24.s }, p0, [x24]\n"
+ "st1w { z25.s }, p0, [x24, x25, LSL #2]\n"
+ "st1w { z26.s }, p0, [x22]\n"
+ "st1w { z27.s }, p0, [x22, x25, LSL #2]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
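For reference, each 2x2 tile computed by the assembly above amounts to four 3x3
dot products plus bias, clamped. A scalar sketch, with an assumed row-major
layout for the nine taps (z0-z8) and illustrative parameter names:

    #include <algorithm>
    #include <cstdint>

    static void tile_3x3_s1_2x2_ref(const float *inptr,            // top-left of the 4x4 patch
                                    int64_t ld_in_row, int64_t ld_in_col,
                                    float *outptr,
                                    int64_t ld_out_row, int64_t ld_out_col,
                                    float bias,                    // z22 seeds every accumulator
                                    const float *w,                // w[0..8]: taps z0-z8
                                    float act_min, float act_max)  // fclamp bounds
    {
        for (int oi = 0; oi < 2; oi++)
            for (int oj = 0; oj < 2; oj++)
            {
                float acc = bias;
                for (int ki = 0; ki < 3; ki++)
                    for (int kj = 0; kj < 3; kj++)
                        acc += w[ki * 3 + kj]
                             * inptr[(oi + ki) * ld_in_row + (oj + kj) * ld_in_col];
                outptr[oi * ld_out_row + oj * ld_out_col] =
                    std::min(act_max, std::max(act_min, acc));
            }
    }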
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..39f1b3635f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[16];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
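+ // 16 pointers = the 4x4 input patch a 3x3 kernel needs for a 2x2 output,
+ // stored in the order in which the assembly below first consumes them.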
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ptrue p3.b\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ld1w { z23.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "cntw x11\n"
+ ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ldp x10, x9, [x20, #0x10]\n"
+ "mov x28, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "addvl x14, x14, #4\n"
+ "cmp x11, %x[n_channels]\n"
+ "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x27, XZR, x11\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1w { z8.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ld1w { z9.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x15, #0x28]\n"
+ "whilelt p1.s, x11, %x[n_channels]\n"
+ "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ldr x21, [x15, #0x38]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x40]\n"
+ "fmla z30.s, p3/M, z6.s, z19.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z25.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x50]\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ld1w { z23.s }, p3/Z, [x14]\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z17.s\n"
+ "ldr x26, [x15, #0x70]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z28.s, p3/M, z2.s, z25.s\n"
+ "fmla z29.s, p3/M, z1.s, z25.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x25, [x15, #0x78]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "incw x27\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "fmla z29.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1w { z13.s }, p1/Z, [x20, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "incw x28\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z9.s }, p1/Z, [x24, x11, LSL #2]\n"
+ "whilelt p2.s, x28, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z10.s }, p1/Z, [x23, x11, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x22, x11, LSL #2]\n"
+ ".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
+ "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x21, x11, LSL #2]\n"
+ "incw x11\n"
+ "cmp x11, %x[n_channels]\n"
+ "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
+ ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x15, #0x28]\n"
+ "incw x27\n"
+ "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ldr x21, [x15, #0x38]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ld1w { z20.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x40]\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "ldr x21, [x15, #0x58]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla z28.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z30.s, p3/M, z5.s, z20.s\n"
+ "fmla z31.s, p3/M, z4.s, z20.s\n"
+ "ldr x21, [x15, #0x70]\n"
+ "fmla z28.s, p3/M, z2.s, z17.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z30.s, p3/M, z0.s, z16.s\n"
+ "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z20.s\n"
+ "fmla z29.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ ".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
+ "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
+ "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
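For orientation: the indirect kernel above gathers a 4x4 patch of per-point input pointers, accumulates a bias plus the nine 3x3 taps into four accumulators (a 2x2 output tile), and clamps the results to [min, max] before storing. A minimal scalar sketch of that computation follows; the packed parameter layout (one bias followed by nine taps per channel) is an illustrative assumption only — the real kernel consumes a vector-interleaved packing.

    #include <algorithm>

    // Scalar reference (sketch, not the library's code) for a 3x3, stride-1
    // depthwise convolution producing a 2x2 output tile per channel from a
    // 4x4 input patch, with a fused clamp activation.
    // ASSUMPTION: params packs, per channel, one bias followed by nine taps.
    static void dw3x3_s1_out2x2_ref(const float *const *inptrs, // 16 ptrs, row-major 4x4 patch
                                    float *const *outptrs,      // 4 ptrs, row-major 2x2 tile
                                    const float *params,
                                    unsigned int n_channels,
                                    float act_min, float act_max)
    {
        for (unsigned int c = 0; c < n_channels; c++)
        {
            const float *p = params + c * 10; // bias + 9 taps (assumed layout)
            for (int oy = 0; oy < 2; oy++)
            {
                for (int ox = 0; ox < 2; ox++)
                {
                    float acc = p[0];
                    for (int ky = 0; ky < 3; ky++)
                        for (int kx = 0; kx < 3; kx++)
                            acc += p[1 + ky * 3 + kx] * inptrs[(oy + ky) * 4 + (ox + kx)][c];
                    outptrs[oy * 2 + ox][c] = std::min(std::max(acc, act_min), act_max);
                }
            }
        }
    }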
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..bd330dc21e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
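One way to read the constants in this strategy header: a stride-1 KxK kernel producing an RxC output tile consumes an (R+K-1)x(C+K-1) input patch, which is why the indirect variant that follows carries 25 input pointers (a 5x5 patch) for its 3x3 output. A tiny self-contained check, offered purely as a sketch:

    #include <cassert>

    // Input patch dimension needed for one output tile dimension.
    constexpr unsigned patch_dim(unsigned out, unsigned kern, unsigned stride)
    {
        return (out - 1) * stride + kern;
    }

    int main()
    {
        // 3x3 output, 3x3 kernel, stride 1 -> 5x5 patch -> 25 input pointers.
        assert(patch_dim(3, 3, 1) == 5);
        assert(patch_dim(3, 3, 1) * patch_dim(3, 3, 1) == 25);
        return 0;
    }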
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..d15a3a8377
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x3\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x6, x5, x21, LSL #2\n"
+ "add x7, x6, x21, LSL #2\n"
+ "add x8, x4, x4\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, x7, x21, LSL #2\n"
+ "add x15, x8, x4\n"
+ "add x14, x16, x21, LSL #2\n"
+ "add x13, x15, x4\n"
+ "cbnz x3, 2f\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "sub x21, x20, x3\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x12, %x[n_channels], #0x2\n"
+ "mov x20, #0xc\n"
+ "and x21, x21, #0x3fffff\n"
+ "mul x20, x20, x4\n"
+ "orr x12, x12, x21, LSL #22\n"
+ "orr x12, x12, x20, LSL #38\n"
+ "add x27, x7, x8, LSL #2\n"
+ "add x26, x5, x13, LSL #2\n"
+ "add x25, x6, x8, LSL #2\n"
+ "add x24, x14, x13, LSL #2\n"
+ "add x23, x7, x4, LSL #2\n"
+ "add x22, x5, x4, LSL #2\n"
+ "add x21, x5, x15, LSL #2\n"
+ "add x20, x7, x15, LSL #2\n"
+ "add x11, x6, x13, LSL #2\n"
+ "add x10, x16, x8, LSL #2\n"
+ "add x9, x16, x13, LSL #2\n"
+ "add x28, x14, x4, LSL #2\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ "add x27, x6, x4, LSL #2\n"
+ ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ "add x26, x6, x15, LSL #2\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ "add x25, x14, x15, LSL #2\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ "add x24, x16, x4, LSL #2\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ "add x23, x5, x8, LSL #2\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ "add x22, x16, x15, LSL #2\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ "add x21, x7, x13, LSL #2\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "add x20, x14, x8, LSL #2\n"
+ ".inst 0xf8ac48da // rprfm pldonce, x12, [x6]\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
+ "mov x20, #0x3\n"
+ "ld1w { z24.s }, p3/Z, [x17]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "madd x21, x3, x27, x21\n" // offset += tile_j * ld_output_col
+ "mul x21, x21, x20\n" // offset *= output_tile_size
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "addvl x17, x17, #1\n"
+ "add x26, x26, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "cntw x25\n"
+ "addvl x17, x17, #4\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "add x24, x26, x22, LSL #2\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "addvl x17, x17, #4\n"
+ "cmp x25, %x[n_channels]\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "add x23, x24, x22, LSL #2\n"
+ "add x22, x27, x27\n"
+ "ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x25\n"
+ "ld1w { z10.s }, p2/Z, [x5]\n"
+ "ld1w { z11.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "addvl x17, x17, #1\n"
+ "ld1w { z12.s }, p2/Z, [x14]\n"
+ "ld1w { z13.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z28, z24\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "movprfx z27, z24\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x25, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z29, z24\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "incw x25\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z30, z24\n fmla z30.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z24\n fmla z31.s, p3/M, z4.s, z9.s\n"
+ "incw x20\n"
+ "movprfx z20, z24\n fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "movprfx z21, z24\n fmla z21.s, p3/M, z2.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z19.s\n"
+ "movprfx z23, z24\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "fmla z20.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z15.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "movprfx z22, z24\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z8.s, z15.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z19.s\n"
+ "fmla z30.s, p3/M, z4.s, z19.s\n"
+ "ld1w { z24.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z31.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x6]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x16]\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "fmla z22.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z3.s, z17.s\n"
+ "fmla z29.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z17.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "addvl x6, x6, #1\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z18.s\n"
+ "fmla z27.s, p3/M, z4.s, z18.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z18.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "addvl x16, x16, #1\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z20.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z11.s\n"
+ "addvl x5, x5, #1\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z10.s }, p1/Z, [x5]\n"
+ "fmla z23.s, p3/M, z4.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z19.s\n"
+ "addvl x7, x7, #1\n"
+ "fmla z22.s, p3/M, z5.s, z19.s\n"
+ "fmla z27.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "addvl x14, x14, #1\n"
+ "cmp x25, %x[n_channels]\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z26.s\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "fmin z27.s, p3/M, z27.s, z14.s\n"
+ ".inst 0xc1aecb5c // fclamp { z28.s-z31.s }, z26.s, z14.s\n"
+ "ld1w { z11.s }, p1/Z, [x5, x13, LSL #2]\n"
+ ".inst 0xc1aecb54 // fclamp { z20.s-z23.s }, z26.s, z14.s\n"
+ "ld1w { z12.s }, p1/Z, [x14]\n"
+ "st1w { z27.s }, p0, [x26]\n"
+ "ld1w { z13.s }, p1/Z, [x6, x8, LSL #2]\n"
+ "st1w { z28.s }, p0, [x26, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x26, x22, LSL #2]\n"
+ "addvl x26, x26, #1\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z30.s }, p0, [x24]\n"
+ "st1w { z31.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "st1w { z21.s }, p0, [x23]\n"
+ "st1w { z22.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z28, z24\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "movprfx z25, z24\n fmla z25.s, p3/M, z8.s, z9.s\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x3, x3, #0x1\n"
+ "movprfx z29, z24\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x2, #0x1\n"
+ "movprfx z30, z24\n fmla z30.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z24\n fmla z31.s, p3/M, z4.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x3, x20\n"
+ "movprfx z20, z24\n fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z27.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "movprfx z21, z24\n fmla z21.s, p3/M, z2.s, z9.s\n"
+ "csel x2, x2, x21, LT\n"
+ "fmla z28.s, p3/M, z6.s, z17.s\n"
+ "movprfx z23, z24\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "mov p0.b, p2.b\n"
+ "csel x3, x3, XZR, LT\n"
+ "fmla z25.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "cmp x2, x20\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "fmla z20.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z19.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "movprfx z22, z24\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x6]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x16]\n"
+ "fmla z20.s, p3/M, z4.s, z27.s\n"
+ "fmla z25.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z27.s\n"
+ "fmla z23.s, p3/M, z1.s, z27.s\n"
+ "fmla z28.s, p3/M, z8.s, z27.s\n"
+ "fmla z29.s, p3/M, z7.s, z27.s\n"
+ "fmla z31.s, p3/M, z5.s, z27.s\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z22.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "fmla z20.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z17.s\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z17.s\n"
+ "fmla z28.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
+ "fmla z20.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z17.s\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z17.s\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z19.s\n"
+ "fmla z25.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "fmax z25.s, p3/M, z25.s, z26.s\n"
+ "fmin z25.s, p3/M, z25.s, z14.s\n"
+ ".inst 0xc1aecb5c // fclamp { z28.s-z31.s }, z26.s, z14.s\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ ".inst 0xc1aecb54 // fclamp { z20.s-z23.s }, z26.s, z14.s\n"
+ "st1w { z28.s }, p0, [x26, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z30.s }, p0, [x24]\n"
+ "st1w { z31.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23]\n"
+ "st1w { z22.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
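The commented arithmetic at the top of the tile loop above reduces to a single affine offset per tile. A hedged C++ restatement (strides are in elements; the multiply-by-three is the 3x3 output tile advance, and the LSL #2 in the assembly is the sizeof(float) scaling that pointer arithmetic supplies here):

    #include <cstdint>

    // Sketch only: input base address for tile (tile_i, tile_j), mirroring
    // "offset = tile_i * ld_input_row; offset += tile_j * ld_input_col;
    //  offset *= 3" from the assembly above.
    static const float *tile_input_base(const float *inptr,
                                        uint64_t tile_i, uint64_t tile_j,
                                        int64_t ld_input_row, int64_t ld_input_col)
    {
        const int64_t offset = (int64_t)(tile_i * ld_input_row + tile_j * ld_input_col) * 3;
        return inptr + offset; // pointer arithmetic applies the sizeof(float) step
    }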
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..2c868b6cf3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ld1w { z20.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "cntw x16\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "ldr x20, [x17, #0x20]\n"
+ "mov x15, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x13, XZR, x16\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z21, z20\n fmla z21.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x22, [x17, #0x30]\n"
+ "incw x13\n"
+ "movprfx z25, z20\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ldr x25, [x17, #0x38]\n"
+ "mov p1.b, p2.b\n"
+ "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z20\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "movprfx z27, z20\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z20\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x17, #0x48]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z23.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z29, z20\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z23.s\n"
+ "ldr x24, [x17, #0x50]\n"
+ "movprfx z31, z20\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "ldr x23, [x17, #0x58]\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z27.s, p3/M, z1.s, z13.s\n"
+ "ldr x22, [x17, #0x60]\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z30, z20\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z23.s\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla z24.s, p3/M, z0.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x78]\n"
+ "fmla z26.s, p3/M, z4.s, z23.s\n"
+ "fmla z27.s, p3/M, z3.s, z23.s\n"
+ "ldr x20, [x17, #0x80]\n"
+ "ld1w { z20.s }, p3/Z, [x8]\n"
+ "fmla z30.s, p3/M, z0.s, z23.s\n"
+ "fmla z28.s, p3/M, z4.s, z19.s\n"
+ "ldr x11, [x17, #0x88]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z29.s, p3/M, z1.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x98]\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z2.s, z19.s\n"
+ "ldr x24, [x17, #0xa0]\n"
+ "fmla z26.s, p3/M, z0.s, z18.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ldr x10, [x14, #0x0]\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "fmla z25.s, p3/M, z7.s, z19.s\n"
+ "ldr x9, [x14, #0x8]\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z17.s\n"
+ "fmla z27.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z23.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z28.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z4.s, z16.s\n"
+ "ldr x28, [x14, #0x10]\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z15.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z16.s\n"
+ "fmla z28.s, p3/M, z8.s, z15.s\n"
+ "ldr x27, [x14, #0x18]\n"
+ "fmla z30.s, p3/M, z6.s, z19.s\n"
+ "fmla z24.s, p3/M, z3.s, z23.s\n"
+ "fmla z27.s, p3/M, z0.s, z23.s\n"
+ "fmla z31.s, p3/M, z5.s, z15.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z23.s\n"
+ "fmla z26.s, p3/M, z1.s, z23.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z27.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x20]\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "fmla z29.s, p3/M, z4.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z25.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmax z21.s, p3/M, z21.s, z22.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldp x22, x21, [x17, #0x0]\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "ldp x25, x24, [x17, #0x10]\n"
+ "incw x15\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "st1w { z21.s }, p1, [x10, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z16.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z9.s }, p0/Z, [x22, x16, LSL #2]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ ".inst 0xc1aecad8 // fclamp { z24.s-z27.s }, z22.s, z14.s\n"
+ "st1w { z24.s }, p1, [x9, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x28]\n"
+ "st1w { z25.s }, p1, [x28, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x30]\n"
+ "ld1w { z10.s }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xc1aecadc // fclamp { z28.s-z31.s }, z22.s, z14.s\n"
+ "st1w { z26.s }, p1, [x27, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x38]\n"
+ "ld1w { z11.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x40]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x16, LSL #2]\n"
+ "ld1w { z13.s }, p0/Z, [x26, x16, LSL #2]\n"
+ "incw x16\n"
+ "cmp x16, %x[n_channels]\n"
+ "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z21, z20\n fmla z21.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "incw x13\n"
+ "movprfx z25, z20\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x38]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z20\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "movprfx z27, z20\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z20\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x17, #0x48]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z29, z20\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "movprfx z31, z20\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z27.s, p3/M, z1.s, z13.s\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z30, z20\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "fmla z24.s, p3/M, z0.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z27.s, p3/M, z3.s, z18.s\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z28.s, p3/M, z4.s, z19.s\n"
+ "ldr x11, [x17, #0x88]\n"
+ "fmla z29.s, p3/M, z1.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x10, [x17, #0x90]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x9, [x17, #0x98]\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z2.s, z19.s\n"
+ "ldr x28, [x17, #0xa0]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z28.s, p3/M, z2.s, z17.s\n"
+ "ldr x27, [x14, #0x0]\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "fmla z25.s, p3/M, z7.s, z19.s\n"
+ "ldr x26, [x14, #0x8]\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "fmla z29.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xc0]\n"
+ "fmla z28.s, p3/M, z6.s, z19.s\n"
+ "fmla z30.s, p3/M, z4.s, z19.s\n"
+ "ldr x24, [x14, #0x10]\n"
+ "fmla z21.s, p3/M, z3.s, z20.s\n"
+ "fmla z25.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xb0]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z31.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "ldr x21, [x14, #0x18]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z24.s, p3/M, z3.s, z18.s\n"
+ "fmla z27.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x10, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z27.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "fmla z29.s, p3/M, z4.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z25.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmax z21.s, p3/M, z21.s, z22.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "st1w { z21.s }, p0, [x27, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z16.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ ".inst 0xc1aecad8 // fclamp { z24.s-z27.s }, z22.s, z14.s\n"
+ "st1w { z24.s }, p0, [x26, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x28]\n"
+ "st1w { z25.s }, p0, [x24, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x30]\n"
+ ".inst 0xc1aecadc // fclamp { z28.s-z31.s }, z22.s, z14.s\n"
+ "st1w { z26.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x38]\n"
+ "st1w { z27.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x40]\n"
+ "st1w { z28.s }, p0, [x23, x13, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x13, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x13, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
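Both the direct and indirect variants finish every accumulator with the same bounded activation: fmax against the broadcast minimum, then fmin against the maximum (the four-register SME2 fclamp fuses the pair). The scalar equivalent, as a sketch:

    #include <algorithm>

    // Fused activation used by these kernels: clamp to [act_min, act_max].
    // Covers ReLU (min 0, max +inf) and bounded-ReLU style activations alike.
    static inline float fused_clamp(float acc, float act_min, float act_max)
    {
        return std::min(std::max(acc, act_min), act_max);
    }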
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..add666e14e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
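The 4x4-output sibling below follows the same shape as the 3x3 files above, just with a 6x6 input patch and sixteen accumulators. The channel-loop structure common to all of them — a whilelt-predicated main loop plus tail, stepping by the vector length — can be sketched with ACLE SVE intrinsics (illustration only; the shipped kernels are hand-scheduled SME2 assembly, and this sketch compiles only for an SVE target):

    #include <arm_sve.h>

    // One tap of a depthwise accumulation over n_channels, vector-length
    // agnostic: process svcntw() lanes per iteration under a whilelt
    // predicate, so the final partial vector is handled by predication.
    static void accumulate_one_tap(const float *in, const float *tap, float *acc,
                                   unsigned int n_channels)
    {
        for (unsigned int c = 0; c < n_channels; c += svcntw())
        {
            const svbool_t pg = svwhilelt_b32(c, n_channels); // active lanes only
            svfloat32_t a = svld1_f32(pg, acc + c);
            a = svmla_f32_m(pg, a, svld1_f32(pg, in + c), svld1_f32(pg, tap + c));
            svst1_f32(pg, acc + c, a);
        }
    }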
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..efd37c38ec
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,672 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x4\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x6, x5, x21, LSL #2\n"
+ "add x7, x6, x21, LSL #2\n"
+ "add x8, x4, x4\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, x7, x21, LSL #2\n"
+ "add x15, x8, x4\n"
+ "add x14, x16, x21, LSL #2\n"
+ "add x13, x15, x4\n"
+ "add x12, x14, x21, LSL #2\n"
+ "add x11, x13, x4\n"
+ "cbnz x3, 2f\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "sub x21, x20, x3\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "mov x20, #0x10\n"
+ "and x21, x21, #0x3fffff\n"
+ "mul x20, x20, x4\n"
+ "orr x10, x10, x21, LSL #22\n"
+ "orr x10, x10, x20, LSL #38\n"
+ "add x9, x7, x8, LSL #2\n"
+ "add x28, x5, x11, LSL #2\n"
+ "add x27, x7, x15, LSL #2\n"
+ "add x26, x12, x11, LSL #2\n"
+ "add x25, x16, x8, LSL #2\n"
+ "add x24, x5, x4, LSL #2\n"
+ "add x23, x5, x13, LSL #2\n"
+ "add x22, x16, x15, LSL #2\n"
+ "add x21, x6, x11, LSL #2\n"
+ "add x20, x6, x8, LSL #2\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ "add x9, x14, x11, LSL #2\n"
+ ".inst 0xf8aa48ba // rprfm pldonce, x10, [x5]\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ "add x28, x6, x15, LSL #2\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ "add x27, x12, x4, LSL #2\n"
+ ".inst 0xf8aa499a // rprfm pldonce, x10, [x12]\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ "add x26, x7, x4, LSL #2\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ "add x25, x12, x13, LSL #2\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ "add x24, x7, x13, LSL #2\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ "add x23, x5, x8, LSL #2\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ "add x22, x16, x4, LSL #2\n"
+ ".inst 0xf8aa48da // rprfm pldonce, x10, [x6]\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ "add x21, x5, x15, LSL #2\n"
+ ".inst 0xf8aa49da // rprfm pldonce, x10, [x14]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "add x20, x16, x13, LSL #2\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ "add x9, x7, x11, LSL #2\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ "add x28, x14, x8, LSL #2\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ "add x27, x16, x11, LSL #2\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ "add x26, x12, x8, LSL #2\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ "add x25, x14, x15, LSL #2\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ "add x24, x12, x15, LSL #2\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ "add x23, x6, x4, LSL #2\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ "add x22, x6, x13, LSL #2\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ "add x21, x14, x4, LSL #2\n"
+ ".inst 0xf8aa48fa // rprfm pldonce, x10, [x7]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "add x20, x14, x13, LSL #2\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ ".inst 0xf8aa4a1a // rprfm pldonce, x10, [x16]\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
+ "mov x20, #0x4\n"
+ "ld1w { z14.s }, p3/Z, [x17]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "madd x21, x3, x9, x21\n" // offset += tile_j * ld_output_col
+ "mul x21, x21, x20\n" // offset *= output_tile_size
+ "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x28, x28, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "addvl x17, x17, #1\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "add x27, x28, x22, LSL #2\n"
+ "cntw x26\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "addvl x17, x17, #4\n"
+ "add x25, x27, x22, LSL #2\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "add x24, x9, x9\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
+ "addvl x17, x17, #4\n"
+ "cmp x26, %x[n_channels]\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "add x23, x25, x22, LSL #2\n"
+ "add x22, x24, x9\n"
+ "ld1w { z10.s }, p2/Z, [x5]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x26\n"
+ "ld1w { z11.s }, p2/Z, [x5, x11, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "addvl x17, x17, #1\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x26, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z3.s, z9.s\n"
+ "movprfx z17, z14\n fmla z17.s, p3/M, z1.s, z9.s\n"
+ "incw x26\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z18, z14\n fmla z18.s, p3/M, z0.s, z9.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "incw x20\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z30, z14\n fmla z30.s, p3/M, z6.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z5.s, z9.s\n"
+ "movprfx z16, z14\n fmla z16.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "movprfx z31, z14\n fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
+ "fmla z26.s, p3/M, z4.s, z12.s\n"
+ "fmla z17.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x12, x11, LSL #2]\n"
+ "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z3.s, z12.s\n"
+ "movprfx z19, z14\n fmla z19.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z8.s, z22.s\n"
+ "fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z18.s, p3/M, z3.s, z9.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z14.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z24.s, p3/M, z8.s, z9.s\n"
+ "fmla z16.s, p3/M, z5.s, z9.s\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x6]\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14]\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
+ "fmla z17.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "fmla z21.s, p3/M, z2.s, z11.s\n"
+ "fmla z22.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "fmla z24.s, p3/M, z0.s, z9.s\n"
+ "fmla z16.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z3.s, z12.s\n"
+ "fmla z25.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z26.s, p3/M, z0.s, z11.s\n"
+ "fmla z19.s, p3/M, z8.s, z12.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z11.s\n"
+ "fmla z25.s, p3/M, z2.s, z10.s\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z3.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z12.s\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "fmla z16.s, p3/M, z1.s, z9.s\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z7.s, z9.s\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z19.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "addvl x5, x5, #1\n"
+ "fmla z24.s, p3/M, z7.s, z11.s\n"
+ "fmla z25.s, p3/M, z6.s, z11.s\n"
+ "fmla z16.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z3.s, z11.s\n"
+ "fmla z20.s, p3/M, z1.s, z11.s\n"
+ "fmla z21.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x7]\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "fmla z27.s, p3/M, z7.s, z11.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x7, x11, LSL #2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x16]\n"
+ "fmla z21.s, p3/M, z4.s, z11.s\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z12.s\n"
+ "fmla z19.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x16, x11, LSL #2]\n"
+ "addvl x16, x16, #1\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z3.s, z10.s\n"
+ "fmla z20.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x8, LSL #2]\n"
+ "fmla z23.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z7.s, z10.s\n"
+ "fmla z22.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z8.s, z11.s\n"
+ "fmla z17.s, p3/M, z7.s, z11.s\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "fmla z20.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z19.s, p3/M, z5.s, z12.s\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z22.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z10.s\n"
+ "addvl x12, x12, #1\n"
+ "ld1w { z10.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z17.s, p3/M, z8.s, z11.s\n"
+ "fmla z18.s, p3/M, z7.s, z11.s\n"
+ "fmla z19.s, p3/M, z6.s, z11.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "addvl x6, x6, #1\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z5.s, z11.s\n"
+ "fmla z31.s, p3/M, z4.s, z11.s\n"
+ "cmp x26, %x[n_channels]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z26.s, p3/M, z2.s, z11.s\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p1/Z, [x5, x11, LSL #2]\n"
+ "fmla z16.s, p3/M, z7.s, z12.s\n"
+ "fmla z17.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z3.s, z12.s\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "fmla z18.s, p3/M, z8.s, z10.s\n"
+ "fmla z19.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z12.s }, p1/Z, [x7, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z4.s, z10.s\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ ".inst 0xc1afc9bc // fclamp { z28.s-z31.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b8 // fclamp { z24.s-z27.s }, z13.s, z15.s\n"
+ "ld1w { z10.s }, p1/Z, [x5]\n"
+ ".inst 0xc1afc9b0 // fclamp { z16.s-z19.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b4 // fclamp { z20.s-z23.s }, z13.s, z15.s\n"
+ "st1w { z28.s }, p0, [x28]\n"
+ "st1w { z29.s }, p0, [x28, x9, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z30.s }, p0, [x28, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x28, x22, LSL #2]\n"
+ "addvl x28, x28, #1\n"
+ "st1w { z24.s }, p0, [x27]\n"
+ "st1w { z25.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x27, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x27, x22, LSL #2]\n"
+ "addvl x27, x27, #1\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "st1w { z17.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z18.s }, p0, [x25, x24, LSL #2]\n"
+ "st1w { z19.s }, p0, [x25, x22, LSL #2]\n"
+ "addvl x25, x25, #1\n"
+ "st1w { z20.s }, p0, [x23]\n"
+ "st1w { z21.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23, x24, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z21, z14\n fmla z21.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x3, x3, #0x1\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z3.s, z9.s\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z1.s, z9.s\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x2, #0x1\n"
+ "movprfx z30, z14\n fmla z30.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x3, x20\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z7.s, z9.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x2, x2, x21, LT\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z5.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "csel x3, x3, XZR, LT\n"
+ "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x12, x11, LSL #2]\n"
+ "cmp x2, x20\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "movprfx z16, z14\n fmla z16.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z9.s\n"
+ "fmla z25.s, p3/M, z8.s, z12.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z3.s, z12.s\n"
+ "movprfx z31, z14\n fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "movprfx z19, z14\n fmla z19.s, p3/M, z8.s, z18.s\n"
+ "fmla z22.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z9.s\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z17, z14\n fmla z17.s, p3/M, z1.s, z9.s\n"
+ "movprfx z18, z14\n fmla z18.s, p3/M, z0.s, z9.s\n"
+ "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "fmla z28.s, p3/M, z5.s, z9.s\n"
+ "fmla z16.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z14.s }, p2/Z, [x6]\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x11, LSL #2]\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14]\n"
+ "fmla z22.s, p3/M, z7.s, z11.s\n"
+ "fmla z23.s, p3/M, z6.s, z11.s\n"
+ "fmla z29.s, p3/M, z5.s, z11.s\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z0.s, z14.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "fmla z16.s, p3/M, z3.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x11, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z10.s\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z2.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z12.s\n"
+ "fmla z27.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z1.s, z12.s\n"
+ "fmla z23.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z16.s, p3/M, z7.s, z10.s\n"
+ "fmla z17.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z4.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z7.s, z9.s\n"
+ "fmla z25.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z18.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z14.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z12.s\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z14.s\n"
+ "fmla z21.s, p3/M, z6.s, z14.s\n"
+ "fmla z28.s, p3/M, z4.s, z14.s\n"
+ "fmla z29.s, p3/M, z3.s, z14.s\n"
+ "fmla z16.s, p3/M, z1.s, z14.s\n"
+ "fmla z17.s, p3/M, z0.s, z14.s\n"
+ "ld1w { z14.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z10.s\n"
+ "fmla z25.s, p3/M, z1.s, z10.s\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x7]\n"
+ "fmla z18.s, p3/M, z2.s, z14.s\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z10.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z22.s, p3/M, z8.s, z14.s\n"
+ "fmla z23.s, p3/M, z7.s, z14.s\n"
+ "fmla z30.s, p3/M, z5.s, z14.s\n"
+ "fmla z31.s, p3/M, z4.s, z14.s\n"
+ "fmla z19.s, p3/M, z1.s, z14.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z25.s, p3/M, z2.s, z9.s\n"
+ "fmla z26.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x7, x11, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z14.s }, p2/Z, [x16]\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z18.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x16, x11, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z14.s\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z16.s, p3/M, z0.s, z14.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x8, LSL #2]\n"
+ "fmla z19.s, p3/M, z2.s, z9.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "fmla z30.s, p3/M, z6.s, z11.s\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z9.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z18.s, p3/M, z4.s, z10.s\n"
+ "fmla z19.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z14.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z10.s\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "fmla z31.s, p3/M, z6.s, z10.s\n"
+ "fmla z17.s, p3/M, z8.s, z14.s\n"
+ "ld1w { z11.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "fmla z18.s, p3/M, z7.s, z14.s\n"
+ "fmla z19.s, p3/M, z6.s, z14.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "fmla z28.s, p3/M, z7.s, z10.s\n"
+ "fmla z29.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "fmla z31.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z5.s, z12.s\n"
+ "fmla z19.s, p3/M, z4.s, z12.s\n"
+ ".inst 0xc1afc9b8 // fclamp { z24.s-z27.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b4 // fclamp { z20.s-z23.s }, z13.s, z15.s\n"
+ "st1w { z24.s }, p0, [x28]\n"
+ ".inst 0xc1afc9bc // fclamp { z28.s-z31.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b0 // fclamp { z16.s-z19.s }, z13.s, z15.s\n"
+ "st1w { z25.s }, p0, [x28, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x28, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x28, x22, LSL #2]\n"
+ "st1w { z20.s }, p0, [x27]\n"
+ "st1w { z21.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z22.s }, p0, [x27, x24, LSL #2]\n"
+ "st1w { z23.s }, p0, [x27, x22, LSL #2]\n"
+ "st1w { z28.s }, p0, [x25]\n"
+ "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x25, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x25, x22, LSL #2]\n"
+ "st1w { z16.s }, p0, [x23]\n"
+ "st1w { z17.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z18.s }, p0, [x23, x24, LSL #2]\n"
+ "st1w { z19.s }, p0, [x23, x22, LSL #2]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
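
For reference, the tile-loop bookkeeping in the direct kernel above reduces to a few lines of scalar arithmetic; the sketch below is a reading aid only, using the names from the asm comments (outptr, ld_output_row, ld_output_col, tile_i, tile_j come from the Args struct, and the factor of 4 is the 4x4 output tile). The lsl/orr pair feeding the raw-encoded "rprfm pldonce" prefetches appears to pack a range-prefetch descriptor (length in bytes, a repeat count at bit 22, a stride at bit 38) into one register; those prefetches and the multi-vector fclamp (which clamps four accumulator registers to [min, max] in a single instruction) are emitted as .inst words, presumably because assembler support for these SME2-era instructions was still limited when the code was generated.

    // Scalar sketch of the "Tile loop" address arithmetic and the channel-tail
    // tile advance above. Hypothetical helper names; not part of the patch.
    #include <cstdint>

    static float *output_tile_base(float *outptr, uint64_t tile_i, uint64_t tile_j,
                                   uint64_t ld_output_row, uint64_t ld_output_col)
    {
        uint64_t offset = tile_i * ld_output_row; // offset = tile_i * ld_output_row
        offset += tile_j * ld_output_col;         // offset += tile_j * ld_output_col
        offset *= 4;                              // offset *= output_tile_size
        return outptr + offset;                   // outptrs[0] += offset * sizeof(float)
    }

    static bool advance_tile(uint64_t &tile_i, uint64_t &tile_j,
                             uint64_t n_tile_rows, uint64_t n_tile_cols)
    {
        // Matches the add/cmp/csel sequence in the channel tail: step to the
        // next tile column, wrapping to the start of the next row; the final
        // cmp/blt keeps looping while tile_i < n_tile_rows.
        if (++tile_j == n_tile_cols) { tile_j = 0; ++tile_i; }
        return tile_i < n_tile_rows;
    }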
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..2e2a45bab0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ld1w { z13.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "cntw x16\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "mov x15, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x13, XZR, x16\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z29, z13\n fmla z29.s, p3/M, z4.s, z9.s\n"
+ "movprfx z16, z13\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "incw x13\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "mov p1.b, p2.b\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "movprfx z17, z13\n fmla z17.s, p3/M, z7.s, z9.s\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "movprfx z18, z13\n fmla z18.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ldr x23, [x17, #0x38]\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "movprfx z19, z13\n fmla z19.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z22.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z25.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z21.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x50]\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z17.s, p3/M, z8.s, z12.s\n"
+ "ldr x26, [x17, #0x60]\n"
+ "fmla z18.s, p3/M, z7.s, z12.s\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "fmla z29.s, p3/M, z7.s, z9.s\n"
+ "fmla z19.s, p3/M, z6.s, z12.s\n"
+ "ldr x21, [x17, #0x58]\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z13\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z8.s, z21.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0x78]\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "ldr x22, [x17, #0x80]\n"
+ "movprfx z21, z13\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "movprfx z22, z13\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x17, #0x88]\n"
+ "ld1w { z13.s }, p3/Z, [x8]\n"
+ "fmla z28.s, p3/M, z8.s, z9.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z16.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x90]\n"
+ "fmla z17.s, p3/M, z0.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x98]\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x26, [x17, #0xa0]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "ldr x11, [x14, #0x8]\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z21.s, p3/M, z2.s, z11.s\n"
+ "ldr x9, [x14, #0x18]\n"
+ "fmla z22.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z16.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z18.s, p3/M, z3.s, z11.s\n"
+ "fmla z29.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xc8]\n"
+ "fmla z17.s, p3/M, z5.s, z12.s\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "fmla z19.s, p3/M, z3.s, z12.s\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x28, [x17, #0xd8]\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
+ "fmla z21.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xd0]\n"
+ "fmla z16.s, p3/M, z7.s, z11.s\n"
+ "fmla z17.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "fmla z29.s, p3/M, z3.s, z11.s\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z25.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x27, [x17, #0xe0]\n"
+ "fmla z18.s, p3/M, z8.s, z9.s\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z9.s\n"
+ "ldr x26, [x17, #0xe8]\n"
+ "fmla z19.s, p3/M, z7.s, z9.s\n"
+ "fmla z30.s, p3/M, z5.s, z9.s\n"
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xf0]\n"
+ "fmla z16.s, p3/M, z2.s, z11.s\n"
+ "fmla z17.s, p3/M, z1.s, z11.s\n"
+ "fmla z18.s, p3/M, z0.s, z11.s\n"
+ "fmla z28.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xf8]\n"
+ "fmla z29.s, p3/M, z6.s, z10.s\n"
+ "fmla z24.s, p3/M, z4.s, z10.s\n"
+ "fmla z25.s, p3/M, z3.s, z10.s\n"
+ "fmla z20.s, p3/M, z1.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z2.s, z9.s\n"
+ "fmla z18.s, p3/M, z1.s, z9.s\n"
+ "fmla z19.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x108]\n"
+ "fmla z16.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "fmla z24.s, p3/M, z0.s, z11.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x110]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z9.s\n"
+ "ldr x21, [x17, #0x118]\n"
+ "fmla z20.s, p3/M, z0.s, z11.s\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z10.s\n"
+ "fmla z19.s, p3/M, z8.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "fmla z20.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z5.s, z9.s\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "fmla z24.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z12.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z22.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldp x20, x25, [x17, #0x0]\n"
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z9.s }, p0/Z, [x20, x16, LSL #2]\n"
+ "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z0.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldp x20, x24, [x17, #0x10]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "incw x15\n"
+ "ld1w { z11.s }, p0/Z, [x20, x16, LSL #2]\n"
+ ".inst 0xc1afc9d0 // fclamp { z16.s-z19.s }, z14.s, z15.s\n"
+ "st1w { z16.s }, p1, [x12, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "st1w { z17.s }, p1, [x11, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "fmla z26.s, p3/M, z8.s, z0.s\n"
+ "st1w { z18.s }, p1, [x10, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z27.s, p3/M, z7.s, z0.s\n"
+ ".inst 0xc1afc9dc // fclamp { z28.s-z31.s }, z14.s, z15.s\n"
+ "st1w { z19.s }, p1, [x9, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z20.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z3.s, z12.s\n"
+ "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmla z22.s, p3/M, z5.s, z0.s\n"
+ "fmla z23.s, p3/M, z4.s, z0.s\n"
+ "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
+ "ld1w { z10.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x16, LSL #2]\n"
+ "incw x16\n"
+ "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z24.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "st1w { z25.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
+ "st1w { z26.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ ".inst 0xc1afc9d4 // fclamp { z20.s-z23.s }, z14.s, z15.s\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "addvl x8, x8, #1\n"
+ "st1w { z20.s }, p1, [x23, x13, LSL #2]\n"
+ "st1w { z21.s }, p1, [x22, x13, LSL #2]\n"
+ "st1w { z22.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z23.s }, p1, [x20, x13, LSL #2]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z29, z13\n fmla z29.s, p3/M, z4.s, z9.s\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z8.s, z9.s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "incw x13\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ldr x23, [x17, #0x28]\n"
+ "movprfx z21, z13\n fmla z21.s, p3/M, z7.s, z9.s\n"
+ "movprfx z22, z13\n fmla z22.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ldr x22, [x17, #0x38]\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x40]\n"
+ "fmla z20.s, p3/M, z0.s, z10.s\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z25.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x50]\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "ldr x26, [x17, #0x60]\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "movprfx z16, z13\n fmla z16.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "fmla z29.s, p3/M, z7.s, z9.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ldr x20, [x17, #0x58]\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z13\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "movprfx z19, z13\n fmla z19.s, p3/M, z8.s, z17.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x23, [x17, #0x78]\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "ldr x22, [x17, #0x80]\n"
+ "movprfx z17, z13\n fmla z17.s, p3/M, z1.s, z9.s\n"
+ "movprfx z18, z13\n fmla z18.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla z28.s, p3/M, z8.s, z9.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "fmla z16.s, p3/M, z2.s, z9.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x90]\n"
+ "fmla z21.s, p3/M, z0.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x26, [x17, #0xa0]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "ldr x11, [x14, #0x8]\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "ldr x9, [x14, #0x18]\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "fmla z16.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "fmla z23.s, p3/M, z5.s, z13.s\n"
+ "fmla z31.s, p3/M, z2.s, z13.s\n"
+ "fmla z30.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z13.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
+ "fmla z19.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z20.s, p3/M, z5.s, z10.s\n"
+ "fmla z28.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "fmla z23.s, p3/M, z3.s, z13.s\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "fmla z31.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x28, [x17, #0xd8]\n"
+ "fmla z16.s, p3/M, z7.s, z9.s\n"
+ "fmla z17.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xd0]\n"
+ "fmla z20.s, p3/M, z7.s, z12.s\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x27, [x17, #0xe0]\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z18.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "ldr x26, [x17, #0xe8]\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z26.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xf0]\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "fmla z22.s, p3/M, z0.s, z12.s\n"
+ "fmla z28.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xf8]\n"
+ "fmla z29.s, p3/M, z6.s, z11.s\n"
+ "fmla z24.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z3.s, z11.s\n"
+ "fmla z16.s, p3/M, z1.s, z11.s\n"
+ "fmla z17.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z21.s, p3/M, z2.s, z9.s\n"
+ "fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z23.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x108]\n"
+ "fmla z20.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z12.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x110]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z19.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla z16.s, p3/M, z0.s, z12.s\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z18.s, p3/M, z3.s, z9.s\n"
+ "fmla z23.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmla z26.s, p3/M, z6.s, z9.s\n"
+ "fmla z16.s, p3/M, z5.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "fmla z19.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z6.s, z12.s\n"
+ "fmla z24.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z12.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
+ "fmla z17.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z4.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "fmla z22.s, p3/M, z5.s, z11.s\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z8.s, z12.s\n"
+ "fmla z18.s, p3/M, z7.s, z12.s\n"
+ "fmla z19.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z0.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ ".inst 0xc1afc9d4 // fclamp { z20.s-z23.s }, z14.s, z15.s\n"
+ "st1w { z20.s }, p0, [x12, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "st1w { z21.s }, p0, [x11, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z25.s, p3/M, z6.s, z13.s\n"
+ "fmla z26.s, p3/M, z8.s, z0.s\n"
+ "st1w { z22.s }, p0, [x10, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z27.s, p3/M, z7.s, z0.s\n"
+ ".inst 0xc1afc9dc // fclamp { z28.s-z31.s }, z14.s, z15.s\n"
+ "st1w { z23.s }, p0, [x9, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z16.s, p3/M, z4.s, z13.s\n"
+ "fmla z17.s, p3/M, z3.s, z13.s\n"
+ "st1w { z28.s }, p0, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmla z18.s, p3/M, z5.s, z0.s\n"
+ "fmla z19.s, p3/M, z4.s, z0.s\n"
+ "st1w { z29.s }, p0, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
+ ".inst 0xc1afc9d0 // fclamp { z16.s-z19.s }, z14.s, z15.s\n"
+ "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "st1w { z31.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "st1w { z24.s }, p0, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "st1w { z25.s }, p0, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "st1w { z26.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1w { z27.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "st1w { z16.s }, p0, [x23, x13, LSL #2]\n"
+ "st1w { z17.s }, p0, [x22, x13, LSL #2]\n"
+ "st1w { z18.s }, p0, [x21, x13, LSL #2]\n"
+ "st1w { z19.s }, p0, [x20, x13, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
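
The indirect variant above performs the same 3x3/stride-1/4x4 computation through a table of pointers instead of strides: the Args constructor permutes the caller's 36 input pointers (a row-major 6x6 patch, since a 4x4 output of a 3x3 stride-1 kernel reads 6x6 inputs) into the order the asm first consumes them, so the kernel can walk the table with plain ldp/ldr. Channels are processed a vector length at a time under whilelt predication, and from the load order (z13 first, then z0-z8) the per-block params layout appears to be one bias vector followed by the nine kernel weights. The scalar reference below is illustrative only; bias and weights are hypothetical unpacked views of params, and the row-major pointer geometry is an assumption consistent with the permutation table.

    #include <algorithm>

    // Hedged scalar reference for the indirect kernel above (assumptions as
    // noted in the text: row-major 6x6 input_ptrs, row-major 4x4 outptrs,
    // unpacked bias/weights views of params).
    static void depthwise_3x3_s1_4x4_reference(
        const float *const *input_ptrs, float *const *outptrs,
        const float *bias, const float *weights, // weights: 9 * n_channels, kernel-major
        unsigned int n_channels, float act_min, float act_max)
    {
        for (unsigned int c = 0; c < n_channels; c++)
            for (int oi = 0; oi < 4; oi++)
                for (int oj = 0; oj < 4; oj++)
                {
                    float acc = bias[c];
                    for (int ki = 0; ki < 3; ki++)
                        for (int kj = 0; kj < 3; kj++)
                            acc += weights[(ki * 3 + kj) * n_channels + c]
                                 * input_ptrs[(oi + ki) * 6 + (oj + kj)][c];
                    // fclamp applies these activation bounds four vectors at a time.
                    outptrs[oi * 4 + oj][c] = std::min(std::max(acc, act_min), act_max);
                }
    }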
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..dcffffeb21
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
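
The header above is the whole public surface of the stride-2 kernel pair: the strategy class simply wires the two entry points declared above into the depthfirst framework. A minimal usage sketch follows, assuming the include path and a valid CPUInfo pointer supplied by the framework (everything else comes from the declarations above). Note that in the direct implementation defined next, the input-offset multiplier becomes #0x4 because kernel_stride * output_size is 2 x 2 for this variant.

    #include "sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"

    // Illustrative only: how a caller reaches the kernels this header registers
    // (ci is assumed to be a valid CPUInfo pointer from the framework).
    static void inspect_strategy(const CPUInfo *ci)
    {
        arm_conv::depthwise::sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst strat(ci);

        auto vl       = strat.get_vl_type();         // arm_gemm::VLType::SME
        auto direct   = strat.get_direct_kernel();   // tiled variant (generic_direct.cpp)
        auto indirect = strat.get_indirect_kernel(); // pointer-table variant
        (void) vl; (void) direct; (void) indirect;
    }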
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..066b935486
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x4\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x6, x5, x21, LSL #2\n"
+ "add x7, x6, x21, LSL #2\n"
+ "add x8, x4, x4\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, x7, x21, LSL #2\n"
+ "add x15, x8, x4\n"
+ "add x14, x16, x21, LSL #2\n"
+ "add x13, x15, x4\n"
+ "cbnz x3, 2f\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "sub x21, x20, x3\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x12, %x[n_channels], #0x2\n"
+ "mov x20, #0x10\n"
+ "and x21, x21, #0x3fffff\n"
+ "mul x20, x20, x4\n"
+ "orr x12, x12, x21, LSL #22\n"
+ "orr x12, x12, x20, LSL #38\n"
+ "add x27, x7, x8, LSL #2\n"
+ "add x26, x5, x4, LSL #2\n"
+ "add x25, x5, x15, LSL #2\n"
+ "add x24, x5, x13, LSL #2\n"
+ "add x23, x6, x4, LSL #2\n"
+ "add x22, x5, x8, LSL #2\n"
+ "add x21, x6, x15, LSL #2\n"
+ "add x20, x6, x13, LSL #2\n"
+ "add x11, x6, x8, LSL #2\n"
+ "add x10, x16, x4, LSL #2\n"
+ "add x9, x7, x4, LSL #2\n"
+ "add x28, x16, x15, LSL #2\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ "add x27, x7, x15, LSL #2\n"
+ ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ "add x26, x16, x13, LSL #2\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ "add x25, x7, x13, LSL #2\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ "add x24, x14, x4, LSL #2\n"
+ ".inst 0xf8ac48da // rprfm pldonce, x12, [x6]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ "add x23, x16, x8, LSL #2\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ "add x22, x14, x15, LSL #2\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ "add x21, x14, x8, LSL #2\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "add x20, x14, x13, LSL #2\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
+ "mov x20, #0x2\n"
+ "ld1w { z22.s }, p3/Z, [x17]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "madd x21, x3, x25, x21\n" // offset += tile_j * ld_output_col
+ "addvl x17, x17, #1\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "mul x21, x21, x20\n" // offset *= output_tile_size
+ "cntw x23\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "addvl x17, x17, #4\n"
+ "add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "addvl x17, x17, #4\n"
+ "ld1rw { z24.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cmp x23, %x[n_channels]\n"
+ "add x22, x24, x22, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x5]\n"
+ "addvl x17, x17, #1\n"
+ "ld1w { z11.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x6]\n"
+ "ld1w { z15.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z28, z22\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z22\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "whilelt p1.s, x23, %x[n_channels]\n"
+ "incw x21\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "incw x23\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z27.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "addvl x5, x5, #1\n"
+ "fmla z28.s, p3/M, z4.s, z15.s\n"
+ "fmla z29.s, p3/M, z4.s, z27.s\n"
+ "ld1w { z25.s }, p2/Z, [x16]\n"
+ "addvl x6, x6, #1\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "incw x20\n"
+ "movprfx z30, z22\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z25.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z10.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "addvl x7, x7, #1\n"
+ "fmla z31.s, p3/M, z2.s, z22.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14]\n"
+ "ld1w { z17.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "addvl x16, x16, #1\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "ld1w { z22.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "cmp x23, %x[n_channels]\n"
+ ".inst 0xc1b8cb5c // fclamp { z28.s-z31.s }, z26.s, z24.s\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "addvl x14, x14, #1\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "ld1w { z10.s }, p1/Z, [x5]\n"
+ "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+ "addvl x22, x22, #1\n"
+ "ld1w { z11.s }, p1/Z, [x5, x4, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x5, x15, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x5, x13, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x6]\n"
+ "ld1w { z15.s }, p1/Z, [x6, x4, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x5, x8, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z28, z22\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z22\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x3, x3, #0x1\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z20.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z28.s, p3/M, z4.s, z15.s\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x16]\n"
+ "cmp x3, x20\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ "add x20, x2, #0x1\n"
+ "movprfx z30, z22\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "csel x2, x2, x20, LT\n"
+ "fmla z28.s, p3/M, z5.s, z20.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "csel x3, x3, XZR, LT\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "cmp x2, x21\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14]\n"
+ "ld1w { z17.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ ".inst 0xc1b8cb5c // fclamp { z28.s-z31.s }, z26.s, z24.s\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
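
The file above adds the direct, tile-loop form of the SME2 fp32 3x3 stride-2 depthwise kernel: each channel-loop iteration seeds four accumulators from the bias vector (z22), applies the nine per-channel weights with predicated fmla, fuses the activation into a single fclamp over z28-z31, and stores a 2x2 output tile. A scalar, single-channel reference of the arithmetic (a sketch with illustrative names, not the library's API):

    #include <algorithm>
    #include <cstddef>

    // 3x3 depthwise multiply-accumulate at stride 2 with a fused clamp, for
    // one channel; strides are in elements, matching the LSL #2 addressing.
    void depthwise_3x3_s2_ref(const float *in, std::size_t ld_in_row, std::size_t ld_in_col,
                              const float w[9], float bias,
                              float *out, std::size_t ld_out_row, std::size_t ld_out_col,
                              unsigned out_rows, unsigned out_cols,
                              float act_min, float act_max)
    {
        for (unsigned i = 0; i < out_rows; i++)
        {
            for (unsigned j = 0; j < out_cols; j++)
            {
                float acc = bias; // accumulators are seeded from z22 above
                for (unsigned ki = 0; ki < 3; ki++)
                    for (unsigned kj = 0; kj < 3; kj++)
                        acc += w[ki * 3 + kj] * in[(2 * i + ki) * ld_in_row + (2 * j + kj) * ld_in_col];
                // counterpart of "fclamp { z28.s-z31.s }, z26.s, z24.s"
                out[i * ld_out_row + j * ld_out_col] = std::min(std::max(acc, act_min), act_max);
            }
        }
    }

The channel tail (label 4) handles the final partial vector of channels, then advances tile_j/tile_i and branches back while tiles remain.
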
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..dc7a40ff54
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ptrue p3.b\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ld1w { z26.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "ldp x14, x13, [x20, #0x0]\n"
+ "cntw x12\n"
+ ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+ "ldp x28, x26, [x16, #0x0]\n"
+ "addvl x15, x15, #4\n"
+ "cmp x12, %x[n_channels]\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ld1rw { z24.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x27, XZR, x12\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ld1w { z8.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1w { z9.s }, p2/Z, [x28, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z15.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z28, z26\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z26\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x16, #0x40]\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x16, #0x48]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z22.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x50]\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "fmla z28.s, p3/M, z4.s, z15.s\n"
+ "fmla z29.s, p3/M, z4.s, z22.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z23.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x16, #0x60]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "movprfx z30, z26\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z26\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ldr x20, [x16, #0x88]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z13.s\n"
+ "ld1w { z4.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla z31.s, p3/M, z2.s, z4.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0xa0]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z4.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldp x20, x26, [x16, #0x0]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ld1w { z26.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "incw x9\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ld1w { z9.s }, p1/Z, [x20, x12, LSL #2]\n"
+ "incw x27\n"
+ "mov p0.b, p2.b\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1w { z10.s }, p1/Z, [x26, x12, LSL #2]\n"
+ "whilelt p2.s, x9, %x[n_channels]\n"
+ ".inst 0xc1b8cb3c // fclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "ld1w { z11.s }, p1/Z, [x25, x12, LSL #2]\n"
+ "st1w { z28.s }, p0, [x14, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x24, x12, LSL #2]\n"
+ "st1w { z29.s }, p0, [x13, x27, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x23, x12, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x22, x12, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x27, LSL #2]\n"
+ "ld1w { z15.s }, p1/Z, [x21, x12, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x12, LSL #2]\n"
+ "incw x12\n"
+ "cmp x12, %x[n_channels]\n"
+ ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ "ld1w { z8.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z28, z26\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z26\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x16, #0x40]\n"
+ "incw x27\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x16, #0x48]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x50]\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "fmla z28.s, p3/M, z4.s, z15.s\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x16, #0x60]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "movprfx z30, z26\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z26\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z20.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ldr x20, [x16, #0x88]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0xa0]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "mov p0.b, p2.b\n"
+ ".inst 0xc1b8cb3c // fclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "st1w { z28.s }, p0, [x14, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x13, x27, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x27, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
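
generic_indirect.cpp computes the same 3x3, stride-2, 2x2-output tile, but through a pointer table instead of strided addressing: a 2x2 output tile at stride 2 under a 3x3 window needs a 5x5 input patch ((2 - 1) * 2 + 3 = 5 rows and columns), and the Args constructor copies those 25 pointers into inptrs in the order the assembly consumes them. Every load is then just pointer plus channel offset (e.g. [x28, x9, LSL #2]), which lets the caller, for example, point out-of-bounds rows at a padding buffer with no branching in the hot loop. A sketch of the arithmetic with plain row-major indexing (the real table is permuted) and illustrative names:

    #include <algorithm>

    // One channel of the 2x2-output, 3x3, stride-2 depthwise MLA, with every
    // input element reached through a caller-supplied pointer.
    void indirect_2x2_ref(const float *const inptrs[25], // 5x5 field, row-major here
                          const float w[9], float bias,
                          float *const outptrs[4],       // 2x2 tile, row-major
                          unsigned c, float act_min, float act_max)
    {
        for (unsigned oi = 0; oi < 2; oi++)
        {
            for (unsigned oj = 0; oj < 2; oj++)
            {
                float acc = bias;
                for (unsigned ki = 0; ki < 3; ki++)
                    for (unsigned kj = 0; kj < 3; kj++)
                        acc += w[ki * 3 + kj] * inptrs[(2 * oi + ki) * 5 + (2 * oj + kj)][c];
                outptrs[oi * 2 + oj][c] = std::min(std::max(acc, act_min), act_max);
            }
        }
    }
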
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp
new file mode 100644
index 0000000000..061b0a1e2e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
+class sme2_fp32_planar_3x3_s1_4rows_mla_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32_planar_3x3_s1_4rows_mla_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32_planar_3x3_s1_4rows_mla_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
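
This header only declares the assembly entry point and registers it as a PlanarStrategy: the constexpr constants describe the kernel shape, strides, rows per output strip and SME vector-length type, and get_kernel returns the implementation. The constants also fix the input-row footprint of a strip, which is where the 6u in the generic.cpp that follows comes from; a compile-time sanity check (not library code):

    constexpr unsigned output_rows = 4u, kernel_rows = 3u, stride_rows = 1u;
    constexpr unsigned input_rows_per_strip = (output_rows - 1u) * stride_rows + kernel_rows;
    static_assert(input_rows_per_strip == 6u,
                  "matches the 6u in the pad_bottom computation of generic.cpp");
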
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
new file mode 100644
index 0000000000..a385893146
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x6\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z2.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z24.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z20.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z20.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x14, #0x1\n"
+ "orr x24, x20, %x[ld_in_col], LSL #18\n"
+ "mov z21.d, z20.d\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa0404ae6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x23]\n"
+ "orr x24, x16, x24, LSL #20\n"
+ "mov x22, #0x6\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z10.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "add x21, x17, x7\n"
+ ".inst 0xa1404ae0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x23]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "mov z22.d, z20.d\n"
+ "mov z23.d, z20.d\n"
+ "ld1w { z9.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0xa0404ae4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x23]\n"
+ "lsl x24, x24, #0x2\n"
+ "sub x22, x22, x21\n"
+ "ld1w { z1.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "madd x20, x20, x17, x13\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040e80 // mova za.d[x8, #0], { z20.d-z23.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040e81 // mova za.d[x8, #1], { z20.d-z23.d }\n"
+ "mov x10, #0x2\n"
+ "ldp x9, x28, [x22], #0x10\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x25, x24, [x22], #0x10\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x10\n"
+ "csel x20, x21, x10, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x10, x10, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ "sub x11, x11, x21\n"
+ ".inst 0xc1b8c84c // fclamp { z12.s-z15.s }, z2.s, z24.s\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z13.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z14.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z15.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x17, x7\n"
+ "bne 10f\n"
+ "cbz x10, 8f\n"
+ "cmp x10, #0x1\n"
+ "sub x14, x14, x10\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13619c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z6.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13019e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z0.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc1341a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z4.s\n"
+ "7:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
+ ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13819c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z8.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc13019c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z0.s\n"
+ ".inst 0xc13519e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z5.s\n"
+ ".inst 0xc13419e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z4.s\n"
+ "8:" // Unpadded: 0 priming loads
+ "cbz x14, 16f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "sub x14, x14, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "cmp x14, x11\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x14, x11, LT\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "9:" // Unpadded: Main loop
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b8c84c // fclamp { z12.s-z15.s }, z2.s, z24.s\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
+ "st1w { z13.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z14.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z15.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "bgt 9b\n"
+ "b 15f\n"
+ "10:" // Padded
+ "cbz x10, 13f\n"
+ "cmp x10, #0x1\n"
+ "sub x14, x14, x10\n"
+ "beq 12f\n"
+ "11:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z11.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301980 // fmla za.s[x8, 0], { z12.s-z15.s }, z0.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13419a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z4.s\n"
+ "12:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z11.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1301981 // fmla za.s[x8, 1], { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc13519a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z5.s\n"
+ ".inst 0xc13419a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z4.s\n"
+ "13:" // Padded: 0 priming loads
+ "cbz x14, 16f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "sub x14, x14, #0x1\n"
+ "sub x11, x11, #0x1\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "cmp x14, x11\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "csel x21, x14, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "14:" // Padded: Main loop
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b8c850 // fclamp { z16.s-z19.s }, z2.s, z24.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
+ "st1w { z17.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z18.s }, p1, [x25]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 14b\n"
+ "15:" // Main loop tail
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b8c850 // fclamp { z16.s-z19.s }, z2.s, z24.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
+ "st1w { z17.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z18.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "16:" // Main loop skip tail
+ "cbz x11, 18f\n"
+ "17:" // Right padding loop
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1b8c848 // fclamp { z8.s-z11.s }, z2.s, z24.s\n"
+ "st1w { z8.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "st1w { z9.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z10.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z11.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 17b\n"
+ "18:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
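
The planar kernel above keeps three strips of ZA accumulators in flight (za.d[x8, #0..2]). Per input column, the three vertically shifted row groups ({z25-z28}, {z26-z29}, {z27-z30}) update slot 0 with the last weight column (z10, z9, z1), slot 1 with the middle column (z7, z8, z5) and slot 2 with the first (z6, z0, z4); slot 0 is then complete, so it is clamped and stored, x8 is incremented to rotate the slots, and the new back slot is re-seeded with the bias group (mova za.d[x8, #2], { z20.d-z23.d }). A minimal scalar model of that rotation, collapsing the four output rows and the vector length down to single floats (illustrative, not library code):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const float w[3]  = {0.25f, 0.5f, 0.25f};      // one 3-tap weight column
        const float in[8] = {1, 2, 3, 4, 5, 6, 7, 8};  // one stream of input columns
        const float bias = 1.0f, lo = 0.0f, hi = 100.0f;
        float acc[3] = {bias, bias, bias};             // models za.d[x8, #0..2]

        for (int t = 0; t < 8; t++)
        {
            acc[0] += w[2] * in[t]; // slot due next gets the last tap
            acc[1] += w[1] * in[t];
            acc[2] += w[0] * in[t];
            if (t >= 2)             // first complete 3-wide window
                std::printf("%g\n", std::min(std::max(acc[0], lo), hi));
            acc[0] = acc[1];        // "add x8, x8, #0x1" rotation
            acc[1] = acc[2];
            acc[2] = bias;          // "mova za.d[x8, #2]" re-seed
        }
        return 0;
    }

The priming sections (labels 6-7 unpadded, 11-12 padded) play the role of the first two iterations of this loop, filling the partially complete slots before any output falls due.
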
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp
new file mode 100644
index 0000000000..711f7f479a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
+class sme2_fp32_planar_3x3_s2_4rows_mla_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32_planar_3x3_s2_4rows_mla_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32_planar_3x3_s2_4rows_mla_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
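
The stride-2 planar header mirrors the stride-1 one above; only stride_rows/stride_cols change. The row footprint of a 4-row strip becomes (4 - 1) * 2 + 3 = 9, which is the 9u the generic.cpp below uses for pad_bottom, and along columns the same doubling shows up as two input columns consumed per output column (the lsr x20, x14, #0x1 in its loop setup). The corresponding compile-time check (again not library code):

    constexpr unsigned s2_input_rows_per_strip = (4u - 1u) * 2u + 3u;
    static_assert(s2_input_rows_per_strip == 9u,
                  "matches the 9u in the pad_bottom computation of generic.cpp");
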
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
new file mode 100644
index 0000000000..26315101b4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
@@ -0,0 +1,650 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x9\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z7.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z9.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z12.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x14, #0x1\n"
+ "orr x24, x20, %x[ld_in_col], LSL #18\n"
+ "mov z13.d, z12.d\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa1404ae2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x23]\n"
+ "orr x24, x16, x24, LSL #20\n"
+ "mov x22, #0x9\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z8.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "add x21, x17, x7\n"
+ ".inst 0xa0404ae0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x23]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "mov z14.d, z12.d\n"
+ "mov z15.d, z12.d\n"
+ "ld1w { z5.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0xa1404ae3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x23]\n"
+ "lsl x24, x24, #0x2\n"
+ "sub x22, x22, x21\n"
+ "ld1w { z6.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "madd x20, x20, x17, x13\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x26, x25, [x23], #0x10\n"
+ "ldp x24, x23, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
+ ".inst 0xc1a9c8f4 // fclamp { z20.s-z23.s }, z7.s, z9.s\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x11, x11, x21\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z20.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z22.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z23.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x17, x7\n"
+ "bne 10f\n"
+ "cbz x22, 8f\n"
+ "cmp x22, #0x1\n"
+ "sub x14, x14, x22\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1321a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z2.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ ".inst 0xc1331a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z3.s\n"
+ "7:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z10.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z1.s\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ ".inst 0xc13b1b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z11.s\n"
+ "8:" // Unpadded: 0 priming loads
+ "cmp x14, #0x2\n"
+ "blt 16f\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "sub x14, x14, #0x2\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "lsr x20, x14, #0x1\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x11\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "csel x22, x20, x11, LT\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "and x14, x14, #0x1\n"
+ "ld1w { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, x22\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ "cbz x22, 15f\n"
+ "9:" // Unpadded: Main loop
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a9c8f4 // fclamp { z20.s-z23.s }, z7.s, z9.s\n"
+ "st1w { z20.s }, p1, [x10]\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ "add x9, x9, x27, LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ "st1w { z22.s }, p1, [x26]\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ ".inst 0xc13b1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z11.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z23.s }, p1, [x25]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "bgt 9b\n"
+ "b 15f\n"
+ "10:" // Padded
+ "cbz x22, 13f\n"
+ "cmp x22, #0x1\n"
+ "sub x14, x14, x22\n"
+ "beq 12f\n"
+ "11:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1321b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z2.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1331b80 // fmla za.s[x8, 0], { z28.s-z31.s }, z3.s\n"
+ "12:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13b1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z11.s\n"
+ "13:" // Padded: 0 priming loads
+ "cmp x14, #0x2\n"
+ "blt 16f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "sub x14, x14, #0x2\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "sub x11, x11, #0x1\n"
+ "lsr x20, x14, #0x1\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "cmp x20, x11\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ "csel x22, x20, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "and x14, x14, #0x1\n"
+ "sub x11, x11, x22\n"
+ "cbz x22, 15f\n"
+ "14:" // Padded: Main loop
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a9c8fc // fclamp { z28.s-z31.s }, z7.s, z9.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
+ "mov x12, #0x0\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "add x9, x9, x27, LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ "add x25, x25, x23, LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "bgt 14b\n"
+ "15:" // Main loop tail
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a9c8fc // fclamp { z28.s-z31.s }, z7.s, z9.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "add x9, x9, x27, LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ "16:" // Main loop skip tail
+ "cbz x14, 17f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z21.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1381aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1351b80 // fmla za.s[x8, 0], { z28.s-z31.s }, z5.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0xc1361ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xc1301b81 // fmla za.s[x8, 1], { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a9c8f0 // fclamp { z16.s-z19.s }, z7.s, z9.s\n"
+ "st1w { z16.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc1331ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z3.s\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z17.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "st1w { z19.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "17:" // Tail input: End
+ "cbz x11, 19f\n"
+ "18:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1a9c8e0 // fclamp { z0.s-z3.s }, z7.s, z9.s\n"
+ "st1w { z0.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z2.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z3.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "bgt 18b\n"
+ "19:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp
new file mode 100644
index 0000000000..71487e08b6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
+class sme2_fp32_planar_5x5_s1_4rows_mla_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32_planar_5x5_s1_4rows_mla_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32_planar_5x5_s1_4rows_mla_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
new file mode 100644
index 0000000000..3741b973b4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
@@ -0,0 +1,883 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x8\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z16.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z17.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "mov z29.d, z28.d\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "orr x23, x17, x23, LSL #20\n"
+ "mov x22, #0x8\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x13\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x7, x20, x13\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x10, #0x4\n"
+ "ldp x9, x28, [x22], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "ldp x25, x24, [x22], #0x10\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x10\n"
+ "csel x20, x21, x10, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x10, x10, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "sub x11, x11, x21\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z4.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z5.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z6.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z7.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x10, 10f\n"
+ "cmp x10, #0x1\n"
+ "sub x15, x15, x10\n"
+ "beq 9f\n"
+ "cmp x10, #0x2\n"
+ "beq 8f\n"
+ "cmp x10, #0x3\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 4 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z14.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z10.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1351a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z5.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z12.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z15.s\n"
+ ".inst 0xc13e1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z14.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z11.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1371ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z7.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z13.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc1351ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z5.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1371b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z7.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1361b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z6.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z4.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1820 // fmla za.s[x8, 0], { z1.s-z4.s }, z12.s\n"
+ ".inst 0xc13f1821 // fmla za.s[x8, 1], { z1.s-z4.s }, z15.s\n"
+ "ld1w { z5.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1822 // fmla za.s[x8, 2], { z1.s-z4.s }, z14.s\n"
+ "ld1w { z6.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381840 // fmla za.s[x8, 0], { z2.s-z5.s }, z8.s\n"
+ ".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13b1841 // fmla za.s[x8, 1], { z2.s-z5.s }, z11.s\n"
+ ".inst 0xa04149ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13a1842 // fmla za.s[x8, 2], { z2.s-z5.s }, z10.s\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1860 // fmla za.s[x8, 0], { z3.s-z6.s }, z14.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13d1861 // fmla za.s[x8, 1], { z3.s-z6.s }, z13.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1862 // fmla za.s[x8, 2], { z3.s-z6.s }, z12.s\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
+ ".inst 0xc1301880 // fmla za.s[x8, 0], { z4.s-z7.s }, z0.s\n"
+ ".inst 0xa04049c0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13f1881 // fmla za.s[x8, 1], { z4.s-z7.s }, z15.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1882 // fmla za.s[x8, 2], { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc13c18a0 // fmla za.s[x8, 0], { z5.s-z8.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13118a1 // fmla za.s[x8, 1], { z5.s-z8.s }, z1.s\n"
+ ".inst 0xc13018a2 // fmla za.s[x8, 2], { z5.s-z8.s }, z0.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13d1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s\n"
+ ".inst 0xc13c1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z12.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z15.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1391aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z9.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z11.s\n"
+ ".inst 0xa14149c6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13a1aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z13.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc13c1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z5.s\n"
+ ".inst 0xc1341ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z14.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1361ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z6.s\n"
+ ".inst 0xc1391ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z9.s\n"
+ ".inst 0xc1311ae3 // fmla za.s[x8, 3], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xc13d1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z13.s\n"
+ ".inst 0xc13c1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc1341b03 // fmla za.s[x8, 3], { z24.s-z27.s }, z4.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "10:" // Unpadded: 0 priming loads
+ "cbz x15, 20f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "cmp x15, x11\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x15, x11, LT\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "cbz x21, 19f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa04149ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z6.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13d1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z13.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z12.s\n"
+ ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z4.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1321ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z2.s\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z15.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13e1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z14.s\n"
+ ".inst 0xc1381aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xc1301aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z0.s\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca0c // fclamp { z12.s-z15.s }, z16.s, z17.s\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc1371ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z7.s\n"
+ "st1w { z13.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1361ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc1351ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z5.s\n"
+ "st1w { z15.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc1341ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z4.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "bgt 11b\n"
+ "b 19f\n"
+ "12:" // Padded
+ "cbz x10, 17f\n"
+ "cmp x10, #0x1\n"
+ "sub x15, x15, x10\n"
+ "beq 16f\n"
+ "cmp x10, #0x2\n"
+ "beq 15f\n"
+ "cmp x10, #0x3\n"
+ "beq 14f\n"
+ "13:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z14.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z10.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z1.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1361ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z6.s\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "14:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z0.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z3.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1800 // fmla za.s[x8, 0], { z0.s-z3.s }, z15.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13e1801 // fmla za.s[x8, 1], { z0.s-z3.s }, z14.s\n"
+ "ld1w { z4.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13b1820 // fmla za.s[x8, 0], { z1.s-z4.s }, z11.s\n"
+ "ld1w { z5.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1821 // fmla za.s[x8, 1], { z1.s-z4.s }, z10.s\n"
+ ".inst 0xa04049c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1391840 // fmla za.s[x8, 0], { z2.s-z5.s }, z9.s\n"
+ "ld1w { z6.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1381841 // fmla za.s[x8, 1], { z2.s-z5.s }, z8.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13f1860 // fmla za.s[x8, 0], { z3.s-z6.s }, z15.s\n"
+ "ld1w { z7.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1861 // fmla za.s[x8, 1], { z3.s-z6.s }, z14.s\n"
+ ".inst 0xa14049c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13b1880 // fmla za.s[x8, 0], { z4.s-z7.s }, z11.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1331881 // fmla za.s[x8, 1], { z4.s-z7.s }, z3.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "15:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z12.s\n"
+ ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13f1a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z15.s\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z14.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z8.s\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13b1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z11.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13a1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z10.s\n"
+ ".inst 0xa14049c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1361aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z6.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1341ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa14149c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xc1321ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z2.s\n"
+ ".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1371ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xc1361ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z6.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "16:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13d1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z13.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13c1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z12.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z15.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1391a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z9.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z8.s\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13b1a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13a1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc13c1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z15.s\n"
+ ".inst 0xc13e1a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z14.s\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1381aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1301aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc1391aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z9.s\n"
+ ".inst 0xc1311aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z1.s\n"
+ ".inst 0xc13d1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z13.s\n"
+ ".inst 0xc13c1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z11.s\n"
+ ".inst 0xc13a1ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "17:" // Padded: 0 priming loads
+ "cbz x15, 20f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "sub x11, x11, #0x1\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "cmp x15, x11\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "csel x21, x15, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 19f\n"
+ "18:" // Padded: Main loop
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z0.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1301a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z0.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1331a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1321a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z2.s\n"
+ ".inst 0xa14149c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z4.s\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc13c1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc1371aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z7.s\n"
+ ".inst 0xc1361aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z6.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ "st1w { z4.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc13b1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z11.s\n"
+ "st1w { z5.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1331ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z3.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "st1w { z6.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc13f1ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z15.s\n"
+ "st1w { z7.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc13e1ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z14.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "bgt 18b\n"
+ "19:" // Main loop tail
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z6.s\n"
+ "ld1w { z7.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1371aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z7.s\n"
+ "ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1351a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z4.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1331a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z3.s\n"
+ ".inst 0xc1321a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z2.s\n"
+ ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc13b1aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z11.s\n"
+ ".inst 0xc13a1aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ "st1w { z4.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc13d1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z13.s\n"
+ "st1w { z5.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc13c1ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z12.s\n"
+ "st1w { z6.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc1331ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z3.s\n"
+ "st1w { z7.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc1321ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z2.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "20:" // Main loop skip tail
+ "cbz x11, 22f\n"
+ "21:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1b1ca00 // fclamp { z0.s-z3.s }, z16.s, z17.s\n"
+ "st1w { z0.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "st1w { z1.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z2.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z3.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 21b\n"
+ "22:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x16\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp
new file mode 100644
index 0000000000..7412c7b57c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
+class sme2_fp32_planar_5x5_s2_4rows_mla_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32_planar_5x5_s2_4rows_mla_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32_planar_5x5_s2_4rows_mla_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
new file mode 100644
index 0000000000..81ad8e5833
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
@@ -0,0 +1,1172 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0xb\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x5\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z2.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z3.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x16, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x16, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "mov z29.d, z28.d\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "orr x23, x7, x23, LSL #20\n"
+ "mov x22, #0xb\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "mov x8, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x14\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x14, x6, x20, x14\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
+ ".inst 0xc1a3c850 // fclamp { z16.s-z19.s }, z2.s, z3.s\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x13, x13, x21\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ "st1w { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x6, x5\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x16, x16, x22\n"
+ "beq 9f\n"
+ "cmp x22, #0x2\n"
+ "beq 8f\n"
+ "cmp x22, #0x3\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 4 priming loads
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z9.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z7.s\n"
+ "ld1w { z13.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1341940 // fmla za.s[x8, 0], { z10.s-z13.s }, z4.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1301aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z0.s\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1341960 // fmla za.s[x8, 0], { z11.s-z14.s }, z4.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z5.s\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f18e0 // fmla za.s[x8, 0], { z7.s-z10.s }, z15.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13f1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z15.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1371900 // fmla za.s[x8, 0], { z8.s-z11.s }, z7.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13b1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z11.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xc1341a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z4.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13019c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z0.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13719c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z7.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13a1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1381a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z8.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13619e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z6.s\n"
+ ".inst 0xa04149e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13819e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z8.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z14.s\n"
+ ".inst 0xa14149e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1371aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z7.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z7.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b18e0 // fmla za.s[x8, 0], { z7.s-z10.s }, z11.s\n"
+ ".inst 0xc13518e1 // fmla za.s[x8, 1], { z7.s-z10.s }, z5.s\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13d1900 // fmla za.s[x8, 0], { z8.s-z11.s }, z13.s\n"
+ ".inst 0xc1311901 // fmla za.s[x8, 1], { z8.s-z11.s }, z1.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z14.s\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13e1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z14.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13f1921 // fmla za.s[x8, 1], { z9.s-z12.s }, z15.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13f1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z15.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x16, #0x2\n"
+ "blt 20f\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
+ "sub x16, x16, #0x2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, #0x1\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "lsr x20, x16, #0x1\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x13\n"
+ "ld1w { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "csel x23, x20, x13, LT\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "and x16, x16, #0x1\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, x23\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "cbz x23, 19f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
+ "ld1w { z13.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "add x22, x14, %x[ld_in_row], LSL #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa14149e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z11.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13d1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z13.s\n"
+ "ld1w { z4.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1311ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1381ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z8.s\n"
+ ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13b1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z11.s\n"
+ "ld1w { z15.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc1301b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z0.s\n"
+ ".inst 0xa0414aa6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc13c1b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z12.s\n"
+ "ld1w { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xa1404aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "st1w { z8.s }, p1, [x11]\n"
+ "ld1w { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13719e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z7.s\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc13c19e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z12.s\n"
+ ".inst 0xa1404aa7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x21]\n"
+ "st1w { z9.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xa1414aa6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z14.s\n"
+ "st1w { z11.s }, p1, [x26]\n"
+ ".inst 0xc13f1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z15.s\n"
+ "ld1w { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xa0404aae // ld1w { z14.s-z15.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13f1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z15.s\n"
+ ".inst 0xa1414aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc13c1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z12.s\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xa0404aac // ld1w { z12.s-z13.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13d1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z13.s\n"
+ ".inst 0xa1414aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc13c1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z12.s\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
+ ".inst 0xa1404aa7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13f1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z15.s\n"
+ ".inst 0xa0414aaa // ld1w { z10.s-z11.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z11.s\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "bgt 11b\n"
+ "b 19f\n"
+ "12:" // Padded
+ "cbz x22, 17f\n"
+ "cmp x22, #0x1\n"
+ "sub x16, x16, x22\n"
+ "beq 16f\n"
+ "cmp x22, #0x2\n"
+ "beq 15f\n"
+ "cmp x22, #0x3\n"
+ "beq 14f\n"
+ "13:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z9.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1371ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z7.s\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1361940 // fmla za.s[x8, 0], { z10.s-z13.s }, z6.s\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1361b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z6.s\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ "14:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1351ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z5.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13f1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z15.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13f1940 // fmla za.s[x8, 0], { z10.s-z13.s }, z15.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13f1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z15.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "15:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xc1341a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1371ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xa14149e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1301a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z0.s\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13a1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1351a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z5.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1301a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z0.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "16:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z8.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ ".inst 0xc1351a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z5.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ ".inst 0xc1311900 // fmla za.s[x8, 0], { z8.s-z11.s }, z1.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13f1901 // fmla za.s[x8, 1], { z8.s-z11.s }, z15.s\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13e1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z14.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1351a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1381920 // fmla za.s[x8, 0], { z9.s-z12.s }, z8.s\n"
+ ".inst 0xc1371921 // fmla za.s[x8, 1], { z9.s-z12.s }, z7.s\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13d1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z13.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "17:" // Padded: 0 priming loads
+ "cmp x16, #0x2\n"
+ "blt 20f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "sub x16, x16, #0x2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x13, x13, #0x1\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "lsr x20, x16, #0x1\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "cmp x20, x13\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "csel x23, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "and x16, x16, #0x1\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 19f\n"
+ "18:" // Padded: Main loop
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
+ "ld1w { z15.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x22, x14, %x[ld_in_row], LSL #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13f1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z15.s\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13a1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z5.s\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
+ "ld1w { z0.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13c1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z12.s\n"
+ "ld1w { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13a1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z10.s\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc13e1b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z14.s\n"
+ "ld1w { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
+ "add x8, x8, #0x1\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1a3c858 // fclamp { z24.s-z27.s }, z2.s, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1391a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z9.s\n"
+ ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1311a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z1.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ "st1w { z25.s }, p1, [x10]\n"
+ "ld1w { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1311980 // fmla za.s[x8, 0], { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1391981 // fmla za.s[x8, 1], { z12.s-z15.s }, z9.s\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0404a8a // ld1w { z10.s-z11.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13b1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z11.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z26.s }, p1, [x27]\n"
+ ".inst 0xa1414a80 // ld1w { z0.s, z8.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "st1w { z27.s }, p1, [x26]\n"
+ ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc13919a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z9.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ "ld1w { z21.s }, p0/Z, [x22]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa0404a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "bgt 18b\n"
+ "19:" // Main loop tail
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13a1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z10.s\n"
+ ".inst 0xa14149e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1311ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z9.s\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc13c1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z12.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z1.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xa0414a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1371b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z7.s\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xa0404a84 // ld1w { z4.s-z5.s }, pn10.b/Z, [x20]\n"
+ "add x8, x8, #0x1\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "st1w { z8.s }, p1, [x11]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13f1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z15.s\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1351a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z5.s\n"
+ ".inst 0xa1414a80 // ld1w { z0.s, z8.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ "st1w { z9.s }, p1, [x10]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1381ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z8.s\n"
+ ".inst 0xc1311ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z1.s\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0404a86 // ld1w { z6.s-z7.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1391a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z9.s\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "st1w { z11.s }, p1, [x26]\n"
+ ".inst 0xa1404a84 // ld1w { z4.s, z12.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13c1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z12.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xa1414a84 // ld1w { z4.s, z12.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc13c1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z12.s\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1311a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z1.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "20:" // Main loop skip tail
+ "cbz x16, 21f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1391a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z9.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1361ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z6.s\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc13a1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z10.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
+ ".inst 0xa04049ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15]\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z0.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1371ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
+ ".inst 0xa14149e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xc13a1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z10.s\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1381a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z8.s\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1301b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z0.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a3c858 // fclamp { z24.s-z27.s }, z2.s, z3.s\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1301a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z0.s\n"
+ "st1w { z25.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc1381a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z8.s\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "21:" // Tail input: End
+ "cbz x13, 23f\n"
+ "22:" // Right padding loop
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "st1w { z8.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "st1w { z9.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z11.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "bgt 22b\n"
+ "23:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..50ef6c3815
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
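+// Strategy wrapper that exposes the SME2 kernel declared above to the planar
+// depthwise framework: fp32 input/output with bf16 dot-product accumulation,
+// producing four output rows per pass.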
+class sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..be82e04613
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
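+  // All kernel state is packed into a single struct so the inline assembly
+  // below can reach each field through compile-time offsetof() operands.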
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
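+  // A 3x3 stride-1 kernel producing four output rows reads six input rows
+  // (4 + 3 - 1), hence pad_bottom = 6 - min(6, pad_top + valid_input_rows).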
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
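+  // The assembly iterates over channel blocks (label 1), converting fp32
+  // inputs and weights to bf16 on the fly (bfcvt/bfcvtnt) and accumulating
+  // with BFDOT into ZA tiles, then clamps (fclamp) and stores each output row.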
+ __asm__ __volatile__(
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x6\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ "ld1rw { z25.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z26.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z26.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "fmov z6.s, #0x0\n"
+ "ld1w { z15.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "incb x21\n"
+ "ld1w { z29.s }, p2/Z, [x20]\n"
+ ".inst 0x648aa9e6 // bfcvtnt z6.h, p2/M, z15.s\n"
+ "incb x20, ALL, MUL #3\n"
+ "ld1w { z30.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x658aa9e5 // bfcvt z5.h, p2/M, z15.s\n"
+ "ld1w { z14.s }, p2/Z, [x20]\n"
+ ".inst 0x658aaba8 // bfcvt z8.h, p2/M, z29.s\n"
+ "fmov z11.s, #0x0\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aa9ca // bfcvt z10.h, p2/M, z14.s\n"
+ ".inst 0x648aaba5 // bfcvtnt z5.h, p2/M, z29.s\n"
+ "incb x21\n"
+ "ld1w { z24.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x648aabc8 // bfcvtnt z8.h, p2/M, z30.s\n"
+ ".inst 0x658aabcc // bfcvt z12.h, p2/M, z30.s\n"
+ "ld1w { z28.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ ".inst 0x648aa9cb // bfcvtnt z11.h, p2/M, z14.s\n"
+ "ld1w { z20.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
+ ".inst 0x648aab0a // bfcvtnt z10.h, p2/M, z24.s\n"
+ ".inst 0x658aab09 // bfcvt z9.h, p2/M, z24.s\n"
+ "ld1w { z15.s }, p2/Z, [x21]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "incb x21, ALL, MUL #3\n"
+ "fmov z14.s, #0x0\n"
+ ".inst 0x658aaa81 // bfcvt z1.h, p2/M, z20.s\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0x658aa9e7 // bfcvt z7.h, p2/M, z15.s\n"
+ ".inst 0x648aab89 // bfcvtnt z9.h, p2/M, z28.s\n"
+ "sub x20, x14, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ ".inst 0x658aab84 // bfcvt z4.h, p2/M, z28.s\n"
+ "ld1w { z29.s }, p2/Z, [x21]\n"
+ "orr x23, x16, x23, LSL #20\n"
+ "mov x22, #0x6\n"
+ "add x21, x17, x7\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "mov z27.d, z26.d\n"
+ ".inst 0x648aaa8e // bfcvtnt z14.h, p2/M, z20.s\n"
+ ".inst 0x648aa9e1 // bfcvtnt z1.h, p2/M, z15.s\n"
+ ".inst 0x648aaba7 // bfcvtnt z7.h, p2/M, z29.s\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0x658aaba2 // bfcvt z2.h, p2/M, z29.s\n"
+ "lsl x23, x23, #0x2\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x17, x13\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040b40 // mova za.d[x8, #0], { z26.d-z27.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040b41 // mova za.d[x8, #1], { z26.d-z27.d }\n"
+ "mov x10, #0x2\n"
+ "ldp x9, x28, [x22], #0x10\n"
+ ".inst 0xc0040b42 // mova za.d[x8, #2], { z26.d-z27.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0040b43 // mova za.d[x8, #3], { z26.d-z27.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "ldp x25, x24, [x22], #0x10\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x10\n"
+ "csel x20, x21, x10, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x10, x10, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060814 // mova { z20.d-z21.d }, za.d[x8, #0]\n"
+ "sub x11, x11, x21\n"
+ ".inst 0xc0060836 // mova { z22.d-z23.d }, za.d[x8, #1]\n"
+ ".inst 0xc1adcb34 // fclamp { z20.s-z23.s }, z25.s, z13.s\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z20.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z22.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z21.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z23.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x17, x7\n"
+ "bne 10f\n"
+ "cbz x10, 8f\n"
+ "cmp x10, #0x1\n"
+ "sub x14, x14, x10\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa3e // bfcvt z30.h, p2/M, z17.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab9e // bfcvtnt z30.h, p2/M, z28.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa1f // bfcvt z31.h, p2/M, z16.s\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa9ff // bfcvtnt z31.h, p2/M, z15.s\n"
+ ".inst 0xc12513d0 // bfdot za.s[x8, 0], { z30.h-z31.h }, z5.h\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa00 // bfcvt z0.h, p2/M, z16.s\n"
+ ".inst 0xc12613d1 // bfdot za.s[x8, 1], { z30.h-z31.h }, z6.h\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa9e0 // bfcvtnt z0.h, p2/M, z15.s\n"
+ ".inst 0xc12c13f0 // bfdot za.s[x8, 0], { z31.h-z0.h }, z12.h\n"
+ ".inst 0xc12813f1 // bfdot za.s[x8, 1], { z31.h-z0.h }, z8.h\n"
+ "7:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p1/Z, [x13]\n"
+ ".inst 0x658aabef // bfcvt z15.h, p2/M, z31.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa30 // bfcvtnt z16.h, p2/M, z17.s\n"
+ ".inst 0xc12a11f0 // bfdot za.s[x8, 0], { z15.h-z16.h }, z10.h\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaad1 // bfcvt z17.h, p2/M, z22.s\n"
+ ".inst 0xc12b11f1 // bfdot za.s[x8, 1], { z15.h-z16.h }, z11.h\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa51 // bfcvtnt z17.h, p2/M, z18.s\n"
+ ".inst 0xc12511f2 // bfdot za.s[x8, 2], { z15.h-z16.h }, z5.h\n"
+ ".inst 0xc12611f3 // bfdot za.s[x8, 3], { z15.h-z16.h }, z6.h\n"
+ ".inst 0xc1241210 // bfdot za.s[x8, 0], { z16.h-z17.h }, z4.h\n"
+ ".inst 0xc1291211 // bfdot za.s[x8, 1], { z16.h-z17.h }, z9.h\n"
+ ".inst 0xc12c1212 // bfdot za.s[x8, 2], { z16.h-z17.h }, z12.h\n"
+ ".inst 0xc1281213 // bfdot za.s[x8, 3], { z16.h-z17.h }, z8.h\n"
+ "8:" // Unpadded: 0 priming loads
+ "cbz x14, 16f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa817 // bfcvt z23.h, p2/M, z0.s\n"
+ "cmp x14, x11\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x14, x11, LT\n"
+ ".inst 0x648aab17 // bfcvtnt z23.h, p2/M, z24.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa818 // bfcvt z24.h, p2/M, z0.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "9:" // Unpadded: Main loop
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p1/Z, [x13]\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
+ ".inst 0x648aaa96 // bfcvtnt z22.h, p2/M, z20.s\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
+ ".inst 0x658aaa77 // bfcvt z23.h, p2/M, z19.s\n"
+ ".inst 0x658aaa38 // bfcvt z24.h, p2/M, z17.s\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
+ ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "st1w { z17.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 9b\n"
+ "b 15f\n"
+ "10:" // Padded
+ "cbz x10, 13f\n"
+ "cmp x10, #0x1\n"
+ "sub x14, x14, x10\n"
+ "beq 12f\n"
+ "11:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1251290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z5.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaaf6 // bfcvt z22.h, p2/M, z23.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ ".inst 0xc1261291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z6.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12c12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
+ ".inst 0xc12812b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z8.h\n"
+ "12:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12a1270 // bfdot za.s[x8, 0], { z19.h-z20.h }, z10.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9f5 // bfcvt z21.h, p2/M, z15.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ ".inst 0xc12b1271 // bfdot za.s[x8, 1], { z19.h-z20.h }, z11.h\n"
+ ".inst 0xc1251272 // bfdot za.s[x8, 2], { z19.h-z20.h }, z5.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1261273 // bfdot za.s[x8, 3], { z19.h-z20.h }, z6.h\n"
+ ".inst 0xc1241290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z4.h\n"
+ ".inst 0xc1291291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z9.h\n"
+ ".inst 0xc12c1292 // bfdot za.s[x8, 2], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc1281293 // bfdot za.s[x8, 3], { z20.h-z21.h }, z8.h\n"
+ "13:" // Padded: 0 priming loads
+ "cbz x14, 16f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "sub x14, x14, #0x1\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "sub x11, x11, #0x1\n"
+ "cmp x14, x11\n"
+ "csel x21, x14, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "14:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z20.s }, p0/Z, [x13]\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0x658aaa96 // bfcvt z22.h, p2/M, z20.s\n"
+ ".inst 0x648aaa76 // bfcvtnt z22.h, p2/M, z19.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
+ ".inst 0x658aaa37 // bfcvt z23.h, p2/M, z17.s\n"
+ ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "st1w { z17.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 14b\n"
+ "15:" // Main loop tail
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ "st1w { z18.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ "st1w { z17.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "16:" // Main loop skip tail
+ "cbz x11, 18f\n"
+ "17:" // Right padding loop
+ ".inst 0xc006081c // mova { z28.d-z29.d }, za.d[x8, #0]\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc006083e // mova { z30.d-z31.d }, za.d[x8, #1]\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1adcb3c // fclamp { z28.s-z31.s }, z25.s, z13.s\n"
+ "st1w { z28.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z30.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "st1w { z29.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z31.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 17b\n"
+ "18:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..e685884762
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
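+// Stride-2 variant of the planar strategy above; the kernel consumes two
+// input columns per output column.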
+class sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..a3b9ca402a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,763 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
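+  // With stride 2, four output rows span 2*(4 - 1) + 3 = 9 input rows, hence
+  // the 9u term in the pad_bottom computation below.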
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
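+  // As in the stride-1 kernel, fp32 data is converted to bf16 and accumulated
+  // via BFDOT; here the main loop additionally advances two input columns per
+  // iteration to realise the stride of 2.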
+ __asm__ __volatile__(
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x9\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ "ld1rw { z4.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z1.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z24.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z24.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "incb x21\n"
+ "ld1w { z23.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aaa4e // bfcvt z14.h, p2/M, z18.s\n"
+ "ld1w { z6.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x648aaaee // bfcvtnt z14.h, p2/M, z23.s\n"
+ "incb x21\n"
+ "ld1w { z28.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aa8c3 // bfcvt z3.h, p2/M, z6.s\n"
+ ".inst 0x658aab88 // bfcvt z8.h, p2/M, z28.s\n"
+ "ld1w { z10.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ ".inst 0x648aa948 // bfcvtnt z8.h, p2/M, z10.s\n"
+ "ld1w { z2.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ ".inst 0x658aa847 // bfcvt z7.h, p2/M, z2.s\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z9.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
+ ".inst 0x658aa920 // bfcvt z0.h, p2/M, z9.s\n"
+ "sub x20, x14, #0x1\n"
+ "ld1w { z6.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "mov z25.d, z24.d\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "orr x23, x16, x23, LSL #20\n"
+ "mov x22, #0x9\n"
+ "mov z26.d, z24.d\n"
+ "add x21, x17, x7\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "mov z27.d, z24.d\n"
+ ".inst 0x648aa8c0 // bfcvtnt z0.h, p2/M, z6.s\n"
+ ".inst 0x658aaa26 // bfcvt z6.h, p2/M, z17.s\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x17, x13\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040f00 // mova za.d[x8, #0], { z24.d-z27.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f01 // mova za.d[x8, #1], { z24.d-z27.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x26, x25, [x23], #0x10\n"
+ "ldp x24, x23, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
+ ".inst 0xc1a1c890 // fclamp { z16.s-z19.s }, z4.s, z1.s\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x11, x11, x21\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z16.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z17.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z19.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x17, x7\n"
+ "bne 10f\n"
+ "cbz x22, 8f\n"
+ "cmp x22, #0x1\n"
+ "sub x14, x14, x22\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa53 // bfcvt z19.h, p2/M, z18.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa993 // bfcvtnt z19.h, p2/M, z12.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaaf4 // bfcvt z20.h, p2/M, z23.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa854 // bfcvtnt z20.h, p2/M, z2.s\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9f5 // bfcvt z21.h, p2/M, z15.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaad5 // bfcvtnt z21.h, p2/M, z22.s\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabd6 // bfcvt z22.h, p2/M, z30.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa996 // bfcvtnt z22.h, p2/M, z12.s\n"
+ ".inst 0xc13e1270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z14.h\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ ".inst 0x658aabf7 // bfcvt z23.h, p2/M, z31.s\n"
+ ".inst 0xc1331290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z3.h\n"
+ "7:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa30 // bfcvt z16.h, p2/M, z17.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaad0 // bfcvtnt z16.h, p2/M, z22.s\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab91 // bfcvt z17.h, p2/M, z28.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa51 // bfcvtnt z17.h, p2/M, z18.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa852 // bfcvt z18.h, p2/M, z2.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa72 // bfcvtnt z18.h, p2/M, z19.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa853 // bfcvt z19.h, p2/M, z2.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaaf3 // bfcvtnt z19.h, p2/M, z23.s\n"
+ ".inst 0xc1381210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z8.h\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa954 // bfcvt z20.h, p2/M, z10.s\n"
+ ".inst 0xc1371230 // bfdot za.s[x8, 0], { z17.h-z20.h }, z7.h\n"
+ "8:" // Unpadded: 0 priming loads
+ "cmp x14, #0x2\n"
+ "blt 16f\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "lsr x20, x14, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x11\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "csel x22, x20, x11, LT\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "and x14, x14, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "sub x11, x11, x22\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "cbz x22, 15f\n"
+ "9:" // Unpadded: Main loop
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
+ ".inst 0x658aaa4a // bfcvt z10.h, p2/M, z18.s\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabcb // bfcvt z11.h, p2/M, z30.s\n"
+ ".inst 0x648aa9e9 // bfcvtnt z9.h, p2/M, z15.s\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa2a // bfcvtnt z10.h, p2/M, z17.s\n"
+ ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "ld1w { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa84c // bfcvtnt z12.h, p2/M, z2.s\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z29.s }, p1/Z, [x13]\n"
+ ".inst 0xc1381130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z8.h\n"
+ ".inst 0x658aaba9 // bfcvt z9.h, p2/M, z29.s\n"
+ "subs x22, x22, #0x1\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a1c890 // fclamp { z16.s-z19.s }, z4.s, z1.s\n"
+ "st1w { z16.s }, p1, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0xc1371150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z7.h\n"
+ "add x10, x10, x28, LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab8a // bfcvt z10.h, p2/M, z28.s\n"
+ "st1w { z17.s }, p1, [x9]\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x26]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z19.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaac9 // bfcvtnt z9.h, p2/M, z22.s\n"
+ ".inst 0x648aabea // bfcvtnt z10.h, p2/M, z31.s\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa2b // bfcvtnt z11.h, p2/M, z17.s\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x658aabed // bfcvt z13.h, p2/M, z31.s\n"
+ "bgt 9b\n"
+ "b 15f\n"
+ "10:" // Padded
+ "cbz x22, 13f\n"
+ "cmp x22, #0x1\n"
+ "sub x14, x14, x22\n"
+ "beq 12f\n"
+ "11:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ ".inst 0x658aa98a // bfcvt z10.h, p2/M, z12.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa98a // bfcvtnt z10.h, p2/M, z12.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa4b // bfcvt z11.h, p2/M, z18.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0xc13e1130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z14.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1331150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z3.h\n"
+ "12:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa70 // bfcvtnt z16.h, p2/M, z19.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9b1 // bfcvt z17.h, p2/M, z13.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa991 // bfcvtnt z17.h, p2/M, z12.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa932 // bfcvt z18.h, p2/M, z9.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aa972 // bfcvtnt z18.h, p2/M, z11.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaab3 // bfcvt z19.h, p2/M, z21.s\n"
+ ".inst 0xc13811f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z8.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1371210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z7.h\n"
+ "13:" // Padded: 0 priming loads
+ "cmp x14, #0x2\n"
+ "blt 16f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x2\n"
+ "sub x11, x11, #0x1\n"
+ "lsr x20, x14, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x21, x20, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "and x14, x14, #0x1\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "14:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa49 // bfcvt z9.h, p2/M, z18.s\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa84b // bfcvt z11.h, p2/M, z2.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa29 // bfcvtnt z9.h, p2/M, z17.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab8c // bfcvt z12.h, p2/M, z28.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa6a // bfcvtnt z10.h, p2/M, z19.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa9eb // bfcvtnt z11.h, p2/M, z15.s\n"
+ "mov x12, #0x0\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9ad // bfcvt z13.h, p2/M, z13.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a1c89c // fclamp { z28.s-z31.s }, z4.s, z1.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "add x8, x8, #0x1\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z8.h\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z7.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa2a // bfcvt z10.h, p2/M, z17.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa6b // bfcvt z11.h, p2/M, z19.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "subs x21, x21, #0x1\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "add x9, x9, x27, LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0x648aaaa9 // bfcvtnt z9.h, p2/M, z21.s\n"
+ ".inst 0x648aaa8a // bfcvtnt z10.h, p2/M, z20.s\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0x648aaa4b // bfcvtnt z11.h, p2/M, z18.s\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "bgt 14b\n"
+ "15:" // Main loop tail
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z17.s }, p0/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa32 // bfcvt z18.h, p2/M, z17.s\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa852 // bfcvtnt z18.h, p2/M, z2.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaaf3 // bfcvtnt z19.h, p2/M, z23.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0x648aa9f4 // bfcvtnt z20.h, p2/M, z15.s\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ ".inst 0xc1381250 // bfdot za.s[x8, 0], { z18.h-z21.h }, z8.h\n"
+ ".inst 0xc1a1c89c // fclamp { z28.s-z31.s }, z4.s, z1.s\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc1371270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z7.h\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "16:" // Main loop skip tail
+ "cbz x14, 17f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa850 // bfcvtnt z16.h, p2/M, z2.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa951 // bfcvt z17.h, p2/M, z10.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aabd1 // bfcvtnt z17.h, p2/M, z30.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa72 // bfcvt z18.h, p2/M, z19.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa72 // bfcvtnt z18.h, p2/M, z19.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa73 // bfcvt z19.h, p2/M, z19.s\n"
+ ".inst 0xc13011f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z0.h\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0xc1361210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z6.h\n"
+ ".inst 0xc13e11f1 // bfdot za.s[x8, 1], { z15.h-z18.h }, z14.h\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a1c888 // fclamp { z8.s-z11.s }, z4.s, z1.s\n"
+ "st1w { z8.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc1331211 // bfdot za.s[x8, 1], { z16.h-z19.h }, z3.h\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z10.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "st1w { z11.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "17:" // Tail input: End
+ "cbz x11, 19f\n"
+ "18:" // Right padding loop
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1a1c888 // fclamp { z8.s-z11.s }, z4.s, z1.s\n"
+ "st1w { z8.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "st1w { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z10.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z11.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "bgt 18b\n"
+ "19:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
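
Both generated kernels in this patch share one numeric scheme: fp32 inputs and weights are narrowed to bfloat16 (BFCVT fills the even half-word lanes of a vector, BFCVTNT the odd ones), BFDOT accumulates the resulting bf16 pairs into fp32 ZA slices, and FCLAMP applies the activation bounds before the fp32 stores. The scalar model below illustrates that per-lane arithmetic only; it is not part of the patch, and it ignores the instructions' denormal and intermediate-rounding corner cases.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Narrow fp32 to bf16 with round-to-nearest-even (NaNs not handled), as a
// per-lane model of BFCVT.
static uint16_t fp32_to_bf16(float f)
{
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    u += 0x7fffu + ((u >> 16) & 1u); // round to nearest, ties to even
    return static_cast<uint16_t>(u >> 16);
}

// Widen bf16 back to fp32; exact, since bf16 is the top half of an fp32.
static float bf16_to_fp32(uint16_t h)
{
    uint32_t u = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}

// One 32-bit lane of BFDOT: acc += a0*b0 + a1*b1 over a pair of bf16
// elements, accumulating in fp32.
static float bfdot_lane(float acc, uint16_t a0, uint16_t a1,
                        uint16_t b0, uint16_t b1)
{
    return acc + bf16_to_fp32(a0) * bf16_to_fp32(b0)
               + bf16_to_fp32(a1) * bf16_to_fp32(b1);
}

int main()
{
    // Two input samples against two weights, as one bfdot step combines them.
    float acc = bfdot_lane(0.0f,
                           fp32_to_bf16(1.5f), fp32_to_bf16(-2.0f),
                           fp32_to_bf16(0.25f), fp32_to_bf16(4.0f));
    std::printf("acc = %f\n", acc); // 1.5*0.25 + (-2.0)*4.0 = -7.625
    return 0;
}

Because bf16 keeps the full fp32 exponent range, the narrowing step only costs mantissa precision; the accumulation, clamping and stores all stay in fp32.
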
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..5215ccaf39
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
+class sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..b72042558d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,1151 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
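+ // An iteration consumes an 8-row input window (4 output rows of a 5x5,
+ // stride-1 kernel need 4 + 5 - 1 = 8 input rows); whatever part of that
+ // window is neither top padding nor valid input is bottom padding.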
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x8\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x4\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z29.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z28.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "addvl SP, SP, #-30\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "1:" // Channel loop
+ "ldr x21, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z30.s, #0x0\n"
+ "cbz x21, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "ld1w { z24.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "fmov z11.s, #0x0\n"
+ "incb x21\n"
+ "ld1w { z3.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa99a // bfcvt z26.h, p2/M, z12.s\n"
+ ".inst 0x658aab10 // bfcvt z16.h, p2/M, z24.s\n"
+ "ld1w { z20.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "addvl x24, SP, #30\n"
+ ".inst 0x648aa98b // bfcvtnt z11.h, p2/M, z12.s\n"
+ "ld1w { z25.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x658aa875 // bfcvt z21.h, p2/M, z3.s\n"
+ "addvl x24, x24, #-6\n"
+ "ld1w { z6.s }, p2/Z, [x20]\n"
+ ".inst 0x658aaa9b // bfcvt z27.h, p2/M, z20.s\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x24]\n"
+ ".inst 0x648aab1a // bfcvtnt z26.h, p2/M, z24.s\n"
+ "ld1w { z14.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "fmov z11.s, #0x0\n"
+ "st1h { z26.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x648aa870 // bfcvtnt z16.h, p2/M, z3.s\n"
+ "ld1w { z19.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa8c9 // bfcvt z9.h, p2/M, z6.s\n"
+ ".inst 0x648aaa95 // bfcvtnt z21.h, p2/M, z20.s\n"
+ "incb x21\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z16.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aab3b // bfcvtnt z27.h, p2/M, z25.s\n"
+ ".inst 0x658aab37 // bfcvt z23.h, p2/M, z25.s\n"
+ "ld1w { z5.s }, p2/Z, [x20]\n"
+ ".inst 0x658aa9c8 // bfcvt z8.h, p2/M, z14.s\n"
+ "mov x23, x21\n"
+ "st1h { z21.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x648aa8cb // bfcvtnt z11.h, p2/M, z6.s\n"
+ ".inst 0x658aaa79 // bfcvt z25.h, p2/M, z19.s\n"
+ "ld1w { z4.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ "st1h { z27.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x648aa9c9 // bfcvtnt z9.h, p2/M, z14.s\n"
+ ".inst 0x658aa991 // bfcvt z17.h, p2/M, z12.s\n"
+ "incb x21\n"
+ "st1h { z23.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ "ld1w { z26.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x24]\n"
+ "fmov z2.s, #0x0\n"
+ ".inst 0x648aaa68 // bfcvtnt z8.h, p2/M, z19.s\n"
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ ".inst 0x658aa893 // bfcvt z19.h, p2/M, z4.s\n"
+ "st1h { z8.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aa999 // bfcvtnt z25.h, p2/M, z12.s\n"
+ "ld1w { z7.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ ".inst 0x658aab4e // bfcvt z14.h, p2/M, z26.s\n"
+ ".inst 0x648aa8b1 // bfcvtnt z17.h, p2/M, z5.s\n"
+ "st1h { z25.h }, p2, [x24, #3, MUL VL]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0x658aa8ab // bfcvt z11.h, p2/M, z5.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "mov x20, x21\n"
+ ".inst 0x648aa882 // bfcvtnt z2.h, p2/M, z4.s\n"
+ ".inst 0x658aab66 // bfcvt z6.h, p2/M, z27.s\n"
+ "ld1w { z15.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z17.h }, p2, [x24, #4, MUL VL]\n"
+ "st1h { z11.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ ".inst 0x648aab53 // bfcvtnt z19.h, p2/M, z26.s\n"
+ ".inst 0x658aa8fa // bfcvt z26.h, p2/M, z7.s\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x24]\n"
+ ".inst 0x648aab6e // bfcvtnt z14.h, p2/M, z27.s\n"
+ "ld1w { z4.s }, p2/Z, [x20]\n"
+ "fmov z21.s, #0x0\n"
+ "st1h { z19.h }, p2, [x24, #1, MUL VL]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa9ea // bfcvt z10.h, p2/M, z15.s\n"
+ "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aa8e6 // bfcvtnt z6.h, p2/M, z7.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa973 // bfcvt z19.h, p2/M, z11.s\n"
+ "st1h { z6.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x648aaa5a // bfcvtnt z26.h, p2/M, z18.s\n"
+ ".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ ".inst 0x658aa897 // bfcvt z23.h, p2/M, z4.s\n"
+ ".inst 0x648aa9f5 // bfcvtnt z21.h, p2/M, z15.s\n"
+ "ld1w { z24.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aa96a // bfcvtnt z10.h, p2/M, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x648aa893 // bfcvtnt z19.h, p2/M, z4.s\n"
+ ".inst 0x658aaa30 // bfcvt z16.h, p2/M, z17.s\n"
+ "ld1w { z2.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aaa37 // bfcvtnt z23.h, p2/M, z17.s\n"
+ "ld1w { z26.s }, p2/Z, [x21]\n"
+ "st1h { z5.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ "st1h { z21.h }, p2, [x24]\n"
+ ".inst 0x648aa990 // bfcvtnt z16.h, p2/M, z12.s\n"
+ "incb x21, ALL, MUL #5\n"
+ "fmov z8.s, #0x0\n"
+ "st1h { z10.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aab04 // bfcvt z4.h, p2/M, z24.s\n"
+ ".inst 0x658aa985 // bfcvt z5.h, p2/M, z12.s\n"
+ "sub x20, x25, #0x1\n"
+ "st1h { z19.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x658aa871 // bfcvt z17.h, p2/M, z3.s\n"
+ "ld1w { z25.s }, p2/Z, [x21]\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "st1h { z23.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x658aa857 // bfcvt z23.h, p2/M, z2.s\n"
+ "orr x23, x7, x23, LSL #20\n"
+ "mov x22, #0x8\n"
+ "st1h { z16.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x658aab4e // bfcvt z14.h, p2/M, z26.s\n"
+ "add x21, x6, x4\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "st1h { z5.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ "mov z31.d, z30.d\n"
+ ".inst 0x648aab08 // bfcvtnt z8.h, p2/M, z24.s\n"
+ "st1h { z8.h }, p2, [x24]\n"
+ ".inst 0x648aa864 // bfcvtnt z4.h, p2/M, z3.s\n"
+ ".inst 0x648aa851 // bfcvtnt z17.h, p2/M, z2.s\n"
+ "mov x11, #0x0\n"
+ "st1h { z4.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x648aab57 // bfcvtnt z23.h, p2/M, z26.s\n"
+ ".inst 0x648aab2e // bfcvtnt z14.h, p2/M, z25.s\n"
+ "mov x8, #0x8\n"
+ "st1h { z17.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x658aab26 // bfcvt z6.h, p2/M, z25.s\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ "st1h { z23.h }, p2, [x24, #3, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
+ "st1h { z14.h }, p2, [x24, #4, MUL VL]\n"
+ "st1h { z6.h }, p2, [x24, #5, MUL VL]\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046bc0 // mova za.d[x11, #0], { z30.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0046bc1 // mova za.d[x11, #1], { z30.d-z31.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046bc2 // mova za.d[x11, #2], { z30.d-z31.d }\n"
+ "ldp x5, x10, [x20], #0x10\n"
+ ".inst 0xc0046bc3 // mova za.d[x11, #3], { z30.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046bc4 // mova za.d[x11, #4], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046bc5 // mova za.d[x11, #5], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0046bc6 // mova za.d[x11, #6], { z30.d-z31.d }\n"
+ ".inst 0xc0046bc7 // mova za.d[x11, #7], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba4 // fclamp { z4.s-z7.s }, z29.s, z28.s\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z4.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z6.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "st1w { z5.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z7.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x6, x4\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 9f\n"
+ "cmp x22, #0x2\n"
+ "beq 8f\n"
+ "cmp x22, #0x3\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 4 priming loads
+ "add x21, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x16]\n"
+ ".inst 0x658aaab2 // bfcvt z18.h, p2/M, z21.s\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa972 // bfcvtnt z18.h, p2/M, z11.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa33 // bfcvt z19.h, p2/M, z17.s\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa993 // bfcvtnt z19.h, p2/M, z12.s\n"
+ "ld1w { z7.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8f4 // bfcvt z20.h, p2/M, z7.s\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa994 // bfcvtnt z20.h, p2/M, z12.s\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12d7250 // bfdot za.s[x11, 0], { z18.h-z19.h }, z13.h\n"
+ "ld1w { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8d5 // bfcvt z21.h, p2/M, z6.s\n"
+ ".inst 0xc12c7251 // bfdot za.s[x11, 1], { z18.h-z19.h }, z12.h\n"
+ ".inst 0xa0412a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12b7270 // bfdot za.s[x11, 0], { z19.h-z20.h }, z11.h\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ ".inst 0xc12a7271 // bfdot za.s[x11, 1], { z19.h-z20.h }, z10.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12b7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z11.h\n"
+ ".inst 0xc12a7291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z10.h\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z6.s }, p1/Z, [x16]\n"
+ ".inst 0x658aa8d7 // bfcvt z23.h, p2/M, z6.s\n"
+ "addvl x21, SP, #18\n"
+ "ld1w { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa837 // bfcvtnt z23.h, p2/M, z1.s\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z15.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9f8 // bfcvt z24.h, p2/M, z15.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "ld1w { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa839 // bfcvt z25.h, p2/M, z1.s\n"
+ "ld1w { z9.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa939 // bfcvtnt z25.h, p2/M, z9.s\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12972f0 // bfdot za.s[x11, 0], { z23.h-z24.h }, z9.h\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa1a // bfcvt z26.h, p2/M, z16.s\n"
+ ".inst 0xc12172f1 // bfdot za.s[x11, 1], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72f2 // bfdot za.s[x11, 2], { z23.h-z24.h }, z15.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12772f3 // bfdot za.s[x11, 3], { z23.h-z24.h }, z7.h\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aaa1a // bfcvtnt z26.h, p2/M, z16.s\n"
+ ".inst 0xc1297310 // bfdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
+ ".inst 0xc1217311 // bfdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7312 // bfdot za.s[x11, 2], { z24.h-z25.h }, z15.h\n"
+ ".inst 0xc1277313 // bfdot za.s[x11, 3], { z24.h-z25.h }, z7.h\n"
+ ".inst 0xc12b7330 // bfdot za.s[x11, 0], { z25.h-z26.h }, z11.h\n"
+ ".inst 0xc1237331 // bfdot za.s[x11, 1], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237332 // bfdot za.s[x11, 2], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1227333 // bfdot za.s[x11, 3], { z25.h-z26.h }, z2.h\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x16]\n"
+ ".inst 0x658aab02 // bfcvt z2.h, p2/M, z24.s\n"
+ "addvl x22, SP, #12\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa02 // bfcvtnt z2.h, p2/M, z16.s\n"
+ "addvl x21, SP, #18\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa03 // bfcvt z3.h, p2/M, z16.s\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa03 // bfcvtnt z3.h, p2/M, z16.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa824 // bfcvt z4.h, p2/M, z1.s\n"
+ "ld1w { z19.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa64 // bfcvtnt z4.h, p2/M, z19.s\n"
+ ".inst 0xa1402ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12f7050 // bfdot za.s[x11, 0], { z2.h-z3.h }, z15.h\n"
+ "ld1w { z0.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa805 // bfcvt z5.h, p2/M, z0.s\n"
+ ".inst 0xc1277051 // bfdot za.s[x11, 1], { z2.h-z3.h }, z7.h\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7052 // bfdot za.s[x11, 2], { z2.h-z3.h }, z15.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1277053 // bfdot za.s[x11, 3], { z2.h-z3.h }, z7.h\n"
+ "ld1w { z10.s }, p1/Z, [x23]\n"
+ ".inst 0x648aa945 // bfcvtnt z5.h, p2/M, z10.s\n"
+ ".inst 0xc12e7070 // bfdot za.s[x11, 0], { z3.h-z4.h }, z14.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1267071 // bfdot za.s[x11, 1], { z3.h-z4.h }, z6.h\n"
+ ".inst 0xa0412aac // ld1h { z12.h-z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12f7054 // bfdot za.s[x11, 4], { z2.h-z3.h }, z15.h\n"
+ ".inst 0xa1422ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1277055 // bfdot za.s[x11, 5], { z2.h-z3.h }, z7.h\n"
+ ".inst 0xc12d7072 // bfdot za.s[x11, 2], { z3.h-z4.h }, z13.h\n"
+ ".inst 0xc12c7073 // bfdot za.s[x11, 3], { z3.h-z4.h }, z12.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1287090 // bfdot za.s[x11, 0], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xc1207091 // bfdot za.s[x11, 1], { z4.h-z5.h }, z0.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7074 // bfdot za.s[x11, 4], { z3.h-z4.h }, z15.h\n"
+ ".inst 0xc12e7075 // bfdot za.s[x11, 5], { z3.h-z4.h }, z14.h\n"
+ ".inst 0xc1277092 // bfdot za.s[x11, 2], { z4.h-z5.h }, z7.h\n"
+ ".inst 0xc1267093 // bfdot za.s[x11, 3], { z4.h-z5.h }, z6.h\n"
+ ".inst 0xa1422a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1287094 // bfdot za.s[x11, 4], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xc1207095 // bfdot za.s[x11, 5], { z4.h-z5.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x16]\n"
+ ".inst 0x658aaa4c // bfcvt z12.h, p2/M, z18.s\n"
+ "addvl x23, SP, #6\n"
+ "ld1w { z7.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa8ec // bfcvtnt z12.h, p2/M, z7.s\n"
+ "addvl x22, SP, #12\n"
+ "ld1w { z20.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa8d // bfcvt z13.h, p2/M, z20.s\n"
+ "addvl x21, SP, #18\n"
+ "ld1w { z0.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa80d // bfcvtnt z13.h, p2/M, z0.s\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z10.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa94e // bfcvt z14.h, p2/M, z10.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z0.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa80e // bfcvtnt z14.h, p2/M, z0.s\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1217190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z1.h\n"
+ "ld1w { z17.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
+ ".inst 0xc1207191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z0.h\n"
+ ".inst 0xa0402aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12b7192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z11.h\n"
+ ".inst 0xa0412ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12a7193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z10.h\n"
+ "ld1w { z18.s }, p1/Z, [x24]\n"
+ ".inst 0x648aaa4f // bfcvtnt z15.h, p2/M, z18.s\n"
+ ".inst 0xc12171b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12071b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12a7194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z10.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1227195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z2.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12b71d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z11.h\n"
+ ".inst 0xc12a71d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z10.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1297196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z9.h\n"
+ ".inst 0xc1287197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z8.h\n"
+ ".inst 0xc12171b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc12071b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12a71d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z10.h\n"
+ ".inst 0xc12271d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12b71b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z11.h\n"
+ ".inst 0xc12371b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z3.h\n"
+ ".inst 0xc12771d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z7.h\n"
+ ".inst 0xc12671d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z6.h\n"
+ ".inst 0xa0422a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12771d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z7.h\n"
+ ".inst 0xc12671d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z6.h\n"
+ "10:" // Unpadded: 0 priming loads
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 20f\n"
+ "add x20, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x16]\n"
+ ".inst 0x658aa834 // bfcvt z20.h, p2/M, z1.s\n"
+ "sub x25, x25, #0x1\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x15, x15, #0x1\n"
+ ".inst 0x648aa954 // bfcvtnt z20.h, p2/M, z10.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "cmp x25, x15\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x25, x25, x15, LT\n"
+ ".inst 0x648aaa75 // bfcvtnt z21.h, p2/M, z19.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaaf6 // bfcvt z22.h, p2/M, z23.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
+ "sub x15, x15, x25\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9f7 // bfcvt z23.h, p2/M, z15.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "cbz x25, 19f\n"
+ "11:" // Unpadded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "addvl x23, SP, #12\n"
+ "ld1w { z27.s }, p1/Z, [x16]\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
+ ".inst 0xc1297292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z9.h\n"
+ "add x20, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1217293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z1.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc12d7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1257297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12e72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z14.h\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc12672b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12f72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12772d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z7.h\n"
+ ".inst 0xa1422ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12e72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z14.h\n"
+ ".inst 0xc12672b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12f72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12772d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z7.h\n"
+ ".inst 0xa0422ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12f72d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12e72d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12c1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc1241291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z4.h\n"
+ ".inst 0x658aab74 // bfcvt z20.h, p2/M, z27.s\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc12d12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z13.h\n"
+ ".inst 0x648aab54 // bfcvtnt z20.h, p2/M, z26.s\n"
+ ".inst 0xc12512b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0x658aab35 // bfcvt z21.h, p2/M, z25.s\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc12912d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z9.h\n"
+ ".inst 0x648aab15 // bfcvtnt z21.h, p2/M, z24.s\n"
+ ".inst 0xc12112d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z1.h\n"
+ ".inst 0x658aaa76 // bfcvt z22.h, p2/M, z19.s\n"
+ ".inst 0x658aaa37 // bfcvt z23.h, p2/M, z17.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0x648aaa56 // bfcvtnt z22.h, p2/M, z18.s\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba8 // fclamp { z8.s-z11.s }, z29.s, z28.s\n"
+ "st1w { z8.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "st1w { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "bgt 11b\n"
+ "b 19f\n"
+ "12:" // Padded
+ "cbz x22, 17f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 16f\n"
+ "cmp x22, #0x2\n"
+ "beq 15f\n"
+ "cmp x22, #0x3\n"
+ "beq 14f\n"
+ "13:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa06 // bfcvt z6.h, p2/M, z16.s\n"
+ "add x21, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa06 // bfcvtnt z6.h, p2/M, z16.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x658aaa07 // bfcvt z7.h, p2/M, z16.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa07 // bfcvtnt z7.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa08 // bfcvt z8.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f70d0 // bfdot za.s[x11, 0], { z6.h-z7.h }, z15.h\n"
+ "ld1w { z9.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aa929 // bfcvt z9.h, p2/M, z9.s\n"
+ ".inst 0xc12e70d1 // bfdot za.s[x11, 1], { z6.h-z7.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ ".inst 0xc12f70f0 // bfdot za.s[x11, 0], { z7.h-z8.h }, z15.h\n"
+ ".inst 0xc12e70f1 // bfdot za.s[x11, 1], { z7.h-z8.h }, z14.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237110 // bfdot za.s[x11, 0], { z8.h-z9.h }, z3.h\n"
+ ".inst 0xc1227111 // bfdot za.s[x11, 1], { z8.h-z9.h }, z2.h\n"
+ "14:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x22, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "addvl x21, SP, #18\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7130 // bfdot za.s[x11, 0], { z9.h-z10.h }, z15.h\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "addvl x20, SP, #24\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1277131 // bfdot za.s[x11, 1], { z9.h-z10.h }, z7.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e7132 // bfdot za.s[x11, 2], { z9.h-z10.h }, z14.h\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc1267133 // bfdot za.s[x11, 3], { z9.h-z10.h }, z6.h\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0xc12f7150 // bfdot za.s[x11, 0], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xa1422aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc1277151 // bfdot za.s[x11, 1], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12f7152 // bfdot za.s[x11, 2], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc12e7153 // bfdot za.s[x11, 3], { z10.h-z11.h }, z14.h\n"
+ ".inst 0xc12d7170 // bfdot za.s[x11, 0], { z11.h-z12.h }, z13.h\n"
+ ".inst 0xc1257171 // bfdot za.s[x11, 1], { z11.h-z12.h }, z5.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12f7172 // bfdot za.s[x11, 2], { z11.h-z12.h }, z15.h\n"
+ ".inst 0xc12e7173 // bfdot za.s[x11, 3], { z11.h-z12.h }, z14.h\n"
+ "15:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
+ "add x23, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "addvl x22, SP, #12\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1297250 // bfdot za.s[x11, 0], { z18.h-z19.h }, z9.h\n"
+ "ld1w { z26.s }, p0/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab55 // bfcvt z21.h, p2/M, z26.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1217251 // bfdot za.s[x11, 1], { z18.h-z19.h }, z1.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12e7252 // bfdot za.s[x11, 2], { z18.h-z19.h }, z14.h\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0xc1267253 // bfdot za.s[x11, 3], { z18.h-z19.h }, z6.h\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f7270 // bfdot za.s[x11, 0], { z19.h-z20.h }, z15.h\n"
+ ".inst 0xc1277271 // bfdot za.s[x11, 1], { z19.h-z20.h }, z7.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa1422ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12d7254 // bfdot za.s[x11, 4], { z18.h-z19.h }, z13.h\n"
+ ".inst 0xc1257255 // bfdot za.s[x11, 5], { z18.h-z19.h }, z5.h\n"
+ ".inst 0xc12e7272 // bfdot za.s[x11, 2], { z19.h-z20.h }, z14.h\n"
+ ".inst 0xc1267273 // bfdot za.s[x11, 3], { z19.h-z20.h }, z6.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12f7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z15.h\n"
+ ".inst 0xc1277291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z7.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12d7274 // bfdot za.s[x11, 4], { z19.h-z20.h }, z13.h\n"
+ ".inst 0xc1257275 // bfdot za.s[x11, 5], { z19.h-z20.h }, z5.h\n"
+ ".inst 0xc12f7292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z15.h\n"
+ ".inst 0xc12e7293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z3.h\n"
+ ".inst 0xc1227295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z2.h\n"
+ "16:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x24, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "addvl x23, SP, #6\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa1402ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc12f7130 // bfdot za.s[x11, 0], { z9.h-z10.h }, z15.h\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ "addvl x22, SP, #12\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1277131 // bfdot za.s[x11, 1], { z9.h-z10.h }, z7.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12e7132 // bfdot za.s[x11, 2], { z9.h-z10.h }, z14.h\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0xc1267133 // bfdot za.s[x11, 3], { z9.h-z10.h }, z6.h\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7150 // bfdot za.s[x11, 0], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277151 // bfdot za.s[x11, 1], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12d7134 // bfdot za.s[x11, 4], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc1257135 // bfdot za.s[x11, 5], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f7152 // bfdot za.s[x11, 2], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277153 // bfdot za.s[x11, 3], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e7170 // bfdot za.s[x11, 0], { z11.h-z12.h }, z14.h\n"
+ ".inst 0xc1267171 // bfdot za.s[x11, 1], { z11.h-z12.h }, z6.h\n"
+ ".inst 0xa1422ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12d7136 // bfdot za.s[x11, 6], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc1257137 // bfdot za.s[x11, 7], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xc12f7154 // bfdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277155 // bfdot za.s[x11, 5], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12e7172 // bfdot za.s[x11, 2], { z11.h-z12.h }, z14.h\n"
+ ".inst 0xc1267173 // bfdot za.s[x11, 3], { z11.h-z12.h }, z6.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7156 // bfdot za.s[x11, 6], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277157 // bfdot za.s[x11, 7], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xc1297174 // bfdot za.s[x11, 4], { z11.h-z12.h }, z9.h\n"
+ ".inst 0xc1217175 // bfdot za.s[x11, 5], { z11.h-z12.h }, z1.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1217176 // bfdot za.s[x11, 6], { z11.h-z12.h }, z1.h\n"
+ ".inst 0xc1207177 // bfdot za.s[x11, 7], { z11.h-z12.h }, z0.h\n"
+ "17:" // Padded: 0 priming loads
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 20f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "add x20, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "sub x25, x25, #0x1\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 19f\n"
+ "18:" // Padded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402b02 // ld1h { z2.h-z3.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1237292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z3.h\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ "add x22, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1227293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z2.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ "ld1w { z19.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
+ "ld1w { z27.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
+ "ld1w { z10.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422b04 // ld1h { z4.h, z12.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc12e7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z8.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1267297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z15.h\n"
+ "ld1w { z11.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc12772b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z12.h\n"
+ "ld1w { z18.s }, p0/Z, [x22]\n"
+ ".inst 0xc12472d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422ae4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12f72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12e72b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z14.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z12.h\n"
+ ".inst 0xc12472d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12172d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc12072d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12d1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1251291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z5.h\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc12f12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z15.h\n"
+ ".inst 0x648aaa74 // bfcvtnt z20.h, p2/M, z19.s\n"
+ ".inst 0xc12712b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z7.h\n"
+ ".inst 0x658aaa35 // bfcvt z21.h, p2/M, z17.s\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ ".inst 0xc12012d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0x658aa956 // bfcvt z22.h, p2/M, z10.s\n"
+ ".inst 0x658aa977 // bfcvt z23.h, p2/M, z11.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0x648aa916 // bfcvtnt z22.h, p2/M, z8.s\n"
+ ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba0 // fclamp { z0.s-z3.s }, z29.s, z28.s\n"
+ "st1w { z0.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z2.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
+ "st1w { z3.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "bgt 18b\n"
+ "19:" // Main loop tail
+ "addvl x23, SP, #6\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc1217292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ ".inst 0xc1207293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12d7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1257297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12672d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1422ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12f72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12e72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12672d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12b72d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc12372d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z3.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12d1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1251291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xc12f12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12712b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc12312d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z3.h\n"
+ ".inst 0xc12212d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z2.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccbb4 // fclamp { z20.s-z23.s }, z29.s, z28.s\n"
+ "st1w { z20.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z22.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "st1w { z23.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "20:" // Main loop skip tail
+ "cbz x15, 22f\n"
+ "21:" // Right padding loop
+ ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
+ "add x8, x8, #0x2\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba0 // fclamp { z0.s-z3.s }, z29.s, z28.s\n"
+ "st1w { z0.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z2.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "st1w { z3.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "bgt 21b\n"
+ "22:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #30\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..53e596418b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
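+// Entry point of the generated SME2 kernel. The ld_in_* arguments are element
+// strides (between rows, columns and vector-length blocks of channels in the
+// input); the pad_*/valid_* arguments describe the padded input window, and
+// act_min/act_max give the clamp range applied to outputs before storing.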
+void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
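+// Strategy wrapper exposing the kernel above to the planar depthwise
+// framework: a 5x5 kernel at stride 2 producing 4 output rows per pass, with
+// fp32 I/O and bf16 BFDOT accumulation in SME ZA (hence "fp32bf16fp32").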
+class sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..3a56e69d26
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,1264 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
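+  // Producing 4 output rows at stride 2 with a 5-row kernel consumes
+  // (4 - 1) * 2 + 5 = 11 input rows; whatever part of that 11-row window is
+  // not covered by top padding or valid input rows must be bottom padding.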
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
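+  // The generated body below enters streaming mode (SMSTART ZA), converts the
+  // fp32 weights and inputs to bf16 with BFCVT/BFCVTNT, accumulates with
+  // BFDOT into the ZA array, and finally clamps (FCLAMP) and stores the four
+  // output rows. A rough scalar sketch of the per-channel arithmetic,
+  // ignoring padding, where bf16() stands for the fp32 -> BFloat16 narrowing
+  // performed by BFCVT:
+  //
+  //   float acc = bias ? bias[c] : 0.0f;
+  //   for (unsigned kr = 0; kr < 5; kr++)
+  //     for (unsigned kc = 0; kc < 5; kc++)
+  //       acc += bf16(in[2 * r + kr][2 * c + kc]) * bf16(w[kr][kc]);
+  //   out[r][c] = std::min(std::max(acc, act_min), act_max);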
+ __asm__ __volatile__(
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0xb\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x3\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z12.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x4\n"
+ "addvl SP, SP, #-15\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z16.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z16.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "ld1w { z8.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aabef // bfcvt z15.h, p2/M, z31.s\n"
+ "incb x21\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aaa4e // bfcvt z14.h, p2/M, z18.s\n"
+ "addvl x24, SP, #15\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aa90f // bfcvtnt z15.h, p2/M, z8.s\n"
+ "addvl x24, x24, #-3\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z15.h }, p2, [x24]\n"
+ ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aabb5 // bfcvt z21.h, p2/M, z29.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aaa58 // bfcvt z24.h, p2/M, z18.s\n"
+ "ld1w { z26.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aab41 // bfcvt z1.h, p2/M, z26.s\n"
+ ".inst 0x648aaa35 // bfcvtnt z21.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z24.h }, p2, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #-3\n"
+ "ld1w { z9.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z21.h }, p2, [x24]\n"
+ ".inst 0x648aaa21 // bfcvtnt z1.h, p2/M, z17.s\n"
+ "ld1w { z3.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "incb x21\n"
+ ".inst 0x658aa864 // bfcvt z4.h, p2/M, z3.s\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa92b // bfcvt z11.h, p2/M, z9.s\n"
+ "st1h { z1.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aaa46 // bfcvt z6.h, p2/M, z18.s\n"
+ "st1h { z11.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "addvl x24, x24, #-3\n"
+ ".inst 0x648aabe4 // bfcvtnt z4.h, p2/M, z31.s\n"
+ "ld1w { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z4.h }, p2, [x24]\n"
+ ".inst 0x648aa8a6 // bfcvtnt z6.h, p2/M, z5.s\n"
+ "ld1w { z9.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa938 // bfcvt z24.h, p2/M, z9.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
+ "st1h { z6.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aaa38 // bfcvtnt z24.h, p2/M, z17.s\n"
+ ".inst 0x658aabf9 // bfcvt z25.h, p2/M, z31.s\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "st1h { z21.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ "addvl x24, x24, #-3\n"
+ "st1h { z24.h }, p2, [x24]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aaa59 // bfcvtnt z25.h, p2/M, z18.s\n"
+ "st1h { z25.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x658aaa29 // bfcvt z9.h, p2/M, z17.s\n"
+ ".inst 0x658aa976 // bfcvt z22.h, p2/M, z11.s\n"
+ "ld1w { z28.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x658aab85 // bfcvt z5.h, p2/M, z28.s\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z25.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "sub x20, x7, #0x1\n"
+ "st1h { z22.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x21]\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "addvl x24, x24, #-3\n"
+ "mov z17.d, z16.d\n"
+ "orr x23, x5, x23, LSL #20\n"
+ "mov x22, #0xb\n"
+ "mov z18.d, z16.d\n"
+ "mov z19.d, z16.d\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ ".inst 0x648aa909 // bfcvtnt z9.h, p2/M, z8.s\n"
+ "st1h { z9.h }, p2, [x24]\n"
+ ".inst 0x648aab25 // bfcvtnt z5.h, p2/M, z25.s\n"
+ "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aa97b // bfcvt z27.h, p2/M, z11.s\n"
+ "mov x8, #0x0\n"
+ "st1h { z27.h }, p2, [x24, #2, MUL VL]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x17, x4, x20, x17\n"
+ ".inst 0xc0040e00 // mova za.d[x8, #0], { z16.d-z19.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040e01 // mova za.d[x8, #1], { z16.d-z19.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ldp x13, x11, [x20], #0x10\n"
+ ".inst 0xc0040e03 // mova za.d[x8, #3], { z16.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x16, x16, x21\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ "st1w { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x4, x3\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 9f\n"
+ "cmp x22, #0x2\n"
+ "beq 8f\n"
+ "cmp x22, #0x3\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 4 priming loads
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p1/Z, [x17]\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
+ "addvl x20, SP, #12\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa936 // bfcvtnt z22.h, p2/M, z9.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab97 // bfcvt z23.h, p2/M, z28.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa97 // bfcvtnt z23.h, p2/M, z20.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa98 // bfcvt z24.h, p2/M, z20.s\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aabb8 // bfcvtnt z24.h, p2/M, z29.s\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabd9 // bfcvt z25.h, p2/M, z30.s\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa939 // bfcvtnt z25.h, p2/M, z9.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab5a // bfcvt z26.h, p2/M, z26.s\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13312d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa93a // bfcvtnt z26.h, p2/M, z9.s\n"
+ ".inst 0xc13b12f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z11.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa93b // bfcvt z27.h, p2/M, z9.s\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1391310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z9.h\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab7d // bfcvt z29.h, p2/M, z27.s\n"
+ "addvl x20, SP, #9\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab5d // bfcvtnt z29.h, p2/M, z26.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa93e // bfcvt z30.h, p2/M, z9.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa9e // bfcvtnt z30.h, p2/M, z20.s\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab3f // bfcvt z31.h, p2/M, z25.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab5f // bfcvtnt z31.h, p2/M, z26.s\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab60 // bfcvt z0.h, p2/M, z27.s\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa920 // bfcvtnt z0.h, p2/M, z9.s\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaae1 // bfcvt z1.h, p2/M, z23.s\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13413b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z4.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa921 // bfcvtnt z1.h, p2/M, z9.s\n"
+ ".inst 0xc13513d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z5.h\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaba2 // bfcvt z2.h, p2/M, z29.s\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc13913f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
+ "addvl x21, SP, #6\n"
+ "ld1w { z21.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaaba // bfcvtnt z26.h, p2/M, z21.s\n"
+ "addvl x20, SP, #12\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab3b // bfcvt z27.h, p2/M, z25.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa89b // bfcvtnt z27.h, p2/M, z4.s\n"
+ "ld1w { z10.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa95c // bfcvt z28.h, p2/M, z10.s\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa89c // bfcvtnt z28.h, p2/M, z4.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8bd // bfcvt z29.h, p2/M, z5.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa8bd // bfcvtnt z29.h, p2/M, z5.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8be // bfcvt z30.h, p2/M, z5.s\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13e1350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z14.h\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa8be // bfcvtnt z30.h, p2/M, z5.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z15.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1381351 // bfdot za.s[x8, 1], { z26.h-z29.h }, z8.h\n"
+ "ld1w { z23.s }, p1/Z, [x22]\n"
+ ".inst 0x658aaaff // bfcvt z31.h, p2/M, z23.s\n"
+ ".inst 0xc1391371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1301390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
+ "addvl x21, SP, #3\n"
+ "ld1w { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab17 // bfcvtnt z23.h, p2/M, z24.s\n"
+ "addvl x20, SP, #9\n"
+ "ld1w { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabf8 // bfcvt z24.h, p2/M, z31.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z6.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa8d8 // bfcvtnt z24.h, p2/M, z6.s\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab99 // bfcvt z25.h, p2/M, z28.s\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab9a // bfcvt z26.h, p2/M, z28.s\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa89a // bfcvtnt z26.h, p2/M, z4.s\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa9b // bfcvt z27.h, p2/M, z20.s\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13012f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
+ ".inst 0x648aaa9b // bfcvtnt z27.h, p2/M, z20.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13212f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1w { z11.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa97c // bfcvt z28.h, p2/M, z11.s\n"
+ ".inst 0xc1331311 // bfdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1341330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z4.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301331 // bfdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 20f\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
+ "sub x7, x7, #0x2\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0x648aab55 // bfcvtnt z21.h, p2/M, z26.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab56 // bfcvt z22.h, p2/M, z26.s\n"
+ "lsr x20, x7, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x16\n"
+ ".inst 0x648aab56 // bfcvtnt z22.h, p2/M, z26.s\n"
+ "ld1w { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa917 // bfcvt z23.h, p2/M, z8.s\n"
+ "csel x26, x20, x16, LT\n"
+ "ld1w { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa857 // bfcvtnt z23.h, p2/M, z2.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8d8 // bfcvt z24.h, p2/M, z6.s\n"
+ "and x7, x7, #0x1\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
+ "sub x16, x16, x26\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
+ "cbz x26, 19f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ "ld1w { z14.s }, p1/Z, [x17]\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa1402b20 // ld1h { z0.h, z8.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row], LSL #2\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z27.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13812d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z8.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "addvl x21, SP, #9\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z2.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9d5 // bfcvt z21.h, p2/M, z14.s\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ "subs x26, x26, #0x1\n"
+ "ld1w { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13812d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z8.h\n"
+ ".inst 0x658aa856 // bfcvt z22.h, p2/M, z2.s\n"
+ "ld1w { z7.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b12f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z11.h\n"
+ ".inst 0x648aa9d6 // bfcvtnt z22.h, p2/M, z14.s\n"
+ "ld1w { z31.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xc1acc9a8 // fclamp { z8.s-z11.s }, z13.s, z12.s\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa8f7 // bfcvt z23.h, p2/M, z7.s\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab58 // bfcvt z24.h, p2/M, z26.s\n"
+ ".inst 0x648aabf7 // bfcvtnt z23.h, p2/M, z31.s\n"
+ "ld1w { z2.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa858 // bfcvtnt z24.h, p2/M, z2.s\n"
+ "st1w { z8.s }, p1, [x15]\n"
+ "ld1w { z0.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa819 // bfcvt z25.h, p2/M, z0.s\n"
+ "add x15, x15, x13, LSL #2\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc13212b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z2.h\n"
+ "st1w { z9.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13312d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13112b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z1.h\n"
+ "st1w { z10.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
+ ".inst 0x658aab5a // bfcvt z26.h, p2/M, z26.s\n"
+ ".inst 0xc13912d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z9.h\n"
+ "ld1w { z31.s }, p1/Z, [x17]\n"
+ ".inst 0x658aabf5 // bfcvt z21.h, p2/M, z31.s\n"
+ "st1w { z11.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ ".inst 0x648aabd5 // bfcvtnt z21.h, p2/M, z30.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa836 // bfcvtnt z22.h, p2/M, z1.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc13212f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13412f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z4.h\n"
+ ".inst 0x658aa977 // bfcvt z23.h, p2/M, z11.s\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9d8 // bfcvt z24.h, p2/M, z14.s\n"
+ ".inst 0x658aabb9 // bfcvt z25.h, p2/M, z29.s\n"
+ "ld1w { z5.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab97 // bfcvtnt z23.h, p2/M, z28.s\n"
+ ".inst 0x648aab78 // bfcvtnt z24.h, p2/M, z27.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa8b9 // bfcvtnt z25.h, p2/M, z5.s\n"
+ ".inst 0x658aa97a // bfcvt z26.h, p2/M, z11.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "bgt 11b\n"
+ "b 19f\n"
+ "12:" // Padded
+ "cbz x22, 17f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 16f\n"
+ "cmp x22, #0x2\n"
+ "beq 15f\n"
+ "cmp x22, #0x3\n"
+ "beq 14f\n"
+ "13:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z1.s }, p0/Z, [x17]\n"
+ ".inst 0x658aa837 // bfcvt z23.h, p2/M, z1.s\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ ".inst 0x648aabb7 // bfcvtnt z23.h, p2/M, z29.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
+ ".inst 0x658aabd8 // bfcvt z24.h, p2/M, z30.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z15.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa99 // bfcvtnt z25.h, p2/M, z20.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z10.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa95a // bfcvt z26.h, p2/M, z10.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z8.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa91a // bfcvtnt z26.h, p2/M, z8.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab9b // bfcvt z27.h, p2/M, z28.s\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13112f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab9b // bfcvtnt z27.h, p2/M, z28.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
+ ".inst 0x658aa81c // bfcvt z28.h, p2/M, z0.s\n"
+ ".inst 0xc1391310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1301330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
+ "14:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z21.s }, p0/Z, [x17]\n"
+ ".inst 0x658aaab4 // bfcvt z20.h, p2/M, z21.s\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x648aab74 // bfcvtnt z20.h, p2/M, z27.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabb6 // bfcvt z22.h, p2/M, z29.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z8.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa917 // bfcvtnt z23.h, p2/M, z8.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab98 // bfcvt z24.h, p2/M, z28.s\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1311290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z1.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa818 // bfcvtnt z24.h, p2/M, z0.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z1.s }, p0/Z, [x21]\n"
+ ".inst 0x658aa839 // bfcvt z25.h, p2/M, z1.s\n"
+ ".inst 0xc13912b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13012d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
+ "15:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z6.s }, p0/Z, [x17]\n"
+ ".inst 0x658aa8da // bfcvt z26.h, p2/M, z6.s\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
+ ".inst 0x648aabba // bfcvtnt z26.h, p2/M, z29.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab9b // bfcvt z27.h, p2/M, z28.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa9db // bfcvtnt z27.h, p2/M, z14.s\n"
+ "mov x12, #0x4\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab1c // bfcvt z28.h, p2/M, z24.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa83c // bfcvtnt z28.h, p2/M, z1.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa87d // bfcvt z29.h, p2/M, z3.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z0.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa81d // bfcvtnt z29.h, p2/M, z0.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab1e // bfcvt z30.h, p2/M, z24.s\n"
+ "addvl x21, SP, #6\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1311350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z1.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaafe // bfcvtnt z30.h, p2/M, z23.s\n"
+ "addvl x20, SP, #12\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1391370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z31.s }, p0/Z, [x22]\n"
+ ".inst 0xc1301351 // bfdot za.s[x8, 1], { z26.h-z29.h }, z0.h\n"
+ ".inst 0x658aabff // bfcvt z31.h, p2/M, z31.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1311371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z1.h\n"
+ ".inst 0xc1301390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "16:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x17]\n"
+ ".inst 0x658aaad5 // bfcvt z21.h, p2/M, z22.s\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa875 // bfcvtnt z21.h, p2/M, z3.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ ".inst 0x658aaa96 // bfcvt z22.h, p2/M, z20.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab36 // bfcvtnt z22.h, p2/M, z25.s\n"
+ "mov x12, #0x4\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab17 // bfcvt z23.h, p2/M, z24.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z0.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa817 // bfcvtnt z23.h, p2/M, z0.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z7.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8f8 // bfcvt z24.h, p2/M, z7.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z28.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z6.s }, p0/Z, [x22]\n"
+ ".inst 0x658aa8d9 // bfcvt z25.h, p2/M, z6.s\n"
+ "addvl x21, SP, #3\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13112b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z1.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z6.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa8d9 // bfcvtnt z25.h, p2/M, z6.s\n"
+ "addvl x20, SP, #9\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc13912d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x658aa87a // bfcvt z26.h, p2/M, z3.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xc13012f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "17:" // Padded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 20f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x17]\n"
+ ".inst 0x658aab35 // bfcvt z21.h, p2/M, z25.s\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab76 // bfcvt z22.h, p2/M, z27.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab37 // bfcvtnt z23.h, p2/M, z25.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab58 // bfcvt z24.h, p2/M, z26.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab78 // bfcvtnt z24.h, p2/M, z27.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "csel x24, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x24\n"
+ "cbz x24, 19f\n"
+ "18:" // Padded: Main loop
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
+ "addvl x23, SP, #6\n"
+ "addvl x21, SP, #12\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z9.s }, p0/Z, [x17]\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x22, SP, #3\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
+ "mov x12, #0x4\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc13112d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z1.h\n"
+ ".inst 0x658aa921 // bfcvt z1.h, p2/M, z9.s\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x658aab62 // bfcvt z2.h, p2/M, z27.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0x648aa9c1 // bfcvtnt z1.h, p2/M, z14.s\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa923 // bfcvt z3.h, p2/M, z9.s\n"
+ "addvl x21, SP, #9\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aa924 // bfcvt z4.h, p2/M, z9.s\n"
+ "mov x12, #0x8\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa9e2 // bfcvtnt z2.h, p2/M, z15.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab63 // bfcvtnt z3.h, p2/M, z27.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x648aab04 // bfcvtnt z4.h, p2/M, z24.s\n"
+ ".inst 0x658aa925 // bfcvt z5.h, p2/M, z9.s\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x648aabc5 // bfcvtnt z5.h, p2/M, z30.s\n"
+ ".inst 0xc1301030 // bfdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1381050 // bfdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z0.s }, p0/Z, [x17]\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1361031 // bfdot za.s[x8, 1], { z1.h-z4.h }, z6.h\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0x658aaba6 // bfcvt z6.h, p2/M, z29.s\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13e1051 // bfdot za.s[x8, 1], { z2.h-z5.h }, z14.h\n"
+ "mov x12, #0x4\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa815 // bfcvt z21.h, p2/M, z0.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa936 // bfcvt z22.h, p2/M, z9.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1301070 // bfdot za.s[x8, 0], { z3.h-z6.h }, z0.h\n"
+ "subs x24, x24, #0x1\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1acc9b8 // fclamp { z24.s-z27.s }, z13.s, z12.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "st1w { z24.s }, p1, [x15]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z25.s }, p1, [x14]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1301071 // bfdot za.s[x8, 1], { z3.h-z6.h }, z0.h\n"
+ ".inst 0x658aabf7 // bfcvt z23.h, p2/M, z31.s\n"
+ "ld1w { z8.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x658aabd8 // bfcvt z24.h, p2/M, z30.s\n"
+ "ld1w { z4.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0x658aa919 // bfcvt z25.h, p2/M, z8.s\n"
+ "ld1w { z5.s }, p0/Z, [x20]\n"
+ "add x15, x15, x13, LSL #2\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0x648aa955 // bfcvtnt z21.h, p2/M, z10.s\n"
+ ".inst 0x648aabb6 // bfcvtnt z22.h, p2/M, z29.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x648aa9f7 // bfcvtnt z23.h, p2/M, z15.s\n"
+ ".inst 0x648aa9d8 // bfcvtnt z24.h, p2/M, z14.s\n"
+ ".inst 0x648aa899 // bfcvtnt z25.h, p2/M, z4.s\n"
+ ".inst 0x658aa8ba // bfcvt z26.h, p2/M, z5.s\n"
+ "bgt 18b\n"
+ "19:" // Main loop tail
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa0402b00 // ld1h { z0.h-z1.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z5.s }, p0/Z, [x17]\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #3\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z2.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
+ "mov x12, #0x4\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc13112d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z1.h\n"
+ ".inst 0x658aa8bb // bfcvt z27.h, p2/M, z5.s\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x658aa85c // bfcvt z28.h, p2/M, z2.s\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0x648aabbb // bfcvtnt z27.h, p2/M, z29.s\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa83d // bfcvt z29.h, p2/M, z1.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aa83e // bfcvt z30.h, p2/M, z1.s\n"
+ "mov x12, #0x8\n"
+ "ld1w { z31.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa9c // bfcvtnt z28.h, p2/M, z20.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa9dd // bfcvtnt z29.h, p2/M, z14.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x648aabfe // bfcvtnt z30.h, p2/M, z31.s\n"
+ ".inst 0x658aab5f // bfcvt z31.h, p2/M, z26.s\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x648aa93f // bfcvtnt z31.h, p2/M, z9.s\n"
+ ".inst 0xc1321370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z2.h\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ ".inst 0xc13a1390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z10.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x658aab40 // bfcvt z0.h, p2/M, z26.s\n"
+ ".inst 0xc1321371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z2.h\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ ".inst 0xc13a1391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z10.h\n"
+ "st1w { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc13913b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1w { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc13913b1 // bfdot za.s[x8, 1], { z29.h-z0.h }, z9.h\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "20:" // Main loop skip tail
+ "cbz x7, 21f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x17]\n"
+ ".inst 0x658aab3d // bfcvt z29.h, p2/M, z25.s\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab5d // bfcvtnt z29.h, p2/M, z26.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab3e // bfcvt z30.h, p2/M, z25.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab1e // bfcvtnt z30.h, p2/M, z24.s\n"
+ "mov x12, #0x4\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab5f // bfcvt z31.h, p2/M, z26.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa93f // bfcvtnt z31.h, p2/M, z9.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa920 // bfcvt z0.h, p2/M, z9.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab00 // bfcvtnt z0.h, p2/M, z24.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ ".inst 0x658aa921 // bfcvt z1.h, p2/M, z9.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab21 // bfcvtnt z1.h, p2/M, z25.s\n"
+ ".inst 0xc13313b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "addvl x21, SP, #6\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b13d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x20, SP, #12\n"
+ ".inst 0xc13e13b1 // bfdot za.s[x8, 1], { z29.h-z0.h }, z14.h\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab22 // bfcvt z2.h, p2/M, z25.s\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0xc13f13d1 // bfdot za.s[x8, 1], { z30.h-z1.h }, z15.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13e13b2 // bfdot za.s[x8, 2], { z29.h-z0.h }, z14.h\n"
+ ".inst 0xc13713f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z7.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13f13d2 // bfdot za.s[x8, 2], { z30.h-z1.h }, z15.h\n"
+ ".inst 0xc13413f1 // bfdot za.s[x8, 1], { z31.h-z2.h }, z4.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ "st1w { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ ".inst 0xc13913f2 // bfdot za.s[x8, 2], { z31.h-z2.h }, z9.h\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "21:" // Tail input: End
+ "cbz x16, 23f\n"
+ "22:" // Right padding loop
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ "st1w { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "bgt 22b\n"
+ "23:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #15\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..de3eadac8a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
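+// Strategy descriptor for the stride-1, 3x3 planar depthwise kernel: it emits
+// 4 output rows per pass on SME2. The _impl declaration above carries the
+// input/weight pointers, the row/column/VL strides, the padding extents, and
+// the Requantize32 parameters used for the s8 requantization.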
+class sme2_s8q_planar_3x3_s1_4rows_dot_za : public PlanarStrategy<int8_t, int8_t>
+{
+ using Parent = PlanarStrategy<int8_t, int8_t>;
+
+ public:
+ using return_type = int8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_s8q_planar_3x3_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_s8q_planar_3x3_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..845f376926
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,672 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const int8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ int8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
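+  // A 3x3 stride-1 kernel producing 4 output rows consumes 4 + 3 - 1 = 6
+  // input rows per block; rows missing from the valid input region become
+  // bottom padding, hence the clamp to 6 below.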
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
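+  // Layout of the generated assembly: a channel loop (label 1) loads optional
+  // bias and per-channel requantization data (labels 2-4), prefetches the
+  // input (5), handles left padding (6), performs the top-padding priming
+  // loads (8-9 unpadded, 13-14 padded), runs the main column loops (11/16)
+  // and their tail (17), drains right padding (19) and advances pointers (20).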
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-12\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z21.h, p2/M, z21.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z30.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
+ "incw x22\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "mov x20, x22\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
+ "addvl x21, SP, #12\n"
+ "incw x22\n"
+ "addvl x21, x21, #-4\n"
+ "mov x20, x22\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
+ "addvl x21, x21, #-4\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #-4\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x6\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1sb { z20.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #8\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1sb { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x14, %x[ld_in_row]\n"
+ "ld1sb { z25.s }, p1/Z, [x14]\n"
+ "addvl x21, SP, #4\n"
+ "ld1sb { z6.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1sb { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ "ld1sb { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1sb { z2.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
+ "10:" // Unpadded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1sb { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z17.h, z9.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x15, x13\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1sb { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z17.h, z7.h\n"
+ "csel x23, x15, x13, LT\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z7.h, z7.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x22, SP, #4\n"
+ "addvl x21, SP, #8\n"
+ "ld1sb { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1sb { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1sb { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
+ "ld1sb { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add z8.h, z8.h, z21.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #8\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
+ "15:" // Padded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "sub x15, x15, #0x1\n"
+ "sub x13, x13, #0x1\n"
+ "cmp x15, x13\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "csel x23, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "16:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ "mov x12, #0x4\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
+ "subs x23, x23, #0x1\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1sb { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x21, SP, #4\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "18:" // Main loop skip tail
+ "cbz x13, 20f\n"
+ "19:" // Right padding loop
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 19b\n"
+ "20:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #12\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..56fb127aa0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
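+// Strategy descriptor for the stride-2 variant: the same 3x3 window and 4-row
+// output block as the stride-1 kernel, but each pass now spans
+// (4 - 1) * 2 + 3 = 9 input rows.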
+class sme2_s8q_planar_3x3_s2_4rows_dot_za : public PlanarStrategy<int8_t, int8_t>
+{
+ using Parent = PlanarStrategy<int8_t, int8_t>;
+
+ public:
+ using return_type = int8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_s8q_planar_3x3_s2_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_s8q_planar_3x3_s2_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..1d0efc6bc1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,881 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const int8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ int8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
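+  // With stride 2, a 3x3 kernel producing 4 output rows spans
+  // (4 - 1) * 2 + 3 = 9 input rows, hence the clamp to 9 when deriving
+  // pad_bottom below.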
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x9\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-6\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z11.h, p2/M, z11.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "incw x22\n"
+ "mov z24.h, #0x0\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
+ "mov x20, x22\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z2.h, z2.h, z16.h\n"
+ "addvl x21, SP, #6\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z25.h, z25.h, z16.h\n"
+ "incw x22\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "mov x20, x22\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z20.h, z20.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #-2\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x9\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z25.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1sb { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #4\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1sb { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #2\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z5.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1sb { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x2\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z8.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x15, #0x1\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z25.h\n"
+ "cmp x20, x13\n"
+ "ld1sb { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x23, x20, x13, LT\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1sb { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1sb { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1sb { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1sb { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1sb { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1sb { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z24.h\n"
+ "add x27, x27, x25\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "ld1sb { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "addvl x20, SP, #4\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "addvl x20, SP, #2\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ "15:" // Padded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "sub x15, x15, #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
+ "lsr x20, x15, #0x1\n"
+ "cmp x20, x13\n"
+ "mov z25.d, z3.d\n"
+ "csel x22, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 17f\n"
+ "16:" // Padded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x8\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #2\n"
+ "ld1sb { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "mov z25.d, z2.d\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1sb { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
+ "add x8, x8, #0x1\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Main loop skip tail
+ "cbz x15, 19f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z5.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "19:" // Tail input: End
+ "cbz x13, 21f\n"
+ "20:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 20b\n"
+ "21:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #6\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
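
The store-back path that recurs throughout the kernel above — SQDMULH by the multiplier, SRSHL by the shift, ADD of the output offset, then SCLAMP — is standard fixed-point requantization of the int32 accumulators down to int8. A minimal scalar sketch of that arithmetic (an illustration, not the library's implementation), assuming only the per-layer fields of arm_gemm::Requantize32 (per_layer_mul, per_layer_right_shift, c_offset, minval, maxval):

#include <algorithm>
#include <cstdint>
#include "arm_gemm.hpp" // arm_gemm::Requantize32

// SRSHL semantics: a positive shift is a left shift; a negative shift is a
// rounding arithmetic right shift. The kernels rely on the stored shift
// being non-positive, so SRSHL performs the rounding right shift.
static inline int32_t srshl_scalar(int32_t v, int32_t shift)
{
    if (shift >= 0)
        return static_cast<int32_t>(static_cast<uint32_t>(v) << shift);
    const int32_t s = -shift;
    return static_cast<int32_t>((static_cast<int64_t>(v) + (int64_t{1} << (s - 1))) >> s);
}

// SQDMULH semantics: saturating doubling multiply returning the high half,
// i.e. sat((2*a*b) >> 32), equivalently sat((a*b) >> 31). Truncates (no
// rounding), unlike SQRDMULH.
static inline int32_t sqdmulh_scalar(int32_t a, int32_t b)
{
    const int64_t high = (static_cast<int64_t>(a) * static_cast<int64_t>(b)) >> 31;
    return static_cast<int32_t>(std::min<int64_t>(std::max<int64_t>(high, INT32_MIN), INT32_MAX));
}

// One output element: scale, shift, re-centre on the output offset, clamp.
static inline int8_t requantize_scalar(int32_t acc, const arm_gemm::Requantize32 &qp)
{
    int32_t v = sqdmulh_scalar(acc, qp.per_layer_mul);
    v = srshl_scalar(v, qp.per_layer_right_shift);
    v += qp.c_offset;
    return static_cast<int8_t>(std::min(std::max(v, qp.minval), qp.maxval));
}

When per_channel_muls / per_channel_right_shifts are supplied, the prologue loads per-channel vectors in place of the per-layer broadcasts and the same sequence applies lane by lane.
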
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..40fa718266
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_s8q_planar_5x5_s1_4rows_dot_za : public PlanarStrategy<int8_t, int8_t>
+{
+ using Parent = PlanarStrategy<int8_t, int8_t>;
+
+ public:
+ using return_type = int8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_s8q_planar_5x5_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_s8q_planar_5x5_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
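
The header above is the usual thin strategy wrapper: compile-time shape traits (5x5 kernel, stride 1, four output rows per call, SME vector length) plus get_kernel(), which hands back the assembly entry point declared alongside it. A hedged sketch of how a caller could invoke it follows — the driver function and its argument names are placeholders, not the actual dispatch logic in depthwise_planar.hpp:

#include <cstddef>
#include <cstdint>

// Assumes the same headers as the .hpp above (depthwise_planar.hpp brings in
// CPUInfo and PlanarStrategy; arm_gemm.hpp provides Requantize32).
void run_tile_sketch(const CPUInfo *ci,
                     const int8_t *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
                     unsigned int pad_top, unsigned int valid_input_rows,
                     unsigned int pad_left, unsigned int valid_input_cols,
                     const int8_t *weights, int8_t **outptrs,
                     const size_t *outlds, const size_t *outvllds,
                     unsigned int output_cols, unsigned int start_channel,
                     unsigned int valid_channels, const arm_gemm::Requantize32 &qp)
{
    arm_conv::depthwise::sme2_s8q_planar_5x5_s1_4rows_dot_za strat(ci);
    auto kernel = strat.get_kernel(); // the _impl function declared above
    // One call produces up to four output rows across valid_channels channels.
    kernel(inptr, ld_in_row, ld_in_col, ld_in_vl, pad_top, valid_input_rows,
           pad_left, valid_input_cols, weights, outptrs, outlds, outvllds,
           output_cols, start_channel, valid_channels, qp);
}
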
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..bb68733a45
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,1204 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const int8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ int8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x8\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "addvl SP, SP, #-30\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z18.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x23\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "incw x23\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
+ "trn1 z11.h, z15.h, z2.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "incw x23\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
+ "incw x23\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "incw x23\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x7, x23, LSL #22\n"
+ "mov x22, #0x8\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x11, #0x0\n"
+ "mov x8, #0x8\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z23.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x6, x5\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1sb { z1.s }, p1/Z, [x16]\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1sb { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1sb { z2.s }, p1/Z, [x16]\n"
+ "addvl x21, SP, #18\n"
+ "ld1sb { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1sb { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1sb { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1sb { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row]\n"
+ "ld1sb { z2.s }, p1/Z, [x16]\n"
+ "addvl x22, SP, #12\n"
+ "ld1sb { z22.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1sb { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1sb { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1sb { z15.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1sb { z21.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1sb { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row]\n"
+ "ld1sb { z0.s }, p1/Z, [x16]\n"
+ "addvl x23, SP, #6\n"
+ "ld1sb { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1sb { z6.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ "ld1sb { z30.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1sb { z1.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1sb { z25.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1sb { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
+ "12:" // Unpadded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1sb { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1sb { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z26.h, z28.h\n"
+ "sub x15, x15, #0x1\n"
+ "ld1sb { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1sb { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z26.h, z26.h, z17.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1sb { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "sub x15, x15, x25\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "cbz x25, 21f\n"
+ "13:" // Unpadded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #12\n"
+ "ld1sb { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1sb { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1sb { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #24\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "addvl x23, SP, #6\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
+ "19:" // Padded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x24, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "mov x12, #0x4\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1sb { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1sb { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "22:" // Main loop skip tail
+ "cbz x15, 24f\n"
+ "23:" // Right padding loop
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "add x8, x8, #0x2\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 23b\n"
+ "24:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #30\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..8bffc05e1f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_s8q_planar_5x5_s2_4rows_dot_za : public PlanarStrategy<int8_t, int8_t>
+{
+ using Parent = PlanarStrategy<int8_t, int8_t>;
+
+ public:
+ using return_type = int8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_s8q_planar_5x5_s2_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_s8q_planar_5x5_s2_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..3da0d14d74
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,1354 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const int8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ int8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
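+ // A reading of the pad_bottom bound above: a 5x5 kernel at stride 2
+ // producing 4 output rows touches 2*(4-1) + 5 = 11 input rows, so any part
+ // of that 11-row window beyond pad_top + valid_input_rows is bottom padding.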
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0xb\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x3\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x4\n"
+ "addvl SP, SP, #-15\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z7.h, p2/M, z7.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z12.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "incw x22\n"
+ "mov z26.h, #0x0\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "addvl x21, SP, #15\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "sub z27.h, z27.h, z28.h\n"
+ "incw x22\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z14.h, z14.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
+ "incw x22\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z29.h, z11.h, z26.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x22\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "addvl x21, x21, #-3\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #-3\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x7, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x5, x23, LSL #22\n"
+ "mov x22, #0xb\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x17, x4, x20, x17\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ldp x13, x11, [x20], #0x10\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ "sub x16, x16, x21\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x4, x3\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1sb { z27.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #12\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1sb { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1sb { z29.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #9\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1sb { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1sb { z26.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #6\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1sb { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1sb { z29.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #3\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ "ld1sb { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1sb { z0.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1sb { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1sb { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
+ "12:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
+ "sub x7, x7, #0x2\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z25.h\n"
+ "sub x16, x16, #0x1\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x7, #0x1\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z30.h\n"
+ "cmp x20, x16\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x26, x20, x16, LT\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "and x7, x7, #0x1\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ "sub x16, x16, x26\n"
+ "cbz x26, 21f\n"
+ "13:" // Unpadded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "addvl x21, SP, #9\n"
+ "subs x26, x26, #0x1\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1sb { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z24.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1sb { z18.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z25.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1sb { z8.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1sb { z28.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1sb { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1sb { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ "add x9, x9, x27\n"
+ "ld1sb { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, z25.h, z7.h\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1sb { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #6\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #12\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #3\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #9\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "19:" // Padded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
+ "csel x25, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #9\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z30.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "trn1 z27.h, z27.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "mov x12, #0x4\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
+ "add x22, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1sb { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1sb { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1sb { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1sb { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1sb { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z25.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "22:" // Main loop skip tail
+ "cbz x7, 23f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
+ "addvl x21, SP, #6\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
+ "addvl x20, SP, #12\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
+ "add x8, x8, #0x1\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "23:" // Tail input: End
+ "cbz x16, 25f\n"
+ "24:" // Right padding loop
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 24b\n"
+ "25:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #15\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
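
All of the kernels in this patch share the same output stage: accumulators are read out of ZA and requantized with the sqdmulh / srshl / add / sclamp sequence visible above. As a reading aid only, here is a scalar C++ sketch of what that sequence computes, assuming per-layer parameters (the per-channel variants merely reload the multiplier and shift vectors per channel, and the right shift is held as a negative count so that srshl shifts right):

#include <algorithm>
#include <cstdint>

// Saturating doubling multiply-high, as performed by SQDMULH.
static int32_t sqdmulh(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // saturate the one overflow case
    return static_cast<int32_t>((static_cast<int64_t>(a) * b * 2) >> 32);
}

// Rounding shift, as performed by SRSHL: a negative count shifts right with rounding.
static int32_t srshl(int32_t a, int32_t shift)
{
    if (shift >= 0) return a << shift;
    const int s = -shift;
    return static_cast<int32_t>((static_cast<int64_t>(a) + (int64_t{1} << (s - 1))) >> s);
}

// One output element: requantize an int32 accumulator to uint8.
static uint8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                          int32_t c_offset, int32_t minval, int32_t maxval)
{
    const int32_t v = srshl(sqdmulh(acc, mul), shift) + c_offset;       // sqdmulh, srshl, add
    return static_cast<uint8_t>(std::min(std::max(v, minval), maxval)); // sclamp
}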
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..2e40c75d6b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_u8q_planar_3x3_s1_4rows_dot_za : public PlanarStrategy<uint8_t, uint8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, uint8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8q_planar_3x3_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8q_planar_3x3_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
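
The strategy classes declared in these headers carry only compile-time shape metadata (kernel size, stride, output rows, VL type); dispatch to the assembly happens through get_kernel(). A hypothetical illustration of how the framework consumes one (everything except the class name is a placeholder, not ACL API):

// Hypothetical usage sketch; 'cpu_info' is an assumed CPUInfo pointer.
sme2_u8q_planar_3x3_s1_4rows_dot_za strat(cpu_info);
auto *kernel = strat.get_kernel();  // == sme2_u8q_planar_3x3_s1_4rows_dot_za_impl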
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..60c3a1e632
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,664 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const uint8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
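+ // Arithmetic of the initializer above: with 4 output rows and a 3x3 stride-1 filter,
+ // each pass consumes output_rows + kernel_rows - 1 = 6 input rows, hence the 6u;
+ // pad_bottom is however many of those rows fall beyond the last valid input row.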
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-12\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z21.h, p2/M, z21.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z30.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1b { z10.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
+ "incw x22\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "mov x20, x22\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
+ "addvl x21, SP, #12\n"
+ "incw x22\n"
+ "addvl x21, x21, #-4\n"
+ "mov x20, x22\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1b { z4.s }, p2/Z, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
+ "addvl x21, x21, #-4\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #-4\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x6\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x14, %x[ld_in_row]\n"
+ "ld1b { z25.s }, p1/Z, [x14]\n"
+ "addvl x21, SP, #4\n"
+ "ld1b { z6.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
+ "10:" // Unpadded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z17.h, z9.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x15, x13\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1b { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z17.h, z7.h\n"
+ "csel x23, x15, x13, LT\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z7.h, z7.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x22, SP, #4\n"
+ "addvl x21, SP, #8\n"
+ "ld1b { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add z8.h, z8.h, z21.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #8\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
+ "15:" // Padded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "sub x15, x15, #0x1\n"
+ "sub x13, x13, #0x1\n"
+ "cmp x15, x13\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "csel x23, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "16:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ "mov x12, #0x4\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
+ "subs x23, x23, #0x1\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1b { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x21, SP, #4\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "18:" // Main loop skip tail
+ "cbz x13, 20f\n"
+ "19:" // Right padding loop
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 19b\n"
+ "20:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #12\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..f852e12de1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_u8q_planar_3x3_s2_4rows_dot_za : public PlanarStrategy<uint8_t, uint8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, uint8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8q_planar_3x3_s2_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8q_planar_3x3_s2_4rows_dot_za_impl;
+ }
+};
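+
+// Hypothetical usage sketch (illustration only, not part of this patch): the
+// planar depthwise framework is expected to construct the strategy from a
+// const CPUInfo* and invoke the kernel it returns, along the lines of
+//
+//   sme2_u8q_planar_3x3_s2_4rows_dot_za strategy(cpu_info);
+//   auto kernel = strategy.get_kernel();
+//   kernel(inptr, ld_in_row, ld_in_col, ld_in_vl,
+//          pad_top, valid_input_rows, pad_left, valid_input_cols,
+//          weights, outptrs, outlds, outvllds,
+//          output_cols, start_channel, valid_channels, qp);
+//
+// Here cpu_info and the argument names are placeholders mirroring the
+// declaration of sme2_u8q_planar_3x3_s2_4rows_dot_za_impl above.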
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..e4ce6c74fb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,881 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const uint8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
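+ // Note: with 4 output rows at stride 2 from a 3x3 kernel, each pass reads a
+ // (4 - 1) * 2 + 3 = 9 row input window, hence the 9u above: pad_bottom is
+ // the part of that window not covered by pad_top + valid_input_rows.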
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x9\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-6\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z11.h, p2/M, z11.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1b { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "incw x22\n"
+ "mov z24.h, #0x0\n"
+ "ld1b { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1b { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
+ "mov x20, x22\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z2.h, z2.h, z16.h\n"
+ "addvl x21, SP, #6\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z25.h, z25.h, z16.h\n"
+ "incw x22\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "mov x20, x22\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1b { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z20.h, z20.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #-2\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x9\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z25.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #4\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x2\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z8.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x15, #0x1\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z25.h\n"
+ "cmp x20, x13\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x23, x20, x13, LT\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z24.h\n"
+ "add x27, x27, x25\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "addvl x20, SP, #4\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "addvl x20, SP, #2\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ "15:" // Padded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "sub x15, x15, #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
+ "lsr x20, x15, #0x1\n"
+ "cmp x20, x13\n"
+ "mov z25.d, z3.d\n"
+ "csel x22, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 17f\n"
+ "16:" // Padded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x8\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "mov z25.d, z2.d\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
+ "add x8, x8, #0x1\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Main loop skip tail
+ "cbz x15, 19f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "19:" // Tail input: End
+ "cbz x13, 21f\n"
+ "20:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 20b\n"
+ "21:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #6\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..d8b87dcd55
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_u8q_planar_5x5_s1_4rows_dot_za : public PlanarStrategy<uint8_t, uint8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, uint8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8q_planar_5x5_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8q_planar_5x5_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..d33ef764ef
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,1204 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const uint8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
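+ // Note: same window arithmetic as the stride-2 kernel above, now with
+ // (4 - 1) * 1 + 5 = 8 input rows per pass, hence the 8u used to derive
+ // pad_bottom.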
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x8\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "addvl SP, SP, #-30\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z18.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x23\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "incw x23\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
+ "trn1 z11.h, z15.h, z2.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1b { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1b { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "incw x23\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
+ "incw x23\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1b { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1b { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1b { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "incw x23\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1b { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1b { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x7, x23, LSL #22\n"
+ "mov x22, #0x8\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x11, #0x0\n"
+ "mov x8, #0x8\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z23.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x6, x5\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x16]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1b { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z22.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row]\n"
+ "ld1b { z0.s }, p1/Z, [x16]\n"
+ "addvl x23, SP, #6\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z6.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z30.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z25.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
+ "12:" // Unpadded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1b { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z26.h, z28.h\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z26.h, z26.h, z17.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "sub x15, x15, x25\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "cbz x25, 21f\n"
+ "13:" // Unpadded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #12\n"
+ "ld1b { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #24\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x23, SP, #6\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
+ "19:" // Padded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x24, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "mov x12, #0x4\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1b { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "22:" // Main loop skip tail
+ "cbz x15, 24f\n"
+ "23:" // Right padding loop
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "add x8, x8, #0x2\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 23b\n"
+ "24:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #30\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..05aad19c09
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_u8q_planar_5x5_s2_4rows_dot_za : public PlanarStrategy<uint8_t, uint8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, uint8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8q_planar_5x5_s2_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8q_planar_5x5_s2_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..6c144afa77
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,1354 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
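+  // The inline assembly below reads this struct directly; the
+  // offsetof_Args_* operands in the asm operand list hand it each
+  // field's byte offset, so the layout here must stay in sync.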
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const uint8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
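+  // The pad_bottom term above follows from the strip height: a 5x5
+  // stride-2 kernel producing 4 output rows consumes
+  // (4 - 1) * 2 + 5 = 11 input rows, so whatever pad_top +
+  // valid_input_rows leaves short of 11 is bottom padding.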
+
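+  // Rough shape of the generated code: per-channel weight/bias setup,
+  // input prefetches, priming loads covering the top padding, a main
+  // loop emitting four output rows per iteration, then right-padding
+  // and pointer-update tails. Each accumulator group is requantized
+  // (sqdmulh/srshl, add c_offset, sclamp to [minval, maxval]) before
+  // being stored with st1b.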
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0xb\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x3\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x4\n"
+ "addvl SP, SP, #-15\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z7.h, p2/M, z7.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z12.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "incw x22\n"
+ "mov z26.h, #0x0\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "addvl x21, SP, #15\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "sub z27.h, z27.h, z28.h\n"
+ "incw x22\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z14.h, z14.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "ld1b { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1b { z30.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
+ "incw x22\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1b { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z29.h, z11.h, z26.h\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x22\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1b { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1b { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "addvl x21, x21, #-3\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #-3\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x7, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x5, x23, LSL #22\n"
+ "mov x22, #0xb\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x17, x4, x20, x17\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ldp x13, x11, [x20], #0x10\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ "sub x16, x16, x21\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x4, x3\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z27.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #9\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #6\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #3\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
+ "12:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ "sub x7, x7, #0x2\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z25.h\n"
+ "sub x16, x16, #0x1\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x7, #0x1\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z30.h\n"
+ "cmp x20, x16\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x26, x20, x16, LT\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "and x7, x7, #0x1\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ "sub x16, x16, x26\n"
+ "cbz x26, 21f\n"
+ "13:" // Unpadded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "addvl x21, SP, #9\n"
+ "subs x26, x26, #0x1\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1b { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z24.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1b { z18.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z25.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1b { z8.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1b { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ "add x9, x9, x27\n"
+ "ld1b { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, z25.h, z7.h\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #6\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #12\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #3\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #9\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "19:" // Padded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
+ "csel x25, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #9\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "trn1 z27.h, z27.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "mov x12, #0x4\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
+ "add x22, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1b { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1b { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z25.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "22:" // Main loop skip tail
+ "cbz x7, 23f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
+ "addvl x21, SP, #6\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
+ "addvl x20, SP, #12\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
+ "add x8, x8, #0x1\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "23:" // Tail input: End
+ "cbz x16, 25f\n"
+ "24:" // Right padding loop
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 24b\n"
+ "25:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #15\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..a4345097b5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
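+// Strategy descriptor for the planar depthwise framework: uint8_t input,
+// int8_t weights, requantised uint8_t output, 3x3 kernel at stride 1,
+// producing four output rows per pass via the SME2 ZA array ("dot" variant).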
+class sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za : public PlanarStrategy<uint8_t, int8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, int8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..612beb342a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,664 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
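+ // pad_bottom above is 6u - min(6u, pad_top + valid_input_rows): at stride 1
+ // the 3x3 kernel consumes six input rows per block of four output rows, so
+ // any rows of that window not covered by top padding or valid input must be
+ // padded at the bottom.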
+
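+ // The generated assembly below subtracts the quantisation offsets on load,
+ // widens to 16 bits, accumulates with sdot into the ZA array, and then
+ // requantises each result tile (sqdmulh, srshl, offset add, sclamp) before
+ // storing output rows with st1b.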
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-12\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z21.h, p2/M, z21.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z30.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
+ "incw x22\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "mov x20, x22\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
+ "addvl x21, SP, #12\n"
+ "incw x22\n"
+ "addvl x21, x21, #-4\n"
+ "mov x20, x22\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
+ "addvl x21, x21, #-4\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #-4\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x6\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x14, %x[ld_in_row]\n"
+ "ld1b { z25.s }, p1/Z, [x14]\n"
+ "addvl x21, SP, #4\n"
+ "ld1b { z6.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
+ "10:" // Unpadded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z17.h, z9.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x15, x13\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1b { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z17.h, z7.h\n"
+ "csel x23, x15, x13, LT\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z7.h, z7.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x22, SP, #4\n"
+ "addvl x21, SP, #8\n"
+ "ld1b { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add z8.h, z8.h, z21.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #8\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
+ "15:" // Padded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "sub x15, x15, #0x1\n"
+ "sub x13, x13, #0x1\n"
+ "cmp x15, x13\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "csel x23, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "16:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ "mov x12, #0x4\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
+ "subs x23, x23, #0x1\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1b { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x21, SP, #4\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "18:" // Main loop skip tail
+ "cbz x13, 20f\n"
+ "19:" // Right padding loop
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 19b\n"
+ "20:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #12\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..104c11fc9d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
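+// Thin strategy class: records the kernel geometry (3x3, stride 2, 4 output rows) and
+// hands the planar depthwise framework the assembly implementation declared above.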
+class sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za : public PlanarStrategy<uint8_t, int8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, int8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..8ce04fb8c2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,881 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
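+ // The kernel consumes a 9-row input window per block of 4 output rows ((4 - 1) * 2 + 3 = 9), so pad_bottom below is however much of that window falls past pad_top + valid_input_rows.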
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
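+ // Each pass of the channel loop (label 1) handles one vector's worth of channels. Columns run through an unpadded fast path (labels 8-11) when there is no vertical padding, or a predicated padded path (labels 12-16); both funnel into the shared tail at label 17, with left and right padding columns emitted separately.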
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x9\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-6\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z11.h, p2/M, z11.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "incw x22\n"
+ "mov z24.h, #0x0\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
+ "mov x20, x22\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z2.h, z2.h, z16.h\n"
+ "addvl x21, SP, #6\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z25.h, z25.h, z16.h\n"
+ "incw x22\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "mov x20, x22\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z20.h, z20.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #-2\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x9\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z25.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #4\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x2\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z8.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x15, #0x1\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z25.h\n"
+ "cmp x20, x13\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x23, x20, x13, LT\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z24.h\n"
+ "add x27, x27, x25\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "addvl x20, SP, #4\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "addvl x20, SP, #2\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ "15:" // Padded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "sub x15, x15, #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
+ "lsr x20, x15, #0x1\n"
+ "cmp x20, x13\n"
+ "mov z25.d, z3.d\n"
+ "csel x22, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 17f\n"
+ "16:" // Padded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x8\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "mov z25.d, z2.d\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
+ "add x8, x8, #0x1\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Main loop skip tail
+ "cbz x15, 19f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "19:" // Tail input: End
+ "cbz x13, 21f\n"
+ "20:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 20b\n"
+ "21:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #6\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..52173b8551
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za : public PlanarStrategy<uint8_t, int8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, int8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..64023eeaff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,1204 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
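+ // Same Args layout as the 3x3 kernel; here the window is 8 input rows per 4-row output block ((4 - 1) * 1 + 5 = 8).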
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x8\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "addvl SP, SP, #-30\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z18.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x23\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "incw x23\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
+ "trn1 z11.h, z15.h, z2.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "incw x23\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
+ "incw x23\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "incw x23\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x7, x23, LSL #22\n"
+ "mov x22, #0x8\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x11, #0x0\n"
+ "mov x8, #0x8\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z23.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x6, x5\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x16]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1b { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z22.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row]\n"
+ "ld1b { z0.s }, p1/Z, [x16]\n"
+ "addvl x23, SP, #6\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z6.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z30.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z25.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
+ "12:" // Unpadded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1b { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z26.h, z28.h\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z26.h, z26.h, z17.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "sub x15, x15, x25\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "cbz x25, 21f\n"
+ "13:" // Unpadded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #12\n"
+ "ld1b { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #24\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x23, SP, #6\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
+ "19:" // Padded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x24, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "mov x12, #0x4\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1b { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "22:" // Main loop skip tail
+ "cbz x15, 24f\n"
+ "23:" // Right padding loop
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "add x8, x8, #0x2\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 23b\n"
+ "24:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #30\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
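(Note on the `pad_bottom` field initialised in `Args` above: it is derived rather than passed in. With 4 output rows, a 5-row kernel and stride 1, each invocation touches a fixed 8-row input window — hence the `mov x20, #0x8` at the top of the assembly — and `8u - std::min(8u, pad_top + valid_input_rows)` is whatever remains of that window once top padding and valid input rows are accounted for. A minimal standalone sketch of that arithmetic, assuming only standard C++; the helper name is illustrative and not part of the library:

    #include <algorithm>
    #include <cassert>

    // Rows of the fixed 8-row input window (4 output rows, 5-tap kernel,
    // stride 1) that fall below the last valid input row and therefore
    // need implicit padding at the bottom. Mirrors the Args initialiser
    // in the stride-1 kernel above; not library API.
    static unsigned pad_bottom_s1(unsigned pad_top, unsigned valid_input_rows)
    {
        return 8u - std::min(8u, pad_top + valid_input_rows);
    }

    int main()
    {
        assert(pad_bottom_s1(0, 8) == 0); // window fully covered by valid rows
        assert(pad_bottom_s1(2, 4) == 2); // 2 pad + 4 valid rows leave 2 bottom pad rows
        assert(pad_bottom_s1(0, 3) == 5); // short input: 5 rows of bottom padding
    }
)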
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..ad82070912
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za : public PlanarStrategy<uint8_t, int8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, int8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
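(The header above carries only compile-time shape traits and a `get_kernel()` accessor returning the assembly entry point; the actual work lives in the matching generic.cpp. One consequence of the traits worth spelling out: the number of input rows one call consumes follows from output rows, stride and kernel height, which is where the 8-row (stride 1) and 11-row (stride 2) windows used by these kernels come from — compare the `11u` in the `Args` initialiser and the `mov x20, #0xb` in the stride-2 implementation that follows. A small sketch under that assumption, using only standard C++; the helper is illustrative, not library API:

    // Input rows consumed per invocation of a planar depthwise kernel:
    // (output_rows - 1) * stride_rows + kernel_rows.
    constexpr unsigned input_rows(unsigned output_rows,
                                  unsigned stride_rows,
                                  unsigned kernel_rows)
    {
        return (output_rows - 1u) * stride_rows + kernel_rows;
    }

    // Traits declared in the class above: 4 output rows, 5x5 kernel.
    static_assert(input_rows(4u, 2u, 5u) == 11u, "stride-2 variant reads an 11-row window");
    static_assert(input_rows(4u, 1u, 5u) == 8u,  "stride-1 variant reads an 8-row window");
)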
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..d8dc69127e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,1354 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
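+ // The kernel consumes an 11-row input window per 4-row output block
+ // ((4 - 1) * 2 + 5 for a 5x5 kernel at stride 2), so whatever part of that
+ // window is not top padding or valid input must be bottom padding.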
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
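+ // Register roles: z12-z15 seed the ZA accumulators with the bias; z6/z4 hold
+ // the requantize multiplier and shift (per-layer, or per-channel when given);
+ // z10 the output offset; z5/z21 the clamp bounds; z7 the negated input offset.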
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0xb\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x3\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x4\n"
+ "addvl SP, SP, #-15\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z7.h, p2/M, z7.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z12.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "incw x22\n"
+ "mov z26.h, #0x0\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "addvl x21, SP, #15\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "sub z27.h, z27.h, z28.h\n"
+ "incw x22\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z14.h, z14.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
+ "incw x22\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z29.h, z11.h, z26.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x22\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "addvl x21, x21, #-3\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #-3\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x7, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x5, x23, LSL #22\n"
+ "mov x22, #0xb\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x17, x4, x20, x17\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ldp x13, x11, [x20], #0x10\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ "sub x16, x16, x21\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x4, x3\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z27.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #9\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #6\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #3\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
+ "12:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ "sub x7, x7, #0x2\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z25.h\n"
+ "sub x16, x16, #0x1\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x7, #0x1\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z30.h\n"
+ "cmp x20, x16\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x26, x20, x16, LT\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "and x7, x7, #0x1\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ "sub x16, x16, x26\n"
+ "cbz x26, 21f\n"
+ "13:" // Unpadded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "addvl x21, SP, #9\n"
+ "subs x26, x26, #0x1\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1b { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z24.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1b { z18.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z25.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1b { z8.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1b { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ "add x9, x9, x27\n"
+ "ld1b { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, z25.h, z7.h\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #6\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #12\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #3\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #9\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "19:" // Padded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
+ "csel x25, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #9\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "trn1 z27.h, z27.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "mov x12, #0x4\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
+ "add x22, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1b { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1b { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z25.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "22:" // Main loop skip tail
+ "cbz x7, 23f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
+ "addvl x21, SP, #6\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
+ "addvl x20, SP, #12\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
+ "add x8, x8, #0x1\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "23:" // Tail input: End
+ "cbz x16, 25f\n"
+ "24:" // Right padding loop
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 24b\n"
+ "25:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #15\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..edee21e941
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
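+// Depth-first strategy wrapper: exposes the indirect (pointer-array) and direct (dense-tile) kernels declared above.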
+class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..d807856ccb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
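+ // Outer loop (label 1) walks the grid of 2x2 output tiles, updating tile_i/
+ // tile_j in the Args block; the inner channel loop (label 2) processes one
+ // SVE vector of channels per iteration under whilelt predication, builds the
+ // four accumulators with predicated fmla against the nine 3x3 weights, and
+ // clamps with fmax/fmin to [min, max] before the st1h stores.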
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x10, #0x0\n"
+ "mov x14, #0x0\n"
+ "1:" // Tile loop
+ "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x10, x23\n" // offset = tile_i * ld_input_row
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x10, x22\n" // offset = tile_i * ld_output_row
+ "cnth x11\n"
+ "madd x21, x14, x13, x21\n" // offset += tile_j * ld_input_col
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "ld1h { z27.h }, p3/Z, [x10]\n"
+ "add x27, x13, x13\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x9, x9, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "add x26, x9, x23, LSL #1\n"
+ "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "add x25, x26, x23, LSL #1\n"
+ "add x24, x27, x13\n"
+ "ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "add x28, x28, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cmp x11, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #1\n"
+ "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "add x22, x28, x22, LSL #1\n"
+ "mov x21, #0x0\n"
+ "ld1h { z8.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x26, x13, LSL #1]\n"
+ "sub x20, XZR, x11\n"
+ "ld1h { z10.h }, p2/Z, [x9]\n"
+ "ld1h { z11.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "addvl x10, x10, #-6\n"
+ "ld1h { z12.h }, p2/Z, [x26, x27, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x25, x13, LSL #1]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "whilelt p1.h, x11, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "inch x11\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "mov p0.b, p2.b\n"
+ "ld1h { z27.h }, p3/Z, [x10]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, x13, LSL #1]\n"
+ "inch x20\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x24, LSL #1]\n"
+ "addvl x26, x26, #1\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25]\n"
+ "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "addvl x25, x25, #1\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "ld1h { z13.h }, p1/Z, [x25, x13, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "ld1h { z9.h }, p1/Z, [x26, x13, LSL #1]\n"
+ "cmp x11, %x[n_channels]\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "ld1h { z10.h }, p1/Z, [x9]\n"
+ "ld1h { z11.h }, p1/Z, [x9, x24, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "ld1h { z12.h }, p1/Z, [x26, x27, LSL #1]\n"
+ "st1h { z24.h }, p0, [x28]\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z23.h }, p0, [x28, x12, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "addvl x28, x28, #1\n"
+ "ld1h { z8.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "addvl x10, x10, #-6\n"
+ "st1h { z21.h }, p0, [x22, x12, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x20\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, x13, LSL #1]\n"
+ "add x21, x10, #0x1\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "csel x10, x10, x21, LT\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26]\n"
+ "csel x14, x14, XZR, LT\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x24, LSL #1]\n"
+ "cmp x10, x20\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "st1h { z24.h }, p0, [x28]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z23.h }, p0, [x28, x12, LSL #1]\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "st1h { z21.h }, p0, [x22, x12, LSL #1]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..90982b6990
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[16];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
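+ // Re-order the 16 input pointers (the 4x4 input patch a 3x3 kernel needs
+ // to produce a 2x2 output at stride 1) into the order in which the
+ // assembly consumes them.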
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
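+ // No tile loop here: each iteration of the channel loop gathers one SVE
+ // vector of channels through the re-ordered pointer table at
+ // offsetof_Args_inptrs, accumulates with predicated fmla and clamps to
+ // [min, max] before storing through the four output pointers.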
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cnth x14\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+ "sub x28, XZR, x14\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "addvl x16, x16, #-6\n"
+ "ld1h { z10.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x50]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "ldr x20, [x15, #0x60]\n"
+ "ldr x27, [x15, #0x68]\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x26, [x15, #0x70]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x25, [x15, #0x78]\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "whilelt p1.h, x14, %x[n_channels]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1h { z13.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "inch x28\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "mov p0.b, p2.b\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "inch x9\n"
+ "ld1h { z11.h }, p1/Z, [x22, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "ld1h { z9.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z12.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "inch x14\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+ "whilelt p2.h, x9, %x[n_channels]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x50]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "inch x28\n"
+ "mov p0.b, p2.b\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..da2ef72a30
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
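+// Same wrapper pattern as the 2x2-output strategy, specialised for a 3x3
+// output tile per invocation.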
+class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..a22ab39d6f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
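+ // Same tile/channel loop structure as the 2x2-output variant, widened to a
+ // 3x3 output tile: nine fmla accumulators are clamped to [min, max] and
+ // stored through three output-row pointers.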
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x13, #0x0\n"
+ "mov x8, #0x0\n"
+ "1:" // Tile loop
+ "str x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x3\n"
+ "mov x24, #0x3\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x13, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x15\n"
+ "mul x20, x13, x21\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x12, x17, x17\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x10, x14, x23, LSL #1\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x10, x23, LSL #1\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z14.h }, p3/Z, [x13]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
+ "add x28, x9, x23, LSL #1\n"
+ "ld1h { z2.h }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x13, #4, MUL VL]\n"
+ "add x27, x12, x17\n"
+ "add x11, x11, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z4.h }, p3/Z, [x13, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x13, #6, MUL VL]\n"
+ "add x26, x28, x23, LSL #1\n"
+ "add x25, x27, x17\n"
+ "ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "add x24, x11, x21, LSL #1\n"
+ "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cmp x15, %x[n_channels]\n"
+ "add x23, x24, x21, LSL #1\n"
+ "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
+ "add x22, x16, x16\n"
+ "mov x21, #0x0\n"
+ "ld1h { z8.h }, p3/Z, [x13, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x9, x12, LSL #1]\n"
+ "sub x20, XZR, x15\n"
+ "ld1h { z10.h }, p2/Z, [x14]\n"
+ "ld1h { z11.h }, p2/Z, [x14, x25, LSL #1]\n"
+ "addvl x13, x13, #-6\n"
+ "ld1h { z12.h }, p2/Z, [x26]\n"
+ "ld1h { z13.h }, p2/Z, [x10, x12, LSL #1]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z29, z14\n fmla z29.h, p3/M, z7.h, z9.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x15, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
+ "inch x15\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+ "inch x20\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z23.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z6.h, z18.h\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "fmla z25.h, p3/M, z1.h, z13.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z22.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x25, LSL #1]\n"
+ "movprfx z20, z14\n fmla z20.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z14.h }, p3/Z, [x13]\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z18.h\n"
+ "fmla z20.h, p3/M, z0.h, z18.h\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z3.h, z18.h\n"
+ "fmla z22.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x10]\n"
+ "fmla z29.h, p3/M, z2.h, z16.h\n"
+ "fmla z27.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x28]\n"
+ "fmla z24.h, p3/M, z4.h, z23.h\n"
+ "fmla z28.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x25, LSL #1]\n"
+ "fmla z20.h, p3/M, z2.h, z23.h\n"
+ "fmla z21.h, p3/M, z1.h, z23.h\n"
+ "fmla z29.h, p3/M, z8.h, z23.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "fmla z25.h, p3/M, z5.h, z23.h\n"
+ "fmla z26.h, p3/M, z0.h, z19.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z20.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z3.h, z17.h\n"
+ "fmla z28.h, p3/M, z3.h, z19.h\n"
+ "fmla z27.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x25, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z18.h\n"
+ "fmla z25.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x10, x17, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z19.h\n"
+ "fmla z20.h, p3/M, z6.h, z16.h\n"
+ "fmla z26.h, p3/M, z8.h, z17.h\n"
+ "fmla z22.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x27, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z18.h\n"
+ "fmla z25.h, p3/M, z0.h, z18.h\n"
+ "fmla z24.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x27, LSL #1]\n"
+ "fmla z20.h, p3/M, z8.h, z17.h\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.h, p3/M, z7.h, z17.h\n"
+ "fmla z28.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z18.h\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x17, LSL #1]\n"
+ "addvl x28, x28, #1\n"
+ "fmla z27.h, p3/M, z4.h, z16.h\n"
+ "fmla z25.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "addvl x14, x14, #1\n"
+ "fmla z20.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z19.h\n"
+ "ld1h { z4.h }, p3/Z, [x13, #5, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x14]\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "fmla z25.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x9]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z29.h, p3/M, z1.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z31.h\n"
+ "ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
+ "fmla z27.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, x25, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z19.h\n"
+ "addvl x9, x9, #1\n"
+ "fmla z20.h, p3/M, z5.h, z19.h\n"
+ "fmla z22.h, p3/M, z0.h, z18.h\n"
+ "ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "fmin z29.h, p3/M, z29.h, z30.h\n"
+ "fmla z21.h, p3/M, z2.h, z17.h\n"
+ "fmla z25.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x12, LSL #1]\n"
+ "fmax z25.h, p3/M, z25.h, z31.h\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z18.h\n"
+ "fmax z28.h, p3/M, z28.h, z31.h\n"
+ "fmax z26.h, p3/M, z26.h, z31.h\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "fmax z27.h, p3/M, z27.h, z31.h\n"
+ "fmax z24.h, p3/M, z24.h, z31.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z20.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z31.h\n"
+ "fmax z20.h, p3/M, z20.h, z31.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmax z21.h, p3/M, z21.h, z31.h\n"
+ "addvl x26, x26, #1\n"
+ "ld1h { z2.h }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x13, #4, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x13, #6, MUL VL]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "fmin z28.h, p3/M, z28.h, z30.h\n"
+ "ld1h { z9.h }, p1/Z, [x9, x12, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z30.h\n"
+ "fmin z26.h, p3/M, z26.h, z30.h\n"
+ "ld1h { z11.h }, p1/Z, [x14, x25, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x26]\n"
+ "fmin z25.h, p3/M, z25.h, z30.h\n"
+ "fmin z24.h, p3/M, z24.h, z30.h\n"
+ "ld1h { z13.h }, p1/Z, [x10, x12, LSL #1]\n"
+ "st1h { z28.h }, p0, [x11]\n"
+ "fmin z22.h, p3/M, z22.h, z30.h\n"
+ "fmin z20.h, p3/M, z20.h, z30.h\n"
+ "st1h { z29.h }, p0, [x11, x16, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
+ "fmin z21.h, p3/M, z21.h, z30.h\n"
+ "st1h { z27.h }, p0, [x11, x22, LSL #1]\n"
+ "addvl x11, x11, #1\n"
+ "ld1h { z8.h }, p3/Z, [x13, #-7, MUL VL]\n"
+ "st1h { z26.h }, p0, [x24]\n"
+ "addvl x13, x13, #-6\n"
+ "st1h { z25.h }, p0, [x24, x16, LSL #1]\n"
+ "st1h { z24.h }, p0, [x24, x22, LSL #1]\n"
+ "addvl x24, x24, #1\n"
+ "st1h { z22.h }, p0, [x23]\n"
+ "st1h { z20.h }, p0, [x23, x16, LSL #1]\n"
+ "st1h { z21.h }, p0, [x23, x22, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z29, z14\n fmla z29.h, p3/M, z7.h, z9.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z8.h, z9.h\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x8, x8, #0x1\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+ "cmp x8, x20\n"
+ "add x21, x13, #0x1\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z23.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "csel x13, x13, x21, LT\n"
+ "fmla z29.h, p3/M, z6.h, z18.h\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "mov p0.b, p2.b\n"
+ "csel x8, x8, XZR, LT\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
+ "cmp x13, x20\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "fmla z25.h, p3/M, z1.h, z13.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z22.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x25, LSL #1]\n"
+ "movprfx z20, z14\n fmla z20.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z18.h\n"
+ "fmla z20.h, p3/M, z0.h, z18.h\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z3.h, z18.h\n"
+ "fmla z22.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x10]\n"
+ "fmla z29.h, p3/M, z2.h, z16.h\n"
+ "fmla z27.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x28]\n"
+ "fmla z24.h, p3/M, z4.h, z23.h\n"
+ "fmla z28.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x25, LSL #1]\n"
+ "fmla z20.h, p3/M, z2.h, z23.h\n"
+ "fmla z21.h, p3/M, z1.h, z23.h\n"
+ "fmla z29.h, p3/M, z8.h, z23.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "fmla z25.h, p3/M, z5.h, z23.h\n"
+ "fmla z26.h, p3/M, z0.h, z19.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z20.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z3.h, z17.h\n"
+ "fmla z28.h, p3/M, z3.h, z19.h\n"
+ "fmla z27.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x25, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z18.h\n"
+ "fmla z25.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x10, x17, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z19.h\n"
+ "fmla z20.h, p3/M, z6.h, z16.h\n"
+ "fmla z26.h, p3/M, z8.h, z17.h\n"
+ "fmla z22.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x27, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z18.h\n"
+ "fmla z25.h, p3/M, z0.h, z18.h\n"
+ "fmla z24.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x27, LSL #1]\n"
+ "fmla z20.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z7.h, z17.h\n"
+ "fmla z28.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z18.h\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z16.h\n"
+ "fmla z25.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z20.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z19.h\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "fmla z25.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x9]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z29.h, p3/M, z1.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z31.h\n"
+ "fmin z29.h, p3/M, z29.h, z30.h\n"
+ "fmla z27.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, x25, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z19.h\n"
+ "fmla z20.h, p3/M, z5.h, z19.h\n"
+ "fmla z22.h, p3/M, z0.h, z18.h\n"
+ "fmla z21.h, p3/M, z2.h, z17.h\n"
+ "fmla z25.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x12, LSL #1]\n"
+ "fmax z25.h, p3/M, z25.h, z31.h\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z18.h\n"
+ "fmax z28.h, p3/M, z28.h, z31.h\n"
+ "fmax z26.h, p3/M, z26.h, z31.h\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "fmax z27.h, p3/M, z27.h, z31.h\n"
+ "fmax z24.h, p3/M, z24.h, z31.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z20.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z31.h\n"
+ "fmax z20.h, p3/M, z20.h, z31.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmax z21.h, p3/M, z21.h, z31.h\n"
+ "fmin z28.h, p3/M, z28.h, z30.h\n"
+ "st1h { z28.h }, p0, [x11]\n"
+ "fmin z27.h, p3/M, z27.h, z30.h\n"
+ "fmin z26.h, p3/M, z26.h, z30.h\n"
+ "st1h { z29.h }, p0, [x11, x16, LSL #1]\n"
+ "fmin z25.h, p3/M, z25.h, z30.h\n"
+ "fmin z24.h, p3/M, z24.h, z30.h\n"
+ "st1h { z27.h }, p0, [x11, x22, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z30.h\n"
+ "fmin z20.h, p3/M, z20.h, z30.h\n"
+ "st1h { z26.h }, p0, [x24]\n"
+ "fmin z21.h, p3/M, z21.h, z30.h\n"
+ "st1h { z25.h }, p0, [x24, x16, LSL #1]\n"
+ "st1h { z24.h }, p0, [x24, x22, LSL #1]\n"
+ "st1h { z22.h }, p0, [x23]\n"
+ "st1h { z20.h }, p0, [x23, x16, LSL #1]\n"
+ "st1h { z21.h }, p0, [x23, x22, LSL #1]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..4f8368acd5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
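+ // Re-order the 25 input pointers (the 5x5 input patch a 3x3 kernel needs
+ // to produce a 3x3 output at stride 1) into the order in which the
+ // assembly consumes them.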
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
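+ // Channel-loop-only variant for the 3x3 output tile: nine accumulators are
+ // built from the re-ordered pointer table, clamped to [min, max] and
+ // written through the nine pointers in the outptrs array.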
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1h { z14.h }, p3/Z, [x8]\n"
+ "cnth x16\n"
+ "mov x15, #0x0\n"
+ "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
+ "sub x14, XZR, x16\n"
+ "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "ldr x20, [x17, #0x20]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "addvl x8, x8, #-6\n"
+ "ld1h { z10.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z29, z14\n fmla z29.h, p3/M, z8.h, z9.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z7.h, z9.h\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z10.h\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "movprfx z23, z14\n fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.h, p3/M, z5.h, z13.h\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z0.h, z9.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
+ "inch x14\n"
+ "mov p1.b, p2.b\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "fmla z25.h, p3/M, z1.h, z13.h\n"
+ "ldr x10, [x13, #0x0]\n"
+ "whilelt p0.h, x16, %x[n_channels]\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z7.h, z18.h\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.h, p3/M, z0.h, z17.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z3.h, z18.h\n"
+ "ldr x9, [x13, #0x8]\n"
+ "ldr x28, [x13, #0x10]\n"
+ "fmla z21.h, p3/M, z0.h, z18.h\n"
+ "fmla z24.h, p3/M, z4.h, z19.h\n"
+ "ldr x27, [x13, #0x18]\n"
+ "ld1h { z14.h }, p3/Z, [x8]\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "fmla z29.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z27.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.h, p3/M, z5.h, z19.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.h, p3/M, z0.h, z20.h\n"
+ "fmla z24.h, p3/M, z2.h, z17.h\n"
+ "fmla z28.h, p3/M, z8.h, z19.h\n"
+ "fmla z27.h, p3/M, z7.h, z19.h\n"
+ "fmla z22.h, p3/M, z1.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.h, p3/M, z6.h, z16.h\n"
+ "fmla z25.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z4.h, z18.h\n"
+ "fmla z29.h, p3/M, z3.h, z20.h\n"
+ "fmla z27.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z18.h\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.h, p3/M, z8.h, z18.h\n"
+ "fmla z24.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmla z28.h, p3/M, z3.h, z19.h\n"
+ "fmla z25.h, p3/M, z0.h, z19.h\n"
+ "fmla z22.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z19.h\n"
+ "fmla z26.h, p3/M, z1.h, z19.h\n"
+ "fmla z28.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z17.h\n"
+ "fmla z25.h, p3/M, z2.h, z17.h\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x17, #0x20]\n"
+ "fmla z22.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z17.h\n"
+ "fmla z26.h, p3/M, z7.h, z16.h\n"
+ "fmla z25.h, p3/M, z6.h, z16.h\n"
+ "fmla z23.h, p3/M, z4.h, z16.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z18.h\n"
+ "fmla z28.h, p3/M, z1.h, z17.h\n"
+ "fmax z28.h, p3/M, z28.h, z31.h\n"
+ "fmin z28.h, p3/M, z28.h, z30.h\n"
+ "fmla z27.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z6.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z31.h\n"
+ "fmla z24.h, p3/M, z7.h, z18.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmin z29.h, p3/M, z29.h, z30.h\n"
+ "st1h { z29.h }, p1, [x10, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "fmla z22.h, p3/M, z2.h, z17.h\n"
+ "ldr x24, [x13, #0x20]\n"
+ "st1h { z28.h }, p1, [x9, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "fmax z27.h, p3/M, z27.h, z31.h\n"
+ "fmla z23.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z26.h, p3/M, z26.h, z31.h\n"
+ "fmax z25.h, p3/M, z25.h, z31.h\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
+ "inch x15\n"
+ "ld1h { z9.h }, p0/Z, [x23, x16, LSL #1]\n"
+ "ld1h { z10.h }, p0/Z, [x22, x16, LSL #1]\n"
+ "ld1h { z11.h }, p0/Z, [x21, x16, LSL #1]\n"
+ "ld1h { z12.h }, p0/Z, [x20, x16, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z30.h\n"
+ "fmin z26.h, p3/M, z26.h, z30.h\n"
+ "ld1h { z13.h }, p0/Z, [x25, x16, LSL #1]\n"
+ "inch x16\n"
+ "fmin z25.h, p3/M, z25.h, z30.h\n"
+ "st1h { z27.h }, p1, [x28, x14, LSL #1]\n"
+ "fmax z24.h, p3/M, z24.h, z31.h\n"
+ "fmax z23.h, p3/M, z23.h, z31.h\n"
+ "st1h { z26.h }, p1, [x27, x14, LSL #1]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z21.h, p3/M, z21.h, z31.h\n"
+ "fmax z22.h, p3/M, z22.h, z31.h\n"
+ "st1h { z25.h }, p1, [x24, x14, LSL #1]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmin z24.h, p3/M, z24.h, z30.h\n"
+ "fmin z23.h, p3/M, z23.h, z30.h\n"
+ "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
+ "fmin z21.h, p3/M, z21.h, z30.h\n"
+ "fmin z22.h, p3/M, z22.h, z30.h\n"
+ "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
+ "st1h { z24.h }, p1, [x23, x14, LSL #1]\n"
+ "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "st1h { z23.h }, p1, [x22, x14, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
+ "st1h { z21.h }, p1, [x21, x14, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
+ "addvl x8, x8, #-6\n"
+ "st1h { z22.h }, p1, [x20, x14, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z29, z14\n fmla z29.h, p3/M, z8.h, z9.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z7.h, z9.h\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z10.h\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "movprfx z23, z14\n fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.h, p3/M, z5.h, z13.h\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z0.h, z9.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
+ "inch x14\n"
+ "mov p0.b, p2.b\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "fmla z25.h, p3/M, z1.h, z13.h\n"
+ "ldr x10, [x13, #0x0]\n"
+ "ldr x9, [x13, #0x8]\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z7.h, z18.h\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.h, p3/M, z0.h, z17.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z3.h, z18.h\n"
+ "ldr x28, [x13, #0x10]\n"
+ "ldr x27, [x13, #0x18]\n"
+ "fmla z21.h, p3/M, z0.h, z18.h\n"
+ "fmla z24.h, p3/M, z4.h, z19.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "fmla z29.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z27.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.h, p3/M, z5.h, z19.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.h, p3/M, z0.h, z20.h\n"
+ "fmla z24.h, p3/M, z2.h, z17.h\n"
+ "fmla z28.h, p3/M, z8.h, z19.h\n"
+ "fmla z27.h, p3/M, z7.h, z19.h\n"
+ "fmla z22.h, p3/M, z1.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.h, p3/M, z6.h, z16.h\n"
+ "fmla z25.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z4.h, z18.h\n"
+ "fmla z29.h, p3/M, z3.h, z20.h\n"
+ "fmla z27.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z18.h\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.h, p3/M, z8.h, z18.h\n"
+ "fmla z24.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmla z28.h, p3/M, z3.h, z19.h\n"
+ "fmla z25.h, p3/M, z0.h, z19.h\n"
+ "fmla z22.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z19.h\n"
+ "fmla z26.h, p3/M, z1.h, z19.h\n"
+ "fmla z28.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z17.h\n"
+ "fmla z25.h, p3/M, z2.h, z17.h\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z17.h\n"
+ "fmla z26.h, p3/M, z7.h, z16.h\n"
+ "fmla z25.h, p3/M, z6.h, z16.h\n"
+ "fmla z23.h, p3/M, z4.h, z16.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z18.h\n"
+ "fmla z28.h, p3/M, z1.h, z17.h\n"
+ "fmax z28.h, p3/M, z28.h, z31.h\n"
+ "fmin z28.h, p3/M, z28.h, z30.h\n"
+ "fmla z27.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z6.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z31.h\n"
+ "fmla z24.h, p3/M, z7.h, z18.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmin z29.h, p3/M, z29.h, z30.h\n"
+ "st1h { z29.h }, p0, [x10, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "fmla z22.h, p3/M, z2.h, z17.h\n"
+ "ldr x20, [x13, #0x20]\n"
+ "st1h { z28.h }, p0, [x9, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmax z26.h, p3/M, z26.h, z31.h\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "fmax z27.h, p3/M, z27.h, z31.h\n"
+ "fmax z25.h, p3/M, z25.h, z31.h\n"
+ "fmla z23.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmin z27.h, p3/M, z27.h, z30.h\n"
+ "fmin z26.h, p3/M, z26.h, z30.h\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
+ "fmin z25.h, p3/M, z25.h, z30.h\n"
+ "fmax z24.h, p3/M, z24.h, z31.h\n"
+ "st1h { z27.h }, p0, [x28, x14, LSL #1]\n"
+ "fmax z23.h, p3/M, z23.h, z31.h\n"
+ "fmax z21.h, p3/M, z21.h, z31.h\n"
+ "st1h { z26.h }, p0, [x27, x14, LSL #1]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z22.h, p3/M, z22.h, z31.h\n"
+ "st1h { z25.h }, p0, [x20, x14, LSL #1]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmin z24.h, p3/M, z24.h, z30.h\n"
+ "fmin z23.h, p3/M, z23.h, z30.h\n"
+ "st1h { z24.h }, p0, [x23, x14, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z30.h\n"
+ "fmin z22.h, p3/M, z22.h, z30.h\n"
+ "st1h { z23.h }, p0, [x22, x14, LSL #1]\n"
+ "st1h { z21.h }, p0, [x21, x14, LSL #1]\n"
+ "st1h { z22.h }, p0, [x20, x14, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
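For orientation: per SVE lane, the kernel above computes a bias-initialised 3x3 multiply-accumulate over fp16 channels and then clamps with the activation bounds (in this tail the accumulators are initialised from z14 via movprfx, and the clamp is fmax against z31 followed by fmin against z30). Below is a minimal scalar sketch of that per-channel computation, using float for portability and a hypothetical helper name; depthwise_3x3_point is illustrative, not an ACL function.

#include <algorithm>
#include <cstddef>

// Scalar model of one output point per channel: bias + 3x3 MLA, then a
// min/max activation clamp. Illustrative only; the kernel above vectorises
// this across channels in __fp16 SVE lanes.
static float depthwise_3x3_point(const float inp[9],     // 3x3 input patch, row-major
                                 const float weights[9], // 3x3 kernel weights
                                 float bias, float act_min, float act_max)
{
    float acc = bias;                   // movprfx z<acc>, z14
    for (std::size_t i = 0; i < 9; ++i)
    {
        acc += weights[i] * inp[i];     // one fmla per weight/input pair
    }
    acc = std::max(acc, act_min);       // fmax against z31 (activation minimum)
    return std::min(acc, act_max);      // fmin against z30 (activation maximum)
}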
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..af5ee740c9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
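As with the other generated kernels, the header registers this variant through a thin strategy class: compile-time geometry (3x3 kernel, stride 1, 4x4 output tile, SVE vector length) plus the two entry points, a direct kernel for dense interior tiles and an indirect kernel driven by a pointer table for padded or ragged edges. A simplified sketch of that split follows; StrategySketch and the function-pointer aliases are hypothetical stand-ins, not the real DepthwiseDepthfirstStrategy interface.

// Hypothetical, pared-down view of the strategy pattern used above.
using DirectFn   = void (*)(/* tile grid, base pointers, strides, ... */);
using IndirectFn = void (*)(/* per-point input pointer table, ... */);

struct StrategySketch
{
    static constexpr unsigned int kernel_rows = 3, kernel_cols = 3;
    static constexpr unsigned int stride_rows = 1, stride_cols = 1;
    static constexpr unsigned int output_rows = 4, output_cols = 4;

    DirectFn   direct;   // uniform strides: fast path for in-bounds tiles
    IndirectFn indirect; // pointer table: edges, padding, gather cases
};

A dispatcher holding such a strategy would choose per tile: the direct kernel where the whole 6x6 input patch (4 outputs plus a 2-element halo in each dimension) is in bounds, the indirect one elsewhere.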
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..41eaa4f18c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x16, #0x0\n"
+ "mov x4, #0x0\n"
+ "1:" // Tile loop
+ "str x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x24, #0x4\n"
+ "str x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x16, x23\n" // offset = tile_i * ld_input_row
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x16, x22\n" // offset = tile_i * ld_output_row
+ "add x7, x5, x5\n"
+ "madd x21, x4, x5, x21\n" // offset += tile_j * ld_input_col
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "cnth x16\n"
+ "madd x20, x4, x6, x20\n" // offset += tile_j * ld_output_col
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x14, x7, x5\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x13, x8, x23, LSL #1\n"
+ "ld1h { z19.h }, p3/Z, [x17]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "add x12, x13, x23, LSL #1\n"
+ "add x15, x15, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "add x11, x12, x23, LSL #1\n"
+ "add x10, x14, x5\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "add x9, x15, x22, LSL #1\n"
+ "add x28, x11, x23, LSL #1\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "add x27, x10, x5\n"
+ "add x26, x9, x22, LSL #1\n"
+ "add x25, x6, x6\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cmp x16, %x[n_channels]\n"
+ "add x24, x28, x23, LSL #1\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "add x23, x26, x22, LSL #1\n"
+ "add x22, x25, x6\n"
+ "ld1h { z9.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x8]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x16\n"
+ "ld1h { z11.h }, p2/Z, [x8, x27, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "addvl x17, x17, #-6\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z14, z19\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x16, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z21, z19\n fmla z21.h, p3/M, z3.h, z9.h\n"
+ "movprfx z22, z19\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "inch x16\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z20, z19\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "inch x20\n"
+ "movprfx z13, z19\n fmla z13.h, p3/M, z7.h, z9.h\n"
+ "movprfx z17, z19\n fmla z17.h, p3/M, z6.h, z9.h\n"
+ "movprfx z27, z19\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "movprfx z18, z19\n fmla z18.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z29.h }, p2/Z, [x24]\n"
+ "ld1h { z11.h }, p2/Z, [x24, x27, LSL #1]\n"
+ "fmla z21.h, p3/M, z4.h, z12.h\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "movprfx z23, z19\n fmla z23.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "fmla z14.h, p3/M, z7.h, z9.h\n"
+ "fmla z13.h, p3/M, z8.h, z12.h\n"
+ "fmla z17.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z6.h, z12.h\n"
+ "movprfx z26, z19\n fmla z26.h, p3/M, z3.h, z12.h\n"
+ "movprfx z28, z19\n fmla z28.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "movprfx z24, z19\n fmla z24.h, p3/M, z8.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x10, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z9.h\n"
+ "fmla z20.h, p3/M, z3.h, z9.h\n"
+ "movprfx z25, z19\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "movprfx z29, z19\n fmla z29.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z19.h }, p3/Z, [x17]\n"
+ "fmla z27.h, p3/M, z8.h, z9.h\n"
+ "fmla z18.h, p3/M, z5.h, z9.h\n"
+ "fmla z23.h, p3/M, z2.h, z9.h\n"
+ "fmla z14.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z13.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28]\n"
+ "fmla z21.h, p3/M, z7.h, z10.h\n"
+ "fmla z26.h, p3/M, z6.h, z10.h\n"
+ "fmla z22.h, p3/M, z5.h, z10.h\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z10.h\n"
+ "fmla z25.h, p3/M, z2.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z10.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z27.h, p3/M, z0.h, z9.h\n"
+ "fmla z18.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "fmla z14.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z13.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z0.h, z10.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z10.h\n"
+ "fmla z14.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z13.h, p3/M, z5.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z4.h, z9.h\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z21.h, p3/M, z1.h, z9.h\n"
+ "fmla z26.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "fmla z14.h, p3/M, z3.h, z11.h\n"
+ "fmla z18.h, p3/M, z1.h, z11.h\n"
+ "fmla z22.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "fmla z13.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z7.h, z10.h\n"
+ "fmla z21.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "fmla z20.h, p3/M, z2.h, z10.h\n"
+ "fmla z28.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x14, LSL #1]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z27.h, p3/M, z7.h, z12.h\n"
+ "fmla z14.h, p3/M, z6.h, z12.h\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmla z22.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z1.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x10, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "fmla z13.h, p3/M, z1.h, z9.h\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x12]\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z27.h, p3/M, z3.h, z9.h\n"
+ "fmla z18.h, p3/M, z0.h, z9.h\n"
+ "fmla z21.h, p3/M, z8.h, z12.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z24.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z13.h, p3/M, z2.h, z11.h\n"
+ "fmla z17.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z31.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x11]\n"
+ "fmla z25.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z9.h }, p1/Z, [x12, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z18.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z11.h\n"
+ "fmla z25.h, p3/M, z7.h, z12.h\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "fmla z18.h, p3/M, z8.h, z10.h\n"
+ "fmla z22.h, p3/M, z7.h, z10.h\n"
+ "fmla z20.h, p3/M, z6.h, z10.h\n"
+ "fmla z23.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z10.h\n"
+ "fmla z20.h, p3/M, z7.h, z10.h\n"
+ "addvl x24, x24, #1\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x10, LSL #1]\n"
+ "addvl x13, x13, #1\n"
+ "fmla z29.h, p3/M, z7.h, z11.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "fmla z13.h, p3/M, z3.h, z12.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmax z13.h, p3/M, z13.h, z15.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "fmla z14.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z0.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmla z17.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmla z21.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z10.h\n"
+ "fmax z14.h, p3/M, z14.h, z15.h\n"
+ "fmax z21.h, p3/M, z21.h, z15.h\n"
+ "fmla z18.h, p3/M, z7.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z11.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "fmla z23.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z3.h, z11.h\n"
+ "fmax z22.h, p3/M, z22.h, z15.h\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "fmla z20.h, p3/M, z8.h, z0.h\n"
+ "fmla z28.h, p3/M, z7.h, z0.h\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmla z29.h, p3/M, z5.h, z0.h\n"
+ "fmla z24.h, p3/M, z4.h, z0.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "fmin z13.h, p3/M, z13.h, z16.h\n"
+ "fmin z17.h, p3/M, z17.h, z16.h\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "ld1h { z10.h }, p1/Z, [x8]\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "fmin z14.h, p3/M, z14.h, z16.h\n"
+ "ld1h { z11.h }, p1/Z, [x8, x27, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x12, x14, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z31.h }, p0, [x15]\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "fmin z18.h, p3/M, z18.h, z16.h\n"
+ "fmin z22.h, p3/M, z22.h, z16.h\n"
+ "st1h { z13.h }, p0, [x15, x6, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "fmin z20.h, p3/M, z20.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z17.h }, p0, [x15, x25, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "st1h { z30.h }, p0, [x15, x22, LSL #1]\n"
+ "fmin z29.h, p3/M, z29.h, z16.h\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z27.h }, p0, [x9]\n"
+ "addvl x28, x28, #1\n"
+ "st1h { z14.h }, p0, [x9, x6, LSL #1]\n"
+ "addvl x15, x15, #1\n"
+ "st1h { z21.h }, p0, [x9, x25, LSL #1]\n"
+ "addvl x17, x17, #-6\n"
+ "st1h { z26.h }, p0, [x9, x22, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "st1h { z18.h }, p0, [x26]\n"
+ "st1h { z22.h }, p0, [x26, x6, LSL #1]\n"
+ "st1h { z20.h }, p0, [x26, x25, LSL #1]\n"
+ "st1h { z28.h }, p0, [x26, x22, LSL #1]\n"
+ "addvl x26, x26, #1\n"
+ "st1h { z23.h }, p0, [x23]\n"
+ "st1h { z25.h }, p0, [x23, x6, LSL #1]\n"
+ "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z24.h }, p0, [x23, x22, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z14, z19\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z3.h, z9.h\n"
+ "movprfx z13, z19\n fmla z13.h, p3/M, z1.h, z9.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x4, x4, #0x1\n"
+ "movprfx z20, z19\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "cmp x4, x20\n"
+ "add x21, x16, #0x1\n"
+ "movprfx z18, z19\n fmla z18.h, p3/M, z7.h, z9.h\n"
+ "movprfx z28, z19\n fmla z28.h, p3/M, z6.h, z9.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x16, x16, x21, LT\n"
+ "movprfx z17, z19\n fmla z17.h, p3/M, z5.h, z9.h\n"
+ "movprfx z26, z19\n fmla z26.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "movprfx z27, z19\n fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z29.h }, p2/Z, [x24]\n"
+ "ld1h { z21.h }, p2/Z, [x24, x27, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z13.h, p3/M, z2.h, z12.h\n"
+ "csel x4, x4, XZR, LT\n"
+ "cmp x16, x20\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "movprfx z10, z19\n fmla z10.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "fmla z14.h, p3/M, z7.h, z9.h\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "movprfx z11, z19\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z25, z19\n fmla z25.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z22.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "movprfx z24, z19\n fmla z24.h, p3/M, z8.h, z21.h\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z21.h }, p2/Z, [x8, x10, LSL #1]\n"
+ "fmla z13.h, p3/M, z4.h, z9.h\n"
+ "fmla z20.h, p3/M, z3.h, z9.h\n"
+ "movprfx z12, z19\n fmla z12.h, p3/M, z1.h, z9.h\n"
+ "movprfx z23, z19\n fmla z23.h, p3/M, z0.h, z9.h\n"
+ "fmla z17.h, p3/M, z8.h, z9.h\n"
+ "fmla z26.h, p3/M, z5.h, z9.h\n"
+ "fmla z10.h, p3/M, z2.h, z9.h\n"
+ "fmla z14.h, p3/M, z8.h, z29.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "fmla z31.h, p3/M, z1.h, z22.h\n"
+ "fmla z18.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x13, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z21.h\n"
+ "fmla z27.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28]\n"
+ "fmla z30.h, p3/M, z7.h, z29.h\n"
+ "fmla z11.h, p3/M, z6.h, z29.h\n"
+ "fmla z13.h, p3/M, z5.h, z29.h\n"
+ "fmla z20.h, p3/M, z4.h, z29.h\n"
+ "fmla z25.h, p3/M, z3.h, z29.h\n"
+ "fmla z12.h, p3/M, z2.h, z29.h\n"
+ "fmla z23.h, p3/M, z1.h, z29.h\n"
+ "fmla z24.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "fmla z26.h, p3/M, z6.h, z19.h\n"
+ "fmla z10.h, p3/M, z3.h, z19.h\n"
+ "fmla z14.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "fmla z27.h, p3/M, z5.h, z22.h\n"
+ "fmla z11.h, p3/M, z2.h, z22.h\n"
+ "fmla z18.h, p3/M, z4.h, z21.h\n"
+ "ld1h { z29.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z21.h\n"
+ "fmla z30.h, p3/M, z0.h, z21.h\n"
+ "fmla z25.h, p3/M, z8.h, z19.h\n"
+ "fmla z24.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z21.h\n"
+ "fmla z14.h, p3/M, z2.h, z29.h\n"
+ "fmla z31.h, p3/M, z5.h, z21.h\n"
+ "fmla z18.h, p3/M, z5.h, z29.h\n"
+ "ld1h { z22.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z4.h, z29.h\n"
+ "fmla z27.h, p3/M, z3.h, z29.h\n"
+ "fmla z30.h, p3/M, z1.h, z29.h\n"
+ "fmla z11.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "fmla z10.h, p3/M, z7.h, z19.h\n"
+ "fmla z12.h, p3/M, z6.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z17.h, p3/M, z4.h, z22.h\n"
+ "fmla z14.h, p3/M, z3.h, z22.h\n"
+ "fmla z26.h, p3/M, z1.h, z22.h\n"
+ "fmla z13.h, p3/M, z0.h, z22.h\n"
+ "fmla z31.h, p3/M, z7.h, z22.h\n"
+ "fmla z18.h, p3/M, z6.h, z22.h\n"
+ "ld1h { z29.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z23.h, p3/M, z8.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z21.h\n"
+ "fmla z27.h, p3/M, z7.h, z21.h\n"
+ "fmla z30.h, p3/M, z5.h, z21.h\n"
+ "fmla z11.h, p3/M, z4.h, z21.h\n"
+ "fmla z20.h, p3/M, z2.h, z21.h\n"
+ "fmla z25.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z22.h }, p2/Z, [x8, x14, LSL #1]\n"
+ "fmla z17.h, p3/M, z7.h, z19.h\n"
+ "fmla z14.h, p3/M, z6.h, z19.h\n"
+ "fmla z26.h, p3/M, z4.h, z19.h\n"
+ "fmla z13.h, p3/M, z3.h, z19.h\n"
+ "fmla z10.h, p3/M, z1.h, z19.h\n"
+ "fmla z12.h, p3/M, z0.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x11, x10, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z29.h\n"
+ "fmla z18.h, p3/M, z1.h, z29.h\n"
+ "fmla z28.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x12]\n"
+ "fmla z23.h, p3/M, z2.h, z21.h\n"
+ "fmla z27.h, p3/M, z0.h, z22.h\n"
+ "fmla z17.h, p3/M, z3.h, z29.h\n"
+ "fmla z26.h, p3/M, z0.h, z29.h\n"
+ "fmla z30.h, p3/M, z8.h, z21.h\n"
+ "fmla z11.h, p3/M, z7.h, z21.h\n"
+ "fmla z20.h, p3/M, z5.h, z21.h\n"
+ "fmla z25.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z18.h, p3/M, z2.h, z22.h\n"
+ "fmla z28.h, p3/M, z1.h, z22.h\n"
+ "ld1h { z21.h }, p2/Z, [x12, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x11]\n"
+ "fmla z12.h, p3/M, z4.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z19.h\n"
+ "fmla z27.h, p3/M, z8.h, z21.h\n"
+ "fmla z11.h, p3/M, z5.h, z21.h\n"
+ "fmla z25.h, p3/M, z2.h, z21.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x27, LSL #1]\n"
+ "fmla z17.h, p3/M, z6.h, z29.h\n"
+ "fmla z26.h, p3/M, z3.h, z29.h\n"
+ "fmla z10.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z22.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z9.h\n"
+ "fmla z12.h, p3/M, z7.h, z22.h\n"
+ "fmla z23.h, p3/M, z6.h, z22.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "fmla z13.h, p3/M, z7.h, z19.h\n"
+ "fmla z20.h, p3/M, z6.h, z19.h\n"
+ "fmla z10.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z5.h, z9.h\n"
+ "fmla z12.h, p3/M, z5.h, z21.h\n"
+ "fmla z23.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z3.h, z21.h\n"
+ "fmla z11.h, p3/M, z8.h, z9.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z10.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z13.h, p3/M, z8.h, z21.h\n"
+ "fmla z20.h, p3/M, z7.h, z21.h\n"
+ "fmla z25.h, p3/M, z6.h, z21.h\n"
+ "fmla z12.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z29.h }, p2/Z, [x13, x10, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z19.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z22.h\n"
+ "fmla z18.h, p3/M, z3.h, z22.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "fmla z17.h, p3/M, z1.h, z22.h\n"
+ "fmla z14.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z9.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "fmla z28.h, p3/M, z5.h, z29.h\n"
+ "fmla z27.h, p3/M, z4.h, z29.h\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmla z30.h, p3/M, z2.h, z29.h\n"
+ "fmla z11.h, p3/M, z1.h, z29.h\n"
+ "fmax z14.h, p3/M, z14.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmla z26.h, p3/M, z7.h, z21.h\n"
+ "fmla z13.h, p3/M, z6.h, z21.h\n"
+ "fmax z11.h, p3/M, z11.h, z15.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmla z10.h, p3/M, z4.h, z21.h\n"
+ "fmla z12.h, p3/M, z3.h, z21.h\n"
+ "fmax z13.h, p3/M, z13.h, z15.h\n"
+ "fmax z10.h, p3/M, z10.h, z15.h\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z25.h, p3/M, z7.h, z9.h\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "fmla z23.h, p3/M, z5.h, z9.h\n"
+ "fmla z24.h, p3/M, z4.h, z9.h\n"
+ "fmax z12.h, p3/M, z12.h, z15.h\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z31.h }, p0, [x15]\n"
+ "fmin z18.h, p3/M, z18.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z18.h }, p0, [x15, x6, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "fmin z17.h, p3/M, z17.h, z16.h\n"
+ "st1h { z28.h }, p0, [x15, x25, LSL #1]\n"
+ "fmin z14.h, p3/M, z14.h, z16.h\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "st1h { z27.h }, p0, [x15, x22, LSL #1]\n"
+ "fmin z11.h, p3/M, z11.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z17.h }, p0, [x9]\n"
+ "fmin z13.h, p3/M, z13.h, z16.h\n"
+ "fmin z20.h, p3/M, z20.h, z16.h\n"
+ "st1h { z14.h }, p0, [x9, x6, LSL #1]\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "fmin z10.h, p3/M, z10.h, z16.h\n"
+ "st1h { z30.h }, p0, [x9, x25, LSL #1]\n"
+ "fmin z12.h, p3/M, z12.h, z16.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "st1h { z11.h }, p0, [x9, x22, LSL #1]\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z26.h }, p0, [x26]\n"
+ "st1h { z13.h }, p0, [x26, x6, LSL #1]\n"
+ "st1h { z20.h }, p0, [x26, x25, LSL #1]\n"
+ "st1h { z25.h }, p0, [x26, x22, LSL #1]\n"
+ "st1h { z10.h }, p0, [x23]\n"
+ "st1h { z12.h }, p0, [x23, x6, LSL #1]\n"
+ "st1h { z23.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z24.h }, p0, [x23, x22, LSL #1]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
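Before the indirect variant: the tile loop above derives its input base pointer exactly as the inline assembly comments state. The same arithmetic in C++, assuming an AArch64 toolchain where __fp16 is available; tile_input_ptr is an illustrative name, not a library function.

#include <cstdint>

// Mirrors the "// offset = ..." comments in the tile loop. Strides are in
// elements; the assembly converts to bytes with the LSL #1 on the final add.
const __fp16 *tile_input_ptr(const __fp16 *inptr, uint64_t tile_i, uint64_t tile_j,
                             uint64_t ld_input_row, uint64_t ld_input_col)
{
    uint64_t offset = tile_i * ld_input_row; // mul  x21, x16, x23
    offset += tile_j * ld_input_col;         // madd x21, x4, x5, x21
    offset *= 4;                             // mul  x21, x21, x25  (stride * 4x4 tile)
    return inptr + offset;                   // add  x8, x8, x21, LSL #1
}

The output base pointer is advanced the same way using ld_output_row/ld_output_col and the output tile size.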
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..c0be293cd7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1h { z17.h }, p3/Z, [x7]\n"
+ "cnth x17\n"
+ "mov x16, #0x0\n"
+ "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n"
+ "sub x15, XZR, x17\n"
+ "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "ldp x23, x22, [x8, #0x0]\n"
+ "ldp x21, x20, [x8, #0x10]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "ld1h { z9.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z20, z17\n fmla z20.h, p3/M, z4.h, z9.h\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "movprfx z22, z17\n fmla z22.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z14, z17\n fmla z14.h, p3/M, z5.h, z9.h\n"
+ "movprfx z23, z17\n fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z25.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z26.h, p3/M, z0.h, z10.h\n"
+ "movprfx z9, z17\n fmla z9.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z28.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z22.h, p3/M, z8.h, z12.h\n"
+ "inch x15\n"
+ "mov p1.b, p2.b\n"
+ "fmla z27.h, p3/M, z7.h, z12.h\n"
+ "movprfx z15, z17\n fmla z15.h, p3/M, z6.h, z28.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z20.h, p3/M, z7.h, z25.h\n"
+ "fmla z9.h, p3/M, z6.h, z12.h\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z13, z17\n fmla z13.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z21.h\n"
+ "fmla z24.h, p3/M, z6.h, z25.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.h, p3/M, z4.h, z25.h\n"
+ "fmla z31.h, p3/M, z3.h, z25.h\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z18, z17\n fmla z18.h, p3/M, z1.h, z25.h\n"
+ "movprfx z21, z17\n fmla z21.h, p3/M, z0.h, z25.h\n"
+ "whilelt p0.h, x17, %x[n_channels]\n"
+ "ld1h { z17.h }, p3/Z, [x7]\n"
+ "fmla z14.h, p3/M, z8.h, z25.h\n"
+ "fmla z23.h, p3/M, z5.h, z25.h\n"
+ "fmla z15.h, p3/M, z2.h, z25.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z22.h, p3/M, z0.h, z12.h\n"
+ "fmla z27.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z20.h, p3/M, z8.h, z10.h\n"
+ "fmla z9.h, p3/M, z1.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z24.h, p3/M, z7.h, z10.h\n"
+ "fmla z11.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z13.h, p3/M, z3.h, z10.h\n"
+ "fmla z18.h, p3/M, z2.h, z10.h\n"
+ "fmla z21.h, p3/M, z1.h, z10.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z26.h, p3/M, z3.h, z25.h\n"
+ "fmla z14.h, p3/M, z0.h, z25.h\n"
+ "fmla z23.h, p3/M, z6.h, z29.h\n"
+ "fmla z15.h, p3/M, z3.h, z29.h\n"
+ "ld1h { z25.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z22.h, p3/M, z4.h, z10.h\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
+ "fmla z20.h, p3/M, z1.h, z10.h\n"
+ "fmla z9.h, p3/M, z5.h, z12.h\n"
+ "fmla z11.h, p3/M, z2.h, z12.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z13.h, p3/M, z8.h, z25.h\n"
+ "fmla z28.h, p3/M, z5.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z26.h, p3/M, z5.h, z10.h\n"
+ "fmla z14.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z29.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z27.h, p3/M, z4.h, z12.h\n"
+ "fmla z20.h, p3/M, z2.h, z12.h\n"
+ "fmla z9.h, p3/M, z3.h, z12.h\n"
+ "fmla z24.h, p3/M, z1.h, z12.h\n"
+ "fmla z11.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z15.h, p3/M, z7.h, z25.h\n"
+ "fmla z18.h, p3/M, z6.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z26.h, p3/M, z7.h, z29.h\n"
+ "fmla z22.h, p3/M, z6.h, z29.h\n"
+ "fmla z14.h, p3/M, z4.h, z29.h\n"
+ "fmla z20.h, p3/M, z3.h, z29.h\n"
+ "fmla z23.h, p3/M, z1.h, z29.h\n"
+ "fmla z30.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z27.h, p3/M, z8.h, z10.h\n"
+ "fmla z21.h, p3/M, z8.h, z25.h\n"
+ "fmla z28.h, p3/M, z7.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z1.h, z10.h\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.h, p3/M, z7.h, z10.h\n"
+ "fmla z24.h, p3/M, z5.h, z10.h\n"
+ "fmla z11.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z26.h, p3/M, z2.h, z29.h\n"
+ "fmla z22.h, p3/M, z1.h, z29.h\n"
+ "fmla z27.h, p3/M, z0.h, z29.h\n"
+ "fmla z14.h, p3/M, z7.h, z25.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z20.h, p3/M, z6.h, z25.h\n"
+ "fmla z23.h, p3/M, z4.h, z25.h\n"
+ "fmla z30.h, p3/M, z3.h, z25.h\n"
+ "fmla z15.h, p3/M, z1.h, z25.h\n"
+ "fmla z18.h, p3/M, z0.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z4.h, z25.h\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z21.h, p3/M, z2.h, z25.h\n"
+ "fmla z22.h, p3/M, z2.h, z10.h\n"
+ "fmla z27.h, p3/M, z1.h, z10.h\n"
+ "fmla z9.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z26.h, p3/M, z6.h, z29.h\n"
+ "fmla z14.h, p3/M, z3.h, z29.h\n"
+ "fmla z23.h, p3/M, z0.h, z29.h\n"
+ "fmla z24.h, p3/M, z8.h, z25.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.h, p3/M, z7.h, z25.h\n"
+ "fmla z31.h, p3/M, z5.h, z25.h\n"
+ "fmla z28.h, p3/M, z1.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z2.h, z12.h\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z15.h, p3/M, z0.h, z10.h\n"
+ "fmla z18.h, p3/M, z4.h, z25.h\n"
+ "fmla z21.h, p3/M, z3.h, z25.h\n"
+ "fmla z9.h, p3/M, z8.h, z12.h\n"
+ "fmla z11.h, p3/M, z5.h, z12.h\n"
+ "fmla z14.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z25.h\n"
+ "fmla z31.h, p3/M, z6.h, z25.h\n"
+ "fmla z15.h, p3/M, z5.h, z25.h\n"
+ "fmla z13.h, p3/M, z5.h, z12.h\n"
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
+ "fmla z18.h, p3/M, z7.h, z29.h\n"
+ "fmla z21.h, p3/M, z6.h, z29.h\n"
+ "fmla z23.h, p3/M, z8.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z15.h, p3/M, z8.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z25.h\n"
+ "fmla z31.h, p3/M, z7.h, z25.h\n"
+ "fmla z13.h, p3/M, z6.h, z25.h\n"
+ "fmla z18.h, p3/M, z5.h, z25.h\n"
+ "fmla z21.h, p3/M, z4.h, z25.h\n"
+ "fmla z28.h, p3/M, z3.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldp x27, x26, [x8, #0x0]\n"
+ "fmla z11.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z29.h\n"
+ "fmax z26.h, p3/M, z26.h, z16.h\n"
+ "fmla z22.h, p3/M, z3.h, z29.h\n"
+ "fmla z27.h, p3/M, z5.h, z25.h\n"
+ "fmax z22.h, p3/M, z22.h, z16.h\n"
+ "fmax z27.h, p3/M, z27.h, z16.h\n"
+ "fmla z9.h, p3/M, z4.h, z25.h\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmax z9.h, p3/M, z9.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z19.h\n"
+ "fmla z21.h, p3/M, z7.h, z12.h\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z19.h\n"
+ "fmla z14.h, p3/M, z1.h, z29.h\n"
+ "fmla z20.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z19.h\n"
+ "fmla z24.h, p3/M, z2.h, z25.h\n"
+ "fmla z11.h, p3/M, z1.h, z25.h\n"
+ "fmin z9.h, p3/M, z9.h, z19.h\n"
+ "fmax z14.h, p3/M, z14.h, z16.h\n"
+ "fmla z23.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmax z20.h, p3/M, z20.h, z16.h\n"
+ "fmax z24.h, p3/M, z24.h, z16.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "fmla z13.h, p3/M, z7.h, z12.h\n"
+ "fmax z11.h, p3/M, z11.h, z16.h\n"
+ "st1h { z26.h }, p1, [x12, x15, LSL #1]\n"
+ "st1h { z22.h }, p1, [x11, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z15.h, p3/M, z4.h, z10.h\n"
+ "st1h { z27.h }, p1, [x10, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "st1h { z9.h }, p1, [x9, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "ldp x25, x24, [x8, #0x10]\n"
+ "fmin z14.h, p3/M, z14.h, z19.h\n"
+ "fmin z20.h, p3/M, z20.h, z19.h\n"
+ "st1h { z14.h }, p1, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z24.h, p3/M, z24.h, z19.h\n"
+ "fmin z11.h, p3/M, z11.h, z19.h\n"
+ "st1h { z20.h }, p1, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z23.h, p3/M, z23.h, z16.h\n"
+ "fmax z30.h, p3/M, z30.h, z16.h\n"
+ "st1h { z24.h }, p1, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z31.h, p3/M, z31.h, z16.h\n"
+ "fmax z13.h, p3/M, z13.h, z16.h\n"
+ "st1h { z11.h }, p1, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "inch x16\n"
+ "ld1h { z9.h }, p0/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z10.h }, p0/Z, [x26, x17, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z19.h\n"
+ "ld1h { z11.h }, p0/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z12.h }, p0/Z, [x24, x17, LSL #1]\n"
+ "inch x17\n"
+ "fmin z30.h, p3/M, z30.h, z19.h\n"
+ "fmin z31.h, p3/M, z31.h, z19.h\n"
+ "fmin z13.h, p3/M, z13.h, z19.h\n"
+ "st1h { z23.h }, p1, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmax z15.h, p3/M, z15.h, z16.h\n"
+ "fmax z18.h, p3/M, z18.h, z16.h\n"
+ "st1h { z30.h }, p1, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z21.h, p3/M, z21.h, z16.h\n"
+ "fmax z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z31.h }, p1, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1h { z13.h }, p1, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n"
+ "whilelt p2.h, x16, %x[n_channels]\n"
+ "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "fmin z15.h, p3/M, z15.h, z19.h\n"
+ "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n"
+ "fmin z18.h, p3/M, z18.h, z19.h\n"
+ "fmin z21.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "fmin z28.h, p3/M, z28.h, z19.h\n"
+ "st1h { z15.h }, p1, [x23, x15, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "st1h { z18.h }, p1, [x22, x15, LSL #1]\n"
+ "st1h { z21.h }, p1, [x21, x15, LSL #1]\n"
+ "st1h { z28.h }, p1, [x20, x15, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z14, z17\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z18, z17\n fmla z18.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z15, z17\n fmla z15.h, p3/M, z3.h, z9.h\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z20, z17\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "movprfx z13, z17\n fmla z13.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z22, z17\n fmla z22.h, p3/M, z6.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z23.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z18.h, p3/M, z0.h, z10.h\n"
+ "movprfx z9, z17\n fmla z9.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z21.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ld1h { z25.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z15.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "fmla z13.h, p3/M, z8.h, z12.h\n"
+ "inch x15\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.h, p3/M, z7.h, z12.h\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z6.h, z21.h\n"
+ "ld1h { z29.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z14.h, p3/M, z7.h, z23.h\n"
+ "fmla z9.h, p3/M, z6.h, z12.h\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z10, z17\n fmla z10.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z8.h, z25.h\n"
+ "fmla z15.h, p3/M, z6.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.h, p3/M, z4.h, z23.h\n"
+ "fmla z20.h, p3/M, z3.h, z23.h\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z25, z17\n fmla z25.h, p3/M, z1.h, z23.h\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z0.h, z23.h\n"
+ "fmla z27.h, p3/M, z8.h, z23.h\n"
+ "fmla z31.h, p3/M, z5.h, z23.h\n"
+ "fmla z28.h, p3/M, z2.h, z23.h\n"
+ "fmla z18.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z23.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z13.h, p3/M, z0.h, z12.h\n"
+ "fmla z22.h, p3/M, z2.h, z21.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z14.h, p3/M, z8.h, z29.h\n"
+ "fmla z9.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z15.h, p3/M, z7.h, z29.h\n"
+ "fmla z11.h, p3/M, z6.h, z29.h\n"
+ "fmla z30.h, p3/M, z5.h, z29.h\n"
+ "fmla z20.h, p3/M, z4.h, z29.h\n"
+ "fmla z10.h, p3/M, z3.h, z29.h\n"
+ "fmla z25.h, p3/M, z2.h, z29.h\n"
+ "fmla z24.h, p3/M, z1.h, z29.h\n"
+ "fmla z26.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z18.h, p3/M, z3.h, z23.h\n"
+ "fmla z27.h, p3/M, z0.h, z23.h\n"
+ "fmla z31.h, p3/M, z6.h, z21.h\n"
+ "fmla z28.h, p3/M, z3.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z13.h, p3/M, z4.h, z29.h\n"
+ "fmla z22.h, p3/M, z3.h, z29.h\n"
+ "fmla z14.h, p3/M, z1.h, z29.h\n"
+ "fmla z9.h, p3/M, z5.h, z12.h\n"
+ "fmla z11.h, p3/M, z2.h, z12.h\n"
+ "fmla z15.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z10.h, p3/M, z8.h, z21.h\n"
+ "fmla z26.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z18.h, p3/M, z5.h, z29.h\n"
+ "fmla z27.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z13.h, p3/M, z5.h, z17.h\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z14.h, p3/M, z2.h, z17.h\n"
+ "fmla z9.h, p3/M, z3.h, z17.h\n"
+ "fmla z15.h, p3/M, z1.h, z17.h\n"
+ "fmla z11.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z28.h, p3/M, z7.h, z23.h\n"
+ "fmla z25.h, p3/M, z6.h, z23.h\n"
+ "ld1h { z23.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z18.h, p3/M, z7.h, z21.h\n"
+ "fmla z13.h, p3/M, z6.h, z21.h\n"
+ "fmla z27.h, p3/M, z4.h, z21.h\n"
+ "fmla z14.h, p3/M, z3.h, z21.h\n"
+ "fmla z31.h, p3/M, z1.h, z21.h\n"
+ "fmla z30.h, p3/M, z0.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z22.h, p3/M, z8.h, z29.h\n"
+ "fmla z24.h, p3/M, z8.h, z23.h\n"
+ "fmla z26.h, p3/M, z7.h, z23.h\n"
+ "ld1h { z23.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z1.h, z29.h\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.h, p3/M, z7.h, z29.h\n"
+ "fmla z15.h, p3/M, z5.h, z29.h\n"
+ "fmla z11.h, p3/M, z4.h, z29.h\n"
+ "fmla z20.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z18.h, p3/M, z2.h, z21.h\n"
+ "fmla z13.h, p3/M, z1.h, z21.h\n"
+ "fmla z22.h, p3/M, z0.h, z21.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z14.h, p3/M, z6.h, z23.h\n"
+ "fmla z31.h, p3/M, z4.h, z23.h\n"
+ "fmla z30.h, p3/M, z3.h, z23.h\n"
+ "fmla z28.h, p3/M, z1.h, z23.h\n"
+ "fmla z25.h, p3/M, z0.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z4.h, z17.h\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z24.h, p3/M, z2.h, z17.h\n"
+ "fmla z13.h, p3/M, z2.h, z29.h\n"
+ "fmla z22.h, p3/M, z1.h, z29.h\n"
+ "fmla z9.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z18.h, p3/M, z6.h, z21.h\n"
+ "fmla z27.h, p3/M, z3.h, z21.h\n"
+ "fmla z31.h, p3/M, z0.h, z21.h\n"
+ "fmla z15.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.h, p3/M, z7.h, z17.h\n"
+ "fmla z20.h, p3/M, z5.h, z17.h\n"
+ "fmla z26.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z2.h, z23.h\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z28.h, p3/M, z0.h, z29.h\n"
+ "fmla z25.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z3.h, z21.h\n"
+ "fmla z9.h, p3/M, z8.h, z23.h\n"
+ "fmla z11.h, p3/M, z5.h, z23.h\n"
+ "fmla z27.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z21.h\n"
+ "fmla z20.h, p3/M, z6.h, z21.h\n"
+ "fmla z28.h, p3/M, z5.h, z21.h\n"
+ "fmla z10.h, p3/M, z5.h, z23.h\n"
+ "fmla z26.h, p3/M, z2.h, z23.h\n"
+ "fmla z25.h, p3/M, z7.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "fmla z31.h, p3/M, z8.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z21.h\n"
+ "fmla z20.h, p3/M, z7.h, z21.h\n"
+ "fmla z10.h, p3/M, z6.h, z21.h\n"
+ "fmla z25.h, p3/M, z5.h, z21.h\n"
+ "fmla z24.h, p3/M, z4.h, z21.h\n"
+ "fmla z26.h, p3/M, z3.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z11.h, p3/M, z8.h, z23.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmax z18.h, p3/M, z18.h, z16.h\n"
+ "fmla z13.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z21.h\n"
+ "fmax z13.h, p3/M, z13.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z16.h\n"
+ "fmla z9.h, p3/M, z4.h, z21.h\n"
+ "fmla z25.h, p3/M, z8.h, z29.h\n"
+ "fmax z9.h, p3/M, z9.h, z16.h\n"
+ "fmin z18.h, p3/M, z18.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z29.h\n"
+ "fmla z26.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmin z13.h, p3/M, z13.h, z19.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "fmla z14.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z19.h\n"
+ "fmla z15.h, p3/M, z2.h, z21.h\n"
+ "fmla z11.h, p3/M, z1.h, z21.h\n"
+ "fmin z9.h, p3/M, z9.h, z19.h\n"
+ "fmax z27.h, p3/M, z27.h, z16.h\n"
+ "fmla z31.h, p3/M, z7.h, z23.h\n"
+ "fmla z30.h, p3/M, z6.h, z23.h\n"
+ "fmax z14.h, p3/M, z14.h, z16.h\n"
+ "fmax z15.h, p3/M, z15.h, z16.h\n"
+ "fmla z20.h, p3/M, z8.h, z29.h\n"
+ "fmla z10.h, p3/M, z7.h, z29.h\n"
+ "fmax z11.h, p3/M, z11.h, z16.h\n"
+ "st1h { z18.h }, p0, [x12, x15, LSL #1]\n"
+ "st1h { z13.h }, p0, [x11, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z28.h, p3/M, z4.h, z23.h\n"
+ "st1h { z22.h }, p0, [x10, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z25.h, p3/M, z3.h, z23.h\n"
+ "fmla z24.h, p3/M, z5.h, z29.h\n"
+ "st1h { z9.h }, p0, [x9, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z26.h, p3/M, z4.h, z29.h\n"
+ "fmin z27.h, p3/M, z27.h, z19.h\n"
+ "fmin z14.h, p3/M, z14.h, z19.h\n"
+ "fmin z15.h, p3/M, z15.h, z19.h\n"
+ "st1h { z27.h }, p0, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z11.h, p3/M, z11.h, z19.h\n"
+ "fmax z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z14.h }, p0, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z30.h, p3/M, z30.h, z16.h\n"
+ "fmax z20.h, p3/M, z20.h, z16.h\n"
+ "st1h { z15.h }, p0, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z10.h, p3/M, z10.h, z16.h\n"
+ "st1h { z11.h }, p0, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "fmin z31.h, p3/M, z31.h, z19.h\n"
+ "fmin z30.h, p3/M, z30.h, z19.h\n"
+ "fmin z20.h, p3/M, z20.h, z19.h\n"
+ "st1h { z31.h }, p0, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmin z10.h, p3/M, z10.h, z19.h\n"
+ "fmax z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z30.h }, p0, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z25.h, p3/M, z25.h, z16.h\n"
+ "fmax z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z20.h }, p0, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "fmax z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z10.h }, p0, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "fmin z28.h, p3/M, z28.h, z19.h\n"
+ "fmin z25.h, p3/M, z25.h, z19.h\n"
+ "fmin z24.h, p3/M, z24.h, z19.h\n"
+ "st1h { z28.h }, p0, [x23, x15, LSL #1]\n"
+ "fmin z26.h, p3/M, z26.h, z19.h\n"
+ "st1h { z25.h }, p0, [x22, x15, LSL #1]\n"
+ "st1h { z24.h }, p0, [x21, x15, LSL #1]\n"
+ "st1h { z26.h }, p0, [x20, x15, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..d8a25666bd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
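+// Depth-first depthwise convolution strategy: 3x3 kernel, stride 2, fp16
+// NHWC data, one 2x2 output tile per call, SVE (vector-length agnostic).
+// The direct kernel walks tiles of a dense input; the indirect kernel
+// gathers its input patch through an array of row pointers.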
+class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..58decdba1c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
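+ // The assembly walks output tiles row-major, reloading tile_i/tile_j from
+ // params_struct on every pass.  As the annotated mul/madd prologue shows,
+ // the per-tile input offset is
+ //   (tile_i * ld_input_row + tile_j * ld_input_col) * kernel_stride *
+ //   output_size * sizeof(__fp16)
+ // and the output offset uses ld_output_row/ld_output_col scaled by the
+ // output tile size; the channel dimension is predicated with whilelt.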
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x11, #0x0\n"
+ "mov x16, #0x0\n"
+ "1:" // Tile loop
+ "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x24, #0x2\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x11, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x16, x15, x22\n" // offset += tile_j * ld_input_col
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x13\n"
+ "mul x20, x11, x21\n" // offset = tile_i * ld_output_row
+ "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x10, x15, x15\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x12, x12, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x28, x12, x23, LSL #1\n"
+ "madd x20, x16, x14, x20\n" // offset += tile_j * ld_output_col
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z30.h }, p3/Z, [x11]\n"
+ "ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
+ "add x27, x28, x23, LSL #1\n"
+ "ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
+ "add x26, x10, x15\n"
+ "add x25, x27, x23, LSL #1\n"
+ "ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "add x24, x26, x15\n"
+ "add x9, x9, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "cmp x13, %x[n_channels]\n"
+ "ld1rh { z29.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z28.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x23, x25, x23, LSL #1\n"
+ "add x22, x9, x21, LSL #1\n"
+ "ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x13\n"
+ "ld1h { z9.h }, p2/Z, [x27, x10, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x12]\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x12, x26, LSL #1]\n"
+ "addvl x11, x11, #-6\n"
+ "ld1h { z13.h }, p2/Z, [x12, x24, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x28]\n"
+ "ld1h { z15.h }, p2/Z, [x28, x15, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
+ "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
+ "whilelt p1.h, x13, %x[n_channels]\n"
+ "inch x21\n"
+ "fmla z27.h, p3/M, z0.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z20.h }, p2/Z, [x28, x24, LSL #1]\n"
+ "inch x13\n"
+ "fmla z27.h, p3/M, z1.h, z11.h\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x26, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z14.h\n"
+ "fmla z26.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x25]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.h, p3/M, z4.h, z15.h\n"
+ "fmla z26.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z25.h }, p2/Z, [x27]\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z16.h\n"
+ "fmla z26.h, p3/M, z5.h, z20.h\n"
+ "ld1h { z24.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "ld1h { z23.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "movprfx z22, z30\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "movprfx z21, z30\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "addvl x12, x12, #1\n"
+ "addvl x28, x28, #1\n"
+ "fmla z27.h, p3/M, z5.h, z19.h\n"
+ "fmla z26.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x11]\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z21.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z25.h\n"
+ "fmla z21.h, p3/M, z1.h, z24.h\n"
+ "ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
+ "inch x20\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z27.h, p3/M, z6.h, z25.h\n"
+ "fmla z22.h, p3/M, z1.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x23]\n"
+ "addvl x27, x27, #1\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z29.h\n"
+ "fmla z22.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x10, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
+ "fmla z22.h, p3/M, z7.h, z20.h\n"
+ "fmla z21.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
+ "fmla z26.h, p3/M, z7.h, z24.h\n"
+ "fmla z22.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmax z26.h, p3/M, z26.h, z29.h\n"
+ "fmla z22.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z29.h\n"
+ "fmax z21.h, p3/M, z21.h, z29.h\n"
+ "ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "ld1h { z9.h }, p1/Z, [x27, x10, LSL #1]\n"
+ "cmp x13, %x[n_channels]\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
+ "ld1h { z10.h }, p1/Z, [x12]\n"
+ "ld1h { z11.h }, p1/Z, [x12, x15, LSL #1]\n"
+ "fmin z26.h, p3/M, z26.h, z28.h\n"
+ "fmin z22.h, p3/M, z22.h, z28.h\n"
+ "ld1h { z12.h }, p1/Z, [x12, x26, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x12, x24, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z28.h\n"
+ "addvl x25, x25, #1\n"
+ "ld1h { z14.h }, p1/Z, [x28]\n"
+ "ld1h { z15.h }, p1/Z, [x28, x15, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "ld1h { z16.h }, p1/Z, [x12, x10, LSL #1]\n"
+ "st1h { z27.h }, p0, [x9]\n"
+ "ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
+ "st1h { z26.h }, p0, [x9, x14, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
+ "addvl x11, x11, #-6\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "st1h { z21.h }, p0, [x22, x14, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
+ "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z27.h, p3/M, z0.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z20.h }, p2/Z, [x28, x24, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z27.h, p3/M, z1.h, z11.h\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x26, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z14.h\n"
+ "fmla z26.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x25]\n"
+ "add x16, x16, #0x1\n"
+ "fmla z27.h, p3/M, z4.h, z15.h\n"
+ "fmla z26.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z25.h }, p2/Z, [x27]\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z16.h\n"
+ "fmla z26.h, p3/M, z5.h, z20.h\n"
+ "ld1h { z24.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "ld1h { z23.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "movprfx z22, z30\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "movprfx z21, z30\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "cmp x16, x20\n"
+ "add x21, x11, #0x1\n"
+ "fmla z27.h, p3/M, z5.h, z19.h\n"
+ "fmla z26.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z21.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z25.h\n"
+ "fmla z21.h, p3/M, z1.h, z24.h\n"
+ "csel x11, x11, x21, LT\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z27.h, p3/M, z6.h, z25.h\n"
+ "fmla z22.h, p3/M, z1.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x23]\n"
+ "csel x16, x16, XZR, LT\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z29.h\n"
+ "fmla z22.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x10, LSL #1]\n"
+ "cmp x11, x20\n"
+ "fmla z22.h, p3/M, z7.h, z20.h\n"
+ "fmla z21.h, p3/M, z7.h, z18.h\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
+ "st1h { z27.h }, p0, [x9]\n"
+ "fmla z26.h, p3/M, z7.h, z24.h\n"
+ "fmla z22.h, p3/M, z5.h, z16.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmax z26.h, p3/M, z26.h, z29.h\n"
+ "fmla z22.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z29.h\n"
+ "fmax z21.h, p3/M, z21.h, z29.h\n"
+ "fmin z26.h, p3/M, z26.h, z28.h\n"
+ "fmin z22.h, p3/M, z22.h, z28.h\n"
+ "st1h { z26.h }, p0, [x9, x14, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z28.h\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "st1h { z21.h }, p0, [x22, x14, LSL #1]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..d5fbb6baee
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
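+ // The Args constructor above copies the 25 pointers of the 5x5 input patch
+ // (a 3x3 kernel at stride 2 with a 2x2 output tile) into inptrs[] in the
+ // order the assembly consumes them, so the kernel can fetch addresses with
+ // plain ldp/ldr from one contiguous table.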
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cnth x14\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+ "sub x28, XZR, x14\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z15.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z24, z20\n fmla z24.h, p3/M, z8.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z6.h, z9.h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z23.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z14.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.h, p3/M, z4.h, z15.h\n"
+ "fmla z23.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z23.h, p3/M, z5.h, z18.h\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z20.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.h, p3/M, z5.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z18.h\n"
+ "fmla z21.h, p3/M, z1.h, z20.h\n"
+ "ldr x21, [x15, #0x70]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.h, p3/M, z1.h, z16.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
+ "fmla z21.h, p3/M, z3.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z18.h\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z23.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "whilelt p1.h, x14, %x[n_channels]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "inch x9\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ld1h { z9.h }, p1/Z, [x27, x14, LSL #1]\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "ld1h { z10.h }, p1/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x25, x14, LSL #1]\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "inch x28\n"
+ "ld1h { z12.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "mov p0.b, p2.b\n"
+ "whilelt p2.h, x9, %x[n_channels]\n"
+ "ld1h { z14.h }, p1/Z, [x22, x14, LSL #1]\n"
+ "ld1h { z15.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "inch x14\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z24, z20\n fmla z24.h, p3/M, z8.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z6.h, z9.h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z23.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z14.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.h, p3/M, z4.h, z15.h\n"
+ "fmla z23.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z23.h, p3/M, z5.h, z18.h\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z2.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z20.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.h, p3/M, z5.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z18.h\n"
+ "fmla z21.h, p3/M, z1.h, z20.h\n"
+ "ldr x21, [x15, #0x70]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z21.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.h, p3/M, z1.h, z16.h\n"
+ "fmla z21.h, p3/M, z2.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
+ "fmla z21.h, p3/M, z3.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z5.h, z18.h\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z23.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z17.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "inch x28\n"
+ "mov p0.b, p2.b\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..abdfac5a3f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
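+// Depth-first depthwise convolution strategy: 5x5 kernel, stride 1, fp16
+// NHWC data, one 2x2 output tile per call, SVE; direct and indirect entry
+// points mirror the 3x3 variants.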
+class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..fdbee67926
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
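+ // Same tile-loop structure as the 3x3 direct kernel: per-tile input/output
+ // offsets are rebuilt from tile_i/tile_j and the row/column strides, the
+ // accumulators are clamped between the min/max activation bounds (z15/z28)
+ // with fmax/fmin, and the channel dimension is predicated with whilelt.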
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x12, #0x0\n"
+ "mov x8, #0x0\n"
+ "1:" // Tile loop
+ "str x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x12, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "add x15, x17, x17\n"
+ "mul x20, x12, x21\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "cnth x12\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x11, x14, x23, LSL #1\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x11, x23, LSL #1\n"
+ "add x28, x15, x17\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "add x27, x9, x23, LSL #1\n"
+ "ld1rh { z28.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x26, x28, x17\n"
+ "add x25, x27, x23, LSL #1\n"
+ "ld1h { z29.h }, p3/Z, [x10]\n"
+ "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x24, x26, x17\n"
+ "add x13, x13, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "cmp x12, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #1\n"
+ "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "add x22, x13, x21, LSL #1\n"
+ "mov x21, #0x0\n"
+ "ld1h { z5.h }, p2/Z, [x14]\n"
+ "ld1h { z6.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "sub x20, XZR, x12\n"
+ "ld1h { z7.h }, p2/Z, [x11]\n"
+ "ld1h { z8.h }, p2/Z, [x11, x17, LSL #1]\n"
+ "addvl x10, x10, #6\n"
+ "ld1h { z9.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x14, x28, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x14, x26, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x11, x24, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x9]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z27, z29\n fmla z27.h, p3/M, z0.h, z5.h\n"
+ "movprfx z31, z29\n fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z24.h }, p2/Z, [x11, x28, LSL #1]\n"
+ "whilelt p1.h, x12, %x[n_channels]\n"
+ "movprfx z26, z29\n fmla z26.h, p3/M, z0.h, z7.h\n"
+ "movprfx z30, z29\n fmla z30.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z18.h }, p3/Z, [x10]\n"
+ "inch x21\n"
+ "fmla z27.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z23.h }, p2/Z, [x11, x26, LSL #1]\n"
+ "inch x12\n"
+ "fmla z26.h, p3/M, z1.h, z8.h\n"
+ "fmla z30.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z22.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x24, LSL #1]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "fmla z30.h, p3/M, z2.h, z24.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "inch x20\n"
+ "fmla z26.h, p3/M, z3.h, z24.h\n"
+ "fmla z30.h, p3/M, z3.h, z23.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ld1h { z5.h }, p2/Z, [x9, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z23.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z27.h, p3/M, z18.h, z7.h\n"
+ "fmla z31.h, p3/M, z18.h, z8.h\n"
+ "ld1h { z7.h }, p1/Z, [x11]\n"
+ "fmla z26.h, p3/M, z18.h, z14.h\n"
+ "fmla z30.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.h, p3/M, z22.h, z8.h\n"
+ "fmla z31.h, p3/M, z22.h, z13.h\n"
+ "ld1h { z3.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z22.h, z0.h\n"
+ "fmla z30.h, p3/M, z22.h, z19.h\n"
+ "ld1h { z8.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.h, p3/M, z20.h, z13.h\n"
+ "fmla z31.h, p3/M, z20.h, z24.h\n"
+ "ld1h { z2.h }, p2/Z, [x9, x26, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z26.h, p3/M, z20.h, z19.h\n"
+ "fmla z30.h, p3/M, z20.h, z5.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z27.h, p3/M, z17.h, z24.h\n"
+ "fmla z31.h, p3/M, z17.h, z23.h\n"
+ "ld1h { z25.h }, p2/Z, [x27]\n"
+ "ld1h { z29.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z26.h, p3/M, z17.h, z5.h\n"
+ "fmla z30.h, p3/M, z17.h, z2.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z27.h, p3/M, z21.h, z23.h\n"
+ "fmla z31.h, p3/M, z21.h, z10.h\n"
+ "ld1h { z24.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z21.h, z2.h\n"
+ "fmla z30.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z27.h, p3/M, z18.h, z14.h\n"
+ "fmla z31.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z1.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z18.h, z25.h\n"
+ "fmla z30.h, p3/M, z18.h, z24.h\n"
+ "ld1h { z23.h }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z27.h, p3/M, z8.h, z0.h\n"
+ "fmla z31.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z0.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z8.h, z24.h\n"
+ "fmla z30.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z27.h, p3/M, z16.h, z19.h\n"
+ "fmla z31.h, p3/M, z16.h, z5.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "addvl x27, x27, #1\n"
+ "fmla z26.h, p3/M, z16.h, z22.h\n"
+ "fmla z30.h, p3/M, z16.h, z0.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z27.h, p3/M, z17.h, z5.h\n"
+ "fmla z31.h, p3/M, z17.h, z2.h\n"
+ "ld1h { z16.h }, p2/Z, [x25]\n"
+ "fmla z26.h, p3/M, z17.h, z0.h\n"
+ "fmla z30.h, p3/M, z17.h, z19.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z27.h, p3/M, z21.h, z2.h\n"
+ "fmla z31.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z4.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z8.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "fmla z26.h, p3/M, z21.h, z19.h\n"
+ "fmla z30.h, p3/M, z21.h, z1.h\n"
+ "ld1h { z13.h }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z27.h, p3/M, z23.h, z25.h\n"
+ "fmla z31.h, p3/M, z23.h, z24.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z23.h, z16.h\n"
+ "fmla z30.h, p3/M, z23.h, z4.h\n"
+ "ld1h { z5.h }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z27.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z22.h\n"
+ "ld1h { z24.h }, p2/Z, [x25, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z20.h, z4.h\n"
+ "fmla z30.h, p3/M, z20.h, z25.h\n"
+ "ld1h { z23.h }, p3/Z, [x10]\n"
+ "fmla z27.h, p3/M, z18.h, z22.h\n"
+ "fmla z31.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z22.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "addvl x25, x25, #1\n"
+ "fmla z26.h, p3/M, z18.h, z25.h\n"
+ "fmla z30.h, p3/M, z18.h, z24.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z27.h, p3/M, z17.h, z0.h\n"
+ "fmla z31.h, p3/M, z17.h, z19.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "fmla z26.h, p3/M, z17.h, z24.h\n"
+ "fmla z30.h, p3/M, z17.h, z8.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z27.h, p3/M, z13.h, z19.h\n"
+ "fmla z31.h, p3/M, z13.h, z1.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "ld1h { z14.h }, p1/Z, [x9]\n"
+ "fmla z26.h, p3/M, z13.h, z8.h\n"
+ "fmla z30.h, p3/M, z13.h, z22.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.h, p3/M, z5.h, z16.h\n"
+ "fmla z31.h, p3/M, z5.h, z4.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z5.h, z18.h\n"
+ "fmla z30.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.h, p3/M, z23.h, z4.h\n"
+ "fmla z31.h, p3/M, z23.h, z25.h\n"
+ "ld1h { z13.h }, p1/Z, [x11, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z23.h, z17.h\n"
+ "fmla z30.h, p3/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.h, p3/M, z21.h, z25.h\n"
+ "fmla z31.h, p3/M, z21.h, z24.h\n"
+ "ld1h { z5.h }, p1/Z, [x14]\n"
+ "fmla z26.h, p3/M, z21.h, z16.h\n"
+ "fmla z30.h, p3/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z27.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z8.h\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "fmla z26.h, p3/M, z20.h, z18.h\n"
+ "fmla z30.h, p3/M, z20.h, z17.h\n"
+ "cmp x12, %x[n_channels]\n"
+ "addvl x23, x23, #1\n"
+ "fmla z27.h, p3/M, z19.h, z8.h\n"
+ "fmla z31.h, p3/M, z19.h, z22.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmla z26.h, p3/M, z19.h, z17.h\n"
+ "fmla z30.h, p3/M, z19.h, z16.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
+ "fmin z31.h, p3/M, z31.h, z28.h\n"
+ "ld1h { z6.h }, p1/Z, [x14, x17, LSL #1]\n"
+ "ld1h { z8.h }, p1/Z, [x11, x17, LSL #1]\n"
+ "fmin z26.h, p3/M, z26.h, z28.h\n"
+ "fmin z30.h, p3/M, z30.h, z28.h\n"
+ "ld1h { z9.h }, p1/Z, [x14, x15, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x14, x28, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x14, x26, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x11, x24, LSL #1]\n"
+ "st1h { z27.h }, p0, [x13]\n"
+ "st1h { z31.h }, p0, [x13, x16, LSL #1]\n"
+ "addvl x13, x13, #1\n"
+ "ld1h { z3.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "st1h { z26.h }, p0, [x22]\n"
+ "addvl x10, x10, #-6\n"
+ "st1h { z30.h }, p0, [x22, x16, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z30, z29\n fmla z30.h, p3/M, z0.h, z5.h\n"
+ "movprfx z31, z29\n fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z22.h }, p2/Z, [x11, x28, LSL #1]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "movprfx z5, z29\n fmla z5.h, p3/M, z0.h, z7.h\n"
+ "fmla z29.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z20.h }, p3/Z, [x10]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z30.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p2/Z, [x11, x26, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z5.h, p3/M, z1.h, z8.h\n"
+ "fmla z29.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x8, x8, #0x1\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x24, LSL #1]\n"
+ "cmp x8, x20\n"
+ "fmla z5.h, p3/M, z2.h, z13.h\n"
+ "fmla z29.h, p3/M, z2.h, z22.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "add x21, x12, #0x1\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z1.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z5.h, p3/M, z3.h, z22.h\n"
+ "fmla z29.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "csel x12, x12, x21, LT\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x9, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z4.h, z6.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.h, p3/M, z20.h, z7.h\n"
+ "fmla z31.h, p3/M, z20.h, z8.h\n"
+ "csel x8, x8, XZR, LT\n"
+ "cmp x12, x20\n"
+ "fmla z5.h, p3/M, z20.h, z14.h\n"
+ "fmla z29.h, p3/M, z20.h, z1.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z30.h, p3/M, z19.h, z8.h\n"
+ "fmla z31.h, p3/M, z19.h, z13.h\n"
+ "ld1h { z26.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z19.h, z1.h\n"
+ "fmla z29.h, p3/M, z19.h, z0.h\n"
+ "ld1h { z25.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z30.h, p3/M, z18.h, z13.h\n"
+ "fmla z31.h, p3/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p2/Z, [x9, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z18.h, z0.h\n"
+ "fmla z29.h, p3/M, z18.h, z27.h\n"
+ "ld1h { z23.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z30.h, p3/M, z17.h, z22.h\n"
+ "fmla z31.h, p3/M, z17.h, z6.h\n"
+ "ld1h { z22.h }, p2/Z, [x27]\n"
+ "fmla z5.h, p3/M, z17.h, z27.h\n"
+ "fmla z29.h, p3/M, z17.h, z24.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z6.h\n"
+ "fmla z31.h, p3/M, z16.h, z10.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z24.h\n"
+ "fmla z29.h, p3/M, z16.h, z26.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z30.h, p3/M, z21.h, z14.h\n"
+ "fmla z31.h, p3/M, z21.h, z1.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z21.h, z22.h\n"
+ "fmla z29.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z30.h, p3/M, z25.h, z1.h\n"
+ "fmla z31.h, p3/M, z25.h, z0.h\n"
+ "ld1h { z7.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z25.h, z19.h\n"
+ "fmla z29.h, p3/M, z25.h, z18.h\n"
+ "ld1h { z10.h }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z30.h, p3/M, z23.h, z0.h\n"
+ "fmla z31.h, p3/M, z23.h, z27.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z23.h, z18.h\n"
+ "fmla z29.h, p3/M, z23.h, z7.h\n"
+ "ld1h { z6.h }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z30.h, p3/M, z20.h, z27.h\n"
+ "fmla z31.h, p3/M, z20.h, z24.h\n"
+ "ld1h { z0.h }, p2/Z, [x25]\n"
+ "fmla z5.h, p3/M, z20.h, z7.h\n"
+ "fmla z29.h, p3/M, z20.h, z11.h\n"
+ "ld1h { z9.h }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z24.h\n"
+ "fmla z31.h, p3/M, z16.h, z26.h\n"
+ "ld1h { z3.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z11.h\n"
+ "fmla z29.h, p3/M, z16.h, z17.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z30.h, p3/M, z21.h, z22.h\n"
+ "fmla z31.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z26.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z21.h, z0.h\n"
+ "fmla z29.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z25.h }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z30.h, p3/M, z10.h, z19.h\n"
+ "fmla z31.h, p3/M, z10.h, z18.h\n"
+ "ld1h { z24.h }, p2/Z, [x25, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z10.h, z3.h\n"
+ "fmla z29.h, p3/M, z10.h, z26.h\n"
+ "ld1h { z23.h }, p3/Z, [x10]\n"
+ "fmla z30.h, p3/M, z6.h, z18.h\n"
+ "fmla z31.h, p3/M, z6.h, z7.h\n"
+ "ld1h { z22.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z6.h, z26.h\n"
+ "fmla z29.h, p3/M, z6.h, z24.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z30.h, p3/M, z9.h, z7.h\n"
+ "fmla z31.h, p3/M, z9.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "fmla z5.h, p3/M, z9.h, z24.h\n"
+ "fmla z29.h, p3/M, z9.h, z27.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z11.h\n"
+ "fmla z31.h, p3/M, z16.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z27.h\n"
+ "fmla z29.h, p3/M, z16.h, z22.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z30.h, p3/M, z25.h, z0.h\n"
+ "fmla z31.h, p3/M, z25.h, z3.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z25.h, z18.h\n"
+ "fmla z29.h, p3/M, z25.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "fmla z30.h, p3/M, z23.h, z3.h\n"
+ "fmla z31.h, p3/M, z23.h, z26.h\n"
+ "fmla z5.h, p3/M, z23.h, z17.h\n"
+ "fmla z29.h, p3/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z30.h, p3/M, z21.h, z26.h\n"
+ "fmla z31.h, p3/M, z21.h, z24.h\n"
+ "fmla z5.h, p3/M, z21.h, z16.h\n"
+ "fmla z29.h, p3/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmla z30.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z27.h\n"
+ "fmla z5.h, p3/M, z20.h, z18.h\n"
+ "fmla z29.h, p3/M, z20.h, z17.h\n"
+ "fmla z30.h, p3/M, z19.h, z27.h\n"
+ "fmla z31.h, p3/M, z19.h, z22.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmla z5.h, p3/M, z19.h, z17.h\n"
+ "fmla z29.h, p3/M, z19.h, z16.h\n"
+ "fmax z5.h, p3/M, z5.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "fmin z30.h, p3/M, z30.h, z28.h\n"
+ "fmin z31.h, p3/M, z31.h, z28.h\n"
+ "st1h { z30.h }, p0, [x13]\n"
+ "fmin z5.h, p3/M, z5.h, z28.h\n"
+ "fmin z29.h, p3/M, z29.h, z28.h\n"
+ "st1h { z31.h }, p0, [x13, x16, LSL #1]\n"
+ "st1h { z5.h }, p0, [x22]\n"
+ "st1h { z29.h }, p0, [x22, x16, LSL #1]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..1ec0cb2cbf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
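+    // 36 pointers cover the 6x6 input patch needed by a 5x5 kernel producing
+    // a 2x2 output at stride 1: (2 + 5 - 1) points per spatial dimension.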
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
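+  // Hand-scheduled SVE assembly: the channel loop at label 1 consumes one
+  // full vector of channels per iteration (governing predicate built with
+  // whilelt), and the channel tail at label 2 finishes the partial vector.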
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x15, x14, [x20, #0x0]\n"
+ "mov x13, #0x0\n"
+ "ldp x12, x11, [x20, #0x10]\n"
+ "whilelt p3.h, XZR, %x[n_channels]\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "cnth x10\n"
+ "ptrue p2.b\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ld1h { z5.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1h { z6.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "sub x28, XZR, x10\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "ld1rh { z15.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z28.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z29.h }, p2/Z, [x9]\n"
+ "ld1h { z0.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z1.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1h { z4.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ld1h { z7.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "addvl x9, x9, #6\n"
+ "ld1h { z8.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ld1h { z11.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z12.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z10.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z14.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z30, z29\n fmla z30.h, p2/M, z0.h, z5.h\n"
+ "movprfx z27, z29\n fmla z27.h, p2/M, z0.h, z6.h\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1h { z5.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z7.h\n"
+ "movprfx z26, z29\n fmla z26.h, p2/M, z0.h, z8.h\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z27.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z22.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z31.h, p2/M, z1.h, z8.h\n"
+ "fmla z26.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z21.h }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "fmla z27.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z20.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z31.h, p2/M, z2.h, z13.h\n"
+ "fmla z26.h, p2/M, z2.h, z5.h\n"
+ "ldr x22, [x16, #0x78]\n"
+ "ld1h { z17.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z27.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla z31.h, p2/M, z3.h, z5.h\n"
+ "fmla z26.h, p2/M, z3.h, z22.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z27.h, p2/M, z4.h, z20.h\n"
+ "ld1h { z0.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z29.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z4.h, z22.h\n"
+ "fmla z26.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x23, [x16, #0x90]\n"
+ "fmla z30.h, p2/M, z21.h, z7.h\n"
+ "fmla z27.h, p2/M, z21.h, z8.h\n"
+ "ldr x26, [x16, #0x98]\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla z31.h, p2/M, z21.h, z14.h\n"
+ "fmla z26.h, p2/M, z21.h, z11.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.h, p2/M, z18.h, z8.h\n"
+ "fmla z27.h, p2/M, z18.h, z13.h\n"
+ "ld1h { z24.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z31.h, p2/M, z18.h, z11.h\n"
+ "fmla z26.h, p2/M, z18.h, z0.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z17.h, z13.h\n"
+ "fmla z27.h, p2/M, z17.h, z5.h\n"
+ "ld1h { z3.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z31.h, p2/M, z17.h, z0.h\n"
+ "fmla z26.h, p2/M, z17.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.h, p2/M, z16.h, z5.h\n"
+ "fmla z27.h, p2/M, z16.h, z22.h\n"
+ "ld1h { z6.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x27, [x16, #0xc8]\n"
+ "fmla z31.h, p2/M, z16.h, z29.h\n"
+ "fmla z26.h, p2/M, z16.h, z3.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x23, [x16, #0xd0]\n"
+ "fmla z30.h, p2/M, z19.h, z22.h\n"
+ "fmla z27.h, p2/M, z19.h, z10.h\n"
+ "ld1h { z23.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z19.h, z3.h\n"
+ "fmla z26.h, p2/M, z19.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x22, [x16, #0xd8]\n"
+ "fmla z30.h, p2/M, z25.h, z14.h\n"
+ "fmla z27.h, p2/M, z25.h, z11.h\n"
+ "ld1h { z1.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z31.h, p2/M, z25.h, z6.h\n"
+ "fmla z26.h, p2/M, z25.h, z23.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.h, p2/M, z18.h, z11.h\n"
+ "fmla z27.h, p2/M, z18.h, z0.h\n"
+ "ld1h { z7.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z31.h, p2/M, z18.h, z23.h\n"
+ "fmla z26.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, #-5, MUL VL]\n"
+ "whilelt p1.h, x10, %x[n_channels]\n"
+ "fmla z30.h, p2/M, z17.h, z0.h\n"
+ "fmla z27.h, p2/M, z17.h, z29.h\n"
+ "ld1h { z19.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z31.h, p2/M, z17.h, z22.h\n"
+ "fmla z26.h, p2/M, z17.h, z7.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #-4, MUL VL]\n"
+ "inch x28\n"
+ "fmla z30.h, p2/M, z16.h, z29.h\n"
+ "fmla z27.h, p2/M, z16.h, z3.h\n"
+ "ld1h { z0.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x100]\n"
+ "fmla z31.h, p2/M, z16.h, z7.h\n"
+ "fmla z26.h, p2/M, z16.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-3, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.h, p2/M, z21.h, z3.h\n"
+ "fmla z27.h, p2/M, z21.h, z24.h\n"
+ "ld1h { z11.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z21.h, z19.h\n"
+ "fmla z26.h, p2/M, z21.h, z1.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla z30.h, p2/M, z20.h, z6.h\n"
+ "fmla z27.h, p2/M, z20.h, z23.h\n"
+ "ld1h { z25.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x23, [x16, #0x110]\n"
+ "fmla z31.h, p2/M, z20.h, z0.h\n"
+ "fmla z26.h, p2/M, z20.h, z11.h\n"
+ "ld1h { z8.h }, p2/Z, [x9, #-1, MUL VL]\n"
+ "ld1h { z29.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "fmla z30.h, p2/M, z18.h, z23.h\n"
+ "fmla z27.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x118]\n"
+ "fmla z31.h, p2/M, z18.h, z11.h\n"
+ "fmla z26.h, p2/M, z18.h, z25.h\n"
+ "ld1h { z23.h }, p2/Z, [x9]\n"
+ "fmla z30.h, p2/M, z17.h, z22.h\n"
+ "fmla z27.h, p2/M, z17.h, z7.h\n"
+ "ld1h { z22.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z17.h, z25.h\n"
+ "fmla z26.h, p2/M, z17.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z7.h\n"
+ "fmla z27.h, p2/M, z16.h, z19.h\n"
+ "ld1h { z18.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z16.h, z24.h\n"
+ "fmla z26.h, p2/M, z16.h, z13.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z10.h, z19.h\n"
+ "fmla z27.h, p2/M, z10.h, z1.h\n"
+ "ld1h { z17.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z10.h, z13.h\n"
+ "fmla z26.h, p2/M, z10.h, z22.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.h, p2/M, z8.h, z0.h\n"
+ "fmla z27.h, p2/M, z8.h, z11.h\n"
+ "ld1h { z16.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z8.h, z18.h\n"
+ "fmla z26.h, p2/M, z8.h, z17.h\n"
+ "ld1h { z18.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "fmla z30.h, p2/M, z23.h, z11.h\n"
+ "fmla z27.h, p2/M, z23.h, z25.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "fmla z31.h, p2/M, z23.h, z17.h\n"
+ "fmla z26.h, p2/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z1.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "fmla z30.h, p2/M, z21.h, z25.h\n"
+ "fmla z27.h, p2/M, z21.h, z24.h\n"
+ "ld1h { z5.h }, p1/Z, [x21, x10, LSL #1]\n"
+ "fmla z31.h, p2/M, z21.h, z16.h\n"
+ "fmla z26.h, p2/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "fmla z30.h, p2/M, z20.h, z24.h\n"
+ "fmla z27.h, p2/M, z20.h, z13.h\n"
+ "ld1h { z6.h }, p1/Z, [x20, x10, LSL #1]\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "fmla z31.h, p2/M, z20.h, z18.h\n"
+ "fmla z26.h, p2/M, z20.h, z17.h\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "fmla z30.h, p2/M, z19.h, z13.h\n"
+ "fmla z27.h, p2/M, z19.h, z22.h\n"
+ "inch x13\n"
+ "ld1h { z7.h }, p1/Z, [x27, x10, LSL #1]\n"
+ "fmla z31.h, p2/M, z19.h, z17.h\n"
+ "fmla z26.h, p2/M, z19.h, z16.h\n"
+ "ld1h { z8.h }, p1/Z, [x26, x10, LSL #1]\n"
+ "ld1h { z9.h }, p1/Z, [x25, x10, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x24, x10, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x23, x10, LSL #1]\n"
+ "fmax z30.h, p2/M, z30.h, z15.h\n"
+ "fmax z27.h, p2/M, z27.h, z15.h\n"
+ "ld1h { z12.h }, p1/Z, [x22, x10, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x21, x10, LSL #1]\n"
+ "fmax z31.h, p2/M, z31.h, z15.h\n"
+ "fmax z26.h, p2/M, z26.h, z15.h\n"
+ "ld1h { z14.h }, p1/Z, [x20, x10, LSL #1]\n"
+ "inch x10\n"
+ "ld1h { z2.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "whilelt p3.h, x13, %x[n_channels]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1h { z3.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1h { z4.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "fmin z30.h, p2/M, z30.h, z28.h\n"
+ "fmin z27.h, p2/M, z27.h, z28.h\n"
+ "st1h { z30.h }, p0, [x15, x28, LSL #1]\n"
+ "fmin z31.h, p2/M, z31.h, z28.h\n"
+ "fmin z26.h, p2/M, z26.h, z28.h\n"
+ "st1h { z27.h }, p0, [x14, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x12, x28, LSL #1]\n"
+ "addvl x9, x9, #-6\n"
+ "st1h { z26.h }, p0, [x11, x28, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z30, z29\n fmla z30.h, p2/M, z0.h, z5.h\n"
+ "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z6.h\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1h { z22.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "movprfx z5, z29\n fmla z5.h, p2/M, z0.h, z7.h\n"
+ "fmla z29.h, p2/M, z0.h, z8.h\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z31.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z5.h, p2/M, z1.h, z8.h\n"
+ "fmla z29.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z20.h }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z5.h, p2/M, z2.h, z13.h\n"
+ "fmla z29.h, p2/M, z2.h, z22.h\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1h { z18.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "fmla z5.h, p2/M, z3.h, z22.h\n"
+ "fmla z29.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z16.h\n"
+ "ld1h { z0.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z27.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z4.h, z6.h\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.h, p2/M, z20.h, z7.h\n"
+ "fmla z31.h, p2/M, z20.h, z8.h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla z5.h, p2/M, z20.h, z14.h\n"
+ "fmla z29.h, p2/M, z20.h, z1.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.h, p2/M, z19.h, z8.h\n"
+ "fmla z31.h, p2/M, z19.h, z13.h\n"
+ "ld1h { z26.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z5.h, p2/M, z19.h, z1.h\n"
+ "fmla z29.h, p2/M, z19.h, z0.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z18.h, z13.h\n"
+ "fmla z31.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x23, [x16, #0xc0]\n"
+ "fmla z5.h, p2/M, z18.h, z0.h\n"
+ "fmla z29.h, p2/M, z18.h, z27.h\n"
+ "ld1h { z23.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.h, p2/M, z17.h, z22.h\n"
+ "fmla z31.h, p2/M, z17.h, z6.h\n"
+ "ld1h { z22.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x22, [x16, #0xc8]\n"
+ "fmla z5.h, p2/M, z17.h, z27.h\n"
+ "fmla z29.h, p2/M, z17.h, z24.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla z30.h, p2/M, z16.h, z6.h\n"
+ "fmla z31.h, p2/M, z16.h, z10.h\n"
+ "ld1h { z19.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z18.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z24.h\n"
+ "fmla z29.h, p2/M, z16.h, z26.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla z30.h, p2/M, z21.h, z14.h\n"
+ "fmla z31.h, p2/M, z21.h, z1.h\n"
+ "ld1h { z17.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z5.h, p2/M, z21.h, z22.h\n"
+ "fmla z29.h, p2/M, z21.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.h, p2/M, z25.h, z1.h\n"
+ "fmla z31.h, p2/M, z25.h, z0.h\n"
+ "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z5.h, p2/M, z25.h, z19.h\n"
+ "fmla z29.h, p2/M, z25.h, z18.h\n"
+ "ld1h { z4.h }, p2/Z, [x9, #-5, MUL VL]\n"
+ "inch x28\n"
+ "fmla z30.h, p2/M, z23.h, z0.h\n"
+ "fmla z31.h, p2/M, z23.h, z27.h\n"
+ "ld1h { z8.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z5.h, p2/M, z23.h, z18.h\n"
+ "fmla z29.h, p2/M, z23.h, z9.h\n"
+ "ld1h { z6.h }, p2/Z, [x9, #-4, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.h, p2/M, z20.h, z27.h\n"
+ "fmla z31.h, p2/M, z20.h, z24.h\n"
+ "ld1h { z10.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla z5.h, p2/M, z20.h, z9.h\n"
+ "fmla z29.h, p2/M, z20.h, z8.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, #-3, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z24.h\n"
+ "fmla z31.h, p2/M, z16.h, z26.h\n"
+ "ld1h { z0.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z27.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z8.h\n"
+ "fmla z29.h, p2/M, z16.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla z30.h, p2/M, z21.h, z22.h\n"
+ "fmla z31.h, p2/M, z21.h, z19.h\n"
+ "ld1h { z26.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla z5.h, p2/M, z21.h, z10.h\n"
+ "fmla z29.h, p2/M, z21.h, z0.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #-1, MUL VL]\n"
+ "fmla z30.h, p2/M, z4.h, z19.h\n"
+ "fmla z31.h, p2/M, z4.h, z18.h\n"
+ "ld1h { z24.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla z5.h, p2/M, z4.h, z0.h\n"
+ "fmla z29.h, p2/M, z4.h, z26.h\n"
+ "ld1h { z23.h }, p2/Z, [x9]\n"
+ "fmla z30.h, p2/M, z6.h, z18.h\n"
+ "fmla z31.h, p2/M, z6.h, z9.h\n"
+ "ld1h { z22.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z6.h, z26.h\n"
+ "fmla z29.h, p2/M, z6.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.h, p2/M, z11.h, z9.h\n"
+ "fmla z31.h, p2/M, z11.h, z8.h\n"
+ "ld1h { z18.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z11.h, z24.h\n"
+ "fmla z29.h, p2/M, z11.h, z27.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z8.h\n"
+ "fmla z31.h, p2/M, z16.h, z17.h\n"
+ "ld1h { z17.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z27.h\n"
+ "fmla z29.h, p2/M, z16.h, z22.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.h, p2/M, z25.h, z10.h\n"
+ "fmla z31.h, p2/M, z25.h, z0.h\n"
+ "ld1h { z16.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z25.h, z18.h\n"
+ "fmla z29.h, p2/M, z25.h, z17.h\n"
+ "ld1h { z18.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z23.h, z0.h\n"
+ "fmla z31.h, p2/M, z23.h, z26.h\n"
+ "fmla z5.h, p2/M, z23.h, z17.h\n"
+ "fmla z29.h, p2/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z21.h, z26.h\n"
+ "fmla z31.h, p2/M, z21.h, z24.h\n"
+ "fmla z5.h, p2/M, z21.h, z16.h\n"
+ "fmla z29.h, p2/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z20.h, z24.h\n"
+ "fmla z31.h, p2/M, z20.h, z27.h\n"
+ "fmla z5.h, p2/M, z20.h, z18.h\n"
+ "fmla z29.h, p2/M, z20.h, z17.h\n"
+ "fmla z30.h, p2/M, z19.h, z27.h\n"
+ "fmla z31.h, p2/M, z19.h, z22.h\n"
+ "fmax z30.h, p2/M, z30.h, z15.h\n"
+ "fmax z31.h, p2/M, z31.h, z15.h\n"
+ "fmla z5.h, p2/M, z19.h, z17.h\n"
+ "fmla z29.h, p2/M, z19.h, z16.h\n"
+ "fmax z5.h, p2/M, z5.h, z15.h\n"
+ "fmax z29.h, p2/M, z29.h, z15.h\n"
+ "fmin z30.h, p2/M, z30.h, z28.h\n"
+ "fmin z31.h, p2/M, z31.h, z28.h\n"
+ "st1h { z30.h }, p0, [x15, x28, LSL #1]\n"
+ "fmin z5.h, p2/M, z5.h, z28.h\n"
+ "fmin z29.h, p2/M, z29.h, z28.h\n"
+ "st1h { z31.h }, p0, [x14, x28, LSL #1]\n"
+ "st1h { z5.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..16b96fdb8e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..1bdef85274
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
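+    // Current output-tile indices; the assembly stores and reloads these
+    // through params_struct so the tile loop can advance between iterations.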
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
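+  // Hand-scheduled SVE assembly: the outer tile loop (label 1) walks the
+  // output tiles row-major via tile_i/tile_j kept in params_struct, while
+  // labels 2 and 3 form the per-tile channel loop and channel tail.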
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x10, #0x0\n"
+ "mov x14, #0x0\n"
+ "1:" // Tile loop
+ "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x10, x23\n" // offset = tile_i * ld_input_row
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x10, x22\n" // offset = tile_i * ld_output_row
+ "cntw x11\n"
+ "madd x21, x14, x13, x21\n" // offset += tile_j * ld_input_col
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "ld1w { z27.s }, p3/Z, [x10]\n"
+ "add x27, x13, x13\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x9, x9, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "add x26, x9, x23, LSL #2\n"
+ "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "add x25, x26, x23, LSL #2\n"
+ "add x24, x27, x13\n"
+ "ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "add x28, x28, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cmp x11, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #2\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "add x22, x28, x22, LSL #2\n"
+ "mov x21, #0x0\n"
+ "ld1w { z8.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "sub x20, XZR, x11\n"
+ "ld1w { z10.s }, p2/Z, [x9]\n"
+ "ld1w { z11.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "addvl x10, x10, #-6\n"
+ "ld1w { z12.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
+ "whilelt p1.s, x11, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "incw x11\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x25, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "mov p0.b, p2.b\n"
+ "ld1w { z27.s }, p3/Z, [x10]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x13, LSL #2]\n"
+ "incw x20\n"
+ "fmla z22.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x24, LSL #2]\n"
+ "addvl x26, x26, #1\n"
+ "fmla z22.s, p3/M, z5.s, z20.s\n"
+ "fmla z21.s, p3/M, z4.s, z20.s\n"
+ "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z24.s, p3/M, z2.s, z18.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x25]\n"
+ "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "addvl x25, x25, #1\n"
+ "fmla z22.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z13.s }, p1/Z, [x25, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z18.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "ld1w { z9.s }, p1/Z, [x26, x13, LSL #2]\n"
+ "cmp x11, %x[n_channels]\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "ld1w { z10.s }, p1/Z, [x9]\n"
+ "ld1w { z11.s }, p1/Z, [x9, x24, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "ld1w { z12.s }, p1/Z, [x26, x27, LSL #2]\n"
+ "st1w { z24.s }, p0, [x28]\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "addvl x23, x23, #1\n"
+ "st1w { z23.s }, p0, [x28, x12, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "addvl x28, x28, #1\n"
+ "ld1w { z8.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "addvl x10, x10, #-6\n"
+ "st1w { z21.s }, p0, [x22, x12, LSL #2]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x25, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x20\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x13, LSL #2]\n"
+ "add x21, x10, #0x1\n"
+ "fmla z22.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
+ "csel x10, x10, x21, LT\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
+ "csel x14, x14, XZR, LT\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x24, LSL #2]\n"
+ "cmp x10, x20\n"
+ "fmla z22.s, p3/M, z5.s, z20.s\n"
+ "fmla z21.s, p3/M, z4.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z18.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x25]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z18.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "st1w { z24.s }, p0, [x28]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z23.s }, p0, [x28, x12, LSL #2]\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "st1w { z21.s }, p0, [x22, x12, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..873b4736ff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[16];
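+    // 16 pointers cover the 4x4 input patch needed by a 3x3 kernel producing
+    // a 2x2 output at stride 1: (2 + 3 - 1) points per spatial dimension.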
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
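+  // Same structure as the fp16 variant: label 1 is the predicated main
+  // channel loop and label 2 the channel tail, with input addresses taken
+  // indirectly from the permuted inptrs table rather than computed strides.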
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cntw x14\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "sub x28, XZR, x14\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "addvl x16, x16, #-6\n"
+ "ld1w { z10.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z24, z20\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z3.s, z9.s\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1w { z20.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x22, [x15, #0x50]\n"
+ "fmla z22.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
+ "ldr x20, [x15, #0x60]\n"
+ "ldr x27, [x15, #0x68]\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x26, [x15, #0x70]\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x25, [x15, #0x78]\n"
+ "fmla z22.s, p3/M, z5.s, z20.s\n"
+ "fmla z21.s, p3/M, z4.s, z20.s\n"
+ "whilelt p1.s, x14, %x[n_channels]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "fmla z24.s, p3/M, z2.s, z18.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z2.s, z16.s\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1w { z13.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "incw x28\n"
+ "fmla z22.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "mov p0.b, p2.b\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "incw x9\n"
+ "ld1w { z11.s }, p1/Z, [x22, x14, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z9.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z12.s }, p1/Z, [x21, x14, LSL #2]\n"
+ "incw x14\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "whilelt p2.s, x9, %x[n_channels]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z24, z20\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z3.s, z9.s\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1w { z20.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x50]\n"
+ "fmla z22.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
+ "ldr x23, [x15, #0x60]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z22.s, p3/M, z5.s, z20.s\n"
+ "fmla z21.s, p3/M, z4.s, z20.s\n"
+ "incw x28\n"
+ "mov p0.b, p2.b\n"
+ "fmla z24.s, p3/M, z2.s, z18.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z8.s, z20.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z18.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..e4f432c9ed
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
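
The class above carries only compile-time metadata and two kernel pointers; selection logic lives elsewhere in the depthwise framework. A hedged usage sketch, assuming a valid `CPUInfo *cpu_info` (which this patch does not show):

    // Illustrative only: how a caller might inspect this strategy.
    arm_conv::depthwise::sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst strat(cpu_info);

    // Planning metadata: SVE vector-length-agnostic, 3x3 kernel, stride 1,
    // one 3x3 output tile per kernel invocation.
    arm_gemm::VLType vl = strat.get_vl_type();

    // Dispatch targets: the direct kernel walks dense tiles itself; the
    // indirect kernel instead consumes a pre-gathered table of 25 input-row
    // pointers (see the generic_indirect.cpp added below).
    auto direct_fn   = strat.get_direct_kernel();
    auto indirect_fn = strat.get_indirect_kernel();
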
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..015d0e63c2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x13, #0x0\n"
+ "mov x8, #0x0\n"
+ "1:" // Tile loop
+ "str x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x3\n"
+ "mov x24, #0x3\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x13, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cntw x15\n"
+ "mul x20, x13, x21\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x12, x17, x17\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x10, x14, x23, LSL #2\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x10, x23, LSL #2\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z14.s }, p3/Z, [x13]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
+ "add x28, x9, x23, LSL #2\n"
+ "ld1w { z2.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x13, #4, MUL VL]\n"
+ "add x27, x12, x17\n"
+ "add x11, x11, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z4.s }, p3/Z, [x13, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x13, #6, MUL VL]\n"
+ "add x26, x28, x23, LSL #2\n"
+ "add x25, x27, x17\n"
+ "ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "add x24, x11, x21, LSL #2\n"
+ "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cmp x15, %x[n_channels]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
+ "add x22, x16, x16\n"
+ "mov x21, #0x0\n"
+ "ld1w { z8.s }, p3/Z, [x13, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x9, x12, LSL #2]\n"
+ "sub x20, XZR, x15\n"
+ "ld1w { z10.s }, p2/Z, [x14]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x25, LSL #2]\n"
+ "addvl x13, x13, #-6\n"
+ "ld1w { z12.s }, p2/Z, [x26]\n"
+ "ld1w { z13.s }, p2/Z, [x10, x12, LSL #2]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x15, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "incw x15\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "incw x20\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z6.s, z18.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z14.s }, p3/Z, [x13]\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z18.s\n"
+ "fmla z20.s, p3/M, z0.s, z18.s\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z22.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x10]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x28]\n"
+ "fmla z24.s, p3/M, z4.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z23.s\n"
+ "fmla z29.s, p3/M, z8.s, z23.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z20.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z18.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z16.s\n"
+ "fmla z26.s, p3/M, z8.s, z17.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.s, p3/M, z7.s, z17.s\n"
+ "fmla z28.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "addvl x28, x28, #1\n"
+ "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "addvl x14, x14, #1\n"
+ "fmla z20.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z19.s\n"
+ "ld1w { z4.s }, p3/Z, [x13, #5, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x14]\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x9]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
+ "fmla z27.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "addvl x9, x9, #1\n"
+ "fmla z20.s, p3/M, z5.s, z19.s\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z20.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "fmax z20.s, p3/M, z20.s, z31.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "addvl x26, x26, #1\n"
+ "ld1w { z2.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x13, #4, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x13, #6, MUL VL]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "ld1w { z9.s }, p1/Z, [x9, x12, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "ld1w { z11.s }, p1/Z, [x14, x25, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x26]\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "ld1w { z13.s }, p1/Z, [x10, x12, LSL #2]\n"
+ "st1w { z28.s }, p0, [x11]\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "fmin z20.s, p3/M, z20.s, z30.s\n"
+ "st1w { z29.s }, p0, [x11, x16, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "st1w { z27.s }, p0, [x11, x22, LSL #2]\n"
+ "addvl x11, x11, #1\n"
+ "ld1w { z8.s }, p3/Z, [x13, #-7, MUL VL]\n"
+ "st1w { z26.s }, p0, [x24]\n"
+ "addvl x13, x13, #-6\n"
+ "st1w { z25.s }, p0, [x24, x16, LSL #2]\n"
+ "st1w { z24.s }, p0, [x24, x22, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "st1w { z22.s }, p0, [x23]\n"
+ "st1w { z20.s }, p0, [x23, x16, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23, x22, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x8, x8, #0x1\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "cmp x8, x20\n"
+ "add x21, x13, #0x1\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "csel x13, x13, x21, LT\n"
+ "fmla z29.s, p3/M, z6.s, z18.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "mov p0.b, p2.b\n"
+ "csel x8, x8, XZR, LT\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "cmp x13, x20\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z18.s\n"
+ "fmla z20.s, p3/M, z0.s, z18.s\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z22.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x10]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x28]\n"
+ "fmla z24.s, p3/M, z4.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z23.s\n"
+ "fmla z29.s, p3/M, z8.s, z23.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z20.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z18.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z16.s\n"
+ "fmla z26.s, p3/M, z8.s, z17.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z17.s\n"
+ "fmla z28.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z20.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x9]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "fmla z20.s, p3/M, z5.s, z19.s\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z20.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "fmax z20.s, p3/M, z20.s, z31.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "st1w { z28.s }, p0, [x11]\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "st1w { z29.s }, p0, [x11, x16, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "st1w { z27.s }, p0, [x11, x22, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "fmin z20.s, p3/M, z20.s, z30.s\n"
+ "st1w { z26.s }, p0, [x24]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "st1w { z25.s }, p0, [x24, x16, LSL #2]\n"
+ "st1w { z24.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23]\n"
+ "st1w { z20.s }, p0, [x23, x16, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23, x22, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
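
The tile loop in the file above derives its base pointers exactly as its inline comments say; the same bookkeeping in plain C++ (illustrative names; the stride is 1 and the output tile edge is the `mov #0x3` constant at the top of the loop):

    // Mirrors the commented address arithmetic in the tile-loop prologue.
    uint64_t in_off = tile_i * ld_input_row;      // offset  = tile_i * ld_input_row
    in_off += tile_j * ld_input_col;              // offset += tile_j * ld_input_col
    in_off *= 3;                                  // offset *= kernel_stride * output_size
    const float *tile_in = inptr + in_off;        // the LSL #2 scales by sizeof(float)

    uint64_t out_off = tile_i * ld_output_row;    // offset  = tile_i * ld_output_row
    out_off += tile_j * ld_output_col;            // offset += tile_j * ld_output_col
    out_off *= 3;                                 // offset *= output_tile_size
    float *tile_out = outptr + out_off;
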
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..4809b0c45c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1w { z14.s }, p3/Z, [x8]\n"
+ "cntw x16\n"
+ "mov x15, #0x0\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "sub x14, XZR, x16\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "ldr x20, [x17, #0x20]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "addvl x8, x8, #-6\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "incw x14\n"
+ "mov p1.b, p2.b\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "ldr x10, [x13, #0x0]\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.s, p3/M, z0.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "ldr x9, [x13, #0x8]\n"
+ "ldr x28, [x13, #0x10]\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z4.s, z19.s\n"
+ "ldr x27, [x13, #0x18]\n"
+ "ld1w { z14.s }, p3/Z, [x8]\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z8.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z1.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "fmla z27.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z0.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z1.s, z19.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x20]\n"
+ "fmla z22.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z25.s, p3/M, z6.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z18.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "st1w { z29.s }, p1, [x10, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "fmla z22.s, p3/M, z2.s, z17.s\n"
+ "ldr x24, [x13, #0x20]\n"
+ "st1w { z28.s }, p1, [x9, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "incw x15\n"
+ "ld1w { z9.s }, p0/Z, [x23, x16, LSL #2]\n"
+ "ld1w { z10.s }, p0/Z, [x22, x16, LSL #2]\n"
+ "ld1w { z11.s }, p0/Z, [x21, x16, LSL #2]\n"
+ "ld1w { z12.s }, p0/Z, [x20, x16, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "ld1w { z13.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "incw x16\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "st1w { z27.s }, p1, [x28, x14, LSL #2]\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmax z23.s, p3/M, z23.s, z31.s\n"
+ "st1w { z26.s }, p1, [x27, x14, LSL #2]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "st1w { z25.s }, p1, [x24, x14, LSL #2]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "fmin z23.s, p3/M, z23.s, z30.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "st1w { z24.s }, p1, [x23, x14, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "st1w { z23.s }, p1, [x22, x14, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "st1w { z21.s }, p1, [x21, x14, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "addvl x8, x8, #-6\n"
+ "st1w { z22.s }, p1, [x20, x14, LSL #2]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "incw x14\n"
+ "mov p0.b, p2.b\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "ldr x10, [x13, #0x0]\n"
+ "ldr x9, [x13, #0x8]\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.s, p3/M, z0.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "ldr x28, [x13, #0x10]\n"
+ "ldr x27, [x13, #0x18]\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z8.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z1.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "fmla z27.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z0.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z1.s, z19.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z25.s, p3/M, z6.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z18.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "st1w { z29.s }, p0, [x10, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "fmla z22.s, p3/M, z2.s, z17.s\n"
+ "ldr x20, [x13, #0x20]\n"
+ "st1w { z28.s }, p0, [x9, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "st1w { z27.s }, p0, [x28, x14, LSL #2]\n"
+ "fmax z23.s, p3/M, z23.s, z31.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "st1w { z26.s }, p0, [x27, x14, LSL #2]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "st1w { z25.s }, p0, [x20, x14, LSL #2]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "fmin z23.s, p3/M, z23.s, z30.s\n"
+ "st1w { z24.s }, p0, [x23, x14, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "st1w { z23.s }, p0, [x22, x14, LSL #2]\n"
+ "st1w { z21.s }, p0, [x21, x14, LSL #2]\n"
+ "st1w { z22.s }, p0, [x20, x14, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
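
The indirect variant above differs from the direct one only in how input rows are reached: the `Args` constructor permutes the caller's 25 row pointers into the order the assembly loads them. Under the assumed (not stated in the patch) convention that the un-permuted `input_ptrs` form a row-major 5x5 grid (a 3x3 output tile under a 3x3 stride-1 window touches 3 + 3 - 1 = 5 rows and columns), output element (oi, oj) at kernel tap (ki, kj) reads:

    // Assumption: input_ptrs is a row-major 5x5 grid of row pointers.
    const float *row = input_ptrs[(oi + ki) * 5 + (oj + kj)];  // 0 <= oi, oj, ki, kj < 3
    float v = row[c];  // channel c; NHWC keeps channels contiguous per row pointer
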
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..38b377509e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
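
A consistency check on the constants in headers like the one above: the input patch each tile consumes follows from the output tile, stride and kernel size, so this 4x4 variant reads a 6x6 patch where the 3x3-output variant read 5x5 (hence that variant's 25-entry pointer table). A sketch:

    // General relation (illustrative, not code from the patch):
    using S = arm_conv::depthwise::sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst;
    constexpr unsigned int patch_rows = (S::output_rows - 1) * S::stride_rows + S::kernel_rows;
    constexpr unsigned int patch_cols = (S::output_cols - 1) * S::stride_cols + S::kernel_cols;
    static_assert(patch_rows == 6 && patch_cols == 6, "each 4x4 tile reads a 6x6 input patch");
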
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..35445595f8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x16, #0x0\n"
+ "mov x4, #0x0\n"
+ "1:" // Tile loop
+ "str x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x24, #0x4\n"
+ "str x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x16, x23\n" // offset = tile_i * ld_input_row
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x16, x22\n" // offset = tile_i * ld_output_row
+ "add x7, x5, x5\n"
+ "madd x21, x4, x5, x21\n" // offset += tile_j * ld_input_col
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "cntw x16\n"
+ "madd x20, x4, x6, x20\n" // offset += tile_j * ld_output_col
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x14, x7, x5\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x13, x8, x23, LSL #2\n"
+ "ld1w { z19.s }, p3/Z, [x17]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "add x12, x13, x23, LSL #2\n"
+ "add x15, x15, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "add x11, x12, x23, LSL #2\n"
+ "add x10, x14, x5\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "add x9, x15, x22, LSL #2\n"
+ "add x28, x11, x23, LSL #2\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "add x27, x10, x5\n"
+ "add x26, x9, x22, LSL #2\n"
+ "add x25, x6, x6\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cmp x16, %x[n_channels]\n"
+ "add x24, x28, x23, LSL #2\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "add x23, x26, x22, LSL #2\n"
+ "add x22, x25, x6\n"
+ "ld1w { z9.s }, p2/Z, [x12, x7, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x8]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x16\n"
+ "ld1w { z11.s }, p2/Z, [x8, x27, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "addvl x17, x17, #-6\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z14, z19\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x16, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z21, z19\n fmla z21.s, p3/M, z3.s, z9.s\n"
+ "movprfx z22, z19\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "incw x16\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z20, z19\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "incw x20\n"
+ "movprfx z13, z19\n fmla z13.s, p3/M, z7.s, z9.s\n"
+ "movprfx z17, z19\n fmla z17.s, p3/M, z6.s, z9.s\n"
+ "movprfx z27, z19\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "movprfx z18, z19\n fmla z18.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z29.s }, p2/Z, [x24]\n"
+ "ld1w { z11.s }, p2/Z, [x24, x27, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "movprfx z23, z19\n fmla z23.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "fmla z14.s, p3/M, z7.s, z9.s\n"
+ "fmla z13.s, p3/M, z8.s, z12.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z30.s, p3/M, z6.s, z12.s\n"
+ "movprfx z26, z19\n fmla z26.s, p3/M, z3.s, z12.s\n"
+ "movprfx z28, z19\n fmla z28.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x8, x5, LSL #2]\n"
+ "movprfx z24, z19\n fmla z24.s, p3/M, z8.s, z11.s\n"
+ "fmla z21.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x8, x10, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z19\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z29, z19\n fmla z29.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z19.s }, p3/Z, [x17]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "fmla z18.s, p3/M, z5.s, z9.s\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "fmla z14.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x13]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z13.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z30.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28]\n"
+ "fmla z21.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z28.s, p3/M, z3.s, z10.s\n"
+ "fmla z25.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z14.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z13.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x13, x14, LSL #2]\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmla z24.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x5, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "fmla z14.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z13.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z12.s\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "fmla z14.s, p3/M, z3.s, z11.s\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z22.s, p3/M, z0.s, z11.s\n"
+ "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "fmla z13.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z8.s, z10.s\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "fmla z21.s, p3/M, z5.s, z10.s\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
+ "fmla z20.s, p3/M, z2.s, z10.s\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x8, x14, LSL #2]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "fmla z14.s, p3/M, z6.s, z12.s\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmla z22.s, p3/M, z3.s, z12.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "fmla z13.s, p3/M, z1.s, z9.s\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x12]\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z27.s, p3/M, z3.s, z9.s\n"
+ "fmla z18.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z13.s, p3/M, z2.s, z11.s\n"
+ "fmla z17.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z31.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x11]\n"
+ "fmla z25.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z9.s }, p1/Z, [x12, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
+ "fmla z30.s, p3/M, z8.s, z11.s\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "fmla z18.s, p3/M, z3.s, z12.s\n"
+ "fmla z23.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z11.s\n"
+ "fmla z25.s, p3/M, z7.s, z12.s\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "fmla z18.s, p3/M, z8.s, z10.s\n"
+ "fmla z22.s, p3/M, z7.s, z10.s\n"
+ "fmla z20.s, p3/M, z6.s, z10.s\n"
+ "fmla z23.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x5, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
+ "addvl x24, x24, #1\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "addvl x13, x13, #1\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x5, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "fmla z13.s, p3/M, z3.s, z12.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmax z13.s, p3/M, z13.s, z15.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "fmla z14.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z0.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmla z21.s, p3/M, z2.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "fmax z14.s, p3/M, z14.s, z15.s\n"
+ "fmax z21.s, p3/M, z21.s, z15.s\n"
+ "fmla z18.s, p3/M, z7.s, z11.s\n"
+ "fmla z22.s, p3/M, z6.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z3.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z15.s\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "fmla z20.s, p3/M, z8.s, z0.s\n"
+ "fmla z28.s, p3/M, z7.s, z0.s\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmla z29.s, p3/M, z5.s, z0.s\n"
+ "fmla z24.s, p3/M, z4.s, z0.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "fmin z13.s, p3/M, z13.s, z16.s\n"
+ "fmin z17.s, p3/M, z17.s, z16.s\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "ld1w { z10.s }, p1/Z, [x8]\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "fmin z14.s, p3/M, z14.s, z16.s\n"
+ "ld1w { z11.s }, p1/Z, [x8, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x12, x14, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z31.s }, p0, [x15]\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "fmin z18.s, p3/M, z18.s, z16.s\n"
+ "fmin z22.s, p3/M, z22.s, z16.s\n"
+ "st1w { z13.s }, p0, [x15, x6, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "fmin z20.s, p3/M, z20.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z17.s }, p0, [x15, x25, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "st1w { z30.s }, p0, [x15, x22, LSL #2]\n"
+ "fmin z29.s, p3/M, z29.s, z16.s\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z27.s }, p0, [x9]\n"
+ "addvl x28, x28, #1\n"
+ "st1w { z14.s }, p0, [x9, x6, LSL #2]\n"
+ "addvl x15, x15, #1\n"
+ "st1w { z21.s }, p0, [x9, x25, LSL #2]\n"
+ "addvl x17, x17, #-6\n"
+ "st1w { z26.s }, p0, [x9, x22, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z22.s }, p0, [x26, x6, LSL #2]\n"
+ "st1w { z20.s }, p0, [x26, x25, LSL #2]\n"
+ "st1w { z28.s }, p0, [x26, x22, LSL #2]\n"
+ "addvl x26, x26, #1\n"
+ "st1w { z23.s }, p0, [x23]\n"
+ "st1w { z25.s }, p0, [x23, x6, LSL #2]\n"
+ "st1w { z29.s }, p0, [x23, x25, LSL #2]\n"
+ "st1w { z24.s }, p0, [x23, x22, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z14, z19\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z13, z19\n fmla z13.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x4, x4, #0x1\n"
+ "movprfx z20, z19\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "cmp x4, x20\n"
+ "add x21, x16, #0x1\n"
+ "movprfx z18, z19\n fmla z18.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z19\n fmla z28.s, p3/M, z6.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x16, x16, x21, LT\n"
+ "movprfx z17, z19\n fmla z17.s, p3/M, z5.s, z9.s\n"
+ "movprfx z26, z19\n fmla z26.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "movprfx z27, z19\n fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z29.s }, p2/Z, [x24]\n"
+ "ld1w { z21.s }, p2/Z, [x24, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z13.s, p3/M, z2.s, z12.s\n"
+ "csel x4, x4, XZR, LT\n"
+ "cmp x16, x20\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "movprfx z10, z19\n fmla z10.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "fmla z14.s, p3/M, z7.s, z9.s\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "movprfx z11, z19\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z25, z19\n fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x8, x5, LSL #2]\n"
+ "movprfx z24, z19\n fmla z24.s, p3/M, z8.s, z21.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z21.s }, p2/Z, [x8, x10, LSL #2]\n"
+ "fmla z13.s, p3/M, z4.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "movprfx z12, z19\n fmla z12.s, p3/M, z1.s, z9.s\n"
+ "movprfx z23, z19\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "fmla z17.s, p3/M, z8.s, z9.s\n"
+ "fmla z26.s, p3/M, z5.s, z9.s\n"
+ "fmla z10.s, p3/M, z2.s, z9.s\n"
+ "fmla z14.s, p3/M, z8.s, z29.s\n"
+ "ld1w { z9.s }, p2/Z, [x13]\n"
+ "fmla z31.s, p3/M, z1.s, z22.s\n"
+ "fmla z18.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z21.s\n"
+ "fmla z27.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28]\n"
+ "fmla z30.s, p3/M, z7.s, z29.s\n"
+ "fmla z11.s, p3/M, z6.s, z29.s\n"
+ "fmla z13.s, p3/M, z5.s, z29.s\n"
+ "fmla z20.s, p3/M, z4.s, z29.s\n"
+ "fmla z25.s, p3/M, z3.s, z29.s\n"
+ "fmla z12.s, p3/M, z2.s, z29.s\n"
+ "fmla z23.s, p3/M, z1.s, z29.s\n"
+ "fmla z24.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "fmla z26.s, p3/M, z6.s, z19.s\n"
+ "fmla z10.s, p3/M, z3.s, z19.s\n"
+ "fmla z14.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z22.s\n"
+ "fmla z11.s, p3/M, z2.s, z22.s\n"
+ "fmla z18.s, p3/M, z4.s, z21.s\n"
+ "ld1w { z29.s }, p2/Z, [x13, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z21.s\n"
+ "fmla z30.s, p3/M, z0.s, z21.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z2.s, z21.s\n"
+ "fmla z14.s, p3/M, z2.s, z29.s\n"
+ "fmla z31.s, p3/M, z5.s, z21.s\n"
+ "fmla z18.s, p3/M, z5.s, z29.s\n"
+ "ld1w { z22.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z29.s\n"
+ "fmla z27.s, p3/M, z3.s, z29.s\n"
+ "fmla z30.s, p3/M, z1.s, z29.s\n"
+ "fmla z11.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z10.s, p3/M, z7.s, z19.s\n"
+ "fmla z12.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z22.s\n"
+ "fmla z14.s, p3/M, z3.s, z22.s\n"
+ "fmla z26.s, p3/M, z1.s, z22.s\n"
+ "fmla z13.s, p3/M, z0.s, z22.s\n"
+ "fmla z31.s, p3/M, z7.s, z22.s\n"
+ "fmla z18.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z29.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z21.s\n"
+ "fmla z27.s, p3/M, z7.s, z21.s\n"
+ "fmla z30.s, p3/M, z5.s, z21.s\n"
+ "fmla z11.s, p3/M, z4.s, z21.s\n"
+ "fmla z20.s, p3/M, z2.s, z21.s\n"
+ "fmla z25.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z22.s }, p2/Z, [x8, x14, LSL #2]\n"
+ "fmla z17.s, p3/M, z7.s, z19.s\n"
+ "fmla z14.s, p3/M, z6.s, z19.s\n"
+ "fmla z26.s, p3/M, z4.s, z19.s\n"
+ "fmla z13.s, p3/M, z3.s, z19.s\n"
+ "fmla z10.s, p3/M, z1.s, z19.s\n"
+ "fmla z12.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z29.s\n"
+ "fmla z18.s, p3/M, z1.s, z29.s\n"
+ "fmla z28.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x12]\n"
+ "fmla z23.s, p3/M, z2.s, z21.s\n"
+ "fmla z27.s, p3/M, z0.s, z22.s\n"
+ "fmla z17.s, p3/M, z3.s, z29.s\n"
+ "fmla z26.s, p3/M, z0.s, z29.s\n"
+ "fmla z30.s, p3/M, z8.s, z21.s\n"
+ "fmla z11.s, p3/M, z7.s, z21.s\n"
+ "fmla z20.s, p3/M, z5.s, z21.s\n"
+ "fmla z25.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z18.s, p3/M, z2.s, z22.s\n"
+ "fmla z28.s, p3/M, z1.s, z22.s\n"
+ "ld1w { z21.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x11]\n"
+ "fmla z12.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z8.s, z21.s\n"
+ "fmla z11.s, p3/M, z5.s, z21.s\n"
+ "fmla z25.s, p3/M, z2.s, z21.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z17.s, p3/M, z6.s, z29.s\n"
+ "fmla z26.s, p3/M, z3.s, z29.s\n"
+ "fmla z10.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z22.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z9.s\n"
+ "fmla z12.s, p3/M, z7.s, z22.s\n"
+ "fmla z23.s, p3/M, z6.s, z22.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "fmla z13.s, p3/M, z7.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z10.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z9.s\n"
+ "fmla z12.s, p3/M, z5.s, z21.s\n"
+ "fmla z23.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z3.s, z21.s\n"
+ "fmla z11.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z10.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x13, x5, LSL #2]\n"
+ "fmla z13.s, p3/M, z8.s, z21.s\n"
+ "fmla z20.s, p3/M, z7.s, z21.s\n"
+ "fmla z25.s, p3/M, z6.s, z21.s\n"
+ "fmla z12.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z29.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z19.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x5, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z22.s\n"
+ "fmla z18.s, p3/M, z3.s, z22.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "fmla z17.s, p3/M, z1.s, z22.s\n"
+ "fmla z14.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z9.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "fmla z28.s, p3/M, z5.s, z29.s\n"
+ "fmla z27.s, p3/M, z4.s, z29.s\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmla z30.s, p3/M, z2.s, z29.s\n"
+ "fmla z11.s, p3/M, z1.s, z29.s\n"
+ "fmax z14.s, p3/M, z14.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmla z26.s, p3/M, z7.s, z21.s\n"
+ "fmla z13.s, p3/M, z6.s, z21.s\n"
+ "fmax z11.s, p3/M, z11.s, z15.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmla z10.s, p3/M, z4.s, z21.s\n"
+ "fmla z12.s, p3/M, z3.s, z21.s\n"
+ "fmax z13.s, p3/M, z13.s, z15.s\n"
+ "fmax z10.s, p3/M, z10.s, z15.s\n"
+ "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmax z12.s, p3/M, z12.s, z15.s\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z31.s }, p0, [x15]\n"
+ "fmin z18.s, p3/M, z18.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z18.s }, p0, [x15, x6, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "fmin z17.s, p3/M, z17.s, z16.s\n"
+ "st1w { z28.s }, p0, [x15, x25, LSL #2]\n"
+ "fmin z14.s, p3/M, z14.s, z16.s\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "st1w { z27.s }, p0, [x15, x22, LSL #2]\n"
+ "fmin z11.s, p3/M, z11.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z17.s }, p0, [x9]\n"
+ "fmin z13.s, p3/M, z13.s, z16.s\n"
+ "fmin z20.s, p3/M, z20.s, z16.s\n"
+ "st1w { z14.s }, p0, [x9, x6, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "fmin z10.s, p3/M, z10.s, z16.s\n"
+ "st1w { z30.s }, p0, [x9, x25, LSL #2]\n"
+ "fmin z12.s, p3/M, z12.s, z16.s\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "st1w { z11.s }, p0, [x9, x22, LSL #2]\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z13.s }, p0, [x26, x6, LSL #2]\n"
+ "st1w { z20.s }, p0, [x26, x25, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z10.s }, p0, [x23]\n"
+ "st1w { z12.s }, p0, [x23, x6, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x25, LSL #2]\n"
+ "st1w { z24.s }, p0, [x23, x22, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..3db248924f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
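+ // Remap the 36 input pointers (a 6x6 input patch feeds a 4x4 output block of a 3x3 stride-1 kernel) into the access order used by the assembly below.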
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1w { z17.s }, p3/Z, [x7]\n"
+ "cntw x17\n"
+ "mov x16, #0x0\n"
+ "ld1w { z0.s }, p3/Z, [x7, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x7, #2, MUL VL]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z2.s }, p3/Z, [x7, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x7, #4, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "ld1w { z4.s }, p3/Z, [x7, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x7, #6, MUL VL]\n"
+ "sub x15, XZR, x17\n"
+ "ld1w { z6.s }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "ldp x23, x22, [x8, #0x0]\n"
+ "ldp x21, x20, [x8, #0x10]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "ld1w { z9.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z20, z17\n fmla z20.s, p3/M, z4.s, z9.s\n"
+ "movprfx z26, z17\n fmla z26.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z24, z17\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "movprfx z30, z17\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z31, z17\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "movprfx z22, z17\n fmla z22.s, p3/M, z7.s, z9.s\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z27, z17\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z14, z17\n fmla z14.s, p3/M, z5.s, z9.s\n"
+ "movprfx z23, z17\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z25.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
+ "movprfx z9, z17\n fmla z9.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z28.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z22.s, p3/M, z8.s, z12.s\n"
+ "incw x15\n"
+ "mov p1.b, p2.b\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "movprfx z15, z17\n fmla z15.s, p3/M, z6.s, z28.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z20.s, p3/M, z7.s, z25.s\n"
+ "fmla z9.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z13, z17\n fmla z13.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z28, z17\n fmla z28.s, p3/M, z8.s, z21.s\n"
+ "fmla z24.s, p3/M, z6.s, z25.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.s, p3/M, z4.s, z25.s\n"
+ "fmla z31.s, p3/M, z3.s, z25.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z18, z17\n fmla z18.s, p3/M, z1.s, z25.s\n"
+ "movprfx z21, z17\n fmla z21.s, p3/M, z0.s, z25.s\n"
+ "whilelt p0.s, x17, %x[n_channels]\n"
+ "ld1w { z17.s }, p3/Z, [x7]\n"
+ "fmla z14.s, p3/M, z8.s, z25.s\n"
+ "fmla z23.s, p3/M, z5.s, z25.s\n"
+ "fmla z15.s, p3/M, z2.s, z25.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z22.s, p3/M, z0.s, z12.s\n"
+ "fmla z27.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z20.s, p3/M, z8.s, z10.s\n"
+ "fmla z9.s, p3/M, z1.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z24.s, p3/M, z7.s, z10.s\n"
+ "fmla z11.s, p3/M, z6.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z13.s, p3/M, z3.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z21.s, p3/M, z1.s, z10.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z26.s, p3/M, z3.s, z25.s\n"
+ "fmla z14.s, p3/M, z0.s, z25.s\n"
+ "fmla z23.s, p3/M, z6.s, z29.s\n"
+ "fmla z15.s, p3/M, z3.s, z29.s\n"
+ "ld1w { z25.s }, p2/Z, [x13, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z22.s, p3/M, z4.s, z10.s\n"
+ "fmla z27.s, p3/M, z3.s, z10.s\n"
+ "fmla z20.s, p3/M, z1.s, z10.s\n"
+ "fmla z9.s, p3/M, z5.s, z12.s\n"
+ "fmla z11.s, p3/M, z2.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z13.s, p3/M, z8.s, z25.s\n"
+ "fmla z28.s, p3/M, z5.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z14.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z29.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z27.s, p3/M, z4.s, z12.s\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "fmla z9.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "fmla z11.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z15.s, p3/M, z7.s, z25.s\n"
+ "fmla z18.s, p3/M, z6.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z26.s, p3/M, z7.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z29.s\n"
+ "fmla z14.s, p3/M, z4.s, z29.s\n"
+ "fmla z20.s, p3/M, z3.s, z29.s\n"
+ "fmla z23.s, p3/M, z1.s, z29.s\n"
+ "fmla z30.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z27.s, p3/M, z8.s, z10.s\n"
+ "fmla z21.s, p3/M, z8.s, z25.s\n"
+ "fmla z28.s, p3/M, z7.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z1.s, z10.s\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.s, p3/M, z7.s, z10.s\n"
+ "fmla z24.s, p3/M, z5.s, z10.s\n"
+ "fmla z11.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z26.s, p3/M, z2.s, z29.s\n"
+ "fmla z22.s, p3/M, z1.s, z29.s\n"
+ "fmla z27.s, p3/M, z0.s, z29.s\n"
+ "fmla z14.s, p3/M, z7.s, z25.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z20.s, p3/M, z6.s, z25.s\n"
+ "fmla z23.s, p3/M, z4.s, z25.s\n"
+ "fmla z30.s, p3/M, z3.s, z25.s\n"
+ "fmla z15.s, p3/M, z1.s, z25.s\n"
+ "fmla z18.s, p3/M, z0.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z4.s, z25.s\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z21.s, p3/M, z2.s, z25.s\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "fmla z9.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z26.s, p3/M, z6.s, z29.s\n"
+ "fmla z14.s, p3/M, z3.s, z29.s\n"
+ "fmla z23.s, p3/M, z0.s, z29.s\n"
+ "fmla z24.s, p3/M, z8.s, z25.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.s, p3/M, z7.s, z25.s\n"
+ "fmla z31.s, p3/M, z5.s, z25.s\n"
+ "fmla z28.s, p3/M, z1.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z2.s, z12.s\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z15.s, p3/M, z0.s, z10.s\n"
+ "fmla z18.s, p3/M, z4.s, z25.s\n"
+ "fmla z21.s, p3/M, z3.s, z25.s\n"
+ "fmla z9.s, p3/M, z8.s, z12.s\n"
+ "fmla z11.s, p3/M, z5.s, z12.s\n"
+ "fmla z14.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z25.s\n"
+ "fmla z31.s, p3/M, z6.s, z25.s\n"
+ "fmla z15.s, p3/M, z5.s, z25.s\n"
+ "fmla z13.s, p3/M, z5.s, z12.s\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "fmla z18.s, p3/M, z7.s, z29.s\n"
+ "fmla z21.s, p3/M, z6.s, z29.s\n"
+ "fmla z23.s, p3/M, z8.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z15.s, p3/M, z8.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z25.s\n"
+ "fmla z31.s, p3/M, z7.s, z25.s\n"
+ "fmla z13.s, p3/M, z6.s, z25.s\n"
+ "fmla z18.s, p3/M, z5.s, z25.s\n"
+ "fmla z21.s, p3/M, z4.s, z25.s\n"
+ "fmla z28.s, p3/M, z3.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldp x27, x26, [x8, #0x0]\n"
+ "fmla z11.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z29.s\n"
+ "fmax z26.s, p3/M, z26.s, z16.s\n"
+ "fmla z22.s, p3/M, z3.s, z29.s\n"
+ "fmla z27.s, p3/M, z5.s, z25.s\n"
+ "fmax z22.s, p3/M, z22.s, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z16.s\n"
+ "fmla z9.s, p3/M, z4.s, z25.s\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmax z9.s, p3/M, z9.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z19.s\n"
+ "fmla z21.s, p3/M, z7.s, z12.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z19.s\n"
+ "fmla z14.s, p3/M, z1.s, z29.s\n"
+ "fmla z20.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z19.s\n"
+ "fmla z24.s, p3/M, z2.s, z25.s\n"
+ "fmla z11.s, p3/M, z1.s, z25.s\n"
+ "fmin z9.s, p3/M, z9.s, z19.s\n"
+ "fmax z14.s, p3/M, z14.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
+ "fmax z20.s, p3/M, z20.s, z16.s\n"
+ "fmax z24.s, p3/M, z24.s, z16.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "fmla z13.s, p3/M, z7.s, z12.s\n"
+ "fmax z11.s, p3/M, z11.s, z16.s\n"
+ "st1w { z26.s }, p1, [x12, x15, LSL #2]\n"
+ "st1w { z22.s }, p1, [x11, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z15.s, p3/M, z4.s, z10.s\n"
+ "st1w { z27.s }, p1, [x10, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "st1w { z9.s }, p1, [x9, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "ldp x25, x24, [x8, #0x10]\n"
+ "fmin z14.s, p3/M, z14.s, z19.s\n"
+ "fmin z20.s, p3/M, z20.s, z19.s\n"
+ "st1w { z14.s }, p1, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z24.s, p3/M, z24.s, z19.s\n"
+ "fmin z11.s, p3/M, z11.s, z19.s\n"
+ "st1w { z20.s }, p1, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z23.s, p3/M, z23.s, z16.s\n"
+ "fmax z30.s, p3/M, z30.s, z16.s\n"
+ "st1w { z24.s }, p1, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z31.s, p3/M, z31.s, z16.s\n"
+ "fmax z13.s, p3/M, z13.s, z16.s\n"
+ "st1w { z11.s }, p1, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "incw x16\n"
+ "ld1w { z9.s }, p0/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z10.s }, p0/Z, [x26, x17, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z19.s\n"
+ "ld1w { z11.s }, p0/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x17, LSL #2]\n"
+ "incw x17\n"
+ "fmin z30.s, p3/M, z30.s, z19.s\n"
+ "fmin z31.s, p3/M, z31.s, z19.s\n"
+ "fmin z13.s, p3/M, z13.s, z19.s\n"
+ "st1w { z23.s }, p1, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmax z15.s, p3/M, z15.s, z16.s\n"
+ "fmax z18.s, p3/M, z18.s, z16.s\n"
+ "st1w { z30.s }, p1, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z21.s, p3/M, z21.s, z16.s\n"
+ "fmax z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z31.s }, p1, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1w { z13.s }, p1, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "ld1w { z0.s }, p3/Z, [x7, #1, MUL VL]\n"
+ "whilelt p2.s, x16, %x[n_channels]\n"
+ "ld1w { z1.s }, p3/Z, [x7, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x7, #3, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "fmin z15.s, p3/M, z15.s, z19.s\n"
+ "ld1w { z3.s }, p3/Z, [x7, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x7, #5, MUL VL]\n"
+ "fmin z18.s, p3/M, z18.s, z19.s\n"
+ "fmin z21.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z5.s }, p3/Z, [x7, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "fmin z28.s, p3/M, z28.s, z19.s\n"
+ "st1w { z15.s }, p1, [x23, x15, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "st1w { z18.s }, p1, [x22, x15, LSL #2]\n"
+ "st1w { z21.s }, p1, [x21, x15, LSL #2]\n"
+ "st1w { z28.s }, p1, [x20, x15, LSL #2]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z14, z17\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z18, z17\n fmla z18.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z15, z17\n fmla z15.s, p3/M, z3.s, z9.s\n"
+ "movprfx z30, z17\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z20, z17\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "movprfx z13, z17\n fmla z13.s, p3/M, z7.s, z9.s\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z22, z17\n fmla z22.s, p3/M, z6.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z27, z17\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z17\n fmla z31.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z23.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "movprfx z9, z17\n fmla z9.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z21.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ld1w { z25.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z15.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "fmla z13.s, p3/M, z8.s, z12.s\n"
+ "incw x15\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "movprfx z28, z17\n fmla z28.s, p3/M, z6.s, z21.s\n"
+ "ld1w { z29.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z14.s, p3/M, z7.s, z23.s\n"
+ "fmla z9.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z10, z17\n fmla z10.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z26, z17\n fmla z26.s, p3/M, z8.s, z25.s\n"
+ "fmla z15.s, p3/M, z6.s, z23.s\n"
+ "ld1w { z21.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.s, p3/M, z4.s, z23.s\n"
+ "fmla z20.s, p3/M, z3.s, z23.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z25, z17\n fmla z25.s, p3/M, z1.s, z23.s\n"
+ "movprfx z24, z17\n fmla z24.s, p3/M, z0.s, z23.s\n"
+ "fmla z27.s, p3/M, z8.s, z23.s\n"
+ "fmla z31.s, p3/M, z5.s, z23.s\n"
+ "fmla z28.s, p3/M, z2.s, z23.s\n"
+ "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z23.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z13.s, p3/M, z0.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z21.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z14.s, p3/M, z8.s, z29.s\n"
+ "fmla z9.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z15.s, p3/M, z7.s, z29.s\n"
+ "fmla z11.s, p3/M, z6.s, z29.s\n"
+ "fmla z30.s, p3/M, z5.s, z29.s\n"
+ "fmla z20.s, p3/M, z4.s, z29.s\n"
+ "fmla z10.s, p3/M, z3.s, z29.s\n"
+ "fmla z25.s, p3/M, z2.s, z29.s\n"
+ "fmla z24.s, p3/M, z1.s, z29.s\n"
+ "fmla z26.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z18.s, p3/M, z3.s, z23.s\n"
+ "fmla z27.s, p3/M, z0.s, z23.s\n"
+ "fmla z31.s, p3/M, z6.s, z21.s\n"
+ "fmla z28.s, p3/M, z3.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x13, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z13.s, p3/M, z4.s, z29.s\n"
+ "fmla z22.s, p3/M, z3.s, z29.s\n"
+ "fmla z14.s, p3/M, z1.s, z29.s\n"
+ "fmla z9.s, p3/M, z5.s, z12.s\n"
+ "fmla z11.s, p3/M, z2.s, z12.s\n"
+ "fmla z15.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z10.s, p3/M, z8.s, z21.s\n"
+ "fmla z26.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z18.s, p3/M, z5.s, z29.s\n"
+ "fmla z27.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z13.s, p3/M, z5.s, z17.s\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z14.s, p3/M, z2.s, z17.s\n"
+ "fmla z9.s, p3/M, z3.s, z17.s\n"
+ "fmla z15.s, p3/M, z1.s, z17.s\n"
+ "fmla z11.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z28.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z6.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z18.s, p3/M, z7.s, z21.s\n"
+ "fmla z13.s, p3/M, z6.s, z21.s\n"
+ "fmla z27.s, p3/M, z4.s, z21.s\n"
+ "fmla z14.s, p3/M, z3.s, z21.s\n"
+ "fmla z31.s, p3/M, z1.s, z21.s\n"
+ "fmla z30.s, p3/M, z0.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z22.s, p3/M, z8.s, z29.s\n"
+ "fmla z24.s, p3/M, z8.s, z23.s\n"
+ "fmla z26.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z1.s, z29.s\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.s, p3/M, z7.s, z29.s\n"
+ "fmla z15.s, p3/M, z5.s, z29.s\n"
+ "fmla z11.s, p3/M, z4.s, z29.s\n"
+ "fmla z20.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z18.s, p3/M, z2.s, z21.s\n"
+ "fmla z13.s, p3/M, z1.s, z21.s\n"
+ "fmla z22.s, p3/M, z0.s, z21.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z21.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z14.s, p3/M, z6.s, z23.s\n"
+ "fmla z31.s, p3/M, z4.s, z23.s\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z23.s\n"
+ "fmla z25.s, p3/M, z0.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z4.s, z17.s\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z13.s, p3/M, z2.s, z29.s\n"
+ "fmla z22.s, p3/M, z1.s, z29.s\n"
+ "fmla z9.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z18.s, p3/M, z6.s, z21.s\n"
+ "fmla z27.s, p3/M, z3.s, z21.s\n"
+ "fmla z31.s, p3/M, z0.s, z21.s\n"
+ "fmla z15.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.s, p3/M, z7.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
+ "fmla z26.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z2.s, z23.s\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z28.s, p3/M, z0.s, z29.s\n"
+ "fmla z25.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z3.s, z21.s\n"
+ "fmla z9.s, p3/M, z8.s, z23.s\n"
+ "fmla z11.s, p3/M, z5.s, z23.s\n"
+ "fmla z27.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z21.s\n"
+ "fmla z20.s, p3/M, z6.s, z21.s\n"
+ "fmla z28.s, p3/M, z5.s, z21.s\n"
+ "fmla z10.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z2.s, z23.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z21.s\n"
+ "fmla z20.s, p3/M, z7.s, z21.s\n"
+ "fmla z10.s, p3/M, z6.s, z21.s\n"
+ "fmla z25.s, p3/M, z5.s, z21.s\n"
+ "fmla z24.s, p3/M, z4.s, z21.s\n"
+ "fmla z26.s, p3/M, z3.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z11.s, p3/M, z8.s, z23.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmax z18.s, p3/M, z18.s, z16.s\n"
+ "fmla z13.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z21.s\n"
+ "fmax z13.s, p3/M, z13.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z16.s\n"
+ "fmla z9.s, p3/M, z4.s, z21.s\n"
+ "fmla z25.s, p3/M, z8.s, z29.s\n"
+ "fmax z9.s, p3/M, z9.s, z16.s\n"
+ "fmin z18.s, p3/M, z18.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z29.s\n"
+ "fmla z26.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmin z13.s, p3/M, z13.s, z19.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "fmla z14.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z19.s\n"
+ "fmla z15.s, p3/M, z2.s, z21.s\n"
+ "fmla z11.s, p3/M, z1.s, z21.s\n"
+ "fmin z9.s, p3/M, z9.s, z19.s\n"
+ "fmax z27.s, p3/M, z27.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z23.s\n"
+ "fmla z30.s, p3/M, z6.s, z23.s\n"
+ "fmax z14.s, p3/M, z14.s, z16.s\n"
+ "fmax z15.s, p3/M, z15.s, z16.s\n"
+ "fmla z20.s, p3/M, z8.s, z29.s\n"
+ "fmla z10.s, p3/M, z7.s, z29.s\n"
+ "fmax z11.s, p3/M, z11.s, z16.s\n"
+ "st1w { z18.s }, p0, [x12, x15, LSL #2]\n"
+ "st1w { z13.s }, p0, [x11, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z28.s, p3/M, z4.s, z23.s\n"
+ "st1w { z22.s }, p0, [x10, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z25.s, p3/M, z3.s, z23.s\n"
+ "fmla z24.s, p3/M, z5.s, z29.s\n"
+ "st1w { z9.s }, p0, [x9, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z26.s, p3/M, z4.s, z29.s\n"
+ "fmin z27.s, p3/M, z27.s, z19.s\n"
+ "fmin z14.s, p3/M, z14.s, z19.s\n"
+ "fmin z15.s, p3/M, z15.s, z19.s\n"
+ "st1w { z27.s }, p0, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z11.s, p3/M, z11.s, z19.s\n"
+ "fmax z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z14.s }, p0, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z30.s, p3/M, z30.s, z16.s\n"
+ "fmax z20.s, p3/M, z20.s, z16.s\n"
+ "st1w { z15.s }, p0, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z10.s, p3/M, z10.s, z16.s\n"
+ "st1w { z11.s }, p0, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "fmin z31.s, p3/M, z31.s, z19.s\n"
+ "fmin z30.s, p3/M, z30.s, z19.s\n"
+ "fmin z20.s, p3/M, z20.s, z19.s\n"
+ "st1w { z31.s }, p0, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmin z10.s, p3/M, z10.s, z19.s\n"
+ "fmax z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z30.s }, p0, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z25.s, p3/M, z25.s, z16.s\n"
+ "fmax z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z20.s }, p0, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "fmax z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z10.s }, p0, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "fmin z28.s, p3/M, z28.s, z19.s\n"
+ "fmin z25.s, p3/M, z25.s, z19.s\n"
+ "fmin z24.s, p3/M, z24.s, z19.s\n"
+ "st1w { z28.s }, p0, [x23, x15, LSL #2]\n"
+ "fmin z26.s, p3/M, z26.s, z19.s\n"
+ "st1w { z25.s }, p0, [x22, x15, LSL #2]\n"
+ "st1w { z24.s }, p0, [x21, x15, LSL #2]\n"
+ "st1w { z26.s }, p0, [x20, x15, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..75d62007ab
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
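+// Strategy wrapper tying the two kernel entry points above into the depthwise depth-first dispatch machinery.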
+class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..e6090fda94
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
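+ // Tile coordinates live in the argument block so the assembly can store and reload them between tile iterations.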
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x11, #0x0\n"
+ "mov x16, #0x0\n"
+ "1:" // Tile loop
+ "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x24, #0x2\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x11, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x16, x15, x22\n" // offset += tile_j * ld_input_col
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cntw x13\n"
+ "mul x20, x11, x21\n" // offset = tile_i * ld_output_row
+ "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x10, x15, x15\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x12, x12, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x28, x12, x23, LSL #2\n"
+ "madd x20, x16, x14, x20\n" // offset += tile_j * ld_output_col
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z30.s }, p3/Z, [x11]\n"
+ "ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
+ "add x27, x28, x23, LSL #2\n"
+ "ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
+ "add x26, x10, x15\n"
+ "add x25, x27, x23, LSL #2\n"
+ "ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "add x24, x26, x15\n"
+ "add x9, x9, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "cmp x13, %x[n_channels]\n"
+ "ld1rw { z29.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z28.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x23, x25, x23, LSL #2\n"
+ "add x22, x9, x21, LSL #2\n"
+ "ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x13\n"
+ "ld1w { z9.s }, p2/Z, [x27, x10, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "ld1w { z11.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x26, LSL #2]\n"
+ "addvl x11, x11, #-6\n"
+ "ld1w { z13.s }, p2/Z, [x12, x24, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x28]\n"
+ "ld1w { z15.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "whilelt p1.s, x13, %x[n_channels]\n"
+ "incw x21\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "incw x13\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z14.s\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x25]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.s, p3/M, z4.s, z15.s\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z26.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "ld1w { z23.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "movprfx z22, z30\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z30\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "addvl x12, x12, #1\n"
+ "addvl x28, x28, #1\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z26.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x11]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z25.s\n"
+ "fmla z21.s, p3/M, z1.s, z24.s\n"
+ "ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "incw x20\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z25.s\n"
+ "fmla z22.s, p3/M, z1.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x23]\n"
+ "addvl x27, x27, #1\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x10, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
+ "fmla z22.s, p3/M, z7.s, z20.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
+ "fmla z26.s, p3/M, z7.s, z24.s\n"
+ "fmla z22.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z29.s\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z29.s\n"
+ "fmax z21.s, p3/M, z21.s, z29.s\n"
+ "ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "ld1w { z9.s }, p1/Z, [x27, x10, LSL #2]\n"
+ "cmp x13, %x[n_channels]\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "ld1w { z10.s }, p1/Z, [x12]\n"
+ "ld1w { z11.s }, p1/Z, [x12, x15, LSL #2]\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z22.s, p3/M, z22.s, z28.s\n"
+ "ld1w { z12.s }, p1/Z, [x12, x26, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x12, x24, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z28.s\n"
+ "addvl x25, x25, #1\n"
+ "ld1w { z14.s }, p1/Z, [x28]\n"
+ "ld1w { z15.s }, p1/Z, [x28, x15, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "ld1w { z16.s }, p1/Z, [x12, x10, LSL #2]\n"
+ "st1w { z27.s }, p0, [x9]\n"
+ "ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
+ "st1w { z26.s }, p0, [x9, x14, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
+ "addvl x11, x11, #-6\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "st1w { z21.s }, p0, [x22, x14, LSL #2]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z14.s\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x25]\n"
+ "add x16, x16, #0x1\n"
+ "fmla z27.s, p3/M, z4.s, z15.s\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z26.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "ld1w { z23.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "movprfx z22, z30\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z30\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "cmp x16, x20\n"
+ "add x21, x11, #0x1\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z26.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z25.s\n"
+ "fmla z21.s, p3/M, z1.s, z24.s\n"
+ "csel x11, x11, x21, LT\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z25.s\n"
+ "fmla z22.s, p3/M, z1.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x23]\n"
+ "csel x16, x16, XZR, LT\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x10, LSL #2]\n"
+ "cmp x11, x20\n"
+ "fmla z22.s, p3/M, z7.s, z20.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "st1w { z27.s }, p0, [x9]\n"
+ "fmla z26.s, p3/M, z7.s, z24.s\n"
+ "fmla z22.s, p3/M, z5.s, z16.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z29.s\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z29.s\n"
+ "fmax z21.s, p3/M, z21.s, z29.s\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z22.s, p3/M, z22.s, z28.s\n"
+ "st1w { z26.s }, p0, [x9, x14, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z28.s\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "st1w { z21.s }, p0, [x22, x14, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..98427701fa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
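+      // Note: the 25 input pointers (the 5x5 patch a 3x3 stride-2 kernel needs
+      // for a 2x2 output) are repacked into the order in which the assembly
+      // below consumes them.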
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
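+  // One SVE vector of channels is processed per pass: p3 is all-true for the
+  // weight loads and MLAs, p2 predicates the current block's loads, p1
+  // pre-loads the next block, and p0 (= p2) masks the stores.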
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cntw x14\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "sub x28, XZR, x14\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z15.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z24, z20\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.s, p3/M, z4.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z20.s\n"
+ "ldr x21, [x15, #0x70]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.s, p3/M, z1.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "whilelt p1.s, x14, %x[n_channels]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "incw x9\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ld1w { z9.s }, p1/Z, [x27, x14, LSL #2]\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "ld1w { z10.s }, p1/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x25, x14, LSL #2]\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "incw x28\n"
+ "ld1w { z12.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "whilelt p2.s, x9, %x[n_channels]\n"
+ "ld1w { z14.s }, p1/Z, [x22, x14, LSL #2]\n"
+ "ld1w { z15.s }, p1/Z, [x21, x14, LSL #2]\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "incw x14\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z24, z20\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.s, p3/M, z4.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z20.s\n"
+ "ldr x21, [x15, #0x70]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.s, p3/M, z1.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "incw x28\n"
+ "mov p0.b, p2.b\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..ae89a64c6b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
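+// Strategy descriptor exposing the direct (tiled) and indirect (pointer-array)
+// kernel variants above to the depthwise depthfirst driver.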
+class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..075181a488
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
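+  // The outer asm loop ("Tile loop") walks the grid of 2x2-output tiles,
+  // keeping tile_i/tile_j in params_struct across iterations; the inner
+  // channel loop covers one SVE vector of channels per pass.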
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x12, #0x0\n"
+ "mov x8, #0x0\n"
+ "1:" // Tile loop
+ "str x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x12, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "add x15, x17, x17\n"
+ "mul x20, x12, x21\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "cntw x12\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x11, x14, x23, LSL #2\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x11, x23, LSL #2\n"
+ "add x28, x15, x17\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "add x27, x9, x23, LSL #2\n"
+ "ld1rw { z28.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x26, x28, x17\n"
+ "add x25, x27, x23, LSL #2\n"
+ "ld1w { z29.s }, p3/Z, [x10]\n"
+ "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x24, x26, x17\n"
+ "add x13, x13, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "cmp x12, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #2\n"
+ "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "add x22, x13, x21, LSL #2\n"
+ "mov x21, #0x0\n"
+ "ld1w { z5.s }, p2/Z, [x14]\n"
+ "ld1w { z6.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "sub x20, XZR, x12\n"
+ "ld1w { z7.s }, p2/Z, [x11]\n"
+ "ld1w { z8.s }, p2/Z, [x11, x17, LSL #2]\n"
+ "addvl x10, x10, #6\n"
+ "ld1w { z9.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x14, x26, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x11, x24, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x9]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z27, z29\n fmla z27.s, p3/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z24.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
+ "movprfx z26, z29\n fmla z26.s, p3/M, z0.s, z7.s\n"
+ "movprfx z30, z29\n fmla z30.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z18.s }, p3/Z, [x10]\n"
+ "incw x21\n"
+ "fmla z27.s, p3/M, z1.s, z6.s\n"
+ "fmla z31.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z23.s }, p2/Z, [x11, x26, LSL #2]\n"
+ "incw x12\n"
+ "fmla z26.s, p3/M, z1.s, z8.s\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z22.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x24, LSL #2]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z30.s, p3/M, z2.s, z24.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "incw x20\n"
+ "fmla z26.s, p3/M, z3.s, z24.s\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "ld1w { z5.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z23.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z27.s, p3/M, z18.s, z7.s\n"
+ "fmla z31.s, p3/M, z18.s, z8.s\n"
+ "ld1w { z7.s }, p1/Z, [x11]\n"
+ "fmla z26.s, p3/M, z18.s, z14.s\n"
+ "fmla z30.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.s, p3/M, z22.s, z8.s\n"
+ "fmla z31.s, p3/M, z22.s, z13.s\n"
+ "ld1w { z3.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z22.s, z0.s\n"
+ "fmla z30.s, p3/M, z22.s, z19.s\n"
+ "ld1w { z8.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.s, p3/M, z20.s, z13.s\n"
+ "fmla z31.s, p3/M, z20.s, z24.s\n"
+ "ld1w { z2.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z26.s, p3/M, z20.s, z19.s\n"
+ "fmla z30.s, p3/M, z20.s, z5.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z27.s, p3/M, z17.s, z24.s\n"
+ "fmla z31.s, p3/M, z17.s, z23.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z29.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z26.s, p3/M, z17.s, z5.s\n"
+ "fmla z30.s, p3/M, z17.s, z2.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z27.s, p3/M, z21.s, z23.s\n"
+ "fmla z31.s, p3/M, z21.s, z10.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z21.s, z2.s\n"
+ "fmla z30.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z27.s, p3/M, z18.s, z14.s\n"
+ "fmla z31.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z18.s, z25.s\n"
+ "fmla z30.s, p3/M, z18.s, z24.s\n"
+ "ld1w { z23.s }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z27.s, p3/M, z8.s, z0.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z0.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z24.s\n"
+ "fmla z30.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z27.s, p3/M, z16.s, z19.s\n"
+ "fmla z31.s, p3/M, z16.s, z5.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "addvl x27, x27, #1\n"
+ "fmla z26.s, p3/M, z16.s, z22.s\n"
+ "fmla z30.s, p3/M, z16.s, z0.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z27.s, p3/M, z17.s, z5.s\n"
+ "fmla z31.s, p3/M, z17.s, z2.s\n"
+ "ld1w { z16.s }, p2/Z, [x25]\n"
+ "fmla z26.s, p3/M, z17.s, z0.s\n"
+ "fmla z30.s, p3/M, z17.s, z19.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z27.s, p3/M, z21.s, z2.s\n"
+ "fmla z31.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z4.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z8.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "fmla z26.s, p3/M, z21.s, z19.s\n"
+ "fmla z30.s, p3/M, z21.s, z1.s\n"
+ "ld1w { z13.s }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z27.s, p3/M, z23.s, z25.s\n"
+ "fmla z31.s, p3/M, z23.s, z24.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z23.s, z16.s\n"
+ "fmla z30.s, p3/M, z23.s, z4.s\n"
+ "ld1w { z5.s }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z27.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z22.s\n"
+ "ld1w { z24.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z20.s, z4.s\n"
+ "fmla z30.s, p3/M, z20.s, z25.s\n"
+ "ld1w { z23.s }, p3/Z, [x10]\n"
+ "fmla z27.s, p3/M, z18.s, z22.s\n"
+ "fmla z31.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z22.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "addvl x25, x25, #1\n"
+ "fmla z26.s, p3/M, z18.s, z25.s\n"
+ "fmla z30.s, p3/M, z18.s, z24.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z27.s, p3/M, z17.s, z0.s\n"
+ "fmla z31.s, p3/M, z17.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "fmla z26.s, p3/M, z17.s, z24.s\n"
+ "fmla z30.s, p3/M, z17.s, z8.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z27.s, p3/M, z13.s, z19.s\n"
+ "fmla z31.s, p3/M, z13.s, z1.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x9]\n"
+ "fmla z26.s, p3/M, z13.s, z8.s\n"
+ "fmla z30.s, p3/M, z13.s, z22.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z5.s, z4.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z18.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.s, p3/M, z23.s, z4.s\n"
+ "fmla z31.s, p3/M, z23.s, z25.s\n"
+ "ld1w { z13.s }, p1/Z, [x11, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z23.s, z17.s\n"
+ "fmla z30.s, p3/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.s, p3/M, z21.s, z25.s\n"
+ "fmla z31.s, p3/M, z21.s, z24.s\n"
+ "ld1w { z5.s }, p1/Z, [x14]\n"
+ "fmla z26.s, p3/M, z21.s, z16.s\n"
+ "fmla z30.s, p3/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z27.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z8.s\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmla z26.s, p3/M, z20.s, z18.s\n"
+ "fmla z30.s, p3/M, z20.s, z17.s\n"
+ "cmp x12, %x[n_channels]\n"
+ "addvl x23, x23, #1\n"
+ "fmla z27.s, p3/M, z19.s, z8.s\n"
+ "fmla z31.s, p3/M, z19.s, z22.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmla z26.s, p3/M, z19.s, z17.s\n"
+ "fmla z30.s, p3/M, z19.s, z16.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "fmin z31.s, p3/M, z31.s, z28.s\n"
+ "ld1w { z6.s }, p1/Z, [x14, x17, LSL #2]\n"
+ "ld1w { z8.s }, p1/Z, [x11, x17, LSL #2]\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z30.s, p3/M, z30.s, z28.s\n"
+ "ld1w { z9.s }, p1/Z, [x14, x15, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x14, x28, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x14, x26, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x11, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x13]\n"
+ "st1w { z31.s }, p0, [x13, x16, LSL #2]\n"
+ "addvl x13, x13, #1\n"
+ "ld1w { z3.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "st1w { z26.s }, p0, [x22]\n"
+ "addvl x10, x10, #-6\n"
+ "st1w { z30.s }, p0, [x22, x16, LSL #2]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z30, z29\n fmla z30.s, p3/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z22.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "movprfx z5, z29\n fmla z5.s, p3/M, z0.s, z7.s\n"
+ "fmla z29.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z20.s }, p3/Z, [x10]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z30.s, p3/M, z1.s, z6.s\n"
+ "fmla z31.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z6.s }, p2/Z, [x11, x26, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z5.s, p3/M, z1.s, z8.s\n"
+ "fmla z29.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x8, x8, #0x1\n"
+ "fmla z30.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x24, LSL #2]\n"
+ "cmp x8, x20\n"
+ "fmla z5.s, p3/M, z2.s, z13.s\n"
+ "fmla z29.s, p3/M, z2.s, z22.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "add x21, x12, #0x1\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z1.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z5.s, p3/M, z3.s, z22.s\n"
+ "fmla z29.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "csel x12, x12, x21, LT\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z4.s, z6.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z20.s, z7.s\n"
+ "fmla z31.s, p3/M, z20.s, z8.s\n"
+ "csel x8, x8, XZR, LT\n"
+ "cmp x12, x20\n"
+ "fmla z5.s, p3/M, z20.s, z14.s\n"
+ "fmla z29.s, p3/M, z20.s, z1.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z30.s, p3/M, z19.s, z8.s\n"
+ "fmla z31.s, p3/M, z19.s, z13.s\n"
+ "ld1w { z26.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z19.s, z1.s\n"
+ "fmla z29.s, p3/M, z19.s, z0.s\n"
+ "ld1w { z25.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z30.s, p3/M, z18.s, z13.s\n"
+ "fmla z31.s, p3/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z18.s, z0.s\n"
+ "fmla z29.s, p3/M, z18.s, z27.s\n"
+ "ld1w { z23.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z30.s, p3/M, z17.s, z22.s\n"
+ "fmla z31.s, p3/M, z17.s, z6.s\n"
+ "ld1w { z22.s }, p2/Z, [x27]\n"
+ "fmla z5.s, p3/M, z17.s, z27.s\n"
+ "fmla z29.s, p3/M, z17.s, z24.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z6.s\n"
+ "fmla z31.s, p3/M, z16.s, z10.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z24.s\n"
+ "fmla z29.s, p3/M, z16.s, z26.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z30.s, p3/M, z21.s, z14.s\n"
+ "fmla z31.s, p3/M, z21.s, z1.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z21.s, z22.s\n"
+ "fmla z29.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z30.s, p3/M, z25.s, z1.s\n"
+ "fmla z31.s, p3/M, z25.s, z0.s\n"
+ "ld1w { z7.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z25.s, z19.s\n"
+ "fmla z29.s, p3/M, z25.s, z18.s\n"
+ "ld1w { z10.s }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z30.s, p3/M, z23.s, z0.s\n"
+ "fmla z31.s, p3/M, z23.s, z27.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z23.s, z18.s\n"
+ "fmla z29.s, p3/M, z23.s, z7.s\n"
+ "ld1w { z6.s }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z30.s, p3/M, z20.s, z27.s\n"
+ "fmla z31.s, p3/M, z20.s, z24.s\n"
+ "ld1w { z0.s }, p2/Z, [x25]\n"
+ "fmla z5.s, p3/M, z20.s, z7.s\n"
+ "fmla z29.s, p3/M, z20.s, z11.s\n"
+ "ld1w { z9.s }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z24.s\n"
+ "fmla z31.s, p3/M, z16.s, z26.s\n"
+ "ld1w { z3.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z11.s\n"
+ "fmla z29.s, p3/M, z16.s, z17.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z30.s, p3/M, z21.s, z22.s\n"
+ "fmla z31.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z26.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z21.s, z0.s\n"
+ "fmla z29.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z25.s }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z30.s, p3/M, z10.s, z19.s\n"
+ "fmla z31.s, p3/M, z10.s, z18.s\n"
+ "ld1w { z24.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z10.s, z3.s\n"
+ "fmla z29.s, p3/M, z10.s, z26.s\n"
+ "ld1w { z23.s }, p3/Z, [x10]\n"
+ "fmla z30.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z6.s, z7.s\n"
+ "ld1w { z22.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z6.s, z26.s\n"
+ "fmla z29.s, p3/M, z6.s, z24.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z30.s, p3/M, z9.s, z7.s\n"
+ "fmla z31.s, p3/M, z9.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "fmla z5.s, p3/M, z9.s, z24.s\n"
+ "fmla z29.s, p3/M, z9.s, z27.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z11.s\n"
+ "fmla z31.s, p3/M, z16.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z27.s\n"
+ "fmla z29.s, p3/M, z16.s, z22.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z30.s, p3/M, z25.s, z0.s\n"
+ "fmla z31.s, p3/M, z25.s, z3.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z25.s, z18.s\n"
+ "fmla z29.s, p3/M, z25.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z23.s, z3.s\n"
+ "fmla z31.s, p3/M, z23.s, z26.s\n"
+ "fmla z5.s, p3/M, z23.s, z17.s\n"
+ "fmla z29.s, p3/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z21.s, z26.s\n"
+ "fmla z31.s, p3/M, z21.s, z24.s\n"
+ "fmla z5.s, p3/M, z21.s, z16.s\n"
+ "fmla z29.s, p3/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z27.s\n"
+ "fmla z5.s, p3/M, z20.s, z18.s\n"
+ "fmla z29.s, p3/M, z20.s, z17.s\n"
+ "fmla z30.s, p3/M, z19.s, z27.s\n"
+ "fmla z31.s, p3/M, z19.s, z22.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmla z5.s, p3/M, z19.s, z17.s\n"
+ "fmla z29.s, p3/M, z19.s, z16.s\n"
+ "fmax z5.s, p3/M, z5.s, z15.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "fmin z30.s, p3/M, z30.s, z28.s\n"
+ "fmin z31.s, p3/M, z31.s, z28.s\n"
+ "st1w { z30.s }, p0, [x13]\n"
+ "fmin z5.s, p3/M, z5.s, z28.s\n"
+ "fmin z29.s, p3/M, z29.s, z28.s\n"
+ "st1w { z31.s }, p0, [x13, x16, LSL #2]\n"
+ "st1w { z5.s }, p0, [x22]\n"
+ "st1w { z29.s }, p0, [x22, x16, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..bf65e04d32
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
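+      // Note: 36 pointers span the 6x6 patch a 5x5 stride-1 kernel needs for a
+      // 2x2 output; the first 13 are permuted into consumption order, the
+      // remainder pass through unchanged.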
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x15, x14, [x20, #0x0]\n"
+ "mov x13, #0x0\n"
+ "ldp x12, x11, [x20, #0x10]\n"
+ "whilelt p3.s, XZR, %x[n_channels]\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "cntw x10\n"
+ "ptrue p2.b\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ld1w { z5.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1w { z6.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "sub x28, XZR, x10\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z29.s }, p2/Z, [x9]\n"
+ "ld1w { z0.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ld1w { z7.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "addvl x9, x9, #6\n"
+ "ld1w { z8.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ld1w { z13.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ld1w { z11.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z12.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ld1w { z10.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z14.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z30, z29\n fmla z30.s, p2/M, z0.s, z5.s\n"
+ "movprfx z27, z29\n fmla z27.s, p2/M, z0.s, z6.s\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1w { z5.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z7.s\n"
+ "movprfx z26, z29\n fmla z26.s, p2/M, z0.s, z8.s\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
+ "fmla z30.s, p2/M, z1.s, z6.s\n"
+ "fmla z27.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z22.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z31.s, p2/M, z1.s, z8.s\n"
+ "fmla z26.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z21.s }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
+ "fmla z30.s, p2/M, z2.s, z9.s\n"
+ "fmla z27.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z20.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z31.s, p2/M, z2.s, z13.s\n"
+ "fmla z26.s, p2/M, z2.s, z5.s\n"
+ "ldr x22, [x16, #0x78]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z3.s, z11.s\n"
+ "fmla z27.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z11.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla z31.s, p2/M, z3.s, z5.s\n"
+ "fmla z26.s, p2/M, z3.s, z22.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
+ "fmla z30.s, p2/M, z4.s, z12.s\n"
+ "fmla z27.s, p2/M, z4.s, z20.s\n"
+ "ld1w { z0.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z29.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z4.s, z22.s\n"
+ "fmla z26.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x23, [x16, #0x90]\n"
+ "fmla z30.s, p2/M, z21.s, z7.s\n"
+ "fmla z27.s, p2/M, z21.s, z8.s\n"
+ "ldr x26, [x16, #0x98]\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla z31.s, p2/M, z21.s, z14.s\n"
+ "fmla z26.s, p2/M, z21.s, z11.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.s, p2/M, z18.s, z8.s\n"
+ "fmla z27.s, p2/M, z18.s, z13.s\n"
+ "ld1w { z24.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z31.s, p2/M, z18.s, z11.s\n"
+ "fmla z26.s, p2/M, z18.s, z0.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.s, p2/M, z17.s, z13.s\n"
+ "fmla z27.s, p2/M, z17.s, z5.s\n"
+ "ld1w { z3.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z31.s, p2/M, z17.s, z0.s\n"
+ "fmla z26.s, p2/M, z17.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.s, p2/M, z16.s, z5.s\n"
+ "fmla z27.s, p2/M, z16.s, z22.s\n"
+ "ld1w { z6.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x27, [x16, #0xc8]\n"
+ "fmla z31.s, p2/M, z16.s, z29.s\n"
+ "fmla z26.s, p2/M, z16.s, z3.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x23, [x16, #0xd0]\n"
+ "fmla z30.s, p2/M, z19.s, z22.s\n"
+ "fmla z27.s, p2/M, z19.s, z10.s\n"
+ "ld1w { z23.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z19.s, z3.s\n"
+ "fmla z26.s, p2/M, z19.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x22, [x16, #0xd8]\n"
+ "fmla z30.s, p2/M, z25.s, z14.s\n"
+ "fmla z27.s, p2/M, z25.s, z11.s\n"
+ "ld1w { z1.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z31.s, p2/M, z25.s, z6.s\n"
+ "fmla z26.s, p2/M, z25.s, z23.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.s, p2/M, z18.s, z11.s\n"
+ "fmla z27.s, p2/M, z18.s, z0.s\n"
+ "ld1w { z7.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z31.s, p2/M, z18.s, z23.s\n"
+ "fmla z26.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, #-5, MUL VL]\n"
+ "whilelt p1.s, x10, %x[n_channels]\n"
+ "fmla z30.s, p2/M, z17.s, z0.s\n"
+ "fmla z27.s, p2/M, z17.s, z29.s\n"
+ "ld1w { z19.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z31.s, p2/M, z17.s, z22.s\n"
+ "fmla z26.s, p2/M, z17.s, z7.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #-4, MUL VL]\n"
+ "incw x28\n"
+ "fmla z30.s, p2/M, z16.s, z29.s\n"
+ "fmla z27.s, p2/M, z16.s, z3.s\n"
+ "ld1w { z0.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x100]\n"
+ "fmla z31.s, p2/M, z16.s, z7.s\n"
+ "fmla z26.s, p2/M, z16.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-3, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.s, p2/M, z21.s, z3.s\n"
+ "fmla z27.s, p2/M, z21.s, z24.s\n"
+ "ld1w { z11.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ld1w { z13.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z21.s, z19.s\n"
+ "fmla z26.s, p2/M, z21.s, z1.s\n"
+ "ld1w { z10.s }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla z30.s, p2/M, z20.s, z6.s\n"
+ "fmla z27.s, p2/M, z20.s, z23.s\n"
+ "ld1w { z25.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x23, [x16, #0x110]\n"
+ "fmla z31.s, p2/M, z20.s, z0.s\n"
+ "fmla z26.s, p2/M, z20.s, z11.s\n"
+ "ld1w { z8.s }, p2/Z, [x9, #-1, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "fmla z30.s, p2/M, z18.s, z23.s\n"
+ "fmla z27.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x118]\n"
+ "fmla z31.s, p2/M, z18.s, z11.s\n"
+ "fmla z26.s, p2/M, z18.s, z25.s\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "fmla z30.s, p2/M, z17.s, z22.s\n"
+ "fmla z27.s, p2/M, z17.s, z7.s\n"
+ "ld1w { z22.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z17.s, z25.s\n"
+ "fmla z26.s, p2/M, z17.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z7.s\n"
+ "fmla z27.s, p2/M, z16.s, z19.s\n"
+ "ld1w { z18.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z16.s, z24.s\n"
+ "fmla z26.s, p2/M, z16.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z10.s, z19.s\n"
+ "fmla z27.s, p2/M, z10.s, z1.s\n"
+ "ld1w { z17.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z10.s, z13.s\n"
+ "fmla z26.s, p2/M, z10.s, z22.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.s, p2/M, z8.s, z0.s\n"
+ "fmla z27.s, p2/M, z8.s, z11.s\n"
+ "ld1w { z16.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z8.s, z18.s\n"
+ "fmla z26.s, p2/M, z8.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "fmla z30.s, p2/M, z23.s, z11.s\n"
+ "fmla z27.s, p2/M, z23.s, z25.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "fmla z31.s, p2/M, z23.s, z17.s\n"
+ "fmla z26.s, p2/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z1.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "fmla z30.s, p2/M, z21.s, z25.s\n"
+ "fmla z27.s, p2/M, z21.s, z24.s\n"
+ "ld1w { z5.s }, p1/Z, [x21, x10, LSL #2]\n"
+ "fmla z31.s, p2/M, z21.s, z16.s\n"
+ "fmla z26.s, p2/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "fmla z30.s, p2/M, z20.s, z24.s\n"
+ "fmla z27.s, p2/M, z20.s, z13.s\n"
+ "ld1w { z6.s }, p1/Z, [x20, x10, LSL #2]\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "fmla z31.s, p2/M, z20.s, z18.s\n"
+ "fmla z26.s, p2/M, z20.s, z17.s\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "fmla z30.s, p2/M, z19.s, z13.s\n"
+ "fmla z27.s, p2/M, z19.s, z22.s\n"
+ "incw x13\n"
+ "ld1w { z7.s }, p1/Z, [x27, x10, LSL #2]\n"
+ "fmla z31.s, p2/M, z19.s, z17.s\n"
+ "fmla z26.s, p2/M, z19.s, z16.s\n"
+ "ld1w { z8.s }, p1/Z, [x26, x10, LSL #2]\n"
+ "ld1w { z9.s }, p1/Z, [x25, x10, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x24, x10, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x23, x10, LSL #2]\n"
+ "fmax z30.s, p2/M, z30.s, z15.s\n"
+ "fmax z27.s, p2/M, z27.s, z15.s\n"
+ "ld1w { z12.s }, p1/Z, [x22, x10, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x21, x10, LSL #2]\n"
+ "fmax z31.s, p2/M, z31.s, z15.s\n"
+ "fmax z26.s, p2/M, z26.s, z15.s\n"
+ "ld1w { z14.s }, p1/Z, [x20, x10, LSL #2]\n"
+ "incw x10\n"
+ "ld1w { z2.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "whilelt p3.s, x13, %x[n_channels]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1w { z3.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "fmin z30.s, p2/M, z30.s, z28.s\n"
+ "fmin z27.s, p2/M, z27.s, z28.s\n"
+ "st1w { z30.s }, p0, [x15, x28, LSL #2]\n"
+ "fmin z31.s, p2/M, z31.s, z28.s\n"
+ "fmin z26.s, p2/M, z26.s, z28.s\n"
+ "st1w { z27.s }, p0, [x14, x28, LSL #2]\n"
+ "st1w { z31.s }, p0, [x12, x28, LSL #2]\n"
+ "addvl x9, x9, #-6\n"
+ "st1w { z26.s }, p0, [x11, x28, LSL #2]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z30, z29\n fmla z30.s, p2/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z6.s\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1w { z22.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "movprfx z5, z29\n fmla z5.s, p2/M, z0.s, z7.s\n"
+ "fmla z29.s, p2/M, z0.s, z8.s\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
+ "fmla z30.s, p2/M, z1.s, z6.s\n"
+ "fmla z31.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z6.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z5.s, p2/M, z1.s, z8.s\n"
+ "fmla z29.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
+ "fmla z30.s, p2/M, z2.s, z9.s\n"
+ "fmla z31.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z5.s, p2/M, z2.s, z13.s\n"
+ "fmla z29.s, p2/M, z2.s, z22.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z3.s, z11.s\n"
+ "fmla z31.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z1.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "fmla z5.s, p2/M, z3.s, z22.s\n"
+ "fmla z29.s, p2/M, z3.s, z6.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
+ "fmla z30.s, p2/M, z4.s, z12.s\n"
+ "fmla z31.s, p2/M, z4.s, z16.s\n"
+ "ld1w { z0.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z27.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z4.s, z6.s\n"
+ "fmla z29.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.s, p2/M, z20.s, z7.s\n"
+ "fmla z31.s, p2/M, z20.s, z8.s\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla z5.s, p2/M, z20.s, z14.s\n"
+ "fmla z29.s, p2/M, z20.s, z1.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.s, p2/M, z19.s, z8.s\n"
+ "fmla z31.s, p2/M, z19.s, z13.s\n"
+ "ld1w { z26.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z5.s, p2/M, z19.s, z1.s\n"
+ "fmla z29.s, p2/M, z19.s, z0.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.s, p2/M, z18.s, z13.s\n"
+ "fmla z31.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x23, [x16, #0xc0]\n"
+ "fmla z5.s, p2/M, z18.s, z0.s\n"
+ "fmla z29.s, p2/M, z18.s, z27.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.s, p2/M, z17.s, z22.s\n"
+ "fmla z31.s, p2/M, z17.s, z6.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x22, [x16, #0xc8]\n"
+ "fmla z5.s, p2/M, z17.s, z27.s\n"
+ "fmla z29.s, p2/M, z17.s, z24.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla z30.s, p2/M, z16.s, z6.s\n"
+ "fmla z31.s, p2/M, z16.s, z10.s\n"
+ "ld1w { z19.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z24.s\n"
+ "fmla z29.s, p2/M, z16.s, z26.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla z30.s, p2/M, z21.s, z14.s\n"
+ "fmla z31.s, p2/M, z21.s, z1.s\n"
+ "ld1w { z17.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z5.s, p2/M, z21.s, z22.s\n"
+ "fmla z29.s, p2/M, z21.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.s, p2/M, z25.s, z1.s\n"
+ "fmla z31.s, p2/M, z25.s, z0.s\n"
+ "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z5.s, p2/M, z25.s, z19.s\n"
+ "fmla z29.s, p2/M, z25.s, z18.s\n"
+ "ld1w { z4.s }, p2/Z, [x9, #-5, MUL VL]\n"
+ "incw x28\n"
+ "fmla z30.s, p2/M, z23.s, z0.s\n"
+ "fmla z31.s, p2/M, z23.s, z27.s\n"
+ "ld1w { z8.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z5.s, p2/M, z23.s, z18.s\n"
+ "fmla z29.s, p2/M, z23.s, z9.s\n"
+ "ld1w { z6.s }, p2/Z, [x9, #-4, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.s, p2/M, z20.s, z27.s\n"
+ "fmla z31.s, p2/M, z20.s, z24.s\n"
+ "ld1w { z10.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla z5.s, p2/M, z20.s, z9.s\n"
+ "fmla z29.s, p2/M, z20.s, z8.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, #-3, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z24.s\n"
+ "fmla z31.s, p2/M, z16.s, z26.s\n"
+ "ld1w { z0.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ld1w { z27.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z8.s\n"
+ "fmla z29.s, p2/M, z16.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla z30.s, p2/M, z21.s, z22.s\n"
+ "fmla z31.s, p2/M, z21.s, z19.s\n"
+ "ld1w { z26.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla z5.s, p2/M, z21.s, z10.s\n"
+ "fmla z29.s, p2/M, z21.s, z0.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #-1, MUL VL]\n"
+ "fmla z30.s, p2/M, z4.s, z19.s\n"
+ "fmla z31.s, p2/M, z4.s, z18.s\n"
+ "ld1w { z24.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla z5.s, p2/M, z4.s, z0.s\n"
+ "fmla z29.s, p2/M, z4.s, z26.s\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "fmla z30.s, p2/M, z6.s, z18.s\n"
+ "fmla z31.s, p2/M, z6.s, z9.s\n"
+ "ld1w { z22.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z6.s, z26.s\n"
+ "fmla z29.s, p2/M, z6.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.s, p2/M, z11.s, z9.s\n"
+ "fmla z31.s, p2/M, z11.s, z8.s\n"
+ "ld1w { z18.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z11.s, z24.s\n"
+ "fmla z29.s, p2/M, z11.s, z27.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z8.s\n"
+ "fmla z31.s, p2/M, z16.s, z17.s\n"
+ "ld1w { z17.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z27.s\n"
+ "fmla z29.s, p2/M, z16.s, z22.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.s, p2/M, z25.s, z10.s\n"
+ "fmla z31.s, p2/M, z25.s, z0.s\n"
+ "ld1w { z16.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z25.s, z18.s\n"
+ "fmla z29.s, p2/M, z25.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z23.s, z0.s\n"
+ "fmla z31.s, p2/M, z23.s, z26.s\n"
+ "fmla z5.s, p2/M, z23.s, z17.s\n"
+ "fmla z29.s, p2/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z21.s, z26.s\n"
+ "fmla z31.s, p2/M, z21.s, z24.s\n"
+ "fmla z5.s, p2/M, z21.s, z16.s\n"
+ "fmla z29.s, p2/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z20.s, z24.s\n"
+ "fmla z31.s, p2/M, z20.s, z27.s\n"
+ "fmla z5.s, p2/M, z20.s, z18.s\n"
+ "fmla z29.s, p2/M, z20.s, z17.s\n"
+ "fmla z30.s, p2/M, z19.s, z27.s\n"
+ "fmla z31.s, p2/M, z19.s, z22.s\n"
+ "fmax z30.s, p2/M, z30.s, z15.s\n"
+ "fmax z31.s, p2/M, z31.s, z15.s\n"
+ "fmla z5.s, p2/M, z19.s, z17.s\n"
+ "fmla z29.s, p2/M, z19.s, z16.s\n"
+ "fmax z5.s, p2/M, z5.s, z15.s\n"
+ "fmax z29.s, p2/M, z29.s, z15.s\n"
+ "fmin z30.s, p2/M, z30.s, z28.s\n"
+ "fmin z31.s, p2/M, z31.s, z28.s\n"
+ "st1w { z30.s }, p0, [x15, x28, LSL #2]\n"
+ "fmin z5.s, p2/M, z5.s, z28.s\n"
+ "fmin z29.s, p2/M, z29.s, z28.s\n"
+ "st1w { z31.s }, p0, [x14, x28, LSL #2]\n"
+ "st1w { z5.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z29.s }, p0, [x11, x28, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6b155fc855
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const inptrs, float *const *const outptrs, const void *params, const void *bias, const unsigned int n_points, const unsigned int n_channels, const float activation_min, const float activation_max);
+
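+// Strategy wrapper for the generic depth-first FP32 depthwise kernel declared
+// above. The "output9" in the name reflects the nine output points handled per
+// iteration, which is the first argument passed to the parent constructor.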
+class sve_fp32_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<float, float, float, float>
+{
+ KernelType kernel = sve_fp32_nhwc_generic_output9_mla_depthfirst_impl;
+
+ public:
+ sve_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<float, float, float, float>(9, arm_gemm::VLType::SVE) {}
+
+ KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d53daaa8a0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
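+// Hand-written SVE implementation. For each vector of channels (selected with
+// whilelt on n_channels) the assembly loads an optional bias, applies one
+// fused multiply-add per kernel point into nine accumulators (z23-z31, one
+// weight vector per point), then clamps the results to
+// [activation_min, activation_max] and stores them through the nine output
+// pointers.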
+void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const void *bias,
+ const unsigned int n_points,
+ const unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "mov x11, #0x0\n"
+ "ld1rw { z2.s }, p1/Z, [%x[minmax_vals]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[minmax_vals], #4]\n"
+ "whilelt p0.s, x11, %x[n_channels]\n"
+ "1:" // Channel loop
+ "mov z23.b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ld1w { z23.s }, p0/Z, [%x[bias], x11, LSL #2]\n"
+ "2:" // Channel loop: Load bias: Done
+ "mov x10, %x[inptrs]\n"
+ "ldp x28, x27, [x10], #0x10\n"
+ "ldp x26, x25, [x10], #0x10\n"
+ "subs x9, %x[n_points], #0x1\n"
+ "ldp x24, x23, [x10], #0x10\n"
+ "ldp x22, x21, [x10], #0x10\n"
+ "mov z24.d, z23.d\n"
+ "mov z25.d, z23.d\n"
+ "ldr x20, [x10], #0x8\n"
+ "mov z26.d, z23.d\n"
+ "mov z27.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [%x[params]]\n"
+ "mov z28.d, z23.d\n"
+ "mov z29.d, z23.d\n"
+ "ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
+ "ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
+ "mov z30.d, z23.d\n"
+ "mov z31.d, z23.d\n"
+ "ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
+ "ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
+ "addvl %x[params], %x[params], #1\n"
+ "ld1w { z20.s }, p0/Z, [x22, x11, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x21, x11, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x28, x27, [x10], #0x10\n"
+ "ldp x26, x25, [x10], #0x10\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z23.s, p1/M, z14.s, z0.s\n"
+ "ldp x24, x23, [x10], #0x10\n"
+ "ldp x22, x21, [x10], #0x10\n"
+ "fmla z24.s, p1/M, z15.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z0.s\n"
+ "ldr x20, [x10], #0x8\n"
+ "fmla z26.s, p1/M, z17.s, z0.s\n"
+ "fmla z27.s, p1/M, z18.s, z0.s\n"
+ "ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
+ "fmla z28.s, p1/M, z19.s, z0.s\n"
+ "fmla z29.s, p1/M, z20.s, z0.s\n"
+ "ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "fmla z30.s, p1/M, z21.s, z0.s\n"
+ "fmla z31.s, p1/M, z22.s, z0.s\n"
+ "ld1w { z0.s }, p1/Z, [%x[params]]\n"
+ "ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
+ "ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
+ "addvl %x[params], %x[params], #1\n"
+ "ld1w { z20.s }, p0/Z, [x22, x11, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x21, x11, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "fmla z23.s, p1/M, z14.s, z0.s\n"
+ "fmla z24.s, p1/M, z15.s, z0.s\n"
+ "fmax z23.s, p1/M, z23.s, z2.s\n"
+ "fmax z24.s, p1/M, z24.s, z2.s\n"
+ "fmla z25.s, p1/M, z16.s, z0.s\n"
+ "fmla z26.s, p1/M, z17.s, z0.s\n"
+ "fmax z25.s, p1/M, z25.s, z2.s\n"
+ "fmax z26.s, p1/M, z26.s, z2.s\n"
+ "fmla z27.s, p1/M, z18.s, z0.s\n"
+ "fmla z28.s, p1/M, z19.s, z0.s\n"
+ "fmax z27.s, p1/M, z27.s, z2.s\n"
+ "fmax z28.s, p1/M, z28.s, z2.s\n"
+ "fmla z29.s, p1/M, z20.s, z0.s\n"
+ "fmla z30.s, p1/M, z21.s, z0.s\n"
+ "fmax z29.s, p1/M, z29.s, z2.s\n"
+ "fmax z30.s, p1/M, z30.s, z2.s\n"
+ "fmla z31.s, p1/M, z22.s, z0.s\n"
+ "fmax z31.s, p1/M, z31.s, z2.s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmin z23.s, p1/M, z23.s, z1.s\n"
+ "fmin z24.s, p1/M, z24.s, z1.s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmin z25.s, p1/M, z25.s, z1.s\n"
+ "fmin z26.s, p1/M, z26.s, z1.s\n"
+ "st1w { z23.s }, p0, [x28, x11, LSL #2]\n"
+ "fmin z27.s, p1/M, z27.s, z1.s\n"
+ "fmin z28.s, p1/M, z28.s, z1.s\n"
+ "st1w { z24.s }, p0, [x27, x11, LSL #2]\n"
+ "fmin z29.s, p1/M, z29.s, z1.s\n"
+ "fmin z30.s, p1/M, z30.s, z1.s\n"
+ "st1w { z25.s }, p0, [x26, x11, LSL #2]\n"
+ "fmin z31.s, p1/M, z31.s, z1.s\n"
+ "st1w { z26.s }, p0, [x25, x11, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x11, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x11, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x11, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x11, LSL #2]\n"
+ "incw x11\n"
+ "whilelt p0.s, x11, %x[n_channels]\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..eb1b111c36
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const inptrs, float *const *const outptrs, const void *params, const unsigned int n_output_channels, const float activation_min, const float activation_max);
+
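+// Strategy for a packed-to-NHWC FP32 depthwise kernel with a channel
+// multiplier: a 3x3 kernel at stride 2 producing a 3x3 output tile, as
+// encoded by the constants and the parent constructor arguments below.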
+struct sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
+{
+ using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(3, 3, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..3a71baaf61
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
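+// Hand-written SVE implementation. The loop at label 1 processes one vector
+// of output channels per iteration (whilelt on the channel multiplier):
+// input rows are broadcast with ld1rqw and consumed via indexed fmla into
+// nine accumulators for the 3x3 output tile, which are clamped to the
+// activation bounds before being stored.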
+void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "mov x17, #0x0\n"
+ "whilelt p2.s, x17, %x[channel_multiplier]\n"
+ "ldr x16, [%x[inptrs], #0x0]\n"
+ "ldr x15, [%x[inptrs], #0x8]\n"
+ "ptrue p1.b\n"
+ "ldr x14, [%x[inptrs], #0x10]\n"
+ "ldr x13, [%x[inptrs], #0x18]\n"
+ "mov x12, #0x0\n"
+ "ldr x11, [%x[inptrs], #0x20]\n"
+ "ldr x10, [%x[inptrs], #0x28]\n"
+ "ldr x9, [%x[inptrs], #0x30]\n"
+ "ld1w { z24.s }, p2/Z, [%x[params]]\n"
+ "mov z21.d, z24.d\n"
+ "mov z25.d, z24.d\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "mov z27.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "mov z28.d, z24.d\n"
+ "mov z20.d, z24.d\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "ld1rqw { z2.s }, p1/Z, [x16]\n"
+ "mov z23.d, z24.d\n"
+ "mov z19.d, z24.d\n"
+ "ld1rqw { z3.s }, p1/Z, [x16, #16]\n"
+ "ld1rqw { z4.s }, p1/Z, [x15]\n"
+ "ld1rqw { z5.s }, p1/Z, [x15, #16]\n"
+ "ld1rqw { z6.s }, p1/Z, [x14]\n"
+ "ld1rqw { z7.s }, p1/Z, [x14, #16]\n"
+ "ld1rqw { z8.s }, p1/Z, [x13]\n"
+ "ld1rqw { z9.s }, p1/Z, [x13, #16]\n"
+ "ld1rqw { z10.s }, p1/Z, [x11]\n"
+ "ld1rqw { z11.s }, p1/Z, [x11, #16]\n"
+ "ld1rqw { z12.s }, p1/Z, [x10]\n"
+ "ld1rqw { z13.s }, p1/Z, [x10, #16]\n"
+ "ld1rqw { z14.s }, p1/Z, [x9]\n"
+ "ld1rqw { z15.s }, p1/Z, [x9, #16]\n"
+ "ld1rw { z22.s }, p1/Z, [%x[clamps]]\n"
+ "ld1rw { z16.s }, p1/Z, [%x[clamps], #4]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "addvl %x[params], %x[params], #4\n"
+ "1:" // Output channel complete vector loop
+ "fmla z24.s, z31.s, z2.s[0]\n"
+ "fmla z27.s, z31.s, z6.s[0]\n"
+ "mov z1.d, z10.d\n"
+ "incw x17\n"
+ "fmla z26.s, z31.s, z6.s[2]\n"
+ "fmla z28.s, z31.s, z7.s[0]\n"
+ "mov z0.d, z11.d\n"
+ "mov p0.b, p2.b\n"
+ "fmla z21.s, z31.s, z2.s[2]\n"
+ "fmla z25.s, z31.s, z3.s[0]\n"
+ "whilelt p2.s, x17, %x[channel_multiplier]\n"
+ "fmla z20.s, z31.s, z1.s[0]\n"
+ "fmla z23.s, z31.s, z1.s[2]\n"
+ "fmla z19.s, z31.s, z0.s[0]\n"
+ "fmla z24.s, z30.s, z2.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params]]\n"
+ "fmla z27.s, z30.s, z6.s[1]\n"
+ "fmla z26.s, z30.s, z6.s[3]\n"
+ "fmla z28.s, z30.s, z7.s[1]\n"
+ "fmla z21.s, z30.s, z2.s[3]\n"
+ "fmla z25.s, z30.s, z3.s[1]\n"
+ "fmla z20.s, z30.s, z1.s[1]\n"
+ "fmla z23.s, z30.s, z1.s[3]\n"
+ "fmla z19.s, z30.s, z0.s[1]\n"
+ "ld1w { z17.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z24.s, z29.s, z2.s[2]\n"
+ "fmla z27.s, z29.s, z6.s[2]\n"
+ "fmla z26.s, z29.s, z7.s[0]\n"
+ "fmla z28.s, z29.s, z7.s[2]\n"
+ "fmla z21.s, z29.s, z3.s[0]\n"
+ "fmla z25.s, z29.s, z3.s[2]\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z23.s, z29.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "fmla z19.s, z29.s, z0.s[2]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z18.s, z4.s[0]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z27.s, z18.s, z1.s[0]\n"
+ "fmla z26.s, z18.s, z1.s[2]\n"
+ "mov z1.d, z12.d\n"
+ "fmla z28.s, z18.s, z0.s[0]\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z18.s, z4.s[2]\n"
+ "fmla z25.s, z18.s, z5.s[0]\n"
+ "fmla z20.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[2]\n"
+ "fmla z19.s, z18.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z17.s, z4.s[1]\n"
+ "fmla z27.s, z17.s, z1.s[1]\n"
+ "fmla z26.s, z17.s, z1.s[3]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z17.s, z4.s[3]\n"
+ "fmla z25.s, z17.s, z5.s[1]\n"
+ "fmla z20.s, z17.s, z1.s[1]\n"
+ "fmla z23.s, z17.s, z1.s[3]\n"
+ "mov z1.d, z8.d\n"
+ "fmla z19.s, z17.s, z0.s[1]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z31.s, z4.s[2]\n"
+ "ld1w { z17.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z27.s, z31.s, z1.s[2]\n"
+ "fmla z26.s, z31.s, z0.s[0]\n"
+ "mov z1.d, z12.d\n"
+ "fmla z28.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z31.s, z5.s[0]\n"
+ "fmla z25.s, z31.s, z5.s[2]\n"
+ "fmla z20.s, z31.s, z1.s[2]\n"
+ "mov z1.d, z10.d\n"
+ "fmla z23.s, z31.s, z0.s[0]\n"
+ "fmla z19.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z11.d\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z24.s, z18.s, z6.s[0]\n"
+ "fmla z27.s, z18.s, z1.s[0]\n"
+ "fmla z26.s, z18.s, z1.s[2]\n"
+ "fmla z28.s, z18.s, z0.s[0]\n"
+ "mov z1.d, z14.d\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z18.s, z6.s[2]\n"
+ "fmla z25.s, z18.s, z7.s[0]\n"
+ "fmla z20.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[2]\n"
+ "mov z1.d, z10.d\n"
+ "fmla z19.s, z18.s, z0.s[0]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z24.s, z17.s, z6.s[1]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "fmla z27.s, z17.s, z1.s[1]\n"
+ "fmla z26.s, z17.s, z1.s[3]\n"
+ "mov z1.d, z14.d\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z17.s, z6.s[3]\n"
+ "fmla z25.s, z17.s, z7.s[1]\n"
+ "fmla z20.s, z17.s, z1.s[1]\n"
+ "fmla z23.s, z17.s, z1.s[3]\n"
+ "fmla z19.s, z17.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z24.s, z29.s, z6.s[2]\n"
+ "fmla z27.s, z29.s, z1.s[2]\n"
+ "fmin z24.s, p1/M, z24.s, z16.s\n"
+ "fmla z26.s, z29.s, z0.s[0]\n"
+ "fmla z28.s, z29.s, z0.s[2]\n"
+ "mov z1.d, z14.d\n"
+ "fmax z24.s, p1/M, z24.s, z22.s\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z29.s, z7.s[0]\n"
+ "fmla z25.s, z29.s, z7.s[2]\n"
+ "fmin z21.s, p1/M, z21.s, z16.s\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z23.s, z29.s, z0.s[0]\n"
+ "fmin z25.s, p1/M, z25.s, z16.s\n"
+ "fmin z27.s, p1/M, z27.s, z16.s\n"
+ "fmla z19.s, z29.s, z0.s[2]\n"
+ "fmin z26.s, p1/M, z26.s, z16.s\n"
+ "fmin z28.s, p1/M, z28.s, z16.s\n"
+ "st1w { z24.s }, p0, [x28, x12, LSL #2]\n"
+ "fmin z20.s, p1/M, z20.s, z16.s\n"
+ "fmin z23.s, p1/M, z23.s, z16.s\n"
+ "ld1w { z24.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "fmin z19.s, p1/M, z19.s, z16.s\n"
+ "addvl %x[params], %x[params], #16\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "fmax z21.s, p1/M, z21.s, z22.s\n"
+ "fmax z25.s, p1/M, z25.s, z22.s\n"
+ "st1w { z21.s }, p0, [x27, x12, LSL #2]\n"
+ "mov z21.d, z24.d\n"
+ "fmax z27.s, p1/M, z27.s, z22.s\n"
+ "fmax z26.s, p1/M, z26.s, z22.s\n"
+ "st1w { z25.s }, p0, [x26, x12, LSL #2]\n"
+ "mov z25.d, z24.d\n"
+ "fmax z28.s, p1/M, z28.s, z22.s\n"
+ "fmax z20.s, p1/M, z20.s, z22.s\n"
+ "st1w { z27.s }, p0, [x25, x12, LSL #2]\n"
+ "mov z27.d, z24.d\n"
+ "fmax z23.s, p1/M, z23.s, z22.s\n"
+ "fmax z19.s, p1/M, z19.s, z22.s\n"
+ "st1w { z26.s }, p0, [x24, x12, LSL #2]\n"
+ "mov z26.d, z24.d\n"
+ "st1w { z28.s }, p0, [x23, x12, LSL #2]\n"
+ "mov z28.d, z24.d\n"
+ "addvl %x[params], %x[params], #-6\n"
+ "st1w { z20.s }, p0, [x22, x12, LSL #2]\n"
+ "mov z20.d, z24.d\n"
+ "st1w { z23.s }, p0, [x21, x12, LSL #2]\n"
+ "mov z23.d, z24.d\n"
+ "st1w { z19.s }, p0, [x20, x12, LSL #2]\n"
+ "incw x12\n"
+ "mov z19.d, z24.d\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..cc0c4236a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const inptrs, float *const *const outptrs, const void *params, const unsigned int n_output_channels, const float activation_min, const float activation_max);
+
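+// Strategy for a packed-to-NHWC FP32 depthwise kernel with a channel
+// multiplier: a 5x5 kernel at stride 1 producing a 2x4 output tile, as
+// encoded by the constants and the parent constructor arguments below.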
+struct sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
+{
+ using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..84ab4b5035
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
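+// Hand-written SVE implementation. Each pass of the loop at label 1 covers
+// one vector of output channels: the 5x5 filter taps are applied with
+// indexed fmla to eight accumulators (the 2x4 output tile), and the results
+// are clamped to the activation bounds before being stored through the
+// eight output pointers.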
+void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "mov x15, #0x0\n"
+ "whilelt p2.s, x15, %x[channel_multiplier]\n"
+ "ldr x14, [%x[inptrs], #0x0]\n"
+ "ldr x13, [%x[inptrs], #0x8]\n"
+ "ptrue p1.b\n"
+ "ldr x12, [%x[inptrs], #0x10]\n"
+ "ldr x11, [%x[inptrs], #0x18]\n"
+ "mov x10, #0x0\n"
+ "ldr x9, [%x[inptrs], #0x20]\n"
+ "ldr x28, [%x[inptrs], #0x28]\n"
+ "ld1w { z16.s }, p2/Z, [%x[params]]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "mov z25.d, z16.d\n"
+ "mov z15.d, z16.d\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "mov z24.d, z16.d\n"
+ "mov z14.d, z16.d\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "ld1rqw { z2.s }, p1/Z, [x14]\n"
+ "mov z26.d, z16.d\n"
+ "mov z17.d, z16.d\n"
+ "ld1rqw { z3.s }, p1/Z, [x14, #16]\n"
+ "ld1rqw { z4.s }, p1/Z, [x13]\n"
+ "mov z23.d, z16.d\n"
+ "ld1rqw { z5.s }, p1/Z, [x13, #16]\n"
+ "ld1rqw { z6.s }, p1/Z, [x12]\n"
+ "ld1rqw { z7.s }, p1/Z, [x12, #16]\n"
+ "ld1rqw { z8.s }, p1/Z, [x11]\n"
+ "ld1rqw { z9.s }, p1/Z, [x11, #16]\n"
+ "ld1rqw { z10.s }, p1/Z, [x9]\n"
+ "ld1rqw { z11.s }, p1/Z, [x9, #16]\n"
+ "ld1rqw { z12.s }, p1/Z, [x28]\n"
+ "ld1rqw { z13.s }, p1/Z, [x28, #16]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[clamps]]\n"
+ "ld1rw { z22.s }, p1/Z, [%x[clamps], #4]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #6\n"
+ "1:" // Output channel complete vector loop
+ "fmla z16.s, z31.s, z2.s[0]\n"
+ "fmla z25.s, z31.s, z2.s[1]\n"
+ "mov z0.d, z8.d\n"
+ "incw x15\n"
+ "fmla z15.s, z31.s, z2.s[2]\n"
+ "fmla z24.s, z31.s, z2.s[3]\n"
+ "mov z1.d, z9.d\n"
+ "mov p0.b, p2.b\n"
+ "fmla z14.s, z31.s, z4.s[0]\n"
+ "fmla z26.s, z31.s, z4.s[1]\n"
+ "whilelt p2.s, x15, %x[channel_multiplier]\n"
+ "fmla z17.s, z31.s, z4.s[2]\n"
+ "fmla z23.s, z31.s, z4.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params]]\n"
+ "fmla z16.s, z30.s, z2.s[1]\n"
+ "fmla z25.s, z30.s, z2.s[2]\n"
+ "fmla z15.s, z30.s, z2.s[3]\n"
+ "fmla z24.s, z30.s, z3.s[0]\n"
+ "fmla z14.s, z30.s, z4.s[1]\n"
+ "fmla z26.s, z30.s, z4.s[2]\n"
+ "fmla z17.s, z30.s, z4.s[3]\n"
+ "fmla z23.s, z30.s, z5.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z16.s, z29.s, z2.s[2]\n"
+ "fmla z25.s, z29.s, z2.s[3]\n"
+ "fmla z15.s, z29.s, z3.s[0]\n"
+ "fmla z24.s, z29.s, z3.s[1]\n"
+ "fmla z14.s, z29.s, z4.s[2]\n"
+ "fmla z26.s, z29.s, z4.s[3]\n"
+ "fmla z17.s, z29.s, z5.s[0]\n"
+ "fmla z23.s, z29.s, z5.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z16.s, z28.s, z2.s[3]\n"
+ "fmla z25.s, z28.s, z3.s[0]\n"
+ "fmla z15.s, z28.s, z3.s[1]\n"
+ "fmla z24.s, z28.s, z3.s[2]\n"
+ "fmla z14.s, z28.s, z4.s[3]\n"
+ "fmla z26.s, z28.s, z5.s[0]\n"
+ "fmla z17.s, z28.s, z5.s[1]\n"
+ "fmla z23.s, z28.s, z5.s[2]\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z16.s, z27.s, z3.s[0]\n"
+ "fmla z25.s, z27.s, z3.s[1]\n"
+ "fmla z15.s, z27.s, z3.s[2]\n"
+ "fmla z24.s, z27.s, z3.s[3]\n"
+ "fmla z14.s, z27.s, z5.s[0]\n"
+ "fmla z26.s, z27.s, z5.s[1]\n"
+ "fmla z17.s, z27.s, z5.s[2]\n"
+ "fmla z23.s, z27.s, z5.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z16.s, z20.s, z4.s[0]\n"
+ "fmla z25.s, z20.s, z4.s[1]\n"
+ "fmla z15.s, z20.s, z4.s[2]\n"
+ "fmla z24.s, z20.s, z4.s[3]\n"
+ "fmla z14.s, z20.s, z6.s[0]\n"
+ "fmla z26.s, z20.s, z6.s[1]\n"
+ "fmla z17.s, z20.s, z6.s[2]\n"
+ "fmla z23.s, z20.s, z6.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z16.s, z19.s, z4.s[1]\n"
+ "fmla z25.s, z19.s, z4.s[2]\n"
+ "fmla z15.s, z19.s, z4.s[3]\n"
+ "fmla z24.s, z19.s, z5.s[0]\n"
+ "fmla z14.s, z19.s, z6.s[1]\n"
+ "fmla z26.s, z19.s, z6.s[2]\n"
+ "fmla z17.s, z19.s, z6.s[3]\n"
+ "fmla z23.s, z19.s, z7.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "fmla z16.s, z18.s, z4.s[2]\n"
+ "fmla z25.s, z18.s, z4.s[3]\n"
+ "fmla z15.s, z18.s, z5.s[0]\n"
+ "fmla z24.s, z18.s, z5.s[1]\n"
+ "fmla z14.s, z18.s, z6.s[2]\n"
+ "fmla z26.s, z18.s, z6.s[3]\n"
+ "fmla z17.s, z18.s, z7.s[0]\n"
+ "fmla z23.s, z18.s, z7.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "fmla z16.s, z28.s, z4.s[3]\n"
+ "fmla z25.s, z28.s, z5.s[0]\n"
+ "fmla z15.s, z28.s, z5.s[1]\n"
+ "fmla z24.s, z28.s, z5.s[2]\n"
+ "fmla z14.s, z28.s, z6.s[3]\n"
+ "fmla z26.s, z28.s, z7.s[0]\n"
+ "fmla z17.s, z28.s, z7.s[1]\n"
+ "fmla z23.s, z28.s, z7.s[2]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "fmla z16.s, z27.s, z5.s[0]\n"
+ "fmla z25.s, z27.s, z5.s[1]\n"
+ "fmla z15.s, z27.s, z5.s[2]\n"
+ "fmla z24.s, z27.s, z5.s[3]\n"
+ "fmla z14.s, z27.s, z7.s[0]\n"
+ "fmla z26.s, z27.s, z7.s[1]\n"
+ "fmla z17.s, z27.s, z7.s[2]\n"
+ "fmla z23.s, z27.s, z7.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "fmla z16.s, z20.s, z6.s[0]\n"
+ "fmla z25.s, z20.s, z6.s[1]\n"
+ "fmla z15.s, z20.s, z6.s[2]\n"
+ "fmla z24.s, z20.s, z6.s[3]\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #-6, MUL VL]\n"
+ "fmla z16.s, z19.s, z6.s[1]\n"
+ "fmla z25.s, z19.s, z6.s[2]\n"
+ "fmla z15.s, z19.s, z6.s[3]\n"
+ "fmla z24.s, z19.s, z7.s[0]\n"
+ "fmla z14.s, z19.s, z0.s[1]\n"
+ "fmla z26.s, z19.s, z0.s[2]\n"
+ "fmla z17.s, z19.s, z0.s[3]\n"
+ "fmla z23.s, z19.s, z1.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #-5, MUL VL]\n"
+ "fmla z16.s, z18.s, z6.s[2]\n"
+ "fmla z25.s, z18.s, z6.s[3]\n"
+ "fmla z15.s, z18.s, z7.s[0]\n"
+ "fmla z24.s, z18.s, z7.s[1]\n"
+ "fmla z14.s, z18.s, z0.s[2]\n"
+ "fmla z26.s, z18.s, z0.s[3]\n"
+ "fmla z17.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
+ "fmla z16.s, z30.s, z6.s[3]\n"
+ "fmla z25.s, z30.s, z7.s[0]\n"
+ "fmla z15.s, z30.s, z7.s[1]\n"
+ "fmla z24.s, z30.s, z7.s[2]\n"
+ "fmla z14.s, z30.s, z0.s[3]\n"
+ "fmla z26.s, z30.s, z1.s[0]\n"
+ "fmla z17.s, z30.s, z1.s[1]\n"
+ "fmla z23.s, z30.s, z1.s[2]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
+ "fmla z16.s, z27.s, z7.s[0]\n"
+ "fmla z25.s, z27.s, z7.s[1]\n"
+ "fmla z15.s, z27.s, z7.s[2]\n"
+ "fmla z24.s, z27.s, z7.s[3]\n"
+ "fmla z14.s, z27.s, z1.s[0]\n"
+ "fmla z26.s, z27.s, z1.s[1]\n"
+ "fmla z17.s, z27.s, z1.s[2]\n"
+ "fmla z23.s, z27.s, z1.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
+ "fmla z16.s, z20.s, z0.s[0]\n"
+ "fmla z25.s, z20.s, z0.s[1]\n"
+ "fmla z15.s, z20.s, z0.s[2]\n"
+ "fmla z24.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z8.d\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #-1, MUL VL]\n"
+ "fmla z16.s, z19.s, z0.s[1]\n"
+ "fmla z25.s, z19.s, z0.s[2]\n"
+ "fmla z15.s, z19.s, z0.s[3]\n"
+ "fmla z24.s, z19.s, z1.s[0]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z19.s, z1.s[1]\n"
+ "fmla z26.s, z19.s, z1.s[2]\n"
+ "fmla z17.s, z19.s, z1.s[3]\n"
+ "fmla z23.s, z19.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z19.s }, p1/Z, [%x[params]]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z16.s, z18.s, z1.s[2]\n"
+ "fmla z25.s, z18.s, z1.s[3]\n"
+ "fmla z15.s, z18.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z18.s, z1.s[2]\n"
+ "fmla z26.s, z18.s, z1.s[3]\n"
+ "fmla z17.s, z18.s, z0.s[0]\n"
+ "fmla z23.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z16.s, z31.s, z1.s[3]\n"
+ "fmla z25.s, z31.s, z0.s[0]\n"
+ "fmla z15.s, z31.s, z0.s[1]\n"
+ "fmla z24.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z10.d\n"
+ "mov z1.d, z11.d\n"
+ "fmla z14.s, z31.s, z0.s[3]\n"
+ "fmla z26.s, z31.s, z1.s[0]\n"
+ "fmla z17.s, z31.s, z1.s[1]\n"
+ "fmla z23.s, z31.s, z1.s[2]\n"
+ "mov z1.d, z9.d\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z16.s, z27.s, z1.s[0]\n"
+ "fmla z25.s, z27.s, z1.s[1]\n"
+ "fmla z15.s, z27.s, z1.s[2]\n"
+ "fmla z24.s, z27.s, z1.s[3]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z14.s, z27.s, z1.s[0]\n"
+ "fmla z26.s, z27.s, z1.s[1]\n"
+ "fmla z17.s, z27.s, z1.s[2]\n"
+ "fmla z23.s, z27.s, z1.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z16.s, z20.s, z0.s[0]\n"
+ "fmla z25.s, z20.s, z0.s[1]\n"
+ "fmla z15.s, z20.s, z0.s[2]\n"
+ "fmla z24.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z16.s, z19.s, z0.s[1]\n"
+ "fmla z25.s, z19.s, z0.s[2]\n"
+ "fmla z15.s, z19.s, z0.s[3]\n"
+ "fmla z24.s, z19.s, z1.s[0]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z14.s, z19.s, z1.s[1]\n"
+ "fmla z26.s, z19.s, z1.s[2]\n"
+ "fmla z17.s, z19.s, z1.s[3]\n"
+ "fmla z23.s, z19.s, z0.s[0]\n"
+ "mov z1.d, z10.d\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z16.s, z18.s, z1.s[2]\n"
+ "fmla z25.s, z18.s, z1.s[3]\n"
+ "fmla z15.s, z18.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z14.s, z18.s, z1.s[2]\n"
+ "fmla z26.s, z18.s, z1.s[3]\n"
+ "fmla z17.s, z18.s, z0.s[0]\n"
+ "fmla z23.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z16.s, z28.s, z1.s[3]\n"
+ "fmla z25.s, z28.s, z0.s[0]\n"
+ "fmla z15.s, z28.s, z0.s[1]\n"
+ "fmla z24.s, z28.s, z0.s[2]\n"
+ "mov z0.d, z13.d\n"
+ "mov z1.d, z12.d\n"
+ "fmla z26.s, z28.s, z0.s[0]\n"
+ "fmla z17.s, z28.s, z0.s[1]\n"
+ "fmla z23.s, z28.s, z0.s[2]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z28.s, z1.s[3]\n"
+ "fmla z16.s, z27.s, z0.s[0]\n"
+ "fmla z25.s, z27.s, z0.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z22.s\n"
+ "fmax z16.s, p1/M, z16.s, z21.s\n"
+ "fmla z15.s, z27.s, z0.s[2]\n"
+ "fmla z24.s, z27.s, z0.s[3]\n"
+ "mov z0.d, z13.d\n"
+ "fmin z25.s, p1/M, z25.s, z22.s\n"
+ "fmla z14.s, z27.s, z0.s[0]\n"
+ "fmla z26.s, z27.s, z0.s[1]\n"
+ "fmin z15.s, p1/M, z15.s, z22.s\n"
+ "fmin z24.s, p1/M, z24.s, z22.s\n"
+ "fmla z17.s, z27.s, z0.s[2]\n"
+ "fmla z23.s, z27.s, z0.s[3]\n"
+ "fmin z14.s, p1/M, z14.s, z22.s\n"
+ "fmin z26.s, p1/M, z26.s, z22.s\n"
+ "fmin z17.s, p1/M, z17.s, z22.s\n"
+ "fmin z23.s, p1/M, z23.s, z22.s\n"
+ "st1w { z16.s }, p0, [x27, x10, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "fmax z25.s, p1/M, z25.s, z21.s\n"
+ "st1w { z25.s }, p0, [x26, x10, LSL #2]\n"
+ "mov z25.d, z16.d\n"
+ "fmax z15.s, p1/M, z15.s, z21.s\n"
+ "fmax z24.s, p1/M, z24.s, z21.s\n"
+ "st1w { z15.s }, p0, [x25, x10, LSL #2]\n"
+ "mov z15.d, z16.d\n"
+ "fmax z14.s, p1/M, z14.s, z21.s\n"
+ "fmax z26.s, p1/M, z26.s, z21.s\n"
+ "st1w { z24.s }, p0, [x24, x10, LSL #2]\n"
+ "mov z24.d, z16.d\n"
+ "fmax z17.s, p1/M, z17.s, z21.s\n"
+ "fmax z23.s, p1/M, z23.s, z21.s\n"
+ "st1w { z14.s }, p0, [x23, x10, LSL #2]\n"
+ "mov z14.d, z16.d\n"
+ "st1w { z26.s }, p0, [x22, x10, LSL #2]\n"
+ "mov z26.d, z16.d\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "st1w { z17.s }, p0, [x21, x10, LSL #2]\n"
+ "mov z17.d, z16.d\n"
+ "addvl %x[params], %x[params], #-6\n"
+ "st1w { z23.s }, p0, [x20, x10, LSL #2]\n"
+ "incw x10\n"
+ "mov z23.d, z16.d\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f83767d8ae
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const inptrs, float *const *const outptrs, const float *weights, const float *bias, const unsigned int kernel_points, const unsigned int n_output_channels, const float activation_min, const float activation_max);
+
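+// Strategy for the generic packed-to-NHWC FP32 depthwise kernel with a
+// channel multiplier; the parent constructor arguments encode the 2x8
+// output tile handled per iteration.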
+struct sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>
+{
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>;
+ sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::SVE)
+ {
+ }
+ Parent::KernelType kernel = sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..1770ec182c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,454 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
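+// Hand-written SVE implementation. The outer loop walks vectors of output
+// channels; the kernel-point loop is unrolled by two (kernel_points is
+// halved up front, with an odd tail handled at label 5 and a single-point
+// case at label 6), accumulating into sixteen registers (z16-z31) for the
+// 2x8 output tile before the usual clamp-and-store epilogue.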
+void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const float *weights,
+ const float *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "mov x9, #0x0\n"
+ "ld1rw { z15.s }, p1/Z, [%x[minmax_vals]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[minmax_vals], #4]\n"
+ "whilelt p0.s, x9, %x[n_output_channels]\n"
+ "1:" // Output channel loop
+ "mov z31.b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ld1w { z31.s }, p0/Z, [%x[bias], x9, LSL #2]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov x23, %x[inptrs]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "lsr x22, %x[kernel_points], #0x1\n"
+ "mov z16.d, z31.d\n"
+ "mov z17.d, z31.d\n"
+ "mov z18.d, z31.d\n"
+ "ld1rqw { z6.s }, p1/Z, [x21]\n"
+ "ld1rqw { z5.s }, p1/Z, [x21, #16]\n"
+ "mov z19.d, z31.d\n"
+ "mov z20.d, z31.d\n"
+ "ld1rqw { z1.s }, p1/Z, [x20]\n"
+ "ld1rqw { z2.s }, p1/Z, [x20, #16]\n"
+ "mov z21.d, z31.d\n"
+ "mov z22.d, z31.d\n"
+ "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
+ "addvl %x[weights], %x[weights], #1\n"
+ "mov z23.d, z31.d\n"
+ "mov z24.d, z31.d\n"
+ "mov z25.d, z31.d\n"
+ "mov z26.d, z31.d\n"
+ "mov z27.d, z31.d\n"
+ "mov z28.d, z31.d\n"
+ "mov z29.d, z31.d\n"
+ "mov z30.d, z31.d\n"
+ "mov z31.d, z31.d\n"
+ "cbz x22, 6f\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "subs x22, x22, #0x1\n"
+ "ld1rqw { z0.s }, p1/Z, [x21]\n"
+ "ld1rqw { z4.s }, p1/Z, [x21, #16]\n"
+ "ld1rqw { z7.s }, p1/Z, [x20]\n"
+ "ld1rqw { z3.s }, p1/Z, [x20, #16]\n"
+ "ld1w { z11.s }, p1/Z, [%x[weights]]\n"
+ "addvl %x[weights], %x[weights], #1\n"
+ "beq 4f\n"
+ "3:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x23], #0x10\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "subs x22, x22, #0x1\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ld1rqw { z6.s }, p1/Z, [x21]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ld1rqw { z5.s }, p1/Z, [x21, #16]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ld1rqw { z1.s }, p1/Z, [x20]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ld1rqw { z2.s }, p1/Z, [x20, #16]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "ld1rqw { z0.s }, p1/Z, [x21]\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "ld1rqw { z4.s }, p1/Z, [x21, #16]\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "ld1rqw { z7.s }, p1/Z, [x20]\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw { z3.s }, p1/Z, [x20, #16]\n"
+ "ld1w { z11.s }, p1/Z, [%x[weights], #1, MUL VL]\n"
+ "addvl %x[weights], %x[weights], #2\n"
+ "bgt 3b\n"
+ "4:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 5f\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
+ "b 7f\n"
+ "5:" // Output channel loop: Odd tail
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "ldp x20, x28, [x23], #0x10\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ld1rqw { z6.s }, p1/Z, [x20]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ld1rqw { z5.s }, p1/Z, [x20, #16]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ld1rqw { z1.s }, p1/Z, [x28]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ld1w { z10.s }, p1/Z, [%x[weights]]\n"
+ "ld1rqw { z2.s }, p1/Z, [x28, #16]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
+ "addvl %x[weights], %x[weights], #1\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "fmla z16.s, z10.s, z6.s[0]\n"
+ "fmla z17.s, z10.s, z6.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z10.s, z6.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z10.s, z5.s[0]\n"
+ "fmla z21.s, z10.s, z5.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z23.s, z10.s, z5.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z10.s, z1.s[0]\n"
+ "fmla z25.s, z10.s, z1.s[1]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "fmla z26.s, z10.s, z1.s[2]\n"
+ "fmla z27.s, z10.s, z1.s[3]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "fmla z28.s, z10.s, z2.s[0]\n"
+ "fmla z29.s, z10.s, z2.s[1]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z2.s[3]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
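+    // Single kernel point: one fmla pass per accumulator, then clamp to
+    // the activation bounds (z15 = min, z14 = max) and store.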
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
+ "7:" // Output channel loop: Done
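+    // Advance to the next vector of output channels and loop while any
+    // lane remains below n_output_channels.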
+ "incw x9\n"
+ "whilelt p0.s, x9, %x[n_output_channels]\n"
+ "b.any 1b\n"
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z10", "z11", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..04cf0d4036
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
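+// The arguments are, in order: the channel count, the input row pointers,
+// the packed parameter blob produced by pack_parameters() below, an unused
+// bias pointer, the requantization parameters, two further unused pointers
+// (the per-channel multipliers and shifts appear to travel inside the
+// packed blob) and the output row pointers; the definition in generic.cpp
+// names them.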
+
+class sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_sve_s8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_sve_s8q_3x3_dot::pack_parameters(
+ args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
+};
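+
+// A driver-side usage sketch (hypothetical variable names; the depth-first
+// depthwise engine does the equivalent of the following):
+//
+//   sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst strategy(cpu_info);
+//   std::vector<uint8_t> buffer(strategy.get_storage_size(args));
+//   strategy.pack_parameters(args, buffer.data(), biases, qp,
+//                            weights, ld_weight_col, ld_weight_row);
+//   auto kernel = strategy.get_kernel();  // invoked once per output tile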
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..0cee302c56
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,497 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32 &qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
+{
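+    // Shape of the generated code: the input rows are interleaved with zip
+    // instructions so that each sdot accumulates three kernel taps (padded
+    // to four bytes) per 32-bit lane.  z25 holds the byte pattern
+    // {1, 1, 1, 0} in every lane, so an sdot against it yields the sum of
+    // the three live input bytes; mls then subtracts that sum scaled by
+    // the weight offset (z8, b_offset).  Accumulators are requantized with
+    // sqrdmulh/srshl, biased by c_offset (z16) and clamped to
+    // [minval, maxval] (z7, z6) before each st1b store.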
+ __asm__ __volatile__(
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ldp x23, x22, [%x[inptrs], #0x20]\n"
+ "ldp x13, x21, [%x[inptrs], #0x30]\n"
+ "mov x20, #0x1\n"
+ "ptrue p2.b\n"
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
+ "orr x20, x20, #0x100\n"
+ "orr x20, x20, #0x10000\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z21.b }, p0/Z, [x26, x14]\n"
+ "dup z25.s, w20\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x25, x14]\n"
+ "zip2 z16.b, z15.b, z31.b\n"
+ "zip1 z15.b, z15.b, z31.b\n"
+ "ld1b { z29.b }, p0/Z, [x24, x14]\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "zip1 z30.b, z21.b, z29.b\n"
+ "zip2 z29.b, z21.b, z29.b\n"
+ "ld1b { z9.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z20.b }, p0/Z, [x22, x14]\n"
+ "zip2 z13.b, z15.b, z30.b\n"
+ "zip1 z15.b, z15.b, z30.b\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ld1b { z5.b }, p0/Z, [x13, x14]\n"
+ "zip1 z14.b, z16.b, z29.b\n"
+ "zip2 z29.b, z16.b, z29.b\n"
+ "ld1b { z17.b }, p0/Z, [x21, x14]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip2 z31.b, z9.b, z5.b\n"
+ "zip1 z9.b, z9.b, z5.b\n"
+ "ld1b { z18.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x26, x14]\n"
+ "zip1 z21.b, z20.b, z17.b\n"
+ "zip2 z17.b, z20.b, z17.b\n"
+ "ld1b { z6.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x24, x14]\n"
+ "zip2 z23.b, z18.b, z6.b\n"
+ "zip1 z18.b, z18.b, z6.b\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x14]\n"
+ "zip1 z24.b, z28.b, z4.b\n"
+ "zip2 z4.b, z28.b, z4.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z22.b, z2.b, z16.b\n"
+ "zip1 z2.b, z2.b, z16.b\n"
+ "zip1 z0.b, z19.b, z5.b\n"
+ "zip2 z5.b, z19.b, z5.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "zip2 z19.b, z9.b, z21.b\n"
+ "zip1 z9.b, z9.b, z21.b\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "zip1 z11.b, z31.b, z17.b\n"
+ "zip2 z17.b, z31.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z12.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip1 z20.b, z23.b, z4.b\n"
+ "zip2 z4.b, z23.b, z4.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z24.b, z2.b, z0.b\n"
+ "zip1 z2.b, z2.b, z0.b\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "zip1 z0.b, z22.b, z5.b\n"
+ "zip2 z5.b, z22.b, z5.b\n"
+ "addvl %x[params], %x[params], #4\n"
+ "mov z22.d, z10.d\n"
+ "mov z31.d, z10.d\n"
+ "mov z21.d, z10.d\n"
+ "1:" // Loop
+ "mov z30.s, #0x0\n"
+ "sdot z30.s, z25.b, z9.b\n"
+ "sdot z10.s, z26.b, z15.b\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z30.s, z25.b, z18.b\n"
+ "sdot z31.s, z26.b, z9.b\n"
+ "mov z27.s, #0x0\n"
+ "incw x14, ALL, MUL #4\n"
+ "sdot z10.s, z3.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "movprfx z28, z30\n sdot z28.s, z25.b, z2.b\n"
+ "sdot z30.s, z25.b, z15.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "sdot z27.s, z25.b, z9.b\n"
+ "sdot z31.s, z3.b, z18.b\n"
+ "sdot z10.s, z1.b, z18.b\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "sdot z22.s, z26.b, z15.b\n"
+ "sdot z21.s, z26.b, z9.b\n"
+ "sdot z27.s, z25.b, z18.b\n"
+ "sdot z31.s, z1.b, z2.b\n"
+ "ext z2.b, z2.b, z2.b, #0x1\n"
+ "sdot z22.s, z3.b, z9.b\n"
+ "sdot z21.s, z3.b, z18.b\n"
+ "ld1w { z3.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "mls z10.s, p2/M, z30.s, z8.s\n"
+ "movprfx z26, z27\n sdot z26.s, z25.b, z2.b\n"
+ "mov z9.s, #0x0\n"
+ "sdot z27.s, z25.b, z15.b\n"
+ "ld1w { z23.s }, p2/Z, [%x[params]]\n"
+ "sdot z22.s, z1.b, z18.b\n"
+ ".inst 0x04b7754a // sqrdmulh z10.s, z10.s, z23.s\n"
+ "sdot z21.s, z1.b, z2.b\n"
+ "mls z22.s, p2/M, z27.s, z8.s\n"
+ "and z18.d, z10.d, z3.d\n"
+ "mls z31.s, p2/M, z28.s, z8.s\n"
+ "mls z21.s, p2/M, z26.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ "sdot z9.s, z25.b, z19.b\n"
+ ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
+ "sqadd z10.s, z10.s, z18.s\n"
+ ".inst 0x4482886a // srshl z10.s, p2/M, z10.s, z3.s\n"
+ "sdot z9.s, z25.b, z12.b\n"
+ "and z28.d, z22.d, z3.d\n"
+ "and z23.d, z31.d, z3.d\n"
+ "movprfx z27, z9\n sdot z27.s, z25.b, z24.b\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z21.d, z3.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sdot z9.s, z25.b, z13.b\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
+ "sqadd z21.s, z21.s, z18.s\n"
+ "add z10.s, z10.s, z16.s\n"
+ ".inst 0x44828875 // srshl z21.s, p2/M, z21.s, z3.s\n"
+ "smax z10.s, p2/M, z10.s, z7.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z21.s, p2/M, z21.s, z7.s\n"
+ "st1b { z10.s }, p0, [x12, x28]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "st1b { z22.s }, p0, [x11, x28]\n"
+ "mov z26.d, z28.d\n"
+ "ld1b { z15.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z31.d, z28.d\n"
+ "sdot z31.s, z1.b, z19.b\n"
+ "ld1b { z23.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x9, x28]\n"
+ "mov z22.d, z28.d\n"
+ "sdot z28.s, z1.b, z13.b\n"
+ "sdot z28.s, z15.b, z19.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "sdot z26.s, z1.b, z13.b\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov z18.s, #0x0\n"
+ "sdot z22.s, z1.b, z19.b\n"
+ "sdot z18.s, z25.b, z19.b\n"
+ "incw x28\n"
+ "sdot z31.s, z15.b, z12.b\n"
+ "sdot z28.s, z23.b, z12.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z26.s, z15.b, z19.b\n"
+ "sdot z22.s, z15.b, z12.b\n"
+ "addvl %x[params], %x[params], #16\n"
+ "sdot z18.s, z25.b, z12.b\n"
+ "sdot z31.s, z23.b, z24.b\n"
+ "ext z24.b, z24.b, z24.b, #0x1\n"
+ "mls z28.s, p2/M, z9.s, z8.s\n"
+ "sdot z26.s, z23.b, z12.b\n"
+ ".inst 0x04be779c // sqrdmulh z28.s, z28.s, z30.s\n"
+ "sdot z22.s, z23.b, z24.b\n"
+ "movprfx z12, z18\n sdot z12.s, z25.b, z24.b\n"
+ "and z2.d, z28.d, z21.d\n"
+ "sdot z18.s, z25.b, z13.b\n"
+ "mls z26.s, p2/M, z18.s, z8.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "mls z31.s, p2/M, z27.s, z8.s\n"
+ "mls z22.s, p2/M, z12.s, z8.s\n"
+ ".inst 0x04be775a // sqrdmulh z26.s, z26.s, z30.s\n"
+ ".inst 0x04be77ff // sqrdmulh z31.s, z31.s, z30.s\n"
+ ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "sqadd z28.s, z28.s, z2.s\n"
+ "and z24.d, z26.d, z21.d\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "and z23.d, z31.d, z21.d\n"
+ "and z18.d, z22.d, z21.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z24.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ "ld1b { z30.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ "sqadd z22.s, z22.s, z18.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z25.b, z11.b\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "st1b { z28.s }, p0, [x12, x28]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "st1b { z26.s }, p0, [x11, x28]\n"
+ "mov z28.d, z23.d\n"
+ "sdot z24.s, z25.b, z20.b\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z27.d, z23.d\n"
+ "sdot z27.s, z19.b, z11.b\n"
+ "movprfx z13, z24\n sdot z13.s, z25.b, z0.b\n"
+ "st1b { z22.s }, p0, [x9, x28]\n"
+ "mov z26.d, z23.d\n"
+ "sdot z23.s, z19.b, z14.b\n"
+ "sdot z23.s, z30.b, z11.b\n"
+ "sdot z24.s, z25.b, z14.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "sdot z28.s, z19.b, z14.b\n"
+ "ext z11.b, z11.b, z11.b, #0x1\n"
+ "mov z12.s, #0x0\n"
+ "sdot z26.s, z19.b, z11.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z12.s, z25.b, z11.b\n"
+ "sdot z27.s, z30.b, z20.b\n"
+ "incw x28\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z23.s, z21.b, z20.b\n"
+ "ext z20.b, z20.b, z20.b, #0x1\n"
+ "sdot z28.s, z30.b, z11.b\n"
+ "sdot z26.s, z30.b, z20.b\n"
+ "sdot z12.s, z25.b, z20.b\n"
+ "sdot z27.s, z21.b, z0.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "mls z23.s, p2/M, z24.s, z8.s\n"
+ "sdot z28.s, z21.b, z20.b\n"
+ "sdot z26.s, z21.b, z0.b\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ "movprfx z19, z12\n sdot z19.s, z25.b, z0.b\n"
+ "sdot z12.s, z25.b, z14.b\n"
+ "and z18.d, z23.d, z22.d\n"
+ "mls z28.s, p2/M, z12.s, z8.s\n"
+ "mls z27.s, p2/M, z13.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "mls z26.s, p2/M, z19.s, z8.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ "ld1w { z2.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sqadd z23.s, z23.s, z18.s\n"
+ "and z20.d, z28.d, z22.d\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "and z19.d, z27.d, z22.d\n"
+ "and z18.d, z26.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z20.s\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
+ "ld1b { z13.b }, p2/Z, [%x[params]]\n"
+ "sqadd z27.s, z27.s, z19.s\n"
+ "sqadd z26.s, z26.s, z18.s\n"
+ ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
+ ".inst 0x44828ada // srshl z26.s, p2/M, z26.s, z22.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "smax z23.s, p2/M, z23.s, z7.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "smax z27.s, p2/M, z27.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z25.b, z17.b\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "st1b { z23.s }, p0, [x12, x28]\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "st1b { z28.s }, p0, [x11, x28]\n"
+ "mov z0.d, z1.d\n"
+ "sdot z24.s, z25.b, z4.b\n"
+ "st1b { z27.s }, p0, [x10, x28]\n"
+ "mov z31.d, z1.d\n"
+ "sdot z31.s, z21.b, z17.b\n"
+ "movprfx z23, z24\n sdot z23.s, z25.b, z5.b\n"
+ "st1b { z26.s }, p0, [x9, x28]\n"
+ "mov z30.d, z1.d\n"
+ "sdot z1.s, z21.b, z29.b\n"
+ "sdot z1.s, z13.b, z17.b\n"
+ "sdot z24.s, z25.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z0.s, z21.b, z29.b\n"
+ "ext z17.b, z17.b, z17.b, #0x1\n"
+ "mov z19.s, #0x0\n"
+ "sdot z30.s, z21.b, z17.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z19.s, z25.b, z17.b\n"
+ "sdot z31.s, z13.b, z4.b\n"
+ "incw x28\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "sdot z1.s, z20.b, z4.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "sdot z0.s, z13.b, z17.b\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "sdot z30.s, z13.b, z4.b\n"
+ "sdot z19.s, z25.b, z4.b\n"
+ "ld1b { z13.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "sdot z31.s, z20.b, z5.b\n"
+ "ext z5.b, z5.b, z5.b, #0x1\n"
+ "mls z1.s, p2/M, z24.s, z8.s\n"
+ "ld1b { z27.b }, p0/Z, [x22, x14]\n"
+ "sdot z0.s, z20.b, z4.b\n"
+ "sdot z30.s, z20.b, z5.b\n"
+ ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
+ "ld1b { z26.b }, p0/Z, [x21, x14]\n"
+ "movprfx z18, z19\n sdot z18.s, z25.b, z5.b\n"
+ "sdot z19.s, z25.b, z29.b\n"
+ "and z11.d, z1.d, z22.d\n"
+ "ld1b { z29.b }, p0/Z, [x23, x14]\n"
+ "mls z0.s, p2/M, z19.s, z8.s\n"
+ "mls z31.s, p2/M, z23.s, z8.s\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ "ld1b { z17.b }, p0/Z, [x20, x14]\n"
+ "mls z30.s, p2/M, z18.s, z8.s\n"
+ ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
+ ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
+ ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "sqadd z1.s, z1.s, z11.s\n"
+ "and z21.d, z0.d, z22.d\n"
+ ".inst 0x44828ac1 // srshl z1.s, p2/M, z1.s, z22.s\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "and z20.d, z31.d, z22.d\n"
+ "and z19.d, z30.d, z22.d\n"
+ "ld1b { z18.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z11.b }, p0/Z, [x22, x14]\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "ld1b { z24.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z21.s\n"
+ ".inst 0x44828ac0 // srshl z0.s, p2/M, z0.s, z22.s\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ "sqadd z30.s, z30.s, z19.s\n"
+ ".inst 0x44828adf // srshl z31.s, p2/M, z31.s, z22.s\n"
+ ".inst 0x44828ade // srshl z30.s, p2/M, z30.s, z22.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "smax z1.s, p2/M, z1.s, z7.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "ld1b { z9.b }, p0/Z, [x24, x14]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "smin z1.s, p2/M, z1.s, z6.s\n"
+ "smax z0.s, p2/M, z0.s, z7.s\n"
+ "st1b { z1.s }, p1, [x12, x28]\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z30.s, p2/M, z30.s, z7.s\n"
+ "ld1b { z23.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z20.b, z15.b, z28.b\n"
+ "zip1 z15.b, z15.b, z28.b\n"
+ "smin z0.s, p2/M, z0.s, z6.s\n"
+ "zip1 z19.b, z13.b, z29.b\n"
+ "zip2 z29.b, z13.b, z29.b\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ "st1b { z0.s }, p1, [x11, x28]\n"
+ "zip2 z13.b, z15.b, z19.b\n"
+ "zip1 z15.b, z15.b, z19.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "st1b { z31.s }, p1, [x10, x28]\n"
+ "zip1 z14.b, z20.b, z29.b\n"
+ "zip2 z29.b, z20.b, z29.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z30.s }, p1, [x9, x28]\n"
+ "zip2 z21.b, z9.b, z26.b\n"
+ "zip1 z9.b, z9.b, z26.b\n"
+ "incw x28\n"
+ "zip1 z20.b, z27.b, z17.b\n"
+ "zip2 z17.b, z27.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z31.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "zip1 z27.b, z11.b, z4.b\n"
+ "zip2 z4.b, z11.b, z4.b\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #8\n"
+ "zip2 z30.b, z2.b, z22.b\n"
+ "zip1 z2.b, z2.b, z22.b\n"
+ "zip1 z28.b, z23.b, z5.b\n"
+ "zip2 z5.b, z23.b, z5.b\n"
+ "zip2 z19.b, z9.b, z20.b\n"
+ "zip1 z9.b, z9.b, z20.b\n"
+ "zip1 z11.b, z21.b, z17.b\n"
+ "zip2 z17.b, z21.b, z17.b\n"
+ "zip2 z12.b, z18.b, z27.b\n"
+ "zip1 z18.b, z18.b, z27.b\n"
+ "zip1 z20.b, z31.b, z4.b\n"
+ "zip2 z4.b, z31.b, z4.b\n"
+ "zip2 z24.b, z2.b, z28.b\n"
+ "zip1 z2.b, z2.b, z28.b\n"
+ "zip1 z0.b, z30.b, z5.b\n"
+ "zip2 z5.b, z30.b, z5.b\n"
+ "mov z22.d, z10.d\n"
+ "mov z31.d, z10.d\n"
+ "mov z21.d, z10.d\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..c9b4daf334
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
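+// The arguments match the named definition in generic.cpp: channel count,
+// input row pointers, weights, bias, requantization parameters, per-channel
+// requantization multipliers and shifts, and output row pointers.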
+
+class sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..8ac522dc9a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
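+      // Reorder the caller's sixteen input-row pointers into the order in
+      // which the assembly below consumes them.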
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
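+  // Scheme of the generated code: int8 operands are widened to int16 with
+  // ssublb, which also subtracts the a_offset/b_offset quantization
+  // offsets; products accumulate via smlalb/smlalt; results are requantized
+  // per channel with sqrdmulh/srshl, narrowed with sqxtnb/sqxtnt, biased by
+  // c_offset and clamped to [minval, maxval] before each st1b store.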
+ __asm__ __volatile__(
+ "mov x16, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x16\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1sb { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1sb { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1sb { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
+ "1:" // Loop
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1sb { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1sb { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1sb { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c11ef // ssublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1273 // ssublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ "ld1sb { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ "ld1sb { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1sb { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ "ld1sb { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
+ "inch x16\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
+ "incw x20\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z7.d, z25.d, z1.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
+ ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n"
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n"
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1sb { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1sb { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1sb { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..7a9b8a5bde
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
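+// Argument order matches the named definition in generic.cpp and the
+// stride-1 variant: channel count, input row pointers, weights, bias,
+// requantization parameters, per-channel multipliers and shifts, and
+// output row pointers.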
+
+class sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..fc9a48bb46
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
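+      // Reorder the caller's twenty-five input-row pointers into the order
+      // in which the assembly below consumes them.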
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
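+  // Same requantization scheme as the stride-1 variant: ssublb widening
+  // with offset subtraction, smlalb/smlalt accumulation, per-channel
+  // sqrdmulh/srshl requantization, sqxtnb/sqxtnt narrowing, then the
+  // c_offset bias and [minval, maxval] clamp before storing.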
+ __asm__ __volatile__(
+ "mov x7, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x7\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
+ "ldp x16, x15, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x7, x8\n"
+ "ldp x14, x13, [x24, #0x10]\n"
+ "whilelt p2.s, x7, x8\n"
+ "whilelt p1.s, x23, x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "mov z18.d, z8.d\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1sb { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1sb { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1sb { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ "ld1sb { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1sb { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
+ "1:" // Loop
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1sb { z17.h }, p3/Z, [x21, x7]\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1231 // ssublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1sb { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a13bd // ssublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1sb { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1sb { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ "ld1sb { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1sb { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1231 // ssublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a13bd // ssublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1sb { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1sb { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1339 // ssublb z25.h, z25.b, z26.b\n"
+ "ld1sb { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
+ "ld1sb { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a13de // ssublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1339 // ssublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
+ "inch x7\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
+ "mov x20, x7\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
+ "incw x20\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "whilelt p2.s, x7, x8\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
+ "whilelt p1.s, x20, x8\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
+ "whilelt p3.h, x7, x8\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "inch x10\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1sb { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1sb { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1sb { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1sb { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1sb { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1f8d6c5213
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
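+// Strategy class exposing the 5x5, stride-1 SVE MLA kernel (2x2 output tile) to the depthwise framework.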
+class sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..7ff724ddd8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,654 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ unsigned long n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[36];
+
+ Params(
+ unsigned long n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
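+ // For this kernel the input-row pointers are already in the required order.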
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "incw x24\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
+ "mov z6.d, z14.d\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ "ld1sb { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1sb { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ld1sb { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1sb { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1273 // ssublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ "1:" // Loop
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1sb { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1sb { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ "ld1sb { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e135a // ssublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1sb { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1sb { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1sb { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1231 // ssublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1sb { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1sb { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1sb { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1sb { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1210 // ssublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1sb { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e135a // ssublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1sb { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1sb { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1sb { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1231 // ssublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1sb { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a116b // ssublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1sb { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1sb { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1sb { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
+ "incw x20\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ ".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z6.d, z14.d\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1sb { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1sb { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ "ld1sb { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ "ld1sb { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1273 // ssublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..abc09ee5a3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
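+// Strategy class exposing the 3x3, stride-2 dot-product kernel (2x4 output tile, with channel multiplier).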
+struct sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..274b29dcfc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x9\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ldr x23, [%x[inptrs], #0x8]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr x22, [%x[inptrs], #0x20]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "mov z13.b, #0x1\n"
+ "lsr z13.s, z13.s, #0x8\n"
+ "ld1b { z1.b }, p0/Z, [x23]\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "mov z8.d, z1.d\n"
+ "mov z27.d, z1.d\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z31.d, z1.d\n"
+ "mov z28.d, z2.d\n"
+ "ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z30.d, z2.d\n"
+ "mov z26.d, z2.d\n"
+ "ld1b { z3.b }, p0/Z, [x20]\n"
+ "mov z22.d, z4.d\n"
+ "mov z10.d, z4.d\n"
+ "ptrue p2.b\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z18.d, z4.d\n"
+ "ext z8.b, z8.b, z8.b, #0x2\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z11.s, p2/M, z11.s\n"
+ "ext z27.b, z27.b, z27.b, #0x4\n"
+ "ext z31.b, z31.b, z31.b, #0x6\n"
+ "mov x9, #0x0\n"
+ "whilelt p0.b, x9, x10\n"
+ "ext z28.b, z28.b, z28.b, #0x2\n"
+ "ext z30.b, z30.b, z30.b, #0x4\n"
+ "ld1w { z14.s }, p0/Z, [%x[params]]\n"
+ "mov x28, #0x0\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "ext z22.b, z22.b, z22.b, #0x2\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ext z10.b, z10.b, z10.b, #0x4\n"
+ "ext z18.b, z18.b, z18.b, #0x6\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "mov z21.d, z0.d\n"
+ "mov z20.d, z0.d\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "mov z19.d, z0.d\n"
+ "mov z24.d, z3.d\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #1, MUL VL]\n"
+ "mov z17.d, z3.d\n"
+ "mov z16.d, z3.d\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "ext z21.b, z21.b, z21.b, #0x2\n"
+ "ext z20.b, z20.b, z20.b, #0x4\n"
+ "addvl %x[params], %x[params], #4\n"
+ "ext z19.b, z19.b, z19.b, #0x6\n"
+ "zip1 z1.s, z1.s, z27.s\n"
+ "zip1 z8.s, z8.s, z31.s\n"
+ "zip1 z2.s, z2.s, z30.s\n"
+ "zip1 z28.s, z28.s, z26.s\n"
+ "ext z24.b, z24.b, z24.b, #0x2\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "ext z16.b, z16.b, z16.b, #0x6\n"
+ "zip1 z4.s, z4.s, z10.s\n"
+ "zip1 z22.s, z22.s, z18.s\n"
+ "zip1 z0.s, z0.s, z20.s\n"
+ "zip1 z21.s, z21.s, z19.s\n"
+ "zip1 z1.s, z1.s, z8.s\n"
+ "zip1 z2.s, z2.s, z28.s\n"
+ "zip1 z3.s, z3.s, z17.s\n"
+ "zip1 z24.s, z24.s, z16.s\n"
+ "zip1 z4.s, z4.s, z22.s\n"
+ "zip1 z0.s, z0.s, z21.s\n"
+ "mov z1.q, z1.q[0]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z3.s, z3.s, z24.s\n"
+ "mov z4.q, z4.q[0]\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "sdot z24.s, z13.b, z1.b[0]\n"
+ "mov z23.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "sdot z25.s, z13.b, z1.b[1]\n"
+ "mov z21.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "sdot z23.s, z13.b, z1.b[2]\n"
+ "mov z10.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "sdot z22.s, z13.b, z1.b[3]\n"
+ "mov z20.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "sdot z21.s, z13.b, z2.b[0]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "sdot z19.s, z13.b, z2.b[1]\n"
+ "sdot z10.s, z13.b, z2.b[2]\n"
+ "sdot z8.s, z13.b, z2.b[3]\n"
+ "mov z0.q, z0.q[0]\n"
+ "sdot z20.s, z13.b, z4.b[0]\n"
+ "sdot z18.s, z13.b, z4.b[1]\n"
+ "mov z3.q, z3.q[0]\n"
+ "sdot z17.s, z13.b, z4.b[2]\n"
+ "sdot z16.s, z13.b, z4.b[3]\n"
+ "mov z31.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "sdot z31.s, z13.b, z0.b[0]\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "sdot z30.s, z13.b, z0.b[1]\n"
+ "mov z29.s, #0x0\n"
+ "sdot z26.s, z13.b, z0.b[2]\n"
+ "sdot z27.s, z13.b, z0.b[3]\n"
+ "sdot z28.s, z13.b, z3.b[0]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "add z24.s, z24.s, z21.s\n"
+ "add z25.s, z25.s, z19.s\n"
+ "add z23.s, z23.s, z10.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "add z21.s, z20.s, z21.s\n"
+ "mov z20.s, #0x0\n"
+ "sdot z20.s, z13.b, z3.b[2]\n"
+ "add z19.s, z18.s, z19.s\n"
+ "mov z18.s, #0x0\n"
+ "sdot z18.s, z13.b, z3.b[3]\n"
+ "add z17.s, z17.s, z10.s\n"
+ "add z16.s, z16.s, z8.s\n"
+ "add z24.s, z24.s, z31.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "mul z24.s, p2/M, z24.s, z11.s\n"
+ "mul z25.s, p2/M, z25.s, z11.s\n"
+ "add z26.s, z23.s, z26.s\n"
+ "add z27.s, z22.s, z27.s\n"
+ "mul z26.s, p2/M, z26.s, z11.s\n"
+ "mul z27.s, p2/M, z27.s, z11.s\n"
+ "add z28.s, z21.s, z28.s\n"
+ "add z29.s, z19.s, z29.s\n"
+ "mul z28.s, p2/M, z28.s, z11.s\n"
+ "mul z29.s, p2/M, z29.s, z11.s\n"
+ "add z30.s, z17.s, z20.s\n"
+ "add z31.s, z16.s, z18.s\n"
+ "mul z30.s, p2/M, z30.s, z11.s\n"
+ "mul z31.s, p2/M, z31.s, z11.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
+ "1:" // Loop
+ "sdot z24.s, z5.b, z0.b[0]\n"
+ "sdot z25.s, z5.b, z0.b[1]\n"
+ "ld1w { z8.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z26.s, z5.b, z0.b[2]\n"
+ "sdot z27.s, z5.b, z0.b[3]\n"
+ "incb x9\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "sdot z24.s, z6.b, z1.b[0]\n"
+ "sdot z25.s, z6.b, z1.b[1]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "sdot z26.s, z6.b, z1.b[2]\n"
+ "sdot z27.s, z6.b, z1.b[3]\n"
+ "sdot z28.s, z5.b, z2.b[0]\n"
+ "sdot z29.s, z5.b, z2.b[1]\n"
+ "sdot z30.s, z5.b, z2.b[2]\n"
+ "sdot z31.s, z5.b, z2.b[3]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z7.b, z2.b[0]\n"
+ "sdot z25.s, z7.b, z2.b[1]\n"
+ ".inst 0x04a87718 // sqrdmulh z24.s, z24.s, z8.s\n"
+ "sdot z26.s, z7.b, z2.b[2]\n"
+ "sdot z27.s, z7.b, z2.b[3]\n"
+ ".inst 0x04a87739 // sqrdmulh z25.s, z25.s, z8.s\n"
+ "sdot z28.s, z6.b, z3.b[0]\n"
+ "sdot z29.s, z6.b, z3.b[1]\n"
+ ".inst 0x04a8775a // sqrdmulh z26.s, z26.s, z8.s\n"
+ "sdot z30.s, z6.b, z3.b[2]\n"
+ "sdot z31.s, z6.b, z3.b[3]\n"
+ ".inst 0x04a8777b // sqrdmulh z27.s, z27.s, z8.s\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z28.s, z7.b, z4.b[0]\n"
+ "sdot z29.s, z7.b, z4.b[1]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "sdot z30.s, z7.b, z4.b[2]\n"
+ "sdot z31.s, z7.b, z4.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #5, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
+ "addvl %x[params], %x[params], #6\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
+ ".inst 0x04a877bd // sqrdmulh z29.s, z29.s, z8.s\n"
+ ".inst 0x04a877de // sqrdmulh z30.s, z30.s, z8.s\n"
+ ".inst 0x04a877ff // sqrdmulh z31.s, z31.s, z8.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z9.s\n"
+ "add z25.s, z25.s, z9.s\n"
+ "smin z24.s, p2/M, z24.s, z12.s\n"
+ "smin z25.s, p2/M, z25.s, z12.s\n"
+ "add z26.s, z26.s, z9.s\n"
+ "add z27.s, z27.s, z9.s\n"
+ "smin z26.s, p2/M, z26.s, z12.s\n"
+ "smin z27.s, p2/M, z27.s, z12.s\n"
+ "add z28.s, z28.s, z9.s\n"
+ "add z29.s, z29.s, z9.s\n"
+ "smin z28.s, p2/M, z28.s, z12.s\n"
+ "smin z29.s, p2/M, z29.s, z12.s\n"
+ "add z30.s, z30.s, z9.s\n"
+ "add z31.s, z31.s, z9.s\n"
+ "smin z30.s, p2/M, z30.s, z12.s\n"
+ "smin z31.s, p2/M, z31.s, z12.s\n"
+ "smax z24.s, p2/M, z24.s, z15.s\n"
+ "smax z25.s, p2/M, z25.s, z15.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z15.s\n"
+ "smax z27.s, p2/M, z27.s, z15.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z15.s\n"
+ "smax z29.s, p2/M, z29.s, z15.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z15.s\n"
+ "smax z31.s, p2/M, z31.s, z15.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
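
A note for readers tracing the requantization tail shared by the kernels in this patch: the `sqrdmulh`, `and`+`asr`+`sqadd`, and `srshl` instruction groups implement the standard gemmlowp-style fixed-point rescale, followed by the `c_offset` add and the `minval`/`maxval` clamp. A minimal scalar sketch of that arithmetic, with hypothetical helper names that are not part of this patch:

#include <algorithm>
#include <cstdint>

// sqrdmulh: saturating rounding doubling multiply, returning the high half.
int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // the one saturating case
    const int64_t ab    = (int64_t) a * (int64_t) b;
    const int64_t nudge = ab >= 0 ? (1LL << 30) : (1 - (1LL << 30));
    return (int32_t) ((ab + nudge) >> 31);
}

// srshl by a negative amount is a rounding right shift; the and/asr/sqadd
// triple in the assembly is the sign fix-up that makes the shift round to
// nearest with ties away from zero, as modelled here.
int32_t rounding_right_shift(int32_t x, int exponent)
{
    const int32_t mask      = (int32_t) ((1LL << exponent) - 1);
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// Requantize one accumulator: rescale, add the output offset (c_offset),
// then clamp to [minval, maxval] exactly as the smin/smax pair does.
int8_t requantize(int32_t acc, int32_t multiplier, int right_shift,
                  int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = rounding_doubling_high_mul(acc, multiplier);
    v = rounding_right_shift(v, right_shift);
    v += c_offset;
    return (int8_t) std::max(minval, std::min(maxval, v));
}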
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..701948f264
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..a3b2b429c0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x6\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ldr x22, [%x[inptrs], #0x18]\n"
+ "ldr x21, [%x[inptrs], #0x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1b { z3.b }, p0/Z, [x22]\n"
+ "mov z23.d, z3.d\n"
+ "ext z23.b, z23.b, z23.b, #0x1\n"
+ "ld1b { z4.b }, p0/Z, [x21]\n"
+ "ldr x24, [%x[inptrs], #0x8]\n"
+ "mov z18.d, z4.d\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "ldr x23, [%x[inptrs], #0x28]\n"
+ "mov z15.d, z2.d\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "ldr x22, [%x[inptrs], #0x30]\n"
+ "ldr x21, [%x[inptrs], #0x38]\n"
+ "zip1 z3.d, z3.d, z23.d\n"
+ "zip1 z4.d, z4.d, z18.d\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1b { z1.b }, p0/Z, [x24]\n"
+ "mov z19.d, z1.d\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "ld1b { z5.b }, p0/Z, [x23]\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ "mov z18.d, z5.d\n"
+ "mov z22.d, z6.d\n"
+ "ld1b { z7.b }, p0/Z, [x21]\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "mov z8.d, z7.d\n"
+ "zip1 z2.d, z2.d, z15.d\n"
+ "mov z3.q, z3.q[0]\n"
+ "mov z4.q, z4.q[0]\n"
+ "ptrue p2.b\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ext z22.b, z22.b, z22.b, #0x1\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z23.s, p2/M, z23.s\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "mov z28.b, #0x1\n"
+ "mov x9, #0x0\n"
+ "whilelt p0.b, x9, x10\n"
+ "mov z25.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "sdot z25.s, z28.b, z3.b[0]\n"
+ "ld1w { z12.s }, p0/Z, [%x[params]]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "sdot z24.s, z28.b, z3.b[2]\n"
+ "mov x28, #0x0\n"
+ "mov z27.d, z0.d\n"
+ "sdot z17.s, z28.b, z4.b[0]\n"
+ "sdot z16.s, z28.b, z4.b[2]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "zip1 z1.d, z1.d, z19.d\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z5.d, z5.d, z18.d\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z6.d, z6.d, z22.d\n"
+ "zip1 z7.d, z7.d, z8.d\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "sdot z30.s, z28.b, z2.b[0]\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #1, MUL VL]\n"
+ "mov z29.s, #0x1\n"
+ "sdot z31.s, z28.b, z2.b[2]\n"
+ "sdot z25.s, z29.b, z3.b[1]\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z0.d, z0.d, z27.d\n"
+ "mov z1.q, z1.q[0]\n"
+ "sdot z24.s, z29.b, z3.b[3]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "mov z5.q, z5.q[0]\n"
+ "mov z6.q, z6.q[0]\n"
+ "sdot z17.s, z29.b, z4.b[1]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "mov z7.q, z7.q[0]\n"
+ "mov z22.s, #0x0\n"
+ "sdot z16.s, z29.b, z4.b[3]\n"
+ "addvl %x[params], %x[params], #5\n"
+ "mov z21.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "sdot z22.s, z28.b, z1.b[0]\n"
+ "mov z27.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "sdot z21.s, z28.b, z1.b[2]\n"
+ "mov z19.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "sdot z26.s, z28.b, z5.b[0]\n"
+ "sdot z27.s, z28.b, z5.b[2]\n"
+ "sdot z20.s, z28.b, z6.b[0]\n"
+ "mov z0.q, z0.q[0]\n"
+ "sdot z19.s, z28.b, z6.b[2]\n"
+ "sdot z18.s, z28.b, z7.b[0]\n"
+ "add z17.s, z25.s, z17.s\n"
+ "mov z25.s, #0x0\n"
+ "sdot z25.s, z28.b, z7.b[2]\n"
+ "sdot z30.s, z29.b, z2.b[1]\n"
+ "sdot z31.s, z29.b, z2.b[3]\n"
+ "add z16.s, z24.s, z16.s\n"
+ "sdot z22.s, z29.b, z1.b[1]\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z28.b, z0.b[0]\n"
+ "sdot z21.s, z29.b, z1.b[3]\n"
+ "sdot z26.s, z29.b, z5.b[1]\n"
+ "sdot z27.s, z29.b, z5.b[3]\n"
+ "add z30.s, z30.s, z17.s\n"
+ "sdot z20.s, z29.b, z6.b[1]\n"
+ "sdot z19.s, z29.b, z6.b[3]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "sdot z18.s, z29.b, z7.b[1]\n"
+ "sdot z25.s, z29.b, z7.b[3]\n"
+ "add z22.s, z22.s, z30.s\n"
+ "sdot z24.s, z29.b, z0.b[1]\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z20.s, z26.s, z20.s\n"
+ "add z19.s, z27.s, z19.s\n"
+ "add z18.s, z18.s, z17.s\n"
+ "mov z17.s, #0x0\n"
+ "sdot z17.s, z28.b, z0.b[2]\n"
+ "sdot z17.s, z29.b, z0.b[3]\n"
+ "add z16.s, z25.s, z16.s\n"
+ "add z24.s, z22.s, z24.s\n"
+ "add z25.s, z21.s, z17.s\n"
+ "mul z24.s, p2/M, z24.s, z23.s\n"
+ "mul z25.s, p2/M, z25.s, z23.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "add z27.s, z27.s, z21.s\n"
+ "mul z26.s, p2/M, z26.s, z23.s\n"
+ "mul z27.s, p2/M, z27.s, z23.s\n"
+ "add z28.s, z20.s, z30.s\n"
+ "add z29.s, z19.s, z31.s\n"
+ "mul z28.s, p2/M, z28.s, z23.s\n"
+ "mul z29.s, p2/M, z29.s, z23.s\n"
+ "add z30.s, z20.s, z18.s\n"
+ "add z31.s, z19.s, z16.s\n"
+ "mul z30.s, p2/M, z30.s, z23.s\n"
+ "mul z31.s, p2/M, z31.s, z23.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z12.s\n"
+ "add z25.s, z25.s, z12.s\n"
+ "add z26.s, z26.s, z12.s\n"
+ "add z27.s, z27.s, z12.s\n"
+ "add z28.s, z28.s, z12.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "add z30.s, z30.s, z12.s\n"
+ "add z31.s, z31.s, z12.s\n"
+ "1:" // Loop
+ "sdot z24.s, z8.b, z0.b[0]\n"
+ "sdot z25.s, z8.b, z0.b[2]\n"
+ "ld1w { z12.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "sdot z26.s, z8.b, z1.b[0]\n"
+ "sdot z27.s, z8.b, z1.b[2]\n"
+ "incb x9\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "sdot z24.s, z9.b, z0.b[1]\n"
+ "sdot z25.s, z9.b, z0.b[3]\n"
+ "whilelt p0.b, x9, x10\n"
+ "sdot z26.s, z9.b, z1.b[1]\n"
+ "sdot z27.s, z9.b, z1.b[3]\n"
+ "sdot z28.s, z8.b, z2.b[0]\n"
+ "sdot z29.s, z8.b, z2.b[2]\n"
+ "sdot z30.s, z8.b, z3.b[0]\n"
+ "sdot z31.s, z8.b, z3.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
+ "sdot z24.s, z10.b, z1.b[0]\n"
+ "sdot z25.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z27.s, z10.b, z2.b[2]\n"
+ "sdot z28.s, z9.b, z2.b[1]\n"
+ "sdot z29.s, z9.b, z2.b[3]\n"
+ "sdot z30.s, z9.b, z3.b[1]\n"
+ "sdot z31.s, z9.b, z3.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z24.s, z11.b, z1.b[1]\n"
+ "sdot z25.s, z11.b, z1.b[3]\n"
+ "sdot z26.s, z11.b, z2.b[1]\n"
+ "sdot z27.s, z11.b, z2.b[3]\n"
+ "sdot z28.s, z10.b, z3.b[0]\n"
+ "sdot z29.s, z10.b, z3.b[2]\n"
+ "sdot z30.s, z10.b, z4.b[0]\n"
+ "sdot z31.s, z10.b, z4.b[2]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sdot z24.s, z17.b, z2.b[0]\n"
+ "sdot z25.s, z17.b, z2.b[2]\n"
+ "sdot z26.s, z17.b, z3.b[0]\n"
+ "sdot z27.s, z17.b, z3.b[2]\n"
+ "sdot z28.s, z11.b, z3.b[1]\n"
+ "sdot z29.s, z11.b, z3.b[3]\n"
+ "sdot z30.s, z11.b, z4.b[1]\n"
+ "sdot z31.s, z11.b, z4.b[3]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z16.b, z2.b[1]\n"
+ "sdot z25.s, z16.b, z2.b[3]\n"
+ "sdot z26.s, z16.b, z3.b[1]\n"
+ "sdot z27.s, z16.b, z3.b[3]\n"
+ "sdot z28.s, z17.b, z4.b[0]\n"
+ "sdot z29.s, z17.b, z4.b[2]\n"
+ "sdot z30.s, z17.b, z5.b[0]\n"
+ "sdot z31.s, z17.b, z5.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z24.s, z19.b, z3.b[0]\n"
+ "sdot z25.s, z19.b, z3.b[2]\n"
+ "sdot z26.s, z19.b, z4.b[0]\n"
+ "sdot z27.s, z19.b, z4.b[2]\n"
+ "sdot z28.s, z16.b, z4.b[1]\n"
+ "sdot z29.s, z16.b, z4.b[3]\n"
+ "sdot z30.s, z16.b, z5.b[1]\n"
+ "sdot z31.s, z16.b, z5.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "sdot z24.s, z18.b, z3.b[1]\n"
+ "sdot z25.s, z18.b, z3.b[3]\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
+ "sdot z26.s, z18.b, z4.b[1]\n"
+ "sdot z27.s, z18.b, z4.b[3]\n"
+ "sdot z28.s, z19.b, z5.b[0]\n"
+ "sdot z29.s, z19.b, z5.b[2]\n"
+ "sdot z30.s, z19.b, z6.b[0]\n"
+ "sdot z31.s, z19.b, z6.b[2]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
+ "sdot z24.s, z17.b, z4.b[0]\n"
+ "sdot z25.s, z17.b, z4.b[2]\n"
+ "sdot z26.s, z17.b, z5.b[0]\n"
+ "sdot z27.s, z17.b, z5.b[2]\n"
+ "sdot z28.s, z18.b, z5.b[1]\n"
+ "sdot z29.s, z18.b, z5.b[3]\n"
+ "sdot z30.s, z18.b, z6.b[1]\n"
+ "sdot z31.s, z18.b, z6.b[3]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #-4, MUL VL]\n"
+ "sdot z24.s, z16.b, z4.b[1]\n"
+ "sdot z25.s, z16.b, z4.b[3]\n"
+ ".inst 0x04ac7718 // sqrdmulh z24.s, z24.s, z12.s\n"
+ "sdot z26.s, z16.b, z5.b[1]\n"
+ "sdot z27.s, z16.b, z5.b[3]\n"
+ ".inst 0x04ac7739 // sqrdmulh z25.s, z25.s, z12.s\n"
+ "sdot z28.s, z17.b, z6.b[0]\n"
+ "sdot z29.s, z17.b, z6.b[2]\n"
+ ".inst 0x04ac775a // sqrdmulh z26.s, z26.s, z12.s\n"
+ "sdot z30.s, z17.b, z7.b[0]\n"
+ "sdot z31.s, z17.b, z7.b[2]\n"
+ ".inst 0x04ac777b // sqrdmulh z27.s, z27.s, z12.s\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #-7, MUL VL]\n"
+ "sdot z28.s, z16.b, z6.b[1]\n"
+ "sdot z29.s, z16.b, z6.b[3]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "sdot z30.s, z16.b, z7.b[1]\n"
+ "sdot z31.s, z16.b, z7.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #-6, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
+ "addvl %x[params], %x[params], #-3\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04ac779c // sqrdmulh z28.s, z28.s, z12.s\n"
+ ".inst 0x04ac77bd // sqrdmulh z29.s, z29.s, z12.s\n"
+ ".inst 0x04ac77de // sqrdmulh z30.s, z30.s, z12.s\n"
+ ".inst 0x04ac77ff // sqrdmulh z31.s, z31.s, z12.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "smin z24.s, p2/M, z24.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "smin z26.s, p2/M, z26.s, z15.s\n"
+ "smin z27.s, p2/M, z27.s, z15.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "smin z28.s, p2/M, z28.s, z15.s\n"
+ "smin z29.s, p2/M, z29.s, z15.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
+ "smin z30.s, p2/M, z30.s, z15.s\n"
+ "smin z31.s, p2/M, z31.s, z15.s\n"
+ "smax z24.s, p2/M, z24.s, z14.s\n"
+ "smax z25.s, p2/M, z25.s, z14.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z14.s\n"
+ "smax z27.s, p2/M, z27.s, z14.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z14.s\n"
+ "smax z29.s, p2/M, z29.s, z14.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z14.s\n"
+ "smax z31.s, p2/M, z31.s, z14.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
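
Both generic kernels above also share the same accumulator initialisation trick: the `sdot` instructions against a vector of ones in the preamble compute, per output, the sum of the contributing input bytes; that sum is multiplied by the negated weight offset (the `neg` of the loaded `b_offset`) and folded into the bias before entering the loop. In scalar form (an illustrative model, not code from this patch):

#include <cstdint>

// With a weight zero point b_offset, the requantized convolution
//   sum_k x[k] * (w[k] - b_offset)
// factors into
//   sum_k x[k] * w[k]  -  b_offset * sum_k x[k],
// so the per-output input sums can be precomputed once (the
// sdot-with-all-ones trick) and folded into the bias.
int32_t offset_corrected_acc(const int8_t *x, const int8_t *w, int n,
                             int32_t bias, int32_t b_offset)
{
    int32_t sum_xw = 0;
    int32_t sum_x  = 0;
    for (int k = 0; k < n; ++k)
    {
        sum_xw += (int32_t) x[k] * (int32_t) w[k];
        sum_x  += (int32_t) x[k];
    }
    return bias + sum_xw - b_offset * sum_x;
}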
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..6799b10ed9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
+
+class sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_sve_s8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_sve_s8q_3x3_dot::pack_parameters(
+ args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
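
For orientation, this is roughly how the depthwise driver consumes a strategy such as the one above, assuming the library's own types (CPUInfo, DepthwiseArgs, Requantize32) are in scope; the function and its locals are illustrative, and the real call sites live elsewhere in the library:

#include <cstdint>
#include <vector>

void prepare_parameters_sketch(const CPUInfo *ci,
                               const arm_conv::depthwise::DepthwiseArgs &args,
                               const arm_gemm::Requantize32 &qp,
                               const int32_t *biases, const int8_t *weights,
                               size_t ld_weight_col, size_t ld_weight_row)
{
    arm_conv::depthwise::sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst strat(ci);

    // Size the packed buffer, then interleave biases, weights and the
    // per-channel requantize parameters into it, exactly as the header's
    // get_storage_size()/pack_parameters() overrides delegate to
    // interleave_sve_s8q_3x3_dot.
    std::vector<uint8_t> buffer(strat.get_storage_size(args));
    strat.pack_parameters(args, buffer.data(), biases, qp, weights,
                          ld_weight_col, ld_weight_row);
}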
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d9c8644fc4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
+{
+ __asm__ __volatile__(
+ "mov x13, #0x0\n"
+ "whilelt p0.b, x13, %x[n_channels]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ldp x23, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "ptrue p2.b\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [%x[outptrs], #0x0]\n"
+ "ldp x9, x28, [%x[outptrs], #0x10]\n"
+ "ld1b { z15.b }, p0/Z, [x27, x13]\n"
+ "ld1b { z18.b }, p0/Z, [x26, x13]\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "ld1b { z16.b }, p0/Z, [x25, x13]\n"
+ "zip2 z17.b, z15.b, z16.b\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "ld1b { z14.b }, p0/Z, [x24, x13]\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "zip1 z16.b, z18.b, z14.b\n"
+ "zip2 z14.b, z18.b, z14.b\n"
+ "ld1b { z13.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z18.b }, p0/Z, [x22, x13]\n"
+ "zip2 z12.b, z15.b, z16.b\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ld1b { z16.b }, p0/Z, [x21, x13]\n"
+ "zip1 z11.b, z17.b, z14.b\n"
+ "zip2 z14.b, z17.b, z14.b\n"
+ "ld1b { z10.b }, p0/Z, [x20, x13]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip2 z22.b, z13.b, z16.b\n"
+ "zip1 z13.b, z13.b, z16.b\n"
+ "ld1b { z9.b }, p0/Z, [x27, x13]\n"
+ "ld1b { z17.b }, p0/Z, [x26, x13]\n"
+ "zip1 z21.b, z18.b, z10.b\n"
+ "zip2 z10.b, z18.b, z10.b\n"
+ "ld1b { z16.b }, p0/Z, [x25, x13]\n"
+ "ld1b { z8.b }, p0/Z, [x24, x13]\n"
+ "zip2 z20.b, z9.b, z16.b\n"
+ "zip1 z9.b, z9.b, z16.b\n"
+ "ld1b { z7.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x13]\n"
+ "zip1 z18.b, z17.b, z8.b\n"
+ "zip2 z8.b, z17.b, z8.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z6.b }, p0/Z, [x20, x13]\n"
+ "zip2 z17.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "zip1 z16.b, z19.b, z6.b\n"
+ "zip2 z6.b, z19.b, z6.b\n"
+ "ld1w { z5.s }, p2/Z, [%x[params]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z2.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip2 z1.b, z13.b, z21.b\n"
+ "zip1 z13.b, z13.b, z21.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "zip1 z0.b, z22.b, z10.b\n"
+ "zip2 z10.b, z22.b, z10.b\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip2 z31.b, z9.b, z18.b\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z30.b, z20.b, z8.b\n"
+ "zip2 z8.b, z20.b, z8.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "zip2 z27.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "addvl %x[params], %x[params], #4\n"
+ "zip1 z25.b, z17.b, z6.b\n"
+ "zip2 z6.b, z17.b, z6.b\n"
+ "mov z24.d, z5.d\n"
+ "mov z22.d, z5.d\n"
+ "mov z21.d, z5.d\n"
+ "1:" // Loop
+ "sdot z5.s, z29.b, z15.b\n"
+ "sdot z22.s, z29.b, z13.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z5.s, z28.b, z13.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "sdot z24.s, z29.b, z15.b\n"
+ "ld1w { z17.s }, p2/Z, [%x[params]]\n"
+ "sdot z21.s, z29.b, z13.b\n"
+ "sdot z22.s, z28.b, z9.b\n"
+ "incw x13, ALL, MUL #4\n"
+ "sdot z5.s, z26.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "sdot z24.s, z28.b, z13.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z21.s, z28.b, z9.b\n"
+ "sdot z22.s, z26.b, z7.b\n"
+ "ext z7.b, z7.b, z7.b, #0x1\n"
+ ".inst 0x04b174a5 // sqrdmulh z5.s, z5.s, z17.s\n"
+ "sdot z24.s, z26.b, z9.b\n"
+ "sdot z21.s, z26.b, z7.b\n"
+ "and z16.d, z5.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b17718 // sqrdmulh z24.s, z24.s, z17.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ "sqadd z5.s, z5.s, z16.s\n"
+ ".inst 0x44828a85 // srshl z5.s, p2/M, z5.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z5.s, z5.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z5.s, p2/M, z5.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z5.s, p2/M, z5.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z5.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z24.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z22.d, z23.d\n"
+ "sdot z22.s, z18.b, z1.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z21.d, z23.d\n"
+ "sdot z23.s, z18.b, z12.b\n"
+ "sdot z23.s, z17.b, z1.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "ext z1.b, z1.b, z1.b, #0x1\n"
+ "sdot z24.s, z18.b, z12.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "sdot z21.s, z18.b, z1.b\n"
+ "sdot z22.s, z17.b, z31.b\n"
+ "incw x12\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z31.b\n"
+ "ext z31.b, z31.b, z31.b, #0x1\n"
+ "sdot z24.s, z17.b, z1.b\n"
+ "addvl %x[params], %x[params], #16\n"
+ "sdot z21.s, z17.b, z31.b\n"
+ "sdot z22.s, z16.b, z27.b\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z24.s, z16.b, z31.b\n"
+ "sdot z21.s, z16.b, z27.b\n"
+ "and z16.d, z23.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
+ ".inst 0x04b376d6 // sqrdmulh z22.s, z22.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828a97 // srshl z23.s, p2/M, z23.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z23.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z24.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z22.d, z23.d\n"
+ "sdot z22.s, z18.b, z0.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z21.d, z23.d\n"
+ "sdot z23.s, z18.b, z11.b\n"
+ "sdot z23.s, z17.b, z0.b\n"
+ "ext z11.b, z11.b, z11.b, #0x1\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "sdot z24.s, z18.b, z11.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z21.s, z18.b, z0.b\n"
+ "sdot z22.s, z17.b, z30.b\n"
+ "incw x12\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z30.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "sdot z24.s, z17.b, z0.b\n"
+ "sdot z21.s, z17.b, z30.b\n"
+ "sdot z22.s, z16.b, z25.b\n"
+ "ext z25.b, z25.b, z25.b, #0x1\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z24.s, z16.b, z30.b\n"
+ "sdot z21.s, z16.b, z25.b\n"
+ "and z16.d, z23.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
+ ".inst 0x04b376d6 // sqrdmulh z22.s, z22.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828a97 // srshl z23.s, p2/M, z23.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z23.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z29.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z28.d, z23.d\n"
+ "sdot z28.s, z18.b, z10.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z27.d, z23.d\n"
+ "sdot z23.s, z18.b, z14.b\n"
+ "sdot z23.s, z17.b, z10.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "ext z10.b, z10.b, z10.b, #0x1\n"
+ "sdot z29.s, z18.b, z14.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z27.s, z18.b, z10.b\n"
+ "sdot z28.s, z17.b, z8.b\n"
+ "incw x12\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z8.b\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "sdot z29.s, z17.b, z10.b\n"
+ "whilelt p0.b, x13, %x[n_channels]\n"
+ "sdot z27.s, z17.b, z8.b\n"
+ "sdot z28.s, z16.b, z6.b\n"
+ "ext z6.b, z6.b, z6.b, #0x1\n"
+ "ld1b { z26.b }, p0/Z, [x26, x13]\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z29.s, z16.b, z8.b\n"
+ "sdot z27.s, z16.b, z6.b\n"
+ "ld1b { z21.b }, p0/Z, [x25, x13]\n"
+ "and z16.d, z23.d, z22.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "ld1b { z14.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x13]\n"
+ ".inst 0x04b377bd // sqrdmulh z29.s, z29.s, z19.s\n"
+ ".inst 0x04b3779c // sqrdmulh z28.s, z28.s, z19.s\n"
+ "ld1b { z20.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z10.b }, p0/Z, [x20, x13]\n"
+ ".inst 0x04b3777b // sqrdmulh z27.s, z27.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "ld1b { z15.b }, p0/Z, [x27, x13]\n"
+ "and z19.d, z29.d, z22.d\n"
+ "and z17.d, z28.d, z22.d\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "and z16.d, z27.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "ld1b { z9.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x13]\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "ld1b { z18.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z8.b }, p0/Z, [x20, x13]\n"
+ "sqadd z29.s, z29.s, z19.s\n"
+ "sqadd z28.s, z28.s, z17.s\n"
+ ".inst 0x44828add // srshl z29.s, p2/M, z29.s, z22.s\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z28.s, z28.s, z2.s\n"
+ "ld1b { z13.b }, p0/Z, [x24, x13]\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "add z27.s, z27.s, z2.s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z29.s, p2/M, z29.s, z4.s\n"
+ "smax z28.s, p2/M, z28.s, z4.s\n"
+ "smax z27.s, p2/M, z27.s, z4.s\n"
+ "st1b { z23.s }, p1, [x11, x12]\n"
+ "ld1b { z7.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z23.b }, p0/Z, [x22, x13]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x13]\n"
+ "zip2 z17.b, z15.b, z21.b\n"
+ "zip1 z15.b, z15.b, z21.b\n"
+ "ld1b { z6.b }, p0/Z, [x20, x13]\n"
+ "zip1 z16.b, z26.b, z14.b\n"
+ "zip2 z14.b, z26.b, z14.b\n"
+ "smin z29.s, p2/M, z29.s, z3.s\n"
+ "smin z28.s, p2/M, z28.s, z3.s\n"
+ "smin z27.s, p2/M, z27.s, z3.s\n"
+ "st1b { z29.s }, p1, [x10, x12]\n"
+ "zip2 z12.b, z15.b, z16.b\n"
+ "st1b { z28.s }, p1, [x9, x12]\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "zip1 z11.b, z17.b, z14.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "st1b { z27.s }, p1, [x28, x12]\n"
+ "zip2 z14.b, z17.b, z14.b\n"
+ "zip2 z21.b, z13.b, z20.b\n"
+ "ld1w { z5.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "zip1 z13.b, z13.b, z20.b\n"
+ "zip1 z20.b, z25.b, z10.b\n"
+ "incw x12\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "zip2 z10.b, z25.b, z10.b\n"
+ "zip2 z19.b, z9.b, z18.b\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z18.b, z24.b, z8.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "zip2 z8.b, z24.b, z8.b\n"
+ "zip2 z17.b, z7.b, z22.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #8\n"
+ "zip1 z7.b, z7.b, z22.b\n"
+ "zip1 z16.b, z23.b, z6.b\n"
+ "zip2 z6.b, z23.b, z6.b\n"
+ "zip2 z1.b, z13.b, z20.b\n"
+ "zip1 z13.b, z13.b, z20.b\n"
+ "zip1 z0.b, z21.b, z10.b\n"
+ "zip2 z10.b, z21.b, z10.b\n"
+ "zip2 z31.b, z9.b, z18.b\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z30.b, z19.b, z8.b\n"
+ "zip2 z8.b, z19.b, z8.b\n"
+ "zip2 z27.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "zip1 z25.b, z17.b, z6.b\n"
+ "zip2 z6.b, z17.b, z6.b\n"
+ "mov z24.d, z5.d\n"
+ "mov z22.d, z5.d\n"
+ "mov z21.d, z5.d\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
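
The `zip1`/`zip2` ladder at the top of the kernel above is a two-level byte interleave: it merges four per-pixel input vectors so that each 4-byte group, i.e. one `sdot` lane's operand, carries the channel-i byte of each of the four pixels. A scalar model of the resulting layout (the `zip2` halves handle the upper channel range in the same way):

#include <cstdint>

// Two levels of byte-wise zip turn four per-pixel channel vectors
// p0..p3 into groups of four bytes per channel, which is exactly the
// shape a 4-way sdot lane consumes. The kernel pairs pixels (0,2) and
// (1,3) at the first zip level, so the second level emits the bytes in
// natural pixel order, as modelled here.
void interleave4_sketch(const int8_t *p0, const int8_t *p1,
                        const int8_t *p2, const int8_t *p3,
                        int8_t *out, int n_channels)
{
    for (int i = 0; i < n_channels; ++i)
    {
        out[4 * i + 0] = p0[i];
        out[4 * i + 1] = p1[i];
        out[4 * i + 2] = p2[i];
        out[4 * i + 3] = p3[i];
    }
}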
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..6b006e8d51
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *);
+
+class sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_sve_u8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_sve_u8q_3x3_dot::pack_parameters(
+ args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const uint8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f0860c98b9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,497 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const uint8_t *const *const inptrs, const uint8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, uint8_t *const *const outptrs)
+{
+ __asm__ __volatile__(
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ldp x23, x22, [%x[inptrs], #0x20]\n"
+ "ldp x13, x21, [%x[inptrs], #0x30]\n"
+ "mov x20, #0x1\n"
+ "ptrue p2.b\n"
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
+ "orr x20, x20, #0x100\n"
+ "orr x20, x20, #0x10000\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z21.b }, p0/Z, [x26, x14]\n"
+ "dup z25.s, w20\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x25, x14]\n"
+ "zip2 z16.b, z15.b, z31.b\n"
+ "zip1 z15.b, z15.b, z31.b\n"
+ "ld1b { z29.b }, p0/Z, [x24, x14]\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "zip1 z30.b, z21.b, z29.b\n"
+ "zip2 z29.b, z21.b, z29.b\n"
+ "ld1b { z9.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z20.b }, p0/Z, [x22, x14]\n"
+ "zip2 z13.b, z15.b, z30.b\n"
+ "zip1 z15.b, z15.b, z30.b\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ld1b { z5.b }, p0/Z, [x13, x14]\n"
+ "zip1 z14.b, z16.b, z29.b\n"
+ "zip2 z29.b, z16.b, z29.b\n"
+ "ld1b { z17.b }, p0/Z, [x21, x14]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip2 z31.b, z9.b, z5.b\n"
+ "zip1 z9.b, z9.b, z5.b\n"
+ "ld1b { z18.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x26, x14]\n"
+ "zip1 z21.b, z20.b, z17.b\n"
+ "zip2 z17.b, z20.b, z17.b\n"
+ "ld1b { z6.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x24, x14]\n"
+ "zip2 z23.b, z18.b, z6.b\n"
+ "zip1 z18.b, z18.b, z6.b\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x14]\n"
+ "zip1 z24.b, z28.b, z4.b\n"
+ "zip2 z4.b, z28.b, z4.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z22.b, z2.b, z16.b\n"
+ "zip1 z2.b, z2.b, z16.b\n"
+ "zip1 z0.b, z19.b, z5.b\n"
+ "zip2 z5.b, z19.b, z5.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "zip2 z19.b, z9.b, z21.b\n"
+ "zip1 z9.b, z9.b, z21.b\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "zip1 z11.b, z31.b, z17.b\n"
+ "zip2 z17.b, z31.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z12.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip1 z20.b, z23.b, z4.b\n"
+ "zip2 z4.b, z23.b, z4.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z24.b, z2.b, z0.b\n"
+ "zip1 z2.b, z2.b, z0.b\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "zip1 z0.b, z22.b, z5.b\n"
+ "zip2 z5.b, z22.b, z5.b\n"
+ "addvl %x[params], %x[params], #4\n"
+ "mov z22.d, z10.d\n"
+ "mov z31.d, z10.d\n"
+ "mov z21.d, z10.d\n"
+ "1:" // Loop
+ "mov z30.s, #0x0\n"
+ "udot z30.s, z25.b, z9.b\n"
+ "udot z10.s, z26.b, z15.b\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "udot z30.s, z25.b, z18.b\n"
+ "udot z31.s, z26.b, z9.b\n"
+ "mov z27.s, #0x0\n"
+ "incw x14, ALL, MUL #4\n"
+ "udot z10.s, z3.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "movprfx z28, z30\n udot z28.s, z25.b, z2.b\n"
+ "udot z30.s, z25.b, z15.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "udot z27.s, z25.b, z9.b\n"
+ "udot z31.s, z3.b, z18.b\n"
+ "udot z10.s, z1.b, z18.b\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "udot z22.s, z26.b, z15.b\n"
+ "udot z21.s, z26.b, z9.b\n"
+ "udot z27.s, z25.b, z18.b\n"
+ "udot z31.s, z1.b, z2.b\n"
+ "ext z2.b, z2.b, z2.b, #0x1\n"
+ "udot z22.s, z3.b, z9.b\n"
+ "udot z21.s, z3.b, z18.b\n"
+ "ld1w { z3.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "mls z10.s, p2/M, z30.s, z8.s\n"
+ "movprfx z26, z27\n udot z26.s, z25.b, z2.b\n"
+ "mov z9.s, #0x0\n"
+ "udot z27.s, z25.b, z15.b\n"
+ "ld1w { z23.s }, p2/Z, [%x[params]]\n"
+ "udot z22.s, z1.b, z18.b\n"
+ ".inst 0x04b7754a // sqrdmulh z10.s, z10.s, z23.s\n"
+ "udot z21.s, z1.b, z2.b\n"
+ "mls z22.s, p2/M, z27.s, z8.s\n"
+ "and z18.d, z10.d, z3.d\n"
+ "mls z31.s, p2/M, z28.s, z8.s\n"
+ "mls z21.s, p2/M, z26.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ "udot z9.s, z25.b, z19.b\n"
+ ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
+ "sqadd z10.s, z10.s, z18.s\n"
+ ".inst 0x4482886a // srshl z10.s, p2/M, z10.s, z3.s\n"
+ "udot z9.s, z25.b, z12.b\n"
+ "and z28.d, z22.d, z3.d\n"
+ "and z23.d, z31.d, z3.d\n"
+ "movprfx z27, z9\n udot z27.s, z25.b, z24.b\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z21.d, z3.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "udot z9.s, z25.b, z13.b\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
+ "sqadd z21.s, z21.s, z18.s\n"
+ "add z10.s, z10.s, z16.s\n"
+ ".inst 0x44828875 // srshl z21.s, p2/M, z21.s, z3.s\n"
+ "smax z10.s, p2/M, z10.s, z7.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z21.s, p2/M, z21.s, z7.s\n"
+ "st1b { z10.s }, p0, [x12, x28]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "st1b { z22.s }, p0, [x11, x28]\n"
+ "mov z26.d, z28.d\n"
+ "ld1b { z15.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z31.d, z28.d\n"
+ "udot z31.s, z1.b, z19.b\n"
+ "ld1b { z23.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x9, x28]\n"
+ "mov z22.d, z28.d\n"
+ "udot z28.s, z1.b, z13.b\n"
+ "udot z28.s, z15.b, z19.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "udot z26.s, z1.b, z13.b\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov z18.s, #0x0\n"
+ "udot z22.s, z1.b, z19.b\n"
+ "udot z18.s, z25.b, z19.b\n"
+ "incw x28\n"
+ "udot z31.s, z15.b, z12.b\n"
+ "udot z28.s, z23.b, z12.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "udot z26.s, z15.b, z19.b\n"
+ "udot z22.s, z15.b, z12.b\n"
+ "addvl %x[params], %x[params], #16\n"
+ "udot z18.s, z25.b, z12.b\n"
+ "udot z31.s, z23.b, z24.b\n"
+ "ext z24.b, z24.b, z24.b, #0x1\n"
+ "mls z28.s, p2/M, z9.s, z8.s\n"
+ "udot z26.s, z23.b, z12.b\n"
+ ".inst 0x04be779c // sqrdmulh z28.s, z28.s, z30.s\n"
+ "udot z22.s, z23.b, z24.b\n"
+ "movprfx z12, z18\n udot z12.s, z25.b, z24.b\n"
+ "and z2.d, z28.d, z21.d\n"
+ "udot z18.s, z25.b, z13.b\n"
+ "mls z26.s, p2/M, z18.s, z8.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "mls z31.s, p2/M, z27.s, z8.s\n"
+ "mls z22.s, p2/M, z12.s, z8.s\n"
+ ".inst 0x04be775a // sqrdmulh z26.s, z26.s, z30.s\n"
+ ".inst 0x04be77ff // sqrdmulh z31.s, z31.s, z30.s\n"
+ ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "sqadd z28.s, z28.s, z2.s\n"
+ "and z24.d, z26.d, z21.d\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "and z23.d, z31.d, z21.d\n"
+ "and z18.d, z22.d, z21.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z24.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ "ld1b { z30.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ "sqadd z22.s, z22.s, z18.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "udot z24.s, z25.b, z11.b\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "st1b { z28.s }, p0, [x12, x28]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "st1b { z26.s }, p0, [x11, x28]\n"
+ "mov z28.d, z23.d\n"
+ "udot z24.s, z25.b, z20.b\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z27.d, z23.d\n"
+ "udot z27.s, z19.b, z11.b\n"
+ "movprfx z13, z24\n udot z13.s, z25.b, z0.b\n"
+ "st1b { z22.s }, p0, [x9, x28]\n"
+ "mov z26.d, z23.d\n"
+ "udot z23.s, z19.b, z14.b\n"
+ "udot z23.s, z30.b, z11.b\n"
+ "udot z24.s, z25.b, z14.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "udot z28.s, z19.b, z14.b\n"
+ "ext z11.b, z11.b, z11.b, #0x1\n"
+ "mov z12.s, #0x0\n"
+ "udot z26.s, z19.b, z11.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "udot z12.s, z25.b, z11.b\n"
+ "udot z27.s, z30.b, z20.b\n"
+ "incw x28\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "udot z23.s, z21.b, z20.b\n"
+ "ext z20.b, z20.b, z20.b, #0x1\n"
+ "udot z28.s, z30.b, z11.b\n"
+ "udot z26.s, z30.b, z20.b\n"
+ "udot z12.s, z25.b, z20.b\n"
+ "udot z27.s, z21.b, z0.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "mls z23.s, p2/M, z24.s, z8.s\n"
+ "udot z28.s, z21.b, z20.b\n"
+ "udot z26.s, z21.b, z0.b\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ "movprfx z19, z12\n udot z19.s, z25.b, z0.b\n"
+ "udot z12.s, z25.b, z14.b\n"
+ "and z18.d, z23.d, z22.d\n"
+ "mls z28.s, p2/M, z12.s, z8.s\n"
+ "mls z27.s, p2/M, z13.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "mls z26.s, p2/M, z19.s, z8.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ "ld1w { z2.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sqadd z23.s, z23.s, z18.s\n"
+ "and z20.d, z28.d, z22.d\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "and z19.d, z27.d, z22.d\n"
+ "and z18.d, z26.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z20.s\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
+ "ld1b { z13.b }, p2/Z, [%x[params]]\n"
+ "sqadd z27.s, z27.s, z19.s\n"
+ "sqadd z26.s, z26.s, z18.s\n"
+ ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
+ ".inst 0x44828ada // srshl z26.s, p2/M, z26.s, z22.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "smax z23.s, p2/M, z23.s, z7.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "smax z27.s, p2/M, z27.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "udot z24.s, z25.b, z17.b\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "st1b { z23.s }, p0, [x12, x28]\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "st1b { z28.s }, p0, [x11, x28]\n"
+ "mov z0.d, z1.d\n"
+ "udot z24.s, z25.b, z4.b\n"
+ "st1b { z27.s }, p0, [x10, x28]\n"
+ "mov z31.d, z1.d\n"
+ "udot z31.s, z21.b, z17.b\n"
+ "movprfx z23, z24\n udot z23.s, z25.b, z5.b\n"
+ "st1b { z26.s }, p0, [x9, x28]\n"
+ "mov z30.d, z1.d\n"
+ "udot z1.s, z21.b, z29.b\n"
+ "udot z1.s, z13.b, z17.b\n"
+ "udot z24.s, z25.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "udot z0.s, z21.b, z29.b\n"
+ "ext z17.b, z17.b, z17.b, #0x1\n"
+ "mov z19.s, #0x0\n"
+ "udot z30.s, z21.b, z17.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "udot z19.s, z25.b, z17.b\n"
+ "udot z31.s, z13.b, z4.b\n"
+ "incw x28\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "udot z1.s, z20.b, z4.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "udot z0.s, z13.b, z17.b\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "udot z30.s, z13.b, z4.b\n"
+ "udot z19.s, z25.b, z4.b\n"
+ "ld1b { z13.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "udot z31.s, z20.b, z5.b\n"
+ "ext z5.b, z5.b, z5.b, #0x1\n"
+ "mls z1.s, p2/M, z24.s, z8.s\n"
+ "ld1b { z27.b }, p0/Z, [x22, x14]\n"
+ "udot z0.s, z20.b, z4.b\n"
+ "udot z30.s, z20.b, z5.b\n"
+ ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
+ "ld1b { z26.b }, p0/Z, [x21, x14]\n"
+ "movprfx z18, z19\n udot z18.s, z25.b, z5.b\n"
+ "udot z19.s, z25.b, z29.b\n"
+ "and z11.d, z1.d, z22.d\n"
+ "ld1b { z29.b }, p0/Z, [x23, x14]\n"
+ "mls z0.s, p2/M, z19.s, z8.s\n"
+ "mls z31.s, p2/M, z23.s, z8.s\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ "ld1b { z17.b }, p0/Z, [x20, x14]\n"
+ "mls z30.s, p2/M, z18.s, z8.s\n"
+ ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
+ ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
+ ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "sqadd z1.s, z1.s, z11.s\n"
+ "and z21.d, z0.d, z22.d\n"
+ ".inst 0x44828ac1 // srshl z1.s, p2/M, z1.s, z22.s\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "and z20.d, z31.d, z22.d\n"
+ "and z19.d, z30.d, z22.d\n"
+ "ld1b { z18.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z11.b }, p0/Z, [x22, x14]\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "ld1b { z24.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z21.s\n"
+ ".inst 0x44828ac0 // srshl z0.s, p2/M, z0.s, z22.s\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ "sqadd z30.s, z30.s, z19.s\n"
+ ".inst 0x44828adf // srshl z31.s, p2/M, z31.s, z22.s\n"
+ ".inst 0x44828ade // srshl z30.s, p2/M, z30.s, z22.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "smax z1.s, p2/M, z1.s, z7.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "ld1b { z9.b }, p0/Z, [x24, x14]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "smin z1.s, p2/M, z1.s, z6.s\n"
+ "smax z0.s, p2/M, z0.s, z7.s\n"
+ "st1b { z1.s }, p1, [x12, x28]\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z30.s, p2/M, z30.s, z7.s\n"
+ "ld1b { z23.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z20.b, z15.b, z28.b\n"
+ "zip1 z15.b, z15.b, z28.b\n"
+ "smin z0.s, p2/M, z0.s, z6.s\n"
+ "zip1 z19.b, z13.b, z29.b\n"
+ "zip2 z29.b, z13.b, z29.b\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ "st1b { z0.s }, p1, [x11, x28]\n"
+ "zip2 z13.b, z15.b, z19.b\n"
+ "zip1 z15.b, z15.b, z19.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "st1b { z31.s }, p1, [x10, x28]\n"
+ "zip1 z14.b, z20.b, z29.b\n"
+ "zip2 z29.b, z20.b, z29.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z30.s }, p1, [x9, x28]\n"
+ "zip2 z21.b, z9.b, z26.b\n"
+ "zip1 z9.b, z9.b, z26.b\n"
+ "incw x28\n"
+ "zip1 z20.b, z27.b, z17.b\n"
+ "zip2 z17.b, z27.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z31.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "zip1 z27.b, z11.b, z4.b\n"
+ "zip2 z4.b, z11.b, z4.b\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #8\n"
+ "zip2 z30.b, z2.b, z22.b\n"
+ "zip1 z2.b, z2.b, z22.b\n"
+ "zip1 z28.b, z23.b, z5.b\n"
+ "zip2 z5.b, z23.b, z5.b\n"
+ "zip2 z19.b, z9.b, z20.b\n"
+ "zip1 z9.b, z9.b, z20.b\n"
+ "zip1 z11.b, z21.b, z17.b\n"
+ "zip2 z17.b, z21.b, z17.b\n"
+ "zip2 z12.b, z18.b, z27.b\n"
+ "zip1 z18.b, z18.b, z27.b\n"
+ "zip1 z20.b, z31.b, z4.b\n"
+ "zip2 z4.b, z31.b, z4.b\n"
+ "zip2 z24.b, z2.b, z28.b\n"
+ "zip1 z2.b, z2.b, z28.b\n"
+ "zip1 z0.b, z30.b, z5.b\n"
+ "zip2 z5.b, z30.b, z5.b\n"
+ "mov z22.d, z10.d\n"
+ "mov z31.d, z10.d\n"
+ "mov z21.d, z10.d\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
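
For reference, the store-side sequence repeated throughout the listing above (mls, sqrdmulh, and/asr/sqadd, srshl, add, smax/smin, st1b) is the Requantize32 output pipeline: subtract a row-sum correction (the z8 operand of the mls plausibly holds the b_offset), apply a saturating rounding doubling multiply-high by the per-channel multiplier, perform a rounding right shift, add the output offset, and clamp. A minimal scalar sketch of that math — assuming the multiplier and (non-positive) shift come from the requant_muls/requant_shifts arrays, and ignoring both the sqrdmulh saturation corner case and the negative-value rounding fixup implemented by the and/asr/sqadd triple:

#include <algorithm>
#include <cstdint>

// Scalar model of the requantization tail above. 'shift' is <= 0:
// srshl with a negative shift amount is a rounding right shift.
inline uint8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                          int32_t c_offset, int32_t minval, int32_t maxval)
{
    // sqrdmulh: saturating rounding doubling multiply, high half.
    // Equivalent (away from the INT32_MIN*INT32_MIN corner) to:
    const int64_t prod = (int64_t)acc * (int64_t)mul;
    int32_t v = (int32_t)((prod + (1LL << 30)) >> 31);

    // srshl by a negative amount: rounding arithmetic right shift.
    const int32_t n = -shift;
    if (n > 0)
        v = (int32_t)(((int64_t)v + (1LL << (n - 1))) >> n);

    v += c_offset;                                 // output zero point
    v = std::max(minval, std::min(maxval, v));     // activation clamp
    return (uint8_t)v;
}
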
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..0300b71d7c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
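
One plausible reading of the positional constructor call in the strategy class above, inferred only from the class name and the constants declared alongside it (the parameter names of DepthwiseDepthfirstStrategy are not shown in this patch):

// Assumed mapping of Parent(2, 2, 3, 3, 1, 1) in
// sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst:
//   Parent(output_rows = 2, output_cols = 2,
//          kernel_rows = 3, kernel_cols = 3,
//          stride_rows = 1, stride_cols = 1);
// i.e. each kernel invocation computes a 2x2 output tile of a
// 3x3, stride-1 depthwise convolution, matching the class name.
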
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..5c26010c0d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x16, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x16\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z14.h }, p4/Z, [x14]\n"
+ "ld1b { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1b { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e19ce // usublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e1ab5 // usublb z21.h, z21.b, z30.b\n"
+ "ld1b { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1821 // usublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e18c6 // usublb z6.h, z6.b, z30.b\n"
+ "ld1b { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1b { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1a52 // usublb z18.h, z18.b, z30.b\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e18e7 // usublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e194a // usublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "1:" // Loop
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1b { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1b { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1b { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1a73 // usublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ "ld1b { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ "ld1b { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ "ld1b { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
+ "inch x16\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
+ "incw x20\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z7.d, z25.d, z1.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
+ ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n"
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n"
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1b { z14.h }, p4/Z, [x14]\n"
+ "ld1b { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1b { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e19ce // usublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e1ab5 // usublb z21.h, z21.b, z30.b\n"
+ "ld1b { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1821 // usublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e18c6 // usublb z6.h, z6.b, z30.b\n"
+ "ld1b { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1b { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1a52 // usublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e18e7 // usublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e194a // usublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
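
The accumulation pattern in the loop above pairs usublb (widen u8 to i16 while subtracting a zero point: the a_offset from activations, the b_offset from weights, per the Requantize32 operands in the constraint list) with smlalb/smlalt (accumulate i16 products into i32 lanes, seeded from the bias via ld1w and uzp1/uzp2). A scalar model of one output element, with schematic indexing rather than the kernel's actual inptrs permutation:

#include <cstdint>

// Scalar model of one 3x3 u8q depthwise accumulation as performed by the
// usublb + smlalb/smlalt sequences above. 'x' and 'w' are illustrative
// flat 9-element views of the input patch and the per-channel weights.
int32_t depthwise_accumulate_3x3(const uint8_t *x, const uint8_t *w,
                                 int32_t bias, int32_t a_offset,
                                 int32_t b_offset)
{
    int32_t acc = bias;
    for (int i = 0; i < 9; i++)
    {
        const int16_t xv = (int16_t)x[i] - (int16_t)a_offset; // usublb
        const int16_t wv = (int16_t)w[i] - (int16_t)b_offset; // usublb
        acc += (int32_t)xv * (int32_t)wv;                     // smlalb/t
    }
    return acc; // then requantized and clamped by the loop tail
}
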
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..bcd0d60d3c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..1ea2fcbfbd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x7, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x7\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
+ "ldp x16, x15, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x7, x8\n"
+ "ldp x14, x13, [x24, #0x10]\n"
+ "whilelt p2.s, x7, x8\n"
+ "whilelt p1.s, x23, x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z25.h }, p4/Z, [x17]\n"
+ "ld1b { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
+ "ld1b { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1b39 // usublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d1bde // usublb z30.h, z30.b, z13.b\n"
+ "ld1b { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d19ce // usublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ "ld1b { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d194a // usublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1b { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "mov z18.d, z8.d\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d1af7 // usublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d18e7 // usublb z7.h, z7.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "1:" // Loop
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z29.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1b { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1b { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1b { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ld1b { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1b { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1b { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "ld1b { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a1bde // usublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
+ "inch x7\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
+ "mov x20, x7\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
+ "incw x20\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "whilelt p2.s, x7, x8\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
+ "whilelt p1.s, x20, x8\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
+ "whilelt p3.h, x7, x8\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1b { z25.h }, p4/Z, [x17]\n"
+ "ld1b { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "inch x10\n"
+ "ld1b { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1b39 // usublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d1bde // usublb z30.h, z30.b, z13.b\n"
+ "ld1b { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d19ce // usublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ "ld1b { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d194a // usublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1b { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d1af7 // usublb z23.h, z23.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d18e7 // usublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
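
As its name states, the kernel above computes a 2x2 output tile of a 3x3, stride-2 depthwise convolution in NHWC layout. A schematic per-channel reference for that tile shape — parameter names, strides, and layout here are illustrative; padding and the kernel's actual inptrs ordering (set up by the Params constructor) are not modeled:

#include <cstddef>
#include <cstdint>

// Reference for the 2x2-output, 3x3, stride-2 depthwise tile, one channel.
// 'out' receives the four i32 accumulators prior to requantization.
void depthwise_3x3_s2_2x2_ref(const uint8_t *in, size_t in_row_stride,
                              size_t in_col_stride, const uint8_t *w,
                              int32_t bias, int32_t a_off, int32_t b_off,
                              int32_t *out)
{
    for (int oy = 0; oy < 2; oy++)
        for (int ox = 0; ox < 2; ox++)
        {
            int32_t acc = bias;
            for (int ky = 0; ky < 3; ky++)
                for (int kx = 0; kx < 3; kx++)
                {
                    const uint8_t xv = in[(2 * oy + ky) * in_row_stride +
                                          (2 * ox + kx) * in_col_stride];
                    acc += ((int32_t)xv - a_off) *
                           ((int32_t)w[ky * 3 + kx] - b_off);
                }
            out[oy * 2 + ox] = acc; // requantized by the srshl/sqxtnb tail
        }
}
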
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..dfaa059e9f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
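
The get_accumulator_depth_vl() == 2 declared by these strategies matches the shape of the channel loops in the listings: each pass covers two 32-bit vectors' worth of channels, guarded by a pair of whilelt predicates (the p2/p1 pairs), with the bias de-interleaved across the pair via uzp1/uzp2. A sketch of that loop shape in SVE ACLE intrinsics, assuming compilation with SVE enabled and with the body elided:

#include <arm_sve.h>
#include <cstdint>

// Channel-loop shape implied by an accumulator depth of 2 vector lengths.
void channel_loop_shape(uint64_t n_channels)
{
    for (uint64_t c = 0; c < n_channels; c += 2 * svcntw())
    {
        svbool_t p_lo = svwhilelt_b32(c, n_channels);            // ~p2
        svbool_t p_hi = svwhilelt_b32(c + svcntw(), n_channels); // ~p1
        (void)p_lo;
        (void)p_hi; // accumulate and requantize under these predicates
    }
}
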
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..b8adbb8262
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+    unsigned long n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];
+
+ Params(
+      unsigned long n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
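+      // The first fourteen raw pointers are shuffled; the remainder pass
+      // through unchanged. The permutation appears to front-load the rows
+      // the assembly consumes first (an assumption; it is not documented).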
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "incw x24\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1b { z26.h }, p4/Z, [x4]\n"
+ "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
+ "mov z6.d, z14.d\n"
+ "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
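+    // Loop body: widen inputs and weights past their zero points (usublb),
+    // accumulate the 5x5 taps into four int32 output tiles (smlalb/smlalt),
+    // then requantize each tile (sqrdmulh + srshl), add the c_offset and
+    // clamp to [minval, maxval] before storing one tile per output pointer.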
+ "1:" // Loop
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1b { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1b { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1b { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1b { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1b { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1b { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1b { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1b { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1b { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1b { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1b { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1b { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1b { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1b { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1b { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1b { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1b { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1b { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1a10 // usublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1b { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1b { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1b { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1b { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1b { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1b { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1b { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a196b // usublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1b { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1b { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1b { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1b { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
+ "incw x20\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ ".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1b { z26.h }, p4/Z, [x4]\n"
+ "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z6.d, z14.d\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
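For reference, a scalar C++ sketch of what each loop iteration above computes per output element: widen u8 activations and weights past their zero points (the usublb pairs), multiply-accumulate into int32 (smlalb/smlalt), then requantize with a per-channel multiplier and rounding shift before adding c_offset and clamping. This assumes the usual gemmlowp-style semantics for SQRDMULH/SRSHL and takes the shift as a positive right shift, whereas the assembly stores it as a negative left-shift amount for srshl; all names are illustrative, not the library's API.

#include <algorithm>
#include <cstdint>

// USUBLB + SMLALB/SMLALT: zero-point-corrected widening multiply-accumulate.
static int32_t mla(int32_t acc, uint8_t x, uint8_t w,
                   int32_t a_offset, int32_t b_offset)
{
    return acc + ((int32_t)x - a_offset) * ((int32_t)w - b_offset);
}

// SQRDMULH: saturating rounding doubling multiply returning the high half.
static int32_t sqrdmulh(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN)
        return INT32_MAX; // the one saturating case
    return (int32_t)(((int64_t)a * b + (1LL << 30)) >> 31);
}

// Rounding right shift with the ties-away-from-zero nudge that the
// AND/ASR/SQADD fixup applies before each SRSHL above.
static int32_t rounding_shr(int32_t v, int shift)
{
    if (shift <= 0)
        return v;
    int64_t x = (int64_t)v - (v < 0 ? 1 : 0); // nudge negatives down at ties
    return (int32_t)((x + (1LL << (shift - 1))) >> shift);
}

// Requantize one accumulator to u8. The assembly narrows to 16 bits and
// adds c_offset there; for in-range parameters the result is the same.
static uint8_t requantize(int32_t acc, int32_t mul, int shift,
                          int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = rounding_shr(sqrdmulh(acc, mul), shift) + c_offset;
    return (uint8_t)std::clamp(v, minval, maxval);
}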
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..d5382533a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+  Parent::KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
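The 2x4 in the class name is the output tile computed per iteration; with a 3x3 kernel at stride 2 that tile reads a 5x9 input patch, which is why the generic.cpp below starts by loading nine bytes per input row (mov x20, #0x9). A quick check of that arithmetic (input_extent is an invented helper name):

#include <cstdio>

// Input extent touched along one axis by `out` outputs of a size-`k`,
// stride-`s` kernel.
constexpr unsigned int input_extent(unsigned int out, unsigned int k, unsigned int s)
{
    return (out - 1) * s + k;
}

int main()
{
    static_assert(input_extent(2, 3, 2) == 5, "rows in the input patch");
    static_assert(input_extent(4, 3, 2) == 9, "columns in the input patch");
    std::printf("input patch: %u x %u\n", input_extent(2, 3, 2), input_extent(4, 3, 2));
    return 0;
}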
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..a9cd8a7fa9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x9\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ldr x23, [%x[inptrs], #0x8]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr x22, [%x[inptrs], #0x20]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "mov z13.b, #0x1\n"
+ "lsr z13.s, z13.s, #0x8\n"
+ "ld1b { z1.b }, p0/Z, [x23]\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "mov z8.d, z1.d\n"
+ "mov z27.d, z1.d\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z31.d, z1.d\n"
+ "mov z28.d, z2.d\n"
+ "ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z30.d, z2.d\n"
+ "mov z26.d, z2.d\n"
+ "ld1b { z3.b }, p0/Z, [x20]\n"
+ "mov z22.d, z4.d\n"
+ "mov z10.d, z4.d\n"
+ "ptrue p2.b\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z18.d, z4.d\n"
+ "ext z8.b, z8.b, z8.b, #0x2\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z11.s, p2/M, z11.s\n"
+ "ext z27.b, z27.b, z27.b, #0x4\n"
+ "ext z31.b, z31.b, z31.b, #0x6\n"
+ "mov x9, #0x0\n"
+ "whilelt p0.b, x9, x10\n"
+ "ext z28.b, z28.b, z28.b, #0x2\n"
+ "ext z30.b, z30.b, z30.b, #0x4\n"
+ "ld1w { z14.s }, p0/Z, [%x[params]]\n"
+ "mov x28, #0x0\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "ext z22.b, z22.b, z22.b, #0x2\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ext z10.b, z10.b, z10.b, #0x4\n"
+ "ext z18.b, z18.b, z18.b, #0x6\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "mov z21.d, z0.d\n"
+ "mov z20.d, z0.d\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "mov z19.d, z0.d\n"
+ "mov z24.d, z3.d\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #1, MUL VL]\n"
+ "mov z17.d, z3.d\n"
+ "mov z16.d, z3.d\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "ext z21.b, z21.b, z21.b, #0x2\n"
+ "ext z20.b, z20.b, z20.b, #0x4\n"
+ "addvl %x[params], %x[params], #4\n"
+ "ext z19.b, z19.b, z19.b, #0x6\n"
+ "zip1 z1.s, z1.s, z27.s\n"
+ "zip1 z8.s, z8.s, z31.s\n"
+ "zip1 z2.s, z2.s, z30.s\n"
+ "zip1 z28.s, z28.s, z26.s\n"
+ "ext z24.b, z24.b, z24.b, #0x2\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "ext z16.b, z16.b, z16.b, #0x6\n"
+ "zip1 z4.s, z4.s, z10.s\n"
+ "zip1 z22.s, z22.s, z18.s\n"
+ "zip1 z0.s, z0.s, z20.s\n"
+ "zip1 z21.s, z21.s, z19.s\n"
+ "zip1 z1.s, z1.s, z8.s\n"
+ "zip1 z2.s, z2.s, z28.s\n"
+ "zip1 z3.s, z3.s, z17.s\n"
+ "zip1 z24.s, z24.s, z16.s\n"
+ "zip1 z4.s, z4.s, z22.s\n"
+ "zip1 z0.s, z0.s, z21.s\n"
+ "mov z1.q, z1.q[0]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z3.s, z3.s, z24.s\n"
+ "mov z4.q, z4.q[0]\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "udot z24.s, z13.b, z1.b[0]\n"
+ "mov z23.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "udot z25.s, z13.b, z1.b[1]\n"
+ "mov z21.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "udot z23.s, z13.b, z1.b[2]\n"
+ "mov z10.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "udot z22.s, z13.b, z1.b[3]\n"
+ "mov z20.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "udot z21.s, z13.b, z2.b[0]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "udot z19.s, z13.b, z2.b[1]\n"
+ "udot z10.s, z13.b, z2.b[2]\n"
+ "udot z8.s, z13.b, z2.b[3]\n"
+ "mov z0.q, z0.q[0]\n"
+ "udot z20.s, z13.b, z4.b[0]\n"
+ "udot z18.s, z13.b, z4.b[1]\n"
+ "mov z3.q, z3.q[0]\n"
+ "udot z17.s, z13.b, z4.b[2]\n"
+ "udot z16.s, z13.b, z4.b[3]\n"
+ "mov z31.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "udot z31.s, z13.b, z0.b[0]\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "udot z30.s, z13.b, z0.b[1]\n"
+ "mov z29.s, #0x0\n"
+ "udot z26.s, z13.b, z0.b[2]\n"
+ "udot z27.s, z13.b, z0.b[3]\n"
+ "udot z28.s, z13.b, z3.b[0]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "add z24.s, z24.s, z21.s\n"
+ "add z25.s, z25.s, z19.s\n"
+ "add z23.s, z23.s, z10.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "add z21.s, z20.s, z21.s\n"
+ "mov z20.s, #0x0\n"
+ "udot z20.s, z13.b, z3.b[2]\n"
+ "add z19.s, z18.s, z19.s\n"
+ "mov z18.s, #0x0\n"
+ "udot z18.s, z13.b, z3.b[3]\n"
+ "add z17.s, z17.s, z10.s\n"
+ "add z16.s, z16.s, z8.s\n"
+ "add z24.s, z24.s, z31.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "mul z24.s, p2/M, z24.s, z11.s\n"
+ "mul z25.s, p2/M, z25.s, z11.s\n"
+ "add z26.s, z23.s, z26.s\n"
+ "add z27.s, z22.s, z27.s\n"
+ "mul z26.s, p2/M, z26.s, z11.s\n"
+ "mul z27.s, p2/M, z27.s, z11.s\n"
+ "add z28.s, z21.s, z28.s\n"
+ "add z29.s, z19.s, z29.s\n"
+ "mul z28.s, p2/M, z28.s, z11.s\n"
+ "mul z29.s, p2/M, z29.s, z11.s\n"
+ "add z30.s, z17.s, z20.s\n"
+ "add z31.s, z16.s, z18.s\n"
+ "mul z30.s, p2/M, z30.s, z11.s\n"
+ "mul z31.s, p2/M, z31.s, z11.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
+ "1:" // Loop
+ "udot z24.s, z5.b, z0.b[0]\n"
+ "udot z25.s, z5.b, z0.b[1]\n"
+ "ld1w { z8.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "udot z26.s, z5.b, z0.b[2]\n"
+ "udot z27.s, z5.b, z0.b[3]\n"
+ "incb x9\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "udot z24.s, z6.b, z1.b[0]\n"
+ "udot z25.s, z6.b, z1.b[1]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "udot z26.s, z6.b, z1.b[2]\n"
+ "udot z27.s, z6.b, z1.b[3]\n"
+ "udot z28.s, z5.b, z2.b[0]\n"
+ "udot z29.s, z5.b, z2.b[1]\n"
+ "udot z30.s, z5.b, z2.b[2]\n"
+ "udot z31.s, z5.b, z2.b[3]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z7.b, z2.b[0]\n"
+ "udot z25.s, z7.b, z2.b[1]\n"
+ ".inst 0x04a87718 // sqrdmulh z24.s, z24.s, z8.s\n"
+ "udot z26.s, z7.b, z2.b[2]\n"
+ "udot z27.s, z7.b, z2.b[3]\n"
+ ".inst 0x04a87739 // sqrdmulh z25.s, z25.s, z8.s\n"
+ "udot z28.s, z6.b, z3.b[0]\n"
+ "udot z29.s, z6.b, z3.b[1]\n"
+ ".inst 0x04a8775a // sqrdmulh z26.s, z26.s, z8.s\n"
+ "udot z30.s, z6.b, z3.b[2]\n"
+ "udot z31.s, z6.b, z3.b[3]\n"
+ ".inst 0x04a8777b // sqrdmulh z27.s, z27.s, z8.s\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "udot z28.s, z7.b, z4.b[0]\n"
+ "udot z29.s, z7.b, z4.b[1]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "udot z30.s, z7.b, z4.b[2]\n"
+ "udot z31.s, z7.b, z4.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #5, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
+ "addvl %x[params], %x[params], #6\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
+ ".inst 0x04a877bd // sqrdmulh z29.s, z29.s, z8.s\n"
+ ".inst 0x04a877de // sqrdmulh z30.s, z30.s, z8.s\n"
+ ".inst 0x04a877ff // sqrdmulh z31.s, z31.s, z8.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z9.s\n"
+ "add z25.s, z25.s, z9.s\n"
+ "smin z24.s, p2/M, z24.s, z12.s\n"
+ "smin z25.s, p2/M, z25.s, z12.s\n"
+ "add z26.s, z26.s, z9.s\n"
+ "add z27.s, z27.s, z9.s\n"
+ "smin z26.s, p2/M, z26.s, z12.s\n"
+ "smin z27.s, p2/M, z27.s, z12.s\n"
+ "add z28.s, z28.s, z9.s\n"
+ "add z29.s, z29.s, z9.s\n"
+ "smin z28.s, p2/M, z28.s, z12.s\n"
+ "smin z29.s, p2/M, z29.s, z12.s\n"
+ "add z30.s, z30.s, z9.s\n"
+ "add z31.s, z31.s, z9.s\n"
+ "smin z30.s, p2/M, z30.s, z12.s\n"
+ "smin z31.s, p2/M, z31.s, z12.s\n"
+ "smax z24.s, p2/M, z24.s, z15.s\n"
+ "smax z25.s, p2/M, z25.s, z15.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z15.s\n"
+ "smax z27.s, p2/M, z27.s, z15.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z15.s\n"
+ "smax z29.s, p2/M, z29.s, z15.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z15.s\n"
+ "smax z31.s, p2/M, z31.s, z15.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
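One prologue detail worth calling out in the kernel above: z13 is built as 0x00010101 per 32-bit lane (mov z13.b, #0x1 then lsr z13.s, z13.s, #0x8), so every udot against it sums exactly the three bytes a 3-wide kernel window touches while masking the fourth. Those per-window input sums are then multiplied by the negated b_offset to pre-fold the weight-zero-point correction into the accumulators; the matching activation-offset terms are presumably folded into the packed parameters. A scalar sketch of that correction (names are illustrative):

#include <cstdint>

// The UDOT row-sum trick: dot a 4-byte group against {1, 1, 1, 0} to add
// up the three bytes a 3-wide window touches.
static int32_t window_sum3(const uint8_t g[4])
{
    return (int32_t)g[0] + g[1] + g[2]; // g[3] is masked by the zero byte
}

// Expanding sum((x - a_off) * (w - b_off)) yields a "-b_off * sum(x)"
// term that depends only on the inputs, so it is computed once up front
// and added to every accumulator before the main loop runs.
static int32_t b_offset_correction(const uint8_t g[4], int32_t b_offset)
{
    return -b_offset * window_sum3(g);
}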
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..55b6edea2c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+  Parent::KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..4b65a67309
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x6\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ldr x22, [%x[inptrs], #0x18]\n"
+ "ldr x21, [%x[inptrs], #0x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1b { z3.b }, p0/Z, [x22]\n"
+ "mov z23.d, z3.d\n"
+ "ext z23.b, z23.b, z23.b, #0x1\n"
+ "ld1b { z4.b }, p0/Z, [x21]\n"
+ "ldr x24, [%x[inptrs], #0x8]\n"
+ "mov z18.d, z4.d\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "ldr x23, [%x[inptrs], #0x28]\n"
+ "mov z15.d, z2.d\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "ldr x22, [%x[inptrs], #0x30]\n"
+ "ldr x21, [%x[inptrs], #0x38]\n"
+ "zip1 z3.d, z3.d, z23.d\n"
+ "zip1 z4.d, z4.d, z18.d\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1b { z1.b }, p0/Z, [x24]\n"
+ "mov z19.d, z1.d\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "ld1b { z5.b }, p0/Z, [x23]\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ "mov z18.d, z5.d\n"
+ "mov z22.d, z6.d\n"
+ "ld1b { z7.b }, p0/Z, [x21]\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "mov z8.d, z7.d\n"
+ "zip1 z2.d, z2.d, z15.d\n"
+ "mov z3.q, z3.q[0]\n"
+ "mov z4.q, z4.q[0]\n"
+ "ptrue p2.b\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ext z22.b, z22.b, z22.b, #0x1\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z23.s, p2/M, z23.s\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "mov z28.b, #0x1\n"
+ "mov x9, #0x0\n"
+ "whilelt p0.b, x9, x10\n"
+ "mov z25.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "udot z25.s, z28.b, z3.b[0]\n"
+ "ld1w { z12.s }, p0/Z, [%x[params]]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "udot z24.s, z28.b, z3.b[2]\n"
+ "mov x28, #0x0\n"
+ "mov z27.d, z0.d\n"
+ "udot z17.s, z28.b, z4.b[0]\n"
+ "udot z16.s, z28.b, z4.b[2]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "zip1 z1.d, z1.d, z19.d\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z5.d, z5.d, z18.d\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z6.d, z6.d, z22.d\n"
+ "zip1 z7.d, z7.d, z8.d\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "udot z30.s, z28.b, z2.b[0]\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #1, MUL VL]\n"
+ "mov z29.s, #0x1\n"
+ "udot z31.s, z28.b, z2.b[2]\n"
+ "udot z25.s, z29.b, z3.b[1]\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z0.d, z0.d, z27.d\n"
+ "mov z1.q, z1.q[0]\n"
+ "udot z24.s, z29.b, z3.b[3]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "mov z5.q, z5.q[0]\n"
+ "mov z6.q, z6.q[0]\n"
+ "udot z17.s, z29.b, z4.b[1]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "mov z7.q, z7.q[0]\n"
+ "mov z22.s, #0x0\n"
+ "udot z16.s, z29.b, z4.b[3]\n"
+ "addvl %x[params], %x[params], #5\n"
+ "mov z21.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "udot z22.s, z28.b, z1.b[0]\n"
+ "mov z27.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "udot z21.s, z28.b, z1.b[2]\n"
+ "mov z19.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "udot z26.s, z28.b, z5.b[0]\n"
+ "udot z27.s, z28.b, z5.b[2]\n"
+ "udot z20.s, z28.b, z6.b[0]\n"
+ "mov z0.q, z0.q[0]\n"
+ "udot z19.s, z28.b, z6.b[2]\n"
+ "udot z18.s, z28.b, z7.b[0]\n"
+ "add z17.s, z25.s, z17.s\n"
+ "mov z25.s, #0x0\n"
+ "udot z25.s, z28.b, z7.b[2]\n"
+ "udot z30.s, z29.b, z2.b[1]\n"
+ "udot z31.s, z29.b, z2.b[3]\n"
+ "add z16.s, z24.s, z16.s\n"
+ "udot z22.s, z29.b, z1.b[1]\n"
+ "mov z24.s, #0x0\n"
+ "udot z24.s, z28.b, z0.b[0]\n"
+ "udot z21.s, z29.b, z1.b[3]\n"
+ "udot z26.s, z29.b, z5.b[1]\n"
+ "udot z27.s, z29.b, z5.b[3]\n"
+ "add z30.s, z30.s, z17.s\n"
+ "udot z20.s, z29.b, z6.b[1]\n"
+ "udot z19.s, z29.b, z6.b[3]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "udot z18.s, z29.b, z7.b[1]\n"
+ "udot z25.s, z29.b, z7.b[3]\n"
+ "add z22.s, z22.s, z30.s\n"
+ "udot z24.s, z29.b, z0.b[1]\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z20.s, z26.s, z20.s\n"
+ "add z19.s, z27.s, z19.s\n"
+ "add z18.s, z18.s, z17.s\n"
+ "mov z17.s, #0x0\n"
+ "udot z17.s, z28.b, z0.b[2]\n"
+ "udot z17.s, z29.b, z0.b[3]\n"
+ "add z16.s, z25.s, z16.s\n"
+ "add z24.s, z22.s, z24.s\n"
+ "add z25.s, z21.s, z17.s\n"
+ "mul z24.s, p2/M, z24.s, z23.s\n"
+ "mul z25.s, p2/M, z25.s, z23.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "add z27.s, z27.s, z21.s\n"
+ "mul z26.s, p2/M, z26.s, z23.s\n"
+ "mul z27.s, p2/M, z27.s, z23.s\n"
+ "add z28.s, z20.s, z30.s\n"
+ "add z29.s, z19.s, z31.s\n"
+ "mul z28.s, p2/M, z28.s, z23.s\n"
+ "mul z29.s, p2/M, z29.s, z23.s\n"
+ "add z30.s, z20.s, z18.s\n"
+ "add z31.s, z19.s, z16.s\n"
+ "mul z30.s, p2/M, z30.s, z23.s\n"
+ "mul z31.s, p2/M, z31.s, z23.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z12.s\n"
+ "add z25.s, z25.s, z12.s\n"
+ "add z26.s, z26.s, z12.s\n"
+ "add z27.s, z27.s, z12.s\n"
+ "add z28.s, z28.s, z12.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "add z30.s, z30.s, z12.s\n"
+ "add z31.s, z31.s, z12.s\n"
+ "1:" // Loop
+ "udot z24.s, z8.b, z0.b[0]\n"
+ "udot z25.s, z8.b, z0.b[2]\n"
+ "ld1w { z12.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "udot z26.s, z8.b, z1.b[0]\n"
+ "udot z27.s, z8.b, z1.b[2]\n"
+ "incb x9\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "udot z24.s, z9.b, z0.b[1]\n"
+ "udot z25.s, z9.b, z0.b[3]\n"
+ "whilelt p0.b, x9, x10\n"
+ "udot z26.s, z9.b, z1.b[1]\n"
+ "udot z27.s, z9.b, z1.b[3]\n"
+ "udot z28.s, z8.b, z2.b[0]\n"
+ "udot z29.s, z8.b, z2.b[2]\n"
+ "udot z30.s, z8.b, z3.b[0]\n"
+ "udot z31.s, z8.b, z3.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
+ "udot z24.s, z10.b, z1.b[0]\n"
+ "udot z25.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z27.s, z10.b, z2.b[2]\n"
+ "udot z28.s, z9.b, z2.b[1]\n"
+ "udot z29.s, z9.b, z2.b[3]\n"
+ "udot z30.s, z9.b, z3.b[1]\n"
+ "udot z31.s, z9.b, z3.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "udot z24.s, z11.b, z1.b[1]\n"
+ "udot z25.s, z11.b, z1.b[3]\n"
+ "udot z26.s, z11.b, z2.b[1]\n"
+ "udot z27.s, z11.b, z2.b[3]\n"
+ "udot z28.s, z10.b, z3.b[0]\n"
+ "udot z29.s, z10.b, z3.b[2]\n"
+ "udot z30.s, z10.b, z4.b[0]\n"
+ "udot z31.s, z10.b, z4.b[2]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "udot z24.s, z17.b, z2.b[0]\n"
+ "udot z25.s, z17.b, z2.b[2]\n"
+ "udot z26.s, z17.b, z3.b[0]\n"
+ "udot z27.s, z17.b, z3.b[2]\n"
+ "udot z28.s, z11.b, z3.b[1]\n"
+ "udot z29.s, z11.b, z3.b[3]\n"
+ "udot z30.s, z11.b, z4.b[1]\n"
+ "udot z31.s, z11.b, z4.b[3]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z16.b, z2.b[1]\n"
+ "udot z25.s, z16.b, z2.b[3]\n"
+ "udot z26.s, z16.b, z3.b[1]\n"
+ "udot z27.s, z16.b, z3.b[3]\n"
+ "udot z28.s, z17.b, z4.b[0]\n"
+ "udot z29.s, z17.b, z4.b[2]\n"
+ "udot z30.s, z17.b, z5.b[0]\n"
+ "udot z31.s, z17.b, z5.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "udot z24.s, z19.b, z3.b[0]\n"
+ "udot z25.s, z19.b, z3.b[2]\n"
+ "udot z26.s, z19.b, z4.b[0]\n"
+ "udot z27.s, z19.b, z4.b[2]\n"
+ "udot z28.s, z16.b, z4.b[1]\n"
+ "udot z29.s, z16.b, z4.b[3]\n"
+ "udot z30.s, z16.b, z5.b[1]\n"
+ "udot z31.s, z16.b, z5.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "udot z24.s, z18.b, z3.b[1]\n"
+ "udot z25.s, z18.b, z3.b[3]\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
+ "udot z26.s, z18.b, z4.b[1]\n"
+ "udot z27.s, z18.b, z4.b[3]\n"
+ "udot z28.s, z19.b, z5.b[0]\n"
+ "udot z29.s, z19.b, z5.b[2]\n"
+ "udot z30.s, z19.b, z6.b[0]\n"
+ "udot z31.s, z19.b, z6.b[2]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
+ "udot z24.s, z17.b, z4.b[0]\n"
+ "udot z25.s, z17.b, z4.b[2]\n"
+ "udot z26.s, z17.b, z5.b[0]\n"
+ "udot z27.s, z17.b, z5.b[2]\n"
+ "udot z28.s, z18.b, z5.b[1]\n"
+ "udot z29.s, z18.b, z5.b[3]\n"
+ "udot z30.s, z18.b, z6.b[1]\n"
+ "udot z31.s, z18.b, z6.b[3]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #-4, MUL VL]\n"
+ "udot z24.s, z16.b, z4.b[1]\n"
+ "udot z25.s, z16.b, z4.b[3]\n"
+ ".inst 0x04ac7718 // sqrdmulh z24.s, z24.s, z12.s\n"
+ "udot z26.s, z16.b, z5.b[1]\n"
+ "udot z27.s, z16.b, z5.b[3]\n"
+ ".inst 0x04ac7739 // sqrdmulh z25.s, z25.s, z12.s\n"
+ "udot z28.s, z17.b, z6.b[0]\n"
+ "udot z29.s, z17.b, z6.b[2]\n"
+ ".inst 0x04ac775a // sqrdmulh z26.s, z26.s, z12.s\n"
+ "udot z30.s, z17.b, z7.b[0]\n"
+ "udot z31.s, z17.b, z7.b[2]\n"
+ ".inst 0x04ac777b // sqrdmulh z27.s, z27.s, z12.s\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #-7, MUL VL]\n"
+ "udot z28.s, z16.b, z6.b[1]\n"
+ "udot z29.s, z16.b, z6.b[3]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "udot z30.s, z16.b, z7.b[1]\n"
+ "udot z31.s, z16.b, z7.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #-6, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
+ "addvl %x[params], %x[params], #-3\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04ac779c // sqrdmulh z28.s, z28.s, z12.s\n"
+ ".inst 0x04ac77bd // sqrdmulh z29.s, z29.s, z12.s\n"
+ ".inst 0x04ac77de // sqrdmulh z30.s, z30.s, z12.s\n"
+ ".inst 0x04ac77ff // sqrdmulh z31.s, z31.s, z12.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "smin z24.s, p2/M, z24.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "smin z26.s, p2/M, z26.s, z15.s\n"
+ "smin z27.s, p2/M, z27.s, z15.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "smin z28.s, p2/M, z28.s, z15.s\n"
+ "smin z29.s, p2/M, z29.s, z15.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
+ "smin z30.s, p2/M, z30.s, z15.s\n"
+ "smin z31.s, p2/M, z31.s, z15.s\n"
+ "smax z24.s, p2/M, z24.s, z14.s\n"
+ "smax z25.s, p2/M, z25.s, z14.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z14.s\n"
+ "smax z27.s, p2/M, z27.s, z14.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z14.s\n"
+ "smax z29.s, p2/M, z29.s, z14.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z14.s\n"
+ "smax z31.s, p2/M, z31.s, z14.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
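
The requantization tail of the loop above follows the usual fixed-point recipe: SQRDMULH applies the per-channel multiplier as a saturating rounding doubling high multiply, and the AND / ASR #31 / SQADD triple nudges negative accumulators down by one so that the final SRSHL (a rounding shift by a negative, i.e. rightward, amount) rounds ties away from zero rather than toward +infinity. A minimal scalar sketch of one lane, assuming standard gemmlowp-style semantics and eliding saturation (the function name is illustrative, not from the library):

#include <cstdint>

// Models: SQRDMULH acc, mul ; AND t, acc, shift ; ASR t, t, #31 ;
// SQADD acc, acc, t ; SRSHL acc, acc, shift   (shift <= 0 means shift right).
int32_t requantize_lane(int32_t acc, int32_t mul, int32_t shift)
{
    // SQRDMULH: high 32 bits of 2 * acc * mul, with rounding.
    int64_t prod = (int64_t)acc * (int64_t)mul;
    int32_t high = (int32_t)((prod + (1LL << 30)) >> 31);

    // Fixup is -1 only when high < 0 and shift < 0: bit 31 of (high & shift)
    // is set iff both sign bits are. This turns SRSHL's round-half-up into
    // round-half-away-from-zero (arithmetic shift of negatives assumed).
    high += (high & shift) >> 31;

    // SRSHL with a negative operand: rounding arithmetic shift right.
    int n = -shift;
    return n > 0 ? (int32_t)(((int64_t)high + (1LL << (n - 1))) >> n) : high;
}

Using the shift register itself as the AND mask is the design trick: a channel that needs no shift has a zero (sign-bit-clear) shift value, so its fixup vanishes without a branch.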
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..0f1030c0d7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
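
This header fixes the tile geometry the implementation file below relies on: a 2x2 output tile of a 3x3 stride-1 kernel touches a 4x4 input patch, which is why the Params struct in the following generic.cpp carries exactly 16 input pointers (the stride-2 3x3 and stride-1 5x5 variants later in this patch carry 25 and 36 for the same reason). A hedged sketch of the relation, with an illustrative helper name:

// Per spatial dimension: patch = (output_tile - 1) * stride + kernel.
constexpr unsigned patch_dim(unsigned out, unsigned stride, unsigned kern)
{
    return (out - 1) * stride + kern;
}

static_assert(patch_dim(2, 1, 3) * patch_dim(2, 1, 3) == 16, "3x3 s1 -> 4x4 patch");
static_assert(patch_dim(2, 2, 3) * patch_dim(2, 2, 3) == 25, "3x3 s2 -> 5x5 patch");
static_assert(patch_dim(2, 1, 5) * patch_dim(2, 1, 5) == 36, "5x5 s1 -> 6x6 patch");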
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..887eccf1e9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x16, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x16\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "1:" // Loop
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1b { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1b { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1b { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1a73 // usublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ "ld1b { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ "ld1b { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ "ld1b { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
+ "inch x16\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
+ "incw x20\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z7.d, z25.d, z1.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
+ ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n"
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n"
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
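
The arithmetic core of the kernel above runs in 16-bit space: USUBLB widens each unsigned input byte while subtracting the activation zero point (a_offset, broadcast into z12), SSUBLB does the same for the signed weights against b_offset (z30), and SMLALB/SMLALT multiply-accumulate the even and odd halfword lanes into two separate 32-bit accumulator sets. One term of that computation, as a scalar model under the usual Requantize32 offset semantics:

#include <cstdint>

// USUBLB x.h, in.b, a_off.b ; SSUBLB w.h, wt.b, b_off.b ;
// SMLALB/SMLALT acc.s, x.h, w.h  -- one lane, one term.
int32_t mla_term(int32_t acc, uint8_t in, int32_t a_offset,
                 int8_t wt, int32_t b_offset)
{
    const int16_t x = (int16_t)((int32_t)in - a_offset); // offset-free input
    const int16_t w = (int16_t)((int32_t)wt - b_offset); // offset-free weight
    return acc + (int32_t)x * (int32_t)w;                // widening MLA
}

Subtracting both offsets before the multiply keeps the inner loop free of the cross-term corrections that a raw u8*s8 product would otherwise need at requantization time.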
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..79e3fd5f54
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
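
As in the stride-1 variant, the generic.cpp that follows loads the bias (and later the requantize multipliers and shifts) as two consecutive vectors and splits them with UZP1/UZP2 before entering the loop: SMLALB feeds the accumulators for even-numbered channels and SMLALT those for odd-numbered ones, so every per-channel constant has to be de-interleaved to match, and SQXTNB/SQXTNT re-interleave the halves at the end. A sketch of the de-interleave under that assumption:

#include <cstddef>
#include <cstdint>

// Models UZP1/UZP2 across two loaded vectors: split per-channel constants
// into even-channel and odd-channel tables.
void deinterleave(const int32_t *vals, size_t n, int32_t *even, int32_t *odd)
{
    for (size_t i = 0; i < n; i++)
    {
        if (i % 2 == 0) even[i / 2] = vals[i]; // UZP1 lanes -> SMLALB accumulators
        else            odd[i / 2]  = vals[i]; // UZP2 lanes -> SMLALT accumulators
    }
}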
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..754d06d443
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x7, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x7\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
+ "ldp x16, x15, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x7, x8\n"
+ "ldp x14, x13, [x24, #0x10]\n"
+ "whilelt p2.s, x7, x8\n"
+ "whilelt p1.s, x23, x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "mov z18.d, z8.d\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "1:" // Loop
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z29.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1b { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1b { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1b { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ld1b { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1b { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1b { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "ld1b { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a1bde // usublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
+ "inch x7\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
+ "mov x20, x7\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
+ "incw x20\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "whilelt p2.s, x7, x8\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
+ "whilelt p1.s, x20, x8\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
+ "whilelt p3.h, x7, x8\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "inch x10\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
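
The epilogue shared by these kernels compresses the result in four steps: SQXTNB/SQXTNT saturating-narrow each pair of 32-bit accumulators into one 16-bit vector, SQADD adds the output offset (c_offset, held in z19 here), SMAX/SMIN clamp against minval/maxval (z12/z9), and ST1B stores the low byte of each halfword lane. A scalar sketch with the narrowing saturation made explicit (the 16-bit saturation of the SQADD itself is elided):

#include <algorithm>
#include <cstdint>

uint8_t output_stage(int32_t acc, int32_t c_offset, int32_t minval, int32_t maxval)
{
    // SQXTNB/SQXTNT: saturate the 32-bit accumulator into 16 bits.
    int32_t v = std::min<int32_t>(std::max<int32_t>(acc, INT16_MIN), INT16_MAX);
    v += c_offset;           // SQADD of the broadcast c_offset
    v = std::max(v, minval); // SMAX: lower clamp (activation min)
    v = std::min(v, maxval); // SMIN: upper clamp (activation max)
    return (uint8_t)v;       // ST1B: keep the low byte per lane
}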
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..0ff853ec2d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
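
One more piece of shared plumbing before the implementation below: every Params constructor in this patch copies the caller's input-point pointers into a fixed in-struct table, permuted into the order the assembly visits them, so the asm can address inputs with plain ldr/ldp at constant offsets from a single base. A sketch of that step (the permutation values shown are the first few from the 5x5 constructor that follows):

#include <cstddef>
#include <cstdint>

// Reorder raw input pointers into the kernel's visitation order.
void build_inptr_table(const uint8_t *const *raw, const uint8_t **table,
                       const unsigned *perm, size_t n)
{
    for (size_t i = 0; i < n; i++)
    {
        table[i] = raw[perm[i]]; // e.g. perm begins {0, 1, 6, 7, 2, 8, ...}
    }
}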
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f24a258484
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "incw x24\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
+ "mov z6.d, z14.d\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ "1:" // Loop
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1b { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1b { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1b { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1b { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1b { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1b { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1b { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1b { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1a10 // usublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1b { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1b { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1b { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1b { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1sb { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a116b // ssublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1b { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1b { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
+ "incw x20\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ ".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z6.d, z14.d\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp b/src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp
new file mode 100644
index 0000000000..8a49c775d3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <premultiply.hpp>
+
+#define CHANNEL_MULTIPLIER 6
+#define BLOCK_SIZE 4
+
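+/* Replicate each input channel CHANNEL_MULTIPLIER (here 6) times along the
+ * channel axis, writing the result into an intermediate buffer which is then
+ * used as the kernel input in place of the original input array (see
+ * IntermediateBufferElement in working_space.hpp). Channels are handled in
+ * blocks of BLOCK_SIZE, with a scalar tail loop for the remainder.
+ */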
+void do_premultiply_float_6(const float *in_ptr,
+ const unsigned int ld_row,
+ const unsigned int ld_col,
+ float *out_ptr,
+ const unsigned int out_ld_row,
+ const unsigned int out_ld_col,
+ const unsigned int tile_rows,
+ const unsigned int tile_cols,
+ const unsigned int input_channels)
+{
+ for(unsigned int i = 0; i < tile_rows; i++)
+ {
+ const float *ip2 = in_ptr + i * ld_row;
+ float *op2 = out_ptr + i * out_ld_row;
+ for(unsigned int j = 0; j < tile_cols; j++)
+ {
+ const float *ip = ip2;
+ float *op = op2;
+
+ unsigned int num_blocks = input_channels / BLOCK_SIZE;
+ for(unsigned int c = 0; c < num_blocks; c++)
+ {
+ float vals[BLOCK_SIZE];
+ for(unsigned int v = 0; v < BLOCK_SIZE; v++)
+ {
+ vals[v] = ip[v];
+ }
+ ip += BLOCK_SIZE;
+
+ for(unsigned int v = 0; v < BLOCK_SIZE; v++)
+ {
+ for(unsigned int r = 0; r < CHANNEL_MULTIPLIER; r++)
+ {
+ op[r] = vals[v];
+ }
+ op += CHANNEL_MULTIPLIER;
+ }
+ }
+
+ unsigned int rem = input_channels - num_blocks * BLOCK_SIZE;
+ for(unsigned int c = 0; c < rem; c++)
+ {
+ float val = ip[c];
+ for(unsigned int r = 0; r < CHANNEL_MULTIPLIER; r++)
+ {
+ op[r] = val;
+ }
+ op += CHANNEL_MULTIPLIER;
+ }
+
+ ip2 += ld_col;
+ op2 += out_ld_col;
+ }
+ }
+}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
new file mode 100644
index 0000000000..9805fd354f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Depthwise kernel drivers commonly require a per-thread blob of working space
+ * in which to store parameters required by the depthwise implementations. The
+ * composition of this working space varies with the driver, kernel, and data
+ * types -- but the tasks of requesting sufficient space, allocating buffer
+ * space, and performing initialisation of the working space are common.
+ *
+ * The classes in this file consist of a number of working space "Elements"
+ * (which are logical units of functionality) and a Workspace type which allows
+ * for compile-time composition of elements into a single working space type.
+ *
+ * Creating a workspace
+ * ====================
+ *
+ * A new workspace type can be created by combining Elements as an argument to
+ * the Workspace class. For instance:
+ *
+ * Workspace<
+ * depthwise_depthfirst::InputArrayElement<float>,
+ * InputBufferElement<float>,
+ * OutputArrayElement<float>
+ * >
+ *
+ * Creates a new Workspace consisting of the given elements. The workspace type
+ * contained within this class (`Workspace<...>::WorkspaceType`) is equivalent to:
+ *
+ * struct WorkspaceType
+ * {
+ * const float **inptr_array; // From InputArrayElement<float>
+ * float *input_buffer; // From InputBufferElement<float>
+ * float **outptr_array; // From OutputArrayElement<float>
+ * float *output_buffer; // From OutputArrayElement<float>
+ * };
+ *
+ * Calling `Workspace<...>::get_sizeof_workspace(...)` will return the amount
+ * of space required to store the above struct and the elements contained
+ * within it. Once this space has been allocated, the workspace can be
+ * initialised by calling `Workspace<...>::initialise` with a pointer to the
+ * buffer and the same arguments. This will place a struct of type
+ * `Workspace<...>::WorkspaceType` at the start of the buffer, and share the
+ * remaining space between the specified elements. As this is all done at
+ * compile time, later code can access elements from the `WorkspaceType` by
+ * name.
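+ *
+ * An illustrative usage sketch (the element choices and the `strategy` and
+ * `depthwise_args` values here are assumptions for the example, not part of
+ * any fixed API):
+ *
+ *   using WS = Workspace<InputBufferElement<float>, OutputArrayElement<float>>;
+ *   WorkspaceArgs<IDepthfirstStrategy> args(strategy, depthwise_args);
+ *
+ *   std::vector<char> buffer(WS::get_sizeof_workspace(args));
+ *   WS::initialise(buffer.data(), args);
+ *   auto *ws = reinterpret_cast<WS::WorkspaceType *>(buffer.data());
+ *   // ws->input_buffer, ws->outptr_array and ws->output_buffer are now usable.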
+ *
+ * Writing a new element
+ * =====================
+ *
+ * Each Element must provide:
+ * - A struct called "Workspace" containing the variables contained within
+ * this portion of the workspace.
+ * - A static method called `get_element_size` which returns the amount of
+ * buffer space required by this element of the workspace (NOT including the
+ * size of the Workspace struct). For example, an element which stores a
+ * vector of pointers will return the amount of space required to store the
+ * vector.
+ * - A static method called `initialise` which accepts a pointer to a struct
+ * which will be composed of the Element's `Workspace` struct (along with
+ * other elements), a pointer to the start of the buffer allocated for this
+ * portion of the workspace, and arguments to be used to initialise the
+ * workspace. The Element should consume as much of the buffer as it
+ * requires, initialise the Workspace, and then return the pointer to the
+ * next free byte of the buffer.
+ *
+ * See the below elements for an example of how this should work.
+ */
+
+#pragma once
+
+#include "depthwise.hpp"
+#include "depthfirst_driver.hpp"
+#include "utils.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+namespace { // anonymous because we expect this to appear in several compilation units
+
+/* Arguments to use to size and initialise a workspace.
+ */
+template <class StratType, class OutputStage=Nothing>
+struct WorkspaceArgs
+{
+ const StratType *strategy;
+ const DepthwiseArgs &depthwise_args;
+ const OutputStage &output_stage;
+
+ WorkspaceArgs(const StratType *strat, const DepthwiseArgs &dwargs, const OutputStage &os = {})
+ : strategy(strat), depthwise_args(dwargs), output_stage(os)
+ {
+ }
+};
+
+
+/* Sometimes we use templated structs to fill in workspace types; the Empty
+ * element can be useful when a blank element is required for some sets of
+ * parameters.
+ */
+struct EmptyElement
+{
+ struct Workspace {};
+
+ template <class StratType, class OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &) { return 0; }
+
+ template <class WorkspaceType, class StratType, class OutputStage>
+ static void *initialise(WorkspaceType *, void *buffer, const WorkspaceArgs<StratType, OutputStage> &)
+ {
+ return buffer;
+ }
+};
+
+
+/* Store fused activations for a kernel.
+ *
+ * Activations are set based on the DepthwiseArgs.
+ */
+template <typename T, class OutputStage=Nothing>
+class ActivationsElement
+{
+ public:
+ struct Workspace
+ {
+ T activation_min, activation_max;
+ };
+
+ template <typename StratType>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &)
+ {
+ return 0;
+ }
+
+ template <class WorkspaceType, class StratType>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ ws->activation_min = static_cast<T>(-std::numeric_limits<float>::infinity());
+ ws->activation_max = static_cast<T>(std::numeric_limits<float>::infinity());
+
+ switch (args.depthwise_args.activation.type)
+ {
+ case arm_gemm::Activation::Type::BoundedReLU:
+ ws->activation_max = static_cast<T>(args.depthwise_args.activation.param1);
+ // Fall through
+ case arm_gemm::Activation::Type::ReLU:
+ ws->activation_min = static_cast<T>(0);
+ break;
+ default:
+ break;
+ }
+
+ return buffer;
+ }
+};
+
+/* Activation clamps are contained within `arm_gemm::Requantize32`, so if the
+ * output stage is one of these we substitute in an empty workspace element.
+ */
+template <typename T>
+class ActivationsElement<T, arm_gemm::Requantize32> : public EmptyElement
+{
+};
+
+
+/* Get the value with which to fill an input buffer. This defaults to `0`
+ * (which we return as a `char` since it gets used by `memset`).
+ */
+template <typename OutputStage>
+char get_input_buffer_fill_value(const OutputStage &)
+{
+ return 0;
+}
+
+/* In the case of kernels operating on quantized data, we need to fill the
+ * input buffer with the zero offset of the input tensor.
+ */
+template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp) __attribute__ ((unused));
+template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp)
+{
+ return qp.a_offset;
+}
+
+
+/* Container for a vector of padding values which can be safely consumed by the
+ * depthwise kernel. The padding values are initialised to either `0` or the
+ * zero offset of the input tensor (if quantized).
+ */
+template <typename T>
+class InputBufferElement
+{
+ public:
+ struct Workspace
+ {
+ T *input_buffer;
+ };
+
+ template <typename StratType, typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ }
+
+ template <class WorkspaceType, typename StratType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ ws->input_buffer = reinterpret_cast<T*>(buffer);
+ memset(ws->input_buffer, get_input_buffer_fill_value(args.output_stage), get_element_size(args));
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+
+/* Container for an array of output pointers, and a buffer which can be used as
+ * a destination for unnecessary writes.
+ */
+template <typename T>
+class OutputArrayElement
+{
+ public:
+ struct Workspace
+ {
+ T **outptr_array;
+ T *output_buffer;
+ };
+
+ template <typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof_outptr_array(args) + sizeof_output_buffer(args);
+ }
+
+ template <class WorkspaceType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ char *buffer_bytes = reinterpret_cast<char *>(buffer);
+
+ ws->outptr_array = reinterpret_cast<T **>(buffer_bytes);
+ buffer_bytes += sizeof_outptr_array(args);
+
+ ws->output_buffer = reinterpret_cast<T *>(buffer_bytes);
+ buffer_bytes += sizeof_output_buffer(args);
+
+ return buffer_bytes;
+ }
+
+ protected:
+ template <typename OutputStage>
+ static size_t sizeof_outptr_array(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof(T **) * args.strategy->get_output_rows() * args.strategy->get_output_cols();
+ }
+
+ template <typename OutputStage>
+ static size_t sizeof_output_buffer(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ }
+};
+
+
+/* Intermediate array to store results of premultiplication.
+ * Used as input to the kernel instead of the original input array.
+ */
+template <typename T>
+class IntermediateBufferElement
+{
+public:
+ struct Workspace
+ {
+ T *intermediate_buffer;
+ };
+
+ template <typename StratType, typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ auto cols = args.depthwise_args.input_cols + args.depthwise_args.kernel_cols;
+ auto rows = args.strategy->get_input_rows() + args.depthwise_args.kernel_rows;
+ auto channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ return sizeof(T) * cols * rows * channels;
+ }
+
+ template <class WorkspaceType, typename StratType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ ws->intermediate_buffer = reinterpret_cast<T*>(buffer);
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+
+/* Container for requantization parameters.
+ *
+ * This removes the distinction between per-layer and per-channel
+ * requantization parameters by providing a vector of requantization parameters
+ * regardless of whether per-layer or per-channel is selected.
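+ *
+ * For example (illustrative): if the output stage supplies only a per-layer
+ * multiplier, initialise() fills scratch space with one copy of per_layer_mul
+ * per output channel and points requant_muls at it, so kernels can always
+ * index the parameters per-channel.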
+ */
+class RequantizationParametersElement
+{
+ public:
+ struct Workspace
+ {
+ const int32_t *bias, *requant_muls, *requant_shifts;
+ };
+
+ template <typename StratType>
+ static size_t get_element_size(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ return sizeof_bias(args) + sizeof_requant_muls(args) + sizeof_requant_shifts(args);
+ }
+
+ template <typename WorkspaceType, typename StratType>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ const auto n_output_channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ char *buffer_bytes = reinterpret_cast<char *>(buffer);
+
+ ws->bias = args.output_stage.bias;
+ ws->requant_muls = args.output_stage.per_channel_muls;
+ ws->requant_shifts = args.output_stage.per_channel_right_shifts;
+
+ if (ws->bias == nullptr)
+ {
+ ws->bias = reinterpret_cast<const int32_t *>(buffer_bytes);
+ memset(buffer_bytes, 0, sizeof_bias(args));
+ buffer_bytes += sizeof_bias(args);
+ }
+
+ if (ws->requant_muls == nullptr)
+ {
+ ws->requant_muls = reinterpret_cast<const int32_t *>(buffer_bytes);
+ auto muls = reinterpret_cast<int32_t *>(buffer_bytes);
+ buffer_bytes += sizeof_requant_muls(args);
+
+ for (auto n = 0u; n < n_output_channels; n++)
+ {
+ muls[n] = args.output_stage.per_layer_mul;
+ }
+ }
+
+ if (ws->requant_shifts == nullptr)
+ {
+ ws->requant_shifts = reinterpret_cast<const int32_t *>(buffer_bytes);
+ auto shifts = reinterpret_cast<int32_t *>(buffer_bytes);
+ buffer_bytes += sizeof_requant_shifts(args);
+
+ for (auto n = 0u; n < n_output_channels; n++)
+ {
+ shifts[n] = args.output_stage.per_layer_right_shift;
+ }
+ }
+
+ return buffer_bytes;
+ }
+
+ protected:
+ template <typename StratType>
+ static size_t sizeof_bias(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ return args.output_stage.bias != nullptr ?
+ 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
+ }
+
+ template <typename StratType>
+ static size_t sizeof_requant_muls(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ return args.output_stage.per_channel_muls != nullptr ?
+ 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
+ }
+
+ template <typename StratType>
+ static size_t sizeof_requant_shifts(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ return args.output_stage.per_channel_right_shifts != nullptr ?
+ 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
+ }
+};
+
+
+template <typename ...Elements>
+class Workspace;
+
+template <typename Element, typename ...Elements>
+class Workspace<Element, Elements...>
+{
+ public:
+ struct WorkspaceType : Element::Workspace, Workspace<Elements...>::WorkspaceType
+ {
+ };
+
+ template <class S, class T>
+ static void initialise(void *buffer, const WorkspaceArgs<S, T> &args)
+ {
+ // Allocate sufficient space for the struct, then initialise each of the
+ // elements in turn.
+ auto ws = reinterpret_cast<WorkspaceType *>(buffer);
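+ // The elements' buffer space is carved from the bytes immediately after
+ // the WorkspaceType struct itself, hence `ws + 1` below.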
+ initialise_elements(ws, ws + 1, args);
+ }
+
+ template <class S, class T=Nothing>
+ static size_t get_sizeof_workspace(const WorkspaceArgs<S, T> &args)
+ {
+ return sizeof(WorkspaceType) + get_element_sizes(args);
+ }
+
+ template <class S, class T>
+ static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &args)
+ {
+ return Element::get_element_size(args) + Workspace<Elements...>::get_element_sizes(args);
+ }
+
+ template <class WorkspaceType, class S, class T>
+ static void initialise_elements(WorkspaceType *ws, void *buffer, const WorkspaceArgs<S, T> &args)
+ {
+ buffer = Element::initialise(ws, buffer, args); // Get the next buffer
+ Workspace<Elements...>::initialise_elements(ws, buffer, args);
+ }
+};
+
+template <>
+class Workspace<>
+{
+ public:
+ struct WorkspaceType
+ {
+ };
+
+ template <class S, class T>
+ static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &)
+ {
+ return 0;
+ }
+
+ template <class WorkspaceType, class S, class T>
+ static void initialise_elements(WorkspaceType *, void *, const WorkspaceArgs<S, T> &)
+ {
+ }
+};
+
+} // namespace {anonymous}
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
new file mode 100644
index 0000000000..d0e8639229
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "pooling.hpp"
+#include "utils.hpp"
+
+namespace arm_conv {
+namespace pooling {
+
+class IDepthfirstStrategy
+{
+ public:
+ virtual ~IDepthfirstStrategy() = default;
+
+ virtual unsigned int get_input_rows() const = 0;
+ virtual unsigned int get_input_cols() const = 0;
+
+ virtual unsigned int get_output_rows() const = 0;
+ virtual unsigned int get_output_cols() const = 0;
+};
+
+
+template <typename T>
+struct TensorSpec
+{
+ T base;
+ size_t ld_row, ld_col;
+
+ TensorSpec(T ptr, size_t ld_row, size_t ld_col)
+ : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
+};
+
+
+template <typename TInput, typename TOutput>
+class DepthfirstDriver : public PoolingCommon<TInput, TOutput>
+{
+ protected:
+ using Parent = PoolingCommon<TInput, TOutput>;
+
+ // The strategy which we're applying to solve the pooling problem.
+ std::unique_ptr<const IDepthfirstStrategy> m_strat;
+
+ /* Compute the amount of working space required for a single thread. */
+ virtual size_t get_working_size_per_thread() const = 0;
+
+ /* Initialise the working space for a thread. */
+ virtual void initialise_working_space(void *) const = 0;
+
+ /* Compute a portion of the output tensor with padding. */
+ virtual void compute_tile_padded(
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
+ ) const = 0;
+
+ /* Compute a portion of the work with only top/bottom padding.
+ *
+ * The default implementation of this repeatedly calls into the padded tile
+ * variant.
+ */
+ virtual void compute_row_padded_tile_row(
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int output_channel_start, const unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
+ ) const
+ {
+ for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
+ {
+ this->compute_tile_padded(
+ output_i, output_j, output_channel_start, output_channel_end,
+ input, output, working_space
+ );
+ }
+ }
+
+ /* Compute a portion of the output tensor with no padding.
+ *
+ * The default implementation of this repeatedly calls into the padded
+ * variant.
+ */
+ virtual void compute_tiles_unpadded(
+ unsigned int start_output_i, unsigned int start_output_j,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
+ ) const
+ {
+ for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
+ {
+ this->compute_row_padded_tile_row(
+ start_output_i, start_output_j, n_tile_cols,
+ output_channel_start, output_channel_end,
+ input, output, working_space
+ );
+ start_output_i += m_strat->get_output_rows();
+ }
+ }
+
+ void execute_internal(
+ unsigned int n_batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int n_channels,
+ const PaddingValues &padding,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads
+ ) const override
+ {
+ // Get and initialise the working space for this thread.
+ void *thread_working_space =
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
+ this->initialise_working_space(thread_working_space);
+
+ // Construct convenient representations of the input/output tensors.
+ TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
+ TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);
+
+ // If the output is a 1x1 tensor, which commonly occurs at the end of a
+ // network, then we change the threading strategy to parallelise over
+ // channels rather than rows of the tensor.
+ if (n_threads > 1 && output_height == 1 && output_width == 1)
+ {
+ // Determine how many channels should be assigned to each thread; we
+ // round up first to ensure we get a reasonable spread across the
+ // threads.
+ const auto channels_per_thread = arm_gemm::roundup(arm_gemm::roundup(n_channels, 16u), n_threads) / n_threads;
+ const auto start_channel = thread_id * channels_per_thread;
+ const auto end_channel = std::min(start_channel + channels_per_thread, n_channels);
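+ // Illustrative arithmetic: with n_channels = 100 and n_threads = 4,
+ // roundup(100, 16) = 112 and roundup(112, 4) / 4 = 28 channels per
+ // thread; the final thread is clamped to channels [84, 100).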
+
+ if (start_channel >= end_channel)
+ {
+ // This thread should move on if we have insufficient work to do.
+ return;
+ }
+
+ for (; n_batches; n_batches--)
+ {
+ // We know we don't need to iterate over rows or columns here, so just
+ // execute the tile.
+ this->compute_tile_padded(
+ 0, 0, // Compute the only output point
+ start_channel, end_channel,
+ input_tensor, output_tensor, thread_working_space
+ );
+
+ // Progress the pointers for the next batch.
+ input_tensor.base += ld_input_batch;
+ output_tensor.base += ld_output_batch;
+ }
+
+ // Exit here, since we've done all the work using the channel-parallel strategy.
+ return;
+ }
+
+ for (unsigned int batch = 0; batch < n_batches; batch++)
+ {
+ // Iterate over rows of the output tensor; we stripe over the tiles.
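+ // (Illustrative: with n_threads = 2 and two output rows per tile, thread
+ // 0 handles output rows 0-1, 4-5, 8-9, ... and thread 1 rows 2-3, 6-7, ...)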
+ for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
+ start_output_i < output_height;
+ start_output_i += n_threads * m_strat->get_output_rows())
+ {
+ // Determine what padding (if any) is required on the top/bottom of
+ // this row of the convolution.
+ const auto end_output_i = start_output_i + m_strat->get_output_rows();
+ const bool pad_output_bottom = output_height < end_output_i;
+
+ const int start_input_i = start_output_i * this->m_args.pool_stride.rows - padding.top;
+ const bool pad_input_top = start_input_i < 0;
+ const int end_input_i = start_input_i + m_strat->get_input_rows();
+ const bool pad_input_bottom = static_cast<int>(input_height) < end_input_i;
+ const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom;
+
+ // Iterate over the columns of the output tensor; we attempt to grab as
+ // much of the unpadded region as possible, so the loop structure is a
+ // bit odd.
+ unsigned int start_output_j = 0;
+ while (start_output_j < output_width)
+ {
+ const int start_in_j = start_output_j * this->m_args.pool_stride.cols - padding.left;
+ const bool pad_input_left = start_in_j < 0;
+
+ // Determine if we can process a number of unpadded tiles in one go.
+ int n_unpadded_tiles = 0;
+ if (!pad_input_left)
+ {
+ // Determine the maximum number of tiles we could handle.
+ n_unpadded_tiles = (output_width - start_output_j) / m_strat->get_output_cols();
+
+ // Handle padding on the right hand edge
+ const int tile_stride = m_strat->get_output_cols() * this->m_args.pool_stride.cols;
+ int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
+ int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;
+
+ while (n_unpadded_tiles > 0 &&
+ (static_cast<int>(output_width) < end_output_j ||
+ static_cast<int>(input_width) < end_input_j))
+ {
+ n_unpadded_tiles--;
+ end_output_j -= m_strat->get_output_cols();
+ end_input_j -= tile_stride;
+ }
+ }
+
+ // Process unpadded tiles, if possible, otherwise process a padded tile.
+ if (n_unpadded_tiles)
+ {
+ if (!pad_row)
+ {
+ // Completely unpadded execution
+ this->compute_tiles_unpadded(
+ start_output_i, start_output_j,
+ 1, n_unpadded_tiles, // Compute a row of unpadded tiles
+ 0, n_channels, // Compute all channels
+ input_tensor, output_tensor, thread_working_space
+ );
+ }
+ else
+ {
+ // Top/bottom padding only
+ this->compute_row_padded_tile_row(
+ start_output_i, start_output_j, n_unpadded_tiles,
+ 0, n_channels, // Compute all channels
+ input_tensor, output_tensor, thread_working_space
+ );
+ }
+ start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
+ }
+ else
+ {
+ this->compute_tile_padded(
+ start_output_i, start_output_j,
+ 0, n_channels, // Compute all channels
+ input_tensor, output_tensor, thread_working_space
+ );
+ start_output_j += m_strat->get_output_cols();
+ }
+ }
+ }
+
+ // Progress the pointers for the next batch.
+ input_tensor.base += ld_input_batch;
+ output_tensor.base += ld_output_batch;
+ }
+ }
+
+ public:
+ DepthfirstDriver(const IDepthfirstStrategy *strategy, const PoolingArgs &args)
+ : Parent(args), m_strat(strategy)
+ {
+ }
+
+ size_t get_working_size(unsigned int n_threads) const override final
+ {
+ return n_threads * this->get_working_size_per_thread();
+ }
+};
+
+} // namespace pooling
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 178db4a0b0..6b3ebe6664 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
void a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst
+struct a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
- typedef void (*kern_type)(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
+ a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 3; }
- constexpr static unsigned int pool_cols(void) { return 3; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl;
-
- a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
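
The header hunk above shows the refactoring pattern repeated across these kernels: per-kernel typedefs and constexpr accessor functions become static data members plus a constructor call into a shared DepthfirstStrategy<TInput, TOutput> parent, and the kernel pointer is returned from a get_kernel() method instead of being stored in a public field. A hedged sketch of that parent's shape, with simplified names rather than the real ACL declarations:

    template <typename TInput, typename TOutput>
    struct DepthfirstStrategySketch
    {
      // Mirrors the kernel signature seen in the hunks above.
      using KernelType = void (*)(unsigned int, const TInput *const *const, TOutput *const *const,
                                  bool, unsigned int, unsigned int, unsigned int, unsigned int);

      unsigned int pool_rows, pool_cols, stride_rows, stride_cols, out_rows, out_cols;

      DepthfirstStrategySketch(unsigned int pr, unsigned int pc, unsigned int sr,
                               unsigned int sc, unsigned int outr, unsigned int outc)
        : pool_rows(pr), pool_cols(pc), stride_rows(sr), stride_cols(sc),
          out_rows(outr), out_cols(outc) {}

      virtual ~DepthfirstStrategySketch() = default;
      virtual KernelType get_kernel() const = 0;
    };

Moving the geometry into the base object lets the driver query any strategy uniformly instead of dispatching on concrete kernel types.
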
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 89dbf5ce02..5df848d1dd 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
@@ -82,174 +82,173 @@ void a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x4, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr d7, [%x[args], %[offsetof_rescale]]\n"
+ "ldr x3, [%x[args], %[offsetof_n_channels]]\n"
+ "cmp x3, #0x8\n"
+ "mov x4, #0x0\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x5, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x6, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x4, #0x8\n"
- "ldp x7, x8, [x20, #0x0]\n"
- "ldp x17, x16, [x20, #0x10]\n"
- "ldp x15, x14, [x19, #0x0]\n"
- "ldp x13, x12, [x19, #0x10]\n"
- "ldp x11, x10, [x19, #0x20]\n"
- "ldp x9, x28, [x19, #0x30]\n"
- "ldp x27, x26, [x19, #0x40]\n"
- "ldp x25, x24, [x19, #0x50]\n"
- "ldp x23, x22, [x19, #0x60]\n"
- "ldp x21, x20, [x19, #0x70]\n"
- "ldr d8, [%x[args], %[offsetof_rescale]]\n"
+ "ldp x6, x7, [x21, #0x0]\n"
+ "ldp x8, x17, [x21, #0x10]\n"
+ "ldp x16, x15, [x20, #0x0]\n"
+ "ldp x14, x13, [x20, #0x10]\n"
+ "ldp x12, x11, [x20, #0x20]\n"
+ "ldp x10, x9, [x20, #0x30]\n"
+ "ldp x28, x27, [x20, #0x40]\n"
+ "ldp x26, x25, [x20, #0x50]\n"
+ "ldp x24, x23, [x20, #0x60]\n"
+ "ldp x22, x21, [x20, #0x70]\n"
"blt 3f\n"
- "ldr q7, [x10, x5]\n"
- "lsr x19, x4, #0x3\n"
- "ldr q6, [x9, x5]\n"
- "sub x4, x4, x19, LSL #3\n"
- "ldr q5, [x26, x5]\n"
- "subs x19, x19, #0x1\n"
- "ldr q4, [x25, x5]\n"
- "ldr q3, [x14, x5]\n"
- "ldr q2, [x13, x5]\n"
- "ldr q1, [x11, x5]\n"
- "ldr q0, [x27, x5]\n"
- "ldr q31, [x28, x5]\n"
- "ldr q30, [x24, x5]\n"
- "ldr q29, [x22, x5]\n"
- "ldr q28, [x21, x5]\n"
- "ldr q27, [x15, x5]\n"
- "ldr q26, [x12, x5]\n"
- "ldr q25, [x23, x5]\n"
- "ldr q24, [x20, x5]\n"
- "add x5, x5, #0x10\n"
+ "ldr q6, [x11, x4]\n"
+ "ldr q5, [x10, x4]\n"
+ "lsr x20, x3, #0x3\n"
+ "sub x3, x3, x20, LSL #3\n"
+ "ldr q4, [x27, x4]\n"
+ "ldr q3, [x26, x4]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q2, [x15, x4]\n"
+ "ldr q1, [x14, x4]\n"
+ "ldr q0, [x12, x4]\n"
+ "ldr q31, [x28, x4]\n"
+ "ldr q30, [x9, x4]\n"
+ "ldr q29, [x25, x4]\n"
+ "ldr q28, [x23, x4]\n"
+ "ldr q27, [x22, x4]\n"
+ "ldr q26, [x16, x4]\n"
+ "ldr q25, [x13, x4]\n"
+ "ldr q24, [x24, x4]\n"
+ "ldr q23, [x21, x4]\n"
+ "add x4, x4, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
- "fadd v17.8h, v7.8h, v6.8h\n"
- "ldr q7, [x10, x5]\n"
- "subs x19, x19, #0x1\n"
- "fadd v16.8h, v5.8h, v4.8h\n"
- "ldr q6, [x9, x5]\n"
- "fadd v18.8h, v3.8h, v2.8h\n"
- "ldr q5, [x26, x5]\n"
- "fadd v23.8h, v1.8h, v0.8h\n"
- "ldr q4, [x25, x5]\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "ldr q3, [x14, x5]\n"
- "fadd v17.8h, v17.8h, v16.8h\n"
- "ldr q2, [x13, x5]\n"
- "fadd v16.8h, v29.8h, v28.8h\n"
- "ldr q1, [x11, x5]\n"
- "fadd v19.8h, v27.8h, v23.8h\n"
- "ldr q0, [x27, x5]\n"
- "fadd v21.8h, v18.8h, v17.8h\n"
- "ldr q31, [x28, x5]\n"
- "fadd v20.8h, v16.8h, v17.8h\n"
- "ldr q30, [x24, x5]\n"
- "fadd v18.8h, v26.8h, v22.8h\n"
- "ldr q29, [x22, x5]\n"
- "fadd v17.8h, v25.8h, v23.8h\n"
- "ldr q28, [x21, x5]\n"
- "fadd v16.8h, v24.8h, v22.8h\n"
- "ldr q27, [x15, x5]\n"
+ "fadd v17.8h, v6.8h, v5.8h\n"
+ "ldr q6, [x11, x4]\n"
+ "ldr q5, [x10, x4]\n"
+ "fadd v16.8h, v4.8h, v3.8h\n"
+ "ldr q4, [x27, x4]\n"
+ "ldr q3, [x26, x4]\n"
+ "fadd v19.8h, v17.8h, v16.8h\n"
+ "fadd v18.8h, v2.8h, v1.8h\n"
+ "ldr q2, [x15, x4]\n"
+ "ldr q1, [x14, x4]\n"
+ "fadd v17.8h, v0.8h, v31.8h\n"
+ "fadd v22.8h, v30.8h, v29.8h\n"
+ "ldr q0, [x12, x4]\n"
+ "ldr q31, [x28, x4]\n"
+ "fadd v16.8h, v28.8h, v27.8h\n"
+ "fadd v21.8h, v18.8h, v19.8h\n"
+ "ldr q30, [x9, x4]\n"
+ "ldr q29, [x25, x4]\n"
+ "fadd v20.8h, v16.8h, v19.8h\n"
+ "fadd v19.8h, v26.8h, v17.8h\n"
+ "ldr q28, [x23, x4]\n"
+ "ldr q27, [x22, x4]\n"
+ "fadd v18.8h, v25.8h, v22.8h\n"
+ "fadd v17.8h, v24.8h, v17.8h\n"
+ "ldr q26, [x16, x4]\n"
+ "ldr q25, [x13, x4]\n"
+ "fadd v16.8h, v23.8h, v22.8h\n"
"fadd v19.8h, v21.8h, v19.8h\n"
- "ldr q26, [x12, x5]\n"
+ "ldr q24, [x24, x4]\n"
+ "ldr q23, [x21, x4]\n"
"fadd v18.8h, v21.8h, v18.8h\n"
- "ldr q25, [x23, x5]\n"
"fadd v17.8h, v17.8h, v20.8h\n"
- "ldr q24, [x20, x5]\n"
+ "fadd v16.8h, v16.8h, v20.8h\n"
+ "subs x20, x20, #0x1\n"
+ "fmul v19.8h, v19.8h, v7.h[0]\n"
+ "add x4, x4, #0x10\n"
+ "fmul v18.8h, v18.8h, v7.h[1]\n"
+ "fmul v17.8h, v17.8h, v7.h[2]\n"
+ "str q19, [x6, x5]\n"
+ "fmul v16.8h, v16.8h, v7.h[3]\n"
+ "str q18, [x7, x5]\n"
+ "str q17, [x8, x5]\n"
+ "str q16, [x17, x5]\n"
"add x5, x5, #0x10\n"
- "fadd v16.8h, v20.8h, v16.8h\n"
- "fmul v19.8h, v19.8h, v8.h[0]\n"
- "str q19, [x7, x6]\n"
- "fmul v18.8h, v18.8h, v8.h[1]\n"
- "fmul v17.8h, v17.8h, v8.h[2]\n"
- "str q18, [x8, x6]\n"
- "fmul v16.8h, v16.8h, v8.h[3]\n"
- "str q17, [x17, x6]\n"
- "str q16, [x16, x6]\n"
- "add x6, x6, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
- "fadd v17.8h, v7.8h, v6.8h\n"
- "fadd v16.8h, v5.8h, v4.8h\n"
- "fadd v18.8h, v3.8h, v2.8h\n"
- "fadd v23.8h, v1.8h, v0.8h\n"
- "fadd v17.8h, v17.8h, v16.8h\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "fadd v16.8h, v29.8h, v28.8h\n"
- "fadd v21.8h, v18.8h, v17.8h\n"
- "fadd v19.8h, v27.8h, v23.8h\n"
- "fadd v20.8h, v16.8h, v17.8h\n"
- "fadd v18.8h, v26.8h, v22.8h\n"
- "fadd v17.8h, v25.8h, v23.8h\n"
- "fadd v16.8h, v24.8h, v22.8h\n"
+ "fadd v17.8h, v6.8h, v5.8h\n"
+ "fadd v16.8h, v4.8h, v3.8h\n"
+ "fadd v19.8h, v17.8h, v16.8h\n"
+ "fadd v18.8h, v2.8h, v1.8h\n"
+ "fadd v17.8h, v0.8h, v31.8h\n"
+ "fadd v22.8h, v30.8h, v29.8h\n"
+ "fadd v16.8h, v28.8h, v27.8h\n"
+ "fadd v21.8h, v18.8h, v19.8h\n"
+ "fadd v20.8h, v16.8h, v19.8h\n"
+ "fadd v19.8h, v26.8h, v17.8h\n"
+ "fadd v18.8h, v25.8h, v22.8h\n"
+ "fadd v17.8h, v24.8h, v17.8h\n"
+ "fadd v16.8h, v23.8h, v22.8h\n"
"fadd v19.8h, v21.8h, v19.8h\n"
"fadd v18.8h, v21.8h, v18.8h\n"
"fadd v17.8h, v17.8h, v20.8h\n"
- "fadd v16.8h, v20.8h, v16.8h\n"
- "fmul v19.8h, v19.8h, v8.h[0]\n"
- "str q19, [x7, x6]\n"
- "fmul v18.8h, v18.8h, v8.h[1]\n"
- "fmul v17.8h, v17.8h, v8.h[2]\n"
- "str q18, [x8, x6]\n"
- "fmul v16.8h, v16.8h, v8.h[3]\n"
- "str q17, [x17, x6]\n"
- "str q16, [x16, x6]\n"
- "add x6, x6, #0x10\n"
- "cbz x4, 4f\n"
+ "fadd v16.8h, v16.8h, v20.8h\n"
+ "fmul v19.8h, v19.8h, v7.h[0]\n"
+ "str q19, [x6, x5]\n"
+ "fmul v18.8h, v18.8h, v7.h[1]\n"
+ "fmul v17.8h, v17.8h, v7.h[2]\n"
+ "str q18, [x7, x5]\n"
+ "fmul v16.8h, v16.8h, v7.h[3]\n"
+ "str q17, [x8, x5]\n"
+ "str q16, [x17, x5]\n"
+ "add x5, x5, #0x10\n"
+ "cbz x3, 4f\n"
"3:" // Oddments
- "ldr h7, [x10, x5]\n"
- "subs x4, x4, #0x1\n"
- "ldr h6, [x9, x5]\n"
- "fadd v17.8h, v7.8h, v6.8h\n"
- "ldr h5, [x26, x5]\n"
- "ldr h4, [x25, x5]\n"
- "fadd v16.8h, v5.8h, v4.8h\n"
- "ldr h3, [x14, x5]\n"
- "ldr h2, [x13, x5]\n"
- "fadd v17.8h, v17.8h, v16.8h\n"
- "ldr h1, [x11, x5]\n"
- "ldr h0, [x27, x5]\n"
- "fadd v18.8h, v3.8h, v2.8h\n"
- "ldr h31, [x28, x5]\n"
- "fadd v23.8h, v1.8h, v0.8h\n"
- "ldr h30, [x24, x5]\n"
- "fadd v21.8h, v18.8h, v17.8h\n"
- "ldr h29, [x22, x5]\n"
- "ldr h28, [x21, x5]\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "ldr h27, [x15, x5]\n"
- "ldr h26, [x12, x5]\n"
- "fadd v16.8h, v29.8h, v28.8h\n"
- "ldr h25, [x23, x5]\n"
- "fadd v20.8h, v16.8h, v17.8h\n"
- "ldr h24, [x20, x5]\n"
- "add x5, x5, #0x2\n"
- "fadd v19.8h, v27.8h, v23.8h\n"
- "fadd v18.8h, v26.8h, v22.8h\n"
- "fadd v17.8h, v25.8h, v23.8h\n"
- "fadd v16.8h, v24.8h, v22.8h\n"
- "fadd v19.8h, v21.8h, v19.8h\n"
- "fadd v18.8h, v21.8h, v18.8h\n"
+ "ldr h17, [x11, x4]\n"
+ "ldr h16, [x10, x4]\n"
+ "fadd v18.8h, v17.8h, v16.8h\n"
+ "subs x3, x3, #0x1\n"
+ "ldr h17, [x27, x4]\n"
+ "ldr h16, [x26, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v18.8h, v18.8h, v16.8h\n"
+ "ldr h17, [x15, x4]\n"
+ "ldr h16, [x14, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v23.8h, v16.8h, v18.8h\n"
+ "ldr h17, [x12, x4]\n"
+ "ldr h16, [x28, x4]\n"
+ "fadd v22.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x9, x4]\n"
+ "ldr h16, [x25, x4]\n"
+ "fadd v21.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x23, x4]\n"
+ "ldr h16, [x22, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v20.8h, v16.8h, v18.8h\n"
+ "ldr h17, [x16, x4]\n"
+ "ldr h16, [x13, x4]\n"
+ "fadd v19.8h, v17.8h, v22.8h\n"
+ "fadd v18.8h, v16.8h, v21.8h\n"
+ "ldr h17, [x24, x4]\n"
+ "ldr h16, [x21, x4]\n"
+ "fadd v17.8h, v17.8h, v22.8h\n"
+ "fadd v16.8h, v16.8h, v21.8h\n"
+ "fadd v19.8h, v23.8h, v19.8h\n"
+ "fadd v18.8h, v23.8h, v18.8h\n"
+ "add x4, x4, #0x2\n"
"fadd v17.8h, v17.8h, v20.8h\n"
- "fadd v16.8h, v20.8h, v16.8h\n"
- "fmul v19.8h, v19.8h, v8.h[0]\n"
- "str h19, [x7, x6]\n"
- "fmul v18.8h, v18.8h, v8.h[1]\n"
- "fmul v17.8h, v17.8h, v8.h[2]\n"
- "str h18, [x8, x6]\n"
- "fmul v16.8h, v16.8h, v8.h[3]\n"
- "str h17, [x17, x6]\n"
- "str h16, [x16, x6]\n"
- "add x6, x6, #0x2\n"
+ "fadd v16.8h, v16.8h, v20.8h\n"
+ "fmul v19.8h, v19.8h, v7.h[0]\n"
+ "fmul v18.8h, v18.8h, v7.h[1]\n"
+ "str h19, [x6, x5]\n"
+ "fmul v17.8h, v17.8h, v7.h[2]\n"
+ "fmul v16.8h, v16.8h, v7.h[3]\n"
+ "str h18, [x7, x5]\n"
+ "str h17, [x8, x5]\n"
+ "str h16, [x17, x5]\n"
+ "add x5, x5, #0x2\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
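
Beyond the new __aarch64__ guard, the regenerated assembly renumbers its scratch registers so that x19 is no longer touched (the clobber list now starts at x20) and moves the rescale constants from v8 to v7. In GCC/Clang extended asm, every register the body writes must be declared in the clobber list so the compiler can preserve it, which is why the renumbering also rewrites that list. A minimal AArch64-only illustration of the contract (not ACL code):

    #include <cstdint>

    uint64_t double_plus_one(uint64_t x)
    {
        uint64_t out;
        __asm__ __volatile__(
            "mov x20, %x[in]\n"
            "add x20, x20, x20\n"
            "add %x[res], x20, #0x1\n"
            : [res] "=r" (out)
            : [in] "r" (x)
            : "x20" /* scratch register written by the body, so it is clobbered */);
        return out;
    }
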
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp
index 9dc153a764..25e7af1cee 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_fp16_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-struct a64_fp16_nhwc_avg_generic_depthfirst
+struct a64_fp16_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_fp16_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
a64_fp16_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_fp16_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
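
Unlike the fixed-window strategies, the generic strategy carries no geometry: the kernel itself receives the window size, the number of valid (unpadded) cells, the channel count, the array of input pointers and the output pointer. A hedged scalar reference for what the average variant computes, using float for portability since __fp16 is target-specific (the 1 / window_cells factor matches the rescale_value set up in generic.cpp below):

    #include <cstdint>

    // Scalar reference: sum the valid cells per channel, then scale by
    // 1 / window_cells, so cells lost to padding still count in the divisor.
    void avg_generic_ref(uint64_t window_cells, uint64_t n_valid_cells,
                         uint64_t n_channels, const float *const *inptrs, float *outptr)
    {
        const float rescale = 1.0f / static_cast<float>(window_cells);
        for (uint64_t c = 0; c < n_channels; c++)
        {
            float acc = 0.0f;
            for (uint64_t i = 0; i < n_valid_cells; i++)
            {
                acc += inptrs[i][c];
            }
            outptr[c] = acc * rescale;
        }
    }
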
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index 5bef7f2bf4..f7be92e53f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@@ -41,308 +42,306 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<__fp16>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
- "ld1r { v8.8h }, [%x[rescale_ptr]]\n"
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
+ "ld1r { v9.8h }, [%x[rescale_ptr]]\n"
"cmp %x[n_channels], #0x20\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v5.16b, #0x0\n"
- "movi v4.16b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd v23.8h, v3.8h, v2.8h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "ldr q3, [x23, x28]\n"
- "fadd v18.8h, v29.8h, v28.8h\n"
- "fadd v21.8h, v27.8h, v21.8h\n"
- "ldr q2, [x22, x28]\n"
- "fadd v17.8h, v26.8h, v17.8h\n"
- "ldr q1, [x21, x28]\n"
- "fadd v20.8h, v25.8h, v20.8h\n"
- "ldr q0, [x20, x28]\n"
- "fadd v16.8h, v24.8h, v16.8h\n"
- "ldr q31, [x23, x27]\n"
+ "fadd v23.8h, v4.8h, v3.8h\n"
+ "fadd v19.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v22.8h, v2.8h, v1.8h\n"
+ "ldr q2, [x21, x26]\n"
+ "fadd v18.8h, v27.8h, v21.8h\n"
+ "ldr q1, [x20, x26]\n"
+ "fadd v21.8h, v0.8h, v31.8h\n"
+ "ldr q0, [x21, x24]\n"
+ "fadd v17.8h, v26.8h, v20.8h\n"
+ "ldr q31, [x20, x24]\n"
+ "fadd v20.8h, v30.8h, v29.8h\n"
+ "ldr q30, [x21, x23]\n"
+ "fadd v16.8h, v25.8h, v24.8h\n"
+ "ldr q29, [x20, x23]\n"
"fadd v19.8h, v23.8h, v19.8h\n"
- "ldr q30, [x22, x27]\n"
"fadd v18.8h, v22.8h, v18.8h\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fadd v17.8h, v21.8h, v17.8h\n"
- "ldr q28, [x20, x27]\n"
"fadd v16.8h, v20.8h, v16.8h\n"
- "ldr q27, [x23, x26]\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
- "ldr q21, [x22, x26]\n"
- "fadd v6.8h, v6.8h, v18.8h\n"
- "ldr q26, [x21, x26]\n"
- "fadd v5.8h, v5.8h, v17.8h\n"
- "ldr q17, [x20, x26]\n"
- "fadd v4.8h, v4.8h, v16.8h\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd v8.8h, v8.8h, v19.8h\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "fadd v7.8h, v7.8h, v18.8h\n"
+ "fadd v6.8h, v6.8h, v17.8h\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "fadd v5.8h, v5.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd v23.8h, v3.8h, v2.8h\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "fadd v18.8h, v29.8h, v28.8h\n"
- "fadd v21.8h, v27.8h, v21.8h\n"
- "fadd v17.8h, v26.8h, v17.8h\n"
- "fadd v20.8h, v25.8h, v20.8h\n"
- "fadd v16.8h, v24.8h, v16.8h\n"
+ "fadd v23.8h, v4.8h, v3.8h\n"
+ "fadd v19.8h, v28.8h, v22.8h\n"
+ "fadd v22.8h, v2.8h, v1.8h\n"
+ "fadd v18.8h, v27.8h, v21.8h\n"
+ "fadd v21.8h, v0.8h, v31.8h\n"
+ "fadd v17.8h, v26.8h, v20.8h\n"
+ "fadd v20.8h, v30.8h, v29.8h\n"
+ "fadd v16.8h, v25.8h, v24.8h\n"
"fadd v19.8h, v23.8h, v19.8h\n"
"fadd v18.8h, v22.8h, v18.8h\n"
"fadd v17.8h, v21.8h, v17.8h\n"
"fadd v16.8h, v20.8h, v16.8h\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
- "fadd v6.8h, v6.8h, v18.8h\n"
- "fadd v5.8h, v5.8h, v17.8h\n"
- "fadd v4.8h, v4.8h, v16.8h\n"
+ "fadd v8.8h, v8.8h, v19.8h\n"
+ "fadd v7.8h, v7.8h, v18.8h\n"
+ "fadd v6.8h, v6.8h, v17.8h\n"
+ "fadd v5.8h, v5.8h, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fadd v7.8h, v7.8h, v3.8h\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "fadd v6.8h, v6.8h, v31.8h\n"
- "ldr q25, [x23, x25]\n"
- "fadd v5.8h, v5.8h, v27.8h\n"
- "fadd v4.8h, v4.8h, v25.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fadd v7.8h, v7.8h, v17.8h\n"
+ "fadd v6.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x20, x23]\n"
+ "fadd v5.8h, v5.8h, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul v7.8h, v7.8h, v8.8h\n"
- "str q7, [%x[outptr], x28]\n"
- "fmul v6.8h, v6.8h, v8.8h\n"
- "add x28, x28, #0x40\n"
- "fmul v5.8h, v5.8h, v8.8h\n"
- "str q6, [%x[outptr], x27]\n"
- "fmul v4.8h, v4.8h, v8.8h\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x20\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"cmp %x[n_channels], #0x20\n"
+ "fmul v8.8h, v8.8h, v9.8h\n"
+ "fmul v7.8h, v7.8h, v9.8h\n"
+ "fmul v6.8h, v6.8h, v9.8h\n"
+ "fmul v5.8h, v5.8h, v9.8h\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 31f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x8\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd v23.8h, v3.8h, v2.8h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd v23.8h, v3.8h, v2.8h\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fadd v7.8h, v7.8h, v3.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul v7.8h, v7.8h, v8.8h\n"
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x8\n"
"cmp %x[n_channels], #0x8\n"
+ "fmul v8.8h, v8.8h, v9.8h\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 31f\n"
"14:" // Oddments
- "movi v7.16b, #0x0\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 20f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 20f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 2: End
- "fadd v23.8h, v3.8h, v2.8h\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
+ "subs x25, x25, #0x1\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"bgt 15b\n"
"20:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 26f\n"
"21:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #2, 23f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"b 25f\n"
"22:" // Oddments: Single input loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"b 25f\n"
"23:" // Oddments: Single input loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 24f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"b 25f\n"
"24:" // Oddments: Single input loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"25:" // Oddments: Single input loop: Load: Bit 2: End
- "fadd v7.8h, v7.8h, v3.8h\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.8h, v8.8h, v4.8h\n"
"bgt 21b\n"
"26:" // Oddments: Single input loop: End
- "fmul v7.8h, v7.8h, v8.8h\n"
+ "fmul v8.8h, v8.8h, v9.8h\n"
"tbz %x[n_channels], #2, 28f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #1, 27f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
"b 30f\n"
"27:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
"b 30f\n"
"28:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 29f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
"b 30f\n"
"29:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
"30:" // Oddments: Store: Bit 2: End
-
"31:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
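
A structural note on the assembly above: channels are consumed in blocks of 32 (four q registers), then blocks of 8, and finally the "Oddments" paths handle a remainder below 8 by testing individual bits of n_channels, which is what the tbz cascades labelled "Bit 2 / Bit 1 / Bit 0" implement. The same remainder logic in C++ (uint16_t standing in for the fp16 storage type):

    #include <cstdint>

    // Hedged sketch of the tbz oddment cascade for a remainder n < 8:
    // bit 2 moves four halfwords, bit 1 two, bit 0 one.
    void copy_oddments(const std::uint16_t *in, std::uint16_t *out, unsigned int n)
    {
        if (n & 4u) { for (int i = 0; i < 4; i++) { *out++ = *in++; } }  // tbz #2
        if (n & 2u) { for (int i = 0; i < 2; i++) { *out++ = *in++; } }  // tbz #1
        if (n & 1u) { *out++ = *in++; }                                  // tbz #0
    }
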
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 9950bb8cdb..b65ac7e9fa 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
void a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst
+struct a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
- typedef void (*kern_type)(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 1c461ee163..4b073b9076 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
@@ -63,116 +63,115 @@ void a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
- "mov x14, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x15, #0x8\n"
- "ldp x12, x11, [x20, #0x0]\n"
- "ldp x10, x9, [x20, #0x10]\n"
- "ldp x28, x27, [x19, #0x0]\n"
- "ldp x26, x25, [x19, #0x10]\n"
- "ldp x24, x23, [x19, #0x20]\n"
- "ldp x22, x21, [x19, #0x30]\n"
- "ldr x20, [x19, #0x40]\n"
+ "ldr x16, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "cmp x16, #0x8\n"
+ "mov x15, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [x21, #0x10]\n"
+ "ldp x9, x28, [x20, #0x0]\n"
+ "ldp x27, x26, [x20, #0x10]\n"
+ "ldp x25, x24, [x20, #0x20]\n"
+ "ldp x23, x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x40]\n"
"blt 3f\n"
- "ldr q30, [x27, x14]\n"
- "lsr x19, x15, #0x3\n"
- "ldr q29, [x24, x14]\n"
- "sub x15, x15, x19, LSL #3\n"
- "ldr q28, [x21, x14]\n"
- "subs x19, x19, #0x1\n"
- "ldr q27, [x25, x14]\n"
- "ldr q26, [x28, x14]\n"
- "ldr q25, [x23, x14]\n"
- "ldr q24, [x26, x14]\n"
- "ldr q23, [x22, x14]\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q30, [x28, x15]\n"
+ "ldr q29, [x25, x15]\n"
+ "lsr x20, x16, #0x3\n"
+ "sub x16, x16, x20, LSL #3\n"
+ "ldr q28, [x22, x15]\n"
+ "ldr q27, [x26, x15]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q26, [x9, x15]\n"
+ "ldr q25, [x27, x15]\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "ldr q22, [x21, x15]\n"
+ "add x15, x15, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
"fmax v21.8h, v30.8h, v29.8h\n"
- "ldr q30, [x27, x14]\n"
- "subs x19, x19, #0x1\n"
+ "ldr q30, [x28, x15]\n"
"fmax v20.8h, v29.8h, v28.8h\n"
- "ldr q29, [x24, x14]\n"
+ "ldr q29, [x25, x15]\n"
+ "ldr q28, [x22, x15]\n"
"fmax v19.8h, v27.8h, v26.8h\n"
- "ldr q28, [x21, x14]\n"
+ "ldr q26, [x9, x15]\n"
"fmax v18.8h, v25.8h, v24.8h\n"
- "ldr q26, [x28, x14]\n"
- "fmax v17.8h, v23.8h, v27.8h\n"
- "ldr q27, [x25, x14]\n"
- "fmax v16.8h, v25.8h, v22.8h\n"
- "ldr q25, [x23, x14]\n"
+ "ldr q25, [x27, x15]\n"
+ "fmax v17.8h, v27.8h, v23.8h\n"
+ "ldr q27, [x26, x15]\n"
+ "fmax v16.8h, v24.8h, v22.8h\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "subs x20, x20, #0x1\n"
"fmax v19.8h, v21.8h, v19.8h\n"
- "ldr q24, [x26, x14]\n"
- "fmax v18.8h, v21.8h, v18.8h\n"
- "ldr q23, [x22, x14]\n"
- "fmax v17.8h, v20.8h, v17.8h\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q22, [x21, x15]\n"
+ "fmax v18.8h, v18.8h, v21.8h\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
+ "add x15, x15, #0x10\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "str q19, [x12, x13]\n"
- "str q18, [x11, x13]\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
+ "str q19, [x14, x12]\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"fmax v21.8h, v30.8h, v29.8h\n"
"fmax v20.8h, v29.8h, v28.8h\n"
- "fmax v19.8h, v27.8h, v26.8h\n"
+ "fmax v16.8h, v27.8h, v26.8h\n"
"fmax v18.8h, v25.8h, v24.8h\n"
- "fmax v17.8h, v23.8h, v27.8h\n"
- "fmax v16.8h, v25.8h, v22.8h\n"
- "fmax v19.8h, v21.8h, v19.8h\n"
- "str q19, [x12, x13]\n"
- "fmax v18.8h, v21.8h, v18.8h\n"
- "fmax v17.8h, v20.8h, v17.8h\n"
- "str q18, [x11, x13]\n"
- "fmax v16.8h, v20.8h, v16.8h\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
- "cbz x15, 4f\n"
+ "fmax v17.8h, v27.8h, v23.8h\n"
+ "fmax v19.8h, v24.8h, v22.8h\n"
+ "fmax v16.8h, v21.8h, v16.8h\n"
+ "fmax v18.8h, v18.8h, v21.8h\n"
+ "str q16, [x14, x12]\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
+ "fmax v16.8h, v20.8h, v19.8h\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
+ "cbz x16, 4f\n"
"3:" // Oddments
- "ldr h30, [x27, x14]\n"
- "subs x15, x15, #0x1\n"
- "ldr h29, [x24, x14]\n"
- "fmax v21.8h, v30.8h, v29.8h\n"
- "ldr h28, [x21, x14]\n"
- "ldr h27, [x25, x14]\n"
- "fmax v20.8h, v29.8h, v28.8h\n"
- "ldr h26, [x28, x14]\n"
- "ldr h25, [x23, x14]\n"
- "fmax v19.8h, v27.8h, v26.8h\n"
- "ldr h24, [x26, x14]\n"
- "ldr h23, [x22, x14]\n"
- "fmax v19.8h, v21.8h, v19.8h\n"
- "ldr h22, [x20, x14]\n"
- "add x14, x14, #0x2\n"
- "fmax v18.8h, v25.8h, v24.8h\n"
- "str h19, [x12, x13]\n"
- "fmax v17.8h, v23.8h, v27.8h\n"
- "fmax v16.8h, v25.8h, v22.8h\n"
- "fmax v18.8h, v21.8h, v18.8h\n"
- "str h18, [x11, x13]\n"
- "fmax v17.8h, v20.8h, v17.8h\n"
- "fmax v16.8h, v20.8h, v16.8h\n"
- "str h17, [x10, x13]\n"
- "str h16, [x9, x13]\n"
- "add x13, x13, #0x2\n"
+ "ldr h16, [x28, x15]\n"
+ "ldr h17, [x25, x15]\n"
+ "fmax v23.8h, v16.8h, v17.8h\n"
+ "subs x16, x16, #0x1\n"
+ "ldr h16, [x22, x15]\n"
+ "ldr h22, [x26, x15]\n"
+ "fmax v21.8h, v17.8h, v16.8h\n"
+ "ldr h16, [x9, x15]\n"
+ "ldr h17, [x27, x15]\n"
+ "fmax v16.8h, v22.8h, v16.8h\n"
+ "fmax v20.8h, v23.8h, v16.8h\n"
+ "ldr h19, [x24, x15]\n"
+ "ldr h16, [x23, x15]\n"
+ "fmax v18.8h, v17.8h, v19.8h\n"
+ "fmax v17.8h, v22.8h, v16.8h\n"
+ "ldr h16, [x21, x15]\n"
+ "fmax v16.8h, v19.8h, v16.8h\n"
+ "add x15, x15, #0x2\n"
+ "fmax v18.8h, v18.8h, v23.8h\n"
+ "fmax v17.8h, v17.8h, v21.8h\n"
+ "fmax v16.8h, v21.8h, v16.8h\n"
+ "str h20, [x14, x12]\n"
+ "str h18, [x13, x12]\n"
+ "str h17, [x11, x12]\n"
+ "str h16, [x10, x12]\n"
+ "add x12, x12, #0x2\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
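
In the 2x2 stride-1 kernel above, the four output windows overlap on a 3x3 input patch, so the assembly computes each pairwise maximum once and reuses it for two outputs (v21 and v20 each feed two fmax results). One way to express that reuse per channel, as a hedged sketch rather than the exact pairing the generator chose:

    #include <algorithm>

    // Vertical pair maxima are formed once per column, then combined
    // horizontally, so every partial result serves two overlapping windows.
    void max_2x2_s1_ref(const float in[3][3], float out[2][2])
    {
        float m[2][3];
        for (int i = 0; i < 2; i++)
        {
            for (int j = 0; j < 3; j++)
            {
                m[i][j] = std::max(in[i][j], in[i + 1][j]);
            }
        }
        for (int i = 0; i < 2; i++)
        {
            for (int j = 0; j < 2; j++)
            {
                out[i][j] = std::max(m[i][j], m[i][j + 1]);
            }
        }
    }
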
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp
index 8bea0bf5df..4998b37b4b 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_fp16_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-struct a64_fp16_nhwc_max_generic_depthfirst
+struct a64_fp16_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_fp16_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
a64_fp16_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_fp16_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
index e5f7ee3c72..c92e2cdebd 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@@ -39,304 +40,302 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x20\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"mov w20, #0xfc00\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.8h, w20\n"
"dup v7.8h, w20\n"
- "mov x19, %x[inptrs]\n"
"dup v6.8h, w20\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"dup v5.8h, w20\n"
- "dup v4.8h, w20\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fmax v23.8h, v3.8h, v2.8h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax v22.8h, v31.8h, v30.8h\n"
- "ldr q3, [x23, x28]\n"
- "fmax v18.8h, v29.8h, v28.8h\n"
- "fmax v21.8h, v27.8h, v21.8h\n"
- "ldr q2, [x22, x28]\n"
- "fmax v17.8h, v26.8h, v17.8h\n"
- "ldr q1, [x21, x28]\n"
- "fmax v20.8h, v25.8h, v20.8h\n"
- "ldr q0, [x20, x28]\n"
- "fmax v16.8h, v24.8h, v16.8h\n"
- "ldr q31, [x23, x27]\n"
+ "fmax v23.8h, v4.8h, v3.8h\n"
+ "fmax v19.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v22.8h, v2.8h, v1.8h\n"
+ "ldr q2, [x21, x26]\n"
+ "fmax v18.8h, v27.8h, v21.8h\n"
+ "ldr q1, [x20, x26]\n"
+ "fmax v21.8h, v0.8h, v31.8h\n"
+ "ldr q0, [x21, x24]\n"
+ "fmax v17.8h, v26.8h, v20.8h\n"
+ "ldr q31, [x20, x24]\n"
+ "fmax v20.8h, v30.8h, v29.8h\n"
+ "ldr q30, [x21, x23]\n"
+ "fmax v16.8h, v25.8h, v24.8h\n"
+ "ldr q29, [x20, x23]\n"
"fmax v19.8h, v23.8h, v19.8h\n"
- "ldr q30, [x22, x27]\n"
"fmax v18.8h, v22.8h, v18.8h\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fmax v17.8h, v21.8h, v17.8h\n"
- "ldr q28, [x20, x27]\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "ldr q27, [x23, x26]\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
- "ldr q21, [x22, x26]\n"
- "fmax v6.8h, v6.8h, v18.8h\n"
- "ldr q26, [x21, x26]\n"
- "fmax v5.8h, v5.8h, v17.8h\n"
- "ldr q17, [x20, x26]\n"
- "fmax v4.8h, v4.8h, v16.8h\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax v8.8h, v8.8h, v19.8h\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "fmax v7.8h, v7.8h, v18.8h\n"
+ "fmax v6.8h, v6.8h, v17.8h\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "fmax v5.8h, v5.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fmax v23.8h, v3.8h, v2.8h\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "fmax v22.8h, v31.8h, v30.8h\n"
- "fmax v18.8h, v29.8h, v28.8h\n"
- "fmax v21.8h, v27.8h, v21.8h\n"
- "fmax v17.8h, v26.8h, v17.8h\n"
- "fmax v20.8h, v25.8h, v20.8h\n"
- "fmax v16.8h, v24.8h, v16.8h\n"
+ "fmax v23.8h, v4.8h, v3.8h\n"
+ "fmax v19.8h, v28.8h, v22.8h\n"
+ "fmax v22.8h, v2.8h, v1.8h\n"
+ "fmax v18.8h, v27.8h, v21.8h\n"
+ "fmax v21.8h, v0.8h, v31.8h\n"
+ "fmax v17.8h, v26.8h, v20.8h\n"
+ "fmax v20.8h, v30.8h, v29.8h\n"
+ "fmax v16.8h, v25.8h, v24.8h\n"
"fmax v19.8h, v23.8h, v19.8h\n"
"fmax v18.8h, v22.8h, v18.8h\n"
"fmax v17.8h, v21.8h, v17.8h\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
- "fmax v6.8h, v6.8h, v18.8h\n"
- "fmax v5.8h, v5.8h, v17.8h\n"
- "fmax v4.8h, v4.8h, v16.8h\n"
+ "fmax v8.8h, v8.8h, v19.8h\n"
+ "fmax v7.8h, v7.8h, v18.8h\n"
+ "fmax v6.8h, v6.8h, v17.8h\n"
+ "fmax v5.8h, v5.8h, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fmax v7.8h, v7.8h, v3.8h\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "fmax v6.8h, v6.8h, v31.8h\n"
- "ldr q25, [x23, x25]\n"
- "fmax v5.8h, v5.8h, v27.8h\n"
- "fmax v4.8h, v4.8h, v25.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fmax v7.8h, v7.8h, v17.8h\n"
+ "fmax v6.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x20, x23]\n"
+ "fmax v5.8h, v5.8h, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x20\n"
"cmp %x[n_channels], #0x20\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x27, x27, #0x40\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 31f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x8\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "mov w19, #0xfc00\n"
- "dup v7.8h, w19\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "mov w20, #0xfc00\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.8h, w20\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fmax v23.8h, v3.8h, v2.8h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fmax v23.8h, v3.8h, v2.8h\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fmax v7.8h, v7.8h, v3.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x8\n"
"cmp %x[n_channels], #0x8\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 31f\n"
"14:" // Oddments
- "add %x[outptr], %x[outptr], x28\n"
- "mov w19, #0xfc00\n"
- "dup v7.8h, w19\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 20f\n"
+ "mov w20, #0xfc00\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.8h, w20\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 20f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 2: End
- "fmax v23.8h, v3.8h, v2.8h\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
+ "subs x25, x25, #0x1\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"bgt 15b\n"
"20:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 26f\n"
"21:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #2, 23f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"b 25f\n"
"22:" // Oddments: Single input loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"b 25f\n"
"23:" // Oddments: Single input loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 24f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"b 25f\n"
"24:" // Oddments: Single input loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"25:" // Oddments: Single input loop: Load: Bit 2: End
- "fmax v7.8h, v7.8h, v3.8h\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.8h, v8.8h, v4.8h\n"
"bgt 21b\n"
"26:" // Oddments: Single input loop: End
"tbz %x[n_channels], #2, 28f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #1, 27f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
"b 30f\n"
"27:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
"b 30f\n"
"28:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 29f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
"b 30f\n"
"29:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
"30:" // Oddments: Store: Bit 2: End
-
"31:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
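
The max kernels seed their accumulators with "mov w20, #0xfc00" followed by dup: 0xfc00 is the IEEE 754 binary16 bit pattern for negative infinity (sign 1, exponent all ones, fraction 0), the identity element for fmax, so the first real input always replaces it. Decoded:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const std::uint16_t bits = 0xfc00;  // fp16 -inf, as dup'd into v5..v8 above
        std::printf("sign=%u exp=0x%02x frac=0x%03x\n",
                    bits >> 15, (bits >> 10) & 0x1f, bits & 0x3ff);
        return 0;
    }
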
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 9a16b99a71..7add5feb1d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,28 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
void a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst
+struct a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
+ using Parent = DepthfirstStrategy<float, float>;
- constexpr static unsigned int pool_rows(void) { return 3; }
- constexpr static unsigned int pool_cols(void) { return 3; }
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
+ a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl;
-
- a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
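The header changes in this and the following .hpp diffs all follow one pattern: the per-kernel boilerplate (operand/return typedefs, a kern_type function-pointer alias, and constexpr accessors for pool, stride, and output geometry) is replaced by inheritance from a shared strategy base, with the geometry passed to the base constructor and the kernel function exposed through get_kernel(). The base classes are defined elsewhere in the tree; the following is only a rough sketch of the shape the diff implies, with names and signatures assumed from the diff rather than verified:

    // Assumed shape of the shared strategy interface the refactored headers
    // derive from; the real definitions live elsewhere in src/core/NEON and
    // may differ in detail.
    enum class PoolingType { AVERAGE, MAX };

    template <typename TInput, typename TOutput>
    class DepthfirstStrategy
    {
    public:
        // Matches the signature of the *_impl functions registered above.
        using KernelType = void (*)(unsigned int, const TInput *const *const,
                                    TOutput *const *const, bool,
                                    unsigned int, unsigned int,
                                    unsigned int, unsigned int);

        DepthfirstStrategy(unsigned int pool_rows, unsigned int pool_cols,
                           unsigned int stride_rows, unsigned int stride_cols,
                           unsigned int out_rows, unsigned int out_cols)
            : m_pool_rows(pool_rows), m_pool_cols(pool_cols),
              m_stride_rows(stride_rows), m_stride_cols(stride_cols),
              m_out_rows(out_rows), m_out_cols(out_cols)
        {
        }

        virtual ~DepthfirstStrategy() = default;
        virtual KernelType get_kernel() const = 0;

    private:
        unsigned int m_pool_rows, m_pool_cols;
        unsigned int m_stride_rows, m_stride_cols;
        unsigned int m_out_rows, m_out_cols;
    };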
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index ff8d7d8ba1..cf0047638e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -80,172 +82,173 @@ void a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x4, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr q7, [%x[args], %[offsetof_rescale]]\n"
+ "ldr x3, [%x[args], %[offsetof_n_channels]]\n"
+ "cmp x3, #0x4\n"
+ "mov x4, #0x0\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x5, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x6, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x4, #0x4\n"
- "ldp x7, x8, [x20, #0x0]\n"
- "ldp x17, x16, [x20, #0x10]\n"
- "ldp x15, x14, [x19, #0x0]\n"
- "ldp x13, x12, [x19, #0x10]\n"
- "ldp x11, x10, [x19, #0x20]\n"
- "ldp x9, x28, [x19, #0x30]\n"
- "ldp x27, x26, [x19, #0x40]\n"
- "ldp x25, x24, [x19, #0x50]\n"
- "ldp x23, x22, [x19, #0x60]\n"
- "ldp x21, x20, [x19, #0x70]\n"
- "ldr q8, [%x[args], %[offsetof_rescale]]\n"
+ "ldp x6, x7, [x21, #0x0]\n"
+ "ldp x8, x17, [x21, #0x10]\n"
+ "ldp x16, x15, [x20, #0x0]\n"
+ "ldp x14, x13, [x20, #0x10]\n"
+ "ldp x12, x11, [x20, #0x20]\n"
+ "ldp x10, x9, [x20, #0x30]\n"
+ "ldp x28, x27, [x20, #0x40]\n"
+ "ldp x26, x25, [x20, #0x50]\n"
+ "ldp x24, x23, [x20, #0x60]\n"
+ "ldp x22, x21, [x20, #0x70]\n"
"blt 3f\n"
- "ldr q7, [x10, x5]\n"
- "lsr x19, x4, #0x2\n"
- "ldr q6, [x9, x5]\n"
- "sub x4, x4, x19, LSL #2\n"
- "ldr q5, [x26, x5]\n"
- "subs x19, x19, #0x1\n"
- "ldr q4, [x25, x5]\n"
- "ldr q3, [x14, x5]\n"
- "ldr q2, [x13, x5]\n"
- "ldr q1, [x11, x5]\n"
- "ldr q0, [x27, x5]\n"
- "ldr q31, [x28, x5]\n"
- "ldr q30, [x24, x5]\n"
- "ldr q29, [x22, x5]\n"
- "ldr q28, [x21, x5]\n"
- "ldr q27, [x15, x5]\n"
- "ldr q26, [x12, x5]\n"
- "ldr q25, [x23, x5]\n"
- "ldr q24, [x20, x5]\n"
- "add x5, x5, #0x10\n"
+ "ldr q6, [x11, x4]\n"
+ "ldr q5, [x10, x4]\n"
+ "lsr x20, x3, #0x2\n"
+ "sub x3, x3, x20, LSL #2\n"
+ "ldr q4, [x27, x4]\n"
+ "ldr q3, [x26, x4]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q2, [x15, x4]\n"
+ "ldr q1, [x14, x4]\n"
+ "ldr q0, [x12, x4]\n"
+ "ldr q31, [x28, x4]\n"
+ "ldr q30, [x9, x4]\n"
+ "ldr q29, [x25, x4]\n"
+ "ldr q28, [x23, x4]\n"
+ "ldr q27, [x22, x4]\n"
+ "ldr q26, [x16, x4]\n"
+ "ldr q25, [x13, x4]\n"
+ "ldr q24, [x24, x4]\n"
+ "ldr q23, [x21, x4]\n"
+ "add x4, x4, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
- "fadd v17.4s, v7.4s, v6.4s\n"
- "ldr q7, [x10, x5]\n"
- "subs x19, x19, #0x1\n"
- "fadd v16.4s, v5.4s, v4.4s\n"
- "ldr q6, [x9, x5]\n"
- "fadd v18.4s, v3.4s, v2.4s\n"
- "ldr q5, [x26, x5]\n"
- "fadd v23.4s, v1.4s, v0.4s\n"
- "ldr q4, [x25, x5]\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "ldr q3, [x14, x5]\n"
- "fadd v17.4s, v17.4s, v16.4s\n"
- "ldr q2, [x13, x5]\n"
- "fadd v16.4s, v29.4s, v28.4s\n"
- "ldr q1, [x11, x5]\n"
- "fadd v19.4s, v27.4s, v23.4s\n"
- "ldr q0, [x27, x5]\n"
- "fadd v21.4s, v18.4s, v17.4s\n"
- "ldr q31, [x28, x5]\n"
- "fadd v20.4s, v16.4s, v17.4s\n"
- "ldr q30, [x24, x5]\n"
- "fadd v18.4s, v26.4s, v22.4s\n"
- "ldr q29, [x22, x5]\n"
- "fadd v17.4s, v25.4s, v23.4s\n"
- "ldr q28, [x21, x5]\n"
- "fadd v16.4s, v24.4s, v22.4s\n"
- "ldr q27, [x15, x5]\n"
+ "fadd v17.4s, v6.4s, v5.4s\n"
+ "ldr q6, [x11, x4]\n"
+ "ldr q5, [x10, x4]\n"
+ "fadd v16.4s, v4.4s, v3.4s\n"
+ "ldr q4, [x27, x4]\n"
+ "ldr q3, [x26, x4]\n"
+ "fadd v19.4s, v17.4s, v16.4s\n"
+ "fadd v18.4s, v2.4s, v1.4s\n"
+ "ldr q2, [x15, x4]\n"
+ "ldr q1, [x14, x4]\n"
+ "fadd v17.4s, v0.4s, v31.4s\n"
+ "fadd v22.4s, v30.4s, v29.4s\n"
+ "ldr q0, [x12, x4]\n"
+ "ldr q31, [x28, x4]\n"
+ "fadd v16.4s, v28.4s, v27.4s\n"
+ "fadd v21.4s, v18.4s, v19.4s\n"
+ "ldr q30, [x9, x4]\n"
+ "ldr q29, [x25, x4]\n"
+ "fadd v20.4s, v16.4s, v19.4s\n"
+ "fadd v19.4s, v26.4s, v17.4s\n"
+ "ldr q28, [x23, x4]\n"
+ "ldr q27, [x22, x4]\n"
+ "fadd v18.4s, v25.4s, v22.4s\n"
+ "fadd v17.4s, v24.4s, v17.4s\n"
+ "ldr q26, [x16, x4]\n"
+ "ldr q25, [x13, x4]\n"
+ "fadd v16.4s, v23.4s, v22.4s\n"
"fadd v19.4s, v21.4s, v19.4s\n"
- "ldr q26, [x12, x5]\n"
+ "ldr q24, [x24, x4]\n"
+ "ldr q23, [x21, x4]\n"
"fadd v18.4s, v21.4s, v18.4s\n"
- "ldr q25, [x23, x5]\n"
"fadd v17.4s, v17.4s, v20.4s\n"
- "ldr q24, [x20, x5]\n"
+ "fadd v16.4s, v16.4s, v20.4s\n"
+ "subs x20, x20, #0x1\n"
+ "fmul v19.4s, v19.4s, v7.s[0]\n"
+ "add x4, x4, #0x10\n"
+ "fmul v18.4s, v18.4s, v7.s[1]\n"
+ "fmul v17.4s, v17.4s, v7.s[2]\n"
+ "str q19, [x6, x5]\n"
+ "fmul v16.4s, v16.4s, v7.s[3]\n"
+ "str q18, [x7, x5]\n"
+ "str q17, [x8, x5]\n"
+ "str q16, [x17, x5]\n"
"add x5, x5, #0x10\n"
- "fadd v16.4s, v20.4s, v16.4s\n"
- "fmul v19.4s, v19.4s, v8.s[0]\n"
- "str q19, [x7, x6]\n"
- "fmul v18.4s, v18.4s, v8.s[1]\n"
- "fmul v17.4s, v17.4s, v8.s[2]\n"
- "str q18, [x8, x6]\n"
- "fmul v16.4s, v16.4s, v8.s[3]\n"
- "str q17, [x17, x6]\n"
- "str q16, [x16, x6]\n"
- "add x6, x6, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
- "fadd v17.4s, v7.4s, v6.4s\n"
- "fadd v16.4s, v5.4s, v4.4s\n"
- "fadd v18.4s, v3.4s, v2.4s\n"
- "fadd v23.4s, v1.4s, v0.4s\n"
- "fadd v17.4s, v17.4s, v16.4s\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "fadd v16.4s, v29.4s, v28.4s\n"
- "fadd v21.4s, v18.4s, v17.4s\n"
- "fadd v19.4s, v27.4s, v23.4s\n"
- "fadd v20.4s, v16.4s, v17.4s\n"
- "fadd v18.4s, v26.4s, v22.4s\n"
- "fadd v17.4s, v25.4s, v23.4s\n"
- "fadd v16.4s, v24.4s, v22.4s\n"
+ "fadd v17.4s, v6.4s, v5.4s\n"
+ "fadd v16.4s, v4.4s, v3.4s\n"
+ "fadd v19.4s, v17.4s, v16.4s\n"
+ "fadd v18.4s, v2.4s, v1.4s\n"
+ "fadd v17.4s, v0.4s, v31.4s\n"
+ "fadd v22.4s, v30.4s, v29.4s\n"
+ "fadd v16.4s, v28.4s, v27.4s\n"
+ "fadd v21.4s, v18.4s, v19.4s\n"
+ "fadd v20.4s, v16.4s, v19.4s\n"
+ "fadd v19.4s, v26.4s, v17.4s\n"
+ "fadd v18.4s, v25.4s, v22.4s\n"
+ "fadd v17.4s, v24.4s, v17.4s\n"
+ "fadd v16.4s, v23.4s, v22.4s\n"
"fadd v19.4s, v21.4s, v19.4s\n"
"fadd v18.4s, v21.4s, v18.4s\n"
"fadd v17.4s, v17.4s, v20.4s\n"
- "fadd v16.4s, v20.4s, v16.4s\n"
- "fmul v19.4s, v19.4s, v8.s[0]\n"
- "str q19, [x7, x6]\n"
- "fmul v18.4s, v18.4s, v8.s[1]\n"
- "fmul v17.4s, v17.4s, v8.s[2]\n"
- "str q18, [x8, x6]\n"
- "fmul v16.4s, v16.4s, v8.s[3]\n"
- "str q17, [x17, x6]\n"
- "str q16, [x16, x6]\n"
- "add x6, x6, #0x10\n"
- "cbz x4, 4f\n"
+ "fadd v16.4s, v16.4s, v20.4s\n"
+ "fmul v19.4s, v19.4s, v7.s[0]\n"
+ "str q19, [x6, x5]\n"
+ "fmul v18.4s, v18.4s, v7.s[1]\n"
+ "fmul v17.4s, v17.4s, v7.s[2]\n"
+ "str q18, [x7, x5]\n"
+ "fmul v16.4s, v16.4s, v7.s[3]\n"
+ "str q17, [x8, x5]\n"
+ "str q16, [x17, x5]\n"
+ "add x5, x5, #0x10\n"
+ "cbz x3, 4f\n"
"3:" // Oddments
- "ldr s7, [x10, x5]\n"
- "subs x4, x4, #0x1\n"
- "ldr s6, [x9, x5]\n"
- "fadd v17.4s, v7.4s, v6.4s\n"
- "ldr s5, [x26, x5]\n"
- "ldr s4, [x25, x5]\n"
- "fadd v16.4s, v5.4s, v4.4s\n"
- "ldr s3, [x14, x5]\n"
- "ldr s2, [x13, x5]\n"
- "fadd v17.4s, v17.4s, v16.4s\n"
- "ldr s1, [x11, x5]\n"
- "ldr s0, [x27, x5]\n"
- "fadd v18.4s, v3.4s, v2.4s\n"
- "ldr s31, [x28, x5]\n"
- "fadd v23.4s, v1.4s, v0.4s\n"
- "ldr s30, [x24, x5]\n"
- "fadd v21.4s, v18.4s, v17.4s\n"
- "ldr s29, [x22, x5]\n"
- "ldr s28, [x21, x5]\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "ldr s27, [x15, x5]\n"
- "ldr s26, [x12, x5]\n"
- "fadd v16.4s, v29.4s, v28.4s\n"
- "ldr s25, [x23, x5]\n"
- "fadd v20.4s, v16.4s, v17.4s\n"
- "ldr s24, [x20, x5]\n"
- "add x5, x5, #0x4\n"
- "fadd v19.4s, v27.4s, v23.4s\n"
- "fadd v18.4s, v26.4s, v22.4s\n"
- "fadd v17.4s, v25.4s, v23.4s\n"
- "fadd v16.4s, v24.4s, v22.4s\n"
- "fadd v19.4s, v21.4s, v19.4s\n"
- "fadd v18.4s, v21.4s, v18.4s\n"
+ "ldr s17, [x11, x4]\n"
+ "ldr s16, [x10, x4]\n"
+ "fadd v18.4s, v17.4s, v16.4s\n"
+ "subs x3, x3, #0x1\n"
+ "ldr s17, [x27, x4]\n"
+ "ldr s16, [x26, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v18.4s, v18.4s, v16.4s\n"
+ "ldr s17, [x15, x4]\n"
+ "ldr s16, [x14, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v23.4s, v16.4s, v18.4s\n"
+ "ldr s17, [x12, x4]\n"
+ "ldr s16, [x28, x4]\n"
+ "fadd v22.4s, v17.4s, v16.4s\n"
+ "ldr s17, [x9, x4]\n"
+ "ldr s16, [x25, x4]\n"
+ "fadd v21.4s, v17.4s, v16.4s\n"
+ "ldr s17, [x23, x4]\n"
+ "ldr s16, [x22, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v20.4s, v16.4s, v18.4s\n"
+ "ldr s17, [x16, x4]\n"
+ "ldr s16, [x13, x4]\n"
+ "fadd v19.4s, v17.4s, v22.4s\n"
+ "fadd v18.4s, v16.4s, v21.4s\n"
+ "ldr s17, [x24, x4]\n"
+ "ldr s16, [x21, x4]\n"
+ "fadd v17.4s, v17.4s, v22.4s\n"
+ "fadd v16.4s, v16.4s, v21.4s\n"
+ "fadd v19.4s, v23.4s, v19.4s\n"
+ "fadd v18.4s, v23.4s, v18.4s\n"
+ "add x4, x4, #0x4\n"
"fadd v17.4s, v17.4s, v20.4s\n"
- "fadd v16.4s, v20.4s, v16.4s\n"
- "fmul v19.4s, v19.4s, v8.s[0]\n"
- "str s19, [x7, x6]\n"
- "fmul v18.4s, v18.4s, v8.s[1]\n"
- "fmul v17.4s, v17.4s, v8.s[2]\n"
- "str s18, [x8, x6]\n"
- "fmul v16.4s, v16.4s, v8.s[3]\n"
- "str s17, [x17, x6]\n"
- "str s16, [x16, x6]\n"
- "add x6, x6, #0x4\n"
+ "fadd v16.4s, v16.4s, v20.4s\n"
+ "fmul v19.4s, v19.4s, v7.s[0]\n"
+ "fmul v18.4s, v18.4s, v7.s[1]\n"
+ "str s19, [x6, x5]\n"
+ "fmul v17.4s, v17.4s, v7.s[2]\n"
+ "fmul v16.4s, v16.4s, v7.s[3]\n"
+ "str s18, [x7, x5]\n"
+ "str s17, [x8, x5]\n"
+ "str s16, [x17, x5]\n"
+ "add x5, x5, #0x4\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
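The kernel above reads a 4x4 input patch through sixteen input pointers and produces a 2x2 block of outputs; each output is the sum of one of the four overlapping 3x3 windows, scaled by a per-output factor from rescale_vals (v7.s[0..3]), which lets the caller fold both the window size and any edge padding into a single multiply. The interleaved fadd chains are these window sums computed with shared partial sums. A scalar sketch of the arithmetic for one channel (names invented here):

    // Scalar reference for the 3x3, stride-1, 2x2-output average pooling
    // above: in is a 4x4 patch, rescale holds one factor per output
    // (typically 1/9, smaller at padded edges).
    void avg_3x3_s1_out2x2_ref(const float in[4][4], const float rescale[4],
                               float out[2][2])
    {
        for (int oi = 0; oi < 2; oi++)
        {
            for (int oj = 0; oj < 2; oj++)
            {
                float sum = 0.0f;
                for (int ki = 0; ki < 3; ki++)      // 3x3 window anchored at (oi, oj)
                    for (int kj = 0; kj < 3; kj++)
                        sum += in[oi + ki][oj + kj];
                out[oi][oj] = sum * rescale[2 * oi + oj]; // fmul by v7.s[0..3]
            }
        }
    }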
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp
index 4ef26318d4..26895e610d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_fp32_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-struct a64_fp32_nhwc_avg_generic_depthfirst
+struct a64_fp32_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_fp32_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<float, float>;
a64_fp32_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_fp32_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index 21f705451a..d236f07b1c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -41,260 +42,258 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<float>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
- "ld1r { v8.4s }, [%x[rescale_ptr]]\n"
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
+ "ld1r { v9.4s }, [%x[rescale_ptr]]\n"
"cmp %x[n_channels], #0x10\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v5.16b, #0x0\n"
- "movi v4.16b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd v23.4s, v3.4s, v2.4s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "ldr q3, [x23, x28]\n"
- "fadd v18.4s, v29.4s, v28.4s\n"
- "fadd v21.4s, v27.4s, v21.4s\n"
- "ldr q2, [x22, x28]\n"
- "fadd v17.4s, v26.4s, v17.4s\n"
- "ldr q1, [x21, x28]\n"
- "fadd v20.4s, v25.4s, v20.4s\n"
- "ldr q0, [x20, x28]\n"
- "fadd v16.4s, v24.4s, v16.4s\n"
- "ldr q31, [x23, x27]\n"
+ "fadd v23.4s, v4.4s, v3.4s\n"
+ "fadd v19.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v22.4s, v2.4s, v1.4s\n"
+ "ldr q2, [x21, x26]\n"
+ "fadd v18.4s, v27.4s, v21.4s\n"
+ "ldr q1, [x20, x26]\n"
+ "fadd v21.4s, v0.4s, v31.4s\n"
+ "ldr q0, [x21, x24]\n"
+ "fadd v17.4s, v26.4s, v20.4s\n"
+ "ldr q31, [x20, x24]\n"
+ "fadd v20.4s, v30.4s, v29.4s\n"
+ "ldr q30, [x21, x23]\n"
+ "fadd v16.4s, v25.4s, v24.4s\n"
+ "ldr q29, [x20, x23]\n"
"fadd v19.4s, v23.4s, v19.4s\n"
- "ldr q30, [x22, x27]\n"
"fadd v18.4s, v22.4s, v18.4s\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fadd v17.4s, v21.4s, v17.4s\n"
- "ldr q28, [x20, x27]\n"
"fadd v16.4s, v20.4s, v16.4s\n"
- "ldr q27, [x23, x26]\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
- "ldr q21, [x22, x26]\n"
- "fadd v6.4s, v6.4s, v18.4s\n"
- "ldr q26, [x21, x26]\n"
- "fadd v5.4s, v5.4s, v17.4s\n"
- "ldr q17, [x20, x26]\n"
- "fadd v4.4s, v4.4s, v16.4s\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd v8.4s, v8.4s, v19.4s\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "fadd v7.4s, v7.4s, v18.4s\n"
+ "fadd v6.4s, v6.4s, v17.4s\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "fadd v5.4s, v5.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd v23.4s, v3.4s, v2.4s\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "fadd v18.4s, v29.4s, v28.4s\n"
- "fadd v21.4s, v27.4s, v21.4s\n"
- "fadd v17.4s, v26.4s, v17.4s\n"
- "fadd v20.4s, v25.4s, v20.4s\n"
- "fadd v16.4s, v24.4s, v16.4s\n"
+ "fadd v23.4s, v4.4s, v3.4s\n"
+ "fadd v19.4s, v28.4s, v22.4s\n"
+ "fadd v22.4s, v2.4s, v1.4s\n"
+ "fadd v18.4s, v27.4s, v21.4s\n"
+ "fadd v21.4s, v0.4s, v31.4s\n"
+ "fadd v17.4s, v26.4s, v20.4s\n"
+ "fadd v20.4s, v30.4s, v29.4s\n"
+ "fadd v16.4s, v25.4s, v24.4s\n"
"fadd v19.4s, v23.4s, v19.4s\n"
"fadd v18.4s, v22.4s, v18.4s\n"
"fadd v17.4s, v21.4s, v17.4s\n"
"fadd v16.4s, v20.4s, v16.4s\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
- "fadd v6.4s, v6.4s, v18.4s\n"
- "fadd v5.4s, v5.4s, v17.4s\n"
- "fadd v4.4s, v4.4s, v16.4s\n"
+ "fadd v8.4s, v8.4s, v19.4s\n"
+ "fadd v7.4s, v7.4s, v18.4s\n"
+ "fadd v6.4s, v6.4s, v17.4s\n"
+ "fadd v5.4s, v5.4s, v16.4s\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fadd v7.4s, v7.4s, v3.4s\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "fadd v6.4s, v6.4s, v31.4s\n"
- "ldr q25, [x23, x25]\n"
- "fadd v5.4s, v5.4s, v27.4s\n"
- "fadd v4.4s, v4.4s, v25.4s\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fadd v7.4s, v7.4s, v17.4s\n"
+ "fadd v6.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x20, x23]\n"
+ "fadd v5.4s, v5.4s, v16.4s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul v7.4s, v7.4s, v8.4s\n"
- "str q7, [%x[outptr], x28]\n"
- "fmul v6.4s, v6.4s, v8.4s\n"
- "add x28, x28, #0x40\n"
- "fmul v5.4s, v5.4s, v8.4s\n"
- "str q6, [%x[outptr], x27]\n"
- "fmul v4.4s, v4.4s, v8.4s\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"cmp %x[n_channels], #0x10\n"
+ "fmul v8.4s, v8.4s, v9.4s\n"
+ "fmul v7.4s, v7.4s, v9.4s\n"
+ "fmul v6.4s, v6.4s, v9.4s\n"
+ "fmul v5.4s, v5.4s, v9.4s\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 25f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x4\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd v23.4s, v3.4s, v2.4s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd v23.4s, v3.4s, v2.4s\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fadd v7.4s, v7.4s, v3.4s\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul v7.4s, v7.4s, v8.4s\n"
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
"cmp %x[n_channels], #0x4\n"
+ "fmul v8.4s, v8.4s, v9.4s\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 25f\n"
"14:" // Oddments
- "movi v7.16b, #0x0\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 18f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 18f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #1, 16f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 17f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 1: End
- "fadd v23.4s, v3.4s, v2.4s\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
+ "subs x25, x25, #0x1\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"bgt 15b\n"
"18:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 22f\n"
"19:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #1, 20f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"b 21f\n"
"20:" // Oddments: Single input loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 21f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"21:" // Oddments: Single input loop: Load: Bit 1: End
- "fadd v7.4s, v7.4s, v3.4s\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.4s, v8.4s, v4.4s\n"
"bgt 19b\n"
"22:" // Oddments: Single input loop: End
- "fmul v7.4s, v7.4s, v8.4s\n"
+ "fmul v8.4s, v8.4s, v9.4s\n"
"tbz %x[n_channels], #1, 23f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"b 24f\n"
"23:" // Oddments: Store: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"24:" // Oddments: Store: Bit 1: End
-
"25:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
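Like the max variant earlier, the generic average kernel above keeps four q-register accumulators per 16-channel strip, folds inputs in four pointers at a time plus an oddment tail, and finishes with one fmul by the broadcast reciprocal (rescale_value = 1/window_cells, held in v9). A scalar equivalent, with invented names:

    #include <cstdint>

    // Scalar reference for the generic depthfirst average pooling above:
    // sum every valid cell per channel, then scale by 1/window_cells.
    void avg_pool_generic_ref(uint64_t window_cells, uint64_t n_valid_cells,
                              uint64_t n_channels, const float *const *inptrs,
                              float *outptr)
    {
        const float rescale = 1.0f / static_cast<float>(window_cells);
        for (uint64_t c = 0; c < n_channels; c++)
        {
            float sum = 0.0f;
            for (uint64_t i = 0; i < n_valid_cells; i++)
                sum += inptrs[i][c];    // running fadd accumulators
            outptr[c] = sum * rescale;  // single fmul by v9
        }
    }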
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 9a22adf6f4..2f72b59d70 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,28 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst
+struct a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ using Parent = DepthfirstStrategy<float, float>;
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
+ a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index ea7e2195d1..f4202de1ed 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -61,114 +63,115 @@ void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
- "mov x14, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x15, #0x4\n"
- "ldp x12, x11, [x20, #0x0]\n"
- "ldp x10, x9, [x20, #0x10]\n"
- "ldp x28, x27, [x19, #0x0]\n"
- "ldp x26, x25, [x19, #0x10]\n"
- "ldp x24, x23, [x19, #0x20]\n"
- "ldp x22, x21, [x19, #0x30]\n"
- "ldr x20, [x19, #0x40]\n"
+ "ldr x16, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "cmp x16, #0x4\n"
+ "mov x15, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [x21, #0x10]\n"
+ "ldp x9, x28, [x20, #0x0]\n"
+ "ldp x27, x26, [x20, #0x10]\n"
+ "ldp x25, x24, [x20, #0x20]\n"
+ "ldp x23, x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x40]\n"
"blt 3f\n"
- "ldr q30, [x27, x14]\n"
- "lsr x19, x15, #0x2\n"
- "ldr q29, [x24, x14]\n"
- "sub x15, x15, x19, LSL #2\n"
- "ldr q28, [x21, x14]\n"
- "subs x19, x19, #0x1\n"
- "ldr q27, [x25, x14]\n"
- "ldr q26, [x28, x14]\n"
- "ldr q25, [x23, x14]\n"
- "ldr q24, [x26, x14]\n"
- "ldr q23, [x22, x14]\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q30, [x28, x15]\n"
+ "ldr q29, [x25, x15]\n"
+ "lsr x20, x16, #0x2\n"
+ "sub x16, x16, x20, LSL #2\n"
+ "ldr q28, [x22, x15]\n"
+ "ldr q27, [x26, x15]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q26, [x9, x15]\n"
+ "ldr q25, [x27, x15]\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "ldr q22, [x21, x15]\n"
+ "add x15, x15, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
"fmax v21.4s, v30.4s, v29.4s\n"
- "ldr q30, [x27, x14]\n"
- "subs x19, x19, #0x1\n"
+ "ldr q30, [x28, x15]\n"
"fmax v20.4s, v29.4s, v28.4s\n"
- "ldr q29, [x24, x14]\n"
+ "ldr q29, [x25, x15]\n"
+ "ldr q28, [x22, x15]\n"
"fmax v19.4s, v27.4s, v26.4s\n"
- "ldr q28, [x21, x14]\n"
+ "ldr q26, [x9, x15]\n"
"fmax v18.4s, v25.4s, v24.4s\n"
- "ldr q26, [x28, x14]\n"
- "fmax v17.4s, v23.4s, v27.4s\n"
- "ldr q27, [x25, x14]\n"
- "fmax v16.4s, v25.4s, v22.4s\n"
- "ldr q25, [x23, x14]\n"
+ "ldr q25, [x27, x15]\n"
+ "fmax v17.4s, v27.4s, v23.4s\n"
+ "ldr q27, [x26, x15]\n"
+ "fmax v16.4s, v24.4s, v22.4s\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "subs x20, x20, #0x1\n"
"fmax v19.4s, v21.4s, v19.4s\n"
- "ldr q24, [x26, x14]\n"
- "fmax v18.4s, v21.4s, v18.4s\n"
- "ldr q23, [x22, x14]\n"
- "fmax v17.4s, v20.4s, v17.4s\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q22, [x21, x15]\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "add x15, x15, #0x10\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "str q19, [x12, x13]\n"
- "str q18, [x11, x13]\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
+ "str q19, [x14, x12]\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"fmax v21.4s, v30.4s, v29.4s\n"
"fmax v20.4s, v29.4s, v28.4s\n"
- "fmax v19.4s, v27.4s, v26.4s\n"
+ "fmax v16.4s, v27.4s, v26.4s\n"
"fmax v18.4s, v25.4s, v24.4s\n"
- "fmax v17.4s, v23.4s, v27.4s\n"
- "fmax v16.4s, v25.4s, v22.4s\n"
- "fmax v19.4s, v21.4s, v19.4s\n"
- "str q19, [x12, x13]\n"
- "fmax v18.4s, v21.4s, v18.4s\n"
- "fmax v17.4s, v20.4s, v17.4s\n"
- "str q18, [x11, x13]\n"
- "fmax v16.4s, v20.4s, v16.4s\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
- "cbz x15, 4f\n"
+ "fmax v17.4s, v27.4s, v23.4s\n"
+ "fmax v19.4s, v24.4s, v22.4s\n"
+ "fmax v16.4s, v21.4s, v16.4s\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "str q16, [x14, x12]\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v16.4s, v20.4s, v19.4s\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
+ "cbz x16, 4f\n"
"3:" // Oddments
- "ldr s30, [x27, x14]\n"
- "subs x15, x15, #0x1\n"
- "ldr s29, [x24, x14]\n"
- "fmax v21.4s, v30.4s, v29.4s\n"
- "ldr s28, [x21, x14]\n"
- "ldr s27, [x25, x14]\n"
- "fmax v20.4s, v29.4s, v28.4s\n"
- "ldr s26, [x28, x14]\n"
- "ldr s25, [x23, x14]\n"
- "fmax v19.4s, v27.4s, v26.4s\n"
- "ldr s24, [x26, x14]\n"
- "ldr s23, [x22, x14]\n"
- "fmax v19.4s, v21.4s, v19.4s\n"
- "ldr s22, [x20, x14]\n"
- "add x14, x14, #0x4\n"
- "fmax v18.4s, v25.4s, v24.4s\n"
- "str s19, [x12, x13]\n"
- "fmax v17.4s, v23.4s, v27.4s\n"
- "fmax v16.4s, v25.4s, v22.4s\n"
- "fmax v18.4s, v21.4s, v18.4s\n"
- "str s18, [x11, x13]\n"
- "fmax v17.4s, v20.4s, v17.4s\n"
- "fmax v16.4s, v20.4s, v16.4s\n"
- "str s17, [x10, x13]\n"
- "str s16, [x9, x13]\n"
- "add x13, x13, #0x4\n"
+ "ldr s16, [x28, x15]\n"
+ "ldr s17, [x25, x15]\n"
+ "fmax v23.4s, v16.4s, v17.4s\n"
+ "subs x16, x16, #0x1\n"
+ "ldr s16, [x22, x15]\n"
+ "ldr s22, [x26, x15]\n"
+ "fmax v21.4s, v17.4s, v16.4s\n"
+ "ldr s16, [x9, x15]\n"
+ "ldr s17, [x27, x15]\n"
+ "fmax v16.4s, v22.4s, v16.4s\n"
+ "fmax v20.4s, v23.4s, v16.4s\n"
+ "ldr s19, [x24, x15]\n"
+ "ldr s16, [x23, x15]\n"
+ "fmax v18.4s, v17.4s, v19.4s\n"
+ "fmax v17.4s, v22.4s, v16.4s\n"
+ "ldr s16, [x21, x15]\n"
+ "fmax v16.4s, v19.4s, v16.4s\n"
+ "add x15, x15, #0x4\n"
+ "fmax v18.4s, v18.4s, v23.4s\n"
+ "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmax v16.4s, v21.4s, v16.4s\n"
+ "str s20, [x14, x12]\n"
+ "str s18, [x13, x12]\n"
+ "str s17, [x11, x12]\n"
+ "str s16, [x10, x12]\n"
+ "add x12, x12, #0x4\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
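Here nine input pointers cover a 3x3 patch and feed four overlapping 2x2 max windows. The kernel first forms the pairwise maxima that two windows share (v21 and v20 above) and then combines them, so each output costs fewer fmax operations than a naive four-way reduction. A scalar sketch for one channel:

    #include <algorithm>

    // Scalar reference for the 2x2, stride-1, 2x2-output max pooling above:
    // in is a 3x3 patch; adjacent windows share a row/column of maxima.
    void max_2x2_s1_out2x2_ref(const float in[3][3], float out[2][2])
    {
        for (int oi = 0; oi < 2; oi++)
            for (int oj = 0; oj < 2; oj++)
                out[oi][oj] = std::max(std::max(in[oi][oj],     in[oi][oj + 1]),
                                       std::max(in[oi + 1][oj], in[oi + 1][oj + 1]));
    }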
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp
index b20ffc20cf..7577b31d7d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_fp32_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-struct a64_fp32_nhwc_max_generic_depthfirst
+struct a64_fp32_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_fp32_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<float, float>;
a64_fp32_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_fp32_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
index e0acb7ac02..f4706635dc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -39,256 +40,254 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x10\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"mov w20, #0xff800000\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.4s, w20\n"
"dup v7.4s, w20\n"
- "mov x19, %x[inptrs]\n"
"dup v6.4s, w20\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"dup v5.4s, w20\n"
- "dup v4.4s, w20\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fmax v23.4s, v3.4s, v2.4s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax v22.4s, v31.4s, v30.4s\n"
- "ldr q3, [x23, x28]\n"
- "fmax v18.4s, v29.4s, v28.4s\n"
- "fmax v21.4s, v27.4s, v21.4s\n"
- "ldr q2, [x22, x28]\n"
- "fmax v17.4s, v26.4s, v17.4s\n"
- "ldr q1, [x21, x28]\n"
- "fmax v20.4s, v25.4s, v20.4s\n"
- "ldr q0, [x20, x28]\n"
- "fmax v16.4s, v24.4s, v16.4s\n"
- "ldr q31, [x23, x27]\n"
+ "fmax v23.4s, v4.4s, v3.4s\n"
+ "fmax v19.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v22.4s, v2.4s, v1.4s\n"
+ "ldr q2, [x21, x26]\n"
+ "fmax v18.4s, v27.4s, v21.4s\n"
+ "ldr q1, [x20, x26]\n"
+ "fmax v21.4s, v0.4s, v31.4s\n"
+ "ldr q0, [x21, x24]\n"
+ "fmax v17.4s, v26.4s, v20.4s\n"
+ "ldr q31, [x20, x24]\n"
+ "fmax v20.4s, v30.4s, v29.4s\n"
+ "ldr q30, [x21, x23]\n"
+ "fmax v16.4s, v25.4s, v24.4s\n"
+ "ldr q29, [x20, x23]\n"
"fmax v19.4s, v23.4s, v19.4s\n"
- "ldr q30, [x22, x27]\n"
"fmax v18.4s, v22.4s, v18.4s\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fmax v17.4s, v21.4s, v17.4s\n"
- "ldr q28, [x20, x27]\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "ldr q27, [x23, x26]\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
- "ldr q21, [x22, x26]\n"
- "fmax v6.4s, v6.4s, v18.4s\n"
- "ldr q26, [x21, x26]\n"
- "fmax v5.4s, v5.4s, v17.4s\n"
- "ldr q17, [x20, x26]\n"
- "fmax v4.4s, v4.4s, v16.4s\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax v8.4s, v8.4s, v19.4s\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "fmax v7.4s, v7.4s, v18.4s\n"
+ "fmax v6.4s, v6.4s, v17.4s\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "fmax v5.4s, v5.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fmax v23.4s, v3.4s, v2.4s\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "fmax v22.4s, v31.4s, v30.4s\n"
- "fmax v18.4s, v29.4s, v28.4s\n"
- "fmax v21.4s, v27.4s, v21.4s\n"
- "fmax v17.4s, v26.4s, v17.4s\n"
- "fmax v20.4s, v25.4s, v20.4s\n"
- "fmax v16.4s, v24.4s, v16.4s\n"
+ "fmax v23.4s, v4.4s, v3.4s\n"
+ "fmax v19.4s, v28.4s, v22.4s\n"
+ "fmax v22.4s, v2.4s, v1.4s\n"
+ "fmax v18.4s, v27.4s, v21.4s\n"
+ "fmax v21.4s, v0.4s, v31.4s\n"
+ "fmax v17.4s, v26.4s, v20.4s\n"
+ "fmax v20.4s, v30.4s, v29.4s\n"
+ "fmax v16.4s, v25.4s, v24.4s\n"
"fmax v19.4s, v23.4s, v19.4s\n"
"fmax v18.4s, v22.4s, v18.4s\n"
"fmax v17.4s, v21.4s, v17.4s\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
- "fmax v6.4s, v6.4s, v18.4s\n"
- "fmax v5.4s, v5.4s, v17.4s\n"
- "fmax v4.4s, v4.4s, v16.4s\n"
+ "fmax v8.4s, v8.4s, v19.4s\n"
+ "fmax v7.4s, v7.4s, v18.4s\n"
+ "fmax v6.4s, v6.4s, v17.4s\n"
+ "fmax v5.4s, v5.4s, v16.4s\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fmax v7.4s, v7.4s, v3.4s\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "fmax v6.4s, v6.4s, v31.4s\n"
- "ldr q25, [x23, x25]\n"
- "fmax v5.4s, v5.4s, v27.4s\n"
- "fmax v4.4s, v4.4s, v25.4s\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fmax v7.4s, v7.4s, v17.4s\n"
+ "fmax v6.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x20, x23]\n"
+ "fmax v5.4s, v5.4s, v16.4s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x27, x27, #0x40\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 25f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x4\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "mov w19, #0xff800000\n"
- "dup v7.4s, w19\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "mov w20, #0xff800000\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.4s, w20\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fmax v23.4s, v3.4s, v2.4s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fmax v23.4s, v3.4s, v2.4s\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fmax v7.4s, v7.4s, v3.4s\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
"cmp %x[n_channels], #0x4\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 25f\n"
"14:" // Oddments
- "add %x[outptr], %x[outptr], x28\n"
- "mov w19, #0xff800000\n"
- "dup v7.4s, w19\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 18f\n"
+ "mov w20, #0xff800000\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.4s, w20\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 18f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #1, 16f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 17f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 1: End
- "fmax v23.4s, v3.4s, v2.4s\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
+ "subs x25, x25, #0x1\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"bgt 15b\n"
"18:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 22f\n"
"19:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #1, 20f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"b 21f\n"
"20:" // Oddments: Single input loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 21f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"21:" // Oddments: Single input loop: Load: Bit 1: End
- "fmax v7.4s, v7.4s, v3.4s\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.4s, v8.4s, v4.4s\n"
"bgt 19b\n"
"22:" // Oddments: Single input loop: End
"tbz %x[n_channels], #1, 23f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"b 24f\n"
"23:" // Oddments: Store: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"24:" // Oddments: Store: Bit 1: End
-
"25:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
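One detail worth calling out in the max kernels above: the accumulators are seeded via "mov w20, #0xff800000" and dup, and 0xff800000 is exactly the IEEE-754 bit pattern of -infinity in fp32, i.e. the identity element for fmax. The equivalent in C++:

    #include <cstdint>
    #include <cstring>

    // 0xff800000 reinterpreted as a float is -infinity, the fmax identity.
    float max_identity()
    {
        const uint32_t bits = 0xff800000u;
        float f;
        std::memcpy(&f, &bits, sizeof(f)); // same effect as dup v8.4s, w20
        return f;                          // == -infinity
    }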
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp
index df66ab7a2c..de94ec0ec3 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_s8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-struct a64_s8_nhwc_avg_generic_depthfirst
+struct a64_s8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_s8_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
a64_s8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_s8_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
index 405ae66755..5d082102b3 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
@@ -83,27 +84,28 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
shift_value--;
f_rescale_value *= 2.0f;
}
- int64_t large_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
- if (large_rescale_value == (1ll << 31))
+
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- large_rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
- rescale_value = static_cast<int32_t>(large_rescale_value);
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
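The rewritten block above normalises the average's reciprocal into a Q31 multiplier plus a shift: f_rescale_value is repeatedly doubled while shift_value is decremented, then rounded to rescale_value = round(f * 2^31), with a guard against the rounding overflowing to exactly 2^31. The assembly applies it further below with sqdmulh (saturating doubling multiply returning the high 32 bits, so x*m*2 >> 32, which approximates x*f) followed by srshl with the shift value, which for a negative shift performs a rounding arithmetic right shift. A scalar sketch of that requantisation step, ignoring sqdmulh's single saturation corner case (both operands INT32_MIN):

    #include <cstdint>

    // Scalar model of the sqdmulh + srshl requantisation used below.
    // mul is the Q31 multiplier (rescale_value); shift is shift_value,
    // typically non-positive.
    int32_t requantize(int32_t acc, int32_t mul, int32_t shift)
    {
        // sqdmulh: high half of the doubled product, (2 * acc * mul) >> 32.
        int64_t prod = static_cast<int64_t>(acc) * static_cast<int64_t>(mul);
        int32_t hi   = static_cast<int32_t>((2 * prod) >> 32);

        if (shift >= 0)
            return hi << shift; // srshl shifts left for non-negative elements

        // srshl with a negative element: rounding shift right.
        int32_t right = -shift;
        return static_cast<int32_t>(
            (static_cast<int64_t>(hi) + (1ll << (right - 1))) >> right);
    }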
__asm__ __volatile__(
- "mov x26, #0x0\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x25, #0x20\n" // cntb _, ALL, #2
+ "mov x24, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -118,43 +120,43 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"movi v2.4s, #0x0\n"
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ldr q29, [x21, x25]\n"
- "ldr q28, [x20, x25]\n"
- "ldr q27, [x21, x24]\n"
- "ldr q26, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"saddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
"saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"saddl v21.8h, v29.8b, v28.8b\n"
- "subs x22, x22, #0x1\n"
"saddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q30, [x20, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"saddl v19.8h, v27.8b, v26.8b\n"
- "ldr q29, [x21, x25]\n"
"saddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q28, [x20, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"saddl v17.8h, v25.8b, v24.8b\n"
- "ldr q27, [x21, x24]\n"
"saddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q26, [x20, x24]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
+ "subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v23.4h\n"
- "ldr q25, [x21, x23]\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q24, [x20, x23]\n"
"saddw v13.4s, v13.4s, v22.4h\n"
"saddw2 v12.4s, v12.4s, v22.8h\n"
+ "add x22, x22, #0x10\n"
"saddw v11.4s, v11.4s, v21.4h\n"
"saddw2 v10.4s, v10.4s, v21.8h\n"
"saddw v9.4s, v9.4s, v20.4h\n"
@@ -194,23 +196,23 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"saddw v1.4s, v1.4s, v16.4h\n"
"saddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "sxtl v23.8h, v31.8b\n"
- "ldr q29, [x21, x25]\n"
- "sxtl2 v22.8h, v31.16b\n"
- "ldr q27, [x21, x24]\n"
- "ldr q25, [x21, x23]\n"
- "sxtl v21.8h, v29.8b\n"
- "sxtl2 v20.8h, v29.16b\n"
- "sxtl v19.8h, v27.8b\n"
- "sxtl2 v18.8h, v27.16b\n"
- "sxtl v17.8h, v25.8b\n"
- "sxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v23.8h, v16.8b\n"
+ "sxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "sxtl v21.8h, v16.8b\n"
+ "sxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "sxtl v19.8h, v17.8b\n"
+ "sxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
"saddw v13.4s, v13.4s, v22.4h\n"
@@ -229,195 +231,195 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"saddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "movi v19.4s, #0x7f\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[shift_ptr]]\n"
- "not v16.16b, v19.16b\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
"cmp %x[n_channels], #0x40\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
- "sqdmulh v11.4s, v11.4s, v18.4s\n"
- "sqdmulh v10.4s, v10.4s, v18.4s\n"
- "sqdmulh v9.4s, v9.4s, v18.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
- "srshl v11.4s, v11.4s, v17.4s\n"
- "srshl v10.4s, v10.4s, v17.4s\n"
- "srshl v9.4s, v9.4s, v17.4s\n"
- "sqdmulh v8.4s, v8.4s, v18.4s\n"
- "sqdmulh v7.4s, v7.4s, v18.4s\n"
- "sqdmulh v6.4s, v6.4s, v18.4s\n"
- "sqdmulh v5.4s, v5.4s, v18.4s\n"
- "srshl v8.4s, v8.4s, v17.4s\n"
- "srshl v7.4s, v7.4s, v17.4s\n"
- "srshl v6.4s, v6.4s, v17.4s\n"
- "srshl v5.4s, v5.4s, v17.4s\n"
- "sqdmulh v4.4s, v4.4s, v18.4s\n"
- "sqdmulh v3.4s, v3.4s, v18.4s\n"
- "sqdmulh v2.4s, v2.4s, v18.4s\n"
- "sqdmulh v1.4s, v1.4s, v18.4s\n"
- "srshl v4.4s, v4.4s, v17.4s\n"
- "srshl v3.4s, v3.4s, v17.4s\n"
- "srshl v2.4s, v2.4s, v17.4s\n"
- "srshl v1.4s, v1.4s, v17.4s\n"
- "sqdmulh v0.4s, v0.4s, v18.4s\n"
+ "sqdmulh v11.4s, v11.4s, v17.4s\n"
+ "sqdmulh v10.4s, v10.4s, v17.4s\n"
+ "sqdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqdmulh v8.4s, v8.4s, v17.4s\n"
+ "sqdmulh v7.4s, v7.4s, v17.4s\n"
+ "sqdmulh v6.4s, v6.4s, v17.4s\n"
+ "sqdmulh v5.4s, v5.4s, v17.4s\n"
+ "sqdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqdmulh v3.4s, v3.4s, v17.4s\n"
+ "sqdmulh v2.4s, v2.4s, v17.4s\n"
+ "sqdmulh v1.4s, v1.4s, v17.4s\n"
+ "sqdmulh v0.4s, v0.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "srshl v10.4s, v10.4s, v16.4s\n"
+ "srshl v9.4s, v9.4s, v16.4s\n"
+ "srshl v8.4s, v8.4s, v16.4s\n"
+ "srshl v7.4s, v7.4s, v16.4s\n"
+ "srshl v6.4s, v6.4s, v16.4s\n"
+ "srshl v5.4s, v5.4s, v16.4s\n"
+ "srshl v4.4s, v4.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v16.4s\n"
+ "srshl v2.4s, v2.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v16.4s\n"
+ "srshl v0.4s, v0.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
- "srshl v0.4s, v0.4s, v17.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
"smax v11.4s, v11.4s, v16.4s\n"
"smax v10.4s, v10.4s, v16.4s\n"
- "smin v12.4s, v12.4s, v19.4s\n"
- "smin v11.4s, v11.4s, v19.4s\n"
- "smin v10.4s, v10.4s, v19.4s\n"
"smax v9.4s, v9.4s, v16.4s\n"
"smax v8.4s, v8.4s, v16.4s\n"
"smax v7.4s, v7.4s, v16.4s\n"
- "smin v9.4s, v9.4s, v19.4s\n"
- "smin v8.4s, v8.4s, v19.4s\n"
- "smin v7.4s, v7.4s, v19.4s\n"
"smax v6.4s, v6.4s, v16.4s\n"
"smax v5.4s, v5.4s, v16.4s\n"
"smax v4.4s, v4.4s, v16.4s\n"
- "smin v6.4s, v6.4s, v19.4s\n"
- "smin v5.4s, v5.4s, v19.4s\n"
- "smin v4.4s, v4.4s, v19.4s\n"
"smax v3.4s, v3.4s, v16.4s\n"
"smax v2.4s, v2.4s, v16.4s\n"
"smax v1.4s, v1.4s, v16.4s\n"
- "smin v3.4s, v3.4s, v19.4s\n"
- "smin v2.4s, v2.4s, v19.4s\n"
- "smin v1.4s, v1.4s, v19.4s\n"
"smax v0.4s, v0.4s, v16.4s\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v8.4s, v8.4s, v17.4s\n"
+ "smin v7.4s, v7.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v17.4s\n"
+ "smin v5.4s, v5.4s, v17.4s\n"
+ "smin v4.4s, v4.4s, v17.4s\n"
+ "smin v3.4s, v3.4s, v17.4s\n"
+ "smin v2.4s, v2.4s, v17.4s\n"
+ "smin v1.4s, v1.4s, v17.4s\n"
+ "smin v0.4s, v0.4s, v17.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "smin v0.4s, v0.4s, v19.4s\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
- "uzp1 v21.16b, v9.16b, v8.16b\n"
- "uzp1 v20.16b, v7.16b, v6.16b\n"
+ "uzp1 v18.16b, v9.16b, v8.16b\n"
+ "uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
- "uzp1 v19.16b, v3.16b, v2.16b\n"
- "uzp1 v18.16b, v1.16b, v0.16b\n"
+ "uzp1 v20.16b, v3.16b, v2.16b\n"
+ "uzp1 v19.16b, v1.16b, v0.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x25]\n"
"add x25, x25, #0x40\n"
- "str q17, [%x[outptr], x24]\n"
+ "str q16, [%x[outptr], x24]\n"
"add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "saddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "subs x22, x22, #0x1\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q30, [x20, x26]\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "movi v19.4s, #0x7f\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[shift_ptr]]\n"
- "not v16.16b, v19.16b\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
"cmp %x[n_channels], #0x10\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smin v12.4s, v12.4s, v19.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "add x26, x26, #0x10\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v15.4s, #0x0\n"
- "add %x[outptr], %x[outptr], x26\n"
"movi v14.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 24f\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
- "add x21, x21, x26\n"
- "add x20, x20, x26\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -478,21 +480,21 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"ldr b31, [x21], #0x1\n"
"ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "saddl v23.8h, v31.8b, v30.8b\n"
- "subs x22, x22, #0x1\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldr x21, [x19], #0x8\n"
- "add x21, x21, x26\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -538,38 +540,38 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 33f\n"
"ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "sxtl v23.8h, v31.8b\n"
- "subs x20, x20, #0x1\n"
- "sxtl2 v22.8h, v31.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "sxtl v17.8h, v31.8b\n"
+ "sxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "movi v19.4s, #0x7f\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
- "not v16.16b, v19.16b\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[shift_ptr]]\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smin v12.4s, v12.4s, v19.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -615,12 +617,10 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
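The C++ change at the top of this file widens the rescale intermediate to 64 bits: round(f_rescale_value * 2^31) can land exactly on 2^31, which does not fit in an int32_t, so the saturation case must be detected before narrowing. A standalone scalar sketch of the computation, with the normalisation-loop condition reconstructed from context (names mirror the patch, but this is not the library function itself):

    #include <cmath>
    #include <cstdint>

    // Sketch of the Q31 rescale setup feeding the sqdmulh/srshl sequence
    // in the assembly above. Assumes n_cells >= 1.
    static void compute_rescale(uint64_t n_cells, int32_t &rescale_value, int32_t &shift_value)
    {
      shift_value = 0;
      float f_rescale_value = 1.0f / static_cast<float>(n_cells);

      // Normalise the multiplier into [0.5, 1.0) so it uses the full Q31 range.
      while (f_rescale_value < 0.5f)
      {
        shift_value--;
        f_rescale_value *= 2.0f;
      }

      // Round in a 64-bit temporary: a multiplier that rounds up to 1.0
      // produces exactly 1 << 31 here, one past INT32_MAX.
      int64_t long_rescale_value =
          static_cast<int64_t>(std::round(f_rescale_value * static_cast<float>(1ll << 31)));
      if (long_rescale_value == (1ll << 31))
      {
        shift_value++;
        long_rescale_value >>= 1;
      }
      rescale_value = static_cast<int32_t>(long_rescale_value);
    }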
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 7829ecc0e9..f8f1134866 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,28 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst
+struct a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ using Parent = DepthfirstStrategy<int8_t, int8_t>;
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+ a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+ Parent::KernelType get_kernel(void) const { return a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 298db96861..7e62ac1afc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -61,114 +63,115 @@ void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
- "mov x14, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x15, #0x10\n"
- "ldp x12, x11, [x20, #0x0]\n"
- "ldp x10, x9, [x20, #0x10]\n"
- "ldp x28, x27, [x19, #0x0]\n"
- "ldp x26, x25, [x19, #0x10]\n"
- "ldp x24, x23, [x19, #0x20]\n"
- "ldp x22, x21, [x19, #0x30]\n"
- "ldr x20, [x19, #0x40]\n"
+ "ldr x16, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "cmp x16, #0x10\n"
+ "mov x15, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [x21, #0x10]\n"
+ "ldp x9, x28, [x20, #0x0]\n"
+ "ldp x27, x26, [x20, #0x10]\n"
+ "ldp x25, x24, [x20, #0x20]\n"
+ "ldp x23, x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x40]\n"
"blt 3f\n"
- "ldr q30, [x27, x14]\n"
- "lsr x19, x15, #0x4\n"
- "ldr q29, [x24, x14]\n"
- "sub x15, x15, x19, LSL #4\n"
- "ldr q28, [x21, x14]\n"
- "subs x19, x19, #0x1\n"
- "ldr q27, [x25, x14]\n"
- "ldr q26, [x28, x14]\n"
- "ldr q25, [x23, x14]\n"
- "ldr q24, [x26, x14]\n"
- "ldr q23, [x22, x14]\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q30, [x28, x15]\n"
+ "ldr q29, [x25, x15]\n"
+ "lsr x20, x16, #0x4\n"
+ "sub x16, x16, x20, LSL #4\n"
+ "ldr q28, [x22, x15]\n"
+ "ldr q27, [x26, x15]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q26, [x9, x15]\n"
+ "ldr q25, [x27, x15]\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "ldr q22, [x21, x15]\n"
+ "add x15, x15, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
"smax v21.16b, v30.16b, v29.16b\n"
- "ldr q30, [x27, x14]\n"
- "subs x19, x19, #0x1\n"
+ "ldr q30, [x28, x15]\n"
"smax v20.16b, v29.16b, v28.16b\n"
- "ldr q29, [x24, x14]\n"
+ "ldr q29, [x25, x15]\n"
+ "ldr q28, [x22, x15]\n"
"smax v19.16b, v27.16b, v26.16b\n"
- "ldr q28, [x21, x14]\n"
+ "ldr q26, [x9, x15]\n"
"smax v18.16b, v25.16b, v24.16b\n"
- "ldr q26, [x28, x14]\n"
- "smax v17.16b, v23.16b, v27.16b\n"
- "ldr q27, [x25, x14]\n"
- "smax v16.16b, v25.16b, v22.16b\n"
- "ldr q25, [x23, x14]\n"
+ "ldr q25, [x27, x15]\n"
+ "smax v17.16b, v27.16b, v23.16b\n"
+ "ldr q27, [x26, x15]\n"
+ "smax v16.16b, v24.16b, v22.16b\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "subs x20, x20, #0x1\n"
"smax v19.16b, v21.16b, v19.16b\n"
- "ldr q24, [x26, x14]\n"
- "smax v18.16b, v21.16b, v18.16b\n"
- "ldr q23, [x22, x14]\n"
- "smax v17.16b, v20.16b, v17.16b\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q22, [x21, x15]\n"
+ "smax v18.16b, v18.16b, v21.16b\n"
+ "smax v17.16b, v17.16b, v20.16b\n"
+ "add x15, x15, #0x10\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "str q19, [x12, x13]\n"
- "str q18, [x11, x13]\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
+ "str q19, [x14, x12]\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"smax v21.16b, v30.16b, v29.16b\n"
"smax v20.16b, v29.16b, v28.16b\n"
- "smax v19.16b, v27.16b, v26.16b\n"
+ "smax v16.16b, v27.16b, v26.16b\n"
"smax v18.16b, v25.16b, v24.16b\n"
- "smax v17.16b, v23.16b, v27.16b\n"
- "smax v16.16b, v25.16b, v22.16b\n"
- "smax v19.16b, v21.16b, v19.16b\n"
- "str q19, [x12, x13]\n"
- "smax v18.16b, v21.16b, v18.16b\n"
- "smax v17.16b, v20.16b, v17.16b\n"
- "str q18, [x11, x13]\n"
- "smax v16.16b, v20.16b, v16.16b\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
- "cbz x15, 4f\n"
+ "smax v17.16b, v27.16b, v23.16b\n"
+ "smax v19.16b, v24.16b, v22.16b\n"
+ "smax v16.16b, v21.16b, v16.16b\n"
+ "smax v18.16b, v18.16b, v21.16b\n"
+ "str q16, [x14, x12]\n"
+ "smax v17.16b, v17.16b, v20.16b\n"
+ "smax v16.16b, v20.16b, v19.16b\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
+ "cbz x16, 4f\n"
"3:" // Oddments
- "ldr b30, [x27, x14]\n"
- "subs x15, x15, #0x1\n"
- "ldr b29, [x24, x14]\n"
- "smax v21.16b, v30.16b, v29.16b\n"
- "ldr b28, [x21, x14]\n"
- "ldr b27, [x25, x14]\n"
- "smax v20.16b, v29.16b, v28.16b\n"
- "ldr b26, [x28, x14]\n"
- "ldr b25, [x23, x14]\n"
- "smax v19.16b, v27.16b, v26.16b\n"
- "ldr b24, [x26, x14]\n"
- "ldr b23, [x22, x14]\n"
- "smax v19.16b, v21.16b, v19.16b\n"
- "ldr b22, [x20, x14]\n"
- "add x14, x14, #0x1\n"
- "smax v18.16b, v25.16b, v24.16b\n"
- "str b19, [x12, x13]\n"
- "smax v17.16b, v23.16b, v27.16b\n"
- "smax v16.16b, v25.16b, v22.16b\n"
- "smax v18.16b, v21.16b, v18.16b\n"
- "str b18, [x11, x13]\n"
- "smax v17.16b, v20.16b, v17.16b\n"
- "smax v16.16b, v20.16b, v16.16b\n"
- "str b17, [x10, x13]\n"
- "str b16, [x9, x13]\n"
- "add x13, x13, #0x1\n"
+ "ldr b16, [x28, x15]\n"
+ "ldr b17, [x25, x15]\n"
+ "smax v23.16b, v16.16b, v17.16b\n"
+ "subs x16, x16, #0x1\n"
+ "ldr b16, [x22, x15]\n"
+ "ldr b22, [x26, x15]\n"
+ "smax v21.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x9, x15]\n"
+ "ldr b17, [x27, x15]\n"
+ "smax v16.16b, v22.16b, v16.16b\n"
+ "smax v20.16b, v23.16b, v16.16b\n"
+ "ldr b19, [x24, x15]\n"
+ "ldr b16, [x23, x15]\n"
+ "smax v18.16b, v17.16b, v19.16b\n"
+ "smax v17.16b, v22.16b, v16.16b\n"
+ "ldr b16, [x21, x15]\n"
+ "smax v16.16b, v19.16b, v16.16b\n"
+ "add x15, x15, #0x1\n"
+ "smax v18.16b, v18.16b, v23.16b\n"
+ "smax v17.16b, v17.16b, v21.16b\n"
+ "smax v16.16b, v21.16b, v16.16b\n"
+ "str b20, [x14, x12]\n"
+ "str b18, [x13, x12]\n"
+ "str b17, [x11, x12]\n"
+ "str b16, [x10, x12]\n"
+ "add x12, x12, #0x1\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
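Every register in this kernel was renumbered (note that x19 no longer appears in the clobber list), but the dataflow is unchanged: nine input pointers supply a 3x3 patch per channel, and the pairwise maxima of the middle row are computed once and shared between vertically adjacent outputs. A per-channel scalar reference of one decomposition consistent with the smax chains above (function and variable names hypothetical; the kernel vectorises this over 16 int8 channels per q-register):

    #include <algorithm>
    #include <cstdint>

    // 2x2 max pooling, stride 1, 2x2 output tile, one channel.
    // m_ml and m_mr are each reused by two outputs, which is why the
    // assembly needs fewer smax instructions than four independent
    // 2x2 reductions would.
    static void max_pool_2x2_s1_out2x2(const int8_t x[3][3], int8_t out[2][2])
    {
      const int8_t m_tl = std::max(x[0][0], x[0][1]);
      const int8_t m_tr = std::max(x[0][1], x[0][2]);
      const int8_t m_ml = std::max(x[1][0], x[1][1]); // shared: out[0][0], out[1][0]
      const int8_t m_mr = std::max(x[1][1], x[1][2]); // shared: out[0][1], out[1][1]
      const int8_t m_bl = std::max(x[2][0], x[2][1]);
      const int8_t m_br = std::max(x[2][1], x[2][2]);

      out[0][0] = std::max(m_tl, m_ml);
      out[0][1] = std::max(m_tr, m_mr);
      out[1][0] = std::max(m_ml, m_bl);
      out[1][1] = std::max(m_mr, m_br);
    }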
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp
index 6c4cd1467f..ba6d52f570 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_s8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-struct a64_s8_nhwc_max_generic_depthfirst
+struct a64_s8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_s8_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
a64_s8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_s8_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
index 5e4c84d23e..411fd11460 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -39,397 +40,395 @@ void a64_s8_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x80\n"
"movi v7.16b, #0x80\n"
- "mov x19, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x80\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v5.16b, #0x80\n"
- "movi v4.16b, #0x80\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "smax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax v22.16b, v31.16b, v30.16b\n"
- "ldr q3, [x23, x28]\n"
- "smax v18.16b, v29.16b, v28.16b\n"
- "smax v21.16b, v27.16b, v21.16b\n"
- "ldr q2, [x22, x28]\n"
- "smax v17.16b, v26.16b, v17.16b\n"
- "ldr q1, [x21, x28]\n"
- "smax v20.16b, v25.16b, v20.16b\n"
- "ldr q0, [x20, x28]\n"
- "smax v16.16b, v24.16b, v16.16b\n"
- "ldr q31, [x23, x27]\n"
+ "smax v23.16b, v4.16b, v3.16b\n"
+ "smax v19.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v22.16b, v2.16b, v1.16b\n"
+ "ldr q2, [x21, x26]\n"
+ "smax v18.16b, v27.16b, v21.16b\n"
+ "ldr q1, [x20, x26]\n"
+ "smax v21.16b, v0.16b, v31.16b\n"
+ "ldr q0, [x21, x24]\n"
+ "smax v17.16b, v26.16b, v20.16b\n"
+ "ldr q31, [x20, x24]\n"
+ "smax v20.16b, v30.16b, v29.16b\n"
+ "ldr q30, [x21, x23]\n"
+ "smax v16.16b, v25.16b, v24.16b\n"
+ "ldr q29, [x20, x23]\n"
"smax v19.16b, v23.16b, v19.16b\n"
- "ldr q30, [x22, x27]\n"
"smax v18.16b, v22.16b, v18.16b\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"smax v17.16b, v21.16b, v17.16b\n"
- "ldr q28, [x20, x27]\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x23, x26]\n"
- "smax v7.16b, v7.16b, v19.16b\n"
- "ldr q21, [x22, x26]\n"
- "smax v6.16b, v6.16b, v18.16b\n"
- "ldr q26, [x21, x26]\n"
- "smax v5.16b, v5.16b, v17.16b\n"
- "ldr q17, [x20, x26]\n"
- "smax v4.16b, v4.16b, v16.16b\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "smax v8.16b, v8.16b, v19.16b\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "smax v7.16b, v7.16b, v18.16b\n"
+ "smax v6.16b, v6.16b, v17.16b\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "smax v23.16b, v3.16b, v2.16b\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v22.16b, v31.16b, v30.16b\n"
- "smax v18.16b, v29.16b, v28.16b\n"
- "smax v21.16b, v27.16b, v21.16b\n"
- "smax v17.16b, v26.16b, v17.16b\n"
- "smax v20.16b, v25.16b, v20.16b\n"
- "smax v16.16b, v24.16b, v16.16b\n"
+ "smax v23.16b, v4.16b, v3.16b\n"
+ "smax v19.16b, v28.16b, v22.16b\n"
+ "smax v22.16b, v2.16b, v1.16b\n"
+ "smax v18.16b, v27.16b, v21.16b\n"
+ "smax v21.16b, v0.16b, v31.16b\n"
+ "smax v17.16b, v26.16b, v20.16b\n"
+ "smax v20.16b, v30.16b, v29.16b\n"
+ "smax v16.16b, v25.16b, v24.16b\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
"smax v17.16b, v21.16b, v17.16b\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "smax v7.16b, v7.16b, v19.16b\n"
- "smax v6.16b, v6.16b, v18.16b\n"
- "smax v5.16b, v5.16b, v17.16b\n"
- "smax v4.16b, v4.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v7.16b, v7.16b, v18.16b\n"
+ "smax v6.16b, v6.16b, v17.16b\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "smax v7.16b, v7.16b, v3.16b\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "smax v6.16b, v6.16b, v31.16b\n"
- "ldr q25, [x23, x25]\n"
- "smax v5.16b, v5.16b, v27.16b\n"
- "smax v4.16b, v4.16b, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x27, x27, #0x40\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v7.16b, #0x80\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x80\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "smax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "smax v7.16b, v7.16b, v19.16b\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "smax v23.16b, v3.16b, v2.16b\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v7.16b, v7.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "smax v7.16b, v7.16b, v3.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
- "movi v7.16b, #0x80\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 24f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v2.b }[14], [x22], #0x1\n"
- "ld1 { v1.b }[14], [x21], #0x1\n"
- "ld1 { v0.b }[14], [x20], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v2.b }[12], [x22], #0x1\n"
- "ld1 { v1.b }[12], [x21], #0x1\n"
- "ld1 { v0.b }[12], [x20], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v2.b }[10], [x22], #0x1\n"
- "ld1 { v1.b }[10], [x21], #0x1\n"
- "ld1 { v0.b }[10], [x20], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v2.b }[8], [x22], #0x1\n"
- "ld1 { v1.b }[8], [x21], #0x1\n"
- "ld1 { v0.b }[8], [x20], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v2.b }[6], [x22], #0x1\n"
- "ld1 { v1.b }[6], [x21], #0x1\n"
- "ld1 { v0.b }[6], [x20], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v2.b }[4], [x22], #0x1\n"
- "ld1 { v1.b }[4], [x21], #0x1\n"
- "ld1 { v0.b }[4], [x20], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v2.b }[2], [x22], #0x1\n"
- "ld1 { v1.b }[2], [x21], #0x1\n"
- "ld1 { v0.b }[2], [x20], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b2, [x22], #0x1\n"
- "ldr b1, [x21], #0x1\n"
- "ldr b0, [x20], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "smax v23.16b, v3.16b, v2.16b\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v7.16b, v7.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "subs x25, x25, #0x1\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b3, [x23], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "smax v7.16b, v7.16b, v3.16b\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"tbz %x[n_channels], #3, 38f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 35f\n"
- "st1 { v7.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[14], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[14], [%x[outptr]], #0x1\n"
"b 42f\n"
"35:" // Oddments: Store: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[12], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[12], [%x[outptr]], #0x1\n"
"b 42f\n"
"36:" // Oddments: Store: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 37f\n"
- "st1 { v7.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[10], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[10], [%x[outptr]], #0x1\n"
"b 42f\n"
"37:" // Oddments: Store: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[8], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[8], [%x[outptr]], #0x1\n"
"b 42f\n"
"38:" // Oddments: Store: Bit 3: Unset
"tbz %x[n_channels], #2, 40f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 39f\n"
- "st1 { v7.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[6], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[6], [%x[outptr]], #0x1\n"
"b 42f\n"
"39:" // Oddments: Store: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[4], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[4], [%x[outptr]], #0x1\n"
"b 42f\n"
"40:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 41f\n"
- "st1 { v7.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[2], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[2], [%x[outptr]], #0x1\n"
"b 42f\n"
"41:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[0], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
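The same register renaming applies here (the accumulators become v5-v8 and x19 drops out of the clobber list), while the reduction itself is untouched: each lane starts at INT8_MIN (the movi #0x80), consumes four input pointers per iteration, then drains the remainder one pointer at a time, with the tbz ladders in the oddments section loading sub-16-channel tails one power-of-two chunk at a time. A scalar model of the per-channel reduction (names hypothetical):

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the generic max-pooling reduction vectorised above.
    static void max_generic(uint64_t n_valid_cells, uint64_t n_channels,
                            const int8_t *const *inptrs, int8_t *outptr)
    {
      for (uint64_t c = 0; c < n_channels; c++)
      {
        int8_t acc = INT8_MIN; // movi v8.16b, #0x80
        uint64_t i = 0;
        for (; i + 4 <= n_valid_cells; i += 4) // "4 inputs loop"
        {
          const int8_t m01 = std::max(inptrs[i + 0][c], inptrs[i + 1][c]);
          const int8_t m23 = std::max(inptrs[i + 2][c], inptrs[i + 3][c]);
          acc = std::max(acc, std::max(m01, m23));
        }
        for (; i < n_valid_cells; i++) // "Single input loop"
        {
          acc = std::max(acc, inptrs[i][c]);
        }
        outptr[c] = acc;
      }
    }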
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp
index a50e99a009..d5d7313a90 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_s8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-struct a64_s8q_nhwc_avg_generic_depthfirst
+struct a64_s8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_s8q_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
a64_s8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_s8q_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index f288a4119c..019f402911 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
@@ -86,12 +87,13 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
// Combine together the rescale value for the requantization and the scaling
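The hunk above applies the same 64-bit-intermediate fix as the plain s8 kernel. The requantizing epilogue that follows then shifts each accumulator left by the quantization left shift, multiplies by the combined Q31 rescale with sqrdmulh, shifts right with srshl (right shifts are encoded as negative shift amounts), and clamps to the int8 range (smax/smin against not(0x7f) and 0x7f). Below is a scalar model of one lane following the architectural semantics of those instructions; it is an illustration, not library code, and it assumes shift amounts stay in [-31, 31]:

    #include <algorithm>
    #include <cstdint>

    // srshl: signed rounding shift left; negative shifts are rounding
    // right shifts, non-negative shifts are plain (wrapping) left shifts.
    static int32_t srshl(int32_t v, int32_t shift)
    {
      if (shift >= 0)
        return static_cast<int32_t>(static_cast<uint32_t>(v) << shift);
      const int32_t s = -shift;
      return static_cast<int32_t>((static_cast<int64_t>(v) + (1ll << (s - 1))) >> s);
    }

    // sqrdmulh: saturating rounding doubling multiply returning the high
    // half, i.e. (2*a*b + 2^31) >> 32, computed as (a*b + 2^30) >> 31 in
    // 64 bits; only INT32_MIN * INT32_MIN needs the saturation.
    static int32_t sqrdmulh(int32_t a, int32_t b)
    {
      const int64_t r = (static_cast<int64_t>(a) * b + (1ll << 30)) >> 31;
      return static_cast<int32_t>(std::min<int64_t>(r, INT32_MAX));
    }

    static int8_t requantize_lane(int32_t acc, int32_t left_shift,
                                  int32_t combined_rescale, int32_t right_shift)
    {
      int32_t v = srshl(acc, left_shift);  // srshl vN, vN, v18
      v = sqrdmulh(v, combined_rescale);   // sqrdmulh vN, vN, v17
      v = srshl(v, right_shift);           // srshl vN, vN, v16
      v = std::max(v, -128);               // smax against not(0x7f)
      v = std::min(v, 127);                // smin against 0x7f
      return static_cast<int8_t>(v);
    }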
@@ -112,17 +114,17 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
);
__asm__ __volatile__(
- "mov x26, #0x0\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x25, #0x20\n" // cntb _, ALL, #2
+ "mov x24, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -137,43 +139,43 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"movi v2.4s, #0x0\n"
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ldr q29, [x21, x25]\n"
- "ldr q28, [x20, x25]\n"
- "ldr q27, [x21, x24]\n"
- "ldr q26, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"saddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
"saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"saddl v21.8h, v29.8b, v28.8b\n"
- "subs x22, x22, #0x1\n"
"saddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q30, [x20, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"saddl v19.8h, v27.8b, v26.8b\n"
- "ldr q29, [x21, x25]\n"
"saddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q28, [x20, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"saddl v17.8h, v25.8b, v24.8b\n"
- "ldr q27, [x21, x24]\n"
"saddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q26, [x20, x24]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
+ "subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v23.4h\n"
- "ldr q25, [x21, x23]\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q24, [x20, x23]\n"
"saddw v13.4s, v13.4s, v22.4h\n"
"saddw2 v12.4s, v12.4s, v22.8h\n"
+ "add x22, x22, #0x10\n"
"saddw v11.4s, v11.4s, v21.4h\n"
"saddw2 v10.4s, v10.4s, v21.8h\n"
"saddw v9.4s, v9.4s, v20.4h\n"
@@ -213,23 +215,23 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"saddw v1.4s, v1.4s, v16.4h\n"
"saddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "sxtl v23.8h, v31.8b\n"
- "ldr q29, [x21, x25]\n"
- "sxtl2 v22.8h, v31.16b\n"
- "ldr q27, [x21, x24]\n"
- "ldr q25, [x21, x23]\n"
- "sxtl v21.8h, v29.8b\n"
- "sxtl2 v20.8h, v29.16b\n"
- "sxtl v19.8h, v27.8b\n"
- "sxtl2 v18.8h, v27.16b\n"
- "sxtl v17.8h, v25.8b\n"
- "sxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v23.8h, v16.8b\n"
+ "sxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "sxtl v21.8h, v16.8b\n"
+ "sxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "sxtl v19.8h, v17.8b\n"
+ "sxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
"saddw v13.4s, v13.4s, v22.4h\n"
@@ -248,217 +250,217 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"saddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "movi v20.4s, #0x7f\n"
- "ld1r { v19.4s }, [%x[combined_rescale_value]]\n"
- "sub %x[n_channels], %x[n_channels], #0x40\n"
"ld1r { v18.4s }, [%x[left_shift]]\n"
+ "ld1r { v17.4s }, [%x[combined_rescale_value]]\n"
"srshl v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[right_shift]]\n"
- "not v16.16b, v20.16b\n"
"srshl v14.4s, v14.4s, v18.4s\n"
- "cmp %x[n_channels], #0x40\n"
+ "ld1r { v16.4s }, [%x[right_shift]]\n"
"srshl v13.4s, v13.4s, v18.4s\n"
"srshl v12.4s, v12.4s, v18.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x40\n"
"srshl v11.4s, v11.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "sqrdmulh v14.4s, v14.4s, v19.4s\n"
- "sqrdmulh v13.4s, v13.4s, v19.4s\n"
- "sqrdmulh v12.4s, v12.4s, v19.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
- "sqrdmulh v11.4s, v11.4s, v19.4s\n"
"srshl v10.4s, v10.4s, v18.4s\n"
+ "cmp %x[n_channels], #0x40\n"
"srshl v9.4s, v9.4s, v18.4s\n"
"srshl v8.4s, v8.4s, v18.4s\n"
- "srshl v11.4s, v11.4s, v17.4s\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v8.4s, v8.4s, v19.4s\n"
"srshl v7.4s, v7.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v17.4s\n"
- "srshl v9.4s, v9.4s, v17.4s\n"
- "srshl v8.4s, v8.4s, v17.4s\n"
- "sqrdmulh v7.4s, v7.4s, v19.4s\n"
"srshl v6.4s, v6.4s, v18.4s\n"
"srshl v5.4s, v5.4s, v18.4s\n"
"srshl v4.4s, v4.4s, v18.4s\n"
- "srshl v7.4s, v7.4s, v17.4s\n"
- "sqrdmulh v6.4s, v6.4s, v19.4s\n"
- "sqrdmulh v5.4s, v5.4s, v19.4s\n"
- "sqrdmulh v4.4s, v4.4s, v19.4s\n"
"srshl v3.4s, v3.4s, v18.4s\n"
- "srshl v6.4s, v6.4s, v17.4s\n"
- "srshl v5.4s, v5.4s, v17.4s\n"
- "srshl v4.4s, v4.4s, v17.4s\n"
- "sqrdmulh v3.4s, v3.4s, v19.4s\n"
"srshl v2.4s, v2.4s, v18.4s\n"
"srshl v1.4s, v1.4s, v18.4s\n"
"srshl v0.4s, v0.4s, v18.4s\n"
- "srshl v3.4s, v3.4s, v17.4s\n"
- "sqrdmulh v2.4s, v2.4s, v19.4s\n"
- "sqrdmulh v1.4s, v1.4s, v19.4s\n"
- "sqrdmulh v0.4s, v0.4s, v19.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v17.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v17.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v17.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v17.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v17.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v17.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v17.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "srshl v10.4s, v10.4s, v16.4s\n"
+ "srshl v9.4s, v9.4s, v16.4s\n"
+ "srshl v8.4s, v8.4s, v16.4s\n"
+ "srshl v7.4s, v7.4s, v16.4s\n"
+ "srshl v6.4s, v6.4s, v16.4s\n"
+ "srshl v5.4s, v5.4s, v16.4s\n"
+ "srshl v4.4s, v4.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v16.4s\n"
+ "srshl v2.4s, v2.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v16.4s\n"
+ "srshl v0.4s, v0.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
- "srshl v2.4s, v2.4s, v17.4s\n"
- "srshl v1.4s, v1.4s, v17.4s\n"
- "srshl v0.4s, v0.4s, v17.4s\n"
- "smin v15.4s, v15.4s, v20.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v14.4s, v14.4s, v20.4s\n"
- "smin v13.4s, v13.4s, v20.4s\n"
- "smin v12.4s, v12.4s, v20.4s\n"
"smax v11.4s, v11.4s, v16.4s\n"
"smax v10.4s, v10.4s, v16.4s\n"
"smax v9.4s, v9.4s, v16.4s\n"
- "smin v11.4s, v11.4s, v20.4s\n"
- "smin v10.4s, v10.4s, v20.4s\n"
- "smin v9.4s, v9.4s, v20.4s\n"
"smax v8.4s, v8.4s, v16.4s\n"
"smax v7.4s, v7.4s, v16.4s\n"
"smax v6.4s, v6.4s, v16.4s\n"
- "smin v8.4s, v8.4s, v20.4s\n"
- "smin v7.4s, v7.4s, v20.4s\n"
- "smin v6.4s, v6.4s, v20.4s\n"
"smax v5.4s, v5.4s, v16.4s\n"
"smax v4.4s, v4.4s, v16.4s\n"
"smax v3.4s, v3.4s, v16.4s\n"
- "smin v5.4s, v5.4s, v20.4s\n"
- "smin v4.4s, v4.4s, v20.4s\n"
- "smin v3.4s, v3.4s, v20.4s\n"
"smax v2.4s, v2.4s, v16.4s\n"
"smax v1.4s, v1.4s, v16.4s\n"
"smax v0.4s, v0.4s, v16.4s\n"
- "smin v2.4s, v2.4s, v20.4s\n"
- "smin v1.4s, v1.4s, v20.4s\n"
- "smin v0.4s, v0.4s, v20.4s\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v8.4s, v8.4s, v17.4s\n"
+ "smin v7.4s, v7.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v17.4s\n"
+ "smin v5.4s, v5.4s, v17.4s\n"
+ "smin v4.4s, v4.4s, v17.4s\n"
+ "smin v3.4s, v3.4s, v17.4s\n"
+ "smin v2.4s, v2.4s, v17.4s\n"
+ "smin v1.4s, v1.4s, v17.4s\n"
+ "smin v0.4s, v0.4s, v17.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
- "uzp1 v21.16b, v9.16b, v8.16b\n"
- "uzp1 v20.16b, v7.16b, v6.16b\n"
+ "uzp1 v18.16b, v9.16b, v8.16b\n"
+ "uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
- "uzp1 v19.16b, v3.16b, v2.16b\n"
- "uzp1 v18.16b, v1.16b, v0.16b\n"
+ "uzp1 v20.16b, v3.16b, v2.16b\n"
+ "uzp1 v19.16b, v1.16b, v0.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x25]\n"
"add x25, x25, #0x40\n"
- "str q17, [%x[outptr], x24]\n"
+ "str q16, [%x[outptr], x24]\n"
"add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "saddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "subs x22, x22, #0x1\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q30, [x20, x26]\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "movi v20.4s, #0x7f\n"
- "ld1r { v19.4s }, [%x[combined_rescale_value]]\n"
- "sub %x[n_channels], %x[n_channels], #0x10\n"
"ld1r { v18.4s }, [%x[left_shift]]\n"
+ "ld1r { v17.4s }, [%x[combined_rescale_value]]\n"
"srshl v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[right_shift]]\n"
- "not v16.16b, v20.16b\n"
"srshl v14.4s, v14.4s, v18.4s\n"
- "cmp %x[n_channels], #0x10\n"
+ "ld1r { v16.4s }, [%x[right_shift]]\n"
"srshl v13.4s, v13.4s, v18.4s\n"
"srshl v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "sqrdmulh v14.4s, v14.4s, v19.4s\n"
- "sqrdmulh v13.4s, v13.4s, v19.4s\n"
- "sqrdmulh v12.4s, v12.4s, v19.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x10\n"
+ "sqrdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v17.4s\n"
+ "cmp %x[n_channels], #0x10\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v20.4s\n"
- "smin v14.4s, v14.4s, v20.4s\n"
- "smin v13.4s, v13.4s, v20.4s\n"
- "smin v12.4s, v12.4s, v20.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "add x26, x26, #0x10\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v15.4s, #0x0\n"
- "add %x[outptr], %x[outptr], x26\n"
"movi v14.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 24f\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
- "add x21, x21, x26\n"
- "add x20, x20, x26\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -519,21 +521,21 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"ldr b31, [x21], #0x1\n"
"ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "saddl v23.8h, v31.8b, v30.8b\n"
- "subs x22, x22, #0x1\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldr x21, [x19], #0x8\n"
- "add x21, x21, x26\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -579,43 +581,43 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 33f\n"
"ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "sxtl v23.8h, v31.8b\n"
- "subs x20, x20, #0x1\n"
- "sxtl2 v22.8h, v31.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "sxtl v17.8h, v31.8b\n"
+ "sxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "movi v20.4s, #0x7f\n"
- "ld1r { v19.4s }, [%x[combined_rescale_value]]\n"
- "not v16.16b, v20.16b\n"
"ld1r { v18.4s }, [%x[left_shift]]\n"
+ "ld1r { v17.4s }, [%x[combined_rescale_value]]\n"
"srshl v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[right_shift]]\n"
"srshl v14.4s, v14.4s, v18.4s\n"
+ "ld1r { v16.4s }, [%x[right_shift]]\n"
"srshl v13.4s, v13.4s, v18.4s\n"
"srshl v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "sqrdmulh v14.4s, v14.4s, v19.4s\n"
- "sqrdmulh v13.4s, v13.4s, v19.4s\n"
- "sqrdmulh v12.4s, v12.4s, v19.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v20.4s\n"
- "smin v14.4s, v14.4s, v20.4s\n"
- "smin v13.4s, v13.4s, v20.4s\n"
- "smin v12.4s, v12.4s, v20.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -661,12 +663,10 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_valid_cells] "r" (n_valid_cells), [right_shift] "r" (&right_shift)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
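
The epilogue above — srshl by the left shift, sqrdmulh by the combined rescale value, srshl by the right shift, then smax/smin against -0x80/0x7f — is the usual fixed-point requantisation; the reshuffling in this hunk appears to change only register allocation and scheduling, not the arithmetic. A scalar C++ model of what each lane computes (function and parameter names are illustrative, not from the source):

    #include <algorithm>
    #include <cstdint>

    // Models SQRDMULH: saturating rounding doubling multiply, high half.
    static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN)
            return INT32_MAX;  // the one case that saturates
        const int64_t prod = 2 * static_cast<int64_t>(a) * b + (1LL << 30);
        return static_cast<int32_t>(prod >> 31);
    }

    // Models SRSHL: left shift for non-negative amounts, rounding right
    // shift for negative ones.
    static int32_t rounding_shift(int32_t x, int32_t shift)
    {
        if (shift >= 0)
            return static_cast<int32_t>(static_cast<int64_t>(x) << shift);
        const int64_t round = 1LL << (-shift - 1);
        return static_cast<int32_t>((static_cast<int64_t>(x) + round) >> -shift);
    }

    // One lane of the epilogue: shift up, rescale, shift down, clamp to int8.
    int8_t requantize_s8(int32_t acc, int32_t left_shift,
                         int32_t combined_rescale, int32_t right_shift)
    {
        int32_t v = rounding_shift(acc, left_shift);
        v = rounding_doubling_high_mul(v, combined_rescale);
        v = rounding_shift(v, right_shift);
        return static_cast<int8_t>(std::clamp(v, -128, 127));  // smax/smin
    }
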
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp
index ea7f7f89fe..68e7a98d0a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_s8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-struct a64_s8q_nhwc_max_generic_depthfirst
+struct a64_s8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_s8q_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
a64_s8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_s8q_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
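
Each .hpp hunk in this patch follows the same pattern: the per-kernel struct drops its operand_type/return_type/kern_type boilerplate and instead derives from IGenericDepthfirstStrategy, exposing the kernel pointer through a get_kernel() override. The interface itself is not shown in this diff; the following is only a plausible sketch of its shape, inferred from the usage above (the Nothing default and the exact KernelType signature are assumptions):

    #include <cstdint>

    struct Nothing {};  // assumed placeholder for "no output stage"

    template <typename TInput, typename TOutput, typename OutputStage = Nothing>
    struct IGenericDepthfirstStrategy
    {
        // Matches the signature of the *_impl functions above; the
        // unquantised variants presumably use a specialisation without
        // the OutputStage argument.
        using KernelType = void (*)(uint64_t window_cells, uint64_t n_valid_cells,
                                    uint64_t n_channels, const TInput *const *inptrs,
                                    TOutput *outptr, const OutputStage &qp);
        virtual ~IGenericDepthfirstStrategy() = default;
        virtual KernelType get_kernel(void) const = 0;
    };
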
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
index a077121991..f7b8dc761c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,8 @@
*/
#include "pooling.hpp"
-#include <cstddef>
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -42,88 +42,88 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
- "mov x19, %x[inptrs]\n"
"movi v7.16b, #0x80\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x80\n"
"movi v5.16b, #0x80\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "smax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax v22.16b, v31.16b, v30.16b\n"
- "ldr q3, [x23, x28]\n"
- "smax v18.16b, v29.16b, v28.16b\n"
- "smax v21.16b, v27.16b, v21.16b\n"
- "ldr q2, [x22, x28]\n"
- "smax v17.16b, v26.16b, v17.16b\n"
- "ldr q1, [x21, x28]\n"
- "smax v20.16b, v25.16b, v20.16b\n"
- "ldr q0, [x20, x28]\n"
- "smax v16.16b, v24.16b, v16.16b\n"
- "ldr q31, [x23, x27]\n"
+ "smax v23.16b, v4.16b, v3.16b\n"
+ "smax v19.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v22.16b, v2.16b, v1.16b\n"
+ "ldr q2, [x21, x26]\n"
+ "smax v18.16b, v27.16b, v21.16b\n"
+ "ldr q1, [x20, x26]\n"
+ "smax v21.16b, v0.16b, v31.16b\n"
+ "ldr q0, [x21, x24]\n"
+ "smax v17.16b, v26.16b, v20.16b\n"
+ "ldr q31, [x20, x24]\n"
+ "smax v20.16b, v30.16b, v29.16b\n"
+ "ldr q30, [x21, x23]\n"
+ "smax v16.16b, v25.16b, v24.16b\n"
+ "ldr q29, [x20, x23]\n"
"smax v19.16b, v23.16b, v19.16b\n"
- "ldr q30, [x22, x27]\n"
"smax v18.16b, v22.16b, v18.16b\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"smax v17.16b, v21.16b, v17.16b\n"
- "ldr q28, [x20, x27]\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x23, x26]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
"smax v8.16b, v8.16b, v19.16b\n"
- "ldr q21, [x22, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"smax v7.16b, v7.16b, v18.16b\n"
- "ldr q26, [x21, x26]\n"
"smax v6.16b, v6.16b, v17.16b\n"
- "ldr q17, [x20, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"smax v5.16b, v5.16b, v16.16b\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "smax v23.16b, v3.16b, v2.16b\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v22.16b, v31.16b, v30.16b\n"
- "smax v18.16b, v29.16b, v28.16b\n"
- "smax v21.16b, v27.16b, v21.16b\n"
- "smax v17.16b, v26.16b, v17.16b\n"
- "smax v20.16b, v25.16b, v20.16b\n"
- "smax v16.16b, v24.16b, v16.16b\n"
+ "smax v23.16b, v4.16b, v3.16b\n"
+ "smax v19.16b, v28.16b, v22.16b\n"
+ "smax v22.16b, v2.16b, v1.16b\n"
+ "smax v18.16b, v27.16b, v21.16b\n"
+ "smax v21.16b, v0.16b, v31.16b\n"
+ "smax v17.16b, v26.16b, v20.16b\n"
+ "smax v20.16b, v30.16b, v29.16b\n"
+ "smax v16.16b, v25.16b, v24.16b\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
"smax v17.16b, v21.16b, v17.16b\n"
@@ -133,453 +133,453 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"smax v6.16b, v6.16b, v17.16b\n"
"smax v5.16b, v5.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "smax v8.16b, v8.16b, v3.16b\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "smax v7.16b, v7.16b, v31.16b\n"
- "ldr q25, [x23, x25]\n"
- "smax v6.16b, v6.16b, v27.16b\n"
- "smax v5.16b, v5.16b, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sxtl v23.8h, v8.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
"sxtl2 v22.8h, v8.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v4.4s }, [x20]\n"
"sxtl v21.8h, v7.8b\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "sxtl2 v20.8h, v7.16b\n"
- "ld1r { v2.4s }, [x19]\n"
- "sub %x[n_channels], %x[n_channels], #0x40\n"
- "sxtl v19.8h, v6.8b\n"
- "cmp %x[n_channels], #0x40\n"
- "sxtl2 v18.8h, v6.16b\n"
+ "sxtl2 v18.8h, v7.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "sxtl v20.8h, v6.8b\n"
+ "sxtl2 v19.8h, v6.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v2.4s }, [x20]\n"
"sxtl v17.8h, v5.8b\n"
"sxtl2 v16.8h, v5.16b\n"
+ "sub %x[n_channels], %x[n_channels], #0x40\n"
+ "cmp %x[n_channels], #0x40\n"
"sxtl v1.4s, v23.4h\n"
"sxtl2 v23.4s, v23.8h\n"
"sxtl v0.4s, v22.4h\n"
"sxtl2 v31.4s, v22.8h\n"
"sxtl v30.4s, v21.4h\n"
"sxtl2 v22.4s, v21.8h\n"
- "sxtl v29.4s, v20.4h\n"
+ "sxtl v29.4s, v18.4h\n"
+ "sxtl2 v18.4s, v18.8h\n"
+ "sxtl v28.4s, v20.4h\n"
"sxtl2 v21.4s, v20.8h\n"
- "sxtl v28.4s, v19.4h\n"
- "sxtl2 v20.4s, v19.8h\n"
- "sxtl v27.4s, v18.4h\n"
- "sxtl2 v26.4s, v18.8h\n"
+ "sxtl v27.4s, v19.4h\n"
+ "sxtl2 v26.4s, v19.8h\n"
"sxtl v25.4s, v17.4h\n"
- "sxtl2 v19.4s, v17.8h\n"
+ "sxtl2 v20.4s, v17.8h\n"
"sxtl v24.4s, v16.4h\n"
- "sxtl2 v18.4s, v16.8h\n"
- "srshl v1.4s, v1.4s, v3.4s\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v1.4s, v1.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v0.4s, v0.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "sxtl2 v19.4s, v16.8h\n"
+ "srshl v1.4s, v1.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v4.4s\n"
+ "srshl v0.4s, v0.4s, v4.4s\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
+ "srshl v27.4s, v27.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v4.4s\n"
+ "srshl v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v3.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v3.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v3.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v3.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v3.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v3.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v3.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v3.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v3.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v3.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v3.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v3.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v3.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v3.4s\n"
+ "movi v17.4s, #0x7f\n"
"srshl v1.4s, v1.4s, v2.4s\n"
"srshl v23.4s, v23.4s, v2.4s\n"
"srshl v0.4s, v0.4s, v2.4s\n"
"srshl v31.4s, v31.4s, v2.4s\n"
- "srshl v30.4s, v30.4s, v3.4s\n"
- "srshl v22.4s, v22.4s, v3.4s\n"
- "srshl v29.4s, v29.4s, v3.4s\n"
- "srshl v21.4s, v21.4s, v3.4s\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
"srshl v30.4s, v30.4s, v2.4s\n"
"srshl v22.4s, v22.4s, v2.4s\n"
"srshl v29.4s, v29.4s, v2.4s\n"
- "srshl v21.4s, v21.4s, v2.4s\n"
- "srshl v28.4s, v28.4s, v3.4s\n"
- "srshl v20.4s, v20.4s, v3.4s\n"
- "srshl v27.4s, v27.4s, v3.4s\n"
- "srshl v26.4s, v26.4s, v3.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
"srshl v28.4s, v28.4s, v2.4s\n"
- "srshl v20.4s, v20.4s, v2.4s\n"
+ "srshl v21.4s, v21.4s, v2.4s\n"
"srshl v27.4s, v27.4s, v2.4s\n"
"srshl v26.4s, v26.4s, v2.4s\n"
- "srshl v25.4s, v25.4s, v3.4s\n"
- "srshl v19.4s, v19.4s, v3.4s\n"
- "srshl v24.4s, v24.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v3.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
"srshl v25.4s, v25.4s, v2.4s\n"
- "srshl v19.4s, v19.4s, v2.4s\n"
+ "srshl v20.4s, v20.4s, v2.4s\n"
"srshl v24.4s, v24.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "movi v17.4s, #0x7f\n"
+ "srshl v19.4s, v19.4s, v2.4s\n"
"not v16.16b, v17.16b\n"
"smax v1.4s, v1.4s, v16.4s\n"
"smax v23.4s, v23.4s, v16.4s\n"
"smax v0.4s, v0.4s, v16.4s\n"
"smax v31.4s, v31.4s, v16.4s\n"
+ "smax v30.4s, v30.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v29.4s, v29.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smax v28.4s, v28.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v27.4s, v27.4s, v16.4s\n"
+ "smax v26.4s, v26.4s, v16.4s\n"
+ "smax v25.4s, v25.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v24.4s, v24.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
"smin v1.4s, v1.4s, v17.4s\n"
"smin v23.4s, v23.4s, v17.4s\n"
"smin v0.4s, v0.4s, v17.4s\n"
"smin v31.4s, v31.4s, v17.4s\n"
- "smax v30.4s, v30.4s, v16.4s\n"
- "smax v22.4s, v22.4s, v16.4s\n"
- "smax v29.4s, v29.4s, v16.4s\n"
"smin v30.4s, v30.4s, v17.4s\n"
"smin v22.4s, v22.4s, v17.4s\n"
"smin v29.4s, v29.4s, v17.4s\n"
- "smax v21.4s, v21.4s, v16.4s\n"
- "smax v28.4s, v28.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v17.4s\n"
+ "smin v18.4s, v18.4s, v17.4s\n"
"smin v28.4s, v28.4s, v17.4s\n"
- "smin v20.4s, v20.4s, v17.4s\n"
- "smax v27.4s, v27.4s, v16.4s\n"
- "smax v26.4s, v26.4s, v16.4s\n"
- "smax v25.4s, v25.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v17.4s\n"
"smin v27.4s, v27.4s, v17.4s\n"
"smin v26.4s, v26.4s, v17.4s\n"
"smin v25.4s, v25.4s, v17.4s\n"
- "smax v19.4s, v19.4s, v16.4s\n"
- "smax v24.4s, v24.4s, v16.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "smin v19.4s, v19.4s, v17.4s\n"
+ "smin v20.4s, v20.4s, v17.4s\n"
"smin v24.4s, v24.4s, v17.4s\n"
- "smin v18.4s, v18.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
"uzp1 v23.16b, v1.16b, v23.16b\n"
"uzp1 v16.16b, v0.16b, v31.16b\n"
"uzp1 v22.16b, v30.16b, v22.16b\n"
- "uzp1 v21.16b, v29.16b, v21.16b\n"
- "uzp1 v20.16b, v28.16b, v20.16b\n"
+ "uzp1 v18.16b, v29.16b, v18.16b\n"
+ "uzp1 v21.16b, v28.16b, v21.16b\n"
"uzp1 v17.16b, v27.16b, v26.16b\n"
- "uzp1 v19.16b, v25.16b, v19.16b\n"
- "uzp1 v18.16b, v24.16b, v18.16b\n"
+ "uzp1 v20.16b, v25.16b, v20.16b\n"
+ "uzp1 v19.16b, v24.16b, v19.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
- "add x28, x28, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
"str q16, [%x[outptr], x27]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
"add x27, x27, #0x40\n"
- "str q17, [%x[outptr], x26]\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "str q16, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
+ "str q17, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q16, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "smax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "smax v8.16b, v8.16b, v19.16b\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "smax v23.16b, v3.16b, v2.16b\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "smax v8.16b, v8.16b, v3.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "sxtl v23.8h, v8.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
- "sxtl2 v22.8h, v8.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "movi v17.4s, #0x7f\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "sxtl v1.4s, v23.4h\n"
- "ld1r { v2.4s }, [x19]\n"
- "not v16.16b, v17.16b\n"
- "sxtl2 v23.4s, v23.8h\n"
+ "sxtl v17.8h, v8.8b\n"
+ "sxtl2 v16.8h, v8.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "sxtl v21.4s, v17.4h\n"
+ "sxtl2 v20.4s, v17.8h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "sxtl v19.4s, v16.4h\n"
+ "sxtl2 v18.4s, v16.8h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v20.4s, v20.4s, v22.4s\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "sxtl v0.4s, v22.4h\n"
"cmp %x[n_channels], #0x10\n"
- "sxtl2 v31.4s, v22.8h\n"
- "srshl v1.4s, v1.4s, v3.4s\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v1.4s, v1.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v0.4s, v0.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "srshl v1.4s, v1.4s, v2.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v2.4s\n"
- "srshl v31.4s, v31.4s, v2.4s\n"
- "smax v1.4s, v1.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v23.4s, v23.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
- "smin v31.4s, v31.4s, v17.4s\n"
- "uzp1 v23.16b, v1.16b, v23.16b\n"
- "uzp1 v16.16b, v0.16b, v31.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v17.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v21.4s, v21.4s, v16.4s\n"
+ "srshl v20.4s, v20.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v17.4s\n"
+ "smin v20.4s, v20.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
+ "smin v18.4s, v18.4s, v17.4s\n"
+ "uzp1 v17.16b, v21.16b, v20.16b\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x80\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 24f\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v2.b }[14], [x22], #0x1\n"
- "ld1 { v1.b }[14], [x21], #0x1\n"
- "ld1 { v0.b }[14], [x20], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v2.b }[12], [x22], #0x1\n"
- "ld1 { v1.b }[12], [x21], #0x1\n"
- "ld1 { v0.b }[12], [x20], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v2.b }[10], [x22], #0x1\n"
- "ld1 { v1.b }[10], [x21], #0x1\n"
- "ld1 { v0.b }[10], [x20], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v2.b }[8], [x22], #0x1\n"
- "ld1 { v1.b }[8], [x21], #0x1\n"
- "ld1 { v0.b }[8], [x20], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v2.b }[6], [x22], #0x1\n"
- "ld1 { v1.b }[6], [x21], #0x1\n"
- "ld1 { v0.b }[6], [x20], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v2.b }[4], [x22], #0x1\n"
- "ld1 { v1.b }[4], [x21], #0x1\n"
- "ld1 { v0.b }[4], [x20], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v2.b }[2], [x22], #0x1\n"
- "ld1 { v1.b }[2], [x21], #0x1\n"
- "ld1 { v0.b }[2], [x20], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b2, [x22], #0x1\n"
- "ldr b1, [x21], #0x1\n"
- "ldr b0, [x20], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "smax v23.16b, v3.16b, v2.16b\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "subs x25, x25, #0x1\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b3, [x23], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "smax v8.16b, v8.16b, v3.16b\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "sxtl v23.8h, v8.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
- "sxtl2 v22.8h, v8.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "sxtl v17.8h, v8.8b\n"
+ "sxtl2 v16.8h, v8.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "sxtl v21.4s, v17.4h\n"
+ "sxtl2 v20.4s, v17.8h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "sxtl v19.4s, v16.4h\n"
+ "sxtl2 v18.4s, v16.8h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v20.4s, v20.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v17.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
"movi v17.4s, #0x7f\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "sxtl v1.4s, v23.4h\n"
- "ld1r { v2.4s }, [x19]\n"
+ "srshl v21.4s, v21.4s, v16.4s\n"
+ "srshl v20.4s, v20.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v16.4s\n"
"not v16.16b, v17.16b\n"
- "sxtl2 v23.4s, v23.8h\n"
- "sxtl v0.4s, v22.4h\n"
- "sxtl2 v31.4s, v22.8h\n"
- "srshl v1.4s, v1.4s, v3.4s\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v1.4s, v1.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v0.4s, v0.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "srshl v1.4s, v1.4s, v2.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v2.4s\n"
- "srshl v31.4s, v31.4s, v2.4s\n"
- "smax v1.4s, v1.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v23.4s, v23.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
- "smin v31.4s, v31.4s, v17.4s\n"
- "uzp1 v23.16b, v1.16b, v23.16b\n"
- "uzp1 v16.16b, v0.16b, v31.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v17.4s\n"
+ "smin v20.4s, v20.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
+ "smin v18.4s, v18.4s, v17.4s\n"
+ "uzp1 v17.16b, v21.16b, v20.16b\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -625,12 +625,10 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [quant_params] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
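
As in the average-pooling kernel above, this hunk renames registers and reschedules loads without changing the computation: the accumulator vectors start at 0x80 per byte (INT8_MIN), every valid input cell is folded in with smax, and the result is requantised through the per-layer left shift, multiplier, and right shift read from the Requantize32 parameters. A scalar sketch of the reduction, with the requantisation step omitted (it follows the same srshl/sqrdmulh/srshl model sketched after the previous kernel):

    #include <algorithm>
    #include <cstdint>

    // Per-channel max over all valid pooling-window cells; mirrors the
    // "movi v8.16b, #0x80" initialisation and the smax reduction above.
    void max_pool_generic(uint64_t n_valid_cells, uint64_t n_channels,
                          const int8_t *const *inptrs, int8_t *outptr)
    {
        for (uint64_t c = 0; c < n_channels; c++)
        {
            int8_t acc = INT8_MIN;
            for (uint64_t i = 0; i < n_valid_cells; i++)
                acc = std::max(acc, inptrs[i][c]);
            outptr[c] = acc;  // requantisation omitted for brevity
        }
    }
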
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp
index 230952452b..97818595e8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_u8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-struct a64_u8_nhwc_avg_generic_depthfirst
+struct a64_u8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_u8_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
a64_u8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_u8_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
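
Besides the register renaming, the generic.cpp hunk below carries the one behavioural fix in this file: the rounded rescale value is now computed in 64 bits before narrowing, so a result of exactly 1ll << 31 is renormalised instead of overflowing the int32_t. A standalone sketch of the fixed computation (the function wrapper and reference parameters are illustrative; in the source, shift_value and rescale_value are locals of the enclosing function):

    #include <cmath>
    #include <cstdint>

    // Round the floating-point rescale factor into Q31, renormalising when
    // rounding lands exactly on 2^31 (which does not fit in int32_t).
    void compute_rescale(float f_rescale_value, int32_t &rescale_value,
                         int32_t &shift_value)
    {
        int64_t long_rescale_value = static_cast<int64_t>(
            std::round(f_rescale_value * static_cast<float>(1ll << 31)));
        if (long_rescale_value == (1ll << 31))
        {
            shift_value++;  // absorb the extra factor of two into the shift
            long_rescale_value >>= 1;
        }
        rescale_value = static_cast<int32_t>(long_rescale_value);
    }
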
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
index 2c8a29248d..f8984c451c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
@@ -84,26 +85,27 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
__asm__ __volatile__(
- "mov x26, #0x0\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x25, #0x20\n" // cntb _, ALL, #2
+ "mov x24, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -118,43 +120,43 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"movi v2.4s, #0x0\n"
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ldr q29, [x21, x25]\n"
- "ldr q28, [x20, x25]\n"
- "ldr q27, [x21, x24]\n"
- "ldr q26, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"uaddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
"uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"uaddl v21.8h, v29.8b, v28.8b\n"
- "subs x22, x22, #0x1\n"
"uaddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q30, [x20, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"uaddl v19.8h, v27.8b, v26.8b\n"
- "ldr q29, [x21, x25]\n"
"uaddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q28, [x20, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"uaddl v17.8h, v25.8b, v24.8b\n"
- "ldr q27, [x21, x24]\n"
"uaddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q26, [x20, x24]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
+ "subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
- "ldr q25, [x21, x23]\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q24, [x20, x23]\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
"uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "add x22, x22, #0x10\n"
"uaddw v11.4s, v11.4s, v21.4h\n"
"uaddw2 v10.4s, v10.4s, v21.8h\n"
"uaddw v9.4s, v9.4s, v20.4h\n"
@@ -194,23 +196,23 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"uaddw v1.4s, v1.4s, v16.4h\n"
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "uxtl v23.8h, v31.8b\n"
- "ldr q29, [x21, x25]\n"
- "uxtl2 v22.8h, v31.16b\n"
- "ldr q27, [x21, x24]\n"
- "ldr q25, [x21, x23]\n"
- "uxtl v21.8h, v29.8b\n"
- "uxtl2 v20.8h, v29.16b\n"
- "uxtl v19.8h, v27.8b\n"
- "uxtl2 v18.8h, v27.16b\n"
- "uxtl v17.8h, v25.8b\n"
- "uxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v23.8h, v16.8b\n"
+ "uxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "uxtl v21.8h, v16.8b\n"
+ "uxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "uxtl v19.8h, v17.8b\n"
+ "uxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
@@ -229,195 +231,195 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "movi v19.4s, #0x0\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
- "sub %x[n_channels], %x[n_channels], #0x40\n"
- "movi v17.4s, #0xff\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
"ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
- "sqdmulh v11.4s, v11.4s, v18.4s\n"
+ "sqdmulh v11.4s, v11.4s, v17.4s\n"
+ "sqdmulh v10.4s, v10.4s, v17.4s\n"
+ "sqdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqdmulh v8.4s, v8.4s, v17.4s\n"
+ "sqdmulh v7.4s, v7.4s, v17.4s\n"
+ "sqdmulh v6.4s, v6.4s, v17.4s\n"
+ "sqdmulh v5.4s, v5.4s, v17.4s\n"
+ "sqdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqdmulh v3.4s, v3.4s, v17.4s\n"
+ "sqdmulh v2.4s, v2.4s, v17.4s\n"
+ "sqdmulh v1.4s, v1.4s, v17.4s\n"
+ "sqdmulh v0.4s, v0.4s, v17.4s\n"
"srshl v15.4s, v15.4s, v16.4s\n"
"srshl v14.4s, v14.4s, v16.4s\n"
"srshl v13.4s, v13.4s, v16.4s\n"
"srshl v12.4s, v12.4s, v16.4s\n"
"srshl v11.4s, v11.4s, v16.4s\n"
- "sqdmulh v10.4s, v10.4s, v18.4s\n"
- "sqdmulh v9.4s, v9.4s, v18.4s\n"
- "sqdmulh v8.4s, v8.4s, v18.4s\n"
- "sqdmulh v7.4s, v7.4s, v18.4s\n"
"srshl v10.4s, v10.4s, v16.4s\n"
"srshl v9.4s, v9.4s, v16.4s\n"
"srshl v8.4s, v8.4s, v16.4s\n"
"srshl v7.4s, v7.4s, v16.4s\n"
- "sqdmulh v6.4s, v6.4s, v18.4s\n"
- "sqdmulh v5.4s, v5.4s, v18.4s\n"
- "sqdmulh v4.4s, v4.4s, v18.4s\n"
- "sqdmulh v3.4s, v3.4s, v18.4s\n"
"srshl v6.4s, v6.4s, v16.4s\n"
"srshl v5.4s, v5.4s, v16.4s\n"
"srshl v4.4s, v4.4s, v16.4s\n"
"srshl v3.4s, v3.4s, v16.4s\n"
- "sqdmulh v2.4s, v2.4s, v18.4s\n"
- "sqdmulh v1.4s, v1.4s, v18.4s\n"
- "sqdmulh v0.4s, v0.4s, v18.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
"srshl v2.4s, v2.4s, v16.4s\n"
"srshl v1.4s, v1.4s, v16.4s\n"
"srshl v0.4s, v0.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v17.4s\n"
- "smax v14.4s, v14.4s, v19.4s\n"
- "smax v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v17.4s\n"
- "smin v13.4s, v13.4s, v17.4s\n"
- "smin v12.4s, v12.4s, v17.4s\n"
- "smax v11.4s, v11.4s, v19.4s\n"
- "smax v10.4s, v10.4s, v19.4s\n"
- "smax v9.4s, v9.4s, v19.4s\n"
- "smin v11.4s, v11.4s, v17.4s\n"
- "smin v10.4s, v10.4s, v17.4s\n"
- "smin v9.4s, v9.4s, v17.4s\n"
- "smax v8.4s, v8.4s, v19.4s\n"
- "smax v7.4s, v7.4s, v19.4s\n"
- "smax v6.4s, v6.4s, v19.4s\n"
- "smin v8.4s, v8.4s, v17.4s\n"
- "smin v7.4s, v7.4s, v17.4s\n"
- "smin v6.4s, v6.4s, v17.4s\n"
- "smax v5.4s, v5.4s, v19.4s\n"
- "smax v4.4s, v4.4s, v19.4s\n"
- "smax v3.4s, v3.4s, v19.4s\n"
- "smin v5.4s, v5.4s, v17.4s\n"
- "smin v4.4s, v4.4s, v17.4s\n"
- "smin v3.4s, v3.4s, v17.4s\n"
- "smax v2.4s, v2.4s, v19.4s\n"
- "smax v1.4s, v1.4s, v19.4s\n"
- "smax v0.4s, v0.4s, v19.4s\n"
- "smin v2.4s, v2.4s, v17.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v7.4s, v7.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v16.4s\n"
+ "smax v5.4s, v5.4s, v16.4s\n"
+ "smax v4.4s, v4.4s, v16.4s\n"
+ "smax v3.4s, v3.4s, v16.4s\n"
+ "smax v2.4s, v2.4s, v16.4s\n"
+ "smax v1.4s, v1.4s, v16.4s\n"
+ "smax v0.4s, v0.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "smin v11.4s, v11.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v16.4s\n"
+ "smin v8.4s, v8.4s, v16.4s\n"
+ "smin v7.4s, v7.4s, v16.4s\n"
+ "smin v6.4s, v6.4s, v16.4s\n"
+ "smin v5.4s, v5.4s, v16.4s\n"
+ "smin v4.4s, v4.4s, v16.4s\n"
+ "smin v3.4s, v3.4s, v16.4s\n"
+ "smin v2.4s, v2.4s, v16.4s\n"
+ "smin v1.4s, v1.4s, v16.4s\n"
+ "smin v0.4s, v0.4s, v16.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
- "uzp1 v21.16b, v9.16b, v8.16b\n"
- "uzp1 v20.16b, v7.16b, v6.16b\n"
+ "uzp1 v18.16b, v9.16b, v8.16b\n"
+ "uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
- "uzp1 v19.16b, v3.16b, v2.16b\n"
- "uzp1 v18.16b, v1.16b, v0.16b\n"
+ "uzp1 v20.16b, v3.16b, v2.16b\n"
+ "uzp1 v19.16b, v1.16b, v0.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x25]\n"
"add x25, x25, #0x40\n"
- "str q17, [%x[outptr], x24]\n"
+ "str q16, [%x[outptr], x24]\n"
"add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "subs x22, x22, #0x1\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q30, [x20, x26]\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "movi v19.4s, #0x0\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
- "sub %x[n_channels], %x[n_channels], #0x10\n"
- "movi v17.4s, #0xff\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
"ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
"srshl v15.4s, v15.4s, v16.4s\n"
"srshl v14.4s, v14.4s, v16.4s\n"
"srshl v13.4s, v13.4s, v16.4s\n"
"srshl v12.4s, v12.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
- "smax v14.4s, v14.4s, v19.4s\n"
- "smax v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v19.4s\n"
- "smin v15.4s, v15.4s, v17.4s\n"
- "smin v14.4s, v14.4s, v17.4s\n"
- "smin v13.4s, v13.4s, v17.4s\n"
- "smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "add x26, x26, #0x10\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v15.4s, #0x0\n"
- "add %x[outptr], %x[outptr], x26\n"
"movi v14.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 24f\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
- "add x21, x21, x26\n"
- "add x20, x20, x26\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -478,21 +480,21 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"ldr b31, [x21], #0x1\n"
"ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "subs x22, x22, #0x1\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldr x21, [x19], #0x8\n"
- "add x21, x21, x26\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -538,38 +540,38 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 33f\n"
"ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "uxtl v23.8h, v31.8b\n"
- "subs x20, x20, #0x1\n"
- "uxtl2 v22.8h, v31.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uxtl v17.8h, v31.8b\n"
+ "uxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "movi v19.4s, #0x0\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
- "movi v17.4s, #0xff\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
"ld1r { v16.4s }, [%x[shift_ptr]]\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
"srshl v15.4s, v15.4s, v16.4s\n"
"srshl v14.4s, v14.4s, v16.4s\n"
"srshl v13.4s, v13.4s, v16.4s\n"
"srshl v12.4s, v12.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
- "smax v14.4s, v14.4s, v19.4s\n"
- "smax v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v19.4s\n"
- "smin v15.4s, v15.4s, v17.4s\n"
- "smin v14.4s, v14.4s, v17.4s\n"
- "smin v13.4s, v13.4s, v17.4s\n"
- "smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -615,12 +617,10 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
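
Throughout the average-pooling epilogues above, every 32-bit accumulator lane is requantized with the same sqdmulh / srshl / smax / smin sequence before uzp1 narrows the vectors back to bytes. A scalar sketch of what one lane goes through, assuming (as holds for this kernel) that the value loaded from shift_ptr is non-positive; requantize_lane is an illustrative name, not a function from the library:

#include <algorithm>
#include <cstdint>

// Scalar model of the epilogue applied to each accumulator lane:
// sqdmulh (doubling high-half multiply by a Q31 rescale value), then
// srshl with a non-positive shift (rounding arithmetic shift right),
// then smax/smin clamping into the uint8 range.
static uint8_t requantize_lane(int32_t acc, int32_t rescale, int32_t shift)
{
    // sqdmulh: (2 * acc * rescale) >> 32, i.e. (acc * rescale) >> 31.
    // The saturating edge case (both operands INT32_MIN) cannot occur here.
    int32_t prod = static_cast<int32_t>((static_cast<int64_t>(acc) * rescale) >> 31);
    // srshl by a negative amount adds 1 << (n - 1) before shifting right by n.
    int32_t rounded = prod;
    if (shift < 0)
    {
        rounded = static_cast<int32_t>(
            (static_cast<int64_t>(prod) + (int64_t(1) << (-shift - 1))) >> -shift);
    }
    // smax with 0, then smin with 0xff, before uzp1 narrows to bytes.
    return static_cast<uint8_t>(std::clamp<int32_t>(rounded, 0, 255));
}
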
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 0103de812d..9d160bf8f8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,28 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst
+struct a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ using Parent = DepthfirstStrategy<uint8_t, uint8_t>;
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
+ a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
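
The header rewrite above replaces the per-kernel typedefs and constexpr accessors with a DepthfirstStrategy parent that owns the geometry and exposes the kernel through get_kernel(). The parent's definition is not part of this diff; the following is a minimal sketch of the interface implied by the hunk (inferred, so the real class will differ in detail):

#include <cstdint>

// Inferred shape of the parent: the constructor takes the pool, stride and
// output dimensions that each struct previously exposed as constexpr static
// functions, and dispatch goes through get_kernel() instead of a public
// 'kernel' member. Names mirror the hunk; this is not the library's code.
template <typename TInput, typename TOutput>
class DepthfirstStrategy
{
public:
    using KernelType = void (*)(unsigned int, const TInput *const *, TOutput *const *,
                                bool, unsigned int, unsigned int, unsigned int, unsigned int);

    DepthfirstStrategy(unsigned int pool_rows, unsigned int pool_cols,
                       unsigned int stride_rows, unsigned int stride_cols,
                       unsigned int out_rows, unsigned int out_cols)
        : m_pool_rows(pool_rows), m_pool_cols(pool_cols),
          m_stride_rows(stride_rows), m_stride_cols(stride_cols),
          m_out_rows(out_rows), m_out_cols(out_cols) {}

    virtual ~DepthfirstStrategy() = default;
    virtual KernelType get_kernel(void) const = 0;

private:
    unsigned int m_pool_rows, m_pool_cols;
    unsigned int m_stride_rows, m_stride_cols;
    unsigned int m_out_rows, m_out_cols;
};
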
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 02c43ccaba..66cdb7f849 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -61,114 +63,115 @@ void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
- "mov x14, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x15, #0x10\n"
- "ldp x12, x11, [x20, #0x0]\n"
- "ldp x10, x9, [x20, #0x10]\n"
- "ldp x28, x27, [x19, #0x0]\n"
- "ldp x26, x25, [x19, #0x10]\n"
- "ldp x24, x23, [x19, #0x20]\n"
- "ldp x22, x21, [x19, #0x30]\n"
- "ldr x20, [x19, #0x40]\n"
+ "ldr x16, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "cmp x16, #0x10\n"
+ "mov x15, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [x21, #0x10]\n"
+ "ldp x9, x28, [x20, #0x0]\n"
+ "ldp x27, x26, [x20, #0x10]\n"
+ "ldp x25, x24, [x20, #0x20]\n"
+ "ldp x23, x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x40]\n"
"blt 3f\n"
- "ldr q30, [x27, x14]\n"
- "lsr x19, x15, #0x4\n"
- "ldr q29, [x24, x14]\n"
- "sub x15, x15, x19, LSL #4\n"
- "ldr q28, [x21, x14]\n"
- "subs x19, x19, #0x1\n"
- "ldr q27, [x25, x14]\n"
- "ldr q26, [x28, x14]\n"
- "ldr q25, [x23, x14]\n"
- "ldr q24, [x26, x14]\n"
- "ldr q23, [x22, x14]\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q30, [x28, x15]\n"
+ "ldr q29, [x25, x15]\n"
+ "lsr x20, x16, #0x4\n"
+ "sub x16, x16, x20, LSL #4\n"
+ "ldr q28, [x22, x15]\n"
+ "ldr q27, [x26, x15]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q26, [x9, x15]\n"
+ "ldr q25, [x27, x15]\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "ldr q22, [x21, x15]\n"
+ "add x15, x15, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
"umax v21.16b, v30.16b, v29.16b\n"
- "ldr q30, [x27, x14]\n"
- "subs x19, x19, #0x1\n"
+ "ldr q30, [x28, x15]\n"
"umax v20.16b, v29.16b, v28.16b\n"
- "ldr q29, [x24, x14]\n"
+ "ldr q29, [x25, x15]\n"
+ "ldr q28, [x22, x15]\n"
"umax v19.16b, v27.16b, v26.16b\n"
- "ldr q28, [x21, x14]\n"
+ "ldr q26, [x9, x15]\n"
"umax v18.16b, v25.16b, v24.16b\n"
- "ldr q26, [x28, x14]\n"
- "umax v17.16b, v23.16b, v27.16b\n"
- "ldr q27, [x25, x14]\n"
- "umax v16.16b, v25.16b, v22.16b\n"
- "ldr q25, [x23, x14]\n"
+ "ldr q25, [x27, x15]\n"
+ "umax v17.16b, v27.16b, v23.16b\n"
+ "ldr q27, [x26, x15]\n"
+ "umax v16.16b, v24.16b, v22.16b\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "subs x20, x20, #0x1\n"
"umax v19.16b, v21.16b, v19.16b\n"
- "ldr q24, [x26, x14]\n"
- "umax v18.16b, v21.16b, v18.16b\n"
- "ldr q23, [x22, x14]\n"
- "umax v17.16b, v20.16b, v17.16b\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q22, [x21, x15]\n"
+ "umax v18.16b, v18.16b, v21.16b\n"
+ "umax v17.16b, v17.16b, v20.16b\n"
+ "add x15, x15, #0x10\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "str q19, [x12, x13]\n"
- "str q18, [x11, x13]\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
+ "str q19, [x14, x12]\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"umax v21.16b, v30.16b, v29.16b\n"
"umax v20.16b, v29.16b, v28.16b\n"
- "umax v19.16b, v27.16b, v26.16b\n"
+ "umax v16.16b, v27.16b, v26.16b\n"
"umax v18.16b, v25.16b, v24.16b\n"
- "umax v17.16b, v23.16b, v27.16b\n"
- "umax v16.16b, v25.16b, v22.16b\n"
- "umax v19.16b, v21.16b, v19.16b\n"
- "str q19, [x12, x13]\n"
- "umax v18.16b, v21.16b, v18.16b\n"
- "umax v17.16b, v20.16b, v17.16b\n"
- "str q18, [x11, x13]\n"
- "umax v16.16b, v20.16b, v16.16b\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
- "cbz x15, 4f\n"
+ "umax v17.16b, v27.16b, v23.16b\n"
+ "umax v19.16b, v24.16b, v22.16b\n"
+ "umax v16.16b, v21.16b, v16.16b\n"
+ "umax v18.16b, v18.16b, v21.16b\n"
+ "str q16, [x14, x12]\n"
+ "umax v17.16b, v17.16b, v20.16b\n"
+ "umax v16.16b, v20.16b, v19.16b\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
+ "cbz x16, 4f\n"
"3:" // Oddments
- "ldr b30, [x27, x14]\n"
- "subs x15, x15, #0x1\n"
- "ldr b29, [x24, x14]\n"
- "umax v21.16b, v30.16b, v29.16b\n"
- "ldr b28, [x21, x14]\n"
- "ldr b27, [x25, x14]\n"
- "umax v20.16b, v29.16b, v28.16b\n"
- "ldr b26, [x28, x14]\n"
- "ldr b25, [x23, x14]\n"
- "umax v19.16b, v27.16b, v26.16b\n"
- "ldr b24, [x26, x14]\n"
- "ldr b23, [x22, x14]\n"
- "umax v19.16b, v21.16b, v19.16b\n"
- "ldr b22, [x20, x14]\n"
- "add x14, x14, #0x1\n"
- "umax v18.16b, v25.16b, v24.16b\n"
- "str b19, [x12, x13]\n"
- "umax v17.16b, v23.16b, v27.16b\n"
- "umax v16.16b, v25.16b, v22.16b\n"
- "umax v18.16b, v21.16b, v18.16b\n"
- "str b18, [x11, x13]\n"
- "umax v17.16b, v20.16b, v17.16b\n"
- "umax v16.16b, v20.16b, v16.16b\n"
- "str b17, [x10, x13]\n"
- "str b16, [x9, x13]\n"
- "add x13, x13, #0x1\n"
+ "ldr b16, [x28, x15]\n"
+ "ldr b17, [x25, x15]\n"
+ "umax v23.16b, v16.16b, v17.16b\n"
+ "subs x16, x16, #0x1\n"
+ "ldr b16, [x22, x15]\n"
+ "ldr b22, [x26, x15]\n"
+ "umax v21.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x9, x15]\n"
+ "ldr b17, [x27, x15]\n"
+ "umax v16.16b, v22.16b, v16.16b\n"
+ "umax v20.16b, v23.16b, v16.16b\n"
+ "ldr b19, [x24, x15]\n"
+ "ldr b16, [x23, x15]\n"
+ "umax v18.16b, v17.16b, v19.16b\n"
+ "umax v17.16b, v22.16b, v16.16b\n"
+ "ldr b16, [x21, x15]\n"
+ "umax v16.16b, v19.16b, v16.16b\n"
+ "add x15, x15, #0x1\n"
+ "umax v18.16b, v18.16b, v23.16b\n"
+ "umax v17.16b, v17.16b, v21.16b\n"
+ "umax v16.16b, v21.16b, v16.16b\n"
+ "str b20, [x14, x12]\n"
+ "str b18, [x13, x12]\n"
+ "str b17, [x11, x12]\n"
+ "str b16, [x10, x12]\n"
+ "add x12, x12, #0x1\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
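
The 2x2 stride-1 kernel above produces a 2x2 output tile from a 3x3 input patch and shares partial maxima between adjacent outputs, so each tile costs ten umax operations instead of the naive twelve. A NEON-intrinsics sketch of the same factorisation (the assembly schedules its pairs slightly differently, but the operation count and the sharing are the same; max2x2_s1_tile is an illustrative name):

#include <arm_neon.h>

// 2x2 stride-1 max pooling over one 16-channel slab: nine input vectors
// p[r][c] (a 3x3 patch) yield four outputs. Horizontal pair maxima are
// computed once per row and reused by both outputs that overlap them.
static void max2x2_s1_tile(const uint8x16_t p[3][3], uint8x16_t out[2][2])
{
    uint8x16_t h[3][2];
    for (int r = 0; r < 3; ++r)
        for (int c = 0; c < 2; ++c)
            h[r][c] = vmaxq_u8(p[r][c], p[r][c + 1]); // 6 umax, shared
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
            out[i][j] = vmaxq_u8(h[i][j], h[i + 1][j]); // 4 umax
}
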
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp
index 391af31d03..7d528ccc65 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_u8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-struct a64_u8_nhwc_max_generic_depthfirst
+struct a64_u8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_u8_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
a64_u8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_u8_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
index f9bbfd8b90..2ceef125ca 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -39,397 +40,395 @@ void a64_u8_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v5.16b, #0x0\n"
- "movi v4.16b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "umax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax v22.16b, v31.16b, v30.16b\n"
- "ldr q3, [x23, x28]\n"
- "umax v18.16b, v29.16b, v28.16b\n"
- "umax v21.16b, v27.16b, v21.16b\n"
- "ldr q2, [x22, x28]\n"
- "umax v17.16b, v26.16b, v17.16b\n"
- "ldr q1, [x21, x28]\n"
- "umax v20.16b, v25.16b, v20.16b\n"
- "ldr q0, [x20, x28]\n"
- "umax v16.16b, v24.16b, v16.16b\n"
- "ldr q31, [x23, x27]\n"
+ "umax v23.16b, v4.16b, v3.16b\n"
+ "umax v19.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v22.16b, v2.16b, v1.16b\n"
+ "ldr q2, [x21, x26]\n"
+ "umax v18.16b, v27.16b, v21.16b\n"
+ "ldr q1, [x20, x26]\n"
+ "umax v21.16b, v0.16b, v31.16b\n"
+ "ldr q0, [x21, x24]\n"
+ "umax v17.16b, v26.16b, v20.16b\n"
+ "ldr q31, [x20, x24]\n"
+ "umax v20.16b, v30.16b, v29.16b\n"
+ "ldr q30, [x21, x23]\n"
+ "umax v16.16b, v25.16b, v24.16b\n"
+ "ldr q29, [x20, x23]\n"
"umax v19.16b, v23.16b, v19.16b\n"
- "ldr q30, [x22, x27]\n"
"umax v18.16b, v22.16b, v18.16b\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"umax v17.16b, v21.16b, v17.16b\n"
- "ldr q28, [x20, x27]\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x23, x26]\n"
- "umax v7.16b, v7.16b, v19.16b\n"
- "ldr q21, [x22, x26]\n"
- "umax v6.16b, v6.16b, v18.16b\n"
- "ldr q26, [x21, x26]\n"
- "umax v5.16b, v5.16b, v17.16b\n"
- "ldr q17, [x20, x26]\n"
- "umax v4.16b, v4.16b, v16.16b\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "umax v8.16b, v8.16b, v19.16b\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v18.16b\n"
+ "umax v6.16b, v6.16b, v17.16b\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "umax v23.16b, v3.16b, v2.16b\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v22.16b, v31.16b, v30.16b\n"
- "umax v18.16b, v29.16b, v28.16b\n"
- "umax v21.16b, v27.16b, v21.16b\n"
- "umax v17.16b, v26.16b, v17.16b\n"
- "umax v20.16b, v25.16b, v20.16b\n"
- "umax v16.16b, v24.16b, v16.16b\n"
+ "umax v23.16b, v4.16b, v3.16b\n"
+ "umax v19.16b, v28.16b, v22.16b\n"
+ "umax v22.16b, v2.16b, v1.16b\n"
+ "umax v18.16b, v27.16b, v21.16b\n"
+ "umax v21.16b, v0.16b, v31.16b\n"
+ "umax v17.16b, v26.16b, v20.16b\n"
+ "umax v20.16b, v30.16b, v29.16b\n"
+ "umax v16.16b, v25.16b, v24.16b\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
"umax v17.16b, v21.16b, v17.16b\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "umax v7.16b, v7.16b, v19.16b\n"
- "umax v6.16b, v6.16b, v18.16b\n"
- "umax v5.16b, v5.16b, v17.16b\n"
- "umax v4.16b, v4.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v7.16b, v7.16b, v18.16b\n"
+ "umax v6.16b, v6.16b, v17.16b\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "umax v7.16b, v7.16b, v3.16b\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "umax v6.16b, v6.16b, v31.16b\n"
- "ldr q25, [x23, x25]\n"
- "umax v5.16b, v5.16b, v27.16b\n"
- "umax v4.16b, v4.16b, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v17.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x27, x27, #0x40\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "umax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "umax v7.16b, v7.16b, v19.16b\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "umax v23.16b, v3.16b, v2.16b\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v7.16b, v7.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "umax v7.16b, v7.16b, v3.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
- "movi v7.16b, #0x0\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 24f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v2.b }[14], [x22], #0x1\n"
- "ld1 { v1.b }[14], [x21], #0x1\n"
- "ld1 { v0.b }[14], [x20], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v2.b }[12], [x22], #0x1\n"
- "ld1 { v1.b }[12], [x21], #0x1\n"
- "ld1 { v0.b }[12], [x20], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v2.b }[10], [x22], #0x1\n"
- "ld1 { v1.b }[10], [x21], #0x1\n"
- "ld1 { v0.b }[10], [x20], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v2.b }[8], [x22], #0x1\n"
- "ld1 { v1.b }[8], [x21], #0x1\n"
- "ld1 { v0.b }[8], [x20], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v2.b }[6], [x22], #0x1\n"
- "ld1 { v1.b }[6], [x21], #0x1\n"
- "ld1 { v0.b }[6], [x20], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v2.b }[4], [x22], #0x1\n"
- "ld1 { v1.b }[4], [x21], #0x1\n"
- "ld1 { v0.b }[4], [x20], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v2.b }[2], [x22], #0x1\n"
- "ld1 { v1.b }[2], [x21], #0x1\n"
- "ld1 { v0.b }[2], [x20], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b2, [x22], #0x1\n"
- "ldr b1, [x21], #0x1\n"
- "ldr b0, [x20], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "umax v23.16b, v3.16b, v2.16b\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v7.16b, v7.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "subs x25, x25, #0x1\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b3, [x23], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "umax v7.16b, v7.16b, v3.16b\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"tbz %x[n_channels], #3, 38f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 35f\n"
- "st1 { v7.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[14], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[14], [%x[outptr]], #0x1\n"
"b 42f\n"
"35:" // Oddments: Store: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[12], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[12], [%x[outptr]], #0x1\n"
"b 42f\n"
"36:" // Oddments: Store: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 37f\n"
- "st1 { v7.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[10], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[10], [%x[outptr]], #0x1\n"
"b 42f\n"
"37:" // Oddments: Store: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[8], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[8], [%x[outptr]], #0x1\n"
"b 42f\n"
"38:" // Oddments: Store: Bit 3: Unset
"tbz %x[n_channels], #2, 40f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 39f\n"
- "st1 { v7.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[6], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[6], [%x[outptr]], #0x1\n"
"b 42f\n"
"39:" // Oddments: Store: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[4], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[4], [%x[outptr]], #0x1\n"
"b 42f\n"
"40:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 41f\n"
- "st1 { v7.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[2], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[2], [%x[outptr]], #0x1\n"
"b 42f\n"
"41:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[0], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
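
The generic max kernel above reduces n_valid_cells input pointers four at a time with a umax tree, folding the result into a running per-channel maximum, then drains the remaining 0-3 inputs one at a time. The same control flow, sketched with intrinsics for a single 16-channel slab (the kernel keeps four such slabs in flight per pass and peels oddment channels with the tbz ladder; max_over_cells_16ch is an illustrative name):

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

// Illustrative reduction for one 16-channel slab at byte offset co:
// quads of inputs are reduced pairwise (the umax tree in the loop body),
// then the single-input tail folds in whatever is left.
static void max_over_cells_16ch(uint64_t n_valid_cells, const uint8_t *const *inptrs,
                                uint8_t *outptr, size_t co)
{
    uint8x16_t vmax = vdupq_n_u8(0); // movi v8.16b, #0x0
    const uint8_t *const *in = inptrs;
    for (uint64_t quads = n_valid_cells >> 2; quads != 0; --quads, in += 4)
    {
        uint8x16_t m01 = vmaxq_u8(vld1q_u8(in[0] + co), vld1q_u8(in[1] + co));
        uint8x16_t m23 = vmaxq_u8(vld1q_u8(in[2] + co), vld1q_u8(in[3] + co));
        vmax = vmaxq_u8(vmax, vmaxq_u8(m01, m23));
    }
    for (uint64_t rest = n_valid_cells & 3; rest != 0; --rest, ++in)
    {
        vmax = vmaxq_u8(vmax, vld1q_u8(in[0] + co)); // single-input tail
    }
    vst1q_u8(outptr + co, vmax);
}
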
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp
index d46658f080..daf836f5d6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_u8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-struct a64_u8q_nhwc_avg_generic_depthfirst
+struct a64_u8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_u8q_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
a64_u8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_u8q_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index a57fe6df68..31a3489e5c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,8 @@
*/
#include "pooling.hpp"
-#include <cstddef>
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
@@ -87,12 +87,13 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
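
The hunk above is a correctness fix rather than a reshuffle: rounding f_rescale_value scaled by 2^31 can produce exactly 1ll << 31, which does not fit in int32_t, and the old code narrowed first and compared afterwards, so the wrap happened before the check could catch it. Keeping the intermediate in 64 bits makes the edge case detectable; a standalone sketch of the fixed logic (make_q31_multiplier is an illustrative wrapper name):

#include <cmath>
#include <cstdint>

// Turn a floating-point rescale factor into a Q31 multiplier plus a shift
// adjustment. The rounded product can be exactly 1ll << 31, which is out of
// range for int32_t, so the value stays in int64_t until after the edge
// case has been folded into shift_value.
static void make_q31_multiplier(float f_rescale_value, int &shift_value,
                                int32_t &rescale_value)
{
    int64_t long_rescale_value = static_cast<int64_t>(
        std::round(f_rescale_value * static_cast<float>(1ll << 31)));
    if (long_rescale_value == (1ll << 31))
    {
        shift_value++;            // compensate for halving the multiplier
        long_rescale_value >>= 1; // now 1ll << 30, which fits
    }
    rescale_value = static_cast<int32_t>(long_rescale_value);
}
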
@@ -118,20 +119,20 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
);
__asm__ __volatile__(
- "mov x26, #0x0\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x25, #0x20\n" // cntb _, ALL, #2
+ "mov x24, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"ld1r { v15.4s }, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov v14.16b, v15.16b\n"
- "mov x19, %x[inptrs]\n"
"mov v13.16b, v15.16b\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"mov v12.16b, v15.16b\n"
"mov v11.16b, v15.16b\n"
+ "mov x22, %x[inptrs]\n"
"mov v10.16b, v15.16b\n"
"mov v9.16b, v15.16b\n"
"mov v8.16b, v15.16b\n"
@@ -143,43 +144,43 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"mov v2.16b, v15.16b\n"
"mov v1.16b, v15.16b\n"
"mov v0.16b, v15.16b\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ldr q29, [x21, x25]\n"
- "ldr q28, [x20, x25]\n"
- "ldr q27, [x21, x24]\n"
- "ldr q26, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"uaddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
"uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"uaddl v21.8h, v29.8b, v28.8b\n"
- "subs x22, x22, #0x1\n"
"uaddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q30, [x20, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"uaddl v19.8h, v27.8b, v26.8b\n"
- "ldr q29, [x21, x25]\n"
"uaddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q28, [x20, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"uaddl v17.8h, v25.8b, v24.8b\n"
- "ldr q27, [x21, x24]\n"
"uaddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q26, [x20, x24]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
+ "subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
- "ldr q25, [x21, x23]\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q24, [x20, x23]\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
"uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "add x22, x22, #0x10\n"
"uaddw v11.4s, v11.4s, v21.4h\n"
"uaddw2 v10.4s, v10.4s, v21.8h\n"
"uaddw v9.4s, v9.4s, v20.4h\n"
@@ -219,23 +220,23 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"uaddw v1.4s, v1.4s, v16.4h\n"
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "uxtl v23.8h, v31.8b\n"
- "ldr q29, [x21, x25]\n"
- "uxtl2 v22.8h, v31.16b\n"
- "ldr q27, [x21, x24]\n"
- "ldr q25, [x21, x23]\n"
- "uxtl v21.8h, v29.8b\n"
- "uxtl2 v20.8h, v29.16b\n"
- "uxtl v19.8h, v27.8b\n"
- "uxtl2 v18.8h, v27.16b\n"
- "uxtl v17.8h, v25.8b\n"
- "uxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v23.8h, v16.8b\n"
+ "uxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "uxtl v21.8h, v16.8b\n"
+ "uxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "uxtl v19.8h, v17.8b\n"
+ "uxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
@@ -254,64 +255,62 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "movi v21.4s, #0x0\n"
- "ld1r { v20.4s }, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "movi v19.4s, #0xff\n"
- "ld1r { v18.4s }, [%x[left_shift]]\n"
- "sub %x[n_channels], %x[n_channels], #0x40\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
+ "ld1r { v19.4s }, [%x[left_shift]]\n"
+ "ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "srshl v14.4s, v14.4s, v19.4s\n"
"ld1r { v17.4s }, [%x[right_shift]]\n"
+ "srshl v13.4s, v13.4s, v19.4s\n"
+ "srshl v12.4s, v12.4s, v19.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v11.4s, v11.4s, v19.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x40\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v8.4s, v8.4s, v19.4s\n"
"cmp %x[n_channels], #0x40\n"
- "srshl v14.4s, v14.4s, v18.4s\n"
- "ld1r { v16.4s }, [x19]\n"
- "srshl v13.4s, v13.4s, v18.4s\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
- "srshl v11.4s, v11.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v20.4s\n"
- "sqrdmulh v14.4s, v14.4s, v20.4s\n"
- "sqrdmulh v13.4s, v13.4s, v20.4s\n"
- "sqrdmulh v12.4s, v12.4s, v20.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "srshl v6.4s, v6.4s, v19.4s\n"
+ "srshl v5.4s, v5.4s, v19.4s\n"
+ "srshl v4.4s, v4.4s, v19.4s\n"
+ "srshl v3.4s, v3.4s, v19.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "srshl v1.4s, v1.4s, v19.4s\n"
+ "srshl v0.4s, v0.4s, v19.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v18.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v18.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v18.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v18.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v18.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v18.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v18.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v18.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v18.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v18.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
"srshl v15.4s, v15.4s, v17.4s\n"
"srshl v14.4s, v14.4s, v17.4s\n"
"srshl v13.4s, v13.4s, v17.4s\n"
"srshl v12.4s, v12.4s, v17.4s\n"
- "sqrdmulh v11.4s, v11.4s, v20.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "srshl v8.4s, v8.4s, v18.4s\n"
"srshl v11.4s, v11.4s, v17.4s\n"
- "sqrdmulh v10.4s, v10.4s, v20.4s\n"
- "sqrdmulh v9.4s, v9.4s, v20.4s\n"
- "sqrdmulh v8.4s, v8.4s, v20.4s\n"
- "srshl v7.4s, v7.4s, v18.4s\n"
"srshl v10.4s, v10.4s, v17.4s\n"
"srshl v9.4s, v9.4s, v17.4s\n"
"srshl v8.4s, v8.4s, v17.4s\n"
- "sqrdmulh v7.4s, v7.4s, v20.4s\n"
- "srshl v6.4s, v6.4s, v18.4s\n"
- "srshl v5.4s, v5.4s, v18.4s\n"
- "srshl v4.4s, v4.4s, v18.4s\n"
"srshl v7.4s, v7.4s, v17.4s\n"
- "sqrdmulh v6.4s, v6.4s, v20.4s\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sqrdmulh v4.4s, v4.4s, v20.4s\n"
- "srshl v3.4s, v3.4s, v18.4s\n"
"srshl v6.4s, v6.4s, v17.4s\n"
"srshl v5.4s, v5.4s, v17.4s\n"
"srshl v4.4s, v4.4s, v17.4s\n"
- "sqrdmulh v3.4s, v3.4s, v20.4s\n"
- "srshl v2.4s, v2.4s, v18.4s\n"
- "srshl v1.4s, v1.4s, v18.4s\n"
- "srshl v0.4s, v0.4s, v18.4s\n"
"srshl v3.4s, v3.4s, v17.4s\n"
- "sqrdmulh v2.4s, v2.4s, v20.4s\n"
- "sqrdmulh v1.4s, v1.4s, v20.4s\n"
- "sqrdmulh v0.4s, v0.4s, v20.4s\n"
- "add v15.4s, v15.4s, v16.4s\n"
"srshl v2.4s, v2.4s, v17.4s\n"
"srshl v1.4s, v1.4s, v17.4s\n"
"srshl v0.4s, v0.4s, v17.4s\n"
+ "add v15.4s, v15.4s, v16.4s\n"
"add v14.4s, v14.4s, v16.4s\n"
"add v13.4s, v13.4s, v16.4s\n"
"add v12.4s, v12.4s, v16.4s\n"
@@ -327,58 +326,60 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"add v2.4s, v2.4s, v16.4s\n"
"add v1.4s, v1.4s, v16.4s\n"
"add v0.4s, v0.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v21.4s\n"
- "smax v14.4s, v14.4s, v21.4s\n"
- "smax v13.4s, v13.4s, v21.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v21.4s\n"
- "smax v11.4s, v11.4s, v21.4s\n"
- "smax v10.4s, v10.4s, v21.4s\n"
- "smin v12.4s, v12.4s, v19.4s\n"
- "smin v11.4s, v11.4s, v19.4s\n"
- "smin v10.4s, v10.4s, v19.4s\n"
- "smax v9.4s, v9.4s, v21.4s\n"
- "smax v8.4s, v8.4s, v21.4s\n"
- "smax v7.4s, v7.4s, v21.4s\n"
- "smin v9.4s, v9.4s, v19.4s\n"
- "smin v8.4s, v8.4s, v19.4s\n"
- "smin v7.4s, v7.4s, v19.4s\n"
- "smax v6.4s, v6.4s, v21.4s\n"
- "smax v5.4s, v5.4s, v21.4s\n"
- "smax v4.4s, v4.4s, v21.4s\n"
- "smin v6.4s, v6.4s, v19.4s\n"
- "smin v5.4s, v5.4s, v19.4s\n"
- "smin v4.4s, v4.4s, v19.4s\n"
- "smax v3.4s, v3.4s, v21.4s\n"
- "smax v2.4s, v2.4s, v21.4s\n"
- "smax v1.4s, v1.4s, v21.4s\n"
- "smin v3.4s, v3.4s, v19.4s\n"
- "smin v2.4s, v2.4s, v19.4s\n"
- "smin v1.4s, v1.4s, v19.4s\n"
- "smax v0.4s, v0.4s, v21.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v7.4s, v7.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v16.4s\n"
+ "smax v5.4s, v5.4s, v16.4s\n"
+ "smax v4.4s, v4.4s, v16.4s\n"
+ "smax v3.4s, v3.4s, v16.4s\n"
+ "smax v2.4s, v2.4s, v16.4s\n"
+ "smax v1.4s, v1.4s, v16.4s\n"
+ "smax v0.4s, v0.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "smin v11.4s, v11.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v16.4s\n"
+ "smin v8.4s, v8.4s, v16.4s\n"
+ "smin v7.4s, v7.4s, v16.4s\n"
+ "smin v6.4s, v6.4s, v16.4s\n"
+ "smin v5.4s, v5.4s, v16.4s\n"
+ "smin v4.4s, v4.4s, v16.4s\n"
+ "smin v3.4s, v3.4s, v16.4s\n"
+ "smin v2.4s, v2.4s, v16.4s\n"
+ "smin v1.4s, v1.4s, v16.4s\n"
+ "smin v0.4s, v0.4s, v16.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "smin v0.4s, v0.4s, v19.4s\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
- "uzp1 v21.16b, v9.16b, v8.16b\n"
- "uzp1 v20.16b, v7.16b, v6.16b\n"
+ "uzp1 v18.16b, v9.16b, v8.16b\n"
+ "uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
- "uzp1 v19.16b, v3.16b, v2.16b\n"
- "uzp1 v18.16b, v1.16b, v0.16b\n"
+ "uzp1 v20.16b, v3.16b, v2.16b\n"
+ "uzp1 v19.16b, v1.16b, v0.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x25]\n"
"add x25, x25, #0x40\n"
- "str q17, [%x[outptr], x24]\n"
+ "str q16, [%x[outptr], x24]\n"
"add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -386,70 +387,68 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"blt 14f\n"
"8:" // Single vector of channels: Loop
"ld1r { v15.4s }, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov v14.16b, v15.16b\n"
- "mov x19, %x[inptrs]\n"
"mov v13.16b, v15.16b\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"mov v12.16b, v15.16b\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "subs x22, x22, #0x1\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q30, [x20, x26]\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "movi v21.4s, #0x0\n"
- "ld1r { v20.4s }, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "movi v19.4s, #0xff\n"
- "ld1r { v18.4s }, [%x[left_shift]]\n"
- "sub %x[n_channels], %x[n_channels], #0x10\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
+ "ld1r { v16.4s }, [%x[left_shift]]\n"
+ "ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
"ld1r { v17.4s }, [%x[right_shift]]\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "sqrdmulh v15.4s, v15.4s, v18.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x10\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v18.4s\n"
"cmp %x[n_channels], #0x10\n"
- "srshl v14.4s, v14.4s, v18.4s\n"
- "ld1r { v16.4s }, [x19]\n"
- "srshl v13.4s, v13.4s, v18.4s\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v20.4s\n"
- "sqrdmulh v14.4s, v14.4s, v20.4s\n"
- "sqrdmulh v13.4s, v13.4s, v20.4s\n"
- "sqrdmulh v12.4s, v12.4s, v20.4s\n"
"srshl v15.4s, v15.4s, v17.4s\n"
"srshl v14.4s, v14.4s, v17.4s\n"
"srshl v13.4s, v13.4s, v17.4s\n"
@@ -458,37 +457,39 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"add v14.4s, v14.4s, v16.4s\n"
"add v13.4s, v13.4s, v16.4s\n"
"add v12.4s, v12.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v21.4s\n"
- "smax v14.4s, v14.4s, v21.4s\n"
- "smax v13.4s, v13.4s, v21.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v21.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
- "smin v12.4s, v12.4s, v19.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "add x26, x26, #0x10\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"ld1r { v15.4s }, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "add %x[outptr], %x[outptr], x27\n"
"mov v14.16b, v15.16b\n"
- "add %x[outptr], %x[outptr], x26\n"
"mov v13.16b, v15.16b\n"
- "mov x19, %x[inptrs]\n"
"mov v12.16b, v15.16b\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
- "cbz x22, 24f\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
- "add x21, x21, x26\n"
- "add x20, x20, x26\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -549,21 +550,21 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"ldr b31, [x21], #0x1\n"
"ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "subs x22, x22, #0x1\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldr x21, [x19], #0x8\n"
- "add x21, x21, x26\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -609,30 +610,28 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 33f\n"
"ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "uxtl v23.8h, v31.8b\n"
- "subs x20, x20, #0x1\n"
- "uxtl2 v22.8h, v31.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uxtl v17.8h, v31.8b\n"
+ "uxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "movi v21.4s, #0x0\n"
- "ld1r { v20.4s }, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "movi v19.4s, #0xff\n"
- "ld1r { v18.4s }, [%x[left_shift]]\n"
+ "ld1r { v16.4s }, [%x[left_shift]]\n"
+ "ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
"ld1r { v17.4s }, [%x[right_shift]]\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "ld1r { v16.4s }, [x19]\n"
- "srshl v14.4s, v14.4s, v18.4s\n"
- "srshl v13.4s, v13.4s, v18.4s\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v20.4s\n"
- "sqrdmulh v14.4s, v14.4s, v20.4s\n"
- "sqrdmulh v13.4s, v13.4s, v20.4s\n"
- "sqrdmulh v12.4s, v12.4s, v20.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "sqrdmulh v15.4s, v15.4s, v18.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v18.4s\n"
"srshl v15.4s, v15.4s, v17.4s\n"
"srshl v14.4s, v14.4s, v17.4s\n"
"srshl v13.4s, v13.4s, v17.4s\n"
@@ -641,17 +640,19 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"add v14.4s, v14.4s, v16.4s\n"
"add v13.4s, v13.4s, v16.4s\n"
"add v12.4s, v12.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v21.4s\n"
- "smax v14.4s, v14.4s, v21.4s\n"
- "smax v13.4s, v13.4s, v21.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v21.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
- "smin v12.4s, v12.4s, v19.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -697,12 +698,10 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [accumulator_init] "r" (&accumulator_init), [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [quant_params] "r" (&qp), [right_shift] "r" (&right_shift)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
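Note on the epilogue rewritten above: the per-lane requantize chain itself is unchanged — `srshl` by `[left_shift]`, `sqrdmulh` by `[combined_rescale_value]`, `srshl` by the (typically non-positive) `[right_shift]`, an add of `Requantize32::output_offset`, then a clamp to [0, 0xff] and `uzp1` narrowing. The rewrite only reorders it, re-materialises the clamp bounds with `movi` into v16 just before use instead of pinning them in v21/v19, and shifts the scratch registers up from x19–x26 to x20–x27 (compare the two clobber lists). A minimal scalar sketch of that chain follows, assuming non-saturating inputs; the helper names are invented for illustration, while the parameter names mirror the asm operand bindings:

```cpp
#include <algorithm>
#include <cstdint>

// Scalar model of the u8q average-pool requantize epilogue above.
static inline int32_t sqrdmulh_model(int32_t a, int32_t b)
{
    // SQRDMULH: (2*a*b + 2^31) >> 32 == (a*b + 2^30) >> 31; the one
    // saturating case (a == b == INT32_MIN) is ignored in this sketch.
    return static_cast<int32_t>((int64_t{a} * b + (int64_t{1} << 30)) >> 31);
}

static inline int32_t srshl_model(int32_t x, int shift)
{
    // SRSHL: plain left shift for shift >= 0; for shift < 0 a rounding
    // constant is added before the arithmetic right shift.
    if (shift >= 0) return x << shift;
    return static_cast<int32_t>((int64_t{x} + (int64_t{1} << (-shift - 1))) >> -shift);
}

static inline uint8_t requantize_avg_lane(int32_t acc,        // uaddw accumulator
                                          int     left_shift, // [left_shift]
                                          int32_t rescale,    // [combined_rescale_value]
                                          int     right_shift,// [right_shift], <= 0
                                          int32_t out_offset) // Requantize32::output_offset
{
    int32_t v = srshl_model(acc, left_shift);
    v = sqrdmulh_model(v, rescale);
    v = srshl_model(v, right_shift);
    v += out_offset;
    return static_cast<uint8_t>(std::clamp(v, 0, 255)); // smax #0x0 / smin #0xff, uzp1 narrow
}
```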
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp
index 1b97b458c0..fa9600f83d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_u8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-struct a64_u8q_nhwc_max_generic_depthfirst
+struct a64_u8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_u8q_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
a64_u8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_u8q_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
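The header change above swaps the strategy's hand-rolled typedefs and public `kernel` member for a common base class with a virtual accessor. The real interface lives in the pooling depthfirst driver and is not part of this hunk; as a rough reconstruction only — the `KernelType` shape is taken from the `a64_u8q_nhwc_max_generic_depthfirst_impl` declaration above, everything else is an assumption — it presumably looks something like:

```cpp
#include <cstdint>

// Hypothetical reconstruction, NOT the library's actual definition. The
// OutputStage-free variant used by cpp_nhwc_1x1_stride_any_depthfirst
// (IGenericDepthfirstStrategy<T, T, Nothing>) presumably drops the
// trailing output-stage argument.
template <typename TInput, typename TOutput, typename OutputStage>
struct IGenericDepthfirstStrategy
{
    using KernelType = void (*)(uint64_t,                 // window index
                                uint64_t n_valid_cells,
                                uint64_t n_channels,
                                const TInput *const *inptrs,
                                TOutput *outptr,
                                const OutputStage &os);

    virtual ~IGenericDepthfirstStrategy() = default;
    virtual KernelType get_kernel(void) const = 0;
};
```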
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
index 0d196e097e..f4927c5536 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,8 @@
*/
#include "pooling.hpp"
-#include <cstddef>
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -42,583 +42,583 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
- "movi v4.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v7.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "movi v5.16b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "umax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax v22.16b, v31.16b, v30.16b\n"
- "ldr q3, [x23, x28]\n"
- "umax v18.16b, v29.16b, v28.16b\n"
- "umax v21.16b, v27.16b, v21.16b\n"
- "ldr q2, [x22, x28]\n"
- "umax v17.16b, v26.16b, v17.16b\n"
- "ldr q1, [x21, x28]\n"
- "umax v20.16b, v25.16b, v20.16b\n"
- "ldr q0, [x20, x28]\n"
- "umax v16.16b, v24.16b, v16.16b\n"
- "ldr q31, [x23, x27]\n"
+ "umax v23.16b, v4.16b, v3.16b\n"
+ "umax v19.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v22.16b, v2.16b, v1.16b\n"
+ "ldr q2, [x21, x26]\n"
+ "umax v18.16b, v27.16b, v21.16b\n"
+ "ldr q1, [x20, x26]\n"
+ "umax v21.16b, v0.16b, v31.16b\n"
+ "ldr q0, [x21, x24]\n"
+ "umax v17.16b, v26.16b, v20.16b\n"
+ "ldr q31, [x20, x24]\n"
+ "umax v20.16b, v30.16b, v29.16b\n"
+ "ldr q30, [x21, x23]\n"
+ "umax v16.16b, v25.16b, v24.16b\n"
+ "ldr q29, [x20, x23]\n"
"umax v19.16b, v23.16b, v19.16b\n"
- "ldr q30, [x22, x27]\n"
"umax v18.16b, v22.16b, v18.16b\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"umax v17.16b, v21.16b, v17.16b\n"
- "ldr q28, [x20, x27]\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x23, x26]\n"
- "umax v4.16b, v4.16b, v19.16b\n"
- "ldr q21, [x22, x26]\n"
- "umax v8.16b, v8.16b, v18.16b\n"
- "ldr q26, [x21, x26]\n"
- "umax v7.16b, v7.16b, v17.16b\n"
- "ldr q17, [x20, x26]\n"
- "umax v6.16b, v6.16b, v16.16b\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "umax v8.16b, v8.16b, v19.16b\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v18.16b\n"
+ "umax v6.16b, v6.16b, v17.16b\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "umax v23.16b, v3.16b, v2.16b\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v22.16b, v31.16b, v30.16b\n"
- "umax v18.16b, v29.16b, v28.16b\n"
- "umax v21.16b, v27.16b, v21.16b\n"
- "umax v17.16b, v26.16b, v17.16b\n"
- "umax v20.16b, v25.16b, v20.16b\n"
- "umax v16.16b, v24.16b, v16.16b\n"
+ "umax v23.16b, v4.16b, v3.16b\n"
+ "umax v19.16b, v28.16b, v22.16b\n"
+ "umax v22.16b, v2.16b, v1.16b\n"
+ "umax v18.16b, v27.16b, v21.16b\n"
+ "umax v21.16b, v0.16b, v31.16b\n"
+ "umax v17.16b, v26.16b, v20.16b\n"
+ "umax v20.16b, v30.16b, v29.16b\n"
+ "umax v16.16b, v25.16b, v24.16b\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
"umax v17.16b, v21.16b, v17.16b\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "umax v4.16b, v4.16b, v19.16b\n"
- "umax v8.16b, v8.16b, v18.16b\n"
- "umax v7.16b, v7.16b, v17.16b\n"
- "umax v6.16b, v6.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v7.16b, v7.16b, v18.16b\n"
+ "umax v6.16b, v6.16b, v17.16b\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "umax v4.16b, v4.16b, v3.16b\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "umax v8.16b, v8.16b, v31.16b\n"
- "ldr q25, [x23, x25]\n"
- "umax v7.16b, v7.16b, v27.16b\n"
- "umax v6.16b, v6.16b, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v17.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "uxtl v17.8h, v4.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v5.4s }, [x19]\n"
- "uxtl2 v16.8h, v4.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "uxtl v21.8h, v8.8b\n"
- "ld1r { v4.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "uxtl2 v20.8h, v8.16b\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "uxtl v19.8h, v7.8b\n"
- "ld1r { v2.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "uxtl2 v24.8h, v7.16b\n"
- "ld1r { v1.4s }, [x19]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "uxtl v23.8h, v8.8b\n"
+ "uxtl2 v24.8h, v8.16b\n"
+ "uxtl v22.8h, v7.8b\n"
+ "uxtl2 v21.8h, v7.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "uxtl v20.8h, v6.8b\n"
+ "uxtl2 v17.8h, v6.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "uxtl v19.8h, v5.8b\n"
+ "uxtl2 v18.8h, v5.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "neg v4.4s, v4.4s\n"
+ "saddw v0.4s, v4.4s, v23.4h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "saddw2 v23.4s, v4.4s, v23.8h\n"
+ "saddw v31.4s, v4.4s, v24.4h\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
- "uxtl v0.8h, v6.8b\n"
"cmp %x[n_channels], #0x40\n"
- "uxtl2 v31.8h, v6.16b\n"
- "neg v5.4s, v5.4s\n"
- "movi v30.4s, #0x0\n"
- "movi v29.4s, #0xff\n"
- "saddw v23.4s, v5.4s, v17.4h\n"
- "saddw2 v18.4s, v5.4s, v17.8h\n"
- "saddw v17.4s, v5.4s, v16.4h\n"
- "saddw2 v16.4s, v5.4s, v16.8h\n"
- "saddw v22.4s, v5.4s, v21.4h\n"
- "saddw2 v21.4s, v5.4s, v21.8h\n"
- "saddw v28.4s, v5.4s, v20.4h\n"
- "saddw2 v20.4s, v5.4s, v20.8h\n"
- "saddw v27.4s, v5.4s, v19.4h\n"
- "saddw2 v19.4s, v5.4s, v19.8h\n"
+ "saddw2 v30.4s, v4.4s, v24.8h\n"
+ "saddw v29.4s, v4.4s, v22.4h\n"
+ "saddw2 v22.4s, v4.4s, v22.8h\n"
+ "saddw v28.4s, v4.4s, v21.4h\n"
+ "saddw2 v21.4s, v4.4s, v21.8h\n"
+ "saddw v27.4s, v4.4s, v20.4h\n"
+ "saddw2 v20.4s, v4.4s, v20.8h\n"
+ "saddw v26.4s, v4.4s, v17.4h\n"
+ "saddw2 v17.4s, v4.4s, v17.8h\n"
+ "saddw v25.4s, v4.4s, v19.4h\n"
+ "saddw2 v19.4s, v4.4s, v19.8h\n"
+ "saddw v24.4s, v4.4s, v18.4h\n"
+ "saddw2 v18.4s, v4.4s, v18.8h\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
"srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v3.4s\n"
- "srshl v17.4s, v17.4s, v3.4s\n"
- "srshl v16.4s, v16.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "srshl v17.4s, v17.4s, v2.4s\n"
- "srshl v16.4s, v16.4s, v2.4s\n"
+ "srshl v31.4s, v31.4s, v3.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "srshl v29.4s, v29.4s, v3.4s\n"
"srshl v22.4s, v22.4s, v3.4s\n"
- "srshl v21.4s, v21.4s, v3.4s\n"
"srshl v28.4s, v28.4s, v3.4s\n"
- "srshl v20.4s, v20.4s, v3.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v2.4s\n"
- "srshl v21.4s, v21.4s, v2.4s\n"
- "srshl v28.4s, v28.4s, v2.4s\n"
- "srshl v20.4s, v20.4s, v2.4s\n"
+ "srshl v21.4s, v21.4s, v3.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
- "srshl v19.4s, v19.4s, v3.4s\n"
- "add v23.4s, v23.4s, v1.4s\n"
- "add v18.4s, v18.4s, v1.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v16.4s, v16.4s, v1.4s\n"
- "srshl v27.4s, v27.4s, v2.4s\n"
- "srshl v19.4s, v19.4s, v2.4s\n"
- "add v22.4s, v22.4s, v1.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v28.4s, v28.4s, v1.4s\n"
- "add v20.4s, v20.4s, v1.4s\n"
- "add v27.4s, v27.4s, v1.4s\n"
- "add v19.4s, v19.4s, v1.4s\n"
- "smax v23.4s, v23.4s, v30.4s\n"
- "smax v18.4s, v18.4s, v30.4s\n"
- "smax v17.4s, v17.4s, v30.4s\n"
- "smin v23.4s, v23.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "smax v16.4s, v16.4s, v30.4s\n"
- "smax v22.4s, v22.4s, v30.4s\n"
- "smax v21.4s, v21.4s, v30.4s\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "smin v22.4s, v22.4s, v29.4s\n"
- "smin v21.4s, v21.4s, v29.4s\n"
- "smax v28.4s, v28.4s, v30.4s\n"
- "smax v20.4s, v20.4s, v30.4s\n"
- "smax v27.4s, v27.4s, v30.4s\n"
- "smin v28.4s, v28.4s, v29.4s\n"
- "smin v20.4s, v20.4s, v29.4s\n"
- "smin v27.4s, v27.4s, v29.4s\n"
- "smax v19.4s, v19.4s, v30.4s\n"
- "uzp1 v26.16b, v23.16b, v18.16b\n"
- "saddw v25.4s, v5.4s, v24.4h\n"
- "saddw2 v18.4s, v5.4s, v24.8h\n"
- "smin v19.4s, v19.4s, v29.4s\n"
+ "srshl v20.4s, v20.4s, v3.4s\n"
+ "srshl v26.4s, v26.4s, v3.4s\n"
+ "srshl v17.4s, v17.4s, v3.4s\n"
"srshl v25.4s, v25.4s, v3.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
"srshl v18.4s, v18.4s, v3.4s\n"
- "uzp1 v24.16b, v17.16b, v16.16b\n"
- "saddw v17.4s, v5.4s, v0.4h\n"
- "saddw2 v16.4s, v5.4s, v0.8h\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "srshl v17.4s, v17.4s, v3.4s\n"
- "srshl v16.4s, v16.4s, v3.4s\n"
- "srshl v25.4s, v25.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v18.4s, v18.4s, v1.4s\n"
- "srshl v17.4s, v17.4s, v2.4s\n"
- "srshl v16.4s, v16.4s, v2.4s\n"
- "smax v25.4s, v25.4s, v30.4s\n"
- "smax v18.4s, v18.4s, v30.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v16.4s, v16.4s, v1.4s\n"
- "smin v25.4s, v25.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smax v17.4s, v17.4s, v30.4s\n"
- "smax v16.4s, v16.4s, v30.4s\n"
- "uzp1 v23.16b, v22.16b, v21.16b\n"
- "saddw v22.4s, v5.4s, v31.4h\n"
- "saddw2 v21.4s, v5.4s, v31.8h\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "srshl v22.4s, v22.4s, v3.4s\n"
- "srshl v21.4s, v21.4s, v3.4s\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "uzp1 v20.16b, v28.16b, v20.16b\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "uzp1 v19.16b, v27.16b, v19.16b\n"
- "uzp1 v18.16b, v25.16b, v18.16b\n"
- "srshl v22.4s, v22.4s, v2.4s\n"
- "srshl v21.4s, v21.4s, v2.4s\n"
- "uzp1 v17.16b, v17.16b, v16.16b\n"
- "uzp1 v16.16b, v26.16b, v24.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "add v22.4s, v22.4s, v1.4s\n"
- "add x28, x28, #0x40\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "uzp1 v16.16b, v23.16b, v20.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v2.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v2.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v2.4s\n"
+ "srshl v0.4s, v0.4s, v1.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v22.4s, v22.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v19.4s, v19.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v18.4s, v18.4s, v1.4s\n"
+ "add v0.4s, v0.4s, v16.4s\n"
+ "add v23.4s, v23.4s, v16.4s\n"
+ "add v31.4s, v31.4s, v16.4s\n"
+ "add v30.4s, v30.4s, v16.4s\n"
+ "add v29.4s, v29.4s, v16.4s\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "add v28.4s, v28.4s, v16.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "add v27.4s, v27.4s, v16.4s\n"
+ "add v20.4s, v20.4s, v16.4s\n"
+ "add v26.4s, v26.4s, v16.4s\n"
+ "add v17.4s, v17.4s, v16.4s\n"
+ "add v25.4s, v25.4s, v16.4s\n"
+ "add v19.4s, v19.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v16.4s\n"
+ "add v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v0.4s, v0.4s, v16.4s\n"
+ "smax v23.4s, v23.4s, v16.4s\n"
+ "smax v31.4s, v31.4s, v16.4s\n"
+ "smax v30.4s, v30.4s, v16.4s\n"
+ "smax v29.4s, v29.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v28.4s, v28.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v27.4s, v27.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v26.4s, v26.4s, v16.4s\n"
+ "smax v17.4s, v17.4s, v16.4s\n"
+ "smax v25.4s, v25.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v24.4s, v24.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v0.4s, v0.4s, v16.4s\n"
+ "smin v23.4s, v23.4s, v16.4s\n"
+ "smin v31.4s, v31.4s, v16.4s\n"
+ "smin v30.4s, v30.4s, v16.4s\n"
+ "smin v29.4s, v29.4s, v16.4s\n"
+ "smin v22.4s, v22.4s, v16.4s\n"
+ "smin v28.4s, v28.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v16.4s\n"
+ "smin v27.4s, v27.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v16.4s\n"
+ "smin v26.4s, v26.4s, v16.4s\n"
+ "smin v17.4s, v17.4s, v16.4s\n"
+ "smin v25.4s, v25.4s, v16.4s\n"
+ "smin v19.4s, v19.4s, v16.4s\n"
+ "smin v24.4s, v24.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v23.16b, v0.16b, v23.16b\n"
+ "uzp1 v16.16b, v31.16b, v30.16b\n"
+ "uzp1 v22.16b, v29.16b, v22.16b\n"
+ "uzp1 v21.16b, v28.16b, v21.16b\n"
+ "uzp1 v20.16b, v27.16b, v20.16b\n"
+ "uzp1 v17.16b, v26.16b, v17.16b\n"
+ "uzp1 v19.16b, v25.16b, v19.16b\n"
+ "uzp1 v18.16b, v24.16b, v18.16b\n"
+ "uzp1 v16.16b, v23.16b, v16.16b\n"
"str q16, [%x[outptr], x27]\n"
- "smax v22.4s, v22.4s, v30.4s\n"
"add x27, x27, #0x40\n"
- "smax v21.4s, v21.4s, v30.4s\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v17.16b, v20.16b, v17.16b\n"
"str q16, [%x[outptr], x26]\n"
- "smin v22.4s, v22.4s, v29.4s\n"
"add x26, x26, #0x40\n"
- "smin v21.4s, v21.4s, v29.4s\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
- "uzp1 v16.16b, v17.16b, v16.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q16, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v4.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "umax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "umax v4.16b, v4.16b, v19.16b\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "umax v23.16b, v3.16b, v2.16b\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v4.16b, v4.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "umax v4.16b, v4.16b, v3.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "uxtl v17.8h, v4.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v5.4s }, [x19]\n"
- "uxtl2 v16.8h, v4.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "movi v30.4s, #0x0\n"
- "ld1r { v4.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "movi v29.4s, #0xff\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "neg v5.4s, v5.4s\n"
- "ld1r { v2.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "saddw v23.4s, v5.4s, v17.4h\n"
- "ld1r { v1.4s }, [x19]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "uxtl v17.8h, v8.8b\n"
+ "uxtl2 v16.8h, v8.16b\n"
+ "neg v18.4s, v18.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "saddw v22.4s, v18.4s, v17.4h\n"
+ "saddw2 v21.4s, v18.4s, v17.8h\n"
+ "saddw v20.4s, v18.4s, v16.4h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "saddw2 v18.4s, v18.4s, v16.8h\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "saddw2 v18.4s, v5.4s, v17.8h\n"
"cmp %x[n_channels], #0x10\n"
- "saddw v17.4s, v5.4s, v16.4h\n"
- "saddw2 v16.4s, v5.4s, v16.8h\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v3.4s\n"
- "srshl v17.4s, v17.4s, v3.4s\n"
- "srshl v16.4s, v16.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "srshl v17.4s, v17.4s, v2.4s\n"
- "srshl v16.4s, v16.4s, v2.4s\n"
- "add v23.4s, v23.4s, v1.4s\n"
- "add v18.4s, v18.4s, v1.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v16.4s, v16.4s, v1.4s\n"
- "smax v23.4s, v23.4s, v30.4s\n"
- "smax v18.4s, v18.4s, v30.4s\n"
- "smax v17.4s, v17.4s, v30.4s\n"
- "smin v23.4s, v23.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "smax v16.4s, v16.4s, v30.4s\n"
- "uzp1 v26.16b, v23.16b, v18.16b\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "uzp1 v24.16b, v17.16b, v16.16b\n"
- "uzp1 v16.16b, v26.16b, v24.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v17.4s\n"
+ "srshl v21.4s, v21.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v17.4s\n"
+ "srshl v18.4s, v18.4s, v17.4s\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "add v20.4s, v20.4s, v16.4s\n"
+ "add v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v22.4s, v22.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v17.16b, v22.16b, v21.16b\n"
+ "uzp1 v16.16b, v20.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
- "movi v4.16b, #0x0\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 24f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v2.b }[14], [x22], #0x1\n"
- "ld1 { v1.b }[14], [x21], #0x1\n"
- "ld1 { v0.b }[14], [x20], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v2.b }[12], [x22], #0x1\n"
- "ld1 { v1.b }[12], [x21], #0x1\n"
- "ld1 { v0.b }[12], [x20], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v2.b }[10], [x22], #0x1\n"
- "ld1 { v1.b }[10], [x21], #0x1\n"
- "ld1 { v0.b }[10], [x20], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v2.b }[8], [x22], #0x1\n"
- "ld1 { v1.b }[8], [x21], #0x1\n"
- "ld1 { v0.b }[8], [x20], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v2.b }[6], [x22], #0x1\n"
- "ld1 { v1.b }[6], [x21], #0x1\n"
- "ld1 { v0.b }[6], [x20], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v2.b }[4], [x22], #0x1\n"
- "ld1 { v1.b }[4], [x21], #0x1\n"
- "ld1 { v0.b }[4], [x20], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v2.b }[2], [x22], #0x1\n"
- "ld1 { v1.b }[2], [x21], #0x1\n"
- "ld1 { v0.b }[2], [x20], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b2, [x22], #0x1\n"
- "ldr b1, [x21], #0x1\n"
- "ldr b0, [x20], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "umax v23.16b, v3.16b, v2.16b\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v4.16b, v4.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "subs x25, x25, #0x1\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b3, [x23], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "umax v4.16b, v4.16b, v3.16b\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "uxtl v17.8h, v4.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v5.4s }, [x19]\n"
- "uxtl2 v16.8h, v4.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "movi v30.4s, #0x0\n"
- "ld1r { v4.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "movi v29.4s, #0xff\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "neg v5.4s, v5.4s\n"
- "ld1r { v2.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "saddw v23.4s, v5.4s, v17.4h\n"
- "ld1r { v1.4s }, [x19]\n"
- "saddw2 v18.4s, v5.4s, v17.8h\n"
- "saddw v17.4s, v5.4s, v16.4h\n"
- "saddw2 v16.4s, v5.4s, v16.8h\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v3.4s\n"
- "srshl v17.4s, v17.4s, v3.4s\n"
- "srshl v16.4s, v16.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "srshl v17.4s, v17.4s, v2.4s\n"
- "srshl v16.4s, v16.4s, v2.4s\n"
- "add v23.4s, v23.4s, v1.4s\n"
- "add v18.4s, v18.4s, v1.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v16.4s, v16.4s, v1.4s\n"
- "smax v23.4s, v23.4s, v30.4s\n"
- "smax v18.4s, v18.4s, v30.4s\n"
- "smax v17.4s, v17.4s, v30.4s\n"
- "smin v23.4s, v23.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "smax v16.4s, v16.4s, v30.4s\n"
- "uzp1 v26.16b, v23.16b, v18.16b\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "uzp1 v24.16b, v17.16b, v16.16b\n"
- "uzp1 v16.16b, v26.16b, v24.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "uxtl v17.8h, v8.8b\n"
+ "uxtl2 v16.8h, v8.16b\n"
+ "neg v18.4s, v18.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "saddw v22.4s, v18.4s, v17.4h\n"
+ "saddw2 v21.4s, v18.4s, v17.8h\n"
+ "saddw v20.4s, v18.4s, v16.4h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "saddw2 v18.4s, v18.4s, v16.8h\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v17.4s\n"
+ "srshl v21.4s, v21.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v17.4s\n"
+ "srshl v18.4s, v18.4s, v17.4s\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "add v20.4s, v20.4s, v16.4s\n"
+ "add v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v22.4s, v22.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v17.16b, v22.16b, v21.16b\n"
+ "uzp1 v16.16b, v20.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -664,12 +664,10 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_input_offset] "I" (offsetof(Requantize32, input_offset)), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [quant_params] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
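Two things are worth noting in the regenerated max kernel above. First, the running maxima now live in v8/v7/v6/v5, leaving v0–v4 free for loads; the requantize tail is the same srshl/sqrdmulh/srshl/add/clamp chain as in the average kernel, except that the widened maxima are first re-centred by subtracting `Requantize32::input_offset` (the `neg` + `saddw` pair) and the scale/shift triple comes from the `per_layer_*` fields. Second, the "4 inputs loop" keeps its reduction as a short tree rather than a serial chain; in intrinsics form (hypothetical helper, for illustration only):

```cpp
#include <arm_neon.h>

// Shape of the reduction in the 4-inputs loop above: two independent
// pairwise maxima, one combining max, then a fold into the accumulator --
// three dependent umax steps per four inputs instead of four.
static inline uint8x16_t fold_four_u8(uint8x16_t acc,
                                      uint8x16_t i0, uint8x16_t i1,
                                      uint8x16_t i2, uint8x16_t i3)
{
    const uint8x16_t a = vmaxq_u8(i0, i1); // "umax v17.16b, v4.16b, v3.16b"
    const uint8x16_t b = vmaxq_u8(i2, i3); // "umax v16.16b, v28.16b, v22.16b"
    return vmaxq_u8(acc, vmaxq_u8(a, b));  // combine, then fold into v8
}
```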
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp
index 6dffdcf01c..225f1e42c9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,18 +33,11 @@ template <typename T>
void cpp_nhwc_1x1_stride_any_depthfirst_impl(const uint64_t, const uint64_t, uint64_t n_channels, const T *const *const inptrs, T *outptr);
template <typename T>
-struct cpp_nhwc_1x1_stride_any_depthfirst
+struct cpp_nhwc_1x1_stride_any_depthfirst : IGenericDepthfirstStrategy<T, T, Nothing>
{
- typedef T operand_type;
- typedef T return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t, uint64_t n_channels, const operand_type *const *const inptrs, return_type *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
- kern_type kernel = cpp_nhwc_1x1_stride_any_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<T, T, Nothing>;
cpp_nhwc_1x1_stride_any_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return cpp_nhwc_1x1_stride_any_depthfirst_impl<T>; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
index 2bb22131f7..1f8f863de2 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,10 @@
#include <cstdint>
#include <cstring>
+#ifdef ARM_COMPUTE_ENABLE_BF16
+#include "bfloat.hpp"
+using arm_gemm::bfloat16;
+#endif
namespace arm_conv {
namespace pooling {
@@ -41,9 +45,15 @@ void cpp_nhwc_1x1_stride_any_depthfirst_impl(
}
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const float *const *, float *);
-#if defined(__ARM_FP16_ARGS)
+
+#ifdef __ARM_FP16_ARGS
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const __fp16 *const *, __fp16 *);
-#endif // defined(__ARM_FP16_ARGS)
+#endif
+
+#ifdef ARM_COMPUTE_ENABLE_BF16
+template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const bfloat16 *const *, bfloat16 *);
+#endif
+
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const int8_t *const *, int8_t *);
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const uint8_t *const *, uint8_t *);
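The only functional change to this translation unit is the extra bfloat16 instantiation, guarded by ARM_COMPUTE_ENABLE_BF16. The kernel body itself sits outside this hunk; since a 1x1 window at any stride maps each output point to exactly one input point, the templated body is presumably nothing more than a per-point channel copy along these lines (a sketch, not the verified source):

```cpp
#include <cstdint>
#include <cstring>

// Plausible shape of the templated body (not shown in this hunk): a 1x1
// pooling window reduces to copying n_channels values from the single
// contributing input pointer to the output.
template <typename T>
void cpp_nhwc_1x1_stride_any_depthfirst_impl(
    const uint64_t, const uint64_t, uint64_t n_channels,
    const T *const *const inptrs, T *outptr)
{
    std::memcpy(outptr, inptrs[0], n_channels * sizeof(T));
}
```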
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..f6682e75e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
+{
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
+
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
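The matching generic.cpp below builds its per-output rescale factors on the host before entering the SME code: for each of the 2x2 outputs it counts the valid rows and columns of the 3x3 window inside the 4x4 input tile and, when exclude_padding is set, divides by that count rather than by 9. A self-contained check of that arithmetic, assuming pad_top = pad_left = 1 and no right/bottom padding:

```cpp
#include <algorithm>
#include <cassert>

// Mirrors the KernelArgs rescale loop in the generic.cpp below. For this
// padding the top-left window covers a 2x2 valid patch (rescale 1/4), the
// two edge windows cover 2x3 / 3x2 (1/6), and the interior window 3x3 (1/9).
int main()
{
    const int pad_top = 1, pad_left = 1, pad_bottom = 0, pad_right = 0;
    float rescale[4];
    for (int i = 0; i < 2; i++)
    {
        const int start_i = i - pad_top;
        const int end_i   = std::min(start_i + 3, 4 - pad_top - pad_bottom);
        const int rows    = end_i - std::max(0, start_i);
        for (int j = 0; j < 2; j++)
        {
            const int start_j = j - pad_left;
            const int end_j   = std::min(start_j + 3, 4 - pad_left - pad_right);
            const int cols    = end_j - std::max(0, start_j);
            rescale[i * 2 + j] = 1.0f / static_cast<float>(rows * cols);
        }
    }
    assert(rescale[0] == 1.0f / 4 && rescale[1] == 1.0f / 6);
    assert(rescale[2] == 1.0f / 6 && rescale[3] == 1.0f / 9);
    return 0;
}
```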
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..67b07205cd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const __fp16 *const *const inptrs;
+ __fp16 *const *const outptrs;
+ __fp16 rescale_vals[4];
+
+ KernelArgs(
+ unsigned int channels,
+ const __fp16 *const *input_ptrs,
+ __fp16 *const * output_ptrs,
+ bool exclude_padding, unsigned int pad_left, unsigned int pad_top, unsigned int pad_right, unsigned int pad_bottom
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ for (unsigned int i = 0; i < 2; i++)
+ {
+ const int start_i = 1*i - static_cast<int>(pad_top);
+ const int end_i = std::min<int>(start_i + 3, 4 - pad_top - pad_bottom);
+ const int valid_rows = end_i - std::max<int>(0, start_i);
+
+ for (unsigned int j = 0; j < 2; j++)
+ {
+ const int start_j = 1*j - static_cast<int>(pad_left);
+ const int end_j = std::min<int>(start_j + 3, 4 - pad_left - pad_right);
+ const int valid_cols = end_j - std::max<int>(0, start_j);
+
+ rescale_vals[i*2 + j] = static_cast<__fp16>(1.0f / static_cast<float>(
+ exclude_padding ? valid_rows * valid_cols : 9
+ ));
+ }
+ }
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x3, #0x0\n"
+ "mov x20, #0x4\n"
+ "ldr x4, [%x[args], %[offsetof_inptrs]]\n"
+ "whilelt p0.h, XZR, x20\n"
+ "add x20, %x[args], %[offsetof_rescale]\n"
+ "ld1rqh { z4.h }, p0/Z, [x20]\n"
+ "ldr x5, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.h, x3, x5\n"
+ "mov x6, #0x0\n"
+ "ldp x7, x8, [x21, #0x0]\n"
+ "ldp x17, x16, [x21, #0x10]\n"
+ "ldp x15, x14, [x4, #0x0]\n"
+ "ld1h { z3.h }, p0/Z, [x14, x3, LSL #1]\n"
+ "ldp x13, x12, [x4, #0x10]\n"
+ "ld1h { z2.h }, p0/Z, [x13, x3, LSL #1]\n"
+ "ldp x11, x10, [x4, #0x20]\n"
+ "ld1h { z1.h }, p0/Z, [x10, x3, LSL #1]\n"
+ "ldp x9, x28, [x4, #0x30]\n"
+ "ld1h { z0.h }, p0/Z, [x9, x3, LSL #1]\n"
+ "ldp x27, x26, [x4, #0x40]\n"
+ "ld1h { z31.h }, p0/Z, [x26, x3, LSL #1]\n"
+ "ldp x25, x24, [x4, #0x50]\n"
+ "ld1h { z30.h }, p0/Z, [x25, x3, LSL #1]\n"
+ "ldp x23, x22, [x4, #0x60]\n"
+ "ld1h { z29.h }, p0/Z, [x11, x3, LSL #1]\n"
+ "ldp x21, x20, [x4, #0x70]\n"
+ "ld1h { z28.h }, p0/Z, [x27, x3, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z22.h }, p0/Z, [x24, x3, LSL #1]\n"
+ "ld1h { z21.h }, p0/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x21, x3, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x15, x3, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x3, LSL #1]\n"
+ "incw x3\n"
+ "whilelt p1.h, x3, x5\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "ld1h { z1.h }, p1/Z, [x10, x3, LSL #1]\n"
+ "whilelt p0.h, x6, x5\n"
+ "fadd z19.h, z17.h, z16.h\n"
+ "fadd z18.h, z3.h, z2.h\n"
+ "ld1h { z0.h }, p1/Z, [x9, x3, LSL #1]\n"
+ "fadd z17.h, z29.h, z28.h\n"
+ "fadd z22.h, z27.h, z22.h\n"
+ "ld1h { z31.h }, p1/Z, [x26, x3, LSL #1]\n"
+ "fadd z16.h, z21.h, z20.h\n"
+ "fadd z21.h, z18.h, z19.h\n"
+ "ld1h { z30.h }, p1/Z, [x25, x3, LSL #1]\n"
+ "fadd z20.h, z16.h, z19.h\n"
+ "fadd z19.h, z26.h, z17.h\n"
+ "ld1h { z3.h }, p1/Z, [x14, x3, LSL #1]\n"
+ "fadd z18.h, z25.h, z22.h\n"
+ "fadd z17.h, z24.h, z17.h\n"
+ "ld1h { z2.h }, p1/Z, [x13, x3, LSL #1]\n"
+ "fadd z16.h, z23.h, z22.h\n"
+ "fadd z19.h, z21.h, z19.h\n"
+ "ld1h { z29.h }, p1/Z, [x11, x3, LSL #1]\n"
+ "fadd z18.h, z21.h, z18.h\n"
+ "fadd z17.h, z17.h, z20.h\n"
+ "ld1h { z28.h }, p1/Z, [x27, x3, LSL #1]\n"
+ "fadd z16.h, z16.h, z20.h\n"
+ "ld1h { z27.h }, p1/Z, [x28, x3, LSL #1]\n"
+ "fmul z19.h, z19.h, z4.h[0]\n"
+ "ld1h { z22.h }, p1/Z, [x24, x3, LSL #1]\n"
+ "fmul z18.h, z18.h, z4.h[1]\n"
+ "fmul z17.h, z17.h, z4.h[2]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x3, LSL #1]\n"
+ "fmul z16.h, z16.h, z4.h[3]\n"
+ "st1h { z19.h }, p0, [x7, x6, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x3, LSL #1]\n"
+ "st1h { z18.h }, p0, [x8, x6, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x15, x3, LSL #1]\n"
+ "st1h { z17.h }, p0, [x17, x6, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x12, x3, LSL #1]\n"
+ "st1h { z16.h }, p0, [x16, x6, LSL #1]\n"
+ "incw x6\n"
+ "ld1h { z24.h }, p1/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p1/Z, [x20, x3, LSL #1]\n"
+ "incw x3\n"
+ "whilelt p1.h, x3, x5\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "whilelt p0.h, x6, x5\n"
+ "fadd z19.h, z17.h, z16.h\n"
+ "fadd z18.h, z3.h, z2.h\n"
+ "fadd z17.h, z29.h, z28.h\n"
+ "fadd z22.h, z27.h, z22.h\n"
+ "fadd z16.h, z21.h, z20.h\n"
+ "fadd z21.h, z18.h, z19.h\n"
+ "fadd z20.h, z16.h, z19.h\n"
+ "fadd z19.h, z26.h, z17.h\n"
+ "fadd z18.h, z25.h, z22.h\n"
+ "fadd z17.h, z24.h, z17.h\n"
+ "fadd z16.h, z23.h, z22.h\n"
+ "fadd z19.h, z21.h, z19.h\n"
+ "fadd z18.h, z21.h, z18.h\n"
+ "fadd z17.h, z17.h, z20.h\n"
+ "fadd z16.h, z16.h, z20.h\n"
+ "fmul z19.h, z19.h, z4.h[0]\n"
+ "st1h { z19.h }, p0, [x7, x6, LSL #1]\n"
+ "fmul z18.h, z18.h, z4.h[1]\n"
+ "fmul z17.h, z17.h, z4.h[2]\n"
+ "st1h { z18.h }, p0, [x8, x6, LSL #1]\n"
+ "fmul z16.h, z16.h, z4.h[3]\n"
+ "st1h { z17.h }, p0, [x17, x6, LSL #1]\n"
+ "st1h { z16.h }, p0, [x16, x6, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
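
To make the rescale table concrete: with exclude_padding enabled and pad_top = pad_left = 1, pad_right = pad_bottom = 0, the four outputs cover 2x2, 2x3, 3x2 and 3x3 valid cells respectively, so rescale_vals becomes {1/4, 1/6, 1/6, 1/9}; with exclude_padding disabled every entry is 1/9, the full window size. The assembly consumes the table as the four lanes of z4, loaded once with ld1rqh and applied through the indexed multiplies z4.h[0] through z4.h[3].
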
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..cf09f421c4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
+
+struct sme_fp16_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
+{
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
+ sme_fp16_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_fp16_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..60f17b7bc2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_fp16_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const __fp16 *const *const inptrs,
+ __fp16 *outptr
+)
+{
+ const auto rescale_value = static_cast<__fp16>(1.0f / static_cast<float>(window_cells));
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cnth x28\n"
+ "cnth x27, ALL, MUL #2\n"
+ "cnth x26, ALL, MUL #3\n"
+ "ptrue p0.b\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
+ "ld1rh { z6.h }, p0/Z, [%x[rescale_ptr]]\n"
+ "whilelt p2.h, x28, %x[n_channels]\n"
+ "whilelt p1.h, x27, %x[n_channels]\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov z4.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z3.b, #0x0\n"
+ "mov z2.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "fadd z23.h, z1.h, z0.h\n"
+ "fadd z19.h, z31.h, z30.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z22.h, z29.h, z22.h\n"
+ "fadd z18.h, z28.h, z18.h\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "fadd z21.h, z27.h, z21.h\n"
+ "fadd z17.h, z26.h, z17.h\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "fadd z20.h, z25.h, z20.h\n"
+ "fadd z16.h, z24.h, z16.h\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "fadd z19.h, z23.h, z19.h\n"
+ "fadd z18.h, z22.h, z18.h\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "fadd z17.h, z21.h, z17.h\n"
+ "fadd z16.h, z20.h, z16.h\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "fadd z5.h, z5.h, z19.h\n"
+ "fadd z4.h, z4.h, z18.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "fadd z3.h, z3.h, z17.h\n"
+ "fadd z2.h, z2.h, z16.h\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "fadd z23.h, z1.h, z0.h\n"
+ "fadd z19.h, z31.h, z30.h\n"
+ "fadd z22.h, z29.h, z22.h\n"
+ "fadd z18.h, z28.h, z18.h\n"
+ "fadd z21.h, z27.h, z21.h\n"
+ "fadd z17.h, z26.h, z17.h\n"
+ "fadd z20.h, z25.h, z20.h\n"
+ "fadd z16.h, z24.h, z16.h\n"
+ "fadd z19.h, z23.h, z19.h\n"
+ "fadd z18.h, z22.h, z18.h\n"
+ "fadd z17.h, z21.h, z17.h\n"
+ "fadd z16.h, z20.h, z16.h\n"
+ "fadd z5.h, z5.h, z19.h\n"
+ "fadd z4.h, z4.h, z18.h\n"
+ "fadd z3.h, z3.h, z17.h\n"
+ "fadd z2.h, z2.h, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "fadd z4.h, z4.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "fadd z3.h, z3.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "fadd z2.h, z2.h, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "fmul z5.h, z5.h, z6.h\n"
+ "fmul z4.h, z4.h, z6.h\n"
+ "st1h { z5.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "inch x9, ALL, MUL #4\n"
+ "fmul z3.h, z3.h, z6.h\n"
+ "fmul z2.h, z2.h, z6.h\n"
+ "st1h { z4.h }, p2, [%x[outptr], x28, LSL #1]\n"
+ "inch x28, ALL, MUL #4\n"
+ "st1h { z3.h }, p1, [%x[outptr], x27, LSL #1]\n"
+ "inch x27, ALL, MUL #4\n"
+ "st1h { z2.h }, p0, [%x[outptr], x26, LSL #1]\n"
+ "inch x26, ALL, MUL #4\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p3.h, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1h { z1.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "fmul z5.h, z5.h, z6.h\n"
+ "st1h { z5.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "inch x9\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
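
Stripped of the four-vector unrolling, the kernel above is a plain accumulate-then-scale over the valid window cells. A minimal scalar sketch, illustrative only and matching the fp16 accumulation the vector code performs:

    #include <cstdint>

    // Illustrative reference, not part of the library.
    static void avg_generic_reference(uint64_t window_cells, uint64_t n_valid_cells,
                                      uint64_t n_channels,
                                      const __fp16 *const *inptrs, __fp16 *outptr)
    {
        // Same derivation as rescale_value above: 1 / (total window size).
        const __fp16 rescale = static_cast<__fp16>(1.0f / static_cast<float>(window_cells));
        for (uint64_t c = 0; c < n_channels; c++)
        {
            __fp16 acc = 0; // the SME code likewise accumulates in fp16 (fadd z*.h)
            for (uint64_t n = 0; n < n_valid_cells; n++)
            {
                acc += inptrs[n][c];
            }
            outptr[c] = acc * rescale;
        }
    }

The vector code additionally processes four SVE vectors of channels per outer iteration, consumes input pointers four at a time, and pairs the additions into a small tree to shorten the dependency chain.
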
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..cd6c7449a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
+{
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
+
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..7fc776ed4e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const __fp16 *const *const inptrs;
+ __fp16 *const *const outptrs;
+ KernelArgs(
+ unsigned int channels,
+ const __fp16 *const *input_ptrs,
+ __fp16 *const * output_ptrs,
+ bool, unsigned int, unsigned int, unsigned int, unsigned int
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ptrue p2.b\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.h, x15, x13\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ld1h { z30.h }, p0/Z, [x27, x15, LSL #1]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ld1h { z29.h }, p0/Z, [x25, x15, LSL #1]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ld1h { z28.h }, p0/Z, [x24, x15, LSL #1]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ld1h { z27.h }, p0/Z, [x21, x15, LSL #1]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1h { z26.h }, p0/Z, [x28, x15, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z19.h }, p0/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x15, LSL #1]\n"
+ "incw x15\n"
+ "whilelt p1.h, x15, x13\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "movprfx z22, z30\n fmax z22.h, p2/M, z22.h, z28.h\n"
+ "movprfx z21, z28\n fmax z21.h, p2/M, z21.h, z27.h\n"
+ "ld1h { z30.h }, p1/Z, [x27, x15, LSL #1]\n"
+ "whilelt p0.h, x14, x13\n"
+ "movprfx z18, z29\n fmax z18.h, p2/M, z18.h, z26.h\n"
+ "movprfx z17, z25\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "ld1h { z28.h }, p1/Z, [x24, x15, LSL #1]\n"
+ "movprfx z16, z29\n fmax z16.h, p2/M, z16.h, z19.h\n"
+ "movprfx z20, z24\n fmax z20.h, p2/M, z20.h, z23.h\n"
+ "ld1h { z27.h }, p1/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z29.h }, p1/Z, [x25, x15, LSL #1]\n"
+ "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
+ "movprfx z18, z17\n fmax z18.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z26.h }, p1/Z, [x28, x15, LSL #1]\n"
+ "movprfx z17, z16\n fmax z17.h, p2/M, z17.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "ld1h { z25.h }, p1/Z, [x26, x15, LSL #1]\n"
+ "st1h { z19.h }, p0, [x12, x14, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x23, x15, LSL #1]\n"
+ "st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
+ "ld1h { z19.h }, p1/Z, [x22, x15, LSL #1]\n"
+ "st1h { z17.h }, p0, [x10, x14, LSL #1]\n"
+ "ld1h { z23.h }, p1/Z, [x20, x15, LSL #1]\n"
+ "incw x15\n"
+ "whilelt p1.h, x15, x13\n"
+ "st1h { z16.h }, p0, [x9, x14, LSL #1]\n"
+ "incw x14\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "movprfx z22, z30\n fmax z22.h, p2/M, z22.h, z28.h\n"
+ "movprfx z21, z28\n fmax z21.h, p2/M, z21.h, z27.h\n"
+ "whilelt p0.h, x14, x13\n"
+ "movprfx z20, z29\n fmax z20.h, p2/M, z20.h, z26.h\n"
+ "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z24.h\n"
+ "movprfx z17, z29\n fmax z17.h, p2/M, z17.h, z19.h\n"
+ "movprfx z19, z24\n fmax z19.h, p2/M, z19.h, z23.h\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "fmax z18.h, p2/M, z18.h, z22.h\n"
+ "st1h { z16.h }, p0, [x12, x14, LSL #1]\n"
+ "fmax z17.h, p2/M, z17.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z19.h\n"
+ "st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
+ "st1h { z17.h }, p0, [x10, x14, LSL #1]\n"
+ "st1h { z16.h }, p0, [x9, x14, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
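
For a 2x2 pool at stride 1 producing a 2x2 output tile, the input patch is 3x3, which is why the prologue loads nine input pointers. A scalar sketch of the computation, illustrative only and assuming inptrs holds the 3x3 patch row-major and outptrs the 2x2 tile row-major:

    #include <algorithm>

    // Illustrative reference, not part of the library.
    static void max_2x2_s1_2x2_reference(unsigned int n_channels,
                                         const __fp16 *const *inptrs, // assumed: 9 pointers, 3x3 patch
                                         __fp16 *const *outptrs)      // assumed: 4 pointers, 2x2 tile
    {
        for (unsigned int c = 0; c < n_channels; c++)
        {
            for (int oi = 0; oi < 2; oi++)
            {
                for (int oj = 0; oj < 2; oj++)
                {
                    // 2x2 window anchored at output position (oi, oj)
                    __fp16 m = inptrs[oi * 3 + oj][c];
                    m = std::max(m, inptrs[oi * 3 + oj + 1][c]);
                    m = std::max(m, inptrs[(oi + 1) * 3 + oj][c]);
                    m = std::max(m, inptrs[(oi + 1) * 3 + oj + 1][c]);
                    outptrs[oi * 2 + oj][c] = m;
                }
            }
        }
    }

Unlike the average kernels there is no rescale table, and the padding arguments are accepted but ignored, as the anonymous parameters of the KernelArgs constructor above show.
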
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..bfb3bf5b1a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
+
+struct sme_fp16_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
+{
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
+ sme_fp16_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_fp16_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..afa2ccbd71
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_fp16_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const __fp16 *const *const inptrs,
+ __fp16 *outptr
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cnth x28\n"
+ "cnth x27, ALL, MUL #2\n"
+ "cnth x26, ALL, MUL #3\n"
+ "whilelt p4.h, x9, %x[n_channels]\n"
+ "whilelt p3.h, x28, %x[n_channels]\n"
+ "whilelt p2.h, x27, %x[n_channels]\n"
+ "whilelt p1.h, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.h, #0xfc00\n"
+ "mov z3.h, #0xfc00\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.h, #0xfc00\n"
+ "mov z1.h, #0xfc00\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z18.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z29.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z26.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z16.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
+ "fmax z23.h, p0/M, z23.h, z30.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z18.h, p0/M, z18.h, z29.h\n"
+ "fmax z22.h, p0/M, z22.h, z28.h\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "fmax z17.h, p0/M, z17.h, z27.h\n"
+ "fmax z21.h, p0/M, z21.h, z26.h\n"
+ "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "fmax z16.h, p0/M, z16.h, z25.h\n"
+ "fmax z20.h, p0/M, z20.h, z24.h\n"
+ "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "fmax z19.h, p0/M, z19.h, z23.h\n"
+ "fmax z18.h, p0/M, z18.h, z22.h\n"
+ "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "fmax z17.h, p0/M, z17.h, z21.h\n"
+ "fmax z16.h, p0/M, z16.h, z20.h\n"
+ "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "fmax z4.h, p0/M, z4.h, z19.h\n"
+ "fmax z3.h, p0/M, z3.h, z18.h\n"
+ "ld1h { z18.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "fmax z2.h, p0/M, z2.h, z17.h\n"
+ "fmax z1.h, p0/M, z1.h, z16.h\n"
+ "ld1h { z29.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z26.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z16.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
+ "fmax z23.h, p0/M, z23.h, z30.h\n"
+ "fmax z18.h, p0/M, z18.h, z29.h\n"
+ "fmax z22.h, p0/M, z22.h, z28.h\n"
+ "fmax z17.h, p0/M, z17.h, z27.h\n"
+ "fmax z21.h, p0/M, z21.h, z26.h\n"
+ "fmax z16.h, p0/M, z16.h, z25.h\n"
+ "fmax z20.h, p0/M, z20.h, z24.h\n"
+ "fmax z19.h, p0/M, z19.h, z23.h\n"
+ "fmax z18.h, p0/M, z18.h, z22.h\n"
+ "fmax z17.h, p0/M, z17.h, z21.h\n"
+ "fmax z16.h, p0/M, z16.h, z20.h\n"
+ "fmax z4.h, p0/M, z4.h, z19.h\n"
+ "fmax z3.h, p0/M, z3.h, z18.h\n"
+ "fmax z2.h, p0/M, z2.h, z17.h\n"
+ "fmax z1.h, p0/M, z1.h, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z4.h, p0/M, z4.h, z16.h\n"
+ "ld1h { z16.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "fmax z3.h, p0/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "fmax z2.h, p0/M, z2.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x26, LSL #1]\n"
+ "fmax z1.h, p0/M, z1.h, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "st1h { z4.h }, p4, [%x[outptr], x9, LSL #1]\n"
+ "inch x9, ALL, MUL #4\n"
+ "st1h { z3.h }, p3, [%x[outptr], x28, LSL #1]\n"
+ "inch x28, ALL, MUL #4\n"
+ "st1h { z2.h }, p2, [%x[outptr], x27, LSL #1]\n"
+ "inch x27, ALL, MUL #4\n"
+ "st1h { z1.h }, p1, [%x[outptr], x26, LSL #1]\n"
+ "inch x26, ALL, MUL #4\n"
+ "whilelt p1.h, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.h, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.h, #0xfc00\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1h { z0.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n fmax z16.h, p0/M, z16.h, z31.h\n"
+ "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z30.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z4.h, p0/M, z4.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n fmax z16.h, p0/M, z16.h, z31.h\n"
+ "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z30.h\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "fmax z4.h, p0/M, z4.h, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z4.h, p0/M, z4.h, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "st1h { z4.h }, p4, [%x[outptr], x9, LSL #1]\n"
+ "inch x9\n"
+ "whilelt p4.h, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
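
The #0xfc00 immediates that seed z1 through z4 are the binary16 bit pattern for negative infinity (sign 1, exponent all ones, mantissa 0), the identity element for fmax accumulation; the unnamed first parameter (the window cell count) is unused because max pooling needs no normalisation. A scalar sketch of the whole kernel, illustrative only:

    #include <algorithm>
    #include <cstdint>
    #include <limits>

    // Illustrative reference, not part of the library.
    static void max_generic_reference(uint64_t n_valid_cells, uint64_t n_channels,
                                      const __fp16 *const *inptrs, __fp16 *outptr)
    {
        for (uint64_t c = 0; c < n_channels; c++)
        {
            // Start from -inf, as the vector code does with mov z*.h, #0xfc00.
            __fp16 m = static_cast<__fp16>(-std::numeric_limits<float>::infinity());
            for (uint64_t n = 0; n < n_valid_cells; n++)
            {
                m = std::max(m, inptrs[n][c]);
            }
            outptr[c] = m;
        }
    }
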
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..23a0eee04e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
+{
+ using Parent = DepthfirstStrategy<float, float>;
+
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..8c8532827a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const float *const *const inptrs;
+ float *const *const outptrs;
+ float rescale_vals[4];
+
+ KernelArgs(
+ unsigned int channels,
+ const float *const *input_ptrs,
+ float *const * output_ptrs,
+ bool exclude_padding, unsigned int pad_left, unsigned int pad_top, unsigned int pad_right, unsigned int pad_bottom
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ for (unsigned int i = 0; i < 2; i++)
+ {
+ const int start_i = 1*i - static_cast<int>(pad_top);
+ const int end_i = std::min<int>(start_i + 3, 4 - pad_top - pad_bottom);
+ const int valid_rows = end_i - std::max<int>(0, start_i);
+
+ for (unsigned int j = 0; j < 2; j++)
+ {
+ const int start_j = 1*j - static_cast<int>(pad_left);
+ const int end_j = std::min<int>(start_j + 3, 4 - pad_left - pad_right);
+ const int valid_cols = end_j - std::max<int>(0, start_j);
+
+ rescale_vals[i*2 + j] = static_cast<float>(1.0f / static_cast<float>(
+ exclude_padding ? valid_rows * valid_cols : 9
+ ));
+ }
+ }
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x3, #0x0\n"
+ "mov x20, #0x4\n"
+ "ldr x4, [%x[args], %[offsetof_inptrs]]\n"
+ "whilelt p0.s, XZR, x20\n"
+ "add x20, %x[args], %[offsetof_rescale]\n"
+ "ld1rqw { z4.s }, p0/Z, [x20]\n"
+ "ldr x5, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.s, x3, x5\n"
+ "mov x6, #0x0\n"
+ "ldp x7, x8, [x21, #0x0]\n"
+ "ldp x17, x16, [x21, #0x10]\n"
+ "ldp x15, x14, [x4, #0x0]\n"
+ "ld1w { z3.s }, p0/Z, [x14, x3, LSL #2]\n"
+ "ldp x13, x12, [x4, #0x10]\n"
+ "ld1w { z2.s }, p0/Z, [x13, x3, LSL #2]\n"
+ "ldp x11, x10, [x4, #0x20]\n"
+ "ld1w { z1.s }, p0/Z, [x10, x3, LSL #2]\n"
+ "ldp x9, x28, [x4, #0x30]\n"
+ "ld1w { z0.s }, p0/Z, [x9, x3, LSL #2]\n"
+ "ldp x27, x26, [x4, #0x40]\n"
+ "ld1w { z31.s }, p0/Z, [x26, x3, LSL #2]\n"
+ "ldp x25, x24, [x4, #0x50]\n"
+ "ld1w { z30.s }, p0/Z, [x25, x3, LSL #2]\n"
+ "ldp x23, x22, [x4, #0x60]\n"
+ "ld1w { z29.s }, p0/Z, [x11, x3, LSL #2]\n"
+ "ldp x21, x20, [x4, #0x70]\n"
+ "ld1w { z28.s }, p0/Z, [x27, x3, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x24, x3, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x21, x3, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x15, x3, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x12, x3, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x3, LSL #2]\n"
+ "incw x3\n"
+ "whilelt p1.s, x3, x5\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "fadd z17.s, z1.s, z0.s\n"
+ "fadd z16.s, z31.s, z30.s\n"
+ "ld1w { z1.s }, p1/Z, [x10, x3, LSL #2]\n"
+ "whilelt p0.s, x6, x5\n"
+ "fadd z19.s, z17.s, z16.s\n"
+ "fadd z18.s, z3.s, z2.s\n"
+ "ld1w { z0.s }, p1/Z, [x9, x3, LSL #2]\n"
+ "fadd z17.s, z29.s, z28.s\n"
+ "fadd z22.s, z27.s, z22.s\n"
+ "ld1w { z31.s }, p1/Z, [x26, x3, LSL #2]\n"
+ "fadd z16.s, z21.s, z20.s\n"
+ "fadd z21.s, z18.s, z19.s\n"
+ "ld1w { z30.s }, p1/Z, [x25, x3, LSL #2]\n"
+ "fadd z20.s, z16.s, z19.s\n"
+ "fadd z19.s, z26.s, z17.s\n"
+ "ld1w { z3.s }, p1/Z, [x14, x3, LSL #2]\n"
+ "fadd z18.s, z25.s, z22.s\n"
+ "fadd z17.s, z24.s, z17.s\n"
+ "ld1w { z2.s }, p1/Z, [x13, x3, LSL #2]\n"
+ "fadd z16.s, z23.s, z22.s\n"
+ "fadd z19.s, z21.s, z19.s\n"
+ "ld1w { z29.s }, p1/Z, [x11, x3, LSL #2]\n"
+ "fadd z18.s, z21.s, z18.s\n"
+ "fadd z17.s, z17.s, z20.s\n"
+ "ld1w { z28.s }, p1/Z, [x27, x3, LSL #2]\n"
+ "fadd z16.s, z16.s, z20.s\n"
+ "ld1w { z27.s }, p1/Z, [x28, x3, LSL #2]\n"
+ "fmul z19.s, z19.s, z4.s[0]\n"
+ "ld1w { z22.s }, p1/Z, [x24, x3, LSL #2]\n"
+ "fmul z18.s, z18.s, z4.s[1]\n"
+ "fmul z17.s, z17.s, z4.s[2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x3, LSL #2]\n"
+ "fmul z16.s, z16.s, z4.s[3]\n"
+ "st1w { z19.s }, p0, [x7, x6, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x3, LSL #2]\n"
+ "st1w { z18.s }, p0, [x8, x6, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x15, x3, LSL #2]\n"
+ "st1w { z17.s }, p0, [x17, x6, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x12, x3, LSL #2]\n"
+ "st1w { z16.s }, p0, [x16, x6, LSL #2]\n"
+ "incw x6\n"
+ "ld1w { z24.s }, p1/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [x20, x3, LSL #2]\n"
+ "incw x3\n"
+ "whilelt p1.s, x3, x5\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "fadd z17.s, z1.s, z0.s\n"
+ "fadd z16.s, z31.s, z30.s\n"
+ "whilelt p0.s, x6, x5\n"
+ "fadd z19.s, z17.s, z16.s\n"
+ "fadd z18.s, z3.s, z2.s\n"
+ "fadd z17.s, z29.s, z28.s\n"
+ "fadd z22.s, z27.s, z22.s\n"
+ "fadd z16.s, z21.s, z20.s\n"
+ "fadd z21.s, z18.s, z19.s\n"
+ "fadd z20.s, z16.s, z19.s\n"
+ "fadd z19.s, z26.s, z17.s\n"
+ "fadd z18.s, z25.s, z22.s\n"
+ "fadd z17.s, z24.s, z17.s\n"
+ "fadd z16.s, z23.s, z22.s\n"
+ "fadd z19.s, z21.s, z19.s\n"
+ "fadd z18.s, z21.s, z18.s\n"
+ "fadd z17.s, z17.s, z20.s\n"
+ "fadd z16.s, z16.s, z20.s\n"
+ "fmul z19.s, z19.s, z4.s[0]\n"
+ "st1w { z19.s }, p0, [x7, x6, LSL #2]\n"
+ "fmul z18.s, z18.s, z4.s[1]\n"
+ "fmul z17.s, z17.s, z4.s[2]\n"
+ "st1w { z18.s }, p0, [x8, x6, LSL #2]\n"
+ "fmul z16.s, z16.s, z4.s[3]\n"
+ "st1w { z17.s }, p0, [x17, x6, LSL #2]\n"
+ "st1w { z16.s }, p0, [x16, x6, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
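
The fp32 kernel above mirrors its fp16 counterpart earlier in this patch with only the element width retargeted: ld1h { z3.h }, p0/Z, [x14, x3, LSL #1] becomes ld1w { z3.s }, p0/Z, [x14, x3, LSL #2], the rescale quadword is loaded with ld1rqw instead of ld1rqh, the predicates operate on .s rather than .h elements, and the __ARM_FP16_ARGS half of the build guard is dropped since no half-precision types are involved. The outer static_cast<float> in the rescale_vals computation is a no-op retained from the fp16 template, where the corresponding cast narrows to __fp16.
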
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..29bcfc5a3b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
+
+struct sme_fp32_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
+{
+ using Parent = IGenericDepthfirstStrategy<float, float>;
+ sme_fp32_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_fp32_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..86e7f84542
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_fp32_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const float *const *const inptrs,
+ float *outptr
+)
+{
+ const auto rescale_value = static_cast<float>(1.0f / static_cast<float>(window_cells));
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntw x28\n"
+ "cntw x27, ALL, MUL #2\n"
+ "cntw x26, ALL, MUL #3\n"
+ "ptrue p0.b\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
+ "ld1rw { z6.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "whilelt p2.s, x28, %x[n_channels]\n"
+ "whilelt p1.s, x27, %x[n_channels]\n"
+ "whilelt p0.s, x26, %x[n_channels]\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov z4.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z3.b, #0x0\n"
+ "mov z2.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "fadd z23.s, z1.s, z0.s\n"
+ "fadd z19.s, z31.s, z30.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z22.s, z29.s, z22.s\n"
+ "fadd z18.s, z28.s, z18.s\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "fadd z21.s, z27.s, z21.s\n"
+ "fadd z17.s, z26.s, z17.s\n"
+ "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "fadd z20.s, z25.s, z20.s\n"
+ "fadd z16.s, z24.s, z16.s\n"
+ "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "fadd z19.s, z23.s, z19.s\n"
+ "fadd z18.s, z22.s, z18.s\n"
+ "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "fadd z17.s, z21.s, z17.s\n"
+ "fadd z16.s, z20.s, z16.s\n"
+ "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "fadd z5.s, z5.s, z19.s\n"
+ "fadd z4.s, z4.s, z18.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "fadd z3.s, z3.s, z17.s\n"
+ "fadd z2.s, z2.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "fadd z23.s, z1.s, z0.s\n"
+ "fadd z19.s, z31.s, z30.s\n"
+ "fadd z22.s, z29.s, z22.s\n"
+ "fadd z18.s, z28.s, z18.s\n"
+ "fadd z21.s, z27.s, z21.s\n"
+ "fadd z17.s, z26.s, z17.s\n"
+ "fadd z20.s, z25.s, z20.s\n"
+ "fadd z16.s, z24.s, z16.s\n"
+ "fadd z19.s, z23.s, z19.s\n"
+ "fadd z18.s, z22.s, z18.s\n"
+ "fadd z17.s, z21.s, z17.s\n"
+ "fadd z16.s, z20.s, z16.s\n"
+ "fadd z5.s, z5.s, z19.s\n"
+ "fadd z4.s, z4.s, z18.s\n"
+ "fadd z3.s, z3.s, z17.s\n"
+ "fadd z2.s, z2.s, z16.s\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z5.s, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fadd z4.s, z4.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "fadd z3.s, z3.s, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+ "fadd z2.s, z2.s, z16.s\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "fmul z5.s, z5.s, z6.s\n"
+ "fmul z4.s, z4.s, z6.s\n"
+ "st1w { z5.s }, p3, [%x[outptr], x9, LSL #2]\n"
+ "incw x9, ALL, MUL #4\n"
+ "fmul z3.s, z3.s, z6.s\n"
+ "fmul z2.s, z2.s, z6.s\n"
+ "st1w { z4.s }, p2, [%x[outptr], x28, LSL #2]\n"
+ "incw x28, ALL, MUL #4\n"
+ "st1w { z3.s }, p1, [%x[outptr], x27, LSL #2]\n"
+ "incw x27, ALL, MUL #4\n"
+ "st1w { z2.s }, p0, [%x[outptr], x26, LSL #2]\n"
+ "incw x26, ALL, MUL #4\n"
+ "whilelt p0.s, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p3.s, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1w { z1.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "fadd z17.s, z1.s, z0.s\n"
+ "fadd z16.s, z31.s, z30.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z5.s, z5.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "fadd z17.s, z1.s, z0.s\n"
+ "fadd z16.s, z31.s, z30.s\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "fadd z5.s, z5.s, z16.s\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z5.s, z5.s, z16.s\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "fmul z5.s, z5.s, z6.s\n"
+ "st1w { z5.s }, p3, [%x[outptr], x9, LSL #2]\n"
+ "incw x9\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
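
For readers unfamiliar with the SVE/SME idiom above, the following scalar sketch shows what the accumulate-then-rescale loops compute per channel. It is illustrative only, not ACL API: the kernel vectorises exactly this reduction, with padding cells simply absent from inptrs and the divisor broadcast from rescale_ptr.

#include <cstdint>

// Scalar equivalent of sme_fp32_nhwc_avg_generic_depthfirst_impl (sketch only).
void avg_pool_nhwc_reference(uint64_t window_cells, uint64_t n_valid_cells,
                             uint64_t n_channels,
                             const float *const *inptrs, float *outptr)
{
    const float rescale = 1.0f / static_cast<float>(window_cells);
    for (uint64_t c = 0; c < n_channels; c++)
    {
        float acc = 0.0f;
        for (uint64_t i = 0; i < n_valid_cells; i++)
        {
            acc += inptrs[i][c];   // one pointer per valid pooling cell
        }
        outptr[c] = acc * rescale; // multiply by precomputed 1/window_cells
    }
}
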
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..338348231f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
+{
+ using Parent = DepthfirstStrategy<float, float>;
+
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..3c7213a498
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const float *const *const inptrs;
+ float *const *const outptrs;
+ KernelArgs(
+ unsigned int channels,
+ const float *const *input_ptrs,
+ float *const * output_ptrs,
+ bool, unsigned int, unsigned int, unsigned int, unsigned int
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ptrue p2.b\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.s, x15, x13\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ld1w { z30.s }, p0/Z, [x27, x15, LSL #2]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ld1w { z29.s }, p0/Z, [x25, x15, LSL #2]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ld1w { z28.s }, p0/Z, [x24, x15, LSL #2]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ld1w { z27.s }, p0/Z, [x21, x15, LSL #2]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1w { z26.s }, p0/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x26, x15, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x15, LSL #2]\n"
+ "incw x15\n"
+ "whilelt p1.s, x15, x13\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "movprfx z22, z30\n fmax z22.s, p2/M, z22.s, z28.s\n"
+ "movprfx z21, z28\n fmax z21.s, p2/M, z21.s, z27.s\n"
+ "ld1w { z30.s }, p1/Z, [x27, x15, LSL #2]\n"
+ "whilelt p0.s, x14, x13\n"
+ "movprfx z18, z29\n fmax z18.s, p2/M, z18.s, z26.s\n"
+ "movprfx z17, z25\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "ld1w { z28.s }, p1/Z, [x24, x15, LSL #2]\n"
+ "movprfx z16, z29\n fmax z16.s, p2/M, z16.s, z19.s\n"
+ "movprfx z20, z24\n fmax z20.s, p2/M, z20.s, z23.s\n"
+ "ld1w { z27.s }, p1/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z29.s }, p1/Z, [x25, x15, LSL #2]\n"
+ "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
+ "movprfx z18, z17\n fmax z18.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z26.s }, p1/Z, [x28, x15, LSL #2]\n"
+ "movprfx z17, z16\n fmax z17.s, p2/M, z17.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "ld1w { z25.s }, p1/Z, [x26, x15, LSL #2]\n"
+ "st1w { z19.s }, p0, [x12, x14, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x23, x15, LSL #2]\n"
+ "st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
+ "ld1w { z19.s }, p1/Z, [x22, x15, LSL #2]\n"
+ "st1w { z17.s }, p0, [x10, x14, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "incw x15\n"
+ "whilelt p1.s, x15, x13\n"
+ "st1w { z16.s }, p0, [x9, x14, LSL #2]\n"
+ "incw x14\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "movprfx z22, z30\n fmax z22.s, p2/M, z22.s, z28.s\n"
+ "movprfx z21, z28\n fmax z21.s, p2/M, z21.s, z27.s\n"
+ "whilelt p0.s, x14, x13\n"
+ "movprfx z20, z29\n fmax z20.s, p2/M, z20.s, z26.s\n"
+ "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z24.s\n"
+ "movprfx z17, z29\n fmax z17.s, p2/M, z17.s, z19.s\n"
+ "movprfx z19, z24\n fmax z19.s, p2/M, z19.s, z23.s\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "fmax z18.s, p2/M, z18.s, z22.s\n"
+ "st1w { z16.s }, p0, [x12, x14, LSL #2]\n"
+ "fmax z17.s, p2/M, z17.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z19.s\n"
+ "st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
+ "st1w { z17.s }, p0, [x10, x14, LSL #2]\n"
+ "st1w { z16.s }, p0, [x9, x14, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
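
Each iteration above consumes nine input row pointers (a 3x3 patch) and produces four outputs (the 2x2 output tile); with stride 1, adjacent windows share patch elements, which is why the assembly reuses partial fmax results. A scalar sketch follows, assuming row-major patch order (the actual pointer ordering is set up by the surrounding depthfirst driver, so treat the index mapping as an assumption):

#include <algorithm>

// Sketch: 2x2 stride-1 max pool over one 3x3 patch, per channel.
// in[0..8] address the patch row-major; out[0..3] address the 2x2 tile.
void max_2x2_s1_reference(const float *const in[9], float *const out[4],
                          unsigned int n_channels)
{
    for (unsigned int c = 0; c < n_channels; c++)
    {
        const auto m = [&](int a, int b) { return std::max(in[a][c], in[b][c]); };
        out[0][c] = std::max(m(0, 1), m(3, 4)); // window over rows {0,1}, cols {0,1}
        out[1][c] = std::max(m(1, 2), m(4, 5));
        out[2][c] = std::max(m(3, 4), m(6, 7));
        out[3][c] = std::max(m(4, 5), m(7, 8));
    }
}
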
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..9bc1f11601
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
+
+struct sme_fp32_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
+{
+ using Parent = IGenericDepthfirstStrategy<float, float>;
+ sme_fp32_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_fp32_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
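
The strategy struct is the glue the pooling driver uses to select this kernel. A hedged sketch of how it is consumed (CPUInfo and the generic depthfirst driver are ACL internals outside this diff, so the call-site names here are illustrative):

// Illustrative only: the real call site is ACL's generic depthfirst driver.
void run_max_pool(const CPUInfo *ci, uint64_t n_valid_cells, uint64_t n_channels,
                  const float *const *inptrs, float *outptr)
{
    arm_conv::pooling::sme_fp32_nhwc_max_generic_depthfirst strategy(ci);
    auto kernel = strategy.get_kernel();
    // Max pooling ignores the window-cell count, hence the unnamed first parameter:
    kernel(0, n_valid_cells, n_channels, inptrs, outptr);
}
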
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..0dabc2f292
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_fp32_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const float *const *const inptrs,
+ float *outptr
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntw x28\n"
+ "cntw x27, ALL, MUL #2\n"
+ "cntw x26, ALL, MUL #3\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
+ "whilelt p3.s, x28, %x[n_channels]\n"
+ "whilelt p2.s, x27, %x[n_channels]\n"
+ "whilelt p1.s, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.s, #0xff800000\n"
+ "mov z3.s, #0xff800000\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.s, #0xff800000\n"
+ "mov z1.s, #0xff800000\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z29.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z26.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
+ "fmax z23.s, p0/M, z23.s, z30.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z18.s, p0/M, z18.s, z29.s\n"
+ "fmax z22.s, p0/M, z22.s, z28.s\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "fmax z17.s, p0/M, z17.s, z27.s\n"
+ "fmax z21.s, p0/M, z21.s, z26.s\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "fmax z16.s, p0/M, z16.s, z25.s\n"
+ "fmax z20.s, p0/M, z20.s, z24.s\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "fmax z18.s, p0/M, z18.s, z22.s\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "fmax z17.s, p0/M, z17.s, z21.s\n"
+ "fmax z16.s, p0/M, z16.s, z20.s\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "fmax z4.s, p0/M, z4.s, z19.s\n"
+ "fmax z3.s, p0/M, z3.s, z18.s\n"
+ "ld1w { z18.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "fmax z2.s, p0/M, z2.s, z17.s\n"
+ "fmax z1.s, p0/M, z1.s, z16.s\n"
+ "ld1w { z29.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z26.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
+ "fmax z23.s, p0/M, z23.s, z30.s\n"
+ "fmax z18.s, p0/M, z18.s, z29.s\n"
+ "fmax z22.s, p0/M, z22.s, z28.s\n"
+ "fmax z17.s, p0/M, z17.s, z27.s\n"
+ "fmax z21.s, p0/M, z21.s, z26.s\n"
+ "fmax z16.s, p0/M, z16.s, z25.s\n"
+ "fmax z20.s, p0/M, z20.s, z24.s\n"
+ "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "fmax z18.s, p0/M, z18.s, z22.s\n"
+ "fmax z17.s, p0/M, z17.s, z21.s\n"
+ "fmax z16.s, p0/M, z16.s, z20.s\n"
+ "fmax z4.s, p0/M, z4.s, z19.s\n"
+ "fmax z3.s, p0/M, z3.s, z18.s\n"
+ "fmax z2.s, p0/M, z2.s, z17.s\n"
+ "fmax z1.s, p0/M, z1.s, z16.s\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "fmax z3.s, p0/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "fmax z2.s, p0/M, z2.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "fmax z1.s, p0/M, z1.s, z16.s\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "st1w { z4.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "incw x9, ALL, MUL #4\n"
+ "st1w { z3.s }, p3, [%x[outptr], x28, LSL #2]\n"
+ "incw x28, ALL, MUL #4\n"
+ "st1w { z2.s }, p2, [%x[outptr], x27, LSL #2]\n"
+ "incw x27, ALL, MUL #4\n"
+ "st1w { z1.s }, p1, [%x[outptr], x26, LSL #2]\n"
+ "incw x26, ALL, MUL #4\n"
+ "whilelt p1.s, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.s, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.s, #0xff800000\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1w { z0.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n fmax z16.s, p0/M, z16.s, z31.s\n"
+ "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z30.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n fmax z16.s, p0/M, z16.s, z31.s\n"
+ "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z30.s\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "st1w { z4.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "incw x9\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
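
The accumulators above are initialised with "mov z4.s, #0xff800000": that bit pattern is IEEE-754 binary32 negative infinity, the identity element for the fmax reduction. A quick standalone check:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <limits>

int main()
{
    const uint32_t bits = 0xff800000u;
    float f;
    std::memcpy(&f, &bits, sizeof(f)); // type-pun via memcpy, not a cast
    assert(f == -std::numeric_limits<float>::infinity());
    return 0;
}
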
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..318510e697
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
+
+struct sme_s8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
+{
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
+ sme_s8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_s8_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c24e977dc6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+namespace {
+ struct RescaleParams
+ {
+ int32_t multiplier, shift;
+ };
+
+ constexpr RescaleParams rescale_params[8] = {
+ {0x40000000, -0}, // 1/2
+ {0x55555556, -1}, // 1/3
+ {0x40000000, -1}, // 1/4
+ {0x66666666, -2}, // 1/5
+ {0x55555556, -2}, // 1/6
+ {0x49249249, -2}, // 1/7
+ {0x40000000, -2}, // 1/8
+ {0x71c71c72, -3}, // 1/9
+ };
+}
+
+void sme_s8_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *outptr
+)
+{
+ if (n_valid_cells == 1 && window_cells == 1)
+ {
+ // In this case, simply copy from the input to the output
+ std::memcpy(outptr, *inptrs, n_channels);
+ return;
+ }
+
+ // Compute (or look up) the rescale values
+ int32_t shift_value = 0, rescale_value = 0;
+ if (2 <= window_cells && window_cells <= 9)
+ {
+ auto &params = rescale_params[window_cells - 2];
+ rescale_value = params.multiplier;
+ shift_value = params.shift;
+ }
+ else
+ {
+ auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
+
+ shift_value = 0;
+ while (f_rescale_value < 0.5f)
+ {
+ shift_value--;
+ f_rescale_value *= 2.0f;
+ }
+
+    int64_t long_rescale_value = std::round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
+ {
+ shift_value++;
+ long_rescale_value >>= 1;
+ }
+ rescale_value = static_cast<int32_t>(long_rescale_value);
+ }
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p2.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "mov z7.s, #0x0\n"
+ "mov z6.s, #0x0\n"
+ "mov z5.s, #0x0\n"
+ "mov z4.s, #0x0\n"
+ "mov z3.s, #0x0\n"
+ "mov z2.s, #0x0\n"
+ "mov z1.s, #0x0\n"
+ "mov z0.s, #0x0\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 2 inputs loop
+ ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 2 inputs tail
+ ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
+ ".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
+ ".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
+ ".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a213 // sshllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508a612 // sshllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
+ ".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
+ ".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
+ ".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
+ ".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
+ ".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
+ ".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
+ ".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
+ "mov z19.s, #0x7f\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "not z16.s, p0/M, z19.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "trn1 z23.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "smin z11.s, p0/M, z11.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z19.s\n"
+ "trn1 z22.h, z11.h, z10.h\n"
+ "smin z9.s, p0/M, z9.s, z19.s\n"
+ "smin z8.s, p0/M, z8.s, z19.s\n"
+ "trn1 z18.h, z9.h, z8.h\n"
+ "smin z7.s, p0/M, z7.s, z19.s\n"
+ "smin z6.s, p0/M, z6.s, z19.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "smin z5.s, p0/M, z5.s, z19.s\n"
+ "smin z4.s, p0/M, z4.s, z19.s\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "smin z3.s, p0/M, z3.s, z19.s\n"
+ "smin z2.s, p0/M, z2.s, z19.s\n"
+ "trn1 z20.h, z3.h, z2.h\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z19.h, z1.h, z0.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x25]\n"
+ "incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 2 inputs loop
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 2 inputs tail
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "ld1rw { z16.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b075ef // sqdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqdmulh z12.s, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
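
The quantized path replaces the float multiply with a Q31 fixed-point multiply (sqdmulh) followed by a rounding shift (srshl) and a clamp to [-128, 127]. A scalar sketch of that instruction pair's semantics, with sqdmulh's saturation omitted for brevity:

#include <cstdint>

// High half of the doubled 64-bit product, as sqdmulh computes per lane.
static int32_t sqdmulh32(int32_t a, int32_t b)
{
    return static_cast<int32_t>((static_cast<int64_t>(a) * b * 2) >> 32);
}

// srshl with a negative shift is a rounding arithmetic shift right.
static int32_t srshl32(int32_t a, int32_t shift)
{
    if (shift >= 0)
        return a << shift;
    const int32_t s = -shift;
    return static_cast<int32_t>((static_cast<int64_t>(a) + (1ll << (s - 1))) >> s);
}

// E.g. window_cells == 3 uses multiplier 0x55555556 and shift -1:
// sum = 300 -> sqdmulh32(300, 0x55555556) == 200, srshl32(200, -1) == 100 == 300/3.
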
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..c9a80e6a5b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<int8_t, int8_t>
+{
+ using Parent = DepthfirstStrategy<int8_t, int8_t>;
+
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..96617566a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const int8_t *const *const inptrs;
+ int8_t *const *const outptrs;
+ KernelArgs(
+ unsigned int channels,
+ const int8_t *const *input_ptrs,
+ int8_t *const * output_ptrs,
+ bool, unsigned int, unsigned int, unsigned int, unsigned int
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ptrue p2.b\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.b, x15, x13\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ld1b { z30.b }, p0/Z, [x27, x15]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ld1b { z29.b }, p0/Z, [x25, x15]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ld1b { z28.b }, p0/Z, [x24, x15]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x15]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1b { z26.b }, p0/Z, [x28, x15]\n"
+ "ld1b { z25.b }, p0/Z, [x26, x15]\n"
+ "ld1b { z24.b }, p0/Z, [x23, x15]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x15]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x15]\n"
+ "incw x15\n"
+ "whilelt p1.b, x15, x13\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "movprfx z22, z30\n smax z22.b, p2/M, z22.b, z28.b\n"
+ "movprfx z21, z28\n smax z21.b, p2/M, z21.b, z27.b\n"
+ "ld1b { z30.b }, p1/Z, [x27, x15]\n"
+ "whilelt p0.b, x14, x13\n"
+ "movprfx z18, z29\n smax z18.b, p2/M, z18.b, z26.b\n"
+ "movprfx z17, z25\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "ld1b { z28.b }, p1/Z, [x24, x15]\n"
+ "movprfx z16, z29\n smax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z20, z24\n smax z20.b, p2/M, z20.b, z23.b\n"
+ "ld1b { z27.b }, p1/Z, [x21, x15]\n"
+ "ld1b { z29.b }, p1/Z, [x25, x15]\n"
+ "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n smax z18.b, p2/M, z18.b, z22.b\n"
+ "ld1b { z26.b }, p1/Z, [x28, x15]\n"
+ "movprfx z17, z16\n smax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z25.b }, p1/Z, [x26, x15]\n"
+ "st1b { z19.b }, p0, [x12, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x23, x15]\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "ld1b { z19.b }, p1/Z, [x22, x15]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
+ "ld1b { z23.b }, p1/Z, [x20, x15]\n"
+ "incw x15\n"
+ "whilelt p1.b, x15, x13\n"
+ "st1b { z16.b }, p0, [x9, x14]\n"
+ "incw x14\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "movprfx z22, z30\n smax z22.b, p2/M, z22.b, z28.b\n"
+ "movprfx z21, z28\n smax z21.b, p2/M, z21.b, z27.b\n"
+ "whilelt p0.b, x14, x13\n"
+ "movprfx z20, z29\n smax z20.b, p2/M, z20.b, z26.b\n"
+ "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z24.b\n"
+ "movprfx z17, z29\n smax z17.b, p2/M, z17.b, z19.b\n"
+ "movprfx z19, z24\n smax z19.b, p2/M, z19.b, z23.b\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "smax z18.b, p2/M, z18.b, z22.b\n"
+ "st1b { z16.b }, p0, [x12, x14]\n"
+ "smax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z19.b\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
+ "st1b { z16.b }, p0, [x9, x14]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..3e0d76c277
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
+
+struct sme_s8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
+{
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
+ sme_s8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_s8_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d2b45cd353
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_s8_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *outptr
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p2.b, x27, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x80\n"
+ "mov z3.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.b, #0x80\n"
+ "mov z1.b, #0x80\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
+ "smax z23.b, p0/M, z23.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z18.b, p0/M, z18.b, z29.b\n"
+ "smax z22.b, p0/M, z22.b, z28.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "smax z17.b, p0/M, z17.b, z27.b\n"
+ "smax z21.b, p0/M, z21.b, z26.b\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "smax z4.b, p0/M, z4.b, z19.b\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
+ "smax z23.b, p0/M, z23.b, z30.b\n"
+ "smax z18.b, p0/M, z18.b, z29.b\n"
+ "smax z22.b, p0/M, z22.b, z28.b\n"
+ "smax z17.b, p0/M, z17.b, z27.b\n"
+ "smax z21.b, p0/M, z21.b, z26.b\n"
+ "smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "smax z4.b, p0/M, z4.b, z19.b\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "smax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
+ "st1b { z3.b }, p3, [%x[outptr], x28]\n"
+ "incb x28, ALL, MUL #4\n"
+ "st1b { z2.b }, p2, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "st1b { z1.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
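
The generated assembly above unrolls over four SVE vectors of channels and four input pointers per iteration, with predicated tails, but the value it computes per channel is a plain maximum seeded from the int8 minimum (the "mov z4.b, #0x80" initialisers). A reference-only scalar sketch of the same contract:

#include <algorithm>
#include <cstdint>

// Scalar model of sme_s8_nhwc_max_generic_depthfirst_impl: for every
// channel, the maximum over all valid window cells, seeded from INT8_MIN
// exactly as the asm seeds its accumulators with 0x80.
void s8_nhwc_max_reference(uint64_t n_valid_cells, uint64_t n_channels,
                           const int8_t *const *inptrs, int8_t *outptr)
{
  for (uint64_t c = 0; c < n_channels; c++)
  {
    int8_t acc = INT8_MIN;
    for (uint64_t i = 0; i < n_valid_cells; i++)
    {
      acc = std::max(acc, inptrs[i][c]);
    }
    outptr[c] = acc;
  }
}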
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..c6263f5dbc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
+
+struct sme_s8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
+{
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
+ sme_s8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_s8q_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
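
The Requantize32 type threaded through these quantized strategies is defined elsewhere in the library; the kernels in this diff only ever read three of its per-layer fields. A partial sketch, with layout and completeness explicitly assumptions:

#include <cstdint>

// Partial, assumption-labelled sketch of Requantize32: only the members
// that the kernels in this diff dereference are listed; the real struct
// has more fields and its own layout.
struct Requantize32Sketch
{
  int32_t per_layer_left_shift;  // consumed by srshl before the multiply
  int32_t per_layer_mul;         // fixed-point multiplier for sqrdmulh
  int32_t per_layer_right_shift; // consumed by srshl after the multiply
};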
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..91f2f7ab31
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pooling.hpp"
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+namespace {
+ struct RescaleParams
+ {
+ int32_t multiplier, shift;
+ };
+
+ constexpr RescaleParams rescale_params[8] = {
+ {0x40000000, -0}, // 1/2
+ {0x55555556, -1}, // 1/3
+ {0x40000000, -1}, // 1/4
+ {0x66666666, -2}, // 1/5
+ {0x55555556, -2}, // 1/6
+ {0x49249249, -2}, // 1/7
+ {0x40000000, -2}, // 1/8
+ {0x71c71c72, -3}, // 1/9
+ };
+} // namespace
+
+void sme_s8q_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *outptr,
+ const Requantize32 &qp
+)
+{
+ if (n_valid_cells == 1 && window_cells == 1)
+ {
+ // In this case, simply copy from the input to the output
+ std::memcpy(outptr, *inptrs, n_channels);
+ return;
+ }
+
+ // Compute (or look up) the rescale values
+ int32_t shift_value = 0, rescale_value = 0;
+ if (2 <= window_cells && window_cells <= 9)
+ {
+ auto &params = rescale_params[window_cells - 2];
+ rescale_value = params.multiplier;
+ shift_value = params.shift;
+ }
+ else
+ {
+ auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
+
+ shift_value = 0;
+ while (f_rescale_value < 0.5f)
+ {
+ shift_value--;
+ f_rescale_value *= 2.0f;
+ }
+
+ int64_t long_rescale_value = std::round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
+ {
+ shift_value++;
+ long_rescale_value >>= 1;
+ }
+ rescale_value = static_cast<int32_t>(long_rescale_value);
+ }
+
+ // Combine the requantization rescale value with the scaling factor for
+ // the average pool.
+ const int32_t shift = qp.per_layer_left_shift - qp.per_layer_right_shift + shift_value;
+ const int32_t left_shift = shift > 0 ? shift : 0;
+ const int32_t right_shift = shift <= 0 ? shift : 0;
+
+ int32_t combined_rescale_value = 0;
+ __asm__ __volatile__ (
+ "mov v16.s[0], %w[per_layer_mul]\n"
+ "mov v17.s[0], %w[rescale_value]\n"
+ "sqrdmulh s18, s16, s17\n"
+ "mov %w[combined_rescale_value], v18.s[0]\n"
+ : [combined_rescale_value] "=r" (combined_rescale_value)
+ : [per_layer_mul] "r" (qp.per_layer_mul), [rescale_value] "r" (rescale_value)
+ : "v16", "v17", "v18"
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p2.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "mov z7.s, #0x0\n"
+ "mov z6.s, #0x0\n"
+ "mov z5.s, #0x0\n"
+ "mov z4.s, #0x0\n"
+ "mov z3.s, #0x0\n"
+ "mov z2.s, #0x0\n"
+ "mov z1.s, #0x0\n"
+ "mov z0.s, #0x0\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 2 inputs loop
+ ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 2 inputs tail
+ ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
+ ".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
+ ".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
+ ".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a213 // sshllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508a612 // sshllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
+ ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
+ ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
+ ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482824b // srshl z11.s, p0/M, z11.s, z18.s\n"
+ ".inst 0x4482824a // srshl z10.s, p0/M, z10.s, z18.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x44828249 // srshl z9.s, p0/M, z9.s, z18.s\n"
+ ".inst 0x44828248 // srshl z8.s, p0/M, z8.s, z18.s\n"
+ ".inst 0x44828247 // srshl z7.s, p0/M, z7.s, z18.s\n"
+ ".inst 0x44828246 // srshl z6.s, p0/M, z6.s, z18.s\n"
+ ".inst 0x44828245 // srshl z5.s, p0/M, z5.s, z18.s\n"
+ ".inst 0x44828244 // srshl z4.s, p0/M, z4.s, z18.s\n"
+ ".inst 0x44828243 // srshl z3.s, p0/M, z3.s, z18.s\n"
+ ".inst 0x44828242 // srshl z2.s, p0/M, z2.s, z18.s\n"
+ ".inst 0x44828241 // srshl z1.s, p0/M, z1.s, z18.s\n"
+ ".inst 0x44828240 // srshl z0.s, p0/M, z0.s, z18.s\n"
+ ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
+ ".inst 0x04b1756b // sqrdmulh z11.s, z11.s, z17.s\n"
+ ".inst 0x04b1754a // sqrdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x04b17529 // sqrdmulh z9.s, z9.s, z17.s\n"
+ ".inst 0x04b17508 // sqrdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x04b174e7 // sqrdmulh z7.s, z7.s, z17.s\n"
+ ".inst 0x04b174c6 // sqrdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x04b174a5 // sqrdmulh z5.s, z5.s, z17.s\n"
+ ".inst 0x04b17484 // sqrdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x04b17463 // sqrdmulh z3.s, z3.s, z17.s\n"
+ ".inst 0x04b17442 // sqrdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x04b17421 // sqrdmulh z1.s, z1.s, z17.s\n"
+ ".inst 0x04b17400 // sqrdmulh z0.s, z0.s, z17.s\n"
+ "mov z19.s, #0x7f\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "not z16.s, p0/M, z19.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "trn1 z23.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "smin z11.s, p0/M, z11.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z19.s\n"
+ "trn1 z22.h, z11.h, z10.h\n"
+ "smin z9.s, p0/M, z9.s, z19.s\n"
+ "smin z8.s, p0/M, z8.s, z19.s\n"
+ "trn1 z18.h, z9.h, z8.h\n"
+ "smin z7.s, p0/M, z7.s, z19.s\n"
+ "smin z6.s, p0/M, z6.s, z19.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "smin z5.s, p0/M, z5.s, z19.s\n"
+ "smin z4.s, p0/M, z4.s, z19.s\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "smin z3.s, p0/M, z3.s, z19.s\n"
+ "smin z2.s, p0/M, z2.s, z19.s\n"
+ "trn1 z20.h, z3.h, z2.h\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z19.h, z1.h, z0.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x25]\n"
+ "incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 2 inputs loop
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 2 inputs tail
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [right_shift] "r" (&right_shift)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
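
Two fixed-point primitives carry this kernel: sqrdmulh (saturating rounding doubling multiply-high, approximately round(a*b / 2^31)) and srshl (rounding shift: left for positive shift operands, right for negative ones). The host code above also folds the pooling reciprocal into the requantization multiplier with a scalar sqrdmulh, so combined_rescale_value is roughly round(per_layer_mul * rescale_value / 2^31). Under those instruction semantics, one output lane reduces to the following sketch (saturation corner cases omitted):

#include <algorithm>
#include <cstdint>

// Scalar model of sqrdmulh: round(a * b / 2^31). The saturation case
// (a == b == INT32_MIN) is omitted for brevity.
static int32_t sqrdmulh_model(int32_t a, int32_t b)
{
  return (int32_t)(((int64_t)a * b + (1ll << 30)) >> 31);
}

// Scalar model of srshl: shift left for non-negative shifts, rounding
// shift right for negative ones.
static int32_t srshl_model(int32_t x, int32_t shift)
{
  if (shift >= 0) return (int32_t)((int64_t)x << shift);
  return (int32_t)(((int64_t)x + (1ll << (-shift - 1))) >> -shift);
}

// One output lane of the requantized average, following the instruction
// order in the kernel above: left shift, multiply, right shift, clamp.
int8_t s8q_avg_lane_model(int32_t sum, int32_t combined_rescale_value,
                          int32_t left_shift, int32_t right_shift)
{
  int32_t v = srshl_model(sum, left_shift);
  v = sqrdmulh_model(v, combined_rescale_value);
  v = srshl_model(v, right_shift);
  return (int8_t)std::min(127, std::max(-128, v));
}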
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..9667d37954
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
+
+struct sme_s8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
+{
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
+ sme_s8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_s8q_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..e9b586f4ce
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pooling.hpp"
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_s8q_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *outptr,
+ const Requantize32 &qp
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p2.b, x27, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x80\n"
+ "mov z3.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.b, #0x80\n"
+ "mov z1.b, #0x80\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
+ "smax z23.b, p0/M, z23.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z18.b, p0/M, z18.b, z29.b\n"
+ "smax z22.b, p0/M, z22.b, z28.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "smax z17.b, p0/M, z17.b, z27.b\n"
+ "smax z21.b, p0/M, z21.b, z26.b\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "smax z4.b, p0/M, z4.b, z19.b\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
+ "smax z23.b, p0/M, z23.b, z30.b\n"
+ "smax z18.b, p0/M, z18.b, z29.b\n"
+ "smax z22.b, p0/M, z22.b, z28.b\n"
+ "smax z17.b, p0/M, z17.b, z27.b\n"
+ "smax z21.b, p0/M, z21.b, z26.b\n"
+ "smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "smax z4.b, p0/M, z4.b, z19.b\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "smax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ ".inst 0x4508a097 // sshllb z23.h, z4.b, #0x0\n"
+ ".inst 0x4508a496 // sshllt z22.h, z4.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z4.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a075 // sshllb z21.h, z3.b, #0x0\n"
+ ".inst 0x4508a472 // sshllt z18.h, z3.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a054 // sshllb z20.h, z2.b, #0x0\n"
+ ".inst 0x4508a451 // sshllt z17.h, z2.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a033 // sshllb z19.h, z1.b, #0x0\n"
+ ".inst 0x4508a430 // sshllt z16.h, z1.b, #0x0\n"
+ ".inst 0x4510a2e1 // sshllb z1.s, z23.h, #0x0\n"
+ ".inst 0x4510a6f7 // sshllt z23.s, z23.h, #0x0\n"
+ ".inst 0x4510a2c0 // sshllb z0.s, z22.h, #0x0\n"
+ ".inst 0x4510a6df // sshllt z31.s, z22.h, #0x0\n"
+ ".inst 0x4510a2be // sshllb z30.s, z21.h, #0x0\n"
+ ".inst 0x4510a6b6 // sshllt z22.s, z21.h, #0x0\n"
+ ".inst 0x4510a25d // sshllb z29.s, z18.h, #0x0\n"
+ ".inst 0x4510a652 // sshllt z18.s, z18.h, #0x0\n"
+ ".inst 0x4510a29c // sshllb z28.s, z20.h, #0x0\n"
+ ".inst 0x4510a695 // sshllt z21.s, z20.h, #0x0\n"
+ ".inst 0x4510a23b // sshllb z27.s, z17.h, #0x0\n"
+ ".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
+ ".inst 0x4510a27a // sshllb z26.s, z19.h, #0x0\n"
+ ".inst 0x4510a674 // sshllt z20.s, z19.h, #0x0\n"
+ ".inst 0x4510a219 // sshllb z25.s, z16.h, #0x0\n"
+ ".inst 0x4510a618 // sshllt z24.s, z16.h, #0x0\n"
+ ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
+ ".inst 0x44828097 // srshl z23.s, p0/M, z23.s, z4.s\n"
+ ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
+ ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
+ ".inst 0x4482809e // srshl z30.s, p0/M, z30.s, z4.s\n"
+ ".inst 0x44828096 // srshl z22.s, p0/M, z22.s, z4.s\n"
+ ".inst 0x4482809d // srshl z29.s, p0/M, z29.s, z4.s\n"
+ ".inst 0x44828092 // srshl z18.s, p0/M, z18.s, z4.s\n"
+ ".inst 0x4482809c // srshl z28.s, p0/M, z28.s, z4.s\n"
+ ".inst 0x44828095 // srshl z21.s, p0/M, z21.s, z4.s\n"
+ ".inst 0x4482809b // srshl z27.s, p0/M, z27.s, z4.s\n"
+ ".inst 0x44828091 // srshl z17.s, p0/M, z17.s, z4.s\n"
+ ".inst 0x4482809a // srshl z26.s, p0/M, z26.s, z4.s\n"
+ ".inst 0x44828094 // srshl z20.s, p0/M, z20.s, z4.s\n"
+ ".inst 0x44828099 // srshl z25.s, p0/M, z25.s, z4.s\n"
+ ".inst 0x44828098 // srshl z24.s, p0/M, z24.s, z4.s\n"
+ ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
+ ".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
+ ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
+ ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
+ ".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
+ ".inst 0x04a376d6 // sqrdmulh z22.s, z22.s, z3.s\n"
+ ".inst 0x04a377bd // sqrdmulh z29.s, z29.s, z3.s\n"
+ ".inst 0x04a37652 // sqrdmulh z18.s, z18.s, z3.s\n"
+ ".inst 0x04a3779c // sqrdmulh z28.s, z28.s, z3.s\n"
+ ".inst 0x04a376b5 // sqrdmulh z21.s, z21.s, z3.s\n"
+ ".inst 0x04a3777b // sqrdmulh z27.s, z27.s, z3.s\n"
+ ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
+ ".inst 0x04a3775a // sqrdmulh z26.s, z26.s, z3.s\n"
+ ".inst 0x04a37694 // sqrdmulh z20.s, z20.s, z3.s\n"
+ ".inst 0x04a37739 // sqrdmulh z25.s, z25.s, z3.s\n"
+ ".inst 0x04a37718 // sqrdmulh z24.s, z24.s, z3.s\n"
+ "mov z19.s, #0x7f\n"
+ ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
+ ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
+ ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
+ ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
+ ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
+ ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
+ ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
+ ".inst 0x44828052 // srshl z18.s, p0/M, z18.s, z2.s\n"
+ ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
+ ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
+ ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
+ ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
+ ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
+ ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
+ ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
+ ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
+ "not z16.s, p0/M, z19.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smax z31.s, p0/M, z31.s, z16.s\n"
+ "smax z30.s, p0/M, z30.s, z16.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z29.s, p0/M, z29.s, z16.s\n"
+ "smax z18.s, p0/M, z18.s, z16.s\n"
+ "smax z28.s, p0/M, z28.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z27.s, p0/M, z27.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
+ "smax z26.s, p0/M, z26.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z25.s, p0/M, z25.s, z16.s\n"
+ "smax z24.s, p0/M, z24.s, z16.s\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z23.s, p0/M, z23.s, z19.s\n"
+ "trn1 z23.h, z1.h, z23.h\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "smin z31.s, p0/M, z31.s, z19.s\n"
+ "trn1 z16.h, z0.h, z31.h\n"
+ "smin z30.s, p0/M, z30.s, z19.s\n"
+ "smin z22.s, p0/M, z22.s, z19.s\n"
+ "trn1 z22.h, z30.h, z22.h\n"
+ "smin z29.s, p0/M, z29.s, z19.s\n"
+ "smin z18.s, p0/M, z18.s, z19.s\n"
+ "trn1 z18.h, z29.h, z18.h\n"
+ "smin z28.s, p0/M, z28.s, z19.s\n"
+ "smin z21.s, p0/M, z21.s, z19.s\n"
+ "trn1 z21.h, z28.h, z21.h\n"
+ "smin z27.s, p0/M, z27.s, z19.s\n"
+ "smin z17.s, p0/M, z17.s, z19.s\n"
+ "trn1 z17.h, z27.h, z17.h\n"
+ "smin z26.s, p0/M, z26.s, z19.s\n"
+ "smin z20.s, p0/M, z20.s, z19.s\n"
+ "trn1 z20.h, z26.h, z20.h\n"
+ "smin z25.s, p0/M, z25.s, z19.s\n"
+ "smin z24.s, p0/M, z24.s, z19.s\n"
+ "trn1 z19.h, z25.h, z24.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x28]\n"
+ "incb x28, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ ".inst 0x4508a091 // sshllb z17.h, z4.b, #0x0\n"
+ ".inst 0x4508a490 // sshllt z16.h, z4.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a236 // sshllb z22.s, z17.h, #0x0\n"
+ ".inst 0x4510a635 // sshllt z21.s, z17.h, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a214 // sshllb z20.s, z16.h, #0x0\n"
+ ".inst 0x4510a613 // sshllt z19.s, z16.h, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x44828256 // srshl z22.s, p0/M, z22.s, z18.s\n"
+ ".inst 0x44828255 // srshl z21.s, p0/M, z21.s, z18.s\n"
+ ".inst 0x44828254 // srshl z20.s, p0/M, z20.s, z18.s\n"
+ ".inst 0x44828253 // srshl z19.s, p0/M, z19.s, z18.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ ".inst 0x04b17694 // sqrdmulh z20.s, z20.s, z17.s\n"
+ ".inst 0x04b17673 // sqrdmulh z19.s, z19.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
+ "trn1 z17.h, z22.h, z21.h\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
+ "trn1 z16.h, z20.h, z19.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
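
Unlike the non-quantized kernels, this one does not spill its parameters to locals: the inline asm receives &qp together with compile-time offsetof immediates (the "I" constraints) and forms each field address itself with an add. A minimal, AArch64-only illustration of the same operand-passing pattern, using a hypothetical Params struct in place of Requantize32:

#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for Requantize32; field names and layout are
// illustrative only.
struct Params
{
  int32_t mul;
  int32_t shift;
};

// Loads params.shift through a base register plus an offsetof immediate,
// mirroring how the kernel above locates per_layer_mul and friends. The
// "I" constraint requires a compile-time constant, which offsetof is.
int32_t load_shift(const Params &params)
{
  int32_t out;
  __asm__ __volatile__(
    "add x20, %x[base], %[off]\n"
    "ldr %w[out], [x20]\n"
    : [out] "=r" (out)
    : [base] "r" (&params), [off] "I" (offsetof(Params, shift))
    : "x20", "memory"
  );
  return out;
}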
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..29a03ec509
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
+
+struct sme_u8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
+{
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
+ sme_u8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_u8_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
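
The unsigned average kernel that follows mirrors the signed one almost line for line; the visible differences are unsigned widening adds (uaddlb/uaddlt/uaddwb/uaddwt), a truncating sqdmulh in place of the rounding sqrdmulh, and a final clamp to [0, 255]. A hedged scalar model of one output lane, under those instruction semantics (shift_value is the non-positive value computed by the host code, e.g. -1 for 1/3):

#include <algorithm>
#include <cstdint>

// Scalar model of sqdmulh: truncating doubling multiply-high,
// (2*a*b) >> 32 == (a*b) >> 31 with no rounding term.
static int32_t sqdmulh_model(int32_t a, int32_t b)
{
  return (int32_t)(((int64_t)a * b) >> 31);
}

// Rounding right shift by n >= 0 (srshl with a non-positive operand).
static int32_t rounding_rshift(int32_t x, int32_t n)
{
  return n == 0 ? x : (int32_t)(((int64_t)x + (1ll << (n - 1))) >> n);
}

// One output lane of the u8 average: multiply by the fixed-point
// reciprocal, round-shift, clamp to [0, 255].
uint8_t u8_avg_lane_model(int32_t sum, int32_t rescale_value, int32_t shift_value)
{
  int32_t v = sqdmulh_model(sum, rescale_value);
  v = rounding_rshift(v, -shift_value);
  return (uint8_t)std::min(255, std::max(0, v));
}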
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f0e7bbf5cc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+namespace {
+ struct RescaleParams
+ {
+ int32_t multiplier, shift;
+ };
+
+ constexpr RescaleParams rescale_params[8] = {
+ {0x40000000, -0}, // 1/2
+ {0x55555556, -1}, // 1/3
+ {0x40000000, -1}, // 1/4
+ {0x66666666, -2}, // 1/5
+ {0x55555556, -2}, // 1/6
+ {0x49249249, -2}, // 1/7
+ {0x40000000, -2}, // 1/8
+ {0x71c71c72, -3}, // 1/9
+ };
+} // namespace
+
+void sme_u8_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *outptr
+)
+{
+ if (n_valid_cells == 1 && window_cells == 1)
+ {
+ // In this case, simply copy from the input to the output
+ std::memcpy(outptr, *inptrs, n_channels);
+ return;
+ }
+
+ // Compute (or look up) the rescale values
+ int32_t shift_value = 0, rescale_value = 0;
+ if (2 <= window_cells && window_cells <= 9)
+ {
+ auto &params = rescale_params[window_cells - 2];
+ rescale_value = params.multiplier;
+ shift_value = params.shift;
+ }
+ else
+ {
+ auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
+
+ shift_value = 0;
+ while (f_rescale_value < 0.5f)
+ {
+ shift_value--;
+ f_rescale_value *= 2.0f;
+ }
+
+ int64_t long_rescale_value = std::round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
+ {
+ shift_value++;
+ long_rescale_value >>= 1;
+ }
+ rescale_value = static_cast<int32_t>(long_rescale_value);
+ }
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p2.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "mov z7.s, #0x0\n"
+ "mov z6.s, #0x0\n"
+ "mov z5.s, #0x0\n"
+ "mov z4.s, #0x0\n"
+ "mov z3.s, #0x0\n"
+ "mov z2.s, #0x0\n"
+ "mov z1.s, #0x0\n"
+ "mov z0.s, #0x0\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 2 inputs loop
+ ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 2 inputs tail
+ ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
+ ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
+ ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
+ ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa13 // ushllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508ae12 // ushllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
+ ".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
+ ".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
+ ".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
+ ".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
+ ".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
+ ".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
+ ".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "mov z19.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "trn1 z23.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "smin z11.s, p0/M, z11.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z19.s\n"
+ "trn1 z22.h, z11.h, z10.h\n"
+ "smin z9.s, p0/M, z9.s, z19.s\n"
+ "smin z8.s, p0/M, z8.s, z19.s\n"
+ "trn1 z18.h, z9.h, z8.h\n"
+ "smin z7.s, p0/M, z7.s, z19.s\n"
+ "smin z6.s, p0/M, z6.s, z19.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "smin z5.s, p0/M, z5.s, z19.s\n"
+ "smin z4.s, p0/M, z4.s, z19.s\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "smin z3.s, p0/M, z3.s, z19.s\n"
+ "smin z2.s, p0/M, z2.s, z19.s\n"
+ "trn1 z20.h, z3.h, z2.h\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z19.h, z1.h, z0.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x25]\n"
+ "incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 2 inputs loop
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 2 inputs tail
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "ld1rw { z16.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b075ef // sqdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqdmulh z12.s, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..3df4e4efb8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<uint8_t, uint8_t>
+{
+ using Parent = DepthfirstStrategy<uint8_t, uint8_t>;
+
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..9088cbde89
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const uint8_t *const *const inptrs;
+ uint8_t *const *const outptrs;
+ KernelArgs(
+ unsigned int channels,
+ const uint8_t *const *input_ptrs,
+      uint8_t *const *output_ptrs,
+ bool, unsigned int, unsigned int, unsigned int, unsigned int
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ }
+ };
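+  // Note: exclude_padding and the four pad sizes are accepted only for
+  // interface compatibility; this kernel does not use them.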
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ptrue p2.b\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.b, x15, x13\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ld1b { z30.b }, p0/Z, [x27, x15]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ld1b { z29.b }, p0/Z, [x25, x15]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ld1b { z28.b }, p0/Z, [x24, x15]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x15]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1b { z26.b }, p0/Z, [x28, x15]\n"
+ "ld1b { z25.b }, p0/Z, [x26, x15]\n"
+ "ld1b { z24.b }, p0/Z, [x23, x15]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x15]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x15]\n"
+ "incw x15\n"
+ "whilelt p1.b, x15, x13\n"
+ "b.none 2f\n"
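+    // 2x2 max pooling at stride 1: the nine input pointers cover the 3x3 patch
+    // needed for a 2x2 block of outputs, and each output vector is the maximum
+    // of its 2x2 window, built from a tree of predicated umax instructions.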
+ "1:" // Vector: Loop
+ "movprfx z22, z30\n umax z22.b, p2/M, z22.b, z28.b\n"
+ "movprfx z21, z28\n umax z21.b, p2/M, z21.b, z27.b\n"
+ "ld1b { z30.b }, p1/Z, [x27, x15]\n"
+ "whilelt p0.b, x14, x13\n"
+ "movprfx z18, z29\n umax z18.b, p2/M, z18.b, z26.b\n"
+ "movprfx z17, z25\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "ld1b { z28.b }, p1/Z, [x24, x15]\n"
+ "movprfx z16, z29\n umax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z20, z24\n umax z20.b, p2/M, z20.b, z23.b\n"
+ "ld1b { z27.b }, p1/Z, [x21, x15]\n"
+ "ld1b { z29.b }, p1/Z, [x25, x15]\n"
+ "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n umax z18.b, p2/M, z18.b, z22.b\n"
+ "ld1b { z26.b }, p1/Z, [x28, x15]\n"
+ "movprfx z17, z16\n umax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z25.b }, p1/Z, [x26, x15]\n"
+ "st1b { z19.b }, p0, [x12, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x23, x15]\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "ld1b { z19.b }, p1/Z, [x22, x15]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
+ "ld1b { z23.b }, p1/Z, [x20, x15]\n"
+ "incw x15\n"
+ "whilelt p1.b, x15, x13\n"
+ "st1b { z16.b }, p0, [x9, x14]\n"
+ "incw x14\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "movprfx z22, z30\n umax z22.b, p2/M, z22.b, z28.b\n"
+ "movprfx z21, z28\n umax z21.b, p2/M, z21.b, z27.b\n"
+ "whilelt p0.b, x14, x13\n"
+ "movprfx z20, z29\n umax z20.b, p2/M, z20.b, z26.b\n"
+ "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z24.b\n"
+ "movprfx z17, z29\n umax z17.b, p2/M, z17.b, z19.b\n"
+ "movprfx z19, z24\n umax z19.b, p2/M, z19.b, z23.b\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "umax z18.b, p2/M, z18.b, z22.b\n"
+ "st1b { z16.b }, p0, [x12, x14]\n"
+ "umax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z19.b\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
+ "st1b { z16.b }, p0, [x9, x14]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..077c8ed2f7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
+
+struct sme_u8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
+{
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
+ sme_u8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_u8_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..06f13e8111
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_u8_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *outptr
+)
+{
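+  // The first (anonymous) argument is the window cell count; max pooling does
+  // not rescale by the window size, so it is unused.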
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p2.b, x27, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x0\n"
+ "mov z3.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.b, #0x0\n"
+ "mov z1.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
+ "umax z23.b, p0/M, z23.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z18.b, p0/M, z18.b, z29.b\n"
+ "umax z22.b, p0/M, z22.b, z28.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "umax z17.b, p0/M, z17.b, z27.b\n"
+ "umax z21.b, p0/M, z21.b, z26.b\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "umax z4.b, p0/M, z4.b, z19.b\n"
+ "umax z3.b, p0/M, z3.b, z18.b\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "umax z2.b, p0/M, z2.b, z17.b\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
+ "umax z23.b, p0/M, z23.b, z30.b\n"
+ "umax z18.b, p0/M, z18.b, z29.b\n"
+ "umax z22.b, p0/M, z22.b, z28.b\n"
+ "umax z17.b, p0/M, z17.b, z27.b\n"
+ "umax z21.b, p0/M, z21.b, z26.b\n"
+ "umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "umax z4.b, p0/M, z4.b, z19.b\n"
+ "umax z3.b, p0/M, z3.b, z18.b\n"
+ "umax z2.b, p0/M, z2.b, z17.b\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "umax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
+ "st1b { z3.b }, p3, [%x[outptr], x28]\n"
+ "incb x28, ALL, MUL #4\n"
+ "st1b { z2.b }, p2, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "st1b { z1.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..bd30a32828
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
+
+struct sme_u8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
+{
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
+ sme_u8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_u8q_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..52c52ccdb9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,489 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pooling.hpp"
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+namespace {
+ struct RescaleParams
+ {
+ int32_t multiplier, shift;
+ };
+
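+  // Each entry encodes 1/n as (multiplier / 2^31) * 2^shift; the multiplier is
+  // applied with a doubling multiply-high and the negative shift with a
+  // rounding shift, e.g. 1/3 ~= (0x55555556 / 2^31) * 2^-1.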
+ constexpr RescaleParams rescale_params[8] = {
+ {0x40000000, -0}, // 1/2
+ {0x55555556, -1}, // 1/3
+ {0x40000000, -1}, // 1/4
+ {0x66666666, -2}, // 1/5
+ {0x55555556, -2}, // 1/6
+ {0x49249249, -2}, // 1/7
+ {0x40000000, -2}, // 1/8
+ {0x71c71c72, -3}, // 1/9
+ };
+}
+
+void sme_u8q_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *outptr,
+ const Requantize32 &qp
+)
+{
+ if (n_valid_cells == 1 && window_cells == 1)
+ {
+ // In this case, simply copy from the input to the output
+ std::memcpy(outptr, *inptrs, n_channels);
+ return;
+ }
+
+ // Compute (or look up) the rescale values
+ int32_t shift_value = 0, rescale_value = 0;
+ if (2 <= window_cells && window_cells <= 9)
+ {
+ auto &params = rescale_params[window_cells - 2];
+ rescale_value = params.multiplier;
+ shift_value = params.shift;
+ }
+ else
+ {
+ auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
+
+ shift_value = 0;
+ while (f_rescale_value < 0.5f)
+ {
+ shift_value--;
+ f_rescale_value *= 2.0f;
+ }
+
+    int64_t long_rescale_value = std::round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
+ {
+ shift_value++;
+ long_rescale_value >>= 1;
+ }
+ rescale_value = static_cast<int32_t>(long_rescale_value);
+ }
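+  // e.g. window_cells == 12: f_rescale_value is normalised from 1/12 to 2/3
+  // with shift_value == -3, so rescale_value == round(2/3 * 2^31) == 0x55555555
+  // and x * rescale_value / 2^31 * 2^-3 ~= x / 12.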
+
+
+ // Initialise the accumulators such that the offsets are subtracted for all
+ // valid inputs.
+ const int32_t accumulator_init = -qp.input_offset * n_valid_cells;
+
+  // Combine the rescale value for the requantization with the scaling factor
+  // for the average pool.
+ const int32_t shift = qp.per_layer_left_shift - qp.per_layer_right_shift + shift_value;
+ const int32_t left_shift = shift > 0 ? shift : 0;
+ const int32_t right_shift = shift <= 0 ? shift : 0;
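+  // srshl shifts left for a positive operand and right (with rounding) for a
+  // negative one; splitting the net shift lets the left part be applied before
+  // the multiply and the right part after it.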
+
+ int32_t combined_rescale_value = 0;
+ __asm__ __volatile__ (
+ "mov v16.s[0], %w[per_layer_mul]\n"
+ "mov v17.s[0], %w[rescale_value]\n"
+ "sqrdmulh s18, s16, s17\n"
+ "mov %w[combined_rescale_value], v18.s[0]\n"
+ : [combined_rescale_value] "=r" (combined_rescale_value)
+ : [per_layer_mul] "r" (qp.per_layer_mul), [rescale_value] "r" (rescale_value)
+ : "v16", "v17", "v18"
+ );
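+  // sqrdmulh returns round(per_layer_mul * rescale_value / 2^31), i.e. the
+  // Q0.31 product of the two Q0.31 factors.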
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p2.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z14.d, z15.d\n"
+ "mov z13.d, z15.d\n"
+ "mov z12.d, z15.d\n"
+ "mov z11.d, z15.d\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z10.d, z15.d\n"
+ "mov z9.d, z15.d\n"
+ "mov z8.d, z15.d\n"
+ "mov z7.d, z15.d\n"
+ "mov z6.d, z15.d\n"
+ "mov z5.d, z15.d\n"
+ "mov z4.d, z15.d\n"
+ "mov z3.d, z15.d\n"
+ "mov z2.d, z15.d\n"
+ "mov z1.d, z15.d\n"
+ "mov z0.d, z15.d\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 2 inputs loop
+ ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 2 inputs tail
+ ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
+ ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
+ ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
+ ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa13 // ushllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508ae12 // ushllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
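+    // Requantize: rounding left-shift, sqrdmulh by the combined rescale value,
+    // rounding right-shift, add the output offset, then clamp to [0, 255] and
+    // pack the .s accumulators back down to bytes.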
+ "ld1rw { z19.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482826f // srshl z15.s, p0/M, z15.s, z19.s\n"
+ ".inst 0x4482826e // srshl z14.s, p0/M, z14.s, z19.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x4482826d // srshl z13.s, p0/M, z13.s, z19.s\n"
+ ".inst 0x4482826c // srshl z12.s, p0/M, z12.s, z19.s\n"
+ "ld1rw { z18.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482826b // srshl z11.s, p0/M, z11.s, z19.s\n"
+ ".inst 0x4482826a // srshl z10.s, p0/M, z10.s, z19.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x44828269 // srshl z9.s, p0/M, z9.s, z19.s\n"
+ ".inst 0x44828268 // srshl z8.s, p0/M, z8.s, z19.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x44828267 // srshl z7.s, p0/M, z7.s, z19.s\n"
+ ".inst 0x44828266 // srshl z6.s, p0/M, z6.s, z19.s\n"
+ ".inst 0x44828265 // srshl z5.s, p0/M, z5.s, z19.s\n"
+ ".inst 0x44828264 // srshl z4.s, p0/M, z4.s, z19.s\n"
+ ".inst 0x44828263 // srshl z3.s, p0/M, z3.s, z19.s\n"
+ ".inst 0x44828262 // srshl z2.s, p0/M, z2.s, z19.s\n"
+ ".inst 0x44828261 // srshl z1.s, p0/M, z1.s, z19.s\n"
+ ".inst 0x44828260 // srshl z0.s, p0/M, z0.s, z19.s\n"
+ ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
+ ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
+ ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
+ ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
+ ".inst 0x04b2756b // sqrdmulh z11.s, z11.s, z18.s\n"
+ ".inst 0x04b2754a // sqrdmulh z10.s, z10.s, z18.s\n"
+ ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n"
+ ".inst 0x04b27508 // sqrdmulh z8.s, z8.s, z18.s\n"
+ ".inst 0x04b274e7 // sqrdmulh z7.s, z7.s, z18.s\n"
+ ".inst 0x04b274c6 // sqrdmulh z6.s, z6.s, z18.s\n"
+ ".inst 0x04b274a5 // sqrdmulh z5.s, z5.s, z18.s\n"
+ ".inst 0x04b27484 // sqrdmulh z4.s, z4.s, z18.s\n"
+ ".inst 0x04b27463 // sqrdmulh z3.s, z3.s, z18.s\n"
+ ".inst 0x04b27442 // sqrdmulh z2.s, z2.s, z18.s\n"
+ ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
+ ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
+ ".inst 0x4482822b // srshl z11.s, p0/M, z11.s, z17.s\n"
+ ".inst 0x4482822a // srshl z10.s, p0/M, z10.s, z17.s\n"
+ ".inst 0x44828229 // srshl z9.s, p0/M, z9.s, z17.s\n"
+ ".inst 0x44828228 // srshl z8.s, p0/M, z8.s, z17.s\n"
+ ".inst 0x44828227 // srshl z7.s, p0/M, z7.s, z17.s\n"
+ ".inst 0x44828226 // srshl z6.s, p0/M, z6.s, z17.s\n"
+ ".inst 0x44828225 // srshl z5.s, p0/M, z5.s, z17.s\n"
+ ".inst 0x44828224 // srshl z4.s, p0/M, z4.s, z17.s\n"
+ ".inst 0x44828223 // srshl z3.s, p0/M, z3.s, z17.s\n"
+ ".inst 0x44828222 // srshl z2.s, p0/M, z2.s, z17.s\n"
+ ".inst 0x44828221 // srshl z1.s, p0/M, z1.s, z17.s\n"
+ ".inst 0x44828220 // srshl z0.s, p0/M, z0.s, z17.s\n"
+ "add z15.s, z15.s, z16.s\n"
+ "add z14.s, z14.s, z16.s\n"
+ "add z13.s, z13.s, z16.s\n"
+ "add z12.s, z12.s, z16.s\n"
+ "add z11.s, z11.s, z16.s\n"
+ "add z10.s, z10.s, z16.s\n"
+ "add z9.s, z9.s, z16.s\n"
+ "add z8.s, z8.s, z16.s\n"
+ "add z7.s, z7.s, z16.s\n"
+ "add z6.s, z6.s, z16.s\n"
+ "add z5.s, z5.s, z16.s\n"
+ "add z4.s, z4.s, z16.s\n"
+ "add z3.s, z3.s, z16.s\n"
+ "add z2.s, z2.s, z16.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "mov z19.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "trn1 z23.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "smin z11.s, p0/M, z11.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z19.s\n"
+ "trn1 z22.h, z11.h, z10.h\n"
+ "smin z9.s, p0/M, z9.s, z19.s\n"
+ "smin z8.s, p0/M, z8.s, z19.s\n"
+ "trn1 z18.h, z9.h, z8.h\n"
+ "smin z7.s, p0/M, z7.s, z19.s\n"
+ "smin z6.s, p0/M, z6.s, z19.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "smin z5.s, p0/M, z5.s, z19.s\n"
+ "smin z4.s, p0/M, z4.s, z19.s\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "smin z3.s, p0/M, z3.s, z19.s\n"
+ "smin z2.s, p0/M, z2.s, z19.s\n"
+ "trn1 z20.h, z3.h, z2.h\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z19.h, z1.h, z0.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x25]\n"
+ "incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z14.d, z15.d\n"
+ "mov z13.d, z15.d\n"
+ "mov z12.d, z15.d\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 2 inputs loop
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 2 inputs tail
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
+ "add z15.s, z15.s, z16.s\n"
+ "add z14.s, z14.s, z16.s\n"
+ "add z13.s, z13.s, z16.s\n"
+ "add z12.s, z12.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [accumulator_init] "r" (&accumulator_init), [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [outptr] "r" (outptr), [quant_params] "r" (&qp), [right_shift] "r" (&right_shift)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..69d627c047
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
+
+struct sme_u8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
+{
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
+ sme_u8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_u8q_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c8e8e7d399
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pooling.hpp"
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_u8q_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *outptr,
+ const Requantize32 &qp
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p2.b, x27, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov z3.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.b, #0x0\n"
+ "mov z1.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
+ "umax z23.b, p0/M, z23.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z18.b, p0/M, z18.b, z29.b\n"
+ "umax z22.b, p0/M, z22.b, z28.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "umax z17.b, p0/M, z17.b, z27.b\n"
+ "umax z21.b, p0/M, z21.b, z26.b\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "umax z5.b, p0/M, z5.b, z19.b\n"
+ "umax z3.b, p0/M, z3.b, z18.b\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "umax z2.b, p0/M, z2.b, z17.b\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
+ "umax z23.b, p0/M, z23.b, z30.b\n"
+ "umax z18.b, p0/M, z18.b, z29.b\n"
+ "umax z22.b, p0/M, z22.b, z28.b\n"
+ "umax z17.b, p0/M, z17.b, z27.b\n"
+ "umax z21.b, p0/M, z21.b, z26.b\n"
+ "umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "umax z5.b, p0/M, z5.b, z19.b\n"
+ "umax z3.b, p0/M, z3.b, z18.b\n"
+ "umax z2.b, p0/M, z2.b, z17.b\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "umax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1rw { z4.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a8b7 // ushllb z23.h, z5.b, #0x0\n"
+ ".inst 0x4508acb9 // ushllt z25.h, z5.b, #0x0\n"
+ ".inst 0x4508a876 // ushllb z22.h, z3.b, #0x0\n"
+ ".inst 0x4508ac72 // ushllt z18.h, z3.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a855 // ushllb z21.h, z2.b, #0x0\n"
+ ".inst 0x4508ac51 // ushllt z17.h, z2.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a834 // ushllb z20.h, z1.b, #0x0\n"
+ ".inst 0x4508ac38 // ushllt z24.h, z1.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z19.s }, p0/Z, [x20]\n"
+ "neg z4.s, p0/M, z4.s\n"
+ ".inst 0x45974081 // saddwb z1.s, z4.s, z23.h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x45974497 // saddwt z23.s, z4.s, z23.h\n"
+ ".inst 0x45994080 // saddwb z0.s, z4.s, z25.h\n"
+ ".inst 0x4599449f // saddwt z31.s, z4.s, z25.h\n"
+ ".inst 0x4596409e // saddwb z30.s, z4.s, z22.h\n"
+ ".inst 0x45964496 // saddwt z22.s, z4.s, z22.h\n"
+ ".inst 0x4592409d // saddwb z29.s, z4.s, z18.h\n"
+ ".inst 0x45924492 // saddwt z18.s, z4.s, z18.h\n"
+ ".inst 0x4595409c // saddwb z28.s, z4.s, z21.h\n"
+ ".inst 0x45954495 // saddwt z21.s, z4.s, z21.h\n"
+ ".inst 0x4591409b // saddwb z27.s, z4.s, z17.h\n"
+ ".inst 0x45914491 // saddwt z17.s, z4.s, z17.h\n"
+ ".inst 0x4594409a // saddwb z26.s, z4.s, z20.h\n"
+ ".inst 0x45944494 // saddwt z20.s, z4.s, z20.h\n"
+ ".inst 0x45984099 // saddwb z25.s, z4.s, z24.h\n"
+ ".inst 0x45984498 // saddwt z24.s, z4.s, z24.h\n"
+ ".inst 0x44828061 // srshl z1.s, p0/M, z1.s, z3.s\n"
+ ".inst 0x44828077 // srshl z23.s, p0/M, z23.s, z3.s\n"
+ ".inst 0x44828060 // srshl z0.s, p0/M, z0.s, z3.s\n"
+ ".inst 0x4482807f // srshl z31.s, p0/M, z31.s, z3.s\n"
+ ".inst 0x4482807e // srshl z30.s, p0/M, z30.s, z3.s\n"
+ ".inst 0x44828076 // srshl z22.s, p0/M, z22.s, z3.s\n"
+ ".inst 0x4482807d // srshl z29.s, p0/M, z29.s, z3.s\n"
+ ".inst 0x44828072 // srshl z18.s, p0/M, z18.s, z3.s\n"
+ ".inst 0x4482807c // srshl z28.s, p0/M, z28.s, z3.s\n"
+ ".inst 0x44828075 // srshl z21.s, p0/M, z21.s, z3.s\n"
+ ".inst 0x4482807b // srshl z27.s, p0/M, z27.s, z3.s\n"
+ ".inst 0x44828071 // srshl z17.s, p0/M, z17.s, z3.s\n"
+ ".inst 0x4482807a // srshl z26.s, p0/M, z26.s, z3.s\n"
+ ".inst 0x44828074 // srshl z20.s, p0/M, z20.s, z3.s\n"
+ ".inst 0x44828079 // srshl z25.s, p0/M, z25.s, z3.s\n"
+ ".inst 0x44828078 // srshl z24.s, p0/M, z24.s, z3.s\n"
+ ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
+ ".inst 0x04a276f7 // sqrdmulh z23.s, z23.s, z2.s\n"
+ ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
+ ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
+ ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ ".inst 0x04a277bd // sqrdmulh z29.s, z29.s, z2.s\n"
+ ".inst 0x04a27652 // sqrdmulh z18.s, z18.s, z2.s\n"
+ ".inst 0x04a2779c // sqrdmulh z28.s, z28.s, z2.s\n"
+ ".inst 0x04a276b5 // sqrdmulh z21.s, z21.s, z2.s\n"
+ ".inst 0x04a2777b // sqrdmulh z27.s, z27.s, z2.s\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x04a2775a // sqrdmulh z26.s, z26.s, z2.s\n"
+ ".inst 0x04a27694 // sqrdmulh z20.s, z20.s, z2.s\n"
+ ".inst 0x04a27739 // sqrdmulh z25.s, z25.s, z2.s\n"
+ ".inst 0x04a27718 // sqrdmulh z24.s, z24.s, z2.s\n"
+ ".inst 0x44828261 // srshl z1.s, p0/M, z1.s, z19.s\n"
+ ".inst 0x44828277 // srshl z23.s, p0/M, z23.s, z19.s\n"
+ ".inst 0x44828260 // srshl z0.s, p0/M, z0.s, z19.s\n"
+ ".inst 0x4482827f // srshl z31.s, p0/M, z31.s, z19.s\n"
+ ".inst 0x4482827e // srshl z30.s, p0/M, z30.s, z19.s\n"
+ ".inst 0x44828276 // srshl z22.s, p0/M, z22.s, z19.s\n"
+ ".inst 0x4482827d // srshl z29.s, p0/M, z29.s, z19.s\n"
+ ".inst 0x44828272 // srshl z18.s, p0/M, z18.s, z19.s\n"
+ ".inst 0x4482827c // srshl z28.s, p0/M, z28.s, z19.s\n"
+ ".inst 0x44828275 // srshl z21.s, p0/M, z21.s, z19.s\n"
+ ".inst 0x4482827b // srshl z27.s, p0/M, z27.s, z19.s\n"
+ ".inst 0x44828271 // srshl z17.s, p0/M, z17.s, z19.s\n"
+ ".inst 0x4482827a // srshl z26.s, p0/M, z26.s, z19.s\n"
+ ".inst 0x44828274 // srshl z20.s, p0/M, z20.s, z19.s\n"
+ ".inst 0x44828279 // srshl z25.s, p0/M, z25.s, z19.s\n"
+ ".inst 0x44828278 // srshl z24.s, p0/M, z24.s, z19.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z29.s, z29.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z17.s, z17.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z25.s, z25.s, z16.s\n"
+ "add z24.s, z24.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "mov z19.s, #0xff\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smax z31.s, p0/M, z31.s, z16.s\n"
+ "smax z30.s, p0/M, z30.s, z16.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z29.s, p0/M, z29.s, z16.s\n"
+ "smax z18.s, p0/M, z18.s, z16.s\n"
+ "smax z28.s, p0/M, z28.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z27.s, p0/M, z27.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
+ "smax z26.s, p0/M, z26.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z25.s, p0/M, z25.s, z16.s\n"
+ "smax z24.s, p0/M, z24.s, z16.s\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z23.s, p0/M, z23.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z23.h, z1.h, z23.h\n"
+ "smin z31.s, p0/M, z31.s, z19.s\n"
+ "smin z30.s, p0/M, z30.s, z19.s\n"
+ "trn1 z16.h, z0.h, z31.h\n"
+ "smin z22.s, p0/M, z22.s, z19.s\n"
+ "smin z29.s, p0/M, z29.s, z19.s\n"
+ "trn1 z22.h, z30.h, z22.h\n"
+ "smin z18.s, p0/M, z18.s, z19.s\n"
+ "smin z28.s, p0/M, z28.s, z19.s\n"
+ "trn1 z18.h, z29.h, z18.h\n"
+ "smin z21.s, p0/M, z21.s, z19.s\n"
+ "smin z27.s, p0/M, z27.s, z19.s\n"
+ "trn1 z21.h, z28.h, z21.h\n"
+ "smin z17.s, p0/M, z17.s, z19.s\n"
+ "smin z26.s, p0/M, z26.s, z19.s\n"
+ "trn1 z17.h, z27.h, z17.h\n"
+ "smin z20.s, p0/M, z20.s, z19.s\n"
+ "smin z25.s, p0/M, z25.s, z19.s\n"
+ "trn1 z20.h, z26.h, z20.h\n"
+ "smin z24.s, p0/M, z24.s, z19.s\n"
+ "trn1 z19.h, z25.h, z24.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x28]\n"
+ "incb x28, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a8b1 // ushllb z17.h, z5.b, #0x0\n"
+ ".inst 0x4508acb0 // ushllt z16.h, z5.b, #0x0\n"
+ "neg z18.s, p0/M, z18.s\n"
+ ".inst 0x45914257 // saddwb z23.s, z18.s, z17.h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z22.s }, p0/Z, [x20]\n"
+ ".inst 0x45914655 // saddwt z21.s, z18.s, z17.h\n"
+ ".inst 0x45904254 // saddwb z20.s, z18.s, z16.h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x45904652 // saddwt z18.s, z18.s, z16.h\n"
+ ".inst 0x448282d7 // srshl z23.s, p0/M, z23.s, z22.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x448282d5 // srshl z21.s, p0/M, z21.s, z22.s\n"
+ ".inst 0x448282d4 // srshl z20.s, p0/M, z20.s, z22.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x448282d2 // srshl z18.s, p0/M, z18.s, z22.s\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ ".inst 0x04b37694 // sqrdmulh z20.s, z20.s, z19.s\n"
+ ".inst 0x04b37652 // sqrdmulh z18.s, z18.s, z19.s\n"
+ ".inst 0x44828237 // srshl z23.s, p0/M, z23.s, z17.s\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z23.s, p0/M, z23.s, z17.s\n"
+ "smax z21.s, p0/M, z21.s, z17.s\n"
+ "smax z20.s, p0/M, z20.s, z17.s\n"
+ "smax z18.s, p0/M, z18.s, z17.s\n"
+ "smin z23.s, p0/M, z23.s, z16.s\n"
+ "smin z21.s, p0/M, z21.s, z16.s\n"
+ "smin z20.s, p0/M, z20.s, z16.s\n"
+ "trn1 z17.h, z23.h, z21.h\n"
+ "smin z18.s, p0/M, z18.s, z16.s\n"
+ "trn1 z16.h, z20.h, z18.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_input_offset] "I" (offsetof(Requantize32, input_offset)), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
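
The requantization tail above (the ushllb/ushllt widening through the final smax/smin clamp) is dense, so a scalar model of the same per-lane arithmetic may help when tracing it. This is a sketch only: it assumes the usual Requantize32 semantics visible in the operand list, and rounding_shift/requantize_lane are illustrative names, not library code.

    #include <algorithm>
    #include <cstdint>

    // SRSHL semantics: a positive shift is a plain left shift, a negative
    // shift is a rounding arithmetic right shift.
    static int32_t rounding_shift(int32_t v, int32_t shift)
    {
        if (shift >= 0)
            return static_cast<int32_t>(static_cast<int64_t>(v) << shift);
        const int64_t round = int64_t(1) << (-shift - 1);
        return static_cast<int32_t>((static_cast<int64_t>(v) + round) >> -shift);
    }

    // One lane of the tail: widen the u8 max, subtract the input offset
    // (the kernel negates it and uses saddwb/saddwt), rounding-left-shift,
    // saturating rounding doubling high multiply (sqrdmulh; the INT32_MIN
    // saturation corner is omitted here), rounding-right-shift, add the
    // output offset, then clamp to [0, 255] and narrow.
    static uint8_t requantize_lane(uint8_t max_val, int32_t input_offset,
                                   int32_t output_offset, int32_t left_shift,
                                   int32_t mul, int32_t right_shift)
    {
        int32_t v = static_cast<int32_t>(max_val) - input_offset;
        v = rounding_shift(v, left_shift);
        v = static_cast<int32_t>((static_cast<int64_t>(v) * mul * 2 + (int64_t(1) << 31)) >> 32);
        v = rounding_shift(v, right_shift);
        v += output_offset;
        return static_cast<uint8_t>(std::max(0, std::min(255, v)));
    }
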
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 8c7a497376..f8293233e6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst
+struct sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
- typedef void (*kern_type)(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
+ sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 3; }
- constexpr static unsigned int pool_cols(void) { return 3; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl;
-
- sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
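
Every strategy header in this patch is reshaped the same way: the per-kernel typedefs and constexpr geometry accessors collapse into a common DepthfirstStrategy base that stores the pool/stride/output geometry, leaving only a get_kernel() override. The base itself is not shown in the patch; the following is a rough reconstruction from how it is used here, assuming nothing beyond what the call sites require.

    // Hypothetical sketch of the base class these strategies now derive
    // from, inferred from the six-argument constructor call and the
    // get_kernel() override above; the real definition lives elsewhere
    // in the library.
    template <typename TInput, typename TOutput>
    struct DepthfirstStrategySketch
    {
        using KernelType = void (*)(unsigned int, const TInput *const *const,
                                    TOutput *const *const, bool, unsigned int,
                                    unsigned int, unsigned int, unsigned int);

        unsigned int pool_rows, pool_cols, stride_rows, stride_cols, out_rows, out_cols;

        DepthfirstStrategySketch(unsigned int pr, unsigned int pc,
                                 unsigned int sr, unsigned int sc,
                                 unsigned int outr, unsigned int outc)
            : pool_rows(pr), pool_cols(pc), stride_rows(sr), stride_cols(sc),
              out_rows(outr), out_cols(outc)
        {
        }

        virtual KernelType get_kernel(void) const = 0;
        virtual ~DepthfirstStrategySketch() = default;
    };
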
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 3c1858633b..1ba78f3fba 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -82,126 +82,126 @@ void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x3, [%x[args], %[offsetof_n_channels]]\n"
- "mov x4, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x5, #0x0\n"
- "ldr x6, [%x[args], %[offsetof_inptrs]]\n"
- "mov x19, #0x4\n"
- "add x7, %x[args], %[offsetof_rescale]\n"
- "ldp x8, x17, [x20, #0x0]\n"
- "ldp x16, x15, [x20, #0x10]\n"
- "whilelt p0.h, XZR, x19\n"
- "ldp x14, x13, [x6, #0x0]\n"
- "whilelt p1.h, x4, x3\n"
- "ldp x12, x11, [x6, #0x10]\n"
- "ldp x10, x9, [x6, #0x20]\n"
- "ldp x28, x27, [x6, #0x30]\n"
- "ldp x26, x25, [x6, #0x40]\n"
- "ldp x24, x23, [x6, #0x50]\n"
- "ldp x22, x21, [x6, #0x60]\n"
- "ldp x20, x19, [x6, #0x70]\n"
- "ld1rqh { z7.h }, p0/Z, [x7]\n"
- "ld1h { z8.h }, p1/Z, [x9, x4, LSL #1]\n"
- "ld1h { z6.h }, p1/Z, [x28, x4, LSL #1]\n"
- "ld1h { z5.h }, p1/Z, [x25, x4, LSL #1]\n"
- "ld1h { z4.h }, p1/Z, [x24, x4, LSL #1]\n"
- "ld1h { z3.h }, p1/Z, [x13, x4, LSL #1]\n"
- "ld1h { z2.h }, p1/Z, [x12, x4, LSL #1]\n"
- "ld1h { z1.h }, p1/Z, [x10, x4, LSL #1]\n"
- "ld1h { z0.h }, p1/Z, [x26, x4, LSL #1]\n"
- "ld1h { z31.h }, p1/Z, [x27, x4, LSL #1]\n"
- "ld1h { z30.h }, p1/Z, [x23, x4, LSL #1]\n"
- "ld1h { z29.h }, p1/Z, [x21, x4, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x20, x4, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x14, x4, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x11, x4, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x22, x4, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x19, x4, LSL #1]\n"
- "incw x4\n"
- "whilelt p1.h, x4, x3\n"
+ "ldr x2, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x3, #0x0\n"
+ "mov x20, #0x4\n"
+ "ldr x4, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x5, x6, [x21, #0x0]\n"
+ "whilelt p2.h, XZR, x20\n"
+ "whilelt p0.h, x3, x2\n"
+ "ldp x7, x8, [x21, #0x10]\n"
+ "ldp x17, x16, [x4, #0x0]\n"
+ "add x15, %x[args], %[offsetof_rescale]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x4, #0x10]\n"
+ "ldp x11, x10, [x4, #0x20]\n"
+ "ldp x9, x28, [x4, #0x30]\n"
+ "ldp x27, x26, [x4, #0x40]\n"
+ "ldp x25, x24, [x4, #0x50]\n"
+ "ldp x23, x22, [x4, #0x60]\n"
+ "ldp x21, x20, [x4, #0x70]\n"
+ "ld1h { z7.h }, p0/Z, [x10, x3, LSL #1]\n"
+ "ld1h { z6.h }, p0/Z, [x9, x3, LSL #1]\n"
+ "ld1h { z5.h }, p0/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z4.h }, p0/Z, [x25, x3, LSL #1]\n"
+ "ld1h { z3.h }, p0/Z, [x16, x3, LSL #1]\n"
+ "ld1h { z2.h }, p0/Z, [x13, x3, LSL #1]\n"
+ "ld1h { z1.h }, p0/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z31.h }, p0/Z, [x27, x3, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x24, x3, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x21, x3, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x17, x3, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x3, LSL #1]\n"
+ "incw x3\n"
+ "whilelt p1.h, x3, x2\n"
+ "ld1rqh { z0.h }, p2/Z, [x15]\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "fadd z17.h, z8.h, z6.h\n"
- "ld1h { z8.h }, p1/Z, [x9, x4, LSL #1]\n"
- "whilelt p0.h, x5, x3\n"
+ "fadd z17.h, z7.h, z6.h\n"
"fadd z16.h, z5.h, z4.h\n"
- "ld1h { z6.h }, p1/Z, [x28, x4, LSL #1]\n"
+ "ld1h { z7.h }, p1/Z, [x10, x3, LSL #1]\n"
+ "ld1h { z6.h }, p1/Z, [x9, x3, LSL #1]\n"
+ "fadd z19.h, z17.h, z16.h\n"
"fadd z18.h, z3.h, z2.h\n"
- "ld1h { z5.h }, p1/Z, [x25, x4, LSL #1]\n"
- "fadd z23.h, z1.h, z0.h\n"
- "ld1h { z4.h }, p1/Z, [x24, x4, LSL #1]\n"
- "fadd z22.h, z31.h, z30.h\n"
- "ld1h { z3.h }, p1/Z, [x13, x4, LSL #1]\n"
- "fadd z17.h, z17.h, z16.h\n"
- "ld1h { z2.h }, p1/Z, [x12, x4, LSL #1]\n"
- "fadd z16.h, z29.h, z28.h\n"
- "ld1h { z1.h }, p1/Z, [x10, x4, LSL #1]\n"
- "fadd z19.h, z27.h, z23.h\n"
- "ld1h { z0.h }, p1/Z, [x26, x4, LSL #1]\n"
- "fadd z21.h, z18.h, z17.h\n"
- "ld1h { z31.h }, p1/Z, [x27, x4, LSL #1]\n"
- "fadd z20.h, z16.h, z17.h\n"
- "ld1h { z30.h }, p1/Z, [x23, x4, LSL #1]\n"
- "fadd z18.h, z26.h, z22.h\n"
- "ld1h { z29.h }, p1/Z, [x21, x4, LSL #1]\n"
- "fadd z17.h, z25.h, z23.h\n"
- "ld1h { z28.h }, p1/Z, [x20, x4, LSL #1]\n"
- "fadd z16.h, z24.h, z22.h\n"
- "ld1h { z27.h }, p1/Z, [x14, x4, LSL #1]\n"
+ "ld1h { z5.h }, p1/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z4.h }, p1/Z, [x25, x3, LSL #1]\n"
+ "fadd z17.h, z1.h, z31.h\n"
+ "fadd z22.h, z30.h, z29.h\n"
+ "ld1h { z3.h }, p1/Z, [x16, x3, LSL #1]\n"
+ "ld1h { z2.h }, p1/Z, [x13, x3, LSL #1]\n"
+ "fadd z16.h, z28.h, z27.h\n"
+ "fadd z21.h, z18.h, z19.h\n"
+ "ld1h { z1.h }, p1/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z31.h }, p1/Z, [x27, x3, LSL #1]\n"
+ "fadd z20.h, z16.h, z19.h\n"
+ "fadd z19.h, z26.h, z17.h\n"
+ "ld1h { z30.h }, p1/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z29.h }, p1/Z, [x24, x3, LSL #1]\n"
+ "fadd z18.h, z25.h, z22.h\n"
+ "fadd z17.h, z24.h, z17.h\n"
+ "ld1h { z28.h }, p1/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x21, x3, LSL #1]\n"
+ "fadd z16.h, z23.h, z22.h\n"
+ "ld1h { z26.h }, p1/Z, [x17, x3, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x12, x3, LSL #1]\n"
"fadd z19.h, z21.h, z19.h\n"
- "ld1h { z26.h }, p1/Z, [x11, x4, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p1/Z, [x20, x3, LSL #1]\n"
+ "incw x3\n"
"fadd z18.h, z21.h, z18.h\n"
- "ld1h { z25.h }, p1/Z, [x22, x4, LSL #1]\n"
"fadd z17.h, z17.h, z20.h\n"
- "ld1h { z24.h }, p1/Z, [x19, x4, LSL #1]\n"
- "incw x4\n"
- "fadd z16.h, z20.h, z16.h\n"
- "whilelt p1.h, x4, x3\n"
- "fmul z19.h, z19.h, z7.h[0]\n"
- "st1h { z19.h }, p0, [x8, x5, LSL #1]\n"
- "fmul z18.h, z18.h, z7.h[1]\n"
- "fmul z17.h, z17.h, z7.h[2]\n"
- "st1h { z18.h }, p0, [x17, x5, LSL #1]\n"
- "fmul z16.h, z16.h, z7.h[3]\n"
- "st1h { z17.h }, p0, [x16, x5, LSL #1]\n"
- "st1h { z16.h }, p0, [x15, x5, LSL #1]\n"
- "incw x5\n"
+ "fadd z16.h, z16.h, z20.h\n"
+ "whilelt p0.h, x14, x2\n"
+ "whilelt p1.h, x3, x2\n"
+ "fmul z19.h, z19.h, z0.h[0]\n"
+ "fmul z18.h, z18.h, z0.h[1]\n"
+ "st1h { z19.h }, p0, [x5, x14, LSL #1]\n"
+ "fmul z17.h, z17.h, z0.h[2]\n"
+ "fmul z16.h, z16.h, z0.h[3]\n"
+ "st1h { z18.h }, p0, [x6, x14, LSL #1]\n"
+ "st1h { z17.h }, p0, [x7, x14, LSL #1]\n"
+ "st1h { z16.h }, p0, [x8, x14, LSL #1]\n"
+ "incw x14\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "fadd z17.h, z8.h, z6.h\n"
- "whilelt p0.h, x5, x3\n"
+ "fadd z17.h, z7.h, z6.h\n"
"fadd z16.h, z5.h, z4.h\n"
+ "whilelt p0.h, x14, x2\n"
+ "fadd z20.h, z17.h, z16.h\n"
"fadd z18.h, z3.h, z2.h\n"
- "fadd z23.h, z1.h, z0.h\n"
- "fadd z17.h, z17.h, z16.h\n"
- "fadd z22.h, z31.h, z30.h\n"
- "fadd z16.h, z29.h, z28.h\n"
- "fadd z21.h, z18.h, z17.h\n"
- "fadd z19.h, z27.h, z23.h\n"
- "fadd z20.h, z16.h, z17.h\n"
- "fadd z18.h, z26.h, z22.h\n"
- "fadd z17.h, z25.h, z23.h\n"
- "fadd z16.h, z24.h, z22.h\n"
- "fadd z19.h, z21.h, z19.h\n"
+ "fadd z17.h, z1.h, z31.h\n"
+ "fadd z19.h, z30.h, z29.h\n"
+ "fadd z16.h, z28.h, z27.h\n"
+ "fadd z21.h, z18.h, z20.h\n"
+ "fadd z20.h, z16.h, z20.h\n"
+ "fadd z16.h, z26.h, z17.h\n"
+ "fadd z18.h, z25.h, z19.h\n"
+ "fadd z17.h, z24.h, z17.h\n"
+ "fadd z19.h, z23.h, z19.h\n"
+ "fadd z16.h, z21.h, z16.h\n"
+ "fmul z16.h, z16.h, z0.h[0]\n"
+ "st1h { z16.h }, p0, [x5, x14, LSL #1]\n"
"fadd z18.h, z21.h, z18.h\n"
"fadd z17.h, z17.h, z20.h\n"
- "fadd z16.h, z20.h, z16.h\n"
- "fmul z19.h, z19.h, z7.h[0]\n"
- "st1h { z19.h }, p0, [x8, x5, LSL #1]\n"
- "fmul z18.h, z18.h, z7.h[1]\n"
- "fmul z17.h, z17.h, z7.h[2]\n"
- "st1h { z18.h }, p0, [x17, x5, LSL #1]\n"
- "fmul z16.h, z16.h, z7.h[3]\n"
- "st1h { z17.h }, p0, [x16, x5, LSL #1]\n"
- "st1h { z16.h }, p0, [x15, x5, LSL #1]\n"
+ "fmul z18.h, z18.h, z0.h[1]\n"
+ "fmul z17.h, z17.h, z0.h[2]\n"
+ "fadd z16.h, z19.h, z20.h\n"
+ "fmul z16.h, z16.h, z0.h[3]\n"
+ "st1h { z18.h }, p0, [x6, x14, LSL #1]\n"
+ "st1h { z17.h }, p0, [x7, x14, LSL #1]\n"
+ "st1h { z16.h }, p0, [x8, x14, LSL #1]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "p0", "p1", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
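
For orientation: the kernel above computes a 2x2 block of outputs of 3x3/stride-1 average pooling, so each iteration consumes a 4x4 patch of input pointers (the sixteen ldp-loaded addresses) and four rescale values, one per output; the assembly merely builds the four window sums out of shared pairwise fadd partials. A per-lane reference follows, assuming rescale[i] = 1 / (valid cells of window i) so padded windows average correctly; the helper name and the flat layout are illustrative, and __fp16 needs an Arm toolchain.

    // One channel lane of the 3x3/s1 average kernel: four overlapping 3x3
    // windows of a 4x4 patch, each scaled by its own reciprocal cell count.
    void avg_3x3_s1_out2x2_ref(const __fp16 in[4][4], const __fp16 rescale[4],
                               __fp16 out[4])
    {
        for (int oi = 0; oi < 2; oi++)
            for (int oj = 0; oj < 2; oj++)
            {
                float sum = 0.0f;
                for (int ki = 0; ki < 3; ki++)
                    for (int kj = 0; kj < 3; kj++)
                        sum += static_cast<float>(in[oi + ki][oj + kj]);
                out[oi * 2 + oj] =
                    static_cast<__fp16>(sum * static_cast<float>(rescale[oi * 2 + oj]));
            }
    }
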
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp
index 391d47cf41..49231484e6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
void sve_fp16_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-struct sve_fp16_nhwc_avg_generic_depthfirst
+struct sve_fp16_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_fp16_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
sve_fp16_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_fp16_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index 84a6acf80d..2bef44ea5c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -41,88 +42,88 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<__fp16>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
+ "mov x9, #0x0\n"
+ "cnth x28\n"
+ "cnth x27, ALL, MUL #2\n"
+ "cnth x26, ALL, MUL #3\n"
"ptrue p0.b\n"
- "ld1rh { z8.h }, p0/Z, [%x[rescale_ptr]]\n"
- "mov x28, #0x0\n"
- "cnth x27\n"
- "cnth x26, ALL, MUL #2\n"
- "cnth x25, ALL, MUL #3\n"
- "whilelt p3.h, x28, %x[n_channels]\n"
- "whilelt p2.h, x27, %x[n_channels]\n"
- "whilelt p1.h, x26, %x[n_channels]\n"
- "whilelt p0.h, x25, %x[n_channels]\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
+ "ld1rh { z7.h }, p0/Z, [%x[rescale_ptr]]\n"
+ "whilelt p2.h, x28, %x[n_channels]\n"
+ "whilelt p1.h, x27, %x[n_channels]\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"mov z4.b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z30.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x20, x27, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x21, x26, LSL #1]\n"
- "ld1h { z17.h }, p1/Z, [x20, x26, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x23, x25, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x22, x25, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x21, x25, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x20, x25, LSL #1]\n"
+ "mov z3.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z30.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd z23.h, z3.h, z2.h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd z19.h, z1.h, z0.h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd z22.h, z31.h, z30.h\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "fadd z23.h, z2.h, z1.h\n"
+ "fadd z19.h, z0.h, z31.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z22.h, z30.h, z22.h\n"
"fadd z18.h, z29.h, z28.h\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
"fadd z21.h, z27.h, z21.h\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
"fadd z17.h, z26.h, z17.h\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
"fadd z20.h, z25.h, z20.h\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
"fadd z16.h, z24.h, z16.h\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"fadd z19.h, z23.h, z19.h\n"
- "ld1h { z30.h }, p2/Z, [x22, x27, LSL #1]\n"
"fadd z18.h, z22.h, z18.h\n"
- "ld1h { z29.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z30.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
"fadd z17.h, z21.h, z17.h\n"
- "ld1h { z28.h }, p2/Z, [x20, x27, LSL #1]\n"
"fadd z16.h, z20.h, z16.h\n"
- "ld1h { z27.h }, p1/Z, [x23, x26, LSL #1]\n"
- "fadd z7.h, z7.h, z19.h\n"
- "ld1h { z21.h }, p1/Z, [x22, x26, LSL #1]\n"
- "fadd z6.h, z6.h, z18.h\n"
- "ld1h { z26.h }, p1/Z, [x21, x26, LSL #1]\n"
- "fadd z5.h, z5.h, z17.h\n"
- "ld1h { z17.h }, p1/Z, [x20, x26, LSL #1]\n"
- "fadd z4.h, z4.h, z16.h\n"
- "ld1h { z25.h }, p0/Z, [x23, x25, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x22, x25, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x21, x25, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x20, x25, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "fadd z6.h, z6.h, z19.h\n"
+ "fadd z5.h, z5.h, z18.h\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "fadd z4.h, z4.h, z17.h\n"
+ "fadd z3.h, z3.h, z16.h\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd z23.h, z3.h, z2.h\n"
- "fadd z19.h, z1.h, z0.h\n"
- "fadd z22.h, z31.h, z30.h\n"
+ "fadd z23.h, z2.h, z1.h\n"
+ "fadd z19.h, z0.h, z31.h\n"
+ "fadd z22.h, z30.h, z22.h\n"
"fadd z18.h, z29.h, z28.h\n"
"fadd z21.h, z27.h, z21.h\n"
"fadd z17.h, z26.h, z17.h\n"
@@ -132,100 +133,99 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"fadd z18.h, z22.h, z18.h\n"
"fadd z17.h, z21.h, z17.h\n"
"fadd z16.h, z20.h, z16.h\n"
- "fadd z7.h, z7.h, z19.h\n"
- "fadd z6.h, z6.h, z18.h\n"
- "fadd z5.h, z5.h, z17.h\n"
- "fadd z4.h, z4.h, z16.h\n"
+ "fadd z6.h, z6.h, z19.h\n"
+ "fadd z5.h, z5.h, z18.h\n"
+ "fadd z4.h, z4.h, z17.h\n"
+ "fadd z3.h, z3.h, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fadd z7.h, z7.h, z3.h\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x23, x26, LSL #1]\n"
- "fadd z6.h, z6.h, z31.h\n"
- "ld1h { z25.h }, p0/Z, [x23, x25, LSL #1]\n"
- "fadd z5.h, z5.h, z27.h\n"
- "fadd z4.h, z4.h, z25.h\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z6.h, z6.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z16.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "fadd z5.h, z5.h, z17.h\n"
+ "fadd z4.h, z4.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "fadd z3.h, z3.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul z7.h, z7.h, z8.h\n"
- "st1h { z7.h }, p3, [%x[outptr], x28, LSL #1]\n"
- "fmul z6.h, z6.h, z8.h\n"
+ "fmul z6.h, z6.h, z7.h\n"
+ "fmul z5.h, z5.h, z7.h\n"
+ "st1h { z6.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "fmul z4.h, z4.h, z7.h\n"
+ "fmul z3.h, z3.h, z7.h\n"
+ "st1h { z5.h }, p2, [%x[outptr], x28, LSL #1]\n"
+ "st1h { z4.h }, p1, [%x[outptr], x27, LSL #1]\n"
+ "inch x9, ALL, MUL #4\n"
"inch x28, ALL, MUL #4\n"
- "fmul z5.h, z5.h, z8.h\n"
- "st1h { z6.h }, p2, [%x[outptr], x27, LSL #1]\n"
- "fmul z4.h, z4.h, z8.h\n"
- "inch x27, ALL, MUL #4\n"
- "st1h { z5.h }, p1, [%x[outptr], x26, LSL #1]\n"
+ "st1h { z3.h }, p0, [%x[outptr], x26, LSL #1]\n"
"inch x26, ALL, MUL #4\n"
- "st1h { z4.h }, p0, [%x[outptr], x25, LSL #1]\n"
- "inch x25, ALL, MUL #4\n"
- "whilelt p0.h, x25, %x[n_channels]\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
+ "inch x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.h, x28, %x[n_channels]\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z6.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.h, z3.h, z2.h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd z19.h, z1.h, z0.h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd z19.h, z23.h, z19.h\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "fadd z7.h, z7.h, z19.h\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "fadd z17.h, z2.h, z1.h\n"
+ "fadd z16.h, z0.h, z31.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z6.h, z6.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.h, z3.h, z2.h\n"
- "fadd z19.h, z1.h, z0.h\n"
- "fadd z19.h, z23.h, z19.h\n"
- "fadd z7.h, z7.h, z19.h\n"
+ "fadd z17.h, z2.h, z1.h\n"
+ "fadd z16.h, z0.h, z31.h\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "fadd z6.h, z6.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fadd z7.h, z7.h, z3.h\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z6.h, z6.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul z7.h, z7.h, z8.h\n"
- "st1h { z7.h }, p3, [%x[outptr], x28, LSL #1]\n"
- "inch x28\n"
- "whilelt p3.h, x28, %x[n_channels]\n"
+ "fmul z6.h, z6.h, z7.h\n"
+ "st1h { z6.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "inch x9\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
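
Behaviorally, the generic average kernel above is just a sum over the valid input rows followed by one multiply; the assembly earns its keep by walking four SVE vectors of channels per pass and unrolling the cell loop by four. A scalar reference with the same signature as the _impl declared in the header (the rescale folds in 1/window_cells, exactly as rescale_value does above):

    #include <cstdint>

    // Reference for sve_fp16_nhwc_avg_generic_depthfirst_impl: per channel,
    // accumulate across the n_valid_cells input rows, then scale by
    // 1 / window_cells.
    void avg_generic_ref(const uint64_t window_cells, const uint64_t n_valid_cells,
                         uint64_t n_channels, const __fp16 *const *const inptrs,
                         __fp16 *outptr)
    {
        const float rescale = 1.0f / static_cast<float>(window_cells);
        for (uint64_t c = 0; c < n_channels; c++)
        {
            float acc = 0.0f;
            for (uint64_t i = 0; i < n_valid_cells; i++)
                acc += static_cast<float>(inptrs[i][c]);
            outptr[c] = static_cast<__fp16>(acc * rescale);
        }
    }
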
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 5fb297eb49..3691b6cb28 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
void sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst
+struct sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
- typedef void (*kern_type)(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index f6e23215b8..31bbfd085e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -63,84 +63,84 @@ void sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x14, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.h, x14, x15\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "mov x12, #0x0\n"
- "ldp x11, x10, [x20, #0x0]\n"
- "whilelt p1.h, x13, x14\n"
- "ldp x9, x28, [x20, #0x10]\n"
- "ldp x27, x26, [x19, #0x0]\n"
- "ldp x25, x24, [x19, #0x10]\n"
- "ldp x23, x22, [x19, #0x20]\n"
- "ldp x21, x20, [x19, #0x30]\n"
- "ldr x19, [x19, #0x40]\n"
- "ld1h { z31.h }, p1/Z, [x26, x13, LSL #1]\n"
- "ld1h { z30.h }, p1/Z, [x23, x13, LSL #1]\n"
- "ld1h { z29.h }, p1/Z, [x20, x13, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x24, x13, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x27, x13, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x22, x13, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x25, x13, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x21, x13, LSL #1]\n"
- "ld1h { z23.h }, p1/Z, [x19, x13, LSL #1]\n"
- "incw x13\n"
- "whilelt p1.h, x13, x14\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1h { z31.h }, p0/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x28, x14, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x14, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x22, x14, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x14, LSL #1]\n"
+ "incw x14\n"
+ "whilelt p1.h, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
"movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z30.h\n"
- "ld1h { z31.h }, p1/Z, [x26, x13, LSL #1]\n"
- "whilelt p0.h, x12, x14\n"
"movprfx z21, z30\n fmax z21.h, p2/M, z21.h, z29.h\n"
- "ld1h { z30.h }, p1/Z, [x23, x13, LSL #1]\n"
- "movprfx z18, z28\n fmax z18.h, p2/M, z18.h, z27.h\n"
- "ld1h { z29.h }, p1/Z, [x20, x13, LSL #1]\n"
- "movprfx z17, z26\n fmax z17.h, p2/M, z17.h, z25.h\n"
- "ld1h { z27.h }, p1/Z, [x27, x13, LSL #1]\n"
- "movprfx z16, z24\n fmax z16.h, p2/M, z16.h, z28.h\n"
- "ld1h { z28.h }, p1/Z, [x24, x13, LSL #1]\n"
- "movprfx z20, z26\n fmax z20.h, p2/M, z20.h, z23.h\n"
- "ld1h { z26.h }, p1/Z, [x22, x13, LSL #1]\n"
- "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
- "ld1h { z25.h }, p1/Z, [x25, x13, LSL #1]\n"
- "movprfx z18, z22\n fmax z18.h, p2/M, z18.h, z17.h\n"
- "ld1h { z24.h }, p1/Z, [x21, x13, LSL #1]\n"
- "movprfx z17, z21\n fmax z17.h, p2/M, z17.h, z16.h\n"
- "ld1h { z23.h }, p1/Z, [x19, x13, LSL #1]\n"
- "incw x13\n"
- "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
- "st1h { z19.h }, p0, [x11, x12, LSL #1]\n"
- "whilelt p1.h, x13, x14\n"
- "st1h { z18.h }, p0, [x10, x12, LSL #1]\n"
- "st1h { z17.h }, p0, [x9, x12, LSL #1]\n"
- "st1h { z16.h }, p0, [x28, x12, LSL #1]\n"
- "incw x12\n"
+ "ld1h { z31.h }, p1/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z30.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "movprfx z20, z28\n fmax z20.h, p2/M, z20.h, z27.h\n"
+ "movprfx z19, z26\n fmax z19.h, p2/M, z19.h, z25.h\n"
+ "ld1h { z29.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x28, x14, LSL #1]\n"
+ "movprfx z17, z28\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z23.h\n"
+ "ld1h { z28.h }, p1/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x22, x14, LSL #1]\n"
+ "whilelt p0.h, x11, x15\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "ld1h { z23.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "incw x14\n"
+ "whilelt p1.h, x14, x15\n"
+ "st1h { z16.h }, p0, [x13, x11, LSL #1]\n"
+ "movprfx z16, z19\n fmax z16.h, p2/M, z16.h, z22.h\n"
+ "fmax z17.h, p2/M, z17.h, z21.h\n"
+ "st1h { z16.h }, p0, [x12, x11, LSL #1]\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z18.h\n"
+ "st1h { z17.h }, p0, [x10, x11, LSL #1]\n"
+ "st1h { z16.h }, p0, [x9, x11, LSL #1]\n"
+ "incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
"movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z30.h\n"
- "whilelt p0.h, x12, x14\n"
"movprfx z21, z30\n fmax z21.h, p2/M, z21.h, z29.h\n"
- "movprfx z18, z28\n fmax z18.h, p2/M, z18.h, z27.h\n"
- "movprfx z17, z26\n fmax z17.h, p2/M, z17.h, z25.h\n"
- "movprfx z16, z24\n fmax z16.h, p2/M, z16.h, z28.h\n"
- "movprfx z20, z26\n fmax z20.h, p2/M, z20.h, z23.h\n"
- "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
- "st1h { z19.h }, p0, [x11, x12, LSL #1]\n"
- "movprfx z18, z22\n fmax z18.h, p2/M, z18.h, z17.h\n"
- "movprfx z17, z21\n fmax z17.h, p2/M, z17.h, z16.h\n"
- "st1h { z18.h }, p0, [x10, x12, LSL #1]\n"
- "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
- "st1h { z17.h }, p0, [x9, x12, LSL #1]\n"
- "st1h { z16.h }, p0, [x28, x12, LSL #1]\n"
+ "movprfx z20, z28\n fmax z20.h, p2/M, z20.h, z27.h\n"
+ "movprfx z19, z26\n fmax z19.h, p2/M, z19.h, z25.h\n"
+ "movprfx z17, z28\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z23.h\n"
+ "whilelt p0.h, x11, x15\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "st1h { z16.h }, p0, [x13, x11, LSL #1]\n"
+ "movprfx z16, z19\n fmax z16.h, p2/M, z16.h, z22.h\n"
+ "fmax z17.h, p2/M, z17.h, z21.h\n"
+ "st1h { z16.h }, p0, [x12, x11, LSL #1]\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z18.h\n"
+ "st1h { z17.h }, p0, [x10, x11, LSL #1]\n"
+ "st1h { z16.h }, p0, [x9, x11, LSL #1]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
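
Shape of the kernel above: the nine input pointers form a 3x3 patch per channel, and the 2x2/stride-1 window yields four outputs whose windows overlap, which is why the assembly reuses partial fmax results across outputs. A per-lane reference; the helper name is hypothetical.

    #include <algorithm>

    // One channel lane of the 2x2/s1 max kernel: each output is the max of
    // a 2x2 sub-window of the 3x3 patch; adjacent windows share a row/column.
    void max_2x2_s1_out2x2_ref(const __fp16 in[3][3], __fp16 out[2][2])
    {
        for (int oi = 0; oi < 2; oi++)
            for (int oj = 0; oj < 2; oj++)
            {
                __fp16 m = std::max(in[oi][oj], in[oi][oj + 1]);
                m = std::max(m, std::max(in[oi + 1][oj], in[oi + 1][oj + 1]));
                out[oi][oj] = m;
            }
    }
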
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp
index 1c17c27619..0ef0a793cc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
void sve_fp16_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-struct sve_fp16_nhwc_max_generic_depthfirst
+struct sve_fp16_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_fp16_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
sve_fp16_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_fp16_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
index 58ab915605..1a01412836 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -39,185 +40,184 @@ void sve_fp16_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cnth x27\n"
- "cnth x26, ALL, MUL #2\n"
- "cnth x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cnth x28\n"
+ "cnth x27, ALL, MUL #2\n"
+ "cnth x26, ALL, MUL #3\n"
+ "whilelt p4.h, x9, %x[n_channels]\n"
"whilelt p3.h, x28, %x[n_channels]\n"
"whilelt p2.h, x27, %x[n_channels]\n"
"whilelt p1.h, x26, %x[n_channels]\n"
- "whilelt p0.h, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.h, #0xfc00\n"
"mov z7.h, #0xfc00\n"
- "mov x19, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.h, #0xfc00\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.h, #0xfc00\n"
- "mov z4.h, #0xfc00\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z30.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z22.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x20, x27, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x21, x26, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x20, x26, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x23, x25, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x22, x25, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x21, x25, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x20, x25, LSL #1]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n fmax z19.h, p4/M, z19.h, z2.h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n fmax z23.h, p4/M, z23.h, z0.h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n fmax z18.h, p4/M, z18.h, z30.h\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fmax z22.h, p4/M, z22.h, z29.h\n"
- "movprfx z17, z28\n fmax z17.h, p4/M, z17.h, z27.h\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "fmax z21.h, p4/M, z21.h, z26.h\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "fmax z16.h, p4/M, z16.h, z25.h\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
- "fmax z20.h, p4/M, z20.h, z24.h\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "fmax z19.h, p4/M, z19.h, z23.h\n"
- "ld1h { z30.h }, p2/Z, [x22, x27, LSL #1]\n"
- "fmax z18.h, p4/M, z18.h, z22.h\n"
- "ld1h { z22.h }, p2/Z, [x21, x27, LSL #1]\n"
- "fmax z17.h, p4/M, z17.h, z21.h\n"
- "ld1h { z29.h }, p2/Z, [x20, x27, LSL #1]\n"
- "fmax z16.h, p4/M, z16.h, z20.h\n"
- "ld1h { z28.h }, p1/Z, [x23, x26, LSL #1]\n"
- "fmax z7.h, p4/M, z7.h, z19.h\n"
- "ld1h { z27.h }, p1/Z, [x22, x26, LSL #1]\n"
- "fmax z6.h, p4/M, z6.h, z18.h\n"
- "ld1h { z21.h }, p1/Z, [x21, x26, LSL #1]\n"
- "fmax z5.h, p4/M, z5.h, z17.h\n"
- "ld1h { z26.h }, p1/Z, [x20, x26, LSL #1]\n"
- "fmax z4.h, p4/M, z4.h, z16.h\n"
- "ld1h { z16.h }, p0/Z, [x23, x25, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x22, x25, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x21, x25, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x20, x25, LSL #1]\n"
+ "movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
+ "movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n fmax z18.h, p0/M, z18.h, z31.h\n"
+ "fmax z22.h, p0/M, z22.h, z30.h\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "movprfx z17, z29\n fmax z17.h, p0/M, z17.h, z28.h\n"
+ "fmax z21.h, p0/M, z21.h, z27.h\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "movprfx z16, z26\n fmax z16.h, p0/M, z16.h, z25.h\n"
+ "fmax z20.h, p0/M, z20.h, z24.h\n"
+ "ld1h { z0.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "fmax z19.h, p0/M, z19.h, z23.h\n"
+ "fmax z18.h, p0/M, z18.h, z22.h\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "fmax z17.h, p0/M, z17.h, z21.h\n"
+ "fmax z16.h, p0/M, z16.h, z20.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z8.h, p0/M, z8.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "fmax z7.h, p0/M, z7.h, z18.h\n"
+ "fmax z6.h, p0/M, z6.h, z17.h\n"
+ "ld1h { z26.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n fmax z19.h, p4/M, z19.h, z2.h\n"
- "movprfx z23, z1\n fmax z23.h, p4/M, z23.h, z0.h\n"
- "movprfx z18, z31\n fmax z18.h, p4/M, z18.h, z30.h\n"
- "fmax z22.h, p4/M, z22.h, z29.h\n"
- "movprfx z17, z28\n fmax z17.h, p4/M, z17.h, z27.h\n"
- "fmax z21.h, p4/M, z21.h, z26.h\n"
- "fmax z16.h, p4/M, z16.h, z25.h\n"
- "fmax z20.h, p4/M, z20.h, z24.h\n"
- "fmax z19.h, p4/M, z19.h, z23.h\n"
- "fmax z18.h, p4/M, z18.h, z22.h\n"
- "fmax z17.h, p4/M, z17.h, z21.h\n"
- "fmax z16.h, p4/M, z16.h, z20.h\n"
- "fmax z7.h, p4/M, z7.h, z19.h\n"
- "fmax z6.h, p4/M, z6.h, z18.h\n"
- "fmax z5.h, p4/M, z5.h, z17.h\n"
- "fmax z4.h, p4/M, z4.h, z16.h\n"
+ "movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
+ "movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
+ "movprfx z18, z0\n fmax z18.h, p0/M, z18.h, z31.h\n"
+ "fmax z22.h, p0/M, z22.h, z30.h\n"
+ "movprfx z17, z29\n fmax z17.h, p0/M, z17.h, z28.h\n"
+ "fmax z21.h, p0/M, z21.h, z27.h\n"
+ "movprfx z16, z26\n fmax z16.h, p0/M, z16.h, z25.h\n"
+ "fmax z20.h, p0/M, z20.h, z24.h\n"
+ "fmax z19.h, p0/M, z19.h, z23.h\n"
+ "fmax z18.h, p0/M, z18.h, z22.h\n"
+ "fmax z17.h, p0/M, z17.h, z21.h\n"
+ "fmax z16.h, p0/M, z16.h, z20.h\n"
+ "fmax z8.h, p0/M, z8.h, z19.h\n"
+ "fmax z7.h, p0/M, z7.h, z18.h\n"
+ "fmax z6.h, p0/M, z6.h, z17.h\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fmax z7.h, p4/M, z7.h, z3.h\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x23, x26, LSL #1]\n"
- "fmax z6.h, p4/M, z6.h, z31.h\n"
- "ld1h { z16.h }, p0/Z, [x23, x25, LSL #1]\n"
- "fmax z5.h, p4/M, z5.h, z28.h\n"
- "fmax z4.h, p4/M, z4.h, z16.h\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "fmax z7.h, p0/M, z7.h, z17.h\n"
+ "fmax z6.h, p0/M, z6.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x26, LSL #1]\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
+ "st1h { z8.h }, p4, [%x[outptr], x9, LSL #1]\n"
+ "inch x9, ALL, MUL #4\n"
"st1h { z7.h }, p3, [%x[outptr], x28, LSL #1]\n"
"inch x28, ALL, MUL #4\n"
"st1h { z6.h }, p2, [%x[outptr], x27, LSL #1]\n"
"inch x27, ALL, MUL #4\n"
"st1h { z5.h }, p1, [%x[outptr], x26, LSL #1]\n"
"inch x26, ALL, MUL #4\n"
- "st1h { z4.h }, p0, [%x[outptr], x25, LSL #1]\n"
- "inch x25, ALL, MUL #4\n"
- "whilelt p0.h, x25, %x[n_channels]\n"
+ "whilelt p1.h, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.h, x28, %x[n_channels]\n"
+ "whilelt p4.h, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.h, #0xfc00\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.h, #0xfc00\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n fmax z19.h, p4/M, z19.h, z2.h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n fmax z23.h, p4/M, z23.h, z0.h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax z19.h, p4/M, z19.h, z23.h\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "fmax z7.h, p4/M, z7.h, z19.h\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "movprfx z16, z4\n fmax z16.h, p0/M, z16.h, z3.h\n"
+ "movprfx z17, z2\n fmax z17.h, p0/M, z17.h, z1.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "subs x25, x25, #0x1\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n fmax z19.h, p4/M, z19.h, z2.h\n"
- "movprfx z23, z1\n fmax z23.h, p4/M, z23.h, z0.h\n"
- "fmax z19.h, p4/M, z19.h, z23.h\n"
- "fmax z7.h, p4/M, z7.h, z19.h\n"
+ "movprfx z16, z4\n fmax z16.h, p0/M, z16.h, z3.h\n"
+ "movprfx z17, z2\n fmax z17.h, p0/M, z17.h, z1.h\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fmax z7.h, p4/M, z7.h, z3.h\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1h { z7.h }, p3, [%x[outptr], x28, LSL #1]\n"
- "inch x28\n"
- "whilelt p3.h, x28, %x[n_channels]\n"
+ "st1h { z8.h }, p4, [%x[outptr], x9, LSL #1]\n"
+ "inch x9\n"
+ "whilelt p4.h, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 9cbdb8a58d..d5578d617f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst
+struct sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
+ using Parent = DepthfirstStrategy<float, float>;
- typedef void (*kern_type)(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
+ sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 3; }
- constexpr static unsigned int pool_cols(void) { return 3; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl;
-
- sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 50f5da4c3d..c5ea5adea0 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -82,126 +82,126 @@ void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x3, [%x[args], %[offsetof_n_channels]]\n"
- "mov x4, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x5, #0x0\n"
- "ldr x6, [%x[args], %[offsetof_inptrs]]\n"
- "mov x19, #0x4\n"
- "add x7, %x[args], %[offsetof_rescale]\n"
- "ldp x8, x17, [x20, #0x0]\n"
- "ldp x16, x15, [x20, #0x10]\n"
- "whilelt p0.s, XZR, x19\n"
- "ldp x14, x13, [x6, #0x0]\n"
- "whilelt p1.s, x4, x3\n"
- "ldp x12, x11, [x6, #0x10]\n"
- "ldp x10, x9, [x6, #0x20]\n"
- "ldp x28, x27, [x6, #0x30]\n"
- "ldp x26, x25, [x6, #0x40]\n"
- "ldp x24, x23, [x6, #0x50]\n"
- "ldp x22, x21, [x6, #0x60]\n"
- "ldp x20, x19, [x6, #0x70]\n"
- "ld1rqw { z7.s }, p0/Z, [x7]\n"
- "ld1w { z8.s }, p1/Z, [x9, x4, LSL #2]\n"
- "ld1w { z6.s }, p1/Z, [x28, x4, LSL #2]\n"
- "ld1w { z5.s }, p1/Z, [x25, x4, LSL #2]\n"
- "ld1w { z4.s }, p1/Z, [x24, x4, LSL #2]\n"
- "ld1w { z3.s }, p1/Z, [x13, x4, LSL #2]\n"
- "ld1w { z2.s }, p1/Z, [x12, x4, LSL #2]\n"
- "ld1w { z1.s }, p1/Z, [x10, x4, LSL #2]\n"
- "ld1w { z0.s }, p1/Z, [x26, x4, LSL #2]\n"
- "ld1w { z31.s }, p1/Z, [x27, x4, LSL #2]\n"
- "ld1w { z30.s }, p1/Z, [x23, x4, LSL #2]\n"
- "ld1w { z29.s }, p1/Z, [x21, x4, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x20, x4, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x14, x4, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x11, x4, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x22, x4, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x19, x4, LSL #2]\n"
- "incw x4\n"
- "whilelt p1.s, x4, x3\n"
+ "ldr x2, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x3, #0x0\n"
+ "mov x20, #0x4\n"
+ "ldr x4, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x5, x6, [x21, #0x0]\n"
+ "whilelt p2.s, XZR, x20\n"
+ "whilelt p0.s, x3, x2\n"
+ "ldp x7, x8, [x21, #0x10]\n"
+ "ldp x17, x16, [x4, #0x0]\n"
+ "add x15, %x[args], %[offsetof_rescale]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x4, #0x10]\n"
+ "ldp x11, x10, [x4, #0x20]\n"
+ "ldp x9, x28, [x4, #0x30]\n"
+ "ldp x27, x26, [x4, #0x40]\n"
+ "ldp x25, x24, [x4, #0x50]\n"
+ "ldp x23, x22, [x4, #0x60]\n"
+ "ldp x21, x20, [x4, #0x70]\n"
+ "ld1w { z7.s }, p0/Z, [x10, x3, LSL #2]\n"
+ "ld1w { z6.s }, p0/Z, [x9, x3, LSL #2]\n"
+ "ld1w { z5.s }, p0/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z4.s }, p0/Z, [x25, x3, LSL #2]\n"
+ "ld1w { z3.s }, p0/Z, [x16, x3, LSL #2]\n"
+ "ld1w { z2.s }, p0/Z, [x13, x3, LSL #2]\n"
+ "ld1w { z1.s }, p0/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z31.s }, p0/Z, [x27, x3, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x24, x3, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21, x3, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x17, x3, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x12, x3, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x3, LSL #2]\n"
+ "incw x3\n"
+ "whilelt p1.s, x3, x2\n"
+ "ld1rqw { z0.s }, p2/Z, [x15]\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "fadd z17.s, z8.s, z6.s\n"
- "ld1w { z8.s }, p1/Z, [x9, x4, LSL #2]\n"
- "whilelt p0.s, x5, x3\n"
+ "fadd z17.s, z7.s, z6.s\n"
"fadd z16.s, z5.s, z4.s\n"
- "ld1w { z6.s }, p1/Z, [x28, x4, LSL #2]\n"
+ "ld1w { z7.s }, p1/Z, [x10, x3, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x9, x3, LSL #2]\n"
+ "fadd z19.s, z17.s, z16.s\n"
"fadd z18.s, z3.s, z2.s\n"
- "ld1w { z5.s }, p1/Z, [x25, x4, LSL #2]\n"
- "fadd z23.s, z1.s, z0.s\n"
- "ld1w { z4.s }, p1/Z, [x24, x4, LSL #2]\n"
- "fadd z22.s, z31.s, z30.s\n"
- "ld1w { z3.s }, p1/Z, [x13, x4, LSL #2]\n"
- "fadd z17.s, z17.s, z16.s\n"
- "ld1w { z2.s }, p1/Z, [x12, x4, LSL #2]\n"
- "fadd z16.s, z29.s, z28.s\n"
- "ld1w { z1.s }, p1/Z, [x10, x4, LSL #2]\n"
- "fadd z19.s, z27.s, z23.s\n"
- "ld1w { z0.s }, p1/Z, [x26, x4, LSL #2]\n"
- "fadd z21.s, z18.s, z17.s\n"
- "ld1w { z31.s }, p1/Z, [x27, x4, LSL #2]\n"
- "fadd z20.s, z16.s, z17.s\n"
- "ld1w { z30.s }, p1/Z, [x23, x4, LSL #2]\n"
- "fadd z18.s, z26.s, z22.s\n"
- "ld1w { z29.s }, p1/Z, [x21, x4, LSL #2]\n"
- "fadd z17.s, z25.s, z23.s\n"
- "ld1w { z28.s }, p1/Z, [x20, x4, LSL #2]\n"
- "fadd z16.s, z24.s, z22.s\n"
- "ld1w { z27.s }, p1/Z, [x14, x4, LSL #2]\n"
+ "ld1w { z5.s }, p1/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x25, x3, LSL #2]\n"
+ "fadd z17.s, z1.s, z31.s\n"
+ "fadd z22.s, z30.s, z29.s\n"
+ "ld1w { z3.s }, p1/Z, [x16, x3, LSL #2]\n"
+ "ld1w { z2.s }, p1/Z, [x13, x3, LSL #2]\n"
+ "fadd z16.s, z28.s, z27.s\n"
+ "fadd z21.s, z18.s, z19.s\n"
+ "ld1w { z1.s }, p1/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z31.s }, p1/Z, [x27, x3, LSL #2]\n"
+ "fadd z20.s, z16.s, z19.s\n"
+ "fadd z19.s, z26.s, z17.s\n"
+ "ld1w { z30.s }, p1/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z29.s }, p1/Z, [x24, x3, LSL #2]\n"
+ "fadd z18.s, z25.s, z22.s\n"
+ "fadd z17.s, z24.s, z17.s\n"
+ "ld1w { z28.s }, p1/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x21, x3, LSL #2]\n"
+ "fadd z16.s, z23.s, z22.s\n"
+ "ld1w { z26.s }, p1/Z, [x17, x3, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x12, x3, LSL #2]\n"
"fadd z19.s, z21.s, z19.s\n"
- "ld1w { z26.s }, p1/Z, [x11, x4, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [x20, x3, LSL #2]\n"
+ "incw x3\n"
"fadd z18.s, z21.s, z18.s\n"
- "ld1w { z25.s }, p1/Z, [x22, x4, LSL #2]\n"
"fadd z17.s, z17.s, z20.s\n"
- "ld1w { z24.s }, p1/Z, [x19, x4, LSL #2]\n"
- "incw x4\n"
- "fadd z16.s, z20.s, z16.s\n"
- "whilelt p1.s, x4, x3\n"
- "fmul z19.s, z19.s, z7.s[0]\n"
- "st1w { z19.s }, p0, [x8, x5, LSL #2]\n"
- "fmul z18.s, z18.s, z7.s[1]\n"
- "fmul z17.s, z17.s, z7.s[2]\n"
- "st1w { z18.s }, p0, [x17, x5, LSL #2]\n"
- "fmul z16.s, z16.s, z7.s[3]\n"
- "st1w { z17.s }, p0, [x16, x5, LSL #2]\n"
- "st1w { z16.s }, p0, [x15, x5, LSL #2]\n"
- "incw x5\n"
+ "fadd z16.s, z16.s, z20.s\n"
+ "whilelt p0.s, x14, x2\n"
+ "whilelt p1.s, x3, x2\n"
+ "fmul z19.s, z19.s, z0.s[0]\n"
+ "fmul z18.s, z18.s, z0.s[1]\n"
+ "st1w { z19.s }, p0, [x5, x14, LSL #2]\n"
+ "fmul z17.s, z17.s, z0.s[2]\n"
+ "fmul z16.s, z16.s, z0.s[3]\n"
+ "st1w { z18.s }, p0, [x6, x14, LSL #2]\n"
+ "st1w { z17.s }, p0, [x7, x14, LSL #2]\n"
+ "st1w { z16.s }, p0, [x8, x14, LSL #2]\n"
+ "incw x14\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "fadd z17.s, z8.s, z6.s\n"
- "whilelt p0.s, x5, x3\n"
+ "fadd z17.s, z7.s, z6.s\n"
"fadd z16.s, z5.s, z4.s\n"
+ "whilelt p0.s, x14, x2\n"
+ "fadd z20.s, z17.s, z16.s\n"
"fadd z18.s, z3.s, z2.s\n"
- "fadd z23.s, z1.s, z0.s\n"
- "fadd z17.s, z17.s, z16.s\n"
- "fadd z22.s, z31.s, z30.s\n"
- "fadd z16.s, z29.s, z28.s\n"
- "fadd z21.s, z18.s, z17.s\n"
- "fadd z19.s, z27.s, z23.s\n"
- "fadd z20.s, z16.s, z17.s\n"
- "fadd z18.s, z26.s, z22.s\n"
- "fadd z17.s, z25.s, z23.s\n"
- "fadd z16.s, z24.s, z22.s\n"
- "fadd z19.s, z21.s, z19.s\n"
+ "fadd z17.s, z1.s, z31.s\n"
+ "fadd z19.s, z30.s, z29.s\n"
+ "fadd z16.s, z28.s, z27.s\n"
+ "fadd z21.s, z18.s, z20.s\n"
+ "fadd z20.s, z16.s, z20.s\n"
+ "fadd z16.s, z26.s, z17.s\n"
+ "fadd z18.s, z25.s, z19.s\n"
+ "fadd z17.s, z24.s, z17.s\n"
+ "fadd z19.s, z23.s, z19.s\n"
+ "fadd z16.s, z21.s, z16.s\n"
+ "fmul z16.s, z16.s, z0.s[0]\n"
+ "st1w { z16.s }, p0, [x5, x14, LSL #2]\n"
"fadd z18.s, z21.s, z18.s\n"
"fadd z17.s, z17.s, z20.s\n"
- "fadd z16.s, z20.s, z16.s\n"
- "fmul z19.s, z19.s, z7.s[0]\n"
- "st1w { z19.s }, p0, [x8, x5, LSL #2]\n"
- "fmul z18.s, z18.s, z7.s[1]\n"
- "fmul z17.s, z17.s, z7.s[2]\n"
- "st1w { z18.s }, p0, [x17, x5, LSL #2]\n"
- "fmul z16.s, z16.s, z7.s[3]\n"
- "st1w { z17.s }, p0, [x16, x5, LSL #2]\n"
- "st1w { z16.s }, p0, [x15, x5, LSL #2]\n"
+ "fmul z18.s, z18.s, z0.s[1]\n"
+ "fmul z17.s, z17.s, z0.s[2]\n"
+ "fadd z16.s, z19.s, z20.s\n"
+ "fmul z16.s, z16.s, z0.s[3]\n"
+ "st1w { z18.s }, p0, [x6, x14, LSL #2]\n"
+ "st1w { z17.s }, p0, [x7, x14, LSL #2]\n"
+ "st1w { z16.s }, p0, [x8, x14, LSL #2]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "p0", "p1", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp
index 0daa046a02..a9e6b034e7 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_fp32_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-struct sve_fp32_nhwc_avg_generic_depthfirst
+struct sve_fp32_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_fp32_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<float, float>;
sve_fp32_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_fp32_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index c2f5745adc..7c94894892 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -41,88 +42,88 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<float>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
+ "mov x9, #0x0\n"
+ "cntw x28\n"
+ "cntw x27, ALL, MUL #2\n"
+ "cntw x26, ALL, MUL #3\n"
"ptrue p0.b\n"
- "ld1rw { z8.s }, p0/Z, [%x[rescale_ptr]]\n"
- "mov x28, #0x0\n"
- "cntw x27\n"
- "cntw x26, ALL, MUL #2\n"
- "cntw x25, ALL, MUL #3\n"
- "whilelt p3.s, x28, %x[n_channels]\n"
- "whilelt p2.s, x27, %x[n_channels]\n"
- "whilelt p1.s, x26, %x[n_channels]\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "whilelt p2.s, x28, %x[n_channels]\n"
+ "whilelt p1.s, x27, %x[n_channels]\n"
+ "whilelt p0.s, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"mov z4.b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z30.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x20, x27, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x21, x26, LSL #2]\n"
- "ld1w { z17.s }, p1/Z, [x20, x26, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x23, x25, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x22, x25, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x21, x25, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x20, x25, LSL #2]\n"
+ "mov z3.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z30.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd z23.s, z3.s, z2.s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd z19.s, z1.s, z0.s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd z22.s, z31.s, z30.s\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "fadd z23.s, z2.s, z1.s\n"
+ "fadd z19.s, z0.s, z31.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z22.s, z30.s, z22.s\n"
"fadd z18.s, z29.s, z28.s\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
"fadd z21.s, z27.s, z21.s\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
"fadd z17.s, z26.s, z17.s\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
"fadd z20.s, z25.s, z20.s\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
"fadd z16.s, z24.s, z16.s\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"fadd z19.s, z23.s, z19.s\n"
- "ld1w { z30.s }, p2/Z, [x22, x27, LSL #2]\n"
"fadd z18.s, z22.s, z18.s\n"
- "ld1w { z29.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z30.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
"fadd z17.s, z21.s, z17.s\n"
- "ld1w { z28.s }, p2/Z, [x20, x27, LSL #2]\n"
"fadd z16.s, z20.s, z16.s\n"
- "ld1w { z27.s }, p1/Z, [x23, x26, LSL #2]\n"
- "fadd z7.s, z7.s, z19.s\n"
- "ld1w { z21.s }, p1/Z, [x22, x26, LSL #2]\n"
- "fadd z6.s, z6.s, z18.s\n"
- "ld1w { z26.s }, p1/Z, [x21, x26, LSL #2]\n"
- "fadd z5.s, z5.s, z17.s\n"
- "ld1w { z17.s }, p1/Z, [x20, x26, LSL #2]\n"
- "fadd z4.s, z4.s, z16.s\n"
- "ld1w { z25.s }, p0/Z, [x23, x25, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x22, x25, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x21, x25, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x20, x25, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fadd z6.s, z6.s, z19.s\n"
+ "fadd z5.s, z5.s, z18.s\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "fadd z4.s, z4.s, z17.s\n"
+ "fadd z3.s, z3.s, z16.s\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd z23.s, z3.s, z2.s\n"
- "fadd z19.s, z1.s, z0.s\n"
- "fadd z22.s, z31.s, z30.s\n"
+ "fadd z23.s, z2.s, z1.s\n"
+ "fadd z19.s, z0.s, z31.s\n"
+ "fadd z22.s, z30.s, z22.s\n"
"fadd z18.s, z29.s, z28.s\n"
"fadd z21.s, z27.s, z21.s\n"
"fadd z17.s, z26.s, z17.s\n"
@@ -132,100 +133,99 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"fadd z18.s, z22.s, z18.s\n"
"fadd z17.s, z21.s, z17.s\n"
"fadd z16.s, z20.s, z16.s\n"
- "fadd z7.s, z7.s, z19.s\n"
- "fadd z6.s, z6.s, z18.s\n"
- "fadd z5.s, z5.s, z17.s\n"
- "fadd z4.s, z4.s, z16.s\n"
+ "fadd z6.s, z6.s, z19.s\n"
+ "fadd z5.s, z5.s, z18.s\n"
+ "fadd z4.s, z4.s, z17.s\n"
+ "fadd z3.s, z3.s, z16.s\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fadd z7.s, z7.s, z3.s\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x23, x26, LSL #2]\n"
- "fadd z6.s, z6.s, z31.s\n"
- "ld1w { z25.s }, p0/Z, [x23, x25, LSL #2]\n"
- "fadd z5.s, z5.s, z27.s\n"
- "fadd z4.s, z4.s, z25.s\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z6.s, z6.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "fadd z5.s, z5.s, z17.s\n"
+ "fadd z4.s, z4.s, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+ "fadd z3.s, z3.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul z7.s, z7.s, z8.s\n"
- "st1w { z7.s }, p3, [%x[outptr], x28, LSL #2]\n"
- "fmul z6.s, z6.s, z8.s\n"
+ "fmul z6.s, z6.s, z7.s\n"
+ "fmul z5.s, z5.s, z7.s\n"
+ "st1w { z6.s }, p3, [%x[outptr], x9, LSL #2]\n"
+ "fmul z4.s, z4.s, z7.s\n"
+ "fmul z3.s, z3.s, z7.s\n"
+ "st1w { z5.s }, p2, [%x[outptr], x28, LSL #2]\n"
+ "st1w { z4.s }, p1, [%x[outptr], x27, LSL #2]\n"
+ "incw x9, ALL, MUL #4\n"
"incw x28, ALL, MUL #4\n"
- "fmul z5.s, z5.s, z8.s\n"
- "st1w { z6.s }, p2, [%x[outptr], x27, LSL #2]\n"
- "fmul z4.s, z4.s, z8.s\n"
- "incw x27, ALL, MUL #4\n"
- "st1w { z5.s }, p1, [%x[outptr], x26, LSL #2]\n"
+ "st1w { z3.s }, p0, [%x[outptr], x26, LSL #2]\n"
"incw x26, ALL, MUL #4\n"
- "st1w { z4.s }, p0, [%x[outptr], x25, LSL #2]\n"
- "incw x25, ALL, MUL #4\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
+ "whilelt p0.s, x26, %x[n_channels]\n"
+ "incw x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.s, x28, %x[n_channels]\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z6.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.s, z3.s, z2.s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd z19.s, z1.s, z0.s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd z19.s, z23.s, z19.s\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "fadd z7.s, z7.s, z19.s\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "fadd z17.s, z2.s, z1.s\n"
+ "fadd z16.s, z0.s, z31.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z6.s, z6.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.s, z3.s, z2.s\n"
- "fadd z19.s, z1.s, z0.s\n"
- "fadd z19.s, z23.s, z19.s\n"
- "fadd z7.s, z7.s, z19.s\n"
+ "fadd z17.s, z2.s, z1.s\n"
+ "fadd z16.s, z0.s, z31.s\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "fadd z6.s, z6.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fadd z7.s, z7.s, z3.s\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z6.s, z6.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul z7.s, z7.s, z8.s\n"
- "st1w { z7.s }, p3, [%x[outptr], x28, LSL #2]\n"
- "incw x28\n"
- "whilelt p3.s, x28, %x[n_channels]\n"
+ "fmul z6.s, z6.s, z7.s\n"
+ "st1w { z6.s }, p3, [%x[outptr], x9, LSL #2]\n"
+ "incw x9\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 086f49e957..b97e3623c4 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst
+struct sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
+ using Parent = DepthfirstStrategy<float, float>;
- typedef void (*kern_type)(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 250cc24226..d9cebd1363 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -63,84 +63,84 @@ void sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x14, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.s, x14, x15\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "mov x12, #0x0\n"
- "ldp x11, x10, [x20, #0x0]\n"
- "whilelt p1.s, x13, x14\n"
- "ldp x9, x28, [x20, #0x10]\n"
- "ldp x27, x26, [x19, #0x0]\n"
- "ldp x25, x24, [x19, #0x10]\n"
- "ldp x23, x22, [x19, #0x20]\n"
- "ldp x21, x20, [x19, #0x30]\n"
- "ldr x19, [x19, #0x40]\n"
- "ld1w { z31.s }, p1/Z, [x26, x13, LSL #2]\n"
- "ld1w { z30.s }, p1/Z, [x23, x13, LSL #2]\n"
- "ld1w { z29.s }, p1/Z, [x20, x13, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x24, x13, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x27, x13, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x22, x13, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x25, x13, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x21, x13, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x19, x13, LSL #2]\n"
- "incw x13\n"
- "whilelt p1.s, x13, x14\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1w { z31.s }, p0/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x28, x14, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x14, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x22, x14, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x14, LSL #2]\n"
+ "incw x14\n"
+ "whilelt p1.s, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
"movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z30.s\n"
- "ld1w { z31.s }, p1/Z, [x26, x13, LSL #2]\n"
- "whilelt p0.s, x12, x14\n"
"movprfx z21, z30\n fmax z21.s, p2/M, z21.s, z29.s\n"
- "ld1w { z30.s }, p1/Z, [x23, x13, LSL #2]\n"
- "movprfx z18, z28\n fmax z18.s, p2/M, z18.s, z27.s\n"
- "ld1w { z29.s }, p1/Z, [x20, x13, LSL #2]\n"
- "movprfx z17, z26\n fmax z17.s, p2/M, z17.s, z25.s\n"
- "ld1w { z27.s }, p1/Z, [x27, x13, LSL #2]\n"
- "movprfx z16, z24\n fmax z16.s, p2/M, z16.s, z28.s\n"
- "ld1w { z28.s }, p1/Z, [x24, x13, LSL #2]\n"
- "movprfx z20, z26\n fmax z20.s, p2/M, z20.s, z23.s\n"
- "ld1w { z26.s }, p1/Z, [x22, x13, LSL #2]\n"
- "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
- "ld1w { z25.s }, p1/Z, [x25, x13, LSL #2]\n"
- "movprfx z18, z22\n fmax z18.s, p2/M, z18.s, z17.s\n"
- "ld1w { z24.s }, p1/Z, [x21, x13, LSL #2]\n"
- "movprfx z17, z21\n fmax z17.s, p2/M, z17.s, z16.s\n"
- "ld1w { z23.s }, p1/Z, [x19, x13, LSL #2]\n"
- "incw x13\n"
- "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
- "st1w { z19.s }, p0, [x11, x12, LSL #2]\n"
- "whilelt p1.s, x13, x14\n"
- "st1w { z18.s }, p0, [x10, x12, LSL #2]\n"
- "st1w { z17.s }, p0, [x9, x12, LSL #2]\n"
- "st1w { z16.s }, p0, [x28, x12, LSL #2]\n"
- "incw x12\n"
+ "ld1w { z31.s }, p1/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "movprfx z20, z28\n fmax z20.s, p2/M, z20.s, z27.s\n"
+ "movprfx z19, z26\n fmax z19.s, p2/M, z19.s, z25.s\n"
+ "ld1w { z29.s }, p1/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x28, x14, LSL #2]\n"
+ "movprfx z17, z28\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z23.s\n"
+ "ld1w { z28.s }, p1/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x22, x14, LSL #2]\n"
+ "whilelt p0.s, x11, x15\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "ld1w { z23.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "incw x14\n"
+ "whilelt p1.s, x14, x15\n"
+ "st1w { z16.s }, p0, [x13, x11, LSL #2]\n"
+ "movprfx z16, z19\n fmax z16.s, p2/M, z16.s, z22.s\n"
+ "fmax z17.s, p2/M, z17.s, z21.s\n"
+ "st1w { z16.s }, p0, [x12, x11, LSL #2]\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z18.s\n"
+ "st1w { z17.s }, p0, [x10, x11, LSL #2]\n"
+ "st1w { z16.s }, p0, [x9, x11, LSL #2]\n"
+ "incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
"movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z30.s\n"
- "whilelt p0.s, x12, x14\n"
"movprfx z21, z30\n fmax z21.s, p2/M, z21.s, z29.s\n"
- "movprfx z18, z28\n fmax z18.s, p2/M, z18.s, z27.s\n"
- "movprfx z17, z26\n fmax z17.s, p2/M, z17.s, z25.s\n"
- "movprfx z16, z24\n fmax z16.s, p2/M, z16.s, z28.s\n"
- "movprfx z20, z26\n fmax z20.s, p2/M, z20.s, z23.s\n"
- "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
- "st1w { z19.s }, p0, [x11, x12, LSL #2]\n"
- "movprfx z18, z22\n fmax z18.s, p2/M, z18.s, z17.s\n"
- "movprfx z17, z21\n fmax z17.s, p2/M, z17.s, z16.s\n"
- "st1w { z18.s }, p0, [x10, x12, LSL #2]\n"
- "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
- "st1w { z17.s }, p0, [x9, x12, LSL #2]\n"
- "st1w { z16.s }, p0, [x28, x12, LSL #2]\n"
+ "movprfx z20, z28\n fmax z20.s, p2/M, z20.s, z27.s\n"
+ "movprfx z19, z26\n fmax z19.s, p2/M, z19.s, z25.s\n"
+ "movprfx z17, z28\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z23.s\n"
+ "whilelt p0.s, x11, x15\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "st1w { z16.s }, p0, [x13, x11, LSL #2]\n"
+ "movprfx z16, z19\n fmax z16.s, p2/M, z16.s, z22.s\n"
+ "fmax z17.s, p2/M, z17.s, z21.s\n"
+ "st1w { z16.s }, p0, [x12, x11, LSL #2]\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z18.s\n"
+ "st1w { z17.s }, p0, [x10, x11, LSL #2]\n"
+ "st1w { z16.s }, p0, [x9, x11, LSL #2]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp
index 17e3e5f0ba..5f6535072b 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_fp32_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-struct sve_fp32_nhwc_max_generic_depthfirst
+struct sve_fp32_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_fp32_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<float, float>;
sve_fp32_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_fp32_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
index 8166379ce4..87fc75adda 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -39,185 +40,184 @@ void sve_fp32_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntw x27\n"
- "cntw x26, ALL, MUL #2\n"
- "cntw x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntw x28\n"
+ "cntw x27, ALL, MUL #2\n"
+ "cntw x26, ALL, MUL #3\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
"whilelt p3.s, x28, %x[n_channels]\n"
"whilelt p2.s, x27, %x[n_channels]\n"
"whilelt p1.s, x26, %x[n_channels]\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.s, #0xff800000\n"
"mov z7.s, #0xff800000\n"
- "mov x19, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.s, #0xff800000\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.s, #0xff800000\n"
- "mov z4.s, #0xff800000\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z30.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z22.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x20, x27, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x21, x26, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x20, x26, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x23, x25, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x22, x25, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x21, x25, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x20, x25, LSL #2]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n fmax z19.s, p4/M, z19.s, z2.s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n fmax z23.s, p4/M, z23.s, z0.s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n fmax z18.s, p4/M, z18.s, z30.s\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fmax z22.s, p4/M, z22.s, z29.s\n"
- "movprfx z17, z28\n fmax z17.s, p4/M, z17.s, z27.s\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "fmax z21.s, p4/M, z21.s, z26.s\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "fmax z16.s, p4/M, z16.s, z25.s\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
- "fmax z20.s, p4/M, z20.s, z24.s\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "fmax z19.s, p4/M, z19.s, z23.s\n"
- "ld1w { z30.s }, p2/Z, [x22, x27, LSL #2]\n"
- "fmax z18.s, p4/M, z18.s, z22.s\n"
- "ld1w { z22.s }, p2/Z, [x21, x27, LSL #2]\n"
- "fmax z17.s, p4/M, z17.s, z21.s\n"
- "ld1w { z29.s }, p2/Z, [x20, x27, LSL #2]\n"
- "fmax z16.s, p4/M, z16.s, z20.s\n"
- "ld1w { z28.s }, p1/Z, [x23, x26, LSL #2]\n"
- "fmax z7.s, p4/M, z7.s, z19.s\n"
- "ld1w { z27.s }, p1/Z, [x22, x26, LSL #2]\n"
- "fmax z6.s, p4/M, z6.s, z18.s\n"
- "ld1w { z21.s }, p1/Z, [x21, x26, LSL #2]\n"
- "fmax z5.s, p4/M, z5.s, z17.s\n"
- "ld1w { z26.s }, p1/Z, [x20, x26, LSL #2]\n"
- "fmax z4.s, p4/M, z4.s, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x23, x25, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x22, x25, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x21, x25, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x20, x25, LSL #2]\n"
+ "movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
+ "movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n fmax z18.s, p0/M, z18.s, z31.s\n"
+ "fmax z22.s, p0/M, z22.s, z30.s\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "movprfx z17, z29\n fmax z17.s, p0/M, z17.s, z28.s\n"
+ "fmax z21.s, p0/M, z21.s, z27.s\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "movprfx z16, z26\n fmax z16.s, p0/M, z16.s, z25.s\n"
+ "fmax z20.s, p0/M, z20.s, z24.s\n"
+ "ld1w { z0.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "fmax z18.s, p0/M, z18.s, z22.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "fmax z17.s, p0/M, z17.s, z21.s\n"
+ "fmax z16.s, p0/M, z16.s, z20.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z8.s, p0/M, z8.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "fmax z7.s, p0/M, z7.s, z18.s\n"
+ "fmax z6.s, p0/M, z6.s, z17.s\n"
+ "ld1w { z26.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n fmax z19.s, p4/M, z19.s, z2.s\n"
- "movprfx z23, z1\n fmax z23.s, p4/M, z23.s, z0.s\n"
- "movprfx z18, z31\n fmax z18.s, p4/M, z18.s, z30.s\n"
- "fmax z22.s, p4/M, z22.s, z29.s\n"
- "movprfx z17, z28\n fmax z17.s, p4/M, z17.s, z27.s\n"
- "fmax z21.s, p4/M, z21.s, z26.s\n"
- "fmax z16.s, p4/M, z16.s, z25.s\n"
- "fmax z20.s, p4/M, z20.s, z24.s\n"
- "fmax z19.s, p4/M, z19.s, z23.s\n"
- "fmax z18.s, p4/M, z18.s, z22.s\n"
- "fmax z17.s, p4/M, z17.s, z21.s\n"
- "fmax z16.s, p4/M, z16.s, z20.s\n"
- "fmax z7.s, p4/M, z7.s, z19.s\n"
- "fmax z6.s, p4/M, z6.s, z18.s\n"
- "fmax z5.s, p4/M, z5.s, z17.s\n"
- "fmax z4.s, p4/M, z4.s, z16.s\n"
+ "movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
+ "movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
+ "movprfx z18, z0\n fmax z18.s, p0/M, z18.s, z31.s\n"
+ "fmax z22.s, p0/M, z22.s, z30.s\n"
+ "movprfx z17, z29\n fmax z17.s, p0/M, z17.s, z28.s\n"
+ "fmax z21.s, p0/M, z21.s, z27.s\n"
+ "movprfx z16, z26\n fmax z16.s, p0/M, z16.s, z25.s\n"
+ "fmax z20.s, p0/M, z20.s, z24.s\n"
+ "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "fmax z18.s, p0/M, z18.s, z22.s\n"
+ "fmax z17.s, p0/M, z17.s, z21.s\n"
+ "fmax z16.s, p0/M, z16.s, z20.s\n"
+ "fmax z8.s, p0/M, z8.s, z19.s\n"
+ "fmax z7.s, p0/M, z7.s, z18.s\n"
+ "fmax z6.s, p0/M, z6.s, z17.s\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fmax z7.s, p4/M, z7.s, z3.s\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x23, x26, LSL #2]\n"
- "fmax z6.s, p4/M, z6.s, z31.s\n"
- "ld1w { z16.s }, p0/Z, [x23, x25, LSL #2]\n"
- "fmax z5.s, p4/M, z5.s, z28.s\n"
- "fmax z4.s, p4/M, z4.s, z16.s\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "fmax z7.s, p0/M, z7.s, z17.s\n"
+ "fmax z6.s, p0/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
+ "st1w { z8.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "incw x9, ALL, MUL #4\n"
"st1w { z7.s }, p3, [%x[outptr], x28, LSL #2]\n"
"incw x28, ALL, MUL #4\n"
"st1w { z6.s }, p2, [%x[outptr], x27, LSL #2]\n"
"incw x27, ALL, MUL #4\n"
"st1w { z5.s }, p1, [%x[outptr], x26, LSL #2]\n"
"incw x26, ALL, MUL #4\n"
- "st1w { z4.s }, p0, [%x[outptr], x25, LSL #2]\n"
- "incw x25, ALL, MUL #4\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
+ "whilelt p1.s, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.s, x28, %x[n_channels]\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.s, #0xff800000\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.s, #0xff800000\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n fmax z19.s, p4/M, z19.s, z2.s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n fmax z23.s, p4/M, z23.s, z0.s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax z19.s, p4/M, z19.s, z23.s\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "fmax z7.s, p4/M, z7.s, z19.s\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "movprfx z16, z4\n fmax z16.s, p0/M, z16.s, z3.s\n"
+ "movprfx z17, z2\n fmax z17.s, p0/M, z17.s, z1.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "subs x25, x25, #0x1\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n fmax z19.s, p4/M, z19.s, z2.s\n"
- "movprfx z23, z1\n fmax z23.s, p4/M, z23.s, z0.s\n"
- "fmax z19.s, p4/M, z19.s, z23.s\n"
- "fmax z7.s, p4/M, z7.s, z19.s\n"
+ "movprfx z16, z4\n fmax z16.s, p0/M, z16.s, z3.s\n"
+ "movprfx z17, z2\n fmax z17.s, p0/M, z17.s, z1.s\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fmax z7.s, p4/M, z7.s, z3.s\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1w { z7.s }, p3, [%x[outptr], x28, LSL #2]\n"
- "incw x28\n"
- "whilelt p3.s, x28, %x[n_channels]\n"
+ "st1w { z8.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "incw x9\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
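A note on the recurring guard change in every file of this diff: `__ARM_FEATURE_SVE` is defined by the compiler only when the translation unit itself targets SVE, whereas `ARM_COMPUTE_ENABLE_SVE` is presumably a library build option, letting the build system decide which kernels to compile into a multi-ISA binary. A minimal illustrative sketch, assuming the macro is supplied by the build configuration (only the two macro names come from this diff):

    // Old style: only visible when the compiler targets SVE.
    #if defined(__ARM_FEATURE_SVE)
    void sve_pooling_kernel(const float *in, float *out, unsigned int n);
    #endif

    // New style: the build system defines the flag, so the SVE path can be
    // compiled in regardless of the baseline target and selected at runtime.
    #if defined(ARM_COMPUTE_ENABLE_SVE)
    void sve_pooling_kernel(const float *in, float *out, unsigned int n);
    #endif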
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp
index 2ae38b5b2f..dd2ff4fd2e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-struct sve_s8_nhwc_avg_generic_depthfirst
+struct sve_s8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_s8_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
sve_s8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_s8_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
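The header above, like its siblings below, replaces the typedef-plus-member-pointer struct with a subclass of IGenericDepthfirstStrategy that overrides a virtual get_kernel(). A hedged sketch of what such an interface could look like, inferred only from its usage in this diff (the real definition lives elsewhere in the library and may differ; the quantized variants additionally take a third template parameter and a trailing `const Requantize32 &` kernel argument):

    #include <cstdint>

    template <typename TInput, typename TOutput>
    class IGenericDepthfirstStrategy
    {
    public:
        // Matches the signature of the *_impl functions registered in these headers.
        using KernelType = void (*)(uint64_t window_cells, uint64_t n_valid_cells,
                                    uint64_t n_channels,
                                    const TInput *const *inptrs, TOutput *outptr);

        virtual ~IGenericDepthfirstStrategy() = default;
        virtual KernelType get_kernel(void) const = 0;
    };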
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
index 2ea5b90561..7925905e64 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,11 +23,12 @@
*/
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -84,30 +85,31 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x26, #0x0\n"
- "cntb x25\n"
- "cntb x24, ALL, MUL #2\n"
- "cntb x23, ALL, MUL #3\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"whilelt p3.b, x26, %x[n_channels]\n"
"whilelt p2.b, x25, %x[n_channels]\n"
"whilelt p1.b, x24, %x[n_channels]\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -122,43 +124,43 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"mov z2.s, #0x0\n"
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
- "subs x22, x22, #0x1\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
@@ -198,219 +200,218 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508a3f1 // sshllb z17.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- ".inst 0x4508a7f0 // sshllt z16.h, z31.b, #0x0\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
- ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
- ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
- ".inst 0x4508a3b0 // sshllb z16.h, z29.b, #0x0\n"
- ".inst 0x4590416b // saddwb z11.s, z11.s, z16.h\n"
- ".inst 0x4590454a // saddwt z10.s, z10.s, z16.h\n"
- ".inst 0x4508a7b0 // sshllt z16.h, z29.b, #0x0\n"
- ".inst 0x45904129 // saddwb z9.s, z9.s, z16.h\n"
- ".inst 0x45904508 // saddwt z8.s, z8.s, z16.h\n"
- ".inst 0x4508a370 // sshllb z16.h, z27.b, #0x0\n"
- ".inst 0x459040e7 // saddwb z7.s, z7.s, z16.h\n"
- ".inst 0x459044c6 // saddwt z6.s, z6.s, z16.h\n"
- ".inst 0x4508a770 // sshllt z16.h, z27.b, #0x0\n"
- ".inst 0x459040a5 // saddwb z5.s, z5.s, z16.h\n"
- ".inst 0x45904484 // saddwt z4.s, z4.s, z16.h\n"
- ".inst 0x4508a330 // sshllb z16.h, z25.b, #0x0\n"
- ".inst 0x45904063 // saddwb z3.s, z3.s, z16.h\n"
- ".inst 0x45904442 // saddwt z2.s, z2.s, z16.h\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z20.s, #0x7f\n"
- "ld1rw { z17.s }, p4/Z, [%x[rescale_ptr]]\n"
- "ld1rw { z16.s }, p4/Z, [%x[shift_ptr]]\n"
- "not z19.s, p4/M, z20.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- ".inst 0x4482920b // srshl z11.s, p4/M, z11.s, z16.s\n"
- ".inst 0x4482920a // srshl z10.s, p4/M, z10.s, z16.s\n"
- ".inst 0x44829209 // srshl z9.s, p4/M, z9.s, z16.s\n"
- ".inst 0x44829208 // srshl z8.s, p4/M, z8.s, z16.s\n"
- ".inst 0x44829207 // srshl z7.s, p4/M, z7.s, z16.s\n"
- ".inst 0x44829206 // srshl z6.s, p4/M, z6.s, z16.s\n"
- ".inst 0x44829205 // srshl z5.s, p4/M, z5.s, z16.s\n"
- ".inst 0x44829204 // srshl z4.s, p4/M, z4.s, z16.s\n"
- ".inst 0x44829203 // srshl z3.s, p4/M, z3.s, z16.s\n"
- ".inst 0x44829202 // srshl z2.s, p4/M, z2.s, z16.s\n"
- ".inst 0x44829201 // srshl z1.s, p4/M, z1.s, z16.s\n"
- ".inst 0x44829200 // srshl z0.s, p4/M, z0.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z19.s\n"
- "smax z14.s, p4/M, z14.s, z19.s\n"
- "smax z13.s, p4/M, z13.s, z19.s\n"
- "smax z12.s, p4/M, z12.s, z19.s\n"
- "smin z15.s, p4/M, z15.s, z20.s\n"
- "smin z14.s, p4/M, z14.s, z20.s\n"
- "smin z13.s, p4/M, z13.s, z20.s\n"
- "smin z12.s, p4/M, z12.s, z20.s\n"
- "smax z11.s, p4/M, z11.s, z19.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smax z10.s, p4/M, z10.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
+ "smin z11.s, p0/M, z11.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smin z11.s, p4/M, z11.s, z20.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z10.s, p4/M, z10.s, z20.s\n"
- "incb x26, ALL, MUL #4\n"
- "smax z9.s, p4/M, z9.s, z19.s\n"
- "smax z8.s, p4/M, z8.s, z19.s\n"
- "smax z7.s, p4/M, z7.s, z19.s\n"
- "smax z6.s, p4/M, z6.s, z19.s\n"
- "trn1 z18.h, z11.h, z10.h\n"
- "smin z9.s, p4/M, z9.s, z20.s\n"
- "smin z8.s, p4/M, z8.s, z20.s\n"
- "smin z7.s, p4/M, z7.s, z20.s\n"
- "smin z6.s, p4/M, z6.s, z20.s\n"
- "smax z5.s, p4/M, z5.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z18.s\n"
+ "smin z9.s, p0/M, z9.s, z18.s\n"
+ "trn1 z17.h, z11.h, z10.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "smin z8.s, p0/M, z8.s, z18.s\n"
+ "smin z7.s, p0/M, z7.s, z18.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "smax z4.s, p4/M, z4.s, z19.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z6.s, p0/M, z6.s, z18.s\n"
+ "smin z5.s, p0/M, z5.s, z18.s\n"
"trn1 z17.h, z7.h, z6.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z5.s, p4/M, z5.s, z20.s\n"
- "incb x25, ALL, MUL #4\n"
- "smin z4.s, p4/M, z4.s, z20.s\n"
- "smax z3.s, p4/M, z3.s, z19.s\n"
- "smax z2.s, p4/M, z2.s, z19.s\n"
- "smax z1.s, p4/M, z1.s, z19.s\n"
- "smax z0.s, p4/M, z0.s, z19.s\n"
+ "st1b { z16.b }, p3, [%x[outptr], x26]\n"
+ "smin z4.s, p0/M, z4.s, z18.s\n"
+ "smin z3.s, p0/M, z3.s, z18.s\n"
"trn1 z16.h, z5.h, z4.h\n"
- "smin z3.s, p4/M, z3.s, z20.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "smin z2.s, p4/M, z2.s, z20.s\n"
- "incb x24, ALL, MUL #4\n"
- "smin z1.s, p4/M, z1.s, z20.s\n"
- "smin z0.s, p4/M, z0.s, z20.s\n"
+ "smin z2.s, p0/M, z2.s, z18.s\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
"trn1 z17.h, z3.h, z2.h\n"
+ "st1b { z16.b }, p2, [%x[outptr], x25]\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
"trn1 z16.h, z1.h, z0.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x23]\n"
- "incb x23, ALL, MUL #4\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "incb x27, ALL, MUL #4\n"
+ "incb x26, ALL, MUL #4\n"
+ "incb x25, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- "subs x22, x22, #0x1\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508a3f1 // sshllb z17.h, z31.b, #0x0\n"
- ".inst 0x4508a7f0 // sshllt z16.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z20.s, #0x7f\n"
- "ld1rw { z17.s }, p4/Z, [%x[rescale_ptr]]\n"
- "ld1rw { z16.s }, p4/Z, [%x[shift_ptr]]\n"
- "not z19.s, p4/M, z20.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z19.s\n"
- "smax z14.s, p4/M, z14.s, z19.s\n"
- "smax z13.s, p4/M, z13.s, z19.s\n"
- "smax z12.s, p4/M, z12.s, z19.s\n"
- "smin z15.s, p4/M, z15.s, z20.s\n"
- "smin z14.s, p4/M, z14.s, z20.s\n"
- "smin z13.s, p4/M, z13.s, z20.s\n"
- "smin z12.s, p4/M, z12.s, z20.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "incb x26\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
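The small C++ change at the top of this file (and repeated in the s8q variant further down) is worth pausing on: rounding `f_rescale_value * 2^31` can yield exactly 2^31, which does not fit in int32_t, so the old code cast an out-of-range value before testing for it. Computing in int64_t first makes the boundary check well defined. A standalone sketch of the fixed pattern, with variable names following the diff (the starting shift value is a placeholder; the kernel derives it from the pooling window size):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        float f_rescale_value = 1.0f; // boundary case: scale rounds to exactly 2^31
        int   shift_value     = -10;  // placeholder

        // Round in 64 bits so the boundary value 2^31 is representable.
        int64_t long_rescale_value =
            static_cast<int64_t>(std::round(f_rescale_value * static_cast<float>(1ll << 31)));
        if (long_rescale_value == (1ll << 31))
        {
            // Fold the overflowing bit into the shift instead.
            shift_value++;
            long_rescale_value >>= 1;
        }
        int32_t rescale_value = static_cast<int32_t>(long_rescale_value); // now in range
        std::printf("rescale=%d shift=%d\n", rescale_value, shift_value);
        return 0;
    }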
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 071e79c93d..ac842ac623 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst
+struct sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
+ using Parent = DepthfirstStrategy<int8_t, int8_t>;
- typedef void (*kern_type)(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
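The tiled 2x2/stride-1 kernel above takes the same refactor one step further: the constexpr geometry accessors become constructor arguments to a DepthfirstStrategy parent (pool rows/cols, stride rows/cols, and, judging by the removed out_rows()/out_cols() accessors, the 2x2 output tile). A hedged sketch of the parent's shape as used here; only the pieces visible in this diff are reproduced, and parameter names beyond the pad_* arguments passed at the call site are guesses:

    template <typename TInput, typename TOutput>
    class DepthfirstStrategy
    {
    public:
        // Signature of the *_impl functions in this file: channel count, input and
        // output pointer arrays, a bool flag (exact meaning is an assumption), and
        // the four padding extents (pad_left, pad_top, pad_right, pad_bottom).
        using KernelType = void (*)(unsigned int, const TInput *const *const,
                                    TOutput *const *const, bool,
                                    unsigned int, unsigned int,
                                    unsigned int, unsigned int);

        DepthfirstStrategy(unsigned int pool_rows, unsigned int pool_cols,
                           unsigned int stride_rows, unsigned int stride_cols,
                           unsigned int out_rows, unsigned int out_cols)
            : m_pool_rows(pool_rows), m_pool_cols(pool_cols),
              m_stride_rows(stride_rows), m_stride_cols(stride_cols),
              m_out_rows(out_rows), m_out_cols(out_cols) {}

        virtual ~DepthfirstStrategy() = default;
        virtual KernelType get_kernel(void) const = 0;

    protected:
        unsigned int m_pool_rows, m_pool_cols;
        unsigned int m_stride_rows, m_stride_cols;
        unsigned int m_out_rows, m_out_cols;
    };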
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index bdf3f53292..5681cc1f3d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -63,84 +63,84 @@ void sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x14, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, x15\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "mov x12, #0x0\n"
- "ldp x11, x10, [x20, #0x0]\n"
- "whilelt p1.b, x13, x14\n"
- "ldp x9, x28, [x20, #0x10]\n"
- "ldp x27, x26, [x19, #0x0]\n"
- "ldp x25, x24, [x19, #0x10]\n"
- "ldp x23, x22, [x19, #0x20]\n"
- "ldp x21, x20, [x19, #0x30]\n"
- "ldr x19, [x19, #0x40]\n"
- "ld1b { z31.b }, p1/Z, [x26, x13]\n"
- "ld1b { z30.b }, p1/Z, [x23, x13]\n"
- "ld1b { z29.b }, p1/Z, [x20, x13]\n"
- "ld1b { z28.b }, p1/Z, [x24, x13]\n"
- "ld1b { z27.b }, p1/Z, [x27, x13]\n"
- "ld1b { z26.b }, p1/Z, [x22, x13]\n"
- "ld1b { z25.b }, p1/Z, [x25, x13]\n"
- "ld1b { z24.b }, p1/Z, [x21, x13]\n"
- "ld1b { z23.b }, p1/Z, [x19, x13]\n"
- "incw x13\n"
- "whilelt p1.b, x13, x14\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p0/Z, [x24, x14]\n"
+ "ld1b { z29.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x14]\n"
+ "incw x14\n"
+ "whilelt p1.b, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
"movprfx z22, z31\n smax z22.b, p2/M, z22.b, z30.b\n"
- "ld1b { z31.b }, p1/Z, [x26, x13]\n"
- "whilelt p0.b, x12, x14\n"
"movprfx z21, z30\n smax z21.b, p2/M, z21.b, z29.b\n"
- "ld1b { z30.b }, p1/Z, [x23, x13]\n"
- "movprfx z18, z28\n smax z18.b, p2/M, z18.b, z27.b\n"
- "ld1b { z29.b }, p1/Z, [x20, x13]\n"
- "movprfx z17, z26\n smax z17.b, p2/M, z17.b, z25.b\n"
- "ld1b { z27.b }, p1/Z, [x27, x13]\n"
- "movprfx z16, z24\n smax z16.b, p2/M, z16.b, z28.b\n"
- "ld1b { z28.b }, p1/Z, [x24, x13]\n"
- "movprfx z20, z26\n smax z20.b, p2/M, z20.b, z23.b\n"
- "ld1b { z26.b }, p1/Z, [x22, x13]\n"
- "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
- "ld1b { z25.b }, p1/Z, [x25, x13]\n"
- "movprfx z18, z22\n smax z18.b, p2/M, z18.b, z17.b\n"
- "ld1b { z24.b }, p1/Z, [x21, x13]\n"
- "movprfx z17, z21\n smax z17.b, p2/M, z17.b, z16.b\n"
- "ld1b { z23.b }, p1/Z, [x19, x13]\n"
- "incw x13\n"
- "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z19.b }, p0, [x11, x12]\n"
- "whilelt p1.b, x13, x14\n"
- "st1b { z18.b }, p0, [x10, x12]\n"
- "st1b { z17.b }, p0, [x9, x12]\n"
- "st1b { z16.b }, p0, [x28, x12]\n"
- "incw x12\n"
+ "ld1b { z31.b }, p1/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p1/Z, [x24, x14]\n"
+ "movprfx z20, z28\n smax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n smax z19.b, p2/M, z19.b, z25.b\n"
+ "ld1b { z29.b }, p1/Z, [x21, x14]\n"
+ "ld1b { z27.b }, p1/Z, [x28, x14]\n"
+ "movprfx z17, z28\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z23.b\n"
+ "ld1b { z28.b }, p1/Z, [x25, x14]\n"
+ "ld1b { z26.b }, p1/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p1/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x22, x14]\n"
+ "whilelt p0.b, x11, x15\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z23.b }, p1/Z, [x20, x14]\n"
+ "incw x14\n"
+ "whilelt p1.b, x14, x15\n"
+ "st1b { z16.b }, p0, [x13, x11]\n"
+ "movprfx z16, z19\n smax z16.b, p2/M, z16.b, z22.b\n"
+ "smax z17.b, p2/M, z17.b, z21.b\n"
+ "st1b { z16.b }, p0, [x12, x11]\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z18.b\n"
+ "st1b { z17.b }, p0, [x10, x11]\n"
+ "st1b { z16.b }, p0, [x9, x11]\n"
+ "incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
"movprfx z22, z31\n smax z22.b, p2/M, z22.b, z30.b\n"
- "whilelt p0.b, x12, x14\n"
"movprfx z21, z30\n smax z21.b, p2/M, z21.b, z29.b\n"
- "movprfx z18, z28\n smax z18.b, p2/M, z18.b, z27.b\n"
- "movprfx z17, z26\n smax z17.b, p2/M, z17.b, z25.b\n"
- "movprfx z16, z24\n smax z16.b, p2/M, z16.b, z28.b\n"
- "movprfx z20, z26\n smax z20.b, p2/M, z20.b, z23.b\n"
- "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
- "st1b { z19.b }, p0, [x11, x12]\n"
- "movprfx z18, z22\n smax z18.b, p2/M, z18.b, z17.b\n"
- "movprfx z17, z21\n smax z17.b, p2/M, z17.b, z16.b\n"
- "st1b { z18.b }, p0, [x10, x12]\n"
- "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z17.b }, p0, [x9, x12]\n"
- "st1b { z16.b }, p0, [x28, x12]\n"
+ "movprfx z20, z28\n smax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n smax z19.b, p2/M, z19.b, z25.b\n"
+ "movprfx z17, z28\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z23.b\n"
+ "whilelt p0.b, x11, x15\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "st1b { z16.b }, p0, [x13, x11]\n"
+ "movprfx z16, z19\n smax z16.b, p2/M, z16.b, z22.b\n"
+ "smax z17.b, p2/M, z17.b, z21.b\n"
+ "st1b { z16.b }, p0, [x12, x11]\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z18.b\n"
+ "st1b { z17.b }, p0, [x10, x11]\n"
+ "st1b { z16.b }, p0, [x9, x11]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp
index 428902ad61..2ee5bc0527 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-struct sve_s8_nhwc_max_generic_depthfirst
+struct sve_s8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_s8_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
sve_s8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_s8_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
index 3e88c8729c..da9e1408f9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -39,185 +40,184 @@ void sve_s8_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntb x27\n"
- "cntb x26, ALL, MUL #2\n"
- "cntb x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"whilelt p3.b, x28, %x[n_channels]\n"
"whilelt p2.b, x27, %x[n_channels]\n"
"whilelt p1.b, x26, %x[n_channels]\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x80\n"
"mov z7.b, #0x80\n"
- "mov x19, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x80\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x80\n"
- "mov z4.b, #0x80\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "smax z21.b, p4/M, z21.b, z26.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "smax z16.b, p4/M, z16.b, z25.b\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "smax z20.b, p4/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "smax z18.b, p4/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "smax z17.b, p4/M, z17.b, z21.b\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "smax z16.b, p4/M, z16.b, z20.b\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "smax z7.b, p4/M, z7.b, z19.b\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "smax z6.b, p4/M, z6.b, z18.b\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "smax z5.b, p4/M, z5.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "smax z4.b, p4/M, z4.b, z16.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
+ "smax z22.b, p0/M, z22.b, z30.b\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
+ "smax z21.b, p0/M, z21.b, z27.b\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z19.b\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z18.b\n"
+ "smax z6.b, p0/M, z6.b, z17.b\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
- "smax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
- "smax z21.b, p4/M, z21.b, z26.b\n"
- "smax z16.b, p4/M, z16.b, z25.b\n"
- "smax z20.b, p4/M, z20.b, z24.b\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "smax z18.b, p4/M, z18.b, z22.b\n"
- "smax z17.b, p4/M, z17.b, z21.b\n"
- "smax z16.b, p4/M, z16.b, z20.b\n"
- "smax z7.b, p4/M, z7.b, z19.b\n"
- "smax z6.b, p4/M, z6.b, z18.b\n"
- "smax z5.b, p4/M, z5.b, z17.b\n"
- "smax z4.b, p4/M, z4.b, z16.b\n"
+ "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
+ "smax z22.b, p0/M, z22.b, z30.b\n"
+ "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
+ "smax z21.b, p0/M, z21.b, z27.b\n"
+ "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "smax z8.b, p0/M, z8.b, z19.b\n"
+ "smax z7.b, p0/M, z7.b, z18.b\n"
+ "smax z6.b, p0/M, z6.b, z17.b\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z7.b, p4/M, z7.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "smax z6.b, p4/M, z6.b, z31.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "smax z5.b, p4/M, z5.b, z28.b\n"
- "smax z4.b, p4/M, z4.b, z16.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z17.b\n"
+ "smax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
+ "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
"st1b { z7.b }, p3, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
"st1b { z6.b }, p2, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
"st1b { z5.b }, p1, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
- "st1b { z4.b }, p0, [%x[outptr], x25]\n"
- "incb x25, ALL, MUL #4\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.b, #0x80\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "smax z7.b, p4/M, z7.b, z19.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "smax z7.b, p4/M, z7.b, z19.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z7.b, p4/M, z7.b, z3.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1b { z7.b }, p3, [%x[outptr], x28]\n"
- "incb x28\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
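One detail that is easy to miss across these max-pooling kernels: the accumulators are seeded with the type's identity for max. `mov z8.b, #0x80` above is int8 -128, while the f32 kernel earlier in this diff uses `mov z8.s, #0xff800000`, the bit pattern of negative infinity. A tiny standalone check of those two constants:

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    int main()
    {
        // 0x80 reinterpreted as int8 is -128, the smallest int8 value.
        const int8_t s8_identity = static_cast<int8_t>(0x80);
        assert(s8_identity == INT8_MIN);

        // 0xff800000 reinterpreted as float is -infinity.
        const uint32_t bits = 0xff800000u;
        float f32_identity;
        std::memcpy(&f32_identity, &bits, sizeof(f32_identity));
        assert(std::isinf(f32_identity) && f32_identity < 0.0f);
        return 0;
    }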
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp
index 1242eaf530..6f34faa121 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-struct sve_s8q_nhwc_avg_generic_depthfirst
+struct sve_s8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_s8q_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
sve_s8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_s8q_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index 928eb412b5..19a3b112ad 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,12 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -86,12 +87,13 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
// Combine together the rescale value for the requantization and the scaling
@@ -112,21 +114,21 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
);
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x26, #0x0\n"
- "cntb x25\n"
- "cntb x24, ALL, MUL #2\n"
- "cntb x23, ALL, MUL #3\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"whilelt p3.b, x26, %x[n_channels]\n"
"whilelt p2.b, x25, %x[n_channels]\n"
"whilelt p1.b, x24, %x[n_channels]\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -141,43 +143,43 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"mov z2.s, #0x0\n"
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
- "subs x22, x22, #0x1\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
@@ -217,241 +219,240 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508a3f1 // sshllb z17.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- ".inst 0x4508a7f0 // sshllt z16.h, z31.b, #0x0\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
- ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
- ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
- ".inst 0x4508a3b0 // sshllb z16.h, z29.b, #0x0\n"
- ".inst 0x4590416b // saddwb z11.s, z11.s, z16.h\n"
- ".inst 0x4590454a // saddwt z10.s, z10.s, z16.h\n"
- ".inst 0x4508a7b0 // sshllt z16.h, z29.b, #0x0\n"
- ".inst 0x45904129 // saddwb z9.s, z9.s, z16.h\n"
- ".inst 0x45904508 // saddwt z8.s, z8.s, z16.h\n"
- ".inst 0x4508a370 // sshllb z16.h, z27.b, #0x0\n"
- ".inst 0x459040e7 // saddwb z7.s, z7.s, z16.h\n"
- ".inst 0x459044c6 // saddwt z6.s, z6.s, z16.h\n"
- ".inst 0x4508a770 // sshllt z16.h, z27.b, #0x0\n"
- ".inst 0x459040a5 // saddwb z5.s, z5.s, z16.h\n"
- ".inst 0x45904484 // saddwt z4.s, z4.s, z16.h\n"
- ".inst 0x4508a330 // sshllb z16.h, z25.b, #0x0\n"
- ".inst 0x45904063 // saddwb z3.s, z3.s, z16.h\n"
- ".inst 0x45904442 // saddwt z2.s, z2.s, z16.h\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z20.s, #0x7f\n"
- "ld1rw { z18.s }, p4/Z, [%x[combined_rescale_value]]\n"
- "ld1rw { z17.s }, p4/Z, [%x[left_shift]]\n"
- "not z19.s, p4/M, z20.s\n"
- "ld1rw { z16.s }, p4/Z, [%x[right_shift]]\n"
- ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
- ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
- ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
- ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
- ".inst 0x4482922b // srshl z11.s, p4/M, z11.s, z17.s\n"
- ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
- ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
- ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
- ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
- ".inst 0x04b2756b // sqrdmulh z11.s, z11.s, z18.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- ".inst 0x4482920b // srshl z11.s, p4/M, z11.s, z16.s\n"
- ".inst 0x4482922a // srshl z10.s, p4/M, z10.s, z17.s\n"
- ".inst 0x44829229 // srshl z9.s, p4/M, z9.s, z17.s\n"
- ".inst 0x44829228 // srshl z8.s, p4/M, z8.s, z17.s\n"
- ".inst 0x44829227 // srshl z7.s, p4/M, z7.s, z17.s\n"
- ".inst 0x04b2754a // sqrdmulh z10.s, z10.s, z18.s\n"
- ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n"
- ".inst 0x04b27508 // sqrdmulh z8.s, z8.s, z18.s\n"
- ".inst 0x04b274e7 // sqrdmulh z7.s, z7.s, z18.s\n"
- ".inst 0x4482920a // srshl z10.s, p4/M, z10.s, z16.s\n"
- ".inst 0x44829209 // srshl z9.s, p4/M, z9.s, z16.s\n"
- ".inst 0x44829208 // srshl z8.s, p4/M, z8.s, z16.s\n"
- ".inst 0x44829207 // srshl z7.s, p4/M, z7.s, z16.s\n"
- ".inst 0x44829226 // srshl z6.s, p4/M, z6.s, z17.s\n"
- ".inst 0x44829225 // srshl z5.s, p4/M, z5.s, z17.s\n"
- ".inst 0x44829224 // srshl z4.s, p4/M, z4.s, z17.s\n"
- ".inst 0x44829223 // srshl z3.s, p4/M, z3.s, z17.s\n"
- ".inst 0x04b274c6 // sqrdmulh z6.s, z6.s, z18.s\n"
- ".inst 0x04b274a5 // sqrdmulh z5.s, z5.s, z18.s\n"
- ".inst 0x04b27484 // sqrdmulh z4.s, z4.s, z18.s\n"
- ".inst 0x04b27463 // sqrdmulh z3.s, z3.s, z18.s\n"
- ".inst 0x44829206 // srshl z6.s, p4/M, z6.s, z16.s\n"
- ".inst 0x44829205 // srshl z5.s, p4/M, z5.s, z16.s\n"
- ".inst 0x44829204 // srshl z4.s, p4/M, z4.s, z16.s\n"
- ".inst 0x44829203 // srshl z3.s, p4/M, z3.s, z16.s\n"
- ".inst 0x44829222 // srshl z2.s, p4/M, z2.s, z17.s\n"
- ".inst 0x44829221 // srshl z1.s, p4/M, z1.s, z17.s\n"
- ".inst 0x44829220 // srshl z0.s, p4/M, z0.s, z17.s\n"
- "smax z15.s, p4/M, z15.s, z19.s\n"
- ".inst 0x04b27442 // sqrdmulh z2.s, z2.s, z18.s\n"
- ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
- ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
- "smin z15.s, p4/M, z15.s, z20.s\n"
- ".inst 0x44829202 // srshl z2.s, p4/M, z2.s, z16.s\n"
- ".inst 0x44829201 // srshl z1.s, p4/M, z1.s, z16.s\n"
- ".inst 0x44829200 // srshl z0.s, p4/M, z0.s, z16.s\n"
- "smax z14.s, p4/M, z14.s, z19.s\n"
- "smax z13.s, p4/M, z13.s, z19.s\n"
- "smax z12.s, p4/M, z12.s, z19.s\n"
- "smax z11.s, p4/M, z11.s, z19.s\n"
- "smin z14.s, p4/M, z14.s, z20.s\n"
- "smin z13.s, p4/M, z13.s, z20.s\n"
- "smin z12.s, p4/M, z12.s, z20.s\n"
- "smin z11.s, p4/M, z11.s, z20.s\n"
+ "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
+ ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
+ ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
+ ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x4482824b // srshl z11.s, p0/M, z11.s, z18.s\n"
+ ".inst 0x4482824a // srshl z10.s, p0/M, z10.s, z18.s\n"
+ ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x44828249 // srshl z9.s, p0/M, z9.s, z18.s\n"
+ ".inst 0x44828248 // srshl z8.s, p0/M, z8.s, z18.s\n"
+ ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
+ ".inst 0x04b1756b // sqrdmulh z11.s, z11.s, z17.s\n"
+ ".inst 0x44828247 // srshl z7.s, p0/M, z7.s, z18.s\n"
+ ".inst 0x44828246 // srshl z6.s, p0/M, z6.s, z18.s\n"
+ ".inst 0x04b1754a // sqrdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x04b17529 // sqrdmulh z9.s, z9.s, z17.s\n"
+ ".inst 0x44828245 // srshl z5.s, p0/M, z5.s, z18.s\n"
+ ".inst 0x44828244 // srshl z4.s, p0/M, z4.s, z18.s\n"
+ ".inst 0x04b17508 // sqrdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x04b174e7 // sqrdmulh z7.s, z7.s, z17.s\n"
+ ".inst 0x44828243 // srshl z3.s, p0/M, z3.s, z18.s\n"
+ ".inst 0x44828242 // srshl z2.s, p0/M, z2.s, z18.s\n"
+ ".inst 0x04b174c6 // sqrdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x04b174a5 // sqrdmulh z5.s, z5.s, z17.s\n"
+ ".inst 0x44828241 // srshl z1.s, p0/M, z1.s, z18.s\n"
+ ".inst 0x44828240 // srshl z0.s, p0/M, z0.s, z18.s\n"
+ ".inst 0x04b17484 // sqrdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x04b17463 // sqrdmulh z3.s, z3.s, z17.s\n"
+ ".inst 0x04b17442 // sqrdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x04b17421 // sqrdmulh z1.s, z1.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x04b17400 // sqrdmulh z0.s, z0.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smax z10.s, p4/M, z10.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
+ "smin z11.s, p0/M, z11.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smax z9.s, p4/M, z9.s, z19.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z10.s, p4/M, z10.s, z20.s\n"
- "incb x26, ALL, MUL #4\n"
- "smin z9.s, p4/M, z9.s, z20.s\n"
- "smax z8.s, p4/M, z8.s, z19.s\n"
- "smax z7.s, p4/M, z7.s, z19.s\n"
- "smax z6.s, p4/M, z6.s, z19.s\n"
- "trn1 z18.h, z11.h, z10.h\n"
- "smin z8.s, p4/M, z8.s, z20.s\n"
- "smin z7.s, p4/M, z7.s, z20.s\n"
- "smin z6.s, p4/M, z6.s, z20.s\n"
- "smax z5.s, p4/M, z5.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z18.s\n"
+ "smin z9.s, p0/M, z9.s, z18.s\n"
+ "trn1 z17.h, z11.h, z10.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "smin z8.s, p0/M, z8.s, z18.s\n"
+ "smin z7.s, p0/M, z7.s, z18.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "smax z4.s, p4/M, z4.s, z19.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z6.s, p0/M, z6.s, z18.s\n"
+ "smin z5.s, p0/M, z5.s, z18.s\n"
"trn1 z17.h, z7.h, z6.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z5.s, p4/M, z5.s, z20.s\n"
- "incb x25, ALL, MUL #4\n"
- "smin z4.s, p4/M, z4.s, z20.s\n"
- "smax z3.s, p4/M, z3.s, z19.s\n"
- "smax z2.s, p4/M, z2.s, z19.s\n"
- "smax z1.s, p4/M, z1.s, z19.s\n"
- "smax z0.s, p4/M, z0.s, z19.s\n"
+ "st1b { z16.b }, p3, [%x[outptr], x26]\n"
+ "smin z4.s, p0/M, z4.s, z18.s\n"
+ "smin z3.s, p0/M, z3.s, z18.s\n"
"trn1 z16.h, z5.h, z4.h\n"
- "smin z3.s, p4/M, z3.s, z20.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "smin z2.s, p4/M, z2.s, z20.s\n"
- "incb x24, ALL, MUL #4\n"
- "smin z1.s, p4/M, z1.s, z20.s\n"
- "smin z0.s, p4/M, z0.s, z20.s\n"
+ "smin z2.s, p0/M, z2.s, z18.s\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
"trn1 z17.h, z3.h, z2.h\n"
+ "st1b { z16.b }, p2, [%x[outptr], x25]\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
"trn1 z16.h, z1.h, z0.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x23]\n"
- "incb x23, ALL, MUL #4\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "incb x27, ALL, MUL #4\n"
+ "incb x26, ALL, MUL #4\n"
+ "incb x25, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- "subs x22, x22, #0x1\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508a3f1 // sshllb z17.h, z31.b, #0x0\n"
- ".inst 0x4508a7f0 // sshllt z16.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z20.s, #0x7f\n"
- "ld1rw { z18.s }, p4/Z, [%x[combined_rescale_value]]\n"
- "ld1rw { z17.s }, p4/Z, [%x[left_shift]]\n"
- "not z19.s, p4/M, z20.s\n"
- "ld1rw { z16.s }, p4/Z, [%x[right_shift]]\n"
- ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
- ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
- ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
- ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
- ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
- ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
- ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
- ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z19.s\n"
- "smax z14.s, p4/M, z14.s, z19.s\n"
- "smax z13.s, p4/M, z13.s, z19.s\n"
- "smax z12.s, p4/M, z12.s, z19.s\n"
- "smin z15.s, p4/M, z15.s, z20.s\n"
- "smin z14.s, p4/M, z14.s, z20.s\n"
- "smin z13.s, p4/M, z13.s, z20.s\n"
- "smin z12.s, p4/M, z12.s, z20.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "incb x26\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [right_shift] "r" (&right_shift)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
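The requantisation tail rewritten above (label 6 onwards) is the same arithmetic under both the old and new register allocation: each widened 32-bit accumulator is rounding-shifted left, scaled with sqrdmulh by the combined rescale value, rounding-shifted right, then clamped between ~0x7f (= -128) and 0x7f via smax/smin. A minimal scalar sketch of that sequence, assuming srshl's sign convention (positive amount shifts left, negative shifts right with rounding) and ignoring sqrdmulh's saturation corner case; the names here are illustrative, not the library's:

    #include <algorithm>
    #include <cstdint>

    // Rounding shift mirroring SVE srshl: positive shifts left, negative
    // shifts right with round-to-nearest.
    static int64_t srshl(int64_t v, int32_t shift)
    {
        return shift >= 0 ? v << shift
                          : (v + (1LL << (-shift - 1))) >> -shift;
    }

    static int8_t requantize_s8(int32_t acc, int32_t left_shift,
                                int32_t combined_rescale, int32_t right_shift)
    {
        int64_t v = srshl(acc, left_shift);              // first srshl
        v = (v * combined_rescale + (1LL << 30)) >> 31;  // sqrdmulh, saturation omitted
        v = srshl(v, right_shift);                       // second srshl (right_shift <= 0)
        return static_cast<int8_t>(std::clamp<int64_t>(v, -128, 127)); // smax/smin clamp
    }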
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp
index 84aa0d3d6b..fc06ed09f6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-struct sve_s8q_nhwc_max_generic_depthfirst
+struct sve_s8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_s8q_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
sve_s8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_s8q_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
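The header rewrite above replaces the per-struct typedefs (operand_type, return_type, kern_type) and the kernel data member with inheritance from a shared strategy interface and a virtual get_kernel() accessor. The real IGenericDepthfirstStrategy is defined elsewhere in the library and is not shown in this diff; as a rough, assumption-laden sketch, an interface of that shape could look like:

    #include <cstdint>

    // Illustrative only: the library's actual interface may declare more
    // (pooling type, accumulator depth, ...) and differ in signature details.
    template <typename TInput, typename TOutput, typename... OutputArgs>
    struct IGenericDepthfirstStrategy
    {
        using KernelType = void (*)(uint64_t window_cells, uint64_t n_valid_cells,
                                    uint64_t n_channels, const TInput *const *inptrs,
                                    TOutput *outptr, const OutputArgs &...extra);
        virtual ~IGenericDepthfirstStrategy() = default;
        virtual KernelType get_kernel() const = 0; // each kernel struct overrides this
    };

The payoff is visible in the diff itself: the concrete struct shrinks to a constructor and a one-line get_kernel() override returning the asm entry point.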
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
index 3717f8cb30..4fc1532d5a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,9 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -41,346 +42,345 @@ void sve_s8q_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntb x27\n"
- "cntb x26, ALL, MUL #2\n"
- "cntb x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"whilelt p3.b, x28, %x[n_channels]\n"
"whilelt p2.b, x27, %x[n_channels]\n"
"whilelt p1.b, x26, %x[n_channels]\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
- "mov x19, %x[inptrs]\n"
"mov z7.b, #0x80\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x80\n"
"mov z5.b, #0x80\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "smax z21.b, p4/M, z21.b, z26.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "smax z16.b, p4/M, z16.b, z25.b\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "smax z20.b, p4/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "smax z18.b, p4/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "smax z17.b, p4/M, z17.b, z21.b\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "smax z16.b, p4/M, z16.b, z20.b\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "smax z8.b, p4/M, z8.b, z19.b\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "smax z7.b, p4/M, z7.b, z18.b\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "smax z6.b, p4/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "smax z5.b, p4/M, z5.b, z16.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
+ "smax z22.b, p0/M, z22.b, z30.b\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
+ "smax z21.b, p0/M, z21.b, z27.b\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z19.b\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z18.b\n"
+ "smax z6.b, p0/M, z6.b, z17.b\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
- "smax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
- "smax z21.b, p4/M, z21.b, z26.b\n"
- "smax z16.b, p4/M, z16.b, z25.b\n"
- "smax z20.b, p4/M, z20.b, z24.b\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "smax z18.b, p4/M, z18.b, z22.b\n"
- "smax z17.b, p4/M, z17.b, z21.b\n"
- "smax z16.b, p4/M, z16.b, z20.b\n"
- "smax z8.b, p4/M, z8.b, z19.b\n"
- "smax z7.b, p4/M, z7.b, z18.b\n"
- "smax z6.b, p4/M, z6.b, z17.b\n"
- "smax z5.b, p4/M, z5.b, z16.b\n"
+ "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
+ "smax z22.b, p0/M, z22.b, z30.b\n"
+ "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
+ "smax z21.b, p0/M, z21.b, z27.b\n"
+ "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "smax z8.b, p0/M, z8.b, z19.b\n"
+ "smax z7.b, p0/M, z7.b, z18.b\n"
+ "smax z6.b, p0/M, z6.b, z17.b\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z8.b, p4/M, z8.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "smax z7.b, p4/M, z7.b, z31.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "smax z6.b, p4/M, z6.b, z28.b\n"
- "smax z5.b, p4/M, z5.b, z16.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z17.b\n"
+ "smax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z4.s, #0x7f\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z3.s }, p4/Z, [x19]\n"
".inst 0x4508a111 // sshllb z17.h, z8.b, #0x0\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508a510 // sshllt z16.h, z8.b, #0x0\n"
- "ld1rw { z2.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- ".inst 0x4508a0f2 // sshllb z18.h, z7.b, #0x0\n"
- "ld1rw { z1.s }, p4/Z, [x19]\n"
- ".inst 0x4508a4f7 // sshllt z23.h, z7.b, #0x0\n"
- ".inst 0x4508a0d6 // sshllb z22.h, z6.b, #0x0\n"
- ".inst 0x4508a4d5 // sshllt z21.h, z6.b, #0x0\n"
- ".inst 0x4508a0b4 // sshllb z20.h, z5.b, #0x0\n"
- ".inst 0x4508a4b3 // sshllt z19.h, z5.b, #0x0\n"
- ".inst 0x4510a220 // sshllb z0.s, z17.h, #0x0\n"
+ ".inst 0x4508a517 // sshllt z23.h, z8.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z4.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a0f6 // sshllb z22.h, z7.b, #0x0\n"
+ ".inst 0x4508a4f5 // sshllt z21.h, z7.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a0d4 // sshllb z20.h, z6.b, #0x0\n"
+ ".inst 0x4508a4d3 // sshllt z19.h, z6.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a0b2 // sshllb z18.h, z5.b, #0x0\n"
+ ".inst 0x4508a4b0 // sshllt z16.h, z5.b, #0x0\n"
+ ".inst 0x4510a221 // sshllb z1.s, z17.h, #0x0\n"
".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
- ".inst 0x4510a21f // sshllb z31.s, z16.h, #0x0\n"
- ".inst 0x4510a610 // sshllt z16.s, z16.h, #0x0\n"
- ".inst 0x4510a25e // sshllb z30.s, z18.h, #0x0\n"
- ".inst 0x4510a652 // sshllt z18.s, z18.h, #0x0\n"
- ".inst 0x4510a2fd // sshllb z29.s, z23.h, #0x0\n"
- ".inst 0x4510a6fc // sshllt z28.s, z23.h, #0x0\n"
- ".inst 0x4510a2db // sshllb z27.s, z22.h, #0x0\n"
- ".inst 0x4510a6da // sshllt z26.s, z22.h, #0x0\n"
- ".inst 0x4510a2b9 // sshllb z25.s, z21.h, #0x0\n"
- ".inst 0x4510a6b8 // sshllt z24.s, z21.h, #0x0\n"
- ".inst 0x4510a297 // sshllb z23.s, z20.h, #0x0\n"
- ".inst 0x4510a696 // sshllt z22.s, z20.h, #0x0\n"
- ".inst 0x4510a275 // sshllb z21.s, z19.h, #0x0\n"
- ".inst 0x4510a674 // sshllt z20.s, z19.h, #0x0\n"
- ".inst 0x44829040 // srshl z0.s, p4/M, z0.s, z2.s\n"
- ".inst 0x44829051 // srshl z17.s, p4/M, z17.s, z2.s\n"
- ".inst 0x4482905f // srshl z31.s, p4/M, z31.s, z2.s\n"
- ".inst 0x44829050 // srshl z16.s, p4/M, z16.s, z2.s\n"
- ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
+ ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
+ ".inst 0x44828091 // srshl z17.s, p0/M, z17.s, z4.s\n"
+ ".inst 0x4510a2e0 // sshllb z0.s, z23.h, #0x0\n"
+ ".inst 0x4510a6ff // sshllt z31.s, z23.h, #0x0\n"
+ ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
+ ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
+ ".inst 0x4510a2de // sshllb z30.s, z22.h, #0x0\n"
+ ".inst 0x4510a6dd // sshllt z29.s, z22.h, #0x0\n"
+ ".inst 0x4482809e // srshl z30.s, p0/M, z30.s, z4.s\n"
+ ".inst 0x4482809d // srshl z29.s, p0/M, z29.s, z4.s\n"
+ ".inst 0x4510a2bc // sshllb z28.s, z21.h, #0x0\n"
+ ".inst 0x4510a6bb // sshllt z27.s, z21.h, #0x0\n"
+ ".inst 0x4482809c // srshl z28.s, p0/M, z28.s, z4.s\n"
+ ".inst 0x4482809b // srshl z27.s, p0/M, z27.s, z4.s\n"
+ ".inst 0x4510a29a // sshllb z26.s, z20.h, #0x0\n"
+ ".inst 0x4510a699 // sshllt z25.s, z20.h, #0x0\n"
+ ".inst 0x4482809a // srshl z26.s, p0/M, z26.s, z4.s\n"
+ ".inst 0x44828099 // srshl z25.s, p0/M, z25.s, z4.s\n"
+ ".inst 0x4510a278 // sshllb z24.s, z19.h, #0x0\n"
+ ".inst 0x4510a677 // sshllt z23.s, z19.h, #0x0\n"
+ ".inst 0x44828098 // srshl z24.s, p0/M, z24.s, z4.s\n"
+ ".inst 0x44828097 // srshl z23.s, p0/M, z23.s, z4.s\n"
+ ".inst 0x4510a256 // sshllb z22.s, z18.h, #0x0\n"
+ ".inst 0x4510a655 // sshllt z21.s, z18.h, #0x0\n"
+ ".inst 0x44828096 // srshl z22.s, p0/M, z22.s, z4.s\n"
+ ".inst 0x44828095 // srshl z21.s, p0/M, z21.s, z4.s\n"
+ ".inst 0x4510a214 // sshllb z20.s, z16.h, #0x0\n"
+ ".inst 0x4510a613 // sshllt z19.s, z16.h, #0x0\n"
+ ".inst 0x44828094 // srshl z20.s, p0/M, z20.s, z4.s\n"
+ ".inst 0x44828093 // srshl z19.s, p0/M, z19.s, z4.s\n"
+ ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
+ ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
+ ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
+ ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x04a37610 // sqrdmulh z16.s, z16.s, z3.s\n"
- ".inst 0x44829020 // srshl z0.s, p4/M, z0.s, z1.s\n"
- ".inst 0x44829031 // srshl z17.s, p4/M, z17.s, z1.s\n"
- ".inst 0x4482903f // srshl z31.s, p4/M, z31.s, z1.s\n"
- ".inst 0x44829030 // srshl z16.s, p4/M, z16.s, z1.s\n"
- ".inst 0x4482905e // srshl z30.s, p4/M, z30.s, z2.s\n"
- ".inst 0x44829052 // srshl z18.s, p4/M, z18.s, z2.s\n"
- ".inst 0x4482905d // srshl z29.s, p4/M, z29.s, z2.s\n"
- ".inst 0x4482905c // srshl z28.s, p4/M, z28.s, z2.s\n"
+ ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
+ ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
- ".inst 0x04a37652 // sqrdmulh z18.s, z18.s, z3.s\n"
".inst 0x04a377bd // sqrdmulh z29.s, z29.s, z3.s\n"
+ ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
+ ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
".inst 0x04a3779c // sqrdmulh z28.s, z28.s, z3.s\n"
- ".inst 0x4482903e // srshl z30.s, p4/M, z30.s, z1.s\n"
- ".inst 0x44829032 // srshl z18.s, p4/M, z18.s, z1.s\n"
- ".inst 0x4482903d // srshl z29.s, p4/M, z29.s, z1.s\n"
- ".inst 0x4482903c // srshl z28.s, p4/M, z28.s, z1.s\n"
- ".inst 0x4482905b // srshl z27.s, p4/M, z27.s, z2.s\n"
- ".inst 0x4482905a // srshl z26.s, p4/M, z26.s, z2.s\n"
- ".inst 0x44829059 // srshl z25.s, p4/M, z25.s, z2.s\n"
- ".inst 0x44829058 // srshl z24.s, p4/M, z24.s, z2.s\n"
".inst 0x04a3777b // sqrdmulh z27.s, z27.s, z3.s\n"
+ ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
+ ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
".inst 0x04a3775a // sqrdmulh z26.s, z26.s, z3.s\n"
".inst 0x04a37739 // sqrdmulh z25.s, z25.s, z3.s\n"
+ ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
+ ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
".inst 0x04a37718 // sqrdmulh z24.s, z24.s, z3.s\n"
- ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
- ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
- ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
- ".inst 0x44829057 // srshl z23.s, p4/M, z23.s, z2.s\n"
- ".inst 0x44829056 // srshl z22.s, p4/M, z22.s, z2.s\n"
- ".inst 0x44829055 // srshl z21.s, p4/M, z21.s, z2.s\n"
- ".inst 0x44829054 // srshl z20.s, p4/M, z20.s, z2.s\n"
".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
+ ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
+ ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
".inst 0x04a376d6 // sqrdmulh z22.s, z22.s, z3.s\n"
".inst 0x04a376b5 // sqrdmulh z21.s, z21.s, z3.s\n"
+ ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
+ ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
".inst 0x04a37694 // sqrdmulh z20.s, z20.s, z3.s\n"
- ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
- ".inst 0x44829036 // srshl z22.s, p4/M, z22.s, z1.s\n"
- ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
- ".inst 0x44829034 // srshl z20.s, p4/M, z20.s, z1.s\n"
- "not z19.s, p4/M, z4.s\n"
- "smax z0.s, p4/M, z0.s, z19.s\n"
- "smax z17.s, p4/M, z17.s, z19.s\n"
- "smax z31.s, p4/M, z31.s, z19.s\n"
- "smax z16.s, p4/M, z16.s, z19.s\n"
- "smin z0.s, p4/M, z0.s, z4.s\n"
- "smin z17.s, p4/M, z17.s, z4.s\n"
- "smin z31.s, p4/M, z31.s, z4.s\n"
- "smin z16.s, p4/M, z16.s, z4.s\n"
- "smax z30.s, p4/M, z30.s, z19.s\n"
- "trn1 z17.h, z0.h, z17.h\n"
- "smax z18.s, p4/M, z18.s, z19.s\n"
- "trn1 z16.h, z31.h, z16.h\n"
- "smin z30.s, p4/M, z30.s, z4.s\n"
+ ".inst 0x04a37673 // sqrdmulh z19.s, z19.s, z3.s\n"
+ ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
+ ".inst 0x44828053 // srshl z19.s, p0/M, z19.s, z2.s\n"
+ "mov z18.s, #0x7f\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smax z31.s, p0/M, z31.s, z16.s\n"
+ "smax z30.s, p0/M, z30.s, z16.s\n"
+ "smax z29.s, p0/M, z29.s, z16.s\n"
+ "smax z28.s, p0/M, z28.s, z16.s\n"
+ "smax z27.s, p0/M, z27.s, z16.s\n"
+ "smax z26.s, p0/M, z26.s, z16.s\n"
+ "smax z25.s, p0/M, z25.s, z16.s\n"
+ "smax z24.s, p0/M, z24.s, z16.s\n"
+ "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
+ "smin z17.s, p0/M, z17.s, z18.s\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
+ "trn1 z17.h, z1.h, z17.h\n"
+ "smin z31.s, p0/M, z31.s, z18.s\n"
+ "smin z30.s, p0/M, z30.s, z18.s\n"
+ "trn1 z16.h, z0.h, z31.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z29.s, p0/M, z29.s, z18.s\n"
+ "smin z28.s, p0/M, z28.s, z18.s\n"
+ "trn1 z17.h, z30.h, z29.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "smin z27.s, p0/M, z27.s, z18.s\n"
+ "smin z26.s, p0/M, z26.s, z18.s\n"
+ "trn1 z16.h, z28.h, z27.h\n"
"trn1 z16.b, z17.b, z16.b\n"
+ "smin z25.s, p0/M, z25.s, z18.s\n"
+ "smin z24.s, p0/M, z24.s, z18.s\n"
+ "trn1 z17.h, z26.h, z25.h\n"
"st1b { z16.b }, p3, [%x[outptr], x28]\n"
- "smin z18.s, p4/M, z18.s, z4.s\n"
- "incb x28, ALL, MUL #4\n"
- "smax z29.s, p4/M, z29.s, z19.s\n"
- "smax z28.s, p4/M, z28.s, z19.s\n"
- "smax z27.s, p4/M, z27.s, z19.s\n"
- "smax z26.s, p4/M, z26.s, z19.s\n"
- "trn1 z18.h, z30.h, z18.h\n"
- "smin z29.s, p4/M, z29.s, z4.s\n"
- "smin z28.s, p4/M, z28.s, z4.s\n"
- "smin z27.s, p4/M, z27.s, z4.s\n"
- "smin z26.s, p4/M, z26.s, z4.s\n"
- "smax z25.s, p4/M, z25.s, z19.s\n"
- "trn1 z16.h, z29.h, z28.h\n"
- "smax z24.s, p4/M, z24.s, z19.s\n"
- "trn1 z17.h, z27.h, z26.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
+ "smin z23.s, p0/M, z23.s, z18.s\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "trn1 z16.h, z24.h, z23.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "trn1 z17.h, z22.h, z21.h\n"
"st1b { z16.b }, p2, [%x[outptr], x27]\n"
- "smin z25.s, p4/M, z25.s, z4.s\n"
- "incb x27, ALL, MUL #4\n"
- "smin z24.s, p4/M, z24.s, z4.s\n"
- "smax z23.s, p4/M, z23.s, z19.s\n"
- "smax z22.s, p4/M, z22.s, z19.s\n"
- "smax z21.s, p4/M, z21.s, z19.s\n"
- "smax z20.s, p4/M, z20.s, z19.s\n"
- "trn1 z16.h, z25.h, z24.h\n"
- "smin z23.s, p4/M, z23.s, z4.s\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
+ "trn1 z16.h, z20.h, z19.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p1, [%x[outptr], x26]\n"
- "smin z22.s, p4/M, z22.s, z4.s\n"
"incb x26, ALL, MUL #4\n"
- "smin z21.s, p4/M, z21.s, z4.s\n"
- "smin z20.s, p4/M, z20.s, z4.s\n"
- "trn1 z17.h, z23.h, z22.h\n"
- "trn1 z16.h, z21.h, z20.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x25]\n"
- "incb x25, ALL, MUL #4\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "incb x9, ALL, MUL #4\n"
+ "incb x28, ALL, MUL #4\n"
+ "incb x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "smax z8.b, p4/M, z8.b, z19.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "smax z8.b, p4/M, z8.b, z19.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z8.b, p4/M, z8.b, z3.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z4.s, #0x7f\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z3.s }, p4/Z, [x19]\n"
".inst 0x4508a111 // sshllb z17.h, z8.b, #0x0\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508a510 // sshllt z16.h, z8.b, #0x0\n"
- "ld1rw { z2.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- ".inst 0x4510a220 // sshllb z0.s, z17.h, #0x0\n"
- "ld1rw { z1.s }, p4/Z, [x19]\n"
- ".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
- ".inst 0x4510a21f // sshllb z31.s, z16.h, #0x0\n"
- ".inst 0x4510a610 // sshllt z16.s, z16.h, #0x0\n"
- ".inst 0x44829040 // srshl z0.s, p4/M, z0.s, z2.s\n"
- ".inst 0x44829051 // srshl z17.s, p4/M, z17.s, z2.s\n"
- ".inst 0x4482905f // srshl z31.s, p4/M, z31.s, z2.s\n"
- ".inst 0x44829050 // srshl z16.s, p4/M, z16.s, z2.s\n"
- ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
- ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x04a37610 // sqrdmulh z16.s, z16.s, z3.s\n"
- ".inst 0x44829020 // srshl z0.s, p4/M, z0.s, z1.s\n"
- ".inst 0x44829031 // srshl z17.s, p4/M, z17.s, z1.s\n"
- ".inst 0x4482903f // srshl z31.s, p4/M, z31.s, z1.s\n"
- ".inst 0x44829030 // srshl z16.s, p4/M, z16.s, z1.s\n"
- "not z19.s, p4/M, z4.s\n"
- "smax z0.s, p4/M, z0.s, z19.s\n"
- "smax z17.s, p4/M, z17.s, z19.s\n"
- "smax z31.s, p4/M, z31.s, z19.s\n"
- "smax z16.s, p4/M, z16.s, z19.s\n"
- "smin z0.s, p4/M, z0.s, z4.s\n"
- "smin z17.s, p4/M, z17.s, z4.s\n"
- "smin z31.s, p4/M, z31.s, z4.s\n"
- "smin z16.s, p4/M, z16.s, z4.s\n"
- "trn1 z17.h, z0.h, z17.h\n"
- "trn1 z16.h, z31.h, z16.h\n"
+ ".inst 0x4508a512 // sshllt z18.h, z8.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a236 // sshllb z22.s, z17.h, #0x0\n"
+ ".inst 0x4510a635 // sshllt z21.s, z17.h, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a254 // sshllb z20.s, z18.h, #0x0\n"
+ ".inst 0x4510a653 // sshllt z19.s, z18.h, #0x0\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x04b17694 // sqrdmulh z20.s, z20.s, z17.s\n"
+ ".inst 0x04b17673 // sqrdmulh z19.s, z19.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "trn1 z17.h, z22.h, z21.h\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
+ "trn1 z16.h, z20.h, z19.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x28]\n"
- "incb x28\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
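For orientation in the max kernel above: the accumulators are seeded with #0x80 (the int8 minimum) and folded with smax across the valid cells, and only the surviving maxima go through the per-layer requantisation (sshll widen, srshl, sqrdmulh, srshl, clamp). A scalar outline of the reduction, with illustrative names:

    #include <algorithm>
    #include <cstdint>

    static void max_pool_channel(const int8_t *const *inptrs,
                                 uint64_t n_valid_cells,
                                 uint64_t channel, int8_t *out)
    {
        int8_t acc = INT8_MIN;               // matches "mov z8.b, #0x80"
        for (uint64_t i = 0; i < n_valid_cells; i++)
            acc = std::max(acc, inptrs[i][channel]);
        *out = acc; // the kernel requantises this before the st1b store
    }

The vector code runs this reduction over four vectors' worth of channels at once and unrolls four input cells per trip through the loop at label 2, which is why n_valid_cells is divided by four and the leftover cells fall through to the single-input loop at label 5.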
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp
index 299e55c9be..714530bc43 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-struct sve_u8_nhwc_avg_generic_depthfirst
+struct sve_u8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_u8_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
sve_u8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_u8_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
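Note the template arity relative to the s8q max header earlier in this diff: the plain u8 average kernel carries no Requantize32 block, so its strategy instantiates the interface with the two tensor types alone and its kernel signature is one argument shorter. Both function-pointer shapes, transcribed from the two impl declarations above (Requantize32Stub is a hypothetical stand-in for the library's parameter block):

    #include <cstdint>

    struct Requantize32Stub; // stand-in; the real Requantize32 lives in the library

    // shape of sve_s8q_nhwc_max_generic_depthfirst_impl
    using QuantKernel = void (*)(uint64_t, uint64_t n_valid_cells, uint64_t n_channels,
                                 const int8_t *const *inptrs, int8_t *outptr,
                                 const Requantize32Stub &qp);

    // shape of sve_u8_nhwc_avg_generic_depthfirst_impl
    using PlainKernel = void (*)(uint64_t window_cells, uint64_t n_valid_cells,
                                 uint64_t n_channels, const uint8_t *const *inptrs,
                                 uint8_t *outptr);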
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
index 51a69a42be..f3f4950a1f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,11 +23,12 @@
*/
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -84,30 +85,31 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
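The widening in the hunk above matters because round(f_rescale_value * 2^31) can land exactly on 2^31, which does not fit in int32_t: the old code narrowed first, so the overflow happened before the "== (1ll << 31)" normalisation could catch it. Keeping the intermediate in int64_t lets the check halve the value and fold the lost factor of two into shift_value before the narrowing cast. A self-contained restatement of the fixed logic (the helper name and out-parameters are illustrative):

    #include <cmath>
    #include <cstdint>

    static void normalise_rescale(float f_rescale_value,
                                  int32_t &rescale_value, int32_t &shift_value)
    {
        int64_t long_rescale_value = static_cast<int64_t>(
            std::round(f_rescale_value * static_cast<float>(1ll << 31)));
        if (long_rescale_value == (1ll << 31)) // hit when f_rescale_value rounds to 1.0
        {
            shift_value++;            // absorb the factor of two into the shift...
            long_rescale_value >>= 1; // ...so the multiplier fits in int32_t
        }
        rescale_value = static_cast<int32_t>(long_rescale_value);
    }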
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x26, #0x0\n"
- "cntb x25\n"
- "cntb x24, ALL, MUL #2\n"
- "cntb x23, ALL, MUL #3\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"whilelt p3.b, x26, %x[n_channels]\n"
"whilelt p2.b, x25, %x[n_channels]\n"
"whilelt p1.b, x24, %x[n_channels]\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -122,43 +124,43 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"mov z2.s, #0x0\n"
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
- "subs x22, x22, #0x1\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
@@ -198,219 +200,218 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508abf1 // ushllb z17.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- ".inst 0x4508aff0 // ushllt z16.h, z31.b, #0x0\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
- ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
- ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
- ".inst 0x4508abb0 // ushllb z16.h, z29.b, #0x0\n"
- ".inst 0x4590496b // uaddwb z11.s, z11.s, z16.h\n"
- ".inst 0x45904d4a // uaddwt z10.s, z10.s, z16.h\n"
- ".inst 0x4508afb0 // ushllt z16.h, z29.b, #0x0\n"
- ".inst 0x45904929 // uaddwb z9.s, z9.s, z16.h\n"
- ".inst 0x45904d08 // uaddwt z8.s, z8.s, z16.h\n"
- ".inst 0x4508ab70 // ushllb z16.h, z27.b, #0x0\n"
- ".inst 0x459048e7 // uaddwb z7.s, z7.s, z16.h\n"
- ".inst 0x45904cc6 // uaddwt z6.s, z6.s, z16.h\n"
- ".inst 0x4508af70 // ushllt z16.h, z27.b, #0x0\n"
- ".inst 0x459048a5 // uaddwb z5.s, z5.s, z16.h\n"
- ".inst 0x45904c84 // uaddwt z4.s, z4.s, z16.h\n"
- ".inst 0x4508ab30 // ushllb z16.h, z25.b, #0x0\n"
- ".inst 0x45904863 // uaddwb z3.s, z3.s, z16.h\n"
- ".inst 0x45904c42 // uaddwt z2.s, z2.s, z16.h\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z20.s, #0x0\n"
- "ld1rw { z17.s }, p4/Z, [%x[rescale_ptr]]\n"
- "mov z19.s, #0xff\n"
- "ld1rw { z16.s }, p4/Z, [%x[shift_ptr]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- ".inst 0x4482920b // srshl z11.s, p4/M, z11.s, z16.s\n"
- ".inst 0x4482920a // srshl z10.s, p4/M, z10.s, z16.s\n"
- ".inst 0x44829209 // srshl z9.s, p4/M, z9.s, z16.s\n"
- ".inst 0x44829208 // srshl z8.s, p4/M, z8.s, z16.s\n"
- ".inst 0x44829207 // srshl z7.s, p4/M, z7.s, z16.s\n"
- ".inst 0x44829206 // srshl z6.s, p4/M, z6.s, z16.s\n"
- ".inst 0x44829205 // srshl z5.s, p4/M, z5.s, z16.s\n"
- ".inst 0x44829204 // srshl z4.s, p4/M, z4.s, z16.s\n"
- ".inst 0x44829203 // srshl z3.s, p4/M, z3.s, z16.s\n"
- ".inst 0x44829202 // srshl z2.s, p4/M, z2.s, z16.s\n"
- ".inst 0x44829201 // srshl z1.s, p4/M, z1.s, z16.s\n"
- ".inst 0x44829200 // srshl z0.s, p4/M, z0.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z20.s\n"
- "smax z14.s, p4/M, z14.s, z20.s\n"
- "smax z13.s, p4/M, z13.s, z20.s\n"
- "smax z12.s, p4/M, z12.s, z20.s\n"
- "smin z15.s, p4/M, z15.s, z19.s\n"
- "smin z14.s, p4/M, z14.s, z19.s\n"
- "smin z13.s, p4/M, z13.s, z19.s\n"
- "smin z12.s, p4/M, z12.s, z19.s\n"
- "smax z11.s, p4/M, z11.s, z20.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "mov z18.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smax z10.s, p4/M, z10.s, z20.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smin z11.s, p4/M, z11.s, z19.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z10.s, p4/M, z10.s, z19.s\n"
- "incb x26, ALL, MUL #4\n"
- "smax z9.s, p4/M, z9.s, z20.s\n"
- "smax z8.s, p4/M, z8.s, z20.s\n"
- "smax z7.s, p4/M, z7.s, z20.s\n"
- "smax z6.s, p4/M, z6.s, z20.s\n"
- "trn1 z18.h, z11.h, z10.h\n"
- "smin z9.s, p4/M, z9.s, z19.s\n"
- "smin z8.s, p4/M, z8.s, z19.s\n"
- "smin z7.s, p4/M, z7.s, z19.s\n"
- "smin z6.s, p4/M, z6.s, z19.s\n"
- "smax z5.s, p4/M, z5.s, z20.s\n"
+ "smin z11.s, p0/M, z11.s, z18.s\n"
+ "smin z10.s, p0/M, z10.s, z18.s\n"
+ "trn1 z17.h, z11.h, z10.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "smin z9.s, p0/M, z9.s, z18.s\n"
+ "smin z8.s, p0/M, z8.s, z18.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "smax z4.s, p4/M, z4.s, z20.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z7.s, p0/M, z7.s, z18.s\n"
+ "smin z6.s, p0/M, z6.s, z18.s\n"
"trn1 z17.h, z7.h, z6.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z5.s, p4/M, z5.s, z19.s\n"
- "incb x25, ALL, MUL #4\n"
- "smin z4.s, p4/M, z4.s, z19.s\n"
- "smax z3.s, p4/M, z3.s, z20.s\n"
- "smax z2.s, p4/M, z2.s, z20.s\n"
- "smax z1.s, p4/M, z1.s, z20.s\n"
- "smax z0.s, p4/M, z0.s, z20.s\n"
+ "st1b { z16.b }, p3, [%x[outptr], x26]\n"
+ "smin z5.s, p0/M, z5.s, z18.s\n"
+ "smin z4.s, p0/M, z4.s, z18.s\n"
"trn1 z16.h, z5.h, z4.h\n"
- "smin z3.s, p4/M, z3.s, z19.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "smin z2.s, p4/M, z2.s, z19.s\n"
- "incb x24, ALL, MUL #4\n"
- "smin z1.s, p4/M, z1.s, z19.s\n"
- "smin z0.s, p4/M, z0.s, z19.s\n"
+ "smin z3.s, p0/M, z3.s, z18.s\n"
+ "smin z2.s, p0/M, z2.s, z18.s\n"
"trn1 z17.h, z3.h, z2.h\n"
+ "st1b { z16.b }, p2, [%x[outptr], x25]\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
"trn1 z16.h, z1.h, z0.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x23]\n"
- "incb x23, ALL, MUL #4\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "incb x27, ALL, MUL #4\n"
+ "incb x26, ALL, MUL #4\n"
+ "incb x25, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- "subs x22, x22, #0x1\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508abf1 // ushllb z17.h, z31.b, #0x0\n"
- ".inst 0x4508aff0 // ushllt z16.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z20.s, #0x0\n"
- "ld1rw { z17.s }, p4/Z, [%x[rescale_ptr]]\n"
- "mov z19.s, #0xff\n"
- "ld1rw { z16.s }, p4/Z, [%x[shift_ptr]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z20.s\n"
- "smax z14.s, p4/M, z14.s, z20.s\n"
- "smax z13.s, p4/M, z13.s, z20.s\n"
- "smax z12.s, p4/M, z12.s, z20.s\n"
- "smin z15.s, p4/M, z15.s, z19.s\n"
- "smin z14.s, p4/M, z14.s, z19.s\n"
- "smin z13.s, p4/M, z13.s, z19.s\n"
- "smin z12.s, p4/M, z12.s, z19.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
"trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "incb x26\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
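The tail sequence above implements the fixed-point requantisation of the averaged sums: sqdmulh by the value at rescale_ptr, a rounding shift via srshl by the value at shift_ptr, a clamp to [0, 255] with smax/smin, and a trn1 pair to narrow back to bytes. A minimal scalar model of one lane follows; the helper name is invented for illustration, the sqdmulh saturation corner case is omitted, and the shift value is assumed non-positive so that srshl acts as a rounding right shift.

    #include <algorithm>
    #include <cstdint>

    // Illustrative scalar model of one lane of the SVE requantisation tail.
    //   acc     : widened uint8 sum accumulated by uaddwb/uaddwt
    //   rescale : value broadcast from rescale_ptr
    //   shift   : value broadcast from shift_ptr (assumed <= 0)
    static inline uint8_t requantize_avg_lane(int32_t acc, int32_t rescale, int32_t shift)
    {
        // sqdmulh: saturating doubling multiply, high half == (acc * rescale) >> 31
        int32_t high = static_cast<int32_t>((static_cast<int64_t>(acc) * rescale) >> 31);

        // srshl with a negative count is a rounding shift right by -shift
        int n = -shift;
        if (n > 0)
            high = static_cast<int32_t>((static_cast<int64_t>(high) + (1ll << (n - 1))) >> n);

        // smax with #0x0 and smin with #0xff clamp into the uint8 range;
        // the trn1 pair then packs the 32-bit lanes back down to bytes.
        return static_cast<uint8_t>(std::min(255, std::max(0, high)));
    }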
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 06df1515ad..eae83b99fe 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst
+struct sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
+ using Parent = DepthfirstStrategy<uint8_t, uint8_t>;
- typedef void (*kern_type)(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
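The header rewrite above is the pattern applied to every strategy struct in this patch: the loose collection of typedefs, constexpr accessors and a public kernel-pointer member becomes a class deriving from a strategy base, with the pooling geometry passed to the base constructor and the kernel exposed through get_kernel(). A schematic of that shape, using a stand-in base class (everything here beyond the names visible in the diff is an assumption about the real DepthfirstStrategy interface):

    #include <cstdint>

    struct CPUInfo;                 // opaque here, as in the kernel headers
    void example_kernel_impl();     // stand-in for the *_impl symbol

    template <typename TIn, typename TOut>
    struct ExampleStrategyBase      // stand-in for DepthfirstStrategy<TIn, TOut>
    {
        using KernelType = void (*)();
        ExampleStrategyBase(unsigned /*pool_rows*/, unsigned /*pool_cols*/,
                            unsigned /*stride_rows*/, unsigned /*stride_cols*/,
                            unsigned /*out_rows*/, unsigned /*out_cols*/) {}
        virtual ~ExampleStrategyBase() = default;
        virtual KernelType get_kernel() const = 0;
    };

    // The post-patch shape: geometry in the constructor, kernel behind an accessor.
    struct example_strategy : public ExampleStrategyBase<uint8_t, uint8_t>
    {
        example_strategy(const CPUInfo *) : ExampleStrategyBase(2, 2, 1, 1, 2, 2) {}
        KernelType get_kernel() const override { return example_kernel_impl; }
    };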
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index e921f345d5..8612555bfb 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -63,84 +63,84 @@ void sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x14, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, x15\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "mov x12, #0x0\n"
- "ldp x11, x10, [x20, #0x0]\n"
- "whilelt p1.b, x13, x14\n"
- "ldp x9, x28, [x20, #0x10]\n"
- "ldp x27, x26, [x19, #0x0]\n"
- "ldp x25, x24, [x19, #0x10]\n"
- "ldp x23, x22, [x19, #0x20]\n"
- "ldp x21, x20, [x19, #0x30]\n"
- "ldr x19, [x19, #0x40]\n"
- "ld1b { z31.b }, p1/Z, [x26, x13]\n"
- "ld1b { z30.b }, p1/Z, [x23, x13]\n"
- "ld1b { z29.b }, p1/Z, [x20, x13]\n"
- "ld1b { z28.b }, p1/Z, [x24, x13]\n"
- "ld1b { z27.b }, p1/Z, [x27, x13]\n"
- "ld1b { z26.b }, p1/Z, [x22, x13]\n"
- "ld1b { z25.b }, p1/Z, [x25, x13]\n"
- "ld1b { z24.b }, p1/Z, [x21, x13]\n"
- "ld1b { z23.b }, p1/Z, [x19, x13]\n"
- "incw x13\n"
- "whilelt p1.b, x13, x14\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p0/Z, [x24, x14]\n"
+ "ld1b { z29.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x14]\n"
+ "incw x14\n"
+ "whilelt p1.b, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
"movprfx z22, z31\n umax z22.b, p2/M, z22.b, z30.b\n"
- "ld1b { z31.b }, p1/Z, [x26, x13]\n"
- "whilelt p0.b, x12, x14\n"
"movprfx z21, z30\n umax z21.b, p2/M, z21.b, z29.b\n"
- "ld1b { z30.b }, p1/Z, [x23, x13]\n"
- "movprfx z18, z28\n umax z18.b, p2/M, z18.b, z27.b\n"
- "ld1b { z29.b }, p1/Z, [x20, x13]\n"
- "movprfx z17, z26\n umax z17.b, p2/M, z17.b, z25.b\n"
- "ld1b { z27.b }, p1/Z, [x27, x13]\n"
- "movprfx z16, z24\n umax z16.b, p2/M, z16.b, z28.b\n"
- "ld1b { z28.b }, p1/Z, [x24, x13]\n"
- "movprfx z20, z26\n umax z20.b, p2/M, z20.b, z23.b\n"
- "ld1b { z26.b }, p1/Z, [x22, x13]\n"
- "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
- "ld1b { z25.b }, p1/Z, [x25, x13]\n"
- "movprfx z18, z22\n umax z18.b, p2/M, z18.b, z17.b\n"
- "ld1b { z24.b }, p1/Z, [x21, x13]\n"
- "movprfx z17, z21\n umax z17.b, p2/M, z17.b, z16.b\n"
- "ld1b { z23.b }, p1/Z, [x19, x13]\n"
- "incw x13\n"
- "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z19.b }, p0, [x11, x12]\n"
- "whilelt p1.b, x13, x14\n"
- "st1b { z18.b }, p0, [x10, x12]\n"
- "st1b { z17.b }, p0, [x9, x12]\n"
- "st1b { z16.b }, p0, [x28, x12]\n"
- "incw x12\n"
+ "ld1b { z31.b }, p1/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p1/Z, [x24, x14]\n"
+ "movprfx z20, z28\n umax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n umax z19.b, p2/M, z19.b, z25.b\n"
+ "ld1b { z29.b }, p1/Z, [x21, x14]\n"
+ "ld1b { z27.b }, p1/Z, [x28, x14]\n"
+ "movprfx z17, z28\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z23.b\n"
+ "ld1b { z28.b }, p1/Z, [x25, x14]\n"
+ "ld1b { z26.b }, p1/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p1/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x22, x14]\n"
+ "whilelt p0.b, x11, x15\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z23.b }, p1/Z, [x20, x14]\n"
+ "incw x14\n"
+ "whilelt p1.b, x14, x15\n"
+ "st1b { z16.b }, p0, [x13, x11]\n"
+ "movprfx z16, z19\n umax z16.b, p2/M, z16.b, z22.b\n"
+ "umax z17.b, p2/M, z17.b, z21.b\n"
+ "st1b { z16.b }, p0, [x12, x11]\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z18.b\n"
+ "st1b { z17.b }, p0, [x10, x11]\n"
+ "st1b { z16.b }, p0, [x9, x11]\n"
+ "incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
"movprfx z22, z31\n umax z22.b, p2/M, z22.b, z30.b\n"
- "whilelt p0.b, x12, x14\n"
"movprfx z21, z30\n umax z21.b, p2/M, z21.b, z29.b\n"
- "movprfx z18, z28\n umax z18.b, p2/M, z18.b, z27.b\n"
- "movprfx z17, z26\n umax z17.b, p2/M, z17.b, z25.b\n"
- "movprfx z16, z24\n umax z16.b, p2/M, z16.b, z28.b\n"
- "movprfx z20, z26\n umax z20.b, p2/M, z20.b, z23.b\n"
- "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
- "st1b { z19.b }, p0, [x11, x12]\n"
- "movprfx z18, z22\n umax z18.b, p2/M, z18.b, z17.b\n"
- "movprfx z17, z21\n umax z17.b, p2/M, z17.b, z16.b\n"
- "st1b { z18.b }, p0, [x10, x12]\n"
- "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z17.b }, p0, [x9, x12]\n"
- "st1b { z16.b }, p0, [x28, x12]\n"
+ "movprfx z20, z28\n umax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n umax z19.b, p2/M, z19.b, z25.b\n"
+ "movprfx z17, z28\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z23.b\n"
+ "whilelt p0.b, x11, x15\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "st1b { z16.b }, p0, [x13, x11]\n"
+ "movprfx z16, z19\n umax z16.b, p2/M, z16.b, z22.b\n"
+ "umax z17.b, p2/M, z17.b, z21.b\n"
+ "st1b { z16.b }, p0, [x12, x11]\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z18.b\n"
+ "st1b { z17.b }, p0, [x10, x11]\n"
+ "st1b { z16.b }, p0, [x9, x11]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
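The kernel above reads the nine input vectors of a 3x3 patch and produces all four 2x2-window maxima per pass, with the umax tree arranged so pairwise maxima shared between adjacent windows are computed once. The per-channel arithmetic it implements is just the stride-1 2x2 max pool; a scalar sketch (array layout illustrative):

    #include <algorithm>
    #include <cstdint>

    // Scalar reference for one channel of the 2x2 stride-1 max pool with a
    // 2x2 output tile: out(i, j) = max of the 2x2 input window at (i, j).
    static void max_2x2_s1_out2x2(const uint8_t in[3][3], uint8_t out[2][2])
    {
        for (int i = 0; i < 2; ++i)
            for (int j = 0; j < 2; ++j)
                out[i][j] = std::max(std::max(in[i][j],     in[i][j + 1]),
                                     std::max(in[i + 1][j], in[i + 1][j + 1]));
    }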
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp
index 59cd4b9c78..9f3c3a435d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-struct sve_u8_nhwc_max_generic_depthfirst
+struct sve_u8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_u8_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
sve_u8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_u8_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
index 164847480b..be0eb398ae 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -39,185 +40,184 @@ void sve_u8_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntb x27\n"
- "cntb x26, ALL, MUL #2\n"
- "cntb x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"whilelt p3.b, x28, %x[n_channels]\n"
"whilelt p2.b, x27, %x[n_channels]\n"
"whilelt p1.b, x26, %x[n_channels]\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x0\n"
"mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
- "mov z4.b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "umax z21.b, p4/M, z21.b, z26.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "umax z16.b, p4/M, z16.b, z25.b\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "umax z20.b, p4/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "umax z18.b, p4/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "umax z17.b, p4/M, z17.b, z21.b\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "umax z16.b, p4/M, z16.b, z20.b\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "umax z7.b, p4/M, z7.b, z19.b\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "umax z6.b, p4/M, z6.b, z18.b\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "umax z5.b, p4/M, z5.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "umax z4.b, p4/M, z4.b, z16.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
+ "umax z22.b, p0/M, z22.b, z30.b\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
+ "umax z21.b, p0/M, z21.b, z27.b\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z19.b\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z18.b\n"
+ "umax z6.b, p0/M, z6.b, z17.b\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
- "umax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
- "umax z21.b, p4/M, z21.b, z26.b\n"
- "umax z16.b, p4/M, z16.b, z25.b\n"
- "umax z20.b, p4/M, z20.b, z24.b\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "umax z18.b, p4/M, z18.b, z22.b\n"
- "umax z17.b, p4/M, z17.b, z21.b\n"
- "umax z16.b, p4/M, z16.b, z20.b\n"
- "umax z7.b, p4/M, z7.b, z19.b\n"
- "umax z6.b, p4/M, z6.b, z18.b\n"
- "umax z5.b, p4/M, z5.b, z17.b\n"
- "umax z4.b, p4/M, z4.b, z16.b\n"
+ "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
+ "umax z22.b, p0/M, z22.b, z30.b\n"
+ "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
+ "umax z21.b, p0/M, z21.b, z27.b\n"
+ "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "umax z8.b, p0/M, z8.b, z19.b\n"
+ "umax z7.b, p0/M, z7.b, z18.b\n"
+ "umax z6.b, p0/M, z6.b, z17.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z7.b, p4/M, z7.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "umax z6.b, p4/M, z6.b, z31.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "umax z5.b, p4/M, z5.b, z28.b\n"
- "umax z4.b, p4/M, z4.b, z16.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z17.b\n"
+ "umax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
+ "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
"st1b { z7.b }, p3, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
"st1b { z6.b }, p2, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
"st1b { z5.b }, p1, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
- "st1b { z4.b }, p0, [%x[outptr], x25]\n"
- "incb x25, ALL, MUL #4\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "umax z7.b, p4/M, z7.b, z19.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "umax z7.b, p4/M, z7.b, z19.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z7.b, p4/M, z7.b, z3.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1b { z7.b }, p3, [%x[outptr], x28]\n"
- "incb x28\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
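Like the other generic kernels in this patch, the routine above walks an arbitrary list of n_valid_cells input pointers, four at a time across four vectors of channels, keeping a running byte-wise maximum that starts from zero (safe for unsigned data). The reduction it performs per channel is simply the following; the function name and scalar form are illustrative only:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Scalar reference for the generic max pool: for every channel, take the
    // maximum of that channel across all valid input cells.
    static void max_generic(std::size_t n_valid_cells, std::size_t n_channels,
                            const uint8_t *const *inptrs, uint8_t *outptr)
    {
        for (std::size_t c = 0; c < n_channels; ++c)
        {
            uint8_t m = 0; // mirrors the "mov z8.b, #0x0" accumulator init
            for (std::size_t cell = 0; cell < n_valid_cells; ++cell)
                m = std::max(m, inptrs[cell][c]);
            outptr[c] = m;
        }
    }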
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp
index f6fc1a58c1..f9d25a1b45 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-struct sve_u8q_nhwc_avg_generic_depthfirst
+struct sve_u8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_u8q_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
sve_u8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_u8q_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
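The quantized average kernel whose implementation diff follows requantises with the Requantize32 parameters rather than the plain rescale/shift pair used earlier in this patch: each accumulator is shifted left, multiplied by the combined rescale with sqrdmulh (rounding doubling multiply returning the high half), shifted right with rounding, offset by the output zero point, then clamped to [0, 255]. A per-lane scalar sketch, with the helper name assumed and saturation corner cases omitted:

    #include <algorithm>
    #include <cstdint>

    // Illustrative scalar model of the u8q requantisation tail:
    // srshl(left) -> sqrdmulh -> srshl(right) -> + output_offset -> clamp.
    static inline uint8_t requantize_u8q_lane(int32_t acc, int32_t left_shift,
                                              int32_t combined_rescale,
                                              int32_t right_shift,
                                              int32_t output_offset)
    {
        int32_t v = acc << left_shift; // left_shift assumed >= 0

        // sqrdmulh: rounded high half of the doubling product
        v = static_cast<int32_t>(
            (static_cast<int64_t>(v) * combined_rescale + (1ll << 30)) >> 31);

        int n = -right_shift;          // right_shift assumed <= 0
        if (n > 0)
            v = static_cast<int32_t>((static_cast<int64_t>(v) + (1ll << (n - 1))) >> n);

        v += output_offset;            // quantized output zero point
        return static_cast<uint8_t>(std::min(255, std::max(0, v)));
    }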
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index 373848ad2b..e8339a2cd9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,12 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -86,12 +87,13 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
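The hunk above is a correctness fix, not just a refactor: the old code narrowed the rounded Q31 multiplier to int32_t first, which is undefined for a value of exactly 1ll << 31, and then compared that int32_t against 1ll << 31, a test that can never be true, so the renormalisation branch never triggered. The new code keeps the intermediate in 64 bits until it is known to fit. Its logic in isolation (helper name assumed):

    #include <cmath>
    #include <cstdint>

    // Outline of the corrected Q31 multiplier normalisation: compute and test
    // in int64_t, halve the value and bump the shift if it would overflow
    // int32_t, and only then narrow.
    static void make_q31_multiplier(float f_rescale_value,
                                    int32_t &rescale_value, int32_t &shift_value)
    {
        int64_t long_rescale_value = static_cast<int64_t>(
            std::round(f_rescale_value * static_cast<float>(1ll << 31)));
        if (long_rescale_value == (1ll << 31))
        {
            shift_value++;
            long_rescale_value >>= 1;
        }
        rescale_value = static_cast<int32_t>(long_rescale_value);
    }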
@@ -117,24 +119,24 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
);
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x26, #0x0\n"
- "cntb x25\n"
- "cntb x24, ALL, MUL #2\n"
- "cntb x23, ALL, MUL #3\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"whilelt p3.b, x26, %x[n_channels]\n"
"whilelt p2.b, x25, %x[n_channels]\n"
"whilelt p1.b, x24, %x[n_channels]\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
- "ld1rw { z15.s }, p4/Z, [%x[accumulator_init]]\n"
+ "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z14.d, z15.d\n"
- "mov x19, %x[inptrs]\n"
"mov z13.d, z15.d\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"mov z12.d, z15.d\n"
"mov z11.d, z15.d\n"
+ "mov x22, %x[inptrs]\n"
"mov z10.d, z15.d\n"
"mov z9.d, z15.d\n"
"mov z8.d, z15.d\n"
@@ -146,43 +148,43 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z2.d, z15.d\n"
"mov z1.d, z15.d\n"
"mov z0.d, z15.d\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
- "subs x22, x22, #0x1\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
@@ -222,265 +224,264 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508abf1 // ushllb z17.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- ".inst 0x4508aff0 // ushllt z16.h, z31.b, #0x0\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
- ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
- ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
- ".inst 0x4508abb0 // ushllb z16.h, z29.b, #0x0\n"
- ".inst 0x4590496b // uaddwb z11.s, z11.s, z16.h\n"
- ".inst 0x45904d4a // uaddwt z10.s, z10.s, z16.h\n"
- ".inst 0x4508afb0 // ushllt z16.h, z29.b, #0x0\n"
- ".inst 0x45904929 // uaddwb z9.s, z9.s, z16.h\n"
- ".inst 0x45904d08 // uaddwt z8.s, z8.s, z16.h\n"
- ".inst 0x4508ab70 // ushllb z16.h, z27.b, #0x0\n"
- ".inst 0x459048e7 // uaddwb z7.s, z7.s, z16.h\n"
- ".inst 0x45904cc6 // uaddwt z6.s, z6.s, z16.h\n"
- ".inst 0x4508af70 // ushllt z16.h, z27.b, #0x0\n"
- ".inst 0x459048a5 // uaddwb z5.s, z5.s, z16.h\n"
- ".inst 0x45904c84 // uaddwt z4.s, z4.s, z16.h\n"
- ".inst 0x4508ab30 // ushllb z16.h, z25.b, #0x0\n"
- ".inst 0x45904863 // uaddwb z3.s, z3.s, z16.h\n"
- ".inst 0x45904c42 // uaddwt z2.s, z2.s, z16.h\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z21.s, #0x0\n"
- "ld1rw { z20.s }, p4/Z, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "mov z19.s, #0xff\n"
- "ld1rw { z18.s }, p4/Z, [%x[left_shift]]\n"
- "ld1rw { z17.s }, p4/Z, [%x[right_shift]]\n"
- ".inst 0x4482924f // srshl z15.s, p4/M, z15.s, z18.s\n"
- "ld1rw { z16.s }, p4/Z, [x19]\n"
- ".inst 0x4482924e // srshl z14.s, p4/M, z14.s, z18.s\n"
- ".inst 0x4482924d // srshl z13.s, p4/M, z13.s, z18.s\n"
- ".inst 0x4482924c // srshl z12.s, p4/M, z12.s, z18.s\n"
- ".inst 0x4482924b // srshl z11.s, p4/M, z11.s, z18.s\n"
- ".inst 0x04b475ef // sqrdmulh z15.s, z15.s, z20.s\n"
- ".inst 0x04b475ce // sqrdmulh z14.s, z14.s, z20.s\n"
- ".inst 0x04b475ad // sqrdmulh z13.s, z13.s, z20.s\n"
- ".inst 0x04b4758c // sqrdmulh z12.s, z12.s, z20.s\n"
- ".inst 0x04b4756b // sqrdmulh z11.s, z11.s, z20.s\n"
- ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
- ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
- ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
- ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
+ "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
+ ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
+ ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
+ ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x4482824b // srshl z11.s, p0/M, z11.s, z18.s\n"
+ ".inst 0x4482824a // srshl z10.s, p0/M, z10.s, z18.s\n"
+ ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x44828249 // srshl z9.s, p0/M, z9.s, z18.s\n"
+ ".inst 0x44828248 // srshl z8.s, p0/M, z8.s, z18.s\n"
+ ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
+ ".inst 0x04b0756b // sqrdmulh z11.s, z11.s, z16.s\n"
+ ".inst 0x44828247 // srshl z7.s, p0/M, z7.s, z18.s\n"
+ ".inst 0x44828246 // srshl z6.s, p0/M, z6.s, z18.s\n"
+ ".inst 0x04b0754a // sqrdmulh z10.s, z10.s, z16.s\n"
+ ".inst 0x04b07529 // sqrdmulh z9.s, z9.s, z16.s\n"
+ ".inst 0x44828245 // srshl z5.s, p0/M, z5.s, z18.s\n"
+ ".inst 0x44828244 // srshl z4.s, p0/M, z4.s, z18.s\n"
+ ".inst 0x04b07508 // sqrdmulh z8.s, z8.s, z16.s\n"
+ ".inst 0x04b074e7 // sqrdmulh z7.s, z7.s, z16.s\n"
+ ".inst 0x44828243 // srshl z3.s, p0/M, z3.s, z18.s\n"
+ ".inst 0x44828242 // srshl z2.s, p0/M, z2.s, z18.s\n"
+ ".inst 0x04b074c6 // sqrdmulh z6.s, z6.s, z16.s\n"
+ ".inst 0x04b074a5 // sqrdmulh z5.s, z5.s, z16.s\n"
+ ".inst 0x44828241 // srshl z1.s, p0/M, z1.s, z18.s\n"
+ ".inst 0x44828240 // srshl z0.s, p0/M, z0.s, z18.s\n"
+ ".inst 0x04b07484 // sqrdmulh z4.s, z4.s, z16.s\n"
+ ".inst 0x04b07463 // sqrdmulh z3.s, z3.s, z16.s\n"
+ ".inst 0x04b07442 // sqrdmulh z2.s, z2.s, z16.s\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x04b07400 // sqrdmulh z0.s, z0.s, z16.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
+ ".inst 0x4482822b // srshl z11.s, p0/M, z11.s, z17.s\n"
"add z15.s, z15.s, z16.s\n"
"add z14.s, z14.s, z16.s\n"
+ ".inst 0x4482822a // srshl z10.s, p0/M, z10.s, z17.s\n"
+ ".inst 0x44828229 // srshl z9.s, p0/M, z9.s, z17.s\n"
"add z13.s, z13.s, z16.s\n"
"add z12.s, z12.s, z16.s\n"
- ".inst 0x4482922b // srshl z11.s, p4/M, z11.s, z17.s\n"
- ".inst 0x4482924a // srshl z10.s, p4/M, z10.s, z18.s\n"
- ".inst 0x44829249 // srshl z9.s, p4/M, z9.s, z18.s\n"
- ".inst 0x44829248 // srshl z8.s, p4/M, z8.s, z18.s\n"
+ ".inst 0x44828228 // srshl z8.s, p0/M, z8.s, z17.s\n"
+ ".inst 0x44828227 // srshl z7.s, p0/M, z7.s, z17.s\n"
"add z11.s, z11.s, z16.s\n"
- ".inst 0x04b4754a // sqrdmulh z10.s, z10.s, z20.s\n"
- ".inst 0x04b47529 // sqrdmulh z9.s, z9.s, z20.s\n"
- ".inst 0x04b47508 // sqrdmulh z8.s, z8.s, z20.s\n"
- ".inst 0x44829247 // srshl z7.s, p4/M, z7.s, z18.s\n"
- ".inst 0x4482922a // srshl z10.s, p4/M, z10.s, z17.s\n"
- ".inst 0x44829229 // srshl z9.s, p4/M, z9.s, z17.s\n"
- ".inst 0x44829228 // srshl z8.s, p4/M, z8.s, z17.s\n"
- ".inst 0x04b474e7 // sqrdmulh z7.s, z7.s, z20.s\n"
"add z10.s, z10.s, z16.s\n"
+ ".inst 0x44828226 // srshl z6.s, p0/M, z6.s, z17.s\n"
+ ".inst 0x44828225 // srshl z5.s, p0/M, z5.s, z17.s\n"
"add z9.s, z9.s, z16.s\n"
"add z8.s, z8.s, z16.s\n"
- ".inst 0x44829227 // srshl z7.s, p4/M, z7.s, z17.s\n"
- ".inst 0x44829246 // srshl z6.s, p4/M, z6.s, z18.s\n"
- ".inst 0x44829245 // srshl z5.s, p4/M, z5.s, z18.s\n"
- ".inst 0x44829244 // srshl z4.s, p4/M, z4.s, z18.s\n"
+ ".inst 0x44828224 // srshl z4.s, p0/M, z4.s, z17.s\n"
+ ".inst 0x44828223 // srshl z3.s, p0/M, z3.s, z17.s\n"
"add z7.s, z7.s, z16.s\n"
- ".inst 0x04b474c6 // sqrdmulh z6.s, z6.s, z20.s\n"
- ".inst 0x04b474a5 // sqrdmulh z5.s, z5.s, z20.s\n"
- ".inst 0x04b47484 // sqrdmulh z4.s, z4.s, z20.s\n"
- ".inst 0x44829243 // srshl z3.s, p4/M, z3.s, z18.s\n"
- ".inst 0x44829226 // srshl z6.s, p4/M, z6.s, z17.s\n"
- ".inst 0x44829225 // srshl z5.s, p4/M, z5.s, z17.s\n"
- ".inst 0x44829224 // srshl z4.s, p4/M, z4.s, z17.s\n"
- ".inst 0x04b47463 // sqrdmulh z3.s, z3.s, z20.s\n"
"add z6.s, z6.s, z16.s\n"
+ ".inst 0x44828222 // srshl z2.s, p0/M, z2.s, z17.s\n"
+ ".inst 0x44828221 // srshl z1.s, p0/M, z1.s, z17.s\n"
"add z5.s, z5.s, z16.s\n"
"add z4.s, z4.s, z16.s\n"
- ".inst 0x44829223 // srshl z3.s, p4/M, z3.s, z17.s\n"
- ".inst 0x44829242 // srshl z2.s, p4/M, z2.s, z18.s\n"
- ".inst 0x44829241 // srshl z1.s, p4/M, z1.s, z18.s\n"
- ".inst 0x44829240 // srshl z0.s, p4/M, z0.s, z18.s\n"
+ ".inst 0x44828220 // srshl z0.s, p0/M, z0.s, z17.s\n"
"add z3.s, z3.s, z16.s\n"
- ".inst 0x04b47442 // sqrdmulh z2.s, z2.s, z20.s\n"
- ".inst 0x04b47421 // sqrdmulh z1.s, z1.s, z20.s\n"
- ".inst 0x04b47400 // sqrdmulh z0.s, z0.s, z20.s\n"
- "smax z15.s, p4/M, z15.s, z21.s\n"
- ".inst 0x44829222 // srshl z2.s, p4/M, z2.s, z17.s\n"
- ".inst 0x44829221 // srshl z1.s, p4/M, z1.s, z17.s\n"
- ".inst 0x44829220 // srshl z0.s, p4/M, z0.s, z17.s\n"
- "smin z15.s, p4/M, z15.s, z19.s\n"
"add z2.s, z2.s, z16.s\n"
"add z1.s, z1.s, z16.s\n"
"add z0.s, z0.s, z16.s\n"
- "smax z14.s, p4/M, z14.s, z21.s\n"
- "smax z13.s, p4/M, z13.s, z21.s\n"
- "smax z12.s, p4/M, z12.s, z21.s\n"
- "smax z11.s, p4/M, z11.s, z21.s\n"
- "smin z14.s, p4/M, z14.s, z19.s\n"
- "smin z13.s, p4/M, z13.s, z19.s\n"
- "smin z12.s, p4/M, z12.s, z19.s\n"
- "smin z11.s, p4/M, z11.s, z19.s\n"
+ "mov z16.s, #0x0\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "mov z18.s, #0xff\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smax z10.s, p4/M, z10.s, z21.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smax z9.s, p4/M, z9.s, z21.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z10.s, p4/M, z10.s, z19.s\n"
- "incb x26, ALL, MUL #4\n"
- "smin z9.s, p4/M, z9.s, z19.s\n"
- "smax z8.s, p4/M, z8.s, z21.s\n"
- "smax z7.s, p4/M, z7.s, z21.s\n"
- "smax z6.s, p4/M, z6.s, z21.s\n"
- "trn1 z18.h, z11.h, z10.h\n"
- "smin z8.s, p4/M, z8.s, z19.s\n"
- "smin z7.s, p4/M, z7.s, z19.s\n"
- "smin z6.s, p4/M, z6.s, z19.s\n"
- "smax z5.s, p4/M, z5.s, z21.s\n"
+ "smin z11.s, p0/M, z11.s, z18.s\n"
+ "smin z10.s, p0/M, z10.s, z18.s\n"
+ "trn1 z17.h, z11.h, z10.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "smin z9.s, p0/M, z9.s, z18.s\n"
+ "smin z8.s, p0/M, z8.s, z18.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "smax z4.s, p4/M, z4.s, z21.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z7.s, p0/M, z7.s, z18.s\n"
+ "smin z6.s, p0/M, z6.s, z18.s\n"
"trn1 z17.h, z7.h, z6.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z5.s, p4/M, z5.s, z19.s\n"
- "incb x25, ALL, MUL #4\n"
- "smin z4.s, p4/M, z4.s, z19.s\n"
- "smax z3.s, p4/M, z3.s, z21.s\n"
- "smax z2.s, p4/M, z2.s, z21.s\n"
- "smax z1.s, p4/M, z1.s, z21.s\n"
- "smax z0.s, p4/M, z0.s, z21.s\n"
+ "st1b { z16.b }, p3, [%x[outptr], x26]\n"
+ "smin z5.s, p0/M, z5.s, z18.s\n"
+ "smin z4.s, p0/M, z4.s, z18.s\n"
"trn1 z16.h, z5.h, z4.h\n"
- "smin z3.s, p4/M, z3.s, z19.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "smin z2.s, p4/M, z2.s, z19.s\n"
- "incb x24, ALL, MUL #4\n"
- "smin z1.s, p4/M, z1.s, z19.s\n"
- "smin z0.s, p4/M, z0.s, z19.s\n"
+ "smin z3.s, p0/M, z3.s, z18.s\n"
+ "smin z2.s, p0/M, z2.s, z18.s\n"
"trn1 z17.h, z3.h, z2.h\n"
+ "st1b { z16.b }, p2, [%x[outptr], x25]\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
"trn1 z16.h, z1.h, z0.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x23]\n"
- "incb x23, ALL, MUL #4\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "incb x27, ALL, MUL #4\n"
+ "incb x26, ALL, MUL #4\n"
+ "incb x25, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "ld1rw { z15.s }, p4/Z, [%x[accumulator_init]]\n"
+ "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z14.d, z15.d\n"
- "mov x19, %x[inptrs]\n"
"mov z13.d, z15.d\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"mov z12.d, z15.d\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- "subs x22, x22, #0x1\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508abf1 // ushllb z17.h, z31.b, #0x0\n"
- ".inst 0x4508aff0 // ushllt z16.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z21.s, #0x0\n"
- "ld1rw { z20.s }, p4/Z, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "mov z19.s, #0xff\n"
- "ld1rw { z18.s }, p4/Z, [%x[left_shift]]\n"
- "ld1rw { z17.s }, p4/Z, [%x[right_shift]]\n"
- ".inst 0x4482924f // srshl z15.s, p4/M, z15.s, z18.s\n"
- "ld1rw { z16.s }, p4/Z, [x19]\n"
- ".inst 0x4482924e // srshl z14.s, p4/M, z14.s, z18.s\n"
- ".inst 0x4482924d // srshl z13.s, p4/M, z13.s, z18.s\n"
- ".inst 0x4482924c // srshl z12.s, p4/M, z12.s, z18.s\n"
- ".inst 0x04b475ef // sqrdmulh z15.s, z15.s, z20.s\n"
- ".inst 0x04b475ce // sqrdmulh z14.s, z14.s, z20.s\n"
- ".inst 0x04b475ad // sqrdmulh z13.s, z13.s, z20.s\n"
- ".inst 0x04b4758c // sqrdmulh z12.s, z12.s, z20.s\n"
- ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
- ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
- ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
- ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
"add z15.s, z15.s, z16.s\n"
"add z14.s, z14.s, z16.s\n"
"add z13.s, z13.s, z16.s\n"
"add z12.s, z12.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z21.s\n"
- "smax z14.s, p4/M, z14.s, z21.s\n"
- "smax z13.s, p4/M, z13.s, z21.s\n"
- "smax z12.s, p4/M, z12.s, z21.s\n"
- "smin z15.s, p4/M, z15.s, z19.s\n"
- "smin z14.s, p4/M, z14.s, z19.s\n"
- "smin z13.s, p4/M, z13.s, z19.s\n"
- "smin z12.s, p4/M, z12.s, z19.s\n"
+ "mov z17.s, #0x0\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "mov z16.s, #0xff\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
"trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "incb x26\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [accumulator_init] "r" (&accumulator_init), [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [outptr] "r" (outptr), [quant_params] "r" (&qp), [right_shift] "r" (&right_shift)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
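The rescaling arithmetic at label 13 above is the same before and after the rewrite; only the register allocation and instruction scheduling change. For reference, each output lane follows the SRSHL / SQRDMULH / SRSHL / add-offset / clamp sequence, modelled below as a minimal scalar C++ sketch. Only the instruction semantics are assumed; the helper and parameter names are illustrative, not code from the library.

#include <algorithm>
#include <cstdint>

// Rounding shift by a signed amount (scalar model of SRSHL): a positive
// shift moves left, a negative shift moves right with round-to-nearest.
static int32_t srshl(int32_t v, int32_t shift)
{
    if (shift >= 0)
    {
        return static_cast<int32_t>(static_cast<int64_t>(v) << shift);
    }
    const int64_t round = int64_t{1} << (-shift - 1);
    return static_cast<int32_t>((static_cast<int64_t>(v) + round) >> -shift);
}

// Saturating rounding doubling multiply-high (scalar model of SQRDMULH).
static int32_t sqrdmulh(int32_t a, int32_t b)
{
    const int64_t prod = 2 * static_cast<int64_t>(a) * static_cast<int64_t>(b)
                       + (int64_t{1} << 31);
    return static_cast<int32_t>(std::clamp<int64_t>(prod >> 32, INT32_MIN, INT32_MAX));
}

// One output lane of the average-pooling requantisation: acc holds
// *accumulator_init plus the widened sum of the window's u8 inputs.
uint8_t requantize_lane(int32_t acc, int32_t left_shift,
                        int32_t combined_rescale_value, int32_t right_shift,
                        int32_t output_offset)
{
    acc = srshl(acc, left_shift);                  // pre-shift
    acc = sqrdmulh(acc, combined_rescale_value);   // fixed-point multiply
    acc = srshl(acc, right_shift);                 // rounding right shift
    acc += output_offset;                          // re-centre on the zero point
    return static_cast<uint8_t>(std::clamp<int32_t>(acc, 0, 255));
}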
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp
index c3c0edd0d5..eece6c0578 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-struct sve_u8q_nhwc_max_generic_depthfirst
+struct sve_u8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_u8q_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
sve_u8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_u8q_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
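This header change is representative of the refactor applied across the patch: each kernel struct drops its hand-written operand_type/return_type typedefs, kern_type alias, and kernel member, and instead overrides get_kernel() on a shared strategy interface. A hypothetical, simplified model of that pattern is sketched below; the real IGenericDepthfirstStrategy is declared elsewhere in the library, and only the KernelType signature is taken from the declaration in this file.

#include <cstdint>

// Hypothetical sketch of the strategy interface adopted above, not the
// library's actual declaration.
template <typename TInput, typename TOutput, typename OutputStage>
struct IGenericDepthfirstStrategyModel
{
    // Matches the sve_u8q_nhwc_max_generic_depthfirst_impl signature above.
    using KernelType = void (*)(uint64_t, uint64_t n_valid_cells, uint64_t n_channels,
                                const TInput *const *inptrs, TOutput *outptr,
                                const OutputStage &qp);
    virtual ~IGenericDepthfirstStrategyModel() = default;
    virtual KernelType get_kernel() const = 0;
};

Concrete kernels then become one-liners, as in the diff above: the struct inherits the interface and returns its implementation function from get_kernel().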
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
index c1c1d29613..94522cdaaa 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,9 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -41,376 +42,375 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntb x27\n"
- "cntb x26, ALL, MUL #2\n"
- "cntb x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"whilelt p3.b, x28, %x[n_channels]\n"
"whilelt p2.b, x27, %x[n_channels]\n"
"whilelt p1.b, x26, %x[n_channels]\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
- "mov z10.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "mov z9.b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x0\n"
"mov z7.b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z6.b, #0x0\n"
+ "mov z5.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "umax z21.b, p4/M, z21.b, z26.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "umax z16.b, p4/M, z16.b, z25.b\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "umax z20.b, p4/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "umax z18.b, p4/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "umax z17.b, p4/M, z17.b, z21.b\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "umax z16.b, p4/M, z16.b, z20.b\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "umax z10.b, p4/M, z10.b, z19.b\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "umax z9.b, p4/M, z9.b, z18.b\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "umax z8.b, p4/M, z8.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "umax z7.b, p4/M, z7.b, z16.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
+ "umax z22.b, p0/M, z22.b, z30.b\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
+ "umax z21.b, p0/M, z21.b, z27.b\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z19.b\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z18.b\n"
+ "umax z6.b, p0/M, z6.b, z17.b\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
- "umax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
- "umax z21.b, p4/M, z21.b, z26.b\n"
- "umax z16.b, p4/M, z16.b, z25.b\n"
- "umax z20.b, p4/M, z20.b, z24.b\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "umax z18.b, p4/M, z18.b, z22.b\n"
- "umax z17.b, p4/M, z17.b, z21.b\n"
- "umax z16.b, p4/M, z16.b, z20.b\n"
- "umax z10.b, p4/M, z10.b, z19.b\n"
- "umax z9.b, p4/M, z9.b, z18.b\n"
- "umax z8.b, p4/M, z8.b, z17.b\n"
- "umax z7.b, p4/M, z7.b, z16.b\n"
+ "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
+ "umax z22.b, p0/M, z22.b, z30.b\n"
+ "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
+ "umax z21.b, p0/M, z21.b, z27.b\n"
+ "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "umax z8.b, p0/M, z8.b, z19.b\n"
+ "umax z7.b, p0/M, z7.b, z18.b\n"
+ "umax z6.b, p0/M, z6.b, z17.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z10.b, p4/M, z10.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "umax z9.b, p4/M, z9.b, z31.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "umax z8.b, p4/M, z8.b, z28.b\n"
- "umax z7.b, p4/M, z7.b, z16.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z17.b\n"
+ "umax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z6.s, #0x0\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z5.s }, p4/Z, [x19]\n"
- "mov z4.s, #0xff\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- ".inst 0x4508a951 // ushllb z17.h, z10.b, #0x0\n"
- "ld1rw { z3.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508ad50 // ushllt z16.h, z10.b, #0x0\n"
- "ld1rw { z2.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- ".inst 0x4508a937 // ushllb z23.h, z9.b, #0x0\n"
- "ld1rw { z1.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x4508ad36 // ushllt z22.h, z9.b, #0x0\n"
- "ld1rw { z0.s }, p4/Z, [x19]\n"
- ".inst 0x4508a912 // ushllb z18.h, z8.b, #0x0\n"
- ".inst 0x4508ad15 // ushllt z21.h, z8.b, #0x0\n"
- ".inst 0x4508a8f4 // ushllb z20.h, z7.b, #0x0\n"
- ".inst 0x4508acf3 // ushllt z19.h, z7.b, #0x0\n"
- "neg z5.s, p4/M, z5.s\n"
- ".inst 0x459140bf // saddwb z31.s, z5.s, z17.h\n"
- ".inst 0x459144b1 // saddwt z17.s, z5.s, z17.h\n"
- ".inst 0x459040be // saddwb z30.s, z5.s, z16.h\n"
- ".inst 0x459044b0 // saddwt z16.s, z5.s, z16.h\n"
- ".inst 0x459740bd // saddwb z29.s, z5.s, z23.h\n"
- ".inst 0x459744bc // saddwt z28.s, z5.s, z23.h\n"
- ".inst 0x459640bb // saddwb z27.s, z5.s, z22.h\n"
- ".inst 0x459644ba // saddwt z26.s, z5.s, z22.h\n"
- ".inst 0x459240b9 // saddwb z25.s, z5.s, z18.h\n"
- ".inst 0x459244b2 // saddwt z18.s, z5.s, z18.h\n"
- ".inst 0x459540b8 // saddwb z24.s, z5.s, z21.h\n"
- ".inst 0x459544b7 // saddwt z23.s, z5.s, z21.h\n"
- ".inst 0x459440b6 // saddwb z22.s, z5.s, z20.h\n"
- ".inst 0x459444b5 // saddwt z21.s, z5.s, z20.h\n"
- ".inst 0x459340b4 // saddwb z20.s, z5.s, z19.h\n"
- ".inst 0x459344b3 // saddwt z19.s, z5.s, z19.h\n"
- ".inst 0x4482905f // srshl z31.s, p4/M, z31.s, z2.s\n"
- ".inst 0x44829051 // srshl z17.s, p4/M, z17.s, z2.s\n"
- ".inst 0x4482905e // srshl z30.s, p4/M, z30.s, z2.s\n"
- ".inst 0x44829050 // srshl z16.s, p4/M, z16.s, z2.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
- ".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
- ".inst 0x04a37610 // sqrdmulh z16.s, z16.s, z3.s\n"
- ".inst 0x4482903f // srshl z31.s, p4/M, z31.s, z1.s\n"
- ".inst 0x44829031 // srshl z17.s, p4/M, z17.s, z1.s\n"
- ".inst 0x4482903e // srshl z30.s, p4/M, z30.s, z1.s\n"
- ".inst 0x44829030 // srshl z16.s, p4/M, z16.s, z1.s\n"
- "add z31.s, z31.s, z0.s\n"
- "add z17.s, z17.s, z0.s\n"
- "add z30.s, z30.s, z0.s\n"
- "add z16.s, z16.s, z0.s\n"
- ".inst 0x4482905d // srshl z29.s, p4/M, z29.s, z2.s\n"
- ".inst 0x4482905c // srshl z28.s, p4/M, z28.s, z2.s\n"
- ".inst 0x4482905b // srshl z27.s, p4/M, z27.s, z2.s\n"
- ".inst 0x4482905a // srshl z26.s, p4/M, z26.s, z2.s\n"
- ".inst 0x04a377bd // sqrdmulh z29.s, z29.s, z3.s\n"
- ".inst 0x04a3779c // sqrdmulh z28.s, z28.s, z3.s\n"
- ".inst 0x04a3777b // sqrdmulh z27.s, z27.s, z3.s\n"
- ".inst 0x04a3775a // sqrdmulh z26.s, z26.s, z3.s\n"
- ".inst 0x4482903d // srshl z29.s, p4/M, z29.s, z1.s\n"
- ".inst 0x4482903c // srshl z28.s, p4/M, z28.s, z1.s\n"
- ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
- ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
- "add z29.s, z29.s, z0.s\n"
- "add z28.s, z28.s, z0.s\n"
- "add z27.s, z27.s, z0.s\n"
- "add z26.s, z26.s, z0.s\n"
- ".inst 0x44829059 // srshl z25.s, p4/M, z25.s, z2.s\n"
- ".inst 0x44829052 // srshl z18.s, p4/M, z18.s, z2.s\n"
- "smax z31.s, p4/M, z31.s, z6.s\n"
- "smax z17.s, p4/M, z17.s, z6.s\n"
- ".inst 0x04a37739 // sqrdmulh z25.s, z25.s, z3.s\n"
- ".inst 0x04a37652 // sqrdmulh z18.s, z18.s, z3.s\n"
- "smin z31.s, p4/M, z31.s, z4.s\n"
- "smin z17.s, p4/M, z17.s, z4.s\n"
- ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- ".inst 0x44829032 // srshl z18.s, p4/M, z18.s, z1.s\n"
- "smax z30.s, p4/M, z30.s, z6.s\n"
- "trn1 z17.h, z31.h, z17.h\n"
- "add z25.s, z25.s, z0.s\n"
- "add z18.s, z18.s, z0.s\n"
- ".inst 0x44829058 // srshl z24.s, p4/M, z24.s, z2.s\n"
- ".inst 0x44829057 // srshl z23.s, p4/M, z23.s, z2.s\n"
- "smin z30.s, p4/M, z30.s, z4.s\n"
- "smax z16.s, p4/M, z16.s, z6.s\n"
- ".inst 0x04a37718 // sqrdmulh z24.s, z24.s, z3.s\n"
- ".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
- "smax z29.s, p4/M, z29.s, z6.s\n"
- "smin z16.s, p4/M, z16.s, z4.s\n"
- ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
- ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
- "smin z29.s, p4/M, z29.s, z4.s\n"
- "trn1 z16.h, z30.h, z16.h\n"
- "add z24.s, z24.s, z0.s\n"
- "add z23.s, z23.s, z0.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a911 // ushllb z17.h, z8.b, #0x0\n"
+ ".inst 0x4508ad18 // ushllt z24.h, z8.b, #0x0\n"
+ ".inst 0x4508a8f7 // ushllb z23.h, z7.b, #0x0\n"
+ ".inst 0x4508acf6 // ushllt z22.h, z7.b, #0x0\n"
+ "neg z3.s, p0/M, z3.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ ".inst 0x4508a8d5 // ushllb z21.h, z6.b, #0x0\n"
+ ".inst 0x4508acd4 // ushllt z20.h, z6.b, #0x0\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ ".inst 0x4508a8b3 // ushllb z19.h, z5.b, #0x0\n"
+ ".inst 0x4508acb0 // ushllt z16.h, z5.b, #0x0\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ ".inst 0x45914061 // saddwb z1.s, z3.s, z17.h\n"
+ ".inst 0x45914471 // saddwt z17.s, z3.s, z17.h\n"
+ ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
+ ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
+ ".inst 0x45984060 // saddwb z0.s, z3.s, z24.h\n"
+ ".inst 0x4598447f // saddwt z31.s, z3.s, z24.h\n"
+ ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
+ ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
+ ".inst 0x4597407e // saddwb z30.s, z3.s, z23.h\n"
+ ".inst 0x4597447d // saddwt z29.s, z3.s, z23.h\n"
+ ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
+ ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
+ ".inst 0x4596407c // saddwb z28.s, z3.s, z22.h\n"
+ ".inst 0x4596447b // saddwt z27.s, z3.s, z22.h\n"
+ ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
+ ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
+ ".inst 0x4595407a // saddwb z26.s, z3.s, z21.h\n"
+ ".inst 0x45954479 // saddwt z25.s, z3.s, z21.h\n"
+ ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
+ ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
+ ".inst 0x45944078 // saddwb z24.s, z3.s, z20.h\n"
+ ".inst 0x45944477 // saddwt z23.s, z3.s, z20.h\n"
+ ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
+ ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
+ ".inst 0x45934076 // saddwb z22.s, z3.s, z19.h\n"
+ ".inst 0x45934475 // saddwt z21.s, z3.s, z19.h\n"
+ ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
+ ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
+ ".inst 0x45904074 // saddwb z20.s, z3.s, z16.h\n"
+ ".inst 0x45904473 // saddwt z19.s, z3.s, z16.h\n"
+ ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
+ ".inst 0x44828053 // srshl z19.s, p0/M, z19.s, z2.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
+ ".inst 0x04b27631 // sqrdmulh z17.s, z17.s, z18.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
+ ".inst 0x04b277ff // sqrdmulh z31.s, z31.s, z18.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828211 // srshl z17.s, p0/M, z17.s, z16.s\n"
+ ".inst 0x04b277de // sqrdmulh z30.s, z30.s, z18.s\n"
+ ".inst 0x04b277bd // sqrdmulh z29.s, z29.s, z18.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ ".inst 0x4482821f // srshl z31.s, p0/M, z31.s, z16.s\n"
+ ".inst 0x04b2779c // sqrdmulh z28.s, z28.s, z18.s\n"
+ ".inst 0x04b2777b // sqrdmulh z27.s, z27.s, z18.s\n"
+ ".inst 0x4482821e // srshl z30.s, p0/M, z30.s, z16.s\n"
+ ".inst 0x4482821d // srshl z29.s, p0/M, z29.s, z16.s\n"
+ ".inst 0x04b2775a // sqrdmulh z26.s, z26.s, z18.s\n"
+ ".inst 0x04b27739 // sqrdmulh z25.s, z25.s, z18.s\n"
+ ".inst 0x4482821c // srshl z28.s, p0/M, z28.s, z16.s\n"
+ ".inst 0x4482821b // srshl z27.s, p0/M, z27.s, z16.s\n"
+ ".inst 0x04b27718 // sqrdmulh z24.s, z24.s, z18.s\n"
+ ".inst 0x04b276f7 // sqrdmulh z23.s, z23.s, z18.s\n"
+ ".inst 0x4482821a // srshl z26.s, p0/M, z26.s, z16.s\n"
+ ".inst 0x44828219 // srshl z25.s, p0/M, z25.s, z16.s\n"
+ ".inst 0x04b276d6 // sqrdmulh z22.s, z22.s, z18.s\n"
+ ".inst 0x04b276b5 // sqrdmulh z21.s, z21.s, z18.s\n"
+ ".inst 0x44828218 // srshl z24.s, p0/M, z24.s, z16.s\n"
+ ".inst 0x44828217 // srshl z23.s, p0/M, z23.s, z16.s\n"
+ ".inst 0x04b27694 // sqrdmulh z20.s, z20.s, z18.s\n"
+ ".inst 0x04b27673 // sqrdmulh z19.s, z19.s, z18.s\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ "add z1.s, z1.s, z16.s\n"
+ "add z17.s, z17.s, z16.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "add z29.s, z29.s, z16.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "add z25.s, z25.s, z16.s\n"
+ "add z24.s, z24.s, z16.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z19.s, z19.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smax z31.s, p0/M, z31.s, z16.s\n"
+ "mov z18.s, #0xff\n"
+ "smax z30.s, p0/M, z30.s, z16.s\n"
+ "smax z29.s, p0/M, z29.s, z16.s\n"
+ "smax z28.s, p0/M, z28.s, z16.s\n"
+ "smax z27.s, p0/M, z27.s, z16.s\n"
+ "smax z26.s, p0/M, z26.s, z16.s\n"
+ "smax z25.s, p0/M, z25.s, z16.s\n"
+ "smax z24.s, p0/M, z24.s, z16.s\n"
+ "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
+ "smin z17.s, p0/M, z17.s, z18.s\n"
+ "trn1 z17.h, z1.h, z17.h\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
+ "smin z31.s, p0/M, z31.s, z18.s\n"
+ "trn1 z16.h, z0.h, z31.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x28]\n"
- ".inst 0x44829056 // srshl z22.s, p4/M, z22.s, z2.s\n"
- "incb x28, ALL, MUL #4\n"
- ".inst 0x44829055 // srshl z21.s, p4/M, z21.s, z2.s\n"
- ".inst 0x44829054 // srshl z20.s, p4/M, z20.s, z2.s\n"
- ".inst 0x44829053 // srshl z19.s, p4/M, z19.s, z2.s\n"
- "smax z28.s, p4/M, z28.s, z6.s\n"
- ".inst 0x04a376d6 // sqrdmulh z22.s, z22.s, z3.s\n"
- ".inst 0x04a376b5 // sqrdmulh z21.s, z21.s, z3.s\n"
- ".inst 0x04a37694 // sqrdmulh z20.s, z20.s, z3.s\n"
- ".inst 0x04a37673 // sqrdmulh z19.s, z19.s, z3.s\n"
- ".inst 0x44829036 // srshl z22.s, p4/M, z22.s, z1.s\n"
- ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
- ".inst 0x44829034 // srshl z20.s, p4/M, z20.s, z1.s\n"
- ".inst 0x44829033 // srshl z19.s, p4/M, z19.s, z1.s\n"
- "add z22.s, z22.s, z0.s\n"
- "add z21.s, z21.s, z0.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z19.s, z19.s, z0.s\n"
- "smax z27.s, p4/M, z27.s, z6.s\n"
- "smax z26.s, p4/M, z26.s, z6.s\n"
- "smax z25.s, p4/M, z25.s, z6.s\n"
- "smin z28.s, p4/M, z28.s, z4.s\n"
- "smin z27.s, p4/M, z27.s, z4.s\n"
- "smin z26.s, p4/M, z26.s, z4.s\n"
- "smin z25.s, p4/M, z25.s, z4.s\n"
- "trn1 z17.h, z29.h, z28.h\n"
- "smax z18.s, p4/M, z18.s, z6.s\n"
- "trn1 z16.h, z27.h, z26.h\n"
- "smax z24.s, p4/M, z24.s, z6.s\n"
+ "smin z30.s, p0/M, z30.s, z18.s\n"
+ "smin z29.s, p0/M, z29.s, z18.s\n"
+ "trn1 z17.h, z30.h, z29.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "smin z28.s, p0/M, z28.s, z18.s\n"
+ "smin z27.s, p0/M, z27.s, z18.s\n"
+ "trn1 z16.h, z28.h, z27.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x27]\n"
- "smin z18.s, p4/M, z18.s, z4.s\n"
- "incb x27, ALL, MUL #4\n"
- "smin z24.s, p4/M, z24.s, z4.s\n"
- "smax z23.s, p4/M, z23.s, z6.s\n"
- "smax z22.s, p4/M, z22.s, z6.s\n"
- "smax z21.s, p4/M, z21.s, z6.s\n"
- "trn1 z18.h, z25.h, z18.h\n"
- "smin z23.s, p4/M, z23.s, z4.s\n"
- "smin z22.s, p4/M, z22.s, z4.s\n"
- "smin z21.s, p4/M, z21.s, z4.s\n"
- "smax z20.s, p4/M, z20.s, z6.s\n"
+ "smin z26.s, p0/M, z26.s, z18.s\n"
+ "smin z25.s, p0/M, z25.s, z18.s\n"
+ "trn1 z17.h, z26.h, z25.h\n"
+ "st1b { z16.b }, p3, [%x[outptr], x28]\n"
+ "smin z24.s, p0/M, z24.s, z18.s\n"
+ "smin z23.s, p0/M, z23.s, z18.s\n"
"trn1 z16.h, z24.h, z23.h\n"
- "smax z19.s, p4/M, z19.s, z6.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
"trn1 z17.h, z22.h, z21.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x26]\n"
- "smin z20.s, p4/M, z20.s, z4.s\n"
- "incb x26, ALL, MUL #4\n"
- "smin z19.s, p4/M, z19.s, z4.s\n"
+ "st1b { z16.b }, p2, [%x[outptr], x27]\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
"trn1 z16.h, z20.h, z19.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x25]\n"
- "incb x25, ALL, MUL #4\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "incb x9, ALL, MUL #4\n"
+ "incb x28, ALL, MUL #4\n"
+ "incb x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z10.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "umax z10.b, p4/M, z10.b, z19.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "umax z10.b, p4/M, z10.b, z19.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z10.b, p4/M, z10.b, z3.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z6.s, #0x0\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z5.s }, p4/Z, [x19]\n"
- "mov z4.s, #0xff\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- ".inst 0x4508a951 // ushllb z17.h, z10.b, #0x0\n"
- "ld1rw { z3.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508ad50 // ushllt z16.h, z10.b, #0x0\n"
- "ld1rw { z2.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "neg z5.s, p4/M, z5.s\n"
- "ld1rw { z1.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x459140bf // saddwb z31.s, z5.s, z17.h\n"
- "ld1rw { z0.s }, p4/Z, [x19]\n"
- ".inst 0x459144b1 // saddwt z17.s, z5.s, z17.h\n"
- ".inst 0x459040be // saddwb z30.s, z5.s, z16.h\n"
- ".inst 0x459044b0 // saddwt z16.s, z5.s, z16.h\n"
- ".inst 0x4482905f // srshl z31.s, p4/M, z31.s, z2.s\n"
- ".inst 0x44829051 // srshl z17.s, p4/M, z17.s, z2.s\n"
- ".inst 0x4482905e // srshl z30.s, p4/M, z30.s, z2.s\n"
- ".inst 0x44829050 // srshl z16.s, p4/M, z16.s, z2.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
- ".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
- ".inst 0x04a37610 // sqrdmulh z16.s, z16.s, z3.s\n"
- ".inst 0x4482903f // srshl z31.s, p4/M, z31.s, z1.s\n"
- ".inst 0x44829031 // srshl z17.s, p4/M, z17.s, z1.s\n"
- ".inst 0x4482903e // srshl z30.s, p4/M, z30.s, z1.s\n"
- ".inst 0x44829030 // srshl z16.s, p4/M, z16.s, z1.s\n"
- "add z31.s, z31.s, z0.s\n"
- "add z17.s, z17.s, z0.s\n"
- "add z30.s, z30.s, z0.s\n"
- "add z16.s, z16.s, z0.s\n"
- "smax z31.s, p4/M, z31.s, z6.s\n"
- "smax z17.s, p4/M, z17.s, z6.s\n"
- "smax z30.s, p4/M, z30.s, z6.s\n"
- "smax z16.s, p4/M, z16.s, z6.s\n"
- "smin z31.s, p4/M, z31.s, z4.s\n"
- "smin z17.s, p4/M, z17.s, z4.s\n"
- "smin z30.s, p4/M, z30.s, z4.s\n"
- "smin z16.s, p4/M, z16.s, z4.s\n"
- "trn1 z17.h, z31.h, z17.h\n"
- "trn1 z16.h, z30.h, z16.h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a911 // ushllb z17.h, z8.b, #0x0\n"
+ ".inst 0x4508ad10 // ushllt z16.h, z8.b, #0x0\n"
+ "neg z18.s, p0/M, z18.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ ".inst 0x45914255 // saddwb z21.s, z18.s, z17.h\n"
+ ".inst 0x45914654 // saddwt z20.s, z18.s, z17.h\n"
+ ".inst 0x45904253 // saddwb z19.s, z18.s, z16.h\n"
+ ".inst 0x45904652 // saddwt z18.s, z18.s, z16.h\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
+ ".inst 0x04b076b5 // sqrdmulh z21.s, z21.s, z16.s\n"
+ ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ "add z21.s, z21.s, z16.s\n"
+ ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z19.s, z19.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smax z18.s, p0/M, z18.s, z16.s\n"
+ "mov z16.s, #0xff\n"
+ "smin z21.s, p0/M, z21.s, z16.s\n"
+ "smin z20.s, p0/M, z20.s, z16.s\n"
+ "trn1 z17.h, z21.h, z20.h\n"
+ "smin z19.s, p0/M, z19.s, z16.s\n"
+ "smin z18.s, p0/M, z18.s, z16.s\n"
+ "trn1 z16.h, z19.h, z18.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x28]\n"
- "incb x28\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_input_offset] "I" (offsetof(Requantize32, input_offset)), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
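The max-pooling kernel above shares the requantisation tail sketched after the previous kernel, but its front end differs: the window maximum is taken in u8, then re-centred by the input offset (the assembly negates qp.input_offset once and widens with saddwb/saddwt) before the per-layer left shift, multiplier, and right shift are applied. A scalar model of that front end, with illustrative names:

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Per-channel model of the max-pooling front end. The result then goes
// through the same srshl / sqrdmulh / srshl / add-output-offset / clamp
// sequence sketched after the previous kernel.
int32_t max_pool_front_end(const uint8_t *const *inptrs, std::size_t n_valid_cells,
                           std::size_t channel, int32_t input_offset)
{
    uint8_t m = 0; // the kernel seeds its accumulators with 0, the u8 minimum
    for (std::size_t i = 0; i < n_valid_cells; i++)
    {
        m = std::max(m, inptrs[i][channel]);
    }
    // Per lane, the negate-then-widening-add is simply max - input_offset.
    return static_cast<int32_t>(m) - input_offset;
}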
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
index ad95207fb3..1ca478513c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,288 +24,262 @@
#pragma once
-#include "pool_common.hpp"
+#include "depthfirst_driver.hpp"
+#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
#include "utils.hpp"
-
-#include "arm_compute/core/Types.h"
+#if !defined(_WIN64) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
#include <limits>
namespace arm_conv {
namespace pooling {
-template <class strategy>
-class PoolingDepthfirst : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
+template <typename TInput, typename TOutput>
+class DepthfirstStrategy : public IDepthfirstStrategy
{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
-
- const PoolingArgs m_args; // Copy of arguments
+ unsigned int input_rows, input_cols, output_rows, output_cols;
- constexpr static unsigned int input_rows(void)
+ public:
+ DepthfirstStrategy(unsigned int window_rows, unsigned int window_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ unsigned int output_rows, unsigned int output_cols)
+ : input_rows(output_rows + (window_rows - 1) * stride_rows),
+ input_cols(output_cols + (window_cols - 1) * stride_cols),
+ output_rows(output_rows), output_cols(output_cols)
{
- return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
}
- constexpr static unsigned int input_cols(void)
- {
- return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
- }
+ unsigned int get_input_rows() const override { return input_rows; }
+ unsigned int get_input_cols() const override { return input_cols; }
+ unsigned int get_output_rows() const override { return output_rows; }
+ unsigned int get_output_cols() const override { return output_cols; }
+
+ typedef void (*KernelType)(
+ unsigned int n_channels,
+ const TInput *const *,
+ TOutput *const *,
+ bool exclude_padding,
+ unsigned int pad_left,
+ unsigned int pad_top,
+ unsigned int pad_right,
+ unsigned int pad_bottom
+ );
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+struct WorkingSpace
+{
+ void *input_buffer;
+ void *output_buffer;
+};
+
+template <typename TInput, typename TOutput=TInput, class OutputStage=Nothing>
+class PoolingDepthfirst : public DepthfirstDriver<TInput, TOutput>
+{
size_t sizeof_input_buffer(void) const
{
- return sizeof(TInput) * m_args.n_channels;
+ return sizeof(TInput) * this->m_args.n_channels;
}
size_t sizeof_output_buffer(void) const
{
- return sizeof(TOutput) * m_args.n_channels;
+ return sizeof(TOutput) * this->m_args.n_channels;
}
- public:
- PoolingDepthfirst(const PoolingArgs &args) : m_args(args)
+ protected:
+ /* Compute the amount of working space required for a single thread. */
+ size_t get_working_size_per_thread() const override
{
+ return sizeof(WorkingSpace) + this->m_args.n_channels * (sizeof(TInput) + sizeof(TOutput));
}
- PoolingDepthfirst(PoolingDepthfirst &) = delete;
- PoolingDepthfirst &operator=(PoolingDepthfirst &) = delete;
-
- size_t get_working_size(unsigned int num_threads) const override
+ /* Initialise the working space for a thread. */
+ void initialise_working_space(void *raw_ws) const override
{
- // We require a channel-length vector of input padding values
- // (to be shared amongst all threads) and (for each thread) a
- // channel-length vector in which to dump surplus output.
- return sizeof_input_buffer() + num_threads * sizeof_output_buffer();
+ auto ws = reinterpret_cast<WorkingSpace *>(raw_ws);
+ ws->input_buffer = ws + 1;
+ ws->output_buffer = reinterpret_cast<char *>(ws + 1) + sizeof(TInput) * this->m_args.n_channels;
+
+ // Fill the input buffer with an appropriate value
+ TInput fill_val = 0;
+ if (this->m_args.pool_type == PoolingType::MAX)
+ {
+ using limits = std::numeric_limits<TInput>;
+ if (limits::has_infinity)
+ {
+ fill_val = -limits::infinity();
+ }
+ else
+ {
+ fill_val = limits::min();
+ }
+ }
+
+ auto ptr = reinterpret_cast<TInput *>(ws->input_buffer);
+ auto n_channels = this->m_args.n_channels;
+ for (; n_channels; n_channels--)
+ {
+ *(ptr++) = fill_val;
+ }
}
- void execute(
- const void *const input,
- void *const output,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
+ /* Compute a portion of the output tensor with padding. */
+ void compute_tile_padded(
+ unsigned int output_i, unsigned int output_j,
+ unsigned int channel_start, unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
) const override
{
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
+ const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>(
+ this->m_strat.get())->get_kernel();
+
+ // Get the working space, and some space on the stack for pointer arrays
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space);
+ auto inptr_array = reinterpret_cast<const TInput **>(alloca(
+ sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
+ auto outptr_array = reinterpret_cast<TOutput **>(alloca(
+ sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
+
+ // Prepare the input pointers
+ const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const unsigned int end_ii = ii + this->m_strat->get_input_rows();
+ const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
+
+ const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ const unsigned int end_ij = ij + this->m_strat->get_input_cols();
+ const auto input_pad_right = end_ij < this->m_args.input_cols ? 0 : end_ij - this->m_args.input_cols;
+
+ fill_pointer_array<const TInput>(
+ inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
+ input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
+ input.ld_row, input.ld_col,
+ reinterpret_cast<const TInput *>(ws->input_buffer),
+ input_pad_top, this->m_args.input_rows - input_i,
+ input_pad_left, this->m_args.input_cols - input_j
);
- }
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
+ // Prepare the output pointers
+ fill_pointer_array(
+ outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
+ output.ld_row, output.ld_col,
+ reinterpret_cast<TOutput *>(ws->output_buffer),
+ 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
+ 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Call the kernel
+ kern(
+ channel_end - channel_start, inptr_array, outptr_array,
+ this->m_args.exclude_padding,
+ input_pad_left, input_pad_top,
+ input_pad_right, input_pad_bottom
);
}
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space,
- unsigned int thread_id,
- unsigned int num_threads
+ // Compute a portion of the work with only top/bottom padding.
+ void compute_row_padded_tile_row(
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int channel_start, const unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
) const override
{
- ARM_COMPUTE_UNUSED(batches, ld_input_batch, ld_output_batch);
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input);
- TOutput *const outptr = static_cast<TOutput *>(_output);
-
- const unsigned int roundup_output_rows = roundup(output_height, num_threads);
- const unsigned int rows_per_thread = roundup_output_rows / num_threads;
- const int start_out_height = static_cast<int>(thread_id * rows_per_thread);
- const int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
-
- // Create an array for the input pointers
- const TInput * _inptr_array[input_rows() * input_cols()];
- const TInput **const inptr_array = _inptr_array;
-
- // Create an array for the output pointers
- TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
- TOutput **const outptr_array = _outptr_array;
-
- // Allocate portions of the working space
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space + thread_id * sizeof_output_buffer());
- TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + num_threads * sizeof_output_buffer());
-
- // Initialise the input buffer
- for (unsigned int c = 0; c < channels; c++)
- {
- TInput &val = input_buffer[c];
+ const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>(
+ this->m_strat.get())->get_kernel();
+
+ // Get the working space, and some space on the stack for pointer arrays
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space);
+ auto inptr_array = reinterpret_cast<const TInput **>(alloca(
+ sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
+ auto outptr_array = reinterpret_cast<TOutput **>(alloca(
+ sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
+
+ // Prepare the initial input pointers
+ const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const unsigned int end_ii = ii + this->m_strat->get_input_rows();
+ const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
+
+ const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ const auto end_oi = output_i + this->m_strat->get_output_cols();
+ const auto output_pad_bottom = end_oi < this->m_args.output_rows ? 0 : end_oi - this->m_args.output_rows;
+
+ fill_pointer_array<const TInput>(
+ inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
+ input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
+ input.ld_row, input.ld_col,
+ reinterpret_cast<const TInput *>(ws->input_buffer),
+ input_pad_top, this->m_args.input_rows - input_i,
+ 0, this->m_args.input_cols - input_j
+ );
- if (strategy::pooling_type() == PoolingType::AVERAGE)
- {
- val = static_cast<TInput>(0);
- }
- else if (strategy::pooling_type() == PoolingType::MAX)
- {
-#if defined(__aarch64__)
- using InputType = typename std::conditional<std::is_same<TInput, __fp16>::value, arm_compute::half, TInput>::type;
- using limits = std::numeric_limits<InputType>;
-#else // defined(__aarch64__)
- using limits = std::numeric_limits<TInput>;
-#endif // defined(__aarch64__)
- if (limits::has_infinity)
- {
- val = -limits::infinity();
- }
- else
- {
- val = limits::min();
- }
- }
- }
+ // Prepare the initial output pointers
+ fill_pointer_array(
+ outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
+ output.ld_row, output.ld_col,
+ reinterpret_cast<TOutput *>(ws->output_buffer),
+ 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
+ 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ );
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
+ // Call the kernel
+ for (; n_tile_cols; n_tile_cols--)
{
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- const auto outptr_batch = outptr + batch * ld_output_batch;
+ kern(
+ channel_end - channel_start, inptr_array, outptr_array,
+ this->m_args.exclude_padding,
+ 0, input_pad_top,
+ 0, input_pad_bottom
+ );
+
+ // Progress the input and output pointer arrays
+ const auto input_col_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.pool_stride.cols;
+ for (
+ auto n = input_pad_top * this->m_strat->get_input_cols();
+ n < (this->m_strat->get_input_rows() - input_pad_bottom) * this->m_strat->get_input_cols();
+ n++
+ )
+ {
+ inptr_array[n] += input_col_stride;
+ }
- for (int start_out_i = start_out_height;
- start_out_i < end_out_height;
- start_out_i += static_cast<int>(strategy::out_rows()))
+ const auto output_col_stride = output.ld_col * this->m_strat->get_output_cols();
+ for (
+ auto n = 0u;
+ n < (this->m_strat->get_output_rows() - output_pad_bottom) * this->m_strat->get_output_cols();
+ n++
+ )
{
- const int end_out_i = start_out_i + strategy::out_rows();
- const int start_in_i = start_out_i * strategy::stride_rows() - padding.top;
- const int end_in_i = start_in_i + input_rows();
-
- // Compute top/bottom padding - TODO Is this right for average pooling?
- const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(height) - end_in_i, 0));
- const unsigned int valid_output_rows = std::min(
- end_out_i - start_out_i,
- static_cast<int>(end_out_height) - start_out_i
- );
-
- // Fill the input pointer array with padding values
- for (auto index = 0u; index < input_rows() * input_cols(); index++)
- {
- inptr_array[index] = input_buffer;
- }
-
- for (int start_out_j = 0, start_in_j = -padding.left;
- start_out_j < static_cast<int>(output_width);
- start_out_j += static_cast<int>(strategy::out_cols()),
- start_in_j += static_cast<int>(strategy::out_cols()) * strategy::stride_cols())
- {
- const int end_out_j = start_out_j + strategy::out_cols();
- const int end_in_j = start_in_j + input_cols();
-
- // Compute left/right padding - TODO Is this right for average pooling?
- const auto pad_left = static_cast<unsigned int>(-std::min(start_in_j, 0));
- const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(width) - end_in_j, 0));
-
- const unsigned int valid_output_cols = std::min(
- end_out_j - start_out_j,
- static_cast<int>(output_width) - start_out_j
- );
-
- // Construct the input pointer array - fill the array with pointers to
- // the input buffer and then fill in the required values.
- for (auto i = pad_top; i < input_rows() - pad_bottom; i++)
- {
- // Can skip over the left padding because we will have either the
- // same or less than the previous tile.
- unsigned int j = pad_left;
- const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
- const TInput **ptrs = inptr_array + i * input_cols() + j;
- for (; j < input_cols() - pad_right; j++)
- {
- *(ptrs++) = colptr;
- colptr += ld_input_col;
- }
- for (; j < input_cols(); j++)
- {
- *(ptrs++) = input_buffer;
- }
- }
-
- // Construct the output pointer array.
- TOutput **outptr_pos = outptr_array;
- for (auto i = 0u; i < valid_output_rows; i++)
- {
- unsigned int j = 0u;
- TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
- for (; j < valid_output_cols; j++)
- {
- *(outptr_pos++) = colptr;
- colptr += ld_output_col;
- }
- for (; j < strategy::out_cols(); j++)
- {
- *(outptr_pos++) = output_buffer;
- }
- }
- for (auto i = valid_output_rows; i < strategy::out_rows(); i++)
- {
- for (auto j = 0u; j < strategy::out_cols(); j++)
- {
- *(outptr_pos++) = output_buffer;
- }
- }
-
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::out_rows() * strategy::out_cols() * strategy::pool_rows() * strategy::pool_cols()));
-#endif
- strat.kernel(
- channels, inptr_array, outptr_array,
- m_args.exclude_padding, pad_left, pad_top, pad_right, pad_bottom
- );
- }
+ outptr_array[n] += output_col_stride;
}
}
}
+
+ public:
+ PoolingDepthfirst(const DepthfirstStrategy<TInput, TOutput> *strat,
+ const PoolingArgs &args, const OutputStage &os = {})
+ : DepthfirstDriver<TInput, TOutput>(strat, args)
+ {
+ ARM_COMPUTE_UNUSED(os);
+ }
};
} // namespace pooling
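The rewritten driver above delegates pointer-array construction to fill_pointer_array from addressing.hpp rather than building the arrays inline. Its contract, as inferred from the call sites above, is modelled below; this is a hypothetical sketch for orientation, not the implementation in addressing.hpp.

#include <cstddef>

// Hypothetical model of fill_pointer_array: each of the rows x cols entries
// points into the tensor when its coordinate is valid, and at pad_buffer
// otherwise, so kernels read padding values without per-element branching.
template <typename T>
void fill_pointer_array_model(
    T **array, unsigned int rows, unsigned int cols,
    T *base, std::size_t ld_row, std::size_t ld_col, T *pad_buffer,
    unsigned int pad_top, unsigned int valid_rows,
    unsigned int pad_left, unsigned int valid_cols)
{
    for (unsigned int i = 0; i < rows; i++)
    {
        for (unsigned int j = 0; j < cols; j++)
        {
            const bool valid = i >= pad_top && (i - pad_top) < valid_rows
                            && j >= pad_left && (j - pad_left) < valid_cols;
            array[i * cols + j] = valid
                ? base + (i - pad_top) * ld_row + (j - pad_left) * ld_col
                : pad_buffer;
        }
    }
}

This also explains the padding buffers initialised in initialise_working_space() above: out-of-range input entries alias a vector of fill values, and surplus output entries alias a scratch buffer whose contents are simply discarded.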
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp
deleted file mode 100644
index 4aabd957cd..0000000000
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "pool_common.hpp"
-
-#include <stack>
-#include <vector>
-
-namespace arm_conv {
-namespace pooling {
-
-template <class strategy>
-class PoolingDepthfirstCacheOblivious : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
-{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
-
- const PoolingArgs m_args; // Copy of arguments
-
- constexpr static unsigned int input_rows(void)
- {
- return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
- }
-
- constexpr static unsigned int input_cols(void)
- {
- return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
- }
-
- size_t sizeof_input_buffer(void) const
- {
- return sizeof(TInput) * m_args.n_channels;
- }
-
- size_t sizeof_output_buffer(void) const
- {
- return sizeof(TOutput) * m_args.n_channels;
- }
-
- public:
- PoolingDepthfirstCacheOblivious(const PoolingArgs &args) : m_args(args)
- {
- }
-
- PoolingDepthfirstCacheOblivious(PoolingDepthfirstCacheOblivious &) = delete;
- PoolingDepthfirstCacheOblivious &operator=(PoolingDepthfirstCacheOblivious &) = delete;
-
- size_t get_working_size(void) const override
- {
-    // We require a channel-length vector in which to dump surplus output and
-    // a channel-length vector of padding values; the input and output pointer
-    // arrays are built on the stack at execution time.
- return sizeof_input_buffer() + sizeof_output_buffer();
- }
-
- void execute(
- const void *const input,
- void *const output,
- void *const working_space
- ) const override
- {
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space
- );
- }
-
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space
- ) const override
- {
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space
- );
- }
-
- void execute(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space
- ) const override
- {
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input);
- TOutput *const outptr = static_cast<TOutput *>(_output);
-
- // Allocate portions of the working space
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
- TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + sizeof_output_buffer());
-
- // Fill the input buffer
- const TInput pad_value = (m_args.pool_type == PoolingType::AVERAGE)
- ? static_cast<TInput>(0)
- : (std::numeric_limits<TInput>::has_infinity
- ? -std::numeric_limits<TInput>::infinity()
- : std::numeric_limits<TInput>::lowest());
- for (unsigned int i = 0; i < channels; i++)
- {
- input_buffer[i] = pad_value;
- }
-
- // Keep subdividing the output plane across the longest dimension until we
-    // reach the size of the tile. Queue items for later processing. Note: we
-    // can determine the largest size of the queue a priori from the input
-    // tensor size; this would allow us to allocate memory within the working
- // space and improve performance.
- struct WorkItem
- {
- unsigned int output_i, output_j;
- unsigned int output_height, output_width;
-
- WorkItem(unsigned int i, unsigned int j, unsigned int height, unsigned int width)
- : output_i(i), output_j(j), output_height(height), output_width(width) {}
- };
-
- auto execute = [&] (const WorkItem &item) {
- // Create an array for the output pointers
- TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
- TOutput **const outptr_array = _outptr_array;
-
- // Construct the output pointer array
- {
-      const auto output_pad_right = strategy::out_cols() - item.output_width;
- auto outptr_element = outptr_array;
- auto outptr_row = outptr + item.output_i * ld_output_row + item.output_j * ld_output_col;
-
- // Fill the array with pointers to the output buffer
- for (unsigned int i = 0; i < strategy::out_rows() * strategy::out_cols(); i++)
- {
- outptr_array[i] = output_buffer;
- }
-
- // Fill in the valid portion of the array
- for (unsigned int i = 0; i < item.output_height; i++)
- {
- auto outptr_col = outptr_row;
- for (unsigned int j = 0; j < item.output_width; j++)
- {
- *(outptr_element++) = outptr_col;
- outptr_col += ld_output_col;
- }
- outptr_element += output_pad_right;
- outptr_row += ld_output_row;
- }
- }
-
- const int start_i = item.output_i * strategy::stride_rows() - padding.top;
- const int end_i = start_i + input_rows();
- const unsigned int pad_top = std::max(0, 0 - start_i);
- const unsigned int pad_bottom = std::max(0, end_i - static_cast<int>(input_height));
-
- const int start_j = item.output_j * strategy::stride_cols() - padding.left;
- const int end_j = start_j + input_cols();
- const unsigned int pad_left = std::max(0, 0 - start_j);
- const unsigned int pad_right = std::max(0, end_j - static_cast<int>(input_width));
-
- // Create an array for the input pointers
- const TInput * _inptr_array[input_rows() * input_cols()];
- const TInput **const inptr_array = _inptr_array;
- {
- const unsigned int row_padding = pad_top + pad_bottom;
- const unsigned int valid_rows = input_rows() - row_padding;
-
- const unsigned int col_padding = pad_left + pad_right;
- const unsigned int valid_cols = input_cols() - col_padding;
-
- // Fill the array with pointers to the input buffer
- for (unsigned int i = 0; i < input_rows() * input_cols(); i++)
- {
- inptr_array[i] = input_buffer;
- }
-
- // Compute valid initial pointer
- auto inptr_row = inptr + std::max(start_i, 0) * ld_input_row + std::max(start_j, 0) * ld_input_col;
-
- // Fill in the valid portion of the input array
- auto inptr_element = inptr_array + pad_top * input_cols() + pad_left;
- for (unsigned int i = 0; i < valid_rows; i++)
- {
- auto inptr_col = inptr_row;
- for (unsigned int j = 0; j < valid_cols; j++)
- {
- *(inptr_element++) = inptr_col;
- inptr_col += ld_input_col;
- }
-
- inptr_row += ld_input_row;
- inptr_element += col_padding; // Skip the padding elements
- }
- }
-
- // Call the kernel
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(item.output_height * item.output_width * strategy::pool_rows() * strategy::pool_cols()));
-#endif // CYCLE_PROFILING
- strat.kernel(channels, inptr_array, outptr_array,
- pad_left, pad_top, pad_right, pad_bottom);
- };
-
- // Add the initial work item to the stack of work.
- std::stack<WorkItem, std::vector<WorkItem>> stack;
- stack.push(WorkItem(0, 0, output_height, output_width));
- while (!stack.empty())
- {
- // Pop an item from the stack, bisect the largest dimension and either
- // execute the resulting tiles or add them to the stack if they are too
- // large.
- const WorkItem item(stack.top());
- stack.pop();
-
- if (item.output_height <= strategy::out_rows() &&
- item.output_width <= strategy::out_cols())
- {
- execute(item);
- }
- else
- {
- // Split the largest dimension, such that we get an exact number of
- // tiles in the first partition.
- if (item.output_height >= item.output_width)
- {
- const unsigned int height_in_tiles = (item.output_height + strategy::out_rows() - 1) / strategy::out_rows();
- const unsigned int tiles_first = height_in_tiles - height_in_tiles / 2;
-
- const unsigned int height_first = tiles_first * strategy::out_rows();
- const unsigned int height_second = item.output_height - height_first;
-
- stack.push(WorkItem(item.output_i + height_first, item.output_j, height_second, item.output_width));
- stack.push(WorkItem(item.output_i, item.output_j, height_first, item.output_width));
- }
- else
- {
- const unsigned int width_in_tiles = item.output_width / strategy::out_cols();
- const unsigned int tiles_first = width_in_tiles - width_in_tiles / 2;
-
- const unsigned int width_first = tiles_first * strategy::out_cols();
- const unsigned int width_second = item.output_width - width_first;
-
- stack.push(WorkItem(item.output_i, item.output_j + width_first, item.output_height, width_second));
- stack.push(WorkItem(item.output_i, item.output_j, item.output_height, width_first));
- }
- }
- }
- }
-};
-
-} // namespace pooling
-} // namespace arm_conv
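For reference, the file deleted above implemented its cache-oblivious schedule by repeatedly bisecting the output plane along its longer axis until each piece fitted one strategy tile. The control flow, distilled into a standalone sketch in which TILE_ROWS/TILE_COLS stand in for strategy::out_rows()/out_cols():

```cpp
#include <stack>
#include <vector>

// One rectangle of output still to be processed.
struct WorkItem
{
    unsigned int i, j, height, width;
};

// Bisect the output plane (explicit stack, no recursion) until every piece
// fits in a TILE_ROWS x TILE_COLS strategy tile, then process it.
template <unsigned int TILE_ROWS, unsigned int TILE_COLS, typename F>
void for_each_tile(unsigned int out_height, unsigned int out_width, F &&process)
{
    std::stack<WorkItem, std::vector<WorkItem>> stack;
    stack.push({0, 0, out_height, out_width});

    while (!stack.empty())
    {
        const WorkItem item = stack.top();
        stack.pop();

        if (item.height <= TILE_ROWS && item.width <= TILE_COLS)
        {
            process(item);  // Small enough to execute directly.
        }
        else if (item.height >= item.width)
        {
            // Split rows so the first part holds a whole number of tiles.
            const unsigned int tiles = (item.height + TILE_ROWS - 1) / TILE_ROWS;
            const unsigned int first = (tiles - tiles / 2) * TILE_ROWS;
            stack.push({item.i + first, item.j, item.height - first, item.width});
            stack.push({item.i, item.j, first, item.width});
        }
        else
        {
            // Split columns the same way (mirroring the original's
            // floor division on this axis).
            const unsigned int tiles = item.width / TILE_COLS;
            const unsigned int first = (tiles - tiles / 2) * TILE_COLS;
            stack.push({item.i, item.j + first, item.height, item.width - first});
            stack.push({item.i, item.j, item.height, first});
        }
    }
}
```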
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
index 5979862ed8..ded2c75127 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,236 +24,264 @@
#pragma once
-#include "pool_common.hpp"
+#include "depthfirst_driver.hpp"
#include "utils.hpp"
+#if !defined(_WIN64) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
namespace arm_conv {
namespace pooling {
-template <class strategy>
-class PoolingDepthfirstGeneric : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
+template <typename TInput, typename TOutput, typename OutputStage = Nothing>
+class IGenericDepthfirstStrategy;
+
+template <typename TInput, typename TOutput>
+class IGenericDepthfirstStrategy<TInput, TOutput, Nothing>
{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
+ public:
+ virtual ~IGenericDepthfirstStrategy() = default;
- const PoolingArgs m_args; // Copy of arguments
+ typedef void (*KernelType)(
+ uint64_t window_cells,
+ uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const TInput *const *,
+ TOutput *
+ );
- unsigned int input_rows(void) const
- {
- return m_args.pool_window.rows;
- }
-
- unsigned int input_cols(void) const
- {
- return m_args.pool_window.cols;
- }
+ virtual KernelType get_kernel(void) const = 0;
+};
+template <typename TInput, typename TOutput>
+class IGenericDepthfirstStrategy<TInput, TOutput, Requantize32>
+{
public:
- PoolingDepthfirstGeneric(const PoolingArgs &args) : m_args(args)
- {
- }
+ virtual ~IGenericDepthfirstStrategy() = default;
+
+ typedef void (*KernelType)(
+ uint64_t window_cells,
+ uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const TInput *const *,
+ TOutput *,
+ const Requantize32 &
+ );
+
+ virtual KernelType get_kernel(void) const = 0;
+};
- PoolingDepthfirstGeneric(PoolingDepthfirstGeneric &) = delete;
- PoolingDepthfirstGeneric &operator=(PoolingDepthfirstGeneric &) = delete;
+template <typename TInput, typename TOutput, typename OutputStage>
+struct Invoker;
- size_t sizeof_input_pointer_array(void) const
+template <typename TInput, typename TOutput>
+struct Invoker<TInput, TOutput, Nothing>
+{
+ static inline void invoke(
+ const typename IGenericDepthfirstStrategy<TInput, TOutput, Nothing>::KernelType kern,
+ uint64_t window_cells,
+ uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const TInput *const *inptrs,
+ TOutput *outptr,
+ const Nothing &
+ )
{
- return sizeof(TInput *) * input_rows() * input_cols();
+ kern(window_cells, n_valid_cells, n_channels, inptrs, outptr);
}
+};
- size_t get_working_size(unsigned int num_threads) const override
+template <typename TInput, typename TOutput>
+struct Invoker<TInput, TOutput, Requantize32>
+{
+ static inline void invoke(
+ const typename IGenericDepthfirstStrategy<TInput, TOutput, Requantize32>::KernelType kern,
+ uint64_t window_cells,
+ uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const TInput *const *inptrs,
+ TOutput *outptr,
+ const Requantize32 &qp
+ )
{
- return num_threads * sizeof_input_pointer_array();
+ kern(window_cells, n_valid_cells, n_channels, inptrs, outptr, qp);
}
+};
- void execute(
- const void *const input,
- void *const output,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
+template <typename TInput, typename TOutput, typename OutputStage>
+class GenericDepthfirstWrapper : public IDepthfirstStrategy
+{
+ using StratType = IGenericDepthfirstStrategy<TInput, TOutput, OutputStage>;
+
+ std::unique_ptr<const StratType> m_strat;
+ const unsigned int window_rows, window_cols;
+
+ public:
+ GenericDepthfirstWrapper(const StratType *strat, const PoolingArgs &args)
+ : m_strat(strat), window_rows(args.pool_window.rows), window_cols(args.pool_window.cols)
{
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
- );
}
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
+ unsigned int get_input_rows(void) const override { return window_rows; }
+ unsigned int get_input_cols(void) const override { return window_cols; }
+ unsigned int get_output_rows(void) const override { return 1; }
+ unsigned int get_output_cols(void) const override { return 1; }
+
+ typename StratType::KernelType get_kernel(void) const { return m_strat->get_kernel(); }
+};
+
+template <typename TInput, typename TOutput=TInput, typename OutputStage=Nothing>
+class PoolingDepthfirstGeneric : public DepthfirstDriver<TInput, TOutput>
+{
+ const OutputStage m_os;
+
+ protected:
+ size_t get_working_size_per_thread() const override { return 0; }
+ void initialise_working_space(void *) const override { /* Nothing */ }
+
+ /* Compute a portion of the output tensor with padding. */
+ void compute_tile_padded(
+ unsigned int output_i, unsigned int output_j,
+ unsigned int channel_start, unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *
) const override
{
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
+ // Determine start position and padding
+ const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
+ const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
+ const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
+ const int end_i = start_i + this->m_args.pool_window.rows;
+ const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
+ const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);
+
+ const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
+ const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
+ const auto pad_left = static_cast<unsigned int>(start_j < 0 ? -start_j : 0);
+ const int end_j = start_j + this->m_args.pool_window.cols;
+ const auto pad_right = static_cast<unsigned int>((unsigned int) end_j < this->m_args.input_cols ? 0 : end_j - this->m_args.input_cols);
+ const auto valid_cols = this->m_args.pool_window.cols - (pad_left + pad_right);
+
+ // Determine the number of valid cells and prepare the pointers
+ const auto n_valid_cells = valid_rows * valid_cols;
+ auto inptrs = reinterpret_cast<const TInput **>(alloca(n_valid_cells * sizeof(TInput *)));
+ {
+ auto my_ptr = inptrs;
+ auto row_ptr = input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start;
+ for (auto i = valid_rows; i; i--)
+ {
+ auto ptr = row_ptr;
+ row_ptr += input.ld_row;
+
+ for (auto j = valid_cols; j; j--)
+ {
+ *(my_ptr++) = ptr;
+ ptr += input.ld_col;
+ }
+ }
+ }
+
+ auto outptr = output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start;
+
+      // Some padding variants include (or exclude) the padding values; we
+      // handle this by computing the extent of the padded input tensor and,
+      // from that, the total number of cells captured in the pooling window.
+ const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
+ const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
+ const auto right_padded_width = this->m_args.input_cols + this->m_args.padding.right;
+ const auto captured_cols = std::min<int>(end_j, right_padded_width) - start_j;
+ const auto captured_cells = captured_rows * captured_cols;
+ const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;
+
+ // Execute the kernel
+ Invoker<TInput, TOutput, OutputStage>::invoke(
+ reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(),
+ window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
);
}
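The start/pad/valid arithmetic in compute_tile_padded above is easiest to verify in isolation. A self-contained sketch of the row computation, with plain ints standing in for the PoolingArgs fields; with a 3-row window, stride 1 and one row of top padding, output row 0 yields pad_top = 1 and two valid rows:

```cpp
#include <algorithm>
#include <cstdio>

// Sketch of the per-row padding arithmetic from compute_tile_padded, with
// plain ints standing in for the PoolingArgs fields used above.
void row_extents(int output_i, int stride_rows, int padding_top,
                 int window_rows, int input_rows)
{
    const int start_i    = output_i * stride_rows - padding_top;
    const int input_i    = std::max(start_i, 0);   // First valid input row
    const int pad_top    = std::max(-start_i, 0);  // Rows of implicit padding
    const int end_i      = start_i + window_rows;
    const int pad_bottom = std::max(end_i - input_rows, 0);
    const int valid_rows = window_rows - (pad_top + pad_bottom);

    std::printf("input_i=%d pad_top=%d pad_bottom=%d valid_rows=%d\n",
                input_i, pad_top, pad_bottom, valid_rows);
}

// row_extents(0, 1, 1, 3, 10) prints:
//   input_i=0 pad_top=1 pad_bottom=0 valid_rows=2
```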
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space,
- unsigned int thread_id,
- unsigned int num_threads
+ // Compute a portion of the work with only top/bottom padding.
+ void compute_row_padded_tile_row(
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int channel_start, const unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *
) const override
{
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- const unsigned int roundup_output_rows = roundup(output_height, num_threads);
- const unsigned int rows_per_thread = roundup_output_rows / num_threads;
- int start_out_height = static_cast<int>(thread_id * rows_per_thread);
- int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
-
- unsigned int start_channel = 0;
- unsigned int end_channel = channels;
- if(output_height == 1)
+ // Determine start position and padding
+ const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
+ const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
+ const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
+ const int end_i = start_i + this->m_args.pool_window.rows;
+ const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
+ const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);
+
+ const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
+ const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
+ const auto valid_cols = this->m_args.pool_window.cols;
+
+ // Determine the number of valid cells and prepare the pointers
+ const auto n_valid_cells = valid_rows * valid_cols;
+ auto inptrs = reinterpret_cast<const TInput **>(alloca(n_valid_cells * sizeof(TInput *)));
{
- const unsigned int channels_per_thread = roundup(channels, num_threads) / num_threads;
- start_channel = thread_id * channels_per_thread;
- end_channel = std::min(start_channel + channels_per_thread, channels);
-
- // Reset start and end rows
- start_out_height = 0;
- end_out_height = output_height;
- }
+ auto my_ptr = inptrs;
+ auto row_ptr = input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start;
+ for (auto i = valid_rows; i; i--)
+ {
+ auto ptr = row_ptr;
+ row_ptr += input.ld_row;
- if(start_channel >= end_channel)
- {
- // Early exit in case of multiple threads parallelising on channels
- return;
+ for (auto j = valid_cols; j; j--)
+ {
+ *(my_ptr++) = ptr;
+ ptr += input.ld_col;
+ }
+ }
}
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input) + start_channel;
- TOutput *const outptr = static_cast<TOutput *>(_output) + start_channel;
+ auto outptr = output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start;
- // Grab the input pointer array
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- const TInput **const inptr_array = reinterpret_cast<const TInput **>(working_space + thread_id * sizeof_input_pointer_array());
+      // Some padding variants include (or exclude) the padding values; we
+      // handle this by computing the extent of the padded input tensor and,
+      // from that, the total number of cells captured in the pooling window.
+ const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
+ const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
+ const auto captured_cells = captured_rows * valid_cols;
+ const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
+ for (; n_tile_cols; n_tile_cols--)
{
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- auto outptr_row = outptr + batch * ld_output_batch + start_out_height * ld_output_row;
-
- for (int out_i = start_out_height; out_i < end_out_height; out_i++)
+ // Execute the kernel
+ Invoker<TInput, TOutput, OutputStage>::invoke(
+ reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(),
+ window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
+ );
+
+        // Update the pointers; the output advances by one column and each
+        // input pointer advances by the pool column stride.
+ outptr += output.ld_col;
+ for (auto n = 0u; n < n_valid_cells; n++)
{
- const int start_in_i = out_i * m_args.pool_stride.rows - padding.top;
- const int end_in_i = start_in_i + m_args.pool_window.rows;
-
- // Compute top/bottom padding
- const auto pad_top = static_cast<unsigned int>(std::max(0 - start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(std::max<int>(end_in_i - height, 0));
- const auto valid_rows = input_rows() - pad_top - pad_bottom;
-
- // Compute the number of pooling window rows which are contained in
- // either the valid region of the input tensor, or the padding.
- const auto padded_bottom = std::min<unsigned int>(
- start_in_i + m_args.pool_window.rows, height + padding.bottom
- );
- const auto n_total_rows = padded_bottom - start_in_i;
-
- auto outptr_col = outptr_row;
- auto inptr_row = inptr_batch + (start_in_i + pad_top) * ld_input_row;
-
- for (int out_j = 0, start_in_j = -padding.left;
- out_j < static_cast<int>(output_width);
- out_j++, start_in_j += m_args.pool_stride.cols)
- {
- const int end_in_j = start_in_j + m_args.pool_window.cols;
-
- // Compute left/right padding
- const auto pad_left = static_cast<unsigned int>(std::max(0 - start_in_j, 0));
- const auto pad_right = static_cast<unsigned int>(std::max<int>(0, end_in_j - width));
- const auto valid_cols = input_cols() - pad_left - pad_right;
-
- // Compute the number of pooling window columns which are contained
- // in either the valid region of the input tensor, or the padding.
- const auto padded_right = std::min<unsigned int>(
- start_in_j + m_args.pool_window.cols, width + padding.right
- );
- const auto n_total_cols = padded_right - start_in_j;
-
- // Construct the input pointer array - fill in all valid points
- // contiguously.
- const TInput **ptrs = inptr_array;
- const TInput *rowptr = inptr_row + (start_in_j + pad_left) * ld_input_col;
- for (auto i = 0u; i < valid_rows; i++)
- {
- const TInput *colptr = rowptr;
- for (auto j = 0u; j < valid_cols; j++)
- {
- *(ptrs++) = colptr;
- colptr += ld_input_col;
- }
- rowptr += ld_input_row;
- }
-
- // Compute the number of valid cells
- const auto valid_cells = valid_rows * valid_cols;
- const auto cells_in_range = n_total_rows * n_total_cols;
- const auto window_cells = m_args.exclude_padding ? valid_cells : cells_in_range;
-
- // Get the output pointer for this call
- TOutput *outptr = outptr_col;
- outptr_col += ld_output_col;
-
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::out_rows() * strategy::out_cols() * strategy::pool_rows() * strategy::pool_cols()));
-#endif // CYCLE_PROFILING
- strat.kernel(window_cells, valid_cells, end_channel - start_channel, inptr_array, outptr);
- }
-
- outptr_row += ld_output_row;
+ inptrs[n] += this->m_args.pool_stride.cols * input.ld_col;
}
}
}
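The loop above builds the input pointer array once per row of tiles and then slides it along, which is valid only because a row-padded tile row has no left/right padding, so every tile sees the same pattern of valid cells. The update step, condensed into a standalone helper whose names merely mirror the members used above:

```cpp
#include <cstddef>

// Condensed form of the update step above: advance the output by one
// column and every input pointer by one pool stride.
template <typename TInput, typename TOutput>
void slide_window(TOutput *&outptr, const TInput **inptrs,
                  unsigned int n_valid_cells, std::size_t ld_output_col,
                  std::size_t ld_input_col, unsigned int stride_cols)
{
    outptr += ld_output_col;
    for (unsigned int n = 0; n < n_valid_cells; n++)
    {
        inptrs[n] += stride_cols * ld_input_col;
    }
}
```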
+
+ public:
+ PoolingDepthfirstGeneric(
+ const IGenericDepthfirstStrategy<TInput, TOutput, OutputStage> *strat,
+ const PoolingArgs &args,
+ const OutputStage &os = {}
+ )
+ : DepthfirstDriver<TInput, TOutput>(
+ new GenericDepthfirstWrapper<TInput, TOutput, OutputStage>(strat, args),
+ args
+ ),
+ m_os(os)
+ {
+ }
};
} // namespace pooling
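The Invoker specialisations earlier in this hunk let a single driver call kernels both with and without a trailing requantisation parameter. The same tag-dispatch idea in miniature, with reduced stand-in tags rather than the real types:

```cpp
#include <cstdint>

// Reduced stand-in tags mirroring the real Nothing/Requantize32 types.
struct Nothing {};
struct Requantize32 { int32_t input_offset; };

template <typename OutputStage> struct Invoker;

template <> struct Invoker<Nothing>
{
    template <typename Kern>
    static void invoke(Kern kern, unsigned int n, const Nothing &)
    {
        kern(n);  // Plain variant: no trailing parameter.
    }
};

template <> struct Invoker<Requantize32>
{
    template <typename Kern>
    static void invoke(Kern kern, unsigned int n, const Requantize32 &qp)
    {
        kern(n, qp);  // Quantized variant forwards the requantisation params.
    }
};
```

The driver is written once against Invoker<OutputStage>::invoke, and the extra quantisation argument appears only in the specialisation that needs it.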
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp
deleted file mode 100644
index f3cb9a1d1f..0000000000
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "pool_common.hpp"
-#include "utils.hpp"
-
-namespace arm_conv {
-namespace pooling {
-
-template <class strategy>
-class PoolingDepthfirstGenericQuantized : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type, Requantize32>
-{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
-
- const PoolingArgs m_args; // Copy of arguments
- const Requantize32 m_requant; // Quantization parameters
-
- unsigned int input_rows(void) const
- {
- return m_args.pool_window.rows;
- }
-
- unsigned int input_cols(void) const
- {
- return m_args.pool_window.cols;
- }
-
- public:
- PoolingDepthfirstGenericQuantized(const PoolingArgs &args, const Requantize32 &rq) : m_args(args), m_requant(rq)
- {
- }
-
- PoolingDepthfirstGenericQuantized(PoolingDepthfirstGenericQuantized &) = delete;
- PoolingDepthfirstGenericQuantized &operator=(PoolingDepthfirstGenericQuantized &) = delete;
-
- size_t sizeof_input_pointer_array(void) const
- {
- return sizeof(TInput *) * input_rows() * input_cols();
- }
-
- size_t get_working_size(unsigned int num_threads) const override
- {
- return num_threads * sizeof_input_pointer_array();
- }
-
- void execute(
- const void *const input,
- void *const output,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
- );
- }
-
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
- );
- }
-
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- const unsigned int roundup_output_rows = roundup(output_height, num_threads);
- const unsigned int rows_per_thread = roundup_output_rows / num_threads;
- int start_out_height = static_cast<int>(thread_id * rows_per_thread);
- int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
-
- unsigned int start_channel = 0;
- unsigned int end_channel = channels;
- if(output_height == 1)
- {
- const unsigned int channels_per_thread = roundup(channels, num_threads) / num_threads;
- start_channel = thread_id * channels_per_thread;
- end_channel = std::min(start_channel + channels_per_thread, channels);
-
- // Reset start and end rows
- start_out_height = 0;
- end_out_height = output_height;
- }
-
- if(start_channel >= end_channel)
- {
- // Early exit in case of multiple threads parallelising on channels
- return;
- }
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input) + start_channel;
- TOutput *const outptr = static_cast<TOutput *>(_output) + start_channel;
-
- // Grab the input pointer array
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- const TInput **const inptr_array = reinterpret_cast<const TInput **>(working_space + thread_id * sizeof_input_pointer_array());
-
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
- {
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- const auto outptr_batch = outptr + batch * ld_output_batch;
-
- for (int out_i = start_out_height; out_i < end_out_height; out_i++)
- {
- const int start_in_i = out_i * m_args.pool_stride.rows - padding.top;
- const int end_in_i = start_in_i + m_args.pool_window.rows;
-
- // Compute top/bottom padding
- const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(height) - end_in_i, 0));
-
- // Compute the number of pooling window rows which are contained in
- // either the valid region of the input tensor, or the padding.
- const auto padded_bottom = std::min<unsigned int>(
- start_in_i + m_args.pool_window.rows, height + padding.bottom
- );
- const auto n_total_rows = padded_bottom - start_in_i;
-
- for (int out_j = 0, start_in_j = -padding.left;
- out_j < static_cast<int>(output_width);
- out_j++, start_in_j += m_args.pool_stride.cols)
- {
- const int end_in_j = start_in_j + m_args.pool_window.cols;
-
- // Compute left/right padding
- const auto pad_left = static_cast<unsigned int>(-std::min(start_in_j, 0));
- const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(width) - end_in_j, 0));
-
- // Compute the number of pooling window columns which are contained
- // in either the valid region of the input tensor, or the padding.
- const auto padded_right = std::min<unsigned int>(
- start_in_j + m_args.pool_window.cols, width + padding.right
- );
- const auto n_total_cols = padded_right - start_in_j;
-
- // Construct the input pointer array - fill in all valid points
- // contiguously.
- const TInput **ptrs = inptr_array;
- for (auto i = pad_top; i < input_rows() - pad_bottom; i++)
- {
- // Can skip over the left padding because we will have either the
- // same or less than the previous tile.
- unsigned int j = pad_left;
- const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
- for (; j < input_cols() - pad_right; j++)
- {
- *(ptrs++) = colptr;
- colptr += ld_input_col;
- }
- }
-
- // Compute the number of valid cells
- const auto valid_rows = input_rows() - pad_top - pad_bottom;
- const auto valid_cols = input_cols() - pad_left - pad_right;
- const auto valid_cells = valid_rows * valid_cols;
- const auto cells_in_range = n_total_rows * n_total_cols;
- const auto window_cells = m_args.exclude_padding ? valid_cells : cells_in_range;
-
- // Get the output pointer for this call
- TOutput *outptr = outptr_batch + out_i * ld_output_row + out_j * ld_output_col;
-
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long) 0);
-#endif
- strat.kernel(window_cells, valid_cells, end_channel - start_channel, inptr_array, outptr, m_requant);
- }
- }
- }
- }
-};
-
-} // namespace pooling
-} // namespace arm_conv
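Nothing replaces this file directly: the quantized path is now just another instantiation of PoolingDepthfirstGeneric, with Requantize32 as the OutputStage. A hedged sketch of the replacement call pattern, where SomeS8Strategy is a placeholder rather than a real kernel class:

```cpp
#include <cstdint>

using namespace arm_conv::pooling;

// Sketch: the deleted PoolingDepthfirstGenericQuantized<strategy> becomes an
// instantiation of PoolingDepthfirstGeneric with Requantize32 as OutputStage.
// "SomeS8Strategy" stands in for a concrete
// IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32> implementation.
PoolingCommon<int8_t, int8_t> *make_quantized_pooler(const PoolingArgs &args,
                                                     const Requantize32 &rq)
{
    auto *strat = new SomeS8Strategy(args.cpu_info);
    return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(
        strat, args, rq);
}
```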
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp
index 094c6aa301..a7f3dd3a93 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,12 +33,18 @@
#include "kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp"
@@ -48,19 +54,6 @@
namespace arm_conv {
namespace pooling {
-namespace
-{
- template <class Strategy>
- bool is_supported(const PoolingArgs &args, const Nothing &)
- {
- return ((args.pool_type == Strategy::pooling_type()) &&
- (args.pool_window.rows == Strategy::pool_rows()) &&
- (args.pool_window.cols == Strategy::pool_cols()) &&
- (args.pool_stride.rows == Strategy::stride_rows()) &&
- (args.pool_stride.cols == Strategy::stride_cols()));
- }
-}
-
static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
{
PoolingMethod::DEPTHFIRST,
@@ -70,48 +63,115 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<cpp_nhwc_1x1_stride_any_depthfirst<__fp16>>(args);
+ auto strat = new cpp_nhwc_1x1_stride_any_depthfirst<__fp16>(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
+ auto strat = new sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
+ auto strat = new sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp16_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
+ auto strat = new sme_fp16_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp16_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::MAX;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
+ auto strat = new sme_fp16_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst",
- is_supported<sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirst<sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst",
- is_supported<sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, os);
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirst<sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp16_nhwc_avg_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::AVERAGE;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<sve_fp16_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new sve_fp16_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp16_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<sve_fp16_nhwc_max_generic_depthfirst>(args);
+ auto strat = new sve_fp16_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
PoolingMethod::DEPTHFIRST,
@@ -119,7 +179,8 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
is_supported<a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirst<a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
},
},
{
@@ -128,7 +189,8 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
is_supported<a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirst<a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
},
},
{
@@ -137,7 +199,8 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<a64_fp16_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new a64_fp16_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
{
@@ -146,7 +209,8 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<a64_fp16_nhwc_max_generic_depthfirst>(args);
+ auto strat = new a64_fp16_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
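A pattern repeats throughout these tables: each entry pairs a name with an is_supported predicate and a factory lambda, and the behavioural change in this diff is that SVE/SME eligibility moved from compile-time #if guards to runtime CpuInfo queries. The shape of one entry after the change, using an illustrative kernel name rather than a real one:

```cpp
// Shape of one table entry after this change (illustrative kernel name):
{
    PoolingMethod::DEPTHFIRST,
    "sve_example_nhwc_depthfirst",
    // Runtime gate: the binary may carry SVE kernels even when the host
    // CPU cannot execute them, so query CpuInfo before shape-matching.
    [] (const PoolingArgs &args, const Nothing &os) -> bool {
        return args.cpu_info->has_sve() &&
               is_supported<sve_example_nhwc_depthfirst>(args, os);
    },
    nullptr,  // No cycle estimate supplied for this method.
    [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
        auto strat = new sve_example_nhwc_depthfirst(args.cpu_info);
        return new PoolingDepthfirst<float>(strat, args);
    },
},
```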
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp
index 002115d78c..99d106583e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,12 +30,18 @@
#include "kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp"
@@ -45,19 +51,6 @@
namespace arm_conv {
namespace pooling {
-namespace
-{
- template <class Strategy>
- bool is_supported(const PoolingArgs &args, const Nothing &)
- {
- return ((args.pool_type == Strategy::pooling_type()) &&
- (args.pool_window.rows == Strategy::pool_rows()) &&
- (args.pool_window.cols == Strategy::pool_cols()) &&
- (args.pool_stride.rows == Strategy::stride_rows()) &&
- (args.pool_stride.cols == Strategy::stride_cols()));
- }
-}
-
static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
{
PoolingMethod::DEPTHFIRST,
@@ -67,55 +60,123 @@ static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<cpp_nhwc_1x1_stride_any_depthfirst<float>>(args);
+ auto strat = new cpp_nhwc_1x1_stride_any_depthfirst<float>(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float, float, Nothing>(strat, args);
},
},
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
+ auto strat = new sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
+ auto strat = new sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp32_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
+ auto strat = new sme_fp32_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp32_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::MAX;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
+ auto strat = new sme_fp32_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst",
- is_supported<sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirst<sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst",
- is_supported<sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, os);
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirst<sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp32_nhwc_avg_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::AVERAGE;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<sve_fp32_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new sve_fp32_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp32_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<sve_fp32_nhwc_max_generic_depthfirst>(args);
+ auto strat = new sve_fp32_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
},
},
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst",
is_supported<a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirst<a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
},
},
{
@@ -124,7 +185,8 @@ static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
is_supported<a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirst<a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
},
},
{
@@ -133,7 +195,8 @@ static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<a64_fp32_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new a64_fp32_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
},
},
{
@@ -142,7 +205,8 @@ static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<a64_fp32_nhwc_max_generic_depthfirst>(args);
+ auto strat = new a64_fp32_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
},
},
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp
index 3d968b84e5..235aa1b635 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,7 +39,7 @@ struct PoolingImplementation
const char * name;
std::function<bool(const PoolingArgs &, const OutputStage &)> is_supported;
std::function<uint64_t(const PoolingArgs &, const OutputStage &)> cycle_estimate;
- std::function<PoolingCommon<TInput, TOutput, OutputStage> *(const PoolingArgs &, const OutputStage &)> initialise;
+ std::function<PoolingCommon<TInput, TOutput> *(const PoolingArgs &, const OutputStage &)> initialise;
bool get_is_supported(const PoolingArgs &args, const OutputStage &os) const
{
@@ -51,12 +51,15 @@ struct PoolingImplementation
return (cycle_estimate == nullptr) ? 0 : cycle_estimate(args, os);
}
- PoolingCommon<TInput, TOutput, OutputStage> *get_instance(const PoolingArgs &args, const OutputStage &os) const
+ PoolingCommon<TInput, TOutput> *get_instance(const PoolingArgs &args, const OutputStage &os) const
{
return initialise(args, os);
}
};
+/**
+ * \relates PoolingImplementation
+ *
+ * Table of candidate pooling implementations for the given types, walked by
+ * find_implementation() below.
+ */
template <typename TInput, typename TOutput, class OutputStage = Nothing>
const PoolingImplementation<TInput, TOutput, OutputStage> *pooling_implementation_list();
@@ -92,11 +95,21 @@ bool find_implementation(
}
template <typename TInput, typename TOutput, class OutputStage>
-UniquePoolingCommon<TInput, TOutput, OutputStage> pooling(const PoolingArgs &args, const OutputStage &os)
+UniquePoolingCommon<TInput, TOutput> pooling(const PoolingArgs &args, const OutputStage &os)
{
const PoolingImplementation<TInput, TOutput, OutputStage> *impl = nullptr;
const bool success = find_implementation<TInput, TOutput, OutputStage>(args, os, impl);
- return UniquePoolingCommon<TInput, TOutput, OutputStage>(success ? impl->get_instance(args, os) : nullptr);
+ return UniquePoolingCommon<TInput, TOutput>(success ? impl->get_instance(args, os) : nullptr);
+}
+
+template <class Strategy>
+bool is_supported(const PoolingArgs &args, const Nothing &)
+{
+ return ((args.pool_type == Strategy::pooling_type) &&
+ (args.pool_window.rows == Strategy::pool_rows) &&
+ (args.pool_window.cols == Strategy::pool_cols) &&
+ (args.pool_stride.rows == Strategy::stride_rows) &&
+ (args.pool_stride.cols == Strategy::stride_cols));
}
} // namespace pooling
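Note that the relocated is_supported reads the strategy's shape parameters as static constants (pool_rows) where the deleted per-file copies called static member functions (pool_rows()), matching the new strategy classes. End to end, selection now looks like the following sketch, assuming an instantiated implementation list for the chosen types:

```cpp
#include <memory>

using namespace arm_conv::pooling;

// Select and construct the best available pooling engine for `args`.
// find_implementation() runs internally: it walks the table returned by
// pooling_implementation_list(), keeps entries whose is_supported predicate
// passes, and prefers the lowest cycle_estimate.
UniquePoolingCommon<float, float> select_pooler(const PoolingArgs &args)
{
    return pooling<float, float, Nothing>(args, Nothing{});
}
```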
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp
index 490fc0d863..8d08ddc43f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,13 +30,16 @@
#include "kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
-#if defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_s8_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp"
-#endif // defined(SVE2)
#include "kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_s8_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/a64_s8_nhwc_max_generic_depthfirst.hpp"
@@ -47,19 +50,6 @@
namespace arm_conv {
namespace pooling {
-namespace
-{
- template <class Strategy>
- bool is_supported(const PoolingArgs &args, const Nothing &)
- {
- return ((args.pool_type == Strategy::pooling_type()) &&
- (args.pool_window.rows == Strategy::pool_rows()) &&
- (args.pool_window.cols == Strategy::pool_cols()) &&
- (args.pool_stride.rows == Strategy::stride_rows()) &&
- (args.pool_stride.cols == Strategy::stride_cols()));
- }
-}
-
static const PoolingImplementation<int8_t, int8_t> pooling_s8_methods[] = {
{
PoolingMethod::DEPTHFIRST,
@@ -69,48 +59,97 @@ static const PoolingImplementation<int8_t, int8_t> pooling_s8_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<cpp_nhwc_1x1_stride_any_depthfirst<int8_t>>(args);
+ auto strat = new cpp_nhwc_1x1_stride_any_depthfirst<int8_t>(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
-#if defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
{
PoolingMethod::DEPTHFIRST,
- "sve_s8_nhwc_avg_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
+ "sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<int8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_s8_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sme_s8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_s8_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<sve_s8_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new sme_s8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
-#endif // defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst",
- is_supported<sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<int8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sve_s8_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::AVERAGE;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirst<sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_s8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_s8_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<sve_s8_nhwc_max_generic_depthfirst>(args);
+ auto strat = new sve_s8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst",
is_supported<a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirst<a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<int8_t>(strat, args);
},
},
{
@@ -119,7 +158,8 @@ static const PoolingImplementation<int8_t, int8_t> pooling_s8_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<a64_s8_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new a64_s8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
{
@@ -128,7 +168,8 @@ static const PoolingImplementation<int8_t, int8_t> pooling_s8_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<a64_s8_nhwc_max_generic_depthfirst>(args);
+ auto strat = new a64_s8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp
index fd4e045035..dcb3c8f57c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,17 @@
#include "arm_gemm_local.hpp"
#include "pooling_implementation.hpp"
-#include "pooling_depthfirst_generic_quantized.hpp"
+#include "pooling_depthfirst_generic.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp"
#endif // defined(__aarch64__)
@@ -41,30 +45,60 @@
namespace arm_conv {
namespace pooling {
-static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_u8_methods[] = {
+static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_s8q_methods[] = {
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_s8q_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sme_s8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_s8q_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::MAX;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sme_s8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_s8q_nhwc_avg_generic_depthfirst",
[] (const PoolingArgs &args, const Requantize32 &) -> bool {
- return args.pool_type == PoolingType::AVERAGE;
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::AVERAGE;
},
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<sve_s8q_nhwc_avg_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_s8q_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Requantize32 &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<sve_s8q_nhwc_max_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
},
},
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_s8q_nhwc_avg_generic_depthfirst",
@@ -72,8 +106,9 @@ static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_u8_meth
return args.pool_type == PoolingType::AVERAGE;
},
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<a64_s8q_nhwc_avg_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
},
},
{
@@ -81,8 +116,9 @@ static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_u8_meth
"a64_s8q_nhwc_max_generic_depthfirst",
[] (const PoolingArgs &args, const Requantize32 &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<a64_s8q_nhwc_max_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
},
},
#endif // defined(__aarch64__)
@@ -92,10 +128,10 @@ static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_u8_meth
template <>
const PoolingImplementation<int8_t, int8_t, Requantize32> *pooling_implementation_list()
{
- return pooling_u8_methods;
+ return pooling_s8q_methods;
}
-template UniquePoolingCommon<int8_t, int8_t, Requantize32> pooling(const PoolingArgs &, const Requantize32 &);
+template UniquePoolingCommon<int8_t, int8_t> pooling(const PoolingArgs &, const Requantize32 &);
} // namespace pooling
} // namespace arm_conv
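For context, the preference-ordered tables above are walked by
find_implementation (declared in pooling_implementation.hpp). A rough
first-match sketch using only the hooks visible in this diff — the
library's actual routine returns its result through a reference
parameter and may apply extra filtering:

// Hedged sketch of table-driven selection via get_is_supported; the
// caller then materialises the kernel with impl->get_instance(args, os).
template <typename TInput, typename TOutput, class OutputStage, size_t N>
const PoolingImplementation<TInput, TOutput, OutputStage> *
select_first_supported(const PoolingImplementation<TInput, TOutput, OutputStage> (&table)[N],
                       const PoolingArgs &args, const OutputStage &os) {
  for (const auto &impl : table) {
    if (impl.get_is_supported(args, os)) {
      return &impl;
    }
  }
  return nullptr;
}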
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp
index 052354922e..ee5a79b4ff 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,13 +30,16 @@
#include "kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
-#if defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_u8_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp"
-#endif // defined(SVE2)
#include "kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_u8_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/a64_u8_nhwc_max_generic_depthfirst.hpp"
@@ -47,19 +50,6 @@
namespace arm_conv {
namespace pooling {
-namespace
-{
- template <class Strategy>
- bool is_supported(const PoolingArgs &args, const Nothing &)
- {
- return ((args.pool_type == Strategy::pooling_type()) &&
- (args.pool_window.rows == Strategy::pool_rows()) &&
- (args.pool_window.cols == Strategy::pool_cols()) &&
- (args.pool_stride.rows == Strategy::stride_rows()) &&
- (args.pool_stride.cols == Strategy::stride_cols()));
- }
-}
-
static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
{
PoolingMethod::DEPTHFIRST,
@@ -69,15 +59,28 @@ static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<cpp_nhwc_1x1_stride_any_depthfirst<uint8_t>>(args);
+ auto strat = new cpp_nhwc_1x1_stride_any_depthfirst<uint8_t>(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
-#if defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
{
PoolingMethod::DEPTHFIRST,
- "sve_u8_nhwc_avg_generic_depthfirst",
+ "sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<uint8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_u8_nhwc_avg_generic_depthfirst",
[] (const PoolingArgs &args, const Nothing &) -> bool {
// This kernel can only be used when there is either no padding, or we don't care
// about the value of the padding. Otherwise, we would need to pass in the zero-point
@@ -85,40 +88,82 @@ static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
return (args.exclude_padding ||
(args.padding.top == 0 && args.padding.bottom == 0 &&
args.padding.left == 0 && args.padding.right == 0)
- ) && args.pool_type == PoolingType::AVERAGE;
+ ) && args.pool_type == PoolingType::AVERAGE &&
+ args.cpu_info->has_sme2();
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sme_u8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_u8_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::MAX;
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<sve_u8_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new sme_u8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
-#endif // defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst",
- is_supported<sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<uint8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sve_u8_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ // This kernel can only be used when there is either no padding, or we don't care
+ // about the value of the padding. Otherwise, we would need to pass in the zero-point
+ // for the quantization regime.
+ return (args.exclude_padding ||
+ (args.padding.top == 0 && args.padding.bottom == 0 &&
+ args.padding.left == 0 && args.padding.right == 0)
+ ) && args.pool_type == PoolingType::AVERAGE &&
+ args.cpu_info->has_sve2();
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirst<sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_u8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_u8_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<sve_u8_nhwc_max_generic_depthfirst>(args);
+ auto strat = new sve_u8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst",
is_supported<a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirst<a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<uint8_t>(strat, args);
},
},
{
@@ -135,7 +180,8 @@ static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<a64_u8_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new a64_u8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
{
@@ -144,7 +190,8 @@ static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<a64_u8_nhwc_max_generic_depthfirst>(args);
+ auto strat = new a64_u8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp
index 41303fb418..cd1b02889c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,17 @@
#include "arm_gemm_local.hpp"
#include "pooling_implementation.hpp"
-#include "pooling_depthfirst_generic_quantized.hpp"
+#include "pooling_depthfirst_generic.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp"
#endif // defined(__aarch64__)
@@ -41,30 +45,60 @@
namespace arm_conv {
namespace pooling {
-static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8_methods[] = {
+static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8q_methods[] = {
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_u8q_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sme_u8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_u8q_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::MAX;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sme_u8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_u8q_nhwc_avg_generic_depthfirst",
[] (const PoolingArgs &args, const Requantize32 &) -> bool {
- return args.pool_type == PoolingType::AVERAGE;
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::AVERAGE;
},
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<sve_u8q_nhwc_avg_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_u8q_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Requantize32 &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<sve_u8q_nhwc_max_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
},
},
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_u8q_nhwc_avg_generic_depthfirst",
@@ -72,8 +106,9 @@ static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8_me
return args.pool_type == PoolingType::AVERAGE;
},
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<a64_u8q_nhwc_avg_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
},
},
{
@@ -81,8 +116,9 @@ static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8_me
"a64_u8q_nhwc_max_generic_depthfirst",
[] (const PoolingArgs &args, const Requantize32 &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<a64_u8q_nhwc_max_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
},
},
#endif // defined(__aarch64__)
@@ -92,10 +128,10 @@ static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8_me
template <>
const PoolingImplementation<uint8_t, uint8_t, Requantize32> *pooling_implementation_list()
{
- return pooling_u8_methods;
+ return pooling_u8q_methods;
}
-template UniquePoolingCommon<uint8_t, uint8_t, Requantize32> pooling(const PoolingArgs &, const Requantize32 &);
+template UniquePoolingCommon<uint8_t, uint8_t> pooling(const PoolingArgs &, const Requantize32 &);
} // namespace pooling
} // namespace arm_conv
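Caller-side, the entry point instantiated above now returns a
UniquePoolingCommon keyed only on the input/output types. A hedged
usage sketch — 'args' and 'rq' are assumed to be fully-populated
PoolingArgs and Requantize32 instances, whose construction is elided:

// Hedged sketch: requesting a requantizing u8 pooling kernel.
using namespace arm_conv::pooling;
UniquePoolingCommon<uint8_t, uint8_t> kernel =
    pooling<uint8_t, uint8_t, Requantize32>(args, rq);
if (!kernel) {
  // No registered implementation accepted these arguments.
}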
diff --git a/src/core/NEON/kernels/arm_gemm/asmlib.hpp b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
index 7766656adb..4f2c47bf11 100644
--- a/src/core/NEON/kernels/arm_gemm/asmlib.hpp
+++ b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018,2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,9 +37,6 @@
#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n"
#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"
-// Lee's uarchsim hack
-//#define ASM_PREFETCH(address) "LDNP x20, x21, " address "\n"
-
// No preload at all
//#define ASM_PREFETCH(address) ""
#else
diff --git a/src/core/NEON/kernels/arm_gemm/convolver.hpp b/src/core/NEON/kernels/arm_gemm/convolver.hpp
index 879d95f5bb..b15f669132 100644
--- a/src/core/NEON/kernels/arm_gemm/convolver.hpp
+++ b/src/core/NEON/kernels/arm_gemm/convolver.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020,2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -103,11 +103,15 @@ private:
return (m_length_remaining == 0);
}
+ // Compute a block of output pointers, accounting for padding.
+ // This is performance critical.
std::tuple<unsigned int, unsigned int> next_block(const T ** const row_ptr) {
if (finished()) {
return std::make_tuple(0, 0);
}
+ const T *pad_ptr = m_convolver.m_pad_row.data();
+
// "in_width" in the amount of data that will be read in (copied)
// "out_width" is the total amount of data that will be produced (including padding)
unsigned int offset = (m_current_pos == m_parent.m_start_pos) ? m_parent.m_start_offset : 0;
@@ -117,23 +121,83 @@ private:
unsigned int output_y = m_start_output_y;
unsigned int output_x = m_start_output_x;
- for (unsigned int row=0; row<m_active_height; row++) {
+ // Loop over "row" (output points), but really there is one
+ // trip through this outer loop per row of output to
+ // minimize redundant padding calculations.
+ unsigned int row=0;
+ while (row < m_active_height) {
int input_y = (output_y * m_convolver.m_params.output_stride_h) + m_convolver.m_kernel_y[m_current_pos];
int input_x = (output_x * m_convolver.m_params.output_stride_w) + m_convolver.m_kernel_x[m_current_pos];
- // Out-of-bounds points will read the padding data,
- // otherwise find the correct address in the input image.
- if (input_y < 0 || input_y >= m_convolver.m_params.input_height || input_x < 0 || input_x >= m_convolver.m_params.input_width) {
- row_ptr[row] = m_convolver.m_pad_row.data();
- } else {
- row_ptr[row] = m_parent.m_input_base + ((input_y * m_convolver.m_params.input_width) + input_x) * m_parent.m_input_stride;
+ // Hoist the row base pointer computation out of the inner loops.
+ const T *base_ptr = m_parent.m_input_base +
+ (input_y * m_convolver.m_params.input_width * m_parent.m_input_stride);
+
+ // To start with, check that the input row is in bounds. If
+ // not, (at least) this entire output row must be
+ // padding, so handle accordingly.
+
+ // If input_y is off the bottom of the input, we are
+ // going to get padding for every remaining output
+ // point.
+ if (input_y >= m_convolver.m_params.input_height) {
+ while (row < m_active_height) {
+ row_ptr[row++] = pad_ptr;
+ }
+ break;
}
- output_x++;
- if (output_x == m_convolver.m_params.output_width) {
- output_y++;
- output_x=0;
+ // If input_y is less than zero, we are going to get
+ // padding for the rest of this output row.
+ if (input_y < 0) {
+ while (output_x < m_convolver.m_params.output_width && row<m_active_height) {
+ row_ptr[row++] = pad_ptr;
+ output_x++;
+ }
+ goto next_row;
}
+
+ // The input row is in bounds - so handle left
+ // padding, then non-padding output, then right
+ // padding.
+
+ // Left padding
+ while (row < m_active_height && input_x < 0) {
+ row_ptr[row++] = pad_ptr;
+
+ output_x++;
+ input_x+=m_convolver.m_params.output_stride_w;
+
+ // Need to detect the end of the row, in case it's
+ // all padding.
+ if (output_x == m_convolver.m_params.output_width) {
+ goto next_row;
+ }
+ }
+
+ // Non-padding output, using the hoisted row base pointer.
+ while (row < m_active_height && input_x < m_convolver.m_params.input_width) {
+ row_ptr[row++] = base_ptr + (input_x * m_parent.m_input_stride);
+
+ output_x++;
+ input_x+=m_convolver.m_params.output_stride_w;
+
+ if (output_x == m_convolver.m_params.output_width) {
+ goto next_row;
+ }
+ }
+
+ // Right padding.
+ while (row < m_active_height && output_x < m_convolver.m_params.output_width) {
+ row_ptr[row++] = pad_ptr;
+ output_x++;
+ }
+
+ // Update output indices for the next row. The label below is a
+ // "goto" target because the nested loops above bail out at end-of-row.
+next_row:
+ output_x=0;
+ output_y++;
}
m_current_pos++;
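The rewritten next_block above replaces a per-point bounds check with
three runs per output row: left padding, in-bounds pointers, right
padding. A standalone illustration of that three-phase fill — all
names here are local to the sketch, not library code:

// Hedged sketch: fill one output row's worth of input-row pointers.
// 'base' points at the start of the in-bounds input row, 'pad' at the
// shared padding row; 'first_input_x' may be negative (left padding).
static void fill_row_ptrs(const float *base, const float *pad,
                          int input_width, int stride_w,
                          int first_input_x, int out_points,
                          const float **row_ptr) {
  int i = 0, input_x = first_input_x;
  while (i < out_points && input_x < 0) {           // left padding
    row_ptr[i++] = pad;
    input_x += stride_w;
  }
  while (i < out_points && input_x < input_width) { // in-bounds points
    row_ptr[i++] = base + input_x;
    input_x += stride_w;
  }
  while (i < out_points) {                          // right padding
    row_ptr[i++] = pad;
  }
}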
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
index d8134c4bb5..0ddca04846 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,84 +31,191 @@
#include "gemv_batched.hpp"
#include "gemv_pretransposed.hpp"
+#include "kernels/a32_sgemm_8x6.hpp"
+
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#include "kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp"
+#include "kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp"
+#include "kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp"
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
#include "kernels/a64_hybrid_bf16fp32_dot_6x16.hpp"
+#include "kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp"
#include "kernels/a64_interleaved_bf16fp32_dot_8x12.hpp"
#include "kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp"
#include "kernels/a64_sgemm_8x12.hpp"
-#include "kernels/a32_sgemm_8x6.hpp"
+
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#include "kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp"
+#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+#include "kernels/sme2_gemv_bf16fp32_dot_16VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SME2
+
+#include "kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp"
+#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp"
+#include "kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp"
#include "kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp"
#include "kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp"
-#include "kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SVE
namespace arm_gemm {
static const GemmImplementation<bfloat16, float> gemm_bf16_methods[] =
{
-#ifdef V8P6_BF
-#ifdef __ARM_FEATURE_SVE
-{ // gemm_bf16_interleaved
+#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_BF16
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+// SME kernels
+{
+ GemmMethod::GEMM_HYBRID,
+ "sme2_gemv_bf16fp32_dot_16VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2() && args._Msize==1 && args._nbatches==1 && !args._indirect_input; },
+ nullptr,
+ [](const GemmArgs &args) { return new GemvPretransposed<cls_sme2_gemv_bf16fp32_dot_16VL, bfloat16, float>(args); }
+},
+{
GemmMethod::GEMM_INTERLEAVED,
- "sve_interleaved_bf16fp32_mmla_8x3VL",
- [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, float>(args); }
+ "sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2(); },
+ [](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
+ return args._Nsize >= 8*VL || args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
+ [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL, bfloat16, float>(args); }
},
{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2(); },
+ [](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
+ return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
+ [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL, bfloat16, float>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2(); },
+ nullptr,
+ [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL, bfloat16, float>(args); }
+},
+#endif // ARM_COMPUTE_ENABLE_SME2
+// gemm_bf16_interleaved
+GemmImplementation<bfloat16, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_bf16fp32_mmla_8x3VL",
+ [](const GemmArgs &args) { return args._ci->has_svebf16() && (args._Ksize>4); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_bf16fp32_mmla_6x4VL",
+ [](const GemmArgs &args) { return args._ci->has_svebf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_bf16fp32_mmla_6x4VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_mmla_6x4VL, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_bf16fp32_dot_6x4VL",
- [](const GemmArgs &args) { return args._ci->has_sve(); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && ((args._Ksize <= 128) && (args._Nsize <= 128)); },
+ [](const GemmArgs &args) { return args._ci->has_svebf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, float>(args); }
-},
-{ // gemm_bf16_interleaved
+),
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_bf16fp32_dot_8x3VL",
- [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>2); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return args._ci->has_svebf16() && (args._Ksize>2); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, float>(args); }
-},
-# endif // SVE
-{ // gemm_bf16_interleaved
+),
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+GemmImplementation<bfloat16, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_ffinterleaved_bf16fp32_mmla_8x3VL",
+ KernelWeightFormat::VL2VL_BL64,
+ [](const GemmArgs &args) { return args._ci->has_svebf16(); },
+ [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_ffhybrid_bf16fp32_mmla_6x4VL",
+ KernelWeightFormat::VL2VL_BL64,
+ [](const GemmArgs &args) { return args._ci->has_svebf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_sve_ffhybrid_bf16fp32_mmla_6x4VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_sve_ffhybrid_bf16fp32_mmla_6x4VL, bfloat16, float>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#endif // ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<bfloat16, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_bf16fp32_mmla_6x16",
+ [](const GemmArgs &args) { return args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_bf16fp32_mmla_6x16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_bf16fp32_mmla_6x16, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_bf16fp32_mmla_8x12",
- [](const GemmArgs &args) { return (args._Ksize>4); },
- nullptr,
+ [](const GemmArgs &args) { return args._ci->has_bf16() && (args._Ksize>4); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, bfloat16, float>(args); }
-},
-{
+),
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_bf16fp32_dot_6x16",
- nullptr,
- nullptr,
+ [](const GemmArgs &args) { return args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_bf16fp32_dot_6x16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_bf16fp32_dot_6x16, bfloat16, float>(args); }
-},
-{ // gemm_bf16_interleaved
+),
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_bf16fp32_dot_8x12",
- [](const GemmArgs &args) { return (args._Ksize>2); },
- nullptr,
+ [](const GemmArgs &args) { return args._ci->has_bf16() && (args._Ksize>2); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_dot_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_dot_8x12, bfloat16, float>(args); }
-},
-#endif // V8P6_BF
-#ifdef __aarch64__
-{
+),
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+GemmImplementation<bfloat16, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_ffinterleaved_bf16fp32_mmla_8x12",
+ KernelWeightFormat::VL256_BL64,
+ [](const GemmArgs &args) { return args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_ffhybrid_bf16fp32_mmla_6x16",
+ KernelWeightFormat::VL256_BL64,
+ [](const GemmArgs &args) { return args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_bf16fp32_mmla_6x16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_bf16fp32_mmla_6x16, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_ffinterleaved_bf16fp32_dot_8x12",
+ KernelWeightFormat::VL128_BL32,
+ [](const GemmArgs &args) { return args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_dot_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_dot_8x12, bfloat16, float>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_sgemm_8x12",
nullptr,
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, bfloat16, float>(args); }
-},
-#elif defined(__arm__)
-{
- GemmMethod::GEMM_INTERLEAVED,
- "sgemm_8x6",
- nullptr,
- nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<sgemm_8x6, bfloat16, float>(args); }
-},
-#else
-# error "Unknown Architecture"
-#endif
+),
+#endif // ARM_COMPUTE_ENABLE_BF16
+#endif // __aarch64__
{
GemmMethod::DEFAULT,
"",
@@ -125,6 +232,7 @@ const GemmImplementation<bfloat16, float> *gemm_implementation_list<bfloat16, fl
/* Explicitly instantiate the external functions for these types. */
template UniqueGemmCommon<bfloat16, float> gemm<bfloat16, float, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<bfloat16, float, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
template KernelDescription get_gemm_method<bfloat16, float, Nothing>(const GemmArgs &args, const Nothing &);
template std::vector<KernelDescription> get_compatible_kernels<bfloat16, float, Nothing>(const GemmArgs &args, const Nothing &);
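Most entries above moved from plain initializer lists to
GemmImplementation::with_estimate, swapping the boolean "recommended"
hook for a cycle_estimate hook, so the framework can rank supported
candidates by predicted cost instead of a fixed preference order. A
hedged sketch of that ranking step, with names local to the sketch:

// Hedged sketch: among supported candidates, pick the lowest estimate.
// The library's real selection logic is more involved.
#include <cstdint>
#include <limits>

struct CandidateView {
  bool supported;              // result of the is_supported hook
  uint64_t estimated_cycles;   // result of the cycle_estimate hook
};

int pick_best(const CandidateView *cands, int n) {
  int best = -1;
  uint64_t best_cycles = std::numeric_limits<uint64_t>::max();
  for (int i = 0; i < n; i++) {
    if (cands[i].supported && cands[i].estimated_cycles < best_cycles) {
      best = i;
      best_cycles = cands[i].estimated_cycles;
    }
  }
  return best; // -1 if nothing was supported
}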
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp
new file mode 100644
index 0000000000..aa761b46e4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017-2020, 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "bfloat.hpp"
+#include "gemm_implementation.hpp"
+#include "gemm_interleaved.hpp"
+
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#include "kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp"
+#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+
+namespace arm_gemm {
+
+static const GemmImplementation<bfloat16, bfloat16> gemm_bf16bf16_methods[] =
+{
+#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_BF16
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+GemmImplementation<bfloat16, bfloat16>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_ffinterleaved_bf16fp32_mmla_8x12",
+ KernelWeightFormat::VL256_BL64,
+ [](const GemmArgs &args) { return args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, bfloat16>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, bfloat16>(args); }
+),
+GemmImplementation<bfloat16, bfloat16>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_ffinterleaved_bf16fp32_mmla_8x3VL",
+ KernelWeightFormat::VL2VL_BL64,
+ [](const GemmArgs &args) { return args._ci->has_svebf16(); },
+ [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, bfloat16, bfloat16>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, bfloat16, bfloat16>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#endif // ARM_COMPUTE_ENABLE_BF16
+#endif // __aarch64__
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr
+}
+};
+
+template<>
+const GemmImplementation<bfloat16, bfloat16> *gemm_implementation_list<bfloat16, bfloat16>() {
+ return gemm_bf16bf16_methods;
+}
+
+/* Explicitly instantiate the external functions for these types. */
+template UniqueGemmCommon<bfloat16, bfloat16> gemm<bfloat16, bfloat16, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<bfloat16, bfloat16, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<bfloat16, bfloat16, Nothing>(const GemmArgs &args, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<bfloat16, bfloat16, Nothing>(const GemmArgs &args, const Nothing &);
+
+} // namespace arm_gemm
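The has_opt_gemm instantiations added in this patch let callers probe
for the fixed-format kernels guarded by
ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS and learn which weight layout
they require (the KernelWeightFormat tags above, e.g. VL256_BL64, name
such layouts). A hedged usage sketch — 'args' is assumed to be a
fully-populated GemmArgs, and WeightFormat::UNSPECIFIED is assumed as
the neutral initial value:

// Hedged sketch: query for an optimised fixed-format bf16 GEMM.
arm_gemm::WeightFormat wf = arm_gemm::WeightFormat::UNSPECIFIED;
if (arm_gemm::has_opt_gemm<bfloat16, bfloat16, arm_gemm::Nothing>(wf, args, {})) {
  // Weights must be rearranged into layout 'wf' before the kernel
  // can be used.
}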
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index 8e355c8f2c..c7adf8e4ac 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
// This can only be built if the target/compiler supports FP16 arguments.
-#ifdef __ARM_FP16_ARGS
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
#include "arm_gemm.hpp"
@@ -32,59 +32,131 @@
#include "gemm_hybrid_indirect.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
+#include "gemv_pretransposed.hpp"
#include "kernels/a32_sgemm_8x6.hpp"
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#include "kernels/a64_ffhybrid_fp16_mla_6x32.hpp"
+#include "kernels/a64_ffinterleaved_fp16_mla_8x24.hpp"
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
#include "kernels/a64_hgemm_8x24.hpp"
#include "kernels/a64_hybrid_fp16_mla_6x32.hpp"
#include "kernels/a64_sgemm_8x12.hpp"
+#ifdef ARM_COMPUTE_ENABLE_SME2
+#include "kernels/sme2_gemv_fp16fp32fp16_dot_16VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SME2
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#include "kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp"
+#include "kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
#include "kernels/sve_hybrid_fp16_mla_6x4VL.hpp"
#include "kernels/sve_interleaved_fp16_mla_8x3VL.hpp"
namespace arm_gemm {
static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = {
-#if defined(__ARM_FEATURE_SVE)
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
{
GemmMethod::GEMM_HYBRID,
- "sve_hybrid_fp16_mla_6x4VL",
- [](const GemmArgs &args) { return args._ci->has_sve(); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>(args); }
+ "sme2_gemv_fp16fp32fp16_dot_16VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2() && args._Msize==1 && args._nbatches==1 && !args._indirect_input; },
+ nullptr,
+ [](const GemmArgs &args) { return new GemvPretransposed<cls_sme2_gemv_fp16fp32fp16_dot_16VL, __fp16, __fp16>(args); }
},
{
GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2(); },
+ [](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
+ return args._Nsize >= 8*VL || args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL, __fp16, __fp16, Nothing, false, false, false, true>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2(); },
+ [](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
+ return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL, __fp16, __fp16, Nothing, false, false, false, true>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2(); },
+ nullptr,
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL, __fp16, __fp16, Nothing, false, false, false, true>(args); }
+},
+#endif // ARM_COMPUTE_ENABLE_SME2
+GemmImplementation<__fp16, __fp16>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_fp16_mla_6x4VL",
+ [](const GemmArgs &args) { return args._ci->has_sve(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp16_mla_6x4VL, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp16_mla_6x4VL, __fp16, __fp16>(args); }
+),
+GemmImplementation<__fp16, __fp16>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_fp16_mla_8x3VL",
[](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize > 4); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_fp16_mla_8x3VL, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp16_mla_8x3VL, __fp16, __fp16>(args); }
-},
-#endif
-
-#if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
+),
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+GemmImplementation<__fp16, __fp16>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_ffinterleaved_fp16_mla_8x3VL",
+ KernelWeightFormat::VL1VL_BL16,
+ [](const GemmArgs &args) { return args._ci->has_sve(); },
+ [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_sve_ffinterleaved_fp16_mla_8x3VL, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_sve_ffinterleaved_fp16_mla_8x3VL, __fp16, __fp16>(args); }
+),
+GemmImplementation<__fp16, __fp16>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_ffhybrid_fp16_mla_6x4VL",
+ KernelWeightFormat::VL1VL_BL16,
+ [](const GemmArgs &args) { return args._ci->has_sve(); },
+ [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_sve_ffhybrid_fp16_mla_6x4VL, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_sve_ffhybrid_fp16_mla_6x4VL, __fp16, __fp16>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#endif // ARM_COMPUTE_ENABLE_SVE
+#if defined(__aarch64__)
GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_fp16_mla_6x32",
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
[](const GemmArgs &args) { return args._ci->has_fp16(); },
-#else
- nullptr,
-#endif
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>::estimate_cycles(args, cls_a64_hybrid_fp16_mla_6x32::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>(args); }
),
GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_hgemm_8x24",
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
[](const GemmArgs &args) { return args._ci->has_fp16(); },
-#else
- nullptr,
-#endif
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16>::estimate_cycles(args, cls_a64_hgemm_8x24::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16>(args); }
),
-#endif // aarch64 && FP16
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+GemmImplementation<__fp16, __fp16>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_ffinterleaved_fp16_mla_8x24",
+ KernelWeightFormat::VL128_BL16,
+ [](const GemmArgs &args) { return args._ci->has_fp16(); },
+ [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_fp16_mla_8x24, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
+ [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_fp16_mla_8x24, __fp16, __fp16>(args); }
+),
+GemmImplementation<__fp16, __fp16>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_ffhybrid_fp16_mla_6x32",
+ KernelWeightFormat::VL128_BL16,
+ [](const GemmArgs &args) { return args._ci->has_fp16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp16_mla_6x32, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp16_mla_6x32, __fp16, __fp16>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
{
GemmMethod::GEMM_INTERLEAVED,
"a64_sgemm_8x12",
@@ -119,9 +191,10 @@ const GemmImplementation<__fp16, __fp16> *gemm_implementation_list<__fp16, __fp1
/* Explicitly instantiate the external functions for these types. */
template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<__fp16, __fp16, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
template KernelDescription get_gemm_method<__fp16, __fp16, Nothing>(const GemmArgs &args, const Nothing &);
template std::vector<KernelDescription> get_compatible_kernels<__fp16, __fp16, Nothing>(const GemmArgs &args, const Nothing &);
} // namespace arm_gemm
-#endif // __ARM_FP16_ARGS
+#endif // defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
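
The fp16 table above follows the selection pattern used throughout arm_gemm: each entry pairs a support predicate with a cycle estimator, and the chooser keeps the supported candidate with the lowest estimate. A minimal sketch of that selection loop, assuming simplified stand-in types (the real GemmImplementation/GemmArgs carry far more state):

    // Illustrative sketch only - simplified stand-ins, not the library's real types.
    #include <cstdint>
    #include <functional>
    #include <limits>
    #include <vector>

    struct Args { bool has_fp16; };

    struct Impl {
        const char *name;
        std::function<bool(const Args &)>     is_supported;   // nullptr => always supported
        std::function<uint64_t(const Args &)> cycle_estimate; // nullptr => estimate of 0
    };

    // Keep the supported implementation with the lowest cycle estimate,
    // mirroring how the gemm_*_methods[] tables are consumed.
    const Impl *pick(const std::vector<Impl> &list, const Args &args) {
        const Impl *best = nullptr;
        uint64_t best_cycles = std::numeric_limits<uint64_t>::max();
        for (const auto &impl : list) {
            if (impl.is_supported && !impl.is_supported(args)) continue;
            uint64_t cycles = impl.cycle_estimate ? impl.cycle_estimate(args) : 0;
            if (cycles < best_cycles) { best = &impl; best_cycles = cycles; }
        }
        return best;
    }

    int main() {
        std::vector<Impl> list = {
            { "a64_hgemm_8x24", [](const Args &a) { return a.has_fp16; },
              [](const Args &) { return uint64_t(1000); } },
            { "a64_sgemm_8x12", nullptr, [](const Args &) { return uint64_t(1500); } },
        };
        const Impl *chosen = pick(list, Args{true}); // picks the 1000-cycle entry
        return chosen ? 0 : 1;
    }
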
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index d94814fb4c..0c1d3a387b 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,20 +31,54 @@
#include "gemv_pretransposed.hpp"
#include "kernels/a32_sgemm_8x6.hpp"
-#include "kernels/a64_gemv_fp32_mla_32.hpp"
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#include "kernels/a64_ffhybrid_fp32_mla_6x16.hpp"
+#include "kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp"
+#include "kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16.hpp"
+#include "kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp"
+#include "kernels/a64_ffinterleaved_fp32_mla_8x12.hpp"
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#include "kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp"
+#include "kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp"
+#include "kernels/a64_hybrid_fp32_mla_4x24.hpp"
#include "kernels/a64_hybrid_fp32_mla_6x16.hpp"
#include "kernels/a64_hybrid_fp32_mla_8x4.hpp"
+#include "kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp"
#include "kernels/a64_sgemm_8x12.hpp"
#include "kernels/a64_sgemm_8x6.hpp"
#include "kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp"
#include "kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp"
-#include "kernels/sve_gemv_fp32_mla_8VL.hpp"
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#include "kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp"
+#include "kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp"
+#include "kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp"
+#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#ifdef ARM_COMPUTE_ENABLE_SME2
+#include "kernels/sme2_gemv_fp32_mla_16VL.hpp"
+#include "kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SME2
+
+#include "kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp"
+#include "kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp"
+#include "kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp"
+#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp"
+#include "kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp"
+#include "kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp"
#include "kernels/sve_hybrid_fp32_mla_6x4VL.hpp"
#include "kernels/sve_hybrid_fp32_mla_8x1VL.hpp"
+#include "kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp"
#include "kernels/sve_interleaved_fp32_mla_8x3VL.hpp"
#include "kernels/sve_interleaved_fp32_mmla_8x3VL.hpp"
-#include "kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SVE
namespace arm_gemm {
@@ -59,58 +93,194 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
[](const GemmArgs &args) { return new GemvBatched<float, float>(args); }
},
#ifdef __aarch64__
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_BF16
+// "fast mode" (BF16) kernels
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_interleaved_bf16fp32_mmla_8x12",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, float, float>(args); }
+),
+
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_fp32bf16fp32_mmla_6x16",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_6x16, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_6x16, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_fp32bf16fp32_mmla_4x24",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_4x24, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_4x24, float, float>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_BF16
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+// SME kernels
{
GemmMethod::GEMM_HYBRID,
- "sve_gemv_fp32_mla_8VL",
- [](const GemmArgs &args) { return args._ci->has_sve() && args._Msize==1 && args._nbatches==1 && !args._indirect_input; },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args) { return new GemvPretransposed<cls_sve_gemv_fp32_mla_8VL, float, float>(args); }
+ "sme2_gemv_fp32bf16fp32_dot_16VL",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_sme2() && args._Msize==1 && args._nbatches==1 && !args._indirect_input && !args._accumulate; },
+ nullptr,
+ [](const GemmArgs &args) { return new GemvPretransposed<cls_sme2_gemv_fp32bf16fp32_dot_16VL, float, float>(args); }
},
-#endif
{
GemmMethod::GEMM_HYBRID,
- "a64_gemv_fp32_mla_32",
- [](const GemmArgs &args) { return args._Msize==1 && args._nbatches==1 && !args._indirect_input; },
+ "sme2_gemv_fp32_mla_16VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2() && args._Msize==1 && args._nbatches==1 && !args._indirect_input && !args._accumulate; },
nullptr,
- [](const GemmArgs &args) { return new GemvPretransposed<cls_a64_gemv_fp32_mla_32, float, float>(args); }
+ [](const GemmArgs &args) { return new GemvPretransposed<cls_sme2_gemv_fp32_mla_16VL, float, float>(args); }
},
-
-// MMLA next due to higher throughput (SVE only)
-#if defined(__ARM_FEATURE_SVE) && defined(MMLA_FP32)
+#ifdef ARM_COMPUTE_ENABLE_BF16
{
GemmMethod::GEMM_INTERLEAVED,
- "sve_interleaved_fp32_mmla_8x3VL",
- [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mmla_8x3VL, float, float>(args); }
+ "sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_sme2() && !args._accumulate; },
+ [](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
+ return args._Nsize >= 8*VL || args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
+ [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL, float, float>(args); }
},
-#endif // __ARM_FEATURE_SVE && MMLA_FP32
-
-#ifdef __ARM_FEATURE_SVE
-// SVE smallk / hybrid methods
+#endif // ARM_COMPUTE_ENABLE_BF16
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_fp32_mopa_1VLx4VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2() && !args._accumulate; },
+ [](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
+ return args._Nsize >= 8*VL || args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
+ [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL, float, float>(args); }
+},
+#ifdef ARM_COMPUTE_ENABLE_BF16
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_sme2() && !args._accumulate; },
+ [](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
+ return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
+ [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL, float, float>(args); }
+},
+#endif // ARM_COMPUTE_ENABLE_BF16
{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_fp32_mopa_4VLx1VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2() && !args._accumulate; },
+ [](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
+ return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
+ [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL, float, float>(args); }
+},
+#ifdef ARM_COMPUTE_ENABLE_BF16
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_sme2() && !args._accumulate; },
+ nullptr,
+ [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL, float, float>(args); }
+},
+#endif // ARM_COMPUTE_ENABLE_BF16
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_fp32_mopa_2VLx2VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2() && !args._accumulate; },
+ nullptr,
+ [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL, float, float>(args); }
+},
+#endif // ARM_COMPUTE_ENABLE_SME2
+#ifdef ARM_COMPUTE_ENABLE_BF16
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_bf16fp32_mmla_8x3VL",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
- "sve_smallK_hybrid_fp32_mla_8x1VL",
- [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize <= 24 && !args._indirect_input; },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_fp32_mla_8x1VL, float, float>(args); }
+ "sve_hybrid_fp32bf16fp32_mmla_6x4VL",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_fp32bf16fp32_mmla_4x6VL",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL, float, float>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_BF16
+#ifdef ARM_COMPUTE_ENABLE_SVEF32MM
+// MMLA next due to higher throughput (which is SVE only)
+// Prefer this in all cases, except if fast mode is requested and BF16 is available.
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_fp32_mmla_8x3VL",
+ [](const GemmArgs &args) { return args._ci->has_svef32mm() && (args._Ksize>4); },
+ [](const GemmArgs &args) { return !(args._fast_mode && args._ci->has_bf16()); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mmla_8x3VL, float, float>(args); }
},
+#endif // ARM_COMPUTE_ENABLE_SVEF32MM
+// SVE kernels
{
GemmMethod::GEMM_HYBRID,
"sve_hybrid_fp32_mla_8x1VL",
[](const GemmArgs &args) { return args._ci->has_sve(); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (args._Nsize < 12); },
+ [](const GemmArgs &args) { return (args._Nsize < 12); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_8x1VL, float, float>(args); }
},
-{
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_fp32_mla_6x4VL",
[](const GemmArgs &args) { return args._ci->has_sve(); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32_mla_6x4VL, float, float>::estimate_cycles<float>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_6x4VL, float, float>(args); }
-},
-#endif // __ARM_FEATURE_SVE
+),
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_fp32_mla_8x3VL",
+ [](const GemmArgs &args) { return args._ci->has_sve(); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float>(args); }
+),
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#ifdef ARM_COMPUTE_ENABLE_BF16
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_ffinterleaved_bf16fp32_mmla_8x3VL",
+ KernelWeightFormat::VL2VL_BL64_BF16,
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
+ [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_ffhybrid_fp32bf16fp32_mmla_4x6VL",
+ KernelWeightFormat::VL2VL_BL64_BF16,
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_sve_ffhybrid_fp32bf16fp32_mmla_4x6VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_sve_ffhybrid_fp32bf16fp32_mmla_4x6VL, float, float>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_BF16
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_ffinterleaved_fp32_mla_8x3VL",
+ KernelWeightFormat::VL1VL_BL32,
+ [](const GemmArgs &args) { return args._ci->has_sve(); },
+ [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_sve_ffinterleaved_fp32_mla_8x3VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_sve_ffinterleaved_fp32_mla_8x3VL, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_ffhybrid_fp32_mla_6x4VL",
+ KernelWeightFormat::VL1VL_BL32,
+ [](const GemmArgs &args) { return args._ci->has_sve(); },
+ [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_sve_ffhybrid_fp32_mla_6x4VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_sve_ffhybrid_fp32_mla_6x4VL, float, float>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#endif // ARM_COMPUTE_ENABLE_SVE
// Cortex-A35 specific kernel - use for any problem on A35, and never in any other cases.
{
GemmMethod::GEMM_INTERLEAVED,
@@ -123,14 +293,14 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
{
GemmMethod::GEMM_HYBRID,
"a64_smallK_hybrid_fp32_mla_8x4",
- [](const GemmArgs &args) { return args._Ksize <= 8 && (args._Nsize % 4)==0 && !args._indirect_input; },
+ [](const GemmArgs &args) { return args._Ksize <= 8 && (args._Nsize % 4)==0 && !args._indirect_input && !args._accumulate; },
nullptr,
[](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_fp32_mla_8x4, float, float>(args); }
},
{
GemmMethod::GEMM_HYBRID,
"a64_smallK_hybrid_fp32_mla_6x4",
- [](const GemmArgs &args) { return (args._Ksize > 8 && args._Ksize <= 16) && (args._Nsize % 4)==0 && !args._indirect_input; },
+ [](const GemmArgs &args) { return (args._Ksize > 8 && args._Ksize <= 16) && (args._Nsize % 4)==0 && !args._indirect_input && !args._accumulate; },
nullptr,
[](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_fp32_mla_6x4, float, float>(args); }
},
@@ -143,27 +313,70 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
},
GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_fp32_mla_4x24",
+ nullptr,
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_4x24, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_4x24, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
"a64_hybrid_fp32_mla_6x16",
nullptr,
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>::estimate_cycles(args, cls_a64_hybrid_fp32_mla_6x16::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>::estimate_cycles<float>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>(args); }
),
-#ifdef __ARM_FEATURE_SVE
-{
- GemmMethod::GEMM_INTERLEAVED,
- "sve_interleaved_fp32_mla_8x3VL",
- [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float>(args); }
-},
-#endif // __ARM_FEATURE_SVE
GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_sgemm_8x12",
nullptr,
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, float, float>::estimate_cycles(args, cls_a64_sgemm_8x12::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, float, float>::estimate_cycles<float>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, float, float>(args); }
),
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+#ifdef ARM_COMPUTE_ENABLE_BF16
+// "fast mode" (BF16) kernels
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_ffinterleaved_bf16fp32_mmla_8x12",
+ KernelWeightFormat::VL256_BL64_BF16,
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_ffhybrid_fp32bf16fp32_mmla_4x24",
+ KernelWeightFormat::VL256_BL64_BF16,
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32bf16fp32_mmla_4x24, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32bf16fp32_mmla_4x24, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_ffhybrid_fp32bf16fp32_mmla_6x16",
+ KernelWeightFormat::VL256_BL64_BF16,
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32bf16fp32_mmla_6x16, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32bf16fp32_mmla_6x16, float, float>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_BF16
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_ffinterleaved_fp32_mla_8x12",
+ KernelWeightFormat::VL128_BL32,
+ nullptr,
+ [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_fp32_mla_8x12, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_fp32_mla_8x12, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_ffhybrid_fp32_mla_6x16",
+ KernelWeightFormat::VL128_BL32,
+ nullptr,
+ [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32_mla_6x16, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32_mla_6x16, float, float>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
#endif // __aarch64__
#ifdef __arm__
@@ -192,6 +405,7 @@ const GemmImplementation<float, float> *gemm_implementation_list<float, float>()
/* Explicitly instantiate the external functions for these types. */
template UniqueGemmCommon<float, float> gemm<float, float, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<float, float, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
template KernelDescription get_gemm_method<float, float, Nothing>(const GemmArgs &args, const Nothing &);
template std::vector<KernelDescription> get_compatible_kernels<float, float, Nothing> (const GemmArgs &args, const Nothing &);
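
The SME2 MOPA entries above steer between three accumulator tile shapes using M and N against the vector length (the 2VLx2VL variants carry no recommendation predicate and act as the fallback). A standalone sketch of just the shape choice, ignoring the fast-mode/accumulate gating, where VL is assumed to be sme::get_vector_length<float>():

    #include <cstdio>

    enum class MopaShape { OneVLxFourVL, FourVLxOneVL, TwoVLxTwoVL };

    // Sketch of the recommendation predicates above, in table order.
    MopaShape choose_mopa_shape(unsigned M, unsigned N, unsigned VL) {
        // Wide-N or short-M problems suit the 1VLx4VL tile.
        if (N >= 8 * VL || M <= VL || (2 * VL < M && M <= 3 * VL))
            return MopaShape::OneVLxFourVL;
        // Narrow-N problems suit the 4VLx1VL tile.
        if (N <= VL || (2 * VL < N && N <= 3 * VL))
            return MopaShape::FourVLxOneVL;
        // Otherwise the square 2VLx2VL tile is taken.
        return MopaShape::TwoVLxTwoVL;
    }

    int main() {
        printf("%d\n", static_cast<int>(choose_mopa_shape(128, 2048, 16))); // 0 (1VLx4VL)
    }
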
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index d702cffce1..a6c9677305 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,7 +74,7 @@ class GemmHybrid : public GemmCommon<To, Tr> {
}
if (args._cfg && args._cfg->inner_block_size) {
- return args._cfg->inner_block_size;
+ return roundup(args._cfg->inner_block_size, strategy::k_unroll());
}
// Target block size (512 for FP32, scaling for other types). Don't block until size reaches 1.5X this.
@@ -97,7 +97,13 @@ class GemmHybrid : public GemmCommon<To, Tr> {
// single block.
static unsigned int compute_n_block(const GemmArgs &args) {
if (args._cfg && args._cfg->outer_block_size) {
- return args._cfg->outer_block_size;
+ unsigned int n_block = args._cfg->outer_block_size;
+
+ // Needs to be a multiple of the kernel output width (at least one tile).
+ n_block /= strategy::out_width();
+ n_block = std::max(n_block, 1u) * strategy::out_width();
+
+ return n_block;
}
if (args._Nsize <= 64) {
@@ -215,7 +221,9 @@ public:
return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
}
- void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override {
+ assert(!transposed);
+
Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
_B_transposed = buffer;
strategy strat(_ci);
@@ -231,7 +239,7 @@ public:
const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;
strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
- x0, xmax, k0, kmax);
+ x0, xmax, k0, kmax, false);
buffer += size;
}
@@ -264,6 +272,17 @@ public:
return total_cycles;
}
+
+ GemmConfig get_config() override {
+ GemmConfig c;
+
+ c.method = GemmMethod::GEMM_HYBRID;
+ c.inner_block_size = _k_block;
+ c.outer_block_size = _n_block;
+ c.filter = get_type_name<strategy>();
+
+ return c;
+ }
};
} // namespace arm_gemm
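
The compute_n_block() change above stops trusting a caller-supplied outer_block_size verbatim: it is snapped down to a whole number of kernel output tiles, with a floor of one tile. A minimal sketch of the rounding, with out_width passed in rather than taken from the strategy:

    #include <algorithm>
    #include <cassert>

    // Round a requested outer block DOWN to whole output tiles, min one tile.
    unsigned int snap_n_block(unsigned int requested, unsigned int out_width) {
        unsigned int n_block = requested / out_width;  // whole tiles (may be 0)
        return std::max(n_block, 1u) * out_width;      // never below one tile
    }

    int main() {
        assert(snap_n_block(100, 16) == 96); // truncated to a tile multiple
        assert(snap_n_block(10, 16) == 16);  // clamped up to one tile
        return 0;
    }
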
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index 41fecc6bec..0cc4d4f3d9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,9 @@
*/
#pragma once
+#if !defined(_WIN64) && !defined(__OpenBSD__)
#include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
#include <algorithm>
#include <cassert>
@@ -31,6 +33,7 @@
#include "arm_gemm.hpp"
#include "bias_adder.hpp"
#include "convolver.hpp"
+#include "kernel_weight_format.hpp"
#include "ndrange.hpp"
#include "performance_parameters.hpp"
#include "transform.hpp"
@@ -52,34 +55,34 @@ namespace {
// We need to invoke the kernel differently for quantizing and non-quantizing cases, so here is a shim class to do
// that.
-template<typename OutputStage, bool SeparateQuantize = false>
+template<typename OutputStage, bool SeparateQuantize, bool FixedFormat>
class run_hybrid_kernel {
public:
- template<typename strategy, typename To, typename Tr>
- static void run (
+ template<typename strategy, typename Tlo, typename Tro, typename Tr>
+ static inline void run (
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
const OutputStage &os, const int32_t *col_bias, unsigned int n_0 );
};
template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Nothing, false>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Nothing, false, false>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
const Nothing &, const int32_t *, unsigned int) {
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
UNUSED(kern_k);
- /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
+ /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
* a partial block and pad the bias for that block. */
if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
/* Break N into "N_bulk" (a multiple of output width) and "N_remainder" */
@@ -112,13 +115,61 @@ void run_hybrid_kernel<Nothing, false>::run(
}
template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Requantize32, false>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Nothing, false, true>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const Nothing &, const int32_t *, unsigned int) {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
+#endif
+ UNUSED(kern_k);
+
+ /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
+ * a partial block and pad the bias for that block. */
+ if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
+ /* Break N into "N_bulk" (a multiple of output width) and "N_remainder" */
+ unsigned int N_remainder = N % strategy::out_width();
+ unsigned int N_bulk = N - N_remainder;
+
+ /* Output argument to be used for the tail */
+ IndirectOutputArg<Tr> offset_output = output_arg;
+
+ /* If there is a "bulk" to be processed, handle that and update "offset_output" appropriately. */
+ if (N_bulk > 0) {
+ strat.kernel(num_strings, string_ptr, A_arg, M, N_bulk, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate);
+
+ if (output_arg.is_indirect) {
+ offset_output = IndirectOutputArg<Tr>(output_arg.indirect.ptr, output_arg.indirect.offset + N_bulk);
+ } else {
+ offset_output = IndirectOutputArg<Tr>(output_arg.direct.base + N_bulk, output_arg.direct.stride);
+ }
+ }
+
+ /* Pad the bias buffer for the remainder */
+ Tr *bias_pad_buffer = reinterpret_cast<Tr *>(alloca(strategy::out_width() * sizeof(Tr)));
+ memcpy(bias_pad_buffer, bias_ptr + N_bulk, N_remainder * sizeof(Tr));
+
+ /* Process the remainder, offsetting the B pointer as needed. */
+ strat.kernel(num_strings, string_ptr, A_arg, M, N_remainder,
+ b_ptr + (N_bulk / strategy::stripe_width()) * b_stride, b_stride, offset_output,
+ bias_pad_buffer, act, accumulate);
+ } else {
+ strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate);
+ }
+}
+
+template<>
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Requantize32, false, false>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
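
The fixed-format shim added above reuses the partial-block trick from the existing path: split N into a bulk part (a multiple of out_width) and a remainder, run the kernel over the bulk, then run the tail with a stack-allocated, padded copy of the bias so the kernel can always read a full out_width of bias values. A compact sketch of just the split-and-pad step, assuming a float bias and an explicit tile width:

    #include <cstring>
    #include <vector>

    // Sketch: pad the ragged tail of the bias to a full out_width-wide block,
    // as the partial-block path above does (it uses alloca; a vector is safer here).
    void pad_bias_tail(const float *bias, unsigned N, unsigned out_width,
                       std::vector<float> &pad /* out: out_width entries */) {
        unsigned N_remainder = N % out_width; // columns in the ragged tail
        unsigned N_bulk      = N - N_remainder;

        pad.assign(out_width, 0.0f); // zero-fill the lanes beyond the tail
        std::memcpy(pad.data(), bias + N_bulk, N_remainder * sizeof(float));
        // The kernel would run once over [0, N_bulk) with 'bias' and once over
        // the remainder with 'pad.data()'.
    }

    int main() {
        std::vector<float> pad;
        const float bias[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
        pad_bias_tail(bias, 10, 4, pad); // N_bulk = 8, pad = {9, 10, 0, 0}
        return pad[0] == 9.0f ? 0 : 1;
    }
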
@@ -129,13 +180,13 @@ void run_hybrid_kernel<Requantize32, false>::run(
}
template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Requantize32, true>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Requantize32, true, false>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
UNUSED(kern_k);
// On this route we will only process one kernel height at a time and will make sure this happens in the driver loop.
@@ -178,12 +229,41 @@ void run_hybrid_kernel<Requantize32, true>::run(
}
}
+template<typename strategy, bool FixedFormat>
+struct stripe_width {
+ static unsigned int get() {
+ return strategy::stripe_width();
+ }
+};
+
+template<typename strategy>
+struct stripe_width<strategy, false> {
+ static unsigned int get() {
+ return 0;
+ }
+};
+
+template<typename strategy, bool FixedFormat>
+struct kernel_weight_format {
+ static KernelWeightFormat get() {
+ return strategy::kernel_weight_format();
+ }
+};
+
+template<typename strategy>
+struct kernel_weight_format<strategy, false> {
+ static KernelWeightFormat get() {
+ return KernelWeightFormat::NON_FIXED;
+ }
+};
+
} // anonymous namespace
// Implementation of the GemmCommon abstract class.
-template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false>
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool SeparateQuantize=false, bool FixedFormat=false>
class GemmHybridIndirect : public GemmCommon<To, Tr> {
- typedef typename strategy::operand_type Toi;
+ typedef typename strategy::lhs_operand_type Tloi;
+ typedef typename strategy::rhs_operand_type Troi;
typedef typename strategy::result_type Tri;
GemmArgs _args;
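
The stripe_width and kernel_weight_format helpers above use partial specialization so that the FixedFormat=false instantiation never names strategy members which only fixed-format strategies provide. The same guard pattern in isolation, with hypothetical strategy types:

    #include <cstdio>

    // Hypothetical fixed-format strategy: provides stripe_width().
    struct FFStrategy    { static unsigned stripe_width() { return 4; } };
    // Hypothetical conventional strategy: has no stripe_width() member at all.
    struct PlainStrategy {};

    // Primary template: only instantiated with FixedFormat == true.
    template <typename Strategy, bool FixedFormat>
    struct stripe_width {
        static unsigned get() { return Strategy::stripe_width(); }
    };

    // Specialization for the non-fixed-format case: never touches the member,
    // so PlainStrategy compiles even though it lacks stripe_width().
    template <typename Strategy>
    struct stripe_width<Strategy, false> {
        static unsigned get() { return 0; }
    };

    int main() {
        printf("%u %u\n", stripe_width<FFStrategy, true>::get(),
                          stripe_width<PlainStrategy, false>::get()); // 4 0
    }
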
@@ -201,7 +281,7 @@ class GemmHybridIndirect : public GemmCommon<To, Tr> {
const unsigned int _Mround;
/* Pretransposed buffer. */
- const Toi *_B_transposed=nullptr;
+ const Troi *_B_transposed=nullptr;
/* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
const To * const * const * _indirect_buf = nullptr;
@@ -233,7 +313,7 @@ class GemmHybridIndirect : public GemmCommon<To, Tr> {
}
if (args._cfg && args._cfg->inner_block_size) {
- return args._cfg->inner_block_size;
+ return roundup(args._cfg->inner_block_size, strategy::k_unroll());
}
// Experimental data suggests an optimal block size of 512 for FP32 (scaling accordingly for other
@@ -356,11 +436,11 @@ public:
// In convolution mode, we need input pointers.
if (_convolver) {
- in_row_ptrs.resize(strategy::out_height() * _args._Ksections, nullptr);
- in_row_strings.resize(_args._Ksections, nullptr);
+ in_row_ptrs = std::vector<const To *>(strategy::out_height() * _args._Ksections, nullptr);
+ in_row_strings = std::vector<const To * const *>(_args._Ksections, nullptr);
for (unsigned int i=0; i<_args._Ksections; i++) {
- in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]);
+ in_row_strings[i] = &(in_row_ptrs.data()[i * strategy::out_height()]);
}
}
@@ -370,8 +450,8 @@ public:
}
/* Make sure we've been set up correctly. */
- assert(_B_transposed);
- static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+ assert(FixedFormat || _B_transposed);
+ static_assert(std::is_same<To, Tloi>::value, "gemm_native: Operand types must be the same.");
// static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
/* For now, each work item implies all the K for a given output
@@ -422,27 +502,35 @@ public:
const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize);
const unsigned int multi = p.dim(3);
- const Toi *b_panel = _B_transposed +
- (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
- (k0 * roundup(_args._Nsize, strategy::out_width())) +
- (n0 * kern_k);
+ const Troi *b_panel;
+ if (FixedFormat) {
+ b_panel = reinterpret_cast<const Troi *>(this->_Bptr) +
+ (multi * this->_B_multi_stride) +
+ ((n0 / stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
+ (k0 * stripe_width<strategy, FixedFormat>::get());
+ } else {
+ b_panel = _B_transposed +
+ (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
+ (k0 * roundup(_args._Nsize, strategy::out_width())) +
+ (n0 * kern_k);
+ }
- IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);
+ IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
#endif
if (_indirect_buf) {
- run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+ run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
strat, sections, string_lengths.data(),
IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset),
- (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
(this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
last_pass ? _args._act : Activation(),
- !first_pass,
+ !first_pass || _args._accumulate,
// Quantization parameters
_os, _col_bias+(multi * _args._Nsize), n0);
} else if (_convolver) {
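
In the FixedFormat branch above, B is the caller's weight buffer rather than an internally pretransposed copy, so the panel address is computed from the multi stride, _ldb (here the element stride between column stripes) and the stripe width. A worked sketch of that offset arithmetic with hypothetical small numbers:

    #include <cassert>
    #include <cstddef>

    // Sketch of the fixed-format B panel offset: the buffer is laid out as
    // stripes of 'stripe_width' columns, each stripe 'ldb' elements long.
    size_t ff_b_panel_offset(unsigned multi, size_t B_multi_stride,
                             unsigned n0, unsigned k0,
                             size_t ldb, unsigned stripe_width) {
        return multi * B_multi_stride      // select the GEMM "multi"
             + (n0 / stripe_width) * ldb   // skip whole column stripes
             + k0 * stripe_width;          // descend k0 rows into the stripe
    }

    int main() {
        // stripe_width 4, ldb 256 (stripe covers K = 64): panel for n0=8, k0=16.
        assert(ff_b_panel_offset(0, 0, 8, 16, 256, 4) == 2 * 256 + 16 * 4);
        return 0;
    }
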
@@ -466,32 +554,32 @@ public:
}
assert(pos == sections);
- run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+ run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
strat, sections, string_lengths.data(),
IndirectInputArg<To>(in_row_strings.data(), 0, first_offset),
- (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
(this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
last_pass ? _args._act : Activation(),
- !first_pass,
+ !first_pass || _args._accumulate,
// Quantization parameters
_os, _col_bias+(multi * _args._Nsize), n0);
} else {
// Length to process. This needs to exclude padding, but 'kmax' potentially includes it.
const unsigned int len = (std::min(_args._Ksize, kmax) - k0);
- run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+ run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
strat, 1, &len,
IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
- (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
(this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
last_pass ? _args._act : Activation(),
- !first_pass,
+ !first_pass || _args._accumulate,
// Quantization parameters
_os, _col_bias+(multi * _args._Nsize), n0);
}
@@ -501,16 +589,20 @@ public:
// Interface implementation - pretransposed
bool B_is_pretransposed() const override {
- return true;
+ return (FixedFormat == false);
}
bool B_pretranspose_required() const override {
- return (_B_transposed==nullptr);
+ return (FixedFormat == false) && (_B_transposed==nullptr);
}
size_t get_B_pretransposed_array_size() const override {
+ if (FixedFormat) {
+ return 0;
+ }
+
// Start with actual pretransposed buffer...
- size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Toi);
+ size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Troi);
// Space for result row pointers (not strictly needed any more but retained for indirect output testing)
size += _args._Msize * _args._nbatches * _args._nmulti * sizeof(const Tr *);
@@ -522,7 +614,11 @@ public:
return size;
}
- void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ size_t get_B_pretranspose_window_size() const override {
+ return _args._nmulti * iceildiv(_args._Nsize, strategy::out_width());
+ }
+
+ void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
if (std::is_same<OutputStage, Requantize32>::value) {
_col_bias = reinterpret_cast<int32_t *>(in_buffer);
@@ -533,62 +629,115 @@ public:
compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize * _args._Ksections, B + (i * B_multi_stride), ldb, _col_bias + (i * _args._Nsize), _args._Ksize * _args._Ksections, i, 0);
}
}
+ }
+
+ bool B_pretranspose_supports_transpose() const override {
+ strategy strat(_args._ci);
+ return strat.transforms.PrepareB_supports_transpose();
+ }
+
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override {
+ pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, transposed, 0, get_B_pretranspose_window_size());
+ }
+
+ void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed, size_t start, size_t end) override {
+ if (end >= get_B_pretranspose_window_size()) {
+ requantize_bias(in_buffer, B, ldb, B_multi_stride);
+ }
// Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
- Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
- _B_transposed = buffer;
+ Troi *buffer_base = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
+ _B_transposed = buffer_base;
strategy strat(_args._ci);
+ size_t work_per_multi = iceildiv(_args._Nsize, strategy::out_width());
+
+ for (unsigned int multi=(start / work_per_multi); multi<_args._nmulti; multi++) {
+ // Work out which part of the window space this multi occupies,
+ // skip to the next multi or exit as needed.
+ size_t wk_start = multi * work_per_multi;
+ size_t wk_end = (multi + 1) * work_per_multi;
+
+ assert(wk_end > start);
+
+ if (wk_start >= end) {
+ break;
+ }
- for (unsigned int multi=0; multi<_args._nmulti; multi++) {
for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
const unsigned int kmax=std::min(k0 + _k_block, _Ktotal);
/* Figure out the size of each block. */
unsigned int k_size = kmax - k0;
- // We need to insert padding at the end of each K section.
- // The computation needed is a little delicate - the coordinates from the block walker are expressed in
- // terms of the full, padded, _Ktotal.
- // But we need to transform each section with reference to the original, unpadded, input, letting the
- // transform pad each section as needed.
-
- // This is needed for computations below.
- const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());
-
- // The expected output format is also an entire <out_width> columns interleaved, then the next set of
- // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
- // a time.
- for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
- unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
-
- // Track where we are and how much work is left.
- unsigned int kpos = k0;
- unsigned int kleft = k_size;
-
- while (kleft) {
- // Which section are we in? Based on the rounded-up section size.
- unsigned int k_section_base = kpos / rounded_section_size;
- // How far into the section are we?
- unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
-
- // We will either copy the rest of this section, or to the end of the requested length.
- unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);
-
- strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
- x0, xmax,
- (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
- (k_section_base * _args._Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
+ // Correct the N range and buffer base if we are not processing the whole block.
+ size_t n_start = 0;
+ size_t n_end = _args._Nsize;
- // We need to modify our position based on the ROUNDED version of what we just did.
- unsigned int padded_length = roundup(k_length, strategy::k_unroll());
+ // If we are not doing the first columns, update the buffer write position and starting N value.
+ if (start > wk_start) {
+ n_start = (start - wk_start) * strategy::out_width();
+ }
- buffer += strategy::out_width() * padded_length;
+ // If we are not doing the last items, update the final N value.
+ if (end < wk_end) {
+ n_end = (end - wk_start) * strategy::out_width();
+ }
- kpos += padded_length;
- kleft -= padded_length;
+ // Set the buffer pointer
+ Troi *buffer = buffer_base +
+ (roundup(_args._Nsize, strategy::out_width()) * (multi * _Ktotal + k0)) +
+ (n_start * roundup(k_size, strategy::k_unroll()));
+
+ if (_args._Ksections > 1) {
+ // We need to insert padding at the end of each K section.
+ // The computation needed is a little delicate - the k0/kmax coordinates are expressed in
+ // terms of the full, padded, _Ktotal.
+ // But we need to transform each section with reference to the original, unpadded, input, letting the
+ // transform pad each section as needed.
+
+ // This is needed for computations below.
+ const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());
+
+ // The expected output format is also an entire <out_width> columns interleaved, then the next set of
+ // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
+ // a time.
+ for (unsigned int x0 = n_start; x0 < n_end; x0 += strategy::out_width()) {
+ unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
+
+ // Track where we are and how much work is left.
+ unsigned int kpos = k0;
+ unsigned int kleft = k_size;
+
+ while (kleft) {
+ // Which section are we in? Based on the rounded-up section size.
+ unsigned int k_section_base = kpos / rounded_section_size;
+ // How far into the section are we?
+ unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
+
+ // We will either copy the rest of this section, or to the end of the requested length.
+ unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);
+
+ strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
+ x0, xmax,
+ (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
+ (k_section_base * _args._Ksize) + k_offset + k_length, // K end point - starting point plus length computed above.
+ transposed);
+
+ // We need to modify our position based on the ROUNDED version of what we just did.
+ unsigned int padded_length = roundup(k_length, strategy::k_unroll());
+
+ buffer += strategy::out_width() * padded_length;
+
+ kpos += padded_length;
+ kleft -= padded_length;
+ }
}
+ } else {
+ // In the single K section case, can process the whole lot in one go.
+ strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
+ n_start, n_end, k0, std::min(kmax, _args._Ksize), transposed);
}
}
}
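
get_B_pretranspose_window_size() above advertises _nmulti * iceildiv(N, out_width) units of work, and pretranspose_B_array_part() maps a [start, end) slice of that window back onto a multi index plus an N sub-range, so callers can pretranspose B in parallel chunks. A sketch of the mapping arithmetic for a slice that stays within one multi:

    #include <cassert>
    #include <cstddef>

    struct Slice { unsigned multi, n_start, n_end; };

    // Map one window range onto (multi, N range), as the loop above does.
    Slice map_window(size_t start, size_t end, unsigned Nsize, unsigned out_width) {
        size_t work_per_multi = (Nsize + out_width - 1) / out_width; // iceildiv
        unsigned multi  = static_cast<unsigned>(start / work_per_multi);
        size_t wk_start = multi * work_per_multi;
        size_t wk_end   = wk_start + work_per_multi;

        unsigned n_start = (start > wk_start) ? (start - wk_start) * out_width : 0;
        unsigned n_end   = (end < wk_end)     ? (end - wk_start) * out_width   : Nsize;
        return { multi, n_start, n_end };
    }

    int main() {
        // N=100, out_width=16 -> 7 units per multi; units [9,12) sit in multi 1.
        Slice s = map_window(9, 12, 100, 16);
        assert(s.multi == 1 && s.n_start == 32 && s.n_end == 80);
        return 0;
    }
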
@@ -597,12 +746,17 @@ public:
void set_pretransposed_B_data(void *in_buffer) override {
// Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
- _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+ _B_transposed = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
_col_bias = reinterpret_cast<int32_t *>(in_buffer);
}
- // Estimate cycles for given problem given provided parameters
- static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params, const OutputStage &os = {} ) {
+ // Estimate cycles for a given problem using the provided parameters.
+ // "perf_type" is a type to pass along to get_performance_parameters to get the right set of performance
+ // parameters - it's arbitrary but usually either the input or output type.
+ template <typename perf_type>
+ static uint64_t estimate_cycles(const GemmArgs &args, const OutputStage &os = {}) {
+ const PerformanceParameters params = strategy::template get_performance_parameters<perf_type>(args._ci);
+
// Note: Current hybrid kernels don't actually round up height (they
// have paths for each possible height). Might need to make this
// configurable in future.
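
estimate_cycles() is now a template over perf_type, which it forwards to the strategy's get_performance_parameters() so the estimate is based on the tuned numbers for that data type rather than a caller-supplied PerformanceParameters. A toy sketch of the shape of that call, with a fully hypothetical strategy and placeholder values:

    #include <cstdint>

    struct PerformanceParameters { double kernel_macs_cycle; };

    // Hypothetical strategy: returns a parameter set chosen by perf type.
    struct Strategy {
        template <typename PerfType>
        static PerformanceParameters get_performance_parameters(const void * /*ci*/) {
            // Placeholder values for illustration only.
            return { sizeof(PerfType) >= 4 ? 7.5 : 15.0 };
        }
    };

    template <typename PerfType>
    uint64_t estimate_cycles(uint64_t M, uint64_t N, uint64_t K) {
        PerformanceParameters p = Strategy::get_performance_parameters<PerfType>(nullptr);
        return static_cast<uint64_t>((M * N * K) / p.kernel_macs_cycle);
    }

    int main() {
        return estimate_cycles<float>(64, 64, 64) > 0 ? 0 : 1;
    }
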
@@ -666,8 +820,23 @@ public:
assert(parms.input_channels == _args._Ksize);
_convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
}
+
+ GemmConfig get_config() override {
+ GemmConfig c;
+
+ c.method = GemmMethod::GEMM_HYBRID;
+ c.inner_block_size = _k_block;
+ c.outer_block_size = _n_block;
+ c.filter = get_type_name<strategy>();
+ c.weight_format = get_weight_format(kernel_weight_format<strategy, FixedFormat>::get(), sizeof(To));
+
+ return c;
+ }
};
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
+using GemmHybridIndirectFixedFormat = GemmHybridIndirect<strategy, To, Tr, OutputStage, false, true>;
+
} // namespace arm_gemm
#ifdef __I_DEFINED_UNUSED
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
index e48d9b9a07..f12efe4282 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,11 +81,42 @@ class GemmHybridQuantized : public GemmCommon<To, Tr> {
static unsigned int compute_k_block(const GemmArgs &args) {
// We don't support K blocks as we only temporarily store 32 bit results.
return args._Ksize;
+
+ if (args._cfg && args._cfg->inner_block_size) {
+ return args._cfg->inner_block_size;
+ }
+
+ const unsigned int L1_size = args._ci->get_L1_cache_size();
+
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+ // Needs to be a multiple of the K unroll level (at least one unroll).
+ k_block /= strategy::k_unroll();
+ k_block = std::max(k_block, 1U) * strategy::k_unroll();
+
+ // Now tune to presented problem size; this is how many blocks we need.
+ unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
+
+ // So divide the space equally into that many blocks.
+ k_block = iceildiv(args._Ksize, numk_blocks);
+
+ // And round UP to the K unroll level required.
+ k_block = roundup(k_block, strategy::k_unroll());
+
+ return k_block;
}
static unsigned int compute_n_block(const GemmArgs &args) {
if (args._cfg && args._cfg->outer_block_size) {
- return args._cfg->outer_block_size;
+ unsigned int n_block = args._cfg->outer_block_size;
+
+ // Needs to be a multiple of the kernel output width (at least one tile).
+ n_block /= strategy::out_width();
+ n_block = std::max(n_block, 1u) * strategy::out_width();
+
+ return n_block;
}
const unsigned int k_block = compute_k_block(args);
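
The sizing logic added above sits below the unconditional "return args._Ksize;", so it is bypassed in this class, but it is the same L1-based heuristic the other hybrid GEMMs use: fit the larger operand tile into half the L1 cache, round to the K unroll, then re-divide K evenly over the resulting block count. A worked numeric sketch with the strategy constants passed in explicitly:

    #include <algorithm>
    #include <cassert>

    unsigned compute_k_block(unsigned Ksize, unsigned L1_size, unsigned elem_size,
                             unsigned out_width, unsigned out_height, unsigned k_unroll) {
        auto iceildiv = [](unsigned a, unsigned b) { return (a + b - 1) / b; };
        auto roundup  = [](unsigned a, unsigned b) { return ((a + b - 1) / b) * b; };

        // Fit the larger tile dimension into half the L1 cache.
        unsigned k_block = (L1_size / 2) / (elem_size * std::max(out_width, out_height));
        k_block = std::max(k_block / k_unroll, 1u) * k_unroll; // multiple of k_unroll

        unsigned blocks = iceildiv(Ksize, k_block); // blocks needed at that size
        k_block = iceildiv(Ksize, blocks);          // spread K evenly over them
        return roundup(k_block, k_unroll);          // round UP to the unroll
    }

    int main() {
        // 32KB L1, fp32, 12x8 tile, k_unroll 4, K=1000: cap 340 -> 3 blocks of
        // ceil(1000/3) = 334, rounded up to 336.
        assert(compute_k_block(1000, 32768, 4, 12, 8, 4) == 336);
        return 0;
    }
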
@@ -238,12 +269,18 @@ public:
return get_col_sum_size() + (roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi));
}
- void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
col_bias = reinterpret_cast<int32_t *>(in_buffer);
for (unsigned int i=0; i<_nmulti; i++) {
compute_col_sums(_qp, _Nsize, _Ksize, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize, i, 0);
}
+ }
+
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override {
+ assert(!transposed);
+
+ requantize_bias(in_buffer, B, ldb, B_multi_stride);
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
@@ -261,7 +298,7 @@ public:
const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;
strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
- x0, xmax, k0, kmax);
+ x0, xmax, k0, kmax, false);
buffer += size;
}
@@ -279,6 +316,17 @@ public:
_qp.bias = bias;
_qp.bias_multi_stride = bias_multi_stride;
}
+
+ GemmConfig get_config() override {
+ GemmConfig c;
+
+ c.method = GemmMethod::GEMM_HYBRID;
+ c.inner_block_size = _k_block;
+ c.outer_block_size = _n_block;
+ c.filter = get_type_name<strategy>();
+
+ return c;
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp
index 7376b5ffe3..820b54202a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019,2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -219,12 +219,16 @@ public:
return get_col_sum_size() + (roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi));
}
- void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
col_bias = reinterpret_cast<int32_t *>(in_buffer);
for (unsigned int i=0; i<_nmulti; i++) {
compute_col_sums(_qp, _Nsize, _Ksize, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize, i, 0);
}
+ }
+
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ requantize_bias(in_buffer, B, ldb, B_multi_stride);
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index d3857a50e7..5e77df7d4a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#include "arm_gemm.hpp"
+#include "kernel_weight_format.hpp"
+
#include <cstdint>
#include <functional>
@@ -37,15 +39,36 @@ template<typename Top, typename Tret, class OutputStage = Nothing>
struct GemmImplementation {
const GemmMethod method;
const char * name;
+ const KernelWeightFormat kernel_weight_format = KernelWeightFormat::NON_FIXED;
std::function<bool(const GemmArgs &, const OutputStage &)> is_supported = {};
std::function<uint64_t(const GemmArgs &, const OutputStage &)> cycle_estimate = {};
std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate = {};
bool do_is_supported(const GemmArgs &args, const OutputStage &os) const {
- if (is_supported != nullptr) {
- return is_supported(args, os);
+ // Check supplied is_supported() function first.
+ if (is_supported != nullptr && !is_supported(args, os)) {
+ return false;
+ }
+
+ // Check weight format is appropriate.
+ if (args._fixed_format == false) {
+ // Can't return a fixed format kernel if we weren't asked for one.
+ return (kernel_weight_format == KernelWeightFormat::NON_FIXED);
} else {
- return true;
+ // Fixed format kernel requested: if this is a non-fixed format kernel we can't use it.
+ if (kernel_weight_format == KernelWeightFormat::NON_FIXED) {
+ return false;
+ }
+
+ // If there's no config, or the config says ANY then this one is OK.
+ if (!args._cfg || args._cfg->weight_format == WeightFormat::ANY) {
+ return true;
+ }
+
+ // If we get here it means there is a config and it specifies a format. Check it matches this kernel.
+ // NOTE: this will execute SVE instructions if it's an SVE kernel, so it's important that is_supported()
+ // was called above first.
+ return (args._cfg->weight_format == get_weight_format(kernel_weight_format, sizeof(Top)));
}
}
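The gating above reduces to a small decision table: non-fixed-format requests only match NON_FIXED kernels; fixed-format requests skip NON_FIXED kernels and then accept either an absent/ANY config or an exact match against get_weight_format(). A condensed, standalone sketch of the same logic with simplified stand-in types (OHWIo8 is just an example value):

    #include <optional>

    enum class KWF { NON_FIXED, FIXED };   // simplified stand-ins
    enum class WF  { ANY, OHWIo8 };

    bool matches(bool fixed_requested, KWF kernel, std::optional<WF> cfg, WF kernel_as_wf) {
        if (!fixed_requested) return kernel == KWF::NON_FIXED;
        if (kernel == KWF::NON_FIXED) return false;   // fixed requested, kernel can't do it
        if (!cfg || *cfg == WF::ANY) return true;     // no specific format demanded
        return *cfg == kernel_as_wf;                  // exact format requested
    }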
@@ -84,6 +107,13 @@ struct GemmImplementation {
method(m), name(n), is_supported(is_supported),
cycle_estimate( [is_recommended](const GemmArgs &args, const OutputStage &os) { return (is_recommended == nullptr) ? 0 : (is_recommended(args, os) ? 0 : UINT64_MAX); } ),
instantiate(instantiate) { }
+
+ GemmImplementation(GemmMethod m, const char *n, KernelWeightFormat kwf,
+ std::function<bool(const GemmArgs &, const OutputStage &)> is_supported, std::function<bool(const GemmArgs &, const OutputStage &)> is_recommended,
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate) :
+ method(m), name(n), kernel_weight_format(kwf), is_supported(is_supported),
+ cycle_estimate( [is_recommended](const GemmArgs &args, const OutputStage &os) { return (is_recommended == nullptr) ? 0 : (is_recommended(args, os) ? 0 : UINT64_MAX); } ),
+ instantiate(instantiate) { }
};
/* Slightly different version of above for straightforward GEMMs with no
@@ -93,15 +123,36 @@ template<typename Top, typename Tret>
struct GemmImplementation<Top, Tret, Nothing> {
const GemmMethod method;
const char * name;
+ const KernelWeightFormat kernel_weight_format = KernelWeightFormat::NON_FIXED;
std::function<bool(const GemmArgs &)> is_supported = {};
std::function<uint64_t(const GemmArgs &)> cycle_estimate = {};
std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate = {};
bool do_is_supported(const GemmArgs &args, const Nothing &) const {
- if (is_supported != nullptr) {
- return is_supported(args);
+ // Check supplied is_supported() function first.
+ if (is_supported != nullptr && !is_supported(args)) {
+ return false;
+ }
+
+ // Check weight format is appropriate.
+ if (args._fixed_format == false) {
+ // Can't return a fixed format kernel if we weren't asked for one.
+ return (kernel_weight_format == KernelWeightFormat::NON_FIXED);
} else {
- return true;
+ // Fixed format kernel requested: if this is a non-fixed format kernel we can't use it.
+ if (kernel_weight_format == KernelWeightFormat::NON_FIXED) {
+ return false;
+ }
+
+ // If there's no config, or the config says ANY then this one is OK.
+ if (!args._cfg || args._cfg->weight_format == WeightFormat::ANY) {
+ return true;
+ }
+
+            // If we get here, there is a config and it specifies a format. Check that it matches this kernel.
+ // NOTE: this will execute SVE instructions if it's an SVE kernel, so it's important that is_supported()
+ // was called above first.
+ return (args._cfg->weight_format == get_weight_format(kernel_weight_format, sizeof(Top)));
}
}
@@ -129,10 +180,22 @@ struct GemmImplementation<Top, Tret, Nothing> {
return impl;
}
+ static GemmImplementation with_estimate(GemmMethod m, const char *n, KernelWeightFormat f,
+ std::function<bool(const GemmArgs &)> is_supported, std::function<uint64_t(const GemmArgs &)> cycle_estimate,
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) {
+ GemmImplementation impl(m,n,f);
+
+ impl.is_supported=is_supported;
+ impl.cycle_estimate=cycle_estimate;
+ impl.instantiate=instantiate;
+
+ return impl;
+ }
+
GemmImplementation(const GemmImplementation &) = default;
GemmImplementation & operator= (const GemmImplementation &) = default;
- GemmImplementation(GemmMethod m, const char * n) : method(m), name(n) {}
+ GemmImplementation(GemmMethod m, const char *n, KernelWeightFormat f=KernelWeightFormat::NON_FIXED) : method(m), name(n), kernel_weight_format(f) {}
GemmImplementation(GemmMethod m, const char *n,
std::function<bool(const GemmArgs &)> is_supported, std::function<bool(const GemmArgs &)> is_recommended,
@@ -140,11 +203,20 @@ struct GemmImplementation<Top, Tret, Nothing> {
method(m), name(n), is_supported(is_supported),
cycle_estimate( [is_recommended](const GemmArgs &args) -> uint64_t { return (is_recommended == nullptr) ? 0 : (is_recommended(args) ? 0 : UINT64_MAX); } ),
instantiate(instantiate) { }
+
+ GemmImplementation(GemmMethod m, const char *n, KernelWeightFormat kwf,
+ std::function<bool(const GemmArgs &)> is_supported, std::function<bool(const GemmArgs &)> is_recommended,
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) :
+ method(m), name(n), kernel_weight_format(kwf), is_supported(is_supported),
+ cycle_estimate( [is_recommended](const GemmArgs &args) -> uint64_t { return (is_recommended == nullptr) ? 0 : (is_recommended(args) ? 0 : UINT64_MAX); } ),
+ instantiate(instantiate) { }
};
-/* "Master" function implemented for each valid combination of types.
- * Returns a list of GEMM implementation descriptors for processing by the
- * other functions, terminated by an implementation with
+/* Provides the list of implementation descriptors which is processed by the
+ * other functions.
+ *
+ * A specialised version is provided for each supported combination of types.
+ * The end of the list is indicated by a sentinel descriptor with
* method==GemmMethod::DEFAULT. */
template<typename Top, typename Tret, class OutputStage = Nothing>
const GemmImplementation<Top, Tret, OutputStage> *gemm_implementation_list();
@@ -236,6 +308,15 @@ std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, cons
}
template<typename Top, typename Tret, class OutputStage>
+bool has_opt_gemm(WeightFormat &wf, const GemmArgs &args, const OutputStage &os) {
+ const GemmImplementation<Top, Tret, OutputStage> *impl;
+ const bool success = find_implementation<Top, Tret, OutputStage>(args, os, impl);
+ if (success)
+ wf = UniqueGemmCommon<Top, Tret>(impl->do_instantiate(args, os))->get_config().weight_format;
+ return success;
+}
+
+template<typename Top, typename Tret, class OutputStage>
UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage &os) {
const GemmImplementation<Top, Tret, OutputStage> *impl;
@@ -258,4 +339,5 @@ KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage &os) {
return KernelDescription();
}
+
} // namespace arm_gemm
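has_opt_gemm() gives front ends a way to ask, before committing to a weight layout, whether a kernel exists for the given arguments and which WeightFormat it expects. A hedged usage sketch; the GemmArgs setup is elided and the names come from arm_gemm:

    arm_gemm::WeightFormat wf = arm_gemm::WeightFormat::ANY;
    if (arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(wf, args, {})) {
        // 'wf' now holds the format the selected kernel wants its weights in,
        // so the caller can pre-arrange B accordingly before running the GEMM.
    }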
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index a3a61959c3..aa6ecc2919 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,6 +56,7 @@ const GemmImplementation<int16_t, int32_t> *gemm_implementation_list<int16_t, in
/* Explicitly instantiate the external functions for these types. */
template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<int16_t, int32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
template KernelDescription get_gemm_method<int16_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
template std::vector<KernelDescription> get_compatible_kernels<int16_t, int32_t, Nothing> (const GemmArgs &args, const Nothing &);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 60cf82f9c6..fedda3a47a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,71 +34,109 @@
#include "kernels/a64_gemm_s8_8x12.hpp"
#include "kernels/a64_gemm_s8_4x4.hpp"
#include "kernels/a64_hybrid_s8s32_dot_6x16.hpp"
+#include "kernels/a64_hybrid_s8s32_mmla_6x16.hpp"
#include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp"
#include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp"
#include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp"
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+#include "kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SME2
+
#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp"
#include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp"
#include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp"
-#include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SVE
namespace arm_gemm {
static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
-#ifdef __ARM_FEATURE_SVE
-#ifdef MMLA_INT8
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+// SME kernels
{
GemmMethod::GEMM_INTERLEAVED,
- "sve_interleaved_s8s32_mmla_8x3VL",
- [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>8); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int32_t>(args); }
+ "sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2(); },
+ [](const GemmArgs &args) { const auto VL = sme::get_vector_length<int32_t>();
+ return args._Nsize >= 8*VL || args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
+ [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL, int8_t, int32_t>(args); }
},
-#endif
{
- GemmMethod::GEMM_HYBRID,
- "sve_smallK_hybrid_s8s32_dot_8x1VL",
- [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_s8s32_dot_8x1VL, int8_t, int32_t>(args); }
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2(); },
+ [](const GemmArgs &args) { const auto VL = sme::get_vector_length<int32_t>();
+ return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
+ [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL, int8_t, int32_t>(args); }
},
{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL",
+ [](const GemmArgs &args) { return args._ci->has_sme2(); },
+ nullptr,
+ [](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL, int8_t, int32_t>(args); }
+},
+#endif // ARM_COMPUTE_ENABLE_SME2
+GemmImplementation<int8_t, int32_t>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_s8s32_mmla_6x4VL",
+ [](const GemmArgs &args) { return args._ci->has_svei8mm(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int32_t>(args); }
+),
+GemmImplementation<int8_t, int32_t>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_s8s32_mmla_8x3VL",
+ [](const GemmArgs &args) { return args._ci->has_svei8mm() && (args._Ksize>8); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int32_t>(args); }
+),
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8s32_dot_6x4VL",
[](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize>=16; },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int32_t>(args); }
-},
-{
+),
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_s8s32_dot_8x3VL",
[](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int32_t>(args); }
-},
-#endif // SVE
-#ifdef MMLA_INT8
-{
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_s8s32_mmla_8x12",
- [](const GemmArgs &args) { return (args._Ksize>8); },
- nullptr,
+ [](const GemmArgs &args) { return args._ci->has_i8mm() && (args._Ksize>8); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int32_t>(args); }
-},
-#endif
+),
+GemmImplementation<int8_t, int32_t>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_s8s32_mmla_6x16",
+ [](const GemmArgs &args) { return args._ci->has_i8mm(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int32_t>(args); }
+),
{
GemmMethod::GEMM_HYBRID,
"a64_smallK_hybrid_s8s32_dot_8x4",
- [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; },
- nullptr,
+ [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input && !args._accumulate; },
+ [](const GemmArgs &args) { return !(args._ci->has_svei8mm() || args._ci->has_i8mm()); },
[](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_s8s32_dot_8x4, int8_t, int32_t>(args); }
},
{
GemmMethod::GEMM_HYBRID,
"a64_smallK_hybrid_s8s32_dot_6x4",
- [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; },
- nullptr,
+ [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input && !args._accumulate; },
+ [](const GemmArgs &args) { return !(args._ci->has_svei8mm() || args._ci->has_i8mm()); },
[](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_s8s32_dot_6x4, int8_t, int32_t>(args); }
},
{
@@ -108,27 +146,29 @@ static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
[](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && ((args._Msize > 28) || ((args._Msize % 8) > 4)); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s16_8x12, int8_t, int32_t>(args); },
},
-{
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8s32_dot_6x16",
[](const GemmArgs &args) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int32_t>(args); }
-},
-{
+),
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_8x12",
[](const GemmArgs &args) { return args._ci->has_dotprod(); },
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_s8_8x12, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_8x12, int8_t, int32_t>(args); }
-},
-{
+),
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_4x4",
nullptr,
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_s8_4x4, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_4x4, int8_t, int32_t>(args); }
-},
+),
+
{
GemmMethod::DEFAULT,
"",
@@ -145,6 +185,7 @@ const GemmImplementation<int8_t, int32_t> *gemm_implementation_list<int8_t, int3
/* Explicitly instantiate the external functions for these types. */
template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<int8_t, int32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
template KernelDescription get_gemm_method<int8_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
template std::vector<KernelDescription> get_compatible_kernels<int8_t, int32_t, Nothing> (const GemmArgs &args, const Nothing &);
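With these changes the int8 list mostly relies on with_estimate(): instead of hand-written recommendation heuristics, each candidate reports an estimated cycle count. Based on the cycle_estimate plumbing above, the selector presumably prefers the supported candidate with the lowest estimate, with 0 meaning "no preference" and UINT64_MAX meaning "avoid unless nothing else matches". A simplified sketch of that selection loop, under those assumptions:

    #include <cstdint>

    struct Candidate { bool supported; uint64_t est; };

    // Returns the index of the supported candidate with the lowest estimate.
    int pick(const Candidate *list, int n) {
        int best = -1;
        uint64_t best_est = UINT64_MAX;
        for (int i = 0; i < n; i++) {
            if (!list[i].supported) continue;
            if (best < 0 || list[i].est < best_est) { best = i; best_est = list[i].est; }
        }
        return best;
    }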
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 7f870b83d7..897ec9d05f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,10 @@
#include <cassert>
#include "arm_gemm.hpp"
+#include "bfloat.hpp"
#include "convolver.hpp"
+#include "kernel_traits.hpp"
+#include "kernel_weight_format.hpp"
#include "mergeresults.hpp"
#include "performance_parameters.hpp"
#include "quantized.hpp"
@@ -56,7 +59,7 @@ namespace {
// Others output directly to the matrix result. This helper class calls the
// appropriate functions, using templating to avoid calling non-existent
// functions.
-template<bool MergeStep, typename OutputStage>
+template<bool MergeStep, bool FixedFormat, typename OutputStage>
class kernel_and_merge {
public:
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
@@ -64,7 +67,7 @@ public:
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t b_stride, Tri *c_panel,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
const Activation &act, bool accumulate, const OutputStage &os, const int32_t *col_bias,
@@ -74,11 +77,11 @@ public:
// Run a kernel and call the separate merge step
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
-void kernel_and_merge<true, Nothing>::run(
+void kernel_and_merge<true, false, Nothing>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
@@ -101,14 +104,44 @@ void kernel_and_merge<true, Nothing>::run(
}
}
+// Run a fixed-format kernel and call the separate merge step
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<true, true, Nothing>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t b_stride, Tri *c_panel,
+ Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
+ unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
+ const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
+{
+ {
+#ifdef CYCLE_PROFILING
+ const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
+ auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
+#endif
+
+ strat.kernel(a_ptr, b_panel, b_stride, c_panel, 1, (n_max - n_0), kern_k);
+ }
+
+ {
+#ifdef CYCLE_PROFILING
+ const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
+ auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
+#endif
+ strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate);
+ }
+}
+
// Run a kernel with integrated merge
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
-void kernel_and_merge<false, Nothing>::run(
+void kernel_and_merge<false, false, Nothing>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
unsigned int n_0, unsigned int n_max, const Tr *biasptr,
const Activation &act, bool accumulate, const Nothing &, const int32_t *,
@@ -143,11 +176,11 @@ void kernel_and_merge<false, Nothing>::run(
// Run a kernel with integrated merge, quantizing
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
-void kernel_and_merge<false, Requantize32>::run(
+void kernel_and_merge<false, false, Requantize32>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
unsigned int n_0, unsigned int n_max, const Tr *,
const Activation &, bool accumulate, const Requantize32 &qp, const int32_t *col_bias,
@@ -157,10 +190,19 @@ void kernel_and_merge<false, Requantize32>::run(
auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
#endif
+    // Offset the C pointer in the same way as in the non-quantized case above.
+ Tri *offset_c_ptr;
+
+ if (c_ptr == nullptr) {
+ offset_c_ptr = nullptr;
+ } else {
+ offset_c_ptr = c_ptr + m_0 * ldc + n_0;
+ }
+
strat.kernel(// A and B pointers are just the packed panels.
a_ptr, b_panel,
// Provide relevant part of output array and row stride.
- c_ptr + m_0 * ldc + n_0, ldc,
+ offset_c_ptr, ldc,
// M, N, K sizes
m_max-m_0, n_max - n_0, kern_k,
// Bias, activation, accumulation. Need to offset the bias as needed.
@@ -170,11 +212,11 @@ void kernel_and_merge<false, Requantize32>::run(
// Run a kernel and call the separate quantize step
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
-void kernel_and_merge<true, Requantize32>::run(
+void kernel_and_merge<true, false, Requantize32>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *,
const Activation &, bool, const Requantize32 &qp, const int32_t *col_bias,
@@ -192,7 +234,7 @@ void kernel_and_merge<true, Requantize32>::run(
{
#ifdef CYCLE_PROFILING
- auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
+ auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
// The interleaved kernel outputs in blocks - each block is a
// row-major matrix of size out_width * out_height. The merge
@@ -213,6 +255,84 @@ void kernel_and_merge<true, Requantize32>::run(
}
}
+// Run a kernel with integrated merge, dequantizing to FP32
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<false, false, DequantizeFloat>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *,
+ Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
+ unsigned int n_0, unsigned int n_max, const Tr *bias,
+ const Activation &act, bool accumulate, const DequantizeFloat &dq, const int32_t *col_bias,
+ Tab *acc_buff)
+{
+#ifdef CYCLE_PROFILING
+ auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
+#endif
+
+ const int32_t *offset_col_bias = nullptr;
+ const Tr *offset_bias = nullptr;
+
+ if (col_bias) {
+ offset_col_bias = col_bias + n_0;
+ }
+
+ if (bias) {
+ offset_bias = bias + n_0;
+ }
+
+ strat.kernel(// A and B pointers are just the packed panels.
+ a_ptr, b_panel,
+ // Provide relevant part of output array and row stride.
+ c_ptr ? (c_ptr + m_0 * ldc + n_0) : nullptr, ldc,
+ // M, N, K sizes
+ m_max-m_0, n_max - n_0, kern_k,
+ // Bias, activation, accumulation. Need to offset the bias as needed.
+ offset_col_bias, dq, offset_bias, act, accumulate, acc_buff);
+}
+
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<true, false, DequantizeFloat>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
+ Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
+ unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *bias,
+ const Activation &act, bool accumulate, const DequantizeFloat &qp, const int32_t *,
+ Tab *)
+{
+ const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
+
+ {
+#ifdef CYCLE_PROFILING
+ auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
+#endif
+
+ strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+ }
+
+ {
+#ifdef CYCLE_PROFILING
+ auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr)));
+#endif
+ auto out_area = strategy::out_width() * strategy::out_height();
+ for (int i=0; i<bblocks; i++) {
+ const unsigned int n_start = n_0 + (strategy::out_width() * i);
+ const unsigned int n_end = std::min(n_start + strategy::out_width(), n_max);
+
+ dequantize_block_32(qp, (n_end - n_start), (m_max - m_0),
+ c_panel + (i * out_area), strategy::out_width(),
+ c_ptr + m_0 * ldc + n_start, ldc,
+ bias != nullptr ? bias + n_start : nullptr, accumulate, act);
+
+ }
+ }
+}
+
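dequantize_block_32(), used by the merging DequantizeFloat path above, is not shown in this patch; judging from the call site its job is plausibly to convert an int32 accumulator block to float as acc * scale, add an optional per-column bias, optionally accumulate into the destination, and apply the activation. A rough scalar reference for that behaviour, with the activation omitted and assuming a scalar 'scale' on DequantizeFloat:

    #include <cstdint>

    // Hypothetical scalar reference for what dequantize_block_32 may do.
    void dequantize_block_ref(float scale, unsigned w, unsigned h,
                              const int32_t *in, unsigned in_stride,
                              float *out, unsigned out_stride,
                              const float *bias, bool accumulate) {
        for (unsigned y = 0; y < h; y++) {
            for (unsigned x = 0; x < w; x++) {
                float v = in[y * in_stride + x] * scale + (bias ? bias[x] : 0.0f);
                out[y * out_stride + x] = accumulate ? out[y * out_stride + x] + v : v;
            }
        }
    }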
// Integer GEMMs can be used in two contexts - "normal" where the full 32-bit output is required, or in
// "requantizing" context where the output will be requantized.
//
@@ -234,25 +354,77 @@ public:
};
// We need a similar trick here to figure out what type the accumulator buffer should be.
-template<typename strategy, typename OutputStage>
+template<typename strategy, typename OutputStage, bool ForceFloat>
class accumulate_buffer_type {
public:
typedef typename strategy::result_type type;
};
template<typename strategy>
-class accumulate_buffer_type<strategy, Requantize32> {
+class accumulate_buffer_type<strategy, Requantize32, false> {
public:
typedef int32_t type;
};
+template<typename strategy>
+class accumulate_buffer_type<strategy, DequantizeFloat, false> {
+public:
+ typedef int32_t type;
+};
+
+template<typename strategy, typename OutputStage>
+class accumulate_buffer_type<strategy, OutputStage, true> {
+public:
+ typedef float type;
+};
+
+// Stripe width is a concept only needed for FixedFormat kernels. Use an accessor to avoid issues in other scenarios.
+template<typename strategy, bool FixedFormat>
+struct get_stripe_width {
+ static unsigned int get() {
+ return 0;
+ }
+};
+
+template<typename strategy>
+struct get_stripe_width<strategy, true> {
+ static unsigned int get() {
+ return strategy::stripe_width();
+ }
+};
+
+// KernelWeightFormat is a similar story.
+template<typename strategy, bool FixedFormat, typename To>
+struct get_kernel_weight_format {
+ static KernelWeightFormat get() {
+ return KernelWeightFormat::NON_FIXED;
+ }
+};
+
+template<typename strategy, typename To>
+struct get_kernel_weight_format<strategy, true, To> {
+ static KernelWeightFormat get() {
+ KernelWeightFormat kwf = strategy::kernel_weight_format();
+
+ // If we are using a BF16 kernel to do an FP32 problem (fast mode) then we need to set the BF16 flag on the
+ // weight format.
+ if (std::is_same<To, float>::value && std::is_same<typename strategy::operand_type, bfloat16>::value) {
+ uint32_t kwf_i = static_cast<uint32_t>(kwf);
+ kwf_i |= 0x10;
+ kwf = static_cast<KernelWeightFormat>(kwf_i);
+ }
+
+ return kwf;
+ }
+};
+
} // anonymous namespace
-template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool ForceThreadColumns=false>
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool FixedFormat=false, bool ForceThreadColumns=false, bool ForceFloatAccumulate=false>
class GemmInterleaved : public GemmCommon<To, Tr> {
typedef typename strategy::operand_type Toi;
typedef typename strategy::result_type Tri;
- typedef typename accumulate_buffer_type<strategy, OutputStage>::type Tab;
+ typedef typename accumulate_buffer_type<strategy, OutputStage, ForceFloatAccumulate>::type Tab;
/* const properties set by constructor */
const CPUInfo * const _ci;
@@ -270,6 +442,7 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
const bool _thread_columns;
const Activation _act;
+ const bool _accumulate;
const int _maxthreads;
int _nthreads;
@@ -310,7 +483,7 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
class blockwalker {
private:
/* Size loops, etc. based on our parent's configuration */
- const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &_parent;
+ const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns, ForceFloatAccumulate> &_parent;
/* K, X and multi parameters for current iteration. */
unsigned int _k0=0, _x0=0, _multi=0;
@@ -325,9 +498,9 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
bool _newmulti=true;
public:
- blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent) : _parent(parent) { }
+ blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns, ForceFloatAccumulate> &parent) : _parent(parent) { }
- blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent,
+ blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns, ForceFloatAccumulate> &parent,
unsigned int x_start, unsigned int x_end) : _parent(parent), _x0 (_x_start), _x_start(x_start), _x_end(x_end) { }
unsigned int xmax() {
@@ -496,15 +669,46 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
static unsigned int get_k_block_size(const GemmArgs &args) {
if (args._cfg && args._cfg->inner_block_size) {
- return args._cfg->inner_block_size;
+ return roundup(args._cfg->inner_block_size, strategy::k_unroll());
}
- // K blocking not supported if we are requantizing.
- if (std::is_same<OutputStage, Requantize32>::value) {
+ // K blocking not supported if we are requantizing with the merging
+ // kernels.
+ if (std::is_same<OutputStage, Requantize32>::value && MergeStep) {
return get_ktotal(args);
}
const unsigned int L1_size = args._ci->get_L1_cache_size();
+
+ // Special blocking for SME
+ if (is_sme<strategy>::value) {
+ // Target 512 bytes for 64kB L1, or 1024 bytes for 128kB L1.
+ unsigned int target_bytes_per_block = L1_size / 128;
+
+            // The default cache size in gemm-linux is only 32kB though, so make
+            // sure the minimum is 512 bytes.
+ if (target_bytes_per_block < 512) {
+ target_bytes_per_block = 512;
+ }
+
+ // Don't bother to block below this size threshold (1.25X target size)
+ unsigned int scaling_threshold = ((target_bytes_per_block * 5) / 4) / sizeof(Toi);
+
+ if (get_ktotal(args) <= scaling_threshold) {
+ return get_ktotal(args);
+ }
+
+            // Once we are blocking, this (lower) threshold determines when we should use more blocks.
+            // NOTE: a factor-based solution might work better here.
+ unsigned int max_block_size = target_bytes_per_block / sizeof(Toi);
+
+ unsigned int num_k_blocks = iceildiv(get_ktotal(args), max_block_size);
+
+ unsigned int k_block = roundup(iceildiv(get_ktotal(args), num_k_blocks), strategy::k_unroll());
+
+ return k_block;
+ }
+
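A worked example of the SME K-blocking above, assuming a 64kB L1, 1-byte operands (sizeof(Toi) == 1) and k_unroll == 2: target_bytes_per_block is 64kB/128 = 512 bytes, the no-blocking threshold is 1.25x that (640 elements), and a Ktotal of 1700 then splits into four roughly equal blocks:

    // Worked example of the SME K-blocking (sizeof(Toi)==1, k_unroll==2 assumed).
    unsigned L1 = 64 * 1024;
    unsigned target = L1 / 128;                               // 512 bytes per block
    unsigned threshold = (target * 5 / 4) / 1;                // 640: below this, no blocking
    unsigned Ktotal = 1700;                                   // > threshold, so we block
    unsigned max_block = target / 1;                          // 512 elements
    unsigned blocks = (Ktotal + max_block - 1) / max_block;   // iceildiv -> 4 blocks
    unsigned k_block = (((Ktotal + blocks - 1) / blocks) + 1) & ~1u; // roundup(425, 2) = 426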
unsigned int k_block;
// k_block: Find out how much of the larger array can be loaded into half the cache.
@@ -539,6 +743,17 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
return roundup(args._cfg->outer_block_size, strategy::out_width());
}
+ // Special blocking for SME
+ if (is_sme<strategy>::value) {
+ // If total width is less than 4x kernel width, return the entire width.
+ if (args._Nsize < strategy::out_width()*4) {
+ return roundup(args._Nsize, strategy::out_width());
+ }
+
+ // Otherwise block to single kernel width.
+ return strategy::out_width();
+ }
+
unsigned int x_block;
const unsigned int L2_size = args._ci->get_L2_cache_size();
const unsigned int k_block = get_k_block_size(args);
@@ -580,7 +795,7 @@ public:
_Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
_rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
_nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
- _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+ _act(args._act), _accumulate(args._accumulate), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
_k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
_os(os) { }
@@ -590,7 +805,7 @@ public:
_Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
_rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
_nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
- _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+ _act(args._act), _accumulate(args._accumulate), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
_k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
_os() { }
@@ -623,7 +838,7 @@ public:
#endif
/* Make sure we've been set up correctly. */
- assert(_B_transposed);
+ assert(FixedFormat || _B_transposed);
assert(_working_space);
int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
@@ -663,10 +878,17 @@ public:
const bool first_pass = (k0==0);
const bool last_pass = (kmax==_Ktotal);
+                // Bias is passed on the first pass only, except in the DequantizeFloat no-merge case, where it is passed on the last pass.
+ const bool bias_pass = (std::is_same<OutputStage, DequantizeFloat>::value && !MergeStep) ? last_pass : first_pass;
+
// Figure out how many "K" the kernel will actually process.
unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll());
- const Toi *b_ptr = _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);
+ const Toi *b_ptr = FixedFormat ?
+ reinterpret_cast<const Toi *>(this->_Bptr) + (multi * this->_B_multi_stride) +
+ ((start_x / get_stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
+ (k0 * get_stripe_width<strategy, FixedFormat>::get()) :
+ _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);
unsigned int batch = batch_0;
unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height();
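For FixedFormat kernels B is not repacked; the pointer arithmetic above walks the caller-supplied buffer directly, treating it as stripes of stripe_width columns with a row stride of ldb. A small sketch of the same offset computation, with illustrative names:

    #include <cstddef>

    // Offset into a fixed-format B buffer (illustrative; mirrors the code above).
    template <typename T>
    const T *fixed_format_b(const T *Bptr, size_t multi, size_t B_multi_stride,
                            size_t start_x, size_t k0, size_t ldb, size_t stripe_w) {
        return Bptr + multi * B_multi_stride
                    + (start_x / stripe_w) * ldb   // which stripe of columns
                    + k0 * stripe_w;               // row offset within the stripe
    }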
@@ -698,25 +920,32 @@ public:
}
}
+ Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride);
+
+ // If we are using an accumulation buffer and this isn't the last pass, don't pass a result pointer.
+ if (_accumulation_buffer && !last_pass) {
+ result_ptr = nullptr;
+ }
+
// Perform the kernel and merge step, either separately or together as required.
- kernel_and_merge<MergeStep, OutputStage>::run(
+ kernel_and_merge<MergeStep, FixedFormat, OutputStage>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
// Strategy and panel pointers
- strat, a_panel, b_ptr, c_panel,
+ strat, a_panel, b_ptr, this->_ldb, c_panel,
// Result buffer pointers
- this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc,
+ result_ptr, this->_ldc,
// K size, and M/N ranges
kern_k, start_row, end_row, start_x, end_x,
// Only do bias on the first pass
- ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
+ ((bias_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
// Only do activation on the last pass, and accumulation on any non-first pass.
- (last_pass ? _act : Activation()), !first_pass,
+ (last_pass ? _act : Activation()), (!first_pass || _accumulate),
// Pass in quantization parameters for requantizing kernels (others will ignore)
_os, col_bias + (multi * _Nsize),
- // Accumulation buffer (not yet implemented on this path)
- static_cast<Tab *>(nullptr));
+ // Accumulation buffer
+ get_accumulation_buffer(start_row, start_x, batch, multi));
/* Increment to the next block */
start_row += strategy::out_height();
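The pass logic above can be summarised: over K-blocks, bias is applied on the bias pass, activation only on the last pass, accumulation on every pass except a genuine first pass, and while an accumulation buffer is live the real C pointer is withheld until the final pass. A sketch of those flags in isolation:

    // Per-K-block pass flags, mirroring the logic above (illustrative).
    void passes(unsigned Ktotal, unsigned k_block, bool user_accumulate,
                bool have_acc_buffer, float *Cptr) {
        for (unsigned k0 = 0; k0 < Ktotal; k0 += k_block) {
            bool first = (k0 == 0);
            bool last  = (k0 + k_block >= Ktotal);
            bool do_bias = first;               // (last pass for DequantizeFloat no-merge)
            bool do_act  = last;                // activation only on the final pass
            bool do_acc  = !first || user_accumulate;
            float *out = (have_acc_buffer && !last) ? nullptr : Cptr;
            (void)do_bias; (void)do_act; (void)do_acc; (void)out;
            // ... kernel_and_merge would be invoked here with these flags ...
        }
    }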
@@ -802,6 +1031,13 @@ public:
}
}
+            // For FixedFormat cases, figure out the B pointer. The loop below moves through batches and vertically through the output, so this pointer stays the same throughout.
+ if (FixedFormat) {
+ b_panel = reinterpret_cast<const Toi *>(this->_Bptr) + (current.multi() * this->_B_multi_stride) +
+ ((current.x0() / get_stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
+ (current.k0() * get_stripe_width<strategy, FixedFormat>::get());
+ }
+
/* Do the actual work. */
for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
unsigned int first_m = (batch == batch_0) ? m_0 : 0;
@@ -830,6 +1066,9 @@ public:
const bool first_pass = (current.k0() == 0);
const bool last_pass = (current.kmax() == _Ktotal);
+                    // Bias is passed on the first pass only, except in the DequantizeFloat no-merge case, where it is passed on the last pass.
+ const bool bias_pass = (std::is_same<OutputStage, DequantizeFloat>::value && !MergeStep) ? last_pass : first_pass;
+
// Pointer to appropriate part of result array.
Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride);
@@ -840,20 +1079,20 @@ public:
}
// Perform the kernel and merge step, either separately or together as required.
- kernel_and_merge<MergeStep, OutputStage>::run(
+ kernel_and_merge<MergeStep, FixedFormat, OutputStage>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
// Strategy and panel pointers
- strat, a_ptr, b_panel, c_panel,
+ strat, a_ptr, b_panel, this->_ldb, c_panel,
// Result buffer pointers
result_ptr, this->_ldc,
// K size, and M/N ranges
kern_k, y, ymax, current.x0(), current.xmax(),
// Only do bias on the first pass
- ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
+ ((bias_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
// Only do activation on the last pass, and accumulation on any non-first pass.
- (last_pass ? _act : Activation()), !first_pass,
+ (last_pass ? _act : Activation()), (!first_pass || _accumulate),
// Pass in quantization parameters for requantizing kernels (others will ignore)
_os, col_bias + (current.multi() * _Nsize),
// Accumulation buffer
@@ -863,7 +1102,9 @@ public:
}
}
- b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k);
+ if (FixedFormat == false) {
+ b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k);
+ }
}
}
}
@@ -910,20 +1151,31 @@ public:
// Interface implementation - pretransposed
bool B_is_pretransposed() const override {
- return true;
+ return (FixedFormat == false);
}
bool B_pretranspose_required() const override {
- return (_B_transposed==nullptr);
+ return (FixedFormat == false) && (_B_transposed==nullptr);
}
size_t get_B_pretransposed_array_size() const override {
+ if (FixedFormat) {
+ return 0;
+ }
+
unsigned int x_size = roundup(_Nsize, strategy::out_width());
return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size();
}
- void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ size_t get_B_pretranspose_window_size() const override {
+ size_t n_blocks = iceildiv(_Nsize, _x_block);
+ size_t k_blocks = iceildiv(_Ktotal, _k_block);
+
+ return n_blocks * k_blocks * _nmulti;
+ }
+
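get_B_pretranspose_window_size() exposes the pretranspose as n_blocks x k_blocks x nmulti independent work items, so that pretranspose_B_array_part() (added below) can be split across threads. A hedged sketch of driving it from two workers; 'gemm', 'buffer', 'B', 'ldb' and 'B_multi_stride' are assumed to be set up already:

    // Illustrative: split the B pretranspose across two workers.
    size_t total = gemm->get_B_pretranspose_window_size();
    size_t half  = total / 2;
    // Worker 0:
    gemm->pretranspose_B_array_part(buffer, B, ldb, B_multi_stride, /*transposed=*/false, 0, half);
    // Worker 1 (note: the column sums run as part of the final block):
    gemm->pretranspose_B_array_part(buffer, B, ldb, B_multi_stride, /*transposed=*/false, half, total);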
+ void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
if (std::is_same<OutputStage, Requantize32>::value) {
col_bias = reinterpret_cast<int32_t *>(in_buffer);
@@ -934,8 +1186,26 @@ public:
compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0);
}
}
+ }
+
+ // Support for transposed B is a property of the strategy::transpose type
+ bool B_pretranspose_supports_transpose() const override {
+ typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
+
+ return transforms.PrepareB_supports_transpose();
+ }
- // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, const bool transposed) override {
+ pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, transposed, 0, get_B_pretranspose_window_size());
+ }
+
+ void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, const bool transposed, size_t start, size_t end) override {
+ // Perform column sums etc as part of the last block.
+ if (end >= get_B_pretranspose_window_size()) {
+ requantize_bias(in_buffer, B, ldb, B_multi_stride);
+ }
+
+ // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
_B_transposed = buffer;
@@ -943,57 +1213,84 @@ public:
blockwalker current(*this);
strategy strat(_ci);
- do {
+ // Skip over blocks we aren't doing
+ for(size_t i = 0; i < start; i++) {
+ buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll());
+ current.advance();
+ }
+
+ size_t blocks_left = (end - start);
+
+ // Double check that we haven't run out of work
+ if (current.done()) {
+ blocks_left = 0;
+ }
+
+ for (/* blocks_left initialized above */; blocks_left > 0; blocks_left--) {
/* Figure out the size of each block. */
unsigned int k_size = (current.kmax() - current.k0());
- // We need to insert padding at the end of each K section.
- // The computation needed is a little delicate - the coordinates from the block walker are expressed in
- // terms of the full, padded, _Ktotal.
- // But we need to transform each section with reference to the original, unpadded, input, letting the
- // transform pad each section as needed.
+ if (_Ksections > 1) {
+ // We need to insert padding at the end of each K section.
+ // The computation needed is a little delicate - the coordinates from the block walker are expressed in
+ // terms of the full, padded, _Ktotal.
+ // But we need to transform each section with reference to the original, unpadded, input, letting the
+ // transform pad each section as needed.
- // This is needed for computations below.
- const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll());
+ // This is needed for computations below.
+ const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll());
- // The expected output format is also an entire <out_width> columns interleaved, then the next set of
- // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
- // a time.
- for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ){
- unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());
+ // The expected output format is also an entire <out_width> columns interleaved, then the next set of
+ // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
+ // a time.
+ for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ) {
+ unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());
- // Track where we are and how much work is left.
- unsigned int kpos = current.k0();
- unsigned int kleft = k_size;
+ // Track where we are and how much work is left.
+ unsigned int kpos = current.k0();
+ unsigned int kleft = k_size;
- while (kleft) {
- // Which section are we in? Based on the rounded-up section size.
- unsigned int k_section_base = kpos / rounded_section_size;
- // How far into the section are we?
- unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
+ while (kleft) {
+ // Which section are we in? Based on the rounded-up section size.
+ unsigned int k_section_base = kpos / rounded_section_size;
+ // How far into the section are we?
+ unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
- // We will either copy the rest of this section, or to the end of the requested length.
- unsigned int k_length = std::min(_Ksize - k_offset, kleft);
+ // We will either copy the rest of this section, or to the end of the requested length.
+ unsigned int k_length = std::min(_Ksize - k_offset, kleft);
- strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
- x0, xmax,
- (k_section_base * _Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
- (k_section_base * _Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
+ strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
+ x0, xmax,
+ (k_section_base * _Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
+ (k_section_base * _Ksize) + k_offset + k_length, // K end point - starting point plus length computed above.
+ transposed);
- // We need to modify our position based on the ROUNDED version of what we just did.
- unsigned int padded_length = roundup(k_length, strategy::k_unroll());
+ // We need to modify our position based on the ROUNDED version of what we just did.
+ unsigned int padded_length = roundup(k_length, strategy::k_unroll());
- buffer += strategy::out_width() * padded_length;
+ buffer += strategy::out_width() * padded_length;
- kpos += padded_length;
- kleft -= padded_length;
+ kpos += padded_length;
+ kleft -= padded_length;
+ }
}
+ } else {
+ // In the single K section case, can process the whole lot in one go.
+ // Caution: 'blockwalker::kmax()' rounds up, so clamp to valid _Ksize.
+ strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
+ current.x0(), current.xmax(), current.k0(), std::min(current.kmax(), _Ksize), transposed);
+ buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll());
}
- } while (current.advance());
+
+ // Advance to the next block, break if we run off the end.
+ if (!current.advance()) {
+ break;
+ }
+ }
}
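The multi-section walk above is the subtle part: block-walker coordinates live in the padded _Ktotal space, while PrepareB wants rows of the unpadded input. A worked example of the coordinate mapping, assuming Ksize = 100 and k_unroll = 8 (so rounded_section_size = 104):

    #include <algorithm>

    // Worked example of the K-section coordinate mapping (Ksize=100, k_unroll=8).
    unsigned Ksize = 100, rounded = 104, kpos = 150, kleft = 60;
    unsigned section = kpos / rounded;                  // 1: which K section we are in
    unsigned offset  = kpos - section * rounded;        // 46: how far into that section
    unsigned length  = std::min(Ksize - offset, kleft); // 54 source rows available
    unsigned src_k0  = section * Ksize + offset;        // 146: row in the unpadded input
    unsigned padded  = ((length + 7) / 8) * 8;          // 56: advance in padded space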
void set_pretransposed_B_data(void *in_buffer) override {
- // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
+ // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
_B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
col_bias = reinterpret_cast<int32_t *>(in_buffer);
@@ -1008,6 +1305,13 @@ public:
}
}
+ void set_dequantize_scale(const float scale) override {
+ if(std::is_same<OutputStage, DequantizeFloat>::value) {
+ DequantizeFloat* df = reinterpret_cast<DequantizeFloat *>(&_os);
+ df->scale = scale;
+ }
+ }
+
void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
assert(string_len == _Ksize);
_indirect_buf = ptr;
@@ -1019,12 +1323,15 @@ public:
}
// Estimate cycles for given problem given provided parameters
- static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+ template<typename perf_type>
+ static uint64_t estimate_cycles(const GemmArgs &args) {
unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
+ const PerformanceParameters &params = strategy::template get_performance_parameters<perf_type>(args._ci);
+
uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * get_ktotal(args);
uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * get_ktotal(args) * sizeof(Toi);
- uint64_t merge_bytes = static_cast<uint16_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
+ uint64_t merge_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * args._Msize * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
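Two fixes hide in the merge_bytes term above: the old expression truncated through a uint16_t cast and counted rounded-up rows, while the replacement widens to uint64_t and uses the true _Msize. The model itself is a simple three-term roofline-style estimate, shown here in isolation; the per-strategy parameters come from get_performance_parameters():

    #include <cstdint>

    // cycles ~= macs/macs_per_cycle + prepare_bytes/prep_bpc + merge_bytes/merge_bpc
    float estimate(uint64_t macs, uint64_t prep_bytes, uint64_t merge_bytes,
                   float macs_cyc, float prep_bpc, float merge_bpc) {
        return static_cast<float>(macs)        / macs_cyc
             + static_cast<float>(prep_bytes)  / prep_bpc
             + static_cast<float>(merge_bytes) / merge_bpc;
    }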
@@ -1042,16 +1349,37 @@ public:
return static_cast<uint64_t>(total_cycles);
}
+
+ GemmConfig get_config() override {
+ GemmConfig c;
+
+ c.method = GemmMethod::GEMM_INTERLEAVED;
+ c.inner_block_size = _k_block;
+ c.outer_block_size = _x_block;
+ c.filter = get_type_name<strategy>();
+ c.weight_format = get_weight_format(get_kernel_weight_format<strategy, FixedFormat, To>::get(), sizeof(To));
+
+ return c;
+ }
};
// Aliases for the variations
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>;
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
+using GemmInterleavedFixedFormat = GemmInterleaved<strategy, To, Tr, OutputStage, true, true>;
+
template<typename strategy, typename To, typename Tr>
using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>;
template<typename strategy, typename To, typename Tr>
using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>;
+template<typename strategy, typename To, typename Tr>
+using GemmInterleavedNoMergeDequantized = GemmInterleaved<strategy, To, Tr, DequantizeFloat, false>;
+
+template<typename strategy, typename To, typename Tr>
+using GemmInterleavedDequantized = GemmInterleaved<strategy, To, Tr, DequantizeFloat>;
+
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
deleted file mode 100644
index b71f390ab9..0000000000
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
+++ /dev/null
@@ -1,566 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "arm_gemm.hpp"
-#include "utils.hpp"
-
-#include "mergeresults.hpp"
-#include "transform.hpp"
-
-#ifdef CYCLE_PROFILING
-#include "profiler.hpp"
-#endif
-
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-
-// Some macros used to decide how much working space to allocate.
-// Round allocations up to the next cache line.
-#define ALLOC_ROUND 64
-#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
-
-// Implementation of the GemmCommon abstract class.
-//
-// This implementation interleaves the source matrices in blocks - good for
-// larger matrices.
-namespace arm_gemm {
-
-template<typename strategy, typename To, typename Tr>
-class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> {
- typedef typename strategy::operand_type Toi;
- typedef typename strategy::result_type Tri;
-
- /* const properties set by constructor */
- const CPUInfo * const _ci;
-
- const unsigned int _Msize;
- const unsigned int _Nsize;
- const unsigned int _Ksize;
-
- const unsigned int _nbatches;
- const unsigned int _nmulti;
-
- const Activation _act;
-
- const int _maxthreads;
- int _nthreads;
-
- /* Blocking info */
- unsigned int _k_block=0;
- unsigned int _x_block=0;
-
- unsigned int _Mround_div=0;
- unsigned int _Mround=0;
- unsigned int _Nround_div=0;
- unsigned int _Nround=0;
-
- /* Working space, pretransposed buffer */
- const Toi *_B_transposed=nullptr;
- void *_working_space=nullptr;
-
- /* We will need to walk through the blocks of B in a few contexts, so
- * factor that out. */
- class blockwalker {
- private:
- /* Size loops, etc. based on our parent's configuration */
- const GemmInterleavedPretransposed2d<strategy, To, Tr> &_parent;
-
- /* K, X and multi parameters for current iteration. */
- unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0;
-
- unsigned int _index=0;
- bool _done=false;
- bool _newkblock=true;
- bool _newmulti=true;
-
- public:
- blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent)
- : _parent(parent)
- , _xmax { parent._Nsize }
- { }
-
- blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax)
- : _parent(parent)
- , _x0 { x0 }
- , _xmin { x0 }
- , _xmax { xmax }
- {
- assert(_x0 <= _xmax);
- }
-
- unsigned int xmax() {
- return std::min(_x0 + _parent._x_block, _xmax);
- }
-
- unsigned int kmax() {
- return std::min(_k0 + _parent._k_block, _parent._Ksize);
- }
-
- /* Advance to the next block, return false at the end. */
- bool advance(void) {
- if (_done) {
- return false;
- }
-
- _newkblock=false;
- _x0 += _parent._x_block;
- if (_x0 >= _xmax) {
- _x0=_xmin;
- _k0 += _parent._k_block;
- if (_k0 >= _parent._Ksize) {
- _k0=0;
- _multi++;
- if (_multi >= _parent._nmulti) {
- _done=true;
- return false;
- }
- _newmulti=true;
- }
- _newkblock=true;
- }
- _index++;
-
- return true;
- }
-
- unsigned int k0(void) { return _k0; }
- unsigned int x0(void) { return _x0; }
- unsigned int multi(void) { return _multi; }
- unsigned int index(void) { return _index; }
- bool done(void) { return _done; }
- bool newkblock(void) { return _newkblock; }
- };
-
- // A working size: One of these needed, regardless of thread count. Divided according to window.
- size_t get_a_working_size() const {
- return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2;
- }
-
- // As B will be pretranspose we do not need to alloc any space for it
- size_t get_b_working_size() const {
- return 0;
- }
-
- // C working size: One needed per thread.
- size_t get_c_working_size() const {
- return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
- }
-
- // Internal execute function.
- // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
- void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int) {
- /* Make sure we've been set up correctly. */
- assert(_B_transposed);
- assert(_working_space);
- assert(this->_Aptr);
- assert(this->_Cptr);
-
-#ifdef CYCLE_PROFILING
- profiler prof;
-#endif
- strategy strat(_ci);
-
- /* Translate 'start' and 'end' into a position within the batches and rows. */
- const unsigned int window_per_batch = _Mround / strategy::out_height();
- unsigned int batch_0 = m_start / window_per_batch;
- unsigned int batch_end = m_end / window_per_batch;
-
- /* Compute the M values to operate on */
- unsigned int m_0 = (m_start - (batch_0 * window_per_batch)) * strategy::out_height();
- unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height();
-
- unsigned int n_0 = std::min(this->_Nsize, strategy::out_width() * n_start);
- unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end);
-
- blockwalker current(*this, n_0, n_max);
-
- int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
-
- auto c_panel_start = working_space_bytes;
- auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads;
-
- auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid);
- auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * threadid);
-
- /* B^t is stored in interleaved panels separated by their K-block component.
- * We store a pointer to the start of the current k-page; when we come to
- * the next k-block we simply add the size of the previous one to this
- * base pointer.
- */
- const Toi *b_panel_start = _B_transposed;
- // b_panel stores a pointer to the start of our current block inside the k-block
- const Toi *b_panel = b_panel_start;
-
- // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
- unsigned b_page_size = 0;
- int kern_k = 0;
- for (;!current.done();current.advance()) {
- int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
-
- if (current.newkblock()) {
- kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
- kern_k *= strat.k_unroll();
-
- unsigned b_thread_start_offset = iceildiv(current.x0(), strategy::out_width());
-
- b_panel_start += b_page_size;
- b_panel = b_panel_start + (b_thread_start_offset * strat.out_width() * kern_k);
- b_page_size = _Nround * kern_k;
-
- for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
- unsigned int first_m = (batch == batch_0) ? m_0 : 0;
- unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
-
- if (first_m >= last_m)
- continue;
-
- auto a_thread_panel_in = this->_Aptr
- + (batch * this->_A_batch_stride)
- + (current.multi() * this->_A_multi_stride);
-
- auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block);
-
- strat.transforms.PrepareA(
- a_thread_panel_out,
- a_thread_panel_in,
- this->_lda,
- first_m,
- last_m,
- current.k0(),
- current.kmax(),
- 0);
- }
- }
-
- /* Do the actual work. */
- for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
- unsigned int first_m = (batch == batch_0) ? m_0 : 0;
- unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
-
- const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
-
- if (first_m >= last_m)
- continue;
-
- for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
- unsigned int ymax = std::min(_Msize, y + strategy::out_height());
-
- strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
- a_ptr += (strategy::out_height() * kern_k);
-
- /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */
- const bool first_pass = current.k0()==0;
- const bool last_pass = current.kmax()==_Ksize;
-
- auto c_panel_out = this->_Cptr
- + this->_C_batch_stride * batch
- + this->_C_multi_stride * current.multi();
-
- auto bias = (first_pass && this->_bias)
- ? this->_bias + (current.multi() * this->_bias_multi_stride)
- : nullptr;
-
- auto act = last_pass ? _act : Activation();
-
- strat.transforms.Merge(
- c_panel_out,
- c_panel,
- this->_ldc,
- y,
- ymax,
- current.x0(),
- current.xmax(),
- bias,
- act,
- !first_pass); //Append
- }
- }
-
- b_panel += (bblocks * strat.out_width() * kern_k);
- }
- }
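A worked example (with assumed sizes, not taken from the library) of the
window-to-(batch, row) translation done at the top of execute_pretranspose():

    #include <cassert>

    int main() {
        const unsigned out_height = 8;   // stand-in for strategy::out_height()
        const unsigned Msize = 20;
        const unsigned Mround = ((Msize + out_height - 1) / out_height) * out_height; // 24
        const unsigned window_per_batch = Mround / out_height;                        // 3

        // Window position 4, i.e. the second M-step of batch 1:
        const unsigned m_start = 4;
        const unsigned batch_0 = m_start / window_per_batch;                      // 1
        const unsigned m_0 = (m_start - batch_0 * window_per_batch) * out_height; // row 8
        assert(batch_0 == 1 && m_0 == 8);
        return 0;
    }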
-
- static unsigned int get_k_block_size(const GemmArgs &args) {
- // Work out blocking parameters, or override from provided GemmConfig
- if (args._cfg && args._cfg->inner_block_size) {
- return args._cfg->inner_block_size;
- }
-
- const unsigned int L1_size = args._ci->get_L1_cache_size();
- unsigned int k_block;
-
- // k_block: Find out how much of the larger array can be loaded into half the cache.
- // This should account for associative caches.
- k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
- // Needs to be (at least a single) multiple of the K unroll level.
- k_block /= strategy::k_unroll();
- k_block = std::max(k_block, 1U) * strategy::k_unroll();
-
- // Now tune to the presented problem size; first work out how many blocks we need.
- unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
-
- // So divide the space equally into that many blocks.
- k_block = iceildiv(args._Ksize, numk_blocks);
-
- // And round UP to the K unroll level required.
- k_block = iceildiv(k_block, strategy::k_unroll());
- k_block *= strategy::k_unroll();
-
- return k_block;
- }
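Running the steps of get_k_block_size() on assumed numbers (32 KiB L1,
1-byte operands, a 12x8 kernel, k_unroll = 4, K = 1000) makes the two-stage
rounding easier to follow:

    #include <algorithm>
    #include <cassert>

    static unsigned iceildiv(unsigned a, unsigned b) { return (a + b - 1) / b; }

    int main() {
        const unsigned L1_size = 32 * 1024, sizeof_Toi = 1;
        const unsigned out_width = 12, out_height = 8, k_unroll = 4;
        const unsigned Ksize = 1000;

        // Half the L1 divided by the larger kernel dimension:
        unsigned k_block = (L1_size / 2) / (sizeof_Toi * std::max(out_width, out_height)); // 1365
        // Round down to a (non-zero) multiple of the unroll:
        k_block = std::max(k_block / k_unroll, 1u) * k_unroll;                             // 1364
        // Spread K evenly over the number of blocks that implies...
        const unsigned numk_blocks = iceildiv(Ksize, k_block);                             // 1
        k_block = iceildiv(Ksize, numk_blocks);                                            // 1000
        // ...and round back UP to the unroll.
        k_block = iceildiv(k_block, k_unroll) * k_unroll;                                  // 1000
        assert(k_block == 1000);
        return 0;
    }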
-
-public:
- GemmInterleavedPretransposed2d(GemmInterleavedPretransposed2d &) = delete;
- GemmInterleavedPretransposed2d & operator= (GemmInterleavedPretransposed2d &) = delete;
-
- /* Constructor */
- GemmInterleavedPretransposed2d(const GemmArgs &args)
- : _ci(args._ci)
- , _Msize(args._Msize)
- , _Nsize(args._Nsize)
- , _Ksize(args._Ksize)
- , _nbatches(args._nbatches)
- , _nmulti(args._nmulti)
- , _act(args._act)
- , _maxthreads(args._maxthreads)
- , _nthreads(args._maxthreads)
- , _k_block(get_k_block_size(args))
- // Work out the rounded size of M - needed for some buffers.
- , _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
- , _Mround ( _Mround_div * strategy::out_height() )
-
- , _Nround_div ( iceildiv(_Nsize, strategy::out_width()) )
- , _Nround ( _Nround_div * strategy::out_width() )
- {
- assert(_maxthreads > 0);
-
- const unsigned int L2_size = _ci->get_L2_cache_size();
-
- if (args._cfg && args._cfg->outer_block_size) {
- _x_block = args._cfg->outer_block_size;
- } else {
- // x_block: Work out how many rows (of length k_block) will fit in the L2
- // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
- _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
- (sizeof(Toi) * _k_block);
-
- // Needs to be (at least a single) multiple of the kernel output width.
- _x_block /= strategy::out_width();
- _x_block = std::max(_x_block, 1U) * strategy::out_width();
-
- // And tune to the presented problem size.
- unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
- _x_block = iceildiv(_Nsize, num_x_blocks);
-
- _x_block = iceildiv(_x_block, strategy::out_width());
- _x_block *= strategy::out_width();
- }
- }
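The L2-based x_block computation in the constructor, traced on assumed
numbers (512 KiB L2, k_block = 1000, 1-byte operands, 12x8 kernel, N = 4096):

    #include <algorithm>
    #include <cassert>

    static unsigned iceildiv(unsigned a, unsigned b) { return (a + b - 1) / b; }

    int main() {
        const unsigned L2_size = 512 * 1024, k_block = 1000, sizeof_Toi = 1;
        const unsigned out_width = 12, out_height = 8, Nsize = 4096;

        // 90% of the L2, minus one panel each of A and B, in rows of length k_block:
        unsigned x_block = (((L2_size * 9) / 10) - k_block * sizeof_Toi * (out_width + out_height))
                           / (sizeof_Toi * k_block);             // (471859 - 20000) / 1000 = 451
        x_block = std::max(x_block / out_width, 1u) * out_width; // 444, a multiple of out_width
        // Tune to the problem size, then round back up to out_width:
        const unsigned num_x_blocks = iceildiv(Nsize, x_block);  // 10
        x_block = iceildiv(Nsize, num_x_blocks);                 // 410
        x_block = iceildiv(x_block, out_width) * out_width;      // 420
        assert(x_block == 420);
        return 0;
    }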
-
- // Interface implementation - Compulsory functions
- ndrange_t get_window_size() const override {
- unsigned m = (_Mround / strategy::out_height()) * _nbatches;
- unsigned n = _Nround_div;
-
- return { m, n };
- }
-
- bool supports_dynamic_scheduling() const override {
- return true;
- }
-
- // set_nthreads: pass on to the buffer manager to avoid it waiting for non-existent threads.
- void set_nthreads(int nthreads) override {
- _nthreads = std::min(nthreads, _maxthreads);
- }
-
- void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
- /* This particular GEMM implementation can only be broken up over the M and N
- * dimensions; we inform the framework of this limitation via the
- * get_window_size function.
- */
- const auto m_start = work_range.get_position(0);
- const auto n_start = work_range.get_position(1);
- const auto m_size = work_range.get_size(0);
- const auto n_size = work_range.get_size(1);
- const auto m_end = m_start + m_size;
- const auto n_end = n_start + n_size;
-
- const auto m_threadid = thread_locator.get_position(0);
- const auto n_threadid = thread_locator.get_position(1);
-
- execute_pretranspose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid);
- }
-
- std::size_t get_working_size() const override {
- /* Because we do not know how the scheduler will break up
- * the task, we must allocate enough space to handle both the case
- * where every thread is parallelised across B and the case where
- * every thread is parallelised across A.
- *
- * Either way that means one A working buffer and one C working
- * buffer per thread, plus an allowance for cache line alignment.
- */
- return get_c_working_size() * _maxthreads
- + get_a_working_size() * _maxthreads
- + 64; // to account for cache line alignment
- }
-
-
- void set_working_space(void *working_space) override {
- // Make sure everything ends up cache line aligned
- int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
- intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);
-
- size_t diff=0;
-
- if (working_space_int & 0x3F) {
- diff = 0x40 - (working_space_int & 0x3F);
- }
-
- working_space_bytes += diff;
-
- _working_space = reinterpret_cast<void *>(working_space_bytes);
- }
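The alignment fix-up above, checked on a sample (hypothetical) pointer value;
the extra 64 bytes added in get_working_size() pay for this worst-case shift:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    int main() {
        const std::intptr_t p = 0x1008;           // not 64-byte aligned
        const std::size_t diff = (p & 0x3F) ? 0x40 - (p & 0x3F) : 0;
        assert(diff == 0x38);
        assert(((p + diff) & 0x3F) == 0);         // now on a cache-line boundary
        return 0;
    }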
-
- // Interface implementation - pretransposed
- bool B_is_pretransposed() const override {
- return true;
- }
-
- bool B_pretranspose_required() const override {
- return _B_transposed==nullptr;
- }
-
- // TODO: this could almost certainly be considerably simpler.
- size_t get_B_pretransposed_array_size() const override {
- size_t total=0;
- blockwalker current(*this);
-
- do {
- /* Figure out the size of each block. */
- unsigned int x_size = (current.xmax() - current.x0());
- unsigned int k_size = (current.kmax() - current.k0());
-
- /* Round sizes up as needed. */
- x_size = iceildiv(x_size, strategy::out_width());
- x_size *= strategy::out_width();
-
- k_size = iceildiv(k_size, strategy::k_unroll());
- k_size *= strategy::k_unroll();
-
- total += x_size * k_size * sizeof(Toi);
- } while (current.advance());
-
- return total;
- }
-
- void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
- blockwalker current(*this);
- Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
- _B_transposed = buffer;
- strategy strat(_ci);
-
- do {
- /* Figure out the size of each block. */
- unsigned int x_size = (current.xmax() - current.x0());
- unsigned int k_size = (current.kmax() - current.k0());
-
- /* Round sizes up as needed. */
- x_size = iceildiv(x_size, strategy::out_width());
- x_size *= strategy::out_width();
-
- k_size = iceildiv(k_size, strategy::k_unroll());
- k_size *= strategy::k_unroll();
-
- strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
- current.x0(), current.xmax(), current.k0(), current.kmax());
-
- buffer += (x_size * k_size);
- } while (current.advance());
- }
-
- void set_pretransposed_B_data(void *in_buffer) override {
- _B_transposed = reinterpret_cast<Toi *>(in_buffer);
- }
-
- // Estimate cycles for given problem given provided parameters
- static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
- unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
- unsigned int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;
- unsigned int n_blocks = iceildiv(args._Nsize, strategy::out_width());
-
- uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
- uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
- uint64_t merge_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
-
- // Wide problems incur extra preparation cost, as it is done per thread.
- // Duplicate the logic the scheduler will later use to figure out how much that will affect us
- float ratio = m_blocks / static_cast<float>(n_blocks);
-
- unsigned int ideal_height = static_cast<unsigned int>(std::sqrt(args._maxthreads * ratio) + 0.5);
- unsigned int height = 1;
-
- if (ideal_height == 0) {
- height = 1;
- } else {
- for (unsigned int adj=0; adj<ideal_height; adj++) {
- const unsigned int round_down = ideal_height - adj;
- if (args._maxthreads % round_down == 0) {
- height = round_down;
- break;
- }
-
- const unsigned int round_up = ideal_height + adj;
- if (args._maxthreads % round_up == 0) {
- height = round_up;
- break;
- }
- }
- }
-
- // We've computed the height here - we need to multiply the amount of preparation effort by the width (which is total threads / height)
- prepare_bytes *= (args._maxthreads / height);
-
- float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
- float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
- float merge_cycles = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;
-
- float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
-
- // We can't thread over multis, which might be a problem in some
- // threaded cases. Penalize that here.
- float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches * iceildiv(args._Nsize, strategy::out_width())) * 0.9;
-
- if (parallelism_available < args._maxthreads) {
- total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
- }
-
- return static_cast<uint64_t>(total_cycles);
- }
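The divisor search above picks the rectangle height closest to the ideal that
exactly divides the thread count. On assumed numbers (maxthreads = 8,
ideal_height = 3):

    #include <cassert>

    int main() {
        const unsigned maxthreads = 8, ideal_height = 3;
        unsigned height = 1;
        for (unsigned adj = 0; adj < ideal_height; adj++) {
            if (maxthreads % (ideal_height - adj) == 0) { height = ideal_height - adj; break; }
            if (maxthreads % (ideal_height + adj) == 0) { height = ideal_height + adj; break; }
        }
        // adj=0 rejects 3 (8 % 3 != 0); adj=1 accepts 2, so the 8 threads
        // end up arranged as a 2 (M) x 4 (N) grid.
        assert(height == 2);
        return 0;
    }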
-};
-
-} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
index 094b6fdff4..321c97262f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,23 +29,38 @@
#include "kernels/a64_gemm_s8_4x4.hpp"
#include "kernels/a64_gemm_s8_8x12.hpp"
#include "kernels/a64_hybrid_s8qa_dot_4x16.hpp"
+#include "kernels/a64_hybrid_s8qa_mmla_4x16.hpp"
#include "kernels/a64_hybrid_s8qs_dot_6x16.hpp"
+#include "kernels/a64_hybrid_s8qs_mmla_6x16.hpp"
#include "kernels/a64_hybrid_s8s32_dot_6x16.hpp"
+#include "kernels/a64_hybrid_s8s32_mmla_6x16.hpp"
#include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp"
#include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp"
#include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp"
-#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp"
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+#include "kernels/sme2_gemv_s8qa_dot_16VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SME2
+
#include "kernels/sve_hybrid_s8qa_dot_4x4VL.hpp"
+#include "kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp"
#include "kernels/sve_hybrid_s8qs_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp"
+#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp"
#include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp"
#include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp"
-#include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SVE
#include "gemm_hybrid_indirect.hpp"
#include "gemm_hybrid_quantized.hpp"
#include "gemm_hybrid_quantized_inline.hpp"
#include "gemm_interleaved.hpp"
+#include "gemv_pretransposed.hpp"
#include "quantize_wrapper.hpp"
#include "utils.hpp"
@@ -53,75 +68,136 @@ namespace arm_gemm {
static const GemmImplementation<int8_t, int8_t, Requantize32> gemm_qint8_methods[] =
{
-#ifdef __ARM_FEATURE_SVE
-#ifdef MMLA_INT8
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+{
+ GemmMethod::GEMV_PRETRANSPOSED,
+ "sme2_gemv_s8qa_dot_16VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && quant_hybrid_asymmetric(qp) && args._Msize == 1 && !args._indirect_input && args._nbatches == 1; },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemvPretransposed<cls_sme2_gemv_s8qa_dot_16VL, int8_t, int8_t, Requantize32>(args, qp); }
+},
{
GemmMethod::GEMM_INTERLEAVED,
- "sve_interleaved_s8s32_mmla_8x3VL",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>8); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t>(args, qp); }
+ "sme2_interleaved_nomerge_s8q_mopa_1VLx4VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+ [](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length<int32_t>();
+ return args._Nsize >= 8*VL || args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL, int8_t, int8_t>(args, qp); }
},
-#endif
{
- GemmMethod::GEMM_HYBRID_QUANTIZED,
- "sve_smallK_hybrid_s8s32_dot_8x1VL",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_sve_smallK_hybrid_s8s32_dot_8x1VL, int8_t, int8_t>(args, qp); }
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_s8q_mopa_4VLx1VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+ [](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length<int32_t>();
+ return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL, int8_t, int8_t>(args, qp); }
},
-#ifdef SVE2
{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_s8q_mopa_2VLx2VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL, int8_t, int8_t>(args, qp); }
+},
+#endif // ARM_COMPUTE_ENABLE_SME2
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_s8qa_mmla_4x4VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp) && args._ci->has_sve2() && args._ci->has_svei8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qa_mmla_4x4VL, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qa_mmla_4x4VL, int8_t, int8_t, Requantize32>(args, qp); }
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_s8qs_mmla_6x4VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_symmetric(qp) && args._ci->has_sve2() && args._ci->has_svei8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qs_mmla_6x4VL, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qs_mmla_6x4VL, int8_t, int8_t, Requantize32>(args, qp); }
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_s8s32_mmla_8x3VL",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm() && (args._Ksize>8); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t>(args, qp); }
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_hybrid_s8s32_mmla_6x4VL",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int8_t, Requantize32, true>(args, qp); }
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8qs_dot_6x4VL",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve() && quant_hybrid_symmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_symmetric(qp); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qs_dot_6x4VL, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qs_dot_6x4VL, int8_t, int8_t, Requantize32>(args, qp); }
-},
-{
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8qa_dot_4x4VL",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_asymmetric(qp); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qa_dot_4x4VL, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qa_dot_4x4VL, int8_t, int8_t, Requantize32>(args, qp); }
-},
-#endif
-{
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8s32_dot_6x4VL",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, Requantize32, true>(args, qp); }
-},
-{
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_s8s32_dot_8x3VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t>(args, qp); }
-},
-#endif // SVE
-#ifdef MMLA_INT8
-{
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_s8qa_mmla_4x16",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_i8mm() && quant_hybrid_asymmetric(qp); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qa_mmla_4x16, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qa_mmla_4x16, int8_t, int8_t, Requantize32>(args, qp); }
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_s8qs_mmla_6x16",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_i8mm() && quant_hybrid_symmetric(qp); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qs_mmla_6x16, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qs_mmla_6x16, int8_t, int8_t, Requantize32>(args, qp); }
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_s8s32_mmla_8x12",
- [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); },
- nullptr,
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm() && (args._Ksize>8); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t>(args, qp); }
-},
-#endif
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_hybrid_s8s32_mmla_6x16",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int8_t, Requantize32, true>(args, qp); }
+),
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
"a64_smallK_hybrid_s8s32_dot_8x4",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; },
- nullptr,
+ [](const GemmArgs &args, const Requantize32 &) { return !(args._ci->has_svei8mm() || args._ci->has_i8mm()); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_s8s32_dot_8x4, int8_t, int8_t>(args, qp); }
},
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
"a64_smallK_hybrid_s8s32_dot_6x4",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; },
- nullptr,
+ [](const GemmArgs &args, const Requantize32 &) { return !(args._ci->has_svei8mm() || args._ci->has_i8mm()); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_s8s32_dot_6x4, int8_t, int8_t>(args, qp); }
},
{
@@ -135,42 +211,42 @@ GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8qs_dot_6x16",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_symmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qs_dot_6x16, int8_t, int8_t, Requantize32>::estimate_cycles(args, cls_a64_hybrid_s8qs_dot_6x16::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qs_dot_6x16, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qs_dot_6x16, int8_t, int8_t, Requantize32>(args, qp); }
),
GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8qa_dot_4x16",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qa_dot_4x16, int8_t, int8_t, Requantize32>::estimate_cycles(args, cls_a64_hybrid_s8qa_dot_4x16::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qa_dot_4x16, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qa_dot_4x16, int8_t, int8_t, Requantize32>(args, qp); }
),
GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8s32_dot_6x16",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, Requantize32, true>::estimate_cycles(args, cls_a64_hybrid_s8s32_dot_6x16::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, Requantize32, true>(args, qp); }
),
GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_8x12",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_s8_8x12, int8_t, int8_t>::estimate_cycles(args, cls_a64_gemm_s8_8x12::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_s8_8x12, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_8x12, int8_t, int8_t>(args, qp); }
),
-{
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_4x4",
nullptr,
- [](const GemmArgs &args, const Requantize32 &) { return !args._ci->has_dotprod(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_s8_4x4, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_4x4, int8_t, int8_t>(args, qp); }
-},
+),
{
GemmMethod::QUANTIZE_WRAPPER,
"quantized_wrapper",
[](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; },
- [](const GemmArgs &args, const Requantize32 &) { return !args._ci->has_dotprod(); },
+ [](const GemmArgs &, const Requantize32 &) { return false; },
[](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<int8_t, int8_t, int32_t>(args, qp); }
},
{
@@ -188,6 +264,7 @@ const GemmImplementation<int8_t, int8_t, Requantize32> *gemm_implementation_list
}
template UniqueGemmCommon<int8_t, int8_t> gemm<int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
+template bool has_opt_gemm<int8_t, int8_t, Requantize32>(WeightFormat &weight_format, const GemmArgs &args, const Requantize32 &os);
template KernelDescription get_gemm_method<int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
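Each GemmImplementation entry in the tables above pairs a method tag and a
kernel name with three lambdas: a support predicate, a cycle estimate (or a
plain recommendation predicate), and a factory. A minimal sketch of how such
a list can be searched (an assumed model of the selection logic, not the
library's exact algorithm):

    #include <cstdint>
    #include <functional>
    #include <string>
    #include <vector>

    struct Args { };  // stand-in for arm_gemm::GemmArgs

    struct Impl {
        std::string name;
        std::function<bool(const Args &)> is_supported;            // null means "always"
        std::function<std::uint64_t(const Args &)> cycle_estimate; // null means "no estimate"
    };

    const Impl *pick(const std::vector<Impl> &list, const Args &args) {
        const Impl *best = nullptr;
        std::uint64_t best_cycles = UINT64_MAX;
        for (const auto &impl : list) {
            if (impl.is_supported && !impl.is_supported(args))
                continue;
            // First supported entry wins by default; later entries only
            // displace it with a strictly lower cycle estimate.
            const std::uint64_t c = impl.cycle_estimate ? impl.cycle_estimate(args) : UINT64_MAX;
            if (best == nullptr || c < best_cycles) {
                best = &impl;
                best_cycles = c;
            }
        }
        return best;  // the caller then invokes the winner's factory lambda
    }

    int main() {
        const std::vector<Impl> list = {
            { "fancy_kernel",    [](const Args &) { return false; }, nullptr },
            { "fallback_kernel", nullptr,                            nullptr },
        };
        return pick(list, Args{})->name == "fallback_kernel" ? 0 : 1;
    }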
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
index be27b3a117..93eecf991e 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,89 +29,150 @@
#include "kernels/a64_gemm_u8_4x4.hpp"
#include "kernels/a64_gemm_u8_8x12.hpp"
#include "kernels/a64_hybrid_u8qa_dot_4x16.hpp"
+#include "kernels/a64_hybrid_u8qa_mmla_4x16.hpp"
#include "kernels/a64_hybrid_u8u32_dot_6x16.hpp"
+#include "kernels/a64_hybrid_u8u32_mmla_6x16.hpp"
#include "kernels/a64_interleaved_u8u32_mmla_8x12.hpp"
#include "kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp"
#include "kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp"
-#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp"
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+#include "kernels/sme2_gemv_u8qa_dot_16VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SME2
+
#include "kernels/sve_hybrid_u8qa_dot_4x4VL.hpp"
+#include "kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp"
+#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp"
#include "kernels/sve_interleaved_u8u32_dot_8x3VL.hpp"
#include "kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp"
-#include "kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SVE
#include "gemm_hybrid_indirect.hpp"
#include "gemm_hybrid_quantized.hpp"
#include "gemm_hybrid_quantized_inline.hpp"
#include "gemm_interleaved.hpp"
+#include "gemv_pretransposed.hpp"
#include "quantize_wrapper.hpp"
namespace arm_gemm {
static const GemmImplementation<uint8_t, uint8_t, Requantize32> gemm_quint8_methods[] =
{
-#ifdef __ARM_FEATURE_SVE
-#ifdef MMLA_INT8
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+// SME kernels
+{
+ GemmMethod::GEMV_PRETRANSPOSED,
+ "sme2_gemv_u8qa_dot_16VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && quant_hybrid_asymmetric(qp) && args._Msize == 1 && !args._indirect_input && args._nbatches == 1; },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemvPretransposed<cls_sme2_gemv_u8qa_dot_16VL, uint8_t, uint8_t, Requantize32>(args, qp); }
+},
{
GemmMethod::GEMM_INTERLEAVED,
- "sve_interleaved_u8u32_mmla_8x3VL",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>8); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t>(args, qp); }
+ "sme2_interleaved_nomerge_u8q_mopa_1VLx4VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+ [](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length<uint32_t>();
+ return args._Nsize >= 8*VL || args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL, uint8_t, uint8_t>(args, qp); }
},
-#endif
{
- GemmMethod::GEMM_HYBRID_QUANTIZED,
- "sve_smallK_hybrid_u8u32_dot_8x1VL",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_sve_smallK_hybrid_u8u32_dot_8x1VL, uint8_t, uint8_t>(args, qp); }
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_u8q_mopa_4VLx1VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+ [](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length<int32_t>();
+ return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL, uint8_t, uint8_t>(args, qp); }
},
-#ifdef SVE2 // Requantizing kernels include some SVE2 only instructions (SQRDMULH, SRSHL)
{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_u8q_mopa_2VLx2VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL, uint8_t, uint8_t>(args, qp); }
+},
+#endif // ARM_COMPUTE_ENABLE_SME2
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_u8qa_mmla_4x4VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp) && args._ci->has_sve2() && args._ci->has_svei8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8qa_mmla_4x4VL, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8qa_mmla_4x4VL, uint8_t, uint8_t, Requantize32>(args, qp); }
+),
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_u8u32_mmla_8x3VL",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm() && (args._Ksize>8); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t>(args, qp); }
+),
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_hybrid_u8u32_mmla_6x4VL",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint8_t, Requantize32, true>(args, qp); }
+),
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_u8qa_dot_4x4VL",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_asymmetric(qp); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8qa_dot_4x4VL, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8qa_dot_4x4VL, uint8_t, uint8_t, Requantize32>(args, qp); }
-},
-#endif
-{
+),
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_u8u32_dot_6x4VL",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint8_t, Requantize32, true>(args, qp); }
-},
-{
+),
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_u8u32_dot_8x3VL",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t>(args, qp); }
-},
-#endif
-#ifdef MMLA_INT8
-{
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_u8qa_mmla_4x16",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_i8mm() && quant_hybrid_asymmetric(qp); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8qa_mmla_4x16, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8qa_mmla_4x16, uint8_t, uint8_t, Requantize32>(args, qp); }
+),
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_u8u32_mmla_8x12",
- [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm() && (args._Ksize>8); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t>(args, qp); }
-},
-#endif
+),
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_hybrid_u8u32_mmla_6x16",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint8_t, Requantize32, true>(args, qp); }
+),
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
"a64_smallK_hybrid_u8u32_dot_8x4",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; },
- nullptr,
+ [](const GemmArgs &args, const Requantize32 &) { return !(args._ci->has_svei8mm() || args._ci->has_i8mm()); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_u8u32_dot_8x4, uint8_t, uint8_t>(args, qp); }
},
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
"a64_smallK_hybrid_u8u32_dot_6x4",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; },
- nullptr,
+ [](const GemmArgs &args, const Requantize32 &) { return !(args._ci->has_svei8mm() || args._ci->has_i8mm()); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_u8u32_dot_6x4, uint8_t, uint8_t>(args, qp); }
},
{
@@ -125,35 +186,35 @@ GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_u8qa_dot_4x16",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8qa_dot_4x16, int8_t, int8_t, Requantize32>::estimate_cycles(args, cls_a64_hybrid_u8qa_dot_4x16::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8qa_dot_4x16, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8qa_dot_4x16, uint8_t, uint8_t, Requantize32>(args, qp); }
),
GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_u8u32_dot_6x16",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, int8_t, int8_t, Requantize32, true>::estimate_cycles(args, cls_a64_hybrid_u8u32_dot_6x16::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint8_t, Requantize32, true>(args, qp); }
),
GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_u8_8x12",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_u8_8x12, int8_t, int8_t>::estimate_cycles(args, cls_a64_gemm_u8_8x12::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_u8_8x12, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_8x12, uint8_t, uint8_t>(args, qp); }
),
-{
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_u8_4x4",
nullptr,
- [](const GemmArgs &args, const Requantize32 &) { return !args._ci->has_dotprod(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_u8_4x4, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_4x4, uint8_t, uint8_t>(args, qp); }
-},
+),
{
GemmMethod::QUANTIZE_WRAPPER,
"quantized_wrapper",
[](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; },
- [](const GemmArgs &args, const Requantize32 &) { return !args._ci->has_dotprod(); },
+ [](const GemmArgs &, const Requantize32 &) { return false; },
[](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<uint8_t, uint8_t, uint32_t>(args, qp); }
},
{
@@ -171,6 +232,7 @@ const GemmImplementation<uint8_t, uint8_t, Requantize32> *gemm_implementation_li
}
template UniqueGemmCommon<uint8_t, uint8_t> gemm<uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
+template bool has_opt_gemm<uint8_t, uint8_t, Requantize32>(WeightFormat &weight_format, const GemmArgs &args, const Requantize32 &os);
template KernelDescription get_gemm_method<uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp
new file mode 100644
index 0000000000..38d9b763f6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+
+#include "kernels/a64_gemm_s16_8x12.hpp"
+#include "kernels/a64_gemm_s8_8x12.hpp"
+#include "kernels/a64_gemm_s8_4x4.hpp"
+#include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp"
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+#include "kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL.hpp"
+#include "kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SME2
+#include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp"
+#include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SVE
+
+#include "gemm_implementation.hpp"
+#include "gemm_interleaved.hpp"
+#include "utils.hpp"
+
+#include <cstdint>
+#include <vector>
+namespace arm_gemm {
+
+static const GemmImplementation<int8_t, float, DequantizeFloat> gemm_s8fp32_methods[] =
+{
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL.hpp",
+ [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2() && !args._accumulate; },
+ [](const GemmArgs &args, const DequantizeFloat &) { const auto VL = sme::get_vector_length<float>();
+ return args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
+ [](const GemmArgs &args, const DequantizeFloat &dq) { return new GemmInterleavedNoMergeDequantized<cls_sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL, int8_t, float>(args, dq); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_s8qfp32_mopa_4Vx1VL.hpp",
+ [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2() && !args._accumulate; },
+ [](const GemmArgs &args, const DequantizeFloat &) { const auto VL = sme::get_vector_length<float>();
+ return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
+ [](const GemmArgs &args, const DequantizeFloat &dq) { return new GemmInterleavedNoMergeDequantized<cls_sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL, int8_t, float>(args, dq); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sme2_interleaved_nomerge_s8qfp32_mopa_2Vx2VL.hpp",
+ [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2() && !args._accumulate; },
+ nullptr,
+ [](const GemmArgs &args, const DequantizeFloat &dq) { return new GemmInterleavedNoMergeDequantized<cls_sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL, int8_t, float>(args, dq); }
+},
+#endif // ARM_COMPUTE_ENABLE_SME2
+GemmImplementation<int8_t, float, DequantizeFloat>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_s8s32_mmla_8x3VL",
+ [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_svei8mm(); },
+ [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, float>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, float>(args, qp); }
+),
+GemmImplementation<int8_t, float, DequantizeFloat>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_s8s32_dot_8x3VL",
+ [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sve(); },
+ [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, float>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, float>(args, qp); }
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<int8_t, float, DequantizeFloat>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_interleaved_s8s32_mmla_8x12",
+ [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_i8mm(); },
+ [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, float>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, float>(args, qp); }
+),
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_gemm_s16_8x12",
+ nullptr,
+ [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->get_cpu_model() == CPUModel::A53 && ((args._Msize > 28) || ((args._Msize % 8) > 4)); },
+ [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_a64_gemm_s16_8x12, int8_t, float>(args, qp); }
+},
+GemmImplementation<int8_t, float, DequantizeFloat>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_gemm_s8_8x12",
+ [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_dotprod(); },
+ [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_a64_gemm_s8_8x12, int8_t, float>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_a64_gemm_s8_8x12, int8_t, float>(args, qp); }
+),
+GemmImplementation<int8_t, float, DequantizeFloat>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_gemm_s8_4x4",
+ nullptr,
+ [](const GemmArgs &args, const DequantizeFloat &) { return GemmInterleavedDequantized<cls_a64_gemm_s8_4x4, int8_t, float>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const DequantizeFloat &qp) { return new GemmInterleavedDequantized<cls_a64_gemm_s8_4x4, int8_t, float>(args, qp); }
+),
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr
+}
+};
+
+template<>
+const GemmImplementation<int8_t, float, DequantizeFloat> *gemm_implementation_list<int8_t, float, DequantizeFloat>() {
+ return gemm_s8fp32_methods;
+}
+
+template UniqueGemmCommon<int8_t, float> gemm<int8_t, float, DequantizeFloat>(const GemmArgs &args, const DequantizeFloat &os);
+template KernelDescription get_gemm_method<int8_t, float, DequantizeFloat>(const GemmArgs &args, const DequantizeFloat &os);
+template std::vector<KernelDescription> get_compatible_kernels<int8_t, float, DequantizeFloat>(const GemmArgs &args, const DequantizeFloat &os);
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
\ No newline at end of file
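For readers following the table above: each GemmImplementation<int8_t, float, DequantizeFloat> entry carries a method enum, a kernel name, a support predicate, either a second predicate or a cycle estimator (the with_estimate() rows), and a factory lambda. The sketch below shows one plausible way a selector consumes such a list; Entry and pick_impl are illustrative stand-ins, not the arm_gemm API.

#include <cstdint>
#include <functional>
#include <vector>

struct Entry {
    const char *name;
    std::function<bool()>     is_supported;   // empty => always supported
    std::function<uint64_t()> cycle_estimate; // empty => no ranking info
};

// One plausible policy: among supported entries, prefer the lowest cycle
// estimate; an entry without an estimate is taken in list order.
static const Entry *pick_impl(const std::vector<Entry> &list) {
    const Entry *best = nullptr;
    uint64_t best_cycles = UINT64_MAX;
    for (const Entry &e : list) {
        if (e.is_supported && !e.is_supported()) {
            continue;
        }
        if (!e.cycle_estimate) {
            return best ? best : &e;  // unranked entry ends the ranked run
        }
        uint64_t c = e.cycle_estimate();
        if (c < best_cycles) {
            best        = &e;
            best_cycles = c;
        }
    }
    return best;
}

Under a policy like this, the MMLA rows win on cores with the relevant extension because their estimates come out lowest, while plain braced rows such as a64_gemm_s16_8x12 act as targeted fallbacks gated by their predicates alone.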
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index 10a35e7a11..25b6cf0cf2 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,6 +56,7 @@ const GemmImplementation<uint16_t, uint32_t> *gemm_implementation_list<uint16_t,
/* Explicitly instantiate the external functions for these types. */
template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<uint16_t, uint32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
template KernelDescription get_gemm_method<uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
template std::vector<KernelDescription> get_compatible_kernels<uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
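The added "template bool has_opt_gemm<...>" line is an explicit instantiation: the function template is defined once in a common translation unit, and each supported type triple is stamped out here so other objects can link against it without seeing the definition. A generic illustration of the pattern, with toy names:

// header: declaration only, visible to all callers
template <typename T> bool has_opt_gemm_like(T arg);

// one .cpp file: definition plus explicit instantiation
template <typename T> bool has_opt_gemm_like(T arg) {
    return arg > T{};
}
template bool has_opt_gemm_like<int>(int);  // object code emitted here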
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index 4de3d2b18a..dfacb687a8 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,71 +34,75 @@
#include "kernels/a64_gemm_u8_4x4.hpp"
#include "kernels/a64_gemm_u8_8x12.hpp"
#include "kernels/a64_hybrid_u8u32_dot_6x16.hpp"
+#include "kernels/a64_hybrid_u8u32_mmla_6x16.hpp"
#include "kernels/a64_interleaved_u8u32_mmla_8x12.hpp"
#include "kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp"
#include "kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp"
#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp"
#include "kernels/sve_interleaved_u8u32_dot_8x3VL.hpp"
#include "kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp"
-#include "kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp"
namespace arm_gemm {
static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
-#ifdef __ARM_FEATURE_SVE
-#ifdef MMLA_INT8
-{
+#ifdef ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_u8u32_mmla_6x4VL",
+ [](const GemmArgs &args) { return args._ci->has_svei8mm(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint32_t>(args); }
+),
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_u8u32_mmla_8x3VL",
- [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>8); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return args._ci->has_svei8mm() && (args._Ksize>8); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint32_t>(args); }
-},
-#endif
-{
- GemmMethod::GEMM_HYBRID,
- "smallK_hybrid_u8u32_dot_8x1VL",
- [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_u8u32_dot_8x1VL, uint8_t, uint32_t>(args); }
-},
-{
+),
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_u8u32_dot_6x4VL",
[](const GemmArgs &args) { return args._ci->has_sve(); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint32_t>(args); }
-},
-{
+),
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_u8u32_dot_8x3VL",
[](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint32_t>(args); }
-},
-#endif
-#ifdef MMLA_INT8
-{
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_u8u32_mmla_8x12",
- [](const GemmArgs &args) { return (args._Ksize>8); },
- nullptr,
+ [](const GemmArgs &args) { return args._ci->has_i8mm() && (args._Ksize>8); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint32_t>(args); }
-},
-#endif
+),
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_u8u32_mmla_6x16",
+ [](const GemmArgs &args) { return args._ci->has_i8mm(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint32_t>(args); }
+),
{
GemmMethod::GEMM_HYBRID,
"a64_smallK_hybrid_u8u32_dot_8x4",
- [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; },
- nullptr,
+ [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input && !args._accumulate; },
+ [](const GemmArgs &args) { return !(args._ci->has_svei8mm() || args._ci->has_i8mm()); },
[](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_u8u32_dot_8x4, uint8_t, uint32_t>(args); }
},
{
GemmMethod::GEMM_HYBRID,
"a64_smallK_hybrid_u8u32_dot_6x4",
- [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; },
- nullptr,
+ [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input && !args._accumulate; },
+ [](const GemmArgs &args) { return !(args._ci->has_svei8mm() || args._ci->has_i8mm()); },
[](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_u8u32_dot_6x4, uint8_t, uint32_t>(args); }
},
{
@@ -108,27 +112,27 @@ static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
[](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4; },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u16_8x12, uint8_t, uint32_t>(args); },
},
-{
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_u8u32_dot_6x16",
[](const GemmArgs &args) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint32_t>(args); }
-},
-{
+),
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_u8_8x12",
[](const GemmArgs &args) { return args._ci->has_dotprod(); },
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_u8_8x12, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_8x12, uint8_t, uint32_t>(args); }
-},
-{
+),
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_u8_4x4",
nullptr,
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_u8_4x4, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_4x4, uint8_t, uint32_t>(args); }
-},
+),
{
GemmMethod::DEFAULT,
"",
@@ -145,6 +149,7 @@ const GemmImplementation<uint8_t, uint32_t> *gemm_implementation_list<uint8_t, u
/* Explicitly instantiate the external functions for these types. */
template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
+template bool has_opt_gemm<uint8_t, uint32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
template KernelDescription get_gemm_method<uint8_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint32_t, Nothing> (const GemmArgs &args, const Nothing &);
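A theme in this file's hunk is replacing compile-time gates (#ifdef MMLA_INT8) with runtime predicates such as args._ci->has_i8mm(), so one binary can ship MMLA kernels and select them only on cores implementing FEAT_I8MM. On Linux/AArch64 a check like that ultimately rests on the HWCAP auxiliary vector; a minimal sketch (not the library's actual CPUInfo code):

#include <sys/auxv.h>
#if defined(__aarch64__)
#include <asm/hwcap.h>
#endif

// Returns true when the running core advertises FEAT_I8MM.
static bool cpu_has_i8mm() {
#if defined(__aarch64__) && defined(HWCAP2_I8MM)
    return (getauxval(AT_HWCAP2) & HWCAP2_I8MM) != 0;
#else
    return false;
#endif
}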
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index 12216009d2..ad504f2664 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,13 +88,25 @@ public:
return _subgemm->get_B_pretransposed_array_size();
}
- void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
- _subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride);
+ void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override {
+ _subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride, transposed);
}
void set_pretransposed_B_data(void *buffer) override {
_subgemm->set_pretransposed_B_data(buffer);
}
+
+ GemmConfig get_config() override {
+ GemmConfig c = _subgemm->get_config();
+
+ std::string new_filter = "gemv_batched[";
+ new_filter.append(c.filter);
+ new_filter.append("]");
+
+ c.filter = new_filter;
+
+ return c;
+ }
};
} // namespace arm_gemm
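The get_config() override added here decorates rather than replaces the sub-GEMM's report: if the inner kernel's filter string were, say, inner_kernel_name, the batched wrapper would report gemv_batched[inner_kernel_name]. That keeps a single filter namespace usable for both the wrapper and the kernel it delegates to.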
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index 9de44fcb73..dbada36052 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,12 +36,55 @@
namespace arm_gemm {
+namespace {
+
+template<typename OutputStage>
+class run_gemv_kernel {
+public:
+ template<typename strategy, typename Tlo, typename Tro, typename Tr>
+ static void run (
+ const strategy &strat,
+ const Tlo *A_ptr, const Tro *B_ptr, Tr *c_ptr,
+ size_t N, size_t K,
+ const Tr *bias, const Activation &act, bool Accumulate,
+ const OutputStage &os, const int32_t *col_bias, unsigned int col_base
+ );
+};
+
+template<>
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+void run_gemv_kernel<Nothing>::run(
+ const strategy &strat,
+ const Tlo *A_ptr, const Tro *B_ptr, Tr *C_ptr,
+ size_t N, size_t K,
+ const Tr *bias, const Activation &act, bool Accumulate,
+ const Nothing &, const int32_t *, unsigned int
+ ) {
+
+ strat.kernel(A_ptr, B_ptr, C_ptr, N, K, bias, act, Accumulate);
+}
+
+template<>
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+void run_gemv_kernel<Requantize32>::run(
+ const strategy &strat,
+ const Tlo *A_ptr, const Tro *B_ptr, Tr *C_ptr,
+ size_t N, size_t K,
+ const Tr *, const Activation &, bool,
+ const Requantize32 &qp, const int32_t *col_bias, unsigned int col_base
+ ) {
+
+ strat.kernel(A_ptr, B_ptr, C_ptr, N, K, &qp, col_bias + col_base, col_base);
+}
+
+} // anonymous namespace
+
// Implementation of the GemmCommon abstract class.
//
// This implementation is for GEMV with pretransposition.
//
// Batches are not supported, as a batched GEMV makes no sense (it can be converted to a GEMM).
-template<typename strategy, typename To, typename Tr>
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
class GemvPretransposed : public GemmCommon<To, Tr> {
typedef typename strategy::operand_type Toi;
typedef typename strategy::result_type Tri;
@@ -55,13 +98,28 @@ class GemvPretransposed : public GemmCommon<To, Tr> {
const Toi *_B_pretransposed = nullptr;
+ OutputStage _os;
+
+ // Pointer to the column sums (for quantized cases)
+ int32_t *col_bias = nullptr;
+
+ // Get size of the column sums
+ unsigned int get_col_sum_size() const {
+ if(std::is_same<OutputStage, Requantize32>::value) {
+ return _args._Nsize * _args._nmulti * sizeof(int32_t);
+ } else {
+ return 0;
+ }
+ }
+
public:
GemvPretransposed(GemvPretransposed &) = delete;
GemvPretransposed & operator= (GemvPretransposed &) = delete;
- GemvPretransposed(const GemmArgs &args)
+ GemvPretransposed(const GemmArgs &args, const OutputStage &os = {})
: _args(args),
- _buffer_per_multi(args._Ksize * roundup(args._Nsize, strategy::out_width())) {
+ _buffer_per_multi(roundup(args._Ksize, strategy::k_unroll()) * roundup(args._Nsize, strategy::out_width())),
+ _os(os) {
/* For now don't do any blocking. TODO: figure out if we should. */
if (strategy::supports_accumulate() && args._cfg && args._cfg->inner_block_size) {
k_block = args._cfg->inner_block_size;
@@ -117,12 +175,13 @@ public:
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (kmax-k0) * (nmax-n));
#endif
- strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + k0,
+ run_gemv_kernel<OutputStage>::run(strat, this->_Aptr + (multi * this->_A_multi_stride) + k0,
_B_pretransposed + (multi * _buffer_per_multi) + (n * roundup(_args._Ksize, strategy::k_unroll())) + (k0 * strategy::out_width()),
this->_Cptr + (multi * this->_C_multi_stride) + n,
(nmax - n), (kmax-k0),
this->_bias ? this->_bias + (multi * this->_bias_multi_stride) + n : nullptr,
- _args._act, (k0 != 0));
+ _args._act, (k0 != 0) || _args._accumulate,
+ _os, col_bias, n + (_args._Nsize * multi));
}
}
}
@@ -139,15 +198,45 @@ public:
}
size_t get_B_pretransposed_array_size() const override {
- return _buffer_per_multi * _args._nmulti * sizeof(To);
+ return _buffer_per_multi * _args._nmulti * sizeof(To) + get_col_sum_size();
+ }
+
+ void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ // Column sums go on the front of the pretransposed buffer in requantized cases.
+ // We could skip this work when the column sums are not actually needed, but this code only runs at setup.
+ if (std::is_same<OutputStage, Requantize32>::value) {
+ col_bias = reinterpret_cast<int32_t *>(in_buffer);
+
+ Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);
+
+ for (unsigned int i=0; i<_args._nmulti; i++) {
+ compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize, B + (i * B_multi_stride), ldb, col_bias + (i * _args._Nsize), _args._Ksize, i, 0);
+ }
+ }
+ }
+
+ void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
+ if (std::is_same<OutputStage, Requantize32>::value) {
+ Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);
+
+ qp->bias = bias;
+ qp->bias_multi_stride = bias_multi_stride;
+ }
}
- void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
- Toi *B_buffer = reinterpret_cast<Toi *>(buffer);
+ void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override {
+ assert(!transposed);
+
+ requantize_bias(buffer, B, ldb, B_multi_stride);
+
+ // The actual transposed buffer goes after the column sums (if any)
+ uintptr_t buffer_int = reinterpret_cast<uintptr_t>(buffer);
+ Toi *B_buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+
strategy strat(_args._ci);
for (unsigned int multi=0; multi<_args._nmulti; multi++) {
- strat.transforms.PrepareB(B_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _args._Nsize, 0, _args._Ksize);
+ strat.transforms.PrepareB(B_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _args._Nsize, 0, _args._Ksize, false);
}
_B_pretransposed = B_buffer;
@@ -156,6 +245,17 @@ public:
void set_pretransposed_B_data(void *buffer) override {
_B_pretransposed = reinterpret_cast<Toi *>(buffer);
}
+
+ GemmConfig get_config() override {
+ GemmConfig c;
+
+ c.method = GemmMethod::GEMV_PRETRANSPOSED;
+ c.inner_block_size = k_block;
+ c.outer_block_size = n_block;
+ c.filter = get_type_name<strategy>();
+
+ return c;
+ }
};
} // namespace arm_gemm
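The quantized GEMV path packs two regions into the single pretransposed-B allocation: per-column bias sums first, then the transposed weights, exactly as get_col_sum_size() and pretranspose_B_array() above imply. A simplified view of that layout, with hypothetical names:

#include <cstddef>
#include <cstdint>

// Layout of the buffer sized by get_B_pretransposed_array_size():
//   [ int32 col sums: Nsize * nmulti ][ packed B: buffer_per_multi * nmulti ]
struct PackedBView {
    int32_t *col_sums; // one per output column, per multi (quantized only)
    void    *packed_b; // transposed/interleaved weights
};

static PackedBView split_packed_b(void *buffer, size_t Nsize, size_t nmulti, bool quantized) {
    size_t    col_sum_bytes = quantized ? Nsize * nmulti * sizeof(int32_t) : 0;
    uintptr_t base          = reinterpret_cast<uintptr_t>(buffer);
    return { quantized ? reinterpret_cast<int32_t *>(buffer) : nullptr,
             reinterpret_cast<void *>(base + col_sum_bytes) };
}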
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
index 6a8caf6ce6..e4bfc0f6e4 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,44 +31,44 @@ void interleave_block<4, 16, VLType::None, false>(
)
{
__asm__ __volatile__(
- "ldr x22, [%x[in], #0x0]\n"
+ "ldr x23, [%x[in], #0x0]\n"
+ "ldr x22, [%x[in], #0x8]\n"
"cmp %x[height], #0x4\n"
- "ldr x21, [%x[in], #0x8]\n"
+ "add x23, x23, %x[row_offset]\n"
+ "ldr x21, [%x[in], #0x10]\n"
+ "ldr x20, [%x[in], #0x18]\n"
"add x22, x22, %x[row_offset]\n"
- "ldr x20, [%x[in], #0x10]\n"
- "ldr x19, [%x[in], #0x18]\n"
"add x21, x21, %x[row_offset]\n"
"add x20, x20, %x[row_offset]\n"
- "add x19, x19, %x[row_offset]\n"
"beq 1f\n"
- "mov x19, x22\n"
"cmp %x[height], #0x2\n"
- "csel x21, x21, x22, GE\n"
- "csel x20, x20, x22, GT\n"
+ "mov x20, x23\n"
+ "csel x22, x22, x23, GE\n"
+ "csel x21, x21, x23, GT\n"
"1:" // no_pointer_adj
- "prfm pldl1keep, [x22, #0x0]\n"
"cmp %x[width], #0x10\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
"prfm pldl1keep, [x20, #0x0]\n"
- "prfm pldl1keep, [x19, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
"prfm pldl1keep, [x20, #0x40]\n"
- "prfm pldl1keep, [x19, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q19, [x22], #0x10\n"
+ "ldr q19, [x23], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
"subs %x[width], %x[width], #0x10\n"
- "ldr q18, [x21], #0x10\n"
"cmp %x[width], #0x10\n"
- "ldr q17, [x20], #0x10\n"
- "ldr q16, [x19], #0x10\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
"prfm pldl1keep, [x22, #0x70]\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
"prfm pldl1keep, [x21, #0x70]\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "prfm pldl1keep, [x19, #0x70]\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "str q18, [%x[out_ptr], #0x10]\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
@@ -76,93 +76,93 @@ void interleave_block<4, 16, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 12f\n"
"tbz %x[width], #3, 7f\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d18, [x21], #0x8\n"
- "ldr d17, [x20], #0x8\n"
- "ldr d16, [x19], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d17, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
"tbz %x[width], #2, 5f\n"
- "ld1 { v19.s }[2], [x22], #0x4\n"
- "ld1 { v18.s }[2], [x21], #0x4\n"
- "ld1 { v17.s }[2], [x20], #0x4\n"
- "ld1 { v16.s }[2], [x19], #0x4\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n"
+ "ld1 { v18.s }[2], [x22], #0x4\n"
+ "ld1 { v17.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v19.h }[6], [x22], #0x2\n"
- "ld1 { v18.h }[6], [x21], #0x2\n"
- "ld1 { v17.h }[6], [x20], #0x2\n"
- "ld1 { v16.h }[6], [x19], #0x2\n"
+ "ld1 { v19.h }[6], [x23], #0x2\n"
+ "ld1 { v18.h }[6], [x22], #0x2\n"
+ "ld1 { v17.h }[6], [x21], #0x2\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v19.b }[14], [x22]\n"
- "ld1 { v18.b }[14], [x21]\n"
- "ld1 { v17.b }[14], [x20]\n"
- "ld1 { v16.b }[14], [x19]\n"
+ "ld1 { v19.b }[14], [x23]\n"
+ "ld1 { v18.b }[14], [x22]\n"
+ "ld1 { v17.b }[14], [x21]\n"
+ "ld1 { v16.b }[14], [x20]\n"
"b 11f\n"
"4:" // odd_loads_1_12
"tbz %x[width], #0, 11f\n"
- "ld1 { v19.b }[12], [x22]\n"
- "ld1 { v18.b }[12], [x21]\n"
- "ld1 { v17.b }[12], [x20]\n"
- "ld1 { v16.b }[12], [x19]\n"
+ "ld1 { v19.b }[12], [x23]\n"
+ "ld1 { v18.b }[12], [x22]\n"
+ "ld1 { v17.b }[12], [x21]\n"
+ "ld1 { v16.b }[12], [x20]\n"
"b 11f\n"
"5:" // odd_loads_2_8
"tbz %x[width], #1, 6f\n"
- "ld1 { v19.h }[4], [x22], #0x2\n"
- "ld1 { v18.h }[4], [x21], #0x2\n"
- "ld1 { v17.h }[4], [x20], #0x2\n"
- "ld1 { v16.h }[4], [x19], #0x2\n"
+ "ld1 { v19.h }[4], [x23], #0x2\n"
+ "ld1 { v18.h }[4], [x22], #0x2\n"
+ "ld1 { v17.h }[4], [x21], #0x2\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v19.b }[10], [x22]\n"
- "ld1 { v18.b }[10], [x21]\n"
- "ld1 { v17.b }[10], [x20]\n"
- "ld1 { v16.b }[10], [x19]\n"
+ "ld1 { v19.b }[10], [x23]\n"
+ "ld1 { v18.b }[10], [x22]\n"
+ "ld1 { v17.b }[10], [x21]\n"
+ "ld1 { v16.b }[10], [x20]\n"
"b 11f\n"
"6:" // odd_loads_1_8
"tbz %x[width], #0, 11f\n"
- "ld1 { v19.b }[8], [x22]\n"
- "ld1 { v18.b }[8], [x21]\n"
- "ld1 { v17.b }[8], [x20]\n"
- "ld1 { v16.b }[8], [x19]\n"
+ "ld1 { v19.b }[8], [x23]\n"
+ "ld1 { v18.b }[8], [x22]\n"
+ "ld1 { v17.b }[8], [x21]\n"
+ "ld1 { v16.b }[8], [x20]\n"
"b 11f\n"
"7:" // odd_loads_4_0
"tbz %x[width], #2, 9f\n"
- "ldr s19, [x22], #0x4\n"
- "ldr s18, [x21], #0x4\n"
- "ldr s17, [x20], #0x4\n"
- "ldr s16, [x19], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "ldr s18, [x22], #0x4\n"
+ "ldr s17, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
"tbz %x[width], #1, 8f\n"
- "ld1 { v19.h }[2], [x22], #0x2\n"
- "ld1 { v18.h }[2], [x21], #0x2\n"
- "ld1 { v17.h }[2], [x20], #0x2\n"
- "ld1 { v16.h }[2], [x19], #0x2\n"
+ "ld1 { v19.h }[2], [x23], #0x2\n"
+ "ld1 { v18.h }[2], [x22], #0x2\n"
+ "ld1 { v17.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v19.b }[6], [x22]\n"
- "ld1 { v18.b }[6], [x21]\n"
- "ld1 { v17.b }[6], [x20]\n"
- "ld1 { v16.b }[6], [x19]\n"
+ "ld1 { v19.b }[6], [x23]\n"
+ "ld1 { v18.b }[6], [x22]\n"
+ "ld1 { v17.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 11f\n"
"8:" // odd_loads_1_4
"tbz %x[width], #0, 11f\n"
- "ld1 { v19.b }[4], [x22]\n"
- "ld1 { v18.b }[4], [x21]\n"
- "ld1 { v17.b }[4], [x20]\n"
- "ld1 { v16.b }[4], [x19]\n"
+ "ld1 { v19.b }[4], [x23]\n"
+ "ld1 { v18.b }[4], [x22]\n"
+ "ld1 { v17.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 11f\n"
"9:" // odd_loads_2_0
"tbz %x[width], #1, 10f\n"
- "ldr h19, [x22], #0x2\n"
- "ldr h18, [x21], #0x2\n"
- "ldr h17, [x20], #0x2\n"
- "ldr h16, [x19], #0x2\n"
+ "ldr h19, [x23], #0x2\n"
+ "ldr h18, [x22], #0x2\n"
+ "ldr h17, [x21], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v19.b }[2], [x22]\n"
- "ld1 { v18.b }[2], [x21]\n"
- "ld1 { v17.b }[2], [x20]\n"
- "ld1 { v16.b }[2], [x19]\n"
+ "ld1 { v19.b }[2], [x23]\n"
+ "ld1 { v18.b }[2], [x22]\n"
+ "ld1 { v17.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 11f\n"
"10:" // odd_loads_1_0
- "ldr b19, [x22, #0x0]\n"
- "ldr b18, [x21, #0x0]\n"
- "ldr b17, [x20, #0x0]\n"
- "ldr b16, [x19, #0x0]\n"
+ "ldr b19, [x23, #0x0]\n"
+ "ldr b18, [x22, #0x0]\n"
+ "ldr b17, [x21, #0x0]\n"
+ "ldr b16, [x20, #0x0]\n"
"11:" // Odd load end
"str q19, [%x[out_ptr], #0x0]\n"
"str q18, [%x[out_ptr], #0x10]\n"
@@ -170,10 +170,9 @@ void interleave_block<4, 16, VLType::None, false>(
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"12:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "x19", "x20", "x21", "x22"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "x20", "x21", "x22", "x23"
);
}
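The odd-load ladder in this kernel (the tbz chain after "Main loop skip") is a binary decomposition of the remaining width: bit 3 loads 8 bytes per row, bit 2 loads 4, bit 1 loads 2, and bit 0 loads the final byte. Per row, the equivalent scalar tail logic looks roughly like this (illustrative only):

#include <cstdint>
#include <cstring>

// Load the final (width % 16) bytes of one row into a 16-byte lane buffer,
// mirroring the tbz #3/#2/#1/#0 ladder in the assembly.
static void load_tail(uint8_t dst[16], const uint8_t *src, unsigned width_rem) {
    unsigned pos = 0;
    if (width_rem & 8) { std::memcpy(dst + pos, src + pos, 8); pos += 8; }
    if (width_rem & 4) { std::memcpy(dst + pos, src + pos, 4); pos += 4; }
    if (width_rem & 2) { std::memcpy(dst + pos, src + pos, 2); pos += 2; }
    if (width_rem & 1) { dst[pos] = src[pos]; }
}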
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
index 954a86656e..23800edf20 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,39 +31,39 @@ void interleave_block<4, 16, VLType::None, true>(
)
{
__asm__ __volatile__(
- "movi v28.8h, #0x0\n"
- "ldr x23, [%x[in], #0x0]\n"
+ "ldr x24, [%x[in], #0x0]\n"
+ "ldr x23, [%x[in], #0x8]\n"
+ "cmp %x[height], #0x4\n"
"mov x22, #0x0\n"
+ "ldr x21, [%x[in], #0x10]\n"
+ "ldr x20, [%x[in], #0x18]\n"
+ "movi v28.8h, #0x0\n"
"movi v27.8h, #0x0\n"
- "ldr x21, [%x[in], #0x8]\n"
- "cmp %x[height], #0x4\n"
"movi v26.8h, #0x0\n"
- "ldr x20, [%x[in], #0x10]\n"
- "add x23, x23, %x[row_offset]\n"
"movi v25.8h, #0x0\n"
- "ldr x19, [%x[in], #0x18]\n"
+ "add x24, x24, %x[row_offset]\n"
+ "add x23, x23, %x[row_offset]\n"
"movi v24.4s, #0x0\n"
- "add x21, x21, %x[row_offset]\n"
"movi v23.4s, #0x0\n"
+ "add x21, x21, %x[row_offset]\n"
"add x20, x20, %x[row_offset]\n"
"movi v22.4s, #0x0\n"
- "add x19, x19, %x[row_offset]\n"
"movi v21.4s, #0x0\n"
"beq 1f\n"
- "mov x19, x23\n"
"cmp %x[height], #0x2\n"
- "csel x21, x21, x23, GE\n"
- "csel x20, x20, x23, GT\n"
+ "mov x20, x24\n"
+ "csel x23, x23, x24, GE\n"
+ "csel x21, x21, x24, GT\n"
"1:" // no_pointer_adj
- "movi v20.4s, #0x0\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
+ "movi v20.4s, #0x0\n"
"prfm pldl1keep, [x21, #0x0]\n"
"prfm pldl1keep, [x20, #0x0]\n"
- "prfm pldl1keep, [x19, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
"prfm pldl1keep, [x20, #0x40]\n"
- "prfm pldl1keep, [x19, #0x40]\n"
"cbnz %w[first], 2f\n"
"sub %x[out_ptr], %x[out_ptr], #0x10\n"
"ld1 { v20.4s }, [%x[out_ptr]]\n"
@@ -75,149 +75,149 @@ void interleave_block<4, 16, VLType::None, true>(
"ble 4f\n"
"sadalp v24.4s, v28.8h\n"
"movi v28.8h, #0x0\n"
+ "mov x22, #0x0\n"
"sadalp v23.4s, v27.8h\n"
"movi v27.8h, #0x0\n"
"sadalp v22.4s, v26.8h\n"
"movi v26.8h, #0x0\n"
"sadalp v21.4s, v25.8h\n"
"movi v25.8h, #0x0\n"
- "mov x22, #0x0\n"
"4:" // no_accumulate_16
- "ldr q19, [x23], #0x10\n"
- "add x22, x22, #0x1\n"
- "ldr q18, [x21], #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
"subs %x[width], %x[width], #0x10\n"
- "ldr q17, [x20], #0x10\n"
"cmp %x[width], #0x10\n"
- "ldr q16, [x19], #0x10\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
"sadalp v28.8h, v19.16b\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "prfm pldl1keep, [x21, #0x70]\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
"sadalp v27.8h, v18.16b\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "sadalp v26.8h, v17.16b\n"
- "prfm pldl1keep, [x19, #0x70]\n"
- "sadalp v25.8h, v16.16b\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "str q18, [%x[out_ptr], #0x10]\n"
"str q17, [%x[out_ptr], #0x20]\n"
+ "sadalp v26.8h, v17.16b\n"
"str q16, [%x[out_ptr], #0x30]\n"
+ "sadalp v25.8h, v16.16b\n"
+ "add x22, x22, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"bge 3b\n"
"5:" // Main loop skip
"cbz %x[width], 14f\n"
"tbz %x[width], #3, 9f\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d18, [x21], #0x8\n"
- "ldr d17, [x20], #0x8\n"
- "ldr d16, [x19], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d17, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
"tbz %x[width], #2, 7f\n"
- "ld1 { v19.s }[2], [x23], #0x4\n"
- "ld1 { v18.s }[2], [x21], #0x4\n"
- "ld1 { v17.s }[2], [x20], #0x4\n"
- "ld1 { v16.s }[2], [x19], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v18.s }[2], [x23], #0x4\n"
+ "ld1 { v17.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v19.h }[6], [x23], #0x2\n"
- "ld1 { v18.h }[6], [x21], #0x2\n"
- "ld1 { v17.h }[6], [x20], #0x2\n"
- "ld1 { v16.h }[6], [x19], #0x2\n"
+ "ld1 { v19.h }[6], [x24], #0x2\n"
+ "ld1 { v18.h }[6], [x23], #0x2\n"
+ "ld1 { v17.h }[6], [x21], #0x2\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[14], [x23]\n"
- "ld1 { v18.b }[14], [x21]\n"
- "ld1 { v17.b }[14], [x20]\n"
- "ld1 { v16.b }[14], [x19]\n"
+ "ld1 { v19.b }[14], [x24]\n"
+ "ld1 { v18.b }[14], [x23]\n"
+ "ld1 { v17.b }[14], [x21]\n"
+ "ld1 { v16.b }[14], [x20]\n"
"b 13f\n"
"6:" // odd_loads_1_12
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[12], [x23]\n"
- "ld1 { v18.b }[12], [x21]\n"
- "ld1 { v17.b }[12], [x20]\n"
- "ld1 { v16.b }[12], [x19]\n"
+ "ld1 { v19.b }[12], [x24]\n"
+ "ld1 { v18.b }[12], [x23]\n"
+ "ld1 { v17.b }[12], [x21]\n"
+ "ld1 { v16.b }[12], [x20]\n"
"b 13f\n"
"7:" // odd_loads_2_8
"tbz %x[width], #1, 8f\n"
- "ld1 { v19.h }[4], [x23], #0x2\n"
- "ld1 { v18.h }[4], [x21], #0x2\n"
- "ld1 { v17.h }[4], [x20], #0x2\n"
- "ld1 { v16.h }[4], [x19], #0x2\n"
+ "ld1 { v19.h }[4], [x24], #0x2\n"
+ "ld1 { v18.h }[4], [x23], #0x2\n"
+ "ld1 { v17.h }[4], [x21], #0x2\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[10], [x23]\n"
- "ld1 { v18.b }[10], [x21]\n"
- "ld1 { v17.b }[10], [x20]\n"
- "ld1 { v16.b }[10], [x19]\n"
+ "ld1 { v19.b }[10], [x24]\n"
+ "ld1 { v18.b }[10], [x23]\n"
+ "ld1 { v17.b }[10], [x21]\n"
+ "ld1 { v16.b }[10], [x20]\n"
"b 13f\n"
"8:" // odd_loads_1_8
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[8], [x23]\n"
- "ld1 { v18.b }[8], [x21]\n"
- "ld1 { v17.b }[8], [x20]\n"
- "ld1 { v16.b }[8], [x19]\n"
+ "ld1 { v19.b }[8], [x24]\n"
+ "ld1 { v18.b }[8], [x23]\n"
+ "ld1 { v17.b }[8], [x21]\n"
+ "ld1 { v16.b }[8], [x20]\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
- "ldr s19, [x23], #0x4\n"
- "ldr s18, [x21], #0x4\n"
- "ldr s17, [x20], #0x4\n"
- "ldr s16, [x19], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
+ "ldr s17, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
"tbz %x[width], #1, 10f\n"
- "ld1 { v19.h }[2], [x23], #0x2\n"
- "ld1 { v18.h }[2], [x21], #0x2\n"
- "ld1 { v17.h }[2], [x20], #0x2\n"
- "ld1 { v16.h }[2], [x19], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v18.h }[2], [x23], #0x2\n"
+ "ld1 { v17.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[6], [x23]\n"
- "ld1 { v18.b }[6], [x21]\n"
- "ld1 { v17.b }[6], [x20]\n"
- "ld1 { v16.b }[6], [x19]\n"
+ "ld1 { v19.b }[6], [x24]\n"
+ "ld1 { v18.b }[6], [x23]\n"
+ "ld1 { v17.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 13f\n"
"10:" // odd_loads_1_4
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[4], [x23]\n"
- "ld1 { v18.b }[4], [x21]\n"
- "ld1 { v17.b }[4], [x20]\n"
- "ld1 { v16.b }[4], [x19]\n"
+ "ld1 { v19.b }[4], [x24]\n"
+ "ld1 { v18.b }[4], [x23]\n"
+ "ld1 { v17.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 13f\n"
"11:" // odd_loads_2_0
"tbz %x[width], #1, 12f\n"
- "ldr h19, [x23], #0x2\n"
- "ldr h18, [x21], #0x2\n"
- "ldr h17, [x20], #0x2\n"
- "ldr h16, [x19], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h18, [x23], #0x2\n"
+ "ldr h17, [x21], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[2], [x23]\n"
- "ld1 { v18.b }[2], [x21]\n"
- "ld1 { v17.b }[2], [x20]\n"
- "ld1 { v16.b }[2], [x19]\n"
+ "ld1 { v19.b }[2], [x24]\n"
+ "ld1 { v18.b }[2], [x23]\n"
+ "ld1 { v17.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 13f\n"
"12:" // odd_loads_1_0
- "ldr b19, [x23, #0x0]\n"
- "ldr b18, [x21, #0x0]\n"
- "ldr b17, [x20, #0x0]\n"
- "ldr b16, [x19, #0x0]\n"
+ "ldr b19, [x24, #0x0]\n"
+ "ldr b18, [x23, #0x0]\n"
+ "ldr b17, [x21, #0x0]\n"
+ "ldr b16, [x20, #0x0]\n"
"13:" // Odd load end
"str q19, [%x[out_ptr], #0x0]\n"
"sadalp v28.8h, v19.16b\n"
- "str q18, [%x[out_ptr], #0x10]\n"
"sadalp v27.8h, v18.16b\n"
- "str q17, [%x[out_ptr], #0x20]\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
"sadalp v26.8h, v17.16b\n"
- "str q16, [%x[out_ptr], #0x30]\n"
"sadalp v25.8h, v16.16b\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"14:" // Odds skip
"sadalp v24.4s, v28.8h\n"
"sadalp v23.4s, v27.8h\n"
- "addp v24.4s, v24.4s, v23.4s\n"
"sadalp v22.4s, v26.8h\n"
"sadalp v21.4s, v25.8h\n"
- "addp v23.4s, v22.4s, v21.4s\n"
"addp v24.4s, v24.4s, v23.4s\n"
+ "addp v16.4s, v22.4s, v21.4s\n"
+ "addp v24.4s, v24.4s, v16.4s\n"
"add v24.4s, v24.4s, v20.4s\n"
"str q24, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24"
);
}
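The _summing variant interleaves the same data while keeping a running sum per row, later used as the column-sum correction in the quantized GEMM: sadalp widens pairwise (int8 lanes into the v25-v28 halfword accumulators, then halfwords into the v21-v24 word accumulators, drained periodically so the halfword lanes cannot overflow), and the final addp pair collapses the four per-row totals into one int32x4. A scalar sketch of the same widening reduction for one row:

#include <cstddef>
#include <cstdint>

// Sum every int8 element of one interleaved row, widening as the
// assembly does: int8 -> int16 partial sums -> one int32 total.
static int32_t row_sum_s8(const int8_t *row, size_t len) {
    int32_t total   = 0;
    int16_t partial = 0;            // stands in for one halfword accumulator lane
    for (size_t i = 0; i < len; i++) {
        partial = static_cast<int16_t>(partial + row[i]);
        if ((i & 127) == 127) {     // drain before the int16 lane could overflow
            total += partial;
            partial = 0;
        }
    }
    return total + partial;
}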
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
index c81146212c..15545c24db 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,39 +31,39 @@ void interleave_block<4, 16, VLType::None, true>(
)
{
__asm__ __volatile__(
- "movi v28.8h, #0x0\n"
- "ldr x23, [%x[in], #0x0]\n"
+ "ldr x24, [%x[in], #0x0]\n"
+ "ldr x23, [%x[in], #0x8]\n"
+ "cmp %x[height], #0x4\n"
"mov x22, #0x0\n"
+ "ldr x21, [%x[in], #0x10]\n"
+ "ldr x20, [%x[in], #0x18]\n"
+ "movi v28.8h, #0x0\n"
"movi v27.8h, #0x0\n"
- "ldr x21, [%x[in], #0x8]\n"
- "cmp %x[height], #0x4\n"
"movi v26.8h, #0x0\n"
- "ldr x20, [%x[in], #0x10]\n"
- "add x23, x23, %x[row_offset]\n"
"movi v25.8h, #0x0\n"
- "ldr x19, [%x[in], #0x18]\n"
+ "add x24, x24, %x[row_offset]\n"
+ "add x23, x23, %x[row_offset]\n"
"movi v24.4s, #0x0\n"
- "add x21, x21, %x[row_offset]\n"
"movi v23.4s, #0x0\n"
+ "add x21, x21, %x[row_offset]\n"
"add x20, x20, %x[row_offset]\n"
"movi v22.4s, #0x0\n"
- "add x19, x19, %x[row_offset]\n"
"movi v21.4s, #0x0\n"
"beq 1f\n"
- "mov x19, x23\n"
"cmp %x[height], #0x2\n"
- "csel x21, x21, x23, GE\n"
- "csel x20, x20, x23, GT\n"
+ "mov x20, x24\n"
+ "csel x23, x23, x24, GE\n"
+ "csel x21, x21, x24, GT\n"
"1:" // no_pointer_adj
- "movi v20.4s, #0x0\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
+ "movi v20.4s, #0x0\n"
"prfm pldl1keep, [x21, #0x0]\n"
"prfm pldl1keep, [x20, #0x0]\n"
- "prfm pldl1keep, [x19, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
"prfm pldl1keep, [x20, #0x40]\n"
- "prfm pldl1keep, [x19, #0x40]\n"
"cbnz %w[first], 2f\n"
"sub %x[out_ptr], %x[out_ptr], #0x10\n"
"ld1 { v20.4s }, [%x[out_ptr]]\n"
@@ -75,149 +75,149 @@ void interleave_block<4, 16, VLType::None, true>(
"ble 4f\n"
"uadalp v24.4s, v28.8h\n"
"movi v28.8h, #0x0\n"
+ "mov x22, #0x0\n"
"uadalp v23.4s, v27.8h\n"
"movi v27.8h, #0x0\n"
"uadalp v22.4s, v26.8h\n"
"movi v26.8h, #0x0\n"
"uadalp v21.4s, v25.8h\n"
"movi v25.8h, #0x0\n"
- "mov x22, #0x0\n"
"4:" // no_accumulate_16
- "ldr q19, [x23], #0x10\n"
- "add x22, x22, #0x1\n"
- "ldr q18, [x21], #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
"subs %x[width], %x[width], #0x10\n"
- "ldr q17, [x20], #0x10\n"
"cmp %x[width], #0x10\n"
- "ldr q16, [x19], #0x10\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
"uadalp v28.8h, v19.16b\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "prfm pldl1keep, [x21, #0x70]\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
"uadalp v27.8h, v18.16b\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "uadalp v26.8h, v17.16b\n"
- "prfm pldl1keep, [x19, #0x70]\n"
- "uadalp v25.8h, v16.16b\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "str q18, [%x[out_ptr], #0x10]\n"
"str q17, [%x[out_ptr], #0x20]\n"
+ "uadalp v26.8h, v17.16b\n"
"str q16, [%x[out_ptr], #0x30]\n"
+ "uadalp v25.8h, v16.16b\n"
+ "add x22, x22, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"bge 3b\n"
"5:" // Main loop skip
"cbz %x[width], 14f\n"
"tbz %x[width], #3, 9f\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d18, [x21], #0x8\n"
- "ldr d17, [x20], #0x8\n"
- "ldr d16, [x19], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d17, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
"tbz %x[width], #2, 7f\n"
- "ld1 { v19.s }[2], [x23], #0x4\n"
- "ld1 { v18.s }[2], [x21], #0x4\n"
- "ld1 { v17.s }[2], [x20], #0x4\n"
- "ld1 { v16.s }[2], [x19], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v18.s }[2], [x23], #0x4\n"
+ "ld1 { v17.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v19.h }[6], [x23], #0x2\n"
- "ld1 { v18.h }[6], [x21], #0x2\n"
- "ld1 { v17.h }[6], [x20], #0x2\n"
- "ld1 { v16.h }[6], [x19], #0x2\n"
+ "ld1 { v19.h }[6], [x24], #0x2\n"
+ "ld1 { v18.h }[6], [x23], #0x2\n"
+ "ld1 { v17.h }[6], [x21], #0x2\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[14], [x23]\n"
- "ld1 { v18.b }[14], [x21]\n"
- "ld1 { v17.b }[14], [x20]\n"
- "ld1 { v16.b }[14], [x19]\n"
+ "ld1 { v19.b }[14], [x24]\n"
+ "ld1 { v18.b }[14], [x23]\n"
+ "ld1 { v17.b }[14], [x21]\n"
+ "ld1 { v16.b }[14], [x20]\n"
"b 13f\n"
"6:" // odd_loads_1_12
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[12], [x23]\n"
- "ld1 { v18.b }[12], [x21]\n"
- "ld1 { v17.b }[12], [x20]\n"
- "ld1 { v16.b }[12], [x19]\n"
+ "ld1 { v19.b }[12], [x24]\n"
+ "ld1 { v18.b }[12], [x23]\n"
+ "ld1 { v17.b }[12], [x21]\n"
+ "ld1 { v16.b }[12], [x20]\n"
"b 13f\n"
"7:" // odd_loads_2_8
"tbz %x[width], #1, 8f\n"
- "ld1 { v19.h }[4], [x23], #0x2\n"
- "ld1 { v18.h }[4], [x21], #0x2\n"
- "ld1 { v17.h }[4], [x20], #0x2\n"
- "ld1 { v16.h }[4], [x19], #0x2\n"
+ "ld1 { v19.h }[4], [x24], #0x2\n"
+ "ld1 { v18.h }[4], [x23], #0x2\n"
+ "ld1 { v17.h }[4], [x21], #0x2\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[10], [x23]\n"
- "ld1 { v18.b }[10], [x21]\n"
- "ld1 { v17.b }[10], [x20]\n"
- "ld1 { v16.b }[10], [x19]\n"
+ "ld1 { v19.b }[10], [x24]\n"
+ "ld1 { v18.b }[10], [x23]\n"
+ "ld1 { v17.b }[10], [x21]\n"
+ "ld1 { v16.b }[10], [x20]\n"
"b 13f\n"
"8:" // odd_loads_1_8
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[8], [x23]\n"
- "ld1 { v18.b }[8], [x21]\n"
- "ld1 { v17.b }[8], [x20]\n"
- "ld1 { v16.b }[8], [x19]\n"
+ "ld1 { v19.b }[8], [x24]\n"
+ "ld1 { v18.b }[8], [x23]\n"
+ "ld1 { v17.b }[8], [x21]\n"
+ "ld1 { v16.b }[8], [x20]\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
- "ldr s19, [x23], #0x4\n"
- "ldr s18, [x21], #0x4\n"
- "ldr s17, [x20], #0x4\n"
- "ldr s16, [x19], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
+ "ldr s17, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
"tbz %x[width], #1, 10f\n"
- "ld1 { v19.h }[2], [x23], #0x2\n"
- "ld1 { v18.h }[2], [x21], #0x2\n"
- "ld1 { v17.h }[2], [x20], #0x2\n"
- "ld1 { v16.h }[2], [x19], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v18.h }[2], [x23], #0x2\n"
+ "ld1 { v17.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[6], [x23]\n"
- "ld1 { v18.b }[6], [x21]\n"
- "ld1 { v17.b }[6], [x20]\n"
- "ld1 { v16.b }[6], [x19]\n"
+ "ld1 { v19.b }[6], [x24]\n"
+ "ld1 { v18.b }[6], [x23]\n"
+ "ld1 { v17.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 13f\n"
"10:" // odd_loads_1_4
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[4], [x23]\n"
- "ld1 { v18.b }[4], [x21]\n"
- "ld1 { v17.b }[4], [x20]\n"
- "ld1 { v16.b }[4], [x19]\n"
+ "ld1 { v19.b }[4], [x24]\n"
+ "ld1 { v18.b }[4], [x23]\n"
+ "ld1 { v17.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 13f\n"
"11:" // odd_loads_2_0
"tbz %x[width], #1, 12f\n"
- "ldr h19, [x23], #0x2\n"
- "ldr h18, [x21], #0x2\n"
- "ldr h17, [x20], #0x2\n"
- "ldr h16, [x19], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h18, [x23], #0x2\n"
+ "ldr h17, [x21], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v19.b }[2], [x23]\n"
- "ld1 { v18.b }[2], [x21]\n"
- "ld1 { v17.b }[2], [x20]\n"
- "ld1 { v16.b }[2], [x19]\n"
+ "ld1 { v19.b }[2], [x24]\n"
+ "ld1 { v18.b }[2], [x23]\n"
+ "ld1 { v17.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 13f\n"
"12:" // odd_loads_1_0
- "ldr b19, [x23, #0x0]\n"
- "ldr b18, [x21, #0x0]\n"
- "ldr b17, [x20, #0x0]\n"
- "ldr b16, [x19, #0x0]\n"
+ "ldr b19, [x24, #0x0]\n"
+ "ldr b18, [x23, #0x0]\n"
+ "ldr b17, [x21, #0x0]\n"
+ "ldr b16, [x20, #0x0]\n"
"13:" // Odd load end
"str q19, [%x[out_ptr], #0x0]\n"
"uadalp v28.8h, v19.16b\n"
- "str q18, [%x[out_ptr], #0x10]\n"
"uadalp v27.8h, v18.16b\n"
- "str q17, [%x[out_ptr], #0x20]\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
"uadalp v26.8h, v17.16b\n"
- "str q16, [%x[out_ptr], #0x30]\n"
"uadalp v25.8h, v16.16b\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"14:" // Odds skip
"uadalp v24.4s, v28.8h\n"
"uadalp v23.4s, v27.8h\n"
- "addp v24.4s, v24.4s, v23.4s\n"
"uadalp v22.4s, v26.8h\n"
"uadalp v21.4s, v25.8h\n"
- "addp v23.4s, v22.4s, v21.4s\n"
"addp v24.4s, v24.4s, v23.4s\n"
+ "addp v16.4s, v22.4s, v21.4s\n"
+ "addp v24.4s, v24.4s, v16.4s\n"
"add v24.4s, v24.4s, v20.4s\n"
"str q24, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
index 42574295f1..b900c330b7 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,45 +31,46 @@ void interleave_block<8, 1, VLType::None, false>(
)
{
__asm__ __volatile__(
- "movi v30.8h, #0x0\n"
- "ldr x27, [%x[in], #0x0]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "movi v16.8h, #0x0\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
+ "add x28, x28, %x[row_offset], LSL #1\n"
"add x27, x27, %x[row_offset], LSL #1\n"
- "ldr x25, [%x[in], #0x10]\n"
- "ldr x24, [%x[in], #0x18]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x26, x26, %x[row_offset], LSL #1\n"
- "ldr x23, [%x[in], #0x20]\n"
"add x25, x25, %x[row_offset], LSL #1\n"
- "ldr x22, [%x[in], #0x28]\n"
- "ldr x21, [%x[in], #0x30]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x24, x24, %x[row_offset], LSL #1\n"
- "ldr x20, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #1\n"
"add x22, x22, %x[row_offset], LSL #1\n"
"add x21, x21, %x[row_offset], LSL #1\n"
- "add x20, x20, %x[row_offset], LSL #1\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "prfm pldl1keep, [x27, #0x0]\n"
"cmp %x[width], #0x4\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
"prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -77,135 +78,133 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr d29, [x27], #0x8\n"
- "zip1 v29.8h, v30.8h, v29.8h\n"
- "ldr d28, [x26], #0x8\n"
+ "ldr d27, [x28], #0x8\n"
+ "ldr d26, [x27], #0x8\n"
+ "shll v27.4s, v27.4h, #0x10\n"
+ "shll v26.4s, v26.4h, #0x10\n"
+ "ldr d22, [x26], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "shll v22.4s, v22.4h, #0x10\n"
+ "shll v21.4s, v21.4h, #0x10\n"
+ "ldr d20, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "shll v20.4s, v20.4h, #0x10\n"
+ "shll v25.4s, v25.4h, #0x10\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
+ "shll v19.4s, v19.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "zip1 v24.4s, v27.4s, v22.4s\n"
+ "zip1 v23.4s, v26.4s, v21.4s\n"
"subs %x[width], %x[width], #0x4\n"
- "zip1 v28.8h, v30.8h, v28.8h\n"
- "ldr d24, [x25], #0x8\n"
"cmp %x[width], #0x4\n"
- "zip1 v24.8h, v30.8h, v24.8h\n"
- "ldr d27, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "zip1 v25.4s, v29.4s, v24.4s\n"
- "zip2 v24.4s, v29.4s, v24.4s\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
- "zip1 v27.8h, v30.8h, v27.8h\n"
- "ldr d21, [x20], #0x8\n"
- "zip1 v26.8h, v30.8h, v26.8h\n"
+ "zip1 v18.4s, v20.4s, v19.4s\n"
+ "zip1 v17.4s, v25.4s, v16.4s\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip1 v20.4s, v28.4s, v27.4s\n"
+ "zip2 v22.4s, v27.4s, v22.4s\n"
+ "zip2 v21.4s, v26.4s, v21.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "zip1 v23.8h, v30.8h, v23.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v22.8h, v30.8h, v22.8h\n"
+ "zip2 v20.4s, v20.4s, v19.4s\n"
+ "zip2 v19.4s, v25.4s, v16.4s\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "zip1 v21.8h, v30.8h, v21.8h\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v17.4s, v25.4s, v20.4s\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip1 v19.4s, v26.4s, v22.4s\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip1 v18.4s, v23.4s, v21.4s\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip2 v17.4s, v25.4s, v20.4s\n"
+ "zip1 v16.4s, v24.4s, v23.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
- "str q17, [%x[out_ptr], #0x20]\n"
- "zip2 v19.4s, v28.4s, v27.4s\n"
- "str q16, [%x[out_ptr], #0x30]\n"
- "zip1 v16.4s, v24.4s, v19.4s\n"
+ "zip2 v16.4s, v24.4s, v23.4s\n"
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v17.4s, v18.4s, v17.4s\n"
+ "zip1 v16.4s, v22.4s, v21.4s\n"
+ "str q17, [%x[out_ptr], #0x30]\n"
+ "zip1 v18.4s, v20.4s, v19.4s\n"
+ "zip2 v17.4s, v22.4s, v21.4s\n"
"str q16, [%x[out_ptr], #0x40]\n"
- "zip2 v18.4s, v26.4s, v22.4s\n"
- "zip2 v17.4s, v23.4s, v21.4s\n"
- "zip1 v16.4s, v18.4s, v17.4s\n"
- "str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v16.4s, v24.4s, v19.4s\n"
- "str q16, [%x[out_ptr], #0x60]\n"
- "zip2 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 6f\n"
"tbz %x[width], #1, 4f\n"
- "ldr s29, [x27], #0x4\n"
- "ldr s28, [x26], #0x4\n"
- "mov x19, #0x2\n"
- "ldr s24, [x25], #0x4\n"
- "ldr s27, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
- "ldr s22, [x21], #0x4\n"
- "ldr s21, [x20], #0x4\n"
+ "ldr s28, [x28], #0x4\n"
+ "ldr s27, [x27], #0x4\n"
+ "mov x20, #0x2\n"
+ "ldr s26, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s24, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "ldr s21, [x21], #0x4\n"
"tbz %x[width], #0, 5f\n"
- "ld1 { v29.h }[2], [x27]\n"
- "mov x19, #0x3\n"
- "ld1 { v28.h }[2], [x26]\n"
- "ld1 { v24.h }[2], [x25]\n"
- "ld1 { v27.h }[2], [x24]\n"
- "ld1 { v26.h }[2], [x23]\n"
- "ld1 { v23.h }[2], [x22]\n"
- "ld1 { v22.h }[2], [x21]\n"
- "ld1 { v21.h }[2], [x20]\n"
+ "ld1 { v28.h }[2], [x28]\n"
+ "ld1 { v27.h }[2], [x27]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v26.h }[2], [x26]\n"
+ "ld1 { v25.h }[2], [x25]\n"
+ "ld1 { v24.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v22.h }[2], [x22]\n"
+ "ld1 { v21.h }[2], [x21]\n"
"b 5f\n"
"4:" // odd_loads_1_0
- "ldr h29, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr h28, [x26, #0x0]\n"
- "ldr h24, [x25, #0x0]\n"
- "ldr h27, [x24, #0x0]\n"
- "ldr h26, [x23, #0x0]\n"
- "ldr h23, [x22, #0x0]\n"
- "ldr h22, [x21, #0x0]\n"
- "ldr h21, [x20, #0x0]\n"
+ "ldr h28, [x28, #0x0]\n"
+ "ldr h27, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr h26, [x26, #0x0]\n"
+ "ldr h25, [x25, #0x0]\n"
+ "ldr h24, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h22, [x22, #0x0]\n"
+ "ldr h21, [x21, #0x0]\n"
"5:" // Odd load end
- "zip1 v29.8h, v30.8h, v29.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v28.8h, v30.8h, v28.8h\n"
- "zip1 v24.8h, v30.8h, v24.8h\n"
- "zip1 v27.8h, v30.8h, v27.8h\n"
- "zip1 v26.8h, v30.8h, v26.8h\n"
- "zip1 v23.8h, v30.8h, v23.8h\n"
- "zip1 v22.8h, v30.8h, v22.8h\n"
- "zip1 v21.8h, v30.8h, v21.8h\n"
- "zip1 v25.4s, v29.4s, v24.4s\n"
- "zip1 v20.4s, v28.4s, v27.4s\n"
- "zip1 v17.4s, v25.4s, v20.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip1 v19.4s, v26.4s, v22.4s\n"
- "zip1 v18.4s, v23.4s, v21.4s\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
+ "shll v28.4s, v28.4h, #0x10\n"
+ "shll v27.4s, v27.4h, #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "shll v26.4s, v26.4h, #0x10\n"
+ "shll v25.4s, v25.4h, #0x10\n"
+ "shll v24.4s, v24.4h, #0x10\n"
+ "shll v23.4s, v23.4h, #0x10\n"
+ "shll v22.4s, v22.4h, #0x10\n"
+ "shll v21.4s, v21.4h, #0x10\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v17.4s, v25.4s, v20.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
- "subs x19, x19, #0x1\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v24.4s, v29.4s, v24.4s\n"
- "zip2 v19.4s, v28.4s, v27.4s\n"
- "zip1 v16.4s, v24.4s, v19.4s\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v18.4s, v26.4s, v22.4s\n"
+ "zip2 v19.4s, v28.4s, v26.4s\n"
+ "zip2 v16.4s, v27.4s, v25.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
"zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"6:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
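
(Editorial note, not part of the patch: the tail path above widens each 16-bit lane by shifting it into the top half of a 32-bit lane — the "shll v.4s, v.4h, #0x10" instructions — which matches the bf16-to-fp32 widening used by the bf16 interleave variants: bfloat16 is exactly the high 16 bits of an IEEE-754 float. A minimal C++ sketch of that per-lane widening; the helper name widen_bf16 is illustrative, not from the library.)

#include <cstdint>
#include <cstring>

// Widen one bf16 bit pattern to fp32. bf16 occupies the top 16 bits of
// the corresponding float, so a 16-bit left shift reproduces the
// "shll v.4s, v.4h, #0x10" step the asm performs per lane.
static float widen_bf16(uint16_t bits)
{
    uint32_t f32_bits = static_cast<uint32_t>(bits) << 16;
    float f;
    std::memcpy(&f, &f32_bits, sizeof(f)); // bit-exact reinterpretation
    return f;
}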
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
index 62d1657a9a..e54b3b9f41 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,44 +31,45 @@ void interleave_block<8, 1, VLType::None, false>(
)
{
__asm__ __volatile__(
- "ldr x27, [%x[in], #0x0]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "add x28, x28, %x[row_offset], LSL #1\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
"add x27, x27, %x[row_offset], LSL #1\n"
- "ldr x25, [%x[in], #0x10]\n"
- "ldr x24, [%x[in], #0x18]\n"
"add x26, x26, %x[row_offset], LSL #1\n"
- "ldr x23, [%x[in], #0x20]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #1\n"
- "ldr x22, [%x[in], #0x28]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset], LSL #1\n"
- "ldr x20, [%x[in], #0x38]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #1\n"
"add x22, x22, %x[row_offset], LSL #1\n"
"add x21, x21, %x[row_offset], LSL #1\n"
- "add x20, x20, %x[row_offset], LSL #1\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "prfm pldl1keep, [x27, #0x0]\n"
"cmp %x[width], #0x8\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
"prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -76,193 +77,191 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q30, [x27], #0x10\n"
+ "ldr q25, [x28], #0x10\n"
+ "ldr q27, [x27], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
- "ldr q29, [x26], #0x10\n"
"cmp %x[width], #0x8\n"
- "ldr q28, [x25], #0x10\n"
- "ldr q27, [x24], #0x10\n"
- "ldr q25, [x23], #0x10\n"
- "zip1 v26.8h, v30.8h, v25.8h\n"
- "ldr q21, [x22], #0x10\n"
- "zip2 v25.8h, v30.8h, v25.8h\n"
- "ldr q24, [x21], #0x10\n"
- "ldr q23, [x20], #0x10\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
+ "ldr q26, [x26], #0x10\n"
+ "ldr q24, [x25], #0x10\n"
+ "ldr q21, [x24], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip1 v23.8h, v25.8h, v21.8h\n"
+ "zip1 v22.8h, v27.8h, v20.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "zip1 v19.8h, v26.8h, v17.8h\n"
+ "zip1 v18.8h, v24.8h, v16.8h\n"
+ "zip2 v25.8h, v25.8h, v21.8h\n"
+ "zip2 v21.8h, v26.8h, v17.8h\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v20.8h, v27.8h, v20.8h\n"
+ "zip2 v16.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "zip1 v20.8h, v28.8h, v24.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v18.8h, v26.8h, v20.8h\n"
+ "zip1 v24.8h, v23.8h, v19.8h\n"
+ "zip1 v17.8h, v22.8h, v18.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "zip1 v19.8h, v27.8h, v23.8h\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v16.8h, v22.8h, v19.8h\n"
+ "zip2 v23.8h, v23.8h, v19.8h\n"
+ "zip2 v19.8h, v22.8h, v18.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip1 v17.8h, v18.8h, v16.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip2 v16.8h, v18.8h, v16.8h\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip2 v18.8h, v26.8h, v20.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v22.8h, v25.8h, v21.8h\n"
+ "zip1 v18.8h, v20.8h, v16.8h\n"
+ "zip2 v21.8h, v25.8h, v21.8h\n"
+ "zip2 v20.8h, v20.8h, v16.8h\n"
+ "zip1 v16.8h, v24.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.8h, v24.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x20]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v17.8h, v23.8h, v19.8h\n"
+ "zip2 v16.8h, v23.8h, v19.8h\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip1 v19.8h, v22.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v18.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "zip2 v20.8h, v28.8h, v24.8h\n"
- "zip1 v18.8h, v25.8h, v20.8h\n"
- "zip2 v19.8h, v27.8h, v23.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x40]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v18.8h, v25.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x60]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v17.8h, v21.8h, v20.8h\n"
+ "zip2 v16.8h, v21.8h, v20.8h\n"
+ "str q19, [%x[out_ptr], #0x40]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr d30, [x27], #0x8\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d27, [x24], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
+ "ldr d29, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d26, [x24], #0x8\n"
"ldr d25, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d24, [x21], #0x8\n"
- "ldr d23, [x20], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v30.s }[2], [x27], #0x4\n"
- "mov x19, #0x6\n"
- "ld1 { v29.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v27.s }[2], [x24], #0x4\n"
+ "ld1 { v30.s }[2], [x28], #0x4\n"
+ "ld1 { v29.s }[2], [x27], #0x4\n"
+ "mov x20, #0x6\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x24], #0x4\n"
"ld1 { v25.s }[2], [x23], #0x4\n"
- "ld1 { v21.s }[2], [x22], #0x4\n"
- "ld1 { v24.s }[2], [x21], #0x4\n"
- "ld1 { v23.s }[2], [x20], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v23.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v30.h }[6], [x27]\n"
- "mov x19, #0x7\n"
- "ld1 { v29.h }[6], [x26]\n"
- "ld1 { v28.h }[6], [x25]\n"
- "ld1 { v27.h }[6], [x24]\n"
+ "ld1 { v30.h }[6], [x28]\n"
+ "ld1 { v29.h }[6], [x27]\n"
+ "mov x20, #0x7\n"
+ "ld1 { v28.h }[6], [x26]\n"
+ "ld1 { v27.h }[6], [x25]\n"
+ "ld1 { v26.h }[6], [x24]\n"
"ld1 { v25.h }[6], [x23]\n"
- "ld1 { v21.h }[6], [x22]\n"
- "ld1 { v24.h }[6], [x21]\n"
- "ld1 { v23.h }[6], [x20]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "ld1 { v23.h }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
- "mov x19, #0x4\n"
+ "mov x20, #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v30.h }[4], [x27]\n"
- "ld1 { v29.h }[4], [x26]\n"
- "mov x19, #0x5\n"
- "ld1 { v28.h }[4], [x25]\n"
- "ld1 { v27.h }[4], [x24]\n"
+ "ld1 { v30.h }[4], [x28]\n"
+ "ld1 { v29.h }[4], [x27]\n"
+ "mov x20, #0x5\n"
+ "ld1 { v28.h }[4], [x26]\n"
+ "ld1 { v27.h }[4], [x25]\n"
+ "ld1 { v26.h }[4], [x24]\n"
"ld1 { v25.h }[4], [x23]\n"
- "ld1 { v21.h }[4], [x22]\n"
- "ld1 { v24.h }[4], [x21]\n"
- "ld1 { v23.h }[4], [x20]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "ld1 { v23.h }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "mov x19, #0x2\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s27, [x24], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
+ "mov x20, #0x2\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
"ldr s25, [x23], #0x4\n"
- "ldr s21, [x22], #0x4\n"
- "ldr s24, [x21], #0x4\n"
- "ldr s23, [x20], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v30.h }[2], [x27]\n"
- "mov x19, #0x3\n"
- "ld1 { v29.h }[2], [x26]\n"
- "ld1 { v28.h }[2], [x25]\n"
- "ld1 { v27.h }[2], [x24]\n"
+ "ld1 { v30.h }[2], [x28]\n"
+ "ld1 { v29.h }[2], [x27]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v28.h }[2], [x26]\n"
+ "ld1 { v27.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x24]\n"
"ld1 { v25.h }[2], [x23]\n"
- "ld1 { v21.h }[2], [x22]\n"
- "ld1 { v24.h }[2], [x21]\n"
- "ld1 { v23.h }[2], [x20]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v23.h }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr h30, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr h29, [x26, #0x0]\n"
- "ldr h28, [x25, #0x0]\n"
- "ldr h27, [x24, #0x0]\n"
+ "ldr h30, [x28, #0x0]\n"
+ "ldr h29, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr h28, [x26, #0x0]\n"
+ "ldr h27, [x25, #0x0]\n"
+ "ldr h26, [x24, #0x0]\n"
"ldr h25, [x23, #0x0]\n"
- "ldr h21, [x22, #0x0]\n"
- "ldr h24, [x21, #0x0]\n"
- "ldr h23, [x20, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h23, [x21, #0x0]\n"
"7:" // Odd load end
- "zip1 v26.8h, v30.8h, v25.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v20.8h, v28.8h, v24.8h\n"
- "zip1 v18.8h, v26.8h, v20.8h\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
"zip1 v19.8h, v27.8h, v23.8h\n"
- "zip1 v16.8h, v22.8h, v19.8h\n"
- "zip1 v17.8h, v18.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v18.8h, v16.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v26.8h, v20.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "subs x20, x20, #0x1\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
+ "subs x20, x20, #0x1\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v25.8h, v30.8h, v25.8h\n"
- "zip2 v20.8h, v28.8h, v24.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v18.8h, v25.8h, v20.8h\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
"zip2 v19.8h, v27.8h, v23.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
+ "subs x20, x20, #0x1\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v25.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"8:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
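
(Editorial note, not part of the patch: the zip1/zip2 tree in the main loop above is a three-level 8x8 transpose — after three rounds of pairwise zips, element i of all eight rows lands in eight consecutive output lanes. A scalar reference of that layout, with illustrative names; the real kernel additionally handles height < 8 and ragged widths.)

#include <cstddef>

// Scalar reference for the 8-row, block-1 interleave: element i of
// rows 0..7 becomes 8 consecutive output elements, which is what the
// zip1/zip2 tree computes 8 (or 4) columns at a time.
template <typename T>
void interleave8_block1_ref(T *out, const T *const in[8], size_t width)
{
    for (size_t i = 0; i < width; ++i) // walk columns in element order
        for (int r = 0; r < 8; ++r)    // emit one element per row
            *out++ = in[r][i];
}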
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
index b67840b280..3a5dcf4a6b 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,44 +31,45 @@ void interleave_block<8, 1, VLType::None, false>(
)
{
__asm__ __volatile__(
- "ldr x27, [%x[in], #0x0]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "add x28, x28, %x[row_offset], LSL #1\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
"add x27, x27, %x[row_offset], LSL #1\n"
- "ldr x25, [%x[in], #0x10]\n"
- "ldr x24, [%x[in], #0x18]\n"
"add x26, x26, %x[row_offset], LSL #1\n"
- "ldr x23, [%x[in], #0x20]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #1\n"
- "ldr x22, [%x[in], #0x28]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset], LSL #1\n"
- "ldr x20, [%x[in], #0x38]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #1\n"
"add x22, x22, %x[row_offset], LSL #1\n"
"add x21, x21, %x[row_offset], LSL #1\n"
- "add x20, x20, %x[row_offset], LSL #1\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "prfm pldl1keep, [x27, #0x0]\n"
"cmp %x[width], #0x4\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
"prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -76,135 +77,133 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr d30, [x27], #0x8\n"
+ "ldr d27, [x28], #0x8\n"
+ "ldr d26, [x27], #0x8\n"
+ "fcvtl v27.4s, v27.4h\n"
+ "fcvtl v26.4s, v26.4h\n"
+ "ldr d22, [x26], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "fcvtl v22.4s, v22.4h\n"
+ "fcvtl v21.4s, v21.4h\n"
+ "ldr d20, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "fcvtl v20.4s, v20.4h\n"
+ "fcvtl v25.4s, v25.4h\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
+ "fcvtl v19.4s, v19.4h\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "zip1 v24.4s, v27.4s, v22.4s\n"
+ "zip1 v23.4s, v26.4s, v21.4s\n"
"subs %x[width], %x[width], #0x4\n"
- "ldr d29, [x26], #0x8\n"
"cmp %x[width], #0x4\n"
- "ldr d28, [x25], #0x8\n"
- "fcvtl v30.4s, v30.4h\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "fcvtl v29.4s, v29.4h\n"
- "ldr d26, [x22], #0x8\n"
- "fcvtl v28.4s, v28.4h\n"
- "zip1 v20.4s, v30.4s, v28.4s\n"
- "ldr d25, [x21], #0x8\n"
- "fcvtl v21.4s, v21.4h\n"
- "zip2 v17.4s, v30.4s, v28.4s\n"
- "ldr d24, [x20], #0x8\n"
- "fcvtl v27.4s, v27.4h\n"
- "zip1 v18.4s, v29.4s, v21.4s\n"
+ "zip1 v18.4s, v20.4s, v19.4s\n"
+ "zip1 v17.4s, v25.4s, v16.4s\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "fcvtl v26.4s, v26.4h\n"
- "zip1 v23.4s, v20.4s, v18.4s\n"
+ "zip2 v22.4s, v27.4s, v22.4s\n"
+ "zip2 v21.4s, v26.4s, v21.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "fcvtl v25.4s, v25.4h\n"
- "zip2 v22.4s, v20.4s, v18.4s\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "fcvtl v24.4s, v24.4h\n"
- "zip2 v16.4s, v29.4s, v21.4s\n"
+ "zip2 v20.4s, v20.4s, v19.4s\n"
+ "zip2 v19.4s, v25.4s, v16.4s\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v21.4s, v17.4s, v16.4s\n"
- "zip2 v20.4s, v17.4s, v16.4s\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip1 v19.4s, v27.4s, v25.4s\n"
- "zip2 v18.4s, v27.4s, v25.4s\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip1 v17.4s, v26.4s, v24.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "zip1 v16.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v24.4s, v23.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v17.4s, v19.4s, v17.4s\n"
- "str q22, [%x[out_ptr], #0x20]\n"
- "zip2 v16.4s, v26.4s, v24.4s\n"
+ "zip2 v16.4s, v24.4s, v23.4s\n"
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v17.4s, v18.4s, v17.4s\n"
+ "zip1 v16.4s, v22.4s, v21.4s\n"
"str q17, [%x[out_ptr], #0x30]\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
- "str q21, [%x[out_ptr], #0x40]\n"
- "zip2 v16.4s, v18.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x50]\n"
- "str q20, [%x[out_ptr], #0x60]\n"
+ "zip1 v18.4s, v20.4s, v19.4s\n"
+ "zip2 v17.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x40]\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 6f\n"
"tbz %x[width], #1, 4f\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "mov x19, #0x2\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
- "ldr s26, [x22], #0x4\n"
- "ldr s25, [x21], #0x4\n"
- "ldr s24, [x20], #0x4\n"
+ "ldr s28, [x28], #0x4\n"
+ "ldr s27, [x27], #0x4\n"
+ "mov x20, #0x2\n"
+ "ldr s26, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s24, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "ldr s21, [x21], #0x4\n"
"tbz %x[width], #0, 5f\n"
- "ld1 { v30.h }[2], [x27]\n"
- "mov x19, #0x3\n"
- "ld1 { v29.h }[2], [x26]\n"
- "ld1 { v28.h }[2], [x25]\n"
- "ld1 { v21.h }[2], [x24]\n"
- "ld1 { v27.h }[2], [x23]\n"
- "ld1 { v26.h }[2], [x22]\n"
- "ld1 { v25.h }[2], [x21]\n"
- "ld1 { v24.h }[2], [x20]\n"
+ "ld1 { v28.h }[2], [x28]\n"
+ "ld1 { v27.h }[2], [x27]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v26.h }[2], [x26]\n"
+ "ld1 { v25.h }[2], [x25]\n"
+ "ld1 { v24.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v22.h }[2], [x22]\n"
+ "ld1 { v21.h }[2], [x21]\n"
"b 5f\n"
"4:" // odd_loads_1_0
- "ldr h30, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr h29, [x26, #0x0]\n"
- "ldr h28, [x25, #0x0]\n"
- "ldr h21, [x24, #0x0]\n"
- "ldr h27, [x23, #0x0]\n"
- "ldr h26, [x22, #0x0]\n"
- "ldr h25, [x21, #0x0]\n"
- "ldr h24, [x20, #0x0]\n"
+ "ldr h28, [x28, #0x0]\n"
+ "ldr h27, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr h26, [x26, #0x0]\n"
+ "ldr h25, [x25, #0x0]\n"
+ "ldr h24, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h22, [x22, #0x0]\n"
+ "ldr h21, [x21, #0x0]\n"
"5:" // Odd load end
- "fcvtl v30.4s, v30.4h\n"
- "fcvtl v29.4s, v29.4h\n"
"fcvtl v28.4s, v28.4h\n"
- "zip1 v20.4s, v30.4s, v28.4s\n"
- "fcvtl v21.4s, v21.4h\n"
"fcvtl v27.4s, v27.4h\n"
- "zip1 v18.4s, v29.4s, v21.4s\n"
+ "subs x20, x20, #0x1\n"
"fcvtl v26.4s, v26.4h\n"
"fcvtl v25.4s, v25.4h\n"
- "zip1 v23.4s, v20.4s, v18.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "zip1 v19.4s, v27.4s, v25.4s\n"
"fcvtl v24.4s, v24.4h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v17.4s, v26.4s, v24.4s\n"
- "zip1 v16.4s, v19.4s, v17.4s\n"
+ "fcvtl v23.4s, v23.4h\n"
+ "fcvtl v22.4s, v22.4h\n"
+ "fcvtl v21.4s, v21.4h\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v22.4s, v20.4s, v18.4s\n"
- "str q22, [%x[out_ptr], #0x0]\n"
- "zip2 v17.4s, v19.4s, v17.4s\n"
- "subs x19, x19, #0x1\n"
- "str q17, [%x[out_ptr], #0x10]\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v17.4s, v30.4s, v28.4s\n"
- "zip2 v16.4s, v29.4s, v21.4s\n"
- "zip1 v21.4s, v17.4s, v16.4s\n"
- "str q21, [%x[out_ptr], #0x0]\n"
- "zip2 v18.4s, v27.4s, v25.4s\n"
- "zip2 v16.4s, v26.4s, v24.4s\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x10]\n"
+ "zip2 v19.4s, v28.4s, v26.4s\n"
+ "zip2 v16.4s, v27.4s, v25.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"6:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
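
(Editorial note, not part of the patch: this fp16-to-fp32 variant converts each loaded half-vector to single precision with fcvtl before the zip tree, so the interleaved panel is emitted directly in fp32. A scalar sketch of the convert-then-interleave step, assuming the compiler provides _Float16 — AArch64 GCC/Clang do; the function name is illustrative.)

#include <cstddef>

// fp16 -> fp32 interleave reference: convert first (the fcvtl step),
// then lay out element i of all 8 rows contiguously.
void interleave8_fp16_to_fp32_ref(float *out, const _Float16 *const in[8], size_t width)
{
    for (size_t i = 0; i < width; ++i)
        for (int r = 0; r < 8; ++r)
            *out++ = static_cast<float>(in[r][i]); // per-lane fcvtl
}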
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
index eefb8549ea..80c387db47 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,44 +31,45 @@ void interleave_block<8, 1, VLType::None, false>(
)
{
__asm__ __volatile__(
- "ldr x27, [%x[in], #0x0]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "add x28, x28, %x[row_offset], LSL #2\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
"add x27, x27, %x[row_offset], LSL #2\n"
- "ldr x25, [%x[in], #0x10]\n"
- "ldr x24, [%x[in], #0x18]\n"
"add x26, x26, %x[row_offset], LSL #2\n"
- "ldr x23, [%x[in], #0x20]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #2\n"
- "ldr x22, [%x[in], #0x28]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset], LSL #2\n"
- "ldr x20, [%x[in], #0x38]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #2\n"
"add x22, x22, %x[row_offset], LSL #2\n"
"add x21, x21, %x[row_offset], LSL #2\n"
- "add x20, x20, %x[row_offset], LSL #2\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "prfm pldl1keep, [x27, #0x0]\n"
"cmp %x[width], #0x4\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
"prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -76,49 +77,48 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q28, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q18, [x27], #0x10\n"
"subs %x[width], %x[width], #0x4\n"
- "ldr q29, [x26], #0x10\n"
"cmp %x[width], #0x4\n"
- "ldr q25, [x25], #0x10\n"
- "zip1 v22.4s, v28.4s, v25.4s\n"
- "ldr q21, [x24], #0x10\n"
- "zip2 v28.4s, v28.4s, v25.4s\n"
- "ldr q27, [x23], #0x10\n"
- "ldr q26, [x22], #0x10\n"
- "zip1 v20.4s, v29.4s, v21.4s\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v25.4s, v29.4s, v21.4s\n"
- "ldr q24, [x20], #0x10\n"
- "zip1 v23.4s, v22.4s, v20.4s\n"
+ "ldr q17, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v25.4s, v20.4s, v17.4s\n"
+ "zip1 v24.4s, v18.4s, v16.4s\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q23, [x23], #0x10\n"
+ "zip2 v22.4s, v20.4s, v17.4s\n"
+ "zip2 v21.4s, v18.4s, v16.4s\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "zip1 v20.4s, v19.4s, v18.4s\n"
+ "zip1 v17.4s, v23.4s, v16.4s\n"
+ "zip2 v19.4s, v19.4s, v18.4s\n"
+ "zip2 v18.4s, v23.4s, v16.4s\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v22.4s, v22.4s, v20.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "zip1 v21.4s, v28.4s, v25.4s\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v18.4s, v27.4s, v19.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "zip1 v16.4s, v26.4s, v24.4s\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v20.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip2 v20.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip2 v19.4s, v27.4s, v19.4s\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip2 v16.4s, v26.4s, v24.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v19.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x10]\n"
- "zip2 v17.4s, v28.4s, v25.4s\n"
- "str q22, [%x[out_ptr], #0x20]\n"
- "zip2 v16.4s, v19.4s, v16.4s\n"
- "str q20, [%x[out_ptr], #0x30]\n"
- "str q21, [%x[out_ptr], #0x40]\n"
- "str q18, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.4s, v20.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip1 v16.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x40]\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "zip2 v17.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
"str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
@@ -126,69 +126,68 @@ void interleave_block<8, 1, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 6f\n"
"tbz %x[width], #1, 4f\n"
- "ldr d28, [x27], #0x8\n"
- "ldr d29, [x26], #0x8\n"
- "mov x19, #0x2\n"
+ "ldr d28, [x28], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
+ "mov x20, #0x2\n"
+ "ldr d26, [x26], #0x8\n"
"ldr d25, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d26, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
- "ldr d24, [x20], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
"tbz %x[width], #0, 5f\n"
- "ld1 { v28.s }[2], [x27]\n"
- "mov x19, #0x3\n"
- "ld1 { v29.s }[2], [x26]\n"
+ "ld1 { v28.s }[2], [x28]\n"
+ "ld1 { v27.s }[2], [x27]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v26.s }[2], [x26]\n"
"ld1 { v25.s }[2], [x25]\n"
- "ld1 { v21.s }[2], [x24]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "ld1 { v26.s }[2], [x22]\n"
- "ld1 { v19.s }[2], [x21]\n"
- "ld1 { v24.s }[2], [x20]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
"b 5f\n"
"4:" // odd_loads_1_0
- "ldr s28, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr s29, [x26, #0x0]\n"
+ "ldr s28, [x28, #0x0]\n"
+ "ldr s27, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr s26, [x26, #0x0]\n"
"ldr s25, [x25, #0x0]\n"
- "ldr s21, [x24, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "ldr s26, [x22, #0x0]\n"
- "ldr s19, [x21, #0x0]\n"
- "ldr s24, [x20, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
"5:" // Odd load end
- "zip1 v22.4s, v28.4s, v25.4s\n"
- "subs x19, x19, #0x1\n"
- "zip1 v20.4s, v29.4s, v21.4s\n"
- "zip1 v23.4s, v22.4s, v20.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v27.4s, v19.4s\n"
- "zip1 v16.4s, v26.4s, v24.4s\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x10]\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v22.4s, v22.4s, v20.4s\n"
- "str q22, [%x[out_ptr], #0x0]\n"
- "zip2 v20.4s, v18.4s, v16.4s\n"
- "subs x19, x19, #0x1\n"
- "str q20, [%x[out_ptr], #0x10]\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v28.4s, v28.4s, v25.4s\n"
- "zip2 v25.4s, v29.4s, v21.4s\n"
- "zip1 v21.4s, v28.4s, v25.4s\n"
- "str q21, [%x[out_ptr], #0x0]\n"
- "zip2 v19.4s, v27.4s, v19.4s\n"
- "zip2 v16.4s, v26.4s, v24.4s\n"
- "zip1 v18.4s, v19.4s, v16.4s\n"
- "str q18, [%x[out_ptr], #0x10]\n"
+ "zip2 v19.4s, v28.4s, v26.4s\n"
+ "zip2 v16.4s, v27.4s, v25.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"6:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
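
(Editorial note, not part of the patch: when height < 8, the prologue common to these kernels aliases the unused row pointers to row 0 — the "csel ..., GE"/"csel ..., GT" pairs after the height compares — so every load and prefetch in the tail rows stays inside valid memory; the interleaved values from aliased rows are simply never consumed downstream. A sketch of that pointer set-up, with hypothetical names.)

#include <cstddef>

// Mirror of the csel prologue: rows beyond 'height' reuse the row-0
// pointer so all loads/prefetches remain in bounds.
template <typename T>
void setup_row_pointers(const T *rows[8], const T *const in[8],
                        size_t height, size_t row_offset)
{
    for (int r = 0; r < 8; ++r)
    {
        const T *base = (static_cast<size_t>(r) < height) ? in[r] : in[0];
        rows[r] = base + row_offset; // the asm scales row_offset via LSL
    }
}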
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
index b0523b96ce..8e06b7ecab 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,44 +31,45 @@ void interleave_block<8, 1, VLType::None, false>(
)
{
__asm__ __volatile__(
- "ldr x27, [%x[in], #0x0]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "add x28, x28, %x[row_offset], LSL #1\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
"add x27, x27, %x[row_offset], LSL #1\n"
- "ldr x25, [%x[in], #0x10]\n"
- "ldr x24, [%x[in], #0x18]\n"
"add x26, x26, %x[row_offset], LSL #1\n"
- "ldr x23, [%x[in], #0x20]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #1\n"
- "ldr x22, [%x[in], #0x28]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset], LSL #1\n"
- "ldr x20, [%x[in], #0x38]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #1\n"
"add x22, x22, %x[row_offset], LSL #1\n"
"add x21, x21, %x[row_offset], LSL #1\n"
- "add x20, x20, %x[row_offset], LSL #1\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "prfm pldl1keep, [x27, #0x0]\n"
"cmp %x[width], #0x8\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
"prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -76,193 +77,191 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q30, [x27], #0x10\n"
+ "ldr q25, [x28], #0x10\n"
+ "ldr q27, [x27], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
- "ldr q29, [x26], #0x10\n"
"cmp %x[width], #0x8\n"
- "ldr q28, [x25], #0x10\n"
- "ldr q27, [x24], #0x10\n"
- "ldr q25, [x23], #0x10\n"
- "zip1 v26.8h, v30.8h, v25.8h\n"
- "ldr q21, [x22], #0x10\n"
- "zip2 v25.8h, v30.8h, v25.8h\n"
- "ldr q24, [x21], #0x10\n"
- "ldr q23, [x20], #0x10\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
+ "ldr q26, [x26], #0x10\n"
+ "ldr q24, [x25], #0x10\n"
+ "ldr q21, [x24], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip1 v23.8h, v25.8h, v21.8h\n"
+ "zip1 v22.8h, v27.8h, v20.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "zip1 v19.8h, v26.8h, v17.8h\n"
+ "zip1 v18.8h, v24.8h, v16.8h\n"
+ "zip2 v25.8h, v25.8h, v21.8h\n"
+ "zip2 v21.8h, v26.8h, v17.8h\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v20.8h, v27.8h, v20.8h\n"
+ "zip2 v16.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "zip1 v20.8h, v28.8h, v24.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v18.8h, v26.8h, v20.8h\n"
+ "zip1 v24.8h, v23.8h, v19.8h\n"
+ "zip1 v17.8h, v22.8h, v18.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "zip1 v19.8h, v27.8h, v23.8h\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v16.8h, v22.8h, v19.8h\n"
+ "zip2 v23.8h, v23.8h, v19.8h\n"
+ "zip2 v19.8h, v22.8h, v18.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip1 v17.8h, v18.8h, v16.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip2 v16.8h, v18.8h, v16.8h\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip2 v18.8h, v26.8h, v20.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v22.8h, v25.8h, v21.8h\n"
+ "zip1 v18.8h, v20.8h, v16.8h\n"
+ "zip2 v21.8h, v25.8h, v21.8h\n"
+ "zip2 v20.8h, v20.8h, v16.8h\n"
+ "zip1 v16.8h, v24.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.8h, v24.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x20]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v17.8h, v23.8h, v19.8h\n"
+ "zip2 v16.8h, v23.8h, v19.8h\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip1 v19.8h, v22.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v18.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "zip2 v20.8h, v28.8h, v24.8h\n"
- "zip1 v18.8h, v25.8h, v20.8h\n"
- "zip2 v19.8h, v27.8h, v23.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x40]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v18.8h, v25.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x60]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v17.8h, v21.8h, v20.8h\n"
+ "zip2 v16.8h, v21.8h, v20.8h\n"
+ "str q19, [%x[out_ptr], #0x40]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr d30, [x27], #0x8\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d27, [x24], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
+ "ldr d29, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d26, [x24], #0x8\n"
"ldr d25, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d24, [x21], #0x8\n"
- "ldr d23, [x20], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v30.s }[2], [x27], #0x4\n"
- "mov x19, #0x6\n"
- "ld1 { v29.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v27.s }[2], [x24], #0x4\n"
+ "ld1 { v30.s }[2], [x28], #0x4\n"
+ "ld1 { v29.s }[2], [x27], #0x4\n"
+ "mov x20, #0x6\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x24], #0x4\n"
"ld1 { v25.s }[2], [x23], #0x4\n"
- "ld1 { v21.s }[2], [x22], #0x4\n"
- "ld1 { v24.s }[2], [x21], #0x4\n"
- "ld1 { v23.s }[2], [x20], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v23.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v30.h }[6], [x27]\n"
- "mov x19, #0x7\n"
- "ld1 { v29.h }[6], [x26]\n"
- "ld1 { v28.h }[6], [x25]\n"
- "ld1 { v27.h }[6], [x24]\n"
+ "ld1 { v30.h }[6], [x28]\n"
+ "ld1 { v29.h }[6], [x27]\n"
+ "mov x20, #0x7\n"
+ "ld1 { v28.h }[6], [x26]\n"
+ "ld1 { v27.h }[6], [x25]\n"
+ "ld1 { v26.h }[6], [x24]\n"
"ld1 { v25.h }[6], [x23]\n"
- "ld1 { v21.h }[6], [x22]\n"
- "ld1 { v24.h }[6], [x21]\n"
- "ld1 { v23.h }[6], [x20]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "ld1 { v23.h }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
- "mov x19, #0x4\n"
+ "mov x20, #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v30.h }[4], [x27]\n"
- "ld1 { v29.h }[4], [x26]\n"
- "mov x19, #0x5\n"
- "ld1 { v28.h }[4], [x25]\n"
- "ld1 { v27.h }[4], [x24]\n"
+ "ld1 { v30.h }[4], [x28]\n"
+ "ld1 { v29.h }[4], [x27]\n"
+ "mov x20, #0x5\n"
+ "ld1 { v28.h }[4], [x26]\n"
+ "ld1 { v27.h }[4], [x25]\n"
+ "ld1 { v26.h }[4], [x24]\n"
"ld1 { v25.h }[4], [x23]\n"
- "ld1 { v21.h }[4], [x22]\n"
- "ld1 { v24.h }[4], [x21]\n"
- "ld1 { v23.h }[4], [x20]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "ld1 { v23.h }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "mov x19, #0x2\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s27, [x24], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
+ "mov x20, #0x2\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
"ldr s25, [x23], #0x4\n"
- "ldr s21, [x22], #0x4\n"
- "ldr s24, [x21], #0x4\n"
- "ldr s23, [x20], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v30.h }[2], [x27]\n"
- "mov x19, #0x3\n"
- "ld1 { v29.h }[2], [x26]\n"
- "ld1 { v28.h }[2], [x25]\n"
- "ld1 { v27.h }[2], [x24]\n"
+ "ld1 { v30.h }[2], [x28]\n"
+ "ld1 { v29.h }[2], [x27]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v28.h }[2], [x26]\n"
+ "ld1 { v27.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x24]\n"
"ld1 { v25.h }[2], [x23]\n"
- "ld1 { v21.h }[2], [x22]\n"
- "ld1 { v24.h }[2], [x21]\n"
- "ld1 { v23.h }[2], [x20]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v23.h }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr h30, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr h29, [x26, #0x0]\n"
- "ldr h28, [x25, #0x0]\n"
- "ldr h27, [x24, #0x0]\n"
+ "ldr h30, [x28, #0x0]\n"
+ "ldr h29, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr h28, [x26, #0x0]\n"
+ "ldr h27, [x25, #0x0]\n"
+ "ldr h26, [x24, #0x0]\n"
"ldr h25, [x23, #0x0]\n"
- "ldr h21, [x22, #0x0]\n"
- "ldr h24, [x21, #0x0]\n"
- "ldr h23, [x20, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h23, [x21, #0x0]\n"
"7:" // Odd load end
- "zip1 v26.8h, v30.8h, v25.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v20.8h, v28.8h, v24.8h\n"
- "zip1 v18.8h, v26.8h, v20.8h\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
"zip1 v19.8h, v27.8h, v23.8h\n"
- "zip1 v16.8h, v22.8h, v19.8h\n"
- "zip1 v17.8h, v18.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v18.8h, v16.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v26.8h, v20.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "subs x20, x20, #0x1\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
+ "subs x20, x20, #0x1\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v25.8h, v30.8h, v25.8h\n"
- "zip2 v20.8h, v28.8h, v24.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v18.8h, v25.8h, v20.8h\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
"zip2 v19.8h, v27.8h, v23.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
+ "subs x20, x20, #0x1\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v25.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"8:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
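
(Editorial note, not part of the patch: the odd-load ladders decode the leftover width bit by bit — tbz on bit 2 selects a 4-element d-register load, bit 1 a 2-element load, bit 0 a single-element load — and the surviving element count, kept in x20 after this patch, drives how many interleaved q-rows are stored. A sketch of the same decomposition for 16-bit elements; names are illustrative.)

#include <cstddef>
#include <cstdint>

// Decompose a tail of 1..7 elements the way the tbz ladder does:
// one 4-, one 2- and one 1-element chunk, largest first.
size_t load_tail(uint16_t *dst, const uint16_t *&src, size_t width)
{
    size_t n = 0;
    if (width & 4) { for (int i = 0; i < 4; ++i) dst[n++] = *src++; } // "ldr d..., #0x8"
    if (width & 2) { for (int i = 0; i < 2; ++i) dst[n++] = *src++; } // "ldr s..." / "ld1 {v.s}[2]"
    if (width & 1) { dst[n++] = *src++; }                            // "ldr h..." / "ld1 {v.h}[...]"
    return n; // element count, tracked in x20 by the asm
}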
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
index 292a38f401..b91ae8a948 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,39 +31,40 @@ void interleave_block<8, 1, VLType::None, true>(
)
{
__asm__ __volatile__(
- "movi v1.8h, #0x0\n"
- "ldr x27, [%x[in], #0x0]\n"
- "mov x19, #0x0\n"
- "movi v0.4s, #0x0\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "movi v31.4s, #0x0\n"
- "ldr x25, [%x[in], #0x10]\n"
+ "mov x20, #0x0\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
+ "movi v2.8h, #0x0\n"
+ "movi v1.4s, #0x0\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
+ "movi v0.4s, #0x0\n"
+ "add x28, x28, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x27, x27, %x[row_offset], LSL #1\n"
- "ldr x24, [%x[in], #0x18]\n"
- "ldr x23, [%x[in], #0x20]\n"
"add x26, x26, %x[row_offset], LSL #1\n"
- "ldr x22, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #1\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset], LSL #1\n"
- "ldr x20, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #1\n"
"add x22, x22, %x[row_offset], LSL #1\n"
"add x21, x21, %x[row_offset], LSL #1\n"
- "add x20, x20, %x[row_offset], LSL #1\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
+ "prfm pldl1keep, [x28, #0x0]\n"
"prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
@@ -71,7 +72,7 @@ void interleave_block<8, 1, VLType::None, true>(
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -79,226 +80,225 @@ void interleave_block<8, 1, VLType::None, true>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"cbnz %w[first], 2f\n"
"sub %x[out_ptr], %x[out_ptr], #0x20\n"
- "ld1 { v0.4s }, [%x[out_ptr]]\n"
- "ldr q31, [%x[out_ptr], #0x10]\n"
+ "ld1 { v1.4s }, [%x[out_ptr]]\n"
+ "ldr q0, [%x[out_ptr], #0x10]\n"
"2:" // first_pass
"cmp %x[width], #0x8\n"
"blt 5f\n"
"3:" // Main loop head
- "cmp x19, #0xe\n"
+ "cmp x20, #0xe\n"
"ble 4f\n"
- "saddw v0.4s, v0.4s, v1.4h\n"
- "saddw2 v31.4s, v31.4s, v1.8h\n"
- "mov x19, #0x0\n"
- "movi v1.8h, #0x0\n"
+ "saddw v1.4s, v1.4s, v2.4h\n"
+ "saddw2 v0.4s, v0.4s, v2.8h\n"
+ "mov x20, #0x0\n"
+ "movi v2.8h, #0x0\n"
"4:" // no_accumulate_16
+ "ldr q31, [x28], #0x10\n"
"ldr q30, [x27], #0x10\n"
- "add x19, x19, #0x1\n"
- "ldr q29, [x26], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
- "ldr q28, [x25], #0x10\n"
"cmp %x[width], #0x8\n"
+ "ldr q29, [x26], #0x10\n"
+ "ldr q28, [x25], #0x10\n"
+ "add x20, x20, #0x1\n"
"ldr q27, [x24], #0x10\n"
- "ldr q25, [x23], #0x10\n"
- "zip1 v26.8h, v30.8h, v25.8h\n"
- "ldr q21, [x22], #0x10\n"
- "zip2 v25.8h, v30.8h, v25.8h\n"
- "ldr q24, [x21], #0x10\n"
- "ldr q23, [x20], #0x10\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
+ "ldr q26, [x23], #0x10\n"
+ "zip1 v25.8h, v31.8h, v27.8h\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "ldr q24, [x22], #0x10\n"
+ "ldr q23, [x21], #0x10\n"
+ "zip1 v18.8h, v29.8h, v24.8h\n"
+ "zip1 v21.8h, v28.8h, v23.8h\n"
+ "zip1 v17.8h, v25.8h, v18.8h\n"
+ "zip1 v16.8h, v22.8h, v21.8h\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "add v2.8h, v2.8h, v20.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "zip1 v20.8h, v28.8h, v24.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v18.8h, v26.8h, v20.8h\n"
+ "zip2 v19.8h, v17.8h, v16.8h\n"
+ "zip2 v18.8h, v25.8h, v18.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "zip1 v19.8h, v27.8h, v23.8h\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v16.8h, v22.8h, v19.8h\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "add v2.8h, v2.8h, v19.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip1 v17.8h, v18.8h, v16.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "add v1.8h, v1.8h, v17.8h\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip2 v16.8h, v18.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip2 v18.8h, v26.8h, v20.8h\n"
- "str q16, [%x[out_ptr], #0x10]\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v22.8h, v31.8h, v27.8h\n"
+ "str q20, [%x[out_ptr], #0x0]\n"
+ "zip2 v21.8h, v29.8h, v24.8h\n"
+ "zip2 v20.8h, v30.8h, v26.8h\n"
+ "str q19, [%x[out_ptr], #0x10]\n"
+ "zip2 v19.8h, v28.8h, v23.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x20]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "zip2 v20.8h, v28.8h, v24.8h\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip1 v18.8h, v25.8h, v20.8h\n"
- "zip2 v19.8h, v27.8h, v23.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x40]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
"str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v18.8h, v25.8h, v20.8h\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x60]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x70]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
- "add v1.8h, v1.8h, v16.8h\n"
"bge 3b\n"
"5:" // Main loop skip
"cbz %x[width], 10f\n"
"tbz %x[width], #2, 7f\n"
- "ldr d30, [x27], #0x8\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d27, [x24], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
+ "ldr d29, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d26, [x24], #0x8\n"
"ldr d25, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d24, [x21], #0x8\n"
- "ldr d23, [x20], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v30.s }[2], [x27], #0x4\n"
- "mov x19, #0x6\n"
- "ld1 { v29.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v27.s }[2], [x24], #0x4\n"
+ "ld1 { v30.s }[2], [x28], #0x4\n"
+ "ld1 { v29.s }[2], [x27], #0x4\n"
+ "mov x20, #0x6\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x24], #0x4\n"
"ld1 { v25.s }[2], [x23], #0x4\n"
- "ld1 { v21.s }[2], [x22], #0x4\n"
- "ld1 { v24.s }[2], [x21], #0x4\n"
- "ld1 { v23.s }[2], [x20], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v23.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v30.h }[6], [x27]\n"
- "mov x19, #0x7\n"
- "ld1 { v29.h }[6], [x26]\n"
- "ld1 { v28.h }[6], [x25]\n"
- "ld1 { v27.h }[6], [x24]\n"
+ "ld1 { v30.h }[6], [x28]\n"
+ "ld1 { v29.h }[6], [x27]\n"
+ "mov x20, #0x7\n"
+ "ld1 { v28.h }[6], [x26]\n"
+ "ld1 { v27.h }[6], [x25]\n"
+ "ld1 { v26.h }[6], [x24]\n"
"ld1 { v25.h }[6], [x23]\n"
- "ld1 { v21.h }[6], [x22]\n"
- "ld1 { v24.h }[6], [x21]\n"
- "ld1 { v23.h }[6], [x20]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "ld1 { v23.h }[6], [x21]\n"
"b 9f\n"
"6:" // odd_loads_1_4
- "mov x19, #0x4\n"
+ "mov x20, #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v30.h }[4], [x27]\n"
- "ld1 { v29.h }[4], [x26]\n"
- "mov x19, #0x5\n"
- "ld1 { v28.h }[4], [x25]\n"
- "ld1 { v27.h }[4], [x24]\n"
+ "ld1 { v30.h }[4], [x28]\n"
+ "ld1 { v29.h }[4], [x27]\n"
+ "mov x20, #0x5\n"
+ "ld1 { v28.h }[4], [x26]\n"
+ "ld1 { v27.h }[4], [x25]\n"
+ "ld1 { v26.h }[4], [x24]\n"
"ld1 { v25.h }[4], [x23]\n"
- "ld1 { v21.h }[4], [x22]\n"
- "ld1 { v24.h }[4], [x21]\n"
- "ld1 { v23.h }[4], [x20]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "ld1 { v23.h }[4], [x21]\n"
"b 9f\n"
"7:" // odd_loads_2_0
"tbz %x[width], #1, 8f\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "mov x19, #0x2\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s27, [x24], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
+ "mov x20, #0x2\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
"ldr s25, [x23], #0x4\n"
- "ldr s21, [x22], #0x4\n"
- "ldr s24, [x21], #0x4\n"
- "ldr s23, [x20], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v30.h }[2], [x27]\n"
- "mov x19, #0x3\n"
- "ld1 { v29.h }[2], [x26]\n"
- "ld1 { v28.h }[2], [x25]\n"
- "ld1 { v27.h }[2], [x24]\n"
+ "ld1 { v30.h }[2], [x28]\n"
+ "ld1 { v29.h }[2], [x27]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v28.h }[2], [x26]\n"
+ "ld1 { v27.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x24]\n"
"ld1 { v25.h }[2], [x23]\n"
- "ld1 { v21.h }[2], [x22]\n"
- "ld1 { v24.h }[2], [x21]\n"
- "ld1 { v23.h }[2], [x20]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v23.h }[2], [x21]\n"
"b 9f\n"
"8:" // odd_loads_1_0
- "ldr h30, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr h29, [x26, #0x0]\n"
- "ldr h28, [x25, #0x0]\n"
- "ldr h27, [x24, #0x0]\n"
+ "ldr h30, [x28, #0x0]\n"
+ "ldr h29, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr h28, [x26, #0x0]\n"
+ "ldr h27, [x25, #0x0]\n"
+ "ldr h26, [x24, #0x0]\n"
"ldr h25, [x23, #0x0]\n"
- "ldr h21, [x22, #0x0]\n"
- "ldr h24, [x21, #0x0]\n"
- "ldr h23, [x20, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h23, [x21, #0x0]\n"
"9:" // Odd load end
- "zip1 v26.8h, v30.8h, v25.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v20.8h, v28.8h, v24.8h\n"
- "zip1 v18.8h, v26.8h, v20.8h\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
"zip1 v19.8h, v27.8h, v23.8h\n"
- "zip1 v16.8h, v22.8h, v19.8h\n"
- "zip1 v17.8h, v18.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v17.8h\n"
"beq 10f\n"
- "zip2 v16.8h, v18.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v26.8h, v20.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "subs x20, x20, #0x1\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v25.8h, v30.8h, v25.8h\n"
- "zip2 v20.8h, v28.8h, v24.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v18.8h, v25.8h, v20.8h\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
"zip2 v19.8h, v27.8h, v23.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v25.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v16.8h\n"
"10:" // Odds skip
- "saddw v0.4s, v0.4s, v1.4h\n"
- "str q0, [%x[out_ptr], #0x0]\n"
- "saddw2 v31.4s, v31.4s, v1.8h\n"
- "str q31, [%x[out_ptr], #0x10]\n"
+ "saddw v1.4s, v1.4s, v2.4h\n"
+ "saddw2 v0.4s, v0.4s, v2.8h\n"
+ "str q1, [%x[out_ptr], #0x0]\n"
+ "str q0, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
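
For reference, a scalar C++ model of the summing interleave kernel above (a sketch of the assumed semantics, not library code; `in`, `out`, `width`, `height`, `row_offset` and `first` mirror the asm operands). Eight row pointers are clamped so short heights stay in bounds, the rows are written column-major eight values at a time, and int32 per-row sums are appended after the panel. The kernel keeps 16-bit partial sums and widens them periodically; the model simply sums in 32 bits:

#include <cstdint>

static void interleave8_s16_sum_ref(const int16_t* const* in, int16_t*& out,
                                    int width, int height, int row_offset,
                                    bool first)
{
    const int16_t* row[8];
    for (int r = 0; r < 8; ++r)
        row[r] = in[r < height ? r : 0] + row_offset;  // csel pointer clamp

    int32_t sums[8] = {};
    if (!first) {                       // cbnz %w[first]: resume and reload
        out -= 16;                      // rewind 0x20 bytes over the old sums
        for (int r = 0; r < 8; ++r)
            sums[r] = reinterpret_cast<int32_t*>(out)[r];
    }
    for (int c = 0; c < width; ++c)     // one 8-element column group per c
        for (int r = 0; r < 8; ++r) {
            sums[r] += row[r][c];       // saddw/saddw2 accumulation
            *out++ = row[r][c];
        }
    for (int r = 0; r < 8; ++r)         // "10: Odds skip" epilogue
        reinterpret_cast<int32_t*>(out)[r] = sums[r];
    out += 16;                          // step past the 0x20-byte sum block
}
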
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
index 6cfed8f3a4..c41120c698 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,44 +31,45 @@ void interleave_block<8, 1, VLType::None, false>(
)
{
__asm__ __volatile__(
- "ldr x27, [%x[in], #0x0]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "add x28, x28, %x[row_offset]\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
"add x27, x27, %x[row_offset]\n"
- "ldr x25, [%x[in], #0x10]\n"
- "ldr x24, [%x[in], #0x18]\n"
"add x26, x26, %x[row_offset]\n"
- "ldr x23, [%x[in], #0x20]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset]\n"
- "ldr x22, [%x[in], #0x28]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset]\n"
- "ldr x20, [%x[in], #0x38]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
- "add x20, x20, %x[row_offset]\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "prfm pldl1keep, [x27, #0x0]\n"
"cmp %x[width], #0x8\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
"prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -76,209 +77,207 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr d31, [x27], #0x8\n"
- "sshll v31.8h, v31.8b, #0x0\n"
- "ldr d30, [x26], #0x8\n"
+ "ldr d25, [x28], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
+ "sshll v25.8h, v25.8b, #0x0\n"
+ "sshll v27.8h, v27.8b, #0x0\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "sshll v26.8h, v26.8b, #0x0\n"
+ "sshll v24.8h, v24.8b, #0x0\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "sshll v21.8h, v21.8b, #0x0\n"
+ "sshll v20.8h, v20.8b, #0x0\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
+ "sshll v17.8h, v17.8b, #0x0\n"
+ "sshll v16.8h, v16.8b, #0x0\n"
+ "zip1 v23.8h, v25.8h, v21.8h\n"
+ "zip1 v22.8h, v26.8h, v17.8h\n"
"subs %x[width], %x[width], #0x8\n"
- "sshll v30.8h, v30.8b, #0x0\n"
- "ldr d29, [x25], #0x8\n"
"cmp %x[width], #0x8\n"
- "sshll v29.8h, v29.8b, #0x0\n"
- "ldr d28, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "sshll v28.8h, v28.8b, #0x0\n"
- "ldr d23, [x22], #0x8\n"
- "sshll v25.8h, v25.8b, #0x0\n"
- "ldr d27, [x21], #0x8\n"
- "zip1 v20.8h, v31.8h, v25.8h\n"
- "ldr d26, [x20], #0x8\n"
- "zip2 v25.8h, v31.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v20.8h\n"
+ "zip1 v18.8h, v24.8h, v16.8h\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
+ "zip2 v25.8h, v25.8h, v21.8h\n"
+ "zip2 v21.8h, v26.8h, v17.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "sshll v23.8h, v23.8b, #0x0\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v24.8h, v30.8h, v23.8h\n"
- "zip2 v23.8h, v30.8h, v23.8h\n"
+ "zip2 v20.8h, v27.8h, v20.8h\n"
+ "zip2 v16.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "sshll v27.8h, v27.8b, #0x0\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v19.8h, v29.8h, v27.8h\n"
+ "zip1 v24.8h, v23.8h, v22.8h\n"
+ "zip1 v17.8h, v19.8h, v18.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip1 v22.8h, v20.8h, v19.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip2 v21.8h, v20.8h, v19.8h\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip2 v19.8h, v29.8h, v27.8h\n"
- "zip1 v20.8h, v25.8h, v19.8h\n"
- "zip2 v19.8h, v25.8h, v19.8h\n"
- "sshll v26.8h, v26.8b, #0x0\n"
- "zip1 v18.8h, v28.8h, v26.8h\n"
- "zip1 v17.8h, v24.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v17.8h\n"
+ "zip2 v23.8h, v23.8h, v22.8h\n"
+ "zip2 v19.8h, v19.8h, v18.8h\n"
+ "zip1 v22.8h, v25.8h, v21.8h\n"
+ "zip1 v18.8h, v20.8h, v16.8h\n"
+ "zip2 v21.8h, v25.8h, v21.8h\n"
+ "zip2 v20.8h, v20.8h, v16.8h\n"
+ "zip1 v16.8h, v24.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v16.8h, v22.8h, v17.8h\n"
+ "zip2 v16.8h, v24.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v17.8h, v24.8h, v18.8h\n"
- "zip1 v16.8h, v21.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x20]\n"
- "zip2 v16.8h, v21.8h, v17.8h\n"
+ "zip1 v17.8h, v23.8h, v19.8h\n"
+ "zip2 v16.8h, v23.8h, v19.8h\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip1 v19.8h, v22.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v18.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "zip2 v18.8h, v28.8h, v26.8h\n"
- "zip1 v17.8h, v23.8h, v18.8h\n"
- "zip1 v16.8h, v20.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x40]\n"
- "zip2 v16.8h, v20.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v17.8h, v23.8h, v18.8h\n"
- "zip1 v16.8h, v19.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x60]\n"
- "zip2 v16.8h, v19.8h, v17.8h\n"
+ "zip1 v17.8h, v21.8h, v20.8h\n"
+ "zip2 v16.8h, v21.8h, v20.8h\n"
+ "str q19, [%x[out_ptr], #0x40]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr s31, [x27], #0x4\n"
- "ldr s30, [x26], #0x4\n"
- "ldr s29, [x25], #0x4\n"
- "ldr s28, [x24], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
"ldr s25, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
- "ldr s27, [x21], #0x4\n"
- "ldr s26, [x20], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "mov x19, #0x6\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
+ "mov x20, #0x6\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v26.h }[2], [x24], #0x2\n"
"ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v23.h }[2], [x22], #0x2\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v31.b }[6], [x27]\n"
- "mov x19, #0x7\n"
- "ld1 { v30.b }[6], [x26]\n"
- "ld1 { v29.b }[6], [x25]\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x27]\n"
+ "mov x20, #0x7\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v26.b }[6], [x24]\n"
"ld1 { v25.b }[6], [x23]\n"
- "ld1 { v23.b }[6], [x22]\n"
- "ld1 { v27.b }[6], [x21]\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
- "mov x19, #0x4\n"
+ "mov x20, #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v31.b }[4], [x27]\n"
- "ld1 { v30.b }[4], [x26]\n"
- "mov x19, #0x5\n"
- "ld1 { v29.b }[4], [x25]\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x27]\n"
+ "mov x20, #0x5\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v26.b }[4], [x24]\n"
"ld1 { v25.b }[4], [x23]\n"
- "ld1 { v23.b }[4], [x22]\n"
- "ld1 { v27.b }[4], [x21]\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr h31, [x27], #0x2\n"
- "ldr h30, [x26], #0x2\n"
- "mov x19, #0x2\n"
- "ldr h29, [x25], #0x2\n"
- "ldr h28, [x24], #0x2\n"
+ "ldr h30, [x28], #0x2\n"
+ "ldr h29, [x27], #0x2\n"
+ "mov x20, #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h27, [x25], #0x2\n"
+ "ldr h26, [x24], #0x2\n"
"ldr h25, [x23], #0x2\n"
- "ldr h23, [x22], #0x2\n"
- "ldr h27, [x21], #0x2\n"
- "ldr h26, [x20], #0x2\n"
+ "ldr h24, [x22], #0x2\n"
+ "ldr h23, [x21], #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v31.b }[2], [x27]\n"
- "mov x19, #0x3\n"
- "ld1 { v30.b }[2], [x26]\n"
- "ld1 { v29.b }[2], [x25]\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x27]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v26.b }[2], [x24]\n"
"ld1 { v25.b }[2], [x23]\n"
- "ld1 { v23.b }[2], [x22]\n"
- "ld1 { v27.b }[2], [x21]\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr b31, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr b30, [x26, #0x0]\n"
- "ldr b29, [x25, #0x0]\n"
- "ldr b28, [x24, #0x0]\n"
+ "ldr b30, [x28, #0x0]\n"
+ "ldr b29, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b27, [x25, #0x0]\n"
+ "ldr b26, [x24, #0x0]\n"
"ldr b25, [x23, #0x0]\n"
- "ldr b23, [x22, #0x0]\n"
- "ldr b27, [x21, #0x0]\n"
- "ldr b26, [x20, #0x0]\n"
+ "ldr b24, [x22, #0x0]\n"
+ "ldr b23, [x21, #0x0]\n"
"7:" // Odd load end
- "sshll v31.8h, v31.8b, #0x0\n"
- "subs x19, x19, #0x1\n"
"sshll v30.8h, v30.8b, #0x0\n"
"sshll v29.8h, v29.8b, #0x0\n"
+ "subs x20, x20, #0x1\n"
"sshll v28.8h, v28.8b, #0x0\n"
- "sshll v25.8h, v25.8b, #0x0\n"
- "zip1 v20.8h, v31.8h, v25.8h\n"
- "sshll v23.8h, v23.8b, #0x0\n"
- "zip1 v24.8h, v30.8h, v23.8h\n"
"sshll v27.8h, v27.8b, #0x0\n"
- "zip1 v19.8h, v29.8h, v27.8h\n"
- "zip1 v22.8h, v20.8h, v19.8h\n"
"sshll v26.8h, v26.8b, #0x0\n"
- "zip1 v18.8h, v28.8h, v26.8h\n"
- "zip1 v17.8h, v24.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v17.8h\n"
+ "sshll v25.8h, v25.8b, #0x0\n"
+ "sshll v24.8h, v24.8b, #0x0\n"
+ "sshll v23.8h, v23.8b, #0x0\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v22.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v21.8h, v20.8h, v19.8h\n"
- "zip2 v17.8h, v24.8h, v18.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v16.8h, v21.8h, v17.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v21.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v25.8h, v31.8h, v25.8h\n"
- "zip2 v19.8h, v29.8h, v27.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v20.8h, v25.8h, v19.8h\n"
- "zip2 v23.8h, v30.8h, v23.8h\n"
- "zip2 v18.8h, v28.8h, v26.8h\n"
- "zip1 v17.8h, v23.8h, v18.8h\n"
- "zip1 v16.8h, v20.8h, v17.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v20.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v19.8h, v25.8h, v19.8h\n"
- "zip2 v17.8h, v23.8h, v18.8h\n"
- "zip1 v16.8h, v19.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"8:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
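
The store order in the main loop above falls out of three rounds of zip1/zip2, a log2(8)-depth interleave network: the rows are widened with sshll, zipped in pairs (0,4), (1,5), (2,6), (3,7), the results zipped again, and a final round leaves each vector holding one column from all eight rows. A sketch of one round on plain arrays (names are illustrative):

#include <cstdint>

// One zip round: zip1 takes the low halves of a and b alternately, zip2 the
// high halves, matching the AArch64 zip1/zip2 on .8h vectors.
static void zip_round(const int16_t a[8], const int16_t b[8],
                      int16_t lo[8], int16_t hi[8])
{
    for (int i = 0; i < 4; ++i) {
        lo[2 * i]     = a[i];      // zip1
        lo[2 * i + 1] = b[i];
        hi[2 * i]     = a[i + 4];  // zip2
        hi[2 * i + 1] = b[i + 4];
    }
}

Applying zip_round three times across the eight widened rows yields vectors that each hold rows 0-7 of a single source column, which is why the kernel can store eight q-registers back to back.
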
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
index b710861417..9ac7053ad8 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,39 +31,40 @@ void interleave_block<8, 1, VLType::None, true>(
)
{
__asm__ __volatile__(
- "movi v1.8h, #0x0\n"
- "ldr x27, [%x[in], #0x0]\n"
- "mov x19, #0x0\n"
- "movi v0.4s, #0x0\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "movi v31.4s, #0x0\n"
- "ldr x25, [%x[in], #0x10]\n"
+ "mov x20, #0x0\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
+ "movi v2.8h, #0x0\n"
+ "movi v1.4s, #0x0\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
+ "movi v0.4s, #0x0\n"
+ "add x28, x28, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x27, x27, %x[row_offset]\n"
- "ldr x24, [%x[in], #0x18]\n"
- "ldr x23, [%x[in], #0x20]\n"
"add x26, x26, %x[row_offset]\n"
- "ldr x22, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset]\n"
- "ldr x20, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
- "add x20, x20, %x[row_offset]\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
+ "prfm pldl1keep, [x28, #0x0]\n"
"prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
@@ -71,7 +72,7 @@ void interleave_block<8, 1, VLType::None, true>(
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -79,242 +80,241 @@ void interleave_block<8, 1, VLType::None, true>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"cbnz %w[first], 2f\n"
"sub %x[out_ptr], %x[out_ptr], #0x20\n"
- "ld1 { v0.4s }, [%x[out_ptr]]\n"
- "ldr q31, [%x[out_ptr], #0x10]\n"
+ "ld1 { v1.4s }, [%x[out_ptr]]\n"
+ "ldr q0, [%x[out_ptr], #0x10]\n"
"2:" // first_pass
"cmp %x[width], #0x8\n"
"blt 5f\n"
"3:" // Main loop head
- "cmp x19, #0xe\n"
+ "cmp x20, #0xe\n"
"ble 4f\n"
- "saddw v0.4s, v0.4s, v1.4h\n"
- "saddw2 v31.4s, v31.4s, v1.8h\n"
- "mov x19, #0x0\n"
- "movi v1.8h, #0x0\n"
+ "saddw v1.4s, v1.4s, v2.4h\n"
+ "saddw2 v0.4s, v0.4s, v2.8h\n"
+ "mov x20, #0x0\n"
+ "movi v2.8h, #0x0\n"
"4:" // no_accumulate_16
+ "ldr d31, [x28], #0x8\n"
"ldr d30, [x27], #0x8\n"
+ "sshll v31.8h, v31.8b, #0x0\n"
"sshll v30.8h, v30.8b, #0x0\n"
"ldr d29, [x26], #0x8\n"
- "add x19, x19, #0x1\n"
- "sshll v29.8h, v29.8b, #0x0\n"
"ldr d28, [x25], #0x8\n"
- "subs %x[width], %x[width], #0x8\n"
+ "sshll v29.8h, v29.8b, #0x0\n"
"sshll v28.8h, v28.8b, #0x0\n"
"ldr d27, [x24], #0x8\n"
- "cmp %x[width], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
"sshll v27.8h, v27.8b, #0x0\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
+ "sshll v26.8h, v26.8b, #0x0\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "sshll v25.8h, v25.8b, #0x0\n"
"sshll v24.8h, v24.8b, #0x0\n"
- "ldr d21, [x21], #0x8\n"
- "sshll v23.8h, v23.8b, #0x0\n"
- "ldr d26, [x20], #0x8\n"
- "zip1 v20.8h, v30.8h, v24.8h\n"
+ "zip1 v23.8h, v31.8h, v27.8h\n"
+ "zip1 v22.8h, v29.8h, v25.8h\n"
+ "subs %x[width], %x[width], #0x8\n"
+ "cmp %x[width], #0x8\n"
+ "zip1 v21.8h, v30.8h, v26.8h\n"
+ "zip1 v20.8h, v28.8h, v24.8h\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip1 v25.8h, v29.8h, v23.8h\n"
+ "zip1 v18.8h, v23.8h, v22.8h\n"
+ "zip1 v17.8h, v21.8h, v20.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "zip2 v24.8h, v30.8h, v24.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip2 v23.8h, v29.8h, v23.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "sshll v21.8h, v21.8b, #0x0\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v19.8h, v28.8h, v21.8h\n"
+ "zip2 v19.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v23.8h, v22.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip1 v22.8h, v20.8h, v19.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip2 v19.8h, v20.8h, v19.8h\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip2 v20.8h, v28.8h, v21.8h\n"
- "zip1 v21.8h, v24.8h, v20.8h\n"
- "zip2 v20.8h, v24.8h, v20.8h\n"
- "sshll v26.8h, v26.8b, #0x0\n"
- "zip1 v18.8h, v27.8h, v26.8h\n"
- "zip1 v17.8h, v25.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v17.8h\n"
+ "zip2 v17.8h, v21.8h, v20.8h\n"
+ "add v2.8h, v2.8h, v19.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v22.8h, v17.8h\n"
- "str q17, [%x[out_ptr], #0x10]\n"
- "zip2 v16.8h, v25.8h, v18.8h\n"
- "add v1.8h, v1.8h, v17.8h\n"
- "zip1 v17.8h, v19.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x20]\n"
- "zip2 v16.8h, v19.8h, v16.8h\n"
+ "add x20, x20, #0x1\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v22.8h, v31.8h, v27.8h\n"
+ "str q19, [%x[out_ptr], #0x10]\n"
+ "zip2 v21.8h, v29.8h, v25.8h\n"
+ "zip2 v20.8h, v30.8h, v26.8h\n"
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v19.8h, v28.8h, v24.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "add v1.8h, v1.8h, v17.8h\n"
- "zip2 v19.8h, v27.8h, v26.8h\n"
- "zip1 v17.8h, v23.8h, v19.8h\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip1 v16.8h, v21.8h, v17.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x40]\n"
- "zip2 v18.8h, v21.8h, v17.8h\n"
- "str q18, [%x[out_ptr], #0x50]\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v16.8h, v23.8h, v19.8h\n"
- "zip1 v17.8h, v20.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x60]\n"
- "add v1.8h, v1.8h, v18.8h\n"
- "zip2 v16.8h, v20.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x60]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x70]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
- "add v1.8h, v1.8h, v17.8h\n"
- "add v1.8h, v1.8h, v16.8h\n"
"bge 3b\n"
"5:" // Main loop skip
"cbz %x[width], 10f\n"
"tbz %x[width], #2, 7f\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s27, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
- "ldr s21, [x21], #0x4\n"
- "ldr s26, [x20], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
- "mov x19, #0x6\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v27.h }[2], [x24], #0x2\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
- "ld1 { v23.h }[2], [x22], #0x2\n"
- "ld1 { v21.h }[2], [x21], #0x2\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
+ "mov x20, #0x6\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v26.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v30.b }[6], [x27]\n"
- "mov x19, #0x7\n"
- "ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v27.b }[6], [x24]\n"
- "ld1 { v24.b }[6], [x23]\n"
- "ld1 { v23.b }[6], [x22]\n"
- "ld1 { v21.b }[6], [x21]\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x27]\n"
+ "mov x20, #0x7\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v26.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
"b 9f\n"
"6:" // odd_loads_1_4
- "mov x19, #0x4\n"
+ "mov x20, #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v30.b }[4], [x27]\n"
- "ld1 { v29.b }[4], [x26]\n"
- "mov x19, #0x5\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v27.b }[4], [x24]\n"
- "ld1 { v24.b }[4], [x23]\n"
- "ld1 { v23.b }[4], [x22]\n"
- "ld1 { v21.b }[4], [x21]\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x27]\n"
+ "mov x20, #0x5\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v26.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
"b 9f\n"
"7:" // odd_loads_2_0
"tbz %x[width], #1, 8f\n"
- "ldr h30, [x27], #0x2\n"
- "ldr h29, [x26], #0x2\n"
- "mov x19, #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h27, [x24], #0x2\n"
- "ldr h24, [x23], #0x2\n"
- "ldr h23, [x22], #0x2\n"
- "ldr h21, [x21], #0x2\n"
- "ldr h26, [x20], #0x2\n"
+ "ldr h30, [x28], #0x2\n"
+ "ldr h29, [x27], #0x2\n"
+ "mov x20, #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h27, [x25], #0x2\n"
+ "ldr h26, [x24], #0x2\n"
+ "ldr h25, [x23], #0x2\n"
+ "ldr h24, [x22], #0x2\n"
+ "ldr h23, [x21], #0x2\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v30.b }[2], [x27]\n"
- "mov x19, #0x3\n"
- "ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v27.b }[2], [x24]\n"
- "ld1 { v24.b }[2], [x23]\n"
- "ld1 { v23.b }[2], [x22]\n"
- "ld1 { v21.b }[2], [x21]\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x27]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v26.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
"b 9f\n"
"8:" // odd_loads_1_0
- "ldr b30, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr b29, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b27, [x24, #0x0]\n"
- "ldr b24, [x23, #0x0]\n"
- "ldr b23, [x22, #0x0]\n"
- "ldr b21, [x21, #0x0]\n"
- "ldr b26, [x20, #0x0]\n"
+ "ldr b30, [x28, #0x0]\n"
+ "ldr b29, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b27, [x25, #0x0]\n"
+ "ldr b26, [x24, #0x0]\n"
+ "ldr b25, [x23, #0x0]\n"
+ "ldr b24, [x22, #0x0]\n"
+ "ldr b23, [x21, #0x0]\n"
"9:" // Odd load end
"sshll v30.8h, v30.8b, #0x0\n"
- "subs x19, x19, #0x1\n"
"sshll v29.8h, v29.8b, #0x0\n"
+ "subs x20, x20, #0x1\n"
"sshll v28.8h, v28.8b, #0x0\n"
"sshll v27.8h, v27.8b, #0x0\n"
+ "sshll v26.8h, v26.8b, #0x0\n"
+ "sshll v25.8h, v25.8b, #0x0\n"
"sshll v24.8h, v24.8b, #0x0\n"
- "zip1 v20.8h, v30.8h, v24.8h\n"
"sshll v23.8h, v23.8b, #0x0\n"
- "zip1 v25.8h, v29.8h, v23.8h\n"
- "sshll v21.8h, v21.8b, #0x0\n"
- "zip1 v19.8h, v28.8h, v21.8h\n"
- "zip1 v22.8h, v20.8h, v19.8h\n"
- "sshll v26.8h, v26.8b, #0x0\n"
- "zip1 v18.8h, v27.8h, v26.8h\n"
- "zip1 v17.8h, v25.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v17.8h\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
- "zip2 v17.8h, v22.8h, v17.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v17.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v19.8h, v20.8h, v19.8h\n"
- "zip2 v16.8h, v25.8h, v18.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v17.8h, v19.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v17.8h\n"
"beq 10f\n"
- "zip2 v16.8h, v19.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v24.8h, v30.8h, v24.8h\n"
- "zip2 v20.8h, v28.8h, v21.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v21.8h, v24.8h, v20.8h\n"
- "zip2 v23.8h, v29.8h, v23.8h\n"
- "zip2 v19.8h, v27.8h, v26.8h\n"
- "zip1 v17.8h, v23.8h, v19.8h\n"
- "zip1 v16.8h, v21.8h, v17.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
- "zip2 v18.8h, v21.8h, v17.8h\n"
- "str q18, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v18.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v20.8h, v24.8h, v20.8h\n"
- "zip2 v16.8h, v23.8h, v19.8h\n"
- "zip1 v17.8h, v20.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v17.8h\n"
"10:" // Odds skip
- "saddw v0.4s, v0.4s, v1.4h\n"
- "str q0, [%x[out_ptr], #0x0]\n"
- "saddw2 v31.4s, v31.4s, v1.8h\n"
- "str q31, [%x[out_ptr], #0x10]\n"
+ "saddw v1.4s, v1.4s, v2.4h\n"
+ "saddw2 v0.4s, v0.4s, v2.8h\n"
+ "str q1, [%x[out_ptr], #0x0]\n"
+ "str q0, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
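
The `cmp x20, #0xe` / `ble 4f` guard in the summing variant above keeps the 16-bit lane sums from overflowing: each main-loop pass adds eight sign-extended bytes per lane, so 15 passes stay within 15 * 8 * 128 = 15360 < 32767, and the partials are widened into the 32-bit accumulators (saddw/saddw2) before the next batch. A sketch of the same bookkeeping, under those assumed semantics and with illustrative names:

#include <cstdint>

static void accumulate_rows(const int8_t* row[8], int width, int32_t total[8])
{
    int16_t partial[8] = {};               // v2.8h: 16-bit per-row partials
    int     passes     = 0;                // x20
    for (int c = 0; c + 8 <= width; c += 8) {
        if (passes > 14) {                 // cmp x20, #0xe; ble 4f
            for (int r = 0; r < 8; ++r) {  // saddw/saddw2, then movi v2, #0
                total[r] += partial[r];
                partial[r] = 0;
            }
            passes = 0;
        }
        for (int i = 0; i < 8; ++i)
            for (int r = 0; r < 8; ++r)
                partial[r] += int16_t(row[r][c + i]);  // sshll + add v2.8h
        ++passes;
    }
    for (int r = 0; r < 8; ++r)            // final saddw at "10: Odds skip"
        total[r] += partial[r];
}
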
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
index 24ece9a68e..c01d980f49 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,39 +31,40 @@ void interleave_block<8, 1, VLType::None, true>(
)
{
__asm__ __volatile__(
- "movi v1.8h, #0x0\n"
- "ldr x27, [%x[in], #0x0]\n"
- "mov x19, #0x0\n"
- "movi v0.4s, #0x0\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "movi v31.4s, #0x0\n"
- "ldr x25, [%x[in], #0x10]\n"
+ "mov x20, #0x0\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
+ "movi v2.8h, #0x0\n"
+ "movi v1.4s, #0x0\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
+ "movi v0.4s, #0x0\n"
+ "add x28, x28, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x27, x27, %x[row_offset], LSL #1\n"
- "ldr x24, [%x[in], #0x18]\n"
- "ldr x23, [%x[in], #0x20]\n"
"add x26, x26, %x[row_offset], LSL #1\n"
- "ldr x22, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #1\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset], LSL #1\n"
- "ldr x20, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #1\n"
"add x22, x22, %x[row_offset], LSL #1\n"
"add x21, x21, %x[row_offset], LSL #1\n"
- "add x20, x20, %x[row_offset], LSL #1\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
+ "prfm pldl1keep, [x28, #0x0]\n"
"prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
@@ -71,7 +72,7 @@ void interleave_block<8, 1, VLType::None, true>(
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -79,226 +80,225 @@ void interleave_block<8, 1, VLType::None, true>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"cbnz %w[first], 2f\n"
"sub %x[out_ptr], %x[out_ptr], #0x20\n"
- "ld1 { v0.4s }, [%x[out_ptr]]\n"
- "ldr q31, [%x[out_ptr], #0x10]\n"
+ "ld1 { v1.4s }, [%x[out_ptr]]\n"
+ "ldr q0, [%x[out_ptr], #0x10]\n"
"2:" // first_pass
"cmp %x[width], #0x8\n"
"blt 5f\n"
"3:" // Main loop head
- "cmp x19, #0xe\n"
+ "cmp x20, #0xe\n"
"ble 4f\n"
- "uaddw v0.4s, v0.4s, v1.4h\n"
- "uaddw2 v31.4s, v31.4s, v1.8h\n"
- "mov x19, #0x0\n"
- "movi v1.8h, #0x0\n"
+ "uaddw v1.4s, v1.4s, v2.4h\n"
+ "uaddw2 v0.4s, v0.4s, v2.8h\n"
+ "mov x20, #0x0\n"
+ "movi v2.8h, #0x0\n"
"4:" // no_accumulate_16
+ "ldr q31, [x28], #0x10\n"
"ldr q30, [x27], #0x10\n"
- "add x19, x19, #0x1\n"
- "ldr q29, [x26], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
- "ldr q28, [x25], #0x10\n"
"cmp %x[width], #0x8\n"
+ "ldr q29, [x26], #0x10\n"
+ "ldr q28, [x25], #0x10\n"
+ "add x20, x20, #0x1\n"
"ldr q27, [x24], #0x10\n"
- "ldr q25, [x23], #0x10\n"
- "zip1 v26.8h, v30.8h, v25.8h\n"
- "ldr q21, [x22], #0x10\n"
- "zip2 v25.8h, v30.8h, v25.8h\n"
- "ldr q24, [x21], #0x10\n"
- "ldr q23, [x20], #0x10\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
+ "ldr q26, [x23], #0x10\n"
+ "zip1 v25.8h, v31.8h, v27.8h\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "ldr q24, [x22], #0x10\n"
+ "ldr q23, [x21], #0x10\n"
+ "zip1 v18.8h, v29.8h, v24.8h\n"
+ "zip1 v21.8h, v28.8h, v23.8h\n"
+ "zip1 v17.8h, v25.8h, v18.8h\n"
+ "zip1 v16.8h, v22.8h, v21.8h\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "add v2.8h, v2.8h, v20.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "zip1 v20.8h, v28.8h, v24.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v18.8h, v26.8h, v20.8h\n"
+ "zip2 v19.8h, v17.8h, v16.8h\n"
+ "zip2 v18.8h, v25.8h, v18.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "zip1 v19.8h, v27.8h, v23.8h\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v16.8h, v22.8h, v19.8h\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "add v2.8h, v2.8h, v19.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip1 v17.8h, v18.8h, v16.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "add v1.8h, v1.8h, v17.8h\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip2 v16.8h, v18.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip2 v18.8h, v26.8h, v20.8h\n"
- "str q16, [%x[out_ptr], #0x10]\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v22.8h, v31.8h, v27.8h\n"
+ "str q20, [%x[out_ptr], #0x0]\n"
+ "zip2 v21.8h, v29.8h, v24.8h\n"
+ "zip2 v20.8h, v30.8h, v26.8h\n"
+ "str q19, [%x[out_ptr], #0x10]\n"
+ "zip2 v19.8h, v28.8h, v23.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x20]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "zip2 v20.8h, v28.8h, v24.8h\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip1 v18.8h, v25.8h, v20.8h\n"
- "zip2 v19.8h, v27.8h, v23.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x40]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
"str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v18.8h, v25.8h, v20.8h\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x60]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x70]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
- "add v1.8h, v1.8h, v16.8h\n"
"bge 3b\n"
"5:" // Main loop skip
"cbz %x[width], 10f\n"
"tbz %x[width], #2, 7f\n"
- "ldr d30, [x27], #0x8\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d27, [x24], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
+ "ldr d29, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d26, [x24], #0x8\n"
"ldr d25, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d24, [x21], #0x8\n"
- "ldr d23, [x20], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v30.s }[2], [x27], #0x4\n"
- "mov x19, #0x6\n"
- "ld1 { v29.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v27.s }[2], [x24], #0x4\n"
+ "ld1 { v30.s }[2], [x28], #0x4\n"
+ "ld1 { v29.s }[2], [x27], #0x4\n"
+ "mov x20, #0x6\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x24], #0x4\n"
"ld1 { v25.s }[2], [x23], #0x4\n"
- "ld1 { v21.s }[2], [x22], #0x4\n"
- "ld1 { v24.s }[2], [x21], #0x4\n"
- "ld1 { v23.s }[2], [x20], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v23.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v30.h }[6], [x27]\n"
- "mov x19, #0x7\n"
- "ld1 { v29.h }[6], [x26]\n"
- "ld1 { v28.h }[6], [x25]\n"
- "ld1 { v27.h }[6], [x24]\n"
+ "ld1 { v30.h }[6], [x28]\n"
+ "ld1 { v29.h }[6], [x27]\n"
+ "mov x20, #0x7\n"
+ "ld1 { v28.h }[6], [x26]\n"
+ "ld1 { v27.h }[6], [x25]\n"
+ "ld1 { v26.h }[6], [x24]\n"
"ld1 { v25.h }[6], [x23]\n"
- "ld1 { v21.h }[6], [x22]\n"
- "ld1 { v24.h }[6], [x21]\n"
- "ld1 { v23.h }[6], [x20]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "ld1 { v23.h }[6], [x21]\n"
"b 9f\n"
"6:" // odd_loads_1_4
- "mov x19, #0x4\n"
+ "mov x20, #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v30.h }[4], [x27]\n"
- "ld1 { v29.h }[4], [x26]\n"
- "mov x19, #0x5\n"
- "ld1 { v28.h }[4], [x25]\n"
- "ld1 { v27.h }[4], [x24]\n"
+ "ld1 { v30.h }[4], [x28]\n"
+ "ld1 { v29.h }[4], [x27]\n"
+ "mov x20, #0x5\n"
+ "ld1 { v28.h }[4], [x26]\n"
+ "ld1 { v27.h }[4], [x25]\n"
+ "ld1 { v26.h }[4], [x24]\n"
"ld1 { v25.h }[4], [x23]\n"
- "ld1 { v21.h }[4], [x22]\n"
- "ld1 { v24.h }[4], [x21]\n"
- "ld1 { v23.h }[4], [x20]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "ld1 { v23.h }[4], [x21]\n"
"b 9f\n"
"7:" // odd_loads_2_0
"tbz %x[width], #1, 8f\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "mov x19, #0x2\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s27, [x24], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
+ "mov x20, #0x2\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
"ldr s25, [x23], #0x4\n"
- "ldr s21, [x22], #0x4\n"
- "ldr s24, [x21], #0x4\n"
- "ldr s23, [x20], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v30.h }[2], [x27]\n"
- "mov x19, #0x3\n"
- "ld1 { v29.h }[2], [x26]\n"
- "ld1 { v28.h }[2], [x25]\n"
- "ld1 { v27.h }[2], [x24]\n"
+ "ld1 { v30.h }[2], [x28]\n"
+ "ld1 { v29.h }[2], [x27]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v28.h }[2], [x26]\n"
+ "ld1 { v27.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x24]\n"
"ld1 { v25.h }[2], [x23]\n"
- "ld1 { v21.h }[2], [x22]\n"
- "ld1 { v24.h }[2], [x21]\n"
- "ld1 { v23.h }[2], [x20]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v23.h }[2], [x21]\n"
"b 9f\n"
"8:" // odd_loads_1_0
- "ldr h30, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr h29, [x26, #0x0]\n"
- "ldr h28, [x25, #0x0]\n"
- "ldr h27, [x24, #0x0]\n"
+ "ldr h30, [x28, #0x0]\n"
+ "ldr h29, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr h28, [x26, #0x0]\n"
+ "ldr h27, [x25, #0x0]\n"
+ "ldr h26, [x24, #0x0]\n"
"ldr h25, [x23, #0x0]\n"
- "ldr h21, [x22, #0x0]\n"
- "ldr h24, [x21, #0x0]\n"
- "ldr h23, [x20, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h23, [x21, #0x0]\n"
"9:" // Odd load end
- "zip1 v26.8h, v30.8h, v25.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v20.8h, v28.8h, v24.8h\n"
- "zip1 v18.8h, v26.8h, v20.8h\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
"zip1 v19.8h, v27.8h, v23.8h\n"
- "zip1 v16.8h, v22.8h, v19.8h\n"
- "zip1 v17.8h, v18.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v17.8h\n"
"beq 10f\n"
- "zip2 v16.8h, v18.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v26.8h, v20.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "subs x20, x20, #0x1\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v25.8h, v30.8h, v25.8h\n"
- "zip2 v20.8h, v28.8h, v24.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v18.8h, v25.8h, v20.8h\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
"zip2 v19.8h, v27.8h, v23.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v25.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v16.8h\n"
"10:" // Odds skip
- "uaddw v0.4s, v0.4s, v1.4h\n"
- "str q0, [%x[out_ptr], #0x0]\n"
- "uaddw2 v31.4s, v31.4s, v1.8h\n"
- "str q31, [%x[out_ptr], #0x10]\n"
+ "uaddw v1.4s, v1.4s, v2.4h\n"
+ "uaddw2 v0.4s, v0.4s, v2.8h\n"
+ "str q1, [%x[out_ptr], #0x0]\n"
+ "str q0, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
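
The tail after `5: // Main loop skip` handles the last width % 8 columns without a scalar loop: the low bits of width are tested (tbz #2, #1, #0), so any remainder of 1-7 halfwords per row is covered by at most one 4-, one 2- and one 1-element load, with x20 left holding the remainder for the store countdown. A scalar sketch of the per-row gathering (illustrative names):

#include <cstdint>

// Gather the trailing width % 8 halfwords of one row into a zero-padded
// 8-lane buffer, largest chunk first, mirroring the tbz #2/#1/#0 chain.
static void load_tail(const uint16_t* p, int rem, uint16_t lane[8])
{
    for (int i = 0; i < 8; ++i) lane[i] = 0;
    int idx = 0;
    if (rem & 4) { for (int i = 0; i < 4; ++i) lane[idx++] = *p++; } // ldr d
    if (rem & 2) { for (int i = 0; i < 2; ++i) lane[idx++] = *p++; } // ld1 .s
    if (rem & 1) { lane[idx++] = *p++; }                             // ld1 .h
}
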
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
index 0db2f7fd51..d29a995b46 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,44 +31,45 @@ void interleave_block<8, 1, VLType::None, false>(
)
{
__asm__ __volatile__(
- "ldr x27, [%x[in], #0x0]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "add x28, x28, %x[row_offset]\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
"add x27, x27, %x[row_offset]\n"
- "ldr x25, [%x[in], #0x10]\n"
- "ldr x24, [%x[in], #0x18]\n"
"add x26, x26, %x[row_offset]\n"
- "ldr x23, [%x[in], #0x20]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset]\n"
- "ldr x22, [%x[in], #0x28]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset]\n"
- "ldr x20, [%x[in], #0x38]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
- "add x20, x20, %x[row_offset]\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "prfm pldl1keep, [x27, #0x0]\n"
"cmp %x[width], #0x8\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
"prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -76,209 +77,207 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr d31, [x27], #0x8\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr d30, [x26], #0x8\n"
+ "ldr d25, [x28], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "zip1 v23.8h, v25.8h, v21.8h\n"
+ "zip1 v22.8h, v26.8h, v17.8h\n"
"subs %x[width], %x[width], #0x8\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr d29, [x25], #0x8\n"
"cmp %x[width], #0x8\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr d28, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "ldr d23, [x22], #0x8\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "ldr d27, [x21], #0x8\n"
- "zip1 v20.8h, v31.8h, v25.8h\n"
- "ldr d26, [x20], #0x8\n"
- "zip2 v25.8h, v31.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v20.8h\n"
+ "zip1 v18.8h, v24.8h, v16.8h\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
+ "zip2 v25.8h, v25.8h, v21.8h\n"
+ "zip2 v21.8h, v26.8h, v17.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v24.8h, v30.8h, v23.8h\n"
- "zip2 v23.8h, v30.8h, v23.8h\n"
+ "zip2 v20.8h, v27.8h, v20.8h\n"
+ "zip2 v16.8h, v24.8h, v16.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v19.8h, v29.8h, v27.8h\n"
+ "zip1 v24.8h, v23.8h, v22.8h\n"
+ "zip1 v17.8h, v19.8h, v18.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip1 v22.8h, v20.8h, v19.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip2 v21.8h, v20.8h, v19.8h\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip2 v19.8h, v29.8h, v27.8h\n"
- "zip1 v20.8h, v25.8h, v19.8h\n"
- "zip2 v19.8h, v25.8h, v19.8h\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "zip1 v18.8h, v28.8h, v26.8h\n"
- "zip1 v17.8h, v24.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v17.8h\n"
+ "zip2 v23.8h, v23.8h, v22.8h\n"
+ "zip2 v19.8h, v19.8h, v18.8h\n"
+ "zip1 v22.8h, v25.8h, v21.8h\n"
+ "zip1 v18.8h, v20.8h, v16.8h\n"
+ "zip2 v21.8h, v25.8h, v21.8h\n"
+ "zip2 v20.8h, v20.8h, v16.8h\n"
+ "zip1 v16.8h, v24.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v16.8h, v22.8h, v17.8h\n"
+ "zip2 v16.8h, v24.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v17.8h, v24.8h, v18.8h\n"
- "zip1 v16.8h, v21.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x20]\n"
- "zip2 v16.8h, v21.8h, v17.8h\n"
+ "zip1 v17.8h, v23.8h, v19.8h\n"
+ "zip2 v16.8h, v23.8h, v19.8h\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip1 v19.8h, v22.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v18.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "zip2 v18.8h, v28.8h, v26.8h\n"
- "zip1 v17.8h, v23.8h, v18.8h\n"
- "zip1 v16.8h, v20.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x40]\n"
- "zip2 v16.8h, v20.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v17.8h, v23.8h, v18.8h\n"
- "zip1 v16.8h, v19.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x60]\n"
- "zip2 v16.8h, v19.8h, v17.8h\n"
+ "zip1 v17.8h, v21.8h, v20.8h\n"
+ "zip2 v16.8h, v21.8h, v20.8h\n"
+ "str q19, [%x[out_ptr], #0x40]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr s31, [x27], #0x4\n"
- "ldr s30, [x26], #0x4\n"
- "ldr s29, [x25], #0x4\n"
- "ldr s28, [x24], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
"ldr s25, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
- "ldr s27, [x21], #0x4\n"
- "ldr s26, [x20], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "mov x19, #0x6\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "ld1 { v29.h }[2], [x25], #0x2\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
+ "mov x20, #0x6\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v26.h }[2], [x24], #0x2\n"
"ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v23.h }[2], [x22], #0x2\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v31.b }[6], [x27]\n"
- "mov x19, #0x7\n"
- "ld1 { v30.b }[6], [x26]\n"
- "ld1 { v29.b }[6], [x25]\n"
- "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x27]\n"
+ "mov x20, #0x7\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v26.b }[6], [x24]\n"
"ld1 { v25.b }[6], [x23]\n"
- "ld1 { v23.b }[6], [x22]\n"
- "ld1 { v27.b }[6], [x21]\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
- "mov x19, #0x4\n"
+ "mov x20, #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v31.b }[4], [x27]\n"
- "ld1 { v30.b }[4], [x26]\n"
- "mov x19, #0x5\n"
- "ld1 { v29.b }[4], [x25]\n"
- "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x27]\n"
+ "mov x20, #0x5\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v26.b }[4], [x24]\n"
"ld1 { v25.b }[4], [x23]\n"
- "ld1 { v23.b }[4], [x22]\n"
- "ld1 { v27.b }[4], [x21]\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr h31, [x27], #0x2\n"
- "ldr h30, [x26], #0x2\n"
- "mov x19, #0x2\n"
- "ldr h29, [x25], #0x2\n"
- "ldr h28, [x24], #0x2\n"
+ "ldr h30, [x28], #0x2\n"
+ "ldr h29, [x27], #0x2\n"
+ "mov x20, #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h27, [x25], #0x2\n"
+ "ldr h26, [x24], #0x2\n"
"ldr h25, [x23], #0x2\n"
- "ldr h23, [x22], #0x2\n"
- "ldr h27, [x21], #0x2\n"
- "ldr h26, [x20], #0x2\n"
+ "ldr h24, [x22], #0x2\n"
+ "ldr h23, [x21], #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v31.b }[2], [x27]\n"
- "mov x19, #0x3\n"
- "ld1 { v30.b }[2], [x26]\n"
- "ld1 { v29.b }[2], [x25]\n"
- "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x27]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v26.b }[2], [x24]\n"
"ld1 { v25.b }[2], [x23]\n"
- "ld1 { v23.b }[2], [x22]\n"
- "ld1 { v27.b }[2], [x21]\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr b31, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr b30, [x26, #0x0]\n"
- "ldr b29, [x25, #0x0]\n"
- "ldr b28, [x24, #0x0]\n"
+ "ldr b30, [x28, #0x0]\n"
+ "ldr b29, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b27, [x25, #0x0]\n"
+ "ldr b26, [x24, #0x0]\n"
"ldr b25, [x23, #0x0]\n"
- "ldr b23, [x22, #0x0]\n"
- "ldr b27, [x21, #0x0]\n"
- "ldr b26, [x20, #0x0]\n"
+ "ldr b24, [x22, #0x0]\n"
+ "ldr b23, [x21, #0x0]\n"
"7:" // Odd load end
- "ushll v31.8h, v31.8b, #0x0\n"
- "subs x19, x19, #0x1\n"
"ushll v30.8h, v30.8b, #0x0\n"
"ushll v29.8h, v29.8b, #0x0\n"
+ "subs x20, x20, #0x1\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "zip1 v20.8h, v31.8h, v25.8h\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "zip1 v24.8h, v30.8h, v23.8h\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "zip1 v19.8h, v29.8h, v27.8h\n"
- "zip1 v22.8h, v20.8h, v19.8h\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "zip1 v18.8h, v28.8h, v26.8h\n"
- "zip1 v17.8h, v24.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v17.8h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v22.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v21.8h, v20.8h, v19.8h\n"
- "zip2 v17.8h, v24.8h, v18.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v16.8h, v21.8h, v17.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v21.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v25.8h, v31.8h, v25.8h\n"
- "zip2 v19.8h, v29.8h, v27.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v20.8h, v25.8h, v19.8h\n"
- "zip2 v23.8h, v30.8h, v23.8h\n"
- "zip2 v18.8h, v28.8h, v26.8h\n"
- "zip1 v17.8h, v23.8h, v18.8h\n"
- "zip1 v16.8h, v20.8h, v17.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v20.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v19.8h, v25.8h, v19.8h\n"
- "zip2 v17.8h, v23.8h, v18.8h\n"
- "zip1 v16.8h, v19.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"8:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
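
Reviewer note: across these hunks the generated kernel is renumbered so the eight row pointers occupy x21-x28 (base row in x28) and the odd-tail counter moves from x19 to x20; x19 and v31 consequently drop out of the clobber list above. The zip1/zip2 ladder is reshuffled but still performs the same 8x8 u8-to-u16 transpose. As a reading aid, here is a hypothetical scalar model of the layout this kernel emits; the function name and the row-0 padding for heights below 8 (inferred from the csel fallback that points unused row pointers at x28) are illustrative assumptions, not part of the patch.

#include <cstddef>
#include <cstdint>

// Hypothetical scalar model of interleave_block<8, 1, VLType::None, false>:
// for each input column, emit the eight row values widened from u8 to u16.
static void interleave8_block1_u8_u16_ref(uint16_t *&out_ptr,
                                          const uint8_t *const *in,
                                          std::size_t width, std::size_t height,
                                          std::size_t row_offset)
{
    for (std::size_t col = 0; col < width; ++col) {
        for (std::size_t row = 0; row < 8; ++row) {
            // Rows past 'height' re-read row 0, mirroring the csel fallback.
            const uint8_t *src = in[row < height ? row : 0];
            *out_ptr++ = static_cast<uint16_t>(src[row_offset + col]);
        }
    }
}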
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
index 7c7d774a6b..ae4bf9bf3b 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,39 +31,40 @@ void interleave_block<8, 1, VLType::None, true>(
)
{
__asm__ __volatile__(
- "movi v1.8h, #0x0\n"
- "ldr x27, [%x[in], #0x0]\n"
- "mov x19, #0x0\n"
- "movi v0.4s, #0x0\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "movi v31.4s, #0x0\n"
- "ldr x25, [%x[in], #0x10]\n"
+ "mov x20, #0x0\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
+ "movi v2.8h, #0x0\n"
+ "movi v1.4s, #0x0\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
+ "movi v0.4s, #0x0\n"
+ "add x28, x28, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x27, x27, %x[row_offset]\n"
- "ldr x24, [%x[in], #0x18]\n"
- "ldr x23, [%x[in], #0x20]\n"
"add x26, x26, %x[row_offset]\n"
- "ldr x22, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset]\n"
- "ldr x20, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
- "add x20, x20, %x[row_offset]\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
+ "prfm pldl1keep, [x28, #0x0]\n"
"prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
@@ -71,7 +72,7 @@ void interleave_block<8, 1, VLType::None, true>(
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -79,242 +80,241 @@ void interleave_block<8, 1, VLType::None, true>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"cbnz %w[first], 2f\n"
"sub %x[out_ptr], %x[out_ptr], #0x20\n"
- "ld1 { v0.4s }, [%x[out_ptr]]\n"
- "ldr q31, [%x[out_ptr], #0x10]\n"
+ "ld1 { v1.4s }, [%x[out_ptr]]\n"
+ "ldr q0, [%x[out_ptr], #0x10]\n"
"2:" // first_pass
"cmp %x[width], #0x8\n"
"blt 5f\n"
"3:" // Main loop head
- "cmp x19, #0xe\n"
+ "cmp x20, #0xe\n"
"ble 4f\n"
- "uaddw v0.4s, v0.4s, v1.4h\n"
- "uaddw2 v31.4s, v31.4s, v1.8h\n"
- "mov x19, #0x0\n"
- "movi v1.8h, #0x0\n"
+ "uaddw v1.4s, v1.4s, v2.4h\n"
+ "uaddw2 v0.4s, v0.4s, v2.8h\n"
+ "mov x20, #0x0\n"
+ "movi v2.8h, #0x0\n"
"4:" // no_accumulate_16
+ "ldr d31, [x28], #0x8\n"
"ldr d30, [x27], #0x8\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
"ushll v30.8h, v30.8b, #0x0\n"
"ldr d29, [x26], #0x8\n"
- "add x19, x19, #0x1\n"
- "ushll v29.8h, v29.8b, #0x0\n"
"ldr d28, [x25], #0x8\n"
- "subs %x[width], %x[width], #0x8\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
"ushll v28.8h, v28.8b, #0x0\n"
"ldr d27, [x24], #0x8\n"
- "cmp %x[width], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "ldr d21, [x21], #0x8\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "ldr d26, [x20], #0x8\n"
- "zip1 v20.8h, v30.8h, v24.8h\n"
+ "zip1 v23.8h, v31.8h, v27.8h\n"
+ "zip1 v22.8h, v29.8h, v25.8h\n"
+ "subs %x[width], %x[width], #0x8\n"
+ "cmp %x[width], #0x8\n"
+ "zip1 v21.8h, v30.8h, v26.8h\n"
+ "zip1 v20.8h, v28.8h, v24.8h\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip1 v25.8h, v29.8h, v23.8h\n"
+ "zip1 v18.8h, v23.8h, v22.8h\n"
+ "zip1 v17.8h, v21.8h, v20.8h\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "zip2 v24.8h, v30.8h, v24.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip2 v23.8h, v29.8h, v23.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ushll v21.8h, v21.8b, #0x0\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v19.8h, v28.8h, v21.8h\n"
+ "zip2 v19.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v23.8h, v22.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip1 v22.8h, v20.8h, v19.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip2 v19.8h, v20.8h, v19.8h\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip2 v20.8h, v28.8h, v21.8h\n"
- "zip1 v21.8h, v24.8h, v20.8h\n"
- "zip2 v20.8h, v24.8h, v20.8h\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "zip1 v18.8h, v27.8h, v26.8h\n"
- "zip1 v17.8h, v25.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v17.8h\n"
+ "zip2 v17.8h, v21.8h, v20.8h\n"
+ "add v2.8h, v2.8h, v19.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v22.8h, v17.8h\n"
- "str q17, [%x[out_ptr], #0x10]\n"
- "zip2 v16.8h, v25.8h, v18.8h\n"
- "add v1.8h, v1.8h, v17.8h\n"
- "zip1 v17.8h, v19.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x20]\n"
- "zip2 v16.8h, v19.8h, v16.8h\n"
+ "add x20, x20, #0x1\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v22.8h, v31.8h, v27.8h\n"
+ "str q19, [%x[out_ptr], #0x10]\n"
+ "zip2 v21.8h, v29.8h, v25.8h\n"
+ "zip2 v20.8h, v30.8h, v26.8h\n"
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v19.8h, v28.8h, v24.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "add v1.8h, v1.8h, v17.8h\n"
- "zip2 v19.8h, v27.8h, v26.8h\n"
- "zip1 v17.8h, v23.8h, v19.8h\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip1 v16.8h, v21.8h, v17.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x40]\n"
- "zip2 v18.8h, v21.8h, v17.8h\n"
- "str q18, [%x[out_ptr], #0x50]\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v16.8h, v23.8h, v19.8h\n"
- "zip1 v17.8h, v20.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x60]\n"
- "add v1.8h, v1.8h, v18.8h\n"
- "zip2 v16.8h, v20.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x60]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x70]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
- "add v1.8h, v1.8h, v17.8h\n"
- "add v1.8h, v1.8h, v16.8h\n"
"bge 3b\n"
"5:" // Main loop skip
"cbz %x[width], 10f\n"
"tbz %x[width], #2, 7f\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s27, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
- "ldr s21, [x21], #0x4\n"
- "ldr s26, [x20], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
- "mov x19, #0x6\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v27.h }[2], [x24], #0x2\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
- "ld1 { v23.h }[2], [x22], #0x2\n"
- "ld1 { v21.h }[2], [x21], #0x2\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
+ "mov x20, #0x6\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v26.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v30.b }[6], [x27]\n"
- "mov x19, #0x7\n"
- "ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v27.b }[6], [x24]\n"
- "ld1 { v24.b }[6], [x23]\n"
- "ld1 { v23.b }[6], [x22]\n"
- "ld1 { v21.b }[6], [x21]\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x27]\n"
+ "mov x20, #0x7\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v26.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
"b 9f\n"
"6:" // odd_loads_1_4
- "mov x19, #0x4\n"
+ "mov x20, #0x4\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v30.b }[4], [x27]\n"
- "ld1 { v29.b }[4], [x26]\n"
- "mov x19, #0x5\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v27.b }[4], [x24]\n"
- "ld1 { v24.b }[4], [x23]\n"
- "ld1 { v23.b }[4], [x22]\n"
- "ld1 { v21.b }[4], [x21]\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x27]\n"
+ "mov x20, #0x5\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v26.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
"b 9f\n"
"7:" // odd_loads_2_0
"tbz %x[width], #1, 8f\n"
- "ldr h30, [x27], #0x2\n"
- "ldr h29, [x26], #0x2\n"
- "mov x19, #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h27, [x24], #0x2\n"
- "ldr h24, [x23], #0x2\n"
- "ldr h23, [x22], #0x2\n"
- "ldr h21, [x21], #0x2\n"
- "ldr h26, [x20], #0x2\n"
+ "ldr h30, [x28], #0x2\n"
+ "ldr h29, [x27], #0x2\n"
+ "mov x20, #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h27, [x25], #0x2\n"
+ "ldr h26, [x24], #0x2\n"
+ "ldr h25, [x23], #0x2\n"
+ "ldr h24, [x22], #0x2\n"
+ "ldr h23, [x21], #0x2\n"
"tbz %x[width], #0, 9f\n"
- "ld1 { v30.b }[2], [x27]\n"
- "mov x19, #0x3\n"
- "ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v27.b }[2], [x24]\n"
- "ld1 { v24.b }[2], [x23]\n"
- "ld1 { v23.b }[2], [x22]\n"
- "ld1 { v21.b }[2], [x21]\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x27]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v26.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
"b 9f\n"
"8:" // odd_loads_1_0
- "ldr b30, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr b29, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b27, [x24, #0x0]\n"
- "ldr b24, [x23, #0x0]\n"
- "ldr b23, [x22, #0x0]\n"
- "ldr b21, [x21, #0x0]\n"
- "ldr b26, [x20, #0x0]\n"
+ "ldr b30, [x28, #0x0]\n"
+ "ldr b29, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b27, [x25, #0x0]\n"
+ "ldr b26, [x24, #0x0]\n"
+ "ldr b25, [x23, #0x0]\n"
+ "ldr b24, [x22, #0x0]\n"
+ "ldr b23, [x21, #0x0]\n"
"9:" // Odd load end
"ushll v30.8h, v30.8b, #0x0\n"
- "subs x19, x19, #0x1\n"
"ushll v29.8h, v29.8b, #0x0\n"
+ "subs x20, x20, #0x1\n"
"ushll v28.8h, v28.8b, #0x0\n"
"ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "zip1 v20.8h, v30.8h, v24.8h\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "zip1 v25.8h, v29.8h, v23.8h\n"
- "ushll v21.8h, v21.8b, #0x0\n"
- "zip1 v19.8h, v28.8h, v21.8h\n"
- "zip1 v22.8h, v20.8h, v19.8h\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "zip1 v18.8h, v27.8h, v26.8h\n"
- "zip1 v17.8h, v25.8h, v18.8h\n"
- "zip1 v16.8h, v22.8h, v17.8h\n"
+ "zip1 v22.8h, v30.8h, v26.8h\n"
+ "zip1 v21.8h, v28.8h, v24.8h\n"
+ "zip1 v20.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
- "zip2 v17.8h, v22.8h, v17.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v17.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v19.8h, v20.8h, v19.8h\n"
- "zip2 v16.8h, v25.8h, v18.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v17.8h, v19.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v18.8h, v22.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v17.8h\n"
"beq 10f\n"
- "zip2 v16.8h, v19.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v24.8h, v30.8h, v24.8h\n"
- "zip2 v20.8h, v28.8h, v21.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v21.8h, v24.8h, v20.8h\n"
- "zip2 v23.8h, v29.8h, v23.8h\n"
- "zip2 v19.8h, v27.8h, v26.8h\n"
- "zip1 v17.8h, v23.8h, v19.8h\n"
- "zip1 v16.8h, v21.8h, v17.8h\n"
+ "zip2 v22.8h, v30.8h, v26.8h\n"
+ "zip2 v21.8h, v28.8h, v24.8h\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v18.8h, v22.8h, v21.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
- "zip2 v18.8h, v21.8h, v17.8h\n"
- "str q18, [%x[out_ptr], #0x0]\n"
- "subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v18.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x20, x20, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v20.8h, v24.8h, v20.8h\n"
- "zip2 v16.8h, v23.8h, v19.8h\n"
- "zip1 v17.8h, v20.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v2.8h, v2.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- "add v1.8h, v1.8h, v17.8h\n"
"10:" // Odds skip
- "uaddw v0.4s, v0.4s, v1.4h\n"
- "str q0, [%x[out_ptr], #0x0]\n"
- "uaddw2 v31.4s, v31.4s, v1.8h\n"
- "str q31, [%x[out_ptr], #0x10]\n"
+ "uaddw v1.4s, v1.4s, v2.4h\n"
+ "uaddw2 v0.4s, v0.4s, v2.8h\n"
+ "str q1, [%x[out_ptr], #0x0]\n"
+ "str q0, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
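
Reviewer note: the summing variant receives the same renumbering plus renamed accumulators: the 16-bit partial sums move from v1 to v2 and the 32-bit row sums from v0/v31 to v1/v0. The "cmp x20, #0xe" guard still widens the partials with uaddw/uaddw2 before the 16-bit lanes can overflow. Below is a hypothetical scalar model of the contract; the rewind-and-reload behaviour for first == false is inferred from the "sub %x[out_ptr], %x[out_ptr], #0x20" followed by the q1/q0 reload, and the helper name is an assumption.

#include <cstddef>
#include <cstdint>

// Hypothetical scalar model of the summing variant: same interleaved layout,
// followed by eight per-row u32 sums appended at the end of the block.
static void interleave8_block1_u8_u16_sum_ref(uint16_t *&out_ptr,
                                              const uint8_t *const *in,
                                              std::size_t width, std::size_t height,
                                              std::size_t row_offset, bool first)
{
    uint32_t sums[8] = {};
    if (!first) {
        out_ptr -= 16; // step back over the 8 x u32 sums (32 bytes) stored last call
        const uint32_t *prev = reinterpret_cast<const uint32_t *>(out_ptr);
        for (int r = 0; r < 8; ++r) {
            sums[r] = prev[r];
        }
    }
    for (std::size_t col = 0; col < width; ++col) {
        for (std::size_t row = 0; row < 8; ++row) {
            const uint8_t *src = in[row < height ? row : 0];
            const uint16_t v   = src[row_offset + col];
            sums[row] += v;    // the asm batches this in v2.8h, flushing into
            *out_ptr++  = v;   // v1.4s/v0.4s before the u16 lanes can overflow
        }
    }
    uint32_t *out32 = reinterpret_cast<uint32_t *>(out_ptr);
    for (int r = 0; r < 8; ++r) {
        out32[r] = sums[r];
    }
    out_ptr += 16; // the 8 x u32 sums occupy 16 u16 slots
}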
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
index 1e5d395667..43d9d20c10 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,44 +31,45 @@ void interleave_block<8, 2, VLType::None, false>(
)
{
__asm__ __volatile__(
- "ldr x27, [%x[in], #0x0]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "add x28, x28, %x[row_offset], LSL #1\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
"add x27, x27, %x[row_offset], LSL #1\n"
- "ldr x25, [%x[in], #0x10]\n"
- "ldr x24, [%x[in], #0x18]\n"
"add x26, x26, %x[row_offset], LSL #1\n"
- "ldr x23, [%x[in], #0x20]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #1\n"
- "ldr x22, [%x[in], #0x28]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset], LSL #1\n"
- "ldr x20, [%x[in], #0x38]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #1\n"
"add x22, x22, %x[row_offset], LSL #1\n"
"add x21, x21, %x[row_offset], LSL #1\n"
- "add x20, x20, %x[row_offset], LSL #1\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "prfm pldl1keep, [x27, #0x0]\n"
"cmp %x[width], #0x8\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
"prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -76,49 +77,48 @@ void interleave_block<8, 2, VLType::None, false>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q28, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q18, [x27], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
- "ldr q29, [x26], #0x10\n"
"cmp %x[width], #0x8\n"
- "ldr q25, [x25], #0x10\n"
- "zip1 v22.4s, v28.4s, v25.4s\n"
- "ldr q21, [x24], #0x10\n"
- "zip2 v28.4s, v28.4s, v25.4s\n"
- "ldr q27, [x23], #0x10\n"
- "ldr q26, [x22], #0x10\n"
- "zip1 v20.4s, v29.4s, v21.4s\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v25.4s, v29.4s, v21.4s\n"
- "ldr q24, [x20], #0x10\n"
- "zip1 v23.4s, v22.4s, v20.4s\n"
+ "ldr q17, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v25.4s, v20.4s, v17.4s\n"
+ "zip1 v24.4s, v18.4s, v16.4s\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q23, [x23], #0x10\n"
+ "zip2 v22.4s, v20.4s, v17.4s\n"
+ "zip2 v21.4s, v18.4s, v16.4s\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "zip1 v20.4s, v19.4s, v18.4s\n"
+ "zip1 v17.4s, v23.4s, v16.4s\n"
+ "zip2 v19.4s, v19.4s, v18.4s\n"
+ "zip2 v18.4s, v23.4s, v16.4s\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v22.4s, v22.4s, v20.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "zip1 v21.4s, v28.4s, v25.4s\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v18.4s, v27.4s, v19.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "zip1 v16.4s, v26.4s, v24.4s\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v20.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip2 v20.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip2 v19.4s, v27.4s, v19.4s\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip2 v16.4s, v26.4s, v24.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v19.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x10]\n"
- "zip2 v17.4s, v28.4s, v25.4s\n"
- "str q22, [%x[out_ptr], #0x20]\n"
- "zip2 v16.4s, v19.4s, v16.4s\n"
- "str q20, [%x[out_ptr], #0x30]\n"
- "str q21, [%x[out_ptr], #0x40]\n"
- "str q18, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.4s, v20.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip1 v16.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x40]\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "zip2 v17.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
"str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
@@ -126,120 +126,119 @@ void interleave_block<8, 2, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr d28, [x27], #0x8\n"
- "ldr d29, [x26], #0x8\n"
+ "ldr d28, [x28], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
"ldr d25, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d26, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
- "ldr d24, [x20], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v28.s }[2], [x27], #0x4\n"
- "mov x19, #0x3\n"
- "ld1 { v29.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x28], #0x4\n"
+ "ld1 { v27.s }[2], [x27], #0x4\n"
+ "mov x20, #0x3\n"
+ "ld1 { v26.s }[2], [x26], #0x4\n"
"ld1 { v25.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
- "ld1 { v26.s }[2], [x22], #0x4\n"
- "ld1 { v19.s }[2], [x21], #0x4\n"
- "ld1 { v24.s }[2], [x20], #0x4\n"
+ "ld1 { v24.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v21.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v28.h }[6], [x27]\n"
- "mov x19, #0x4\n"
- "ld1 { v29.h }[6], [x26]\n"
+ "ld1 { v28.h }[6], [x28]\n"
+ "ld1 { v27.h }[6], [x27]\n"
+ "mov x20, #0x4\n"
+ "ld1 { v26.h }[6], [x26]\n"
"ld1 { v25.h }[6], [x25]\n"
- "ld1 { v21.h }[6], [x24]\n"
- "ld1 { v27.h }[6], [x23]\n"
- "ld1 { v26.h }[6], [x22]\n"
- "ld1 { v19.h }[6], [x21]\n"
- "ld1 { v24.h }[6], [x20]\n"
+ "ld1 { v24.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v22.h }[6], [x22]\n"
+ "ld1 { v21.h }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
- "mov x19, #0x2\n"
+ "mov x20, #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v28.h }[4], [x27]\n"
- "ld1 { v29.h }[4], [x26]\n"
- "mov x19, #0x3\n"
+ "ld1 { v28.h }[4], [x28]\n"
+ "ld1 { v27.h }[4], [x27]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v26.h }[4], [x26]\n"
"ld1 { v25.h }[4], [x25]\n"
- "ld1 { v21.h }[4], [x24]\n"
- "ld1 { v27.h }[4], [x23]\n"
- "ld1 { v26.h }[4], [x22]\n"
- "ld1 { v19.h }[4], [x21]\n"
- "ld1 { v24.h }[4], [x20]\n"
+ "ld1 { v24.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v22.h }[4], [x22]\n"
+ "ld1 { v21.h }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr s28, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "mov x19, #0x1\n"
+ "ldr s28, [x28], #0x4\n"
+ "ldr s27, [x27], #0x4\n"
+ "mov x20, #0x1\n"
+ "ldr s26, [x26], #0x4\n"
"ldr s25, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
- "ldr s26, [x22], #0x4\n"
- "ldr s19, [x21], #0x4\n"
- "ldr s24, [x20], #0x4\n"
+ "ldr s24, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "ldr s21, [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v28.h }[2], [x27]\n"
- "mov x19, #0x2\n"
- "ld1 { v29.h }[2], [x26]\n"
+ "ld1 { v28.h }[2], [x28]\n"
+ "ld1 { v27.h }[2], [x27]\n"
+ "mov x20, #0x2\n"
+ "ld1 { v26.h }[2], [x26]\n"
"ld1 { v25.h }[2], [x25]\n"
- "ld1 { v21.h }[2], [x24]\n"
- "ld1 { v27.h }[2], [x23]\n"
- "ld1 { v26.h }[2], [x22]\n"
- "ld1 { v19.h }[2], [x21]\n"
- "ld1 { v24.h }[2], [x20]\n"
+ "ld1 { v24.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v22.h }[2], [x22]\n"
+ "ld1 { v21.h }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr h28, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr h29, [x26, #0x0]\n"
+ "ldr h28, [x28, #0x0]\n"
+ "ldr h27, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr h26, [x26, #0x0]\n"
"ldr h25, [x25, #0x0]\n"
- "ldr h21, [x24, #0x0]\n"
- "ldr h27, [x23, #0x0]\n"
- "ldr h26, [x22, #0x0]\n"
- "ldr h19, [x21, #0x0]\n"
- "ldr h24, [x20, #0x0]\n"
+ "ldr h24, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h22, [x22, #0x0]\n"
+ "ldr h21, [x21, #0x0]\n"
"7:" // Odd load end
- "zip1 v22.4s, v28.4s, v25.4s\n"
- "subs x19, x19, #0x1\n"
- "zip1 v20.4s, v29.4s, v21.4s\n"
- "zip1 v23.4s, v22.4s, v20.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v27.4s, v19.4s\n"
- "zip1 v16.4s, v26.4s, v24.4s\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x10]\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 8f\n"
- "zip2 v22.4s, v22.4s, v20.4s\n"
- "str q22, [%x[out_ptr], #0x0]\n"
- "zip2 v20.4s, v18.4s, v16.4s\n"
- "subs x19, x19, #0x1\n"
- "str q20, [%x[out_ptr], #0x10]\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 8f\n"
- "zip2 v28.4s, v28.4s, v25.4s\n"
- "zip2 v25.4s, v29.4s, v21.4s\n"
- "subs x19, x19, #0x1\n"
- "zip1 v21.4s, v28.4s, v25.4s\n"
- "str q21, [%x[out_ptr], #0x0]\n"
- "zip2 v19.4s, v27.4s, v19.4s\n"
- "zip2 v16.4s, v26.4s, v24.4s\n"
- "zip1 v18.4s, v19.4s, v16.4s\n"
- "str q18, [%x[out_ptr], #0x10]\n"
+ "zip2 v20.4s, v28.4s, v26.4s\n"
+ "zip2 v19.4s, v27.4s, v25.4s\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 8f\n"
- "zip2 v17.4s, v28.4s, v25.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip2 v16.4s, v19.4s, v16.4s\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"8:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
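
Reviewer note: the block-2 kernels (bf16 here, fp32 in the next file) follow the same renumbering pattern. With a block width of 2, adjacent column pairs stay contiguous per row, which is why these kernels zip 32-bit lanes (.4s, two bf16 each) or 64-bit lanes (.2d, two fp32 each) rather than single elements. A hypothetical reference for that layout follows; the zero padding of a trailing odd column is a readability simplification, since the real kernel simply leaves the unloaded lane contents in place.

#include <cstddef>

// Hypothetical scalar model of interleave_block<8, 2, VLType::None, false>:
// pairs of adjacent columns travel together within each row's slot.
template <typename T>
static void interleave8_block2_ref(T *&out_ptr, const T *const *in,
                                   std::size_t width, std::size_t height,
                                   std::size_t row_offset)
{
    for (std::size_t col = 0; col < width; col += 2) {
        for (std::size_t row = 0; row < 8; ++row) {
            const T *src = in[row < height ? row : 0];
            *out_ptr++ = src[row_offset + col];
            // Trailing odd column: pad with a default value for clarity only.
            *out_ptr++ = (col + 1 < width) ? src[row_offset + col + 1] : T{};
        }
    }
}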
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
index 064207c0fa..3ec03370a0 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,44 +31,45 @@ void interleave_block<8, 2, VLType::None, false>(
)
{
__asm__ __volatile__(
- "ldr x27, [%x[in], #0x0]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "add x28, x28, %x[row_offset], LSL #2\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
"add x27, x27, %x[row_offset], LSL #2\n"
- "ldr x25, [%x[in], #0x10]\n"
- "ldr x24, [%x[in], #0x18]\n"
"add x26, x26, %x[row_offset], LSL #2\n"
- "ldr x23, [%x[in], #0x20]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #2\n"
- "ldr x22, [%x[in], #0x28]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset], LSL #2\n"
- "ldr x20, [%x[in], #0x38]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #2\n"
"add x22, x22, %x[row_offset], LSL #2\n"
"add x21, x21, %x[row_offset], LSL #2\n"
- "add x20, x20, %x[row_offset], LSL #2\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "prfm pldl1keep, [x27, #0x0]\n"
"cmp %x[width], #0x4\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
"prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -76,104 +77,102 @@ void interleave_block<8, 2, VLType::None, false>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q27, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q19, [x27], #0x10\n"
"subs %x[width], %x[width], #0x4\n"
- "ldr q24, [x26], #0x10\n"
- "zip1 v26.2d, v27.2d, v24.2d\n"
- "ldr q25, [x25], #0x10\n"
"cmp %x[width], #0x4\n"
- "zip2 v24.2d, v27.2d, v24.2d\n"
- "ldr q21, [x24], #0x10\n"
- "ldr q23, [x23], #0x10\n"
- "zip1 v22.2d, v25.2d, v21.2d\n"
- "ldr q18, [x22], #0x10\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
- "ldr q20, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
- "zip1 v19.2d, v23.2d, v18.2d\n"
+ "ldr q25, [x26], #0x10\n"
+ "ldr q24, [x25], #0x10\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
+ "zip1 v18.2d, v25.2d, v24.2d\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "zip1 v17.2d, v23.2d, v22.2d\n"
+ "zip2 v21.2d, v20.2d, v19.2d\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q19, [x21], #0x10\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v18.2d, v23.2d, v18.2d\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "zip2 v18.2d, v25.2d, v24.2d\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "zip1 v17.2d, v20.2d, v16.2d\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip2 v16.2d, v20.2d, v16.2d\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip2 v17.2d, v23.2d, v22.2d\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip2 v16.2d, v20.2d, v19.2d\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "str q26, [%x[out_ptr], #0x0]\n"
- "str q22, [%x[out_ptr], #0x10]\n"
- "str q19, [%x[out_ptr], #0x20]\n"
- "str q17, [%x[out_ptr], #0x30]\n"
- "str q24, [%x[out_ptr], #0x40]\n"
- "str q21, [%x[out_ptr], #0x50]\n"
- "str q18, [%x[out_ptr], #0x60]\n"
+ "str q21, [%x[out_ptr], #0x40]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 6f\n"
"tbz %x[width], #1, 4f\n"
- "ldr d27, [x27], #0x8\n"
- "ldr d24, [x26], #0x8\n"
- "mov x19, #0x1\n"
- "ldr d25, [x25], #0x8\n"
+ "ldr d25, [x28], #0x8\n"
+ "ldr d24, [x27], #0x8\n"
+ "mov x20, #0x1\n"
+ "ldr d23, [x26], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
"ldr d21, [x24], #0x8\n"
- "ldr d23, [x23], #0x8\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "ldr d16, [x20], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d18, [x21], #0x8\n"
"tbz %x[width], #0, 5f\n"
- "ld1 { v27.s }[2], [x27]\n"
- "mov x19, #0x2\n"
- "ld1 { v24.s }[2], [x26]\n"
- "ld1 { v25.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x28]\n"
+ "ld1 { v24.s }[2], [x27]\n"
+ "mov x20, #0x2\n"
+ "ld1 { v23.s }[2], [x26]\n"
+ "ld1 { v22.s }[2], [x25]\n"
"ld1 { v21.s }[2], [x24]\n"
- "ld1 { v23.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
- "ld1 { v16.s }[2], [x20]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v18.s }[2], [x21]\n"
"b 5f\n"
"4:" // odd_loads_1_0
- "ldr s27, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr s24, [x26, #0x0]\n"
- "ldr s25, [x25, #0x0]\n"
+ "ldr s25, [x28, #0x0]\n"
+ "ldr s24, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr s23, [x26, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
"ldr s21, [x24, #0x0]\n"
- "ldr s23, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
- "ldr s16, [x20, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s18, [x21, #0x0]\n"
"5:" // Odd load end
- "zip1 v26.2d, v27.2d, v24.2d\n"
- "str q26, [%x[out_ptr], #0x0]\n"
- "zip1 v22.2d, v25.2d, v21.2d\n"
- "subs x19, x19, #0x1\n"
- "zip1 v19.2d, v23.2d, v18.2d\n"
- "str q22, [%x[out_ptr], #0x10]\n"
- "zip1 v17.2d, v20.2d, v16.2d\n"
- "str q19, [%x[out_ptr], #0x20]\n"
- "str q17, [%x[out_ptr], #0x30]\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v21.2d, v20.2d\n"
+ "zip1 v16.2d, v19.2d, v18.2d\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 6f\n"
- "zip2 v24.2d, v27.2d, v24.2d\n"
- "str q24, [%x[out_ptr], #0x0]\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
- "zip2 v18.2d, v23.2d, v18.2d\n"
- "str q21, [%x[out_ptr], #0x10]\n"
- "zip2 v16.2d, v20.2d, v16.2d\n"
- "str q18, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v17.2d, v21.2d, v20.2d\n"
+ "zip2 v16.2d, v19.2d, v18.2d\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"6:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
index 1f86722bc1..e9799f87a9 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,44 +31,45 @@ void interleave_block<8, 4, VLType::None, false>(
)
{
__asm__ __volatile__(
- "ldr x27, [%x[in], #0x0]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "add x28, x28, %x[row_offset], LSL #1\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
"add x27, x27, %x[row_offset], LSL #1\n"
- "ldr x25, [%x[in], #0x10]\n"
- "ldr x24, [%x[in], #0x18]\n"
"add x26, x26, %x[row_offset], LSL #1\n"
- "ldr x23, [%x[in], #0x20]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset], LSL #1\n"
- "ldr x22, [%x[in], #0x28]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset], LSL #1\n"
- "ldr x20, [%x[in], #0x38]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset], LSL #1\n"
"add x22, x22, %x[row_offset], LSL #1\n"
"add x21, x21, %x[row_offset], LSL #1\n"
- "add x20, x20, %x[row_offset], LSL #1\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "prfm pldl1keep, [x27, #0x0]\n"
"cmp %x[width], #0x8\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
"prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -76,146 +77,144 @@ void interleave_block<8, 4, VLType::None, false>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q27, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q19, [x27], #0x10\n"
"subs %x[width], %x[width], #0x8\n"
- "ldr q24, [x26], #0x10\n"
- "zip1 v26.2d, v27.2d, v24.2d\n"
- "ldr q25, [x25], #0x10\n"
"cmp %x[width], #0x8\n"
- "zip2 v24.2d, v27.2d, v24.2d\n"
- "ldr q21, [x24], #0x10\n"
- "ldr q23, [x23], #0x10\n"
- "zip1 v22.2d, v25.2d, v21.2d\n"
- "ldr q18, [x22], #0x10\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
- "ldr q20, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
- "zip1 v19.2d, v23.2d, v18.2d\n"
+ "ldr q25, [x26], #0x10\n"
+ "ldr q24, [x25], #0x10\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
+ "zip1 v18.2d, v25.2d, v24.2d\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "zip1 v17.2d, v23.2d, v22.2d\n"
+ "zip2 v21.2d, v20.2d, v19.2d\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q19, [x21], #0x10\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v18.2d, v23.2d, v18.2d\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "zip2 v18.2d, v25.2d, v24.2d\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "zip1 v17.2d, v20.2d, v16.2d\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip2 v16.2d, v20.2d, v16.2d\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip2 v17.2d, v23.2d, v22.2d\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip2 v16.2d, v20.2d, v19.2d\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "str q26, [%x[out_ptr], #0x0]\n"
- "str q22, [%x[out_ptr], #0x10]\n"
- "str q19, [%x[out_ptr], #0x20]\n"
- "str q17, [%x[out_ptr], #0x30]\n"
- "str q24, [%x[out_ptr], #0x40]\n"
- "str q21, [%x[out_ptr], #0x50]\n"
- "str q18, [%x[out_ptr], #0x60]\n"
+ "str q21, [%x[out_ptr], #0x40]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr d27, [x27], #0x8\n"
- "ldr d24, [x26], #0x8\n"
- "ldr d25, [x25], #0x8\n"
+ "ldr d25, [x28], #0x8\n"
+ "ldr d24, [x27], #0x8\n"
+ "ldr d23, [x26], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
"ldr d21, [x24], #0x8\n"
- "ldr d23, [x23], #0x8\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "ldr d16, [x20], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d18, [x21], #0x8\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v27.s }[2], [x27], #0x4\n"
- "mov x19, #0x2\n"
- "ld1 { v24.s }[2], [x26], #0x4\n"
- "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x28], #0x4\n"
+ "ld1 { v24.s }[2], [x27], #0x4\n"
+ "mov x20, #0x2\n"
+ "ld1 { v23.s }[2], [x26], #0x4\n"
+ "ld1 { v22.s }[2], [x25], #0x4\n"
"ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v23.s }[2], [x23], #0x4\n"
- "ld1 { v18.s }[2], [x22], #0x4\n"
- "ld1 { v20.s }[2], [x21], #0x4\n"
- "ld1 { v16.s }[2], [x20], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v19.s }[2], [x22], #0x4\n"
+ "ld1 { v18.s }[2], [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v27.h }[6], [x27]\n"
- "ld1 { v24.h }[6], [x26]\n"
- "ld1 { v25.h }[6], [x25]\n"
+ "ld1 { v25.h }[6], [x28]\n"
+ "ld1 { v24.h }[6], [x27]\n"
+ "ld1 { v23.h }[6], [x26]\n"
+ "ld1 { v22.h }[6], [x25]\n"
"ld1 { v21.h }[6], [x24]\n"
- "ld1 { v23.h }[6], [x23]\n"
- "ld1 { v18.h }[6], [x22]\n"
- "ld1 { v20.h }[6], [x21]\n"
- "ld1 { v16.h }[6], [x20]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v19.h }[6], [x22]\n"
+ "ld1 { v18.h }[6], [x21]\n"
"b 7f\n"
"4:" // odd_loads_1_4
- "mov x19, #0x1\n"
+ "mov x20, #0x1\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v27.h }[4], [x27]\n"
- "ld1 { v24.h }[4], [x26]\n"
- "mov x19, #0x2\n"
- "ld1 { v25.h }[4], [x25]\n"
+ "ld1 { v25.h }[4], [x28]\n"
+ "ld1 { v24.h }[4], [x27]\n"
+ "mov x20, #0x2\n"
+ "ld1 { v23.h }[4], [x26]\n"
+ "ld1 { v22.h }[4], [x25]\n"
"ld1 { v21.h }[4], [x24]\n"
- "ld1 { v23.h }[4], [x23]\n"
- "ld1 { v18.h }[4], [x22]\n"
- "ld1 { v20.h }[4], [x21]\n"
- "ld1 { v16.h }[4], [x20]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v19.h }[4], [x22]\n"
+ "ld1 { v18.h }[4], [x21]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr s27, [x27], #0x4\n"
- "ldr s24, [x26], #0x4\n"
- "mov x19, #0x1\n"
- "ldr s25, [x25], #0x4\n"
+ "ldr s25, [x28], #0x4\n"
+ "ldr s24, [x27], #0x4\n"
+ "mov x20, #0x1\n"
+ "ldr s23, [x26], #0x4\n"
+ "ldr s22, [x25], #0x4\n"
"ldr s21, [x24], #0x4\n"
- "ldr s23, [x23], #0x4\n"
- "ldr s18, [x22], #0x4\n"
- "ldr s20, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s19, [x22], #0x4\n"
+ "ldr s18, [x21], #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v27.h }[2], [x27]\n"
- "ld1 { v24.h }[2], [x26]\n"
- "ld1 { v25.h }[2], [x25]\n"
+ "ld1 { v25.h }[2], [x28]\n"
+ "ld1 { v24.h }[2], [x27]\n"
+ "ld1 { v23.h }[2], [x26]\n"
+ "ld1 { v22.h }[2], [x25]\n"
"ld1 { v21.h }[2], [x24]\n"
- "ld1 { v23.h }[2], [x23]\n"
- "ld1 { v18.h }[2], [x22]\n"
- "ld1 { v20.h }[2], [x21]\n"
- "ld1 { v16.h }[2], [x20]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v19.h }[2], [x22]\n"
+ "ld1 { v18.h }[2], [x21]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr h27, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr h24, [x26, #0x0]\n"
- "ldr h25, [x25, #0x0]\n"
+ "ldr h25, [x28, #0x0]\n"
+ "ldr h24, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr h23, [x26, #0x0]\n"
+ "ldr h22, [x25, #0x0]\n"
"ldr h21, [x24, #0x0]\n"
- "ldr h23, [x23, #0x0]\n"
- "ldr h18, [x22, #0x0]\n"
- "ldr h20, [x21, #0x0]\n"
- "ldr h16, [x20, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h19, [x22, #0x0]\n"
+ "ldr h18, [x21, #0x0]\n"
"7:" // Odd load end
- "zip1 v26.2d, v27.2d, v24.2d\n"
- "str q26, [%x[out_ptr], #0x0]\n"
- "zip1 v22.2d, v25.2d, v21.2d\n"
- "subs x19, x19, #0x1\n"
- "zip1 v19.2d, v23.2d, v18.2d\n"
- "str q22, [%x[out_ptr], #0x10]\n"
- "zip1 v17.2d, v20.2d, v16.2d\n"
- "str q19, [%x[out_ptr], #0x20]\n"
- "str q17, [%x[out_ptr], #0x30]\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v21.2d, v20.2d\n"
+ "zip1 v16.2d, v19.2d, v18.2d\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 8f\n"
- "zip2 v24.2d, v27.2d, v24.2d\n"
- "str q24, [%x[out_ptr], #0x0]\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
- "zip2 v18.2d, v23.2d, v18.2d\n"
- "str q21, [%x[out_ptr], #0x10]\n"
- "zip2 v16.2d, v20.2d, v16.2d\n"
- "str q18, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v17.2d, v21.2d, v20.2d\n"
+ "zip2 v16.2d, v19.2d, v18.2d\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"8:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
new file mode 100644
index 0000000000..730bfd6342
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>
+void interleave_block<8, 4, VLType::None, false>(
+ bfloat16 * &out_ptr, const float * const * in, size_t width, size_t height,
+ size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
+ "cmp %x[height], #0x8\n"
+ "add x28, x28, %x[row_offset], LSL #2\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
+ "add x27, x27, %x[row_offset], LSL #2\n"
+ "add x26, x26, %x[row_offset], LSL #2\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
+ "add x25, x25, %x[row_offset], LSL #2\n"
+ "add x24, x24, %x[row_offset], LSL #2\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset], LSL #2\n"
+ "add x22, x22, %x[row_offset], LSL #2\n"
+ "add x21, x21, %x[row_offset], LSL #2\n"
+ "beq 1f\n"
+ "cmp %x[height], #0x2\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
+ "cmp %x[height], #0x4\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
+ "cmp %x[height], #0x6\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
+ "1:" // no_pointer_adj
+ "cmp %x[width], #0x4\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x0]\n"
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "blt 3f\n"
+ "2:" // Main loop head
+ "ldr q17, [x28], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ ".inst 0x0ea16a37 // bfcvtn v23.4h, v17.4s\n"
+ ".inst 0x0ea16a16 // bfcvtn v22.4h, v16.4s\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ ".inst 0x0ea16a35 // bfcvtn v21.4h, v17.4s\n"
+ ".inst 0x0ea16a14 // bfcvtn v20.4h, v16.4s\n"
+ "ldr q19, [x27], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "subs %x[width], %x[width], #0x4\n"
+ "cmp %x[width], #0x4\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ ".inst 0x4ea16a77 // bfcvtn2 v23.8h, v19.4s\n"
+ ".inst 0x4ea16a56 // bfcvtn2 v22.8h, v18.4s\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
+ "prfm pldl1keep, [x27, #0x70]\n"
+ ".inst 0x4ea16a35 // bfcvtn2 v21.8h, v17.4s\n"
+ ".inst 0x4ea16a14 // bfcvtn2 v20.8h, v16.4s\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "str q22, [%x[out_ptr], #0x10]\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "str q21, [%x[out_ptr], #0x20]\n"
+ "str q20, [%x[out_ptr], #0x30]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "bge 2b\n"
+ "3:" // Main loop skip
+ "cbz %x[width], 6f\n"
+ "tbz %x[width], #1, 4f\n"
+ "ldr d19, [x28], #0x8\n"
+ "ldr d23, [x27], #0x8\n"
+ "mov x20, #0x1\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "tbz %x[width], #0, 5f\n"
+ "ld1 { v19.s }[2], [x28]\n"
+ "ld1 { v23.s }[2], [x27]\n"
+ "ld1 { v18.s }[2], [x26]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "b 5f\n"
+ "4:" // odd_loads_1_0
+ "ldr s19, [x28, #0x0]\n"
+ "ldr s23, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr s18, [x26, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "5:" // Odd load end
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x4ea16af3 // bfcvtn2 v19.8h, v23.4s\n"
+ ".inst 0x4ea16ad2 // bfcvtn2 v18.8h, v22.4s\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
+ ".inst 0x4ea16ab1 // bfcvtn2 v17.8h, v21.4s\n"
+ ".inst 0x4ea16a90 // bfcvtn2 v16.8h, v20.4s\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "6:" // Odds skip
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+
+#endif // __aarch64__
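
The new kernel above fuses the fp32-to-bf16 narrowing into the block-of-4 interleave: BFCVTN/BFCVTN2 convert four floats at a time into the low and high halves of one q register, rows missing when height < 8 are redirected to row 0 by the csel/mov setup, and each group of four input columns emits one 4-element bf16 block per row (8 rows x 8 bytes = the 0x40 stride on out_ptr). A minimal scalar sketch of that output layout follows; the names are ours, the conversion is shown as plain truncation where BFCVTN rounds to nearest even, and the tail is simplified to zero padding.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    using bf16_t = std::uint16_t; // raw bf16 bit pattern

    // Truncating fp32 -> bf16 (keeps the top 16 bits). Only bit-exact with
    // the kernel's BFCVTN for values that need no rounding.
    static bf16_t to_bf16(float f) {
        std::uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));
        return static_cast<bf16_t>(bits >> 16);
    }

    // Hypothetical reference for the layout produced above: for every group
    // of four columns, each of the eight rows contributes four bf16 values
    // in row order.
    void interleave8_block4_ref(bf16_t *out, const float *const *in,
                                std::size_t width, std::size_t height,
                                std::size_t row_offset) {
        for (std::size_t col = 0; col < width; col += 4) {
            for (std::size_t row = 0; row < 8; row++) {
                // Rows beyond 'height' replicate row 0, as the csel setup does.
                const float *src = in[row < height ? row : 0] + row_offset;
                for (std::size_t i = 0; i < 4; i++) {
                    *out++ = to_bf16(col + i < width ? src[col + i] : 0.0f);
                }
            }
        }
    }
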
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
index 659d9947e2..15d8ddbe53 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,44 +31,45 @@ void interleave_block<8, 4, VLType::None, false>(
)
{
__asm__ __volatile__(
- "ldr x27, [%x[in], #0x0]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "add x28, x28, %x[row_offset]\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
"add x27, x27, %x[row_offset]\n"
- "ldr x25, [%x[in], #0x10]\n"
- "ldr x24, [%x[in], #0x18]\n"
"add x26, x26, %x[row_offset]\n"
- "ldr x23, [%x[in], #0x20]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset]\n"
- "ldr x22, [%x[in], #0x28]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset]\n"
- "ldr x20, [%x[in], #0x38]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
- "add x20, x20, %x[row_offset]\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "prfm pldl1keep, [x27, #0x0]\n"
"cmp %x[width], #0x10\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
"prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -76,49 +77,48 @@ void interleave_block<8, 4, VLType::None, false>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q28, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q18, [x27], #0x10\n"
"subs %x[width], %x[width], #0x10\n"
- "ldr q29, [x26], #0x10\n"
"cmp %x[width], #0x10\n"
- "ldr q25, [x25], #0x10\n"
- "zip1 v22.4s, v28.4s, v25.4s\n"
- "ldr q21, [x24], #0x10\n"
- "zip2 v28.4s, v28.4s, v25.4s\n"
- "ldr q27, [x23], #0x10\n"
- "ldr q26, [x22], #0x10\n"
- "zip1 v20.4s, v29.4s, v21.4s\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v25.4s, v29.4s, v21.4s\n"
- "ldr q24, [x20], #0x10\n"
- "zip1 v23.4s, v22.4s, v20.4s\n"
+ "ldr q17, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v25.4s, v20.4s, v17.4s\n"
+ "zip1 v24.4s, v18.4s, v16.4s\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q23, [x23], #0x10\n"
+ "zip2 v22.4s, v20.4s, v17.4s\n"
+ "zip2 v21.4s, v18.4s, v16.4s\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "zip1 v20.4s, v19.4s, v18.4s\n"
+ "zip1 v17.4s, v23.4s, v16.4s\n"
+ "zip2 v19.4s, v19.4s, v18.4s\n"
+ "zip2 v18.4s, v23.4s, v16.4s\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v22.4s, v22.4s, v20.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "zip1 v21.4s, v28.4s, v25.4s\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v18.4s, v27.4s, v19.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "zip1 v16.4s, v26.4s, v24.4s\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v20.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip2 v20.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip2 v19.4s, v27.4s, v19.4s\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip2 v16.4s, v26.4s, v24.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v19.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x10]\n"
- "zip2 v17.4s, v28.4s, v25.4s\n"
- "str q22, [%x[out_ptr], #0x20]\n"
- "zip2 v16.4s, v19.4s, v16.4s\n"
- "str q20, [%x[out_ptr], #0x30]\n"
- "str q21, [%x[out_ptr], #0x40]\n"
- "str q18, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.4s, v20.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip1 v16.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x40]\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "zip2 v17.4s, v22.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
"str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
@@ -126,204 +126,203 @@ void interleave_block<8, 4, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 12f\n"
"tbz %x[width], #3, 7f\n"
- "ldr d28, [x27], #0x8\n"
- "ldr d29, [x26], #0x8\n"
+ "ldr d28, [x28], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
"ldr d25, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d26, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
- "ldr d24, [x20], #0x8\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
"tbz %x[width], #2, 5f\n"
- "ld1 { v28.s }[2], [x27], #0x4\n"
- "ld1 { v29.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x28], #0x4\n"
+ "ld1 { v27.s }[2], [x27], #0x4\n"
+ "ld1 { v26.s }[2], [x26], #0x4\n"
"ld1 { v25.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
- "ld1 { v26.s }[2], [x22], #0x4\n"
- "ld1 { v19.s }[2], [x21], #0x4\n"
- "ld1 { v24.s }[2], [x20], #0x4\n"
+ "ld1 { v24.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v21.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v28.h }[6], [x27], #0x2\n"
- "mov x19, #0x4\n"
- "ld1 { v29.h }[6], [x26], #0x2\n"
+ "ld1 { v28.h }[6], [x28], #0x2\n"
+ "ld1 { v27.h }[6], [x27], #0x2\n"
+ "mov x20, #0x4\n"
+ "ld1 { v26.h }[6], [x26], #0x2\n"
"ld1 { v25.h }[6], [x25], #0x2\n"
- "ld1 { v21.h }[6], [x24], #0x2\n"
- "ld1 { v27.h }[6], [x23], #0x2\n"
- "ld1 { v26.h }[6], [x22], #0x2\n"
- "ld1 { v19.h }[6], [x21], #0x2\n"
- "ld1 { v24.h }[6], [x20], #0x2\n"
+ "ld1 { v24.h }[6], [x24], #0x2\n"
+ "ld1 { v23.h }[6], [x23], #0x2\n"
+ "ld1 { v22.h }[6], [x22], #0x2\n"
+ "ld1 { v21.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v28.b }[14], [x27]\n"
- "ld1 { v29.b }[14], [x26]\n"
+ "ld1 { v28.b }[14], [x28]\n"
+ "ld1 { v27.b }[14], [x27]\n"
+ "ld1 { v26.b }[14], [x26]\n"
"ld1 { v25.b }[14], [x25]\n"
- "ld1 { v21.b }[14], [x24]\n"
- "ld1 { v27.b }[14], [x23]\n"
- "ld1 { v26.b }[14], [x22]\n"
- "ld1 { v19.b }[14], [x21]\n"
- "ld1 { v24.b }[14], [x20]\n"
+ "ld1 { v24.b }[14], [x24]\n"
+ "ld1 { v23.b }[14], [x23]\n"
+ "ld1 { v22.b }[14], [x22]\n"
+ "ld1 { v21.b }[14], [x21]\n"
"b 11f\n"
"4:" // odd_loads_1_12
- "mov x19, #0x3\n"
+ "mov x20, #0x3\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v28.b }[12], [x27]\n"
- "ld1 { v29.b }[12], [x26]\n"
- "mov x19, #0x4\n"
+ "ld1 { v28.b }[12], [x28]\n"
+ "ld1 { v27.b }[12], [x27]\n"
+ "mov x20, #0x4\n"
+ "ld1 { v26.b }[12], [x26]\n"
"ld1 { v25.b }[12], [x25]\n"
- "ld1 { v21.b }[12], [x24]\n"
- "ld1 { v27.b }[12], [x23]\n"
- "ld1 { v26.b }[12], [x22]\n"
- "ld1 { v19.b }[12], [x21]\n"
- "ld1 { v24.b }[12], [x20]\n"
+ "ld1 { v24.b }[12], [x24]\n"
+ "ld1 { v23.b }[12], [x23]\n"
+ "ld1 { v22.b }[12], [x22]\n"
+ "ld1 { v21.b }[12], [x21]\n"
"b 11f\n"
"5:" // odd_loads_2_8
"tbz %x[width], #1, 6f\n"
- "ld1 { v28.h }[4], [x27], #0x2\n"
- "ld1 { v29.h }[4], [x26], #0x2\n"
- "mov x19, #0x3\n"
+ "ld1 { v28.h }[4], [x28], #0x2\n"
+ "ld1 { v27.h }[4], [x27], #0x2\n"
+ "mov x20, #0x3\n"
+ "ld1 { v26.h }[4], [x26], #0x2\n"
"ld1 { v25.h }[4], [x25], #0x2\n"
- "ld1 { v21.h }[4], [x24], #0x2\n"
- "ld1 { v27.h }[4], [x23], #0x2\n"
- "ld1 { v26.h }[4], [x22], #0x2\n"
- "ld1 { v19.h }[4], [x21], #0x2\n"
- "ld1 { v24.h }[4], [x20], #0x2\n"
+ "ld1 { v24.h }[4], [x24], #0x2\n"
+ "ld1 { v23.h }[4], [x23], #0x2\n"
+ "ld1 { v22.h }[4], [x22], #0x2\n"
+ "ld1 { v21.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v28.b }[10], [x27]\n"
- "ld1 { v29.b }[10], [x26]\n"
+ "ld1 { v28.b }[10], [x28]\n"
+ "ld1 { v27.b }[10], [x27]\n"
+ "ld1 { v26.b }[10], [x26]\n"
"ld1 { v25.b }[10], [x25]\n"
- "ld1 { v21.b }[10], [x24]\n"
- "ld1 { v27.b }[10], [x23]\n"
- "ld1 { v26.b }[10], [x22]\n"
- "ld1 { v19.b }[10], [x21]\n"
- "ld1 { v24.b }[10], [x20]\n"
+ "ld1 { v24.b }[10], [x24]\n"
+ "ld1 { v23.b }[10], [x23]\n"
+ "ld1 { v22.b }[10], [x22]\n"
+ "ld1 { v21.b }[10], [x21]\n"
"b 11f\n"
"6:" // odd_loads_1_8
- "mov x19, #0x2\n"
+ "mov x20, #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v28.b }[8], [x27]\n"
- "ld1 { v29.b }[8], [x26]\n"
- "mov x19, #0x3\n"
+ "ld1 { v28.b }[8], [x28]\n"
+ "ld1 { v27.b }[8], [x27]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v26.b }[8], [x26]\n"
"ld1 { v25.b }[8], [x25]\n"
- "ld1 { v21.b }[8], [x24]\n"
- "ld1 { v27.b }[8], [x23]\n"
- "ld1 { v26.b }[8], [x22]\n"
- "ld1 { v19.b }[8], [x21]\n"
- "ld1 { v24.b }[8], [x20]\n"
+ "ld1 { v24.b }[8], [x24]\n"
+ "ld1 { v23.b }[8], [x23]\n"
+ "ld1 { v22.b }[8], [x22]\n"
+ "ld1 { v21.b }[8], [x21]\n"
"b 11f\n"
"7:" // odd_loads_4_0
"tbz %x[width], #2, 9f\n"
- "ldr s28, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x28], #0x4\n"
+ "ldr s27, [x27], #0x4\n"
+ "ldr s26, [x26], #0x4\n"
"ldr s25, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
- "ldr s26, [x22], #0x4\n"
- "ldr s19, [x21], #0x4\n"
- "ldr s24, [x20], #0x4\n"
+ "ldr s24, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "ldr s21, [x21], #0x4\n"
"tbz %x[width], #1, 8f\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
- "mov x19, #0x2\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x28], #0x2\n"
+ "ld1 { v27.h }[2], [x27], #0x2\n"
+ "mov x20, #0x2\n"
+ "ld1 { v26.h }[2], [x26], #0x2\n"
"ld1 { v25.h }[2], [x25], #0x2\n"
- "ld1 { v21.h }[2], [x24], #0x2\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
- "ld1 { v19.h }[2], [x21], #0x2\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "ld1 { v24.h }[2], [x24], #0x2\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v22.h }[2], [x22], #0x2\n"
+ "ld1 { v21.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v28.b }[6], [x27]\n"
- "ld1 { v29.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x28]\n"
+ "ld1 { v27.b }[6], [x27]\n"
+ "ld1 { v26.b }[6], [x26]\n"
"ld1 { v25.b }[6], [x25]\n"
- "ld1 { v21.b }[6], [x24]\n"
- "ld1 { v27.b }[6], [x23]\n"
- "ld1 { v26.b }[6], [x22]\n"
- "ld1 { v19.b }[6], [x21]\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x24]\n"
+ "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v22.b }[6], [x22]\n"
+ "ld1 { v21.b }[6], [x21]\n"
"b 11f\n"
"8:" // odd_loads_1_4
- "mov x19, #0x1\n"
+ "mov x20, #0x1\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v28.b }[4], [x27]\n"
- "ld1 { v29.b }[4], [x26]\n"
- "mov x19, #0x2\n"
+ "ld1 { v28.b }[4], [x28]\n"
+ "ld1 { v27.b }[4], [x27]\n"
+ "mov x20, #0x2\n"
+ "ld1 { v26.b }[4], [x26]\n"
"ld1 { v25.b }[4], [x25]\n"
- "ld1 { v21.b }[4], [x24]\n"
- "ld1 { v27.b }[4], [x23]\n"
- "ld1 { v26.b }[4], [x22]\n"
- "ld1 { v19.b }[4], [x21]\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "ld1 { v24.b }[4], [x24]\n"
+ "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v22.b }[4], [x22]\n"
+ "ld1 { v21.b }[4], [x21]\n"
"b 11f\n"
"9:" // odd_loads_2_0
"tbz %x[width], #1, 10f\n"
- "ldr h28, [x27], #0x2\n"
- "ldr h29, [x26], #0x2\n"
- "mov x19, #0x1\n"
+ "ldr h28, [x28], #0x2\n"
+ "ldr h27, [x27], #0x2\n"
+ "mov x20, #0x1\n"
+ "ldr h26, [x26], #0x2\n"
"ldr h25, [x25], #0x2\n"
- "ldr h21, [x24], #0x2\n"
- "ldr h27, [x23], #0x2\n"
- "ldr h26, [x22], #0x2\n"
- "ldr h19, [x21], #0x2\n"
- "ldr h24, [x20], #0x2\n"
+ "ldr h24, [x24], #0x2\n"
+ "ldr h23, [x23], #0x2\n"
+ "ldr h22, [x22], #0x2\n"
+ "ldr h21, [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v28.b }[2], [x27]\n"
- "ld1 { v29.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x28]\n"
+ "ld1 { v27.b }[2], [x27]\n"
+ "ld1 { v26.b }[2], [x26]\n"
"ld1 { v25.b }[2], [x25]\n"
- "ld1 { v21.b }[2], [x24]\n"
- "ld1 { v27.b }[2], [x23]\n"
- "ld1 { v26.b }[2], [x22]\n"
- "ld1 { v19.b }[2], [x21]\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x24]\n"
+ "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v22.b }[2], [x22]\n"
+ "ld1 { v21.b }[2], [x21]\n"
"b 11f\n"
"10:" // odd_loads_1_0
- "ldr b28, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr b29, [x26, #0x0]\n"
+ "ldr b28, [x28, #0x0]\n"
+ "ldr b27, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr b26, [x26, #0x0]\n"
"ldr b25, [x25, #0x0]\n"
- "ldr b21, [x24, #0x0]\n"
- "ldr b27, [x23, #0x0]\n"
- "ldr b26, [x22, #0x0]\n"
- "ldr b19, [x21, #0x0]\n"
- "ldr b24, [x20, #0x0]\n"
+ "ldr b24, [x24, #0x0]\n"
+ "ldr b23, [x23, #0x0]\n"
+ "ldr b22, [x22, #0x0]\n"
+ "ldr b21, [x21, #0x0]\n"
"11:" // Odd load end
- "zip1 v22.4s, v28.4s, v25.4s\n"
- "subs x19, x19, #0x1\n"
- "zip1 v20.4s, v29.4s, v21.4s\n"
- "zip1 v23.4s, v22.4s, v20.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v27.4s, v19.4s\n"
- "zip1 v16.4s, v26.4s, v24.4s\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x10]\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 12f\n"
- "zip2 v22.4s, v22.4s, v20.4s\n"
- "str q22, [%x[out_ptr], #0x0]\n"
- "zip2 v20.4s, v18.4s, v16.4s\n"
- "subs x19, x19, #0x1\n"
- "str q20, [%x[out_ptr], #0x10]\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 12f\n"
- "zip2 v28.4s, v28.4s, v25.4s\n"
- "zip2 v25.4s, v29.4s, v21.4s\n"
- "subs x19, x19, #0x1\n"
- "zip1 v21.4s, v28.4s, v25.4s\n"
- "str q21, [%x[out_ptr], #0x0]\n"
- "zip2 v19.4s, v27.4s, v19.4s\n"
- "zip2 v16.4s, v26.4s, v24.4s\n"
- "zip1 v18.4s, v19.4s, v16.4s\n"
- "str q18, [%x[out_ptr], #0x10]\n"
+ "zip2 v20.4s, v28.4s, v26.4s\n"
+ "zip2 v19.4s, v27.4s, v25.4s\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 12f\n"
- "zip2 v17.4s, v28.4s, v25.4s\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "zip2 v16.4s, v19.4s, v16.4s\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"12:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
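
In the restructured main loop above, the block-of-4 interleave of eight int8 rows is built from two rounds of 32-bit zips: because each zip1/zip2 on .4s lanes moves a whole 4-byte block, zipping rows (0,2) and (1,3) and then zipping those results emits the blocks of rows 0-3 in order (rows 4-7 are handled the same way). A small scalar illustration of the trick, with our own names:

    #include <array>
    #include <cstdint>

    using Vec4 = std::array<std::uint32_t, 4>; // one q register viewed as .4s

    static Vec4 zip1(const Vec4 &a, const Vec4 &b) { return {a[0], b[0], a[1], b[1]}; }
    static Vec4 zip2(const Vec4 &a, const Vec4 &b) { return {a[2], b[2], a[3], b[3]}; }

    struct BlockPair { Vec4 blk0, blk1; };

    // Each uint32_t is one 4-byte block of a row; rN holds blocks 0..3 of row N.
    BlockPair interleave4(const Vec4 &r0, const Vec4 &r1,
                          const Vec4 &r2, const Vec4 &r3) {
        Vec4 a = zip1(r0, r2);  // r0.b0, r2.b0, r0.b1, r2.b1
        Vec4 b = zip1(r1, r3);  // r1.b0, r3.b0, r1.b1, r3.b1
        return {zip1(a, b),     // block 0 of rows 0..3, in row order
                zip2(a, b)};    // block 1 of rows 0..3, in row order
    }
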
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
index dfec94c952..6c41b5fdfb 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,40 +31,41 @@ void interleave_block<8, 4, VLType::None, true>(
)
{
__asm__ __volatile__(
- "movi v1.8h, #0x0\n"
- "ldr x27, [%x[in], #0x0]\n"
- "mov x19, #0x0\n"
- "movi v0.8h, #0x0\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "mov x20, #0x0\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
+ "movi v2.8h, #0x0\n"
+ "movi v1.8h, #0x0\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
+ "movi v0.4s, #0x0\n"
"movi v31.4s, #0x0\n"
- "ldr x25, [%x[in], #0x10]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
+ "add x28, x28, %x[row_offset]\n"
"add x27, x27, %x[row_offset]\n"
- "movi v30.4s, #0x0\n"
- "ldr x24, [%x[in], #0x18]\n"
- "ldr x23, [%x[in], #0x20]\n"
"add x26, x26, %x[row_offset]\n"
- "ldr x22, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset]\n"
- "ldr x20, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
- "add x20, x20, %x[row_offset]\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
+ "prfm pldl1keep, [x28, #0x0]\n"
"prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
@@ -72,7 +73,7 @@ void interleave_block<8, 4, VLType::None, true>(
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -80,71 +81,70 @@ void interleave_block<8, 4, VLType::None, true>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"cbnz %w[first], 2f\n"
"sub %x[out_ptr], %x[out_ptr], #0x20\n"
- "ld1 { v31.4s }, [%x[out_ptr]]\n"
- "ldr q30, [%x[out_ptr], #0x10]\n"
+ "ld1 { v0.4s }, [%x[out_ptr]]\n"
+ "ldr q31, [%x[out_ptr], #0x10]\n"
"2:" // first_pass
"cmp %x[width], #0x10\n"
"blt 5f\n"
"3:" // Main loop head
- "cmp x19, #0x1e\n"
+ "cmp x20, #0x1e\n"
"ble 4f\n"
+ "sadalp v0.4s, v2.8h\n"
+ "movi v2.8h, #0x0\n"
+ "mov x20, #0x0\n"
"sadalp v31.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
- "sadalp v30.4s, v0.8h\n"
- "movi v0.8h, #0x0\n"
- "mov x19, #0x0\n"
"4:" // no_accumulate_16
- "ldr q28, [x27], #0x10\n"
- "add x19, x19, #0x1\n"
- "ldr q29, [x26], #0x10\n"
+ "ldr q30, [x28], #0x10\n"
+ "ldr q29, [x27], #0x10\n"
"subs %x[width], %x[width], #0x10\n"
- "ldr q25, [x25], #0x10\n"
- "zip1 v22.4s, v28.4s, v25.4s\n"
- "ldr q21, [x24], #0x10\n"
"cmp %x[width], #0x10\n"
- "zip2 v28.4s, v28.4s, v25.4s\n"
- "ldr q27, [x23], #0x10\n"
- "ldr q26, [x22], #0x10\n"
- "zip1 v20.4s, v29.4s, v21.4s\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v25.4s, v29.4s, v21.4s\n"
- "ldr q24, [x20], #0x10\n"
- "zip1 v23.4s, v22.4s, v20.4s\n"
+ "ldr q28, [x26], #0x10\n"
+ "ldr q27, [x25], #0x10\n"
+ "zip1 v22.4s, v30.4s, v28.4s\n"
+ "zip1 v21.4s, v29.4s, v27.4s\n"
+ "ldr q20, [x24], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
+ "zip1 v25.4s, v22.4s, v21.4s\n"
+ "sadalp v2.8h, v25.16b\n"
+ "ldr q19, [x22], #0x10\n"
+ "ldr q18, [x21], #0x10\n"
+ "zip1 v17.4s, v20.4s, v19.4s\n"
+ "zip1 v16.4s, v26.4s, v18.4s\n"
+ "zip1 v24.4s, v17.4s, v16.4s\n"
+ "sadalp v1.8h, v24.16b\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "sadalp v1.8h, v23.16b\n"
- "zip2 v22.4s, v22.4s, v20.4s\n"
+ "zip2 v23.4s, v22.4s, v21.4s\n"
+ "zip2 v22.4s, v17.4s, v16.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "sadalp v1.8h, v22.16b\n"
- "zip1 v18.4s, v27.4s, v19.4s\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v16.4s, v26.4s, v24.4s\n"
+ "zip2 v21.4s, v30.4s, v28.4s\n"
+ "zip2 v17.4s, v29.4s, v27.4s\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "zip1 v21.4s, v28.4s, v25.4s\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "sadalp v1.8h, v21.16b\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
+ "zip2 v20.4s, v20.4s, v19.4s\n"
+ "zip2 v16.4s, v26.4s, v18.4s\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip2 v20.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "sadalp v0.8h, v17.16b\n"
- "zip2 v19.4s, v27.4s, v19.4s\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "sadalp v0.8h, v20.16b\n"
- "zip2 v16.4s, v26.4s, v24.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v19.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x10]\n"
- "sadalp v0.8h, v18.16b\n"
- "zip2 v17.4s, v28.4s, v25.4s\n"
- "str q22, [%x[out_ptr], #0x20]\n"
- "zip2 v16.4s, v19.4s, v16.4s\n"
- "str q20, [%x[out_ptr], #0x30]\n"
- "sadalp v1.8h, v17.16b\n"
- "str q21, [%x[out_ptr], #0x40]\n"
- "sadalp v0.8h, v16.16b\n"
+ "sadalp v2.8h, v23.16b\n"
+ "sadalp v1.8h, v22.16b\n"
+ "str q25, [%x[out_ptr], #0x0]\n"
+ "add x20, x20, #0x1\n"
+ "zip1 v19.4s, v21.4s, v17.4s\n"
+ "zip1 v18.4s, v20.4s, v16.4s\n"
+ "str q24, [%x[out_ptr], #0x10]\n"
+ "sadalp v2.8h, v19.16b\n"
+ "sadalp v1.8h, v18.16b\n"
+ "str q23, [%x[out_ptr], #0x20]\n"
+ "zip2 v17.4s, v21.4s, v17.4s\n"
+ "zip2 v16.4s, v20.4s, v16.4s\n"
+ "str q22, [%x[out_ptr], #0x30]\n"
+ "str q19, [%x[out_ptr], #0x40]\n"
+ "sadalp v2.8h, v17.16b\n"
+ "sadalp v1.8h, v16.16b\n"
"str q18, [%x[out_ptr], #0x50]\n"
"str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
@@ -153,216 +153,216 @@ void interleave_block<8, 4, VLType::None, true>(
"5:" // Main loop skip
"cbz %x[width], 14f\n"
"tbz %x[width], #3, 9f\n"
+ "ldr d29, [x28], #0x8\n"
"ldr d28, [x27], #0x8\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d25, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d26, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
- "ldr d24, [x20], #0x8\n"
+ "ldr d27, [x26], #0x8\n"
+ "ldr d26, [x25], #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
"tbz %x[width], #2, 7f\n"
+ "ld1 { v29.s }[2], [x28], #0x4\n"
"ld1 { v28.s }[2], [x27], #0x4\n"
- "ld1 { v29.s }[2], [x26], #0x4\n"
- "ld1 { v25.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
- "ld1 { v26.s }[2], [x22], #0x4\n"
- "ld1 { v19.s }[2], [x21], #0x4\n"
- "ld1 { v24.s }[2], [x20], #0x4\n"
+ "ld1 { v27.s }[2], [x26], #0x4\n"
+ "ld1 { v26.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x24], #0x4\n"
+ "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ld1 { v23.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 6f\n"
+ "ld1 { v29.h }[6], [x28], #0x2\n"
"ld1 { v28.h }[6], [x27], #0x2\n"
- "mov x19, #0x4\n"
- "ld1 { v29.h }[6], [x26], #0x2\n"
- "ld1 { v25.h }[6], [x25], #0x2\n"
- "ld1 { v21.h }[6], [x24], #0x2\n"
- "ld1 { v27.h }[6], [x23], #0x2\n"
- "ld1 { v26.h }[6], [x22], #0x2\n"
- "ld1 { v19.h }[6], [x21], #0x2\n"
- "ld1 { v24.h }[6], [x20], #0x2\n"
+ "mov x20, #0x4\n"
+ "ld1 { v27.h }[6], [x26], #0x2\n"
+ "ld1 { v26.h }[6], [x25], #0x2\n"
+ "ld1 { v25.h }[6], [x24], #0x2\n"
+ "ld1 { v24.h }[6], [x23], #0x2\n"
+ "ld1 { v23.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[14], [x28]\n"
"ld1 { v28.b }[14], [x27]\n"
- "ld1 { v29.b }[14], [x26]\n"
- "ld1 { v25.b }[14], [x25]\n"
- "ld1 { v21.b }[14], [x24]\n"
- "ld1 { v27.b }[14], [x23]\n"
- "ld1 { v26.b }[14], [x22]\n"
- "ld1 { v19.b }[14], [x21]\n"
- "ld1 { v24.b }[14], [x20]\n"
+ "ld1 { v27.b }[14], [x26]\n"
+ "ld1 { v26.b }[14], [x25]\n"
+ "ld1 { v25.b }[14], [x24]\n"
+ "ld1 { v24.b }[14], [x23]\n"
+ "ld1 { v23.b }[14], [x22]\n"
+ "ld1 { v22.b }[14], [x21]\n"
"b 13f\n"
"6:" // odd_loads_1_12
- "mov x19, #0x3\n"
+ "mov x20, #0x3\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[12], [x28]\n"
"ld1 { v28.b }[12], [x27]\n"
- "ld1 { v29.b }[12], [x26]\n"
- "mov x19, #0x4\n"
- "ld1 { v25.b }[12], [x25]\n"
- "ld1 { v21.b }[12], [x24]\n"
- "ld1 { v27.b }[12], [x23]\n"
- "ld1 { v26.b }[12], [x22]\n"
- "ld1 { v19.b }[12], [x21]\n"
- "ld1 { v24.b }[12], [x20]\n"
+ "mov x20, #0x4\n"
+ "ld1 { v27.b }[12], [x26]\n"
+ "ld1 { v26.b }[12], [x25]\n"
+ "ld1 { v25.b }[12], [x24]\n"
+ "ld1 { v24.b }[12], [x23]\n"
+ "ld1 { v23.b }[12], [x22]\n"
+ "ld1 { v22.b }[12], [x21]\n"
"b 13f\n"
"7:" // odd_loads_2_8
"tbz %x[width], #1, 8f\n"
+ "ld1 { v29.h }[4], [x28], #0x2\n"
"ld1 { v28.h }[4], [x27], #0x2\n"
- "ld1 { v29.h }[4], [x26], #0x2\n"
- "mov x19, #0x3\n"
- "ld1 { v25.h }[4], [x25], #0x2\n"
- "ld1 { v21.h }[4], [x24], #0x2\n"
- "ld1 { v27.h }[4], [x23], #0x2\n"
- "ld1 { v26.h }[4], [x22], #0x2\n"
- "ld1 { v19.h }[4], [x21], #0x2\n"
- "ld1 { v24.h }[4], [x20], #0x2\n"
+ "mov x20, #0x3\n"
+ "ld1 { v27.h }[4], [x26], #0x2\n"
+ "ld1 { v26.h }[4], [x25], #0x2\n"
+ "ld1 { v25.h }[4], [x24], #0x2\n"
+ "ld1 { v24.h }[4], [x23], #0x2\n"
+ "ld1 { v23.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[10], [x28]\n"
"ld1 { v28.b }[10], [x27]\n"
- "ld1 { v29.b }[10], [x26]\n"
- "ld1 { v25.b }[10], [x25]\n"
- "ld1 { v21.b }[10], [x24]\n"
- "ld1 { v27.b }[10], [x23]\n"
- "ld1 { v26.b }[10], [x22]\n"
- "ld1 { v19.b }[10], [x21]\n"
- "ld1 { v24.b }[10], [x20]\n"
+ "ld1 { v27.b }[10], [x26]\n"
+ "ld1 { v26.b }[10], [x25]\n"
+ "ld1 { v25.b }[10], [x24]\n"
+ "ld1 { v24.b }[10], [x23]\n"
+ "ld1 { v23.b }[10], [x22]\n"
+ "ld1 { v22.b }[10], [x21]\n"
"b 13f\n"
"8:" // odd_loads_1_8
- "mov x19, #0x2\n"
+ "mov x20, #0x2\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[8], [x28]\n"
"ld1 { v28.b }[8], [x27]\n"
- "ld1 { v29.b }[8], [x26]\n"
- "mov x19, #0x3\n"
- "ld1 { v25.b }[8], [x25]\n"
- "ld1 { v21.b }[8], [x24]\n"
- "ld1 { v27.b }[8], [x23]\n"
- "ld1 { v26.b }[8], [x22]\n"
- "ld1 { v19.b }[8], [x21]\n"
- "ld1 { v24.b }[8], [x20]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v27.b }[8], [x26]\n"
+ "ld1 { v26.b }[8], [x25]\n"
+ "ld1 { v25.b }[8], [x24]\n"
+ "ld1 { v24.b }[8], [x23]\n"
+ "ld1 { v23.b }[8], [x22]\n"
+ "ld1 { v22.b }[8], [x21]\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
+ "ldr s29, [x28], #0x4\n"
"ldr s28, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s25, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
- "ldr s26, [x22], #0x4\n"
- "ldr s19, [x21], #0x4\n"
- "ldr s24, [x20], #0x4\n"
+ "ldr s27, [x26], #0x4\n"
+ "ldr s26, [x25], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "ldr s23, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
"tbz %x[width], #1, 10f\n"
+ "ld1 { v29.h }[2], [x28], #0x2\n"
"ld1 { v28.h }[2], [x27], #0x2\n"
- "mov x19, #0x2\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
- "ld1 { v21.h }[2], [x24], #0x2\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
- "ld1 { v19.h }[2], [x21], #0x2\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "mov x20, #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v26.h }[2], [x25], #0x2\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[6], [x28]\n"
"ld1 { v28.b }[6], [x27]\n"
- "ld1 { v29.b }[6], [x26]\n"
- "ld1 { v25.b }[6], [x25]\n"
- "ld1 { v21.b }[6], [x24]\n"
- "ld1 { v27.b }[6], [x23]\n"
- "ld1 { v26.b }[6], [x22]\n"
- "ld1 { v19.b }[6], [x21]\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v26.b }[6], [x25]\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v22.b }[6], [x21]\n"
"b 13f\n"
"10:" // odd_loads_1_4
- "mov x19, #0x1\n"
+ "mov x20, #0x1\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[4], [x28]\n"
"ld1 { v28.b }[4], [x27]\n"
- "ld1 { v29.b }[4], [x26]\n"
- "mov x19, #0x2\n"
- "ld1 { v25.b }[4], [x25]\n"
- "ld1 { v21.b }[4], [x24]\n"
- "ld1 { v27.b }[4], [x23]\n"
- "ld1 { v26.b }[4], [x22]\n"
- "ld1 { v19.b }[4], [x21]\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "mov x20, #0x2\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v26.b }[4], [x25]\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v22.b }[4], [x21]\n"
"b 13f\n"
"11:" // odd_loads_2_0
"tbz %x[width], #1, 12f\n"
+ "ldr h29, [x28], #0x2\n"
"ldr h28, [x27], #0x2\n"
- "ldr h29, [x26], #0x2\n"
- "mov x19, #0x1\n"
- "ldr h25, [x25], #0x2\n"
- "ldr h21, [x24], #0x2\n"
- "ldr h27, [x23], #0x2\n"
- "ldr h26, [x22], #0x2\n"
- "ldr h19, [x21], #0x2\n"
- "ldr h24, [x20], #0x2\n"
+ "mov x20, #0x1\n"
+ "ldr h27, [x26], #0x2\n"
+ "ldr h26, [x25], #0x2\n"
+ "ldr h25, [x24], #0x2\n"
+ "ldr h24, [x23], #0x2\n"
+ "ldr h23, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[2], [x28]\n"
"ld1 { v28.b }[2], [x27]\n"
- "ld1 { v29.b }[2], [x26]\n"
- "ld1 { v25.b }[2], [x25]\n"
- "ld1 { v21.b }[2], [x24]\n"
- "ld1 { v27.b }[2], [x23]\n"
- "ld1 { v26.b }[2], [x22]\n"
- "ld1 { v19.b }[2], [x21]\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v26.b }[2], [x25]\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v22.b }[2], [x21]\n"
"b 13f\n"
"12:" // odd_loads_1_0
+ "ldr b29, [x28, #0x0]\n"
"ldr b28, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr b29, [x26, #0x0]\n"
- "ldr b25, [x25, #0x0]\n"
- "ldr b21, [x24, #0x0]\n"
- "ldr b27, [x23, #0x0]\n"
- "ldr b26, [x22, #0x0]\n"
- "ldr b19, [x21, #0x0]\n"
- "ldr b24, [x20, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr b27, [x26, #0x0]\n"
+ "ldr b26, [x25, #0x0]\n"
+ "ldr b25, [x24, #0x0]\n"
+ "ldr b24, [x23, #0x0]\n"
+ "ldr b23, [x22, #0x0]\n"
+ "ldr b22, [x21, #0x0]\n"
"13:" // Odd load end
- "zip1 v22.4s, v28.4s, v25.4s\n"
- "subs x19, x19, #0x1\n"
- "zip1 v20.4s, v29.4s, v21.4s\n"
- "zip1 v23.4s, v22.4s, v20.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "sadalp v1.8h, v23.16b\n"
- "zip1 v18.4s, v27.4s, v19.4s\n"
- "zip1 v16.4s, v26.4s, v24.4s\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x10]\n"
- "sadalp v0.8h, v17.16b\n"
+ "zip1 v21.4s, v29.4s, v27.4s\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v19.4s, v25.4s, v23.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v21.4s, v20.4s\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "sadalp v2.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "sadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v22.4s, v22.4s, v20.4s\n"
- "str q22, [%x[out_ptr], #0x0]\n"
- "zip2 v20.4s, v18.4s, v16.4s\n"
- "sadalp v1.8h, v22.16b\n"
- "str q20, [%x[out_ptr], #0x10]\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v17.4s, v21.4s, v20.4s\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
+ "subs x20, x20, #0x1\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "sadalp v2.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "sadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
- "sadalp v0.8h, v20.16b\n"
"beq 14f\n"
- "zip2 v28.4s, v28.4s, v25.4s\n"
- "zip2 v25.4s, v29.4s, v21.4s\n"
- "subs x19, x19, #0x1\n"
- "zip1 v21.4s, v28.4s, v25.4s\n"
- "str q21, [%x[out_ptr], #0x0]\n"
- "sadalp v1.8h, v21.16b\n"
- "zip2 v19.4s, v27.4s, v19.4s\n"
- "zip2 v16.4s, v26.4s, v24.4s\n"
- "zip1 v18.4s, v19.4s, v16.4s\n"
- "str q18, [%x[out_ptr], #0x10]\n"
- "sadalp v0.8h, v18.16b\n"
+ "zip2 v21.4s, v29.4s, v27.4s\n"
+ "zip2 v20.4s, v28.4s, v26.4s\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v19.4s, v25.4s, v23.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v21.4s, v20.4s\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "sadalp v2.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "sadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v17.4s, v28.4s, v25.4s\n"
+ "zip2 v17.4s, v21.4s, v20.4s\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
"str q17, [%x[out_ptr], #0x0]\n"
- "zip2 v16.4s, v19.4s, v16.4s\n"
- "sadalp v1.8h, v17.16b\n"
+ "sadalp v2.8h, v17.16b\n"
"str q16, [%x[out_ptr], #0x10]\n"
+ "sadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
- "sadalp v0.8h, v16.16b\n"
"14:" // Odds skip
+ "sadalp v0.4s, v2.8h\n"
"sadalp v31.4s, v1.8h\n"
- "str q31, [%x[out_ptr], #0x0]\n"
- "sadalp v30.4s, v0.8h\n"
- "str q30, [%x[out_ptr], #0x10]\n"
+ "str q0, [%x[out_ptr], #0x0]\n"
+ "str q31, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
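
The summing variant above keeps running per-row column sums alongside the interleave: every stored 16-byte block is also folded into an int16 accumulator with sadalp (signed add and accumulate long pairwise; the unsigned kernel below uses uadalp), and the cmp x20, #0x1e check widens those halfword sums into the int32 accumulators v0/v31 before they can overflow. Each .8h lane absorbs at most 8 bytes per 16-column iteration, so |lane| <= 31 * 8 * 128 = 31744 < 32768 between folds, and the widened sums are appended after the interleaved data. A hypothetical scalar model of one such lane:

    #include <cstddef>
    #include <cstdint>

    // 'partial' plays the role of one .8h lane, 'total' a .4s accumulator.
    std::int32_t sum_bytes_widened(const std::int8_t *data, std::size_t n) {
        std::int32_t total = 0;
        std::int16_t partial = 0;
        std::size_t since_fold = 0;
        for (std::size_t i = 0; i < n; i++) {
            partial = static_cast<std::int16_t>(partial + data[i]);
            if (++since_fold == 248) { // 31 iterations x 8 bytes per lane
                total += partial;      // the sadalp v0.4s, v2.8h fold
                partial = 0;           // the movi v2.8h, #0x0 reset
                since_fold = 0;
            }
        }
        return total + partial; // final fold before the stores at label 14
    }
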
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
index 1b94c7f1f1..17eb7d5556 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,40 +31,41 @@ void interleave_block<8, 4, VLType::None, true>(
)
{
__asm__ __volatile__(
- "movi v1.8h, #0x0\n"
- "ldr x27, [%x[in], #0x0]\n"
- "mov x19, #0x0\n"
- "movi v0.8h, #0x0\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
+ "mov x20, #0x0\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
+ "movi v2.8h, #0x0\n"
+ "movi v1.8h, #0x0\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
+ "movi v0.4s, #0x0\n"
"movi v31.4s, #0x0\n"
- "ldr x25, [%x[in], #0x10]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
+ "add x28, x28, %x[row_offset]\n"
"add x27, x27, %x[row_offset]\n"
- "movi v30.4s, #0x0\n"
- "ldr x24, [%x[in], #0x18]\n"
- "ldr x23, [%x[in], #0x20]\n"
"add x26, x26, %x[row_offset]\n"
- "ldr x22, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset]\n"
- "ldr x20, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
- "add x20, x20, %x[row_offset]\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
+ "prfm pldl1keep, [x28, #0x0]\n"
"prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
@@ -72,7 +73,7 @@ void interleave_block<8, 4, VLType::None, true>(
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -80,71 +81,70 @@ void interleave_block<8, 4, VLType::None, true>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"cbnz %w[first], 2f\n"
"sub %x[out_ptr], %x[out_ptr], #0x20\n"
- "ld1 { v31.4s }, [%x[out_ptr]]\n"
- "ldr q30, [%x[out_ptr], #0x10]\n"
+ "ld1 { v0.4s }, [%x[out_ptr]]\n"
+ "ldr q31, [%x[out_ptr], #0x10]\n"
"2:" // first_pass
"cmp %x[width], #0x10\n"
"blt 5f\n"
"3:" // Main loop head
- "cmp x19, #0x1e\n"
+ "cmp x20, #0x1e\n"
"ble 4f\n"
+ "uadalp v0.4s, v2.8h\n"
+ "movi v2.8h, #0x0\n"
+ "mov x20, #0x0\n"
"uadalp v31.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
- "uadalp v30.4s, v0.8h\n"
- "movi v0.8h, #0x0\n"
- "mov x19, #0x0\n"
"4:" // no_accumulate_16
- "ldr q28, [x27], #0x10\n"
- "add x19, x19, #0x1\n"
- "ldr q29, [x26], #0x10\n"
+ "ldr q30, [x28], #0x10\n"
+ "ldr q29, [x27], #0x10\n"
"subs %x[width], %x[width], #0x10\n"
- "ldr q25, [x25], #0x10\n"
- "zip1 v22.4s, v28.4s, v25.4s\n"
- "ldr q21, [x24], #0x10\n"
"cmp %x[width], #0x10\n"
- "zip2 v28.4s, v28.4s, v25.4s\n"
- "ldr q27, [x23], #0x10\n"
- "ldr q26, [x22], #0x10\n"
- "zip1 v20.4s, v29.4s, v21.4s\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v25.4s, v29.4s, v21.4s\n"
- "ldr q24, [x20], #0x10\n"
- "zip1 v23.4s, v22.4s, v20.4s\n"
+ "ldr q28, [x26], #0x10\n"
+ "ldr q27, [x25], #0x10\n"
+ "zip1 v22.4s, v30.4s, v28.4s\n"
+ "zip1 v21.4s, v29.4s, v27.4s\n"
+ "ldr q20, [x24], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
+ "zip1 v25.4s, v22.4s, v21.4s\n"
+ "uadalp v2.8h, v25.16b\n"
+ "ldr q19, [x22], #0x10\n"
+ "ldr q18, [x21], #0x10\n"
+ "zip1 v17.4s, v20.4s, v19.4s\n"
+ "zip1 v16.4s, v26.4s, v18.4s\n"
+ "zip1 v24.4s, v17.4s, v16.4s\n"
+ "uadalp v1.8h, v24.16b\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "uadalp v1.8h, v23.16b\n"
- "zip2 v22.4s, v22.4s, v20.4s\n"
+ "zip2 v23.4s, v22.4s, v21.4s\n"
+ "zip2 v22.4s, v17.4s, v16.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "uadalp v1.8h, v22.16b\n"
- "zip1 v18.4s, v27.4s, v19.4s\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip1 v16.4s, v26.4s, v24.4s\n"
+ "zip2 v21.4s, v30.4s, v28.4s\n"
+ "zip2 v17.4s, v29.4s, v27.4s\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "zip1 v21.4s, v28.4s, v25.4s\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "uadalp v1.8h, v21.16b\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
+ "zip2 v20.4s, v20.4s, v19.4s\n"
+ "zip2 v16.4s, v26.4s, v18.4s\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip2 v20.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "uadalp v0.8h, v17.16b\n"
- "zip2 v19.4s, v27.4s, v19.4s\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "uadalp v0.8h, v20.16b\n"
- "zip2 v16.4s, v26.4s, v24.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v19.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x10]\n"
- "uadalp v0.8h, v18.16b\n"
- "zip2 v17.4s, v28.4s, v25.4s\n"
- "str q22, [%x[out_ptr], #0x20]\n"
- "zip2 v16.4s, v19.4s, v16.4s\n"
- "str q20, [%x[out_ptr], #0x30]\n"
- "uadalp v1.8h, v17.16b\n"
- "str q21, [%x[out_ptr], #0x40]\n"
- "uadalp v0.8h, v16.16b\n"
+ "uadalp v2.8h, v23.16b\n"
+ "uadalp v1.8h, v22.16b\n"
+ "str q25, [%x[out_ptr], #0x0]\n"
+ "add x20, x20, #0x1\n"
+ "zip1 v19.4s, v21.4s, v17.4s\n"
+ "zip1 v18.4s, v20.4s, v16.4s\n"
+ "str q24, [%x[out_ptr], #0x10]\n"
+ "uadalp v2.8h, v19.16b\n"
+ "uadalp v1.8h, v18.16b\n"
+ "str q23, [%x[out_ptr], #0x20]\n"
+ "zip2 v17.4s, v21.4s, v17.4s\n"
+ "zip2 v16.4s, v20.4s, v16.4s\n"
+ "str q22, [%x[out_ptr], #0x30]\n"
+ "str q19, [%x[out_ptr], #0x40]\n"
+ "uadalp v2.8h, v17.16b\n"
+ "uadalp v1.8h, v16.16b\n"
"str q18, [%x[out_ptr], #0x50]\n"
"str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
@@ -153,216 +153,216 @@ void interleave_block<8, 4, VLType::None, true>(
"5:" // Main loop skip
"cbz %x[width], 14f\n"
"tbz %x[width], #3, 9f\n"
+ "ldr d29, [x28], #0x8\n"
"ldr d28, [x27], #0x8\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d25, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d26, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
- "ldr d24, [x20], #0x8\n"
+ "ldr d27, [x26], #0x8\n"
+ "ldr d26, [x25], #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
"tbz %x[width], #2, 7f\n"
+ "ld1 { v29.s }[2], [x28], #0x4\n"
"ld1 { v28.s }[2], [x27], #0x4\n"
- "ld1 { v29.s }[2], [x26], #0x4\n"
- "ld1 { v25.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
- "ld1 { v26.s }[2], [x22], #0x4\n"
- "ld1 { v19.s }[2], [x21], #0x4\n"
- "ld1 { v24.s }[2], [x20], #0x4\n"
+ "ld1 { v27.s }[2], [x26], #0x4\n"
+ "ld1 { v26.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x24], #0x4\n"
+ "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ld1 { v23.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 6f\n"
+ "ld1 { v29.h }[6], [x28], #0x2\n"
"ld1 { v28.h }[6], [x27], #0x2\n"
- "mov x19, #0x4\n"
- "ld1 { v29.h }[6], [x26], #0x2\n"
- "ld1 { v25.h }[6], [x25], #0x2\n"
- "ld1 { v21.h }[6], [x24], #0x2\n"
- "ld1 { v27.h }[6], [x23], #0x2\n"
- "ld1 { v26.h }[6], [x22], #0x2\n"
- "ld1 { v19.h }[6], [x21], #0x2\n"
- "ld1 { v24.h }[6], [x20], #0x2\n"
+ "mov x20, #0x4\n"
+ "ld1 { v27.h }[6], [x26], #0x2\n"
+ "ld1 { v26.h }[6], [x25], #0x2\n"
+ "ld1 { v25.h }[6], [x24], #0x2\n"
+ "ld1 { v24.h }[6], [x23], #0x2\n"
+ "ld1 { v23.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[14], [x28]\n"
"ld1 { v28.b }[14], [x27]\n"
- "ld1 { v29.b }[14], [x26]\n"
- "ld1 { v25.b }[14], [x25]\n"
- "ld1 { v21.b }[14], [x24]\n"
- "ld1 { v27.b }[14], [x23]\n"
- "ld1 { v26.b }[14], [x22]\n"
- "ld1 { v19.b }[14], [x21]\n"
- "ld1 { v24.b }[14], [x20]\n"
+ "ld1 { v27.b }[14], [x26]\n"
+ "ld1 { v26.b }[14], [x25]\n"
+ "ld1 { v25.b }[14], [x24]\n"
+ "ld1 { v24.b }[14], [x23]\n"
+ "ld1 { v23.b }[14], [x22]\n"
+ "ld1 { v22.b }[14], [x21]\n"
"b 13f\n"
"6:" // odd_loads_1_12
- "mov x19, #0x3\n"
+ "mov x20, #0x3\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[12], [x28]\n"
"ld1 { v28.b }[12], [x27]\n"
- "ld1 { v29.b }[12], [x26]\n"
- "mov x19, #0x4\n"
- "ld1 { v25.b }[12], [x25]\n"
- "ld1 { v21.b }[12], [x24]\n"
- "ld1 { v27.b }[12], [x23]\n"
- "ld1 { v26.b }[12], [x22]\n"
- "ld1 { v19.b }[12], [x21]\n"
- "ld1 { v24.b }[12], [x20]\n"
+ "mov x20, #0x4\n"
+ "ld1 { v27.b }[12], [x26]\n"
+ "ld1 { v26.b }[12], [x25]\n"
+ "ld1 { v25.b }[12], [x24]\n"
+ "ld1 { v24.b }[12], [x23]\n"
+ "ld1 { v23.b }[12], [x22]\n"
+ "ld1 { v22.b }[12], [x21]\n"
"b 13f\n"
"7:" // odd_loads_2_8
"tbz %x[width], #1, 8f\n"
+ "ld1 { v29.h }[4], [x28], #0x2\n"
"ld1 { v28.h }[4], [x27], #0x2\n"
- "ld1 { v29.h }[4], [x26], #0x2\n"
- "mov x19, #0x3\n"
- "ld1 { v25.h }[4], [x25], #0x2\n"
- "ld1 { v21.h }[4], [x24], #0x2\n"
- "ld1 { v27.h }[4], [x23], #0x2\n"
- "ld1 { v26.h }[4], [x22], #0x2\n"
- "ld1 { v19.h }[4], [x21], #0x2\n"
- "ld1 { v24.h }[4], [x20], #0x2\n"
+ "mov x20, #0x3\n"
+ "ld1 { v27.h }[4], [x26], #0x2\n"
+ "ld1 { v26.h }[4], [x25], #0x2\n"
+ "ld1 { v25.h }[4], [x24], #0x2\n"
+ "ld1 { v24.h }[4], [x23], #0x2\n"
+ "ld1 { v23.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[10], [x28]\n"
"ld1 { v28.b }[10], [x27]\n"
- "ld1 { v29.b }[10], [x26]\n"
- "ld1 { v25.b }[10], [x25]\n"
- "ld1 { v21.b }[10], [x24]\n"
- "ld1 { v27.b }[10], [x23]\n"
- "ld1 { v26.b }[10], [x22]\n"
- "ld1 { v19.b }[10], [x21]\n"
- "ld1 { v24.b }[10], [x20]\n"
+ "ld1 { v27.b }[10], [x26]\n"
+ "ld1 { v26.b }[10], [x25]\n"
+ "ld1 { v25.b }[10], [x24]\n"
+ "ld1 { v24.b }[10], [x23]\n"
+ "ld1 { v23.b }[10], [x22]\n"
+ "ld1 { v22.b }[10], [x21]\n"
"b 13f\n"
"8:" // odd_loads_1_8
- "mov x19, #0x2\n"
+ "mov x20, #0x2\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[8], [x28]\n"
"ld1 { v28.b }[8], [x27]\n"
- "ld1 { v29.b }[8], [x26]\n"
- "mov x19, #0x3\n"
- "ld1 { v25.b }[8], [x25]\n"
- "ld1 { v21.b }[8], [x24]\n"
- "ld1 { v27.b }[8], [x23]\n"
- "ld1 { v26.b }[8], [x22]\n"
- "ld1 { v19.b }[8], [x21]\n"
- "ld1 { v24.b }[8], [x20]\n"
+ "mov x20, #0x3\n"
+ "ld1 { v27.b }[8], [x26]\n"
+ "ld1 { v26.b }[8], [x25]\n"
+ "ld1 { v25.b }[8], [x24]\n"
+ "ld1 { v24.b }[8], [x23]\n"
+ "ld1 { v23.b }[8], [x22]\n"
+ "ld1 { v22.b }[8], [x21]\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
+ "ldr s29, [x28], #0x4\n"
"ldr s28, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s25, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
- "ldr s27, [x23], #0x4\n"
- "ldr s26, [x22], #0x4\n"
- "ldr s19, [x21], #0x4\n"
- "ldr s24, [x20], #0x4\n"
+ "ldr s27, [x26], #0x4\n"
+ "ldr s26, [x25], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "ldr s23, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
"tbz %x[width], #1, 10f\n"
+ "ld1 { v29.h }[2], [x28], #0x2\n"
"ld1 { v28.h }[2], [x27], #0x2\n"
- "mov x19, #0x2\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
- "ld1 { v21.h }[2], [x24], #0x2\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "ld1 { v26.h }[2], [x22], #0x2\n"
- "ld1 { v19.h }[2], [x21], #0x2\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "mov x20, #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v26.h }[2], [x25], #0x2\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[6], [x28]\n"
"ld1 { v28.b }[6], [x27]\n"
- "ld1 { v29.b }[6], [x26]\n"
- "ld1 { v25.b }[6], [x25]\n"
- "ld1 { v21.b }[6], [x24]\n"
- "ld1 { v27.b }[6], [x23]\n"
- "ld1 { v26.b }[6], [x22]\n"
- "ld1 { v19.b }[6], [x21]\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v26.b }[6], [x25]\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v22.b }[6], [x21]\n"
"b 13f\n"
"10:" // odd_loads_1_4
- "mov x19, #0x1\n"
+ "mov x20, #0x1\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[4], [x28]\n"
"ld1 { v28.b }[4], [x27]\n"
- "ld1 { v29.b }[4], [x26]\n"
- "mov x19, #0x2\n"
- "ld1 { v25.b }[4], [x25]\n"
- "ld1 { v21.b }[4], [x24]\n"
- "ld1 { v27.b }[4], [x23]\n"
- "ld1 { v26.b }[4], [x22]\n"
- "ld1 { v19.b }[4], [x21]\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "mov x20, #0x2\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v26.b }[4], [x25]\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v22.b }[4], [x21]\n"
"b 13f\n"
"11:" // odd_loads_2_0
"tbz %x[width], #1, 12f\n"
+ "ldr h29, [x28], #0x2\n"
"ldr h28, [x27], #0x2\n"
- "ldr h29, [x26], #0x2\n"
- "mov x19, #0x1\n"
- "ldr h25, [x25], #0x2\n"
- "ldr h21, [x24], #0x2\n"
- "ldr h27, [x23], #0x2\n"
- "ldr h26, [x22], #0x2\n"
- "ldr h19, [x21], #0x2\n"
- "ldr h24, [x20], #0x2\n"
+ "mov x20, #0x1\n"
+ "ldr h27, [x26], #0x2\n"
+ "ldr h26, [x25], #0x2\n"
+ "ldr h25, [x24], #0x2\n"
+ "ldr h24, [x23], #0x2\n"
+ "ldr h23, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[2], [x28]\n"
"ld1 { v28.b }[2], [x27]\n"
- "ld1 { v29.b }[2], [x26]\n"
- "ld1 { v25.b }[2], [x25]\n"
- "ld1 { v21.b }[2], [x24]\n"
- "ld1 { v27.b }[2], [x23]\n"
- "ld1 { v26.b }[2], [x22]\n"
- "ld1 { v19.b }[2], [x21]\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v26.b }[2], [x25]\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v22.b }[2], [x21]\n"
"b 13f\n"
"12:" // odd_loads_1_0
+ "ldr b29, [x28, #0x0]\n"
"ldr b28, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr b29, [x26, #0x0]\n"
- "ldr b25, [x25, #0x0]\n"
- "ldr b21, [x24, #0x0]\n"
- "ldr b27, [x23, #0x0]\n"
- "ldr b26, [x22, #0x0]\n"
- "ldr b19, [x21, #0x0]\n"
- "ldr b24, [x20, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr b27, [x26, #0x0]\n"
+ "ldr b26, [x25, #0x0]\n"
+ "ldr b25, [x24, #0x0]\n"
+ "ldr b24, [x23, #0x0]\n"
+ "ldr b23, [x22, #0x0]\n"
+ "ldr b22, [x21, #0x0]\n"
"13:" // Odd load end
- "zip1 v22.4s, v28.4s, v25.4s\n"
- "subs x19, x19, #0x1\n"
- "zip1 v20.4s, v29.4s, v21.4s\n"
- "zip1 v23.4s, v22.4s, v20.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "uadalp v1.8h, v23.16b\n"
- "zip1 v18.4s, v27.4s, v19.4s\n"
- "zip1 v16.4s, v26.4s, v24.4s\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
- "str q17, [%x[out_ptr], #0x10]\n"
- "uadalp v0.8h, v17.16b\n"
+ "zip1 v21.4s, v29.4s, v27.4s\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v19.4s, v25.4s, v23.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v21.4s, v20.4s\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "uadalp v2.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "uadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v22.4s, v22.4s, v20.4s\n"
- "str q22, [%x[out_ptr], #0x0]\n"
- "zip2 v20.4s, v18.4s, v16.4s\n"
- "uadalp v1.8h, v22.16b\n"
- "str q20, [%x[out_ptr], #0x10]\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v17.4s, v21.4s, v20.4s\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
+ "subs x20, x20, #0x1\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "uadalp v2.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "uadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
- "uadalp v0.8h, v20.16b\n"
"beq 14f\n"
- "zip2 v28.4s, v28.4s, v25.4s\n"
- "zip2 v25.4s, v29.4s, v21.4s\n"
- "subs x19, x19, #0x1\n"
- "zip1 v21.4s, v28.4s, v25.4s\n"
- "str q21, [%x[out_ptr], #0x0]\n"
- "uadalp v1.8h, v21.16b\n"
- "zip2 v19.4s, v27.4s, v19.4s\n"
- "zip2 v16.4s, v26.4s, v24.4s\n"
- "zip1 v18.4s, v19.4s, v16.4s\n"
- "str q18, [%x[out_ptr], #0x10]\n"
- "uadalp v0.8h, v18.16b\n"
+ "zip2 v21.4s, v29.4s, v27.4s\n"
+ "zip2 v20.4s, v28.4s, v26.4s\n"
+ "subs x20, x20, #0x1\n"
+ "zip2 v19.4s, v25.4s, v23.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v21.4s, v20.4s\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "uadalp v2.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "uadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v17.4s, v28.4s, v25.4s\n"
+ "zip2 v17.4s, v21.4s, v20.4s\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
"str q17, [%x[out_ptr], #0x0]\n"
- "zip2 v16.4s, v19.4s, v16.4s\n"
- "uadalp v1.8h, v17.16b\n"
+ "uadalp v2.8h, v17.16b\n"
"str q16, [%x[out_ptr], #0x10]\n"
+ "uadalp v1.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
- "uadalp v0.8h, v16.16b\n"
"14:" // Odds skip
+ "uadalp v0.4s, v2.8h\n"
"uadalp v31.4s, v1.8h\n"
- "str q31, [%x[out_ptr], #0x0]\n"
- "uadalp v30.4s, v0.8h\n"
- "str q30, [%x[out_ptr], #0x10]\n"
+ "str q0, [%x[out_ptr], #0x0]\n"
+ "str q31, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
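
For reference, the "odd_loads" labels in the kernel above cover a sub-16-byte
tail by testing the bits of the remaining width (the tbz instructions on
%x[width]), so at most one 8-, 4-, 2- and 1-byte load per row is needed. A
minimal scalar sketch of that dispatch, assuming a hypothetical helper
(load_tail is illustrative only, not part of the library):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar model: cover a tail of fewer than 16 bytes with at most one
// 8/4/2/1-byte copy, selected by the bits of the remaining width, as the
// kernel's tbz tests on bits #3..#0 do.
static void load_tail(uint8_t dst[16], const uint8_t *src, size_t width)
{
    size_t pos = 0;
    if (width & 8) { std::memcpy(dst + pos, src + pos, 8); pos += 8; } // tbz #3
    if (width & 4) { std::memcpy(dst + pos, src + pos, 4); pos += 4; } // tbz #2
    if (width & 2) { std::memcpy(dst + pos, src + pos, 2); pos += 2; } // tbz #1
    if (width & 1) { dst[pos] = src[pos]; }                            // tbz #0
}
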
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
index 1330593cbf..7b445ef3d4 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,44 +31,45 @@ void interleave_block<8, 8, VLType::None, false>(
)
{
__asm__ __volatile__(
- "ldr x27, [%x[in], #0x0]\n"
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
"cmp %x[height], #0x8\n"
- "ldr x26, [%x[in], #0x8]\n"
+ "add x28, x28, %x[row_offset]\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
"add x27, x27, %x[row_offset]\n"
- "ldr x25, [%x[in], #0x10]\n"
- "ldr x24, [%x[in], #0x18]\n"
"add x26, x26, %x[row_offset]\n"
- "ldr x23, [%x[in], #0x20]\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"add x25, x25, %x[row_offset]\n"
- "ldr x22, [%x[in], #0x28]\n"
- "ldr x21, [%x[in], #0x30]\n"
"add x24, x24, %x[row_offset]\n"
- "ldr x20, [%x[in], #0x38]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
- "add x20, x20, %x[row_offset]\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "prfm pldl1keep, [x27, #0x0]\n"
"cmp %x[width], #0x10\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x0]\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
"prfm pldl1keep, [x24, #0x0]\n"
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -76,230 +77,228 @@ void interleave_block<8, 8, VLType::None, false>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr q27, [x27], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "ldr q19, [x27], #0x10\n"
"subs %x[width], %x[width], #0x10\n"
- "ldr q24, [x26], #0x10\n"
- "zip1 v26.2d, v27.2d, v24.2d\n"
- "ldr q25, [x25], #0x10\n"
"cmp %x[width], #0x10\n"
- "zip2 v24.2d, v27.2d, v24.2d\n"
- "ldr q21, [x24], #0x10\n"
- "ldr q23, [x23], #0x10\n"
- "zip1 v22.2d, v25.2d, v21.2d\n"
- "ldr q18, [x22], #0x10\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
- "ldr q20, [x21], #0x10\n"
- "ldr q16, [x20], #0x10\n"
- "zip1 v19.2d, v23.2d, v18.2d\n"
+ "ldr q25, [x26], #0x10\n"
+ "ldr q24, [x25], #0x10\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
+ "zip1 v18.2d, v25.2d, v24.2d\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "zip1 v17.2d, v23.2d, v22.2d\n"
+ "zip2 v21.2d, v20.2d, v19.2d\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q19, [x21], #0x10\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.2d, v20.2d, v19.2d\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "zip2 v18.2d, v23.2d, v18.2d\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "zip2 v18.2d, v25.2d, v24.2d\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "zip1 v17.2d, v20.2d, v16.2d\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "zip2 v16.2d, v20.2d, v16.2d\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip2 v17.2d, v23.2d, v22.2d\n"
"prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip2 v16.2d, v20.2d, v19.2d\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "str q26, [%x[out_ptr], #0x0]\n"
- "str q22, [%x[out_ptr], #0x10]\n"
- "str q19, [%x[out_ptr], #0x20]\n"
- "str q17, [%x[out_ptr], #0x30]\n"
- "str q24, [%x[out_ptr], #0x40]\n"
- "str q21, [%x[out_ptr], #0x50]\n"
- "str q18, [%x[out_ptr], #0x60]\n"
+ "str q21, [%x[out_ptr], #0x40]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 12f\n"
"tbz %x[width], #3, 7f\n"
- "ldr d27, [x27], #0x8\n"
- "ldr d24, [x26], #0x8\n"
- "ldr d25, [x25], #0x8\n"
+ "ldr d25, [x28], #0x8\n"
+ "ldr d24, [x27], #0x8\n"
+ "ldr d23, [x26], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
"ldr d21, [x24], #0x8\n"
- "ldr d23, [x23], #0x8\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "ldr d16, [x20], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d18, [x21], #0x8\n"
"tbz %x[width], #2, 5f\n"
- "ld1 { v27.s }[2], [x27], #0x4\n"
- "ld1 { v24.s }[2], [x26], #0x4\n"
- "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x28], #0x4\n"
+ "ld1 { v24.s }[2], [x27], #0x4\n"
+ "ld1 { v23.s }[2], [x26], #0x4\n"
+ "ld1 { v22.s }[2], [x25], #0x4\n"
"ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v23.s }[2], [x23], #0x4\n"
- "ld1 { v18.s }[2], [x22], #0x4\n"
- "ld1 { v20.s }[2], [x21], #0x4\n"
- "ld1 { v16.s }[2], [x20], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v19.s }[2], [x22], #0x4\n"
+ "ld1 { v18.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v27.h }[6], [x27], #0x2\n"
- "mov x19, #0x2\n"
- "ld1 { v24.h }[6], [x26], #0x2\n"
- "ld1 { v25.h }[6], [x25], #0x2\n"
+ "ld1 { v25.h }[6], [x28], #0x2\n"
+ "ld1 { v24.h }[6], [x27], #0x2\n"
+ "mov x20, #0x2\n"
+ "ld1 { v23.h }[6], [x26], #0x2\n"
+ "ld1 { v22.h }[6], [x25], #0x2\n"
"ld1 { v21.h }[6], [x24], #0x2\n"
- "ld1 { v23.h }[6], [x23], #0x2\n"
- "ld1 { v18.h }[6], [x22], #0x2\n"
- "ld1 { v20.h }[6], [x21], #0x2\n"
- "ld1 { v16.h }[6], [x20], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "ld1 { v19.h }[6], [x22], #0x2\n"
+ "ld1 { v18.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[14], [x27]\n"
- "ld1 { v24.b }[14], [x26]\n"
- "ld1 { v25.b }[14], [x25]\n"
+ "ld1 { v25.b }[14], [x28]\n"
+ "ld1 { v24.b }[14], [x27]\n"
+ "ld1 { v23.b }[14], [x26]\n"
+ "ld1 { v22.b }[14], [x25]\n"
"ld1 { v21.b }[14], [x24]\n"
- "ld1 { v23.b }[14], [x23]\n"
- "ld1 { v18.b }[14], [x22]\n"
- "ld1 { v20.b }[14], [x21]\n"
- "ld1 { v16.b }[14], [x20]\n"
+ "ld1 { v20.b }[14], [x23]\n"
+ "ld1 { v19.b }[14], [x22]\n"
+ "ld1 { v18.b }[14], [x21]\n"
"b 11f\n"
"4:" // odd_loads_1_12
- "mov x19, #0x2\n"
+ "mov x20, #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[12], [x27]\n"
- "ld1 { v24.b }[12], [x26]\n"
- "ld1 { v25.b }[12], [x25]\n"
+ "ld1 { v25.b }[12], [x28]\n"
+ "ld1 { v24.b }[12], [x27]\n"
+ "ld1 { v23.b }[12], [x26]\n"
+ "ld1 { v22.b }[12], [x25]\n"
"ld1 { v21.b }[12], [x24]\n"
- "ld1 { v23.b }[12], [x23]\n"
- "ld1 { v18.b }[12], [x22]\n"
- "ld1 { v20.b }[12], [x21]\n"
- "ld1 { v16.b }[12], [x20]\n"
+ "ld1 { v20.b }[12], [x23]\n"
+ "ld1 { v19.b }[12], [x22]\n"
+ "ld1 { v18.b }[12], [x21]\n"
"b 11f\n"
"5:" // odd_loads_2_8
"tbz %x[width], #1, 6f\n"
- "ld1 { v27.h }[4], [x27], #0x2\n"
- "ld1 { v24.h }[4], [x26], #0x2\n"
- "mov x19, #0x2\n"
- "ld1 { v25.h }[4], [x25], #0x2\n"
+ "ld1 { v25.h }[4], [x28], #0x2\n"
+ "ld1 { v24.h }[4], [x27], #0x2\n"
+ "mov x20, #0x2\n"
+ "ld1 { v23.h }[4], [x26], #0x2\n"
+ "ld1 { v22.h }[4], [x25], #0x2\n"
"ld1 { v21.h }[4], [x24], #0x2\n"
- "ld1 { v23.h }[4], [x23], #0x2\n"
- "ld1 { v18.h }[4], [x22], #0x2\n"
- "ld1 { v20.h }[4], [x21], #0x2\n"
- "ld1 { v16.h }[4], [x20], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "ld1 { v19.h }[4], [x22], #0x2\n"
+ "ld1 { v18.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[10], [x27]\n"
- "ld1 { v24.b }[10], [x26]\n"
- "ld1 { v25.b }[10], [x25]\n"
+ "ld1 { v25.b }[10], [x28]\n"
+ "ld1 { v24.b }[10], [x27]\n"
+ "ld1 { v23.b }[10], [x26]\n"
+ "ld1 { v22.b }[10], [x25]\n"
"ld1 { v21.b }[10], [x24]\n"
- "ld1 { v23.b }[10], [x23]\n"
- "ld1 { v18.b }[10], [x22]\n"
- "ld1 { v20.b }[10], [x21]\n"
- "ld1 { v16.b }[10], [x20]\n"
+ "ld1 { v20.b }[10], [x23]\n"
+ "ld1 { v19.b }[10], [x22]\n"
+ "ld1 { v18.b }[10], [x21]\n"
"b 11f\n"
"6:" // odd_loads_1_8
- "mov x19, #0x1\n"
+ "mov x20, #0x1\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[8], [x27]\n"
- "ld1 { v24.b }[8], [x26]\n"
- "mov x19, #0x2\n"
- "ld1 { v25.b }[8], [x25]\n"
+ "ld1 { v25.b }[8], [x28]\n"
+ "ld1 { v24.b }[8], [x27]\n"
+ "mov x20, #0x2\n"
+ "ld1 { v23.b }[8], [x26]\n"
+ "ld1 { v22.b }[8], [x25]\n"
"ld1 { v21.b }[8], [x24]\n"
- "ld1 { v23.b }[8], [x23]\n"
- "ld1 { v18.b }[8], [x22]\n"
- "ld1 { v20.b }[8], [x21]\n"
- "ld1 { v16.b }[8], [x20]\n"
+ "ld1 { v20.b }[8], [x23]\n"
+ "ld1 { v19.b }[8], [x22]\n"
+ "ld1 { v18.b }[8], [x21]\n"
"b 11f\n"
"7:" // odd_loads_4_0
"tbz %x[width], #2, 9f\n"
- "ldr s27, [x27], #0x4\n"
- "ldr s24, [x26], #0x4\n"
- "ldr s25, [x25], #0x4\n"
+ "ldr s25, [x28], #0x4\n"
+ "ldr s24, [x27], #0x4\n"
+ "ldr s23, [x26], #0x4\n"
+ "ldr s22, [x25], #0x4\n"
"ldr s21, [x24], #0x4\n"
- "ldr s23, [x23], #0x4\n"
- "ldr s18, [x22], #0x4\n"
- "ldr s20, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s19, [x22], #0x4\n"
+ "ldr s18, [x21], #0x4\n"
"tbz %x[width], #1, 8f\n"
- "ld1 { v27.h }[2], [x27], #0x2\n"
- "mov x19, #0x1\n"
- "ld1 { v24.h }[2], [x26], #0x2\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v25.h }[2], [x28], #0x2\n"
+ "ld1 { v24.h }[2], [x27], #0x2\n"
+ "mov x20, #0x1\n"
+ "ld1 { v23.h }[2], [x26], #0x2\n"
+ "ld1 { v22.h }[2], [x25], #0x2\n"
"ld1 { v21.h }[2], [x24], #0x2\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
- "ld1 { v18.h }[2], [x22], #0x2\n"
- "ld1 { v20.h }[2], [x21], #0x2\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "ld1 { v19.h }[2], [x22], #0x2\n"
+ "ld1 { v18.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[6], [x27]\n"
- "ld1 { v24.b }[6], [x26]\n"
- "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v25.b }[6], [x28]\n"
+ "ld1 { v24.b }[6], [x27]\n"
+ "ld1 { v23.b }[6], [x26]\n"
+ "ld1 { v22.b }[6], [x25]\n"
"ld1 { v21.b }[6], [x24]\n"
- "ld1 { v23.b }[6], [x23]\n"
- "ld1 { v18.b }[6], [x22]\n"
- "ld1 { v20.b }[6], [x21]\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ld1 { v20.b }[6], [x23]\n"
+ "ld1 { v19.b }[6], [x22]\n"
+ "ld1 { v18.b }[6], [x21]\n"
"b 11f\n"
"8:" // odd_loads_1_4
- "mov x19, #0x1\n"
+ "mov x20, #0x1\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[4], [x27]\n"
- "ld1 { v24.b }[4], [x26]\n"
- "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v25.b }[4], [x28]\n"
+ "ld1 { v24.b }[4], [x27]\n"
+ "ld1 { v23.b }[4], [x26]\n"
+ "ld1 { v22.b }[4], [x25]\n"
"ld1 { v21.b }[4], [x24]\n"
- "ld1 { v23.b }[4], [x23]\n"
- "ld1 { v18.b }[4], [x22]\n"
- "ld1 { v20.b }[4], [x21]\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "ld1 { v20.b }[4], [x23]\n"
+ "ld1 { v19.b }[4], [x22]\n"
+ "ld1 { v18.b }[4], [x21]\n"
"b 11f\n"
"9:" // odd_loads_2_0
"tbz %x[width], #1, 10f\n"
- "ldr h27, [x27], #0x2\n"
- "ldr h24, [x26], #0x2\n"
- "mov x19, #0x1\n"
- "ldr h25, [x25], #0x2\n"
+ "ldr h25, [x28], #0x2\n"
+ "ldr h24, [x27], #0x2\n"
+ "mov x20, #0x1\n"
+ "ldr h23, [x26], #0x2\n"
+ "ldr h22, [x25], #0x2\n"
"ldr h21, [x24], #0x2\n"
- "ldr h23, [x23], #0x2\n"
- "ldr h18, [x22], #0x2\n"
- "ldr h20, [x21], #0x2\n"
- "ldr h16, [x20], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h19, [x22], #0x2\n"
+ "ldr h18, [x21], #0x2\n"
"tbz %x[width], #0, 11f\n"
- "ld1 { v27.b }[2], [x27]\n"
- "ld1 { v24.b }[2], [x26]\n"
- "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v25.b }[2], [x28]\n"
+ "ld1 { v24.b }[2], [x27]\n"
+ "ld1 { v23.b }[2], [x26]\n"
+ "ld1 { v22.b }[2], [x25]\n"
"ld1 { v21.b }[2], [x24]\n"
- "ld1 { v23.b }[2], [x23]\n"
- "ld1 { v18.b }[2], [x22]\n"
- "ld1 { v20.b }[2], [x21]\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "ld1 { v20.b }[2], [x23]\n"
+ "ld1 { v19.b }[2], [x22]\n"
+ "ld1 { v18.b }[2], [x21]\n"
"b 11f\n"
"10:" // odd_loads_1_0
- "ldr b27, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr b24, [x26, #0x0]\n"
- "ldr b25, [x25, #0x0]\n"
+ "ldr b25, [x28, #0x0]\n"
+ "ldr b24, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr b23, [x26, #0x0]\n"
+ "ldr b22, [x25, #0x0]\n"
"ldr b21, [x24, #0x0]\n"
- "ldr b23, [x23, #0x0]\n"
- "ldr b18, [x22, #0x0]\n"
- "ldr b20, [x21, #0x0]\n"
- "ldr b16, [x20, #0x0]\n"
+ "ldr b20, [x23, #0x0]\n"
+ "ldr b19, [x22, #0x0]\n"
+ "ldr b18, [x21, #0x0]\n"
"11:" // Odd load end
- "zip1 v26.2d, v27.2d, v24.2d\n"
- "str q26, [%x[out_ptr], #0x0]\n"
- "zip1 v22.2d, v25.2d, v21.2d\n"
- "subs x19, x19, #0x1\n"
- "zip1 v19.2d, v23.2d, v18.2d\n"
- "str q22, [%x[out_ptr], #0x10]\n"
- "zip1 v17.2d, v20.2d, v16.2d\n"
- "str q19, [%x[out_ptr], #0x20]\n"
- "str q17, [%x[out_ptr], #0x30]\n"
+ "subs x20, x20, #0x1\n"
+ "zip1 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v21.2d, v20.2d\n"
+ "zip1 v16.2d, v19.2d, v18.2d\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 12f\n"
- "zip2 v24.2d, v27.2d, v24.2d\n"
- "str q24, [%x[out_ptr], #0x0]\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
- "zip2 v18.2d, v23.2d, v18.2d\n"
- "str q21, [%x[out_ptr], #0x10]\n"
- "zip2 v16.2d, v20.2d, v16.2d\n"
- "str q18, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.2d, v25.2d, v24.2d\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.2d, v23.2d, v22.2d\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v17.2d, v21.2d, v20.2d\n"
+ "zip2 v16.2d, v19.2d, v18.2d\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"12:" // Odds skip
-
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
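
For reference, the main-loop store pattern above (zip1/zip2 on .2d lanes
followed by eight q-register stores) emits each 8-byte block of every row in
row order. A minimal scalar sketch of that layout, assuming a hypothetical
reference helper (interleave8_block8_ref is illustrative only); missing rows
alias row 0 here as the csel sequence arranges, and the width tail is
zero-filled for clarity even though the kernel leaves unloaded lanes
unspecified:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar model: for each step of 8 columns, emit 8 bytes from rows 0..7 in
// row order, matching the zip1 stores at 0x0..0x30 and zip2 at 0x40..0x70.
static void interleave8_block8_ref(int8_t *out, const int8_t *const *in,
                                   size_t height, size_t width,
                                   size_t row_offset)
{
    for (size_t col = 0; col < width; col += 8)
    {
        const size_t valid = std::min<size_t>(8, width - col);
        for (size_t row = 0; row < 8; row++)
        {
            // Rows beyond 'height' alias row 0, mirroring the csel setup.
            const int8_t *src = in[row < height ? row : 0] + row_offset + col;
            int8_t block[8] = {0}; // the kernel leaves tail lanes unspecified
            std::memcpy(block, src, valid);
            std::memcpy(out, block, 8);
            out += 8;
        }
    }
}
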
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
index 3550830fc3..a2288e8299 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,46 +31,47 @@ void interleave_block<8, 8, VLType::None, true>(
)
{
__asm__ __volatile__(
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
+ "cmp %x[height], #0x8\n"
+ "mov x20, #0x0\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
"movi v5.8h, #0x0\n"
- "ldr x27, [%x[in], #0x0]\n"
- "mov x19, #0x0\n"
"movi v4.8h, #0x0\n"
- "ldr x26, [%x[in], #0x8]\n"
- "cmp %x[height], #0x8\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"movi v3.8h, #0x0\n"
- "ldr x25, [%x[in], #0x10]\n"
- "add x27, x27, %x[row_offset]\n"
"movi v2.8h, #0x0\n"
- "ldr x24, [%x[in], #0x18]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"movi v1.4s, #0x0\n"
- "ldr x23, [%x[in], #0x20]\n"
- "add x26, x26, %x[row_offset]\n"
"movi v0.4s, #0x0\n"
- "ldr x22, [%x[in], #0x28]\n"
- "add x25, x25, %x[row_offset]\n"
"movi v31.4s, #0x0\n"
- "ldr x21, [%x[in], #0x30]\n"
- "add x24, x24, %x[row_offset]\n"
"movi v30.4s, #0x0\n"
- "ldr x20, [%x[in], #0x38]\n"
+ "add x28, x28, %x[row_offset]\n"
+ "add x27, x27, %x[row_offset]\n"
+ "add x26, x26, %x[row_offset]\n"
+ "add x25, x25, %x[row_offset]\n"
+ "add x24, x24, %x[row_offset]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
- "add x20, x20, %x[row_offset]\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "movi v29.4s, #0x0\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
"prfm pldl1keep, [x27, #0x0]\n"
+ "movi v29.4s, #0x0\n"
"movi v28.4s, #0x0\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
@@ -78,7 +79,7 @@ void interleave_block<8, 8, VLType::None, true>(
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -86,7 +87,6 @@ void interleave_block<8, 8, VLType::None, true>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"cbnz %w[first], 2f\n"
"sub %x[out_ptr], %x[out_ptr], #0x20\n"
"ld1 { v29.4s }, [%x[out_ptr]]\n"
@@ -95,266 +95,266 @@ void interleave_block<8, 8, VLType::None, true>(
"cmp %x[width], #0x10\n"
"blt 5f\n"
"3:" // Main loop head
- "cmp x19, #0x3e\n"
+ "cmp x20, #0x3e\n"
"ble 4f\n"
"sadalp v1.4s, v5.8h\n"
"movi v5.8h, #0x0\n"
+ "mov x20, #0x0\n"
"sadalp v0.4s, v4.8h\n"
"movi v4.8h, #0x0\n"
"sadalp v31.4s, v3.8h\n"
"movi v3.8h, #0x0\n"
"sadalp v30.4s, v2.8h\n"
"movi v2.8h, #0x0\n"
- "mov x19, #0x0\n"
"4:" // no_accumulate_16
- "ldr q27, [x27], #0x10\n"
- "add x19, x19, #0x1\n"
- "ldr q24, [x26], #0x10\n"
- "zip1 v26.2d, v27.2d, v24.2d\n"
- "ldr q25, [x25], #0x10\n"
- "subs %x[width], %x[width], #0x10\n"
- "zip2 v24.2d, v27.2d, v24.2d\n"
- "ldr q21, [x24], #0x10\n"
+ "ldr q27, [x28], #0x10\n"
+ "ldr q19, [x27], #0x10\n"
+ "zip1 v26.2d, v27.2d, v19.2d\n"
"sadalp v5.8h, v26.16b\n"
- "zip1 v23.2d, v25.2d, v21.2d\n"
- "ldr q22, [x23], #0x10\n"
+ "ldr q25, [x26], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "zip1 v24.2d, v25.2d, v18.2d\n"
+ "sadalp v4.8h, v24.16b\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip1 v22.2d, v23.2d, v17.2d\n"
+ "sadalp v3.8h, v22.16b\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "zip1 v20.2d, v21.2d, v16.2d\n"
+ "sadalp v2.8h, v20.16b\n"
+ "zip2 v19.2d, v27.2d, v19.2d\n"
+ "zip2 v18.2d, v25.2d, v18.2d\n"
+ "subs %x[width], %x[width], #0x10\n"
"cmp %x[width], #0x10\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
- "ldr q18, [x22], #0x10\n"
- "sadalp v4.8h, v23.16b\n"
- "zip1 v20.2d, v22.2d, v18.2d\n"
- "ldr q19, [x21], #0x10\n"
- "sadalp v5.8h, v24.16b\n"
- "zip2 v18.2d, v22.2d, v18.2d\n"
- "ldr q16, [x20], #0x10\n"
- "sadalp v3.8h, v20.16b\n"
- "zip1 v17.2d, v19.2d, v16.2d\n"
+ "zip2 v17.2d, v23.2d, v17.2d\n"
+ "zip2 v16.2d, v21.2d, v16.2d\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "sadalp v4.8h, v21.16b\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "sadalp v2.8h, v17.16b\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "sadalp v3.8h, v18.16b\n"
+ "str q26, [%x[out_ptr], #0x0]\n"
+ "sadalp v5.8h, v19.16b\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "sadalp v2.8h, v16.16b\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "str q24, [%x[out_ptr], #0x10]\n"
+ "sadalp v4.8h, v18.16b\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "str q26, [%x[out_ptr], #0x0]\n"
- "str q23, [%x[out_ptr], #0x10]\n"
- "str q20, [%x[out_ptr], #0x20]\n"
- "str q17, [%x[out_ptr], #0x30]\n"
- "str q24, [%x[out_ptr], #0x40]\n"
- "str q21, [%x[out_ptr], #0x50]\n"
- "str q18, [%x[out_ptr], #0x60]\n"
+ "str q22, [%x[out_ptr], #0x20]\n"
+ "sadalp v3.8h, v17.16b\n"
+ "str q20, [%x[out_ptr], #0x30]\n"
+ "sadalp v2.8h, v16.16b\n"
+ "add x20, x20, #0x1\n"
+ "str q19, [%x[out_ptr], #0x40]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 3b\n"
"5:" // Main loop skip
"cbz %x[width], 14f\n"
"tbz %x[width], #3, 9f\n"
- "ldr d27, [x27], #0x8\n"
- "ldr d24, [x26], #0x8\n"
- "ldr d25, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
+ "ldr d27, [x28], #0x8\n"
+ "ldr d26, [x27], #0x8\n"
+ "ldr d25, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "ldr d23, [x24], #0x8\n"
"ldr d22, [x23], #0x8\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
- "ldr d16, [x20], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
"tbz %x[width], #2, 7f\n"
- "ld1 { v27.s }[2], [x27], #0x4\n"
- "ld1 { v24.s }[2], [x26], #0x4\n"
- "ld1 { v25.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v27.s }[2], [x28], #0x4\n"
+ "ld1 { v26.s }[2], [x27], #0x4\n"
+ "ld1 { v25.s }[2], [x26], #0x4\n"
+ "ld1 { v24.s }[2], [x25], #0x4\n"
+ "ld1 { v23.s }[2], [x24], #0x4\n"
"ld1 { v22.s }[2], [x23], #0x4\n"
- "ld1 { v18.s }[2], [x22], #0x4\n"
- "ld1 { v19.s }[2], [x21], #0x4\n"
- "ld1 { v16.s }[2], [x20], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v20.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v27.h }[6], [x27], #0x2\n"
- "mov x19, #0x2\n"
- "ld1 { v24.h }[6], [x26], #0x2\n"
- "ld1 { v25.h }[6], [x25], #0x2\n"
- "ld1 { v21.h }[6], [x24], #0x2\n"
+ "ld1 { v27.h }[6], [x28], #0x2\n"
+ "ld1 { v26.h }[6], [x27], #0x2\n"
+ "mov x20, #0x2\n"
+ "ld1 { v25.h }[6], [x26], #0x2\n"
+ "ld1 { v24.h }[6], [x25], #0x2\n"
+ "ld1 { v23.h }[6], [x24], #0x2\n"
"ld1 { v22.h }[6], [x23], #0x2\n"
- "ld1 { v18.h }[6], [x22], #0x2\n"
- "ld1 { v19.h }[6], [x21], #0x2\n"
- "ld1 { v16.h }[6], [x20], #0x2\n"
+ "ld1 { v21.h }[6], [x22], #0x2\n"
+ "ld1 { v20.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[14], [x27]\n"
- "ld1 { v24.b }[14], [x26]\n"
- "ld1 { v25.b }[14], [x25]\n"
- "ld1 { v21.b }[14], [x24]\n"
+ "ld1 { v27.b }[14], [x28]\n"
+ "ld1 { v26.b }[14], [x27]\n"
+ "ld1 { v25.b }[14], [x26]\n"
+ "ld1 { v24.b }[14], [x25]\n"
+ "ld1 { v23.b }[14], [x24]\n"
"ld1 { v22.b }[14], [x23]\n"
- "ld1 { v18.b }[14], [x22]\n"
- "ld1 { v19.b }[14], [x21]\n"
- "ld1 { v16.b }[14], [x20]\n"
+ "ld1 { v21.b }[14], [x22]\n"
+ "ld1 { v20.b }[14], [x21]\n"
"b 13f\n"
"6:" // odd_loads_1_12
- "mov x19, #0x2\n"
+ "mov x20, #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[12], [x27]\n"
- "ld1 { v24.b }[12], [x26]\n"
- "ld1 { v25.b }[12], [x25]\n"
- "ld1 { v21.b }[12], [x24]\n"
+ "ld1 { v27.b }[12], [x28]\n"
+ "ld1 { v26.b }[12], [x27]\n"
+ "ld1 { v25.b }[12], [x26]\n"
+ "ld1 { v24.b }[12], [x25]\n"
+ "ld1 { v23.b }[12], [x24]\n"
"ld1 { v22.b }[12], [x23]\n"
- "ld1 { v18.b }[12], [x22]\n"
- "ld1 { v19.b }[12], [x21]\n"
- "ld1 { v16.b }[12], [x20]\n"
+ "ld1 { v21.b }[12], [x22]\n"
+ "ld1 { v20.b }[12], [x21]\n"
"b 13f\n"
"7:" // odd_loads_2_8
"tbz %x[width], #1, 8f\n"
- "ld1 { v27.h }[4], [x27], #0x2\n"
- "ld1 { v24.h }[4], [x26], #0x2\n"
- "mov x19, #0x2\n"
- "ld1 { v25.h }[4], [x25], #0x2\n"
- "ld1 { v21.h }[4], [x24], #0x2\n"
+ "ld1 { v27.h }[4], [x28], #0x2\n"
+ "ld1 { v26.h }[4], [x27], #0x2\n"
+ "mov x20, #0x2\n"
+ "ld1 { v25.h }[4], [x26], #0x2\n"
+ "ld1 { v24.h }[4], [x25], #0x2\n"
+ "ld1 { v23.h }[4], [x24], #0x2\n"
"ld1 { v22.h }[4], [x23], #0x2\n"
- "ld1 { v18.h }[4], [x22], #0x2\n"
- "ld1 { v19.h }[4], [x21], #0x2\n"
- "ld1 { v16.h }[4], [x20], #0x2\n"
+ "ld1 { v21.h }[4], [x22], #0x2\n"
+ "ld1 { v20.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[10], [x27]\n"
- "ld1 { v24.b }[10], [x26]\n"
- "ld1 { v25.b }[10], [x25]\n"
- "ld1 { v21.b }[10], [x24]\n"
+ "ld1 { v27.b }[10], [x28]\n"
+ "ld1 { v26.b }[10], [x27]\n"
+ "ld1 { v25.b }[10], [x26]\n"
+ "ld1 { v24.b }[10], [x25]\n"
+ "ld1 { v23.b }[10], [x24]\n"
"ld1 { v22.b }[10], [x23]\n"
- "ld1 { v18.b }[10], [x22]\n"
- "ld1 { v19.b }[10], [x21]\n"
- "ld1 { v16.b }[10], [x20]\n"
+ "ld1 { v21.b }[10], [x22]\n"
+ "ld1 { v20.b }[10], [x21]\n"
"b 13f\n"
"8:" // odd_loads_1_8
- "mov x19, #0x1\n"
+ "mov x20, #0x1\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[8], [x27]\n"
- "ld1 { v24.b }[8], [x26]\n"
- "mov x19, #0x2\n"
- "ld1 { v25.b }[8], [x25]\n"
- "ld1 { v21.b }[8], [x24]\n"
+ "ld1 { v27.b }[8], [x28]\n"
+ "ld1 { v26.b }[8], [x27]\n"
+ "mov x20, #0x2\n"
+ "ld1 { v25.b }[8], [x26]\n"
+ "ld1 { v24.b }[8], [x25]\n"
+ "ld1 { v23.b }[8], [x24]\n"
"ld1 { v22.b }[8], [x23]\n"
- "ld1 { v18.b }[8], [x22]\n"
- "ld1 { v19.b }[8], [x21]\n"
- "ld1 { v16.b }[8], [x20]\n"
+ "ld1 { v21.b }[8], [x22]\n"
+ "ld1 { v20.b }[8], [x21]\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
- "ldr s27, [x27], #0x4\n"
- "ldr s24, [x26], #0x4\n"
- "ldr s25, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
+ "ldr s27, [x28], #0x4\n"
+ "ldr s26, [x27], #0x4\n"
+ "ldr s25, [x26], #0x4\n"
+ "ldr s24, [x25], #0x4\n"
+ "ldr s23, [x24], #0x4\n"
"ldr s22, [x23], #0x4\n"
- "ldr s18, [x22], #0x4\n"
- "ldr s19, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s20, [x21], #0x4\n"
"tbz %x[width], #1, 10f\n"
- "ld1 { v27.h }[2], [x27], #0x2\n"
- "mov x19, #0x1\n"
- "ld1 { v24.h }[2], [x26], #0x2\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
- "ld1 { v21.h }[2], [x24], #0x2\n"
+ "ld1 { v27.h }[2], [x28], #0x2\n"
+ "ld1 { v26.h }[2], [x27], #0x2\n"
+ "mov x20, #0x1\n"
+ "ld1 { v25.h }[2], [x26], #0x2\n"
+ "ld1 { v24.h }[2], [x25], #0x2\n"
+ "ld1 { v23.h }[2], [x24], #0x2\n"
"ld1 { v22.h }[2], [x23], #0x2\n"
- "ld1 { v18.h }[2], [x22], #0x2\n"
- "ld1 { v19.h }[2], [x21], #0x2\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v20.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[6], [x27]\n"
- "ld1 { v24.b }[6], [x26]\n"
- "ld1 { v25.b }[6], [x25]\n"
- "ld1 { v21.b }[6], [x24]\n"
+ "ld1 { v27.b }[6], [x28]\n"
+ "ld1 { v26.b }[6], [x27]\n"
+ "ld1 { v25.b }[6], [x26]\n"
+ "ld1 { v24.b }[6], [x25]\n"
+ "ld1 { v23.b }[6], [x24]\n"
"ld1 { v22.b }[6], [x23]\n"
- "ld1 { v18.b }[6], [x22]\n"
- "ld1 { v19.b }[6], [x21]\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ld1 { v21.b }[6], [x22]\n"
+ "ld1 { v20.b }[6], [x21]\n"
"b 13f\n"
"10:" // odd_loads_1_4
- "mov x19, #0x1\n"
+ "mov x20, #0x1\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[4], [x27]\n"
- "ld1 { v24.b }[4], [x26]\n"
- "ld1 { v25.b }[4], [x25]\n"
- "ld1 { v21.b }[4], [x24]\n"
+ "ld1 { v27.b }[4], [x28]\n"
+ "ld1 { v26.b }[4], [x27]\n"
+ "ld1 { v25.b }[4], [x26]\n"
+ "ld1 { v24.b }[4], [x25]\n"
+ "ld1 { v23.b }[4], [x24]\n"
"ld1 { v22.b }[4], [x23]\n"
- "ld1 { v18.b }[4], [x22]\n"
- "ld1 { v19.b }[4], [x21]\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "ld1 { v21.b }[4], [x22]\n"
+ "ld1 { v20.b }[4], [x21]\n"
"b 13f\n"
"11:" // odd_loads_2_0
"tbz %x[width], #1, 12f\n"
- "ldr h27, [x27], #0x2\n"
- "ldr h24, [x26], #0x2\n"
- "mov x19, #0x1\n"
- "ldr h25, [x25], #0x2\n"
- "ldr h21, [x24], #0x2\n"
+ "ldr h27, [x28], #0x2\n"
+ "ldr h26, [x27], #0x2\n"
+ "mov x20, #0x1\n"
+ "ldr h25, [x26], #0x2\n"
+ "ldr h24, [x25], #0x2\n"
+ "ldr h23, [x24], #0x2\n"
"ldr h22, [x23], #0x2\n"
- "ldr h18, [x22], #0x2\n"
- "ldr h19, [x21], #0x2\n"
- "ldr h16, [x20], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h20, [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[2], [x27]\n"
- "ld1 { v24.b }[2], [x26]\n"
- "ld1 { v25.b }[2], [x25]\n"
- "ld1 { v21.b }[2], [x24]\n"
+ "ld1 { v27.b }[2], [x28]\n"
+ "ld1 { v26.b }[2], [x27]\n"
+ "ld1 { v25.b }[2], [x26]\n"
+ "ld1 { v24.b }[2], [x25]\n"
+ "ld1 { v23.b }[2], [x24]\n"
"ld1 { v22.b }[2], [x23]\n"
- "ld1 { v18.b }[2], [x22]\n"
- "ld1 { v19.b }[2], [x21]\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "ld1 { v21.b }[2], [x22]\n"
+ "ld1 { v20.b }[2], [x21]\n"
"b 13f\n"
"12:" // odd_loads_1_0
- "ldr b27, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr b24, [x26, #0x0]\n"
- "ldr b25, [x25, #0x0]\n"
- "ldr b21, [x24, #0x0]\n"
+ "ldr b27, [x28, #0x0]\n"
+ "ldr b26, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr b25, [x26, #0x0]\n"
+ "ldr b24, [x25, #0x0]\n"
+ "ldr b23, [x24, #0x0]\n"
"ldr b22, [x23, #0x0]\n"
- "ldr b18, [x22, #0x0]\n"
- "ldr b19, [x21, #0x0]\n"
- "ldr b16, [x20, #0x0]\n"
+ "ldr b21, [x22, #0x0]\n"
+ "ldr b20, [x21, #0x0]\n"
"13:" // Odd load end
- "zip1 v26.2d, v27.2d, v24.2d\n"
- "str q26, [%x[out_ptr], #0x0]\n"
- "zip1 v23.2d, v25.2d, v21.2d\n"
- "sadalp v5.8h, v26.16b\n"
- "zip1 v20.2d, v22.2d, v18.2d\n"
- "str q23, [%x[out_ptr], #0x10]\n"
- "sadalp v4.8h, v23.16b\n"
- "zip1 v17.2d, v19.2d, v16.2d\n"
- "str q20, [%x[out_ptr], #0x20]\n"
- "sadalp v3.8h, v20.16b\n"
- "str q17, [%x[out_ptr], #0x30]\n"
- "sadalp v2.8h, v17.16b\n"
- "subs x19, x19, #0x1\n"
+ "zip1 v19.2d, v27.2d, v26.2d\n"
+ "zip1 v18.2d, v25.2d, v24.2d\n"
+ "subs x20, x20, #0x1\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
+ "zip1 v17.2d, v23.2d, v22.2d\n"
+ "zip1 v16.2d, v21.2d, v20.2d\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "sadalp v5.8h, v19.16b\n"
+ "sadalp v4.8h, v18.16b\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "sadalp v3.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "sadalp v2.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 14f\n"
- "zip2 v24.2d, v27.2d, v24.2d\n"
- "str q24, [%x[out_ptr], #0x0]\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
- "sadalp v5.8h, v24.16b\n"
- "zip2 v18.2d, v22.2d, v18.2d\n"
- "str q21, [%x[out_ptr], #0x10]\n"
- "sadalp v4.8h, v21.16b\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
- "str q18, [%x[out_ptr], #0x20]\n"
- "sadalp v3.8h, v18.16b\n"
+ "zip2 v19.2d, v27.2d, v26.2d\n"
+ "zip2 v18.2d, v25.2d, v24.2d\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.2d, v23.2d, v22.2d\n"
+ "zip2 v16.2d, v21.2d, v20.2d\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "sadalp v5.8h, v19.16b\n"
+ "sadalp v4.8h, v18.16b\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "sadalp v3.8h, v17.16b\n"
"str q16, [%x[out_ptr], #0x30]\n"
"sadalp v2.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"14:" // Odds skip
"sadalp v1.4s, v5.8h\n"
"sadalp v0.4s, v4.8h\n"
- "addp v1.4s, v1.4s, v0.4s\n"
"sadalp v31.4s, v3.8h\n"
"sadalp v30.4s, v2.8h\n"
+ "addp v1.4s, v1.4s, v0.4s\n"
+ "addp v16.4s, v31.4s, v30.4s\n"
"add v1.4s, v1.4s, v29.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
"str q1, [%x[out_ptr], #0x0]\n"
- "addp v0.4s, v31.4s, v30.4s\n"
- "add v0.4s, v0.4s, v28.4s\n"
- "str q0, [%x[out_ptr], #0x10]\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
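
For reference, the "cmp x20, #0x3e" guard at the top of the main loop above
is an overflow fence: byte sums accumulate into 16-bit lanes via sadalp, each
lane absorbing four bytes per 16-byte step, so after 63 steps a lane could no
longer safely take another and the partials are folded into the 32-bit
accumulators and cleared. A minimal scalar sketch of that bookkeeping for one
row, assuming the lane pairing implied by the zip1/zip2 + sadalp sequence
(accumulate_row_sum is illustrative only, not part of the library):

#include <cstddef>
#include <cstdint>

// Scalar model of one row's sum: four 16-bit lane partials, flushed into the
// 32-bit total before they can overflow, mirroring the x20 counter check.
static void accumulate_row_sum(int32_t &sum, const int8_t *row,
                               size_t blocks16)
{
    int16_t lane[4] = {0, 0, 0, 0};
    size_t  steps   = 0;
    for (size_t b = 0; b < blocks16; b++)
    {
        const int8_t *p = row + 16 * b;
        for (int j = 0; j < 4; j++)
        {
            // sadalp pairing: lane j takes bytes {2j, 2j+1} of each zipped
            // half, i.e. bytes {2j, 2j+1, 2j+8, 2j+9} of the 16-byte block.
            lane[j] += p[2 * j] + p[2 * j + 1] + p[2 * j + 8] + p[2 * j + 9];
        }
        // Worst case |4 * 128| = 512 per lane per step; after 63 steps a
        // lane may hold +/-32256, so flush before the next step could push
        // it past INT16_MAX ("cmp x20, #0x3e" / "ble 4f" in the kernel).
        if (++steps > 62)
        {
            for (int j = 0; j < 4; j++) { sum += lane[j]; lane[j] = 0; }
            steps = 0;
        }
    }
    for (int j = 0; j < 4; j++) sum += lane[j]; // final sadalp v1.4s, v5.8h
}
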
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
index 454260ef1a..56d34a8a64 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -31,46 +31,47 @@ void interleave_block<8, 8, VLType::None, true>(
)
{
__asm__ __volatile__(
+ "ldr x28, [%x[in], #0x0]\n"
+ "ldr x27, [%x[in], #0x8]\n"
+ "cmp %x[height], #0x8\n"
+ "mov x20, #0x0\n"
+ "ldr x26, [%x[in], #0x10]\n"
+ "ldr x25, [%x[in], #0x18]\n"
"movi v5.8h, #0x0\n"
- "ldr x27, [%x[in], #0x0]\n"
- "mov x19, #0x0\n"
"movi v4.8h, #0x0\n"
- "ldr x26, [%x[in], #0x8]\n"
- "cmp %x[height], #0x8\n"
+ "ldr x24, [%x[in], #0x20]\n"
+ "ldr x23, [%x[in], #0x28]\n"
"movi v3.8h, #0x0\n"
- "ldr x25, [%x[in], #0x10]\n"
- "add x27, x27, %x[row_offset]\n"
"movi v2.8h, #0x0\n"
- "ldr x24, [%x[in], #0x18]\n"
+ "ldr x22, [%x[in], #0x30]\n"
+ "ldr x21, [%x[in], #0x38]\n"
"movi v1.4s, #0x0\n"
- "ldr x23, [%x[in], #0x20]\n"
- "add x26, x26, %x[row_offset]\n"
"movi v0.4s, #0x0\n"
- "ldr x22, [%x[in], #0x28]\n"
- "add x25, x25, %x[row_offset]\n"
"movi v31.4s, #0x0\n"
- "ldr x21, [%x[in], #0x30]\n"
- "add x24, x24, %x[row_offset]\n"
"movi v30.4s, #0x0\n"
- "ldr x20, [%x[in], #0x38]\n"
+ "add x28, x28, %x[row_offset]\n"
+ "add x27, x27, %x[row_offset]\n"
+ "add x26, x26, %x[row_offset]\n"
+ "add x25, x25, %x[row_offset]\n"
+ "add x24, x24, %x[row_offset]\n"
"add x23, x23, %x[row_offset]\n"
"add x22, x22, %x[row_offset]\n"
"add x21, x21, %x[row_offset]\n"
- "add x20, x20, %x[row_offset]\n"
"beq 1f\n"
- "mov x20, x27\n"
"cmp %x[height], #0x2\n"
- "csel x26, x26, x27, GE\n"
- "csel x25, x25, x27, GT\n"
+ "csel x27, x27, x28, GE\n"
+ "csel x26, x26, x28, GT\n"
"cmp %x[height], #0x4\n"
- "csel x24, x24, x27, GE\n"
- "csel x23, x23, x27, GT\n"
+ "csel x25, x25, x28, GE\n"
+ "csel x24, x24, x28, GT\n"
"cmp %x[height], #0x6\n"
- "csel x22, x22, x27, GE\n"
- "csel x21, x21, x27, GT\n"
+ "mov x21, x28\n"
+ "csel x23, x23, x28, GE\n"
+ "csel x22, x22, x28, GT\n"
"1:" // no_pointer_adj
- "movi v29.4s, #0x0\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
"prfm pldl1keep, [x27, #0x0]\n"
+ "movi v29.4s, #0x0\n"
"movi v28.4s, #0x0\n"
"prfm pldl1keep, [x26, #0x0]\n"
"prfm pldl1keep, [x25, #0x0]\n"
@@ -78,7 +79,7 @@ void interleave_block<8, 8, VLType::None, true>(
"prfm pldl1keep, [x23, #0x0]\n"
"prfm pldl1keep, [x22, #0x0]\n"
"prfm pldl1keep, [x21, #0x0]\n"
- "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
"prfm pldl1keep, [x27, #0x40]\n"
"prfm pldl1keep, [x26, #0x40]\n"
"prfm pldl1keep, [x25, #0x40]\n"
@@ -86,7 +87,6 @@ void interleave_block<8, 8, VLType::None, true>(
"prfm pldl1keep, [x23, #0x40]\n"
"prfm pldl1keep, [x22, #0x40]\n"
"prfm pldl1keep, [x21, #0x40]\n"
- "prfm pldl1keep, [x20, #0x40]\n"
"cbnz %w[first], 2f\n"
"sub %x[out_ptr], %x[out_ptr], #0x20\n"
"ld1 { v29.4s }, [%x[out_ptr]]\n"
@@ -95,266 +95,266 @@ void interleave_block<8, 8, VLType::None, true>(
"cmp %x[width], #0x10\n"
"blt 5f\n"
"3:" // Main loop head
- "cmp x19, #0x3e\n"
+ "cmp x20, #0x3e\n"
"ble 4f\n"
"uadalp v1.4s, v5.8h\n"
"movi v5.8h, #0x0\n"
+ "mov x20, #0x0\n"
"uadalp v0.4s, v4.8h\n"
"movi v4.8h, #0x0\n"
"uadalp v31.4s, v3.8h\n"
"movi v3.8h, #0x0\n"
"uadalp v30.4s, v2.8h\n"
"movi v2.8h, #0x0\n"
- "mov x19, #0x0\n"
"4:" // no_accumulate_16
- "ldr q27, [x27], #0x10\n"
- "add x19, x19, #0x1\n"
- "ldr q24, [x26], #0x10\n"
- "zip1 v26.2d, v27.2d, v24.2d\n"
- "ldr q25, [x25], #0x10\n"
- "subs %x[width], %x[width], #0x10\n"
- "zip2 v24.2d, v27.2d, v24.2d\n"
- "ldr q21, [x24], #0x10\n"
+ "ldr q27, [x28], #0x10\n"
+ "ldr q19, [x27], #0x10\n"
+ "zip1 v26.2d, v27.2d, v19.2d\n"
"uadalp v5.8h, v26.16b\n"
- "zip1 v23.2d, v25.2d, v21.2d\n"
- "ldr q22, [x23], #0x10\n"
+ "ldr q25, [x26], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "zip1 v24.2d, v25.2d, v18.2d\n"
+ "uadalp v4.8h, v24.16b\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip1 v22.2d, v23.2d, v17.2d\n"
+ "uadalp v3.8h, v22.16b\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "zip1 v20.2d, v21.2d, v16.2d\n"
+ "uadalp v2.8h, v20.16b\n"
+ "zip2 v19.2d, v27.2d, v19.2d\n"
+ "zip2 v18.2d, v25.2d, v18.2d\n"
+ "subs %x[width], %x[width], #0x10\n"
"cmp %x[width], #0x10\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
- "ldr q18, [x22], #0x10\n"
- "uadalp v4.8h, v23.16b\n"
- "zip1 v20.2d, v22.2d, v18.2d\n"
- "ldr q19, [x21], #0x10\n"
- "uadalp v5.8h, v24.16b\n"
- "zip2 v18.2d, v22.2d, v18.2d\n"
- "ldr q16, [x20], #0x10\n"
- "uadalp v3.8h, v20.16b\n"
- "zip1 v17.2d, v19.2d, v16.2d\n"
+ "zip2 v17.2d, v23.2d, v17.2d\n"
+ "zip2 v16.2d, v21.2d, v16.2d\n"
+ "prfm pldl1keep, [x28, #0x70]\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "uadalp v4.8h, v21.16b\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "uadalp v2.8h, v17.16b\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "uadalp v3.8h, v18.16b\n"
+ "str q26, [%x[out_ptr], #0x0]\n"
+ "uadalp v5.8h, v19.16b\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "uadalp v2.8h, v16.16b\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "str q24, [%x[out_ptr], #0x10]\n"
+ "uadalp v4.8h, v18.16b\n"
"prfm pldl1keep, [x22, #0x70]\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "str q26, [%x[out_ptr], #0x0]\n"
- "str q23, [%x[out_ptr], #0x10]\n"
- "str q20, [%x[out_ptr], #0x20]\n"
- "str q17, [%x[out_ptr], #0x30]\n"
- "str q24, [%x[out_ptr], #0x40]\n"
- "str q21, [%x[out_ptr], #0x50]\n"
- "str q18, [%x[out_ptr], #0x60]\n"
+ "str q22, [%x[out_ptr], #0x20]\n"
+ "uadalp v3.8h, v17.16b\n"
+ "str q20, [%x[out_ptr], #0x30]\n"
+ "uadalp v2.8h, v16.16b\n"
+ "add x20, x20, #0x1\n"
+ "str q19, [%x[out_ptr], #0x40]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 3b\n"
"5:" // Main loop skip
"cbz %x[width], 14f\n"
"tbz %x[width], #3, 9f\n"
- "ldr d27, [x27], #0x8\n"
- "ldr d24, [x26], #0x8\n"
- "ldr d25, [x25], #0x8\n"
- "ldr d21, [x24], #0x8\n"
+ "ldr d27, [x28], #0x8\n"
+ "ldr d26, [x27], #0x8\n"
+ "ldr d25, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "ldr d23, [x24], #0x8\n"
"ldr d22, [x23], #0x8\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
- "ldr d16, [x20], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
"tbz %x[width], #2, 7f\n"
- "ld1 { v27.s }[2], [x27], #0x4\n"
- "ld1 { v24.s }[2], [x26], #0x4\n"
- "ld1 { v25.s }[2], [x25], #0x4\n"
- "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v27.s }[2], [x28], #0x4\n"
+ "ld1 { v26.s }[2], [x27], #0x4\n"
+ "ld1 { v25.s }[2], [x26], #0x4\n"
+ "ld1 { v24.s }[2], [x25], #0x4\n"
+ "ld1 { v23.s }[2], [x24], #0x4\n"
"ld1 { v22.s }[2], [x23], #0x4\n"
- "ld1 { v18.s }[2], [x22], #0x4\n"
- "ld1 { v19.s }[2], [x21], #0x4\n"
- "ld1 { v16.s }[2], [x20], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v20.s }[2], [x21], #0x4\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v27.h }[6], [x27], #0x2\n"
- "mov x19, #0x2\n"
- "ld1 { v24.h }[6], [x26], #0x2\n"
- "ld1 { v25.h }[6], [x25], #0x2\n"
- "ld1 { v21.h }[6], [x24], #0x2\n"
+ "ld1 { v27.h }[6], [x28], #0x2\n"
+ "ld1 { v26.h }[6], [x27], #0x2\n"
+ "mov x20, #0x2\n"
+ "ld1 { v25.h }[6], [x26], #0x2\n"
+ "ld1 { v24.h }[6], [x25], #0x2\n"
+ "ld1 { v23.h }[6], [x24], #0x2\n"
"ld1 { v22.h }[6], [x23], #0x2\n"
- "ld1 { v18.h }[6], [x22], #0x2\n"
- "ld1 { v19.h }[6], [x21], #0x2\n"
- "ld1 { v16.h }[6], [x20], #0x2\n"
+ "ld1 { v21.h }[6], [x22], #0x2\n"
+ "ld1 { v20.h }[6], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[14], [x27]\n"
- "ld1 { v24.b }[14], [x26]\n"
- "ld1 { v25.b }[14], [x25]\n"
- "ld1 { v21.b }[14], [x24]\n"
+ "ld1 { v27.b }[14], [x28]\n"
+ "ld1 { v26.b }[14], [x27]\n"
+ "ld1 { v25.b }[14], [x26]\n"
+ "ld1 { v24.b }[14], [x25]\n"
+ "ld1 { v23.b }[14], [x24]\n"
"ld1 { v22.b }[14], [x23]\n"
- "ld1 { v18.b }[14], [x22]\n"
- "ld1 { v19.b }[14], [x21]\n"
- "ld1 { v16.b }[14], [x20]\n"
+ "ld1 { v21.b }[14], [x22]\n"
+ "ld1 { v20.b }[14], [x21]\n"
"b 13f\n"
"6:" // odd_loads_1_12
- "mov x19, #0x2\n"
+ "mov x20, #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[12], [x27]\n"
- "ld1 { v24.b }[12], [x26]\n"
- "ld1 { v25.b }[12], [x25]\n"
- "ld1 { v21.b }[12], [x24]\n"
+ "ld1 { v27.b }[12], [x28]\n"
+ "ld1 { v26.b }[12], [x27]\n"
+ "ld1 { v25.b }[12], [x26]\n"
+ "ld1 { v24.b }[12], [x25]\n"
+ "ld1 { v23.b }[12], [x24]\n"
"ld1 { v22.b }[12], [x23]\n"
- "ld1 { v18.b }[12], [x22]\n"
- "ld1 { v19.b }[12], [x21]\n"
- "ld1 { v16.b }[12], [x20]\n"
+ "ld1 { v21.b }[12], [x22]\n"
+ "ld1 { v20.b }[12], [x21]\n"
"b 13f\n"
"7:" // odd_loads_2_8
"tbz %x[width], #1, 8f\n"
- "ld1 { v27.h }[4], [x27], #0x2\n"
- "ld1 { v24.h }[4], [x26], #0x2\n"
- "mov x19, #0x2\n"
- "ld1 { v25.h }[4], [x25], #0x2\n"
- "ld1 { v21.h }[4], [x24], #0x2\n"
+ "ld1 { v27.h }[4], [x28], #0x2\n"
+ "ld1 { v26.h }[4], [x27], #0x2\n"
+ "mov x20, #0x2\n"
+ "ld1 { v25.h }[4], [x26], #0x2\n"
+ "ld1 { v24.h }[4], [x25], #0x2\n"
+ "ld1 { v23.h }[4], [x24], #0x2\n"
"ld1 { v22.h }[4], [x23], #0x2\n"
- "ld1 { v18.h }[4], [x22], #0x2\n"
- "ld1 { v19.h }[4], [x21], #0x2\n"
- "ld1 { v16.h }[4], [x20], #0x2\n"
+ "ld1 { v21.h }[4], [x22], #0x2\n"
+ "ld1 { v20.h }[4], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[10], [x27]\n"
- "ld1 { v24.b }[10], [x26]\n"
- "ld1 { v25.b }[10], [x25]\n"
- "ld1 { v21.b }[10], [x24]\n"
+ "ld1 { v27.b }[10], [x28]\n"
+ "ld1 { v26.b }[10], [x27]\n"
+ "ld1 { v25.b }[10], [x26]\n"
+ "ld1 { v24.b }[10], [x25]\n"
+ "ld1 { v23.b }[10], [x24]\n"
"ld1 { v22.b }[10], [x23]\n"
- "ld1 { v18.b }[10], [x22]\n"
- "ld1 { v19.b }[10], [x21]\n"
- "ld1 { v16.b }[10], [x20]\n"
+ "ld1 { v21.b }[10], [x22]\n"
+ "ld1 { v20.b }[10], [x21]\n"
"b 13f\n"
"8:" // odd_loads_1_8
- "mov x19, #0x1\n"
+ "mov x20, #0x1\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[8], [x27]\n"
- "ld1 { v24.b }[8], [x26]\n"
- "mov x19, #0x2\n"
- "ld1 { v25.b }[8], [x25]\n"
- "ld1 { v21.b }[8], [x24]\n"
+ "ld1 { v27.b }[8], [x28]\n"
+ "ld1 { v26.b }[8], [x27]\n"
+ "mov x20, #0x2\n"
+ "ld1 { v25.b }[8], [x26]\n"
+ "ld1 { v24.b }[8], [x25]\n"
+ "ld1 { v23.b }[8], [x24]\n"
"ld1 { v22.b }[8], [x23]\n"
- "ld1 { v18.b }[8], [x22]\n"
- "ld1 { v19.b }[8], [x21]\n"
- "ld1 { v16.b }[8], [x20]\n"
+ "ld1 { v21.b }[8], [x22]\n"
+ "ld1 { v20.b }[8], [x21]\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
- "ldr s27, [x27], #0x4\n"
- "ldr s24, [x26], #0x4\n"
- "ldr s25, [x25], #0x4\n"
- "ldr s21, [x24], #0x4\n"
+ "ldr s27, [x28], #0x4\n"
+ "ldr s26, [x27], #0x4\n"
+ "ldr s25, [x26], #0x4\n"
+ "ldr s24, [x25], #0x4\n"
+ "ldr s23, [x24], #0x4\n"
"ldr s22, [x23], #0x4\n"
- "ldr s18, [x22], #0x4\n"
- "ldr s19, [x21], #0x4\n"
- "ldr s16, [x20], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s20, [x21], #0x4\n"
"tbz %x[width], #1, 10f\n"
- "ld1 { v27.h }[2], [x27], #0x2\n"
- "mov x19, #0x1\n"
- "ld1 { v24.h }[2], [x26], #0x2\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
- "ld1 { v21.h }[2], [x24], #0x2\n"
+ "ld1 { v27.h }[2], [x28], #0x2\n"
+ "ld1 { v26.h }[2], [x27], #0x2\n"
+ "mov x20, #0x1\n"
+ "ld1 { v25.h }[2], [x26], #0x2\n"
+ "ld1 { v24.h }[2], [x25], #0x2\n"
+ "ld1 { v23.h }[2], [x24], #0x2\n"
"ld1 { v22.h }[2], [x23], #0x2\n"
- "ld1 { v18.h }[2], [x22], #0x2\n"
- "ld1 { v19.h }[2], [x21], #0x2\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v20.h }[2], [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[6], [x27]\n"
- "ld1 { v24.b }[6], [x26]\n"
- "ld1 { v25.b }[6], [x25]\n"
- "ld1 { v21.b }[6], [x24]\n"
+ "ld1 { v27.b }[6], [x28]\n"
+ "ld1 { v26.b }[6], [x27]\n"
+ "ld1 { v25.b }[6], [x26]\n"
+ "ld1 { v24.b }[6], [x25]\n"
+ "ld1 { v23.b }[6], [x24]\n"
"ld1 { v22.b }[6], [x23]\n"
- "ld1 { v18.b }[6], [x22]\n"
- "ld1 { v19.b }[6], [x21]\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ld1 { v21.b }[6], [x22]\n"
+ "ld1 { v20.b }[6], [x21]\n"
"b 13f\n"
"10:" // odd_loads_1_4
- "mov x19, #0x1\n"
+ "mov x20, #0x1\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[4], [x27]\n"
- "ld1 { v24.b }[4], [x26]\n"
- "ld1 { v25.b }[4], [x25]\n"
- "ld1 { v21.b }[4], [x24]\n"
+ "ld1 { v27.b }[4], [x28]\n"
+ "ld1 { v26.b }[4], [x27]\n"
+ "ld1 { v25.b }[4], [x26]\n"
+ "ld1 { v24.b }[4], [x25]\n"
+ "ld1 { v23.b }[4], [x24]\n"
"ld1 { v22.b }[4], [x23]\n"
- "ld1 { v18.b }[4], [x22]\n"
- "ld1 { v19.b }[4], [x21]\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "ld1 { v21.b }[4], [x22]\n"
+ "ld1 { v20.b }[4], [x21]\n"
"b 13f\n"
"11:" // odd_loads_2_0
"tbz %x[width], #1, 12f\n"
- "ldr h27, [x27], #0x2\n"
- "ldr h24, [x26], #0x2\n"
- "mov x19, #0x1\n"
- "ldr h25, [x25], #0x2\n"
- "ldr h21, [x24], #0x2\n"
+ "ldr h27, [x28], #0x2\n"
+ "ldr h26, [x27], #0x2\n"
+ "mov x20, #0x1\n"
+ "ldr h25, [x26], #0x2\n"
+ "ldr h24, [x25], #0x2\n"
+ "ldr h23, [x24], #0x2\n"
"ldr h22, [x23], #0x2\n"
- "ldr h18, [x22], #0x2\n"
- "ldr h19, [x21], #0x2\n"
- "ldr h16, [x20], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h20, [x21], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v27.b }[2], [x27]\n"
- "ld1 { v24.b }[2], [x26]\n"
- "ld1 { v25.b }[2], [x25]\n"
- "ld1 { v21.b }[2], [x24]\n"
+ "ld1 { v27.b }[2], [x28]\n"
+ "ld1 { v26.b }[2], [x27]\n"
+ "ld1 { v25.b }[2], [x26]\n"
+ "ld1 { v24.b }[2], [x25]\n"
+ "ld1 { v23.b }[2], [x24]\n"
"ld1 { v22.b }[2], [x23]\n"
- "ld1 { v18.b }[2], [x22]\n"
- "ld1 { v19.b }[2], [x21]\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "ld1 { v21.b }[2], [x22]\n"
+ "ld1 { v20.b }[2], [x21]\n"
"b 13f\n"
"12:" // odd_loads_1_0
- "ldr b27, [x27, #0x0]\n"
- "mov x19, #0x1\n"
- "ldr b24, [x26, #0x0]\n"
- "ldr b25, [x25, #0x0]\n"
- "ldr b21, [x24, #0x0]\n"
+ "ldr b27, [x28, #0x0]\n"
+ "ldr b26, [x27, #0x0]\n"
+ "mov x20, #0x1\n"
+ "ldr b25, [x26, #0x0]\n"
+ "ldr b24, [x25, #0x0]\n"
+ "ldr b23, [x24, #0x0]\n"
"ldr b22, [x23, #0x0]\n"
- "ldr b18, [x22, #0x0]\n"
- "ldr b19, [x21, #0x0]\n"
- "ldr b16, [x20, #0x0]\n"
+ "ldr b21, [x22, #0x0]\n"
+ "ldr b20, [x21, #0x0]\n"
"13:" // Odd load end
- "zip1 v26.2d, v27.2d, v24.2d\n"
- "str q26, [%x[out_ptr], #0x0]\n"
- "zip1 v23.2d, v25.2d, v21.2d\n"
- "uadalp v5.8h, v26.16b\n"
- "zip1 v20.2d, v22.2d, v18.2d\n"
- "str q23, [%x[out_ptr], #0x10]\n"
- "uadalp v4.8h, v23.16b\n"
- "zip1 v17.2d, v19.2d, v16.2d\n"
- "str q20, [%x[out_ptr], #0x20]\n"
- "uadalp v3.8h, v20.16b\n"
- "str q17, [%x[out_ptr], #0x30]\n"
- "uadalp v2.8h, v17.16b\n"
- "subs x19, x19, #0x1\n"
+ "zip1 v19.2d, v27.2d, v26.2d\n"
+ "zip1 v18.2d, v25.2d, v24.2d\n"
+ "subs x20, x20, #0x1\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
+ "zip1 v17.2d, v23.2d, v22.2d\n"
+ "zip1 v16.2d, v21.2d, v20.2d\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "uadalp v5.8h, v19.16b\n"
+ "uadalp v4.8h, v18.16b\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "uadalp v3.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "uadalp v2.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 14f\n"
- "zip2 v24.2d, v27.2d, v24.2d\n"
- "str q24, [%x[out_ptr], #0x0]\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
- "uadalp v5.8h, v24.16b\n"
- "zip2 v18.2d, v22.2d, v18.2d\n"
- "str q21, [%x[out_ptr], #0x10]\n"
- "uadalp v4.8h, v21.16b\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
- "str q18, [%x[out_ptr], #0x20]\n"
- "uadalp v3.8h, v18.16b\n"
+ "zip2 v19.2d, v27.2d, v26.2d\n"
+ "zip2 v18.2d, v25.2d, v24.2d\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.2d, v23.2d, v22.2d\n"
+ "zip2 v16.2d, v21.2d, v20.2d\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "uadalp v5.8h, v19.16b\n"
+ "uadalp v4.8h, v18.16b\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "uadalp v3.8h, v17.16b\n"
"str q16, [%x[out_ptr], #0x30]\n"
"uadalp v2.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"14:" // Odds skip
"uadalp v1.4s, v5.8h\n"
"uadalp v0.4s, v4.8h\n"
- "addp v1.4s, v1.4s, v0.4s\n"
"uadalp v31.4s, v3.8h\n"
"uadalp v30.4s, v2.8h\n"
+ "addp v1.4s, v1.4s, v0.4s\n"
+ "addp v16.4s, v31.4s, v30.4s\n"
"add v1.4s, v1.4s, v29.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
"str q1, [%x[out_ptr], #0x0]\n"
- "addp v0.4s, v31.4s, v30.4s\n"
- "add v0.4s, v0.4s, v28.4s\n"
- "str q0, [%x[out_ptr], #0x10]\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
: [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list-sve.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list-sve.hpp
new file mode 100644
index 0000000000..57f26ac135
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list-sve.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "sme_interleave1VL_bf16_bf16.hpp"
+#include "sme_interleave1VL_block2_bf16_bf16.hpp"
+#include "sme_interleave1VL_block2_fp16_fp16.hpp"
+#include "sme_interleave1VL_block4_s8_s8.hpp"
+#include "sme_interleave1VL_block4_u8_u8.hpp"
+#include "sme_interleave1VL_block4_s8_s8_summing.hpp"
+#include "sme_interleave1VL_block4_u8_u8_summing.hpp"
+#include "sme_interleave1VL_fp16_fp16.hpp"
+#include "sme_interleave1VL_fp32_fp32.hpp"
+#include "sme_interleave2VL_block2_bf16_bf16.hpp"
+#include "sme_interleave2VL_block2_fp16_fp16.hpp"
+#include "sme_interleave2VL_block4_s8_s8.hpp"
+#include "sme_interleave2VL_block4_s8_s8_summing.hpp"
+#include "sme_interleave2VL_block4_u8_u8.hpp"
+#include "sme_interleave2VL_block4_u8_u8_summing.hpp"
+#include "sme_interleave2VL_fp16_fp16.hpp"
+#include "sme_interleave2VL_bf16_bf16.hpp"
+#include "sme_interleave2VL_fp32_fp32.hpp"
+#include "sme_interleave4VL_block2_bf16_bf16.hpp"
+#include "sme_interleave4VL_block2_fp16_fp16.hpp"
+#include "sme_interleave4VL_block4_s8_s8.hpp"
+#include "sme_interleave4VL_block4_u8_u8.hpp"
+#include "sme_interleave4VL_block4_s8_s8_summing.hpp"
+#include "sme_interleave4VL_block4_u8_u8_summing.hpp"
+#include "sme_interleave4VL_fp32_fp32.hpp"
+
+#include "sme2_interleave1VL_block2_fp32_bf16.hpp"
+#include "sme2_interleave2VL_block2_fp32_bf16.hpp"
+#include "sme2_interleave4VL_block2_fp32_bf16.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp
index 52b49c0f0c..b13d32c324 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp
@@ -40,6 +40,7 @@
#include "a64_interleave8_block2_bf16_bf16.hpp"
#include "a64_interleave8_block2_fp32_fp32.hpp"
#include "a64_interleave8_block4_bf16_bf16.hpp"
+#include "a64_interleave8_block4_fp32_bf16.hpp"
#include "a64_interleave8_block4_s8_s8.hpp"
#include "a64_interleave8_block4_s8_s8_summing.hpp"
#include "a64_interleave8_block4_u8_u8_summing.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp
new file mode 100644
index 0000000000..a5f4754d3d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+template <>
+void interleave_block<1, 2, VLType::SME, false>(
+ bfloat16 * &out, const float * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x22, ALL, MUL #2\n"
+ "sub x28, %x[width], #0x1\n"
+ "cntw x21, ALL, MUL #2\n"
+ "sub x20, x22, #0x1\n"
+ "whilelt p10.s, XZR, %x[height]\n"
+ "add x28, x28, x21\n"
+ "ands x27, %x[width], x20\n"
+ "udiv x28, x28, x21\n"
+ "csel x27, x27, x22, NE\n"
+ "mov x26, #0x0\n"
+ "and x25, x28, #0x1\n"
+ "sub x28, x28, #0x1\n"
+ "add x27, x27, #0x1\n"
+ "mov x20, %x[width]\n"
+ "ptrue p0.b\n"
+ "mov x24, %x[outptr_raw]\n"
+ "mov x23, %x[row_offset]\n"
+ "cntw x22\n"
+ "lsr x28, x28, #0x1\n"
+ "lsr x27, x27, #0x1\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25b44751 // whilelt pn9.s, x26, x20, VLx2\n"
+ "mov x21, %x[in]\n"
+ "1:" // Width loop: Preamble: Loop
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0x25306548 // psel p8.s, p9.s/Z, p10.s[w12]\n"
+ ".inst 0xa0174286 // ld1w { z6.s-z7.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
+ ".inst 0xc160e0c6 // bfcvt z6.h, { z6.s-z7.s }\n"
+ ".inst 0xc08000c0 // mova za0h.s[x12], p0/M, z6.s\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x22\n"
+ "blt 1b\n"
+ "incw x23, ALL, MUL #2\n"
+ "incw x26, ALL, MUL #2\n"
+ "cbz x28, 5f\n"
+ "2:" // Width loop
+ "mov x20, %x[width]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25b44751 // whilelt pn9.s, x26, x20, VLx2\n"
+ "mov x21, %x[in]\n"
+ "3:" // Width loop: Odd: Loop
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0x25306548 // psel p8.s, p9.s/Z, p10.s[w12]\n"
+ ".inst 0xa017429e // ld1w { z30.s-z31.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
+ ".inst 0xc160e3de // bfcvt z30.h, { z30.s-z31.s }\n"
+ ".inst 0xc08003c8 // mova za2h.s[x12], p0/M, z30.s\n"
+ ".inst 0xc082800f // mova z15.s, p0/M, za0v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x22\n"
+ "st1w { z15.s }, p0, [x24]\n"
+ "addvl x24, x24, #1\n"
+ "blt 3b\n"
+ "incw x26, ALL, MUL #2\n"
+ "mov x20, %x[width]\n"
+ "incw x23, ALL, MUL #2\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25b44751 // whilelt pn9.s, x26, x20, VLx2\n"
+ "mov x21, %x[in]\n"
+ "4:" // Width loop: Even: Loop
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0x25306548 // psel p8.s, p9.s/Z, p10.s[w12]\n"
+ ".inst 0xa0174298 // ld1w { z24.s-z25.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
+ ".inst 0xc160e318 // bfcvt z24.h, { z24.s-z25.s }\n"
+ ".inst 0xc0800300 // mova za0h.s[x12], p0/M, z24.s\n"
+ ".inst 0xc0828110 // mova z16.s, p0/M, za2v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x22\n"
+ "st1w { z16.s }, p0, [x24]\n"
+ "addvl x24, x24, #1\n"
+ "blt 4b\n"
+ "subs x28, x28, #0x1\n"
+ "incw x23, ALL, MUL #2\n"
+ "incw x26, ALL, MUL #2\n"
+ "bgt 2b\n"
+ "5:" // Width loop: Tails
+ "cbnz x25, 8f\n"
+ "mov x20, %x[width]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25b44751 // whilelt pn9.s, x26, x20, VLx2\n"
+ "mov x21, %x[in]\n"
+ "6:" // Width loop: Tails: Even: Odd: Loop
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0x25306548 // psel p8.s, p9.s/Z, p10.s[w12]\n"
+ ".inst 0xa017428e // ld1w { z14.s-z15.s }, pn8.s/Z, [x20, x23, LSL #2]\n"
+ ".inst 0xc160e1ce // bfcvt z14.h, { z14.s-z15.s }\n"
+ ".inst 0xc08001c8 // mova za2h.s[x12], p0/M, z14.s\n"
+ ".inst 0xc0828010 // mova z16.s, p0/M, za0v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x22\n"
+ "st1w { z16.s }, p0, [x24]\n"
+ "addvl x24, x24, #1\n"
+ "blt 6b\n"
+ "mov x12, #0x0\n"
+ "7:" // Width loop: Tails: Even: Even: Loop
+ ".inst 0xc0828110 // mova z16.s, p0/M, za2v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x27\n"
+ "st1w { z16.s }, p0, [x24]\n"
+ "addvl x24, x24, #1\n"
+ "blt 7b\n"
+ "b 10f\n"
+ "8:" // Width loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "9:" // Width loop: Tails: Odd: Loop
+ ".inst 0xc0828010 // mova z16.s, p0/M, za0v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x27\n"
+ "st1w { z16.s }, p0, [x24]\n"
+ "addvl x24, x24, #1\n"
+ "blt 9b\n"
+ "10:" // End
+ "mov %x[outptr_raw], x24\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [outptr_raw] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
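
This kernel loads pairs of fp32 vectors, narrows them to bf16 with the two-vector BFCVT, stages the result through ZA and stores one vector per pass. A rough scalar model of the element-wise narrowing, with rounding simplified to truncation for illustration (the hardware instruction applies its own rounding; the helper name is hypothetical):

    #include <cstdint>
    #include <cstring>

    // bf16 keeps the sign, exponent and top 7 mantissa bits of fp32,
    // i.e. the top 16 bits of the binary32 encoding.
    static uint16_t fp32_to_bf16_truncate(float f)
    {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));
        return static_cast<uint16_t>(bits >> 16);
    }
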
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp
new file mode 100644
index 0000000000..c1d0ac5bc7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+template <>
+void interleave_block<2, 2, VLType::SME, false>(
+ bfloat16 * &out, const float * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x22, ALL, MUL #2\n"
+ "cntw x9\n"
+ "sub x28, %x[width], #0x1\n"
+ "cntw x21, ALL, MUL #2\n"
+ "sub x20, x22, #0x1\n"
+ ".inst 0x25207815 // ptrue pn13.b\n"
+ "whilelt p12.s, XZR, %x[height]\n"
+ "whilelt p11.s, x9, %x[height]\n"
+ "add x28, x28, x21\n"
+ "ands x27, %x[width], x20\n"
+ "udiv x28, x28, x21\n"
+ "csel x27, x27, x22, NE\n"
+ "mov x26, #0x0\n"
+ "and x25, x28, #0x1\n"
+ "sub x28, x28, #0x1\n"
+ "add x27, x27, #0x1\n"
+ "mov x20, %x[width]\n"
+ "mov x24, %x[in]\n"
+ "ptrue p0.b\n"
+ "mov x23, %x[outptr_raw]\n"
+ "mov x22, %x[row_offset]\n"
+ "lsr x28, x28, #0x1\n"
+ "lsr x27, x27, #0x1\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25b44752 // whilelt pn10.s, x26, x20, VLx2\n"
+ "add x21, x24, x9, LSL #3\n"
+ "1:" // Width loop: Preamble: Loop
+ "ldr x20, [x24], #0x8\n"
+ ".inst 0x25306989 // psel p9.s, p10.s/Z, p12.s[w12]\n"
+ ".inst 0x25306968 // psel p8.s, p10.s/Z, p11.s[w12]\n"
+ ".inst 0xa0164698 // ld1w { z24.s-z25.s }, pn9.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0xa0164296 // ld1w { z22.s-z23.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
+ ".inst 0xc160e318 // bfcvt z24.h, { z24.s-z25.s }\n"
+ ".inst 0xc160e2d6 // bfcvt z22.h, { z22.s-z23.s }\n"
+ ".inst 0xc0800300 // mova za0h.s[x12], p0/M, z24.s\n"
+ ".inst 0xc08002c4 // mova za1h.s[x12], p0/M, z22.s\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x9\n"
+ "blt 1b\n"
+ "incw x22, ALL, MUL #2\n"
+ "incw x26, ALL, MUL #2\n"
+ "cbz x28, 5f\n"
+ "2:" // Width loop
+ "mov x20, %x[width]\n"
+ "mov x24, %x[in]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25b44752 // whilelt pn10.s, x26, x20, VLx2\n"
+ "add x21, x24, x9, LSL #3\n"
+ "3:" // Width loop: Odd: Loop
+ "ldr x20, [x24], #0x8\n"
+ ".inst 0x25306989 // psel p9.s, p10.s/Z, p12.s[w12]\n"
+ ".inst 0x25306968 // psel p8.s, p10.s/Z, p11.s[w12]\n"
+ ".inst 0xa0164696 // ld1w { z22.s-z23.s }, pn9.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0xa016428a // ld1w { z10.s-z11.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
+ ".inst 0xc160e2d6 // bfcvt z22.h, { z22.s-z23.s }\n"
+ ".inst 0xc160e14a // bfcvt z10.h, { z10.s-z11.s }\n"
+ ".inst 0xc08002c8 // mova za2h.s[x12], p0/M, z22.s\n"
+ ".inst 0xc080014c // mova za3h.s[x12], p0/M, z10.s\n"
+ ".inst 0xc0828008 // mova z8.s, p0/M, za0v.s[x12]\n"
+ ".inst 0xc0828089 // mova z9.s, p0/M, za1v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x9\n"
+ ".inst 0xa06056e8 // st1w { z8.s-z9.s }, pn13.b, [x23]\n"
+ "addvl x23, x23, #2\n"
+ "blt 3b\n"
+ "incw x26, ALL, MUL #2\n"
+ "mov x20, %x[width]\n"
+ "mov x24, %x[in]\n"
+ "incw x22, ALL, MUL #2\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25b44752 // whilelt pn10.s, x26, x20, VLx2\n"
+ "add x21, x24, x9, LSL #3\n"
+ "4:" // Width loop: Even: Loop
+ "ldr x20, [x24], #0x8\n"
+ ".inst 0x25306989 // psel p9.s, p10.s/Z, p12.s[w12]\n"
+ ".inst 0x25306968 // psel p8.s, p10.s/Z, p11.s[w12]\n"
+ ".inst 0xa016469a // ld1w { z26.s-z27.s }, pn9.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0xa016429e // ld1w { z30.s-z31.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
+ ".inst 0xc160e35a // bfcvt z26.h, { z26.s-z27.s }\n"
+ ".inst 0xc160e3de // bfcvt z30.h, { z30.s-z31.s }\n"
+ ".inst 0xc0800340 // mova za0h.s[x12], p0/M, z26.s\n"
+ ".inst 0xc08003c4 // mova za1h.s[x12], p0/M, z30.s\n"
+ ".inst 0xc0828106 // mova z6.s, p0/M, za2v.s[x12]\n"
+ ".inst 0xc082818e // mova z14.s, p0/M, za3v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x9\n"
+ ".inst 0xa16056e6 // st1w { z6.s, z14.s }, pn13.b, [x23]\n"
+ "addvl x23, x23, #2\n"
+ "blt 4b\n"
+ "subs x28, x28, #0x1\n"
+ "incw x22, ALL, MUL #2\n"
+ "incw x26, ALL, MUL #2\n"
+ "bgt 2b\n"
+ "5:" // Width loop: Tails
+ "cbnz x25, 8f\n"
+ "mov x20, %x[width]\n"
+ "mov x24, %x[in]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25b44752 // whilelt pn10.s, x26, x20, VLx2\n"
+ "add x21, x24, x9, LSL #3\n"
+ "6:" // Width loop: Tails: Even: Odd: Loop
+ "ldr x20, [x24], #0x8\n"
+ ".inst 0x25306989 // psel p9.s, p10.s/Z, p12.s[w12]\n"
+ ".inst 0x25306968 // psel p8.s, p10.s/Z, p11.s[w12]\n"
+ ".inst 0xa016468c // ld1w { z12.s-z13.s }, pn9.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0xa016428e // ld1w { z14.s-z15.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
+ ".inst 0xc160e18c // bfcvt z12.h, { z12.s-z13.s }\n"
+ ".inst 0xc160e1ce // bfcvt z14.h, { z14.s-z15.s }\n"
+ ".inst 0xc0800188 // mova za2h.s[x12], p0/M, z12.s\n"
+ ".inst 0xc08001cc // mova za3h.s[x12], p0/M, z14.s\n"
+ ".inst 0xc0828007 // mova z7.s, p0/M, za0v.s[x12]\n"
+ ".inst 0xc082808f // mova z15.s, p0/M, za1v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x9\n"
+ ".inst 0xa16056e7 // st1w { z7.s, z15.s }, pn13.b, [x23]\n"
+ "addvl x23, x23, #2\n"
+ "blt 6b\n"
+ "mov x12, #0x0\n"
+ "7:" // Width loop: Tails: Even: Even: Loop
+ ".inst 0xc082810e // mova z14.s, p0/M, za2v.s[x12]\n"
+ ".inst 0xc082818f // mova z15.s, p0/M, za3v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x27\n"
+ ".inst 0xa06056ee // st1w { z14.s-z15.s }, pn13.b, [x23]\n"
+ "addvl x23, x23, #2\n"
+ "blt 7b\n"
+ "b 10f\n"
+ "8:" // Width loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "9:" // Width loop: Tails: Odd: Loop
+ ".inst 0xc0828014 // mova z20.s, p0/M, za0v.s[x12]\n"
+ ".inst 0xc0828095 // mova z21.s, p0/M, za1v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x27\n"
+ ".inst 0xa06056f4 // st1w { z20.s-z21.s }, pn13.b, [x23]\n"
+ "addvl x23, x23, #2\n"
+ "blt 9b\n"
+ "10:" // End
+ "mov %x[outptr_raw], x23\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [outptr_raw] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
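
The 2VL variant keeps two row blocks in flight, gating rows 0..VL-1 with p12 and rows VL..2VL-1 with p11 via WHILELT against the height. A scalar model of WHILELT as used for those row predicates (illustrative helper only; the architectural instruction produces the whole predicate in one step):

    #include <cstddef>

    // Lane i of the predicate is active iff (base + i) < limit.
    static void whilelt(bool *pred, size_t lanes, size_t base, size_t limit)
    {
        for (size_t i = 0; i < lanes; i++)
        {
            pred[i] = (base + i) < limit;
        }
    }
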
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp
new file mode 100644
index 0000000000..03575d7ff2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+template <>
+void interleave_block<4, 2, VLType::SME, false>(
+ bfloat16 * &out, const float * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x23, ALL, MUL #2\n"
+ "cntw x10\n"
+ "cntw x22, ALL, MUL #2\n"
+ "cntw x20, ALL, MUL #3\n"
+ "sub x21, x23, #0x1\n"
+ ".inst 0x25207817 // ptrue pn15.b\n"
+ "whilelt p1.s, XZR, %x[height]\n"
+ "whilelt p14.s, x10, %x[height]\n"
+ "whilelt p13.s, x22, %x[height]\n"
+ "whilelt p12.s, x20, %x[height]\n"
+ "sub x9, %x[width], #0x1\n"
+ "cntw x20, ALL, MUL #2\n"
+ "ands x28, %x[width], x21\n"
+ "mov x27, %x[in]\n"
+ "add x9, x9, x20\n"
+ "csel x28, x28, x23, NE\n"
+ "add x26, x27, x10, LSL #3\n"
+ "mov x25, #0x0\n"
+ "udiv x9, x9, x20\n"
+ "add x28, x28, #0x1\n"
+ "mov x20, %x[width]\n"
+ "add x24, x26, x10, LSL #3\n"
+ "ptrue p0.b\n"
+ "mov x23, %x[outptr_raw]\n"
+ "mov x22, %x[row_offset]\n"
+ "sub x9, x9, #0x1\n"
+ "lsr x28, x28, #0x1\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25b44733 // whilelt pn11.s, x25, x20, VLx2\n"
+ "add x21, x24, x10, LSL #3\n"
+ "1:" // Width loop: Preamble: Loop
+ "ldr x20, [x27], #0x8\n"
+ ".inst 0x25306c28 // psel p8.s, p11.s/Z, p1.s[w12]\n"
+ ".inst 0x25306dca // psel p10.s, p11.s/Z, p14.s[w12]\n"
+ ".inst 0xa0164298 // ld1w { z24.s-z25.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x26], #0x8\n"
+ ".inst 0x25306da9 // psel p9.s, p11.s/Z, p13.s[w12]\n"
+ ".inst 0x25306d88 // psel p8.s, p11.s/Z, p12.s[w12]\n"
+ ".inst 0xa0164a82 // ld1w { z2.s-z3.s }, pn10.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ ".inst 0xa016468a // ld1w { z10.s-z11.s }, pn9.s/Z, [x20, x22, LSL #2]\n"
+ ".inst 0xc160e318 // bfcvt z24.h, { z24.s-z25.s }\n"
+ ".inst 0xc160e042 // bfcvt z2.h, { z2.s-z3.s }\n"
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0xa016428c // ld1w { z12.s-z13.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
+ ".inst 0xc160e14a // bfcvt z10.h, { z10.s-z11.s }\n"
+ ".inst 0xc160e18c // bfcvt z12.h, { z12.s-z13.s }\n"
+ ".inst 0xc0800300 // mova za0h.s[x12], p0/M, z24.s\n"
+ ".inst 0xc0800044 // mova za1h.s[x12], p0/M, z2.s\n"
+ ".inst 0xc0800148 // mova za2h.s[x12], p0/M, z10.s\n"
+ ".inst 0xc080018c // mova za3h.s[x12], p0/M, z12.s\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "blt 1b\n"
+ "incw x22, ALL, MUL #2\n"
+ "incw x25, ALL, MUL #2\n"
+ "cbz x9, 5f\n"
+ "2:" // Width loop
+ "mov x12, #0x0\n"
+ "3:" // Width loop: Store: Loop
+ ".inst 0xc0828011 // mova z17.s, p0/M, za0v.s[x12]\n"
+ ".inst 0xc0828095 // mova z21.s, p0/M, za1v.s[x12]\n"
+ ".inst 0xc0828119 // mova z25.s, p0/M, za2v.s[x12]\n"
+ ".inst 0xc082819d // mova z29.s, p0/M, za3v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ ".inst 0xa160def1 // st1w { z17.s, z21.s, z25.s, z29.s }, pn15.b, [x23]\n"
+ "addvl x23, x23, #4\n"
+ "blt 3b\n"
+ "mov x27, %x[in]\n"
+ "add x26, x27, x10, LSL #3\n"
+ "mov x20, %x[width]\n"
+ "add x24, x26, x10, LSL #3\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25b44733 // whilelt pn11.s, x25, x20, VLx2\n"
+ "add x21, x24, x10, LSL #3\n"
+ "4:" // Width loop: Load: Loop
+ "ldr x20, [x27], #0x8\n"
+ ".inst 0x25306c28 // psel p8.s, p11.s/Z, p1.s[w12]\n"
+ ".inst 0x25306dca // psel p10.s, p11.s/Z, p14.s[w12]\n"
+ ".inst 0xa016428c // ld1w { z12.s-z13.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x26], #0x8\n"
+ ".inst 0x25306da9 // psel p9.s, p11.s/Z, p13.s[w12]\n"
+ ".inst 0x25306d88 // psel p8.s, p11.s/Z, p12.s[w12]\n"
+ ".inst 0xa0164a8e // ld1w { z14.s-z15.s }, pn10.s/Z, [x20, x22, LSL #2]\n"
+ "ldr x20, [x24], #0x8\n"
+ ".inst 0xa0164692 // ld1w { z18.s-z19.s }, pn9.s/Z, [x20, x22, LSL #2]\n"
+ ".inst 0xc160e18c // bfcvt z12.h, { z12.s-z13.s }\n"
+ ".inst 0xc160e1ce // bfcvt z14.h, { z14.s-z15.s }\n"
+ "ldr x20, [x21], #0x8\n"
+ ".inst 0xa016429e // ld1w { z30.s-z31.s }, pn8.s/Z, [x20, x22, LSL #2]\n"
+ ".inst 0xc160e252 // bfcvt z18.h, { z18.s-z19.s }\n"
+ ".inst 0xc160e3de // bfcvt z30.h, { z30.s-z31.s }\n"
+ ".inst 0xc0800180 // mova za0h.s[x12], p0/M, z12.s\n"
+ ".inst 0xc08001c4 // mova za1h.s[x12], p0/M, z14.s\n"
+ ".inst 0xc0800248 // mova za2h.s[x12], p0/M, z18.s\n"
+ ".inst 0xc08003cc // mova za3h.s[x12], p0/M, z30.s\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "blt 4b\n"
+ "subs x9, x9, #0x1\n"
+ "incw x22, ALL, MUL #2\n"
+ "incw x25, ALL, MUL #2\n"
+ "bgt 2b\n"
+ "5:" // Width loop: Tails
+ "mov x12, #0x0\n"
+ "6:" // Width loop: Tails: Loop
+ ".inst 0xc0828011 // mova z17.s, p0/M, za0v.s[x12]\n"
+ ".inst 0xc0828095 // mova z21.s, p0/M, za1v.s[x12]\n"
+ ".inst 0xc0828119 // mova z25.s, p0/M, za2v.s[x12]\n"
+ ".inst 0xc082819d // mova z29.s, p0/M, za3v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x28\n"
+ ".inst 0xa160def1 // st1w { z17.s, z21.s, z25.s, z29.s }, pn15.b, [x23]\n"
+ "addvl x23, x23, #4\n"
+ "blt 6b\n"
+ "7:" // End
+ "mov %x[outptr_raw], x23\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [outptr_raw] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp
new file mode 100644
index 0000000000..453778ae3f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<1, 1, VLType::SME, false>(
+ bfloat16 * &out, const bfloat16 * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x21, %x[width]\n"
+ "inch x21\n"
+ "cnth x11\n"
+ "sub x21, x21, #0x1\n"
+ "udiv x21, x21, x11\n" // n_passes = ceildiv(width, VL<T>)
+ "mov x20, %x[width]\n"
+ "sub x10, x11, #0x1\n"
+ "sub x9, x21, #0x1\n"
+ "ands x10, x20, x10\n"
+ "sub x28, x11, #0x2\n"
+ "lsl x20, %x[height], #0x1\n" // height * 2
+ "mov x27, #0x0\n"
+ "mov x26, %x[in]\n"
+ "lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "ldr x25, [x26, #0x0]\n"
+ "and x24, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "csel x10, x10, x11, NE\n"
+ "ldr x23, [x26, #0x8]\n"
+ "ptrue p11.h\n"
+ "whilelt p10.h, XZR, x20\n"
+ "mov x22, %x[row_offset]\n"
+ "mov x21, %x[out]\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "add x26, x26, #0x10\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 2f\n"
+ "1:" // K loop: Charge: Loop
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25386140 // psel p0.h, p8.h/Z, p10.h[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0xe05602e1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x23, x22, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "blt 1b\n"
+ "2:" // K loop: Charge: End
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25386140 // psel p0.h, p8.h/Z, p10.h[w12, #1]\n"
+ "mov x26, %x[in]\n"
+ ".inst 0xe05602e1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x23, x22, LSL #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "inch x22\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "inch x27\n"
+ "cbz x9, 8f\n"
+ "mov x20, x9\n"
+ "3:" // K loop: Main loop
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 5f\n"
+ "4:" // K loop: Main loop: First: Loop
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ ".inst 0xe06b82a1 // st1h { za0v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "blt 4b\n"
+ "5:" // K loop: Main loop: First: Tail
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "inch x27\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe06b82a1 // st1h { za0v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
+ "addvl x21, x21, #2\n"
+ "inch x22\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 7f\n"
+ "6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ ".inst 0xe06b82a9 // st1h { za1v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "blt 6b\n"
+ "7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe06b82a9 // st1h { za1v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
+ "addvl x21, x21, #2\n"
+ "inch x27\n"
+ "inch x22\n"
+ "bgt 3b\n"
+ "8:" // K loop: Tails
+ "cbnz x24, 11f\n"
+ "mov x26, %x[in]\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ "9:" // K loop: Tails: Even: First
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ "ldr x20, [x26, #0x0]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560288 // ld1h { za1h.h[x12] }, p0/Z, [x20, x22, LSL #1]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x11\n"
+ "add x26, x26, #0x8\n"
+ "addvl x21, x21, #1\n"
+ "blt 9b\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: Second
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
+ "blt 10b\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "b 13f\n"
+ "11:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "12:" // K loop: Tails: Odd: Loop
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
+ "blt 12b\n"
+ "13:" // K loop: End
+ "mov %x[out], x21\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
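
The scalar comments embedded in this kernel spell out its K-loop bookkeeping: the width is covered in ceildiv(width, VL) passes, the main loop retires two passes per iteration, and a flag records whether a single odd pass remains. The same bookkeeping as plain C++ (vl standing in for the runtime CNTH value; names are illustrative):

    #include <cstddef>

    struct KLoopCounts
    {
        size_t n_passes; // ceildiv(width, VL<T>)
        size_t n_loops;  // (n_passes - 1) / 2
        bool   odd_tail; // bool(n_passes & 0x1)
    };

    static KLoopCounts k_loop_counts(size_t width, size_t vl)
    {
        KLoopCounts c;
        c.n_passes = (width + vl - 1) / vl;
        c.n_loops  = (c.n_passes - 1) / 2;
        c.odd_tail = (c.n_passes & 0x1) != 0;
        return c;
    }
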
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp
new file mode 100644
index 0000000000..98bdcd2fa2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<1, 2, VLType::SME, false>(
+ bfloat16 * &out, const bfloat16 * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cnth x22\n"
+ "mov x21, %x[width]\n"
+ "inch x21\n"
+ "mov x20, %x[width]\n"
+ "sub x11, x22, #0x1\n"
+ "sub x21, x21, #0x1\n"
+ "ands x11, x20, x11\n"
+ "cntw x10\n"
+ "udiv x21, x21, x22\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x11, x11, x22, NE\n"
+ "sub x9, x21, #0x1\n"
+ "add x11, x11, #0x1\n"
+ "sub x28, x10, #0x2\n"
+ "lsl x20, %x[height], #0x1\n" // height * 2
+ "mov x27, #0x0\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "and x24, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "ldr x23, [x26, #0x8]\n"
+ "lsr x11, x11, #0x1\n"
+ "ptrue p11.s\n"
+ "whilelt p10.h, XZR, x20\n"
+ "mov x22, %x[row_offset]\n"
+ "mov x21, %x[out]\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "add x26, x26, #0x10\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 2f\n"
+ "1:" // K loop: Charge: Loop
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25686140 // psel p0.h, p8.h/Z, p10.h[w12, #2]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0xe05602e2 // ld1h { za0h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x28, LSL #1\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "blt 1b\n"
+ "2:" // K loop: Charge: End
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25686140 // psel p0.h, p8.h/Z, p10.h[w12, #2]\n"
+ "mov x26, %x[in]\n"
+ ".inst 0xe05602e2 // ld1h { za0h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "inch x22\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "inch x27\n"
+ "cbz x9, 8f\n"
+ "mov x20, x9\n"
+ "3:" // K loop: Main loop
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 5f\n"
+ "4:" // K loop: Main loop: First: Loop
+ ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n"
+ ".inst 0xe0562321 // ld1h { za0h.h[x13, #1] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25796141 // psel p1.h, p8.h/Z, p10.h[w13, #3]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe05626e3 // ld1h { za0h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0aa82a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "add x13, x13, #0x4\n"
+ "blt 4b\n"
+ "5:" // K loop: Main loop: First: Tail
+ ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n"
+ ".inst 0xe0562321 // ld1h { za0h.h[x13, #1] }, p0/Z, [x25, x22, LSL #1]\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25796141 // psel p1.h, p8.h/Z, p10.h[w13, #3]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe05626e3 // ld1h { za0h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "inch x27\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe0aa82a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "addvl x21, x21, #2\n"
+ "inch x22\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 7f\n"
+ "6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25296140 // psel p0.h, p8.h/Z, p10.h[w13]\n"
+ ".inst 0xe0562320 // ld1h { za0h.h[x13] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25696141 // psel p1.h, p8.h/Z, p10.h[w13, #2]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe05626e2 // ld1h { za0h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "add x13, x13, #0x4\n"
+ "blt 6b\n"
+ "7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25296140 // psel p0.h, p8.h/Z, p10.h[w13]\n"
+ ".inst 0xe0562320 // ld1h { za0h.h[x13] }, p0/Z, [x25, x22, LSL #1]\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25696141 // psel p1.h, p8.h/Z, p10.h[w13, #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe05626e2 // ld1h { za0h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "addvl x21, x21, #2\n"
+ "inch x27\n"
+ "inch x22\n"
+ "bgt 3b\n"
+ "8:" // K loop: Tails
+ "cbnz x24, 11f\n"
+ "mov x26, %x[in]\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "9:" // K loop: Tails: Even: First
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "ldr x20, [x26, #0x0]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n"
+ "cmp x12, x10\n"
+ ".inst 0xe0562281 // ld1h { za0h.h[x13, #1] }, p0/Z, [x20, x22, LSL #1]\n"
+ "add x26, x26, #0x8\n"
+ "addvl x21, x21, #1\n"
+ "add x13, x13, #0x2\n"
+ "blt 9b\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x20, #0x0\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: Second
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x11\n"
+ "addvl x21, x21, #1\n"
+ "add x20, x20, #0x2\n"
+ "blt 10b\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "b 13f\n"
+ "11:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "12:" // K loop: Tails: Odd: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x11\n"
+ "addvl x21, x21, #1\n"
+ "blt 12b\n"
+ "13:" // K loop: End
+ "mov %x[out], x21\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
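
The "block2" variants keep pairs of adjacent K-elements of each row together in the interleaved output, which is why this kernel loads half-word elements but stores word elements: each 32-bit store lane carries one bf16 pair. Assuming the usual arm_gemm block-interleave layout, the addressing for one strip of rows would look like the sketch below (an inference from the kernel's shape, not a statement of the library's documented contract):

    #include <cstddef>

    // Output offset of in[row][k] within one interleaved strip of
    // strip_height rows, for block size 2.
    static size_t block2_out_index(size_t k, size_t row, size_t strip_height)
    {
        return (k / 2) * strip_height * 2 + row * 2 + (k % 2);
    }
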
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_fp16_fp16.hpp
new file mode 100644
index 0000000000..30c3e42aed
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_fp16_fp16.hpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_SVE)
+
+template <>
+void interleave_block<1, 2, VLType::SME, false>(
+ __fp16 * &out, const __fp16 * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x22, %x[width]\n"
+ "mov x21, %x[width]\n"
+ "cnth x20\n"
+ "inch x22\n"
+ "sub x11, x20, #0x1\n"
+ "sub x22, x22, #0x1\n"
+ "ands x11, x21, x11\n"
+ "cntw x10\n"
+ "udiv x22, x22, x20\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x11, x11, x20, NE\n"
+ "sub x9, x22, #0x1\n"
+ "add x11, x11, #0x1\n"
+ "sub x28, x10, #0x2\n"
+ "lsl x20, %x[height], #0x1\n" // height * 2
+ "mov x27, #0x0\n"
+ "mov x26, %x[in]\n"
+ "lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "and x25, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "ldr x24, [x26, #0x0]\n"
+ "lsr x11, x11, #0x1\n"
+ "ptrue p11.s\n"
+ "ldr x23, [x26, #0x8]\n"
+ "whilelt p10.h, XZR, x20\n"
+ "mov x22, %x[row_offset]\n"
+ "mov x21, %x[out]\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "add x26, x26, #0x10\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 2f\n"
+ "1:" // K loop: Charge: Loop
+ ".inst 0x25286143 // psel p3.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0x25686142 // psel p2.h, p8.h/Z, p10.h[w12, #2]\n"
+ ".inst 0xe0560f00 // ld1h { za0h.h[x12] }, p3/Z, [x24, x22, LSL #1]\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe0560ae2 // ld1h { za0h.h[x12, #2] }, p2/Z, [x23, x22, LSL #1]\n"
+ "add x12, x12, #0x4\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "cmp x12, x28, LSL #1\n"
+ "blt 1b\n"
+ "2:" // K loop: Charge: End
+ ".inst 0x25286141 // psel p1.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0x25686140 // psel p0.h, p8.h/Z, p10.h[w12, #2]\n"
+ "mov x26, %x[in]\n"
+ "inch x27\n"
+ ".inst 0xe0560700 // ld1h { za0h.h[x12] }, p1/Z, [x24, x22, LSL #1]\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe05602e2 // ld1h { za0h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "inch x22\n"
+ "cbz x9, 8f\n"
+ "mov x20, x9\n"
+ "3:" // K loop: Main loop
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x15, #0x0\n"
+ "mov x14, #0x0\n"
+ "cbz x28, 5f\n"
+ "4:" // K loop: Main loop: First: Loop
+ ".inst 0x253b6143 // psel p3.h, p8.h/Z, p10.h[w15, #1]\n"
+ ".inst 0x257b6142 // psel p2.h, p8.h/Z, p10.h[w15, #3]\n"
+ ".inst 0x252a6d21 // psel p1.h, p11.h/Z, p9.h[w14]\n"
+ ".inst 0x253a6d20 // psel p0.h, p11.h/Z, p9.h[w14, #1]\n"
+ ".inst 0xe0566f01 // ld1h { za0h.h[x15, #1] }, p3/Z, [x24, x22, LSL #1]\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe0566ae3 // ld1h { za0h.h[x15, #3] }, p2/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "add x15, x15, #0x4\n"
+ ".inst 0xe0bfc6a0 // st1w { za0v.s[x14] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0aac2a1 // st1w { za0v.s[x14, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "add x14, x14, #0x2\n"
+ "addvl x21, x21, #2\n"
+ "cmp x14, x28\n"
+ "blt 4b\n"
+ "5:" // K loop: Main loop: First: Tail
+ ".inst 0x253b6143 // psel p3.h, p8.h/Z, p10.h[w15, #1]\n"
+ ".inst 0x257b6142 // psel p2.h, p8.h/Z, p10.h[w15, #3]\n"
+ ".inst 0x252a6d21 // psel p1.h, p11.h/Z, p9.h[w14]\n"
+ ".inst 0x253a6d20 // psel p0.h, p11.h/Z, p9.h[w14, #1]\n"
+ "mov x26, %x[in]\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ ".inst 0xe0566f01 // ld1h { za0h.h[x15, #1] }, p3/Z, [x24, x22, LSL #1]\n"
+ "ldr x24, [x26, #0x0]\n"
+ "inch x27\n"
+ "mov x13, #0x0\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xe0566ae3 // ld1h { za0h.h[x15, #3] }, p2/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "inch x22\n"
+ ".inst 0xe0bfc6a0 // st1w { za0v.s[x14] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0aac2a1 // st1w { za0v.s[x14, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "addvl x21, x21, #2\n"
+ "cbz x28, 7f\n"
+ "6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25296143 // psel p3.h, p8.h/Z, p10.h[w13]\n"
+ ".inst 0x25696142 // psel p2.h, p8.h/Z, p10.h[w13, #2]\n"
+ ".inst 0x25286d21 // psel p1.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ ".inst 0xe0562f00 // ld1h { za0h.h[x13] }, p3/Z, [x24, x22, LSL #1]\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe0562ae2 // ld1h { za0h.h[x13, #2] }, p2/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "add x13, x13, #0x4\n"
+ ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "addvl x21, x21, #2\n"
+ "cmp x12, x28\n"
+ "blt 6b\n"
+ "7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25296143 // psel p3.h, p8.h/Z, p10.h[w13]\n"
+ ".inst 0x25696142 // psel p2.h, p8.h/Z, p10.h[w13, #2]\n"
+ ".inst 0x25286d21 // psel p1.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ "mov x26, %x[in]\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ ".inst 0xe0562f00 // ld1h { za0h.h[x13] }, p3/Z, [x24, x22, LSL #1]\n"
+ "ldr x24, [x26, #0x0]\n"
+ "subs x20, x20, #0x1\n"
+ "inch x27\n"
+ ".inst 0xe0562ae2 // ld1h { za0h.h[x13, #2] }, p2/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "inch x22\n"
+ ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "addvl x21, x21, #2\n"
+ "bgt 3b\n"
+ "8:" // K loop: Tails
+ "cbnz x25, 11f\n"
+ "mov x26, %x[in]\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "9:" // K loop: Tails: Even: First
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25396143 // psel p3.h, p8.h/Z, p10.h[w13, #1]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "ldr x24, [x26, #0x0]\n"
+ "add x12, x12, #0x1\n"
+ "add x26, x26, #0x8\n"
+ "cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
+ ".inst 0xe0562f01 // ld1h { za0h.h[x13, #1] }, p3/Z, [x24, x22, LSL #1]\n"
+ "add x13, x13, #0x2\n"
+ "blt 9b\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x20, #0x0\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: Second
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ "add x20, x20, #0x2\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "addvl x21, x21, #1\n"
+ "cmp x12, x11\n"
+ "blt 10b\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "b 13f\n"
+ "11:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "12:" // K loop: Tails: Odd: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x11\n"
+ "addvl x21, x21, #1\n"
+ "blt 12b\n"
+ "13:" // K loop: End
+ "mov %x[out], x21\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp
new file mode 100644
index 0000000000..4390bb7c7f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<1, 4, VLType::SME, false>(
+ int8_t * &out, const int8_t * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
+ "mov x23, %x[width]\n"
+ "incb x23\n"
+ "mov x20, %x[width]\n"
+ "sub x10, x21, #0x1\n"
+ "cntw x9\n"
+ "sub x23, x23, #0x1\n"
+ "ands x10, x20, x10\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x10, x10, x21, NE\n"
+ "lsl x22, %x[height], #0x1\n" // height * 2
+ "lsl x21, x9, #0x1\n"
+ "sub x20, x23, #0x1\n"
+ "add x10, x10, #0x3\n"
+ "sub x28, x9, #0x2\n"
+ "whilelt p9.b, XZR, x22\n"
+ "whilelt p8.b, x21, x22\n"
+ "mov x27, #0x0\n"
+ "mov x26, %x[in]\n"
+ "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "ldr x25, [x26, #0x0]\n"
+ "and x24, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "lsr x10, x10, #0x2\n"
+ "ldr x23, [x26, #0x8]\n"
+ "ptrue p11.s\n"
+ "zip1 p10.b, p9.b, p8.b\n"
+ "mov x22, %x[row_offset]\n"
+ "mov x21, %x[out]\n"
+ "whilelt p9.b, x27, %x[width]\n"
+ "whilelt p8.b, x27, %x[width]\n"
+ "add x26, x26, #0x10\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 2f\n"
+ "1:" // K loop: Charge: Loop
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x22]\n"
+ ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0xe01602e4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x23, x22]\n"
+ "add x12, x12, #0x8\n"
+ "cmp x12, x28, LSL #2\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "blt 1b\n"
+ "2:" // K loop: Charge: End
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x22]\n"
+ ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
+ "mov x26, %x[in]\n"
+ ".inst 0xe01602e4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x23, x22]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "incb x22\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "incb x27\n"
+ "cbz x20, 8f\n"
+ "mov x20, x20\n"
+ "3:" // K loop: Main loop
+ "whilelt p8.b, x27, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 5f\n"
+ "4:" // K loop: Main loop: First: Loop
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n"
+ ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x23, x22]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0a982a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "add x13, x13, #0x8\n"
+ "blt 4b\n"
+ "5:" // K loop: Main loop: First: Tail
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x23, x22]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "whilelt p9.b, x27, %x[width]\n"
+ "incb x27\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe0a982a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
+ "addvl x21, x21, #2\n"
+ "incb x22\n"
+ "whilelt p8.b, x27, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 7f\n"
+ "6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162320 // ld1b { za0h.b[x13] }, p0/Z, [x25, x22]\n"
+ ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x23, x22]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0a982a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "add x13, x13, #0x8\n"
+ "blt 6b\n"
+ "7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162320 // ld1b { za0h.b[x13] }, p0/Z, [x25, x22]\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x23, x22]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "whilelt p9.b, x27, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe0a982a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
+ "addvl x21, x21, #2\n"
+ "incb x27\n"
+ "incb x22\n"
+ "bgt 3b\n"
+ "8:" // K loop: Tails
+ "cbnz x24, 11f\n"
+ "mov x26, %x[in]\n"
+ "whilelt p8.b, x27, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "9:" // K loop: Tails: Even: First
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "ldr x20, [x26, #0x0]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ "cmp x12, x9\n"
+ ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
+ "add x26, x26, #0x8\n"
+ "addvl x21, x21, #1\n"
+ "add x13, x13, #0x4\n"
+ "blt 9b\n"
+ "whilelt p9.b, x27, %x[width]\n"
+ "whilelt p8.b, x27, %x[width]\n"
+ "mov x20, #0x0\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: Second
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
+ "add x20, x20, #0x4\n"
+ "blt 10b\n"
+ "whilelt p8.b, x27, %x[width]\n"
+ "b 13f\n"
+ "11:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "12:" // K loop: Tails: Odd: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
+ "blt 12b\n"
+ "13:" // K loop: End
+ "mov %x[out], x21\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
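
The register comments threaded through these kernels (n_passes = ceildiv(width, VL<T>), n_loops = (n_passes - 1) / 2, odd_tail = bool(n_passes & 0x1)) encode the double-buffered K loop: each main-loop iteration drains one ZA quarter-tile ("First", za0) while charging the other ("Second", za2), the "Tails: Even" labels finish the last pair of passes, and "Tails: Odd" handles a single leftover pass. A minimal sketch of that bookkeeping in plain C++ (hypothetical names, not part of the kernel's interface; assumes width >= 1, with vl standing for the per-pass column count VL<T>):

#include <cstddef>

struct KLoopCounts {
    size_t n_passes; // total column passes over the input
    size_t n_loops;  // full First+Second double-buffered iterations
    bool   odd_tail; // one unpaired pass remains after the main loop
};

inline KLoopCounts k_loop_counts(size_t width, size_t vl) {
    KLoopCounts c;
    c.n_passes = (width + vl - 1) / vl; // ceildiv(width, VL<T>)
    c.n_loops  = (c.n_passes - 1) / 2;  // pairs handled by "K loop: Main loop"
    c.odd_tail = (c.n_passes & 1) != 0; // bool(n_passes & 0x1)
    return c;
}
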
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp
new file mode 100644
index 0000000000..f5ee261964
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<1, 4, VLType::SME, true>(
+ int8_t * &out, const int8_t * const *in,
+ size_t width, size_t height, size_t row_offset, bool first
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
+ "mov x23, %x[width]\n"
+ "mov z18.b, #0x1\n"
+ "incb x23\n"
+ "mov x20, %x[width]\n"
+ "mov z17.s, #0x0\n"
+ "sub x10, x21, #0x1\n"
+ "cntw x9\n"
+ "sub x23, x23, #0x1\n"
+ "ands x10, x20, x10\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x10, x10, x21, NE\n"
+ "lsl x22, %x[height], #0x1\n" // height * 2
+ "lsl x21, x9, #0x1\n"
+ "sub x20, x23, #0x1\n"
+ "add x10, x10, #0x3\n"
+ "whilelt p9.b, XZR, x22\n"
+ "whilelt p8.b, x21, x22\n"
+ "mov x28, #0x0\n"
+ "ptrue p2.b\n"
+ "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "and x27, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "lsr x10, x10, #0x2\n"
+ "sub x26, x9, #0x2\n"
+ "ptrue p11.s\n"
+ "zip1 p10.b, p9.b, p8.b\n"
+ "mov x25, %x[row_offset]\n"
+ "mov x24, %x[out]\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "cbnz %x[first], 1f\n"
+ "addvl x24, x24, #-1\n"
+ "ld1w { z17.s }, p2/Z, [x24]\n"
+ "1:" // K loop: Load row sums: End
+ "mov x23, %x[in]\n"
+ "ldr x22, [x23, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldr x21, [x23, #0x8]\n"
+ "add x23, x23, #0x10\n"
+ "cbz x26, 3f\n"
+ "2:" // K loop: Charge: Loop
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01902c0 // ld1b { za0h.b[x12] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
+ "ldr x22, [x23, #0x0]\n"
+ ".inst 0xe01902a4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x21, x25]\n"
+ "add x12, x12, #0x8\n"
+ "cmp x12, x26, LSL #2\n"
+ "ldr x21, [x23, #0x8]\n"
+ "add x23, x23, #0x10\n"
+ "blt 2b\n"
+ "3:" // K loop: Charge: End
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01902c0 // ld1b { za0h.b[x12] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
+ "mov x23, %x[in]\n"
+ ".inst 0xe01902a4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x21, x25]\n"
+ "ldr x22, [x23, #0x0]\n"
+ "incb x25\n"
+ "ldr x21, [x23, #0x8]\n"
+ "add x23, x23, #0x10\n"
+ "incb x28\n"
+ "cbz x20, 9f\n"
+ "mov x20, x20\n"
+ "4:" // K loop: Main loop
+ "whilelt p8.b, x28, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x26, 6f\n"
+ "5:" // K loop: Main loop: First: Loop
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ "ldr x22, [x23, #0x0]\n"
+ ".inst 0xe01922a6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ "ldr x21, [x23, #0x8]\n"
+ ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828830 // mova z16.s, p2/M, za0v.s[x12, #1]\n"
+ ".inst 0xe0a98301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x26\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ "add x23, x23, #0x10\n"
+ "addvl x24, x24, #2\n"
+ "add x13, x13, #0x8\n"
+ "blt 5b\n"
+ "6:" // K loop: Main loop: First: Tail
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0xe01922a6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ "mov x23, %x[in]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x22, [x23, #0x0]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828830 // mova z16.s, p2/M, za0v.s[x12, #1]\n"
+ "ldr x21, [x23, #0x8]\n"
+ ".inst 0xe0bf8700 // st1w { za0v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "incb x28\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xe0a98301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #2\n"
+ "incb x25\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x26, 8f\n"
+ "7:" // K loop: Main loop: Second: Loop
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ "ldr x22, [x23, #0x0]\n"
+ ".inst 0xe01922a4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ "ldr x21, [x23, #0x8]\n"
+ ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828930 // mova z16.s, p2/M, za2v.s[x12, #1]\n"
+ ".inst 0xe0a98309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x26\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ "add x23, x23, #0x10\n"
+ "addvl x24, x24, #2\n"
+ "add x13, x13, #0x8\n"
+ "blt 7b\n"
+ "8:" // K loop: Main loop: Second: Tail
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0xe01922a4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ "mov x23, %x[in]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x22, [x23, #0x0]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828930 // mova z16.s, p2/M, za2v.s[x12, #1]\n"
+ "ldr x21, [x23, #0x8]\n"
+ ".inst 0xe0bf8708 // st1w { za2v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xe0a98309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #2\n"
+ "incb x28\n"
+ "incb x25\n"
+ "bgt 4b\n"
+ "9:" // K loop: Tails
+ "cbnz x27, 12f\n"
+ "mov x23, %x[in]\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: First
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ "ldr x20, [x23, #0x0]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ ".inst 0xe0192282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x25]\n"
+ "cmp x12, x9\n"
+ "add x23, x23, #0x8\n"
+ "addvl x24, x24, #1\n"
+ "add x13, x13, #0x4\n"
+ "blt 10b\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "mov x20, #0x0\n"
+ "mov x12, #0x0\n"
+ "11:" // K loop: Tails: Even: Second
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #1\n"
+ "add x20, x20, #0x4\n"
+ "blt 11b\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "b 14f\n"
+ "12:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "13:" // K loop: Tails: Odd: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "sdot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #1\n"
+ "blt 13b\n"
+ "14:" // K loop: End
+ "st1w { z17.s }, p2, [x24]\n"
+ "addvl x24, x24, #1\n"
+ "mov %x[out], x24\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
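
The _summing specializations differ from the plain interleaves only in the z17/z18 traffic visible above: every vertical slice moved out of ZA is also multiplied against an all-ones vector (z18) with sdot (udot in the unsigned variants), accumulating per-row column sums in z17, and the sums vector is stored immediately after the packed panel. When first is false the kernel rewinds out by one vector and reloads the sums the previous call wrote there, so accumulation spans calls. A hedged scalar model of that epilogue (illustrative names; vls stands for the number of 32-bit lanes per vector):

#include <cstdint>
#include <cstring>
#include <vector>

// Scalar model of the row-sum bookkeeping, assuming the block4 packed
// layout in which packed value i belongs to row (i / 4) % vls.
void pack_with_row_sums(int8_t *&out, const int8_t *packed, size_t packed_len,
                        size_t vls, bool first) {
    std::vector<int32_t> sums(vls, 0);
    if (!first) {
        // The previous call left its sums vector immediately before 'out';
        // reload it and let the new panel overwrite that slot.
        out -= vls * sizeof(int32_t);
        std::memcpy(sums.data(), out, vls * sizeof(int32_t));
    }
    for (size_t i = 0; i < packed_len; i++) {
        sums[(i / 4) % vls] += packed[i]; // what sdot against all-ones computes
        *out++ = packed[i];
    }
    std::memcpy(out, sums.data(), vls * sizeof(int32_t)); // trailing sums
    out += vls * sizeof(int32_t);
}
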
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp
new file mode 100644
index 0000000000..76c1d053cd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<1, 4, VLType::SME, false>(
+ uint8_t * &out, const uint8_t * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
+ "mov x23, %x[width]\n"
+ "incb x23\n"
+ "mov x20, %x[width]\n"
+ "sub x10, x21, #0x1\n"
+ "cntw x9\n"
+ "sub x23, x23, #0x1\n"
+ "ands x10, x20, x10\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x10, x10, x21, NE\n"
+ "lsl x22, %x[height], #0x1\n" // height * 2
+ "lsl x21, x9, #0x1\n"
+ "sub x20, x23, #0x1\n"
+ "add x10, x10, #0x3\n"
+ "sub x28, x9, #0x2\n"
+ "whilelt p9.b, XZR, x22\n"
+ "whilelt p8.b, x21, x22\n"
+ "mov x27, #0x0\n"
+ "mov x26, %x[in]\n"
+ "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "ldr x25, [x26, #0x0]\n"
+ "and x24, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "lsr x10, x10, #0x2\n"
+ "ldr x23, [x26, #0x8]\n"
+ "ptrue p11.s\n"
+ "zip1 p10.b, p9.b, p8.b\n"
+ "mov x22, %x[row_offset]\n"
+ "mov x21, %x[out]\n"
+ "whilelt p9.b, x27, %x[width]\n"
+ "whilelt p8.b, x27, %x[width]\n"
+ "add x26, x26, #0x10\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 2f\n"
+ "1:" // K loop: Charge: Loop
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x22]\n"
+ ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0xe01602e4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x23, x22]\n"
+ "add x12, x12, #0x8\n"
+ "cmp x12, x28, LSL #2\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "blt 1b\n"
+ "2:" // K loop: Charge: End
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x22]\n"
+ ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
+ "mov x26, %x[in]\n"
+ ".inst 0xe01602e4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x23, x22]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "incb x22\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "incb x27\n"
+ "cbz x20, 8f\n"
+ "mov x20, x20\n"
+ "3:" // K loop: Main loop
+ "whilelt p8.b, x27, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 5f\n"
+ "4:" // K loop: Main loop: First: Loop
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n"
+ ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x23, x22]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0a982a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "add x13, x13, #0x8\n"
+ "blt 4b\n"
+ "5:" // K loop: Main loop: First: Tail
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x23, x22]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "whilelt p9.b, x27, %x[width]\n"
+ "incb x27\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe0a982a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
+ "addvl x21, x21, #2\n"
+ "incb x22\n"
+ "whilelt p8.b, x27, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 7f\n"
+ "6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162320 // ld1b { za0h.b[x13] }, p0/Z, [x25, x22]\n"
+ ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x23, x22]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0a982a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "add x13, x13, #0x8\n"
+ "blt 6b\n"
+ "7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162320 // ld1b { za0h.b[x13] }, p0/Z, [x25, x22]\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x23, x22]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "whilelt p9.b, x27, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe0a982a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n"
+ "addvl x21, x21, #2\n"
+ "incb x27\n"
+ "incb x22\n"
+ "bgt 3b\n"
+ "8:" // K loop: Tails
+ "cbnz x24, 11f\n"
+ "mov x26, %x[in]\n"
+ "whilelt p8.b, x27, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "9:" // K loop: Tails: Even: First
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "ldr x20, [x26, #0x0]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ "cmp x12, x9\n"
+ ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
+ "add x26, x26, #0x8\n"
+ "addvl x21, x21, #1\n"
+ "add x13, x13, #0x4\n"
+ "blt 9b\n"
+ "whilelt p9.b, x27, %x[width]\n"
+ "whilelt p8.b, x27, %x[width]\n"
+ "mov x20, #0x0\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: Second
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
+ "add x20, x20, #0x4\n"
+ "blt 10b\n"
+ "whilelt p8.b, x27, %x[width]\n"
+ "b 13f\n"
+ "11:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "12:" // K loop: Tails: Odd: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
+ "blt 12b\n"
+ "13:" // K loop: End
+ "mov %x[out], x21\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
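
All of the block4 8-bit variants produce the same packed layout: rows are charged into ZA through horizontal byte slices, then drained through vertical 32-bit slices, so each st1w emits, for every row in the tile, one group of four consecutive columns. A scalar reference of the result (hypothetical signature; vls = 32-bit lanes per vector, with rows and tail groups zero-padded):

#include <cstddef>
#include <cstdint>

// Scalar reference: out[g][r][b] = in[r][row_offset + 4*g + b],
// zero-padded past 'height' rows and 'width' columns. Illustrative only.
void interleave1vl_block4_ref(uint8_t *&out, const uint8_t *const *in,
                              size_t width, size_t height,
                              size_t row_offset, size_t vls) {
    const size_t n_groups = (width + 3) / 4; // column groups of 4
    for (size_t g = 0; g < n_groups; g++) {
        for (size_t r = 0; r < vls; r++) {   // one ZA tile of rows
            for (size_t b = 0; b < 4; b++) { // the 4-byte block
                const size_t k = 4 * g + b;
                *out++ = (r < height && k < width) ? in[r][row_offset + k] : 0;
            }
        }
    }
}

Keeping four consecutive k-values of one row contiguous is what lets the consuming GEMM kernel feed the packed panel straight into the 8-bit dot-product instructions.
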
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp
new file mode 100644
index 0000000000..daf2d3a100
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<1, 4, VLType::SME, true>(
+ uint8_t * &out, const uint8_t * const *in,
+ size_t width, size_t height, size_t row_offset, bool first
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
+ "mov x23, %x[width]\n"
+ "mov z18.b, #0x1\n"
+ "incb x23\n"
+ "mov x20, %x[width]\n"
+ "mov z17.s, #0x0\n"
+ "sub x10, x21, #0x1\n"
+ "cntw x9\n"
+ "sub x23, x23, #0x1\n"
+ "ands x10, x20, x10\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x10, x10, x21, NE\n"
+ "lsl x22, %x[height], #0x1\n" // height * 2
+ "lsl x21, x9, #0x1\n"
+ "sub x20, x23, #0x1\n"
+ "add x10, x10, #0x3\n"
+ "whilelt p9.b, XZR, x22\n"
+ "whilelt p8.b, x21, x22\n"
+ "mov x28, #0x0\n"
+ "ptrue p2.b\n"
+ "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "and x27, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "lsr x10, x10, #0x2\n"
+ "sub x26, x9, #0x2\n"
+ "ptrue p11.s\n"
+ "zip1 p10.b, p9.b, p8.b\n"
+ "mov x25, %x[row_offset]\n"
+ "mov x24, %x[out]\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "cbnz %x[first], 1f\n"
+ "addvl x24, x24, #-1\n"
+ "ld1w { z17.s }, p2/Z, [x24]\n"
+ "1:" // K loop: Load row sums: End
+ "mov x23, %x[in]\n"
+ "ldr x22, [x23, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldr x21, [x23, #0x8]\n"
+ "add x23, x23, #0x10\n"
+ "cbz x26, 3f\n"
+ "2:" // K loop: Charge: Loop
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01902c0 // ld1b { za0h.b[x12] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
+ "ldr x22, [x23, #0x0]\n"
+ ".inst 0xe01902a4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x21, x25]\n"
+ "add x12, x12, #0x8\n"
+ "cmp x12, x26, LSL #2\n"
+ "ldr x21, [x23, #0x8]\n"
+ "add x23, x23, #0x10\n"
+ "blt 2b\n"
+ "3:" // K loop: Charge: End
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01902c0 // ld1b { za0h.b[x12] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n"
+ "mov x23, %x[in]\n"
+ ".inst 0xe01902a4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x21, x25]\n"
+ "ldr x22, [x23, #0x0]\n"
+ "incb x25\n"
+ "ldr x21, [x23, #0x8]\n"
+ "add x23, x23, #0x10\n"
+ "incb x28\n"
+ "cbz x20, 9f\n"
+ "mov x20, x20\n"
+ "4:" // K loop: Main loop
+ "whilelt p8.b, x28, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x26, 6f\n"
+ "5:" // K loop: Main loop: First: Loop
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ "ldr x22, [x23, #0x0]\n"
+ ".inst 0xe01922a6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ "udot z17.s, z16.b, z18.b\n"
+ "ldr x21, [x23, #0x8]\n"
+ ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828830 // mova z16.s, p2/M, za0v.s[x12, #1]\n"
+ ".inst 0xe0a98301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x26\n"
+ "udot z17.s, z16.b, z18.b\n"
+ "add x23, x23, #0x10\n"
+ "addvl x24, x24, #2\n"
+ "add x13, x13, #0x8\n"
+ "blt 5b\n"
+ "6:" // K loop: Main loop: First: Tail
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0xe01922a6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ "udot z17.s, z16.b, z18.b\n"
+ "mov x23, %x[in]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x22, [x23, #0x0]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828830 // mova z16.s, p2/M, za0v.s[x12, #1]\n"
+ "ldr x21, [x23, #0x8]\n"
+ ".inst 0xe0bf8700 // st1w { za0v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "incb x28\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xe0a98301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
+ "udot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #2\n"
+ "incb x25\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x26, 8f\n"
+ "7:" // K loop: Main loop: Second: Loop
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ "ldr x22, [x23, #0x0]\n"
+ ".inst 0xe01922a4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ "udot z17.s, z16.b, z18.b\n"
+ "ldr x21, [x23, #0x8]\n"
+ ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828930 // mova z16.s, p2/M, za2v.s[x12, #1]\n"
+ ".inst 0xe0a98309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x26\n"
+ "udot z17.s, z16.b, z18.b\n"
+ "add x23, x23, #0x10\n"
+ "addvl x24, x24, #2\n"
+ "add x13, x13, #0x8\n"
+ "blt 7b\n"
+ "8:" // K loop: Main loop: Second: Tail
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x25]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0xe01922a4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x21, x25]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
+ "udot z17.s, z16.b, z18.b\n"
+ "mov x23, %x[in]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x22, [x23, #0x0]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0828930 // mova z16.s, p2/M, za2v.s[x12, #1]\n"
+ "ldr x21, [x23, #0x8]\n"
+ ".inst 0xe0bf8708 // st1w { za2v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xe0a98309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n"
+ "udot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #2\n"
+ "incb x28\n"
+ "incb x25\n"
+ "bgt 4b\n"
+ "9:" // K loop: Tails
+ "cbnz x27, 12f\n"
+ "mov x23, %x[in]\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: First
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ "ldr x20, [x23, #0x0]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ "udot z17.s, z16.b, z18.b\n"
+ ".inst 0xe0192282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x25]\n"
+ "cmp x12, x9\n"
+ "add x23, x23, #0x8\n"
+ "addvl x24, x24, #1\n"
+ "add x13, x13, #0x4\n"
+ "blt 10b\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "mov x20, #0x0\n"
+ "mov x12, #0x0\n"
+ "11:" // K loop: Tails: Even: Second
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "udot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #1\n"
+ "add x20, x20, #0x4\n"
+ "blt 11b\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "b 14f\n"
+ "12:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "13:" // K loop: Tails: Odd: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "udot z17.s, z16.b, z18.b\n"
+ "addvl x24, x24, #1\n"
+ "blt 13b\n"
+ "14:" // K loop: End
+ "st1w { z17.s }, p2, [x24]\n"
+ "addvl x24, x24, #1\n"
+ "mov %x[out], x24\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp
new file mode 100644
index 0000000000..274f69f370
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<1, 1, VLType::SME, false>(
+ __fp16 * &out, const __fp16 * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x21, %x[width]\n"
+ "inch x21\n"
+ "cnth x11\n"
+ "sub x21, x21, #0x1\n"
+ "udiv x21, x21, x11\n" // n_passes = ceildiv(width, VL<T>)
+ "mov x20, %x[width]\n"
+ "sub x10, x11, #0x1\n"
+ "sub x9, x21, #0x1\n"
+ "ands x10, x20, x10\n"
+ "sub x28, x11, #0x2\n"
+ "lsl x20, %x[height], #0x1\n" // height * 2
+ "mov x27, #0x0\n"
+ "mov x26, %x[in]\n"
+ "lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "ldr x25, [x26, #0x0]\n"
+ "and x24, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "csel x10, x10, x11, NE\n"
+ "ldr x23, [x26, #0x8]\n"
+ "ptrue p11.h\n"
+ "whilelt p10.h, XZR, x20\n"
+ "mov x22, %x[row_offset]\n"
+ "mov x21, %x[out]\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "add x26, x26, #0x10\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 2f\n"
+ "1:" // K loop: Charge: Loop
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25386140 // psel p0.h, p8.h/Z, p10.h[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0xe05602e1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x23, x22, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "blt 1b\n"
+ "2:" // K loop: Charge: End
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25386140 // psel p0.h, p8.h/Z, p10.h[w12, #1]\n"
+ "mov x26, %x[in]\n"
+ ".inst 0xe05602e1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x23, x22, LSL #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "inch x22\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "inch x27\n"
+ "cbz x9, 8f\n"
+ "mov x20, x9\n"
+ "3:" // K loop: Main loop
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 5f\n"
+ "4:" // K loop: Main loop: First: Loop
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ ".inst 0xe06b82a1 // st1h { za0v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "blt 4b\n"
+ "5:" // K loop: Main loop: First: Tail
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "inch x27\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe06b82a1 // st1h { za0v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
+ "addvl x21, x21, #2\n"
+ "inch x22\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 7f\n"
+ "6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ ".inst 0xe06b82a9 // st1h { za1v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "blt 6b\n"
+ "7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n"
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe05606e1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe06b82a9 // st1h { za1v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n"
+ "addvl x21, x21, #2\n"
+ "inch x27\n"
+ "inch x22\n"
+ "bgt 3b\n"
+ "8:" // K loop: Tails
+ "cbnz x24, 11f\n"
+ "mov x26, %x[in]\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ "9:" // K loop: Tails: Even: First
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ "ldr x20, [x26, #0x0]\n"
+ ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n"
+ ".inst 0xe0560288 // ld1h { za1h.h[x12] }, p0/Z, [x20, x22, LSL #1]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x11\n"
+ "add x26, x26, #0x8\n"
+ "addvl x21, x21, #1\n"
+ "blt 9b\n"
+ "whilelt p9.h, x27, %x[width]\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: Second
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
+ "blt 10b\n"
+ "whilelt p8.h, x27, %x[width]\n"
+ "b 13f\n"
+ "11:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "12:" // K loop: Tails: Odd: Loop
+ ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n"
+ ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "addvl x21, x21, #1\n"
+ "blt 12b\n"
+ "13:" // K loop: End
+ "mov %x[out], x21\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp
new file mode 100644
index 0000000000..ab290649fd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<1, 1, VLType::SME, false>(
+ float * &out, const float * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x22, %x[width]\n"
+ "incw x22\n"
+ "cntw x10\n"
+ "sub x22, x22, #0x1\n"
+ "udiv x22, x22, x10\n" // n_passes = ceildiv(width, VL<T>)
+ "mov x21, %x[width]\n"
+ "sub x9, x10, #0x1\n"
+ "sub x20, x22, #0x1\n"
+ "ands x9, x21, x9\n"
+ "sub x28, x10, #0x2\n"
+ "mov x27, #0x0\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "and x24, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "ldr x23, [x26, #0x8]\n"
+ "csel x9, x9, x10, NE\n"
+ "ptrue p11.s\n"
+ "whilelt p10.s, XZR, %x[height]\n"
+ "mov x22, %x[row_offset]\n"
+ "mov x21, %x[out]\n"
+ "whilelt p9.s, x27, %x[width]\n"
+ "whilelt p8.s, x27, %x[width]\n"
+ "add x26, x26, #0x10\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 2f\n"
+ "1:" // K loop: Charge: Loop
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0xe0960320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
+ ".inst 0x25706140 // psel p0.s, p8.s/Z, p10.s[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0xe09602e1 // ld1w { za0h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "blt 1b\n"
+ "2:" // K loop: Charge: End
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0xe0960320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
+ ".inst 0x25706140 // psel p0.s, p8.s/Z, p10.s[w12, #1]\n"
+ "mov x26, %x[in]\n"
+ ".inst 0xe09602e1 // ld1w { za0h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n"
+ "ldr x25, [x26, #0x0]\n"
+ "incw x22\n"
+ "ldr x23, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "incw x27\n"
+ "cbz x20, 8f\n"
+ "mov x20, x20\n"
+ "3:" // K loop: Main loop
+ "whilelt p8.s, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 5f\n"
+ "4:" // K loop: Main loop: First: Loop
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0xe0960328 // ld1w { za2h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
+ ".inst 0x25706141 // psel p1.s, p8.s/Z, p10.s[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe09606e9 // ld1w { za2h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0aa82a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "blt 4b\n"
+ "5:" // K loop: Main loop: First: Tail
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0xe0960328 // ld1w { za2h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25706141 // psel p1.s, p8.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe09606e9 // ld1w { za2h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "whilelt p9.s, x27, %x[width]\n"
+ "incw x27\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe0aa82a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "addvl x21, x21, #2\n"
+ "incw x22\n"
+ "whilelt p8.s, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ "cbz x28, 7f\n"
+ "6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0xe0960320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
+ ".inst 0x25706141 // psel p1.s, p8.s/Z, p10.s[w12, #1]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe09606e1 // ld1w { za0h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28\n"
+ "add x26, x26, #0x10\n"
+ "addvl x21, x21, #2\n"
+ "blt 6b\n"
+ "7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0xe0960320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
+ "mov x26, %x[in]\n"
+ "ldr x25, [x26, #0x0]\n"
+ ".inst 0x25706141 // psel p1.s, p8.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe09606e1 // ld1w { za0h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n"
+ "ldr x23, [x26, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "whilelt p9.s, x27, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n"
+ "addvl x21, x21, #2\n"
+ "incw x27\n"
+ "incw x22\n"
+ "bgt 3b\n"
+ "8:" // K loop: Tails
+ "cbnz x24, 11f\n"
+ "mov x26, %x[in]\n"
+ "whilelt p8.s, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ "9:" // K loop: Tails: Even: First
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "ldr x20, [x26, #0x0]\n"
+ ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n"
+ ".inst 0xe0960288 // ld1w { za2h.s[x12] }, p0/Z, [x20, x22, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x10\n"
+ "add x26, x26, #0x8\n"
+ "addvl x21, x21, #1\n"
+ "blt 9b\n"
+ "whilelt p9.s, x27, %x[width]\n"
+ "whilelt p8.s, x27, %x[width]\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: Second
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x9\n"
+ "addvl x21, x21, #1\n"
+ "blt 10b\n"
+ "whilelt p8.s, x27, %x[width]\n"
+ "b 13f\n"
+ "11:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "12:" // K loop: Tails: Odd: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x9\n"
+ "addvl x21, x21, #1\n"
+ "blt 12b\n"
+ "13:" // K loop: End
+ "mov %x[out], x21\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
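
The non-blocked fp16/fp32 variants are the same pipeline without the four-element grouping: each vertical slice stored is a single column of the tile, so the packed panel is a plain tile-wise transpose. A scalar reference for the fp32 case (illustrative only; vls = 32-bit lanes per vector, missing rows zero-padded):

#include <cstddef>

// Scalar reference: out[k][r] = in[r][row_offset + k], with rows past
// 'height' stored as zeros. Illustrative only.
void interleave1vl_fp32_ref(float *&out, const float *const *in,
                            size_t width, size_t height,
                            size_t row_offset, size_t vls) {
    for (size_t k = 0; k < width; k++) {
        for (size_t r = 0; r < vls; r++) {
            *out++ = (r < height) ? in[r][row_offset + k] : 0.0f;
        }
    }
}
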
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp
new file mode 100644
index 0000000000..dc6d12b61e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<2, 1, VLType::SME, false>(
+ bfloat16 * &out, const bfloat16 * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cnth x28\n"
+ "cmp %x[height], x28\n"
+ "cnth x27\n"
+ "csel x28, %x[height], x28, LT\n"
+ "mov x26, #0x0\n"
+ "ptrue p13.s\n"
+ "sub x28, x28, #0x1\n"
+ "whilelt p12.h, XZR, %x[height]\n"
+ "whilelt p11.h, x27, %x[height]\n"
+ "mov x25, %x[row_offset]\n"
+ "mov x24, %x[out]\n"
+ "whilelt p10.h, x26, %x[width]\n"
+ "whilelt p9.h, x26, %x[width]\n"
+ "whilelt p8.h, x26, %x[width]\n"
+ "1:" // Width loop
+ "add x23, %x[in], XZR, LSL #3\n"
+ "add x20, %x[in], x27, LSL #3\n"
+ "ldr x22, [x23], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x21, [x20], #0x8\n"
+ "cbz x28, 3f\n"
+ "2:" // Loads: Loop
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe05906c0 // ld1h { za0h.h[x12] }, p1/Z, [x22, x25, LSL #1]\n"
+ "ldr x22, [x23], #0x8\n"
+ ".inst 0xe05902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28, LSL #1\n"
+ "ldr x21, [x20], #0x8\n"
+ "blt 2b\n"
+ "3:" // Loads: Tail
+ "sub x20, %x[width], x26\n"
+ ".inst 0x25286580 // psel p0.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0xe05902c0 // ld1h { za0h.h[x12] }, p0/Z, [x22, x25, LSL #1]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ "cmp x20, x27\n"
+ ".inst 0xe05902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n"
+ "mov x12, #0x0\n"
+ "csel x20, x20, x27, LT\n"
+ "4:" // Stores: Loop
+ ".inst 0x25287540 // psel p0.h, p13.h/Z, p10.h[w12]\n"
+ ".inst 0xe07f8300 // st1h { za0v.h[x12] }, p0/Z, [x24, XZR, LSL #1]\n"
+ ".inst 0x25287540 // psel p0.h, p13.h/Z, p10.h[w12]\n"
+ ".inst 0xe07b8308 // st1h { za1v.h[x12] }, p0/Z, [x24, x27, LSL #1]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x20\n"
+ "addvl x24, x24, #4\n"
+ "blt 4b\n"
+ "inch x26\n"
+ "whilelt p10.h, x26, %x[width]\n"
+ "whilelt p9.h, x26, %x[width]\n"
+ "whilelt p8.h, x26, %x[width]\n"
+ "inch x25\n"
+ "b.any 1b\n"
+ "mov %x[out], x24\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp
new file mode 100644
index 0000000000..d9189258c1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<2, 2, VLType::SME, false>(
+ bfloat16 * &out, const bfloat16 * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cnth x22\n"
+ "mov x21, %x[width]\n"
+ "inch x21\n"
+ "mov x20, %x[width]\n"
+ "sub x17, x22, #0x1\n"
+ "sub x21, x21, #0x1\n"
+ "ands x17, x20, x17\n"
+ "cntw x16\n"
+ "udiv x21, x21, x22\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x22, NE\n"
+ "sub x13, x21, #0x1\n"
+ "add x17, x17, #0x1\n"
+ "sub x15, x16, #0x2\n"
+ "lsl x22, %x[height], #0x1\n" // height * 2
+ "lsl x20, x16, #0x1\n"
+ "mov x14, #0x0\n"
+ "mov x11, %x[in]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ "cntw x28, ALL, MUL #2\n"
+ "cntw x27, ALL, MUL #3\n"
+ "ldr x26, [x10, #0x0]\n"
+ "lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "and x25, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "ldr x24, [x11, #0x8]\n"
+ "lsr x17, x17, #0x1\n"
+ "ptrue p13.s\n"
+ "ldr x21, [x10, #0x8]\n"
+ "whilelt p12.h, XZR, x22\n"
+ "whilelt p11.h, x20, x22\n"
+ "mov x23, %x[row_offset]\n"
+ "mov x22, %x[out]\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 2f\n"
+ "1:" // K loop: Charge: Loop
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
+ ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
+ "ldr x24, [x11, #0x8]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x15, LSL #1\n"
+ "ldr x21, [x10, #0x8]\n"
+ "add x10, x10, #0x10\n"
+ "blt 1b\n"
+ "2:" // K loop: Charge: End
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
+ ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
+ "mov x11, %x[in]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ ".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
+ "ldr x26, [x10, #0x0]\n"
+ "inch x23\n"
+ "inch x14\n"
+ "ldr x24, [x11, #0x8]\n"
+ "add x11, x11, #0x10\n"
+ "ldr x21, [x10, #0x8]\n"
+ "add x10, x10, #0x10\n"
+ "cbz x13, 8f\n"
+ "mov x20, x13\n"
+ "3:" // K loop: Main loop
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 5f\n"
+ "4:" // K loop: Main loop: First: Loop
+ ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
+ ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
+ ".inst 0x25796162 // psel p2.h, p8.h/Z, p11.h[w13, #3]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572aab // ld1h { za1h.h[x13, #3] }, p2/Z, [x21, x23, LSL #1]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x10, x10, #0x10\n"
+ "add x13, x13, #0x4\n"
+ ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
+ "addvl x22, x22, #4\n"
+ "blt 4b\n"
+ "5:" // K loop: Main loop: First: Tail
+ ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
+ ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
+ "mov x11, %x[in]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
+ ".inst 0x25796161 // psel p1.h, p8.h/Z, p11.h[w13, #3]\n"
+ ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe05726ab // ld1h { za1h.h[x13, #3] }, p1/Z, [x21, x23, LSL #1]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b08ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "inch x14\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "addvl x22, x22, #4\n"
+ "inch x23\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 7f\n"
+ "6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
+ ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
+ ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
+ ".inst 0x25696162 // psel p2.h, p8.h/Z, p11.h[w13, #2]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572aaa // ld1h { za1h.h[x13, #2] }, p2/Z, [x21, x23, LSL #1]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x10, x10, #0x10\n"
+ "add x13, x13, #0x4\n"
+ ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
+ "addvl x22, x22, #4\n"
+ "blt 6b\n"
+ "7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
+ ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
+ ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
+ "mov x11, %x[in]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
+ ".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n"
+ ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe05726aa // ld1h { za1h.h[x13, #2] }, p1/Z, [x21, x23, LSL #1]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b08acc // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "addvl x22, x22, #4\n"
+ "inch x14\n"
+ "inch x23\n"
+ "bgt 3b\n"
+ "8:" // K loop: Tails
+ "cbnz x25, 11f\n"
+ "mov x11, %x[in]\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "9:" // K loop: Tails: Even: First
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ "ldr x21, [x11, #0x0]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
+ ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ "cmp x12, x16\n"
+ ".inst 0xe05726a1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x21, x23, LSL #1]\n"
+ ".inst 0xe0572289 // ld1h { za1h.h[x13, #1] }, p0/Z, [x20, x23, LSL #1]\n"
+ "add x11, x11, #0x8\n"
+ "addvl x22, x22, #2\n"
+ "add x13, x13, #0x2\n"
+ "blt 9b\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "mov x20, #0x0\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: Second
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
+ "addvl x22, x22, #2\n"
+ "add x20, x20, #0x2\n"
+ "blt 10b\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "b 13f\n"
+ "11:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "12:" // K loop: Tails: Odd: Loop
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
+ "addvl x22, x22, #2\n"
+ "blt 12b\n"
+ "13:" // K loop: End
+ "mov %x[out], x22\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
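
The prologue above computes all of its trip counts in scalar registers before entering streaming mode work; the inline comments (n_passes, n_loops, odd_tail) summarise the arithmetic. A minimal C++ sketch of that arithmetic, assuming vl_h stands in for CNTH (the number of 16-bit elements per streaming SVE vector) — illustrative only, not part of the kernel's interface:

#include <cstddef>

struct KLoopCounts
{
    std::size_t n_passes; // ceildiv(width, vl_h), per the "udiv" comment
    std::size_t n_loops;  // (n_passes - 1) / 2: iterations of the paired main loop
    bool        odd_tail; // n_passes & 1: whether one unpaired tail pass remains
};

inline KLoopCounts k_loop_counts(std::size_t width, std::size_t vl_h)
{
    KLoopCounts c{};
    c.n_passes = (width + vl_h - 1) / vl_h; // "inch x21; sub x21, x21, #1; udiv"
    c.n_loops  = (c.n_passes - 1) / 2;      // "sub x13, x21, #1; lsr x13, x13, #1"
    c.odd_tail = (c.n_passes & 1) != 0;     // "and x25, x21, #1"
    return c;
}

Each main-loop iteration consumes two passes (the "First" and "Second" halves working opposite ZA tile pairs), which is why n_loops halves n_passes and the odd tail is handled separately at label 11.
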
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp
new file mode 100644
index 0000000000..ef787c89b9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<2, 2, VLType::SME, false>(
+ __fp16 * &out, const __fp16 * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cnth x22\n"
+ "mov x21, %x[width]\n"
+ "inch x21\n"
+ "mov x20, %x[width]\n"
+ "sub x17, x22, #0x1\n"
+ "sub x21, x21, #0x1\n"
+ "ands x17, x20, x17\n"
+ "cntw x16\n"
+ "udiv x21, x21, x22\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x22, NE\n"
+ "sub x13, x21, #0x1\n"
+ "add x17, x17, #0x1\n"
+ "sub x15, x16, #0x2\n"
+ "lsl x22, %x[height], #0x1\n" // height * 2
+ "lsl x20, x16, #0x1\n"
+ "mov x14, #0x0\n"
+ "mov x11, %x[in]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ "cntw x28, ALL, MUL #2\n"
+ "cntw x27, ALL, MUL #3\n"
+ "ldr x26, [x10, #0x0]\n"
+ "lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "and x25, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "ldr x24, [x11, #0x8]\n"
+ "lsr x17, x17, #0x1\n"
+ "ptrue p13.s\n"
+ "ldr x21, [x10, #0x8]\n"
+ "whilelt p12.h, XZR, x22\n"
+ "whilelt p11.h, x20, x22\n"
+ "mov x23, %x[row_offset]\n"
+ "mov x22, %x[out]\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 2f\n"
+ "1:" // K loop: Charge: Loop
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
+ ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
+ "ldr x24, [x11, #0x8]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x15, LSL #1\n"
+ "ldr x21, [x10, #0x8]\n"
+ "add x10, x10, #0x10\n"
+ "blt 1b\n"
+ "2:" // K loop: Charge: End
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
+ ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
+ "mov x11, %x[in]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ ".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
+ "ldr x26, [x10, #0x0]\n"
+ "inch x23\n"
+ "inch x14\n"
+ "ldr x24, [x11, #0x8]\n"
+ "add x11, x11, #0x10\n"
+ "ldr x21, [x10, #0x8]\n"
+ "add x10, x10, #0x10\n"
+ "cbz x13, 8f\n"
+ "mov x20, x13\n"
+ "3:" // K loop: Main loop
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 5f\n"
+ "4:" // K loop: Main loop: First: Loop
+ ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
+ ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
+ ".inst 0x25796162 // psel p2.h, p8.h/Z, p11.h[w13, #3]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572aab // ld1h { za1h.h[x13, #3] }, p2/Z, [x21, x23, LSL #1]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x10, x10, #0x10\n"
+ "add x13, x13, #0x4\n"
+ ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
+ "addvl x22, x22, #4\n"
+ "blt 4b\n"
+ "5:" // K loop: Main loop: First: Tail
+ ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
+ ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
+ "mov x11, %x[in]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
+ ".inst 0x25796161 // psel p1.h, p8.h/Z, p11.h[w13, #3]\n"
+ ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe05726ab // ld1h { za1h.h[x13, #3] }, p1/Z, [x21, x23, LSL #1]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b08ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "inch x14\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "addvl x22, x22, #4\n"
+ "inch x23\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 7f\n"
+ "6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
+ ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
+ ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
+ ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
+ ".inst 0x25696162 // psel p2.h, p8.h/Z, p11.h[w13, #2]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0572aaa // ld1h { za1h.h[x13, #2] }, p2/Z, [x21, x23, LSL #1]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x10, x10, #0x10\n"
+ "add x13, x13, #0x4\n"
+ ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
+ "addvl x22, x22, #4\n"
+ "blt 6b\n"
+ "7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
+ ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
+ ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
+ ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
+ "mov x11, %x[in]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
+ ".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n"
+ ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe05726aa // ld1h { za1h.h[x13, #2] }, p1/Z, [x21, x23, LSL #1]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
+ "ldr x21, [x10, #0x8]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b08acc // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+ "addvl x22, x22, #4\n"
+ "inch x14\n"
+ "inch x23\n"
+ "bgt 3b\n"
+ "8:" // K loop: Tails
+ "cbnz x25, 11f\n"
+ "mov x11, %x[in]\n"
+ "whilelt p9.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "9:" // K loop: Tails: Even: First
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ "ldr x21, [x11, #0x0]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
+ ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
+ "cmp x12, x16\n"
+ ".inst 0xe05726a1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x21, x23, LSL #1]\n"
+ ".inst 0xe0572289 // ld1h { za1h.h[x13, #1] }, p0/Z, [x20, x23, LSL #1]\n"
+ "add x11, x11, #0x8\n"
+ "addvl x22, x22, #2\n"
+ "add x13, x13, #0x2\n"
+ "blt 9b\n"
+ "whilelt p10.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "mov x20, #0x0\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: Second
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
+ "addvl x22, x22, #2\n"
+ "add x20, x20, #0x2\n"
+ "blt 10b\n"
+ "whilelt p8.h, x14, %x[width]\n"
+ "b 13f\n"
+ "11:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "12:" // K loop: Tails: Odd: Loop
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
+ "addvl x22, x22, #2\n"
+ "blt 12b\n"
+ "13:" // K loop: End
+ "mov %x[out], x22\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
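
This __fp16 specialization is instruction-for-instruction identical to the bf16 one above: both element types are 16 bits wide and the kernel only moves bits, never interprets values. The data movement itself is a transpose staged through the ZA array — horizontal slices are loaded (ld1h { zaNh.h[...] }) and vertical slices are stored (st1w { zaNv.s[...] }). A deliberately simplified scalar model of that round trip, with kTile as an illustrative stand-in for the runtime vector length in 32-bit words:

#include <array>
#include <cstddef>

constexpr std::size_t kTile = 4; // placeholder; the real tile edge is the SVE vector length

// Fill tile rows from the source (the "Charge" phase), then emit tile columns
// to `out`; the row-in/column-out round trip through the 2-D buffer is the
// transpose the ZA loads and stores perform.
template <typename T>
void za_transpose_model(const T *const *rows, std::size_t row_offset, T *&out)
{
    std::array<std::array<T, kTile>, kTile> za{};
    for (std::size_t r = 0; r < kTile; ++r)
        for (std::size_t c = 0; c < kTile; ++c)
            za[r][c] = rows[r][row_offset + c];
    for (std::size_t c = 0; c < kTile; ++c)
        for (std::size_t r = 0; r < kTile; ++r)
            *out++ = za[r][c];
}

This model omits two details of the real kernel: two tile pairs (za0/za1, then za2/za3) are double-buffered so loads of the next pass overlap stores of the current one, and block-2 interleaving packs a pair of 16-bit elements into each 32-bit slice lane.
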
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp
new file mode 100644
index 0000000000..905c6b41eb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<2, 4, VLType::SME, false>(
+ int8_t * &out, const int8_t * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
+ "mov x23, %x[width]\n"
+ "incb x23\n"
+ "mov x20, %x[width]\n"
+ "sub x17, x21, #0x1\n"
+ "cntw x16\n"
+ "sub x23, x23, #0x1\n"
+ "ands x17, x20, x17\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x21, NE\n"
+ "lsl x22, %x[height], #0x1\n" // height * 2
+ "lsl x21, x16, #0x1\n"
+ "sub x20, x23, #0x1\n"
+ "add x17, x17, #0x3\n"
+ "sub x15, x16, #0x2\n"
+ "whilelt p9.b, XZR, x22\n"
+ "whilelt p8.b, x21, x22\n"
+ "mov x14, #0x0\n"
+ "mov x11, %x[in]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ "cntw x28, ALL, MUL #2\n"
+ "cntw x27, ALL, MUL #3\n"
+ "ldr x26, [x10, #0x0]\n"
+ "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "and x25, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "ldr x24, [x11, #0x8]\n"
+ "lsr x17, x17, #0x2\n"
+ "ptrue p11.s\n"
+ "ldr x23, [x10, #0x8]\n"
+ "zip1 p10.b, p9.b, p8.b\n"
+ "mov x22, %x[row_offset]\n"
+ "mov x21, %x[out]\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 2f\n"
+ "1:" // K loop: Charge: Loop
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160120 // ld1b { za0h.b[x12] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0160341 // ld1b { za0h.b[x12, #1] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
+ ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0160704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x22]\n"
+ "ldr x24, [x11, #0x8]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe01602e5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x23, x22]\n"
+ "add x12, x12, #0x8\n"
+ "cmp x12, x15, LSL #2\n"
+ "ldr x23, [x10, #0x8]\n"
+ "add x10, x10, #0x10\n"
+ "blt 1b\n"
+ "2:" // K loop: Charge: End
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160120 // ld1b { za0h.b[x12] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe0160341 // ld1b { za0h.b[x12, #1] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
+ ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
+ ".inst 0xe0160704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x22]\n"
+ "mov x11, %x[in]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe01602e5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x23, x22]\n"
+ "ldr x26, [x10, #0x0]\n"
+ "incb x22\n"
+ "incb x14\n"
+ "ldr x24, [x11, #0x8]\n"
+ "add x11, x11, #0x10\n"
+ "ldr x23, [x10, #0x8]\n"
+ "add x10, x10, #0x10\n"
+ "cbz x20, 8f\n"
+ "mov x20, x20\n"
+ "3:" // K loop: Main loop
+ "whilelt p8.b, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 5f\n"
+ "4:" // K loop: Main loop: First: Loop
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x257d6142 // psel p2.b, p8.b/Z, p10.b[w13, #7]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0162306 // ld1b { za0h.b[x13, #6] }, p0/Z, [x24, x22]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0162ae7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x23, x22]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ "add x10, x10, #0x10\n"
+ "add x13, x13, #0x8\n"
+ ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
+ "addvl x21, x21, #4\n"
+ "blt 4b\n"
+ "5:" // K loop: Main loop: First: Tail
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ "mov x11, %x[in]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162306 // ld1b { za0h.b[x13, #6] }, p0/Z, [x24, x22]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ ".inst 0x257d6141 // psel p1.b, p8.b/Z, p10.b[w13, #7]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e7 // ld1b { za0h.b[x13, #7] }, p1/Z, [x23, x22]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "incb x14\n"
+ ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "addvl x21, x21, #4\n"
+ "incb x22\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 7f\n"
+ "6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162120 // ld1b { za0h.b[x13] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162341 // ld1b { za0h.b[x13, #1] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x256d6142 // psel p2.b, p8.b/Z, p10.b[w13, #5]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0162304 // ld1b { za0h.b[x13, #4] }, p0/Z, [x24, x22]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0162ae5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x23, x22]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ "add x10, x10, #0x10\n"
+ "add x13, x13, #0x8\n"
+ ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
+ "addvl x21, x21, #4\n"
+ "blt 6b\n"
+ "7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162120 // ld1b { za0h.b[x13] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ ".inst 0xe0162341 // ld1b { za0h.b[x13, #1] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ "mov x11, %x[in]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162304 // ld1b { za0h.b[x13, #4] }, p0/Z, [x24, x22]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ ".inst 0x256d6141 // psel p1.b, p8.b/Z, p10.b[w13, #5]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e5 // ld1b { za0h.b[x13, #5] }, p1/Z, [x23, x22]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "addvl x21, x21, #4\n"
+ "incb x14\n"
+ "incb x22\n"
+ "bgt 3b\n"
+ "8:" // K loop: Tails
+ "cbnz x25, 11f\n"
+ "mov x11, %x[in]\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "9:" // K loop: Tails: Even: First
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ "ldr x20, [x11, #0x0]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe0162283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x22]\n"
+ "cmp x12, x16\n"
+ "add x11, x11, #0x8\n"
+ "addvl x21, x21, #2\n"
+ "add x13, x13, #0x4\n"
+ "blt 9b\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "mov x20, #0x0\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: Second
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
+ "addvl x21, x21, #2\n"
+ "add x20, x20, #0x4\n"
+ "blt 10b\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "b 13f\n"
+ "11:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "12:" // K loop: Tails: Odd: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
+ "addvl x21, x21, #2\n"
+ "blt 12b\n"
+ "13:" // K loop: End
+ "mov %x[out], x21\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
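
The int8 variant works at byte granularity (cntb/ld1b/incb) with block size 4, packing four consecutive K bytes into each 32-bit word it stores. The tail handling in the prologue rounds the leftover bytes up to whole 4-byte blocks — "add x17, x17, #0x3; lsr x17, x17, #0x2". A scalar sketch of that computation, assuming vl_b stands in for CNTB (bytes per streaming SVE vector; a power of two in streaming mode, so the and-mask is an exact modulo):

#include <cstddef>

// Number of 32-bit words stored by the tail loops at labels 9-12.
inline std::size_t tail_words(std::size_t width, std::size_t vl_b)
{
    std::size_t tail = width % vl_b; // "ands x17, x20, x17" with mask vl_b - 1
    if (tail == 0)
        tail = vl_b;                 // "csel x17, x17, x21, NE": a full final pass
    return (tail + 3) / 4;           // round up to whole 4-byte blocks
}
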
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp
new file mode 100644
index 0000000000..c5c5af20e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<2, 4, VLType::SME, true>(
+ int8_t * &out, const int8_t * const *in,
+ size_t width, size_t height, size_t row_offset, bool first
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
+ "mov x23, %x[width]\n"
+ "mov z20.b, #0x1\n"
+ "incb x23\n"
+ "mov x20, %x[width]\n"
+ "mov z19.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "sub x17, x21, #0x1\n"
+ "cntw x16\n"
+ "sub x23, x23, #0x1\n"
+ "ands x17, x20, x17\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x21, NE\n"
+ "lsl x22, %x[height], #0x1\n" // height * 2
+ "lsl x21, x16, #0x1\n"
+ "sub x20, x23, #0x1\n"
+ "add x17, x17, #0x3\n"
+ "whilelt p9.b, XZR, x22\n"
+ "whilelt p8.b, x21, x22\n"
+ "mov x15, #0x0\n"
+ "cntw x14, ALL, MUL #2\n"
+ "cntw x11, ALL, MUL #3\n"
+ "ptrue p4.b\n"
+ "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "and x10, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "lsr x17, x17, #0x2\n"
+ "sub x9, x16, #0x2\n"
+ "ptrue p11.s\n"
+ "zip1 p10.b, p9.b, p8.b\n"
+ "mov x28, %x[row_offset]\n"
+ "mov x27, %x[out]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "cbnz %x[first], 1f\n"
+ "addvl x27, x27, #-2\n"
+ "ld1w { z19.s }, p4/Z, [x27]\n"
+ "ld1w { z18.s }, p4/Z, [x27, #1, MUL VL]\n"
+ "1:" // K loop: Load row sums: End
+ "mov x26, %x[in]\n"
+ "add x25, %x[in], x16, LSL #3\n"
+ "ldr x24, [x26, #0x0]\n"
+ "ldr x23, [x25, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "ldr x21, [x25, #0x8]\n"
+ "add x25, x25, #0x10\n"
+ "cbz x9, 3f\n"
+ "2:" // K loop: Charge: Loop
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c02e1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
+ ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
+ "ldr x23, [x25, #0x0]\n"
+ ".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n"
+ "ldr x22, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n"
+ "add x12, x12, #0x8\n"
+ "cmp x12, x9, LSL #2\n"
+ "ldr x21, [x25, #0x8]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // K loop: Charge: End
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe01c02e1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
+ ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
+ ".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n"
+ "mov x26, %x[in]\n"
+ "add x25, %x[in], x16, LSL #3\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n"
+ "ldr x23, [x25, #0x0]\n"
+ "incb x28\n"
+ "incb x15\n"
+ "ldr x22, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "ldr x21, [x25, #0x8]\n"
+ "add x25, x25, #0x10\n"
+ "cbz x20, 9f\n"
+ "mov x20, x20\n"
+ "4:" // K loop: Main loop
+ "whilelt p8.b, x15, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x9, 6f\n"
+ "5:" // K loop: Main loop: First: Loop
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x257d6142 // psel p2.b, p8.b/Z, p10.b[w13, #7]\n"
+ "ldr x23, [x25, #0x0]\n"
+ ".inst 0xe01c22c6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x22, x28]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x22, [x26, #0x8]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01c2aa7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n"
+ "ldr x21, [x25, #0x8]\n"
+ ".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "sdot z18.s, z16.b, z20.b\n"
+ ".inst 0xe0ae8361 // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
+ ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x9\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
+ "addvl x27, x27, #4\n"
+ "add x13, x13, #0x8\n"
+ "blt 5b\n"
+ "6:" // K loop: Main loop: First: Tail
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x257d6140 // psel p0.b, p8.b/Z, p10.b[w13, #7]\n"
+ ".inst 0xe01c26c6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x22, x28]\n"
+ "mov x26, %x[in]\n"
+ "add x25, %x[in], x16, LSL #3\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c22a7 // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n"
+ ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
+ "sdot z18.s, z16.b, z20.b\n"
+ "ldr x23, [x25, #0x0]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x22, [x26, #0x8]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "ldr x21, [x25, #0x8]\n"
+ ".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ ".inst 0xe0b08b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
+ "incb x15\n"
+ "add x26, x26, #0x10\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ ".inst 0xe0ae8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
+ "add x25, x25, #0x10\n"
+ "sdot z18.s, z16.b, z20.b\n"
+ "incb x28\n"
+ ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
+ "addvl x27, x27, #4\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x9, 8f\n"
+ "7:" // K loop: Main loop: Second: Loop
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01c2300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c22e1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x256d6142 // psel p2.b, p8.b/Z, p10.b[w13, #5]\n"
+ "ldr x23, [x25, #0x0]\n"
+ ".inst 0xe01c22c4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x22, x28]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x22, [x26, #0x8]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01c2aa5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n"
+ "ldr x21, [x25, #0x8]\n"
+ ".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "sdot z18.s, z16.b, z20.b\n"
+ ".inst 0xe0ae8369 // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n"
+ ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
+ ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x9\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
+ "addvl x27, x27, #4\n"
+ "add x13, x13, #0x8\n"
+ "blt 7b\n"
+ "8:" // K loop: Main loop: Second: Tail
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01c2300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ ".inst 0xe01c22e1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x256d6140 // psel p0.b, p8.b/Z, p10.b[w13, #5]\n"
+ ".inst 0xe01c26c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x28]\n"
+ "mov x26, %x[in]\n"
+ "add x25, %x[in], x16, LSL #3\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n"
+ ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
+ ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
+ "sdot z18.s, z16.b, z20.b\n"
+ "ldr x23, [x25, #0x0]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x22, [x26, #0x8]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "ldr x21, [x25, #0x8]\n"
+ ".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ ".inst 0xe0b08b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
+ "subs x20, x20, #0x1\n"
+ "add x26, x26, #0x10\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ ".inst 0xe0ae8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
+ "add x25, x25, #0x10\n"
+ "sdot z18.s, z16.b, z20.b\n"
+ "incb x15\n"
+ ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
+ "addvl x27, x27, #4\n"
+ "incb x28\n"
+ "bgt 4b\n"
+ "9:" // K loop: Tails
+ "cbnz x10, 12f\n"
+ "mov x26, %x[in]\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: First
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ "ldr x21, [x26, #0x0]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
+ "ldr x20, [x26, x16, LSL #0x3]\n"
+ ".inst 0xe01c22a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x28]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ "cmp x12, x16\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
+ ".inst 0xe01c2283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x28]\n"
+ "add x26, x26, #0x8\n"
+ "addvl x27, x27, #2\n"
+ "add x13, x13, #0x4\n"
+ "blt 10b\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "mov x20, #0x0\n"
+ "mov x12, #0x0\n"
+ "11:" // K loop: Tails: Even: Second
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
+ "addvl x27, x27, #2\n"
+ "add x20, x20, #0x4\n"
+ "blt 11b\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "b 14f\n"
+ "12:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "13:" // K loop: Tails: Odd: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
+ "sdot z19.s, z17.b, z20.b\n"
+ "sdot z18.s, z16.b, z20.b\n"
+ "addvl x27, x27, #2\n"
+ "blt 13b\n"
+ "14:" // K loop: End
+ "st1w { z19.s }, p4, [x27]\n"
+ "st1w { z18.s }, p4, [x27, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "mov %x[out], x27\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
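
The summing variant additionally maintains running row sums for the quantized GEMM's offset correction: z20.b is set to 1, each stored vector is copied out of ZA with mova, and "sdot z19.s, z17.b, z20.b" folds it into the accumulators — a dot product against all-ones reduces each aligned group of four signed bytes to its sum. When `first` is false the accumulators are reloaded from the two vectors just below `out` ("addvl x27, x27, #-2"), and they are written back after the panel at "K loop: End". A scalar sketch of the sdot-with-ones reduction, collapsed to a single scalar for illustration:

#include <cstddef>
#include <cstdint>

// Models one lane's worth of "sdot acc, bytes, ones" repeated across a panel:
// each aligned 4-byte block contributes the plain sum of its signed bytes.
// `n` must be a multiple of four, matching the block-4 layout above.
inline std::int32_t sum_bytes(const std::int8_t *p, std::size_t n)
{
    std::int32_t acc = 0;
    for (std::size_t i = 0; i < n; i += 4)
        acc += std::int32_t(p[i]) + std::int32_t(p[i + 1]) +
               std::int32_t(p[i + 2]) + std::int32_t(p[i + 3]);
    return acc;
}

In the real kernel the reduction stays vectorized — z19/z18 hold one 32-bit partial sum per interleaved output row — but the per-block arithmetic is exactly this.
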
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp
new file mode 100644
index 0000000000..ce9a0065c7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<2, 4, VLType::SME, false>(
+ uint8_t * &out, const uint8_t * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
+ "mov x23, %x[width]\n"
+ "incb x23\n"
+ "mov x20, %x[width]\n"
+ "sub x17, x21, #0x1\n"
+ "cntw x16\n"
+ "sub x23, x23, #0x1\n"
+ "ands x17, x20, x17\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x21, NE\n"
+ "lsl x22, %x[height], #0x1\n" // height * 2
+ "lsl x21, x16, #0x1\n"
+ "sub x20, x23, #0x1\n"
+ "add x17, x17, #0x3\n"
+ "sub x15, x16, #0x2\n"
+ "whilelt p9.b, XZR, x22\n"
+ "whilelt p8.b, x21, x22\n"
+ "mov x14, #0x0\n"
+ "mov x11, %x[in]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ "cntw x28, ALL, MUL #2\n"
+ "cntw x27, ALL, MUL #3\n"
+ "ldr x26, [x10, #0x0]\n"
+ "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "and x25, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "ldr x24, [x11, #0x8]\n"
+ "lsr x17, x17, #0x2\n"
+ "ptrue p11.s\n"
+ "ldr x23, [x10, #0x8]\n"
+ "zip1 p10.b, p9.b, p8.b\n"
+ "mov x22, %x[row_offset]\n"
+ "mov x21, %x[out]\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 2f\n"
+ "1:" // K loop: Charge: Loop
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160120 // ld1b { za0h.b[x12] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0160341 // ld1b { za0h.b[x12, #1] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
+ ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0160704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x22]\n"
+ "ldr x24, [x11, #0x8]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe01602e5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x23, x22]\n"
+ "add x12, x12, #0x8\n"
+ "cmp x12, x15, LSL #2\n"
+ "ldr x23, [x10, #0x8]\n"
+ "add x10, x10, #0x10\n"
+ "blt 1b\n"
+ "2:" // K loop: Charge: End
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0160120 // ld1b { za0h.b[x12] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe0160341 // ld1b { za0h.b[x12, #1] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
+ ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
+ ".inst 0xe0160704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x22]\n"
+ "mov x11, %x[in]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe01602e5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x23, x22]\n"
+ "ldr x26, [x10, #0x0]\n"
+ "incb x22\n"
+ "incb x14\n"
+ "ldr x24, [x11, #0x8]\n"
+ "add x11, x11, #0x10\n"
+ "ldr x23, [x10, #0x8]\n"
+ "add x10, x10, #0x10\n"
+ "cbz x20, 8f\n"
+ "mov x20, x20\n"
+ "3:" // K loop: Main loop
+ "whilelt p8.b, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 5f\n"
+ "4:" // K loop: Main loop: First: Loop
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x257d6142 // psel p2.b, p8.b/Z, p10.b[w13, #7]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0162306 // ld1b { za0h.b[x13, #6] }, p0/Z, [x24, x22]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0162ae7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x23, x22]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ "add x10, x10, #0x10\n"
+ "add x13, x13, #0x8\n"
+ ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
+ "addvl x21, x21, #4\n"
+ "blt 4b\n"
+ "5:" // K loop: Main loop: First: Tail
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ "mov x11, %x[in]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162306 // ld1b { za0h.b[x13, #6] }, p0/Z, [x24, x22]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ ".inst 0x257d6141 // psel p1.b, p8.b/Z, p10.b[w13, #7]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e7 // ld1b { za0h.b[x13, #7] }, p1/Z, [x23, x22]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "incb x14\n"
+ ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "addvl x21, x21, #4\n"
+ "incb x22\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x15, 7f\n"
+ "6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162120 // ld1b { za0h.b[x13] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162341 // ld1b { za0h.b[x13, #1] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x256d6142 // psel p2.b, p8.b/Z, p10.b[w13, #5]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0xe0162304 // ld1b { za0h.b[x13, #4] }, p0/Z, [x24, x22]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0162ae5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x23, x22]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ "add x10, x10, #0x10\n"
+ "add x13, x13, #0x8\n"
+ ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x15\n"
+ "addvl x21, x21, #4\n"
+ "blt 6b\n"
+ "7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe0162120 // ld1b { za0h.b[x13] }, p0/Z, [x9, x22]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ ".inst 0xe0162341 // ld1b { za0h.b[x13, #1] }, p0/Z, [x26, x22]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ "mov x11, %x[in]\n"
+ "ldr x9, [x11, #0x0]\n"
+ ".inst 0xe0162304 // ld1b { za0h.b[x13, #4] }, p0/Z, [x24, x22]\n"
+ "add x10, %x[in], x16, LSL #3\n"
+ ".inst 0x256d6141 // psel p1.b, p8.b/Z, p10.b[w13, #5]\n"
+ "ldr x26, [x10, #0x0]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01626e5 // ld1b { za0h.b[x13, #5] }, p1/Z, [x23, x22]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+ "addvl x21, x21, #4\n"
+ "incb x14\n"
+ "incb x22\n"
+ "bgt 3b\n"
+ "8:" // K loop: Tails
+ "cbnz x25, 11f\n"
+ "mov x11, %x[in]\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "9:" // K loop: Tails: Even: First
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ "ldr x20, [x11, #0x0]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe0162283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x22]\n"
+ "cmp x12, x16\n"
+ "add x11, x11, #0x8\n"
+ "addvl x21, x21, #2\n"
+ "add x13, x13, #0x4\n"
+ "blt 9b\n"
+ "whilelt p9.b, x14, %x[width]\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "mov x20, #0x0\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: Second
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
+ "addvl x21, x21, #2\n"
+ "add x20, x20, #0x4\n"
+ "blt 10b\n"
+ "whilelt p8.b, x14, %x[width]\n"
+ "b 13f\n"
+ "11:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "12:" // K loop: Tails: Odd: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
+ "addvl x21, x21, #2\n"
+ "blt 12b\n"
+ "13:" // K loop: End
+ "mov %x[out], x21\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
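The fragment above ends the K-loop schedule shared by the double-buffered kernels in this patch (the block-4 byte kernels and the fp32 kernel): a "Charge" phase primes the first ZA tile pair, the main loop alternates "First" and "Second" halves that fill one tile pair while draining the other as transposed vertical slices, and the "Tails" labels finish the last one or two column blocks. A rough C++ sketch of that schedule, using the n_passes/n_loops/odd_tail quantities named in the assembly comments of the kernels that follow; load_block/store_block are hypothetical stand-ins for the ZA loads and stores, not functions from this patch:

    #include <cstddef>

    // Hypothetical stand-ins: load_block(p) models the horizontal
    // ld1b/ld1w fills of ZA tile pair p, store_block(p) the st1w drains
    // of its vertical (transposed) slices.
    static void load_block(int /*pair*/) {}
    static void store_block(int /*pair*/) {}

    // Shape of the K loop, assuming width > 0: prime one tile pair
    // ("Charge"), ping-pong between the two pairs, then take the even
    // or odd tail exactly as the branch at "K loop: Tails" does.
    void k_loop_schedule(size_t width, size_t VL)
    {
        size_t n_passes = (width + VL - 1) / VL; // n_passes = ceildiv(width, VL<T>)
        size_t n_loops  = (n_passes - 1) / 2;    // n_loops = (n_passes - 1) / 2
        bool   odd_tail = (n_passes & 1) != 0;   // odd_tail = bool(n_passes & 0x1)
        load_block(0);                           // "Charge": prime za0/za1
        for (size_t l = 0; l < n_loops; l++) {
            load_block(1); store_block(0);       // First: fill za2/za3, drain za0/za1
            load_block(0); store_block(1);       // Second: fill za0/za1, drain za2/za3
        }
        if (odd_tail) {
            store_block(0);                      // K loop: Tails: Odd
        } else {
            store_block(0); load_block(1);       // K loop: Tails: Even: First
            store_block(1);                      // K loop: Tails: Even: Second
        }
    }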
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
new file mode 100644
index 0000000000..7805152656
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<2, 4, VLType::SME, true>(
+ uint8_t * &out, const uint8_t * const *in,
+ size_t width, size_t height, size_t row_offset, bool first
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntb x21\n"
+ "mov x23, %x[width]\n"
+ "mov z20.b, #0x1\n"
+ "incb x23\n"
+ "mov x20, %x[width]\n"
+ "mov z19.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "sub x17, x21, #0x1\n"
+ "cntw x16\n"
+ "sub x23, x23, #0x1\n"
+ "ands x17, x20, x17\n"
+ "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL<T>)
+ "csel x17, x17, x21, NE\n"
+ "lsl x22, %x[height], #0x1\n" // height * 2
+ "lsl x21, x16, #0x1\n"
+ "sub x20, x23, #0x1\n"
+ "add x17, x17, #0x3\n"
+ "whilelt p9.b, XZR, x22\n"
+ "whilelt p8.b, x21, x22\n"
+ "mov x15, #0x0\n"
+ "cntw x14, ALL, MUL #2\n"
+ "cntw x11, ALL, MUL #3\n"
+ "ptrue p4.b\n"
+ "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "and x10, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "lsr x17, x17, #0x2\n"
+ "sub x9, x16, #0x2\n"
+ "ptrue p11.s\n"
+ "zip1 p10.b, p9.b, p8.b\n"
+ "mov x28, %x[row_offset]\n"
+ "mov x27, %x[out]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "cbnz %x[first], 1f\n"
+ "addvl x27, x27, #-2\n"
+ "ld1w { z19.s }, p4/Z, [x27]\n"
+ "ld1w { z18.s }, p4/Z, [x27, #1, MUL VL]\n"
+ "1:" // K loop: Load row sums: End
+ "mov x26, %x[in]\n"
+ "add x25, %x[in], x16, LSL #3\n"
+ "ldr x24, [x26, #0x0]\n"
+ "ldr x23, [x25, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "ldr x21, [x25, #0x8]\n"
+ "add x25, x25, #0x10\n"
+ "cbz x9, 3f\n"
+ "2:" // K loop: Charge: Loop
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c02e1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
+ ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
+ "ldr x23, [x25, #0x0]\n"
+ ".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n"
+ "ldr x22, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n"
+ "add x12, x12, #0x8\n"
+ "cmp x12, x9, LSL #2\n"
+ "ldr x21, [x25, #0x8]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // K loop: Charge: End
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe01c02e1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n"
+ ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n"
+ ".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n"
+ "mov x26, %x[in]\n"
+ "add x25, %x[in], x16, LSL #3\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n"
+ "ldr x23, [x25, #0x0]\n"
+ "incb x28\n"
+ "incb x15\n"
+ "ldr x22, [x26, #0x8]\n"
+ "add x26, x26, #0x10\n"
+ "ldr x21, [x25, #0x8]\n"
+ "add x25, x25, #0x10\n"
+ "cbz x20, 9f\n"
+ "mov x20, x20\n"
+ "4:" // K loop: Main loop
+ "whilelt p8.b, x15, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x9, 6f\n"
+ "5:" // K loop: Main loop: First: Loop
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x257d6142 // psel p2.b, p8.b/Z, p10.b[w13, #7]\n"
+ "ldr x23, [x25, #0x0]\n"
+ ".inst 0xe01c22c6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x22, x28]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x22, [x26, #0x8]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01c2aa7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n"
+ "ldr x21, [x25, #0x8]\n"
+ ".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ "udot z19.s, z16.b, z20.b\n"
+ ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "udot z18.s, z17.b, z20.b\n"
+ ".inst 0xe0ae8361 // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
+ ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x9\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
+ "addvl x27, x27, #4\n"
+ "add x13, x13, #0x8\n"
+ "blt 5b\n"
+ "6:" // K loop: Main loop: First: Tail
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n"
+ ".inst 0x257d6140 // psel p0.b, p8.b/Z, p10.b[w13, #7]\n"
+ ".inst 0xe01c26c6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x22, x28]\n"
+ "mov x26, %x[in]\n"
+ "add x25, %x[in], x16, LSL #3\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c22a7 // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
+ ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
+ "udot z19.s, z16.b, z20.b\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ "udot z18.s, z17.b, z20.b\n"
+ "ldr x23, [x25, #0x0]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x22, [x26, #0x8]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "ldr x21, [x25, #0x8]\n"
+ ".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ ".inst 0xe0b08b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
+ "incb x15\n"
+ "add x26, x26, #0x10\n"
+ "udot z19.s, z16.b, z20.b\n"
+ ".inst 0xe0ae8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
+ "add x25, x25, #0x10\n"
+ "udot z18.s, z17.b, z20.b\n"
+ "incb x28\n"
+ ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
+ "addvl x27, x27, #4\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "cbz x9, 8f\n"
+ "7:" // K loop: Main loop: Second: Loop
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01c2300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c22e1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x256d6142 // psel p2.b, p8.b/Z, p10.b[w13, #5]\n"
+ "ldr x23, [x25, #0x0]\n"
+ ".inst 0xe01c22c4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x22, x28]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x22, [x26, #0x8]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe01c2aa5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n"
+ "ldr x21, [x25, #0x8]\n"
+ ".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
+ "udot z19.s, z16.b, z20.b\n"
+ ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "udot z18.s, z17.b, z20.b\n"
+ ".inst 0xe0ae8369 // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
+ ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
+ ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x9\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
+ "addvl x27, x27, #4\n"
+ "add x13, x13, #0x8\n"
+ "blt 7b\n"
+ "8:" // K loop: Main loop: Second: Tail
+ ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n"
+ ".inst 0xe01c2300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x28]\n"
+ ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n"
+ ".inst 0xe01c22e1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x23, x28]\n"
+ ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n"
+ ".inst 0x256d6140 // psel p0.b, p8.b/Z, p10.b[w13, #5]\n"
+ ".inst 0xe01c26c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x28]\n"
+ "mov x26, %x[in]\n"
+ "add x25, %x[in], x16, LSL #3\n"
+ "ldr x24, [x26, #0x0]\n"
+ ".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n"
+ ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
+ ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n"
+ "udot z19.s, z16.b, z20.b\n"
+ ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
+ "udot z18.s, z17.b, z20.b\n"
+ "ldr x23, [x25, #0x0]\n"
+ ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n"
+ "ldr x22, [x26, #0x8]\n"
+ ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
+ ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n"
+ ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
+ "ldr x21, [x25, #0x8]\n"
+ ".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ ".inst 0xe0b08b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
+ "subs x20, x20, #0x1\n"
+ "add x26, x26, #0x10\n"
+ "udot z19.s, z16.b, z20.b\n"
+ ".inst 0xe0ae8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
+ "add x25, x25, #0x10\n"
+ "udot z18.s, z17.b, z20.b\n"
+ "incb x15\n"
+ ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
+ "addvl x27, x27, #4\n"
+ "incb x28\n"
+ "bgt 4b\n"
+ "9:" // K loop: Tails
+ "cbnz x10, 12f\n"
+ "mov x26, %x[in]\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: First
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ "ldr x21, [x26, #0x0]\n"
+ ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ "ldr x20, [x26, x16, LSL #0x3]\n"
+ ".inst 0xe01c22a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x28]\n"
+ "add x12, x12, #0x1\n"
+ ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
+ "cmp x12, x16\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
+ ".inst 0xe01c2283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x28]\n"
+ "add x26, x26, #0x8\n"
+ "addvl x27, x27, #2\n"
+ "add x13, x13, #0x4\n"
+ "blt 10b\n"
+ "whilelt p9.b, x15, %x[width]\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "mov x20, #0x0\n"
+ "mov x12, #0x0\n"
+ "11:" // K loop: Tails: Even: Second
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n"
+ ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
+ "addvl x27, x27, #2\n"
+ "add x20, x20, #0x4\n"
+ "blt 11b\n"
+ "whilelt p8.b, x15, %x[width]\n"
+ "b 14f\n"
+ "12:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "13:" // K loop: Tails: Odd: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n"
+ ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
+ ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x17\n"
+ "udot z19.s, z16.b, z20.b\n"
+ "udot z18.s, z17.b, z20.b\n"
+ "addvl x27, x27, #2\n"
+ "blt 13b\n"
+ "14:" // K loop: End
+ "st1w { z19.s }, p4, [x27]\n"
+ "st1w { z18.s }, p4, [x27, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "mov %x[out], x27\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
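Two things distinguish this summing kernel from the plain interleaves in this patch: on entry, when first is zero, it steps the output cursor back by two vectors and reloads the partial row sums written at the end of the previous call (the cbnz %x[first], 1f sequence), and every stored column feeds the accumulators z19/z18 through udot against the all-ones vector z20.b. A scalar model of that udot step; a dot product with a vector of ones is exactly a per-lane sum of four bytes:

    #include <cstddef>
    #include <cstdint>

    // Scalar model of "udot z19.s, z16.b, z20.b" with z20.b == 1: each
    // 32-bit lane accumulates the sum of the four unsigned bytes
    // sharing that lane, i.e. four consecutive elements of one source
    // row, so the accumulators build up per-row sums as columns are
    // stored.
    void udot_with_ones(uint32_t *acc, const uint8_t *bytes, size_t lanes)
    {
        for (size_t i = 0; i < lanes; i++) {
            acc[i] += bytes[4 * i] + bytes[4 * i + 1]
                    + bytes[4 * i + 2] + bytes[4 * i + 3];
        }
    }

The s8 summing kernel at the end of this patch does the same with sdot into four int32 accumulators, one per VL-row group.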
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp
new file mode 100644
index 0000000000..96ab55ee06
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<2, 1, VLType::SME, false>(
+ __fp16 * &out, const __fp16 * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cnth x28\n"
+ "cmp %x[height], x28\n"
+ "cnth x27\n"
+ "csel x28, %x[height], x28, LT\n"
+ "mov x26, #0x0\n"
+ "ptrue p13.s\n"
+ "sub x28, x28, #0x1\n"
+ "whilelt p12.h, XZR, %x[height]\n"
+ "whilelt p11.h, x27, %x[height]\n"
+ "mov x25, %x[row_offset]\n"
+ "mov x24, %x[out]\n"
+ "whilelt p10.h, x26, %x[width]\n"
+ "whilelt p9.h, x26, %x[width]\n"
+ "whilelt p8.h, x26, %x[width]\n"
+ "1:" // Width loop
+ "add x23, %x[in], XZR, LSL #3\n"
+ "add x20, %x[in], x27, LSL #3\n"
+ "ldr x22, [x23], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x21, [x20], #0x8\n"
+ "cbz x28, 3f\n"
+ "2:" // Loads: Loop
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe05906c0 // ld1h { za0h.h[x12] }, p1/Z, [x22, x25, LSL #1]\n"
+ "ldr x22, [x23], #0x8\n"
+ ".inst 0xe05902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x28, LSL #1\n"
+ "ldr x21, [x20], #0x8\n"
+ "blt 2b\n"
+ "3:" // Loads: Tail
+ "sub x20, %x[width], x26\n"
+ ".inst 0x25286580 // psel p0.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0xe05902c0 // ld1h { za0h.h[x12] }, p0/Z, [x22, x25, LSL #1]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ "cmp x20, x27\n"
+ ".inst 0xe05902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n"
+ "mov x12, #0x0\n"
+ "csel x20, x20, x27, LT\n"
+ "4:" // Stores: Loop
+ ".inst 0x25287540 // psel p0.h, p13.h/Z, p10.h[w12]\n"
+ ".inst 0xe07f8300 // st1h { za0v.h[x12] }, p0/Z, [x24, XZR, LSL #1]\n"
+ ".inst 0x25287540 // psel p0.h, p13.h/Z, p10.h[w12]\n"
+ ".inst 0xe07b8308 // st1h { za1v.h[x12] }, p0/Z, [x24, x27, LSL #1]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x20\n"
+ "addvl x24, x24, #4\n"
+ "blt 4b\n"
+ "inch x26\n"
+ "whilelt p10.h, x26, %x[width]\n"
+ "whilelt p9.h, x26, %x[width]\n"
+ "whilelt p8.h, x26, %x[width]\n"
+ "inch x25\n"
+ "b.any 1b\n"
+ "mov %x[out], x24\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
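Stripped of the SME plumbing, this fp16 kernel is a predicated transpose-and-pack: source rows enter ZA as horizontal slices and columns leave as vertical slices, so each source column becomes one contiguous run of 2*VL output values. A rough scalar equivalent for orientation, assuming zero fill for rows at or beyond height (the zeroing /Z predicates); the helper name and VL parameter are illustrative, not from the source:

    #include <cstddef>

    // Scalar picture of interleave_block<2, 1, VLType::SME, false>:
    // for every source column, emit that column's value from each of
    // 2*VL rows (VL = cnth for fp16), zero-padding rows beyond height.
    template <typename T>
    void interleave_2vl_ref(T *&out, const T *const *in, size_t width,
                            size_t height, size_t row_offset, size_t VL)
    {
        for (size_t x = 0; x < width; x++) {
            for (size_t r = 0; r < 2 * VL; r++) {
                *out++ = (r < height) ? in[r][row_offset + x] : T(0);
            }
        }
    }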
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp
new file mode 100644
index 0000000000..ac4b1b5086
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<2, 1, VLType::SME, false>(
+ float * &out, const float * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x22, %x[width]\n"
+ "incw x22\n"
+ "cntw x16\n"
+ "sub x22, x22, #0x1\n"
+ "udiv x22, x22, x16\n" // n_passes = ceildiv(width, VL<T>)
+ "mov x21, %x[width]\n"
+ "sub x15, x16, #0x1\n"
+ "sub x20, x22, #0x1\n"
+ "ands x15, x21, x15\n"
+ "sub x14, x16, #0x2\n"
+ "mov x13, #0x0\n"
+ "mov x11, %x[in]\n"
+ "ldr x10, [x11, #0x0]\n"
+ "add x9, %x[in], x16, LSL #3\n"
+ "cntw x28, ALL, MUL #2\n"
+ "ldr x27, [x9, #0x0]\n"
+ "cntw x26, ALL, MUL #3\n"
+ "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2
+ "ldr x25, [x11, #0x8]\n"
+ "and x24, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1)
+ "csel x15, x15, x16, NE\n"
+ "ldr x21, [x9, #0x8]\n"
+ "ptrue p13.s\n"
+ "whilelt p12.s, XZR, %x[height]\n"
+ "whilelt p11.s, x16, %x[height]\n"
+ "mov x23, %x[row_offset]\n"
+ "mov x22, %x[out]\n"
+ "whilelt p10.s, x13, %x[width]\n"
+ "whilelt p9.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "mov x12, #0x0\n"
+ "cbz x14, 2f\n"
+ "1:" // K loop: Charge: Loop
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
+ ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
+ ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ "ldr x10, [x11, #0x0]\n"
+ ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
+ ".inst 0x25706581 // psel p1.s, p9.s/Z, p12.s[w12, #1]\n"
+ ".inst 0x25706160 // psel p0.s, p8.s/Z, p11.s[w12, #1]\n"
+ "ldr x27, [x9, #0x0]\n"
+ ".inst 0xe0970721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x23, LSL #2]\n"
+ "ldr x25, [x11, #0x8]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe09702a5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x21, x23, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x14\n"
+ "ldr x21, [x9, #0x8]\n"
+ "add x9, x9, #0x10\n"
+ "blt 1b\n"
+ "2:" // K loop: Charge: End
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
+ ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
+ ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
+ ".inst 0x25706581 // psel p1.s, p9.s/Z, p12.s[w12, #1]\n"
+ ".inst 0x25706160 // psel p0.s, p8.s/Z, p11.s[w12, #1]\n"
+ "mov x11, %x[in]\n"
+ "add x9, %x[in], x16, LSL #3\n"
+ ".inst 0xe0970721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x23, LSL #2]\n"
+ "ldr x10, [x11, #0x0]\n"
+ ".inst 0xe09702a5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x21, x23, LSL #2]\n"
+ "ldr x27, [x9, #0x0]\n"
+ "incw x23\n"
+ "incw x13\n"
+ "ldr x25, [x11, #0x8]\n"
+ "add x11, x11, #0x10\n"
+ "ldr x21, [x9, #0x8]\n"
+ "add x9, x9, #0x10\n"
+ "cbz x20, 8f\n"
+ "mov x20, x20\n"
+ "3:" // K loop: Main loop
+ "whilelt p9.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
+ "mov x12, #0x0\n"
+ "cbz x14, 5f\n"
+ "4:" // K loop: Main loop: First: Loop
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
+ ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
+ ".inst 0xe0970548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ "ldr x10, [x11, #0x0]\n"
+ ".inst 0xe097036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
+ ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
+ ".inst 0x25706162 // psel p2.s, p8.s/Z, p11.s[w12, #1]\n"
+ "ldr x27, [x9, #0x0]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0970329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
+ "ldr x25, [x11, #0x8]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0970aad // ld1w { za3h.s[x12, #1] }, p2/Z, [x21, x23, LSL #2]\n"
+ "ldr x21, [x9, #0x8]\n"
+ ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0xe0ba82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x14\n"
+ "addvl x22, x22, #4\n"
+ "blt 4b\n"
+ "5:" // K loop: Main loop: First: Tail
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
+ ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
+ ".inst 0xe0970548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ ".inst 0xe097036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
+ "mov x11, %x[in]\n"
+ "add x9, %x[in], x16, LSL #3\n"
+ "ldr x10, [x11, #0x0]\n"
+ ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
+ ".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n"
+ ".inst 0xe0970329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
+ "ldr x27, [x9, #0x0]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe09706ad // ld1w { za3h.s[x12, #1] }, p1/Z, [x21, x23, LSL #2]\n"
+ "ldr x25, [x11, #0x8]\n"
+ ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
+ "ldr x21, [x9, #0x8]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b08ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
+ "whilelt p10.s, x13, %x[width]\n"
+ "incw x13\n"
+ ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0xe0ba82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
+ "addvl x22, x22, #4\n"
+ "incw x23\n"
+ "whilelt p9.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
+ "mov x12, #0x0\n"
+ "cbz x14, 7f\n"
+ "6:" // K loop: Main loop: Second: Loop
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
+ ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
+ ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ "ldr x10, [x11, #0x0]\n"
+ ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
+ ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
+ ".inst 0x25706162 // psel p2.s, p8.s/Z, p11.s[w12, #1]\n"
+ "ldr x27, [x9, #0x0]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0970321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
+ "ldr x25, [x11, #0x8]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0970aa5 // ld1w { za1h.s[x12, #1] }, p2/Z, [x21, x23, LSL #2]\n"
+ "ldr x21, [x9, #0x8]\n"
+ ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0xe0ba82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x14\n"
+ "addvl x22, x22, #4\n"
+ "blt 6b\n"
+ "7:" // K loop: Main loop: Second: Tail
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
+ ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
+ ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+ ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
+ "mov x11, %x[in]\n"
+ "add x9, %x[in], x16, LSL #3\n"
+ "ldr x10, [x11, #0x0]\n"
+ ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
+ ".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n"
+ ".inst 0xe0970321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
+ "ldr x27, [x9, #0x0]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe09706a5 // ld1w { za1h.s[x12, #1] }, p1/Z, [x21, x23, LSL #2]\n"
+ "ldr x25, [x11, #0x8]\n"
+ ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
+ "ldr x21, [x9, #0x8]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
+ ".inst 0xe0b08acc // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
+ "whilelt p10.s, x13, %x[width]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0xe0ba82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
+ "addvl x22, x22, #4\n"
+ "incw x13\n"
+ "incw x23\n"
+ "bgt 3b\n"
+ "8:" // K loop: Tails
+ "cbnz x24, 11f\n"
+ "mov x11, %x[in]\n"
+ "whilelt p9.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
+ "mov x12, #0x0\n"
+ "9:" // K loop: Tails: Even: First
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ "ldr x21, [x11, #0x0]\n"
+ ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n"
+ ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n"
+ "ldr x20, [x11, x16, LSL #0x3]\n"
+ ".inst 0xe09706a8 // ld1w { za2h.s[x12] }, p1/Z, [x21, x23, LSL #2]\n"
+ "add x11, x11, #0x8\n"
+ "addvl x22, x22, #2\n"
+ ".inst 0xe097028c // ld1w { za3h.s[x12] }, p0/Z, [x20, x23, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x16\n"
+ "blt 9b\n"
+ "whilelt p10.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
+ "whilelt p8.s, x13, %x[width]\n"
+ "mov x12, #0x0\n"
+ "10:" // K loop: Tails: Even: Second
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x15\n"
+ "addvl x22, x22, #2\n"
+ "blt 10b\n"
+ "whilelt p8.s, x13, %x[width]\n"
+ "b 13f\n"
+ "11:" // K loop: Tails: Odd
+ "mov x12, #0x0\n"
+ "12:" // K loop: Tails: Odd: Loop
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x15\n"
+ "addvl x22, x22, #2\n"
+ "blt 12b\n"
+ "13:" // K loop: End
+ "mov %x[out], x22\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp
new file mode 100644
index 0000000000..2e53475b5c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<4, 2, VLType::SME, false>(
+ bfloat16 * &out, const bfloat16 * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x16\n"
+ "cntw x15\n"
+ "cntw x14, ALL, MUL #2\n"
+ "cntw x13, ALL, MUL #3\n"
+ "cmp %x[height], x16\n"
+ "csel x16, %x[height], x16, LT\n"
+ "whilelt p11.h, XZR, %x[height]\n"
+ "whilelt p10.h, x15, %x[height]\n"
+ "whilelt p9.h, x14, %x[height]\n"
+ "whilelt p8.h, x13, %x[height]\n"
+ "mov x11, #0x0\n"
+ "cnth x10\n"
+ "ptrue p13.s\n"
+ "sub x16, x16, #0x1\n"
+ "zip1 p12.h, p11.h, p9.h\n"
+ "zip1 p11.h, p10.h, p8.h\n"
+ "mov x9, %x[row_offset]\n"
+ "mov x28, %x[out]\n"
+ "whilelt p10.h, x11, %x[width]\n"
+ "whilelt p9.h, x11, %x[width]\n"
+ "whilelt p8.h, x11, %x[width]\n"
+ "1:" // Width loop
+ "add x27, %x[in], XZR, LSL #3\n"
+ "add x26, %x[in], x15, LSL #3\n"
+ "ldr x25, [x27], #0x8\n"
+ "add x24, %x[in], x14, LSL #3\n"
+ "add x20, %x[in], x13, LSL #3\n"
+ "ldr x23, [x26], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x24], #0x8\n"
+ "ldr x21, [x20], #0x8\n"
+ "cbz x16, 3f\n"
+ "2:" // Loads: Loop
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe0490720 // ld1h { za0h.h[x12] }, p1/Z, [x25, x9, LSL #1]\n"
+ "ldr x25, [x27], #0x8\n"
+ ".inst 0xe04902e8 // ld1h { za1h.h[x12] }, p0/Z, [x23, x9, LSL #1]\n"
+ ".inst 0x25386581 // psel p1.h, p9.h/Z, p12.h[w12, #1]\n"
+ ".inst 0x25386160 // psel p0.h, p8.h/Z, p11.h[w12, #1]\n"
+ "ldr x23, [x26], #0x8\n"
+ ".inst 0xe04906c1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x22, x9, LSL #1]\n"
+ "ldr x22, [x24], #0x8\n"
+ ".inst 0xe04902a9 // ld1h { za1h.h[x12, #1] }, p0/Z, [x21, x9, LSL #1]\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x16, LSL #1\n"
+ "ldr x21, [x20], #0x8\n"
+ "blt 2b\n"
+ "3:" // Loads: Tail
+ ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n"
+ ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n"
+ ".inst 0xe0490720 // ld1h { za0h.h[x12] }, p1/Z, [x25, x9, LSL #1]\n"
+ "sub x20, %x[width], x11\n"
+ ".inst 0xe04902e8 // ld1h { za1h.h[x12] }, p0/Z, [x23, x9, LSL #1]\n"
+ "cmp x20, x10\n"
+ "csel x20, x20, x10, LT\n"
+ ".inst 0x25386580 // psel p0.h, p9.h/Z, p12.h[w12, #1]\n"
+ ".inst 0xe04902c1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x22, x9, LSL #1]\n"
+ ".inst 0x25386160 // psel p0.h, p8.h/Z, p11.h[w12, #1]\n"
+ "add x20, x20, #0x1\n"
+ ".inst 0xe04902a9 // ld1h { za1h.h[x12, #1] }, p0/Z, [x21, x9, LSL #1]\n"
+ "mov x12, #0x0\n"
+ "lsr x20, x20, #0x1\n"
+ "4:" // Stores: Loop
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0af8384 // st1w { za1v.s[x12] }, p0/Z, [x28, x15, LSL #2]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0ae8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x14, LSL #2]\n"
+ ".inst 0xe0ad838c // st1w { za3v.s[x12] }, p0/Z, [x28, x13, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x20\n"
+ "addvl x28, x28, #4\n"
+ "blt 4b\n"
+ "inch x11\n"
+ "whilelt p10.h, x11, %x[width]\n"
+ "whilelt p9.h, x11, %x[width]\n"
+ "whilelt p8.h, x11, %x[width]\n"
+ "inch x9\n"
+ "b.any 1b\n"
+ "mov %x[out], x28\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
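Being a block-2 kernel, two consecutive bf16 elements ride in each 32-bit ZA column, so the "Loads: Tail" block must turn a count of remaining halfword columns into a count of word stores; that is the add #1 / lsr #1 pair before the stores loop. The same arithmetic in C++ (names illustrative, not from the source):

    #include <algorithm>
    #include <cstddef>

    // Store-count math from "Loads: Tail": clamp the remaining
    // halfword columns to one vector's worth (VLh = cnth), then round
    // up to pairs, since each stored 32-bit column packs two elements.
    size_t tail_store_columns(size_t width, size_t x, size_t VLh)
    {
        size_t cols = std::min(width - x, VLh); // cmp ; csel ... LT
        return (cols + 1) / 2;                  // add x20, x20, #0x1 ; lsr x20, #0x1
    }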
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_fp16_fp16.hpp
new file mode 100644
index 0000000000..268bdbb924
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_fp16_fp16.hpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<4, 2, VLType::SME, false>(
+ __fp16 * &out, const __fp16 * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x17, #0x0\n"
+ "mov x16, %x[row_offset]\n"
+ "cntw x15\n"
+ "cntw x14\n"
+ "cntw x11, ALL, MUL #2\n"
+ "cntw x10, ALL, MUL #3\n"
+ "cmp %x[height], x15\n"
+ "cnth x9\n"
+ "csel x15, %x[height], x15, LT\n"
+ "whilelt p11.h, XZR, %x[height]\n"
+ "whilelt p10.h, x14, %x[height]\n"
+ "whilelt p9.h, x11, %x[height]\n"
+ "whilelt p8.h, x10, %x[height]\n"
+ "ptrue p13.s\n"
+ "sub x15, x15, #0x1\n"
+ "zip1 p12.h, p11.h, p9.h\n"
+ "zip1 p11.h, p10.h, p8.h\n"
+ "mov x28, %x[out]\n"
+ "whilelt p10.h, x17, %x[width]\n"
+ "whilelt p9.h, x17, %x[width]\n"
+ "whilelt p8.h, x17, %x[width]\n"
+ "1:" // Width loop
+ "add x27, %x[in], XZR, LSL #3\n"
+ "add x26, %x[in], x14, LSL #3\n"
+ "add x25, %x[in], x11, LSL #3\n"
+ "add x20, %x[in], x10, LSL #3\n"
+ "ldr x24, [x27], #0x8\n"
+ "mov x13, #0x0\n"
+ "ldr x23, [x26], #0x8\n"
+ "ldr x22, [x25], #0x8\n"
+ "ldr x21, [x20], #0x8\n"
+ "cbz x15, 3f\n"
+ "2:" // Loads: Loop
+ ".inst 0x25296582 // psel p2.h, p9.h/Z, p12.h[w13]\n"
+ ".inst 0x25296161 // psel p1.h, p8.h/Z, p11.h[w13]\n"
+ ".inst 0x25396580 // psel p0.h, p9.h/Z, p12.h[w13, #1]\n"
+ ".inst 0xe0502b00 // ld1h { za0h.h[x13] }, p2/Z, [x24, x16, LSL #1]\n"
+ ".inst 0x25396162 // psel p2.h, p8.h/Z, p11.h[w13, #1]\n"
+ "ldr x24, [x27], #0x8\n"
+ ".inst 0xe05026e8 // ld1h { za1h.h[x13] }, p1/Z, [x23, x16, LSL #1]\n"
+ "ldr x23, [x26], #0x8\n"
+ ".inst 0xe05022c1 // ld1h { za0h.h[x13, #1] }, p0/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x25], #0x8\n"
+ ".inst 0xe0502aa9 // ld1h { za1h.h[x13, #1] }, p2/Z, [x21, x16, LSL #1]\n"
+ "add x13, x13, #0x2\n"
+ "ldr x21, [x20], #0x8\n"
+ "cmp x13, x15, LSL #1\n"
+ "blt 2b\n"
+ "3:" // Loads: Tail
+ ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n"
+ ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n"
+ "sub x20, %x[width], x17\n"
+ ".inst 0x25396582 // psel p2.h, p9.h/Z, p12.h[w13, #1]\n"
+ "cmp x20, x9\n"
+ "mov x12, #0x0\n"
+ ".inst 0xe0502700 // ld1h { za0h.h[x13] }, p1/Z, [x24, x16, LSL #1]\n"
+ ".inst 0xe05022e8 // ld1h { za1h.h[x13] }, p0/Z, [x23, x16, LSL #1]\n"
+ ".inst 0x25396161 // psel p1.h, p8.h/Z, p11.h[w13, #1]\n"
+ "csel x20, x20, x9, LT\n"
+ "add x20, x20, #0x1\n"
+ ".inst 0xe0502ac1 // ld1h { za0h.h[x13, #1] }, p2/Z, [x22, x16, LSL #1]\n"
+ "lsr x20, x20, #0x1\n"
+ ".inst 0xe05026a9 // ld1h { za1h.h[x13, #1] }, p1/Z, [x21, x16, LSL #1]\n"
+ "4:" // Stores: Loop
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n"
+ ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n"
+ ".inst 0xe0ae8b84 // st1w { za1v.s[x12] }, p2/Z, [x28, x14, LSL #2]\n"
+ ".inst 0xe0ab8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x11, LSL #2]\n"
+ ".inst 0xe0aa838c // st1w { za3v.s[x12] }, p0/Z, [x28, x10, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "addvl x28, x28, #4\n"
+ "cmp x12, x20\n"
+ "blt 4b\n"
+ "inch x17\n"
+ "inch x16\n"
+ "whilelt p10.h, x17, %x[width]\n"
+ "whilelt p9.h, x17, %x[width]\n"
+ "whilelt p8.h, x17, %x[width]\n"
+ "b.any 1b\n"
+ "mov %x[out], x28\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp
new file mode 100644
index 0000000000..67dd5a9bb7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<4, 4, VLType::SME, false>(
+ int8_t * &out, const int8_t * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x16\n"
+ "cntw x15\n"
+ "cntw x14, ALL, MUL #2\n"
+ "cntw x13, ALL, MUL #3\n"
+ "cmp %x[height], x16\n"
+ "csel x16, %x[height], x16, LT\n"
+ "whilelt p12.b, XZR, %x[height]\n"
+ "whilelt p10.b, x15, %x[height]\n"
+ "whilelt p9.b, x14, %x[height]\n"
+ "whilelt p8.b, x13, %x[height]\n"
+ "zip1 p12.b, p12.b, p9.b\n"
+ "zip1 p10.b, p10.b, p8.b\n"
+ "mov x11, #0x0\n"
+ "cntb x10\n"
+ "ptrue p11.s\n"
+ "sub x16, x16, #0x1\n"
+ "zip1 p10.b, p12.b, p10.b\n"
+ "mov x9, %x[row_offset]\n"
+ "mov x28, %x[out]\n"
+ "whilelt p9.b, x11, %x[width]\n"
+ "whilelt p8.b, x11, %x[width]\n"
+ "1:" // Width loop
+ "add x27, %x[in], XZR, LSL #3\n"
+ "add x26, %x[in], x15, LSL #3\n"
+ "ldr x25, [x27], #0x8\n"
+ "add x24, %x[in], x14, LSL #3\n"
+ "add x23, %x[in], x13, LSL #3\n"
+ "ldr x20, [x26], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x24], #0x8\n"
+ "ldr x21, [x23], #0x8\n"
+ "cbz x16, 3f\n"
+ "2:" // Loads: Loop
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0090320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x9]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x25, [x27], #0x8\n"
+ ".inst 0xe0090281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x9]\n"
+ ".inst 0x25346141 // psel p1.b, p8.b/Z, p10.b[w12, #2]\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
+ "ldr x20, [x26], #0x8\n"
+ ".inst 0xe00906c2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x22, x9]\n"
+ "ldr x22, [x24], #0x8\n"
+ ".inst 0xe00902a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x9]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x16, LSL #2\n"
+ "ldr x21, [x23], #0x8\n"
+ "blt 2b\n"
+ "3:" // Loads: Tail
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0090320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x9]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe0090281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x9]\n"
+ ".inst 0x25346140 // psel p0.b, p8.b/Z, p10.b[w12, #2]\n"
+ "sub x20, %x[width], x11\n"
+ ".inst 0xe00902c2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x22, x9]\n"
+ "cmp x20, x10\n"
+ "csel x20, x20, x10, LT\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
+ "add x20, x20, #0x3\n"
+ ".inst 0xe00902a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x9]\n"
+ "mov x12, #0x0\n"
+ "lsr x20, x20, #0x2\n"
+ "4:" // Stores: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0af8384 // st1w { za1v.s[x12] }, p0/Z, [x28, x15, LSL #2]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0ae8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x14, LSL #2]\n"
+ ".inst 0xe0ad838c // st1w { za3v.s[x12] }, p0/Z, [x28, x13, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x20\n"
+ "addvl x28, x28, #4\n"
+ "blt 4b\n"
+ "incb x11\n"
+ "whilelt p9.b, x11, %x[width]\n"
+ "whilelt p8.b, x11, %x[width]\n"
+ "incb x9\n"
+ "b.any 1b\n"
+ "mov %x[out], x28\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
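The three zip1 instructions near the top of this block-4 kernel fold four per-group height predicates into one byte mask whose lane 4*i + g answers "is row g*VL + i in range?", and each psel then reads a single bit of that mask to enable or skip one whole row load. A model of the resulting mask, assuming group g's pointer list covers rows [g*VL, (g+1)*VL) with VL counted in words (cntw):

    #include <cstddef>
    #include <vector>

    // Model of zip1(zip1(p12, p9), zip1(p10, p8)): byte lane 4*i + g
    // of the final p10 carries the whilelt height test for row
    // g*VL + i, so each 4-byte block of a ZA row gathers one column
    // from four rows spaced VL apart, which is the block-4 layout the
    // sdot/udot GEMM kernels consume.
    std::vector<bool> zipped_height_mask(size_t height, size_t VL, size_t VLb)
    {
        std::vector<bool> mask(VLb);
        for (size_t lane = 0; lane < VLb; lane++) {
            size_t i = lane / 4;
            size_t g = lane % 4;
            mask[lane] = (g * VL + i) < height;
        }
        return mask;
    }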
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp
new file mode 100644
index 0000000000..21d9378368
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<4, 4, VLType::SME, true>(
+ int8_t * &out, const int8_t * const *in,
+ size_t width, size_t height, size_t row_offset, bool first
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x16\n"
+ "cntw x15\n"
+ "mov z24.b, #0x1\n"
+ "cntw x14, ALL, MUL #2\n"
+ "cntw x13, ALL, MUL #3\n"
+ "mov z23.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "cmp %x[height], x16\n"
+ "csel x16, %x[height], x16, LT\n"
+ "mov z21.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "whilelt p12.b, XZR, %x[height]\n"
+ "whilelt p10.b, x15, %x[height]\n"
+ "whilelt p9.b, x14, %x[height]\n"
+ "whilelt p8.b, x13, %x[height]\n"
+ "zip1 p12.b, p12.b, p9.b\n"
+ "zip1 p10.b, p10.b, p8.b\n"
+ "ptrue p2.b\n"
+ "cntb x11\n"
+ "ptrue p11.s\n"
+ "sub x16, x16, #0x1\n"
+ "zip1 p10.b, p12.b, p10.b\n"
+ "mov x10, %x[row_offset]\n"
+ "mov x9, %x[out]\n"
+ "cbnz %x[first], 1f\n"
+ "addvl x9, x9, #-4\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "ld1w { z22.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "1:" // Initialise row sums: End
+ "mov x28, #0x0\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "2:" // Width loop
+ "add x27, %x[in], XZR, LSL #3\n"
+ "add x26, %x[in], x15, LSL #3\n"
+ "ldr x25, [x27], #0x8\n"
+ "add x24, %x[in], x14, LSL #3\n"
+ "add x23, %x[in], x13, LSL #3\n"
+ "ldr x20, [x26], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x24], #0x8\n"
+ "ldr x21, [x23], #0x8\n"
+ "cbz x16, 4f\n"
+ "3:" // Loads: Loop
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe00a0320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x10]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x25, [x27], #0x8\n"
+ ".inst 0xe00a0281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x10]\n"
+ ".inst 0x25346141 // psel p1.b, p8.b/Z, p10.b[w12, #2]\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
+ "ldr x20, [x26], #0x8\n"
+ ".inst 0xe00a06c2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x22, x10]\n"
+ "ldr x22, [x24], #0x8\n"
+ ".inst 0xe00a02a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x10]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x16, LSL #2\n"
+ "ldr x21, [x23], #0x8\n"
+ "blt 3b\n"
+ "4:" // Loads: Tail
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe00a0320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x10]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe00a0281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x10]\n"
+ ".inst 0x25346140 // psel p0.b, p8.b/Z, p10.b[w12, #2]\n"
+ "sub x20, %x[width], x28\n"
+ ".inst 0xe00a02c2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x22, x10]\n"
+ "cmp x20, x11\n"
+ "csel x20, x20, x11, LT\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
+ "add x20, x20, #0x3\n"
+ ".inst 0xe00a02a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x10]\n"
+ "mov x12, #0x0\n"
+ "lsr x20, x20, #0x2\n"
+ "5:" // Stores: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8120 // st1w { za0v.s[x12] }, p0/Z, [x9, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xc0828811 // mova z17.s, p2/M, za0v.s[x12]\n"
+ ".inst 0xe0af8124 // st1w { za1v.s[x12] }, p0/Z, [x9, x15, LSL #2]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xc0828893 // mova z19.s, p2/M, za1v.s[x12]\n"
+ ".inst 0xe0ae8528 // st1w { za2v.s[x12] }, p1/Z, [x9, x14, LSL #2]\n"
+ ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n"
+ "sdot z23.s, z17.b, z24.b\n"
+ ".inst 0xe0ad812c // st1w { za3v.s[x12] }, p0/Z, [x9, x13, LSL #2]\n"
+ ".inst 0xc0828992 // mova z18.s, p2/M, za3v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x20\n"
+ "sdot z22.s, z19.b, z24.b\n"
+ "sdot z21.s, z16.b, z24.b\n"
+ "addvl x9, x9, #4\n"
+ "sdot z20.s, z18.b, z24.b\n"
+ "blt 5b\n"
+ "incb x28\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "incb x10\n"
+ "b.any 2b\n"
+ "st1w { z23.s }, p2, [x9]\n"
+ "st1w { z22.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z20.s }, p2, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "mov %x[out], x9\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
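
The _summing variant interleaves identically but also keeps a running int32 sum per interleaved row (the sdot-against-a-vector-of-ones trick above), storing the sums as a block of 4 * VL int32s after the data; when 'first' is false it first rewinds and reloads the sums written by the previous pass. A hedged scalar model of that bookkeeping, matching the generic integrate_sums path in interleave_indirect_impl.hpp:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Illustrative only: 'int_by' plays the role of 4 * VL.
    void reference_interleave_block4_sums(int8_t *&out, const int8_t *const *in,
                                          size_t width, size_t height,
                                          size_t row_offset, size_t int_by,
                                          bool first) {
        std::vector<int32_t> sums(int_by, 0);
        if (!first) {
            // Resume from the sums the previous pass appended; the new
            // interleaved data then overwrites them.
            int32_t *prev = reinterpret_cast<int32_t *>(out) - int_by;
            memcpy(sums.data(), prev, int_by * sizeof(int32_t));
            out = reinterpret_cast<int8_t *>(prev);
        }
        for (size_t pos = 0; pos < width; pos += 4) {
            for (size_t row = 0; row < int_by; row++) {
                for (size_t col = 0; col < 4; col++) {
                    int8_t v = (row < height && pos + col < width)
                                   ? in[row][row_offset + pos + col] : 0;
                    sums[row] += v;
                    *out++ = v;
                }
            }
        }
        // Append the running sums after the interleaved data.
        memcpy(out, sums.data(), int_by * sizeof(int32_t));
        out += int_by * sizeof(int32_t);
    }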
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp
new file mode 100644
index 0000000000..f149c93293
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<4, 4, VLType::SME, false>(
+ uint8_t * &out, const uint8_t * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x16\n"
+ "cntw x15\n"
+ "cntw x14, ALL, MUL #2\n"
+ "cntw x13, ALL, MUL #3\n"
+ "cmp %x[height], x16\n"
+ "csel x16, %x[height], x16, LT\n"
+ "whilelt p12.b, XZR, %x[height]\n"
+ "whilelt p10.b, x15, %x[height]\n"
+ "whilelt p9.b, x14, %x[height]\n"
+ "whilelt p8.b, x13, %x[height]\n"
+ "zip1 p12.b, p12.b, p9.b\n"
+ "zip1 p10.b, p10.b, p8.b\n"
+ "mov x11, #0x0\n"
+ "cntb x10\n"
+ "ptrue p11.s\n"
+ "sub x16, x16, #0x1\n"
+ "zip1 p10.b, p12.b, p10.b\n"
+ "mov x9, %x[row_offset]\n"
+ "mov x28, %x[out]\n"
+ "whilelt p9.b, x11, %x[width]\n"
+ "whilelt p8.b, x11, %x[width]\n"
+ "1:" // Width loop
+ "add x27, %x[in], XZR, LSL #3\n"
+ "add x26, %x[in], x15, LSL #3\n"
+ "ldr x25, [x27], #0x8\n"
+ "add x24, %x[in], x14, LSL #3\n"
+ "add x23, %x[in], x13, LSL #3\n"
+ "ldr x20, [x26], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x24], #0x8\n"
+ "ldr x21, [x23], #0x8\n"
+ "cbz x16, 3f\n"
+ "2:" // Loads: Loop
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0090320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x9]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x25, [x27], #0x8\n"
+ ".inst 0xe0090281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x9]\n"
+ ".inst 0x25346141 // psel p1.b, p8.b/Z, p10.b[w12, #2]\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
+ "ldr x20, [x26], #0x8\n"
+ ".inst 0xe00906c2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x22, x9]\n"
+ "ldr x22, [x24], #0x8\n"
+ ".inst 0xe00902a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x9]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x16, LSL #2\n"
+ "ldr x21, [x23], #0x8\n"
+ "blt 2b\n"
+ "3:" // Loads: Tail
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe0090320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x9]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe0090281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x9]\n"
+ ".inst 0x25346140 // psel p0.b, p8.b/Z, p10.b[w12, #2]\n"
+ "sub x20, %x[width], x11\n"
+ ".inst 0xe00902c2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x22, x9]\n"
+ "cmp x20, x10\n"
+ "csel x20, x20, x10, LT\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
+ "add x20, x20, #0x3\n"
+ ".inst 0xe00902a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x9]\n"
+ "mov x12, #0x0\n"
+ "lsr x20, x20, #0x2\n"
+ "4:" // Stores: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0af8384 // st1w { za1v.s[x12] }, p0/Z, [x28, x15, LSL #2]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0ae8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x14, LSL #2]\n"
+ ".inst 0xe0ad838c // st1w { za3v.s[x12] }, p0/Z, [x28, x13, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x20\n"
+ "addvl x28, x28, #4\n"
+ "blt 4b\n"
+ "incb x11\n"
+ "whilelt p9.b, x11, %x[width]\n"
+ "whilelt p8.b, x11, %x[width]\n"
+ "incb x9\n"
+ "b.any 1b\n"
+ "mov %x[out], x28\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp
new file mode 100644
index 0000000000..252152e3da
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<4, 4, VLType::SME, true>(
+ uint8_t * &out, const uint8_t * const *in,
+ size_t width, size_t height, size_t row_offset, bool first
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x16\n"
+ "cntw x15\n"
+ "mov z24.b, #0x1\n"
+ "cntw x14, ALL, MUL #2\n"
+ "cntw x13, ALL, MUL #3\n"
+ "mov z23.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "cmp %x[height], x16\n"
+ "csel x16, %x[height], x16, LT\n"
+ "mov z21.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "whilelt p12.b, XZR, %x[height]\n"
+ "whilelt p10.b, x15, %x[height]\n"
+ "whilelt p9.b, x14, %x[height]\n"
+ "whilelt p8.b, x13, %x[height]\n"
+ "zip1 p12.b, p12.b, p9.b\n"
+ "zip1 p10.b, p10.b, p8.b\n"
+ "ptrue p2.b\n"
+ "cntb x11\n"
+ "ptrue p11.s\n"
+ "sub x16, x16, #0x1\n"
+ "zip1 p10.b, p12.b, p10.b\n"
+ "mov x10, %x[row_offset]\n"
+ "mov x9, %x[out]\n"
+ "cbnz %x[first], 1f\n"
+ "addvl x9, x9, #-4\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "ld1w { z22.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "1:" // Initialise row sums: End
+ "mov x28, #0x0\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "2:" // Width loop
+ "add x27, %x[in], XZR, LSL #3\n"
+ "add x26, %x[in], x15, LSL #3\n"
+ "ldr x25, [x27], #0x8\n"
+ "add x24, %x[in], x14, LSL #3\n"
+ "add x23, %x[in], x13, LSL #3\n"
+ "ldr x20, [x26], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x24], #0x8\n"
+ "ldr x21, [x23], #0x8\n"
+ "cbz x16, 4f\n"
+ "3:" // Loads: Loop
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe00a0320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x10]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ "ldr x25, [x27], #0x8\n"
+ ".inst 0xe00a0281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x10]\n"
+ ".inst 0x25346141 // psel p1.b, p8.b/Z, p10.b[w12, #2]\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
+ "ldr x20, [x26], #0x8\n"
+ ".inst 0xe00a06c2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x22, x10]\n"
+ "ldr x22, [x24], #0x8\n"
+ ".inst 0xe00a02a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x10]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x16, LSL #2\n"
+ "ldr x21, [x23], #0x8\n"
+ "blt 3b\n"
+ "4:" // Loads: Tail
+ ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n"
+ ".inst 0xe00a0320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x10]\n"
+ ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n"
+ ".inst 0xe00a0281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x10]\n"
+ ".inst 0x25346140 // psel p0.b, p8.b/Z, p10.b[w12, #2]\n"
+ "sub x20, %x[width], x28\n"
+ ".inst 0xe00a02c2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x22, x10]\n"
+ "cmp x20, x11\n"
+ "csel x20, x20, x11, LT\n"
+ ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n"
+ "add x20, x20, #0x3\n"
+ ".inst 0xe00a02a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x10]\n"
+ "mov x12, #0x0\n"
+ "lsr x20, x20, #0x2\n"
+ "5:" // Stores: Loop
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xe0bf8120 // st1w { za0v.s[x12] }, p0/Z, [x9, XZR, LSL #2]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n"
+ ".inst 0xe0af8124 // st1w { za1v.s[x12] }, p0/Z, [x9, x15, LSL #2]\n"
+ ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n"
+ ".inst 0xc0828891 // mova z17.s, p2/M, za1v.s[x12]\n"
+ ".inst 0xe0ae8528 // st1w { za2v.s[x12] }, p1/Z, [x9, x14, LSL #2]\n"
+ ".inst 0xc0828913 // mova z19.s, p2/M, za2v.s[x12]\n"
+ "udot z23.s, z16.b, z24.b\n"
+ ".inst 0xe0ad812c // st1w { za3v.s[x12] }, p0/Z, [x9, x13, LSL #2]\n"
+ ".inst 0xc0828992 // mova z18.s, p2/M, za3v.s[x12]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x20\n"
+ "udot z22.s, z17.b, z24.b\n"
+ "udot z21.s, z19.b, z24.b\n"
+ "addvl x9, x9, #4\n"
+ "udot z20.s, z18.b, z24.b\n"
+ "blt 5b\n"
+ "incb x28\n"
+ "whilelt p9.b, x28, %x[width]\n"
+ "whilelt p8.b, x28, %x[width]\n"
+ "incb x10\n"
+ "b.any 2b\n"
+ "st1w { z23.s }, p2, [x9]\n"
+ "st1w { z22.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z20.s }, p2, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "mov %x[out], x9\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp
new file mode 100644
index 0000000000..b11bb93c42
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+template <>
+void interleave_block<4, 1, VLType::SME, false>(
+ float * &out, const float * const *in,
+ size_t width, size_t height, size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x15\n"
+ "cmp %x[height], x15\n"
+ "cntw x14\n"
+ "cntw x13, ALL, MUL #2\n"
+ "cntw x11, ALL, MUL #3\n"
+ "csel x15, %x[height], x15, LT\n"
+ "mov x10, #0x0\n"
+ "ptrue p4.s\n"
+ "sub x15, x15, #0x1\n"
+ "whilelt p3.s, XZR, %x[height]\n"
+ "whilelt p15.s, x14, %x[height]\n"
+ "whilelt p14.s, x13, %x[height]\n"
+ "whilelt p13.s, x11, %x[height]\n"
+ "mov x9, %x[row_offset]\n"
+ "mov x28, %x[out]\n"
+ "whilelt p12.s, x10, %x[width]\n"
+ "whilelt p11.s, x10, %x[width]\n"
+ "whilelt p10.s, x10, %x[width]\n"
+ "whilelt p9.s, x10, %x[width]\n"
+ "whilelt p8.s, x10, %x[width]\n"
+ "1:" // Width loop
+ "add x27, %x[in], XZR, LSL #3\n"
+ "add x26, %x[in], x14, LSL #3\n"
+ "ldr x25, [x27], #0x8\n"
+ "add x24, %x[in], x13, LSL #3\n"
+ "add x20, %x[in], x11, LSL #3\n"
+ "ldr x23, [x26], #0x8\n"
+ "mov x12, #0x0\n"
+ "ldr x22, [x24], #0x8\n"
+ "ldr x21, [x20], #0x8\n"
+ "cbz x15, 3f\n"
+ "2:" // Loads: Loop
+ ".inst 0x25306c60 // psel p0.s, p11.s/Z, p3.s[w12]\n"
+ ".inst 0x253069e2 // psel p2.s, p10.s/Z, p15.s[w12]\n"
+ ".inst 0xe0890320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x9, LSL #2]\n"
+ "ldr x25, [x27], #0x8\n"
+ ".inst 0x253065c1 // psel p1.s, p9.s/Z, p14.s[w12]\n"
+ ".inst 0x253061a0 // psel p0.s, p8.s/Z, p13.s[w12]\n"
+ ".inst 0xe0890ae4 // ld1w { za1h.s[x12] }, p2/Z, [x23, x9, LSL #2]\n"
+ "ldr x23, [x26], #0x8\n"
+ ".inst 0xe08906c8 // ld1w { za2h.s[x12] }, p1/Z, [x22, x9, LSL #2]\n"
+ "ldr x22, [x24], #0x8\n"
+ ".inst 0xe08902ac // ld1w { za3h.s[x12] }, p0/Z, [x21, x9, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x15\n"
+ "ldr x21, [x20], #0x8\n"
+ "blt 2b\n"
+ "3:" // Loads: Tail
+ "sub x20, %x[width], x10\n"
+ ".inst 0x25306c60 // psel p0.s, p11.s/Z, p3.s[w12]\n"
+ ".inst 0xe0890320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x9, LSL #2]\n"
+ ".inst 0x253069e0 // psel p0.s, p10.s/Z, p15.s[w12]\n"
+ ".inst 0x253065c1 // psel p1.s, p9.s/Z, p14.s[w12]\n"
+ ".inst 0xe08902e4 // ld1w { za1h.s[x12] }, p0/Z, [x23, x9, LSL #2]\n"
+ ".inst 0x253061a0 // psel p0.s, p8.s/Z, p13.s[w12]\n"
+ "cmp x20, x14\n"
+ ".inst 0xe08906c8 // ld1w { za2h.s[x12] }, p1/Z, [x22, x9, LSL #2]\n"
+ ".inst 0xe08902ac // ld1w { za3h.s[x12] }, p0/Z, [x21, x9, LSL #2]\n"
+ "mov x12, #0x0\n"
+ "csel x20, x20, x14, LT\n"
+ "4:" // Stores: Loop
+ ".inst 0x25305180 // psel p0.s, p4.s/Z, p12.s[w12]\n"
+ ".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n"
+ ".inst 0x25305180 // psel p0.s, p4.s/Z, p12.s[w12]\n"
+ ".inst 0xe0ae8384 // st1w { za1v.s[x12] }, p0/Z, [x28, x14, LSL #2]\n"
+ ".inst 0x25305181 // psel p1.s, p4.s/Z, p12.s[w12]\n"
+ ".inst 0x25305180 // psel p0.s, p4.s/Z, p12.s[w12]\n"
+ ".inst 0xe0ad8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x13, LSL #2]\n"
+ ".inst 0xe0ab838c // st1w { za3v.s[x12] }, p0/Z, [x28, x11, LSL #2]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x20\n"
+ "addvl x28, x28, #4\n"
+ "blt 4b\n"
+ "incw x10\n"
+ "whilelt p12.s, x10, %x[width]\n"
+ "whilelt p11.s, x10, %x[width]\n"
+ "whilelt p10.s, x10, %x[width]\n"
+ "whilelt p9.s, x10, %x[width]\n"
+ "whilelt p8.s, x10, %x[width]\n"
+ "incw x9\n"
+ "b.any 1b\n"
+ "mov %x[out], x28\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [out] "+&r" (out)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
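
With a block depth of 1, as in this fp32 kernel, the interleave reduces to a plain transpose of the 4 * VL row panel: each output group holds one float from each row. A sketch under the same illustrative conventions as above:

    #include <cstddef>

    void reference_interleave_fp32(float *&out, const float *const *in,
                                   size_t width, size_t height,
                                   size_t row_offset, size_t int_by) {
        for (size_t pos = 0; pos < width; pos++) {
            for (size_t row = 0; row < int_by; row++) {
                *out++ = (row < height) ? in[row][row_offset + pos] : 0.0f;
            }
        }
    }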
diff --git a/src/core/NEON/kernels/arm_gemm/interleave-8way.cpp b/src/core/NEON/kernels/arm_gemm/interleave-8way.cpp
new file mode 100644
index 0000000000..a05d700c5e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/interleave-8way.cpp
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#if !defined(_WIN64) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
+
+#include <cstring>
+
+#include "transform.hpp"
+#include "utils.hpp"
+
+namespace arm_gemm {
+
+namespace {
+
+// Helper function to interleave a single 4x4 block of 32-bit values
+// together.
+
+// _full version doesn't need to worry about any padding.
+static inline void transpose_block_32_full(const uint8_t * __restrict in_ptr0, const uint8_t * __restrict in_ptr1, const uint8_t * __restrict in_ptr2, const uint8_t * __restrict in_ptr3, uint8_t * __restrict out_ptr, long output_stride) {
+ uint32x4_t inputs[4];
+ uint32x4_t inters[4];
+ uint32x4_t outputs[4];
+
+ inputs[0] = vld1q_u32(reinterpret_cast<const uint32_t *>(in_ptr0));
+ inputs[1] = vld1q_u32(reinterpret_cast<const uint32_t *>(in_ptr1));
+ inputs[2] = vld1q_u32(reinterpret_cast<const uint32_t *>(in_ptr2));
+ inputs[3] = vld1q_u32(reinterpret_cast<const uint32_t *>(in_ptr3));
+
+ inters[0] = vzip1q_u32(inputs[0], inputs[2]);
+ inters[1] = vzip2q_u32(inputs[0], inputs[2]);
+ inters[2] = vzip1q_u32(inputs[1], inputs[3]);
+ inters[3] = vzip2q_u32(inputs[1], inputs[3]);
+
+ outputs[0] = vzip1q_u32(inters[0], inters[2]);
+ outputs[1] = vzip2q_u32(inters[0], inters[2]);
+ outputs[2] = vzip1q_u32(inters[1], inters[3]);
+ outputs[3] = vzip2q_u32(inters[1], inters[3]);
+
+ vst1q_u32(reinterpret_cast<uint32_t *>(out_ptr), outputs[0]);
+ vst1q_u32(reinterpret_cast<uint32_t *>(out_ptr + output_stride), outputs[1]);
+ vst1q_u32(reinterpret_cast<uint32_t *>(out_ptr + output_stride*2), outputs[2]);
+ vst1q_u32(reinterpret_cast<uint32_t *>(out_ptr + output_stride*3), outputs[3]);
+}
+
+// _part version: Only read "bytes_in" bytes, not a full vector. Only write
+// out 4-byte blocks that have some live content (if bytes_in is not a
+// multiple of 4 there will be some padding in each 4-block)
+static inline void transpose_block_32_part(const uint8_t *in_ptr0, const uint8_t *in_ptr1, const uint8_t *in_ptr2, const uint8_t *in_ptr3, uint8_t *out_ptr, long bytes_in, long output_stride) {
+ uint32x4_t inputs[4];
+ uint32x4_t inters[4];
+ uint32x4_t outputs[4];
+ uint8_t scratch[16] = {0};
+
+ long num_outs = iceildiv<long>(bytes_in, 4);
+
+ memcpy(scratch, in_ptr0, bytes_in);
+ inputs[0] = vld1q_u32(reinterpret_cast<const uint32_t *>(scratch));
+ memcpy(scratch, in_ptr1, bytes_in);
+ inputs[1] = vld1q_u32(reinterpret_cast<const uint32_t *>(scratch));
+ memcpy(scratch, in_ptr2, bytes_in);
+ inputs[2] = vld1q_u32(reinterpret_cast<const uint32_t *>(scratch));
+ memcpy(scratch, in_ptr3, bytes_in);
+ inputs[3] = vld1q_u32(reinterpret_cast<const uint32_t *>(scratch));
+
+ inters[0] = vzip1q_u32(inputs[0], inputs[2]);
+ inters[1] = vzip2q_u32(inputs[0], inputs[2]);
+ inters[2] = vzip1q_u32(inputs[1], inputs[3]);
+ inters[3] = vzip2q_u32(inputs[1], inputs[3]);
+
+ outputs[0] = vzip1q_u32(inters[0], inters[2]);
+ outputs[1] = vzip2q_u32(inters[0], inters[2]);
+ outputs[2] = vzip1q_u32(inters[1], inters[3]);
+ outputs[3] = vzip2q_u32(inters[1], inters[3]);
+
+ do {
+ vst1q_u32(reinterpret_cast<uint32_t *>(out_ptr), outputs[0]);
+ if (num_outs < 2)
+ break;
+ vst1q_u32(reinterpret_cast<uint32_t *>(out_ptr + output_stride), outputs[1]);
+ if (num_outs < 3)
+ break;
+ vst1q_u32(reinterpret_cast<uint32_t *>(out_ptr + output_stride*2), outputs[2]);
+ if (num_outs < 4)
+ break;
+ vst1q_u32(reinterpret_cast<uint32_t *>(out_ptr + output_stride*3), outputs[3]);
+ } while (0);
+}
+
+template<unsigned N>
+struct Unroll {
+ template<typename F>
+ static void run(F f) {
+ Unroll<N-1>::run(f);
+ f(N-1);
+ }
+};
+
+template<>
+struct Unroll<0> {
+ template<typename F>
+ static void run(F) {
+ }
+};
+
+// Interleave some multiple of 4 rows together.
+//
+// The template parameter BLOCKS controls the size of the inner loop - each BLOCK is 4 rows.
+// The function parameter interleave_multiple controls the number of times the inner loop is run.
+
+// The total interleave depth for a given run is therefore BLOCKS * interleave_multiple * 4.
+template<unsigned BLOCKS>
+void a64_interleave_1x4(uint8_t *out, const uint8_t *in, long width, long in_stride, long height, long interleave_multiple) {
+ const long total_interleave_depth = BLOCKS * 4 * interleave_multiple;
+ constexpr long loop_interleave_depth = BLOCKS * 4;
+
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width));
+
+ if (height % total_interleave_depth) {
+ memset(pad_row, 0, width);
+ }
+
+ // Outer loop: process blocks of total_interleave_depth rows at a time.
+ for (long y0_base=0; y0_base<height; y0_base+=total_interleave_depth) {
+ // Middle loop: process each of the "interleave_multiple" blocks of rows.
+ for (long block=0; block<interleave_multiple; block++) {
+ const long y0 = y0_base + (block * loop_interleave_depth);
+ uint8_t *out_ptr = out + (block * loop_interleave_depth * 4); // 4 is the blocking depth (we interleave 4 bytes at a time from each input)
+
+ // Create and set up input row pointers. The idea is that these
+ // should entirely fit in the register file, so we don't have to
+ // repeatedly load them (or perform the padding check)
+ const uint8_t *in_ptrs[loop_interleave_depth];
+ Unroll<loop_interleave_depth>::run( [&](unsigned y) {
+ in_ptrs[y] = (y+y0 < height) ? in + ((y+y0) * in_stride) : pad_row;
+ });
+
+ long bytes_left = width;
+ // Process full vectors using transpose_block_32_full()
+ while (bytes_left >= 16) { // 16 is the vector length in bytes
+ Unroll<BLOCKS>::run( [&](unsigned u) {
+ transpose_block_32_full(in_ptrs[u*4 + 0], in_ptrs[u*4 + 1], in_ptrs[u*4 + 2], in_ptrs[u*4 + 3],
+ out_ptr + 16*u, total_interleave_depth * 4); // 4 is the blocking depth
+ });
+
+ Unroll<loop_interleave_depth>::run( [&](unsigned y) {
+ in_ptrs[y] += 16; // 16 is the vector length in bytes
+ });
+
+ out_ptr += total_interleave_depth * 16; // 16 is the vector length in bytes
+ bytes_left -= 16; // 16 is the vector length in bytes
+ }
+
+ // Process any remaining bytes using transpose_block_32_part()
+ if (bytes_left) {
+ Unroll<BLOCKS>::run( [&](unsigned u) {
+ transpose_block_32_part(in_ptrs[u*4 + 0], in_ptrs[u*4 + 1], in_ptrs[u*4 + 2], in_ptrs[u*4 + 3],
+ out_ptr + 16*u, bytes_left, total_interleave_depth * 4);
+ });
+ }
+ }
+
+ // Update "out" pointer for next set of total_interleave_depth rows
+ out += total_interleave_depth * roundup<long>(width, 4);
+ }
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 4, false, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int y0, int ymax, int x0, int xmax)
+{
+ a64_interleave_1x4<4>(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + y0 * stride + x0),
+ (xmax - x0),
+ stride,
+ (ymax - y0),
+ 1
+ );
+}
+
+template<>
+void Transform<16, 4, false, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int y0, int ymax, int x0, int xmax)
+{
+ a64_interleave_1x4<4>(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + y0 * stride + x0),
+ (xmax - x0),
+ stride,
+ (ymax - y0),
+ 1
+ );
+}
+
+template<>
+void Transform<12, 1, false, VLType::None>(
+ float *out, const float *in, int stride, int y0, int ymax, int x0, int xmax)
+{
+ a64_interleave_1x4<3>(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + y0 * stride + x0),
+ (xmax - x0) * sizeof(float),
+ stride * sizeof(float),
+ (ymax - y0),
+ 1
+ );
+}
+
+template<>
+void Transform<16, 1, false, VLType::None>(
+ float *out, const float *in, int stride, int y0, int ymax, int x0, int xmax)
+{
+ a64_interleave_1x4<4>(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + y0 * stride + x0),
+ (xmax - x0) * sizeof(float),
+ stride * sizeof(float),
+ (ymax - y0),
+ 1
+ );
+}
+
+template<>
+void Transform<24, 1, false, VLType::None>(
+ float *out, const float *in, int stride, int y0, int ymax, int x0, int xmax)
+{
+ a64_interleave_1x4<3>(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + y0 * stride + x0),
+ (xmax - x0) * sizeof(float),
+ stride * sizeof(float),
+ (ymax - y0),
+ 2
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
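
The Unroll<N> helper in this file is a small compile-time loop: Unroll<N>::run(f) expands to f(0) ... f(N-1), which is what lets the in_ptrs[] array stay in registers rather than being re-indexed at run time. A standalone sketch of the same idiom, replicating the struct from the file above:

    #include <cstdio>

    template <unsigned N>
    struct Unroll {
        template <typename F>
        static void run(F f) { Unroll<N - 1>::run(f); f(N - 1); }
    };

    template <>
    struct Unroll<0> {
        template <typename F>
        static void run(F) {}
    };

    int main() {
        // Expands at compile time to f(0); f(1); f(2); f(3);
        Unroll<4>::run([](unsigned i) { std::printf("lane %u\n", i); });
        return 0;
    }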
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect-sve.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect-sve.cpp
new file mode 100644
index 0000000000..7ed8af9b45
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect-sve.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "asmlib.hpp"
+#include "convolution_parameters.hpp"
+#include "convolver.hpp"
+#include "interleave_indirect.hpp"
+#include "bfloat.hpp"
+
+#include <alloca.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#include <arm_neon.h>
+
+#include "utils.hpp"
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME
+namespace arm_gemm {
+
+#include "interleave_indirect_impl.hpp"
+
+#include "indirect-interleaves/list-sve.hpp"
+
+/**** Instantiate needed implementations ****/
+
+/* FP32: SME implementations (height 1VL, 2VL, 4VL) */
+template void IndirectInterleave<2, 1, VLType::SME>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<2, 1, VLType::SME>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<2, 1, VLType::SME>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<1, 1, VLType::SME>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<1, 1, VLType::SME>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<1, 1, VLType::SME>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<4, 1, VLType::SME>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<4, 1, VLType::SME>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<4, 1, VLType::SME>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* BF16: SME implementations (height 1VL, 2VL, 4VL) */
+template void IndirectInterleave<2, 2, VLType::SME>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<2, 2, VLType::SME>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<2, 2, VLType::SME>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<1, 2, VLType::SME>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<1, 2, VLType::SME>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<1, 2, VLType::SME>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<4, 2, VLType::SME>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<4, 2, VLType::SME>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<4, 2, VLType::SME>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* BF16: SME implementations, narrow accumulators (no blocking) (height 1VL, 2VL) */
+template void IndirectInterleave<2, 1, VLType::SME>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<2, 1, VLType::SME>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<2, 1, VLType::SME>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<1, 1, VLType::SME>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<1, 1, VLType::SME>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<1, 1, VLType::SME>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* FP16: SME implementations, narrow accumulators (no blocking) (height 1VL, 2VL) */
+template void IndirectInterleave<2, 1, VLType::SME>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<2, 1, VLType::SME>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<2, 1, VLType::SME>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<1, 1, VLType::SME>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<1, 1, VLType::SME>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<1, 1, VLType::SME>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* FP16: SME implementations with dot-product type outer product */
+template void IndirectInterleave<2, 2, VLType::SME>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<2, 2, VLType::SME>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<2, 2, VLType::SME>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<1, 2, VLType::SME>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<1, 2, VLType::SME>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<1, 2, VLType::SME>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<4, 2, VLType::SME>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<4, 2, VLType::SME>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<4, 2, VLType::SME>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* FP32 fast-mode: SME implementations */
+template void IndirectInterleave<1, 2, VLType::SME>(bfloat16 *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<1, 2, VLType::SME>(bfloat16 *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<1, 2, VLType::SME>(bfloat16 *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<2, 2, VLType::SME>(bfloat16 *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<2, 2, VLType::SME>(bfloat16 *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<2, 2, VLType::SME>(bfloat16 *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<4, 2, VLType::SME>(bfloat16 *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<4, 2, VLType::SME>(bfloat16 *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<4, 2, VLType::SME>(bfloat16 *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* INT8: SME implementation (height 1VL, 2VL, 4VL) */
+template void IndirectInterleave<1, 4, VLType::SME>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<1, 4, VLType::SME>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<1, 4, VLType::SME>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<2, 4, VLType::SME>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<2, 4, VLType::SME>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<2, 4, VLType::SME>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<4, 4, VLType::SME>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<4, 4, VLType::SME>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<4, 4, VLType::SME>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* UINT8: SME implementation (height 1VL, 2VL, 4VL) */
+template void IndirectInterleave<1, 4, VLType::SME>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<1, 4, VLType::SME>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<1, 4, VLType::SME>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<2, 4, VLType::SME>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<2, 4, VLType::SME>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<2, 4, VLType::SME>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<4, 4, VLType::SME>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<4, 4, VLType::SME>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<4, 4, VLType::SME>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME
+#endif // ARM_COMPUTE_ENABLE_SVE
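
This file contains no definitions of its own: the template bodies come from interleave_indirect_impl.hpp and the per-shape kernels from indirect-interleaves/list-sve.hpp, and the explicit "template void ..." lines force the compiler to emit each needed SVE/SME variant in this one translation unit. A minimal sketch of that C++ pattern (illustrative, not from the patch):

    // sketch.cpp
    template <typename T>
    T triple(T x) { return x * 3; }   // generic body, normally in a header

    // Explicit instantiations: emit these symbols here so other translation
    // units can link against them without compiling the template body.
    template int   triple<int>(int);
    template float triple<float>(float);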
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
index 0d56b46e19..7c09608e3e 100644
--- a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,9 @@
#include "interleave_indirect.hpp"
#include "bfloat.hpp"
+#if !defined(_WIN64) && !defined(__OpenBSD__)
#include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
#include <algorithm>
#include <cstddef>
@@ -320,19 +322,19 @@ template void IndirectInterleave<8, 1, VLType::None>(float *, const float * cons
template void ConvolutionInterleave<8, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#if defined(__ARM_FEATURE_SVE) && defined(MMLA_FP32)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVEF32MM)
/* FMMLA */
template void IndirectInterleave<8, 2, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 2, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 2, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#endif // SVE && MMLA_FP32
+#endif // ARM_COMPUTE_ENABLE_SVE && ARM_COMPUTE_ENABLE_SVEF32MM
/* FP16 */
-#if defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(FP16_KERNELS) || defined(ARM_COMPUTE_ENABLE_FP16)
template void IndirectInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#endif // FP16_KERNELS ar __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // FP16_KERNELS or ARM_COMPUTE_ENABLE_FP16
template void IndirectInterleave<8, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
@@ -340,7 +342,7 @@ template void Interleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, un
/* BF16 */
/* Arm® Neon™/SVE BFDOT */
-#ifdef V8P6_BF
+#ifdef ARM_COMPUTE_ENABLE_BF16
template void IndirectInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
@@ -348,7 +350,11 @@ template void Interleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_
template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#endif // V8P6_BF
+
+template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 4, VLType::None>(bfloat16 *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+#endif // ARM_COMPUTE_ENABLE_BF16
/* Arm® Neon™/SVE using FP32 kernel */
template void IndirectInterleave<8, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
@@ -375,12 +381,10 @@ template void IndirectInterleave<8, 4, VLType::None>(int8_t *, const int8_t * co
template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#ifdef MMLA_INT8
/* MMLA SMMLA (height 8, block 8) */
template void IndirectInterleave<8, 8, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#endif // MMLA_INT8
/* Arm® Neon™ SDOT (height 8, block 1) */
template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
@@ -397,12 +401,10 @@ template void IndirectInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *
template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#ifdef MMLA_INT8
/* MMLA SMMLA (height 8, block 8) */
template void IndirectInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#endif // MMLA_INT8
/* Arm® Neon™ 16-bit (height 8, block 1) */
template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp
new file mode 100644
index 0000000000..b921fd16d2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+// Implementations of interleave functions
+// These must be included inside a "namespace arm_gemm" block.
+
+/*
+ * Core function that does the heavy lifting - interleaves 'int_by' rows of width 'width' together.
+ *
+ * 'height' indicates the actual number of rows to interleave, so if it's less than int_by then the remaining
+ * entries are padded (note that this is "GEMM" padding rather than convolution padding, so there is no need to pad
+ * with a particular value).
+ *
+ * Note that it is not expected for this templated version to ever be used - all cases that matter should be
+ * explicitly specialized with an optimized implementation.
+ */
+template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
+void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
+ const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
+ (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int int_by = height_vectors;
+#endif
+
+ std::vector<int32_t> the_sums;
+
+ if (integrate_sums) {
+ the_sums = std::vector<int32_t>(int_by, 0);
+
+ if (!first) {
+ // In 'integrate sums' mode, we dump the sums at the end on each pass.
+
+ // On the last pass this is correct, but on other passes it is not -
+ // so on the subsequent pass we need to take the output written by
+ // the previous pass as starting point for the sums, and then
+ // overwrite them with new interleaved data.
+ int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+ // Rewind pointer to where we wrote out the sums last time.
+ out_int32 -= int_by;
+
+ // Restore the running sums.
+ memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));
+
+ // Update the "real" pointer so that the next output will clobber the old sums.
+ out = reinterpret_cast<TOut *>(out_int32);
+ }
+ }
+
+ for (unsigned int pos=0; pos<width; pos+=block) {
+ for (unsigned int row=0; row<int_by; row++) {
+ // Row out of range - pad 'block' entries.
+ if (row >= height) {
+ for (unsigned int col=0; col<block; col++) {
+ *out++ = 0;
+ }
+ continue;
+ }
+
+ for (unsigned int col=0; col<block; col++) {
+ // Column out of range - pad a single entry
+ if (pos + col >= width) {
+ *out++ = 0;
+ continue;
+ }
+
+ if (integrate_sums) {
+ the_sums[row] += in[row][row_offset + pos + col];
+ }
+
+ *out++ = in[row][row_offset + pos + col];
+ }
+ }
+ }
+
+ if (integrate_sums) {
+ int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+ memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t));
+
+ out = reinterpret_cast<TOut *>(out_int32 + int_by);
+ }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
+inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
+ const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
+ (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int height = height_vectors;
+#endif
+
+ // If we are integrating row sums, we need to do some fixup, depending on whether the multiplier is non-zero.
+ if (row_sum_multiplier) {
+ // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
+ // next block (post sums).
+ // We need to go back and apply the multiplier to the computed sums. We don't need to change 'out'.
+ int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+ out_int32 -= height;
+ for (unsigned int i=0; i<height; i++) {
+ out_int32[i] *= row_sum_multiplier;
+ }
+ } else {
+ // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
+ // sum block. We need to insert the (zero) sums, and advance 'out'.
+ int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+ for (unsigned int i=0; i<height; i++) {
+ out_int32[i] = 0;
+ }
+
+ out_int32 += height;
+
+ out = reinterpret_cast<TOut *>(out_int32);
+ }
+}
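+
+/*
+ * Illustrative layout, as implied by interleave_block() and FixupRowSums() above (not a documented
+ * interface): a block interleaved with integrated sums is stored as
+ *
+ *   [ height x width interleaved values | height x int32 row sums ]
+ *
+ * With a non-zero row_sum_multiplier (e.g. a quantization offset correction), the stored sums are
+ * rescaled in place; with a zero multiplier, 'height' zero sums are written and 'out' advances past them.
+ */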
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen,
+ unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
+ const unsigned int k0, const unsigned int kmax, bool integrate_sums,
+ const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
+ const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
+ (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int height = height_vectors;
+#endif
+
+ // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
+ // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
+ // out of range rows). This allows interleave_block to use techniques like row predication, or loading all
+ // pointers and conditionally overriding the out of range ones.
+
+ // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
+ // range reads. Avoid this with a local buffer to use in last-rows cases. Use alloca as a std::vector can be
+ // expensive in highly threaded scenarios.
+ const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
+
+ // Figure out the starting position based on k0 (with rounded length)
+ unsigned int start_string = k0 / rounded_stringlen;
+ unsigned int start_stringpos = k0 % rounded_stringlen;
+
+ // Process blocks of 'height' height...
+ for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
+ // Height to process
+ unsigned int active_height = std::min(ymax - ybase, height);
+
+ // Track our progress through the various strings
+ unsigned int k_left = (kmax - k0);
+ unsigned int string = start_string;
+ unsigned int stringpos = start_stringpos;
+
+ bool first = true;
+
+ // Prepare to call 'interleave_block' above for each string encompassed by K range
+ while (k_left > 0) {
+ // Width to process - and the width we will generate (with padding)
+ unsigned int in_width = std::min(k_left, stringlen - stringpos);
+ unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);
+
+ const TIn * const *row_base = ptr[string] + ybase;
+
+ // If not all rows are valid, copy the ones that are into local array (see above comment).
+ if (active_height < height) {
+ for (unsigned int i=0; i<active_height; i++) {
+ row_ptrs[i] = ptr[string][ybase + i];
+ }
+
+ row_base = row_ptrs;
+ }
+
+ // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
+ // much code. However, integrated sums make no sense for non-integral types and won't ever be
+ // requested. So put a type trait check here to avoid generating pointless code.
+ if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
+ interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
+ } else {
+ interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
+ }
+
+ k_left -= out_width;
+ string++;
+ stringpos=0;
+ first=false;
+ }
+
+ if (std::is_integral<TOut>::value && integrate_sums) {
+ FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
+ }
+ }
+}
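+
+/*
+ * Worked example of the k0 -> (string, position) mapping above: with stringlen = 9 rounded up to
+ * rounded_stringlen = 12, a starting column k0 = 15 gives start_string = 15 / 12 = 1 and
+ * start_stringpos = 15 % 12 = 3, i.e. interleaving begins 3 elements into the second string.
+ */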
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
+ const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
+ const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
+ (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int height = height_vectors;
+#endif
+ auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
+
+ // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
+ const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
+
+ for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
+ // How many of the rows are active - the rest will get padded in interleave_block.
+ unsigned int active_height = std::min(ymax - ybase, height);
+ bool first = true;
+
+ auto conv_rows = conv_cols.process_rows(ybase, active_height);
+
+ while (!conv_rows.finished()) {
+ unsigned int width, offset;
+
+ // Get next set of parameters
+ std::tie(width, offset) = conv_rows.next_block(row_ptrs);
+
+ // Perform the interleave
+ if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
+ interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
+ } else {
+ interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
+ }
+
+ first=false;
+ }
+
+ if (std::is_integral<TOut>::value && integrate_sums) {
+ FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
+ }
+ }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
+ const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
+ (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int height = height_vectors;
+#endif
+ // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
+ const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
+
+ const unsigned int width=kmax-k0;
+
+ for (unsigned int y=y0; y<ymax; y+=height) {
+ for (unsigned int r=0; r<height; r++) {
+ row_ptrs[r] = in + ((y + r) * in_stride);
+ }
+
+ if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
+ interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
+ } else {
+ interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
+ }
+
+ if (std::is_integral<TOut>::value && integrate_sums) {
+ FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
+ }
+ }
+}
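+
+/*
+ * Minimal usage sketch (illustrative only; 'src', 'dst', 'ldin', 'M' and 'K' are hypothetical caller
+ * variables). Interleaving a full matrix into 8-row blocks with no sums, matching the
+ * <8, 1, VLType::None> instantiations:
+ *
+ *   Interleave<8, 1, VLType::None>(dst, src, ldin, 0, M, 0, K, false, 0);
+ *
+ * Rows beyond M in the final 8-row block are zero padded by interleave_block().
+ */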
diff --git a/src/core/NEON/kernels/arm_gemm/kernel_traits.hpp b/src/core/NEON/kernels/arm_gemm/kernel_traits.hpp
new file mode 100644
index 0000000000..24c304ab5f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernel_traits.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <type_traits>
+
+namespace arm_gemm
+{
+
+namespace
+{
+ template <class T>
+ constexpr auto is_sme_impl(int)
+ -> decltype(T::is_sme(), std::true_type{})
+ {
+ return std::true_type{};
+ }
+
+ template <class>
+ constexpr auto is_sme_impl(...) -> std::false_type
+ {
+ return std::false_type{};
+ }
+}
+
+template <class T>
+struct is_sme
+{
+ static constexpr auto value = std::is_same<decltype(is_sme_impl<T>(0)),
+ std::true_type>::value;
+};
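+
+// Illustrative use of the detection idiom above (hypothetical types, not part of this header): a kernel
+// class advertises SME by exposing a static is_sme() member, which the unevaluated T::is_sme() call in
+// is_sme_impl detects:
+//
+//   struct sme_kernel { static bool is_sme() { return true; } };
+//   static_assert(is_sme<sme_kernel>::value, "detected via is_sme()");
+//   static_assert(!is_sme<int>::value, "no is_sme() member");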
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernel_weight_format.hpp b/src/core/NEON/kernels/arm_gemm/kernel_weight_format.hpp
new file mode 100644
index 0000000000..6b89dd0d73
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernel_weight_format.hpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm {
+
+/* Internal enum to define the weight format a kernel is expecting.
+ *
+ * This is distinct from the "external" WeightFormat defined in arm_gemm.hpp primarily to allow for SVE, where
+ * internally kernels are defined in terms of multiples of the SVE vector length, but externally they are converted
+ * to a fixed format (based on the VL of the machine we are running on).
+ *
+ * Encoded as a bitfield:
+ * bit 0 : SVE flag
+ * bit 4 : BF16 convert flag (fast mode)
+ * bits 11-8 : block length (bytes)
+ * bits 15-12: vector count
+ */
+enum class KernelWeightFormat {
+ NON_FIXED = 0,
+ VL128_BL16 = 0x1200,
+ VL128_BL32 = 0x1400,
+ VL128_BL32_BF16 = 0x1410,
+ VL128_BL64 = 0x1800,
+ VL256_BL64 = 0x2800,
+ VL256_BL64_BF16 = 0x2810,
+ VL1VL_BL16 = 0x1201,
+ VL1VL_BL32 = 0x1401,
+ VL1VL_BL32_BF16 = 0x1411,
+ VL1VL_BL64 = 0x1801,
+ VL2VL_BL64 = 0x2801,
+ VL2VL_BL64_BF16 = 0x2811
+};
+
+WeightFormat get_weight_format(const KernelWeightFormat, size_t);
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp
new file mode 100644
index 0000000000..72e414969e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<bfloat16>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ size_t, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_ffhybrid_bf16fp32_mmla_6x16( ARGLIST );
+
+class cls_a64_ffhybrid_bf16fp32_mmla_6x16
+{
+public:
+ typedef bfloat16 lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+ static unsigned int stripe_width()
+ {
+ return 4;
+ }
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL256_BL64;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 37.09 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_ffhybrid_bf16fp32_mmla_6x16;
+ cls_a64_ffhybrid_bf16fp32_mmla_6x16(const CPUInfo *)
+ {
+ }
+};
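+
+// Shape summary, read off the parameters above: each call produces a 6 x 16 fp32 output tile, consuming
+// K in multiples of k_unroll() = 4 bf16 elements, with the fixed-format B panel described by
+// KernelWeightFormat::VL256_BL64 (stripe_width() = 4 presumably being the column granularity per stripe).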
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..377daddae9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp
@@ -0,0 +1,3807 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_ffhybrid_bf16fp32_mmla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, size_t B_stride, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ const bfloat16 *cur_B_ptr = {};
+ size_t B_stride = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ ka.B_stride = B_stride;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
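+
+ // Flag bits consumed by the assembly below, as set above: bit 0 = accumulate into existing output,
+ // bit 1 = apply the min/max activation clamp, bit 2 = indirect output, bit 3 = indirect (multi-string)
+ // input. The kernel tests these with tbz on %x[flags].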
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 191f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 153f\n"
+ "beq 115f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 77f\n"
+ "beq 39f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
+ "bgt 3f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 3f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 3f\n"
+ "mov x11, x12\n"
+ "3:" // Height 1: B setup done
+ "cbz x15, 4f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x15, x15, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "b 16f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 15f\n"
+ "cmp x14, #0x10\n"
+ "bge 13f\n"
+ "tbz x14, #3, 8f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "tbz x14, #2, 6f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 5f\n"
+ "ldr d16, [x13], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 12f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "b 12f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 12f\n"
+ "ldr s16, [x13, #0x0]\n"
+ "b 12f\n"
+ "6:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x14, #1, 7f\n"
+ "ldr d11, [x13], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 12f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "b 12f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 12f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "b 12f\n"
+ "8:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x14, #2, 10f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 9f\n"
+ "ldr d10, [x13], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 12f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "b 12f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 12f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "b 12f\n"
+ "10:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x14, #1, 11f\n"
+ "ldr d9, [x13], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 12f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "b 12f\n"
+ "11:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "mov x20, #0x0\n"
+ "12:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 14f\n"
+ "13:" // Height 1: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q16, [x13, #0x30]\n"
+ "14:" // Height 1: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 16f\n"
+ "15:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "16:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "17:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 18f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 19f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "b 19f\n"
+ "18:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "19:" // Height 1: input setup done
+ "cmp x27, #0x8\n"
+ "blt 22f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q7, [x12, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q6, [x12, #0x10]\n"
+ "blt 21f\n"
+ "20:" // Height 1: Multiply loop: Main loop head
+ "trn1 v20.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e47ee88 // bfmmla v8.4s, v20.8h, v7.8h\n"
+ "ldr q17, [x11, #0x0]\n"
+ ".inst 0x6e46ee8c // bfmmla v12.4s, v20.8h, v6.8h\n"
+ "ldr q19, [x11, #0x10]\n"
+ ".inst 0x6e51ee89 // bfmmla v9.4s, v20.8h, v17.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e53ee8d // bfmmla v13.4s, v20.8h, v19.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee8a // bfmmla v10.4s, v20.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ee8e // bfmmla v14.4s, v20.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e52ee8b // bfmmla v11.4s, v20.8h, v18.8h\n"
+ "ldr q18, [x12, #0x20]\n"
+ ".inst 0x6e51ee8f // bfmmla v15.4s, v20.8h, v17.8h\n"
+ "ldr q17, [x12, #0x30]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x11, #0x20]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x11, #0x30]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x9, #0x20]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x9, #0x30]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
+ "ldr q1, [x26, #0x0]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x12, #0x0]\n"
+ "add x11, x11, #0x40\n"
+ "ldr q6, [x12, #0x10]\n"
+ "add x10, x10, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "bge 20b\n"
+ "21:" // Height 1: Multiply loop: Single iteration only
+ "trn1 v19.2d, v1.2d, v20.2d\n"
+ ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
+ "ldr q17, [x11, #0x0]\n"
+ ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
+ "ldr q18, [x11, #0x10]\n"
+ ".inst 0x6e51ee69 // bfmmla v9.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x0]\n"
+ ".inst 0x6e52ee6d // bfmmla v13.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x10]\n"
+ ".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x9, #0x0]\n"
+ ".inst 0x6e52ee6e // bfmmla v14.4s, v19.8h, v18.8h\n"
+ "ldr q24, [x9, #0x10]\n"
+ "trn2 v1.2d, v1.2d, v20.2d\n"
+ ".inst 0x6e51ee6b // bfmmla v11.4s, v19.8h, v17.8h\n"
+ "ldr q18, [x12, #0x20]\n"
+ ".inst 0x6e58ee6f // bfmmla v15.4s, v19.8h, v24.8h\n"
+ "ldr q17, [x12, #0x30]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q19, [x11, #0x20]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x11, #0x30]\n"
+ ".inst 0x6e53ec29 // bfmmla v9.4s, v1.8h, v19.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x9, #0x20]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x9, #0x30]\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
+ "add x26, x26, #0x10\n"
+ "add x12, x12, #0x40\n"
+ "add x11, x11, #0x40\n"
+ "add x10, x10, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "22:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 27f\n"
+ "cmp x27, #0x4\n"
+ "blt 24f\n"
+ "23:" // Height 1: Multiply loop: Odd block loop
+ "ldr d19, [x26], #0x8\n"
+ "ldr q18, [x12, #0x0]\n"
+ "trn1 v19.2d, v19.2d, v17.2d\n"
+ "ldr q17, [x12, #0x10]\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "bge 23b\n"
+ "24:" // Height 1: Multiply loop: Skip odd blocks
+ "cbz x27, 27f\n"
+ "tbz x27, #1, 25f\n"
+ "ldr s1, [x26], #0x4\n"
+ "tbz x27, #0, 26f\n"
+ "ld1 { v1.h }[2], [x26]\n"
+ "b 26f\n"
+ "25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x26, #0x0]\n"
+ "26:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q20, [x12, #0x0]\n"
+ "ldr q18, [x12, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v17.2d\n"
+ ".inst 0x6e54ee68 // bfmmla v8.4s, v19.8h, v20.8h\n"
+ "ldr q17, [x11, #0x0]\n"
+ ".inst 0x6e52ee6c // bfmmla v12.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x11, #0x10]\n"
+ ".inst 0x6e51ee69 // bfmmla v9.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x0]\n"
+ ".inst 0x6e52ee6d // bfmmla v13.4s, v19.8h, v18.8h\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e46ee6e // bfmmla v14.4s, v19.8h, v6.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "27:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 17b\n"
+ "uzp1 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v11.2d, v11.2d, v15.2d\n"
+ "tbz %x[flags], #1, 28f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
+ "28:" // Height 1: No activation
+ "cmp x14, #0x10\n"
+ "bge 37f\n"
+ "tbz x14, #3, 32f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "tbz x14, #2, 30f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 29f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x14, #0, 36f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "b 36f\n"
+ "29:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 36f\n"
+ "str s11, [x13, #0x0]\n"
+ "b 36f\n"
+ "30:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 31f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x14, #0, 36f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "b 36f\n"
+ "31:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 36f\n"
+ "str s10, [x13, #0x0]\n"
+ "b 36f\n"
+ "32:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 34f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 33f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x14, #0, 36f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "b 36f\n"
+ "33:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 36f\n"
+ "str s9, [x13, #0x0]\n"
+ "b 36f\n"
+ "34:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 35f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x14, #0, 36f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "b 36f\n"
+ "35:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "36:" // Height 1: Partial direct writeback: Done
+ "b 38f\n"
+ "37:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "38:" // Height 1: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 2b\n"
+ "b 230f\n"
+ "39:" // Height 2
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "40:" // Height 2: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
+ "bgt 41f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 41f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 41f\n"
+ "mov x11, x12\n"
+ "41:" // Height 2: B setup done
+ "cbz x15, 42f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x15, x15, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "b 54f\n"
+ "42:" // Height 2: no bias
+ "tbz %x[flags], #0, 53f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x10\n"
+ "add x25, x13, x20, LSL #2\n"
+ "bge 51f\n"
+ "tbz x14, #3, 46f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "tbz x14, #2, 44f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "tbz x14, #1, 43f\n"
+ "ldr d16, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 50f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "b 50f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 50f\n"
+ "ldr s16, [x13, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "b 50f\n"
+ "44:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x14, #1, 45f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 50f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "b 50f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 50f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "b 50f\n"
+ "46:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x14, #2, 48f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "tbz x14, #1, 47f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 50f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "b 50f\n"
+ "47:" // Height 2: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 50f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "b 50f\n"
+ "48:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x14, #1, 49f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 50f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "b 50f\n"
+ "49:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "50:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 52f\n"
+ "51:" // Height 2: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q16, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "52:" // Height 2: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 54f\n"
+ "53:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "54:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "55:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 56f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 57f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "b 57f\n"
+ "56:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "57:" // Height 2: input setup done
+ "cmp x27, #0x8\n"
+ "blt 60f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q7, [x12, #0x0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ "blt 59f\n"
+ "58:" // Height 2: Multiply loop: Main loop head
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x12, #0x20]\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x12, #0x30]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x11, #0x20]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x11, #0x30]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x9, #0x20]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x9, #0x30]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x12, #0x0]\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ "add x11, x11, #0x40\n"
+ "add x10, x10, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "bge 58b\n"
+ "59:" // Height 2: Multiply loop: Single iteration only
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x12, #0x20]\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x12, #0x30]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x11, #0x20]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x11, #0x30]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x9, #0x20]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x9, #0x30]\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x12, x12, #0x40\n"
+ "add x11, x11, #0x40\n"
+ "add x10, x10, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "60:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 65f\n"
+ "cmp x27, #0x4\n"
+ "blt 62f\n"
+ "61:" // Height 2: Multiply loop: Odd block loop
+ "ldr d18, [x26], #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "trn1 v19.2d, v18.2d, v17.2d\n"
+ "sub x27, x27, #0x4\n"
+ "ldr q18, [x12, #0x0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x6e5aee69 // bfmmla v9.4s, v19.8h, v26.8h\n"
+ ".inst 0x6e46ee6d // bfmmla v13.4s, v19.8h, v6.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ "ldr q17, [x9, #0x10]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "bge 61b\n"
+ "62:" // Height 2: Multiply loop: Skip odd blocks
+ "cbz x27, 65f\n"
+ "tbz x27, #1, 63f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "tbz x27, #0, 64f\n"
+ "ld1 { v1.h }[2], [x26]\n"
+ "ld1 { v2.h }[2], [x25]\n"
+ "b 64f\n"
+ "63:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x26, #0x0]\n"
+ "ldr h2, [x25, #0x0]\n"
+ "64:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q18, [x12, #0x0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q3, [x10, #0x0]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q27, [x10, #0x10]\n"
+ ".inst 0x6e43ee6a // bfmmla v10.4s, v19.8h, v3.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e5bee6e // bfmmla v14.4s, v19.8h, v27.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "65:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 55b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "add x25, x13, x20, LSL #2\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "tbz %x[flags], #1, 66f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v18.4s\n"
+ "fmin v12.4s, v12.4s, v18.4s\n"
+ "fmin v13.4s, v13.4s, v18.4s\n"
+ "fmin v14.4s, v14.4s, v18.4s\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v7.4s, v7.4s, v17.4s\n"
+ "fmax v12.4s, v12.4s, v17.4s\n"
+ "fmax v13.4s, v13.4s, v17.4s\n"
+ "fmax v14.4s, v14.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
+ "66:" // Height 2: No activation
+ "cmp x14, #0x10\n"
+ "bge 75f\n"
+ "tbz x14, #3, 70f\n"
+ "st1 { v7.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "tbz x14, #2, 68f\n"
+ "st1 { v13.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "tbz x14, #1, 67f\n"
+ "str d14, [x13], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "tbz x14, #0, 74f\n"
+ "st1 { v14.s }[2], [x13]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "b 74f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 74f\n"
+ "str s14, [x13, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "b 74f\n"
+ "68:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 69f\n"
+ "str d13, [x13], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "tbz x14, #0, 74f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "b 74f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 74f\n"
+ "str s13, [x13, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "b 74f\n"
+ "70:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 72f\n"
+ "st1 { v7.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "tbz x14, #1, 71f\n"
+ "str d12, [x13], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "tbz x14, #0, 74f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "b 74f\n"
+ "71:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 74f\n"
+ "str s12, [x13, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "b 74f\n"
+ "72:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 73f\n"
+ "str d7, [x13], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "tbz x14, #0, 74f\n"
+ "st1 { v7.s }[2], [x13]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "b 74f\n"
+ "73:" // Height 2: Partial direct writeback: partial_1_0
+ "str s7, [x13, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "74:" // Height 2: Partial direct writeback: Done
+ "b 76f\n"
+ "75:" // Height 2: Full writeback
+ "str q7, [x13, #0x0]\n"
+ "str q12, [x13, #0x10]\n"
+ "str q13, [x13, #0x20]\n"
+ "str q14, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "76:" // Height 2: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 40b\n"
+ "b 230f\n"
+ "77:" // Height 3
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "78:" // Height 3: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
+ "bgt 79f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 79f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 79f\n"
+ "mov x11, x12\n"
+ "79:" // Height 3: B setup done
+ "cbz x15, 80f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x15, x15, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "b 92f\n"
+ "80:" // Height 3: no bias
+ "tbz %x[flags], #0, 91f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x24, x25, x20, LSL #2\n"
+ "bge 89f\n"
+ "tbz x14, #3, 84f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "tbz x14, #2, 82f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "tbz x14, #1, 81f\n"
+ "ldr d16, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x24], #0x8\n"
+ "tbz x14, #0, 88f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "b 88f\n"
+ "81:" // Height 3: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 88f\n"
+ "ldr s16, [x13, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "b 88f\n"
+ "82:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x14, #1, 83f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x24], #0x8\n"
+ "tbz x14, #0, 88f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "b 88f\n"
+ "83:" // Height 3: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 88f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "b 88f\n"
+ "84:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x14, #2, 86f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "tbz x14, #1, 85f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x24], #0x8\n"
+ "tbz x14, #0, 88f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "b 88f\n"
+ "85:" // Height 3: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 88f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "b 88f\n"
+ "86:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x14, #1, 87f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "tbz x14, #0, 88f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "b 88f\n"
+ "87:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x24, #0x0]\n"
+ "88:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 90f\n"
+ "89:" // Height 3: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q16, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "90:" // Height 3: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 92f\n"
+ "91:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "92:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "93:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 94f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 95f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "b 95f\n"
+ "94:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "95:" // Height 3: input setup done
+ "cmp x27, #0x8\n"
+ "blt 98f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q7, [x12, #0x0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ "blt 97f\n"
+ "96:" // Height 3: Multiply loop: Main loop head
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x12, #0x20]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x12, #0x30]\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x11, #0x20]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x11, #0x30]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x9, #0x20]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
+ "ldr q7, [x12, #0x0]\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ "bge 96b\n"
+ "97:" // Height 3: Multiply loop: Single iteration only
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x12, #0x20]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x12, #0x30]\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x11, #0x20]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x11, #0x30]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x9, #0x20]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
+ "98:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 103f\n"
+ "cmp x27, #0x4\n"
+ "blt 100f\n"
+ "99:" // Height 3: Multiply loop: Odd block loop
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr q26, [x12, #0x0]\n"
+ "trn1 v27.2d, v25.2d, v27.2d\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "bge 99b\n"
+ "100:" // Height 3: Multiply loop: Skip odd blocks
+ "cbz x27, 103f\n"
+ "tbz x27, #1, 101f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "tbz x27, #0, 102f\n"
+ "ld1 { v1.h }[2], [x26]\n"
+ "ld1 { v2.h }[2], [x25]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "b 102f\n"
+ "101:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x26, #0x0]\n"
+ "ldr h2, [x25, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "102:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q29, [x12, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v25.2d\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e5def8c // bfmmla v12.4s, v28.8h, v29.8h\n"
+ ".inst 0x6e5def74 // bfmmla v20.4s, v27.8h, v29.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "103:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 93b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "tbz %x[flags], #1, 104f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v7.4s, v7.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
+ "104:" // Height 3: No activation
+ "cmp x14, #0x10\n"
+ "bge 113f\n"
+ "tbz x14, #3, 108f\n"
+ "st1 { v7.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "tbz x14, #2, 106f\n"
+ "st1 { v13.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "tbz x14, #1, 105f\n"
+ "str d14, [x13], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "tbz x14, #0, 112f\n"
+ "st1 { v14.s }[2], [x13]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "b 112f\n"
+ "105:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 112f\n"
+ "str s14, [x13, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "b 112f\n"
+ "106:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 107f\n"
+ "str d13, [x13], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "tbz x14, #0, 112f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "b 112f\n"
+ "107:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 112f\n"
+ "str s13, [x13, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "b 112f\n"
+ "108:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 110f\n"
+ "st1 { v7.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "tbz x14, #1, 109f\n"
+ "str d12, [x13], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "tbz x14, #0, 112f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "b 112f\n"
+ "109:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 112f\n"
+ "str s12, [x13, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "b 112f\n"
+ "110:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 111f\n"
+ "str d7, [x13], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "tbz x14, #0, 112f\n"
+ "st1 { v7.s }[2], [x13]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "b 112f\n"
+ "111:" // Height 3: Partial direct writeback: partial_1_0
+ "str s7, [x13, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "112:" // Height 3: Partial direct writeback: Done
+ "b 114f\n"
+ "113:" // Height 3: Full writeback
+ "str q7, [x13, #0x0]\n"
+ "str q12, [x13, #0x10]\n"
+ "str q13, [x13, #0x20]\n"
+ "str q14, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "114:" // Height 3: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 78b\n"
+ "b 230f\n"
+ "115:" // Height 4
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "116:" // Height 4: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
+ "bgt 117f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 117f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 117f\n"
+ "mov x11, x12\n"
+ "117:" // Height 4: B setup done
+ "cbz x15, 118f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x15, x15, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "b 130f\n"
+ "118:" // Height 4: no bias
+ "tbz %x[flags], #0, 129f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
+ "bge 127f\n"
+ "tbz x14, #3, 122f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 120f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 119f\n"
+ "ldr d16, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "tbz x14, #0, 126f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "b 126f\n"
+ "119:" // Height 4: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 126f\n"
+ "ldr s16, [x13, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "b 126f\n"
+ "120:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x14, #1, 121f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "tbz x14, #0, 126f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "b 126f\n"
+ "121:" // Height 4: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 126f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "b 126f\n"
+ "122:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x14, #2, 124f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 123f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "tbz x14, #0, 126f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "b 126f\n"
+ "123:" // Height 4: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 126f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "b 126f\n"
+ "124:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x14, #1, 125f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "tbz x14, #0, 126f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "b 126f\n"
+ "125:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "126:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 128f\n"
+ "127:" // Height 4: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q16, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "128:" // Height 4: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 130f\n"
+ "129:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "130:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "131:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 132f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 133f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "b 133f\n"
+ "132:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "133:" // Height 4: input setup done
+ "cmp x27, #0x8\n"
+ "blt 136f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q7, [x12, #0x0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ "blt 135f\n"
+ "134:" // Height 4: Multiply loop: Main loop head
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
+ "sub x27, x27, #0x8\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x12, #0x20]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ "add x23, x23, #0x10\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x12, #0x30]\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x11, #0x20]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x11, #0x30]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x9, #0x20]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
+ "ldr q7, [x12, #0x0]\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ "bge 134b\n"
+ "135:" // Height 4: Multiply loop: Single iteration only
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
+ "sub x27, x27, #0x8\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x12, #0x20]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x12, #0x30]\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x11, #0x20]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x11, #0x30]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x9, #0x20]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
+ "136:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 141f\n"
+ "cmp x27, #0x4\n"
+ "blt 138f\n"
+ "137:" // Height 4: Multiply loop: Odd block loop
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "sub x27, x27, #0x4\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "trn1 v27.2d, v26.2d, v25.2d\n"
+ "cmp x27, #0x4\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "bge 137b\n"
+ "138:" // Height 4: Multiply loop: Skip odd blocks
+ "cbz x27, 141f\n"
+ "tbz x27, #1, 139f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "tbz x27, #0, 140f\n"
+ "ld1 { v1.h }[2], [x26]\n"
+ "ld1 { v2.h }[2], [x25]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "ld1 { v4.h }[2], [x23]\n"
+ "b 140f\n"
+ "139:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x26, #0x0]\n"
+ "ldr h2, [x25, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "ldr h4, [x23, #0x0]\n"
+ "140:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "141:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 131b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "tbz %x[flags], #1, 142f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v7.4s, v7.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v15.4s, v15.4s, v25.4s\n"
+ "fmax v20.4s, v20.4s, v25.4s\n"
+ "fmax v21.4s, v21.4s, v25.4s\n"
+ "fmax v22.4s, v22.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
+ "142:" // Height 4: No activation
+ "cmp x14, #0x10\n"
+ "bge 151f\n"
+ "tbz x14, #3, 146f\n"
+ "st1 { v7.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 144f\n"
+ "st1 { v13.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 143f\n"
+ "str d14, [x13], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x14, #0, 150f\n"
+ "st1 { v14.s }[2], [x13]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "b 150f\n"
+ "143:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 150f\n"
+ "str s14, [x13, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "b 150f\n"
+ "144:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 145f\n"
+ "str d13, [x13], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x14, #0, 150f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "b 150f\n"
+ "145:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 150f\n"
+ "str s13, [x13, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "b 150f\n"
+ "146:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 148f\n"
+ "st1 { v7.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 147f\n"
+ "str d12, [x13], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x14, #0, 150f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "b 150f\n"
+ "147:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 150f\n"
+ "str s12, [x13, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "b 150f\n"
+ "148:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 149f\n"
+ "str d7, [x13], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x14, #0, 150f\n"
+ "st1 { v7.s }[2], [x13]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "b 150f\n"
+ "149:" // Height 4: Partial direct writeback: partial_1_0
+ "str s7, [x13, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "150:" // Height 4: Partial direct writeback: Done
+ "b 152f\n"
+ "151:" // Height 4: Full writeback
+ "str q7, [x13, #0x0]\n"
+ "str q12, [x13, #0x10]\n"
+ "str q13, [x13, #0x20]\n"
+ "str q14, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "152:" // Height 4: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 116b\n"
+ "b 230f\n"
+ "153:" // Height 5
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "154:" // Height 5: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
+ "bgt 155f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 155f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 155f\n"
+ "mov x11, x12\n"
+ "155:" // Height 5: B setup done
+ "cbz x15, 156f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x15, x15, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v28.16b, v12.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v29.16b, v13.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v30.16b, v14.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v15.16b\n"
+ "b 168f\n"
+ "156:" // Height 5: no bias
+ "tbz %x[flags], #0, 167f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
+ "bge 165f\n"
+ "tbz x14, #3, 160f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 158f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v27.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 157f\n"
+ "ldr d16, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d6, [x22], #0x8\n"
+ "tbz x14, #0, 164f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v6.s }[2], [x22]\n"
+ "b 164f\n"
+ "157:" // Height 5: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 164f\n"
+ "ldr s16, [x13, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s6, [x22, #0x0]\n"
+ "b 164f\n"
+ "158:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x14, #1, 159f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "tbz x14, #0, 164f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "b 164f\n"
+ "159:" // Height 5: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 164f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "b 164f\n"
+ "160:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x14, #2, 162f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 161f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "tbz x14, #0, 164f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "b 164f\n"
+ "161:" // Height 5: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 164f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "b 164f\n"
+ "162:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x14, #1, 163f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "tbz x14, #0, 164f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "b 164f\n"
+ "163:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "164:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 166f\n"
+ "165:" // Height 5: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q16, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q25, [x22, #0x0]\n"
+ "ldr q26, [x22, #0x10]\n"
+ "ldr q27, [x22, #0x20]\n"
+ "ldr q6, [x22, #0x30]\n"
+ "166:" // Height 5: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 168f\n"
+ "167:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "168:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "169:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 170f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 171f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "b 171f\n"
+ "170:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "171:" // Height 5: input setup done
+ "cmp x27, #0x8\n"
+ "blt 174f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q7, [x12, #0x0]\n"
+ "blt 173f\n"
+ "172:" // Height 5: Multiply loop: Main loop head
+ "trn1 v6.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "sub x27, x27, #0x8\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x12, #0x10]\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x11, #0x0]\n"
+ ".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x11, #0x10]\n"
+ ".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x9, #0x0]\n"
+ ".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x12, #0x20]\n"
+ ".inst 0x6e40eccf // bfmmla v15.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x12, #0x30]\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
+ "ldr q6, [x11, #0x20]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x11, #0x30]\n"
+ ".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e40ec2d // bfmmla v13.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x6e40ec2e // bfmmla v14.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbe // bfmmla v30.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x6e46ec2b // bfmmla v11.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbb // bfmmla v27.4s, v5.8h, v6.8h\n"
+ "ldr q7, [x12, #0x0]\n"
+ ".inst 0x6e40ec2f // bfmmla v15.4s, v1.8h, v0.8h\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
+ "ldr q3, [x24, #0x0]\n"
+ ".inst 0x6e40ecbf // bfmmla v31.4s, v5.8h, v0.8h\n"
+ "ldr q5, [x22, #0x0]\n"
+ "bge 172b\n"
+ "173:" // Height 5: Multiply loop: Single iteration only
+ "trn1 v6.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "sub x27, x27, #0x8\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x12, #0x10]\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x11, #0x0]\n"
+ ".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x11, #0x10]\n"
+ ".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x9, #0x0]\n"
+ ".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x12, #0x20]\n"
+ ".inst 0x6e40eccf // bfmmla v15.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
+ "ldr q2, [x12, #0x30]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
+ "ldr q0, [x11, #0x20]\n"
+ ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x11, #0x30]\n"
+ ".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x6e42ec2d // bfmmla v13.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x10, #0x30]\n"
+ ".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x9, #0x20]\n"
+ ".inst 0x6e42ec2e // bfmmla v14.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec76 // bfmmla v22.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbe // bfmmla v30.4s, v5.8h, v2.8h\n"
+ "ldr q6, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x6e40ec2b // bfmmla v11.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbb // bfmmla v27.4s, v5.8h, v0.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
+ "174:" // Height 5: Multiply loop: Main loop skip
+ "cbz x27, 179f\n"
+ "cmp x27, #0x4\n"
+ "blt 176f\n"
+ "175:" // Height 5: Multiply loop: Odd block loop
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
+ "sub x27, x27, #0x4\n"
+ "ldr d0, [x22], #0x8\n"
+ "ldr q1, [x12, #0x0]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
+ ".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n"
+ "ldr q0, [x12, #0x10]\n"
+ ".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x11, #0x0]\n"
+ ".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ "cmp x27, #0x4\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x11, #0x10]\n"
+ ".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e41ec71 // bfmmla v17.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec59 // bfmmla v25.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x0]\n"
+ ".inst 0x6e40ec8d // bfmmla v13.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5d // bfmmla v29.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e41ec8a // bfmmla v10.4s, v4.8h, v1.8h\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e41ec72 // bfmmla v18.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5a // bfmmla v26.4s, v2.8h, v1.8h\n"
+ "ldr q6, [x9, #0x0]\n"
+ ".inst 0x6e40ec8e // bfmmla v14.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5f // bfmmla v31.4s, v2.8h, v0.8h\n"
+ "bge 175b\n"
+ "176:" // Height 5: Multiply loop: Skip odd blocks
+ "cbz x27, 179f\n"
+ "tbz x27, #1, 177f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x22], #0x4\n"
+ "tbz x27, #0, 178f\n"
+ "ld1 { v1.h }[2], [x26]\n"
+ "ld1 { v2.h }[2], [x25]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "ld1 { v4.h }[2], [x23]\n"
+ "ld1 { v5.h }[2], [x22]\n"
+ "b 178f\n"
+ "177:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x26, #0x0]\n"
+ "ldr h2, [x25, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "ldr h4, [x23, #0x0]\n"
+ "ldr h5, [x22, #0x0]\n"
+ "178:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x12, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ "trn1 v2.2d, v5.2d, v0.2d\n"
+ "ldr q1, [x12, #0x10]\n"
+ ".inst 0x6e46ece8 // bfmmla v8.4s, v7.8h, v6.8h\n"
+ ".inst 0x6e46ec70 // bfmmla v16.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n"
+ "ldr q0, [x11, #0x0]\n"
+ ".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e40ece9 // bfmmla v9.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec59 // bfmmla v25.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x0]\n"
+ ".inst 0x6e41eced // bfmmla v13.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec75 // bfmmla v21.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e40ecea // bfmmla v10.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5a // bfmmla v26.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x9, #0x0]\n"
+ ".inst 0x6e41ecee // bfmmla v14.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5f // bfmmla v31.4s, v2.8h, v6.8h\n"
+ "179:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 169b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "tbz %x[flags], #1, 180f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmax v7.4s, v7.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "180:" // Height 5: No activation
+ "cmp x14, #0x10\n"
+ "bge 189f\n"
+ "tbz x14, #3, 184f\n"
+ "st1 { v7.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 182f\n"
+ "st1 { v13.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 181f\n"
+ "str d14, [x13], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "tbz x14, #0, 188f\n"
+ "st1 { v14.s }[2], [x13]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "b 188f\n"
+ "181:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 188f\n"
+ "str s14, [x13, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "b 188f\n"
+ "182:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 183f\n"
+ "str d13, [x13], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "tbz x14, #0, 188f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "b 188f\n"
+ "183:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 188f\n"
+ "str s13, [x13, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "b 188f\n"
+ "184:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 186f\n"
+ "st1 { v7.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 185f\n"
+ "str d12, [x13], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "tbz x14, #0, 188f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "b 188f\n"
+ "185:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 188f\n"
+ "str s12, [x13, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "b 188f\n"
+ "186:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 187f\n"
+ "str d7, [x13], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x14, #0, 188f\n"
+ "st1 { v7.s }[2], [x13]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "b 188f\n"
+ "187:" // Height 5: Partial direct writeback: partial_1_0
+ "str s7, [x13, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "188:" // Height 5: Partial direct writeback: Done
+ "b 190f\n"
+ "189:" // Height 5: Full writeback
+ "str q7, [x13, #0x0]\n"
+ "str q12, [x13, #0x10]\n"
+ "str q13, [x13, #0x20]\n"
+ "str q14, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "190:" // Height 5: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 154b\n"
+ "b 230f\n"
+ "191:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x18\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
+ "192:" // Height 6: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
+ "bgt 193f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 193f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 193f\n"
+ "mov x11, x12\n"
+ "193:" // Height 6: B setup done
+ "cbz x15, 194f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x15, x15, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v28.16b, v12.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v29.16b, v13.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v30.16b, v14.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v15.16b\n"
+ "b 206f\n"
+ "194:" // Height 6: no bias
+ "tbz %x[flags], #0, 205f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
+ "bge 203f\n"
+ "tbz x14, #3, 198f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x14, #2, 196f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 195f\n"
+ "ldr d16, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d6, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v6.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 202f\n"
+ "195:" // Height 6: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 202f\n"
+ "ldr s16, [x13, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s6, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 202f\n"
+ "196:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x14, #1, 197f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 202f\n"
+ "197:" // Height 6: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 202f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 202f\n"
+ "198:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x14, #2, 200f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 199f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 202f\n"
+ "199:" // Height 6: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 202f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 202f\n"
+ "200:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x14, #1, 201f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 202f\n"
+ "201:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "202:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 204f\n"
+ "203:" // Height 6: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q16, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q25, [x22, #0x0]\n"
+ "ldr q26, [x22, #0x10]\n"
+ "ldr q27, [x22, #0x20]\n"
+ "ldr q6, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "204:" // Height 6: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 206f\n"
+ "205:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "206:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "207:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 208f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 209f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "add x21, x21, x20, LSL #1\n"
+ "b 209f\n"
+ "208:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
+ "209:" // Height 6: input setup done
+ "cmp x27, #0x8\n"
+ "blt 212f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "ldr q7, [x12, #0x0]\n"
+ "blt 211f\n"
+ "210:" // Height 6: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "sub x27, x27, #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "cmp x27, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x12, #0x10]\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x11, #0x0]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x9, #0x0]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x12, #0x20]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ "ldr q0, [x12, #0x30]\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
+ "ldr q6, [x11, #0x20]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x11, #0x30]\n"
+ ".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e40ec2d // bfmmla v13.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x6e40ec2e // bfmmla v14.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbe // bfmmla v30.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x6e46ec2b // bfmmla v11.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbb // bfmmla v27.4s, v5.8h, v6.8h\n"
+ "ldr q7, [x12, #0x0]\n"
+ ".inst 0x6e40ec2f // bfmmla v15.4s, v1.8h, v0.8h\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
+ "ldr q3, [x24, #0x0]\n"
+ ".inst 0x6e40ecbf // bfmmla v31.4s, v5.8h, v0.8h\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "bge 210b\n"
+ "211:" // Height 6: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "sub x27, x27, #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x12, #0x10]\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x11, #0x0]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x9, #0x0]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x12, #0x20]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ "ldr q2, [x12, #0x30]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
+ "ldr q0, [x11, #0x20]\n"
+ ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x11, #0x30]\n"
+ ".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x6e42ec2d // bfmmla v13.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x10, #0x30]\n"
+ ".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x9, #0x20]\n"
+ ".inst 0x6e42ec2e // bfmmla v14.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec76 // bfmmla v22.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbe // bfmmla v30.4s, v5.8h, v2.8h\n"
+ "ldr q6, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x6e40ec2b // bfmmla v11.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbb // bfmmla v27.4s, v5.8h, v0.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
+ "212:" // Height 6: Multiply loop: Main loop skip
+ "cbz x27, 217f\n"
+ "cmp x27, #0x4\n"
+ "blt 214f\n"
+ "213:" // Height 6: Multiply loop: Odd block loop
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "sub x27, x27, #0x4\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
+ "cmp x27, #0x4\n"
+ "ldr d1, [x22], #0x8\n"
+ "ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q0, [x12, #0x10]\n"
+ ".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x11, #0x0]\n"
+ ".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec71 // bfmmla v17.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec59 // bfmmla v25.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x0]\n"
+ ".inst 0x6e40ec8d // bfmmla v13.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5d // bfmmla v29.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e41ec8a // bfmmla v10.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec72 // bfmmla v18.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5a // bfmmla v26.4s, v2.8h, v1.8h\n"
+ "ldr q6, [x9, #0x0]\n"
+ ".inst 0x6e40ec8e // bfmmla v14.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5f // bfmmla v31.4s, v2.8h, v0.8h\n"
+ "bge 213b\n"
+ "214:" // Height 6: Multiply loop: Skip odd blocks
+ "cbz x27, 217f\n"
+ "tbz x27, #1, 215f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x22], #0x4\n"
+ "ldr s6, [x21], #0x4\n"
+ "tbz x27, #0, 216f\n"
+ "ld1 { v1.h }[2], [x26]\n"
+ "ld1 { v2.h }[2], [x25]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "ld1 { v4.h }[2], [x23]\n"
+ "ld1 { v5.h }[2], [x22]\n"
+ "ld1 { v6.h }[2], [x21]\n"
+ "b 216f\n"
+ "215:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x26, #0x0]\n"
+ "ldr h2, [x25, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "ldr h4, [x23, #0x0]\n"
+ "ldr h5, [x22, #0x0]\n"
+ "ldr h6, [x21, #0x0]\n"
+ "216:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q0, [x12, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e40ece8 // bfmmla v8.4s, v7.8h, v0.8h\n"
+ "trn1 v2.2d, v5.2d, v6.2d\n"
+ "ldr q1, [x12, #0x10]\n"
+ ".inst 0x6e40ec70 // bfmmla v16.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec58 // bfmmla v24.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x11, #0x0]\n"
+ ".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x11, #0x10]\n"
+ ".inst 0x6e40ece9 // bfmmla v9.4s, v7.8h, v0.8h\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec59 // bfmmla v25.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x0]\n"
+ ".inst 0x6e41eced // bfmmla v13.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec75 // bfmmla v21.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x6e40ecea // bfmmla v10.4s, v7.8h, v0.8h\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5a // bfmmla v26.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x9, #0x0]\n"
+ ".inst 0x6e41ecee // bfmmla v14.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5f // bfmmla v31.4s, v2.8h, v6.8h\n"
+ "217:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 207b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "tbz %x[flags], #1, 218f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmax v7.4s, v7.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v0.4s\n"
+ "fmax v29.4s, v29.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "218:" // Height 6: No activation
+ "cmp x14, #0x10\n"
+ "bge 227f\n"
+ "tbz x14, #3, 222f\n"
+ "st1 { v7.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "tbz x14, #2, 220f\n"
+ "st1 { v13.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 219f\n"
+ "str d14, [x13], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x14, #0, 226f\n"
+ "st1 { v14.s }[2], [x13]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "b 226f\n"
+ "219:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 226f\n"
+ "str s14, [x13, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "b 226f\n"
+ "220:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 221f\n"
+ "str d13, [x13], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d29, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x14, #0, 226f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "b 226f\n"
+ "221:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 226f\n"
+ "str s13, [x13, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "b 226f\n"
+ "222:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 224f\n"
+ "st1 { v7.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 223f\n"
+ "str d12, [x13], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d28, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x14, #0, 226f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "b 226f\n"
+ "223:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 226f\n"
+ "str s12, [x13, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s28, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "b 226f\n"
+ "224:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 225f\n"
+ "str d7, [x13], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x14, #0, 226f\n"
+ "st1 { v7.s }[2], [x13]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "b 226f\n"
+ "225:" // Height 6: Partial direct writeback: partial_1_0
+ "str s7, [x13, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "226:" // Height 6: Partial direct writeback: Done
+ "b 228f\n"
+ "227:" // Height 6: Full writeback
+ "str q7, [x13, #0x0]\n"
+ "str q12, [x13, #0x10]\n"
+ "str q13, [x13, #0x20]\n"
+ "str q14, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q28, [x22, #0x10]\n"
+ "str q29, [x22, #0x20]\n"
+ "str q30, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "228:" // Height 6: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 192b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 230f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 229f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "229:" // Update direct input
+ "mov x20, #0xc\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "230:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp
new file mode 100644
index 0000000000..4924b3a549
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<__fp16>, \
+ size_t, size_t, \
+ const __fp16 *, \
+ size_t, \
+ IndirectOutputArg<__fp16>, \
+ const __fp16 *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_ffhybrid_fp16_mla_6x32( ARGLIST );
+
+class cls_a64_ffhybrid_fp16_mla_6x32
+{
+public:
+ typedef __fp16 lhs_operand_type;
+ typedef __fp16 rhs_operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters: a 6-row by 32-column output tile, handled in 8-wide stripes */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+ static unsigned int stripe_width()
+ {
+ return 8;
+ }
+
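+ // Weights are expected in the VL128_BL16 interleaved layout (the name suggests a fixed
+ // 128-bit vector length with 16-bit blocking).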
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL128_BL16;
+ }
+
+ static unsigned int out_width()
+ {
+ return 32;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 32, 1> transforms = {};
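+ // Throughput estimate consumed by the kernel selection heuristics; a single default
+ // value covers all CPU models here.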
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, __fp16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 29.14 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_ffhybrid_fp16_mla_6x32;
+ cls_a64_ffhybrid_fp16_mla_6x32(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp
new file mode 100644
index 0000000000..8038612200
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp
@@ -0,0 +1,5429 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_ffhybrid_fp16_mla_6x32 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg,
+ size_t M, size_t N, const __fp16 *B_ptr, size_t B_stride, IndirectOutputArg<__fp16> output_arg,
+ const __fp16 *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
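+ // Parameter block handed to the assembly via args_ptr; fields are located with offsetof below.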
+ __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const __fp16 *B_ptr = {};
+ const __fp16 *cur_B_ptr = {};
+ size_t B_stride = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
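+ // Flag bits tested by the assembly: bit 0 = accumulate into existing output,
+ // bit 1 = apply min/max activation clamp, bit 2 = indirect output, bit 3 = indirect input.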
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ ka.B_stride = B_stride;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
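+ // The assembly dispatches on M to specialised blocks handling 1-6 rows at a time;
+ // each block walks N in strips of up to 32 fp16 output columns.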
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 251f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 201f\n"
+ "beq 151f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 101f\n"
+ "beq 51f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x18\n"
+ "bgt 3f\n"
+ "cmp x14, #0x10\n"
+ "mov x9, x12\n"
+ "bgt 3f\n"
+ "cmp x14, #0x8\n"
+ "mov x10, x12\n"
+ "bgt 3f\n"
+ "mov x11, x12\n"
+ "3:" // Height 1: B setup done
+ "cbz x15, 4f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "b 23f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 22f\n"
+ "cmp x14, #0x20\n"
+ "bge 21f\n"
+ "tbz x14, #4, 12f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "tbz x14, #3, 8f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "tbz x14, #2, 6f\n"
+ "ldr d11, [x13], #0x8\n"
+ "tbz x14, #1, 5f\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "mov x20, #0x3c\n"
+ "tbz x14, #0, 20f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "b 20f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_28
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 20f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "b 20f\n"
+ "6:" // Height 1: Partial accumulate: partial_2_24
+ "tbz x14, #1, 7f\n"
+ "ldr s11, [x13], #0x4\n"
+ "mov x20, #0x34\n"
+ "tbz x14, #0, 20f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "b 20f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_24
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 20f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "b 20f\n"
+ "8:" // Height 1: Partial accumulate: partial_4_16
+ "tbz x14, #2, 10f\n"
+ "ldr d10, [x13], #0x8\n"
+ "tbz x14, #1, 9f\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "mov x20, #0x2c\n"
+ "tbz x14, #0, 20f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "b 20f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_20
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 20f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "b 20f\n"
+ "10:" // Height 1: Partial accumulate: partial_2_16
+ "tbz x14, #1, 11f\n"
+ "ldr s10, [x13], #0x4\n"
+ "mov x20, #0x24\n"
+ "tbz x14, #0, 20f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "b 20f\n"
+ "11:" // Height 1: Partial accumulate: partial_1_16
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 20f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "b 20f\n"
+ "12:" // Height 1: Partial accumulate: partial_8_0
+ "tbz x14, #3, 16f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "tbz x14, #2, 14f\n"
+ "ldr d9, [x13], #0x8\n"
+ "tbz x14, #1, 13f\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "mov x20, #0x1c\n"
+ "tbz x14, #0, 20f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "b 20f\n"
+ "13:" // Height 1: Partial accumulate: partial_1_12
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 20f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "b 20f\n"
+ "14:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x14, #1, 15f\n"
+ "ldr s9, [x13], #0x4\n"
+ "mov x20, #0x14\n"
+ "tbz x14, #0, 20f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "b 20f\n"
+ "15:" // Height 1: Partial accumulate: partial_1_8
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 20f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "b 20f\n"
+ "16:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x14, #2, 18f\n"
+ "ldr d8, [x13], #0x8\n"
+ "tbz x14, #1, 17f\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "mov x20, #0xc\n"
+ "tbz x14, #0, 20f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "b 20f\n"
+ "17:" // Height 1: Partial accumulate: partial_1_4
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 20f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "b 20f\n"
+ "18:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x14, #1, 19f\n"
+ "ldr s8, [x13], #0x4\n"
+ "mov x20, #0x4\n"
+ "tbz x14, #0, 20f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "b 20f\n"
+ "19:" // Height 1: Partial accumulate: partial_1_0
+ "ldr h8, [x13, #0x0]\n"
+ "mov x20, #0x0\n"
+ "20:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 23f\n"
+ "21:" // Height 1: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "b 23f\n"
+ "22:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "23:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "24:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 25f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 26f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "b 26f\n"
+ "25:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "26:" // Height 1: input setup done
+ "cmp x27, #0x8\n"
+ "blt 29f\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q7, [x11, #0x0]\n"
+ "blt 28f\n"
+ "27:" // Height 1: Multiply loop: Main loop head
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x12, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x11, #0x40]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x9, #0x40]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x12, #0x50]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x11, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x10, #0x50]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x9, #0x50]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x12, #0x60]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x11, #0x60]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x9, #0x60]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x12, #0x70]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x11, #0x70]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr q17, [x10, #0x70]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr q16, [x9, #0x70]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x10\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "add x26, x26, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "add x12, x12, #0x80\n"
+ "ldr q6, [x12, #0x0]\n"
+ "add x11, x11, #0x80\n"
+ "ldr q7, [x11, #0x0]\n"
+ "add x10, x10, #0x80\n"
+ "add x9, x9, #0x80\n"
+ "bge 27b\n"
+ "28:" // Height 1: Multiply loop: Single iteration only
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x12, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x11, #0x40]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x9, #0x40]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x12, #0x50]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x11, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x10, #0x50]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x9, #0x50]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x12, #0x60]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x11, #0x60]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x9, #0x60]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x12, #0x70]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x11, #0x70]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr q17, [x10, #0x70]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr q16, [x9, #0x70]\n"
+ "sub x27, x27, #0x8\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "add x26, x26, #0x10\n"
+ "add x12, x12, #0x80\n"
+ "add x11, x11, #0x80\n"
+ "add x10, x10, #0x80\n"
+ "add x9, x9, #0x80\n"
+ "29:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 31f\n"
+ "30:" // Height 1: Multiply loop: Odd block loop
+ "ldr h0, [x26], #0x2\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v8.8h, v16.8h, v0.h[0]\n"
+ "sub x27, x27, #0x1\n"
+ "ldr q17, [x11, #0x0]\n"
+ "ldr q16, [x10, #0x0]\n"
+ "fmla v9.8h, v17.8h, v0.h[0]\n"
+ "fmla v10.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "cbnz x27, 30b\n"
+ "31:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 24b\n"
+ "tbz %x[flags], #1, 32f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v17.8h\n"
+ "fmin v9.8h, v9.8h, v17.8h\n"
+ "fmin v10.8h, v10.8h, v17.8h\n"
+ "fmin v11.8h, v11.8h, v17.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
+ "fmax v10.8h, v10.8h, v16.8h\n"
+ "fmax v11.8h, v11.8h, v16.8h\n"
+ "32:" // Height 1: No activation
+ "cmp x14, #0x20\n"
+ "bge 49f\n"
+ "tbz x14, #4, 40f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "tbz x14, #3, 36f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "tbz x14, #2, 34f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x14, #1, 33f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "tbz x14, #0, 48f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "b 48f\n"
+ "33:" // Height 1: Partial direct writeback: partial_1_28
+ "tbz x14, #0, 48f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "b 48f\n"
+ "34:" // Height 1: Partial direct writeback: partial_2_24
+ "tbz x14, #1, 35f\n"
+ "str s11, [x13], #0x4\n"
+ "tbz x14, #0, 48f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "b 48f\n"
+ "35:" // Height 1: Partial direct writeback: partial_1_24
+ "tbz x14, #0, 48f\n"
+ "str h11, [x13, #0x0]\n"
+ "b 48f\n"
+ "36:" // Height 1: Partial direct writeback: partial_4_16
+ "tbz x14, #2, 38f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x14, #1, 37f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "tbz x14, #0, 48f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "b 48f\n"
+ "37:" // Height 1: Partial direct writeback: partial_1_20
+ "tbz x14, #0, 48f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "b 48f\n"
+ "38:" // Height 1: Partial direct writeback: partial_2_16
+ "tbz x14, #1, 39f\n"
+ "str s10, [x13], #0x4\n"
+ "tbz x14, #0, 48f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "b 48f\n"
+ "39:" // Height 1: Partial direct writeback: partial_1_16
+ "tbz x14, #0, 48f\n"
+ "str h10, [x13, #0x0]\n"
+ "b 48f\n"
+ "40:" // Height 1: Partial direct writeback: partial_8_0
+ "tbz x14, #3, 44f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "tbz x14, #2, 42f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x14, #1, 41f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "tbz x14, #0, 48f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "b 48f\n"
+ "41:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 48f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "b 48f\n"
+ "42:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 43f\n"
+ "str s9, [x13], #0x4\n"
+ "tbz x14, #0, 48f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "b 48f\n"
+ "43:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 48f\n"
+ "str h9, [x13, #0x0]\n"
+ "b 48f\n"
+ "44:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 46f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x14, #1, 45f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "tbz x14, #0, 48f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "b 48f\n"
+ "45:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 48f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "b 48f\n"
+ "46:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 47f\n"
+ "str s8, [x13], #0x4\n"
+ "tbz x14, #0, 48f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "b 48f\n"
+ "47:" // Height 1: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "48:" // Height 1: Partial direct writeback: Done
+ "b 50f\n"
+ "49:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "50:" // Height 1: Writeback done
+ "subs x14, x14, #0x20\n"
+ "bgt 2b\n"
+ "b 302f\n"
+ "51:" // Height 2
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "52:" // Height 2: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x18\n"
+ "bgt 53f\n"
+ "cmp x14, #0x10\n"
+ "mov x9, x12\n"
+ "bgt 53f\n"
+ "cmp x14, #0x8\n"
+ "mov x10, x12\n"
+ "bgt 53f\n"
+ "mov x11, x12\n"
+ "53:" // Height 2: B setup done
+ "cbz x15, 54f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "mov v12.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "add x15, x15, #0x40\n"
+ "b 73f\n"
+ "54:" // Height 2: no bias
+ "tbz %x[flags], #0, 72f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x20\n"
+ "add x25, x13, x20, LSL #1\n"
+ "bge 71f\n"
+ "tbz x14, #4, 62f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "tbz x14, #3, 58f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "tbz x14, #2, 56f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "tbz x14, #1, 55f\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "mov x20, #0x3c\n"
+ "tbz x14, #0, 70f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "b 70f\n"
+ "55:" // Height 2: Partial accumulate: partial_1_28
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 70f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "b 70f\n"
+ "56:" // Height 2: Partial accumulate: partial_2_24
+ "tbz x14, #1, 57f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x20, #0x34\n"
+ "tbz x14, #0, 70f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "b 70f\n"
+ "57:" // Height 2: Partial accumulate: partial_1_24
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 70f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "b 70f\n"
+ "58:" // Height 2: Partial accumulate: partial_4_16
+ "tbz x14, #2, 60f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "tbz x14, #1, 59f\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "mov x20, #0x2c\n"
+ "tbz x14, #0, 70f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "b 70f\n"
+ "59:" // Height 2: Partial accumulate: partial_1_20
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 70f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "b 70f\n"
+ "60:" // Height 2: Partial accumulate: partial_2_16
+ "tbz x14, #1, 61f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x20, #0x24\n"
+ "tbz x14, #0, 70f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "b 70f\n"
+ "61:" // Height 2: Partial accumulate: partial_1_16
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 70f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "b 70f\n"
+ "62:" // Height 2: Partial accumulate: partial_8_0
+ "tbz x14, #3, 66f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "tbz x14, #2, 64f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "tbz x14, #1, 63f\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "mov x20, #0x1c\n"
+ "tbz x14, #0, 70f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "b 70f\n"
+ "63:" // Height 2: Partial accumulate: partial_1_12
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 70f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "b 70f\n"
+ "64:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x14, #1, 65f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x20, #0x14\n"
+ "tbz x14, #0, 70f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "b 70f\n"
+ "65:" // Height 2: Partial accumulate: partial_1_8
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 70f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "b 70f\n"
+ "66:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x14, #2, 68f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "tbz x14, #1, 67f\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "mov x20, #0xc\n"
+ "tbz x14, #0, 70f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "b 70f\n"
+ "67:" // Height 2: Partial accumulate: partial_1_4
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 70f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "b 70f\n"
+ "68:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x14, #1, 69f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x20, #0x4\n"
+ "tbz x14, #0, 70f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "b 70f\n"
+ "69:" // Height 2: Partial accumulate: partial_1_0
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "70:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 73f\n"
+ "71:" // Height 2: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "b 73f\n"
+ "72:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "73:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "74:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 75f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 76f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "b 76f\n"
+ "75:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "76:" // Height 2: input setup done
+ "cmp x27, #0x8\n"
+ "blt 79f\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "blt 78f\n"
+ "77:" // Height 2: Multiply loop: Main loop head
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "sub x27, x27, #0x8\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "fmla v14.8h, v17.8h, v1.h[0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "fmla v15.8h, v16.8h, v1.h[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "fmla v12.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "fmla v13.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "fmla v14.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "fmla v15.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "fmla v12.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "fmla v13.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "fmla v14.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "fmla v15.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "fmla v12.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "fmla v13.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "fmla v14.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x12, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "fmla v15.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x11, #0x40]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "fmla v12.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "fmla v13.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x9, #0x40]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "fmla v14.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x12, #0x50]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "fmla v15.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x11, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "fmla v12.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x10, #0x50]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "fmla v13.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x9, #0x50]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "fmla v14.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x12, #0x60]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "fmla v15.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x11, #0x60]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "fmla v12.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "fmla v13.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x9, #0x60]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "fmla v14.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x12, #0x70]\n"
+ "add x12, x12, #0x80\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "fmla v15.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x11, #0x70]\n"
+ "add x11, x11, #0x80\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "fmla v12.8h, v17.8h, v1.h[7]\n"
+ "ldr q17, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "fmla v13.8h, v16.8h, v1.h[7]\n"
+ "ldr q16, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v14.8h, v17.8h, v1.h[7]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.8h, v16.8h, v1.h[7]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "bge 77b\n"
+ "78:" // Height 2: Multiply loop: Single iteration only
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "sub x27, x27, #0x8\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "fmla v14.8h, v17.8h, v1.h[0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "fmla v15.8h, v16.8h, v1.h[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "fmla v12.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "fmla v13.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "fmla v14.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "fmla v15.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "fmla v12.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "fmla v13.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "fmla v14.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "fmla v15.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "fmla v12.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "fmla v13.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "fmla v14.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x12, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "fmla v15.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x11, #0x40]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "fmla v12.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "fmla v13.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x9, #0x40]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "fmla v14.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x12, #0x50]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "fmla v15.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x11, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "fmla v12.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x10, #0x50]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "fmla v13.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x9, #0x50]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "fmla v14.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x12, #0x60]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "fmla v15.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x11, #0x60]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "fmla v12.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "fmla v13.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x9, #0x60]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "fmla v14.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x12, #0x70]\n"
+ "add x12, x12, #0x80\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "fmla v15.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x11, #0x70]\n"
+ "add x11, x11, #0x80\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "fmla v12.8h, v17.8h, v1.h[7]\n"
+ "ldr q17, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "fmla v13.8h, v16.8h, v1.h[7]\n"
+ "ldr q16, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v14.8h, v17.8h, v1.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "fmla v15.8h, v16.8h, v1.h[7]\n"
+ "79:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 81f\n"
+ "80:" // Height 2: Multiply loop: Odd block loop
+ "ldr h1, [x26], #0x2\n"
+ "ldr h0, [x25], #0x2\n"
+ "sub x27, x27, #0x1\n"
+ "ldr q17, [x12, #0x0]\n"
+ "ldr q16, [x11, #0x0]\n"
+ "fmla v8.8h, v17.8h, v1.h[0]\n"
+ "fmla v12.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "fmla v9.8h, v16.8h, v1.h[0]\n"
+ "fmla v13.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v10.8h, v17.8h, v1.h[0]\n"
+ "fmla v14.8h, v17.8h, v0.h[0]\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "fmla v11.8h, v16.8h, v1.h[0]\n"
+ "fmla v15.8h, v16.8h, v0.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "cbnz x27, 80b\n"
+ "81:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 74b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "tbz %x[flags], #1, 82f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v17.8h\n"
+ "fmin v9.8h, v9.8h, v17.8h\n"
+ "fmin v10.8h, v10.8h, v17.8h\n"
+ "fmin v11.8h, v11.8h, v17.8h\n"
+ "fmin v12.8h, v12.8h, v17.8h\n"
+ "fmin v13.8h, v13.8h, v17.8h\n"
+ "fmin v14.8h, v14.8h, v17.8h\n"
+ "fmin v15.8h, v15.8h, v17.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
+ "fmax v10.8h, v10.8h, v16.8h\n"
+ "fmax v11.8h, v11.8h, v16.8h\n"
+ "fmax v12.8h, v12.8h, v16.8h\n"
+ "fmax v13.8h, v13.8h, v16.8h\n"
+ "fmax v14.8h, v14.8h, v16.8h\n"
+ "fmax v15.8h, v15.8h, v16.8h\n"
+ "82:" // Height 2: No activation
+ "cmp x14, #0x20\n"
+ "bge 99f\n"
+ "tbz x14, #4, 90f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "tbz x14, #3, 86f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "tbz x14, #2, 84f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "tbz x14, #1, 83f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "tbz x14, #0, 98f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "b 98f\n"
+ "83:" // Height 2: Partial direct writeback: partial_1_28
+ "tbz x14, #0, 98f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "b 98f\n"
+ "84:" // Height 2: Partial direct writeback: partial_2_24
+ "tbz x14, #1, 85f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "tbz x14, #0, 98f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "b 98f\n"
+ "85:" // Height 2: Partial direct writeback: partial_1_24
+ "tbz x14, #0, 98f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "b 98f\n"
+ "86:" // Height 2: Partial direct writeback: partial_4_16
+ "tbz x14, #2, 88f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "tbz x14, #1, 87f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "tbz x14, #0, 98f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "b 98f\n"
+ "87:" // Height 2: Partial direct writeback: partial_1_20
+ "tbz x14, #0, 98f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "b 98f\n"
+ "88:" // Height 2: Partial direct writeback: partial_2_16
+ "tbz x14, #1, 89f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "tbz x14, #0, 98f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "b 98f\n"
+ "89:" // Height 2: Partial direct writeback: partial_1_16
+ "tbz x14, #0, 98f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "b 98f\n"
+ "90:" // Height 2: Partial direct writeback: partial_8_0
+ "tbz x14, #3, 94f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "tbz x14, #2, 92f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "tbz x14, #1, 91f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "tbz x14, #0, 98f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "b 98f\n"
+ "91:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 98f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "b 98f\n"
+ "92:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 93f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "tbz x14, #0, 98f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "b 98f\n"
+ "93:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 98f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "b 98f\n"
+ "94:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 96f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "tbz x14, #1, 95f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "tbz x14, #0, 98f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "b 98f\n"
+ "95:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 98f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "b 98f\n"
+ "96:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 97f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "tbz x14, #0, 98f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "b 98f\n"
+ "97:" // Height 2: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "98:" // Height 2: Partial direct writeback: Done
+ "b 100f\n"
+ "99:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "100:" // Height 2: Writeback done
+ "subs x14, x14, #0x20\n"
+ "bgt 52b\n"
+ "b 302f\n"
+ "101:" // Height 3
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "102:" // Height 3: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x18\n"
+ "bgt 103f\n"
+ "cmp x14, #0x10\n"
+ "mov x9, x12\n"
+ "bgt 103f\n"
+ "cmp x14, #0x8\n"
+ "mov x10, x12\n"
+ "bgt 103f\n"
+ "mov x11, x12\n"
+ "103:" // Height 3: B setup done
+ "cbz x15, 104f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "mov v12.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "b 123f\n"
+ "104:" // Height 3: no bias
+ "tbz %x[flags], #0, 122f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "cmp x14, #0x20\n"
+ "add x24, x25, x20, LSL #1\n"
+ "bge 121f\n"
+ "tbz x14, #4, 112f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "tbz x14, #3, 108f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "tbz x14, #2, 106f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "tbz x14, #1, 105f\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "mov x20, #0x3c\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "b 120f\n"
+ "105:" // Height 3: Partial accumulate: partial_1_28
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "b 120f\n"
+ "106:" // Height 3: Partial accumulate: partial_2_24
+ "tbz x14, #1, 107f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x20, #0x34\n"
+ "ldr s19, [x24], #0x4\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "b 120f\n"
+ "107:" // Height 3: Partial accumulate: partial_1_24
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 120f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "b 120f\n"
+ "108:" // Height 3: Partial accumulate: partial_4_16
+ "tbz x14, #2, 110f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "tbz x14, #1, 109f\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "mov x20, #0x2c\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "b 120f\n"
+ "109:" // Height 3: Partial accumulate: partial_1_20
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "b 120f\n"
+ "110:" // Height 3: Partial accumulate: partial_2_16
+ "tbz x14, #1, 111f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x20, #0x24\n"
+ "ldr s18, [x24], #0x4\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "b 120f\n"
+ "111:" // Height 3: Partial accumulate: partial_1_16
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 120f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "b 120f\n"
+ "112:" // Height 3: Partial accumulate: partial_8_0
+ "tbz x14, #3, 116f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "tbz x14, #2, 114f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "tbz x14, #1, 113f\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "mov x20, #0x1c\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "b 120f\n"
+ "113:" // Height 3: Partial accumulate: partial_1_12
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "b 120f\n"
+ "114:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x14, #1, 115f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x20, #0x14\n"
+ "ldr s17, [x24], #0x4\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "b 120f\n"
+ "115:" // Height 3: Partial accumulate: partial_1_8
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 120f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "b 120f\n"
+ "116:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x14, #2, 118f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "tbz x14, #1, 117f\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "mov x20, #0xc\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "b 120f\n"
+ "117:" // Height 3: Partial accumulate: partial_1_4
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "b 120f\n"
+ "118:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x14, #1, 119f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x20, #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "b 120f\n"
+ "119:" // Height 3: Partial accumulate: partial_1_0
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr h16, [x24, #0x0]\n"
+ "120:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 123f\n"
+ "121:" // Height 3: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "b 123f\n"
+ "122:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "123:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "124:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 125f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 126f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "b 126f\n"
+ "125:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "126:" // Height 3: input setup done
+ "cmp x27, #0x8\n"
+ "blt 129f\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "blt 128f\n"
+ "127:" // Height 3: Multiply loop: Main loop head
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x10\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "ldr q20, [x9, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v10.8h, v21.8h, v0.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v18.8h, v21.8h, v2.h[0]\n"
+ "ldr q21, [x12, #0x10]\n"
+ "fmla v11.8h, v20.8h, v0.h[0]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v2.h[0]\n"
+ "ldr q20, [x11, #0x10]\n"
+ "fmla v8.8h, v21.8h, v0.h[1]\n"
+ "fmla v12.8h, v21.8h, v1.h[1]\n"
+ "fmla v16.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x10]\n"
+ "fmla v9.8h, v20.8h, v0.h[1]\n"
+ "fmla v13.8h, v20.8h, v1.h[1]\n"
+ "fmla v17.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x9, #0x10]\n"
+ "fmla v10.8h, v21.8h, v0.h[1]\n"
+ "fmla v14.8h, v21.8h, v1.h[1]\n"
+ "fmla v18.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x12, #0x20]\n"
+ "fmla v11.8h, v20.8h, v0.h[1]\n"
+ "fmla v15.8h, v20.8h, v1.h[1]\n"
+ "fmla v19.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x11, #0x20]\n"
+ "fmla v8.8h, v21.8h, v0.h[2]\n"
+ "fmla v12.8h, v21.8h, v1.h[2]\n"
+ "fmla v16.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "fmla v9.8h, v20.8h, v0.h[2]\n"
+ "fmla v13.8h, v20.8h, v1.h[2]\n"
+ "fmla v17.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x9, #0x20]\n"
+ "fmla v10.8h, v21.8h, v0.h[2]\n"
+ "fmla v14.8h, v21.8h, v1.h[2]\n"
+ "fmla v18.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x12, #0x30]\n"
+ "fmla v11.8h, v20.8h, v0.h[2]\n"
+ "fmla v15.8h, v20.8h, v1.h[2]\n"
+ "fmla v19.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x11, #0x30]\n"
+ "fmla v8.8h, v21.8h, v0.h[3]\n"
+ "fmla v12.8h, v21.8h, v1.h[3]\n"
+ "fmla v16.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0x30]\n"
+ "fmla v9.8h, v20.8h, v0.h[3]\n"
+ "fmla v13.8h, v20.8h, v1.h[3]\n"
+ "fmla v17.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x9, #0x30]\n"
+ "fmla v10.8h, v21.8h, v0.h[3]\n"
+ "fmla v14.8h, v21.8h, v1.h[3]\n"
+ "fmla v18.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x12, #0x40]\n"
+ "fmla v11.8h, v20.8h, v0.h[3]\n"
+ "fmla v15.8h, v20.8h, v1.h[3]\n"
+ "fmla v19.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x11, #0x40]\n"
+ "fmla v8.8h, v21.8h, v0.h[4]\n"
+ "fmla v12.8h, v21.8h, v1.h[4]\n"
+ "fmla v16.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x10, #0x40]\n"
+ "fmla v9.8h, v20.8h, v0.h[4]\n"
+ "fmla v13.8h, v20.8h, v1.h[4]\n"
+ "fmla v17.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x9, #0x40]\n"
+ "fmla v10.8h, v21.8h, v0.h[4]\n"
+ "fmla v14.8h, v21.8h, v1.h[4]\n"
+ "fmla v18.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x12, #0x50]\n"
+ "fmla v11.8h, v20.8h, v0.h[4]\n"
+ "fmla v15.8h, v20.8h, v1.h[4]\n"
+ "fmla v19.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x11, #0x50]\n"
+ "fmla v8.8h, v21.8h, v0.h[5]\n"
+ "fmla v12.8h, v21.8h, v1.h[5]\n"
+ "fmla v16.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x10, #0x50]\n"
+ "fmla v9.8h, v20.8h, v0.h[5]\n"
+ "fmla v13.8h, v20.8h, v1.h[5]\n"
+ "fmla v17.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x9, #0x50]\n"
+ "fmla v10.8h, v21.8h, v0.h[5]\n"
+ "fmla v14.8h, v21.8h, v1.h[5]\n"
+ "fmla v18.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x12, #0x60]\n"
+ "fmla v11.8h, v20.8h, v0.h[5]\n"
+ "fmla v15.8h, v20.8h, v1.h[5]\n"
+ "fmla v19.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x11, #0x60]\n"
+ "fmla v8.8h, v21.8h, v0.h[6]\n"
+ "fmla v12.8h, v21.8h, v1.h[6]\n"
+ "fmla v16.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x10, #0x60]\n"
+ "fmla v9.8h, v20.8h, v0.h[6]\n"
+ "fmla v13.8h, v20.8h, v1.h[6]\n"
+ "fmla v17.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x9, #0x60]\n"
+ "fmla v10.8h, v21.8h, v0.h[6]\n"
+ "fmla v14.8h, v21.8h, v1.h[6]\n"
+ "fmla v18.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x12, #0x70]\n"
+ "fmla v11.8h, v20.8h, v0.h[6]\n"
+ "add x12, x12, #0x80\n"
+ "fmla v15.8h, v20.8h, v1.h[6]\n"
+ "fmla v19.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x11, #0x70]\n"
+ "add x11, x11, #0x80\n"
+ "fmla v8.8h, v21.8h, v0.h[7]\n"
+ "fmla v12.8h, v21.8h, v1.h[7]\n"
+ "fmla v16.8h, v21.8h, v2.h[7]\n"
+ "ldr q21, [x10, #0x70]\n"
+ "fmla v9.8h, v20.8h, v0.h[7]\n"
+ "add x10, x10, #0x80\n"
+ "fmla v13.8h, v20.8h, v1.h[7]\n"
+ "fmla v17.8h, v20.8h, v2.h[7]\n"
+ "ldr q20, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ "fmla v10.8h, v21.8h, v0.h[7]\n"
+ "fmla v14.8h, v21.8h, v1.h[7]\n"
+ "fmla v18.8h, v21.8h, v2.h[7]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "fmla v11.8h, v20.8h, v0.h[7]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.8h, v20.8h, v1.h[7]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.8h, v20.8h, v2.h[7]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "bge 127b\n"
+ "128:" // Height 3: Multiply loop: Single iteration only
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "ldr q20, [x9, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v10.8h, v21.8h, v0.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "fmla v18.8h, v21.8h, v2.h[0]\n"
+ "ldr q21, [x12, #0x10]\n"
+ "fmla v11.8h, v20.8h, v0.h[0]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v2.h[0]\n"
+ "ldr q20, [x11, #0x10]\n"
+ "fmla v8.8h, v21.8h, v0.h[1]\n"
+ "fmla v12.8h, v21.8h, v1.h[1]\n"
+ "fmla v16.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x10]\n"
+ "fmla v9.8h, v20.8h, v0.h[1]\n"
+ "fmla v13.8h, v20.8h, v1.h[1]\n"
+ "fmla v17.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x9, #0x10]\n"
+ "fmla v10.8h, v21.8h, v0.h[1]\n"
+ "fmla v14.8h, v21.8h, v1.h[1]\n"
+ "fmla v18.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x12, #0x20]\n"
+ "fmla v11.8h, v20.8h, v0.h[1]\n"
+ "fmla v15.8h, v20.8h, v1.h[1]\n"
+ "fmla v19.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x11, #0x20]\n"
+ "fmla v8.8h, v21.8h, v0.h[2]\n"
+ "fmla v12.8h, v21.8h, v1.h[2]\n"
+ "fmla v16.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "fmla v9.8h, v20.8h, v0.h[2]\n"
+ "fmla v13.8h, v20.8h, v1.h[2]\n"
+ "fmla v17.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x9, #0x20]\n"
+ "fmla v10.8h, v21.8h, v0.h[2]\n"
+ "fmla v14.8h, v21.8h, v1.h[2]\n"
+ "fmla v18.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x12, #0x30]\n"
+ "fmla v11.8h, v20.8h, v0.h[2]\n"
+ "fmla v15.8h, v20.8h, v1.h[2]\n"
+ "fmla v19.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x11, #0x30]\n"
+ "fmla v8.8h, v21.8h, v0.h[3]\n"
+ "fmla v12.8h, v21.8h, v1.h[3]\n"
+ "fmla v16.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0x30]\n"
+ "fmla v9.8h, v20.8h, v0.h[3]\n"
+ "fmla v13.8h, v20.8h, v1.h[3]\n"
+ "fmla v17.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x9, #0x30]\n"
+ "fmla v10.8h, v21.8h, v0.h[3]\n"
+ "fmla v14.8h, v21.8h, v1.h[3]\n"
+ "fmla v18.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x12, #0x40]\n"
+ "fmla v11.8h, v20.8h, v0.h[3]\n"
+ "fmla v15.8h, v20.8h, v1.h[3]\n"
+ "fmla v19.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x11, #0x40]\n"
+ "fmla v8.8h, v21.8h, v0.h[4]\n"
+ "fmla v12.8h, v21.8h, v1.h[4]\n"
+ "fmla v16.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x10, #0x40]\n"
+ "fmla v9.8h, v20.8h, v0.h[4]\n"
+ "fmla v13.8h, v20.8h, v1.h[4]\n"
+ "fmla v17.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x9, #0x40]\n"
+ "fmla v10.8h, v21.8h, v0.h[4]\n"
+ "fmla v14.8h, v21.8h, v1.h[4]\n"
+ "fmla v18.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x12, #0x50]\n"
+ "fmla v11.8h, v20.8h, v0.h[4]\n"
+ "fmla v15.8h, v20.8h, v1.h[4]\n"
+ "fmla v19.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x11, #0x50]\n"
+ "fmla v8.8h, v21.8h, v0.h[5]\n"
+ "fmla v12.8h, v21.8h, v1.h[5]\n"
+ "fmla v16.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x10, #0x50]\n"
+ "fmla v9.8h, v20.8h, v0.h[5]\n"
+ "fmla v13.8h, v20.8h, v1.h[5]\n"
+ "fmla v17.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x9, #0x50]\n"
+ "fmla v10.8h, v21.8h, v0.h[5]\n"
+ "fmla v14.8h, v21.8h, v1.h[5]\n"
+ "fmla v18.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x12, #0x60]\n"
+ "fmla v11.8h, v20.8h, v0.h[5]\n"
+ "fmla v15.8h, v20.8h, v1.h[5]\n"
+ "fmla v19.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x11, #0x60]\n"
+ "fmla v8.8h, v21.8h, v0.h[6]\n"
+ "fmla v12.8h, v21.8h, v1.h[6]\n"
+ "fmla v16.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x10, #0x60]\n"
+ "fmla v9.8h, v20.8h, v0.h[6]\n"
+ "fmla v13.8h, v20.8h, v1.h[6]\n"
+ "fmla v17.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x9, #0x60]\n"
+ "fmla v10.8h, v21.8h, v0.h[6]\n"
+ "fmla v14.8h, v21.8h, v1.h[6]\n"
+ "fmla v18.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x12, #0x70]\n"
+ "fmla v11.8h, v20.8h, v0.h[6]\n"
+ "add x12, x12, #0x80\n"
+ "fmla v15.8h, v20.8h, v1.h[6]\n"
+ "fmla v19.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x11, #0x70]\n"
+ "add x11, x11, #0x80\n"
+ "fmla v8.8h, v21.8h, v0.h[7]\n"
+ "fmla v12.8h, v21.8h, v1.h[7]\n"
+ "fmla v16.8h, v21.8h, v2.h[7]\n"
+ "ldr q21, [x10, #0x70]\n"
+ "fmla v9.8h, v20.8h, v0.h[7]\n"
+ "add x10, x10, #0x80\n"
+ "fmla v13.8h, v20.8h, v1.h[7]\n"
+ "fmla v17.8h, v20.8h, v2.h[7]\n"
+ "ldr q20, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ "fmla v10.8h, v21.8h, v0.h[7]\n"
+ "fmla v14.8h, v21.8h, v1.h[7]\n"
+ "fmla v18.8h, v21.8h, v2.h[7]\n"
+ "fmla v11.8h, v20.8h, v0.h[7]\n"
+ "fmla v15.8h, v20.8h, v1.h[7]\n"
+ "fmla v19.8h, v20.8h, v2.h[7]\n"
+ "129:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 131f\n"
+ "130:" // Height 3: Multiply loop: Odd block loop
+ "ldr h2, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "sub x27, x27, #0x1\n"
+ "ldr h0, [x24], #0x2\n"
+ "ldr q21, [x12, #0x0]\n"
+ "fmla v8.8h, v21.8h, v2.h[0]\n"
+ "fmla v12.8h, v21.8h, v1.h[0]\n"
+ "ldr q20, [x11, #0x0]\n"
+ "fmla v16.8h, v21.8h, v0.h[0]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "fmla v9.8h, v20.8h, v2.h[0]\n"
+ "fmla v13.8h, v20.8h, v1.h[0]\n"
+ "fmla v17.8h, v20.8h, v0.h[0]\n"
+ "ldr q20, [x9, #0x0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v10.8h, v21.8h, v2.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "fmla v18.8h, v21.8h, v0.h[0]\n"
+ "fmla v11.8h, v20.8h, v2.h[0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v0.h[0]\n"
+ "cbnz x27, 130b\n"
+ "131:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 124b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "tbz %x[flags], #1, 132f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v21.8h }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v20.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v21.8h\n"
+ "fmin v9.8h, v9.8h, v21.8h\n"
+ "fmin v10.8h, v10.8h, v21.8h\n"
+ "fmin v11.8h, v11.8h, v21.8h\n"
+ "fmin v12.8h, v12.8h, v21.8h\n"
+ "fmin v13.8h, v13.8h, v21.8h\n"
+ "fmin v14.8h, v14.8h, v21.8h\n"
+ "fmin v15.8h, v15.8h, v21.8h\n"
+ "fmin v16.8h, v16.8h, v21.8h\n"
+ "fmin v17.8h, v17.8h, v21.8h\n"
+ "fmin v18.8h, v18.8h, v21.8h\n"
+ "fmin v19.8h, v19.8h, v21.8h\n"
+ "fmax v8.8h, v8.8h, v20.8h\n"
+ "fmax v9.8h, v9.8h, v20.8h\n"
+ "fmax v10.8h, v10.8h, v20.8h\n"
+ "fmax v11.8h, v11.8h, v20.8h\n"
+ "fmax v12.8h, v12.8h, v20.8h\n"
+ "fmax v13.8h, v13.8h, v20.8h\n"
+ "fmax v14.8h, v14.8h, v20.8h\n"
+ "fmax v15.8h, v15.8h, v20.8h\n"
+ "fmax v16.8h, v16.8h, v20.8h\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
+ "fmax v18.8h, v18.8h, v20.8h\n"
+ "fmax v19.8h, v19.8h, v20.8h\n"
+ "132:" // Height 3: No activation
+ "cmp x14, #0x20\n"
+ "bge 149f\n"
+ "tbz x14, #4, 140f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "tbz x14, #3, 136f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "tbz x14, #2, 134f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "tbz x14, #1, 133f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "tbz x14, #0, 148f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "b 148f\n"
+ "133:" // Height 3: Partial direct writeback: partial_1_28
+ "tbz x14, #0, 148f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "b 148f\n"
+ "134:" // Height 3: Partial direct writeback: partial_2_24
+ "tbz x14, #1, 135f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "tbz x14, #0, 148f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "b 148f\n"
+ "135:" // Height 3: Partial direct writeback: partial_1_24
+ "tbz x14, #0, 148f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "b 148f\n"
+ "136:" // Height 3: Partial direct writeback: partial_4_16
+ "tbz x14, #2, 138f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "tbz x14, #1, 137f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "tbz x14, #0, 148f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "b 148f\n"
+ "137:" // Height 3: Partial direct writeback: partial_1_20
+ "tbz x14, #0, 148f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "b 148f\n"
+ "138:" // Height 3: Partial direct writeback: partial_2_16
+ "tbz x14, #1, 139f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "tbz x14, #0, 148f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "b 148f\n"
+ "139:" // Height 3: Partial direct writeback: partial_1_16
+ "tbz x14, #0, 148f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "b 148f\n"
+ "140:" // Height 3: Partial direct writeback: partial_8_0
+ "tbz x14, #3, 144f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "tbz x14, #2, 142f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "tbz x14, #1, 141f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "tbz x14, #0, 148f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "b 148f\n"
+ "141:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 148f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "b 148f\n"
+ "142:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 143f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "tbz x14, #0, 148f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "b 148f\n"
+ "143:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 148f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "b 148f\n"
+ "144:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 146f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "tbz x14, #1, 145f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "tbz x14, #0, 148f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "b 148f\n"
+ "145:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 148f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "b 148f\n"
+ "146:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 147f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "tbz x14, #0, 148f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "b 148f\n"
+ "147:" // Height 3: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "148:" // Height 3: Partial direct writeback: Done
+ "b 150f\n"
+ "149:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "150:" // Height 3: Writeback done
+ "subs x14, x14, #0x20\n"
+ "bgt 102b\n"
+ "b 302f\n"
+ "151:" // Height 4
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "152:" // Height 4: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x18\n"
+ "bgt 153f\n"
+ "cmp x14, #0x10\n"
+ "mov x9, x12\n"
+ "bgt 153f\n"
+ "cmp x14, #0x8\n"
+ "mov x10, x12\n"
+ "bgt 153f\n"
+ "mov x11, x12\n"
+ "153:" // Height 4: B setup done
+ "cbz x15, 154f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "mov v12.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "b 173f\n"
+ "154:" // Height 4: no bias
+ "tbz %x[flags], #0, 172f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "cmp x14, #0x20\n"
+ "add x23, x24, x20, LSL #1\n"
+ "bge 171f\n"
+ "tbz x14, #4, 162f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "tbz x14, #3, 158f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "tbz x14, #2, 156f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "tbz x14, #1, 155f\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "mov x20, #0x3c\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "tbz x14, #0, 170f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "b 170f\n"
+ "155:" // Height 4: Partial accumulate: partial_1_28
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 170f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "b 170f\n"
+ "156:" // Height 4: Partial accumulate: partial_2_24
+ "tbz x14, #1, 157f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x20, #0x34\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "tbz x14, #0, 170f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "b 170f\n"
+ "157:" // Height 4: Partial accumulate: partial_1_24
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 170f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "b 170f\n"
+ "158:" // Height 4: Partial accumulate: partial_4_16
+ "tbz x14, #2, 160f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "tbz x14, #1, 159f\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "mov x20, #0x2c\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "tbz x14, #0, 170f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "b 170f\n"
+ "159:" // Height 4: Partial accumulate: partial_1_20
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 170f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "b 170f\n"
+ "160:" // Height 4: Partial accumulate: partial_2_16
+ "tbz x14, #1, 161f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x20, #0x24\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "tbz x14, #0, 170f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "b 170f\n"
+ "161:" // Height 4: Partial accumulate: partial_1_16
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 170f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "b 170f\n"
+ "162:" // Height 4: Partial accumulate: partial_8_0
+ "tbz x14, #3, 166f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "tbz x14, #2, 164f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "tbz x14, #1, 163f\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "mov x20, #0x1c\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "tbz x14, #0, 170f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "b 170f\n"
+ "163:" // Height 4: Partial accumulate: partial_1_12
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 170f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "b 170f\n"
+ "164:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x14, #1, 165f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x20, #0x14\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "tbz x14, #0, 170f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "b 170f\n"
+ "165:" // Height 4: Partial accumulate: partial_1_8
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 170f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "b 170f\n"
+ "166:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x14, #2, 168f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "tbz x14, #1, 167f\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "mov x20, #0xc\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "tbz x14, #0, 170f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "b 170f\n"
+ "167:" // Height 4: Partial accumulate: partial_1_4
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 170f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "b 170f\n"
+ "168:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x14, #1, 169f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x20, #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "tbz x14, #0, 170f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "b 170f\n"
+ "169:" // Height 4: Partial accumulate: partial_1_0
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "170:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 173f\n"
+ "171:" // Height 4: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "b 173f\n"
+ "172:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "173:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "174:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 175f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 176f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "b 176f\n"
+ "175:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "176:" // Height 4: input setup done
+ "cmp x27, #0x8\n"
+ "blt 179f\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "blt 178f\n"
+ "177:" // Height 4: Multiply loop: Main loop head
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x10\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "ldr q24, [x9, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v10.8h, v25.8h, v0.h[0]\n"
+ "fmla v14.8h, v25.8h, v1.h[0]\n"
+ "fmla v18.8h, v25.8h, v2.h[0]\n"
+ "fmla v22.8h, v25.8h, v3.h[0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ "fmla v11.8h, v24.8h, v0.h[0]\n"
+ "fmla v15.8h, v24.8h, v1.h[0]\n"
+ "fmla v19.8h, v24.8h, v2.h[0]\n"
+ "fmla v23.8h, v24.8h, v3.h[0]\n"
+ "ldr q24, [x11, #0x10]\n"
+ "fmla v8.8h, v25.8h, v0.h[1]\n"
+ "fmla v12.8h, v25.8h, v1.h[1]\n"
+ "fmla v16.8h, v25.8h, v2.h[1]\n"
+ "fmla v20.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "fmla v9.8h, v24.8h, v0.h[1]\n"
+ "fmla v13.8h, v24.8h, v1.h[1]\n"
+ "fmla v17.8h, v24.8h, v2.h[1]\n"
+ "fmla v21.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x9, #0x10]\n"
+ "fmla v10.8h, v25.8h, v0.h[1]\n"
+ "fmla v14.8h, v25.8h, v1.h[1]\n"
+ "fmla v18.8h, v25.8h, v2.h[1]\n"
+ "fmla v22.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x12, #0x20]\n"
+ "fmla v11.8h, v24.8h, v0.h[1]\n"
+ "fmla v15.8h, v24.8h, v1.h[1]\n"
+ "fmla v19.8h, v24.8h, v2.h[1]\n"
+ "fmla v23.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x11, #0x20]\n"
+ "fmla v8.8h, v25.8h, v0.h[2]\n"
+ "fmla v12.8h, v25.8h, v1.h[2]\n"
+ "fmla v16.8h, v25.8h, v2.h[2]\n"
+ "fmla v20.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0x20]\n"
+ "fmla v9.8h, v24.8h, v0.h[2]\n"
+ "fmla v13.8h, v24.8h, v1.h[2]\n"
+ "fmla v17.8h, v24.8h, v2.h[2]\n"
+ "fmla v21.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x9, #0x20]\n"
+ "fmla v10.8h, v25.8h, v0.h[2]\n"
+ "fmla v14.8h, v25.8h, v1.h[2]\n"
+ "fmla v18.8h, v25.8h, v2.h[2]\n"
+ "fmla v22.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x12, #0x30]\n"
+ "fmla v11.8h, v24.8h, v0.h[2]\n"
+ "fmla v15.8h, v24.8h, v1.h[2]\n"
+ "fmla v19.8h, v24.8h, v2.h[2]\n"
+ "fmla v23.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x11, #0x30]\n"
+ "fmla v8.8h, v25.8h, v0.h[3]\n"
+ "fmla v12.8h, v25.8h, v1.h[3]\n"
+ "fmla v16.8h, v25.8h, v2.h[3]\n"
+ "fmla v20.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0x30]\n"
+ "fmla v9.8h, v24.8h, v0.h[3]\n"
+ "fmla v13.8h, v24.8h, v1.h[3]\n"
+ "fmla v17.8h, v24.8h, v2.h[3]\n"
+ "fmla v21.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x9, #0x30]\n"
+ "fmla v10.8h, v25.8h, v0.h[3]\n"
+ "fmla v14.8h, v25.8h, v1.h[3]\n"
+ "fmla v18.8h, v25.8h, v2.h[3]\n"
+ "fmla v22.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x12, #0x40]\n"
+ "fmla v11.8h, v24.8h, v0.h[3]\n"
+ "fmla v15.8h, v24.8h, v1.h[3]\n"
+ "fmla v19.8h, v24.8h, v2.h[3]\n"
+ "fmla v23.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x11, #0x40]\n"
+ "fmla v8.8h, v25.8h, v0.h[4]\n"
+ "fmla v12.8h, v25.8h, v1.h[4]\n"
+ "fmla v16.8h, v25.8h, v2.h[4]\n"
+ "fmla v20.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "fmla v9.8h, v24.8h, v0.h[4]\n"
+ "fmla v13.8h, v24.8h, v1.h[4]\n"
+ "fmla v17.8h, v24.8h, v2.h[4]\n"
+ "fmla v21.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x9, #0x40]\n"
+ "fmla v10.8h, v25.8h, v0.h[4]\n"
+ "fmla v14.8h, v25.8h, v1.h[4]\n"
+ "fmla v18.8h, v25.8h, v2.h[4]\n"
+ "fmla v22.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x12, #0x50]\n"
+ "fmla v11.8h, v24.8h, v0.h[4]\n"
+ "fmla v15.8h, v24.8h, v1.h[4]\n"
+ "fmla v19.8h, v24.8h, v2.h[4]\n"
+ "fmla v23.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x11, #0x50]\n"
+ "fmla v8.8h, v25.8h, v0.h[5]\n"
+ "fmla v12.8h, v25.8h, v1.h[5]\n"
+ "fmla v16.8h, v25.8h, v2.h[5]\n"
+ "fmla v20.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x10, #0x50]\n"
+ "fmla v9.8h, v24.8h, v0.h[5]\n"
+ "fmla v13.8h, v24.8h, v1.h[5]\n"
+ "fmla v17.8h, v24.8h, v2.h[5]\n"
+ "fmla v21.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x9, #0x50]\n"
+ "fmla v10.8h, v25.8h, v0.h[5]\n"
+ "fmla v14.8h, v25.8h, v1.h[5]\n"
+ "fmla v18.8h, v25.8h, v2.h[5]\n"
+ "fmla v22.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x12, #0x60]\n"
+ "fmla v11.8h, v24.8h, v0.h[5]\n"
+ "fmla v15.8h, v24.8h, v1.h[5]\n"
+ "fmla v19.8h, v24.8h, v2.h[5]\n"
+ "fmla v23.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x11, #0x60]\n"
+ "fmla v8.8h, v25.8h, v0.h[6]\n"
+ "fmla v12.8h, v25.8h, v1.h[6]\n"
+ "fmla v16.8h, v25.8h, v2.h[6]\n"
+ "fmla v20.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x10, #0x60]\n"
+ "fmla v9.8h, v24.8h, v0.h[6]\n"
+ "fmla v13.8h, v24.8h, v1.h[6]\n"
+ "fmla v17.8h, v24.8h, v2.h[6]\n"
+ "fmla v21.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x9, #0x60]\n"
+ "fmla v10.8h, v25.8h, v0.h[6]\n"
+ "fmla v14.8h, v25.8h, v1.h[6]\n"
+ "fmla v18.8h, v25.8h, v2.h[6]\n"
+ "fmla v22.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x12, #0x70]\n"
+ "add x12, x12, #0x80\n"
+ "fmla v11.8h, v24.8h, v0.h[6]\n"
+ "fmla v15.8h, v24.8h, v1.h[6]\n"
+ "fmla v19.8h, v24.8h, v2.h[6]\n"
+ "fmla v23.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x11, #0x70]\n"
+ "add x11, x11, #0x80\n"
+ "fmla v8.8h, v25.8h, v0.h[7]\n"
+ "fmla v12.8h, v25.8h, v1.h[7]\n"
+ "fmla v16.8h, v25.8h, v2.h[7]\n"
+ "fmla v20.8h, v25.8h, v3.h[7]\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ "fmla v9.8h, v24.8h, v0.h[7]\n"
+ "fmla v13.8h, v24.8h, v1.h[7]\n"
+ "fmla v17.8h, v24.8h, v2.h[7]\n"
+ "fmla v21.8h, v24.8h, v3.h[7]\n"
+ "ldr q24, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ "fmla v10.8h, v25.8h, v0.h[7]\n"
+ "fmla v14.8h, v25.8h, v1.h[7]\n"
+ "fmla v18.8h, v25.8h, v2.h[7]\n"
+ "fmla v22.8h, v25.8h, v3.h[7]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "fmla v11.8h, v24.8h, v0.h[7]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.8h, v24.8h, v1.h[7]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.8h, v24.8h, v2.h[7]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "fmla v23.8h, v24.8h, v3.h[7]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "bge 177b\n"
+ "178:" // Height 4: Multiply loop: Single iteration only
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "ldr q24, [x9, #0x0]\n"
+ "fmla v10.8h, v25.8h, v0.h[0]\n"
+ "fmla v14.8h, v25.8h, v1.h[0]\n"
+ "fmla v18.8h, v25.8h, v2.h[0]\n"
+ "fmla v22.8h, v25.8h, v3.h[0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ "fmla v11.8h, v24.8h, v0.h[0]\n"
+ "fmla v15.8h, v24.8h, v1.h[0]\n"
+ "fmla v19.8h, v24.8h, v2.h[0]\n"
+ "fmla v23.8h, v24.8h, v3.h[0]\n"
+ "ldr q24, [x11, #0x10]\n"
+ "fmla v8.8h, v25.8h, v0.h[1]\n"
+ "fmla v12.8h, v25.8h, v1.h[1]\n"
+ "fmla v16.8h, v25.8h, v2.h[1]\n"
+ "fmla v20.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "fmla v9.8h, v24.8h, v0.h[1]\n"
+ "fmla v13.8h, v24.8h, v1.h[1]\n"
+ "fmla v17.8h, v24.8h, v2.h[1]\n"
+ "fmla v21.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x9, #0x10]\n"
+ "fmla v10.8h, v25.8h, v0.h[1]\n"
+ "fmla v14.8h, v25.8h, v1.h[1]\n"
+ "fmla v18.8h, v25.8h, v2.h[1]\n"
+ "fmla v22.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x12, #0x20]\n"
+ "fmla v11.8h, v24.8h, v0.h[1]\n"
+ "fmla v15.8h, v24.8h, v1.h[1]\n"
+ "fmla v19.8h, v24.8h, v2.h[1]\n"
+ "fmla v23.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x11, #0x20]\n"
+ "fmla v8.8h, v25.8h, v0.h[2]\n"
+ "fmla v12.8h, v25.8h, v1.h[2]\n"
+ "fmla v16.8h, v25.8h, v2.h[2]\n"
+ "fmla v20.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0x20]\n"
+ "fmla v9.8h, v24.8h, v0.h[2]\n"
+ "fmla v13.8h, v24.8h, v1.h[2]\n"
+ "fmla v17.8h, v24.8h, v2.h[2]\n"
+ "fmla v21.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x9, #0x20]\n"
+ "fmla v10.8h, v25.8h, v0.h[2]\n"
+ "fmla v14.8h, v25.8h, v1.h[2]\n"
+ "fmla v18.8h, v25.8h, v2.h[2]\n"
+ "fmla v22.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x12, #0x30]\n"
+ "fmla v11.8h, v24.8h, v0.h[2]\n"
+ "fmla v15.8h, v24.8h, v1.h[2]\n"
+ "fmla v19.8h, v24.8h, v2.h[2]\n"
+ "fmla v23.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x11, #0x30]\n"
+ "fmla v8.8h, v25.8h, v0.h[3]\n"
+ "fmla v12.8h, v25.8h, v1.h[3]\n"
+ "fmla v16.8h, v25.8h, v2.h[3]\n"
+ "fmla v20.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0x30]\n"
+ "fmla v9.8h, v24.8h, v0.h[3]\n"
+ "fmla v13.8h, v24.8h, v1.h[3]\n"
+ "fmla v17.8h, v24.8h, v2.h[3]\n"
+ "fmla v21.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x9, #0x30]\n"
+ "fmla v10.8h, v25.8h, v0.h[3]\n"
+ "fmla v14.8h, v25.8h, v1.h[3]\n"
+ "fmla v18.8h, v25.8h, v2.h[3]\n"
+ "fmla v22.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x12, #0x40]\n"
+ "fmla v11.8h, v24.8h, v0.h[3]\n"
+ "fmla v15.8h, v24.8h, v1.h[3]\n"
+ "fmla v19.8h, v24.8h, v2.h[3]\n"
+ "fmla v23.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x11, #0x40]\n"
+ "fmla v8.8h, v25.8h, v0.h[4]\n"
+ "fmla v12.8h, v25.8h, v1.h[4]\n"
+ "fmla v16.8h, v25.8h, v2.h[4]\n"
+ "fmla v20.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "fmla v9.8h, v24.8h, v0.h[4]\n"
+ "fmla v13.8h, v24.8h, v1.h[4]\n"
+ "fmla v17.8h, v24.8h, v2.h[4]\n"
+ "fmla v21.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x9, #0x40]\n"
+ "fmla v10.8h, v25.8h, v0.h[4]\n"
+ "fmla v14.8h, v25.8h, v1.h[4]\n"
+ "fmla v18.8h, v25.8h, v2.h[4]\n"
+ "fmla v22.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x12, #0x50]\n"
+ "fmla v11.8h, v24.8h, v0.h[4]\n"
+ "fmla v15.8h, v24.8h, v1.h[4]\n"
+ "fmla v19.8h, v24.8h, v2.h[4]\n"
+ "fmla v23.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x11, #0x50]\n"
+ "fmla v8.8h, v25.8h, v0.h[5]\n"
+ "fmla v12.8h, v25.8h, v1.h[5]\n"
+ "fmla v16.8h, v25.8h, v2.h[5]\n"
+ "fmla v20.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x10, #0x50]\n"
+ "fmla v9.8h, v24.8h, v0.h[5]\n"
+ "fmla v13.8h, v24.8h, v1.h[5]\n"
+ "fmla v17.8h, v24.8h, v2.h[5]\n"
+ "fmla v21.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x9, #0x50]\n"
+ "fmla v10.8h, v25.8h, v0.h[5]\n"
+ "fmla v14.8h, v25.8h, v1.h[5]\n"
+ "fmla v18.8h, v25.8h, v2.h[5]\n"
+ "fmla v22.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x12, #0x60]\n"
+ "fmla v11.8h, v24.8h, v0.h[5]\n"
+ "fmla v15.8h, v24.8h, v1.h[5]\n"
+ "fmla v19.8h, v24.8h, v2.h[5]\n"
+ "fmla v23.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x11, #0x60]\n"
+ "fmla v8.8h, v25.8h, v0.h[6]\n"
+ "fmla v12.8h, v25.8h, v1.h[6]\n"
+ "fmla v16.8h, v25.8h, v2.h[6]\n"
+ "fmla v20.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x10, #0x60]\n"
+ "fmla v9.8h, v24.8h, v0.h[6]\n"
+ "fmla v13.8h, v24.8h, v1.h[6]\n"
+ "fmla v17.8h, v24.8h, v2.h[6]\n"
+ "fmla v21.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x9, #0x60]\n"
+ "fmla v10.8h, v25.8h, v0.h[6]\n"
+ "fmla v14.8h, v25.8h, v1.h[6]\n"
+ "fmla v18.8h, v25.8h, v2.h[6]\n"
+ "fmla v22.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x12, #0x70]\n"
+ "add x12, x12, #0x80\n"
+ "fmla v11.8h, v24.8h, v0.h[6]\n"
+ "fmla v15.8h, v24.8h, v1.h[6]\n"
+ "fmla v19.8h, v24.8h, v2.h[6]\n"
+ "fmla v23.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x11, #0x70]\n"
+ "add x11, x11, #0x80\n"
+ "fmla v8.8h, v25.8h, v0.h[7]\n"
+ "fmla v12.8h, v25.8h, v1.h[7]\n"
+ "fmla v16.8h, v25.8h, v2.h[7]\n"
+ "fmla v20.8h, v25.8h, v3.h[7]\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ "fmla v9.8h, v24.8h, v0.h[7]\n"
+ "fmla v13.8h, v24.8h, v1.h[7]\n"
+ "fmla v17.8h, v24.8h, v2.h[7]\n"
+ "fmla v21.8h, v24.8h, v3.h[7]\n"
+ "ldr q24, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ "fmla v10.8h, v25.8h, v0.h[7]\n"
+ "fmla v14.8h, v25.8h, v1.h[7]\n"
+ "fmla v18.8h, v25.8h, v2.h[7]\n"
+ "fmla v22.8h, v25.8h, v3.h[7]\n"
+ "fmla v11.8h, v24.8h, v0.h[7]\n"
+ "fmla v15.8h, v24.8h, v1.h[7]\n"
+ "fmla v19.8h, v24.8h, v2.h[7]\n"
+ "fmla v23.8h, v24.8h, v3.h[7]\n"
+ "179:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 181f\n"
+ "180:" // Height 4: Multiply loop: Odd block loop
+ "ldr h3, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "sub x27, x27, #0x1\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h0, [x23], #0x2\n"
+ "ldr q25, [x12, #0x0]\n"
+ "ldr q24, [x11, #0x0]\n"
+ "fmla v8.8h, v25.8h, v3.h[0]\n"
+ "fmla v12.8h, v25.8h, v2.h[0]\n"
+ "fmla v16.8h, v25.8h, v1.h[0]\n"
+ "fmla v20.8h, v25.8h, v0.h[0]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v9.8h, v24.8h, v3.h[0]\n"
+ "fmla v13.8h, v24.8h, v2.h[0]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "fmla v17.8h, v24.8h, v1.h[0]\n"
+ "fmla v21.8h, v24.8h, v0.h[0]\n"
+ "ldr q24, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v10.8h, v25.8h, v3.h[0]\n"
+ "fmla v14.8h, v25.8h, v2.h[0]\n"
+ "fmla v18.8h, v25.8h, v1.h[0]\n"
+ "fmla v22.8h, v25.8h, v0.h[0]\n"
+ "fmla v11.8h, v24.8h, v3.h[0]\n"
+ "fmla v15.8h, v24.8h, v2.h[0]\n"
+ "fmla v19.8h, v24.8h, v1.h[0]\n"
+ "fmla v23.8h, v24.8h, v0.h[0]\n"
+ "cbnz x27, 180b\n"
+ "181:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 174b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "tbz %x[flags], #1, 182f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v25.8h }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v25.8h\n"
+ "fmin v9.8h, v9.8h, v25.8h\n"
+ "fmin v10.8h, v10.8h, v25.8h\n"
+ "fmin v11.8h, v11.8h, v25.8h\n"
+ "fmin v12.8h, v12.8h, v25.8h\n"
+ "fmin v13.8h, v13.8h, v25.8h\n"
+ "fmin v14.8h, v14.8h, v25.8h\n"
+ "fmin v15.8h, v15.8h, v25.8h\n"
+ "fmin v16.8h, v16.8h, v25.8h\n"
+ "fmin v17.8h, v17.8h, v25.8h\n"
+ "fmin v18.8h, v18.8h, v25.8h\n"
+ "fmin v19.8h, v19.8h, v25.8h\n"
+ "fmin v20.8h, v20.8h, v25.8h\n"
+ "fmin v21.8h, v21.8h, v25.8h\n"
+ "fmin v22.8h, v22.8h, v25.8h\n"
+ "fmin v23.8h, v23.8h, v25.8h\n"
+ "fmax v8.8h, v8.8h, v24.8h\n"
+ "fmax v9.8h, v9.8h, v24.8h\n"
+ "fmax v10.8h, v10.8h, v24.8h\n"
+ "fmax v11.8h, v11.8h, v24.8h\n"
+ "fmax v12.8h, v12.8h, v24.8h\n"
+ "fmax v13.8h, v13.8h, v24.8h\n"
+ "fmax v14.8h, v14.8h, v24.8h\n"
+ "fmax v15.8h, v15.8h, v24.8h\n"
+ "fmax v16.8h, v16.8h, v24.8h\n"
+ "fmax v17.8h, v17.8h, v24.8h\n"
+ "fmax v18.8h, v18.8h, v24.8h\n"
+ "fmax v19.8h, v19.8h, v24.8h\n"
+ "fmax v20.8h, v20.8h, v24.8h\n"
+ "fmax v21.8h, v21.8h, v24.8h\n"
+ "fmax v22.8h, v22.8h, v24.8h\n"
+ "fmax v23.8h, v23.8h, v24.8h\n"
+ "182:" // Height 4: No activation
+ "cmp x14, #0x20\n"
+ "bge 199f\n"
+ "tbz x14, #4, 190f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "tbz x14, #3, 186f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "tbz x14, #2, 184f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "tbz x14, #1, 183f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "tbz x14, #0, 198f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "b 198f\n"
+ "183:" // Height 4: Partial direct writeback: partial_1_28
+ "tbz x14, #0, 198f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "b 198f\n"
+ "184:" // Height 4: Partial direct writeback: partial_2_24
+ "tbz x14, #1, 185f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "tbz x14, #0, 198f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "b 198f\n"
+ "185:" // Height 4: Partial direct writeback: partial_1_24
+ "tbz x14, #0, 198f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "b 198f\n"
+ "186:" // Height 4: Partial direct writeback: partial_4_16
+ "tbz x14, #2, 188f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "tbz x14, #1, 187f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "tbz x14, #0, 198f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "b 198f\n"
+ "187:" // Height 4: Partial direct writeback: partial_1_20
+ "tbz x14, #0, 198f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "b 198f\n"
+ "188:" // Height 4: Partial direct writeback: partial_2_16
+ "tbz x14, #1, 189f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "tbz x14, #0, 198f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "b 198f\n"
+ "189:" // Height 4: Partial direct writeback: partial_1_16
+ "tbz x14, #0, 198f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "b 198f\n"
+ "190:" // Height 4: Partial direct writeback: partial_8_0
+ "tbz x14, #3, 194f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "tbz x14, #2, 192f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "tbz x14, #1, 191f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "tbz x14, #0, 198f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "b 198f\n"
+ "191:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 198f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "b 198f\n"
+ "192:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 193f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "tbz x14, #0, 198f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "b 198f\n"
+ "193:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 198f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "b 198f\n"
+ "194:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 196f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "tbz x14, #1, 195f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "tbz x14, #0, 198f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "b 198f\n"
+ "195:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 198f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "b 198f\n"
+ "196:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 197f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "tbz x14, #0, 198f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "b 198f\n"
+ "197:" // Height 4: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "198:" // Height 4: Partial direct writeback: Done
+ "b 200f\n"
+ "199:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "200:" // Height 4: Writeback done
+ "subs x14, x14, #0x20\n"
+ "bgt 152b\n"
+ "b 302f\n"
+ "201:" // Height 5
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "202:" // Height 5: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x18\n"
+ "bgt 203f\n"
+ "cmp x14, #0x10\n"
+ "mov x9, x12\n"
+ "bgt 203f\n"
+ "cmp x14, #0x8\n"
+ "mov x10, x12\n"
+ "bgt 203f\n"
+ "mov x11, x12\n"
+ "203:" // Height 5: B setup done
+ "cbz x15, 204f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "mov v12.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "b 223f\n"
+ "204:" // Height 5: no bias
+ "tbz %x[flags], #0, 222f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "cmp x14, #0x20\n"
+ "add x22, x23, x20, LSL #1\n"
+ "bge 221f\n"
+ "tbz x14, #4, 212f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "ld1 { v25.8h }, [x22], #0x10\n"
+ "tbz x14, #3, 208f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "ld1 { v26.8h }, [x22], #0x10\n"
+ "tbz x14, #2, 206f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "tbz x14, #1, 205f\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "mov x20, #0x3c\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
+ "tbz x14, #0, 220f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v27.h }[6], [x22]\n"
+ "b 220f\n"
+ "205:" // Height 5: Partial accumulate: partial_1_28
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 220f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v27.h }[4], [x22]\n"
+ "b 220f\n"
+ "206:" // Height 5: Partial accumulate: partial_2_24
+ "tbz x14, #1, 207f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x20, #0x34\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
+ "tbz x14, #0, 220f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v27.h }[2], [x22]\n"
+ "b 220f\n"
+ "207:" // Height 5: Partial accumulate: partial_1_24
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 220f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h27, [x22, #0x0]\n"
+ "b 220f\n"
+ "208:" // Height 5: Partial accumulate: partial_4_16
+ "tbz x14, #2, 210f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "tbz x14, #1, 209f\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "mov x20, #0x2c\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
+ "tbz x14, #0, 220f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v26.h }[6], [x22]\n"
+ "b 220f\n"
+ "209:" // Height 5: Partial accumulate: partial_1_20
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 220f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v26.h }[4], [x22]\n"
+ "b 220f\n"
+ "210:" // Height 5: Partial accumulate: partial_2_16
+ "tbz x14, #1, 211f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x20, #0x24\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
+ "tbz x14, #0, 220f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
+ "b 220f\n"
+ "211:" // Height 5: Partial accumulate: partial_1_16
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 220f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
+ "b 220f\n"
+ "212:" // Height 5: Partial accumulate: partial_8_0
+ "tbz x14, #3, 216f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "tbz x14, #2, 214f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "tbz x14, #1, 213f\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "mov x20, #0x1c\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "tbz x14, #0, 220f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
+ "b 220f\n"
+ "213:" // Height 5: Partial accumulate: partial_1_12
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 220f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
+ "b 220f\n"
+ "214:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x14, #1, 215f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x20, #0x14\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "tbz x14, #0, 220f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
+ "b 220f\n"
+ "215:" // Height 5: Partial accumulate: partial_1_8
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 220f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
+ "b 220f\n"
+ "216:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x14, #2, 218f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "tbz x14, #1, 217f\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "mov x20, #0xc\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x14, #0, 220f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "b 220f\n"
+ "217:" // Height 5: Partial accumulate: partial_1_4
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 220f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "b 220f\n"
+ "218:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x14, #1, 219f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x20, #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "tbz x14, #0, 220f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "b 220f\n"
+ "219:" // Height 5: Partial accumulate: partial_1_0
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "220:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 223f\n"
+ "221:" // Height 5: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "b 223f\n"
+ "222:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "223:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "224:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 225f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 226f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "b 226f\n"
+ "225:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "226:" // Height 5: input setup done
+ "cmp x27, #0x8\n"
+ "blt 229f\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "blt 228f\n"
+ "227:" // Height 5: Multiply loop: Main loop head
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x10\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "ldr q28, [x9, #0x0]\n"
+ "fmla v10.8h, v29.8h, v0.h[0]\n"
+ "fmla v14.8h, v29.8h, v1.h[0]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v3.h[0]\n"
+ "fmla v26.8h, v29.8h, v4.h[0]\n"
+ "ldr q29, [x12, #0x10]\n"
+ "fmla v11.8h, v28.8h, v0.h[0]\n"
+ "fmla v15.8h, v28.8h, v1.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v3.h[0]\n"
+ "fmla v27.8h, v28.8h, v4.h[0]\n"
+ "ldr q28, [x11, #0x10]\n"
+ "fmla v8.8h, v29.8h, v0.h[1]\n"
+ "fmla v12.8h, v29.8h, v1.h[1]\n"
+ "fmla v16.8h, v29.8h, v2.h[1]\n"
+ "fmla v20.8h, v29.8h, v3.h[1]\n"
+ "fmla v24.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "fmla v9.8h, v28.8h, v0.h[1]\n"
+ "fmla v13.8h, v28.8h, v1.h[1]\n"
+ "fmla v17.8h, v28.8h, v2.h[1]\n"
+ "fmla v21.8h, v28.8h, v3.h[1]\n"
+ "fmla v25.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x9, #0x10]\n"
+ "fmla v10.8h, v29.8h, v0.h[1]\n"
+ "fmla v14.8h, v29.8h, v1.h[1]\n"
+ "fmla v18.8h, v29.8h, v2.h[1]\n"
+ "fmla v22.8h, v29.8h, v3.h[1]\n"
+ "fmla v26.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x12, #0x20]\n"
+ "fmla v11.8h, v28.8h, v0.h[1]\n"
+ "fmla v15.8h, v28.8h, v1.h[1]\n"
+ "fmla v19.8h, v28.8h, v2.h[1]\n"
+ "fmla v23.8h, v28.8h, v3.h[1]\n"
+ "fmla v27.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x11, #0x20]\n"
+ "fmla v8.8h, v29.8h, v0.h[2]\n"
+ "fmla v12.8h, v29.8h, v1.h[2]\n"
+ "fmla v16.8h, v29.8h, v2.h[2]\n"
+ "fmla v20.8h, v29.8h, v3.h[2]\n"
+ "fmla v24.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "fmla v9.8h, v28.8h, v0.h[2]\n"
+ "fmla v13.8h, v28.8h, v1.h[2]\n"
+ "fmla v17.8h, v28.8h, v2.h[2]\n"
+ "fmla v21.8h, v28.8h, v3.h[2]\n"
+ "fmla v25.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x9, #0x20]\n"
+ "fmla v10.8h, v29.8h, v0.h[2]\n"
+ "fmla v14.8h, v29.8h, v1.h[2]\n"
+ "fmla v18.8h, v29.8h, v2.h[2]\n"
+ "fmla v22.8h, v29.8h, v3.h[2]\n"
+ "fmla v26.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x12, #0x30]\n"
+ "fmla v11.8h, v28.8h, v0.h[2]\n"
+ "fmla v15.8h, v28.8h, v1.h[2]\n"
+ "fmla v19.8h, v28.8h, v2.h[2]\n"
+ "fmla v23.8h, v28.8h, v3.h[2]\n"
+ "fmla v27.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x11, #0x30]\n"
+ "fmla v8.8h, v29.8h, v0.h[3]\n"
+ "fmla v12.8h, v29.8h, v1.h[3]\n"
+ "fmla v16.8h, v29.8h, v2.h[3]\n"
+ "fmla v20.8h, v29.8h, v3.h[3]\n"
+ "fmla v24.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0x30]\n"
+ "fmla v9.8h, v28.8h, v0.h[3]\n"
+ "fmla v13.8h, v28.8h, v1.h[3]\n"
+ "fmla v17.8h, v28.8h, v2.h[3]\n"
+ "fmla v21.8h, v28.8h, v3.h[3]\n"
+ "fmla v25.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x9, #0x30]\n"
+ "fmla v10.8h, v29.8h, v0.h[3]\n"
+ "fmla v14.8h, v29.8h, v1.h[3]\n"
+ "fmla v18.8h, v29.8h, v2.h[3]\n"
+ "fmla v22.8h, v29.8h, v3.h[3]\n"
+ "fmla v26.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x12, #0x40]\n"
+ "fmla v11.8h, v28.8h, v0.h[3]\n"
+ "fmla v15.8h, v28.8h, v1.h[3]\n"
+ "fmla v19.8h, v28.8h, v2.h[3]\n"
+ "fmla v23.8h, v28.8h, v3.h[3]\n"
+ "fmla v27.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x11, #0x40]\n"
+ "fmla v8.8h, v29.8h, v0.h[4]\n"
+ "fmla v12.8h, v29.8h, v1.h[4]\n"
+ "fmla v16.8h, v29.8h, v2.h[4]\n"
+ "fmla v20.8h, v29.8h, v3.h[4]\n"
+ "fmla v24.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x10, #0x40]\n"
+ "fmla v9.8h, v28.8h, v0.h[4]\n"
+ "fmla v13.8h, v28.8h, v1.h[4]\n"
+ "fmla v17.8h, v28.8h, v2.h[4]\n"
+ "fmla v21.8h, v28.8h, v3.h[4]\n"
+ "fmla v25.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x9, #0x40]\n"
+ "fmla v10.8h, v29.8h, v0.h[4]\n"
+ "fmla v14.8h, v29.8h, v1.h[4]\n"
+ "fmla v18.8h, v29.8h, v2.h[4]\n"
+ "fmla v22.8h, v29.8h, v3.h[4]\n"
+ "fmla v26.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x12, #0x50]\n"
+ "fmla v11.8h, v28.8h, v0.h[4]\n"
+ "fmla v15.8h, v28.8h, v1.h[4]\n"
+ "fmla v19.8h, v28.8h, v2.h[4]\n"
+ "fmla v23.8h, v28.8h, v3.h[4]\n"
+ "fmla v27.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x11, #0x50]\n"
+ "fmla v8.8h, v29.8h, v0.h[5]\n"
+ "fmla v12.8h, v29.8h, v1.h[5]\n"
+ "fmla v16.8h, v29.8h, v2.h[5]\n"
+ "fmla v20.8h, v29.8h, v3.h[5]\n"
+ "fmla v24.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x10, #0x50]\n"
+ "fmla v9.8h, v28.8h, v0.h[5]\n"
+ "fmla v13.8h, v28.8h, v1.h[5]\n"
+ "fmla v17.8h, v28.8h, v2.h[5]\n"
+ "fmla v21.8h, v28.8h, v3.h[5]\n"
+ "fmla v25.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x9, #0x50]\n"
+ "fmla v10.8h, v29.8h, v0.h[5]\n"
+ "fmla v14.8h, v29.8h, v1.h[5]\n"
+ "fmla v18.8h, v29.8h, v2.h[5]\n"
+ "fmla v22.8h, v29.8h, v3.h[5]\n"
+ "fmla v26.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x12, #0x60]\n"
+ "fmla v11.8h, v28.8h, v0.h[5]\n"
+ "fmla v15.8h, v28.8h, v1.h[5]\n"
+ "fmla v19.8h, v28.8h, v2.h[5]\n"
+ "fmla v23.8h, v28.8h, v3.h[5]\n"
+ "fmla v27.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x11, #0x60]\n"
+ "fmla v8.8h, v29.8h, v0.h[6]\n"
+ "fmla v12.8h, v29.8h, v1.h[6]\n"
+ "fmla v16.8h, v29.8h, v2.h[6]\n"
+ "fmla v20.8h, v29.8h, v3.h[6]\n"
+ "fmla v24.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x10, #0x60]\n"
+ "fmla v9.8h, v28.8h, v0.h[6]\n"
+ "fmla v13.8h, v28.8h, v1.h[6]\n"
+ "fmla v17.8h, v28.8h, v2.h[6]\n"
+ "fmla v21.8h, v28.8h, v3.h[6]\n"
+ "fmla v25.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x9, #0x60]\n"
+ "fmla v10.8h, v29.8h, v0.h[6]\n"
+ "fmla v14.8h, v29.8h, v1.h[6]\n"
+ "fmla v18.8h, v29.8h, v2.h[6]\n"
+ "fmla v22.8h, v29.8h, v3.h[6]\n"
+ "fmla v26.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x12, #0x70]\n"
+ "fmla v11.8h, v28.8h, v0.h[6]\n"
+ "add x12, x12, #0x80\n"
+ "fmla v15.8h, v28.8h, v1.h[6]\n"
+ "fmla v19.8h, v28.8h, v2.h[6]\n"
+ "fmla v23.8h, v28.8h, v3.h[6]\n"
+ "fmla v27.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x11, #0x70]\n"
+ "add x11, x11, #0x80\n"
+ "fmla v8.8h, v29.8h, v0.h[7]\n"
+ "fmla v12.8h, v29.8h, v1.h[7]\n"
+ "fmla v16.8h, v29.8h, v2.h[7]\n"
+ "fmla v20.8h, v29.8h, v3.h[7]\n"
+ "fmla v24.8h, v29.8h, v4.h[7]\n"
+ "ldr q29, [x10, #0x70]\n"
+ "fmla v9.8h, v28.8h, v0.h[7]\n"
+ "add x10, x10, #0x80\n"
+ "fmla v13.8h, v28.8h, v1.h[7]\n"
+ "fmla v17.8h, v28.8h, v2.h[7]\n"
+ "fmla v21.8h, v28.8h, v3.h[7]\n"
+ "fmla v25.8h, v28.8h, v4.h[7]\n"
+ "ldr q28, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ "fmla v10.8h, v29.8h, v0.h[7]\n"
+ "fmla v14.8h, v29.8h, v1.h[7]\n"
+ "fmla v18.8h, v29.8h, v2.h[7]\n"
+ "fmla v22.8h, v29.8h, v3.h[7]\n"
+ "fmla v26.8h, v29.8h, v4.h[7]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "fmla v11.8h, v28.8h, v0.h[7]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.8h, v28.8h, v1.h[7]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.8h, v28.8h, v2.h[7]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "fmla v23.8h, v28.8h, v3.h[7]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "fmla v27.8h, v28.8h, v4.h[7]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "bge 227b\n"
+ "228:" // Height 5: Multiply loop: Single iteration only
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "ldr q28, [x9, #0x0]\n"
+ "fmla v10.8h, v29.8h, v0.h[0]\n"
+ "fmla v14.8h, v29.8h, v1.h[0]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v3.h[0]\n"
+ "fmla v26.8h, v29.8h, v4.h[0]\n"
+ "ldr q29, [x12, #0x10]\n"
+ "fmla v11.8h, v28.8h, v0.h[0]\n"
+ "fmla v15.8h, v28.8h, v1.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v3.h[0]\n"
+ "fmla v27.8h, v28.8h, v4.h[0]\n"
+ "ldr q28, [x11, #0x10]\n"
+ "fmla v8.8h, v29.8h, v0.h[1]\n"
+ "fmla v12.8h, v29.8h, v1.h[1]\n"
+ "fmla v16.8h, v29.8h, v2.h[1]\n"
+ "fmla v20.8h, v29.8h, v3.h[1]\n"
+ "fmla v24.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "fmla v9.8h, v28.8h, v0.h[1]\n"
+ "fmla v13.8h, v28.8h, v1.h[1]\n"
+ "fmla v17.8h, v28.8h, v2.h[1]\n"
+ "fmla v21.8h, v28.8h, v3.h[1]\n"
+ "fmla v25.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x9, #0x10]\n"
+ "fmla v10.8h, v29.8h, v0.h[1]\n"
+ "fmla v14.8h, v29.8h, v1.h[1]\n"
+ "fmla v18.8h, v29.8h, v2.h[1]\n"
+ "fmla v22.8h, v29.8h, v3.h[1]\n"
+ "fmla v26.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x12, #0x20]\n"
+ "fmla v11.8h, v28.8h, v0.h[1]\n"
+ "fmla v15.8h, v28.8h, v1.h[1]\n"
+ "fmla v19.8h, v28.8h, v2.h[1]\n"
+ "fmla v23.8h, v28.8h, v3.h[1]\n"
+ "fmla v27.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x11, #0x20]\n"
+ "fmla v8.8h, v29.8h, v0.h[2]\n"
+ "fmla v12.8h, v29.8h, v1.h[2]\n"
+ "fmla v16.8h, v29.8h, v2.h[2]\n"
+ "fmla v20.8h, v29.8h, v3.h[2]\n"
+ "fmla v24.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "fmla v9.8h, v28.8h, v0.h[2]\n"
+ "fmla v13.8h, v28.8h, v1.h[2]\n"
+ "fmla v17.8h, v28.8h, v2.h[2]\n"
+ "fmla v21.8h, v28.8h, v3.h[2]\n"
+ "fmla v25.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x9, #0x20]\n"
+ "fmla v10.8h, v29.8h, v0.h[2]\n"
+ "fmla v14.8h, v29.8h, v1.h[2]\n"
+ "fmla v18.8h, v29.8h, v2.h[2]\n"
+ "fmla v22.8h, v29.8h, v3.h[2]\n"
+ "fmla v26.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x12, #0x30]\n"
+ "fmla v11.8h, v28.8h, v0.h[2]\n"
+ "fmla v15.8h, v28.8h, v1.h[2]\n"
+ "fmla v19.8h, v28.8h, v2.h[2]\n"
+ "fmla v23.8h, v28.8h, v3.h[2]\n"
+ "fmla v27.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x11, #0x30]\n"
+ "fmla v8.8h, v29.8h, v0.h[3]\n"
+ "fmla v12.8h, v29.8h, v1.h[3]\n"
+ "fmla v16.8h, v29.8h, v2.h[3]\n"
+ "fmla v20.8h, v29.8h, v3.h[3]\n"
+ "fmla v24.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0x30]\n"
+ "fmla v9.8h, v28.8h, v0.h[3]\n"
+ "fmla v13.8h, v28.8h, v1.h[3]\n"
+ "fmla v17.8h, v28.8h, v2.h[3]\n"
+ "fmla v21.8h, v28.8h, v3.h[3]\n"
+ "fmla v25.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x9, #0x30]\n"
+ "fmla v10.8h, v29.8h, v0.h[3]\n"
+ "fmla v14.8h, v29.8h, v1.h[3]\n"
+ "fmla v18.8h, v29.8h, v2.h[3]\n"
+ "fmla v22.8h, v29.8h, v3.h[3]\n"
+ "fmla v26.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x12, #0x40]\n"
+ "fmla v11.8h, v28.8h, v0.h[3]\n"
+ "fmla v15.8h, v28.8h, v1.h[3]\n"
+ "fmla v19.8h, v28.8h, v2.h[3]\n"
+ "fmla v23.8h, v28.8h, v3.h[3]\n"
+ "fmla v27.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x11, #0x40]\n"
+ "fmla v8.8h, v29.8h, v0.h[4]\n"
+ "fmla v12.8h, v29.8h, v1.h[4]\n"
+ "fmla v16.8h, v29.8h, v2.h[4]\n"
+ "fmla v20.8h, v29.8h, v3.h[4]\n"
+ "fmla v24.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x10, #0x40]\n"
+ "fmla v9.8h, v28.8h, v0.h[4]\n"
+ "fmla v13.8h, v28.8h, v1.h[4]\n"
+ "fmla v17.8h, v28.8h, v2.h[4]\n"
+ "fmla v21.8h, v28.8h, v3.h[4]\n"
+ "fmla v25.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x9, #0x40]\n"
+ "fmla v10.8h, v29.8h, v0.h[4]\n"
+ "fmla v14.8h, v29.8h, v1.h[4]\n"
+ "fmla v18.8h, v29.8h, v2.h[4]\n"
+ "fmla v22.8h, v29.8h, v3.h[4]\n"
+ "fmla v26.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x12, #0x50]\n"
+ "fmla v11.8h, v28.8h, v0.h[4]\n"
+ "fmla v15.8h, v28.8h, v1.h[4]\n"
+ "fmla v19.8h, v28.8h, v2.h[4]\n"
+ "fmla v23.8h, v28.8h, v3.h[4]\n"
+ "fmla v27.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x11, #0x50]\n"
+ "fmla v8.8h, v29.8h, v0.h[5]\n"
+ "fmla v12.8h, v29.8h, v1.h[5]\n"
+ "fmla v16.8h, v29.8h, v2.h[5]\n"
+ "fmla v20.8h, v29.8h, v3.h[5]\n"
+ "fmla v24.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x10, #0x50]\n"
+ "fmla v9.8h, v28.8h, v0.h[5]\n"
+ "fmla v13.8h, v28.8h, v1.h[5]\n"
+ "fmla v17.8h, v28.8h, v2.h[5]\n"
+ "fmla v21.8h, v28.8h, v3.h[5]\n"
+ "fmla v25.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x9, #0x50]\n"
+ "fmla v10.8h, v29.8h, v0.h[5]\n"
+ "fmla v14.8h, v29.8h, v1.h[5]\n"
+ "fmla v18.8h, v29.8h, v2.h[5]\n"
+ "fmla v22.8h, v29.8h, v3.h[5]\n"
+ "fmla v26.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x12, #0x60]\n"
+ "fmla v11.8h, v28.8h, v0.h[5]\n"
+ "fmla v15.8h, v28.8h, v1.h[5]\n"
+ "fmla v19.8h, v28.8h, v2.h[5]\n"
+ "fmla v23.8h, v28.8h, v3.h[5]\n"
+ "fmla v27.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x11, #0x60]\n"
+ "fmla v8.8h, v29.8h, v0.h[6]\n"
+ "fmla v12.8h, v29.8h, v1.h[6]\n"
+ "fmla v16.8h, v29.8h, v2.h[6]\n"
+ "fmla v20.8h, v29.8h, v3.h[6]\n"
+ "fmla v24.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x10, #0x60]\n"
+ "fmla v9.8h, v28.8h, v0.h[6]\n"
+ "fmla v13.8h, v28.8h, v1.h[6]\n"
+ "fmla v17.8h, v28.8h, v2.h[6]\n"
+ "fmla v21.8h, v28.8h, v3.h[6]\n"
+ "fmla v25.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x9, #0x60]\n"
+ "fmla v10.8h, v29.8h, v0.h[6]\n"
+ "fmla v14.8h, v29.8h, v1.h[6]\n"
+ "fmla v18.8h, v29.8h, v2.h[6]\n"
+ "fmla v22.8h, v29.8h, v3.h[6]\n"
+ "fmla v26.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x12, #0x70]\n"
+ "fmla v11.8h, v28.8h, v0.h[6]\n"
+ "add x12, x12, #0x80\n"
+ "fmla v15.8h, v28.8h, v1.h[6]\n"
+ "fmla v19.8h, v28.8h, v2.h[6]\n"
+ "fmla v23.8h, v28.8h, v3.h[6]\n"
+ "fmla v27.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x11, #0x70]\n"
+ "add x11, x11, #0x80\n"
+ "fmla v8.8h, v29.8h, v0.h[7]\n"
+ "fmla v12.8h, v29.8h, v1.h[7]\n"
+ "fmla v16.8h, v29.8h, v2.h[7]\n"
+ "fmla v20.8h, v29.8h, v3.h[7]\n"
+ "fmla v24.8h, v29.8h, v4.h[7]\n"
+ "ldr q29, [x10, #0x70]\n"
+ "fmla v9.8h, v28.8h, v0.h[7]\n"
+ "add x10, x10, #0x80\n"
+ "fmla v13.8h, v28.8h, v1.h[7]\n"
+ "fmla v17.8h, v28.8h, v2.h[7]\n"
+ "fmla v21.8h, v28.8h, v3.h[7]\n"
+ "fmla v25.8h, v28.8h, v4.h[7]\n"
+ "ldr q28, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ "fmla v10.8h, v29.8h, v0.h[7]\n"
+ "fmla v14.8h, v29.8h, v1.h[7]\n"
+ "fmla v18.8h, v29.8h, v2.h[7]\n"
+ "fmla v22.8h, v29.8h, v3.h[7]\n"
+ "fmla v26.8h, v29.8h, v4.h[7]\n"
+ "fmla v11.8h, v28.8h, v0.h[7]\n"
+ "fmla v15.8h, v28.8h, v1.h[7]\n"
+ "fmla v19.8h, v28.8h, v2.h[7]\n"
+ "fmla v23.8h, v28.8h, v3.h[7]\n"
+ "fmla v27.8h, v28.8h, v4.h[7]\n"
+ "229:" // Height 5: Multiply loop: Main loop skip
+ "cbz x27, 231f\n"
+ "230:" // Height 5: Multiply loop: Odd block loop
+ "ldr h4, [x26], #0x2\n"
+ "ldr h3, [x25], #0x2\n"
+ "sub x27, x27, #0x1\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h0, [x22], #0x2\n"
+ "ldr q29, [x12, #0x0]\n"
+ "fmla v8.8h, v29.8h, v4.h[0]\n"
+ "fmla v12.8h, v29.8h, v3.h[0]\n"
+ "ldr q28, [x11, #0x0]\n"
+ "fmla v16.8h, v29.8h, v2.h[0]\n"
+ "fmla v20.8h, v29.8h, v1.h[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v24.8h, v29.8h, v0.h[0]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "fmla v9.8h, v28.8h, v4.h[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v13.8h, v28.8h, v3.h[0]\n"
+ "fmla v17.8h, v28.8h, v2.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v21.8h, v28.8h, v1.h[0]\n"
+ "fmla v25.8h, v28.8h, v0.h[0]\n"
+ "ldr q28, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v10.8h, v29.8h, v4.h[0]\n"
+ "fmla v14.8h, v29.8h, v3.h[0]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v1.h[0]\n"
+ "fmla v26.8h, v29.8h, v0.h[0]\n"
+ "fmla v11.8h, v28.8h, v4.h[0]\n"
+ "fmla v15.8h, v28.8h, v3.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v1.h[0]\n"
+ "fmla v27.8h, v28.8h, v0.h[0]\n"
+ "cbnz x27, 230b\n"
+ "231:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 224b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "tbz %x[flags], #1, 232f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v29.8h }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v28.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v29.8h\n"
+ "fmin v9.8h, v9.8h, v29.8h\n"
+ "fmin v10.8h, v10.8h, v29.8h\n"
+ "fmin v11.8h, v11.8h, v29.8h\n"
+ "fmin v12.8h, v12.8h, v29.8h\n"
+ "fmin v13.8h, v13.8h, v29.8h\n"
+ "fmin v14.8h, v14.8h, v29.8h\n"
+ "fmin v15.8h, v15.8h, v29.8h\n"
+ "fmin v16.8h, v16.8h, v29.8h\n"
+ "fmin v17.8h, v17.8h, v29.8h\n"
+ "fmin v18.8h, v18.8h, v29.8h\n"
+ "fmin v19.8h, v19.8h, v29.8h\n"
+ "fmin v20.8h, v20.8h, v29.8h\n"
+ "fmin v21.8h, v21.8h, v29.8h\n"
+ "fmin v22.8h, v22.8h, v29.8h\n"
+ "fmin v23.8h, v23.8h, v29.8h\n"
+ "fmin v24.8h, v24.8h, v29.8h\n"
+ "fmin v25.8h, v25.8h, v29.8h\n"
+ "fmin v26.8h, v26.8h, v29.8h\n"
+ "fmin v27.8h, v27.8h, v29.8h\n"
+ "fmax v8.8h, v8.8h, v28.8h\n"
+ "fmax v9.8h, v9.8h, v28.8h\n"
+ "fmax v10.8h, v10.8h, v28.8h\n"
+ "fmax v11.8h, v11.8h, v28.8h\n"
+ "fmax v12.8h, v12.8h, v28.8h\n"
+ "fmax v13.8h, v13.8h, v28.8h\n"
+ "fmax v14.8h, v14.8h, v28.8h\n"
+ "fmax v15.8h, v15.8h, v28.8h\n"
+ "fmax v16.8h, v16.8h, v28.8h\n"
+ "fmax v17.8h, v17.8h, v28.8h\n"
+ "fmax v18.8h, v18.8h, v28.8h\n"
+ "fmax v19.8h, v19.8h, v28.8h\n"
+ "fmax v20.8h, v20.8h, v28.8h\n"
+ "fmax v21.8h, v21.8h, v28.8h\n"
+ "fmax v22.8h, v22.8h, v28.8h\n"
+ "fmax v23.8h, v23.8h, v28.8h\n"
+ "fmax v24.8h, v24.8h, v28.8h\n"
+ "fmax v25.8h, v25.8h, v28.8h\n"
+ "fmax v26.8h, v26.8h, v28.8h\n"
+ "fmax v27.8h, v27.8h, v28.8h\n"
+ "232:" // Height 5: No activation
+ "cmp x14, #0x20\n"
+ "bge 249f\n"
+ "tbz x14, #4, 240f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v25.8h }, [x22], #0x10\n"
+ "tbz x14, #3, 236f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "st1 { v26.8h }, [x22], #0x10\n"
+ "tbz x14, #2, 234f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "tbz x14, #1, 233f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "st1 { v27.s }[2], [x22], #0x4\n"
+ "tbz x14, #0, 248f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "st1 { v27.h }[6], [x22]\n"
+ "b 248f\n"
+ "233:" // Height 5: Partial direct writeback: partial_1_28
+ "tbz x14, #0, 248f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "st1 { v27.h }[4], [x22]\n"
+ "b 248f\n"
+ "234:" // Height 5: Partial direct writeback: partial_2_24
+ "tbz x14, #1, 235f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "str s27, [x22], #0x4\n"
+ "tbz x14, #0, 248f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "st1 { v27.h }[2], [x22]\n"
+ "b 248f\n"
+ "235:" // Height 5: Partial direct writeback: partial_1_24
+ "tbz x14, #0, 248f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "str h27, [x22, #0x0]\n"
+ "b 248f\n"
+ "236:" // Height 5: Partial direct writeback: partial_4_16
+ "tbz x14, #2, 238f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "tbz x14, #1, 237f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "st1 { v26.s }[2], [x22], #0x4\n"
+ "tbz x14, #0, 248f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "st1 { v26.h }[6], [x22]\n"
+ "b 248f\n"
+ "237:" // Height 5: Partial direct writeback: partial_1_20
+ "tbz x14, #0, 248f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "st1 { v26.h }[4], [x22]\n"
+ "b 248f\n"
+ "238:" // Height 5: Partial direct writeback: partial_2_16
+ "tbz x14, #1, 239f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "str s26, [x22], #0x4\n"
+ "tbz x14, #0, 248f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "st1 { v26.h }[2], [x22]\n"
+ "b 248f\n"
+ "239:" // Height 5: Partial direct writeback: partial_1_16
+ "tbz x14, #0, 248f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "str h26, [x22, #0x0]\n"
+ "b 248f\n"
+ "240:" // Height 5: Partial direct writeback: partial_8_0
+ "tbz x14, #3, 244f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "tbz x14, #2, 242f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "tbz x14, #1, 241f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "st1 { v25.s }[2], [x22], #0x4\n"
+ "tbz x14, #0, 248f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "b 248f\n"
+ "241:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 248f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "b 248f\n"
+ "242:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 243f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "str s25, [x22], #0x4\n"
+ "tbz x14, #0, 248f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "b 248f\n"
+ "243:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 248f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "str h25, [x22, #0x0]\n"
+ "b 248f\n"
+ "244:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 246f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x14, #1, 245f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x14, #0, 248f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "st1 { v24.h }[6], [x22]\n"
+ "b 248f\n"
+ "245:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 248f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "st1 { v24.h }[4], [x22]\n"
+ "b 248f\n"
+ "246:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 247f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x14, #0, 248f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "st1 { v24.h }[2], [x22]\n"
+ "b 248f\n"
+ "247:" // Height 5: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "str h24, [x22, #0x0]\n"
+ "248:" // Height 5: Partial direct writeback: Done
+ "b 250f\n"
+ "249:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "250:" // Height 5: Writeback done
+ "subs x14, x14, #0x20\n"
+ "bgt 202b\n"
+ "b 302f\n"
+ "251:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0xc\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
+ "252:" // Height 6: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x18\n"
+ "bgt 253f\n"
+ "cmp x14, #0x10\n"
+ "mov x9, x12\n"
+ "bgt 253f\n"
+ "cmp x14, #0x8\n"
+ "mov x10, x12\n"
+ "bgt 253f\n"
+ "mov x11, x12\n"
+ "253:" // Height 6: B setup done
+ "cbz x15, 254f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "mov v12.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v28.16b, v8.16b\n"
+ "mov v29.16b, v9.16b\n"
+ "mov v30.16b, v10.16b\n"
+ "mov v31.16b, v11.16b\n"
+ "b 273f\n"
+ "254:" // Height 6: no bias
+ "tbz %x[flags], #0, 272f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "cmp x14, #0x20\n"
+ "add x21, x22, x20, LSL #1\n"
+ "bge 271f\n"
+ "tbz x14, #4, 262f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "ld1 { v25.8h }, [x22], #0x10\n"
+ "ld1 { v29.8h }, [x21], #0x10\n"
+ "tbz x14, #3, 258f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "ld1 { v26.8h }, [x22], #0x10\n"
+ "ld1 { v30.8h }, [x21], #0x10\n"
+ "tbz x14, #2, 256f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x14, #1, 255f\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "mov x20, #0x3c\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
+ "tbz x14, #0, 270f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v27.h }[6], [x22]\n"
+ "ld1 { v31.h }[6], [x21]\n"
+ "b 270f\n"
+ "255:" // Height 6: Partial accumulate: partial_1_28
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 270f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v27.h }[4], [x22]\n"
+ "ld1 { v31.h }[4], [x21]\n"
+ "b 270f\n"
+ "256:" // Height 6: Partial accumulate: partial_2_24
+ "tbz x14, #1, 257f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x20, #0x34\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
+ "tbz x14, #0, 270f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v27.h }[2], [x22]\n"
+ "ld1 { v31.h }[2], [x21]\n"
+ "b 270f\n"
+ "257:" // Height 6: Partial accumulate: partial_1_24
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 270f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h27, [x22, #0x0]\n"
+ "ldr h31, [x21, #0x0]\n"
+ "b 270f\n"
+ "258:" // Height 6: Partial accumulate: partial_4_16
+ "tbz x14, #2, 260f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x14, #1, 259f\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "mov x20, #0x2c\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
+ "ld1 { v30.s }[2], [x21], #0x4\n"
+ "tbz x14, #0, 270f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v26.h }[6], [x22]\n"
+ "ld1 { v30.h }[6], [x21]\n"
+ "b 270f\n"
+ "259:" // Height 6: Partial accumulate: partial_1_20
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 270f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v26.h }[4], [x22]\n"
+ "ld1 { v30.h }[4], [x21]\n"
+ "b 270f\n"
+ "260:" // Height 6: Partial accumulate: partial_2_16
+ "tbz x14, #1, 261f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x20, #0x24\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
+ "ldr s30, [x21], #0x4\n"
+ "tbz x14, #0, 270f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
+ "ld1 { v30.h }[2], [x21]\n"
+ "b 270f\n"
+ "261:" // Height 6: Partial accumulate: partial_1_16
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 270f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
+ "ldr h30, [x21, #0x0]\n"
+ "b 270f\n"
+ "262:" // Height 6: Partial accumulate: partial_8_0
+ "tbz x14, #3, 266f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
+ "tbz x14, #2, 264f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x14, #1, 263f\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "mov x20, #0x1c\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v29.s }[2], [x21], #0x4\n"
+ "tbz x14, #0, 270f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
+ "ld1 { v29.h }[6], [x21]\n"
+ "b 270f\n"
+ "263:" // Height 6: Partial accumulate: partial_1_12
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 270f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
+ "ld1 { v29.h }[4], [x21]\n"
+ "b 270f\n"
+ "264:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x14, #1, 265f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x20, #0x14\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "ldr s29, [x21], #0x4\n"
+ "tbz x14, #0, 270f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
+ "ld1 { v29.h }[2], [x21]\n"
+ "b 270f\n"
+ "265:" // Height 6: Partial accumulate: partial_1_8
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 270f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
+ "ldr h29, [x21, #0x0]\n"
+ "b 270f\n"
+ "266:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x14, #2, 268f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x14, #1, 267f\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "mov x20, #0xc\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x14, #0, 270f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "ld1 { v28.h }[6], [x21]\n"
+ "b 270f\n"
+ "267:" // Height 6: Partial accumulate: partial_1_4
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 270f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "ld1 { v28.h }[4], [x21]\n"
+ "b 270f\n"
+ "268:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x14, #1, 269f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x20, #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "tbz x14, #0, 270f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v28.h }[2], [x21]\n"
+ "b 270f\n"
+ "269:" // Height 6: Partial accumulate: partial_1_0
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h28, [x21, #0x0]\n"
+ "270:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 273f\n"
+ "271:" // Height 6: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 273f\n"
+ "272:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "273:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "274:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 275f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 276f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "add x21, x21, x20, LSL #1\n"
+ "b 276f\n"
+ "275:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
+ "276:" // Height 6: input setup done
+ "cmp x27, #0x8\n"
+ "blt 279f\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "blt 278f\n"
+ "277:" // Height 6: Multiply loop: Main loop head
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x10\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "fmla v28.8h, v6.8h, v5.h[0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "fmla v29.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "fmla v30.8h, v6.8h, v5.h[0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "fmla v28.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "fmla v29.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "fmla v30.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "fmla v31.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x11, #0x20]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "fmla v28.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "fmla v29.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x9, #0x20]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "fmla v30.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x12, #0x30]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "fmla v31.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x11, #0x30]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "fmla v28.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x10, #0x30]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "fmla v29.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "fmla v30.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x12, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "fmla v31.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x11, #0x40]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "fmla v28.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x10, #0x40]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "fmla v29.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x9, #0x40]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "fmla v30.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x12, #0x50]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "fmla v31.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x11, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "fmla v28.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x10, #0x50]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "fmla v29.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x9, #0x50]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "fmla v30.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x12, #0x60]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "fmla v31.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x11, #0x60]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "fmla v28.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x10, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "fmla v29.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x9, #0x60]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "fmla v30.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x12, #0x70]\n"
+ "add x12, x12, #0x80\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "fmla v31.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x11, #0x70]\n"
+ "add x11, x11, #0x80\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "fmla v28.8h, v6.8h, v5.h[7]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "fmla v29.8h, v7.8h, v5.h[7]\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v30.8h, v6.8h, v5.h[7]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "fmla v31.8h, v7.8h, v5.h[7]\n"
+ "ldr q5, [x21, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "bge 277b\n"
+ "278:" // Height 6: Multiply loop: Single iteration only
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "fmla v28.8h, v6.8h, v5.h[0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "fmla v29.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "fmla v30.8h, v6.8h, v5.h[0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "fmla v28.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "fmla v29.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "fmla v30.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "fmla v31.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x11, #0x20]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "fmla v28.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "fmla v29.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x9, #0x20]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "fmla v30.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x12, #0x30]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "fmla v31.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x11, #0x30]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "fmla v28.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x10, #0x30]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "fmla v29.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "fmla v30.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x12, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "fmla v31.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x11, #0x40]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "fmla v28.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x10, #0x40]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "fmla v29.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x9, #0x40]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "fmla v30.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x12, #0x50]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "fmla v31.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x11, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "fmla v28.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x10, #0x50]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "fmla v29.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x9, #0x50]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "fmla v30.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x12, #0x60]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "fmla v31.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x11, #0x60]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "fmla v28.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x10, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "fmla v29.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x9, #0x60]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "fmla v30.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x12, #0x70]\n"
+ "add x12, x12, #0x80\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "fmla v31.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x11, #0x70]\n"
+ "add x11, x11, #0x80\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "fmla v28.8h, v6.8h, v5.h[7]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "fmla v29.8h, v7.8h, v5.h[7]\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v30.8h, v6.8h, v5.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "fmla v31.8h, v7.8h, v5.h[7]\n"
+ "279:" // Height 6: Multiply loop: Main loop skip
+ "cbz x27, 281f\n"
+ "280:" // Height 6: Multiply loop: Odd block loop
+ "ldr h7, [x26], #0x2\n"
+ "ldr h6, [x25], #0x2\n"
+ "sub x27, x27, #0x1\n"
+ "ldr h5, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q0, [x11, #0x0]\n"
+ "fmla v8.8h, v1.8h, v7.h[0]\n"
+ "fmla v12.8h, v1.8h, v6.h[0]\n"
+ "fmla v16.8h, v1.8h, v5.h[0]\n"
+ "fmla v20.8h, v1.8h, v4.h[0]\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "fmla v24.8h, v1.8h, v3.h[0]\n"
+ "fmla v28.8h, v1.8h, v2.h[0]\n"
+ "ldr q1, [x10, #0x0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v9.8h, v0.8h, v7.h[0]\n"
+ "fmla v13.8h, v0.8h, v6.h[0]\n"
+ "fmla v17.8h, v0.8h, v5.h[0]\n"
+ "fmla v21.8h, v0.8h, v4.h[0]\n"
+ "fmla v25.8h, v0.8h, v3.h[0]\n"
+ "fmla v29.8h, v0.8h, v2.h[0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v10.8h, v1.8h, v7.h[0]\n"
+ "fmla v14.8h, v1.8h, v6.h[0]\n"
+ "fmla v18.8h, v1.8h, v5.h[0]\n"
+ "fmla v22.8h, v1.8h, v4.h[0]\n"
+ "fmla v26.8h, v1.8h, v3.h[0]\n"
+ "fmla v30.8h, v1.8h, v2.h[0]\n"
+ "fmla v11.8h, v0.8h, v7.h[0]\n"
+ "fmla v15.8h, v0.8h, v6.h[0]\n"
+ "fmla v19.8h, v0.8h, v5.h[0]\n"
+ "fmla v23.8h, v0.8h, v4.h[0]\n"
+ "fmla v27.8h, v0.8h, v3.h[0]\n"
+ "fmla v31.8h, v0.8h, v2.h[0]\n"
+ "cbnz x27, 280b\n"
+ "281:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 274b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "tbz %x[flags], #1, 282f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.8h }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v1.8h\n"
+ "fmin v9.8h, v9.8h, v1.8h\n"
+ "fmin v10.8h, v10.8h, v1.8h\n"
+ "fmin v11.8h, v11.8h, v1.8h\n"
+ "fmin v12.8h, v12.8h, v1.8h\n"
+ "fmin v13.8h, v13.8h, v1.8h\n"
+ "fmin v14.8h, v14.8h, v1.8h\n"
+ "fmin v15.8h, v15.8h, v1.8h\n"
+ "fmin v16.8h, v16.8h, v1.8h\n"
+ "fmin v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v1.8h\n"
+ "fmin v19.8h, v19.8h, v1.8h\n"
+ "fmin v20.8h, v20.8h, v1.8h\n"
+ "fmin v21.8h, v21.8h, v1.8h\n"
+ "fmin v22.8h, v22.8h, v1.8h\n"
+ "fmin v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v1.8h\n"
+ "fmin v25.8h, v25.8h, v1.8h\n"
+ "fmin v26.8h, v26.8h, v1.8h\n"
+ "fmin v27.8h, v27.8h, v1.8h\n"
+ "fmin v28.8h, v28.8h, v1.8h\n"
+ "fmin v29.8h, v29.8h, v1.8h\n"
+ "fmin v30.8h, v30.8h, v1.8h\n"
+ "fmin v31.8h, v31.8h, v1.8h\n"
+ "fmax v8.8h, v8.8h, v0.8h\n"
+ "fmax v9.8h, v9.8h, v0.8h\n"
+ "fmax v10.8h, v10.8h, v0.8h\n"
+ "fmax v11.8h, v11.8h, v0.8h\n"
+ "fmax v12.8h, v12.8h, v0.8h\n"
+ "fmax v13.8h, v13.8h, v0.8h\n"
+ "fmax v14.8h, v14.8h, v0.8h\n"
+ "fmax v15.8h, v15.8h, v0.8h\n"
+ "fmax v16.8h, v16.8h, v0.8h\n"
+ "fmax v17.8h, v17.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v0.8h\n"
+ "fmax v19.8h, v19.8h, v0.8h\n"
+ "fmax v20.8h, v20.8h, v0.8h\n"
+ "fmax v21.8h, v21.8h, v0.8h\n"
+ "fmax v22.8h, v22.8h, v0.8h\n"
+ "fmax v23.8h, v23.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v0.8h\n"
+ "fmax v25.8h, v25.8h, v0.8h\n"
+ "fmax v26.8h, v26.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v0.8h\n"
+ "fmax v28.8h, v28.8h, v0.8h\n"
+ "fmax v29.8h, v29.8h, v0.8h\n"
+ "fmax v30.8h, v30.8h, v0.8h\n"
+ "fmax v31.8h, v31.8h, v0.8h\n"
+ "282:" // Height 6: No activation
+ "cmp x14, #0x20\n"
+ "bge 299f\n"
+ "tbz x14, #4, 290f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v25.8h }, [x22], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
+ "st1 { v29.8h }, [x21], #0x10\n"
+ "tbz x14, #3, 286f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "st1 { v26.8h }, [x22], #0x10\n"
+ "st1 { v30.8h }, [x21], #0x10\n"
+ "tbz x14, #2, 284f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x14, #1, 283f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "st1 { v27.s }[2], [x22], #0x4\n"
+ "st1 { v31.s }[2], [x21], #0x4\n"
+ "tbz x14, #0, 298f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "st1 { v27.h }[6], [x22]\n"
+ "st1 { v31.h }[6], [x21]\n"
+ "b 298f\n"
+ "283:" // Height 6: Partial direct writeback: partial_1_28
+ "tbz x14, #0, 298f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "st1 { v27.h }[4], [x22]\n"
+ "st1 { v31.h }[4], [x21]\n"
+ "b 298f\n"
+ "284:" // Height 6: Partial direct writeback: partial_2_24
+ "tbz x14, #1, 285f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "str s27, [x22], #0x4\n"
+ "str s31, [x21], #0x4\n"
+ "tbz x14, #0, 298f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "st1 { v27.h }[2], [x22]\n"
+ "st1 { v31.h }[2], [x21]\n"
+ "b 298f\n"
+ "285:" // Height 6: Partial direct writeback: partial_1_24
+ "tbz x14, #0, 298f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "str h27, [x22, #0x0]\n"
+ "str h31, [x21, #0x0]\n"
+ "b 298f\n"
+ "286:" // Height 6: Partial direct writeback: partial_4_16
+ "tbz x14, #2, 288f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x14, #1, 287f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "st1 { v26.s }[2], [x22], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
+ "tbz x14, #0, 298f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "st1 { v26.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "b 298f\n"
+ "287:" // Height 6: Partial direct writeback: partial_1_20
+ "tbz x14, #0, 298f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "st1 { v26.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "b 298f\n"
+ "288:" // Height 6: Partial direct writeback: partial_2_16
+ "tbz x14, #1, 289f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "str s26, [x22], #0x4\n"
+ "str s30, [x21], #0x4\n"
+ "tbz x14, #0, 298f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "st1 { v26.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "b 298f\n"
+ "289:" // Height 6: Partial direct writeback: partial_1_16
+ "tbz x14, #0, 298f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "str h26, [x22, #0x0]\n"
+ "str h30, [x21, #0x0]\n"
+ "b 298f\n"
+ "290:" // Height 6: Partial direct writeback: partial_8_0
+ "tbz x14, #3, 294f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
+ "tbz x14, #2, 292f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x14, #1, 291f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "st1 { v25.s }[2], [x22], #0x4\n"
+ "st1 { v29.s }[2], [x21], #0x4\n"
+ "tbz x14, #0, 298f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "b 298f\n"
+ "291:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 298f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "b 298f\n"
+ "292:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 293f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "str s25, [x22], #0x4\n"
+ "str s29, [x21], #0x4\n"
+ "tbz x14, #0, 298f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "b 298f\n"
+ "293:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 298f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "str h25, [x22, #0x0]\n"
+ "str h29, [x21, #0x0]\n"
+ "b 298f\n"
+ "294:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 296f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x14, #1, 295f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x14, #0, 298f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "st1 { v24.h }[6], [x22]\n"
+ "st1 { v28.h }[6], [x21]\n"
+ "b 298f\n"
+ "295:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 298f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "st1 { v24.h }[4], [x22]\n"
+ "st1 { v28.h }[4], [x21]\n"
+ "b 298f\n"
+ "296:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 297f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x14, #0, 298f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "st1 { v24.h }[2], [x22]\n"
+ "st1 { v28.h }[2], [x21]\n"
+ "b 298f\n"
+ "297:" // Height 6: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "str h24, [x22, #0x0]\n"
+ "str h28, [x21, #0x0]\n"
+ "298:" // Height 6: Partial direct writeback: Done
+ "b 300f\n"
+ "299:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "300:" // Height 6: Writeback done
+ "subs x14, x14, #0x20\n"
+ "bgt 252b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 302f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 301f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "301:" // Update direct input
+ "mov x20, #0xc\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "302:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp
new file mode 100644
index 0000000000..94fb84e409
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const float *, \
+ size_t, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
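+
+// ARGLIST mirrors the kernel definition in generic.cpp: string count and
+// lengths for indirect input, the A argument, M and N, the B panel pointer
+// and stride, the output argument, then bias, activation and the
+// accumulate flag.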
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_ffhybrid_fp32_mla_6x16( ARGLIST );
+
+class cls_a64_ffhybrid_fp32_mla_6x16
+{
+public:
+ typedef float lhs_operand_type;
+ typedef float rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+ static unsigned int stripe_width()
+ {
+ return 4;
+ }
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL128_BL32;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 1> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 13.16 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_ffhybrid_fp32_mla_6x16;
+ cls_a64_ffhybrid_fp32_mla_6x16(const CPUInfo *)
+ {
+ }
+};
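+
+// The blocking parameters above describe a 6x16 fp32 tile built from
+// 4-wide B stripes in the VL128_BL32 fixed-format weight layout; the
+// arm_gemm framework presumably reads these when selecting and tiling
+// kernels.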
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp
new file mode 100644
index 0000000000..b1cd6dc970
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp
@@ -0,0 +1,3461 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_ffhybrid_fp32_mla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, size_t B_stride, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ const float *cur_B_ptr = {};
+ size_t B_stride = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
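+ // The inline asm below reads these fields through offsetof(KernelArgs, ...)
+ // operands, so the struct layout must stay in sync with the named offsets.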
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ ka.B_stride = B_stride;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
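+ // flags bit layout as consumed by the asm: bit 0 = accumulate into the
+ // existing output, bit 1 = apply the min/max activation clamp, bit 2 =
+ // indirect output, bit 3 = indirect (multi-string) input.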
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 171f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 137f\n"
+ "beq 103f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 69f\n"
+ "beq 35f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
+ "bgt 3f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 3f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 3f\n"
+ "mov x11, x12\n"
+ "3:" // Height 1: B setup done
+ "cbz x15, 4f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "b 15f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 14f\n"
+ "cmp x14, #0x10\n"
+ "bge 13f\n"
+ "tbz x14, #3, 8f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "tbz x14, #2, 6f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 5f\n"
+ "ldr d11, [x13], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 12f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "b 12f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 12f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "b 12f\n"
+ "6:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x14, #1, 7f\n"
+ "ldr d10, [x13], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 12f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "b 12f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 12f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "b 12f\n"
+ "8:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x14, #2, 10f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 9f\n"
+ "ldr d9, [x13], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 12f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "b 12f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 12f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "b 12f\n"
+ "10:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x14, #1, 11f\n"
+ "ldr d8, [x13], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 12f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "b 12f\n"
+ "11:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s8, [x13, #0x0]\n"
+ "mov x20, #0x0\n"
+ "12:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 15f\n"
+ "13:" // Height 1: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "b 15f\n"
+ "14:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "15:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "16:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 18f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "b 18f\n"
+ "17:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "18:" // Height 1: input setup done
+ "cmp x27, #0x4\n"
+ "blt 21f\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "ldr q7, [x11, #0x0]\n"
+ "blt 20f\n"
+ "19:" // Height 1: Multiply loop: Main loop head
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x8\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "add x26, x26, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q6, [x12, #0x0]\n"
+ "add x11, x11, #0x40\n"
+ "ldr q7, [x11, #0x0]\n"
+ "add x10, x10, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "bge 19b\n"
+ "20:" // Height 1: Multiply loop: Single iteration only
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "sub x27, x27, #0x4\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "add x26, x26, #0x10\n"
+ "add x12, x12, #0x40\n"
+ "add x11, x11, #0x40\n"
+ "add x10, x10, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "21:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 23f\n"
+ "22:" // Height 1: Multiply loop: Odd block loop
+ "ldr s18, [x26], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v8.4s, v16.4s, v18.s[0]\n"
+ "sub x27, x27, #0x1\n"
+ "ldr q17, [x11, #0x0]\n"
+ "ldr q16, [x10, #0x0]\n"
+ "fmla v9.4s, v17.4s, v18.s[0]\n"
+ "fmla v10.4s, v16.4s, v18.s[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v11.4s, v16.4s, v18.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "cbnz x27, 22b\n"
+ "23:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 16b\n"
+ "tbz %x[flags], #1, 24f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
+ "24:" // Height 1: No activation
+ "cmp x14, #0x10\n"
+ "bge 33f\n"
+ "tbz x14, #3, 28f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "tbz x14, #2, 26f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 25f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x14, #0, 32f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "b 32f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 32f\n"
+ "str s11, [x13, #0x0]\n"
+ "b 32f\n"
+ "26:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 27f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x14, #0, 32f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "b 32f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 32f\n"
+ "str s10, [x13, #0x0]\n"
+ "b 32f\n"
+ "28:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 30f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 29f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x14, #0, 32f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "b 32f\n"
+ "29:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 32f\n"
+ "str s9, [x13, #0x0]\n"
+ "b 32f\n"
+ "30:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 31f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x14, #0, 32f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "b 32f\n"
+ "31:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "32:" // Height 1: Partial direct writeback: Done
+ "b 34f\n"
+ "33:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "34:" // Height 1: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 2b\n"
+ "b 206f\n"
+ "35:" // Height 2
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "36:" // Height 2: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
+ "bgt 37f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 37f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 37f\n"
+ "mov x11, x12\n"
+ "37:" // Height 2: B setup done
+ "cbz x15, 38f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "mov v12.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "add x15, x15, #0x40\n"
+ "b 49f\n"
+ "38:" // Height 2: no bias
+ "tbz %x[flags], #0, 48f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x10\n"
+ "add x25, x13, x20, LSL #2\n"
+ "bge 47f\n"
+ "tbz x14, #3, 42f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "tbz x14, #2, 40f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "tbz x14, #1, 39f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 46f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "b 46f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 46f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "b 46f\n"
+ "40:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x14, #1, 41f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 46f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "b 46f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 46f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "b 46f\n"
+ "42:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x14, #2, 44f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "tbz x14, #1, 43f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 46f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "b 46f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 46f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "b 46f\n"
+ "44:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x14, #1, 45f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 46f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "b 46f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "46:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 49f\n"
+ "47:" // Height 2: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "b 49f\n"
+ "48:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "49:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "50:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 51f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 52f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "b 52f\n"
+ "51:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "52:" // Height 2: input setup done
+ "cmp x27, #0x4\n"
+ "blt 55f\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "blt 54f\n"
+ "53:" // Height 2: Multiply loop: Main loop head
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "sub x27, x27, #0x4\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "fmla v14.4s, v17.4s, v1.s[0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "fmla v15.4s, v16.4s, v1.s[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "fmla v12.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "fmla v13.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "fmla v14.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "fmla v15.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "fmla v12.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "fmla v13.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "fmla v14.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "fmla v15.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v1.s[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v1.s[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v14.4s, v17.4s, v1.s[3]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.4s, v16.4s, v1.s[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "bge 53b\n"
+ "54:" // Height 2: Multiply loop: Single iteration only
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "sub x27, x27, #0x4\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "fmla v14.4s, v17.4s, v1.s[0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "fmla v15.4s, v16.4s, v1.s[0]\n"
+ "ldr q16, [x11, #0x10]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "fmla v12.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "fmla v13.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "fmla v14.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x12, #0x20]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "fmla v15.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x11, #0x20]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "fmla v12.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "fmla v13.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x9, #0x20]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "fmla v14.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "fmla v15.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v1.s[3]\n"
+ "ldr q17, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v1.s[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v14.4s, v17.4s, v1.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "fmla v15.4s, v16.4s, v1.s[3]\n"
+ "55:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 57f\n"
+ "56:" // Height 2: Multiply loop: Odd block loop
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "sub x27, x27, #0x1\n"
+ "ldr q17, [x12, #0x0]\n"
+ "ldr q16, [x11, #0x0]\n"
+ "fmla v8.4s, v17.4s, v19.s[0]\n"
+ "fmla v12.4s, v17.4s, v18.s[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "fmla v9.4s, v16.4s, v19.s[0]\n"
+ "fmla v13.4s, v16.4s, v18.s[0]\n"
+ "ldr q16, [x9, #0x0]\n"
+ "fmla v10.4s, v17.4s, v19.s[0]\n"
+ "fmla v14.4s, v17.4s, v18.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "fmla v11.4s, v16.4s, v19.s[0]\n"
+ "fmla v15.4s, v16.4s, v18.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "cbnz x27, 56b\n"
+ "57:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 50b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "tbz %x[flags], #1, 58f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmin v12.4s, v12.4s, v17.4s\n"
+ "fmin v13.4s, v13.4s, v17.4s\n"
+ "fmin v14.4s, v14.4s, v17.4s\n"
+ "fmin v15.4s, v15.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
+ "fmax v12.4s, v12.4s, v16.4s\n"
+ "fmax v13.4s, v13.4s, v16.4s\n"
+ "fmax v14.4s, v14.4s, v16.4s\n"
+ "fmax v15.4s, v15.4s, v16.4s\n"
+ "58:" // Height 2: No activation
+ "cmp x14, #0x10\n"
+ "bge 67f\n"
+ "tbz x14, #3, 62f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "tbz x14, #2, 60f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "tbz x14, #1, 59f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "tbz x14, #0, 66f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "b 66f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 66f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "b 66f\n"
+ "60:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 61f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "tbz x14, #0, 66f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "b 66f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 66f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "b 66f\n"
+ "62:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 64f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "tbz x14, #1, 63f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "tbz x14, #0, 66f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "b 66f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 66f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "b 66f\n"
+ "64:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 65f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "tbz x14, #0, 66f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "b 66f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "66:" // Height 2: Partial direct writeback: Done
+ "b 68f\n"
+ "67:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "68:" // Height 2: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 36b\n"
+ "b 206f\n"
+ "69:" // Height 3
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "70:" // Height 3: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
+ "bgt 71f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 71f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 71f\n"
+ "mov x11, x12\n"
+ "71:" // Height 3: B setup done
+ "cbz x15, 72f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "mov v12.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "b 83f\n"
+ "72:" // Height 3: no bias
+ "tbz %x[flags], #0, 82f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x24, x25, x20, LSL #2\n"
+ "bge 81f\n"
+ "tbz x14, #3, 76f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "tbz x14, #2, 74f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "tbz x14, #1, 73f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d19, [x24], #0x8\n"
+ "tbz x14, #0, 80f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "b 80f\n"
+ "73:" // Height 3: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 80f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "b 80f\n"
+ "74:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x14, #1, 75f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "tbz x14, #0, 80f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "b 80f\n"
+ "75:" // Height 3: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 80f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "b 80f\n"
+ "76:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x14, #2, 78f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "tbz x14, #1, 77f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d17, [x24], #0x8\n"
+ "tbz x14, #0, 80f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "b 80f\n"
+ "77:" // Height 3: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 80f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "b 80f\n"
+ "78:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x14, #1, 79f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "tbz x14, #0, 80f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "b 80f\n"
+ "79:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s16, [x24, #0x0]\n"
+ "80:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 83f\n"
+ "81:" // Height 3: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "b 83f\n"
+ "82:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "83:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "84:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 86f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "b 86f\n"
+ "85:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "86:" // Height 3: input setup done
+ "cmp x27, #0x4\n"
+ "blt 89f\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "blt 88f\n"
+ "87:" // Height 3: Multiply loop: Main loop head
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x8\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "ldr q20, [x9, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v10.4s, v21.4s, v0.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v18.4s, v21.4s, v2.s[0]\n"
+ "ldr q21, [x12, #0x10]\n"
+ "fmla v11.4s, v20.4s, v0.s[0]\n"
+ "fmla v15.4s, v20.4s, v1.s[0]\n"
+ "fmla v19.4s, v20.4s, v2.s[0]\n"
+ "ldr q20, [x11, #0x10]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v12.4s, v21.4s, v1.s[1]\n"
+ "fmla v16.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x10, #0x10]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v17.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x9, #0x10]\n"
+ "fmla v10.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v18.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x12, #0x20]\n"
+ "fmla v11.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "fmla v19.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x11, #0x20]\n"
+ "fmla v8.4s, v21.4s, v0.s[2]\n"
+ "fmla v12.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "fmla v9.4s, v20.4s, v0.s[2]\n"
+ "fmla v13.4s, v20.4s, v1.s[2]\n"
+ "fmla v17.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x9, #0x20]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v21.4s, v1.s[2]\n"
+ "fmla v18.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x12, #0x30]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v15.4s, v20.4s, v1.s[2]\n"
+ "fmla v19.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v12.4s, v21.4s, v1.s[3]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "ldr q21, [x10, #0x30]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "add x10, x10, #0x40\n"
+ "fmla v13.4s, v20.4s, v1.s[3]\n"
+ "fmla v17.4s, v20.4s, v2.s[3]\n"
+ "ldr q20, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "fmla v10.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "fmla v18.4s, v21.4s, v2.s[3]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "fmla v11.4s, v20.4s, v0.s[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.4s, v20.4s, v2.s[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "bge 87b\n"
+ "88:" // Height 3: Multiply loop: Single iteration only
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x27, x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "ldr q20, [x9, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v10.4s, v21.4s, v0.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[0]\n"
+ "fmla v18.4s, v21.4s, v2.s[0]\n"
+ "ldr q21, [x12, #0x10]\n"
+ "fmla v11.4s, v20.4s, v0.s[0]\n"
+ "fmla v15.4s, v20.4s, v1.s[0]\n"
+ "fmla v19.4s, v20.4s, v2.s[0]\n"
+ "ldr q20, [x11, #0x10]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v12.4s, v21.4s, v1.s[1]\n"
+ "fmla v16.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x10, #0x10]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v17.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x9, #0x10]\n"
+ "fmla v10.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v18.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x12, #0x20]\n"
+ "fmla v11.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "fmla v19.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x11, #0x20]\n"
+ "fmla v8.4s, v21.4s, v0.s[2]\n"
+ "fmla v12.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "fmla v9.4s, v20.4s, v0.s[2]\n"
+ "fmla v13.4s, v20.4s, v1.s[2]\n"
+ "fmla v17.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x9, #0x20]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v21.4s, v1.s[2]\n"
+ "fmla v18.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x12, #0x30]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v15.4s, v20.4s, v1.s[2]\n"
+ "fmla v19.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v12.4s, v21.4s, v1.s[3]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "ldr q21, [x10, #0x30]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "add x10, x10, #0x40\n"
+ "fmla v13.4s, v20.4s, v1.s[3]\n"
+ "fmla v17.4s, v20.4s, v2.s[3]\n"
+ "ldr q20, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "fmla v10.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "fmla v18.4s, v21.4s, v2.s[3]\n"
+ "fmla v11.4s, v20.4s, v0.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "fmla v19.4s, v20.4s, v2.s[3]\n"
+ "89:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 91f\n"
+ "90:" // Height 3: Multiply loop: Odd block loop
+ "ldr s24, [x26], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
+ "sub x27, x27, #0x1\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr q21, [x12, #0x0]\n"
+ "fmla v8.4s, v21.4s, v24.s[0]\n"
+ "fmla v12.4s, v21.4s, v23.s[0]\n"
+ "ldr q20, [x11, #0x0]\n"
+ "fmla v16.4s, v21.4s, v22.s[0]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "fmla v9.4s, v20.4s, v24.s[0]\n"
+ "fmla v13.4s, v20.4s, v23.s[0]\n"
+ "fmla v17.4s, v20.4s, v22.s[0]\n"
+ "ldr q20, [x9, #0x0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v10.4s, v21.4s, v24.s[0]\n"
+ "fmla v14.4s, v21.4s, v23.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "fmla v18.4s, v21.4s, v22.s[0]\n"
+ "fmla v11.4s, v20.4s, v24.s[0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v15.4s, v20.4s, v23.s[0]\n"
+ "fmla v19.4s, v20.4s, v22.s[0]\n"
+ "cbnz x27, 90b\n"
+ "91:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 84b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "tbz %x[flags], #1, 92f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v21.4s\n"
+ "fmin v9.4s, v9.4s, v21.4s\n"
+ "fmin v10.4s, v10.4s, v21.4s\n"
+ "fmin v11.4s, v11.4s, v21.4s\n"
+ "fmin v12.4s, v12.4s, v21.4s\n"
+ "fmin v13.4s, v13.4s, v21.4s\n"
+ "fmin v14.4s, v14.4s, v21.4s\n"
+ "fmin v15.4s, v15.4s, v21.4s\n"
+ "fmin v16.4s, v16.4s, v21.4s\n"
+ "fmin v17.4s, v17.4s, v21.4s\n"
+ "fmin v18.4s, v18.4s, v21.4s\n"
+ "fmin v19.4s, v19.4s, v21.4s\n"
+ "fmax v8.4s, v8.4s, v20.4s\n"
+ "fmax v9.4s, v9.4s, v20.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "fmax v13.4s, v13.4s, v20.4s\n"
+ "fmax v14.4s, v14.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v20.4s\n"
+ "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v18.4s, v18.4s, v20.4s\n"
+ "fmax v19.4s, v19.4s, v20.4s\n"
+ "92:" // Height 3: No activation
+ "cmp x14, #0x10\n"
+ "bge 101f\n"
+ "tbz x14, #3, 96f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "tbz x14, #2, 94f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "tbz x14, #1, 93f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "tbz x14, #0, 100f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "b 100f\n"
+ "93:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 100f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "b 100f\n"
+ "94:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 95f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "tbz x14, #0, 100f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "b 100f\n"
+ "95:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 100f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "b 100f\n"
+ "96:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 98f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "tbz x14, #1, 97f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "tbz x14, #0, 100f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "b 100f\n"
+ "97:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 100f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "b 100f\n"
+ "98:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 99f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "tbz x14, #0, 100f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "b 100f\n"
+ "99:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "100:" // Height 3: Partial direct writeback: Done
+ "b 102f\n"
+ "101:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "102:" // Height 3: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 70b\n"
+ "b 206f\n"
+ "103:" // Height 4
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "104:" // Height 4: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
+ "bgt 105f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 105f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 105f\n"
+ "mov x11, x12\n"
+ "105:" // Height 4: B setup done
+ "cbz x15, 106f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "mov v12.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "b 117f\n"
+ "106:" // Height 4: no bias
+ "tbz %x[flags], #0, 116f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
+ "bge 115f\n"
+ "tbz x14, #3, 110f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 108f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 107f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "tbz x14, #0, 114f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "b 114f\n"
+ "107:" // Height 4: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 114f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "b 114f\n"
+ "108:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x14, #1, 109f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "tbz x14, #0, 114f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "b 114f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 114f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "b 114f\n"
+ "110:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x14, #2, 112f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 111f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "tbz x14, #0, 114f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "b 114f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 114f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "b 114f\n"
+ "112:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x14, #1, 113f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "tbz x14, #0, 114f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "b 114f\n"
+ "113:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "114:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 117f\n"
+ "115:" // Height 4: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "b 117f\n"
+ "116:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "117:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "118:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 119f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 120f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "b 120f\n"
+ "119:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "120:" // Height 4: input setup done
+ "cmp x27, #0x4\n"
+ "blt 123f\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "blt 122f\n"
+ "121:" // Height 4: Multiply loop: Main loop head
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x8\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "ldr q24, [x9, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v10.4s, v25.4s, v0.s[0]\n"
+ "fmla v14.4s, v25.4s, v1.s[0]\n"
+ "fmla v18.4s, v25.4s, v2.s[0]\n"
+ "fmla v22.4s, v25.4s, v3.s[0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ "fmla v11.4s, v24.4s, v0.s[0]\n"
+ "fmla v15.4s, v24.4s, v1.s[0]\n"
+ "fmla v19.4s, v24.4s, v2.s[0]\n"
+ "fmla v23.4s, v24.4s, v3.s[0]\n"
+ "ldr q24, [x11, #0x10]\n"
+ "fmla v8.4s, v25.4s, v0.s[1]\n"
+ "fmla v12.4s, v25.4s, v1.s[1]\n"
+ "fmla v16.4s, v25.4s, v2.s[1]\n"
+ "fmla v20.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "fmla v9.4s, v24.4s, v0.s[1]\n"
+ "fmla v13.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v21.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x9, #0x10]\n"
+ "fmla v10.4s, v25.4s, v0.s[1]\n"
+ "fmla v14.4s, v25.4s, v1.s[1]\n"
+ "fmla v18.4s, v25.4s, v2.s[1]\n"
+ "fmla v22.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x12, #0x20]\n"
+ "fmla v11.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v1.s[1]\n"
+ "fmla v19.4s, v24.4s, v2.s[1]\n"
+ "fmla v23.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x11, #0x20]\n"
+ "fmla v8.4s, v25.4s, v0.s[2]\n"
+ "fmla v12.4s, v25.4s, v1.s[2]\n"
+ "fmla v16.4s, v25.4s, v2.s[2]\n"
+ "fmla v20.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x10, #0x20]\n"
+ "fmla v9.4s, v24.4s, v0.s[2]\n"
+ "fmla v13.4s, v24.4s, v1.s[2]\n"
+ "fmla v17.4s, v24.4s, v2.s[2]\n"
+ "fmla v21.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x9, #0x20]\n"
+ "fmla v10.4s, v25.4s, v0.s[2]\n"
+ "fmla v14.4s, v25.4s, v1.s[2]\n"
+ "fmla v18.4s, v25.4s, v2.s[2]\n"
+ "fmla v22.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v11.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v1.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[2]\n"
+ "fmla v23.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "fmla v8.4s, v25.4s, v0.s[3]\n"
+ "fmla v12.4s, v25.4s, v1.s[3]\n"
+ "fmla v16.4s, v25.4s, v2.s[3]\n"
+ "fmla v20.4s, v25.4s, v3.s[3]\n"
+ "ldr q25, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ "fmla v9.4s, v24.4s, v0.s[3]\n"
+ "fmla v13.4s, v24.4s, v1.s[3]\n"
+ "fmla v17.4s, v24.4s, v2.s[3]\n"
+ "fmla v21.4s, v24.4s, v3.s[3]\n"
+ "ldr q24, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "fmla v10.4s, v25.4s, v0.s[3]\n"
+ "fmla v14.4s, v25.4s, v1.s[3]\n"
+ "fmla v18.4s, v25.4s, v2.s[3]\n"
+ "fmla v22.4s, v25.4s, v3.s[3]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "fmla v11.4s, v24.4s, v0.s[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.4s, v24.4s, v1.s[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "fmla v23.4s, v24.4s, v3.s[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "bge 121b\n"
+ "122:" // Height 4: Multiply loop: Single iteration only
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x27, x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "ldr q24, [x9, #0x0]\n"
+ "fmla v10.4s, v25.4s, v0.s[0]\n"
+ "fmla v14.4s, v25.4s, v1.s[0]\n"
+ "fmla v18.4s, v25.4s, v2.s[0]\n"
+ "fmla v22.4s, v25.4s, v3.s[0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ "fmla v11.4s, v24.4s, v0.s[0]\n"
+ "fmla v15.4s, v24.4s, v1.s[0]\n"
+ "fmla v19.4s, v24.4s, v2.s[0]\n"
+ "fmla v23.4s, v24.4s, v3.s[0]\n"
+ "ldr q24, [x11, #0x10]\n"
+ "fmla v8.4s, v25.4s, v0.s[1]\n"
+ "fmla v12.4s, v25.4s, v1.s[1]\n"
+ "fmla v16.4s, v25.4s, v2.s[1]\n"
+ "fmla v20.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "fmla v9.4s, v24.4s, v0.s[1]\n"
+ "fmla v13.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v21.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x9, #0x10]\n"
+ "fmla v10.4s, v25.4s, v0.s[1]\n"
+ "fmla v14.4s, v25.4s, v1.s[1]\n"
+ "fmla v18.4s, v25.4s, v2.s[1]\n"
+ "fmla v22.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x12, #0x20]\n"
+ "fmla v11.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v1.s[1]\n"
+ "fmla v19.4s, v24.4s, v2.s[1]\n"
+ "fmla v23.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x11, #0x20]\n"
+ "fmla v8.4s, v25.4s, v0.s[2]\n"
+ "fmla v12.4s, v25.4s, v1.s[2]\n"
+ "fmla v16.4s, v25.4s, v2.s[2]\n"
+ "fmla v20.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x10, #0x20]\n"
+ "fmla v9.4s, v24.4s, v0.s[2]\n"
+ "fmla v13.4s, v24.4s, v1.s[2]\n"
+ "fmla v17.4s, v24.4s, v2.s[2]\n"
+ "fmla v21.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x9, #0x20]\n"
+ "fmla v10.4s, v25.4s, v0.s[2]\n"
+ "fmla v14.4s, v25.4s, v1.s[2]\n"
+ "fmla v18.4s, v25.4s, v2.s[2]\n"
+ "fmla v22.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v11.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v1.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[2]\n"
+ "fmla v23.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "fmla v8.4s, v25.4s, v0.s[3]\n"
+ "fmla v12.4s, v25.4s, v1.s[3]\n"
+ "fmla v16.4s, v25.4s, v2.s[3]\n"
+ "fmla v20.4s, v25.4s, v3.s[3]\n"
+ "ldr q25, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ "fmla v9.4s, v24.4s, v0.s[3]\n"
+ "fmla v13.4s, v24.4s, v1.s[3]\n"
+ "fmla v17.4s, v24.4s, v2.s[3]\n"
+ "fmla v21.4s, v24.4s, v3.s[3]\n"
+ "ldr q24, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "fmla v10.4s, v25.4s, v0.s[3]\n"
+ "fmla v14.4s, v25.4s, v1.s[3]\n"
+ "fmla v18.4s, v25.4s, v2.s[3]\n"
+ "fmla v22.4s, v25.4s, v3.s[3]\n"
+ "fmla v11.4s, v24.4s, v0.s[3]\n"
+ "fmla v15.4s, v24.4s, v1.s[3]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "fmla v23.4s, v24.4s, v3.s[3]\n"
+ "123:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 125f\n"
+ "124:" // Height 4: Multiply loop: Odd block loop
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "sub x27, x27, #0x1\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr q25, [x12, #0x0]\n"
+ "ldr q24, [x11, #0x0]\n"
+ "fmla v8.4s, v25.4s, v29.s[0]\n"
+ "fmla v12.4s, v25.4s, v28.s[0]\n"
+ "fmla v16.4s, v25.4s, v27.s[0]\n"
+ "fmla v20.4s, v25.4s, v26.s[0]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v9.4s, v24.4s, v29.s[0]\n"
+ "fmla v13.4s, v24.4s, v28.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "fmla v17.4s, v24.4s, v27.s[0]\n"
+ "fmla v21.4s, v24.4s, v26.s[0]\n"
+ "ldr q24, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v10.4s, v25.4s, v29.s[0]\n"
+ "fmla v14.4s, v25.4s, v28.s[0]\n"
+ "fmla v18.4s, v25.4s, v27.s[0]\n"
+ "fmla v22.4s, v25.4s, v26.s[0]\n"
+ "fmla v11.4s, v24.4s, v29.s[0]\n"
+ "fmla v15.4s, v24.4s, v28.s[0]\n"
+ "fmla v19.4s, v24.4s, v27.s[0]\n"
+ "fmla v23.4s, v24.4s, v26.s[0]\n"
+ "cbnz x27, 124b\n"
+ "125:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 118b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "tbz %x[flags], #1, 126f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v25.4s\n"
+ "fmin v9.4s, v9.4s, v25.4s\n"
+ "fmin v10.4s, v10.4s, v25.4s\n"
+ "fmin v11.4s, v11.4s, v25.4s\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmin v20.4s, v20.4s, v25.4s\n"
+ "fmin v21.4s, v21.4s, v25.4s\n"
+ "fmin v22.4s, v22.4s, v25.4s\n"
+ "fmin v23.4s, v23.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v24.4s\n"
+ "fmax v9.4s, v9.4s, v24.4s\n"
+ "fmax v10.4s, v10.4s, v24.4s\n"
+ "fmax v11.4s, v11.4s, v24.4s\n"
+ "fmax v12.4s, v12.4s, v24.4s\n"
+ "fmax v13.4s, v13.4s, v24.4s\n"
+ "fmax v14.4s, v14.4s, v24.4s\n"
+ "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v24.4s\n"
+ "fmax v17.4s, v17.4s, v24.4s\n"
+ "fmax v18.4s, v18.4s, v24.4s\n"
+ "fmax v19.4s, v19.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v24.4s\n"
+ "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmax v23.4s, v23.4s, v24.4s\n"
+ "126:" // Height 4: No activation
+ "cmp x14, #0x10\n"
+ "bge 135f\n"
+ "tbz x14, #3, 130f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 128f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 127f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "tbz x14, #0, 134f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "b 134f\n"
+ "127:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 134f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "b 134f\n"
+ "128:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 129f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "tbz x14, #0, 134f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "b 134f\n"
+ "129:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 134f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "b 134f\n"
+ "130:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 132f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 131f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "tbz x14, #0, 134f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "b 134f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 134f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "b 134f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 133f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "tbz x14, #0, 134f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "b 134f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "134:" // Height 4: Partial direct writeback: Done
+ "b 136f\n"
+ "135:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "136:" // Height 4: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 104b\n"
+ "b 206f\n"
+ "137:" // Height 5
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "138:" // Height 5: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
+ "bgt 139f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 139f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 139f\n"
+ "mov x11, x12\n"
+ "139:" // Height 5: B setup done
+ "cbz x15, 140f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "mov v12.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "b 151f\n"
+ "140:" // Height 5: no bias
+ "tbz %x[flags], #0, 150f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
+ "bge 149f\n"
+ "tbz x14, #3, 144f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 142f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 141f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "tbz x14, #0, 148f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "b 148f\n"
+ "141:" // Height 5: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 148f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "b 148f\n"
+ "142:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x14, #1, 143f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "tbz x14, #0, 148f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "b 148f\n"
+ "143:" // Height 5: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 148f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "b 148f\n"
+ "144:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x14, #2, 146f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 145f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "tbz x14, #0, 148f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "b 148f\n"
+ "145:" // Height 5: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 148f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "b 148f\n"
+ "146:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x14, #1, 147f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "tbz x14, #0, 148f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "b 148f\n"
+ "147:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "148:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 151f\n"
+ "149:" // Height 5: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "b 151f\n"
+ "150:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "151:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "152:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 153f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 154f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "b 154f\n"
+ "153:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "154:" // Height 5: input setup done
+ "cmp x27, #0x4\n"
+ "blt 157f\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "blt 156f\n"
+ "155:" // Height 5: Multiply loop: Main loop head
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x8\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "ldr q28, [x9, #0x0]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v18.4s, v29.4s, v2.s[0]\n"
+ "fmla v22.4s, v29.4s, v3.s[0]\n"
+ "fmla v26.4s, v29.4s, v4.s[0]\n"
+ "ldr q29, [x12, #0x10]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v2.s[0]\n"
+ "fmla v23.4s, v28.4s, v3.s[0]\n"
+ "fmla v27.4s, v28.4s, v4.s[0]\n"
+ "ldr q28, [x11, #0x10]\n"
+ "fmla v8.4s, v29.4s, v0.s[1]\n"
+ "fmla v12.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[1]\n"
+ "fmla v20.4s, v29.4s, v3.s[1]\n"
+ "fmla v24.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "fmla v9.4s, v28.4s, v0.s[1]\n"
+ "fmla v13.4s, v28.4s, v1.s[1]\n"
+ "fmla v17.4s, v28.4s, v2.s[1]\n"
+ "fmla v21.4s, v28.4s, v3.s[1]\n"
+ "fmla v25.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x9, #0x10]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v14.4s, v29.4s, v1.s[1]\n"
+ "fmla v18.4s, v29.4s, v2.s[1]\n"
+ "fmla v22.4s, v29.4s, v3.s[1]\n"
+ "fmla v26.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x12, #0x20]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[1]\n"
+ "fmla v19.4s, v28.4s, v2.s[1]\n"
+ "fmla v23.4s, v28.4s, v3.s[1]\n"
+ "fmla v27.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x11, #0x20]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v12.4s, v29.4s, v1.s[2]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v20.4s, v29.4s, v3.s[2]\n"
+ "fmla v24.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v13.4s, v28.4s, v1.s[2]\n"
+ "fmla v17.4s, v28.4s, v2.s[2]\n"
+ "fmla v21.4s, v28.4s, v3.s[2]\n"
+ "fmla v25.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x9, #0x20]\n"
+ "fmla v10.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v18.4s, v29.4s, v2.s[2]\n"
+ "fmla v22.4s, v29.4s, v3.s[2]\n"
+ "fmla v26.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x12, #0x30]\n"
+ "fmla v11.4s, v28.4s, v0.s[2]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v19.4s, v28.4s, v2.s[2]\n"
+ "fmla v23.4s, v28.4s, v3.s[2]\n"
+ "fmla v27.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "fmla v8.4s, v29.4s, v0.s[3]\n"
+ "fmla v12.4s, v29.4s, v1.s[3]\n"
+ "fmla v16.4s, v29.4s, v2.s[3]\n"
+ "fmla v20.4s, v29.4s, v3.s[3]\n"
+ "fmla v24.4s, v29.4s, v4.s[3]\n"
+ "ldr q29, [x10, #0x30]\n"
+ "fmla v9.4s, v28.4s, v0.s[3]\n"
+ "add x10, x10, #0x40\n"
+ "fmla v13.4s, v28.4s, v1.s[3]\n"
+ "fmla v17.4s, v28.4s, v2.s[3]\n"
+ "fmla v21.4s, v28.4s, v3.s[3]\n"
+ "fmla v25.4s, v28.4s, v4.s[3]\n"
+ "ldr q28, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[3]\n"
+ "fmla v18.4s, v29.4s, v2.s[3]\n"
+ "fmla v22.4s, v29.4s, v3.s[3]\n"
+ "fmla v26.4s, v29.4s, v4.s[3]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.4s, v28.4s, v1.s[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.4s, v28.4s, v2.s[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "fmla v23.4s, v28.4s, v3.s[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "fmla v27.4s, v28.4s, v4.s[3]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "bge 155b\n"
+ "156:" // Height 5: Multiply loop: Single iteration only
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x27, x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "ldr q28, [x9, #0x0]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v18.4s, v29.4s, v2.s[0]\n"
+ "fmla v22.4s, v29.4s, v3.s[0]\n"
+ "fmla v26.4s, v29.4s, v4.s[0]\n"
+ "ldr q29, [x12, #0x10]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v2.s[0]\n"
+ "fmla v23.4s, v28.4s, v3.s[0]\n"
+ "fmla v27.4s, v28.4s, v4.s[0]\n"
+ "ldr q28, [x11, #0x10]\n"
+ "fmla v8.4s, v29.4s, v0.s[1]\n"
+ "fmla v12.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[1]\n"
+ "fmla v20.4s, v29.4s, v3.s[1]\n"
+ "fmla v24.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "fmla v9.4s, v28.4s, v0.s[1]\n"
+ "fmla v13.4s, v28.4s, v1.s[1]\n"
+ "fmla v17.4s, v28.4s, v2.s[1]\n"
+ "fmla v21.4s, v28.4s, v3.s[1]\n"
+ "fmla v25.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x9, #0x10]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v14.4s, v29.4s, v1.s[1]\n"
+ "fmla v18.4s, v29.4s, v2.s[1]\n"
+ "fmla v22.4s, v29.4s, v3.s[1]\n"
+ "fmla v26.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x12, #0x20]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[1]\n"
+ "fmla v19.4s, v28.4s, v2.s[1]\n"
+ "fmla v23.4s, v28.4s, v3.s[1]\n"
+ "fmla v27.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x11, #0x20]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v12.4s, v29.4s, v1.s[2]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v20.4s, v29.4s, v3.s[2]\n"
+ "fmla v24.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v13.4s, v28.4s, v1.s[2]\n"
+ "fmla v17.4s, v28.4s, v2.s[2]\n"
+ "fmla v21.4s, v28.4s, v3.s[2]\n"
+ "fmla v25.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x9, #0x20]\n"
+ "fmla v10.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v18.4s, v29.4s, v2.s[2]\n"
+ "fmla v22.4s, v29.4s, v3.s[2]\n"
+ "fmla v26.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x12, #0x30]\n"
+ "fmla v11.4s, v28.4s, v0.s[2]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v19.4s, v28.4s, v2.s[2]\n"
+ "fmla v23.4s, v28.4s, v3.s[2]\n"
+ "fmla v27.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "fmla v8.4s, v29.4s, v0.s[3]\n"
+ "fmla v12.4s, v29.4s, v1.s[3]\n"
+ "fmla v16.4s, v29.4s, v2.s[3]\n"
+ "fmla v20.4s, v29.4s, v3.s[3]\n"
+ "fmla v24.4s, v29.4s, v4.s[3]\n"
+ "ldr q29, [x10, #0x30]\n"
+ "fmla v9.4s, v28.4s, v0.s[3]\n"
+ "add x10, x10, #0x40\n"
+ "fmla v13.4s, v28.4s, v1.s[3]\n"
+ "fmla v17.4s, v28.4s, v2.s[3]\n"
+ "fmla v21.4s, v28.4s, v3.s[3]\n"
+ "fmla v25.4s, v28.4s, v4.s[3]\n"
+ "ldr q28, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[3]\n"
+ "fmla v18.4s, v29.4s, v2.s[3]\n"
+ "fmla v22.4s, v29.4s, v3.s[3]\n"
+ "fmla v26.4s, v29.4s, v4.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "fmla v15.4s, v28.4s, v1.s[3]\n"
+ "fmla v19.4s, v28.4s, v2.s[3]\n"
+ "fmla v23.4s, v28.4s, v3.s[3]\n"
+ "fmla v27.4s, v28.4s, v4.s[3]\n"
+ "157:" // Height 5: Multiply loop: Main loop skip
+ "cbz x27, 159f\n"
+ "158:" // Height 5: Multiply loop: Odd block loop
+ "ldr s2, [x26], #0x4\n"
+ "ldr s1, [x25], #0x4\n"
+ "sub x27, x27, #0x1\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q29, [x12, #0x0]\n"
+ "fmla v8.4s, v29.4s, v2.s[0]\n"
+ "fmla v12.4s, v29.4s, v1.s[0]\n"
+ "ldr q28, [x11, #0x0]\n"
+ "fmla v16.4s, v29.4s, v0.s[0]\n"
+ "fmla v20.4s, v29.4s, v31.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v24.4s, v29.4s, v30.s[0]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "fmla v9.4s, v28.4s, v2.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v17.4s, v28.4s, v0.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v21.4s, v28.4s, v31.s[0]\n"
+ "fmla v25.4s, v28.4s, v30.s[0]\n"
+ "ldr q28, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v10.4s, v29.4s, v2.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v18.4s, v29.4s, v0.s[0]\n"
+ "fmla v22.4s, v29.4s, v31.s[0]\n"
+ "fmla v26.4s, v29.4s, v30.s[0]\n"
+ "fmla v11.4s, v28.4s, v2.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v0.s[0]\n"
+ "fmla v23.4s, v28.4s, v31.s[0]\n"
+ "fmla v27.4s, v28.4s, v30.s[0]\n"
+ "cbnz x27, 158b\n"
+ "159:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 152b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "tbz %x[flags], #1, 160f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v29.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v29.4s\n"
+ "fmin v9.4s, v9.4s, v29.4s\n"
+ "fmin v10.4s, v10.4s, v29.4s\n"
+ "fmin v11.4s, v11.4s, v29.4s\n"
+ "fmin v12.4s, v12.4s, v29.4s\n"
+ "fmin v13.4s, v13.4s, v29.4s\n"
+ "fmin v14.4s, v14.4s, v29.4s\n"
+ "fmin v15.4s, v15.4s, v29.4s\n"
+ "fmin v16.4s, v16.4s, v29.4s\n"
+ "fmin v17.4s, v17.4s, v29.4s\n"
+ "fmin v18.4s, v18.4s, v29.4s\n"
+ "fmin v19.4s, v19.4s, v29.4s\n"
+ "fmin v20.4s, v20.4s, v29.4s\n"
+ "fmin v21.4s, v21.4s, v29.4s\n"
+ "fmin v22.4s, v22.4s, v29.4s\n"
+ "fmin v23.4s, v23.4s, v29.4s\n"
+ "fmin v24.4s, v24.4s, v29.4s\n"
+ "fmin v25.4s, v25.4s, v29.4s\n"
+ "fmin v26.4s, v26.4s, v29.4s\n"
+ "fmin v27.4s, v27.4s, v29.4s\n"
+ "fmax v8.4s, v8.4s, v28.4s\n"
+ "fmax v9.4s, v9.4s, v28.4s\n"
+ "fmax v10.4s, v10.4s, v28.4s\n"
+ "fmax v11.4s, v11.4s, v28.4s\n"
+ "fmax v12.4s, v12.4s, v28.4s\n"
+ "fmax v13.4s, v13.4s, v28.4s\n"
+ "fmax v14.4s, v14.4s, v28.4s\n"
+ "fmax v15.4s, v15.4s, v28.4s\n"
+ "fmax v16.4s, v16.4s, v28.4s\n"
+ "fmax v17.4s, v17.4s, v28.4s\n"
+ "fmax v18.4s, v18.4s, v28.4s\n"
+ "fmax v19.4s, v19.4s, v28.4s\n"
+ "fmax v20.4s, v20.4s, v28.4s\n"
+ "fmax v21.4s, v21.4s, v28.4s\n"
+ "fmax v22.4s, v22.4s, v28.4s\n"
+ "fmax v23.4s, v23.4s, v28.4s\n"
+ "fmax v24.4s, v24.4s, v28.4s\n"
+ "fmax v25.4s, v25.4s, v28.4s\n"
+ "fmax v26.4s, v26.4s, v28.4s\n"
+ "fmax v27.4s, v27.4s, v28.4s\n"
+ "160:" // Height 5: No activation
+ "cmp x14, #0x10\n"
+ "bge 169f\n"
+ "tbz x14, #3, 164f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 162f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 161f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "tbz x14, #0, 168f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "b 168f\n"
+ "161:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 168f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "b 168f\n"
+ "162:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 163f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "tbz x14, #0, 168f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "b 168f\n"
+ "163:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 168f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "b 168f\n"
+ "164:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 166f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 165f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "tbz x14, #0, 168f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "b 168f\n"
+ "165:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 168f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "b 168f\n"
+ "166:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 167f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x14, #0, 168f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "b 168f\n"
+ "167:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "168:" // Height 5: Partial direct writeback: Done
+ "b 170f\n"
+ "169:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "170:" // Height 5: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 138b\n"
+ "b 206f\n"
+ "171:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x18\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
+ "172:" // Height 6: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0xc\n"
+ "bgt 173f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 173f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 173f\n"
+ "mov x11, x12\n"
+ "173:" // Height 6: B setup done
+ "cbz x15, 174f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "mov v12.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x15, x15, #0x40\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v28.16b, v8.16b\n"
+ "mov v29.16b, v9.16b\n"
+ "mov v30.16b, v10.16b\n"
+ "mov v31.16b, v11.16b\n"
+ "b 185f\n"
+ "174:" // Height 6: no bias
+ "tbz %x[flags], #0, 184f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x14, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
+ "bge 183f\n"
+ "tbz x14, #3, 178f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x14, #2, 176f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 175f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x14, #0, 182f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 182f\n"
+ "175:" // Height 6: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 182f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 182f\n"
+ "176:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x14, #1, 177f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x14, #0, 182f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 182f\n"
+ "177:" // Height 6: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 182f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 182f\n"
+ "178:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x14, #2, 180f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 179f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x14, #0, 182f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 182f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 182f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 182f\n"
+ "180:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x14, #1, 181f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x14, #0, 182f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 182f\n"
+ "181:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "182:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 185f\n"
+ "183:" // Height 6: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 185f\n"
+ "184:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "185:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "186:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 187f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 188f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
+ "b 188f\n"
+ "187:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
+ "188:" // Height 6: input setup done
+ "cmp x27, #0x4\n"
+ "blt 191f\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "blt 190f\n"
+ "189:" // Height 6: Multiply loop: Main loop head
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x8\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "fmla v29.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "fmla v30.4s, v6.4s, v5.s[0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "fmla v28.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "fmla v29.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "fmla v30.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "fmla v31.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x11, #0x20]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "fmla v28.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "fmla v29.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x9, #0x20]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "fmla v30.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "fmla v31.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "fmla v28.4s, v6.4s, v5.s[3]\n"
+ "ldr q6, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "fmla v29.4s, v7.4s, v5.s[3]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v30.4s, v6.4s, v5.s[3]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "fmla v31.4s, v7.4s, v5.s[3]\n"
+ "ldr q5, [x21, #0x0]\n"
+ "ldr q7, [x11, #0x0]\n"
+ "bge 189b\n"
+ "190:" // Height 6: Multiply loop: Single iteration only
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x27, x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "fmla v29.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "fmla v30.4s, v6.4s, v5.s[0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "fmla v28.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "fmla v29.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "fmla v30.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "fmla v31.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x11, #0x20]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "fmla v28.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "fmla v29.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x9, #0x20]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "fmla v30.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "fmla v31.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "fmla v28.4s, v6.4s, v5.s[3]\n"
+ "ldr q6, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "fmla v29.4s, v7.4s, v5.s[3]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v30.4s, v6.4s, v5.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v31.4s, v7.4s, v5.s[3]\n"
+ "191:" // Height 6: Multiply loop: Main loop skip
+ "cbz x27, 193f\n"
+ "192:" // Height 6: Multiply loop: Odd block loop
+ "ldr s7, [x26], #0x4\n"
+ "ldr s6, [x25], #0x4\n"
+ "sub x27, x27, #0x1\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q0, [x11, #0x0]\n"
+ "fmla v8.4s, v1.4s, v7.s[0]\n"
+ "fmla v12.4s, v1.4s, v6.s[0]\n"
+ "fmla v16.4s, v1.4s, v5.s[0]\n"
+ "fmla v20.4s, v1.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "fmla v24.4s, v1.4s, v3.s[0]\n"
+ "fmla v28.4s, v1.4s, v2.s[0]\n"
+ "ldr q1, [x10, #0x0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v9.4s, v0.4s, v7.s[0]\n"
+ "fmla v13.4s, v0.4s, v6.s[0]\n"
+ "fmla v17.4s, v0.4s, v5.s[0]\n"
+ "fmla v21.4s, v0.4s, v4.s[0]\n"
+ "fmla v25.4s, v0.4s, v3.s[0]\n"
+ "fmla v29.4s, v0.4s, v2.s[0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v10.4s, v1.4s, v7.s[0]\n"
+ "fmla v14.4s, v1.4s, v6.s[0]\n"
+ "fmla v18.4s, v1.4s, v5.s[0]\n"
+ "fmla v22.4s, v1.4s, v4.s[0]\n"
+ "fmla v26.4s, v1.4s, v3.s[0]\n"
+ "fmla v30.4s, v1.4s, v2.s[0]\n"
+ "fmla v11.4s, v0.4s, v7.s[0]\n"
+ "fmla v15.4s, v0.4s, v6.s[0]\n"
+ "fmla v19.4s, v0.4s, v5.s[0]\n"
+ "fmla v23.4s, v0.4s, v4.s[0]\n"
+ "fmla v27.4s, v0.4s, v3.s[0]\n"
+ "fmla v31.4s, v0.4s, v2.s[0]\n"
+ "cbnz x27, 192b\n"
+ "193:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 186b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "tbz %x[flags], #1, 194f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "fmin v31.4s, v31.4s, v1.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v0.4s\n"
+ "fmax v29.4s, v29.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v0.4s\n"
+ "fmax v31.4s, v31.4s, v0.4s\n"
+ "194:" // Height 6: No activation
+ "cmp x14, #0x10\n"
+ "bge 203f\n"
+ "tbz x14, #3, 198f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "tbz x14, #2, 196f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 195f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 202f\n"
+ "195:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 202f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "b 202f\n"
+ "196:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 197f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 202f\n"
+ "197:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 202f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "b 202f\n"
+ "198:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 200f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 199f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "b 202f\n"
+ "199:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 202f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "b 202f\n"
+ "200:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 201f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "b 202f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "202:" // Height 6: Partial direct writeback: Done
+ "b 204f\n"
+ "203:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "204:" // Height 6: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 172b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 206f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 205f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "205:" // Update direct input
+ "mov x20, #0x18\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "206:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
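(Note on the epilogues above: every "Partial direct writeback" block follows one pattern — the leftover column count in x14 is decomposed bit by bit with tbz, so each power-of-two chunk is written with the widest store available. A minimal C sketch of that pattern, assuming a hypothetical store_tail helper over row-major accumulators; illustrative only, not part of the patch:

#include <string.h>

/* Store n_left (< 16) trailing floats of one output row, mirroring the
 * tbz-driven writeback above: 8-wide, then 4-wide, then 2-wide, then 1. */
static void store_tail(float *dst, const float *acc, unsigned int n_left)
{
    if (n_left & 8) { memcpy(dst, acc, 8 * sizeof(float)); dst += 8; acc += 8; } /* two st1 {v}.4s */
    if (n_left & 4) { memcpy(dst, acc, 4 * sizeof(float)); dst += 4; acc += 4; } /* one st1 {v}.4s */
    if (n_left & 2) { memcpy(dst, acc, 2 * sizeof(float)); dst += 2; acc += 2; } /* one str d      */
    if (n_left & 1) { *dst = *acc; }                                             /* one str s      */
}
)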
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp
new file mode 100644
index 0000000000..ac3cbf943f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ size_t, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_ffhybrid_fp32bf16fp32_mmla_4x24( ARGLIST );
+
+class cls_a64_ffhybrid_fp32bf16fp32_mmla_4x24
+{
+public:
+ typedef float lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+ static unsigned int stripe_width()
+ {
+ return 4;
+ }
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL256_BL64_BF16;
+ }
+
+ static unsigned int out_width()
+ {
+ return 24;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 24, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::V1:
+ return { 23.64 };
+ default:
+ return { 16.89 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_ffhybrid_fp32bf16fp32_mmla_4x24;
+ cls_a64_ffhybrid_fp32bf16fp32_mmla_4x24(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
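(The blocking parameters declared above fix a 4x24 fp32 output tile fed by bf16 B panels, with K consumed four fp32 values at a time. A small sketch of the loop bounds they imply; round_up is a hypothetical helper, not part of the patch:

constexpr unsigned int round_up(unsigned int v, unsigned int m)
{
    return ((v + m - 1) / m) * m;
}

// For an M x N x K problem handled by this kernel:
//   row blocks    = round_up(M, 4)  / 4;   // out_height() == 4
//   column blocks = round_up(N, 24) / 24;  // out_width()  == 24, in 4-wide stripes
//   padded K      = round_up(K, 4);        // k_unroll()   == 4
)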
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp
new file mode 100644
index 0000000000..8961e615d7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp
@@ -0,0 +1,2561 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, size_t B_stride, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ const bfloat16 *cur_B_ptr = {};
+ size_t B_stride = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ ka.B_stride = B_stride;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
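+    // Flag bits carried in %x[flags] and tested by the assembly with tbz:
+    //   bit 0 (0x1): accumulate into the existing output
+    //   bit 1 (0x2): apply the min/max activation clamp from KernelArgs
+    //   bit 2 (0x4): set when the output argument is indirect
+    //   bit 3 (0x8): set when the input is indirect (row pointers per string)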
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 133f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 89f\n"
+ "beq 45f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x28, x9, x20, LSL #1\n"
+ "add x27, x28, x20, LSL #1\n"
+ "add x20, x27, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x14\n"
+ "bgt 3f\n"
+ "cmp x14, #0x10\n"
+ "mov x27, x12\n"
+ "bgt 3f\n"
+ "cmp x14, #0xc\n"
+ "mov x28, x12\n"
+ "bgt 3f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 3f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 3f\n"
+ "mov x11, x12\n"
+ "3:" // Height 1: B setup done
+ "cbz x15, 4f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "zip2 v14.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "zip2 v15.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x15, #0x40]\n"
+ "ldr q13, [x15, #0x50]\n"
+ "zip2 v16.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v17.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x15, x15, #0x60\n"
+ "zip2 v18.2d, v12.2d, v12.2d\n"
+ "zip1 v12.2d, v12.2d, v12.2d\n"
+ "zip2 v19.2d, v13.2d, v13.2d\n"
+ "zip1 v13.2d, v13.2d, v13.2d\n"
+ "b 20f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 19f\n"
+ "cmp x14, #0x18\n"
+ "bge 17f\n"
+ "tbz x14, #4, 8f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x13], #0x10\n"
+ "tbz x14, #2, 6f\n"
+ "ld1 { v13.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 5f\n"
+ "ldr d20, [x13], #0x8\n"
+ "mov x20, #0x58\n"
+ "tbz x14, #0, 16f\n"
+ "ld1 { v20.s }[2], [x13]\n"
+ "b 16f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x14, #0, 16f\n"
+ "ldr s20, [x13, #0x0]\n"
+ "b 16f\n"
+ "6:" // Height 1: Partial accumulate: partial_2_16
+ "tbz x14, #1, 7f\n"
+ "ldr d13, [x13], #0x8\n"
+ "mov x20, #0x48\n"
+ "tbz x14, #0, 16f\n"
+ "ld1 { v13.s }[2], [x13]\n"
+ "b 16f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x14, #0, 16f\n"
+ "ldr s13, [x13, #0x0]\n"
+ "b 16f\n"
+ "8:" // Height 1: Partial accumulate: partial_8_0
+ "tbz x14, #3, 12f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "tbz x14, #2, 10f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 9f\n"
+ "ldr d12, [x13], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 16f\n"
+ "ld1 { v12.s }[2], [x13]\n"
+ "b 16f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 16f\n"
+ "ldr s12, [x13, #0x0]\n"
+ "b 16f\n"
+ "10:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x14, #1, 11f\n"
+ "ldr d11, [x13], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 16f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "b 16f\n"
+ "11:" // Height 1: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 16f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "b 16f\n"
+ "12:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x14, #2, 14f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 13f\n"
+ "ldr d10, [x13], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 16f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "b 16f\n"
+ "13:" // Height 1: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 16f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "b 16f\n"
+ "14:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x14, #1, 15f\n"
+ "ldr d9, [x13], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 16f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "b 16f\n"
+ "15:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "mov x20, #0x0\n"
+ "16:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 18f\n"
+ "17:" // Height 1: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q12, [x13, #0x30]\n"
+ "ldr q13, [x13, #0x40]\n"
+ "ldr q20, [x13, #0x50]\n"
+ "18:" // Height 1: MMLA fixup
+ "zip1 v8.2d, v9.2d, v14.2d\n"
+ "zip2 v14.2d, v9.2d, v14.2d\n"
+ "zip1 v9.2d, v10.2d, v15.2d\n"
+ "zip2 v15.2d, v10.2d, v15.2d\n"
+ "zip1 v10.2d, v11.2d, v16.2d\n"
+ "zip2 v16.2d, v11.2d, v16.2d\n"
+ "zip1 v11.2d, v12.2d, v17.2d\n"
+ "zip2 v17.2d, v12.2d, v17.2d\n"
+ "zip1 v12.2d, v13.2d, v18.2d\n"
+ "zip2 v18.2d, v13.2d, v18.2d\n"
+ "zip1 v13.2d, v20.2d, v19.2d\n"
+ "zip2 v19.2d, v20.2d, v19.2d\n"
+ "b 20f\n"
+ "19:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "20:" // Height 1: setup done
+ "mov x26, #0x0\n"
+ "21:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 22f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "cbnz x26, 23f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "b 23f\n"
+ "22:" // Height 1: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "23:" // Height 1: input setup done
+ "cmp x25, #0x4\n"
+ "blt 26f\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ "ldr q4, [x12, #0x0]\n"
+ "cmp x25, #0x8\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x11, #0x0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ "blt 25f\n"
+ "24:" // Height 1: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q24, [x10, #0x0]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q23, [x10, #0x10]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q22, [x9, #0x0]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ldr q21, [x9, #0x10]\n"
+ ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
+ "ldr q24, [x28, #0x0]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x10]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ "ldr q22, [x27, #0x0]\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x27, #0x10]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x8\n"
+ ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
+ "add x12, x12, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ "ldr q5, [x12, #0x10]\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ "ldr q7, [x11, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
+ "bge 24b\n"
+ "25:" // Height 1: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q22, [x10, #0x0]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q21, [x9, #0x0]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ldr q24, [x9, #0x10]\n"
+ ".inst 0x6e56ec0a // bfmmla v10.4s, v0.8h, v22.8h\n"
+ "ldr q23, [x28, #0x0]\n"
+ ".inst 0x6e59ec10 // bfmmla v16.4s, v0.8h, v25.8h\n"
+ "ldr q22, [x28, #0x10]\n"
+ ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x27, #0x0]\n"
+ ".inst 0x6e58ec11 // bfmmla v17.4s, v0.8h, v24.8h\n"
+ "ldr q3, [x27, #0x10]\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x6e57ec0c // bfmmla v12.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e56ec12 // bfmmla v18.4s, v0.8h, v22.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e55ec0d // bfmmla v13.4s, v0.8h, v21.8h\n"
+ ".inst 0x6e43ec13 // bfmmla v19.4s, v0.8h, v3.8h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
+ "26:" // Height 1: Multiply loop: Main loop skip
+ "cbz x25, 29f\n"
+ "cbz x25, 29f\n"
+ "tbz x25, #1, 27f\n"
+ "ldr d0, [x24], #0x8\n"
+ "tbz x25, #0, 28f\n"
+ "ld1 { v0.s }[2], [x24]\n"
+ "b 28f\n"
+ "27:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x24, #0x0]\n"
+ "28:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q21, [x12, #0x0]\n"
+ "ldr q30, [x12, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x6e55ec08 // bfmmla v8.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x11, #0x0]\n"
+ "ldr q22, [x11, #0x10]\n"
+ ".inst 0x6e5eec0e // bfmmla v14.4s, v0.8h, v30.8h\n"
+ ".inst 0x6e55ec09 // bfmmla v9.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x10, #0x0]\n"
+ "ldr q23, [x10, #0x10]\n"
+ ".inst 0x6e56ec0f // bfmmla v15.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0a // bfmmla v10.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x9, #0x0]\n"
+ "ldr q22, [x9, #0x10]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q23, [x28, #0x10]\n"
+ ".inst 0x6e56ec11 // bfmmla v17.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0c // bfmmla v12.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x27, #0x0]\n"
+ "ldr q21, [x27, #0x10]\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
+ "29:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 21b\n"
+ "uzp1 v8.2d, v8.2d, v14.2d\n"
+ "uzp1 v9.2d, v9.2d, v15.2d\n"
+ "uzp1 v10.2d, v10.2d, v16.2d\n"
+ "uzp1 v11.2d, v11.2d, v17.2d\n"
+ "uzp1 v12.2d, v12.2d, v18.2d\n"
+ "uzp1 v13.2d, v13.2d, v19.2d\n"
+ "tbz %x[flags], #1, 30f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v22.4s\n"
+ "fmin v9.4s, v9.4s, v22.4s\n"
+ "fmin v10.4s, v10.4s, v22.4s\n"
+ "fmin v11.4s, v11.4s, v22.4s\n"
+ "fmin v12.4s, v12.4s, v22.4s\n"
+ "fmin v13.4s, v13.4s, v22.4s\n"
+ "fmax v8.4s, v8.4s, v21.4s\n"
+ "fmax v9.4s, v9.4s, v21.4s\n"
+ "fmax v10.4s, v10.4s, v21.4s\n"
+ "fmax v11.4s, v11.4s, v21.4s\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
+ "30:" // Height 1: No activation
+ "cmp x14, #0x18\n"
+ "bge 43f\n"
+ "tbz x14, #4, 34f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v11.4s }, [x13], #0x10\n"
+ "tbz x14, #2, 32f\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 31f\n"
+ "str d13, [x13], #0x8\n"
+ "tbz x14, #0, 42f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "b 42f\n"
+ "31:" // Height 1: Partial direct writeback: partial_1_20
+ "tbz x14, #0, 42f\n"
+ "str s13, [x13, #0x0]\n"
+ "b 42f\n"
+ "32:" // Height 1: Partial direct writeback: partial_2_16
+ "tbz x14, #1, 33f\n"
+ "str d12, [x13], #0x8\n"
+ "tbz x14, #0, 42f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "b 42f\n"
+ "33:" // Height 1: Partial direct writeback: partial_1_16
+ "tbz x14, #0, 42f\n"
+ "str s12, [x13, #0x0]\n"
+ "b 42f\n"
+ "34:" // Height 1: Partial direct writeback: partial_8_0
+ "tbz x14, #3, 38f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "tbz x14, #2, 36f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 35f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x14, #0, 42f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "b 42f\n"
+ "35:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 42f\n"
+ "str s11, [x13, #0x0]\n"
+ "b 42f\n"
+ "36:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 37f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x14, #0, 42f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "b 42f\n"
+ "37:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 42f\n"
+ "str s10, [x13, #0x0]\n"
+ "b 42f\n"
+ "38:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 40f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 39f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x14, #0, 42f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "b 42f\n"
+ "39:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 42f\n"
+ "str s9, [x13, #0x0]\n"
+ "b 42f\n"
+ "40:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 41f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x14, #0, 42f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "b 42f\n"
+ "41:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "42:" // Height 1: Partial direct writeback: Done
+ "b 44f\n"
+ "43:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x13, #0x40]\n"
+ "str q13, [x13, #0x50]\n"
+ "add x13, x13, #0x60\n"
+ "44:" // Height 1: Writeback done
+ "subs x14, x14, #0x18\n"
+ "bgt 2b\n"
+ "b 178f\n"
+ "45:" // Height 2
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "46:" // Height 2: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x28, x9, x20, LSL #1\n"
+ "add x27, x28, x20, LSL #1\n"
+ "add x20, x27, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x14\n"
+ "bgt 47f\n"
+ "cmp x14, #0x10\n"
+ "mov x27, x12\n"
+ "bgt 47f\n"
+ "cmp x14, #0xc\n"
+ "mov x28, x12\n"
+ "bgt 47f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 47f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 47f\n"
+ "mov x11, x12\n"
+ "47:" // Height 2: B setup done
+ "cbz x15, 48f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "zip2 v14.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "zip2 v15.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x15, #0x40]\n"
+ "ldr q13, [x15, #0x50]\n"
+ "zip2 v16.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v17.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x15, x15, #0x60\n"
+ "zip2 v18.2d, v12.2d, v12.2d\n"
+ "zip1 v12.2d, v12.2d, v12.2d\n"
+ "zip2 v19.2d, v13.2d, v13.2d\n"
+ "zip1 v13.2d, v13.2d, v13.2d\n"
+ "b 64f\n"
+ "48:" // Height 2: no bias
+ "tbz %x[flags], #0, 63f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x18\n"
+ "add x23, x13, x20, LSL #2\n"
+ "bge 61f\n"
+ "tbz x14, #4, 52f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x13], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 50f\n"
+ "ld1 { v13.4s }, [x13], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 49f\n"
+ "ldr d20, [x13], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "mov x20, #0x58\n"
+ "tbz x14, #0, 60f\n"
+ "ld1 { v20.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "b 60f\n"
+ "49:" // Height 2: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x14, #0, 60f\n"
+ "ldr s20, [x13, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "b 60f\n"
+ "50:" // Height 2: Partial accumulate: partial_2_16
+ "tbz x14, #1, 51f\n"
+ "ldr d13, [x13], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "mov x20, #0x48\n"
+ "tbz x14, #0, 60f\n"
+ "ld1 { v13.s }[2], [x13]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "b 60f\n"
+ "51:" // Height 2: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x14, #0, 60f\n"
+ "ldr s13, [x13, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "b 60f\n"
+ "52:" // Height 2: Partial accumulate: partial_8_0
+ "tbz x14, #3, 56f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 54f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 53f\n"
+ "ldr d12, [x13], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 60f\n"
+ "ld1 { v12.s }[2], [x13]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "b 60f\n"
+ "53:" // Height 2: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 60f\n"
+ "ldr s12, [x13, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "b 60f\n"
+ "54:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x14, #1, 55f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 60f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "b 60f\n"
+ "55:" // Height 2: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 60f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "b 60f\n"
+ "56:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x14, #2, 58f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 57f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 60f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "b 60f\n"
+ "57:" // Height 2: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 60f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "b 60f\n"
+ "58:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x14, #1, 59f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 60f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "b 60f\n"
+ "59:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "mov x20, #0x0\n"
+ "60:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 62f\n"
+ "61:" // Height 2: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q12, [x13, #0x30]\n"
+ "ldr q13, [x13, #0x40]\n"
+ "ldr q20, [x13, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "62:" // Height 2: MMLA fixup
+ "zip1 v8.2d, v9.2d, v14.2d\n"
+ "zip2 v14.2d, v9.2d, v14.2d\n"
+ "zip1 v9.2d, v10.2d, v15.2d\n"
+ "zip2 v15.2d, v10.2d, v15.2d\n"
+ "zip1 v10.2d, v11.2d, v16.2d\n"
+ "zip2 v16.2d, v11.2d, v16.2d\n"
+ "zip1 v11.2d, v12.2d, v17.2d\n"
+ "zip2 v17.2d, v12.2d, v17.2d\n"
+ "zip1 v12.2d, v13.2d, v18.2d\n"
+ "zip2 v18.2d, v13.2d, v18.2d\n"
+ "zip1 v13.2d, v20.2d, v19.2d\n"
+ "zip2 v19.2d, v20.2d, v19.2d\n"
+ "b 64f\n"
+ "63:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "64:" // Height 2: setup done
+ "mov x26, #0x0\n"
+ "65:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 66f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "cbnz x26, 67f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "b 67f\n"
+ "66:" // Height 2: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "67:" // Height 2: input setup done
+ "cmp x25, #0x4\n"
+ "blt 70f\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
+ "cmp x25, #0x8\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x11, #0x0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ "blt 69f\n"
+ "68:" // Height 2: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q30, [x10, #0x0]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q23, [x10, #0x10]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q22, [x9, #0x0]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ldr q21, [x9, #0x10]\n"
+ ".inst 0x6e5eec0a // bfmmla v10.4s, v0.8h, v30.8h\n"
+ "ldr q2, [x28, #0x0]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x10]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ "ldr q22, [x27, #0x0]\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x27, #0x10]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x8\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e42ec0c // bfmmla v12.4s, v0.8h, v2.8h\n"
+ "ldr q4, [x12, #0x0]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ "ldr q5, [x12, #0x10]\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ "add x10, x10, #0x20\n"
+ "ldr q7, [x11, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
+ "bge 68b\n"
+ "69:" // Height 2: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q24, [x10, #0x0]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q23, [x10, #0x10]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q22, [x9, #0x0]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ldr q21, [x9, #0x10]\n"
+ ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
+ "ldr q24, [x28, #0x0]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x10]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ "ldr q22, [x27, #0x0]\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x27, #0x10]\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
+ "70:" // Height 2: Multiply loop: Main loop skip
+ "cbz x25, 73f\n"
+ "cbz x25, 73f\n"
+ "tbz x25, #1, 71f\n"
+ "ldr d0, [x24], #0x8\n"
+ "ldr d1, [x23], #0x8\n"
+ "tbz x25, #0, 72f\n"
+ "ld1 { v0.s }[2], [x24]\n"
+ "ld1 { v1.s }[2], [x23]\n"
+ "b 72f\n"
+ "71:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x24, #0x0]\n"
+ "ldr s1, [x23, #0x0]\n"
+ "72:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q24, [x12, #0x0]\n"
+ "ldr q23, [x12, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ldr q22, [x11, #0x0]\n"
+ "ldr q21, [x11, #0x10]\n"
+ ".inst 0x6e58ec08 // bfmmla v8.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec0e // bfmmla v14.4s, v0.8h, v23.8h\n"
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q23, [x10, #0x10]\n"
+ ".inst 0x6e56ec09 // bfmmla v9.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0f // bfmmla v15.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x9, #0x0]\n"
+ "ldr q21, [x9, #0x10]\n"
+ ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q23, [x28, #0x10]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x27, #0x0]\n"
+ "ldr q21, [x27, #0x10]\n"
+ ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "add x28, x28, #0x20\n"
+ "add x27, x27, #0x20\n"
+ "73:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 65b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v4.2d, v8.2d, v14.2d\n"
+ "uzp2 v8.2d, v8.2d, v14.2d\n"
+ "add x23, x13, x20, LSL #2\n"
+ "uzp1 v14.2d, v9.2d, v15.2d\n"
+ "uzp2 v9.2d, v9.2d, v15.2d\n"
+ "uzp1 v15.2d, v10.2d, v16.2d\n"
+ "uzp2 v10.2d, v10.2d, v16.2d\n"
+ "uzp1 v16.2d, v11.2d, v17.2d\n"
+ "uzp2 v11.2d, v11.2d, v17.2d\n"
+ "uzp1 v17.2d, v12.2d, v18.2d\n"
+ "uzp2 v12.2d, v12.2d, v18.2d\n"
+ "uzp1 v18.2d, v13.2d, v19.2d\n"
+ "uzp2 v13.2d, v13.2d, v19.2d\n"
+ "tbz %x[flags], #1, 74f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "fmin v4.4s, v4.4s, v22.4s\n"
+ "fmin v14.4s, v14.4s, v22.4s\n"
+ "fmin v15.4s, v15.4s, v22.4s\n"
+ "fmin v16.4s, v16.4s, v22.4s\n"
+ "fmin v17.4s, v17.4s, v22.4s\n"
+ "fmin v18.4s, v18.4s, v22.4s\n"
+ "fmin v8.4s, v8.4s, v22.4s\n"
+ "fmin v9.4s, v9.4s, v22.4s\n"
+ "fmin v10.4s, v10.4s, v22.4s\n"
+ "fmin v11.4s, v11.4s, v22.4s\n"
+ "fmin v12.4s, v12.4s, v22.4s\n"
+ "fmin v13.4s, v13.4s, v22.4s\n"
+ "fmax v4.4s, v4.4s, v21.4s\n"
+ "fmax v14.4s, v14.4s, v21.4s\n"
+ "fmax v15.4s, v15.4s, v21.4s\n"
+ "fmax v16.4s, v16.4s, v21.4s\n"
+ "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "fmax v8.4s, v8.4s, v21.4s\n"
+ "fmax v9.4s, v9.4s, v21.4s\n"
+ "fmax v10.4s, v10.4s, v21.4s\n"
+ "fmax v11.4s, v11.4s, v21.4s\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
+ "74:" // Height 2: No activation
+ "cmp x14, #0x18\n"
+ "bge 87f\n"
+ "tbz x14, #4, 78f\n"
+ "st1 { v4.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x13], #0x10\n"
+ "st1 { v15.4s }, [x13], #0x10\n"
+ "st1 { v16.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v11.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 76f\n"
+ "st1 { v17.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 75f\n"
+ "str d18, [x13], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "tbz x14, #0, 86f\n"
+ "st1 { v18.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "b 86f\n"
+ "75:" // Height 2: Partial direct writeback: partial_1_20
+ "tbz x14, #0, 86f\n"
+ "str s18, [x13, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "b 86f\n"
+ "76:" // Height 2: Partial direct writeback: partial_2_16
+ "tbz x14, #1, 77f\n"
+ "str d17, [x13], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "tbz x14, #0, 86f\n"
+ "st1 { v17.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "b 86f\n"
+ "77:" // Height 2: Partial direct writeback: partial_1_16
+ "tbz x14, #0, 86f\n"
+ "str s17, [x13, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "b 86f\n"
+ "78:" // Height 2: Partial direct writeback: partial_8_0
+ "tbz x14, #3, 82f\n"
+ "st1 { v4.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 80f\n"
+ "st1 { v15.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 79f\n"
+ "str d16, [x13], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "tbz x14, #0, 86f\n"
+ "st1 { v16.s }[2], [x13]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "b 86f\n"
+ "79:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 86f\n"
+ "str s16, [x13, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "b 86f\n"
+ "80:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 81f\n"
+ "str d15, [x13], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "tbz x14, #0, 86f\n"
+ "st1 { v15.s }[2], [x13]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "b 86f\n"
+ "81:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 86f\n"
+ "str s15, [x13, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "b 86f\n"
+ "82:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 84f\n"
+ "st1 { v4.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 83f\n"
+ "str d14, [x13], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "tbz x14, #0, 86f\n"
+ "st1 { v14.s }[2], [x13]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "b 86f\n"
+ "83:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 86f\n"
+ "str s14, [x13, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "b 86f\n"
+ "84:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 85f\n"
+ "str d4, [x13], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "tbz x14, #0, 86f\n"
+ "st1 { v4.s }[2], [x13]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "b 86f\n"
+ "85:" // Height 2: Partial direct writeback: partial_1_0
+ "str s4, [x13, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "86:" // Height 2: Partial direct writeback: Done
+ "b 88f\n"
+ "87:" // Height 2: Full writeback
+ "str q4, [x13, #0x0]\n"
+ "str q14, [x13, #0x10]\n"
+ "str q15, [x13, #0x20]\n"
+ "str q16, [x13, #0x30]\n"
+ "str q17, [x13, #0x40]\n"
+ "str q18, [x13, #0x50]\n"
+ "add x13, x13, #0x60\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q12, [x23, #0x40]\n"
+ "str q13, [x23, #0x50]\n"
+ "88:" // Height 2: Writeback done
+ "subs x14, x14, #0x18\n"
+ "bgt 46b\n"
+ "b 178f\n"
+ "89:" // Height 3
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "90:" // Height 3: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x28, x9, x20, LSL #1\n"
+ "add x27, x28, x20, LSL #1\n"
+ "add x20, x27, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x14\n"
+ "bgt 91f\n"
+ "cmp x14, #0x10\n"
+ "mov x27, x12\n"
+ "bgt 91f\n"
+ "cmp x14, #0xc\n"
+ "mov x28, x12\n"
+ "bgt 91f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 91f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 91f\n"
+ "mov x11, x12\n"
+ "91:" // Height 3: B setup done
+ "cbz x15, 92f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "zip2 v14.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "zip2 v15.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x15, #0x40]\n"
+ "ldr q13, [x15, #0x50]\n"
+ "zip2 v16.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v17.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x15, x15, #0x60\n"
+ "zip2 v18.2d, v12.2d, v12.2d\n"
+ "zip1 v12.2d, v12.2d, v12.2d\n"
+ "zip2 v19.2d, v13.2d, v13.2d\n"
+ "zip1 v13.2d, v13.2d, v13.2d\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v26.16b, v14.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v27.16b, v15.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v29.16b, v17.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v30.16b, v18.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v31.16b, v19.16b\n"
+ "b 108f\n"
+ "92:" // Height 3: no bias
+ "tbz %x[flags], #0, 107f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
+ "cmp x14, #0x18\n"
+ "add x22, x23, x20, LSL #2\n"
+ "bge 105f\n"
+ "tbz x14, #4, 96f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x13], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 94f\n"
+ "ld1 { v13.4s }, [x13], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 93f\n"
+ "ldr d20, [x13], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "mov x20, #0x58\n"
+ "ldr d4, [x22], #0x8\n"
+ "tbz x14, #0, 104f\n"
+ "ld1 { v20.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v4.s }[2], [x22]\n"
+ "b 104f\n"
+ "93:" // Height 3: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x14, #0, 104f\n"
+ "ldr s20, [x13, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s4, [x22, #0x0]\n"
+ "b 104f\n"
+ "94:" // Height 3: Partial accumulate: partial_2_16
+ "tbz x14, #1, 95f\n"
+ "ldr d13, [x13], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "mov x20, #0x48\n"
+ "ldr d25, [x22], #0x8\n"
+ "tbz x14, #0, 104f\n"
+ "ld1 { v13.s }[2], [x13]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "b 104f\n"
+ "95:" // Height 3: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x14, #0, 104f\n"
+ "ldr s13, [x13, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "b 104f\n"
+ "96:" // Height 3: Partial accumulate: partial_8_0
+ "tbz x14, #3, 100f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 98f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 97f\n"
+ "ldr d12, [x13], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x22], #0x8\n"
+ "tbz x14, #0, 104f\n"
+ "ld1 { v12.s }[2], [x13]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "b 104f\n"
+ "97:" // Height 3: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 104f\n"
+ "ldr s12, [x13, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "b 104f\n"
+ "98:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x14, #1, 99f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x14, #0, 104f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "b 104f\n"
+ "99:" // Height 3: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 104f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "b 104f\n"
+ "100:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x14, #2, 102f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 101f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x14, #0, 104f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "b 104f\n"
+ "101:" // Height 3: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 104f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "b 104f\n"
+ "102:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x14, #1, 103f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x14, #0, 104f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "b 104f\n"
+ "103:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s21, [x22, #0x0]\n"
+ "104:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 106f\n"
+ "105:" // Height 3: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q12, [x13, #0x30]\n"
+ "ldr q13, [x13, #0x40]\n"
+ "ldr q20, [x13, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q21, [x22, #0x0]\n"
+ "ldr q22, [x22, #0x10]\n"
+ "ldr q23, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q25, [x22, #0x40]\n"
+ "ldr q4, [x22, #0x50]\n"
+ "106:" // Height 3: MMLA fixup
+ "zip1 v8.2d, v9.2d, v14.2d\n"
+ "zip2 v14.2d, v9.2d, v14.2d\n"
+ "zip1 v9.2d, v10.2d, v15.2d\n"
+ "zip2 v15.2d, v10.2d, v15.2d\n"
+ "zip1 v10.2d, v11.2d, v16.2d\n"
+ "zip2 v16.2d, v11.2d, v16.2d\n"
+ "zip1 v11.2d, v12.2d, v17.2d\n"
+ "zip2 v17.2d, v12.2d, v17.2d\n"
+ "zip1 v12.2d, v13.2d, v18.2d\n"
+ "zip2 v18.2d, v13.2d, v18.2d\n"
+ "zip1 v13.2d, v20.2d, v19.2d\n"
+ "zip2 v19.2d, v20.2d, v19.2d\n"
+ "zip1 v20.2d, v21.2d, v26.2d\n"
+ "zip2 v26.2d, v21.2d, v26.2d\n"
+ "zip1 v21.2d, v22.2d, v27.2d\n"
+ "zip2 v27.2d, v22.2d, v27.2d\n"
+ "zip1 v22.2d, v23.2d, v28.2d\n"
+ "zip2 v28.2d, v23.2d, v28.2d\n"
+ "zip1 v23.2d, v24.2d, v29.2d\n"
+ "zip2 v29.2d, v24.2d, v29.2d\n"
+ "zip1 v24.2d, v25.2d, v30.2d\n"
+ "zip2 v30.2d, v25.2d, v30.2d\n"
+ "zip1 v25.2d, v4.2d, v31.2d\n"
+ "zip2 v31.2d, v4.2d, v31.2d\n"
+ "b 108f\n"
+ "107:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "108:" // Height 3: setup done
+ "mov x26, #0x0\n"
+ "109:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 110f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x26, 111f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "b 111f\n"
+ "110:" // Height 3: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "111:" // Height 3: input setup done
+ "cmp x25, #0x4\n"
+ "blt 114f\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
+ "cmp x25, #0x8\n"
+ "ld1 { v2.4s }, [x22], #0x10\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x11, #0x0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ "blt 113f\n"
+ "112:" // Height 3: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x10, #0x0]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x10, #0x10]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x9, #0x0]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "cmp x25, #0x8\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q3, [x9, #0x10]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x0]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x10]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x27, #0x0]\n"
+ ".inst 0x6e43ec11 // bfmmla v17.4s, v0.8h, v3.8h\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e43ec5d // bfmmla v29.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x27, #0x10]\n"
+ "add x28, x28, #0x20\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ "add x27, x27, #0x20\n"
+ ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x12, #0x0]\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x12, #0x10]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6e43ec13 // bfmmla v19.4s, v0.8h, v3.8h\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ ".inst 0x6e43ec5f // bfmmla v31.4s, v2.8h, v3.8h\n"
+ "ld1 { v2.4s }, [x22], #0x10\n"
+ "ldr q7, [x11, #0x10]\n"
+ "bge 112b\n"
+ "113:" // Height 3: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "ldr q3, [x10, #0x0]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ "ldr q4, [x10, #0x10]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x9, #0x0]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q1, [x9, #0x10]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n"
+ "ldr q5, [x28, #0x0]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "add x28, x28, #0x20\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q3, [x27, #0x0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
+ "114:" // Height 3: Multiply loop: Main loop skip
+ "cbz x25, 117f\n"
+ "cbz x25, 117f\n"
+ "tbz x25, #1, 115f\n"
+ "ldr d0, [x24], #0x8\n"
+ "ldr d1, [x23], #0x8\n"
+ "ldr d2, [x22], #0x8\n"
+ "tbz x25, #0, 116f\n"
+ "ld1 { v0.s }[2], [x24]\n"
+ "ld1 { v1.s }[2], [x23]\n"
+ "ld1 { v2.s }[2], [x22]\n"
+ "b 116f\n"
+ "115:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x24, #0x0]\n"
+ "ldr s1, [x23, #0x0]\n"
+ "ldr s2, [x22, #0x0]\n"
+ "116:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q5, [x12, #0x0]\n"
+ "ldr q4, [x12, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ldr q3, [x11, #0x0]\n"
+ "ldr q1, [x11, #0x10]\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x10, #0x0]\n"
+ ".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x10, #0x10]\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e43ec55 // bfmmla v21.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x9, #0x0]\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e41ec5b // bfmmla v27.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x9, #0x10]\n"
+ ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x0]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ "add x28, x28, #0x20\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x27, #0x0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x27, #0x10]\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ "add x27, x27, #0x20\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
+ "117:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 109b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
+ "uzp1 v4.2d, v8.2d, v14.2d\n"
+ "uzp2 v8.2d, v8.2d, v14.2d\n"
+ "uzp1 v14.2d, v9.2d, v15.2d\n"
+ "uzp2 v9.2d, v9.2d, v15.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp1 v15.2d, v10.2d, v16.2d\n"
+ "uzp2 v10.2d, v10.2d, v16.2d\n"
+ "uzp1 v16.2d, v11.2d, v17.2d\n"
+ "uzp2 v11.2d, v11.2d, v17.2d\n"
+ "uzp1 v17.2d, v12.2d, v18.2d\n"
+ "uzp2 v12.2d, v12.2d, v18.2d\n"
+ "uzp1 v18.2d, v13.2d, v19.2d\n"
+ "uzp2 v13.2d, v13.2d, v19.2d\n"
+ "uzp1 v20.2d, v20.2d, v26.2d\n"
+ "uzp1 v21.2d, v21.2d, v27.2d\n"
+ "uzp1 v22.2d, v22.2d, v28.2d\n"
+ "uzp1 v23.2d, v23.2d, v29.2d\n"
+ "uzp1 v24.2d, v24.2d, v30.2d\n"
+ "uzp1 v25.2d, v25.2d, v31.2d\n"
+ "tbz %x[flags], #1, 118f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v4.4s, v4.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmax v4.4s, v4.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "118:" // Height 3: No activation
+ "cmp x14, #0x18\n"
+ "bge 131f\n"
+ "tbz x14, #4, 122f\n"
+ "st1 { v4.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x13], #0x10\n"
+ "st1 { v15.4s }, [x13], #0x10\n"
+ "st1 { v16.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v11.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 120f\n"
+ "st1 { v17.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 119f\n"
+ "str d18, [x13], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "tbz x14, #0, 130f\n"
+ "st1 { v18.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "b 130f\n"
+ "119:" // Height 3: Partial direct writeback: partial_1_20
+ "tbz x14, #0, 130f\n"
+ "str s18, [x13, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "b 130f\n"
+ "120:" // Height 3: Partial direct writeback: partial_2_16
+ "tbz x14, #1, 121f\n"
+ "str d17, [x13], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x14, #0, 130f\n"
+ "st1 { v17.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "b 130f\n"
+ "121:" // Height 3: Partial direct writeback: partial_1_16
+ "tbz x14, #0, 130f\n"
+ "str s17, [x13, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "b 130f\n"
+ "122:" // Height 3: Partial direct writeback: partial_8_0
+ "tbz x14, #3, 126f\n"
+ "st1 { v4.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 124f\n"
+ "st1 { v15.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 123f\n"
+ "str d16, [x13], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "tbz x14, #0, 130f\n"
+ "st1 { v16.s }[2], [x13]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "b 130f\n"
+ "123:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 130f\n"
+ "str s16, [x13, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "b 130f\n"
+ "124:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 125f\n"
+ "str d15, [x13], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz x14, #0, 130f\n"
+ "st1 { v15.s }[2], [x13]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "b 130f\n"
+ "125:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 130f\n"
+ "str s15, [x13, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "b 130f\n"
+ "126:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 128f\n"
+ "st1 { v4.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 127f\n"
+ "str d14, [x13], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz x14, #0, 130f\n"
+ "st1 { v14.s }[2], [x13]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "b 130f\n"
+ "127:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 130f\n"
+ "str s14, [x13, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "b 130f\n"
+ "128:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 129f\n"
+ "str d4, [x13], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz x14, #0, 130f\n"
+ "st1 { v4.s }[2], [x13]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "b 130f\n"
+ "129:" // Height 3: Partial direct writeback: partial_1_0
+ "str s4, [x13, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "130:" // Height 3: Partial direct writeback: Done
+ "b 132f\n"
+ "131:" // Height 3: Full writeback
+ "str q4, [x13, #0x0]\n"
+ "str q14, [x13, #0x10]\n"
+ "str q15, [x13, #0x20]\n"
+ "str q16, [x13, #0x30]\n"
+ "str q17, [x13, #0x40]\n"
+ "str q18, [x13, #0x50]\n"
+ "add x13, x13, #0x60\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q12, [x23, #0x40]\n"
+ "str q13, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
+ "132:" // Height 3: Writeback done
+ "subs x14, x14, #0x18\n"
+ "bgt 90b\n"
+ "b 178f\n"
+ "133:" // Height 4
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x10\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
+ "134:" // Height 4: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x28, x9, x20, LSL #1\n"
+ "add x27, x28, x20, LSL #1\n"
+ "add x20, x27, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x14, #0x14\n"
+ "bgt 135f\n"
+ "cmp x14, #0x10\n"
+ "mov x27, x12\n"
+ "bgt 135f\n"
+ "cmp x14, #0xc\n"
+ "mov x28, x12\n"
+ "bgt 135f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 135f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 135f\n"
+ "mov x11, x12\n"
+ "135:" // Height 4: B setup done
+ "cbz x15, 136f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "zip2 v14.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "zip2 v15.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x15, #0x40]\n"
+ "ldr q13, [x15, #0x50]\n"
+ "zip2 v16.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v17.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x15, x15, #0x60\n"
+ "zip2 v18.2d, v12.2d, v12.2d\n"
+ "zip1 v12.2d, v12.2d, v12.2d\n"
+ "zip2 v19.2d, v13.2d, v13.2d\n"
+ "zip1 v13.2d, v13.2d, v13.2d\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v26.16b, v14.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v27.16b, v15.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v29.16b, v17.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v30.16b, v18.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v31.16b, v19.16b\n"
+ "b 152f\n"
+ "136:" // Height 4: no bias
+ "tbz %x[flags], #0, 151f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x14, #0x18\n"
+ "add x21, x22, x20, LSL #2\n"
+ "bge 149f\n"
+ "tbz x14, #4, 140f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v12.4s }, [x13], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x14, #2, 138f\n"
+ "ld1 { v13.4s }, [x13], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 137f\n"
+ "ldr d20, [x13], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "mov x20, #0x58\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x14, #0, 148f\n"
+ "ld1 { v20.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v4.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 148f\n"
+ "137:" // Height 4: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x14, #0, 148f\n"
+ "ldr s20, [x13, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s4, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 148f\n"
+ "138:" // Height 4: Partial accumulate: partial_2_16
+ "tbz x14, #1, 139f\n"
+ "ldr d13, [x13], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "mov x20, #0x48\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x14, #0, 148f\n"
+ "ld1 { v13.s }[2], [x13]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 148f\n"
+ "139:" // Height 4: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x14, #0, 148f\n"
+ "ldr s13, [x13, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 148f\n"
+ "140:" // Height 4: Partial accumulate: partial_8_0
+ "tbz x14, #3, 144f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "tbz x14, #2, 142f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 141f\n"
+ "ldr d12, [x13], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x14, #0, 148f\n"
+ "ld1 { v12.s }[2], [x13]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 148f\n"
+ "141:" // Height 4: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 148f\n"
+ "ldr s12, [x13, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 148f\n"
+ "142:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x14, #1, 143f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x14, #0, 148f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 148f\n"
+ "143:" // Height 4: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 148f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "b 148f\n"
+ "144:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x14, #2, 146f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 145f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x14, #0, 148f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "b 148f\n"
+ "145:" // Height 4: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 148f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "b 148f\n"
+ "146:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x14, #1, 147f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x14, #0, 148f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "b 148f\n"
+ "147:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "148:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 150f\n"
+ "149:" // Height 4: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q12, [x13, #0x30]\n"
+ "ldr q13, [x13, #0x40]\n"
+ "ldr q20, [x13, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q21, [x22, #0x0]\n"
+ "ldr q22, [x22, #0x10]\n"
+ "ldr q23, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q25, [x22, #0x40]\n"
+ "ldr q4, [x22, #0x50]\n"
+ "ldr q26, [x21, #0x0]\n"
+ "ldr q27, [x21, #0x10]\n"
+ "ldr q28, [x21, #0x20]\n"
+ "ldr q29, [x21, #0x30]\n"
+ "ldr q30, [x21, #0x40]\n"
+ "ldr q31, [x21, #0x50]\n"
+ "150:" // Height 4: MMLA fixup
+ "zip1 v8.2d, v9.2d, v14.2d\n"
+ "zip2 v14.2d, v9.2d, v14.2d\n"
+ "zip1 v9.2d, v10.2d, v15.2d\n"
+ "zip2 v15.2d, v10.2d, v15.2d\n"
+ "zip1 v10.2d, v11.2d, v16.2d\n"
+ "zip2 v16.2d, v11.2d, v16.2d\n"
+ "zip1 v11.2d, v12.2d, v17.2d\n"
+ "zip2 v17.2d, v12.2d, v17.2d\n"
+ "zip1 v12.2d, v13.2d, v18.2d\n"
+ "zip2 v18.2d, v13.2d, v18.2d\n"
+ "zip1 v13.2d, v20.2d, v19.2d\n"
+ "zip2 v19.2d, v20.2d, v19.2d\n"
+ "zip1 v20.2d, v21.2d, v26.2d\n"
+ "zip2 v26.2d, v21.2d, v26.2d\n"
+ "zip1 v21.2d, v22.2d, v27.2d\n"
+ "zip2 v27.2d, v22.2d, v27.2d\n"
+ "zip1 v22.2d, v23.2d, v28.2d\n"
+ "zip2 v28.2d, v23.2d, v28.2d\n"
+ "zip1 v23.2d, v24.2d, v29.2d\n"
+ "zip2 v29.2d, v24.2d, v29.2d\n"
+ "zip1 v24.2d, v25.2d, v30.2d\n"
+ "zip2 v30.2d, v25.2d, v30.2d\n"
+ "zip1 v25.2d, v4.2d, v31.2d\n"
+ "zip2 v31.2d, v4.2d, v31.2d\n"
+ "b 152f\n"
+ "151:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "152:" // Height 4: setup done
+ "mov x26, #0x0\n"
+ "153:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 154f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
+ "cbnz x26, 155f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
+ "b 155f\n"
+ "154:" // Height 4: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
+ "155:" // Height 4: input setup done
+ "cmp x25, #0x4\n"
+ "blt 158f\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ "ld1 { v2.4s }, [x22], #0x10\n"
+ "cmp x25, #0x8\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
+ "ld1 { v3.4s }, [x21], #0x10\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x11, #0x0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ "blt 157f\n"
+ "156:" // Height 4: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x8\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ "ld1 { v3.4s }, [x21], #0x10\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x10, #0x0]\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x10, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x9, #0x0]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x0]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x10]\n"
+ "add x28, x28, #0x20\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x27, #0x0]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x12, #0x0]\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x12, #0x10]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ "ld1 { v2.4s }, [x22], #0x10\n"
+ "ldr q7, [x11, #0x10]\n"
+ "bge 156b\n"
+ "157:" // Height 4: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x25, x25, #0x4\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "ldr q3, [x10, #0x0]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ "ldr q4, [x10, #0x10]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x9, #0x0]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q1, [x9, #0x10]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n"
+ "ldr q5, [x28, #0x0]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "add x28, x28, #0x20\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q3, [x27, #0x0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
+ "158:" // Height 4: Multiply loop: Main loop skip
+ "cbz x25, 161f\n"
+ "cbz x25, 161f\n"
+ "tbz x25, #1, 159f\n"
+ "ldr d0, [x24], #0x8\n"
+ "ldr d1, [x23], #0x8\n"
+ "ldr d2, [x22], #0x8\n"
+ "ldr d3, [x21], #0x8\n"
+ "tbz x25, #0, 160f\n"
+ "ld1 { v0.s }[2], [x24]\n"
+ "ld1 { v1.s }[2], [x23]\n"
+ "ld1 { v2.s }[2], [x22]\n"
+ "ld1 { v3.s }[2], [x21]\n"
+ "b 160f\n"
+ "159:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x24, #0x0]\n"
+ "ldr s1, [x23, #0x0]\n"
+ "ldr s2, [x22, #0x0]\n"
+ "ldr s3, [x21, #0x0]\n"
+ "160:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q5, [x12, #0x0]\n"
+ "ldr q4, [x12, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q7, [x11, #0x0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x10, #0x0]\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x10, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ "ldr q3, [x9, #0x0]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ "ldr q1, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x0]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x10]\n"
+ "add x28, x28, #0x20\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x27, #0x0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
+ "161:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 153b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp1 v4.2d, v8.2d, v14.2d\n"
+ "uzp2 v8.2d, v8.2d, v14.2d\n"
+ "uzp1 v14.2d, v9.2d, v15.2d\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v15.2d\n"
+ "uzp1 v15.2d, v10.2d, v16.2d\n"
+ "uzp2 v10.2d, v10.2d, v16.2d\n"
+ "uzp1 v16.2d, v11.2d, v17.2d\n"
+ "uzp2 v11.2d, v11.2d, v17.2d\n"
+ "uzp1 v17.2d, v12.2d, v18.2d\n"
+ "uzp2 v12.2d, v12.2d, v18.2d\n"
+ "uzp1 v18.2d, v13.2d, v19.2d\n"
+ "uzp2 v13.2d, v13.2d, v19.2d\n"
+ "uzp1 v19.2d, v20.2d, v26.2d\n"
+ "uzp2 v20.2d, v20.2d, v26.2d\n"
+ "uzp1 v26.2d, v21.2d, v27.2d\n"
+ "uzp2 v21.2d, v21.2d, v27.2d\n"
+ "uzp1 v27.2d, v22.2d, v28.2d\n"
+ "uzp2 v22.2d, v22.2d, v28.2d\n"
+ "uzp1 v28.2d, v23.2d, v29.2d\n"
+ "uzp2 v23.2d, v23.2d, v29.2d\n"
+ "uzp1 v29.2d, v24.2d, v30.2d\n"
+ "uzp2 v24.2d, v24.2d, v30.2d\n"
+ "uzp1 v30.2d, v25.2d, v31.2d\n"
+ "uzp2 v25.2d, v25.2d, v31.2d\n"
+ "tbz %x[flags], #1, 162f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v4.4s, v4.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmax v4.4s, v4.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v0.4s\n"
+ "fmax v29.4s, v29.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "162:" // Height 4: No activation
+ "cmp x14, #0x18\n"
+ "bge 175f\n"
+ "tbz x14, #4, 166f\n"
+ "st1 { v4.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x13], #0x10\n"
+ "st1 { v15.4s }, [x13], #0x10\n"
+ "st1 { v16.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v11.4s }, [x23], #0x10\n"
+ "st1 { v19.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v27.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
+ "tbz x14, #2, 164f\n"
+ "st1 { v17.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 163f\n"
+ "str d18, [x13], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x14, #0, 174f\n"
+ "st1 { v18.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "b 174f\n"
+ "163:" // Height 4: Partial direct writeback: partial_1_20
+ "tbz x14, #0, 174f\n"
+ "str s18, [x13, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "b 174f\n"
+ "164:" // Height 4: Partial direct writeback: partial_2_16
+ "tbz x14, #1, 165f\n"
+ "str d17, [x13], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d29, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x14, #0, 174f\n"
+ "st1 { v17.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "b 174f\n"
+ "165:" // Height 4: Partial direct writeback: partial_1_16
+ "tbz x14, #0, 174f\n"
+ "str s17, [x13, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "b 174f\n"
+ "166:" // Height 4: Partial direct writeback: partial_8_0
+ "tbz x14, #3, 170f\n"
+ "st1 { v4.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v19.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "tbz x14, #2, 168f\n"
+ "st1 { v15.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v27.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 167f\n"
+ "str d16, [x13], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d28, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "tbz x14, #0, 174f\n"
+ "st1 { v16.s }[2], [x13]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "b 174f\n"
+ "167:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 174f\n"
+ "str s16, [x13, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s28, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "b 174f\n"
+ "168:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 169f\n"
+ "str d15, [x13], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "tbz x14, #0, 174f\n"
+ "st1 { v15.s }[2], [x13]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "b 174f\n"
+ "169:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 174f\n"
+ "str s15, [x13, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "b 174f\n"
+ "170:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 172f\n"
+ "st1 { v4.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v19.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 171f\n"
+ "str d14, [x13], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "tbz x14, #0, 174f\n"
+ "st1 { v14.s }[2], [x13]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "b 174f\n"
+ "171:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 174f\n"
+ "str s14, [x13, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "b 174f\n"
+ "172:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 173f\n"
+ "str d4, [x13], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x14, #0, 174f\n"
+ "st1 { v4.s }[2], [x13]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "b 174f\n"
+ "173:" // Height 4: Partial direct writeback: partial_1_0
+ "str s4, [x13, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "174:" // Height 4: Partial direct writeback: Done
+ "b 176f\n"
+ "175:" // Height 4: Full writeback
+ "str q4, [x13, #0x0]\n"
+ "str q14, [x13, #0x10]\n"
+ "str q15, [x13, #0x20]\n"
+ "str q16, [x13, #0x30]\n"
+ "str q17, [x13, #0x40]\n"
+ "str q18, [x13, #0x50]\n"
+ "add x13, x13, #0x60\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q12, [x23, #0x40]\n"
+ "str q13, [x23, #0x50]\n"
+ "str q19, [x22, #0x0]\n"
+ "str q26, [x22, #0x10]\n"
+ "str q27, [x22, #0x20]\n"
+ "str q28, [x22, #0x30]\n"
+ "str q29, [x22, #0x40]\n"
+ "str q30, [x22, #0x50]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x21, #0x40]\n"
+ "str q25, [x21, #0x50]\n"
+ "176:" // Height 4: Writeback done
+ "subs x14, x14, #0x18\n"
+ "bgt 134b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 178f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 177f\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "177:" // Update direct input
+ "mov x20, #0x10\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "178:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16.hpp
new file mode 100644
index 0000000000..98f7fc9403
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16.hpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ size_t, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
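+
+// ARGLIST mirrors the parameter list of the kernel entry point defined in
+// a64_ffhybrid_fp32bf16fp32_mmla_6x16/generic.cpp, in order: num_strings,
+// string_lengths, A_arg, M, N, B_ptr, B_stride, output_arg, bias, act,
+// accumulate.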
+
+namespace arm_gemm
+{
+// Actual kernel implementation
+void a64_ffhybrid_fp32bf16fp32_mmla_6x16( ARGLIST );
+
+class cls_a64_ffhybrid_fp32bf16fp32_mmla_6x16
+{
+public:
+ typedef float lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+ static unsigned int stripe_width()
+ {
+ return 4;
+ }
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL256_BL64_BF16;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
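+
+    // Blocking summary: each pass of the kernel produces up to a 6x16 tile
+    // of the output, with K consumed four fp32 elements at a time
+    // (k_unroll() == 4) to match the BFCVTN/BFMMLA pattern in the assembly.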
+
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
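+
+    // Per-CPU throughput estimates used by the kernel selection heuristics:
+    // Neoverse V1 gets a tuned figure, every other core the default.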
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::V1:
+ return { 21.05 };
+ default:
+ return { 15.27 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_ffhybrid_fp32bf16fp32_mmla_6x16;
+ cls_a64_ffhybrid_fp32bf16fp32_mmla_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..9ab4aa98f9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16/generic.cpp
@@ -0,0 +1,3240 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_ffhybrid_fp32bf16fp32_mmla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, size_t B_stride, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ const bfloat16 *cur_B_ptr = {};
+ size_t B_stride = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ void *output_ptr = nullptr;
+ const float *bias = nullptr;
+ } ka;
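+
+    // ka is a single parameter block; the assembly reads its fields through
+    // the offsetof()-computed displacements bound to the %[offsetof_*]
+    // operands, so only one pointer has to live in a register.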
+
+ unsigned long flags=0;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ ka.output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ ka.output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
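+
+    // flags is a bitfield tested with tbz inside the assembly:
+    //   bit 0 (0x1): accumulate into the existing output
+    //   bit 1 (0x2): apply the min/max activation clamp
+    //   bit 2 (0x4): output is indirect
+    //   bit 3 (0x8): input is indirect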
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ ka.bias = bias;
+ ka.B_stride = B_stride;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
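+    // Both ReLU variants share the clamp path (flags bit 1): plain ReLU
+    // keeps the default maxval of +infinity, BoundedReLU lowers it to
+    // act.param1.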
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 181f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 145f\n"
+ "beq 109f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 73f\n"
+ "beq 37f\n"
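+    // Height dispatch: 6 or more rows branch to 181, 5 to 145, 4 to 109,
+    // 3 to 73, 2 to 37, and a single row falls through to the code below.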
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "2:" // Height 1: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "cmp x14, #0xc\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 3f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 3f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 3f\n"
+ "mov x11, x12\n"
+ "3:" // Height 1: B setup done
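+    // x12/x11/x10/x9 now address the four 4-column panels of B for this
+    // column block, each B_stride bf16 elements (LSL #1 bytes) apart. For a
+    // narrow tail the unused panel pointers alias x12, presumably so their
+    // loads still read valid memory; the surplus results are never written
+    // back.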
+ "cbz x15, 4f\n"
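+    // Bias path: load the bias for these 16 columns and replicate each value
+    // into both rows of its 2x2 BFMMLA accumulator tile via zip1/zip2,
+    // matching the interleaved layout the accumulate path builds below.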
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "b 16f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 15f\n"
+ "cmp x14, #0x10\n"
+ "bge 13f\n"
+ "tbz x14, #3, 8f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "tbz x14, #2, 6f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 5f\n"
+ "ldr d16, [x13], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 12f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "b 12f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 12f\n"
+ "ldr s16, [x13, #0x0]\n"
+ "b 12f\n"
+ "6:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x14, #1, 7f\n"
+ "ldr d11, [x13], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 12f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "b 12f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 12f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "b 12f\n"
+ "8:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x14, #2, 10f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 9f\n"
+ "ldr d10, [x13], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 12f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "b 12f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 12f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "b 12f\n"
+ "10:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x14, #1, 11f\n"
+ "ldr d9, [x13], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 12f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "b 12f\n"
+ "11:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "mov x20, #0x0\n"
+ "12:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 14f\n"
+ "13:" // Height 1: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q16, [x13, #0x30]\n"
+ "14:" // Height 1: MMLA fixup
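+    // The rows just loaded are plain row-major; zip-interleave them into the
+    // 2x2 tile layout the BFMMLA accumulators use before new products are
+    // added.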
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 16f\n"
+ "15:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "16:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "17:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 18f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 19f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "b 19f\n"
+ "18:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "19:" // Height 1: input setup done
+ "cmp x27, #0x4\n"
+ "blt 22f\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ldr q6, [x12, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "ldr q7, [x12, #0x10]\n"
+ "blt 21f\n"
+ "20:" // Height 1: Multiply loop: Main loop head
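+    // Each pass converts four fp32 A values to bf16 (BFCVTN) and issues an
+    // even/odd BFMMLA pair per B panel, interleaved with the loads for the
+    // next iteration.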
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "sub x27, x27, #0x4\n"
+ "add x12, x12, #0x20\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ "ldr q6, [x12, #0x0]\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ldr q7, [x12, #0x10]\n"
+ "bge 20b\n"
+ "21:" // Height 1: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "sub x27, x27, #0x4\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "22:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 25f\n"
+ "tbz x27, #1, 23f\n"
+ "ldr d0, [x26], #0x8\n"
+ "tbz x27, #0, 24f\n"
+ "ld1 { v0.s }[2], [x26]\n"
+ "b 24f\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x26, #0x0]\n"
+ "24:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q18, [x12, #0x0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x6e52ec08 // bfmmla v8.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e51ec0c // bfmmla v12.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "25:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 17b\n"
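+    // With a single live row, uzp1 keeps just the even rows of each 2x2
+    // accumulator tile ahead of the activation clamp and writeback.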
+ "uzp1 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v11.2d, v11.2d, v15.2d\n"
+ "tbz %x[flags], #1, 26f\n"
+ "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v18.4s }, [x21]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
+ "26:" // Height 1: No activation
+ "cmp x14, #0x10\n"
+ "bge 35f\n"
+ "tbz x14, #3, 30f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "tbz x14, #2, 28f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 27f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x14, #0, 34f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "b 34f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 34f\n"
+ "str s11, [x13, #0x0]\n"
+ "b 34f\n"
+ "28:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 29f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x14, #0, 34f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "b 34f\n"
+ "29:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 34f\n"
+ "str s10, [x13, #0x0]\n"
+ "b 34f\n"
+ "30:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 32f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "tbz x14, #1, 31f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x14, #0, 34f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "b 34f\n"
+ "31:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 34f\n"
+ "str s9, [x13, #0x0]\n"
+ "b 34f\n"
+ "32:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 33f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x14, #0, 34f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "b 34f\n"
+ "33:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "34:" // Height 1: Partial direct writeback: Done
+ "b 36f\n"
+ "35:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "36:" // Height 1: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 2b\n"
+ "b 218f\n"
+ "37:" // Height 2
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "38:" // Height 2: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "cmp x14, #0xc\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 39f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 39f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 39f\n"
+ "mov x11, x12\n"
+ "39:" // Height 2: B setup done
+ "cbz x15, 40f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "b 52f\n"
+ "40:" // Height 2: no bias
+ "tbz %x[flags], #0, 51f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x10\n"
+ "add x26, x13, x20, LSL #2\n"
+ "bge 49f\n"
+ "tbz x14, #3, 44f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x26], #0x10\n"
+ "tbz x14, #2, 42f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x26], #0x10\n"
+ "tbz x14, #1, 41f\n"
+ "ldr d16, [x13], #0x8\n"
+ "ldr d15, [x26], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x14, #0, 48f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x26]\n"
+ "b 48f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 48f\n"
+ "ldr s16, [x13, #0x0]\n"
+ "ldr s15, [x26, #0x0]\n"
+ "b 48f\n"
+ "42:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x14, #1, 43f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d14, [x26], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x14, #0, 48f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x26]\n"
+ "b 48f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 48f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s14, [x26, #0x0]\n"
+ "b 48f\n"
+ "44:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x14, #2, 46f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "tbz x14, #1, 45f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d13, [x26], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x14, #0, 48f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x26]\n"
+ "b 48f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 48f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s13, [x26, #0x0]\n"
+ "b 48f\n"
+ "46:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x14, #1, 47f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d12, [x26], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x14, #0, 48f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x26]\n"
+ "b 48f\n"
+ "47:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s12, [x26, #0x0]\n"
+ "mov x20, #0x0\n"
+ "48:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 50f\n"
+ "49:" // Height 2: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q16, [x13, #0x30]\n"
+ "ldr q12, [x26, #0x0]\n"
+ "ldr q13, [x26, #0x10]\n"
+ "ldr q14, [x26, #0x20]\n"
+ "ldr q15, [x26, #0x30]\n"
+ "50:" // Height 2: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 52f\n"
+ "51:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "52:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "53:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 54f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 55f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "b 55f\n"
+ "54:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "55:" // Height 2: input setup done
+ "cmp x27, #0x4\n"
+ "blt 58f\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ "cmp x27, #0x8\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ "blt 57f\n"
+ "56:" // Height 2: Multiply loop: Main loop head
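+    // From two rows upwards, BFCVTN/BFCVTN2 pack a pair of A rows into one
+    // v0.8h operand, which is exactly the two-row shape BFMMLA expects.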
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "sub x27, x27, #0x4\n"
+ "add x12, x12, #0x20\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ "ldr q6, [x12, #0x0]\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ldr q7, [x12, #0x10]\n"
+ "bge 56b\n"
+ "57:" // Height 2: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "sub x27, x27, #0x4\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "58:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 61f\n"
+ "tbz x27, #1, 59f\n"
+ "ldr d0, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "tbz x27, #0, 60f\n"
+ "ld1 { v0.s }[2], [x26]\n"
+ "ld1 { v1.s }[2], [x25]\n"
+ "b 60f\n"
+ "59:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x26, #0x0]\n"
+ "ldr s1, [x25, #0x0]\n"
+ "60:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q18, [x12, #0x0]\n"
+ "ldr q17, [x12, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e52ec08 // bfmmla v8.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ ".inst 0x6e51ec0c // bfmmla v12.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x0]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x9, #0x0]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "61:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 53b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "add x26, x13, x20, LSL #2\n"
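+    // uzp1/uzp2 un-interleave the 2x2 tiles into two ordinary rows: v6, v12,
+    // v13, v14 hold row 0 and v8, v9, v10, v11 hold row 1, with x26 pointing
+    // at the second output row.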
+ "tbz %x[flags], #1, 62f\n"
+ "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v18.4s }, [x21]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v6.4s, v6.4s, v18.4s\n"
+ "fmin v12.4s, v12.4s, v18.4s\n"
+ "fmin v13.4s, v13.4s, v18.4s\n"
+ "fmin v14.4s, v14.4s, v18.4s\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v6.4s, v6.4s, v17.4s\n"
+ "fmax v12.4s, v12.4s, v17.4s\n"
+ "fmax v13.4s, v13.4s, v17.4s\n"
+ "fmax v14.4s, v14.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
+ "62:" // Height 2: No activation
+ "cmp x14, #0x10\n"
+ "bge 71f\n"
+ "tbz x14, #3, 66f\n"
+ "st1 { v6.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "tbz x14, #2, 64f\n"
+ "st1 { v13.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "tbz x14, #1, 63f\n"
+ "str d14, [x13], #0x8\n"
+ "str d11, [x26], #0x8\n"
+ "tbz x14, #0, 70f\n"
+ "st1 { v14.s }[2], [x13]\n"
+ "st1 { v11.s }[2], [x26]\n"
+ "b 70f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 70f\n"
+ "str s14, [x13, #0x0]\n"
+ "str s11, [x26, #0x0]\n"
+ "b 70f\n"
+ "64:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 65f\n"
+ "str d13, [x13], #0x8\n"
+ "str d10, [x26], #0x8\n"
+ "tbz x14, #0, 70f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "st1 { v10.s }[2], [x26]\n"
+ "b 70f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 70f\n"
+ "str s13, [x13, #0x0]\n"
+ "str s10, [x26, #0x0]\n"
+ "b 70f\n"
+ "66:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 68f\n"
+ "st1 { v6.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "tbz x14, #1, 67f\n"
+ "str d12, [x13], #0x8\n"
+ "str d9, [x26], #0x8\n"
+ "tbz x14, #0, 70f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "st1 { v9.s }[2], [x26]\n"
+ "b 70f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 70f\n"
+ "str s12, [x13, #0x0]\n"
+ "str s9, [x26, #0x0]\n"
+ "b 70f\n"
+ "68:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 69f\n"
+ "str d6, [x13], #0x8\n"
+ "str d8, [x26], #0x8\n"
+ "tbz x14, #0, 70f\n"
+ "st1 { v6.s }[2], [x13]\n"
+ "st1 { v8.s }[2], [x26]\n"
+ "b 70f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_0
+ "str s6, [x13, #0x0]\n"
+ "str s8, [x26, #0x0]\n"
+ "70:" // Height 2: Partial direct writeback: Done
+ "b 72f\n"
+ "71:" // Height 2: Full writeback
+ "str q6, [x13, #0x0]\n"
+ "str q12, [x13, #0x10]\n"
+ "str q13, [x13, #0x20]\n"
+ "str q14, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q8, [x26, #0x0]\n"
+ "str q9, [x26, #0x10]\n"
+ "str q10, [x26, #0x20]\n"
+ "str q11, [x26, #0x30]\n"
+ "72:" // Height 2: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 38b\n"
+ "b 218f\n"
+ "73:" // Height 3
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "74:" // Height 3: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "cmp x14, #0xc\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 75f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 75f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 75f\n"
+ "mov x11, x12\n"
+ "75:" // Height 3: B setup done
+ "cbz x15, 76f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "b 88f\n"
+ "76:" // Height 3: no bias
+ "tbz %x[flags], #0, 87f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x10\n"
+ "add x26, x13, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "bge 85f\n"
+ "tbz x14, #3, 80f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v18.4s }, [x25], #0x10\n"
+ "tbz x14, #2, 78f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x26], #0x10\n"
+ "ld1 { v19.4s }, [x25], #0x10\n"
+ "tbz x14, #1, 77f\n"
+ "ldr d16, [x13], #0x8\n"
+ "ldr d15, [x26], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x25], #0x8\n"
+ "tbz x14, #0, 84f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x26]\n"
+ "ld1 { v24.s }[2], [x25]\n"
+ "b 84f\n"
+ "77:" // Height 3: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 84f\n"
+ "ldr s16, [x13, #0x0]\n"
+ "ldr s15, [x26, #0x0]\n"
+ "ldr s24, [x25, #0x0]\n"
+ "b 84f\n"
+ "78:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x14, #1, 79f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d14, [x26], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x25], #0x8\n"
+ "tbz x14, #0, 84f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x26]\n"
+ "ld1 { v19.s }[2], [x25]\n"
+ "b 84f\n"
+ "79:" // Height 3: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 84f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s14, [x26, #0x0]\n"
+ "ldr s19, [x25, #0x0]\n"
+ "b 84f\n"
+ "80:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x14, #2, 82f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x25], #0x10\n"
+ "tbz x14, #1, 81f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d13, [x26], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x25], #0x8\n"
+ "tbz x14, #0, 84f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v18.s }[2], [x25]\n"
+ "b 84f\n"
+ "81:" // Height 3: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 84f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s13, [x26, #0x0]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "b 84f\n"
+ "82:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x14, #1, 83f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d12, [x26], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "tbz x14, #0, 84f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v17.s }[2], [x25]\n"
+ "b 84f\n"
+ "83:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s12, [x26, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x25, #0x0]\n"
+ "84:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 86f\n"
+ "85:" // Height 3: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q16, [x13, #0x30]\n"
+ "ldr q12, [x26, #0x0]\n"
+ "ldr q13, [x26, #0x10]\n"
+ "ldr q14, [x26, #0x20]\n"
+ "ldr q15, [x26, #0x30]\n"
+ "ldr q17, [x25, #0x0]\n"
+ "ldr q18, [x25, #0x10]\n"
+ "ldr q19, [x25, #0x20]\n"
+ "ldr q24, [x25, #0x30]\n"
+ "86:" // Height 3: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 88f\n"
+ "87:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "88:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "89:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 90f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 91f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "b 91f\n"
+ "90:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "91:" // Height 3: input setup done
+ "cmp x27, #0x4\n"
+ "blt 94f\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ "cmp x27, #0x8\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ "blt 93f\n"
+ "92:" // Height 3: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "add x12, x12, #0x20\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ "ldr q6, [x12, #0x0]\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ "ldr q7, [x12, #0x10]\n"
+ "bge 92b\n"
+ "93:" // Height 3: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
+ "94:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 97f\n"
+ "tbz x27, #1, 95f\n"
+ "ldr d0, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "tbz x27, #0, 96f\n"
+ "ld1 { v0.s }[2], [x26]\n"
+ "ld1 { v1.s }[2], [x25]\n"
+ "ld1 { v2.s }[2], [x24]\n"
+ "b 96f\n"
+ "95:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x26, #0x0]\n"
+ "ldr s1, [x25, #0x0]\n"
+ "ldr s2, [x24, #0x0]\n"
+ "96:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e5aec50 // bfmmla v16.4s, v2.8h, v26.8h\n"
+ ".inst 0x6e59ec54 // bfmmla v20.4s, v2.8h, v25.8h\n"
+ ".inst 0x6e5aec08 // bfmmla v8.4s, v0.8h, v26.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e59ec0c // bfmmla v12.4s, v0.8h, v25.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
+ "97:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 89b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "add x26, x13, x20, LSL #2\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "add x25, x26, x20, LSL #2\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
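+    // The last row has no partner: BFCVTN zeroed the upper half of its
+    // operand, so the odd tile rows hold zeros and uzp1 alone recovers the
+    // valid results for v16-v19.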
+ "tbz %x[flags], #1, 98f\n"
+ "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v26.4s }, [x21]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v6.4s, v6.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v6.4s, v6.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
+ "98:" // Height 3: No activation
+ "cmp x14, #0x10\n"
+ "bge 107f\n"
+ "tbz x14, #3, 102f\n"
+ "st1 { v6.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v16.4s }, [x25], #0x10\n"
+ "st1 { v17.4s }, [x25], #0x10\n"
+ "tbz x14, #2, 100f\n"
+ "st1 { v13.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v18.4s }, [x25], #0x10\n"
+ "tbz x14, #1, 99f\n"
+ "str d14, [x13], #0x8\n"
+ "str d11, [x26], #0x8\n"
+ "str d19, [x25], #0x8\n"
+ "tbz x14, #0, 106f\n"
+ "st1 { v14.s }[2], [x13]\n"
+ "st1 { v11.s }[2], [x26]\n"
+ "st1 { v19.s }[2], [x25]\n"
+ "b 106f\n"
+ "99:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 106f\n"
+ "str s14, [x13, #0x0]\n"
+ "str s11, [x26, #0x0]\n"
+ "str s19, [x25, #0x0]\n"
+ "b 106f\n"
+ "100:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 101f\n"
+ "str d13, [x13], #0x8\n"
+ "str d10, [x26], #0x8\n"
+ "str d18, [x25], #0x8\n"
+ "tbz x14, #0, 106f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "st1 { v10.s }[2], [x26]\n"
+ "st1 { v18.s }[2], [x25]\n"
+ "b 106f\n"
+ "101:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 106f\n"
+ "str s13, [x13, #0x0]\n"
+ "str s10, [x26, #0x0]\n"
+ "str s18, [x25, #0x0]\n"
+ "b 106f\n"
+ "102:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 104f\n"
+ "st1 { v6.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v16.4s }, [x25], #0x10\n"
+ "tbz x14, #1, 103f\n"
+ "str d12, [x13], #0x8\n"
+ "str d9, [x26], #0x8\n"
+ "str d17, [x25], #0x8\n"
+ "tbz x14, #0, 106f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "st1 { v9.s }[2], [x26]\n"
+ "st1 { v17.s }[2], [x25]\n"
+ "b 106f\n"
+ "103:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 106f\n"
+ "str s12, [x13, #0x0]\n"
+ "str s9, [x26, #0x0]\n"
+ "str s17, [x25, #0x0]\n"
+ "b 106f\n"
+ "104:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 105f\n"
+ "str d6, [x13], #0x8\n"
+ "str d8, [x26], #0x8\n"
+ "str d16, [x25], #0x8\n"
+ "tbz x14, #0, 106f\n"
+ "st1 { v6.s }[2], [x13]\n"
+ "st1 { v8.s }[2], [x26]\n"
+ "st1 { v16.s }[2], [x25]\n"
+ "b 106f\n"
+ "105:" // Height 3: Partial direct writeback: partial_1_0
+ "str s6, [x13, #0x0]\n"
+ "str s8, [x26, #0x0]\n"
+ "str s16, [x25, #0x0]\n"
+ "106:" // Height 3: Partial direct writeback: Done
+ "b 108f\n"
+ "107:" // Height 3: Full writeback
+ "str q6, [x13, #0x0]\n"
+ "str q12, [x13, #0x10]\n"
+ "str q13, [x13, #0x20]\n"
+ "str q14, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q8, [x26, #0x0]\n"
+ "str q9, [x26, #0x10]\n"
+ "str q10, [x26, #0x20]\n"
+ "str q11, [x26, #0x30]\n"
+ "str q16, [x25, #0x0]\n"
+ "str q17, [x25, #0x10]\n"
+ "str q18, [x25, #0x20]\n"
+ "str q19, [x25, #0x30]\n"
+ "108:" // Height 3: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 74b\n"
+ "b 218f\n"
+ "109:" // Height 4
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "110:" // Height 4: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "cmp x14, #0xc\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 111f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 111f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 111f\n"
+ "mov x11, x12\n"
+ "111:" // Height 4: B setup done
+ "cbz x15, 112f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "b 124f\n"
+ "112:" // Height 4: no bias
+ "tbz %x[flags], #0, 123f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x10\n"
+ "add x26, x13, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "bge 121f\n"
+ "tbz x14, #3, 116f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v18.4s }, [x25], #0x10\n"
+ "ld1 { v21.4s }, [x24], #0x10\n"
+ "tbz x14, #2, 114f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x26], #0x10\n"
+ "ld1 { v19.4s }, [x25], #0x10\n"
+ "ld1 { v22.4s }, [x24], #0x10\n"
+ "tbz x14, #1, 113f\n"
+ "ldr d16, [x13], #0x8\n"
+ "ldr d15, [x26], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x25], #0x8\n"
+ "ldr d23, [x24], #0x8\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x26]\n"
+ "ld1 { v24.s }[2], [x25]\n"
+ "ld1 { v23.s }[2], [x24]\n"
+ "b 120f\n"
+ "113:" // Height 4: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 120f\n"
+ "ldr s16, [x13, #0x0]\n"
+ "ldr s15, [x26, #0x0]\n"
+ "ldr s24, [x25, #0x0]\n"
+ "ldr s23, [x24, #0x0]\n"
+ "b 120f\n"
+ "114:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x14, #1, 115f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d14, [x26], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x25], #0x8\n"
+ "ldr d22, [x24], #0x8\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x26]\n"
+ "ld1 { v19.s }[2], [x25]\n"
+ "ld1 { v22.s }[2], [x24]\n"
+ "b 120f\n"
+ "115:" // Height 4: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 120f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s14, [x26, #0x0]\n"
+ "ldr s19, [x25, #0x0]\n"
+ "ldr s22, [x24, #0x0]\n"
+ "b 120f\n"
+ "116:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x14, #2, 118f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v20.4s }, [x24], #0x10\n"
+ "tbz x14, #1, 117f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d13, [x26], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v18.s }[2], [x25]\n"
+ "ld1 { v21.s }[2], [x24]\n"
+ "b 120f\n"
+ "117:" // Height 4: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 120f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s13, [x26, #0x0]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "ldr s21, [x24, #0x0]\n"
+ "b 120f\n"
+ "118:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x14, #1, 119f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d12, [x26], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d20, [x24], #0x8\n"
+ "tbz x14, #0, 120f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v17.s }[2], [x25]\n"
+ "ld1 { v20.s }[2], [x24]\n"
+ "b 120f\n"
+ "119:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s12, [x26, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x25, #0x0]\n"
+ "ldr s20, [x24, #0x0]\n"
+ "120:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 122f\n"
+ "121:" // Height 4: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q16, [x13, #0x30]\n"
+ "ldr q12, [x26, #0x0]\n"
+ "ldr q13, [x26, #0x10]\n"
+ "ldr q14, [x26, #0x20]\n"
+ "ldr q15, [x26, #0x30]\n"
+ "ldr q17, [x25, #0x0]\n"
+ "ldr q18, [x25, #0x10]\n"
+ "ldr q19, [x25, #0x20]\n"
+ "ldr q24, [x25, #0x30]\n"
+ "ldr q20, [x24, #0x0]\n"
+ "ldr q21, [x24, #0x10]\n"
+ "ldr q22, [x24, #0x20]\n"
+ "ldr q23, [x24, #0x30]\n"
+ "122:" // Height 4: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 124f\n"
+ "123:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "124:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "125:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 126f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 127f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "b 127f\n"
+ "126:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "127:" // Height 4: input setup done
+ "cmp x27, #0x4\n"
+ "blt 130f\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ "cmp x27, #0x8\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ "blt 129f\n"
+ "128:" // Height 4: Multiply loop: Main loop head
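+    // Four A rows are packed two per register (rows 0/1 into v0.8h, rows 2/3
+    // into v2.8h), so each B panel's even/odd pair feeds four BFMMLA ops,
+    // two per row pair.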
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "add x12, x12, #0x20\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ "ldr q6, [x12, #0x0]\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ "ldr q7, [x12, #0x10]\n"
+ "bge 128b\n"
+ "129:" // Height 4: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
+ "130:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 133f\n"
+ "tbz x27, #1, 131f\n"
+ "ldr d0, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "ldr d3, [x23], #0x8\n"
+ "tbz x27, #0, 132f\n"
+ "ld1 { v0.s }[2], [x26]\n"
+ "ld1 { v1.s }[2], [x25]\n"
+ "ld1 { v2.s }[2], [x24]\n"
+ "ld1 { v3.s }[2], [x23]\n"
+ "b 132f\n"
+ "131:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x26, #0x0]\n"
+ "ldr s1, [x25, #0x0]\n"
+ "ldr s2, [x24, #0x0]\n"
+ "ldr s3, [x23, #0x0]\n"
+ "132:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q25, [x12, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e5aec08 // bfmmla v8.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec50 // bfmmla v16.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x11, #0x0]\n"
+ ".inst 0x6e59ec0c // bfmmla v12.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec54 // bfmmla v20.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x0]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x9, #0x0]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
+ "133:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 125b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "add x26, x13, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "tbz %x[flags], #1, 134f\n"
+ "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v26.4s }, [x21]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v6.4s, v6.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v6.4s, v6.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v15.4s, v15.4s, v25.4s\n"
+ "fmax v20.4s, v20.4s, v25.4s\n"
+ "fmax v21.4s, v21.4s, v25.4s\n"
+ "fmax v22.4s, v22.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
+ "134:" // Height 4: No activation
+ "cmp x14, #0x10\n"
+ "bge 143f\n"
+ "tbz x14, #3, 138f\n"
+ "st1 { v6.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v15.4s }, [x25], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "tbz x14, #2, 136f\n"
+ "st1 { v13.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "tbz x14, #1, 135f\n"
+ "str d14, [x13], #0x8\n"
+ "str d11, [x26], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "tbz x14, #0, 142f\n"
+ "st1 { v14.s }[2], [x13]\n"
+ "st1 { v11.s }[2], [x26]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "b 142f\n"
+ "135:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 142f\n"
+ "str s14, [x13, #0x0]\n"
+ "str s11, [x26, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "b 142f\n"
+ "136:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 137f\n"
+ "str d13, [x13], #0x8\n"
+ "str d10, [x26], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "tbz x14, #0, 142f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "st1 { v10.s }[2], [x26]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "b 142f\n"
+ "137:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 142f\n"
+ "str s13, [x13, #0x0]\n"
+ "str s10, [x26, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "b 142f\n"
+ "138:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 140f\n"
+ "st1 { v6.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v15.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "tbz x14, #1, 139f\n"
+ "str d12, [x13], #0x8\n"
+ "str d9, [x26], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "tbz x14, #0, 142f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "st1 { v9.s }[2], [x26]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "b 142f\n"
+ "139:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 142f\n"
+ "str s12, [x13, #0x0]\n"
+ "str s9, [x26, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "b 142f\n"
+ "140:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 141f\n"
+ "str d6, [x13], #0x8\n"
+ "str d8, [x26], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "tbz x14, #0, 142f\n"
+ "st1 { v6.s }[2], [x13]\n"
+ "st1 { v8.s }[2], [x26]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "b 142f\n"
+ "141:" // Height 4: Partial direct writeback: partial_1_0
+ "str s6, [x13, #0x0]\n"
+ "str s8, [x26, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "142:" // Height 4: Partial direct writeback: Done
+ "b 144f\n"
+ "143:" // Height 4: Full writeback
+ "str q6, [x13, #0x0]\n"
+ "str q12, [x13, #0x10]\n"
+ "str q13, [x13, #0x20]\n"
+ "str q14, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q8, [x26, #0x0]\n"
+ "str q9, [x26, #0x10]\n"
+ "str q10, [x26, #0x20]\n"
+ "str q11, [x26, #0x30]\n"
+ "str q15, [x25, #0x0]\n"
+ "str q20, [x25, #0x10]\n"
+ "str q21, [x25, #0x20]\n"
+ "str q22, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "144:" // Height 4: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 110b\n"
+ "b 218f\n"
+ "145:" // Height 5
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "146:" // Height 5: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "cmp x14, #0xc\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 147f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 147f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 147f\n"
+ "mov x11, x12\n"
+ "147:" // Height 5: B setup done
+ "cbz x15, 148f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v28.16b, v12.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v29.16b, v13.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v30.16b, v14.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v15.16b\n"
+ "b 160f\n"
+ "148:" // Height 5: no bias
+ "tbz %x[flags], #0, 159f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x10\n"
+ "add x26, x13, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "bge 157f\n"
+ "tbz x14, #3, 152f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v18.4s }, [x25], #0x10\n"
+ "ld1 { v21.4s }, [x24], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 150f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x26], #0x10\n"
+ "ld1 { v19.4s }, [x25], #0x10\n"
+ "ld1 { v22.4s }, [x24], #0x10\n"
+ "ld1 { v27.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 149f\n"
+ "ldr d16, [x13], #0x8\n"
+ "ldr d15, [x26], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x25], #0x8\n"
+ "ldr d23, [x24], #0x8\n"
+ "ldr d6, [x23], #0x8\n"
+ "tbz x14, #0, 156f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x26]\n"
+ "ld1 { v24.s }[2], [x25]\n"
+ "ld1 { v23.s }[2], [x24]\n"
+ "ld1 { v6.s }[2], [x23]\n"
+ "b 156f\n"
+ "149:" // Height 5: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 156f\n"
+ "ldr s16, [x13, #0x0]\n"
+ "ldr s15, [x26, #0x0]\n"
+ "ldr s24, [x25, #0x0]\n"
+ "ldr s23, [x24, #0x0]\n"
+ "ldr s6, [x23, #0x0]\n"
+ "b 156f\n"
+ "150:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x14, #1, 151f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d14, [x26], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x25], #0x8\n"
+ "ldr d22, [x24], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "tbz x14, #0, 156f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x26]\n"
+ "ld1 { v19.s }[2], [x25]\n"
+ "ld1 { v22.s }[2], [x24]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "b 156f\n"
+ "151:" // Height 5: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 156f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s14, [x26, #0x0]\n"
+ "ldr s19, [x25, #0x0]\n"
+ "ldr s22, [x24, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "b 156f\n"
+ "152:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x14, #2, 154f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 153f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d13, [x26], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "tbz x14, #0, 156f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v18.s }[2], [x25]\n"
+ "ld1 { v21.s }[2], [x24]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "b 156f\n"
+ "153:" // Height 5: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 156f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s13, [x26, #0x0]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "ldr s21, [x24, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "b 156f\n"
+ "154:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x14, #1, 155f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d12, [x26], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d20, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "tbz x14, #0, 156f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v17.s }[2], [x25]\n"
+ "ld1 { v20.s }[2], [x24]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "b 156f\n"
+ "155:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s12, [x26, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x25, #0x0]\n"
+ "ldr s20, [x24, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "156:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 158f\n"
+ "157:" // Height 5: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q16, [x13, #0x30]\n"
+ "ldr q12, [x26, #0x0]\n"
+ "ldr q13, [x26, #0x10]\n"
+ "ldr q14, [x26, #0x20]\n"
+ "ldr q15, [x26, #0x30]\n"
+ "ldr q17, [x25, #0x0]\n"
+ "ldr q18, [x25, #0x10]\n"
+ "ldr q19, [x25, #0x20]\n"
+ "ldr q24, [x25, #0x30]\n"
+ "ldr q20, [x24, #0x0]\n"
+ "ldr q21, [x24, #0x10]\n"
+ "ldr q22, [x24, #0x20]\n"
+ "ldr q23, [x24, #0x30]\n"
+ "ldr q25, [x23, #0x0]\n"
+ "ldr q26, [x23, #0x10]\n"
+ "ldr q27, [x23, #0x20]\n"
+ "ldr q6, [x23, #0x30]\n"
+ "158:" // Height 5: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 160f\n"
+ "159:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "160:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "161:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 162f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 163f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "b 163f\n"
+ "162:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "163:" // Height 5: input setup done
+ "cmp x27, #0x4\n"
+ "blt 166f\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ "cmp x27, #0x8\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
+ "ld1 { v4.4s }, [x22], #0x10\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ "blt 165f\n"
+ "164:" // Height 5: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q5, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec9d // bfmmla v29.4s, v4.8h, v5.8h\n"
+ "ldr q5, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x9, #0x0]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec9e // bfmmla v30.4s, v4.8h, v5.8h\n"
+ "ldr q5, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x12, #0x0]\n"
+ ".inst 0x6e45ec0f // bfmmla v15.4s, v0.8h, v5.8h\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ ".inst 0x6e45ec9f // bfmmla v31.4s, v4.8h, v5.8h\n"
+ "ld1 { v4.4s }, [x22], #0x10\n"
+ "ldr q7, [x12, #0x10]\n"
+ "bge 164b\n"
+ "165:" // Height 5: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q3, [x11, #0x0]\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q1, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x0]\n"
+ ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x9, #0x0]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n"
+ "166:" // Height 5: Multiply loop: Main loop skip
+ "cbz x27, 169f\n"
+ "tbz x27, #1, 167f\n"
+ "ldr d0, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "tbz x27, #0, 168f\n"
+ "ld1 { v0.s }[2], [x26]\n"
+ "ld1 { v1.s }[2], [x25]\n"
+ "ld1 { v2.s }[2], [x24]\n"
+ "ld1 { v3.s }[2], [x23]\n"
+ "ld1 { v4.s }[2], [x22]\n"
+ "b 168f\n"
+ "167:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x26, #0x0]\n"
+ "ldr s1, [x25, #0x0]\n"
+ "ldr s2, [x24, #0x0]\n"
+ "ldr s3, [x23, #0x0]\n"
+ "ldr s4, [x22, #0x0]\n"
+ "168:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec9c // bfmmla v28.4s, v4.8h, v5.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q3, [x11, #0x0]\n"
+ ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
+ "ldr q1, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x0]\n"
+ ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x9, #0x0]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n"
+ "169:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 161b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "add x26, x13, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "tbz %x[flags], #1, 170f\n"
+ "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x21]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v6.4s, v6.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmax v6.4s, v6.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "170:" // Height 5: No activation
+ "cmp x14, #0x10\n"
+ "bge 179f\n"
+ "tbz x14, #3, 174f\n"
+ "st1 { v6.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v15.4s }, [x25], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 172f\n"
+ "st1 { v13.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 171f\n"
+ "str d14, [x13], #0x8\n"
+ "str d11, [x26], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "tbz x14, #0, 178f\n"
+ "st1 { v14.s }[2], [x13]\n"
+ "st1 { v11.s }[2], [x26]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "b 178f\n"
+ "171:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 178f\n"
+ "str s14, [x13, #0x0]\n"
+ "str s11, [x26, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "b 178f\n"
+ "172:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 173f\n"
+ "str d13, [x13], #0x8\n"
+ "str d10, [x26], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "tbz x14, #0, 178f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "st1 { v10.s }[2], [x26]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "b 178f\n"
+ "173:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 178f\n"
+ "str s13, [x13, #0x0]\n"
+ "str s10, [x26, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "b 178f\n"
+ "174:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 176f\n"
+ "st1 { v6.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v15.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 175f\n"
+ "str d12, [x13], #0x8\n"
+ "str d9, [x26], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "tbz x14, #0, 178f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "st1 { v9.s }[2], [x26]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "b 178f\n"
+ "175:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 178f\n"
+ "str s12, [x13, #0x0]\n"
+ "str s9, [x26, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "b 178f\n"
+ "176:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 177f\n"
+ "str d6, [x13], #0x8\n"
+ "str d8, [x26], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x14, #0, 178f\n"
+ "st1 { v6.s }[2], [x13]\n"
+ "st1 { v8.s }[2], [x26]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "b 178f\n"
+ "177:" // Height 5: Partial direct writeback: partial_1_0
+ "str s6, [x13, #0x0]\n"
+ "str s8, [x26, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "178:" // Height 5: Partial direct writeback: Done
+ "b 180f\n"
+ "179:" // Height 5: Full writeback
+ "str q6, [x13, #0x0]\n"
+ "str q12, [x13, #0x10]\n"
+ "str q13, [x13, #0x20]\n"
+ "str q14, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q8, [x26, #0x0]\n"
+ "str q9, [x26, #0x10]\n"
+ "str q10, [x26, #0x20]\n"
+ "str q11, [x26, #0x30]\n"
+ "str q15, [x25, #0x0]\n"
+ "str q20, [x25, #0x10]\n"
+ "str q21, [x25, #0x20]\n"
+ "str q22, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "180:" // Height 5: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 146b\n"
+ "b 218f\n"
+ "181:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_bias]]\n"
+ "mov x21, #0x18\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "madd x21, x20, x21, x13\n"
+ "str x21, [%x[args_ptr], %[offsetof_output_ptr]]\n"
+ "182:" // Height 6: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "cmp x14, #0xc\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 183f\n"
+ "cmp x14, #0x8\n"
+ "mov x9, x12\n"
+ "bgt 183f\n"
+ "cmp x14, #0x4\n"
+ "mov x10, x12\n"
+ "bgt 183f\n"
+ "mov x11, x12\n"
+ "183:" // Height 6: B setup done
+ "cbz x15, 184f\n"
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v28.16b, v12.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v29.16b, v13.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v30.16b, v14.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v15.16b\n"
+ "b 196f\n"
+ "184:" // Height 6: no bias
+ "tbz %x[flags], #0, 195f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x10\n"
+ "add x26, x13, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "bge 193f\n"
+ "tbz x14, #3, 188f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v18.4s }, [x25], #0x10\n"
+ "ld1 { v21.4s }, [x24], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v29.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 186f\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x26], #0x10\n"
+ "ld1 { v19.4s }, [x25], #0x10\n"
+ "ld1 { v22.4s }, [x24], #0x10\n"
+ "ld1 { v27.4s }, [x23], #0x10\n"
+ "ld1 { v30.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 185f\n"
+ "ldr d16, [x13], #0x8\n"
+ "ldr d15, [x26], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x25], #0x8\n"
+ "ldr d23, [x24], #0x8\n"
+ "ldr d6, [x23], #0x8\n"
+ "ldr d31, [x22], #0x8\n"
+ "tbz x14, #0, 192f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x26]\n"
+ "ld1 { v24.s }[2], [x25]\n"
+ "ld1 { v23.s }[2], [x24]\n"
+ "ld1 { v6.s }[2], [x23]\n"
+ "ld1 { v31.s }[2], [x22]\n"
+ "b 192f\n"
+ "185:" // Height 6: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x14, #0, 192f\n"
+ "ldr s16, [x13, #0x0]\n"
+ "ldr s15, [x26, #0x0]\n"
+ "ldr s24, [x25, #0x0]\n"
+ "ldr s23, [x24, #0x0]\n"
+ "ldr s6, [x23, #0x0]\n"
+ "ldr s31, [x22, #0x0]\n"
+ "b 192f\n"
+ "186:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x14, #1, 187f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d14, [x26], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x25], #0x8\n"
+ "ldr d22, [x24], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d30, [x22], #0x8\n"
+ "tbz x14, #0, 192f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x26]\n"
+ "ld1 { v19.s }[2], [x25]\n"
+ "ld1 { v22.s }[2], [x24]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x22]\n"
+ "b 192f\n"
+ "187:" // Height 6: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x14, #0, 192f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s14, [x26, #0x0]\n"
+ "ldr s19, [x25, #0x0]\n"
+ "ldr s22, [x24, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "ldr s30, [x22, #0x0]\n"
+ "b 192f\n"
+ "188:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x14, #2, 190f\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x25], #0x10\n"
+ "ld1 { v20.4s }, [x24], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 189f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d13, [x26], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "ldr d29, [x22], #0x8\n"
+ "tbz x14, #0, 192f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v18.s }[2], [x25]\n"
+ "ld1 { v21.s }[2], [x24]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v29.s }[2], [x22]\n"
+ "b 192f\n"
+ "189:" // Height 6: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x14, #0, 192f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s13, [x26, #0x0]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "ldr s21, [x24, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "ldr s29, [x22, #0x0]\n"
+ "b 192f\n"
+ "190:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x14, #1, 191f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d12, [x26], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d20, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d28, [x22], #0x8\n"
+ "tbz x14, #0, 192f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v17.s }[2], [x25]\n"
+ "ld1 { v20.s }[2], [x24]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v28.s }[2], [x22]\n"
+ "b 192f\n"
+ "191:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s12, [x26, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x25, #0x0]\n"
+ "ldr s20, [x24, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "ldr s28, [x22, #0x0]\n"
+ "192:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x20\n"
+ "b 194f\n"
+ "193:" // Height 6: full accumulate
+ "ldr q9, [x13, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "ldr q11, [x13, #0x20]\n"
+ "ldr q16, [x13, #0x30]\n"
+ "ldr q12, [x26, #0x0]\n"
+ "ldr q13, [x26, #0x10]\n"
+ "ldr q14, [x26, #0x20]\n"
+ "ldr q15, [x26, #0x30]\n"
+ "ldr q17, [x25, #0x0]\n"
+ "ldr q18, [x25, #0x10]\n"
+ "ldr q19, [x25, #0x20]\n"
+ "ldr q24, [x25, #0x30]\n"
+ "ldr q20, [x24, #0x0]\n"
+ "ldr q21, [x24, #0x10]\n"
+ "ldr q22, [x24, #0x20]\n"
+ "ldr q23, [x24, #0x30]\n"
+ "ldr q25, [x23, #0x0]\n"
+ "ldr q26, [x23, #0x10]\n"
+ "ldr q27, [x23, #0x20]\n"
+ "ldr q6, [x23, #0x30]\n"
+ "ldr q28, [x22, #0x0]\n"
+ "ldr q29, [x22, #0x10]\n"
+ "ldr q30, [x22, #0x20]\n"
+ "ldr q31, [x22, #0x30]\n"
+ "194:" // Height 6: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 196f\n"
+ "195:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "196:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "197:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 198f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 199f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
+ "b 199f\n"
+ "198:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
+ "199:" // Height 6: input setup done
+ "cmp x27, #0x4\n"
+ "blt 202f\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ "cmp x27, #0x8\n"
+ "ld1 { v4.4s }, [x22], #0x10\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
+ "ld1 { v5.4s }, [x21], #0x10\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ "blt 201f\n"
+ "200:" // Height 6: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
+ ".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n"
+ "ld1 { v5.4s }, [x21], #0x10\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x9, #0x0]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x12, #0x0]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ "ld1 { v4.4s }, [x22], #0x10\n"
+ "ldr q7, [x12, #0x10]\n"
+ "bge 200b\n"
+ "201:" // Height 6: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q3, [x11, #0x0]\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ "ldr q1, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x0]\n"
+ ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x9, #0x0]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n"
+ "202:" // Height 6: Multiply loop: Main loop skip
+ "cbz x27, 205f\n"
+ "tbz x27, #1, 203f\n"
+ "ldr d0, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d5, [x21], #0x8\n"
+ "tbz x27, #0, 204f\n"
+ "ld1 { v0.s }[2], [x26]\n"
+ "ld1 { v1.s }[2], [x25]\n"
+ "ld1 { v2.s }[2], [x24]\n"
+ "ld1 { v3.s }[2], [x23]\n"
+ "ld1 { v4.s }[2], [x22]\n"
+ "ld1 { v5.s }[2], [x21]\n"
+ "b 204f\n"
+ "203:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x26, #0x0]\n"
+ "ldr s1, [x25, #0x0]\n"
+ "ldr s2, [x24, #0x0]\n"
+ "ldr s3, [x23, #0x0]\n"
+ "ldr s4, [x22, #0x0]\n"
+ "ldr s5, [x21, #0x0]\n"
+ "204:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q7, [x12, #0x0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ "add x12, x12, #0x20\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q3, [x11, #0x0]\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
+ "ldr q1, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x0]\n"
+ ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add x10, x10, #0x20\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x9, #0x0]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x9, #0x10]\n"
+ "add x9, x9, #0x20\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n"
+ "205:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 197b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "add x26, x13, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "tbz %x[flags], #1, 206f\n"
+ "add x21, %x[args_ptr], %[offset_max]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x21]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v6.4s, v6.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmax v6.4s, v6.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v0.4s\n"
+ "fmax v29.4s, v29.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "206:" // Height 6: No activation
+ "cmp x14, #0x10\n"
+ "bge 215f\n"
+ "tbz x14, #3, 210f\n"
+ "st1 { v6.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v15.4s }, [x25], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v23.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 208f\n"
+ "st1 { v13.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v29.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 207f\n"
+ "str d14, [x13], #0x8\n"
+ "str d11, [x26], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d30, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "tbz x14, #0, 214f\n"
+ "st1 { v14.s }[2], [x13]\n"
+ "st1 { v11.s }[2], [x26]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v30.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "b 214f\n"
+ "207:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 214f\n"
+ "str s14, [x13, #0x0]\n"
+ "str s11, [x26, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s30, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "b 214f\n"
+ "208:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 209f\n"
+ "str d13, [x13], #0x8\n"
+ "str d10, [x26], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d29, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "tbz x14, #0, 214f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "st1 { v10.s }[2], [x26]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v29.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "b 214f\n"
+ "209:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 214f\n"
+ "str s13, [x13, #0x0]\n"
+ "str s10, [x26, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s29, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "b 214f\n"
+ "210:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 212f\n"
+ "st1 { v6.4s }, [x13], #0x10\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v15.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v23.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 211f\n"
+ "str d12, [x13], #0x8\n"
+ "str d9, [x26], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d28, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "tbz x14, #0, 214f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "st1 { v9.s }[2], [x26]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "b 214f\n"
+ "211:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 214f\n"
+ "str s12, [x13, #0x0]\n"
+ "str s9, [x26, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "b 214f\n"
+ "212:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 213f\n"
+ "str d6, [x13], #0x8\n"
+ "str d8, [x26], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x14, #0, 214f\n"
+ "st1 { v6.s }[2], [x13]\n"
+ "st1 { v8.s }[2], [x26]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "b 214f\n"
+ "213:" // Height 6: Partial direct writeback: partial_1_0
+ "str s6, [x13, #0x0]\n"
+ "str s8, [x26, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "214:" // Height 6: Partial direct writeback: Done
+ "b 216f\n"
+ "215:" // Height 6: Full writeback
+ "str q6, [x13, #0x0]\n"
+ "str q12, [x13, #0x10]\n"
+ "str q13, [x13, #0x20]\n"
+ "str q14, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "str q8, [x26, #0x0]\n"
+ "str q9, [x26, #0x10]\n"
+ "str q10, [x26, #0x20]\n"
+ "str q11, [x26, #0x30]\n"
+ "str q15, [x25, #0x0]\n"
+ "str q20, [x25, #0x10]\n"
+ "str q21, [x25, #0x20]\n"
+ "str q22, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q23, [x23, #0x0]\n"
+ "str q28, [x23, #0x10]\n"
+ "str q29, [x23, #0x20]\n"
+ "str q30, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "216:" // Height 6: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 182b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 218f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 217f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "217:" // Update direct input
+ "mov x20, #0x18\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "218:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
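The epilogue sequences above follow from the BFMMLA tile layout: each 128-bit accumulator holds a 2x2 FP32 tile spanning two interleaved output rows, so writeback splits accumulators into per-row vectors with uzp1/uzp2 on 64-bit lanes, and the accumulate path ("MMLA fixup") re-interleaves freshly loaded C rows with zip1/zip2. A minimal scalar sketch of one BFMMLA step, for illustration only (the helper names are assumptions, not identifiers from this patch):

#include <cstdint>
#include <cstring>

// Widen raw bf16 bits to fp32: bf16 is the top half of an IEEE-754 float.
static float bf16_to_f32(uint16_t h) {
    uint32_t u = uint32_t(h) << 16;
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}

// Scalar model of BFMMLA Vd.4S, Vn.8H, Vm.8H:
//   c: 2x2 fp32 tile, row-major in lanes [2*i + j]
//   a: 2x4 bf16 matrix, row-major    (a[4*i + k] = A[i][k])
//   b: 4x2 bf16 matrix, column-major (b[4*j + k] = B[k][j])
static void bfmmla_ref(float c[4], const uint16_t a[8], const uint16_t b[8]) {
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
            for (int k = 0; k < 4; ++k)
                c[2 * i + j] += bf16_to_f32(a[4 * i + k]) * bf16_to_f32(b[4 * j + k]);
}

Because rows i and i+1 share one accumulator in this layout, uzp1 on 64-bit lanes recovers the even output row and uzp2 the odd row before the fmin/fmax clamp and the store sequences.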
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp
new file mode 100644
index 0000000000..745f89eff6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, size_t, \
+ float *, int, size_t, int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_ffinterleaved_bf16fp32_dot_8x12( ARGLIST );
+
+class cls_a64_ffinterleaved_bf16fp32_dot_8x12
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int out_width()
+ {
+ return 12;
+ }
+ static unsigned int stripe_width()
+ {
+ return 4;
+ }
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL128_BL32;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+
+ StdTransformsFixed<operand_type, result_type, 8, 12, 2> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 2, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 22.16, 8.25, 3.26 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_ffinterleaved_bf16fp32_dot_8x12;
+ cls_a64_ffinterleaved_bf16fp32_dot_8x12(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
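The blocking parameters declared above drive the outer GEMM driver. A small sketch of the resulting tile arithmetic, assuming M, N and K are the problem dimensions (the helper below is illustrative, not code from this patch):

#include <cstddef>

static inline std::size_t round_up(std::size_t v, std::size_t unit) {
    return (v + unit - 1) / unit * unit;
}

struct TileCounts {
    std::size_t row_tiles;  // kernel calls down the M dimension
    std::size_t col_tiles;  // B panels across the N dimension
    std::size_t k_depth;    // padded accumulation depth
};

static TileCounts tile_counts(std::size_t M, std::size_t N, std::size_t K) {
    return {
        round_up(M, 8) / 8,    // out_height(): 8 rows per call
        round_up(N, 12) / 12,  // out_width(): 12 columns, in stripe_width() == 4 stripes
        round_up(K, 2),        // k_unroll(): bfdot consumes bf16 operands in pairs
    };
}

KernelWeightFormat::VL128_BL32 likewise indicates B is expected pre-packed for 128-bit vector loads with 32-bit (bf16-pair) blocking, which is the layout the q-register loads from the x23/x22/x21 column panels consume in the generic.cpp that follows.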
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp
new file mode 100644
index 0000000000..5f4fcac690
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include "../../bfloat.hpp"
+
+namespace arm_gemm {
+
+void a64_ffinterleaved_bf16fp32_dot_8x12(
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ size_t B_stride,
+ float *Cpanel,
+ int ablocks,
+ size_t N,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ size_t N = {};
+ size_t B_stride = {};
+ const bfloat16 *cur_B_ptr = {};
+ } ka;
+
+ ka.K = (K/2) - 1;
+ ka.Bpanel = Bpanel;
+ ka.N = N;
+ ka.B_stride = B_stride;
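+ // K is pre-counted in k_unroll()=2 element pairs (one BFDOT step each),
+ // with one pair reserved for the tail issued after the main loop.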
+
+ __asm__ __volatile__(
+ "1:" // Height loop
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x25, #0x8\n"
+ "mov %x[Apanel], x24\n"
+ "bgt 3f\n"
+ "cmp x25, #0x4\n"
+ "mov x21, x23\n"
+ "bgt 3f\n"
+ "mov x22, x23\n"
+ "3:" // B setup done
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v8.16b, #0x0\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "movi v9.16b, #0x0\n"
+ "ldr q6, [x21, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 5f\n"
+ "4:" // main loop head
+ "ldr q3, [%x[Apanel], #0x20]\n"
+ "ldr q7, [%x[Apanel], #0x30]\n"
+ ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f08b // bfdot v11.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x4f60f891 // bfdot v17.4s, v4.8h, v0.h[3]\n"
+ ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x4f61f097 // bfdot v23.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89a // bfdot v26.4s, v4.8h, v1.h[2]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x4f61f89d // bfdot v29.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [x23, #0x10]\n"
+ ".inst 0x4f40f0a9 // bfdot v9.4s, v5.8h, v0.h[0]\n"
+ ".inst 0x4f60f0ac // bfdot v12.4s, v5.8h, v0.h[1]\n"
+ ".inst 0x4f40f8af // bfdot v15.4s, v5.8h, v0.h[2]\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0x4f60f8b2 // bfdot v18.4s, v5.8h, v0.h[3]\n"
+ ".inst 0x4f41f0b5 // bfdot v21.4s, v5.8h, v1.h[0]\n"
+ ".inst 0x4f61f0b8 // bfdot v24.4s, v5.8h, v1.h[1]\n"
+ ".inst 0x4f41f8bb // bfdot v27.4s, v5.8h, v1.h[2]\n"
+ ".inst 0x4f61f8be // bfdot v30.4s, v5.8h, v1.h[3]\n"
+ "ldr q5, [x22, #0x10]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f60f0cd // bfdot v13.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f40f8d0 // bfdot v16.4s, v6.8h, v0.h[2]\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ ".inst 0x4f41f0d6 // bfdot v22.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f61f0d9 // bfdot v25.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "add x21, x21, #0x20\n"
+ ".inst 0x4f43f088 // bfdot v8.4s, v4.8h, v3.h[0]\n"
+ ".inst 0x4f63f08b // bfdot v11.4s, v4.8h, v3.h[1]\n"
+ ".inst 0x4f43f88e // bfdot v14.4s, v4.8h, v3.h[2]\n"
+ ".inst 0x4f63f891 // bfdot v17.4s, v4.8h, v3.h[3]\n"
+ ".inst 0x4f47f094 // bfdot v20.4s, v4.8h, v7.h[0]\n"
+ ".inst 0x4f67f097 // bfdot v23.4s, v4.8h, v7.h[1]\n"
+ ".inst 0x4f47f89a // bfdot v26.4s, v4.8h, v7.h[2]\n"
+ ".inst 0x4f67f89d // bfdot v29.4s, v4.8h, v7.h[3]\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x4f43f0a9 // bfdot v9.4s, v5.8h, v3.h[0]\n"
+ ".inst 0x4f63f0ac // bfdot v12.4s, v5.8h, v3.h[1]\n"
+ ".inst 0x4f43f8af // bfdot v15.4s, v5.8h, v3.h[2]\n"
+ ".inst 0x4f63f8b2 // bfdot v18.4s, v5.8h, v3.h[3]\n"
+ ".inst 0x4f47f0b5 // bfdot v21.4s, v5.8h, v7.h[0]\n"
+ ".inst 0x4f67f0b8 // bfdot v24.4s, v5.8h, v7.h[1]\n"
+ ".inst 0x4f47f8bb // bfdot v27.4s, v5.8h, v7.h[2]\n"
+ ".inst 0x4f67f8be // bfdot v30.4s, v5.8h, v7.h[3]\n"
+ "ldr q5, [x22, #0x0]\n"
+ ".inst 0x4f43f04a // bfdot v10.4s, v2.8h, v3.h[0]\n"
+ ".inst 0x4f63f04d // bfdot v13.4s, v2.8h, v3.h[1]\n"
+ ".inst 0x4f43f850 // bfdot v16.4s, v2.8h, v3.h[2]\n"
+ ".inst 0x4f63f853 // bfdot v19.4s, v2.8h, v3.h[3]\n"
+ ".inst 0x4f47f056 // bfdot v22.4s, v2.8h, v7.h[0]\n"
+ ".inst 0x4f67f059 // bfdot v25.4s, v2.8h, v7.h[1]\n"
+ ".inst 0x4f47f85c // bfdot v28.4s, v2.8h, v7.h[2]\n"
+ ".inst 0x4f67f85f // bfdot v31.4s, v2.8h, v7.h[3]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "bge 4b\n"
+ "5:" // main loop skip
+ ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f08b // bfdot v11.4s, v4.8h, v0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ ".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f891 // bfdot v17.4s, v4.8h, v0.h[3]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f097 // bfdot v23.4s, v4.8h, v1.h[1]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f41f89a // bfdot v26.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89d // bfdot v29.4s, v4.8h, v1.h[3]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4f40f0a9 // bfdot v9.4s, v5.8h, v0.h[0]\n"
+ ".inst 0x4f60f0ac // bfdot v12.4s, v5.8h, v0.h[1]\n"
+ ".inst 0x4f40f8af // bfdot v15.4s, v5.8h, v0.h[2]\n"
+ ".inst 0x4f60f8b2 // bfdot v18.4s, v5.8h, v0.h[3]\n"
+ ".inst 0x4f41f0b5 // bfdot v21.4s, v5.8h, v1.h[0]\n"
+ ".inst 0x4f61f0b8 // bfdot v24.4s, v5.8h, v1.h[1]\n"
+ ".inst 0x4f41f8bb // bfdot v27.4s, v5.8h, v1.h[2]\n"
+ ".inst 0x4f61f8be // bfdot v30.4s, v5.8h, v1.h[3]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f60f0cd // bfdot v13.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f40f8d0 // bfdot v16.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f41f0d6 // bfdot v22.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f61f0d9 // bfdot v25.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
+ "cbz x20, 6f\n"
+ "ldr q4, [%x[Apanel], #0x0]\n"
+ "ldr q3, [%x[Apanel], #0x10]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ ".inst 0x4f44f048 // bfdot v8.4s, v2.8h, v4.h[0]\n"
+ "ldr q0, [x21, #0x0]\n"
+ ".inst 0x4f64f04b // bfdot v11.4s, v2.8h, v4.h[1]\n"
+ ".inst 0x4f44f84e // bfdot v14.4s, v2.8h, v4.h[2]\n"
+ ".inst 0x4f64f851 // bfdot v17.4s, v2.8h, v4.h[3]\n"
+ ".inst 0x4f43f054 // bfdot v20.4s, v2.8h, v3.h[0]\n"
+ ".inst 0x4f63f057 // bfdot v23.4s, v2.8h, v3.h[1]\n"
+ ".inst 0x4f43f85a // bfdot v26.4s, v2.8h, v3.h[2]\n"
+ ".inst 0x4f63f85d // bfdot v29.4s, v2.8h, v3.h[3]\n"
+ ".inst 0x4f44f029 // bfdot v9.4s, v1.8h, v4.h[0]\n"
+ ".inst 0x4f64f02c // bfdot v12.4s, v1.8h, v4.h[1]\n"
+ ".inst 0x4f44f82f // bfdot v15.4s, v1.8h, v4.h[2]\n"
+ ".inst 0x4f64f832 // bfdot v18.4s, v1.8h, v4.h[3]\n"
+ ".inst 0x4f43f035 // bfdot v21.4s, v1.8h, v3.h[0]\n"
+ ".inst 0x4f63f038 // bfdot v24.4s, v1.8h, v3.h[1]\n"
+ ".inst 0x4f43f83b // bfdot v27.4s, v1.8h, v3.h[2]\n"
+ ".inst 0x4f63f83e // bfdot v30.4s, v1.8h, v3.h[3]\n"
+ ".inst 0x4f44f00a // bfdot v10.4s, v0.8h, v4.h[0]\n"
+ ".inst 0x4f64f00d // bfdot v13.4s, v0.8h, v4.h[1]\n"
+ ".inst 0x4f44f810 // bfdot v16.4s, v0.8h, v4.h[2]\n"
+ ".inst 0x4f64f813 // bfdot v19.4s, v0.8h, v4.h[3]\n"
+ ".inst 0x4f43f016 // bfdot v22.4s, v0.8h, v3.h[0]\n"
+ ".inst 0x4f63f019 // bfdot v25.4s, v0.8h, v3.h[1]\n"
+ ".inst 0x4f43f81c // bfdot v28.4s, v0.8h, v3.h[2]\n"
+ ".inst 0x4f63f81f // bfdot v31.4s, v0.8h, v3.h[3]\n"
+ "6:" // multiply loop done
+ "subs x25, x25, #0xc\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp
new file mode 100644
index 0000000000..1a8b0fd630
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, size_t, \
+ float *, int, size_t, int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_ffinterleaved_bf16fp32_mmla_8x12( ARGLIST );
+
+class cls_a64_ffinterleaved_bf16fp32_mmla_8x12
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int out_width()
+ {
+ return 12;
+ }
+ static unsigned int stripe_width()
+ {
+ return 4;
+ }
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL256_BL64;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.62, 9.07, 3.23 };
+ }
+ }
+
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::V1:
+ return { 45.25, 4.29, 4.80 };
+ default:
+ return { 29.85, 2.60, 5.49 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_ffinterleaved_bf16fp32_mmla_8x12;
+ cls_a64_ffinterleaved_bf16fp32_mmla_8x12(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp
new file mode 100644
index 0000000000..4a1c1b5638
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include "../../bfloat.hpp"
+
+namespace arm_gemm {
+
+void a64_ffinterleaved_bf16fp32_mmla_8x12(
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ size_t B_stride,
+ float *Cpanel,
+ int ablocks,
+ size_t N,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ size_t N = {};
+ size_t B_stride = {};
+ const bfloat16 *cur_B_ptr = {};
+ } ka;
+
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+ ka.N = N;
+ ka.B_stride = B_stride;
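+ // k_unroll()=4: K is pre-counted in groups of four bf16 values (one BFMMLA
+ // step each), minus the group reserved for the tail.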
+
+ __asm__ __volatile__(
+ "1:" // Height loop
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x25, #0x8\n"
+ "mov %x[Apanel], x24\n"
+ "bgt 3f\n"
+ "cmp x25, #0x4\n"
+ "mov x21, x23\n"
+ "bgt 3f\n"
+ "mov x22, x23\n"
+ "3:" // B setup done
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v8.16b, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q5, [x23, #0x10]\n"
+ "movi v9.16b, #0x0\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "add x23, x23, #0x20\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 5f\n"
+ "4:" // main loop head
+ "ldr q6, [%x[Apanel], #0x0]\n"
+ "ldr q7, [x22, #0x0]\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q3, [x22, #0x10]\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ecda // bfmmla v26.4s, v6.8h, v4.8h\n"
+ "ldr q4, [x21, #0x0]\n"
+ ".inst 0x6e45ecdd // bfmmla v29.4s, v6.8h, v5.8h\n"
+ "ldr q5, [x21, #0x10]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0c // bfmmla v12.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x6e43ec32 // bfmmla v18.4s, v1.8h, v3.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec58 // bfmmla v24.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e47ecdb // bfmmla v27.4s, v6.8h, v7.8h\n"
+ "ldr q7, [x23, #0x0]\n"
+ ".inst 0x6e43ecde // bfmmla v30.4s, v6.8h, v3.8h\n"
+ "ldr q3, [x23, #0x10]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ "ldr q0, [%x[Apanel], #0x10]\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
+ "ldr q1, [%x[Apanel], #0x20]\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
+ "ldr q2, [%x[Apanel], #0x30]\n"
+ ".inst 0x6e44ecdc // bfmmla v28.4s, v6.8h, v4.8h\n"
+ "ldr q4, [x22, #0x20]\n"
+ ".inst 0x6e45ecdf // bfmmla v31.4s, v6.8h, v5.8h\n"
+ "ldr q6, [%x[Apanel], #0x40]\n"
+ "ldr q5, [x22, #0x30]\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec31 // bfmmla v17.4s, v1.8h, v3.8h\n"
+ "add x22, x22, #0x40\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e47ecda // bfmmla v26.4s, v6.8h, v7.8h\n"
+ "ldr q7, [x21, #0x20]\n"
+ ".inst 0x6e43ecdd // bfmmla v29.4s, v6.8h, v3.8h\n"
+ "ldr q3, [x21, #0x30]\n"
+ ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
+ "add x21, x21, #0x40\n"
+ ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ecdb // bfmmla v27.4s, v6.8h, v4.8h\n"
+ "ldr q4, [x23, #0x20]\n"
+ ".inst 0x6e45ecde // bfmmla v30.4s, v6.8h, v5.8h\n"
+ "ldr q5, [x23, #0x30]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ "ldr q0, [%x[Apanel], #0x50]\n"
+ ".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec33 // bfmmla v19.4s, v1.8h, v3.8h\n"
+ "ldr q1, [%x[Apanel], #0x60]\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ "ldr q2, [%x[Apanel], #0x70]\n"
+ ".inst 0x6e47ecdc // bfmmla v28.4s, v6.8h, v7.8h\n"
+ ".inst 0x6e43ecdf // bfmmla v31.4s, v6.8h, v3.8h\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "add x23, x23, #0x40\n"
+ "bge 4b\n"
+ "5:" // main loop skip
+ "ldr q3, [%x[Apanel], #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q7, [x22, #0x10]\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ "ldr q4, [x21, #0x0]\n"
+ ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ "ldr q5, [x21, #0x10]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "add x21, x21, #0x20\n"
+ ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
+ "cbz x20, 6f\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q7, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n"
+ "ldr q6, [%x[Apanel], #0x10]\n"
+ "ldr q0, [x23, #0x10]\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ "ldr q5, [%x[Apanel], #0x20]\n"
+ "ldr q4, [%x[Apanel], #0x30]\n"
+ ".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q2, [x22, #0x10]\n"
+ ".inst 0x6e40ecd1 // bfmmla v17.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e41ecb4 // bfmmla v20.4s, v5.8h, v1.8h\n"
+ ".inst 0x6e40ecb7 // bfmmla v23.4s, v5.8h, v0.8h\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x6e41ec9a // bfmmla v26.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x21, #0x0]\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x21, #0x10]\n"
+ ".inst 0x6e43ece9 // bfmmla v9.4s, v7.8h, v3.8h\n"
+ ".inst 0x6e42ecec // bfmmla v12.4s, v7.8h, v2.8h\n"
+ ".inst 0x6e43eccf // bfmmla v15.4s, v6.8h, v3.8h\n"
+ ".inst 0x6e42ecd2 // bfmmla v18.4s, v6.8h, v2.8h\n"
+ ".inst 0x6e43ecb5 // bfmmla v21.4s, v5.8h, v3.8h\n"
+ ".inst 0x6e42ecb8 // bfmmla v24.4s, v5.8h, v2.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e42ec9e // bfmmla v30.4s, v4.8h, v2.8h\n"
+ ".inst 0x6e41ecea // bfmmla v10.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e40eced // bfmmla v13.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e41ecd0 // bfmmla v16.4s, v6.8h, v1.8h\n"
+ ".inst 0x6e40ecd3 // bfmmla v19.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e41ecb6 // bfmmla v22.4s, v5.8h, v1.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ ".inst 0x6e41ec9c // bfmmla v28.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
+ "6:" // multiply loop done
+ "subs x25, x25, #0xc\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp
new file mode 100644
index 0000000000..b9b4ad54df
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ const __fp16 *, const __fp16 *, size_t, \
+ __fp16 *, int, size_t, int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_ffinterleaved_fp16_mla_8x24( ARGLIST );
+
+class cls_a64_ffinterleaved_fp16_mla_8x24
+{
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int out_width()
+ {
+ return 24;
+ }
+ static unsigned int stripe_width()
+ {
+ return 8;
+ }
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL128_BL16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+
+ StdTransformsFixed<operand_type, result_type, 8, 24, 1> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 24, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, __fp16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 22.87, 7.77, 2.03 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_ffinterleaved_fp16_mla_8x24;
+ cls_a64_ffinterleaved_fp16_mla_8x24(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
new file mode 100644
index 0000000000..1e3f2f300b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void a64_ffinterleaved_fp16_mla_8x24(
+ const __fp16 *Apanel,
+ const __fp16 *Bpanel,
+ size_t B_stride,
+ __fp16 *Cpanel,
+ int ablocks,
+ size_t N,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const __fp16 *Bpanel = {};
+ size_t N = {};
+ size_t B_stride = {};
+ const __fp16 *cur_B_ptr = {};
+ } ka;
+
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+ ka.N = N;
+ ka.B_stride = B_stride;
+
+ __asm__ __volatile__(
+ "1:" // Height loop
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x25, #0x10\n"
+ "mov %x[Apanel], x24\n"
+ "bgt 3f\n"
+ "cmp x25, #0x8\n"
+ "mov x21, x23\n"
+ "bgt 3f\n"
+ "mov x22, x23\n"
+ "3:" // B setup done
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "movi v8.16b, #0x0\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "movi v9.16b, #0x0\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 5f\n"
+ "4:" // main loop head
+ "ldr q7, [%x[Apanel], #0x10]\n"
+ "ldr q6, [x23, #0x10]\n"
+ "fmla v8.8h, v2.8h, v0.h[0]\n"
+ "ldr q5, [x22, #0x10]\n"
+ "ldr q1, [x21, #0x10]\n"
+ "fmla v11.8h, v2.8h, v0.h[1]\n"
+ "fmla v14.8h, v2.8h, v0.h[2]\n"
+ "fmla v17.8h, v2.8h, v0.h[3]\n"
+ "sub x20, x20, #0x2\n"
+ "fmla v20.8h, v2.8h, v0.h[4]\n"
+ "fmla v23.8h, v2.8h, v0.h[5]\n"
+ "cmp x20, #0x2\n"
+ "fmla v26.8h, v2.8h, v0.h[6]\n"
+ "fmla v29.8h, v2.8h, v0.h[7]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla v9.8h, v3.8h, v0.h[0]\n"
+ "fmla v12.8h, v3.8h, v0.h[1]\n"
+ "add x23, x23, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "fmla v15.8h, v3.8h, v0.h[2]\n"
+ "fmla v18.8h, v3.8h, v0.h[3]\n"
+ "fmla v21.8h, v3.8h, v0.h[4]\n"
+ "fmla v24.8h, v3.8h, v0.h[5]\n"
+ "add x22, x22, #0x20\n"
+ "fmla v27.8h, v3.8h, v0.h[6]\n"
+ "fmla v30.8h, v3.8h, v0.h[7]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "fmla v10.8h, v4.8h, v0.h[0]\n"
+ "fmla v13.8h, v4.8h, v0.h[1]\n"
+ "add x21, x21, #0x20\n"
+ "fmla v16.8h, v4.8h, v0.h[2]\n"
+ "fmla v19.8h, v4.8h, v0.h[3]\n"
+ "fmla v22.8h, v4.8h, v0.h[4]\n"
+ "fmla v25.8h, v4.8h, v0.h[5]\n"
+ "fmla v28.8h, v4.8h, v0.h[6]\n"
+ "fmla v31.8h, v4.8h, v0.h[7]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "fmla v8.8h, v6.8h, v7.h[0]\n"
+ "fmla v11.8h, v6.8h, v7.h[1]\n"
+ "fmla v14.8h, v6.8h, v7.h[2]\n"
+ "fmla v17.8h, v6.8h, v7.h[3]\n"
+ "fmla v20.8h, v6.8h, v7.h[4]\n"
+ "fmla v23.8h, v6.8h, v7.h[5]\n"
+ "fmla v26.8h, v6.8h, v7.h[6]\n"
+ "fmla v29.8h, v6.8h, v7.h[7]\n"
+ "fmla v9.8h, v5.8h, v7.h[0]\n"
+ "fmla v12.8h, v5.8h, v7.h[1]\n"
+ "fmla v15.8h, v5.8h, v7.h[2]\n"
+ "fmla v18.8h, v5.8h, v7.h[3]\n"
+ "fmla v21.8h, v5.8h, v7.h[4]\n"
+ "fmla v24.8h, v5.8h, v7.h[5]\n"
+ "fmla v27.8h, v5.8h, v7.h[6]\n"
+ "fmla v30.8h, v5.8h, v7.h[7]\n"
+ "fmla v10.8h, v1.8h, v7.h[0]\n"
+ "fmla v13.8h, v1.8h, v7.h[1]\n"
+ "fmla v16.8h, v1.8h, v7.h[2]\n"
+ "fmla v19.8h, v1.8h, v7.h[3]\n"
+ "fmla v22.8h, v1.8h, v7.h[4]\n"
+ "fmla v25.8h, v1.8h, v7.h[5]\n"
+ "fmla v28.8h, v1.8h, v7.h[6]\n"
+ "fmla v31.8h, v1.8h, v7.h[7]\n"
+ "bge 4b\n"
+ "5:" // main loop skip
+ "fmla v8.8h, v2.8h, v0.h[0]\n"
+ "fmla v11.8h, v2.8h, v0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla v14.8h, v2.8h, v0.h[2]\n"
+ "fmla v17.8h, v2.8h, v0.h[3]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v20.8h, v2.8h, v0.h[4]\n"
+ "fmla v23.8h, v2.8h, v0.h[5]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v26.8h, v2.8h, v0.h[6]\n"
+ "fmla v29.8h, v2.8h, v0.h[7]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v9.8h, v3.8h, v0.h[0]\n"
+ "fmla v12.8h, v3.8h, v0.h[1]\n"
+ "fmla v15.8h, v3.8h, v0.h[2]\n"
+ "fmla v18.8h, v3.8h, v0.h[3]\n"
+ "fmla v21.8h, v3.8h, v0.h[4]\n"
+ "fmla v24.8h, v3.8h, v0.h[5]\n"
+ "fmla v27.8h, v3.8h, v0.h[6]\n"
+ "fmla v30.8h, v3.8h, v0.h[7]\n"
+ "fmla v10.8h, v4.8h, v0.h[0]\n"
+ "fmla v13.8h, v4.8h, v0.h[1]\n"
+ "fmla v16.8h, v4.8h, v0.h[2]\n"
+ "fmla v19.8h, v4.8h, v0.h[3]\n"
+ "fmla v22.8h, v4.8h, v0.h[4]\n"
+ "fmla v25.8h, v4.8h, v0.h[5]\n"
+ "fmla v28.8h, v4.8h, v0.h[6]\n"
+ "fmla v31.8h, v4.8h, v0.h[7]\n"
+ "cbz x20, 6f\n"
+ "ldr q3, [%x[Apanel], #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "fmla v8.8h, v2.8h, v3.h[0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q0, [x21, #0x0]\n"
+ "fmla v11.8h, v2.8h, v3.h[1]\n"
+ "fmla v14.8h, v2.8h, v3.h[2]\n"
+ "fmla v17.8h, v2.8h, v3.h[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla v20.8h, v2.8h, v3.h[4]\n"
+ "fmla v23.8h, v2.8h, v3.h[5]\n"
+ "fmla v26.8h, v2.8h, v3.h[6]\n"
+ "fmla v29.8h, v2.8h, v3.h[7]\n"
+ "fmla v9.8h, v1.8h, v3.h[0]\n"
+ "fmla v12.8h, v1.8h, v3.h[1]\n"
+ "fmla v15.8h, v1.8h, v3.h[2]\n"
+ "fmla v18.8h, v1.8h, v3.h[3]\n"
+ "fmla v21.8h, v1.8h, v3.h[4]\n"
+ "fmla v24.8h, v1.8h, v3.h[5]\n"
+ "fmla v27.8h, v1.8h, v3.h[6]\n"
+ "fmla v30.8h, v1.8h, v3.h[7]\n"
+ "fmla v10.8h, v0.8h, v3.h[0]\n"
+ "fmla v13.8h, v0.8h, v3.h[1]\n"
+ "fmla v16.8h, v0.8h, v3.h[2]\n"
+ "fmla v19.8h, v0.8h, v3.h[3]\n"
+ "fmla v22.8h, v0.8h, v3.h[4]\n"
+ "fmla v25.8h, v0.8h, v3.h[5]\n"
+ "fmla v28.8h, v0.8h, v3.h[6]\n"
+ "fmla v31.8h, v0.8h, v3.h[7]\n"
+ "6:" // multiply loop done
+ "subs x25, x25, #0x18\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // namespace arm_gemm
+#endif // defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp
new file mode 100644
index 0000000000..c4445ba14a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ const float *, const float *, size_t, \
+ float *, int, size_t, int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_ffinterleaved_fp32_mla_8x12( ARGLIST );
+
+class cls_a64_ffinterleaved_fp32_mla_8x12
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int out_width()
+ {
+ return 12;
+ }
+ static unsigned int stripe_width()
+ {
+ return 4;
+ }
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL128_BL32;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+
+ StdTransformsFixed<operand_type, result_type, 8, 12, 1> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 12.56, 9.83, 3.02 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_ffinterleaved_fp32_mla_8x12;
+ cls_a64_ffinterleaved_fp32_mla_8x12(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
new file mode 100644
index 0000000000..6de0a380eb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void a64_ffinterleaved_fp32_mla_8x12(
+ const float *Apanel,
+ const float *Bpanel,
+ size_t B_stride,
+ float *Cpanel,
+ int ablocks,
+ size_t N,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const float *Bpanel = {};
+ size_t N = {};
+ size_t B_stride = {};
+ const float *cur_B_ptr = {};
+ } ka;
+
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+ ka.N = N;
+ ka.B_stride = B_stride;
+
+ __asm__ __volatile__(
+ "1:" // Height loop
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x24, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "cmp x25, #0x8\n"
+ "mov %x[Apanel], x24\n"
+ "bgt 3f\n"
+ "cmp x25, #0x4\n"
+ "mov x21, x23\n"
+ "bgt 3f\n"
+ "mov x22, x23\n"
+ "3:" // B setup done
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v8.16b, #0x0\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "movi v9.16b, #0x0\n"
+ "ldr q6, [x21, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x4\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 5f\n"
+ "4:" // main loop head
+ "ldr q3, [%x[Apanel], #0x20]\n"
+ "ldr q7, [%x[Apanel], #0x30]\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q2, [x23, #0x10]\n"
+ "fmla v11.4s, v4.4s, v0.s[1]\n"
+ "fmla v14.4s, v4.4s, v0.s[2]\n"
+ "fmla v17.4s, v4.4s, v0.s[3]\n"
+ "fmla v20.4s, v4.4s, v1.s[0]\n"
+ "sub x20, x20, #0x4\n"
+ "fmla v23.4s, v4.4s, v1.s[1]\n"
+ "fmla v26.4s, v4.4s, v1.s[2]\n"
+ "cmp x20, #0x4\n"
+ "fmla v29.4s, v4.4s, v1.s[3]\n"
+ "ldr q4, [x22, #0x10]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v12.4s, v5.4s, v0.s[1]\n"
+ "fmla v15.4s, v5.4s, v0.s[2]\n"
+ "fmla v18.4s, v5.4s, v0.s[3]\n"
+ "fmla v21.4s, v5.4s, v1.s[0]\n"
+ "fmla v24.4s, v5.4s, v1.s[1]\n"
+ "fmla v27.4s, v5.4s, v1.s[2]\n"
+ "fmla v30.4s, v5.4s, v1.s[3]\n"
+ "ldr q5, [x21, #0x10]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v13.4s, v6.4s, v0.s[1]\n"
+ "fmla v16.4s, v6.4s, v0.s[2]\n"
+ "fmla v19.4s, v6.4s, v0.s[3]\n"
+ "ldr q0, [%x[Apanel], #0x40]\n"
+ "fmla v22.4s, v6.4s, v1.s[0]\n"
+ "fmla v25.4s, v6.4s, v1.s[1]\n"
+ "fmla v28.4s, v6.4s, v1.s[2]\n"
+ "fmla v31.4s, v6.4s, v1.s[3]\n"
+ "ldr q1, [%x[Apanel], #0x50]\n"
+ "ldr q6, [x23, #0x20]\n"
+ "fmla v8.4s, v2.4s, v3.s[0]\n"
+ "fmla v11.4s, v2.4s, v3.s[1]\n"
+ "fmla v14.4s, v2.4s, v3.s[2]\n"
+ "fmla v17.4s, v2.4s, v3.s[3]\n"
+ "fmla v20.4s, v2.4s, v7.s[0]\n"
+ "fmla v23.4s, v2.4s, v7.s[1]\n"
+ "fmla v26.4s, v2.4s, v7.s[2]\n"
+ "fmla v29.4s, v2.4s, v7.s[3]\n"
+ "ldr q2, [x22, #0x20]\n"
+ "fmla v9.4s, v4.4s, v3.s[0]\n"
+ "fmla v12.4s, v4.4s, v3.s[1]\n"
+ "fmla v15.4s, v4.4s, v3.s[2]\n"
+ "fmla v18.4s, v4.4s, v3.s[3]\n"
+ "fmla v21.4s, v4.4s, v7.s[0]\n"
+ "fmla v24.4s, v4.4s, v7.s[1]\n"
+ "fmla v27.4s, v4.4s, v7.s[2]\n"
+ "fmla v30.4s, v4.4s, v7.s[3]\n"
+ "ldr q4, [x21, #0x20]\n"
+ "fmla v10.4s, v5.4s, v3.s[0]\n"
+ "fmla v13.4s, v5.4s, v3.s[1]\n"
+ "fmla v16.4s, v5.4s, v3.s[2]\n"
+ "fmla v19.4s, v5.4s, v3.s[3]\n"
+ "ldr q3, [%x[Apanel], #0x60]\n"
+ "fmla v22.4s, v5.4s, v7.s[0]\n"
+ "fmla v25.4s, v5.4s, v7.s[1]\n"
+ "fmla v28.4s, v5.4s, v7.s[2]\n"
+ "fmla v31.4s, v5.4s, v7.s[3]\n"
+ "ldr q7, [%x[Apanel], #0x70]\n"
+ "ldr q5, [x23, #0x30]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v11.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v0.s[2]\n"
+ "fmla v17.4s, v6.4s, v0.s[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "fmla v20.4s, v6.4s, v1.s[0]\n"
+ "fmla v23.4s, v6.4s, v1.s[1]\n"
+ "add x23, x23, #0x40\n"
+ "fmla v26.4s, v6.4s, v1.s[2]\n"
+ "fmla v29.4s, v6.4s, v1.s[3]\n"
+ "ldr q6, [x22, #0x30]\n"
+ "fmla v9.4s, v2.4s, v0.s[0]\n"
+ "fmla v12.4s, v2.4s, v0.s[1]\n"
+ "add x22, x22, #0x40\n"
+ "fmla v15.4s, v2.4s, v0.s[2]\n"
+ "fmla v18.4s, v2.4s, v0.s[3]\n"
+ "fmla v21.4s, v2.4s, v1.s[0]\n"
+ "fmla v24.4s, v2.4s, v1.s[1]\n"
+ "fmla v27.4s, v2.4s, v1.s[2]\n"
+ "fmla v30.4s, v2.4s, v1.s[3]\n"
+ "ldr q2, [x21, #0x30]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "fmla v13.4s, v4.4s, v0.s[1]\n"
+ "add x21, x21, #0x40\n"
+ "fmla v16.4s, v4.4s, v0.s[2]\n"
+ "fmla v19.4s, v4.4s, v0.s[3]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "fmla v22.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v28.4s, v4.4s, v1.s[2]\n"
+ "fmla v31.4s, v4.4s, v1.s[3]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "fmla v8.4s, v5.4s, v3.s[0]\n"
+ "fmla v11.4s, v5.4s, v3.s[1]\n"
+ "fmla v14.4s, v5.4s, v3.s[2]\n"
+ "fmla v17.4s, v5.4s, v3.s[3]\n"
+ "fmla v20.4s, v5.4s, v7.s[0]\n"
+ "fmla v23.4s, v5.4s, v7.s[1]\n"
+ "fmla v26.4s, v5.4s, v7.s[2]\n"
+ "fmla v29.4s, v5.4s, v7.s[3]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "fmla v9.4s, v6.4s, v3.s[0]\n"
+ "fmla v12.4s, v6.4s, v3.s[1]\n"
+ "fmla v15.4s, v6.4s, v3.s[2]\n"
+ "fmla v18.4s, v6.4s, v3.s[3]\n"
+ "fmla v21.4s, v6.4s, v7.s[0]\n"
+ "fmla v24.4s, v6.4s, v7.s[1]\n"
+ "fmla v27.4s, v6.4s, v7.s[2]\n"
+ "fmla v30.4s, v6.4s, v7.s[3]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "fmla v10.4s, v2.4s, v3.s[0]\n"
+ "fmla v13.4s, v2.4s, v3.s[1]\n"
+ "fmla v16.4s, v2.4s, v3.s[2]\n"
+ "fmla v19.4s, v2.4s, v3.s[3]\n"
+ "fmla v22.4s, v2.4s, v7.s[0]\n"
+ "fmla v25.4s, v2.4s, v7.s[1]\n"
+ "fmla v28.4s, v2.4s, v7.s[2]\n"
+ "fmla v31.4s, v2.4s, v7.s[3]\n"
+ "bge 4b\n"
+ "5:" // main loop skip
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "fmla v11.4s, v4.4s, v0.s[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla v14.4s, v4.4s, v0.s[2]\n"
+ "fmla v17.4s, v4.4s, v0.s[3]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v20.4s, v4.4s, v1.s[0]\n"
+ "fmla v23.4s, v4.4s, v1.s[1]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v26.4s, v4.4s, v1.s[2]\n"
+ "fmla v29.4s, v4.4s, v1.s[3]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v12.4s, v5.4s, v0.s[1]\n"
+ "fmla v15.4s, v5.4s, v0.s[2]\n"
+ "fmla v18.4s, v5.4s, v0.s[3]\n"
+ "fmla v21.4s, v5.4s, v1.s[0]\n"
+ "fmla v24.4s, v5.4s, v1.s[1]\n"
+ "fmla v27.4s, v5.4s, v1.s[2]\n"
+ "fmla v30.4s, v5.4s, v1.s[3]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v13.4s, v6.4s, v0.s[1]\n"
+ "fmla v16.4s, v6.4s, v0.s[2]\n"
+ "fmla v19.4s, v6.4s, v0.s[3]\n"
+ "fmla v22.4s, v6.4s, v1.s[0]\n"
+ "fmla v25.4s, v6.4s, v1.s[1]\n"
+ "fmla v28.4s, v6.4s, v1.s[2]\n"
+ "fmla v31.4s, v6.4s, v1.s[3]\n"
+ "cbz x20, 7f\n"
+ "6:" // odd loop
+ "ldr q4, [%x[Apanel], #0x0]\n"
+ "ldr q3, [%x[Apanel], #0x10]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "fmla v8.4s, v2.4s, v4.s[0]\n"
+ "ldr q0, [x21, #0x0]\n"
+ "fmla v11.4s, v2.4s, v4.s[1]\n"
+ "fmla v14.4s, v2.4s, v4.s[2]\n"
+ "fmla v17.4s, v2.4s, v4.s[3]\n"
+ "fmla v20.4s, v2.4s, v3.s[0]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla v23.4s, v2.4s, v3.s[1]\n"
+ "fmla v26.4s, v2.4s, v3.s[2]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v29.4s, v2.4s, v3.s[3]\n"
+ "fmla v9.4s, v1.4s, v4.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v12.4s, v1.4s, v4.s[1]\n"
+ "fmla v15.4s, v1.4s, v4.s[2]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v18.4s, v1.4s, v4.s[3]\n"
+ "fmla v21.4s, v1.4s, v3.s[0]\n"
+ "fmla v24.4s, v1.4s, v3.s[1]\n"
+ "fmla v27.4s, v1.4s, v3.s[2]\n"
+ "fmla v30.4s, v1.4s, v3.s[3]\n"
+ "fmla v10.4s, v0.4s, v4.s[0]\n"
+ "fmla v13.4s, v0.4s, v4.s[1]\n"
+ "fmla v16.4s, v0.4s, v4.s[2]\n"
+ "fmla v19.4s, v0.4s, v4.s[3]\n"
+ "fmla v22.4s, v0.4s, v3.s[0]\n"
+ "fmla v25.4s, v0.4s, v3.s[1]\n"
+ "fmla v28.4s, v0.4s, v3.s[2]\n"
+ "fmla v31.4s, v0.4s, v3.s[3]\n"
+ "bne 6b\n"
+ "7:" // multiply loop done
+ "subs x25, x25, #0xc\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index b68a5f518a..1363b939ab 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,7 @@
#ifdef __aarch64__
#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
namespace arm_gemm {
@@ -58,6 +59,33 @@ public:
StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
StdTransformsFixed<operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
+ template<typename T>
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r0:
+ case CPUModel::A55r1:
+ return { 3.12, 2.93, 1.84 };
+ case CPUModel::A510:
+ return { 3.32, 2.56, 2.63 };
+ default:
+ return { 7.97, 3.72, 7.31 };
+ }
+ }
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r0:
+ case CPUModel::A55r1:
+ return { 3.12, 2.18, 0.09 };
+ case CPUModel::A510:
+ return { 3.33, 2.89, 0.09 };
+ default:
+ return { 7.97, 3.74, 0.34 };
+ }
+ }
+
+ return { 0.0 };
+ }
+
kern_type kernel=a64_gemm_s8_4x4;
cls_a64_gemm_s8_4x4(const CPUInfo *) { }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
index 7c7b894b08..9af1b4df12 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
@@ -61,13 +61,38 @@ public:
StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
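+ // Templated on the accumulating/result type: the requantizing int8_t path and
+ // the int32_t output path are costed separately (merge traffic differs).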
+ template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 15.361, 0.9341, 0.1636 };
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 19.73, 3.38, 0.27 };
- default:
- return { 29.0698, 3.9793, 0.4003 };
+ case CPUModel::A55r1:
+ return { 15.361, 0.9341, 0.1636 };
+
+ case CPUModel::V1:
+ return { 51.14, 7.38, 0.65 };
+
+ default:
+ return { 29.0698, 3.9793, 0.4003 };
+ }
+ }
+
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 19.73, 3.38, 3.70 };
+
+ case CPUModel::A55r1:
+ return { 14.286, 1.171, 1.209 };
+
+ case CPUModel::V1:
+ return { 61.58, 4.78, 10.83 };
+
+ default:
+ return { 31.82, 3.51, 8.03 };
+ }
}
}
+
+ return { 0.0 };
}
index 854b6751c1..b747a1cf84 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#ifdef __aarch64__
+#include "../performance_parameters.hpp"
#include "../std_transforms_fixed.hpp"
namespace arm_gemm {
@@ -66,6 +67,35 @@ public:
StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
StdTransformsFixed<operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
+ template<typename T>
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r0:
+ case CPUModel::A55r1:
+ return { 2.25, 2.92, 1.84 };
+ case CPUModel::A510:
+ return { 2.64, 2.72, 2.64 };
+ default:
+ return { 7.95, 3.76, 7.27 };
+ }
+ }
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r0:
+ case CPUModel::A55r1:
+ return { 2.25, 2.18, 0.09 };
+ case CPUModel::A510:
+ return { 2.64, 1.79, 0.10 };
+ default:
+ return { 7.95, 4.09, 0.33 };
+ }
+ }
+
+ return { 0.0 };
+ }
+
kern_type kernel = a64_gemm_u8_4x4;
cls_a64_gemm_u8_4x4(const CPUInfo *) { }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
index 00ed5d03bf..6d333f3449 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2018,2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,14 +69,41 @@ public:
StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
+ template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 15.361, 0.9341, 0.1636 };
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 19.73, 3.38, 0.27 };
- default:
- return { 29.0698, 3.9793, 0.4003 };
+ case CPUModel::A55r1:
+ return { 15.361, 0.9341, 0.1636 };
+
+ case CPUModel::V1:
+ return { 51.14, 7.38, 0.65 };
+
+ default:
+ return { 29.0698, 3.9793, 0.4003 };
+ }
+ }
+
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 19.73, 3.38, 3.70 };
+
+ case CPUModel::A55r1:
+ return { 14.286, 1.171, 1.209 };
+
+ case CPUModel::V1:
+ return { 61.58, 4.78, 10.83 };
+
+ default:
+ return { 31.82, 3.51, 8.03 };
+ }
}
+
+ return { 0.0 };
}
kern_type kernel = a64_gemm_u8_8x12;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
deleted file mode 100644
index b53172509e..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "../performance_parameters.hpp"
-#include "../std_transforms_fixed.hpp"
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void a64_gemv_fp32_mla_32(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
-
-class cls_a64_gemv_fp32_mla_32
-{
-public:
- typedef float operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
-
- static unsigned int out_width()
- {
- return 32;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 1;
- }
-
- static constexpr bool supports_accumulate()
- {
- return false;
- }
-
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsFixed<operand_type, result_type, 1, 32, 1> transforms = {};
-
- // Default to the generic kernel
- kern_type kernel=a64_gemv_fp32_mla_32;
-
- cls_a64_gemv_fp32_mla_32(const CPUInfo *)
- {
- }
-};
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
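The deleted header above is a kernel descriptor: its static traits (out_width(), k_unroll(), the supports_* flags) and the StdTransformsFixed member are what the framework's strategy templates consume. A small illustrative sketch of that consumption pattern (the function and its use are hypothetical, not code from arm_gemm):

    // Hypothetical consumer: size the output in whole panels of the
    // strategy's fixed column width, e.g. 32 for the class deleted above.
    template <typename Strategy>
    unsigned int output_panels(unsigned int N)
    {
        return (N + Strategy::out_width() - 1) / Strategy::out_width();
    }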
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp
deleted file mode 100644
index 51a9641af5..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp
+++ /dev/null
@@ -1,1547 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include "arm_gemm.hpp"
-#include "../../utils.hpp"
-
-#include <cassert>
-#include <limits>
-
-namespace arm_gemm {
-
-void a64_gemv_fp32_mla_32 (
- const float *A_ptr, const float *B_ptr, float *output_ptr,
- size_t N, size_t K,
- const float *bias, Activation act, bool
-)
-{
- struct KernelArgs {
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- const float *B_ptr = {};
- size_t output_offset = {};
- unsigned int input_initial_col = {};
- } ka;
-
- unsigned long flags=0;
- ka.B_ptr = B_ptr;
- switch(act.type) {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- ka.maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- ka.minval = 0;
- flags |= 0x2;
- break;
- }
- __asm__ __volatile__(
- "add x22, %x[N], #0x3\n"
- "mov x21, %x[bias]\n"
- "lsr x22, x22, #0x2\n"
- "1:" // Column loop
- "cmp x22, #0x8\n"
- "bge 85f\n"
- "cmp x22, #0x6\n"
- "bgt 73f\n"
- "beq 61f\n"
- "cmp x22, #0x4\n"
- "bgt 49f\n"
- "beq 37f\n"
- "cmp x22, #0x2\n"
- "bgt 25f\n"
- "beq 13f\n"
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 2f\n"
- "ldr q24, [x21, #0x0]\n"
- "add x21, x21, #0x10\n"
- "b 3f\n"
- "2:" // Width 1: no bias
- "movi v24.16b, #0x0\n"
- "3:" // Width 1: setup done
- "cmp x20, #0x4\n"
- "blt 6f\n"
- "cmp x20, #0x8\n"
- "blt 5f\n"
- "4:" // Width 1: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q2, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v2.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q3, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v3.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q4, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v4.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x20, x20, #0x4\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "cmp x20, #0x8\n"
- "bge 4b\n"
- "5:" // Width 1: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q5, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v5.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q6, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v6.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q7, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v7.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q8, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v8.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "6:" // Width 1: Multiply loop: Main loop skip
- "cbz x20, 8f\n"
- "7:" // Width 1: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q9, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v9.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "sub x20, x20, #0x1\n"
- "cbnz x20, 7b\n"
- "8:" // Width 1: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 9f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "9:" // Width 1: No activation
- "cmp %x[N], #0x4\n"
- "blt 10f\n"
- "str q24, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 12f\n"
- "10:" // Width 1: Partial writeback
- "tbz %x[N], #1, 11f\n"
- "str d24, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 12f\n"
- "st1 { v24.s }[2], [%x[output_ptr]]\n"
- "b 12f\n"
- "11:" // Width 1: Partial direct writeback: partial_1_0
- "str s24, [%x[output_ptr], #0x0]\n"
- "12:" // Width 1: Writeback done
- "b 97f\n"
- "13:" // Width 2
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 14f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "add x21, x21, #0x20\n"
- "b 15f\n"
- "14:" // Width 2: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "15:" // Width 2: setup done
- "cmp x20, #0x4\n"
- "blt 18f\n"
- "cmp x20, #0x8\n"
- "blt 17f\n"
- "16:" // Width 2: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q3, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v3.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q4, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v4.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q5, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v5.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q6, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v6.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q7, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v7.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q8, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v8.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x20, x20, #0x4\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "cmp x20, #0x8\n"
- "bge 16b\n"
- "17:" // Width 2: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q9, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v9.4s, v0.s[0]\n"
- "ldr q10, [%x[B_ptr], #0x10]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v25.4s, v10.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q11, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v11.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q12, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v12.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q13, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v13.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q14, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v14.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q15, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v15.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q16, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v16.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "18:" // Width 2: Multiply loop: Main loop skip
- "cbz x20, 20f\n"
- "19:" // Width 2: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q17, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v17.4s, v0.s[0]\n"
- "ldr q18, [%x[B_ptr], #0x10]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v25.4s, v18.4s, v0.s[0]\n"
- "sub x20, x20, #0x1\n"
- "cbnz x20, 19b\n"
- "20:" // Width 2: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 21f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "21:" // Width 2: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "cmp %x[N], #0x8\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "blt 22f\n"
- "str q25, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 24f\n"
- "22:" // Width 2: Partial writeback
- "tbz %x[N], #1, 23f\n"
- "str d25, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 24f\n"
- "st1 { v25.s }[2], [%x[output_ptr]]\n"
- "b 24f\n"
- "23:" // Width 2: Partial direct writeback: partial_1_4
- "tbz %x[N], #0, 24f\n"
- "str s25, [%x[output_ptr], #0x0]\n"
- "24:" // Width 2: Writeback done
- "b 97f\n"
- "25:" // Width 3
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 26f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "add x21, x21, #0x30\n"
- "b 27f\n"
- "26:" // Width 3: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "27:" // Width 3: setup done
- "cmp x20, #0x4\n"
- "blt 30f\n"
- "cmp x20, #0x8\n"
- "blt 29f\n"
- "28:" // Width 3: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "ldr q3, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v26.4s, v3.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v4.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q5, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v5.4s, v0.s[1]\n"
- "ldr q6, [%x[B_ptr], #0x20]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v6.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q7, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v7.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q8, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v8.4s, v0.s[2]\n"
- "ldr q9, [%x[B_ptr], #0x20]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v9.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q10, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v10.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q11, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v11.4s, v0.s[3]\n"
- "ldr q12, [%x[B_ptr], #0x20]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v12.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x20, x20, #0x4\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "cmp x20, #0x8\n"
- "bge 28b\n"
- "29:" // Width 3: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q13, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v13.4s, v0.s[0]\n"
- "ldr q14, [%x[B_ptr], #0x10]\n"
- "ldr q15, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v14.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v26.4s, v15.4s, v0.s[0]\n"
- "ldr q16, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v16.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q17, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v17.4s, v0.s[1]\n"
- "ldr q18, [%x[B_ptr], #0x20]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v18.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q19, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v19.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q20, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v20.4s, v0.s[2]\n"
- "ldr q21, [%x[B_ptr], #0x20]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v21.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q22, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v22.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q23, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v23.4s, v0.s[3]\n"
- "ldr q1, [%x[B_ptr], #0x20]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v1.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "30:" // Width 3: Multiply loop: Main loop skip
- "cbz x20, 32f\n"
- "31:" // Width 3: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q2, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v2.4s, v0.s[0]\n"
- "ldr q3, [%x[B_ptr], #0x10]\n"
- "ldr q4, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v3.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v4.4s, v0.s[0]\n"
- "sub x20, x20, #0x1\n"
- "cbnz x20, 31b\n"
- "32:" // Width 3: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 33f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "33:" // Width 3: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "str q25, [%x[output_ptr], #0x10]\n"
- "cmp %x[N], #0xc\n"
- "add %x[output_ptr], %x[output_ptr], #0x20\n"
- "blt 34f\n"
- "str q26, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 36f\n"
- "34:" // Width 3: Partial writeback
- "tbz %x[N], #1, 35f\n"
- "str d26, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 36f\n"
- "st1 { v26.s }[2], [%x[output_ptr]]\n"
- "b 36f\n"
- "35:" // Width 3: Partial direct writeback: partial_1_8
- "tbz %x[N], #0, 36f\n"
- "str s26, [%x[output_ptr], #0x0]\n"
- "36:" // Width 3: Writeback done
- "b 97f\n"
- "37:" // Width 4
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 38f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "add x21, x21, #0x40\n"
- "b 39f\n"
- "38:" // Width 4: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "39:" // Width 4: setup done
- "cmp x20, #0x4\n"
- "blt 42f\n"
- "cmp x20, #0x8\n"
- "blt 41f\n"
- "40:" // Width 4: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "ldr q3, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v3.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v4.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q5, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v5.4s, v0.s[1]\n"
- "ldr q6, [%x[B_ptr], #0x10]\n"
- "ldr q7, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v6.4s, v0.s[1]\n"
- "ldr q8, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v7.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v8.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q9, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v9.4s, v0.s[2]\n"
- "ldr q10, [%x[B_ptr], #0x10]\n"
- "ldr q11, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v10.4s, v0.s[2]\n"
- "ldr q12, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v11.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v12.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q13, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v13.4s, v0.s[3]\n"
- "ldr q14, [%x[B_ptr], #0x10]\n"
- "ldr q15, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v14.4s, v0.s[3]\n"
- "ldr q16, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v15.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v16.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "sub x20, x20, #0x4\n"
- "cmp x20, #0x8\n"
- "bge 40b\n"
- "41:" // Width 4: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q17, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v17.4s, v0.s[0]\n"
- "ldr q18, [%x[B_ptr], #0x10]\n"
- "ldr q19, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v18.4s, v0.s[0]\n"
- "ldr q20, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v19.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v20.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q21, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v21.4s, v0.s[1]\n"
- "ldr q22, [%x[B_ptr], #0x10]\n"
- "ldr q23, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v22.4s, v0.s[1]\n"
- "ldr q1, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v23.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v1.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q2, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v2.4s, v0.s[2]\n"
- "ldr q3, [%x[B_ptr], #0x10]\n"
- "ldr q4, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v3.4s, v0.s[2]\n"
- "ldr q5, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v4.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v5.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q6, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v6.4s, v0.s[3]\n"
- "ldr q7, [%x[B_ptr], #0x10]\n"
- "ldr q8, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v7.4s, v0.s[3]\n"
- "ldr q9, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v8.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v9.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "42:" // Width 4: Multiply loop: Main loop skip
- "cbz x20, 44f\n"
- "43:" // Width 4: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q10, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v10.4s, v0.s[0]\n"
- "ldr q11, [%x[B_ptr], #0x10]\n"
- "ldr q12, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v11.4s, v0.s[0]\n"
- "ldr q13, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v12.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "sub x20, x20, #0x1\n"
- "fmla v27.4s, v13.4s, v0.s[0]\n"
- "cbnz x20, 43b\n"
- "44:" // Width 4: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 45f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "45:" // Width 4: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "str q25, [%x[output_ptr], #0x10]\n"
- "str q26, [%x[output_ptr], #0x20]\n"
- "cmp %x[N], #0x10\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "blt 46f\n"
- "str q27, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 48f\n"
- "46:" // Width 4: Partial writeback
- "tbz %x[N], #1, 47f\n"
- "str d27, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 48f\n"
- "st1 { v27.s }[2], [%x[output_ptr]]\n"
- "b 48f\n"
- "47:" // Width 4: Partial direct writeback: partial_1_12
- "tbz %x[N], #0, 48f\n"
- "str s27, [%x[output_ptr], #0x0]\n"
- "48:" // Width 4: Writeback done
- "b 97f\n"
- "49:" // Width 5
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 50f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "ldr q28, [x21, #0x40]\n"
- "add x21, x21, #0x50\n"
- "b 51f\n"
- "50:" // Width 5: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "51:" // Width 5: setup done
- "cmp x20, #0x4\n"
- "blt 54f\n"
- "cmp x20, #0x8\n"
- "blt 53f\n"
- "52:" // Width 5: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "ldr q3, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v3.4s, v0.s[0]\n"
- "ldr q5, [%x[B_ptr], #0x40]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v27.4s, v4.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q6, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v5.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q7, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v6.4s, v0.s[1]\n"
- "ldr q8, [%x[B_ptr], #0x20]\n"
- "ldr q9, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v7.4s, v0.s[1]\n"
- "ldr q10, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v8.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v9.4s, v0.s[1]\n"
- "ldr q11, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v10.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q12, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v11.4s, v0.s[2]\n"
- "ldr q13, [%x[B_ptr], #0x20]\n"
- "ldr q14, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v12.4s, v0.s[2]\n"
- "ldr q15, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v13.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v14.4s, v0.s[2]\n"
- "ldr q16, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v15.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q17, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v16.4s, v0.s[3]\n"
- "ldr q18, [%x[B_ptr], #0x20]\n"
- "ldr q19, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v17.4s, v0.s[3]\n"
- "ldr q20, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v18.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v19.4s, v0.s[3]\n"
- "add x19, x19, #0x10\n"
- "fmla v28.4s, v20.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x20, x20, #0x4\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "cmp x20, #0x8\n"
- "bge 52b\n"
- "53:" // Width 5: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q21, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v21.4s, v0.s[0]\n"
- "ldr q22, [%x[B_ptr], #0x10]\n"
- "ldr q23, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v22.4s, v0.s[0]\n"
- "ldr q1, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v23.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x40]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v27.4s, v1.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q3, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v2.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q4, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v3.4s, v0.s[1]\n"
- "ldr q5, [%x[B_ptr], #0x20]\n"
- "ldr q6, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v4.4s, v0.s[1]\n"
- "ldr q7, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v5.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v6.4s, v0.s[1]\n"
- "ldr q8, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v7.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q9, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v8.4s, v0.s[2]\n"
- "ldr q10, [%x[B_ptr], #0x20]\n"
- "ldr q11, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v9.4s, v0.s[2]\n"
- "ldr q12, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v10.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v11.4s, v0.s[2]\n"
- "ldr q13, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v12.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q14, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v13.4s, v0.s[3]\n"
- "ldr q15, [%x[B_ptr], #0x20]\n"
- "ldr q16, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v14.4s, v0.s[3]\n"
- "ldr q17, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v15.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v16.4s, v0.s[3]\n"
- "add x19, x19, #0x10\n"
- "fmla v28.4s, v17.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "54:" // Width 5: Multiply loop: Main loop skip
- "cbz x20, 56f\n"
- "55:" // Width 5: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q18, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v18.4s, v0.s[0]\n"
- "ldr q19, [%x[B_ptr], #0x10]\n"
- "ldr q20, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v19.4s, v0.s[0]\n"
- "ldr q21, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v20.4s, v0.s[0]\n"
- "ldr q22, [%x[B_ptr], #0x40]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v27.4s, v21.4s, v0.s[0]\n"
- "sub x20, x20, #0x1\n"
- "fmla v28.4s, v22.4s, v0.s[0]\n"
- "cbnz x20, 55b\n"
- "56:" // Width 5: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 57f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v16.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "57:" // Width 5: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "str q25, [%x[output_ptr], #0x10]\n"
- "str q26, [%x[output_ptr], #0x20]\n"
- "str q27, [%x[output_ptr], #0x30]\n"
- "cmp %x[N], #0x14\n"
- "add %x[output_ptr], %x[output_ptr], #0x40\n"
- "blt 58f\n"
- "str q28, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 60f\n"
- "58:" // Width 5: Partial writeback
- "tbz %x[N], #1, 59f\n"
- "str d28, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 60f\n"
- "st1 { v28.s }[2], [%x[output_ptr]]\n"
- "b 60f\n"
- "59:" // Width 5: Partial direct writeback: partial_1_16
- "tbz %x[N], #0, 60f\n"
- "str s28, [%x[output_ptr], #0x0]\n"
- "60:" // Width 5: Writeback done
- "b 97f\n"
- "61:" // Width 6
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 62f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "ldr q28, [x21, #0x40]\n"
- "ldr q29, [x21, #0x50]\n"
- "add x21, x21, #0x60\n"
- "b 63f\n"
- "62:" // Width 6: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "movi v29.16b, #0x0\n"
- "63:" // Width 6: setup done
- "cmp x20, #0x4\n"
- "blt 66f\n"
- "cmp x20, #0x8\n"
- "blt 65f\n"
- "64:" // Width 6: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "ldr q3, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v3.4s, v0.s[0]\n"
- "ldr q5, [%x[B_ptr], #0x40]\n"
- "ldr q6, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v4.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v5.4s, v0.s[0]\n"
- "ldr q7, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v6.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q8, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v7.4s, v0.s[1]\n"
- "ldr q9, [%x[B_ptr], #0x20]\n"
- "ldr q10, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v8.4s, v0.s[1]\n"
- "ldr q11, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v9.4s, v0.s[1]\n"
- "ldr q12, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v10.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v11.4s, v0.s[1]\n"
- "ldr q13, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v12.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q14, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v13.4s, v0.s[2]\n"
- "ldr q15, [%x[B_ptr], #0x20]\n"
- "ldr q16, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v14.4s, v0.s[2]\n"
- "ldr q17, [%x[B_ptr], #0x40]\n"
- "ldr q18, [%x[B_ptr], #0x50]\n"
- "fmla v26.4s, v15.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v16.4s, v0.s[2]\n"
- "ldr q19, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v17.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q20, [%x[B_ptr], #0x10]\n"
- "fmla v29.4s, v18.4s, v0.s[2]\n"
- "ldr q21, [%x[B_ptr], #0x20]\n"
- "ldr q22, [%x[B_ptr], #0x30]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q23, [%x[B_ptr], #0x40]\n"
- "ldr q1, [%x[B_ptr], #0x50]\n"
- "fmla v25.4s, v20.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v26.4s, v21.4s, v0.s[3]\n"
- "add x19, x19, #0x10\n"
- "fmla v27.4s, v22.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x20, x20, #0x4\n"
- "fmla v28.4s, v23.4s, v0.s[3]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "cmp x20, #0x8\n"
- "fmla v29.4s, v1.4s, v0.s[3]\n"
- "bge 64b\n"
- "65:" // Width 6: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q2, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v2.4s, v0.s[0]\n"
- "ldr q3, [%x[B_ptr], #0x10]\n"
- "ldr q4, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v3.4s, v0.s[0]\n"
- "ldr q5, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v4.4s, v0.s[0]\n"
- "ldr q6, [%x[B_ptr], #0x40]\n"
- "ldr q7, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v5.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v6.4s, v0.s[0]\n"
- "ldr q8, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q9, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v8.4s, v0.s[1]\n"
- "ldr q10, [%x[B_ptr], #0x20]\n"
- "ldr q11, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v9.4s, v0.s[1]\n"
- "ldr q12, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v10.4s, v0.s[1]\n"
- "ldr q13, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v11.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v12.4s, v0.s[1]\n"
- "ldr q14, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v13.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q15, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v14.4s, v0.s[2]\n"
- "ldr q16, [%x[B_ptr], #0x20]\n"
- "ldr q17, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v15.4s, v0.s[2]\n"
- "ldr q18, [%x[B_ptr], #0x40]\n"
- "ldr q19, [%x[B_ptr], #0x50]\n"
- "fmla v26.4s, v16.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v17.4s, v0.s[2]\n"
- "ldr q20, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v18.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q21, [%x[B_ptr], #0x10]\n"
- "fmla v29.4s, v19.4s, v0.s[2]\n"
- "ldr q22, [%x[B_ptr], #0x20]\n"
- "ldr q23, [%x[B_ptr], #0x30]\n"
- "fmla v24.4s, v20.4s, v0.s[3]\n"
- "ldr q1, [%x[B_ptr], #0x40]\n"
- "ldr q2, [%x[B_ptr], #0x50]\n"
- "fmla v25.4s, v21.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v26.4s, v22.4s, v0.s[3]\n"
- "add x19, x19, #0x10\n"
- "fmla v27.4s, v23.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "fmla v28.4s, v1.4s, v0.s[3]\n"
- "fmla v29.4s, v2.4s, v0.s[3]\n"
- "66:" // Width 6: Multiply loop: Main loop skip
- "cbz x20, 68f\n"
- "67:" // Width 6: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q3, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v3.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x10]\n"
- "ldr q5, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v4.4s, v0.s[0]\n"
- "ldr q6, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v5.4s, v0.s[0]\n"
- "ldr q7, [%x[B_ptr], #0x40]\n"
- "ldr q8, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v6.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "sub x20, x20, #0x1\n"
- "fmla v28.4s, v7.4s, v0.s[0]\n"
- "fmla v29.4s, v8.4s, v0.s[0]\n"
- "cbnz x20, 67b\n"
- "68:" // Width 6: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 69f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v16.4s\n"
- "fmin v29.4s, v29.4s, v16.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
- "69:" // Width 6: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "str q25, [%x[output_ptr], #0x10]\n"
- "str q26, [%x[output_ptr], #0x20]\n"
- "str q27, [%x[output_ptr], #0x30]\n"
- "str q28, [%x[output_ptr], #0x40]\n"
- "cmp %x[N], #0x18\n"
- "add %x[output_ptr], %x[output_ptr], #0x50\n"
- "blt 70f\n"
- "str q29, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 72f\n"
- "70:" // Width 6: Partial writeback
- "tbz %x[N], #1, 71f\n"
- "str d29, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 72f\n"
- "st1 { v29.s }[2], [%x[output_ptr]]\n"
- "b 72f\n"
- "71:" // Width 6: Partial direct writeback: partial_1_20
- "tbz %x[N], #0, 72f\n"
- "str s29, [%x[output_ptr], #0x0]\n"
- "72:" // Width 6: Writeback done
- "b 97f\n"
- "73:" // Width 7
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 74f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "ldr q28, [x21, #0x40]\n"
- "ldr q29, [x21, #0x50]\n"
- "ldr q30, [x21, #0x60]\n"
- "add x21, x21, #0x70\n"
- "b 75f\n"
- "74:" // Width 7: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "movi v29.16b, #0x0\n"
- "movi v30.16b, #0x0\n"
- "75:" // Width 7: setup done
- "cmp x20, #0x4\n"
- "blt 78f\n"
- "cmp x20, #0x8\n"
- "blt 77f\n"
- "76:" // Width 7: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "ldr q3, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v3.4s, v0.s[0]\n"
- "ldr q5, [%x[B_ptr], #0x40]\n"
- "ldr q6, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v4.4s, v0.s[0]\n"
- "ldr q7, [%x[B_ptr], #0x60]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v28.4s, v5.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v29.4s, v6.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q8, [%x[B_ptr], #0x0]\n"
- "fmla v30.4s, v7.4s, v0.s[0]\n"
- "ldr q9, [%x[B_ptr], #0x10]\n"
- "ldr q10, [%x[B_ptr], #0x20]\n"
- "fmla v24.4s, v8.4s, v0.s[1]\n"
- "ldr q11, [%x[B_ptr], #0x30]\n"
- "ldr q12, [%x[B_ptr], #0x40]\n"
- "fmla v25.4s, v9.4s, v0.s[1]\n"
- "ldr q13, [%x[B_ptr], #0x50]\n"
- "fmla v26.4s, v10.4s, v0.s[1]\n"
- "ldr q14, [%x[B_ptr], #0x60]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v27.4s, v11.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v12.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q15, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v13.4s, v0.s[1]\n"
- "ldr q16, [%x[B_ptr], #0x10]\n"
- "ldr q17, [%x[B_ptr], #0x20]\n"
- "fmla v30.4s, v14.4s, v0.s[1]\n"
- "ldr q18, [%x[B_ptr], #0x30]\n"
- "fmla v24.4s, v15.4s, v0.s[2]\n"
- "ldr q19, [%x[B_ptr], #0x40]\n"
- "ldr q20, [%x[B_ptr], #0x50]\n"
- "fmla v25.4s, v16.4s, v0.s[2]\n"
- "ldr q21, [%x[B_ptr], #0x60]\n"
- "fmla v26.4s, v17.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v18.4s, v0.s[2]\n"
- "ldr q22, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v19.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q23, [%x[B_ptr], #0x10]\n"
- "fmla v29.4s, v20.4s, v0.s[2]\n"
- "ldr q1, [%x[B_ptr], #0x20]\n"
- "ldr q2, [%x[B_ptr], #0x30]\n"
- "fmla v30.4s, v21.4s, v0.s[2]\n"
- "ldr q3, [%x[B_ptr], #0x40]\n"
- "fmla v24.4s, v22.4s, v0.s[3]\n"
- "ldr q4, [%x[B_ptr], #0x50]\n"
- "ldr q5, [%x[B_ptr], #0x60]\n"
- "fmla v25.4s, v23.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v1.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v2.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add x19, x19, #0x10\n"
- "fmla v28.4s, v3.4s, v0.s[3]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "sub x20, x20, #0x4\n"
- "fmla v29.4s, v4.4s, v0.s[3]\n"
- "cmp x20, #0x8\n"
- "fmla v30.4s, v5.4s, v0.s[3]\n"
- "bge 76b\n"
- "77:" // Width 7: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q6, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [%x[B_ptr], #0x10]\n"
- "ldr q8, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v7.4s, v0.s[0]\n"
- "ldr q9, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v8.4s, v0.s[0]\n"
- "ldr q10, [%x[B_ptr], #0x40]\n"
- "ldr q11, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v9.4s, v0.s[0]\n"
- "ldr q12, [%x[B_ptr], #0x60]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v28.4s, v10.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v29.4s, v11.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q13, [%x[B_ptr], #0x0]\n"
- "fmla v30.4s, v12.4s, v0.s[0]\n"
- "ldr q14, [%x[B_ptr], #0x10]\n"
- "ldr q15, [%x[B_ptr], #0x20]\n"
- "fmla v24.4s, v13.4s, v0.s[1]\n"
- "ldr q16, [%x[B_ptr], #0x30]\n"
- "ldr q17, [%x[B_ptr], #0x40]\n"
- "fmla v25.4s, v14.4s, v0.s[1]\n"
- "ldr q18, [%x[B_ptr], #0x50]\n"
- "fmla v26.4s, v15.4s, v0.s[1]\n"
- "ldr q19, [%x[B_ptr], #0x60]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v27.4s, v16.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v17.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q20, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v18.4s, v0.s[1]\n"
- "ldr q21, [%x[B_ptr], #0x10]\n"
- "ldr q22, [%x[B_ptr], #0x20]\n"
- "fmla v30.4s, v19.4s, v0.s[1]\n"
- "ldr q23, [%x[B_ptr], #0x30]\n"
- "fmla v24.4s, v20.4s, v0.s[2]\n"
- "ldr q1, [%x[B_ptr], #0x40]\n"
- "ldr q2, [%x[B_ptr], #0x50]\n"
- "fmla v25.4s, v21.4s, v0.s[2]\n"
- "ldr q3, [%x[B_ptr], #0x60]\n"
- "fmla v26.4s, v22.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v23.4s, v0.s[2]\n"
- "ldr q4, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v1.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q5, [%x[B_ptr], #0x10]\n"
- "fmla v29.4s, v2.4s, v0.s[2]\n"
- "ldr q6, [%x[B_ptr], #0x20]\n"
- "ldr q7, [%x[B_ptr], #0x30]\n"
- "fmla v30.4s, v3.4s, v0.s[2]\n"
- "ldr q8, [%x[B_ptr], #0x40]\n"
- "fmla v24.4s, v4.4s, v0.s[3]\n"
- "ldr q9, [%x[B_ptr], #0x50]\n"
- "ldr q10, [%x[B_ptr], #0x60]\n"
- "fmla v25.4s, v5.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v6.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v7.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add x19, x19, #0x10\n"
- "fmla v28.4s, v8.4s, v0.s[3]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "fmla v29.4s, v9.4s, v0.s[3]\n"
- "fmla v30.4s, v10.4s, v0.s[3]\n"
- "78:" // Width 7: Multiply loop: Main loop skip
- "cbz x20, 80f\n"
- "79:" // Width 7: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q11, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v11.4s, v0.s[0]\n"
- "ldr q12, [%x[B_ptr], #0x10]\n"
- "ldr q13, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v12.4s, v0.s[0]\n"
- "ldr q14, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v13.4s, v0.s[0]\n"
- "ldr q15, [%x[B_ptr], #0x40]\n"
- "ldr q16, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v14.4s, v0.s[0]\n"
- "ldr q17, [%x[B_ptr], #0x60]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v28.4s, v15.4s, v0.s[0]\n"
- "fmla v29.4s, v16.4s, v0.s[0]\n"
- "sub x20, x20, #0x1\n"
- "fmla v30.4s, v17.4s, v0.s[0]\n"
- "cbnz x20, 79b\n"
- "80:" // Width 7: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 81f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v16.4s\n"
- "fmin v29.4s, v29.4s, v16.4s\n"
- "fmin v30.4s, v30.4s, v16.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
- "fmax v30.4s, v30.4s, v17.4s\n"
- "81:" // Width 7: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "str q25, [%x[output_ptr], #0x10]\n"
- "str q26, [%x[output_ptr], #0x20]\n"
- "str q27, [%x[output_ptr], #0x30]\n"
- "str q28, [%x[output_ptr], #0x40]\n"
- "str q29, [%x[output_ptr], #0x50]\n"
- "cmp %x[N], #0x1c\n"
- "add %x[output_ptr], %x[output_ptr], #0x60\n"
- "blt 82f\n"
- "str q30, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 84f\n"
- "82:" // Width 7: Partial writeback
- "tbz %x[N], #1, 83f\n"
- "str d30, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 84f\n"
- "st1 { v30.s }[2], [%x[output_ptr]]\n"
- "b 84f\n"
- "83:" // Width 7: Partial direct writeback: partial_1_24
- "tbz %x[N], #0, 84f\n"
- "str s30, [%x[output_ptr], #0x0]\n"
- "84:" // Width 7: Writeback done
- "b 97f\n"
- "85:" // Width 8
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 86f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "ldr q28, [x21, #0x40]\n"
- "ldr q29, [x21, #0x50]\n"
- "ldr q30, [x21, #0x60]\n"
- "ldr q31, [x21, #0x70]\n"
- "add x21, x21, #0x80\n"
- "b 87f\n"
- "86:" // Width 8: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "movi v29.16b, #0x0\n"
- "movi v30.16b, #0x0\n"
- "movi v31.16b, #0x0\n"
- "87:" // Width 8: setup done
- "cmp x20, #0x4\n"
- "blt 90f\n"
- "cmp x20, #0x8\n"
- "blt 89f\n"
- "88:" // Width 8: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "ldr q3, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v3.4s, v0.s[0]\n"
- "ldr q5, [%x[B_ptr], #0x40]\n"
- "ldr q6, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v4.4s, v0.s[0]\n"
- "ldr q7, [%x[B_ptr], #0x60]\n"
- "ldr q8, [%x[B_ptr], #0x70]\n"
- "fmla v28.4s, v5.4s, v0.s[0]\n"
- "fmla v29.4s, v6.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v30.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q9, [%x[B_ptr], #0x0]\n"
- "fmla v31.4s, v8.4s, v0.s[0]\n"
- "ldr q10, [%x[B_ptr], #0x10]\n"
- "ldr q11, [%x[B_ptr], #0x20]\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
- "ldr q12, [%x[B_ptr], #0x30]\n"
- "ldr q13, [%x[B_ptr], #0x40]\n"
- "fmla v25.4s, v10.4s, v0.s[1]\n"
- "fmla v26.4s, v11.4s, v0.s[1]\n"
- "ldr q14, [%x[B_ptr], #0x50]\n"
- "ldr q15, [%x[B_ptr], #0x60]\n"
- "fmla v27.4s, v12.4s, v0.s[1]\n"
- "ldr q16, [%x[B_ptr], #0x70]\n"
- "fmla v28.4s, v13.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v29.4s, v14.4s, v0.s[1]\n"
- "ldr q17, [%x[B_ptr], #0x0]\n"
- "fmla v30.4s, v15.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q18, [%x[B_ptr], #0x10]\n"
- "fmla v31.4s, v16.4s, v0.s[1]\n"
- "ldr q19, [%x[B_ptr], #0x20]\n"
- "ldr q20, [%x[B_ptr], #0x30]\n"
- "fmla v24.4s, v17.4s, v0.s[2]\n"
- "ldr q21, [%x[B_ptr], #0x40]\n"
- "ldr q22, [%x[B_ptr], #0x50]\n"
- "fmla v25.4s, v18.4s, v0.s[2]\n"
- "ldr q23, [%x[B_ptr], #0x60]\n"
- "fmla v26.4s, v19.4s, v0.s[2]\n"
- "ldr q1, [%x[B_ptr], #0x70]\n"
- "fmla v27.4s, v20.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v21.4s, v0.s[2]\n"
- "ldr q2, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v22.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q3, [%x[B_ptr], #0x10]\n"
- "fmla v30.4s, v23.4s, v0.s[2]\n"
- "ldr q4, [%x[B_ptr], #0x20]\n"
- "ldr q5, [%x[B_ptr], #0x30]\n"
- "fmla v31.4s, v1.4s, v0.s[2]\n"
- "ldr q6, [%x[B_ptr], #0x40]\n"
- "fmla v24.4s, v2.4s, v0.s[3]\n"
- "ldr q7, [%x[B_ptr], #0x50]\n"
- "ldr q8, [%x[B_ptr], #0x60]\n"
- "fmla v25.4s, v3.4s, v0.s[3]\n"
- "ldr q9, [%x[B_ptr], #0x70]\n"
- "fmla v26.4s, v4.4s, v0.s[3]\n"
- "fmla v27.4s, v5.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v6.4s, v0.s[3]\n"
- "add x19, x19, #0x10\n"
- "fmla v29.4s, v7.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x20, x20, #0x4\n"
- "fmla v30.4s, v8.4s, v0.s[3]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "cmp x20, #0x8\n"
- "fmla v31.4s, v9.4s, v0.s[3]\n"
- "bge 88b\n"
- "89:" // Width 8: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q10, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v10.4s, v0.s[0]\n"
- "ldr q11, [%x[B_ptr], #0x10]\n"
- "ldr q12, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v11.4s, v0.s[0]\n"
- "ldr q13, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v12.4s, v0.s[0]\n"
- "ldr q14, [%x[B_ptr], #0x40]\n"
- "ldr q15, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v13.4s, v0.s[0]\n"
- "ldr q16, [%x[B_ptr], #0x60]\n"
- "ldr q17, [%x[B_ptr], #0x70]\n"
- "fmla v28.4s, v14.4s, v0.s[0]\n"
- "fmla v29.4s, v15.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v30.4s, v16.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q18, [%x[B_ptr], #0x0]\n"
- "fmla v31.4s, v17.4s, v0.s[0]\n"
- "ldr q19, [%x[B_ptr], #0x10]\n"
- "ldr q20, [%x[B_ptr], #0x20]\n"
- "fmla v24.4s, v18.4s, v0.s[1]\n"
- "ldr q21, [%x[B_ptr], #0x30]\n"
- "ldr q22, [%x[B_ptr], #0x40]\n"
- "fmla v25.4s, v19.4s, v0.s[1]\n"
- "fmla v26.4s, v20.4s, v0.s[1]\n"
- "ldr q23, [%x[B_ptr], #0x50]\n"
- "ldr q1, [%x[B_ptr], #0x60]\n"
- "fmla v27.4s, v21.4s, v0.s[1]\n"
- "ldr q2, [%x[B_ptr], #0x70]\n"
- "fmla v28.4s, v22.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v29.4s, v23.4s, v0.s[1]\n"
- "ldr q3, [%x[B_ptr], #0x0]\n"
- "fmla v30.4s, v1.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q4, [%x[B_ptr], #0x10]\n"
- "fmla v31.4s, v2.4s, v0.s[1]\n"
- "ldr q5, [%x[B_ptr], #0x20]\n"
- "ldr q6, [%x[B_ptr], #0x30]\n"
- "fmla v24.4s, v3.4s, v0.s[2]\n"
- "ldr q7, [%x[B_ptr], #0x40]\n"
- "ldr q8, [%x[B_ptr], #0x50]\n"
- "fmla v25.4s, v4.4s, v0.s[2]\n"
- "ldr q9, [%x[B_ptr], #0x60]\n"
- "fmla v26.4s, v5.4s, v0.s[2]\n"
- "ldr q10, [%x[B_ptr], #0x70]\n"
- "fmla v27.4s, v6.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v7.4s, v0.s[2]\n"
- "ldr q11, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v8.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q12, [%x[B_ptr], #0x10]\n"
- "fmla v30.4s, v9.4s, v0.s[2]\n"
- "ldr q13, [%x[B_ptr], #0x20]\n"
- "ldr q14, [%x[B_ptr], #0x30]\n"
- "fmla v31.4s, v10.4s, v0.s[2]\n"
- "ldr q15, [%x[B_ptr], #0x40]\n"
- "fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr q16, [%x[B_ptr], #0x50]\n"
- "ldr q17, [%x[B_ptr], #0x60]\n"
- "fmla v25.4s, v12.4s, v0.s[3]\n"
- "ldr q18, [%x[B_ptr], #0x70]\n"
- "fmla v26.4s, v13.4s, v0.s[3]\n"
- "fmla v27.4s, v14.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v15.4s, v0.s[3]\n"
- "add x19, x19, #0x10\n"
- "fmla v29.4s, v16.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla v30.4s, v17.4s, v0.s[3]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "fmla v31.4s, v18.4s, v0.s[3]\n"
- "90:" // Width 8: Multiply loop: Main loop skip
- "cbz x20, 92f\n"
- "91:" // Width 8: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q19, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v19.4s, v0.s[0]\n"
- "ldr q20, [%x[B_ptr], #0x10]\n"
- "ldr q21, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v20.4s, v0.s[0]\n"
- "ldr q22, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v21.4s, v0.s[0]\n"
- "ldr q23, [%x[B_ptr], #0x40]\n"
- "ldr q1, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v22.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x60]\n"
- "ldr q3, [%x[B_ptr], #0x70]\n"
- "fmla v28.4s, v23.4s, v0.s[0]\n"
- "fmla v29.4s, v1.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "sub x20, x20, #0x1\n"
- "fmla v30.4s, v2.4s, v0.s[0]\n"
- "fmla v31.4s, v3.4s, v0.s[0]\n"
- "cbnz x20, 91b\n"
- "92:" // Width 8: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 93f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v16.4s\n"
- "fmin v29.4s, v29.4s, v16.4s\n"
- "fmin v30.4s, v30.4s, v16.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
- "fmax v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v16.4s\n"
- "fmax v31.4s, v31.4s, v17.4s\n"
- "93:" // Width 8: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "str q25, [%x[output_ptr], #0x10]\n"
- "str q26, [%x[output_ptr], #0x20]\n"
- "str q27, [%x[output_ptr], #0x30]\n"
- "str q28, [%x[output_ptr], #0x40]\n"
- "str q29, [%x[output_ptr], #0x50]\n"
- "str q30, [%x[output_ptr], #0x60]\n"
- "cmp %x[N], #0x20\n"
- "add %x[output_ptr], %x[output_ptr], #0x70\n"
- "blt 94f\n"
- "str q31, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 96f\n"
- "94:" // Width 8: Partial writeback
- "tbz %x[N], #1, 95f\n"
- "str d31, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 96f\n"
- "st1 { v31.s }[2], [%x[output_ptr]]\n"
- "b 96f\n"
- "95:" // Width 8: Partial direct writeback: partial_1_28
- "tbz %x[N], #0, 96f\n"
- "str s31, [%x[output_ptr], #0x0]\n"
- "96:" // Width 8: Writeback done
- "subs x22, x22, #0x8\n"
- "sub %x[N], %x[N], #0x20\n"
- "bgt 1b\n"
- "97:" // Exit
-
- : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
- : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
- );
-}
-
-} // namespace arm_gemm
-
-#endif
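For reference, the deleted assembly above accumulates each output block with lane-broadcast FMLAs against B panels that advance by 0x80 bytes (32 floats) per K step, then applies the min/max activation clamp before writeback. A minimal intrinsics sketch of one 4-float output block, assuming the same pre-arranged 32-wide B layout (the real kernel unrolls K by four, prefetches, loads bias, and handles up to eight blocks plus partial writebacks):

    #include <arm_neon.h>
    #include <cstddef>

    // Sketch only: one 4-column slice of the deleted a64_gemv_fp32_mla_32 kernel.
    static void gemv_block4_sketch(const float *A, const float *B, float *y,
                                   size_t K, float minval, float maxval)
    {
        float32x4_t acc = vdupq_n_f32(0.0f);        // "movi v24.16b, #0x0"
        for (size_t k = 0; k < K; k++) {
            float32x4_t b = vld1q_f32(B);           // "ldr q1, [B_ptr]"
            acc = vfmaq_n_f32(acc, b, A[k]);        // "fmla v24.4s, v1.4s, v0.s[i]"
            B += 32;                                // "add B_ptr, B_ptr, #0x80"
        }
        acc = vminq_f32(acc, vdupq_n_f32(maxval));  // activation clamp
        acc = vmaxq_f32(acc, vdupq_n_f32(minval));
        vst1q_f32(y, acc);                          // "str q24, [output_ptr]"
    }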
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
index cccedc6b9c..d9668aae02 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(ARM_COMPUTE_ENABLE_FP16))
#include "../performance_parameters.hpp"
#include "../std_transforms_fixed.hpp"
@@ -62,6 +62,7 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 24> transforms = {};
+ template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
switch (ci->get_cpu_model()) {
case CPUModel::A55r1:
@@ -88,4 +89,4 @@ public:
} // namespace arm_gemm
-#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // __aarch64__ && (FP16_KERNELS || ARM_COMPUTE_ENABLE_FP16)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
index 29cdd33893..e5728beba8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
@@ -71,10 +71,6 @@ void a64_hgemm_asimd_8x24_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp
register float16x8_t b2 asm("v6");
__asm __volatile (
- // Enable FP16 instruction support (but only if it's not already on).
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ".arch armv8.2-a+fp16\n"
-#endif
// Initialize result registers, load initial operands, prime prefetches.
"movi v8.8h, #0x0\n"
"ldr %d[a0], [%[a_ptr]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
index c9c48dd1c0..23b87fa192 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
@@ -66,10 +66,6 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
register float16x8_t b2a asm("v7");
__asm __volatile (
- // Enable FP16 instruction support (but only if it's not already on).
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ".arch armv8.2-a+fp16\n"
-#endif
// Initialize result registers, load initial operands, prime prefetches.
"movi v8.8h, #0x0\n"
"ldr %q[a0], [%[a_ptr]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
index a6d2405e7e..b47fa6a2d7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
@@ -63,10 +63,6 @@ void a64_hgemm_asimd_8x24_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16
register float16x8_t b2 asm("v4");
__asm __volatile (
- // Enable FP16 instruction support (but only if it's not already on).
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ".arch armv8.2-a+fp16\n"
-#endif
// Initialize result registers, load initial operands, prime prefetches.
"movi v8.8h, #0x0\n"
"ldr %q[a0], [%[a_ptr]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
index fca96f6028..f1427669ea 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,22 +10,23 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
#include "../std_transforms_fixed.hpp"
#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -43,7 +44,8 @@ void a64_hybrid_bf16fp32_dot_6x16( ARGLIST );
class cls_a64_hybrid_bf16fp32_dot_6x16
{
public:
- typedef bfloat16 operand_type;
+ typedef bfloat16 lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -69,7 +71,23 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 6, 16, 2> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 2> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 15.83 };
+ case CPUModel::A510:
+ return { 7.28 };
+ case CPUModel::V1:
+ return { 27.34 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=a64_hybrid_bf16fp32_dot_6x16;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
index afb06dedea..fc323ea4fc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -93,7 +93,6 @@ void a64_hybrid_bf16fp32_dot_6x16 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 176f\n"
@@ -103,82 +102,82 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"cmp %x[M], #0x2\n"
"bgt 71f\n"
"beq 36f\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x9, %x[bias]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "cbz x9, 3f\n"
- "ldr q8, [x9, #0x0]\n"
- "ldr q9, [x9, #0x10]\n"
- "ldr q10, [x9, #0x20]\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
+ "cbz x12, 3f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
"b 14f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 13f\n"
"cmp x11, #0x10\n"
"bge 12f\n"
"tbz x11, #3, 7f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
"tbz x11, #2, 5f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
"tbz x11, #1, 4f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x28], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "mov x20, #0x38\n"
"tbz x11, #0, 11f\n"
- "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v11.s }[2], [x9]\n"
"b 11f\n"
"4:" // Height 1: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 11f\n"
- "ldr s11, [x28, #0x0]\n"
+ "ldr s11, [x9, #0x0]\n"
"b 11f\n"
"5:" // Height 1: Partial accumulate: partial_2_8
"tbz x11, #1, 6f\n"
- "ldr d10, [x28], #0x8\n"
- "mov x19, #0x28\n"
+ "ldr d10, [x9], #0x8\n"
+ "mov x20, #0x28\n"
"tbz x11, #0, 11f\n"
- "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v10.s }[2], [x9]\n"
"b 11f\n"
"6:" // Height 1: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 11f\n"
- "ldr s10, [x28, #0x0]\n"
+ "ldr s10, [x9, #0x0]\n"
"b 11f\n"
"7:" // Height 1: Partial accumulate: partial_4_0
"tbz x11, #2, 9f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
"tbz x11, #1, 8f\n"
- "ldr d9, [x28], #0x8\n"
- "mov x19, #0x18\n"
+ "ldr d9, [x9], #0x8\n"
+ "mov x20, #0x18\n"
"tbz x11, #0, 11f\n"
- "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v9.s }[2], [x9]\n"
"b 11f\n"
"8:" // Height 1: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 11f\n"
- "ldr s9, [x28, #0x0]\n"
+ "ldr s9, [x9, #0x0]\n"
"b 11f\n"
"9:" // Height 1: Partial accumulate: partial_2_0
"tbz x11, #1, 10f\n"
- "ldr d8, [x28], #0x8\n"
- "mov x19, #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "mov x20, #0x8\n"
"tbz x11, #0, 11f\n"
- "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v8.s }[2], [x9]\n"
"b 11f\n"
"10:" // Height 1: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr s8, [x9, #0x0]\n"
+ "mov x20, #0x0\n"
"11:" // Height 1: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 14f\n"
"12:" // Height 1: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
"b 14f\n"
"13:" // Height 1: no accumulate
"movi v8.16b, #0x0\n"
@@ -186,316 +185,316 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
"14:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 17f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 17f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
"b 17f\n"
"16:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x26, %x[input_ptr]\n"
"17:" // Height 1: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"blt 20f\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 19f\n"
"18:" // Height 1: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "sub x26, x26, #0x8\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "cmp x26, #0x10\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ ".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x4f60f22a // bfdot v10.4s, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4f60f20b // bfdot v11.4s, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f40fa28 // bfdot v8.4s, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f40fa09 // bfdot v9.4s, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f40fa2a // bfdot v10.4s, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f40fa0b // bfdot v11.4s, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4f60fa28 // bfdot v8.4s, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n"
+ ".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "cmp x27, #0x10\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 18b\n"
"19:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x8\n"
+ "ldr q17, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ ".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x4f60f22a // bfdot v10.4s, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4f60f20b // bfdot v11.4s, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f40fa28 // bfdot v8.4s, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f40fa09 // bfdot v9.4s, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f40fa2a // bfdot v10.4s, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f40fa0b // bfdot v11.4s, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4f60fa28 // bfdot v8.4s, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n"
+ ".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
"20:" // Height 1: Multiply loop: Main loop skip
- "cbz x26, 24f\n"
- "cmp x26, #0x2\n"
+ "cbz x27, 24f\n"
+ "cmp x27, #0x2\n"
"blt 22f\n"
"21:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x2\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "cmp x26, #0x2\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr q16, [x10, #0x0]\n"
+ ".inst 0x4f52f208 // bfdot v8.4s, v16.8h, v18.h[0]\n"
+ "sub x27, x27, #0x2\n"
+ "ldr q16, [x10, #0x10]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4f52f209 // bfdot v9.4s, v16.8h, v18.h[0]\n"
+ "cmp x27, #0x2\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f52f22a // bfdot v10.4s, v17.8h, v18.h[0]\n"
+ ".inst 0x4f52f20b // bfdot v11.4s, v16.8h, v18.h[0]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
"bge 21b\n"
- "cbz x26, 24f\n"
"22:" // Height 1: Multiply loop: Skip odd blocks
- "ldr h0, [x25, #0x0]\n"
+ "cbz x27, 24f\n"
+ "ldr h0, [x26, #0x0]\n"
"23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x4f40f228 // bfdot v8.4s, v17.8h, v0.h[0]\n"
+ ".inst 0x4f40f209 // bfdot v9.4s, v16.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
+ ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
"24:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 15b\n"
- "prfm pstl1keep, [x28, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"tbz %x[flags], #1, 25f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
"25:" // Height 1: No activation
"cmp x11, #0x10\n"
"bge 34f\n"
"tbz x11, #3, 29f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
"tbz x11, #2, 27f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
"tbz x11, #1, 26f\n"
- "str d11, [x28], #0x8\n"
+ "str d11, [x9], #0x8\n"
"tbz x11, #0, 33f\n"
- "st1 { v11.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x9]\n"
"b 33f\n"
"26:" // Height 1: Partial direct writeback: partial_1_12
"tbz x11, #0, 33f\n"
- "str s11, [x28, #0x0]\n"
+ "str s11, [x9, #0x0]\n"
"b 33f\n"
"27:" // Height 1: Partial direct writeback: partial_2_8
"tbz x11, #1, 28f\n"
- "str d10, [x28], #0x8\n"
+ "str d10, [x9], #0x8\n"
"tbz x11, #0, 33f\n"
- "st1 { v10.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x9]\n"
"b 33f\n"
"28:" // Height 1: Partial direct writeback: partial_1_8
"tbz x11, #0, 33f\n"
- "str s10, [x28, #0x0]\n"
+ "str s10, [x9, #0x0]\n"
"b 33f\n"
"29:" // Height 1: Partial direct writeback: partial_4_0
"tbz x11, #2, 31f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
"tbz x11, #1, 30f\n"
- "str d9, [x28], #0x8\n"
+ "str d9, [x9], #0x8\n"
"tbz x11, #0, 33f\n"
- "st1 { v9.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x9]\n"
"b 33f\n"
"30:" // Height 1: Partial direct writeback: partial_1_4
"tbz x11, #0, 33f\n"
- "str s9, [x28, #0x0]\n"
+ "str s9, [x9, #0x0]\n"
"b 33f\n"
"31:" // Height 1: Partial direct writeback: partial_2_0
"tbz x11, #1, 32f\n"
- "str d8, [x28], #0x8\n"
+ "str d8, [x9], #0x8\n"
"tbz x11, #0, 33f\n"
- "st1 { v8.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x9]\n"
"b 33f\n"
"32:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
"33:" // Height 1: Partial direct writeback: Done
"b 35f\n"
"34:" // Height 1: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"35:" // Height 1: Writeback done
"subs x11, x11, #0x10\n"
"bgt 2b\n"
"b 212f\n"
"36:" // Height 2
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"37:" // Height 2: Column loop
- "cbz x9, 38f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 38f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "ldr q10, [x9, #0x20]\n"
"mov v13.16b, v9.16b\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "add x12, x12, #0x40\n"
"b 49f\n"
"38:" // Height 2: no bias
"tbz %x[flags], #0, 48f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x11, #0x10\n"
- "add x24, x28, x19, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"bge 47f\n"
"tbz x11, #3, 42f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
"tbz x11, #2, 40f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
"tbz x11, #1, 39f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
"tbz x11, #0, 46f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
"b 46f\n"
"39:" // Height 2: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 46f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
"b 46f\n"
"40:" // Height 2: Partial accumulate: partial_2_8
"tbz x11, #1, 41f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x19, #0x28\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
"tbz x11, #0, 46f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
"b 46f\n"
"41:" // Height 2: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 46f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
"b 46f\n"
"42:" // Height 2: Partial accumulate: partial_4_0
"tbz x11, #2, 44f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"tbz x11, #1, 43f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
"tbz x11, #0, 46f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
"b 46f\n"
"43:" // Height 2: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 46f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
"b 46f\n"
"44:" // Height 2: Partial accumulate: partial_2_0
"tbz x11, #1, 45f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x19, #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
"tbz x11, #0, 46f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
"b 46f\n"
"45:" // Height 2: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s12, [x24, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
"46:" // Height 2: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 49f\n"
"47:" // Height 2: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
"b 49f\n"
"48:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
@@ -507,423 +506,423 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"movi v14.16b, #0x0\n"
"movi v15.16b, #0x0\n"
"49:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"50:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 51f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x27, 52f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 52f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
"b 52f\n"
"51:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
"52:" // Height 2: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"blt 55f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 54f\n"
"53:" // Height 2: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x24, x24, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
+ "sub x27, x27, #0x8\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x8\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "cmp x26, #0x10\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
+ ".inst 0x4f41f22e // bfdot v14.4s, v17.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
+ ".inst 0x4f41f20f // bfdot v15.4s, v16.8h, v1.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n"
+ ".inst 0x4f61f22c // bfdot v12.4s, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n"
+ ".inst 0x4f61f20d // bfdot v13.4s, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f60f22a // bfdot v10.4s, v17.8h, v0.h[1]\n"
+ ".inst 0x4f61f22e // bfdot v14.4s, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4f60f20b // bfdot v11.4s, v16.8h, v0.h[1]\n"
+ ".inst 0x4f61f20f // bfdot v15.4s, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f40fa28 // bfdot v8.4s, v17.8h, v0.h[2]\n"
+ ".inst 0x4f41fa2c // bfdot v12.4s, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f40fa09 // bfdot v9.4s, v16.8h, v0.h[2]\n"
+ ".inst 0x4f41fa0d // bfdot v13.4s, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f40fa2a // bfdot v10.4s, v17.8h, v0.h[2]\n"
+ ".inst 0x4f41fa2e // bfdot v14.4s, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f40fa0b // bfdot v11.4s, v16.8h, v0.h[2]\n"
+ ".inst 0x4f41fa0f // bfdot v15.4s, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4f60fa28 // bfdot v8.4s, v17.8h, v0.h[3]\n"
+ ".inst 0x4f61fa2c // bfdot v12.4s, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n"
+ ".inst 0x4f61fa0d // bfdot v13.4s, v16.8h, v1.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n"
+ ".inst 0x4f61fa2e // bfdot v14.4s, v17.8h, v1.h[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
- "ldr q1, [x24, #0x0]\n"
+ ".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x4f61fa0f // bfdot v15.4s, v16.8h, v1.h[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 53b\n"
"54:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x8\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x25, x25, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
+ ".inst 0x4f41f22e // bfdot v14.4s, v17.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
+ ".inst 0x4f41f20f // bfdot v15.4s, v16.8h, v1.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n"
+ ".inst 0x4f61f22c // bfdot v12.4s, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n"
+ ".inst 0x4f61f20d // bfdot v13.4s, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x4f60f22a // bfdot v10.4s, v17.8h, v0.h[1]\n"
+ ".inst 0x4f61f22e // bfdot v14.4s, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4f60f20b // bfdot v11.4s, v16.8h, v0.h[1]\n"
+ ".inst 0x4f61f20f // bfdot v15.4s, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f40fa28 // bfdot v8.4s, v17.8h, v0.h[2]\n"
+ ".inst 0x4f41fa2c // bfdot v12.4s, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f40fa09 // bfdot v9.4s, v16.8h, v0.h[2]\n"
+ ".inst 0x4f41fa0d // bfdot v13.4s, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f40fa2a // bfdot v10.4s, v17.8h, v0.h[2]\n"
+ ".inst 0x4f41fa2e // bfdot v14.4s, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f40fa0b // bfdot v11.4s, v16.8h, v0.h[2]\n"
+ ".inst 0x4f41fa0f // bfdot v15.4s, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4f60fa28 // bfdot v8.4s, v17.8h, v0.h[3]\n"
+ ".inst 0x4f61fa2c // bfdot v12.4s, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n"
+ ".inst 0x4f61fa0d // bfdot v13.4s, v16.8h, v1.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n"
+ ".inst 0x4f61fa2e // bfdot v14.4s, v17.8h, v1.h[3]\n"
+ ".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n"
+ ".inst 0x4f61fa0f // bfdot v15.4s, v16.8h, v1.h[3]\n"
"55:" // Height 2: Multiply loop: Main loop skip
- "cbz x26, 59f\n"
- "cmp x26, #0x2\n"
+ "cbz x27, 59f\n"
+ "cmp x27, #0x2\n"
"blt 57f\n"
"56:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x2\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x2\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "sub x27, x27, #0x2\n"
+ "cmp x27, #0x2\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x4f53f228 // bfdot v8.4s, v17.8h, v19.h[0]\n"
+ ".inst 0x4f52f22c // bfdot v12.4s, v17.8h, v18.h[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4f53f209 // bfdot v9.4s, v16.8h, v19.h[0]\n"
+ ".inst 0x4f52f20d // bfdot v13.4s, v16.8h, v18.h[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f53f22a // bfdot v10.4s, v17.8h, v19.h[0]\n"
+ ".inst 0x4f52f22e // bfdot v14.4s, v17.8h, v18.h[0]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f53f20b // bfdot v11.4s, v16.8h, v19.h[0]\n"
+ ".inst 0x4f52f20f // bfdot v15.4s, v16.8h, v18.h[0]\n"
"bge 56b\n"
- "cbz x26, 59f\n"
"57:" // Height 2: Multiply loop: Skip odd blocks
- "ldr h0, [x25, #0x0]\n"
- "ldr h1, [x24, #0x0]\n"
+ "cbz x27, 59f\n"
+ "ldr h0, [x26, #0x0]\n"
+ "ldr h1, [x25, #0x0]\n"
"58:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x4f40f228 // bfdot v8.4s, v17.8h, v0.h[0]\n"
+ ".inst 0x4f41f22c // bfdot v12.4s, v17.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4f40f209 // bfdot v9.4s, v16.8h, v0.h[0]\n"
+ ".inst 0x4f41f20d // bfdot v13.4s, v16.8h, v1.h[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n"
+ ".inst 0x4f41f22e // bfdot v14.4s, v17.8h, v1.h[0]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n"
+ ".inst 0x4f41f20f // bfdot v15.4s, v16.8h, v1.h[0]\n"
"59:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 50b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 60f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmin v12.4s, v12.4s, v17.4s\n"
+ "fmin v13.4s, v13.4s, v17.4s\n"
+ "fmin v14.4s, v14.4s, v17.4s\n"
+ "fmin v15.4s, v15.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
+ "fmax v12.4s, v12.4s, v16.4s\n"
+ "fmax v13.4s, v13.4s, v16.4s\n"
+ "fmax v14.4s, v14.4s, v16.4s\n"
+ "fmax v15.4s, v15.4s, v16.4s\n"
"60:" // Height 2: No activation
"cmp x11, #0x10\n"
"bge 69f\n"
"tbz x11, #3, 64f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
"tbz x11, #2, 62f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
"tbz x11, #1, 61f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
"tbz x11, #0, 68f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x24]\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x25]\n"
"b 68f\n"
"61:" // Height 2: Partial direct writeback: partial_1_12
"tbz x11, #0, 68f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x24, #0x0]\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
"b 68f\n"
"62:" // Height 2: Partial direct writeback: partial_2_8
"tbz x11, #1, 63f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
"tbz x11, #0, 68f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x24]\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x25]\n"
"b 68f\n"
"63:" // Height 2: Partial direct writeback: partial_1_8
"tbz x11, #0, 68f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x24, #0x0]\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
"b 68f\n"
"64:" // Height 2: Partial direct writeback: partial_4_0
"tbz x11, #2, 66f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
"tbz x11, #1, 65f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
"tbz x11, #0, 68f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x24]\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x25]\n"
"b 68f\n"
"65:" // Height 2: Partial direct writeback: partial_1_4
"tbz x11, #0, 68f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x24, #0x0]\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
"b 68f\n"
"66:" // Height 2: Partial direct writeback: partial_2_0
"tbz x11, #1, 67f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
"tbz x11, #0, 68f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x24]\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x25]\n"
"b 68f\n"
"67:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x24, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
"68:" // Height 2: Partial direct writeback: Done
"b 70f\n"
"69:" // Height 2: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
"70:" // Height 2: Writeback done
"subs x11, x11, #0x10\n"
"bgt 37b\n"
"b 212f\n"
"71:" // Height 3
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"72:" // Height 3: Column loop
- "cbz x9, 73f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 73f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "mov v16.16b, v8.16b\n"
- "ldr q10, [x9, #0x20]\n"
- "ldr q11, [x9, #0x30]\n"
"mov v13.16b, v9.16b\n"
- "add x9, x9, #0x40\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"b 84f\n"
"73:" // Height 3: no bias
"tbz %x[flags], #0, 83f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"cmp x11, #0x10\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"bge 82f\n"
"tbz x11, #3, 77f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"tbz x11, #2, 75f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
"tbz x11, #1, 74f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d19, [x23], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d19, [x24], #0x8\n"
"tbz x11, #0, 81f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
"b 81f\n"
"74:" // Height 3: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 81f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
"b 81f\n"
"75:" // Height 3: Partial accumulate: partial_2_8
"tbz x11, #1, 76f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x19, #0x28\n"
- "ldr d18, [x23], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
"tbz x11, #0, 81f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
"b 81f\n"
"76:" // Height 3: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 81f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
"b 81f\n"
"77:" // Height 3: Partial accumulate: partial_4_0
"tbz x11, #2, 79f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
"tbz x11, #1, 78f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d17, [x24], #0x8\n"
"tbz x11, #0, 81f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
"b 81f\n"
"78:" // Height 3: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 81f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s17, [x23, #0x0]\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
"b 81f\n"
"79:" // Height 3: Partial accumulate: partial_2_0
"tbz x11, #1, 80f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d16, [x23], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
"tbz x11, #0, 81f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
"b 81f\n"
"80:" // Height 3: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s12, [x24, #0x0]\n"
- "ldr s16, [x23, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s16, [x24, #0x0]\n"
"81:" // Height 3: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 84f\n"
"82:" // Height 3: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
"b 84f\n"
"83:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
@@ -939,529 +938,529 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"movi v18.16b, #0x0\n"
"movi v19.16b, #0x0\n"
"84:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"85:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 86f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "cbnz x27, 87f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 87f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
"b 87f\n"
"86:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"87:" // Height 3: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"blt 90f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x10\n"
- "ldr q2, [x23, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q2, [x24, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 89f\n"
"88:" // Height 3: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "ldr q21, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x26, x26, #0x8\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "cmp x26, #0x10\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f40f2aa // bfdot v10.4s, v21.8h, v0.h[0]\n"
+ ".inst 0x4f41f2ae // bfdot v14.4s, v21.8h, v1.h[0]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f42f2b2 // bfdot v18.4s, v21.8h, v2.h[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ ".inst 0x4f40f28b // bfdot v11.4s, v20.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f41f28f // bfdot v15.4s, v20.8h, v1.h[0]\n"
+ ".inst 0x4f42f293 // bfdot v19.4s, v20.8h, v2.h[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f60f2a8 // bfdot v8.4s, v21.8h, v0.h[1]\n"
+ ".inst 0x4f61f2ac // bfdot v12.4s, v21.8h, v1.h[1]\n"
+ ".inst 0x4f62f2b0 // bfdot v16.4s, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ ".inst 0x4f60f289 // bfdot v9.4s, v20.8h, v0.h[1]\n"
+ ".inst 0x4f61f28d // bfdot v13.4s, v20.8h, v1.h[1]\n"
+ ".inst 0x4f62f291 // bfdot v17.4s, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ ".inst 0x4f60f2aa // bfdot v10.4s, v21.8h, v0.h[1]\n"
+ ".inst 0x4f61f2ae // bfdot v14.4s, v21.8h, v1.h[1]\n"
+ ".inst 0x4f62f2b2 // bfdot v18.4s, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ ".inst 0x4f60f28b // bfdot v11.4s, v20.8h, v0.h[1]\n"
+ ".inst 0x4f61f28f // bfdot v15.4s, v20.8h, v1.h[1]\n"
+ ".inst 0x4f62f293 // bfdot v19.4s, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ ".inst 0x4f40faa8 // bfdot v8.4s, v21.8h, v0.h[2]\n"
+ ".inst 0x4f41faac // bfdot v12.4s, v21.8h, v1.h[2]\n"
+ ".inst 0x4f42fab0 // bfdot v16.4s, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ ".inst 0x4f40fa89 // bfdot v9.4s, v20.8h, v0.h[2]\n"
+ ".inst 0x4f41fa8d // bfdot v13.4s, v20.8h, v1.h[2]\n"
+ ".inst 0x4f42fa91 // bfdot v17.4s, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ ".inst 0x4f40faaa // bfdot v10.4s, v21.8h, v0.h[2]\n"
+ ".inst 0x4f41faae // bfdot v14.4s, v21.8h, v1.h[2]\n"
+ ".inst 0x4f42fab2 // bfdot v18.4s, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ ".inst 0x4f40fa8b // bfdot v11.4s, v20.8h, v0.h[2]\n"
+ ".inst 0x4f41fa8f // bfdot v15.4s, v20.8h, v1.h[2]\n"
+ ".inst 0x4f42fa93 // bfdot v19.4s, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ ".inst 0x4f60faa8 // bfdot v8.4s, v21.8h, v0.h[3]\n"
+ ".inst 0x4f61faac // bfdot v12.4s, v21.8h, v1.h[3]\n"
+ ".inst 0x4f62fab0 // bfdot v16.4s, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ ".inst 0x4f60fa89 // bfdot v9.4s, v20.8h, v0.h[3]\n"
+ ".inst 0x4f61fa8d // bfdot v13.4s, v20.8h, v1.h[3]\n"
+ ".inst 0x4f62fa91 // bfdot v17.4s, v20.8h, v2.h[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f60faaa // bfdot v10.4s, v21.8h, v0.h[3]\n"
+ ".inst 0x4f61faae // bfdot v14.4s, v21.8h, v1.h[3]\n"
+ ".inst 0x4f62fab2 // bfdot v18.4s, v21.8h, v2.h[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
- "ldr q1, [x24, #0x0]\n"
- ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
- "ldr q2, [x23, #0x0]\n"
+ ".inst 0x4f60fa8b // bfdot v11.4s, v20.8h, v0.h[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x4f61fa8f // bfdot v15.4s, v20.8h, v1.h[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4f62fa93 // bfdot v19.4s, v20.8h, v2.h[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 88b\n"
"89:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x8\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "add x24, x24, #0x10\n"
+ "ldr q21, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x4f40f2aa // bfdot v10.4s, v21.8h, v0.h[0]\n"
+ ".inst 0x4f41f2ae // bfdot v14.4s, v21.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f42f2b2 // bfdot v18.4s, v21.8h, v2.h[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ ".inst 0x4f40f28b // bfdot v11.4s, v20.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f41f28f // bfdot v15.4s, v20.8h, v1.h[0]\n"
+ ".inst 0x4f42f293 // bfdot v19.4s, v20.8h, v2.h[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ ".inst 0x4f60f2a8 // bfdot v8.4s, v21.8h, v0.h[1]\n"
+ ".inst 0x4f61f2ac // bfdot v12.4s, v21.8h, v1.h[1]\n"
+ ".inst 0x4f62f2b0 // bfdot v16.4s, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ ".inst 0x4f60f289 // bfdot v9.4s, v20.8h, v0.h[1]\n"
+ ".inst 0x4f61f28d // bfdot v13.4s, v20.8h, v1.h[1]\n"
+ ".inst 0x4f62f291 // bfdot v17.4s, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ ".inst 0x4f60f2aa // bfdot v10.4s, v21.8h, v0.h[1]\n"
+ ".inst 0x4f61f2ae // bfdot v14.4s, v21.8h, v1.h[1]\n"
+ ".inst 0x4f62f2b2 // bfdot v18.4s, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ ".inst 0x4f60f28b // bfdot v11.4s, v20.8h, v0.h[1]\n"
+ ".inst 0x4f61f28f // bfdot v15.4s, v20.8h, v1.h[1]\n"
+ ".inst 0x4f62f293 // bfdot v19.4s, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ ".inst 0x4f40faa8 // bfdot v8.4s, v21.8h, v0.h[2]\n"
+ ".inst 0x4f41faac // bfdot v12.4s, v21.8h, v1.h[2]\n"
+ ".inst 0x4f42fab0 // bfdot v16.4s, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ ".inst 0x4f40fa89 // bfdot v9.4s, v20.8h, v0.h[2]\n"
+ ".inst 0x4f41fa8d // bfdot v13.4s, v20.8h, v1.h[2]\n"
+ ".inst 0x4f42fa91 // bfdot v17.4s, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ ".inst 0x4f40faaa // bfdot v10.4s, v21.8h, v0.h[2]\n"
+ ".inst 0x4f41faae // bfdot v14.4s, v21.8h, v1.h[2]\n"
+ ".inst 0x4f42fab2 // bfdot v18.4s, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ ".inst 0x4f40fa8b // bfdot v11.4s, v20.8h, v0.h[2]\n"
+ ".inst 0x4f41fa8f // bfdot v15.4s, v20.8h, v1.h[2]\n"
+ ".inst 0x4f42fa93 // bfdot v19.4s, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ ".inst 0x4f60faa8 // bfdot v8.4s, v21.8h, v0.h[3]\n"
+ ".inst 0x4f61faac // bfdot v12.4s, v21.8h, v1.h[3]\n"
+ ".inst 0x4f62fab0 // bfdot v16.4s, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ ".inst 0x4f60fa89 // bfdot v9.4s, v20.8h, v0.h[3]\n"
+ ".inst 0x4f61fa8d // bfdot v13.4s, v20.8h, v1.h[3]\n"
+ ".inst 0x4f62fa91 // bfdot v17.4s, v20.8h, v2.h[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f60faaa // bfdot v10.4s, v21.8h, v0.h[3]\n"
+ ".inst 0x4f61faae // bfdot v14.4s, v21.8h, v1.h[3]\n"
+ ".inst 0x4f62fab2 // bfdot v18.4s, v21.8h, v2.h[3]\n"
+ ".inst 0x4f60fa8b // bfdot v11.4s, v20.8h, v0.h[3]\n"
+ ".inst 0x4f61fa8f // bfdot v15.4s, v20.8h, v1.h[3]\n"
+ ".inst 0x4f62fa93 // bfdot v19.4s, v20.8h, v2.h[3]\n"
"90:" // Height 3: Multiply loop: Main loop skip
- "cbz x26, 94f\n"
- "cmp x26, #0x2\n"
+ "cbz x27, 94f\n"
+ "cmp x27, #0x2\n"
"blt 92f\n"
"91:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x2\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x2\n"
- "ldr s2, [x23], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
+ "sub x27, x27, #0x2\n"
+ "cmp x27, #0x2\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr q21, [x10, #0x0]\n"
+ ".inst 0x4f58f2a8 // bfdot v8.4s, v21.8h, v24.h[0]\n"
+ ".inst 0x4f57f2ac // bfdot v12.4s, v21.8h, v23.h[0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ ".inst 0x4f56f2b0 // bfdot v16.4s, v21.8h, v22.h[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ ".inst 0x4f58f289 // bfdot v9.4s, v20.8h, v24.h[0]\n"
+ ".inst 0x4f57f28d // bfdot v13.4s, v20.8h, v23.h[0]\n"
+ ".inst 0x4f56f291 // bfdot v17.4s, v20.8h, v22.h[0]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f58f2aa // bfdot v10.4s, v21.8h, v24.h[0]\n"
+ ".inst 0x4f57f2ae // bfdot v14.4s, v21.8h, v23.h[0]\n"
+ ".inst 0x4f56f2b2 // bfdot v18.4s, v21.8h, v22.h[0]\n"
+ ".inst 0x4f58f28b // bfdot v11.4s, v20.8h, v24.h[0]\n"
+ ".inst 0x4f57f28f // bfdot v15.4s, v20.8h, v23.h[0]\n"
+ ".inst 0x4f56f293 // bfdot v19.4s, v20.8h, v22.h[0]\n"
"bge 91b\n"
- "cbz x26, 94f\n"
"92:" // Height 3: Multiply loop: Skip odd blocks
- "ldr h0, [x25, #0x0]\n"
- "ldr h1, [x24, #0x0]\n"
- "ldr h2, [x23, #0x0]\n"
+ "cbz x27, 94f\n"
+ "ldr h0, [x26, #0x0]\n"
+ "ldr h1, [x25, #0x0]\n"
+ "ldr h2, [x24, #0x0]\n"
"93:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ ".inst 0x4f40f2a8 // bfdot v8.4s, v21.8h, v0.h[0]\n"
+ ".inst 0x4f41f2ac // bfdot v12.4s, v21.8h, v1.h[0]\n"
+ ".inst 0x4f42f2b0 // bfdot v16.4s, v21.8h, v2.h[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ ".inst 0x4f40f289 // bfdot v9.4s, v20.8h, v0.h[0]\n"
+ ".inst 0x4f41f28d // bfdot v13.4s, v20.8h, v1.h[0]\n"
+ ".inst 0x4f42f291 // bfdot v17.4s, v20.8h, v2.h[0]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f40f2aa // bfdot v10.4s, v21.8h, v0.h[0]\n"
+ ".inst 0x4f41f2ae // bfdot v14.4s, v21.8h, v1.h[0]\n"
+ ".inst 0x4f42f2b2 // bfdot v18.4s, v21.8h, v2.h[0]\n"
+ ".inst 0x4f40f28b // bfdot v11.4s, v20.8h, v0.h[0]\n"
+ ".inst 0x4f41f28f // bfdot v15.4s, v20.8h, v1.h[0]\n"
+ ".inst 0x4f42f293 // bfdot v19.4s, v20.8h, v2.h[0]\n"
"94:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 85b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 95f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
- "fmax v16.4s, v16.4s, v1.4s\n"
- "fmax v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v1.4s\n"
- "fmax v19.4s, v19.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v21.4s\n"
+ "fmin v9.4s, v9.4s, v21.4s\n"
+ "fmin v10.4s, v10.4s, v21.4s\n"
+ "fmin v11.4s, v11.4s, v21.4s\n"
+ "fmin v12.4s, v12.4s, v21.4s\n"
+ "fmin v13.4s, v13.4s, v21.4s\n"
+ "fmin v14.4s, v14.4s, v21.4s\n"
+ "fmin v15.4s, v15.4s, v21.4s\n"
+ "fmin v16.4s, v16.4s, v21.4s\n"
+ "fmin v17.4s, v17.4s, v21.4s\n"
+ "fmin v18.4s, v18.4s, v21.4s\n"
+ "fmin v19.4s, v19.4s, v21.4s\n"
+ "fmax v8.4s, v8.4s, v20.4s\n"
+ "fmax v9.4s, v9.4s, v20.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "fmax v13.4s, v13.4s, v20.4s\n"
+ "fmax v14.4s, v14.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v20.4s\n"
+ "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v18.4s, v18.4s, v20.4s\n"
+ "fmax v19.4s, v19.4s, v20.4s\n"
"95:" // Height 3: No activation
"cmp x11, #0x10\n"
"bge 104f\n"
"tbz x11, #3, 99f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
"tbz x11, #2, 97f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
"tbz x11, #1, 96f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
"tbz x11, #0, 103f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
"b 103f\n"
"96:" // Height 3: Partial direct writeback: partial_1_12
"tbz x11, #0, 103f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
"b 103f\n"
"97:" // Height 3: Partial direct writeback: partial_2_8
"tbz x11, #1, 98f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
"tbz x11, #0, 103f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v18.s }[2], [x23]\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
"b 103f\n"
"98:" // Height 3: Partial direct writeback: partial_1_8
"tbz x11, #0, 103f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s18, [x23, #0x0]\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
"b 103f\n"
"99:" // Height 3: Partial direct writeback: partial_4_0
"tbz x11, #2, 101f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
"tbz x11, #1, 100f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
"tbz x11, #0, 103f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v17.s }[2], [x23]\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
"b 103f\n"
"100:" // Height 3: Partial direct writeback: partial_1_4
"tbz x11, #0, 103f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s17, [x23, #0x0]\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
"b 103f\n"
"101:" // Height 3: Partial direct writeback: partial_2_0
"tbz x11, #1, 102f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x11, #0, 103f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v16.s }[2], [x23]\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
"b 103f\n"
"102:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s16, [x23, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
"103:" // Height 3: Partial direct writeback: Done
"b 105f\n"
"104:" // Height 3: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
"105:" // Height 3: Writeback done
"subs x11, x11, #0x10\n"
"bgt 72b\n"
"b 212f\n"
"106:" // Height 4
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"107:" // Height 4: Column loop
- "cbz x9, 108f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 108f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "mov v16.16b, v8.16b\n"
- "ldr q10, [x9, #0x20]\n"
- "mov v20.16b, v8.16b\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
"b 119f\n"
"108:" // Height 4: no bias
"tbz %x[flags], #0, 118f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"cmp x11, #0x10\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 117f\n"
"tbz x11, #3, 112f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
"tbz x11, #2, 110f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
"tbz x11, #1, 109f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"tbz x11, #0, 116f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
"b 116f\n"
"109:" // Height 4: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 116f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "ldr s23, [x22, #0x0]\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
"b 116f\n"
"110:" // Height 4: Partial accumulate: partial_2_8
"tbz x11, #1, 111f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x19, #0x28\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"tbz x11, #0, 116f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
"b 116f\n"
"111:" // Height 4: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 116f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "ldr s22, [x22, #0x0]\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
"b 116f\n"
"112:" // Height 4: Partial accumulate: partial_4_0
"tbz x11, #2, 114f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"tbz x11, #1, 113f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
"tbz x11, #0, 116f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
"b 116f\n"
"113:" // Height 4: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 116f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s17, [x23, #0x0]\n"
- "ldr s21, [x22, #0x0]\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
"b 116f\n"
"114:" // Height 4: Partial accumulate: partial_2_0
"tbz x11, #1, 115f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d16, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"tbz x11, #0, 116f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v16.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
"b 116f\n"
"115:" // Height 4: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s12, [x24, #0x0]\n"
- "ldr s16, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
"116:" // Height 4: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 119f\n"
"117:" // Height 4: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
"b 119f\n"
"118:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
@@ -1481,635 +1480,635 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"movi v22.16b, #0x0\n"
"movi v23.16b, #0x0\n"
"119:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"120:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 121f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "cbnz x27, 122f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 122f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
"b 122f\n"
"121:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"122:" // Height 4: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"blt 125f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x10\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 124f\n"
"123:" // Height 4: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x22, x22, #0x10\n"
+ "ldr q25, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x26, x26, #0x8\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x26, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x4f40f32a // bfdot v10.4s, v25.8h, v0.h[0]\n"
+ ".inst 0x4f41f32e // bfdot v14.4s, v25.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f42f332 // bfdot v18.4s, v25.8h, v2.h[0]\n"
+ ".inst 0x4f43f336 // bfdot v22.4s, v25.8h, v3.h[0]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f40f30b // bfdot v11.4s, v24.8h, v0.h[0]\n"
+ ".inst 0x4f41f30f // bfdot v15.4s, v24.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4f42f313 // bfdot v19.4s, v24.8h, v2.h[0]\n"
+ ".inst 0x4f43f317 // bfdot v23.4s, v24.8h, v3.h[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ ".inst 0x4f60f328 // bfdot v8.4s, v25.8h, v0.h[1]\n"
+ ".inst 0x4f61f32c // bfdot v12.4s, v25.8h, v1.h[1]\n"
+ ".inst 0x4f62f330 // bfdot v16.4s, v25.8h, v2.h[1]\n"
+ ".inst 0x4f63f334 // bfdot v20.4s, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ ".inst 0x4f60f309 // bfdot v9.4s, v24.8h, v0.h[1]\n"
+ ".inst 0x4f61f30d // bfdot v13.4s, v24.8h, v1.h[1]\n"
+ ".inst 0x4f62f311 // bfdot v17.4s, v24.8h, v2.h[1]\n"
+ ".inst 0x4f63f315 // bfdot v21.4s, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ ".inst 0x4f60f32a // bfdot v10.4s, v25.8h, v0.h[1]\n"
+ ".inst 0x4f61f32e // bfdot v14.4s, v25.8h, v1.h[1]\n"
+ ".inst 0x4f62f332 // bfdot v18.4s, v25.8h, v2.h[1]\n"
+ ".inst 0x4f63f336 // bfdot v22.4s, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ ".inst 0x4f60f30b // bfdot v11.4s, v24.8h, v0.h[1]\n"
+ ".inst 0x4f61f30f // bfdot v15.4s, v24.8h, v1.h[1]\n"
+ ".inst 0x4f62f313 // bfdot v19.4s, v24.8h, v2.h[1]\n"
+ ".inst 0x4f63f317 // bfdot v23.4s, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ ".inst 0x4f40fb28 // bfdot v8.4s, v25.8h, v0.h[2]\n"
+ ".inst 0x4f41fb2c // bfdot v12.4s, v25.8h, v1.h[2]\n"
+ ".inst 0x4f42fb30 // bfdot v16.4s, v25.8h, v2.h[2]\n"
+ ".inst 0x4f43fb34 // bfdot v20.4s, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ ".inst 0x4f40fb09 // bfdot v9.4s, v24.8h, v0.h[2]\n"
+ ".inst 0x4f41fb0d // bfdot v13.4s, v24.8h, v1.h[2]\n"
+ ".inst 0x4f42fb11 // bfdot v17.4s, v24.8h, v2.h[2]\n"
+ ".inst 0x4f43fb15 // bfdot v21.4s, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ ".inst 0x4f40fb2a // bfdot v10.4s, v25.8h, v0.h[2]\n"
+ ".inst 0x4f41fb2e // bfdot v14.4s, v25.8h, v1.h[2]\n"
+ ".inst 0x4f42fb32 // bfdot v18.4s, v25.8h, v2.h[2]\n"
+ ".inst 0x4f43fb36 // bfdot v22.4s, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ ".inst 0x4f40fb0b // bfdot v11.4s, v24.8h, v0.h[2]\n"
+ ".inst 0x4f41fb0f // bfdot v15.4s, v24.8h, v1.h[2]\n"
+ ".inst 0x4f42fb13 // bfdot v19.4s, v24.8h, v2.h[2]\n"
+ ".inst 0x4f43fb17 // bfdot v23.4s, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ ".inst 0x4f60fb28 // bfdot v8.4s, v25.8h, v0.h[3]\n"
+ ".inst 0x4f61fb2c // bfdot v12.4s, v25.8h, v1.h[3]\n"
+ ".inst 0x4f62fb30 // bfdot v16.4s, v25.8h, v2.h[3]\n"
+ ".inst 0x4f63fb34 // bfdot v20.4s, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ ".inst 0x4f60fb09 // bfdot v9.4s, v24.8h, v0.h[3]\n"
+ ".inst 0x4f61fb0d // bfdot v13.4s, v24.8h, v1.h[3]\n"
+ ".inst 0x4f62fb11 // bfdot v17.4s, v24.8h, v2.h[3]\n"
+ ".inst 0x4f63fb15 // bfdot v21.4s, v24.8h, v3.h[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f60fb2a // bfdot v10.4s, v25.8h, v0.h[3]\n"
+ ".inst 0x4f61fb2e // bfdot v14.4s, v25.8h, v1.h[3]\n"
+ ".inst 0x4f62fb32 // bfdot v18.4s, v25.8h, v2.h[3]\n"
+ ".inst 0x4f63fb36 // bfdot v22.4s, v25.8h, v3.h[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
- "ldr q1, [x24, #0x0]\n"
- ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
- "ldr q2, [x23, #0x0]\n"
- ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
- "ldr q3, [x22, #0x0]\n"
+ ".inst 0x4f60fb0b // bfdot v11.4s, v24.8h, v0.h[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x4f61fb0f // bfdot v15.4s, v24.8h, v1.h[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4f62fb13 // bfdot v19.4s, v24.8h, v2.h[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x4f63fb17 // bfdot v23.4s, v24.8h, v3.h[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 123b\n"
"124:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x8\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
+ "ldr q25, [x10, #0x20]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x22, x22, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "sub x27, x27, #0x8\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f40f32a // bfdot v10.4s, v25.8h, v0.h[0]\n"
+ ".inst 0x4f41f32e // bfdot v14.4s, v25.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f42f332 // bfdot v18.4s, v25.8h, v2.h[0]\n"
+ ".inst 0x4f43f336 // bfdot v22.4s, v25.8h, v3.h[0]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4f40f30b // bfdot v11.4s, v24.8h, v0.h[0]\n"
+ ".inst 0x4f41f30f // bfdot v15.4s, v24.8h, v1.h[0]\n"
+ ".inst 0x4f42f313 // bfdot v19.4s, v24.8h, v2.h[0]\n"
+ ".inst 0x4f43f317 // bfdot v23.4s, v24.8h, v3.h[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ ".inst 0x4f60f328 // bfdot v8.4s, v25.8h, v0.h[1]\n"
+ ".inst 0x4f61f32c // bfdot v12.4s, v25.8h, v1.h[1]\n"
+ ".inst 0x4f62f330 // bfdot v16.4s, v25.8h, v2.h[1]\n"
+ ".inst 0x4f63f334 // bfdot v20.4s, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ ".inst 0x4f60f309 // bfdot v9.4s, v24.8h, v0.h[1]\n"
+ ".inst 0x4f61f30d // bfdot v13.4s, v24.8h, v1.h[1]\n"
+ ".inst 0x4f62f311 // bfdot v17.4s, v24.8h, v2.h[1]\n"
+ ".inst 0x4f63f315 // bfdot v21.4s, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ ".inst 0x4f60f32a // bfdot v10.4s, v25.8h, v0.h[1]\n"
+ ".inst 0x4f61f32e // bfdot v14.4s, v25.8h, v1.h[1]\n"
+ ".inst 0x4f62f332 // bfdot v18.4s, v25.8h, v2.h[1]\n"
+ ".inst 0x4f63f336 // bfdot v22.4s, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ ".inst 0x4f60f30b // bfdot v11.4s, v24.8h, v0.h[1]\n"
+ ".inst 0x4f61f30f // bfdot v15.4s, v24.8h, v1.h[1]\n"
+ ".inst 0x4f62f313 // bfdot v19.4s, v24.8h, v2.h[1]\n"
+ ".inst 0x4f63f317 // bfdot v23.4s, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ ".inst 0x4f40fb28 // bfdot v8.4s, v25.8h, v0.h[2]\n"
+ ".inst 0x4f41fb2c // bfdot v12.4s, v25.8h, v1.h[2]\n"
+ ".inst 0x4f42fb30 // bfdot v16.4s, v25.8h, v2.h[2]\n"
+ ".inst 0x4f43fb34 // bfdot v20.4s, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ ".inst 0x4f40fb09 // bfdot v9.4s, v24.8h, v0.h[2]\n"
+ ".inst 0x4f41fb0d // bfdot v13.4s, v24.8h, v1.h[2]\n"
+ ".inst 0x4f42fb11 // bfdot v17.4s, v24.8h, v2.h[2]\n"
+ ".inst 0x4f43fb15 // bfdot v21.4s, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ ".inst 0x4f40fb2a // bfdot v10.4s, v25.8h, v0.h[2]\n"
+ ".inst 0x4f41fb2e // bfdot v14.4s, v25.8h, v1.h[2]\n"
+ ".inst 0x4f42fb32 // bfdot v18.4s, v25.8h, v2.h[2]\n"
+ ".inst 0x4f43fb36 // bfdot v22.4s, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ ".inst 0x4f40fb0b // bfdot v11.4s, v24.8h, v0.h[2]\n"
+ ".inst 0x4f41fb0f // bfdot v15.4s, v24.8h, v1.h[2]\n"
+ ".inst 0x4f42fb13 // bfdot v19.4s, v24.8h, v2.h[2]\n"
+ ".inst 0x4f43fb17 // bfdot v23.4s, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ ".inst 0x4f60fb28 // bfdot v8.4s, v25.8h, v0.h[3]\n"
+ ".inst 0x4f61fb2c // bfdot v12.4s, v25.8h, v1.h[3]\n"
+ ".inst 0x4f62fb30 // bfdot v16.4s, v25.8h, v2.h[3]\n"
+ ".inst 0x4f63fb34 // bfdot v20.4s, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ ".inst 0x4f60fb09 // bfdot v9.4s, v24.8h, v0.h[3]\n"
+ ".inst 0x4f61fb0d // bfdot v13.4s, v24.8h, v1.h[3]\n"
+ ".inst 0x4f62fb11 // bfdot v17.4s, v24.8h, v2.h[3]\n"
+ ".inst 0x4f63fb15 // bfdot v21.4s, v24.8h, v3.h[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
- ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f60fb2a // bfdot v10.4s, v25.8h, v0.h[3]\n"
+ ".inst 0x4f61fb2e // bfdot v14.4s, v25.8h, v1.h[3]\n"
+ ".inst 0x4f62fb32 // bfdot v18.4s, v25.8h, v2.h[3]\n"
+ ".inst 0x4f63fb36 // bfdot v22.4s, v25.8h, v3.h[3]\n"
+ ".inst 0x4f60fb0b // bfdot v11.4s, v24.8h, v0.h[3]\n"
+ ".inst 0x4f61fb0f // bfdot v15.4s, v24.8h, v1.h[3]\n"
+ ".inst 0x4f62fb13 // bfdot v19.4s, v24.8h, v2.h[3]\n"
+ ".inst 0x4f63fb17 // bfdot v23.4s, v24.8h, v3.h[3]\n"
"125:" // Height 4: Multiply loop: Main loop skip
- "cbz x26, 129f\n"
- "cmp x26, #0x2\n"
+ "cbz x27, 129f\n"
+ "cmp x27, #0x2\n"
"blt 127f\n"
"126:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x2\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x2\n"
- "ldr s2, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "sub x27, x27, #0x2\n"
+ "cmp x27, #0x2\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ ".inst 0x4f5df328 // bfdot v8.4s, v25.8h, v29.h[0]\n"
+ ".inst 0x4f5cf32c // bfdot v12.4s, v25.8h, v28.h[0]\n"
+ ".inst 0x4f5bf330 // bfdot v16.4s, v25.8h, v27.h[0]\n"
+ ".inst 0x4f5af334 // bfdot v20.4s, v25.8h, v26.h[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ ".inst 0x4f5df309 // bfdot v9.4s, v24.8h, v29.h[0]\n"
+ ".inst 0x4f5cf30d // bfdot v13.4s, v24.8h, v28.h[0]\n"
+ ".inst 0x4f5bf311 // bfdot v17.4s, v24.8h, v27.h[0]\n"
+ ".inst 0x4f5af315 // bfdot v21.4s, v24.8h, v26.h[0]\n"
+ "ldr q24, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f5df32a // bfdot v10.4s, v25.8h, v29.h[0]\n"
+ ".inst 0x4f5cf32e // bfdot v14.4s, v25.8h, v28.h[0]\n"
+ ".inst 0x4f5bf332 // bfdot v18.4s, v25.8h, v27.h[0]\n"
+ ".inst 0x4f5af336 // bfdot v22.4s, v25.8h, v26.h[0]\n"
+ ".inst 0x4f5df30b // bfdot v11.4s, v24.8h, v29.h[0]\n"
+ ".inst 0x4f5cf30f // bfdot v15.4s, v24.8h, v28.h[0]\n"
+ ".inst 0x4f5bf313 // bfdot v19.4s, v24.8h, v27.h[0]\n"
+ ".inst 0x4f5af317 // bfdot v23.4s, v24.8h, v26.h[0]\n"
"bge 126b\n"
- "cbz x26, 129f\n"
"127:" // Height 4: Multiply loop: Skip odd blocks
- "ldr h0, [x25, #0x0]\n"
- "ldr h1, [x24, #0x0]\n"
- "ldr h2, [x23, #0x0]\n"
- "ldr h3, [x22, #0x0]\n"
+ "cbz x27, 129f\n"
+ "ldr h0, [x26, #0x0]\n"
+ "ldr h1, [x25, #0x0]\n"
+ "ldr h2, [x24, #0x0]\n"
+ "ldr h3, [x23, #0x0]\n"
"128:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ ".inst 0x4f40f328 // bfdot v8.4s, v25.8h, v0.h[0]\n"
+ ".inst 0x4f41f32c // bfdot v12.4s, v25.8h, v1.h[0]\n"
+ ".inst 0x4f42f330 // bfdot v16.4s, v25.8h, v2.h[0]\n"
+ ".inst 0x4f43f334 // bfdot v20.4s, v25.8h, v3.h[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ ".inst 0x4f40f309 // bfdot v9.4s, v24.8h, v0.h[0]\n"
+ ".inst 0x4f41f30d // bfdot v13.4s, v24.8h, v1.h[0]\n"
+ ".inst 0x4f42f311 // bfdot v17.4s, v24.8h, v2.h[0]\n"
+ ".inst 0x4f43f315 // bfdot v21.4s, v24.8h, v3.h[0]\n"
+ "ldr q24, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f40f32a // bfdot v10.4s, v25.8h, v0.h[0]\n"
+ ".inst 0x4f41f32e // bfdot v14.4s, v25.8h, v1.h[0]\n"
+ ".inst 0x4f42f332 // bfdot v18.4s, v25.8h, v2.h[0]\n"
+ ".inst 0x4f43f336 // bfdot v22.4s, v25.8h, v3.h[0]\n"
+ ".inst 0x4f40f30b // bfdot v11.4s, v24.8h, v0.h[0]\n"
+ ".inst 0x4f41f30f // bfdot v15.4s, v24.8h, v1.h[0]\n"
+ ".inst 0x4f42f313 // bfdot v19.4s, v24.8h, v2.h[0]\n"
+ ".inst 0x4f43f317 // bfdot v23.4s, v24.8h, v3.h[0]\n"
"129:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 120b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 130f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
- "fmax v16.4s, v16.4s, v1.4s\n"
- "fmax v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmin v20.4s, v20.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v1.4s\n"
- "fmax v19.4s, v19.4s, v1.4s\n"
- "fmax v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v0.4s\n"
- "fmin v22.4s, v22.4s, v0.4s\n"
- "fmin v23.4s, v23.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v1.4s\n"
- "fmax v22.4s, v22.4s, v1.4s\n"
- "fmax v23.4s, v23.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v25.4s\n"
+ "fmin v9.4s, v9.4s, v25.4s\n"
+ "fmin v10.4s, v10.4s, v25.4s\n"
+ "fmin v11.4s, v11.4s, v25.4s\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmin v20.4s, v20.4s, v25.4s\n"
+ "fmin v21.4s, v21.4s, v25.4s\n"
+ "fmin v22.4s, v22.4s, v25.4s\n"
+ "fmin v23.4s, v23.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v24.4s\n"
+ "fmax v9.4s, v9.4s, v24.4s\n"
+ "fmax v10.4s, v10.4s, v24.4s\n"
+ "fmax v11.4s, v11.4s, v24.4s\n"
+ "fmax v12.4s, v12.4s, v24.4s\n"
+ "fmax v13.4s, v13.4s, v24.4s\n"
+ "fmax v14.4s, v14.4s, v24.4s\n"
+ "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v24.4s\n"
+ "fmax v17.4s, v17.4s, v24.4s\n"
+ "fmax v18.4s, v18.4s, v24.4s\n"
+ "fmax v19.4s, v19.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v24.4s\n"
+ "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmax v23.4s, v23.4s, v24.4s\n"
"130:" // Height 4: No activation
"cmp x11, #0x10\n"
"bge 139f\n"
"tbz x11, #3, 134f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v17.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
"tbz x11, #2, 132f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v18.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
"tbz x11, #1, 131f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
"tbz x11, #0, 138f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
- "st1 { v23.s }[2], [x22]\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
"b 138f\n"
"131:" // Height 4: Partial direct writeback: partial_1_12
"tbz x11, #0, 138f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
- "str s23, [x22, #0x0]\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
"b 138f\n"
"132:" // Height 4: Partial direct writeback: partial_2_8
"tbz x11, #1, 133f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
"tbz x11, #0, 138f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v18.s }[2], [x23]\n"
- "st1 { v22.s }[2], [x22]\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
"b 138f\n"
"133:" // Height 4: Partial direct writeback: partial_1_8
"tbz x11, #0, 138f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s18, [x23, #0x0]\n"
- "str s22, [x22, #0x0]\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
"b 138f\n"
"134:" // Height 4: Partial direct writeback: partial_4_0
"tbz x11, #2, 136f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
"tbz x11, #1, 135f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
"tbz x11, #0, 138f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v17.s }[2], [x23]\n"
- "st1 { v21.s }[2], [x22]\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
"b 138f\n"
"135:" // Height 4: Partial direct writeback: partial_1_4
"tbz x11, #0, 138f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s17, [x23, #0x0]\n"
- "str s21, [x22, #0x0]\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
"b 138f\n"
"136:" // Height 4: Partial direct writeback: partial_2_0
"tbz x11, #1, 137f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x11, #0, 138f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v16.s }[2], [x23]\n"
- "st1 { v20.s }[2], [x22]\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
"b 138f\n"
"137:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s16, [x23, #0x0]\n"
- "str s20, [x22, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
"138:" // Height 4: Partial direct writeback: Done
"b 140f\n"
"139:" // Height 4: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
"140:" // Height 4: Writeback done
"subs x11, x11, #0x10\n"
"bgt 107b\n"
"b 212f\n"
"141:" // Height 5
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"142:" // Height 5: Column loop
- "cbz x9, 143f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 143f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "mov v16.16b, v8.16b\n"
- "ldr q10, [x9, #0x20]\n"
- "mov v20.16b, v8.16b\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "mov v24.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
+ "mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
"mov v27.16b, v11.16b\n"
"b 154f\n"
"143:" // Height 5: no bias
"tbz %x[flags], #0, 153f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"cmp x11, #0x10\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 152f\n"
"tbz x11, #3, 147f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v24.4s }, [x21], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v21.4s }, [x22], #0x10\n"
- "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"tbz x11, #2, 145f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "ld1 { v22.4s }, [x22], #0x10\n"
- "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
"tbz x11, #1, 144f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d27, [x21], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
"tbz x11, #0, 151f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "ld1 { v23.s }[2], [x22]\n"
- "ld1 { v27.s }[2], [x21]\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
"b 151f\n"
"144:" // Height 5: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 151f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "ldr s23, [x22, #0x0]\n"
- "ldr s27, [x21, #0x0]\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
"b 151f\n"
"145:" // Height 5: Partial accumulate: partial_2_8
"tbz x11, #1, 146f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x19, #0x28\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"tbz x11, #0, 151f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "ld1 { v22.s }[2], [x22]\n"
- "ld1 { v26.s }[2], [x21]\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
"b 151f\n"
"146:" // Height 5: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 151f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "ldr s22, [x22, #0x0]\n"
- "ldr s26, [x21, #0x0]\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
"b 151f\n"
"147:" // Height 5: Partial accumulate: partial_4_0
"tbz x11, #2, 149f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"tbz x11, #1, 148f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d25, [x21], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x11, #0, 151f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "ld1 { v21.s }[2], [x22]\n"
- "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
"b 151f\n"
"148:" // Height 5: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 151f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s17, [x23, #0x0]\n"
- "ldr s21, [x22, #0x0]\n"
- "ldr s25, [x21, #0x0]\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
"b 151f\n"
"149:" // Height 5: Partial accumulate: partial_2_0
"tbz x11, #1, 150f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d16, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d24, [x21], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x11, #0, 151f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v16.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
- "ld1 { v24.s }[2], [x21]\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
"b 151f\n"
"150:" // Height 5: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s12, [x24, #0x0]\n"
- "ldr s16, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
- "ldr s24, [x21, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
"151:" // Height 5: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 154f\n"
"152:" // Height 5: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
"b 154f\n"
"153:" // Height 5: no accumulate
"movi v8.16b, #0x0\n"
@@ -2133,744 +2132,744 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
"154:" // Height 5: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"155:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 156f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "cbnz x27, 157f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
- "add x21, x21, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 157f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
"b 157f\n"
"156:" // Height 5: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
- "add x21, x22, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"157:" // Height 5: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"blt 160f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x10\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 159f\n"
"158:" // Height 5: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x21, x21, #0x10\n"
+ "ldr q29, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x26, x26, #0x8\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "cmp x26, #0x10\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "cmp x27, #0x10\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
- ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
- ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
- ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
- ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
- ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
- ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f40f3aa // bfdot v10.4s, v29.8h, v0.h[0]\n"
+ ".inst 0x4f41f3ae // bfdot v14.4s, v29.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f42f3b2 // bfdot v18.4s, v29.8h, v2.h[0]\n"
+ ".inst 0x4f43f3b6 // bfdot v22.4s, v29.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f44f3ba // bfdot v26.4s, v29.8h, v4.h[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ ".inst 0x4f40f38b // bfdot v11.4s, v28.8h, v0.h[0]\n"
+ ".inst 0x4f41f38f // bfdot v15.4s, v28.8h, v1.h[0]\n"
+ ".inst 0x4f42f393 // bfdot v19.4s, v28.8h, v2.h[0]\n"
+ ".inst 0x4f43f397 // bfdot v23.4s, v28.8h, v3.h[0]\n"
+ ".inst 0x4f44f39b // bfdot v27.4s, v28.8h, v4.h[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ ".inst 0x4f60f3a8 // bfdot v8.4s, v29.8h, v0.h[1]\n"
+ ".inst 0x4f61f3ac // bfdot v12.4s, v29.8h, v1.h[1]\n"
+ ".inst 0x4f62f3b0 // bfdot v16.4s, v29.8h, v2.h[1]\n"
+ ".inst 0x4f63f3b4 // bfdot v20.4s, v29.8h, v3.h[1]\n"
+ ".inst 0x4f64f3b8 // bfdot v24.4s, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ ".inst 0x4f60f389 // bfdot v9.4s, v28.8h, v0.h[1]\n"
+ ".inst 0x4f61f38d // bfdot v13.4s, v28.8h, v1.h[1]\n"
+ ".inst 0x4f62f391 // bfdot v17.4s, v28.8h, v2.h[1]\n"
+ ".inst 0x4f63f395 // bfdot v21.4s, v28.8h, v3.h[1]\n"
+ ".inst 0x4f64f399 // bfdot v25.4s, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ ".inst 0x4f60f3aa // bfdot v10.4s, v29.8h, v0.h[1]\n"
+ ".inst 0x4f61f3ae // bfdot v14.4s, v29.8h, v1.h[1]\n"
+ ".inst 0x4f62f3b2 // bfdot v18.4s, v29.8h, v2.h[1]\n"
+ ".inst 0x4f63f3b6 // bfdot v22.4s, v29.8h, v3.h[1]\n"
+ ".inst 0x4f64f3ba // bfdot v26.4s, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ ".inst 0x4f60f38b // bfdot v11.4s, v28.8h, v0.h[1]\n"
+ ".inst 0x4f61f38f // bfdot v15.4s, v28.8h, v1.h[1]\n"
+ ".inst 0x4f62f393 // bfdot v19.4s, v28.8h, v2.h[1]\n"
+ ".inst 0x4f63f397 // bfdot v23.4s, v28.8h, v3.h[1]\n"
+ ".inst 0x4f64f39b // bfdot v27.4s, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ ".inst 0x4f40fba8 // bfdot v8.4s, v29.8h, v0.h[2]\n"
+ ".inst 0x4f41fbac // bfdot v12.4s, v29.8h, v1.h[2]\n"
+ ".inst 0x4f42fbb0 // bfdot v16.4s, v29.8h, v2.h[2]\n"
+ ".inst 0x4f43fbb4 // bfdot v20.4s, v29.8h, v3.h[2]\n"
+ ".inst 0x4f44fbb8 // bfdot v24.4s, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ ".inst 0x4f40fb89 // bfdot v9.4s, v28.8h, v0.h[2]\n"
+ ".inst 0x4f41fb8d // bfdot v13.4s, v28.8h, v1.h[2]\n"
+ ".inst 0x4f42fb91 // bfdot v17.4s, v28.8h, v2.h[2]\n"
+ ".inst 0x4f43fb95 // bfdot v21.4s, v28.8h, v3.h[2]\n"
+ ".inst 0x4f44fb99 // bfdot v25.4s, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ ".inst 0x4f40fbaa // bfdot v10.4s, v29.8h, v0.h[2]\n"
+ ".inst 0x4f41fbae // bfdot v14.4s, v29.8h, v1.h[2]\n"
+ ".inst 0x4f42fbb2 // bfdot v18.4s, v29.8h, v2.h[2]\n"
+ ".inst 0x4f43fbb6 // bfdot v22.4s, v29.8h, v3.h[2]\n"
+ ".inst 0x4f44fbba // bfdot v26.4s, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ ".inst 0x4f40fb8b // bfdot v11.4s, v28.8h, v0.h[2]\n"
+ ".inst 0x4f41fb8f // bfdot v15.4s, v28.8h, v1.h[2]\n"
+ ".inst 0x4f42fb93 // bfdot v19.4s, v28.8h, v2.h[2]\n"
+ ".inst 0x4f43fb97 // bfdot v23.4s, v28.8h, v3.h[2]\n"
+ ".inst 0x4f44fb9b // bfdot v27.4s, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ ".inst 0x4f60fba8 // bfdot v8.4s, v29.8h, v0.h[3]\n"
+ ".inst 0x4f61fbac // bfdot v12.4s, v29.8h, v1.h[3]\n"
+ ".inst 0x4f62fbb0 // bfdot v16.4s, v29.8h, v2.h[3]\n"
+ ".inst 0x4f63fbb4 // bfdot v20.4s, v29.8h, v3.h[3]\n"
+ ".inst 0x4f64fbb8 // bfdot v24.4s, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ ".inst 0x4f60fb89 // bfdot v9.4s, v28.8h, v0.h[3]\n"
+ ".inst 0x4f61fb8d // bfdot v13.4s, v28.8h, v1.h[3]\n"
+ ".inst 0x4f62fb91 // bfdot v17.4s, v28.8h, v2.h[3]\n"
+ ".inst 0x4f63fb95 // bfdot v21.4s, v28.8h, v3.h[3]\n"
+ ".inst 0x4f64fb99 // bfdot v25.4s, v28.8h, v4.h[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
- ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f60fbaa // bfdot v10.4s, v29.8h, v0.h[3]\n"
+ ".inst 0x4f61fbae // bfdot v14.4s, v29.8h, v1.h[3]\n"
+ ".inst 0x4f62fbb2 // bfdot v18.4s, v29.8h, v2.h[3]\n"
+ ".inst 0x4f63fbb6 // bfdot v22.4s, v29.8h, v3.h[3]\n"
+ ".inst 0x4f64fbba // bfdot v26.4s, v29.8h, v4.h[3]\n"
"ldr q6, [x10, #0x0]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
- "ldr q1, [x24, #0x0]\n"
- ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
- "ldr q2, [x23, #0x0]\n"
- ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
- "ldr q3, [x22, #0x0]\n"
- ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
- "ldr q4, [x21, #0x0]\n"
+ ".inst 0x4f60fb8b // bfdot v11.4s, v28.8h, v0.h[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x4f61fb8f // bfdot v15.4s, v28.8h, v1.h[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4f62fb93 // bfdot v19.4s, v28.8h, v2.h[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x4f63fb97 // bfdot v23.4s, v28.8h, v3.h[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ ".inst 0x4f64fb9b // bfdot v27.4s, v28.8h, v4.h[3]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 158b\n"
"159:" // Height 5: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x8\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "ldr q29, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x27, x27, #0x8\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
- ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
- ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
- ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
- ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
- ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
- ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f40f3aa // bfdot v10.4s, v29.8h, v0.h[0]\n"
+ ".inst 0x4f41f3ae // bfdot v14.4s, v29.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4f42f3b2 // bfdot v18.4s, v29.8h, v2.h[0]\n"
+ ".inst 0x4f43f3b6 // bfdot v22.4s, v29.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f44f3ba // bfdot v26.4s, v29.8h, v4.h[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ ".inst 0x4f40f38b // bfdot v11.4s, v28.8h, v0.h[0]\n"
+ ".inst 0x4f41f38f // bfdot v15.4s, v28.8h, v1.h[0]\n"
+ ".inst 0x4f42f393 // bfdot v19.4s, v28.8h, v2.h[0]\n"
+ ".inst 0x4f43f397 // bfdot v23.4s, v28.8h, v3.h[0]\n"
+ ".inst 0x4f44f39b // bfdot v27.4s, v28.8h, v4.h[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ ".inst 0x4f60f3a8 // bfdot v8.4s, v29.8h, v0.h[1]\n"
+ ".inst 0x4f61f3ac // bfdot v12.4s, v29.8h, v1.h[1]\n"
+ ".inst 0x4f62f3b0 // bfdot v16.4s, v29.8h, v2.h[1]\n"
+ ".inst 0x4f63f3b4 // bfdot v20.4s, v29.8h, v3.h[1]\n"
+ ".inst 0x4f64f3b8 // bfdot v24.4s, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ ".inst 0x4f60f389 // bfdot v9.4s, v28.8h, v0.h[1]\n"
+ ".inst 0x4f61f38d // bfdot v13.4s, v28.8h, v1.h[1]\n"
+ ".inst 0x4f62f391 // bfdot v17.4s, v28.8h, v2.h[1]\n"
+ ".inst 0x4f63f395 // bfdot v21.4s, v28.8h, v3.h[1]\n"
+ ".inst 0x4f64f399 // bfdot v25.4s, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ ".inst 0x4f60f3aa // bfdot v10.4s, v29.8h, v0.h[1]\n"
+ ".inst 0x4f61f3ae // bfdot v14.4s, v29.8h, v1.h[1]\n"
+ ".inst 0x4f62f3b2 // bfdot v18.4s, v29.8h, v2.h[1]\n"
+ ".inst 0x4f63f3b6 // bfdot v22.4s, v29.8h, v3.h[1]\n"
+ ".inst 0x4f64f3ba // bfdot v26.4s, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ ".inst 0x4f60f38b // bfdot v11.4s, v28.8h, v0.h[1]\n"
+ ".inst 0x4f61f38f // bfdot v15.4s, v28.8h, v1.h[1]\n"
+ ".inst 0x4f62f393 // bfdot v19.4s, v28.8h, v2.h[1]\n"
+ ".inst 0x4f63f397 // bfdot v23.4s, v28.8h, v3.h[1]\n"
+ ".inst 0x4f64f39b // bfdot v27.4s, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ ".inst 0x4f40fba8 // bfdot v8.4s, v29.8h, v0.h[2]\n"
+ ".inst 0x4f41fbac // bfdot v12.4s, v29.8h, v1.h[2]\n"
+ ".inst 0x4f42fbb0 // bfdot v16.4s, v29.8h, v2.h[2]\n"
+ ".inst 0x4f43fbb4 // bfdot v20.4s, v29.8h, v3.h[2]\n"
+ ".inst 0x4f44fbb8 // bfdot v24.4s, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ ".inst 0x4f40fb89 // bfdot v9.4s, v28.8h, v0.h[2]\n"
+ ".inst 0x4f41fb8d // bfdot v13.4s, v28.8h, v1.h[2]\n"
+ ".inst 0x4f42fb91 // bfdot v17.4s, v28.8h, v2.h[2]\n"
+ ".inst 0x4f43fb95 // bfdot v21.4s, v28.8h, v3.h[2]\n"
+ ".inst 0x4f44fb99 // bfdot v25.4s, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ ".inst 0x4f40fbaa // bfdot v10.4s, v29.8h, v0.h[2]\n"
+ ".inst 0x4f41fbae // bfdot v14.4s, v29.8h, v1.h[2]\n"
+ ".inst 0x4f42fbb2 // bfdot v18.4s, v29.8h, v2.h[2]\n"
+ ".inst 0x4f43fbb6 // bfdot v22.4s, v29.8h, v3.h[2]\n"
+ ".inst 0x4f44fbba // bfdot v26.4s, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ ".inst 0x4f40fb8b // bfdot v11.4s, v28.8h, v0.h[2]\n"
+ ".inst 0x4f41fb8f // bfdot v15.4s, v28.8h, v1.h[2]\n"
+ ".inst 0x4f42fb93 // bfdot v19.4s, v28.8h, v2.h[2]\n"
+ ".inst 0x4f43fb97 // bfdot v23.4s, v28.8h, v3.h[2]\n"
+ ".inst 0x4f44fb9b // bfdot v27.4s, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ ".inst 0x4f60fba8 // bfdot v8.4s, v29.8h, v0.h[3]\n"
+ ".inst 0x4f61fbac // bfdot v12.4s, v29.8h, v1.h[3]\n"
+ ".inst 0x4f62fbb0 // bfdot v16.4s, v29.8h, v2.h[3]\n"
+ ".inst 0x4f63fbb4 // bfdot v20.4s, v29.8h, v3.h[3]\n"
+ ".inst 0x4f64fbb8 // bfdot v24.4s, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ ".inst 0x4f60fb89 // bfdot v9.4s, v28.8h, v0.h[3]\n"
+ ".inst 0x4f61fb8d // bfdot v13.4s, v28.8h, v1.h[3]\n"
+ ".inst 0x4f62fb91 // bfdot v17.4s, v28.8h, v2.h[3]\n"
+ ".inst 0x4f63fb95 // bfdot v21.4s, v28.8h, v3.h[3]\n"
+ ".inst 0x4f64fb99 // bfdot v25.4s, v28.8h, v4.h[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
- ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
- ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
- ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
- ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
- ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
- ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
+ ".inst 0x4f60fbaa // bfdot v10.4s, v29.8h, v0.h[3]\n"
+ ".inst 0x4f61fbae // bfdot v14.4s, v29.8h, v1.h[3]\n"
+ ".inst 0x4f62fbb2 // bfdot v18.4s, v29.8h, v2.h[3]\n"
+ ".inst 0x4f63fbb6 // bfdot v22.4s, v29.8h, v3.h[3]\n"
+ ".inst 0x4f64fbba // bfdot v26.4s, v29.8h, v4.h[3]\n"
+ ".inst 0x4f60fb8b // bfdot v11.4s, v28.8h, v0.h[3]\n"
+ ".inst 0x4f61fb8f // bfdot v15.4s, v28.8h, v1.h[3]\n"
+ ".inst 0x4f62fb93 // bfdot v19.4s, v28.8h, v2.h[3]\n"
+ ".inst 0x4f63fb97 // bfdot v23.4s, v28.8h, v3.h[3]\n"
+ ".inst 0x4f64fb9b // bfdot v27.4s, v28.8h, v4.h[3]\n"
"160:" // Height 5: Multiply loop: Main loop skip
- "cbz x26, 164f\n"
- "cmp x26, #0x2\n"
+ "cbz x27, 164f\n"
+ "cmp x27, #0x2\n"
"blt 162f\n"
"161:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x2\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x2\n"
- "ldr s2, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s4, [x21], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s1, [x25], #0x4\n"
+ "sub x27, x27, #0x2\n"
+ "cmp x27, #0x2\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q29, [x10, #0x0]\n"
+ ".inst 0x4f42f3a8 // bfdot v8.4s, v29.8h, v2.h[0]\n"
+ ".inst 0x4f41f3ac // bfdot v12.4s, v29.8h, v1.h[0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ ".inst 0x4f40f3b0 // bfdot v16.4s, v29.8h, v0.h[0]\n"
+ ".inst 0x4f5ff3b4 // bfdot v20.4s, v29.8h, v31.h[0]\n"
+ ".inst 0x4f5ef3b8 // bfdot v24.4s, v29.8h, v30.h[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ ".inst 0x4f42f389 // bfdot v9.4s, v28.8h, v2.h[0]\n"
+ ".inst 0x4f41f38d // bfdot v13.4s, v28.8h, v1.h[0]\n"
+ ".inst 0x4f40f391 // bfdot v17.4s, v28.8h, v0.h[0]\n"
+ ".inst 0x4f5ff395 // bfdot v21.4s, v28.8h, v31.h[0]\n"
+ ".inst 0x4f5ef399 // bfdot v25.4s, v28.8h, v30.h[0]\n"
+ "ldr q28, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f42f3aa // bfdot v10.4s, v29.8h, v2.h[0]\n"
+ ".inst 0x4f41f3ae // bfdot v14.4s, v29.8h, v1.h[0]\n"
+ ".inst 0x4f40f3b2 // bfdot v18.4s, v29.8h, v0.h[0]\n"
+ ".inst 0x4f5ff3b6 // bfdot v22.4s, v29.8h, v31.h[0]\n"
+ ".inst 0x4f5ef3ba // bfdot v26.4s, v29.8h, v30.h[0]\n"
+ ".inst 0x4f42f38b // bfdot v11.4s, v28.8h, v2.h[0]\n"
+ ".inst 0x4f41f38f // bfdot v15.4s, v28.8h, v1.h[0]\n"
+ ".inst 0x4f40f393 // bfdot v19.4s, v28.8h, v0.h[0]\n"
+ ".inst 0x4f5ff397 // bfdot v23.4s, v28.8h, v31.h[0]\n"
+ ".inst 0x4f5ef39b // bfdot v27.4s, v28.8h, v30.h[0]\n"
"bge 161b\n"
- "cbz x26, 164f\n"
"162:" // Height 5: Multiply loop: Skip odd blocks
- "ldr h0, [x25, #0x0]\n"
- "ldr h1, [x24, #0x0]\n"
- "ldr h2, [x23, #0x0]\n"
- "ldr h3, [x22, #0x0]\n"
- "ldr h4, [x21, #0x0]\n"
+ "cbz x27, 164f\n"
+ "ldr h0, [x26, #0x0]\n"
+ "ldr h1, [x25, #0x0]\n"
+ "ldr h2, [x24, #0x0]\n"
+ "ldr h3, [x23, #0x0]\n"
+ "ldr h4, [x22, #0x0]\n"
"163:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ ".inst 0x4f40f3a8 // bfdot v8.4s, v29.8h, v0.h[0]\n"
+ ".inst 0x4f41f3ac // bfdot v12.4s, v29.8h, v1.h[0]\n"
+ ".inst 0x4f42f3b0 // bfdot v16.4s, v29.8h, v2.h[0]\n"
+ ".inst 0x4f43f3b4 // bfdot v20.4s, v29.8h, v3.h[0]\n"
+ ".inst 0x4f44f3b8 // bfdot v24.4s, v29.8h, v4.h[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ ".inst 0x4f40f389 // bfdot v9.4s, v28.8h, v0.h[0]\n"
+ ".inst 0x4f41f38d // bfdot v13.4s, v28.8h, v1.h[0]\n"
+ ".inst 0x4f42f391 // bfdot v17.4s, v28.8h, v2.h[0]\n"
+ ".inst 0x4f43f395 // bfdot v21.4s, v28.8h, v3.h[0]\n"
+ ".inst 0x4f44f399 // bfdot v25.4s, v28.8h, v4.h[0]\n"
+ "ldr q28, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f40f3aa // bfdot v10.4s, v29.8h, v0.h[0]\n"
+ ".inst 0x4f41f3ae // bfdot v14.4s, v29.8h, v1.h[0]\n"
+ ".inst 0x4f42f3b2 // bfdot v18.4s, v29.8h, v2.h[0]\n"
+ ".inst 0x4f43f3b6 // bfdot v22.4s, v29.8h, v3.h[0]\n"
+ ".inst 0x4f44f3ba // bfdot v26.4s, v29.8h, v4.h[0]\n"
+ ".inst 0x4f40f38b // bfdot v11.4s, v28.8h, v0.h[0]\n"
+ ".inst 0x4f41f38f // bfdot v15.4s, v28.8h, v1.h[0]\n"
+ ".inst 0x4f42f393 // bfdot v19.4s, v28.8h, v2.h[0]\n"
+ ".inst 0x4f43f397 // bfdot v23.4s, v28.8h, v3.h[0]\n"
+ ".inst 0x4f44f39b // bfdot v27.4s, v28.8h, v4.h[0]\n"
"164:" // Height 5: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 155b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 165f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
- "fmax v16.4s, v16.4s, v1.4s\n"
- "fmax v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmin v20.4s, v20.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v1.4s\n"
- "fmax v19.4s, v19.4s, v1.4s\n"
- "fmax v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v0.4s\n"
- "fmin v22.4s, v22.4s, v0.4s\n"
- "fmin v23.4s, v23.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v1.4s\n"
- "fmax v22.4s, v22.4s, v1.4s\n"
- "fmax v23.4s, v23.4s, v1.4s\n"
- "fmin v24.4s, v24.4s, v0.4s\n"
- "fmin v25.4s, v25.4s, v0.4s\n"
- "fmin v26.4s, v26.4s, v0.4s\n"
- "fmax v24.4s, v24.4s, v1.4s\n"
- "fmax v25.4s, v25.4s, v1.4s\n"
- "fmax v26.4s, v26.4s, v1.4s\n"
- "fmin v27.4s, v27.4s, v0.4s\n"
- "fmax v27.4s, v27.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v29.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v29.4s\n"
+ "fmin v9.4s, v9.4s, v29.4s\n"
+ "fmin v10.4s, v10.4s, v29.4s\n"
+ "fmin v11.4s, v11.4s, v29.4s\n"
+ "fmin v12.4s, v12.4s, v29.4s\n"
+ "fmin v13.4s, v13.4s, v29.4s\n"
+ "fmin v14.4s, v14.4s, v29.4s\n"
+ "fmin v15.4s, v15.4s, v29.4s\n"
+ "fmin v16.4s, v16.4s, v29.4s\n"
+ "fmin v17.4s, v17.4s, v29.4s\n"
+ "fmin v18.4s, v18.4s, v29.4s\n"
+ "fmin v19.4s, v19.4s, v29.4s\n"
+ "fmin v20.4s, v20.4s, v29.4s\n"
+ "fmin v21.4s, v21.4s, v29.4s\n"
+ "fmin v22.4s, v22.4s, v29.4s\n"
+ "fmin v23.4s, v23.4s, v29.4s\n"
+ "fmin v24.4s, v24.4s, v29.4s\n"
+ "fmin v25.4s, v25.4s, v29.4s\n"
+ "fmin v26.4s, v26.4s, v29.4s\n"
+ "fmin v27.4s, v27.4s, v29.4s\n"
+ "fmax v8.4s, v8.4s, v28.4s\n"
+ "fmax v9.4s, v9.4s, v28.4s\n"
+ "fmax v10.4s, v10.4s, v28.4s\n"
+ "fmax v11.4s, v11.4s, v28.4s\n"
+ "fmax v12.4s, v12.4s, v28.4s\n"
+ "fmax v13.4s, v13.4s, v28.4s\n"
+ "fmax v14.4s, v14.4s, v28.4s\n"
+ "fmax v15.4s, v15.4s, v28.4s\n"
+ "fmax v16.4s, v16.4s, v28.4s\n"
+ "fmax v17.4s, v17.4s, v28.4s\n"
+ "fmax v18.4s, v18.4s, v28.4s\n"
+ "fmax v19.4s, v19.4s, v28.4s\n"
+ "fmax v20.4s, v20.4s, v28.4s\n"
+ "fmax v21.4s, v21.4s, v28.4s\n"
+ "fmax v22.4s, v22.4s, v28.4s\n"
+ "fmax v23.4s, v23.4s, v28.4s\n"
+ "fmax v24.4s, v24.4s, v28.4s\n"
+ "fmax v25.4s, v25.4s, v28.4s\n"
+ "fmax v26.4s, v26.4s, v28.4s\n"
+ "fmax v27.4s, v27.4s, v28.4s\n"
"165:" // Height 5: No activation
"cmp x11, #0x10\n"
"bge 174f\n"
"tbz x11, #3, 169f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v17.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v21.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
- "st1 { v25.4s }, [x21], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
"tbz x11, #2, 167f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v18.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x22], #0x10\n"
- "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
"tbz x11, #1, 166f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
- "str d27, [x21], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
"tbz x11, #0, 173f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
- "st1 { v23.s }[2], [x22]\n"
- "st1 { v27.s }[2], [x21]\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
"b 173f\n"
"166:" // Height 5: Partial direct writeback: partial_1_12
"tbz x11, #0, 173f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
- "str s23, [x22, #0x0]\n"
- "str s27, [x21, #0x0]\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
"b 173f\n"
"167:" // Height 5: Partial direct writeback: partial_2_8
"tbz x11, #1, 168f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
- "str d26, [x21], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
"tbz x11, #0, 173f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v18.s }[2], [x23]\n"
- "st1 { v22.s }[2], [x22]\n"
- "st1 { v26.s }[2], [x21]\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
"b 173f\n"
"168:" // Height 5: Partial direct writeback: partial_1_8
"tbz x11, #0, 173f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s18, [x23, #0x0]\n"
- "str s22, [x22, #0x0]\n"
- "str s26, [x21, #0x0]\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
"b 173f\n"
"169:" // Height 5: Partial direct writeback: partial_4_0
"tbz x11, #2, 171f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
"tbz x11, #1, 170f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
- "str d25, [x21], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x11, #0, 173f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v17.s }[2], [x23]\n"
- "st1 { v21.s }[2], [x22]\n"
- "st1 { v25.s }[2], [x21]\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
"b 173f\n"
"170:" // Height 5: Partial direct writeback: partial_1_4
"tbz x11, #0, 173f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s17, [x23, #0x0]\n"
- "str s21, [x22, #0x0]\n"
- "str s25, [x21, #0x0]\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
"b 173f\n"
"171:" // Height 5: Partial direct writeback: partial_2_0
"tbz x11, #1, 172f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x11, #0, 173f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v16.s }[2], [x23]\n"
- "st1 { v20.s }[2], [x22]\n"
- "st1 { v24.s }[2], [x21]\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
"b 173f\n"
"172:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s16, [x23, #0x0]\n"
- "str s20, [x22, #0x0]\n"
- "str s24, [x21, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
"173:" // Height 5: Partial direct writeback: Done
"b 175f\n"
"174:" // Height 5: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
- "str q24, [x21, #0x0]\n"
- "str q25, [x21, #0x10]\n"
- "str q26, [x21, #0x20]\n"
- "str q27, [x21, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
"175:" // Height 5: Writeback done
"subs x11, x11, #0x10\n"
"bgt 142b\n"
"b 212f\n"
"176:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x19, #0x18\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"177:" // Height 6: Column loop
- "cbz x9, 178f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 178f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "mov v16.16b, v8.16b\n"
- "ldr q10, [x9, #0x20]\n"
- "mov v20.16b, v8.16b\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "mov v24.16b, v8.16b\n"
- "mov v28.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
"mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
+ "mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
"mov v27.16b, v11.16b\n"
+ "mov v28.16b, v8.16b\n"
"mov v29.16b, v9.16b\n"
"mov v30.16b, v10.16b\n"
"mov v31.16b, v11.16b\n"
"b 189f\n"
"178:" // Height 6: no bias
"tbz %x[flags], #0, 188f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"cmp x11, #0x10\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 187f\n"
"tbz x11, #3, 182f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v24.4s }, [x21], #0x10\n"
- "ld1 { v28.4s }, [x20], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v21.4s }, [x22], #0x10\n"
- "ld1 { v25.4s }, [x21], #0x10\n"
- "ld1 { v29.4s }, [x20], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
"tbz x11, #2, 180f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "ld1 { v22.4s }, [x22], #0x10\n"
- "ld1 { v26.4s }, [x21], #0x10\n"
- "ld1 { v30.4s }, [x20], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
"tbz x11, #1, 179f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d27, [x21], #0x8\n"
- "ldr d31, [x20], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x11, #0, 186f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "ld1 { v23.s }[2], [x22]\n"
- "ld1 { v27.s }[2], [x21]\n"
- "ld1 { v31.s }[2], [x20]\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 186f\n"
"179:" // Height 6: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 186f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "ldr s23, [x22, #0x0]\n"
- "ldr s27, [x21, #0x0]\n"
- "ldr s31, [x20, #0x0]\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"b 186f\n"
"180:" // Height 6: Partial accumulate: partial_2_8
"tbz x11, #1, 181f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x19, #0x28\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
- "ldr d30, [x20], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x11, #0, 186f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "ld1 { v22.s }[2], [x22]\n"
- "ld1 { v26.s }[2], [x21]\n"
- "ld1 { v30.s }[2], [x20]\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
"b 186f\n"
"181:" // Height 6: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 186f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "ldr s22, [x22, #0x0]\n"
- "ldr s26, [x21, #0x0]\n"
- "ldr s30, [x20, #0x0]\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
"b 186f\n"
"182:" // Height 6: Partial accumulate: partial_4_0
"tbz x11, #2, 184f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v24.4s }, [x21], #0x10\n"
- "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"tbz x11, #1, 183f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d25, [x21], #0x8\n"
- "ldr d29, [x20], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x11, #0, 186f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "ld1 { v21.s }[2], [x22]\n"
- "ld1 { v25.s }[2], [x21]\n"
- "ld1 { v29.s }[2], [x20]\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
"b 186f\n"
"183:" // Height 6: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 186f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s17, [x23, #0x0]\n"
- "ldr s21, [x22, #0x0]\n"
- "ldr s25, [x21, #0x0]\n"
- "ldr s29, [x20, #0x0]\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
"b 186f\n"
"184:" // Height 6: Partial accumulate: partial_2_0
"tbz x11, #1, 185f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d16, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d24, [x21], #0x8\n"
- "ldr d28, [x20], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x11, #0, 186f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v16.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
- "ld1 { v24.s }[2], [x21]\n"
- "ld1 { v28.s }[2], [x20]\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
"b 186f\n"
"185:" // Height 6: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s12, [x24, #0x0]\n"
- "ldr s16, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
- "ldr s24, [x21, #0x0]\n"
- "ldr s28, [x20, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
"186:" // Height 6: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 189f\n"
"187:" // Height 6: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "ldr q28, [x20, #0x0]\n"
- "ldr q29, [x20, #0x10]\n"
- "ldr q30, [x20, #0x20]\n"
- "ldr q31, [x20, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
"b 189f\n"
"188:" // Height 6: no accumulate
"movi v8.16b, #0x0\n"
@@ -2898,82 +2897,82 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"movi v30.16b, #0x0\n"
"movi v31.16b, #0x0\n"
"189:" // Height 6: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"190:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 191f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x27, 192f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
- "add x21, x21, x19, LSL #1\n"
- "add x20, x20, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 192f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "add x21, x21, x20, LSL #1\n"
"b 192f\n"
"191:" // Height 6: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
- "add x21, x22, x19, LSL #1\n"
- "add x20, x21, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"192:" // Height 6: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"blt 195f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x10\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 194f\n"
"193:" // Height 6: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x21, x21, #0x10\n"
".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
"ldr q6, [x10, #0x20]\n"
- "add x20, x20, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "sub x26, x26, #0x8\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "cmp x26, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
"ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
"ldr q6, [x10, #0x40]\n"
@@ -3063,51 +3062,51 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f65f8de // bfdot v30.4s, v6.8h, v5.h[3]\n"
"ldr q6, [x10, #0x0]\n"
".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
- "ldr q1, [x24, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
- "ldr q2, [x23, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
- "ldr q3, [x22, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
- "ldr q4, [x21, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 193b\n"
"194:" // Height 6: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x8\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x21, x21, #0x10\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
"ldr q6, [x10, #0x20]\n"
- "add x20, x20, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x8\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
"ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
"ldr q6, [x10, #0x40]\n"
@@ -3202,330 +3201,329 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n"
"195:" // Height 6: Multiply loop: Main loop skip
- "cbz x26, 199f\n"
- "cmp x26, #0x2\n"
+ "cbz x27, 199f\n"
+ "cmp x27, #0x2\n"
"blt 197f\n"
"196:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x2\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x2\n"
- "ldr s2, [x23], #0x4\n"
+ "ldr s7, [x26], #0x4\n"
+ "ldr s6, [x25], #0x4\n"
+ "sub x27, x27, #0x2\n"
+ "cmp x27, #0x2\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"ldr s3, [x22], #0x4\n"
- "ldr s4, [x21], #0x4\n"
- "ldr s5, [x20], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x4f47f028 // bfdot v8.4s, v1.8h, v7.h[0]\n"
+ ".inst 0x4f46f02c // bfdot v12.4s, v1.8h, v6.h[0]\n"
+ ".inst 0x4f45f030 // bfdot v16.4s, v1.8h, v5.h[0]\n"
+ ".inst 0x4f44f034 // bfdot v20.4s, v1.8h, v4.h[0]\n"
+ ".inst 0x4f43f038 // bfdot v24.4s, v1.8h, v3.h[0]\n"
+ ".inst 0x4f42f03c // bfdot v28.4s, v1.8h, v2.h[0]\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x4f47f009 // bfdot v9.4s, v0.8h, v7.h[0]\n"
+ ".inst 0x4f46f00d // bfdot v13.4s, v0.8h, v6.h[0]\n"
+ ".inst 0x4f45f011 // bfdot v17.4s, v0.8h, v5.h[0]\n"
+ ".inst 0x4f44f015 // bfdot v21.4s, v0.8h, v4.h[0]\n"
+ ".inst 0x4f43f019 // bfdot v25.4s, v0.8h, v3.h[0]\n"
+ ".inst 0x4f42f01d // bfdot v29.4s, v0.8h, v2.h[0]\n"
+ "ldr q0, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
- ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
+ ".inst 0x4f47f02a // bfdot v10.4s, v1.8h, v7.h[0]\n"
+ ".inst 0x4f46f02e // bfdot v14.4s, v1.8h, v6.h[0]\n"
+ ".inst 0x4f45f032 // bfdot v18.4s, v1.8h, v5.h[0]\n"
+ ".inst 0x4f44f036 // bfdot v22.4s, v1.8h, v4.h[0]\n"
+ ".inst 0x4f43f03a // bfdot v26.4s, v1.8h, v3.h[0]\n"
+ ".inst 0x4f42f03e // bfdot v30.4s, v1.8h, v2.h[0]\n"
+ ".inst 0x4f47f00b // bfdot v11.4s, v0.8h, v7.h[0]\n"
+ ".inst 0x4f46f00f // bfdot v15.4s, v0.8h, v6.h[0]\n"
+ ".inst 0x4f45f013 // bfdot v19.4s, v0.8h, v5.h[0]\n"
+ ".inst 0x4f44f017 // bfdot v23.4s, v0.8h, v4.h[0]\n"
+ ".inst 0x4f43f01b // bfdot v27.4s, v0.8h, v3.h[0]\n"
+ ".inst 0x4f42f01f // bfdot v31.4s, v0.8h, v2.h[0]\n"
"bge 196b\n"
- "cbz x26, 199f\n"
"197:" // Height 6: Multiply loop: Skip odd blocks
- "ldr h0, [x25, #0x0]\n"
- "ldr h1, [x24, #0x0]\n"
- "ldr h2, [x23, #0x0]\n"
- "ldr h3, [x22, #0x0]\n"
- "ldr h4, [x21, #0x0]\n"
- "ldr h5, [x20, #0x0]\n"
+ "cbz x27, 199f\n"
+ "ldr h0, [x26, #0x0]\n"
+ "ldr h1, [x25, #0x0]\n"
+ "ldr h2, [x24, #0x0]\n"
+ "ldr h3, [x23, #0x0]\n"
+ "ldr h4, [x22, #0x0]\n"
+ "ldr h5, [x21, #0x0]\n"
"198:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x10, #0x0]\n"
- ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x4f40f0e8 // bfdot v8.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ec // bfdot v12.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f0 // bfdot v16.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f4 // bfdot v20.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f8 // bfdot v24.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0fc // bfdot v28.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x4f40f0c9 // bfdot v9.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0cd // bfdot v13.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d1 // bfdot v17.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d5 // bfdot v21.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0d9 // bfdot v25.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0dd // bfdot v29.4s, v6.8h, v5.h[0]\n"
+ "ldr q6, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
- ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
- ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
+ ".inst 0x4f40f0ea // bfdot v10.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ee // bfdot v14.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f2 // bfdot v18.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f6 // bfdot v22.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fa // bfdot v26.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0fe // bfdot v30.4s, v7.8h, v5.h[0]\n"
+ ".inst 0x4f40f0cb // bfdot v11.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0cf // bfdot v15.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d3 // bfdot v19.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d7 // bfdot v23.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0db // bfdot v27.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0df // bfdot v31.4s, v6.8h, v5.h[0]\n"
"199:" // Height 6: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 190b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19, LSL #2\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"tbz %x[flags], #1, 200f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
- "fmax v16.4s, v16.4s, v1.4s\n"
- "fmax v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmin v20.4s, v20.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v1.4s\n"
- "fmax v19.4s, v19.4s, v1.4s\n"
- "fmax v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v0.4s\n"
- "fmin v22.4s, v22.4s, v0.4s\n"
- "fmin v23.4s, v23.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v1.4s\n"
- "fmax v22.4s, v22.4s, v1.4s\n"
- "fmax v23.4s, v23.4s, v1.4s\n"
- "fmin v24.4s, v24.4s, v0.4s\n"
- "fmin v25.4s, v25.4s, v0.4s\n"
- "fmin v26.4s, v26.4s, v0.4s\n"
- "fmax v24.4s, v24.4s, v1.4s\n"
- "fmax v25.4s, v25.4s, v1.4s\n"
- "fmax v26.4s, v26.4s, v1.4s\n"
- "fmin v27.4s, v27.4s, v0.4s\n"
- "fmin v28.4s, v28.4s, v0.4s\n"
- "fmin v29.4s, v29.4s, v0.4s\n"
- "fmax v27.4s, v27.4s, v1.4s\n"
- "fmax v28.4s, v28.4s, v1.4s\n"
- "fmax v29.4s, v29.4s, v1.4s\n"
- "fmin v30.4s, v30.4s, v0.4s\n"
- "fmin v31.4s, v31.4s, v0.4s\n"
- "fmax v30.4s, v30.4s, v1.4s\n"
- "fmax v31.4s, v31.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "fmin v31.4s, v31.4s, v1.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v0.4s\n"
+ "fmax v29.4s, v29.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v0.4s\n"
+ "fmax v31.4s, v31.4s, v0.4s\n"
"200:" // Height 6: No activation
"cmp x11, #0x10\n"
"bge 209f\n"
"tbz x11, #3, 204f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v17.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v21.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
- "st1 { v25.4s }, [x21], #0x10\n"
- "st1 { v28.4s }, [x20], #0x10\n"
- "st1 { v29.4s }, [x20], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
"tbz x11, #2, 202f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v18.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x22], #0x10\n"
- "st1 { v26.4s }, [x21], #0x10\n"
- "st1 { v30.4s }, [x20], #0x10\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
"tbz x11, #1, 201f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
- "str d27, [x21], #0x8\n"
- "str d31, [x20], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x11, #0, 208f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
- "st1 { v23.s }[2], [x22]\n"
- "st1 { v27.s }[2], [x21]\n"
- "st1 { v31.s }[2], [x20]\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
"b 208f\n"
"201:" // Height 6: Partial direct writeback: partial_1_12
"tbz x11, #0, 208f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
- "str s23, [x22, #0x0]\n"
- "str s27, [x21, #0x0]\n"
- "str s31, [x20, #0x0]\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
"b 208f\n"
"202:" // Height 6: Partial direct writeback: partial_2_8
"tbz x11, #1, 203f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
- "str d26, [x21], #0x8\n"
- "str d30, [x20], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
"tbz x11, #0, 208f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v18.s }[2], [x23]\n"
- "st1 { v22.s }[2], [x22]\n"
- "st1 { v26.s }[2], [x21]\n"
- "st1 { v30.s }[2], [x20]\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
"b 208f\n"
"203:" // Height 6: Partial direct writeback: partial_1_8
"tbz x11, #0, 208f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s18, [x23, #0x0]\n"
- "str s22, [x22, #0x0]\n"
- "str s26, [x21, #0x0]\n"
- "str s30, [x20, #0x0]\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
"b 208f\n"
"204:" // Height 6: Partial direct writeback: partial_4_0
"tbz x11, #2, 206f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
- "st1 { v28.4s }, [x20], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
"tbz x11, #1, 205f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
- "str d25, [x21], #0x8\n"
- "str d29, [x20], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
"tbz x11, #0, 208f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v17.s }[2], [x23]\n"
- "st1 { v21.s }[2], [x22]\n"
- "st1 { v25.s }[2], [x21]\n"
- "st1 { v29.s }[2], [x20]\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
"b 208f\n"
"205:" // Height 6: Partial direct writeback: partial_1_4
"tbz x11, #0, 208f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s17, [x23, #0x0]\n"
- "str s21, [x22, #0x0]\n"
- "str s25, [x21, #0x0]\n"
- "str s29, [x20, #0x0]\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
"b 208f\n"
"206:" // Height 6: Partial direct writeback: partial_2_0
"tbz x11, #1, 207f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "str d28, [x20], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x11, #0, 208f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v16.s }[2], [x23]\n"
- "st1 { v20.s }[2], [x22]\n"
- "st1 { v24.s }[2], [x21]\n"
- "st1 { v28.s }[2], [x20]\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
"b 208f\n"
"207:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s16, [x23, #0x0]\n"
- "str s20, [x22, #0x0]\n"
- "str s24, [x21, #0x0]\n"
- "str s28, [x20, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
"208:" // Height 6: Partial direct writeback: Done
"b 210f\n"
"209:" // Height 6: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
- "str q24, [x21, #0x0]\n"
- "str q25, [x21, #0x10]\n"
- "str q26, [x21, #0x20]\n"
- "str q27, [x21, #0x30]\n"
- "str q28, [x20, #0x0]\n"
- "str q29, [x20, #0x10]\n"
- "str q30, [x20, #0x20]\n"
- "str q31, [x20, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
"210:" // Height 6: Writeback done
"subs x11, x11, #0x10\n"
"bgt 177b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 212f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 211f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"211:" // Update direct input
- "mov x19, #0xc\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0xc\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"212:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
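The hunks above renumber the kernel's scratch registers (x19 and upward become x20 and upward, with x12 replacing x19 in the clobber list) and reschedule the loads and prefetches, but the arithmetic is untouched: every multiply-accumulate is a lane-indexed BFDOT issued through a raw .inst encoding. As a reading aid, here is a minimal scalar model of that instruction form; this is hypothetical illustration code, not part of the patch.

// Scalar sketch of "bfdot vd.4s, vn.8h, vm.h[idx]" as used in the kernel
// above (illustration only). Each 32-bit lane d[i] accumulates the dot
// product of the bf16 pair (n[2i], n[2i+1]) with the pair of bf16 values
// selected from m by the lane index, matching the v0.h[0]..v5.h[3] operands.
#include <cstdint>
#include <cstring>

static float bf16_to_f32(uint16_t b)
{
    uint32_t u = static_cast<uint32_t>(b) << 16; // bf16 is the top half of an IEEE float
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}

void bfdot_lane(float d[4], const uint16_t n[8], const uint16_t m[8], int idx)
{
    const float m0 = bf16_to_f32(m[2 * idx]);
    const float m1 = bf16_to_f32(m[2 * idx + 1]);
    for (int i = 0; i < 4; ++i) {
        d[i] += bf16_to_f32(n[2 * i]) * m0 + bf16_to_f32(n[2 * i + 1]) * m1;
    }
}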
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
new file mode 100644
index 0000000000..d9e7259fa2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<bfloat16>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_bf16fp32_mmla_6x16( ARGLIST );
+
+class cls_a64_hybrid_bf16fp32_mmla_6x16
+{
+public:
+ typedef bfloat16 lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 25.04 };
+ case CPUModel::A510:
+ return { 7.27 };
+ case CPUModel::V1:
+ return { 40.09 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_bf16fp32_mmla_6x16;
+ cls_a64_hybrid_bf16fp32_mmla_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
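The interface above fixes the blocking at 6 output rows by 16 columns with k_unroll() of 4, and the generic.cpp that follows implements it with BFMMLA instead of BFDOT. The C++ prologue packs the mode bits into flags (accumulate in bit 0, min/max clamping in bit 1, indirect output in bit 2, indirect input in bit 3), which the assembly then tests with tbz. BFMMLA produces 2x2 output tiles, which is why the bias and accumulator setup zips pairs of rows together (zip1/zip2, the "MMLA fixup" blocks) and the writeback separates them again (uzp1/uzp2). A minimal scalar model of the instruction, again hypothetical illustration code rather than part of the patch:

// Scalar sketch of "bfmmla vd.4s, vn.8h, vm.8h" (illustration only). Vn and
// Vm are treated as row-major 2x4 bf16 matrices; vd, viewed as a row-major
// 2x2 f32 matrix, accumulates n * transpose(m).
#include <cstdint>
#include <cstring>

static float bf16_to_f32(uint16_t b)
{
    uint32_t u = static_cast<uint32_t>(b) << 16; // bf16 is the top half of an IEEE float
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}

void bfmmla(float d[2][2], const uint16_t n[2][4], const uint16_t m[2][4])
{
    for (int r = 0; r < 2; ++r) {
        for (int c = 0; c < 2; ++c) {
            for (int k = 0; k < 4; ++k) {
                d[r][c] += bf16_to_f32(n[r][k]) * bf16_to_f32(m[c][k]);
            }
        }
    }
}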
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..f6389e27d1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
@@ -0,0 +1,3687 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_bf16fp32_mmla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 186f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 149f\n"
+ "beq 112f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 75f\n"
+ "beq 38f\n"
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x12, 3f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "b 15f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 14f\n"
+ "cmp x11, #0x10\n"
+ "bge 12f\n"
+ "tbz x11, #3, 7f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "tbz x11, #2, 5f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 4f\n"
+ "ldr d16, [x9], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "b 11f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "b 11f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x11, #1, 6f\n"
+ "ldr d11, [x9], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "b 11f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "b 11f\n"
+ "7:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x11, #2, 9f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 8f\n"
+ "ldr d10, [x9], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "b 11f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "b 11f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x11, #1, 10f\n"
+ "ldr d9, [x9], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "b 11f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "mov x20, #0x0\n"
+ "11:" // Height 1: Partial accumulate: Done
+ "sub x9, x9, x20\n"
+ "b 13f\n"
+ "12:" // Height 1: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "13:" // Height 1: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 15f\n"
+ "14:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "15:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "16:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 18f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "b 18f\n"
+ "17:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "18:" // Height 1: input setup done
+ "cmp x27, #0x8\n"
+ "blt 21f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q6, [x10, #0x10]\n"
+ "blt 20f\n"
+ "19:" // Height 1: Multiply loop: Main loop head
+ "trn1 v20.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e47ee88 // bfmmla v8.4s, v20.8h, v7.8h\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6e46ee8c // bfmmla v12.4s, v20.8h, v6.8h\n"
+ "ldr q19, [x10, #0x30]\n"
+ ".inst 0x6e51ee89 // bfmmla v9.4s, v20.8h, v17.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e53ee8d // bfmmla v13.4s, v20.8h, v19.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ee8a // bfmmla v10.4s, v20.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ee8e // bfmmla v14.4s, v20.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e52ee8b // bfmmla v11.4s, v20.8h, v18.8h\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x6e51ee8f // bfmmla v15.4s, v20.8h, v17.8h\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xf0]\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
+ "ldr q1, [x26, #0x0]\n"
+ "add x10, x10, #0x100\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "bge 19b\n"
+ "20:" // Height 1: Multiply loop: Single iteration only
+ "trn1 v19.2d, v1.2d, v20.2d\n"
+ ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
+ "ldr q18, [x10, #0x30]\n"
+ ".inst 0x6e51ee69 // bfmmla v9.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x6e52ee6d // bfmmla v13.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x50]\n"
+ ".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x6e52ee6e // bfmmla v14.4s, v19.8h, v18.8h\n"
+ "ldr q24, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v20.2d\n"
+ ".inst 0x6e51ee6b // bfmmla v11.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x6e58ee6f // bfmmla v15.4s, v19.8h, v24.8h\n"
+ "ldr q2, [x10, #0x90]\n"
+ ".inst 0x6e51ec28 // bfmmla v8.4s, v1.8h, v17.8h\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x100\n"
+ "21:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 26f\n"
+ "cmp x27, #0x4\n"
+ "blt 23f\n"
+ "22:" // Height 1: Multiply loop: Odd block loop
+ "ldr d19, [x26], #0x8\n"
+ "ldr q18, [x10, #0x0]\n"
+ "trn1 v19.2d, v19.2d, v17.2d\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x10, x10, #0x80\n"
+ "bge 22b\n"
+ "23:" // Height 1: Multiply loop: Skip odd blocks
+ "cbz x27, 26f\n"
+ "tbz x27, #1, 24f\n"
+ "ldr s1, [x26], #0x4\n"
+ "tbz x27, #0, 25f\n"
+ "ld1 { v1.h }[2], [x26]\n"
+ "b 25f\n"
+ "24:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x26, #0x0]\n"
+ "25:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q20, [x10, #0x0]\n"
+ "ldr q18, [x10, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v17.2d\n"
+ ".inst 0x6e54ee68 // bfmmla v8.4s, v19.8h, v20.8h\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6e52ee6c // bfmmla v12.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x30]\n"
+ ".inst 0x6e51ee69 // bfmmla v9.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x6e52ee6d // bfmmla v13.4s, v19.8h, v18.8h\n"
+ "ldr q2, [x10, #0x50]\n"
+ ".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e42ee6e // bfmmla v14.4s, v19.8h, v2.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x10, x10, #0x80\n"
+ "26:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 16b\n"
+ "uzp1 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "uzp1 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v11.2d, v11.2d, v15.2d\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
+ "27:" // Height 1: No activation
+ "cmp x11, #0x10\n"
+ "bge 36f\n"
+ "tbz x11, #3, 31f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "tbz x11, #2, 29f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 28f\n"
+ "str d11, [x9], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "b 35f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 35f\n"
+ "str s11, [x9, #0x0]\n"
+ "b 35f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 30f\n"
+ "str d10, [x9], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "b 35f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 35f\n"
+ "str s10, [x9, #0x0]\n"
+ "b 35f\n"
+ "31:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 33f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 32f\n"
+ "str d9, [x9], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "b 35f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 35f\n"
+ "str s9, [x9, #0x0]\n"
+ "b 35f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 34f\n"
+ "str d8, [x9], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "b 35f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x9, #0x0]\n"
+ "35:" // Height 1: Partial direct writeback: Done
+ "b 37f\n"
+ "36:" // Height 1: Full writeback
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "37:" // Height 1: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 2b\n"
+ "b 224f\n"
+ "38:" // Height 2
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "39:" // Height 2: Column loop
+ "cbz x12, 40f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "b 52f\n"
+ "40:" // Height 2: no bias
+ "tbz %x[flags], #0, 51f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x25, x9, x20, LSL #2\n"
+ "bge 49f\n"
+ "tbz x11, #3, 44f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "tbz x11, #2, 42f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "tbz x11, #1, 41f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x11, #0, 48f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "b 48f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x11, #0, 48f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "b 48f\n"
+ "42:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x11, #1, 43f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x11, #0, 48f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "b 48f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x11, #0, 48f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "b 48f\n"
+ "44:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x11, #2, 46f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "tbz x11, #1, 45f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x11, #0, 48f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "b 48f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x11, #0, 48f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "b 48f\n"
+ "46:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x11, #1, 47f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x11, #0, 48f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "b 48f\n"
+ "47:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "48:" // Height 2: Partial accumulate: Done
+ "sub x9, x9, x20\n"
+ "b 50f\n"
+ "49:" // Height 2: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "50:" // Height 2: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 52f\n"
+ "51:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "52:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "53:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 54f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 55f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "b 55f\n"
+ "54:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "55:" // Height 2: input setup done
+ "cmp x27, #0x8\n"
+ "blt 58f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "blt 57f\n"
+ "56:" // Height 2: Multiply loop: Main loop head
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xf0]\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ "add x10, x10, #0x100\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "bge 56b\n"
+ "57:" // Height 2: Multiply loop: Single iteration only
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n"
+ "ldr q17, [x10, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n"
+ ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n"
+ "sub x27, x27, #0x8\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x10, x10, #0x100\n"
+ "58:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 63f\n"
+ "cmp x27, #0x4\n"
+ "blt 60f\n"
+ "59:" // Height 2: Multiply loop: Odd block loop
+ "ldr d18, [x26], #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "trn1 v19.2d, v18.2d, v17.2d\n"
+ "sub x27, x27, #0x4\n"
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ "ldr q5, [x10, #0x30]\n"
+ ".inst 0x6e5aee69 // bfmmla v9.4s, v19.8h, v26.8h\n"
+ ".inst 0x6e45ee6d // bfmmla v13.4s, v19.8h, v5.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ "ldr q17, [x10, #0x70]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x10, x10, #0x80\n"
+ "bge 59b\n"
+ "60:" // Height 2: Multiply loop: Skip odd blocks
+ "cbz x27, 63f\n"
+ "tbz x27, #1, 61f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "tbz x27, #0, 62f\n"
+ "ld1 { v1.h }[2], [x26]\n"
+ "ld1 { v2.h }[2], [x25]\n"
+ "b 62f\n"
+ "61:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x26, #0x0]\n"
+ "ldr h2, [x25, #0x0]\n"
+ "62:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n"
+ "ldr q30, [x10, #0x40]\n"
+ ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n"
+ "ldr q26, [x10, #0x50]\n"
+ ".inst 0x6e5eee6a // bfmmla v10.4s, v19.8h, v30.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e5aee6e // bfmmla v14.4s, v19.8h, v26.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n"
+ ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n"
+ "add x10, x10, #0x80\n"
+ "63:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 53b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "tbz %x[flags], #1, 64f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v18.4s\n"
+ "fmin v12.4s, v12.4s, v18.4s\n"
+ "fmin v13.4s, v13.4s, v18.4s\n"
+ "fmin v14.4s, v14.4s, v18.4s\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v7.4s, v7.4s, v17.4s\n"
+ "fmax v12.4s, v12.4s, v17.4s\n"
+ "fmax v13.4s, v13.4s, v17.4s\n"
+ "fmax v14.4s, v14.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
+ "64:" // Height 2: No activation
+ "cmp x11, #0x10\n"
+ "bge 73f\n"
+ "tbz x11, #3, 68f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "tbz x11, #2, 66f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "tbz x11, #1, 65f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "b 72f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 72f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "b 72f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 67f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "b 72f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 72f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "b 72f\n"
+ "68:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 70f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "tbz x11, #1, 69f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "b 72f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 72f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "b 72f\n"
+ "70:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 71f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "b 72f\n"
+ "71:" // Height 2: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "72:" // Height 2: Partial direct writeback: Done
+ "b 74f\n"
+ "73:" // Height 2: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "74:" // Height 2: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 39b\n"
+ "b 224f\n"
+ "75:" // Height 3
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "76:" // Height 3: Column loop
+ "cbz x12, 77f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "b 89f\n"
+ "77:" // Height 3: no bias
+ "tbz %x[flags], #0, 88f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x24, x25, x20, LSL #2\n"
+ "bge 86f\n"
+ "tbz x11, #3, 81f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 79f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 78f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x24], #0x8\n"
+ "tbz x11, #0, 85f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "b 85f\n"
+ "78:" // Height 3: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x11, #0, 85f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "b 85f\n"
+ "79:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x11, #1, 80f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x24], #0x8\n"
+ "tbz x11, #0, 85f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "b 85f\n"
+ "80:" // Height 3: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x11, #0, 85f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "b 85f\n"
+ "81:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x11, #2, 83f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 82f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x24], #0x8\n"
+ "tbz x11, #0, 85f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "b 85f\n"
+ "82:" // Height 3: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x11, #0, 85f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "b 85f\n"
+ "83:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x11, #1, 84f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "tbz x11, #0, 85f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "b 85f\n"
+ "84:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x24, #0x0]\n"
+ "85:" // Height 3: Partial accumulate: Done
+ "sub x9, x9, x20\n"
+ "b 87f\n"
+ "86:" // Height 3: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "87:" // Height 3: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 89f\n"
+ "88:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "89:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "90:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 91f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 92f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "b 92f\n"
+ "91:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "92:" // Height 3: input setup done
+ "cmp x27, #0x8\n"
+ "blt 95f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "blt 94f\n"
+ "93:" // Height 3: Multiply loop: Main loop head
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x90]\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "bge 93b\n"
+ "94:" // Height 3: Multiply loop: Single iteration only
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x90]\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
+ "95:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 100f\n"
+ "cmp x27, #0x4\n"
+ "blt 97f\n"
+ "96:" // Height 3: Multiply loop: Odd block loop
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr q26, [x10, #0x0]\n"
+ "trn1 v27.2d, v25.2d, v27.2d\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "bge 96b\n"
+ "97:" // Height 3: Multiply loop: Skip odd blocks
+ "cbz x27, 100f\n"
+ "tbz x27, #1, 98f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "tbz x27, #0, 99f\n"
+ "ld1 { v1.h }[2], [x26]\n"
+ "ld1 { v2.h }[2], [x25]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "b 99f\n"
+ "98:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x26, #0x0]\n"
+ "ldr h2, [x25, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "99:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v25.2d\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e5def8c // bfmmla v12.4s, v28.8h, v29.8h\n"
+ ".inst 0x6e5def74 // bfmmla v20.4s, v27.8h, v29.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "100:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 90b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "tbz %x[flags], #1, 101f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v7.4s, v7.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
+ "101:" // Height 3: No activation
+ "cmp x11, #0x10\n"
+ "bge 110f\n"
+ "tbz x11, #3, 105f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 103f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 102f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "b 109f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 109f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "b 109f\n"
+ "103:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 104f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "b 109f\n"
+ "104:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 109f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "b 109f\n"
+ "105:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 107f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 106f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "b 109f\n"
+ "106:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 109f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "b 109f\n"
+ "107:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 108f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "b 109f\n"
+ "108:" // Height 3: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "109:" // Height 3: Partial direct writeback: Done
+ "b 111f\n"
+ "110:" // Height 3: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "111:" // Height 3: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 76b\n"
+ "b 224f\n"
+ "112:" // Height 4
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "113:" // Height 4: Column loop
+ "cbz x12, 114f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "b 126f\n"
+ "114:" // Height 4: no bias
+ "tbz %x[flags], #0, 125f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
+ "bge 123f\n"
+ "tbz x11, #3, 118f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 116f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 115f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "tbz x11, #0, 122f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "b 122f\n"
+ "115:" // Height 4: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x11, #0, 122f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "b 122f\n"
+ "116:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x11, #1, 117f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "tbz x11, #0, 122f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "b 122f\n"
+ "117:" // Height 4: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x11, #0, 122f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "b 122f\n"
+ "118:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x11, #2, 120f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 119f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "tbz x11, #0, 122f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "b 122f\n"
+ "119:" // Height 4: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x11, #0, 122f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "b 122f\n"
+ "120:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x11, #1, 121f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "tbz x11, #0, 122f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "b 122f\n"
+ "121:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "122:" // Height 4: Partial accumulate: Done
+ "sub x9, x9, x20\n"
+ "b 124f\n"
+ "123:" // Height 4: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "124:" // Height 4: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 126f\n"
+ "125:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "126:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "127:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 128f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 129f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "b 129f\n"
+ "128:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "129:" // Height 4: input setup done
+ "cmp x27, #0x8\n"
+ "blt 132f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "blt 131f\n"
+ "130:" // Height 4: Multiply loop: Main loop head
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
+ "sub x27, x27, #0x8\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "add x23, x23, #0x10\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x90]\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "bge 130b\n"
+ "131:" // Height 4: Multiply loop: Single iteration only
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n"
+ ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x90]\n"
+ ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n"
+ "ldr q25, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n"
+ ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n"
+ ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n"
+ ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n"
+ "132:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 137f\n"
+ "cmp x27, #0x4\n"
+ "blt 134f\n"
+ "133:" // Height 4: Multiply loop: Odd block loop
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "sub x27, x27, #0x4\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "trn1 v27.2d, v26.2d, v25.2d\n"
+ "cmp x27, #0x4\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "bge 133b\n"
+ "134:" // Height 4: Multiply loop: Skip odd blocks
+ "cbz x27, 137f\n"
+ "tbz x27, #1, 135f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "tbz x27, #0, 136f\n"
+ "ld1 { v1.h }[2], [x26]\n"
+ "ld1 { v2.h }[2], [x25]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "ld1 { v4.h }[2], [x23]\n"
+ "b 136f\n"
+ "135:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x26, #0x0]\n"
+ "ldr h2, [x25, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "ldr h4, [x23, #0x0]\n"
+ "136:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n"
+ ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n"
+ ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n"
+ ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n"
+ "137:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 127b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "tbz %x[flags], #1, 138f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v7.4s, v7.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v15.4s, v15.4s, v25.4s\n"
+ "fmax v20.4s, v20.4s, v25.4s\n"
+ "fmax v21.4s, v21.4s, v25.4s\n"
+ "fmax v22.4s, v22.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
+ "138:" // Height 4: No activation
+ "cmp x11, #0x10\n"
+ "bge 147f\n"
+ "tbz x11, #3, 142f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 140f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 139f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "b 146f\n"
+ "139:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 146f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "b 146f\n"
+ "140:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 141f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "b 146f\n"
+ "141:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 146f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "b 146f\n"
+ "142:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 144f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 143f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "b 146f\n"
+ "143:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 146f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "b 146f\n"
+ "144:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 145f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "b 146f\n"
+ "145:" // Height 4: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "146:" // Height 4: Partial direct writeback: Done
+ "b 148f\n"
+ "147:" // Height 4: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "148:" // Height 4: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 113b\n"
+ "b 224f\n"
+ "149:" // Height 5
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "150:" // Height 5: Column loop
+ "cbz x12, 151f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v28.16b, v12.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v29.16b, v13.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v30.16b, v14.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v15.16b\n"
+ "b 163f\n"
+ "151:" // Height 5: no bias
+ "tbz %x[flags], #0, 162f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
+ "bge 160f\n"
+ "tbz x11, #3, 155f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 153f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v27.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 152f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d6, [x22], #0x8\n"
+ "tbz x11, #0, 159f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v6.s }[2], [x22]\n"
+ "b 159f\n"
+ "152:" // Height 5: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x11, #0, 159f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s6, [x22, #0x0]\n"
+ "b 159f\n"
+ "153:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x11, #1, 154f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "tbz x11, #0, 159f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "b 159f\n"
+ "154:" // Height 5: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x11, #0, 159f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "b 159f\n"
+ "155:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x11, #2, 157f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 156f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "tbz x11, #0, 159f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "b 159f\n"
+ "156:" // Height 5: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x11, #0, 159f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "b 159f\n"
+ "157:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x11, #1, 158f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "tbz x11, #0, 159f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "b 159f\n"
+ "158:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "159:" // Height 5: Partial accumulate: Done
+ "sub x9, x9, x20\n"
+ "b 161f\n"
+ "160:" // Height 5: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q25, [x22, #0x0]\n"
+ "ldr q26, [x22, #0x10]\n"
+ "ldr q27, [x22, #0x20]\n"
+ "ldr q6, [x22, #0x30]\n"
+ "161:" // Height 5: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 163f\n"
+ "162:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "163:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "164:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 165f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 166f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "b 166f\n"
+ "165:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "166:" // Height 5: input setup done
+ "cmp x27, #0x8\n"
+ "blt 169f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "blt 168f\n"
+ "167:" // Height 5: Multiply loop: Main loop head
+ "trn1 v6.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "sub x27, x27, #0x8\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e40eccf // bfmmla v15.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x90]\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
+ "ldr q6, [x10, #0xa0]\n"
+ ".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xb0]\n"
+ ".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xc0]\n"
+ ".inst 0x6e40ec2d // bfmmla v13.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xd0]\n"
+ ".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xe0]\n"
+ ".inst 0x6e40ec2e // bfmmla v14.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbe // bfmmla v30.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e46ec2b // bfmmla v11.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbb // bfmmla v27.4s, v5.8h, v6.8h\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x6e40ec2f // bfmmla v15.4s, v1.8h, v0.8h\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
+ "ldr q3, [x24, #0x0]\n"
+ ".inst 0x6e40ecbf // bfmmla v31.4s, v5.8h, v0.8h\n"
+ "ldr q5, [x22, #0x0]\n"
+ "bge 167b\n"
+ "168:" // Height 5: Multiply loop: Single iteration only
+ "trn1 v6.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n"
+ "add x22, x22, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x6e40eccf // bfmmla v15.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
+ "ldr q2, [x10, #0x90]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
+ "ldr q0, [x10, #0xa0]\n"
+ ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x10, #0xb0]\n"
+ ".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xc0]\n"
+ ".inst 0x6e42ec2d // bfmmla v13.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x10, #0xd0]\n"
+ ".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xe0]\n"
+ ".inst 0x6e42ec2e // bfmmla v14.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec76 // bfmmla v22.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbe // bfmmla v30.4s, v5.8h, v2.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e40ec2b // bfmmla v11.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbb // bfmmla v27.4s, v5.8h, v0.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
+ "169:" // Height 5: Multiply loop: Main loop skip
+ "cbz x27, 174f\n"
+ "cmp x27, #0x4\n"
+ "blt 171f\n"
+ "170:" // Height 5: Multiply loop: Odd block loop
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
+ "sub x27, x27, #0x4\n"
+ "ldr d0, [x22], #0x8\n"
+ "ldr q1, [x10, #0x0]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
+ ".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec71 // bfmmla v17.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec59 // bfmmla v25.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x40]\n"
+ ".inst 0x6e40ec8d // bfmmla v13.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5d // bfmmla v29.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e41ec8a // bfmmla v10.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec72 // bfmmla v18.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5a // bfmmla v26.4s, v2.8h, v1.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e40ec8e // bfmmla v14.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5f // bfmmla v31.4s, v2.8h, v0.8h\n"
+ "bge 170b\n"
+ "171:" // Height 5: Multiply loop: Skip odd blocks
+ "cbz x27, 174f\n"
+ "tbz x27, #1, 172f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x22], #0x4\n"
+ "tbz x27, #0, 173f\n"
+ "ld1 { v1.h }[2], [x26]\n"
+ "ld1 { v2.h }[2], [x25]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "ld1 { v4.h }[2], [x23]\n"
+ "ld1 { v5.h }[2], [x22]\n"
+ "b 173f\n"
+ "172:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x26, #0x0]\n"
+ "ldr h2, [x25, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "ldr h4, [x23, #0x0]\n"
+ "ldr h5, [x22, #0x0]\n"
+ "173:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x10, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ "trn1 v2.2d, v5.2d, v0.2d\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x6e46ece8 // bfmmla v8.4s, v7.8h, v6.8h\n"
+ ".inst 0x6e46ec70 // bfmmla v16.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e40ece9 // bfmmla v9.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec59 // bfmmla v25.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x40]\n"
+ ".inst 0x6e41eced // bfmmla v13.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec75 // bfmmla v21.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e40ecea // bfmmla v10.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5a // bfmmla v26.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x60]\n"
+ ".inst 0x6e41ecee // bfmmla v14.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5f // bfmmla v31.4s, v2.8h, v6.8h\n"
+ "174:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 164b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "tbz %x[flags], #1, 175f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmax v7.4s, v7.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "175:" // Height 5: No activation
+ "cmp x11, #0x10\n"
+ "bge 184f\n"
+ "tbz x11, #3, 179f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 177f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 176f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "b 183f\n"
+ "176:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 183f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "b 183f\n"
+ "177:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 178f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "b 183f\n"
+ "178:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 183f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "b 183f\n"
+ "179:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 181f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 180f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "b 183f\n"
+ "180:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 183f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "b 183f\n"
+ "181:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 182f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "b 183f\n"
+ "182:" // Height 5: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "183:" // Height 5: Partial direct writeback: Done
+ "b 185f\n"
+ "184:" // Height 5: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "185:" // Height 5: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 150b\n"
+ "b 224f\n"
+ "186:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "187:" // Height 6: Column loop
+ "cbz x12, 188f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v28.16b, v12.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v29.16b, v13.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v30.16b, v14.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v15.16b\n"
+ "b 200f\n"
+ "188:" // Height 6: no bias
+ "tbz %x[flags], #0, 199f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
+ "bge 197f\n"
+ "tbz x11, #3, 192f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 190f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 189f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d6, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x11, #0, 196f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v6.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 196f\n"
+ "189:" // Height 6: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x11, #0, 196f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s6, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 196f\n"
+ "190:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x11, #1, 191f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x11, #0, 196f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 196f\n"
+ "191:" // Height 6: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x11, #0, 196f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 196f\n"
+ "192:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x11, #2, 194f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 193f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x11, #0, 196f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 196f\n"
+ "193:" // Height 6: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x11, #0, 196f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 196f\n"
+ "194:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x11, #1, 195f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x11, #0, 196f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 196f\n"
+ "195:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "196:" // Height 6: Partial accumulate: Done
+ "sub x9, x9, x20\n"
+ "b 198f\n"
+ "197:" // Height 6: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q25, [x22, #0x0]\n"
+ "ldr q26, [x22, #0x10]\n"
+ "ldr q27, [x22, #0x20]\n"
+ "ldr q6, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "198:" // Height 6: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 200f\n"
+ "199:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "200:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "201:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 202f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 203f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "add x21, x21, x20, LSL #1\n"
+ "b 203f\n"
+ "202:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
+ "203:" // Height 6: input setup done
+ "cmp x27, #0x8\n"
+ "blt 206f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "blt 205f\n"
+ "204:" // Height 6: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "sub x27, x27, #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q2, [x25, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ "ldr q0, [x10, #0x90]\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
+ "ldr q6, [x10, #0xa0]\n"
+ ".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xb0]\n"
+ ".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xc0]\n"
+ ".inst 0x6e40ec2d // bfmmla v13.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xd0]\n"
+ ".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xe0]\n"
+ ".inst 0x6e40ec2e // bfmmla v14.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbe // bfmmla v30.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e46ec2b // bfmmla v11.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbb // bfmmla v27.4s, v5.8h, v6.8h\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x6e40ec2f // bfmmla v15.4s, v1.8h, v0.8h\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
+ "ldr q3, [x24, #0x0]\n"
+ ".inst 0x6e40ecbf // bfmmla v31.4s, v5.8h, v0.8h\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "bge 204b\n"
+ "205:" // Height 6: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ "ldr q2, [x10, #0x90]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
+ "ldr q0, [x10, #0xa0]\n"
+ ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x10, #0xb0]\n"
+ ".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xc0]\n"
+ ".inst 0x6e42ec2d // bfmmla v13.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n"
+ "ldr q2, [x10, #0xd0]\n"
+ ".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n"
+ "ldr q0, [x10, #0xe0]\n"
+ ".inst 0x6e42ec2e // bfmmla v14.4s, v1.8h, v2.8h\n"
+ ".inst 0x6e42ec76 // bfmmla v22.4s, v3.8h, v2.8h\n"
+ ".inst 0x6e42ecbe // bfmmla v30.4s, v5.8h, v2.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e40ec2b // bfmmla v11.4s, v1.8h, v0.8h\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ecbb // bfmmla v27.4s, v5.8h, v0.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
+ "206:" // Height 6: Multiply loop: Main loop skip
+ "cbz x27, 211f\n"
+ "cmp x27, #0x4\n"
+ "blt 208f\n"
+ "207:" // Height 6: Multiply loop: Odd block loop
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "sub x27, x27, #0x4\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
+ "cmp x27, #0x4\n"
+ "ldr d1, [x22], #0x8\n"
+ "ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec71 // bfmmla v17.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec59 // bfmmla v25.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x40]\n"
+ ".inst 0x6e40ec8d // bfmmla v13.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5d // bfmmla v29.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e41ec8a // bfmmla v10.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e41ec72 // bfmmla v18.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5a // bfmmla v26.4s, v2.8h, v1.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e40ec8e // bfmmla v14.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n"
+ ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5f // bfmmla v31.4s, v2.8h, v0.8h\n"
+ "bge 207b\n"
+ "208:" // Height 6: Multiply loop: Skip odd blocks
+ "cbz x27, 211f\n"
+ "tbz x27, #1, 209f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x22], #0x4\n"
+ "ldr s6, [x21], #0x4\n"
+ "tbz x27, #0, 210f\n"
+ "ld1 { v1.h }[2], [x26]\n"
+ "ld1 { v2.h }[2], [x25]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "ld1 { v4.h }[2], [x23]\n"
+ "ld1 { v5.h }[2], [x22]\n"
+ "ld1 { v6.h }[2], [x21]\n"
+ "b 210f\n"
+ "209:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x26, #0x0]\n"
+ "ldr h2, [x25, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "ldr h4, [x23, #0x0]\n"
+ "ldr h5, [x22, #0x0]\n"
+ "ldr h6, [x21, #0x0]\n"
+ "210:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q0, [x10, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e40ece8 // bfmmla v8.4s, v7.8h, v0.8h\n"
+ "trn1 v2.2d, v5.2d, v6.2d\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x6e40ec70 // bfmmla v16.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec58 // bfmmla v24.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e40ece9 // bfmmla v9.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec59 // bfmmla v25.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x40]\n"
+ ".inst 0x6e41eced // bfmmla v13.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec75 // bfmmla v21.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e40ecea // bfmmla v10.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5a // bfmmla v26.4s, v2.8h, v0.8h\n"
+ "ldr q0, [x10, #0x60]\n"
+ ".inst 0x6e41ecee // bfmmla v14.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n"
+ ".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n"
+ ".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n"
+ ".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ec5f // bfmmla v31.4s, v2.8h, v6.8h\n"
+ "211:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 201b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "tbz %x[flags], #1, 212f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v7.4s, v7.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmax v7.4s, v7.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v0.4s\n"
+ "fmax v29.4s, v29.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "212:" // Height 6: No activation
+ "cmp x11, #0x10\n"
+ "bge 221f\n"
+ "tbz x11, #3, 216f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 214f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 213f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "b 220f\n"
+ "213:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 220f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "b 220f\n"
+ "214:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 215f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d29, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "b 220f\n"
+ "215:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 220f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "b 220f\n"
+ "216:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 218f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 217f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d28, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "b 220f\n"
+ "217:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 220f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s28, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "b 220f\n"
+ "218:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 219f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "b 220f\n"
+ "219:" // Height 6: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "220:" // Height 6: Partial direct writeback: Done
+ "b 222f\n"
+ "221:" // Height 6: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q28, [x22, #0x10]\n"
+ "str q29, [x22, #0x20]\n"
+ "str q30, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "222:" // Height 6: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 187b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 224f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 223f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "223:" // Update direct input
+ "mov x20, #0xc\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "224:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
index 674d71d626..8b80c25beb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,19 +10,19 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+#ifdef __aarch64__
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -44,7 +44,8 @@ void a64_hybrid_fp16_mla_6x32_a55( ARGLIST );
class cls_a64_hybrid_fp16_mla_6x32
{
public:
- typedef __fp16 operand_type;
+ typedef __fp16 lhs_operand_type;
+ typedef __fp16 rhs_operand_type;
typedef __fp16 result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,24 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 6, 32, 1> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 32, 1> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 6.94 };
- default:
- return { 14.53 };
+ if (std::is_same<T, __fp16>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 6.94 };
+ case CPUModel::A510:
+ return { 8.94 };
+ case CPUModel::V1:
+ return { 29.26 };
+ default:
+ return { 14.53 };
+ }
}
+
+ return { 1.0 };
}
// Default to the generic kernel
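The hunk above makes get_performance_parameters a template over the operand type, so the same class can report a per-CPU throughput estimate for __fp16 and fall back to a generic { 1.0 } for anything else. A hedged caller-side sketch (variable names illustrative):

// Assuming a CPUInfo *ci from the surrounding arm_gemm machinery:
PerformanceParameters params =
    cls_a64_hybrid_fp16_mla_6x32::get_performance_parameters<__fp16>(ci);
// Any T other than __fp16 now yields the generic estimate { 1.0 }.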
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
index 87c73740e7..b049ed45f9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
@@ -92,9 +92,6 @@ void a64_hybrid_fp16_mla_6x32_a55 (
break;
}
__asm__ __volatile__(
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ".arch armv8.2-a+fp16\n"
-#endif
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 246f\n"
@@ -104,138 +101,138 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"cmp %x[M], #0x2\n"
"bgt 99f\n"
"beq 50f\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x16, %x[bias]\n"
- "mov x15, %x[output_ptr]\n"
+ "mov x16, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "cbz x16, 3f\n"
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "cbz x7, 3f\n"
+ "ldr q8, [x7, #0x0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "add x7, x7, #0x40\n"
"b 22f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 21f\n"
"cmp x8, #0x20\n"
"bge 20f\n"
"tbz x8, #4, 11f\n"
- "ld1 { v8.8h }, [x15], #0x10\n"
- "ld1 { v9.8h }, [x15], #0x10\n"
+ "ld1 { v8.8h }, [x16], #0x10\n"
+ "ld1 { v9.8h }, [x16], #0x10\n"
"tbz x8, #3, 7f\n"
- "ld1 { v10.8h }, [x15], #0x10\n"
+ "ld1 { v10.8h }, [x16], #0x10\n"
"tbz x8, #2, 5f\n"
- "ldr d11, [x15], #0x8\n"
+ "ldr d11, [x16], #0x8\n"
"tbz x8, #1, 4f\n"
- "mov x19, #0x3c\n"
- "ld1 { v11.s }[2], [x15], #0x4\n"
+ "ld1 { v11.s }[2], [x16], #0x4\n"
+ "mov x20, #0x3c\n"
"tbz x8, #0, 19f\n"
- "ld1 { v11.h }[6], [x15]\n"
+ "ld1 { v11.h }[6], [x16]\n"
"b 19f\n"
"4:" // Height 1: Partial accumulate: partial_1_28
- "mov x19, #0x38\n"
+ "mov x20, #0x38\n"
"tbz x8, #0, 19f\n"
- "ld1 { v11.h }[4], [x15]\n"
+ "ld1 { v11.h }[4], [x16]\n"
"b 19f\n"
"5:" // Height 1: Partial accumulate: partial_2_24
"tbz x8, #1, 6f\n"
- "ldr s11, [x15], #0x4\n"
- "mov x19, #0x34\n"
+ "ldr s11, [x16], #0x4\n"
+ "mov x20, #0x34\n"
"tbz x8, #0, 19f\n"
- "ld1 { v11.h }[2], [x15]\n"
+ "ld1 { v11.h }[2], [x16]\n"
"b 19f\n"
"6:" // Height 1: Partial accumulate: partial_1_24
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x8, #0, 19f\n"
- "ldr h11, [x15, #0x0]\n"
+ "ldr h11, [x16, #0x0]\n"
"b 19f\n"
"7:" // Height 1: Partial accumulate: partial_4_16
"tbz x8, #2, 9f\n"
- "ldr d10, [x15], #0x8\n"
+ "ldr d10, [x16], #0x8\n"
"tbz x8, #1, 8f\n"
- "mov x19, #0x2c\n"
- "ld1 { v10.s }[2], [x15], #0x4\n"
+ "ld1 { v10.s }[2], [x16], #0x4\n"
+ "mov x20, #0x2c\n"
"tbz x8, #0, 19f\n"
- "ld1 { v10.h }[6], [x15]\n"
+ "ld1 { v10.h }[6], [x16]\n"
"b 19f\n"
"8:" // Height 1: Partial accumulate: partial_1_20
- "mov x19, #0x28\n"
+ "mov x20, #0x28\n"
"tbz x8, #0, 19f\n"
- "ld1 { v10.h }[4], [x15]\n"
+ "ld1 { v10.h }[4], [x16]\n"
"b 19f\n"
"9:" // Height 1: Partial accumulate: partial_2_16
"tbz x8, #1, 10f\n"
- "ldr s10, [x15], #0x4\n"
- "mov x19, #0x24\n"
+ "ldr s10, [x16], #0x4\n"
+ "mov x20, #0x24\n"
"tbz x8, #0, 19f\n"
- "ld1 { v10.h }[2], [x15]\n"
+ "ld1 { v10.h }[2], [x16]\n"
"b 19f\n"
"10:" // Height 1: Partial accumulate: partial_1_16
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x8, #0, 19f\n"
- "ldr h10, [x15, #0x0]\n"
+ "ldr h10, [x16, #0x0]\n"
"b 19f\n"
"11:" // Height 1: Partial accumulate: partial_8_0
"tbz x8, #3, 15f\n"
- "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v8.8h }, [x16], #0x10\n"
"tbz x8, #2, 13f\n"
- "ldr d9, [x15], #0x8\n"
+ "ldr d9, [x16], #0x8\n"
"tbz x8, #1, 12f\n"
- "mov x19, #0x1c\n"
- "ld1 { v9.s }[2], [x15], #0x4\n"
+ "ld1 { v9.s }[2], [x16], #0x4\n"
+ "mov x20, #0x1c\n"
"tbz x8, #0, 19f\n"
- "ld1 { v9.h }[6], [x15]\n"
+ "ld1 { v9.h }[6], [x16]\n"
"b 19f\n"
"12:" // Height 1: Partial accumulate: partial_1_12
- "mov x19, #0x18\n"
+ "mov x20, #0x18\n"
"tbz x8, #0, 19f\n"
- "ld1 { v9.h }[4], [x15]\n"
+ "ld1 { v9.h }[4], [x16]\n"
"b 19f\n"
"13:" // Height 1: Partial accumulate: partial_2_8
"tbz x8, #1, 14f\n"
- "ldr s9, [x15], #0x4\n"
- "mov x19, #0x14\n"
+ "ldr s9, [x16], #0x4\n"
+ "mov x20, #0x14\n"
"tbz x8, #0, 19f\n"
- "ld1 { v9.h }[2], [x15]\n"
+ "ld1 { v9.h }[2], [x16]\n"
"b 19f\n"
"14:" // Height 1: Partial accumulate: partial_1_8
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x8, #0, 19f\n"
- "ldr h9, [x15, #0x0]\n"
+ "ldr h9, [x16, #0x0]\n"
"b 19f\n"
"15:" // Height 1: Partial accumulate: partial_4_0
"tbz x8, #2, 17f\n"
- "ldr d8, [x15], #0x8\n"
+ "ldr d8, [x16], #0x8\n"
"tbz x8, #1, 16f\n"
- "mov x19, #0xc\n"
- "ld1 { v8.s }[2], [x15], #0x4\n"
+ "ld1 { v8.s }[2], [x16], #0x4\n"
+ "mov x20, #0xc\n"
"tbz x8, #0, 19f\n"
- "ld1 { v8.h }[6], [x15]\n"
+ "ld1 { v8.h }[6], [x16]\n"
"b 19f\n"
"16:" // Height 1: Partial accumulate: partial_1_4
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"tbz x8, #0, 19f\n"
- "ld1 { v8.h }[4], [x15]\n"
+ "ld1 { v8.h }[4], [x16]\n"
"b 19f\n"
"17:" // Height 1: Partial accumulate: partial_2_0
"tbz x8, #1, 18f\n"
- "ldr s8, [x15], #0x4\n"
- "mov x19, #0x4\n"
+ "ldr s8, [x16], #0x4\n"
+ "mov x20, #0x4\n"
"tbz x8, #0, 19f\n"
- "ld1 { v8.h }[2], [x15]\n"
+ "ld1 { v8.h }[2], [x16]\n"
"b 19f\n"
"18:" // Height 1: Partial accumulate: partial_1_0
- "ldr h8, [x15, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr h8, [x16, #0x0]\n"
+ "mov x20, #0x0\n"
"19:" // Height 1: Partial accumulate: Done
- "sub x15, x15, x19\n"
+ "sub x16, x16, x20\n"
"b 22f\n"
"20:" // Height 1: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
"b 22f\n"
"21:" // Height 1: no accumulate
"movi v8.16b, #0x0\n"
@@ -243,546 +240,546 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
"22:" // Height 1: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"23:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 24f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "cbnz x14, 25f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "cbnz x15, 25f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20, LSL #1\n"
"b 25f\n"
"24:" // Height 1: setup direct input
- "mov x12, %x[input_ptr]\n"
+ "mov x13, %x[input_ptr]\n"
"25:" // Height 1: input setup done
- "cmp x13, #0x8\n"
+ "cmp x14, #0x8\n"
"blt 28f\n"
- "ldr q0, [x12, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x10\n"
"ldr q6, [x17, #0x0]\n"
- "cmp x13, #0x10\n"
+ "ldr q7, [x17, #0x10]\n"
"blt 27f\n"
"26:" // Height 1: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr d7, [x17, #0x10]\n"
- "ldr x11, [x17, #0x18]\n"
- "add x12, x12, #0x10\n"
- "ldr d6, [x17, #0x20]\n"
- "sub x13, x13, #0x8\n"
- "ldr x10, [x17, #0x28]\n"
- "cmp x13, #0x10\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr d17, [x17, #0x20]\n"
+ "ldr x20, [x17, #0x28]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "ldr x10, [x17, #0x48]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x58]\n"
- "ldr x9, [x12, #0x8]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "ldr x10, [x17, #0x68]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "ldr x10, [x17, #0x88]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "ldr x10, [x17, #0xa8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "ldr x10, [x17, #0xc8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "ldr x10, [x17, #0xe8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr d6, [x17, #0x100]\n"
- "ldr x10, [x17, #0x108]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x118]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr d6, [x17, #0x120]\n"
- "ldr x10, [x17, #0x128]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x138]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr d6, [x17, #0x140]\n"
- "ldr x10, [x17, #0x148]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x158]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr d6, [x17, #0x160]\n"
- "ldr x10, [x17, #0x168]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x178]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr d6, [x17, #0x180]\n"
- "ldr x10, [x17, #0x188]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x198]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr d6, [x17, #0x1a0]\n"
- "ldr x10, [x17, #0x1a8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x1b8]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr d6, [x17, #0x1c0]\n"
- "ldr x10, [x17, #0x1c8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x1d8]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr d6, [x17, #0x1e0]\n"
- "ldr x10, [x17, #0x1e8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x1f8]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x1f0]\n"
+ "ldr d16, [x17, #0x30]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x38]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "ldr d17, [x17, #0x40]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr d16, [x17, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr d17, [x17, #0x60]\n"
+ "ldr x20, [x17, #0x68]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr d16, [x17, #0x70]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x78]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "ldr d17, [x17, #0x80]\n"
+ "ldr x20, [x17, #0x88]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr d16, [x17, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr d17, [x17, #0xa0]\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr d16, [x17, #0xb0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "ldr d17, [x17, #0xc0]\n"
+ "ldr x20, [x17, #0xc8]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr d16, [x17, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr d17, [x17, #0xe0]\n"
+ "ldr x20, [x17, #0xe8]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr d16, [x17, #0xf0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "ldr d17, [x17, #0x100]\n"
+ "ldr x20, [x17, #0x108]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr d16, [x17, #0x110]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x118]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr d17, [x17, #0x120]\n"
+ "ldr x20, [x17, #0x128]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr d16, [x17, #0x130]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x138]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "ldr d17, [x17, #0x140]\n"
+ "ldr x20, [x17, #0x148]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr d16, [x17, #0x150]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x158]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr d17, [x17, #0x160]\n"
+ "ldr x20, [x17, #0x168]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr d16, [x17, #0x170]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x178]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "ldr d17, [x17, #0x180]\n"
+ "ldr x20, [x17, #0x188]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr d16, [x17, #0x190]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x198]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr d17, [x17, #0x1a0]\n"
+ "ldr x20, [x17, #0x1a8]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr d16, [x17, #0x1b0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x1b8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "ldr d17, [x17, #0x1c0]\n"
+ "ldr x20, [x17, #0x1c8]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr d16, [x17, #0x1d0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x1d8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr d17, [x17, #0x1e0]\n"
+ "ldr x20, [x17, #0x1e8]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr d16, [x17, #0x1f0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x1f8]\n"
+ "mov v16.d[1], x20\n"
+ "add x13, x13, #0x10\n"
"add x17, x17, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
"ldr d6, [x17, #0x0]\n"
- "ldr x10, [x17, #0x8]\n"
- "mov v7.d[1], x11\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "mov v6.d[1], x10\n"
- "ldr d0, [x12, #0x0]\n"
- "mov v0.d[1], x9\n"
+ "ldr x20, [x17, #0x8]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "ldr d0, [x13, #0x0]\n"
+ "sub x14, x14, #0x8\n"
+ "ldr d7, [x17, #0x10]\n"
+ "cmp x14, #0x10\n"
+ "ldr x21, [x13, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v0.d[1], x21\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"bge 26b\n"
"27:" // Height 1: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "ldr q6, [x17, #0x20]\n"
- "sub x13, x13, #0x8\n"
- "add x12, x12, #0x10\n"
+ "ldr q17, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr q6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr q7, [x17, #0x1f0]\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x17, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x17, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x17, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x17, #0x70]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x17, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x17, #0x90]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x17, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x17, #0xb0]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x17, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x17, #0xd0]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x17, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x17, #0xf0]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x17, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x17, #0x110]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x17, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x17, #0x130]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x17, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x17, #0x150]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x17, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x17, #0x170]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x17, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x17, #0x190]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x17, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x17, #0x1b0]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x17, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x17, #0x1d0]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr q17, [x17, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr q16, [x17, #0x1f0]\n"
+ "add x13, x13, #0x10\n"
+ "sub x14, x14, #0x8\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
"add x17, x17, #0x200\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
"28:" // Height 1: Multiply loop: Main loop skip
- "cbz x13, 30f\n"
+ "cbz x14, 30f\n"
"29:" // Height 1: Multiply loop: Odd block loop
- "ldr h0, [x12], #0x2\n"
- "sub x13, x13, #0x1\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
+ "ldr h0, [x13], #0x2\n"
+ "sub x14, x14, #0x1\n"
+ "ldr q16, [x17, #0x0]\n"
+ "fmla v8.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x17, #0x10]\n"
+ "fmla v9.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x17, #0x20]\n"
+ "fmla v10.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
"add x17, x17, #0x40\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "cbnz x13, 29b\n"
+ "cbnz x14, 29b\n"
"30:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 23b\n"
- "prfm pstl1keep, [x15, #0x0]\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"tbz %x[flags], #1, 31f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.8h }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.8h }, [x19]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmax v8.8h, v8.8h, v1.8h\n"
- "fmax v9.8h, v9.8h, v1.8h\n"
- "fmax v10.8h, v10.8h, v1.8h\n"
- "fmax v11.8h, v11.8h, v1.8h\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v16.8h\n"
+ "fmin v9.8h, v9.8h, v16.8h\n"
+ "fmin v10.8h, v10.8h, v16.8h\n"
+ "fmin v11.8h, v11.8h, v16.8h\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
+ "fmax v10.8h, v10.8h, v16.8h\n"
+ "fmax v11.8h, v11.8h, v16.8h\n"
"31:" // Height 1: No activation
"cmp x8, #0x20\n"
"bge 48f\n"
"tbz x8, #4, 39f\n"
- "st1 { v8.8h }, [x15], #0x10\n"
- "st1 { v9.8h }, [x15], #0x10\n"
+ "st1 { v8.8h }, [x16], #0x10\n"
+ "st1 { v9.8h }, [x16], #0x10\n"
"tbz x8, #3, 35f\n"
- "st1 { v10.8h }, [x15], #0x10\n"
+ "st1 { v10.8h }, [x16], #0x10\n"
"tbz x8, #2, 33f\n"
- "str d11, [x15], #0x8\n"
+ "str d11, [x16], #0x8\n"
"tbz x8, #1, 32f\n"
- "st1 { v11.s }[2], [x15], #0x4\n"
+ "st1 { v11.s }[2], [x16], #0x4\n"
"tbz x8, #0, 47f\n"
- "st1 { v11.h }[6], [x15]\n"
+ "st1 { v11.h }[6], [x16]\n"
"b 47f\n"
"32:" // Height 1: Partial direct writeback: partial_1_28
"tbz x8, #0, 47f\n"
- "st1 { v11.h }[4], [x15]\n"
+ "st1 { v11.h }[4], [x16]\n"
"b 47f\n"
"33:" // Height 1: Partial direct writeback: partial_2_24
"tbz x8, #1, 34f\n"
- "str s11, [x15], #0x4\n"
+ "str s11, [x16], #0x4\n"
"tbz x8, #0, 47f\n"
- "st1 { v11.h }[2], [x15]\n"
+ "st1 { v11.h }[2], [x16]\n"
"b 47f\n"
"34:" // Height 1: Partial direct writeback: partial_1_24
"tbz x8, #0, 47f\n"
- "str h11, [x15, #0x0]\n"
+ "str h11, [x16, #0x0]\n"
"b 47f\n"
"35:" // Height 1: Partial direct writeback: partial_4_16
"tbz x8, #2, 37f\n"
- "str d10, [x15], #0x8\n"
+ "str d10, [x16], #0x8\n"
"tbz x8, #1, 36f\n"
- "st1 { v10.s }[2], [x15], #0x4\n"
+ "st1 { v10.s }[2], [x16], #0x4\n"
"tbz x8, #0, 47f\n"
- "st1 { v10.h }[6], [x15]\n"
+ "st1 { v10.h }[6], [x16]\n"
"b 47f\n"
"36:" // Height 1: Partial direct writeback: partial_1_20
"tbz x8, #0, 47f\n"
- "st1 { v10.h }[4], [x15]\n"
+ "st1 { v10.h }[4], [x16]\n"
"b 47f\n"
"37:" // Height 1: Partial direct writeback: partial_2_16
"tbz x8, #1, 38f\n"
- "str s10, [x15], #0x4\n"
+ "str s10, [x16], #0x4\n"
"tbz x8, #0, 47f\n"
- "st1 { v10.h }[2], [x15]\n"
+ "st1 { v10.h }[2], [x16]\n"
"b 47f\n"
"38:" // Height 1: Partial direct writeback: partial_1_16
"tbz x8, #0, 47f\n"
- "str h10, [x15, #0x0]\n"
+ "str h10, [x16, #0x0]\n"
"b 47f\n"
"39:" // Height 1: Partial direct writeback: partial_8_0
"tbz x8, #3, 43f\n"
- "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v8.8h }, [x16], #0x10\n"
"tbz x8, #2, 41f\n"
- "str d9, [x15], #0x8\n"
+ "str d9, [x16], #0x8\n"
"tbz x8, #1, 40f\n"
- "st1 { v9.s }[2], [x15], #0x4\n"
+ "st1 { v9.s }[2], [x16], #0x4\n"
"tbz x8, #0, 47f\n"
- "st1 { v9.h }[6], [x15]\n"
+ "st1 { v9.h }[6], [x16]\n"
"b 47f\n"
"40:" // Height 1: Partial direct writeback: partial_1_12
"tbz x8, #0, 47f\n"
- "st1 { v9.h }[4], [x15]\n"
+ "st1 { v9.h }[4], [x16]\n"
"b 47f\n"
"41:" // Height 1: Partial direct writeback: partial_2_8
"tbz x8, #1, 42f\n"
- "str s9, [x15], #0x4\n"
+ "str s9, [x16], #0x4\n"
"tbz x8, #0, 47f\n"
- "st1 { v9.h }[2], [x15]\n"
+ "st1 { v9.h }[2], [x16]\n"
"b 47f\n"
"42:" // Height 1: Partial direct writeback: partial_1_8
"tbz x8, #0, 47f\n"
- "str h9, [x15, #0x0]\n"
+ "str h9, [x16, #0x0]\n"
"b 47f\n"
"43:" // Height 1: Partial direct writeback: partial_4_0
"tbz x8, #2, 45f\n"
- "str d8, [x15], #0x8\n"
+ "str d8, [x16], #0x8\n"
"tbz x8, #1, 44f\n"
- "st1 { v8.s }[2], [x15], #0x4\n"
+ "st1 { v8.s }[2], [x16], #0x4\n"
"tbz x8, #0, 47f\n"
- "st1 { v8.h }[6], [x15]\n"
+ "st1 { v8.h }[6], [x16]\n"
"b 47f\n"
"44:" // Height 1: Partial direct writeback: partial_1_4
"tbz x8, #0, 47f\n"
- "st1 { v8.h }[4], [x15]\n"
+ "st1 { v8.h }[4], [x16]\n"
"b 47f\n"
"45:" // Height 1: Partial direct writeback: partial_2_0
"tbz x8, #1, 46f\n"
- "str s8, [x15], #0x4\n"
+ "str s8, [x16], #0x4\n"
"tbz x8, #0, 47f\n"
- "st1 { v8.h }[2], [x15]\n"
+ "st1 { v8.h }[2], [x16]\n"
"b 47f\n"
"46:" // Height 1: Partial direct writeback: partial_1_0
- "str h8, [x15, #0x0]\n"
+ "str h8, [x16, #0x0]\n"
"47:" // Height 1: Partial direct writeback: Done
"b 49f\n"
"48:" // Height 1: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x16, #0x0]\n"
+ "str q9, [x16, #0x10]\n"
+ "str q10, [x16, #0x20]\n"
+ "str q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
"49:" // Height 1: Writeback done
"subs x8, x8, #0x20\n"
"bgt 2b\n"
"b 296f\n"
"50:" // Height 2
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x16, %x[bias]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[output_ptr]\n"
+ "mov x16, %x[output_ptr]\n"
"51:" // Height 2: Column loop
- "cbz x16, 52f\n"
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
+ "cbz x7, 52f\n"
+ "ldr q8, [x7, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "add x16, x16, #0x40\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
+ "add x7, x7, #0x40\n"
"b 71f\n"
"52:" // Height 2: no bias
"tbz %x[flags], #0, 70f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x8, #0x20\n"
- "add x25, x15, x19, LSL #1\n"
+ "add x25, x16, x20, LSL #1\n"
"bge 69f\n"
"tbz x8, #4, 60f\n"
- "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v8.8h }, [x16], #0x10\n"
"ld1 { v12.8h }, [x25], #0x10\n"
- "ld1 { v9.8h }, [x15], #0x10\n"
+ "ld1 { v9.8h }, [x16], #0x10\n"
"ld1 { v13.8h }, [x25], #0x10\n"
"tbz x8, #3, 56f\n"
- "ld1 { v10.8h }, [x15], #0x10\n"
+ "ld1 { v10.8h }, [x16], #0x10\n"
"ld1 { v14.8h }, [x25], #0x10\n"
"tbz x8, #2, 54f\n"
- "ldr d11, [x15], #0x8\n"
+ "ldr d11, [x16], #0x8\n"
"ldr d15, [x25], #0x8\n"
"tbz x8, #1, 53f\n"
- "mov x19, #0x3c\n"
- "ld1 { v11.s }[2], [x15], #0x4\n"
+ "ld1 { v11.s }[2], [x16], #0x4\n"
+ "mov x20, #0x3c\n"
"ld1 { v15.s }[2], [x25], #0x4\n"
"tbz x8, #0, 68f\n"
- "ld1 { v11.h }[6], [x15]\n"
+ "ld1 { v11.h }[6], [x16]\n"
"ld1 { v15.h }[6], [x25]\n"
"b 68f\n"
"53:" // Height 2: Partial accumulate: partial_1_28
- "mov x19, #0x38\n"
+ "mov x20, #0x38\n"
"tbz x8, #0, 68f\n"
- "ld1 { v11.h }[4], [x15]\n"
+ "ld1 { v11.h }[4], [x16]\n"
"ld1 { v15.h }[4], [x25]\n"
"b 68f\n"
"54:" // Height 2: Partial accumulate: partial_2_24
"tbz x8, #1, 55f\n"
- "ldr s11, [x15], #0x4\n"
+ "ldr s11, [x16], #0x4\n"
+ "mov x20, #0x34\n"
"ldr s15, [x25], #0x4\n"
- "mov x19, #0x34\n"
"tbz x8, #0, 68f\n"
- "ld1 { v11.h }[2], [x15]\n"
+ "ld1 { v11.h }[2], [x16]\n"
"ld1 { v15.h }[2], [x25]\n"
"b 68f\n"
"55:" // Height 2: Partial accumulate: partial_1_24
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x8, #0, 68f\n"
- "ldr h11, [x15, #0x0]\n"
+ "ldr h11, [x16, #0x0]\n"
"ldr h15, [x25, #0x0]\n"
"b 68f\n"
"56:" // Height 2: Partial accumulate: partial_4_16
"tbz x8, #2, 58f\n"
- "ldr d10, [x15], #0x8\n"
+ "ldr d10, [x16], #0x8\n"
"ldr d14, [x25], #0x8\n"
"tbz x8, #1, 57f\n"
- "mov x19, #0x2c\n"
- "ld1 { v10.s }[2], [x15], #0x4\n"
+ "ld1 { v10.s }[2], [x16], #0x4\n"
+ "mov x20, #0x2c\n"
"ld1 { v14.s }[2], [x25], #0x4\n"
"tbz x8, #0, 68f\n"
- "ld1 { v10.h }[6], [x15]\n"
+ "ld1 { v10.h }[6], [x16]\n"
"ld1 { v14.h }[6], [x25]\n"
"b 68f\n"
"57:" // Height 2: Partial accumulate: partial_1_20
- "mov x19, #0x28\n"
+ "mov x20, #0x28\n"
"tbz x8, #0, 68f\n"
- "ld1 { v10.h }[4], [x15]\n"
+ "ld1 { v10.h }[4], [x16]\n"
"ld1 { v14.h }[4], [x25]\n"
"b 68f\n"
"58:" // Height 2: Partial accumulate: partial_2_16
"tbz x8, #1, 59f\n"
- "ldr s10, [x15], #0x4\n"
+ "ldr s10, [x16], #0x4\n"
+ "mov x20, #0x24\n"
"ldr s14, [x25], #0x4\n"
- "mov x19, #0x24\n"
"tbz x8, #0, 68f\n"
- "ld1 { v10.h }[2], [x15]\n"
+ "ld1 { v10.h }[2], [x16]\n"
"ld1 { v14.h }[2], [x25]\n"
"b 68f\n"
"59:" // Height 2: Partial accumulate: partial_1_16
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x8, #0, 68f\n"
- "ldr h10, [x15, #0x0]\n"
+ "ldr h10, [x16, #0x0]\n"
"ldr h14, [x25, #0x0]\n"
"b 68f\n"
"60:" // Height 2: Partial accumulate: partial_8_0
"tbz x8, #3, 64f\n"
- "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v8.8h }, [x16], #0x10\n"
"ld1 { v12.8h }, [x25], #0x10\n"
"tbz x8, #2, 62f\n"
- "ldr d9, [x15], #0x8\n"
+ "ldr d9, [x16], #0x8\n"
"ldr d13, [x25], #0x8\n"
"tbz x8, #1, 61f\n"
- "mov x19, #0x1c\n"
- "ld1 { v9.s }[2], [x15], #0x4\n"
+ "ld1 { v9.s }[2], [x16], #0x4\n"
+ "mov x20, #0x1c\n"
"ld1 { v13.s }[2], [x25], #0x4\n"
"tbz x8, #0, 68f\n"
- "ld1 { v9.h }[6], [x15]\n"
+ "ld1 { v9.h }[6], [x16]\n"
"ld1 { v13.h }[6], [x25]\n"
"b 68f\n"
"61:" // Height 2: Partial accumulate: partial_1_12
- "mov x19, #0x18\n"
+ "mov x20, #0x18\n"
"tbz x8, #0, 68f\n"
- "ld1 { v9.h }[4], [x15]\n"
+ "ld1 { v9.h }[4], [x16]\n"
"ld1 { v13.h }[4], [x25]\n"
"b 68f\n"
"62:" // Height 2: Partial accumulate: partial_2_8
"tbz x8, #1, 63f\n"
- "ldr s9, [x15], #0x4\n"
+ "ldr s9, [x16], #0x4\n"
+ "mov x20, #0x14\n"
"ldr s13, [x25], #0x4\n"
- "mov x19, #0x14\n"
"tbz x8, #0, 68f\n"
- "ld1 { v9.h }[2], [x15]\n"
+ "ld1 { v9.h }[2], [x16]\n"
"ld1 { v13.h }[2], [x25]\n"
"b 68f\n"
"63:" // Height 2: Partial accumulate: partial_1_8
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x8, #0, 68f\n"
- "ldr h9, [x15, #0x0]\n"
+ "ldr h9, [x16, #0x0]\n"
"ldr h13, [x25, #0x0]\n"
"b 68f\n"
"64:" // Height 2: Partial accumulate: partial_4_0
"tbz x8, #2, 66f\n"
- "ldr d8, [x15], #0x8\n"
+ "ldr d8, [x16], #0x8\n"
"ldr d12, [x25], #0x8\n"
"tbz x8, #1, 65f\n"
- "mov x19, #0xc\n"
- "ld1 { v8.s }[2], [x15], #0x4\n"
+ "ld1 { v8.s }[2], [x16], #0x4\n"
+ "mov x20, #0xc\n"
"ld1 { v12.s }[2], [x25], #0x4\n"
"tbz x8, #0, 68f\n"
- "ld1 { v8.h }[6], [x15]\n"
+ "ld1 { v8.h }[6], [x16]\n"
"ld1 { v12.h }[6], [x25]\n"
"b 68f\n"
"65:" // Height 2: Partial accumulate: partial_1_4
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"tbz x8, #0, 68f\n"
- "ld1 { v8.h }[4], [x15]\n"
+ "ld1 { v8.h }[4], [x16]\n"
"ld1 { v12.h }[4], [x25]\n"
"b 68f\n"
"66:" // Height 2: Partial accumulate: partial_2_0
"tbz x8, #1, 67f\n"
- "ldr s8, [x15], #0x4\n"
+ "ldr s8, [x16], #0x4\n"
+ "mov x20, #0x4\n"
"ldr s12, [x25], #0x4\n"
- "mov x19, #0x4\n"
"tbz x8, #0, 68f\n"
- "ld1 { v8.h }[2], [x15]\n"
+ "ld1 { v8.h }[2], [x16]\n"
"ld1 { v12.h }[2], [x25]\n"
"b 68f\n"
"67:" // Height 2: Partial accumulate: partial_1_0
- "ldr h8, [x15, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr h8, [x16, #0x0]\n"
+ "mov x20, #0x0\n"
"ldr h12, [x25, #0x0]\n"
"68:" // Height 2: Partial accumulate: Done
- "sub x15, x15, x19\n"
+ "sub x16, x16, x20\n"
"b 71f\n"
"69:" // Height 2: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
"ldr q12, [x25, #0x0]\n"
"ldr q13, [x25, #0x10]\n"
"ldr q14, [x25, #0x20]\n"
@@ -798,494 +795,494 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"movi v14.16b, #0x0\n"
"movi v15.16b, #0x0\n"
"71:" // Height 2: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"72:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x14, 74f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "cbnz x15, 74f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n"
"b 74f\n"
"73:" // Height 2: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19, LSL #1\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21, LSL #1\n"
"74:" // Height 2: input setup done
- "cmp x13, #0x8\n"
+ "cmp x14, #0x8\n"
"blt 77f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x10\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x10\n"
+ "ldr q1, [x12, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
"blt 76f\n"
"75:" // Height 2: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x18]\n"
- "ldr d6, [x17, #0x20]\n"
- "add x12, x12, #0x10\n"
- "ldr x10, [x17, #0x28]\n"
- "add x28, x28, #0x10\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "sub x13, x13, #0x8\n"
+ "ldr d17, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x10\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr x11, [x17, #0x38]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "ldr x10, [x17, #0x48]\n"
- "cmp x13, #0x10\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x58]\n"
- "ldr x9, [x12, #0x8]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x10\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr x10, [x17, #0x68]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x78]\n"
- "ldr x27, [x28, #0x8]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "mov v6.d[1], x10\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr x10, [x17, #0x88]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "mov v6.d[1], x10\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr x10, [x17, #0xa8]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "mov v6.d[1], x10\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr x10, [x17, #0xc8]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "mov v6.d[1], x10\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr x10, [x17, #0xe8]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "mov v6.d[1], x10\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr x10, [x17, #0x108]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr d6, [x17, #0x100]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x118]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "mov v6.d[1], x10\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr d7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr x10, [x17, #0x128]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr d6, [x17, #0x120]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x138]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "mov v6.d[1], x10\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr d7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr x10, [x17, #0x148]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr d6, [x17, #0x140]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x158]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "mov v6.d[1], x10\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr d7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr x10, [x17, #0x168]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr d6, [x17, #0x160]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x178]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "mov v6.d[1], x10\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr d7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr x10, [x17, #0x188]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr d6, [x17, #0x180]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x198]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "mov v6.d[1], x10\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr d7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr x10, [x17, #0x1a8]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr d6, [x17, #0x1a0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x1b8]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "mov v6.d[1], x10\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr d7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr x10, [x17, #0x1c8]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr d6, [x17, #0x1c0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x1d8]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "mov v6.d[1], x10\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr d7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr x10, [x17, #0x1e8]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr d6, [x17, #0x1e0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x1f8]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "mov v6.d[1], x10\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "ldr d7, [x17, #0x1f0]\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr d16, [x17, #0x30]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[0]\n"
+ "ldr d17, [x17, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla v15.8h, v16.8h, v1.h[0]\n"
+ "ldr d16, [x17, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v12.8h, v17.8h, v1.h[1]\n"
+ "ldr d17, [x17, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v13.8h, v16.8h, v1.h[1]\n"
+ "ldr d16, [x17, #0x70]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[1]\n"
+ "ldr d17, [x17, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr x20, [x17, #0x88]\n"
+ "fmla v15.8h, v16.8h, v1.h[1]\n"
+ "ldr d16, [x17, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v12.8h, v17.8h, v1.h[2]\n"
+ "ldr d17, [x17, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v13.8h, v16.8h, v1.h[2]\n"
+ "ldr d16, [x17, #0xb0]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[2]\n"
+ "ldr d17, [x17, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr x20, [x17, #0xc8]\n"
+ "fmla v15.8h, v16.8h, v1.h[2]\n"
+ "ldr d16, [x17, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v12.8h, v17.8h, v1.h[3]\n"
+ "ldr d17, [x17, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v13.8h, v16.8h, v1.h[3]\n"
+ "ldr d16, [x17, #0xf0]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[3]\n"
+ "ldr d17, [x17, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr x20, [x17, #0x108]\n"
+ "fmla v15.8h, v16.8h, v1.h[3]\n"
+ "ldr d16, [x17, #0x110]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x118]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr x21, [x17, #0x128]\n"
+ "fmla v12.8h, v17.8h, v1.h[4]\n"
+ "ldr d17, [x17, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr x20, [x17, #0x138]\n"
+ "fmla v13.8h, v16.8h, v1.h[4]\n"
+ "ldr d16, [x17, #0x130]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[4]\n"
+ "ldr d17, [x17, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr x20, [x17, #0x148]\n"
+ "fmla v15.8h, v16.8h, v1.h[4]\n"
+ "ldr d16, [x17, #0x150]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x158]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr x21, [x17, #0x168]\n"
+ "fmla v12.8h, v17.8h, v1.h[5]\n"
+ "ldr d17, [x17, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr x20, [x17, #0x178]\n"
+ "fmla v13.8h, v16.8h, v1.h[5]\n"
+ "ldr d16, [x17, #0x170]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[5]\n"
+ "ldr d17, [x17, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr x20, [x17, #0x188]\n"
+ "fmla v15.8h, v16.8h, v1.h[5]\n"
+ "ldr d16, [x17, #0x190]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x198]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr x21, [x17, #0x1a8]\n"
+ "fmla v12.8h, v17.8h, v1.h[6]\n"
+ "ldr d17, [x17, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr x20, [x17, #0x1b8]\n"
+ "fmla v13.8h, v16.8h, v1.h[6]\n"
+ "ldr d16, [x17, #0x1b0]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.8h, v17.8h, v1.h[6]\n"
+ "ldr d17, [x17, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr x20, [x17, #0x1c8]\n"
+ "fmla v15.8h, v16.8h, v1.h[6]\n"
+ "ldr d16, [x17, #0x1d0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x1d8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr x21, [x17, #0x1e8]\n"
+ "fmla v12.8h, v17.8h, v1.h[7]\n"
+ "ldr d17, [x17, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr x20, [x17, #0x1f8]\n"
+ "fmla v13.8h, v16.8h, v1.h[7]\n"
+ "ldr d16, [x17, #0x1f0]\n"
+ "mov v17.d[1], x21\n"
+ "add x13, x13, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
"add x17, x17, #0x200\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v14.8h, v17.8h, v1.h[7]\n"
"ldr d6, [x17, #0x0]\n"
- "mov v7.d[1], x11\n"
- "ldr x10, [x17, #0x8]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "ldr d0, [x12, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "mov v6.d[1], x10\n"
- "ldr d1, [x28, #0x0]\n"
- "mov v0.d[1], x9\n"
- "mov v1.d[1], x27\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "ldr d0, [x13, #0x0]\n"
+ "fmla v15.8h, v16.8h, v1.h[7]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "sub x14, x14, #0x8\n"
+ "ldr d7, [x17, #0x10]\n"
+ "cmp x14, #0x10\n"
+ "ldr x20, [x13, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v0.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v1.d[1], x21\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 75b\n"
"76:" // Height 2: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "add x13, x13, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "sub x13, x13, #0x8\n"
- "add x12, x12, #0x10\n"
+ "ldr q17, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x12, x12, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "add x28, x28, #0x10\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr q6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "ldr q7, [x17, #0x1f0]\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "sub x14, x14, #0x8\n"
+ "fmla v14.8h, v17.8h, v1.h[0]\n"
+ "ldr q17, [x17, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "fmla v15.8h, v16.8h, v1.h[0]\n"
+ "ldr q16, [x17, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v12.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x17, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "fmla v13.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x17, #0x70]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "fmla v14.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x17, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "fmla v15.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x17, #0x90]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "fmla v12.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x17, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "fmla v13.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x17, #0xb0]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "fmla v14.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x17, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "fmla v15.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x17, #0xd0]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "fmla v12.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x17, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "fmla v13.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x17, #0xf0]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "fmla v14.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x17, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "fmla v15.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x17, #0x110]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "fmla v12.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x17, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "fmla v13.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x17, #0x130]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "fmla v14.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x17, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "fmla v15.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x17, #0x150]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "fmla v12.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x17, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "fmla v13.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x17, #0x170]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "fmla v14.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x17, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "fmla v15.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x17, #0x190]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "fmla v12.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x17, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "fmla v13.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x17, #0x1b0]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "fmla v14.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x17, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "fmla v15.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x17, #0x1d0]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "fmla v12.8h, v17.8h, v1.h[7]\n"
+ "ldr q17, [x17, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "fmla v13.8h, v16.8h, v1.h[7]\n"
+ "ldr q16, [x17, #0x1f0]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
"add x17, x17, #0x200\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v14.8h, v17.8h, v1.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "fmla v15.8h, v16.8h, v1.h[7]\n"
"77:" // Height 2: Multiply loop: Main loop skip
- "cbz x13, 79f\n"
+ "cbz x14, 79f\n"
"78:" // Height 2: Multiply loop: Odd block loop
+ "ldr h1, [x13], #0x2\n"
+ "sub x14, x14, #0x1\n"
"ldr h0, [x12], #0x2\n"
- "sub x13, x13, #0x1\n"
- "ldr h1, [x28], #0x2\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr q17, [x17, #0x0]\n"
+ "fmla v8.8h, v17.8h, v1.h[0]\n"
+ "ldr q16, [x17, #0x10]\n"
+ "fmla v12.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x17, #0x20]\n"
+ "fmla v9.8h, v16.8h, v1.h[0]\n"
+ "fmla v13.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v10.8h, v17.8h, v1.h[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "cbnz x13, 78b\n"
+ "fmla v14.8h, v17.8h, v0.h[0]\n"
+ "fmla v11.8h, v16.8h, v1.h[0]\n"
+ "fmla v15.8h, v16.8h, v0.h[0]\n"
+ "cbnz x14, 78b\n"
"79:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 72b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "add x25, x15, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #1\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 80f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v16.8h\n"
+ "fmin v9.8h, v9.8h, v16.8h\n"
+ "fmin v10.8h, v10.8h, v16.8h\n"
+ "fmin v11.8h, v11.8h, v16.8h\n"
+ "fmin v12.8h, v12.8h, v16.8h\n"
+ "fmin v13.8h, v13.8h, v16.8h\n"
+ "fmin v14.8h, v14.8h, v16.8h\n"
+ "fmin v15.8h, v15.8h, v16.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
- "ld1r { v0.8h }, [x19]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmin v12.8h, v12.8h, v0.8h\n"
- "fmin v13.8h, v13.8h, v0.8h\n"
- "fmin v14.8h, v14.8h, v0.8h\n"
- "fmin v15.8h, v15.8h, v0.8h\n"
- "fmax v8.8h, v8.8h, v1.8h\n"
- "fmax v9.8h, v9.8h, v1.8h\n"
- "fmax v10.8h, v10.8h, v1.8h\n"
- "fmax v11.8h, v11.8h, v1.8h\n"
- "fmax v12.8h, v12.8h, v1.8h\n"
- "fmax v13.8h, v13.8h, v1.8h\n"
- "fmax v14.8h, v14.8h, v1.8h\n"
- "fmax v15.8h, v15.8h, v1.8h\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
+ "fmax v10.8h, v10.8h, v16.8h\n"
+ "fmax v11.8h, v11.8h, v16.8h\n"
+ "fmax v12.8h, v12.8h, v16.8h\n"
+ "fmax v13.8h, v13.8h, v16.8h\n"
+ "fmax v14.8h, v14.8h, v16.8h\n"
+ "fmax v15.8h, v15.8h, v16.8h\n"
"80:" // Height 2: No activation
"cmp x8, #0x20\n"
"bge 97f\n"
"tbz x8, #4, 88f\n"
- "st1 { v8.8h }, [x15], #0x10\n"
- "st1 { v9.8h }, [x15], #0x10\n"
+ "st1 { v8.8h }, [x16], #0x10\n"
+ "st1 { v9.8h }, [x16], #0x10\n"
"st1 { v12.8h }, [x25], #0x10\n"
"st1 { v13.8h }, [x25], #0x10\n"
"tbz x8, #3, 84f\n"
- "st1 { v10.8h }, [x15], #0x10\n"
+ "st1 { v10.8h }, [x16], #0x10\n"
"st1 { v14.8h }, [x25], #0x10\n"
"tbz x8, #2, 82f\n"
- "str d11, [x15], #0x8\n"
+ "str d11, [x16], #0x8\n"
"str d15, [x25], #0x8\n"
"tbz x8, #1, 81f\n"
- "st1 { v11.s }[2], [x15], #0x4\n"
+ "st1 { v11.s }[2], [x16], #0x4\n"
"st1 { v15.s }[2], [x25], #0x4\n"
"tbz x8, #0, 96f\n"
- "st1 { v11.h }[6], [x15]\n"
+ "st1 { v11.h }[6], [x16]\n"
"st1 { v15.h }[6], [x25]\n"
"b 96f\n"
"81:" // Height 2: Partial direct writeback: partial_1_28
"tbz x8, #0, 96f\n"
- "st1 { v11.h }[4], [x15]\n"
+ "st1 { v11.h }[4], [x16]\n"
"st1 { v15.h }[4], [x25]\n"
"b 96f\n"
"82:" // Height 2: Partial direct writeback: partial_2_24
"tbz x8, #1, 83f\n"
- "str s11, [x15], #0x4\n"
+ "str s11, [x16], #0x4\n"
"str s15, [x25], #0x4\n"
"tbz x8, #0, 96f\n"
- "st1 { v11.h }[2], [x15]\n"
+ "st1 { v11.h }[2], [x16]\n"
"st1 { v15.h }[2], [x25]\n"
"b 96f\n"
"83:" // Height 2: Partial direct writeback: partial_1_24
"tbz x8, #0, 96f\n"
- "str h11, [x15, #0x0]\n"
+ "str h11, [x16, #0x0]\n"
"str h15, [x25, #0x0]\n"
"b 96f\n"
"84:" // Height 2: Partial direct writeback: partial_4_16
"tbz x8, #2, 86f\n"
- "str d10, [x15], #0x8\n"
+ "str d10, [x16], #0x8\n"
"str d14, [x25], #0x8\n"
"tbz x8, #1, 85f\n"
- "st1 { v10.s }[2], [x15], #0x4\n"
+ "st1 { v10.s }[2], [x16], #0x4\n"
"st1 { v14.s }[2], [x25], #0x4\n"
"tbz x8, #0, 96f\n"
- "st1 { v10.h }[6], [x15]\n"
+ "st1 { v10.h }[6], [x16]\n"
"st1 { v14.h }[6], [x25]\n"
"b 96f\n"
"85:" // Height 2: Partial direct writeback: partial_1_20
"tbz x8, #0, 96f\n"
- "st1 { v10.h }[4], [x15]\n"
+ "st1 { v10.h }[4], [x16]\n"
"st1 { v14.h }[4], [x25]\n"
"b 96f\n"
"86:" // Height 2: Partial direct writeback: partial_2_16
"tbz x8, #1, 87f\n"
- "str s10, [x15], #0x4\n"
+ "str s10, [x16], #0x4\n"
"str s14, [x25], #0x4\n"
"tbz x8, #0, 96f\n"
- "st1 { v10.h }[2], [x15]\n"
+ "st1 { v10.h }[2], [x16]\n"
"st1 { v14.h }[2], [x25]\n"
"b 96f\n"
"87:" // Height 2: Partial direct writeback: partial_1_16
"tbz x8, #0, 96f\n"
- "str h10, [x15, #0x0]\n"
+ "str h10, [x16, #0x0]\n"
"str h14, [x25, #0x0]\n"
"b 96f\n"
"88:" // Height 2: Partial direct writeback: partial_8_0
"tbz x8, #3, 92f\n"
- "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v8.8h }, [x16], #0x10\n"
"st1 { v12.8h }, [x25], #0x10\n"
"tbz x8, #2, 90f\n"
- "str d9, [x15], #0x8\n"
+ "str d9, [x16], #0x8\n"
"str d13, [x25], #0x8\n"
"tbz x8, #1, 89f\n"
- "st1 { v9.s }[2], [x15], #0x4\n"
+ "st1 { v9.s }[2], [x16], #0x4\n"
"st1 { v13.s }[2], [x25], #0x4\n"
"tbz x8, #0, 96f\n"
- "st1 { v9.h }[6], [x15]\n"
+ "st1 { v9.h }[6], [x16]\n"
"st1 { v13.h }[6], [x25]\n"
"b 96f\n"
"89:" // Height 2: Partial direct writeback: partial_1_12
"tbz x8, #0, 96f\n"
- "st1 { v9.h }[4], [x15]\n"
+ "st1 { v9.h }[4], [x16]\n"
"st1 { v13.h }[4], [x25]\n"
"b 96f\n"
"90:" // Height 2: Partial direct writeback: partial_2_8
"tbz x8, #1, 91f\n"
- "str s9, [x15], #0x4\n"
+ "str s9, [x16], #0x4\n"
"str s13, [x25], #0x4\n"
"tbz x8, #0, 96f\n"
- "st1 { v9.h }[2], [x15]\n"
+ "st1 { v9.h }[2], [x16]\n"
"st1 { v13.h }[2], [x25]\n"
"b 96f\n"
"91:" // Height 2: Partial direct writeback: partial_1_8
"tbz x8, #0, 96f\n"
- "str h9, [x15, #0x0]\n"
+ "str h9, [x16, #0x0]\n"
"str h13, [x25, #0x0]\n"
"b 96f\n"
"92:" // Height 2: Partial direct writeback: partial_4_0
"tbz x8, #2, 94f\n"
- "str d8, [x15], #0x8\n"
+ "str d8, [x16], #0x8\n"
"str d12, [x25], #0x8\n"
"tbz x8, #1, 93f\n"
- "st1 { v8.s }[2], [x15], #0x4\n"
+ "st1 { v8.s }[2], [x16], #0x4\n"
"st1 { v12.s }[2], [x25], #0x4\n"
"tbz x8, #0, 96f\n"
- "st1 { v8.h }[6], [x15]\n"
+ "st1 { v8.h }[6], [x16]\n"
"st1 { v12.h }[6], [x25]\n"
"b 96f\n"
"93:" // Height 2: Partial direct writeback: partial_1_4
"tbz x8, #0, 96f\n"
- "st1 { v8.h }[4], [x15]\n"
+ "st1 { v8.h }[4], [x16]\n"
"st1 { v12.h }[4], [x25]\n"
"b 96f\n"
"94:" // Height 2: Partial direct writeback: partial_2_0
"tbz x8, #1, 95f\n"
- "str s8, [x15], #0x4\n"
+ "str s8, [x16], #0x4\n"
"str s12, [x25], #0x4\n"
"tbz x8, #0, 96f\n"
- "st1 { v8.h }[2], [x15]\n"
+ "st1 { v8.h }[2], [x16]\n"
"st1 { v12.h }[2], [x25]\n"
"b 96f\n"
"95:" // Height 2: Partial direct writeback: partial_1_0
- "str h8, [x15, #0x0]\n"
+ "str h8, [x16, #0x0]\n"
"str h12, [x25, #0x0]\n"
"96:" // Height 2: Partial direct writeback: Done
"b 98f\n"
"97:" // Height 2: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x16, #0x0]\n"
+ "str q9, [x16, #0x10]\n"
+ "str q10, [x16, #0x20]\n"
+ "str q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
"str q12, [x25, #0x0]\n"
"str q13, [x25, #0x10]\n"
"str q14, [x25, #0x20]\n"
@@ -1295,213 +1292,213 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"bgt 51b\n"
"b 296f\n"
"99:" // Height 3
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x16, %x[bias]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[output_ptr]\n"
+ "mov x16, %x[output_ptr]\n"
"100:" // Height 3: Column loop
- "cbz x16, 101f\n"
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
+ "cbz x7, 101f\n"
+ "ldr q8, [x7, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
- "mov v18.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"b 120f\n"
"101:" // Height 3: no bias
"tbz %x[flags], #0, 119f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #1\n"
"cmp x8, #0x20\n"
- "add x25, x15, x19, LSL #1\n"
- "add x24, x25, x19, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"bge 118f\n"
"tbz x8, #4, 109f\n"
- "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v8.8h }, [x16], #0x10\n"
"ld1 { v12.8h }, [x25], #0x10\n"
"ld1 { v16.8h }, [x24], #0x10\n"
- "ld1 { v9.8h }, [x15], #0x10\n"
+ "ld1 { v9.8h }, [x16], #0x10\n"
"ld1 { v13.8h }, [x25], #0x10\n"
"ld1 { v17.8h }, [x24], #0x10\n"
"tbz x8, #3, 105f\n"
- "ld1 { v10.8h }, [x15], #0x10\n"
+ "ld1 { v10.8h }, [x16], #0x10\n"
"ld1 { v14.8h }, [x25], #0x10\n"
"ld1 { v18.8h }, [x24], #0x10\n"
"tbz x8, #2, 103f\n"
- "ldr d11, [x15], #0x8\n"
+ "ldr d11, [x16], #0x8\n"
"ldr d15, [x25], #0x8\n"
"ldr d19, [x24], #0x8\n"
"tbz x8, #1, 102f\n"
- "ld1 { v11.s }[2], [x15], #0x4\n"
- "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x16], #0x4\n"
+ "mov x20, #0x3c\n"
"ld1 { v15.s }[2], [x25], #0x4\n"
"ld1 { v19.s }[2], [x24], #0x4\n"
"tbz x8, #0, 117f\n"
- "ld1 { v11.h }[6], [x15]\n"
+ "ld1 { v11.h }[6], [x16]\n"
"ld1 { v15.h }[6], [x25]\n"
"ld1 { v19.h }[6], [x24]\n"
"b 117f\n"
"102:" // Height 3: Partial accumulate: partial_1_28
- "mov x19, #0x38\n"
+ "mov x20, #0x38\n"
"tbz x8, #0, 117f\n"
- "ld1 { v11.h }[4], [x15]\n"
+ "ld1 { v11.h }[4], [x16]\n"
"ld1 { v15.h }[4], [x25]\n"
"ld1 { v19.h }[4], [x24]\n"
"b 117f\n"
"103:" // Height 3: Partial accumulate: partial_2_24
"tbz x8, #1, 104f\n"
- "ldr s11, [x15], #0x4\n"
+ "ldr s11, [x16], #0x4\n"
+ "mov x20, #0x34\n"
"ldr s15, [x25], #0x4\n"
- "mov x19, #0x34\n"
"ldr s19, [x24], #0x4\n"
"tbz x8, #0, 117f\n"
- "ld1 { v11.h }[2], [x15]\n"
+ "ld1 { v11.h }[2], [x16]\n"
"ld1 { v15.h }[2], [x25]\n"
"ld1 { v19.h }[2], [x24]\n"
"b 117f\n"
"104:" // Height 3: Partial accumulate: partial_1_24
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x8, #0, 117f\n"
- "ldr h11, [x15, #0x0]\n"
+ "ldr h11, [x16, #0x0]\n"
"ldr h15, [x25, #0x0]\n"
"ldr h19, [x24, #0x0]\n"
"b 117f\n"
"105:" // Height 3: Partial accumulate: partial_4_16
"tbz x8, #2, 107f\n"
- "ldr d10, [x15], #0x8\n"
+ "ldr d10, [x16], #0x8\n"
"ldr d14, [x25], #0x8\n"
"ldr d18, [x24], #0x8\n"
"tbz x8, #1, 106f\n"
- "ld1 { v10.s }[2], [x15], #0x4\n"
- "mov x19, #0x2c\n"
+ "ld1 { v10.s }[2], [x16], #0x4\n"
+ "mov x20, #0x2c\n"
"ld1 { v14.s }[2], [x25], #0x4\n"
"ld1 { v18.s }[2], [x24], #0x4\n"
"tbz x8, #0, 117f\n"
- "ld1 { v10.h }[6], [x15]\n"
+ "ld1 { v10.h }[6], [x16]\n"
"ld1 { v14.h }[6], [x25]\n"
"ld1 { v18.h }[6], [x24]\n"
"b 117f\n"
"106:" // Height 3: Partial accumulate: partial_1_20
- "mov x19, #0x28\n"
+ "mov x20, #0x28\n"
"tbz x8, #0, 117f\n"
- "ld1 { v10.h }[4], [x15]\n"
+ "ld1 { v10.h }[4], [x16]\n"
"ld1 { v14.h }[4], [x25]\n"
"ld1 { v18.h }[4], [x24]\n"
"b 117f\n"
"107:" // Height 3: Partial accumulate: partial_2_16
"tbz x8, #1, 108f\n"
- "ldr s10, [x15], #0x4\n"
+ "ldr s10, [x16], #0x4\n"
+ "mov x20, #0x24\n"
"ldr s14, [x25], #0x4\n"
- "mov x19, #0x24\n"
"ldr s18, [x24], #0x4\n"
"tbz x8, #0, 117f\n"
- "ld1 { v10.h }[2], [x15]\n"
+ "ld1 { v10.h }[2], [x16]\n"
"ld1 { v14.h }[2], [x25]\n"
"ld1 { v18.h }[2], [x24]\n"
"b 117f\n"
"108:" // Height 3: Partial accumulate: partial_1_16
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x8, #0, 117f\n"
- "ldr h10, [x15, #0x0]\n"
+ "ldr h10, [x16, #0x0]\n"
"ldr h14, [x25, #0x0]\n"
"ldr h18, [x24, #0x0]\n"
"b 117f\n"
"109:" // Height 3: Partial accumulate: partial_8_0
"tbz x8, #3, 113f\n"
- "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v8.8h }, [x16], #0x10\n"
"ld1 { v12.8h }, [x25], #0x10\n"
"ld1 { v16.8h }, [x24], #0x10\n"
"tbz x8, #2, 111f\n"
- "ldr d9, [x15], #0x8\n"
+ "ldr d9, [x16], #0x8\n"
"ldr d13, [x25], #0x8\n"
"ldr d17, [x24], #0x8\n"
"tbz x8, #1, 110f\n"
- "ld1 { v9.s }[2], [x15], #0x4\n"
- "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x16], #0x4\n"
+ "mov x20, #0x1c\n"
"ld1 { v13.s }[2], [x25], #0x4\n"
"ld1 { v17.s }[2], [x24], #0x4\n"
"tbz x8, #0, 117f\n"
- "ld1 { v9.h }[6], [x15]\n"
+ "ld1 { v9.h }[6], [x16]\n"
"ld1 { v13.h }[6], [x25]\n"
"ld1 { v17.h }[6], [x24]\n"
"b 117f\n"
"110:" // Height 3: Partial accumulate: partial_1_12
- "mov x19, #0x18\n"
+ "mov x20, #0x18\n"
"tbz x8, #0, 117f\n"
- "ld1 { v9.h }[4], [x15]\n"
+ "ld1 { v9.h }[4], [x16]\n"
"ld1 { v13.h }[4], [x25]\n"
"ld1 { v17.h }[4], [x24]\n"
"b 117f\n"
"111:" // Height 3: Partial accumulate: partial_2_8
"tbz x8, #1, 112f\n"
- "ldr s9, [x15], #0x4\n"
+ "ldr s9, [x16], #0x4\n"
+ "mov x20, #0x14\n"
"ldr s13, [x25], #0x4\n"
- "mov x19, #0x14\n"
"ldr s17, [x24], #0x4\n"
"tbz x8, #0, 117f\n"
- "ld1 { v9.h }[2], [x15]\n"
+ "ld1 { v9.h }[2], [x16]\n"
"ld1 { v13.h }[2], [x25]\n"
"ld1 { v17.h }[2], [x24]\n"
"b 117f\n"
"112:" // Height 3: Partial accumulate: partial_1_8
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x8, #0, 117f\n"
- "ldr h9, [x15, #0x0]\n"
+ "ldr h9, [x16, #0x0]\n"
"ldr h13, [x25, #0x0]\n"
"ldr h17, [x24, #0x0]\n"
"b 117f\n"
"113:" // Height 3: Partial accumulate: partial_4_0
"tbz x8, #2, 115f\n"
- "ldr d8, [x15], #0x8\n"
+ "ldr d8, [x16], #0x8\n"
"ldr d12, [x25], #0x8\n"
"ldr d16, [x24], #0x8\n"
"tbz x8, #1, 114f\n"
- "ld1 { v8.s }[2], [x15], #0x4\n"
- "mov x19, #0xc\n"
+ "ld1 { v8.s }[2], [x16], #0x4\n"
+ "mov x20, #0xc\n"
"ld1 { v12.s }[2], [x25], #0x4\n"
"ld1 { v16.s }[2], [x24], #0x4\n"
"tbz x8, #0, 117f\n"
- "ld1 { v8.h }[6], [x15]\n"
+ "ld1 { v8.h }[6], [x16]\n"
"ld1 { v12.h }[6], [x25]\n"
"ld1 { v16.h }[6], [x24]\n"
"b 117f\n"
"114:" // Height 3: Partial accumulate: partial_1_4
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"tbz x8, #0, 117f\n"
- "ld1 { v8.h }[4], [x15]\n"
+ "ld1 { v8.h }[4], [x16]\n"
"ld1 { v12.h }[4], [x25]\n"
"ld1 { v16.h }[4], [x24]\n"
"b 117f\n"
"115:" // Height 3: Partial accumulate: partial_2_0
"tbz x8, #1, 116f\n"
- "ldr s8, [x15], #0x4\n"
+ "ldr s8, [x16], #0x4\n"
+ "mov x20, #0x4\n"
"ldr s12, [x25], #0x4\n"
- "mov x19, #0x4\n"
"ldr s16, [x24], #0x4\n"
"tbz x8, #0, 117f\n"
- "ld1 { v8.h }[2], [x15]\n"
+ "ld1 { v8.h }[2], [x16]\n"
"ld1 { v12.h }[2], [x25]\n"
"ld1 { v16.h }[2], [x24]\n"
"b 117f\n"
"116:" // Height 3: Partial accumulate: partial_1_0
- "ldr h8, [x15, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr h8, [x16, #0x0]\n"
+ "mov x20, #0x0\n"
"ldr h12, [x25, #0x0]\n"
"ldr h16, [x24, #0x0]\n"
"117:" // Height 3: Partial accumulate: Done
- "sub x15, x15, x19\n"
+ "sub x16, x16, x20\n"
"b 120f\n"
"118:" // Height 3: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
"ldr q12, [x25, #0x0]\n"
"ldr q13, [x25, #0x10]\n"
"ldr q14, [x25, #0x20]\n"
@@ -1525,616 +1522,616 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"movi v18.16b, #0x0\n"
"movi v19.16b, #0x0\n"
"120:" // Height 3: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"121:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 122f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x14, 123f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "cbnz x15, 123f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n"
+ "add x11, x11, x20, LSL #1\n"
"b 123f\n"
"122:" // Height 3: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21, LSL #1\n"
+ "add x11, x12, x21, LSL #1\n"
"123:" // Height 3: input setup done
- "cmp x13, #0x8\n"
+ "cmp x14, #0x8\n"
"blt 126f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x10\n"
- "ldr q2, [x26, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x10\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
"blt 125f\n"
"124:" // Height 3: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x18]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr d6, [x17, #0x20]\n"
- "ldr x10, [x17, #0x28]\n"
- "add x12, x12, #0x10\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "ldr x11, [x17, #0x38]\n"
- "add x28, x28, #0x10\n"
+ "ldr d21, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x10\n"
+ "mov v21.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr x10, [x17, #0x48]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr x9, [x12, #0x8]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x40]\n"
- "add x26, x26, #0x10\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x58]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x50]\n"
- "sub x13, x13, #0x8\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr x10, [x17, #0x68]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr x27, [x28, #0x8]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x60]\n"
- "cmp x13, #0x10\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr x25, [x26, #0x8]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr x10, [x17, #0x88]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr x10, [x17, #0xa8]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr x10, [x17, #0xc8]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr x10, [x17, #0xe8]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr x10, [x17, #0x108]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr x11, [x17, #0x118]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr x10, [x17, #0x128]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr x11, [x17, #0x138]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr x10, [x17, #0x148]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr x11, [x17, #0x158]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr x10, [x17, #0x168]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr x11, [x17, #0x178]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr x10, [x17, #0x188]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr x11, [x17, #0x198]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr x10, [x17, #0x1a8]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr x11, [x17, #0x1b8]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr x10, [x17, #0x1c8]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr x11, [x17, #0x1d8]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr x10, [x17, #0x1e8]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr x11, [x17, #0x1f8]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x1f0]\n"
+ "ldr d20, [x17, #0x30]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmla v18.8h, v21.8h, v2.h[0]\n"
+ "ldr d21, [x17, #0x40]\n"
+ "fmla v11.8h, v20.8h, v0.h[0]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v19.8h, v20.8h, v2.h[0]\n"
+ "ldr d20, [x17, #0x50]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[1]\n"
+ "fmla v12.8h, v21.8h, v1.h[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v16.8h, v21.8h, v2.h[1]\n"
+ "ldr d21, [x17, #0x60]\n"
+ "fmla v9.8h, v20.8h, v0.h[1]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla v17.8h, v20.8h, v2.h[1]\n"
+ "ldr d20, [x17, #0x70]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[1]\n"
+ "fmla v14.8h, v21.8h, v1.h[1]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla v18.8h, v21.8h, v2.h[1]\n"
+ "ldr d21, [x17, #0x80]\n"
+ "fmla v11.8h, v20.8h, v0.h[1]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v19.8h, v20.8h, v2.h[1]\n"
+ "ldr d20, [x17, #0x90]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[2]\n"
+ "fmla v12.8h, v21.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v16.8h, v21.8h, v2.h[2]\n"
+ "ldr d21, [x17, #0xa0]\n"
+ "fmla v9.8h, v20.8h, v0.h[2]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla v17.8h, v20.8h, v2.h[2]\n"
+ "ldr d20, [x17, #0xb0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[2]\n"
+ "fmla v14.8h, v21.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "fmla v18.8h, v21.8h, v2.h[2]\n"
+ "ldr d21, [x17, #0xc0]\n"
+ "fmla v11.8h, v20.8h, v0.h[2]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v19.8h, v20.8h, v2.h[2]\n"
+ "ldr d20, [x17, #0xd0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[3]\n"
+ "fmla v12.8h, v21.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v16.8h, v21.8h, v2.h[3]\n"
+ "ldr d21, [x17, #0xe0]\n"
+ "fmla v9.8h, v20.8h, v0.h[3]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x108]\n"
+ "fmla v17.8h, v20.8h, v2.h[3]\n"
+ "ldr d20, [x17, #0xf0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[3]\n"
+ "fmla v14.8h, v21.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla v18.8h, v21.8h, v2.h[3]\n"
+ "ldr d21, [x17, #0x100]\n"
+ "fmla v11.8h, v20.8h, v0.h[3]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x128]\n"
+ "fmla v19.8h, v20.8h, v2.h[3]\n"
+ "ldr d20, [x17, #0x110]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[4]\n"
+ "fmla v12.8h, v21.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x138]\n"
+ "fmla v16.8h, v21.8h, v2.h[4]\n"
+ "ldr d21, [x17, #0x120]\n"
+ "fmla v9.8h, v20.8h, v0.h[4]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x148]\n"
+ "fmla v17.8h, v20.8h, v2.h[4]\n"
+ "ldr d20, [x17, #0x130]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[4]\n"
+ "fmla v14.8h, v21.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x158]\n"
+ "fmla v18.8h, v21.8h, v2.h[4]\n"
+ "ldr d21, [x17, #0x140]\n"
+ "fmla v11.8h, v20.8h, v0.h[4]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x168]\n"
+ "fmla v19.8h, v20.8h, v2.h[4]\n"
+ "ldr d20, [x17, #0x150]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[5]\n"
+ "fmla v12.8h, v21.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x178]\n"
+ "fmla v16.8h, v21.8h, v2.h[5]\n"
+ "ldr d21, [x17, #0x160]\n"
+ "fmla v9.8h, v20.8h, v0.h[5]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x188]\n"
+ "fmla v17.8h, v20.8h, v2.h[5]\n"
+ "ldr d20, [x17, #0x170]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[5]\n"
+ "fmla v14.8h, v21.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x198]\n"
+ "fmla v18.8h, v21.8h, v2.h[5]\n"
+ "ldr d21, [x17, #0x180]\n"
+ "fmla v11.8h, v20.8h, v0.h[5]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x1a8]\n"
+ "fmla v19.8h, v20.8h, v2.h[5]\n"
+ "ldr d20, [x17, #0x190]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[6]\n"
+ "fmla v12.8h, v21.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1b8]\n"
+ "fmla v16.8h, v21.8h, v2.h[6]\n"
+ "ldr d21, [x17, #0x1a0]\n"
+ "fmla v9.8h, v20.8h, v0.h[6]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1c8]\n"
+ "fmla v17.8h, v20.8h, v2.h[6]\n"
+ "ldr d20, [x17, #0x1b0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.8h, v21.8h, v0.h[6]\n"
+ "fmla v14.8h, v21.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1d8]\n"
+ "fmla v18.8h, v21.8h, v2.h[6]\n"
+ "ldr d21, [x17, #0x1c0]\n"
+ "fmla v11.8h, v20.8h, v0.h[6]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.8h, v20.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1e8]\n"
+ "fmla v19.8h, v20.8h, v2.h[6]\n"
+ "ldr d20, [x17, #0x1d0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.8h, v21.8h, v0.h[7]\n"
+ "fmla v12.8h, v21.8h, v1.h[7]\n"
+ "ldr x20, [x17, #0x1f8]\n"
+ "fmla v16.8h, v21.8h, v2.h[7]\n"
+ "ldr d21, [x17, #0x1e0]\n"
+ "fmla v9.8h, v20.8h, v0.h[7]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.8h, v20.8h, v1.h[7]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v17.8h, v20.8h, v2.h[7]\n"
+ "ldr d20, [x17, #0x1f0]\n"
+ "mov v20.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"add x17, x17, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "ldr x10, [x17, #0x8]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "mov v7.d[1], x11\n"
+ "fmla v10.8h, v21.8h, v0.h[7]\n"
+ "ldr x20, [x17, #0x8]\n"
+ "fmla v14.8h, v21.8h, v1.h[7]\n"
+ "ldr x23, [x13, #0x8]\n"
+ "fmla v18.8h, v21.8h, v2.h[7]\n"
"ldr d6, [x17, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "ldr d0, [x12, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr d1, [x28, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "mov v6.d[1], x10\n"
- "mov v0.d[1], x9\n"
- "ldr d2, [x26, #0x0]\n"
- "mov v1.d[1], x27\n"
- "mov v2.d[1], x25\n"
+ "fmla v11.8h, v20.8h, v0.h[7]\n"
+ "ldr d0, [x13, #0x0]\n"
+ "fmla v15.8h, v20.8h, v1.h[7]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "ldr x22, [x12, #0x8]\n"
+ "fmla v19.8h, v20.8h, v2.h[7]\n"
+ "ldr d2, [x11, #0x0]\n"
+ "sub x14, x14, #0x8\n"
+ "ldr d7, [x17, #0x10]\n"
+ "cmp x14, #0x10\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v0.d[1], x23\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v7.d[1], x20\n"
"bge 124b\n"
"125:" // Height 3: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "add x13, x13, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "sub x13, x13, #0x8\n"
+ "add x12, x12, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q21, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "sub x14, x14, #0x8\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "add x28, x28, #0x10\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "add x26, x26, #0x10\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "ldr q6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "ldr q7, [x17, #0x1f0]\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr q20, [x17, #0x30]\n"
+ "fmla v10.8h, v21.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v18.8h, v21.8h, v2.h[0]\n"
+ "ldr q21, [x17, #0x40]\n"
+ "fmla v11.8h, v20.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v2.h[0]\n"
+ "ldr q20, [x17, #0x50]\n"
+ "fmla v8.8h, v21.8h, v0.h[1]\n"
+ "fmla v12.8h, v21.8h, v1.h[1]\n"
+ "fmla v16.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x17, #0x60]\n"
+ "fmla v9.8h, v20.8h, v0.h[1]\n"
+ "fmla v13.8h, v20.8h, v1.h[1]\n"
+ "fmla v17.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x17, #0x70]\n"
+ "fmla v10.8h, v21.8h, v0.h[1]\n"
+ "fmla v14.8h, v21.8h, v1.h[1]\n"
+ "fmla v18.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x17, #0x80]\n"
+ "fmla v11.8h, v20.8h, v0.h[1]\n"
+ "fmla v15.8h, v20.8h, v1.h[1]\n"
+ "fmla v19.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x17, #0x90]\n"
+ "fmla v8.8h, v21.8h, v0.h[2]\n"
+ "fmla v12.8h, v21.8h, v1.h[2]\n"
+ "fmla v16.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x17, #0xa0]\n"
+ "fmla v9.8h, v20.8h, v0.h[2]\n"
+ "fmla v13.8h, v20.8h, v1.h[2]\n"
+ "fmla v17.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x17, #0xb0]\n"
+ "fmla v10.8h, v21.8h, v0.h[2]\n"
+ "fmla v14.8h, v21.8h, v1.h[2]\n"
+ "fmla v18.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x17, #0xc0]\n"
+ "fmla v11.8h, v20.8h, v0.h[2]\n"
+ "fmla v15.8h, v20.8h, v1.h[2]\n"
+ "fmla v19.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x17, #0xd0]\n"
+ "fmla v8.8h, v21.8h, v0.h[3]\n"
+ "fmla v12.8h, v21.8h, v1.h[3]\n"
+ "fmla v16.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x17, #0xe0]\n"
+ "fmla v9.8h, v20.8h, v0.h[3]\n"
+ "fmla v13.8h, v20.8h, v1.h[3]\n"
+ "fmla v17.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x17, #0xf0]\n"
+ "fmla v10.8h, v21.8h, v0.h[3]\n"
+ "fmla v14.8h, v21.8h, v1.h[3]\n"
+ "fmla v18.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x17, #0x100]\n"
+ "fmla v11.8h, v20.8h, v0.h[3]\n"
+ "fmla v15.8h, v20.8h, v1.h[3]\n"
+ "fmla v19.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x17, #0x110]\n"
+ "fmla v8.8h, v21.8h, v0.h[4]\n"
+ "fmla v12.8h, v21.8h, v1.h[4]\n"
+ "fmla v16.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x17, #0x120]\n"
+ "fmla v9.8h, v20.8h, v0.h[4]\n"
+ "fmla v13.8h, v20.8h, v1.h[4]\n"
+ "fmla v17.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x17, #0x130]\n"
+ "fmla v10.8h, v21.8h, v0.h[4]\n"
+ "fmla v14.8h, v21.8h, v1.h[4]\n"
+ "fmla v18.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x17, #0x140]\n"
+ "fmla v11.8h, v20.8h, v0.h[4]\n"
+ "fmla v15.8h, v20.8h, v1.h[4]\n"
+ "fmla v19.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x17, #0x150]\n"
+ "fmla v8.8h, v21.8h, v0.h[5]\n"
+ "fmla v12.8h, v21.8h, v1.h[5]\n"
+ "fmla v16.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x17, #0x160]\n"
+ "fmla v9.8h, v20.8h, v0.h[5]\n"
+ "fmla v13.8h, v20.8h, v1.h[5]\n"
+ "fmla v17.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x17, #0x170]\n"
+ "fmla v10.8h, v21.8h, v0.h[5]\n"
+ "fmla v14.8h, v21.8h, v1.h[5]\n"
+ "fmla v18.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x17, #0x180]\n"
+ "fmla v11.8h, v20.8h, v0.h[5]\n"
+ "fmla v15.8h, v20.8h, v1.h[5]\n"
+ "fmla v19.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x17, #0x190]\n"
+ "fmla v8.8h, v21.8h, v0.h[6]\n"
+ "fmla v12.8h, v21.8h, v1.h[6]\n"
+ "fmla v16.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x17, #0x1a0]\n"
+ "fmla v9.8h, v20.8h, v0.h[6]\n"
+ "fmla v13.8h, v20.8h, v1.h[6]\n"
+ "fmla v17.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x17, #0x1b0]\n"
+ "fmla v10.8h, v21.8h, v0.h[6]\n"
+ "fmla v14.8h, v21.8h, v1.h[6]\n"
+ "fmla v18.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x17, #0x1c0]\n"
+ "fmla v11.8h, v20.8h, v0.h[6]\n"
+ "fmla v15.8h, v20.8h, v1.h[6]\n"
+ "fmla v19.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x17, #0x1d0]\n"
+ "fmla v8.8h, v21.8h, v0.h[7]\n"
+ "fmla v12.8h, v21.8h, v1.h[7]\n"
+ "fmla v16.8h, v21.8h, v2.h[7]\n"
+ "ldr q21, [x17, #0x1e0]\n"
+ "fmla v9.8h, v20.8h, v0.h[7]\n"
+ "fmla v13.8h, v20.8h, v1.h[7]\n"
+ "fmla v17.8h, v20.8h, v2.h[7]\n"
+ "ldr q20, [x17, #0x1f0]\n"
+ "fmla v10.8h, v21.8h, v0.h[7]\n"
"add x17, x17, #0x200\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v14.8h, v21.8h, v1.h[7]\n"
+ "fmla v18.8h, v21.8h, v2.h[7]\n"
+ "fmla v11.8h, v20.8h, v0.h[7]\n"
+ "fmla v15.8h, v20.8h, v1.h[7]\n"
+ "fmla v19.8h, v20.8h, v2.h[7]\n"
"126:" // Height 3: Multiply loop: Main loop skip
- "cbz x13, 128f\n"
+ "cbz x14, 128f\n"
"127:" // Height 3: Multiply loop: Odd block loop
- "ldr h0, [x12], #0x2\n"
- "sub x13, x13, #0x1\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr h2, [x13], #0x2\n"
+ "sub x14, x14, #0x1\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h0, [x11], #0x2\n"
+ "ldr q21, [x17, #0x0]\n"
+ "fmla v8.8h, v21.8h, v2.h[0]\n"
+ "ldr q20, [x17, #0x10]\n"
+ "fmla v12.8h, v21.8h, v1.h[0]\n"
+ "fmla v16.8h, v21.8h, v0.h[0]\n"
+ "ldr q21, [x17, #0x20]\n"
+ "fmla v9.8h, v20.8h, v2.h[0]\n"
+ "fmla v13.8h, v20.8h, v1.h[0]\n"
+ "fmla v17.8h, v20.8h, v0.h[0]\n"
+ "ldr q20, [x17, #0x30]\n"
+ "fmla v10.8h, v21.8h, v2.h[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "cbnz x13, 127b\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "fmla v18.8h, v21.8h, v0.h[0]\n"
+ "fmla v11.8h, v20.8h, v2.h[0]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v0.h[0]\n"
+ "cbnz x14, 127b\n"
"128:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 121b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "add x25, x15, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #1\n"
"prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 129f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v20.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v20.8h\n"
+ "fmin v9.8h, v9.8h, v20.8h\n"
+ "fmin v10.8h, v10.8h, v20.8h\n"
+ "fmin v11.8h, v11.8h, v20.8h\n"
+ "fmin v12.8h, v12.8h, v20.8h\n"
+ "fmin v13.8h, v13.8h, v20.8h\n"
+ "fmin v14.8h, v14.8h, v20.8h\n"
+ "fmin v15.8h, v15.8h, v20.8h\n"
+ "fmin v16.8h, v16.8h, v20.8h\n"
+ "fmin v17.8h, v17.8h, v20.8h\n"
+ "fmin v18.8h, v18.8h, v20.8h\n"
+ "fmin v19.8h, v19.8h, v20.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
- "ld1r { v0.8h }, [x19]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmin v12.8h, v12.8h, v0.8h\n"
- "fmin v13.8h, v13.8h, v0.8h\n"
- "fmin v14.8h, v14.8h, v0.8h\n"
- "fmin v15.8h, v15.8h, v0.8h\n"
- "fmin v16.8h, v16.8h, v0.8h\n"
- "fmin v17.8h, v17.8h, v0.8h\n"
- "fmax v8.8h, v8.8h, v1.8h\n"
- "fmax v9.8h, v9.8h, v1.8h\n"
- "fmax v10.8h, v10.8h, v1.8h\n"
- "fmax v11.8h, v11.8h, v1.8h\n"
- "fmax v12.8h, v12.8h, v1.8h\n"
- "fmax v13.8h, v13.8h, v1.8h\n"
- "fmax v14.8h, v14.8h, v1.8h\n"
- "fmax v15.8h, v15.8h, v1.8h\n"
- "fmax v16.8h, v16.8h, v1.8h\n"
- "fmax v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v0.8h\n"
- "fmin v19.8h, v19.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v1.8h\n"
- "fmax v19.8h, v19.8h, v1.8h\n"
+ "ld1r { v20.8h }, [x20]\n"
+ "fmax v8.8h, v8.8h, v20.8h\n"
+ "fmax v9.8h, v9.8h, v20.8h\n"
+ "fmax v10.8h, v10.8h, v20.8h\n"
+ "fmax v11.8h, v11.8h, v20.8h\n"
+ "fmax v12.8h, v12.8h, v20.8h\n"
+ "fmax v13.8h, v13.8h, v20.8h\n"
+ "fmax v14.8h, v14.8h, v20.8h\n"
+ "fmax v15.8h, v15.8h, v20.8h\n"
+ "fmax v16.8h, v16.8h, v20.8h\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
+ "fmax v18.8h, v18.8h, v20.8h\n"
+ "fmax v19.8h, v19.8h, v20.8h\n"
"129:" // Height 3: No activation
"cmp x8, #0x20\n"
"bge 146f\n"
"tbz x8, #4, 137f\n"
- "st1 { v8.8h }, [x15], #0x10\n"
- "st1 { v9.8h }, [x15], #0x10\n"
+ "st1 { v8.8h }, [x16], #0x10\n"
+ "st1 { v9.8h }, [x16], #0x10\n"
"st1 { v12.8h }, [x25], #0x10\n"
"st1 { v13.8h }, [x25], #0x10\n"
"st1 { v16.8h }, [x24], #0x10\n"
"st1 { v17.8h }, [x24], #0x10\n"
"tbz x8, #3, 133f\n"
- "st1 { v10.8h }, [x15], #0x10\n"
+ "st1 { v10.8h }, [x16], #0x10\n"
"st1 { v14.8h }, [x25], #0x10\n"
"st1 { v18.8h }, [x24], #0x10\n"
"tbz x8, #2, 131f\n"
- "str d11, [x15], #0x8\n"
+ "str d11, [x16], #0x8\n"
"str d15, [x25], #0x8\n"
"str d19, [x24], #0x8\n"
"tbz x8, #1, 130f\n"
- "st1 { v11.s }[2], [x15], #0x4\n"
+ "st1 { v11.s }[2], [x16], #0x4\n"
"st1 { v15.s }[2], [x25], #0x4\n"
"st1 { v19.s }[2], [x24], #0x4\n"
"tbz x8, #0, 145f\n"
- "st1 { v11.h }[6], [x15]\n"
+ "st1 { v11.h }[6], [x16]\n"
"st1 { v15.h }[6], [x25]\n"
"st1 { v19.h }[6], [x24]\n"
"b 145f\n"
"130:" // Height 3: Partial direct writeback: partial_1_28
"tbz x8, #0, 145f\n"
- "st1 { v11.h }[4], [x15]\n"
+ "st1 { v11.h }[4], [x16]\n"
"st1 { v15.h }[4], [x25]\n"
"st1 { v19.h }[4], [x24]\n"
"b 145f\n"
"131:" // Height 3: Partial direct writeback: partial_2_24
"tbz x8, #1, 132f\n"
- "str s11, [x15], #0x4\n"
+ "str s11, [x16], #0x4\n"
"str s15, [x25], #0x4\n"
"str s19, [x24], #0x4\n"
"tbz x8, #0, 145f\n"
- "st1 { v11.h }[2], [x15]\n"
+ "st1 { v11.h }[2], [x16]\n"
"st1 { v15.h }[2], [x25]\n"
"st1 { v19.h }[2], [x24]\n"
"b 145f\n"
"132:" // Height 3: Partial direct writeback: partial_1_24
"tbz x8, #0, 145f\n"
- "str h11, [x15, #0x0]\n"
+ "str h11, [x16, #0x0]\n"
"str h15, [x25, #0x0]\n"
"str h19, [x24, #0x0]\n"
"b 145f\n"
"133:" // Height 3: Partial direct writeback: partial_4_16
"tbz x8, #2, 135f\n"
- "str d10, [x15], #0x8\n"
+ "str d10, [x16], #0x8\n"
"str d14, [x25], #0x8\n"
"str d18, [x24], #0x8\n"
"tbz x8, #1, 134f\n"
- "st1 { v10.s }[2], [x15], #0x4\n"
+ "st1 { v10.s }[2], [x16], #0x4\n"
"st1 { v14.s }[2], [x25], #0x4\n"
"st1 { v18.s }[2], [x24], #0x4\n"
"tbz x8, #0, 145f\n"
- "st1 { v10.h }[6], [x15]\n"
+ "st1 { v10.h }[6], [x16]\n"
"st1 { v14.h }[6], [x25]\n"
"st1 { v18.h }[6], [x24]\n"
"b 145f\n"
"134:" // Height 3: Partial direct writeback: partial_1_20
"tbz x8, #0, 145f\n"
- "st1 { v10.h }[4], [x15]\n"
+ "st1 { v10.h }[4], [x16]\n"
"st1 { v14.h }[4], [x25]\n"
"st1 { v18.h }[4], [x24]\n"
"b 145f\n"
"135:" // Height 3: Partial direct writeback: partial_2_16
"tbz x8, #1, 136f\n"
- "str s10, [x15], #0x4\n"
+ "str s10, [x16], #0x4\n"
"str s14, [x25], #0x4\n"
"str s18, [x24], #0x4\n"
"tbz x8, #0, 145f\n"
- "st1 { v10.h }[2], [x15]\n"
+ "st1 { v10.h }[2], [x16]\n"
"st1 { v14.h }[2], [x25]\n"
"st1 { v18.h }[2], [x24]\n"
"b 145f\n"
"136:" // Height 3: Partial direct writeback: partial_1_16
"tbz x8, #0, 145f\n"
- "str h10, [x15, #0x0]\n"
+ "str h10, [x16, #0x0]\n"
"str h14, [x25, #0x0]\n"
"str h18, [x24, #0x0]\n"
"b 145f\n"
"137:" // Height 3: Partial direct writeback: partial_8_0
"tbz x8, #3, 141f\n"
- "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v8.8h }, [x16], #0x10\n"
"st1 { v12.8h }, [x25], #0x10\n"
"st1 { v16.8h }, [x24], #0x10\n"
"tbz x8, #2, 139f\n"
- "str d9, [x15], #0x8\n"
+ "str d9, [x16], #0x8\n"
"str d13, [x25], #0x8\n"
"str d17, [x24], #0x8\n"
"tbz x8, #1, 138f\n"
- "st1 { v9.s }[2], [x15], #0x4\n"
+ "st1 { v9.s }[2], [x16], #0x4\n"
"st1 { v13.s }[2], [x25], #0x4\n"
"st1 { v17.s }[2], [x24], #0x4\n"
"tbz x8, #0, 145f\n"
- "st1 { v9.h }[6], [x15]\n"
+ "st1 { v9.h }[6], [x16]\n"
"st1 { v13.h }[6], [x25]\n"
"st1 { v17.h }[6], [x24]\n"
"b 145f\n"
"138:" // Height 3: Partial direct writeback: partial_1_12
"tbz x8, #0, 145f\n"
- "st1 { v9.h }[4], [x15]\n"
+ "st1 { v9.h }[4], [x16]\n"
"st1 { v13.h }[4], [x25]\n"
"st1 { v17.h }[4], [x24]\n"
"b 145f\n"
"139:" // Height 3: Partial direct writeback: partial_2_8
"tbz x8, #1, 140f\n"
- "str s9, [x15], #0x4\n"
+ "str s9, [x16], #0x4\n"
"str s13, [x25], #0x4\n"
"str s17, [x24], #0x4\n"
"tbz x8, #0, 145f\n"
- "st1 { v9.h }[2], [x15]\n"
+ "st1 { v9.h }[2], [x16]\n"
"st1 { v13.h }[2], [x25]\n"
"st1 { v17.h }[2], [x24]\n"
"b 145f\n"
"140:" // Height 3: Partial direct writeback: partial_1_8
"tbz x8, #0, 145f\n"
- "str h9, [x15, #0x0]\n"
+ "str h9, [x16, #0x0]\n"
"str h13, [x25, #0x0]\n"
"str h17, [x24, #0x0]\n"
"b 145f\n"
"141:" // Height 3: Partial direct writeback: partial_4_0
"tbz x8, #2, 143f\n"
- "str d8, [x15], #0x8\n"
+ "str d8, [x16], #0x8\n"
"str d12, [x25], #0x8\n"
"str d16, [x24], #0x8\n"
"tbz x8, #1, 142f\n"
- "st1 { v8.s }[2], [x15], #0x4\n"
+ "st1 { v8.s }[2], [x16], #0x4\n"
"st1 { v12.s }[2], [x25], #0x4\n"
"st1 { v16.s }[2], [x24], #0x4\n"
"tbz x8, #0, 145f\n"
- "st1 { v8.h }[6], [x15]\n"
+ "st1 { v8.h }[6], [x16]\n"
"st1 { v12.h }[6], [x25]\n"
"st1 { v16.h }[6], [x24]\n"
"b 145f\n"
"142:" // Height 3: Partial direct writeback: partial_1_4
"tbz x8, #0, 145f\n"
- "st1 { v8.h }[4], [x15]\n"
+ "st1 { v8.h }[4], [x16]\n"
"st1 { v12.h }[4], [x25]\n"
"st1 { v16.h }[4], [x24]\n"
"b 145f\n"
"143:" // Height 3: Partial direct writeback: partial_2_0
"tbz x8, #1, 144f\n"
- "str s8, [x15], #0x4\n"
+ "str s8, [x16], #0x4\n"
"str s12, [x25], #0x4\n"
"str s16, [x24], #0x4\n"
"tbz x8, #0, 145f\n"
- "st1 { v8.h }[2], [x15]\n"
+ "st1 { v8.h }[2], [x16]\n"
"st1 { v12.h }[2], [x25]\n"
"st1 { v16.h }[2], [x24]\n"
"b 145f\n"
"144:" // Height 3: Partial direct writeback: partial_1_0
- "str h8, [x15, #0x0]\n"
+ "str h8, [x16, #0x0]\n"
"str h12, [x25, #0x0]\n"
"str h16, [x24, #0x0]\n"
"145:" // Height 3: Partial direct writeback: Done
"b 147f\n"
"146:" // Height 3: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x16, #0x0]\n"
+ "str q9, [x16, #0x10]\n"
+ "str q10, [x16, #0x20]\n"
+ "str q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
"str q12, [x25, #0x0]\n"
"str q13, [x25, #0x10]\n"
"str q14, [x25, #0x20]\n"
@@ -2148,250 +2145,250 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"bgt 100b\n"
"b 296f\n"
"148:" // Height 4
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x16, %x[bias]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[output_ptr]\n"
+ "mov x16, %x[output_ptr]\n"
"149:" // Height 4: Column loop
- "cbz x16, 150f\n"
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
+ "cbz x7, 150f\n"
+ "ldr q8, [x7, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
"mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
- "mov v15.16b, v11.16b\n"
- "mov v19.16b, v11.16b\n"
"mov v23.16b, v11.16b\n"
"b 169f\n"
"150:" // Height 4: no bias
"tbz %x[flags], #0, 168f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"cmp x8, #0x20\n"
- "add x25, x15, x19, LSL #1\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"bge 167f\n"
"tbz x8, #4, 158f\n"
- "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v8.8h }, [x16], #0x10\n"
"ld1 { v12.8h }, [x25], #0x10\n"
"ld1 { v16.8h }, [x24], #0x10\n"
- "ld1 { v9.8h }, [x15], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v9.8h }, [x16], #0x10\n"
"ld1 { v13.8h }, [x25], #0x10\n"
"ld1 { v17.8h }, [x24], #0x10\n"
- "ld1 { v20.8h }, [x23], #0x10\n"
"ld1 { v21.8h }, [x23], #0x10\n"
"tbz x8, #3, 154f\n"
- "ld1 { v10.8h }, [x15], #0x10\n"
+ "ld1 { v10.8h }, [x16], #0x10\n"
"ld1 { v14.8h }, [x25], #0x10\n"
"ld1 { v18.8h }, [x24], #0x10\n"
"ld1 { v22.8h }, [x23], #0x10\n"
"tbz x8, #2, 152f\n"
- "ldr d11, [x15], #0x8\n"
+ "ldr d11, [x16], #0x8\n"
"ldr d15, [x25], #0x8\n"
"ldr d19, [x24], #0x8\n"
"ldr d23, [x23], #0x8\n"
"tbz x8, #1, 151f\n"
- "ld1 { v11.s }[2], [x15], #0x4\n"
- "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x16], #0x4\n"
+ "mov x20, #0x3c\n"
"ld1 { v15.s }[2], [x25], #0x4\n"
"ld1 { v19.s }[2], [x24], #0x4\n"
"ld1 { v23.s }[2], [x23], #0x4\n"
"tbz x8, #0, 166f\n"
- "ld1 { v11.h }[6], [x15]\n"
+ "ld1 { v11.h }[6], [x16]\n"
"ld1 { v15.h }[6], [x25]\n"
"ld1 { v19.h }[6], [x24]\n"
"ld1 { v23.h }[6], [x23]\n"
"b 166f\n"
"151:" // Height 4: Partial accumulate: partial_1_28
- "mov x19, #0x38\n"
+ "mov x20, #0x38\n"
"tbz x8, #0, 166f\n"
- "ld1 { v11.h }[4], [x15]\n"
+ "ld1 { v11.h }[4], [x16]\n"
"ld1 { v15.h }[4], [x25]\n"
"ld1 { v19.h }[4], [x24]\n"
"ld1 { v23.h }[4], [x23]\n"
"b 166f\n"
"152:" // Height 4: Partial accumulate: partial_2_24
"tbz x8, #1, 153f\n"
- "ldr s11, [x15], #0x4\n"
+ "ldr s11, [x16], #0x4\n"
+ "mov x20, #0x34\n"
"ldr s15, [x25], #0x4\n"
- "mov x19, #0x34\n"
"ldr s19, [x24], #0x4\n"
"ldr s23, [x23], #0x4\n"
"tbz x8, #0, 166f\n"
- "ld1 { v11.h }[2], [x15]\n"
+ "ld1 { v11.h }[2], [x16]\n"
"ld1 { v15.h }[2], [x25]\n"
"ld1 { v19.h }[2], [x24]\n"
"ld1 { v23.h }[2], [x23]\n"
"b 166f\n"
"153:" // Height 4: Partial accumulate: partial_1_24
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x8, #0, 166f\n"
- "ldr h11, [x15, #0x0]\n"
+ "ldr h11, [x16, #0x0]\n"
"ldr h15, [x25, #0x0]\n"
"ldr h19, [x24, #0x0]\n"
"ldr h23, [x23, #0x0]\n"
"b 166f\n"
"154:" // Height 4: Partial accumulate: partial_4_16
"tbz x8, #2, 156f\n"
- "ldr d10, [x15], #0x8\n"
+ "ldr d10, [x16], #0x8\n"
"ldr d14, [x25], #0x8\n"
"ldr d18, [x24], #0x8\n"
"ldr d22, [x23], #0x8\n"
"tbz x8, #1, 155f\n"
- "ld1 { v10.s }[2], [x15], #0x4\n"
- "mov x19, #0x2c\n"
+ "ld1 { v10.s }[2], [x16], #0x4\n"
+ "mov x20, #0x2c\n"
"ld1 { v14.s }[2], [x25], #0x4\n"
"ld1 { v18.s }[2], [x24], #0x4\n"
"ld1 { v22.s }[2], [x23], #0x4\n"
"tbz x8, #0, 166f\n"
- "ld1 { v10.h }[6], [x15]\n"
+ "ld1 { v10.h }[6], [x16]\n"
"ld1 { v14.h }[6], [x25]\n"
"ld1 { v18.h }[6], [x24]\n"
"ld1 { v22.h }[6], [x23]\n"
"b 166f\n"
"155:" // Height 4: Partial accumulate: partial_1_20
- "mov x19, #0x28\n"
+ "mov x20, #0x28\n"
"tbz x8, #0, 166f\n"
- "ld1 { v10.h }[4], [x15]\n"
+ "ld1 { v10.h }[4], [x16]\n"
"ld1 { v14.h }[4], [x25]\n"
"ld1 { v18.h }[4], [x24]\n"
"ld1 { v22.h }[4], [x23]\n"
"b 166f\n"
"156:" // Height 4: Partial accumulate: partial_2_16
"tbz x8, #1, 157f\n"
- "ldr s10, [x15], #0x4\n"
+ "ldr s10, [x16], #0x4\n"
+ "mov x20, #0x24\n"
"ldr s14, [x25], #0x4\n"
- "mov x19, #0x24\n"
"ldr s18, [x24], #0x4\n"
"ldr s22, [x23], #0x4\n"
"tbz x8, #0, 166f\n"
- "ld1 { v10.h }[2], [x15]\n"
+ "ld1 { v10.h }[2], [x16]\n"
"ld1 { v14.h }[2], [x25]\n"
"ld1 { v18.h }[2], [x24]\n"
"ld1 { v22.h }[2], [x23]\n"
"b 166f\n"
"157:" // Height 4: Partial accumulate: partial_1_16
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x8, #0, 166f\n"
- "ldr h10, [x15, #0x0]\n"
+ "ldr h10, [x16, #0x0]\n"
"ldr h14, [x25, #0x0]\n"
"ldr h18, [x24, #0x0]\n"
"ldr h22, [x23, #0x0]\n"
"b 166f\n"
"158:" // Height 4: Partial accumulate: partial_8_0
"tbz x8, #3, 162f\n"
- "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v8.8h }, [x16], #0x10\n"
"ld1 { v12.8h }, [x25], #0x10\n"
"ld1 { v16.8h }, [x24], #0x10\n"
"ld1 { v20.8h }, [x23], #0x10\n"
"tbz x8, #2, 160f\n"
- "ldr d9, [x15], #0x8\n"
+ "ldr d9, [x16], #0x8\n"
"ldr d13, [x25], #0x8\n"
"ldr d17, [x24], #0x8\n"
"ldr d21, [x23], #0x8\n"
"tbz x8, #1, 159f\n"
- "ld1 { v9.s }[2], [x15], #0x4\n"
- "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x16], #0x4\n"
+ "mov x20, #0x1c\n"
"ld1 { v13.s }[2], [x25], #0x4\n"
"ld1 { v17.s }[2], [x24], #0x4\n"
"ld1 { v21.s }[2], [x23], #0x4\n"
"tbz x8, #0, 166f\n"
- "ld1 { v9.h }[6], [x15]\n"
+ "ld1 { v9.h }[6], [x16]\n"
"ld1 { v13.h }[6], [x25]\n"
"ld1 { v17.h }[6], [x24]\n"
"ld1 { v21.h }[6], [x23]\n"
"b 166f\n"
"159:" // Height 4: Partial accumulate: partial_1_12
- "mov x19, #0x18\n"
+ "mov x20, #0x18\n"
"tbz x8, #0, 166f\n"
- "ld1 { v9.h }[4], [x15]\n"
+ "ld1 { v9.h }[4], [x16]\n"
"ld1 { v13.h }[4], [x25]\n"
"ld1 { v17.h }[4], [x24]\n"
"ld1 { v21.h }[4], [x23]\n"
"b 166f\n"
"160:" // Height 4: Partial accumulate: partial_2_8
"tbz x8, #1, 161f\n"
- "ldr s9, [x15], #0x4\n"
+ "ldr s9, [x16], #0x4\n"
+ "mov x20, #0x14\n"
"ldr s13, [x25], #0x4\n"
- "mov x19, #0x14\n"
"ldr s17, [x24], #0x4\n"
"ldr s21, [x23], #0x4\n"
"tbz x8, #0, 166f\n"
- "ld1 { v9.h }[2], [x15]\n"
+ "ld1 { v9.h }[2], [x16]\n"
"ld1 { v13.h }[2], [x25]\n"
"ld1 { v17.h }[2], [x24]\n"
"ld1 { v21.h }[2], [x23]\n"
"b 166f\n"
"161:" // Height 4: Partial accumulate: partial_1_8
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x8, #0, 166f\n"
- "ldr h9, [x15, #0x0]\n"
+ "ldr h9, [x16, #0x0]\n"
"ldr h13, [x25, #0x0]\n"
"ldr h17, [x24, #0x0]\n"
"ldr h21, [x23, #0x0]\n"
"b 166f\n"
"162:" // Height 4: Partial accumulate: partial_4_0
"tbz x8, #2, 164f\n"
- "ldr d8, [x15], #0x8\n"
+ "ldr d8, [x16], #0x8\n"
"ldr d12, [x25], #0x8\n"
"ldr d16, [x24], #0x8\n"
"ldr d20, [x23], #0x8\n"
"tbz x8, #1, 163f\n"
- "ld1 { v8.s }[2], [x15], #0x4\n"
- "mov x19, #0xc\n"
+ "ld1 { v8.s }[2], [x16], #0x4\n"
+ "mov x20, #0xc\n"
"ld1 { v12.s }[2], [x25], #0x4\n"
"ld1 { v16.s }[2], [x24], #0x4\n"
"ld1 { v20.s }[2], [x23], #0x4\n"
"tbz x8, #0, 166f\n"
- "ld1 { v8.h }[6], [x15]\n"
+ "ld1 { v8.h }[6], [x16]\n"
"ld1 { v12.h }[6], [x25]\n"
"ld1 { v16.h }[6], [x24]\n"
"ld1 { v20.h }[6], [x23]\n"
"b 166f\n"
"163:" // Height 4: Partial accumulate: partial_1_4
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"tbz x8, #0, 166f\n"
- "ld1 { v8.h }[4], [x15]\n"
+ "ld1 { v8.h }[4], [x16]\n"
"ld1 { v12.h }[4], [x25]\n"
"ld1 { v16.h }[4], [x24]\n"
"ld1 { v20.h }[4], [x23]\n"
"b 166f\n"
"164:" // Height 4: Partial accumulate: partial_2_0
"tbz x8, #1, 165f\n"
- "ldr s8, [x15], #0x4\n"
+ "ldr s8, [x16], #0x4\n"
+ "mov x20, #0x4\n"
"ldr s12, [x25], #0x4\n"
- "mov x19, #0x4\n"
"ldr s16, [x24], #0x4\n"
"ldr s20, [x23], #0x4\n"
"tbz x8, #0, 166f\n"
- "ld1 { v8.h }[2], [x15]\n"
+ "ld1 { v8.h }[2], [x16]\n"
"ld1 { v12.h }[2], [x25]\n"
"ld1 { v16.h }[2], [x24]\n"
"ld1 { v20.h }[2], [x23]\n"
"b 166f\n"
"165:" // Height 4: Partial accumulate: partial_1_0
- "ldr h8, [x15, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr h8, [x16, #0x0]\n"
+ "mov x20, #0x0\n"
"ldr h12, [x25, #0x0]\n"
"ldr h16, [x24, #0x0]\n"
"ldr h20, [x23, #0x0]\n"
"166:" // Height 4: Partial accumulate: Done
- "sub x15, x15, x19\n"
+ "sub x16, x16, x20\n"
"b 169f\n"
"167:" // Height 4: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
"ldr q12, [x25, #0x0]\n"
"ldr q13, [x25, #0x10]\n"
"ldr q14, [x25, #0x20]\n"
@@ -2423,545 +2420,545 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"movi v22.16b, #0x0\n"
"movi v23.16b, #0x0\n"
"169:" // Height 4: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"170:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 171f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x14, 172f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "cbnz x15, 172f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n"
+ "add x11, x11, x20, LSL #1\n"
+ "add x10, x10, x20, LSL #1\n"
"b 172f\n"
"171:" // Height 4: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21, LSL #1\n"
+ "add x11, x12, x21, LSL #1\n"
+ "add x10, x11, x21, LSL #1\n"
"172:" // Height 4: input setup done
- "cmp x13, #0x8\n"
+ "cmp x14, #0x8\n"
"blt 175f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x10\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x10\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
"blt 174f\n"
"173:" // Height 4: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x18]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr x10, [x17, #0x28]\n"
+ "add x13, x13, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr d6, [x17, #0x20]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x38]\n"
- "add x12, x12, #0x10\n"
- "add x28, x28, #0x10\n"
+ "ldr d25, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "mov v6.d[1], x10\n"
+ "mov v25.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "add x12, x12, #0x10\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr x10, [x17, #0x48]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr x9, [x12, #0x8]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr x11, [x17, #0x58]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr x27, [x28, #0x8]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr x10, [x17, #0x68]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "add x26, x26, #0x10\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "ldr x25, [x26, #0x8]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x70]\n"
- "add x24, x24, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr x10, [x17, #0x88]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr x23, [x24, #0x8]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr x10, [x17, #0xa8]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "sub x13, x13, #0x8\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "cmp x13, #0x10\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr x10, [x17, #0xc8]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr x10, [x17, #0xe8]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr x10, [x17, #0x108]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr d6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr x11, [x17, #0x118]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr d7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr x10, [x17, #0x128]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr d6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr x11, [x17, #0x138]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr d7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr x10, [x17, #0x148]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr d6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr x11, [x17, #0x158]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr d7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr x10, [x17, #0x168]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr d6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr x11, [x17, #0x178]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr d7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr x10, [x17, #0x188]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr d6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr x11, [x17, #0x198]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr d7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr x10, [x17, #0x1a8]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr d6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr x11, [x17, #0x1b8]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr d7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr x10, [x17, #0x1c8]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr d6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr x11, [x17, #0x1d8]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr d7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr x10, [x17, #0x1e8]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr d6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr x11, [x17, #0x1f8]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "ldr d7, [x17, #0x1f0]\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr d24, [x17, #0x30]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[0]\n"
+ "fmla v14.8h, v25.8h, v1.h[0]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmla v18.8h, v25.8h, v2.h[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v22.8h, v25.8h, v3.h[0]\n"
+ "ldr d25, [x17, #0x40]\n"
+ "fmla v11.8h, v24.8h, v0.h[0]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[0]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v19.8h, v24.8h, v2.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v23.8h, v24.8h, v3.h[0]\n"
+ "ldr d24, [x17, #0x50]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[1]\n"
+ "fmla v12.8h, v25.8h, v1.h[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v16.8h, v25.8h, v2.h[1]\n"
+ "ldr x25, [x13, #0x8]\n"
+ "fmla v20.8h, v25.8h, v3.h[1]\n"
+ "ldr d25, [x17, #0x60]\n"
+ "fmla v9.8h, v24.8h, v0.h[1]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla v17.8h, v24.8h, v2.h[1]\n"
+ "ldr x24, [x12, #0x8]\n"
+ "fmla v21.8h, v24.8h, v3.h[1]\n"
+ "ldr d24, [x17, #0x70]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[1]\n"
+ "fmla v14.8h, v25.8h, v1.h[1]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla v18.8h, v25.8h, v2.h[1]\n"
+ "ldr x23, [x11, #0x8]\n"
+ "fmla v22.8h, v25.8h, v3.h[1]\n"
+ "ldr d25, [x17, #0x80]\n"
+ "fmla v11.8h, v24.8h, v0.h[1]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v19.8h, v24.8h, v2.h[1]\n"
+ "ldr x22, [x10, #0x8]\n"
+ "fmla v23.8h, v24.8h, v3.h[1]\n"
+ "ldr d24, [x17, #0x90]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[2]\n"
+ "fmla v12.8h, v25.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v16.8h, v25.8h, v2.h[2]\n"
+ "sub x14, x14, #0x8\n"
+ "fmla v20.8h, v25.8h, v3.h[2]\n"
+ "ldr d25, [x17, #0xa0]\n"
+ "fmla v9.8h, v24.8h, v0.h[2]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla v17.8h, v24.8h, v2.h[2]\n"
+ "cmp x14, #0x10\n"
+ "fmla v21.8h, v24.8h, v3.h[2]\n"
+ "ldr d24, [x17, #0xb0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[2]\n"
+ "fmla v14.8h, v25.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "fmla v18.8h, v25.8h, v2.h[2]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "fmla v22.8h, v25.8h, v3.h[2]\n"
+ "ldr d25, [x17, #0xc0]\n"
+ "fmla v11.8h, v24.8h, v0.h[2]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v19.8h, v24.8h, v2.h[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v23.8h, v24.8h, v3.h[2]\n"
+ "ldr d24, [x17, #0xd0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[3]\n"
+ "fmla v12.8h, v25.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v16.8h, v25.8h, v2.h[3]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v20.8h, v25.8h, v3.h[3]\n"
+ "ldr d25, [x17, #0xe0]\n"
+ "fmla v9.8h, v24.8h, v0.h[3]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x108]\n"
+ "fmla v17.8h, v24.8h, v2.h[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v21.8h, v24.8h, v3.h[3]\n"
+ "ldr d24, [x17, #0xf0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[3]\n"
+ "fmla v14.8h, v25.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla v18.8h, v25.8h, v2.h[3]\n"
+ "fmla v22.8h, v25.8h, v3.h[3]\n"
+ "ldr d25, [x17, #0x100]\n"
+ "fmla v11.8h, v24.8h, v0.h[3]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x128]\n"
+ "fmla v19.8h, v24.8h, v2.h[3]\n"
+ "fmla v23.8h, v24.8h, v3.h[3]\n"
+ "ldr d24, [x17, #0x110]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[4]\n"
+ "fmla v12.8h, v25.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x138]\n"
+ "fmla v16.8h, v25.8h, v2.h[4]\n"
+ "fmla v20.8h, v25.8h, v3.h[4]\n"
+ "ldr d25, [x17, #0x120]\n"
+ "fmla v9.8h, v24.8h, v0.h[4]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x148]\n"
+ "fmla v17.8h, v24.8h, v2.h[4]\n"
+ "fmla v21.8h, v24.8h, v3.h[4]\n"
+ "ldr d24, [x17, #0x130]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[4]\n"
+ "fmla v14.8h, v25.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x158]\n"
+ "fmla v18.8h, v25.8h, v2.h[4]\n"
+ "fmla v22.8h, v25.8h, v3.h[4]\n"
+ "ldr d25, [x17, #0x140]\n"
+ "fmla v11.8h, v24.8h, v0.h[4]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x168]\n"
+ "fmla v19.8h, v24.8h, v2.h[4]\n"
+ "fmla v23.8h, v24.8h, v3.h[4]\n"
+ "ldr d24, [x17, #0x150]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[5]\n"
+ "fmla v12.8h, v25.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x178]\n"
+ "fmla v16.8h, v25.8h, v2.h[5]\n"
+ "fmla v20.8h, v25.8h, v3.h[5]\n"
+ "ldr d25, [x17, #0x160]\n"
+ "fmla v9.8h, v24.8h, v0.h[5]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x188]\n"
+ "fmla v17.8h, v24.8h, v2.h[5]\n"
+ "fmla v21.8h, v24.8h, v3.h[5]\n"
+ "ldr d24, [x17, #0x170]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[5]\n"
+ "fmla v14.8h, v25.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x198]\n"
+ "fmla v18.8h, v25.8h, v2.h[5]\n"
+ "fmla v22.8h, v25.8h, v3.h[5]\n"
+ "ldr d25, [x17, #0x180]\n"
+ "fmla v11.8h, v24.8h, v0.h[5]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x1a8]\n"
+ "fmla v19.8h, v24.8h, v2.h[5]\n"
+ "fmla v23.8h, v24.8h, v3.h[5]\n"
+ "ldr d24, [x17, #0x190]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[6]\n"
+ "fmla v12.8h, v25.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1b8]\n"
+ "fmla v16.8h, v25.8h, v2.h[6]\n"
+ "fmla v20.8h, v25.8h, v3.h[6]\n"
+ "ldr d25, [x17, #0x1a0]\n"
+ "fmla v9.8h, v24.8h, v0.h[6]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1c8]\n"
+ "fmla v17.8h, v24.8h, v2.h[6]\n"
+ "fmla v21.8h, v24.8h, v3.h[6]\n"
+ "ldr d24, [x17, #0x1b0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.8h, v25.8h, v0.h[6]\n"
+ "fmla v14.8h, v25.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1d8]\n"
+ "fmla v18.8h, v25.8h, v2.h[6]\n"
+ "fmla v22.8h, v25.8h, v3.h[6]\n"
+ "ldr d25, [x17, #0x1c0]\n"
+ "fmla v11.8h, v24.8h, v0.h[6]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.8h, v24.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1e8]\n"
+ "fmla v19.8h, v24.8h, v2.h[6]\n"
+ "fmla v23.8h, v24.8h, v3.h[6]\n"
+ "ldr d24, [x17, #0x1d0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.8h, v25.8h, v0.h[7]\n"
+ "fmla v12.8h, v25.8h, v1.h[7]\n"
+ "ldr x20, [x17, #0x1f8]\n"
+ "fmla v16.8h, v25.8h, v2.h[7]\n"
+ "fmla v20.8h, v25.8h, v3.h[7]\n"
+ "ldr d25, [x17, #0x1e0]\n"
+ "fmla v9.8h, v24.8h, v0.h[7]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.8h, v24.8h, v1.h[7]\n"
+ "fmla v17.8h, v24.8h, v2.h[7]\n"
+ "fmla v21.8h, v24.8h, v3.h[7]\n"
+ "ldr d24, [x17, #0x1f0]\n"
+ "mov v24.d[1], x20\n"
"add x17, x17, #0x200\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "ldr x10, [x17, #0x8]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v10.8h, v25.8h, v0.h[7]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v14.8h, v25.8h, v1.h[7]\n"
+ "ldr x20, [x17, #0x18]\n"
+ "fmla v18.8h, v25.8h, v2.h[7]\n"
+ "fmla v22.8h, v25.8h, v3.h[7]\n"
"ldr d6, [x17, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "ldr d0, [x12, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr d1, [x28, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
- "mov v0.d[1], x9\n"
- "mov v1.d[1], x27\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "mov v2.d[1], x25\n"
- "mov v3.d[1], x23\n"
+ "fmla v11.8h, v24.8h, v0.h[7]\n"
+ "ldr d0, [x13, #0x0]\n"
+ "fmla v15.8h, v24.8h, v1.h[7]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "fmla v19.8h, v24.8h, v2.h[7]\n"
+ "ldr d2, [x11, #0x0]\n"
+ "fmla v23.8h, v24.8h, v3.h[7]\n"
+ "ldr d3, [x10, #0x0]\n"
+ "ldr d7, [x17, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x25\n"
+ "mov v1.d[1], x24\n"
+ "mov v2.d[1], x23\n"
+ "mov v3.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 173b\n"
"174:" // Height 4: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "add x13, x13, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "sub x13, x13, #0x8\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
"add x12, x12, #0x10\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x11, x11, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr q25, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "add x10, x10, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "add x28, x28, #0x10\n"
+ "sub x14, x14, #0x8\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "add x26, x26, #0x10\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "add x24, x24, #0x10\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr q6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "ldr q7, [x17, #0x1f0]\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr q24, [x17, #0x30]\n"
+ "fmla v10.8h, v25.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v14.8h, v25.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v18.8h, v25.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v22.8h, v25.8h, v3.h[0]\n"
+ "ldr q25, [x17, #0x40]\n"
+ "fmla v11.8h, v24.8h, v0.h[0]\n"
+ "fmla v15.8h, v24.8h, v1.h[0]\n"
+ "fmla v19.8h, v24.8h, v2.h[0]\n"
+ "fmla v23.8h, v24.8h, v3.h[0]\n"
+ "ldr q24, [x17, #0x50]\n"
+ "fmla v8.8h, v25.8h, v0.h[1]\n"
+ "fmla v12.8h, v25.8h, v1.h[1]\n"
+ "fmla v16.8h, v25.8h, v2.h[1]\n"
+ "fmla v20.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x17, #0x60]\n"
+ "fmla v9.8h, v24.8h, v0.h[1]\n"
+ "fmla v13.8h, v24.8h, v1.h[1]\n"
+ "fmla v17.8h, v24.8h, v2.h[1]\n"
+ "fmla v21.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x17, #0x70]\n"
+ "fmla v10.8h, v25.8h, v0.h[1]\n"
+ "fmla v14.8h, v25.8h, v1.h[1]\n"
+ "fmla v18.8h, v25.8h, v2.h[1]\n"
+ "fmla v22.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x17, #0x80]\n"
+ "fmla v11.8h, v24.8h, v0.h[1]\n"
+ "fmla v15.8h, v24.8h, v1.h[1]\n"
+ "fmla v19.8h, v24.8h, v2.h[1]\n"
+ "fmla v23.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x17, #0x90]\n"
+ "fmla v8.8h, v25.8h, v0.h[2]\n"
+ "fmla v12.8h, v25.8h, v1.h[2]\n"
+ "fmla v16.8h, v25.8h, v2.h[2]\n"
+ "fmla v20.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x17, #0xa0]\n"
+ "fmla v9.8h, v24.8h, v0.h[2]\n"
+ "fmla v13.8h, v24.8h, v1.h[2]\n"
+ "fmla v17.8h, v24.8h, v2.h[2]\n"
+ "fmla v21.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x17, #0xb0]\n"
+ "fmla v10.8h, v25.8h, v0.h[2]\n"
+ "fmla v14.8h, v25.8h, v1.h[2]\n"
+ "fmla v18.8h, v25.8h, v2.h[2]\n"
+ "fmla v22.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x17, #0xc0]\n"
+ "fmla v11.8h, v24.8h, v0.h[2]\n"
+ "fmla v15.8h, v24.8h, v1.h[2]\n"
+ "fmla v19.8h, v24.8h, v2.h[2]\n"
+ "fmla v23.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x17, #0xd0]\n"
+ "fmla v8.8h, v25.8h, v0.h[3]\n"
+ "fmla v12.8h, v25.8h, v1.h[3]\n"
+ "fmla v16.8h, v25.8h, v2.h[3]\n"
+ "fmla v20.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x17, #0xe0]\n"
+ "fmla v9.8h, v24.8h, v0.h[3]\n"
+ "fmla v13.8h, v24.8h, v1.h[3]\n"
+ "fmla v17.8h, v24.8h, v2.h[3]\n"
+ "fmla v21.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x17, #0xf0]\n"
+ "fmla v10.8h, v25.8h, v0.h[3]\n"
+ "fmla v14.8h, v25.8h, v1.h[3]\n"
+ "fmla v18.8h, v25.8h, v2.h[3]\n"
+ "fmla v22.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x17, #0x100]\n"
+ "fmla v11.8h, v24.8h, v0.h[3]\n"
+ "fmla v15.8h, v24.8h, v1.h[3]\n"
+ "fmla v19.8h, v24.8h, v2.h[3]\n"
+ "fmla v23.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x17, #0x110]\n"
+ "fmla v8.8h, v25.8h, v0.h[4]\n"
+ "fmla v12.8h, v25.8h, v1.h[4]\n"
+ "fmla v16.8h, v25.8h, v2.h[4]\n"
+ "fmla v20.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x17, #0x120]\n"
+ "fmla v9.8h, v24.8h, v0.h[4]\n"
+ "fmla v13.8h, v24.8h, v1.h[4]\n"
+ "fmla v17.8h, v24.8h, v2.h[4]\n"
+ "fmla v21.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x17, #0x130]\n"
+ "fmla v10.8h, v25.8h, v0.h[4]\n"
+ "fmla v14.8h, v25.8h, v1.h[4]\n"
+ "fmla v18.8h, v25.8h, v2.h[4]\n"
+ "fmla v22.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x17, #0x140]\n"
+ "fmla v11.8h, v24.8h, v0.h[4]\n"
+ "fmla v15.8h, v24.8h, v1.h[4]\n"
+ "fmla v19.8h, v24.8h, v2.h[4]\n"
+ "fmla v23.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x17, #0x150]\n"
+ "fmla v8.8h, v25.8h, v0.h[5]\n"
+ "fmla v12.8h, v25.8h, v1.h[5]\n"
+ "fmla v16.8h, v25.8h, v2.h[5]\n"
+ "fmla v20.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x17, #0x160]\n"
+ "fmla v9.8h, v24.8h, v0.h[5]\n"
+ "fmla v13.8h, v24.8h, v1.h[5]\n"
+ "fmla v17.8h, v24.8h, v2.h[5]\n"
+ "fmla v21.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x17, #0x170]\n"
+ "fmla v10.8h, v25.8h, v0.h[5]\n"
+ "fmla v14.8h, v25.8h, v1.h[5]\n"
+ "fmla v18.8h, v25.8h, v2.h[5]\n"
+ "fmla v22.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x17, #0x180]\n"
+ "fmla v11.8h, v24.8h, v0.h[5]\n"
+ "fmla v15.8h, v24.8h, v1.h[5]\n"
+ "fmla v19.8h, v24.8h, v2.h[5]\n"
+ "fmla v23.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x17, #0x190]\n"
+ "fmla v8.8h, v25.8h, v0.h[6]\n"
+ "fmla v12.8h, v25.8h, v1.h[6]\n"
+ "fmla v16.8h, v25.8h, v2.h[6]\n"
+ "fmla v20.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x17, #0x1a0]\n"
+ "fmla v9.8h, v24.8h, v0.h[6]\n"
+ "fmla v13.8h, v24.8h, v1.h[6]\n"
+ "fmla v17.8h, v24.8h, v2.h[6]\n"
+ "fmla v21.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x17, #0x1b0]\n"
+ "fmla v10.8h, v25.8h, v0.h[6]\n"
+ "fmla v14.8h, v25.8h, v1.h[6]\n"
+ "fmla v18.8h, v25.8h, v2.h[6]\n"
+ "fmla v22.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x17, #0x1c0]\n"
+ "fmla v11.8h, v24.8h, v0.h[6]\n"
+ "fmla v15.8h, v24.8h, v1.h[6]\n"
+ "fmla v19.8h, v24.8h, v2.h[6]\n"
+ "fmla v23.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x17, #0x1d0]\n"
+ "fmla v8.8h, v25.8h, v0.h[7]\n"
+ "fmla v12.8h, v25.8h, v1.h[7]\n"
+ "fmla v16.8h, v25.8h, v2.h[7]\n"
+ "fmla v20.8h, v25.8h, v3.h[7]\n"
+ "ldr q25, [x17, #0x1e0]\n"
+ "fmla v9.8h, v24.8h, v0.h[7]\n"
+ "fmla v13.8h, v24.8h, v1.h[7]\n"
+ "fmla v17.8h, v24.8h, v2.h[7]\n"
+ "fmla v21.8h, v24.8h, v3.h[7]\n"
+ "ldr q24, [x17, #0x1f0]\n"
+ "fmla v10.8h, v25.8h, v0.h[7]\n"
"add x17, x17, #0x200\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v14.8h, v25.8h, v1.h[7]\n"
+ "fmla v18.8h, v25.8h, v2.h[7]\n"
+ "fmla v22.8h, v25.8h, v3.h[7]\n"
+ "fmla v11.8h, v24.8h, v0.h[7]\n"
+ "fmla v15.8h, v24.8h, v1.h[7]\n"
+ "fmla v19.8h, v24.8h, v2.h[7]\n"
+ "fmla v23.8h, v24.8h, v3.h[7]\n"
"175:" // Height 4: Multiply loop: Main loop skip
- "cbz x13, 177f\n"
+ "cbz x14, 177f\n"
"176:" // Height 4: Multiply loop: Odd block loop
- "ldr h0, [x12], #0x2\n"
- "sub x13, x13, #0x1\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr h3, [x13], #0x2\n"
+ "sub x14, x14, #0x1\n"
+ "ldr h2, [x12], #0x2\n"
+ "ldr h1, [x11], #0x2\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr q25, [x17, #0x0]\n"
+ "fmla v8.8h, v25.8h, v3.h[0]\n"
+ "ldr q24, [x17, #0x10]\n"
+ "fmla v12.8h, v25.8h, v2.h[0]\n"
+ "fmla v16.8h, v25.8h, v1.h[0]\n"
+ "fmla v20.8h, v25.8h, v0.h[0]\n"
+ "ldr q25, [x17, #0x20]\n"
+ "fmla v9.8h, v24.8h, v3.h[0]\n"
+ "fmla v13.8h, v24.8h, v2.h[0]\n"
+ "fmla v17.8h, v24.8h, v1.h[0]\n"
+ "fmla v21.8h, v24.8h, v0.h[0]\n"
+ "ldr q24, [x17, #0x30]\n"
+ "fmla v10.8h, v25.8h, v3.h[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "cbnz x13, 176b\n"
+ "fmla v14.8h, v25.8h, v2.h[0]\n"
+ "fmla v18.8h, v25.8h, v1.h[0]\n"
+ "fmla v22.8h, v25.8h, v0.h[0]\n"
+ "fmla v11.8h, v24.8h, v3.h[0]\n"
+ "fmla v15.8h, v24.8h, v2.h[0]\n"
+ "fmla v19.8h, v24.8h, v1.h[0]\n"
+ "fmla v23.8h, v24.8h, v0.h[0]\n"
+ "cbnz x14, 176b\n"
"177:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 170b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "add x25, x15, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #1\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #1\n"
"prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 178f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v24.8h\n"
+ "fmin v9.8h, v9.8h, v24.8h\n"
+ "fmin v10.8h, v10.8h, v24.8h\n"
+ "fmin v11.8h, v11.8h, v24.8h\n"
+ "fmin v12.8h, v12.8h, v24.8h\n"
+ "fmin v13.8h, v13.8h, v24.8h\n"
+ "fmin v14.8h, v14.8h, v24.8h\n"
+ "fmin v15.8h, v15.8h, v24.8h\n"
+ "fmin v16.8h, v16.8h, v24.8h\n"
+ "fmin v17.8h, v17.8h, v24.8h\n"
+ "fmin v18.8h, v18.8h, v24.8h\n"
+ "fmin v19.8h, v19.8h, v24.8h\n"
+ "fmin v20.8h, v20.8h, v24.8h\n"
+ "fmin v21.8h, v21.8h, v24.8h\n"
+ "fmin v22.8h, v22.8h, v24.8h\n"
+ "fmin v23.8h, v23.8h, v24.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
- "ld1r { v0.8h }, [x19]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmin v12.8h, v12.8h, v0.8h\n"
- "fmin v13.8h, v13.8h, v0.8h\n"
- "fmin v14.8h, v14.8h, v0.8h\n"
- "fmin v15.8h, v15.8h, v0.8h\n"
- "fmin v16.8h, v16.8h, v0.8h\n"
- "fmin v17.8h, v17.8h, v0.8h\n"
- "fmax v8.8h, v8.8h, v1.8h\n"
- "fmax v9.8h, v9.8h, v1.8h\n"
- "fmax v10.8h, v10.8h, v1.8h\n"
- "fmax v11.8h, v11.8h, v1.8h\n"
- "fmax v12.8h, v12.8h, v1.8h\n"
- "fmax v13.8h, v13.8h, v1.8h\n"
- "fmax v14.8h, v14.8h, v1.8h\n"
- "fmax v15.8h, v15.8h, v1.8h\n"
- "fmax v16.8h, v16.8h, v1.8h\n"
- "fmax v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v0.8h\n"
- "fmin v19.8h, v19.8h, v0.8h\n"
- "fmin v20.8h, v20.8h, v0.8h\n"
- "fmin v21.8h, v21.8h, v0.8h\n"
- "fmin v22.8h, v22.8h, v0.8h\n"
- "fmin v23.8h, v23.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v1.8h\n"
- "fmax v19.8h, v19.8h, v1.8h\n"
- "fmax v20.8h, v20.8h, v1.8h\n"
- "fmax v21.8h, v21.8h, v1.8h\n"
- "fmax v22.8h, v22.8h, v1.8h\n"
- "fmax v23.8h, v23.8h, v1.8h\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "fmax v8.8h, v8.8h, v24.8h\n"
+ "fmax v9.8h, v9.8h, v24.8h\n"
+ "fmax v10.8h, v10.8h, v24.8h\n"
+ "fmax v11.8h, v11.8h, v24.8h\n"
+ "fmax v12.8h, v12.8h, v24.8h\n"
+ "fmax v13.8h, v13.8h, v24.8h\n"
+ "fmax v14.8h, v14.8h, v24.8h\n"
+ "fmax v15.8h, v15.8h, v24.8h\n"
+ "fmax v16.8h, v16.8h, v24.8h\n"
+ "fmax v17.8h, v17.8h, v24.8h\n"
+ "fmax v18.8h, v18.8h, v24.8h\n"
+ "fmax v19.8h, v19.8h, v24.8h\n"
+ "fmax v20.8h, v20.8h, v24.8h\n"
+ "fmax v21.8h, v21.8h, v24.8h\n"
+ "fmax v22.8h, v22.8h, v24.8h\n"
+ "fmax v23.8h, v23.8h, v24.8h\n"
"178:" // Height 4: No activation
"cmp x8, #0x20\n"
"bge 195f\n"
"tbz x8, #4, 186f\n"
- "st1 { v8.8h }, [x15], #0x10\n"
- "st1 { v9.8h }, [x15], #0x10\n"
+ "st1 { v8.8h }, [x16], #0x10\n"
+ "st1 { v9.8h }, [x16], #0x10\n"
"st1 { v12.8h }, [x25], #0x10\n"
"st1 { v13.8h }, [x25], #0x10\n"
"st1 { v16.8h }, [x24], #0x10\n"
@@ -2969,192 +2966,192 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"st1 { v20.8h }, [x23], #0x10\n"
"st1 { v21.8h }, [x23], #0x10\n"
"tbz x8, #3, 182f\n"
- "st1 { v10.8h }, [x15], #0x10\n"
+ "st1 { v10.8h }, [x16], #0x10\n"
"st1 { v14.8h }, [x25], #0x10\n"
"st1 { v18.8h }, [x24], #0x10\n"
"st1 { v22.8h }, [x23], #0x10\n"
"tbz x8, #2, 180f\n"
- "str d11, [x15], #0x8\n"
+ "str d11, [x16], #0x8\n"
"str d15, [x25], #0x8\n"
"str d19, [x24], #0x8\n"
"str d23, [x23], #0x8\n"
"tbz x8, #1, 179f\n"
- "st1 { v11.s }[2], [x15], #0x4\n"
+ "st1 { v11.s }[2], [x16], #0x4\n"
"st1 { v15.s }[2], [x25], #0x4\n"
"st1 { v19.s }[2], [x24], #0x4\n"
"st1 { v23.s }[2], [x23], #0x4\n"
"tbz x8, #0, 194f\n"
- "st1 { v11.h }[6], [x15]\n"
+ "st1 { v11.h }[6], [x16]\n"
"st1 { v15.h }[6], [x25]\n"
"st1 { v19.h }[6], [x24]\n"
"st1 { v23.h }[6], [x23]\n"
"b 194f\n"
"179:" // Height 4: Partial direct writeback: partial_1_28
"tbz x8, #0, 194f\n"
- "st1 { v11.h }[4], [x15]\n"
+ "st1 { v11.h }[4], [x16]\n"
"st1 { v15.h }[4], [x25]\n"
"st1 { v19.h }[4], [x24]\n"
"st1 { v23.h }[4], [x23]\n"
"b 194f\n"
"180:" // Height 4: Partial direct writeback: partial_2_24
"tbz x8, #1, 181f\n"
- "str s11, [x15], #0x4\n"
+ "str s11, [x16], #0x4\n"
"str s15, [x25], #0x4\n"
"str s19, [x24], #0x4\n"
"str s23, [x23], #0x4\n"
"tbz x8, #0, 194f\n"
- "st1 { v11.h }[2], [x15]\n"
+ "st1 { v11.h }[2], [x16]\n"
"st1 { v15.h }[2], [x25]\n"
"st1 { v19.h }[2], [x24]\n"
"st1 { v23.h }[2], [x23]\n"
"b 194f\n"
"181:" // Height 4: Partial direct writeback: partial_1_24
"tbz x8, #0, 194f\n"
- "str h11, [x15, #0x0]\n"
+ "str h11, [x16, #0x0]\n"
"str h15, [x25, #0x0]\n"
"str h19, [x24, #0x0]\n"
"str h23, [x23, #0x0]\n"
"b 194f\n"
"182:" // Height 4: Partial direct writeback: partial_4_16
"tbz x8, #2, 184f\n"
- "str d10, [x15], #0x8\n"
+ "str d10, [x16], #0x8\n"
"str d14, [x25], #0x8\n"
"str d18, [x24], #0x8\n"
"str d22, [x23], #0x8\n"
"tbz x8, #1, 183f\n"
- "st1 { v10.s }[2], [x15], #0x4\n"
+ "st1 { v10.s }[2], [x16], #0x4\n"
"st1 { v14.s }[2], [x25], #0x4\n"
"st1 { v18.s }[2], [x24], #0x4\n"
"st1 { v22.s }[2], [x23], #0x4\n"
"tbz x8, #0, 194f\n"
- "st1 { v10.h }[6], [x15]\n"
+ "st1 { v10.h }[6], [x16]\n"
"st1 { v14.h }[6], [x25]\n"
"st1 { v18.h }[6], [x24]\n"
"st1 { v22.h }[6], [x23]\n"
"b 194f\n"
"183:" // Height 4: Partial direct writeback: partial_1_20
"tbz x8, #0, 194f\n"
- "st1 { v10.h }[4], [x15]\n"
+ "st1 { v10.h }[4], [x16]\n"
"st1 { v14.h }[4], [x25]\n"
"st1 { v18.h }[4], [x24]\n"
"st1 { v22.h }[4], [x23]\n"
"b 194f\n"
"184:" // Height 4: Partial direct writeback: partial_2_16
"tbz x8, #1, 185f\n"
- "str s10, [x15], #0x4\n"
+ "str s10, [x16], #0x4\n"
"str s14, [x25], #0x4\n"
"str s18, [x24], #0x4\n"
"str s22, [x23], #0x4\n"
"tbz x8, #0, 194f\n"
- "st1 { v10.h }[2], [x15]\n"
+ "st1 { v10.h }[2], [x16]\n"
"st1 { v14.h }[2], [x25]\n"
"st1 { v18.h }[2], [x24]\n"
"st1 { v22.h }[2], [x23]\n"
"b 194f\n"
"185:" // Height 4: Partial direct writeback: partial_1_16
"tbz x8, #0, 194f\n"
- "str h10, [x15, #0x0]\n"
+ "str h10, [x16, #0x0]\n"
"str h14, [x25, #0x0]\n"
"str h18, [x24, #0x0]\n"
"str h22, [x23, #0x0]\n"
"b 194f\n"
"186:" // Height 4: Partial direct writeback: partial_8_0
"tbz x8, #3, 190f\n"
- "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v8.8h }, [x16], #0x10\n"
"st1 { v12.8h }, [x25], #0x10\n"
"st1 { v16.8h }, [x24], #0x10\n"
"st1 { v20.8h }, [x23], #0x10\n"
"tbz x8, #2, 188f\n"
- "str d9, [x15], #0x8\n"
+ "str d9, [x16], #0x8\n"
"str d13, [x25], #0x8\n"
"str d17, [x24], #0x8\n"
"str d21, [x23], #0x8\n"
"tbz x8, #1, 187f\n"
- "st1 { v9.s }[2], [x15], #0x4\n"
+ "st1 { v9.s }[2], [x16], #0x4\n"
"st1 { v13.s }[2], [x25], #0x4\n"
"st1 { v17.s }[2], [x24], #0x4\n"
"st1 { v21.s }[2], [x23], #0x4\n"
"tbz x8, #0, 194f\n"
- "st1 { v9.h }[6], [x15]\n"
+ "st1 { v9.h }[6], [x16]\n"
"st1 { v13.h }[6], [x25]\n"
"st1 { v17.h }[6], [x24]\n"
"st1 { v21.h }[6], [x23]\n"
"b 194f\n"
"187:" // Height 4: Partial direct writeback: partial_1_12
"tbz x8, #0, 194f\n"
- "st1 { v9.h }[4], [x15]\n"
+ "st1 { v9.h }[4], [x16]\n"
"st1 { v13.h }[4], [x25]\n"
"st1 { v17.h }[4], [x24]\n"
"st1 { v21.h }[4], [x23]\n"
"b 194f\n"
"188:" // Height 4: Partial direct writeback: partial_2_8
"tbz x8, #1, 189f\n"
- "str s9, [x15], #0x4\n"
+ "str s9, [x16], #0x4\n"
"str s13, [x25], #0x4\n"
"str s17, [x24], #0x4\n"
"str s21, [x23], #0x4\n"
"tbz x8, #0, 194f\n"
- "st1 { v9.h }[2], [x15]\n"
+ "st1 { v9.h }[2], [x16]\n"
"st1 { v13.h }[2], [x25]\n"
"st1 { v17.h }[2], [x24]\n"
"st1 { v21.h }[2], [x23]\n"
"b 194f\n"
"189:" // Height 4: Partial direct writeback: partial_1_8
"tbz x8, #0, 194f\n"
- "str h9, [x15, #0x0]\n"
+ "str h9, [x16, #0x0]\n"
"str h13, [x25, #0x0]\n"
"str h17, [x24, #0x0]\n"
"str h21, [x23, #0x0]\n"
"b 194f\n"
"190:" // Height 4: Partial direct writeback: partial_4_0
"tbz x8, #2, 192f\n"
- "str d8, [x15], #0x8\n"
+ "str d8, [x16], #0x8\n"
"str d12, [x25], #0x8\n"
"str d16, [x24], #0x8\n"
"str d20, [x23], #0x8\n"
"tbz x8, #1, 191f\n"
- "st1 { v8.s }[2], [x15], #0x4\n"
+ "st1 { v8.s }[2], [x16], #0x4\n"
"st1 { v12.s }[2], [x25], #0x4\n"
"st1 { v16.s }[2], [x24], #0x4\n"
"st1 { v20.s }[2], [x23], #0x4\n"
"tbz x8, #0, 194f\n"
- "st1 { v8.h }[6], [x15]\n"
+ "st1 { v8.h }[6], [x16]\n"
"st1 { v12.h }[6], [x25]\n"
"st1 { v16.h }[6], [x24]\n"
"st1 { v20.h }[6], [x23]\n"
"b 194f\n"
"191:" // Height 4: Partial direct writeback: partial_1_4
"tbz x8, #0, 194f\n"
- "st1 { v8.h }[4], [x15]\n"
+ "st1 { v8.h }[4], [x16]\n"
"st1 { v12.h }[4], [x25]\n"
"st1 { v16.h }[4], [x24]\n"
"st1 { v20.h }[4], [x23]\n"
"b 194f\n"
"192:" // Height 4: Partial direct writeback: partial_2_0
"tbz x8, #1, 193f\n"
- "str s8, [x15], #0x4\n"
+ "str s8, [x16], #0x4\n"
"str s12, [x25], #0x4\n"
"str s16, [x24], #0x4\n"
"str s20, [x23], #0x4\n"
"tbz x8, #0, 194f\n"
- "st1 { v8.h }[2], [x15]\n"
+ "st1 { v8.h }[2], [x16]\n"
"st1 { v12.h }[2], [x25]\n"
"st1 { v16.h }[2], [x24]\n"
"st1 { v20.h }[2], [x23]\n"
"b 194f\n"
"193:" // Height 4: Partial direct writeback: partial_1_0
- "str h8, [x15, #0x0]\n"
+ "str h8, [x16, #0x0]\n"
"str h12, [x25, #0x0]\n"
"str h16, [x24, #0x0]\n"
"str h20, [x23, #0x0]\n"
"194:" // Height 4: Partial direct writeback: Done
"b 196f\n"
"195:" // Height 4: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x16, #0x0]\n"
+ "str q9, [x16, #0x10]\n"
+ "str q10, [x16, #0x20]\n"
+ "str q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
"str q12, [x25, #0x0]\n"
"str q13, [x25, #0x10]\n"
"str q14, [x25, #0x20]\n"
@@ -3172,84 +3169,84 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"bgt 149b\n"
"b 296f\n"
"197:" // Height 5
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x16, %x[bias]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[output_ptr]\n"
+ "mov x16, %x[output_ptr]\n"
"198:" // Height 5: Column loop
- "cbz x16, 199f\n"
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
+ "cbz x7, 199f\n"
+ "ldr q8, [x7, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
"mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
"mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
- "mov v15.16b, v11.16b\n"
- "mov v19.16b, v11.16b\n"
- "mov v23.16b, v11.16b\n"
"mov v27.16b, v11.16b\n"
"b 218f\n"
"199:" // Height 5: no bias
"tbz %x[flags], #0, 217f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"cmp x8, #0x20\n"
- "add x25, x15, x19, LSL #1\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"bge 216f\n"
"tbz x8, #4, 207f\n"
- "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v8.8h }, [x16], #0x10\n"
"ld1 { v12.8h }, [x25], #0x10\n"
"ld1 { v16.8h }, [x24], #0x10\n"
- "ld1 { v9.8h }, [x15], #0x10\n"
- "ld1 { v13.8h }, [x25], #0x10\n"
- "ld1 { v17.8h }, [x24], #0x10\n"
"ld1 { v20.8h }, [x23], #0x10\n"
"ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v9.8h }, [x16], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
"ld1 { v21.8h }, [x23], #0x10\n"
"ld1 { v25.8h }, [x22], #0x10\n"
"tbz x8, #3, 203f\n"
- "ld1 { v10.8h }, [x15], #0x10\n"
+ "ld1 { v10.8h }, [x16], #0x10\n"
"ld1 { v14.8h }, [x25], #0x10\n"
"ld1 { v18.8h }, [x24], #0x10\n"
"ld1 { v22.8h }, [x23], #0x10\n"
"ld1 { v26.8h }, [x22], #0x10\n"
"tbz x8, #2, 201f\n"
- "ldr d11, [x15], #0x8\n"
+ "ldr d11, [x16], #0x8\n"
"ldr d15, [x25], #0x8\n"
"ldr d19, [x24], #0x8\n"
"ldr d23, [x23], #0x8\n"
"ldr d27, [x22], #0x8\n"
"tbz x8, #1, 200f\n"
- "ld1 { v11.s }[2], [x15], #0x4\n"
- "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x16], #0x4\n"
+ "mov x20, #0x3c\n"
"ld1 { v15.s }[2], [x25], #0x4\n"
"ld1 { v19.s }[2], [x24], #0x4\n"
"ld1 { v23.s }[2], [x23], #0x4\n"
"ld1 { v27.s }[2], [x22], #0x4\n"
"tbz x8, #0, 215f\n"
- "ld1 { v11.h }[6], [x15]\n"
+ "ld1 { v11.h }[6], [x16]\n"
"ld1 { v15.h }[6], [x25]\n"
"ld1 { v19.h }[6], [x24]\n"
"ld1 { v23.h }[6], [x23]\n"
"ld1 { v27.h }[6], [x22]\n"
"b 215f\n"
"200:" // Height 5: Partial accumulate: partial_1_28
- "mov x19, #0x38\n"
+ "mov x20, #0x38\n"
"tbz x8, #0, 215f\n"
- "ld1 { v11.h }[4], [x15]\n"
+ "ld1 { v11.h }[4], [x16]\n"
"ld1 { v15.h }[4], [x25]\n"
"ld1 { v19.h }[4], [x24]\n"
"ld1 { v23.h }[4], [x23]\n"
@@ -3257,23 +3254,23 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 215f\n"
"201:" // Height 5: Partial accumulate: partial_2_24
"tbz x8, #1, 202f\n"
- "ldr s11, [x15], #0x4\n"
+ "ldr s11, [x16], #0x4\n"
+ "mov x20, #0x34\n"
"ldr s15, [x25], #0x4\n"
- "mov x19, #0x34\n"
"ldr s19, [x24], #0x4\n"
"ldr s23, [x23], #0x4\n"
"ldr s27, [x22], #0x4\n"
"tbz x8, #0, 215f\n"
- "ld1 { v11.h }[2], [x15]\n"
+ "ld1 { v11.h }[2], [x16]\n"
"ld1 { v15.h }[2], [x25]\n"
"ld1 { v19.h }[2], [x24]\n"
"ld1 { v23.h }[2], [x23]\n"
"ld1 { v27.h }[2], [x22]\n"
"b 215f\n"
"202:" // Height 5: Partial accumulate: partial_1_24
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x8, #0, 215f\n"
- "ldr h11, [x15, #0x0]\n"
+ "ldr h11, [x16, #0x0]\n"
"ldr h15, [x25, #0x0]\n"
"ldr h19, [x24, #0x0]\n"
"ldr h23, [x23, #0x0]\n"
@@ -3281,29 +3278,29 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 215f\n"
"203:" // Height 5: Partial accumulate: partial_4_16
"tbz x8, #2, 205f\n"
- "ldr d10, [x15], #0x8\n"
+ "ldr d10, [x16], #0x8\n"
"ldr d14, [x25], #0x8\n"
"ldr d18, [x24], #0x8\n"
"ldr d22, [x23], #0x8\n"
"ldr d26, [x22], #0x8\n"
"tbz x8, #1, 204f\n"
- "ld1 { v10.s }[2], [x15], #0x4\n"
- "mov x19, #0x2c\n"
+ "ld1 { v10.s }[2], [x16], #0x4\n"
+ "mov x20, #0x2c\n"
"ld1 { v14.s }[2], [x25], #0x4\n"
"ld1 { v18.s }[2], [x24], #0x4\n"
"ld1 { v22.s }[2], [x23], #0x4\n"
"ld1 { v26.s }[2], [x22], #0x4\n"
"tbz x8, #0, 215f\n"
- "ld1 { v10.h }[6], [x15]\n"
+ "ld1 { v10.h }[6], [x16]\n"
"ld1 { v14.h }[6], [x25]\n"
"ld1 { v18.h }[6], [x24]\n"
"ld1 { v22.h }[6], [x23]\n"
"ld1 { v26.h }[6], [x22]\n"
"b 215f\n"
"204:" // Height 5: Partial accumulate: partial_1_20
- "mov x19, #0x28\n"
+ "mov x20, #0x28\n"
"tbz x8, #0, 215f\n"
- "ld1 { v10.h }[4], [x15]\n"
+ "ld1 { v10.h }[4], [x16]\n"
"ld1 { v14.h }[4], [x25]\n"
"ld1 { v18.h }[4], [x24]\n"
"ld1 { v22.h }[4], [x23]\n"
@@ -3311,23 +3308,23 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 215f\n"
"205:" // Height 5: Partial accumulate: partial_2_16
"tbz x8, #1, 206f\n"
- "ldr s10, [x15], #0x4\n"
+ "ldr s10, [x16], #0x4\n"
+ "mov x20, #0x24\n"
"ldr s14, [x25], #0x4\n"
- "mov x19, #0x24\n"
"ldr s18, [x24], #0x4\n"
"ldr s22, [x23], #0x4\n"
"ldr s26, [x22], #0x4\n"
"tbz x8, #0, 215f\n"
- "ld1 { v10.h }[2], [x15]\n"
+ "ld1 { v10.h }[2], [x16]\n"
"ld1 { v14.h }[2], [x25]\n"
"ld1 { v18.h }[2], [x24]\n"
"ld1 { v22.h }[2], [x23]\n"
"ld1 { v26.h }[2], [x22]\n"
"b 215f\n"
"206:" // Height 5: Partial accumulate: partial_1_16
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x8, #0, 215f\n"
- "ldr h10, [x15, #0x0]\n"
+ "ldr h10, [x16, #0x0]\n"
"ldr h14, [x25, #0x0]\n"
"ldr h18, [x24, #0x0]\n"
"ldr h22, [x23, #0x0]\n"
@@ -3335,35 +3332,35 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 215f\n"
"207:" // Height 5: Partial accumulate: partial_8_0
"tbz x8, #3, 211f\n"
- "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v8.8h }, [x16], #0x10\n"
"ld1 { v12.8h }, [x25], #0x10\n"
"ld1 { v16.8h }, [x24], #0x10\n"
"ld1 { v20.8h }, [x23], #0x10\n"
"ld1 { v24.8h }, [x22], #0x10\n"
"tbz x8, #2, 209f\n"
- "ldr d9, [x15], #0x8\n"
+ "ldr d9, [x16], #0x8\n"
"ldr d13, [x25], #0x8\n"
"ldr d17, [x24], #0x8\n"
"ldr d21, [x23], #0x8\n"
"ldr d25, [x22], #0x8\n"
"tbz x8, #1, 208f\n"
- "ld1 { v9.s }[2], [x15], #0x4\n"
- "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x16], #0x4\n"
+ "mov x20, #0x1c\n"
"ld1 { v13.s }[2], [x25], #0x4\n"
"ld1 { v17.s }[2], [x24], #0x4\n"
"ld1 { v21.s }[2], [x23], #0x4\n"
"ld1 { v25.s }[2], [x22], #0x4\n"
"tbz x8, #0, 215f\n"
- "ld1 { v9.h }[6], [x15]\n"
+ "ld1 { v9.h }[6], [x16]\n"
"ld1 { v13.h }[6], [x25]\n"
"ld1 { v17.h }[6], [x24]\n"
"ld1 { v21.h }[6], [x23]\n"
"ld1 { v25.h }[6], [x22]\n"
"b 215f\n"
"208:" // Height 5: Partial accumulate: partial_1_12
- "mov x19, #0x18\n"
+ "mov x20, #0x18\n"
"tbz x8, #0, 215f\n"
- "ld1 { v9.h }[4], [x15]\n"
+ "ld1 { v9.h }[4], [x16]\n"
"ld1 { v13.h }[4], [x25]\n"
"ld1 { v17.h }[4], [x24]\n"
"ld1 { v21.h }[4], [x23]\n"
@@ -3371,23 +3368,23 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 215f\n"
"209:" // Height 5: Partial accumulate: partial_2_8
"tbz x8, #1, 210f\n"
- "ldr s9, [x15], #0x4\n"
+ "ldr s9, [x16], #0x4\n"
+ "mov x20, #0x14\n"
"ldr s13, [x25], #0x4\n"
- "mov x19, #0x14\n"
"ldr s17, [x24], #0x4\n"
"ldr s21, [x23], #0x4\n"
"ldr s25, [x22], #0x4\n"
"tbz x8, #0, 215f\n"
- "ld1 { v9.h }[2], [x15]\n"
+ "ld1 { v9.h }[2], [x16]\n"
"ld1 { v13.h }[2], [x25]\n"
"ld1 { v17.h }[2], [x24]\n"
"ld1 { v21.h }[2], [x23]\n"
"ld1 { v25.h }[2], [x22]\n"
"b 215f\n"
"210:" // Height 5: Partial accumulate: partial_1_8
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x8, #0, 215f\n"
- "ldr h9, [x15, #0x0]\n"
+ "ldr h9, [x16, #0x0]\n"
"ldr h13, [x25, #0x0]\n"
"ldr h17, [x24, #0x0]\n"
"ldr h21, [x23, #0x0]\n"
@@ -3395,29 +3392,29 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 215f\n"
"211:" // Height 5: Partial accumulate: partial_4_0
"tbz x8, #2, 213f\n"
- "ldr d8, [x15], #0x8\n"
+ "ldr d8, [x16], #0x8\n"
"ldr d12, [x25], #0x8\n"
"ldr d16, [x24], #0x8\n"
"ldr d20, [x23], #0x8\n"
"ldr d24, [x22], #0x8\n"
"tbz x8, #1, 212f\n"
- "ld1 { v8.s }[2], [x15], #0x4\n"
- "mov x19, #0xc\n"
+ "ld1 { v8.s }[2], [x16], #0x4\n"
+ "mov x20, #0xc\n"
"ld1 { v12.s }[2], [x25], #0x4\n"
"ld1 { v16.s }[2], [x24], #0x4\n"
"ld1 { v20.s }[2], [x23], #0x4\n"
"ld1 { v24.s }[2], [x22], #0x4\n"
"tbz x8, #0, 215f\n"
- "ld1 { v8.h }[6], [x15]\n"
+ "ld1 { v8.h }[6], [x16]\n"
"ld1 { v12.h }[6], [x25]\n"
"ld1 { v16.h }[6], [x24]\n"
"ld1 { v20.h }[6], [x23]\n"
"ld1 { v24.h }[6], [x22]\n"
"b 215f\n"
"212:" // Height 5: Partial accumulate: partial_1_4
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"tbz x8, #0, 215f\n"
- "ld1 { v8.h }[4], [x15]\n"
+ "ld1 { v8.h }[4], [x16]\n"
"ld1 { v12.h }[4], [x25]\n"
"ld1 { v16.h }[4], [x24]\n"
"ld1 { v20.h }[4], [x23]\n"
@@ -3425,34 +3422,34 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 215f\n"
"213:" // Height 5: Partial accumulate: partial_2_0
"tbz x8, #1, 214f\n"
- "ldr s8, [x15], #0x4\n"
+ "ldr s8, [x16], #0x4\n"
+ "mov x20, #0x4\n"
"ldr s12, [x25], #0x4\n"
- "mov x19, #0x4\n"
"ldr s16, [x24], #0x4\n"
"ldr s20, [x23], #0x4\n"
"ldr s24, [x22], #0x4\n"
"tbz x8, #0, 215f\n"
- "ld1 { v8.h }[2], [x15]\n"
+ "ld1 { v8.h }[2], [x16]\n"
"ld1 { v12.h }[2], [x25]\n"
"ld1 { v16.h }[2], [x24]\n"
"ld1 { v20.h }[2], [x23]\n"
"ld1 { v24.h }[2], [x22]\n"
"b 215f\n"
"214:" // Height 5: Partial accumulate: partial_1_0
- "ldr h8, [x15, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr h8, [x16, #0x0]\n"
+ "mov x20, #0x0\n"
"ldr h12, [x25, #0x0]\n"
"ldr h16, [x24, #0x0]\n"
"ldr h20, [x23, #0x0]\n"
"ldr h24, [x22, #0x0]\n"
"215:" // Height 5: Partial accumulate: Done
- "sub x15, x15, x19\n"
+ "sub x16, x16, x20\n"
"b 218f\n"
"216:" // Height 5: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
"ldr q12, [x25, #0x0]\n"
"ldr q13, [x25, #0x10]\n"
"ldr q14, [x25, #0x20]\n"
@@ -3492,635 +3489,635 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
"218:" // Height 5: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"219:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 220f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x14, 221f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
+ "cbnz x15, 221f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n"
+ "add x11, x11, x20, LSL #1\n"
+ "add x10, x10, x20, LSL #1\n"
+ "add x9, x9, x20, LSL #1\n"
"b 221f\n"
"220:" // Height 5: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "add x22, x24, x19, LSL #1\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21, LSL #1\n"
+ "add x11, x12, x21, LSL #1\n"
+ "add x10, x11, x21, LSL #1\n"
+ "add x9, x10, x21, LSL #1\n"
"221:" // Height 5: input setup done
- "cmp x13, #0x8\n"
+ "cmp x14, #0x8\n"
"blt 224f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x10\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x10\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
"blt 223f\n"
"222:" // Height 5: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x18]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr x10, [x17, #0x28]\n"
+ "add x13, x13, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
"add x12, x12, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "ldr d29, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "mov v29.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr d6, [x17, #0x20]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "add x11, x11, #0x10\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr x9, [x12, #0x8]\n"
+ "add x10, x10, #0x10\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x30]\n"
- "add x26, x26, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr x10, [x17, #0x48]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr x11, [x17, #0x58]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr x27, [x28, #0x8]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr x25, [x26, #0x8]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr x10, [x17, #0x68]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "add x24, x24, #0x10\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v24.8h, v6.8h, v4.h[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr x23, [x24, #0x8]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "add x22, x22, #0x10\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "fmla v25.8h, v7.8h, v4.h[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr x10, [x17, #0x88]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr x21, [x22, #0x8]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v26.8h, v6.8h, v4.h[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "sub x13, x13, #0x8\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "cmp x13, #0x10\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr x10, [x17, #0xa8]\n"
- "fmla v27.8h, v7.8h, v4.h[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v24.8h, v6.8h, v4.h[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr x10, [x17, #0xc8]\n"
- "fmla v25.8h, v7.8h, v4.h[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v26.8h, v6.8h, v4.h[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr x10, [x17, #0xe8]\n"
- "fmla v27.8h, v7.8h, v4.h[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v24.8h, v6.8h, v4.h[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr x10, [x17, #0x108]\n"
- "fmla v25.8h, v7.8h, v4.h[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr x11, [x17, #0x118]\n"
- "fmla v26.8h, v6.8h, v4.h[3]\n"
- "ldr d6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr x10, [x17, #0x128]\n"
- "fmla v27.8h, v7.8h, v4.h[3]\n"
- "ldr d7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr x11, [x17, #0x138]\n"
- "fmla v24.8h, v6.8h, v4.h[4]\n"
- "ldr d6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr x10, [x17, #0x148]\n"
- "fmla v25.8h, v7.8h, v4.h[4]\n"
- "ldr d7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr x11, [x17, #0x158]\n"
- "fmla v26.8h, v6.8h, v4.h[4]\n"
- "ldr d6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr x10, [x17, #0x168]\n"
- "fmla v27.8h, v7.8h, v4.h[4]\n"
- "ldr d7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr x11, [x17, #0x178]\n"
- "fmla v24.8h, v6.8h, v4.h[5]\n"
- "ldr d6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr x10, [x17, #0x188]\n"
- "fmla v25.8h, v7.8h, v4.h[5]\n"
- "ldr d7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr x11, [x17, #0x198]\n"
- "fmla v26.8h, v6.8h, v4.h[5]\n"
- "ldr d6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr x10, [x17, #0x1a8]\n"
- "fmla v27.8h, v7.8h, v4.h[5]\n"
- "ldr d7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr x11, [x17, #0x1b8]\n"
- "fmla v24.8h, v6.8h, v4.h[6]\n"
- "ldr d6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr x10, [x17, #0x1c8]\n"
- "fmla v25.8h, v7.8h, v4.h[6]\n"
- "ldr d7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr x11, [x17, #0x1d8]\n"
- "fmla v26.8h, v6.8h, v4.h[6]\n"
- "ldr d6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr x10, [x17, #0x1e8]\n"
- "fmla v27.8h, v7.8h, v4.h[6]\n"
- "ldr d7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr x11, [x17, #0x1f8]\n"
- "fmla v24.8h, v6.8h, v4.h[7]\n"
- "ldr d6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "fmla v25.8h, v7.8h, v4.h[7]\n"
- "ldr d7, [x17, #0x1f0]\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr d28, [x17, #0x30]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[0]\n"
+ "fmla v14.8h, v29.8h, v1.h[0]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.8h, v29.8h, v3.h[0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ "fmla v26.8h, v29.8h, v4.h[0]\n"
+ "ldr d29, [x17, #0x40]\n"
+ "fmla v11.8h, v28.8h, v0.h[0]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[0]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "ldr x25, [x12, #0x8]\n"
+ "fmla v23.8h, v28.8h, v3.h[0]\n"
+ "ldr x24, [x11, #0x8]\n"
+ "fmla v27.8h, v28.8h, v4.h[0]\n"
+ "ldr d28, [x17, #0x50]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[1]\n"
+ "fmla v12.8h, v29.8h, v1.h[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v16.8h, v29.8h, v2.h[1]\n"
+ "ldr x23, [x10, #0x8]\n"
+ "fmla v20.8h, v29.8h, v3.h[1]\n"
+ "ldr x22, [x9, #0x8]\n"
+ "fmla v24.8h, v29.8h, v4.h[1]\n"
+ "ldr d29, [x17, #0x60]\n"
+ "fmla v9.8h, v28.8h, v0.h[1]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla v17.8h, v28.8h, v2.h[1]\n"
+ "sub x14, x14, #0x8\n"
+ "fmla v21.8h, v28.8h, v3.h[1]\n"
+ "cmp x14, #0x10\n"
+ "fmla v25.8h, v28.8h, v4.h[1]\n"
+ "ldr d28, [x17, #0x70]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[1]\n"
+ "fmla v14.8h, v29.8h, v1.h[1]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla v18.8h, v29.8h, v2.h[1]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "fmla v22.8h, v29.8h, v3.h[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v26.8h, v29.8h, v4.h[1]\n"
+ "ldr d29, [x17, #0x80]\n"
+ "fmla v11.8h, v28.8h, v0.h[1]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v19.8h, v28.8h, v2.h[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v23.8h, v28.8h, v3.h[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v27.8h, v28.8h, v4.h[1]\n"
+ "ldr d28, [x17, #0x90]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[2]\n"
+ "fmla v12.8h, v29.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v16.8h, v29.8h, v2.h[2]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v20.8h, v29.8h, v3.h[2]\n"
+ "fmla v24.8h, v29.8h, v4.h[2]\n"
+ "ldr d29, [x17, #0xa0]\n"
+ "fmla v9.8h, v28.8h, v0.h[2]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla v17.8h, v28.8h, v2.h[2]\n"
+ "fmla v21.8h, v28.8h, v3.h[2]\n"
+ "fmla v25.8h, v28.8h, v4.h[2]\n"
+ "ldr d28, [x17, #0xb0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[2]\n"
+ "fmla v14.8h, v29.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "fmla v18.8h, v29.8h, v2.h[2]\n"
+ "fmla v22.8h, v29.8h, v3.h[2]\n"
+ "fmla v26.8h, v29.8h, v4.h[2]\n"
+ "ldr d29, [x17, #0xc0]\n"
+ "fmla v11.8h, v28.8h, v0.h[2]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v19.8h, v28.8h, v2.h[2]\n"
+ "fmla v23.8h, v28.8h, v3.h[2]\n"
+ "fmla v27.8h, v28.8h, v4.h[2]\n"
+ "ldr d28, [x17, #0xd0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[3]\n"
+ "fmla v12.8h, v29.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v16.8h, v29.8h, v2.h[3]\n"
+ "fmla v20.8h, v29.8h, v3.h[3]\n"
+ "fmla v24.8h, v29.8h, v4.h[3]\n"
+ "ldr d29, [x17, #0xe0]\n"
+ "fmla v9.8h, v28.8h, v0.h[3]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x108]\n"
+ "fmla v17.8h, v28.8h, v2.h[3]\n"
+ "fmla v21.8h, v28.8h, v3.h[3]\n"
+ "fmla v25.8h, v28.8h, v4.h[3]\n"
+ "ldr d28, [x17, #0xf0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[3]\n"
+ "fmla v14.8h, v29.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla v18.8h, v29.8h, v2.h[3]\n"
+ "fmla v22.8h, v29.8h, v3.h[3]\n"
+ "fmla v26.8h, v29.8h, v4.h[3]\n"
+ "ldr d29, [x17, #0x100]\n"
+ "fmla v11.8h, v28.8h, v0.h[3]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x128]\n"
+ "fmla v19.8h, v28.8h, v2.h[3]\n"
+ "fmla v23.8h, v28.8h, v3.h[3]\n"
+ "fmla v27.8h, v28.8h, v4.h[3]\n"
+ "ldr d28, [x17, #0x110]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[4]\n"
+ "fmla v12.8h, v29.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x138]\n"
+ "fmla v16.8h, v29.8h, v2.h[4]\n"
+ "fmla v20.8h, v29.8h, v3.h[4]\n"
+ "fmla v24.8h, v29.8h, v4.h[4]\n"
+ "ldr d29, [x17, #0x120]\n"
+ "fmla v9.8h, v28.8h, v0.h[4]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x148]\n"
+ "fmla v17.8h, v28.8h, v2.h[4]\n"
+ "fmla v21.8h, v28.8h, v3.h[4]\n"
+ "fmla v25.8h, v28.8h, v4.h[4]\n"
+ "ldr d28, [x17, #0x130]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[4]\n"
+ "fmla v14.8h, v29.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x158]\n"
+ "fmla v18.8h, v29.8h, v2.h[4]\n"
+ "fmla v22.8h, v29.8h, v3.h[4]\n"
+ "fmla v26.8h, v29.8h, v4.h[4]\n"
+ "ldr d29, [x17, #0x140]\n"
+ "fmla v11.8h, v28.8h, v0.h[4]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x168]\n"
+ "fmla v19.8h, v28.8h, v2.h[4]\n"
+ "fmla v23.8h, v28.8h, v3.h[4]\n"
+ "fmla v27.8h, v28.8h, v4.h[4]\n"
+ "ldr d28, [x17, #0x150]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[5]\n"
+ "fmla v12.8h, v29.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x178]\n"
+ "fmla v16.8h, v29.8h, v2.h[5]\n"
+ "fmla v20.8h, v29.8h, v3.h[5]\n"
+ "fmla v24.8h, v29.8h, v4.h[5]\n"
+ "ldr d29, [x17, #0x160]\n"
+ "fmla v9.8h, v28.8h, v0.h[5]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x188]\n"
+ "fmla v17.8h, v28.8h, v2.h[5]\n"
+ "fmla v21.8h, v28.8h, v3.h[5]\n"
+ "fmla v25.8h, v28.8h, v4.h[5]\n"
+ "ldr d28, [x17, #0x170]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[5]\n"
+ "fmla v14.8h, v29.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x198]\n"
+ "fmla v18.8h, v29.8h, v2.h[5]\n"
+ "fmla v22.8h, v29.8h, v3.h[5]\n"
+ "fmla v26.8h, v29.8h, v4.h[5]\n"
+ "ldr d29, [x17, #0x180]\n"
+ "fmla v11.8h, v28.8h, v0.h[5]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x1a8]\n"
+ "fmla v19.8h, v28.8h, v2.h[5]\n"
+ "fmla v23.8h, v28.8h, v3.h[5]\n"
+ "fmla v27.8h, v28.8h, v4.h[5]\n"
+ "ldr d28, [x17, #0x190]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[6]\n"
+ "fmla v12.8h, v29.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1b8]\n"
+ "fmla v16.8h, v29.8h, v2.h[6]\n"
+ "fmla v20.8h, v29.8h, v3.h[6]\n"
+ "fmla v24.8h, v29.8h, v4.h[6]\n"
+ "ldr d29, [x17, #0x1a0]\n"
+ "fmla v9.8h, v28.8h, v0.h[6]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1c8]\n"
+ "fmla v17.8h, v28.8h, v2.h[6]\n"
+ "fmla v21.8h, v28.8h, v3.h[6]\n"
+ "fmla v25.8h, v28.8h, v4.h[6]\n"
+ "ldr d28, [x17, #0x1b0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.8h, v29.8h, v0.h[6]\n"
+ "fmla v14.8h, v29.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1d8]\n"
+ "fmla v18.8h, v29.8h, v2.h[6]\n"
+ "fmla v22.8h, v29.8h, v3.h[6]\n"
+ "fmla v26.8h, v29.8h, v4.h[6]\n"
+ "ldr d29, [x17, #0x1c0]\n"
+ "fmla v11.8h, v28.8h, v0.h[6]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.8h, v28.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1e8]\n"
+ "fmla v19.8h, v28.8h, v2.h[6]\n"
+ "fmla v23.8h, v28.8h, v3.h[6]\n"
+ "fmla v27.8h, v28.8h, v4.h[6]\n"
+ "ldr d28, [x17, #0x1d0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.8h, v29.8h, v0.h[7]\n"
+ "fmla v12.8h, v29.8h, v1.h[7]\n"
+ "ldr x20, [x17, #0x1f8]\n"
+ "fmla v16.8h, v29.8h, v2.h[7]\n"
+ "fmla v20.8h, v29.8h, v3.h[7]\n"
+ "fmla v24.8h, v29.8h, v4.h[7]\n"
+ "ldr d29, [x17, #0x1e0]\n"
+ "fmla v9.8h, v28.8h, v0.h[7]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.8h, v28.8h, v1.h[7]\n"
+ "fmla v17.8h, v28.8h, v2.h[7]\n"
+ "fmla v21.8h, v28.8h, v3.h[7]\n"
+ "fmla v25.8h, v28.8h, v4.h[7]\n"
+ "ldr d28, [x17, #0x1f0]\n"
+ "mov v28.d[1], x20\n"
"add x17, x17, #0x200\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "ldr x10, [x17, #0x8]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v10.8h, v29.8h, v0.h[7]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v14.8h, v29.8h, v1.h[7]\n"
+ "ldr x20, [x17, #0x18]\n"
+ "fmla v18.8h, v29.8h, v2.h[7]\n"
+ "fmla v22.8h, v29.8h, v3.h[7]\n"
+ "fmla v26.8h, v29.8h, v4.h[7]\n"
"ldr d6, [x17, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "ldr d0, [x12, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr d1, [x28, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
- "mov v0.d[1], x9\n"
- "fmla v27.8h, v7.8h, v4.h[7]\n"
- "mov v1.d[1], x27\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "ldr d4, [x22, #0x0]\n"
- "mov v2.d[1], x25\n"
+ "fmla v11.8h, v28.8h, v0.h[7]\n"
+ "ldr d0, [x13, #0x0]\n"
+ "fmla v15.8h, v28.8h, v1.h[7]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "fmla v19.8h, v28.8h, v2.h[7]\n"
+ "ldr d2, [x11, #0x0]\n"
+ "fmla v23.8h, v28.8h, v3.h[7]\n"
+ "ldr d3, [x10, #0x0]\n"
+ "fmla v27.8h, v28.8h, v4.h[7]\n"
+ "ldr d4, [x9, #0x0]\n"
+ "ldr d7, [x17, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x26\n"
+ "mov v1.d[1], x25\n"
+ "mov v2.d[1], x24\n"
"mov v3.d[1], x23\n"
- "mov v4.d[1], x21\n"
+ "mov v4.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 222b\n"
"223:" // Height 5: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "add x13, x13, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "sub x13, x13, #0x8\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
"add x12, x12, #0x10\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x11, x11, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q29, [x17, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x28, x28, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "sub x14, x14, #0x8\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "add x24, x24, #0x10\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "add x22, x22, #0x10\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "fmla v24.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "fmla v25.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "fmla v26.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "fmla v27.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "fmla v24.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "fmla v25.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "fmla v26.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "fmla v27.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "fmla v24.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "fmla v25.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "fmla v26.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x17, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "fmla v27.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x17, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "fmla v24.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x17, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "fmla v25.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x17, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "fmla v26.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x17, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "fmla v27.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x17, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "fmla v24.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x17, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "fmla v25.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x17, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "fmla v26.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x17, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "fmla v27.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x17, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "fmla v24.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x17, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "fmla v25.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x17, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "fmla v26.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x17, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "fmla v27.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x17, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "fmla v24.8h, v6.8h, v4.h[7]\n"
- "ldr q6, [x17, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "fmla v25.8h, v7.8h, v4.h[7]\n"
- "ldr q7, [x17, #0x1f0]\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr q28, [x17, #0x30]\n"
+ "fmla v10.8h, v29.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v14.8h, v29.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v22.8h, v29.8h, v3.h[0]\n"
+ "fmla v26.8h, v29.8h, v4.h[0]\n"
+ "ldr q29, [x17, #0x40]\n"
+ "fmla v11.8h, v28.8h, v0.h[0]\n"
+ "fmla v15.8h, v28.8h, v1.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v3.h[0]\n"
+ "fmla v27.8h, v28.8h, v4.h[0]\n"
+ "ldr q28, [x17, #0x50]\n"
+ "fmla v8.8h, v29.8h, v0.h[1]\n"
+ "fmla v12.8h, v29.8h, v1.h[1]\n"
+ "fmla v16.8h, v29.8h, v2.h[1]\n"
+ "fmla v20.8h, v29.8h, v3.h[1]\n"
+ "fmla v24.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x17, #0x60]\n"
+ "fmla v9.8h, v28.8h, v0.h[1]\n"
+ "fmla v13.8h, v28.8h, v1.h[1]\n"
+ "fmla v17.8h, v28.8h, v2.h[1]\n"
+ "fmla v21.8h, v28.8h, v3.h[1]\n"
+ "fmla v25.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x17, #0x70]\n"
+ "fmla v10.8h, v29.8h, v0.h[1]\n"
+ "fmla v14.8h, v29.8h, v1.h[1]\n"
+ "fmla v18.8h, v29.8h, v2.h[1]\n"
+ "fmla v22.8h, v29.8h, v3.h[1]\n"
+ "fmla v26.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x17, #0x80]\n"
+ "fmla v11.8h, v28.8h, v0.h[1]\n"
+ "fmla v15.8h, v28.8h, v1.h[1]\n"
+ "fmla v19.8h, v28.8h, v2.h[1]\n"
+ "fmla v23.8h, v28.8h, v3.h[1]\n"
+ "fmla v27.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x17, #0x90]\n"
+ "fmla v8.8h, v29.8h, v0.h[2]\n"
+ "fmla v12.8h, v29.8h, v1.h[2]\n"
+ "fmla v16.8h, v29.8h, v2.h[2]\n"
+ "fmla v20.8h, v29.8h, v3.h[2]\n"
+ "fmla v24.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x17, #0xa0]\n"
+ "fmla v9.8h, v28.8h, v0.h[2]\n"
+ "fmla v13.8h, v28.8h, v1.h[2]\n"
+ "fmla v17.8h, v28.8h, v2.h[2]\n"
+ "fmla v21.8h, v28.8h, v3.h[2]\n"
+ "fmla v25.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x17, #0xb0]\n"
+ "fmla v10.8h, v29.8h, v0.h[2]\n"
+ "fmla v14.8h, v29.8h, v1.h[2]\n"
+ "fmla v18.8h, v29.8h, v2.h[2]\n"
+ "fmla v22.8h, v29.8h, v3.h[2]\n"
+ "fmla v26.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x17, #0xc0]\n"
+ "fmla v11.8h, v28.8h, v0.h[2]\n"
+ "fmla v15.8h, v28.8h, v1.h[2]\n"
+ "fmla v19.8h, v28.8h, v2.h[2]\n"
+ "fmla v23.8h, v28.8h, v3.h[2]\n"
+ "fmla v27.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x17, #0xd0]\n"
+ "fmla v8.8h, v29.8h, v0.h[3]\n"
+ "fmla v12.8h, v29.8h, v1.h[3]\n"
+ "fmla v16.8h, v29.8h, v2.h[3]\n"
+ "fmla v20.8h, v29.8h, v3.h[3]\n"
+ "fmla v24.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x17, #0xe0]\n"
+ "fmla v9.8h, v28.8h, v0.h[3]\n"
+ "fmla v13.8h, v28.8h, v1.h[3]\n"
+ "fmla v17.8h, v28.8h, v2.h[3]\n"
+ "fmla v21.8h, v28.8h, v3.h[3]\n"
+ "fmla v25.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x17, #0xf0]\n"
+ "fmla v10.8h, v29.8h, v0.h[3]\n"
+ "fmla v14.8h, v29.8h, v1.h[3]\n"
+ "fmla v18.8h, v29.8h, v2.h[3]\n"
+ "fmla v22.8h, v29.8h, v3.h[3]\n"
+ "fmla v26.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x17, #0x100]\n"
+ "fmla v11.8h, v28.8h, v0.h[3]\n"
+ "fmla v15.8h, v28.8h, v1.h[3]\n"
+ "fmla v19.8h, v28.8h, v2.h[3]\n"
+ "fmla v23.8h, v28.8h, v3.h[3]\n"
+ "fmla v27.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x17, #0x110]\n"
+ "fmla v8.8h, v29.8h, v0.h[4]\n"
+ "fmla v12.8h, v29.8h, v1.h[4]\n"
+ "fmla v16.8h, v29.8h, v2.h[4]\n"
+ "fmla v20.8h, v29.8h, v3.h[4]\n"
+ "fmla v24.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x17, #0x120]\n"
+ "fmla v9.8h, v28.8h, v0.h[4]\n"
+ "fmla v13.8h, v28.8h, v1.h[4]\n"
+ "fmla v17.8h, v28.8h, v2.h[4]\n"
+ "fmla v21.8h, v28.8h, v3.h[4]\n"
+ "fmla v25.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x17, #0x130]\n"
+ "fmla v10.8h, v29.8h, v0.h[4]\n"
+ "fmla v14.8h, v29.8h, v1.h[4]\n"
+ "fmla v18.8h, v29.8h, v2.h[4]\n"
+ "fmla v22.8h, v29.8h, v3.h[4]\n"
+ "fmla v26.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x17, #0x140]\n"
+ "fmla v11.8h, v28.8h, v0.h[4]\n"
+ "fmla v15.8h, v28.8h, v1.h[4]\n"
+ "fmla v19.8h, v28.8h, v2.h[4]\n"
+ "fmla v23.8h, v28.8h, v3.h[4]\n"
+ "fmla v27.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x17, #0x150]\n"
+ "fmla v8.8h, v29.8h, v0.h[5]\n"
+ "fmla v12.8h, v29.8h, v1.h[5]\n"
+ "fmla v16.8h, v29.8h, v2.h[5]\n"
+ "fmla v20.8h, v29.8h, v3.h[5]\n"
+ "fmla v24.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x17, #0x160]\n"
+ "fmla v9.8h, v28.8h, v0.h[5]\n"
+ "fmla v13.8h, v28.8h, v1.h[5]\n"
+ "fmla v17.8h, v28.8h, v2.h[5]\n"
+ "fmla v21.8h, v28.8h, v3.h[5]\n"
+ "fmla v25.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x17, #0x170]\n"
+ "fmla v10.8h, v29.8h, v0.h[5]\n"
+ "fmla v14.8h, v29.8h, v1.h[5]\n"
+ "fmla v18.8h, v29.8h, v2.h[5]\n"
+ "fmla v22.8h, v29.8h, v3.h[5]\n"
+ "fmla v26.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x17, #0x180]\n"
+ "fmla v11.8h, v28.8h, v0.h[5]\n"
+ "fmla v15.8h, v28.8h, v1.h[5]\n"
+ "fmla v19.8h, v28.8h, v2.h[5]\n"
+ "fmla v23.8h, v28.8h, v3.h[5]\n"
+ "fmla v27.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x17, #0x190]\n"
+ "fmla v8.8h, v29.8h, v0.h[6]\n"
+ "fmla v12.8h, v29.8h, v1.h[6]\n"
+ "fmla v16.8h, v29.8h, v2.h[6]\n"
+ "fmla v20.8h, v29.8h, v3.h[6]\n"
+ "fmla v24.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x17, #0x1a0]\n"
+ "fmla v9.8h, v28.8h, v0.h[6]\n"
+ "fmla v13.8h, v28.8h, v1.h[6]\n"
+ "fmla v17.8h, v28.8h, v2.h[6]\n"
+ "fmla v21.8h, v28.8h, v3.h[6]\n"
+ "fmla v25.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x17, #0x1b0]\n"
+ "fmla v10.8h, v29.8h, v0.h[6]\n"
+ "fmla v14.8h, v29.8h, v1.h[6]\n"
+ "fmla v18.8h, v29.8h, v2.h[6]\n"
+ "fmla v22.8h, v29.8h, v3.h[6]\n"
+ "fmla v26.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x17, #0x1c0]\n"
+ "fmla v11.8h, v28.8h, v0.h[6]\n"
+ "fmla v15.8h, v28.8h, v1.h[6]\n"
+ "fmla v19.8h, v28.8h, v2.h[6]\n"
+ "fmla v23.8h, v28.8h, v3.h[6]\n"
+ "fmla v27.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x17, #0x1d0]\n"
+ "fmla v8.8h, v29.8h, v0.h[7]\n"
+ "fmla v12.8h, v29.8h, v1.h[7]\n"
+ "fmla v16.8h, v29.8h, v2.h[7]\n"
+ "fmla v20.8h, v29.8h, v3.h[7]\n"
+ "fmla v24.8h, v29.8h, v4.h[7]\n"
+ "ldr q29, [x17, #0x1e0]\n"
+ "fmla v9.8h, v28.8h, v0.h[7]\n"
+ "fmla v13.8h, v28.8h, v1.h[7]\n"
+ "fmla v17.8h, v28.8h, v2.h[7]\n"
+ "fmla v21.8h, v28.8h, v3.h[7]\n"
+ "fmla v25.8h, v28.8h, v4.h[7]\n"
+ "ldr q28, [x17, #0x1f0]\n"
+ "fmla v10.8h, v29.8h, v0.h[7]\n"
"add x17, x17, #0x200\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v26.8h, v6.8h, v4.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
- "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "fmla v14.8h, v29.8h, v1.h[7]\n"
+ "fmla v18.8h, v29.8h, v2.h[7]\n"
+ "fmla v22.8h, v29.8h, v3.h[7]\n"
+ "fmla v26.8h, v29.8h, v4.h[7]\n"
+ "fmla v11.8h, v28.8h, v0.h[7]\n"
+ "fmla v15.8h, v28.8h, v1.h[7]\n"
+ "fmla v19.8h, v28.8h, v2.h[7]\n"
+ "fmla v23.8h, v28.8h, v3.h[7]\n"
+ "fmla v27.8h, v28.8h, v4.h[7]\n"
"224:" // Height 5: Multiply loop: Main loop skip
- "cbz x13, 226f\n"
+ "cbz x14, 226f\n"
"225:" // Height 5: Multiply loop: Odd block loop
- "ldr h0, [x12], #0x2\n"
- "sub x13, x13, #0x1\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr h4, [x13], #0x2\n"
+ "sub x14, x14, #0x1\n"
+ "ldr h3, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h1, [x10], #0x2\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr q29, [x17, #0x0]\n"
+ "fmla v8.8h, v29.8h, v4.h[0]\n"
+ "ldr q28, [x17, #0x10]\n"
+ "fmla v12.8h, v29.8h, v3.h[0]\n"
+ "fmla v16.8h, v29.8h, v2.h[0]\n"
+ "fmla v20.8h, v29.8h, v1.h[0]\n"
+ "fmla v24.8h, v29.8h, v0.h[0]\n"
+ "ldr q29, [x17, #0x20]\n"
+ "fmla v9.8h, v28.8h, v4.h[0]\n"
+ "fmla v13.8h, v28.8h, v3.h[0]\n"
+ "fmla v17.8h, v28.8h, v2.h[0]\n"
+ "fmla v21.8h, v28.8h, v1.h[0]\n"
+ "fmla v25.8h, v28.8h, v0.h[0]\n"
+ "ldr q28, [x17, #0x30]\n"
+ "fmla v10.8h, v29.8h, v4.h[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "cbnz x13, 225b\n"
+ "fmla v14.8h, v29.8h, v3.h[0]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v1.h[0]\n"
+ "fmla v26.8h, v29.8h, v0.h[0]\n"
+ "fmla v11.8h, v28.8h, v4.h[0]\n"
+ "fmla v15.8h, v28.8h, v3.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v1.h[0]\n"
+ "fmla v27.8h, v28.8h, v0.h[0]\n"
+ "cbnz x14, 225b\n"
"226:" // Height 5: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 219b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "add x25, x15, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #1\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #1\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #1\n"
"prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 227f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v28.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v28.8h\n"
+ "fmin v9.8h, v9.8h, v28.8h\n"
+ "fmin v10.8h, v10.8h, v28.8h\n"
+ "fmin v11.8h, v11.8h, v28.8h\n"
+ "fmin v12.8h, v12.8h, v28.8h\n"
+ "fmin v13.8h, v13.8h, v28.8h\n"
+ "fmin v14.8h, v14.8h, v28.8h\n"
+ "fmin v15.8h, v15.8h, v28.8h\n"
+ "fmin v16.8h, v16.8h, v28.8h\n"
+ "fmin v17.8h, v17.8h, v28.8h\n"
+ "fmin v18.8h, v18.8h, v28.8h\n"
+ "fmin v19.8h, v19.8h, v28.8h\n"
+ "fmin v20.8h, v20.8h, v28.8h\n"
+ "fmin v21.8h, v21.8h, v28.8h\n"
+ "fmin v22.8h, v22.8h, v28.8h\n"
+ "fmin v23.8h, v23.8h, v28.8h\n"
+ "fmin v24.8h, v24.8h, v28.8h\n"
+ "fmin v25.8h, v25.8h, v28.8h\n"
+ "fmin v26.8h, v26.8h, v28.8h\n"
+ "fmin v27.8h, v27.8h, v28.8h\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
- "ld1r { v0.8h }, [x19]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmin v12.8h, v12.8h, v0.8h\n"
- "fmin v13.8h, v13.8h, v0.8h\n"
- "fmin v14.8h, v14.8h, v0.8h\n"
- "fmin v15.8h, v15.8h, v0.8h\n"
- "fmin v16.8h, v16.8h, v0.8h\n"
- "fmin v17.8h, v17.8h, v0.8h\n"
- "fmax v8.8h, v8.8h, v1.8h\n"
- "fmax v9.8h, v9.8h, v1.8h\n"
- "fmax v10.8h, v10.8h, v1.8h\n"
- "fmax v11.8h, v11.8h, v1.8h\n"
- "fmax v12.8h, v12.8h, v1.8h\n"
- "fmax v13.8h, v13.8h, v1.8h\n"
- "fmax v14.8h, v14.8h, v1.8h\n"
- "fmax v15.8h, v15.8h, v1.8h\n"
- "fmax v16.8h, v16.8h, v1.8h\n"
- "fmax v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v0.8h\n"
- "fmin v19.8h, v19.8h, v0.8h\n"
- "fmin v20.8h, v20.8h, v0.8h\n"
- "fmin v21.8h, v21.8h, v0.8h\n"
- "fmin v22.8h, v22.8h, v0.8h\n"
- "fmin v23.8h, v23.8h, v0.8h\n"
- "fmin v24.8h, v24.8h, v0.8h\n"
- "fmin v25.8h, v25.8h, v0.8h\n"
- "fmin v26.8h, v26.8h, v0.8h\n"
- "fmin v27.8h, v27.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v1.8h\n"
- "fmax v19.8h, v19.8h, v1.8h\n"
- "fmax v20.8h, v20.8h, v1.8h\n"
- "fmax v21.8h, v21.8h, v1.8h\n"
- "fmax v22.8h, v22.8h, v1.8h\n"
- "fmax v23.8h, v23.8h, v1.8h\n"
- "fmax v24.8h, v24.8h, v1.8h\n"
- "fmax v25.8h, v25.8h, v1.8h\n"
- "fmax v26.8h, v26.8h, v1.8h\n"
- "fmax v27.8h, v27.8h, v1.8h\n"
+ "ld1r { v28.8h }, [x20]\n"
+ "fmax v8.8h, v8.8h, v28.8h\n"
+ "fmax v9.8h, v9.8h, v28.8h\n"
+ "fmax v10.8h, v10.8h, v28.8h\n"
+ "fmax v11.8h, v11.8h, v28.8h\n"
+ "fmax v12.8h, v12.8h, v28.8h\n"
+ "fmax v13.8h, v13.8h, v28.8h\n"
+ "fmax v14.8h, v14.8h, v28.8h\n"
+ "fmax v15.8h, v15.8h, v28.8h\n"
+ "fmax v16.8h, v16.8h, v28.8h\n"
+ "fmax v17.8h, v17.8h, v28.8h\n"
+ "fmax v18.8h, v18.8h, v28.8h\n"
+ "fmax v19.8h, v19.8h, v28.8h\n"
+ "fmax v20.8h, v20.8h, v28.8h\n"
+ "fmax v21.8h, v21.8h, v28.8h\n"
+ "fmax v22.8h, v22.8h, v28.8h\n"
+ "fmax v23.8h, v23.8h, v28.8h\n"
+ "fmax v24.8h, v24.8h, v28.8h\n"
+ "fmax v25.8h, v25.8h, v28.8h\n"
+ "fmax v26.8h, v26.8h, v28.8h\n"
+ "fmax v27.8h, v27.8h, v28.8h\n"
"227:" // Height 5: No activation
"cmp x8, #0x20\n"
"bge 244f\n"
"tbz x8, #4, 235f\n"
- "st1 { v8.8h }, [x15], #0x10\n"
- "st1 { v9.8h }, [x15], #0x10\n"
+ "st1 { v8.8h }, [x16], #0x10\n"
+ "st1 { v9.8h }, [x16], #0x10\n"
"st1 { v12.8h }, [x25], #0x10\n"
"st1 { v13.8h }, [x25], #0x10\n"
"st1 { v16.8h }, [x24], #0x10\n"
@@ -4130,25 +4127,25 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"st1 { v24.8h }, [x22], #0x10\n"
"st1 { v25.8h }, [x22], #0x10\n"
"tbz x8, #3, 231f\n"
- "st1 { v10.8h }, [x15], #0x10\n"
+ "st1 { v10.8h }, [x16], #0x10\n"
"st1 { v14.8h }, [x25], #0x10\n"
"st1 { v18.8h }, [x24], #0x10\n"
"st1 { v22.8h }, [x23], #0x10\n"
"st1 { v26.8h }, [x22], #0x10\n"
"tbz x8, #2, 229f\n"
- "str d11, [x15], #0x8\n"
+ "str d11, [x16], #0x8\n"
"str d15, [x25], #0x8\n"
"str d19, [x24], #0x8\n"
"str d23, [x23], #0x8\n"
"str d27, [x22], #0x8\n"
"tbz x8, #1, 228f\n"
- "st1 { v11.s }[2], [x15], #0x4\n"
+ "st1 { v11.s }[2], [x16], #0x4\n"
"st1 { v15.s }[2], [x25], #0x4\n"
"st1 { v19.s }[2], [x24], #0x4\n"
"st1 { v23.s }[2], [x23], #0x4\n"
"st1 { v27.s }[2], [x22], #0x4\n"
"tbz x8, #0, 243f\n"
- "st1 { v11.h }[6], [x15]\n"
+ "st1 { v11.h }[6], [x16]\n"
"st1 { v15.h }[6], [x25]\n"
"st1 { v19.h }[6], [x24]\n"
"st1 { v23.h }[6], [x23]\n"
@@ -4156,7 +4153,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"228:" // Height 5: Partial direct writeback: partial_1_28
"tbz x8, #0, 243f\n"
- "st1 { v11.h }[4], [x15]\n"
+ "st1 { v11.h }[4], [x16]\n"
"st1 { v15.h }[4], [x25]\n"
"st1 { v19.h }[4], [x24]\n"
"st1 { v23.h }[4], [x23]\n"
@@ -4164,13 +4161,13 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"229:" // Height 5: Partial direct writeback: partial_2_24
"tbz x8, #1, 230f\n"
- "str s11, [x15], #0x4\n"
+ "str s11, [x16], #0x4\n"
"str s15, [x25], #0x4\n"
"str s19, [x24], #0x4\n"
"str s23, [x23], #0x4\n"
"str s27, [x22], #0x4\n"
"tbz x8, #0, 243f\n"
- "st1 { v11.h }[2], [x15]\n"
+ "st1 { v11.h }[2], [x16]\n"
"st1 { v15.h }[2], [x25]\n"
"st1 { v19.h }[2], [x24]\n"
"st1 { v23.h }[2], [x23]\n"
@@ -4178,7 +4175,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"230:" // Height 5: Partial direct writeback: partial_1_24
"tbz x8, #0, 243f\n"
- "str h11, [x15, #0x0]\n"
+ "str h11, [x16, #0x0]\n"
"str h15, [x25, #0x0]\n"
"str h19, [x24, #0x0]\n"
"str h23, [x23, #0x0]\n"
@@ -4186,19 +4183,19 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"231:" // Height 5: Partial direct writeback: partial_4_16
"tbz x8, #2, 233f\n"
- "str d10, [x15], #0x8\n"
+ "str d10, [x16], #0x8\n"
"str d14, [x25], #0x8\n"
"str d18, [x24], #0x8\n"
"str d22, [x23], #0x8\n"
"str d26, [x22], #0x8\n"
"tbz x8, #1, 232f\n"
- "st1 { v10.s }[2], [x15], #0x4\n"
+ "st1 { v10.s }[2], [x16], #0x4\n"
"st1 { v14.s }[2], [x25], #0x4\n"
"st1 { v18.s }[2], [x24], #0x4\n"
"st1 { v22.s }[2], [x23], #0x4\n"
"st1 { v26.s }[2], [x22], #0x4\n"
"tbz x8, #0, 243f\n"
- "st1 { v10.h }[6], [x15]\n"
+ "st1 { v10.h }[6], [x16]\n"
"st1 { v14.h }[6], [x25]\n"
"st1 { v18.h }[6], [x24]\n"
"st1 { v22.h }[6], [x23]\n"
@@ -4206,7 +4203,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"232:" // Height 5: Partial direct writeback: partial_1_20
"tbz x8, #0, 243f\n"
- "st1 { v10.h }[4], [x15]\n"
+ "st1 { v10.h }[4], [x16]\n"
"st1 { v14.h }[4], [x25]\n"
"st1 { v18.h }[4], [x24]\n"
"st1 { v22.h }[4], [x23]\n"
@@ -4214,13 +4211,13 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"233:" // Height 5: Partial direct writeback: partial_2_16
"tbz x8, #1, 234f\n"
- "str s10, [x15], #0x4\n"
+ "str s10, [x16], #0x4\n"
"str s14, [x25], #0x4\n"
"str s18, [x24], #0x4\n"
"str s22, [x23], #0x4\n"
"str s26, [x22], #0x4\n"
"tbz x8, #0, 243f\n"
- "st1 { v10.h }[2], [x15]\n"
+ "st1 { v10.h }[2], [x16]\n"
"st1 { v14.h }[2], [x25]\n"
"st1 { v18.h }[2], [x24]\n"
"st1 { v22.h }[2], [x23]\n"
@@ -4228,7 +4225,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"234:" // Height 5: Partial direct writeback: partial_1_16
"tbz x8, #0, 243f\n"
- "str h10, [x15, #0x0]\n"
+ "str h10, [x16, #0x0]\n"
"str h14, [x25, #0x0]\n"
"str h18, [x24, #0x0]\n"
"str h22, [x23, #0x0]\n"
@@ -4236,25 +4233,25 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"235:" // Height 5: Partial direct writeback: partial_8_0
"tbz x8, #3, 239f\n"
- "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v8.8h }, [x16], #0x10\n"
"st1 { v12.8h }, [x25], #0x10\n"
"st1 { v16.8h }, [x24], #0x10\n"
"st1 { v20.8h }, [x23], #0x10\n"
"st1 { v24.8h }, [x22], #0x10\n"
"tbz x8, #2, 237f\n"
- "str d9, [x15], #0x8\n"
+ "str d9, [x16], #0x8\n"
"str d13, [x25], #0x8\n"
"str d17, [x24], #0x8\n"
"str d21, [x23], #0x8\n"
"str d25, [x22], #0x8\n"
"tbz x8, #1, 236f\n"
- "st1 { v9.s }[2], [x15], #0x4\n"
+ "st1 { v9.s }[2], [x16], #0x4\n"
"st1 { v13.s }[2], [x25], #0x4\n"
"st1 { v17.s }[2], [x24], #0x4\n"
"st1 { v21.s }[2], [x23], #0x4\n"
"st1 { v25.s }[2], [x22], #0x4\n"
"tbz x8, #0, 243f\n"
- "st1 { v9.h }[6], [x15]\n"
+ "st1 { v9.h }[6], [x16]\n"
"st1 { v13.h }[6], [x25]\n"
"st1 { v17.h }[6], [x24]\n"
"st1 { v21.h }[6], [x23]\n"
@@ -4262,7 +4259,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"236:" // Height 5: Partial direct writeback: partial_1_12
"tbz x8, #0, 243f\n"
- "st1 { v9.h }[4], [x15]\n"
+ "st1 { v9.h }[4], [x16]\n"
"st1 { v13.h }[4], [x25]\n"
"st1 { v17.h }[4], [x24]\n"
"st1 { v21.h }[4], [x23]\n"
@@ -4270,13 +4267,13 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"237:" // Height 5: Partial direct writeback: partial_2_8
"tbz x8, #1, 238f\n"
- "str s9, [x15], #0x4\n"
+ "str s9, [x16], #0x4\n"
"str s13, [x25], #0x4\n"
"str s17, [x24], #0x4\n"
"str s21, [x23], #0x4\n"
"str s25, [x22], #0x4\n"
"tbz x8, #0, 243f\n"
- "st1 { v9.h }[2], [x15]\n"
+ "st1 { v9.h }[2], [x16]\n"
"st1 { v13.h }[2], [x25]\n"
"st1 { v17.h }[2], [x24]\n"
"st1 { v21.h }[2], [x23]\n"
@@ -4284,7 +4281,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"238:" // Height 5: Partial direct writeback: partial_1_8
"tbz x8, #0, 243f\n"
- "str h9, [x15, #0x0]\n"
+ "str h9, [x16, #0x0]\n"
"str h13, [x25, #0x0]\n"
"str h17, [x24, #0x0]\n"
"str h21, [x23, #0x0]\n"
@@ -4292,19 +4289,19 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"239:" // Height 5: Partial direct writeback: partial_4_0
"tbz x8, #2, 241f\n"
- "str d8, [x15], #0x8\n"
+ "str d8, [x16], #0x8\n"
"str d12, [x25], #0x8\n"
"str d16, [x24], #0x8\n"
"str d20, [x23], #0x8\n"
"str d24, [x22], #0x8\n"
"tbz x8, #1, 240f\n"
- "st1 { v8.s }[2], [x15], #0x4\n"
+ "st1 { v8.s }[2], [x16], #0x4\n"
"st1 { v12.s }[2], [x25], #0x4\n"
"st1 { v16.s }[2], [x24], #0x4\n"
"st1 { v20.s }[2], [x23], #0x4\n"
"st1 { v24.s }[2], [x22], #0x4\n"
"tbz x8, #0, 243f\n"
- "st1 { v8.h }[6], [x15]\n"
+ "st1 { v8.h }[6], [x16]\n"
"st1 { v12.h }[6], [x25]\n"
"st1 { v16.h }[6], [x24]\n"
"st1 { v20.h }[6], [x23]\n"
@@ -4312,7 +4309,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"240:" // Height 5: Partial direct writeback: partial_1_4
"tbz x8, #0, 243f\n"
- "st1 { v8.h }[4], [x15]\n"
+ "st1 { v8.h }[4], [x16]\n"
"st1 { v12.h }[4], [x25]\n"
"st1 { v16.h }[4], [x24]\n"
"st1 { v20.h }[4], [x23]\n"
@@ -4320,20 +4317,20 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 243f\n"
"241:" // Height 5: Partial direct writeback: partial_2_0
"tbz x8, #1, 242f\n"
- "str s8, [x15], #0x4\n"
+ "str s8, [x16], #0x4\n"
"str s12, [x25], #0x4\n"
"str s16, [x24], #0x4\n"
"str s20, [x23], #0x4\n"
"str s24, [x22], #0x4\n"
"tbz x8, #0, 243f\n"
- "st1 { v8.h }[2], [x15]\n"
+ "st1 { v8.h }[2], [x16]\n"
"st1 { v12.h }[2], [x25]\n"
"st1 { v16.h }[2], [x24]\n"
"st1 { v20.h }[2], [x23]\n"
"st1 { v24.h }[2], [x22]\n"
"b 243f\n"
"242:" // Height 5: Partial direct writeback: partial_1_0
- "str h8, [x15, #0x0]\n"
+ "str h8, [x16, #0x0]\n"
"str h12, [x25, #0x0]\n"
"str h16, [x24, #0x0]\n"
"str h20, [x23, #0x0]\n"
@@ -4341,11 +4338,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"243:" // Height 5: Partial direct writeback: Done
"b 245f\n"
"244:" // Height 5: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x16, #0x0]\n"
+ "str q9, [x16, #0x10]\n"
+ "str q10, [x16, #0x20]\n"
+ "str q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
"str q12, [x25, #0x0]\n"
"str q13, [x25, #0x10]\n"
"str q14, [x25, #0x20]\n"
@@ -4367,88 +4364,88 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"bgt 198b\n"
"b 296f\n"
"246:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0xc\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x16, %x[bias]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[output_ptr]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x19, #0xc\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "mov x16, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"247:" // Height 6: Column loop
- "cbz x16, 248f\n"
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
+ "cbz x7, 248f\n"
+ "ldr q8, [x7, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
"mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
"mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
"mov v28.16b, v8.16b\n"
"mov v29.16b, v9.16b\n"
"mov v30.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
- "mov v15.16b, v11.16b\n"
- "mov v19.16b, v11.16b\n"
- "mov v23.16b, v11.16b\n"
- "mov v27.16b, v11.16b\n"
"mov v31.16b, v11.16b\n"
"b 267f\n"
"248:" // Height 6: no bias
"tbz %x[flags], #0, 266f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"cmp x8, #0x20\n"
- "add x25, x15, x19, LSL #1\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
- "add x21, x22, x19, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"bge 265f\n"
"tbz x8, #4, 256f\n"
- "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v8.8h }, [x16], #0x10\n"
"ld1 { v12.8h }, [x25], #0x10\n"
"ld1 { v16.8h }, [x24], #0x10\n"
- "ld1 { v9.8h }, [x15], #0x10\n"
- "ld1 { v13.8h }, [x25], #0x10\n"
- "ld1 { v17.8h }, [x24], #0x10\n"
"ld1 { v20.8h }, [x23], #0x10\n"
"ld1 { v24.8h }, [x22], #0x10\n"
"ld1 { v28.8h }, [x21], #0x10\n"
+ "ld1 { v9.8h }, [x16], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
"ld1 { v21.8h }, [x23], #0x10\n"
"ld1 { v25.8h }, [x22], #0x10\n"
"ld1 { v29.8h }, [x21], #0x10\n"
"tbz x8, #3, 252f\n"
- "ld1 { v10.8h }, [x15], #0x10\n"
+ "ld1 { v10.8h }, [x16], #0x10\n"
"ld1 { v14.8h }, [x25], #0x10\n"
"ld1 { v18.8h }, [x24], #0x10\n"
"ld1 { v22.8h }, [x23], #0x10\n"
"ld1 { v26.8h }, [x22], #0x10\n"
"ld1 { v30.8h }, [x21], #0x10\n"
"tbz x8, #2, 250f\n"
- "ldr d11, [x15], #0x8\n"
+ "ldr d11, [x16], #0x8\n"
"ldr d15, [x25], #0x8\n"
"ldr d19, [x24], #0x8\n"
"ldr d23, [x23], #0x8\n"
"ldr d27, [x22], #0x8\n"
"ldr d31, [x21], #0x8\n"
"tbz x8, #1, 249f\n"
- "ld1 { v11.s }[2], [x15], #0x4\n"
- "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x16], #0x4\n"
+ "mov x20, #0x3c\n"
"ld1 { v15.s }[2], [x25], #0x4\n"
"ld1 { v19.s }[2], [x24], #0x4\n"
"ld1 { v23.s }[2], [x23], #0x4\n"
"ld1 { v27.s }[2], [x22], #0x4\n"
"ld1 { v31.s }[2], [x21], #0x4\n"
"tbz x8, #0, 264f\n"
- "ld1 { v11.h }[6], [x15]\n"
+ "ld1 { v11.h }[6], [x16]\n"
"ld1 { v15.h }[6], [x25]\n"
"ld1 { v19.h }[6], [x24]\n"
"ld1 { v23.h }[6], [x23]\n"
@@ -4456,9 +4453,9 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ld1 { v31.h }[6], [x21]\n"
"b 264f\n"
"249:" // Height 6: Partial accumulate: partial_1_28
- "mov x19, #0x38\n"
+ "mov x20, #0x38\n"
"tbz x8, #0, 264f\n"
- "ld1 { v11.h }[4], [x15]\n"
+ "ld1 { v11.h }[4], [x16]\n"
"ld1 { v15.h }[4], [x25]\n"
"ld1 { v19.h }[4], [x24]\n"
"ld1 { v23.h }[4], [x23]\n"
@@ -4467,15 +4464,15 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 264f\n"
"250:" // Height 6: Partial accumulate: partial_2_24
"tbz x8, #1, 251f\n"
- "ldr s11, [x15], #0x4\n"
+ "ldr s11, [x16], #0x4\n"
+ "mov x20, #0x34\n"
"ldr s15, [x25], #0x4\n"
- "mov x19, #0x34\n"
"ldr s19, [x24], #0x4\n"
"ldr s23, [x23], #0x4\n"
"ldr s27, [x22], #0x4\n"
"ldr s31, [x21], #0x4\n"
"tbz x8, #0, 264f\n"
- "ld1 { v11.h }[2], [x15]\n"
+ "ld1 { v11.h }[2], [x16]\n"
"ld1 { v15.h }[2], [x25]\n"
"ld1 { v19.h }[2], [x24]\n"
"ld1 { v23.h }[2], [x23]\n"
@@ -4483,9 +4480,9 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ld1 { v31.h }[2], [x21]\n"
"b 264f\n"
"251:" // Height 6: Partial accumulate: partial_1_24
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x8, #0, 264f\n"
- "ldr h11, [x15, #0x0]\n"
+ "ldr h11, [x16, #0x0]\n"
"ldr h15, [x25, #0x0]\n"
"ldr h19, [x24, #0x0]\n"
"ldr h23, [x23, #0x0]\n"
@@ -4494,22 +4491,22 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 264f\n"
"252:" // Height 6: Partial accumulate: partial_4_16
"tbz x8, #2, 254f\n"
- "ldr d10, [x15], #0x8\n"
+ "ldr d10, [x16], #0x8\n"
"ldr d14, [x25], #0x8\n"
"ldr d18, [x24], #0x8\n"
"ldr d22, [x23], #0x8\n"
"ldr d26, [x22], #0x8\n"
"ldr d30, [x21], #0x8\n"
"tbz x8, #1, 253f\n"
- "ld1 { v10.s }[2], [x15], #0x4\n"
- "mov x19, #0x2c\n"
+ "ld1 { v10.s }[2], [x16], #0x4\n"
+ "mov x20, #0x2c\n"
"ld1 { v14.s }[2], [x25], #0x4\n"
"ld1 { v18.s }[2], [x24], #0x4\n"
"ld1 { v22.s }[2], [x23], #0x4\n"
"ld1 { v26.s }[2], [x22], #0x4\n"
"ld1 { v30.s }[2], [x21], #0x4\n"
"tbz x8, #0, 264f\n"
- "ld1 { v10.h }[6], [x15]\n"
+ "ld1 { v10.h }[6], [x16]\n"
"ld1 { v14.h }[6], [x25]\n"
"ld1 { v18.h }[6], [x24]\n"
"ld1 { v22.h }[6], [x23]\n"
@@ -4517,9 +4514,9 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ld1 { v30.h }[6], [x21]\n"
"b 264f\n"
"253:" // Height 6: Partial accumulate: partial_1_20
- "mov x19, #0x28\n"
+ "mov x20, #0x28\n"
"tbz x8, #0, 264f\n"
- "ld1 { v10.h }[4], [x15]\n"
+ "ld1 { v10.h }[4], [x16]\n"
"ld1 { v14.h }[4], [x25]\n"
"ld1 { v18.h }[4], [x24]\n"
"ld1 { v22.h }[4], [x23]\n"
@@ -4528,15 +4525,15 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 264f\n"
"254:" // Height 6: Partial accumulate: partial_2_16
"tbz x8, #1, 255f\n"
- "ldr s10, [x15], #0x4\n"
+ "ldr s10, [x16], #0x4\n"
+ "mov x20, #0x24\n"
"ldr s14, [x25], #0x4\n"
- "mov x19, #0x24\n"
"ldr s18, [x24], #0x4\n"
"ldr s22, [x23], #0x4\n"
"ldr s26, [x22], #0x4\n"
"ldr s30, [x21], #0x4\n"
"tbz x8, #0, 264f\n"
- "ld1 { v10.h }[2], [x15]\n"
+ "ld1 { v10.h }[2], [x16]\n"
"ld1 { v14.h }[2], [x25]\n"
"ld1 { v18.h }[2], [x24]\n"
"ld1 { v22.h }[2], [x23]\n"
@@ -4544,9 +4541,9 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ld1 { v30.h }[2], [x21]\n"
"b 264f\n"
"255:" // Height 6: Partial accumulate: partial_1_16
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x8, #0, 264f\n"
- "ldr h10, [x15, #0x0]\n"
+ "ldr h10, [x16, #0x0]\n"
"ldr h14, [x25, #0x0]\n"
"ldr h18, [x24, #0x0]\n"
"ldr h22, [x23, #0x0]\n"
@@ -4555,29 +4552,29 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 264f\n"
"256:" // Height 6: Partial accumulate: partial_8_0
"tbz x8, #3, 260f\n"
- "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v8.8h }, [x16], #0x10\n"
"ld1 { v12.8h }, [x25], #0x10\n"
"ld1 { v16.8h }, [x24], #0x10\n"
"ld1 { v20.8h }, [x23], #0x10\n"
"ld1 { v24.8h }, [x22], #0x10\n"
"ld1 { v28.8h }, [x21], #0x10\n"
"tbz x8, #2, 258f\n"
- "ldr d9, [x15], #0x8\n"
+ "ldr d9, [x16], #0x8\n"
"ldr d13, [x25], #0x8\n"
"ldr d17, [x24], #0x8\n"
"ldr d21, [x23], #0x8\n"
"ldr d25, [x22], #0x8\n"
"ldr d29, [x21], #0x8\n"
"tbz x8, #1, 257f\n"
- "ld1 { v9.s }[2], [x15], #0x4\n"
- "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x16], #0x4\n"
+ "mov x20, #0x1c\n"
"ld1 { v13.s }[2], [x25], #0x4\n"
"ld1 { v17.s }[2], [x24], #0x4\n"
"ld1 { v21.s }[2], [x23], #0x4\n"
"ld1 { v25.s }[2], [x22], #0x4\n"
"ld1 { v29.s }[2], [x21], #0x4\n"
"tbz x8, #0, 264f\n"
- "ld1 { v9.h }[6], [x15]\n"
+ "ld1 { v9.h }[6], [x16]\n"
"ld1 { v13.h }[6], [x25]\n"
"ld1 { v17.h }[6], [x24]\n"
"ld1 { v21.h }[6], [x23]\n"
@@ -4585,9 +4582,9 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ld1 { v29.h }[6], [x21]\n"
"b 264f\n"
"257:" // Height 6: Partial accumulate: partial_1_12
- "mov x19, #0x18\n"
+ "mov x20, #0x18\n"
"tbz x8, #0, 264f\n"
- "ld1 { v9.h }[4], [x15]\n"
+ "ld1 { v9.h }[4], [x16]\n"
"ld1 { v13.h }[4], [x25]\n"
"ld1 { v17.h }[4], [x24]\n"
"ld1 { v21.h }[4], [x23]\n"
@@ -4596,15 +4593,15 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 264f\n"
"258:" // Height 6: Partial accumulate: partial_2_8
"tbz x8, #1, 259f\n"
- "ldr s9, [x15], #0x4\n"
+ "ldr s9, [x16], #0x4\n"
+ "mov x20, #0x14\n"
"ldr s13, [x25], #0x4\n"
- "mov x19, #0x14\n"
"ldr s17, [x24], #0x4\n"
"ldr s21, [x23], #0x4\n"
"ldr s25, [x22], #0x4\n"
"ldr s29, [x21], #0x4\n"
"tbz x8, #0, 264f\n"
- "ld1 { v9.h }[2], [x15]\n"
+ "ld1 { v9.h }[2], [x16]\n"
"ld1 { v13.h }[2], [x25]\n"
"ld1 { v17.h }[2], [x24]\n"
"ld1 { v21.h }[2], [x23]\n"
@@ -4612,9 +4609,9 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ld1 { v29.h }[2], [x21]\n"
"b 264f\n"
"259:" // Height 6: Partial accumulate: partial_1_8
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x8, #0, 264f\n"
- "ldr h9, [x15, #0x0]\n"
+ "ldr h9, [x16, #0x0]\n"
"ldr h13, [x25, #0x0]\n"
"ldr h17, [x24, #0x0]\n"
"ldr h21, [x23, #0x0]\n"
@@ -4623,22 +4620,22 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 264f\n"
"260:" // Height 6: Partial accumulate: partial_4_0
"tbz x8, #2, 262f\n"
- "ldr d8, [x15], #0x8\n"
+ "ldr d8, [x16], #0x8\n"
"ldr d12, [x25], #0x8\n"
"ldr d16, [x24], #0x8\n"
"ldr d20, [x23], #0x8\n"
"ldr d24, [x22], #0x8\n"
"ldr d28, [x21], #0x8\n"
"tbz x8, #1, 261f\n"
- "ld1 { v8.s }[2], [x15], #0x4\n"
- "mov x19, #0xc\n"
+ "ld1 { v8.s }[2], [x16], #0x4\n"
+ "mov x20, #0xc\n"
"ld1 { v12.s }[2], [x25], #0x4\n"
"ld1 { v16.s }[2], [x24], #0x4\n"
"ld1 { v20.s }[2], [x23], #0x4\n"
"ld1 { v24.s }[2], [x22], #0x4\n"
"ld1 { v28.s }[2], [x21], #0x4\n"
"tbz x8, #0, 264f\n"
- "ld1 { v8.h }[6], [x15]\n"
+ "ld1 { v8.h }[6], [x16]\n"
"ld1 { v12.h }[6], [x25]\n"
"ld1 { v16.h }[6], [x24]\n"
"ld1 { v20.h }[6], [x23]\n"
@@ -4646,9 +4643,9 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ld1 { v28.h }[6], [x21]\n"
"b 264f\n"
"261:" // Height 6: Partial accumulate: partial_1_4
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"tbz x8, #0, 264f\n"
- "ld1 { v8.h }[4], [x15]\n"
+ "ld1 { v8.h }[4], [x16]\n"
"ld1 { v12.h }[4], [x25]\n"
"ld1 { v16.h }[4], [x24]\n"
"ld1 { v20.h }[4], [x23]\n"
@@ -4657,15 +4654,15 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 264f\n"
"262:" // Height 6: Partial accumulate: partial_2_0
"tbz x8, #1, 263f\n"
- "ldr s8, [x15], #0x4\n"
+ "ldr s8, [x16], #0x4\n"
+ "mov x20, #0x4\n"
"ldr s12, [x25], #0x4\n"
- "mov x19, #0x4\n"
"ldr s16, [x24], #0x4\n"
"ldr s20, [x23], #0x4\n"
"ldr s24, [x22], #0x4\n"
"ldr s28, [x21], #0x4\n"
"tbz x8, #0, 264f\n"
- "ld1 { v8.h }[2], [x15]\n"
+ "ld1 { v8.h }[2], [x16]\n"
"ld1 { v12.h }[2], [x25]\n"
"ld1 { v16.h }[2], [x24]\n"
"ld1 { v20.h }[2], [x23]\n"
@@ -4673,21 +4670,21 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ld1 { v28.h }[2], [x21]\n"
"b 264f\n"
"263:" // Height 6: Partial accumulate: partial_1_0
- "ldr h8, [x15, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr h8, [x16, #0x0]\n"
+ "mov x20, #0x0\n"
"ldr h12, [x25, #0x0]\n"
"ldr h16, [x24, #0x0]\n"
"ldr h20, [x23, #0x0]\n"
"ldr h24, [x22, #0x0]\n"
"ldr h28, [x21, #0x0]\n"
"264:" // Height 6: Partial accumulate: Done
- "sub x15, x15, x19\n"
+ "sub x16, x16, x20\n"
"b 267f\n"
"265:" // Height 6: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
"ldr q12, [x25, #0x0]\n"
"ldr q13, [x25, #0x10]\n"
"ldr q14, [x25, #0x20]\n"
@@ -4735,404 +4732,404 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"movi v30.16b, #0x0\n"
"movi v31.16b, #0x0\n"
"267:" // Height 6: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"268:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 269f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x14, 270f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
- "add x20, x20, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
+ "ldr x28, [x20, #0x28]\n"
+ "cbnz x15, 270f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n"
+ "add x11, x11, x20, LSL #1\n"
+ "add x10, x10, x20, LSL #1\n"
+ "add x9, x9, x20, LSL #1\n"
+ "add x28, x28, x20, LSL #1\n"
"b 270f\n"
"269:" // Height 6: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "add x22, x24, x19, LSL #1\n"
- "add x20, x22, x19, LSL #1\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21, LSL #1\n"
+ "add x11, x12, x21, LSL #1\n"
+ "add x10, x11, x21, LSL #1\n"
+ "add x9, x10, x21, LSL #1\n"
+ "add x28, x9, x21, LSL #1\n"
"270:" // Height 6: input setup done
- "cmp x13, #0x8\n"
+ "cmp x14, #0x8\n"
"blt 273f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x10\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x10\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
"blt 272f\n"
"271:" // Height 6: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x18]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr x10, [x17, #0x28]\n"
+ "add x13, x13, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
"add x12, x12, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "mov v7.d[1], x11\n"
+ "add x11, x11, #0x10\n"
"fmla v28.8h, v6.8h, v5.h[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
"ldr d6, [x17, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr x9, [x12, #0x8]\n"
+ "add x10, x10, #0x10\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "mov v6.d[1], x10\n"
+ "add x9, x9, #0x10\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr x10, [x17, #0x48]\n"
+ "add x28, x28, #0x10\n"
"fmla v29.8h, v7.8h, v5.h[0]\n"
"ldr d7, [x17, #0x30]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "add x28, x28, #0x10\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr x20, [x17, #0x58]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
- "mov v7.d[1], x11\n"
+ "ldr x27, [x13, #0x8]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr x11, [x17, #0x58]\n"
+ "ldr x26, [x12, #0x8]\n"
"fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr x27, [x28, #0x8]\n"
+ "ldr x25, [x11, #0x8]\n"
"fmla v30.8h, v6.8h, v5.h[0]\n"
"ldr d6, [x17, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
- "add x26, x26, #0x10\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v19.8h, v7.8h, v2.h[0]\n"
- "mov v6.d[1], x10\n"
+ "ldr x24, [x10, #0x8]\n"
"fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr x10, [x17, #0x68]\n"
+ "ldr x23, [x9, #0x8]\n"
"fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr x25, [x26, #0x8]\n"
+ "ldr x22, [x28, #0x8]\n"
"fmla v31.8h, v7.8h, v5.h[0]\n"
"ldr d7, [x17, #0x50]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
- "add x24, x24, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr x20, [x17, #0x78]\n"
"fmla v16.8h, v6.8h, v2.h[1]\n"
- "mov v7.d[1], x11\n"
+ "sub x14, x14, #0x8\n"
"fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr x11, [x17, #0x78]\n"
+ "cmp x14, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[1]\n"
- "ldr x23, [x24, #0x8]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v28.8h, v6.8h, v5.h[1]\n"
"ldr d6, [x17, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
- "add x22, x22, #0x10\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr x21, [x17, #0x88]\n"
"fmla v17.8h, v7.8h, v2.h[1]\n"
- "mov v6.d[1], x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[1]\n"
- "ldr x10, [x17, #0x88]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v25.8h, v7.8h, v4.h[1]\n"
- "ldr x21, [x22, #0x8]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v29.8h, v7.8h, v5.h[1]\n"
"ldr d7, [x17, #0x70]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
- "add x20, x20, #0x10\n"
"fmla v14.8h, v6.8h, v1.h[1]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr x20, [x17, #0x98]\n"
"fmla v18.8h, v6.8h, v2.h[1]\n"
- "mov v7.d[1], x11\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr x11, [x17, #0x98]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v26.8h, v6.8h, v4.h[1]\n"
- "ldr x19, [x20, #0x8]\n"
"fmla v30.8h, v6.8h, v5.h[1]\n"
"ldr d6, [x17, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
- "sub x13, x13, #0x8\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[1]\n"
- "cmp x13, #0x10\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v19.8h, v7.8h, v2.h[1]\n"
- "mov v6.d[1], x10\n"
"fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr x10, [x17, #0xa8]\n"
"fmla v27.8h, v7.8h, v4.h[1]\n"
"fmla v31.8h, v7.8h, v5.h[1]\n"
"ldr d7, [x17, #0x90]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
"fmla v12.8h, v6.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
"fmla v16.8h, v6.8h, v2.h[2]\n"
- "mov v7.d[1], x11\n"
"fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr x11, [x17, #0xb8]\n"
"fmla v24.8h, v6.8h, v4.h[2]\n"
"fmla v28.8h, v6.8h, v5.h[2]\n"
"ldr d6, [x17, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
"fmla v17.8h, v7.8h, v2.h[2]\n"
- "mov v6.d[1], x10\n"
"fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr x10, [x17, #0xc8]\n"
"fmla v25.8h, v7.8h, v4.h[2]\n"
"fmla v29.8h, v7.8h, v5.h[2]\n"
"ldr d7, [x17, #0xb0]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
"fmla v14.8h, v6.8h, v1.h[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
"fmla v18.8h, v6.8h, v2.h[2]\n"
- "mov v7.d[1], x11\n"
"fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr x11, [x17, #0xd8]\n"
"fmla v26.8h, v6.8h, v4.h[2]\n"
"fmla v30.8h, v6.8h, v5.h[2]\n"
"ldr d6, [x17, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v19.8h, v7.8h, v2.h[2]\n"
- "mov v6.d[1], x10\n"
"fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr x10, [x17, #0xe8]\n"
"fmla v27.8h, v7.8h, v4.h[2]\n"
"fmla v31.8h, v7.8h, v5.h[2]\n"
"ldr d7, [x17, #0xd0]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
"fmla v12.8h, v6.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
"fmla v16.8h, v6.8h, v2.h[3]\n"
- "mov v7.d[1], x11\n"
"fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr x11, [x17, #0xf8]\n"
"fmla v24.8h, v6.8h, v4.h[3]\n"
"fmla v28.8h, v6.8h, v5.h[3]\n"
"ldr d6, [x17, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x108]\n"
"fmla v17.8h, v7.8h, v2.h[3]\n"
- "mov v6.d[1], x10\n"
"fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr x10, [x17, #0x108]\n"
"fmla v25.8h, v7.8h, v4.h[3]\n"
"fmla v29.8h, v7.8h, v5.h[3]\n"
"ldr d7, [x17, #0xf0]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
"fmla v14.8h, v6.8h, v1.h[3]\n"
+ "ldr x20, [x17, #0x118]\n"
"fmla v18.8h, v6.8h, v2.h[3]\n"
- "mov v7.d[1], x11\n"
"fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr x11, [x17, #0x118]\n"
"fmla v26.8h, v6.8h, v4.h[3]\n"
"fmla v30.8h, v6.8h, v5.h[3]\n"
"ldr d6, [x17, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[3]\n"
+ "ldr x21, [x17, #0x128]\n"
"fmla v19.8h, v7.8h, v2.h[3]\n"
- "mov v6.d[1], x10\n"
"fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr x10, [x17, #0x128]\n"
"fmla v27.8h, v7.8h, v4.h[3]\n"
"fmla v31.8h, v7.8h, v5.h[3]\n"
"ldr d7, [x17, #0x110]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
"fmla v12.8h, v6.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x138]\n"
"fmla v16.8h, v6.8h, v2.h[4]\n"
- "mov v7.d[1], x11\n"
"fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr x11, [x17, #0x138]\n"
"fmla v24.8h, v6.8h, v4.h[4]\n"
"fmla v28.8h, v6.8h, v5.h[4]\n"
"ldr d6, [x17, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x148]\n"
"fmla v17.8h, v7.8h, v2.h[4]\n"
- "mov v6.d[1], x10\n"
"fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr x10, [x17, #0x148]\n"
"fmla v25.8h, v7.8h, v4.h[4]\n"
"fmla v29.8h, v7.8h, v5.h[4]\n"
"ldr d7, [x17, #0x130]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
"fmla v14.8h, v6.8h, v1.h[4]\n"
+ "ldr x20, [x17, #0x158]\n"
"fmla v18.8h, v6.8h, v2.h[4]\n"
- "mov v7.d[1], x11\n"
"fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr x11, [x17, #0x158]\n"
"fmla v26.8h, v6.8h, v4.h[4]\n"
"fmla v30.8h, v6.8h, v5.h[4]\n"
"ldr d6, [x17, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[4]\n"
+ "ldr x21, [x17, #0x168]\n"
"fmla v19.8h, v7.8h, v2.h[4]\n"
- "mov v6.d[1], x10\n"
"fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr x10, [x17, #0x168]\n"
"fmla v27.8h, v7.8h, v4.h[4]\n"
"fmla v31.8h, v7.8h, v5.h[4]\n"
"ldr d7, [x17, #0x150]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
"fmla v12.8h, v6.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x178]\n"
"fmla v16.8h, v6.8h, v2.h[5]\n"
- "mov v7.d[1], x11\n"
"fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr x11, [x17, #0x178]\n"
"fmla v24.8h, v6.8h, v4.h[5]\n"
"fmla v28.8h, v6.8h, v5.h[5]\n"
"ldr d6, [x17, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x188]\n"
"fmla v17.8h, v7.8h, v2.h[5]\n"
- "mov v6.d[1], x10\n"
"fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr x10, [x17, #0x188]\n"
"fmla v25.8h, v7.8h, v4.h[5]\n"
"fmla v29.8h, v7.8h, v5.h[5]\n"
"ldr d7, [x17, #0x170]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
"fmla v14.8h, v6.8h, v1.h[5]\n"
+ "ldr x20, [x17, #0x198]\n"
"fmla v18.8h, v6.8h, v2.h[5]\n"
- "mov v7.d[1], x11\n"
"fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr x11, [x17, #0x198]\n"
"fmla v26.8h, v6.8h, v4.h[5]\n"
"fmla v30.8h, v6.8h, v5.h[5]\n"
"ldr d6, [x17, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[5]\n"
+ "ldr x21, [x17, #0x1a8]\n"
"fmla v19.8h, v7.8h, v2.h[5]\n"
- "mov v6.d[1], x10\n"
"fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr x10, [x17, #0x1a8]\n"
"fmla v27.8h, v7.8h, v4.h[5]\n"
"fmla v31.8h, v7.8h, v5.h[5]\n"
"ldr d7, [x17, #0x190]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
"fmla v12.8h, v6.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1b8]\n"
"fmla v16.8h, v6.8h, v2.h[6]\n"
- "mov v7.d[1], x11\n"
"fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr x11, [x17, #0x1b8]\n"
"fmla v24.8h, v6.8h, v4.h[6]\n"
"fmla v28.8h, v6.8h, v5.h[6]\n"
"ldr d6, [x17, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1c8]\n"
"fmla v17.8h, v7.8h, v2.h[6]\n"
- "mov v6.d[1], x10\n"
"fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr x10, [x17, #0x1c8]\n"
"fmla v25.8h, v7.8h, v4.h[6]\n"
"fmla v29.8h, v7.8h, v5.h[6]\n"
"ldr d7, [x17, #0x1b0]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
"fmla v14.8h, v6.8h, v1.h[6]\n"
+ "ldr x20, [x17, #0x1d8]\n"
"fmla v18.8h, v6.8h, v2.h[6]\n"
- "mov v7.d[1], x11\n"
"fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr x11, [x17, #0x1d8]\n"
"fmla v26.8h, v6.8h, v4.h[6]\n"
"fmla v30.8h, v6.8h, v5.h[6]\n"
"ldr d6, [x17, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.8h, v7.8h, v1.h[6]\n"
+ "ldr x21, [x17, #0x1e8]\n"
"fmla v19.8h, v7.8h, v2.h[6]\n"
- "mov v6.d[1], x10\n"
"fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr x10, [x17, #0x1e8]\n"
"fmla v27.8h, v7.8h, v4.h[6]\n"
"fmla v31.8h, v7.8h, v5.h[6]\n"
"ldr d7, [x17, #0x1d0]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
"fmla v12.8h, v6.8h, v1.h[7]\n"
+ "ldr x20, [x17, #0x1f8]\n"
"fmla v16.8h, v6.8h, v2.h[7]\n"
- "mov v7.d[1], x11\n"
"fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr x11, [x17, #0x1f8]\n"
"fmla v24.8h, v6.8h, v4.h[7]\n"
"fmla v28.8h, v6.8h, v5.h[7]\n"
"ldr d6, [x17, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.8h, v7.8h, v1.h[7]\n"
"fmla v17.8h, v7.8h, v2.h[7]\n"
- "mov v6.d[1], x10\n"
"fmla v21.8h, v7.8h, v3.h[7]\n"
"fmla v25.8h, v7.8h, v4.h[7]\n"
"fmla v29.8h, v7.8h, v5.h[7]\n"
"ldr d7, [x17, #0x1f0]\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "mov v7.d[1], x20\n"
"add x17, x17, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr x21, [x17, #0x8]\n"
"fmla v14.8h, v6.8h, v1.h[7]\n"
- "ldr x10, [x17, #0x8]\n"
+ "ldr x20, [x17, #0x18]\n"
"fmla v18.8h, v6.8h, v2.h[7]\n"
- "mov v7.d[1], x11\n"
"fmla v22.8h, v6.8h, v3.h[7]\n"
"fmla v26.8h, v6.8h, v4.h[7]\n"
"fmla v30.8h, v6.8h, v5.h[7]\n"
"ldr d6, [x17, #0x0]\n"
"fmla v11.8h, v7.8h, v0.h[7]\n"
- "ldr d0, [x12, #0x0]\n"
+ "ldr d0, [x13, #0x0]\n"
"fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr d1, [x28, #0x0]\n"
+ "ldr d1, [x12, #0x0]\n"
"fmla v19.8h, v7.8h, v2.h[7]\n"
- "mov v6.d[1], x10\n"
+ "ldr d2, [x11, #0x0]\n"
"fmla v23.8h, v7.8h, v3.h[7]\n"
- "mov v0.d[1], x9\n"
+ "ldr d3, [x10, #0x0]\n"
"fmla v27.8h, v7.8h, v4.h[7]\n"
- "mov v1.d[1], x27\n"
+ "ldr d4, [x9, #0x0]\n"
"fmla v31.8h, v7.8h, v5.h[7]\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "ldr d4, [x22, #0x0]\n"
+ "ldr d5, [x28, #0x0]\n"
+ "ldr d7, [x17, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x27\n"
+ "mov v1.d[1], x26\n"
"mov v2.d[1], x25\n"
- "ldr d5, [x20, #0x0]\n"
- "mov v3.d[1], x23\n"
- "mov v4.d[1], x21\n"
- "mov v5.d[1], x19\n"
+ "mov v3.d[1], x24\n"
+ "mov v4.d[1], x23\n"
+ "mov v5.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 271b\n"
"272:" // Height 6: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "add x13, x13, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "sub x13, x13, #0x8\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
"add x12, x12, #0x10\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x11, x11, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "add x28, x28, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v28.8h, v6.8h, v5.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
"ldr q6, [x17, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x28, x28, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "add x26, x26, #0x10\n"
+ "sub x14, x14, #0x8\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v29.8h, v7.8h, v5.h[0]\n"
"ldr q7, [x17, #0x30]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
- "add x20, x20, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"fmla v26.8h, v6.8h, v4.h[0]\n"
"fmla v30.8h, v6.8h, v5.h[0]\n"
"ldr q6, [x17, #0x40]\n"
@@ -5339,67 +5336,65 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmla v27.8h, v7.8h, v4.h[7]\n"
"fmla v31.8h, v7.8h, v5.h[7]\n"
"273:" // Height 6: Multiply loop: Main loop skip
- "cbz x13, 275f\n"
+ "cbz x14, 275f\n"
"274:" // Height 6: Multiply loop: Odd block loop
- "ldr h0, [x12], #0x2\n"
- "sub x13, x13, #0x1\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "ldr h5, [x20], #0x2\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "fmla v24.8h, v6.8h, v4.h[0]\n"
- "fmla v28.8h, v6.8h, v5.h[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "fmla v25.8h, v7.8h, v4.h[0]\n"
- "fmla v29.8h, v7.8h, v5.h[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr h7, [x13], #0x2\n"
+ "sub x14, x14, #0x1\n"
+ "ldr h6, [x12], #0x2\n"
+ "ldr h5, [x11], #0x2\n"
+ "ldr h4, [x10], #0x2\n"
+ "ldr h3, [x9], #0x2\n"
+ "ldr h2, [x28], #0x2\n"
+ "ldr q1, [x17, #0x0]\n"
+ "fmla v8.8h, v1.8h, v7.h[0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v12.8h, v1.8h, v6.h[0]\n"
+ "fmla v16.8h, v1.8h, v5.h[0]\n"
+ "fmla v20.8h, v1.8h, v4.h[0]\n"
+ "fmla v24.8h, v1.8h, v3.h[0]\n"
+ "fmla v28.8h, v1.8h, v2.h[0]\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmla v9.8h, v0.8h, v7.h[0]\n"
+ "fmla v13.8h, v0.8h, v6.h[0]\n"
+ "fmla v17.8h, v0.8h, v5.h[0]\n"
+ "fmla v21.8h, v0.8h, v4.h[0]\n"
+ "fmla v25.8h, v0.8h, v3.h[0]\n"
+ "fmla v29.8h, v0.8h, v2.h[0]\n"
+ "ldr q0, [x17, #0x30]\n"
+ "fmla v10.8h, v1.8h, v7.h[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "fmla v30.8h, v6.8h, v5.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "fmla v31.8h, v7.8h, v5.h[0]\n"
- "cbnz x13, 274b\n"
+ "fmla v14.8h, v1.8h, v6.h[0]\n"
+ "fmla v18.8h, v1.8h, v5.h[0]\n"
+ "fmla v22.8h, v1.8h, v4.h[0]\n"
+ "fmla v26.8h, v1.8h, v3.h[0]\n"
+ "fmla v30.8h, v1.8h, v2.h[0]\n"
+ "fmla v11.8h, v0.8h, v7.h[0]\n"
+ "fmla v15.8h, v0.8h, v6.h[0]\n"
+ "fmla v19.8h, v0.8h, v5.h[0]\n"
+ "fmla v23.8h, v0.8h, v4.h[0]\n"
+ "fmla v27.8h, v0.8h, v3.h[0]\n"
+ "fmla v31.8h, v0.8h, v2.h[0]\n"
+ "cbnz x14, 274b\n"
"275:" // Height 6: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 268b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "add x25, x15, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #1\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #1\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #1\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #1\n"
"prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 276f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.8h }, [x20]\n"
- "ld1r { v0.8h }, [x19]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x20]\n"
"fmin v8.8h, v8.8h, v0.8h\n"
"fmin v9.8h, v9.8h, v0.8h\n"
"fmin v10.8h, v10.8h, v0.8h\n"
@@ -5410,16 +5405,6 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmin v15.8h, v15.8h, v0.8h\n"
"fmin v16.8h, v16.8h, v0.8h\n"
"fmin v17.8h, v17.8h, v0.8h\n"
- "fmax v8.8h, v8.8h, v1.8h\n"
- "fmax v9.8h, v9.8h, v1.8h\n"
- "fmax v10.8h, v10.8h, v1.8h\n"
- "fmax v11.8h, v11.8h, v1.8h\n"
- "fmax v12.8h, v12.8h, v1.8h\n"
- "fmax v13.8h, v13.8h, v1.8h\n"
- "fmax v14.8h, v14.8h, v1.8h\n"
- "fmax v15.8h, v15.8h, v1.8h\n"
- "fmax v16.8h, v16.8h, v1.8h\n"
- "fmax v17.8h, v17.8h, v1.8h\n"
"fmin v18.8h, v18.8h, v0.8h\n"
"fmin v19.8h, v19.8h, v0.8h\n"
"fmin v20.8h, v20.8h, v0.8h\n"
@@ -5430,30 +5415,42 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"fmin v25.8h, v25.8h, v0.8h\n"
"fmin v26.8h, v26.8h, v0.8h\n"
"fmin v27.8h, v27.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v1.8h\n"
- "fmax v19.8h, v19.8h, v1.8h\n"
- "fmax v20.8h, v20.8h, v1.8h\n"
- "fmax v21.8h, v21.8h, v1.8h\n"
- "fmax v22.8h, v22.8h, v1.8h\n"
- "fmax v23.8h, v23.8h, v1.8h\n"
- "fmax v24.8h, v24.8h, v1.8h\n"
- "fmax v25.8h, v25.8h, v1.8h\n"
- "fmax v26.8h, v26.8h, v1.8h\n"
- "fmax v27.8h, v27.8h, v1.8h\n"
"fmin v28.8h, v28.8h, v0.8h\n"
"fmin v29.8h, v29.8h, v0.8h\n"
"fmin v30.8h, v30.8h, v0.8h\n"
"fmin v31.8h, v31.8h, v0.8h\n"
- "fmax v28.8h, v28.8h, v1.8h\n"
- "fmax v29.8h, v29.8h, v1.8h\n"
- "fmax v30.8h, v30.8h, v1.8h\n"
- "fmax v31.8h, v31.8h, v1.8h\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "fmax v8.8h, v8.8h, v0.8h\n"
+ "fmax v9.8h, v9.8h, v0.8h\n"
+ "fmax v10.8h, v10.8h, v0.8h\n"
+ "fmax v11.8h, v11.8h, v0.8h\n"
+ "fmax v12.8h, v12.8h, v0.8h\n"
+ "fmax v13.8h, v13.8h, v0.8h\n"
+ "fmax v14.8h, v14.8h, v0.8h\n"
+ "fmax v15.8h, v15.8h, v0.8h\n"
+ "fmax v16.8h, v16.8h, v0.8h\n"
+ "fmax v17.8h, v17.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v0.8h\n"
+ "fmax v19.8h, v19.8h, v0.8h\n"
+ "fmax v20.8h, v20.8h, v0.8h\n"
+ "fmax v21.8h, v21.8h, v0.8h\n"
+ "fmax v22.8h, v22.8h, v0.8h\n"
+ "fmax v23.8h, v23.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v0.8h\n"
+ "fmax v25.8h, v25.8h, v0.8h\n"
+ "fmax v26.8h, v26.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v0.8h\n"
+ "fmax v28.8h, v28.8h, v0.8h\n"
+ "fmax v29.8h, v29.8h, v0.8h\n"
+ "fmax v30.8h, v30.8h, v0.8h\n"
+ "fmax v31.8h, v31.8h, v0.8h\n"
"276:" // Height 6: No activation
"cmp x8, #0x20\n"
"bge 293f\n"
"tbz x8, #4, 284f\n"
- "st1 { v8.8h }, [x15], #0x10\n"
- "st1 { v9.8h }, [x15], #0x10\n"
+ "st1 { v8.8h }, [x16], #0x10\n"
+ "st1 { v9.8h }, [x16], #0x10\n"
"st1 { v12.8h }, [x25], #0x10\n"
"st1 { v13.8h }, [x25], #0x10\n"
"st1 { v16.8h }, [x24], #0x10\n"
@@ -5465,28 +5462,28 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"st1 { v28.8h }, [x21], #0x10\n"
"st1 { v29.8h }, [x21], #0x10\n"
"tbz x8, #3, 280f\n"
- "st1 { v10.8h }, [x15], #0x10\n"
+ "st1 { v10.8h }, [x16], #0x10\n"
"st1 { v14.8h }, [x25], #0x10\n"
"st1 { v18.8h }, [x24], #0x10\n"
"st1 { v22.8h }, [x23], #0x10\n"
"st1 { v26.8h }, [x22], #0x10\n"
"st1 { v30.8h }, [x21], #0x10\n"
"tbz x8, #2, 278f\n"
- "str d11, [x15], #0x8\n"
+ "str d11, [x16], #0x8\n"
"str d15, [x25], #0x8\n"
"str d19, [x24], #0x8\n"
"str d23, [x23], #0x8\n"
"str d27, [x22], #0x8\n"
"str d31, [x21], #0x8\n"
"tbz x8, #1, 277f\n"
- "st1 { v11.s }[2], [x15], #0x4\n"
+ "st1 { v11.s }[2], [x16], #0x4\n"
"st1 { v15.s }[2], [x25], #0x4\n"
"st1 { v19.s }[2], [x24], #0x4\n"
"st1 { v23.s }[2], [x23], #0x4\n"
"st1 { v27.s }[2], [x22], #0x4\n"
"st1 { v31.s }[2], [x21], #0x4\n"
"tbz x8, #0, 292f\n"
- "st1 { v11.h }[6], [x15]\n"
+ "st1 { v11.h }[6], [x16]\n"
"st1 { v15.h }[6], [x25]\n"
"st1 { v19.h }[6], [x24]\n"
"st1 { v23.h }[6], [x23]\n"
@@ -5495,7 +5492,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"277:" // Height 6: Partial direct writeback: partial_1_28
"tbz x8, #0, 292f\n"
- "st1 { v11.h }[4], [x15]\n"
+ "st1 { v11.h }[4], [x16]\n"
"st1 { v15.h }[4], [x25]\n"
"st1 { v19.h }[4], [x24]\n"
"st1 { v23.h }[4], [x23]\n"
@@ -5504,14 +5501,14 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"278:" // Height 6: Partial direct writeback: partial_2_24
"tbz x8, #1, 279f\n"
- "str s11, [x15], #0x4\n"
+ "str s11, [x16], #0x4\n"
"str s15, [x25], #0x4\n"
"str s19, [x24], #0x4\n"
"str s23, [x23], #0x4\n"
"str s27, [x22], #0x4\n"
"str s31, [x21], #0x4\n"
"tbz x8, #0, 292f\n"
- "st1 { v11.h }[2], [x15]\n"
+ "st1 { v11.h }[2], [x16]\n"
"st1 { v15.h }[2], [x25]\n"
"st1 { v19.h }[2], [x24]\n"
"st1 { v23.h }[2], [x23]\n"
@@ -5520,7 +5517,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"279:" // Height 6: Partial direct writeback: partial_1_24
"tbz x8, #0, 292f\n"
- "str h11, [x15, #0x0]\n"
+ "str h11, [x16, #0x0]\n"
"str h15, [x25, #0x0]\n"
"str h19, [x24, #0x0]\n"
"str h23, [x23, #0x0]\n"
@@ -5529,21 +5526,21 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"280:" // Height 6: Partial direct writeback: partial_4_16
"tbz x8, #2, 282f\n"
- "str d10, [x15], #0x8\n"
+ "str d10, [x16], #0x8\n"
"str d14, [x25], #0x8\n"
"str d18, [x24], #0x8\n"
"str d22, [x23], #0x8\n"
"str d26, [x22], #0x8\n"
"str d30, [x21], #0x8\n"
"tbz x8, #1, 281f\n"
- "st1 { v10.s }[2], [x15], #0x4\n"
+ "st1 { v10.s }[2], [x16], #0x4\n"
"st1 { v14.s }[2], [x25], #0x4\n"
"st1 { v18.s }[2], [x24], #0x4\n"
"st1 { v22.s }[2], [x23], #0x4\n"
"st1 { v26.s }[2], [x22], #0x4\n"
"st1 { v30.s }[2], [x21], #0x4\n"
"tbz x8, #0, 292f\n"
- "st1 { v10.h }[6], [x15]\n"
+ "st1 { v10.h }[6], [x16]\n"
"st1 { v14.h }[6], [x25]\n"
"st1 { v18.h }[6], [x24]\n"
"st1 { v22.h }[6], [x23]\n"
@@ -5552,7 +5549,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"281:" // Height 6: Partial direct writeback: partial_1_20
"tbz x8, #0, 292f\n"
- "st1 { v10.h }[4], [x15]\n"
+ "st1 { v10.h }[4], [x16]\n"
"st1 { v14.h }[4], [x25]\n"
"st1 { v18.h }[4], [x24]\n"
"st1 { v22.h }[4], [x23]\n"
@@ -5561,14 +5558,14 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"282:" // Height 6: Partial direct writeback: partial_2_16
"tbz x8, #1, 283f\n"
- "str s10, [x15], #0x4\n"
+ "str s10, [x16], #0x4\n"
"str s14, [x25], #0x4\n"
"str s18, [x24], #0x4\n"
"str s22, [x23], #0x4\n"
"str s26, [x22], #0x4\n"
"str s30, [x21], #0x4\n"
"tbz x8, #0, 292f\n"
- "st1 { v10.h }[2], [x15]\n"
+ "st1 { v10.h }[2], [x16]\n"
"st1 { v14.h }[2], [x25]\n"
"st1 { v18.h }[2], [x24]\n"
"st1 { v22.h }[2], [x23]\n"
@@ -5577,7 +5574,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"283:" // Height 6: Partial direct writeback: partial_1_16
"tbz x8, #0, 292f\n"
- "str h10, [x15, #0x0]\n"
+ "str h10, [x16, #0x0]\n"
"str h14, [x25, #0x0]\n"
"str h18, [x24, #0x0]\n"
"str h22, [x23, #0x0]\n"
@@ -5586,28 +5583,28 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"284:" // Height 6: Partial direct writeback: partial_8_0
"tbz x8, #3, 288f\n"
- "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v8.8h }, [x16], #0x10\n"
"st1 { v12.8h }, [x25], #0x10\n"
"st1 { v16.8h }, [x24], #0x10\n"
"st1 { v20.8h }, [x23], #0x10\n"
"st1 { v24.8h }, [x22], #0x10\n"
"st1 { v28.8h }, [x21], #0x10\n"
"tbz x8, #2, 286f\n"
- "str d9, [x15], #0x8\n"
+ "str d9, [x16], #0x8\n"
"str d13, [x25], #0x8\n"
"str d17, [x24], #0x8\n"
"str d21, [x23], #0x8\n"
"str d25, [x22], #0x8\n"
"str d29, [x21], #0x8\n"
"tbz x8, #1, 285f\n"
- "st1 { v9.s }[2], [x15], #0x4\n"
+ "st1 { v9.s }[2], [x16], #0x4\n"
"st1 { v13.s }[2], [x25], #0x4\n"
"st1 { v17.s }[2], [x24], #0x4\n"
"st1 { v21.s }[2], [x23], #0x4\n"
"st1 { v25.s }[2], [x22], #0x4\n"
"st1 { v29.s }[2], [x21], #0x4\n"
"tbz x8, #0, 292f\n"
- "st1 { v9.h }[6], [x15]\n"
+ "st1 { v9.h }[6], [x16]\n"
"st1 { v13.h }[6], [x25]\n"
"st1 { v17.h }[6], [x24]\n"
"st1 { v21.h }[6], [x23]\n"
@@ -5616,7 +5613,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"285:" // Height 6: Partial direct writeback: partial_1_12
"tbz x8, #0, 292f\n"
- "st1 { v9.h }[4], [x15]\n"
+ "st1 { v9.h }[4], [x16]\n"
"st1 { v13.h }[4], [x25]\n"
"st1 { v17.h }[4], [x24]\n"
"st1 { v21.h }[4], [x23]\n"
@@ -5625,14 +5622,14 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"286:" // Height 6: Partial direct writeback: partial_2_8
"tbz x8, #1, 287f\n"
- "str s9, [x15], #0x4\n"
+ "str s9, [x16], #0x4\n"
"str s13, [x25], #0x4\n"
"str s17, [x24], #0x4\n"
"str s21, [x23], #0x4\n"
"str s25, [x22], #0x4\n"
"str s29, [x21], #0x4\n"
"tbz x8, #0, 292f\n"
- "st1 { v9.h }[2], [x15]\n"
+ "st1 { v9.h }[2], [x16]\n"
"st1 { v13.h }[2], [x25]\n"
"st1 { v17.h }[2], [x24]\n"
"st1 { v21.h }[2], [x23]\n"
@@ -5641,7 +5638,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"287:" // Height 6: Partial direct writeback: partial_1_8
"tbz x8, #0, 292f\n"
- "str h9, [x15, #0x0]\n"
+ "str h9, [x16, #0x0]\n"
"str h13, [x25, #0x0]\n"
"str h17, [x24, #0x0]\n"
"str h21, [x23, #0x0]\n"
@@ -5650,21 +5647,21 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"288:" // Height 6: Partial direct writeback: partial_4_0
"tbz x8, #2, 290f\n"
- "str d8, [x15], #0x8\n"
+ "str d8, [x16], #0x8\n"
"str d12, [x25], #0x8\n"
"str d16, [x24], #0x8\n"
"str d20, [x23], #0x8\n"
"str d24, [x22], #0x8\n"
"str d28, [x21], #0x8\n"
"tbz x8, #1, 289f\n"
- "st1 { v8.s }[2], [x15], #0x4\n"
+ "st1 { v8.s }[2], [x16], #0x4\n"
"st1 { v12.s }[2], [x25], #0x4\n"
"st1 { v16.s }[2], [x24], #0x4\n"
"st1 { v20.s }[2], [x23], #0x4\n"
"st1 { v24.s }[2], [x22], #0x4\n"
"st1 { v28.s }[2], [x21], #0x4\n"
"tbz x8, #0, 292f\n"
- "st1 { v8.h }[6], [x15]\n"
+ "st1 { v8.h }[6], [x16]\n"
"st1 { v12.h }[6], [x25]\n"
"st1 { v16.h }[6], [x24]\n"
"st1 { v20.h }[6], [x23]\n"
@@ -5673,7 +5670,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"289:" // Height 6: Partial direct writeback: partial_1_4
"tbz x8, #0, 292f\n"
- "st1 { v8.h }[4], [x15]\n"
+ "st1 { v8.h }[4], [x16]\n"
"st1 { v12.h }[4], [x25]\n"
"st1 { v16.h }[4], [x24]\n"
"st1 { v20.h }[4], [x23]\n"
@@ -5682,14 +5679,14 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"b 292f\n"
"290:" // Height 6: Partial direct writeback: partial_2_0
"tbz x8, #1, 291f\n"
- "str s8, [x15], #0x4\n"
+ "str s8, [x16], #0x4\n"
"str s12, [x25], #0x4\n"
"str s16, [x24], #0x4\n"
"str s20, [x23], #0x4\n"
"str s24, [x22], #0x4\n"
"str s28, [x21], #0x4\n"
"tbz x8, #0, 292f\n"
- "st1 { v8.h }[2], [x15]\n"
+ "st1 { v8.h }[2], [x16]\n"
"st1 { v12.h }[2], [x25]\n"
"st1 { v16.h }[2], [x24]\n"
"st1 { v20.h }[2], [x23]\n"
@@ -5697,7 +5694,7 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"st1 { v28.h }[2], [x21]\n"
"b 292f\n"
"291:" // Height 6: Partial direct writeback: partial_1_0
- "str h8, [x15, #0x0]\n"
+ "str h8, [x16, #0x0]\n"
"str h12, [x25, #0x0]\n"
"str h16, [x24, #0x0]\n"
"str h20, [x23, #0x0]\n"
@@ -5706,11 +5703,11 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"292:" // Height 6: Partial direct writeback: Done
"b 294f\n"
"293:" // Height 6: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x16, #0x0]\n"
+ "str q9, [x16, #0x10]\n"
+ "str q10, [x16, #0x20]\n"
+ "str q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
"str q12, [x25, #0x0]\n"
"str q13, [x25, #0x10]\n"
"str q14, [x25, #0x20]\n"
@@ -5736,20 +5733,19 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"bgt 247b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 296f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 295f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"295:" // Update direct input
- "mov x19, #0xc\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0xc\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"296:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
index 6e51773166..8e5f600c83 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
@@ -92,9 +92,6 @@ void a64_hybrid_fp16_mla_6x32 (
break;
}
__asm__ __volatile__(
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ".arch armv8.2-a+fp16\n"
-#endif
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 246f\n"
@@ -104,138 +101,138 @@ void a64_hybrid_fp16_mla_6x32 (
"cmp %x[M], #0x2\n"
"bgt 99f\n"
"beq 50f\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x9, %x[bias]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "cbz x9, 3f\n"
- "ldr q8, [x9, #0x0]\n"
- "ldr q9, [x9, #0x10]\n"
- "ldr q10, [x9, #0x20]\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
+ "cbz x12, 3f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
"b 22f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 21f\n"
"cmp x11, #0x20\n"
"bge 20f\n"
"tbz x11, #4, 11f\n"
- "ld1 { v8.8h }, [x28], #0x10\n"
- "ld1 { v9.8h }, [x28], #0x10\n"
+ "ld1 { v8.8h }, [x9], #0x10\n"
+ "ld1 { v9.8h }, [x9], #0x10\n"
"tbz x11, #3, 7f\n"
- "ld1 { v10.8h }, [x28], #0x10\n"
+ "ld1 { v10.8h }, [x9], #0x10\n"
"tbz x11, #2, 5f\n"
- "ldr d11, [x28], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
"tbz x11, #1, 4f\n"
- "mov x19, #0x3c\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
+ "ld1 { v11.s }[2], [x9], #0x4\n"
+ "mov x20, #0x3c\n"
"tbz x11, #0, 19f\n"
- "ld1 { v11.h }[6], [x28]\n"
+ "ld1 { v11.h }[6], [x9]\n"
"b 19f\n"
"4:" // Height 1: Partial accumulate: partial_1_28
- "mov x19, #0x38\n"
+ "mov x20, #0x38\n"
"tbz x11, #0, 19f\n"
- "ld1 { v11.h }[4], [x28]\n"
+ "ld1 { v11.h }[4], [x9]\n"
"b 19f\n"
"5:" // Height 1: Partial accumulate: partial_2_24
"tbz x11, #1, 6f\n"
- "ldr s11, [x28], #0x4\n"
- "mov x19, #0x34\n"
+ "ldr s11, [x9], #0x4\n"
+ "mov x20, #0x34\n"
"tbz x11, #0, 19f\n"
- "ld1 { v11.h }[2], [x28]\n"
+ "ld1 { v11.h }[2], [x9]\n"
"b 19f\n"
"6:" // Height 1: Partial accumulate: partial_1_24
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 19f\n"
- "ldr h11, [x28, #0x0]\n"
+ "ldr h11, [x9, #0x0]\n"
"b 19f\n"
"7:" // Height 1: Partial accumulate: partial_4_16
"tbz x11, #2, 9f\n"
- "ldr d10, [x28], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
"tbz x11, #1, 8f\n"
- "ld1 { v10.s }[2], [x28], #0x4\n"
- "mov x19, #0x2c\n"
+ "ld1 { v10.s }[2], [x9], #0x4\n"
+ "mov x20, #0x2c\n"
"tbz x11, #0, 19f\n"
- "ld1 { v10.h }[6], [x28]\n"
+ "ld1 { v10.h }[6], [x9]\n"
"b 19f\n"
"8:" // Height 1: Partial accumulate: partial_1_20
- "mov x19, #0x28\n"
+ "mov x20, #0x28\n"
"tbz x11, #0, 19f\n"
- "ld1 { v10.h }[4], [x28]\n"
+ "ld1 { v10.h }[4], [x9]\n"
"b 19f\n"
"9:" // Height 1: Partial accumulate: partial_2_16
"tbz x11, #1, 10f\n"
- "ldr s10, [x28], #0x4\n"
- "mov x19, #0x24\n"
+ "ldr s10, [x9], #0x4\n"
+ "mov x20, #0x24\n"
"tbz x11, #0, 19f\n"
- "ld1 { v10.h }[2], [x28]\n"
+ "ld1 { v10.h }[2], [x9]\n"
"b 19f\n"
"10:" // Height 1: Partial accumulate: partial_1_16
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 19f\n"
- "ldr h10, [x28, #0x0]\n"
+ "ldr h10, [x9, #0x0]\n"
"b 19f\n"
"11:" // Height 1: Partial accumulate: partial_8_0
"tbz x11, #3, 15f\n"
- "ld1 { v8.8h }, [x28], #0x10\n"
+ "ld1 { v8.8h }, [x9], #0x10\n"
"tbz x11, #2, 13f\n"
- "ldr d9, [x28], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
"tbz x11, #1, 12f\n"
- "mov x19, #0x1c\n"
- "ld1 { v9.s }[2], [x28], #0x4\n"
+ "ld1 { v9.s }[2], [x9], #0x4\n"
+ "mov x20, #0x1c\n"
"tbz x11, #0, 19f\n"
- "ld1 { v9.h }[6], [x28]\n"
+ "ld1 { v9.h }[6], [x9]\n"
"b 19f\n"
"12:" // Height 1: Partial accumulate: partial_1_12
- "mov x19, #0x18\n"
+ "mov x20, #0x18\n"
"tbz x11, #0, 19f\n"
- "ld1 { v9.h }[4], [x28]\n"
+ "ld1 { v9.h }[4], [x9]\n"
"b 19f\n"
"13:" // Height 1: Partial accumulate: partial_2_8
"tbz x11, #1, 14f\n"
- "ldr s9, [x28], #0x4\n"
- "mov x19, #0x14\n"
+ "ldr s9, [x9], #0x4\n"
+ "mov x20, #0x14\n"
"tbz x11, #0, 19f\n"
- "ld1 { v9.h }[2], [x28]\n"
+ "ld1 { v9.h }[2], [x9]\n"
"b 19f\n"
"14:" // Height 1: Partial accumulate: partial_1_8
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 19f\n"
- "ldr h9, [x28, #0x0]\n"
+ "ldr h9, [x9, #0x0]\n"
"b 19f\n"
"15:" // Height 1: Partial accumulate: partial_4_0
"tbz x11, #2, 17f\n"
- "ldr d8, [x28], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
"tbz x11, #1, 16f\n"
- "ld1 { v8.s }[2], [x28], #0x4\n"
- "mov x19, #0xc\n"
+ "ld1 { v8.s }[2], [x9], #0x4\n"
+ "mov x20, #0xc\n"
"tbz x11, #0, 19f\n"
- "ld1 { v8.h }[6], [x28]\n"
+ "ld1 { v8.h }[6], [x9]\n"
"b 19f\n"
"16:" // Height 1: Partial accumulate: partial_1_4
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"tbz x11, #0, 19f\n"
- "ld1 { v8.h }[4], [x28]\n"
+ "ld1 { v8.h }[4], [x9]\n"
"b 19f\n"
"17:" // Height 1: Partial accumulate: partial_2_0
"tbz x11, #1, 18f\n"
- "ldr s8, [x28], #0x4\n"
- "mov x19, #0x4\n"
+ "ldr s8, [x9], #0x4\n"
+ "mov x20, #0x4\n"
"tbz x11, #0, 19f\n"
- "ld1 { v8.h }[2], [x28]\n"
+ "ld1 { v8.h }[2], [x9]\n"
"b 19f\n"
"18:" // Height 1: Partial accumulate: partial_1_0
- "ldr h8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr h8, [x9, #0x0]\n"
+ "mov x20, #0x0\n"
"19:" // Height 1: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 22f\n"
"20:" // Height 1: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
"b 22f\n"
"21:" // Height 1: no accumulate
"movi v8.16b, #0x0\n"
@@ -243,484 +240,484 @@ void a64_hybrid_fp16_mla_6x32 (
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
"22:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"23:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 24f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 25f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 25f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
"b 25f\n"
"24:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x26, %x[input_ptr]\n"
"25:" // Height 1: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"blt 28f\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 27f\n"
"26:" // Height 1: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "sub x26, x26, #0x8\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "cmp x26, #0x10\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x10, #0x110]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x10, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x10, #0x130]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x10, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x10, #0x150]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x10, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x10, #0x170]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x10, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x10, #0x190]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x10, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x10, #0x1b0]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x10, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x10, #0x1d0]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr q17, [x10, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr q16, [x10, #0x1f0]\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "cmp x27, #0x10\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 26b\n"
"27:" // Height 1: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x8\n"
+ "ldr q17, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x25, x25, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "ldr q17, [x10, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "ldr q16, [x10, #0x110]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x10, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x10, #0x130]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "ldr q17, [x10, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "ldr q16, [x10, #0x150]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x10, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x10, #0x170]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "ldr q17, [x10, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "ldr q16, [x10, #0x190]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x10, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x10, #0x1b0]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "ldr q17, [x10, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "ldr q16, [x10, #0x1d0]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "ldr q17, [x10, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "ldr q16, [x10, #0x1f0]\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
"28:" // Height 1: Multiply loop: Main loop skip
- "cbz x26, 30f\n"
+ "cbz x27, 30f\n"
"29:" // Height 1: Multiply loop: Odd block loop
- "ldr h0, [x25], #0x2\n"
- "sub x26, x26, #0x1\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr q16, [x10, #0x0]\n"
+ "fmla v8.8h, v16.8h, v0.h[0]\n"
+ "sub x27, x27, #0x1\n"
+ "ldr q17, [x10, #0x10]\n"
+ "ldr q16, [x10, #0x20]\n"
+ "fmla v9.8h, v17.8h, v0.h[0]\n"
+ "fmla v10.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
"add x10, x10, #0x40\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "cbnz x26, 29b\n"
+ "cbnz x27, 29b\n"
"30:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 23b\n"
- "prfm pstl1keep, [x28, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"tbz %x[flags], #1, 31f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.8h }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.8h }, [x19]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmax v8.8h, v8.8h, v1.8h\n"
- "fmax v9.8h, v9.8h, v1.8h\n"
- "fmax v10.8h, v10.8h, v1.8h\n"
- "fmax v11.8h, v11.8h, v1.8h\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v17.8h\n"
+ "fmin v9.8h, v9.8h, v17.8h\n"
+ "fmin v10.8h, v10.8h, v17.8h\n"
+ "fmin v11.8h, v11.8h, v17.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
+ "fmax v10.8h, v10.8h, v16.8h\n"
+ "fmax v11.8h, v11.8h, v16.8h\n"
"31:" // Height 1: No activation
"cmp x11, #0x20\n"
"bge 48f\n"
"tbz x11, #4, 39f\n"
- "st1 { v8.8h }, [x28], #0x10\n"
- "st1 { v9.8h }, [x28], #0x10\n"
+ "st1 { v8.8h }, [x9], #0x10\n"
+ "st1 { v9.8h }, [x9], #0x10\n"
"tbz x11, #3, 35f\n"
- "st1 { v10.8h }, [x28], #0x10\n"
+ "st1 { v10.8h }, [x9], #0x10\n"
"tbz x11, #2, 33f\n"
- "str d11, [x28], #0x8\n"
+ "str d11, [x9], #0x8\n"
"tbz x11, #1, 32f\n"
- "st1 { v11.s }[2], [x28], #0x4\n"
+ "st1 { v11.s }[2], [x9], #0x4\n"
"tbz x11, #0, 47f\n"
- "st1 { v11.h }[6], [x28]\n"
+ "st1 { v11.h }[6], [x9]\n"
"b 47f\n"
"32:" // Height 1: Partial direct writeback: partial_1_28
"tbz x11, #0, 47f\n"
- "st1 { v11.h }[4], [x28]\n"
+ "st1 { v11.h }[4], [x9]\n"
"b 47f\n"
"33:" // Height 1: Partial direct writeback: partial_2_24
"tbz x11, #1, 34f\n"
- "str s11, [x28], #0x4\n"
+ "str s11, [x9], #0x4\n"
"tbz x11, #0, 47f\n"
- "st1 { v11.h }[2], [x28]\n"
+ "st1 { v11.h }[2], [x9]\n"
"b 47f\n"
"34:" // Height 1: Partial direct writeback: partial_1_24
"tbz x11, #0, 47f\n"
- "str h11, [x28, #0x0]\n"
+ "str h11, [x9, #0x0]\n"
"b 47f\n"
"35:" // Height 1: Partial direct writeback: partial_4_16
"tbz x11, #2, 37f\n"
- "str d10, [x28], #0x8\n"
+ "str d10, [x9], #0x8\n"
"tbz x11, #1, 36f\n"
- "st1 { v10.s }[2], [x28], #0x4\n"
+ "st1 { v10.s }[2], [x9], #0x4\n"
"tbz x11, #0, 47f\n"
- "st1 { v10.h }[6], [x28]\n"
+ "st1 { v10.h }[6], [x9]\n"
"b 47f\n"
"36:" // Height 1: Partial direct writeback: partial_1_20
"tbz x11, #0, 47f\n"
- "st1 { v10.h }[4], [x28]\n"
+ "st1 { v10.h }[4], [x9]\n"
"b 47f\n"
"37:" // Height 1: Partial direct writeback: partial_2_16
"tbz x11, #1, 38f\n"
- "str s10, [x28], #0x4\n"
+ "str s10, [x9], #0x4\n"
"tbz x11, #0, 47f\n"
- "st1 { v10.h }[2], [x28]\n"
+ "st1 { v10.h }[2], [x9]\n"
"b 47f\n"
"38:" // Height 1: Partial direct writeback: partial_1_16
"tbz x11, #0, 47f\n"
- "str h10, [x28, #0x0]\n"
+ "str h10, [x9, #0x0]\n"
"b 47f\n"
"39:" // Height 1: Partial direct writeback: partial_8_0
"tbz x11, #3, 43f\n"
- "st1 { v8.8h }, [x28], #0x10\n"
+ "st1 { v8.8h }, [x9], #0x10\n"
"tbz x11, #2, 41f\n"
- "str d9, [x28], #0x8\n"
+ "str d9, [x9], #0x8\n"
"tbz x11, #1, 40f\n"
- "st1 { v9.s }[2], [x28], #0x4\n"
+ "st1 { v9.s }[2], [x9], #0x4\n"
"tbz x11, #0, 47f\n"
- "st1 { v9.h }[6], [x28]\n"
+ "st1 { v9.h }[6], [x9]\n"
"b 47f\n"
"40:" // Height 1: Partial direct writeback: partial_1_12
"tbz x11, #0, 47f\n"
- "st1 { v9.h }[4], [x28]\n"
+ "st1 { v9.h }[4], [x9]\n"
"b 47f\n"
"41:" // Height 1: Partial direct writeback: partial_2_8
"tbz x11, #1, 42f\n"
- "str s9, [x28], #0x4\n"
+ "str s9, [x9], #0x4\n"
"tbz x11, #0, 47f\n"
- "st1 { v9.h }[2], [x28]\n"
+ "st1 { v9.h }[2], [x9]\n"
"b 47f\n"
"42:" // Height 1: Partial direct writeback: partial_1_8
"tbz x11, #0, 47f\n"
- "str h9, [x28, #0x0]\n"
+ "str h9, [x9, #0x0]\n"
"b 47f\n"
"43:" // Height 1: Partial direct writeback: partial_4_0
"tbz x11, #2, 45f\n"
- "str d8, [x28], #0x8\n"
+ "str d8, [x9], #0x8\n"
"tbz x11, #1, 44f\n"
- "st1 { v8.s }[2], [x28], #0x4\n"
+ "st1 { v8.s }[2], [x9], #0x4\n"
"tbz x11, #0, 47f\n"
- "st1 { v8.h }[6], [x28]\n"
+ "st1 { v8.h }[6], [x9]\n"
"b 47f\n"
"44:" // Height 1: Partial direct writeback: partial_1_4
"tbz x11, #0, 47f\n"
- "st1 { v8.h }[4], [x28]\n"
+ "st1 { v8.h }[4], [x9]\n"
"b 47f\n"
"45:" // Height 1: Partial direct writeback: partial_2_0
"tbz x11, #1, 46f\n"
- "str s8, [x28], #0x4\n"
+ "str s8, [x9], #0x4\n"
"tbz x11, #0, 47f\n"
- "st1 { v8.h }[2], [x28]\n"
+ "st1 { v8.h }[2], [x9]\n"
"b 47f\n"
"46:" // Height 1: Partial direct writeback: partial_1_0
- "str h8, [x28, #0x0]\n"
+ "str h8, [x9, #0x0]\n"
"47:" // Height 1: Partial direct writeback: Done
"b 49f\n"
"48:" // Height 1: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"49:" // Height 1: Writeback done
"subs x11, x11, #0x20\n"
"bgt 2b\n"
"b 296f\n"
"50:" // Height 2
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"51:" // Height 2: Column loop
- "cbz x9, 52f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 52f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "ldr q10, [x9, #0x20]\n"
"mov v13.16b, v9.16b\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "add x12, x12, #0x40\n"
"b 71f\n"
"52:" // Height 2: no bias
"tbz %x[flags], #0, 70f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x11, #0x20\n"
- "add x24, x28, x19, LSL #1\n"
+ "add x25, x9, x20, LSL #1\n"
"bge 69f\n"
"tbz x11, #4, 60f\n"
- "ld1 { v8.8h }, [x28], #0x10\n"
- "ld1 { v12.8h }, [x24], #0x10\n"
- "ld1 { v9.8h }, [x28], #0x10\n"
- "ld1 { v13.8h }, [x24], #0x10\n"
+ "ld1 { v8.8h }, [x9], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v9.8h }, [x9], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
"tbz x11, #3, 56f\n"
- "ld1 { v10.8h }, [x28], #0x10\n"
- "ld1 { v14.8h }, [x24], #0x10\n"
+ "ld1 { v10.8h }, [x9], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
"tbz x11, #2, 54f\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
"tbz x11, #1, 53f\n"
- "mov x19, #0x3c\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
- "ld1 { v15.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "mov x20, #0x3c\n"
"tbz x11, #0, 68f\n"
- "ld1 { v11.h }[6], [x28]\n"
- "ld1 { v15.h }[6], [x24]\n"
+ "ld1 { v11.h }[6], [x9]\n"
+ "ld1 { v15.h }[6], [x25]\n"
"b 68f\n"
"53:" // Height 2: Partial accumulate: partial_1_28
- "mov x19, #0x38\n"
+ "mov x20, #0x38\n"
"tbz x11, #0, 68f\n"
- "ld1 { v11.h }[4], [x28]\n"
- "ld1 { v15.h }[4], [x24]\n"
+ "ld1 { v11.h }[4], [x9]\n"
+ "ld1 { v15.h }[4], [x25]\n"
"b 68f\n"
"54:" // Height 2: Partial accumulate: partial_2_24
"tbz x11, #1, 55f\n"
- "ldr s11, [x28], #0x4\n"
- "ldr s15, [x24], #0x4\n"
- "mov x19, #0x34\n"
+ "ldr s11, [x9], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x20, #0x34\n"
"tbz x11, #0, 68f\n"
- "ld1 { v11.h }[2], [x28]\n"
- "ld1 { v15.h }[2], [x24]\n"
+ "ld1 { v11.h }[2], [x9]\n"
+ "ld1 { v15.h }[2], [x25]\n"
"b 68f\n"
"55:" // Height 2: Partial accumulate: partial_1_24
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 68f\n"
- "ldr h11, [x28, #0x0]\n"
- "ldr h15, [x24, #0x0]\n"
+ "ldr h11, [x9, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
"b 68f\n"
"56:" // Height 2: Partial accumulate: partial_4_16
"tbz x11, #2, 58f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
"tbz x11, #1, 57f\n"
- "mov x19, #0x2c\n"
- "ld1 { v10.s }[2], [x28], #0x4\n"
- "ld1 { v14.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x9], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "mov x20, #0x2c\n"
"tbz x11, #0, 68f\n"
- "ld1 { v10.h }[6], [x28]\n"
- "ld1 { v14.h }[6], [x24]\n"
+ "ld1 { v10.h }[6], [x9]\n"
+ "ld1 { v14.h }[6], [x25]\n"
"b 68f\n"
"57:" // Height 2: Partial accumulate: partial_1_20
- "mov x19, #0x28\n"
+ "mov x20, #0x28\n"
"tbz x11, #0, 68f\n"
- "ld1 { v10.h }[4], [x28]\n"
- "ld1 { v14.h }[4], [x24]\n"
+ "ld1 { v10.h }[4], [x9]\n"
+ "ld1 { v14.h }[4], [x25]\n"
"b 68f\n"
"58:" // Height 2: Partial accumulate: partial_2_16
"tbz x11, #1, 59f\n"
- "ldr s10, [x28], #0x4\n"
- "ldr s14, [x24], #0x4\n"
- "mov x19, #0x24\n"
+ "ldr s10, [x9], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x20, #0x24\n"
"tbz x11, #0, 68f\n"
- "ld1 { v10.h }[2], [x28]\n"
- "ld1 { v14.h }[2], [x24]\n"
+ "ld1 { v10.h }[2], [x9]\n"
+ "ld1 { v14.h }[2], [x25]\n"
"b 68f\n"
"59:" // Height 2: Partial accumulate: partial_1_16
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 68f\n"
- "ldr h10, [x28, #0x0]\n"
- "ldr h14, [x24, #0x0]\n"
+ "ldr h10, [x9, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
"b 68f\n"
"60:" // Height 2: Partial accumulate: partial_8_0
"tbz x11, #3, 64f\n"
- "ld1 { v8.8h }, [x28], #0x10\n"
- "ld1 { v12.8h }, [x24], #0x10\n"
+ "ld1 { v8.8h }, [x9], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
"tbz x11, #2, 62f\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
"tbz x11, #1, 61f\n"
- "mov x19, #0x1c\n"
- "ld1 { v9.s }[2], [x28], #0x4\n"
- "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v9.s }[2], [x9], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "mov x20, #0x1c\n"
"tbz x11, #0, 68f\n"
- "ld1 { v9.h }[6], [x28]\n"
- "ld1 { v13.h }[6], [x24]\n"
+ "ld1 { v9.h }[6], [x9]\n"
+ "ld1 { v13.h }[6], [x25]\n"
"b 68f\n"
"61:" // Height 2: Partial accumulate: partial_1_12
- "mov x19, #0x18\n"
+ "mov x20, #0x18\n"
"tbz x11, #0, 68f\n"
- "ld1 { v9.h }[4], [x28]\n"
- "ld1 { v13.h }[4], [x24]\n"
+ "ld1 { v9.h }[4], [x9]\n"
+ "ld1 { v13.h }[4], [x25]\n"
"b 68f\n"
"62:" // Height 2: Partial accumulate: partial_2_8
"tbz x11, #1, 63f\n"
- "ldr s9, [x28], #0x4\n"
- "ldr s13, [x24], #0x4\n"
- "mov x19, #0x14\n"
+ "ldr s9, [x9], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x20, #0x14\n"
"tbz x11, #0, 68f\n"
- "ld1 { v9.h }[2], [x28]\n"
- "ld1 { v13.h }[2], [x24]\n"
+ "ld1 { v9.h }[2], [x9]\n"
+ "ld1 { v13.h }[2], [x25]\n"
"b 68f\n"
"63:" // Height 2: Partial accumulate: partial_1_8
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 68f\n"
- "ldr h9, [x28, #0x0]\n"
- "ldr h13, [x24, #0x0]\n"
+ "ldr h9, [x9, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
"b 68f\n"
"64:" // Height 2: Partial accumulate: partial_4_0
"tbz x11, #2, 66f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
"tbz x11, #1, 65f\n"
- "mov x19, #0xc\n"
- "ld1 { v8.s }[2], [x28], #0x4\n"
- "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v8.s }[2], [x9], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "mov x20, #0xc\n"
"tbz x11, #0, 68f\n"
- "ld1 { v8.h }[6], [x28]\n"
- "ld1 { v12.h }[6], [x24]\n"
+ "ld1 { v8.h }[6], [x9]\n"
+ "ld1 { v12.h }[6], [x25]\n"
"b 68f\n"
"65:" // Height 2: Partial accumulate: partial_1_4
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"tbz x11, #0, 68f\n"
- "ld1 { v8.h }[4], [x28]\n"
- "ld1 { v12.h }[4], [x24]\n"
+ "ld1 { v8.h }[4], [x9]\n"
+ "ld1 { v12.h }[4], [x25]\n"
"b 68f\n"
"66:" // Height 2: Partial accumulate: partial_2_0
"tbz x11, #1, 67f\n"
- "ldr s8, [x28], #0x4\n"
- "ldr s12, [x24], #0x4\n"
- "mov x19, #0x4\n"
+ "ldr s8, [x9], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x20, #0x4\n"
"tbz x11, #0, 68f\n"
- "ld1 { v8.h }[2], [x28]\n"
- "ld1 { v12.h }[2], [x24]\n"
+ "ld1 { v8.h }[2], [x9]\n"
+ "ld1 { v12.h }[2], [x25]\n"
"b 68f\n"
"67:" // Height 2: Partial accumulate: partial_1_0
- "ldr h8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr h12, [x24, #0x0]\n"
+ "ldr h8, [x9, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
"68:" // Height 2: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 71f\n"
"69:" // Height 2: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
"b 71f\n"
"70:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
@@ -732,650 +729,650 @@ void a64_hybrid_fp16_mla_6x32 (
"movi v14.16b, #0x0\n"
"movi v15.16b, #0x0\n"
"71:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"72:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x27, 74f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 74f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
"b 74f\n"
"73:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
"74:" // Height 2: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"blt 77f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 76f\n"
"75:" // Height 2: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x24, x24, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
+ "sub x27, x27, #0x8\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x8\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "cmp x26, #0x10\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "fmla v14.8h, v17.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "fmla v15.8h, v16.8h, v1.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "cmp x27, #0x10\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "fmla v12.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "fmla v13.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "fmla v14.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "fmla v15.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "fmla v12.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "fmla v13.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "fmla v14.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "fmla v15.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "fmla v12.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "fmla v13.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "fmla v14.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "fmla v15.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x10, #0x110]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "fmla v12.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x10, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "fmla v13.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x10, #0x130]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "fmla v14.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x10, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "fmla v15.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x10, #0x150]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "fmla v12.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x10, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "fmla v13.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x10, #0x170]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "fmla v14.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x10, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "fmla v15.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x10, #0x190]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "fmla v12.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x10, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "fmla v13.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x10, #0x1b0]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "fmla v14.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x10, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "fmla v15.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x10, #0x1d0]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "fmla v12.8h, v17.8h, v1.h[7]\n"
+ "ldr q17, [x10, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "fmla v13.8h, v16.8h, v1.h[7]\n"
+ "ldr q16, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v14.8h, v17.8h, v1.h[7]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "ldr q0, [x25, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr q1, [x24, #0x0]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.8h, v16.8h, v1.h[7]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 75b\n"
"76:" // Height 2: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x8\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x25, x25, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
+ "add x26, x26, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v10.8h, v17.8h, v0.h[0]\n"
+ "fmla v14.8h, v17.8h, v1.h[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "sub x27, x27, #0x8\n"
+ "fmla v11.8h, v16.8h, v0.h[0]\n"
+ "fmla v15.8h, v16.8h, v1.h[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v8.8h, v17.8h, v0.h[1]\n"
+ "fmla v12.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v9.8h, v16.8h, v0.h[1]\n"
+ "fmla v13.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "fmla v10.8h, v17.8h, v0.h[1]\n"
+ "fmla v14.8h, v17.8h, v1.h[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.8h, v16.8h, v0.h[1]\n"
+ "fmla v15.8h, v16.8h, v1.h[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.8h, v17.8h, v0.h[2]\n"
+ "fmla v12.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.8h, v16.8h, v0.h[2]\n"
+ "fmla v13.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.8h, v17.8h, v0.h[2]\n"
+ "fmla v14.8h, v17.8h, v1.h[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.8h, v16.8h, v0.h[2]\n"
+ "fmla v15.8h, v16.8h, v1.h[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.8h, v17.8h, v0.h[3]\n"
+ "fmla v12.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.8h, v16.8h, v0.h[3]\n"
+ "fmla v13.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "fmla v10.8h, v17.8h, v0.h[3]\n"
+ "fmla v14.8h, v17.8h, v1.h[3]\n"
+ "ldr q17, [x10, #0x100]\n"
+ "fmla v11.8h, v16.8h, v0.h[3]\n"
+ "fmla v15.8h, v16.8h, v1.h[3]\n"
+ "ldr q16, [x10, #0x110]\n"
+ "fmla v8.8h, v17.8h, v0.h[4]\n"
+ "fmla v12.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x10, #0x120]\n"
+ "fmla v9.8h, v16.8h, v0.h[4]\n"
+ "fmla v13.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x10, #0x130]\n"
+ "fmla v10.8h, v17.8h, v0.h[4]\n"
+ "fmla v14.8h, v17.8h, v1.h[4]\n"
+ "ldr q17, [x10, #0x140]\n"
+ "fmla v11.8h, v16.8h, v0.h[4]\n"
+ "fmla v15.8h, v16.8h, v1.h[4]\n"
+ "ldr q16, [x10, #0x150]\n"
+ "fmla v8.8h, v17.8h, v0.h[5]\n"
+ "fmla v12.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x10, #0x160]\n"
+ "fmla v9.8h, v16.8h, v0.h[5]\n"
+ "fmla v13.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x10, #0x170]\n"
+ "fmla v10.8h, v17.8h, v0.h[5]\n"
+ "fmla v14.8h, v17.8h, v1.h[5]\n"
+ "ldr q17, [x10, #0x180]\n"
+ "fmla v11.8h, v16.8h, v0.h[5]\n"
+ "fmla v15.8h, v16.8h, v1.h[5]\n"
+ "ldr q16, [x10, #0x190]\n"
+ "fmla v8.8h, v17.8h, v0.h[6]\n"
+ "fmla v12.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x10, #0x1a0]\n"
+ "fmla v9.8h, v16.8h, v0.h[6]\n"
+ "fmla v13.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x10, #0x1b0]\n"
+ "fmla v10.8h, v17.8h, v0.h[6]\n"
+ "fmla v14.8h, v17.8h, v1.h[6]\n"
+ "ldr q17, [x10, #0x1c0]\n"
+ "fmla v11.8h, v16.8h, v0.h[6]\n"
+ "fmla v15.8h, v16.8h, v1.h[6]\n"
+ "ldr q16, [x10, #0x1d0]\n"
+ "fmla v8.8h, v17.8h, v0.h[7]\n"
+ "fmla v12.8h, v17.8h, v1.h[7]\n"
+ "ldr q17, [x10, #0x1e0]\n"
+ "fmla v9.8h, v16.8h, v0.h[7]\n"
+ "fmla v13.8h, v16.8h, v1.h[7]\n"
+ "ldr q16, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v10.8h, v17.8h, v0.h[7]\n"
+ "fmla v14.8h, v17.8h, v1.h[7]\n"
+ "fmla v11.8h, v16.8h, v0.h[7]\n"
+ "fmla v15.8h, v16.8h, v1.h[7]\n"
"77:" // Height 2: Multiply loop: Main loop skip
- "cbz x26, 79f\n"
+ "cbz x27, 79f\n"
"78:" // Height 2: Multiply loop: Odd block loop
+ "ldr h1, [x26], #0x2\n"
"ldr h0, [x25], #0x2\n"
- "sub x26, x26, #0x1\n"
- "ldr h1, [x24], #0x2\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "sub x27, x27, #0x1\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ "fmla v8.8h, v17.8h, v1.h[0]\n"
+ "fmla v12.8h, v17.8h, v0.h[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.8h, v16.8h, v1.h[0]\n"
+ "fmla v13.8h, v16.8h, v0.h[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v10.8h, v17.8h, v1.h[0]\n"
+ "fmla v14.8h, v17.8h, v0.h[0]\n"
"add x10, x10, #0x40\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "cbnz x26, 78b\n"
+ "fmla v11.8h, v16.8h, v1.h[0]\n"
+ "fmla v15.8h, v16.8h, v0.h[0]\n"
+ "cbnz x27, 78b\n"
"79:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 72b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #1\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 80f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.8h }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.8h }, [x19]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmax v8.8h, v8.8h, v1.8h\n"
- "fmax v9.8h, v9.8h, v1.8h\n"
- "fmax v10.8h, v10.8h, v1.8h\n"
- "fmax v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v0.8h\n"
- "fmin v13.8h, v13.8h, v0.8h\n"
- "fmin v14.8h, v14.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v1.8h\n"
- "fmax v13.8h, v13.8h, v1.8h\n"
- "fmax v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v1.8h\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v17.8h\n"
+ "fmin v9.8h, v9.8h, v17.8h\n"
+ "fmin v10.8h, v10.8h, v17.8h\n"
+ "fmin v11.8h, v11.8h, v17.8h\n"
+ "fmin v12.8h, v12.8h, v17.8h\n"
+ "fmin v13.8h, v13.8h, v17.8h\n"
+ "fmin v14.8h, v14.8h, v17.8h\n"
+ "fmin v15.8h, v15.8h, v17.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
+ "fmax v10.8h, v10.8h, v16.8h\n"
+ "fmax v11.8h, v11.8h, v16.8h\n"
+ "fmax v12.8h, v12.8h, v16.8h\n"
+ "fmax v13.8h, v13.8h, v16.8h\n"
+ "fmax v14.8h, v14.8h, v16.8h\n"
+ "fmax v15.8h, v15.8h, v16.8h\n"
"80:" // Height 2: No activation
"cmp x11, #0x20\n"
"bge 97f\n"
"tbz x11, #4, 88f\n"
- "st1 { v8.8h }, [x28], #0x10\n"
- "st1 { v9.8h }, [x28], #0x10\n"
- "st1 { v12.8h }, [x24], #0x10\n"
- "st1 { v13.8h }, [x24], #0x10\n"
+ "st1 { v8.8h }, [x9], #0x10\n"
+ "st1 { v9.8h }, [x9], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
"tbz x11, #3, 84f\n"
- "st1 { v10.8h }, [x28], #0x10\n"
- "st1 { v14.8h }, [x24], #0x10\n"
+ "st1 { v10.8h }, [x9], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
"tbz x11, #2, 82f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
"tbz x11, #1, 81f\n"
- "st1 { v11.s }[2], [x28], #0x4\n"
- "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v11.s }[2], [x9], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
"tbz x11, #0, 96f\n"
- "st1 { v11.h }[6], [x28]\n"
- "st1 { v15.h }[6], [x24]\n"
+ "st1 { v11.h }[6], [x9]\n"
+ "st1 { v15.h }[6], [x25]\n"
"b 96f\n"
"81:" // Height 2: Partial direct writeback: partial_1_28
"tbz x11, #0, 96f\n"
- "st1 { v11.h }[4], [x28]\n"
- "st1 { v15.h }[4], [x24]\n"
+ "st1 { v11.h }[4], [x9]\n"
+ "st1 { v15.h }[4], [x25]\n"
"b 96f\n"
"82:" // Height 2: Partial direct writeback: partial_2_24
"tbz x11, #1, 83f\n"
- "str s11, [x28], #0x4\n"
- "str s15, [x24], #0x4\n"
+ "str s11, [x9], #0x4\n"
+ "str s15, [x25], #0x4\n"
"tbz x11, #0, 96f\n"
- "st1 { v11.h }[2], [x28]\n"
- "st1 { v15.h }[2], [x24]\n"
+ "st1 { v11.h }[2], [x9]\n"
+ "st1 { v15.h }[2], [x25]\n"
"b 96f\n"
"83:" // Height 2: Partial direct writeback: partial_1_24
"tbz x11, #0, 96f\n"
- "str h11, [x28, #0x0]\n"
- "str h15, [x24, #0x0]\n"
+ "str h11, [x9, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
"b 96f\n"
"84:" // Height 2: Partial direct writeback: partial_4_16
"tbz x11, #2, 86f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
"tbz x11, #1, 85f\n"
- "st1 { v10.s }[2], [x28], #0x4\n"
- "st1 { v14.s }[2], [x24], #0x4\n"
+ "st1 { v10.s }[2], [x9], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
"tbz x11, #0, 96f\n"
- "st1 { v10.h }[6], [x28]\n"
- "st1 { v14.h }[6], [x24]\n"
+ "st1 { v10.h }[6], [x9]\n"
+ "st1 { v14.h }[6], [x25]\n"
"b 96f\n"
"85:" // Height 2: Partial direct writeback: partial_1_20
"tbz x11, #0, 96f\n"
- "st1 { v10.h }[4], [x28]\n"
- "st1 { v14.h }[4], [x24]\n"
+ "st1 { v10.h }[4], [x9]\n"
+ "st1 { v14.h }[4], [x25]\n"
"b 96f\n"
"86:" // Height 2: Partial direct writeback: partial_2_16
"tbz x11, #1, 87f\n"
- "str s10, [x28], #0x4\n"
- "str s14, [x24], #0x4\n"
+ "str s10, [x9], #0x4\n"
+ "str s14, [x25], #0x4\n"
"tbz x11, #0, 96f\n"
- "st1 { v10.h }[2], [x28]\n"
- "st1 { v14.h }[2], [x24]\n"
+ "st1 { v10.h }[2], [x9]\n"
+ "st1 { v14.h }[2], [x25]\n"
"b 96f\n"
"87:" // Height 2: Partial direct writeback: partial_1_16
"tbz x11, #0, 96f\n"
- "str h10, [x28, #0x0]\n"
- "str h14, [x24, #0x0]\n"
+ "str h10, [x9, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
"b 96f\n"
"88:" // Height 2: Partial direct writeback: partial_8_0
"tbz x11, #3, 92f\n"
- "st1 { v8.8h }, [x28], #0x10\n"
- "st1 { v12.8h }, [x24], #0x10\n"
+ "st1 { v8.8h }, [x9], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
"tbz x11, #2, 90f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
"tbz x11, #1, 89f\n"
- "st1 { v9.s }[2], [x28], #0x4\n"
- "st1 { v13.s }[2], [x24], #0x4\n"
+ "st1 { v9.s }[2], [x9], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
"tbz x11, #0, 96f\n"
- "st1 { v9.h }[6], [x28]\n"
- "st1 { v13.h }[6], [x24]\n"
+ "st1 { v9.h }[6], [x9]\n"
+ "st1 { v13.h }[6], [x25]\n"
"b 96f\n"
"89:" // Height 2: Partial direct writeback: partial_1_12
"tbz x11, #0, 96f\n"
- "st1 { v9.h }[4], [x28]\n"
- "st1 { v13.h }[4], [x24]\n"
+ "st1 { v9.h }[4], [x9]\n"
+ "st1 { v13.h }[4], [x25]\n"
"b 96f\n"
"90:" // Height 2: Partial direct writeback: partial_2_8
"tbz x11, #1, 91f\n"
- "str s9, [x28], #0x4\n"
- "str s13, [x24], #0x4\n"
+ "str s9, [x9], #0x4\n"
+ "str s13, [x25], #0x4\n"
"tbz x11, #0, 96f\n"
- "st1 { v9.h }[2], [x28]\n"
- "st1 { v13.h }[2], [x24]\n"
+ "st1 { v9.h }[2], [x9]\n"
+ "st1 { v13.h }[2], [x25]\n"
"b 96f\n"
"91:" // Height 2: Partial direct writeback: partial_1_8
"tbz x11, #0, 96f\n"
- "str h9, [x28, #0x0]\n"
- "str h13, [x24, #0x0]\n"
+ "str h9, [x9, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
"b 96f\n"
"92:" // Height 2: Partial direct writeback: partial_4_0
"tbz x11, #2, 94f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
"tbz x11, #1, 93f\n"
- "st1 { v8.s }[2], [x28], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
+ "st1 { v8.s }[2], [x9], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
"tbz x11, #0, 96f\n"
- "st1 { v8.h }[6], [x28]\n"
- "st1 { v12.h }[6], [x24]\n"
+ "st1 { v8.h }[6], [x9]\n"
+ "st1 { v12.h }[6], [x25]\n"
"b 96f\n"
"93:" // Height 2: Partial direct writeback: partial_1_4
"tbz x11, #0, 96f\n"
- "st1 { v8.h }[4], [x28]\n"
- "st1 { v12.h }[4], [x24]\n"
+ "st1 { v8.h }[4], [x9]\n"
+ "st1 { v12.h }[4], [x25]\n"
"b 96f\n"
"94:" // Height 2: Partial direct writeback: partial_2_0
"tbz x11, #1, 95f\n"
- "str s8, [x28], #0x4\n"
- "str s12, [x24], #0x4\n"
+ "str s8, [x9], #0x4\n"
+ "str s12, [x25], #0x4\n"
"tbz x11, #0, 96f\n"
- "st1 { v8.h }[2], [x28]\n"
- "st1 { v12.h }[2], [x24]\n"
+ "st1 { v8.h }[2], [x9]\n"
+ "st1 { v12.h }[2], [x25]\n"
"b 96f\n"
"95:" // Height 2: Partial direct writeback: partial_1_0
- "str h8, [x28, #0x0]\n"
- "str h12, [x24, #0x0]\n"
+ "str h8, [x9, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
"96:" // Height 2: Partial direct writeback: Done
"b 98f\n"
"97:" // Height 2: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
"98:" // Height 2: Writeback done
"subs x11, x11, #0x20\n"
"bgt 51b\n"
"b 296f\n"
"99:" // Height 3
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"100:" // Height 3: Column loop
- "cbz x9, 101f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 101f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "mov v16.16b, v8.16b\n"
- "ldr q10, [x9, #0x20]\n"
- "ldr q11, [x9, #0x30]\n"
"mov v13.16b, v9.16b\n"
- "add x9, x9, #0x40\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"b 120f\n"
"101:" // Height 3: no bias
"tbz %x[flags], #0, 119f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
"cmp x11, #0x20\n"
- "add x24, x28, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"bge 118f\n"
"tbz x11, #4, 109f\n"
- "ld1 { v8.8h }, [x28], #0x10\n"
- "ld1 { v12.8h }, [x24], #0x10\n"
- "ld1 { v16.8h }, [x23], #0x10\n"
- "ld1 { v9.8h }, [x28], #0x10\n"
- "ld1 { v13.8h }, [x24], #0x10\n"
- "ld1 { v17.8h }, [x23], #0x10\n"
+ "ld1 { v8.8h }, [x9], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v9.8h }, [x9], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
"tbz x11, #3, 105f\n"
- "ld1 { v10.8h }, [x28], #0x10\n"
- "ld1 { v14.8h }, [x24], #0x10\n"
- "ld1 { v18.8h }, [x23], #0x10\n"
+ "ld1 { v10.8h }, [x9], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
"tbz x11, #2, 103f\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d19, [x23], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
"tbz x11, #1, 102f\n"
- "mov x19, #0x3c\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
- "ld1 { v15.s }[2], [x24], #0x4\n"
- "ld1 { v19.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "mov x20, #0x3c\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
"tbz x11, #0, 117f\n"
- "ld1 { v11.h }[6], [x28]\n"
- "ld1 { v15.h }[6], [x24]\n"
- "ld1 { v19.h }[6], [x23]\n"
+ "ld1 { v11.h }[6], [x9]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
"b 117f\n"
"102:" // Height 3: Partial accumulate: partial_1_28
- "mov x19, #0x38\n"
+ "mov x20, #0x38\n"
"tbz x11, #0, 117f\n"
- "ld1 { v11.h }[4], [x28]\n"
- "ld1 { v15.h }[4], [x24]\n"
- "ld1 { v19.h }[4], [x23]\n"
+ "ld1 { v11.h }[4], [x9]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
"b 117f\n"
"103:" // Height 3: Partial accumulate: partial_2_24
"tbz x11, #1, 104f\n"
- "ldr s11, [x28], #0x4\n"
- "ldr s15, [x24], #0x4\n"
- "mov x19, #0x34\n"
- "ldr s19, [x23], #0x4\n"
+ "ldr s11, [x9], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x20, #0x34\n"
+ "ldr s19, [x24], #0x4\n"
"tbz x11, #0, 117f\n"
- "ld1 { v11.h }[2], [x28]\n"
- "ld1 { v15.h }[2], [x24]\n"
- "ld1 { v19.h }[2], [x23]\n"
+ "ld1 { v11.h }[2], [x9]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
"b 117f\n"
"104:" // Height 3: Partial accumulate: partial_1_24
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 117f\n"
- "ldr h11, [x28, #0x0]\n"
- "ldr h15, [x24, #0x0]\n"
- "ldr h19, [x23, #0x0]\n"
+ "ldr h11, [x9, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
"b 117f\n"
"105:" // Height 3: Partial accumulate: partial_4_16
"tbz x11, #2, 107f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "ldr d18, [x23], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
"tbz x11, #1, 106f\n"
- "mov x19, #0x2c\n"
- "ld1 { v10.s }[2], [x28], #0x4\n"
- "ld1 { v14.s }[2], [x24], #0x4\n"
- "ld1 { v18.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x9], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "mov x20, #0x2c\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
"tbz x11, #0, 117f\n"
- "ld1 { v10.h }[6], [x28]\n"
- "ld1 { v14.h }[6], [x24]\n"
- "ld1 { v18.h }[6], [x23]\n"
+ "ld1 { v10.h }[6], [x9]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
"b 117f\n"
"106:" // Height 3: Partial accumulate: partial_1_20
- "mov x19, #0x28\n"
+ "mov x20, #0x28\n"
"tbz x11, #0, 117f\n"
- "ld1 { v10.h }[4], [x28]\n"
- "ld1 { v14.h }[4], [x24]\n"
- "ld1 { v18.h }[4], [x23]\n"
+ "ld1 { v10.h }[4], [x9]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
"b 117f\n"
"107:" // Height 3: Partial accumulate: partial_2_16
"tbz x11, #1, 108f\n"
- "ldr s10, [x28], #0x4\n"
- "ldr s14, [x24], #0x4\n"
- "mov x19, #0x24\n"
- "ldr s18, [x23], #0x4\n"
+ "ldr s10, [x9], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x20, #0x24\n"
+ "ldr s18, [x24], #0x4\n"
"tbz x11, #0, 117f\n"
- "ld1 { v10.h }[2], [x28]\n"
- "ld1 { v14.h }[2], [x24]\n"
- "ld1 { v18.h }[2], [x23]\n"
+ "ld1 { v10.h }[2], [x9]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
"b 117f\n"
"108:" // Height 3: Partial accumulate: partial_1_16
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 117f\n"
- "ldr h10, [x28, #0x0]\n"
- "ldr h14, [x24, #0x0]\n"
- "ldr h18, [x23, #0x0]\n"
+ "ldr h10, [x9, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
"b 117f\n"
"109:" // Height 3: Partial accumulate: partial_8_0
"tbz x11, #3, 113f\n"
- "ld1 { v8.8h }, [x28], #0x10\n"
- "ld1 { v12.8h }, [x24], #0x10\n"
- "ld1 { v16.8h }, [x23], #0x10\n"
+ "ld1 { v8.8h }, [x9], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
"tbz x11, #2, 111f\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
"tbz x11, #1, 110f\n"
- "mov x19, #0x1c\n"
- "ld1 { v9.s }[2], [x28], #0x4\n"
- "ld1 { v13.s }[2], [x24], #0x4\n"
- "ld1 { v17.s }[2], [x23], #0x4\n"
+ "ld1 { v9.s }[2], [x9], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "mov x20, #0x1c\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
"tbz x11, #0, 117f\n"
- "ld1 { v9.h }[6], [x28]\n"
- "ld1 { v13.h }[6], [x24]\n"
- "ld1 { v17.h }[6], [x23]\n"
+ "ld1 { v9.h }[6], [x9]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
"b 117f\n"
"110:" // Height 3: Partial accumulate: partial_1_12
- "mov x19, #0x18\n"
+ "mov x20, #0x18\n"
"tbz x11, #0, 117f\n"
- "ld1 { v9.h }[4], [x28]\n"
- "ld1 { v13.h }[4], [x24]\n"
- "ld1 { v17.h }[4], [x23]\n"
+ "ld1 { v9.h }[4], [x9]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
"b 117f\n"
"111:" // Height 3: Partial accumulate: partial_2_8
"tbz x11, #1, 112f\n"
- "ldr s9, [x28], #0x4\n"
- "ldr s13, [x24], #0x4\n"
- "mov x19, #0x14\n"
- "ldr s17, [x23], #0x4\n"
+ "ldr s9, [x9], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x20, #0x14\n"
+ "ldr s17, [x24], #0x4\n"
"tbz x11, #0, 117f\n"
- "ld1 { v9.h }[2], [x28]\n"
- "ld1 { v13.h }[2], [x24]\n"
- "ld1 { v17.h }[2], [x23]\n"
+ "ld1 { v9.h }[2], [x9]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
"b 117f\n"
"112:" // Height 3: Partial accumulate: partial_1_8
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 117f\n"
- "ldr h9, [x28, #0x0]\n"
- "ldr h13, [x24, #0x0]\n"
- "ldr h17, [x23, #0x0]\n"
+ "ldr h9, [x9, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
"b 117f\n"
"113:" // Height 3: Partial accumulate: partial_4_0
"tbz x11, #2, 115f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "ldr d16, [x23], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
"tbz x11, #1, 114f\n"
- "mov x19, #0xc\n"
- "ld1 { v8.s }[2], [x28], #0x4\n"
- "ld1 { v12.s }[2], [x24], #0x4\n"
- "ld1 { v16.s }[2], [x23], #0x4\n"
+ "ld1 { v8.s }[2], [x9], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "mov x20, #0xc\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
"tbz x11, #0, 117f\n"
- "ld1 { v8.h }[6], [x28]\n"
- "ld1 { v12.h }[6], [x24]\n"
- "ld1 { v16.h }[6], [x23]\n"
+ "ld1 { v8.h }[6], [x9]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
"b 117f\n"
"114:" // Height 3: Partial accumulate: partial_1_4
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"tbz x11, #0, 117f\n"
- "ld1 { v8.h }[4], [x28]\n"
- "ld1 { v12.h }[4], [x24]\n"
- "ld1 { v16.h }[4], [x23]\n"
+ "ld1 { v8.h }[4], [x9]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
"b 117f\n"
"115:" // Height 3: Partial accumulate: partial_2_0
"tbz x11, #1, 116f\n"
- "ldr s8, [x28], #0x4\n"
- "ldr s12, [x24], #0x4\n"
- "mov x19, #0x4\n"
- "ldr s16, [x23], #0x4\n"
+ "ldr s8, [x9], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x20, #0x4\n"
+ "ldr s16, [x24], #0x4\n"
"tbz x11, #0, 117f\n"
- "ld1 { v8.h }[2], [x28]\n"
- "ld1 { v12.h }[2], [x24]\n"
- "ld1 { v16.h }[2], [x23]\n"
+ "ld1 { v8.h }[2], [x9]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
"b 117f\n"
"116:" // Height 3: Partial accumulate: partial_1_0
- "ldr h8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr h12, [x24, #0x0]\n"
- "ldr h16, [x23, #0x0]\n"
+ "ldr h8, [x9, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr h16, [x24, #0x0]\n"
"117:" // Height 3: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 120f\n"
"118:" // Height 3: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
"b 120f\n"
"119:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
@@ -1391,815 +1388,815 @@ void a64_hybrid_fp16_mla_6x32 (
"movi v18.16b, #0x0\n"
"movi v19.16b, #0x0\n"
"120:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"121:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 122f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "cbnz x27, 123f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 123f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
"b 123f\n"
"122:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"123:" // Height 3: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"blt 126f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x10\n"
- "ldr q2, [x23, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q2, [x24, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 125f\n"
"124:" // Height 3: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "ldr q21, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x26, x26, #0x8\n"
+ "add x25, x25, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "cmp x26, #0x10\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v10.8h, v21.8h, v0.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v18.8h, v21.8h, v2.h[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ "fmla v11.8h, v20.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v2.h[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v8.8h, v21.8h, v0.h[1]\n"
+ "fmla v12.8h, v21.8h, v1.h[1]\n"
+ "fmla v16.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ "fmla v9.8h, v20.8h, v0.h[1]\n"
+ "fmla v13.8h, v20.8h, v1.h[1]\n"
+ "fmla v17.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ "fmla v10.8h, v21.8h, v0.h[1]\n"
+ "fmla v14.8h, v21.8h, v1.h[1]\n"
+ "fmla v18.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ "fmla v11.8h, v20.8h, v0.h[1]\n"
+ "fmla v15.8h, v20.8h, v1.h[1]\n"
+ "fmla v19.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ "fmla v8.8h, v21.8h, v0.h[2]\n"
+ "fmla v12.8h, v21.8h, v1.h[2]\n"
+ "fmla v16.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ "fmla v9.8h, v20.8h, v0.h[2]\n"
+ "fmla v13.8h, v20.8h, v1.h[2]\n"
+ "fmla v17.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ "fmla v10.8h, v21.8h, v0.h[2]\n"
+ "fmla v14.8h, v21.8h, v1.h[2]\n"
+ "fmla v18.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ "fmla v11.8h, v20.8h, v0.h[2]\n"
+ "fmla v15.8h, v20.8h, v1.h[2]\n"
+ "fmla v19.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ "fmla v8.8h, v21.8h, v0.h[3]\n"
+ "fmla v12.8h, v21.8h, v1.h[3]\n"
+ "fmla v16.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ "fmla v9.8h, v20.8h, v0.h[3]\n"
+ "fmla v13.8h, v20.8h, v1.h[3]\n"
+ "fmla v17.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
+ "fmla v10.8h, v21.8h, v0.h[3]\n"
+ "fmla v14.8h, v21.8h, v1.h[3]\n"
+ "fmla v18.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0x100]\n"
+ "fmla v11.8h, v20.8h, v0.h[3]\n"
+ "fmla v15.8h, v20.8h, v1.h[3]\n"
+ "fmla v19.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x10, #0x110]\n"
+ "fmla v8.8h, v21.8h, v0.h[4]\n"
+ "fmla v12.8h, v21.8h, v1.h[4]\n"
+ "fmla v16.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x10, #0x120]\n"
+ "fmla v9.8h, v20.8h, v0.h[4]\n"
+ "fmla v13.8h, v20.8h, v1.h[4]\n"
+ "fmla v17.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x10, #0x130]\n"
+ "fmla v10.8h, v21.8h, v0.h[4]\n"
+ "fmla v14.8h, v21.8h, v1.h[4]\n"
+ "fmla v18.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x10, #0x140]\n"
+ "fmla v11.8h, v20.8h, v0.h[4]\n"
+ "fmla v15.8h, v20.8h, v1.h[4]\n"
+ "fmla v19.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x10, #0x150]\n"
+ "fmla v8.8h, v21.8h, v0.h[5]\n"
+ "fmla v12.8h, v21.8h, v1.h[5]\n"
+ "fmla v16.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x10, #0x160]\n"
+ "fmla v9.8h, v20.8h, v0.h[5]\n"
+ "fmla v13.8h, v20.8h, v1.h[5]\n"
+ "fmla v17.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x10, #0x170]\n"
+ "fmla v10.8h, v21.8h, v0.h[5]\n"
+ "fmla v14.8h, v21.8h, v1.h[5]\n"
+ "fmla v18.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x10, #0x180]\n"
+ "fmla v11.8h, v20.8h, v0.h[5]\n"
+ "fmla v15.8h, v20.8h, v1.h[5]\n"
+ "fmla v19.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x10, #0x190]\n"
+ "fmla v8.8h, v21.8h, v0.h[6]\n"
+ "fmla v12.8h, v21.8h, v1.h[6]\n"
+ "fmla v16.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x10, #0x1a0]\n"
+ "fmla v9.8h, v20.8h, v0.h[6]\n"
+ "fmla v13.8h, v20.8h, v1.h[6]\n"
+ "fmla v17.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x10, #0x1b0]\n"
+ "fmla v10.8h, v21.8h, v0.h[6]\n"
+ "fmla v14.8h, v21.8h, v1.h[6]\n"
+ "fmla v18.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x10, #0x1c0]\n"
+ "fmla v11.8h, v20.8h, v0.h[6]\n"
+ "fmla v15.8h, v20.8h, v1.h[6]\n"
+ "fmla v19.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x10, #0x1d0]\n"
+ "fmla v8.8h, v21.8h, v0.h[7]\n"
+ "fmla v12.8h, v21.8h, v1.h[7]\n"
+ "fmla v16.8h, v21.8h, v2.h[7]\n"
+ "ldr q21, [x10, #0x1e0]\n"
+ "fmla v9.8h, v20.8h, v0.h[7]\n"
+ "fmla v13.8h, v20.8h, v1.h[7]\n"
+ "fmla v17.8h, v20.8h, v2.h[7]\n"
+ "ldr q20, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v10.8h, v21.8h, v0.h[7]\n"
+ "fmla v14.8h, v21.8h, v1.h[7]\n"
+ "fmla v18.8h, v21.8h, v2.h[7]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "ldr q0, [x25, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr q1, [x24, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "ldr q2, [x23, #0x0]\n"
+ "fmla v11.8h, v20.8h, v0.h[7]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.8h, v20.8h, v1.h[7]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.8h, v20.8h, v2.h[7]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 124b\n"
"125:" // Height 3: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x8\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "add x24, x24, #0x10\n"
+ "ldr q21, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "sub x27, x27, #0x8\n"
+ "fmla v10.8h, v21.8h, v0.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v18.8h, v21.8h, v2.h[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ "fmla v11.8h, v20.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v2.h[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ "fmla v8.8h, v21.8h, v0.h[1]\n"
+ "fmla v12.8h, v21.8h, v1.h[1]\n"
+ "fmla v16.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ "fmla v9.8h, v20.8h, v0.h[1]\n"
+ "fmla v13.8h, v20.8h, v1.h[1]\n"
+ "fmla v17.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ "fmla v10.8h, v21.8h, v0.h[1]\n"
+ "fmla v14.8h, v21.8h, v1.h[1]\n"
+ "fmla v18.8h, v21.8h, v2.h[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ "fmla v11.8h, v20.8h, v0.h[1]\n"
+ "fmla v15.8h, v20.8h, v1.h[1]\n"
+ "fmla v19.8h, v20.8h, v2.h[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ "fmla v8.8h, v21.8h, v0.h[2]\n"
+ "fmla v12.8h, v21.8h, v1.h[2]\n"
+ "fmla v16.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ "fmla v9.8h, v20.8h, v0.h[2]\n"
+ "fmla v13.8h, v20.8h, v1.h[2]\n"
+ "fmla v17.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ "fmla v10.8h, v21.8h, v0.h[2]\n"
+ "fmla v14.8h, v21.8h, v1.h[2]\n"
+ "fmla v18.8h, v21.8h, v2.h[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ "fmla v11.8h, v20.8h, v0.h[2]\n"
+ "fmla v15.8h, v20.8h, v1.h[2]\n"
+ "fmla v19.8h, v20.8h, v2.h[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ "fmla v8.8h, v21.8h, v0.h[3]\n"
+ "fmla v12.8h, v21.8h, v1.h[3]\n"
+ "fmla v16.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ "fmla v9.8h, v20.8h, v0.h[3]\n"
+ "fmla v13.8h, v20.8h, v1.h[3]\n"
+ "fmla v17.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
+ "fmla v10.8h, v21.8h, v0.h[3]\n"
+ "fmla v14.8h, v21.8h, v1.h[3]\n"
+ "fmla v18.8h, v21.8h, v2.h[3]\n"
+ "ldr q21, [x10, #0x100]\n"
+ "fmla v11.8h, v20.8h, v0.h[3]\n"
+ "fmla v15.8h, v20.8h, v1.h[3]\n"
+ "fmla v19.8h, v20.8h, v2.h[3]\n"
+ "ldr q20, [x10, #0x110]\n"
+ "fmla v8.8h, v21.8h, v0.h[4]\n"
+ "fmla v12.8h, v21.8h, v1.h[4]\n"
+ "fmla v16.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x10, #0x120]\n"
+ "fmla v9.8h, v20.8h, v0.h[4]\n"
+ "fmla v13.8h, v20.8h, v1.h[4]\n"
+ "fmla v17.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x10, #0x130]\n"
+ "fmla v10.8h, v21.8h, v0.h[4]\n"
+ "fmla v14.8h, v21.8h, v1.h[4]\n"
+ "fmla v18.8h, v21.8h, v2.h[4]\n"
+ "ldr q21, [x10, #0x140]\n"
+ "fmla v11.8h, v20.8h, v0.h[4]\n"
+ "fmla v15.8h, v20.8h, v1.h[4]\n"
+ "fmla v19.8h, v20.8h, v2.h[4]\n"
+ "ldr q20, [x10, #0x150]\n"
+ "fmla v8.8h, v21.8h, v0.h[5]\n"
+ "fmla v12.8h, v21.8h, v1.h[5]\n"
+ "fmla v16.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x10, #0x160]\n"
+ "fmla v9.8h, v20.8h, v0.h[5]\n"
+ "fmla v13.8h, v20.8h, v1.h[5]\n"
+ "fmla v17.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x10, #0x170]\n"
+ "fmla v10.8h, v21.8h, v0.h[5]\n"
+ "fmla v14.8h, v21.8h, v1.h[5]\n"
+ "fmla v18.8h, v21.8h, v2.h[5]\n"
+ "ldr q21, [x10, #0x180]\n"
+ "fmla v11.8h, v20.8h, v0.h[5]\n"
+ "fmla v15.8h, v20.8h, v1.h[5]\n"
+ "fmla v19.8h, v20.8h, v2.h[5]\n"
+ "ldr q20, [x10, #0x190]\n"
+ "fmla v8.8h, v21.8h, v0.h[6]\n"
+ "fmla v12.8h, v21.8h, v1.h[6]\n"
+ "fmla v16.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x10, #0x1a0]\n"
+ "fmla v9.8h, v20.8h, v0.h[6]\n"
+ "fmla v13.8h, v20.8h, v1.h[6]\n"
+ "fmla v17.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x10, #0x1b0]\n"
+ "fmla v10.8h, v21.8h, v0.h[6]\n"
+ "fmla v14.8h, v21.8h, v1.h[6]\n"
+ "fmla v18.8h, v21.8h, v2.h[6]\n"
+ "ldr q21, [x10, #0x1c0]\n"
+ "fmla v11.8h, v20.8h, v0.h[6]\n"
+ "fmla v15.8h, v20.8h, v1.h[6]\n"
+ "fmla v19.8h, v20.8h, v2.h[6]\n"
+ "ldr q20, [x10, #0x1d0]\n"
+ "fmla v8.8h, v21.8h, v0.h[7]\n"
+ "fmla v12.8h, v21.8h, v1.h[7]\n"
+ "fmla v16.8h, v21.8h, v2.h[7]\n"
+ "ldr q21, [x10, #0x1e0]\n"
+ "fmla v9.8h, v20.8h, v0.h[7]\n"
+ "fmla v13.8h, v20.8h, v1.h[7]\n"
+ "fmla v17.8h, v20.8h, v2.h[7]\n"
+ "ldr q20, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v10.8h, v21.8h, v0.h[7]\n"
+ "fmla v14.8h, v21.8h, v1.h[7]\n"
+ "fmla v18.8h, v21.8h, v2.h[7]\n"
+ "fmla v11.8h, v20.8h, v0.h[7]\n"
+ "fmla v15.8h, v20.8h, v1.h[7]\n"
+ "fmla v19.8h, v20.8h, v2.h[7]\n"
"126:" // Height 3: Multiply loop: Main loop skip
- "cbz x26, 128f\n"
+ "cbz x27, 128f\n"
"127:" // Height 3: Multiply loop: Odd block loop
- "ldr h0, [x25], #0x2\n"
- "sub x26, x26, #0x1\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "sub x27, x27, #0x1\n"
+ "ldr h0, [x24], #0x2\n"
+ "ldr q21, [x10, #0x0]\n"
+ "fmla v8.8h, v21.8h, v2.h[0]\n"
+ "fmla v12.8h, v21.8h, v1.h[0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ "fmla v16.8h, v21.8h, v0.h[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "fmla v9.8h, v20.8h, v2.h[0]\n"
+ "fmla v13.8h, v20.8h, v1.h[0]\n"
+ "fmla v17.8h, v20.8h, v0.h[0]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "cbnz x26, 127b\n"
+ "fmla v10.8h, v21.8h, v2.h[0]\n"
+ "fmla v14.8h, v21.8h, v1.h[0]\n"
+ "fmla v18.8h, v21.8h, v0.h[0]\n"
+ "fmla v11.8h, v20.8h, v2.h[0]\n"
+ "fmla v15.8h, v20.8h, v1.h[0]\n"
+ "fmla v19.8h, v20.8h, v0.h[0]\n"
+ "cbnz x27, 127b\n"
"128:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 121b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #1\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 129f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.8h }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.8h }, [x19]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmax v8.8h, v8.8h, v1.8h\n"
- "fmax v9.8h, v9.8h, v1.8h\n"
- "fmax v10.8h, v10.8h, v1.8h\n"
- "fmax v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v0.8h\n"
- "fmin v13.8h, v13.8h, v0.8h\n"
- "fmin v14.8h, v14.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v1.8h\n"
- "fmax v13.8h, v13.8h, v1.8h\n"
- "fmax v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v0.8h\n"
- "fmin v16.8h, v16.8h, v0.8h\n"
- "fmin v17.8h, v17.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v1.8h\n"
- "fmax v16.8h, v16.8h, v1.8h\n"
- "fmax v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v0.8h\n"
- "fmin v19.8h, v19.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v1.8h\n"
- "fmax v19.8h, v19.8h, v1.8h\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v21.8h }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v20.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v21.8h\n"
+ "fmin v9.8h, v9.8h, v21.8h\n"
+ "fmin v10.8h, v10.8h, v21.8h\n"
+ "fmin v11.8h, v11.8h, v21.8h\n"
+ "fmin v12.8h, v12.8h, v21.8h\n"
+ "fmin v13.8h, v13.8h, v21.8h\n"
+ "fmin v14.8h, v14.8h, v21.8h\n"
+ "fmin v15.8h, v15.8h, v21.8h\n"
+ "fmin v16.8h, v16.8h, v21.8h\n"
+ "fmin v17.8h, v17.8h, v21.8h\n"
+ "fmin v18.8h, v18.8h, v21.8h\n"
+ "fmin v19.8h, v19.8h, v21.8h\n"
+ "fmax v8.8h, v8.8h, v20.8h\n"
+ "fmax v9.8h, v9.8h, v20.8h\n"
+ "fmax v10.8h, v10.8h, v20.8h\n"
+ "fmax v11.8h, v11.8h, v20.8h\n"
+ "fmax v12.8h, v12.8h, v20.8h\n"
+ "fmax v13.8h, v13.8h, v20.8h\n"
+ "fmax v14.8h, v14.8h, v20.8h\n"
+ "fmax v15.8h, v15.8h, v20.8h\n"
+ "fmax v16.8h, v16.8h, v20.8h\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
+ "fmax v18.8h, v18.8h, v20.8h\n"
+ "fmax v19.8h, v19.8h, v20.8h\n"
"129:" // Height 3: No activation
"cmp x11, #0x20\n"
"bge 146f\n"
"tbz x11, #4, 137f\n"
- "st1 { v8.8h }, [x28], #0x10\n"
- "st1 { v9.8h }, [x28], #0x10\n"
- "st1 { v12.8h }, [x24], #0x10\n"
- "st1 { v13.8h }, [x24], #0x10\n"
- "st1 { v16.8h }, [x23], #0x10\n"
- "st1 { v17.8h }, [x23], #0x10\n"
+ "st1 { v8.8h }, [x9], #0x10\n"
+ "st1 { v9.8h }, [x9], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
"tbz x11, #3, 133f\n"
- "st1 { v10.8h }, [x28], #0x10\n"
- "st1 { v14.8h }, [x24], #0x10\n"
- "st1 { v18.8h }, [x23], #0x10\n"
+ "st1 { v10.8h }, [x9], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
"tbz x11, #2, 131f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
"tbz x11, #1, 130f\n"
- "st1 { v11.s }[2], [x28], #0x4\n"
- "st1 { v15.s }[2], [x24], #0x4\n"
- "st1 { v19.s }[2], [x23], #0x4\n"
+ "st1 { v11.s }[2], [x9], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
"tbz x11, #0, 145f\n"
- "st1 { v11.h }[6], [x28]\n"
- "st1 { v15.h }[6], [x24]\n"
- "st1 { v19.h }[6], [x23]\n"
+ "st1 { v11.h }[6], [x9]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
"b 145f\n"
"130:" // Height 3: Partial direct writeback: partial_1_28
"tbz x11, #0, 145f\n"
- "st1 { v11.h }[4], [x28]\n"
- "st1 { v15.h }[4], [x24]\n"
- "st1 { v19.h }[4], [x23]\n"
+ "st1 { v11.h }[4], [x9]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
"b 145f\n"
"131:" // Height 3: Partial direct writeback: partial_2_24
"tbz x11, #1, 132f\n"
- "str s11, [x28], #0x4\n"
- "str s15, [x24], #0x4\n"
- "str s19, [x23], #0x4\n"
+ "str s11, [x9], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
"tbz x11, #0, 145f\n"
- "st1 { v11.h }[2], [x28]\n"
- "st1 { v15.h }[2], [x24]\n"
- "st1 { v19.h }[2], [x23]\n"
+ "st1 { v11.h }[2], [x9]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
"b 145f\n"
"132:" // Height 3: Partial direct writeback: partial_1_24
"tbz x11, #0, 145f\n"
- "str h11, [x28, #0x0]\n"
- "str h15, [x24, #0x0]\n"
- "str h19, [x23, #0x0]\n"
+ "str h11, [x9, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
"b 145f\n"
"133:" // Height 3: Partial direct writeback: partial_4_16
"tbz x11, #2, 135f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
"tbz x11, #1, 134f\n"
- "st1 { v10.s }[2], [x28], #0x4\n"
- "st1 { v14.s }[2], [x24], #0x4\n"
- "st1 { v18.s }[2], [x23], #0x4\n"
+ "st1 { v10.s }[2], [x9], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
"tbz x11, #0, 145f\n"
- "st1 { v10.h }[6], [x28]\n"
- "st1 { v14.h }[6], [x24]\n"
- "st1 { v18.h }[6], [x23]\n"
+ "st1 { v10.h }[6], [x9]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
"b 145f\n"
"134:" // Height 3: Partial direct writeback: partial_1_20
"tbz x11, #0, 145f\n"
- "st1 { v10.h }[4], [x28]\n"
- "st1 { v14.h }[4], [x24]\n"
- "st1 { v18.h }[4], [x23]\n"
+ "st1 { v10.h }[4], [x9]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
"b 145f\n"
"135:" // Height 3: Partial direct writeback: partial_2_16
"tbz x11, #1, 136f\n"
- "str s10, [x28], #0x4\n"
- "str s14, [x24], #0x4\n"
- "str s18, [x23], #0x4\n"
+ "str s10, [x9], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
"tbz x11, #0, 145f\n"
- "st1 { v10.h }[2], [x28]\n"
- "st1 { v14.h }[2], [x24]\n"
- "st1 { v18.h }[2], [x23]\n"
+ "st1 { v10.h }[2], [x9]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
"b 145f\n"
"136:" // Height 3: Partial direct writeback: partial_1_16
"tbz x11, #0, 145f\n"
- "str h10, [x28, #0x0]\n"
- "str h14, [x24, #0x0]\n"
- "str h18, [x23, #0x0]\n"
+ "str h10, [x9, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
"b 145f\n"
"137:" // Height 3: Partial direct writeback: partial_8_0
"tbz x11, #3, 141f\n"
- "st1 { v8.8h }, [x28], #0x10\n"
- "st1 { v12.8h }, [x24], #0x10\n"
- "st1 { v16.8h }, [x23], #0x10\n"
+ "st1 { v8.8h }, [x9], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
"tbz x11, #2, 139f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
"tbz x11, #1, 138f\n"
- "st1 { v9.s }[2], [x28], #0x4\n"
- "st1 { v13.s }[2], [x24], #0x4\n"
- "st1 { v17.s }[2], [x23], #0x4\n"
+ "st1 { v9.s }[2], [x9], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
"tbz x11, #0, 145f\n"
- "st1 { v9.h }[6], [x28]\n"
- "st1 { v13.h }[6], [x24]\n"
- "st1 { v17.h }[6], [x23]\n"
+ "st1 { v9.h }[6], [x9]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
"b 145f\n"
"138:" // Height 3: Partial direct writeback: partial_1_12
"tbz x11, #0, 145f\n"
- "st1 { v9.h }[4], [x28]\n"
- "st1 { v13.h }[4], [x24]\n"
- "st1 { v17.h }[4], [x23]\n"
+ "st1 { v9.h }[4], [x9]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
"b 145f\n"
"139:" // Height 3: Partial direct writeback: partial_2_8
"tbz x11, #1, 140f\n"
- "str s9, [x28], #0x4\n"
- "str s13, [x24], #0x4\n"
- "str s17, [x23], #0x4\n"
+ "str s9, [x9], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
"tbz x11, #0, 145f\n"
- "st1 { v9.h }[2], [x28]\n"
- "st1 { v13.h }[2], [x24]\n"
- "st1 { v17.h }[2], [x23]\n"
+ "st1 { v9.h }[2], [x9]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
"b 145f\n"
"140:" // Height 3: Partial direct writeback: partial_1_8
"tbz x11, #0, 145f\n"
- "str h9, [x28, #0x0]\n"
- "str h13, [x24, #0x0]\n"
- "str h17, [x23, #0x0]\n"
+ "str h9, [x9, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
"b 145f\n"
"141:" // Height 3: Partial direct writeback: partial_4_0
"tbz x11, #2, 143f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x11, #1, 142f\n"
- "st1 { v8.s }[2], [x28], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
- "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v8.s }[2], [x9], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
"tbz x11, #0, 145f\n"
- "st1 { v8.h }[6], [x28]\n"
- "st1 { v12.h }[6], [x24]\n"
- "st1 { v16.h }[6], [x23]\n"
+ "st1 { v8.h }[6], [x9]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
"b 145f\n"
"142:" // Height 3: Partial direct writeback: partial_1_4
"tbz x11, #0, 145f\n"
- "st1 { v8.h }[4], [x28]\n"
- "st1 { v12.h }[4], [x24]\n"
- "st1 { v16.h }[4], [x23]\n"
+ "st1 { v8.h }[4], [x9]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
"b 145f\n"
"143:" // Height 3: Partial direct writeback: partial_2_0
"tbz x11, #1, 144f\n"
- "str s8, [x28], #0x4\n"
- "str s12, [x24], #0x4\n"
- "str s16, [x23], #0x4\n"
+ "str s8, [x9], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
"tbz x11, #0, 145f\n"
- "st1 { v8.h }[2], [x28]\n"
- "st1 { v12.h }[2], [x24]\n"
- "st1 { v16.h }[2], [x23]\n"
+ "st1 { v8.h }[2], [x9]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
"b 145f\n"
"144:" // Height 3: Partial direct writeback: partial_1_0
- "str h8, [x28, #0x0]\n"
- "str h12, [x24, #0x0]\n"
- "str h16, [x23, #0x0]\n"
+ "str h8, [x9, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
"145:" // Height 3: Partial direct writeback: Done
"b 147f\n"
"146:" // Height 3: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
"147:" // Height 3: Writeback done
"subs x11, x11, #0x20\n"
"bgt 100b\n"
"b 296f\n"
"148:" // Height 4
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"149:" // Height 4: Column loop
- "cbz x9, 150f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 150f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "mov v16.16b, v8.16b\n"
- "ldr q10, [x9, #0x20]\n"
- "mov v20.16b, v8.16b\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
"b 169f\n"
"150:" // Height 4: no bias
"tbz %x[flags], #0, 168f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"cmp x11, #0x20\n"
- "add x24, x28, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"bge 167f\n"
"tbz x11, #4, 158f\n"
- "ld1 { v8.8h }, [x28], #0x10\n"
- "ld1 { v12.8h }, [x24], #0x10\n"
- "ld1 { v16.8h }, [x23], #0x10\n"
- "ld1 { v20.8h }, [x22], #0x10\n"
- "ld1 { v9.8h }, [x28], #0x10\n"
- "ld1 { v13.8h }, [x24], #0x10\n"
- "ld1 { v17.8h }, [x23], #0x10\n"
- "ld1 { v21.8h }, [x22], #0x10\n"
+ "ld1 { v8.8h }, [x9], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v9.8h }, [x9], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
"tbz x11, #3, 154f\n"
- "ld1 { v10.8h }, [x28], #0x10\n"
- "ld1 { v14.8h }, [x24], #0x10\n"
- "ld1 { v18.8h }, [x23], #0x10\n"
- "ld1 { v22.8h }, [x22], #0x10\n"
+ "ld1 { v10.8h }, [x9], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
"tbz x11, #2, 152f\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"tbz x11, #1, 151f\n"
- "mov x19, #0x3c\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
- "ld1 { v15.s }[2], [x24], #0x4\n"
- "ld1 { v19.s }[2], [x23], #0x4\n"
- "ld1 { v23.s }[2], [x22], #0x4\n"
+ "ld1 { v11.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "mov x20, #0x3c\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
"tbz x11, #0, 166f\n"
- "ld1 { v11.h }[6], [x28]\n"
- "ld1 { v15.h }[6], [x24]\n"
- "ld1 { v19.h }[6], [x23]\n"
- "ld1 { v23.h }[6], [x22]\n"
+ "ld1 { v11.h }[6], [x9]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
"b 166f\n"
"151:" // Height 4: Partial accumulate: partial_1_28
- "mov x19, #0x38\n"
+ "mov x20, #0x38\n"
"tbz x11, #0, 166f\n"
- "ld1 { v11.h }[4], [x28]\n"
- "ld1 { v15.h }[4], [x24]\n"
- "ld1 { v19.h }[4], [x23]\n"
- "ld1 { v23.h }[4], [x22]\n"
+ "ld1 { v11.h }[4], [x9]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
"b 166f\n"
"152:" // Height 4: Partial accumulate: partial_2_24
"tbz x11, #1, 153f\n"
- "ldr s11, [x28], #0x4\n"
- "ldr s15, [x24], #0x4\n"
- "mov x19, #0x34\n"
- "ldr s19, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
+ "ldr s11, [x9], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x20, #0x34\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
"tbz x11, #0, 166f\n"
- "ld1 { v11.h }[2], [x28]\n"
- "ld1 { v15.h }[2], [x24]\n"
- "ld1 { v19.h }[2], [x23]\n"
- "ld1 { v23.h }[2], [x22]\n"
+ "ld1 { v11.h }[2], [x9]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
"b 166f\n"
"153:" // Height 4: Partial accumulate: partial_1_24
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 166f\n"
- "ldr h11, [x28, #0x0]\n"
- "ldr h15, [x24, #0x0]\n"
- "ldr h19, [x23, #0x0]\n"
- "ldr h23, [x22, #0x0]\n"
+ "ldr h11, [x9, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
"b 166f\n"
"154:" // Height 4: Partial accumulate: partial_4_16
"tbz x11, #2, 156f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"tbz x11, #1, 155f\n"
- "mov x19, #0x2c\n"
- "ld1 { v10.s }[2], [x28], #0x4\n"
- "ld1 { v14.s }[2], [x24], #0x4\n"
- "ld1 { v18.s }[2], [x23], #0x4\n"
- "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x9], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "mov x20, #0x2c\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
"tbz x11, #0, 166f\n"
- "ld1 { v10.h }[6], [x28]\n"
- "ld1 { v14.h }[6], [x24]\n"
- "ld1 { v18.h }[6], [x23]\n"
- "ld1 { v22.h }[6], [x22]\n"
+ "ld1 { v10.h }[6], [x9]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
"b 166f\n"
"155:" // Height 4: Partial accumulate: partial_1_20
- "mov x19, #0x28\n"
+ "mov x20, #0x28\n"
"tbz x11, #0, 166f\n"
- "ld1 { v10.h }[4], [x28]\n"
- "ld1 { v14.h }[4], [x24]\n"
- "ld1 { v18.h }[4], [x23]\n"
- "ld1 { v22.h }[4], [x22]\n"
+ "ld1 { v10.h }[4], [x9]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
"b 166f\n"
"156:" // Height 4: Partial accumulate: partial_2_16
"tbz x11, #1, 157f\n"
- "ldr s10, [x28], #0x4\n"
- "ldr s14, [x24], #0x4\n"
- "mov x19, #0x24\n"
- "ldr s18, [x23], #0x4\n"
- "ldr s22, [x22], #0x4\n"
+ "ldr s10, [x9], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x20, #0x24\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
"tbz x11, #0, 166f\n"
- "ld1 { v10.h }[2], [x28]\n"
- "ld1 { v14.h }[2], [x24]\n"
- "ld1 { v18.h }[2], [x23]\n"
- "ld1 { v22.h }[2], [x22]\n"
+ "ld1 { v10.h }[2], [x9]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
"b 166f\n"
"157:" // Height 4: Partial accumulate: partial_1_16
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 166f\n"
- "ldr h10, [x28, #0x0]\n"
- "ldr h14, [x24, #0x0]\n"
- "ldr h18, [x23, #0x0]\n"
- "ldr h22, [x22, #0x0]\n"
+ "ldr h10, [x9, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
"b 166f\n"
"158:" // Height 4: Partial accumulate: partial_8_0
"tbz x11, #3, 162f\n"
- "ld1 { v8.8h }, [x28], #0x10\n"
- "ld1 { v12.8h }, [x24], #0x10\n"
- "ld1 { v16.8h }, [x23], #0x10\n"
- "ld1 { v20.8h }, [x22], #0x10\n"
+ "ld1 { v8.8h }, [x9], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
"tbz x11, #2, 160f\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
"tbz x11, #1, 159f\n"
- "mov x19, #0x1c\n"
- "ld1 { v9.s }[2], [x28], #0x4\n"
- "ld1 { v13.s }[2], [x24], #0x4\n"
- "ld1 { v17.s }[2], [x23], #0x4\n"
- "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v9.s }[2], [x9], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "mov x20, #0x1c\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
"tbz x11, #0, 166f\n"
- "ld1 { v9.h }[6], [x28]\n"
- "ld1 { v13.h }[6], [x24]\n"
- "ld1 { v17.h }[6], [x23]\n"
- "ld1 { v21.h }[6], [x22]\n"
+ "ld1 { v9.h }[6], [x9]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
"b 166f\n"
"159:" // Height 4: Partial accumulate: partial_1_12
- "mov x19, #0x18\n"
+ "mov x20, #0x18\n"
"tbz x11, #0, 166f\n"
- "ld1 { v9.h }[4], [x28]\n"
- "ld1 { v13.h }[4], [x24]\n"
- "ld1 { v17.h }[4], [x23]\n"
- "ld1 { v21.h }[4], [x22]\n"
+ "ld1 { v9.h }[4], [x9]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
"b 166f\n"
"160:" // Height 4: Partial accumulate: partial_2_8
"tbz x11, #1, 161f\n"
- "ldr s9, [x28], #0x4\n"
- "ldr s13, [x24], #0x4\n"
- "mov x19, #0x14\n"
- "ldr s17, [x23], #0x4\n"
- "ldr s21, [x22], #0x4\n"
+ "ldr s9, [x9], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x20, #0x14\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
"tbz x11, #0, 166f\n"
- "ld1 { v9.h }[2], [x28]\n"
- "ld1 { v13.h }[2], [x24]\n"
- "ld1 { v17.h }[2], [x23]\n"
- "ld1 { v21.h }[2], [x22]\n"
+ "ld1 { v9.h }[2], [x9]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
"b 166f\n"
"161:" // Height 4: Partial accumulate: partial_1_8
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 166f\n"
- "ldr h9, [x28, #0x0]\n"
- "ldr h13, [x24, #0x0]\n"
- "ldr h17, [x23, #0x0]\n"
- "ldr h21, [x22, #0x0]\n"
+ "ldr h9, [x9, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
"b 166f\n"
"162:" // Height 4: Partial accumulate: partial_4_0
"tbz x11, #2, 164f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "ldr d16, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"tbz x11, #1, 163f\n"
- "mov x19, #0xc\n"
- "ld1 { v8.s }[2], [x28], #0x4\n"
- "ld1 { v12.s }[2], [x24], #0x4\n"
- "ld1 { v16.s }[2], [x23], #0x4\n"
- "ld1 { v20.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x9], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "mov x20, #0xc\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
"tbz x11, #0, 166f\n"
- "ld1 { v8.h }[6], [x28]\n"
- "ld1 { v12.h }[6], [x24]\n"
- "ld1 { v16.h }[6], [x23]\n"
- "ld1 { v20.h }[6], [x22]\n"
+ "ld1 { v8.h }[6], [x9]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
"b 166f\n"
"163:" // Height 4: Partial accumulate: partial_1_4
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"tbz x11, #0, 166f\n"
- "ld1 { v8.h }[4], [x28]\n"
- "ld1 { v12.h }[4], [x24]\n"
- "ld1 { v16.h }[4], [x23]\n"
- "ld1 { v20.h }[4], [x22]\n"
+ "ld1 { v8.h }[4], [x9]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
"b 166f\n"
"164:" // Height 4: Partial accumulate: partial_2_0
"tbz x11, #1, 165f\n"
- "ldr s8, [x28], #0x4\n"
- "ldr s12, [x24], #0x4\n"
- "mov x19, #0x4\n"
- "ldr s16, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
+ "ldr s8, [x9], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x20, #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
"tbz x11, #0, 166f\n"
- "ld1 { v8.h }[2], [x28]\n"
- "ld1 { v12.h }[2], [x24]\n"
- "ld1 { v16.h }[2], [x23]\n"
- "ld1 { v20.h }[2], [x22]\n"
+ "ld1 { v8.h }[2], [x9]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
"b 166f\n"
"165:" // Height 4: Partial accumulate: partial_1_0
- "ldr h8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr h12, [x24, #0x0]\n"
- "ldr h16, [x23, #0x0]\n"
- "ldr h20, [x22, #0x0]\n"
+ "ldr h8, [x9, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
"166:" // Height 4: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 169f\n"
"167:" // Height 4: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
"b 169f\n"
"168:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
@@ -2219,980 +2216,980 @@ void a64_hybrid_fp16_mla_6x32 (
"movi v22.16b, #0x0\n"
"movi v23.16b, #0x0\n"
"169:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"170:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 171f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "cbnz x27, 172f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 172f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
"b 172f\n"
"171:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"172:" // Height 4: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"blt 175f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x10\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 174f\n"
"173:" // Height 4: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x22, x22, #0x10\n"
+ "ldr q25, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x26, x26, #0x8\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x26, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "cmp x27, #0x10\n"
+ "fmla v10.8h, v25.8h, v0.h[0]\n"
+ "fmla v14.8h, v25.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v18.8h, v25.8h, v2.h[0]\n"
+ "fmla v22.8h, v25.8h, v3.h[0]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v11.8h, v24.8h, v0.h[0]\n"
+ "fmla v15.8h, v24.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v19.8h, v24.8h, v2.h[0]\n"
+ "fmla v23.8h, v24.8h, v3.h[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ "fmla v8.8h, v25.8h, v0.h[1]\n"
+ "fmla v12.8h, v25.8h, v1.h[1]\n"
+ "fmla v16.8h, v25.8h, v2.h[1]\n"
+ "fmla v20.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ "fmla v9.8h, v24.8h, v0.h[1]\n"
+ "fmla v13.8h, v24.8h, v1.h[1]\n"
+ "fmla v17.8h, v24.8h, v2.h[1]\n"
+ "fmla v21.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ "fmla v10.8h, v25.8h, v0.h[1]\n"
+ "fmla v14.8h, v25.8h, v1.h[1]\n"
+ "fmla v18.8h, v25.8h, v2.h[1]\n"
+ "fmla v22.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ "fmla v11.8h, v24.8h, v0.h[1]\n"
+ "fmla v15.8h, v24.8h, v1.h[1]\n"
+ "fmla v19.8h, v24.8h, v2.h[1]\n"
+ "fmla v23.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ "fmla v8.8h, v25.8h, v0.h[2]\n"
+ "fmla v12.8h, v25.8h, v1.h[2]\n"
+ "fmla v16.8h, v25.8h, v2.h[2]\n"
+ "fmla v20.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ "fmla v9.8h, v24.8h, v0.h[2]\n"
+ "fmla v13.8h, v24.8h, v1.h[2]\n"
+ "fmla v17.8h, v24.8h, v2.h[2]\n"
+ "fmla v21.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ "fmla v10.8h, v25.8h, v0.h[2]\n"
+ "fmla v14.8h, v25.8h, v1.h[2]\n"
+ "fmla v18.8h, v25.8h, v2.h[2]\n"
+ "fmla v22.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ "fmla v11.8h, v24.8h, v0.h[2]\n"
+ "fmla v15.8h, v24.8h, v1.h[2]\n"
+ "fmla v19.8h, v24.8h, v2.h[2]\n"
+ "fmla v23.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ "fmla v8.8h, v25.8h, v0.h[3]\n"
+ "fmla v12.8h, v25.8h, v1.h[3]\n"
+ "fmla v16.8h, v25.8h, v2.h[3]\n"
+ "fmla v20.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ "fmla v9.8h, v24.8h, v0.h[3]\n"
+ "fmla v13.8h, v24.8h, v1.h[3]\n"
+ "fmla v17.8h, v24.8h, v2.h[3]\n"
+ "fmla v21.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
+ "fmla v10.8h, v25.8h, v0.h[3]\n"
+ "fmla v14.8h, v25.8h, v1.h[3]\n"
+ "fmla v18.8h, v25.8h, v2.h[3]\n"
+ "fmla v22.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0x100]\n"
+ "fmla v11.8h, v24.8h, v0.h[3]\n"
+ "fmla v15.8h, v24.8h, v1.h[3]\n"
+ "fmla v19.8h, v24.8h, v2.h[3]\n"
+ "fmla v23.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x10, #0x110]\n"
+ "fmla v8.8h, v25.8h, v0.h[4]\n"
+ "fmla v12.8h, v25.8h, v1.h[4]\n"
+ "fmla v16.8h, v25.8h, v2.h[4]\n"
+ "fmla v20.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x10, #0x120]\n"
+ "fmla v9.8h, v24.8h, v0.h[4]\n"
+ "fmla v13.8h, v24.8h, v1.h[4]\n"
+ "fmla v17.8h, v24.8h, v2.h[4]\n"
+ "fmla v21.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x10, #0x130]\n"
+ "fmla v10.8h, v25.8h, v0.h[4]\n"
+ "fmla v14.8h, v25.8h, v1.h[4]\n"
+ "fmla v18.8h, v25.8h, v2.h[4]\n"
+ "fmla v22.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x10, #0x140]\n"
+ "fmla v11.8h, v24.8h, v0.h[4]\n"
+ "fmla v15.8h, v24.8h, v1.h[4]\n"
+ "fmla v19.8h, v24.8h, v2.h[4]\n"
+ "fmla v23.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x10, #0x150]\n"
+ "fmla v8.8h, v25.8h, v0.h[5]\n"
+ "fmla v12.8h, v25.8h, v1.h[5]\n"
+ "fmla v16.8h, v25.8h, v2.h[5]\n"
+ "fmla v20.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x10, #0x160]\n"
+ "fmla v9.8h, v24.8h, v0.h[5]\n"
+ "fmla v13.8h, v24.8h, v1.h[5]\n"
+ "fmla v17.8h, v24.8h, v2.h[5]\n"
+ "fmla v21.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x10, #0x170]\n"
+ "fmla v10.8h, v25.8h, v0.h[5]\n"
+ "fmla v14.8h, v25.8h, v1.h[5]\n"
+ "fmla v18.8h, v25.8h, v2.h[5]\n"
+ "fmla v22.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x10, #0x180]\n"
+ "fmla v11.8h, v24.8h, v0.h[5]\n"
+ "fmla v15.8h, v24.8h, v1.h[5]\n"
+ "fmla v19.8h, v24.8h, v2.h[5]\n"
+ "fmla v23.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x10, #0x190]\n"
+ "fmla v8.8h, v25.8h, v0.h[6]\n"
+ "fmla v12.8h, v25.8h, v1.h[6]\n"
+ "fmla v16.8h, v25.8h, v2.h[6]\n"
+ "fmla v20.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x10, #0x1a0]\n"
+ "fmla v9.8h, v24.8h, v0.h[6]\n"
+ "fmla v13.8h, v24.8h, v1.h[6]\n"
+ "fmla v17.8h, v24.8h, v2.h[6]\n"
+ "fmla v21.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x10, #0x1b0]\n"
+ "fmla v10.8h, v25.8h, v0.h[6]\n"
+ "fmla v14.8h, v25.8h, v1.h[6]\n"
+ "fmla v18.8h, v25.8h, v2.h[6]\n"
+ "fmla v22.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x10, #0x1c0]\n"
+ "fmla v11.8h, v24.8h, v0.h[6]\n"
+ "fmla v15.8h, v24.8h, v1.h[6]\n"
+ "fmla v19.8h, v24.8h, v2.h[6]\n"
+ "fmla v23.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x10, #0x1d0]\n"
+ "fmla v8.8h, v25.8h, v0.h[7]\n"
+ "fmla v12.8h, v25.8h, v1.h[7]\n"
+ "fmla v16.8h, v25.8h, v2.h[7]\n"
+ "fmla v20.8h, v25.8h, v3.h[7]\n"
+ "ldr q25, [x10, #0x1e0]\n"
+ "fmla v9.8h, v24.8h, v0.h[7]\n"
+ "fmla v13.8h, v24.8h, v1.h[7]\n"
+ "fmla v17.8h, v24.8h, v2.h[7]\n"
+ "fmla v21.8h, v24.8h, v3.h[7]\n"
+ "ldr q24, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v10.8h, v25.8h, v0.h[7]\n"
+ "fmla v14.8h, v25.8h, v1.h[7]\n"
+ "fmla v18.8h, v25.8h, v2.h[7]\n"
+ "fmla v22.8h, v25.8h, v3.h[7]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "ldr q0, [x25, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr q1, [x24, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "ldr q2, [x23, #0x0]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
- "ldr q3, [x22, #0x0]\n"
+ "fmla v11.8h, v24.8h, v0.h[7]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.8h, v24.8h, v1.h[7]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.8h, v24.8h, v2.h[7]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "fmla v23.8h, v24.8h, v3.h[7]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 173b\n"
"174:" // Height 4: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x8\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "add x24, x24, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
+ "ldr q25, [x10, #0x20]\n"
+ "add x24, x24, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x22, x22, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "sub x27, x27, #0x8\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v10.8h, v25.8h, v0.h[0]\n"
+ "fmla v14.8h, v25.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v18.8h, v25.8h, v2.h[0]\n"
+ "fmla v22.8h, v25.8h, v3.h[0]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v11.8h, v24.8h, v0.h[0]\n"
+ "fmla v15.8h, v24.8h, v1.h[0]\n"
+ "fmla v19.8h, v24.8h, v2.h[0]\n"
+ "fmla v23.8h, v24.8h, v3.h[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ "fmla v8.8h, v25.8h, v0.h[1]\n"
+ "fmla v12.8h, v25.8h, v1.h[1]\n"
+ "fmla v16.8h, v25.8h, v2.h[1]\n"
+ "fmla v20.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ "fmla v9.8h, v24.8h, v0.h[1]\n"
+ "fmla v13.8h, v24.8h, v1.h[1]\n"
+ "fmla v17.8h, v24.8h, v2.h[1]\n"
+ "fmla v21.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ "fmla v10.8h, v25.8h, v0.h[1]\n"
+ "fmla v14.8h, v25.8h, v1.h[1]\n"
+ "fmla v18.8h, v25.8h, v2.h[1]\n"
+ "fmla v22.8h, v25.8h, v3.h[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ "fmla v11.8h, v24.8h, v0.h[1]\n"
+ "fmla v15.8h, v24.8h, v1.h[1]\n"
+ "fmla v19.8h, v24.8h, v2.h[1]\n"
+ "fmla v23.8h, v24.8h, v3.h[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ "fmla v8.8h, v25.8h, v0.h[2]\n"
+ "fmla v12.8h, v25.8h, v1.h[2]\n"
+ "fmla v16.8h, v25.8h, v2.h[2]\n"
+ "fmla v20.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ "fmla v9.8h, v24.8h, v0.h[2]\n"
+ "fmla v13.8h, v24.8h, v1.h[2]\n"
+ "fmla v17.8h, v24.8h, v2.h[2]\n"
+ "fmla v21.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ "fmla v10.8h, v25.8h, v0.h[2]\n"
+ "fmla v14.8h, v25.8h, v1.h[2]\n"
+ "fmla v18.8h, v25.8h, v2.h[2]\n"
+ "fmla v22.8h, v25.8h, v3.h[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ "fmla v11.8h, v24.8h, v0.h[2]\n"
+ "fmla v15.8h, v24.8h, v1.h[2]\n"
+ "fmla v19.8h, v24.8h, v2.h[2]\n"
+ "fmla v23.8h, v24.8h, v3.h[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ "fmla v8.8h, v25.8h, v0.h[3]\n"
+ "fmla v12.8h, v25.8h, v1.h[3]\n"
+ "fmla v16.8h, v25.8h, v2.h[3]\n"
+ "fmla v20.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ "fmla v9.8h, v24.8h, v0.h[3]\n"
+ "fmla v13.8h, v24.8h, v1.h[3]\n"
+ "fmla v17.8h, v24.8h, v2.h[3]\n"
+ "fmla v21.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
+ "fmla v10.8h, v25.8h, v0.h[3]\n"
+ "fmla v14.8h, v25.8h, v1.h[3]\n"
+ "fmla v18.8h, v25.8h, v2.h[3]\n"
+ "fmla v22.8h, v25.8h, v3.h[3]\n"
+ "ldr q25, [x10, #0x100]\n"
+ "fmla v11.8h, v24.8h, v0.h[3]\n"
+ "fmla v15.8h, v24.8h, v1.h[3]\n"
+ "fmla v19.8h, v24.8h, v2.h[3]\n"
+ "fmla v23.8h, v24.8h, v3.h[3]\n"
+ "ldr q24, [x10, #0x110]\n"
+ "fmla v8.8h, v25.8h, v0.h[4]\n"
+ "fmla v12.8h, v25.8h, v1.h[4]\n"
+ "fmla v16.8h, v25.8h, v2.h[4]\n"
+ "fmla v20.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x10, #0x120]\n"
+ "fmla v9.8h, v24.8h, v0.h[4]\n"
+ "fmla v13.8h, v24.8h, v1.h[4]\n"
+ "fmla v17.8h, v24.8h, v2.h[4]\n"
+ "fmla v21.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x10, #0x130]\n"
+ "fmla v10.8h, v25.8h, v0.h[4]\n"
+ "fmla v14.8h, v25.8h, v1.h[4]\n"
+ "fmla v18.8h, v25.8h, v2.h[4]\n"
+ "fmla v22.8h, v25.8h, v3.h[4]\n"
+ "ldr q25, [x10, #0x140]\n"
+ "fmla v11.8h, v24.8h, v0.h[4]\n"
+ "fmla v15.8h, v24.8h, v1.h[4]\n"
+ "fmla v19.8h, v24.8h, v2.h[4]\n"
+ "fmla v23.8h, v24.8h, v3.h[4]\n"
+ "ldr q24, [x10, #0x150]\n"
+ "fmla v8.8h, v25.8h, v0.h[5]\n"
+ "fmla v12.8h, v25.8h, v1.h[5]\n"
+ "fmla v16.8h, v25.8h, v2.h[5]\n"
+ "fmla v20.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x10, #0x160]\n"
+ "fmla v9.8h, v24.8h, v0.h[5]\n"
+ "fmla v13.8h, v24.8h, v1.h[5]\n"
+ "fmla v17.8h, v24.8h, v2.h[5]\n"
+ "fmla v21.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x10, #0x170]\n"
+ "fmla v10.8h, v25.8h, v0.h[5]\n"
+ "fmla v14.8h, v25.8h, v1.h[5]\n"
+ "fmla v18.8h, v25.8h, v2.h[5]\n"
+ "fmla v22.8h, v25.8h, v3.h[5]\n"
+ "ldr q25, [x10, #0x180]\n"
+ "fmla v11.8h, v24.8h, v0.h[5]\n"
+ "fmla v15.8h, v24.8h, v1.h[5]\n"
+ "fmla v19.8h, v24.8h, v2.h[5]\n"
+ "fmla v23.8h, v24.8h, v3.h[5]\n"
+ "ldr q24, [x10, #0x190]\n"
+ "fmla v8.8h, v25.8h, v0.h[6]\n"
+ "fmla v12.8h, v25.8h, v1.h[6]\n"
+ "fmla v16.8h, v25.8h, v2.h[6]\n"
+ "fmla v20.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x10, #0x1a0]\n"
+ "fmla v9.8h, v24.8h, v0.h[6]\n"
+ "fmla v13.8h, v24.8h, v1.h[6]\n"
+ "fmla v17.8h, v24.8h, v2.h[6]\n"
+ "fmla v21.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x10, #0x1b0]\n"
+ "fmla v10.8h, v25.8h, v0.h[6]\n"
+ "fmla v14.8h, v25.8h, v1.h[6]\n"
+ "fmla v18.8h, v25.8h, v2.h[6]\n"
+ "fmla v22.8h, v25.8h, v3.h[6]\n"
+ "ldr q25, [x10, #0x1c0]\n"
+ "fmla v11.8h, v24.8h, v0.h[6]\n"
+ "fmla v15.8h, v24.8h, v1.h[6]\n"
+ "fmla v19.8h, v24.8h, v2.h[6]\n"
+ "fmla v23.8h, v24.8h, v3.h[6]\n"
+ "ldr q24, [x10, #0x1d0]\n"
+ "fmla v8.8h, v25.8h, v0.h[7]\n"
+ "fmla v12.8h, v25.8h, v1.h[7]\n"
+ "fmla v16.8h, v25.8h, v2.h[7]\n"
+ "fmla v20.8h, v25.8h, v3.h[7]\n"
+ "ldr q25, [x10, #0x1e0]\n"
+ "fmla v9.8h, v24.8h, v0.h[7]\n"
+ "fmla v13.8h, v24.8h, v1.h[7]\n"
+ "fmla v17.8h, v24.8h, v2.h[7]\n"
+ "fmla v21.8h, v24.8h, v3.h[7]\n"
+ "ldr q24, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v10.8h, v25.8h, v0.h[7]\n"
+ "fmla v14.8h, v25.8h, v1.h[7]\n"
+ "fmla v18.8h, v25.8h, v2.h[7]\n"
+ "fmla v22.8h, v25.8h, v3.h[7]\n"
+ "fmla v11.8h, v24.8h, v0.h[7]\n"
+ "fmla v15.8h, v24.8h, v1.h[7]\n"
+ "fmla v19.8h, v24.8h, v2.h[7]\n"
+ "fmla v23.8h, v24.8h, v3.h[7]\n"
"175:" // Height 4: Multiply loop: Main loop skip
- "cbz x26, 177f\n"
+ "cbz x27, 177f\n"
"176:" // Height 4: Multiply loop: Odd block loop
- "ldr h0, [x25], #0x2\n"
- "sub x26, x26, #0x1\n"
+ "ldr h3, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "sub x27, x27, #0x1\n"
"ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr h0, [x23], #0x2\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ "fmla v8.8h, v25.8h, v3.h[0]\n"
+ "fmla v12.8h, v25.8h, v2.h[0]\n"
+ "fmla v16.8h, v25.8h, v1.h[0]\n"
+ "fmla v20.8h, v25.8h, v0.h[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ "fmla v9.8h, v24.8h, v3.h[0]\n"
+ "fmla v13.8h, v24.8h, v2.h[0]\n"
+ "fmla v17.8h, v24.8h, v1.h[0]\n"
+ "fmla v21.8h, v24.8h, v0.h[0]\n"
+ "ldr q24, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "cbnz x26, 176b\n"
+ "fmla v10.8h, v25.8h, v3.h[0]\n"
+ "fmla v14.8h, v25.8h, v2.h[0]\n"
+ "fmla v18.8h, v25.8h, v1.h[0]\n"
+ "fmla v22.8h, v25.8h, v0.h[0]\n"
+ "fmla v11.8h, v24.8h, v3.h[0]\n"
+ "fmla v15.8h, v24.8h, v2.h[0]\n"
+ "fmla v19.8h, v24.8h, v1.h[0]\n"
+ "fmla v23.8h, v24.8h, v0.h[0]\n"
+ "cbnz x27, 176b\n"
"177:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 170b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x24, x20, LSL #1\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #1\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #1\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 178f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.8h }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.8h }, [x19]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmax v8.8h, v8.8h, v1.8h\n"
- "fmax v9.8h, v9.8h, v1.8h\n"
- "fmax v10.8h, v10.8h, v1.8h\n"
- "fmax v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v0.8h\n"
- "fmin v13.8h, v13.8h, v0.8h\n"
- "fmin v14.8h, v14.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v1.8h\n"
- "fmax v13.8h, v13.8h, v1.8h\n"
- "fmax v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v0.8h\n"
- "fmin v16.8h, v16.8h, v0.8h\n"
- "fmin v17.8h, v17.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v1.8h\n"
- "fmax v16.8h, v16.8h, v1.8h\n"
- "fmax v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v0.8h\n"
- "fmin v19.8h, v19.8h, v0.8h\n"
- "fmin v20.8h, v20.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v1.8h\n"
- "fmax v19.8h, v19.8h, v1.8h\n"
- "fmax v20.8h, v20.8h, v1.8h\n"
- "fmin v21.8h, v21.8h, v0.8h\n"
- "fmin v22.8h, v22.8h, v0.8h\n"
- "fmin v23.8h, v23.8h, v0.8h\n"
- "fmax v21.8h, v21.8h, v1.8h\n"
- "fmax v22.8h, v22.8h, v1.8h\n"
- "fmax v23.8h, v23.8h, v1.8h\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v25.8h }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v25.8h\n"
+ "fmin v9.8h, v9.8h, v25.8h\n"
+ "fmin v10.8h, v10.8h, v25.8h\n"
+ "fmin v11.8h, v11.8h, v25.8h\n"
+ "fmin v12.8h, v12.8h, v25.8h\n"
+ "fmin v13.8h, v13.8h, v25.8h\n"
+ "fmin v14.8h, v14.8h, v25.8h\n"
+ "fmin v15.8h, v15.8h, v25.8h\n"
+ "fmin v16.8h, v16.8h, v25.8h\n"
+ "fmin v17.8h, v17.8h, v25.8h\n"
+ "fmin v18.8h, v18.8h, v25.8h\n"
+ "fmin v19.8h, v19.8h, v25.8h\n"
+ "fmin v20.8h, v20.8h, v25.8h\n"
+ "fmin v21.8h, v21.8h, v25.8h\n"
+ "fmin v22.8h, v22.8h, v25.8h\n"
+ "fmin v23.8h, v23.8h, v25.8h\n"
+ "fmax v8.8h, v8.8h, v24.8h\n"
+ "fmax v9.8h, v9.8h, v24.8h\n"
+ "fmax v10.8h, v10.8h, v24.8h\n"
+ "fmax v11.8h, v11.8h, v24.8h\n"
+ "fmax v12.8h, v12.8h, v24.8h\n"
+ "fmax v13.8h, v13.8h, v24.8h\n"
+ "fmax v14.8h, v14.8h, v24.8h\n"
+ "fmax v15.8h, v15.8h, v24.8h\n"
+ "fmax v16.8h, v16.8h, v24.8h\n"
+ "fmax v17.8h, v17.8h, v24.8h\n"
+ "fmax v18.8h, v18.8h, v24.8h\n"
+ "fmax v19.8h, v19.8h, v24.8h\n"
+ "fmax v20.8h, v20.8h, v24.8h\n"
+ "fmax v21.8h, v21.8h, v24.8h\n"
+ "fmax v22.8h, v22.8h, v24.8h\n"
+ "fmax v23.8h, v23.8h, v24.8h\n"
"178:" // Height 4: No activation
"cmp x11, #0x20\n"
"bge 195f\n"
"tbz x11, #4, 186f\n"
- "st1 { v8.8h }, [x28], #0x10\n"
- "st1 { v9.8h }, [x28], #0x10\n"
- "st1 { v12.8h }, [x24], #0x10\n"
- "st1 { v13.8h }, [x24], #0x10\n"
- "st1 { v16.8h }, [x23], #0x10\n"
- "st1 { v17.8h }, [x23], #0x10\n"
- "st1 { v20.8h }, [x22], #0x10\n"
- "st1 { v21.8h }, [x22], #0x10\n"
+ "st1 { v8.8h }, [x9], #0x10\n"
+ "st1 { v9.8h }, [x9], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
"tbz x11, #3, 182f\n"
- "st1 { v10.8h }, [x28], #0x10\n"
- "st1 { v14.8h }, [x24], #0x10\n"
- "st1 { v18.8h }, [x23], #0x10\n"
- "st1 { v22.8h }, [x22], #0x10\n"
+ "st1 { v10.8h }, [x9], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
"tbz x11, #2, 180f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
"tbz x11, #1, 179f\n"
- "st1 { v11.s }[2], [x28], #0x4\n"
- "st1 { v15.s }[2], [x24], #0x4\n"
- "st1 { v19.s }[2], [x23], #0x4\n"
- "st1 { v23.s }[2], [x22], #0x4\n"
+ "st1 { v11.s }[2], [x9], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
"tbz x11, #0, 194f\n"
- "st1 { v11.h }[6], [x28]\n"
- "st1 { v15.h }[6], [x24]\n"
- "st1 { v19.h }[6], [x23]\n"
- "st1 { v23.h }[6], [x22]\n"
+ "st1 { v11.h }[6], [x9]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
"b 194f\n"
"179:" // Height 4: Partial direct writeback: partial_1_28
"tbz x11, #0, 194f\n"
- "st1 { v11.h }[4], [x28]\n"
- "st1 { v15.h }[4], [x24]\n"
- "st1 { v19.h }[4], [x23]\n"
- "st1 { v23.h }[4], [x22]\n"
+ "st1 { v11.h }[4], [x9]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
"b 194f\n"
"180:" // Height 4: Partial direct writeback: partial_2_24
"tbz x11, #1, 181f\n"
- "str s11, [x28], #0x4\n"
- "str s15, [x24], #0x4\n"
- "str s19, [x23], #0x4\n"
- "str s23, [x22], #0x4\n"
+ "str s11, [x9], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
"tbz x11, #0, 194f\n"
- "st1 { v11.h }[2], [x28]\n"
- "st1 { v15.h }[2], [x24]\n"
- "st1 { v19.h }[2], [x23]\n"
- "st1 { v23.h }[2], [x22]\n"
+ "st1 { v11.h }[2], [x9]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
"b 194f\n"
"181:" // Height 4: Partial direct writeback: partial_1_24
"tbz x11, #0, 194f\n"
- "str h11, [x28, #0x0]\n"
- "str h15, [x24, #0x0]\n"
- "str h19, [x23, #0x0]\n"
- "str h23, [x22, #0x0]\n"
+ "str h11, [x9, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
"b 194f\n"
"182:" // Height 4: Partial direct writeback: partial_4_16
"tbz x11, #2, 184f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
"tbz x11, #1, 183f\n"
- "st1 { v10.s }[2], [x28], #0x4\n"
- "st1 { v14.s }[2], [x24], #0x4\n"
- "st1 { v18.s }[2], [x23], #0x4\n"
- "st1 { v22.s }[2], [x22], #0x4\n"
+ "st1 { v10.s }[2], [x9], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
"tbz x11, #0, 194f\n"
- "st1 { v10.h }[6], [x28]\n"
- "st1 { v14.h }[6], [x24]\n"
- "st1 { v18.h }[6], [x23]\n"
- "st1 { v22.h }[6], [x22]\n"
+ "st1 { v10.h }[6], [x9]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
"b 194f\n"
"183:" // Height 4: Partial direct writeback: partial_1_20
"tbz x11, #0, 194f\n"
- "st1 { v10.h }[4], [x28]\n"
- "st1 { v14.h }[4], [x24]\n"
- "st1 { v18.h }[4], [x23]\n"
- "st1 { v22.h }[4], [x22]\n"
+ "st1 { v10.h }[4], [x9]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
"b 194f\n"
"184:" // Height 4: Partial direct writeback: partial_2_16
"tbz x11, #1, 185f\n"
- "str s10, [x28], #0x4\n"
- "str s14, [x24], #0x4\n"
- "str s18, [x23], #0x4\n"
- "str s22, [x22], #0x4\n"
+ "str s10, [x9], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
"tbz x11, #0, 194f\n"
- "st1 { v10.h }[2], [x28]\n"
- "st1 { v14.h }[2], [x24]\n"
- "st1 { v18.h }[2], [x23]\n"
- "st1 { v22.h }[2], [x22]\n"
+ "st1 { v10.h }[2], [x9]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
"b 194f\n"
"185:" // Height 4: Partial direct writeback: partial_1_16
"tbz x11, #0, 194f\n"
- "str h10, [x28, #0x0]\n"
- "str h14, [x24, #0x0]\n"
- "str h18, [x23, #0x0]\n"
- "str h22, [x22, #0x0]\n"
+ "str h10, [x9, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
"b 194f\n"
"186:" // Height 4: Partial direct writeback: partial_8_0
"tbz x11, #3, 190f\n"
- "st1 { v8.8h }, [x28], #0x10\n"
- "st1 { v12.8h }, [x24], #0x10\n"
- "st1 { v16.8h }, [x23], #0x10\n"
- "st1 { v20.8h }, [x22], #0x10\n"
+ "st1 { v8.8h }, [x9], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
"tbz x11, #2, 188f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
"tbz x11, #1, 187f\n"
- "st1 { v9.s }[2], [x28], #0x4\n"
- "st1 { v13.s }[2], [x24], #0x4\n"
- "st1 { v17.s }[2], [x23], #0x4\n"
- "st1 { v21.s }[2], [x22], #0x4\n"
+ "st1 { v9.s }[2], [x9], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
"tbz x11, #0, 194f\n"
- "st1 { v9.h }[6], [x28]\n"
- "st1 { v13.h }[6], [x24]\n"
- "st1 { v17.h }[6], [x23]\n"
- "st1 { v21.h }[6], [x22]\n"
+ "st1 { v9.h }[6], [x9]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
"b 194f\n"
"187:" // Height 4: Partial direct writeback: partial_1_12
"tbz x11, #0, 194f\n"
- "st1 { v9.h }[4], [x28]\n"
- "st1 { v13.h }[4], [x24]\n"
- "st1 { v17.h }[4], [x23]\n"
- "st1 { v21.h }[4], [x22]\n"
+ "st1 { v9.h }[4], [x9]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
"b 194f\n"
"188:" // Height 4: Partial direct writeback: partial_2_8
"tbz x11, #1, 189f\n"
- "str s9, [x28], #0x4\n"
- "str s13, [x24], #0x4\n"
- "str s17, [x23], #0x4\n"
- "str s21, [x22], #0x4\n"
+ "str s9, [x9], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
"tbz x11, #0, 194f\n"
- "st1 { v9.h }[2], [x28]\n"
- "st1 { v13.h }[2], [x24]\n"
- "st1 { v17.h }[2], [x23]\n"
- "st1 { v21.h }[2], [x22]\n"
+ "st1 { v9.h }[2], [x9]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
"b 194f\n"
"189:" // Height 4: Partial direct writeback: partial_1_8
"tbz x11, #0, 194f\n"
- "str h9, [x28, #0x0]\n"
- "str h13, [x24, #0x0]\n"
- "str h17, [x23, #0x0]\n"
- "str h21, [x22, #0x0]\n"
+ "str h9, [x9, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
"b 194f\n"
"190:" // Height 4: Partial direct writeback: partial_4_0
"tbz x11, #2, 192f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x11, #1, 191f\n"
- "st1 { v8.s }[2], [x28], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
- "st1 { v16.s }[2], [x23], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
+ "st1 { v8.s }[2], [x9], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
"tbz x11, #0, 194f\n"
- "st1 { v8.h }[6], [x28]\n"
- "st1 { v12.h }[6], [x24]\n"
- "st1 { v16.h }[6], [x23]\n"
- "st1 { v20.h }[6], [x22]\n"
+ "st1 { v8.h }[6], [x9]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
"b 194f\n"
"191:" // Height 4: Partial direct writeback: partial_1_4
"tbz x11, #0, 194f\n"
- "st1 { v8.h }[4], [x28]\n"
- "st1 { v12.h }[4], [x24]\n"
- "st1 { v16.h }[4], [x23]\n"
- "st1 { v20.h }[4], [x22]\n"
+ "st1 { v8.h }[4], [x9]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
"b 194f\n"
"192:" // Height 4: Partial direct writeback: partial_2_0
"tbz x11, #1, 193f\n"
- "str s8, [x28], #0x4\n"
- "str s12, [x24], #0x4\n"
- "str s16, [x23], #0x4\n"
- "str s20, [x22], #0x4\n"
+ "str s8, [x9], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
"tbz x11, #0, 194f\n"
- "st1 { v8.h }[2], [x28]\n"
- "st1 { v12.h }[2], [x24]\n"
- "st1 { v16.h }[2], [x23]\n"
- "st1 { v20.h }[2], [x22]\n"
+ "st1 { v8.h }[2], [x9]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
"b 194f\n"
"193:" // Height 4: Partial direct writeback: partial_1_0
- "str h8, [x28, #0x0]\n"
- "str h12, [x24, #0x0]\n"
- "str h16, [x23, #0x0]\n"
- "str h20, [x22, #0x0]\n"
+ "str h8, [x9, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
"194:" // Height 4: Partial direct writeback: Done
"b 196f\n"
"195:" // Height 4: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
"196:" // Height 4: Writeback done
"subs x11, x11, #0x20\n"
"bgt 149b\n"
"b 296f\n"
"197:" // Height 5
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"198:" // Height 5: Column loop
- "cbz x9, 199f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 199f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "mov v16.16b, v8.16b\n"
- "ldr q10, [x9, #0x20]\n"
- "mov v20.16b, v8.16b\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "mov v24.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
+ "mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
"mov v27.16b, v11.16b\n"
"b 218f\n"
"199:" // Height 5: no bias
"tbz %x[flags], #0, 217f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"cmp x11, #0x20\n"
- "add x24, x28, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
- "add x21, x22, x19, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"bge 216f\n"
"tbz x11, #4, 207f\n"
- "ld1 { v8.8h }, [x28], #0x10\n"
- "ld1 { v12.8h }, [x24], #0x10\n"
- "ld1 { v16.8h }, [x23], #0x10\n"
- "ld1 { v20.8h }, [x22], #0x10\n"
- "ld1 { v24.8h }, [x21], #0x10\n"
- "ld1 { v9.8h }, [x28], #0x10\n"
- "ld1 { v13.8h }, [x24], #0x10\n"
- "ld1 { v17.8h }, [x23], #0x10\n"
- "ld1 { v21.8h }, [x22], #0x10\n"
- "ld1 { v25.8h }, [x21], #0x10\n"
+ "ld1 { v8.8h }, [x9], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v9.8h }, [x9], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "ld1 { v25.8h }, [x22], #0x10\n"
"tbz x11, #3, 203f\n"
- "ld1 { v10.8h }, [x28], #0x10\n"
- "ld1 { v14.8h }, [x24], #0x10\n"
- "ld1 { v18.8h }, [x23], #0x10\n"
- "ld1 { v22.8h }, [x22], #0x10\n"
- "ld1 { v26.8h }, [x21], #0x10\n"
+ "ld1 { v10.8h }, [x9], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "ld1 { v26.8h }, [x22], #0x10\n"
"tbz x11, #2, 201f\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d27, [x21], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
"tbz x11, #1, 200f\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
- "mov x19, #0x3c\n"
- "ld1 { v15.s }[2], [x24], #0x4\n"
- "ld1 { v19.s }[2], [x23], #0x4\n"
- "ld1 { v23.s }[2], [x22], #0x4\n"
- "ld1 { v27.s }[2], [x21], #0x4\n"
+ "ld1 { v11.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "mov x20, #0x3c\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
"tbz x11, #0, 215f\n"
- "ld1 { v11.h }[6], [x28]\n"
- "ld1 { v15.h }[6], [x24]\n"
- "ld1 { v19.h }[6], [x23]\n"
- "ld1 { v23.h }[6], [x22]\n"
- "ld1 { v27.h }[6], [x21]\n"
+ "ld1 { v11.h }[6], [x9]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v27.h }[6], [x22]\n"
"b 215f\n"
"200:" // Height 5: Partial accumulate: partial_1_28
- "mov x19, #0x38\n"
+ "mov x20, #0x38\n"
"tbz x11, #0, 215f\n"
- "ld1 { v11.h }[4], [x28]\n"
- "ld1 { v15.h }[4], [x24]\n"
- "ld1 { v19.h }[4], [x23]\n"
- "ld1 { v23.h }[4], [x22]\n"
- "ld1 { v27.h }[4], [x21]\n"
+ "ld1 { v11.h }[4], [x9]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v27.h }[4], [x22]\n"
"b 215f\n"
"201:" // Height 5: Partial accumulate: partial_2_24
"tbz x11, #1, 202f\n"
- "ldr s11, [x28], #0x4\n"
- "ldr s15, [x24], #0x4\n"
- "mov x19, #0x34\n"
- "ldr s19, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
- "ldr s27, [x21], #0x4\n"
+ "ldr s11, [x9], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x20, #0x34\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
"tbz x11, #0, 215f\n"
- "ld1 { v11.h }[2], [x28]\n"
- "ld1 { v15.h }[2], [x24]\n"
- "ld1 { v19.h }[2], [x23]\n"
- "ld1 { v23.h }[2], [x22]\n"
- "ld1 { v27.h }[2], [x21]\n"
+ "ld1 { v11.h }[2], [x9]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v27.h }[2], [x22]\n"
"b 215f\n"
"202:" // Height 5: Partial accumulate: partial_1_24
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 215f\n"
- "ldr h11, [x28, #0x0]\n"
- "ldr h15, [x24, #0x0]\n"
- "ldr h19, [x23, #0x0]\n"
- "ldr h23, [x22, #0x0]\n"
- "ldr h27, [x21, #0x0]\n"
+ "ldr h11, [x9, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h27, [x22, #0x0]\n"
"b 215f\n"
"203:" // Height 5: Partial accumulate: partial_4_16
"tbz x11, #2, 205f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"tbz x11, #1, 204f\n"
- "ld1 { v10.s }[2], [x28], #0x4\n"
- "mov x19, #0x2c\n"
- "ld1 { v14.s }[2], [x24], #0x4\n"
- "ld1 { v18.s }[2], [x23], #0x4\n"
- "ld1 { v22.s }[2], [x22], #0x4\n"
- "ld1 { v26.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x9], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "mov x20, #0x2c\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
"tbz x11, #0, 215f\n"
- "ld1 { v10.h }[6], [x28]\n"
- "ld1 { v14.h }[6], [x24]\n"
- "ld1 { v18.h }[6], [x23]\n"
- "ld1 { v22.h }[6], [x22]\n"
- "ld1 { v26.h }[6], [x21]\n"
+ "ld1 { v10.h }[6], [x9]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v26.h }[6], [x22]\n"
"b 215f\n"
"204:" // Height 5: Partial accumulate: partial_1_20
- "mov x19, #0x28\n"
+ "mov x20, #0x28\n"
"tbz x11, #0, 215f\n"
- "ld1 { v10.h }[4], [x28]\n"
- "ld1 { v14.h }[4], [x24]\n"
- "ld1 { v18.h }[4], [x23]\n"
- "ld1 { v22.h }[4], [x22]\n"
- "ld1 { v26.h }[4], [x21]\n"
+ "ld1 { v10.h }[4], [x9]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v26.h }[4], [x22]\n"
"b 215f\n"
"205:" // Height 5: Partial accumulate: partial_2_16
"tbz x11, #1, 206f\n"
- "ldr s10, [x28], #0x4\n"
- "ldr s14, [x24], #0x4\n"
- "mov x19, #0x24\n"
- "ldr s18, [x23], #0x4\n"
- "ldr s22, [x22], #0x4\n"
- "ldr s26, [x21], #0x4\n"
+ "ldr s10, [x9], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x20, #0x24\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
"tbz x11, #0, 215f\n"
- "ld1 { v10.h }[2], [x28]\n"
- "ld1 { v14.h }[2], [x24]\n"
- "ld1 { v18.h }[2], [x23]\n"
- "ld1 { v22.h }[2], [x22]\n"
- "ld1 { v26.h }[2], [x21]\n"
+ "ld1 { v10.h }[2], [x9]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
"b 215f\n"
"206:" // Height 5: Partial accumulate: partial_1_16
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 215f\n"
- "ldr h10, [x28, #0x0]\n"
- "ldr h14, [x24, #0x0]\n"
- "ldr h18, [x23, #0x0]\n"
- "ldr h22, [x22, #0x0]\n"
- "ldr h26, [x21, #0x0]\n"
+ "ldr h10, [x9, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
"b 215f\n"
"207:" // Height 5: Partial accumulate: partial_8_0
"tbz x11, #3, 211f\n"
- "ld1 { v8.8h }, [x28], #0x10\n"
- "ld1 { v12.8h }, [x24], #0x10\n"
- "ld1 { v16.8h }, [x23], #0x10\n"
- "ld1 { v20.8h }, [x22], #0x10\n"
- "ld1 { v24.8h }, [x21], #0x10\n"
+ "ld1 { v8.8h }, [x9], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
"tbz x11, #2, 209f\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d25, [x21], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x11, #1, 208f\n"
- "ld1 { v9.s }[2], [x28], #0x4\n"
- "mov x19, #0x1c\n"
- "ld1 { v13.s }[2], [x24], #0x4\n"
- "ld1 { v17.s }[2], [x23], #0x4\n"
- "ld1 { v21.s }[2], [x22], #0x4\n"
- "ld1 { v25.s }[2], [x21], #0x4\n"
+ "ld1 { v9.s }[2], [x9], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "mov x20, #0x1c\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
"tbz x11, #0, 215f\n"
- "ld1 { v9.h }[6], [x28]\n"
- "ld1 { v13.h }[6], [x24]\n"
- "ld1 { v17.h }[6], [x23]\n"
- "ld1 { v21.h }[6], [x22]\n"
- "ld1 { v25.h }[6], [x21]\n"
+ "ld1 { v9.h }[6], [x9]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
"b 215f\n"
"208:" // Height 5: Partial accumulate: partial_1_12
- "mov x19, #0x18\n"
+ "mov x20, #0x18\n"
"tbz x11, #0, 215f\n"
- "ld1 { v9.h }[4], [x28]\n"
- "ld1 { v13.h }[4], [x24]\n"
- "ld1 { v17.h }[4], [x23]\n"
- "ld1 { v21.h }[4], [x22]\n"
- "ld1 { v25.h }[4], [x21]\n"
+ "ld1 { v9.h }[4], [x9]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
"b 215f\n"
"209:" // Height 5: Partial accumulate: partial_2_8
"tbz x11, #1, 210f\n"
- "ldr s9, [x28], #0x4\n"
- "ldr s13, [x24], #0x4\n"
- "mov x19, #0x14\n"
- "ldr s17, [x23], #0x4\n"
- "ldr s21, [x22], #0x4\n"
- "ldr s25, [x21], #0x4\n"
+ "ldr s9, [x9], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x20, #0x14\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
"tbz x11, #0, 215f\n"
- "ld1 { v9.h }[2], [x28]\n"
- "ld1 { v13.h }[2], [x24]\n"
- "ld1 { v17.h }[2], [x23]\n"
- "ld1 { v21.h }[2], [x22]\n"
- "ld1 { v25.h }[2], [x21]\n"
+ "ld1 { v9.h }[2], [x9]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
"b 215f\n"
"210:" // Height 5: Partial accumulate: partial_1_8
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 215f\n"
- "ldr h9, [x28, #0x0]\n"
- "ldr h13, [x24, #0x0]\n"
- "ldr h17, [x23, #0x0]\n"
- "ldr h21, [x22, #0x0]\n"
- "ldr h25, [x21, #0x0]\n"
+ "ldr h9, [x9, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
"b 215f\n"
"211:" // Height 5: Partial accumulate: partial_4_0
"tbz x11, #2, 213f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "ldr d16, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d24, [x21], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x11, #1, 212f\n"
- "ld1 { v8.s }[2], [x28], #0x4\n"
- "mov x19, #0xc\n"
- "ld1 { v12.s }[2], [x24], #0x4\n"
- "ld1 { v16.s }[2], [x23], #0x4\n"
- "ld1 { v20.s }[2], [x22], #0x4\n"
- "ld1 { v24.s }[2], [x21], #0x4\n"
+ "ld1 { v8.s }[2], [x9], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "mov x20, #0xc\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
"tbz x11, #0, 215f\n"
- "ld1 { v8.h }[6], [x28]\n"
- "ld1 { v12.h }[6], [x24]\n"
- "ld1 { v16.h }[6], [x23]\n"
- "ld1 { v20.h }[6], [x22]\n"
- "ld1 { v24.h }[6], [x21]\n"
+ "ld1 { v8.h }[6], [x9]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
"b 215f\n"
"212:" // Height 5: Partial accumulate: partial_1_4
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"tbz x11, #0, 215f\n"
- "ld1 { v8.h }[4], [x28]\n"
- "ld1 { v12.h }[4], [x24]\n"
- "ld1 { v16.h }[4], [x23]\n"
- "ld1 { v20.h }[4], [x22]\n"
- "ld1 { v24.h }[4], [x21]\n"
+ "ld1 { v8.h }[4], [x9]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
"b 215f\n"
"213:" // Height 5: Partial accumulate: partial_2_0
"tbz x11, #1, 214f\n"
- "ldr s8, [x28], #0x4\n"
- "ldr s12, [x24], #0x4\n"
- "mov x19, #0x4\n"
- "ldr s16, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s24, [x21], #0x4\n"
+ "ldr s8, [x9], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x20, #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
"tbz x11, #0, 215f\n"
- "ld1 { v8.h }[2], [x28]\n"
- "ld1 { v12.h }[2], [x24]\n"
- "ld1 { v16.h }[2], [x23]\n"
- "ld1 { v20.h }[2], [x22]\n"
- "ld1 { v24.h }[2], [x21]\n"
+ "ld1 { v8.h }[2], [x9]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
"b 215f\n"
"214:" // Height 5: Partial accumulate: partial_1_0
- "ldr h8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr h12, [x24, #0x0]\n"
- "ldr h16, [x23, #0x0]\n"
- "ldr h20, [x22, #0x0]\n"
- "ldr h24, [x21, #0x0]\n"
+ "ldr h8, [x9, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
"215:" // Height 5: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 218f\n"
"216:" // Height 5: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
"b 218f\n"
"217:" // Height 5: no accumulate
"movi v8.16b, #0x0\n"
@@ -3216,1148 +3213,1148 @@ void a64_hybrid_fp16_mla_6x32 (
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
"218:" // Height 5: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"219:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 220f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "cbnz x27, 221f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
- "add x21, x21, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 221f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
"b 221f\n"
"220:" // Height 5: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
- "add x21, x22, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"221:" // Height 5: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"blt 224f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x10\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 223f\n"
"222:" // Height 5: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x21, x21, #0x10\n"
+ "ldr q29, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x26, x26, #0x8\n"
+ "add x23, x23, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "cmp x26, #0x10\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "cmp x27, #0x10\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "fmla v24.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "fmla v25.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "fmla v26.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "fmla v27.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "fmla v24.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "fmla v25.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "fmla v26.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "fmla v27.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "fmla v24.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "fmla v25.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "fmla v26.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "fmla v27.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "fmla v24.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "fmla v25.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "fmla v26.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "fmla v27.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "fmla v24.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "fmla v25.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "fmla v26.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "fmla v27.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "fmla v24.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "fmla v25.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "fmla v26.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "fmla v27.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "fmla v24.8h, v6.8h, v4.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "fmla v25.8h, v7.8h, v4.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v10.8h, v29.8h, v0.h[0]\n"
+ "fmla v14.8h, v29.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v26.8h, v29.8h, v4.h[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ "fmla v11.8h, v28.8h, v0.h[0]\n"
+ "fmla v15.8h, v28.8h, v1.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v3.h[0]\n"
+ "fmla v27.8h, v28.8h, v4.h[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ "fmla v8.8h, v29.8h, v0.h[1]\n"
+ "fmla v12.8h, v29.8h, v1.h[1]\n"
+ "fmla v16.8h, v29.8h, v2.h[1]\n"
+ "fmla v20.8h, v29.8h, v3.h[1]\n"
+ "fmla v24.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ "fmla v9.8h, v28.8h, v0.h[1]\n"
+ "fmla v13.8h, v28.8h, v1.h[1]\n"
+ "fmla v17.8h, v28.8h, v2.h[1]\n"
+ "fmla v21.8h, v28.8h, v3.h[1]\n"
+ "fmla v25.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ "fmla v10.8h, v29.8h, v0.h[1]\n"
+ "fmla v14.8h, v29.8h, v1.h[1]\n"
+ "fmla v18.8h, v29.8h, v2.h[1]\n"
+ "fmla v22.8h, v29.8h, v3.h[1]\n"
+ "fmla v26.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ "fmla v11.8h, v28.8h, v0.h[1]\n"
+ "fmla v15.8h, v28.8h, v1.h[1]\n"
+ "fmla v19.8h, v28.8h, v2.h[1]\n"
+ "fmla v23.8h, v28.8h, v3.h[1]\n"
+ "fmla v27.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ "fmla v8.8h, v29.8h, v0.h[2]\n"
+ "fmla v12.8h, v29.8h, v1.h[2]\n"
+ "fmla v16.8h, v29.8h, v2.h[2]\n"
+ "fmla v20.8h, v29.8h, v3.h[2]\n"
+ "fmla v24.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ "fmla v9.8h, v28.8h, v0.h[2]\n"
+ "fmla v13.8h, v28.8h, v1.h[2]\n"
+ "fmla v17.8h, v28.8h, v2.h[2]\n"
+ "fmla v21.8h, v28.8h, v3.h[2]\n"
+ "fmla v25.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ "fmla v10.8h, v29.8h, v0.h[2]\n"
+ "fmla v14.8h, v29.8h, v1.h[2]\n"
+ "fmla v18.8h, v29.8h, v2.h[2]\n"
+ "fmla v22.8h, v29.8h, v3.h[2]\n"
+ "fmla v26.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ "fmla v11.8h, v28.8h, v0.h[2]\n"
+ "fmla v15.8h, v28.8h, v1.h[2]\n"
+ "fmla v19.8h, v28.8h, v2.h[2]\n"
+ "fmla v23.8h, v28.8h, v3.h[2]\n"
+ "fmla v27.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ "fmla v8.8h, v29.8h, v0.h[3]\n"
+ "fmla v12.8h, v29.8h, v1.h[3]\n"
+ "fmla v16.8h, v29.8h, v2.h[3]\n"
+ "fmla v20.8h, v29.8h, v3.h[3]\n"
+ "fmla v24.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ "fmla v9.8h, v28.8h, v0.h[3]\n"
+ "fmla v13.8h, v28.8h, v1.h[3]\n"
+ "fmla v17.8h, v28.8h, v2.h[3]\n"
+ "fmla v21.8h, v28.8h, v3.h[3]\n"
+ "fmla v25.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
+ "fmla v10.8h, v29.8h, v0.h[3]\n"
+ "fmla v14.8h, v29.8h, v1.h[3]\n"
+ "fmla v18.8h, v29.8h, v2.h[3]\n"
+ "fmla v22.8h, v29.8h, v3.h[3]\n"
+ "fmla v26.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0x100]\n"
+ "fmla v11.8h, v28.8h, v0.h[3]\n"
+ "fmla v15.8h, v28.8h, v1.h[3]\n"
+ "fmla v19.8h, v28.8h, v2.h[3]\n"
+ "fmla v23.8h, v28.8h, v3.h[3]\n"
+ "fmla v27.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x10, #0x110]\n"
+ "fmla v8.8h, v29.8h, v0.h[4]\n"
+ "fmla v12.8h, v29.8h, v1.h[4]\n"
+ "fmla v16.8h, v29.8h, v2.h[4]\n"
+ "fmla v20.8h, v29.8h, v3.h[4]\n"
+ "fmla v24.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x10, #0x120]\n"
+ "fmla v9.8h, v28.8h, v0.h[4]\n"
+ "fmla v13.8h, v28.8h, v1.h[4]\n"
+ "fmla v17.8h, v28.8h, v2.h[4]\n"
+ "fmla v21.8h, v28.8h, v3.h[4]\n"
+ "fmla v25.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x10, #0x130]\n"
+ "fmla v10.8h, v29.8h, v0.h[4]\n"
+ "fmla v14.8h, v29.8h, v1.h[4]\n"
+ "fmla v18.8h, v29.8h, v2.h[4]\n"
+ "fmla v22.8h, v29.8h, v3.h[4]\n"
+ "fmla v26.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x10, #0x140]\n"
+ "fmla v11.8h, v28.8h, v0.h[4]\n"
+ "fmla v15.8h, v28.8h, v1.h[4]\n"
+ "fmla v19.8h, v28.8h, v2.h[4]\n"
+ "fmla v23.8h, v28.8h, v3.h[4]\n"
+ "fmla v27.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x10, #0x150]\n"
+ "fmla v8.8h, v29.8h, v0.h[5]\n"
+ "fmla v12.8h, v29.8h, v1.h[5]\n"
+ "fmla v16.8h, v29.8h, v2.h[5]\n"
+ "fmla v20.8h, v29.8h, v3.h[5]\n"
+ "fmla v24.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x10, #0x160]\n"
+ "fmla v9.8h, v28.8h, v0.h[5]\n"
+ "fmla v13.8h, v28.8h, v1.h[5]\n"
+ "fmla v17.8h, v28.8h, v2.h[5]\n"
+ "fmla v21.8h, v28.8h, v3.h[5]\n"
+ "fmla v25.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x10, #0x170]\n"
+ "fmla v10.8h, v29.8h, v0.h[5]\n"
+ "fmla v14.8h, v29.8h, v1.h[5]\n"
+ "fmla v18.8h, v29.8h, v2.h[5]\n"
+ "fmla v22.8h, v29.8h, v3.h[5]\n"
+ "fmla v26.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x10, #0x180]\n"
+ "fmla v11.8h, v28.8h, v0.h[5]\n"
+ "fmla v15.8h, v28.8h, v1.h[5]\n"
+ "fmla v19.8h, v28.8h, v2.h[5]\n"
+ "fmla v23.8h, v28.8h, v3.h[5]\n"
+ "fmla v27.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x10, #0x190]\n"
+ "fmla v8.8h, v29.8h, v0.h[6]\n"
+ "fmla v12.8h, v29.8h, v1.h[6]\n"
+ "fmla v16.8h, v29.8h, v2.h[6]\n"
+ "fmla v20.8h, v29.8h, v3.h[6]\n"
+ "fmla v24.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x10, #0x1a0]\n"
+ "fmla v9.8h, v28.8h, v0.h[6]\n"
+ "fmla v13.8h, v28.8h, v1.h[6]\n"
+ "fmla v17.8h, v28.8h, v2.h[6]\n"
+ "fmla v21.8h, v28.8h, v3.h[6]\n"
+ "fmla v25.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x10, #0x1b0]\n"
+ "fmla v10.8h, v29.8h, v0.h[6]\n"
+ "fmla v14.8h, v29.8h, v1.h[6]\n"
+ "fmla v18.8h, v29.8h, v2.h[6]\n"
+ "fmla v22.8h, v29.8h, v3.h[6]\n"
+ "fmla v26.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x10, #0x1c0]\n"
+ "fmla v11.8h, v28.8h, v0.h[6]\n"
+ "fmla v15.8h, v28.8h, v1.h[6]\n"
+ "fmla v19.8h, v28.8h, v2.h[6]\n"
+ "fmla v23.8h, v28.8h, v3.h[6]\n"
+ "fmla v27.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x10, #0x1d0]\n"
+ "fmla v8.8h, v29.8h, v0.h[7]\n"
+ "fmla v12.8h, v29.8h, v1.h[7]\n"
+ "fmla v16.8h, v29.8h, v2.h[7]\n"
+ "fmla v20.8h, v29.8h, v3.h[7]\n"
+ "fmla v24.8h, v29.8h, v4.h[7]\n"
+ "ldr q29, [x10, #0x1e0]\n"
+ "fmla v9.8h, v28.8h, v0.h[7]\n"
+ "fmla v13.8h, v28.8h, v1.h[7]\n"
+ "fmla v17.8h, v28.8h, v2.h[7]\n"
+ "fmla v21.8h, v28.8h, v3.h[7]\n"
+ "fmla v25.8h, v28.8h, v4.h[7]\n"
+ "ldr q28, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v10.8h, v29.8h, v0.h[7]\n"
+ "fmla v14.8h, v29.8h, v1.h[7]\n"
+ "fmla v18.8h, v29.8h, v2.h[7]\n"
+ "fmla v22.8h, v29.8h, v3.h[7]\n"
+ "fmla v26.8h, v29.8h, v4.h[7]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "ldr q0, [x25, #0x0]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr q1, [x24, #0x0]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "ldr q2, [x23, #0x0]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
- "ldr q3, [x22, #0x0]\n"
- "fmla v27.8h, v7.8h, v4.h[7]\n"
- "ldr q4, [x21, #0x0]\n"
+ "fmla v11.8h, v28.8h, v0.h[7]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.8h, v28.8h, v1.h[7]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.8h, v28.8h, v2.h[7]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "fmla v23.8h, v28.8h, v3.h[7]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "fmla v27.8h, v28.8h, v4.h[7]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 222b\n"
"223:" // Height 5: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x8\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "add x24, x24, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "ldr q29, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x27, x27, #0x8\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.8h, v6.8h, v0.h[1]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v16.8h, v6.8h, v2.h[1]\n"
- "fmla v20.8h, v6.8h, v3.h[1]\n"
- "fmla v24.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.8h, v7.8h, v0.h[1]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v17.8h, v7.8h, v2.h[1]\n"
- "fmla v21.8h, v7.8h, v3.h[1]\n"
- "fmla v25.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.8h, v6.8h, v0.h[1]\n"
- "fmla v14.8h, v6.8h, v1.h[1]\n"
- "fmla v18.8h, v6.8h, v2.h[1]\n"
- "fmla v22.8h, v6.8h, v3.h[1]\n"
- "fmla v26.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.8h, v7.8h, v0.h[1]\n"
- "fmla v15.8h, v7.8h, v1.h[1]\n"
- "fmla v19.8h, v7.8h, v2.h[1]\n"
- "fmla v23.8h, v7.8h, v3.h[1]\n"
- "fmla v27.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.8h, v6.8h, v0.h[2]\n"
- "fmla v12.8h, v6.8h, v1.h[2]\n"
- "fmla v16.8h, v6.8h, v2.h[2]\n"
- "fmla v20.8h, v6.8h, v3.h[2]\n"
- "fmla v24.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.8h, v7.8h, v0.h[2]\n"
- "fmla v13.8h, v7.8h, v1.h[2]\n"
- "fmla v17.8h, v7.8h, v2.h[2]\n"
- "fmla v21.8h, v7.8h, v3.h[2]\n"
- "fmla v25.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.8h, v6.8h, v0.h[2]\n"
- "fmla v14.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v2.h[2]\n"
- "fmla v22.8h, v6.8h, v3.h[2]\n"
- "fmla v26.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.8h, v7.8h, v0.h[2]\n"
- "fmla v15.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v2.h[2]\n"
- "fmla v23.8h, v7.8h, v3.h[2]\n"
- "fmla v27.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.8h, v6.8h, v0.h[3]\n"
- "fmla v12.8h, v6.8h, v1.h[3]\n"
- "fmla v16.8h, v6.8h, v2.h[3]\n"
- "fmla v20.8h, v6.8h, v3.h[3]\n"
- "fmla v24.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.8h, v7.8h, v0.h[3]\n"
- "fmla v13.8h, v7.8h, v1.h[3]\n"
- "fmla v17.8h, v7.8h, v2.h[3]\n"
- "fmla v21.8h, v7.8h, v3.h[3]\n"
- "fmla v25.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x10, #0xf0]\n"
- "fmla v10.8h, v6.8h, v0.h[3]\n"
- "fmla v14.8h, v6.8h, v1.h[3]\n"
- "fmla v18.8h, v6.8h, v2.h[3]\n"
- "fmla v22.8h, v6.8h, v3.h[3]\n"
- "fmla v26.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x10, #0x100]\n"
- "fmla v11.8h, v7.8h, v0.h[3]\n"
- "fmla v15.8h, v7.8h, v1.h[3]\n"
- "fmla v19.8h, v7.8h, v2.h[3]\n"
- "fmla v23.8h, v7.8h, v3.h[3]\n"
- "fmla v27.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x10, #0x110]\n"
- "fmla v8.8h, v6.8h, v0.h[4]\n"
- "fmla v12.8h, v6.8h, v1.h[4]\n"
- "fmla v16.8h, v6.8h, v2.h[4]\n"
- "fmla v20.8h, v6.8h, v3.h[4]\n"
- "fmla v24.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x10, #0x120]\n"
- "fmla v9.8h, v7.8h, v0.h[4]\n"
- "fmla v13.8h, v7.8h, v1.h[4]\n"
- "fmla v17.8h, v7.8h, v2.h[4]\n"
- "fmla v21.8h, v7.8h, v3.h[4]\n"
- "fmla v25.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x10, #0x130]\n"
- "fmla v10.8h, v6.8h, v0.h[4]\n"
- "fmla v14.8h, v6.8h, v1.h[4]\n"
- "fmla v18.8h, v6.8h, v2.h[4]\n"
- "fmla v22.8h, v6.8h, v3.h[4]\n"
- "fmla v26.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x10, #0x140]\n"
- "fmla v11.8h, v7.8h, v0.h[4]\n"
- "fmla v15.8h, v7.8h, v1.h[4]\n"
- "fmla v19.8h, v7.8h, v2.h[4]\n"
- "fmla v23.8h, v7.8h, v3.h[4]\n"
- "fmla v27.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x10, #0x150]\n"
- "fmla v8.8h, v6.8h, v0.h[5]\n"
- "fmla v12.8h, v6.8h, v1.h[5]\n"
- "fmla v16.8h, v6.8h, v2.h[5]\n"
- "fmla v20.8h, v6.8h, v3.h[5]\n"
- "fmla v24.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x10, #0x160]\n"
- "fmla v9.8h, v7.8h, v0.h[5]\n"
- "fmla v13.8h, v7.8h, v1.h[5]\n"
- "fmla v17.8h, v7.8h, v2.h[5]\n"
- "fmla v21.8h, v7.8h, v3.h[5]\n"
- "fmla v25.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x10, #0x170]\n"
- "fmla v10.8h, v6.8h, v0.h[5]\n"
- "fmla v14.8h, v6.8h, v1.h[5]\n"
- "fmla v18.8h, v6.8h, v2.h[5]\n"
- "fmla v22.8h, v6.8h, v3.h[5]\n"
- "fmla v26.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x10, #0x180]\n"
- "fmla v11.8h, v7.8h, v0.h[5]\n"
- "fmla v15.8h, v7.8h, v1.h[5]\n"
- "fmla v19.8h, v7.8h, v2.h[5]\n"
- "fmla v23.8h, v7.8h, v3.h[5]\n"
- "fmla v27.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x10, #0x190]\n"
- "fmla v8.8h, v6.8h, v0.h[6]\n"
- "fmla v12.8h, v6.8h, v1.h[6]\n"
- "fmla v16.8h, v6.8h, v2.h[6]\n"
- "fmla v20.8h, v6.8h, v3.h[6]\n"
- "fmla v24.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x10, #0x1a0]\n"
- "fmla v9.8h, v7.8h, v0.h[6]\n"
- "fmla v13.8h, v7.8h, v1.h[6]\n"
- "fmla v17.8h, v7.8h, v2.h[6]\n"
- "fmla v21.8h, v7.8h, v3.h[6]\n"
- "fmla v25.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x10, #0x1b0]\n"
- "fmla v10.8h, v6.8h, v0.h[6]\n"
- "fmla v14.8h, v6.8h, v1.h[6]\n"
- "fmla v18.8h, v6.8h, v2.h[6]\n"
- "fmla v22.8h, v6.8h, v3.h[6]\n"
- "fmla v26.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x10, #0x1c0]\n"
- "fmla v11.8h, v7.8h, v0.h[6]\n"
- "fmla v15.8h, v7.8h, v1.h[6]\n"
- "fmla v19.8h, v7.8h, v2.h[6]\n"
- "fmla v23.8h, v7.8h, v3.h[6]\n"
- "fmla v27.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x10, #0x1d0]\n"
- "fmla v8.8h, v6.8h, v0.h[7]\n"
- "fmla v12.8h, v6.8h, v1.h[7]\n"
- "fmla v16.8h, v6.8h, v2.h[7]\n"
- "fmla v20.8h, v6.8h, v3.h[7]\n"
- "fmla v24.8h, v6.8h, v4.h[7]\n"
- "ldr q6, [x10, #0x1e0]\n"
- "fmla v9.8h, v7.8h, v0.h[7]\n"
- "fmla v13.8h, v7.8h, v1.h[7]\n"
- "fmla v17.8h, v7.8h, v2.h[7]\n"
- "fmla v21.8h, v7.8h, v3.h[7]\n"
- "fmla v25.8h, v7.8h, v4.h[7]\n"
- "ldr q7, [x10, #0x1f0]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v10.8h, v29.8h, v0.h[0]\n"
+ "fmla v14.8h, v29.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v26.8h, v29.8h, v4.h[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ "fmla v11.8h, v28.8h, v0.h[0]\n"
+ "fmla v15.8h, v28.8h, v1.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v3.h[0]\n"
+ "fmla v27.8h, v28.8h, v4.h[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ "fmla v8.8h, v29.8h, v0.h[1]\n"
+ "fmla v12.8h, v29.8h, v1.h[1]\n"
+ "fmla v16.8h, v29.8h, v2.h[1]\n"
+ "fmla v20.8h, v29.8h, v3.h[1]\n"
+ "fmla v24.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ "fmla v9.8h, v28.8h, v0.h[1]\n"
+ "fmla v13.8h, v28.8h, v1.h[1]\n"
+ "fmla v17.8h, v28.8h, v2.h[1]\n"
+ "fmla v21.8h, v28.8h, v3.h[1]\n"
+ "fmla v25.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ "fmla v10.8h, v29.8h, v0.h[1]\n"
+ "fmla v14.8h, v29.8h, v1.h[1]\n"
+ "fmla v18.8h, v29.8h, v2.h[1]\n"
+ "fmla v22.8h, v29.8h, v3.h[1]\n"
+ "fmla v26.8h, v29.8h, v4.h[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ "fmla v11.8h, v28.8h, v0.h[1]\n"
+ "fmla v15.8h, v28.8h, v1.h[1]\n"
+ "fmla v19.8h, v28.8h, v2.h[1]\n"
+ "fmla v23.8h, v28.8h, v3.h[1]\n"
+ "fmla v27.8h, v28.8h, v4.h[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ "fmla v8.8h, v29.8h, v0.h[2]\n"
+ "fmla v12.8h, v29.8h, v1.h[2]\n"
+ "fmla v16.8h, v29.8h, v2.h[2]\n"
+ "fmla v20.8h, v29.8h, v3.h[2]\n"
+ "fmla v24.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ "fmla v9.8h, v28.8h, v0.h[2]\n"
+ "fmla v13.8h, v28.8h, v1.h[2]\n"
+ "fmla v17.8h, v28.8h, v2.h[2]\n"
+ "fmla v21.8h, v28.8h, v3.h[2]\n"
+ "fmla v25.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ "fmla v10.8h, v29.8h, v0.h[2]\n"
+ "fmla v14.8h, v29.8h, v1.h[2]\n"
+ "fmla v18.8h, v29.8h, v2.h[2]\n"
+ "fmla v22.8h, v29.8h, v3.h[2]\n"
+ "fmla v26.8h, v29.8h, v4.h[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ "fmla v11.8h, v28.8h, v0.h[2]\n"
+ "fmla v15.8h, v28.8h, v1.h[2]\n"
+ "fmla v19.8h, v28.8h, v2.h[2]\n"
+ "fmla v23.8h, v28.8h, v3.h[2]\n"
+ "fmla v27.8h, v28.8h, v4.h[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ "fmla v8.8h, v29.8h, v0.h[3]\n"
+ "fmla v12.8h, v29.8h, v1.h[3]\n"
+ "fmla v16.8h, v29.8h, v2.h[3]\n"
+ "fmla v20.8h, v29.8h, v3.h[3]\n"
+ "fmla v24.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ "fmla v9.8h, v28.8h, v0.h[3]\n"
+ "fmla v13.8h, v28.8h, v1.h[3]\n"
+ "fmla v17.8h, v28.8h, v2.h[3]\n"
+ "fmla v21.8h, v28.8h, v3.h[3]\n"
+ "fmla v25.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
+ "fmla v10.8h, v29.8h, v0.h[3]\n"
+ "fmla v14.8h, v29.8h, v1.h[3]\n"
+ "fmla v18.8h, v29.8h, v2.h[3]\n"
+ "fmla v22.8h, v29.8h, v3.h[3]\n"
+ "fmla v26.8h, v29.8h, v4.h[3]\n"
+ "ldr q29, [x10, #0x100]\n"
+ "fmla v11.8h, v28.8h, v0.h[3]\n"
+ "fmla v15.8h, v28.8h, v1.h[3]\n"
+ "fmla v19.8h, v28.8h, v2.h[3]\n"
+ "fmla v23.8h, v28.8h, v3.h[3]\n"
+ "fmla v27.8h, v28.8h, v4.h[3]\n"
+ "ldr q28, [x10, #0x110]\n"
+ "fmla v8.8h, v29.8h, v0.h[4]\n"
+ "fmla v12.8h, v29.8h, v1.h[4]\n"
+ "fmla v16.8h, v29.8h, v2.h[4]\n"
+ "fmla v20.8h, v29.8h, v3.h[4]\n"
+ "fmla v24.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x10, #0x120]\n"
+ "fmla v9.8h, v28.8h, v0.h[4]\n"
+ "fmla v13.8h, v28.8h, v1.h[4]\n"
+ "fmla v17.8h, v28.8h, v2.h[4]\n"
+ "fmla v21.8h, v28.8h, v3.h[4]\n"
+ "fmla v25.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x10, #0x130]\n"
+ "fmla v10.8h, v29.8h, v0.h[4]\n"
+ "fmla v14.8h, v29.8h, v1.h[4]\n"
+ "fmla v18.8h, v29.8h, v2.h[4]\n"
+ "fmla v22.8h, v29.8h, v3.h[4]\n"
+ "fmla v26.8h, v29.8h, v4.h[4]\n"
+ "ldr q29, [x10, #0x140]\n"
+ "fmla v11.8h, v28.8h, v0.h[4]\n"
+ "fmla v15.8h, v28.8h, v1.h[4]\n"
+ "fmla v19.8h, v28.8h, v2.h[4]\n"
+ "fmla v23.8h, v28.8h, v3.h[4]\n"
+ "fmla v27.8h, v28.8h, v4.h[4]\n"
+ "ldr q28, [x10, #0x150]\n"
+ "fmla v8.8h, v29.8h, v0.h[5]\n"
+ "fmla v12.8h, v29.8h, v1.h[5]\n"
+ "fmla v16.8h, v29.8h, v2.h[5]\n"
+ "fmla v20.8h, v29.8h, v3.h[5]\n"
+ "fmla v24.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x10, #0x160]\n"
+ "fmla v9.8h, v28.8h, v0.h[5]\n"
+ "fmla v13.8h, v28.8h, v1.h[5]\n"
+ "fmla v17.8h, v28.8h, v2.h[5]\n"
+ "fmla v21.8h, v28.8h, v3.h[5]\n"
+ "fmla v25.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x10, #0x170]\n"
+ "fmla v10.8h, v29.8h, v0.h[5]\n"
+ "fmla v14.8h, v29.8h, v1.h[5]\n"
+ "fmla v18.8h, v29.8h, v2.h[5]\n"
+ "fmla v22.8h, v29.8h, v3.h[5]\n"
+ "fmla v26.8h, v29.8h, v4.h[5]\n"
+ "ldr q29, [x10, #0x180]\n"
+ "fmla v11.8h, v28.8h, v0.h[5]\n"
+ "fmla v15.8h, v28.8h, v1.h[5]\n"
+ "fmla v19.8h, v28.8h, v2.h[5]\n"
+ "fmla v23.8h, v28.8h, v3.h[5]\n"
+ "fmla v27.8h, v28.8h, v4.h[5]\n"
+ "ldr q28, [x10, #0x190]\n"
+ "fmla v8.8h, v29.8h, v0.h[6]\n"
+ "fmla v12.8h, v29.8h, v1.h[6]\n"
+ "fmla v16.8h, v29.8h, v2.h[6]\n"
+ "fmla v20.8h, v29.8h, v3.h[6]\n"
+ "fmla v24.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x10, #0x1a0]\n"
+ "fmla v9.8h, v28.8h, v0.h[6]\n"
+ "fmla v13.8h, v28.8h, v1.h[6]\n"
+ "fmla v17.8h, v28.8h, v2.h[6]\n"
+ "fmla v21.8h, v28.8h, v3.h[6]\n"
+ "fmla v25.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x10, #0x1b0]\n"
+ "fmla v10.8h, v29.8h, v0.h[6]\n"
+ "fmla v14.8h, v29.8h, v1.h[6]\n"
+ "fmla v18.8h, v29.8h, v2.h[6]\n"
+ "fmla v22.8h, v29.8h, v3.h[6]\n"
+ "fmla v26.8h, v29.8h, v4.h[6]\n"
+ "ldr q29, [x10, #0x1c0]\n"
+ "fmla v11.8h, v28.8h, v0.h[6]\n"
+ "fmla v15.8h, v28.8h, v1.h[6]\n"
+ "fmla v19.8h, v28.8h, v2.h[6]\n"
+ "fmla v23.8h, v28.8h, v3.h[6]\n"
+ "fmla v27.8h, v28.8h, v4.h[6]\n"
+ "ldr q28, [x10, #0x1d0]\n"
+ "fmla v8.8h, v29.8h, v0.h[7]\n"
+ "fmla v12.8h, v29.8h, v1.h[7]\n"
+ "fmla v16.8h, v29.8h, v2.h[7]\n"
+ "fmla v20.8h, v29.8h, v3.h[7]\n"
+ "fmla v24.8h, v29.8h, v4.h[7]\n"
+ "ldr q29, [x10, #0x1e0]\n"
+ "fmla v9.8h, v28.8h, v0.h[7]\n"
+ "fmla v13.8h, v28.8h, v1.h[7]\n"
+ "fmla v17.8h, v28.8h, v2.h[7]\n"
+ "fmla v21.8h, v28.8h, v3.h[7]\n"
+ "fmla v25.8h, v28.8h, v4.h[7]\n"
+ "ldr q28, [x10, #0x1f0]\n"
"add x10, x10, #0x200\n"
- "fmla v10.8h, v6.8h, v0.h[7]\n"
- "fmla v14.8h, v6.8h, v1.h[7]\n"
- "fmla v18.8h, v6.8h, v2.h[7]\n"
- "fmla v22.8h, v6.8h, v3.h[7]\n"
- "fmla v26.8h, v6.8h, v4.h[7]\n"
- "fmla v11.8h, v7.8h, v0.h[7]\n"
- "fmla v15.8h, v7.8h, v1.h[7]\n"
- "fmla v19.8h, v7.8h, v2.h[7]\n"
- "fmla v23.8h, v7.8h, v3.h[7]\n"
- "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "fmla v10.8h, v29.8h, v0.h[7]\n"
+ "fmla v14.8h, v29.8h, v1.h[7]\n"
+ "fmla v18.8h, v29.8h, v2.h[7]\n"
+ "fmla v22.8h, v29.8h, v3.h[7]\n"
+ "fmla v26.8h, v29.8h, v4.h[7]\n"
+ "fmla v11.8h, v28.8h, v0.h[7]\n"
+ "fmla v15.8h, v28.8h, v1.h[7]\n"
+ "fmla v19.8h, v28.8h, v2.h[7]\n"
+ "fmla v23.8h, v28.8h, v3.h[7]\n"
+ "fmla v27.8h, v28.8h, v4.h[7]\n"
"224:" // Height 5: Multiply loop: Main loop skip
- "cbz x26, 226f\n"
+ "cbz x27, 226f\n"
"225:" // Height 5: Multiply loop: Odd block loop
- "ldr h0, [x25], #0x2\n"
- "sub x26, x26, #0x1\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h4, [x21], #0x2\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr h4, [x26], #0x2\n"
+ "ldr h3, [x25], #0x2\n"
+ "sub x27, x27, #0x1\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h0, [x22], #0x2\n"
+ "ldr q29, [x10, #0x0]\n"
+ "fmla v8.8h, v29.8h, v4.h[0]\n"
+ "fmla v12.8h, v29.8h, v3.h[0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ "fmla v16.8h, v29.8h, v2.h[0]\n"
+ "fmla v20.8h, v29.8h, v1.h[0]\n"
+ "fmla v24.8h, v29.8h, v0.h[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "fmla v9.8h, v28.8h, v4.h[0]\n"
+ "fmla v13.8h, v28.8h, v3.h[0]\n"
+ "fmla v17.8h, v28.8h, v2.h[0]\n"
+ "fmla v21.8h, v28.8h, v1.h[0]\n"
+ "fmla v25.8h, v28.8h, v0.h[0]\n"
+ "ldr q28, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "cbnz x26, 225b\n"
+ "fmla v10.8h, v29.8h, v4.h[0]\n"
+ "fmla v14.8h, v29.8h, v3.h[0]\n"
+ "fmla v18.8h, v29.8h, v2.h[0]\n"
+ "fmla v22.8h, v29.8h, v1.h[0]\n"
+ "fmla v26.8h, v29.8h, v0.h[0]\n"
+ "fmla v11.8h, v28.8h, v4.h[0]\n"
+ "fmla v15.8h, v28.8h, v3.h[0]\n"
+ "fmla v19.8h, v28.8h, v2.h[0]\n"
+ "fmla v23.8h, v28.8h, v1.h[0]\n"
+ "fmla v27.8h, v28.8h, v0.h[0]\n"
+ "cbnz x27, 225b\n"
"226:" // Height 5: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 219b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #1\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #1\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #1\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 227f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.8h }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.8h }, [x19]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmax v8.8h, v8.8h, v1.8h\n"
- "fmax v9.8h, v9.8h, v1.8h\n"
- "fmax v10.8h, v10.8h, v1.8h\n"
- "fmax v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v0.8h\n"
- "fmin v13.8h, v13.8h, v0.8h\n"
- "fmin v14.8h, v14.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v1.8h\n"
- "fmax v13.8h, v13.8h, v1.8h\n"
- "fmax v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v0.8h\n"
- "fmin v16.8h, v16.8h, v0.8h\n"
- "fmin v17.8h, v17.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v1.8h\n"
- "fmax v16.8h, v16.8h, v1.8h\n"
- "fmax v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v0.8h\n"
- "fmin v19.8h, v19.8h, v0.8h\n"
- "fmin v20.8h, v20.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v1.8h\n"
- "fmax v19.8h, v19.8h, v1.8h\n"
- "fmax v20.8h, v20.8h, v1.8h\n"
- "fmin v21.8h, v21.8h, v0.8h\n"
- "fmin v22.8h, v22.8h, v0.8h\n"
- "fmin v23.8h, v23.8h, v0.8h\n"
- "fmax v21.8h, v21.8h, v1.8h\n"
- "fmax v22.8h, v22.8h, v1.8h\n"
- "fmax v23.8h, v23.8h, v1.8h\n"
- "fmin v24.8h, v24.8h, v0.8h\n"
- "fmin v25.8h, v25.8h, v0.8h\n"
- "fmin v26.8h, v26.8h, v0.8h\n"
- "fmax v24.8h, v24.8h, v1.8h\n"
- "fmax v25.8h, v25.8h, v1.8h\n"
- "fmax v26.8h, v26.8h, v1.8h\n"
- "fmin v27.8h, v27.8h, v0.8h\n"
- "fmax v27.8h, v27.8h, v1.8h\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v29.8h }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v28.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v29.8h\n"
+ "fmin v9.8h, v9.8h, v29.8h\n"
+ "fmin v10.8h, v10.8h, v29.8h\n"
+ "fmin v11.8h, v11.8h, v29.8h\n"
+ "fmin v12.8h, v12.8h, v29.8h\n"
+ "fmin v13.8h, v13.8h, v29.8h\n"
+ "fmin v14.8h, v14.8h, v29.8h\n"
+ "fmin v15.8h, v15.8h, v29.8h\n"
+ "fmin v16.8h, v16.8h, v29.8h\n"
+ "fmin v17.8h, v17.8h, v29.8h\n"
+ "fmin v18.8h, v18.8h, v29.8h\n"
+ "fmin v19.8h, v19.8h, v29.8h\n"
+ "fmin v20.8h, v20.8h, v29.8h\n"
+ "fmin v21.8h, v21.8h, v29.8h\n"
+ "fmin v22.8h, v22.8h, v29.8h\n"
+ "fmin v23.8h, v23.8h, v29.8h\n"
+ "fmin v24.8h, v24.8h, v29.8h\n"
+ "fmin v25.8h, v25.8h, v29.8h\n"
+ "fmin v26.8h, v26.8h, v29.8h\n"
+ "fmin v27.8h, v27.8h, v29.8h\n"
+ "fmax v8.8h, v8.8h, v28.8h\n"
+ "fmax v9.8h, v9.8h, v28.8h\n"
+ "fmax v10.8h, v10.8h, v28.8h\n"
+ "fmax v11.8h, v11.8h, v28.8h\n"
+ "fmax v12.8h, v12.8h, v28.8h\n"
+ "fmax v13.8h, v13.8h, v28.8h\n"
+ "fmax v14.8h, v14.8h, v28.8h\n"
+ "fmax v15.8h, v15.8h, v28.8h\n"
+ "fmax v16.8h, v16.8h, v28.8h\n"
+ "fmax v17.8h, v17.8h, v28.8h\n"
+ "fmax v18.8h, v18.8h, v28.8h\n"
+ "fmax v19.8h, v19.8h, v28.8h\n"
+ "fmax v20.8h, v20.8h, v28.8h\n"
+ "fmax v21.8h, v21.8h, v28.8h\n"
+ "fmax v22.8h, v22.8h, v28.8h\n"
+ "fmax v23.8h, v23.8h, v28.8h\n"
+ "fmax v24.8h, v24.8h, v28.8h\n"
+ "fmax v25.8h, v25.8h, v28.8h\n"
+ "fmax v26.8h, v26.8h, v28.8h\n"
+ "fmax v27.8h, v27.8h, v28.8h\n"
"227:" // Height 5: No activation
"cmp x11, #0x20\n"
"bge 244f\n"
"tbz x11, #4, 235f\n"
- "st1 { v8.8h }, [x28], #0x10\n"
- "st1 { v9.8h }, [x28], #0x10\n"
- "st1 { v12.8h }, [x24], #0x10\n"
- "st1 { v13.8h }, [x24], #0x10\n"
- "st1 { v16.8h }, [x23], #0x10\n"
- "st1 { v17.8h }, [x23], #0x10\n"
- "st1 { v20.8h }, [x22], #0x10\n"
- "st1 { v21.8h }, [x22], #0x10\n"
- "st1 { v24.8h }, [x21], #0x10\n"
- "st1 { v25.8h }, [x21], #0x10\n"
+ "st1 { v8.8h }, [x9], #0x10\n"
+ "st1 { v9.8h }, [x9], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v25.8h }, [x22], #0x10\n"
"tbz x11, #3, 231f\n"
- "st1 { v10.8h }, [x28], #0x10\n"
- "st1 { v14.8h }, [x24], #0x10\n"
- "st1 { v18.8h }, [x23], #0x10\n"
- "st1 { v22.8h }, [x22], #0x10\n"
- "st1 { v26.8h }, [x21], #0x10\n"
+ "st1 { v10.8h }, [x9], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "st1 { v26.8h }, [x22], #0x10\n"
"tbz x11, #2, 229f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
- "str d27, [x21], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
"tbz x11, #1, 228f\n"
- "st1 { v11.s }[2], [x28], #0x4\n"
- "st1 { v15.s }[2], [x24], #0x4\n"
- "st1 { v19.s }[2], [x23], #0x4\n"
- "st1 { v23.s }[2], [x22], #0x4\n"
- "st1 { v27.s }[2], [x21], #0x4\n"
+ "st1 { v11.s }[2], [x9], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "st1 { v27.s }[2], [x22], #0x4\n"
"tbz x11, #0, 243f\n"
- "st1 { v11.h }[6], [x28]\n"
- "st1 { v15.h }[6], [x24]\n"
- "st1 { v19.h }[6], [x23]\n"
- "st1 { v23.h }[6], [x22]\n"
- "st1 { v27.h }[6], [x21]\n"
+ "st1 { v11.h }[6], [x9]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "st1 { v27.h }[6], [x22]\n"
"b 243f\n"
"228:" // Height 5: Partial direct writeback: partial_1_28
"tbz x11, #0, 243f\n"
- "st1 { v11.h }[4], [x28]\n"
- "st1 { v15.h }[4], [x24]\n"
- "st1 { v19.h }[4], [x23]\n"
- "st1 { v23.h }[4], [x22]\n"
- "st1 { v27.h }[4], [x21]\n"
+ "st1 { v11.h }[4], [x9]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "st1 { v27.h }[4], [x22]\n"
"b 243f\n"
"229:" // Height 5: Partial direct writeback: partial_2_24
"tbz x11, #1, 230f\n"
- "str s11, [x28], #0x4\n"
- "str s15, [x24], #0x4\n"
- "str s19, [x23], #0x4\n"
- "str s23, [x22], #0x4\n"
- "str s27, [x21], #0x4\n"
+ "str s11, [x9], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "str s27, [x22], #0x4\n"
"tbz x11, #0, 243f\n"
- "st1 { v11.h }[2], [x28]\n"
- "st1 { v15.h }[2], [x24]\n"
- "st1 { v19.h }[2], [x23]\n"
- "st1 { v23.h }[2], [x22]\n"
- "st1 { v27.h }[2], [x21]\n"
+ "st1 { v11.h }[2], [x9]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "st1 { v27.h }[2], [x22]\n"
"b 243f\n"
"230:" // Height 5: Partial direct writeback: partial_1_24
"tbz x11, #0, 243f\n"
- "str h11, [x28, #0x0]\n"
- "str h15, [x24, #0x0]\n"
- "str h19, [x23, #0x0]\n"
- "str h23, [x22, #0x0]\n"
- "str h27, [x21, #0x0]\n"
+ "str h11, [x9, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "str h27, [x22, #0x0]\n"
"b 243f\n"
"231:" // Height 5: Partial direct writeback: partial_4_16
"tbz x11, #2, 233f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
- "str d26, [x21], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
"tbz x11, #1, 232f\n"
- "st1 { v10.s }[2], [x28], #0x4\n"
- "st1 { v14.s }[2], [x24], #0x4\n"
- "st1 { v18.s }[2], [x23], #0x4\n"
- "st1 { v22.s }[2], [x22], #0x4\n"
- "st1 { v26.s }[2], [x21], #0x4\n"
+ "st1 { v10.s }[2], [x9], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "st1 { v26.s }[2], [x22], #0x4\n"
"tbz x11, #0, 243f\n"
- "st1 { v10.h }[6], [x28]\n"
- "st1 { v14.h }[6], [x24]\n"
- "st1 { v18.h }[6], [x23]\n"
- "st1 { v22.h }[6], [x22]\n"
- "st1 { v26.h }[6], [x21]\n"
+ "st1 { v10.h }[6], [x9]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "st1 { v26.h }[6], [x22]\n"
"b 243f\n"
"232:" // Height 5: Partial direct writeback: partial_1_20
"tbz x11, #0, 243f\n"
- "st1 { v10.h }[4], [x28]\n"
- "st1 { v14.h }[4], [x24]\n"
- "st1 { v18.h }[4], [x23]\n"
- "st1 { v22.h }[4], [x22]\n"
- "st1 { v26.h }[4], [x21]\n"
+ "st1 { v10.h }[4], [x9]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "st1 { v26.h }[4], [x22]\n"
"b 243f\n"
"233:" // Height 5: Partial direct writeback: partial_2_16
"tbz x11, #1, 234f\n"
- "str s10, [x28], #0x4\n"
- "str s14, [x24], #0x4\n"
- "str s18, [x23], #0x4\n"
- "str s22, [x22], #0x4\n"
- "str s26, [x21], #0x4\n"
+ "str s10, [x9], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "str s26, [x22], #0x4\n"
"tbz x11, #0, 243f\n"
- "st1 { v10.h }[2], [x28]\n"
- "st1 { v14.h }[2], [x24]\n"
- "st1 { v18.h }[2], [x23]\n"
- "st1 { v22.h }[2], [x22]\n"
- "st1 { v26.h }[2], [x21]\n"
+ "st1 { v10.h }[2], [x9]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "st1 { v26.h }[2], [x22]\n"
"b 243f\n"
"234:" // Height 5: Partial direct writeback: partial_1_16
"tbz x11, #0, 243f\n"
- "str h10, [x28, #0x0]\n"
- "str h14, [x24, #0x0]\n"
- "str h18, [x23, #0x0]\n"
- "str h22, [x22, #0x0]\n"
- "str h26, [x21, #0x0]\n"
+ "str h10, [x9, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "str h26, [x22, #0x0]\n"
"b 243f\n"
"235:" // Height 5: Partial direct writeback: partial_8_0
"tbz x11, #3, 239f\n"
- "st1 { v8.8h }, [x28], #0x10\n"
- "st1 { v12.8h }, [x24], #0x10\n"
- "st1 { v16.8h }, [x23], #0x10\n"
- "st1 { v20.8h }, [x22], #0x10\n"
- "st1 { v24.8h }, [x21], #0x10\n"
+ "st1 { v8.8h }, [x9], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
"tbz x11, #2, 237f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
- "str d25, [x21], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x11, #1, 236f\n"
- "st1 { v9.s }[2], [x28], #0x4\n"
- "st1 { v13.s }[2], [x24], #0x4\n"
- "st1 { v17.s }[2], [x23], #0x4\n"
- "st1 { v21.s }[2], [x22], #0x4\n"
- "st1 { v25.s }[2], [x21], #0x4\n"
+ "st1 { v9.s }[2], [x9], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "st1 { v25.s }[2], [x22], #0x4\n"
"tbz x11, #0, 243f\n"
- "st1 { v9.h }[6], [x28]\n"
- "st1 { v13.h }[6], [x24]\n"
- "st1 { v17.h }[6], [x23]\n"
- "st1 { v21.h }[6], [x22]\n"
- "st1 { v25.h }[6], [x21]\n"
+ "st1 { v9.h }[6], [x9]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
"b 243f\n"
"236:" // Height 5: Partial direct writeback: partial_1_12
"tbz x11, #0, 243f\n"
- "st1 { v9.h }[4], [x28]\n"
- "st1 { v13.h }[4], [x24]\n"
- "st1 { v17.h }[4], [x23]\n"
- "st1 { v21.h }[4], [x22]\n"
- "st1 { v25.h }[4], [x21]\n"
+ "st1 { v9.h }[4], [x9]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
"b 243f\n"
"237:" // Height 5: Partial direct writeback: partial_2_8
"tbz x11, #1, 238f\n"
- "str s9, [x28], #0x4\n"
- "str s13, [x24], #0x4\n"
- "str s17, [x23], #0x4\n"
- "str s21, [x22], #0x4\n"
- "str s25, [x21], #0x4\n"
+ "str s9, [x9], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "str s25, [x22], #0x4\n"
"tbz x11, #0, 243f\n"
- "st1 { v9.h }[2], [x28]\n"
- "st1 { v13.h }[2], [x24]\n"
- "st1 { v17.h }[2], [x23]\n"
- "st1 { v21.h }[2], [x22]\n"
- "st1 { v25.h }[2], [x21]\n"
+ "st1 { v9.h }[2], [x9]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
"b 243f\n"
"238:" // Height 5: Partial direct writeback: partial_1_8
"tbz x11, #0, 243f\n"
- "str h9, [x28, #0x0]\n"
- "str h13, [x24, #0x0]\n"
- "str h17, [x23, #0x0]\n"
- "str h21, [x22, #0x0]\n"
- "str h25, [x21, #0x0]\n"
+ "str h9, [x9, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "str h25, [x22, #0x0]\n"
"b 243f\n"
"239:" // Height 5: Partial direct writeback: partial_4_0
"tbz x11, #2, 241f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x11, #1, 240f\n"
- "st1 { v8.s }[2], [x28], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
- "st1 { v16.s }[2], [x23], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
+ "st1 { v8.s }[2], [x9], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x11, #0, 243f\n"
- "st1 { v8.h }[6], [x28]\n"
- "st1 { v12.h }[6], [x24]\n"
- "st1 { v16.h }[6], [x23]\n"
- "st1 { v20.h }[6], [x22]\n"
- "st1 { v24.h }[6], [x21]\n"
+ "st1 { v8.h }[6], [x9]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "st1 { v24.h }[6], [x22]\n"
"b 243f\n"
"240:" // Height 5: Partial direct writeback: partial_1_4
"tbz x11, #0, 243f\n"
- "st1 { v8.h }[4], [x28]\n"
- "st1 { v12.h }[4], [x24]\n"
- "st1 { v16.h }[4], [x23]\n"
- "st1 { v20.h }[4], [x22]\n"
- "st1 { v24.h }[4], [x21]\n"
+ "st1 { v8.h }[4], [x9]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "st1 { v24.h }[4], [x22]\n"
"b 243f\n"
"241:" // Height 5: Partial direct writeback: partial_2_0
"tbz x11, #1, 242f\n"
- "str s8, [x28], #0x4\n"
- "str s12, [x24], #0x4\n"
- "str s16, [x23], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
+ "str s8, [x9], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x11, #0, 243f\n"
- "st1 { v8.h }[2], [x28]\n"
- "st1 { v12.h }[2], [x24]\n"
- "st1 { v16.h }[2], [x23]\n"
- "st1 { v20.h }[2], [x22]\n"
- "st1 { v24.h }[2], [x21]\n"
+ "st1 { v8.h }[2], [x9]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "st1 { v24.h }[2], [x22]\n"
"b 243f\n"
"242:" // Height 5: Partial direct writeback: partial_1_0
- "str h8, [x28, #0x0]\n"
- "str h12, [x24, #0x0]\n"
- "str h16, [x23, #0x0]\n"
- "str h20, [x22, #0x0]\n"
- "str h24, [x21, #0x0]\n"
+ "str h8, [x9, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "str h24, [x22, #0x0]\n"
"243:" // Height 5: Partial direct writeback: Done
"b 245f\n"
"244:" // Height 5: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
- "str q24, [x21, #0x0]\n"
- "str q25, [x21, #0x10]\n"
- "str q26, [x21, #0x20]\n"
- "str q27, [x21, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
"245:" // Height 5: Writeback done
"subs x11, x11, #0x20\n"
"bgt 198b\n"
"b 296f\n"
"246:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0xc\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x19, #0xc\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"247:" // Height 6: Column loop
- "cbz x9, 248f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 248f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "mov v16.16b, v8.16b\n"
- "ldr q10, [x9, #0x20]\n"
- "mov v20.16b, v8.16b\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "mov v24.16b, v8.16b\n"
- "mov v28.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
"mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
+ "mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
"mov v27.16b, v11.16b\n"
+ "mov v28.16b, v8.16b\n"
"mov v29.16b, v9.16b\n"
"mov v30.16b, v10.16b\n"
"mov v31.16b, v11.16b\n"
"b 267f\n"
"248:" // Height 6: no bias
"tbz %x[flags], #0, 266f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"cmp x11, #0x20\n"
- "add x24, x28, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
- "add x21, x22, x19, LSL #1\n"
- "add x20, x21, x19, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"bge 265f\n"
"tbz x11, #4, 256f\n"
- "ld1 { v8.8h }, [x28], #0x10\n"
- "ld1 { v12.8h }, [x24], #0x10\n"
- "ld1 { v16.8h }, [x23], #0x10\n"
- "ld1 { v20.8h }, [x22], #0x10\n"
- "ld1 { v24.8h }, [x21], #0x10\n"
- "ld1 { v28.8h }, [x20], #0x10\n"
- "ld1 { v9.8h }, [x28], #0x10\n"
- "ld1 { v13.8h }, [x24], #0x10\n"
- "ld1 { v17.8h }, [x23], #0x10\n"
- "ld1 { v21.8h }, [x22], #0x10\n"
- "ld1 { v25.8h }, [x21], #0x10\n"
- "ld1 { v29.8h }, [x20], #0x10\n"
+ "ld1 { v8.8h }, [x9], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
+ "ld1 { v9.8h }, [x9], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "ld1 { v25.8h }, [x22], #0x10\n"
+ "ld1 { v29.8h }, [x21], #0x10\n"
"tbz x11, #3, 252f\n"
- "ld1 { v10.8h }, [x28], #0x10\n"
- "ld1 { v14.8h }, [x24], #0x10\n"
- "ld1 { v18.8h }, [x23], #0x10\n"
- "ld1 { v22.8h }, [x22], #0x10\n"
- "ld1 { v26.8h }, [x21], #0x10\n"
- "ld1 { v30.8h }, [x20], #0x10\n"
+ "ld1 { v10.8h }, [x9], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "ld1 { v26.8h }, [x22], #0x10\n"
+ "ld1 { v30.8h }, [x21], #0x10\n"
"tbz x11, #2, 250f\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d27, [x21], #0x8\n"
- "ldr d31, [x20], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x11, #1, 249f\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
- "mov x19, #0x3c\n"
- "ld1 { v15.s }[2], [x24], #0x4\n"
- "ld1 { v19.s }[2], [x23], #0x4\n"
- "ld1 { v23.s }[2], [x22], #0x4\n"
- "ld1 { v27.s }[2], [x21], #0x4\n"
- "ld1 { v31.s }[2], [x20], #0x4\n"
+ "ld1 { v11.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "mov x20, #0x3c\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
"tbz x11, #0, 264f\n"
- "ld1 { v11.h }[6], [x28]\n"
- "ld1 { v15.h }[6], [x24]\n"
- "ld1 { v19.h }[6], [x23]\n"
- "ld1 { v23.h }[6], [x22]\n"
- "ld1 { v27.h }[6], [x21]\n"
- "ld1 { v31.h }[6], [x20]\n"
+ "ld1 { v11.h }[6], [x9]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v27.h }[6], [x22]\n"
+ "ld1 { v31.h }[6], [x21]\n"
"b 264f\n"
"249:" // Height 6: Partial accumulate: partial_1_28
- "mov x19, #0x38\n"
+ "mov x20, #0x38\n"
"tbz x11, #0, 264f\n"
- "ld1 { v11.h }[4], [x28]\n"
- "ld1 { v15.h }[4], [x24]\n"
- "ld1 { v19.h }[4], [x23]\n"
- "ld1 { v23.h }[4], [x22]\n"
- "ld1 { v27.h }[4], [x21]\n"
- "ld1 { v31.h }[4], [x20]\n"
+ "ld1 { v11.h }[4], [x9]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v27.h }[4], [x22]\n"
+ "ld1 { v31.h }[4], [x21]\n"
"b 264f\n"
"250:" // Height 6: Partial accumulate: partial_2_24
"tbz x11, #1, 251f\n"
- "ldr s11, [x28], #0x4\n"
- "ldr s15, [x24], #0x4\n"
- "mov x19, #0x34\n"
- "ldr s19, [x23], #0x4\n"
- "ldr s23, [x22], #0x4\n"
- "ldr s27, [x21], #0x4\n"
- "ldr s31, [x20], #0x4\n"
+ "ldr s11, [x9], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x20, #0x34\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
"tbz x11, #0, 264f\n"
- "ld1 { v11.h }[2], [x28]\n"
- "ld1 { v15.h }[2], [x24]\n"
- "ld1 { v19.h }[2], [x23]\n"
- "ld1 { v23.h }[2], [x22]\n"
- "ld1 { v27.h }[2], [x21]\n"
- "ld1 { v31.h }[2], [x20]\n"
+ "ld1 { v11.h }[2], [x9]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v27.h }[2], [x22]\n"
+ "ld1 { v31.h }[2], [x21]\n"
"b 264f\n"
"251:" // Height 6: Partial accumulate: partial_1_24
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 264f\n"
- "ldr h11, [x28, #0x0]\n"
- "ldr h15, [x24, #0x0]\n"
- "ldr h19, [x23, #0x0]\n"
- "ldr h23, [x22, #0x0]\n"
- "ldr h27, [x21, #0x0]\n"
- "ldr h31, [x20, #0x0]\n"
+ "ldr h11, [x9, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h27, [x22, #0x0]\n"
+ "ldr h31, [x21, #0x0]\n"
"b 264f\n"
"252:" // Height 6: Partial accumulate: partial_4_16
"tbz x11, #2, 254f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
- "ldr d30, [x20], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x11, #1, 253f\n"
- "ld1 { v10.s }[2], [x28], #0x4\n"
- "mov x19, #0x2c\n"
- "ld1 { v14.s }[2], [x24], #0x4\n"
- "ld1 { v18.s }[2], [x23], #0x4\n"
- "ld1 { v22.s }[2], [x22], #0x4\n"
- "ld1 { v26.s }[2], [x21], #0x4\n"
- "ld1 { v30.s }[2], [x20], #0x4\n"
+ "ld1 { v10.s }[2], [x9], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "mov x20, #0x2c\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
+ "ld1 { v30.s }[2], [x21], #0x4\n"
"tbz x11, #0, 264f\n"
- "ld1 { v10.h }[6], [x28]\n"
- "ld1 { v14.h }[6], [x24]\n"
- "ld1 { v18.h }[6], [x23]\n"
- "ld1 { v22.h }[6], [x22]\n"
- "ld1 { v26.h }[6], [x21]\n"
- "ld1 { v30.h }[6], [x20]\n"
+ "ld1 { v10.h }[6], [x9]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v26.h }[6], [x22]\n"
+ "ld1 { v30.h }[6], [x21]\n"
"b 264f\n"
"253:" // Height 6: Partial accumulate: partial_1_20
- "mov x19, #0x28\n"
+ "mov x20, #0x28\n"
"tbz x11, #0, 264f\n"
- "ld1 { v10.h }[4], [x28]\n"
- "ld1 { v14.h }[4], [x24]\n"
- "ld1 { v18.h }[4], [x23]\n"
- "ld1 { v22.h }[4], [x22]\n"
- "ld1 { v26.h }[4], [x21]\n"
- "ld1 { v30.h }[4], [x20]\n"
+ "ld1 { v10.h }[4], [x9]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v26.h }[4], [x22]\n"
+ "ld1 { v30.h }[4], [x21]\n"
"b 264f\n"
"254:" // Height 6: Partial accumulate: partial_2_16
"tbz x11, #1, 255f\n"
- "ldr s10, [x28], #0x4\n"
- "ldr s14, [x24], #0x4\n"
- "mov x19, #0x24\n"
- "ldr s18, [x23], #0x4\n"
- "ldr s22, [x22], #0x4\n"
- "ldr s26, [x21], #0x4\n"
- "ldr s30, [x20], #0x4\n"
+ "ldr s10, [x9], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x20, #0x24\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
+ "ldr s30, [x21], #0x4\n"
"tbz x11, #0, 264f\n"
- "ld1 { v10.h }[2], [x28]\n"
- "ld1 { v14.h }[2], [x24]\n"
- "ld1 { v18.h }[2], [x23]\n"
- "ld1 { v22.h }[2], [x22]\n"
- "ld1 { v26.h }[2], [x21]\n"
- "ld1 { v30.h }[2], [x20]\n"
+ "ld1 { v10.h }[2], [x9]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
+ "ld1 { v30.h }[2], [x21]\n"
"b 264f\n"
"255:" // Height 6: Partial accumulate: partial_1_16
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 264f\n"
- "ldr h10, [x28, #0x0]\n"
- "ldr h14, [x24, #0x0]\n"
- "ldr h18, [x23, #0x0]\n"
- "ldr h22, [x22, #0x0]\n"
- "ldr h26, [x21, #0x0]\n"
- "ldr h30, [x20, #0x0]\n"
+ "ldr h10, [x9, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
+ "ldr h30, [x21, #0x0]\n"
"b 264f\n"
"256:" // Height 6: Partial accumulate: partial_8_0
"tbz x11, #3, 260f\n"
- "ld1 { v8.8h }, [x28], #0x10\n"
- "ld1 { v12.8h }, [x24], #0x10\n"
- "ld1 { v16.8h }, [x23], #0x10\n"
- "ld1 { v20.8h }, [x22], #0x10\n"
- "ld1 { v24.8h }, [x21], #0x10\n"
- "ld1 { v28.8h }, [x20], #0x10\n"
+ "ld1 { v8.8h }, [x9], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
"tbz x11, #2, 258f\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d25, [x21], #0x8\n"
- "ldr d29, [x20], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x11, #1, 257f\n"
- "ld1 { v9.s }[2], [x28], #0x4\n"
- "mov x19, #0x1c\n"
- "ld1 { v13.s }[2], [x24], #0x4\n"
- "ld1 { v17.s }[2], [x23], #0x4\n"
- "ld1 { v21.s }[2], [x22], #0x4\n"
- "ld1 { v25.s }[2], [x21], #0x4\n"
- "ld1 { v29.s }[2], [x20], #0x4\n"
+ "ld1 { v9.s }[2], [x9], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "mov x20, #0x1c\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v29.s }[2], [x21], #0x4\n"
"tbz x11, #0, 264f\n"
- "ld1 { v9.h }[6], [x28]\n"
- "ld1 { v13.h }[6], [x24]\n"
- "ld1 { v17.h }[6], [x23]\n"
- "ld1 { v21.h }[6], [x22]\n"
- "ld1 { v25.h }[6], [x21]\n"
- "ld1 { v29.h }[6], [x20]\n"
+ "ld1 { v9.h }[6], [x9]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
+ "ld1 { v29.h }[6], [x21]\n"
"b 264f\n"
"257:" // Height 6: Partial accumulate: partial_1_12
- "mov x19, #0x18\n"
+ "mov x20, #0x18\n"
"tbz x11, #0, 264f\n"
- "ld1 { v9.h }[4], [x28]\n"
- "ld1 { v13.h }[4], [x24]\n"
- "ld1 { v17.h }[4], [x23]\n"
- "ld1 { v21.h }[4], [x22]\n"
- "ld1 { v25.h }[4], [x21]\n"
- "ld1 { v29.h }[4], [x20]\n"
+ "ld1 { v9.h }[4], [x9]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
+ "ld1 { v29.h }[4], [x21]\n"
"b 264f\n"
"258:" // Height 6: Partial accumulate: partial_2_8
"tbz x11, #1, 259f\n"
- "ldr s9, [x28], #0x4\n"
- "ldr s13, [x24], #0x4\n"
- "mov x19, #0x14\n"
- "ldr s17, [x23], #0x4\n"
- "ldr s21, [x22], #0x4\n"
- "ldr s25, [x21], #0x4\n"
- "ldr s29, [x20], #0x4\n"
+ "ldr s9, [x9], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x20, #0x14\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "ldr s29, [x21], #0x4\n"
"tbz x11, #0, 264f\n"
- "ld1 { v9.h }[2], [x28]\n"
- "ld1 { v13.h }[2], [x24]\n"
- "ld1 { v17.h }[2], [x23]\n"
- "ld1 { v21.h }[2], [x22]\n"
- "ld1 { v25.h }[2], [x21]\n"
- "ld1 { v29.h }[2], [x20]\n"
+ "ld1 { v9.h }[2], [x9]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
+ "ld1 { v29.h }[2], [x21]\n"
"b 264f\n"
"259:" // Height 6: Partial accumulate: partial_1_8
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 264f\n"
- "ldr h9, [x28, #0x0]\n"
- "ldr h13, [x24, #0x0]\n"
- "ldr h17, [x23, #0x0]\n"
- "ldr h21, [x22, #0x0]\n"
- "ldr h25, [x21, #0x0]\n"
- "ldr h29, [x20, #0x0]\n"
+ "ldr h9, [x9, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
+ "ldr h29, [x21, #0x0]\n"
"b 264f\n"
"260:" // Height 6: Partial accumulate: partial_4_0
"tbz x11, #2, 262f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "ldr d16, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d24, [x21], #0x8\n"
- "ldr d28, [x20], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x11, #1, 261f\n"
- "ld1 { v8.s }[2], [x28], #0x4\n"
- "mov x19, #0xc\n"
- "ld1 { v12.s }[2], [x24], #0x4\n"
- "ld1 { v16.s }[2], [x23], #0x4\n"
- "ld1 { v20.s }[2], [x22], #0x4\n"
- "ld1 { v24.s }[2], [x21], #0x4\n"
- "ld1 { v28.s }[2], [x20], #0x4\n"
+ "ld1 { v8.s }[2], [x9], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "mov x20, #0xc\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
"tbz x11, #0, 264f\n"
- "ld1 { v8.h }[6], [x28]\n"
- "ld1 { v12.h }[6], [x24]\n"
- "ld1 { v16.h }[6], [x23]\n"
- "ld1 { v20.h }[6], [x22]\n"
- "ld1 { v24.h }[6], [x21]\n"
- "ld1 { v28.h }[6], [x20]\n"
+ "ld1 { v8.h }[6], [x9]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "ld1 { v28.h }[6], [x21]\n"
"b 264f\n"
"261:" // Height 6: Partial accumulate: partial_1_4
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"tbz x11, #0, 264f\n"
- "ld1 { v8.h }[4], [x28]\n"
- "ld1 { v12.h }[4], [x24]\n"
- "ld1 { v16.h }[4], [x23]\n"
- "ld1 { v20.h }[4], [x22]\n"
- "ld1 { v24.h }[4], [x21]\n"
- "ld1 { v28.h }[4], [x20]\n"
+ "ld1 { v8.h }[4], [x9]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "ld1 { v28.h }[4], [x21]\n"
"b 264f\n"
"262:" // Height 6: Partial accumulate: partial_2_0
"tbz x11, #1, 263f\n"
- "ldr s8, [x28], #0x4\n"
- "ldr s12, [x24], #0x4\n"
- "mov x19, #0x4\n"
- "ldr s16, [x23], #0x4\n"
- "ldr s20, [x22], #0x4\n"
- "ldr s24, [x21], #0x4\n"
- "ldr s28, [x20], #0x4\n"
+ "ldr s8, [x9], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x20, #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
"tbz x11, #0, 264f\n"
- "ld1 { v8.h }[2], [x28]\n"
- "ld1 { v12.h }[2], [x24]\n"
- "ld1 { v16.h }[2], [x23]\n"
- "ld1 { v20.h }[2], [x22]\n"
- "ld1 { v24.h }[2], [x21]\n"
- "ld1 { v28.h }[2], [x20]\n"
+ "ld1 { v8.h }[2], [x9]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v28.h }[2], [x21]\n"
"b 264f\n"
"263:" // Height 6: Partial accumulate: partial_1_0
- "ldr h8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr h12, [x24, #0x0]\n"
- "ldr h16, [x23, #0x0]\n"
- "ldr h20, [x22, #0x0]\n"
- "ldr h24, [x21, #0x0]\n"
- "ldr h28, [x20, #0x0]\n"
+ "ldr h8, [x9, #0x0]\n"
+ "ldr h12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h28, [x21, #0x0]\n"
"264:" // Height 6: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 267f\n"
"265:" // Height 6: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "ldr q28, [x20, #0x0]\n"
- "ldr q29, [x20, #0x10]\n"
- "ldr q30, [x20, #0x20]\n"
- "ldr q31, [x20, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
"b 267f\n"
"266:" // Height 6: no accumulate
"movi v8.16b, #0x0\n"
@@ -4385,82 +4382,82 @@ void a64_hybrid_fp16_mla_6x32 (
"movi v30.16b, #0x0\n"
"movi v31.16b, #0x0\n"
"267:" // Height 6: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"268:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 269f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x27, 270f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
- "add x21, x21, x19, LSL #1\n"
- "add x20, x20, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 270f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "add x21, x21, x20, LSL #1\n"
"b 270f\n"
"269:" // Height 6: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
- "add x21, x22, x19, LSL #1\n"
- "add x20, x21, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"270:" // Height 6: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"blt 273f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x10\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x10\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 272f\n"
"271:" // Height 6: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x8\n"
+ "add x26, x26, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x21, x21, #0x10\n"
"fmla v28.8h, v6.8h, v5.h[0]\n"
"ldr q6, [x10, #0x20]\n"
- "add x20, x20, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "sub x26, x26, #0x8\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "cmp x26, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
"fmla v29.8h, v7.8h, v5.h[0]\n"
"ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla v26.8h, v6.8h, v4.h[0]\n"
"fmla v30.8h, v6.8h, v5.h[0]\n"
"ldr q6, [x10, #0x40]\n"
@@ -4662,51 +4659,51 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v30.8h, v6.8h, v5.h[7]\n"
"ldr q6, [x10, #0x0]\n"
"fmla v11.8h, v7.8h, v0.h[7]\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
"fmla v15.8h, v7.8h, v1.h[7]\n"
- "ldr q1, [x24, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
"fmla v19.8h, v7.8h, v2.h[7]\n"
- "ldr q2, [x23, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
"fmla v23.8h, v7.8h, v3.h[7]\n"
- "ldr q3, [x22, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
"fmla v27.8h, v7.8h, v4.h[7]\n"
- "ldr q4, [x21, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
"fmla v31.8h, v7.8h, v5.h[7]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 271b\n"
"272:" // Height 6: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x8\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "add x24, x24, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
"fmla v28.8h, v6.8h, v5.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x21, x21, #0x10\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
"ldr q6, [x10, #0x20]\n"
- "add x20, x20, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x8\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
"fmla v29.8h, v7.8h, v5.h[0]\n"
"ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla v26.8h, v6.8h, v4.h[0]\n"
"fmla v30.8h, v6.8h, v5.h[0]\n"
"ldr q6, [x10, #0x40]\n"
@@ -4913,417 +4910,416 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v27.8h, v7.8h, v4.h[7]\n"
"fmla v31.8h, v7.8h, v5.h[7]\n"
"273:" // Height 6: Multiply loop: Main loop skip
- "cbz x26, 275f\n"
+ "cbz x27, 275f\n"
"274:" // Height 6: Multiply loop: Odd block loop
- "ldr h0, [x25], #0x2\n"
- "sub x26, x26, #0x1\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
+ "ldr h7, [x26], #0x2\n"
+ "ldr h6, [x25], #0x2\n"
+ "sub x27, x27, #0x1\n"
+ "ldr h5, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"ldr h3, [x22], #0x2\n"
- "ldr h4, [x21], #0x2\n"
- "ldr h5, [x20], #0x2\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v12.8h, v6.8h, v1.h[0]\n"
- "fmla v16.8h, v6.8h, v2.h[0]\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "fmla v24.8h, v6.8h, v4.h[0]\n"
- "fmla v28.8h, v6.8h, v5.h[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
- "fmla v21.8h, v7.8h, v3.h[0]\n"
- "fmla v25.8h, v7.8h, v4.h[0]\n"
- "fmla v29.8h, v7.8h, v5.h[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr h2, [x21], #0x2\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "fmla v8.8h, v1.8h, v7.h[0]\n"
+ "fmla v12.8h, v1.8h, v6.h[0]\n"
+ "fmla v16.8h, v1.8h, v5.h[0]\n"
+ "fmla v20.8h, v1.8h, v4.h[0]\n"
+ "fmla v24.8h, v1.8h, v3.h[0]\n"
+ "fmla v28.8h, v1.8h, v2.h[0]\n"
+ "ldr q1, [x10, #0x20]\n"
+ "fmla v9.8h, v0.8h, v7.h[0]\n"
+ "fmla v13.8h, v0.8h, v6.h[0]\n"
+ "fmla v17.8h, v0.8h, v5.h[0]\n"
+ "fmla v21.8h, v0.8h, v4.h[0]\n"
+ "fmla v25.8h, v0.8h, v3.h[0]\n"
+ "fmla v29.8h, v0.8h, v2.h[0]\n"
+ "ldr q0, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v14.8h, v6.8h, v1.h[0]\n"
- "fmla v18.8h, v6.8h, v2.h[0]\n"
- "fmla v22.8h, v6.8h, v3.h[0]\n"
- "fmla v26.8h, v6.8h, v4.h[0]\n"
- "fmla v30.8h, v6.8h, v5.h[0]\n"
- "fmla v11.8h, v7.8h, v0.h[0]\n"
- "fmla v15.8h, v7.8h, v1.h[0]\n"
- "fmla v19.8h, v7.8h, v2.h[0]\n"
- "fmla v23.8h, v7.8h, v3.h[0]\n"
- "fmla v27.8h, v7.8h, v4.h[0]\n"
- "fmla v31.8h, v7.8h, v5.h[0]\n"
- "cbnz x26, 274b\n"
+ "fmla v10.8h, v1.8h, v7.h[0]\n"
+ "fmla v14.8h, v1.8h, v6.h[0]\n"
+ "fmla v18.8h, v1.8h, v5.h[0]\n"
+ "fmla v22.8h, v1.8h, v4.h[0]\n"
+ "fmla v26.8h, v1.8h, v3.h[0]\n"
+ "fmla v30.8h, v1.8h, v2.h[0]\n"
+ "fmla v11.8h, v0.8h, v7.h[0]\n"
+ "fmla v15.8h, v0.8h, v6.h[0]\n"
+ "fmla v19.8h, v0.8h, v5.h[0]\n"
+ "fmla v23.8h, v0.8h, v4.h[0]\n"
+ "fmla v27.8h, v0.8h, v3.h[0]\n"
+ "fmla v31.8h, v0.8h, v2.h[0]\n"
+ "cbnz x27, 274b\n"
"275:" // Height 6: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 268b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #1\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #1\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19, LSL #1\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"tbz %x[flags], #1, 276f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.8h }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.8h }, [x19]\n"
- "fmin v8.8h, v8.8h, v0.8h\n"
- "fmin v9.8h, v9.8h, v0.8h\n"
- "fmin v10.8h, v10.8h, v0.8h\n"
- "fmin v11.8h, v11.8h, v0.8h\n"
- "fmax v8.8h, v8.8h, v1.8h\n"
- "fmax v9.8h, v9.8h, v1.8h\n"
- "fmax v10.8h, v10.8h, v1.8h\n"
- "fmax v11.8h, v11.8h, v1.8h\n"
- "fmin v12.8h, v12.8h, v0.8h\n"
- "fmin v13.8h, v13.8h, v0.8h\n"
- "fmin v14.8h, v14.8h, v0.8h\n"
- "fmax v12.8h, v12.8h, v1.8h\n"
- "fmax v13.8h, v13.8h, v1.8h\n"
- "fmax v14.8h, v14.8h, v1.8h\n"
- "fmin v15.8h, v15.8h, v0.8h\n"
- "fmin v16.8h, v16.8h, v0.8h\n"
- "fmin v17.8h, v17.8h, v0.8h\n"
- "fmax v15.8h, v15.8h, v1.8h\n"
- "fmax v16.8h, v16.8h, v1.8h\n"
- "fmax v17.8h, v17.8h, v1.8h\n"
- "fmin v18.8h, v18.8h, v0.8h\n"
- "fmin v19.8h, v19.8h, v0.8h\n"
- "fmin v20.8h, v20.8h, v0.8h\n"
- "fmax v18.8h, v18.8h, v1.8h\n"
- "fmax v19.8h, v19.8h, v1.8h\n"
- "fmax v20.8h, v20.8h, v1.8h\n"
- "fmin v21.8h, v21.8h, v0.8h\n"
- "fmin v22.8h, v22.8h, v0.8h\n"
- "fmin v23.8h, v23.8h, v0.8h\n"
- "fmax v21.8h, v21.8h, v1.8h\n"
- "fmax v22.8h, v22.8h, v1.8h\n"
- "fmax v23.8h, v23.8h, v1.8h\n"
- "fmin v24.8h, v24.8h, v0.8h\n"
- "fmin v25.8h, v25.8h, v0.8h\n"
- "fmin v26.8h, v26.8h, v0.8h\n"
- "fmax v24.8h, v24.8h, v1.8h\n"
- "fmax v25.8h, v25.8h, v1.8h\n"
- "fmax v26.8h, v26.8h, v1.8h\n"
- "fmin v27.8h, v27.8h, v0.8h\n"
- "fmin v28.8h, v28.8h, v0.8h\n"
- "fmin v29.8h, v29.8h, v0.8h\n"
- "fmax v27.8h, v27.8h, v1.8h\n"
- "fmax v28.8h, v28.8h, v1.8h\n"
- "fmax v29.8h, v29.8h, v1.8h\n"
- "fmin v30.8h, v30.8h, v0.8h\n"
- "fmin v31.8h, v31.8h, v0.8h\n"
- "fmax v30.8h, v30.8h, v1.8h\n"
- "fmax v31.8h, v31.8h, v1.8h\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.8h }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "fmin v8.8h, v8.8h, v1.8h\n"
+ "fmin v9.8h, v9.8h, v1.8h\n"
+ "fmin v10.8h, v10.8h, v1.8h\n"
+ "fmin v11.8h, v11.8h, v1.8h\n"
+ "fmin v12.8h, v12.8h, v1.8h\n"
+ "fmin v13.8h, v13.8h, v1.8h\n"
+ "fmin v14.8h, v14.8h, v1.8h\n"
+ "fmin v15.8h, v15.8h, v1.8h\n"
+ "fmin v16.8h, v16.8h, v1.8h\n"
+ "fmin v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v1.8h\n"
+ "fmin v19.8h, v19.8h, v1.8h\n"
+ "fmin v20.8h, v20.8h, v1.8h\n"
+ "fmin v21.8h, v21.8h, v1.8h\n"
+ "fmin v22.8h, v22.8h, v1.8h\n"
+ "fmin v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v1.8h\n"
+ "fmin v25.8h, v25.8h, v1.8h\n"
+ "fmin v26.8h, v26.8h, v1.8h\n"
+ "fmin v27.8h, v27.8h, v1.8h\n"
+ "fmin v28.8h, v28.8h, v1.8h\n"
+ "fmin v29.8h, v29.8h, v1.8h\n"
+ "fmin v30.8h, v30.8h, v1.8h\n"
+ "fmin v31.8h, v31.8h, v1.8h\n"
+ "fmax v8.8h, v8.8h, v0.8h\n"
+ "fmax v9.8h, v9.8h, v0.8h\n"
+ "fmax v10.8h, v10.8h, v0.8h\n"
+ "fmax v11.8h, v11.8h, v0.8h\n"
+ "fmax v12.8h, v12.8h, v0.8h\n"
+ "fmax v13.8h, v13.8h, v0.8h\n"
+ "fmax v14.8h, v14.8h, v0.8h\n"
+ "fmax v15.8h, v15.8h, v0.8h\n"
+ "fmax v16.8h, v16.8h, v0.8h\n"
+ "fmax v17.8h, v17.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v0.8h\n"
+ "fmax v19.8h, v19.8h, v0.8h\n"
+ "fmax v20.8h, v20.8h, v0.8h\n"
+ "fmax v21.8h, v21.8h, v0.8h\n"
+ "fmax v22.8h, v22.8h, v0.8h\n"
+ "fmax v23.8h, v23.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v0.8h\n"
+ "fmax v25.8h, v25.8h, v0.8h\n"
+ "fmax v26.8h, v26.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v0.8h\n"
+ "fmax v28.8h, v28.8h, v0.8h\n"
+ "fmax v29.8h, v29.8h, v0.8h\n"
+ "fmax v30.8h, v30.8h, v0.8h\n"
+ "fmax v31.8h, v31.8h, v0.8h\n"
"276:" // Height 6: No activation
"cmp x11, #0x20\n"
"bge 293f\n"
"tbz x11, #4, 284f\n"
- "st1 { v8.8h }, [x28], #0x10\n"
- "st1 { v9.8h }, [x28], #0x10\n"
- "st1 { v12.8h }, [x24], #0x10\n"
- "st1 { v13.8h }, [x24], #0x10\n"
- "st1 { v16.8h }, [x23], #0x10\n"
- "st1 { v17.8h }, [x23], #0x10\n"
- "st1 { v20.8h }, [x22], #0x10\n"
- "st1 { v21.8h }, [x22], #0x10\n"
- "st1 { v24.8h }, [x21], #0x10\n"
- "st1 { v25.8h }, [x21], #0x10\n"
- "st1 { v28.8h }, [x20], #0x10\n"
- "st1 { v29.8h }, [x20], #0x10\n"
+ "st1 { v8.8h }, [x9], #0x10\n"
+ "st1 { v9.8h }, [x9], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v25.8h }, [x22], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
+ "st1 { v29.8h }, [x21], #0x10\n"
"tbz x11, #3, 280f\n"
- "st1 { v10.8h }, [x28], #0x10\n"
- "st1 { v14.8h }, [x24], #0x10\n"
- "st1 { v18.8h }, [x23], #0x10\n"
- "st1 { v22.8h }, [x22], #0x10\n"
- "st1 { v26.8h }, [x21], #0x10\n"
- "st1 { v30.8h }, [x20], #0x10\n"
+ "st1 { v10.8h }, [x9], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "st1 { v26.8h }, [x22], #0x10\n"
+ "st1 { v30.8h }, [x21], #0x10\n"
"tbz x11, #2, 278f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
- "str d27, [x21], #0x8\n"
- "str d31, [x20], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x11, #1, 277f\n"
- "st1 { v11.s }[2], [x28], #0x4\n"
- "st1 { v15.s }[2], [x24], #0x4\n"
- "st1 { v19.s }[2], [x23], #0x4\n"
- "st1 { v23.s }[2], [x22], #0x4\n"
- "st1 { v27.s }[2], [x21], #0x4\n"
- "st1 { v31.s }[2], [x20], #0x4\n"
+ "st1 { v11.s }[2], [x9], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "st1 { v27.s }[2], [x22], #0x4\n"
+ "st1 { v31.s }[2], [x21], #0x4\n"
"tbz x11, #0, 292f\n"
- "st1 { v11.h }[6], [x28]\n"
- "st1 { v15.h }[6], [x24]\n"
- "st1 { v19.h }[6], [x23]\n"
- "st1 { v23.h }[6], [x22]\n"
- "st1 { v27.h }[6], [x21]\n"
- "st1 { v31.h }[6], [x20]\n"
+ "st1 { v11.h }[6], [x9]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "st1 { v27.h }[6], [x22]\n"
+ "st1 { v31.h }[6], [x21]\n"
"b 292f\n"
"277:" // Height 6: Partial direct writeback: partial_1_28
"tbz x11, #0, 292f\n"
- "st1 { v11.h }[4], [x28]\n"
- "st1 { v15.h }[4], [x24]\n"
- "st1 { v19.h }[4], [x23]\n"
- "st1 { v23.h }[4], [x22]\n"
- "st1 { v27.h }[4], [x21]\n"
- "st1 { v31.h }[4], [x20]\n"
+ "st1 { v11.h }[4], [x9]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "st1 { v27.h }[4], [x22]\n"
+ "st1 { v31.h }[4], [x21]\n"
"b 292f\n"
"278:" // Height 6: Partial direct writeback: partial_2_24
"tbz x11, #1, 279f\n"
- "str s11, [x28], #0x4\n"
- "str s15, [x24], #0x4\n"
- "str s19, [x23], #0x4\n"
- "str s23, [x22], #0x4\n"
- "str s27, [x21], #0x4\n"
- "str s31, [x20], #0x4\n"
+ "str s11, [x9], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "str s27, [x22], #0x4\n"
+ "str s31, [x21], #0x4\n"
"tbz x11, #0, 292f\n"
- "st1 { v11.h }[2], [x28]\n"
- "st1 { v15.h }[2], [x24]\n"
- "st1 { v19.h }[2], [x23]\n"
- "st1 { v23.h }[2], [x22]\n"
- "st1 { v27.h }[2], [x21]\n"
- "st1 { v31.h }[2], [x20]\n"
+ "st1 { v11.h }[2], [x9]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "st1 { v27.h }[2], [x22]\n"
+ "st1 { v31.h }[2], [x21]\n"
"b 292f\n"
"279:" // Height 6: Partial direct writeback: partial_1_24
"tbz x11, #0, 292f\n"
- "str h11, [x28, #0x0]\n"
- "str h15, [x24, #0x0]\n"
- "str h19, [x23, #0x0]\n"
- "str h23, [x22, #0x0]\n"
- "str h27, [x21, #0x0]\n"
- "str h31, [x20, #0x0]\n"
+ "str h11, [x9, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "str h27, [x22, #0x0]\n"
+ "str h31, [x21, #0x0]\n"
"b 292f\n"
"280:" // Height 6: Partial direct writeback: partial_4_16
"tbz x11, #2, 282f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
- "str d26, [x21], #0x8\n"
- "str d30, [x20], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
"tbz x11, #1, 281f\n"
- "st1 { v10.s }[2], [x28], #0x4\n"
- "st1 { v14.s }[2], [x24], #0x4\n"
- "st1 { v18.s }[2], [x23], #0x4\n"
- "st1 { v22.s }[2], [x22], #0x4\n"
- "st1 { v26.s }[2], [x21], #0x4\n"
- "st1 { v30.s }[2], [x20], #0x4\n"
+ "st1 { v10.s }[2], [x9], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "st1 { v26.s }[2], [x22], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
"tbz x11, #0, 292f\n"
- "st1 { v10.h }[6], [x28]\n"
- "st1 { v14.h }[6], [x24]\n"
- "st1 { v18.h }[6], [x23]\n"
- "st1 { v22.h }[6], [x22]\n"
- "st1 { v26.h }[6], [x21]\n"
- "st1 { v30.h }[6], [x20]\n"
+ "st1 { v10.h }[6], [x9]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "st1 { v26.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
"b 292f\n"
"281:" // Height 6: Partial direct writeback: partial_1_20
"tbz x11, #0, 292f\n"
- "st1 { v10.h }[4], [x28]\n"
- "st1 { v14.h }[4], [x24]\n"
- "st1 { v18.h }[4], [x23]\n"
- "st1 { v22.h }[4], [x22]\n"
- "st1 { v26.h }[4], [x21]\n"
- "st1 { v30.h }[4], [x20]\n"
+ "st1 { v10.h }[4], [x9]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "st1 { v26.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
"b 292f\n"
"282:" // Height 6: Partial direct writeback: partial_2_16
"tbz x11, #1, 283f\n"
- "str s10, [x28], #0x4\n"
- "str s14, [x24], #0x4\n"
- "str s18, [x23], #0x4\n"
- "str s22, [x22], #0x4\n"
- "str s26, [x21], #0x4\n"
- "str s30, [x20], #0x4\n"
+ "str s10, [x9], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "str s26, [x22], #0x4\n"
+ "str s30, [x21], #0x4\n"
"tbz x11, #0, 292f\n"
- "st1 { v10.h }[2], [x28]\n"
- "st1 { v14.h }[2], [x24]\n"
- "st1 { v18.h }[2], [x23]\n"
- "st1 { v22.h }[2], [x22]\n"
- "st1 { v26.h }[2], [x21]\n"
- "st1 { v30.h }[2], [x20]\n"
+ "st1 { v10.h }[2], [x9]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "st1 { v26.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
"b 292f\n"
"283:" // Height 6: Partial direct writeback: partial_1_16
"tbz x11, #0, 292f\n"
- "str h10, [x28, #0x0]\n"
- "str h14, [x24, #0x0]\n"
- "str h18, [x23, #0x0]\n"
- "str h22, [x22, #0x0]\n"
- "str h26, [x21, #0x0]\n"
- "str h30, [x20, #0x0]\n"
+ "str h10, [x9, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "str h26, [x22, #0x0]\n"
+ "str h30, [x21, #0x0]\n"
"b 292f\n"
"284:" // Height 6: Partial direct writeback: partial_8_0
"tbz x11, #3, 288f\n"
- "st1 { v8.8h }, [x28], #0x10\n"
- "st1 { v12.8h }, [x24], #0x10\n"
- "st1 { v16.8h }, [x23], #0x10\n"
- "st1 { v20.8h }, [x22], #0x10\n"
- "st1 { v24.8h }, [x21], #0x10\n"
- "st1 { v28.8h }, [x20], #0x10\n"
+ "st1 { v8.8h }, [x9], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
"tbz x11, #2, 286f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
- "str d25, [x21], #0x8\n"
- "str d29, [x20], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
"tbz x11, #1, 285f\n"
- "st1 { v9.s }[2], [x28], #0x4\n"
- "st1 { v13.s }[2], [x24], #0x4\n"
- "st1 { v17.s }[2], [x23], #0x4\n"
- "st1 { v21.s }[2], [x22], #0x4\n"
- "st1 { v25.s }[2], [x21], #0x4\n"
- "st1 { v29.s }[2], [x20], #0x4\n"
+ "st1 { v9.s }[2], [x9], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "st1 { v25.s }[2], [x22], #0x4\n"
+ "st1 { v29.s }[2], [x21], #0x4\n"
"tbz x11, #0, 292f\n"
- "st1 { v9.h }[6], [x28]\n"
- "st1 { v13.h }[6], [x24]\n"
- "st1 { v17.h }[6], [x23]\n"
- "st1 { v21.h }[6], [x22]\n"
- "st1 { v25.h }[6], [x21]\n"
- "st1 { v29.h }[6], [x20]\n"
+ "st1 { v9.h }[6], [x9]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "st1 { v29.h }[6], [x21]\n"
"b 292f\n"
"285:" // Height 6: Partial direct writeback: partial_1_12
"tbz x11, #0, 292f\n"
- "st1 { v9.h }[4], [x28]\n"
- "st1 { v13.h }[4], [x24]\n"
- "st1 { v17.h }[4], [x23]\n"
- "st1 { v21.h }[4], [x22]\n"
- "st1 { v25.h }[4], [x21]\n"
- "st1 { v29.h }[4], [x20]\n"
+ "st1 { v9.h }[4], [x9]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "st1 { v29.h }[4], [x21]\n"
"b 292f\n"
"286:" // Height 6: Partial direct writeback: partial_2_8
"tbz x11, #1, 287f\n"
- "str s9, [x28], #0x4\n"
- "str s13, [x24], #0x4\n"
- "str s17, [x23], #0x4\n"
- "str s21, [x22], #0x4\n"
- "str s25, [x21], #0x4\n"
- "str s29, [x20], #0x4\n"
+ "str s9, [x9], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "str s25, [x22], #0x4\n"
+ "str s29, [x21], #0x4\n"
"tbz x11, #0, 292f\n"
- "st1 { v9.h }[2], [x28]\n"
- "st1 { v13.h }[2], [x24]\n"
- "st1 { v17.h }[2], [x23]\n"
- "st1 { v21.h }[2], [x22]\n"
- "st1 { v25.h }[2], [x21]\n"
- "st1 { v29.h }[2], [x20]\n"
+ "st1 { v9.h }[2], [x9]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "st1 { v29.h }[2], [x21]\n"
"b 292f\n"
"287:" // Height 6: Partial direct writeback: partial_1_8
"tbz x11, #0, 292f\n"
- "str h9, [x28, #0x0]\n"
- "str h13, [x24, #0x0]\n"
- "str h17, [x23, #0x0]\n"
- "str h21, [x22, #0x0]\n"
- "str h25, [x21, #0x0]\n"
- "str h29, [x20, #0x0]\n"
+ "str h9, [x9, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "str h25, [x22, #0x0]\n"
+ "str h29, [x21, #0x0]\n"
"b 292f\n"
"288:" // Height 6: Partial direct writeback: partial_4_0
"tbz x11, #2, 290f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "str d28, [x20], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x11, #1, 289f\n"
- "st1 { v8.s }[2], [x28], #0x4\n"
- "st1 { v12.s }[2], [x24], #0x4\n"
- "st1 { v16.s }[2], [x23], #0x4\n"
- "st1 { v20.s }[2], [x22], #0x4\n"
- "st1 { v24.s }[2], [x21], #0x4\n"
- "st1 { v28.s }[2], [x20], #0x4\n"
+ "st1 { v8.s }[2], [x9], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
"tbz x11, #0, 292f\n"
- "st1 { v8.h }[6], [x28]\n"
- "st1 { v12.h }[6], [x24]\n"
- "st1 { v16.h }[6], [x23]\n"
- "st1 { v20.h }[6], [x22]\n"
- "st1 { v24.h }[6], [x21]\n"
- "st1 { v28.h }[6], [x20]\n"
+ "st1 { v8.h }[6], [x9]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "st1 { v24.h }[6], [x22]\n"
+ "st1 { v28.h }[6], [x21]\n"
"b 292f\n"
"289:" // Height 6: Partial direct writeback: partial_1_4
"tbz x11, #0, 292f\n"
- "st1 { v8.h }[4], [x28]\n"
- "st1 { v12.h }[4], [x24]\n"
- "st1 { v16.h }[4], [x23]\n"
- "st1 { v20.h }[4], [x22]\n"
- "st1 { v24.h }[4], [x21]\n"
- "st1 { v28.h }[4], [x20]\n"
+ "st1 { v8.h }[4], [x9]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "st1 { v24.h }[4], [x22]\n"
+ "st1 { v28.h }[4], [x21]\n"
"b 292f\n"
"290:" // Height 6: Partial direct writeback: partial_2_0
"tbz x11, #1, 291f\n"
- "str s8, [x28], #0x4\n"
- "str s12, [x24], #0x4\n"
- "str s16, [x23], #0x4\n"
- "str s20, [x22], #0x4\n"
- "str s24, [x21], #0x4\n"
- "str s28, [x20], #0x4\n"
+ "str s8, [x9], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
"tbz x11, #0, 292f\n"
- "st1 { v8.h }[2], [x28]\n"
- "st1 { v12.h }[2], [x24]\n"
- "st1 { v16.h }[2], [x23]\n"
- "st1 { v20.h }[2], [x22]\n"
- "st1 { v24.h }[2], [x21]\n"
- "st1 { v28.h }[2], [x20]\n"
+ "st1 { v8.h }[2], [x9]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "st1 { v24.h }[2], [x22]\n"
+ "st1 { v28.h }[2], [x21]\n"
"b 292f\n"
"291:" // Height 6: Partial direct writeback: partial_1_0
- "str h8, [x28, #0x0]\n"
- "str h12, [x24, #0x0]\n"
- "str h16, [x23, #0x0]\n"
- "str h20, [x22, #0x0]\n"
- "str h24, [x21, #0x0]\n"
- "str h28, [x20, #0x0]\n"
+ "str h8, [x9, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "str h24, [x22, #0x0]\n"
+ "str h28, [x21, #0x0]\n"
"292:" // Height 6: Partial direct writeback: Done
"b 294f\n"
"293:" // Height 6: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
- "str q24, [x21, #0x0]\n"
- "str q25, [x21, #0x10]\n"
- "str q26, [x21, #0x20]\n"
- "str q27, [x21, #0x30]\n"
- "str q28, [x20, #0x0]\n"
- "str q29, [x20, #0x10]\n"
- "str q30, [x20, #0x20]\n"
- "str q31, [x20, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
"294:" // Height 6: Writeback done
"subs x11, x11, #0x20\n"
"bgt 247b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 296f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 295f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"295:" // Update direct input
- "mov x19, #0xc\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0xc\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"296:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
new file mode 100644
index 0000000000..bce4de74f7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed_trB.hpp"
+#include "../performance_parameters.hpp"
+
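+// ARGLIST mirrors the kernel entry points' parameter list: num_strings,
+// string_lengths, the (possibly indirect) A operand, M, N, B_ptr, the
+// output descriptor, bias, activation and the accumulate flag, matching
+// the definitions in this kernel's .cpp files.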
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const float *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_fp32_mla_4x24( ARGLIST );
+void a64_hybrid_fp32_mla_4x24_a55( ARGLIST );
+
+class cls_a64_hybrid_fp32_mla_4x24
+{
+public:
+ typedef float lhs_operand_type;
+ typedef float rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return 24;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
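+    // 4x24 tile: each of the four output rows keeps 24 fp32 accumulators
+    // in six 128-bit NEON registers (v8-v13 for row 0 in the a55 variant
+    // below), which is why the writeback code advances the output pointer
+    // by 0x60 bytes per full tile.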
+ StdTransformsFixedTRB<rhs_operand_type, result_type, 4, 24, 1> transforms = {};
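+    // Per-CPU performance estimates used by the library's kernel-selection
+    // heuristics; { 1.0 } is the conservative fallback for non-float types.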
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 2.985 };
+ case CPUModel::A53:
+ return { 1.43 };
+ case CPUModel::A73:
+ return { 2.56 };
+ case CPUModel::A510:
+ return { 3.51 };
+ case CPUModel::V1:
+ return { 13.86 };
+ default:
+ return { 6.614 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_fp32_mla_4x24;
+ cls_a64_hybrid_fp32_mla_4x24(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A55r1:
+ case CPUModel::A53:
+ kernel=a64_hybrid_fp32_mla_4x24_a55;
+ break;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
new file mode 100644
index 0000000000..9ceda8fd0c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
@@ -0,0 +1,2805 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_4x24_a55 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
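+    // KernelArgs packs everything the inline assembly below reaches via
+    // offsetof(): the activation clamp bounds, string metadata, N, the
+    // B panel pointer and the input/output offsets.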
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
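+    // Flag bits tested by the assembly below: bit 0 = accumulate into the
+    // existing output, bit 1 = apply the min/max activation clamp,
+    // bit 2 = indirect output, bit 3 = indirect (string) input.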
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 124f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 83f\n"
+ "beq 42f\n"
+ "mov x17, %x[bias]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x17, 3f\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
+ "ldr q12, [x17, #0x40]\n"
+ "ldr q13, [x17, #0x50]\n"
+ "add x17, x17, #0x60\n"
+ "b 18f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "cmp x16, #0x18\n"
+ "bge 16f\n"
+ "tbz x16, #4, 7f\n"
+ "ld1 { v8.4s }, [x14], #0x10\n"
+ "ld1 { v9.4s }, [x14], #0x10\n"
+ "ld1 { v10.4s }, [x14], #0x10\n"
+ "ld1 { v11.4s }, [x14], #0x10\n"
+ "tbz x16, #2, 5f\n"
+ "ld1 { v12.4s }, [x14], #0x10\n"
+ "tbz x16, #1, 4f\n"
+ "ldr d13, [x14], #0x8\n"
+ "mov x20, #0x58\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v13.s }[2], [x14]\n"
+ "b 15f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x16, #0, 15f\n"
+ "ldr s13, [x14, #0x0]\n"
+ "b 15f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_16
+ "tbz x16, #1, 6f\n"
+ "ldr d12, [x14], #0x8\n"
+ "mov x20, #0x48\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v12.s }[2], [x14]\n"
+ "b 15f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x16, #0, 15f\n"
+ "ldr s12, [x14, #0x0]\n"
+ "b 15f\n"
+ "7:" // Height 1: Partial accumulate: partial_8_0
+ "tbz x16, #3, 11f\n"
+ "ld1 { v8.4s }, [x14], #0x10\n"
+ "ld1 { v9.4s }, [x14], #0x10\n"
+ "tbz x16, #2, 9f\n"
+ "ld1 { v10.4s }, [x14], #0x10\n"
+ "tbz x16, #1, 8f\n"
+ "ldr d11, [x14], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v11.s }[2], [x14]\n"
+ "b 15f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x16, #0, 15f\n"
+ "ldr s11, [x14, #0x0]\n"
+ "b 15f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x16, #1, 10f\n"
+ "ldr d10, [x14], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v10.s }[2], [x14]\n"
+ "b 15f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x16, #0, 15f\n"
+ "ldr s10, [x14, #0x0]\n"
+ "b 15f\n"
+ "11:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x16, #2, 13f\n"
+ "ld1 { v8.4s }, [x14], #0x10\n"
+ "tbz x16, #1, 12f\n"
+ "ldr d9, [x14], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v9.s }[2], [x14]\n"
+ "b 15f\n"
+ "12:" // Height 1: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x16, #0, 15f\n"
+ "ldr s9, [x14, #0x0]\n"
+ "b 15f\n"
+ "13:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x16, #1, 14f\n"
+ "ldr d8, [x14], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v8.s }[2], [x14]\n"
+ "b 15f\n"
+ "14:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s8, [x14, #0x0]\n"
+ "mov x20, #0x0\n"
+ "15:" // Height 1: Partial accumulate: Done
+ "sub x14, x14, x20\n"
+ "b 18f\n"
+ "16:" // Height 1: full accumulate
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "ldr q12, [x14, #0x40]\n"
+ "ldr q13, [x14, #0x50]\n"
+ "b 18f\n"
+ "17:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "18:" // Height 1: setup done
+ "mov x13, #0x0\n"
+ "19:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x11, [x20, #0x0]\n"
+ "cbnz x13, 21f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x11, x11, x20, LSL #2\n"
+ "b 21f\n"
+ "20:" // Height 1: setup direct input
+ "mov x11, %x[input_ptr]\n"
+ "21:" // Height 1: input setup done
+ "cmp x12, #0x4\n"
+ "blt 24f\n"
+ "ldr q0, [x11, #0x0]\n"
+ "cmp x12, #0x8\n"
+ "ldr q4, [x15, #0x0]\n"
+ "ldr q5, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "blt 23f\n"
+ "22:" // Height 1: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr d19, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr d18, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr d17, [x15, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr d16, [x15, #0x70]\n"
+ "mov v19.d[1], x20\n"
+ "ldr x20, [x15, #0x58]\n"
+ "mov v18.d[1], x20\n"
+ "ldr x20, [x15, #0x68]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x78]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v12.4s, v19.4s, v0.s[0]\n"
+ "ldr d19, [x15, #0x80]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v13.4s, v18.4s, v0.s[0]\n"
+ "ldr d18, [x15, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr d17, [x15, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr d16, [x15, #0xb0]\n"
+ "mov v19.d[1], x20\n"
+ "ldr x20, [x15, #0x98]\n"
+ "mov v18.d[1], x20\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.4s, v19.4s, v0.s[1]\n"
+ "ldr d19, [x15, #0xc0]\n"
+ "ldr x20, [x15, #0xc8]\n"
+ "fmla v11.4s, v18.4s, v0.s[1]\n"
+ "ldr d18, [x15, #0xd0]\n"
+ "fmla v12.4s, v17.4s, v0.s[1]\n"
+ "ldr d17, [x15, #0xe0]\n"
+ "fmla v13.4s, v16.4s, v0.s[1]\n"
+ "ldr d16, [x15, #0xf0]\n"
+ "mov v19.d[1], x20\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "mov v18.d[1], x20\n"
+ "ldr x20, [x15, #0xe8]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xf8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v19.4s, v0.s[2]\n"
+ "ldr d19, [x15, #0x100]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v9.4s, v18.4s, v0.s[2]\n"
+ "ldr d18, [x15, #0x110]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr d17, [x15, #0x120]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr d16, [x15, #0x130]\n"
+ "mov v19.d[1], x20\n"
+ "ldr x20, [x15, #0x118]\n"
+ "mov v18.d[1], x20\n"
+ "ldr x20, [x15, #0x128]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x138]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "ldr d19, [x15, #0x140]\n"
+ "ldr x20, [x15, #0x148]\n"
+ "fmla v13.4s, v18.4s, v0.s[2]\n"
+ "ldr d18, [x15, #0x150]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr d17, [x15, #0x160]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr d16, [x15, #0x170]\n"
+ "mov v19.d[1], x20\n"
+ "ldr x20, [x15, #0x158]\n"
+ "mov v18.d[1], x20\n"
+ "ldr x20, [x15, #0x168]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x178]\n"
+ "mov v16.d[1], x20\n"
+ "add x11, x11, #0x10\n"
+ "add x15, x15, #0x180\n"
+ "fmla v10.4s, v19.4s, v0.s[3]\n"
+ "ldr d4, [x15, #0x0]\n"
+ "ldr x20, [x15, #0x8]\n"
+ "fmla v11.4s, v18.4s, v0.s[3]\n"
+ "ldr d5, [x15, #0x10]\n"
+ "fmla v12.4s, v17.4s, v0.s[3]\n"
+ "ldr d6, [x15, #0x20]\n"
+ "fmla v13.4s, v16.4s, v0.s[3]\n"
+ "ldr d0, [x11, #0x0]\n"
+ "sub x12, x12, #0x4\n"
+ "ldr d7, [x15, #0x30]\n"
+ "cmp x12, #0x8\n"
+ "ldr x21, [x15, #0x18]\n"
+ "mov v4.d[1], x20\n"
+ "ldr x20, [x15, #0x28]\n"
+ "mov v5.d[1], x21\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x15, #0x38]\n"
+ "mov v0.d[1], x21\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "bge 22b\n"
+ "23:" // Height 1: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q19, [x15, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q18, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q17, [x15, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q16, [x15, #0x70]\n"
+ "fmla v12.4s, v19.4s, v0.s[0]\n"
+ "ldr q19, [x15, #0x80]\n"
+ "fmla v13.4s, v18.4s, v0.s[0]\n"
+ "ldr q18, [x15, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x15, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x15, #0xb0]\n"
+ "fmla v10.4s, v19.4s, v0.s[1]\n"
+ "ldr q19, [x15, #0xc0]\n"
+ "fmla v11.4s, v18.4s, v0.s[1]\n"
+ "ldr q18, [x15, #0xd0]\n"
+ "fmla v12.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x15, #0xe0]\n"
+ "fmla v13.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x15, #0xf0]\n"
+ "fmla v8.4s, v19.4s, v0.s[2]\n"
+ "ldr q19, [x15, #0x100]\n"
+ "fmla v9.4s, v18.4s, v0.s[2]\n"
+ "ldr q18, [x15, #0x110]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x15, #0x120]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x15, #0x130]\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "ldr q19, [x15, #0x140]\n"
+ "fmla v13.4s, v18.4s, v0.s[2]\n"
+ "ldr q18, [x15, #0x150]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x15, #0x160]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x15, #0x170]\n"
+ "add x11, x11, #0x10\n"
+ "sub x12, x12, #0x4\n"
+ "fmla v10.4s, v19.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v11.4s, v18.4s, v0.s[3]\n"
+ "add x15, x15, #0x180\n"
+ "fmla v12.4s, v17.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v0.s[3]\n"
+ "24:" // Height 1: Multiply loop: Main loop skip
+ "cbz x12, 26f\n"
+ "25:" // Height 1: Multiply loop: Odd block loop
+ "ldr s17, [x11], #0x4\n"
+ "sub x12, x12, #0x1\n"
+ "ldr q16, [x15, #0x0]\n"
+ "fmla v8.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x15, #0x10]\n"
+ "fmla v9.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x15, #0x20]\n"
+ "fmla v10.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x15, #0x30]\n"
+ "fmla v11.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x15, #0x40]\n"
+ "fmla v12.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x15, #0x50]\n"
+ "fmla v13.4s, v16.4s, v17.s[0]\n"
+ "add x15, x15, #0x60\n"
+ "cbnz x12, 25b\n"
+ "26:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x13, x13, #0x1\n"
+ "cmp x13, x20\n"
+ "bne 19b\n"
+ "prfm pstl1keep, [x14, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v16.4s\n"
+ "fmin v9.4s, v9.4s, v16.4s\n"
+ "fmin v10.4s, v10.4s, v16.4s\n"
+ "fmin v11.4s, v11.4s, v16.4s\n"
+ "fmin v12.4s, v12.4s, v16.4s\n"
+ "fmin v13.4s, v13.4s, v16.4s\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
+ "fmax v12.4s, v12.4s, v16.4s\n"
+ "fmax v13.4s, v13.4s, v16.4s\n"
+ "27:" // Height 1: No activation
+ "cmp x16, #0x18\n"
+ "bge 40f\n"
+ "tbz x16, #4, 31f\n"
+ "st1 { v8.4s }, [x14], #0x10\n"
+ "st1 { v9.4s }, [x14], #0x10\n"
+ "st1 { v10.4s }, [x14], #0x10\n"
+ "st1 { v11.4s }, [x14], #0x10\n"
+ "tbz x16, #2, 29f\n"
+ "st1 { v12.4s }, [x14], #0x10\n"
+ "tbz x16, #1, 28f\n"
+ "str d13, [x14], #0x8\n"
+ "tbz x16, #0, 39f\n"
+ "st1 { v13.s }[2], [x14]\n"
+ "b 39f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 39f\n"
+ "str s13, [x14, #0x0]\n"
+ "b 39f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 30f\n"
+ "str d12, [x14], #0x8\n"
+ "tbz x16, #0, 39f\n"
+ "st1 { v12.s }[2], [x14]\n"
+ "b 39f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 39f\n"
+ "str s12, [x14, #0x0]\n"
+ "b 39f\n"
+ "31:" // Height 1: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 35f\n"
+ "st1 { v8.4s }, [x14], #0x10\n"
+ "st1 { v9.4s }, [x14], #0x10\n"
+ "tbz x16, #2, 33f\n"
+ "st1 { v10.4s }, [x14], #0x10\n"
+ "tbz x16, #1, 32f\n"
+ "str d11, [x14], #0x8\n"
+ "tbz x16, #0, 39f\n"
+ "st1 { v11.s }[2], [x14]\n"
+ "b 39f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 39f\n"
+ "str s11, [x14, #0x0]\n"
+ "b 39f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 34f\n"
+ "str d10, [x14], #0x8\n"
+ "tbz x16, #0, 39f\n"
+ "st1 { v10.s }[2], [x14]\n"
+ "b 39f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 39f\n"
+ "str s10, [x14, #0x0]\n"
+ "b 39f\n"
+ "35:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 37f\n"
+ "st1 { v8.4s }, [x14], #0x10\n"
+ "tbz x16, #1, 36f\n"
+ "str d9, [x14], #0x8\n"
+ "tbz x16, #0, 39f\n"
+ "st1 { v9.s }[2], [x14]\n"
+ "b 39f\n"
+ "36:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 39f\n"
+ "str s9, [x14, #0x0]\n"
+ "b 39f\n"
+ "37:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 38f\n"
+ "str d8, [x14], #0x8\n"
+ "tbz x16, #0, 39f\n"
+ "st1 { v8.s }[2], [x14]\n"
+ "b 39f\n"
+ "38:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x14, #0x0]\n"
+ "39:" // Height 1: Partial direct writeback: Done
+ "b 41f\n"
+ "40:" // Height 1: Full writeback
+ "str q8, [x14, #0x0]\n"
+ "str q9, [x14, #0x10]\n"
+ "str q10, [x14, #0x20]\n"
+ "str q11, [x14, #0x30]\n"
+ "str q12, [x14, #0x40]\n"
+ "str q13, [x14, #0x50]\n"
+ "add x14, x14, #0x60\n"
+ "41:" // Height 1: Writeback done
+ "subs x16, x16, #0x18\n"
+ "bgt 2b\n"
+ "b 166f\n"
+ "42:" // Height 2
+ "mov x17, %x[bias]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[output_ptr]\n"
+ "43:" // Height 2: Column loop
+ "cbz x17, 44f\n"
+ "ldr q8, [x17, #0x0]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr q9, [x17, #0x10]\n"
+ "mov v15.16b, v9.16b\n"
+ "ldr q10, [x17, #0x20]\n"
+ "mov v16.16b, v10.16b\n"
+ "ldr q11, [x17, #0x30]\n"
+ "mov v17.16b, v11.16b\n"
+ "ldr q12, [x17, #0x40]\n"
+ "mov v18.16b, v12.16b\n"
+ "ldr q13, [x17, #0x50]\n"
+ "mov v19.16b, v13.16b\n"
+ "add x17, x17, #0x60\n"
+ "b 59f\n"
+ "44:" // Height 2: no bias
+ "tbz %x[flags], #0, 58f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x16, #0x18\n"
+ "add x23, x14, x20, LSL #2\n"
+ "bge 57f\n"
+ "tbz x16, #4, 48f\n"
+ "ld1 { v8.4s }, [x14], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x14], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x14], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v11.4s }, [x14], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 46f\n"
+ "ld1 { v12.4s }, [x14], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 45f\n"
+ "ldr d13, [x14], #0x8\n"
+ "mov x20, #0x58\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x16, #0, 56f\n"
+ "ld1 { v13.s }[2], [x14]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "b 56f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x16, #0, 56f\n"
+ "ldr s13, [x14, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "b 56f\n"
+ "46:" // Height 2: Partial accumulate: partial_2_16
+ "tbz x16, #1, 47f\n"
+ "ldr d12, [x14], #0x8\n"
+ "mov x20, #0x48\n"
+ "ldr d18, [x23], #0x8\n"
+ "tbz x16, #0, 56f\n"
+ "ld1 { v12.s }[2], [x14]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "b 56f\n"
+ "47:" // Height 2: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x16, #0, 56f\n"
+ "ldr s12, [x14, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "b 56f\n"
+ "48:" // Height 2: Partial accumulate: partial_8_0
+ "tbz x16, #3, 52f\n"
+ "ld1 { v8.4s }, [x14], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x14], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 50f\n"
+ "ld1 { v10.4s }, [x14], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 49f\n"
+ "ldr d11, [x14], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d17, [x23], #0x8\n"
+ "tbz x16, #0, 56f\n"
+ "ld1 { v11.s }[2], [x14]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "b 56f\n"
+ "49:" // Height 2: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x16, #0, 56f\n"
+ "ldr s11, [x14, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "b 56f\n"
+ "50:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x16, #1, 51f\n"
+ "ldr d10, [x14], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d16, [x23], #0x8\n"
+ "tbz x16, #0, 56f\n"
+ "ld1 { v10.s }[2], [x14]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "b 56f\n"
+ "51:" // Height 2: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x16, #0, 56f\n"
+ "ldr s10, [x14, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "b 56f\n"
+ "52:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x16, #2, 54f\n"
+ "ld1 { v8.4s }, [x14], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 53f\n"
+ "ldr d9, [x14], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d15, [x23], #0x8\n"
+ "tbz x16, #0, 56f\n"
+ "ld1 { v9.s }[2], [x14]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "b 56f\n"
+ "53:" // Height 2: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x16, #0, 56f\n"
+ "ldr s9, [x14, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "b 56f\n"
+ "54:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x16, #1, 55f\n"
+ "ldr d8, [x14], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "tbz x16, #0, 56f\n"
+ "ld1 { v8.s }[2], [x14]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "b 56f\n"
+ "55:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s8, [x14, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s14, [x23, #0x0]\n"
+ "56:" // Height 2: Partial accumulate: Done
+ "sub x14, x14, x20\n"
+ "b 59f\n"
+ "57:" // Height 2: full accumulate
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "ldr q12, [x14, #0x40]\n"
+ "ldr q13, [x14, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "b 59f\n"
+ "58:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "59:" // Height 2: setup done
+ "mov x13, #0x0\n"
+ "60:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x11, [x20, #0x0]\n"
+ "ldr x10, [x20, #0x8]\n"
+ "cbnz x13, 62f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x11, x11, x20, LSL #2\n"
+ "add x10, x10, x20, LSL #2\n"
+ "b 62f\n"
+ "61:" // Height 2: setup direct input
+ "mov x11, %x[input_ptr]\n"
+ "add x10, x11, x21, LSL #2\n"
+ "62:" // Height 2: input setup done
+ "cmp x12, #0x4\n"
+ "blt 65f\n"
+ "ldr q0, [x11, #0x0]\n"
+ "cmp x12, #0x8\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q4, [x15, #0x0]\n"
+ "ldr q5, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "blt 64f\n"
+ "63:" // Height 2: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr d23, [x15, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr x22, [x15, #0x58]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "ldr d22, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "ldr d21, [x15, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ldr d20, [x15, #0x70]\n"
+ "mov v23.d[1], x23\n"
+ "fmla v12.4s, v23.4s, v0.s[0]\n"
+ "mov v22.d[1], x22\n"
+ "fmla v18.4s, v23.4s, v1.s[0]\n"
+ "ldr d23, [x15, #0x80]\n"
+ "mov v21.d[1], x21\n"
+ "mov v20.d[1], x20\n"
+ "ldr x23, [x15, #0x88]\n"
+ "fmla v13.4s, v22.4s, v0.s[0]\n"
+ "ldr x22, [x15, #0x98]\n"
+ "fmla v19.4s, v22.4s, v1.s[0]\n"
+ "ldr d22, [x15, #0x90]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "ldr x21, [x15, #0xa8]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "ldr d21, [x15, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "ldr d20, [x15, #0xb0]\n"
+ "mov v23.d[1], x23\n"
+ "fmla v10.4s, v23.4s, v0.s[1]\n"
+ "mov v22.d[1], x22\n"
+ "fmla v16.4s, v23.4s, v1.s[1]\n"
+ "ldr d23, [x15, #0xc0]\n"
+ "mov v21.d[1], x21\n"
+ "mov v20.d[1], x20\n"
+ "ldr x23, [x15, #0xc8]\n"
+ "fmla v11.4s, v22.4s, v0.s[1]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v17.4s, v22.4s, v1.s[1]\n"
+ "ldr d22, [x15, #0xd0]\n"
+ "fmla v12.4s, v21.4s, v0.s[1]\n"
+ "ldr x21, [x15, #0xe8]\n"
+ "fmla v18.4s, v21.4s, v1.s[1]\n"
+ "ldr d21, [x15, #0xe0]\n"
+ "fmla v13.4s, v20.4s, v0.s[1]\n"
+ "ldr x20, [x15, #0xf8]\n"
+ "fmla v19.4s, v20.4s, v1.s[1]\n"
+ "ldr d20, [x15, #0xf0]\n"
+ "mov v23.d[1], x23\n"
+ "fmla v8.4s, v23.4s, v0.s[2]\n"
+ "mov v22.d[1], x22\n"
+ "fmla v14.4s, v23.4s, v1.s[2]\n"
+ "ldr d23, [x15, #0x100]\n"
+ "mov v21.d[1], x21\n"
+ "mov v20.d[1], x20\n"
+ "ldr x23, [x15, #0x108]\n"
+ "fmla v9.4s, v22.4s, v0.s[2]\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v15.4s, v22.4s, v1.s[2]\n"
+ "ldr d22, [x15, #0x110]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "ldr x21, [x15, #0x128]\n"
+ "fmla v16.4s, v21.4s, v1.s[2]\n"
+ "ldr d21, [x15, #0x120]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "ldr x20, [x15, #0x138]\n"
+ "fmla v17.4s, v20.4s, v1.s[2]\n"
+ "ldr d20, [x15, #0x130]\n"
+ "mov v23.d[1], x23\n"
+ "fmla v12.4s, v23.4s, v0.s[2]\n"
+ "mov v22.d[1], x22\n"
+ "fmla v18.4s, v23.4s, v1.s[2]\n"
+ "ldr d23, [x15, #0x140]\n"
+ "mov v21.d[1], x21\n"
+ "mov v20.d[1], x20\n"
+ "ldr x23, [x15, #0x148]\n"
+ "fmla v13.4s, v22.4s, v0.s[2]\n"
+ "ldr x22, [x15, #0x158]\n"
+ "fmla v19.4s, v22.4s, v1.s[2]\n"
+ "ldr d22, [x15, #0x150]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "ldr x21, [x15, #0x168]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "ldr d21, [x15, #0x160]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "ldr x20, [x15, #0x178]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "ldr d20, [x15, #0x170]\n"
+ "mov v23.d[1], x23\n"
+ "add x11, x11, #0x10\n"
+ "mov v22.d[1], x22\n"
+ "add x10, x10, #0x10\n"
+ "mov v21.d[1], x21\n"
+ "add x15, x15, #0x180\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.4s, v23.4s, v0.s[3]\n"
+ "fmla v16.4s, v23.4s, v1.s[3]\n"
+ "ldr d4, [x15, #0x0]\n"
+ "ldr x21, [x15, #0x8]\n"
+ "fmla v11.4s, v22.4s, v0.s[3]\n"
+ "fmla v17.4s, v22.4s, v1.s[3]\n"
+ "ldr d5, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x18]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
+ "fmla v18.4s, v21.4s, v1.s[3]\n"
+ "ldr d6, [x15, #0x20]\n"
+ "ldr x23, [x15, #0x28]\n"
+ "fmla v13.4s, v20.4s, v0.s[3]\n"
+ "ldr d0, [x11, #0x0]\n"
+ "fmla v19.4s, v20.4s, v1.s[3]\n"
+ "ldr d1, [x10, #0x0]\n"
+ "sub x12, x12, #0x4\n"
+ "ldr d7, [x15, #0x30]\n"
+ "cmp x12, #0x8\n"
+ "ldr x22, [x11, #0x8]\n"
+ "mov v4.d[1], x21\n"
+ "ldr x21, [x10, #0x8]\n"
+ "mov v5.d[1], x20\n"
+ "ldr x20, [x15, #0x38]\n"
+ "mov v6.d[1], x23\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v0.d[1], x22\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "mov v1.d[1], x21\n"
+ "mov v7.d[1], x20\n"
+ "bge 63b\n"
+ "64:" // Height 2: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q23, [x15, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "ldr q22, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "sub x12, x12, #0x4\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "ldr q21, [x15, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ldr q20, [x15, #0x70]\n"
+ "fmla v12.4s, v23.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v18.4s, v23.4s, v1.s[0]\n"
+ "ldr q23, [x15, #0x80]\n"
+ "fmla v13.4s, v22.4s, v0.s[0]\n"
+ "fmla v19.4s, v22.4s, v1.s[0]\n"
+ "ldr q22, [x15, #0x90]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "ldr q21, [x15, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "ldr q20, [x15, #0xb0]\n"
+ "fmla v10.4s, v23.4s, v0.s[1]\n"
+ "fmla v16.4s, v23.4s, v1.s[1]\n"
+ "ldr q23, [x15, #0xc0]\n"
+ "fmla v11.4s, v22.4s, v0.s[1]\n"
+ "fmla v17.4s, v22.4s, v1.s[1]\n"
+ "ldr q22, [x15, #0xd0]\n"
+ "fmla v12.4s, v21.4s, v0.s[1]\n"
+ "fmla v18.4s, v21.4s, v1.s[1]\n"
+ "ldr q21, [x15, #0xe0]\n"
+ "fmla v13.4s, v20.4s, v0.s[1]\n"
+ "fmla v19.4s, v20.4s, v1.s[1]\n"
+ "ldr q20, [x15, #0xf0]\n"
+ "fmla v8.4s, v23.4s, v0.s[2]\n"
+ "fmla v14.4s, v23.4s, v1.s[2]\n"
+ "ldr q23, [x15, #0x100]\n"
+ "fmla v9.4s, v22.4s, v0.s[2]\n"
+ "fmla v15.4s, v22.4s, v1.s[2]\n"
+ "ldr q22, [x15, #0x110]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v16.4s, v21.4s, v1.s[2]\n"
+ "ldr q21, [x15, #0x120]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "fmla v17.4s, v20.4s, v1.s[2]\n"
+ "ldr q20, [x15, #0x130]\n"
+ "fmla v12.4s, v23.4s, v0.s[2]\n"
+ "fmla v18.4s, v23.4s, v1.s[2]\n"
+ "ldr q23, [x15, #0x140]\n"
+ "fmla v13.4s, v22.4s, v0.s[2]\n"
+ "fmla v19.4s, v22.4s, v1.s[2]\n"
+ "ldr q22, [x15, #0x150]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "ldr q21, [x15, #0x160]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "ldr q20, [x15, #0x170]\n"
+ "fmla v10.4s, v23.4s, v0.s[3]\n"
+ "add x15, x15, #0x180\n"
+ "fmla v16.4s, v23.4s, v1.s[3]\n"
+ "fmla v11.4s, v22.4s, v0.s[3]\n"
+ "fmla v17.4s, v22.4s, v1.s[3]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
+ "fmla v18.4s, v21.4s, v1.s[3]\n"
+ "fmla v13.4s, v20.4s, v0.s[3]\n"
+ "fmla v19.4s, v20.4s, v1.s[3]\n"
+ "65:" // Height 2: Multiply loop: Main loop skip
+ "cbz x12, 67f\n"
+ "66:" // Height 2: Multiply loop: Odd block loop
+ "ldr s25, [x11], #0x4\n"
+ "sub x12, x12, #0x1\n"
+ "ldr s24, [x10], #0x4\n"
+ "ldr q21, [x15, #0x0]\n"
+ "fmla v8.4s, v21.4s, v25.s[0]\n"
+ "ldr q20, [x15, #0x10]\n"
+ "fmla v14.4s, v21.4s, v24.s[0]\n"
+ "ldr q23, [x15, #0x20]\n"
+ "fmla v9.4s, v20.4s, v25.s[0]\n"
+ "ldr q22, [x15, #0x30]\n"
+ "fmla v15.4s, v20.4s, v24.s[0]\n"
+ "ldr q21, [x15, #0x40]\n"
+ "fmla v10.4s, v23.4s, v25.s[0]\n"
+ "ldr q20, [x15, #0x50]\n"
+ "fmla v16.4s, v23.4s, v24.s[0]\n"
+ "fmla v11.4s, v22.4s, v25.s[0]\n"
+ "add x15, x15, #0x60\n"
+ "fmla v17.4s, v22.4s, v24.s[0]\n"
+ "fmla v12.4s, v21.4s, v25.s[0]\n"
+ "fmla v18.4s, v21.4s, v24.s[0]\n"
+ "fmla v13.4s, v20.4s, v25.s[0]\n"
+ "fmla v19.4s, v20.4s, v24.s[0]\n"
+ "cbnz x12, 66b\n"
+ "67:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x13, x13, #0x1\n"
+ "cmp x13, x20\n"
+ "bne 60b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x14, x20, LSL #2\n"
+ "prfm pstl1keep, [x14, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 68f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v20.4s\n"
+ "fmin v9.4s, v9.4s, v20.4s\n"
+ "fmin v10.4s, v10.4s, v20.4s\n"
+ "fmin v11.4s, v11.4s, v20.4s\n"
+ "fmin v12.4s, v12.4s, v20.4s\n"
+ "fmin v13.4s, v13.4s, v20.4s\n"
+ "fmin v14.4s, v14.4s, v20.4s\n"
+ "fmin v15.4s, v15.4s, v20.4s\n"
+ "fmin v16.4s, v16.4s, v20.4s\n"
+ "fmin v17.4s, v17.4s, v20.4s\n"
+ "fmin v18.4s, v18.4s, v20.4s\n"
+ "fmin v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v20.4s\n"
+ "fmax v9.4s, v9.4s, v20.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "fmax v13.4s, v13.4s, v20.4s\n"
+ "fmax v14.4s, v14.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v20.4s\n"
+ "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v18.4s, v18.4s, v20.4s\n"
+ "fmax v19.4s, v19.4s, v20.4s\n"
+ "68:" // Height 2: No activation
+ "cmp x16, #0x18\n"
+ "bge 81f\n"
+ "tbz x16, #4, 72f\n"
+ "st1 { v8.4s }, [x14], #0x10\n"
+ "st1 { v9.4s }, [x14], #0x10\n"
+ "st1 { v10.4s }, [x14], #0x10\n"
+ "st1 { v11.4s }, [x14], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 70f\n"
+ "st1 { v12.4s }, [x14], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 69f\n"
+ "str d13, [x14], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "st1 { v13.s }[2], [x14]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "b 80f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 80f\n"
+ "str s13, [x14, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "b 80f\n"
+ "70:" // Height 2: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 71f\n"
+ "str d12, [x14], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "st1 { v12.s }[2], [x14]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "b 80f\n"
+ "71:" // Height 2: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 80f\n"
+ "str s12, [x14, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "b 80f\n"
+ "72:" // Height 2: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 76f\n"
+ "st1 { v8.4s }, [x14], #0x10\n"
+ "st1 { v9.4s }, [x14], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 74f\n"
+ "st1 { v10.4s }, [x14], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 73f\n"
+ "str d11, [x14], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "st1 { v11.s }[2], [x14]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "b 80f\n"
+ "73:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 80f\n"
+ "str s11, [x14, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "b 80f\n"
+ "74:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 75f\n"
+ "str d10, [x14], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "st1 { v10.s }[2], [x14]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "b 80f\n"
+ "75:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 80f\n"
+ "str s10, [x14, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "b 80f\n"
+ "76:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 78f\n"
+ "st1 { v8.4s }, [x14], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 77f\n"
+ "str d9, [x14], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "st1 { v9.s }[2], [x14]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "b 80f\n"
+ "77:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 80f\n"
+ "str s9, [x14, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "b 80f\n"
+ "78:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 79f\n"
+ "str d8, [x14], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "st1 { v8.s }[2], [x14]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "b 80f\n"
+ "79:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x14, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "80:" // Height 2: Partial direct writeback: Done
+ "b 82f\n"
+ "81:" // Height 2: Full writeback
+ "str q8, [x14, #0x0]\n"
+ "str q9, [x14, #0x10]\n"
+ "str q10, [x14, #0x20]\n"
+ "str q11, [x14, #0x30]\n"
+ "str q12, [x14, #0x40]\n"
+ "str q13, [x14, #0x50]\n"
+ "add x14, x14, #0x60\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "82:" // Height 2: Writeback done
+ "subs x16, x16, #0x18\n"
+ "bgt 43b\n"
+ "b 166f\n"
+ "83:" // Height 3
+ "mov x17, %x[bias]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[output_ptr]\n"
+ "84:" // Height 3: Column loop
+ "cbz x17, 85f\n"
+ "ldr q8, [x17, #0x0]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr q9, [x17, #0x10]\n"
+ "mov v15.16b, v9.16b\n"
+ "ldr q10, [x17, #0x20]\n"
+ "mov v16.16b, v10.16b\n"
+ "ldr q11, [x17, #0x30]\n"
+ "mov v17.16b, v11.16b\n"
+ "ldr q12, [x17, #0x40]\n"
+ "mov v18.16b, v12.16b\n"
+ "ldr q13, [x17, #0x50]\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "add x17, x17, #0x60\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "b 100f\n"
+ "85:" // Height 3: no bias
+ "tbz %x[flags], #0, 99f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x14, x20, LSL #2\n"
+ "cmp x16, #0x18\n"
+ "add x22, x23, x20, LSL #2\n"
+ "bge 98f\n"
+ "tbz x16, #4, 89f\n"
+ "ld1 { v8.4s }, [x14], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x14], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x14], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v11.4s }, [x14], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "tbz x16, #2, 87f\n"
+ "ld1 { v12.4s }, [x14], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "tbz x16, #1, 86f\n"
+ "ldr d13, [x14], #0x8\n"
+ "mov x20, #0x58\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "tbz x16, #0, 97f\n"
+ "ld1 { v13.s }[2], [x14]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "b 97f\n"
+ "86:" // Height 3: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x16, #0, 97f\n"
+ "ldr s13, [x14, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "b 97f\n"
+ "87:" // Height 3: Partial accumulate: partial_2_16
+ "tbz x16, #1, 88f\n"
+ "ldr d12, [x14], #0x8\n"
+ "mov x20, #0x48\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "tbz x16, #0, 97f\n"
+ "ld1 { v12.s }[2], [x14]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "b 97f\n"
+ "88:" // Height 3: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x16, #0, 97f\n"
+ "ldr s12, [x14, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "b 97f\n"
+ "89:" // Height 3: Partial accumulate: partial_8_0
+ "tbz x16, #3, 93f\n"
+ "ld1 { v8.4s }, [x14], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x14], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x16, #2, 91f\n"
+ "ld1 { v10.4s }, [x14], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x16, #1, 90f\n"
+ "ldr d11, [x14], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x16, #0, 97f\n"
+ "ld1 { v11.s }[2], [x14]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "b 97f\n"
+ "90:" // Height 3: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x16, #0, 97f\n"
+ "ldr s11, [x14, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "b 97f\n"
+ "91:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x16, #1, 92f\n"
+ "ldr d10, [x14], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x16, #0, 97f\n"
+ "ld1 { v10.s }[2], [x14]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "b 97f\n"
+ "92:" // Height 3: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x16, #0, 97f\n"
+ "ldr s10, [x14, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "b 97f\n"
+ "93:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x16, #2, 95f\n"
+ "ld1 { v8.4s }, [x14], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "tbz x16, #1, 94f\n"
+ "ldr d9, [x14], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x16, #0, 97f\n"
+ "ld1 { v9.s }[2], [x14]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "b 97f\n"
+ "94:" // Height 3: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x16, #0, 97f\n"
+ "ldr s9, [x14, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "b 97f\n"
+ "95:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x16, #1, 96f\n"
+ "ldr d8, [x14], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x16, #0, 97f\n"
+ "ld1 { v8.s }[2], [x14]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "b 97f\n"
+ "96:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s8, [x14, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "97:" // Height 3: Partial accumulate: Done
+ "sub x14, x14, x20\n"
+ "b 100f\n"
+ "98:" // Height 3: full accumulate
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "ldr q12, [x14, #0x40]\n"
+ "ldr q13, [x14, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x22, #0x40]\n"
+ "ldr q25, [x22, #0x50]\n"
+ "b 100f\n"
+ "99:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "100:" // Height 3: setup done
+ "mov x13, #0x0\n"
+ "101:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 102f\n"
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x11, [x20, #0x0]\n"
+ "ldr x10, [x20, #0x8]\n"
+ "ldr x9, [x20, #0x10]\n"
+ "cbnz x13, 103f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x11, x11, x20, LSL #2\n"
+ "add x10, x10, x20, LSL #2\n"
+ "add x9, x9, x20, LSL #2\n"
+ "b 103f\n"
+ "102:" // Height 3: setup direct input
+ "mov x11, %x[input_ptr]\n"
+ "add x10, x11, x21, LSL #2\n"
+ "add x9, x10, x21, LSL #2\n"
+ "103:" // Height 3: input setup done
+ "cmp x12, #0x4\n"
+ "blt 106f\n"
+ "ldr q0, [x11, #0x0]\n"
+ "cmp x12, #0x8\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q2, [x9, #0x0]\n"
+ "ldr q4, [x15, #0x0]\n"
+ "ldr q5, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "blt 105f\n"
+ "104:" // Height 3: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr x22, [x15, #0x58]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr d29, [x15, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "ldr d28, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "mov v29.d[1], x23\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "mov v28.d[1], x22\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "ldr d27, [x15, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "mov v27.d[1], x21\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ldr x23, [x15, #0x88]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "ldr d26, [x15, #0x70]\n"
+ "mov v26.d[1], x20\n"
+ "fmla v12.4s, v29.4s, v0.s[0]\n"
+ "fmla v18.4s, v29.4s, v1.s[0]\n"
+ "ldr x22, [x15, #0x98]\n"
+ "fmla v24.4s, v29.4s, v2.s[0]\n"
+ "ldr d29, [x15, #0x80]\n"
+ "fmla v13.4s, v28.4s, v0.s[0]\n"
+ "ldr x21, [x15, #0xa8]\n"
+ "fmla v19.4s, v28.4s, v1.s[0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v25.4s, v28.4s, v2.s[0]\n"
+ "ldr d28, [x15, #0x90]\n"
+ "fmla v8.4s, v27.4s, v0.s[1]\n"
+ "mov v29.d[1], x23\n"
+ "fmla v14.4s, v27.4s, v1.s[1]\n"
+ "mov v28.d[1], x22\n"
+ "fmla v20.4s, v27.4s, v2.s[1]\n"
+ "ldr d27, [x15, #0xa0]\n"
+ "fmla v9.4s, v26.4s, v0.s[1]\n"
+ "mov v27.d[1], x21\n"
+ "fmla v15.4s, v26.4s, v1.s[1]\n"
+ "ldr x23, [x15, #0xc8]\n"
+ "fmla v21.4s, v26.4s, v2.s[1]\n"
+ "ldr d26, [x15, #0xb0]\n"
+ "mov v26.d[1], x20\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v16.4s, v29.4s, v1.s[1]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v22.4s, v29.4s, v2.s[1]\n"
+ "ldr d29, [x15, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "ldr x21, [x15, #0xe8]\n"
+ "fmla v17.4s, v28.4s, v1.s[1]\n"
+ "ldr x20, [x15, #0xf8]\n"
+ "fmla v23.4s, v28.4s, v2.s[1]\n"
+ "ldr d28, [x15, #0xd0]\n"
+ "fmla v12.4s, v27.4s, v0.s[1]\n"
+ "mov v29.d[1], x23\n"
+ "fmla v18.4s, v27.4s, v1.s[1]\n"
+ "mov v28.d[1], x22\n"
+ "fmla v24.4s, v27.4s, v2.s[1]\n"
+ "ldr d27, [x15, #0xe0]\n"
+ "fmla v13.4s, v26.4s, v0.s[1]\n"
+ "mov v27.d[1], x21\n"
+ "fmla v19.4s, v26.4s, v1.s[1]\n"
+ "ldr x23, [x15, #0x108]\n"
+ "fmla v25.4s, v26.4s, v2.s[1]\n"
+ "ldr d26, [x15, #0xf0]\n"
+ "mov v26.d[1], x20\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v20.4s, v29.4s, v2.s[2]\n"
+ "ldr d29, [x15, #0x100]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "ldr x21, [x15, #0x128]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "ldr x20, [x15, #0x138]\n"
+ "fmla v21.4s, v28.4s, v2.s[2]\n"
+ "ldr d28, [x15, #0x110]\n"
+ "fmla v10.4s, v27.4s, v0.s[2]\n"
+ "mov v29.d[1], x23\n"
+ "fmla v16.4s, v27.4s, v1.s[2]\n"
+ "mov v28.d[1], x22\n"
+ "fmla v22.4s, v27.4s, v2.s[2]\n"
+ "ldr d27, [x15, #0x120]\n"
+ "fmla v11.4s, v26.4s, v0.s[2]\n"
+ "mov v27.d[1], x21\n"
+ "fmla v17.4s, v26.4s, v1.s[2]\n"
+ "ldr x23, [x15, #0x148]\n"
+ "fmla v23.4s, v26.4s, v2.s[2]\n"
+ "ldr d26, [x15, #0x130]\n"
+ "mov v26.d[1], x20\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v18.4s, v29.4s, v1.s[2]\n"
+ "ldr x22, [x15, #0x158]\n"
+ "fmla v24.4s, v29.4s, v2.s[2]\n"
+ "ldr d29, [x15, #0x140]\n"
+ "fmla v13.4s, v28.4s, v0.s[2]\n"
+ "ldr x21, [x15, #0x168]\n"
+ "fmla v19.4s, v28.4s, v1.s[2]\n"
+ "ldr x20, [x15, #0x178]\n"
+ "fmla v25.4s, v28.4s, v2.s[2]\n"
+ "ldr d28, [x15, #0x150]\n"
+ "fmla v8.4s, v27.4s, v0.s[3]\n"
+ "mov v29.d[1], x23\n"
+ "fmla v14.4s, v27.4s, v1.s[3]\n"
+ "mov v28.d[1], x22\n"
+ "fmla v20.4s, v27.4s, v2.s[3]\n"
+ "ldr d27, [x15, #0x160]\n"
+ "fmla v9.4s, v26.4s, v0.s[3]\n"
+ "mov v27.d[1], x21\n"
+ "fmla v15.4s, v26.4s, v1.s[3]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v21.4s, v26.4s, v2.s[3]\n"
+ "ldr d26, [x15, #0x170]\n"
+ "mov v26.d[1], x20\n"
+ "add x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x15, x15, #0x180\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "ldr x26, [x15, #0x8]\n"
+ "fmla v16.4s, v29.4s, v1.s[3]\n"
+ "ldr x25, [x15, #0x18]\n"
+ "fmla v22.4s, v29.4s, v2.s[3]\n"
+ "ldr d4, [x15, #0x0]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "ldr x24, [x15, #0x28]\n"
+ "fmla v17.4s, v28.4s, v1.s[3]\n"
+ "ldr x23, [x11, #0x8]\n"
+ "fmla v23.4s, v28.4s, v2.s[3]\n"
+ "ldr d5, [x15, #0x10]\n"
+ "fmla v12.4s, v27.4s, v0.s[3]\n"
+ "ldr x22, [x10, #0x8]\n"
+ "fmla v18.4s, v27.4s, v1.s[3]\n"
+ "ldr x21, [x9, #0x8]\n"
+ "fmla v24.4s, v27.4s, v2.s[3]\n"
+ "ldr d6, [x15, #0x20]\n"
+ "fmla v13.4s, v26.4s, v0.s[3]\n"
+ "ldr d0, [x11, #0x0]\n"
+ "fmla v19.4s, v26.4s, v1.s[3]\n"
+ "ldr d1, [x10, #0x0]\n"
+ "fmla v25.4s, v26.4s, v2.s[3]\n"
+ "ldr d2, [x9, #0x0]\n"
+ "ldr d7, [x15, #0x30]\n"
+ "sub x12, x12, #0x4\n"
+ "ldr x20, [x15, #0x38]\n"
+ "cmp x12, #0x8\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v4.d[1], x26\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "mov v5.d[1], x25\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "mov v6.d[1], x24\n"
+ "mov v0.d[1], x23\n"
+ "mov v1.d[1], x22\n"
+ "mov v2.d[1], x21\n"
+ "mov v7.d[1], x20\n"
+ "bge 104b\n"
+ "105:" // Height 3: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr q29, [x15, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "sub x12, x12, #0x4\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "ldr q28, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "ldr q27, [x15, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "ldr q26, [x15, #0x70]\n"
+ "fmla v12.4s, v29.4s, v0.s[0]\n"
+ "fmla v18.4s, v29.4s, v1.s[0]\n"
+ "fmla v24.4s, v29.4s, v2.s[0]\n"
+ "ldr q29, [x15, #0x80]\n"
+ "fmla v13.4s, v28.4s, v0.s[0]\n"
+ "fmla v19.4s, v28.4s, v1.s[0]\n"
+ "fmla v25.4s, v28.4s, v2.s[0]\n"
+ "ldr q28, [x15, #0x90]\n"
+ "fmla v8.4s, v27.4s, v0.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[1]\n"
+ "fmla v20.4s, v27.4s, v2.s[1]\n"
+ "ldr q27, [x15, #0xa0]\n"
+ "fmla v9.4s, v26.4s, v0.s[1]\n"
+ "fmla v15.4s, v26.4s, v1.s[1]\n"
+ "fmla v21.4s, v26.4s, v2.s[1]\n"
+ "ldr q26, [x15, #0xb0]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v16.4s, v29.4s, v1.s[1]\n"
+ "fmla v22.4s, v29.4s, v2.s[1]\n"
+ "ldr q29, [x15, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v17.4s, v28.4s, v1.s[1]\n"
+ "fmla v23.4s, v28.4s, v2.s[1]\n"
+ "ldr q28, [x15, #0xd0]\n"
+ "fmla v12.4s, v27.4s, v0.s[1]\n"
+ "fmla v18.4s, v27.4s, v1.s[1]\n"
+ "fmla v24.4s, v27.4s, v2.s[1]\n"
+ "ldr q27, [x15, #0xe0]\n"
+ "fmla v13.4s, v26.4s, v0.s[1]\n"
+ "fmla v19.4s, v26.4s, v1.s[1]\n"
+ "fmla v25.4s, v26.4s, v2.s[1]\n"
+ "ldr q26, [x15, #0xf0]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v20.4s, v29.4s, v2.s[2]\n"
+ "ldr q29, [x15, #0x100]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v21.4s, v28.4s, v2.s[2]\n"
+ "ldr q28, [x15, #0x110]\n"
+ "fmla v10.4s, v27.4s, v0.s[2]\n"
+ "fmla v16.4s, v27.4s, v1.s[2]\n"
+ "fmla v22.4s, v27.4s, v2.s[2]\n"
+ "ldr q27, [x15, #0x120]\n"
+ "fmla v11.4s, v26.4s, v0.s[2]\n"
+ "fmla v17.4s, v26.4s, v1.s[2]\n"
+ "fmla v23.4s, v26.4s, v2.s[2]\n"
+ "ldr q26, [x15, #0x130]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v18.4s, v29.4s, v1.s[2]\n"
+ "fmla v24.4s, v29.4s, v2.s[2]\n"
+ "ldr q29, [x15, #0x140]\n"
+ "fmla v13.4s, v28.4s, v0.s[2]\n"
+ "fmla v19.4s, v28.4s, v1.s[2]\n"
+ "fmla v25.4s, v28.4s, v2.s[2]\n"
+ "ldr q28, [x15, #0x150]\n"
+ "fmla v8.4s, v27.4s, v0.s[3]\n"
+ "fmla v14.4s, v27.4s, v1.s[3]\n"
+ "fmla v20.4s, v27.4s, v2.s[3]\n"
+ "ldr q27, [x15, #0x160]\n"
+ "fmla v9.4s, v26.4s, v0.s[3]\n"
+ "fmla v15.4s, v26.4s, v1.s[3]\n"
+ "fmla v21.4s, v26.4s, v2.s[3]\n"
+ "ldr q26, [x15, #0x170]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "add x15, x15, #0x180\n"
+ "fmla v16.4s, v29.4s, v1.s[3]\n"
+ "fmla v22.4s, v29.4s, v2.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "fmla v17.4s, v28.4s, v1.s[3]\n"
+ "fmla v23.4s, v28.4s, v2.s[3]\n"
+ "fmla v12.4s, v27.4s, v0.s[3]\n"
+ "fmla v18.4s, v27.4s, v1.s[3]\n"
+ "fmla v24.4s, v27.4s, v2.s[3]\n"
+ "fmla v13.4s, v26.4s, v0.s[3]\n"
+ "fmla v19.4s, v26.4s, v1.s[3]\n"
+ "fmla v25.4s, v26.4s, v2.s[3]\n"
+ "106:" // Height 3: Multiply loop: Main loop skip
+ "cbz x12, 108f\n"
+ "107:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x11], #0x4\n"
+ "sub x12, x12, #0x1\n"
+ "ldr s31, [x10], #0x4\n"
+ "ldr s30, [x9], #0x4\n"
+ "ldr q27, [x15, #0x0]\n"
+ "fmla v8.4s, v27.4s, v0.s[0]\n"
+ "ldr q26, [x15, #0x10]\n"
+ "fmla v14.4s, v27.4s, v31.s[0]\n"
+ "ldr q29, [x15, #0x20]\n"
+ "fmla v20.4s, v27.4s, v30.s[0]\n"
+ "ldr q28, [x15, #0x30]\n"
+ "fmla v9.4s, v26.4s, v0.s[0]\n"
+ "ldr q27, [x15, #0x40]\n"
+ "fmla v15.4s, v26.4s, v31.s[0]\n"
+ "fmla v21.4s, v26.4s, v30.s[0]\n"
+ "ldr q26, [x15, #0x50]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "add x15, x15, #0x60\n"
+ "fmla v16.4s, v29.4s, v31.s[0]\n"
+ "fmla v22.4s, v29.4s, v30.s[0]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v17.4s, v28.4s, v31.s[0]\n"
+ "fmla v23.4s, v28.4s, v30.s[0]\n"
+ "fmla v12.4s, v27.4s, v0.s[0]\n"
+ "fmla v18.4s, v27.4s, v31.s[0]\n"
+ "fmla v24.4s, v27.4s, v30.s[0]\n"
+ "fmla v13.4s, v26.4s, v0.s[0]\n"
+ "fmla v19.4s, v26.4s, v31.s[0]\n"
+ "fmla v25.4s, v26.4s, v30.s[0]\n"
+ "cbnz x12, 107b\n"
+ "108:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x13, x13, #0x1\n"
+ "cmp x13, x20\n"
+ "bne 101b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x14, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "prfm pstl1keep, [x14, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "tbz %x[flags], #1, 109f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v25.4s, v25.4s, v26.4s\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v26.4s\n"
+ "fmax v9.4s, v9.4s, v26.4s\n"
+ "fmax v10.4s, v10.4s, v26.4s\n"
+ "fmax v11.4s, v11.4s, v26.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "109:" // Height 3: No activation
+ "cmp x16, #0x18\n"
+ "bge 122f\n"
+ "tbz x16, #4, 113f\n"
+ "st1 { v8.4s }, [x14], #0x10\n"
+ "st1 { v9.4s }, [x14], #0x10\n"
+ "st1 { v10.4s }, [x14], #0x10\n"
+ "st1 { v11.4s }, [x14], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "tbz x16, #2, 111f\n"
+ "st1 { v12.4s }, [x14], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "tbz x16, #1, 110f\n"
+ "str d13, [x14], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "tbz x16, #0, 121f\n"
+ "st1 { v13.s }[2], [x14]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "b 121f\n"
+ "110:" // Height 3: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 121f\n"
+ "str s13, [x14, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "b 121f\n"
+ "111:" // Height 3: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 112f\n"
+ "str d12, [x14], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x16, #0, 121f\n"
+ "st1 { v12.s }[2], [x14]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "b 121f\n"
+ "112:" // Height 3: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 121f\n"
+ "str s12, [x14, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "b 121f\n"
+ "113:" // Height 3: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 117f\n"
+ "st1 { v8.4s }, [x14], #0x10\n"
+ "st1 { v9.4s }, [x14], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "tbz x16, #2, 115f\n"
+ "st1 { v10.4s }, [x14], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "tbz x16, #1, 114f\n"
+ "str d11, [x14], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "tbz x16, #0, 121f\n"
+ "st1 { v11.s }[2], [x14]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "b 121f\n"
+ "114:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 121f\n"
+ "str s11, [x14, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "b 121f\n"
+ "115:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 116f\n"
+ "str d10, [x14], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz x16, #0, 121f\n"
+ "st1 { v10.s }[2], [x14]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "b 121f\n"
+ "116:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 121f\n"
+ "str s10, [x14, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "b 121f\n"
+ "117:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 119f\n"
+ "st1 { v8.4s }, [x14], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "tbz x16, #1, 118f\n"
+ "str d9, [x14], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz x16, #0, 121f\n"
+ "st1 { v9.s }[2], [x14]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "b 121f\n"
+ "118:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 121f\n"
+ "str s9, [x14, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "b 121f\n"
+ "119:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 120f\n"
+ "str d8, [x14], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz x16, #0, 121f\n"
+ "st1 { v8.s }[2], [x14]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "b 121f\n"
+ "120:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x14, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "121:" // Height 3: Partial direct writeback: Done
+ "b 123f\n"
+ "122:" // Height 3: Full writeback
+ "str q8, [x14, #0x0]\n"
+ "str q9, [x14, #0x10]\n"
+ "str q10, [x14, #0x20]\n"
+ "str q11, [x14, #0x30]\n"
+ "str q12, [x14, #0x40]\n"
+ "str q13, [x14, #0x50]\n"
+ "add x14, x14, #0x60\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
+ "123:" // Height 3: Writeback done
+ "subs x16, x16, #0x18\n"
+ "bgt 84b\n"
+ "b 166f\n"
+ "124:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x10\n"
+ "mov x17, %x[bias]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "125:" // Height 4: Column loop
+ "cbz x17, 126f\n"
+ "ldr q8, [x17, #0x0]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr q9, [x17, #0x10]\n"
+ "mov v15.16b, v9.16b\n"
+ "ldr q10, [x17, #0x20]\n"
+ "mov v16.16b, v10.16b\n"
+ "ldr q11, [x17, #0x30]\n"
+ "mov v17.16b, v11.16b\n"
+ "ldr q12, [x17, #0x40]\n"
+ "mov v18.16b, v12.16b\n"
+ "ldr q13, [x17, #0x50]\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "add x17, x17, #0x60\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v26.16b, v8.16b\n"
+ "mov v27.16b, v9.16b\n"
+ "mov v28.16b, v10.16b\n"
+ "mov v29.16b, v11.16b\n"
+ "mov v30.16b, v12.16b\n"
+ "mov v31.16b, v13.16b\n"
+ "b 141f\n"
+ "126:" // Height 4: no bias
+ "tbz %x[flags], #0, 140f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x14, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x16, #0x18\n"
+ "add x21, x22, x20, LSL #2\n"
+ "bge 139f\n"
+ "tbz x16, #4, 130f\n"
+ "ld1 { v8.4s }, [x14], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x14], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x14], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v11.4s }, [x14], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 128f\n"
+ "ld1 { v12.4s }, [x14], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 127f\n"
+ "ldr d13, [x14], #0x8\n"
+ "mov x20, #0x58\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x16, #0, 138f\n"
+ "ld1 { v13.s }[2], [x14]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 138f\n"
+ "127:" // Height 4: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x16, #0, 138f\n"
+ "ldr s13, [x14, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 138f\n"
+ "128:" // Height 4: Partial accumulate: partial_2_16
+ "tbz x16, #1, 129f\n"
+ "ldr d12, [x14], #0x8\n"
+ "mov x20, #0x48\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x16, #0, 138f\n"
+ "ld1 { v12.s }[2], [x14]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 138f\n"
+ "129:" // Height 4: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x16, #0, 138f\n"
+ "ldr s12, [x14, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 138f\n"
+ "130:" // Height 4: Partial accumulate: partial_8_0
+ "tbz x16, #3, 134f\n"
+ "ld1 { v8.4s }, [x14], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x14], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 132f\n"
+ "ld1 { v10.4s }, [x14], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 131f\n"
+ "ldr d11, [x14], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x16, #0, 138f\n"
+ "ld1 { v11.s }[2], [x14]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 138f\n"
+ "131:" // Height 4: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x16, #0, 138f\n"
+ "ldr s11, [x14, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 138f\n"
+ "132:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x16, #1, 133f\n"
+ "ldr d10, [x14], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x16, #0, 138f\n"
+ "ld1 { v10.s }[2], [x14]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 138f\n"
+ "133:" // Height 4: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x16, #0, 138f\n"
+ "ldr s10, [x14, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "b 138f\n"
+ "134:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x16, #2, 136f\n"
+ "ld1 { v8.4s }, [x14], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 135f\n"
+ "ldr d9, [x14], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x16, #0, 138f\n"
+ "ld1 { v9.s }[2], [x14]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "b 138f\n"
+ "135:" // Height 4: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x16, #0, 138f\n"
+ "ldr s9, [x14, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "b 138f\n"
+ "136:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x16, #1, 137f\n"
+ "ldr d8, [x14], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x16, #0, 138f\n"
+ "ld1 { v8.s }[2], [x14]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "b 138f\n"
+ "137:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s8, [x14, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "138:" // Height 4: Partial accumulate: Done
+ "sub x14, x14, x20\n"
+ "b 141f\n"
+ "139:" // Height 4: full accumulate
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "ldr q12, [x14, #0x40]\n"
+ "ldr q13, [x14, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x22, #0x40]\n"
+ "ldr q25, [x22, #0x50]\n"
+ "ldr q26, [x21, #0x0]\n"
+ "ldr q27, [x21, #0x10]\n"
+ "ldr q28, [x21, #0x20]\n"
+ "ldr q29, [x21, #0x30]\n"
+ "ldr q30, [x21, #0x40]\n"
+ "ldr q31, [x21, #0x50]\n"
+ "b 141f\n"
+ "140:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "141:" // Height 4: setup done
+ "mov x13, #0x0\n"
+ "142:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 143f\n"
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x11, [x20, #0x0]\n"
+ "ldr x10, [x20, #0x8]\n"
+ "ldr x9, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "cbnz x13, 144f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x11, x11, x20, LSL #2\n"
+ "add x10, x10, x20, LSL #2\n"
+ "add x9, x9, x20, LSL #2\n"
+ "add x28, x28, x20, LSL #2\n"
+ "b 144f\n"
+ "143:" // Height 4: setup direct input
+ "mov x11, %x[input_ptr]\n"
+ "add x10, x11, x21, LSL #2\n"
+ "add x9, x10, x21, LSL #2\n"
+ "add x28, x9, x21, LSL #2\n"
+ "144:" // Height 4: input setup done
+ "cmp x12, #0x4\n"
+ "blt 147f\n"
+ "ldr q0, [x11, #0x0]\n"
+ "cmp x12, #0x8\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q2, [x9, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x15, #0x0]\n"
+ "ldr q5, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "blt 146f\n"
+ "145:" // Height 4: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr x22, [x15, #0x58]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla v26.4s, v4.4s, v3.s[0]\n"
+ "ldr d4, [x15, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "mov v4.d[1], x23\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "ldr x23, [x15, #0x88]\n"
+ "fmla v27.4s, v5.4s, v3.s[0]\n"
+ "ldr d5, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "mov v5.d[1], x22\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "ldr x22, [x15, #0x98]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v28.4s, v6.4s, v3.s[0]\n"
+ "ldr d6, [x15, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "mov v6.d[1], x21\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ldr x21, [x15, #0xa8]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v29.4s, v7.4s, v3.s[0]\n"
+ "ldr d7, [x15, #0x70]\n"
+ "mov v7.d[1], x20\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v30.4s, v4.4s, v3.s[0]\n"
+ "ldr d4, [x15, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "mov v4.d[1], x23\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "ldr x23, [x15, #0xc8]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.4s, v5.4s, v3.s[0]\n"
+ "ldr d5, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "mov v5.d[1], x22\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v20.4s, v6.4s, v2.s[1]\n"
+ "ldr x27, [x11, #0x8]\n"
+ "fmla v26.4s, v6.4s, v3.s[1]\n"
+ "ldr d6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "mov v6.d[1], x21\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr x21, [x15, #0xe8]\n"
+ "fmla v21.4s, v7.4s, v2.s[1]\n"
+ "ldr x26, [x10, #0x8]\n"
+ "fmla v27.4s, v7.4s, v3.s[1]\n"
+ "ldr d7, [x15, #0xb0]\n"
+ "mov v7.d[1], x20\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "ldr x20, [x15, #0xf8]\n"
+ "fmla v22.4s, v4.4s, v2.s[1]\n"
+ "ldr x25, [x9, #0x8]\n"
+ "fmla v28.4s, v4.4s, v3.s[1]\n"
+ "ldr d4, [x15, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "mov v4.d[1], x23\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "ldr x23, [x15, #0x108]\n"
+ "fmla v23.4s, v5.4s, v2.s[1]\n"
+ "ldr x24, [x28, #0x8]\n"
+ "fmla v29.4s, v5.4s, v3.s[1]\n"
+ "ldr d5, [x15, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "mov v5.d[1], x22\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v24.4s, v6.4s, v2.s[1]\n"
+ "sub x12, x12, #0x4\n"
+ "fmla v30.4s, v6.4s, v3.s[1]\n"
+ "ldr d6, [x15, #0xe0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "mov v6.d[1], x21\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "ldr x21, [x15, #0x128]\n"
+ "fmla v25.4s, v7.4s, v2.s[1]\n"
+ "cmp x12, #0x8\n"
+ "fmla v31.4s, v7.4s, v3.s[1]\n"
+ "ldr d7, [x15, #0xf0]\n"
+ "mov v7.d[1], x20\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "ldr x20, [x15, #0x138]\n"
+ "fmla v20.4s, v4.4s, v2.s[2]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v26.4s, v4.4s, v3.s[2]\n"
+ "ldr d4, [x15, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "mov v4.d[1], x23\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "ldr x23, [x15, #0x148]\n"
+ "fmla v21.4s, v5.4s, v2.s[2]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v27.4s, v5.4s, v3.s[2]\n"
+ "ldr d5, [x15, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "mov v5.d[1], x22\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "ldr x22, [x15, #0x158]\n"
+ "fmla v22.4s, v6.4s, v2.s[2]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v28.4s, v6.4s, v3.s[2]\n"
+ "ldr d6, [x15, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "mov v6.d[1], x21\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "ldr x21, [x15, #0x168]\n"
+ "fmla v23.4s, v7.4s, v2.s[2]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v29.4s, v7.4s, v3.s[2]\n"
+ "ldr d7, [x15, #0x130]\n"
+ "mov v7.d[1], x20\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "ldr x20, [x15, #0x178]\n"
+ "fmla v24.4s, v4.4s, v2.s[2]\n"
+ "fmla v30.4s, v4.4s, v3.s[2]\n"
+ "ldr d4, [x15, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "mov v4.d[1], x23\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "fmla v25.4s, v5.4s, v2.s[2]\n"
+ "fmla v31.4s, v5.4s, v3.s[2]\n"
+ "ldr d5, [x15, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "mov v5.d[1], x22\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v20.4s, v6.4s, v2.s[3]\n"
+ "fmla v26.4s, v6.4s, v3.s[3]\n"
+ "ldr d6, [x15, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "mov v6.d[1], x21\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v21.4s, v7.4s, v2.s[3]\n"
+ "fmla v27.4s, v7.4s, v3.s[3]\n"
+ "ldr d7, [x15, #0x170]\n"
+ "mov v7.d[1], x20\n"
+ "add x15, x15, #0x180\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "ldr x23, [x15, #0x8]\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "ldr x22, [x15, #0x18]\n"
+ "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "ldr x21, [x15, #0x28]\n"
+ "fmla v28.4s, v4.4s, v3.s[3]\n"
+ "ldr d4, [x15, #0x0]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "ldr x20, [x15, #0x38]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "mov v4.d[1], x23\n"
+ "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "fmla v29.4s, v5.4s, v3.s[3]\n"
+ "ldr d5, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "mov v5.d[1], x22\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "fmla v30.4s, v6.4s, v3.s[3]\n"
+ "ldr d6, [x15, #0x20]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "ldr d0, [x11, #0x0]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "ldr d1, [x10, #0x0]\n"
+ "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "ldr d2, [x9, #0x0]\n"
+ "fmla v31.4s, v7.4s, v3.s[3]\n"
+ "ldr d3, [x28, #0x0]\n"
+ "ldr d7, [x15, #0x30]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x27\n"
+ "mov v1.d[1], x26\n"
+ "mov v2.d[1], x25\n"
+ "mov v3.d[1], x24\n"
+ "mov v7.d[1], x20\n"
+ "bge 145b\n"
+ "146:" // Height 4: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v26.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x15, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "sub x12, x12, #0x4\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v27.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v28.4s, v6.4s, v3.s[0]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "fmla v29.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "fmla v30.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x15, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "fmla v31.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v20.4s, v6.4s, v2.s[1]\n"
+ "fmla v26.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v21.4s, v7.4s, v2.s[1]\n"
+ "fmla v27.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[1]\n"
+ "fmla v28.4s, v4.4s, v3.s[1]\n"
+ "ldr q4, [x15, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "fmla v23.4s, v5.4s, v2.s[1]\n"
+ "fmla v29.4s, v5.4s, v3.s[1]\n"
+ "ldr q5, [x15, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "fmla v24.4s, v6.4s, v2.s[1]\n"
+ "fmla v30.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "fmla v25.4s, v7.4s, v2.s[1]\n"
+ "fmla v31.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "fmla v20.4s, v4.4s, v2.s[2]\n"
+ "fmla v26.4s, v4.4s, v3.s[2]\n"
+ "ldr q4, [x15, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "fmla v21.4s, v5.4s, v2.s[2]\n"
+ "fmla v27.4s, v5.4s, v3.s[2]\n"
+ "ldr q5, [x15, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "fmla v22.4s, v6.4s, v2.s[2]\n"
+ "fmla v28.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "fmla v23.4s, v7.4s, v2.s[2]\n"
+ "fmla v29.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "fmla v24.4s, v4.4s, v2.s[2]\n"
+ "fmla v30.4s, v4.4s, v3.s[2]\n"
+ "ldr q4, [x15, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "fmla v25.4s, v5.4s, v2.s[2]\n"
+ "fmla v31.4s, v5.4s, v3.s[2]\n"
+ "ldr q5, [x15, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v20.4s, v6.4s, v2.s[3]\n"
+ "fmla v26.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v21.4s, v7.4s, v2.s[3]\n"
+ "fmla v27.4s, v7.4s, v3.s[3]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "add x15, x15, #0x180\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "fmla v28.4s, v4.4s, v3.s[3]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "fmla v29.4s, v5.4s, v3.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "fmla v30.4s, v6.4s, v3.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "fmla v31.4s, v7.4s, v3.s[3]\n"
+ "147:" // Height 4: Multiply loop: Main loop skip
+ "cbz x12, 149f\n"
+ "148:" // Height 4: Multiply loop: Odd block loop
+ "ldr s7, [x11], #0x4\n"
+ "sub x12, x12, #0x1\n"
+ "ldr s6, [x10], #0x4\n"
+ "ldr s5, [x9], #0x4\n"
+ "ldr s4, [x28], #0x4\n"
+ "ldr q1, [x15, #0x0]\n"
+ "fmla v8.4s, v1.4s, v7.s[0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v14.4s, v1.4s, v6.s[0]\n"
+ "ldr q3, [x15, #0x20]\n"
+ "fmla v20.4s, v1.4s, v5.s[0]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v26.4s, v1.4s, v4.s[0]\n"
+ "ldr q1, [x15, #0x40]\n"
+ "fmla v9.4s, v0.4s, v7.s[0]\n"
+ "fmla v15.4s, v0.4s, v6.s[0]\n"
+ "fmla v21.4s, v0.4s, v5.s[0]\n"
+ "fmla v27.4s, v0.4s, v4.s[0]\n"
+ "ldr q0, [x15, #0x50]\n"
+ "fmla v10.4s, v3.4s, v7.s[0]\n"
+ "add x15, x15, #0x60\n"
+ "fmla v16.4s, v3.4s, v6.s[0]\n"
+ "fmla v22.4s, v3.4s, v5.s[0]\n"
+ "fmla v28.4s, v3.4s, v4.s[0]\n"
+ "fmla v11.4s, v2.4s, v7.s[0]\n"
+ "fmla v17.4s, v2.4s, v6.s[0]\n"
+ "fmla v23.4s, v2.4s, v5.s[0]\n"
+ "fmla v29.4s, v2.4s, v4.s[0]\n"
+ "fmla v12.4s, v1.4s, v7.s[0]\n"
+ "fmla v18.4s, v1.4s, v6.s[0]\n"
+ "fmla v24.4s, v1.4s, v5.s[0]\n"
+ "fmla v30.4s, v1.4s, v4.s[0]\n"
+ "fmla v13.4s, v0.4s, v7.s[0]\n"
+ "fmla v19.4s, v0.4s, v6.s[0]\n"
+ "fmla v25.4s, v0.4s, v5.s[0]\n"
+ "fmla v31.4s, v0.4s, v4.s[0]\n"
+ "cbnz x12, 148b\n"
+ "149:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x13, x13, #0x1\n"
+ "cmp x13, x20\n"
+ "bne 142b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x14, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x14, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 150f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v0.4s\n"
+ "fmax v29.4s, v29.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v0.4s\n"
+ "fmax v31.4s, v31.4s, v0.4s\n"
+ "150:" // Height 4: No activation
+ "cmp x16, #0x18\n"
+ "bge 163f\n"
+ "tbz x16, #4, 154f\n"
+ "st1 { v8.4s }, [x14], #0x10\n"
+ "st1 { v9.4s }, [x14], #0x10\n"
+ "st1 { v10.4s }, [x14], #0x10\n"
+ "st1 { v11.4s }, [x14], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v27.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 152f\n"
+ "st1 { v12.4s }, [x14], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 151f\n"
+ "str d13, [x14], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x16, #0, 162f\n"
+ "st1 { v13.s }[2], [x14]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 162f\n"
+ "151:" // Height 4: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 162f\n"
+ "str s13, [x14, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "b 162f\n"
+ "152:" // Height 4: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 153f\n"
+ "str d12, [x14], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x16, #0, 162f\n"
+ "st1 { v12.s }[2], [x14]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 162f\n"
+ "153:" // Height 4: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 162f\n"
+ "str s12, [x14, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "b 162f\n"
+ "154:" // Height 4: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 158f\n"
+ "st1 { v8.4s }, [x14], #0x10\n"
+ "st1 { v9.4s }, [x14], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v27.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 156f\n"
+ "st1 { v10.4s }, [x14], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 155f\n"
+ "str d11, [x14], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x16, #0, 162f\n"
+ "st1 { v11.s }[2], [x14]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "b 162f\n"
+ "155:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 162f\n"
+ "str s11, [x14, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "b 162f\n"
+ "156:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 157f\n"
+ "str d10, [x14], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x16, #0, 162f\n"
+ "st1 { v10.s }[2], [x14]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "b 162f\n"
+ "157:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 162f\n"
+ "str s10, [x14, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "b 162f\n"
+ "158:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 160f\n"
+ "st1 { v8.4s }, [x14], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 159f\n"
+ "str d9, [x14], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x16, #0, 162f\n"
+ "st1 { v9.s }[2], [x14]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "b 162f\n"
+ "159:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 162f\n"
+ "str s9, [x14, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "b 162f\n"
+ "160:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 161f\n"
+ "str d8, [x14], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x16, #0, 162f\n"
+ "st1 { v8.s }[2], [x14]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "b 162f\n"
+ "161:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x14, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "162:" // Height 4: Partial direct writeback: Done
+ "b 164f\n"
+ "163:" // Height 4: Full writeback
+ "str q8, [x14, #0x0]\n"
+ "str q9, [x14, #0x10]\n"
+ "str q10, [x14, #0x20]\n"
+ "str q11, [x14, #0x30]\n"
+ "str q12, [x14, #0x40]\n"
+ "str q13, [x14, #0x50]\n"
+ "add x14, x14, #0x60\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
+ "str q26, [x21, #0x0]\n"
+ "str q27, [x21, #0x10]\n"
+ "str q28, [x21, #0x20]\n"
+ "str q29, [x21, #0x30]\n"
+ "str q30, [x21, #0x40]\n"
+ "str q31, [x21, #0x50]\n"
+ "164:" // Height 4: Writeback done
+ "subs x16, x16, #0x18\n"
+ "bgt 125b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 166f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 165f\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "165:" // Update direct input
+ "mov x20, #0x10\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "166:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
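For orientation between the two generated variants: both files implement the same 4x24 hybrid GEMM. A minimal scalar sketch of that computation follows, assuming dense row-major A and C, B pre-packed into consecutive k-major panels of 24 floats per k step (the 24-wide tile the v8..v31 accumulators hold), and ignoring the indirect-input/multi-string plumbing; all names below are illustrative, not library API:

#include <algorithm>
#include <cstddef>

// Scalar reference for the 4x24 hybrid kernels (sketch; B layout is an assumption
// read off the assembly, which advances the B pointer by 24 floats per k step).
static void gemm_fp32_mla_4x24_reference(std::size_t M, std::size_t N, std::size_t K,
                                         const float *A, std::size_t lda,
                                         const float *B_panels, // K*24 floats per panel
                                         float *C, std::size_t ldc,
                                         const float *bias, bool accumulate,
                                         float minval, float maxval)
{
    for (std::size_t n0 = 0; n0 < N; n0 += 24)
    {
        const std::size_t width = std::min<std::size_t>(24, N - n0);
        const float      *panel = B_panels + n0 * K; // k-major, padded to 24 wide
        for (std::size_t m = 0; m < M; m++)
        {
            float acc[24];
            // Bias (if present) seeds the accumulators; otherwise the existing
            // output is reloaded (accumulate) or they start at zero.
            for (std::size_t j = 0; j < width; j++)
                acc[j] = bias ? bias[n0 + j] : (accumulate ? C[m * ldc + n0 + j] : 0.0f);
            for (std::size_t k = 0; k < K; k++)
            {
                const float a = A[m * lda + k]; // broadcast lane, cf. v0.s[0..3]
                for (std::size_t j = 0; j < width; j++)
                    acc[j] += a * panel[k * 24 + j]; // the fmla in the asm
            }
            for (std::size_t j = 0; j < width; j++) // clamp = the flags-bit-1 path
                C[m * ldc + n0 + j] = std::min(std::max(acc[j], minval), maxval);
        }
    }
}

The 4x24 tile saturates the register file in the Height 4 path: 24 vector registers (v8-v31) hold the accumulators for 4 rows by 24 columns, leaving v0-v7 for the four A lanes and the streamed B columns.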
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
new file mode 100644
index 0000000000..dbd45460e8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
@@ -0,0 +1,2593 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_4x24 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
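+ // The assembly below reads ka through the %[offsetof_*] operands
+ // (offsetof(KernelArgs, ...)), so the fields are coupled to the asm only
+ // via those offsets, not via any hand-maintained layout.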
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
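+ // flags bits consumed by the assembly below:
+ //   0x1 accumulate into the existing output (tbz %[flags], #0)
+ //   0x2 apply the minval/maxval clamp       (tbz %[flags], #1)
+ //   0x4 indirect output; 0x8 indirect input (tbz %[flags], #3)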
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 124f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 83f\n"
+ "beq 42f\n"
+ "mov x10, %x[bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x10, 3f\n"
+ "ldr q8, [x10, #0x0]\n"
+ "ldr q9, [x10, #0x10]\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
+ "add x10, x10, #0x60\n"
+ "b 18f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "cmp x9, #0x18\n"
+ "bge 16f\n"
+ "tbz x9, #4, 7f\n"
+ "ld1 { v8.4s }, [x27], #0x10\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v11.4s }, [x27], #0x10\n"
+ "tbz x9, #2, 5f\n"
+ "ld1 { v12.4s }, [x27], #0x10\n"
+ "tbz x9, #1, 4f\n"
+ "ldr d13, [x27], #0x8\n"
+ "mov x20, #0x58\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v13.s }[2], [x27]\n"
+ "b 15f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s13, [x27, #0x0]\n"
+ "b 15f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_16
+ "tbz x9, #1, 6f\n"
+ "ldr d12, [x27], #0x8\n"
+ "mov x20, #0x48\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v12.s }[2], [x27]\n"
+ "b 15f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s12, [x27, #0x0]\n"
+ "b 15f\n"
+ "7:" // Height 1: Partial accumulate: partial_8_0
+ "tbz x9, #3, 11f\n"
+ "ld1 { v8.4s }, [x27], #0x10\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "tbz x9, #2, 9f\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "tbz x9, #1, 8f\n"
+ "ldr d11, [x27], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v11.s }[2], [x27]\n"
+ "b 15f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s11, [x27, #0x0]\n"
+ "b 15f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x9, #1, 10f\n"
+ "ldr d10, [x27], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v10.s }[2], [x27]\n"
+ "b 15f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s10, [x27, #0x0]\n"
+ "b 15f\n"
+ "11:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x9, #2, 13f\n"
+ "ld1 { v8.4s }, [x27], #0x10\n"
+ "tbz x9, #1, 12f\n"
+ "ldr d9, [x27], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v9.s }[2], [x27]\n"
+ "b 15f\n"
+ "12:" // Height 1: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s9, [x27, #0x0]\n"
+ "b 15f\n"
+ "13:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x9, #1, 14f\n"
+ "ldr d8, [x27], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v8.s }[2], [x27]\n"
+ "b 15f\n"
+ "14:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s8, [x27, #0x0]\n"
+ "mov x20, #0x0\n"
+ "15:" // Height 1: Partial accumulate: Done
+ "sub x27, x27, x20\n"
+ "b 18f\n"
+ "16:" // Height 1: full accumulate
+ "ldr q8, [x27, #0x0]\n"
+ "ldr q9, [x27, #0x10]\n"
+ "ldr q10, [x27, #0x20]\n"
+ "ldr q11, [x27, #0x30]\n"
+ "ldr q12, [x27, #0x40]\n"
+ "ldr q13, [x27, #0x50]\n"
+ "b 18f\n"
+ "17:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "18:" // Height 1: setup done
+ "mov x26, #0x0\n"
+ "19:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "cbnz x26, 21f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "b 21f\n"
+ "20:" // Height 1: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "21:" // Height 1: input setup done
+ "cmp x25, #0x4\n"
+ "blt 24f\n"
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "cmp x25, #0x8\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "blt 23f\n"
+ "22:" // Height 1: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q19, [x28, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q18, [x28, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q17, [x28, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q16, [x28, #0x70]\n"
+ "fmla v12.4s, v19.4s, v0.s[0]\n"
+ "ldr q19, [x28, #0x80]\n"
+ "fmla v13.4s, v18.4s, v0.s[0]\n"
+ "ldr q18, [x28, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x28, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x28, #0xb0]\n"
+ "fmla v10.4s, v19.4s, v0.s[1]\n"
+ "ldr q19, [x28, #0xc0]\n"
+ "fmla v11.4s, v18.4s, v0.s[1]\n"
+ "ldr q18, [x28, #0xd0]\n"
+ "fmla v12.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x28, #0xe0]\n"
+ "fmla v13.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x28, #0xf0]\n"
+ "fmla v8.4s, v19.4s, v0.s[2]\n"
+ "ldr q19, [x28, #0x100]\n"
+ "fmla v9.4s, v18.4s, v0.s[2]\n"
+ "ldr q18, [x28, #0x110]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x28, #0x120]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x28, #0x130]\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "ldr q19, [x28, #0x140]\n"
+ "fmla v13.4s, v18.4s, v0.s[2]\n"
+ "ldr q18, [x28, #0x150]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x28, #0x160]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x28, #0x170]\n"
+ "sub x25, x25, #0x4\n"
+ "add x24, x24, #0x10\n"
+ "fmla v10.4s, v19.4s, v0.s[3]\n"
+ "fmla v11.4s, v18.4s, v0.s[3]\n"
+ "cmp x25, #0x8\n"
+ "add x28, x28, #0x180\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "fmla v12.4s, v17.4s, v0.s[3]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "fmla v13.4s, v16.4s, v0.s[3]\n"
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "bge 22b\n"
+ "23:" // Height 1: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q19, [x28, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q18, [x28, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q17, [x28, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q16, [x28, #0x70]\n"
+ "fmla v12.4s, v19.4s, v0.s[0]\n"
+ "ldr q19, [x28, #0x80]\n"
+ "fmla v13.4s, v18.4s, v0.s[0]\n"
+ "ldr q18, [x28, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x28, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x28, #0xb0]\n"
+ "fmla v10.4s, v19.4s, v0.s[1]\n"
+ "ldr q19, [x28, #0xc0]\n"
+ "fmla v11.4s, v18.4s, v0.s[1]\n"
+ "ldr q18, [x28, #0xd0]\n"
+ "fmla v12.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x28, #0xe0]\n"
+ "fmla v13.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x28, #0xf0]\n"
+ "fmla v8.4s, v19.4s, v0.s[2]\n"
+ "ldr q19, [x28, #0x100]\n"
+ "fmla v9.4s, v18.4s, v0.s[2]\n"
+ "ldr q18, [x28, #0x110]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x28, #0x120]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x28, #0x130]\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "ldr q19, [x28, #0x140]\n"
+ "fmla v13.4s, v18.4s, v0.s[2]\n"
+ "ldr q18, [x28, #0x150]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x28, #0x160]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x28, #0x170]\n"
+ "add x24, x24, #0x10\n"
+ "sub x25, x25, #0x4\n"
+ "fmla v10.4s, v19.4s, v0.s[3]\n"
+ "fmla v11.4s, v18.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v12.4s, v17.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v0.s[3]\n"
+ "24:" // Height 1: Multiply loop: Main loop skip
+ "cbz x25, 26f\n"
+ "25:" // Height 1: Multiply loop: Odd block loop
+ "ldr s18, [x24], #0x4\n"
+ "ldr q16, [x28, #0x0]\n"
+ "fmla v8.4s, v16.4s, v18.s[0]\n"
+ "sub x25, x25, #0x1\n"
+ "ldr q17, [x28, #0x10]\n"
+ "ldr q16, [x28, #0x20]\n"
+ "fmla v9.4s, v17.4s, v18.s[0]\n"
+ "fmla v10.4s, v16.4s, v18.s[0]\n"
+ "ldr q17, [x28, #0x30]\n"
+ "ldr q16, [x28, #0x40]\n"
+ "fmla v11.4s, v17.4s, v18.s[0]\n"
+ "fmla v12.4s, v16.4s, v18.s[0]\n"
+ "ldr q16, [x28, #0x50]\n"
+ "fmla v13.4s, v16.4s, v18.s[0]\n"
+ "add x28, x28, #0x60\n"
+ "cbnz x25, 25b\n"
+ "26:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 19b\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmin v12.4s, v12.4s, v17.4s\n"
+ "fmin v13.4s, v13.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
+ "fmax v12.4s, v12.4s, v16.4s\n"
+ "fmax v13.4s, v13.4s, v16.4s\n"
+ "27:" // Height 1: No activation
+ "cmp x9, #0x18\n"
+ "bge 40f\n"
+ "tbz x9, #4, 31f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "st1 { v9.4s }, [x27], #0x10\n"
+ "st1 { v10.4s }, [x27], #0x10\n"
+ "st1 { v11.4s }, [x27], #0x10\n"
+ "tbz x9, #2, 29f\n"
+ "st1 { v12.4s }, [x27], #0x10\n"
+ "tbz x9, #1, 28f\n"
+ "str d13, [x27], #0x8\n"
+ "tbz x9, #0, 39f\n"
+ "st1 { v13.s }[2], [x27]\n"
+ "b 39f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 39f\n"
+ "str s13, [x27, #0x0]\n"
+ "b 39f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 30f\n"
+ "str d12, [x27], #0x8\n"
+ "tbz x9, #0, 39f\n"
+ "st1 { v12.s }[2], [x27]\n"
+ "b 39f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 39f\n"
+ "str s12, [x27, #0x0]\n"
+ "b 39f\n"
+ "31:" // Height 1: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 35f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "st1 { v9.4s }, [x27], #0x10\n"
+ "tbz x9, #2, 33f\n"
+ "st1 { v10.4s }, [x27], #0x10\n"
+ "tbz x9, #1, 32f\n"
+ "str d11, [x27], #0x8\n"
+ "tbz x9, #0, 39f\n"
+ "st1 { v11.s }[2], [x27]\n"
+ "b 39f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 39f\n"
+ "str s11, [x27, #0x0]\n"
+ "b 39f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 34f\n"
+ "str d10, [x27], #0x8\n"
+ "tbz x9, #0, 39f\n"
+ "st1 { v10.s }[2], [x27]\n"
+ "b 39f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 39f\n"
+ "str s10, [x27, #0x0]\n"
+ "b 39f\n"
+ "35:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 37f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "tbz x9, #1, 36f\n"
+ "str d9, [x27], #0x8\n"
+ "tbz x9, #0, 39f\n"
+ "st1 { v9.s }[2], [x27]\n"
+ "b 39f\n"
+ "36:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 39f\n"
+ "str s9, [x27, #0x0]\n"
+ "b 39f\n"
+ "37:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 38f\n"
+ "str d8, [x27], #0x8\n"
+ "tbz x9, #0, 39f\n"
+ "st1 { v8.s }[2], [x27]\n"
+ "b 39f\n"
+ "38:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x27, #0x0]\n"
+ "39:" // Height 1: Partial direct writeback: Done
+ "b 41f\n"
+ "40:" // Height 1: Full writeback
+ "str q8, [x27, #0x0]\n"
+ "str q9, [x27, #0x10]\n"
+ "str q10, [x27, #0x20]\n"
+ "str q11, [x27, #0x30]\n"
+ "str q12, [x27, #0x40]\n"
+ "str q13, [x27, #0x50]\n"
+ "add x27, x27, #0x60\n"
+ "41:" // Height 1: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 2b\n"
+ "b 166f\n"
+ "42:" // Height 2
+ "mov x10, %x[bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "43:" // Height 2: Column loop
+ "cbz x10, 44f\n"
+ "ldr q8, [x10, #0x0]\n"
+ "ldr q9, [x10, #0x10]\n"
+ "mov v14.16b, v8.16b\n"
+ "mov v15.16b, v9.16b\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
+ "mov v16.16b, v10.16b\n"
+ "mov v17.16b, v11.16b\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v13.16b\n"
+ "add x10, x10, #0x60\n"
+ "b 59f\n"
+ "44:" // Height 2: no bias
+ "tbz %x[flags], #0, 58f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x9, #0x18\n"
+ "add x23, x27, x20, LSL #2\n"
+ "bge 57f\n"
+ "tbz x9, #4, 48f\n"
+ "ld1 { v8.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v11.4s }, [x27], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x9, #2, 46f\n"
+ "ld1 { v12.4s }, [x27], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x9, #1, 45f\n"
+ "ldr d13, [x27], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "mov x20, #0x58\n"
+ "tbz x9, #0, 56f\n"
+ "ld1 { v13.s }[2], [x27]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "b 56f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x9, #0, 56f\n"
+ "ldr s13, [x27, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "b 56f\n"
+ "46:" // Height 2: Partial accumulate: partial_2_16
+ "tbz x9, #1, 47f\n"
+ "ldr d12, [x27], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "mov x20, #0x48\n"
+ "tbz x9, #0, 56f\n"
+ "ld1 { v12.s }[2], [x27]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "b 56f\n"
+ "47:" // Height 2: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x9, #0, 56f\n"
+ "ldr s12, [x27, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "b 56f\n"
+ "48:" // Height 2: Partial accumulate: partial_8_0
+ "tbz x9, #3, 52f\n"
+ "ld1 { v8.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "tbz x9, #2, 50f\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "tbz x9, #1, 49f\n"
+ "ldr d11, [x27], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x9, #0, 56f\n"
+ "ld1 { v11.s }[2], [x27]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "b 56f\n"
+ "49:" // Height 2: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x9, #0, 56f\n"
+ "ldr s11, [x27, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "b 56f\n"
+ "50:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x9, #1, 51f\n"
+ "ldr d10, [x27], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x9, #0, 56f\n"
+ "ld1 { v10.s }[2], [x27]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "b 56f\n"
+ "51:" // Height 2: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x9, #0, 56f\n"
+ "ldr s10, [x27, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "b 56f\n"
+ "52:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x9, #2, 54f\n"
+ "ld1 { v8.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "tbz x9, #1, 53f\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x9, #0, 56f\n"
+ "ld1 { v9.s }[2], [x27]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "b 56f\n"
+ "53:" // Height 2: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x9, #0, 56f\n"
+ "ldr s9, [x27, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "b 56f\n"
+ "54:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x9, #1, 55f\n"
+ "ldr d8, [x27], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x9, #0, 56f\n"
+ "ld1 { v8.s }[2], [x27]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "b 56f\n"
+ "55:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s8, [x27, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "mov x20, #0x0\n"
+ "56:" // Height 2: Partial accumulate: Done
+ "sub x27, x27, x20\n"
+ "b 59f\n"
+ "57:" // Height 2: full accumulate
+ "ldr q8, [x27, #0x0]\n"
+ "ldr q9, [x27, #0x10]\n"
+ "ldr q10, [x27, #0x20]\n"
+ "ldr q11, [x27, #0x30]\n"
+ "ldr q12, [x27, #0x40]\n"
+ "ldr q13, [x27, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "b 59f\n"
+ "58:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "59:" // Height 2: setup done
+ "mov x26, #0x0\n"
+ "60:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "cbnz x26, 62f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "b 62f\n"
+ "61:" // Height 2: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "62:" // Height 2: input setup done
+ "cmp x25, #0x4\n"
+ "blt 65f\n"
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "cmp x25, #0x8\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "blt 64f\n"
+ "63:" // Height 2: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q23, [x28, #0x40]\n"
+ "sub x25, x25, #0x4\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "ldr q22, [x28, #0x50]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "ldr q21, [x28, #0x60]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ldr q20, [x28, #0x70]\n"
+ "cmp x25, #0x8\n"
+ "fmla v12.4s, v23.4s, v0.s[0]\n"
+ "fmla v18.4s, v23.4s, v1.s[0]\n"
+ "ldr q23, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v13.4s, v22.4s, v0.s[0]\n"
+ "fmla v19.4s, v22.4s, v1.s[0]\n"
+ "ldr q22, [x28, #0x90]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "ldr q21, [x28, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "ldr q20, [x28, #0xb0]\n"
+ "fmla v10.4s, v23.4s, v0.s[1]\n"
+ "fmla v16.4s, v23.4s, v1.s[1]\n"
+ "ldr q23, [x28, #0xc0]\n"
+ "fmla v11.4s, v22.4s, v0.s[1]\n"
+ "fmla v17.4s, v22.4s, v1.s[1]\n"
+ "ldr q22, [x28, #0xd0]\n"
+ "fmla v12.4s, v21.4s, v0.s[1]\n"
+ "fmla v18.4s, v21.4s, v1.s[1]\n"
+ "ldr q21, [x28, #0xe0]\n"
+ "fmla v13.4s, v20.4s, v0.s[1]\n"
+ "fmla v19.4s, v20.4s, v1.s[1]\n"
+ "ldr q20, [x28, #0xf0]\n"
+ "fmla v8.4s, v23.4s, v0.s[2]\n"
+ "fmla v14.4s, v23.4s, v1.s[2]\n"
+ "ldr q23, [x28, #0x100]\n"
+ "fmla v9.4s, v22.4s, v0.s[2]\n"
+ "fmla v15.4s, v22.4s, v1.s[2]\n"
+ "ldr q22, [x28, #0x110]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v16.4s, v21.4s, v1.s[2]\n"
+ "ldr q21, [x28, #0x120]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "fmla v17.4s, v20.4s, v1.s[2]\n"
+ "ldr q20, [x28, #0x130]\n"
+ "fmla v12.4s, v23.4s, v0.s[2]\n"
+ "fmla v18.4s, v23.4s, v1.s[2]\n"
+ "ldr q23, [x28, #0x140]\n"
+ "fmla v13.4s, v22.4s, v0.s[2]\n"
+ "fmla v19.4s, v22.4s, v1.s[2]\n"
+ "ldr q22, [x28, #0x150]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "ldr q21, [x28, #0x160]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "ldr q20, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v23.4s, v0.s[3]\n"
+ "fmla v16.4s, v23.4s, v1.s[3]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "fmla v11.4s, v22.4s, v0.s[3]\n"
+ "fmla v17.4s, v22.4s, v1.s[3]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
+ "fmla v18.4s, v21.4s, v1.s[3]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "fmla v13.4s, v20.4s, v0.s[3]\n"
+ "ldr q0, [x24, #0x0]\n"
+ "fmla v19.4s, v20.4s, v1.s[3]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "bge 63b\n"
+ "64:" // Height 2: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q23, [x28, #0x40]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "ldr q22, [x28, #0x50]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "ldr q21, [x28, #0x60]\n"
+ "sub x25, x25, #0x4\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ldr q20, [x28, #0x70]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v12.4s, v23.4s, v0.s[0]\n"
+ "fmla v18.4s, v23.4s, v1.s[0]\n"
+ "ldr q23, [x28, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v13.4s, v22.4s, v0.s[0]\n"
+ "fmla v19.4s, v22.4s, v1.s[0]\n"
+ "ldr q22, [x28, #0x90]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "ldr q21, [x28, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "ldr q20, [x28, #0xb0]\n"
+ "fmla v10.4s, v23.4s, v0.s[1]\n"
+ "fmla v16.4s, v23.4s, v1.s[1]\n"
+ "ldr q23, [x28, #0xc0]\n"
+ "fmla v11.4s, v22.4s, v0.s[1]\n"
+ "fmla v17.4s, v22.4s, v1.s[1]\n"
+ "ldr q22, [x28, #0xd0]\n"
+ "fmla v12.4s, v21.4s, v0.s[1]\n"
+ "fmla v18.4s, v21.4s, v1.s[1]\n"
+ "ldr q21, [x28, #0xe0]\n"
+ "fmla v13.4s, v20.4s, v0.s[1]\n"
+ "fmla v19.4s, v20.4s, v1.s[1]\n"
+ "ldr q20, [x28, #0xf0]\n"
+ "fmla v8.4s, v23.4s, v0.s[2]\n"
+ "fmla v14.4s, v23.4s, v1.s[2]\n"
+ "ldr q23, [x28, #0x100]\n"
+ "fmla v9.4s, v22.4s, v0.s[2]\n"
+ "fmla v15.4s, v22.4s, v1.s[2]\n"
+ "ldr q22, [x28, #0x110]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v16.4s, v21.4s, v1.s[2]\n"
+ "ldr q21, [x28, #0x120]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "fmla v17.4s, v20.4s, v1.s[2]\n"
+ "ldr q20, [x28, #0x130]\n"
+ "fmla v12.4s, v23.4s, v0.s[2]\n"
+ "fmla v18.4s, v23.4s, v1.s[2]\n"
+ "ldr q23, [x28, #0x140]\n"
+ "fmla v13.4s, v22.4s, v0.s[2]\n"
+ "fmla v19.4s, v22.4s, v1.s[2]\n"
+ "ldr q22, [x28, #0x150]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "ldr q21, [x28, #0x160]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "ldr q20, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v23.4s, v0.s[3]\n"
+ "fmla v16.4s, v23.4s, v1.s[3]\n"
+ "fmla v11.4s, v22.4s, v0.s[3]\n"
+ "fmla v17.4s, v22.4s, v1.s[3]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
+ "fmla v18.4s, v21.4s, v1.s[3]\n"
+ "fmla v13.4s, v20.4s, v0.s[3]\n"
+ "fmla v19.4s, v20.4s, v1.s[3]\n"
+ "65:" // Height 2: Multiply loop: Main loop skip
+ "cbz x25, 67f\n"
+ "66:" // Height 2: Multiply loop: Odd block loop
+ "ldr s25, [x24], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "sub x25, x25, #0x1\n"
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ "fmla v8.4s, v21.4s, v25.s[0]\n"
+ "fmla v14.4s, v21.4s, v24.s[0]\n"
+ "ldr q23, [x28, #0x20]\n"
+ "ldr q22, [x28, #0x30]\n"
+ "fmla v9.4s, v20.4s, v25.s[0]\n"
+ "fmla v15.4s, v20.4s, v24.s[0]\n"
+ "ldr q21, [x28, #0x40]\n"
+ "ldr q20, [x28, #0x50]\n"
+ "fmla v10.4s, v23.4s, v25.s[0]\n"
+ "fmla v16.4s, v23.4s, v24.s[0]\n"
+ "fmla v11.4s, v22.4s, v25.s[0]\n"
+ "fmla v17.4s, v22.4s, v24.s[0]\n"
+ "add x28, x28, #0x60\n"
+ "fmla v12.4s, v21.4s, v25.s[0]\n"
+ "fmla v18.4s, v21.4s, v24.s[0]\n"
+ "fmla v13.4s, v20.4s, v25.s[0]\n"
+ "fmla v19.4s, v20.4s, v24.s[0]\n"
+ "cbnz x25, 66b\n"
+ "67:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 60b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 68f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v21.4s\n"
+ "fmin v9.4s, v9.4s, v21.4s\n"
+ "fmin v10.4s, v10.4s, v21.4s\n"
+ "fmin v11.4s, v11.4s, v21.4s\n"
+ "fmin v12.4s, v12.4s, v21.4s\n"
+ "fmin v13.4s, v13.4s, v21.4s\n"
+ "fmin v14.4s, v14.4s, v21.4s\n"
+ "fmin v15.4s, v15.4s, v21.4s\n"
+ "fmin v16.4s, v16.4s, v21.4s\n"
+ "fmin v17.4s, v17.4s, v21.4s\n"
+ "fmin v18.4s, v18.4s, v21.4s\n"
+ "fmin v19.4s, v19.4s, v21.4s\n"
+ "fmax v8.4s, v8.4s, v20.4s\n"
+ "fmax v9.4s, v9.4s, v20.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "fmax v13.4s, v13.4s, v20.4s\n"
+ "fmax v14.4s, v14.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v20.4s\n"
+ "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v18.4s, v18.4s, v20.4s\n"
+ "fmax v19.4s, v19.4s, v20.4s\n"
+ "68:" // Height 2: No activation
+ "cmp x9, #0x18\n"
+ "bge 81f\n"
+ "tbz x9, #4, 72f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "st1 { v9.4s }, [x27], #0x10\n"
+ "st1 { v10.4s }, [x27], #0x10\n"
+ "st1 { v11.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x9, #2, 70f\n"
+ "st1 { v12.4s }, [x27], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x9, #1, 69f\n"
+ "str d13, [x27], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x9, #0, 80f\n"
+ "st1 { v13.s }[2], [x27]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "b 80f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 80f\n"
+ "str s13, [x27, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "b 80f\n"
+ "70:" // Height 2: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 71f\n"
+ "str d12, [x27], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x9, #0, 80f\n"
+ "st1 { v12.s }[2], [x27]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "b 80f\n"
+ "71:" // Height 2: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 80f\n"
+ "str s12, [x27, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "b 80f\n"
+ "72:" // Height 2: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 76f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "st1 { v9.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "tbz x9, #2, 74f\n"
+ "st1 { v10.4s }, [x27], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x9, #1, 73f\n"
+ "str d11, [x27], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x9, #0, 80f\n"
+ "st1 { v11.s }[2], [x27]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "b 80f\n"
+ "73:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 80f\n"
+ "str s11, [x27, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "b 80f\n"
+ "74:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 75f\n"
+ "str d10, [x27], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x9, #0, 80f\n"
+ "st1 { v10.s }[2], [x27]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "b 80f\n"
+ "75:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 80f\n"
+ "str s10, [x27, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "b 80f\n"
+ "76:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 78f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "tbz x9, #1, 77f\n"
+ "str d9, [x27], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "tbz x9, #0, 80f\n"
+ "st1 { v9.s }[2], [x27]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "b 80f\n"
+ "77:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 80f\n"
+ "str s9, [x27, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "b 80f\n"
+ "78:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 79f\n"
+ "str d8, [x27], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "tbz x9, #0, 80f\n"
+ "st1 { v8.s }[2], [x27]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "b 80f\n"
+ "79:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x27, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "80:" // Height 2: Partial direct writeback: Done
+ "b 82f\n"
+ "81:" // Height 2: Full writeback
+ "str q8, [x27, #0x0]\n"
+ "str q9, [x27, #0x10]\n"
+ "str q10, [x27, #0x20]\n"
+ "str q11, [x27, #0x30]\n"
+ "str q12, [x27, #0x40]\n"
+ "str q13, [x27, #0x50]\n"
+ "add x27, x27, #0x60\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "82:" // Height 2: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 43b\n"
+ "b 166f\n"
+ "83:" // Height 3
+ "mov x10, %x[bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "84:" // Height 3: Column loop
+ "cbz x10, 85f\n"
+ "ldr q8, [x10, #0x0]\n"
+ "ldr q9, [x10, #0x10]\n"
+ "mov v14.16b, v8.16b\n"
+ "mov v15.16b, v9.16b\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
+ "mov v16.16b, v10.16b\n"
+ "mov v17.16b, v11.16b\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "add x10, x10, #0x60\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "b 100f\n"
+ "85:" // Height 3: no bias
+ "tbz %x[flags], #0, 99f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "cmp x9, #0x18\n"
+ "add x22, x23, x20, LSL #2\n"
+ "bge 98f\n"
+ "tbz x9, #4, 89f\n"
+ "ld1 { v8.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v11.4s }, [x27], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 87f\n"
+ "ld1 { v12.4s }, [x27], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 86f\n"
+ "ldr d13, [x27], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "mov x20, #0x58\n"
+ "ldr d25, [x22], #0x8\n"
+ "tbz x9, #0, 97f\n"
+ "ld1 { v13.s }[2], [x27]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "b 97f\n"
+ "86:" // Height 3: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x9, #0, 97f\n"
+ "ldr s13, [x27, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "b 97f\n"
+ "87:" // Height 3: Partial accumulate: partial_2_16
+ "tbz x9, #1, 88f\n"
+ "ldr d12, [x27], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "mov x20, #0x48\n"
+ "ldr d24, [x22], #0x8\n"
+ "tbz x9, #0, 97f\n"
+ "ld1 { v12.s }[2], [x27]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "b 97f\n"
+ "88:" // Height 3: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x9, #0, 97f\n"
+ "ldr s12, [x27, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "b 97f\n"
+ "89:" // Height 3: Partial accumulate: partial_8_0
+ "tbz x9, #3, 93f\n"
+ "ld1 { v8.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 91f\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 90f\n"
+ "ldr d11, [x27], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x9, #0, 97f\n"
+ "ld1 { v11.s }[2], [x27]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "b 97f\n"
+ "90:" // Height 3: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x9, #0, 97f\n"
+ "ldr s11, [x27, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "b 97f\n"
+ "91:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x9, #1, 92f\n"
+ "ldr d10, [x27], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x9, #0, 97f\n"
+ "ld1 { v10.s }[2], [x27]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "b 97f\n"
+ "92:" // Height 3: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x9, #0, 97f\n"
+ "ldr s10, [x27, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "b 97f\n"
+ "93:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x9, #2, 95f\n"
+ "ld1 { v8.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 94f\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x9, #0, 97f\n"
+ "ld1 { v9.s }[2], [x27]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "b 97f\n"
+ "94:" // Height 3: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x9, #0, 97f\n"
+ "ldr s9, [x27, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "b 97f\n"
+ "95:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x9, #1, 96f\n"
+ "ldr d8, [x27], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x9, #0, 97f\n"
+ "ld1 { v8.s }[2], [x27]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "b 97f\n"
+ "96:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s8, [x27, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s20, [x22, #0x0]\n"
+ "97:" // Height 3: Partial accumulate: Done
+ "sub x27, x27, x20\n"
+ "b 100f\n"
+ "98:" // Height 3: full accumulate
+ "ldr q8, [x27, #0x0]\n"
+ "ldr q9, [x27, #0x10]\n"
+ "ldr q10, [x27, #0x20]\n"
+ "ldr q11, [x27, #0x30]\n"
+ "ldr q12, [x27, #0x40]\n"
+ "ldr q13, [x27, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x22, #0x40]\n"
+ "ldr q25, [x22, #0x50]\n"
+ "b 100f\n"
+ "99:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "100:" // Height 3: setup done
+ "mov x26, #0x0\n"
+ "101:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 102f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x26, 103f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "b 103f\n"
+ "102:" // Height 3: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "103:" // Height 3: input setup done
+ "cmp x25, #0x4\n"
+ "blt 106f\n"
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "cmp x25, #0x8\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "blt 105f\n"
+ "104:" // Height 3: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "sub x25, x25, #0x4\n"
+ "add x24, x24, #0x10\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr q29, [x28, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "ldr q28, [x28, #0x50]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "cmp x25, #0x8\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "ldr q27, [x28, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "ldr q26, [x28, #0x70]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v12.4s, v29.4s, v0.s[0]\n"
+ "fmla v18.4s, v29.4s, v1.s[0]\n"
+ "fmla v24.4s, v29.4s, v2.s[0]\n"
+ "ldr q29, [x28, #0x80]\n"
+ "fmla v13.4s, v28.4s, v0.s[0]\n"
+ "fmla v19.4s, v28.4s, v1.s[0]\n"
+ "fmla v25.4s, v28.4s, v2.s[0]\n"
+ "ldr q28, [x28, #0x90]\n"
+ "fmla v8.4s, v27.4s, v0.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[1]\n"
+ "fmla v20.4s, v27.4s, v2.s[1]\n"
+ "ldr q27, [x28, #0xa0]\n"
+ "fmla v9.4s, v26.4s, v0.s[1]\n"
+ "fmla v15.4s, v26.4s, v1.s[1]\n"
+ "fmla v21.4s, v26.4s, v2.s[1]\n"
+ "ldr q26, [x28, #0xb0]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v16.4s, v29.4s, v1.s[1]\n"
+ "fmla v22.4s, v29.4s, v2.s[1]\n"
+ "ldr q29, [x28, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v17.4s, v28.4s, v1.s[1]\n"
+ "fmla v23.4s, v28.4s, v2.s[1]\n"
+ "ldr q28, [x28, #0xd0]\n"
+ "fmla v12.4s, v27.4s, v0.s[1]\n"
+ "fmla v18.4s, v27.4s, v1.s[1]\n"
+ "fmla v24.4s, v27.4s, v2.s[1]\n"
+ "ldr q27, [x28, #0xe0]\n"
+ "fmla v13.4s, v26.4s, v0.s[1]\n"
+ "fmla v19.4s, v26.4s, v1.s[1]\n"
+ "fmla v25.4s, v26.4s, v2.s[1]\n"
+ "ldr q26, [x28, #0xf0]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v20.4s, v29.4s, v2.s[2]\n"
+ "ldr q29, [x28, #0x100]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v21.4s, v28.4s, v2.s[2]\n"
+ "ldr q28, [x28, #0x110]\n"
+ "fmla v10.4s, v27.4s, v0.s[2]\n"
+ "fmla v16.4s, v27.4s, v1.s[2]\n"
+ "fmla v22.4s, v27.4s, v2.s[2]\n"
+ "ldr q27, [x28, #0x120]\n"
+ "fmla v11.4s, v26.4s, v0.s[2]\n"
+ "fmla v17.4s, v26.4s, v1.s[2]\n"
+ "fmla v23.4s, v26.4s, v2.s[2]\n"
+ "ldr q26, [x28, #0x130]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v18.4s, v29.4s, v1.s[2]\n"
+ "fmla v24.4s, v29.4s, v2.s[2]\n"
+ "ldr q29, [x28, #0x140]\n"
+ "fmla v13.4s, v28.4s, v0.s[2]\n"
+ "fmla v19.4s, v28.4s, v1.s[2]\n"
+ "fmla v25.4s, v28.4s, v2.s[2]\n"
+ "ldr q28, [x28, #0x150]\n"
+ "fmla v8.4s, v27.4s, v0.s[3]\n"
+ "fmla v14.4s, v27.4s, v1.s[3]\n"
+ "fmla v20.4s, v27.4s, v2.s[3]\n"
+ "ldr q27, [x28, #0x160]\n"
+ "fmla v9.4s, v26.4s, v0.s[3]\n"
+ "fmla v15.4s, v26.4s, v1.s[3]\n"
+ "fmla v21.4s, v26.4s, v2.s[3]\n"
+ "ldr q26, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "fmla v16.4s, v29.4s, v1.s[3]\n"
+ "fmla v22.4s, v29.4s, v2.s[3]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "fmla v17.4s, v28.4s, v1.s[3]\n"
+ "fmla v23.4s, v28.4s, v2.s[3]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "fmla v12.4s, v27.4s, v0.s[3]\n"
+ "fmla v18.4s, v27.4s, v1.s[3]\n"
+ "fmla v24.4s, v27.4s, v2.s[3]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "fmla v13.4s, v26.4s, v0.s[3]\n"
+ "ldr q0, [x24, #0x0]\n"
+ "fmla v19.4s, v26.4s, v1.s[3]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "fmla v25.4s, v26.4s, v2.s[3]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "bge 104b\n"
+ "105:" // Height 3: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr q29, [x28, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "ldr q28, [x28, #0x50]\n"
+ "sub x25, x25, #0x4\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "ldr q27, [x28, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "ldr q26, [x28, #0x70]\n"
+ "fmla v12.4s, v29.4s, v0.s[0]\n"
+ "fmla v18.4s, v29.4s, v1.s[0]\n"
+ "fmla v24.4s, v29.4s, v2.s[0]\n"
+ "ldr q29, [x28, #0x80]\n"
+ "fmla v13.4s, v28.4s, v0.s[0]\n"
+ "fmla v19.4s, v28.4s, v1.s[0]\n"
+ "fmla v25.4s, v28.4s, v2.s[0]\n"
+ "ldr q28, [x28, #0x90]\n"
+ "fmla v8.4s, v27.4s, v0.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[1]\n"
+ "fmla v20.4s, v27.4s, v2.s[1]\n"
+ "ldr q27, [x28, #0xa0]\n"
+ "fmla v9.4s, v26.4s, v0.s[1]\n"
+ "fmla v15.4s, v26.4s, v1.s[1]\n"
+ "fmla v21.4s, v26.4s, v2.s[1]\n"
+ "ldr q26, [x28, #0xb0]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v16.4s, v29.4s, v1.s[1]\n"
+ "fmla v22.4s, v29.4s, v2.s[1]\n"
+ "ldr q29, [x28, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v17.4s, v28.4s, v1.s[1]\n"
+ "fmla v23.4s, v28.4s, v2.s[1]\n"
+ "ldr q28, [x28, #0xd0]\n"
+ "fmla v12.4s, v27.4s, v0.s[1]\n"
+ "fmla v18.4s, v27.4s, v1.s[1]\n"
+ "fmla v24.4s, v27.4s, v2.s[1]\n"
+ "ldr q27, [x28, #0xe0]\n"
+ "fmla v13.4s, v26.4s, v0.s[1]\n"
+ "fmla v19.4s, v26.4s, v1.s[1]\n"
+ "fmla v25.4s, v26.4s, v2.s[1]\n"
+ "ldr q26, [x28, #0xf0]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v20.4s, v29.4s, v2.s[2]\n"
+ "ldr q29, [x28, #0x100]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v21.4s, v28.4s, v2.s[2]\n"
+ "ldr q28, [x28, #0x110]\n"
+ "fmla v10.4s, v27.4s, v0.s[2]\n"
+ "fmla v16.4s, v27.4s, v1.s[2]\n"
+ "fmla v22.4s, v27.4s, v2.s[2]\n"
+ "ldr q27, [x28, #0x120]\n"
+ "fmla v11.4s, v26.4s, v0.s[2]\n"
+ "fmla v17.4s, v26.4s, v1.s[2]\n"
+ "fmla v23.4s, v26.4s, v2.s[2]\n"
+ "ldr q26, [x28, #0x130]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v18.4s, v29.4s, v1.s[2]\n"
+ "fmla v24.4s, v29.4s, v2.s[2]\n"
+ "ldr q29, [x28, #0x140]\n"
+ "fmla v13.4s, v28.4s, v0.s[2]\n"
+ "fmla v19.4s, v28.4s, v1.s[2]\n"
+ "fmla v25.4s, v28.4s, v2.s[2]\n"
+ "ldr q28, [x28, #0x150]\n"
+ "fmla v8.4s, v27.4s, v0.s[3]\n"
+ "fmla v14.4s, v27.4s, v1.s[3]\n"
+ "fmla v20.4s, v27.4s, v2.s[3]\n"
+ "ldr q27, [x28, #0x160]\n"
+ "fmla v9.4s, v26.4s, v0.s[3]\n"
+ "fmla v15.4s, v26.4s, v1.s[3]\n"
+ "fmla v21.4s, v26.4s, v2.s[3]\n"
+ "ldr q26, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "fmla v16.4s, v29.4s, v1.s[3]\n"
+ "fmla v22.4s, v29.4s, v2.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "fmla v17.4s, v28.4s, v1.s[3]\n"
+ "fmla v23.4s, v28.4s, v2.s[3]\n"
+ "fmla v12.4s, v27.4s, v0.s[3]\n"
+ "fmla v18.4s, v27.4s, v1.s[3]\n"
+ "fmla v24.4s, v27.4s, v2.s[3]\n"
+ "fmla v13.4s, v26.4s, v0.s[3]\n"
+ "fmla v19.4s, v26.4s, v1.s[3]\n"
+ "fmla v25.4s, v26.4s, v2.s[3]\n"
+ "106:" // Height 3: Multiply loop: Main loop skip
+ "cbz x25, 108f\n"
+ "107:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x24], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
+ "sub x25, x25, #0x1\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q27, [x28, #0x0]\n"
+ "fmla v8.4s, v27.4s, v0.s[0]\n"
+ "fmla v14.4s, v27.4s, v31.s[0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ "ldr q29, [x28, #0x20]\n"
+ "fmla v20.4s, v27.4s, v30.s[0]\n"
+ "fmla v9.4s, v26.4s, v0.s[0]\n"
+ "ldr q28, [x28, #0x30]\n"
+ "ldr q27, [x28, #0x40]\n"
+ "fmla v15.4s, v26.4s, v31.s[0]\n"
+ "fmla v21.4s, v26.4s, v30.s[0]\n"
+ "ldr q26, [x28, #0x50]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "fmla v16.4s, v29.4s, v31.s[0]\n"
+ "add x28, x28, #0x60\n"
+ "fmla v22.4s, v29.4s, v30.s[0]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v17.4s, v28.4s, v31.s[0]\n"
+ "fmla v23.4s, v28.4s, v30.s[0]\n"
+ "fmla v12.4s, v27.4s, v0.s[0]\n"
+ "fmla v18.4s, v27.4s, v31.s[0]\n"
+ "fmla v24.4s, v27.4s, v30.s[0]\n"
+ "fmla v13.4s, v26.4s, v0.s[0]\n"
+ "fmla v19.4s, v26.4s, v31.s[0]\n"
+ "fmla v25.4s, v26.4s, v30.s[0]\n"
+ "cbnz x25, 107b\n"
+ "108:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 101b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "tbz %x[flags], #1, 109f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v27.4s\n"
+ "fmin v9.4s, v9.4s, v27.4s\n"
+ "fmin v10.4s, v10.4s, v27.4s\n"
+ "fmin v11.4s, v11.4s, v27.4s\n"
+ "fmin v12.4s, v12.4s, v27.4s\n"
+ "fmin v13.4s, v13.4s, v27.4s\n"
+ "fmin v14.4s, v14.4s, v27.4s\n"
+ "fmin v15.4s, v15.4s, v27.4s\n"
+ "fmin v16.4s, v16.4s, v27.4s\n"
+ "fmin v17.4s, v17.4s, v27.4s\n"
+ "fmin v18.4s, v18.4s, v27.4s\n"
+ "fmin v19.4s, v19.4s, v27.4s\n"
+ "fmin v20.4s, v20.4s, v27.4s\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "fmin v23.4s, v23.4s, v27.4s\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
+ "fmax v8.4s, v8.4s, v26.4s\n"
+ "fmax v9.4s, v9.4s, v26.4s\n"
+ "fmax v10.4s, v10.4s, v26.4s\n"
+ "fmax v11.4s, v11.4s, v26.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "109:" // Height 3: No activation
+ "cmp x9, #0x18\n"
+ "bge 122f\n"
+ "tbz x9, #4, 113f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "st1 { v9.4s }, [x27], #0x10\n"
+ "st1 { v10.4s }, [x27], #0x10\n"
+ "st1 { v11.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 111f\n"
+ "st1 { v12.4s }, [x27], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 110f\n"
+ "str d13, [x27], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "tbz x9, #0, 121f\n"
+ "st1 { v13.s }[2], [x27]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "b 121f\n"
+ "110:" // Height 3: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 121f\n"
+ "str s13, [x27, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "b 121f\n"
+ "111:" // Height 3: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 112f\n"
+ "str d12, [x27], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x9, #0, 121f\n"
+ "st1 { v12.s }[2], [x27]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "b 121f\n"
+ "112:" // Height 3: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 121f\n"
+ "str s12, [x27, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "b 121f\n"
+ "113:" // Height 3: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 117f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "st1 { v9.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 115f\n"
+ "st1 { v10.4s }, [x27], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 114f\n"
+ "str d11, [x27], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "tbz x9, #0, 121f\n"
+ "st1 { v11.s }[2], [x27]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "b 121f\n"
+ "114:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 121f\n"
+ "str s11, [x27, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "b 121f\n"
+ "115:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 116f\n"
+ "str d10, [x27], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz x9, #0, 121f\n"
+ "st1 { v10.s }[2], [x27]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "b 121f\n"
+ "116:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 121f\n"
+ "str s10, [x27, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "b 121f\n"
+ "117:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 119f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 118f\n"
+ "str d9, [x27], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz x9, #0, 121f\n"
+ "st1 { v9.s }[2], [x27]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "b 121f\n"
+ "118:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 121f\n"
+ "str s9, [x27, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "b 121f\n"
+ "119:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 120f\n"
+ "str d8, [x27], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz x9, #0, 121f\n"
+ "st1 { v8.s }[2], [x27]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "b 121f\n"
+ "120:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x27, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "121:" // Height 3: Partial direct writeback: Done
+ "b 123f\n"
+ "122:" // Height 3: Full writeback
+ "str q8, [x27, #0x0]\n"
+ "str q9, [x27, #0x10]\n"
+ "str q10, [x27, #0x20]\n"
+ "str q11, [x27, #0x30]\n"
+ "str q12, [x27, #0x40]\n"
+ "str q13, [x27, #0x50]\n"
+ "add x27, x27, #0x60\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
+ "123:" // Height 3: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 84b\n"
+ "b 166f\n"
+ "124:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x10\n"
+ "mov x10, %x[bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "125:" // Height 4: Column loop
+ "cbz x10, 126f\n"
+ "ldr q8, [x10, #0x0]\n"
+ "ldr q9, [x10, #0x10]\n"
+ "mov v14.16b, v8.16b\n"
+ "mov v15.16b, v9.16b\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
+ "mov v16.16b, v10.16b\n"
+ "mov v17.16b, v11.16b\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "add x10, x10, #0x60\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v26.16b, v8.16b\n"
+ "mov v27.16b, v9.16b\n"
+ "mov v28.16b, v10.16b\n"
+ "mov v29.16b, v11.16b\n"
+ "mov v30.16b, v12.16b\n"
+ "mov v31.16b, v13.16b\n"
+ "b 141f\n"
+ "126:" // Height 4: no bias
+ "tbz %x[flags], #0, 140f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x9, #0x18\n"
+ "add x21, x22, x20, LSL #2\n"
+ "bge 139f\n"
+ "tbz x9, #4, 130f\n"
+ "ld1 { v8.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v11.4s }, [x27], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 128f\n"
+ "ld1 { v12.4s }, [x27], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 127f\n"
+ "ldr d13, [x27], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "mov x20, #0x58\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x9, #0, 138f\n"
+ "ld1 { v13.s }[2], [x27]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 138f\n"
+ "127:" // Height 4: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x9, #0, 138f\n"
+ "ldr s13, [x27, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 138f\n"
+ "128:" // Height 4: Partial accumulate: partial_2_16
+ "tbz x9, #1, 129f\n"
+ "ldr d12, [x27], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "mov x20, #0x48\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x9, #0, 138f\n"
+ "ld1 { v12.s }[2], [x27]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 138f\n"
+ "129:" // Height 4: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x9, #0, 138f\n"
+ "ldr s12, [x27, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 138f\n"
+ "130:" // Height 4: Partial accumulate: partial_8_0
+ "tbz x9, #3, 134f\n"
+ "ld1 { v8.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 132f\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 131f\n"
+ "ldr d11, [x27], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x9, #0, 138f\n"
+ "ld1 { v11.s }[2], [x27]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 138f\n"
+ "131:" // Height 4: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x9, #0, 138f\n"
+ "ldr s11, [x27, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 138f\n"
+ "132:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x9, #1, 133f\n"
+ "ldr d10, [x27], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x9, #0, 138f\n"
+ "ld1 { v10.s }[2], [x27]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 138f\n"
+ "133:" // Height 4: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x9, #0, 138f\n"
+ "ldr s10, [x27, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "b 138f\n"
+ "134:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x9, #2, 136f\n"
+ "ld1 { v8.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 135f\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x9, #0, 138f\n"
+ "ld1 { v9.s }[2], [x27]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "b 138f\n"
+ "135:" // Height 4: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x9, #0, 138f\n"
+ "ldr s9, [x27, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "b 138f\n"
+ "136:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x9, #1, 137f\n"
+ "ldr d8, [x27], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x9, #0, 138f\n"
+ "ld1 { v8.s }[2], [x27]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "b 138f\n"
+ "137:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s8, [x27, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "138:" // Height 4: Partial accumulate: Done
+ "sub x27, x27, x20\n"
+ "b 141f\n"
+ "139:" // Height 4: full accumulate
+ "ldr q8, [x27, #0x0]\n"
+ "ldr q9, [x27, #0x10]\n"
+ "ldr q10, [x27, #0x20]\n"
+ "ldr q11, [x27, #0x30]\n"
+ "ldr q12, [x27, #0x40]\n"
+ "ldr q13, [x27, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x22, #0x40]\n"
+ "ldr q25, [x22, #0x50]\n"
+ "ldr q26, [x21, #0x0]\n"
+ "ldr q27, [x21, #0x10]\n"
+ "ldr q28, [x21, #0x20]\n"
+ "ldr q29, [x21, #0x30]\n"
+ "ldr q30, [x21, #0x40]\n"
+ "ldr q31, [x21, #0x50]\n"
+ "b 141f\n"
+ "140:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "141:" // Height 4: setup done
+ "mov x26, #0x0\n"
+ "142:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 143f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
+ "cbnz x26, 144f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
+ "b 144f\n"
+ "143:" // Height 4: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
+ "144:" // Height 4: input setup done
+ "cmp x25, #0x4\n"
+ "blt 147f\n"
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "cmp x25, #0x8\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "blt 146f\n"
+ "145:" // Height 4: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "sub x25, x25, #0x4\n"
+ "add x24, x24, #0x10\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "fmla v26.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "fmla v27.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "cmp x25, #0x8\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "fmla v28.4s, v6.4s, v3.s[0]\n"
+ "ldr q6, [x28, #0x60]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "fmla v29.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x28, #0x70]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "fmla v30.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x28, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "fmla v31.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x28, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v20.4s, v6.4s, v2.s[1]\n"
+ "fmla v26.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x28, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v21.4s, v7.4s, v2.s[1]\n"
+ "fmla v27.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[1]\n"
+ "fmla v28.4s, v4.4s, v3.s[1]\n"
+ "ldr q4, [x28, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "fmla v23.4s, v5.4s, v2.s[1]\n"
+ "fmla v29.4s, v5.4s, v3.s[1]\n"
+ "ldr q5, [x28, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "fmla v24.4s, v6.4s, v2.s[1]\n"
+ "fmla v30.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x28, #0xe0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "fmla v25.4s, v7.4s, v2.s[1]\n"
+ "fmla v31.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "fmla v20.4s, v4.4s, v2.s[2]\n"
+ "fmla v26.4s, v4.4s, v3.s[2]\n"
+ "ldr q4, [x28, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "fmla v21.4s, v5.4s, v2.s[2]\n"
+ "fmla v27.4s, v5.4s, v3.s[2]\n"
+ "ldr q5, [x28, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "fmla v22.4s, v6.4s, v2.s[2]\n"
+ "fmla v28.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x28, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "fmla v23.4s, v7.4s, v2.s[2]\n"
+ "fmla v29.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x28, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "fmla v24.4s, v4.4s, v2.s[2]\n"
+ "fmla v30.4s, v4.4s, v3.s[2]\n"
+ "ldr q4, [x28, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "fmla v25.4s, v5.4s, v2.s[2]\n"
+ "fmla v31.4s, v5.4s, v3.s[2]\n"
+ "ldr q5, [x28, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v20.4s, v6.4s, v2.s[3]\n"
+ "fmla v26.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x28, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v21.4s, v7.4s, v2.s[3]\n"
+ "fmla v27.4s, v7.4s, v3.s[3]\n"
+ "ldr q7, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "fmla v28.4s, v4.4s, v3.s[3]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "fmla v29.4s, v5.4s, v3.s[3]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "fmla v30.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "ldr q0, [x24, #0x0]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "fmla v31.4s, v7.4s, v3.s[3]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "bge 145b\n"
+ "146:" // Height 4: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "fmla v26.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "add x21, x21, #0x10\n"
+ "sub x25, x25, #0x4\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "fmla v27.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "fmla v28.4s, v6.4s, v3.s[0]\n"
+ "ldr q6, [x28, #0x60]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "fmla v29.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x28, #0x70]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "fmla v30.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x28, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "fmla v31.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x28, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v20.4s, v6.4s, v2.s[1]\n"
+ "fmla v26.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x28, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v21.4s, v7.4s, v2.s[1]\n"
+ "fmla v27.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[1]\n"
+ "fmla v28.4s, v4.4s, v3.s[1]\n"
+ "ldr q4, [x28, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "fmla v23.4s, v5.4s, v2.s[1]\n"
+ "fmla v29.4s, v5.4s, v3.s[1]\n"
+ "ldr q5, [x28, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "fmla v24.4s, v6.4s, v2.s[1]\n"
+ "fmla v30.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x28, #0xe0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "fmla v25.4s, v7.4s, v2.s[1]\n"
+ "fmla v31.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "fmla v20.4s, v4.4s, v2.s[2]\n"
+ "fmla v26.4s, v4.4s, v3.s[2]\n"
+ "ldr q4, [x28, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "fmla v21.4s, v5.4s, v2.s[2]\n"
+ "fmla v27.4s, v5.4s, v3.s[2]\n"
+ "ldr q5, [x28, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "fmla v22.4s, v6.4s, v2.s[2]\n"
+ "fmla v28.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x28, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "fmla v23.4s, v7.4s, v2.s[2]\n"
+ "fmla v29.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x28, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "fmla v24.4s, v4.4s, v2.s[2]\n"
+ "fmla v30.4s, v4.4s, v3.s[2]\n"
+ "ldr q4, [x28, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "fmla v25.4s, v5.4s, v2.s[2]\n"
+ "fmla v31.4s, v5.4s, v3.s[2]\n"
+ "ldr q5, [x28, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v20.4s, v6.4s, v2.s[3]\n"
+ "fmla v26.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x28, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v21.4s, v7.4s, v2.s[3]\n"
+ "fmla v27.4s, v7.4s, v3.s[3]\n"
+ "ldr q7, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "fmla v28.4s, v4.4s, v3.s[3]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "fmla v29.4s, v5.4s, v3.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "fmla v30.4s, v6.4s, v3.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "fmla v31.4s, v7.4s, v3.s[3]\n"
+ "147:" // Height 4: Multiply loop: Main loop skip
+ "cbz x25, 149f\n"
+ "148:" // Height 4: Multiply loop: Odd block loop
+ "ldr s7, [x24], #0x4\n"
+ "ldr s6, [x23], #0x4\n"
+ "sub x25, x25, #0x1\n"
+ "ldr s5, [x22], #0x4\n"
+ "ldr s4, [x21], #0x4\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q0, [x28, #0x10]\n"
+ "fmla v8.4s, v1.4s, v7.s[0]\n"
+ "fmla v14.4s, v1.4s, v6.s[0]\n"
+ "ldr q3, [x28, #0x20]\n"
+ "ldr q2, [x28, #0x30]\n"
+ "fmla v20.4s, v1.4s, v5.s[0]\n"
+ "fmla v26.4s, v1.4s, v4.s[0]\n"
+ "ldr q1, [x28, #0x40]\n"
+ "fmla v9.4s, v0.4s, v7.s[0]\n"
+ "fmla v15.4s, v0.4s, v6.s[0]\n"
+ "fmla v21.4s, v0.4s, v5.s[0]\n"
+ "fmla v27.4s, v0.4s, v4.s[0]\n"
+ "ldr q0, [x28, #0x50]\n"
+ "add x28, x28, #0x60\n"
+ "fmla v10.4s, v3.4s, v7.s[0]\n"
+ "fmla v16.4s, v3.4s, v6.s[0]\n"
+ "fmla v22.4s, v3.4s, v5.s[0]\n"
+ "fmla v28.4s, v3.4s, v4.s[0]\n"
+ "fmla v11.4s, v2.4s, v7.s[0]\n"
+ "fmla v17.4s, v2.4s, v6.s[0]\n"
+ "fmla v23.4s, v2.4s, v5.s[0]\n"
+ "fmla v29.4s, v2.4s, v4.s[0]\n"
+ "fmla v12.4s, v1.4s, v7.s[0]\n"
+ "fmla v18.4s, v1.4s, v6.s[0]\n"
+ "fmla v24.4s, v1.4s, v5.s[0]\n"
+ "fmla v30.4s, v1.4s, v4.s[0]\n"
+ "fmla v13.4s, v0.4s, v7.s[0]\n"
+ "fmla v19.4s, v0.4s, v6.s[0]\n"
+ "fmla v25.4s, v0.4s, v5.s[0]\n"
+ "fmla v31.4s, v0.4s, v4.s[0]\n"
+ "cbnz x25, 148b\n"
+ "149:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 142b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 150f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "fmin v31.4s, v31.4s, v1.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v0.4s\n"
+ "fmax v29.4s, v29.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v0.4s\n"
+ "fmax v31.4s, v31.4s, v0.4s\n"
+ "150:" // Height 4: No activation
+ "cmp x9, #0x18\n"
+ "bge 163f\n"
+ "tbz x9, #4, 154f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "st1 { v9.4s }, [x27], #0x10\n"
+ "st1 { v10.4s }, [x27], #0x10\n"
+ "st1 { v11.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v27.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 152f\n"
+ "st1 { v12.4s }, [x27], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 151f\n"
+ "str d13, [x27], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x9, #0, 162f\n"
+ "st1 { v13.s }[2], [x27]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 162f\n"
+ "151:" // Height 4: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 162f\n"
+ "str s13, [x27, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "b 162f\n"
+ "152:" // Height 4: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 153f\n"
+ "str d12, [x27], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x9, #0, 162f\n"
+ "st1 { v12.s }[2], [x27]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 162f\n"
+ "153:" // Height 4: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 162f\n"
+ "str s12, [x27, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "b 162f\n"
+ "154:" // Height 4: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 158f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "st1 { v9.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v27.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 156f\n"
+ "st1 { v10.4s }, [x27], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 155f\n"
+ "str d11, [x27], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x9, #0, 162f\n"
+ "st1 { v11.s }[2], [x27]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "b 162f\n"
+ "155:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 162f\n"
+ "str s11, [x27, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "b 162f\n"
+ "156:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 157f\n"
+ "str d10, [x27], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x9, #0, 162f\n"
+ "st1 { v10.s }[2], [x27]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "b 162f\n"
+ "157:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 162f\n"
+ "str s10, [x27, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "b 162f\n"
+ "158:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 160f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 159f\n"
+ "str d9, [x27], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x9, #0, 162f\n"
+ "st1 { v9.s }[2], [x27]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "b 162f\n"
+ "159:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 162f\n"
+ "str s9, [x27, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "b 162f\n"
+ "160:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 161f\n"
+ "str d8, [x27], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x9, #0, 162f\n"
+ "st1 { v8.s }[2], [x27]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "b 162f\n"
+ "161:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x27, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "162:" // Height 4: Partial direct writeback: Done
+ "b 164f\n"
+ "163:" // Height 4: Full writeback
+ "str q8, [x27, #0x0]\n"
+ "str q9, [x27, #0x10]\n"
+ "str q10, [x27, #0x20]\n"
+ "str q11, [x27, #0x30]\n"
+ "str q12, [x27, #0x40]\n"
+ "str q13, [x27, #0x50]\n"
+ "add x27, x27, #0x60\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
+ "str q26, [x21, #0x0]\n"
+ "str q27, [x21, #0x10]\n"
+ "str q28, [x21, #0x20]\n"
+ "str q29, [x21, #0x30]\n"
+ "str q30, [x21, #0x40]\n"
+ "str q31, [x21, #0x50]\n"
+ "164:" // Height 4: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 125b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 166f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 165f\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "165:" // Update direct input
+ "mov x20, #0x10\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "166:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
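
For orientation, the "Height 4" block above is the widest path of this 4x24 hybrid kernel: v8-v31 hold a 4-row by 24-column FP32 accumulator tile (six q-registers of four lanes per row), each unrolled k-step issues one fmla per accumulator with a broadcast A element, and the optional activation applies fmin/fmax before writeback (the bias path instead seeds every row of the tile from the same 24 bias values). What follows is a minimal scalar sketch of that contract, assuming a B panel pre-packed to 24 contiguous floats per k-step; the function name and signature are illustrative, not the library's.

#include <algorithm>
#include <cstddef>

// Scalar reference for one 4x24 tile: C (+)= A * B_panel, then clamp.
// B_panel is assumed pre-interleaved so each k-step reads 24 contiguous
// floats, mirroring the six 128-bit loads per fmla group above.
static void ref_hybrid_fp32_mla_4x24(const float *A, size_t lda,
                                     const float *B_panel,
                                     float *C, size_t ldc, size_t K,
                                     bool accumulate, float minval, float maxval)
{
    float acc[4][24];
    for (int r = 0; r < 4; r++)
        for (int c = 0; c < 24; c++)
            acc[r][c] = accumulate ? C[r * ldc + c] : 0.0f; // "no accumulate" zeroes v8-v31

    for (size_t k = 0; k < K; k++)
        for (int r = 0; r < 4; r++)
            for (int c = 0; c < 24; c++)
                acc[r][c] += A[r * lda + k] * B_panel[k * 24 + c]; // fmla vN.4s, vB.4s, vA.s[lane]

    for (int r = 0; r < 4; r++)
        for (int c = 0; c < 24; c++) // fmin/fmax activation, then writeback
            C[r * ldc + c] = std::min(std::max(acc[r][c], minval), maxval);
}
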
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
index 7f83e617c5..7f85d2dd42 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,21 +10,21 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
-#include "../std_transforms_fixed.hpp"
+#include "../std_transforms_fixed_trB.hpp"
#include "../performance_parameters.hpp"
#define ARGLIST \
@@ -44,7 +44,8 @@ void a64_hybrid_fp32_mla_6x16_a55( ARGLIST );
class cls_a64_hybrid_fp32_mla_6x16
{
public:
- typedef float operand_type;
+ typedef float lhs_operand_type;
+ typedef float rhs_operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,20 +71,28 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 6, 16, 1> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixedTRB<rhs_operand_type, result_type, 6, 16, 1> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 3.04 };
- case CPUModel::A53:
- return { 1.43 };
- case CPUModel::A73:
- return { 2.56 };
- default:
- return { 6.667 };
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 2.986 };
+ case CPUModel::A53:
+ return { 1.43 };
+ case CPUModel::A73:
+ return { 2.56 };
+ case CPUModel::A510:
+ return { 3.88 };
+ case CPUModel::V1:
+ return { 13.43 };
+ default:
+ return { 6.667 };
+ }
}
+
+ return { 1.0 };
}
// Default to the generic kernel
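
The templated hook above lets the kernel selector ask for a per-type throughput estimate; non-float instantiations fall through to the `{ 1.0 }` default. Below is a hedged sketch of a caller, assuming `PerformanceParameters`' single brace-initialized field is the MACs-per-cycle figure (`kernel_macs_cycle`); the helper itself is hypothetical and the library's headers are assumed to be in scope.

// Hypothetical selector-side helper: rank kernel candidates by their
// FP32 cycle estimate. Only get_performance_parameters<T>() is taken
// from the header above; the field name kernel_macs_cycle is assumed.
template <typename Strategy>
float fp32_macs_per_cycle(const CPUInfo *ci)
{
    return Strategy::template get_performance_parameters<float>(ci).kernel_macs_cycle;
}

// e.g. fp32_macs_per_cycle<cls_a64_hybrid_fp32_mla_6x16>(ci) would yield
// 2.986 on A55r1 and 13.43 on V1, per the table in the hunk above.
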
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
index 184cfaf95c..ddbc840829 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_6x16_a55 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 166f\n"
@@ -102,82 +101,82 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"cmp %x[M], #0x2\n"
"bgt 67f\n"
"beq 34f\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x16, %x[bias]\n"
- "mov x15, %x[output_ptr]\n"
+ "mov x16, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "cbz x16, 3f\n"
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "cbz x7, 3f\n"
+ "ldr q8, [x7, #0x0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "add x7, x7, #0x40\n"
"b 14f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 13f\n"
"cmp x8, #0x10\n"
"bge 12f\n"
"tbz x8, #3, 7f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
+ "ld1 { v8.4s }, [x16], #0x10\n"
+ "ld1 { v9.4s }, [x16], #0x10\n"
"tbz x8, #2, 5f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
+ "ld1 { v10.4s }, [x16], #0x10\n"
"tbz x8, #1, 4f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x15], #0x8\n"
+ "ldr d11, [x16], #0x8\n"
+ "mov x20, #0x38\n"
"tbz x8, #0, 11f\n"
- "ld1 { v11.s }[2], [x15]\n"
+ "ld1 { v11.s }[2], [x16]\n"
"b 11f\n"
"4:" // Height 1: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x8, #0, 11f\n"
- "ldr s11, [x15, #0x0]\n"
+ "ldr s11, [x16, #0x0]\n"
"b 11f\n"
"5:" // Height 1: Partial accumulate: partial_2_8
"tbz x8, #1, 6f\n"
- "ldr d10, [x15], #0x8\n"
- "mov x19, #0x28\n"
+ "ldr d10, [x16], #0x8\n"
+ "mov x20, #0x28\n"
"tbz x8, #0, 11f\n"
- "ld1 { v10.s }[2], [x15]\n"
+ "ld1 { v10.s }[2], [x16]\n"
"b 11f\n"
"6:" // Height 1: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x8, #0, 11f\n"
- "ldr s10, [x15, #0x0]\n"
+ "ldr s10, [x16, #0x0]\n"
"b 11f\n"
"7:" // Height 1: Partial accumulate: partial_4_0
"tbz x8, #2, 9f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v8.4s }, [x16], #0x10\n"
"tbz x8, #1, 8f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x15], #0x8\n"
+ "ldr d9, [x16], #0x8\n"
+ "mov x20, #0x18\n"
"tbz x8, #0, 11f\n"
- "ld1 { v9.s }[2], [x15]\n"
+ "ld1 { v9.s }[2], [x16]\n"
"b 11f\n"
"8:" // Height 1: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x8, #0, 11f\n"
- "ldr s9, [x15, #0x0]\n"
+ "ldr s9, [x16, #0x0]\n"
"b 11f\n"
"9:" // Height 1: Partial accumulate: partial_2_0
"tbz x8, #1, 10f\n"
- "ldr d8, [x15], #0x8\n"
- "mov x19, #0x8\n"
+ "ldr d8, [x16], #0x8\n"
+ "mov x20, #0x8\n"
"tbz x8, #0, 11f\n"
- "ld1 { v8.s }[2], [x15]\n"
+ "ld1 { v8.s }[2], [x16]\n"
"b 11f\n"
"10:" // Height 1: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr s8, [x16, #0x0]\n"
+ "mov x20, #0x0\n"
"11:" // Height 1: Partial accumulate: Done
- "sub x15, x15, x19\n"
+ "sub x16, x16, x20\n"
"b 14f\n"
"12:" // Height 1: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
"b 14f\n"
"13:" // Height 1: no accumulate
"movi v8.16b, #0x0\n"
@@ -185,330 +184,330 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
"14:" // Height 1: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "cbnz x14, 17f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "cbnz x15, 17f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20, LSL #2\n"
"b 17f\n"
"16:" // Height 1: setup direct input
- "mov x12, %x[input_ptr]\n"
+ "mov x13, %x[input_ptr]\n"
"17:" // Height 1: input setup done
- "cmp x13, #0x4\n"
+ "cmp x14, #0x4\n"
"blt 20f\n"
- "ldr q0, [x12, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x8\n"
"ldr q6, [x17, #0x0]\n"
- "cmp x13, #0x8\n"
+ "ldr q7, [x17, #0x10]\n"
"blt 19f\n"
"18:" // Height 1: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr d7, [x17, #0x10]\n"
- "ldr x11, [x17, #0x18]\n"
- "add x12, x12, #0x10\n"
- "ldr d6, [x17, #0x20]\n"
- "sub x13, x13, #0x4\n"
- "ldr x10, [x17, #0x28]\n"
- "cmp x13, #0x8\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr d17, [x17, #0x20]\n"
+ "ldr x20, [x17, #0x28]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "ldr x10, [x17, #0x48]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x58]\n"
- "ldr x9, [x12, #0x8]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "ldr x10, [x17, #0x68]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "ldr x10, [x17, #0x88]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "ldr x10, [x17, #0xa8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "ldr x10, [x17, #0xc8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "ldr x10, [x17, #0xe8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0xf0]\n"
+ "ldr d16, [x17, #0x30]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x38]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "ldr d17, [x17, #0x40]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr d16, [x17, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr d17, [x17, #0x60]\n"
+ "ldr x20, [x17, #0x68]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr d16, [x17, #0x70]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x78]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "ldr d17, [x17, #0x80]\n"
+ "ldr x20, [x17, #0x88]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr d16, [x17, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr d17, [x17, #0xa0]\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr d16, [x17, #0xb0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr d17, [x17, #0xc0]\n"
+ "ldr x20, [x17, #0xc8]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr d16, [x17, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr d17, [x17, #0xe0]\n"
+ "ldr x20, [x17, #0xe8]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr d16, [x17, #0xf0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "mov v16.d[1], x20\n"
+ "add x13, x13, #0x10\n"
"add x17, x17, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
"ldr d6, [x17, #0x0]\n"
- "ldr x10, [x17, #0x8]\n"
- "mov v7.d[1], x11\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d0, [x12, #0x0]\n"
- "mov v0.d[1], x9\n"
+ "ldr x20, [x17, #0x8]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ "sub x14, x14, #0x4\n"
+ "ldr d7, [x17, #0x10]\n"
+ "cmp x14, #0x8\n"
+ "ldr x21, [x13, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v0.d[1], x21\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"bge 18b\n"
"19:" // Height 1: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x10]\n"
- "ldr q6, [x17, #0x20]\n"
- "sub x13, x13, #0x4\n"
- "add x12, x12, #0x10\n"
+ "ldr q17, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "ldr q17, [x17, #0x40]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr q16, [x17, #0x50]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x17, #0x60]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x17, #0x70]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x17, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x17, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x17, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x17, #0xb0]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x17, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x17, #0xd0]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x17, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x17, #0xf0]\n"
+ "add x13, x13, #0x10\n"
+ "sub x14, x14, #0x4\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
"add x17, x17, #0x100\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
"20:" // Height 1: Multiply loop: Main loop skip
- "cbz x13, 22f\n"
+ "cbz x14, 22f\n"
"21:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x1\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
+ "ldr s17, [x13], #0x4\n"
+ "sub x14, x14, #0x1\n"
+ "ldr q16, [x17, #0x0]\n"
+ "fmla v8.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x17, #0x10]\n"
+ "fmla v9.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x17, #0x20]\n"
+ "fmla v10.4s, v16.4s, v17.s[0]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v11.4s, v16.4s, v17.s[0]\n"
"add x17, x17, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "cbnz x13, 21b\n"
+ "cbnz x14, 21b\n"
"22:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 15b\n"
- "prfm pstl1keep, [x15, #0x0]\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"tbz %x[flags], #1, 23f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v16.4s\n"
+ "fmin v9.4s, v9.4s, v16.4s\n"
+ "fmin v10.4s, v10.4s, v16.4s\n"
+ "fmin v11.4s, v11.4s, v16.4s\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
"23:" // Height 1: No activation
"cmp x8, #0x10\n"
"bge 32f\n"
"tbz x8, #3, 27f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
+ "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v9.4s }, [x16], #0x10\n"
"tbz x8, #2, 25f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
+ "st1 { v10.4s }, [x16], #0x10\n"
"tbz x8, #1, 24f\n"
- "str d11, [x15], #0x8\n"
+ "str d11, [x16], #0x8\n"
"tbz x8, #0, 31f\n"
- "st1 { v11.s }[2], [x15]\n"
+ "st1 { v11.s }[2], [x16]\n"
"b 31f\n"
"24:" // Height 1: Partial direct writeback: partial_1_12
"tbz x8, #0, 31f\n"
- "str s11, [x15, #0x0]\n"
+ "str s11, [x16, #0x0]\n"
"b 31f\n"
"25:" // Height 1: Partial direct writeback: partial_2_8
"tbz x8, #1, 26f\n"
- "str d10, [x15], #0x8\n"
+ "str d10, [x16], #0x8\n"
"tbz x8, #0, 31f\n"
- "st1 { v10.s }[2], [x15]\n"
+ "st1 { v10.s }[2], [x16]\n"
"b 31f\n"
"26:" // Height 1: Partial direct writeback: partial_1_8
"tbz x8, #0, 31f\n"
- "str s10, [x15, #0x0]\n"
+ "str s10, [x16, #0x0]\n"
"b 31f\n"
"27:" // Height 1: Partial direct writeback: partial_4_0
"tbz x8, #2, 29f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v8.4s }, [x16], #0x10\n"
"tbz x8, #1, 28f\n"
- "str d9, [x15], #0x8\n"
+ "str d9, [x16], #0x8\n"
"tbz x8, #0, 31f\n"
- "st1 { v9.s }[2], [x15]\n"
+ "st1 { v9.s }[2], [x16]\n"
"b 31f\n"
"28:" // Height 1: Partial direct writeback: partial_1_4
"tbz x8, #0, 31f\n"
- "str s9, [x15, #0x0]\n"
+ "str s9, [x16, #0x0]\n"
"b 31f\n"
"29:" // Height 1: Partial direct writeback: partial_2_0
"tbz x8, #1, 30f\n"
- "str d8, [x15], #0x8\n"
+ "str d8, [x16], #0x8\n"
"tbz x8, #0, 31f\n"
- "st1 { v8.s }[2], [x15]\n"
+ "st1 { v8.s }[2], [x16]\n"
"b 31f\n"
"30:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
+ "str s8, [x16, #0x0]\n"
"31:" // Height 1: Partial direct writeback: Done
"b 33f\n"
"32:" // Height 1: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x16, #0x0]\n"
+ "str q9, [x16, #0x10]\n"
+ "str q10, [x16, #0x20]\n"
+ "str q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
"33:" // Height 1: Writeback done
"subs x8, x8, #0x10\n"
"bgt 2b\n"
"b 200f\n"
"34:" // Height 2
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x16, %x[bias]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[output_ptr]\n"
+ "mov x16, %x[output_ptr]\n"
"35:" // Height 2: Column loop
- "cbz x16, 36f\n"
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
+ "cbz x7, 36f\n"
+ "ldr q8, [x7, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q11, [x16, #0x30]\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "add x16, x16, #0x40\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
+ "add x7, x7, #0x40\n"
"b 47f\n"
"36:" // Height 2: no bias
"tbz %x[flags], #0, 46f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x8, #0x10\n"
- "add x25, x15, x19, LSL #2\n"
+ "add x25, x16, x20, LSL #2\n"
"bge 45f\n"
"tbz x8, #3, 40f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v8.4s }, [x16], #0x10\n"
"ld1 { v12.4s }, [x25], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
+ "ld1 { v9.4s }, [x16], #0x10\n"
"ld1 { v13.4s }, [x25], #0x10\n"
"tbz x8, #2, 38f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
+ "ld1 { v10.4s }, [x16], #0x10\n"
"ld1 { v14.4s }, [x25], #0x10\n"
"tbz x8, #1, 37f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x15], #0x8\n"
+ "ldr d11, [x16], #0x8\n"
+ "mov x20, #0x38\n"
"ldr d15, [x25], #0x8\n"
"tbz x8, #0, 44f\n"
- "ld1 { v11.s }[2], [x15]\n"
+ "ld1 { v11.s }[2], [x16]\n"
"ld1 { v15.s }[2], [x25]\n"
"b 44f\n"
"37:" // Height 2: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x8, #0, 44f\n"
- "ldr s11, [x15, #0x0]\n"
+ "ldr s11, [x16, #0x0]\n"
"ldr s15, [x25, #0x0]\n"
"b 44f\n"
"38:" // Height 2: Partial accumulate: partial_2_8
"tbz x8, #1, 39f\n"
- "ldr d10, [x15], #0x8\n"
+ "ldr d10, [x16], #0x8\n"
+ "mov x20, #0x28\n"
"ldr d14, [x25], #0x8\n"
- "mov x19, #0x28\n"
"tbz x8, #0, 44f\n"
- "ld1 { v10.s }[2], [x15]\n"
+ "ld1 { v10.s }[2], [x16]\n"
"ld1 { v14.s }[2], [x25]\n"
"b 44f\n"
"39:" // Height 2: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x8, #0, 44f\n"
- "ldr s10, [x15, #0x0]\n"
+ "ldr s10, [x16, #0x0]\n"
"ldr s14, [x25, #0x0]\n"
"b 44f\n"
"40:" // Height 2: Partial accumulate: partial_4_0
"tbz x8, #2, 42f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v8.4s }, [x16], #0x10\n"
"ld1 { v12.4s }, [x25], #0x10\n"
"tbz x8, #1, 41f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x15], #0x8\n"
+ "ldr d9, [x16], #0x8\n"
+ "mov x20, #0x18\n"
"ldr d13, [x25], #0x8\n"
"tbz x8, #0, 44f\n"
- "ld1 { v9.s }[2], [x15]\n"
+ "ld1 { v9.s }[2], [x16]\n"
"ld1 { v13.s }[2], [x25]\n"
"b 44f\n"
"41:" // Height 2: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x8, #0, 44f\n"
- "ldr s9, [x15, #0x0]\n"
+ "ldr s9, [x16, #0x0]\n"
"ldr s13, [x25, #0x0]\n"
"b 44f\n"
"42:" // Height 2: Partial accumulate: partial_2_0
"tbz x8, #1, 43f\n"
- "ldr d8, [x15], #0x8\n"
+ "ldr d8, [x16], #0x8\n"
+ "mov x20, #0x8\n"
"ldr d12, [x25], #0x8\n"
- "mov x19, #0x8\n"
"tbz x8, #0, 44f\n"
- "ld1 { v8.s }[2], [x15]\n"
+ "ld1 { v8.s }[2], [x16]\n"
"ld1 { v12.s }[2], [x25]\n"
"b 44f\n"
"43:" // Height 2: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr s8, [x16, #0x0]\n"
+ "mov x20, #0x0\n"
"ldr s12, [x25, #0x0]\n"
"44:" // Height 2: Partial accumulate: Done
- "sub x15, x15, x19\n"
+ "sub x16, x16, x20\n"
"b 47f\n"
"45:" // Height 2: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
"ldr q12, [x25, #0x0]\n"
"ldr q13, [x25, #0x10]\n"
"ldr q14, [x25, #0x20]\n"
@@ -524,302 +523,302 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"movi v14.16b, #0x0\n"
"movi v15.16b, #0x0\n"
"47:" // Height 2: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x14, 50f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "cbnz x15, 50f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n"
"b 50f\n"
"49:" // Height 2: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19, LSL #2\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21, LSL #2\n"
"50:" // Height 2: input setup done
- "cmp x13, #0x4\n"
+ "cmp x14, #0x4\n"
"blt 53f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x8\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x8\n"
+ "ldr q1, [x12, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
"blt 52f\n"
"51:" // Height 2: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x18]\n"
- "ldr d6, [x17, #0x20]\n"
- "add x12, x12, #0x10\n"
- "ldr x10, [x17, #0x28]\n"
- "add x28, x28, #0x10\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "sub x13, x13, #0x4\n"
+ "ldr d17, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x10\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr x11, [x17, #0x38]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "ldr x10, [x17, #0x48]\n"
- "cmp x13, #0x8\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x58]\n"
- "ldr x9, [x12, #0x8]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x10\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr x10, [x17, #0x68]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x78]\n"
- "ldr x27, [x28, #0x8]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x10\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr x10, [x17, #0x88]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "mov v6.d[1], x10\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr x10, [x17, #0xa8]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x10\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr x10, [x17, #0xc8]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "mov v6.d[1], x10\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr x10, [x17, #0xe8]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "mov v6.d[1], x10\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr d16, [x17, #0x30]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.4s, v17.4s, v1.s[0]\n"
+ "ldr d17, [x17, #0x40]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla v15.4s, v16.4s, v1.s[0]\n"
+ "ldr d16, [x17, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v12.4s, v17.4s, v1.s[1]\n"
+ "ldr d17, [x17, #0x60]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v13.4s, v16.4s, v1.s[1]\n"
+ "ldr d16, [x17, #0x70]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.4s, v17.4s, v1.s[1]\n"
+ "ldr d17, [x17, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr x20, [x17, #0x88]\n"
+ "fmla v15.4s, v16.4s, v1.s[1]\n"
+ "ldr d16, [x17, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v12.4s, v17.4s, v1.s[2]\n"
+ "ldr d17, [x17, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v13.4s, v16.4s, v1.s[2]\n"
+ "ldr d16, [x17, #0xb0]\n"
+ "mov v17.d[1], x21\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v14.4s, v17.4s, v1.s[2]\n"
+ "ldr d17, [x17, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr x20, [x17, #0xc8]\n"
+ "fmla v15.4s, v16.4s, v1.s[2]\n"
+ "ldr d16, [x17, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v12.4s, v17.4s, v1.s[3]\n"
+ "ldr d17, [x17, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v13.4s, v16.4s, v1.s[3]\n"
+ "ldr d16, [x17, #0xf0]\n"
+ "mov v17.d[1], x21\n"
+ "add x13, x13, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
"add x17, x17, #0x100\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v14.4s, v17.4s, v1.s[3]\n"
"ldr d6, [x17, #0x0]\n"
- "mov v7.d[1], x11\n"
- "ldr x10, [x17, #0x8]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "ldr d0, [x12, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d1, [x28, #0x0]\n"
- "mov v0.d[1], x9\n"
- "mov v1.d[1], x27\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ "fmla v15.4s, v16.4s, v1.s[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "sub x14, x14, #0x4\n"
+ "ldr d7, [x17, #0x10]\n"
+ "cmp x14, #0x8\n"
+ "ldr x20, [x13, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v0.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v1.d[1], x21\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "add x13, x13, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "sub x13, x13, #0x4\n"
- "add x12, x12, #0x10\n"
+ "ldr q17, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x12, x12, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "add x28, x28, #0x10\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "sub x14, x14, #0x4\n"
+ "fmla v14.4s, v17.4s, v1.s[0]\n"
+ "ldr q17, [x17, #0x40]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "fmla v15.4s, v16.4s, v1.s[0]\n"
+ "ldr q16, [x17, #0x50]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v12.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x17, #0x60]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "fmla v13.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x17, #0x70]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "fmla v14.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x17, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "fmla v15.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x17, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "fmla v12.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x17, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "fmla v13.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x17, #0xb0]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "fmla v14.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x17, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "fmla v15.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x17, #0xd0]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v1.s[3]\n"
+ "ldr q17, [x17, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v1.s[3]\n"
+ "ldr q16, [x17, #0xf0]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
"add x17, x17, #0x100\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v14.4s, v17.4s, v1.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "fmla v15.4s, v16.4s, v1.s[3]\n"
"53:" // Height 2: Multiply loop: Main loop skip
- "cbz x13, 55f\n"
+ "cbz x14, 55f\n"
"54:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x1\n"
- "ldr s1, [x28], #0x4\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr s19, [x13], #0x4\n"
+ "sub x14, x14, #0x1\n"
+ "ldr s18, [x12], #0x4\n"
+ "ldr q17, [x17, #0x0]\n"
+ "fmla v8.4s, v17.4s, v19.s[0]\n"
+ "ldr q16, [x17, #0x10]\n"
+ "fmla v12.4s, v17.4s, v18.s[0]\n"
+ "ldr q17, [x17, #0x20]\n"
+ "fmla v9.4s, v16.4s, v19.s[0]\n"
+ "fmla v13.4s, v16.4s, v18.s[0]\n"
+ "ldr q16, [x17, #0x30]\n"
+ "fmla v10.4s, v17.4s, v19.s[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "cbnz x13, 54b\n"
+ "fmla v14.4s, v17.4s, v18.s[0]\n"
+ "fmla v11.4s, v16.4s, v19.s[0]\n"
+ "fmla v15.4s, v16.4s, v18.s[0]\n"
+ "cbnz x14, 54b\n"
"55:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 48b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "add x25, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #2\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 56f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v16.4s\n"
+ "fmin v9.4s, v9.4s, v16.4s\n"
+ "fmin v10.4s, v10.4s, v16.4s\n"
+ "fmin v11.4s, v11.4s, v16.4s\n"
+ "fmin v12.4s, v12.4s, v16.4s\n"
+ "fmin v13.4s, v13.4s, v16.4s\n"
+ "fmin v14.4s, v14.4s, v16.4s\n"
+ "fmin v15.4s, v15.4s, v16.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
+ "fmax v12.4s, v12.4s, v16.4s\n"
+ "fmax v13.4s, v13.4s, v16.4s\n"
+ "fmax v14.4s, v14.4s, v16.4s\n"
+ "fmax v15.4s, v15.4s, v16.4s\n"
"56:" // Height 2: No activation
"cmp x8, #0x10\n"
"bge 65f\n"
"tbz x8, #3, 60f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
+ "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v9.4s }, [x16], #0x10\n"
"st1 { v12.4s }, [x25], #0x10\n"
"st1 { v13.4s }, [x25], #0x10\n"
"tbz x8, #2, 58f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
+ "st1 { v10.4s }, [x16], #0x10\n"
"st1 { v14.4s }, [x25], #0x10\n"
"tbz x8, #1, 57f\n"
- "str d11, [x15], #0x8\n"
+ "str d11, [x16], #0x8\n"
"str d15, [x25], #0x8\n"
"tbz x8, #0, 64f\n"
- "st1 { v11.s }[2], [x15]\n"
+ "st1 { v11.s }[2], [x16]\n"
"st1 { v15.s }[2], [x25]\n"
"b 64f\n"
"57:" // Height 2: Partial direct writeback: partial_1_12
"tbz x8, #0, 64f\n"
- "str s11, [x15, #0x0]\n"
+ "str s11, [x16, #0x0]\n"
"str s15, [x25, #0x0]\n"
"b 64f\n"
"58:" // Height 2: Partial direct writeback: partial_2_8
"tbz x8, #1, 59f\n"
- "str d10, [x15], #0x8\n"
+ "str d10, [x16], #0x8\n"
"str d14, [x25], #0x8\n"
"tbz x8, #0, 64f\n"
- "st1 { v10.s }[2], [x15]\n"
+ "st1 { v10.s }[2], [x16]\n"
"st1 { v14.s }[2], [x25]\n"
"b 64f\n"
"59:" // Height 2: Partial direct writeback: partial_1_8
"tbz x8, #0, 64f\n"
- "str s10, [x15, #0x0]\n"
+ "str s10, [x16, #0x0]\n"
"str s14, [x25, #0x0]\n"
"b 64f\n"
"60:" // Height 2: Partial direct writeback: partial_4_0
"tbz x8, #2, 62f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v8.4s }, [x16], #0x10\n"
"st1 { v12.4s }, [x25], #0x10\n"
"tbz x8, #1, 61f\n"
- "str d9, [x15], #0x8\n"
+ "str d9, [x16], #0x8\n"
"str d13, [x25], #0x8\n"
"tbz x8, #0, 64f\n"
- "st1 { v9.s }[2], [x15]\n"
+ "st1 { v9.s }[2], [x16]\n"
"st1 { v13.s }[2], [x25]\n"
"b 64f\n"
"61:" // Height 2: Partial direct writeback: partial_1_4
"tbz x8, #0, 64f\n"
- "str s9, [x15, #0x0]\n"
+ "str s9, [x16, #0x0]\n"
"str s13, [x25, #0x0]\n"
"b 64f\n"
"62:" // Height 2: Partial direct writeback: partial_2_0
"tbz x8, #1, 63f\n"
- "str d8, [x15], #0x8\n"
+ "str d8, [x16], #0x8\n"
"str d12, [x25], #0x8\n"
"tbz x8, #0, 64f\n"
- "st1 { v8.s }[2], [x15]\n"
+ "st1 { v8.s }[2], [x16]\n"
"st1 { v12.s }[2], [x25]\n"
"b 64f\n"
"63:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
+ "str s8, [x16, #0x0]\n"
"str s12, [x25, #0x0]\n"
"64:" // Height 2: Partial direct writeback: Done
"b 66f\n"
"65:" // Height 2: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x16, #0x0]\n"
+ "str q9, [x16, #0x10]\n"
+ "str q10, [x16, #0x20]\n"
+ "str q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
"str q12, [x25, #0x0]\n"
"str q13, [x25, #0x10]\n"
"str q14, [x25, #0x20]\n"
@@ -829,125 +828,125 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"bgt 35b\n"
"b 200f\n"
"67:" // Height 3
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x16, %x[bias]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[output_ptr]\n"
+ "mov x16, %x[output_ptr]\n"
"68:" // Height 3: Column loop
- "cbz x16, 69f\n"
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
+ "cbz x7, 69f\n"
+ "ldr q8, [x7, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
- "mov v18.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "ldr q11, [x7, #0x30]\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"b 80f\n"
"69:" // Height 3: no bias
"tbz %x[flags], #0, 79f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #2\n"
"cmp x8, #0x10\n"
- "add x25, x15, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"bge 78f\n"
"tbz x8, #3, 73f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v8.4s }, [x16], #0x10\n"
"ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
+ "ld1 { v9.4s }, [x16], #0x10\n"
"ld1 { v13.4s }, [x25], #0x10\n"
"ld1 { v17.4s }, [x24], #0x10\n"
"tbz x8, #2, 71f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
+ "ld1 { v10.4s }, [x16], #0x10\n"
"ld1 { v14.4s }, [x25], #0x10\n"
"ld1 { v18.4s }, [x24], #0x10\n"
"tbz x8, #1, 70f\n"
- "ldr d11, [x15], #0x8\n"
- "mov x19, #0x38\n"
+ "ldr d11, [x16], #0x8\n"
+ "mov x20, #0x38\n"
"ldr d15, [x25], #0x8\n"
"ldr d19, [x24], #0x8\n"
"tbz x8, #0, 77f\n"
- "ld1 { v11.s }[2], [x15]\n"
+ "ld1 { v11.s }[2], [x16]\n"
"ld1 { v15.s }[2], [x25]\n"
"ld1 { v19.s }[2], [x24]\n"
"b 77f\n"
"70:" // Height 3: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x8, #0, 77f\n"
- "ldr s11, [x15, #0x0]\n"
+ "ldr s11, [x16, #0x0]\n"
"ldr s15, [x25, #0x0]\n"
"ldr s19, [x24, #0x0]\n"
"b 77f\n"
"71:" // Height 3: Partial accumulate: partial_2_8
"tbz x8, #1, 72f\n"
- "ldr d10, [x15], #0x8\n"
+ "ldr d10, [x16], #0x8\n"
+ "mov x20, #0x28\n"
"ldr d14, [x25], #0x8\n"
- "mov x19, #0x28\n"
"ldr d18, [x24], #0x8\n"
"tbz x8, #0, 77f\n"
- "ld1 { v10.s }[2], [x15]\n"
+ "ld1 { v10.s }[2], [x16]\n"
"ld1 { v14.s }[2], [x25]\n"
"ld1 { v18.s }[2], [x24]\n"
"b 77f\n"
"72:" // Height 3: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x8, #0, 77f\n"
- "ldr s10, [x15, #0x0]\n"
+ "ldr s10, [x16, #0x0]\n"
"ldr s14, [x25, #0x0]\n"
"ldr s18, [x24, #0x0]\n"
"b 77f\n"
"73:" // Height 3: Partial accumulate: partial_4_0
"tbz x8, #2, 75f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v8.4s }, [x16], #0x10\n"
"ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v16.4s }, [x24], #0x10\n"
"tbz x8, #1, 74f\n"
- "ldr d9, [x15], #0x8\n"
- "mov x19, #0x18\n"
+ "ldr d9, [x16], #0x8\n"
+ "mov x20, #0x18\n"
"ldr d13, [x25], #0x8\n"
"ldr d17, [x24], #0x8\n"
"tbz x8, #0, 77f\n"
- "ld1 { v9.s }[2], [x15]\n"
+ "ld1 { v9.s }[2], [x16]\n"
"ld1 { v13.s }[2], [x25]\n"
"ld1 { v17.s }[2], [x24]\n"
"b 77f\n"
"74:" // Height 3: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x8, #0, 77f\n"
- "ldr s9, [x15, #0x0]\n"
+ "ldr s9, [x16, #0x0]\n"
"ldr s13, [x25, #0x0]\n"
"ldr s17, [x24, #0x0]\n"
"b 77f\n"
"75:" // Height 3: Partial accumulate: partial_2_0
"tbz x8, #1, 76f\n"
- "ldr d8, [x15], #0x8\n"
+ "ldr d8, [x16], #0x8\n"
+ "mov x20, #0x8\n"
"ldr d12, [x25], #0x8\n"
- "mov x19, #0x8\n"
"ldr d16, [x24], #0x8\n"
"tbz x8, #0, 77f\n"
- "ld1 { v8.s }[2], [x15]\n"
+ "ld1 { v8.s }[2], [x16]\n"
"ld1 { v12.s }[2], [x25]\n"
"ld1 { v16.s }[2], [x24]\n"
"b 77f\n"
"76:" // Height 3: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr s8, [x16, #0x0]\n"
+ "mov x20, #0x0\n"
"ldr s12, [x25, #0x0]\n"
"ldr s16, [x24, #0x0]\n"
"77:" // Height 3: Partial accumulate: Done
- "sub x15, x15, x19\n"
+ "sub x16, x16, x20\n"
"b 80f\n"
"78:" // Height 3: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
"ldr q12, [x25, #0x0]\n"
"ldr q13, [x25, #0x10]\n"
"ldr q14, [x25, #0x20]\n"
@@ -971,376 +970,376 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"movi v18.16b, #0x0\n"
"movi v19.16b, #0x0\n"
"80:" // Height 3: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"81:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 82f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x14, 83f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "cbnz x15, 83f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n"
+ "add x11, x11, x20, LSL #2\n"
"b 83f\n"
"82:" // Height 3: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21, LSL #2\n"
+ "add x11, x12, x21, LSL #2\n"
"83:" // Height 3: input setup done
- "cmp x13, #0x4\n"
+ "cmp x14, #0x4\n"
"blt 86f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x8\n"
- "ldr q2, [x26, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x8\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
"blt 85f\n"
"84:" // Height 3: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x18]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr d6, [x17, #0x20]\n"
- "ldr x10, [x17, #0x28]\n"
- "add x12, x12, #0x10\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "ldr x11, [x17, #0x38]\n"
- "add x28, x28, #0x10\n"
+ "ldr d21, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x10\n"
+ "mov v21.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr x10, [x17, #0x48]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr x9, [x12, #0x8]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x40]\n"
- "add x26, x26, #0x10\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x58]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x50]\n"
- "sub x13, x13, #0x4\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr x10, [x17, #0x68]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr x27, [x28, #0x8]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x60]\n"
- "cmp x13, #0x8\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr x25, [x26, #0x8]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr x10, [x17, #0x88]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr x10, [x17, #0xa8]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr x10, [x17, #0xc8]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr x10, [x17, #0xe8]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0xf0]\n"
+ "ldr d20, [x17, #0x30]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.4s, v21.4s, v0.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[0]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmla v18.4s, v21.4s, v2.s[0]\n"
+ "ldr d21, [x17, #0x40]\n"
+ "fmla v11.4s, v20.4s, v0.s[0]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.4s, v20.4s, v1.s[0]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v19.4s, v20.4s, v2.s[0]\n"
+ "ldr d20, [x17, #0x50]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v12.4s, v21.4s, v1.s[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v16.4s, v21.4s, v2.s[1]\n"
+ "ldr d21, [x17, #0x60]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla v17.4s, v20.4s, v2.s[1]\n"
+ "ldr d20, [x17, #0x70]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla v18.4s, v21.4s, v2.s[1]\n"
+ "ldr d21, [x17, #0x80]\n"
+ "fmla v11.4s, v20.4s, v0.s[1]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v19.4s, v20.4s, v2.s[1]\n"
+ "ldr d20, [x17, #0x90]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.4s, v21.4s, v0.s[2]\n"
+ "fmla v12.4s, v21.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v16.4s, v21.4s, v2.s[2]\n"
+ "ldr d21, [x17, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[2]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.4s, v20.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla v17.4s, v20.4s, v2.s[2]\n"
+ "ldr d20, [x17, #0xb0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v21.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "fmla v18.4s, v21.4s, v2.s[2]\n"
+ "ldr d21, [x17, #0xc0]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v15.4s, v20.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v19.4s, v20.4s, v2.s[2]\n"
+ "ldr d20, [x17, #0xd0]\n"
+ "mov v20.d[1], x20\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v12.4s, v21.4s, v1.s[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "ldr d21, [x17, #0xe0]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "mov v21.d[1], x21\n"
+ "fmla v13.4s, v20.4s, v1.s[3]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v17.4s, v20.4s, v2.s[3]\n"
+ "ldr d20, [x17, #0xf0]\n"
+ "mov v20.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"add x17, x17, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "ldr x10, [x17, #0x8]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "mov v7.d[1], x11\n"
+ "fmla v10.4s, v21.4s, v0.s[3]\n"
+ "ldr x20, [x17, #0x8]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "ldr x23, [x13, #0x8]\n"
+ "fmla v18.4s, v21.4s, v2.s[3]\n"
"ldr d6, [x17, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "ldr d0, [x12, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr d1, [x28, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "mov v6.d[1], x10\n"
- "mov v0.d[1], x9\n"
- "ldr d2, [x26, #0x0]\n"
- "mov v1.d[1], x27\n"
- "mov v2.d[1], x25\n"
+ "fmla v11.4s, v20.4s, v0.s[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "ldr x22, [x12, #0x8]\n"
+ "fmla v19.4s, v20.4s, v2.s[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ "sub x14, x14, #0x4\n"
+ "ldr d7, [x17, #0x10]\n"
+ "cmp x14, #0x8\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x17, #0x18]\n"
+ "mov v0.d[1], x23\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v7.d[1], x20\n"
"bge 84b\n"
"85:" // Height 3: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "add x13, x13, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "sub x13, x13, #0x4\n"
+ "add x12, x12, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q21, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "sub x14, x14, #0x4\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "add x28, x28, #0x10\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "add x26, x26, #0x10\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr q20, [x17, #0x30]\n"
+ "fmla v10.4s, v21.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "fmla v14.4s, v21.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v18.4s, v21.4s, v2.s[0]\n"
+ "ldr q21, [x17, #0x40]\n"
+ "fmla v11.4s, v20.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v15.4s, v20.4s, v1.s[0]\n"
+ "fmla v19.4s, v20.4s, v2.s[0]\n"
+ "ldr q20, [x17, #0x50]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v12.4s, v21.4s, v1.s[1]\n"
+ "fmla v16.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x17, #0x60]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v17.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x17, #0x70]\n"
+ "fmla v10.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v18.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x17, #0x80]\n"
+ "fmla v11.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "fmla v19.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x17, #0x90]\n"
+ "fmla v8.4s, v21.4s, v0.s[2]\n"
+ "fmla v12.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x17, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[2]\n"
+ "fmla v13.4s, v20.4s, v1.s[2]\n"
+ "fmla v17.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x17, #0xb0]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v21.4s, v1.s[2]\n"
+ "fmla v18.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x17, #0xc0]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "fmla v15.4s, v20.4s, v1.s[2]\n"
+ "fmla v19.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x17, #0xd0]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v12.4s, v21.4s, v1.s[3]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "ldr q21, [x17, #0xe0]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "fmla v13.4s, v20.4s, v1.s[3]\n"
+ "fmla v17.4s, v20.4s, v2.s[3]\n"
+ "ldr q20, [x17, #0xf0]\n"
+ "fmla v10.4s, v21.4s, v0.s[3]\n"
"add x17, x17, #0x100\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "fmla v18.4s, v21.4s, v2.s[3]\n"
+ "fmla v11.4s, v20.4s, v0.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "fmla v19.4s, v20.4s, v2.s[3]\n"
"86:" // Height 3: Multiply loop: Main loop skip
- "cbz x13, 88f\n"
+ "cbz x14, 88f\n"
"87:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x1\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr s24, [x13], #0x4\n"
+ "sub x14, x14, #0x1\n"
+ "ldr s23, [x12], #0x4\n"
+ "ldr s22, [x11], #0x4\n"
+ "ldr q21, [x17, #0x0]\n"
+ "fmla v8.4s, v21.4s, v24.s[0]\n"
+ "ldr q20, [x17, #0x10]\n"
+ "fmla v12.4s, v21.4s, v23.s[0]\n"
+ "fmla v16.4s, v21.4s, v22.s[0]\n"
+ "ldr q21, [x17, #0x20]\n"
+ "fmla v9.4s, v20.4s, v24.s[0]\n"
+ "fmla v13.4s, v20.4s, v23.s[0]\n"
+ "fmla v17.4s, v20.4s, v22.s[0]\n"
+ "ldr q20, [x17, #0x30]\n"
+ "fmla v10.4s, v21.4s, v24.s[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "cbnz x13, 87b\n"
+ "fmla v14.4s, v21.4s, v23.s[0]\n"
+ "fmla v18.4s, v21.4s, v22.s[0]\n"
+ "fmla v11.4s, v20.4s, v24.s[0]\n"
+ "fmla v15.4s, v20.4s, v23.s[0]\n"
+ "fmla v19.4s, v20.4s, v22.s[0]\n"
+ "cbnz x14, 87b\n"
"88:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 81b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "add x25, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 89f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v20.4s\n"
+ "fmin v9.4s, v9.4s, v20.4s\n"
+ "fmin v10.4s, v10.4s, v20.4s\n"
+ "fmin v11.4s, v11.4s, v20.4s\n"
+ "fmin v12.4s, v12.4s, v20.4s\n"
+ "fmin v13.4s, v13.4s, v20.4s\n"
+ "fmin v14.4s, v14.4s, v20.4s\n"
+ "fmin v15.4s, v15.4s, v20.4s\n"
+ "fmin v16.4s, v16.4s, v20.4s\n"
+ "fmin v17.4s, v17.4s, v20.4s\n"
+ "fmin v18.4s, v18.4s, v20.4s\n"
+ "fmin v19.4s, v19.4s, v20.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
- "fmax v16.4s, v16.4s, v1.4s\n"
- "fmax v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v1.4s\n"
- "fmax v19.4s, v19.4s, v1.4s\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v20.4s\n"
+ "fmax v9.4s, v9.4s, v20.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "fmax v13.4s, v13.4s, v20.4s\n"
+ "fmax v14.4s, v14.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v20.4s\n"
+ "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v18.4s, v18.4s, v20.4s\n"
+ "fmax v19.4s, v19.4s, v20.4s\n"
"89:" // Height 3: No activation
"cmp x8, #0x10\n"
"bge 98f\n"
"tbz x8, #3, 93f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
+ "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v9.4s }, [x16], #0x10\n"
"st1 { v12.4s }, [x25], #0x10\n"
"st1 { v13.4s }, [x25], #0x10\n"
"st1 { v16.4s }, [x24], #0x10\n"
"st1 { v17.4s }, [x24], #0x10\n"
"tbz x8, #2, 91f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
+ "st1 { v10.4s }, [x16], #0x10\n"
"st1 { v14.4s }, [x25], #0x10\n"
"st1 { v18.4s }, [x24], #0x10\n"
"tbz x8, #1, 90f\n"
- "str d11, [x15], #0x8\n"
+ "str d11, [x16], #0x8\n"
"str d15, [x25], #0x8\n"
"str d19, [x24], #0x8\n"
"tbz x8, #0, 97f\n"
- "st1 { v11.s }[2], [x15]\n"
+ "st1 { v11.s }[2], [x16]\n"
"st1 { v15.s }[2], [x25]\n"
"st1 { v19.s }[2], [x24]\n"
"b 97f\n"
"90:" // Height 3: Partial direct writeback: partial_1_12
"tbz x8, #0, 97f\n"
- "str s11, [x15, #0x0]\n"
+ "str s11, [x16, #0x0]\n"
"str s15, [x25, #0x0]\n"
"str s19, [x24, #0x0]\n"
"b 97f\n"
"91:" // Height 3: Partial direct writeback: partial_2_8
"tbz x8, #1, 92f\n"
- "str d10, [x15], #0x8\n"
+ "str d10, [x16], #0x8\n"
"str d14, [x25], #0x8\n"
"str d18, [x24], #0x8\n"
"tbz x8, #0, 97f\n"
- "st1 { v10.s }[2], [x15]\n"
+ "st1 { v10.s }[2], [x16]\n"
"st1 { v14.s }[2], [x25]\n"
"st1 { v18.s }[2], [x24]\n"
"b 97f\n"
"92:" // Height 3: Partial direct writeback: partial_1_8
"tbz x8, #0, 97f\n"
- "str s10, [x15, #0x0]\n"
+ "str s10, [x16, #0x0]\n"
"str s14, [x25, #0x0]\n"
"str s18, [x24, #0x0]\n"
"b 97f\n"
"93:" // Height 3: Partial direct writeback: partial_4_0
"tbz x8, #2, 95f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v8.4s }, [x16], #0x10\n"
"st1 { v12.4s }, [x25], #0x10\n"
"st1 { v16.4s }, [x24], #0x10\n"
"tbz x8, #1, 94f\n"
- "str d9, [x15], #0x8\n"
+ "str d9, [x16], #0x8\n"
"str d13, [x25], #0x8\n"
"str d17, [x24], #0x8\n"
"tbz x8, #0, 97f\n"
- "st1 { v9.s }[2], [x15]\n"
+ "st1 { v9.s }[2], [x16]\n"
"st1 { v13.s }[2], [x25]\n"
"st1 { v17.s }[2], [x24]\n"
"b 97f\n"
"94:" // Height 3: Partial direct writeback: partial_1_4
"tbz x8, #0, 97f\n"
- "str s9, [x15, #0x0]\n"
+ "str s9, [x16, #0x0]\n"
"str s13, [x25, #0x0]\n"
"str s17, [x24, #0x0]\n"
"b 97f\n"
"95:" // Height 3: Partial direct writeback: partial_2_0
"tbz x8, #1, 96f\n"
- "str d8, [x15], #0x8\n"
+ "str d8, [x16], #0x8\n"
"str d12, [x25], #0x8\n"
"str d16, [x24], #0x8\n"
"tbz x8, #0, 97f\n"
- "st1 { v8.s }[2], [x15]\n"
+ "st1 { v8.s }[2], [x16]\n"
"st1 { v12.s }[2], [x25]\n"
"st1 { v16.s }[2], [x24]\n"
"b 97f\n"
"96:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
+ "str s8, [x16, #0x0]\n"
"str s12, [x25, #0x0]\n"
"str s16, [x24, #0x0]\n"
"97:" // Height 3: Partial direct writeback: Done
"b 99f\n"
"98:" // Height 3: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x16, #0x0]\n"
+ "str q9, [x16, #0x10]\n"
+ "str q10, [x16, #0x20]\n"
+ "str q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
"str q12, [x25, #0x0]\n"
"str q13, [x25, #0x10]\n"
"str q14, [x25, #0x20]\n"
@@ -1354,146 +1353,146 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"bgt 68b\n"
"b 200f\n"
"100:" // Height 4
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x16, %x[bias]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[output_ptr]\n"
+ "mov x16, %x[output_ptr]\n"
"101:" // Height 4: Column loop
- "cbz x16, 102f\n"
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
+ "cbz x7, 102f\n"
+ "ldr q8, [x7, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
"mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
- "mov v15.16b, v11.16b\n"
- "mov v19.16b, v11.16b\n"
"mov v23.16b, v11.16b\n"
"b 113f\n"
"102:" // Height 4: no bias
"tbz %x[flags], #0, 112f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"cmp x8, #0x10\n"
- "add x25, x15, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 111f\n"
"tbz x8, #3, 106f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v8.4s }, [x16], #0x10\n"
"ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x16], #0x10\n"
"ld1 { v13.4s }, [x25], #0x10\n"
"ld1 { v17.4s }, [x24], #0x10\n"
- "ld1 { v20.4s }, [x23], #0x10\n"
"ld1 { v21.4s }, [x23], #0x10\n"
"tbz x8, #2, 104f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
+ "ld1 { v10.4s }, [x16], #0x10\n"
"ld1 { v14.4s }, [x25], #0x10\n"
"ld1 { v18.4s }, [x24], #0x10\n"
"ld1 { v22.4s }, [x23], #0x10\n"
"tbz x8, #1, 103f\n"
- "ldr d11, [x15], #0x8\n"
- "mov x19, #0x38\n"
+ "ldr d11, [x16], #0x8\n"
+ "mov x20, #0x38\n"
"ldr d15, [x25], #0x8\n"
"ldr d19, [x24], #0x8\n"
"ldr d23, [x23], #0x8\n"
"tbz x8, #0, 110f\n"
- "ld1 { v11.s }[2], [x15]\n"
+ "ld1 { v11.s }[2], [x16]\n"
"ld1 { v15.s }[2], [x25]\n"
"ld1 { v19.s }[2], [x24]\n"
"ld1 { v23.s }[2], [x23]\n"
"b 110f\n"
"103:" // Height 4: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x8, #0, 110f\n"
- "ldr s11, [x15, #0x0]\n"
+ "ldr s11, [x16, #0x0]\n"
"ldr s15, [x25, #0x0]\n"
"ldr s19, [x24, #0x0]\n"
"ldr s23, [x23, #0x0]\n"
"b 110f\n"
"104:" // Height 4: Partial accumulate: partial_2_8
"tbz x8, #1, 105f\n"
- "ldr d10, [x15], #0x8\n"
+ "ldr d10, [x16], #0x8\n"
+ "mov x20, #0x28\n"
"ldr d14, [x25], #0x8\n"
- "mov x19, #0x28\n"
"ldr d18, [x24], #0x8\n"
"ldr d22, [x23], #0x8\n"
"tbz x8, #0, 110f\n"
- "ld1 { v10.s }[2], [x15]\n"
+ "ld1 { v10.s }[2], [x16]\n"
"ld1 { v14.s }[2], [x25]\n"
"ld1 { v18.s }[2], [x24]\n"
"ld1 { v22.s }[2], [x23]\n"
"b 110f\n"
"105:" // Height 4: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x8, #0, 110f\n"
- "ldr s10, [x15, #0x0]\n"
+ "ldr s10, [x16, #0x0]\n"
"ldr s14, [x25, #0x0]\n"
"ldr s18, [x24, #0x0]\n"
"ldr s22, [x23, #0x0]\n"
"b 110f\n"
"106:" // Height 4: Partial accumulate: partial_4_0
"tbz x8, #2, 108f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v8.4s }, [x16], #0x10\n"
"ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v16.4s }, [x24], #0x10\n"
"ld1 { v20.4s }, [x23], #0x10\n"
"tbz x8, #1, 107f\n"
- "ldr d9, [x15], #0x8\n"
- "mov x19, #0x18\n"
+ "ldr d9, [x16], #0x8\n"
+ "mov x20, #0x18\n"
"ldr d13, [x25], #0x8\n"
"ldr d17, [x24], #0x8\n"
"ldr d21, [x23], #0x8\n"
"tbz x8, #0, 110f\n"
- "ld1 { v9.s }[2], [x15]\n"
+ "ld1 { v9.s }[2], [x16]\n"
"ld1 { v13.s }[2], [x25]\n"
"ld1 { v17.s }[2], [x24]\n"
"ld1 { v21.s }[2], [x23]\n"
"b 110f\n"
"107:" // Height 4: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x8, #0, 110f\n"
- "ldr s9, [x15, #0x0]\n"
+ "ldr s9, [x16, #0x0]\n"
"ldr s13, [x25, #0x0]\n"
"ldr s17, [x24, #0x0]\n"
"ldr s21, [x23, #0x0]\n"
"b 110f\n"
"108:" // Height 4: Partial accumulate: partial_2_0
"tbz x8, #1, 109f\n"
- "ldr d8, [x15], #0x8\n"
+ "ldr d8, [x16], #0x8\n"
+ "mov x20, #0x8\n"
"ldr d12, [x25], #0x8\n"
- "mov x19, #0x8\n"
"ldr d16, [x24], #0x8\n"
"ldr d20, [x23], #0x8\n"
"tbz x8, #0, 110f\n"
- "ld1 { v8.s }[2], [x15]\n"
+ "ld1 { v8.s }[2], [x16]\n"
"ld1 { v12.s }[2], [x25]\n"
"ld1 { v16.s }[2], [x24]\n"
"ld1 { v20.s }[2], [x23]\n"
"b 110f\n"
"109:" // Height 4: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr s8, [x16, #0x0]\n"
+ "mov x20, #0x0\n"
"ldr s12, [x25, #0x0]\n"
"ldr s16, [x24, #0x0]\n"
"ldr s20, [x23, #0x0]\n"
"110:" // Height 4: Partial accumulate: Done
- "sub x15, x15, x19\n"
+ "sub x16, x16, x20\n"
"b 113f\n"
"111:" // Height 4: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
"ldr q12, [x25, #0x0]\n"
"ldr q13, [x25, #0x10]\n"
"ldr q14, [x25, #0x20]\n"
@@ -1525,353 +1524,353 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"movi v22.16b, #0x0\n"
"movi v23.16b, #0x0\n"
"113:" // Height 4: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"114:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 115f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x14, 116f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "cbnz x15, 116f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n"
+ "add x11, x11, x20, LSL #2\n"
+ "add x10, x10, x20, LSL #2\n"
"b 116f\n"
"115:" // Height 4: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21, LSL #2\n"
+ "add x11, x12, x21, LSL #2\n"
+ "add x10, x11, x21, LSL #2\n"
"116:" // Height 4: input setup done
- "cmp x13, #0x4\n"
+ "cmp x14, #0x4\n"
"blt 119f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x8\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x8\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
"blt 118f\n"
"117:" // Height 4: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x18]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr x10, [x17, #0x28]\n"
+ "add x13, x13, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr d6, [x17, #0x20]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x17, #0x38]\n"
- "add x12, x12, #0x10\n"
- "add x28, x28, #0x10\n"
+ "ldr d25, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x10\n"
+ "mov v25.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "add x12, x12, #0x10\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr d7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr x10, [x17, #0x48]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr x9, [x12, #0x8]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr x11, [x17, #0x58]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr x27, [x28, #0x8]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr x10, [x17, #0x68]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "add x26, x26, #0x10\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "ldr x25, [x26, #0x8]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x70]\n"
- "add x24, x24, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr x10, [x17, #0x88]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr x23, [x24, #0x8]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr x10, [x17, #0xa8]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "sub x13, x13, #0x4\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "cmp x13, #0x8\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr x10, [x17, #0xc8]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr x10, [x17, #0xe8]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr d24, [x17, #0x30]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.4s, v25.4s, v0.s[0]\n"
+ "fmla v14.4s, v25.4s, v1.s[0]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmla v18.4s, v25.4s, v2.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v22.4s, v25.4s, v3.s[0]\n"
+ "ldr d25, [x17, #0x40]\n"
+ "fmla v11.4s, v24.4s, v0.s[0]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.4s, v24.4s, v1.s[0]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v19.4s, v24.4s, v2.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v23.4s, v24.4s, v3.s[0]\n"
+ "ldr d24, [x17, #0x50]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.4s, v25.4s, v0.s[1]\n"
+ "fmla v12.4s, v25.4s, v1.s[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v16.4s, v25.4s, v2.s[1]\n"
+ "ldr x25, [x13, #0x8]\n"
+ "fmla v20.4s, v25.4s, v3.s[1]\n"
+ "ldr d25, [x17, #0x60]\n"
+ "fmla v9.4s, v24.4s, v0.s[1]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.4s, v24.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "ldr x24, [x12, #0x8]\n"
+ "fmla v21.4s, v24.4s, v3.s[1]\n"
+ "ldr d24, [x17, #0x70]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.4s, v25.4s, v0.s[1]\n"
+ "fmla v14.4s, v25.4s, v1.s[1]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla v18.4s, v25.4s, v2.s[1]\n"
+ "ldr x23, [x11, #0x8]\n"
+ "fmla v22.4s, v25.4s, v3.s[1]\n"
+ "ldr d25, [x17, #0x80]\n"
+ "fmla v11.4s, v24.4s, v0.s[1]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.4s, v24.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v19.4s, v24.4s, v2.s[1]\n"
+ "ldr x22, [x10, #0x8]\n"
+ "fmla v23.4s, v24.4s, v3.s[1]\n"
+ "ldr d24, [x17, #0x90]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.4s, v25.4s, v0.s[2]\n"
+ "fmla v12.4s, v25.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v16.4s, v25.4s, v2.s[2]\n"
+ "sub x14, x14, #0x4\n"
+ "fmla v20.4s, v25.4s, v3.s[2]\n"
+ "ldr d25, [x17, #0xa0]\n"
+ "fmla v9.4s, v24.4s, v0.s[2]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.4s, v24.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla v17.4s, v24.4s, v2.s[2]\n"
+ "cmp x14, #0x8\n"
+ "fmla v21.4s, v24.4s, v3.s[2]\n"
+ "ldr d24, [x17, #0xb0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v10.4s, v25.4s, v0.s[2]\n"
+ "fmla v14.4s, v25.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "fmla v18.4s, v25.4s, v2.s[2]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "fmla v22.4s, v25.4s, v3.s[2]\n"
+ "ldr d25, [x17, #0xc0]\n"
+ "fmla v11.4s, v24.4s, v0.s[2]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v15.4s, v24.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v19.4s, v24.4s, v2.s[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v23.4s, v24.4s, v3.s[2]\n"
+ "ldr d24, [x17, #0xd0]\n"
+ "mov v24.d[1], x20\n"
+ "fmla v8.4s, v25.4s, v0.s[3]\n"
+ "fmla v12.4s, v25.4s, v1.s[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v16.4s, v25.4s, v2.s[3]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v20.4s, v25.4s, v3.s[3]\n"
+ "ldr d25, [x17, #0xe0]\n"
+ "fmla v9.4s, v24.4s, v0.s[3]\n"
+ "mov v25.d[1], x21\n"
+ "fmla v13.4s, v24.4s, v1.s[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v17.4s, v24.4s, v2.s[3]\n"
+ "fmla v21.4s, v24.4s, v3.s[3]\n"
+ "ldr d24, [x17, #0xf0]\n"
+ "mov v24.d[1], x20\n"
"add x17, x17, #0x100\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "ldr x10, [x17, #0x8]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v10.4s, v25.4s, v0.s[3]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v14.4s, v25.4s, v1.s[3]\n"
+ "ldr x20, [x17, #0x18]\n"
+ "fmla v18.4s, v25.4s, v2.s[3]\n"
+ "fmla v22.4s, v25.4s, v3.s[3]\n"
"ldr d6, [x17, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "ldr d0, [x12, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr d1, [x28, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
- "mov v0.d[1], x9\n"
- "mov v1.d[1], x27\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "mov v2.d[1], x25\n"
- "mov v3.d[1], x23\n"
+ "fmla v11.4s, v24.4s, v0.s[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ "fmla v15.4s, v24.4s, v1.s[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ "fmla v23.4s, v24.4s, v3.s[3]\n"
+ "ldr d3, [x10, #0x0]\n"
+ "ldr d7, [x17, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x25\n"
+ "mov v1.d[1], x24\n"
+ "mov v2.d[1], x23\n"
+ "mov v3.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 117b\n"
"118:" // Height 4: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "add x13, x13, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "sub x13, x13, #0x4\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
"add x12, x12, #0x10\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x11, x11, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr q25, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "add x10, x10, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "add x28, x28, #0x10\n"
+ "sub x14, x14, #0x4\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "add x26, x26, #0x10\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "add x24, x24, #0x10\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr q24, [x17, #0x30]\n"
+ "fmla v10.4s, v25.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v14.4s, v25.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v18.4s, v25.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v22.4s, v25.4s, v3.s[0]\n"
+ "ldr q25, [x17, #0x40]\n"
+ "fmla v11.4s, v24.4s, v0.s[0]\n"
+ "fmla v15.4s, v24.4s, v1.s[0]\n"
+ "fmla v19.4s, v24.4s, v2.s[0]\n"
+ "fmla v23.4s, v24.4s, v3.s[0]\n"
+ "ldr q24, [x17, #0x50]\n"
+ "fmla v8.4s, v25.4s, v0.s[1]\n"
+ "fmla v12.4s, v25.4s, v1.s[1]\n"
+ "fmla v16.4s, v25.4s, v2.s[1]\n"
+ "fmla v20.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x17, #0x60]\n"
+ "fmla v9.4s, v24.4s, v0.s[1]\n"
+ "fmla v13.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v21.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x17, #0x70]\n"
+ "fmla v10.4s, v25.4s, v0.s[1]\n"
+ "fmla v14.4s, v25.4s, v1.s[1]\n"
+ "fmla v18.4s, v25.4s, v2.s[1]\n"
+ "fmla v22.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x17, #0x80]\n"
+ "fmla v11.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v1.s[1]\n"
+ "fmla v19.4s, v24.4s, v2.s[1]\n"
+ "fmla v23.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x17, #0x90]\n"
+ "fmla v8.4s, v25.4s, v0.s[2]\n"
+ "fmla v12.4s, v25.4s, v1.s[2]\n"
+ "fmla v16.4s, v25.4s, v2.s[2]\n"
+ "fmla v20.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x17, #0xa0]\n"
+ "fmla v9.4s, v24.4s, v0.s[2]\n"
+ "fmla v13.4s, v24.4s, v1.s[2]\n"
+ "fmla v17.4s, v24.4s, v2.s[2]\n"
+ "fmla v21.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x17, #0xb0]\n"
+ "fmla v10.4s, v25.4s, v0.s[2]\n"
+ "fmla v14.4s, v25.4s, v1.s[2]\n"
+ "fmla v18.4s, v25.4s, v2.s[2]\n"
+ "fmla v22.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x17, #0xc0]\n"
+ "fmla v11.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v1.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[2]\n"
+ "fmla v23.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x17, #0xd0]\n"
+ "fmla v8.4s, v25.4s, v0.s[3]\n"
+ "fmla v12.4s, v25.4s, v1.s[3]\n"
+ "fmla v16.4s, v25.4s, v2.s[3]\n"
+ "fmla v20.4s, v25.4s, v3.s[3]\n"
+ "ldr q25, [x17, #0xe0]\n"
+ "fmla v9.4s, v24.4s, v0.s[3]\n"
+ "fmla v13.4s, v24.4s, v1.s[3]\n"
+ "fmla v17.4s, v24.4s, v2.s[3]\n"
+ "fmla v21.4s, v24.4s, v3.s[3]\n"
+ "ldr q24, [x17, #0xf0]\n"
+ "fmla v10.4s, v25.4s, v0.s[3]\n"
"add x17, x17, #0x100\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v14.4s, v25.4s, v1.s[3]\n"
+ "fmla v18.4s, v25.4s, v2.s[3]\n"
+ "fmla v22.4s, v25.4s, v3.s[3]\n"
+ "fmla v11.4s, v24.4s, v0.s[3]\n"
+ "fmla v15.4s, v24.4s, v1.s[3]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "fmla v23.4s, v24.4s, v3.s[3]\n"
"119:" // Height 4: Multiply loop: Main loop skip
- "cbz x13, 121f\n"
+ "cbz x14, 121f\n"
"120:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x1\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr s29, [x13], #0x4\n"
+ "sub x14, x14, #0x1\n"
+ "ldr s28, [x12], #0x4\n"
+ "ldr s27, [x11], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr q25, [x17, #0x0]\n"
+ "fmla v8.4s, v25.4s, v29.s[0]\n"
+ "ldr q24, [x17, #0x10]\n"
+ "fmla v12.4s, v25.4s, v28.s[0]\n"
+ "fmla v16.4s, v25.4s, v27.s[0]\n"
+ "fmla v20.4s, v25.4s, v26.s[0]\n"
+ "ldr q25, [x17, #0x20]\n"
+ "fmla v9.4s, v24.4s, v29.s[0]\n"
+ "fmla v13.4s, v24.4s, v28.s[0]\n"
+ "fmla v17.4s, v24.4s, v27.s[0]\n"
+ "fmla v21.4s, v24.4s, v26.s[0]\n"
+ "ldr q24, [x17, #0x30]\n"
+ "fmla v10.4s, v25.4s, v29.s[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "cbnz x13, 120b\n"
+ "fmla v14.4s, v25.4s, v28.s[0]\n"
+ "fmla v18.4s, v25.4s, v27.s[0]\n"
+ "fmla v22.4s, v25.4s, v26.s[0]\n"
+ "fmla v11.4s, v24.4s, v29.s[0]\n"
+ "fmla v15.4s, v24.4s, v28.s[0]\n"
+ "fmla v19.4s, v24.4s, v27.s[0]\n"
+ "fmla v23.4s, v24.4s, v26.s[0]\n"
+ "cbnz x14, 120b\n"
"121:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 114b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "add x25, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 122f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v24.4s\n"
+ "fmin v9.4s, v9.4s, v24.4s\n"
+ "fmin v10.4s, v10.4s, v24.4s\n"
+ "fmin v11.4s, v11.4s, v24.4s\n"
+ "fmin v12.4s, v12.4s, v24.4s\n"
+ "fmin v13.4s, v13.4s, v24.4s\n"
+ "fmin v14.4s, v14.4s, v24.4s\n"
+ "fmin v15.4s, v15.4s, v24.4s\n"
+ "fmin v16.4s, v16.4s, v24.4s\n"
+ "fmin v17.4s, v17.4s, v24.4s\n"
+ "fmin v18.4s, v18.4s, v24.4s\n"
+ "fmin v19.4s, v19.4s, v24.4s\n"
+ "fmin v20.4s, v20.4s, v24.4s\n"
+ "fmin v21.4s, v21.4s, v24.4s\n"
+ "fmin v22.4s, v22.4s, v24.4s\n"
+ "fmin v23.4s, v23.4s, v24.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
- "fmax v16.4s, v16.4s, v1.4s\n"
- "fmax v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmin v20.4s, v20.4s, v0.4s\n"
- "fmin v21.4s, v21.4s, v0.4s\n"
- "fmin v22.4s, v22.4s, v0.4s\n"
- "fmin v23.4s, v23.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v1.4s\n"
- "fmax v19.4s, v19.4s, v1.4s\n"
- "fmax v20.4s, v20.4s, v1.4s\n"
- "fmax v21.4s, v21.4s, v1.4s\n"
- "fmax v22.4s, v22.4s, v1.4s\n"
- "fmax v23.4s, v23.4s, v1.4s\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v24.4s\n"
+ "fmax v9.4s, v9.4s, v24.4s\n"
+ "fmax v10.4s, v10.4s, v24.4s\n"
+ "fmax v11.4s, v11.4s, v24.4s\n"
+ "fmax v12.4s, v12.4s, v24.4s\n"
+ "fmax v13.4s, v13.4s, v24.4s\n"
+ "fmax v14.4s, v14.4s, v24.4s\n"
+ "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v24.4s\n"
+ "fmax v17.4s, v17.4s, v24.4s\n"
+ "fmax v18.4s, v18.4s, v24.4s\n"
+ "fmax v19.4s, v19.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v24.4s\n"
+ "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmax v23.4s, v23.4s, v24.4s\n"
"122:" // Height 4: No activation
"cmp x8, #0x10\n"
"bge 131f\n"
"tbz x8, #3, 126f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
+ "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v9.4s }, [x16], #0x10\n"
"st1 { v12.4s }, [x25], #0x10\n"
"st1 { v13.4s }, [x25], #0x10\n"
"st1 { v16.4s }, [x24], #0x10\n"
@@ -1879,96 +1878,96 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"st1 { v20.4s }, [x23], #0x10\n"
"st1 { v21.4s }, [x23], #0x10\n"
"tbz x8, #2, 124f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
+ "st1 { v10.4s }, [x16], #0x10\n"
"st1 { v14.4s }, [x25], #0x10\n"
"st1 { v18.4s }, [x24], #0x10\n"
"st1 { v22.4s }, [x23], #0x10\n"
"tbz x8, #1, 123f\n"
- "str d11, [x15], #0x8\n"
+ "str d11, [x16], #0x8\n"
"str d15, [x25], #0x8\n"
"str d19, [x24], #0x8\n"
"str d23, [x23], #0x8\n"
"tbz x8, #0, 130f\n"
- "st1 { v11.s }[2], [x15]\n"
+ "st1 { v11.s }[2], [x16]\n"
"st1 { v15.s }[2], [x25]\n"
"st1 { v19.s }[2], [x24]\n"
"st1 { v23.s }[2], [x23]\n"
"b 130f\n"
"123:" // Height 4: Partial direct writeback: partial_1_12
"tbz x8, #0, 130f\n"
- "str s11, [x15, #0x0]\n"
+ "str s11, [x16, #0x0]\n"
"str s15, [x25, #0x0]\n"
"str s19, [x24, #0x0]\n"
"str s23, [x23, #0x0]\n"
"b 130f\n"
"124:" // Height 4: Partial direct writeback: partial_2_8
"tbz x8, #1, 125f\n"
- "str d10, [x15], #0x8\n"
+ "str d10, [x16], #0x8\n"
"str d14, [x25], #0x8\n"
"str d18, [x24], #0x8\n"
"str d22, [x23], #0x8\n"
"tbz x8, #0, 130f\n"
- "st1 { v10.s }[2], [x15]\n"
+ "st1 { v10.s }[2], [x16]\n"
"st1 { v14.s }[2], [x25]\n"
"st1 { v18.s }[2], [x24]\n"
"st1 { v22.s }[2], [x23]\n"
"b 130f\n"
"125:" // Height 4: Partial direct writeback: partial_1_8
"tbz x8, #0, 130f\n"
- "str s10, [x15, #0x0]\n"
+ "str s10, [x16, #0x0]\n"
"str s14, [x25, #0x0]\n"
"str s18, [x24, #0x0]\n"
"str s22, [x23, #0x0]\n"
"b 130f\n"
"126:" // Height 4: Partial direct writeback: partial_4_0
"tbz x8, #2, 128f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v8.4s }, [x16], #0x10\n"
"st1 { v12.4s }, [x25], #0x10\n"
"st1 { v16.4s }, [x24], #0x10\n"
"st1 { v20.4s }, [x23], #0x10\n"
"tbz x8, #1, 127f\n"
- "str d9, [x15], #0x8\n"
+ "str d9, [x16], #0x8\n"
"str d13, [x25], #0x8\n"
"str d17, [x24], #0x8\n"
"str d21, [x23], #0x8\n"
"tbz x8, #0, 130f\n"
- "st1 { v9.s }[2], [x15]\n"
+ "st1 { v9.s }[2], [x16]\n"
"st1 { v13.s }[2], [x25]\n"
"st1 { v17.s }[2], [x24]\n"
"st1 { v21.s }[2], [x23]\n"
"b 130f\n"
"127:" // Height 4: Partial direct writeback: partial_1_4
"tbz x8, #0, 130f\n"
- "str s9, [x15, #0x0]\n"
+ "str s9, [x16, #0x0]\n"
"str s13, [x25, #0x0]\n"
"str s17, [x24, #0x0]\n"
"str s21, [x23, #0x0]\n"
"b 130f\n"
"128:" // Height 4: Partial direct writeback: partial_2_0
"tbz x8, #1, 129f\n"
- "str d8, [x15], #0x8\n"
+ "str d8, [x16], #0x8\n"
"str d12, [x25], #0x8\n"
"str d16, [x24], #0x8\n"
"str d20, [x23], #0x8\n"
"tbz x8, #0, 130f\n"
- "st1 { v8.s }[2], [x15]\n"
+ "st1 { v8.s }[2], [x16]\n"
"st1 { v12.s }[2], [x25]\n"
"st1 { v16.s }[2], [x24]\n"
"st1 { v20.s }[2], [x23]\n"
"b 130f\n"
"129:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
+ "str s8, [x16, #0x0]\n"
"str s12, [x25, #0x0]\n"
"str s16, [x24, #0x0]\n"
"str s20, [x23, #0x0]\n"
"130:" // Height 4: Partial direct writeback: Done
"b 132f\n"
"131:" // Height 4: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x16, #0x0]\n"
+ "str q9, [x16, #0x10]\n"
+ "str q10, [x16, #0x20]\n"
+ "str q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
"str q12, [x25, #0x0]\n"
"str q13, [x25, #0x10]\n"
"str q14, [x25, #0x20]\n"
@@ -1986,78 +1985,78 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"bgt 101b\n"
"b 200f\n"
"133:" // Height 5
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x16, %x[bias]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[output_ptr]\n"
+ "mov x16, %x[output_ptr]\n"
"134:" // Height 5: Column loop
- "cbz x16, 135f\n"
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
+ "cbz x7, 135f\n"
+ "ldr q8, [x7, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
"mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
"mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
- "mov v15.16b, v11.16b\n"
- "mov v19.16b, v11.16b\n"
- "mov v23.16b, v11.16b\n"
"mov v27.16b, v11.16b\n"
"b 146f\n"
"135:" // Height 5: no bias
"tbz %x[flags], #0, 145f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"cmp x8, #0x10\n"
- "add x25, x15, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 144f\n"
"tbz x8, #3, 139f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v8.4s }, [x16], #0x10\n"
"ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "ld1 { v13.4s }, [x25], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
"ld1 { v20.4s }, [x23], #0x10\n"
"ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"ld1 { v21.4s }, [x23], #0x10\n"
"ld1 { v25.4s }, [x22], #0x10\n"
"tbz x8, #2, 137f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
+ "ld1 { v10.4s }, [x16], #0x10\n"
"ld1 { v14.4s }, [x25], #0x10\n"
"ld1 { v18.4s }, [x24], #0x10\n"
"ld1 { v22.4s }, [x23], #0x10\n"
"ld1 { v26.4s }, [x22], #0x10\n"
"tbz x8, #1, 136f\n"
- "ldr d11, [x15], #0x8\n"
- "mov x19, #0x38\n"
+ "ldr d11, [x16], #0x8\n"
+ "mov x20, #0x38\n"
"ldr d15, [x25], #0x8\n"
"ldr d19, [x24], #0x8\n"
"ldr d23, [x23], #0x8\n"
"ldr d27, [x22], #0x8\n"
"tbz x8, #0, 143f\n"
- "ld1 { v11.s }[2], [x15]\n"
+ "ld1 { v11.s }[2], [x16]\n"
"ld1 { v15.s }[2], [x25]\n"
"ld1 { v19.s }[2], [x24]\n"
"ld1 { v23.s }[2], [x23]\n"
"ld1 { v27.s }[2], [x22]\n"
"b 143f\n"
"136:" // Height 5: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x8, #0, 143f\n"
- "ldr s11, [x15, #0x0]\n"
+ "ldr s11, [x16, #0x0]\n"
"ldr s15, [x25, #0x0]\n"
"ldr s19, [x24, #0x0]\n"
"ldr s23, [x23, #0x0]\n"
@@ -2065,23 +2064,23 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 143f\n"
"137:" // Height 5: Partial accumulate: partial_2_8
"tbz x8, #1, 138f\n"
- "ldr d10, [x15], #0x8\n"
+ "ldr d10, [x16], #0x8\n"
+ "mov x20, #0x28\n"
"ldr d14, [x25], #0x8\n"
- "mov x19, #0x28\n"
"ldr d18, [x24], #0x8\n"
"ldr d22, [x23], #0x8\n"
"ldr d26, [x22], #0x8\n"
"tbz x8, #0, 143f\n"
- "ld1 { v10.s }[2], [x15]\n"
+ "ld1 { v10.s }[2], [x16]\n"
"ld1 { v14.s }[2], [x25]\n"
"ld1 { v18.s }[2], [x24]\n"
"ld1 { v22.s }[2], [x23]\n"
"ld1 { v26.s }[2], [x22]\n"
"b 143f\n"
"138:" // Height 5: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x8, #0, 143f\n"
- "ldr s10, [x15, #0x0]\n"
+ "ldr s10, [x16, #0x0]\n"
"ldr s14, [x25, #0x0]\n"
"ldr s18, [x24, #0x0]\n"
"ldr s22, [x23, #0x0]\n"
@@ -2089,29 +2088,29 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 143f\n"
"139:" // Height 5: Partial accumulate: partial_4_0
"tbz x8, #2, 141f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v8.4s }, [x16], #0x10\n"
"ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v16.4s }, [x24], #0x10\n"
"ld1 { v20.4s }, [x23], #0x10\n"
"ld1 { v24.4s }, [x22], #0x10\n"
"tbz x8, #1, 140f\n"
- "ldr d9, [x15], #0x8\n"
- "mov x19, #0x18\n"
+ "ldr d9, [x16], #0x8\n"
+ "mov x20, #0x18\n"
"ldr d13, [x25], #0x8\n"
"ldr d17, [x24], #0x8\n"
"ldr d21, [x23], #0x8\n"
"ldr d25, [x22], #0x8\n"
"tbz x8, #0, 143f\n"
- "ld1 { v9.s }[2], [x15]\n"
+ "ld1 { v9.s }[2], [x16]\n"
"ld1 { v13.s }[2], [x25]\n"
"ld1 { v17.s }[2], [x24]\n"
"ld1 { v21.s }[2], [x23]\n"
"ld1 { v25.s }[2], [x22]\n"
"b 143f\n"
"140:" // Height 5: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x8, #0, 143f\n"
- "ldr s9, [x15, #0x0]\n"
+ "ldr s9, [x16, #0x0]\n"
"ldr s13, [x25, #0x0]\n"
"ldr s17, [x24, #0x0]\n"
"ldr s21, [x23, #0x0]\n"
@@ -2119,34 +2118,34 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 143f\n"
"141:" // Height 5: Partial accumulate: partial_2_0
"tbz x8, #1, 142f\n"
- "ldr d8, [x15], #0x8\n"
+ "ldr d8, [x16], #0x8\n"
+ "mov x20, #0x8\n"
"ldr d12, [x25], #0x8\n"
- "mov x19, #0x8\n"
"ldr d16, [x24], #0x8\n"
"ldr d20, [x23], #0x8\n"
"ldr d24, [x22], #0x8\n"
"tbz x8, #0, 143f\n"
- "ld1 { v8.s }[2], [x15]\n"
+ "ld1 { v8.s }[2], [x16]\n"
"ld1 { v12.s }[2], [x25]\n"
"ld1 { v16.s }[2], [x24]\n"
"ld1 { v20.s }[2], [x23]\n"
"ld1 { v24.s }[2], [x22]\n"
"b 143f\n"
"142:" // Height 5: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr s8, [x16, #0x0]\n"
+ "mov x20, #0x0\n"
"ldr s12, [x25, #0x0]\n"
"ldr s16, [x24, #0x0]\n"
"ldr s20, [x23, #0x0]\n"
"ldr s24, [x22, #0x0]\n"
"143:" // Height 5: Partial accumulate: Done
- "sub x15, x15, x19\n"
+ "sub x16, x16, x20\n"
"b 146f\n"
"144:" // Height 5: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
"ldr q12, [x25, #0x0]\n"
"ldr q13, [x25, #0x10]\n"
"ldr q14, [x25, #0x20]\n"
@@ -2186,411 +2185,411 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
"146:" // Height 5: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"147:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 148f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x14, 149f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
+ "cbnz x15, 149f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n"
+ "add x11, x11, x20, LSL #2\n"
+ "add x10, x10, x20, LSL #2\n"
+ "add x9, x9, x20, LSL #2\n"
"b 149f\n"
"148:" // Height 5: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "add x22, x24, x19, LSL #2\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21, LSL #2\n"
+ "add x11, x12, x21, LSL #2\n"
+ "add x10, x11, x21, LSL #2\n"
+ "add x9, x10, x21, LSL #2\n"
"149:" // Height 5: input setup done
- "cmp x13, #0x4\n"
+ "cmp x14, #0x4\n"
"blt 152f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x8\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x8\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
"blt 151f\n"
"150:" // Height 5: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x18]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr x10, [x17, #0x28]\n"
+ "add x13, x13, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
"add x12, x12, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "ldr d29, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "mov v29.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr d6, [x17, #0x20]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "add x11, x11, #0x10\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr x9, [x12, #0x8]\n"
+ "add x10, x10, #0x10\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x17, #0x30]\n"
- "add x26, x26, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr x10, [x17, #0x48]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr x11, [x17, #0x58]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr d6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr x27, [x28, #0x8]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr x25, [x26, #0x8]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr x10, [x17, #0x68]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr d7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "add x24, x24, #0x10\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr x11, [x17, #0x78]\n"
- "fmla v24.4s, v6.4s, v4.s[1]\n"
- "ldr d6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr x23, [x24, #0x8]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "add x22, x22, #0x10\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "fmla v25.4s, v7.4s, v4.s[1]\n"
- "ldr d7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr x10, [x17, #0x88]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr x21, [x22, #0x8]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr x11, [x17, #0x98]\n"
- "fmla v26.4s, v6.4s, v4.s[1]\n"
- "ldr d6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "sub x13, x13, #0x4\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "cmp x13, #0x8\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr x10, [x17, #0xa8]\n"
- "fmla v27.4s, v7.4s, v4.s[1]\n"
- "ldr d7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr x11, [x17, #0xb8]\n"
- "fmla v24.4s, v6.4s, v4.s[2]\n"
- "ldr d6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr x10, [x17, #0xc8]\n"
- "fmla v25.4s, v7.4s, v4.s[2]\n"
- "ldr d7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr x11, [x17, #0xd8]\n"
- "fmla v26.4s, v6.4s, v4.s[2]\n"
- "ldr d6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr x10, [x17, #0xe8]\n"
- "fmla v27.4s, v7.4s, v4.s[2]\n"
- "ldr d7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "mov v7.d[1], x11\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr x11, [x17, #0xf8]\n"
- "fmla v24.4s, v6.4s, v4.s[3]\n"
- "ldr d6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "mov v6.d[1], x10\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "fmla v25.4s, v7.4s, v4.s[3]\n"
- "ldr d7, [x17, #0xf0]\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr d28, [x17, #0x30]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmla v18.4s, v29.4s, v2.s[0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.4s, v29.4s, v3.s[0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ "fmla v26.4s, v29.4s, v4.s[0]\n"
+ "ldr d29, [x17, #0x40]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla v19.4s, v28.4s, v2.s[0]\n"
+ "ldr x25, [x12, #0x8]\n"
+ "fmla v23.4s, v28.4s, v3.s[0]\n"
+ "ldr x24, [x11, #0x8]\n"
+ "fmla v27.4s, v28.4s, v4.s[0]\n"
+ "ldr d28, [x17, #0x50]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.4s, v29.4s, v0.s[1]\n"
+ "fmla v12.4s, v29.4s, v1.s[1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmla v16.4s, v29.4s, v2.s[1]\n"
+ "ldr x23, [x10, #0x8]\n"
+ "fmla v20.4s, v29.4s, v3.s[1]\n"
+ "ldr x22, [x9, #0x8]\n"
+ "fmla v24.4s, v29.4s, v4.s[1]\n"
+ "ldr d29, [x17, #0x60]\n"
+ "fmla v9.4s, v28.4s, v0.s[1]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.4s, v28.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla v17.4s, v28.4s, v2.s[1]\n"
+ "sub x14, x14, #0x4\n"
+ "fmla v21.4s, v28.4s, v3.s[1]\n"
+ "cmp x14, #0x8\n"
+ "fmla v25.4s, v28.4s, v4.s[1]\n"
+ "ldr d28, [x17, #0x70]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v14.4s, v29.4s, v1.s[1]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla v18.4s, v29.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "fmla v22.4s, v29.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v26.4s, v29.4s, v4.s[1]\n"
+ "ldr d29, [x17, #0x80]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.4s, v28.4s, v1.s[1]\n"
+ "ldr x21, [x17, #0xa8]\n"
+ "fmla v19.4s, v28.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v23.4s, v28.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v27.4s, v28.4s, v4.s[1]\n"
+ "ldr d28, [x17, #0x90]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v12.4s, v29.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v20.4s, v29.4s, v3.s[2]\n"
+ "fmla v24.4s, v29.4s, v4.s[2]\n"
+ "ldr d29, [x17, #0xa0]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.4s, v28.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla v17.4s, v28.4s, v2.s[2]\n"
+ "fmla v21.4s, v28.4s, v3.s[2]\n"
+ "fmla v25.4s, v28.4s, v4.s[2]\n"
+ "ldr d28, [x17, #0xb0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v10.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
+ "fmla v18.4s, v29.4s, v2.s[2]\n"
+ "fmla v22.4s, v29.4s, v3.s[2]\n"
+ "fmla v26.4s, v29.4s, v4.s[2]\n"
+ "ldr d29, [x17, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[2]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
+ "fmla v19.4s, v28.4s, v2.s[2]\n"
+ "fmla v23.4s, v28.4s, v3.s[2]\n"
+ "fmla v27.4s, v28.4s, v4.s[2]\n"
+ "ldr d28, [x17, #0xd0]\n"
+ "mov v28.d[1], x20\n"
+ "fmla v8.4s, v29.4s, v0.s[3]\n"
+ "fmla v12.4s, v29.4s, v1.s[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
+ "fmla v16.4s, v29.4s, v2.s[3]\n"
+ "fmla v20.4s, v29.4s, v3.s[3]\n"
+ "fmla v24.4s, v29.4s, v4.s[3]\n"
+ "ldr d29, [x17, #0xe0]\n"
+ "fmla v9.4s, v28.4s, v0.s[3]\n"
+ "mov v29.d[1], x21\n"
+ "fmla v13.4s, v28.4s, v1.s[3]\n"
+ "fmla v17.4s, v28.4s, v2.s[3]\n"
+ "fmla v21.4s, v28.4s, v3.s[3]\n"
+ "fmla v25.4s, v28.4s, v4.s[3]\n"
+ "ldr d28, [x17, #0xf0]\n"
+ "mov v28.d[1], x20\n"
"add x17, x17, #0x100\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "ldr x10, [x17, #0x8]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "mov v7.d[1], x11\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v14.4s, v29.4s, v1.s[3]\n"
+ "ldr x20, [x17, #0x18]\n"
+ "fmla v18.4s, v29.4s, v2.s[3]\n"
+ "fmla v22.4s, v29.4s, v3.s[3]\n"
+ "fmla v26.4s, v29.4s, v4.s[3]\n"
"ldr d6, [x17, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "ldr d0, [x12, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr d1, [x28, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "mov v6.d[1], x10\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
- "mov v0.d[1], x9\n"
- "fmla v27.4s, v7.4s, v4.s[3]\n"
- "mov v1.d[1], x27\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "ldr d4, [x22, #0x0]\n"
- "mov v2.d[1], x25\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ "fmla v15.4s, v28.4s, v1.s[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "fmla v19.4s, v28.4s, v2.s[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ "fmla v23.4s, v28.4s, v3.s[3]\n"
+ "ldr d3, [x10, #0x0]\n"
+ "fmla v27.4s, v28.4s, v4.s[3]\n"
+ "ldr d4, [x9, #0x0]\n"
+ "ldr d7, [x17, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x26\n"
+ "mov v1.d[1], x25\n"
+ "mov v2.d[1], x24\n"
"mov v3.d[1], x23\n"
- "mov v4.d[1], x21\n"
+ "mov v4.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 150b\n"
"151:" // Height 5: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "add x13, x13, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "sub x13, x13, #0x4\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
"add x12, x12, #0x10\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x11, x11, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
+ "ldr q29, [x17, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x28, x28, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "sub x14, x14, #0x4\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "add x24, x24, #0x10\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "add x22, x22, #0x10\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x17, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x17, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "fmla v24.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x17, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "fmla v25.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x17, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "fmla v26.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x17, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "fmla v27.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x17, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "fmla v24.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x17, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "fmla v25.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x17, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "fmla v26.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x17, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "fmla v27.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x17, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "fmla v24.4s, v6.4s, v4.s[3]\n"
- "ldr q6, [x17, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "fmla v25.4s, v7.4s, v4.s[3]\n"
- "ldr q7, [x17, #0xf0]\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr q28, [x17, #0x30]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v18.4s, v29.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v22.4s, v29.4s, v3.s[0]\n"
+ "fmla v26.4s, v29.4s, v4.s[0]\n"
+ "ldr q29, [x17, #0x40]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v2.s[0]\n"
+ "fmla v23.4s, v28.4s, v3.s[0]\n"
+ "fmla v27.4s, v28.4s, v4.s[0]\n"
+ "ldr q28, [x17, #0x50]\n"
+ "fmla v8.4s, v29.4s, v0.s[1]\n"
+ "fmla v12.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[1]\n"
+ "fmla v20.4s, v29.4s, v3.s[1]\n"
+ "fmla v24.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x17, #0x60]\n"
+ "fmla v9.4s, v28.4s, v0.s[1]\n"
+ "fmla v13.4s, v28.4s, v1.s[1]\n"
+ "fmla v17.4s, v28.4s, v2.s[1]\n"
+ "fmla v21.4s, v28.4s, v3.s[1]\n"
+ "fmla v25.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x17, #0x70]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v14.4s, v29.4s, v1.s[1]\n"
+ "fmla v18.4s, v29.4s, v2.s[1]\n"
+ "fmla v22.4s, v29.4s, v3.s[1]\n"
+ "fmla v26.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x17, #0x80]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[1]\n"
+ "fmla v19.4s, v28.4s, v2.s[1]\n"
+ "fmla v23.4s, v28.4s, v3.s[1]\n"
+ "fmla v27.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x17, #0x90]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v12.4s, v29.4s, v1.s[2]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v20.4s, v29.4s, v3.s[2]\n"
+ "fmla v24.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x17, #0xa0]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v13.4s, v28.4s, v1.s[2]\n"
+ "fmla v17.4s, v28.4s, v2.s[2]\n"
+ "fmla v21.4s, v28.4s, v3.s[2]\n"
+ "fmla v25.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x17, #0xb0]\n"
+ "fmla v10.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v18.4s, v29.4s, v2.s[2]\n"
+ "fmla v22.4s, v29.4s, v3.s[2]\n"
+ "fmla v26.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x17, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[2]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v19.4s, v28.4s, v2.s[2]\n"
+ "fmla v23.4s, v28.4s, v3.s[2]\n"
+ "fmla v27.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x17, #0xd0]\n"
+ "fmla v8.4s, v29.4s, v0.s[3]\n"
+ "fmla v12.4s, v29.4s, v1.s[3]\n"
+ "fmla v16.4s, v29.4s, v2.s[3]\n"
+ "fmla v20.4s, v29.4s, v3.s[3]\n"
+ "fmla v24.4s, v29.4s, v4.s[3]\n"
+ "ldr q29, [x17, #0xe0]\n"
+ "fmla v9.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[3]\n"
+ "fmla v17.4s, v28.4s, v2.s[3]\n"
+ "fmla v21.4s, v28.4s, v3.s[3]\n"
+ "fmla v25.4s, v28.4s, v4.s[3]\n"
+ "ldr q28, [x17, #0xf0]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
"add x17, x17, #0x100\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v26.4s, v6.4s, v4.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
- "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[3]\n"
+ "fmla v18.4s, v29.4s, v2.s[3]\n"
+ "fmla v22.4s, v29.4s, v3.s[3]\n"
+ "fmla v26.4s, v29.4s, v4.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "fmla v15.4s, v28.4s, v1.s[3]\n"
+ "fmla v19.4s, v28.4s, v2.s[3]\n"
+ "fmla v23.4s, v28.4s, v3.s[3]\n"
+ "fmla v27.4s, v28.4s, v4.s[3]\n"
"152:" // Height 5: Multiply loop: Main loop skip
- "cbz x13, 154f\n"
+ "cbz x14, 154f\n"
"153:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x1\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr s2, [x13], #0x4\n"
+ "sub x14, x14, #0x1\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s0, [x11], #0x4\n"
+ "ldr s31, [x10], #0x4\n"
+ "ldr s30, [x9], #0x4\n"
+ "ldr q29, [x17, #0x0]\n"
+ "fmla v8.4s, v29.4s, v2.s[0]\n"
+ "ldr q28, [x17, #0x10]\n"
+ "fmla v12.4s, v29.4s, v1.s[0]\n"
+ "fmla v16.4s, v29.4s, v0.s[0]\n"
+ "fmla v20.4s, v29.4s, v31.s[0]\n"
+ "fmla v24.4s, v29.4s, v30.s[0]\n"
+ "ldr q29, [x17, #0x20]\n"
+ "fmla v9.4s, v28.4s, v2.s[0]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v17.4s, v28.4s, v0.s[0]\n"
+ "fmla v21.4s, v28.4s, v31.s[0]\n"
+ "fmla v25.4s, v28.4s, v30.s[0]\n"
+ "ldr q28, [x17, #0x30]\n"
+ "fmla v10.4s, v29.4s, v2.s[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "cbnz x13, 153b\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v18.4s, v29.4s, v0.s[0]\n"
+ "fmla v22.4s, v29.4s, v31.s[0]\n"
+ "fmla v26.4s, v29.4s, v30.s[0]\n"
+ "fmla v11.4s, v28.4s, v2.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v0.s[0]\n"
+ "fmla v23.4s, v28.4s, v31.s[0]\n"
+ "fmla v27.4s, v28.4s, v30.s[0]\n"
+ "cbnz x14, 153b\n"
"154:" // Height 5: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 147b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "add x25, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 155f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v28.4s\n"
+ "fmin v9.4s, v9.4s, v28.4s\n"
+ "fmin v10.4s, v10.4s, v28.4s\n"
+ "fmin v11.4s, v11.4s, v28.4s\n"
+ "fmin v12.4s, v12.4s, v28.4s\n"
+ "fmin v13.4s, v13.4s, v28.4s\n"
+ "fmin v14.4s, v14.4s, v28.4s\n"
+ "fmin v15.4s, v15.4s, v28.4s\n"
+ "fmin v16.4s, v16.4s, v28.4s\n"
+ "fmin v17.4s, v17.4s, v28.4s\n"
+ "fmin v18.4s, v18.4s, v28.4s\n"
+ "fmin v19.4s, v19.4s, v28.4s\n"
+ "fmin v20.4s, v20.4s, v28.4s\n"
+ "fmin v21.4s, v21.4s, v28.4s\n"
+ "fmin v22.4s, v22.4s, v28.4s\n"
+ "fmin v23.4s, v23.4s, v28.4s\n"
+ "fmin v24.4s, v24.4s, v28.4s\n"
+ "fmin v25.4s, v25.4s, v28.4s\n"
+ "fmin v26.4s, v26.4s, v28.4s\n"
+ "fmin v27.4s, v27.4s, v28.4s\n"
"add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
- "fmax v16.4s, v16.4s, v1.4s\n"
- "fmax v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmin v20.4s, v20.4s, v0.4s\n"
- "fmin v21.4s, v21.4s, v0.4s\n"
- "fmin v22.4s, v22.4s, v0.4s\n"
- "fmin v23.4s, v23.4s, v0.4s\n"
- "fmin v24.4s, v24.4s, v0.4s\n"
- "fmin v25.4s, v25.4s, v0.4s\n"
- "fmin v26.4s, v26.4s, v0.4s\n"
- "fmin v27.4s, v27.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v1.4s\n"
- "fmax v19.4s, v19.4s, v1.4s\n"
- "fmax v20.4s, v20.4s, v1.4s\n"
- "fmax v21.4s, v21.4s, v1.4s\n"
- "fmax v22.4s, v22.4s, v1.4s\n"
- "fmax v23.4s, v23.4s, v1.4s\n"
- "fmax v24.4s, v24.4s, v1.4s\n"
- "fmax v25.4s, v25.4s, v1.4s\n"
- "fmax v26.4s, v26.4s, v1.4s\n"
- "fmax v27.4s, v27.4s, v1.4s\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v28.4s\n"
+ "fmax v9.4s, v9.4s, v28.4s\n"
+ "fmax v10.4s, v10.4s, v28.4s\n"
+ "fmax v11.4s, v11.4s, v28.4s\n"
+ "fmax v12.4s, v12.4s, v28.4s\n"
+ "fmax v13.4s, v13.4s, v28.4s\n"
+ "fmax v14.4s, v14.4s, v28.4s\n"
+ "fmax v15.4s, v15.4s, v28.4s\n"
+ "fmax v16.4s, v16.4s, v28.4s\n"
+ "fmax v17.4s, v17.4s, v28.4s\n"
+ "fmax v18.4s, v18.4s, v28.4s\n"
+ "fmax v19.4s, v19.4s, v28.4s\n"
+ "fmax v20.4s, v20.4s, v28.4s\n"
+ "fmax v21.4s, v21.4s, v28.4s\n"
+ "fmax v22.4s, v22.4s, v28.4s\n"
+ "fmax v23.4s, v23.4s, v28.4s\n"
+ "fmax v24.4s, v24.4s, v28.4s\n"
+ "fmax v25.4s, v25.4s, v28.4s\n"
+ "fmax v26.4s, v26.4s, v28.4s\n"
+ "fmax v27.4s, v27.4s, v28.4s\n"
"155:" // Height 5: No activation
"cmp x8, #0x10\n"
"bge 164f\n"
"tbz x8, #3, 159f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
+ "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v9.4s }, [x16], #0x10\n"
"st1 { v12.4s }, [x25], #0x10\n"
"st1 { v13.4s }, [x25], #0x10\n"
"st1 { v16.4s }, [x24], #0x10\n"
@@ -2600,19 +2599,19 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"st1 { v24.4s }, [x22], #0x10\n"
"st1 { v25.4s }, [x22], #0x10\n"
"tbz x8, #2, 157f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
+ "st1 { v10.4s }, [x16], #0x10\n"
"st1 { v14.4s }, [x25], #0x10\n"
"st1 { v18.4s }, [x24], #0x10\n"
"st1 { v22.4s }, [x23], #0x10\n"
"st1 { v26.4s }, [x22], #0x10\n"
"tbz x8, #1, 156f\n"
- "str d11, [x15], #0x8\n"
+ "str d11, [x16], #0x8\n"
"str d15, [x25], #0x8\n"
"str d19, [x24], #0x8\n"
"str d23, [x23], #0x8\n"
"str d27, [x22], #0x8\n"
"tbz x8, #0, 163f\n"
- "st1 { v11.s }[2], [x15]\n"
+ "st1 { v11.s }[2], [x16]\n"
"st1 { v15.s }[2], [x25]\n"
"st1 { v19.s }[2], [x24]\n"
"st1 { v23.s }[2], [x23]\n"
@@ -2620,7 +2619,7 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 163f\n"
"156:" // Height 5: Partial direct writeback: partial_1_12
"tbz x8, #0, 163f\n"
- "str s11, [x15, #0x0]\n"
+ "str s11, [x16, #0x0]\n"
"str s15, [x25, #0x0]\n"
"str s19, [x24, #0x0]\n"
"str s23, [x23, #0x0]\n"
@@ -2628,13 +2627,13 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 163f\n"
"157:" // Height 5: Partial direct writeback: partial_2_8
"tbz x8, #1, 158f\n"
- "str d10, [x15], #0x8\n"
+ "str d10, [x16], #0x8\n"
"str d14, [x25], #0x8\n"
"str d18, [x24], #0x8\n"
"str d22, [x23], #0x8\n"
"str d26, [x22], #0x8\n"
"tbz x8, #0, 163f\n"
- "st1 { v10.s }[2], [x15]\n"
+ "st1 { v10.s }[2], [x16]\n"
"st1 { v14.s }[2], [x25]\n"
"st1 { v18.s }[2], [x24]\n"
"st1 { v22.s }[2], [x23]\n"
@@ -2642,7 +2641,7 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 163f\n"
"158:" // Height 5: Partial direct writeback: partial_1_8
"tbz x8, #0, 163f\n"
- "str s10, [x15, #0x0]\n"
+ "str s10, [x16, #0x0]\n"
"str s14, [x25, #0x0]\n"
"str s18, [x24, #0x0]\n"
"str s22, [x23, #0x0]\n"
@@ -2650,19 +2649,19 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 163f\n"
"159:" // Height 5: Partial direct writeback: partial_4_0
"tbz x8, #2, 161f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v8.4s }, [x16], #0x10\n"
"st1 { v12.4s }, [x25], #0x10\n"
"st1 { v16.4s }, [x24], #0x10\n"
"st1 { v20.4s }, [x23], #0x10\n"
"st1 { v24.4s }, [x22], #0x10\n"
"tbz x8, #1, 160f\n"
- "str d9, [x15], #0x8\n"
+ "str d9, [x16], #0x8\n"
"str d13, [x25], #0x8\n"
"str d17, [x24], #0x8\n"
"str d21, [x23], #0x8\n"
"str d25, [x22], #0x8\n"
"tbz x8, #0, 163f\n"
- "st1 { v9.s }[2], [x15]\n"
+ "st1 { v9.s }[2], [x16]\n"
"st1 { v13.s }[2], [x25]\n"
"st1 { v17.s }[2], [x24]\n"
"st1 { v21.s }[2], [x23]\n"
@@ -2670,7 +2669,7 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 163f\n"
"160:" // Height 5: Partial direct writeback: partial_1_4
"tbz x8, #0, 163f\n"
- "str s9, [x15, #0x0]\n"
+ "str s9, [x16, #0x0]\n"
"str s13, [x25, #0x0]\n"
"str s17, [x24, #0x0]\n"
"str s21, [x23, #0x0]\n"
@@ -2678,20 +2677,20 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 163f\n"
"161:" // Height 5: Partial direct writeback: partial_2_0
"tbz x8, #1, 162f\n"
- "str d8, [x15], #0x8\n"
+ "str d8, [x16], #0x8\n"
"str d12, [x25], #0x8\n"
"str d16, [x24], #0x8\n"
"str d20, [x23], #0x8\n"
"str d24, [x22], #0x8\n"
"tbz x8, #0, 163f\n"
- "st1 { v8.s }[2], [x15]\n"
+ "st1 { v8.s }[2], [x16]\n"
"st1 { v12.s }[2], [x25]\n"
"st1 { v16.s }[2], [x24]\n"
"st1 { v20.s }[2], [x23]\n"
"st1 { v24.s }[2], [x22]\n"
"b 163f\n"
"162:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
+ "str s8, [x16, #0x0]\n"
"str s12, [x25, #0x0]\n"
"str s16, [x24, #0x0]\n"
"str s20, [x23, #0x0]\n"
@@ -2699,11 +2698,11 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"163:" // Height 5: Partial direct writeback: Done
"b 165f\n"
"164:" // Height 5: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x16, #0x0]\n"
+ "str q9, [x16, #0x10]\n"
+ "str q10, [x16, #0x20]\n"
+ "str q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
"str q12, [x25, #0x0]\n"
"str q13, [x25, #0x10]\n"
"str q14, [x25, #0x20]\n"
@@ -2725,81 +2724,81 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"bgt 134b\n"
"b 200f\n"
"166:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "mov x7, %x[bias]\n"
"ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x16, %x[bias]\n"
"ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[output_ptr]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x19, #0x18\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "mov x16, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"167:" // Height 6: Column loop
- "cbz x16, 168f\n"
- "ldr q8, [x16, #0x0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
+ "cbz x7, 168f\n"
+ "ldr q8, [x7, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q9, [x7, #0x10]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x7, #0x20]\n"
"mov v14.16b, v10.16b\n"
+ "ldr q11, [x7, #0x30]\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "add x7, x7, #0x40\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
"mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
"mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
"mov v28.16b, v8.16b\n"
"mov v29.16b, v9.16b\n"
"mov v30.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
- "mov v15.16b, v11.16b\n"
- "mov v19.16b, v11.16b\n"
- "mov v23.16b, v11.16b\n"
- "mov v27.16b, v11.16b\n"
"mov v31.16b, v11.16b\n"
"b 179f\n"
"168:" // Height 6: no bias
"tbz %x[flags], #0, 178f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"cmp x8, #0x10\n"
- "add x25, x15, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 177f\n"
"tbz x8, #3, 172f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v8.4s }, [x16], #0x10\n"
"ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v16.4s }, [x24], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "ld1 { v13.4s }, [x25], #0x10\n"
- "ld1 { v17.4s }, [x24], #0x10\n"
"ld1 { v20.4s }, [x23], #0x10\n"
"ld1 { v24.4s }, [x22], #0x10\n"
"ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x16], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"ld1 { v21.4s }, [x23], #0x10\n"
"ld1 { v25.4s }, [x22], #0x10\n"
"ld1 { v29.4s }, [x21], #0x10\n"
"tbz x8, #2, 170f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
+ "ld1 { v10.4s }, [x16], #0x10\n"
"ld1 { v14.4s }, [x25], #0x10\n"
"ld1 { v18.4s }, [x24], #0x10\n"
"ld1 { v22.4s }, [x23], #0x10\n"
"ld1 { v26.4s }, [x22], #0x10\n"
"ld1 { v30.4s }, [x21], #0x10\n"
"tbz x8, #1, 169f\n"
- "ldr d11, [x15], #0x8\n"
- "mov x19, #0x38\n"
+ "ldr d11, [x16], #0x8\n"
+ "mov x20, #0x38\n"
"ldr d15, [x25], #0x8\n"
"ldr d19, [x24], #0x8\n"
"ldr d23, [x23], #0x8\n"
"ldr d27, [x22], #0x8\n"
"ldr d31, [x21], #0x8\n"
"tbz x8, #0, 176f\n"
- "ld1 { v11.s }[2], [x15]\n"
+ "ld1 { v11.s }[2], [x16]\n"
"ld1 { v15.s }[2], [x25]\n"
"ld1 { v19.s }[2], [x24]\n"
"ld1 { v23.s }[2], [x23]\n"
@@ -2807,9 +2806,9 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ld1 { v31.s }[2], [x21]\n"
"b 176f\n"
"169:" // Height 6: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x8, #0, 176f\n"
- "ldr s11, [x15, #0x0]\n"
+ "ldr s11, [x16, #0x0]\n"
"ldr s15, [x25, #0x0]\n"
"ldr s19, [x24, #0x0]\n"
"ldr s23, [x23, #0x0]\n"
@@ -2818,15 +2817,15 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 176f\n"
"170:" // Height 6: Partial accumulate: partial_2_8
"tbz x8, #1, 171f\n"
- "ldr d10, [x15], #0x8\n"
+ "ldr d10, [x16], #0x8\n"
+ "mov x20, #0x28\n"
"ldr d14, [x25], #0x8\n"
- "mov x19, #0x28\n"
"ldr d18, [x24], #0x8\n"
"ldr d22, [x23], #0x8\n"
"ldr d26, [x22], #0x8\n"
"ldr d30, [x21], #0x8\n"
"tbz x8, #0, 176f\n"
- "ld1 { v10.s }[2], [x15]\n"
+ "ld1 { v10.s }[2], [x16]\n"
"ld1 { v14.s }[2], [x25]\n"
"ld1 { v18.s }[2], [x24]\n"
"ld1 { v22.s }[2], [x23]\n"
@@ -2834,9 +2833,9 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ld1 { v30.s }[2], [x21]\n"
"b 176f\n"
"171:" // Height 6: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x8, #0, 176f\n"
- "ldr s10, [x15, #0x0]\n"
+ "ldr s10, [x16, #0x0]\n"
"ldr s14, [x25, #0x0]\n"
"ldr s18, [x24, #0x0]\n"
"ldr s22, [x23, #0x0]\n"
@@ -2845,22 +2844,22 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 176f\n"
"172:" // Height 6: Partial accumulate: partial_4_0
"tbz x8, #2, 174f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v8.4s }, [x16], #0x10\n"
"ld1 { v12.4s }, [x25], #0x10\n"
"ld1 { v16.4s }, [x24], #0x10\n"
"ld1 { v20.4s }, [x23], #0x10\n"
"ld1 { v24.4s }, [x22], #0x10\n"
"ld1 { v28.4s }, [x21], #0x10\n"
"tbz x8, #1, 173f\n"
- "ldr d9, [x15], #0x8\n"
- "mov x19, #0x18\n"
+ "ldr d9, [x16], #0x8\n"
+ "mov x20, #0x18\n"
"ldr d13, [x25], #0x8\n"
"ldr d17, [x24], #0x8\n"
"ldr d21, [x23], #0x8\n"
"ldr d25, [x22], #0x8\n"
"ldr d29, [x21], #0x8\n"
"tbz x8, #0, 176f\n"
- "ld1 { v9.s }[2], [x15]\n"
+ "ld1 { v9.s }[2], [x16]\n"
"ld1 { v13.s }[2], [x25]\n"
"ld1 { v17.s }[2], [x24]\n"
"ld1 { v21.s }[2], [x23]\n"
@@ -2868,9 +2867,9 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ld1 { v29.s }[2], [x21]\n"
"b 176f\n"
"173:" // Height 6: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x8, #0, 176f\n"
- "ldr s9, [x15, #0x0]\n"
+ "ldr s9, [x16, #0x0]\n"
"ldr s13, [x25, #0x0]\n"
"ldr s17, [x24, #0x0]\n"
"ldr s21, [x23, #0x0]\n"
@@ -2879,15 +2878,15 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 176f\n"
"174:" // Height 6: Partial accumulate: partial_2_0
"tbz x8, #1, 175f\n"
- "ldr d8, [x15], #0x8\n"
+ "ldr d8, [x16], #0x8\n"
+ "mov x20, #0x8\n"
"ldr d12, [x25], #0x8\n"
- "mov x19, #0x8\n"
"ldr d16, [x24], #0x8\n"
"ldr d20, [x23], #0x8\n"
"ldr d24, [x22], #0x8\n"
"ldr d28, [x21], #0x8\n"
"tbz x8, #0, 176f\n"
- "ld1 { v8.s }[2], [x15]\n"
+ "ld1 { v8.s }[2], [x16]\n"
"ld1 { v12.s }[2], [x25]\n"
"ld1 { v16.s }[2], [x24]\n"
"ld1 { v20.s }[2], [x23]\n"
@@ -2895,21 +2894,21 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ld1 { v28.s }[2], [x21]\n"
"b 176f\n"
"175:" // Height 6: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr s8, [x16, #0x0]\n"
+ "mov x20, #0x0\n"
"ldr s12, [x25, #0x0]\n"
"ldr s16, [x24, #0x0]\n"
"ldr s20, [x23, #0x0]\n"
"ldr s24, [x22, #0x0]\n"
"ldr s28, [x21, #0x0]\n"
"176:" // Height 6: Partial accumulate: Done
- "sub x15, x15, x19\n"
+ "sub x16, x16, x20\n"
"b 179f\n"
"177:" // Height 6: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
"ldr q12, [x25, #0x0]\n"
"ldr q13, [x25, #0x10]\n"
"ldr q14, [x25, #0x20]\n"
@@ -2957,260 +2956,260 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"movi v30.16b, #0x0\n"
"movi v31.16b, #0x0\n"
"179:" // Height 6: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"180:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 181f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x14, 182f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
- "add x20, x20, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
+ "ldr x28, [x20, #0x28]\n"
+ "cbnz x15, 182f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n"
+ "add x11, x11, x20, LSL #2\n"
+ "add x10, x10, x20, LSL #2\n"
+ "add x9, x9, x20, LSL #2\n"
+ "add x28, x28, x20, LSL #2\n"
"b 182f\n"
"181:" // Height 6: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "add x22, x24, x19, LSL #2\n"
- "add x20, x22, x19, LSL #2\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21, LSL #2\n"
+ "add x11, x12, x21, LSL #2\n"
+ "add x10, x11, x21, LSL #2\n"
+ "add x9, x10, x21, LSL #2\n"
+ "add x28, x9, x21, LSL #2\n"
"182:" // Height 6: input setup done
- "cmp x13, #0x4\n"
+ "cmp x14, #0x4\n"
"blt 185f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x8\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x8\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
"ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
"blt 184f\n"
"183:" // Height 6: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr d7, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x28]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x18]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr x10, [x17, #0x28]\n"
+ "add x13, x13, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
"add x12, x12, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "mov v7.d[1], x11\n"
+ "add x11, x11, #0x10\n"
"fmla v28.4s, v6.4s, v5.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
"ldr d6, [x17, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr x11, [x17, #0x38]\n"
+ "ldr x21, [x17, #0x48]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr x9, [x12, #0x8]\n"
+ "add x10, x10, #0x10\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "mov v6.d[1], x10\n"
+ "add x9, x9, #0x10\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr x10, [x17, #0x48]\n"
+ "add x28, x28, #0x10\n"
"fmla v29.4s, v7.4s, v5.s[0]\n"
"ldr d7, [x17, #0x30]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "add x28, x28, #0x10\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr x20, [x17, #0x58]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
- "mov v7.d[1], x11\n"
+ "ldr x27, [x13, #0x8]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr x11, [x17, #0x58]\n"
+ "ldr x26, [x12, #0x8]\n"
"fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr x27, [x28, #0x8]\n"
+ "ldr x25, [x11, #0x8]\n"
"fmla v30.4s, v6.4s, v5.s[0]\n"
"ldr d6, [x17, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "add x26, x26, #0x10\n"
+ "mov v6.d[1], x21\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr x21, [x17, #0x68]\n"
"fmla v19.4s, v7.4s, v2.s[0]\n"
- "mov v6.d[1], x10\n"
+ "ldr x24, [x10, #0x8]\n"
"fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr x10, [x17, #0x68]\n"
+ "ldr x23, [x9, #0x8]\n"
"fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr x25, [x26, #0x8]\n"
+ "ldr x22, [x28, #0x8]\n"
"fmla v31.4s, v7.4s, v5.s[0]\n"
"ldr d7, [x17, #0x50]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
- "add x24, x24, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr x20, [x17, #0x78]\n"
"fmla v16.4s, v6.4s, v2.s[1]\n"
- "mov v7.d[1], x11\n"
+ "sub x14, x14, #0x4\n"
"fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr x11, [x17, #0x78]\n"
+ "cmp x14, #0x8\n"
"fmla v24.4s, v6.4s, v4.s[1]\n"
- "ldr x23, [x24, #0x8]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v28.4s, v6.4s, v5.s[1]\n"
"ldr d6, [x17, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
- "add x22, x22, #0x10\n"
+ "mov v6.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr x21, [x17, #0x88]\n"
"fmla v17.4s, v7.4s, v2.s[1]\n"
- "mov v6.d[1], x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[1]\n"
- "ldr x10, [x17, #0x88]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v25.4s, v7.4s, v4.s[1]\n"
- "ldr x21, [x22, #0x8]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v29.4s, v7.4s, v5.s[1]\n"
"ldr d7, [x17, #0x70]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
- "add x20, x20, #0x10\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr x20, [x17, #0x98]\n"
"fmla v18.4s, v6.4s, v2.s[1]\n"
- "mov v7.d[1], x11\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr x11, [x17, #0x98]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v26.4s, v6.4s, v4.s[1]\n"
- "ldr x19, [x20, #0x8]\n"
"fmla v30.4s, v6.4s, v5.s[1]\n"
"ldr d6, [x17, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
- "sub x13, x13, #0x4\n"
+ "mov v6.d[1], x21\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
- "cmp x13, #0x8\n"
+ "ldr x21, [x17, #0xa8]\n"
"fmla v19.4s, v7.4s, v2.s[1]\n"
- "mov v6.d[1], x10\n"
"fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr x10, [x17, #0xa8]\n"
"fmla v27.4s, v7.4s, v4.s[1]\n"
"fmla v31.4s, v7.4s, v5.s[1]\n"
"ldr d7, [x17, #0x90]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
"fmla v12.4s, v6.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xb8]\n"
"fmla v16.4s, v6.4s, v2.s[2]\n"
- "mov v7.d[1], x11\n"
"fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr x11, [x17, #0xb8]\n"
"fmla v24.4s, v6.4s, v4.s[2]\n"
"fmla v28.4s, v6.4s, v5.s[2]\n"
"ldr d6, [x17, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xc8]\n"
"fmla v17.4s, v7.4s, v2.s[2]\n"
- "mov v6.d[1], x10\n"
"fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr x10, [x17, #0xc8]\n"
"fmla v25.4s, v7.4s, v4.s[2]\n"
"fmla v29.4s, v7.4s, v5.s[2]\n"
"ldr d7, [x17, #0xb0]\n"
+ "mov v7.d[1], x20\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
"fmla v14.4s, v6.4s, v1.s[2]\n"
+ "ldr x20, [x17, #0xd8]\n"
"fmla v18.4s, v6.4s, v2.s[2]\n"
- "mov v7.d[1], x11\n"
"fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr x11, [x17, #0xd8]\n"
"fmla v26.4s, v6.4s, v4.s[2]\n"
"fmla v30.4s, v6.4s, v5.s[2]\n"
"ldr d6, [x17, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
+ "mov v6.d[1], x21\n"
"fmla v15.4s, v7.4s, v1.s[2]\n"
+ "ldr x21, [x17, #0xe8]\n"
"fmla v19.4s, v7.4s, v2.s[2]\n"
- "mov v6.d[1], x10\n"
"fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr x10, [x17, #0xe8]\n"
"fmla v27.4s, v7.4s, v4.s[2]\n"
"fmla v31.4s, v7.4s, v5.s[2]\n"
"ldr d7, [x17, #0xd0]\n"
+ "mov v7.d[1], x20\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
"fmla v12.4s, v6.4s, v1.s[3]\n"
+ "ldr x20, [x17, #0xf8]\n"
"fmla v16.4s, v6.4s, v2.s[3]\n"
- "mov v7.d[1], x11\n"
"fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr x11, [x17, #0xf8]\n"
"fmla v24.4s, v6.4s, v4.s[3]\n"
"fmla v28.4s, v6.4s, v5.s[3]\n"
"ldr d6, [x17, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
+ "mov v6.d[1], x21\n"
"fmla v13.4s, v7.4s, v1.s[3]\n"
"fmla v17.4s, v7.4s, v2.s[3]\n"
- "mov v6.d[1], x10\n"
"fmla v21.4s, v7.4s, v3.s[3]\n"
"fmla v25.4s, v7.4s, v4.s[3]\n"
"fmla v29.4s, v7.4s, v5.s[3]\n"
"ldr d7, [x17, #0xf0]\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "mov v7.d[1], x20\n"
"add x17, x17, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr x21, [x17, #0x8]\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
- "ldr x10, [x17, #0x8]\n"
+ "ldr x20, [x17, #0x18]\n"
"fmla v18.4s, v6.4s, v2.s[3]\n"
- "mov v7.d[1], x11\n"
"fmla v22.4s, v6.4s, v3.s[3]\n"
"fmla v26.4s, v6.4s, v4.s[3]\n"
"fmla v30.4s, v6.4s, v5.s[3]\n"
"ldr d6, [x17, #0x0]\n"
"fmla v11.4s, v7.4s, v0.s[3]\n"
- "ldr d0, [x12, #0x0]\n"
+ "ldr d0, [x13, #0x0]\n"
"fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr d1, [x28, #0x0]\n"
+ "ldr d1, [x12, #0x0]\n"
"fmla v19.4s, v7.4s, v2.s[3]\n"
- "mov v6.d[1], x10\n"
+ "ldr d2, [x11, #0x0]\n"
"fmla v23.4s, v7.4s, v3.s[3]\n"
- "mov v0.d[1], x9\n"
+ "ldr d3, [x10, #0x0]\n"
"fmla v27.4s, v7.4s, v4.s[3]\n"
- "mov v1.d[1], x27\n"
+ "ldr d4, [x9, #0x0]\n"
"fmla v31.4s, v7.4s, v5.s[3]\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "ldr d4, [x22, #0x0]\n"
+ "ldr d5, [x28, #0x0]\n"
+ "ldr d7, [x17, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x27\n"
+ "mov v1.d[1], x26\n"
"mov v2.d[1], x25\n"
- "ldr d5, [x20, #0x0]\n"
- "mov v3.d[1], x23\n"
- "mov v4.d[1], x21\n"
- "mov v5.d[1], x19\n"
+ "mov v3.d[1], x24\n"
+ "mov v4.d[1], x23\n"
+ "mov v5.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 183b\n"
"184:" // Height 6: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x17, #0x10]\n"
+ "add x13, x13, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "sub x13, x13, #0x4\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
"add x12, x12, #0x10\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x11, x11, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "add x28, x28, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v28.4s, v6.4s, v5.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
"ldr q6, [x17, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x28, x28, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "add x26, x26, #0x10\n"
+ "sub x14, x14, #0x4\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v29.4s, v7.4s, v5.s[0]\n"
"ldr q7, [x17, #0x30]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
- "add x20, x20, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"fmla v26.4s, v6.4s, v4.s[0]\n"
"fmla v30.4s, v6.4s, v5.s[0]\n"
"ldr q6, [x17, #0x40]\n"
@@ -3305,67 +3304,65 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmla v27.4s, v7.4s, v4.s[3]\n"
"fmla v31.4s, v7.4s, v5.s[3]\n"
"185:" // Height 6: Multiply loop: Main loop skip
- "cbz x13, 187f\n"
+ "cbz x14, 187f\n"
"186:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x1\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr s5, [x20], #0x4\n"
- "ldr q6, [x17, #0x0]\n"
- "ldr q7, [x17, #0x10]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "fmla v24.4s, v6.4s, v4.s[0]\n"
- "fmla v28.4s, v6.4s, v5.s[0]\n"
- "ldr q6, [x17, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v4.s[0]\n"
- "fmla v29.4s, v7.4s, v5.s[0]\n"
- "ldr q7, [x17, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr s7, [x13], #0x4\n"
+ "sub x14, x14, #0x1\n"
+ "ldr s6, [x12], #0x4\n"
+ "ldr s5, [x11], #0x4\n"
+ "ldr s4, [x10], #0x4\n"
+ "ldr s3, [x9], #0x4\n"
+ "ldr s2, [x28], #0x4\n"
+ "ldr q1, [x17, #0x0]\n"
+ "fmla v8.4s, v1.4s, v7.s[0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v12.4s, v1.4s, v6.s[0]\n"
+ "fmla v16.4s, v1.4s, v5.s[0]\n"
+ "fmla v20.4s, v1.4s, v4.s[0]\n"
+ "fmla v24.4s, v1.4s, v3.s[0]\n"
+ "fmla v28.4s, v1.4s, v2.s[0]\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmla v9.4s, v0.4s, v7.s[0]\n"
+ "fmla v13.4s, v0.4s, v6.s[0]\n"
+ "fmla v17.4s, v0.4s, v5.s[0]\n"
+ "fmla v21.4s, v0.4s, v4.s[0]\n"
+ "fmla v25.4s, v0.4s, v3.s[0]\n"
+ "fmla v29.4s, v0.4s, v2.s[0]\n"
+ "ldr q0, [x17, #0x30]\n"
+ "fmla v10.4s, v1.4s, v7.s[0]\n"
"add x17, x17, #0x40\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "fmla v30.4s, v6.4s, v5.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "fmla v31.4s, v7.4s, v5.s[0]\n"
- "cbnz x13, 186b\n"
+ "fmla v14.4s, v1.4s, v6.s[0]\n"
+ "fmla v18.4s, v1.4s, v5.s[0]\n"
+ "fmla v22.4s, v1.4s, v4.s[0]\n"
+ "fmla v26.4s, v1.4s, v3.s[0]\n"
+ "fmla v30.4s, v1.4s, v2.s[0]\n"
+ "fmla v11.4s, v0.4s, v7.s[0]\n"
+ "fmla v15.4s, v0.4s, v6.s[0]\n"
+ "fmla v19.4s, v0.4s, v5.s[0]\n"
+ "fmla v23.4s, v0.4s, v4.s[0]\n"
+ "fmla v27.4s, v0.4s, v3.s[0]\n"
+ "fmla v31.4s, v0.4s, v2.s[0]\n"
+ "cbnz x14, 186b\n"
"187:" // Height 6: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 180b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "add x25, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x16, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x16, #0x0]\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 188f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v1.4s }, [x20]\n"
- "ld1r { v0.4s }, [x19]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x20]\n"
"fmin v8.4s, v8.4s, v0.4s\n"
"fmin v9.4s, v9.4s, v0.4s\n"
"fmin v10.4s, v10.4s, v0.4s\n"
@@ -3376,16 +3373,6 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmin v15.4s, v15.4s, v0.4s\n"
"fmin v16.4s, v16.4s, v0.4s\n"
"fmin v17.4s, v17.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
- "fmax v16.4s, v16.4s, v1.4s\n"
- "fmax v17.4s, v17.4s, v1.4s\n"
"fmin v18.4s, v18.4s, v0.4s\n"
"fmin v19.4s, v19.4s, v0.4s\n"
"fmin v20.4s, v20.4s, v0.4s\n"
@@ -3396,30 +3383,42 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"fmin v25.4s, v25.4s, v0.4s\n"
"fmin v26.4s, v26.4s, v0.4s\n"
"fmin v27.4s, v27.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v1.4s\n"
- "fmax v19.4s, v19.4s, v1.4s\n"
- "fmax v20.4s, v20.4s, v1.4s\n"
- "fmax v21.4s, v21.4s, v1.4s\n"
- "fmax v22.4s, v22.4s, v1.4s\n"
- "fmax v23.4s, v23.4s, v1.4s\n"
- "fmax v24.4s, v24.4s, v1.4s\n"
- "fmax v25.4s, v25.4s, v1.4s\n"
- "fmax v26.4s, v26.4s, v1.4s\n"
- "fmax v27.4s, v27.4s, v1.4s\n"
"fmin v28.4s, v28.4s, v0.4s\n"
"fmin v29.4s, v29.4s, v0.4s\n"
"fmin v30.4s, v30.4s, v0.4s\n"
"fmin v31.4s, v31.4s, v0.4s\n"
- "fmax v28.4s, v28.4s, v1.4s\n"
- "fmax v29.4s, v29.4s, v1.4s\n"
- "fmax v30.4s, v30.4s, v1.4s\n"
- "fmax v31.4s, v31.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v0.4s\n"
+ "fmax v29.4s, v29.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v0.4s\n"
+ "fmax v31.4s, v31.4s, v0.4s\n"
"188:" // Height 6: No activation
"cmp x8, #0x10\n"
"bge 197f\n"
"tbz x8, #3, 192f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
+ "st1 { v8.4s }, [x16], #0x10\n"
+ "st1 { v9.4s }, [x16], #0x10\n"
"st1 { v12.4s }, [x25], #0x10\n"
"st1 { v13.4s }, [x25], #0x10\n"
"st1 { v16.4s }, [x24], #0x10\n"
@@ -3431,21 +3430,21 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"st1 { v28.4s }, [x21], #0x10\n"
"st1 { v29.4s }, [x21], #0x10\n"
"tbz x8, #2, 190f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
+ "st1 { v10.4s }, [x16], #0x10\n"
"st1 { v14.4s }, [x25], #0x10\n"
"st1 { v18.4s }, [x24], #0x10\n"
"st1 { v22.4s }, [x23], #0x10\n"
"st1 { v26.4s }, [x22], #0x10\n"
"st1 { v30.4s }, [x21], #0x10\n"
"tbz x8, #1, 189f\n"
- "str d11, [x15], #0x8\n"
+ "str d11, [x16], #0x8\n"
"str d15, [x25], #0x8\n"
"str d19, [x24], #0x8\n"
"str d23, [x23], #0x8\n"
"str d27, [x22], #0x8\n"
"str d31, [x21], #0x8\n"
"tbz x8, #0, 196f\n"
- "st1 { v11.s }[2], [x15]\n"
+ "st1 { v11.s }[2], [x16]\n"
"st1 { v15.s }[2], [x25]\n"
"st1 { v19.s }[2], [x24]\n"
"st1 { v23.s }[2], [x23]\n"
@@ -3454,7 +3453,7 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 196f\n"
"189:" // Height 6: Partial direct writeback: partial_1_12
"tbz x8, #0, 196f\n"
- "str s11, [x15, #0x0]\n"
+ "str s11, [x16, #0x0]\n"
"str s15, [x25, #0x0]\n"
"str s19, [x24, #0x0]\n"
"str s23, [x23, #0x0]\n"
@@ -3463,14 +3462,14 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 196f\n"
"190:" // Height 6: Partial direct writeback: partial_2_8
"tbz x8, #1, 191f\n"
- "str d10, [x15], #0x8\n"
+ "str d10, [x16], #0x8\n"
"str d14, [x25], #0x8\n"
"str d18, [x24], #0x8\n"
"str d22, [x23], #0x8\n"
"str d26, [x22], #0x8\n"
"str d30, [x21], #0x8\n"
"tbz x8, #0, 196f\n"
- "st1 { v10.s }[2], [x15]\n"
+ "st1 { v10.s }[2], [x16]\n"
"st1 { v14.s }[2], [x25]\n"
"st1 { v18.s }[2], [x24]\n"
"st1 { v22.s }[2], [x23]\n"
@@ -3479,7 +3478,7 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 196f\n"
"191:" // Height 6: Partial direct writeback: partial_1_8
"tbz x8, #0, 196f\n"
- "str s10, [x15, #0x0]\n"
+ "str s10, [x16, #0x0]\n"
"str s14, [x25, #0x0]\n"
"str s18, [x24, #0x0]\n"
"str s22, [x23, #0x0]\n"
@@ -3488,21 +3487,21 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 196f\n"
"192:" // Height 6: Partial direct writeback: partial_4_0
"tbz x8, #2, 194f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v8.4s }, [x16], #0x10\n"
"st1 { v12.4s }, [x25], #0x10\n"
"st1 { v16.4s }, [x24], #0x10\n"
"st1 { v20.4s }, [x23], #0x10\n"
"st1 { v24.4s }, [x22], #0x10\n"
"st1 { v28.4s }, [x21], #0x10\n"
"tbz x8, #1, 193f\n"
- "str d9, [x15], #0x8\n"
+ "str d9, [x16], #0x8\n"
"str d13, [x25], #0x8\n"
"str d17, [x24], #0x8\n"
"str d21, [x23], #0x8\n"
"str d25, [x22], #0x8\n"
"str d29, [x21], #0x8\n"
"tbz x8, #0, 196f\n"
- "st1 { v9.s }[2], [x15]\n"
+ "st1 { v9.s }[2], [x16]\n"
"st1 { v13.s }[2], [x25]\n"
"st1 { v17.s }[2], [x24]\n"
"st1 { v21.s }[2], [x23]\n"
@@ -3511,7 +3510,7 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 196f\n"
"193:" // Height 6: Partial direct writeback: partial_1_4
"tbz x8, #0, 196f\n"
- "str s9, [x15, #0x0]\n"
+ "str s9, [x16, #0x0]\n"
"str s13, [x25, #0x0]\n"
"str s17, [x24, #0x0]\n"
"str s21, [x23, #0x0]\n"
@@ -3520,14 +3519,14 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"b 196f\n"
"194:" // Height 6: Partial direct writeback: partial_2_0
"tbz x8, #1, 195f\n"
- "str d8, [x15], #0x8\n"
+ "str d8, [x16], #0x8\n"
"str d12, [x25], #0x8\n"
"str d16, [x24], #0x8\n"
"str d20, [x23], #0x8\n"
"str d24, [x22], #0x8\n"
"str d28, [x21], #0x8\n"
"tbz x8, #0, 196f\n"
- "st1 { v8.s }[2], [x15]\n"
+ "st1 { v8.s }[2], [x16]\n"
"st1 { v12.s }[2], [x25]\n"
"st1 { v16.s }[2], [x24]\n"
"st1 { v20.s }[2], [x23]\n"
@@ -3535,7 +3534,7 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"st1 { v28.s }[2], [x21]\n"
"b 196f\n"
"195:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
+ "str s8, [x16, #0x0]\n"
"str s12, [x25, #0x0]\n"
"str s16, [x24, #0x0]\n"
"str s20, [x23, #0x0]\n"
@@ -3544,11 +3543,11 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"196:" // Height 6: Partial direct writeback: Done
"b 198f\n"
"197:" // Height 6: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x16, #0x0]\n"
+ "str q9, [x16, #0x10]\n"
+ "str q10, [x16, #0x20]\n"
+ "str q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
"str q12, [x25, #0x0]\n"
"str q13, [x25, #0x10]\n"
"str q14, [x25, #0x20]\n"
@@ -3574,20 +3573,19 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"bgt 167b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 200f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 199f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"199:" // Update direct input
- "mov x19, #0x18\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x18\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"200:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
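
The hunks above make three mechanical changes to the a55 variant: the scratch GPRs are renumbered upward (x19 becomes x20/x21, and x19 drops out of the clobber list while x7 is added), the Cortex-A55 main loop keeps splitting each 128-bit weight load into a 64-bit `ldr d` plus a paired `ldr x`/`mov v.d[1]` insert that the in-order core can dual-issue, and the activation epilogue is reordered so all `fmin` clamps against `maxval` run before a single reload of `minval` drives all the `fmax` clamps, letting one vector register (v0) serve both bounds. As a minimal illustrative sketch, not part of the patch itself, the per-accumulator clamp that the `fmin`/`fmax` epilogue performs is equivalent to the following; `minval`/`maxval` mirror the KernelArgs fields referenced via `[offset_min]`/`[offset_max]` in the asm operands, while the function name is hypothetical:

```cpp
#include <arm_neon.h>

// Sketch of the activation clamp applied to each 4-lane accumulator
// (v8..v31 in the assembly). Assumes an AArch64 target with NEON.
static inline float32x4_t clamp_accumulator(float32x4_t acc,
                                            float minval, float maxval)
{
    const float32x4_t vmax = vdupq_n_f32(maxval); // ld1r { v0.4s }, [x20]
    const float32x4_t vmin = vdupq_n_f32(minval); // second ld1r after the fmin pass
    acc = vminq_f32(acc, vmax);                   // fmin vN.4s, vN.4s, v0.4s
    acc = vmaxq_f32(acc, vmin);                   // fmax vN.4s, vN.4s, v0.4s
    return acc;
}
```

Batching every `fmin` before any `fmax`, as the reordered epilogue does, keeps the two broadcast loads off the critical path and avoids holding both bounds live across the whole clamp sequence.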
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
index f5504b44d4..bb84a50282 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_6x16 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 166f\n"
@@ -102,82 +101,82 @@ void a64_hybrid_fp32_mla_6x16 (
"cmp %x[M], #0x2\n"
"bgt 67f\n"
"beq 34f\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x9, %x[bias]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "cbz x9, 3f\n"
- "ldr q8, [x9, #0x0]\n"
- "ldr q9, [x9, #0x10]\n"
- "ldr q10, [x9, #0x20]\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
+ "cbz x12, 3f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
"b 14f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 13f\n"
"cmp x11, #0x10\n"
"bge 12f\n"
"tbz x11, #3, 7f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
"tbz x11, #2, 5f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
"tbz x11, #1, 4f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x28], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "mov x20, #0x38\n"
"tbz x11, #0, 11f\n"
- "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v11.s }[2], [x9]\n"
"b 11f\n"
"4:" // Height 1: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 11f\n"
- "ldr s11, [x28, #0x0]\n"
+ "ldr s11, [x9, #0x0]\n"
"b 11f\n"
"5:" // Height 1: Partial accumulate: partial_2_8
"tbz x11, #1, 6f\n"
- "ldr d10, [x28], #0x8\n"
- "mov x19, #0x28\n"
+ "ldr d10, [x9], #0x8\n"
+ "mov x20, #0x28\n"
"tbz x11, #0, 11f\n"
- "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v10.s }[2], [x9]\n"
"b 11f\n"
"6:" // Height 1: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 11f\n"
- "ldr s10, [x28, #0x0]\n"
+ "ldr s10, [x9, #0x0]\n"
"b 11f\n"
"7:" // Height 1: Partial accumulate: partial_4_0
"tbz x11, #2, 9f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
"tbz x11, #1, 8f\n"
- "ldr d9, [x28], #0x8\n"
- "mov x19, #0x18\n"
+ "ldr d9, [x9], #0x8\n"
+ "mov x20, #0x18\n"
"tbz x11, #0, 11f\n"
- "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v9.s }[2], [x9]\n"
"b 11f\n"
"8:" // Height 1: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 11f\n"
- "ldr s9, [x28, #0x0]\n"
+ "ldr s9, [x9, #0x0]\n"
"b 11f\n"
"9:" // Height 1: Partial accumulate: partial_2_0
"tbz x11, #1, 10f\n"
- "ldr d8, [x28], #0x8\n"
- "mov x19, #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "mov x20, #0x8\n"
"tbz x11, #0, 11f\n"
- "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v8.s }[2], [x9]\n"
"b 11f\n"
"10:" // Height 1: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr s8, [x9, #0x0]\n"
+ "mov x20, #0x0\n"
"11:" // Height 1: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 14f\n"
"12:" // Height 1: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
"b 14f\n"
"13:" // Height 1: no accumulate
"movi v8.16b, #0x0\n"
@@ -185,300 +184,300 @@ void a64_hybrid_fp32_mla_6x16 (
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
"14:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 16f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 17f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 17f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
"b 17f\n"
"16:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x26, %x[input_ptr]\n"
"17:" // Height 1: input setup done
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"blt 20f\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 19f\n"
"18:" // Height 1: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "sub x26, x26, #0x4\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "cmp x26, #0x8\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "sub x27, x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "cmp x27, #0x8\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 18b\n"
"19:" // Height 1: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x4\n"
+ "ldr q17, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x25, x25, #0x10\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x4\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
"20:" // Height 1: Multiply loop: Main loop skip
- "cbz x26, 22f\n"
+ "cbz x27, 22f\n"
"21:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x1\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr q16, [x10, #0x0]\n"
+ "fmla v8.4s, v16.4s, v18.s[0]\n"
+ "sub x27, x27, #0x1\n"
+ "ldr q17, [x10, #0x10]\n"
+ "ldr q16, [x10, #0x20]\n"
+ "fmla v9.4s, v17.4s, v18.s[0]\n"
+ "fmla v10.4s, v16.4s, v18.s[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v11.4s, v16.4s, v18.s[0]\n"
"add x10, x10, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "cbnz x26, 21b\n"
+ "cbnz x27, 21b\n"
"22:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 15b\n"
- "prfm pstl1keep, [x28, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"tbz %x[flags], #1, 23f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
"23:" // Height 1: No activation
"cmp x11, #0x10\n"
"bge 32f\n"
"tbz x11, #3, 27f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
"tbz x11, #2, 25f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
"tbz x11, #1, 24f\n"
- "str d11, [x28], #0x8\n"
+ "str d11, [x9], #0x8\n"
"tbz x11, #0, 31f\n"
- "st1 { v11.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x9]\n"
"b 31f\n"
"24:" // Height 1: Partial direct writeback: partial_1_12
"tbz x11, #0, 31f\n"
- "str s11, [x28, #0x0]\n"
+ "str s11, [x9, #0x0]\n"
"b 31f\n"
"25:" // Height 1: Partial direct writeback: partial_2_8
"tbz x11, #1, 26f\n"
- "str d10, [x28], #0x8\n"
+ "str d10, [x9], #0x8\n"
"tbz x11, #0, 31f\n"
- "st1 { v10.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x9]\n"
"b 31f\n"
"26:" // Height 1: Partial direct writeback: partial_1_8
"tbz x11, #0, 31f\n"
- "str s10, [x28, #0x0]\n"
+ "str s10, [x9, #0x0]\n"
"b 31f\n"
"27:" // Height 1: Partial direct writeback: partial_4_0
"tbz x11, #2, 29f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
"tbz x11, #1, 28f\n"
- "str d9, [x28], #0x8\n"
+ "str d9, [x9], #0x8\n"
"tbz x11, #0, 31f\n"
- "st1 { v9.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x9]\n"
"b 31f\n"
"28:" // Height 1: Partial direct writeback: partial_1_4
"tbz x11, #0, 31f\n"
- "str s9, [x28, #0x0]\n"
+ "str s9, [x9, #0x0]\n"
"b 31f\n"
"29:" // Height 1: Partial direct writeback: partial_2_0
"tbz x11, #1, 30f\n"
- "str d8, [x28], #0x8\n"
+ "str d8, [x9], #0x8\n"
"tbz x11, #0, 31f\n"
- "st1 { v8.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x9]\n"
"b 31f\n"
"30:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
"31:" // Height 1: Partial direct writeback: Done
"b 33f\n"
"32:" // Height 1: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"33:" // Height 1: Writeback done
"subs x11, x11, #0x10\n"
"bgt 2b\n"
"b 200f\n"
"34:" // Height 2
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"35:" // Height 2: Column loop
- "cbz x9, 36f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 36f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "ldr q10, [x9, #0x20]\n"
"mov v13.16b, v9.16b\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "add x12, x12, #0x40\n"
"b 47f\n"
"36:" // Height 2: no bias
"tbz %x[flags], #0, 46f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x11, #0x10\n"
- "add x24, x28, x19, LSL #2\n"
+ "add x25, x9, x20, LSL #2\n"
"bge 45f\n"
"tbz x11, #3, 40f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
"tbz x11, #2, 38f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
"tbz x11, #1, 37f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
"tbz x11, #0, 44f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
"b 44f\n"
"37:" // Height 2: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 44f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
"b 44f\n"
"38:" // Height 2: Partial accumulate: partial_2_8
"tbz x11, #1, 39f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x19, #0x28\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
"tbz x11, #0, 44f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
"b 44f\n"
"39:" // Height 2: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 44f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
"b 44f\n"
"40:" // Height 2: Partial accumulate: partial_4_0
"tbz x11, #2, 42f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
"tbz x11, #1, 41f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
"tbz x11, #0, 44f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
"b 44f\n"
"41:" // Height 2: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 44f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
"b 44f\n"
"42:" // Height 2: Partial accumulate: partial_2_0
"tbz x11, #1, 43f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x19, #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
"tbz x11, #0, 44f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
"b 44f\n"
"43:" // Height 2: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s12, [x24, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
"44:" // Height 2: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 47f\n"
"45:" // Height 2: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
"b 47f\n"
"46:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
@@ -490,402 +489,402 @@ void a64_hybrid_fp32_mla_6x16 (
"movi v14.16b, #0x0\n"
"movi v15.16b, #0x0\n"
"47:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x27, 50f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 50f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
"b 50f\n"
"49:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #2\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
"50:" // Height 2: input setup done
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"blt 53f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x8\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x8\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 52f\n"
"51:" // Height 2: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x24, x24, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
+ "sub x27, x27, #0x4\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x4\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "cmp x26, #0x8\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "fmla v14.4s, v17.4s, v1.s[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "fmla v15.4s, v16.4s, v1.s[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "cmp x27, #0x8\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "fmla v12.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "fmla v13.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "fmla v14.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "fmla v15.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "fmla v12.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "fmla v13.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "fmla v14.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "fmla v15.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v1.s[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v1.s[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v14.4s, v17.4s, v1.s[3]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "ldr q0, [x25, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr q1, [x24, #0x0]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.4s, v16.4s, v1.s[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x4\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x25, x25, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
+ "add x26, x26, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v10.4s, v17.4s, v0.s[0]\n"
+ "fmla v14.4s, v17.4s, v1.s[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "sub x27, x27, #0x4\n"
+ "fmla v11.4s, v16.4s, v0.s[0]\n"
+ "fmla v15.4s, v16.4s, v1.s[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v8.4s, v17.4s, v0.s[1]\n"
+ "fmla v12.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v9.4s, v16.4s, v0.s[1]\n"
+ "fmla v13.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "fmla v10.4s, v17.4s, v0.s[1]\n"
+ "fmla v14.4s, v17.4s, v1.s[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ "fmla v11.4s, v16.4s, v0.s[1]\n"
+ "fmla v15.4s, v16.4s, v1.s[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ "fmla v8.4s, v17.4s, v0.s[2]\n"
+ "fmla v12.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ "fmla v9.4s, v16.4s, v0.s[2]\n"
+ "fmla v13.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ "fmla v10.4s, v17.4s, v0.s[2]\n"
+ "fmla v14.4s, v17.4s, v1.s[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ "fmla v11.4s, v16.4s, v0.s[2]\n"
+ "fmla v15.4s, v16.4s, v1.s[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ "fmla v8.4s, v17.4s, v0.s[3]\n"
+ "fmla v12.4s, v17.4s, v1.s[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ "fmla v9.4s, v16.4s, v0.s[3]\n"
+ "fmla v13.4s, v16.4s, v1.s[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v10.4s, v17.4s, v0.s[3]\n"
+ "fmla v14.4s, v17.4s, v1.s[3]\n"
+ "fmla v11.4s, v16.4s, v0.s[3]\n"
+ "fmla v15.4s, v16.4s, v1.s[3]\n"
"53:" // Height 2: Multiply loop: Main loop skip
- "cbz x26, 55f\n"
+ "cbz x27, 55f\n"
"54:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x1\n"
- "ldr s1, [x24], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "sub x27, x27, #0x1\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ "fmla v8.4s, v17.4s, v19.s[0]\n"
+ "fmla v12.4s, v17.4s, v18.s[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "fmla v9.4s, v16.4s, v19.s[0]\n"
+ "fmla v13.4s, v16.4s, v18.s[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "fmla v10.4s, v17.4s, v19.s[0]\n"
+ "fmla v14.4s, v17.4s, v18.s[0]\n"
"add x10, x10, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "cbnz x26, 54b\n"
+ "fmla v11.4s, v16.4s, v19.s[0]\n"
+ "fmla v15.4s, v16.4s, v18.s[0]\n"
+ "cbnz x27, 54b\n"
"55:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 48b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 56f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v17.4s\n"
+ "fmin v9.4s, v9.4s, v17.4s\n"
+ "fmin v10.4s, v10.4s, v17.4s\n"
+ "fmin v11.4s, v11.4s, v17.4s\n"
+ "fmin v12.4s, v12.4s, v17.4s\n"
+ "fmin v13.4s, v13.4s, v17.4s\n"
+ "fmin v14.4s, v14.4s, v17.4s\n"
+ "fmin v15.4s, v15.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v16.4s\n"
+ "fmax v11.4s, v11.4s, v16.4s\n"
+ "fmax v12.4s, v12.4s, v16.4s\n"
+ "fmax v13.4s, v13.4s, v16.4s\n"
+ "fmax v14.4s, v14.4s, v16.4s\n"
+ "fmax v15.4s, v15.4s, v16.4s\n"
"56:" // Height 2: No activation
"cmp x11, #0x10\n"
"bge 65f\n"
"tbz x11, #3, 60f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
"tbz x11, #2, 58f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
"tbz x11, #1, 57f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
"tbz x11, #0, 64f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x24]\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x25]\n"
"b 64f\n"
"57:" // Height 2: Partial direct writeback: partial_1_12
"tbz x11, #0, 64f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x24, #0x0]\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
"b 64f\n"
"58:" // Height 2: Partial direct writeback: partial_2_8
"tbz x11, #1, 59f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
"tbz x11, #0, 64f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x24]\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x25]\n"
"b 64f\n"
"59:" // Height 2: Partial direct writeback: partial_1_8
"tbz x11, #0, 64f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x24, #0x0]\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
"b 64f\n"
"60:" // Height 2: Partial direct writeback: partial_4_0
"tbz x11, #2, 62f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
"tbz x11, #1, 61f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
"tbz x11, #0, 64f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x24]\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x25]\n"
"b 64f\n"
"61:" // Height 2: Partial direct writeback: partial_1_4
"tbz x11, #0, 64f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x24, #0x0]\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
"b 64f\n"
"62:" // Height 2: Partial direct writeback: partial_2_0
"tbz x11, #1, 63f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
"tbz x11, #0, 64f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x24]\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x25]\n"
"b 64f\n"
"63:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x24, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
"64:" // Height 2: Partial direct writeback: Done
"b 66f\n"
"65:" // Height 2: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
"66:" // Height 2: Writeback done
"subs x11, x11, #0x10\n"
"bgt 35b\n"
"b 200f\n"
"67:" // Height 3
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"68:" // Height 3: Column loop
- "cbz x9, 69f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 69f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "mov v16.16b, v8.16b\n"
- "ldr q10, [x9, #0x20]\n"
- "ldr q11, [x9, #0x30]\n"
"mov v13.16b, v9.16b\n"
- "add x9, x9, #0x40\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"b 80f\n"
"69:" // Height 3: no bias
"tbz %x[flags], #0, 79f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"cmp x11, #0x10\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"bge 78f\n"
"tbz x11, #3, 73f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
"tbz x11, #2, 71f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
"tbz x11, #1, 70f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d19, [x23], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d19, [x24], #0x8\n"
"tbz x11, #0, 77f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
"b 77f\n"
"70:" // Height 3: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 77f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
"b 77f\n"
"71:" // Height 3: Partial accumulate: partial_2_8
"tbz x11, #1, 72f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x19, #0x28\n"
- "ldr d18, [x23], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
"tbz x11, #0, 77f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
"b 77f\n"
"72:" // Height 3: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 77f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
"b 77f\n"
"73:" // Height 3: Partial accumulate: partial_4_0
"tbz x11, #2, 75f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
"tbz x11, #1, 74f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d17, [x24], #0x8\n"
"tbz x11, #0, 77f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
"b 77f\n"
"74:" // Height 3: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 77f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s17, [x23, #0x0]\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
"b 77f\n"
"75:" // Height 3: Partial accumulate: partial_2_0
"tbz x11, #1, 76f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d16, [x23], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
"tbz x11, #0, 77f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
"b 77f\n"
"76:" // Height 3: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s12, [x24, #0x0]\n"
- "ldr s16, [x23, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s16, [x24, #0x0]\n"
"77:" // Height 3: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 80f\n"
"78:" // Height 3: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
"b 80f\n"
"79:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
@@ -901,503 +900,503 @@ void a64_hybrid_fp32_mla_6x16 (
"movi v18.16b, #0x0\n"
"movi v19.16b, #0x0\n"
"80:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"81:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 82f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "cbnz x27, 83f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 83f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
"b 83f\n"
"82:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"83:" // Height 3: input setup done
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"blt 86f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x8\n"
- "ldr q2, [x23, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "ldr q2, [x24, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 85f\n"
"84:" // Height 3: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x4\n"
+ "add x26, x26, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "ldr q21, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x26, x26, #0x4\n"
+ "add x25, x25, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "cmp x26, #0x8\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v10.4s, v21.4s, v0.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[0]\n"
+ "cmp x27, #0x8\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v18.4s, v21.4s, v2.s[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ "fmla v11.4s, v20.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v15.4s, v20.4s, v1.s[0]\n"
+ "fmla v19.4s, v20.4s, v2.s[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v12.4s, v21.4s, v1.s[1]\n"
+ "fmla v16.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v17.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ "fmla v10.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v18.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ "fmla v11.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "fmla v19.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ "fmla v8.4s, v21.4s, v0.s[2]\n"
+ "fmla v12.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[2]\n"
+ "fmla v13.4s, v20.4s, v1.s[2]\n"
+ "fmla v17.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v21.4s, v1.s[2]\n"
+ "fmla v18.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "fmla v15.4s, v20.4s, v1.s[2]\n"
+ "fmla v19.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v12.4s, v21.4s, v1.s[3]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "fmla v13.4s, v20.4s, v1.s[3]\n"
+ "fmla v17.4s, v20.4s, v2.s[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v10.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "fmla v18.4s, v21.4s, v2.s[3]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "ldr q0, [x25, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr q1, [x24, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "ldr q2, [x23, #0x0]\n"
+ "fmla v11.4s, v20.4s, v0.s[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.4s, v20.4s, v2.s[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 84b\n"
"85:" // Height 3: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x4\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "add x24, x24, #0x10\n"
+ "ldr q21, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "sub x27, x27, #0x4\n"
+ "fmla v10.4s, v21.4s, v0.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v18.4s, v21.4s, v2.s[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ "fmla v11.4s, v20.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v15.4s, v20.4s, v1.s[0]\n"
+ "fmla v19.4s, v20.4s, v2.s[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ "fmla v8.4s, v21.4s, v0.s[1]\n"
+ "fmla v12.4s, v21.4s, v1.s[1]\n"
+ "fmla v16.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ "fmla v9.4s, v20.4s, v0.s[1]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v17.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ "fmla v10.4s, v21.4s, v0.s[1]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v18.4s, v21.4s, v2.s[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ "fmla v11.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v20.4s, v1.s[1]\n"
+ "fmla v19.4s, v20.4s, v2.s[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ "fmla v8.4s, v21.4s, v0.s[2]\n"
+ "fmla v12.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ "fmla v9.4s, v20.4s, v0.s[2]\n"
+ "fmla v13.4s, v20.4s, v1.s[2]\n"
+ "fmla v17.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ "fmla v10.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v21.4s, v1.s[2]\n"
+ "fmla v18.4s, v21.4s, v2.s[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ "fmla v11.4s, v20.4s, v0.s[2]\n"
+ "fmla v15.4s, v20.4s, v1.s[2]\n"
+ "fmla v19.4s, v20.4s, v2.s[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ "fmla v8.4s, v21.4s, v0.s[3]\n"
+ "fmla v12.4s, v21.4s, v1.s[3]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ "fmla v9.4s, v20.4s, v0.s[3]\n"
+ "fmla v13.4s, v20.4s, v1.s[3]\n"
+ "fmla v17.4s, v20.4s, v2.s[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v10.4s, v21.4s, v0.s[3]\n"
+ "fmla v14.4s, v21.4s, v1.s[3]\n"
+ "fmla v18.4s, v21.4s, v2.s[3]\n"
+ "fmla v11.4s, v20.4s, v0.s[3]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "fmla v19.4s, v20.4s, v2.s[3]\n"
"86:" // Height 3: Multiply loop: Main loop skip
- "cbz x26, 88f\n"
+ "cbz x27, 88f\n"
"87:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x1\n"
- "ldr s1, [x24], #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
+ "sub x27, x27, #0x1\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr q21, [x10, #0x0]\n"
+ "fmla v8.4s, v21.4s, v24.s[0]\n"
+ "fmla v12.4s, v21.4s, v23.s[0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ "fmla v16.4s, v21.4s, v22.s[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ "fmla v9.4s, v20.4s, v24.s[0]\n"
+ "fmla v13.4s, v20.4s, v23.s[0]\n"
+ "fmla v17.4s, v20.4s, v22.s[0]\n"
+ "ldr q20, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "cbnz x26, 87b\n"
+ "fmla v10.4s, v21.4s, v24.s[0]\n"
+ "fmla v14.4s, v21.4s, v23.s[0]\n"
+ "fmla v18.4s, v21.4s, v22.s[0]\n"
+ "fmla v11.4s, v20.4s, v24.s[0]\n"
+ "fmla v15.4s, v20.4s, v23.s[0]\n"
+ "fmla v19.4s, v20.4s, v22.s[0]\n"
+ "cbnz x27, 87b\n"
"88:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 81b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 89f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
- "fmax v16.4s, v16.4s, v1.4s\n"
- "fmax v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v1.4s\n"
- "fmax v19.4s, v19.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v21.4s\n"
+ "fmin v9.4s, v9.4s, v21.4s\n"
+ "fmin v10.4s, v10.4s, v21.4s\n"
+ "fmin v11.4s, v11.4s, v21.4s\n"
+ "fmin v12.4s, v12.4s, v21.4s\n"
+ "fmin v13.4s, v13.4s, v21.4s\n"
+ "fmin v14.4s, v14.4s, v21.4s\n"
+ "fmin v15.4s, v15.4s, v21.4s\n"
+ "fmin v16.4s, v16.4s, v21.4s\n"
+ "fmin v17.4s, v17.4s, v21.4s\n"
+ "fmin v18.4s, v18.4s, v21.4s\n"
+ "fmin v19.4s, v19.4s, v21.4s\n"
+ "fmax v8.4s, v8.4s, v20.4s\n"
+ "fmax v9.4s, v9.4s, v20.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "fmax v13.4s, v13.4s, v20.4s\n"
+ "fmax v14.4s, v14.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v20.4s\n"
+ "fmax v16.4s, v16.4s, v20.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v18.4s, v18.4s, v20.4s\n"
+ "fmax v19.4s, v19.4s, v20.4s\n"
"89:" // Height 3: No activation
"cmp x11, #0x10\n"
"bge 98f\n"
"tbz x11, #3, 93f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
"tbz x11, #2, 91f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
"tbz x11, #1, 90f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
"tbz x11, #0, 97f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
"b 97f\n"
"90:" // Height 3: Partial direct writeback: partial_1_12
"tbz x11, #0, 97f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
"b 97f\n"
"91:" // Height 3: Partial direct writeback: partial_2_8
"tbz x11, #1, 92f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
"tbz x11, #0, 97f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v18.s }[2], [x23]\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
"b 97f\n"
"92:" // Height 3: Partial direct writeback: partial_1_8
"tbz x11, #0, 97f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s18, [x23, #0x0]\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
"b 97f\n"
"93:" // Height 3: Partial direct writeback: partial_4_0
"tbz x11, #2, 95f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
"tbz x11, #1, 94f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
"tbz x11, #0, 97f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v17.s }[2], [x23]\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
"b 97f\n"
"94:" // Height 3: Partial direct writeback: partial_1_4
"tbz x11, #0, 97f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s17, [x23, #0x0]\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
"b 97f\n"
"95:" // Height 3: Partial direct writeback: partial_2_0
"tbz x11, #1, 96f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x11, #0, 97f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v16.s }[2], [x23]\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
"b 97f\n"
"96:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s16, [x23, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
"97:" // Height 3: Partial direct writeback: Done
"b 99f\n"
"98:" // Height 3: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
"99:" // Height 3: Writeback done
"subs x11, x11, #0x10\n"
"bgt 68b\n"
"b 200f\n"
"100:" // Height 4
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"101:" // Height 4: Column loop
- "cbz x9, 102f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 102f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "mov v16.16b, v8.16b\n"
- "ldr q10, [x9, #0x20]\n"
- "mov v20.16b, v8.16b\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
"b 113f\n"
"102:" // Height 4: no bias
"tbz %x[flags], #0, 112f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"cmp x11, #0x10\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 111f\n"
"tbz x11, #3, 106f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
"tbz x11, #2, 104f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
"tbz x11, #1, 103f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"tbz x11, #0, 110f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
"b 110f\n"
"103:" // Height 4: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 110f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "ldr s23, [x22, #0x0]\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
"b 110f\n"
"104:" // Height 4: Partial accumulate: partial_2_8
"tbz x11, #1, 105f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x19, #0x28\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
"tbz x11, #0, 110f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
"b 110f\n"
"105:" // Height 4: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 110f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "ldr s22, [x22, #0x0]\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
"b 110f\n"
"106:" // Height 4: Partial accumulate: partial_4_0
"tbz x11, #2, 108f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
"tbz x11, #1, 107f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
"tbz x11, #0, 110f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
"b 110f\n"
"107:" // Height 4: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 110f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s17, [x23, #0x0]\n"
- "ldr s21, [x22, #0x0]\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
"b 110f\n"
"108:" // Height 4: Partial accumulate: partial_2_0
"tbz x11, #1, 109f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d16, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
"tbz x11, #0, 110f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v16.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
"b 110f\n"
"109:" // Height 4: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s12, [x24, #0x0]\n"
- "ldr s16, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
"110:" // Height 4: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 113f\n"
"111:" // Height 4: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
"b 113f\n"
"112:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
@@ -1417,604 +1416,604 @@ void a64_hybrid_fp32_mla_6x16 (
"movi v22.16b, #0x0\n"
"movi v23.16b, #0x0\n"
"113:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"114:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 115f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "cbnz x27, 116f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 116f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
"b 116f\n"
"115:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"116:" // Height 4: input setup done
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"blt 119f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x8\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 118f\n"
"117:" // Height 4: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x4\n"
+ "add x26, x26, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x22, x22, #0x10\n"
+ "ldr q25, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x26, x26, #0x4\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x26, #0x8\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "cmp x27, #0x8\n"
+ "fmla v10.4s, v25.4s, v0.s[0]\n"
+ "fmla v14.4s, v25.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v18.4s, v25.4s, v2.s[0]\n"
+ "fmla v22.4s, v25.4s, v3.s[0]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v11.4s, v24.4s, v0.s[0]\n"
+ "fmla v15.4s, v24.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v19.4s, v24.4s, v2.s[0]\n"
+ "fmla v23.4s, v24.4s, v3.s[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ "fmla v8.4s, v25.4s, v0.s[1]\n"
+ "fmla v12.4s, v25.4s, v1.s[1]\n"
+ "fmla v16.4s, v25.4s, v2.s[1]\n"
+ "fmla v20.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ "fmla v9.4s, v24.4s, v0.s[1]\n"
+ "fmla v13.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v21.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ "fmla v10.4s, v25.4s, v0.s[1]\n"
+ "fmla v14.4s, v25.4s, v1.s[1]\n"
+ "fmla v18.4s, v25.4s, v2.s[1]\n"
+ "fmla v22.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ "fmla v11.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v1.s[1]\n"
+ "fmla v19.4s, v24.4s, v2.s[1]\n"
+ "fmla v23.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ "fmla v8.4s, v25.4s, v0.s[2]\n"
+ "fmla v12.4s, v25.4s, v1.s[2]\n"
+ "fmla v16.4s, v25.4s, v2.s[2]\n"
+ "fmla v20.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ "fmla v9.4s, v24.4s, v0.s[2]\n"
+ "fmla v13.4s, v24.4s, v1.s[2]\n"
+ "fmla v17.4s, v24.4s, v2.s[2]\n"
+ "fmla v21.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ "fmla v10.4s, v25.4s, v0.s[2]\n"
+ "fmla v14.4s, v25.4s, v1.s[2]\n"
+ "fmla v18.4s, v25.4s, v2.s[2]\n"
+ "fmla v22.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ "fmla v11.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v1.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[2]\n"
+ "fmla v23.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ "fmla v8.4s, v25.4s, v0.s[3]\n"
+ "fmla v12.4s, v25.4s, v1.s[3]\n"
+ "fmla v16.4s, v25.4s, v2.s[3]\n"
+ "fmla v20.4s, v25.4s, v3.s[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ "fmla v9.4s, v24.4s, v0.s[3]\n"
+ "fmla v13.4s, v24.4s, v1.s[3]\n"
+ "fmla v17.4s, v24.4s, v2.s[3]\n"
+ "fmla v21.4s, v24.4s, v3.s[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v10.4s, v25.4s, v0.s[3]\n"
+ "fmla v14.4s, v25.4s, v1.s[3]\n"
+ "fmla v18.4s, v25.4s, v2.s[3]\n"
+ "fmla v22.4s, v25.4s, v3.s[3]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "ldr q0, [x25, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr q1, [x24, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "ldr q2, [x23, #0x0]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
- "ldr q3, [x22, #0x0]\n"
+ "fmla v11.4s, v24.4s, v0.s[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.4s, v24.4s, v1.s[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "fmla v23.4s, v24.4s, v3.s[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 117b\n"
"118:" // Height 4: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x4\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "add x24, x24, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
+ "ldr q25, [x10, #0x20]\n"
+ "add x24, x24, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x22, x22, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "sub x27, x27, #0x4\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v10.4s, v25.4s, v0.s[0]\n"
+ "fmla v14.4s, v25.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v18.4s, v25.4s, v2.s[0]\n"
+ "fmla v22.4s, v25.4s, v3.s[0]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v11.4s, v24.4s, v0.s[0]\n"
+ "fmla v15.4s, v24.4s, v1.s[0]\n"
+ "fmla v19.4s, v24.4s, v2.s[0]\n"
+ "fmla v23.4s, v24.4s, v3.s[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ "fmla v8.4s, v25.4s, v0.s[1]\n"
+ "fmla v12.4s, v25.4s, v1.s[1]\n"
+ "fmla v16.4s, v25.4s, v2.s[1]\n"
+ "fmla v20.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ "fmla v9.4s, v24.4s, v0.s[1]\n"
+ "fmla v13.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v21.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ "fmla v10.4s, v25.4s, v0.s[1]\n"
+ "fmla v14.4s, v25.4s, v1.s[1]\n"
+ "fmla v18.4s, v25.4s, v2.s[1]\n"
+ "fmla v22.4s, v25.4s, v3.s[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ "fmla v11.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v1.s[1]\n"
+ "fmla v19.4s, v24.4s, v2.s[1]\n"
+ "fmla v23.4s, v24.4s, v3.s[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ "fmla v8.4s, v25.4s, v0.s[2]\n"
+ "fmla v12.4s, v25.4s, v1.s[2]\n"
+ "fmla v16.4s, v25.4s, v2.s[2]\n"
+ "fmla v20.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ "fmla v9.4s, v24.4s, v0.s[2]\n"
+ "fmla v13.4s, v24.4s, v1.s[2]\n"
+ "fmla v17.4s, v24.4s, v2.s[2]\n"
+ "fmla v21.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ "fmla v10.4s, v25.4s, v0.s[2]\n"
+ "fmla v14.4s, v25.4s, v1.s[2]\n"
+ "fmla v18.4s, v25.4s, v2.s[2]\n"
+ "fmla v22.4s, v25.4s, v3.s[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ "fmla v11.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v1.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[2]\n"
+ "fmla v23.4s, v24.4s, v3.s[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ "fmla v8.4s, v25.4s, v0.s[3]\n"
+ "fmla v12.4s, v25.4s, v1.s[3]\n"
+ "fmla v16.4s, v25.4s, v2.s[3]\n"
+ "fmla v20.4s, v25.4s, v3.s[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ "fmla v9.4s, v24.4s, v0.s[3]\n"
+ "fmla v13.4s, v24.4s, v1.s[3]\n"
+ "fmla v17.4s, v24.4s, v2.s[3]\n"
+ "fmla v21.4s, v24.4s, v3.s[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v10.4s, v25.4s, v0.s[3]\n"
+ "fmla v14.4s, v25.4s, v1.s[3]\n"
+ "fmla v18.4s, v25.4s, v2.s[3]\n"
+ "fmla v22.4s, v25.4s, v3.s[3]\n"
+ "fmla v11.4s, v24.4s, v0.s[3]\n"
+ "fmla v15.4s, v24.4s, v1.s[3]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "fmla v23.4s, v24.4s, v3.s[3]\n"
"119:" // Height 4: Multiply loop: Main loop skip
- "cbz x26, 121f\n"
+ "cbz x27, 121f\n"
"120:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x1\n"
- "ldr s1, [x24], #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "sub x27, x27, #0x1\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ "fmla v8.4s, v25.4s, v29.s[0]\n"
+ "fmla v12.4s, v25.4s, v28.s[0]\n"
+ "fmla v16.4s, v25.4s, v27.s[0]\n"
+ "fmla v20.4s, v25.4s, v26.s[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ "fmla v9.4s, v24.4s, v29.s[0]\n"
+ "fmla v13.4s, v24.4s, v28.s[0]\n"
+ "fmla v17.4s, v24.4s, v27.s[0]\n"
+ "fmla v21.4s, v24.4s, v26.s[0]\n"
+ "ldr q24, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "cbnz x26, 120b\n"
+ "fmla v10.4s, v25.4s, v29.s[0]\n"
+ "fmla v14.4s, v25.4s, v28.s[0]\n"
+ "fmla v18.4s, v25.4s, v27.s[0]\n"
+ "fmla v22.4s, v25.4s, v26.s[0]\n"
+ "fmla v11.4s, v24.4s, v29.s[0]\n"
+ "fmla v15.4s, v24.4s, v28.s[0]\n"
+ "fmla v19.4s, v24.4s, v27.s[0]\n"
+ "fmla v23.4s, v24.4s, v26.s[0]\n"
+ "cbnz x27, 120b\n"
"121:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 114b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 122f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
- "fmax v16.4s, v16.4s, v1.4s\n"
- "fmax v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmin v20.4s, v20.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v1.4s\n"
- "fmax v19.4s, v19.4s, v1.4s\n"
- "fmax v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v0.4s\n"
- "fmin v22.4s, v22.4s, v0.4s\n"
- "fmin v23.4s, v23.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v1.4s\n"
- "fmax v22.4s, v22.4s, v1.4s\n"
- "fmax v23.4s, v23.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v25.4s\n"
+ "fmin v9.4s, v9.4s, v25.4s\n"
+ "fmin v10.4s, v10.4s, v25.4s\n"
+ "fmin v11.4s, v11.4s, v25.4s\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmin v20.4s, v20.4s, v25.4s\n"
+ "fmin v21.4s, v21.4s, v25.4s\n"
+ "fmin v22.4s, v22.4s, v25.4s\n"
+ "fmin v23.4s, v23.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v24.4s\n"
+ "fmax v9.4s, v9.4s, v24.4s\n"
+ "fmax v10.4s, v10.4s, v24.4s\n"
+ "fmax v11.4s, v11.4s, v24.4s\n"
+ "fmax v12.4s, v12.4s, v24.4s\n"
+ "fmax v13.4s, v13.4s, v24.4s\n"
+ "fmax v14.4s, v14.4s, v24.4s\n"
+ "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v24.4s\n"
+ "fmax v17.4s, v17.4s, v24.4s\n"
+ "fmax v18.4s, v18.4s, v24.4s\n"
+ "fmax v19.4s, v19.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v24.4s\n"
+ "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v24.4s\n"
+ "fmax v23.4s, v23.4s, v24.4s\n"
"122:" // Height 4: No activation
"cmp x11, #0x10\n"
"bge 131f\n"
"tbz x11, #3, 126f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v17.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
"tbz x11, #2, 124f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v18.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
"tbz x11, #1, 123f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
"tbz x11, #0, 130f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
- "st1 { v23.s }[2], [x22]\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
"b 130f\n"
"123:" // Height 4: Partial direct writeback: partial_1_12
"tbz x11, #0, 130f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
- "str s23, [x22, #0x0]\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
"b 130f\n"
"124:" // Height 4: Partial direct writeback: partial_2_8
"tbz x11, #1, 125f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
"tbz x11, #0, 130f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v18.s }[2], [x23]\n"
- "st1 { v22.s }[2], [x22]\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
"b 130f\n"
"125:" // Height 4: Partial direct writeback: partial_1_8
"tbz x11, #0, 130f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s18, [x23, #0x0]\n"
- "str s22, [x22, #0x0]\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
"b 130f\n"
"126:" // Height 4: Partial direct writeback: partial_4_0
"tbz x11, #2, 128f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
"tbz x11, #1, 127f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
"tbz x11, #0, 130f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v17.s }[2], [x23]\n"
- "st1 { v21.s }[2], [x22]\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
"b 130f\n"
"127:" // Height 4: Partial direct writeback: partial_1_4
"tbz x11, #0, 130f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s17, [x23, #0x0]\n"
- "str s21, [x22, #0x0]\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
"b 130f\n"
"128:" // Height 4: Partial direct writeback: partial_2_0
"tbz x11, #1, 129f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x11, #0, 130f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v16.s }[2], [x23]\n"
- "st1 { v20.s }[2], [x22]\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
"b 130f\n"
"129:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s16, [x23, #0x0]\n"
- "str s20, [x22, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
"130:" // Height 4: Partial direct writeback: Done
"b 132f\n"
"131:" // Height 4: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
"132:" // Height 4: Writeback done
"subs x11, x11, #0x10\n"
"bgt 101b\n"
"b 200f\n"
"133:" // Height 5
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"134:" // Height 5: Column loop
- "cbz x9, 135f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 135f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "mov v16.16b, v8.16b\n"
- "ldr q10, [x9, #0x20]\n"
- "mov v20.16b, v8.16b\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "mov v24.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
+ "mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
"mov v27.16b, v11.16b\n"
"b 146f\n"
"135:" // Height 5: no bias
"tbz %x[flags], #0, 145f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"cmp x11, #0x10\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 144f\n"
"tbz x11, #3, 139f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v24.4s }, [x21], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v21.4s }, [x22], #0x10\n"
- "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
"tbz x11, #2, 137f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "ld1 { v22.4s }, [x22], #0x10\n"
- "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
"tbz x11, #1, 136f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d27, [x21], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
"tbz x11, #0, 143f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "ld1 { v23.s }[2], [x22]\n"
- "ld1 { v27.s }[2], [x21]\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
"b 143f\n"
"136:" // Height 5: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 143f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "ldr s23, [x22, #0x0]\n"
- "ldr s27, [x21, #0x0]\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
"b 143f\n"
"137:" // Height 5: Partial accumulate: partial_2_8
"tbz x11, #1, 138f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x19, #0x28\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"tbz x11, #0, 143f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "ld1 { v22.s }[2], [x22]\n"
- "ld1 { v26.s }[2], [x21]\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
"b 143f\n"
"138:" // Height 5: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 143f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "ldr s22, [x22, #0x0]\n"
- "ldr s26, [x21, #0x0]\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
"b 143f\n"
"139:" // Height 5: Partial accumulate: partial_4_0
"tbz x11, #2, 141f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
"tbz x11, #1, 140f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d25, [x21], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz x11, #0, 143f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "ld1 { v21.s }[2], [x22]\n"
- "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
"b 143f\n"
"140:" // Height 5: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 143f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s17, [x23, #0x0]\n"
- "ldr s21, [x22, #0x0]\n"
- "ldr s25, [x21, #0x0]\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
"b 143f\n"
"141:" // Height 5: Partial accumulate: partial_2_0
"tbz x11, #1, 142f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d16, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d24, [x21], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
"tbz x11, #0, 143f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v16.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
- "ld1 { v24.s }[2], [x21]\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
"b 143f\n"
"142:" // Height 5: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s12, [x24, #0x0]\n"
- "ldr s16, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
- "ldr s24, [x21, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
"143:" // Height 5: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 146f\n"
"144:" // Height 5: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
"b 146f\n"
"145:" // Height 5: no accumulate
"movi v8.16b, #0x0\n"
@@ -2038,708 +2037,708 @@ void a64_hybrid_fp32_mla_6x16 (
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
"146:" // Height 5: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"147:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 148f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "cbnz x27, 149f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 149f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
"b 149f\n"
"148:" // Height 5: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"149:" // Height 5: input setup done
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"blt 152f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x8\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 151f\n"
"150:" // Height 5: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x4\n"
+ "add x26, x26, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x21, x21, #0x10\n"
+ "ldr q29, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x26, x26, #0x4\n"
+ "add x23, x23, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "cmp x26, #0x8\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "cmp x27, #0x8\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "fmla v24.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "fmla v25.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "fmla v26.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "fmla v27.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "fmla v24.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "fmla v25.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "fmla v26.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "fmla v27.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "fmla v24.4s, v6.4s, v4.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "fmla v25.4s, v7.4s, v4.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v18.4s, v29.4s, v2.s[0]\n"
+ "fmla v22.4s, v29.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v26.4s, v29.4s, v4.s[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v2.s[0]\n"
+ "fmla v23.4s, v28.4s, v3.s[0]\n"
+ "fmla v27.4s, v28.4s, v4.s[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ "fmla v8.4s, v29.4s, v0.s[1]\n"
+ "fmla v12.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[1]\n"
+ "fmla v20.4s, v29.4s, v3.s[1]\n"
+ "fmla v24.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ "fmla v9.4s, v28.4s, v0.s[1]\n"
+ "fmla v13.4s, v28.4s, v1.s[1]\n"
+ "fmla v17.4s, v28.4s, v2.s[1]\n"
+ "fmla v21.4s, v28.4s, v3.s[1]\n"
+ "fmla v25.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v14.4s, v29.4s, v1.s[1]\n"
+ "fmla v18.4s, v29.4s, v2.s[1]\n"
+ "fmla v22.4s, v29.4s, v3.s[1]\n"
+ "fmla v26.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[1]\n"
+ "fmla v19.4s, v28.4s, v2.s[1]\n"
+ "fmla v23.4s, v28.4s, v3.s[1]\n"
+ "fmla v27.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v12.4s, v29.4s, v1.s[2]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v20.4s, v29.4s, v3.s[2]\n"
+ "fmla v24.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v13.4s, v28.4s, v1.s[2]\n"
+ "fmla v17.4s, v28.4s, v2.s[2]\n"
+ "fmla v21.4s, v28.4s, v3.s[2]\n"
+ "fmla v25.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ "fmla v10.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v18.4s, v29.4s, v2.s[2]\n"
+ "fmla v22.4s, v29.4s, v3.s[2]\n"
+ "fmla v26.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[2]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v19.4s, v28.4s, v2.s[2]\n"
+ "fmla v23.4s, v28.4s, v3.s[2]\n"
+ "fmla v27.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ "fmla v8.4s, v29.4s, v0.s[3]\n"
+ "fmla v12.4s, v29.4s, v1.s[3]\n"
+ "fmla v16.4s, v29.4s, v2.s[3]\n"
+ "fmla v20.4s, v29.4s, v3.s[3]\n"
+ "fmla v24.4s, v29.4s, v4.s[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ "fmla v9.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[3]\n"
+ "fmla v17.4s, v28.4s, v2.s[3]\n"
+ "fmla v21.4s, v28.4s, v3.s[3]\n"
+ "fmla v25.4s, v28.4s, v4.s[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[3]\n"
+ "fmla v18.4s, v29.4s, v2.s[3]\n"
+ "fmla v22.4s, v29.4s, v3.s[3]\n"
+ "fmla v26.4s, v29.4s, v4.s[3]\n"
"ldr q6, [x10, #0x0]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "ldr q0, [x25, #0x0]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr q1, [x24, #0x0]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "ldr q2, [x23, #0x0]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
- "ldr q3, [x22, #0x0]\n"
- "fmla v27.4s, v7.4s, v4.s[3]\n"
- "ldr q4, [x21, #0x0]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "fmla v15.4s, v28.4s, v1.s[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v19.4s, v28.4s, v2.s[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "fmla v23.4s, v28.4s, v3.s[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "fmla v27.4s, v28.4s, v4.s[3]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 150b\n"
"151:" // Height 5: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x4\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "add x24, x24, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "ldr q29, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x27, x27, #0x4\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x10, #0x40]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x10, #0x50]\n"
- "fmla v8.4s, v6.4s, v0.s[1]\n"
- "fmla v12.4s, v6.4s, v1.s[1]\n"
- "fmla v16.4s, v6.4s, v2.s[1]\n"
- "fmla v20.4s, v6.4s, v3.s[1]\n"
- "fmla v24.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x10, #0x60]\n"
- "fmla v9.4s, v7.4s, v0.s[1]\n"
- "fmla v13.4s, v7.4s, v1.s[1]\n"
- "fmla v17.4s, v7.4s, v2.s[1]\n"
- "fmla v21.4s, v7.4s, v3.s[1]\n"
- "fmla v25.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x10, #0x70]\n"
- "fmla v10.4s, v6.4s, v0.s[1]\n"
- "fmla v14.4s, v6.4s, v1.s[1]\n"
- "fmla v18.4s, v6.4s, v2.s[1]\n"
- "fmla v22.4s, v6.4s, v3.s[1]\n"
- "fmla v26.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x10, #0x80]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v1.s[1]\n"
- "fmla v19.4s, v7.4s, v2.s[1]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "fmla v27.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x10, #0x90]\n"
- "fmla v8.4s, v6.4s, v0.s[2]\n"
- "fmla v12.4s, v6.4s, v1.s[2]\n"
- "fmla v16.4s, v6.4s, v2.s[2]\n"
- "fmla v20.4s, v6.4s, v3.s[2]\n"
- "fmla v24.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x10, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[2]\n"
- "fmla v13.4s, v7.4s, v1.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[2]\n"
- "fmla v21.4s, v7.4s, v3.s[2]\n"
- "fmla v25.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x10, #0xb0]\n"
- "fmla v10.4s, v6.4s, v0.s[2]\n"
- "fmla v14.4s, v6.4s, v1.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[2]\n"
- "fmla v22.4s, v6.4s, v3.s[2]\n"
- "fmla v26.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x10, #0xc0]\n"
- "fmla v11.4s, v7.4s, v0.s[2]\n"
- "fmla v15.4s, v7.4s, v1.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[2]\n"
- "fmla v23.4s, v7.4s, v3.s[2]\n"
- "fmla v27.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x10, #0xd0]\n"
- "fmla v8.4s, v6.4s, v0.s[3]\n"
- "fmla v12.4s, v6.4s, v1.s[3]\n"
- "fmla v16.4s, v6.4s, v2.s[3]\n"
- "fmla v20.4s, v6.4s, v3.s[3]\n"
- "fmla v24.4s, v6.4s, v4.s[3]\n"
- "ldr q6, [x10, #0xe0]\n"
- "fmla v9.4s, v7.4s, v0.s[3]\n"
- "fmla v13.4s, v7.4s, v1.s[3]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "fmla v21.4s, v7.4s, v3.s[3]\n"
- "fmla v25.4s, v7.4s, v4.s[3]\n"
- "ldr q7, [x10, #0xf0]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v10.4s, v29.4s, v0.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v18.4s, v29.4s, v2.s[0]\n"
+ "fmla v22.4s, v29.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v26.4s, v29.4s, v4.s[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ "fmla v11.4s, v28.4s, v0.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v2.s[0]\n"
+ "fmla v23.4s, v28.4s, v3.s[0]\n"
+ "fmla v27.4s, v28.4s, v4.s[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ "fmla v8.4s, v29.4s, v0.s[1]\n"
+ "fmla v12.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[1]\n"
+ "fmla v20.4s, v29.4s, v3.s[1]\n"
+ "fmla v24.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ "fmla v9.4s, v28.4s, v0.s[1]\n"
+ "fmla v13.4s, v28.4s, v1.s[1]\n"
+ "fmla v17.4s, v28.4s, v2.s[1]\n"
+ "fmla v21.4s, v28.4s, v3.s[1]\n"
+ "fmla v25.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ "fmla v10.4s, v29.4s, v0.s[1]\n"
+ "fmla v14.4s, v29.4s, v1.s[1]\n"
+ "fmla v18.4s, v29.4s, v2.s[1]\n"
+ "fmla v22.4s, v29.4s, v3.s[1]\n"
+ "fmla v26.4s, v29.4s, v4.s[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ "fmla v11.4s, v28.4s, v0.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[1]\n"
+ "fmla v19.4s, v28.4s, v2.s[1]\n"
+ "fmla v23.4s, v28.4s, v3.s[1]\n"
+ "fmla v27.4s, v28.4s, v4.s[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ "fmla v8.4s, v29.4s, v0.s[2]\n"
+ "fmla v12.4s, v29.4s, v1.s[2]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v20.4s, v29.4s, v3.s[2]\n"
+ "fmla v24.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ "fmla v9.4s, v28.4s, v0.s[2]\n"
+ "fmla v13.4s, v28.4s, v1.s[2]\n"
+ "fmla v17.4s, v28.4s, v2.s[2]\n"
+ "fmla v21.4s, v28.4s, v3.s[2]\n"
+ "fmla v25.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ "fmla v10.4s, v29.4s, v0.s[2]\n"
+ "fmla v14.4s, v29.4s, v1.s[2]\n"
+ "fmla v18.4s, v29.4s, v2.s[2]\n"
+ "fmla v22.4s, v29.4s, v3.s[2]\n"
+ "fmla v26.4s, v29.4s, v4.s[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ "fmla v11.4s, v28.4s, v0.s[2]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v19.4s, v28.4s, v2.s[2]\n"
+ "fmla v23.4s, v28.4s, v3.s[2]\n"
+ "fmla v27.4s, v28.4s, v4.s[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ "fmla v8.4s, v29.4s, v0.s[3]\n"
+ "fmla v12.4s, v29.4s, v1.s[3]\n"
+ "fmla v16.4s, v29.4s, v2.s[3]\n"
+ "fmla v20.4s, v29.4s, v3.s[3]\n"
+ "fmla v24.4s, v29.4s, v4.s[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ "fmla v9.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[3]\n"
+ "fmla v17.4s, v28.4s, v2.s[3]\n"
+ "fmla v21.4s, v28.4s, v3.s[3]\n"
+ "fmla v25.4s, v28.4s, v4.s[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
"add x10, x10, #0x100\n"
- "fmla v10.4s, v6.4s, v0.s[3]\n"
- "fmla v14.4s, v6.4s, v1.s[3]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v22.4s, v6.4s, v3.s[3]\n"
- "fmla v26.4s, v6.4s, v4.s[3]\n"
- "fmla v11.4s, v7.4s, v0.s[3]\n"
- "fmla v15.4s, v7.4s, v1.s[3]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v23.4s, v7.4s, v3.s[3]\n"
- "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v10.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[3]\n"
+ "fmla v18.4s, v29.4s, v2.s[3]\n"
+ "fmla v22.4s, v29.4s, v3.s[3]\n"
+ "fmla v26.4s, v29.4s, v4.s[3]\n"
+ "fmla v11.4s, v28.4s, v0.s[3]\n"
+ "fmla v15.4s, v28.4s, v1.s[3]\n"
+ "fmla v19.4s, v28.4s, v2.s[3]\n"
+ "fmla v23.4s, v28.4s, v3.s[3]\n"
+ "fmla v27.4s, v28.4s, v4.s[3]\n"
"152:" // Height 5: Multiply loop: Main loop skip
- "cbz x26, 154f\n"
+ "cbz x27, 154f\n"
"153:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x1\n"
- "ldr s1, [x24], #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s4, [x21], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s1, [x25], #0x4\n"
+ "sub x27, x27, #0x1\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q29, [x10, #0x0]\n"
+ "fmla v8.4s, v29.4s, v2.s[0]\n"
+ "fmla v12.4s, v29.4s, v1.s[0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ "fmla v16.4s, v29.4s, v0.s[0]\n"
+ "fmla v20.4s, v29.4s, v31.s[0]\n"
+ "fmla v24.4s, v29.4s, v30.s[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ "fmla v9.4s, v28.4s, v2.s[0]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v17.4s, v28.4s, v0.s[0]\n"
+ "fmla v21.4s, v28.4s, v31.s[0]\n"
+ "fmla v25.4s, v28.4s, v30.s[0]\n"
+ "ldr q28, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "cbnz x26, 153b\n"
+ "fmla v10.4s, v29.4s, v2.s[0]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v18.4s, v29.4s, v0.s[0]\n"
+ "fmla v22.4s, v29.4s, v31.s[0]\n"
+ "fmla v26.4s, v29.4s, v30.s[0]\n"
+ "fmla v11.4s, v28.4s, v2.s[0]\n"
+ "fmla v15.4s, v28.4s, v1.s[0]\n"
+ "fmla v19.4s, v28.4s, v0.s[0]\n"
+ "fmla v23.4s, v28.4s, v31.s[0]\n"
+ "fmla v27.4s, v28.4s, v30.s[0]\n"
+ "cbnz x27, 153b\n"
"154:" // Height 5: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 147b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 155f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
- "fmax v16.4s, v16.4s, v1.4s\n"
- "fmax v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmin v20.4s, v20.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v1.4s\n"
- "fmax v19.4s, v19.4s, v1.4s\n"
- "fmax v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v0.4s\n"
- "fmin v22.4s, v22.4s, v0.4s\n"
- "fmin v23.4s, v23.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v1.4s\n"
- "fmax v22.4s, v22.4s, v1.4s\n"
- "fmax v23.4s, v23.4s, v1.4s\n"
- "fmin v24.4s, v24.4s, v0.4s\n"
- "fmin v25.4s, v25.4s, v0.4s\n"
- "fmin v26.4s, v26.4s, v0.4s\n"
- "fmax v24.4s, v24.4s, v1.4s\n"
- "fmax v25.4s, v25.4s, v1.4s\n"
- "fmax v26.4s, v26.4s, v1.4s\n"
- "fmin v27.4s, v27.4s, v0.4s\n"
- "fmax v27.4s, v27.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v29.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v29.4s\n"
+ "fmin v9.4s, v9.4s, v29.4s\n"
+ "fmin v10.4s, v10.4s, v29.4s\n"
+ "fmin v11.4s, v11.4s, v29.4s\n"
+ "fmin v12.4s, v12.4s, v29.4s\n"
+ "fmin v13.4s, v13.4s, v29.4s\n"
+ "fmin v14.4s, v14.4s, v29.4s\n"
+ "fmin v15.4s, v15.4s, v29.4s\n"
+ "fmin v16.4s, v16.4s, v29.4s\n"
+ "fmin v17.4s, v17.4s, v29.4s\n"
+ "fmin v18.4s, v18.4s, v29.4s\n"
+ "fmin v19.4s, v19.4s, v29.4s\n"
+ "fmin v20.4s, v20.4s, v29.4s\n"
+ "fmin v21.4s, v21.4s, v29.4s\n"
+ "fmin v22.4s, v22.4s, v29.4s\n"
+ "fmin v23.4s, v23.4s, v29.4s\n"
+ "fmin v24.4s, v24.4s, v29.4s\n"
+ "fmin v25.4s, v25.4s, v29.4s\n"
+ "fmin v26.4s, v26.4s, v29.4s\n"
+ "fmin v27.4s, v27.4s, v29.4s\n"
+ "fmax v8.4s, v8.4s, v28.4s\n"
+ "fmax v9.4s, v9.4s, v28.4s\n"
+ "fmax v10.4s, v10.4s, v28.4s\n"
+ "fmax v11.4s, v11.4s, v28.4s\n"
+ "fmax v12.4s, v12.4s, v28.4s\n"
+ "fmax v13.4s, v13.4s, v28.4s\n"
+ "fmax v14.4s, v14.4s, v28.4s\n"
+ "fmax v15.4s, v15.4s, v28.4s\n"
+ "fmax v16.4s, v16.4s, v28.4s\n"
+ "fmax v17.4s, v17.4s, v28.4s\n"
+ "fmax v18.4s, v18.4s, v28.4s\n"
+ "fmax v19.4s, v19.4s, v28.4s\n"
+ "fmax v20.4s, v20.4s, v28.4s\n"
+ "fmax v21.4s, v21.4s, v28.4s\n"
+ "fmax v22.4s, v22.4s, v28.4s\n"
+ "fmax v23.4s, v23.4s, v28.4s\n"
+ "fmax v24.4s, v24.4s, v28.4s\n"
+ "fmax v25.4s, v25.4s, v28.4s\n"
+ "fmax v26.4s, v26.4s, v28.4s\n"
+ "fmax v27.4s, v27.4s, v28.4s\n"
"155:" // Height 5: No activation
"cmp x11, #0x10\n"
"bge 164f\n"
"tbz x11, #3, 159f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v17.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v21.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
- "st1 { v25.4s }, [x21], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
"tbz x11, #2, 157f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v18.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x22], #0x10\n"
- "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
"tbz x11, #1, 156f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
- "str d27, [x21], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
"tbz x11, #0, 163f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
- "st1 { v23.s }[2], [x22]\n"
- "st1 { v27.s }[2], [x21]\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
"b 163f\n"
"156:" // Height 5: Partial direct writeback: partial_1_12
"tbz x11, #0, 163f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
- "str s23, [x22, #0x0]\n"
- "str s27, [x21, #0x0]\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
"b 163f\n"
"157:" // Height 5: Partial direct writeback: partial_2_8
"tbz x11, #1, 158f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
- "str d26, [x21], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
"tbz x11, #0, 163f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v18.s }[2], [x23]\n"
- "st1 { v22.s }[2], [x22]\n"
- "st1 { v26.s }[2], [x21]\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
"b 163f\n"
"158:" // Height 5: Partial direct writeback: partial_1_8
"tbz x11, #0, 163f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s18, [x23, #0x0]\n"
- "str s22, [x22, #0x0]\n"
- "str s26, [x21, #0x0]\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
"b 163f\n"
"159:" // Height 5: Partial direct writeback: partial_4_0
"tbz x11, #2, 161f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
"tbz x11, #1, 160f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
- "str d25, [x21], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
"tbz x11, #0, 163f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v17.s }[2], [x23]\n"
- "st1 { v21.s }[2], [x22]\n"
- "st1 { v25.s }[2], [x21]\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
"b 163f\n"
"160:" // Height 5: Partial direct writeback: partial_1_4
"tbz x11, #0, 163f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s17, [x23, #0x0]\n"
- "str s21, [x22, #0x0]\n"
- "str s25, [x21, #0x0]\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
"b 163f\n"
"161:" // Height 5: Partial direct writeback: partial_2_0
"tbz x11, #1, 162f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x11, #0, 163f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v16.s }[2], [x23]\n"
- "st1 { v20.s }[2], [x22]\n"
- "st1 { v24.s }[2], [x21]\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
"b 163f\n"
"162:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s16, [x23, #0x0]\n"
- "str s20, [x22, #0x0]\n"
- "str s24, [x21, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
"163:" // Height 5: Partial direct writeback: Done
"b 165f\n"
"164:" // Height 5: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
- "str q24, [x21, #0x0]\n"
- "str q25, [x21, #0x10]\n"
- "str q26, [x21, #0x20]\n"
- "str q27, [x21, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
"165:" // Height 5: Writeback done
"subs x11, x11, #0x10\n"
"bgt 134b\n"
"b 200f\n"
"166:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x19, #0x18\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"167:" // Height 6: Column loop
- "cbz x9, 168f\n"
- "ldr q8, [x9, #0x0]\n"
+ "cbz x12, 168f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x9, #0x10]\n"
- "mov v16.16b, v8.16b\n"
- "ldr q10, [x9, #0x20]\n"
- "mov v20.16b, v8.16b\n"
- "ldr q11, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "mov v24.16b, v8.16b\n"
- "mov v28.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
"mov v17.16b, v9.16b\n"
+ "add x12, x12, #0x40\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
+ "mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
+ "mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
"mov v27.16b, v11.16b\n"
+ "mov v28.16b, v8.16b\n"
"mov v29.16b, v9.16b\n"
"mov v30.16b, v10.16b\n"
"mov v31.16b, v11.16b\n"
"b 179f\n"
"168:" // Height 6: no bias
"tbz %x[flags], #0, 178f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"cmp x11, #0x10\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 177f\n"
"tbz x11, #3, 172f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v24.4s }, [x21], #0x10\n"
- "ld1 { v28.4s }, [x20], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x24], #0x10\n"
- "ld1 { v17.4s }, [x23], #0x10\n"
- "ld1 { v21.4s }, [x22], #0x10\n"
- "ld1 { v25.4s }, [x21], #0x10\n"
- "ld1 { v29.4s }, [x20], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
"tbz x11, #2, 170f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x24], #0x10\n"
- "ld1 { v18.4s }, [x23], #0x10\n"
- "ld1 { v22.4s }, [x22], #0x10\n"
- "ld1 { v26.4s }, [x21], #0x10\n"
- "ld1 { v30.4s }, [x20], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
"tbz x11, #1, 169f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x24], #0x8\n"
- "ldr d19, [x23], #0x8\n"
- "ldr d23, [x22], #0x8\n"
- "ldr d27, [x21], #0x8\n"
- "ldr d31, [x20], #0x8\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x11, #0, 176f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x24]\n"
- "ld1 { v19.s }[2], [x23]\n"
- "ld1 { v23.s }[2], [x22]\n"
- "ld1 { v27.s }[2], [x21]\n"
- "ld1 { v31.s }[2], [x20]\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 176f\n"
"169:" // Height 6: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
+ "mov x20, #0x30\n"
"tbz x11, #0, 176f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x24, #0x0]\n"
- "ldr s19, [x23, #0x0]\n"
- "ldr s23, [x22, #0x0]\n"
- "ldr s27, [x21, #0x0]\n"
- "ldr s31, [x20, #0x0]\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"b 176f\n"
"170:" // Height 6: Partial accumulate: partial_2_8
"tbz x11, #1, 171f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x24], #0x8\n"
- "mov x19, #0x28\n"
- "ldr d18, [x23], #0x8\n"
- "ldr d22, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
- "ldr d30, [x20], #0x8\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
"tbz x11, #0, 176f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x24]\n"
- "ld1 { v18.s }[2], [x23]\n"
- "ld1 { v22.s }[2], [x22]\n"
- "ld1 { v26.s }[2], [x21]\n"
- "ld1 { v30.s }[2], [x20]\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
"b 176f\n"
"171:" // Height 6: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
+ "mov x20, #0x20\n"
"tbz x11, #0, 176f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x24, #0x0]\n"
- "ldr s18, [x23, #0x0]\n"
- "ldr s22, [x22, #0x0]\n"
- "ldr s26, [x21, #0x0]\n"
- "ldr s30, [x20, #0x0]\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
"b 176f\n"
"172:" // Height 6: Partial accumulate: partial_4_0
"tbz x11, #2, 174f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x24], #0x10\n"
- "ld1 { v16.4s }, [x23], #0x10\n"
- "ld1 { v20.4s }, [x22], #0x10\n"
- "ld1 { v24.4s }, [x21], #0x10\n"
- "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
"tbz x11, #1, 173f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x24], #0x8\n"
- "ldr d17, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
- "ldr d25, [x21], #0x8\n"
- "ldr d29, [x20], #0x8\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
"tbz x11, #0, 176f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x24]\n"
- "ld1 { v17.s }[2], [x23]\n"
- "ld1 { v21.s }[2], [x22]\n"
- "ld1 { v25.s }[2], [x21]\n"
- "ld1 { v29.s }[2], [x20]\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
"b 176f\n"
"173:" // Height 6: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
+ "mov x20, #0x10\n"
"tbz x11, #0, 176f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x24, #0x0]\n"
- "ldr s17, [x23, #0x0]\n"
- "ldr s21, [x22, #0x0]\n"
- "ldr s25, [x21, #0x0]\n"
- "ldr s29, [x20, #0x0]\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
"b 176f\n"
"174:" // Height 6: Partial accumulate: partial_2_0
"tbz x11, #1, 175f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x24], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d16, [x23], #0x8\n"
- "ldr d20, [x22], #0x8\n"
- "ldr d24, [x21], #0x8\n"
- "ldr d28, [x20], #0x8\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz x11, #0, 176f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x24]\n"
- "ld1 { v16.s }[2], [x23]\n"
- "ld1 { v20.s }[2], [x22]\n"
- "ld1 { v24.s }[2], [x21]\n"
- "ld1 { v28.s }[2], [x20]\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
"b 176f\n"
"175:" // Height 6: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s12, [x24, #0x0]\n"
- "ldr s16, [x23, #0x0]\n"
- "ldr s20, [x22, #0x0]\n"
- "ldr s24, [x21, #0x0]\n"
- "ldr s28, [x20, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
"176:" // Height 6: Partial accumulate: Done
- "sub x28, x28, x19\n"
+ "sub x9, x9, x20\n"
"b 179f\n"
"177:" // Height 6: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x24, #0x0]\n"
- "ldr q13, [x24, #0x10]\n"
- "ldr q14, [x24, #0x20]\n"
- "ldr q15, [x24, #0x30]\n"
- "ldr q16, [x23, #0x0]\n"
- "ldr q17, [x23, #0x10]\n"
- "ldr q18, [x23, #0x20]\n"
- "ldr q19, [x23, #0x30]\n"
- "ldr q20, [x22, #0x0]\n"
- "ldr q21, [x22, #0x10]\n"
- "ldr q22, [x22, #0x20]\n"
- "ldr q23, [x22, #0x30]\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "ldr q28, [x20, #0x0]\n"
- "ldr q29, [x20, #0x10]\n"
- "ldr q30, [x20, #0x20]\n"
- "ldr q31, [x20, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
"b 179f\n"
"178:" // Height 6: no accumulate
"movi v8.16b, #0x0\n"
@@ -2767,82 +2766,82 @@ void a64_hybrid_fp32_mla_6x16 (
"movi v30.16b, #0x0\n"
"movi v31.16b, #0x0\n"
"179:" // Height 6: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"180:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 181f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x27, 182f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "add x20, x20, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 182f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
"b 182f\n"
"181:" // Height 6: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"182:" // Height 6: input setup done
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"blt 185f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x8\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
"ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 184f\n"
"183:" // Height 6: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "add x25, x25, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x4\n"
+ "add x26, x26, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x21, x21, #0x10\n"
"fmla v28.4s, v6.4s, v5.s[0]\n"
"ldr q6, [x10, #0x20]\n"
- "add x20, x20, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "sub x26, x26, #0x4\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "cmp x26, #0x8\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
+ "cmp x27, #0x8\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
"fmla v29.4s, v7.4s, v5.s[0]\n"
"ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla v26.4s, v6.4s, v4.s[0]\n"
"fmla v30.4s, v6.4s, v5.s[0]\n"
"ldr q6, [x10, #0x40]\n"
@@ -2932,51 +2931,51 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v30.4s, v6.4s, v5.s[3]\n"
"ldr q6, [x10, #0x0]\n"
"fmla v11.4s, v7.4s, v0.s[3]\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
"fmla v15.4s, v7.4s, v1.s[3]\n"
- "ldr q1, [x24, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
"fmla v19.4s, v7.4s, v2.s[3]\n"
- "ldr q2, [x23, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
"fmla v23.4s, v7.4s, v3.s[3]\n"
- "ldr q3, [x22, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
"fmla v27.4s, v7.4s, v4.s[3]\n"
- "ldr q4, [x21, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
"fmla v31.4s, v7.4s, v5.s[3]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 183b\n"
"184:" // Height 6: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "sub x26, x26, #0x4\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "add x24, x24, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
"fmla v28.4s, v6.4s, v5.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x21, x21, #0x10\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
"ldr q6, [x10, #0x20]\n"
- "add x20, x20, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x4\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
"fmla v29.4s, v7.4s, v5.s[0]\n"
"ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla v26.4s, v6.4s, v4.s[0]\n"
"fmla v30.4s, v6.4s, v5.s[0]\n"
"ldr q6, [x10, #0x40]\n"
@@ -3071,289 +3070,288 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v27.4s, v7.4s, v4.s[3]\n"
"fmla v31.4s, v7.4s, v5.s[3]\n"
"185:" // Height 6: Multiply loop: Main loop skip
- "cbz x26, 187f\n"
+ "cbz x27, 187f\n"
"186:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x1\n"
- "ldr s1, [x24], #0x4\n"
- "ldr s2, [x23], #0x4\n"
+ "ldr s7, [x26], #0x4\n"
+ "ldr s6, [x25], #0x4\n"
+ "sub x27, x27, #0x1\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"ldr s3, [x22], #0x4\n"
- "ldr s4, [x21], #0x4\n"
- "ldr s5, [x20], #0x4\n"
- "ldr q6, [x10, #0x0]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x10, #0x10]\n"
- "fmla v12.4s, v6.4s, v1.s[0]\n"
- "fmla v16.4s, v6.4s, v2.s[0]\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "fmla v24.4s, v6.4s, v4.s[0]\n"
- "fmla v28.4s, v6.4s, v5.s[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
- "fmla v21.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v4.s[0]\n"
- "fmla v29.4s, v7.4s, v5.s[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "fmla v8.4s, v1.4s, v7.s[0]\n"
+ "fmla v12.4s, v1.4s, v6.s[0]\n"
+ "fmla v16.4s, v1.4s, v5.s[0]\n"
+ "fmla v20.4s, v1.4s, v4.s[0]\n"
+ "fmla v24.4s, v1.4s, v3.s[0]\n"
+ "fmla v28.4s, v1.4s, v2.s[0]\n"
+ "ldr q1, [x10, #0x20]\n"
+ "fmla v9.4s, v0.4s, v7.s[0]\n"
+ "fmla v13.4s, v0.4s, v6.s[0]\n"
+ "fmla v17.4s, v0.4s, v5.s[0]\n"
+ "fmla v21.4s, v0.4s, v4.s[0]\n"
+ "fmla v25.4s, v0.4s, v3.s[0]\n"
+ "fmla v29.4s, v0.4s, v2.s[0]\n"
+ "ldr q0, [x10, #0x30]\n"
"add x10, x10, #0x40\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v14.4s, v6.4s, v1.s[0]\n"
- "fmla v18.4s, v6.4s, v2.s[0]\n"
- "fmla v22.4s, v6.4s, v3.s[0]\n"
- "fmla v26.4s, v6.4s, v4.s[0]\n"
- "fmla v30.4s, v6.4s, v5.s[0]\n"
- "fmla v11.4s, v7.4s, v0.s[0]\n"
- "fmla v15.4s, v7.4s, v1.s[0]\n"
- "fmla v19.4s, v7.4s, v2.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[0]\n"
- "fmla v27.4s, v7.4s, v4.s[0]\n"
- "fmla v31.4s, v7.4s, v5.s[0]\n"
- "cbnz x26, 186b\n"
+ "fmla v10.4s, v1.4s, v7.s[0]\n"
+ "fmla v14.4s, v1.4s, v6.s[0]\n"
+ "fmla v18.4s, v1.4s, v5.s[0]\n"
+ "fmla v22.4s, v1.4s, v4.s[0]\n"
+ "fmla v26.4s, v1.4s, v3.s[0]\n"
+ "fmla v30.4s, v1.4s, v2.s[0]\n"
+ "fmla v11.4s, v0.4s, v7.s[0]\n"
+ "fmla v15.4s, v0.4s, v6.s[0]\n"
+ "fmla v19.4s, v0.4s, v5.s[0]\n"
+ "fmla v23.4s, v0.4s, v4.s[0]\n"
+ "fmla v27.4s, v0.4s, v3.s[0]\n"
+ "fmla v31.4s, v0.4s, v2.s[0]\n"
+ "cbnz x27, 186b\n"
"187:" // Height 6: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 180b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x24, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19, LSL #2\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"tbz %x[flags], #1, 188f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v1.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v0.4s }, [x19]\n"
- "fmin v8.4s, v8.4s, v0.4s\n"
- "fmin v9.4s, v9.4s, v0.4s\n"
- "fmin v10.4s, v10.4s, v0.4s\n"
- "fmin v11.4s, v11.4s, v0.4s\n"
- "fmax v8.4s, v8.4s, v1.4s\n"
- "fmax v9.4s, v9.4s, v1.4s\n"
- "fmax v10.4s, v10.4s, v1.4s\n"
- "fmax v11.4s, v11.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v0.4s\n"
- "fmin v13.4s, v13.4s, v0.4s\n"
- "fmin v14.4s, v14.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v1.4s\n"
- "fmax v13.4s, v13.4s, v1.4s\n"
- "fmax v14.4s, v14.4s, v1.4s\n"
- "fmin v15.4s, v15.4s, v0.4s\n"
- "fmin v16.4s, v16.4s, v0.4s\n"
- "fmin v17.4s, v17.4s, v0.4s\n"
- "fmax v15.4s, v15.4s, v1.4s\n"
- "fmax v16.4s, v16.4s, v1.4s\n"
- "fmax v17.4s, v17.4s, v1.4s\n"
- "fmin v18.4s, v18.4s, v0.4s\n"
- "fmin v19.4s, v19.4s, v0.4s\n"
- "fmin v20.4s, v20.4s, v0.4s\n"
- "fmax v18.4s, v18.4s, v1.4s\n"
- "fmax v19.4s, v19.4s, v1.4s\n"
- "fmax v20.4s, v20.4s, v1.4s\n"
- "fmin v21.4s, v21.4s, v0.4s\n"
- "fmin v22.4s, v22.4s, v0.4s\n"
- "fmin v23.4s, v23.4s, v0.4s\n"
- "fmax v21.4s, v21.4s, v1.4s\n"
- "fmax v22.4s, v22.4s, v1.4s\n"
- "fmax v23.4s, v23.4s, v1.4s\n"
- "fmin v24.4s, v24.4s, v0.4s\n"
- "fmin v25.4s, v25.4s, v0.4s\n"
- "fmin v26.4s, v26.4s, v0.4s\n"
- "fmax v24.4s, v24.4s, v1.4s\n"
- "fmax v25.4s, v25.4s, v1.4s\n"
- "fmax v26.4s, v26.4s, v1.4s\n"
- "fmin v27.4s, v27.4s, v0.4s\n"
- "fmin v28.4s, v28.4s, v0.4s\n"
- "fmin v29.4s, v29.4s, v0.4s\n"
- "fmax v27.4s, v27.4s, v1.4s\n"
- "fmax v28.4s, v28.4s, v1.4s\n"
- "fmax v29.4s, v29.4s, v1.4s\n"
- "fmin v30.4s, v30.4s, v0.4s\n"
- "fmin v31.4s, v31.4s, v0.4s\n"
- "fmax v30.4s, v30.4s, v1.4s\n"
- "fmax v31.4s, v31.4s, v1.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "fmin v31.4s, v31.4s, v1.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v0.4s\n"
+ "fmax v29.4s, v29.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v0.4s\n"
+ "fmax v31.4s, v31.4s, v0.4s\n"
"188:" // Height 6: No activation
"cmp x11, #0x10\n"
"bge 197f\n"
"tbz x11, #3, 192f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v13.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v17.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v21.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
- "st1 { v25.4s }, [x21], #0x10\n"
- "st1 { v28.4s }, [x20], #0x10\n"
- "st1 { v29.4s }, [x20], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
"tbz x11, #2, 190f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x24], #0x10\n"
- "st1 { v18.4s }, [x23], #0x10\n"
- "st1 { v22.4s }, [x22], #0x10\n"
- "st1 { v26.4s }, [x21], #0x10\n"
- "st1 { v30.4s }, [x20], #0x10\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
"tbz x11, #1, 189f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x24], #0x8\n"
- "str d19, [x23], #0x8\n"
- "str d23, [x22], #0x8\n"
- "str d27, [x21], #0x8\n"
- "str d31, [x20], #0x8\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x11, #0, 196f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x24]\n"
- "st1 { v19.s }[2], [x23]\n"
- "st1 { v23.s }[2], [x22]\n"
- "st1 { v27.s }[2], [x21]\n"
- "st1 { v31.s }[2], [x20]\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
"b 196f\n"
"189:" // Height 6: Partial direct writeback: partial_1_12
"tbz x11, #0, 196f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x24, #0x0]\n"
- "str s19, [x23, #0x0]\n"
- "str s23, [x22, #0x0]\n"
- "str s27, [x21, #0x0]\n"
- "str s31, [x20, #0x0]\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
"b 196f\n"
"190:" // Height 6: Partial direct writeback: partial_2_8
"tbz x11, #1, 191f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x24], #0x8\n"
- "str d18, [x23], #0x8\n"
- "str d22, [x22], #0x8\n"
- "str d26, [x21], #0x8\n"
- "str d30, [x20], #0x8\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
"tbz x11, #0, 196f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x24]\n"
- "st1 { v18.s }[2], [x23]\n"
- "st1 { v22.s }[2], [x22]\n"
- "st1 { v26.s }[2], [x21]\n"
- "st1 { v30.s }[2], [x20]\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
"b 196f\n"
"191:" // Height 6: Partial direct writeback: partial_1_8
"tbz x11, #0, 196f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x24, #0x0]\n"
- "str s18, [x23, #0x0]\n"
- "str s22, [x22, #0x0]\n"
- "str s26, [x21, #0x0]\n"
- "str s30, [x20, #0x0]\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
"b 196f\n"
"192:" // Height 6: Partial direct writeback: partial_4_0
"tbz x11, #2, 194f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x24], #0x10\n"
- "st1 { v16.4s }, [x23], #0x10\n"
- "st1 { v20.4s }, [x22], #0x10\n"
- "st1 { v24.4s }, [x21], #0x10\n"
- "st1 { v28.4s }, [x20], #0x10\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
"tbz x11, #1, 193f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x24], #0x8\n"
- "str d17, [x23], #0x8\n"
- "str d21, [x22], #0x8\n"
- "str d25, [x21], #0x8\n"
- "str d29, [x20], #0x8\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
"tbz x11, #0, 196f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x24]\n"
- "st1 { v17.s }[2], [x23]\n"
- "st1 { v21.s }[2], [x22]\n"
- "st1 { v25.s }[2], [x21]\n"
- "st1 { v29.s }[2], [x20]\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
"b 196f\n"
"193:" // Height 6: Partial direct writeback: partial_1_4
"tbz x11, #0, 196f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x24, #0x0]\n"
- "str s17, [x23, #0x0]\n"
- "str s21, [x22, #0x0]\n"
- "str s25, [x21, #0x0]\n"
- "str s29, [x20, #0x0]\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
"b 196f\n"
"194:" // Height 6: Partial direct writeback: partial_2_0
"tbz x11, #1, 195f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x24], #0x8\n"
- "str d16, [x23], #0x8\n"
- "str d20, [x22], #0x8\n"
- "str d24, [x21], #0x8\n"
- "str d28, [x20], #0x8\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x11, #0, 196f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x24]\n"
- "st1 { v16.s }[2], [x23]\n"
- "st1 { v20.s }[2], [x22]\n"
- "st1 { v24.s }[2], [x21]\n"
- "st1 { v28.s }[2], [x20]\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
"b 196f\n"
"195:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x24, #0x0]\n"
- "str s16, [x23, #0x0]\n"
- "str s20, [x22, #0x0]\n"
- "str s24, [x21, #0x0]\n"
- "str s28, [x20, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
"196:" // Height 6: Partial direct writeback: Done
"b 198f\n"
"197:" // Height 6: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x24, #0x0]\n"
- "str q13, [x24, #0x10]\n"
- "str q14, [x24, #0x20]\n"
- "str q15, [x24, #0x30]\n"
- "str q16, [x23, #0x0]\n"
- "str q17, [x23, #0x10]\n"
- "str q18, [x23, #0x20]\n"
- "str q19, [x23, #0x30]\n"
- "str q20, [x22, #0x0]\n"
- "str q21, [x22, #0x10]\n"
- "str q22, [x22, #0x20]\n"
- "str q23, [x22, #0x30]\n"
- "str q24, [x21, #0x0]\n"
- "str q25, [x21, #0x10]\n"
- "str q26, [x21, #0x20]\n"
- "str q27, [x21, #0x30]\n"
- "str q28, [x20, #0x0]\n"
- "str q29, [x20, #0x10]\n"
- "str q30, [x20, #0x20]\n"
- "str q31, [x20, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
"198:" // Height 6: Writeback done
"subs x11, x11, #0x10\n"
"bgt 167b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 200f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 199f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"199:" // Update direct input
- "mov x19, #0x18\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x18\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"200:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
index 957754ad68..3ec02395d1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
@@ -43,7 +43,8 @@ void a64_hybrid_fp32_mla_8x4_a55( ARGLIST );
class cls_a64_hybrid_fp32_mla_8x4
{
public:
- typedef float operand_type;
+ typedef float lhs_operand_type;
+ typedef float rhs_operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -69,7 +70,7 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 8, 4, 1> transforms = {};
// Default to the generic kernel
kern_type kernel=a64_hybrid_fp32_mla_8x4;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
index 99920002b2..236865315e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_8x4_a55 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x8\n"
"bge 148f\n"
@@ -105,563 +104,563 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"cmp %x[M], #0x2\n"
"bgt 43f\n"
"beq 22f\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[bias]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "cbz x15, 3f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 3f\n"
+ "ldr q24, [x3, #0x0]\n"
+ "add x3, x3, #0x10\n"
"b 8f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 7f\n"
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 6f\n"
- "tbz x17, #1, 4f\n"
- "ldr d24, [x14], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x17, #0, 5f\n"
- "ld1 { v24.s }[2], [x14]\n"
+ "tbz x4, #1, 4f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "tbz x4, #0, 5f\n"
+ "ld1 { v24.s }[2], [x6]\n"
"b 5f\n"
"4:" // Height 1: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
"5:" // Height 1: Partial accumulate: Done
- "sub x14, x14, x19\n"
+ "sub x6, x6, x26\n"
"b 8f\n"
"6:" // Height 1: full accumulate
- "ldr q24, [x14, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
"b 8f\n"
"7:" // Height 1: no accumulate
"movi v24.16b, #0x0\n"
"8:" // Height 1: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"9:" // Height 1: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 10f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "cbnz x13, 11f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x19, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "cbnz x7, 11f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
"b 11f\n"
"10:" // Height 1: setup direct input
- "mov x11, %x[input_ptr]\n"
+ "mov x17, %x[input_ptr]\n"
"11:" // Height 1: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 14f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
- "cmp x12, #0x8\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 13f\n"
"12:" // Height 1: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
- "ldr x19, [x16, #0x18]\n"
- "add x11, x11, #0x10\n"
- "ldr d10, [x16, #0x20]\n"
- "sub x12, x12, #0x4\n"
- "ldr x21, [x16, #0x28]\n"
- "cmp x12, #0x8\n"
- "mov v9.d[1], x19\n"
- "ldr d11, [x16, #0x30]\n"
- "ldr x19, [x16, #0x38]\n"
- "add x16, x16, #0x40\n"
+ "add x17, x17, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "mov v10.d[1], x21\n"
- "prfm pldl1keep, [x11, #0x80]\n"
- "mov v11.d[1], x19\n"
- "ldr d8, [x16, #0x0]\n"
- "ldr x26, [x16, #0x8]\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr x10, [x11, #0x8]\n"
- "mov v8.d[1], x26\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
- "mov v0.d[1], x10\n"
+ "ldr d0, [x17, #0x0]\n"
+ "sub x8, x8, #0x4\n"
+ "ldr d10, [x5, #0x20]\n"
+ "cmp x8, #0x8\n"
+ "ldr d11, [x5, #0x30]\n"
+ "ldr x26, [x5, #0x8]\n"
+ "mov v8.d[1], x26\n"
+ "ldr x26, [x5, #0x18]\n"
+ "mov v9.d[1], x26\n"
+ "ldr x26, [x17, #0x8]\n"
+ "mov v0.d[1], x26\n"
+ "ldr x26, [x5, #0x28]\n"
+ "mov v10.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"bge 12b\n"
"13:" // Height 1: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
- "ldr q10, [x16, #0x20]\n"
- "sub x12, x12, #0x4\n"
- "ldr q11, [x16, #0x30]\n"
- "add x11, x11, #0x10\n"
- "prfm pldl1keep, [x11, #0x80]\n"
- "add x16, x16, #0x40\n"
+ "add x17, x17, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
+ "sub x8, x8, #0x4\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
+ "add x5, x5, #0x40\n"
"14:" // Height 1: Multiply loop: Main loop skip
- "cbz x12, 16f\n"
+ "cbz x8, 16f\n"
"15:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "cbnz x12, 15b\n"
+ "ldr s17, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v17.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "cbnz x8, 15b\n"
"16:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x19\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 9b\n"
- "prfm pstl1keep, [x14, #0x0]\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
"tbz %x[flags], #1, 17f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
"17:" // Height 1: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 20f\n"
- "tbz x17, #1, 18f\n"
- "str d24, [x14], #0x8\n"
- "tbz x17, #0, 19f\n"
- "st1 { v24.s }[2], [x14]\n"
+ "tbz x4, #1, 18f\n"
+ "str d24, [x6], #0x8\n"
+ "tbz x4, #0, 19f\n"
+ "st1 { v24.s }[2], [x6]\n"
"b 19f\n"
"18:" // Height 1: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
"19:" // Height 1: Partial direct writeback: Done
"b 21f\n"
"20:" // Height 1: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
"21:" // Height 1: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 2b\n"
"b 170f\n"
"22:" // Height 2
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"23:" // Height 2: Column loop
- "cbz x15, 24f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 24f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"b 29f\n"
"24:" // Height 2: no bias
"tbz %x[flags], #0, 28f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x19, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x4, #0x4\n"
+ "add x13, x6, x26, LSL #2\n"
"bge 27f\n"
- "tbz x17, #1, 25f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x17, #0, 26f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
+ "tbz x4, #1, 25f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "tbz x4, #0, 26f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
"b 26f\n"
"25:" // Height 2: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
"26:" // Height 2: Partial accumulate: Done
- "sub x14, x14, x19\n"
+ "sub x6, x6, x26\n"
"b 29f\n"
"27:" // Height 2: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
"b 29f\n"
"28:" // Height 2: no accumulate
"movi v24.16b, #0x0\n"
"movi v25.16b, #0x0\n"
"29:" // Height 2: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"30:" // Height 2: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "cbnz x13, 32f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x19, LSL #2\n"
- "add x9, x9, x19, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "cbnz x7, 32f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
"b 32f\n"
"31:" // Height 2: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x19, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
"32:" // Height 2: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 35f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 34f\n"
"33:" // Height 2: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x19, [x16, #0x18]\n"
- "ldr d10, [x16, #0x20]\n"
- "add x11, x11, #0x10\n"
- "ldr x21, [x16, #0x28]\n"
- "add x9, x9, #0x10\n"
- "mov v9.d[1], x19\n"
- "ldr d11, [x16, #0x30]\n"
- "ldr x19, [x16, #0x38]\n"
- "sub x12, x12, #0x4\n"
+ "add x16, x16, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "mov v10.d[1], x21\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
- "mov v11.d[1], x19\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "ldr x10, [x11, #0x8]\n"
- "cmp x12, #0x8\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr x28, [x9, #0x8]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "add x16, x16, #0x40\n"
- "ldr d8, [x16, #0x0]\n"
- "ldr x26, [x16, #0x8]\n"
+ "ldr d10, [x5, #0x20]\n"
+ "ldr x27, [x5, #0x8]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
- "mov v8.d[1], x26\n"
- "mov v0.d[1], x10\n"
- "mov v1.d[1], x28\n"
+ "ldr d1, [x16, #0x0]\n"
+ "sub x8, x8, #0x4\n"
+ "ldr d11, [x5, #0x30]\n"
+ "cmp x8, #0x8\n"
+ "ldr x26, [x5, #0x18]\n"
+ "mov v8.d[1], x27\n"
+ "ldr x27, [x17, #0x8]\n"
+ "mov v9.d[1], x26\n"
+ "ldr x26, [x16, #0x8]\n"
+ "mov v0.d[1], x27\n"
+ "ldr x27, [x5, #0x28]\n"
+ "mov v1.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v10.d[1], x27\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"bge 33b\n"
"34:" // Height 2: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
- "ldr q11, [x16, #0x30]\n"
- "sub x12, x12, #0x4\n"
- "add x11, x11, #0x10\n"
- "add x9, x9, #0x10\n"
+ "add x16, x16, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "add x16, x16, #0x40\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "add x5, x5, #0x40\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"35:" // Height 2: Multiply loop: Main loop skip
- "cbz x12, 37f\n"
+ "cbz x8, 37f\n"
"36:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "cbnz x12, 36b\n"
+ "ldr s18, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s17, [x16], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v18.s[0]\n"
+ "fmla v25.4s, v16.4s, v17.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "cbnz x8, 36b\n"
"37:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x19\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 30b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x19, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
"tbz %x[flags], #1, 38f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x19]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
"38:" // Height 2: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 41f\n"
- "tbz x17, #1, 39f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "tbz x17, #0, 40f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
+ "tbz x4, #1, 39f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "tbz x4, #0, 40f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
"b 40f\n"
"39:" // Height 2: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
"40:" // Height 2: Partial direct writeback: Done
"b 42f\n"
"41:" // Height 2: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
"42:" // Height 2: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 23b\n"
"b 170f\n"
"43:" // Height 3
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"44:" // Height 3: Column loop
- "cbz x15, 45f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 45f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"b 50f\n"
"45:" // Height 3: no bias
"tbz %x[flags], #0, 49f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x19, LSL #2\n"
- "add x26, x27, x19, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "cmp x4, #0x4\n"
+ "add x12, x13, x26, LSL #2\n"
"bge 48f\n"
- "tbz x17, #1, 46f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d26, [x26], #0x8\n"
- "tbz x17, #0, 47f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
- "ld1 { v26.s }[2], [x26]\n"
+ "tbz x4, #1, 46f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
"b 47f\n"
"46:" // Height 3: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
- "ldr s26, [x26, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
"47:" // Height 3: Partial accumulate: Done
- "sub x14, x14, x19\n"
+ "sub x6, x6, x26\n"
"b 50f\n"
"48:" // Height 3: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
- "ldr q26, [x26, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
"b 50f\n"
"49:" // Height 3: no accumulate
"movi v24.16b, #0x0\n"
"movi v25.16b, #0x0\n"
"movi v26.16b, #0x0\n"
"50:" // Height 3: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"51:" // Height 3: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 52f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x27, [x20, #0x10]\n"
- "cbnz x13, 53f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x19, LSL #2\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "cbnz x7, 53f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
"b 53f\n"
"52:" // Height 3: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
"53:" // Height 3: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 56f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q2, [x15, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 55f\n"
"54:" // Height 3: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x19, [x16, #0x18]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr d10, [x16, #0x20]\n"
- "ldr x21, [x16, #0x28]\n"
- "add x11, x11, #0x10\n"
- "mov v9.d[1], x19\n"
- "ldr d11, [x16, #0x30]\n"
- "ldr x19, [x16, #0x38]\n"
- "add x9, x9, #0x10\n"
+ "add x15, x15, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "mov v10.d[1], x21\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "mov v11.d[1], x19\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "add x27, x27, #0x10\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "ldr x28, [x5, #0x8]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "ldr x10, [x11, #0x8]\n"
+ "ldr x27, [x5, #0x18]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr x28, [x9, #0x8]\n"
- "ldr x26, [x27, #0x8]\n"
- "sub x12, x12, #0x4\n"
+ "ldr d10, [x5, #0x20]\n"
+ "ldr x26, [x5, #0x28]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x16, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
- "mov v0.d[1], x10\n"
- "cmp x12, #0x8\n"
+ "ldr d2, [x15, #0x0]\n"
+ "sub x8, x8, #0x4\n"
+ "ldr d11, [x5, #0x30]\n"
+ "cmp x8, #0x8\n"
+ "ldr x9, [x17, #0x8]\n"
+ "mov v8.d[1], x28\n"
+ "ldr x28, [x16, #0x8]\n"
+ "mov v9.d[1], x27\n"
+ "ldr x27, [x15, #0x8]\n"
+ "mov v10.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v0.d[1], x9\n"
"mov v1.d[1], x28\n"
- "add x16, x16, #0x40\n"
- "mov v2.d[1], x26\n"
- "ldr d8, [x16, #0x0]\n"
- "ldr x26, [x16, #0x8]\n"
- "mov v8.d[1], x26\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
+ "mov v2.d[1], x27\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"bge 54b\n"
"55:" // Height 3: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x16, #0x30]\n"
- "sub x12, x12, #0x4\n"
- "add x11, x11, #0x10\n"
+ "add x15, x15, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "add x27, x27, #0x10\n"
- "add x16, x16, #0x40\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
"56:" // Height 3: Multiply loop: Main loop skip
- "cbz x12, 58f\n"
+ "cbz x8, 58f\n"
"57:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "cbnz x12, 57b\n"
+ "ldr s19, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s18, [x16], #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v19.s[0]\n"
+ "fmla v25.4s, v16.4s, v18.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v26.4s, v16.4s, v17.s[0]\n"
+ "cbnz x8, 57b\n"
"58:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x19\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 51b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x19, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x19, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
"tbz %x[flags], #1, 59f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x19]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
"fmin v26.4s, v26.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
"59:" // Height 3: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 62f\n"
- "tbz x17, #1, 60f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "str d26, [x26], #0x8\n"
- "tbz x17, #0, 61f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
- "st1 { v26.s }[2], [x26]\n"
+ "tbz x4, #1, 60f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "tbz x4, #0, 61f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
"b 61f\n"
"60:" // Height 3: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
- "str s26, [x26, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
"61:" // Height 3: Partial direct writeback: Done
"b 63f\n"
"62:" // Height 3: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
- "str q26, [x26, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
"63:" // Height 3: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 44b\n"
"b 170f\n"
"64:" // Height 4
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"65:" // Height 4: Column loop
- "cbz x15, 66f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 66f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"b 71f\n"
"66:" // Height 4: no bias
"tbz %x[flags], #0, 70f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x19, LSL #2\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "cmp x4, #0x4\n"
+ "add x11, x12, x26, LSL #2\n"
"bge 69f\n"
- "tbz x17, #1, 67f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d26, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
- "tbz x17, #0, 68f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
- "ld1 { v26.s }[2], [x26]\n"
- "ld1 { v27.s }[2], [x25]\n"
+ "tbz x4, #1, 67f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "tbz x4, #0, 68f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
"b 68f\n"
"67:" // Height 4: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
- "ldr s26, [x26, #0x0]\n"
- "ldr s27, [x25, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
"68:" // Height 4: Partial accumulate: Done
- "sub x14, x14, x19\n"
+ "sub x6, x6, x26\n"
"b 71f\n"
"69:" // Height 4: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
- "ldr q26, [x26, #0x0]\n"
- "ldr q27, [x25, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
"b 71f\n"
"70:" // Height 4: no accumulate
"movi v24.16b, #0x0\n"
@@ -669,248 +668,248 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
"71:" // Height 4: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"72:" // Height 4: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x27, [x20, #0x10]\n"
- "ldr x25, [x20, #0x18]\n"
- "cbnz x13, 74f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x19, LSL #2\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "cbnz x7, 74f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
"b 74f\n"
"73:" // Height 4: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
"74:" // Height 4: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 77f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q2, [x15, #0x0]\n"
+ "ldr q3, [x14, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 76f\n"
"75:" // Height 4: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x19, [x16, #0x18]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr d10, [x16, #0x20]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "ldr x21, [x16, #0x28]\n"
- "mov v9.d[1], x19\n"
- "ldr d11, [x16, #0x30]\n"
- "ldr x19, [x16, #0x38]\n"
- "add x11, x11, #0x10\n"
+ "add x14, x14, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "mov v10.d[1], x21\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "mov v11.d[1], x19\n"
+ "ldr x27, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "ldr x10, [x11, #0x8]\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x9, x9, #0x10\n"
+ "ldr x26, [x5, #0x18]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x11, [x5, #0x28]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x10, [x17, #0x8]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "add x27, x27, #0x10\n"
+ "ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d1, [x16, #0x0]\n"
+ "ldr x9, [x16, #0x8]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d2, [x15, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d3, [x14, #0x0]\n"
+ "sub x8, x8, #0x4\n"
+ "ldr d11, [x5, #0x30]\n"
+ "cmp x8, #0x8\n"
+ "ldr x28, [x15, #0x8]\n"
+ "mov v8.d[1], x27\n"
+ "ldr x27, [x14, #0x8]\n"
+ "mov v9.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v10.d[1], x11\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"mov v0.d[1], x10\n"
- "ldr x26, [x27, #0x8]\n"
- "mov v1.d[1], x28\n"
- "add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x12, x12, #0x4\n"
- "mov v2.d[1], x26\n"
- "ldr d3, [x25, #0x0]\n"
- "ldr x19, [x25, #0x8]\n"
- "cmp x12, #0x8\n"
- "add x16, x16, #0x40\n"
- "ldr d8, [x16, #0x0]\n"
- "mov v3.d[1], x19\n"
- "ldr x26, [x16, #0x8]\n"
- "mov v8.d[1], x26\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
+ "mov v1.d[1], x9\n"
+ "mov v2.d[1], x28\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
+ "mov v3.d[1], x27\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "mov v11.d[1], x26\n"
"bge 75b\n"
"76:" // Height 4: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x16, #0x30]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "sub x12, x12, #0x4\n"
+ "add x14, x14, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "add x11, x11, #0x10\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x27, x27, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "add x25, x25, #0x10\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "add x16, x16, #0x40\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
"77:" // Height 4: Multiply loop: Main loop skip
- "cbz x12, 79f\n"
+ "cbz x8, 79f\n"
"78:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "cbnz x12, 78b\n"
+ "ldr s20, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s19, [x16], #0x4\n"
+ "ldr s18, [x15], #0x4\n"
+ "ldr s17, [x14], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v20.s[0]\n"
+ "fmla v25.4s, v16.4s, v19.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v26.4s, v16.4s, v18.s[0]\n"
+ "fmla v27.4s, v16.4s, v17.s[0]\n"
+ "cbnz x8, 78b\n"
"79:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x19\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 72b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x19, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x19, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x19, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"tbz %x[flags], #1, 80f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x19]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
"fmin v26.4s, v26.4s, v16.4s\n"
"fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
"80:" // Height 4: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 83f\n"
- "tbz x17, #1, 81f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "str d26, [x26], #0x8\n"
- "str d27, [x25], #0x8\n"
- "tbz x17, #0, 82f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
- "st1 { v26.s }[2], [x26]\n"
- "st1 { v27.s }[2], [x25]\n"
+ "tbz x4, #1, 81f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "tbz x4, #0, 82f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
"b 82f\n"
"81:" // Height 4: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
- "str s26, [x26, #0x0]\n"
- "str s27, [x25, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
"82:" // Height 4: Partial direct writeback: Done
"b 84f\n"
"83:" // Height 4: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
- "str q26, [x26, #0x0]\n"
- "str q27, [x25, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
"84:" // Height 4: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 65b\n"
"b 170f\n"
"85:" // Height 5
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"86:" // Height 5: Column loop
- "cbz x15, 87f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 87f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"b 92f\n"
"87:" // Height 5: no bias
"tbz %x[flags], #0, 91f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x19, LSL #2\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "cmp x4, #0x4\n"
+ "add x10, x11, x26, LSL #2\n"
"bge 90f\n"
- "tbz x17, #1, 88f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d26, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
- "ldr d28, [x24], #0x8\n"
- "tbz x17, #0, 89f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
- "ld1 { v26.s }[2], [x26]\n"
- "ld1 { v27.s }[2], [x25]\n"
- "ld1 { v28.s }[2], [x24]\n"
+ "tbz x4, #1, 88f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "ldr d28, [x10], #0x8\n"
+ "tbz x4, #0, 89f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
+ "ld1 { v28.s }[2], [x10]\n"
"b 89f\n"
"88:" // Height 5: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
- "ldr s26, [x26, #0x0]\n"
- "ldr s27, [x25, #0x0]\n"
- "ldr s28, [x24, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
+ "ldr s28, [x10, #0x0]\n"
"89:" // Height 5: Partial accumulate: Done
- "sub x14, x14, x19\n"
+ "sub x6, x6, x26\n"
"b 92f\n"
"90:" // Height 5: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
- "ldr q26, [x26, #0x0]\n"
- "ldr q27, [x25, #0x0]\n"
- "ldr q28, [x24, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
+ "ldr q28, [x10, #0x0]\n"
"b 92f\n"
"91:" // Height 5: no accumulate
"movi v24.16b, #0x0\n"
@@ -919,283 +918,283 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"movi v27.16b, #0x0\n"
"movi v28.16b, #0x0\n"
"92:" // Height 5: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"93:" // Height 5: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 94f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x27, [x20, #0x10]\n"
- "ldr x25, [x20, #0x18]\n"
- "ldr x24, [x20, #0x20]\n"
- "cbnz x13, 95f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x19, LSL #2\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "ldr x13, [x26, #0x20]\n"
+ "cbnz x7, 95f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
+ "add x13, x13, x26, LSL #2\n"
"b 95f\n"
"94:" // Height 5: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
+ "add x13, x14, x27, LSL #2\n"
"95:" // Height 5: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 98f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q2, [x15, #0x0]\n"
+ "ldr q3, [x14, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 97f\n"
"96:" // Height 5: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x19, [x16, #0x18]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr d10, [x16, #0x20]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "ldr x21, [x16, #0x28]\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "mov v9.d[1], x19\n"
- "ldr d11, [x16, #0x30]\n"
- "add x11, x11, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "mov v10.d[1], x21\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "ldr x19, [x16, #0x38]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x27, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "ldr x10, [x11, #0x8]\n"
+ "ldr x26, [x5, #0x18]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "mov v11.d[1], x19\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x9, x9, #0x10\n"
+ "ldr x12, [x5, #0x28]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x11, [x17, #0x8]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x10, [x16, #0x8]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "add x27, x27, #0x10\n"
+ "ldr x9, [x15, #0x8]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x16, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d2, [x15, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "mov v0.d[1], x10\n"
+ "ldr d3, [x14, #0x0]\n"
+ "ldr x28, [x14, #0x8]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
- "mov v1.d[1], x28\n"
- "ldr x26, [x27, #0x8]\n"
- "add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x12, x12, #0x4\n"
- "mov v2.d[1], x26\n"
- "ldr d3, [x25, #0x0]\n"
- "ldr x19, [x25, #0x8]\n"
- "cmp x12, #0x8\n"
- "ldr d4, [x24, #0x0]\n"
- "add x16, x16, #0x40\n"
- "ldr x21, [x24, #0x8]\n"
- "mov v3.d[1], x19\n"
- "ldr d8, [x16, #0x0]\n"
- "ldr x26, [x16, #0x8]\n"
- "mov v4.d[1], x21\n"
- "mov v8.d[1], x26\n"
+ "ldr d4, [x13, #0x0]\n"
+ "sub x8, x8, #0x4\n"
+ "ldr d11, [x5, #0x30]\n"
+ "cmp x8, #0x8\n"
+ "mov v8.d[1], x27\n"
+ "ldr x27, [x13, #0x8]\n"
+ "mov v9.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
+ "mov v10.d[1], x12\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
+ "mov v0.d[1], x11\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
+ "mov v1.d[1], x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "mov v2.d[1], x9\n"
+ "mov v3.d[1], x28\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v4.d[1], x27\n"
+ "mov v11.d[1], x26\n"
"bge 96b\n"
"97:" // Height 5: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x16, #0x30]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "sub x12, x12, #0x4\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "add x11, x11, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "add x27, x27, #0x10\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "add x24, x24, #0x10\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "add x16, x16, #0x40\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
"98:" // Height 5: Multiply loop: Main loop skip
- "cbz x12, 100f\n"
+ "cbz x8, 100f\n"
"99:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x24], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "cbnz x12, 99b\n"
+ "ldr s21, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s20, [x16], #0x4\n"
+ "ldr s19, [x15], #0x4\n"
+ "ldr s18, [x14], #0x4\n"
+ "ldr s17, [x13], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v21.s[0]\n"
+ "fmla v25.4s, v16.4s, v20.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v26.4s, v16.4s, v19.s[0]\n"
+ "fmla v27.4s, v16.4s, v18.s[0]\n"
+ "fmla v28.4s, v16.4s, v17.s[0]\n"
+ "cbnz x8, 99b\n"
"100:" // Height 5: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x19\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 93b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x19, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x19, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x19, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
"tbz %x[flags], #1, 101f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x19]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
"fmin v26.4s, v26.4s, v16.4s\n"
"fmin v27.4s, v27.4s, v16.4s\n"
"fmin v28.4s, v28.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v16.4s\n"
"101:" // Height 5: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 104f\n"
- "tbz x17, #1, 102f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "str d26, [x26], #0x8\n"
- "str d27, [x25], #0x8\n"
- "str d28, [x24], #0x8\n"
- "tbz x17, #0, 103f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
- "st1 { v26.s }[2], [x26]\n"
- "st1 { v27.s }[2], [x25]\n"
- "st1 { v28.s }[2], [x24]\n"
+ "tbz x4, #1, 102f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "str d28, [x10], #0x8\n"
+ "tbz x4, #0, 103f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
+ "st1 { v28.s }[2], [x10]\n"
"b 103f\n"
"102:" // Height 5: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
- "str s26, [x26, #0x0]\n"
- "str s27, [x25, #0x0]\n"
- "str s28, [x24, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
+ "str s28, [x10, #0x0]\n"
"103:" // Height 5: Partial direct writeback: Done
"b 105f\n"
"104:" // Height 5: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
- "str q26, [x26, #0x0]\n"
- "str q27, [x25, #0x0]\n"
- "str q28, [x24, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
+ "str q28, [x10, #0x0]\n"
"105:" // Height 5: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 86b\n"
"b 170f\n"
"106:" // Height 6
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"107:" // Height 6: Column loop
- "cbz x15, 108f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 108f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
"b 113f\n"
"108:" // Height 6: no bias
"tbz %x[flags], #0, 112f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x19, LSL #2\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "cmp x4, #0x4\n"
+ "add x9, x10, x26, LSL #2\n"
"bge 111f\n"
- "tbz x17, #1, 109f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d26, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
- "ldr d28, [x24], #0x8\n"
- "ldr d29, [x23], #0x8\n"
- "tbz x17, #0, 110f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
- "ld1 { v26.s }[2], [x26]\n"
- "ld1 { v27.s }[2], [x25]\n"
- "ld1 { v28.s }[2], [x24]\n"
- "ld1 { v29.s }[2], [x23]\n"
+ "tbz x4, #1, 109f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "ldr d28, [x10], #0x8\n"
+ "ldr d29, [x9], #0x8\n"
+ "tbz x4, #0, 110f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
+ "ld1 { v28.s }[2], [x10]\n"
+ "ld1 { v29.s }[2], [x9]\n"
"b 110f\n"
"109:" // Height 6: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
- "ldr s26, [x26, #0x0]\n"
- "ldr s27, [x25, #0x0]\n"
- "ldr s28, [x24, #0x0]\n"
- "ldr s29, [x23, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
+ "ldr s28, [x10, #0x0]\n"
+ "ldr s29, [x9, #0x0]\n"
"110:" // Height 6: Partial accumulate: Done
- "sub x14, x14, x19\n"
+ "sub x6, x6, x26\n"
"b 113f\n"
"111:" // Height 6: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
- "ldr q26, [x26, #0x0]\n"
- "ldr q27, [x25, #0x0]\n"
- "ldr q28, [x24, #0x0]\n"
- "ldr q29, [x23, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q29, [x9, #0x0]\n"
"b 113f\n"
"112:" // Height 6: no accumulate
"movi v24.16b, #0x0\n"
@@ -1205,154 +1204,154 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"movi v28.16b, #0x0\n"
"movi v29.16b, #0x0\n"
"113:" // Height 6: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"114:" // Height 6: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 115f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x27, [x20, #0x10]\n"
- "ldr x25, [x20, #0x18]\n"
- "ldr x24, [x20, #0x20]\n"
- "ldr x23, [x20, #0x28]\n"
- "cbnz x13, 116f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x19, LSL #2\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "ldr x13, [x26, #0x20]\n"
+ "ldr x12, [x26, #0x28]\n"
+ "cbnz x7, 116f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
+ "add x13, x13, x26, LSL #2\n"
+ "add x12, x12, x26, LSL #2\n"
"b 116f\n"
"115:" // Height 6: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
+ "add x13, x14, x27, LSL #2\n"
+ "add x12, x13, x27, LSL #2\n"
"116:" // Height 6: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 119f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr q5, [x23, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q2, [x15, #0x0]\n"
+ "ldr q3, [x14, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q5, [x12, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 118f\n"
"117:" // Height 6: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x19, [x16, #0x18]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr d10, [x16, #0x20]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "ldr x21, [x16, #0x28]\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "mov v9.d[1], x19\n"
+ "add x13, x13, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "ldr d11, [x16, #0x30]\n"
+ "add x12, x12, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "mov v10.d[1], x21\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "ldr x19, [x16, #0x38]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "add x11, x11, #0x10\n"
+ "ldr x9, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x28, [x5, #0x18]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "mov v11.d[1], x19\n"
+ "ldr x27, [x5, #0x28]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "ldr x10, [x11, #0x8]\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x9, x9, #0x10\n"
+ "ldr x26, [x17, #0x8]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x11, [x16, #0x8]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x10, [x15, #0x8]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "add x27, x27, #0x10\n"
+ "sub x8, x8, #0x4\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "cmp x8, #0x8\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
- "ldr x26, [x27, #0x8]\n"
+ "ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x16, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d2, [x15, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "mov v0.d[1], x10\n"
+ "ldr d3, [x14, #0x0]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
- "mov v1.d[1], x28\n"
+ "ldr d4, [x13, #0x0]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
- "mov v2.d[1], x26\n"
- "add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x12, x12, #0x4\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "cmp x12, #0x8\n"
- "ldr d3, [x25, #0x0]\n"
- "add x16, x16, #0x40\n"
- "ldr x19, [x25, #0x8]\n"
- "ldr d4, [x24, #0x0]\n"
- "ldr x21, [x24, #0x8]\n"
- "mov v3.d[1], x19\n"
- "ldr d5, [x23, #0x0]\n"
- "ldr x19, [x23, #0x8]\n"
- "mov v4.d[1], x21\n"
- "ldr d8, [x16, #0x0]\n"
- "ldr x26, [x16, #0x8]\n"
- "mov v5.d[1], x19\n"
- "mov v8.d[1], x26\n"
+ "ldr d5, [x12, #0x0]\n"
+ "ldr d11, [x5, #0x30]\n"
+ "mov v8.d[1], x9\n"
+ "ldr x9, [x14, #0x8]\n"
+ "mov v9.d[1], x28\n"
+ "ldr x28, [x13, #0x8]\n"
+ "mov v10.d[1], x27\n"
+ "ldr x27, [x12, #0x8]\n"
+ "mov v0.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v1.d[1], x11\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
+ "mov v2.d[1], x10\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
+ "mov v3.d[1], x9\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
+ "mov v4.d[1], x28\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "mov v5.d[1], x27\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 117b\n"
"118:" // Height 6: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x16, #0x30]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "sub x12, x12, #0x4\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "add x11, x11, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "add x12, x12, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "add x9, x9, #0x10\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "add x27, x27, #0x10\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "add x23, x23, #0x10\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "add x16, x16, #0x40\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
@@ -1361,108 +1360,108 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v28.4s, v11.4s, v4.s[3]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
"119:" // Height 6: Multiply loop: Main loop skip
- "cbz x12, 121f\n"
+ "cbz x8, 121f\n"
"120:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s5, [x23], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "fmla v29.4s, v12.4s, v5.s[0]\n"
- "cbnz x12, 120b\n"
+ "ldr s22, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s21, [x16], #0x4\n"
+ "ldr s20, [x15], #0x4\n"
+ "ldr s19, [x14], #0x4\n"
+ "ldr s18, [x13], #0x4\n"
+ "ldr s17, [x12], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v22.s[0]\n"
+ "fmla v25.4s, v16.4s, v21.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v26.4s, v16.4s, v20.s[0]\n"
+ "fmla v27.4s, v16.4s, v19.s[0]\n"
+ "fmla v28.4s, v16.4s, v18.s[0]\n"
+ "fmla v29.4s, v16.4s, v17.s[0]\n"
+ "cbnz x8, 120b\n"
"121:" // Height 6: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x19\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 114b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x19, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x19, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x19, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"tbz %x[flags], #1, 122f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x19]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
"fmin v26.4s, v26.4s, v16.4s\n"
"fmin v27.4s, v27.4s, v16.4s\n"
"fmin v28.4s, v28.4s, v16.4s\n"
"fmin v29.4s, v29.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v16.4s\n"
"122:" // Height 6: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 125f\n"
- "tbz x17, #1, 123f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "str d26, [x26], #0x8\n"
- "str d27, [x25], #0x8\n"
- "str d28, [x24], #0x8\n"
- "str d29, [x23], #0x8\n"
- "tbz x17, #0, 124f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
- "st1 { v26.s }[2], [x26]\n"
- "st1 { v27.s }[2], [x25]\n"
- "st1 { v28.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x23]\n"
+ "tbz x4, #1, 123f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "str d28, [x10], #0x8\n"
+ "str d29, [x9], #0x8\n"
+ "tbz x4, #0, 124f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
+ "st1 { v28.s }[2], [x10]\n"
+ "st1 { v29.s }[2], [x9]\n"
"b 124f\n"
"123:" // Height 6: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
- "str s26, [x26, #0x0]\n"
- "str s27, [x25, #0x0]\n"
- "str s28, [x24, #0x0]\n"
- "str s29, [x23, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
+ "str s28, [x10, #0x0]\n"
+ "str s29, [x9, #0x0]\n"
"124:" // Height 6: Partial direct writeback: Done
"b 126f\n"
"125:" // Height 6: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
- "str q26, [x26, #0x0]\n"
- "str q27, [x25, #0x0]\n"
- "str q28, [x24, #0x0]\n"
- "str q29, [x23, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
+ "str q28, [x10, #0x0]\n"
+ "str q29, [x9, #0x0]\n"
"126:" // Height 6: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 107b\n"
"b 170f\n"
"127:" // Height 7
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
"128:" // Height 7: Column loop
- "cbz x15, 129f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 129f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
@@ -1470,53 +1469,53 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"b 134f\n"
"129:" // Height 7: no bias
"tbz %x[flags], #0, 133f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x19, LSL #2\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "cmp x4, #0x4\n"
+ "add x28, x9, x26, LSL #2\n"
"bge 132f\n"
- "tbz x17, #1, 130f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d26, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
- "ldr d28, [x24], #0x8\n"
- "ldr d29, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
- "tbz x17, #0, 131f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
- "ld1 { v26.s }[2], [x26]\n"
- "ld1 { v27.s }[2], [x25]\n"
- "ld1 { v28.s }[2], [x24]\n"
- "ld1 { v29.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
+ "tbz x4, #1, 130f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "ldr d28, [x10], #0x8\n"
+ "ldr d29, [x9], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
+ "tbz x4, #0, 131f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
+ "ld1 { v28.s }[2], [x10]\n"
+ "ld1 { v29.s }[2], [x9]\n"
+ "ld1 { v30.s }[2], [x28]\n"
"b 131f\n"
"130:" // Height 7: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
- "ldr s26, [x26, #0x0]\n"
- "ldr s27, [x25, #0x0]\n"
- "ldr s28, [x24, #0x0]\n"
- "ldr s29, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
+ "ldr s28, [x10, #0x0]\n"
+ "ldr s29, [x9, #0x0]\n"
+ "ldr s30, [x28, #0x0]\n"
"131:" // Height 7: Partial accumulate: Done
- "sub x14, x14, x19\n"
+ "sub x6, x6, x26\n"
"b 134f\n"
"132:" // Height 7: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
- "ldr q26, [x26, #0x0]\n"
- "ldr q27, [x25, #0x0]\n"
- "ldr q28, [x24, #0x0]\n"
- "ldr q29, [x23, #0x0]\n"
- "ldr q30, [x22, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q29, [x9, #0x0]\n"
+ "ldr q30, [x28, #0x0]\n"
"b 134f\n"
"133:" // Height 7: no accumulate
"movi v24.16b, #0x0\n"
@@ -1527,171 +1526,171 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"movi v29.16b, #0x0\n"
"movi v30.16b, #0x0\n"
"134:" // Height 7: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"135:" // Height 7: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 136f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x27, [x20, #0x10]\n"
- "ldr x25, [x20, #0x18]\n"
- "ldr x24, [x20, #0x20]\n"
- "ldr x23, [x20, #0x28]\n"
- "ldr x22, [x20, #0x30]\n"
- "cbnz x13, 137f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x19, LSL #2\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "ldr x13, [x26, #0x20]\n"
+ "ldr x12, [x26, #0x28]\n"
+ "ldr x11, [x26, #0x30]\n"
+ "cbnz x7, 137f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
+ "add x13, x13, x26, LSL #2\n"
+ "add x12, x12, x26, LSL #2\n"
+ "add x11, x11, x26, LSL #2\n"
"b 137f\n"
"136:" // Height 7: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
+ "add x13, x14, x27, LSL #2\n"
+ "add x12, x13, x27, LSL #2\n"
+ "add x11, x12, x27, LSL #2\n"
"137:" // Height 7: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 140f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr q5, [x23, #0x0]\n"
- "ldr q6, [x22, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q2, [x15, #0x0]\n"
+ "ldr q3, [x14, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q5, [x12, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 139f\n"
"138:" // Height 7: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x19, [x16, #0x18]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr d10, [x16, #0x20]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "ldr x21, [x16, #0x28]\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "mov v9.d[1], x19\n"
+ "add x13, x13, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "ldr d11, [x16, #0x30]\n"
+ "add x12, x12, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "mov v10.d[1], x21\n"
+ "add x11, x11, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "ldr x19, [x16, #0x38]\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "add x11, x11, #0x10\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "ldr x26, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "mov v11.d[1], x19\n"
+ "ldr x10, [x5, #0x18]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "ldr x10, [x11, #0x8]\n"
+ "ldr x9, [x5, #0x28]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "add x9, x9, #0x10\n"
+ "ldr x28, [x17, #0x8]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x27, [x16, #0x8]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "add x27, x27, #0x10\n"
+ "sub x8, x8, #0x4\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "cmp x8, #0x8\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "ldr x26, [x27, #0x8]\n"
+ "mov v8.d[1], x26\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "add x25, x25, #0x10\n"
+ "ldr x26, [x15, #0x8]\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v30.4s, v10.4s, v6.s[2]\n"
- "ldr x19, [x25, #0x8]\n"
+ "ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x16, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d2, [x15, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "mov v0.d[1], x10\n"
+ "ldr d3, [x14, #0x0]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
- "mov v1.d[1], x28\n"
+ "ldr d4, [x13, #0x0]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
- "mov v2.d[1], x26\n"
+ "ldr d5, [x12, #0x0]\n"
"fmla v30.4s, v11.4s, v6.s[3]\n"
- "ldr d3, [x25, #0x0]\n"
- "add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x22, x22, #0x10\n"
- "mov v3.d[1], x19\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x12, x12, #0x4\n"
- "ldr d4, [x24, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr x21, [x24, #0x8]\n"
- "add x16, x16, #0x40\n"
- "ldr d8, [x16, #0x0]\n"
- "ldr x26, [x16, #0x8]\n"
- "mov v4.d[1], x21\n"
- "ldr d5, [x23, #0x0]\n"
- "ldr x19, [x23, #0x8]\n"
- "mov v8.d[1], x26\n"
- "ldr d6, [x22, #0x0]\n"
- "ldr x21, [x22, #0x8]\n"
- "mov v5.d[1], x19\n"
- "mov v6.d[1], x21\n"
+ "ldr d6, [x11, #0x0]\n"
+ "ldr d11, [x5, #0x30]\n"
+ "mov v9.d[1], x10\n"
+ "ldr x10, [x14, #0x8]\n"
+ "mov v10.d[1], x9\n"
+ "ldr x9, [x13, #0x8]\n"
+ "mov v0.d[1], x28\n"
+ "ldr x28, [x12, #0x8]\n"
+ "mov v1.d[1], x27\n"
+ "ldr x27, [x11, #0x8]\n"
+ "mov v2.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v3.d[1], x10\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
+ "mov v4.d[1], x9\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
+ "mov v5.d[1], x28\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "mov v6.d[1], x27\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"bge 138b\n"
"139:" // Height 7: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x16, #0x30]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "sub x12, x12, #0x4\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "add x11, x11, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "add x12, x12, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "add x27, x27, #0x10\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x23, x23, #0x10\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "add x22, x22, #0x10\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "add x16, x16, #0x40\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
"fmla v30.4s, v10.4s, v6.s[2]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
@@ -1702,50 +1701,48 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v29.4s, v11.4s, v5.s[3]\n"
"fmla v30.4s, v11.4s, v6.s[3]\n"
"140:" // Height 7: Multiply loop: Main loop skip
- "cbz x12, 142f\n"
+ "cbz x8, 142f\n"
"141:" // Height 7: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s5, [x23], #0x4\n"
- "ldr s6, [x22], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "fmla v29.4s, v12.4s, v5.s[0]\n"
- "fmla v30.4s, v12.4s, v6.s[0]\n"
- "cbnz x12, 141b\n"
+ "ldr s23, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s22, [x16], #0x4\n"
+ "ldr s21, [x15], #0x4\n"
+ "ldr s20, [x14], #0x4\n"
+ "ldr s19, [x13], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
+ "ldr s17, [x11], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v23.s[0]\n"
+ "fmla v25.4s, v16.4s, v22.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v26.4s, v16.4s, v21.s[0]\n"
+ "fmla v27.4s, v16.4s, v20.s[0]\n"
+ "fmla v28.4s, v16.4s, v19.s[0]\n"
+ "fmla v29.4s, v16.4s, v18.s[0]\n"
+ "fmla v30.4s, v16.4s, v17.s[0]\n"
+ "cbnz x8, 141b\n"
"142:" // Height 7: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x19\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 135b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x19, LSL #2\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x19, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x19, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "add x28, x9, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
"tbz %x[flags], #1, 143f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x19]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
"fmin v26.4s, v26.4s, v16.4s\n"
@@ -1753,70 +1750,72 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmin v28.4s, v28.4s, v16.4s\n"
"fmin v29.4s, v29.4s, v16.4s\n"
"fmin v30.4s, v30.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
- "fmax v30.4s, v30.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v16.4s\n"
+ "fmax v30.4s, v30.4s, v16.4s\n"
"143:" // Height 7: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 146f\n"
- "tbz x17, #1, 144f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "str d26, [x26], #0x8\n"
- "str d27, [x25], #0x8\n"
- "str d28, [x24], #0x8\n"
- "str d29, [x23], #0x8\n"
- "str d30, [x22], #0x8\n"
- "tbz x17, #0, 145f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
- "st1 { v26.s }[2], [x26]\n"
- "st1 { v27.s }[2], [x25]\n"
- "st1 { v28.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x22]\n"
+ "tbz x4, #1, 144f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "str d28, [x10], #0x8\n"
+ "str d29, [x9], #0x8\n"
+ "str d30, [x28], #0x8\n"
+ "tbz x4, #0, 145f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
+ "st1 { v28.s }[2], [x10]\n"
+ "st1 { v29.s }[2], [x9]\n"
+ "st1 { v30.s }[2], [x28]\n"
"b 145f\n"
"144:" // Height 7: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
- "str s26, [x26, #0x0]\n"
- "str s27, [x25, #0x0]\n"
- "str s28, [x24, #0x0]\n"
- "str s29, [x23, #0x0]\n"
- "str s30, [x22, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
+ "str s28, [x10, #0x0]\n"
+ "str s29, [x9, #0x0]\n"
+ "str s30, [x28, #0x0]\n"
"145:" // Height 7: Partial direct writeback: Done
"b 147f\n"
"146:" // Height 7: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
- "str q26, [x26, #0x0]\n"
- "str q27, [x25, #0x0]\n"
- "str q28, [x24, #0x0]\n"
- "str q29, [x23, #0x0]\n"
- "str q30, [x22, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
+ "str q28, [x10, #0x0]\n"
+ "str q29, [x9, #0x0]\n"
+ "str q30, [x28, #0x0]\n"
"147:" // Height 7: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 128b\n"
"b 170f\n"
"148:" // Height 8
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[bias]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[output_ptr]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x19, #0x20\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x26, #0x20\n"
+ "mov x3, %x[bias]\n"
+ "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x6, %x[output_ptr]\n"
+ "madd %x[output_ptr], x27, x26, %x[output_ptr]\n"
"149:" // Height 8: Column loop
- "cbz x15, 150f\n"
- "ldr q24, [x15, #0x0]\n"
- "add x15, x15, #0x10\n"
+ "cbz x3, 150f\n"
+ "ldr q24, [x3, #0x0]\n"
"mov v25.16b, v24.16b\n"
"mov v26.16b, v24.16b\n"
+ "add x3, x3, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
@@ -1825,58 +1824,58 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"b 155f\n"
"150:" // Height 8: no bias
"tbz %x[flags], #0, 154f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x4\n"
- "add x27, x14, x19, LSL #2\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "add x28, x9, x26, LSL #2\n"
+ "cmp x4, #0x4\n"
+ "add x27, x28, x26, LSL #2\n"
"bge 153f\n"
- "tbz x17, #1, 151f\n"
- "ldr d24, [x14], #0x8\n"
- "ldr d25, [x27], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d26, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
- "ldr d28, [x24], #0x8\n"
- "ldr d29, [x23], #0x8\n"
- "ldr d30, [x22], #0x8\n"
- "ldr d31, [x21], #0x8\n"
- "tbz x17, #0, 152f\n"
- "ld1 { v24.s }[2], [x14]\n"
- "ld1 { v25.s }[2], [x27]\n"
- "ld1 { v26.s }[2], [x26]\n"
- "ld1 { v27.s }[2], [x25]\n"
- "ld1 { v28.s }[2], [x24]\n"
- "ld1 { v29.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x22]\n"
- "ld1 { v31.s }[2], [x21]\n"
+ "tbz x4, #1, 151f\n"
+ "ldr d24, [x6], #0x8\n"
+ "mov x26, #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d27, [x11], #0x8\n"
+ "ldr d28, [x10], #0x8\n"
+ "ldr d29, [x9], #0x8\n"
+ "ldr d30, [x28], #0x8\n"
+ "ldr d31, [x27], #0x8\n"
+ "tbz x4, #0, 152f\n"
+ "ld1 { v24.s }[2], [x6]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
+ "ld1 { v27.s }[2], [x11]\n"
+ "ld1 { v28.s }[2], [x10]\n"
+ "ld1 { v29.s }[2], [x9]\n"
+ "ld1 { v30.s }[2], [x28]\n"
+ "ld1 { v31.s }[2], [x27]\n"
"b 152f\n"
"151:" // Height 8: Partial accumulate: partial_1_0
- "ldr s24, [x14, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x27, #0x0]\n"
- "ldr s26, [x26, #0x0]\n"
- "ldr s27, [x25, #0x0]\n"
- "ldr s28, [x24, #0x0]\n"
- "ldr s29, [x23, #0x0]\n"
- "ldr s30, [x22, #0x0]\n"
- "ldr s31, [x21, #0x0]\n"
+ "ldr s24, [x6, #0x0]\n"
+ "mov x26, #0x0\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x12, #0x0]\n"
+ "ldr s27, [x11, #0x0]\n"
+ "ldr s28, [x10, #0x0]\n"
+ "ldr s29, [x9, #0x0]\n"
+ "ldr s30, [x28, #0x0]\n"
+ "ldr s31, [x27, #0x0]\n"
"152:" // Height 8: Partial accumulate: Done
- "sub x14, x14, x19\n"
+ "sub x6, x6, x26\n"
"b 155f\n"
"153:" // Height 8: full accumulate
- "ldr q24, [x14, #0x0]\n"
- "ldr q25, [x27, #0x0]\n"
- "ldr q26, [x26, #0x0]\n"
- "ldr q27, [x25, #0x0]\n"
- "ldr q28, [x24, #0x0]\n"
- "ldr q29, [x23, #0x0]\n"
- "ldr q30, [x22, #0x0]\n"
- "ldr q31, [x21, #0x0]\n"
+ "ldr q24, [x6, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q27, [x11, #0x0]\n"
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q29, [x9, #0x0]\n"
+ "ldr q30, [x28, #0x0]\n"
+ "ldr q31, [x27, #0x0]\n"
"b 155f\n"
"154:" // Height 8: no accumulate
"movi v24.16b, #0x0\n"
@@ -1888,188 +1887,188 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"movi v30.16b, #0x0\n"
"movi v31.16b, #0x0\n"
"155:" // Height 8: setup done
- "mov x13, #0x0\n"
+ "mov x7, #0x0\n"
"156:" // Height 8: String loop
- "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w12, [x20, x13, LSL #0x2]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w8, [x26, x7, LSL #0x2]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 157f\n"
- "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x11, [x20, #0x0]\n"
- "ldr x9, [x20, #0x8]\n"
- "ldr x27, [x20, #0x10]\n"
- "ldr x25, [x20, #0x18]\n"
- "ldr x24, [x20, #0x20]\n"
- "ldr x23, [x20, #0x28]\n"
- "ldr x22, [x20, #0x30]\n"
- "ldr x20, [x20, #0x38]\n"
- "cbnz x13, 158f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x11, x11, x19, LSL #2\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
- "add x20, x20, x19, LSL #2\n"
+ "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+ "add x26, x26, x27, LSL #3\n"
+ "ldr x17, [x26, #0x0]\n"
+ "ldr x16, [x26, #0x8]\n"
+ "ldr x15, [x26, #0x10]\n"
+ "ldr x14, [x26, #0x18]\n"
+ "ldr x13, [x26, #0x20]\n"
+ "ldr x12, [x26, #0x28]\n"
+ "ldr x11, [x26, #0x30]\n"
+ "ldr x27, [x26, #0x38]\n"
+ "cbnz x7, 158f\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x17, x17, x26, LSL #2\n"
+ "add x16, x16, x26, LSL #2\n"
+ "add x15, x15, x26, LSL #2\n"
+ "add x14, x14, x26, LSL #2\n"
+ "add x13, x13, x26, LSL #2\n"
+ "add x12, x12, x26, LSL #2\n"
+ "add x11, x11, x26, LSL #2\n"
+ "add x27, x27, x26, LSL #2\n"
"b 158f\n"
"157:" // Height 8: setup direct input
- "mov x11, %x[input_ptr]\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x20, x22, x19, LSL #2\n"
+ "mov x17, %x[input_ptr]\n"
+ "add x16, x17, x27, LSL #2\n"
+ "add x15, x16, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
+ "add x13, x14, x27, LSL #2\n"
+ "add x12, x13, x27, LSL #2\n"
+ "add x11, x12, x27, LSL #2\n"
+ "add x27, x11, x27, LSL #2\n"
"158:" // Height 8: input setup done
- "cmp x12, #0x4\n"
+ "cmp x8, #0x4\n"
"blt 161f\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q1, [x9, #0x0]\n"
- "cmp x12, #0x8\n"
- "ldr q2, [x27, #0x0]\n"
- "ldr q3, [x25, #0x0]\n"
- "ldr q4, [x24, #0x0]\n"
- "ldr q5, [x23, #0x0]\n"
- "ldr q6, [x22, #0x0]\n"
- "ldr q7, [x20, #0x0]\n"
- "ldr q8, [x16, #0x0]\n"
+ "ldr q0, [x17, #0x0]\n"
+ "cmp x8, #0x8\n"
+ "ldr q1, [x16, #0x0]\n"
+ "ldr q2, [x15, #0x0]\n"
+ "ldr q3, [x14, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q5, [x12, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ "ldr q7, [x27, #0x0]\n"
+ "ldr q8, [x5, #0x0]\n"
+ "ldr q9, [x5, #0x10]\n"
+ "ldr q10, [x5, #0x20]\n"
+ "ldr q11, [x5, #0x30]\n"
"blt 160f\n"
"159:" // Height 8: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr d9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr x19, [x16, #0x18]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr d10, [x16, #0x20]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "ldr x21, [x16, #0x28]\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "mov v9.d[1], x19\n"
+ "add x13, x13, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "ldr d11, [x16, #0x30]\n"
+ "add x12, x12, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "mov v10.d[1], x21\n"
+ "add x11, x11, #0x10\n"
"fmla v31.4s, v8.4s, v7.s[0]\n"
- "ldr x19, [x16, #0x38]\n"
+ "add x27, x27, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "add x11, x11, #0x10\n"
+ "add x5, x5, #0x40\n"
+ "ldr d8, [x5, #0x0]\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "mov v11.d[1], x19\n"
+ "ldr x26, [x5, #0x8]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "ldr x10, [x11, #0x8]\n"
+ "sub x8, x8, #0x4\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "add x9, x9, #0x10\n"
+ "cmp x8, #0x8\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
+ "mov v8.d[1], x26\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
- "ldr x28, [x9, #0x8]\n"
+ "ldr x26, [x5, #0x18]\n"
"fmla v31.4s, v9.4s, v7.s[1]\n"
- "add x27, x27, #0x10\n"
+ "ldr d9, [x5, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "ldr x26, [x27, #0x8]\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "mov v9.d[1], x26\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "ldr x19, [x25, #0x8]\n"
+ "ldr x26, [x5, #0x28]\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
"fmla v30.4s, v10.4s, v6.s[2]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v31.4s, v10.4s, v7.s[2]\n"
- "ldr x21, [x24, #0x8]\n"
+ "ldr d10, [x5, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr d0, [x11, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr d1, [x9, #0x0]\n"
+ "ldr d1, [x16, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr d2, [x27, #0x0]\n"
+ "ldr d2, [x15, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "mov v0.d[1], x10\n"
+ "ldr d3, [x14, #0x0]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
- "mov v1.d[1], x28\n"
+ "ldr d4, [x13, #0x0]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
- "mov v2.d[1], x26\n"
+ "ldr d5, [x12, #0x0]\n"
"fmla v30.4s, v11.4s, v6.s[3]\n"
- "ldr d3, [x25, #0x0]\n"
+ "ldr d6, [x11, #0x0]\n"
"fmla v31.4s, v11.4s, v7.s[3]\n"
- "ldr d4, [x24, #0x0]\n"
- "add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "mov v3.d[1], x19\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "mov v4.d[1], x21\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "ldr d5, [x23, #0x0]\n"
- "add x20, x20, #0x10\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "sub x12, x12, #0x4\n"
- "ldr x19, [x23, #0x8]\n"
- "cmp x12, #0x8\n"
- "ldr d6, [x22, #0x0]\n"
- "add x16, x16, #0x40\n"
- "ldr d8, [x16, #0x0]\n"
- "mov v5.d[1], x19\n"
+ "ldr d7, [x27, #0x0]\n"
+ "ldr d11, [x5, #0x30]\n"
+ "mov v10.d[1], x26\n"
+ "ldr x26, [x17, #0x8]\n"
+ "mov v0.d[1], x26\n"
"ldr x26, [x16, #0x8]\n"
- "ldr x21, [x22, #0x8]\n"
- "ldr d7, [x20, #0x0]\n"
- "mov v8.d[1], x26\n"
- "ldr x19, [x20, #0x8]\n"
- "mov v6.d[1], x21\n"
- "mov v7.d[1], x19\n"
+ "mov v1.d[1], x26\n"
+ "ldr x26, [x15, #0x8]\n"
+ "mov v2.d[1], x26\n"
+ "ldr x26, [x14, #0x8]\n"
+ "mov v3.d[1], x26\n"
+ "ldr x26, [x13, #0x8]\n"
+ "mov v4.d[1], x26\n"
+ "ldr x26, [x12, #0x8]\n"
+ "mov v5.d[1], x26\n"
+ "ldr x26, [x11, #0x8]\n"
+ "mov v6.d[1], x26\n"
+ "ldr x26, [x27, #0x8]\n"
+ "mov v7.d[1], x26\n"
+ "ldr x26, [x5, #0x38]\n"
+ "mov v11.d[1], x26\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"bge 159b\n"
"160:" // Height 8: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x16, #0x10]\n"
+ "add x17, x17, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x16, #0x20]\n"
+ "add x16, x16, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x16, #0x30]\n"
+ "add x15, x15, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "sub x12, x12, #0x4\n"
+ "add x14, x14, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "add x11, x11, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x11, #0x80]\n"
+ "add x12, x12, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "add x9, x9, #0x10\n"
+ "add x11, x11, #0x10\n"
"fmla v31.4s, v8.4s, v7.s[0]\n"
- "prfm pldl1keep, [x9, #0x80]\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
"add x27, x27, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "sub x8, x8, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x17, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x16, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x15, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
- "add x23, x23, #0x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"fmla v31.4s, v9.4s, v7.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "add x5, x5, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
- "add x20, x20, #0x10\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
- "add x16, x16, #0x40\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
"fmla v30.4s, v10.4s, v6.s[2]\n"
"fmla v31.4s, v10.4s, v7.s[2]\n"
@@ -2082,54 +2081,52 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmla v30.4s, v11.4s, v6.s[3]\n"
"fmla v31.4s, v11.4s, v7.s[3]\n"
"161:" // Height 8: Multiply loop: Main loop skip
- "cbz x12, 163f\n"
+ "cbz x8, 163f\n"
"162:" // Height 8: Multiply loop: Odd block loop
- "ldr s0, [x11], #0x4\n"
- "sub x12, x12, #0x1\n"
- "ldr s1, [x9], #0x4\n"
- "ldr s2, [x27], #0x4\n"
- "ldr s3, [x25], #0x4\n"
- "ldr s4, [x24], #0x4\n"
- "ldr s5, [x23], #0x4\n"
- "ldr s6, [x22], #0x4\n"
- "ldr s7, [x20], #0x4\n"
- "ldr q12, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "fmla v29.4s, v12.4s, v5.s[0]\n"
- "fmla v30.4s, v12.4s, v6.s[0]\n"
- "fmla v31.4s, v12.4s, v7.s[0]\n"
- "cbnz x12, 162b\n"
+ "ldr s0, [x17], #0x4\n"
+ "sub x8, x8, #0x1\n"
+ "ldr s23, [x16], #0x4\n"
+ "ldr s22, [x15], #0x4\n"
+ "ldr s21, [x14], #0x4\n"
+ "ldr s20, [x13], #0x4\n"
+ "ldr s19, [x12], #0x4\n"
+ "ldr s18, [x11], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr q16, [x5, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "fmla v25.4s, v16.4s, v23.s[0]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v26.4s, v16.4s, v22.s[0]\n"
+ "fmla v27.4s, v16.4s, v21.s[0]\n"
+ "fmla v28.4s, v16.4s, v20.s[0]\n"
+ "fmla v29.4s, v16.4s, v19.s[0]\n"
+ "fmla v30.4s, v16.4s, v18.s[0]\n"
+ "fmla v31.4s, v16.4s, v17.s[0]\n"
+ "cbnz x8, 162b\n"
"163:" // Height 8: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x13, x13, #0x1\n"
- "cmp x13, x19\n"
+ "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x7, x7, #0x1\n"
+ "cmp x7, x26\n"
"bne 156b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x14, #0x0]\n"
- "add x27, x14, x19, LSL #2\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x13, x6, x26, LSL #2\n"
+ "add x12, x13, x26, LSL #2\n"
+ "add x11, x12, x26, LSL #2\n"
+ "add x10, x11, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "add x28, x9, x26, LSL #2\n"
+ "add x27, x28, x26, LSL #2\n"
+ "prfm pstl1keep, [x6, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
"prfm pstl1keep, [x27, #0x0]\n"
- "add x26, x27, x19, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x19, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 164f\n"
- "add x20, %x[args_ptr], %[offset_min]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v17.4s }, [x20]\n"
- "ld1r { v16.4s }, [x19]\n"
+ "add x26, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x26]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmin v25.4s, v25.4s, v16.4s\n"
"fmin v26.4s, v26.4s, v16.4s\n"
@@ -2138,76 +2135,77 @@ void a64_hybrid_fp32_mla_8x4_a55 (
"fmin v29.4s, v29.4s, v16.4s\n"
"fmin v30.4s, v30.4s, v16.4s\n"
"fmin v31.4s, v31.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
- "fmax v30.4s, v30.4s, v17.4s\n"
- "fmax v31.4s, v31.4s, v17.4s\n"
+ "add x26, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x26]\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v16.4s\n"
+ "fmax v30.4s, v30.4s, v16.4s\n"
+ "fmax v31.4s, v31.4s, v16.4s\n"
"164:" // Height 8: No activation
- "cmp x17, #0x4\n"
+ "cmp x4, #0x4\n"
"bge 167f\n"
- "tbz x17, #1, 165f\n"
- "str d24, [x14], #0x8\n"
- "str d25, [x27], #0x8\n"
- "str d26, [x26], #0x8\n"
- "str d27, [x25], #0x8\n"
- "str d28, [x24], #0x8\n"
- "str d29, [x23], #0x8\n"
- "str d30, [x22], #0x8\n"
- "str d31, [x21], #0x8\n"
- "tbz x17, #0, 166f\n"
- "st1 { v24.s }[2], [x14]\n"
- "st1 { v25.s }[2], [x27]\n"
- "st1 { v26.s }[2], [x26]\n"
- "st1 { v27.s }[2], [x25]\n"
- "st1 { v28.s }[2], [x24]\n"
- "st1 { v29.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x22]\n"
- "st1 { v31.s }[2], [x21]\n"
+ "tbz x4, #1, 165f\n"
+ "str d24, [x6], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x12], #0x8\n"
+ "str d27, [x11], #0x8\n"
+ "str d28, [x10], #0x8\n"
+ "str d29, [x9], #0x8\n"
+ "str d30, [x28], #0x8\n"
+ "str d31, [x27], #0x8\n"
+ "tbz x4, #0, 166f\n"
+ "st1 { v24.s }[2], [x6]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x12]\n"
+ "st1 { v27.s }[2], [x11]\n"
+ "st1 { v28.s }[2], [x10]\n"
+ "st1 { v29.s }[2], [x9]\n"
+ "st1 { v30.s }[2], [x28]\n"
+ "st1 { v31.s }[2], [x27]\n"
"b 166f\n"
"165:" // Height 8: Partial direct writeback: partial_1_0
- "str s24, [x14, #0x0]\n"
- "str s25, [x27, #0x0]\n"
- "str s26, [x26, #0x0]\n"
- "str s27, [x25, #0x0]\n"
- "str s28, [x24, #0x0]\n"
- "str s29, [x23, #0x0]\n"
- "str s30, [x22, #0x0]\n"
- "str s31, [x21, #0x0]\n"
+ "str s24, [x6, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x12, #0x0]\n"
+ "str s27, [x11, #0x0]\n"
+ "str s28, [x10, #0x0]\n"
+ "str s29, [x9, #0x0]\n"
+ "str s30, [x28, #0x0]\n"
+ "str s31, [x27, #0x0]\n"
"166:" // Height 8: Partial direct writeback: Done
"b 168f\n"
"167:" // Height 8: Full writeback
- "str q24, [x14, #0x0]\n"
- "add x14, x14, #0x10\n"
- "str q25, [x27, #0x0]\n"
- "str q26, [x26, #0x0]\n"
- "str q27, [x25, #0x0]\n"
- "str q28, [x24, #0x0]\n"
- "str q29, [x23, #0x0]\n"
- "str q30, [x22, #0x0]\n"
- "str q31, [x21, #0x0]\n"
+ "str q24, [x6, #0x0]\n"
+ "add x6, x6, #0x10\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x12, #0x0]\n"
+ "str q27, [x11, #0x0]\n"
+ "str q28, [x10, #0x0]\n"
+ "str q29, [x9, #0x0]\n"
+ "str q30, [x28, #0x0]\n"
+ "str q31, [x27, #0x0]\n"
"168:" // Height 8: Writeback done
- "subs x17, x17, #0x4\n"
+ "subs x4, x4, #0x4\n"
"bgt 149b\n"
"subs %x[M], %x[M], #0x8\n"
"beq 170f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 169f\n"
- "add x20, x20, #0x8\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x27, x27, #0x8\n"
+ "str x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"169:" // Update direct input
- "mov x19, #0x20\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x26, #0x20\n"
+ "madd %x[input_ptr], x26, x27, %x[input_ptr]\n"
"b 1b\n"
"170:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x26", "x27", "x28"
);
}
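
For orientation before the next file: the micro-kernel regenerated in these hunks accumulates up to eight output rows in v24..v31 against a 4-wide column block of the reordered B panel held in q8..q11, broadcasting one A element per fmla via the lane forms (v0.s[0]..v0.s[3]), with prfm prefetches and pointer bumps interleaved between the multiply chains. The following is a minimal NEON-intrinsics sketch of that inner-loop pattern only; the function name, argument layout, and panel format are illustrative assumptions, not the library's actual interface.

// Hypothetical sketch of the 8x4 FP32 MLA inner loop shown in the assembly
// above: eight row accumulators, four B vectors per K-block, lane-broadcast
// multiply-accumulate. Not the library's real kernel signature.
#include <arm_neon.h>

void micro_kernel_8x4_sketch(const float *a[8],   // one read pointer per row of A
                             const float *b,      // reordered B panel, 16 floats per K-block
                             float32x4_t acc[8],  // C accumulators (v24..v31 in the asm)
                             int k_blocks)        // number of 4-wide K blocks
{
    for (int k = 0; k < k_blocks; ++k)
    {
        // Four consecutive B vectors for this K-block (q8..q11 in the asm).
        float32x4_t b0 = vld1q_f32(b + 0);
        float32x4_t b1 = vld1q_f32(b + 4);
        float32x4_t b2 = vld1q_f32(b + 8);
        float32x4_t b3 = vld1q_f32(b + 12);
        b += 16;

        for (int r = 0; r < 8; ++r)
        {
            // One 4-element chunk of row r of A (v0..v7 in the asm).
            float32x4_t a4 = vld1q_f32(a[r]);
            a[r] += 4;

            // Lane-broadcast FMA, matching "fmla vN.4s, vB.4s, vA.s[n]".
            acc[r] = vfmaq_laneq_f32(acc[r], b0, a4, 0);
            acc[r] = vfmaq_laneq_f32(acc[r], b1, a4, 1);
            acc[r] = vfmaq_laneq_f32(acc[r], b2, a4, 2);
            acc[r] = vfmaq_laneq_f32(acc[r], b3, a4, 3);
        }
    }
}

Keeping all four B vectors live for a whole K-block is what lets the generated code hoist the q8..q11 loads ahead of the fmla chains and slot prefetches and pointer increments between them, which is the scheduling visible in the rewritten hunks above. The optional min/max clamp after the loop corresponds to the fmin/fmax pair loaded from offset_min/offset_max in the asm.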
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
index 9bed0213da..004e5d7f23 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_8x4 (
break;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x8\n"
"bge 148f\n"
@@ -105,527 +104,527 @@ void a64_hybrid_fp32_mla_8x4 (
"cmp %x[M], #0x2\n"
"bgt 43f\n"
"beq 22f\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x11, %x[bias]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "cbz x11, 3f\n"
- "ldr q24, [x11, #0x0]\n"
- "add x11, x11, #0x10\n"
+ "cbz x14, 3f\n"
+ "ldr q24, [x14, #0x0]\n"
+ "add x14, x14, #0x10\n"
"b 8f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 7f\n"
"cmp x13, #0x4\n"
"bge 6f\n"
"tbz x13, #1, 4f\n"
- "ldr d24, [x10], #0x8\n"
- "mov x19, #0x8\n"
+ "ldr d24, [x11], #0x8\n"
+ "mov x20, #0x8\n"
"tbz x13, #0, 5f\n"
- "ld1 { v24.s }[2], [x10]\n"
+ "ld1 { v24.s }[2], [x11]\n"
"b 5f\n"
"4:" // Height 1: Partial accumulate: partial_1_0
- "ldr s24, [x10, #0x0]\n"
- "mov x19, #0x0\n"
+ "ldr s24, [x11, #0x0]\n"
+ "mov x20, #0x0\n"
"5:" // Height 1: Partial accumulate: Done
- "sub x10, x10, x19\n"
+ "sub x11, x11, x20\n"
"b 8f\n"
"6:" // Height 1: full accumulate
- "ldr q24, [x10, #0x0]\n"
+ "ldr q24, [x11, #0x0]\n"
"b 8f\n"
"7:" // Height 1: no accumulate
"movi v24.16b, #0x0\n"
"8:" // Height 1: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"9:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 10f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "cbnz x9, 11f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "cbnz x10, 11f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
"b 11f\n"
"10:" // Height 1: setup direct input
- "mov x27, %x[input_ptr]\n"
+ "mov x28, %x[input_ptr]\n"
"11:" // Height 1: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"blt 14f\n"
- "ldr q0, [x27, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
"ldr q8, [x12, #0x0]\n"
- "cmp x28, #0x8\n"
+ "cmp x9, #0x8\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"blt 13f\n"
"12:" // Height 1: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "add x27, x27, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "ldr q10, [x12, #0x20]\n"
- "sub x28, x28, #0x4\n"
+ "sub x9, x9, #0x4\n"
+ "add x28, x28, #0x10\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr q11, [x12, #0x30]\n"
- "cmp x28, #0x8\n"
- "fmla v24.4s, v11.4s, v0.s[3]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "cmp x9, #0x8\n"
"add x12, x12, #0x40\n"
- "ldr q0, [x27, #0x0]\n"
"ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"bge 12b\n"
"13:" // Height 1: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "sub x28, x28, #0x4\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "ldr q10, [x12, #0x20]\n"
- "add x27, x27, #0x10\n"
+ "add x28, x28, #0x10\n"
+ "sub x9, x9, #0x4\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x12, x12, #0x40\n"
"14:" // Height 1: Multiply loop: Main loop skip
- "cbz x28, 16f\n"
+ "cbz x9, 16f\n"
"15:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x27], #0x4\n"
- "sub x28, x28, #0x1\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr s17, [x28], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "sub x9, x9, #0x1\n"
+ "fmla v24.4s, v16.4s, v17.s[0]\n"
"add x12, x12, #0x10\n"
- "cbnz x28, 15b\n"
+ "cbnz x9, 15b\n"
"16:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x9, x9, #0x1\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 9b\n"
- "prfm pstl1keep, [x10, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
"tbz %x[flags], #1, 17f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
"17:" // Height 1: No activation
"cmp x13, #0x4\n"
"bge 20f\n"
"tbz x13, #1, 18f\n"
- "str d24, [x10], #0x8\n"
+ "str d24, [x11], #0x8\n"
"tbz x13, #0, 19f\n"
- "st1 { v24.s }[2], [x10]\n"
+ "st1 { v24.s }[2], [x11]\n"
"b 19f\n"
"18:" // Height 1: Partial direct writeback: partial_1_0
- "str s24, [x10, #0x0]\n"
+ "str s24, [x11, #0x0]\n"
"19:" // Height 1: Partial direct writeback: Done
"b 21f\n"
"20:" // Height 1: Full writeback
- "str q24, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
+ "str q24, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
"21:" // Height 1: Writeback done
"subs x13, x13, #0x4\n"
"bgt 2b\n"
"b 170f\n"
"22:" // Height 2
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"23:" // Height 2: Column loop
- "cbz x11, 24f\n"
- "ldr q24, [x11, #0x0]\n"
+ "cbz x14, 24f\n"
+ "ldr q24, [x14, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x11, x11, #0x10\n"
+ "add x14, x14, #0x10\n"
"b 29f\n"
"24:" // Height 2: no bias
"tbz %x[flags], #0, 28f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"cmp x13, #0x4\n"
- "add x26, x10, x19, LSL #2\n"
+ "add x27, x11, x20, LSL #2\n"
"bge 27f\n"
"tbz x13, #1, 25f\n"
- "ldr d24, [x10], #0x8\n"
- "ldr d25, [x26], #0x8\n"
- "mov x19, #0x8\n"
+ "ldr d24, [x11], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
+ "mov x20, #0x8\n"
"tbz x13, #0, 26f\n"
- "ld1 { v24.s }[2], [x10]\n"
- "ld1 { v25.s }[2], [x26]\n"
+ "ld1 { v24.s }[2], [x11]\n"
+ "ld1 { v25.s }[2], [x27]\n"
"b 26f\n"
"25:" // Height 2: Partial accumulate: partial_1_0
- "ldr s24, [x10, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x26, #0x0]\n"
+ "ldr s24, [x11, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
+ "mov x20, #0x0\n"
"26:" // Height 2: Partial accumulate: Done
- "sub x10, x10, x19\n"
+ "sub x11, x11, x20\n"
"b 29f\n"
"27:" // Height 2: full accumulate
- "ldr q24, [x10, #0x0]\n"
- "ldr q25, [x26, #0x0]\n"
+ "ldr q24, [x11, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
"b 29f\n"
"28:" // Height 2: no accumulate
"movi v24.16b, #0x0\n"
"movi v25.16b, #0x0\n"
"29:" // Height 2: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"30:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "cbnz x9, 32f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "cbnz x10, 32f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
"b 32f\n"
"31:" // Height 2: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
"32:" // Height 2: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"blt 35f\n"
- "ldr q0, [x27, #0x0]\n"
- "ldr q1, [x26, #0x0]\n"
- "cmp x28, #0x8\n"
+ "ldr q0, [x28, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
+ "cmp x9, #0x8\n"
"ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"blt 34f\n"
"33:" // Height 2: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
- "add x26, x26, #0x10\n"
+ "sub x9, x9, #0x4\n"
+ "add x28, x28, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "ldr q11, [x12, #0x30]\n"
- "sub x28, x28, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "cmp x28, #0x8\n"
+ "add x27, x27, #0x10\n"
+ "cmp x9, #0x8\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x12, x12, #0x40\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "add x12, x12, #0x40\n"
"ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [x27, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr q1, [x26, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"bge 33b\n"
"34:" // Height 2: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "sub x28, x28, #0x4\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
+ "add x28, x28, #0x10\n"
"add x27, x27, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x26, x26, #0x10\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x12, x12, #0x40\n"
+ "sub x9, x9, #0x4\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"35:" // Height 2: Multiply loop: Main loop skip
- "cbz x28, 37f\n"
+ "cbz x9, 37f\n"
"36:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x27], #0x4\n"
- "sub x28, x28, #0x1\n"
- "ldr s1, [x26], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr s18, [x28], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "sub x9, x9, #0x1\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v18.s[0]\n"
+ "fmla v25.4s, v16.4s, v17.s[0]\n"
"add x12, x12, #0x10\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "cbnz x28, 36b\n"
+ "cbnz x9, 36b\n"
"37:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x9, x9, #0x1\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 30b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x10, #0x0]\n"
- "add x26, x10, x19, LSL #2\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
"tbz %x[flags], #1, 38f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
"38:" // Height 2: No activation
"cmp x13, #0x4\n"
"bge 41f\n"
"tbz x13, #1, 39f\n"
- "str d24, [x10], #0x8\n"
- "str d25, [x26], #0x8\n"
+ "str d24, [x11], #0x8\n"
+ "str d25, [x27], #0x8\n"
"tbz x13, #0, 40f\n"
- "st1 { v24.s }[2], [x10]\n"
- "st1 { v25.s }[2], [x26]\n"
+ "st1 { v24.s }[2], [x11]\n"
+ "st1 { v25.s }[2], [x27]\n"
"b 40f\n"
"39:" // Height 2: Partial direct writeback: partial_1_0
- "str s24, [x10, #0x0]\n"
- "str s25, [x26, #0x0]\n"
+ "str s24, [x11, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
"40:" // Height 2: Partial direct writeback: Done
"b 42f\n"
"41:" // Height 2: Full writeback
- "str q24, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
- "str q25, [x26, #0x0]\n"
+ "str q24, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q25, [x27, #0x0]\n"
"42:" // Height 2: Writeback done
"subs x13, x13, #0x4\n"
"bgt 23b\n"
"b 170f\n"
"43:" // Height 3
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"44:" // Height 3: Column loop
- "cbz x11, 45f\n"
- "ldr q24, [x11, #0x0]\n"
+ "cbz x14, 45f\n"
+ "ldr q24, [x14, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x11, x11, #0x10\n"
"mov v26.16b, v24.16b\n"
+ "add x14, x14, #0x10\n"
"b 50f\n"
"45:" // Height 3: no bias
"tbz %x[flags], #0, 49f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
"cmp x13, #0x4\n"
- "add x26, x10, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
"bge 48f\n"
"tbz x13, #1, 46f\n"
- "ldr d24, [x10], #0x8\n"
- "ldr d25, [x26], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d26, [x25], #0x8\n"
+ "ldr d24, [x11], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d26, [x26], #0x8\n"
"tbz x13, #0, 47f\n"
- "ld1 { v24.s }[2], [x10]\n"
- "ld1 { v25.s }[2], [x26]\n"
- "ld1 { v26.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x11]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
"b 47f\n"
"46:" // Height 3: Partial accumulate: partial_1_0
- "ldr s24, [x10, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x26, #0x0]\n"
- "ldr s26, [x25, #0x0]\n"
+ "ldr s24, [x11, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s26, [x26, #0x0]\n"
"47:" // Height 3: Partial accumulate: Done
- "sub x10, x10, x19\n"
+ "sub x11, x11, x20\n"
"b 50f\n"
"48:" // Height 3: full accumulate
- "ldr q24, [x10, #0x0]\n"
- "ldr q25, [x26, #0x0]\n"
- "ldr q26, [x25, #0x0]\n"
+ "ldr q24, [x11, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
"b 50f\n"
"49:" // Height 3: no accumulate
"movi v24.16b, #0x0\n"
"movi v25.16b, #0x0\n"
"movi v26.16b, #0x0\n"
"50:" // Height 3: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"51:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 52f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "ldr x25, [x20, #0x10]\n"
- "cbnz x9, 53f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x10, 53f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
"b 53f\n"
"52:" // Height 3: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
"53:" // Height 3: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"blt 56f\n"
- "ldr q0, [x27, #0x0]\n"
- "ldr q1, [x26, #0x0]\n"
- "cmp x28, #0x8\n"
- "ldr q2, [x25, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
+ "cmp x9, #0x8\n"
+ "ldr q2, [x26, #0x0]\n"
"ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"blt 55f\n"
"54:" // Height 3: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
- "add x26, x26, #0x10\n"
+ "sub x9, x9, #0x4\n"
+ "add x28, x28, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x25, x25, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "sub x28, x28, #0x4\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "cmp x28, #0x8\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "cmp x9, #0x8\n"
"add x12, x12, #0x40\n"
- "fmla v24.4s, v10.4s, v0.s[2]\n"
"ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr q10, [x12, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [x27, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr q1, [x26, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr q2, [x25, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 54b\n"
"55:" // Height 3: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "sub x28, x28, #0x4\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
+ "add x28, x28, #0x10\n"
"add x27, x27, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x26, x26, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "sub x9, x9, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x12, x12, #0x40\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
"56:" // Height 3: Multiply loop: Main loop skip
- "cbz x28, 58f\n"
+ "cbz x9, 58f\n"
"57:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x27], #0x4\n"
- "sub x28, x28, #0x1\n"
- "ldr s1, [x26], #0x4\n"
- "ldr s2, [x25], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr s19, [x28], #0x4\n"
+ "ldr s18, [x27], #0x4\n"
+ "sub x9, x9, #0x1\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v19.s[0]\n"
+ "fmla v25.4s, v16.4s, v18.s[0]\n"
+ "fmla v26.4s, v16.4s, v17.s[0]\n"
"add x12, x12, #0x10\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "cbnz x28, 57b\n"
+ "cbnz x9, 57b\n"
"58:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x9, x9, #0x1\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 51b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x10, #0x0]\n"
- "add x26, x10, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
"prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x19, LSL #2\n"
- "prfm pstl1keep, [x25, #0x0]\n"
"tbz %x[flags], #1, 59f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
"59:" // Height 3: No activation
"cmp x13, #0x4\n"
"bge 62f\n"
"tbz x13, #1, 60f\n"
- "str d24, [x10], #0x8\n"
- "str d25, [x26], #0x8\n"
- "str d26, [x25], #0x8\n"
+ "str d24, [x11], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
"tbz x13, #0, 61f\n"
- "st1 { v24.s }[2], [x10]\n"
- "st1 { v25.s }[2], [x26]\n"
- "st1 { v26.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x11]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
"b 61f\n"
"60:" // Height 3: Partial direct writeback: partial_1_0
- "str s24, [x10, #0x0]\n"
- "str s25, [x26, #0x0]\n"
- "str s26, [x25, #0x0]\n"
+ "str s24, [x11, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
"61:" // Height 3: Partial direct writeback: Done
"b 63f\n"
"62:" // Height 3: Full writeback
- "str q24, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
- "str q25, [x26, #0x0]\n"
- "str q26, [x25, #0x0]\n"
+ "str q24, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
"63:" // Height 3: Writeback done
"subs x13, x13, #0x4\n"
"bgt 44b\n"
"b 170f\n"
"64:" // Height 4
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"65:" // Height 4: Column loop
- "cbz x11, 66f\n"
- "ldr q24, [x11, #0x0]\n"
+ "cbz x14, 66f\n"
+ "ldr q24, [x14, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x11, x11, #0x10\n"
"mov v26.16b, v24.16b\n"
+ "add x14, x14, #0x10\n"
"mov v27.16b, v24.16b\n"
"b 71f\n"
"66:" // Height 4: no bias
"tbz %x[flags], #0, 70f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
"cmp x13, #0x4\n"
- "add x26, x10, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
"bge 69f\n"
"tbz x13, #1, 67f\n"
- "ldr d24, [x10], #0x8\n"
- "ldr d25, [x26], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d26, [x25], #0x8\n"
- "ldr d27, [x24], #0x8\n"
+ "ldr d24, [x11], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
"tbz x13, #0, 68f\n"
- "ld1 { v24.s }[2], [x10]\n"
- "ld1 { v25.s }[2], [x26]\n"
- "ld1 { v26.s }[2], [x25]\n"
- "ld1 { v27.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x11]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
"b 68f\n"
"67:" // Height 4: Partial accumulate: partial_1_0
- "ldr s24, [x10, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x26, #0x0]\n"
- "ldr s26, [x25, #0x0]\n"
- "ldr s27, [x24, #0x0]\n"
+ "ldr s24, [x11, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
"68:" // Height 4: Partial accumulate: Done
- "sub x10, x10, x19\n"
+ "sub x11, x11, x20\n"
"b 71f\n"
"69:" // Height 4: full accumulate
- "ldr q24, [x10, #0x0]\n"
- "ldr q25, [x26, #0x0]\n"
- "ldr q26, [x25, #0x0]\n"
- "ldr q27, [x24, #0x0]\n"
+ "ldr q24, [x11, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
"b 71f\n"
"70:" // Height 4: no accumulate
"movi v24.16b, #0x0\n"
@@ -633,101 +632,101 @@ void a64_hybrid_fp32_mla_8x4 (
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
"71:" // Height 4: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"72:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 73f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "ldr x25, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x9, 74f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "cbnz x10, 74f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
"b 74f\n"
"73:" // Height 4: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
"74:" // Height 4: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"blt 77f\n"
- "ldr q0, [x27, #0x0]\n"
- "ldr q1, [x26, #0x0]\n"
- "cmp x28, #0x8\n"
- "ldr q2, [x25, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
+ "cmp x9, #0x8\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
"ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"blt 76f\n"
"75:" // Height 4: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
- "add x26, x26, #0x10\n"
+ "sub x9, x9, #0x4\n"
+ "add x28, x28, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x25, x25, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x28, x28, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "cmp x28, #0x8\n"
+ "add x25, x25, #0x10\n"
+ "cmp x9, #0x8\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x12, x12, #0x40\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
+ "add x12, x12, #0x40\n"
"ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [x27, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr q1, [x26, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr q2, [x25, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "ldr q3, [x24, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"bge 75b\n"
"76:" // Height 4: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "sub x28, x28, #0x4\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
+ "add x28, x28, #0x10\n"
"add x27, x27, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x26, x26, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x12, x12, #0x40\n"
+ "sub x9, x9, #0x4\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
@@ -735,130 +734,130 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v26.4s, v11.4s, v2.s[3]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
"77:" // Height 4: Multiply loop: Main loop skip
- "cbz x28, 79f\n"
+ "cbz x9, 79f\n"
"78:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x27], #0x4\n"
- "sub x28, x28, #0x1\n"
- "ldr s1, [x26], #0x4\n"
- "ldr s2, [x25], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr s20, [x28], #0x4\n"
+ "ldr s19, [x27], #0x4\n"
+ "sub x9, x9, #0x1\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr s17, [x25], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v20.s[0]\n"
+ "fmla v25.4s, v16.4s, v19.s[0]\n"
"add x12, x12, #0x10\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "cbnz x28, 78b\n"
+ "fmla v26.4s, v16.4s, v18.s[0]\n"
+ "fmla v27.4s, v16.4s, v17.s[0]\n"
+ "cbnz x9, 78b\n"
"79:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x9, x9, #0x1\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 72b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x10, #0x0]\n"
- "add x26, x10, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "add x25, x26, x20, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
"prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x19, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #2\n"
- "prfm pstl1keep, [x24, #0x0]\n"
"tbz %x[flags], #1, 80f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
"80:" // Height 4: No activation
"cmp x13, #0x4\n"
"bge 83f\n"
"tbz x13, #1, 81f\n"
- "str d24, [x10], #0x8\n"
- "str d25, [x26], #0x8\n"
- "str d26, [x25], #0x8\n"
- "str d27, [x24], #0x8\n"
+ "str d24, [x11], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
"tbz x13, #0, 82f\n"
- "st1 { v24.s }[2], [x10]\n"
- "st1 { v25.s }[2], [x26]\n"
- "st1 { v26.s }[2], [x25]\n"
- "st1 { v27.s }[2], [x24]\n"
+ "st1 { v24.s }[2], [x11]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
"b 82f\n"
"81:" // Height 4: Partial direct writeback: partial_1_0
- "str s24, [x10, #0x0]\n"
- "str s25, [x26, #0x0]\n"
- "str s26, [x25, #0x0]\n"
- "str s27, [x24, #0x0]\n"
+ "str s24, [x11, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
"82:" // Height 4: Partial direct writeback: Done
"b 84f\n"
"83:" // Height 4: Full writeback
- "str q24, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
- "str q25, [x26, #0x0]\n"
- "str q26, [x25, #0x0]\n"
- "str q27, [x24, #0x0]\n"
+ "str q24, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
"84:" // Height 4: Writeback done
"subs x13, x13, #0x4\n"
"bgt 65b\n"
"b 170f\n"
"85:" // Height 5
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"86:" // Height 5: Column loop
- "cbz x11, 87f\n"
- "ldr q24, [x11, #0x0]\n"
+ "cbz x14, 87f\n"
+ "ldr q24, [x14, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x11, x11, #0x10\n"
"mov v26.16b, v24.16b\n"
+ "add x14, x14, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"b 92f\n"
"87:" // Height 5: no bias
"tbz %x[flags], #0, 91f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
"cmp x13, #0x4\n"
- "add x26, x10, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"bge 90f\n"
"tbz x13, #1, 88f\n"
- "ldr d24, [x10], #0x8\n"
- "ldr d25, [x26], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d26, [x25], #0x8\n"
- "ldr d27, [x24], #0x8\n"
- "ldr d28, [x23], #0x8\n"
+ "ldr d24, [x11], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x24], #0x8\n"
"tbz x13, #0, 89f\n"
- "ld1 { v24.s }[2], [x10]\n"
- "ld1 { v25.s }[2], [x26]\n"
- "ld1 { v26.s }[2], [x25]\n"
- "ld1 { v27.s }[2], [x24]\n"
- "ld1 { v28.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x11]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
+ "ld1 { v28.s }[2], [x24]\n"
"b 89f\n"
"88:" // Height 5: Partial accumulate: partial_1_0
- "ldr s24, [x10, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x26, #0x0]\n"
- "ldr s26, [x25, #0x0]\n"
- "ldr s27, [x24, #0x0]\n"
- "ldr s28, [x23, #0x0]\n"
+ "ldr s24, [x11, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
+ "ldr s28, [x24, #0x0]\n"
"89:" // Height 5: Partial accumulate: Done
- "sub x10, x10, x19\n"
+ "sub x11, x11, x20\n"
"b 92f\n"
"90:" // Height 5: full accumulate
- "ldr q24, [x10, #0x0]\n"
- "ldr q25, [x26, #0x0]\n"
- "ldr q26, [x25, #0x0]\n"
- "ldr q27, [x24, #0x0]\n"
- "ldr q28, [x23, #0x0]\n"
+ "ldr q24, [x11, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q28, [x24, #0x0]\n"
"b 92f\n"
"91:" // Height 5: no accumulate
"movi v24.16b, #0x0\n"
@@ -867,116 +866,116 @@ void a64_hybrid_fp32_mla_8x4 (
"movi v27.16b, #0x0\n"
"movi v28.16b, #0x0\n"
"92:" // Height 5: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"93:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 94f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "ldr x25, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x23, [x20, #0x20]\n"
- "cbnz x9, 95f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "cbnz x10, 95f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
"b 95f\n"
"94:" // Height 5: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"95:" // Height 5: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"blt 98f\n"
- "ldr q0, [x27, #0x0]\n"
- "ldr q1, [x26, #0x0]\n"
- "cmp x28, #0x8\n"
- "ldr q2, [x25, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
+ "cmp x9, #0x8\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "ldr q4, [x24, #0x0]\n"
"ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"blt 97f\n"
"96:" // Height 5: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
- "add x26, x26, #0x10\n"
+ "sub x9, x9, #0x4\n"
+ "add x28, x28, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x25, x25, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x23, x23, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x28, x28, #0x4\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x28, #0x8\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "cmp x9, #0x8\n"
"add x12, x12, #0x40\n"
- "fmla v27.4s, v9.4s, v3.s[1]\n"
"ldr q8, [x12, #0x0]\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
+ "ldr q9, [x12, #0x10]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
+ "ldr q10, [x12, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [x27, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr q1, [x26, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr q2, [x25, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "ldr q3, [x24, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
- "ldr q4, [x23, #0x0]\n"
+ "ldr q4, [x24, #0x0]\n"
+ "ldr q11, [x12, #0x30]\n"
"bge 96b\n"
"97:" // Height 5: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "sub x28, x28, #0x4\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
+ "add x28, x28, #0x10\n"
"add x27, x27, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x26, x26, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "sub x9, x9, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x12, x12, #0x40\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
@@ -986,146 +985,146 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v27.4s, v11.4s, v3.s[3]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
"98:" // Height 5: Multiply loop: Main loop skip
- "cbz x28, 100f\n"
+ "cbz x9, 100f\n"
"99:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x27], #0x4\n"
- "sub x28, x28, #0x1\n"
- "ldr s1, [x26], #0x4\n"
- "ldr s2, [x25], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr s21, [x28], #0x4\n"
+ "ldr s20, [x27], #0x4\n"
+ "sub x9, x9, #0x1\n"
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v21.s[0]\n"
+ "fmla v25.4s, v16.4s, v20.s[0]\n"
+ "fmla v26.4s, v16.4s, v19.s[0]\n"
+ "fmla v27.4s, v16.4s, v18.s[0]\n"
"add x12, x12, #0x10\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "cbnz x28, 99b\n"
+ "fmla v28.4s, v16.4s, v17.s[0]\n"
+ "cbnz x9, 99b\n"
"100:" // Height 5: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x9, x9, #0x1\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 93b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x10, #0x0]\n"
- "add x26, x10, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
"prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x19, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
"tbz %x[flags], #1, 101f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v16.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v16.4s\n"
"101:" // Height 5: No activation
"cmp x13, #0x4\n"
"bge 104f\n"
"tbz x13, #1, 102f\n"
- "str d24, [x10], #0x8\n"
- "str d25, [x26], #0x8\n"
- "str d26, [x25], #0x8\n"
- "str d27, [x24], #0x8\n"
- "str d28, [x23], #0x8\n"
+ "str d24, [x11], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
+ "str d28, [x24], #0x8\n"
"tbz x13, #0, 103f\n"
- "st1 { v24.s }[2], [x10]\n"
- "st1 { v25.s }[2], [x26]\n"
- "st1 { v26.s }[2], [x25]\n"
- "st1 { v27.s }[2], [x24]\n"
- "st1 { v28.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x11]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
+ "st1 { v28.s }[2], [x24]\n"
"b 103f\n"
"102:" // Height 5: Partial direct writeback: partial_1_0
- "str s24, [x10, #0x0]\n"
- "str s25, [x26, #0x0]\n"
- "str s26, [x25, #0x0]\n"
- "str s27, [x24, #0x0]\n"
- "str s28, [x23, #0x0]\n"
+ "str s24, [x11, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s28, [x24, #0x0]\n"
"103:" // Height 5: Partial direct writeback: Done
"b 105f\n"
"104:" // Height 5: Full writeback
- "str q24, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
- "str q25, [x26, #0x0]\n"
- "str q26, [x25, #0x0]\n"
- "str q27, [x24, #0x0]\n"
- "str q28, [x23, #0x0]\n"
+ "str q24, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
+ "str q28, [x24, #0x0]\n"
"105:" // Height 5: Writeback done
"subs x13, x13, #0x4\n"
"bgt 86b\n"
"b 170f\n"
"106:" // Height 6
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"107:" // Height 6: Column loop
- "cbz x11, 108f\n"
- "ldr q24, [x11, #0x0]\n"
+ "cbz x14, 108f\n"
+ "ldr q24, [x14, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x11, x11, #0x10\n"
"mov v26.16b, v24.16b\n"
+ "add x14, x14, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
"b 113f\n"
"108:" // Height 6: no bias
"tbz %x[flags], #0, 112f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"cmp x13, #0x4\n"
- "add x26, x10, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 111f\n"
"tbz x13, #1, 109f\n"
- "ldr d24, [x10], #0x8\n"
- "ldr d25, [x26], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d26, [x25], #0x8\n"
- "ldr d27, [x24], #0x8\n"
- "ldr d28, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
+ "ldr d24, [x11], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x24], #0x8\n"
+ "ldr d29, [x23], #0x8\n"
"tbz x13, #0, 110f\n"
- "ld1 { v24.s }[2], [x10]\n"
- "ld1 { v25.s }[2], [x26]\n"
- "ld1 { v26.s }[2], [x25]\n"
- "ld1 { v27.s }[2], [x24]\n"
- "ld1 { v28.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x11]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
+ "ld1 { v28.s }[2], [x24]\n"
+ "ld1 { v29.s }[2], [x23]\n"
"b 110f\n"
"109:" // Height 6: Partial accumulate: partial_1_0
- "ldr s24, [x10, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x26, #0x0]\n"
- "ldr s26, [x25, #0x0]\n"
- "ldr s27, [x24, #0x0]\n"
- "ldr s28, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
+ "ldr s24, [x11, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
+ "ldr s28, [x24, #0x0]\n"
+ "ldr s29, [x23, #0x0]\n"
"110:" // Height 6: Partial accumulate: Done
- "sub x10, x10, x19\n"
+ "sub x11, x11, x20\n"
"b 113f\n"
"111:" // Height 6: full accumulate
- "ldr q24, [x10, #0x0]\n"
- "ldr q25, [x26, #0x0]\n"
- "ldr q26, [x25, #0x0]\n"
- "ldr q27, [x24, #0x0]\n"
- "ldr q28, [x23, #0x0]\n"
- "ldr q29, [x22, #0x0]\n"
+ "ldr q24, [x11, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q28, [x24, #0x0]\n"
+ "ldr q29, [x23, #0x0]\n"
"b 113f\n"
"112:" // Height 6: no accumulate
"movi v24.16b, #0x0\n"
@@ -1135,131 +1134,131 @@ void a64_hybrid_fp32_mla_8x4 (
"movi v28.16b, #0x0\n"
"movi v29.16b, #0x0\n"
"113:" // Height 6: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"114:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 115f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "ldr x25, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x23, [x20, #0x20]\n"
- "ldr x22, [x20, #0x28]\n"
- "cbnz x9, 116f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "cbnz x10, 116f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
"b 116f\n"
"115:" // Height 6: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"116:" // Height 6: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"blt 119f\n"
- "ldr q0, [x27, #0x0]\n"
- "ldr q1, [x26, #0x0]\n"
- "cmp x28, #0x8\n"
- "ldr q2, [x25, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
- "ldr q5, [x22, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
+ "cmp x9, #0x8\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "ldr q4, [x24, #0x0]\n"
+ "ldr q5, [x23, #0x0]\n"
"ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"blt 118f\n"
"117:" // Height 6: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
- "add x26, x26, #0x10\n"
+ "sub x9, x9, #0x4\n"
+ "add x28, x28, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x25, x25, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x23, x23, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x28, x28, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "cmp x28, #0x8\n"
+ "add x23, x23, #0x10\n"
+ "cmp x9, #0x8\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x12, x12, #0x40\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
+ "add x12, x12, #0x40\n"
"ldr q8, [x12, #0x0]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [x27, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr q1, [x26, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr q2, [x25, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "ldr q3, [x24, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
- "ldr q4, [x23, #0x0]\n"
+ "ldr q4, [x24, #0x0]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
- "ldr q5, [x22, #0x0]\n"
+ "ldr q5, [x23, #0x0]\n"
+ "ldr q11, [x12, #0x30]\n"
"bge 117b\n"
"118:" // Height 6: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "sub x28, x28, #0x4\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
+ "add x28, x28, #0x10\n"
"add x27, x27, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x26, x26, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x22, x22, #0x10\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x12, x12, #0x40\n"
+ "sub x9, x9, #0x4\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
@@ -1271,108 +1270,108 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v28.4s, v11.4s, v4.s[3]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
"119:" // Height 6: Multiply loop: Main loop skip
- "cbz x28, 121f\n"
+ "cbz x9, 121f\n"
"120:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x27], #0x4\n"
- "sub x28, x28, #0x1\n"
- "ldr s1, [x26], #0x4\n"
- "ldr s2, [x25], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s5, [x22], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr s22, [x28], #0x4\n"
+ "ldr s21, [x27], #0x4\n"
+ "sub x9, x9, #0x1\n"
+ "ldr s20, [x26], #0x4\n"
+ "ldr s19, [x25], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s17, [x23], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v22.s[0]\n"
+ "fmla v25.4s, v16.4s, v21.s[0]\n"
"add x12, x12, #0x10\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "fmla v29.4s, v12.4s, v5.s[0]\n"
- "cbnz x28, 120b\n"
+ "fmla v26.4s, v16.4s, v20.s[0]\n"
+ "fmla v27.4s, v16.4s, v19.s[0]\n"
+ "fmla v28.4s, v16.4s, v18.s[0]\n"
+ "fmla v29.4s, v16.4s, v17.s[0]\n"
+ "cbnz x9, 120b\n"
"121:" // Height 6: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x9, x9, #0x1\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 114b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x10, #0x0]\n"
- "add x26, x10, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
"prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x19, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"tbz %x[flags], #1, 122f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v16.4s\n"
- "fmin v29.4s, v29.4s, v16.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v16.4s\n"
"122:" // Height 6: No activation
"cmp x13, #0x4\n"
"bge 125f\n"
"tbz x13, #1, 123f\n"
- "str d24, [x10], #0x8\n"
- "str d25, [x26], #0x8\n"
- "str d26, [x25], #0x8\n"
- "str d27, [x24], #0x8\n"
- "str d28, [x23], #0x8\n"
- "str d29, [x22], #0x8\n"
+ "str d24, [x11], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
+ "str d28, [x24], #0x8\n"
+ "str d29, [x23], #0x8\n"
"tbz x13, #0, 124f\n"
- "st1 { v24.s }[2], [x10]\n"
- "st1 { v25.s }[2], [x26]\n"
- "st1 { v26.s }[2], [x25]\n"
- "st1 { v27.s }[2], [x24]\n"
- "st1 { v28.s }[2], [x23]\n"
- "st1 { v29.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x11]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
+ "st1 { v28.s }[2], [x24]\n"
+ "st1 { v29.s }[2], [x23]\n"
"b 124f\n"
"123:" // Height 6: Partial direct writeback: partial_1_0
- "str s24, [x10, #0x0]\n"
- "str s25, [x26, #0x0]\n"
- "str s26, [x25, #0x0]\n"
- "str s27, [x24, #0x0]\n"
- "str s28, [x23, #0x0]\n"
- "str s29, [x22, #0x0]\n"
+ "str s24, [x11, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s28, [x24, #0x0]\n"
+ "str s29, [x23, #0x0]\n"
"124:" // Height 6: Partial direct writeback: Done
"b 126f\n"
"125:" // Height 6: Full writeback
- "str q24, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
- "str q25, [x26, #0x0]\n"
- "str q26, [x25, #0x0]\n"
- "str q27, [x24, #0x0]\n"
- "str q28, [x23, #0x0]\n"
- "str q29, [x22, #0x0]\n"
+ "str q24, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
+ "str q28, [x24, #0x0]\n"
+ "str q29, [x23, #0x0]\n"
"126:" // Height 6: Writeback done
"subs x13, x13, #0x4\n"
"bgt 107b\n"
"b 170f\n"
"127:" // Height 7
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"128:" // Height 7: Column loop
- "cbz x11, 129f\n"
- "ldr q24, [x11, #0x0]\n"
+ "cbz x14, 129f\n"
+ "ldr q24, [x14, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x11, x11, #0x10\n"
"mov v26.16b, v24.16b\n"
+ "add x14, x14, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
@@ -1380,53 +1379,53 @@ void a64_hybrid_fp32_mla_8x4 (
"b 134f\n"
"129:" // Height 7: no bias
"tbz %x[flags], #0, 133f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"cmp x13, #0x4\n"
- "add x26, x10, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 132f\n"
"tbz x13, #1, 130f\n"
- "ldr d24, [x10], #0x8\n"
- "ldr d25, [x26], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d26, [x25], #0x8\n"
- "ldr d27, [x24], #0x8\n"
- "ldr d28, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
- "ldr d30, [x21], #0x8\n"
+ "ldr d24, [x11], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x24], #0x8\n"
+ "ldr d29, [x23], #0x8\n"
+ "ldr d30, [x22], #0x8\n"
"tbz x13, #0, 131f\n"
- "ld1 { v24.s }[2], [x10]\n"
- "ld1 { v25.s }[2], [x26]\n"
- "ld1 { v26.s }[2], [x25]\n"
- "ld1 { v27.s }[2], [x24]\n"
- "ld1 { v28.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
- "ld1 { v30.s }[2], [x21]\n"
+ "ld1 { v24.s }[2], [x11]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
+ "ld1 { v28.s }[2], [x24]\n"
+ "ld1 { v29.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x22]\n"
"b 131f\n"
"130:" // Height 7: Partial accumulate: partial_1_0
- "ldr s24, [x10, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x26, #0x0]\n"
- "ldr s26, [x25, #0x0]\n"
- "ldr s27, [x24, #0x0]\n"
- "ldr s28, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
- "ldr s30, [x21, #0x0]\n"
+ "ldr s24, [x11, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
+ "ldr s28, [x24, #0x0]\n"
+ "ldr s29, [x23, #0x0]\n"
+ "ldr s30, [x22, #0x0]\n"
"131:" // Height 7: Partial accumulate: Done
- "sub x10, x10, x19\n"
+ "sub x11, x11, x20\n"
"b 134f\n"
"132:" // Height 7: full accumulate
- "ldr q24, [x10, #0x0]\n"
- "ldr q25, [x26, #0x0]\n"
- "ldr q26, [x25, #0x0]\n"
- "ldr q27, [x24, #0x0]\n"
- "ldr q28, [x23, #0x0]\n"
- "ldr q29, [x22, #0x0]\n"
- "ldr q30, [x21, #0x0]\n"
+ "ldr q24, [x11, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q28, [x24, #0x0]\n"
+ "ldr q29, [x23, #0x0]\n"
+ "ldr q30, [x22, #0x0]\n"
"b 134f\n"
"133:" // Height 7: no accumulate
"movi v24.16b, #0x0\n"
@@ -1437,146 +1436,146 @@ void a64_hybrid_fp32_mla_8x4 (
"movi v29.16b, #0x0\n"
"movi v30.16b, #0x0\n"
"134:" // Height 7: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"135:" // Height 7: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 136f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "ldr x25, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x23, [x20, #0x20]\n"
- "ldr x22, [x20, #0x28]\n"
- "ldr x21, [x20, #0x30]\n"
- "cbnz x9, 137f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "cbnz x10, 137f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
"b 137f\n"
"136:" // Height 7: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"137:" // Height 7: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"blt 140f\n"
- "ldr q0, [x27, #0x0]\n"
- "ldr q1, [x26, #0x0]\n"
- "cmp x28, #0x8\n"
- "ldr q2, [x25, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
- "ldr q5, [x22, #0x0]\n"
- "ldr q6, [x21, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
+ "cmp x9, #0x8\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "ldr q4, [x24, #0x0]\n"
+ "ldr q5, [x23, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
"ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"blt 139f\n"
"138:" // Height 7: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
- "add x26, x26, #0x10\n"
+ "sub x9, x9, #0x4\n"
+ "add x28, x28, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x25, x25, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x23, x23, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x21, x21, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x28, x28, #0x4\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x28, #0x8\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "cmp x9, #0x8\n"
"add x12, x12, #0x40\n"
- "fmla v27.4s, v9.4s, v3.s[1]\n"
"ldr q8, [x12, #0x0]\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla v30.4s, v10.4s, v6.s[2]\n"
+ "ldr q10, [x12, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [x27, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr q1, [x26, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr q2, [x25, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "ldr q3, [x24, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
- "ldr q4, [x23, #0x0]\n"
+ "ldr q4, [x24, #0x0]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
- "ldr q5, [x22, #0x0]\n"
+ "ldr q5, [x23, #0x0]\n"
"fmla v30.4s, v11.4s, v6.s[3]\n"
- "ldr q6, [x21, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ "ldr q11, [x12, #0x30]\n"
"bge 138b\n"
"139:" // Height 7: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "sub x28, x28, #0x4\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
+ "add x28, x28, #0x10\n"
"add x27, x27, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x26, x26, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x22, x22, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "sub x9, x9, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x12, x12, #0x40\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
@@ -1590,121 +1589,121 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v29.4s, v11.4s, v5.s[3]\n"
"fmla v30.4s, v11.4s, v6.s[3]\n"
"140:" // Height 7: Multiply loop: Main loop skip
- "cbz x28, 142f\n"
+ "cbz x9, 142f\n"
"141:" // Height 7: Multiply loop: Odd block loop
- "ldr s0, [x27], #0x4\n"
- "sub x28, x28, #0x1\n"
- "ldr s1, [x26], #0x4\n"
- "ldr s2, [x25], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s5, [x22], #0x4\n"
- "ldr s6, [x21], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr s23, [x28], #0x4\n"
+ "ldr s22, [x27], #0x4\n"
+ "sub x9, x9, #0x1\n"
+ "ldr s21, [x26], #0x4\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v23.s[0]\n"
+ "fmla v25.4s, v16.4s, v22.s[0]\n"
+ "fmla v26.4s, v16.4s, v21.s[0]\n"
+ "fmla v27.4s, v16.4s, v20.s[0]\n"
"add x12, x12, #0x10\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "fmla v29.4s, v12.4s, v5.s[0]\n"
- "fmla v30.4s, v12.4s, v6.s[0]\n"
- "cbnz x28, 141b\n"
+ "fmla v28.4s, v16.4s, v19.s[0]\n"
+ "fmla v29.4s, v16.4s, v18.s[0]\n"
+ "fmla v30.4s, v16.4s, v17.s[0]\n"
+ "cbnz x9, 141b\n"
"142:" // Height 7: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x9, x9, #0x1\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 135b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x10, #0x0]\n"
- "add x26, x10, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
"prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x19, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"tbz %x[flags], #1, 143f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v16.4s\n"
- "fmin v29.4s, v29.4s, v16.4s\n"
- "fmin v30.4s, v30.4s, v16.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
- "fmax v30.4s, v30.4s, v17.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v16.4s\n"
+ "fmax v30.4s, v30.4s, v16.4s\n"
"143:" // Height 7: No activation
"cmp x13, #0x4\n"
"bge 146f\n"
"tbz x13, #1, 144f\n"
- "str d24, [x10], #0x8\n"
- "str d25, [x26], #0x8\n"
- "str d26, [x25], #0x8\n"
- "str d27, [x24], #0x8\n"
- "str d28, [x23], #0x8\n"
- "str d29, [x22], #0x8\n"
- "str d30, [x21], #0x8\n"
+ "str d24, [x11], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
+ "str d28, [x24], #0x8\n"
+ "str d29, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
"tbz x13, #0, 145f\n"
- "st1 { v24.s }[2], [x10]\n"
- "st1 { v25.s }[2], [x26]\n"
- "st1 { v26.s }[2], [x25]\n"
- "st1 { v27.s }[2], [x24]\n"
- "st1 { v28.s }[2], [x23]\n"
- "st1 { v29.s }[2], [x22]\n"
- "st1 { v30.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x11]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
+ "st1 { v28.s }[2], [x24]\n"
+ "st1 { v29.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
"b 145f\n"
"144:" // Height 7: Partial direct writeback: partial_1_0
- "str s24, [x10, #0x0]\n"
- "str s25, [x26, #0x0]\n"
- "str s26, [x25, #0x0]\n"
- "str s27, [x24, #0x0]\n"
- "str s28, [x23, #0x0]\n"
- "str s29, [x22, #0x0]\n"
- "str s30, [x21, #0x0]\n"
+ "str s24, [x11, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s28, [x24, #0x0]\n"
+ "str s29, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
"145:" // Height 7: Partial direct writeback: Done
"b 147f\n"
"146:" // Height 7: Full writeback
- "str q24, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
- "str q25, [x26, #0x0]\n"
- "str q26, [x25, #0x0]\n"
- "str q27, [x24, #0x0]\n"
- "str q28, [x23, #0x0]\n"
- "str q29, [x22, #0x0]\n"
- "str q30, [x21, #0x0]\n"
+ "str q24, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
+ "str q28, [x24, #0x0]\n"
+ "str q29, [x23, #0x0]\n"
+ "str q30, [x22, #0x0]\n"
"147:" // Height 7: Writeback done
"subs x13, x13, #0x4\n"
"bgt 128b\n"
"b 170f\n"
"148:" // Height 8
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x20\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x19, #0x20\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"149:" // Height 8: Column loop
- "cbz x11, 150f\n"
- "ldr q24, [x11, #0x0]\n"
+ "cbz x14, 150f\n"
+ "ldr q24, [x14, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x11, x11, #0x10\n"
"mov v26.16b, v24.16b\n"
+ "add x14, x14, #0x10\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
@@ -1713,58 +1712,58 @@ void a64_hybrid_fp32_mla_8x4 (
"b 155f\n"
"150:" // Height 8: no bias
"tbz %x[flags], #0, 154f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"cmp x13, #0x4\n"
- "add x26, x10, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 153f\n"
"tbz x13, #1, 151f\n"
- "ldr d24, [x10], #0x8\n"
- "ldr d25, [x26], #0x8\n"
- "mov x19, #0x8\n"
- "ldr d26, [x25], #0x8\n"
- "ldr d27, [x24], #0x8\n"
- "ldr d28, [x23], #0x8\n"
- "ldr d29, [x22], #0x8\n"
- "ldr d30, [x21], #0x8\n"
- "ldr d31, [x20], #0x8\n"
+ "ldr d24, [x11], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x24], #0x8\n"
+ "ldr d29, [x23], #0x8\n"
+ "ldr d30, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz x13, #0, 152f\n"
- "ld1 { v24.s }[2], [x10]\n"
- "ld1 { v25.s }[2], [x26]\n"
- "ld1 { v26.s }[2], [x25]\n"
- "ld1 { v27.s }[2], [x24]\n"
- "ld1 { v28.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x22]\n"
- "ld1 { v30.s }[2], [x21]\n"
- "ld1 { v31.s }[2], [x20]\n"
+ "ld1 { v24.s }[2], [x11]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
+ "ld1 { v28.s }[2], [x24]\n"
+ "ld1 { v29.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
"b 152f\n"
"151:" // Height 8: Partial accumulate: partial_1_0
- "ldr s24, [x10, #0x0]\n"
- "mov x19, #0x0\n"
- "ldr s25, [x26, #0x0]\n"
- "ldr s26, [x25, #0x0]\n"
- "ldr s27, [x24, #0x0]\n"
- "ldr s28, [x23, #0x0]\n"
- "ldr s29, [x22, #0x0]\n"
- "ldr s30, [x21, #0x0]\n"
- "ldr s31, [x20, #0x0]\n"
+ "ldr s24, [x11, #0x0]\n"
+ "ldr s25, [x27, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
+ "ldr s28, [x24, #0x0]\n"
+ "ldr s29, [x23, #0x0]\n"
+ "ldr s30, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
"152:" // Height 8: Partial accumulate: Done
- "sub x10, x10, x19\n"
+ "sub x11, x11, x20\n"
"b 155f\n"
"153:" // Height 8: full accumulate
- "ldr q24, [x10, #0x0]\n"
- "ldr q25, [x26, #0x0]\n"
- "ldr q26, [x25, #0x0]\n"
- "ldr q27, [x24, #0x0]\n"
- "ldr q28, [x23, #0x0]\n"
- "ldr q29, [x22, #0x0]\n"
- "ldr q30, [x21, #0x0]\n"
- "ldr q31, [x20, #0x0]\n"
+ "ldr q24, [x11, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q28, [x24, #0x0]\n"
+ "ldr q29, [x23, #0x0]\n"
+ "ldr q30, [x22, #0x0]\n"
+ "ldr q31, [x21, #0x0]\n"
"b 155f\n"
"154:" // Height 8: no accumulate
"movi v24.16b, #0x0\n"
@@ -1776,161 +1775,161 @@ void a64_hybrid_fp32_mla_8x4 (
"movi v30.16b, #0x0\n"
"movi v31.16b, #0x0\n"
"155:" // Height 8: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"156:" // Height 8: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 157f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "ldr x25, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x23, [x20, #0x20]\n"
- "ldr x22, [x20, #0x28]\n"
- "ldr x21, [x20, #0x30]\n"
- "ldr x20, [x20, #0x38]\n"
- "cbnz x9, 158f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "add x20, x20, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x38]\n"
+ "cbnz x10, 158f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
"b 158f\n"
"157:" // Height 8: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"158:" // Height 8: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"blt 161f\n"
- "ldr q0, [x27, #0x0]\n"
- "ldr q1, [x26, #0x0]\n"
- "cmp x28, #0x8\n"
- "ldr q2, [x25, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x23, #0x0]\n"
- "ldr q5, [x22, #0x0]\n"
- "ldr q6, [x21, #0x0]\n"
- "ldr q7, [x20, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
+ "cmp x9, #0x8\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "ldr q4, [x24, #0x0]\n"
+ "ldr q5, [x23, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ "ldr q7, [x21, #0x0]\n"
"ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
"blt 160f\n"
"159:" // Height 8: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
- "add x26, x26, #0x10\n"
+ "sub x9, x9, #0x4\n"
+ "add x28, x28, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x25, x25, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x23, x23, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x21, x21, #0x10\n"
"fmla v31.4s, v8.4s, v7.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x20, x20, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x28, x28, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "cmp x28, #0x8\n"
+ "add x21, x21, #0x10\n"
+ "cmp x9, #0x8\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "add x12, x12, #0x40\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
+ "add x12, x12, #0x40\n"
"ldr q8, [x12, #0x0]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
"fmla v31.4s, v9.4s, v7.s[1]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
"fmla v29.4s, v10.4s, v5.s[2]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla v30.4s, v10.4s, v6.s[2]\n"
"fmla v31.4s, v10.4s, v7.s[2]\n"
+ "ldr q10, [x12, #0x20]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [x27, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "ldr q1, [x26, #0x0]\n"
+ "ldr q1, [x27, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "ldr q2, [x25, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "ldr q3, [x24, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
- "ldr q4, [x23, #0x0]\n"
+ "ldr q4, [x24, #0x0]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
- "ldr q5, [x22, #0x0]\n"
+ "ldr q5, [x23, #0x0]\n"
"fmla v30.4s, v11.4s, v6.s[3]\n"
- "ldr q6, [x21, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
"fmla v31.4s, v11.4s, v7.s[3]\n"
- "ldr q7, [x20, #0x0]\n"
+ "ldr q7, [x21, #0x0]\n"
+ "ldr q11, [x12, #0x30]\n"
"bge 159b\n"
"160:" // Height 8: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x12, #0x10]\n"
- "sub x28, x28, #0x4\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x12, #0x20]\n"
+ "add x28, x28, #0x10\n"
"add x27, x27, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x12, #0x30]\n"
- "add x26, x26, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x22, x22, #0x10\n"
"fmla v31.4s, v8.4s, v7.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
"add x21, x21, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x20, x20, #0x10\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "add x12, x12, #0x40\n"
+ "sub x9, x9, #0x4\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
"fmla v31.4s, v9.4s, v7.s[1]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
"fmla v28.4s, v10.4s, v4.s[2]\n"
@@ -1946,132 +1945,131 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v30.4s, v11.4s, v6.s[3]\n"
"fmla v31.4s, v11.4s, v7.s[3]\n"
"161:" // Height 8: Multiply loop: Main loop skip
- "cbz x28, 163f\n"
+ "cbz x9, 163f\n"
"162:" // Height 8: Multiply loop: Odd block loop
- "ldr s0, [x27], #0x4\n"
- "sub x28, x28, #0x1\n"
- "ldr s1, [x26], #0x4\n"
- "ldr s2, [x25], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s5, [x22], #0x4\n"
- "ldr s6, [x21], #0x4\n"
- "ldr s7, [x20], #0x4\n"
- "ldr q12, [x12, #0x0]\n"
- "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr s0, [x28], #0x4\n"
+ "ldr s23, [x27], #0x4\n"
+ "sub x9, x9, #0x1\n"
+ "ldr s22, [x26], #0x4\n"
+ "ldr s21, [x25], #0x4\n"
+ "ldr s20, [x24], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "ldr s18, [x22], #0x4\n"
+ "ldr s17, [x21], #0x4\n"
+ "ldr q16, [x12, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "fmla v25.4s, v16.4s, v23.s[0]\n"
"add x12, x12, #0x10\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "fmla v26.4s, v12.4s, v2.s[0]\n"
- "fmla v27.4s, v12.4s, v3.s[0]\n"
- "fmla v28.4s, v12.4s, v4.s[0]\n"
- "fmla v29.4s, v12.4s, v5.s[0]\n"
- "fmla v30.4s, v12.4s, v6.s[0]\n"
- "fmla v31.4s, v12.4s, v7.s[0]\n"
- "cbnz x28, 162b\n"
+ "fmla v26.4s, v16.4s, v22.s[0]\n"
+ "fmla v27.4s, v16.4s, v21.s[0]\n"
+ "fmla v28.4s, v16.4s, v20.s[0]\n"
+ "fmla v29.4s, v16.4s, v19.s[0]\n"
+ "fmla v30.4s, v16.4s, v18.s[0]\n"
+ "fmla v31.4s, v16.4s, v17.s[0]\n"
+ "cbnz x9, 162b\n"
"163:" // Height 8: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x9, x9, #0x1\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 156b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x10, #0x0]\n"
- "add x26, x10, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
"prfm pstl1keep, [x26, #0x0]\n"
- "add x25, x26, x19, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "add x24, x25, x19, LSL #2\n"
"prfm pstl1keep, [x24, #0x0]\n"
- "add x23, x24, x19, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19, LSL #2\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"tbz %x[flags], #1, 164f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v16.4s\n"
- "fmin v29.4s, v29.4s, v16.4s\n"
- "fmin v30.4s, v30.4s, v16.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
- "fmax v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v16.4s\n"
- "fmax v31.4s, v31.4s, v17.4s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v24.4s, v24.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v16.4s\n"
+ "fmax v27.4s, v27.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v16.4s\n"
+ "fmax v30.4s, v30.4s, v16.4s\n"
+ "fmax v31.4s, v31.4s, v16.4s\n"
"164:" // Height 8: No activation
"cmp x13, #0x4\n"
"bge 167f\n"
"tbz x13, #1, 165f\n"
- "str d24, [x10], #0x8\n"
- "str d25, [x26], #0x8\n"
- "str d26, [x25], #0x8\n"
- "str d27, [x24], #0x8\n"
- "str d28, [x23], #0x8\n"
- "str d29, [x22], #0x8\n"
- "str d30, [x21], #0x8\n"
- "str d31, [x20], #0x8\n"
+ "str d24, [x11], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
+ "str d28, [x24], #0x8\n"
+ "str d29, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
"tbz x13, #0, 166f\n"
- "st1 { v24.s }[2], [x10]\n"
- "st1 { v25.s }[2], [x26]\n"
- "st1 { v26.s }[2], [x25]\n"
- "st1 { v27.s }[2], [x24]\n"
- "st1 { v28.s }[2], [x23]\n"
- "st1 { v29.s }[2], [x22]\n"
- "st1 { v30.s }[2], [x21]\n"
- "st1 { v31.s }[2], [x20]\n"
+ "st1 { v24.s }[2], [x11]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
+ "st1 { v28.s }[2], [x24]\n"
+ "st1 { v29.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
"b 166f\n"
"165:" // Height 8: Partial direct writeback: partial_1_0
- "str s24, [x10, #0x0]\n"
- "str s25, [x26, #0x0]\n"
- "str s26, [x25, #0x0]\n"
- "str s27, [x24, #0x0]\n"
- "str s28, [x23, #0x0]\n"
- "str s29, [x22, #0x0]\n"
- "str s30, [x21, #0x0]\n"
- "str s31, [x20, #0x0]\n"
+ "str s24, [x11, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s28, [x24, #0x0]\n"
+ "str s29, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
"166:" // Height 8: Partial direct writeback: Done
"b 168f\n"
"167:" // Height 8: Full writeback
- "str q24, [x10, #0x0]\n"
- "add x10, x10, #0x10\n"
- "str q25, [x26, #0x0]\n"
- "str q26, [x25, #0x0]\n"
- "str q27, [x24, #0x0]\n"
- "str q28, [x23, #0x0]\n"
- "str q29, [x22, #0x0]\n"
- "str q30, [x21, #0x0]\n"
- "str q31, [x20, #0x0]\n"
+ "str q24, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
+ "str q28, [x24, #0x0]\n"
+ "str q29, [x23, #0x0]\n"
+ "str q30, [x22, #0x0]\n"
+ "str q31, [x21, #0x0]\n"
"168:" // Height 8: Writeback done
"subs x13, x13, #0x4\n"
"bgt 149b\n"
"subs %x[M], %x[M], #0x8\n"
"beq 170f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 169f\n"
- "add x20, x20, #0x8\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x8\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"169:" // Update direct input
- "mov x19, #0x20\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x20\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"170:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
new file mode 100644
index 0000000000..f31dd7afd0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_fp32bf16fp32_mmla_4x24( ARGLIST );
+
+class cls_a64_hybrid_fp32bf16fp32_mmla_4x24
+{
+public:
+ typedef float lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
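+ // Tile shape: each pass writes a 4-row x 24-column block of output and
+ // unrolls K by 4, matching the four bf16 elements along K that a single
+ // bfmmla (2x4 by 4x2 block multiply) consumes.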
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return 24;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 24, 4> transforms = {};
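+ // Rough per-CPU throughput figures (approximately MACs per cycle) used
+ // when choosing between candidate kernels at runtime.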
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 18.9 };
+ case CPUModel::A510:
+ return { 6.81 };
+ case CPUModel::V1:
+ return { 22.33 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_fp32bf16fp32_mmla_4x24;
+ cls_a64_hybrid_fp32bf16fp32_mmla_4x24(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
new file mode 100644
index 0000000000..0e468b196a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
@@ -0,0 +1,2424 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32bf16fp32_mmla_4x24 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
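+ // Argument block read back by the assembly below via offsetof-based
+ // loads from args_ptr.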
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
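+ // Flag bits tested by the assembly (tbz): bit 0 = accumulate, bit 1 =
+ // apply min/max clamp, bit 2 = indirect output, bit 3 = indirect input.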
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
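+ // Dispatch on row count: M >= 4 is handled at label 130, M == 3 at 87,
+ // M == 2 at 44; a single row falls through to the Height 1 code below.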
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 130f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 87f\n"
+ "beq 44f\n"
+ "mov x10, %x[bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x10, 3f\n"
+ "ldr q8, [x10, #0x0]\n"
+ "ldr q9, [x10, #0x10]\n"
+ "zip2 v14.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
+ "zip2 v15.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
+ "zip2 v16.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v17.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x10, x10, #0x60\n"
+ "zip2 v18.2d, v12.2d, v12.2d\n"
+ "zip1 v12.2d, v12.2d, v12.2d\n"
+ "zip2 v19.2d, v13.2d, v13.2d\n"
+ "zip1 v13.2d, v13.2d, v13.2d\n"
+ "b 19f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 18f\n"
+ "cmp x9, #0x18\n"
+ "bge 16f\n"
+ "tbz x9, #4, 7f\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v11.4s }, [x27], #0x10\n"
+ "ld1 { v12.4s }, [x27], #0x10\n"
+ "tbz x9, #2, 5f\n"
+ "ld1 { v13.4s }, [x27], #0x10\n"
+ "tbz x9, #1, 4f\n"
+ "ldr d20, [x27], #0x8\n"
+ "mov x20, #0x58\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v20.s }[2], [x27]\n"
+ "b 15f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s20, [x27, #0x0]\n"
+ "b 15f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_16
+ "tbz x9, #1, 6f\n"
+ "ldr d13, [x27], #0x8\n"
+ "mov x20, #0x48\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v13.s }[2], [x27]\n"
+ "b 15f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s13, [x27, #0x0]\n"
+ "b 15f\n"
+ "7:" // Height 1: Partial accumulate: partial_8_0
+ "tbz x9, #3, 11f\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "tbz x9, #2, 9f\n"
+ "ld1 { v11.4s }, [x27], #0x10\n"
+ "tbz x9, #1, 8f\n"
+ "ldr d12, [x27], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v12.s }[2], [x27]\n"
+ "b 15f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s12, [x27, #0x0]\n"
+ "b 15f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x9, #1, 10f\n"
+ "ldr d11, [x27], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v11.s }[2], [x27]\n"
+ "b 15f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s11, [x27, #0x0]\n"
+ "b 15f\n"
+ "11:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x9, #2, 13f\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "tbz x9, #1, 12f\n"
+ "ldr d10, [x27], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v10.s }[2], [x27]\n"
+ "b 15f\n"
+ "12:" // Height 1: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s10, [x27, #0x0]\n"
+ "b 15f\n"
+ "13:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x9, #1, 14f\n"
+ "ldr d9, [x27], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v9.s }[2], [x27]\n"
+ "b 15f\n"
+ "14:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s9, [x27, #0x0]\n"
+ "mov x20, #0x0\n"
+ "15:" // Height 1: Partial accumulate: Done
+ "sub x27, x27, x20\n"
+ "b 17f\n"
+ "16:" // Height 1: full accumulate
+ "ldr q9, [x27, #0x0]\n"
+ "ldr q10, [x27, #0x10]\n"
+ "ldr q11, [x27, #0x20]\n"
+ "ldr q12, [x27, #0x30]\n"
+ "ldr q13, [x27, #0x40]\n"
+ "ldr q20, [x27, #0x50]\n"
+ "17:" // Height 1: MMLA fixup
+ "zip1 v8.2d, v9.2d, v14.2d\n"
+ "zip2 v14.2d, v9.2d, v14.2d\n"
+ "zip1 v9.2d, v10.2d, v15.2d\n"
+ "zip2 v15.2d, v10.2d, v15.2d\n"
+ "zip1 v10.2d, v11.2d, v16.2d\n"
+ "zip2 v16.2d, v11.2d, v16.2d\n"
+ "zip1 v11.2d, v12.2d, v17.2d\n"
+ "zip2 v17.2d, v12.2d, v17.2d\n"
+ "zip1 v12.2d, v13.2d, v18.2d\n"
+ "zip2 v18.2d, v13.2d, v18.2d\n"
+ "zip1 v13.2d, v20.2d, v19.2d\n"
+ "zip2 v19.2d, v20.2d, v19.2d\n"
+ "b 19f\n"
+ "18:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "19:" // Height 1: setup done
+ "mov x26, #0x0\n"
+ "20:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 21f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "cbnz x26, 22f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "b 22f\n"
+ "21:" // Height 1: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "22:" // Height 1: input setup done
+ "cmp x25, #0x4\n"
+ "blt 25f\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ "ldr q4, [x28, #0x0]\n"
+ "cmp x25, #0x8\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "blt 24f\n"
+ "23:" // Height 1: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q24, [x28, #0x40]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q23, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q22, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ldr q21, [x28, #0x70]\n"
+ ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
+ "ldr q24, [x28, #0x80]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x90]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ "ldr q22, [x28, #0xa0]\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0xb0]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x8\n"
+ ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
+ "add x28, x28, #0xc0\n"
+ "ldr q4, [x28, #0x0]\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ "ldr q5, [x28, #0x10]\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "ldr q7, [x28, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ "bge 23b\n"
+ "24:" // Height 1: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q23, [x28, #0x40]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q25, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q21, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x6e57ec0a // bfmmla v10.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x80]\n"
+ ".inst 0x6e59ec10 // bfmmla v16.4s, v0.8h, v25.8h\n"
+ "ldr q22, [x28, #0x90]\n"
+ ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0xa0]\n"
+ ".inst 0x6e58ec11 // bfmmla v17.4s, v0.8h, v24.8h\n"
+ "ldr q5, [x28, #0xb0]\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x6e57ec0c // bfmmla v12.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e56ec12 // bfmmla v18.4s, v0.8h, v22.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e55ec0d // bfmmla v13.4s, v0.8h, v21.8h\n"
+ ".inst 0x6e45ec13 // bfmmla v19.4s, v0.8h, v5.8h\n"
+ "25:" // Height 1: Multiply loop: Main loop skip
+ "cbz x25, 28f\n"
+ "cbz x25, 28f\n"
+ "tbz x25, #1, 26f\n"
+ "ldr d0, [x24], #0x8\n"
+ "tbz x25, #0, 27f\n"
+ "ld1 { v0.s }[2], [x24]\n"
+ "b 27f\n"
+ "26:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x24, #0x0]\n"
+ "27:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x6e55ec08 // bfmmla v8.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x20]\n"
+ "ldr q22, [x28, #0x30]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e55ec09 // bfmmla v9.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x40]\n"
+ "ldr q23, [x28, #0x50]\n"
+ ".inst 0x6e56ec0f // bfmmla v15.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0a // bfmmla v10.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x60]\n"
+ "ldr q22, [x28, #0x70]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0x80]\n"
+ "ldr q23, [x28, #0x90]\n"
+ ".inst 0x6e56ec11 // bfmmla v17.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0c // bfmmla v12.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x28, #0xa0]\n"
+ "ldr q21, [x28, #0xb0]\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "add x28, x28, #0xc0\n"
+ "28:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 20b\n"
+ "uzp1 v8.2d, v8.2d, v14.2d\n"
+ "uzp1 v9.2d, v9.2d, v15.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "uzp1 v10.2d, v10.2d, v16.2d\n"
+ "uzp1 v11.2d, v11.2d, v17.2d\n"
+ "uzp1 v12.2d, v12.2d, v18.2d\n"
+ "uzp1 v13.2d, v13.2d, v19.2d\n"
+ "tbz %x[flags], #1, 29f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v22.4s\n"
+ "fmin v9.4s, v9.4s, v22.4s\n"
+ "fmin v10.4s, v10.4s, v22.4s\n"
+ "fmin v11.4s, v11.4s, v22.4s\n"
+ "fmin v12.4s, v12.4s, v22.4s\n"
+ "fmin v13.4s, v13.4s, v22.4s\n"
+ "fmax v8.4s, v8.4s, v21.4s\n"
+ "fmax v9.4s, v9.4s, v21.4s\n"
+ "fmax v10.4s, v10.4s, v21.4s\n"
+ "fmax v11.4s, v11.4s, v21.4s\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
+ "29:" // Height 1: No activation
+ "cmp x9, #0x18\n"
+ "bge 42f\n"
+ "tbz x9, #4, 33f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "st1 { v9.4s }, [x27], #0x10\n"
+ "st1 { v10.4s }, [x27], #0x10\n"
+ "st1 { v11.4s }, [x27], #0x10\n"
+ "tbz x9, #2, 31f\n"
+ "st1 { v12.4s }, [x27], #0x10\n"
+ "tbz x9, #1, 30f\n"
+ "str d13, [x27], #0x8\n"
+ "tbz x9, #0, 41f\n"
+ "st1 { v13.s }[2], [x27]\n"
+ "b 41f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 41f\n"
+ "str s13, [x27, #0x0]\n"
+ "b 41f\n"
+ "31:" // Height 1: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 32f\n"
+ "str d12, [x27], #0x8\n"
+ "tbz x9, #0, 41f\n"
+ "st1 { v12.s }[2], [x27]\n"
+ "b 41f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 41f\n"
+ "str s12, [x27, #0x0]\n"
+ "b 41f\n"
+ "33:" // Height 1: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 37f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "st1 { v9.4s }, [x27], #0x10\n"
+ "tbz x9, #2, 35f\n"
+ "st1 { v10.4s }, [x27], #0x10\n"
+ "tbz x9, #1, 34f\n"
+ "str d11, [x27], #0x8\n"
+ "tbz x9, #0, 41f\n"
+ "st1 { v11.s }[2], [x27]\n"
+ "b 41f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 41f\n"
+ "str s11, [x27, #0x0]\n"
+ "b 41f\n"
+ "35:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 36f\n"
+ "str d10, [x27], #0x8\n"
+ "tbz x9, #0, 41f\n"
+ "st1 { v10.s }[2], [x27]\n"
+ "b 41f\n"
+ "36:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 41f\n"
+ "str s10, [x27, #0x0]\n"
+ "b 41f\n"
+ "37:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 39f\n"
+ "st1 { v8.4s }, [x27], #0x10\n"
+ "tbz x9, #1, 38f\n"
+ "str d9, [x27], #0x8\n"
+ "tbz x9, #0, 41f\n"
+ "st1 { v9.s }[2], [x27]\n"
+ "b 41f\n"
+ "38:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 41f\n"
+ "str s9, [x27, #0x0]\n"
+ "b 41f\n"
+ "39:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 40f\n"
+ "str d8, [x27], #0x8\n"
+ "tbz x9, #0, 41f\n"
+ "st1 { v8.s }[2], [x27]\n"
+ "b 41f\n"
+ "40:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x27, #0x0]\n"
+ "41:" // Height 1: Partial direct writeback: Done
+ "b 43f\n"
+ "42:" // Height 1: Full writeback
+ "str q8, [x27, #0x0]\n"
+ "str q9, [x27, #0x10]\n"
+ "str q10, [x27, #0x20]\n"
+ "str q11, [x27, #0x30]\n"
+ "str q12, [x27, #0x40]\n"
+ "str q13, [x27, #0x50]\n"
+ "add x27, x27, #0x60\n"
+ "43:" // Height 1: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 2b\n"
+ "b 174f\n"
+ "44:" // Height 2
+ "mov x10, %x[bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "45:" // Height 2: Column loop
+ "cbz x10, 46f\n"
+ "ldr q8, [x10, #0x0]\n"
+ "ldr q9, [x10, #0x10]\n"
+ "zip2 v14.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
+ "zip2 v15.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
+ "zip2 v16.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v17.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x10, x10, #0x60\n"
+ "zip2 v18.2d, v12.2d, v12.2d\n"
+ "zip1 v12.2d, v12.2d, v12.2d\n"
+ "zip2 v19.2d, v13.2d, v13.2d\n"
+ "zip1 v13.2d, v13.2d, v13.2d\n"
+ "b 62f\n"
+ "46:" // Height 2: no bias
+ "tbz %x[flags], #0, 61f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x9, #0x18\n"
+ "add x23, x27, x20, LSL #2\n"
+ "bge 59f\n"
+ "tbz x9, #4, 50f\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v11.4s }, [x27], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v12.4s }, [x27], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x9, #2, 48f\n"
+ "ld1 { v13.4s }, [x27], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x9, #1, 47f\n"
+ "ldr d20, [x27], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "mov x20, #0x58\n"
+ "tbz x9, #0, 58f\n"
+ "ld1 { v20.s }[2], [x27]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "b 58f\n"
+ "47:" // Height 2: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x9, #0, 58f\n"
+ "ldr s20, [x27, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "b 58f\n"
+ "48:" // Height 2: Partial accumulate: partial_2_16
+ "tbz x9, #1, 49f\n"
+ "ldr d13, [x27], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "mov x20, #0x48\n"
+ "tbz x9, #0, 58f\n"
+ "ld1 { v13.s }[2], [x27]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "b 58f\n"
+ "49:" // Height 2: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x9, #0, 58f\n"
+ "ldr s13, [x27, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "b 58f\n"
+ "50:" // Height 2: Partial accumulate: partial_8_0
+ "tbz x9, #3, 54f\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "tbz x9, #2, 52f\n"
+ "ld1 { v11.4s }, [x27], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "tbz x9, #1, 51f\n"
+ "ldr d12, [x27], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x9, #0, 58f\n"
+ "ld1 { v12.s }[2], [x27]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "b 58f\n"
+ "51:" // Height 2: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x9, #0, 58f\n"
+ "ldr s12, [x27, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "b 58f\n"
+ "52:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x9, #1, 53f\n"
+ "ldr d11, [x27], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x9, #0, 58f\n"
+ "ld1 { v11.s }[2], [x27]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "b 58f\n"
+ "53:" // Height 2: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x9, #0, 58f\n"
+ "ldr s11, [x27, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "b 58f\n"
+ "54:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x9, #2, 56f\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "tbz x9, #1, 55f\n"
+ "ldr d10, [x27], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x9, #0, 58f\n"
+ "ld1 { v10.s }[2], [x27]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "b 58f\n"
+ "55:" // Height 2: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x9, #0, 58f\n"
+ "ldr s10, [x27, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "b 58f\n"
+ "56:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x9, #1, 57f\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x9, #0, 58f\n"
+ "ld1 { v9.s }[2], [x27]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "b 58f\n"
+ "57:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s9, [x27, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "mov x20, #0x0\n"
+ "58:" // Height 2: Partial accumulate: Done
+ "sub x27, x27, x20\n"
+ "b 60f\n"
+ "59:" // Height 2: full accumulate
+ "ldr q9, [x27, #0x0]\n"
+ "ldr q10, [x27, #0x10]\n"
+ "ldr q11, [x27, #0x20]\n"
+ "ldr q12, [x27, #0x30]\n"
+ "ldr q13, [x27, #0x40]\n"
+ "ldr q20, [x27, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "60:" // Height 2: MMLA fixup
+ "zip1 v8.2d, v9.2d, v14.2d\n"
+ "zip2 v14.2d, v9.2d, v14.2d\n"
+ "zip1 v9.2d, v10.2d, v15.2d\n"
+ "zip2 v15.2d, v10.2d, v15.2d\n"
+ "zip1 v10.2d, v11.2d, v16.2d\n"
+ "zip2 v16.2d, v11.2d, v16.2d\n"
+ "zip1 v11.2d, v12.2d, v17.2d\n"
+ "zip2 v17.2d, v12.2d, v17.2d\n"
+ "zip1 v12.2d, v13.2d, v18.2d\n"
+ "zip2 v18.2d, v13.2d, v18.2d\n"
+ "zip1 v13.2d, v20.2d, v19.2d\n"
+ "zip2 v19.2d, v20.2d, v19.2d\n"
+ "b 62f\n"
+ "61:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "62:" // Height 2: setup done
+ "mov x26, #0x0\n"
+ "63:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 64f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "cbnz x26, 65f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "b 65f\n"
+ "64:" // Height 2: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "65:" // Height 2: input setup done
+ "cmp x25, #0x4\n"
+ "blt 68f\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
+ "cmp x25, #0x8\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "blt 67f\n"
+ "66:" // Height 2: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q3, [x28, #0x40]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q23, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q22, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ldr q21, [x28, #0x70]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ "ldr q1, [x28, #0x80]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x90]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ "ldr q22, [x28, #0xa0]\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0xb0]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x8\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e41ec0c // bfmmla v12.4s, v0.8h, v1.8h\n"
+ "ldr q4, [x28, #0x0]\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ "ldr q5, [x28, #0x10]\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "ldr q7, [x28, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
+ "bge 66b\n"
+ "67:" // Height 2: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q24, [x28, #0x40]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q23, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q22, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ldr q21, [x28, #0x70]\n"
+ ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
+ "ldr q24, [x28, #0x80]\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q23, [x28, #0x90]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ "ldr q22, [x28, #0xa0]\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q21, [x28, #0xb0]\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "add x28, x28, #0xc0\n"
+ "68:" // Height 2: Multiply loop: Main loop skip
+ "cbz x25, 71f\n"
+ "cbz x25, 71f\n"
+ "tbz x25, #1, 69f\n"
+ "ldr d0, [x24], #0x8\n"
+ "ldr d1, [x23], #0x8\n"
+ "tbz x25, #0, 70f\n"
+ "ld1 { v0.s }[2], [x24]\n"
+ "ld1 { v1.s }[2], [x23]\n"
+ "b 70f\n"
+ "69:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x24, #0x0]\n"
+ "ldr s1, [x23, #0x0]\n"
+ "70:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q23, [x28, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ldr q22, [x28, #0x20]\n"
+ "ldr q21, [x28, #0x30]\n"
+ ".inst 0x6e58ec08 // bfmmla v8.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec0e // bfmmla v14.4s, v0.8h, v23.8h\n"
+ "ldr q24, [x28, #0x40]\n"
+ "ldr q23, [x28, #0x50]\n"
+ ".inst 0x6e56ec09 // bfmmla v9.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec0f // bfmmla v15.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x28, #0x60]\n"
+ "ldr q21, [x28, #0x70]\n"
+ ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n"
+ "ldr q24, [x28, #0x80]\n"
+ "ldr q23, [x28, #0x90]\n"
+ ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n"
+ "ldr q22, [x28, #0xa0]\n"
+ "ldr q21, [x28, #0xb0]\n"
+ ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n"
+ ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n"
+ ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n"
+ ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n"
+ "add x28, x28, #0xc0\n"
+ "71:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 63b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "uzp1 v4.2d, v8.2d, v14.2d\n"
+ "uzp2 v8.2d, v8.2d, v14.2d\n"
+ "uzp1 v14.2d, v9.2d, v15.2d\n"
+ "uzp2 v9.2d, v9.2d, v15.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v15.2d, v10.2d, v16.2d\n"
+ "uzp2 v10.2d, v10.2d, v16.2d\n"
+ "uzp1 v16.2d, v11.2d, v17.2d\n"
+ "uzp2 v11.2d, v11.2d, v17.2d\n"
+ "uzp1 v17.2d, v12.2d, v18.2d\n"
+ "uzp2 v12.2d, v12.2d, v18.2d\n"
+ "uzp1 v18.2d, v13.2d, v19.2d\n"
+ "uzp2 v13.2d, v13.2d, v19.2d\n"
+ "tbz %x[flags], #1, 72f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "fmin v4.4s, v4.4s, v22.4s\n"
+ "fmin v14.4s, v14.4s, v22.4s\n"
+ "fmin v15.4s, v15.4s, v22.4s\n"
+ "fmin v16.4s, v16.4s, v22.4s\n"
+ "fmin v17.4s, v17.4s, v22.4s\n"
+ "fmin v18.4s, v18.4s, v22.4s\n"
+ "fmin v8.4s, v8.4s, v22.4s\n"
+ "fmin v9.4s, v9.4s, v22.4s\n"
+ "fmin v10.4s, v10.4s, v22.4s\n"
+ "fmin v11.4s, v11.4s, v22.4s\n"
+ "fmin v12.4s, v12.4s, v22.4s\n"
+ "fmin v13.4s, v13.4s, v22.4s\n"
+ "fmax v4.4s, v4.4s, v21.4s\n"
+ "fmax v14.4s, v14.4s, v21.4s\n"
+ "fmax v15.4s, v15.4s, v21.4s\n"
+ "fmax v16.4s, v16.4s, v21.4s\n"
+ "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "fmax v8.4s, v8.4s, v21.4s\n"
+ "fmax v9.4s, v9.4s, v21.4s\n"
+ "fmax v10.4s, v10.4s, v21.4s\n"
+ "fmax v11.4s, v11.4s, v21.4s\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
+ "72:" // Height 2: No activation
+ "cmp x9, #0x18\n"
+ "bge 85f\n"
+ "tbz x9, #4, 76f\n"
+ "st1 { v4.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x27], #0x10\n"
+ "st1 { v15.4s }, [x27], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v11.4s }, [x23], #0x10\n"
+ "tbz x9, #2, 74f\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "tbz x9, #1, 73f\n"
+ "str d18, [x27], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "tbz x9, #0, 84f\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "b 84f\n"
+ "73:" // Height 2: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 84f\n"
+ "str s18, [x27, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "b 84f\n"
+ "74:" // Height 2: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 75f\n"
+ "str d17, [x27], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "tbz x9, #0, 84f\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "b 84f\n"
+ "75:" // Height 2: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 84f\n"
+ "str s17, [x27, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "b 84f\n"
+ "76:" // Height 2: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 80f\n"
+ "st1 { v4.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x27], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "tbz x9, #2, 78f\n"
+ "st1 { v15.4s }, [x27], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "tbz x9, #1, 77f\n"
+ "str d16, [x27], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "tbz x9, #0, 84f\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "b 84f\n"
+ "77:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 84f\n"
+ "str s16, [x27, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "b 84f\n"
+ "78:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 79f\n"
+ "str d15, [x27], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "tbz x9, #0, 84f\n"
+ "st1 { v15.s }[2], [x27]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "b 84f\n"
+ "79:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 84f\n"
+ "str s15, [x27, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "b 84f\n"
+ "80:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 82f\n"
+ "st1 { v4.4s }, [x27], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "tbz x9, #1, 81f\n"
+ "str d14, [x27], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "tbz x9, #0, 84f\n"
+ "st1 { v14.s }[2], [x27]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "b 84f\n"
+ "81:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 84f\n"
+ "str s14, [x27, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "b 84f\n"
+ "82:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 83f\n"
+ "str d4, [x27], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "tbz x9, #0, 84f\n"
+ "st1 { v4.s }[2], [x27]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "b 84f\n"
+ "83:" // Height 2: Partial direct writeback: partial_1_0
+ "str s4, [x27, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "84:" // Height 2: Partial direct writeback: Done
+ "b 86f\n"
+ "85:" // Height 2: Full writeback
+ "str q4, [x27, #0x0]\n"
+ "str q14, [x27, #0x10]\n"
+ "str q15, [x27, #0x20]\n"
+ "str q16, [x27, #0x30]\n"
+ "str q17, [x27, #0x40]\n"
+ "str q18, [x27, #0x50]\n"
+ "add x27, x27, #0x60\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q12, [x23, #0x40]\n"
+ "str q13, [x23, #0x50]\n"
+ "86:" // Height 2: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 45b\n"
+ "b 174f\n"
+ "87:" // Height 3
+ "mov x10, %x[bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "88:" // Height 3: Column loop
+ "cbz x10, 89f\n"
+ "ldr q8, [x10, #0x0]\n"
+ "ldr q9, [x10, #0x10]\n"
+ "zip2 v14.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
+ "zip2 v15.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
+ "zip2 v16.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v17.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x10, x10, #0x60\n"
+ "zip2 v18.2d, v12.2d, v12.2d\n"
+ "zip1 v12.2d, v12.2d, v12.2d\n"
+ "zip2 v19.2d, v13.2d, v13.2d\n"
+ "zip1 v13.2d, v13.2d, v13.2d\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v26.16b, v14.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v27.16b, v15.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v29.16b, v17.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v30.16b, v18.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v31.16b, v19.16b\n"
+ "b 105f\n"
+ "89:" // Height 3: no bias
+ "tbz %x[flags], #0, 104f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "cmp x9, #0x18\n"
+ "add x22, x23, x20, LSL #2\n"
+ "bge 102f\n"
+ "tbz x9, #4, 93f\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v11.4s }, [x27], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x27], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 91f\n"
+ "ld1 { v13.4s }, [x27], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 90f\n"
+ "ldr d20, [x27], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "mov x20, #0x58\n"
+ "ldr d4, [x22], #0x8\n"
+ "tbz x9, #0, 101f\n"
+ "ld1 { v20.s }[2], [x27]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v4.s }[2], [x22]\n"
+ "b 101f\n"
+ "90:" // Height 3: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x9, #0, 101f\n"
+ "ldr s20, [x27, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s4, [x22, #0x0]\n"
+ "b 101f\n"
+ "91:" // Height 3: Partial accumulate: partial_2_16
+ "tbz x9, #1, 92f\n"
+ "ldr d13, [x27], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "mov x20, #0x48\n"
+ "ldr d25, [x22], #0x8\n"
+ "tbz x9, #0, 101f\n"
+ "ld1 { v13.s }[2], [x27]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "b 101f\n"
+ "92:" // Height 3: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x9, #0, 101f\n"
+ "ldr s13, [x27, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "b 101f\n"
+ "93:" // Height 3: Partial accumulate: partial_8_0
+ "tbz x9, #3, 97f\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 95f\n"
+ "ld1 { v11.4s }, [x27], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 94f\n"
+ "ldr d12, [x27], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x22], #0x8\n"
+ "tbz x9, #0, 101f\n"
+ "ld1 { v12.s }[2], [x27]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "b 101f\n"
+ "94:" // Height 3: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x9, #0, 101f\n"
+ "ldr s12, [x27, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "b 101f\n"
+ "95:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x9, #1, 96f\n"
+ "ldr d11, [x27], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x9, #0, 101f\n"
+ "ld1 { v11.s }[2], [x27]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "b 101f\n"
+ "96:" // Height 3: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x9, #0, 101f\n"
+ "ldr s11, [x27, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "b 101f\n"
+ "97:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x9, #2, 99f\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 98f\n"
+ "ldr d10, [x27], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x9, #0, 101f\n"
+ "ld1 { v10.s }[2], [x27]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "b 101f\n"
+ "98:" // Height 3: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x9, #0, 101f\n"
+ "ldr s10, [x27, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "b 101f\n"
+ "99:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x9, #1, 100f\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x9, #0, 101f\n"
+ "ld1 { v9.s }[2], [x27]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "b 101f\n"
+ "100:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s9, [x27, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s21, [x22, #0x0]\n"
+ "101:" // Height 3: Partial accumulate: Done
+ "sub x27, x27, x20\n"
+ "b 103f\n"
+ "102:" // Height 3: full accumulate
+ "ldr q9, [x27, #0x0]\n"
+ "ldr q10, [x27, #0x10]\n"
+ "ldr q11, [x27, #0x20]\n"
+ "ldr q12, [x27, #0x30]\n"
+ "ldr q13, [x27, #0x40]\n"
+ "ldr q20, [x27, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q21, [x22, #0x0]\n"
+ "ldr q22, [x22, #0x10]\n"
+ "ldr q23, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q25, [x22, #0x40]\n"
+ "ldr q4, [x22, #0x50]\n"
+ "103:" // Height 3: MMLA fixup
+ "zip1 v8.2d, v9.2d, v14.2d\n"
+ "zip2 v14.2d, v9.2d, v14.2d\n"
+ "zip1 v9.2d, v10.2d, v15.2d\n"
+ "zip2 v15.2d, v10.2d, v15.2d\n"
+ "zip1 v10.2d, v11.2d, v16.2d\n"
+ "zip2 v16.2d, v11.2d, v16.2d\n"
+ "zip1 v11.2d, v12.2d, v17.2d\n"
+ "zip2 v17.2d, v12.2d, v17.2d\n"
+ "zip1 v12.2d, v13.2d, v18.2d\n"
+ "zip2 v18.2d, v13.2d, v18.2d\n"
+ "zip1 v13.2d, v20.2d, v19.2d\n"
+ "zip2 v19.2d, v20.2d, v19.2d\n"
+ "zip1 v20.2d, v21.2d, v26.2d\n"
+ "zip2 v26.2d, v21.2d, v26.2d\n"
+ "zip1 v21.2d, v22.2d, v27.2d\n"
+ "zip2 v27.2d, v22.2d, v27.2d\n"
+ "zip1 v22.2d, v23.2d, v28.2d\n"
+ "zip2 v28.2d, v23.2d, v28.2d\n"
+ "zip1 v23.2d, v24.2d, v29.2d\n"
+ "zip2 v29.2d, v24.2d, v29.2d\n"
+ "zip1 v24.2d, v25.2d, v30.2d\n"
+ "zip2 v30.2d, v25.2d, v30.2d\n"
+ "zip1 v25.2d, v4.2d, v31.2d\n"
+ "zip2 v31.2d, v4.2d, v31.2d\n"
+ "b 105f\n"
+ "104:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "105:" // Height 3: setup done
+ "mov x26, #0x0\n"
+ "106:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 107f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x26, 108f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "b 108f\n"
+ "107:" // Height 3: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "108:" // Height 3: input setup done
+ "cmp x25, #0x4\n"
+ "blt 111f\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
+ "cmp x25, #0x8\n"
+ "ld1 { v2.4s }, [x22], #0x10\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "blt 110f\n"
+ "109:" // Height 3: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x40]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "cmp x25, #0x8\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q3, [x28, #0x70]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0xa0]\n"
+ ".inst 0x6e43ec11 // bfmmla v17.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec5d // bfmmla v29.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x0]\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x10]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x6e43ec13 // bfmmla v19.4s, v0.8h, v3.8h\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ ".inst 0x6e43ec5f // bfmmla v31.4s, v2.8h, v3.8h\n"
+ "ld1 { v2.4s }, [x22], #0x10\n"
+ "ldr q7, [x28, #0x30]\n"
+ "bge 109b\n"
+ "110:" // Height 3: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "ldr q3, [x28, #0x40]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q3, [x28, #0xa0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
+ "111:" // Height 3: Multiply loop: Main loop skip
+ "cbz x25, 114f\n"
+ "cbz x25, 114f\n"
+ "tbz x25, #1, 112f\n"
+ "ldr d0, [x24], #0x8\n"
+ "ldr d1, [x23], #0x8\n"
+ "ldr d2, [x22], #0x8\n"
+ "tbz x25, #0, 113f\n"
+ "ld1 { v0.s }[2], [x24]\n"
+ "ld1 { v1.s }[2], [x23]\n"
+ "ld1 { v2.s }[2], [x22]\n"
+ "b 113f\n"
+ "112:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x24, #0x0]\n"
+ "ldr s1, [x23, #0x0]\n"
+ "ldr s2, [x22, #0x0]\n"
+ "113:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ldr q3, [x28, #0x20]\n"
+ "ldr q1, [x28, #0x30]\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x40]\n"
+ ".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec55 // bfmmla v21.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x28, #0x60]\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5b // bfmmla v27.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x90]\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x28, #0xa0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x28, #0xb0]\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
+ "114:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 106b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp1 v4.2d, v8.2d, v14.2d\n"
+ "uzp2 v8.2d, v8.2d, v14.2d\n"
+ "uzp1 v14.2d, v9.2d, v15.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v15.2d\n"
+ "uzp1 v15.2d, v10.2d, v16.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v16.2d\n"
+ "uzp1 v16.2d, v11.2d, v17.2d\n"
+ "uzp2 v11.2d, v11.2d, v17.2d\n"
+ "uzp1 v17.2d, v12.2d, v18.2d\n"
+ "uzp2 v12.2d, v12.2d, v18.2d\n"
+ "uzp1 v18.2d, v13.2d, v19.2d\n"
+ "uzp2 v13.2d, v13.2d, v19.2d\n"
+ "uzp1 v20.2d, v20.2d, v26.2d\n"
+ "uzp1 v21.2d, v21.2d, v27.2d\n"
+ "uzp1 v22.2d, v22.2d, v28.2d\n"
+ "uzp1 v23.2d, v23.2d, v29.2d\n"
+ "uzp1 v24.2d, v24.2d, v30.2d\n"
+ "uzp1 v25.2d, v25.2d, v31.2d\n"
+ "tbz %x[flags], #1, 115f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v4.4s, v4.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmax v4.4s, v4.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "115:" // Height 3: No activation
+ "cmp x9, #0x18\n"
+ "bge 128f\n"
+ "tbz x9, #4, 119f\n"
+ "st1 { v4.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x27], #0x10\n"
+ "st1 { v15.4s }, [x27], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v11.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 117f\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 116f\n"
+ "str d18, [x27], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "tbz x9, #0, 127f\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "b 127f\n"
+ "116:" // Height 3: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 127f\n"
+ "str s18, [x27, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "b 127f\n"
+ "117:" // Height 3: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 118f\n"
+ "str d17, [x27], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x9, #0, 127f\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "b 127f\n"
+ "118:" // Height 3: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 127f\n"
+ "str s17, [x27, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "b 127f\n"
+ "119:" // Height 3: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 123f\n"
+ "st1 { v4.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x27], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 121f\n"
+ "st1 { v15.4s }, [x27], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 120f\n"
+ "str d16, [x27], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "tbz x9, #0, 127f\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "b 127f\n"
+ "120:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 127f\n"
+ "str s16, [x27, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "b 127f\n"
+ "121:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 122f\n"
+ "str d15, [x27], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz x9, #0, 127f\n"
+ "st1 { v15.s }[2], [x27]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "b 127f\n"
+ "122:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 127f\n"
+ "str s15, [x27, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "b 127f\n"
+ "123:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 125f\n"
+ "st1 { v4.4s }, [x27], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 124f\n"
+ "str d14, [x27], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz x9, #0, 127f\n"
+ "st1 { v14.s }[2], [x27]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "b 127f\n"
+ "124:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 127f\n"
+ "str s14, [x27, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "b 127f\n"
+ "125:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 126f\n"
+ "str d4, [x27], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz x9, #0, 127f\n"
+ "st1 { v4.s }[2], [x27]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "b 127f\n"
+ "126:" // Height 3: Partial direct writeback: partial_1_0
+ "str s4, [x27, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "127:" // Height 3: Partial direct writeback: Done
+ "b 129f\n"
+ "128:" // Height 3: Full writeback
+ "str q4, [x27, #0x0]\n"
+ "str q14, [x27, #0x10]\n"
+ "str q15, [x27, #0x20]\n"
+ "str q16, [x27, #0x30]\n"
+ "str q17, [x27, #0x40]\n"
+ "str q18, [x27, #0x50]\n"
+ "add x27, x27, #0x60\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q12, [x23, #0x40]\n"
+ "str q13, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
+ "129:" // Height 3: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 88b\n"
+ "b 174f\n"
+ "130:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x10\n"
+ "mov x10, %x[bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "131:" // Height 4: Column loop
+ "cbz x10, 132f\n"
+ "ldr q8, [x10, #0x0]\n"
+ "ldr q9, [x10, #0x10]\n"
+ "zip2 v14.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x10, #0x20]\n"
+ "ldr q11, [x10, #0x30]\n"
+ "zip2 v15.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x10, #0x40]\n"
+ "ldr q13, [x10, #0x50]\n"
+ "zip2 v16.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v17.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "add x10, x10, #0x60\n"
+ "zip2 v18.2d, v12.2d, v12.2d\n"
+ "zip1 v12.2d, v12.2d, v12.2d\n"
+ "zip2 v19.2d, v13.2d, v13.2d\n"
+ "zip1 v13.2d, v13.2d, v13.2d\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v26.16b, v14.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v27.16b, v15.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v29.16b, v17.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v30.16b, v18.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v31.16b, v19.16b\n"
+ "b 148f\n"
+ "132:" // Height 4: no bias
+ "tbz %x[flags], #0, 147f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x9, #0x18\n"
+ "add x21, x22, x20, LSL #2\n"
+ "bge 145f\n"
+ "tbz x9, #4, 136f\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "ld1 { v11.4s }, [x27], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v12.4s }, [x27], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 134f\n"
+ "ld1 { v13.4s }, [x27], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 133f\n"
+ "ldr d20, [x27], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "mov x20, #0x58\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x9, #0, 144f\n"
+ "ld1 { v20.s }[2], [x27]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v4.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 144f\n"
+ "133:" // Height 4: Partial accumulate: partial_1_20
+ "mov x20, #0x50\n"
+ "tbz x9, #0, 144f\n"
+ "ldr s20, [x27, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s4, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 144f\n"
+ "134:" // Height 4: Partial accumulate: partial_2_16
+ "tbz x9, #1, 135f\n"
+ "ldr d13, [x27], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "mov x20, #0x48\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x9, #0, 144f\n"
+ "ld1 { v13.s }[2], [x27]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 144f\n"
+ "135:" // Height 4: Partial accumulate: partial_1_16
+ "mov x20, #0x40\n"
+ "tbz x9, #0, 144f\n"
+ "ldr s13, [x27, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 144f\n"
+ "136:" // Height 4: Partial accumulate: partial_8_0
+ "tbz x9, #3, 140f\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x27], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 138f\n"
+ "ld1 { v11.4s }, [x27], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 137f\n"
+ "ldr d12, [x27], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x9, #0, 144f\n"
+ "ld1 { v12.s }[2], [x27]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 144f\n"
+ "137:" // Height 4: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x9, #0, 144f\n"
+ "ldr s12, [x27, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 144f\n"
+ "138:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x9, #1, 139f\n"
+ "ldr d11, [x27], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x9, #0, 144f\n"
+ "ld1 { v11.s }[2], [x27]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 144f\n"
+ "139:" // Height 4: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x9, #0, 144f\n"
+ "ldr s11, [x27, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "b 144f\n"
+ "140:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x9, #2, 142f\n"
+ "ld1 { v9.4s }, [x27], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 141f\n"
+ "ldr d10, [x27], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x9, #0, 144f\n"
+ "ld1 { v10.s }[2], [x27]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "b 144f\n"
+ "141:" // Height 4: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x9, #0, 144f\n"
+ "ldr s10, [x27, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "b 144f\n"
+ "142:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x9, #1, 143f\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x9, #0, 144f\n"
+ "ld1 { v9.s }[2], [x27]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "b 144f\n"
+ "143:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s9, [x27, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "144:" // Height 4: Partial accumulate: Done
+ "sub x27, x27, x20\n"
+ "b 146f\n"
+ "145:" // Height 4: full accumulate
+ "ldr q9, [x27, #0x0]\n"
+ "ldr q10, [x27, #0x10]\n"
+ "ldr q11, [x27, #0x20]\n"
+ "ldr q12, [x27, #0x30]\n"
+ "ldr q13, [x27, #0x40]\n"
+ "ldr q20, [x27, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q21, [x22, #0x0]\n"
+ "ldr q22, [x22, #0x10]\n"
+ "ldr q23, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q25, [x22, #0x40]\n"
+ "ldr q4, [x22, #0x50]\n"
+ "ldr q26, [x21, #0x0]\n"
+ "ldr q27, [x21, #0x10]\n"
+ "ldr q28, [x21, #0x20]\n"
+ "ldr q29, [x21, #0x30]\n"
+ "ldr q30, [x21, #0x40]\n"
+ "ldr q31, [x21, #0x50]\n"
+ "146:" // Height 4: MMLA fixup
+ "zip1 v8.2d, v9.2d, v14.2d\n"
+ "zip2 v14.2d, v9.2d, v14.2d\n"
+ "zip1 v9.2d, v10.2d, v15.2d\n"
+ "zip2 v15.2d, v10.2d, v15.2d\n"
+ "zip1 v10.2d, v11.2d, v16.2d\n"
+ "zip2 v16.2d, v11.2d, v16.2d\n"
+ "zip1 v11.2d, v12.2d, v17.2d\n"
+ "zip2 v17.2d, v12.2d, v17.2d\n"
+ "zip1 v12.2d, v13.2d, v18.2d\n"
+ "zip2 v18.2d, v13.2d, v18.2d\n"
+ "zip1 v13.2d, v20.2d, v19.2d\n"
+ "zip2 v19.2d, v20.2d, v19.2d\n"
+ "zip1 v20.2d, v21.2d, v26.2d\n"
+ "zip2 v26.2d, v21.2d, v26.2d\n"
+ "zip1 v21.2d, v22.2d, v27.2d\n"
+ "zip2 v27.2d, v22.2d, v27.2d\n"
+ "zip1 v22.2d, v23.2d, v28.2d\n"
+ "zip2 v28.2d, v23.2d, v28.2d\n"
+ "zip1 v23.2d, v24.2d, v29.2d\n"
+ "zip2 v29.2d, v24.2d, v29.2d\n"
+ "zip1 v24.2d, v25.2d, v30.2d\n"
+ "zip2 v30.2d, v25.2d, v30.2d\n"
+ "zip1 v25.2d, v4.2d, v31.2d\n"
+ "zip2 v31.2d, v4.2d, v31.2d\n"
+ "b 148f\n"
+ "147:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "148:" // Height 4: setup done
+ "mov x26, #0x0\n"
+ "149:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 150f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
+ "cbnz x26, 151f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
+ "b 151f\n"
+ "150:" // Height 4: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
+ "151:" // Height 4: input setup done
+ "cmp x25, #0x4\n"
+ "blt 154f\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ "ld1 { v2.4s }, [x22], #0x10\n"
+ "cmp x25, #0x8\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
+ "ld1 { v3.4s }, [x21], #0x10\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "blt 153f\n"
+ "152:" // Height 4: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x8\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x40]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ld1 { v1.4s }, [x23], #0x10\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0x70]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "ld1 { v3.4s }, [x21], #0x10\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0xa0]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x0]\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x10]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x24], #0x10\n"
+ ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ "ld1 { v2.4s }, [x22], #0x10\n"
+ "ldr q7, [x28, #0x30]\n"
+ "bge 152b\n"
+ "153:" // Height 4: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x25, x25, #0x4\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "ldr q3, [x28, #0x40]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q3, [x28, #0xa0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
+ "154:" // Height 4: Multiply loop: Main loop skip
+ "cbz x25, 157f\n"
+ "cbz x25, 157f\n"
+ "tbz x25, #1, 155f\n"
+ "ldr d0, [x24], #0x8\n"
+ "ldr d1, [x23], #0x8\n"
+ "ldr d2, [x22], #0x8\n"
+ "ldr d3, [x21], #0x8\n"
+ "tbz x25, #0, 156f\n"
+ "ld1 { v0.s }[2], [x24]\n"
+ "ld1 { v1.s }[2], [x23]\n"
+ "ld1 { v2.s }[2], [x22]\n"
+ "ld1 { v3.s }[2], [x21]\n"
+ "b 156f\n"
+ "155:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x24, #0x0]\n"
+ "ldr s1, [x23, #0x0]\n"
+ "ldr s2, [x22, #0x0]\n"
+ "ldr s3, [x21, #0x0]\n"
+ "156:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x40]\n"
+ ".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ "ldr q3, [x28, #0x60]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x90]\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ "ldr q3, [x28, #0xa0]\n"
+ ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n"
+ "ldr q1, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n"
+ "157:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 149b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp1 v4.2d, v8.2d, v14.2d\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 v8.2d, v8.2d, v14.2d\n"
+ "uzp1 v14.2d, v9.2d, v15.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v15.2d\n"
+ "uzp1 v15.2d, v10.2d, v16.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v16.2d\n"
+ "uzp1 v16.2d, v11.2d, v17.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v17.2d\n"
+ "uzp1 v17.2d, v12.2d, v18.2d\n"
+ "uzp2 v12.2d, v12.2d, v18.2d\n"
+ "uzp1 v18.2d, v13.2d, v19.2d\n"
+ "uzp2 v13.2d, v13.2d, v19.2d\n"
+ "uzp1 v19.2d, v20.2d, v26.2d\n"
+ "uzp2 v20.2d, v20.2d, v26.2d\n"
+ "uzp1 v26.2d, v21.2d, v27.2d\n"
+ "uzp2 v21.2d, v21.2d, v27.2d\n"
+ "uzp1 v27.2d, v22.2d, v28.2d\n"
+ "uzp2 v22.2d, v22.2d, v28.2d\n"
+ "uzp1 v28.2d, v23.2d, v29.2d\n"
+ "uzp2 v23.2d, v23.2d, v29.2d\n"
+ "uzp1 v29.2d, v24.2d, v30.2d\n"
+ "uzp2 v24.2d, v24.2d, v30.2d\n"
+ "uzp1 v30.2d, v25.2d, v31.2d\n"
+ "uzp2 v25.2d, v25.2d, v31.2d\n"
+ "tbz %x[flags], #1, 158f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v4.4s, v4.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmax v4.4s, v4.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v0.4s\n"
+ "fmax v29.4s, v29.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "158:" // Height 4: No activation
+ "cmp x9, #0x18\n"
+ "bge 171f\n"
+ "tbz x9, #4, 162f\n"
+ "st1 { v4.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x27], #0x10\n"
+ "st1 { v15.4s }, [x27], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v11.4s }, [x23], #0x10\n"
+ "st1 { v19.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v27.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 160f\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 159f\n"
+ "str d18, [x27], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x9, #0, 170f\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "b 170f\n"
+ "159:" // Height 4: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 170f\n"
+ "str s18, [x27, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "b 170f\n"
+ "160:" // Height 4: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 161f\n"
+ "str d17, [x27], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d29, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x9, #0, 170f\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "b 170f\n"
+ "161:" // Height 4: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 170f\n"
+ "str s17, [x27, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "b 170f\n"
+ "162:" // Height 4: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 166f\n"
+ "st1 { v4.4s }, [x27], #0x10\n"
+ "st1 { v14.4s }, [x27], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v19.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 164f\n"
+ "st1 { v15.4s }, [x27], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v27.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 163f\n"
+ "str d16, [x27], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d28, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "tbz x9, #0, 170f\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "b 170f\n"
+ "163:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 170f\n"
+ "str s16, [x27, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s28, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "b 170f\n"
+ "164:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 165f\n"
+ "str d15, [x27], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "tbz x9, #0, 170f\n"
+ "st1 { v15.s }[2], [x27]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "b 170f\n"
+ "165:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 170f\n"
+ "str s15, [x27, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "b 170f\n"
+ "166:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 168f\n"
+ "st1 { v4.4s }, [x27], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v19.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 167f\n"
+ "str d14, [x27], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "tbz x9, #0, 170f\n"
+ "st1 { v14.s }[2], [x27]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "b 170f\n"
+ "167:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 170f\n"
+ "str s14, [x27, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "b 170f\n"
+ "168:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 169f\n"
+ "str d4, [x27], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x9, #0, 170f\n"
+ "st1 { v4.s }[2], [x27]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "b 170f\n"
+ "169:" // Height 4: Partial direct writeback: partial_1_0
+ "str s4, [x27, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "170:" // Height 4: Partial direct writeback: Done
+ "b 172f\n"
+ "171:" // Height 4: Full writeback
+ "str q4, [x27, #0x0]\n"
+ "str q14, [x27, #0x10]\n"
+ "str q15, [x27, #0x20]\n"
+ "str q16, [x27, #0x30]\n"
+ "str q17, [x27, #0x40]\n"
+ "str q18, [x27, #0x50]\n"
+ "add x27, x27, #0x60\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q12, [x23, #0x40]\n"
+ "str q13, [x23, #0x50]\n"
+ "str q19, [x22, #0x0]\n"
+ "str q26, [x22, #0x10]\n"
+ "str q27, [x22, #0x20]\n"
+ "str q28, [x22, #0x30]\n"
+ "str q29, [x22, #0x40]\n"
+ "str q30, [x22, #0x50]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x21, #0x40]\n"
+ "str q25, [x21, #0x50]\n"
+ "172:" // Height 4: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 131b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 174f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 173f\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "173:" // Update direct input
+ "mov x20, #0x10\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "174:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
new file mode 100644
index 0000000000..71e16d68b5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_fp32bf16fp32_mmla_6x16( ARGLIST );
+
+class cls_a64_hybrid_fp32bf16fp32_mmla_6x16
+{
+public:
+ typedef float lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
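+
+ // Each call produces a 6x16 block of fp32 outputs; K is consumed in
+ // multiples of 4 fp32 values, converted to bf16 in-kernel for BFMMLA.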
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
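+
+ // Rough throughput estimates (nominally MACs per cycle) used by the gemm
+ // method selection heuristics to rank this kernel against alternatives.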
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 16.37 };
+ case CPUModel::A510:
+ return { 6.70 };
+ case CPUModel::V1:
+ return { 21.28 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_fp32bf16fp32_mmla_6x16;
+ cls_a64_hybrid_fp32bf16fp32_mmla_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..5693c3f397
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
@@ -0,0 +1,3135 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32bf16fp32_mmla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
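+ // Collects everything the assembly reads through args_ptr/offsetof(); the
+ // clamp bounds default to +/-infinity so an absent activation is a no-op.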
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
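+ // Flag bits tested in the assembly: bit 0 = accumulate into existing
+ // output, bit 1 = apply the min/max clamp, bit 2 = indirect output,
+ // bit 3 = indirect (string) input.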
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
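+ // BoundedReLU sets the upper bound, then deliberately falls through to
+ // ReLU so both share the zero lower bound and the clamp flag.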
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
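+ // Row loop: dispatch on M to a height-specialised section (1-6 rows), each
+ // walking the output in 16-column strips; the .inst directives below encode
+ // the bf16 instructions so the file assembles without bf16 mnemonic support.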
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 176f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 141f\n"
+ "beq 106f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 71f\n"
+ "beq 36f\n"
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x12, 3f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "b 15f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 14f\n"
+ "cmp x11, #0x10\n"
+ "bge 12f\n"
+ "tbz x11, #3, 7f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "tbz x11, #2, 5f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 4f\n"
+ "ldr d16, [x9], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "b 11f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "b 11f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x11, #1, 6f\n"
+ "ldr d11, [x9], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "b 11f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "b 11f\n"
+ "7:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x11, #2, 9f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 8f\n"
+ "ldr d10, [x9], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "b 11f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "b 11f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x11, #1, 10f\n"
+ "ldr d9, [x9], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "b 11f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "mov x20, #0x0\n"
+ "11:" // Height 1: Partial accumulate: Done
+ "sub x9, x9, x20\n"
+ "b 13f\n"
+ "12:" // Height 1: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "13:" // Height 1: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 15f\n"
+ "14:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "15:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "16:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 18f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "b 18f\n"
+ "17:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "18:" // Height 1: input setup done
+ "cmp x27, #0x4\n"
+ "blt 21f\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ldr q6, [x10, #0x0]\n"
+ "cmp x27, #0x8\n"
+ "ldr q7, [x10, #0x10]\n"
+ "blt 20f\n"
+ "19:" // Height 1: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ "add x10, x10, #0x80\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "bge 19b\n"
+ "20:" // Height 1: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x80\n"
+ "21:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 24f\n"
+ "cbz x27, 24f\n"
+ "tbz x27, #1, 22f\n"
+ "ldr d0, [x26], #0x8\n"
+ "tbz x27, #0, 23f\n"
+ "ld1 { v0.s }[2], [x26]\n"
+ "b 23f\n"
+ "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x26, #0x0]\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x6e52ec08 // bfmmla v8.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ec0c // bfmmla v12.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "add x10, x10, #0x80\n"
+ "24:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 16b\n"
+ "uzp1 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "uzp1 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v11.2d, v11.2d, v15.2d\n"
+ "tbz %x[flags], #1, 25f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
+ "25:" // Height 1: No activation
+ "cmp x11, #0x10\n"
+ "bge 34f\n"
+ "tbz x11, #3, 29f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "tbz x11, #2, 27f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 26f\n"
+ "str d11, [x9], #0x8\n"
+ "tbz x11, #0, 33f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "b 33f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 33f\n"
+ "str s11, [x9, #0x0]\n"
+ "b 33f\n"
+ "27:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 28f\n"
+ "str d10, [x9], #0x8\n"
+ "tbz x11, #0, 33f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "b 33f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 33f\n"
+ "str s10, [x9, #0x0]\n"
+ "b 33f\n"
+ "29:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 31f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 30f\n"
+ "str d9, [x9], #0x8\n"
+ "tbz x11, #0, 33f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "b 33f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 33f\n"
+ "str s9, [x9, #0x0]\n"
+ "b 33f\n"
+ "31:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 32f\n"
+ "str d8, [x9], #0x8\n"
+ "tbz x11, #0, 33f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "b 33f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x9, #0x0]\n"
+ "33:" // Height 1: Partial direct writeback: Done
+ "b 35f\n"
+ "34:" // Height 1: Full writeback
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "35:" // Height 1: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 2b\n"
+ "b 212f\n"
+ "36:" // Height 2
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "37:" // Height 2: Column loop
+ "cbz x12, 38f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "b 50f\n"
+ "38:" // Height 2: no bias
+ "tbz %x[flags], #0, 49f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x25, x9, x20, LSL #2\n"
+ "bge 47f\n"
+ "tbz x11, #3, 42f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "tbz x11, #2, 40f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "tbz x11, #1, 39f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "tbz x11, #0, 46f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "b 46f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x11, #0, 46f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "b 46f\n"
+ "40:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x11, #1, 41f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "tbz x11, #0, 46f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "b 46f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x11, #0, 46f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "b 46f\n"
+ "42:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x11, #2, 44f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "tbz x11, #1, 43f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "tbz x11, #0, 46f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "b 46f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x11, #0, 46f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "b 46f\n"
+ "44:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x11, #1, 45f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "tbz x11, #0, 46f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "b 46f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "46:" // Height 2: Partial accumulate: Done
+ "sub x9, x9, x20\n"
+ "b 48f\n"
+ "47:" // Height 2: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "48:" // Height 2: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 50f\n"
+ "49:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "50:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "51:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 52f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 53f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "b 53f\n"
+ "52:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "53:" // Height 2: input setup done
+ "cmp x27, #0x4\n"
+ "blt 56f\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ "cmp x27, #0x8\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "blt 55f\n"
+ "54:" // Height 2: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x8\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ "bge 54b\n"
+ "55:" // Height 2: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x10, x10, #0x80\n"
+ "56:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 59f\n"
+ "cbz x27, 59f\n"
+ "tbz x27, #1, 57f\n"
+ "ldr d0, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "tbz x27, #0, 58f\n"
+ "ld1 { v0.s }[2], [x26]\n"
+ "ld1 { v1.s }[2], [x25]\n"
+ "b 58f\n"
+ "57:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x26, #0x0]\n"
+ "ldr s1, [x25, #0x0]\n"
+ "58:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e52ec08 // bfmmla v8.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e51ec0c // bfmmla v12.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n"
+ ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n"
+ "add x10, x10, #0x80\n"
+ "59:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 51b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "tbz %x[flags], #1, 60f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "fmin v6.4s, v6.4s, v18.4s\n"
+ "fmin v12.4s, v12.4s, v18.4s\n"
+ "fmin v13.4s, v13.4s, v18.4s\n"
+ "fmin v14.4s, v14.4s, v18.4s\n"
+ "fmin v8.4s, v8.4s, v18.4s\n"
+ "fmin v9.4s, v9.4s, v18.4s\n"
+ "fmin v10.4s, v10.4s, v18.4s\n"
+ "fmin v11.4s, v11.4s, v18.4s\n"
+ "fmax v6.4s, v6.4s, v17.4s\n"
+ "fmax v12.4s, v12.4s, v17.4s\n"
+ "fmax v13.4s, v13.4s, v17.4s\n"
+ "fmax v14.4s, v14.4s, v17.4s\n"
+ "fmax v8.4s, v8.4s, v17.4s\n"
+ "fmax v9.4s, v9.4s, v17.4s\n"
+ "fmax v10.4s, v10.4s, v17.4s\n"
+ "fmax v11.4s, v11.4s, v17.4s\n"
+ "60:" // Height 2: No activation
+ "cmp x11, #0x10\n"
+ "bge 69f\n"
+ "tbz x11, #3, 64f\n"
+ "st1 { v6.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "tbz x11, #2, 62f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "tbz x11, #1, 61f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "tbz x11, #0, 68f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "b 68f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 68f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "b 68f\n"
+ "62:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 63f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "tbz x11, #0, 68f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "b 68f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 68f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "b 68f\n"
+ "64:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 66f\n"
+ "st1 { v6.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "tbz x11, #1, 65f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "tbz x11, #0, 68f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "b 68f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 68f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "b 68f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 67f\n"
+ "str d6, [x9], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "tbz x11, #0, 68f\n"
+ "st1 { v6.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "b 68f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_0
+ "str s6, [x9, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "68:" // Height 2: Partial direct writeback: Done
+ "b 70f\n"
+ "69:" // Height 2: Full writeback
+ "str q6, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "70:" // Height 2: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 37b\n"
+ "b 212f\n"
+ "71:" // Height 3
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "72:" // Height 3: Column loop
+ "cbz x12, 73f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "b 85f\n"
+ "73:" // Height 3: no bias
+ "tbz %x[flags], #0, 84f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x24, x25, x20, LSL #2\n"
+ "bge 82f\n"
+ "tbz x11, #3, 77f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 75f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 74f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x24], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "b 81f\n"
+ "74:" // Height 3: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x11, #0, 81f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "b 81f\n"
+ "75:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x11, #1, 76f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x24], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "b 81f\n"
+ "76:" // Height 3: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x11, #0, 81f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "b 81f\n"
+ "77:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x11, #2, 79f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 78f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x24], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "b 81f\n"
+ "78:" // Height 3: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x11, #0, 81f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "b 81f\n"
+ "79:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x11, #1, 80f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "b 81f\n"
+ "80:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x24, #0x0]\n"
+ "81:" // Height 3: Partial accumulate: Done
+ "sub x9, x9, x20\n"
+ "b 83f\n"
+ "82:" // Height 3: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "83:" // Height 3: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 85f\n"
+ "84:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "85:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "86:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 87f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 88f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "b 88f\n"
+ "87:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "88:" // Height 3: input setup done
+ "cmp x27, #0x4\n"
+ "blt 91f\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ "cmp x27, #0x8\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "blt 90f\n"
+ "89:" // Height 3: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ "ldr q7, [x10, #0x10]\n"
+ "bge 89b\n"
+ "90:" // Height 3: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
+ "91:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 94f\n"
+ "cbz x27, 94f\n"
+ "tbz x27, #1, 92f\n"
+ "ldr d0, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "tbz x27, #0, 93f\n"
+ "ld1 { v0.s }[2], [x26]\n"
+ "ld1 { v1.s }[2], [x25]\n"
+ "ld1 { v2.s }[2], [x24]\n"
+ "b 93f\n"
+ "92:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x26, #0x0]\n"
+ "ldr s1, [x25, #0x0]\n"
+ "ldr s2, [x24, #0x0]\n"
+ "93:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x6e5aec08 // bfmmla v8.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec50 // bfmmla v16.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ec0c // bfmmla v12.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec54 // bfmmla v20.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
+ "94:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 86b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "tbz %x[flags], #1, 95f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v6.4s, v6.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v6.4s, v6.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
+ "95:" // Height 3: No activation
+ "cmp x11, #0x10\n"
+ "bge 104f\n"
+ "tbz x11, #3, 99f\n"
+ "st1 { v6.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 97f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 96f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "tbz x11, #0, 103f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "b 103f\n"
+ "96:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 103f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "b 103f\n"
+ "97:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 98f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "tbz x11, #0, 103f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "b 103f\n"
+ "98:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 103f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "b 103f\n"
+ "99:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 101f\n"
+ "st1 { v6.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 100f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "tbz x11, #0, 103f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "b 103f\n"
+ "100:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 103f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "b 103f\n"
+ "101:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 102f\n"
+ "str d6, [x9], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "tbz x11, #0, 103f\n"
+ "st1 { v6.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "b 103f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_0
+ "str s6, [x9, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "103:" // Height 3: Partial direct writeback: Done
+ "b 105f\n"
+ "104:" // Height 3: Full writeback
+ "str q6, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "105:" // Height 3: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 72b\n"
+ "b 212f\n"
+ "106:" // Height 4
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "107:" // Height 4: Column loop
+ "cbz x12, 108f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "b 120f\n"
+ "108:" // Height 4: no bias
+ "tbz %x[flags], #0, 119f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
+ "bge 117f\n"
+ "tbz x11, #3, 112f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 110f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 109f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "b 116f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x11, #0, 116f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "b 116f\n"
+ "110:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x11, #1, 111f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "b 116f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x11, #0, 116f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "b 116f\n"
+ "112:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x11, #2, 114f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 113f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "b 116f\n"
+ "113:" // Height 4: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x11, #0, 116f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "b 116f\n"
+ "114:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x11, #1, 115f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "b 116f\n"
+ "115:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "116:" // Height 4: Partial accumulate: Done
+ "sub x9, x9, x20\n"
+ "b 118f\n"
+ "117:" // Height 4: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "118:" // Height 4: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 120f\n"
+ "119:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "120:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "121:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 122f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 123f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "b 123f\n"
+ "122:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "123:" // Height 4: input setup done
+ "cmp x27, #0x4\n"
+ "blt 126f\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ "cmp x27, #0x8\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "blt 125f\n"
+ "124:" // Height 4: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ "ldr q7, [x10, #0x10]\n"
+ "bge 124b\n"
+ "125:" // Height 4: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
+ "126:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 129f\n"
+ "cbz x27, 129f\n"
+ "tbz x27, #1, 127f\n"
+ "ldr d0, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "ldr d3, [x23], #0x8\n"
+ "tbz x27, #0, 128f\n"
+ "ld1 { v0.s }[2], [x26]\n"
+ "ld1 { v1.s }[2], [x25]\n"
+ "ld1 { v2.s }[2], [x24]\n"
+ "ld1 { v3.s }[2], [x23]\n"
+ "b 128f\n"
+ "127:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x26, #0x0]\n"
+ "ldr s1, [x25, #0x0]\n"
+ "ldr s2, [x24, #0x0]\n"
+ "ldr s3, [x23, #0x0]\n"
+ "128:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e5aec08 // bfmmla v8.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec50 // bfmmla v16.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e59ec0c // bfmmla v12.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec54 // bfmmla v20.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n"
+ ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n"
+ ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n"
+ ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n"
+ "129:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 121b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "tbz %x[flags], #1, 130f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "fmin v6.4s, v6.4s, v26.4s\n"
+ "fmin v12.4s, v12.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v26.4s\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v8.4s, v8.4s, v26.4s\n"
+ "fmin v9.4s, v9.4s, v26.4s\n"
+ "fmin v10.4s, v10.4s, v26.4s\n"
+ "fmin v11.4s, v11.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmax v6.4s, v6.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v25.4s\n"
+ "fmax v14.4s, v14.4s, v25.4s\n"
+ "fmax v8.4s, v8.4s, v25.4s\n"
+ "fmax v9.4s, v9.4s, v25.4s\n"
+ "fmax v10.4s, v10.4s, v25.4s\n"
+ "fmax v11.4s, v11.4s, v25.4s\n"
+ "fmax v15.4s, v15.4s, v25.4s\n"
+ "fmax v20.4s, v20.4s, v25.4s\n"
+ "fmax v21.4s, v21.4s, v25.4s\n"
+ "fmax v22.4s, v22.4s, v25.4s\n"
+ "fmax v16.4s, v16.4s, v25.4s\n"
+ "fmax v17.4s, v17.4s, v25.4s\n"
+ "fmax v18.4s, v18.4s, v25.4s\n"
+ "fmax v19.4s, v19.4s, v25.4s\n"
+ "130:" // Height 4: No activation
+ "cmp x11, #0x10\n"
+ "bge 139f\n"
+ "tbz x11, #3, 134f\n"
+ "st1 { v6.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 132f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 131f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x11, #0, 138f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "b 138f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 138f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "b 138f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 133f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x11, #0, 138f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "b 138f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 138f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "b 138f\n"
+ "134:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 136f\n"
+ "st1 { v6.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 135f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x11, #0, 138f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "b 138f\n"
+ "135:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 138f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "b 138f\n"
+ "136:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 137f\n"
+ "str d6, [x9], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x11, #0, 138f\n"
+ "st1 { v6.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "b 138f\n"
+ "137:" // Height 4: Partial direct writeback: partial_1_0
+ "str s6, [x9, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "138:" // Height 4: Partial direct writeback: Done
+ "b 140f\n"
+ "139:" // Height 4: Full writeback
+ "str q6, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "140:" // Height 4: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 107b\n"
+ "b 212f\n"
+ "141:" // Height 5
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "142:" // Height 5: Column loop
+ "cbz x12, 143f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v28.16b, v12.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v29.16b, v13.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v30.16b, v14.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v15.16b\n"
+ "b 155f\n"
+ "143:" // Height 5: no bias
+ "tbz %x[flags], #0, 154f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
+ "bge 152f\n"
+ "tbz x11, #3, 147f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 145f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v27.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 144f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d6, [x22], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v6.s }[2], [x22]\n"
+ "b 151f\n"
+ "144:" // Height 5: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x11, #0, 151f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s6, [x22, #0x0]\n"
+ "b 151f\n"
+ "145:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x11, #1, 146f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "b 151f\n"
+ "146:" // Height 5: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x11, #0, 151f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "b 151f\n"
+ "147:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x11, #2, 149f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 148f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "b 151f\n"
+ "148:" // Height 5: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x11, #0, 151f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "b 151f\n"
+ "149:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x11, #1, 150f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "b 151f\n"
+ "150:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "151:" // Height 5: Partial accumulate: Done
+ "sub x9, x9, x20\n"
+ "b 153f\n"
+ "152:" // Height 5: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q25, [x22, #0x0]\n"
+ "ldr q26, [x22, #0x10]\n"
+ "ldr q27, [x22, #0x20]\n"
+ "ldr q6, [x22, #0x30]\n"
+ "153:" // Height 5: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 155f\n"
+ "154:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "155:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "156:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 157f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 158f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "b 158f\n"
+ "157:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "158:" // Height 5: input setup done
+ "cmp x27, #0x4\n"
+ "blt 161f\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ "cmp x27, #0x8\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
+ "ld1 { v4.4s }, [x22], #0x10\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "blt 160f\n"
+ "159:" // Height 5: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q3, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ "ldr q5, [x10, #0x30]\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e45ec9d // bfmmla v29.4s, v4.8h, v5.8h\n"
+ "ldr q5, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec9e // bfmmla v30.4s, v4.8h, v5.8h\n"
+ "ldr q5, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x6e45ec0f // bfmmla v15.4s, v0.8h, v5.8h\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ ".inst 0x6e45ec9f // bfmmla v31.4s, v4.8h, v5.8h\n"
+ "ld1 { v4.4s }, [x22], #0x10\n"
+ "ldr q7, [x10, #0x10]\n"
+ "bge 159b\n"
+ "160:" // Height 5: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q3, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x40]\n"
+ ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x60]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n"
+ "161:" // Height 5: Multiply loop: Main loop skip
+ "cbz x27, 164f\n"
+ "cbz x27, 164f\n"
+ "tbz x27, #1, 162f\n"
+ "ldr d0, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "tbz x27, #0, 163f\n"
+ "ld1 { v0.s }[2], [x26]\n"
+ "ld1 { v1.s }[2], [x25]\n"
+ "ld1 { v2.s }[2], [x24]\n"
+ "ld1 { v3.s }[2], [x23]\n"
+ "ld1 { v4.s }[2], [x22]\n"
+ "b 163f\n"
+ "162:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x26, #0x0]\n"
+ "ldr s1, [x25, #0x0]\n"
+ "ldr s2, [x24, #0x0]\n"
+ "ldr s3, [x23, #0x0]\n"
+ "ldr s4, [x22, #0x0]\n"
+ "163:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q5, [x10, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q3, [x10, #0x20]\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec9c // bfmmla v28.4s, v4.8h, v5.8h\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x40]\n"
+ ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x60]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x70]\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n"
+ "164:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 156b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "tbz %x[flags], #1, 165f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v6.4s, v6.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmax v6.4s, v6.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "165:" // Height 5: No activation
+ "cmp x11, #0x10\n"
+ "bge 174f\n"
+ "tbz x11, #3, 169f\n"
+ "st1 { v6.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 167f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 166f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "tbz x11, #0, 173f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "b 173f\n"
+ "166:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 173f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "b 173f\n"
+ "167:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 168f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "tbz x11, #0, 173f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "b 173f\n"
+ "168:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 173f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "b 173f\n"
+ "169:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 171f\n"
+ "st1 { v6.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 170f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "tbz x11, #0, 173f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "b 173f\n"
+ "170:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 173f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "b 173f\n"
+ "171:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 172f\n"
+ "str d6, [x9], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x11, #0, 173f\n"
+ "st1 { v6.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "b 173f\n"
+ "172:" // Height 5: Partial direct writeback: partial_1_0
+ "str s6, [x9, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "173:" // Height 5: Partial direct writeback: Done
+ "b 175f\n"
+ "174:" // Height 5: Full writeback
+ "str q6, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "175:" // Height 5: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 142b\n"
+ "b 212f\n"
+ "176:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "177:" // Height 6: Column loop
+ "cbz x12, 178f\n"
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "add x12, x12, #0x40\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v28.16b, v12.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v29.16b, v13.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v30.16b, v14.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v15.16b\n"
+ "b 190f\n"
+ "178:" // Height 6: no bias
+ "tbz %x[flags], #0, 189f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
+ "bge 187f\n"
+ "tbz x11, #3, 182f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 180f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v19.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v27.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 179f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "mov x20, #0x38\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d6, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v6.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 186f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_12
+ "mov x20, #0x30\n"
+ "tbz x11, #0, 186f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s24, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s6, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 186f\n"
+ "180:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x11, #1, 181f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x20, #0x28\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 186f\n"
+ "181:" // Height 6: Partial accumulate: partial_1_8
+ "mov x20, #0x20\n"
+ "tbz x11, #0, 186f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 186f\n"
+ "182:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x11, #2, 184f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 183f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "mov x20, #0x18\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 186f\n"
+ "183:" // Height 6: Partial accumulate: partial_1_4
+ "mov x20, #0x10\n"
+ "tbz x11, #0, 186f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 186f\n"
+ "184:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x11, #1, 185f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x20, #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 186f\n"
+ "185:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x25, #0x0]\n"
+ "mov x20, #0x0\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "186:" // Height 6: Partial accumulate: Done
+ "sub x9, x9, x20\n"
+ "b 188f\n"
+ "187:" // Height 6: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q19, [x24, #0x20]\n"
+ "ldr q24, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q25, [x22, #0x0]\n"
+ "ldr q26, [x22, #0x10]\n"
+ "ldr q27, [x22, #0x20]\n"
+ "ldr q6, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "188:" // Height 6: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 190f\n"
+ "189:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "190:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "191:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 192f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 193f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
+ "b 193f\n"
+ "192:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
+ "193:" // Height 6: input setup done
+ "cmp x27, #0x4\n"
+ "blt 196f\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ "cmp x27, #0x8\n"
+ "ld1 { v4.4s }, [x22], #0x10\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
+ "ld1 { v5.4s }, [x21], #0x10\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "blt 195f\n"
+ "194:" // Height 6: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "ld1 { v1.4s }, [x25], #0x10\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ "ldr q5, [x10, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ld1 { v3.4s }, [x23], #0x10\n"
+ ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec9d // bfmmla v29.4s, v4.8h, v5.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "ld1 { v5.4s }, [x21], #0x10\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x26], #0x10\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ "ld1 { v2.4s }, [x24], #0x10\n"
+ ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ "ld1 { v4.4s }, [x22], #0x10\n"
+ "ldr q7, [x10, #0x10]\n"
+ "bge 194b\n"
+ "195:" // Height 6: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "sub x27, x27, #0x4\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q3, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ "ldr q1, [x10, #0x30]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x40]\n"
+ ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x60]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n"
+ "196:" // Height 6: Multiply loop: Main loop skip
+ "cbz x27, 199f\n"
+ "cbz x27, 199f\n"
+ "tbz x27, #1, 197f\n"
+ "ldr d0, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d5, [x21], #0x8\n"
+ "tbz x27, #0, 198f\n"
+ "ld1 { v0.s }[2], [x26]\n"
+ "ld1 { v1.s }[2], [x25]\n"
+ "ld1 { v2.s }[2], [x24]\n"
+ "ld1 { v3.s }[2], [x23]\n"
+ "ld1 { v4.s }[2], [x22]\n"
+ "ld1 { v5.s }[2], [x21]\n"
+ "b 198f\n"
+ "197:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x26, #0x0]\n"
+ "ldr s1, [x25, #0x0]\n"
+ "ldr s2, [x24, #0x0]\n"
+ "ldr s3, [x23, #0x0]\n"
+ "ldr s4, [x22, #0x0]\n"
+ "ldr s5, [x21, #0x0]\n"
+ "198:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q3, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x40]\n"
+ ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n"
+ "ldr q3, [x10, #0x60]\n"
+ ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n"
+ ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n"
+ ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n"
+ "199:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 191b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "tbz %x[flags], #1, 200f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "fmin v6.4s, v6.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v1.4s\n"
+ "fmin v13.4s, v13.4s, v1.4s\n"
+ "fmin v14.4s, v14.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v1.4s\n"
+ "fmin v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v1.4s\n"
+ "fmin v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v1.4s\n"
+ "fmin v22.4s, v22.4s, v1.4s\n"
+ "fmin v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v1.4s\n"
+ "fmin v19.4s, v19.4s, v1.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmax v6.4s, v6.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v0.4s\n"
+ "fmax v13.4s, v13.4s, v0.4s\n"
+ "fmax v14.4s, v14.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v0.4s\n"
+ "fmax v20.4s, v20.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v0.4s\n"
+ "fmax v22.4s, v22.4s, v0.4s\n"
+ "fmax v16.4s, v16.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v0.4s\n"
+ "fmax v19.4s, v19.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v0.4s\n"
+ "fmax v29.4s, v29.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v0.4s\n"
+ "200:" // Height 6: No activation
+ "cmp x11, #0x10\n"
+ "bge 209f\n"
+ "tbz x11, #3, 204f\n"
+ "st1 { v6.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v9.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 202f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v29.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 201f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x25], #0x8\n"
+ "str d22, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x11, #0, 208f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x25]\n"
+ "st1 { v22.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "b 208f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 208f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "b 208f\n"
+ "202:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 203f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x25], #0x8\n"
+ "str d21, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d29, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x11, #0, 208f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "b 208f\n"
+ "203:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 208f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x25, #0x0]\n"
+ "str s21, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "b 208f\n"
+ "204:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 206f\n"
+ "st1 { v6.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x25], #0x10\n"
+ "st1 { v15.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 205f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x25], #0x8\n"
+ "str d20, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d28, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x11, #0, 208f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x25]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "b 208f\n"
+ "205:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 208f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x25, #0x0]\n"
+ "str s20, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s28, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "b 208f\n"
+ "206:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 207f\n"
+ "str d6, [x9], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x11, #0, 208f\n"
+ "st1 { v6.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x25]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "b 208f\n"
+ "207:" // Height 6: Partial direct writeback: partial_1_0
+ "str s6, [x9, #0x0]\n"
+ "str s8, [x25, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "208:" // Height 6: Partial direct writeback: Done
+ "b 210f\n"
+ "209:" // Height 6: Full writeback
+ "str q6, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x25, #0x0]\n"
+ "str q9, [x25, #0x10]\n"
+ "str q10, [x25, #0x20]\n"
+ "str q11, [x25, #0x30]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q20, [x24, #0x10]\n"
+ "str q21, [x24, #0x20]\n"
+ "str q22, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q28, [x22, #0x10]\n"
+ "str q29, [x22, #0x20]\n"
+ "str q30, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "210:" // Height 6: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 177b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 212f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 211f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "211:" // Update direct input
+ "mov x20, #0x18\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "212:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
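
The store tail above is a predicated writeback: the remaining column count (x11, already known to be below 16 at that point) is tested bit by bit with tbz, widest chunk first, so bits 3/2/1/0 select 8-, 4-, 2- and 1-element stores per output row. A minimal self-contained C++ sketch of the same decomposition (illustrative names only, not the library's code):

#include <cstring>

// Mirrors the "tbz x11, #3/#2/#1/#0" ladder above: peel the remaining
// column count into fixed-width stores, largest chunk first.
// (Illustrative sketch; the kernel does this per output row with
// q/d/s-register stores rather than memcpy.)
void partial_writeback(float *dst, const float *acc, unsigned n_left) // n_left < 16
{
    unsigned i = 0;
    if (n_left & 8) { std::memcpy(dst + i, acc + i, 8 * sizeof(float)); i += 8; }
    if (n_left & 4) { std::memcpy(dst + i, acc + i, 4 * sizeof(float)); i += 4; }
    if (n_left & 2) { std::memcpy(dst + i, acc + i, 2 * sizeof(float)); i += 2; }
    if (n_left & 1) { dst[i] = acc[i]; }
}

Testing the widest bit first gives every possible tail length a fixed, loop-free path, which is why the assembly carries one labelled block per bit pattern instead of a store loop.
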
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
index caef6396be..bfc9c7e8f9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
@@ -44,7 +44,8 @@ void a64_hybrid_s8qa_dot_4x16_a55( ARGLIST );
class cls_a64_hybrid_s8qa_dot_4x16
{
public:
- typedef int8_t operand_type;
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
typedef int8_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,24 @@ public:
return false;
}
- StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 7.5301 };
- default:
- return { 27.5482 };
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 7.5301 };
+            case CPUModel::A510:
+                return { 14.81 };
+            case CPUModel::V1:
+                return { 44.54 };
+            default:
+                return { 27.5482 };
+ }
}
+
+ return { 1.0 };
}
// Default to the generic kernel
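
The header change above splits the old operand_type into lhs_operand_type/rhs_operand_type and makes get_performance_parameters a template over the operand type, so tuned throughput figures are reported only for int8_t and every other type falls back to a neutral { 1.0 }. A self-contained sketch of that per-type dispatch, using the numbers from the diff (names are illustrative stand-ins, not the library's API):

#include <cstdint>
#include <type_traits>

enum class CpuModel { Generic, A55r1, A510, V1 };
struct PerfParams { double kernel_throughput; };   // stand-in for PerformanceParameters

template <typename T>
PerfParams get_perf_params(CpuModel m)
{
    if (std::is_same<T, int8_t>::value) {   // only the profiled type gets tuned numbers
        switch (m) {
            case CpuModel::A55r1: return { 7.5301 };
            case CpuModel::A510:  return { 14.81 };
            case CpuModel::V1:    return { 44.54 };
            default:              return { 27.5482 };
        }
    }
    return { 1.0 };                         // neutral cost for unprofiled types
}

For example, get_perf_params<int8_t>(CpuModel::A510) yields { 14.81 }, while get_perf_params<float>(CpuModel::A510) yields the neutral { 1.0 }; this lets the scheduler compare kernels per data type without inventing figures for types the kernel was never profiled with.
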
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
index 11aa05a9b7..eac0e7167e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -78,341 +78,328 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x4\n"
"bge 91f\n"
"cmp %x[M], #0x2\n"
"bgt 61f\n"
"beq 31f\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v15.16b, #0x1\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x9, %x[col_bias]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "mov x28, %x[output_ptr]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
"3:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x11, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 6f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "cbnz x11, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x20\n"
"b 6f\n"
"5:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x9, %x[input_ptr]\n"
"6:" // Height 1: input setup done
- "cmp x26, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 11f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q4, [x10, #0x0]\n"
- "cmp x26, #0x20\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr d5, [x10, #0x10]\n"
- "ldr x24, [x10, #0x18]\n"
- "add x25, x25, #0x10\n"
- "ldr d6, [x10, #0x20]\n"
- "ldr x23, [x10, #0x28]\n"
- "mov v5.d[1], x24\n"
- "ldr d7, [x10, #0x30]\n"
- "ldr x19, [x10, #0x38]\n"
+ "ldr d21, [x12, #0x70]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v6.d[1], x23\n"
- "ldr d8, [x10, #0x40]\n"
+ "ldr d20, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "mov v7.d[1], x19\n"
- "ldr x23, [x10, #0x48]\n"
+ "ldr d26, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr d9, [x10, #0x50]\n"
- "ldr x19, [x10, #0x58]\n"
- "mov v8.d[1], x23\n"
- "ldr d10, [x10, #0x60]\n"
- "ldr x23, [x10, #0x68]\n"
+ "ldr d25, [x12, #0xa0]\n"
+ "mov v21.d[1], x20\n"
+ "ldr x20, [x12, #0x88]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "mov v9.d[1], x19\n"
- "ldr d4, [x10, #0x70]\n"
+ "ldr d24, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "mov v10.d[1], x23\n"
- "ldr x19, [x10, #0x78]\n"
+ "ldr d23, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr d5, [x10, #0x80]\n"
- "ldr x24, [x10, #0x88]\n"
- "mov v4.d[1], x19\n"
- "ldr d6, [x10, #0x90]\n"
- "ldr x23, [x10, #0x98]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v5.d[1], x24\n"
- "ldr d7, [x10, #0xa0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- "mov v6.d[1], x23\n"
- "ldr x19, [x10, #0xa8]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "ldr d8, [x10, #0xb0]\n"
- "ldr x23, [x10, #0xb8]\n"
- "mov v7.d[1], x19\n"
- "ldr d9, [x10, #0xc0]\n"
- "ldr x19, [x10, #0xc8]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- "mov v8.d[1], x23\n"
- "ldr d10, [x10, #0xd0]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- "mov v9.d[1], x19\n"
- "ldr x23, [x10, #0xd8]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- "ldr d4, [x10, #0xe0]\n"
- "ldr x19, [x10, #0xe8]\n"
- "mov v10.d[1], x23\n"
- "ldr d5, [x10, #0xf0]\n"
- "ldr x24, [x10, #0xf8]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- "mov v4.d[1], x19\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- "mov v5.d[1], x24\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ "ldr d22, [x12, #0xd0]\n"
+ ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr d21, [x12, #0xe0]\n"
+ "mov v20.d[1], x20\n"
+ "ldr x20, [x12, #0x98]\n"
+ "mov v26.d[1], x20\n"
+ "ldr x20, [x12, #0xa8]\n"
+ "mov v25.d[1], x20\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "mov v24.d[1], x20\n"
+ "ldr x23, [x12, #0xc8]\n"
+ ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr d20, [x12, #0xf0]\n"
+ ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n"
+ "ldr x22, [x12, #0xd8]\n"
+ ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n"
+ "ldr x21, [x12, #0xe8]\n"
+ ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n"
+ "ldr x20, [x12, #0xf8]\n"
+ "mov v23.d[1], x23\n"
+ "mov v22.d[1], x22\n"
+ "add x9, x9, #0x10\n"
+ "mov v21.d[1], x21\n"
+ "add x12, x12, #0x100\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 8f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x10\n"
- "ldr q0, [x25, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q4, [x12, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x10, #0x10]\n"
- "ldr q6, [x10, #0x20]\n"
- "sub x26, x26, #0x10\n"
- "ldr q7, [x10, #0x30]\n"
- "add x25, x25, #0x10\n"
+ "ldr q21, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q8, [x10, #0x40]\n"
+ "ldr q20, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x10, #0x50]\n"
+ "ldr q26, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q10, [x10, #0x60]\n"
+ "ldr q25, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q4, [x10, #0x70]\n"
+ "ldr q24, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q5, [x10, #0x80]\n"
+ "ldr q23, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr q8, [x10, #0xb0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "ldr q9, [x10, #0xc0]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- "ldr q10, [x10, #0xd0]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- "ldr q4, [x10, #0xe0]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- "ldr q5, [x10, #0xf0]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ "ldr q22, [x12, #0xd0]\n"
+ ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr q21, [x12, #0xe0]\n"
+ ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr q20, [x12, #0xf0]\n"
+ ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n"
+ "sub x10, x10, #0x10\n"
+ ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 10f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"10:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"11:" // Height 1: Multiply loop: Main loop skip
- "cbz x26, 18f\n"
- "cmp x26, #0x4\n"
+ "cbz x10, 18f\n"
+ "cmp x10, #0x4\n"
"blt 14f\n"
"12:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
"tbnz %x[flags], #31, 13f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q6, [x10, #0x0]\n"
- "sub x26, x26, #0x4\n"
- "ldr q7, [x10, #0x10]\n"
- "cmp x26, #0x4\n"
- "ldr q8, [x10, #0x20]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x10, #0x30]\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q22, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q21, [x12, #0x20]\n"
+ ".inst 0x4f80e290 // sdot v16.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x30]\n"
+ ".inst 0x4f80e2d1 // sdot v17.4s, v22.16b, v0.4b[0]\n"
+ ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n"
"bge 12b\n"
- "cbz x26, 18f\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x26, #1, 15f\n"
- "ldr h0, [x25], #0x2\n"
- "tbz x26, #0, 16f\n"
- "ld1 { v0.b }[2], [x25]\n"
+ "cbz x10, 18f\n"
+ "tbz x10, #1, 15f\n"
+ "ldr h0, [x9], #0x2\n"
+ "tbz x10, #0, 16f\n"
+ "ld1 { v0.b }[2], [x9]\n"
"b 16f\n"
"15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
"16:" // Height 1: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 17f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"17:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q10, [x10, #0x0]\n"
- "ldr q4, [x10, #0x10]\n"
- "ldr q5, [x10, #0x20]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x0]\n"
+ ".inst 0x4f80e290 // sdot v16.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x10]\n"
+ ".inst 0x4f80e291 // sdot v17.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x20]\n"
+ ".inst 0x4f80e292 // sdot v18.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x30]\n"
+ ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
"18:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 4b\n"
- "prfm pstl1keep, [x28, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
"tbnz %x[flags], #31, 19f\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v1.4s }, [x22]\n"
- "neg v1.4s, v1.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "neg v20.4s, v20.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "mul v11.4s, v11.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v20.4s\n"
"19:" // Height 1: skip row sum fixup
+ "ldr q23, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q22, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
+ "ldr q21, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
+ "ldr q20, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q0, [x9, #0x0]\n"
+ "add v16.4s, v16.4s, v23.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v20.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "ldr q1, [x9, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ldr q2, [x9, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "ldr q3, [x9, #0x30]\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "ld1r { v4.4s }, [x22]\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add x9, x9, #0x40\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v20.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 20f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v0.16b\n"
+ "and v21.16b, v18.16b, v0.16b\n"
+ "and v20.16b, v19.16b, v0.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
"20:" // Height 1: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "cmp x11, #0x10\n"
- "ld1r { v6.4s }, [x22]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v20.4s\n"
+ "add v17.4s, v17.4s, v20.4s\n"
+ "add v18.4s, v18.4s, v20.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
+ "cmp x14, #0x10\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 29f\n"
- "tbz x11, #3, 24f\n"
- "str d16, [x28], #0x8\n"
- "tbz x11, #2, 22f\n"
- "st1 { v16.s }[2], [x28], #0x4\n"
- "tbz x11, #1, 21f\n"
- "st1 { v16.h }[6], [x28], #0x2\n"
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[14], [x28]\n"
+ "tbz x14, #3, 24f\n"
+ "str d16, [x13], #0x8\n"
+ "tbz x14, #2, 22f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "tbz x14, #1, 21f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[14], [x13]\n"
"b 28f\n"
"21:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[12], [x28]\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[12], [x13]\n"
"b 28f\n"
"22:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x11, #1, 23f\n"
- "st1 { v16.h }[4], [x28], #0x2\n"
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[10], [x28]\n"
+ "tbz x14, #1, 23f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[10], [x13]\n"
"b 28f\n"
"23:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[8], [x28]\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[8], [x13]\n"
"b 28f\n"
"24:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x11, #2, 26f\n"
- "str s16, [x28], #0x4\n"
- "tbz x11, #1, 25f\n"
- "st1 { v16.h }[2], [x28], #0x2\n"
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[6], [x28]\n"
+ "tbz x14, #2, 26f\n"
+ "str s16, [x13], #0x4\n"
+ "tbz x14, #1, 25f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[6], [x13]\n"
"b 28f\n"
"25:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[4], [x28]\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[4], [x13]\n"
"b 28f\n"
"26:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x11, #1, 27f\n"
- "str h16, [x28], #0x2\n"
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[2], [x28]\n"
+ "tbz x14, #1, 27f\n"
+ "str h16, [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[2], [x13]\n"
"b 28f\n"
"27:" // Height 1: Partial direct writeback: partial_1_0
- "str b16, [x28, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
"28:" // Height 1: Partial direct writeback: Done
"b 30f\n"
"29:" // Height 1: Full writeback
- "str q16, [x28, #0x0]\n"
- "add x28, x28, #0x10\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
"30:" // Height 1: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 2b\n"
"b 122f\n"
"31:" // Height 2
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "movi v15.16b, #0x1\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x9, %x[col_bias]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "mov x28, %x[output_ptr]\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"32:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -423,319 +410,307 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
"33:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x11, #0x0\n"
"34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 35f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "cbnz x27, 36f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x11, 36f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
"b 36f\n"
"35:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x22, x25, x19\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
"36:" // Height 2: input setup done
- "cmp x26, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 41f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 39f\n"
"37:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr d5, [x10, #0x10]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x24, [x10, #0x18]\n"
- "ldr d6, [x10, #0x20]\n"
- "add x25, x25, #0x10\n"
- "ldr x23, [x10, #0x28]\n"
- "add x22, x22, #0x10\n"
- "mov v5.d[1], x24\n"
- "ldr d7, [x10, #0x30]\n"
- "ldr x19, [x10, #0x38]\n"
+ "ldr d25, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v6.d[1], x23\n"
+ "mov v25.d[1], x20\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr d8, [x10, #0x40]\n"
+ "ldr d24, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "mov v7.d[1], x19\n"
+ "ldr x23, [x12, #0x88]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr x23, [x10, #0x48]\n"
+ "ldr d30, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr d9, [x10, #0x50]\n"
+ "ldr x22, [x12, #0x98]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr x19, [x10, #0x58]\n"
- "mov v8.d[1], x23\n"
- "ldr d10, [x10, #0x60]\n"
- "ldr x23, [x10, #0x68]\n"
+ "ldr d29, [x12, #0xa0]\n"
+ "ldr x21, [x12, #0xa8]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "mov v9.d[1], x19\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr d4, [x10, #0x70]\n"
+ "ldr d28, [x12, #0xb0]\n"
+ "ldr x20, [x12, #0xb8]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "mov v10.d[1], x23\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr x19, [x10, #0x78]\n"
+ "ldr d27, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr d5, [x10, #0x80]\n"
+ "mov v24.d[1], x23\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr x24, [x10, #0x88]\n"
- "mov v4.d[1], x19\n"
- "ldr d6, [x10, #0x90]\n"
- "ldr x23, [x10, #0x98]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v5.d[1], x24\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr d7, [x10, #0xa0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- "mov v6.d[1], x23\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr x19, [x10, #0xa8]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "ldr d8, [x10, #0xb0]\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- "ldr x23, [x10, #0xb8]\n"
- "mov v7.d[1], x19\n"
- "ldr d9, [x10, #0xc0]\n"
- "ldr x19, [x10, #0xc8]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- "mov v8.d[1], x23\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- "ldr d10, [x10, #0xd0]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- "mov v9.d[1], x19\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- "ldr x23, [x10, #0xd8]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- "ldr d4, [x10, #0xe0]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- "ldr x19, [x10, #0xe8]\n"
- "mov v10.d[1], x23\n"
- "ldr d5, [x10, #0xf0]\n"
- "ldr x24, [x10, #0xf8]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- "mov v4.d[1], x19\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- "mov v5.d[1], x24\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ "ldr d26, [x12, #0xd0]\n"
+ ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n"
+ "mov v30.d[1], x22\n"
+ ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr d25, [x12, #0xe0]\n"
+ "mov v29.d[1], x21\n"
+ "ldr x23, [x12, #0xc8]\n"
+ "mov v28.d[1], x20\n"
+ "ldr x22, [x12, #0xd8]\n"
+ "ldr x21, [x12, #0xe8]\n"
+ ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr d24, [x12, #0xf0]\n"
+ "ldr x20, [x12, #0xf8]\n"
+ ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+ "mov v27.d[1], x23\n"
+ ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n"
+ "mov v26.d[1], x22\n"
+ ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ "add x28, x28, #0x10\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 38f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"38:" // Height 2: Multiply loop: unique 5: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x26, #0x20\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"bge 37b\n"
"39:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x10, #0x10]\n"
+ "sub x10, x10, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "ldr q7, [x10, #0x30]\n"
- "sub x26, x26, #0x10\n"
+ "ldr q25, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q8, [x10, #0x40]\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q9, [x10, #0x50]\n"
+ "ldr q24, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q10, [x10, #0x60]\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q4, [x10, #0x70]\n"
+ "ldr q30, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q5, [x10, #0x80]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x90]\n"
+ "ldr q29, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q7, [x10, #0xa0]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q8, [x10, #0xb0]\n"
+ "ldr q28, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "add x25, x25, #0x10\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x10, #0xc0]\n"
+ "ldr q27, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "add x22, x22, #0x10\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x10, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x10, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x10, #0xf0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ "ldr q26, [x12, #0xd0]\n"
+ ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr q25, [x12, #0xe0]\n"
+ ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr q24, [x12, #0xf0]\n"
+ ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+ ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 40f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"40:" // Height 2: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"41:" // Height 2: Multiply loop: Main loop skip
- "cbz x26, 48f\n"
- "cmp x26, #0x4\n"
+ "cbz x10, 48f\n"
+ "cmp x10, #0x4\n"
"blt 44f\n"
"42:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "ldr s1, [x22], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
"tbnz %x[flags], #31, 43f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"43:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q6, [x10, #0x0]\n"
- "sub x26, x26, #0x4\n"
- "ldr q7, [x10, #0x10]\n"
- "cmp x26, #0x4\n"
- "ldr q8, [x10, #0x20]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x10, #0x30]\n"
- ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
+ "ldr q27, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q26, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q25, [x12, #0x20]\n"
+ ".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n"
+ "ldr q24, [x12, #0x30]\n"
+ ".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n"
+ ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n"
+ ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n"
"bge 42b\n"
- "cbz x26, 48f\n"
"44:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x26, #1, 45f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x22], #0x2\n"
- "tbz x26, #0, 46f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x22]\n"
+ "cbz x10, 48f\n"
+ "tbz x10, #1, 45f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "tbz x10, #0, 46f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
"b 46f\n"
"45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x22, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
"46:" // Height 2: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 47f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"47:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q10, [x10, #0x0]\n"
- "ldr q4, [x10, #0x10]\n"
- "ldr q5, [x10, #0x20]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
+ "ldr q24, [x12, #0x0]\n"
+ ".inst 0x4f80e310 // sdot v16.4s, v24.16b, v0.4b[0]\n"
+ "ldr q26, [x12, #0x10]\n"
+ ".inst 0x4f81e314 // sdot v20.4s, v24.16b, v1.4b[0]\n"
+ "ldr q25, [x12, #0x20]\n"
+ ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n"
+ "ldr q24, [x12, #0x30]\n"
+ ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n"
+ ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n"
"48:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 34b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x21, x28, x19\n"
- "prfm pstl1keep, [x21, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbnz %x[flags], #31, 49f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x22]\n"
- "neg v2.4s, v2.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "neg v24.4s, v24.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "mul v11.4s, v11.4s, v2.4s\n"
- "mul v12.4s, v12.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v24.4s\n"
+ "mul v12.4s, v12.4s, v24.4s\n"
"49:" // Height 2: skip row sum fixup
+ "ldr q27, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q26, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
+ "ldr q25, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
+ "ldr q24, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
- "ldr q0, [x9, #0x0]\n"
+ "add v16.4s, v16.4s, v27.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v24.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "ldr q1, [x9, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ldr q2, [x9, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "ldr q3, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "ld1r { v0.4s }, [x23]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v24.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 50f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v24.16b, v16.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
+ "and v30.16b, v17.16b, v0.16b\n"
+ "and v29.16b, v18.16b, v0.16b\n"
+ "and v28.16b, v19.16b, v0.16b\n"
+ "and v27.16b, v20.16b, v0.16b\n"
+ "and v26.16b, v21.16b, v0.16b\n"
+ "and v25.16b, v22.16b, v0.16b\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v29.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sqadd v21.4s, v21.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"50:" // Height 2: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
@@ -745,122 +720,122 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "cmp x11, #0x10\n"
- "ld1r { v6.4s }, [x22]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v24.4s\n"
+ "add v18.4s, v18.4s, v24.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v24.4s\n"
+ "add v21.4s, v21.4s, v24.4s\n"
+ "add v22.4s, v22.4s, v24.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v24.4s\n"
+ "smin v17.4s, v17.4s, v24.4s\n"
+ "smin v18.4s, v18.4s, v24.4s\n"
+ "smin v19.4s, v19.4s, v24.4s\n"
+ "smin v20.4s, v20.4s, v24.4s\n"
+ "smin v21.4s, v21.4s, v24.4s\n"
+ "smin v22.4s, v22.4s, v24.4s\n"
+ "smin v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v17.8h, v22.8h, v23.8h\n"
+ "cmp x14, #0x10\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 59f\n"
- "tbz x11, #3, 54f\n"
- "str d16, [x28], #0x8\n"
- "str d20, [x21], #0x8\n"
- "tbz x11, #2, 52f\n"
- "st1 { v16.s }[2], [x28], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "tbz x11, #1, 51f\n"
- "st1 { v16.h }[6], [x28], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[14], [x28]\n"
- "st1 { v20.b }[14], [x21]\n"
+ "tbz x14, #3, 54f\n"
+ "str d16, [x13], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "tbz x14, #2, 52f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "tbz x14, #1, 51f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[14], [x13]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 58f\n"
"51:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[12], [x28]\n"
- "st1 { v20.b }[12], [x21]\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[12], [x13]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 58f\n"
"52:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x11, #1, 53f\n"
- "st1 { v16.h }[4], [x28], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[10], [x28]\n"
- "st1 { v20.b }[10], [x21]\n"
+ "tbz x14, #1, 53f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[10], [x13]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 58f\n"
"53:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[8], [x28]\n"
- "st1 { v20.b }[8], [x21]\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[8], [x13]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 58f\n"
"54:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x11, #2, 56f\n"
- "str s16, [x28], #0x4\n"
- "str s20, [x21], #0x4\n"
- "tbz x11, #1, 55f\n"
- "st1 { v16.h }[2], [x28], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[6], [x28]\n"
- "st1 { v20.b }[6], [x21]\n"
+ "tbz x14, #2, 56f\n"
+ "str s16, [x13], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "tbz x14, #1, 55f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[6], [x13]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 58f\n"
"55:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[4], [x28]\n"
- "st1 { v20.b }[4], [x21]\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[4], [x13]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 58f\n"
"56:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x11, #1, 57f\n"
- "str h16, [x28], #0x2\n"
- "str h20, [x21], #0x2\n"
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[2], [x28]\n"
- "st1 { v20.b }[2], [x21]\n"
+ "tbz x14, #1, 57f\n"
+ "str h16, [x13], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[2], [x13]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 58f\n"
"57:" // Height 2: Partial direct writeback: partial_1_0
- "str b16, [x28, #0x0]\n"
- "str b20, [x21, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
"59:" // Height 2: Full writeback
- "str q16, [x28, #0x0]\n"
- "add x28, x28, #0x10\n"
- "str q20, [x21, #0x0]\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q20, [x23, #0x0]\n"
"60:" // Height 2: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 32b\n"
"b 122f\n"
"61:" // Height 3
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"movi v15.16b, #0x1\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[col_bias]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "mov x28, %x[output_ptr]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"62:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -875,325 +850,317 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
"63:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x11, #0x0\n"
"64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 65f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "ldr x21, [x20, #0x10]\n"
- "cbnz x27, 66f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "cbnz x11, 66f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
+ "add x27, x27, x20\n"
"b 66f\n"
"65:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x22, x25, x19\n"
- "add x21, x22, x19\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
+ "add x27, x28, x21\n"
"66:" // Height 3: input setup done
- "cmp x26, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 71f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 69f\n"
"67:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr d5, [x10, #0x10]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x24, [x10, #0x18]\n"
+ "ldr x23, [x12, #0x88]\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr d6, [x10, #0x20]\n"
- "ldr x23, [x10, #0x28]\n"
- "add x25, x25, #0x10\n"
- "mov v5.d[1], x24\n"
- "ldr d7, [x10, #0x30]\n"
- "ldr x19, [x10, #0x38]\n"
- "add x22, x22, #0x10\n"
+ "ldr d29, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v6.d[1], x23\n"
+ "mov v29.d[1], x20\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr d8, [x10, #0x40]\n"
+ "ldr x22, [x12, #0x98]\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "mov v7.d[1], x19\n"
+ "ldr d28, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr x23, [x10, #0x48]\n"
+ "ldr x21, [x12, #0xa8]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr d9, [x10, #0x50]\n"
+ "ldr x20, [x12, #0xb8]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr x19, [x10, #0x58]\n"
+ "ldr d5, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "mov v8.d[1], x23\n"
+ "mov v28.d[1], x23\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr d10, [x10, #0x60]\n"
+ "mov v5.d[1], x22\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "mov v9.d[1], x19\n"
+ "ldr d4, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr x23, [x10, #0x68]\n"
+ "mov v4.d[1], x21\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr d4, [x10, #0x70]\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr x19, [x10, #0x78]\n"
+ "ldr d3, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "mov v10.d[1], x23\n"
+ "mov v3.d[1], x20\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr d5, [x10, #0x80]\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "mov v4.d[1], x19\n"
+ "ldr d31, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr x24, [x10, #0x88]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr d6, [x10, #0x90]\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr x23, [x10, #0x98]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v5.d[1], x24\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr d7, [x10, #0xa0]\n"
- ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
- "mov v6.d[1], x23\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr x19, [x10, #0xa8]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr d8, [x10, #0xb0]\n"
- ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr x23, [x10, #0xb8]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "mov v7.d[1], x19\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- "ldr d9, [x10, #0xc0]\n"
- ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
- "mov v8.d[1], x23\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- "ldr x19, [x10, #0xc8]\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- "ldr d10, [x10, #0xd0]\n"
- ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
- "ldr x23, [x10, #0xd8]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- "mov v9.d[1], x19\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- "ldr d4, [x10, #0xe0]\n"
- ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
- "mov v10.d[1], x23\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- "ldr x19, [x10, #0xe8]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- "ldr d5, [x10, #0xf0]\n"
- ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
- "ldr x24, [x10, #0xf8]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- "mov v4.d[1], x19\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- "add x21, x21, #0x10\n"
- ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
- "mov v5.d[1], x24\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ "ldr d30, [x12, #0xd0]\n"
+ ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n"
+ "mov v31.d[1], x23\n"
+ ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n"
+ "mov v30.d[1], x22\n"
+ ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr d29, [x12, #0xe0]\n"
+ ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr d28, [x12, #0xf0]\n"
+ ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n"
+ "add x27, x27, #0x10\n"
+ ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 68f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"68:" // Height 3: Multiply loop: unique 9: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x26, #0x20\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"bge 67b\n"
"69:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x10, #0x10]\n"
+ "sub x10, x10, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q29, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q8, [x10, #0x40]\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q9, [x10, #0x50]\n"
+ "add x27, x27, #0x10\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q10, [x10, #0x60]\n"
+ "ldr q28, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q4, [x10, #0x70]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q5, [x10, #0x80]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x90]\n"
+ "ldr q5, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0xa0]\n"
+ "ldr q4, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "add x22, x22, #0x10\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "add x21, x21, #0x10\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x10, #0xb0]\n"
+ "ldr q3, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x10, #0xc0]\n"
+ "ldr q31, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x10, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x10, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x10, #0xf0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ "ldr q30, [x12, #0xd0]\n"
+ ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr q29, [x12, #0xe0]\n"
+ ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr q28, [x12, #0xf0]\n"
+ ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 70f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"70:" // Height 3: Multiply loop: unique 10: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"71:" // Height 3: Multiply loop: Main loop skip
- "cbz x26, 78f\n"
- "cmp x26, #0x4\n"
+ "cbz x10, 78f\n"
+ "cmp x10, #0x4\n"
"blt 74f\n"
"72:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "ldr s1, [x22], #0x4\n"
- "ldr s2, [x21], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
"tbnz %x[flags], #31, 73f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"73:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q6, [x10, #0x0]\n"
- "sub x26, x26, #0x4\n"
- "ldr q7, [x10, #0x10]\n"
- "cmp x26, #0x4\n"
- "ldr q8, [x10, #0x20]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x10, #0x30]\n"
- ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
+ "ldr q31, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q30, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q29, [x12, #0x20]\n"
+ ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n"
+ "ldr q28, [x12, #0x30]\n"
+ ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n"
"bge 72b\n"
- "cbz x26, 78f\n"
"74:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x26, #1, 75f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x22], #0x2\n"
- "ldr h2, [x21], #0x2\n"
- "tbz x26, #0, 76f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x22]\n"
- "ld1 { v2.b }[2], [x21]\n"
+ "cbz x10, 78f\n"
+ "tbz x10, #1, 75f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "tbz x10, #0, 76f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x27]\n"
"b 76f\n"
"75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x22, #0x0]\n"
- "ldr b2, [x21, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x27, #0x0]\n"
"76:" // Height 3: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 77f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q10, [x10, #0x0]\n"
- "ldr q4, [x10, #0x10]\n"
- "ldr q5, [x10, #0x20]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n"
+ "ldr q28, [x12, #0x0]\n"
+ ".inst 0x4f80e390 // sdot v16.4s, v28.16b, v0.4b[0]\n"
+ "ldr q30, [x12, #0x10]\n"
+ ".inst 0x4f81e394 // sdot v20.4s, v28.16b, v1.4b[0]\n"
+ "ldr q29, [x12, #0x20]\n"
+ ".inst 0x4f82e398 // sdot v24.4s, v28.16b, v2.4b[0]\n"
+ "ldr q28, [x12, #0x30]\n"
+ ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n"
"78:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 64b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x21, x28, x19\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19\n"
- "prfm pstl1keep, [x20, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbnz %x[flags], #31, 79f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v3.4s }, [x22]\n"
- "neg v3.4s, v3.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "neg v28.4s, v28.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "mul v11.4s, v11.4s, v3.4s\n"
- "mul v12.4s, v12.4s, v3.4s\n"
- "mul v13.4s, v13.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v28.4s\n"
+ "mul v12.4s, v12.4s, v28.4s\n"
+ "mul v13.4s, v13.4s, v28.4s\n"
"79:" // Height 3: skip row sum fixup
+ "ldr q31, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q30, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
+ "ldr q29, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
+ "ldr q28, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
@@ -1203,77 +1170,73 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"add v25.4s, v25.4s, v13.4s\n"
"add v26.4s, v26.4s, v13.4s\n"
"add v27.4s, v27.4s, v13.4s\n"
- "ldr q0, [x9, #0x0]\n"
+ "add v16.4s, v16.4s, v31.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v31.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v28.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "ldr q1, [x9, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ldr q2, [x9, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "ldr q3, [x9, #0x30]\n"
- "ld1r { v0.4s }, [x23]\n"
- "add x9, x9, #0x40\n"
- "ld1r { v4.4s }, [x22]\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v28.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v28.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v28.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v28.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v28.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v28.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v28.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v28.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v28.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 80f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
+ "and v1.16b, v16.16b, v0.16b\n"
+ "and v31.16b, v17.16b, v0.16b\n"
+ "and v30.16b, v18.16b, v0.16b\n"
+ "and v29.16b, v19.16b, v0.16b\n"
+ "and v28.16b, v20.16b, v0.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "sqadd v17.4s, v17.4s, v31.4s\n"
+ "sqadd v18.4s, v18.4s, v30.4s\n"
+ "sqadd v19.4s, v19.4s, v29.4s\n"
+ "sqadd v20.4s, v20.4s, v28.4s\n"
+ "and v3.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v22.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
+ "and v31.16b, v24.16b, v0.16b\n"
+ "and v30.16b, v25.16b, v0.16b\n"
+ "and v29.16b, v26.16b, v0.16b\n"
+ "and v28.16b, v27.16b, v0.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v3.4s\n"
+ "sqadd v22.4s, v22.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v1.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "sqadd v27.4s, v27.4s, v28.4s\n"
"80:" // Height 3: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
@@ -1287,157 +1250,157 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "cmp x11, #0x10\n"
- "ld1r { v6.4s }, [x22]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v28.4s\n"
+ "add v18.4s, v18.4s, v28.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v28.4s\n"
+ "add v22.4s, v22.4s, v28.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v28.4s\n"
+ "add v25.4s, v25.4s, v28.4s\n"
+ "add v26.4s, v26.4s, v28.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v23.4s, v23.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v28.4s\n"
+ "smax v17.4s, v17.4s, v28.4s\n"
+ "smax v18.4s, v18.4s, v28.4s\n"
+ "smax v19.4s, v19.4s, v28.4s\n"
+ "smax v20.4s, v20.4s, v28.4s\n"
+ "smax v21.4s, v21.4s, v28.4s\n"
+ "smax v22.4s, v22.4s, v28.4s\n"
+ "smax v23.4s, v23.4s, v28.4s\n"
+ "smax v24.4s, v24.4s, v28.4s\n"
+ "smax v25.4s, v25.4s, v28.4s\n"
+ "smax v26.4s, v26.4s, v28.4s\n"
+ "smax v27.4s, v27.4s, v28.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "cmp x14, #0x10\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 89f\n"
- "tbz x11, #3, 84f\n"
- "str d16, [x28], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "tbz x11, #2, 82f\n"
- "st1 { v16.s }[2], [x28], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
- "tbz x11, #1, 81f\n"
- "st1 { v16.h }[6], [x28], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[14], [x28]\n"
- "st1 { v20.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
+ "tbz x14, #3, 84f\n"
+ "str d16, [x13], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x14, #2, 82f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x14, #1, 81f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[14], [x13]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 88f\n"
"81:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[12], [x28]\n"
- "st1 { v20.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[12], [x13]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 88f\n"
"82:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x11, #1, 83f\n"
- "st1 { v16.h }[4], [x28], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[10], [x28]\n"
- "st1 { v20.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
+ "tbz x14, #1, 83f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[10], [x13]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 88f\n"
"83:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[8], [x28]\n"
- "st1 { v20.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[8], [x13]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 88f\n"
"84:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x11, #2, 86f\n"
- "str s16, [x28], #0x4\n"
- "str s20, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
- "tbz x11, #1, 85f\n"
- "st1 { v16.h }[2], [x28], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[6], [x28]\n"
- "st1 { v20.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
+ "tbz x14, #2, 86f\n"
+ "str s16, [x13], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x14, #1, 85f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[6], [x13]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 88f\n"
"85:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[4], [x28]\n"
- "st1 { v20.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[4], [x13]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 88f\n"
"86:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x11, #1, 87f\n"
- "str h16, [x28], #0x2\n"
- "str h20, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[2], [x28]\n"
- "st1 { v20.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
+ "tbz x14, #1, 87f\n"
+ "str h16, [x13], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[2], [x13]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 88f\n"
"87:" // Height 3: Partial direct writeback: partial_1_0
- "str b16, [x28, #0x0]\n"
- "str b20, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"88:" // Height 3: Partial direct writeback: Done
"b 90f\n"
"89:" // Height 3: Full writeback
- "str q16, [x28, #0x0]\n"
- "add x28, x28, #0x10\n"
- "str q20, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"90:" // Height 3: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 62b\n"
"b 122f\n"
"91:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x4\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"movi v14.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"movi v15.16b, #0x1\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x9, %x[col_bias]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "mov x28, %x[output_ptr]\n"
- "mov x19, #0x4\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"92:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -1456,137 +1419,125 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
"93:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x11, #0x0\n"
"94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 95f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "ldr x21, [x20, #0x10]\n"
- "ldr x20, [x20, #0x18]\n"
- "cbnz x27, 96f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "ldr x26, [x20, #0x18]\n"
+ "cbnz x11, 96f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
"b 96f\n"
"95:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x22, x25, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
+ "add x27, x28, x21\n"
+ "add x26, x27, x21\n"
"96:" // Height 4: input setup done
- "cmp x26, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 101f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q3, [x20, #0x0]\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x26, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 99f\n"
"97:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr d5, [x10, #0x10]\n"
+ "ldr x22, [x12, #0x78]\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x24, [x10, #0x18]\n"
+ "ldr x21, [x12, #0x88]\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr d6, [x10, #0x20]\n"
+ "ldr x20, [x12, #0x98]\n"
".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr x23, [x10, #0x28]\n"
- "mov v5.d[1], x24\n"
- "ldr d7, [x10, #0x30]\n"
- "ldr x19, [x10, #0x38]\n"
- "add x25, x25, #0x10\n"
+ "ldr d4, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v6.d[1], x23\n"
+ "mov v4.d[1], x22\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr d8, [x10, #0x40]\n"
+ "ldr x25, [x12, #0xa8]\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "mov v7.d[1], x19\n"
+ "ldr x24, [x12, #0xb8]\n"
".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr x23, [x10, #0x48]\n"
+ "ldr d5, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr d9, [x10, #0x50]\n"
+ "mov v5.d[1], x21\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr x19, [x10, #0x58]\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "mov v8.d[1], x23\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr d10, [x10, #0x60]\n"
+ "ldr d6, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "mov v9.d[1], x19\n"
+ "mov v6.d[1], x20\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr x23, [x10, #0x68]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr d4, [x10, #0x70]\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr x19, [x10, #0x78]\n"
+ "ldr d7, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "mov v10.d[1], x23\n"
+ "mov v7.d[1], x25\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr d5, [x10, #0x80]\n"
+ "add x9, x9, #0x10\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "mov v4.d[1], x19\n"
+ "add x28, x28, #0x10\n"
".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr x24, [x10, #0x88]\n"
+ "ldr d8, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr d6, [x10, #0x90]\n"
+ "mov v8.d[1], x24\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr x23, [x10, #0x98]\n"
+ "add x27, x27, #0x10\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "mov v5.d[1], x24\n"
+ "add x26, x26, #0x10\n"
".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr d7, [x10, #0xa0]\n"
+ "ldr d9, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "mov v6.d[1], x23\n"
+ "mov v9.d[1], x23\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr x19, [x10, #0xa8]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr d8, [x10, #0xb0]\n"
".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr x23, [x10, #0xb8]\n"
+ "ldr d10, [x12, #0xd0]\n"
".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v7.d[1], x19\n"
+ "mov v10.d[1], x22\n"
".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr d9, [x10, #0xc0]\n"
".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
- "mov v8.d[1], x23\n"
".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr x19, [x10, #0xc8]\n"
+ "ldr d4, [x12, #0xe0]\n"
".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr d10, [x10, #0xd0]\n"
+ "mov v4.d[1], x21\n"
".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr x23, [x10, #0xd8]\n"
".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
- "mov v9.d[1], x19\n"
".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr d4, [x10, #0xe0]\n"
+ "ldr d5, [x12, #0xf0]\n"
".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "mov v10.d[1], x23\n"
+ "mov v5.d[1], x20\n"
".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- "ldr x19, [x10, #0xe8]\n"
+ "add x12, x12, #0x100\n"
".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
- "ldr d5, [x10, #0xf0]\n"
".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n"
- "ldr x24, [x10, #0xf8]\n"
".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- "mov v4.d[1], x19\n"
".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- "add x22, x22, #0x10\n"
".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
- "mov v5.d[1], x24\n"
".inst 0x4f83e8fe // sdot v30.4s, v7.16b, v3.4b[2]\n"
- "add x21, x21, #0x10\n"
".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- "add x20, x20, #0x10\n"
".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- "add x10, x10, #0x100\n"
".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
@@ -1611,77 +1562,77 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"98:" // Height 4: Multiply loop: unique 13: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x26, #0x20\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q3, [x20, #0x0]\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x26, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 97b\n"
"99:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x10, #0x10]\n"
+ "sub x10, x10, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q8, [x10, #0x40]\n"
+ "ldr q4, [x12, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q9, [x10, #0x50]\n"
+ "add x27, x27, #0x10\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q10, [x10, #0x60]\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q4, [x10, #0x70]\n"
".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr q5, [x10, #0x80]\n"
+ "ldr q5, [x12, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "add x22, x22, #0x10\n"
".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x90]\n"
+ "ldr q6, [x12, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "add x21, x21, #0x10\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "add x20, x20, #0x10\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0xa0]\n"
+ "ldr q7, [x12, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr q8, [x10, #0xb0]\n"
+ "ldr q8, [x12, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr q9, [x10, #0xc0]\n"
+ "ldr q9, [x12, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr q10, [x10, #0xd0]\n"
+ "ldr q10, [x12, #0xd0]\n"
".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr q4, [x10, #0xe0]\n"
+ "ldr q4, [x12, #0xe0]\n"
".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr q5, [x10, #0xf0]\n"
+ "ldr q5, [x12, #0xf0]\n"
".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "add x10, x10, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n"
@@ -1715,67 +1666,67 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"100:" // Height 4: Multiply loop: unique 14: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"101:" // Height 4: Multiply loop: Main loop skip
- "cbz x26, 108f\n"
- "cmp x26, #0x4\n"
+ "cbz x10, 108f\n"
+ "cmp x10, #0x4\n"
"blt 104f\n"
"102:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "ldr s1, [x22], #0x4\n"
- "ldr s2, [x21], #0x4\n"
- "ldr s3, [x20], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
"tbnz %x[flags], #31, 103f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"103:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q6, [x10, #0x0]\n"
- "sub x26, x26, #0x4\n"
- "ldr q7, [x10, #0x10]\n"
- "cmp x26, #0x4\n"
- "ldr q8, [x10, #0x20]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x10, #0x30]\n"
- ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n"
+ "ldr q7, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q6, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q5, [x12, #0x20]\n"
+ ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n"
+ "ldr q4, [x12, #0x30]\n"
+ ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n"
"bge 102b\n"
- "cbz x26, 108f\n"
"104:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x26, #1, 105f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x22], #0x2\n"
- "ldr h2, [x21], #0x2\n"
- "ldr h3, [x20], #0x2\n"
- "tbz x26, #0, 106f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x22]\n"
- "ld1 { v2.b }[2], [x21]\n"
- "ld1 { v3.b }[2], [x20]\n"
+ "cbz x10, 108f\n"
+ "tbz x10, #1, 105f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x27]\n"
+ "ld1 { v3.b }[2], [x26]\n"
"b 106f\n"
"105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x22, #0x0]\n"
- "ldr b2, [x21, #0x0]\n"
- "ldr b3, [x20, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x27, #0x0]\n"
+ "ldr b3, [x26, #0x0]\n"
"106:" // Height 4: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 107f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
@@ -1783,60 +1734,64 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"107:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q10, [x10, #0x0]\n"
- "ldr q4, [x10, #0x10]\n"
- "ldr q5, [x10, #0x20]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x4f83e09d // sdot v29.4s, v4.16b, v3.4b[0]\n"
+ "ldr q7, [x12, #0x0]\n"
+ ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
+ "ldr q5, [x12, #0x20]\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ "ldr q4, [x12, #0x30]\n"
+ ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0df // sdot v31.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n"
"108:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 94b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x21, x28, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20\n"
+ "add x22, x23, x20\n"
+ "add x21, x22, x20\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19\n"
- "prfm pstl1keep, [x20, #0x0]\n"
- "add x19, x20, x19\n"
- "prfm pstl1keep, [x19, #0x0]\n"
"tbnz %x[flags], #31, 109f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v4.4s }, [x22]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "neg v0.4s, v0.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "neg v4.4s, v4.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "mul v11.4s, v11.4s, v4.4s\n"
- "mul v12.4s, v12.4s, v4.4s\n"
- "mul v13.4s, v13.4s, v4.4s\n"
- "mul v14.4s, v14.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v0.4s\n"
+ "mul v12.4s, v12.4s, v0.4s\n"
+ "mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
"109:" // Height 4: skip row sum fixup
+ "ldr q3, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q2, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
+ "ldr q1, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
+ "ldr q0, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
@@ -1850,97 +1805,93 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"add v29.4s, v29.4s, v14.4s\n"
"add v30.4s, v30.4s, v14.4s\n"
"add v31.4s, v31.4s, v14.4s\n"
- "ldr q0, [x9, #0x0]\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v2.4s\n"
+ "add v18.4s, v18.4s, v1.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v2.4s\n"
+ "add v26.4s, v26.4s, v1.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v2.4s\n"
+ "add v30.4s, v30.4s, v1.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v1.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "ldr q1, [x9, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ldr q2, [x9, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v28.4s, v28.4s, v0.4s\n"
- "add v29.4s, v29.4s, v1.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "ldr q3, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "ld1r { v0.4s }, [x23]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 110f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v2.16b, v16.16b, v0.16b\n"
+ "and v1.16b, v17.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v2.4s\n"
+ "sqadd v17.4s, v17.4s, v1.4s\n"
+ "and v7.16b, v18.16b, v0.16b\n"
+ "and v6.16b, v19.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v4.16b, v21.16b, v0.16b\n"
+ "and v3.16b, v22.16b, v0.16b\n"
+ "and v2.16b, v23.16b, v0.16b\n"
+ "and v1.16b, v24.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "and v9.16b, v28.16b, v0.16b\n"
- "and v10.16b, v29.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "and v4.16b, v30.16b, v0.16b\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "and v5.16b, v31.16b, v0.16b\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
- "sqadd v28.4s, v28.4s, v9.4s\n"
- "sqadd v29.4s, v29.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v7.4s\n"
+ "sqadd v19.4s, v19.4s, v6.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "and v7.16b, v25.16b, v0.16b\n"
+ "and v6.16b, v26.16b, v0.16b\n"
+ "and v5.16b, v27.16b, v0.16b\n"
+ "and v4.16b, v28.16b, v0.16b\n"
+ "and v3.16b, v29.16b, v0.16b\n"
+ "and v2.16b, v30.16b, v0.16b\n"
+ "and v1.16b, v31.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v4.4s\n"
- "sqadd v31.4s, v31.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v7.4s\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v5.4s\n"
+ "sqadd v28.4s, v28.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v3.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
"110:" // Height 4: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
@@ -1958,189 +1909,188 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"srshl v29.4s, v29.4s, v0.4s\n"
"srshl v30.4s, v30.4s, v0.4s\n"
"srshl v31.4s, v31.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "cmp x11, #0x10\n"
- "ld1r { v6.4s }, [x22]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v0.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v0.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v0.4s\n"
+ "add v26.4s, v26.4s, v0.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v0.4s\n"
+ "add v30.4s, v30.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v0.4s\n"
+ "smin v17.4s, v17.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v0.4s\n"
+ "smin v19.4s, v19.4s, v0.4s\n"
+ "smin v20.4s, v20.4s, v0.4s\n"
+ "smin v21.4s, v21.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v0.4s\n"
+ "smin v23.4s, v23.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v0.4s\n"
+ "smin v25.4s, v25.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v0.4s\n"
+ "smin v27.4s, v27.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v0.4s\n"
+ "smin v29.4s, v29.4s, v0.4s\n"
+ "smin v30.4s, v30.4s, v0.4s\n"
+ "smin v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v0.4s\n"
+ "smax v17.4s, v17.4s, v0.4s\n"
+ "smax v18.4s, v18.4s, v0.4s\n"
+ "smax v19.4s, v19.4s, v0.4s\n"
+ "smax v20.4s, v20.4s, v0.4s\n"
+ "smax v21.4s, v21.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v0.4s\n"
+ "smax v23.4s, v23.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v0.4s\n"
+ "smax v25.4s, v25.4s, v0.4s\n"
+ "smax v26.4s, v26.4s, v0.4s\n"
+ "smax v27.4s, v27.4s, v0.4s\n"
+ "smax v28.4s, v28.4s, v0.4s\n"
+ "smax v29.4s, v29.4s, v0.4s\n"
+ "smax v30.4s, v30.4s, v0.4s\n"
+ "smax v31.4s, v31.4s, v0.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v0.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
+ "uzp1 v19.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
- "uzp1 v29.8h, v30.8h, v31.8h\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
- "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "uzp1 v17.8h, v30.8h, v31.8h\n"
+ "cmp x14, #0x10\n"
+ "uzp1 v16.16b, v16.16b, v0.16b\n"
+ "uzp1 v20.16b, v20.16b, v19.16b\n"
+ "uzp1 v24.16b, v24.16b, v18.16b\n"
+ "uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 119f\n"
- "tbz x11, #3, 114f\n"
- "str d16, [x28], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "str d28, [x19], #0x8\n"
- "tbz x11, #2, 112f\n"
- "st1 { v16.s }[2], [x28], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
- "st1 { v28.s }[2], [x19], #0x4\n"
- "tbz x11, #1, 111f\n"
- "st1 { v16.h }[6], [x28], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
- "st1 { v28.h }[6], [x19], #0x2\n"
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[14], [x28]\n"
- "st1 { v20.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
- "st1 { v28.b }[14], [x19]\n"
+ "tbz x14, #3, 114f\n"
+ "str d16, [x13], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x14, #2, 112f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x14, #1, 111f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[14], [x13]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 118f\n"
"111:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[12], [x28]\n"
- "st1 { v20.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
- "st1 { v28.b }[12], [x19]\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[12], [x13]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 118f\n"
"112:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x11, #1, 113f\n"
- "st1 { v16.h }[4], [x28], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
- "st1 { v28.h }[4], [x19], #0x2\n"
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[10], [x28]\n"
- "st1 { v20.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
- "st1 { v28.b }[10], [x19]\n"
+ "tbz x14, #1, 113f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[10], [x13]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 118f\n"
"113:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[8], [x28]\n"
- "st1 { v20.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
- "st1 { v28.b }[8], [x19]\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[8], [x13]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 118f\n"
"114:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x11, #2, 116f\n"
- "str s16, [x28], #0x4\n"
- "str s20, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
- "str s28, [x19], #0x4\n"
- "tbz x11, #1, 115f\n"
- "st1 { v16.h }[2], [x28], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
- "st1 { v28.h }[2], [x19], #0x2\n"
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[6], [x28]\n"
- "st1 { v20.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
- "st1 { v28.b }[6], [x19]\n"
+ "tbz x14, #2, 116f\n"
+ "str s16, [x13], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x14, #1, 115f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[6], [x13]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 118f\n"
"115:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[4], [x28]\n"
- "st1 { v20.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
- "st1 { v28.b }[4], [x19]\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[4], [x13]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 118f\n"
"116:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x11, #1, 117f\n"
- "str h16, [x28], #0x2\n"
- "str h20, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
- "str h28, [x19], #0x2\n"
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[2], [x28]\n"
- "st1 { v20.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
- "st1 { v28.b }[2], [x19]\n"
+ "tbz x14, #1, 117f\n"
+ "str h16, [x13], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[2], [x13]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 118f\n"
"117:" // Height 4: Partial direct writeback: partial_1_0
- "str b16, [x28, #0x0]\n"
- "str b20, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
- "str b28, [x19, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"118:" // Height 4: Partial direct writeback: Done
"b 120f\n"
"119:" // Height 4: Full writeback
- "str q16, [x28, #0x0]\n"
- "add x28, x28, #0x10\n"
- "str q20, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
- "str q28, [x19, #0x0]\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"120:" // Height 4: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 92b\n"
"subs %x[M], %x[M], #0x4\n"
"beq 122f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 121f\n"
- "add x20, x20, #0x4\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"121:" // Update direct input
- "mov x19, #0x4\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x4\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"122:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
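
For reference, the requantization epilogue that both versions of this kernel share (the sqrdmulh, and/sshr/sqadd, srshl, add, smin/smax and uzp1 sequences above) is the standard fixed-point downscale from 32-bit accumulators to int8. Below is a minimal scalar sketch of one lane. It is illustrative only: requantize_lane() is not an arm_gemm symbol, the parameter names merely mirror the Requantize32 fields used in the asm operands, and the SQRDMULH saturation edge case (both operands INT32_MIN) is glossed over in a comment.

// Scalar reference for the per-lane requantization performed by the NEON
// epilogue above. Hypothetical helper, not part of arm_gemm's API.
#include <algorithm>
#include <cstdint>

int8_t requantize_lane(int32_t acc,     // accumulator, bias/row-sum fixup already added
                       int32_t mul,     // per_layer_mul
                       int32_t shift,   // per_layer_right_shift, stored negative for SRSHL
                       int32_t c_offset, int32_t minval, int32_t maxval)
{
    // SQRDMULH: rounding doubling multiply returning the high half.
    // (The real instruction also saturates when acc == mul == INT32_MIN.)
    int64_t prod = static_cast<int64_t>(acc) * mul;
    int32_t hi   = static_cast<int32_t>((prod + (1LL << 30)) >> 31);

    int32_t s = -shift; // SRSHL with a negative operand is a rounding right shift
    if (s > 0)
    {
        // The and/sshr/sqadd triple, guarded by flags bit 5 in the asm:
        // subtract one from negative values so the rounding shift below
        // rounds half away from zero instead of toward +infinity.
        if (hi < 0)
        {
            hi -= 1;
        }
        hi = (hi + (1 << (s - 1))) >> s; // SRSHL: round-to-nearest right shift
    }

    hi += c_offset;                              // add the output zero point
    hi = std::min(std::max(hi, minval), maxval); // SMIN/SMAX clamp
    return static_cast<int8_t>(hi);              // UZP1 narrowing, 32 -> 16 -> 8 bits
}

The tbz on flags bit 5 lets the kernel skip the sign-correction block entirely when the caller knows it is unnecessary; the rest of the sequence is unconditional.
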
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
index 0adfb99f23..3b773a6827 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -78,311 +78,310 @@ void a64_hybrid_s8qa_dot_4x16 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x4\n"
"bge 91f\n"
"cmp %x[M], #0x2\n"
"bgt 61f\n"
"beq 31f\n"
+ "mov x10, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"movi v15.16b, #0x1\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x27, %x[col_bias]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "mov x26, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
"3:" // Height 1: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "cbnz x25, 6f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "cbnz x26, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
"b 6f\n"
"5:" // Height 1: setup direct input
- "mov x23, %x[input_ptr]\n"
+ "mov x24, %x[input_ptr]\n"
"6:" // Height 1: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"blt 11f\n"
- "ldr q0, [x23, #0x0]\n"
+ "ldr q0, [x24, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
- "cmp x24, #0x20\n"
- "blt 9f\n"
- "7:" // Height 1: Multiply loop: Main loop head
- ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "cmp x25, #0x20\n"
"ldr q5, [x28, #0x10]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
"ldr q6, [x28, #0x20]\n"
"ldr q7, [x28, #0x30]\n"
- ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
"ldr q8, [x28, #0x40]\n"
- ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
"ldr q9, [x28, #0x50]\n"
"ldr q10, [x28, #0x60]\n"
+ "blt 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q21, [x28, #0x70]\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q20, [x28, #0x80]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q26, [x28, #0x90]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q25, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q4, [x28, #0x70]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q24, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr q23, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q6, [x28, #0x90]\n"
- "ldr q7, [x28, #0xa0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr q9, [x28, #0xc0]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- "ldr q4, [x28, #0xe0]\n"
- "ldr q5, [x28, #0xf0]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ "ldr q22, [x28, #0xd0]\n"
+ ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr q21, [x28, #0xe0]\n"
+ ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr q20, [x28, #0xf0]\n"
+ ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n"
"add x28, x28, #0x100\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 8f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
- "ldr q0, [x23, #0x0]\n"
- "cmp x24, #0x20\n"
+ "ldr q0, [x24, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "sub x24, x24, #0x10\n"
+ "ldr q21, [x28, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "ldr q20, [x28, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "ldr q8, [x28, #0x40]\n"
+ "ldr q26, [x28, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
+ "ldr q25, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q10, [x28, #0x60]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q24, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q5, [x28, #0x80]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q23, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q7, [x28, #0xa0]\n"
- "ldr q8, [x28, #0xb0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr q9, [x28, #0xc0]\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "ldr q10, [x28, #0xd0]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q22, [x28, #0xd0]\n"
+ ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr q21, [x28, #0xe0]\n"
+ ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr q20, [x28, #0xf0]\n"
+ ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n"
+ "sub x25, x25, #0x10\n"
+ ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n"
+ "add x24, x24, #0x10\n"
"add x28, x28, #0x100\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 10f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"10:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"11:" // Height 1: Multiply loop: Main loop skip
- "cbz x24, 18f\n"
- "cmp x24, #0x4\n"
+ "cbz x25, 18f\n"
+ "cmp x25, #0x4\n"
"blt 14f\n"
"12:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x23], #0x4\n"
+ "ldr s0, [x24], #0x4\n"
"tbnz %x[flags], #31, 13f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x24, x24, #0x4\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- "ldr q8, [x28, #0x20]\n"
- "cmp x24, #0x4\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x30]\n"
+ "ldr q23, [x28, #0x0]\n"
+ "ldr q22, [x28, #0x10]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
+ "ldr q21, [x28, #0x20]\n"
+ "ldr q20, [x28, #0x30]\n"
+ ".inst 0x4f80e2f0 // sdot v16.4s, v23.16b, v0.4b[0]\n"
+ ".inst 0x4f80e2d1 // sdot v17.4s, v22.16b, v0.4b[0]\n"
+ ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
"bge 12b\n"
- "cbz x24, 18f\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x24, #1, 15f\n"
- "ldr h0, [x23], #0x2\n"
- "tbz x24, #0, 16f\n"
- "ld1 { v0.b }[2], [x23]\n"
+ "cbz x25, 18f\n"
+ "tbz x25, #1, 15f\n"
+ "ldr h0, [x24], #0x2\n"
+ "tbz x25, #0, 16f\n"
+ "ld1 { v0.b }[2], [x24]\n"
"b 16f\n"
"15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x23, #0x0]\n"
+ "ldr b0, [x24, #0x0]\n"
"16:" // Height 1: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 17f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
"17:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q10, [x28, #0x0]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x10]\n"
- "ldr q5, [x28, #0x20]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x30]\n"
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ ".inst 0x4f80e2b0 // sdot v16.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f80e291 // sdot v17.4s, v20.16b, v0.4b[0]\n"
+ "ldr q21, [x28, #0x20]\n"
+ "ldr q20, [x28, #0x30]\n"
+ ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
"18:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x25, x25, #0x1\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 4b\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
"tbnz %x[flags], #31, 19f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
+ "neg v20.4s, v20.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "ld1r { v1.4s }, [x22]\n"
- "neg v1.4s, v1.4s\n"
- "mul v11.4s, v11.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v20.4s\n"
"19:" // Height 1: skip row sum fixup
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q23, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q0, [x27, #0x0]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q1, [x27, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ldr q22, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q2, [x27, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q3, [x27, #0x30]\n"
- "add x27, x27, #0x40\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v23.4s\n"
+ "add v18.4s, v18.4s, v22.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v19.4s, v19.4s, v21.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v20.4s\n"
+ "add x10, x10, #0x40\n"
+ "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v20.4s\n"
"tbz %x[flags], #5, 20f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v0.16b\n"
+ "and v21.16b, v18.16b, v0.16b\n"
+ "and v20.16b, v19.16b, v0.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
"20:" // Height 1: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v22.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x22]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v22.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v22.4s\n"
+ "add v19.4s, v19.4s, v22.4s\n"
"cmp x9, #0x10\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v16.4s, v16.4s, v21.4s\n"
+ "smin v17.4s, v17.4s, v21.4s\n"
+ "smin v18.4s, v18.4s, v21.4s\n"
+ "smin v19.4s, v19.4s, v21.4s\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "smax v19.4s, v19.4s, v5.4s\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 29f\n"
"tbz x9, #3, 24f\n"
- "str d16, [x26], #0x8\n"
+ "str d16, [x27], #0x8\n"
"tbz x9, #2, 22f\n"
- "st1 { v16.s }[2], [x26], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
"tbz x9, #1, 21f\n"
- "st1 { v16.h }[6], [x26], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[14], [x26]\n"
+ "st1 { v16.b }[14], [x27]\n"
"b 28f\n"
"21:" // Height 1: Partial direct writeback: partial_1_12
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[12], [x26]\n"
+ "st1 { v16.b }[12], [x27]\n"
"b 28f\n"
"22:" // Height 1: Partial direct writeback: partial_2_8
"tbz x9, #1, 23f\n"
- "st1 { v16.h }[4], [x26], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[10], [x26]\n"
+ "st1 { v16.b }[10], [x27]\n"
"b 28f\n"
"23:" // Height 1: Partial direct writeback: partial_1_8
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[8], [x26]\n"
+ "st1 { v16.b }[8], [x27]\n"
"b 28f\n"
"24:" // Height 1: Partial direct writeback: partial_4_0
"tbz x9, #2, 26f\n"
- "str s16, [x26], #0x4\n"
+ "str s16, [x27], #0x4\n"
"tbz x9, #1, 25f\n"
- "st1 { v16.h }[2], [x26], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[6], [x26]\n"
+ "st1 { v16.b }[6], [x27]\n"
"b 28f\n"
"25:" // Height 1: Partial direct writeback: partial_1_4
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[4], [x26]\n"
+ "st1 { v16.b }[4], [x27]\n"
"b 28f\n"
"26:" // Height 1: Partial direct writeback: partial_2_0
"tbz x9, #1, 27f\n"
- "str h16, [x26], #0x2\n"
+ "str h16, [x27], #0x2\n"
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[2], [x26]\n"
+ "st1 { v16.b }[2], [x27]\n"
"b 28f\n"
"27:" // Height 1: Partial direct writeback: partial_1_0
- "str b16, [x26, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
"28:" // Height 1: Partial direct writeback: Done
"b 30f\n"
"29:" // Height 1: Full writeback
- "str q16, [x26, #0x0]\n"
- "add x26, x26, #0x10\n"
+ "str q16, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
"30:" // Height 1: Writeback done
"subs x9, x9, #0x10\n"
"bgt 2b\n"
"b 122f\n"
"31:" // Height 2
+ "mov x10, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x27, %x[col_bias]\n"
"movi v12.4s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v15.16b, #0x1\n"
- "mov x26, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"32:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -393,414 +392,414 @@ void a64_hybrid_s8qa_dot_4x16 (
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
"33:" // Height 2: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 35f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "cbnz x25, 36f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "cbnz x26, 36f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
"b 36f\n"
"35:" // Height 2: setup direct input
- "mov x23, %x[input_ptr]\n"
- "add x22, x23, x19\n"
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
"36:" // Height 2: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"blt 41f\n"
- "ldr q0, [x23, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "cmp x24, #0x20\n"
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "cmp x25, #0x20\n"
"ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
"blt 39f\n"
"37:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "add x23, x23, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x22, x22, #0x10\n"
+ "ldr q25, [x28, #0x70]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q8, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q10, [x28, #0x60]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q30, [x28, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q6, [x28, #0x90]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr q24, [x28, #0xf0]\n"
"add x28, x28, #0x100\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+ ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 38f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"38:" // Height 2: Multiply loop: unique 5: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x24, #0x20\n"
- "ldr q0, [x23, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"bge 37b\n"
"39:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "sub x24, x24, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "ldr q25, [x28, #0x70]\n"
+ "sub x25, x25, #0x10\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x22, x22, #0x10\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q8, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q10, [x28, #0x60]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q30, [x28, #0x90]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q6, [x28, #0x90]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr q24, [x28, #0xf0]\n"
"add x28, x28, #0x100\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+ ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 40f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"40:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"41:" // Height 2: Multiply loop: Main loop skip
- "cbz x24, 48f\n"
- "cmp x24, #0x4\n"
+ "cbz x25, 48f\n"
+ "cmp x25, #0x4\n"
"blt 44f\n"
"42:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x23], #0x4\n"
- "ldr s1, [x22], #0x4\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s1, [x23], #0x4\n"
"tbnz %x[flags], #31, 43f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"43:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x24, x24, #0x4\n"
- ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- "ldr q8, [x28, #0x20]\n"
- "cmp x24, #0x4\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x30]\n"
+ "ldr q27, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n"
+ ".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n"
+ ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n"
+ ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n"
"bge 42b\n"
- "cbz x24, 48f\n"
"44:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x24, #1, 45f\n"
- "ldr h0, [x23], #0x2\n"
- "ldr h1, [x22], #0x2\n"
- "tbz x24, #0, 46f\n"
- "ld1 { v0.b }[2], [x23]\n"
- "ld1 { v1.b }[2], [x22]\n"
+ "cbz x25, 48f\n"
+ "tbz x25, #1, 45f\n"
+ "ldr h0, [x24], #0x2\n"
+ "ldr h1, [x23], #0x2\n"
+ "tbz x25, #0, 46f\n"
+ "ld1 { v0.b }[2], [x24]\n"
+ "ld1 { v1.b }[2], [x23]\n"
"b 46f\n"
"45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x23, #0x0]\n"
- "ldr b1, [x22, #0x0]\n"
+ "ldr b0, [x24, #0x0]\n"
+ "ldr b1, [x23, #0x0]\n"
"46:" // Height 2: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 47f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
"47:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q10, [x28, #0x0]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x4f80e310 // sdot v16.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e314 // sdot v20.4s, v24.16b, v1.4b[0]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n"
+ ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n"
+ ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n"
"48:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x25, x25, #0x1\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 34b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x21, x26, x19\n"
- "prfm pstl1keep, [x21, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbnz %x[flags], #31, 49f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
+ "neg v24.4s, v24.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "neg v2.4s, v2.4s\n"
- "mul v11.4s, v11.4s, v2.4s\n"
- "mul v12.4s, v12.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v24.4s\n"
+ "mul v12.4s, v12.4s, v24.4s\n"
"49:" // Height 2: skip row sum fixup
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q27, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q0, [x27, #0x0]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q1, [x27, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ldr q26, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q2, [x27, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q3, [x27, #0x30]\n"
- "add x27, x27, #0x40\n"
"add v20.4s, v20.4s, v12.4s\n"
- "ld1r { v4.4s }, [x22]\n"
"add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v24.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v27.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v25.4s\n"
+ "add v20.4s, v20.4s, v28.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v21.4s, v21.4s, v27.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v23.4s, v23.4s, v25.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v24.4s\n"
"tbz %x[flags], #5, 50f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v24.16b, v16.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
+ "and v30.16b, v17.16b, v0.16b\n"
+ "and v29.16b, v18.16b, v0.16b\n"
+ "and v28.16b, v19.16b, v0.16b\n"
+ "and v27.16b, v20.16b, v0.16b\n"
+ "and v26.16b, v21.16b, v0.16b\n"
+ "and v25.16b, v22.16b, v0.16b\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v29.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sqadd v21.4s, v21.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"50:" // Height 2: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x22]\n"
- "cmp x9, #0x10\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
+ "cmp x9, #0x10\n"
+ "add v16.4s, v16.4s, v26.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v26.4s\n"
+ "add v20.4s, v20.4s, v26.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v23.4s, v23.4s, v26.4s\n"
+ "smin v16.4s, v16.4s, v25.4s\n"
+ "smin v17.4s, v17.4s, v25.4s\n"
+ "smin v18.4s, v18.4s, v25.4s\n"
+ "smin v19.4s, v19.4s, v25.4s\n"
+ "smin v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v25.4s\n"
+ "smin v22.4s, v22.4s, v25.4s\n"
+ "smin v23.4s, v23.4s, v25.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v17.8h, v22.8h, v23.8h\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 59f\n"
"tbz x9, #3, 54f\n"
- "str d16, [x26], #0x8\n"
- "str d20, [x21], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x9, #2, 52f\n"
- "st1 { v16.s }[2], [x26], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
"tbz x9, #1, 51f\n"
- "st1 { v16.h }[6], [x26], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[14], [x26]\n"
- "st1 { v20.b }[14], [x21]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 58f\n"
"51:" // Height 2: Partial direct writeback: partial_1_12
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[12], [x26]\n"
- "st1 { v20.b }[12], [x21]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 58f\n"
"52:" // Height 2: Partial direct writeback: partial_2_8
"tbz x9, #1, 53f\n"
- "st1 { v16.h }[4], [x26], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[10], [x26]\n"
- "st1 { v20.b }[10], [x21]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 58f\n"
"53:" // Height 2: Partial direct writeback: partial_1_8
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[8], [x26]\n"
- "st1 { v20.b }[8], [x21]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 58f\n"
"54:" // Height 2: Partial direct writeback: partial_4_0
"tbz x9, #2, 56f\n"
- "str s16, [x26], #0x4\n"
- "str s20, [x21], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x23], #0x4\n"
"tbz x9, #1, 55f\n"
- "st1 { v16.h }[2], [x26], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[6], [x26]\n"
- "st1 { v20.b }[6], [x21]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 58f\n"
"55:" // Height 2: Partial direct writeback: partial_1_4
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[4], [x26]\n"
- "st1 { v20.b }[4], [x21]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 58f\n"
"56:" // Height 2: Partial direct writeback: partial_2_0
"tbz x9, #1, 57f\n"
- "str h16, [x26], #0x2\n"
- "str h20, [x21], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "str h20, [x23], #0x2\n"
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[2], [x26]\n"
- "st1 { v20.b }[2], [x21]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 58f\n"
"57:" // Height 2: Partial direct writeback: partial_1_0
- "str b16, [x26, #0x0]\n"
- "str b20, [x21, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
"59:" // Height 2: Full writeback
- "str q16, [x26, #0x0]\n"
- "add x26, x26, #0x10\n"
- "str q20, [x21, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "str q20, [x23, #0x0]\n"
"60:" // Height 2: Writeback done
"subs x9, x9, #0x10\n"
"bgt 32b\n"
"b 122f\n"
"61:" // Height 3
+ "mov x10, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x27, %x[col_bias]\n"
"movi v12.4s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "mov x26, %x[output_ptr]\n"
"movi v15.16b, #0x1\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"62:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -815,539 +814,539 @@ void a64_hybrid_s8qa_dot_4x16 (
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
"63:" // Height 3: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 65f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "ldr x21, [x20, #0x10]\n"
- "cbnz x25, 66f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x26, 66f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
"b 66f\n"
"65:" // Height 3: setup direct input
- "mov x23, %x[input_ptr]\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"66:" // Height 3: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"blt 71f\n"
- "ldr q0, [x23, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "cmp x24, #0x20\n"
- "ldr q2, [x21, #0x0]\n"
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "cmp x25, #0x20\n"
+ "ldr q2, [x22, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
"blt 69f\n"
"67:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "add x23, x23, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x22, x22, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x21, x21, #0x10\n"
+ "ldr q29, [x28, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q8, [x28, #0x40]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q10, [x28, #0x60]\n"
+ "ldr q28, [x28, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q5, [x28, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q4, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q3, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q31, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q30, [x28, #0xd0]\n"
+ ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr q29, [x28, #0xe0]\n"
+ ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr q28, [x28, #0xf0]\n"
+ ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n"
"add x28, x28, #0x100\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 68f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"68:" // Height 3: Multiply loop: unique 9: skip row sum
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "cmp x24, #0x20\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ldr q0, [x23, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q4, [x28, #0x0]\n"
"bge 67b\n"
"69:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "sub x24, x24, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "sub x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x22, x22, #0x10\n"
+ "ldr q29, [x28, #0x70]\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q8, [x28, #0x40]\n"
- "add x21, x21, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q10, [x28, #0x60]\n"
+ "ldr q28, [x28, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q5, [x28, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q4, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q3, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q31, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q30, [x28, #0xd0]\n"
+ ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr q29, [x28, #0xe0]\n"
+ ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr q28, [x28, #0xf0]\n"
+ ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n"
"add x28, x28, #0x100\n"
- ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 70f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"70:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"71:" // Height 3: Multiply loop: Main loop skip
- "cbz x24, 78f\n"
- "cmp x24, #0x4\n"
+ "cbz x25, 78f\n"
+ "cmp x25, #0x4\n"
"blt 74f\n"
"72:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x23], #0x4\n"
- "ldr s1, [x22], #0x4\n"
- "ldr s2, [x21], #0x4\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s1, [x23], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
"tbnz %x[flags], #31, 73f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"73:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x24, x24, #0x4\n"
- ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- "ldr q8, [x28, #0x20]\n"
- "cmp x24, #0x4\n"
- ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
- "ldr q9, [x28, #0x30]\n"
+ "ldr q31, [x28, #0x0]\n"
+ "ldr q30, [x28, #0x10]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
+ "ldr q29, [x28, #0x20]\n"
+ "ldr q28, [x28, #0x30]\n"
+ ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n"
+ ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n"
"bge 72b\n"
- "cbz x24, 78f\n"
"74:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x24, #1, 75f\n"
- "ldr h0, [x23], #0x2\n"
- "ldr h1, [x22], #0x2\n"
- "ldr h2, [x21], #0x2\n"
- "tbz x24, #0, 76f\n"
- "ld1 { v0.b }[2], [x23]\n"
- "ld1 { v1.b }[2], [x22]\n"
- "ld1 { v2.b }[2], [x21]\n"
+ "cbz x25, 78f\n"
+ "tbz x25, #1, 75f\n"
+ "ldr h0, [x24], #0x2\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "tbz x25, #0, 76f\n"
+ "ld1 { v0.b }[2], [x24]\n"
+ "ld1 { v1.b }[2], [x23]\n"
+ "ld1 { v2.b }[2], [x22]\n"
"b 76f\n"
"75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x23, #0x0]\n"
- "ldr b1, [x22, #0x0]\n"
- "ldr b2, [x21, #0x0]\n"
+ "ldr b0, [x24, #0x0]\n"
+ "ldr b1, [x23, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
"76:" // Height 3: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 77f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q10, [x28, #0x0]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x20]\n"
- ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x30]\n"
+ "ldr q31, [x28, #0x0]\n"
+ "ldr q30, [x28, #0x10]\n"
+ ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n"
+ "ldr q29, [x28, #0x20]\n"
+ "ldr q28, [x28, #0x30]\n"
+ ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n"
+ ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n"
"78:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x25, x25, #0x1\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 64b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x21, x26, x19\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19\n"
- "prfm pstl1keep, [x20, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbnz %x[flags], #31, 79f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v3.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
+ "neg v28.4s, v28.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "neg v3.4s, v3.4s\n"
- "mul v11.4s, v11.4s, v3.4s\n"
- "mul v12.4s, v12.4s, v3.4s\n"
- "mul v13.4s, v13.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v28.4s\n"
+ "mul v12.4s, v12.4s, v28.4s\n"
+ "mul v13.4s, v13.4s, v28.4s\n"
"79:" // Height 3: skip row sum fixup
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q31, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q0, [x27, #0x0]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q1, [x27, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ldr q30, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q2, [x27, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q3, [x27, #0x30]\n"
- "add x27, x27, #0x40\n"
"add v20.4s, v20.4s, v12.4s\n"
- "ld1r { v4.4s }, [x22]\n"
"add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v28.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
+ "add x10, x10, #0x40\n"
"add v26.4s, v26.4s, v13.4s\n"
"add v27.4s, v27.4s, v13.4s\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v31.4s\n"
+ "add v18.4s, v18.4s, v30.4s\n"
+ "add v19.4s, v19.4s, v29.4s\n"
"add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v31.4s\n"
+ "add v22.4s, v22.4s, v30.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v31.4s\n"
+ "add v26.4s, v26.4s, v30.4s\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v28.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v28.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v28.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v28.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v28.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v28.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v28.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v28.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v28.4s\n"
"tbz %x[flags], #5, 80f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
+ "and v1.16b, v16.16b, v0.16b\n"
+ "and v31.16b, v17.16b, v0.16b\n"
+ "and v30.16b, v18.16b, v0.16b\n"
+ "and v29.16b, v19.16b, v0.16b\n"
+ "and v28.16b, v20.16b, v0.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "sqadd v17.4s, v17.4s, v31.4s\n"
+ "sqadd v18.4s, v18.4s, v30.4s\n"
+ "sqadd v19.4s, v19.4s, v29.4s\n"
+ "sqadd v20.4s, v20.4s, v28.4s\n"
+ "and v3.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v22.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
+ "and v31.16b, v24.16b, v0.16b\n"
+ "and v30.16b, v25.16b, v0.16b\n"
+ "and v29.16b, v26.16b, v0.16b\n"
+ "and v28.16b, v27.16b, v0.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v3.4s\n"
+ "sqadd v22.4s, v22.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v1.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "sqadd v27.4s, v27.4s, v28.4s\n"
"80:" // Height 3: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x22]\n"
- "cmp x9, #0x10\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v28.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
+ "cmp x9, #0x10\n"
"srshl v24.4s, v24.4s, v0.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
"srshl v25.4s, v25.4s, v0.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v30.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v30.4s\n"
+ "add v19.4s, v19.4s, v30.4s\n"
+ "add v20.4s, v20.4s, v30.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v30.4s\n"
+ "add v23.4s, v23.4s, v30.4s\n"
+ "add v24.4s, v24.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v30.4s\n"
+ "add v27.4s, v27.4s, v30.4s\n"
+ "smin v16.4s, v16.4s, v29.4s\n"
+ "smin v17.4s, v17.4s, v29.4s\n"
+ "smin v18.4s, v18.4s, v29.4s\n"
+ "smin v19.4s, v19.4s, v29.4s\n"
+ "smin v20.4s, v20.4s, v29.4s\n"
+ "smin v21.4s, v21.4s, v29.4s\n"
+ "smin v22.4s, v22.4s, v29.4s\n"
+ "smin v23.4s, v23.4s, v29.4s\n"
+ "smin v24.4s, v24.4s, v29.4s\n"
+ "smin v25.4s, v25.4s, v29.4s\n"
+ "smin v26.4s, v26.4s, v29.4s\n"
+ "smin v27.4s, v27.4s, v29.4s\n"
+ "smax v16.4s, v16.4s, v28.4s\n"
+ "smax v17.4s, v17.4s, v28.4s\n"
+ "smax v18.4s, v18.4s, v28.4s\n"
+ "smax v19.4s, v19.4s, v28.4s\n"
+ "smax v20.4s, v20.4s, v28.4s\n"
+ "smax v21.4s, v21.4s, v28.4s\n"
+ "smax v22.4s, v22.4s, v28.4s\n"
+ "smax v23.4s, v23.4s, v28.4s\n"
+ "smax v24.4s, v24.4s, v28.4s\n"
+ "smax v25.4s, v25.4s, v28.4s\n"
+ "smax v26.4s, v26.4s, v28.4s\n"
+ "smax v27.4s, v27.4s, v28.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 89f\n"
"tbz x9, #3, 84f\n"
- "str d16, [x26], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x9, #2, 82f\n"
- "st1 { v16.s }[2], [x26], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x9, #1, 81f\n"
- "st1 { v16.h }[6], [x26], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[14], [x26]\n"
- "st1 { v20.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 88f\n"
"81:" // Height 3: Partial direct writeback: partial_1_12
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[12], [x26]\n"
- "st1 { v20.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 88f\n"
"82:" // Height 3: Partial direct writeback: partial_2_8
"tbz x9, #1, 83f\n"
- "st1 { v16.h }[4], [x26], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[10], [x26]\n"
- "st1 { v20.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 88f\n"
"83:" // Height 3: Partial direct writeback: partial_1_8
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[8], [x26]\n"
- "st1 { v20.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 88f\n"
"84:" // Height 3: Partial direct writeback: partial_4_0
"tbz x9, #2, 86f\n"
- "str s16, [x26], #0x4\n"
- "str s20, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x9, #1, 85f\n"
- "st1 { v16.h }[2], [x26], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[6], [x26]\n"
- "st1 { v20.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 88f\n"
"85:" // Height 3: Partial direct writeback: partial_1_4
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[4], [x26]\n"
- "st1 { v20.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 88f\n"
"86:" // Height 3: Partial direct writeback: partial_2_0
"tbz x9, #1, 87f\n"
- "str h16, [x26], #0x2\n"
- "str h20, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[2], [x26]\n"
- "st1 { v20.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 88f\n"
"87:" // Height 3: Partial direct writeback: partial_1_0
- "str b16, [x26, #0x0]\n"
- "str b20, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"88:" // Height 3: Partial direct writeback: Done
"b 90f\n"
"89:" // Height 3: Full writeback
- "str q16, [x26, #0x0]\n"
- "add x26, x26, #0x10\n"
- "str q20, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"90:" // Height 3: Writeback done
"subs x9, x9, #0x10\n"
"bgt 62b\n"
"b 122f\n"
"91:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x4\n"
+ "mov x10, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x27, %x[col_bias]\n"
"movi v12.4s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x26, %x[output_ptr]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"movi v14.4s, #0x0\n"
- "mov x19, #0x4\n"
"movi v15.16b, #0x1\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"92:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -1366,59 +1365,59 @@ void a64_hybrid_s8qa_dot_4x16 (
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
"93:" // Height 4: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 95f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "ldr x21, [x20, #0x10]\n"
- "ldr x20, [x20, #0x18]\n"
- "cbnz x25, 96f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
+ "cbnz x26, 96f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
"b 96f\n"
"95:" // Height 4: setup direct input
- "mov x23, %x[input_ptr]\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"96:" // Height 4: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"blt 101f\n"
- "ldr q0, [x23, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "cmp x24, #0x20\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q3, [x20, #0x0]\n"
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "cmp x25, #0x20\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x21, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
"blt 99f\n"
"97:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "add x23, x23, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x22, x22, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x21, x21, #0x10\n"
".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q8, [x28, #0x40]\n"
- "add x20, x20, #0x10\n"
+ "ldr q4, [x28, #0x70]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q10, [x28, #0x60]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
"ldr q5, [x28, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
@@ -1491,38 +1490,38 @@ void a64_hybrid_s8qa_dot_4x16 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"98:" // Height 4: Multiply loop: unique 13: skip row sum
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "cmp x24, #0x20\n"
"prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "ldr q0, [x23, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q3, [x20, #0x0]\n"
- "ldr q4, [x28, #0x0]\n"
"bge 97b\n"
"99:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "sub x24, x24, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "sub x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x22, x22, #0x10\n"
".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q8, [x28, #0x40]\n"
- "add x21, x21, #0x10\n"
+ "ldr q4, [x28, #0x70]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
- "add x20, x20, #0x10\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q10, [x28, #0x60]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
"ldr q5, [x28, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
@@ -1595,67 +1594,67 @@ void a64_hybrid_s8qa_dot_4x16 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"100:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
"prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"101:" // Height 4: Multiply loop: Main loop skip
- "cbz x24, 108f\n"
- "cmp x24, #0x4\n"
+ "cbz x25, 108f\n"
+ "cmp x25, #0x4\n"
"blt 104f\n"
"102:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x23], #0x4\n"
- "ldr s1, [x22], #0x4\n"
- "ldr s2, [x21], #0x4\n"
- "ldr s3, [x20], #0x4\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s1, [x23], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
"tbnz %x[flags], #31, 103f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"103:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x24, x24, #0x4\n"
- ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- "ldr q8, [x28, #0x20]\n"
- "cmp x24, #0x4\n"
- ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
- "ldr q9, [x28, #0x30]\n"
+ "ldr q7, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
+ "ldr q5, [x28, #0x20]\n"
+ "ldr q4, [x28, #0x30]\n"
+ ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n"
"bge 102b\n"
- "cbz x24, 108f\n"
"104:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x24, #1, 105f\n"
- "ldr h0, [x23], #0x2\n"
- "ldr h1, [x22], #0x2\n"
- "ldr h2, [x21], #0x2\n"
- "ldr h3, [x20], #0x2\n"
- "tbz x24, #0, 106f\n"
- "ld1 { v0.b }[2], [x23]\n"
- "ld1 { v1.b }[2], [x22]\n"
- "ld1 { v2.b }[2], [x21]\n"
- "ld1 { v3.b }[2], [x20]\n"
+ "cbz x25, 108f\n"
+ "tbz x25, #1, 105f\n"
+ "ldr h0, [x24], #0x2\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "ldr h3, [x21], #0x2\n"
+ "tbz x25, #0, 106f\n"
+ "ld1 { v0.b }[2], [x24]\n"
+ "ld1 { v1.b }[2], [x23]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "ld1 { v3.b }[2], [x21]\n"
"b 106f\n"
"105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x23, #0x0]\n"
- "ldr b1, [x22, #0x0]\n"
- "ldr b2, [x21, #0x0]\n"
- "ldr b3, [x20, #0x0]\n"
+ "ldr b0, [x24, #0x0]\n"
+ "ldr b1, [x23, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "ldr b3, [x21, #0x0]\n"
"106:" // Height 4: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 107f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
@@ -1663,76 +1662,76 @@ void a64_hybrid_s8qa_dot_4x16 (
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
"107:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q10, [x28, #0x0]\n"
- ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
+ "ldr q7, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
"ldr q5, [x28, #0x20]\n"
- ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x30]\n"
+ "ldr q4, [x28, #0x30]\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x4f83e09d // sdot v29.4s, v4.16b, v3.4b[0]\n"
+ ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0df // sdot v31.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n"
"108:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x25, x25, #0x1\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 94b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x21, x26, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "add x21, x22, x20\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19\n"
- "prfm pstl1keep, [x20, #0x0]\n"
- "add x19, x20, x19\n"
- "prfm pstl1keep, [x19, #0x0]\n"
"tbnz %x[flags], #31, 109f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v4.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
+ "neg v0.4s, v0.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "neg v4.4s, v4.4s\n"
- "mul v11.4s, v11.4s, v4.4s\n"
- "mul v12.4s, v12.4s, v4.4s\n"
- "mul v13.4s, v13.4s, v4.4s\n"
- "mul v14.4s, v14.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v0.4s\n"
+ "mul v12.4s, v12.4s, v0.4s\n"
+ "mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
"109:" // Height 4: skip row sum fixup
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q4, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q0, [x27, #0x0]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q1, [x27, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ldr q3, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q2, [x27, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q3, [x27, #0x30]\n"
- "add x27, x27, #0x40\n"
"add v20.4s, v20.4s, v12.4s\n"
- "ld1r { v4.4s }, [x22]\n"
"add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
+ "add x10, x10, #0x40\n"
"add v26.4s, v26.4s, v13.4s\n"
"add v27.4s, v27.4s, v13.4s\n"
"add v28.4s, v28.4s, v14.4s\n"
@@ -1740,287 +1739,286 @@ void a64_hybrid_s8qa_dot_4x16 (
"add v30.4s, v30.4s, v14.4s\n"
"add v31.4s, v31.4s, v14.4s\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v2.4s\n"
"add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v2.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v2.4s\n"
"add v28.4s, v28.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v29.4s, v29.4s, v1.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v31.4s, v31.4s, v2.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
"tbz %x[flags], #5, 110f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "and v9.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v16.16b, v0.16b\n"
+ "and v1.16b, v17.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v2.4s\n"
+ "sqadd v17.4s, v17.4s, v1.4s\n"
+ "and v7.16b, v18.16b, v0.16b\n"
+ "and v6.16b, v19.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v4.16b, v21.16b, v0.16b\n"
+ "and v3.16b, v22.16b, v0.16b\n"
+ "and v2.16b, v23.16b, v0.16b\n"
+ "and v1.16b, v24.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "and v6.16b, v25.16b, v0.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "and v9.16b, v28.16b, v0.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "and v10.16b, v29.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "and v4.16b, v30.16b, v0.16b\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "and v5.16b, v31.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v7.4s\n"
+ "sqadd v19.4s, v19.4s, v6.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "and v7.16b, v25.16b, v0.16b\n"
+ "and v6.16b, v26.16b, v0.16b\n"
+ "and v5.16b, v27.16b, v0.16b\n"
+ "and v4.16b, v28.16b, v0.16b\n"
+ "and v3.16b, v29.16b, v0.16b\n"
+ "and v2.16b, v30.16b, v0.16b\n"
+ "and v1.16b, v31.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v9.4s\n"
- "sqadd v29.4s, v29.4s, v10.4s\n"
- "sqadd v30.4s, v30.4s, v4.4s\n"
- "sqadd v31.4s, v31.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v7.4s\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v5.4s\n"
+ "sqadd v28.4s, v28.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v3.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
"110:" // Height 4: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v3.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x22]\n"
- "cmp x9, #0x10\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v2.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v1.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
+ "cmp x9, #0x10\n"
"srshl v24.4s, v24.4s, v0.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
"srshl v25.4s, v25.4s, v0.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
"srshl v28.4s, v28.4s, v0.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
"srshl v29.4s, v29.4s, v0.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
"srshl v30.4s, v30.4s, v0.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
"srshl v31.4s, v31.4s, v0.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v3.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "smin v16.4s, v16.4s, v2.4s\n"
+ "smin v17.4s, v17.4s, v2.4s\n"
+ "smin v18.4s, v18.4s, v2.4s\n"
+ "smin v19.4s, v19.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v2.4s\n"
+ "smin v21.4s, v21.4s, v2.4s\n"
+ "smin v22.4s, v22.4s, v2.4s\n"
+ "smin v23.4s, v23.4s, v2.4s\n"
+ "smin v24.4s, v24.4s, v2.4s\n"
+ "smin v25.4s, v25.4s, v2.4s\n"
+ "smin v26.4s, v26.4s, v2.4s\n"
+ "smin v27.4s, v27.4s, v2.4s\n"
+ "smin v28.4s, v28.4s, v2.4s\n"
+ "smin v29.4s, v29.4s, v2.4s\n"
+ "smin v30.4s, v30.4s, v2.4s\n"
+ "smin v31.4s, v31.4s, v2.4s\n"
+ "smax v16.4s, v16.4s, v1.4s\n"
+ "smax v17.4s, v17.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v1.4s\n"
+ "smax v19.4s, v19.4s, v1.4s\n"
+ "smax v20.4s, v20.4s, v1.4s\n"
+ "smax v21.4s, v21.4s, v1.4s\n"
+ "smax v22.4s, v22.4s, v1.4s\n"
+ "smax v23.4s, v23.4s, v1.4s\n"
+ "smax v24.4s, v24.4s, v1.4s\n"
+ "smax v25.4s, v25.4s, v1.4s\n"
+ "smax v26.4s, v26.4s, v1.4s\n"
+ "smax v27.4s, v27.4s, v1.4s\n"
+ "smax v28.4s, v28.4s, v1.4s\n"
+ "smax v29.4s, v29.4s, v1.4s\n"
+ "smax v30.4s, v30.4s, v1.4s\n"
+ "smax v31.4s, v31.4s, v1.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
+ "uzp1 v0.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v19.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
- "uzp1 v29.8h, v30.8h, v31.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
- "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "uzp1 v17.8h, v30.8h, v31.8h\n"
+ "uzp1 v16.16b, v16.16b, v0.16b\n"
+ "uzp1 v20.16b, v20.16b, v19.16b\n"
+ "uzp1 v24.16b, v24.16b, v18.16b\n"
+ "uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 119f\n"
"tbz x9, #3, 114f\n"
- "str d16, [x26], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "str d28, [x19], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x9, #2, 112f\n"
- "st1 { v16.s }[2], [x26], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
- "st1 { v28.s }[2], [x19], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
"tbz x9, #1, 111f\n"
- "st1 { v16.h }[6], [x26], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
- "st1 { v28.h }[6], [x19], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[14], [x26]\n"
- "st1 { v20.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
- "st1 { v28.b }[14], [x19]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 118f\n"
"111:" // Height 4: Partial direct writeback: partial_1_12
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[12], [x26]\n"
- "st1 { v20.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
- "st1 { v28.b }[12], [x19]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 118f\n"
"112:" // Height 4: Partial direct writeback: partial_2_8
"tbz x9, #1, 113f\n"
- "st1 { v16.h }[4], [x26], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
- "st1 { v28.h }[4], [x19], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[10], [x26]\n"
- "st1 { v20.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
- "st1 { v28.b }[10], [x19]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 118f\n"
"113:" // Height 4: Partial direct writeback: partial_1_8
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[8], [x26]\n"
- "st1 { v20.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
- "st1 { v28.b }[8], [x19]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 118f\n"
"114:" // Height 4: Partial direct writeback: partial_4_0
"tbz x9, #2, 116f\n"
- "str s16, [x26], #0x4\n"
- "str s20, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
- "str s28, [x19], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
"tbz x9, #1, 115f\n"
- "st1 { v16.h }[2], [x26], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
- "st1 { v28.h }[2], [x19], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[6], [x26]\n"
- "st1 { v20.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
- "st1 { v28.b }[6], [x19]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 118f\n"
"115:" // Height 4: Partial direct writeback: partial_1_4
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[4], [x26]\n"
- "st1 { v20.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
- "st1 { v28.b }[4], [x19]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 118f\n"
"116:" // Height 4: Partial direct writeback: partial_2_0
"tbz x9, #1, 117f\n"
- "str h16, [x26], #0x2\n"
- "str h20, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
- "str h28, [x19], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[2], [x26]\n"
- "st1 { v20.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
- "st1 { v28.b }[2], [x19]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 118f\n"
"117:" // Height 4: Partial direct writeback: partial_1_0
- "str b16, [x26, #0x0]\n"
- "str b20, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
- "str b28, [x19, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"118:" // Height 4: Partial direct writeback: Done
"b 120f\n"
"119:" // Height 4: Full writeback
- "str q16, [x26, #0x0]\n"
- "add x26, x26, #0x10\n"
- "str q20, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
- "str q28, [x19, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"120:" // Height 4: Writeback done
"subs x9, x9, #0x10\n"
"bgt 92b\n"
"subs %x[M], %x[M], #0x4\n"
"beq 122f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 121f\n"
- "add x20, x20, #0x4\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"121:" // Update direct input
- "mov x19, #0x4\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x4\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"122:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
new file mode 100644
index 0000000000..55ea68d1b5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
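+// Note: ARGLIST mirrors the parameter list of the kernel definition in
+// generic.cpp: num_strings, string_lengths, A_arg, M, N, B_ptr, output_arg,
+// qp, col_bias and a trailing unused unsigned int.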
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_s8qa_mmla_4x16( ARGLIST );
+
+class cls_a64_hybrid_s8qa_mmla_4x16
+{
+public:
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+    static constexpr unsigned int out_width()
+    {
+        return 16;
+    }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
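+    // Note: the 4x16 tile and k_unroll of 8 follow from the SMMLA shape:
+    // each instruction accumulates a 2x2 int32 block from a 2x8 by 8x2
+    // int8 multiply, so K is consumed eight bytes at a time.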
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 16, 8> transforms = {};
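+    // Note: these per-CPU figures appear to be throughput estimates used by
+    // the kernel selection heuristics; higher values favour this kernel on
+    // that core (the exact units are a detail of the selector, not
+    // guaranteed here).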
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 47.74 };
+ case CPUModel::A510:
+ return { 27.99 };
+ case CPUModel::V1:
+ return { 62.26 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_s8qa_mmla_4x16;
+ cls_a64_hybrid_s8qa_mmla_4x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
new file mode 100644
index 0000000000..883bd5afdd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
@@ -0,0 +1,2098 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8qa_mmla_4x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int /* trailing argument unnamed and unused in this kernel */
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
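+    // Flag bits consumed by the assembly below:
+    //   bit 2 (0x4)  - output is indirect
+    //   bit 3 (0x8)  - input is indirect ("tbz %x[flags], #3, ...")
+    //   bit 5 (0x20) - take the shift-correction path ("tbz %x[flags], #5, ...")
+    //   bit 31       - cleared at the start of each height case; once set
+    //                  ("orr ... #0x80000000") the row-sum accumulation is
+    //                  skipped on later column blocks ("tbnz ... #31, ...")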
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 97f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 65f\n"
+ "beq 33f\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v11.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x26, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "cbnz x26, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x25, #0x10\n"
+ "blt 11f\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "cmp x25, #0x20\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ "blt 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v27.2d\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "ldr q25, [x28, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v27.2d\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "ldr q24, [x28, #0x80]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ "ldr q30, [x28, #0x90]\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ "ldr q29, [x28, #0xa0]\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ "ldr q28, [x28, #0xb0]\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ "ldr q27, [x28, #0xc0]\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
+ ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n"
+ ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n"
+ ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n"
+ "tbnz %x[flags], #31, 8f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "bge 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v24.2d\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "ldr q25, [x28, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v24.2d\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "ldr q24, [x28, #0x80]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ "ldr q30, [x28, #0x90]\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ "ldr q29, [x28, #0xa0]\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ "ldr q28, [x28, #0xb0]\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ "ldr q27, [x28, #0xc0]\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
+ "sub x25, x25, #0x10\n"
+ ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n"
+ ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n"
+ "add x24, x24, #0x10\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n"
+ ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n"
+ ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n"
+ "tbnz %x[flags], #31, 10f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ "10:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "11:" // Height 1: Multiply loop: Main loop skip
+ "cbz x25, 20f\n"
+ "cmp x25, #0x8\n"
+ "blt 14f\n"
+ "12:" // Height 1: Multiply loop: Odd block loop
+ "ldr d25, [x24], #0x8\n"
+ "trn1 v0.2d, v25.2d, v24.2d\n"
+ "tbnz %x[flags], #31, 13f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "13:" // Height 1: Multiply loop: unique 3: skip row sum
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n"
+ "sub x25, x25, #0x8\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ "cmp x25, #0x8\n"
+ ".inst 0x4e9aa414 // smmla v20.4s, v0.16b, v26.16b\n"
+ "ldr q27, [x28, #0x40]\n"
+ "ldr q26, [x28, #0x50]\n"
+ ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n"
+ ".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n"
+ ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
+ "bge 12b\n"
+ "14:" // Height 1: Multiply loop: Skip odd blocks
+ "cbz x25, 20f\n"
+ "tbz x25, #2, 16f\n"
+ "ldr s1, [x24], #0x4\n"
+ "tbz x25, #1, 15f\n"
+ "ld1 { v1.h }[2], [x24], #0x2\n"
+ "tbz x25, #0, 18f\n"
+ "ld1 { v1.b }[6], [x24]\n"
+ "b 18f\n"
+ "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x25, #0, 18f\n"
+ "ld1 { v1.b }[4], [x24]\n"
+ "b 18f\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x25, #1, 17f\n"
+ "ldr h1, [x24], #0x2\n"
+ "tbz x25, #0, 18f\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "b 18f\n"
+ "17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x24, #0x0]\n"
+ "18:" // Height 1: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v24.2d\n"
+ "tbnz %x[flags], #31, 19f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "19:" // Height 1: Multiply loop: unique 4: skip row sum
+ "ldr q25, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x10]\n"
+ ".inst 0x4e99a410 // smmla v16.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a414 // smmla v20.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x50]\n"
+ ".inst 0x4e99a412 // smmla v18.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a416 // smmla v22.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
+ "20:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 4b\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "mov v23.16b, v16.16b\n"
+ "tbnz %x[flags], #31, 21f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "neg v16.4s, v16.4s\n"
+ "dup v11.4s, v11.s[0]\n"
+ "mul v11.4s, v11.4s, v16.4s\n"
+ "21:" // Height 1: skip row sum fixup
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q22, [x10, #0x10]\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q21, [x10, #0x20]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v16.4s\n"
+ "add x10, x10, #0x40\n"
+ "sqrdmulh v17.4s, v17.4s, v16.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v16.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v16.4s\n"
+ "tbz %x[flags], #5, 22f\n"
+ "and v22.16b, v23.16b, v0.16b\n"
+ "and v21.16b, v17.16b, v0.16b\n"
+ "and v20.16b, v18.16b, v0.16b\n"
+ "and v16.16b, v19.16b, v0.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v22.4s\n"
+ "sqadd v17.4s, v17.4s, v21.4s\n"
+ "sqadd v18.4s, v18.4s, v20.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "22:" // Height 1: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v23.4s, v23.4s, v21.4s\n"
+ "add v17.4s, v17.4s, v21.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v21.4s\n"
+ "cmp x9, #0x10\n"
+ "smin v23.4s, v23.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
+ "smax v23.4s, v23.4s, v16.4s\n"
+ "smax v17.4s, v17.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "uzp1 v23.8h, v23.8h, v17.8h\n"
+ "uzp1 v16.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.16b, v23.16b, v16.16b\n"
+ "bge 31f\n"
+ "tbz x9, #3, 26f\n"
+ "str d23, [x27], #0x8\n"
+ "tbz x9, #2, 24f\n"
+ "st1 { v23.s }[2], [x27], #0x4\n"
+ "tbz x9, #1, 23f\n"
+ "st1 { v23.h }[6], [x27], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[14], [x27]\n"
+ "b 30f\n"
+ "23:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[12], [x27]\n"
+ "b 30f\n"
+ "24:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 25f\n"
+ "st1 { v23.h }[4], [x27], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[10], [x27]\n"
+ "b 30f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[8], [x27]\n"
+ "b 30f\n"
+ "26:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 28f\n"
+ "str s23, [x27], #0x4\n"
+ "tbz x9, #1, 27f\n"
+ "st1 { v23.h }[2], [x27], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[6], [x27]\n"
+ "b 30f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[4], [x27]\n"
+ "b 30f\n"
+ "28:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 29f\n"
+ "str h23, [x27], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[2], [x27]\n"
+ "b 30f\n"
+ "29:" // Height 1: Partial direct writeback: partial_1_0
+ "str b23, [x27, #0x0]\n"
+ "30:" // Height 1: Partial direct writeback: Done
+ "b 32f\n"
+ "31:" // Height 1: Full writeback
+ "str q23, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "32:" // Height 1: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 2b\n"
+ "b 130f\n"
+ "33:" // Height 2
+ "mov x10, %x[col_bias]\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "34:" // Height 2: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "35:" // Height 2: setup done
+ "mov x26, #0x0\n"
+ "36:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 37f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "cbnz x26, 38f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "b 38f\n"
+ "37:" // Height 2: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "38:" // Height 2: input setup done
+ "cmp x25, #0x10\n"
+ "blt 43f\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "cmp x25, #0x20\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ "blt 41f\n"
+ "39:" // Height 2: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "ldr q25, [x28, #0x70]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "ldr q24, [x28, #0x80]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ "ldr q30, [x28, #0x90]\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ "ldr q29, [x28, #0xa0]\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ "ldr q28, [x28, #0xb0]\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ "ldr q27, [x28, #0xc0]\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
+ ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n"
+ ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n"
+ "tbnz %x[flags], #31, 40f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ "40:" // Height 2: Multiply loop: unique 5: skip row sum
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "bge 39b\n"
+ "41:" // Height 2: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "ldr q25, [x28, #0x70]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "ldr q24, [x28, #0x80]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ "ldr q30, [x28, #0x90]\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ "ldr q29, [x28, #0xa0]\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ "ldr q28, [x28, #0xb0]\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ "ldr q27, [x28, #0xc0]\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
+ "sub x25, x25, #0x10\n"
+ ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n"
+ ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n"
+ ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n"
+ "tbnz %x[flags], #31, 42f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ "42:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "43:" // Height 2: Multiply loop: Main loop skip
+ "cbz x25, 52f\n"
+ "cmp x25, #0x8\n"
+ "blt 46f\n"
+ "44:" // Height 2: Multiply loop: Odd block loop
+ "ldr d25, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "trn1 v0.2d, v25.2d, v24.2d\n"
+ "tbnz %x[flags], #31, 45f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "45:" // Height 2: Multiply loop: unique 7: skip row sum
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n"
+ "sub x25, x25, #0x8\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ "cmp x25, #0x8\n"
+ ".inst 0x4e9aa414 // smmla v20.4s, v0.16b, v26.16b\n"
+ "ldr q27, [x28, #0x40]\n"
+ "ldr q26, [x28, #0x50]\n"
+ ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n"
+ ".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n"
+ ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
+ "bge 44b\n"
+ "46:" // Height 2: Multiply loop: Skip odd blocks
+ "cbz x25, 52f\n"
+ "tbz x25, #2, 48f\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "tbz x25, #1, 47f\n"
+ "ld1 { v1.h }[2], [x24], #0x2\n"
+ "ld1 { v2.h }[2], [x23], #0x2\n"
+ "tbz x25, #0, 50f\n"
+ "ld1 { v1.b }[6], [x24]\n"
+ "ld1 { v2.b }[6], [x23]\n"
+ "b 50f\n"
+ "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x25, #0, 50f\n"
+ "ld1 { v1.b }[4], [x24]\n"
+ "ld1 { v2.b }[4], [x23]\n"
+ "b 50f\n"
+ "48:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x25, #1, 49f\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "tbz x25, #0, 50f\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "b 50f\n"
+ "49:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "50:" // Height 2: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "tbnz %x[flags], #31, 51f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "51:" // Height 2: Multiply loop: unique 8: skip row sum
+ "ldr q25, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x10]\n"
+ ".inst 0x4e99a410 // smmla v16.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a414 // smmla v20.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x50]\n"
+ ".inst 0x4e99a412 // smmla v18.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a416 // smmla v22.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
+ "52:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 36b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v24.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "mov v23.16b, v24.16b\n"
+ "tbnz %x[flags], #31, 53f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "neg v24.4s, v24.4s\n"
+ "dup v12.4s, v11.s[3]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "mul v11.4s, v11.4s, v24.4s\n"
+ "mul v12.4s, v12.4s, v24.4s\n"
+ "53:" // Height 2: skip row sum fixup
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q27, [x10, #0x10]\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "ldr q26, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "add v17.4s, v17.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v17.4s, v17.4s, v27.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v25.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "tbz %x[flags], #5, 54f\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
+ "and v30.16b, v20.16b, v0.16b\n"
+ "and v29.16b, v21.16b, v0.16b\n"
+ "and v28.16b, v22.16b, v0.16b\n"
+ "and v27.16b, v16.16b, v0.16b\n"
+ "and v26.16b, v17.16b, v0.16b\n"
+ "and v25.16b, v18.16b, v0.16b\n"
+ "and v24.16b, v19.16b, v0.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "sqadd v22.4s, v22.4s, v28.4s\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sqadd v17.4s, v17.4s, v26.4s\n"
+ "sqadd v18.4s, v18.4s, v25.4s\n"
+ "sqadd v19.4s, v19.4s, v24.4s\n"
+ "54:" // Height 2: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
+ "add v23.4s, v23.4s, v26.4s\n"
+ "add v20.4s, v20.4s, v26.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v16.4s, v16.4s, v26.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v26.4s\n"
+ "smin v23.4s, v23.4s, v25.4s\n"
+ "smin v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v25.4s\n"
+ "smin v22.4s, v22.4s, v25.4s\n"
+ "smin v16.4s, v16.4s, v25.4s\n"
+ "smin v17.4s, v17.4s, v25.4s\n"
+ "smin v18.4s, v18.4s, v25.4s\n"
+ "smin v19.4s, v19.4s, v25.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "uzp1 v23.8h, v23.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.16b, v23.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 63f\n"
+ "tbz x9, #3, 58f\n"
+ "str d23, [x27], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x9, #2, 56f\n"
+ "st1 { v23.s }[2], [x27], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "tbz x9, #1, 55f\n"
+ "st1 { v23.h }[6], [x27], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[14], [x27]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "b 62f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[12], [x27]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "b 62f\n"
+ "56:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 57f\n"
+ "st1 { v23.h }[4], [x27], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[10], [x27]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "b 62f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[8], [x27]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "b 62f\n"
+ "58:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 60f\n"
+ "str s23, [x27], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "tbz x9, #1, 59f\n"
+ "st1 { v23.h }[2], [x27], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[6], [x27]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "b 62f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[4], [x27]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "b 62f\n"
+ "60:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 61f\n"
+ "str h23, [x27], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[2], [x27]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "b 62f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_0
+ "str b23, [x27, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "62:" // Height 2: Partial direct writeback: Done
+ "b 64f\n"
+ "63:" // Height 2: Full writeback
+ "str q23, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "str q16, [x23, #0x0]\n"
+ "64:" // Height 2: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 34b\n"
+ "b 130f\n"
+ "65:" // Height 3
+ "mov x10, %x[col_bias]\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "66:" // Height 3: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "67:" // Height 3: setup done
+ "mov x26, #0x0\n"
+ "68:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 69f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x26, 70f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "b 70f\n"
+ "69:" // Height 3: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "70:" // Height 3: input setup done
+ "cmp x25, #0x10\n"
+ "blt 75f\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "cmp x25, #0x20\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "blt 73f\n"
+ "71:" // Height 3: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ "ldr q14, [x28, #0x70]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x28, #0x60]\n"
+ ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x4e85a413 // smmla v19.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45b // smmla v27.4s, v2.16b, v5.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e8ea417 // smmla v23.4s, v0.16b, v14.16b\n"
+ ".inst 0x4e8ea45f // smmla v31.4s, v2.16b, v14.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a478 // smmla v24.4s, v3.16b, v4.16b\n"
+ "ldr q4, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e88a479 // smmla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x4e86a436 // smmla v22.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e84a437 // smmla v23.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a47f // smmla v31.4s, v3.16b, v4.16b\n"
+ "tbnz %x[flags], #31, 72f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n"
+ "72:" // Height 3: Multiply loop: unique 9: skip row sum
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "bge 71b\n"
+ "73:" // Height 3: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ "ldr q14, [x28, #0x70]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x28, #0x60]\n"
+ ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ "sub x25, x25, #0x10\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e85a413 // smmla v19.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45b // smmla v27.4s, v2.16b, v5.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e8ea417 // smmla v23.4s, v0.16b, v14.16b\n"
+ ".inst 0x4e8ea45f // smmla v31.4s, v2.16b, v14.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a478 // smmla v24.4s, v3.16b, v4.16b\n"
+ "ldr q4, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e88a479 // smmla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x4e86a436 // smmla v22.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e84a437 // smmla v23.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a47f // smmla v31.4s, v3.16b, v4.16b\n"
+ "tbnz %x[flags], #31, 74f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n"
+ "74:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "75:" // Height 3: Multiply loop: Main loop skip
+ "cbz x25, 84f\n"
+ "cmp x25, #0x8\n"
+ "blt 78f\n"
+ "76:" // Height 3: Multiply loop: Odd block loop
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x22], #0x8\n"
+ "trn1 v2.2d, v1.2d, v2.2d\n"
+ "tbnz %x[flags], #31, 77f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "77:" // Height 3: Multiply loop: unique 11: skip row sum
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
+ ".inst 0x4e83a410 // smmla v16.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a458 // smmla v24.4s, v2.16b, v3.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ "sub x25, x25, #0x8\n"
+ "cmp x25, #0x8\n"
+ "ldr q5, [x28, #0x40]\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x4e81a414 // smmla v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n"
+ "bge 76b\n"
+ "78:" // Height 3: Multiply loop: Skip odd blocks
+ "cbz x25, 84f\n"
+ "tbz x25, #2, 80f\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "tbz x25, #1, 79f\n"
+ "ld1 { v1.h }[2], [x24], #0x2\n"
+ "ld1 { v2.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "tbz x25, #0, 82f\n"
+ "ld1 { v1.b }[6], [x24]\n"
+ "ld1 { v2.b }[6], [x23]\n"
+ "ld1 { v3.b }[6], [x22]\n"
+ "b 82f\n"
+ "79:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x25, #0, 82f\n"
+ "ld1 { v1.b }[4], [x24]\n"
+ "ld1 { v2.b }[4], [x23]\n"
+ "ld1 { v3.b }[4], [x22]\n"
+ "b 82f\n"
+ "80:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x25, #1, 81f\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "tbz x25, #0, 82f\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "ld1 { v3.b }[2], [x22]\n"
+ "b 82f\n"
+ "81:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "ldr b3, [x22, #0x0]\n"
+ "82:" // Height 3: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "tbnz %x[flags], #31, 83f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "83:" // Height 3: Multiply loop: unique 12: skip row sum
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q3, [x28, #0x10]\n"
+ ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e83a414 // smmla v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45c // smmla v28.4s, v2.16b, v3.16b\n"
+ "ldr q5, [x28, #0x40]\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n"
+ "84:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 68b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v0.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "mov v31.16b, v0.16b\n"
+ "tbnz %x[flags], #31, 85f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "neg v23.4s, v23.4s\n"
+ "dup v12.4s, v11.s[3]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "dup v13.4s, v13.s[0]\n"
+ "mul v11.4s, v11.4s, v23.4s\n"
+ "mul v12.4s, v12.4s, v23.4s\n"
+ "mul v13.4s, v13.4s, v23.4s\n"
+ "85:" // Height 3: skip row sum fixup
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q30, [x10, #0x10]\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "ldr q29, [x10, #0x20]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "add v17.4s, v17.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v30.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v28.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v23.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v23.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v23.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v23.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v23.4s\n"
+ "tbz %x[flags], #5, 86f\n"
+ "and v1.16b, v31.16b, v0.16b\n"
+ "and v30.16b, v20.16b, v0.16b\n"
+ "and v29.16b, v21.16b, v0.16b\n"
+ "and v28.16b, v22.16b, v0.16b\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "sqadd v20.4s, v20.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "sqadd v22.4s, v22.4s, v28.4s\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "and v3.16b, v17.16b, v0.16b\n"
+ "and v2.16b, v18.16b, v0.16b\n"
+ "and v1.16b, v19.16b, v0.16b\n"
+ "and v30.16b, v24.16b, v0.16b\n"
+ "and v29.16b, v25.16b, v0.16b\n"
+ "and v28.16b, v26.16b, v0.16b\n"
+ "and v23.16b, v27.16b, v0.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v3.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "sqadd v24.4s, v24.4s, v30.4s\n"
+ "sqadd v25.4s, v25.4s, v29.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "sqadd v27.4s, v27.4s, v23.4s\n"
+ "86:" // Height 3: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v29.4s\n"
+ "add v20.4s, v20.4s, v29.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v16.4s, v16.4s, v29.4s\n"
+ "add v17.4s, v17.4s, v29.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "smin v31.4s, v31.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
+ "smax v31.4s, v31.4s, v23.4s\n"
+ "smax v20.4s, v20.4s, v23.4s\n"
+ "smax v21.4s, v21.4s, v23.4s\n"
+ "smax v22.4s, v22.4s, v23.4s\n"
+ "smax v16.4s, v16.4s, v23.4s\n"
+ "smax v17.4s, v17.4s, v23.4s\n"
+ "smax v18.4s, v18.4s, v23.4s\n"
+ "smax v19.4s, v19.4s, v23.4s\n"
+ "smax v24.4s, v24.4s, v23.4s\n"
+ "smax v25.4s, v25.4s, v23.4s\n"
+ "smax v26.4s, v26.4s, v23.4s\n"
+ "smax v27.4s, v27.4s, v23.4s\n"
+ "uzp1 v31.8h, v31.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
+ "bge 95f\n"
+ "tbz x9, #3, 90f\n"
+ "str d31, [x27], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x9, #2, 88f\n"
+ "st1 { v31.s }[2], [x27], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x9, #1, 87f\n"
+ "st1 { v31.h }[6], [x27], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[14], [x27]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "b 94f\n"
+ "87:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[12], [x27]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "b 94f\n"
+ "88:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 89f\n"
+ "st1 { v31.h }[4], [x27], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[10], [x27]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "b 94f\n"
+ "89:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[8], [x27]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "b 94f\n"
+ "90:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 92f\n"
+ "str s31, [x27], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x9, #1, 91f\n"
+ "st1 { v31.h }[2], [x27], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[6], [x27]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "b 94f\n"
+ "91:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[4], [x27]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "b 94f\n"
+ "92:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 93f\n"
+ "str h31, [x27], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[2], [x27]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "b 94f\n"
+ "93:" // Height 3: Partial direct writeback: partial_1_0
+ "str b31, [x27, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "94:" // Height 3: Partial direct writeback: Done
+ "b 96f\n"
+ "95:" // Height 3: Full writeback
+ "str q31, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "str q16, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "96:" // Height 3: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 66b\n"
+ "b 130f\n"
+ "97:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x4\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "98:" // Height 4: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "99:" // Height 4: setup done
+ "mov x26, #0x0\n"
+ "100:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 101f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
+ "cbnz x26, 102f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
+ "b 102f\n"
+ "101:" // Height 4: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
+ "102:" // Height 4: input setup done
+ "cmp x25, #0x10\n"
+ "blt 107f\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "cmp x25, #0x20\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "blt 105f\n"
+ "103:" // Height 4: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e88a479 // smmla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 104f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n"
+ "104:" // Height 4: Multiply loop: unique 13: skip row sum
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "bge 103b\n"
+ "105:" // Height 4: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "sub x25, x25, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e88a479 // smmla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 106f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n"
+ "106:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "107:" // Height 4: Multiply loop: Main loop skip
+ "cbz x25, 116f\n"
+ "cmp x25, #0x8\n"
+ "blt 110f\n"
+ "108:" // Height 4: Multiply loop: Odd block loop
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v0.2d\n"
+ "ldr d2, [x22], #0x8\n"
+ "ldr d1, [x21], #0x8\n"
+ "trn1 v2.2d, v2.2d, v1.2d\n"
+ "tbnz %x[flags], #31, 109f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "109:" // Height 4: Multiply loop: unique 15: skip row sum
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
+ ".inst 0x4e83a410 // smmla v16.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a458 // smmla v24.4s, v2.16b, v3.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ "sub x25, x25, #0x8\n"
+ "cmp x25, #0x8\n"
+ "ldr q5, [x28, #0x40]\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x4e81a414 // smmla v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n"
+ "bge 108b\n"
+ "110:" // Height 4: Multiply loop: Skip odd blocks
+ "cbz x25, 116f\n"
+ "tbz x25, #2, 112f\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s9, [x21], #0x4\n"
+ "tbz x25, #1, 111f\n"
+ "ld1 { v1.h }[2], [x24], #0x2\n"
+ "ld1 { v2.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "tbz x25, #0, 114f\n"
+ "ld1 { v1.b }[6], [x24]\n"
+ "ld1 { v2.b }[6], [x23]\n"
+ "ld1 { v3.b }[6], [x22]\n"
+ "ld1 { v9.b }[6], [x21]\n"
+ "b 114f\n"
+ "111:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x25, #0, 114f\n"
+ "ld1 { v1.b }[4], [x24]\n"
+ "ld1 { v2.b }[4], [x23]\n"
+ "ld1 { v3.b }[4], [x22]\n"
+ "ld1 { v9.b }[4], [x21]\n"
+ "b 114f\n"
+ "112:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x25, #1, 113f\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h9, [x21], #0x2\n"
+ "tbz x25, #0, 114f\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "ld1 { v3.b }[2], [x22]\n"
+ "ld1 { v9.b }[2], [x21]\n"
+ "b 114f\n"
+ "113:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "ldr b3, [x22, #0x0]\n"
+ "ldr b9, [x21, #0x0]\n"
+ "114:" // Height 4: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v2.2d, v3.2d, v9.2d\n"
+ "tbnz %x[flags], #31, 115f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "115:" // Height 4: Multiply loop: unique 16: skip row sum
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q3, [x28, #0x10]\n"
+ ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e83a414 // smmla v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45c // smmla v28.4s, v2.16b, v3.16b\n"
+ "ldr q5, [x28, #0x40]\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n"
+ "116:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 100b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v0.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "add x21, x22, x20\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "mov v31.16b, v0.16b\n"
+ "tbnz %x[flags], #31, 117f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "neg v0.4s, v0.4s\n"
+ "dup v12.4s, v11.s[3]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "dup v14.4s, v13.s[3]\n"
+ "dup v13.4s, v13.s[0]\n"
+ "mul v11.4s, v11.4s, v0.4s\n"
+ "mul v12.4s, v12.4s, v0.4s\n"
+ "mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
+ "117:" // Height 4: skip row sum fixup
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q4, [x10, #0x10]\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "ldr q3, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "add v17.4s, v17.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v23.4s, v23.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v3.4s\n"
+ "add v30.4s, v30.4s, v2.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+ "tbz %x[flags], #5, 118f\n"
+ "and v2.16b, v31.16b, v0.16b\n"
+ "and v1.16b, v20.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v2.4s\n"
+ "sqadd v20.4s, v20.4s, v1.4s\n"
+ "and v7.16b, v21.16b, v0.16b\n"
+ "and v6.16b, v22.16b, v0.16b\n"
+ "and v5.16b, v16.16b, v0.16b\n"
+ "and v4.16b, v17.16b, v0.16b\n"
+ "and v3.16b, v18.16b, v0.16b\n"
+ "and v2.16b, v19.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v7.4s\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v3.4s\n"
+ "sqadd v19.4s, v19.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v1.4s\n"
+ "and v7.16b, v28.16b, v0.16b\n"
+ "and v6.16b, v29.16b, v0.16b\n"
+ "and v5.16b, v30.16b, v0.16b\n"
+ "and v4.16b, v24.16b, v0.16b\n"
+ "and v3.16b, v25.16b, v0.16b\n"
+ "and v2.16b, v26.16b, v0.16b\n"
+ "and v1.16b, v27.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v7.4s\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "sqadd v30.4s, v30.4s, v5.4s\n"
+ "sqadd v24.4s, v24.4s, v4.4s\n"
+ "sqadd v25.4s, v25.4s, v3.4s\n"
+ "sqadd v26.4s, v26.4s, v2.4s\n"
+ "sqadd v27.4s, v27.4s, v1.4s\n"
+ "118:" // Height 4: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "srshl v29.4s, v29.4s, v0.4s\n"
+ "srshl v30.4s, v30.4s, v0.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v3.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "smin v31.4s, v31.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v2.4s\n"
+ "smin v21.4s, v21.4s, v2.4s\n"
+ "smin v22.4s, v22.4s, v2.4s\n"
+ "smin v16.4s, v16.4s, v2.4s\n"
+ "smin v17.4s, v17.4s, v2.4s\n"
+ "smin v18.4s, v18.4s, v2.4s\n"
+ "smin v19.4s, v19.4s, v2.4s\n"
+ "smin v23.4s, v23.4s, v2.4s\n"
+ "smin v28.4s, v28.4s, v2.4s\n"
+ "smin v29.4s, v29.4s, v2.4s\n"
+ "smin v30.4s, v30.4s, v2.4s\n"
+ "smin v24.4s, v24.4s, v2.4s\n"
+ "smin v25.4s, v25.4s, v2.4s\n"
+ "smin v26.4s, v26.4s, v2.4s\n"
+ "smin v27.4s, v27.4s, v2.4s\n"
+ "smax v31.4s, v31.4s, v1.4s\n"
+ "smax v20.4s, v20.4s, v1.4s\n"
+ "smax v21.4s, v21.4s, v1.4s\n"
+ "smax v22.4s, v22.4s, v1.4s\n"
+ "smax v16.4s, v16.4s, v1.4s\n"
+ "smax v17.4s, v17.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v1.4s\n"
+ "smax v19.4s, v19.4s, v1.4s\n"
+ "smax v23.4s, v23.4s, v1.4s\n"
+ "smax v28.4s, v28.4s, v1.4s\n"
+ "smax v29.4s, v29.4s, v1.4s\n"
+ "smax v30.4s, v30.4s, v1.4s\n"
+ "smax v24.4s, v24.4s, v1.4s\n"
+ "smax v25.4s, v25.4s, v1.4s\n"
+ "smax v26.4s, v26.4s, v1.4s\n"
+ "smax v27.4s, v27.4s, v1.4s\n"
+ "uzp1 v31.8h, v31.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.8h, v23.8h, v28.8h\n"
+ "uzp1 v18.8h, v29.8h, v30.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v23.16b, v23.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
+ "bge 127f\n"
+ "tbz x9, #3, 122f\n"
+ "str d31, [x27], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x9, #2, 120f\n"
+ "st1 { v31.s }[2], [x27], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v23.s }[2], [x22], #0x4\n"
+ "st1 { v24.s }[2], [x21], #0x4\n"
+ "tbz x9, #1, 119f\n"
+ "st1 { v31.h }[6], [x27], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v23.h }[6], [x22], #0x2\n"
+ "st1 { v24.h }[6], [x21], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[14], [x27]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v23.b }[14], [x22]\n"
+ "st1 { v24.b }[14], [x21]\n"
+ "b 126f\n"
+ "119:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[12], [x27]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v23.b }[12], [x22]\n"
+ "st1 { v24.b }[12], [x21]\n"
+ "b 126f\n"
+ "120:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 121f\n"
+ "st1 { v31.h }[4], [x27], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v23.h }[4], [x22], #0x2\n"
+ "st1 { v24.h }[4], [x21], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[10], [x27]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v23.b }[10], [x22]\n"
+ "st1 { v24.b }[10], [x21]\n"
+ "b 126f\n"
+ "121:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[8], [x27]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v23.b }[8], [x22]\n"
+ "st1 { v24.b }[8], [x21]\n"
+ "b 126f\n"
+ "122:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 124f\n"
+ "str s31, [x27], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s23, [x22], #0x4\n"
+ "str s24, [x21], #0x4\n"
+ "tbz x9, #1, 123f\n"
+ "st1 { v31.h }[2], [x27], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v23.h }[2], [x22], #0x2\n"
+ "st1 { v24.h }[2], [x21], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[6], [x27]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v23.b }[6], [x22]\n"
+ "st1 { v24.b }[6], [x21]\n"
+ "b 126f\n"
+ "123:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[4], [x27]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v23.b }[4], [x22]\n"
+ "st1 { v24.b }[4], [x21]\n"
+ "b 126f\n"
+ "124:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 125f\n"
+ "str h31, [x27], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h23, [x22], #0x2\n"
+ "str h24, [x21], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[2], [x27]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v23.b }[2], [x22]\n"
+ "st1 { v24.b }[2], [x21]\n"
+ "b 126f\n"
+ "125:" // Height 4: Partial direct writeback: partial_1_0
+ "str b31, [x27, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b23, [x22, #0x0]\n"
+ "str b24, [x21, #0x0]\n"
+ "126:" // Height 4: Partial direct writeback: Done
+ "b 128f\n"
+ "127:" // Height 4: Full writeback
+ "str q31, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "str q16, [x23, #0x0]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q24, [x21, #0x0]\n"
+ "128:" // Height 4: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 98b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 130f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 129f\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "129:" // Update direct input
+ "mov x20, #0x4\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "130:" // Exit
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
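
For readers tracing the quantized epilogues above (the sqrdmulh/srshl/add/smin/smax/uzp1 runs that follow each "no shift correction" label): every 32-bit accumulator lane is scaled by per_layer_mul with a doubling, rounding, saturating high multiply, rounding-right-shifted via srshl with the negated per_layer_right_shift, offset by c_offset, clamped to [minval, maxval], and narrowed to bytes. A minimal scalar sketch of the same arithmetic, assuming the Requantize32 fields named in the operand list are plain int32 parameters and that the shift magnitude is passed as a non-negative count (the helper below is hypothetical, not part of arm_gemm):

    #include <algorithm>
    #include <cstdint>

    // Hypothetical scalar model of one lane of the vector epilogue above.
    static int8_t requantize_lane(int32_t acc, int32_t per_layer_mul, int32_t right_shift,
                                  int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // sqrdmulh: saturating, rounding, doubling high-half multiply.
        int64_t prod = (int64_t)acc * (int64_t)per_layer_mul + (1LL << 30);
        int32_t high = (int32_t)std::min<int64_t>(prod >> 31, INT32_MAX);
        // srshl by a negative amount acts as a rounding right shift; the
        // flags-bit-5 branch in the asm additionally nudges negative values
        // (the and/sshr/sqadd block) for exact-tie behaviour, omitted here.
        int32_t shifted = right_shift > 0
                        ? (int32_t)(((int64_t)high + (1LL << (right_shift - 1))) >> right_shift)
                        : high;
        int32_t out = shifted + c_offset;   // add with the c_offset broadcast
        out = std::min(out, maxval);        // smin
        out = std::max(out, minval);        // smax
        return (int8_t)out;                 // uzp1 narrowing steps
    }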
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
index eb5bdfe55c..2b7531d1e2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
@@ -44,7 +44,8 @@ void a64_hybrid_s8qs_dot_6x16_a55( ARGLIST );
class cls_a64_hybrid_s8qs_dot_6x16
{
public:
- typedef int8_t operand_type;
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
typedef int8_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,24 @@ public:
return false;
}
- StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 8.28 };
- default:
- return { 27.5482 };
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 7.5301 };
+ case CPUModel::A510:
+ return { 15.71 };
+ case CPUModel::V1:
+ return { 52.09 };
+ default:
+ return { 27.5482 };
+ }
}
+
+ return { 1.0 };
}
// Default to the generic kernel
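
The hunk above turns get_performance_parameters into a template on the operand type: int8_t instantiations pick a per-CPU throughput estimate (now including A510 and V1 figures), and anything else falls back to { 1.0 }. A self-contained analogue of the pattern, using the same constants (the surrounding names are simplified stand-ins, not the arm_gemm types):

    #include <cstdint>
    #include <type_traits>

    enum class CpuModel { A55r1, A510, V1, Generic };

    struct PerfEstimate { double value; };

    struct KernelTraits {
        // Templated on the operand type, as in the header change above.
        template <typename T>
        static PerfEstimate performance(CpuModel m)
        {
            if (std::is_same<T, int8_t>::value) {
                switch (m) {
                    case CpuModel::A55r1: return { 7.5301 };
                    case CpuModel::A510:  return { 15.71 };
                    case CpuModel::V1:    return { 52.09 };
                    default:              return { 27.5482 };
                }
            }
            return { 1.0 };  // non-int8 instantiations
        }
    };

    // e.g. KernelTraits::performance<int8_t>(CpuModel::A510).value == 15.71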
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
index 6e3a00ed72..38a57b0741 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -85,7 +85,6 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 136f\n"
@@ -95,11 +94,11 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"cmp %x[M], #0x2\n"
"bgt 55f\n"
"beq 28f\n"
- "ldr x6, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x8, %x[col_bias]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x16, %x[output_ptr]\n"
+ "mov x6, %x[col_bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"movi v8.4s, #0x0\n"
@@ -110,15 +109,15 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"mov x14, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
"ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
+ "add x20, x20, x21, LSL #3\n"
"ldr x12, [x20, #0x0]\n"
"cbnz x14, 6f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x20\n"
"b 6f\n"
"5:" // Height 1: setup direct input
"mov x12, %x[input_ptr]\n"
@@ -126,139 +125,139 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"cmp x13, #0x10\n"
"blt 9f\n"
"ldr q0, [x12, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
"cmp x13, #0x20\n"
+ "ldr q6, [x15, #0x0]\n"
+ "ldr q7, [x15, #0x10]\n"
"blt 8f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x15, #0x10]\n"
- "ldr x11, [x15, #0x18]\n"
- "add x12, x12, #0x10\n"
- "ldr d6, [x15, #0x20]\n"
- "sub x13, x13, #0x10\n"
- "ldr x10, [x15, #0x28]\n"
- "cmp x13, #0x20\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "ldr x11, [x15, #0x38]\n"
+ "ldr d17, [x15, #0x20]\n"
+ "ldr x20, [x15, #0x28]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr d6, [x15, #0x40]\n"
- "ldr x10, [x15, #0x48]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x15, #0x58]\n"
- "ldr x9, [x12, #0x8]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr d6, [x15, #0x60]\n"
- "ldr x10, [x15, #0x68]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x15, #0x78]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr d6, [x15, #0x80]\n"
- "ldr x10, [x15, #0x88]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x15, #0x98]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr d6, [x15, #0xa0]\n"
- "ldr x10, [x15, #0xa8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x15, #0xb8]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr d6, [x15, #0xc0]\n"
- "ldr x10, [x15, #0xc8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x15, #0xd8]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr d6, [x15, #0xe0]\n"
- "ldr x10, [x15, #0xe8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x15, #0xf8]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0xf0]\n"
+ "ldr d16, [x15, #0x30]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x38]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr d17, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr d16, [x15, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr d17, [x15, #0x60]\n"
+ "ldr x20, [x15, #0x68]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr d16, [x15, #0x70]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x78]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr d17, [x15, #0x80]\n"
+ "ldr x20, [x15, #0x88]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr d16, [x15, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr d17, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr d16, [x15, #0xb0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr d17, [x15, #0xc0]\n"
+ "ldr x20, [x15, #0xc8]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr d16, [x15, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr d17, [x15, #0xe0]\n"
+ "ldr x20, [x15, #0xe8]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr d16, [x15, #0xf0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xf8]\n"
+ "mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
"ldr d6, [x15, #0x0]\n"
- "ldr x10, [x15, #0x8]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x10\n"
+ "ldr x20, [x15, #0x8]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
- "mov v0.d[1], x9\n"
+ "sub x13, x13, #0x10\n"
+ "ldr d7, [x15, #0x10]\n"
+ "cmp x13, #0x20\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x15, #0x18]\n"
+ "mov v0.d[1], x21\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 7b\n"
"8:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- "ldr q6, [x15, #0x20]\n"
- "sub x13, x13, #0x10\n"
- "add x12, x12, #0x10\n"
+ "ldr q17, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x15, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x15, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x15, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x15, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x15, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x15, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x15, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x15, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x15, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x15, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x15, #0xf0]\n"
+ "add x12, x12, #0x10\n"
+ "sub x13, x13, #0x10\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
"prfm pldl1keep, [x12, #0x80]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- "ldr q6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
"9:" // Height 1: Multiply loop: Main loop skip
"cbz x13, 14f\n"
"cmp x13, #0x4\n"
"blt 11f\n"
"10:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
"sub x13, x13, #0x4\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q16, [x15, #0x0]\n"
+ ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x15, #0x10]\n"
+ ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n"
+ "ldr q17, [x15, #0x20]\n"
"cmp x13, #0x4\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q16, [x15, #0x30]\n"
+ ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
"bge 10b\n"
- "cbz x13, 14f\n"
"11:" // Height 1: Multiply loop: Skip odd blocks
+ "cbz x13, 14f\n"
"tbz x13, #1, 12f\n"
"ldr h0, [x12], #0x2\n"
"tbz x13, #0, 13f\n"
@@ -267,50 +266,50 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x12, #0x0]\n"
"13:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q16, [x15, #0x0]\n"
+ ".inst 0x4f80e208 // sdot v8.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x10]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x20]\n"
+ ".inst 0x4f80e20a // sdot v10.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x30]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
"14:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "cmp x14, x20\n"
"bne 4b\n"
- "prfm pstl1keep, [x16, #0x0]\n"
+ "ldr q16, [x6, #0x0]\n"
+ "add v8.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x6, #0x10]\n"
+ "add v9.4s, v9.4s, v16.4s\n"
+ "ldr q16, [x6, #0x20]\n"
+ "add v10.4s, v10.4s, v16.4s\n"
+ "ldr q16, [x6, #0x30]\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "add x6, x6, #0x40\n"
+ "tbz %x[flags], #4, 15f\n"
"ldr q0, [x8, #0x0]\n"
+ "ldr q4, [x7, #0x0]\n"
"ldr q1, [x8, #0x10]\n"
+ "ldr q5, [x7, #0x10]\n"
"ldr q2, [x8, #0x20]\n"
- "add v8.4s, v8.4s, v0.4s\n"
+ "ldr q6, [x7, #0x20]\n"
"ldr q3, [x8, #0x30]\n"
- "add v9.4s, v9.4s, v1.4s\n"
"add x8, x8, #0x40\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "tbz %x[flags], #4, 15f\n"
- "ldr q0, [x7, #0x0]\n"
- "ldr q4, [x6, #0x0]\n"
- "ldr q1, [x7, #0x10]\n"
- "ldr q5, [x6, #0x10]\n"
- "ldr q2, [x7, #0x20]\n"
- "ldr q6, [x6, #0x20]\n"
- "ldr q3, [x7, #0x30]\n"
+ "ldr q7, [x7, #0x30]\n"
"add x7, x7, #0x40\n"
- "ldr q7, [x6, #0x30]\n"
- "add x6, x6, #0x40\n"
"b 16f\n"
"15:" // Height 1: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x25]\n"
- "ld1r { v4.4s }, [x24]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
- "mov v2.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
"mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
@@ -320,104 +319,104 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"sqrdmulh v10.4s, v10.4s, v6.4s\n"
"sqrdmulh v11.4s, v11.4s, v7.4s\n"
"tbz %x[flags], #5, 17f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v19.16b, v8.16b, v0.16b\n"
+ "and v18.16b, v9.16b, v1.16b\n"
+ "and v17.16b, v10.16b, v2.16b\n"
+ "and v16.16b, v11.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v19.4s\n"
+ "sqadd v9.4s, v9.4s, v18.4s\n"
+ "sqadd v10.4s, v10.4s, v17.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
"17:" // Height 1: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
"srshl v10.4s, v10.4s, v2.4s\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "cmp x17, #0x10\n"
- "ld1r { v6.4s }, [x24]\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v16.4s\n"
+ "add v9.4s, v9.4s, v16.4s\n"
+ "add v10.4s, v10.4s, v16.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v16.4s\n"
+ "smin v11.4s, v11.4s, v16.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v16.8h, v10.8h, v11.8h\n"
+ "cmp x16, #0x10\n"
+ "uzp1 v8.16b, v8.16b, v16.16b\n"
"bge 26f\n"
- "tbz x17, #3, 21f\n"
- "str d8, [x16], #0x8\n"
- "tbz x17, #2, 19f\n"
- "st1 { v8.s }[2], [x16], #0x4\n"
- "tbz x17, #1, 18f\n"
- "st1 { v8.h }[6], [x16], #0x2\n"
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[14], [x16]\n"
+ "tbz x16, #3, 21f\n"
+ "str d8, [x17], #0x8\n"
+ "tbz x16, #2, 19f\n"
+ "st1 { v8.s }[2], [x17], #0x4\n"
+ "tbz x16, #1, 18f\n"
+ "st1 { v8.h }[6], [x17], #0x2\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[14], [x17]\n"
"b 25f\n"
"18:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[12], [x16]\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[12], [x17]\n"
"b 25f\n"
"19:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x17, #1, 20f\n"
- "st1 { v8.h }[4], [x16], #0x2\n"
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[10], [x16]\n"
+ "tbz x16, #1, 20f\n"
+ "st1 { v8.h }[4], [x17], #0x2\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[10], [x17]\n"
"b 25f\n"
"20:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[8], [x16]\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[8], [x17]\n"
"b 25f\n"
"21:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x17, #2, 23f\n"
- "str s8, [x16], #0x4\n"
- "tbz x17, #1, 22f\n"
- "st1 { v8.h }[2], [x16], #0x2\n"
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[6], [x16]\n"
+ "tbz x16, #2, 23f\n"
+ "str s8, [x17], #0x4\n"
+ "tbz x16, #1, 22f\n"
+ "st1 { v8.h }[2], [x17], #0x2\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[6], [x17]\n"
"b 25f\n"
"22:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[4], [x16]\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[4], [x17]\n"
"b 25f\n"
"23:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x17, #1, 24f\n"
- "str h8, [x16], #0x2\n"
- "tbz x17, #0, 25f\n"
- "st1 { v8.b }[2], [x16]\n"
+ "tbz x16, #1, 24f\n"
+ "str h8, [x17], #0x2\n"
+ "tbz x16, #0, 25f\n"
+ "st1 { v8.b }[2], [x17]\n"
"b 25f\n"
"24:" // Height 1: Partial direct writeback: partial_1_0
- "str b8, [x16, #0x0]\n"
+ "str b8, [x17, #0x0]\n"
"25:" // Height 1: Partial direct writeback: Done
"b 27f\n"
"26:" // Height 1: Full writeback
- "str q8, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
+ "str q8, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
"27:" // Height 1: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x16, x16, #0x10\n"
"bgt 2b\n"
"b 164f\n"
"28:" // Height 2
- "ldr x6, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x8, %x[col_bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x16, %x[output_ptr]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x6, %x[col_bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"29:" // Height 2: Column loop
"movi v8.4s, #0x0\n"
@@ -432,269 +431,269 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"mov x14, #0x0\n"
"31:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 32f\n"
"ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
+ "add x20, x20, x21, LSL #3\n"
"ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x8]\n"
"cbnz x14, 33f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
"b 33f\n"
"32:" // Height 2: setup direct input
"mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
+ "add x11, x12, x21\n"
"33:" // Height 2: input setup done
"cmp x13, #0x10\n"
"blt 36f\n"
"ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
"cmp x13, #0x20\n"
+ "ldr q1, [x11, #0x0]\n"
"ldr q6, [x15, #0x0]\n"
+ "ldr q7, [x15, #0x10]\n"
"blt 35f\n"
"34:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x18]\n"
- "ldr d6, [x15, #0x20]\n"
- "add x12, x12, #0x10\n"
- "ldr x10, [x15, #0x28]\n"
- "add x28, x28, #0x10\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "sub x13, x13, #0x10\n"
+ "ldr d17, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr x11, [x15, #0x38]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr d6, [x15, #0x40]\n"
- "ldr x10, [x15, #0x48]\n"
- "cmp x13, #0x20\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x15, #0x58]\n"
- "ldr x9, [x12, #0x8]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x15, #0x68]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr d6, [x15, #0x60]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x15, #0x78]\n"
- "ldr x27, [x28, #0x8]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr d7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x15, #0x88]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr d6, [x15, #0x80]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x15, #0x98]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr d7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x15, #0xa8]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr d6, [x15, #0xa0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x15, #0xb8]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr d7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x15, #0xc8]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr d6, [x15, #0xc0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x15, #0xd8]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr d7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr x10, [x15, #0xe8]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr d6, [x15, #0xe0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x15, #0xf8]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr d7, [x15, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr d16, [x15, #0x30]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr d17, [x15, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr x20, [x15, #0x48]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr d16, [x15, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr x21, [x15, #0x68]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr d17, [x15, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr x20, [x15, #0x78]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr d16, [x15, #0x70]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr d17, [x15, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr x20, [x15, #0x88]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr d16, [x15, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr x21, [x15, #0xa8]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr d17, [x15, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr d16, [x15, #0xb0]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr d17, [x15, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr x20, [x15, #0xc8]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr d16, [x15, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr x21, [x15, #0xe8]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr d17, [x15, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr x20, [x15, #0xf8]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr d16, [x15, #0xf0]\n"
+ "mov v17.d[1], x21\n"
+ "add x12, x12, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x11, x11, #0x10\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
"ldr d6, [x15, #0x0]\n"
- "mov v7.d[1], x11\n"
- "ldr x10, [x15, #0x8]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr x21, [x15, #0x8]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d1, [x28, #0x0]\n"
- "mov v0.d[1], x9\n"
- "mov v1.d[1], x27\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
+ "ldr d1, [x11, #0x0]\n"
+ "sub x13, x13, #0x10\n"
+ "ldr d7, [x15, #0x10]\n"
+ "cmp x13, #0x20\n"
+ "ldr x20, [x12, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v0.d[1], x20\n"
+ "ldr x20, [x15, #0x18]\n"
+ "mov v1.d[1], x21\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
"bge 34b\n"
"35:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "sub x13, x13, #0x10\n"
"add x12, x12, #0x10\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q17, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x15, #0x40]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q16, [x15, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "sub x13, x13, #0x10\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x15, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x15, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x15, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x15, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x15, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x15, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x15, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x15, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x15, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x15, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x15, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x15, #0xf0]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
"36:" // Height 2: Multiply loop: Main loop skip
"cbz x13, 41f\n"
"cmp x13, #0x4\n"
"blt 38f\n"
"37:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
+ "ldr s19, [x12], #0x4\n"
"sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
+ "ldr s18, [x11], #0x4\n"
"cmp x13, #0x4\n"
- "ldr q6, [x15, #0x0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q17, [x15, #0x0]\n"
+ ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n"
+ "ldr q16, [x15, #0x10]\n"
+ ".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n"
+ "ldr q17, [x15, #0x20]\n"
+ ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x15, #0x30]\n"
+ ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n"
"bge 37b\n"
- "cbz x13, 41f\n"
"38:" // Height 2: Multiply loop: Skip odd blocks
+ "cbz x13, 41f\n"
"tbz x13, #1, 39f\n"
"ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
+ "ldr h1, [x11], #0x2\n"
"tbz x13, #0, 40f\n"
"ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v1.b }[2], [x11]\n"
"b 40f\n"
"39:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
+ "ldr b1, [x11, #0x0]\n"
"40:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q17, [x15, #0x0]\n"
+ ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ "ldr q16, [x15, #0x10]\n"
+ ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x15, #0x20]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x15, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
"41:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "cmp x14, x20\n"
"bne 31b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x16, #0x0]\n"
+ "ldr q19, [x6, #0x0]\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "ldr q18, [x6, #0x10]\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "ldr q17, [x6, #0x20]\n"
+ "add v10.4s, v10.4s, v17.4s\n"
+ "ldr q16, [x6, #0x30]\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x17, x20\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "add v12.4s, v12.4s, v19.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v13.4s, v13.4s, v18.4s\n"
+ "add v14.4s, v14.4s, v17.4s\n"
+ "add v15.4s, v15.4s, v16.4s\n"
+ "add x6, x6, #0x40\n"
+ "tbz %x[flags], #4, 42f\n"
"ldr q0, [x8, #0x0]\n"
+ "ldr q4, [x7, #0x0]\n"
"ldr q1, [x8, #0x10]\n"
- "add x23, x16, x19\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
+ "ldr q5, [x7, #0x10]\n"
"ldr q2, [x8, #0x20]\n"
+ "ldr q6, [x7, #0x20]\n"
"ldr q3, [x8, #0x30]\n"
"add x8, x8, #0x40\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "tbz %x[flags], #4, 42f\n"
- "ldr q0, [x7, #0x0]\n"
- "ldr q4, [x6, #0x0]\n"
- "ldr q1, [x7, #0x10]\n"
- "ldr q5, [x6, #0x10]\n"
- "ldr q2, [x7, #0x20]\n"
- "ldr q6, [x6, #0x20]\n"
- "ldr q3, [x7, #0x30]\n"
+ "ldr q7, [x7, #0x30]\n"
"add x7, x7, #0x40\n"
- "ldr q7, [x6, #0x30]\n"
- "add x6, x6, #0x40\n"
"b 43f\n"
"42:" // Height 2: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x25]\n"
- "ld1r { v4.4s }, [x24]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
- "mov v2.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
"mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
@@ -708,30 +707,30 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"sqrdmulh v14.4s, v14.4s, v6.4s\n"
"sqrdmulh v15.4s, v15.4s, v7.4s\n"
"tbz %x[flags], #5, 44f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v19.16b, v8.16b, v0.16b\n"
+ "and v18.16b, v9.16b, v1.16b\n"
+ "and v17.16b, v10.16b, v2.16b\n"
+ "and v16.16b, v11.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v19.4s\n"
+ "sqadd v9.4s, v9.4s, v18.4s\n"
+ "sqadd v10.4s, v10.4s, v17.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
+ "and v19.16b, v12.16b, v0.16b\n"
+ "and v18.16b, v13.16b, v1.16b\n"
+ "and v17.16b, v14.16b, v2.16b\n"
+ "and v16.16b, v15.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v19.4s\n"
+ "sqadd v13.4s, v13.4s, v18.4s\n"
+ "sqadd v14.4s, v14.4s, v17.4s\n"
+ "sqadd v15.4s, v15.4s, v16.4s\n"
"44:" // Height 2: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
@@ -741,118 +740,118 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v13.4s, v13.4s, v1.4s\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "cmp x17, #0x10\n"
- "ld1r { v6.4s }, [x24]\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v16.4s\n"
+ "add v9.4s, v9.4s, v16.4s\n"
+ "add v10.4s, v10.4s, v16.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "add v12.4s, v12.4s, v16.4s\n"
+ "add v13.4s, v13.4s, v16.4s\n"
+ "add v14.4s, v14.4s, v16.4s\n"
+ "add v15.4s, v15.4s, v16.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v16.4s\n"
+ "smin v11.4s, v11.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v17.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v16.8h, v14.8h, v15.8h\n"
+ "cmp x16, #0x10\n"
+ "uzp1 v8.16b, v8.16b, v17.16b\n"
+ "uzp1 v12.16b, v12.16b, v16.16b\n"
"bge 53f\n"
- "tbz x17, #3, 48f\n"
- "str d8, [x16], #0x8\n"
- "str d12, [x23], #0x8\n"
- "tbz x17, #2, 46f\n"
- "st1 { v8.s }[2], [x16], #0x4\n"
- "st1 { v12.s }[2], [x23], #0x4\n"
- "tbz x17, #1, 45f\n"
- "st1 { v8.h }[6], [x16], #0x2\n"
- "st1 { v12.h }[6], [x23], #0x2\n"
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[14], [x16]\n"
- "st1 { v12.b }[14], [x23]\n"
+ "tbz x16, #3, 48f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "tbz x16, #2, 46f\n"
+ "st1 { v8.s }[2], [x17], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "tbz x16, #1, 45f\n"
+ "st1 { v8.h }[6], [x17], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[14], [x17]\n"
+ "st1 { v12.b }[14], [x25]\n"
"b 52f\n"
"45:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[12], [x16]\n"
- "st1 { v12.b }[12], [x23]\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[12], [x17]\n"
+ "st1 { v12.b }[12], [x25]\n"
"b 52f\n"
"46:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x17, #1, 47f\n"
- "st1 { v8.h }[4], [x16], #0x2\n"
- "st1 { v12.h }[4], [x23], #0x2\n"
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[10], [x16]\n"
- "st1 { v12.b }[10], [x23]\n"
+ "tbz x16, #1, 47f\n"
+ "st1 { v8.h }[4], [x17], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[10], [x17]\n"
+ "st1 { v12.b }[10], [x25]\n"
"b 52f\n"
"47:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[8], [x16]\n"
- "st1 { v12.b }[8], [x23]\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[8], [x17]\n"
+ "st1 { v12.b }[8], [x25]\n"
"b 52f\n"
"48:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x17, #2, 50f\n"
- "str s8, [x16], #0x4\n"
- "str s12, [x23], #0x4\n"
- "tbz x17, #1, 49f\n"
- "st1 { v8.h }[2], [x16], #0x2\n"
- "st1 { v12.h }[2], [x23], #0x2\n"
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[6], [x16]\n"
- "st1 { v12.b }[6], [x23]\n"
+ "tbz x16, #2, 50f\n"
+ "str s8, [x17], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "tbz x16, #1, 49f\n"
+ "st1 { v8.h }[2], [x17], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[6], [x17]\n"
+ "st1 { v12.b }[6], [x25]\n"
"b 52f\n"
"49:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[4], [x16]\n"
- "st1 { v12.b }[4], [x23]\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[4], [x17]\n"
+ "st1 { v12.b }[4], [x25]\n"
"b 52f\n"
"50:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x17, #1, 51f\n"
- "str h8, [x16], #0x2\n"
- "str h12, [x23], #0x2\n"
- "tbz x17, #0, 52f\n"
- "st1 { v8.b }[2], [x16]\n"
- "st1 { v12.b }[2], [x23]\n"
+ "tbz x16, #1, 51f\n"
+ "str h8, [x17], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "tbz x16, #0, 52f\n"
+ "st1 { v8.b }[2], [x17]\n"
+ "st1 { v12.b }[2], [x25]\n"
"b 52f\n"
"51:" // Height 2: Partial direct writeback: partial_1_0
- "str b8, [x16, #0x0]\n"
- "str b12, [x23, #0x0]\n"
+ "str b8, [x17, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
"52:" // Height 2: Partial direct writeback: Done
"b 54f\n"
"53:" // Height 2: Full writeback
- "str q8, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "str q12, [x23, #0x0]\n"
+ "str q8, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "str q12, [x25, #0x0]\n"
"54:" // Height 2: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x16, x16, #0x10\n"
"bgt 29b\n"
"b 164f\n"
"55:" // Height 3
- "ldr x6, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x8, %x[col_bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x16, %x[output_ptr]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x6, %x[col_bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"56:" // Height 3: Column loop
"movi v8.4s, #0x0\n"
@@ -871,330 +870,330 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"mov x14, #0x0\n"
"58:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
"ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
+ "add x20, x20, x21, LSL #3\n"
"ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
+ "ldr x11, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
"cbnz x14, 60f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"b 60f\n"
"59:" // Height 3: setup direct input
"mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
- "add x26, x28, x19\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
"60:" // Height 3: input setup done
"cmp x13, #0x10\n"
"blt 63f\n"
"ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
"cmp x13, #0x20\n"
- "ldr q2, [x26, #0x0]\n"
+ "ldr q1, [x11, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
"ldr q6, [x15, #0x0]\n"
+ "ldr q7, [x15, #0x10]\n"
"blt 62f\n"
"61:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x18]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr d6, [x15, #0x20]\n"
- "ldr x10, [x15, #0x28]\n"
- "add x12, x12, #0x10\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "ldr x11, [x15, #0x38]\n"
- "add x28, x28, #0x10\n"
+ "ldr d21, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "mov v21.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr x21, [x15, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr d7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr x10, [x15, #0x48]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x9, [x12, #0x8]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x15, #0x40]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x58]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0x50]\n"
- "sub x13, x13, #0x10\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x15, #0x68]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x27, [x28, #0x8]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x15, #0x60]\n"
- "cmp x13, #0x20\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr x11, [x15, #0x78]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x25, [x26, #0x8]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x15, #0x88]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr x11, [x15, #0x98]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x15, #0xa8]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x15, #0xb8]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x15, #0xc8]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x15, #0xd8]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr x10, [x15, #0xe8]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr x11, [x15, #0xf8]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0xf0]\n"
+ "ldr d20, [x15, #0x30]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr d21, [x15, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ "ldr x21, [x15, #0x68]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr d20, [x15, #0x50]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ "ldr x20, [x15, #0x78]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr d21, [x15, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0x88]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr d20, [x15, #0x70]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ "ldr x20, [x15, #0x98]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr d21, [x15, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0xa8]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr d20, [x15, #0x90]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr d21, [x15, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xc8]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr d20, [x15, #0xb0]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xd8]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr d21, [x15, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xe8]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr d20, [x15, #0xd0]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ "ldr x20, [x15, #0xf8]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr d21, [x15, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ "add x12, x12, #0x10\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr d20, [x15, #0xf0]\n"
+ "mov v20.d[1], x20\n"
+ "add x11, x11, #0x10\n"
+ "add x10, x10, #0x10\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr x10, [x15, #0x8]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
+ "ldr x20, [x15, #0x8]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ "ldr x23, [x12, #0x8]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
"ldr d6, [x15, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x28, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- "mov v0.d[1], x9\n"
- "ldr d2, [x26, #0x0]\n"
- "mov v1.d[1], x27\n"
- "mov v2.d[1], x25\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ "ldr d1, [x11, #0x0]\n"
+ "ldr x22, [x11, #0x8]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
+ "ldr d2, [x10, #0x0]\n"
+ "sub x13, x13, #0x10\n"
+ "ldr d7, [x15, #0x10]\n"
+ "cmp x13, #0x20\n"
+ "ldr x21, [x10, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x15, #0x18]\n"
+ "mov v0.d[1], x23\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "mov v7.d[1], x20\n"
"bge 61b\n"
"62:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q21, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x12, x12, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "sub x13, x13, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q20, [x15, #0x30]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x15, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x15, #0x50]\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x15, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x15, #0x70]\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x15, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x15, #0x90]\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x15, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x15, #0xb0]\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x15, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x15, #0xd0]\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x15, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x15, #0xf0]\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
"63:" // Height 3: Multiply loop: Main loop skip
"cbz x13, 68f\n"
"cmp x13, #0x4\n"
"blt 65f\n"
"64:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
+ "ldr s24, [x12], #0x4\n"
"sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
+ "ldr s23, [x11], #0x4\n"
"cmp x13, #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s22, [x10], #0x4\n"
+ "ldr q21, [x15, #0x0]\n"
+ ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n"
+ "ldr q20, [x15, #0x10]\n"
+ ".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n"
+ "ldr q21, [x15, #0x20]\n"
+ ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n"
+ "ldr q20, [x15, #0x30]\n"
+ ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n"
+ ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n"
"bge 64b\n"
- "cbz x13, 68f\n"
"65:" // Height 3: Multiply loop: Skip odd blocks
+ "cbz x13, 68f\n"
"tbz x13, #1, 66f\n"
"ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
+ "ldr h1, [x11], #0x2\n"
+ "ldr h2, [x10], #0x2\n"
"tbz x13, #0, 67f\n"
"ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x11]\n"
+ "ld1 { v2.b }[2], [x10]\n"
"b 67f\n"
"66:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
+ "ldr b1, [x11, #0x0]\n"
+ "ldr b2, [x10, #0x0]\n"
"67:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q21, [x15, #0x0]\n"
+ ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n"
+ "ldr q20, [x15, #0x10]\n"
+ ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x15, #0x20]\n"
+ ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x15, #0x30]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
"68:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "cmp x14, x20\n"
"bne 58b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x16, #0x0]\n"
+ "ldr q23, [x6, #0x0]\n"
+ "add v8.4s, v8.4s, v23.4s\n"
+ "ldr q22, [x6, #0x10]\n"
+ "add v9.4s, v9.4s, v22.4s\n"
+ "ldr q21, [x6, #0x20]\n"
+ "add v10.4s, v10.4s, v21.4s\n"
+ "ldr q20, [x6, #0x30]\n"
+ "add v11.4s, v11.4s, v20.4s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x17, x20\n"
+ "add x24, x25, x20\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v12.4s, v12.4s, v23.4s\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add v13.4s, v13.4s, v22.4s\n"
+ "add v14.4s, v14.4s, v21.4s\n"
+ "add v15.4s, v15.4s, v20.4s\n"
+ "add v16.4s, v16.4s, v23.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x6, x6, #0x40\n"
+ "tbz %x[flags], #4, 69f\n"
"ldr q0, [x8, #0x0]\n"
+ "ldr q4, [x7, #0x0]\n"
"ldr q1, [x8, #0x10]\n"
- "add x23, x16, x19\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "prfm pstl1keep, [x22, #0x0]\n"
+ "ldr q5, [x7, #0x10]\n"
"ldr q2, [x8, #0x20]\n"
+ "ldr q6, [x7, #0x20]\n"
"ldr q3, [x8, #0x30]\n"
"add x8, x8, #0x40\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "tbz %x[flags], #4, 69f\n"
- "ldr q0, [x7, #0x0]\n"
- "ldr q4, [x6, #0x0]\n"
- "ldr q1, [x7, #0x10]\n"
- "ldr q5, [x6, #0x10]\n"
- "ldr q2, [x7, #0x20]\n"
- "ldr q6, [x6, #0x20]\n"
- "ldr q3, [x7, #0x30]\n"
+ "ldr q7, [x7, #0x30]\n"
"add x7, x7, #0x40\n"
- "ldr q7, [x6, #0x30]\n"
- "add x6, x6, #0x40\n"
"b 70f\n"
"69:" // Height 3: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x25]\n"
- "ld1r { v4.4s }, [x24]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
- "mov v2.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
"mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
@@ -1212,42 +1211,42 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"sqrdmulh v18.4s, v18.4s, v6.4s\n"
"sqrdmulh v19.4s, v19.4s, v7.4s\n"
"tbz %x[flags], #5, 71f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v23.16b, v8.16b, v0.16b\n"
+ "and v22.16b, v9.16b, v1.16b\n"
+ "and v21.16b, v10.16b, v2.16b\n"
+ "and v20.16b, v11.16b, v3.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sqadd v9.4s, v9.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sqadd v11.4s, v11.4s, v20.4s\n"
+ "and v23.16b, v12.16b, v0.16b\n"
+ "and v22.16b, v13.16b, v1.16b\n"
+ "and v21.16b, v14.16b, v2.16b\n"
+ "and v20.16b, v15.16b, v3.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v23.4s\n"
+ "sqadd v13.4s, v13.4s, v22.4s\n"
+ "sqadd v14.4s, v14.4s, v21.4s\n"
+ "sqadd v15.4s, v15.4s, v20.4s\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v1.16b\n"
+ "and v21.16b, v18.16b, v2.16b\n"
+ "and v20.16b, v19.16b, v3.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
"71:" // Height 3: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
@@ -1261,149 +1260,149 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v17.4s, v17.4s, v1.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "cmp x17, #0x10\n"
- "ld1r { v6.4s }, [x24]\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v20.4s\n"
+ "add v9.4s, v9.4s, v20.4s\n"
+ "add v10.4s, v10.4s, v20.4s\n"
+ "add v11.4s, v11.4s, v20.4s\n"
+ "add v12.4s, v12.4s, v20.4s\n"
+ "add v13.4s, v13.4s, v20.4s\n"
+ "add v14.4s, v14.4s, v20.4s\n"
+ "add v15.4s, v15.4s, v20.4s\n"
+ "add v16.4s, v16.4s, v20.4s\n"
+ "add v17.4s, v17.4s, v20.4s\n"
+ "add v18.4s, v18.4s, v20.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v20.4s\n"
+ "smin v9.4s, v9.4s, v20.4s\n"
+ "smin v10.4s, v10.4s, v20.4s\n"
+ "smin v11.4s, v11.4s, v20.4s\n"
+ "smin v12.4s, v12.4s, v20.4s\n"
+ "smin v13.4s, v13.4s, v20.4s\n"
+ "smin v14.4s, v14.4s, v20.4s\n"
+ "smin v15.4s, v15.4s, v20.4s\n"
+ "smin v16.4s, v16.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smax v8.4s, v8.4s, v20.4s\n"
+ "smax v9.4s, v9.4s, v20.4s\n"
+ "smax v10.4s, v10.4s, v20.4s\n"
+ "smax v11.4s, v11.4s, v20.4s\n"
+ "smax v12.4s, v12.4s, v20.4s\n"
+ "smax v13.4s, v13.4s, v20.4s\n"
+ "smax v14.4s, v14.4s, v20.4s\n"
+ "smax v15.4s, v15.4s, v20.4s\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v21.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v20.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
+ "cmp x16, #0x10\n"
+ "uzp1 v8.16b, v8.16b, v21.16b\n"
+ "uzp1 v12.16b, v12.16b, v20.16b\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 80f\n"
- "tbz x17, #3, 75f\n"
- "str d8, [x16], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "tbz x17, #2, 73f\n"
- "st1 { v8.s }[2], [x16], #0x4\n"
- "st1 { v12.s }[2], [x23], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
- "tbz x17, #1, 72f\n"
- "st1 { v8.h }[6], [x16], #0x2\n"
- "st1 { v12.h }[6], [x23], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[14], [x16]\n"
- "st1 { v12.b }[14], [x23]\n"
- "st1 { v16.b }[14], [x22]\n"
+ "tbz x16, #3, 75f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "tbz x16, #2, 73f\n"
+ "st1 { v8.s }[2], [x17], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "tbz x16, #1, 72f\n"
+ "st1 { v8.h }[6], [x17], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[14], [x17]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
"b 79f\n"
"72:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[12], [x16]\n"
- "st1 { v12.b }[12], [x23]\n"
- "st1 { v16.b }[12], [x22]\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[12], [x17]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
"b 79f\n"
"73:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x17, #1, 74f\n"
- "st1 { v8.h }[4], [x16], #0x2\n"
- "st1 { v12.h }[4], [x23], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[10], [x16]\n"
- "st1 { v12.b }[10], [x23]\n"
- "st1 { v16.b }[10], [x22]\n"
+ "tbz x16, #1, 74f\n"
+ "st1 { v8.h }[4], [x17], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[10], [x17]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
"b 79f\n"
"74:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[8], [x16]\n"
- "st1 { v12.b }[8], [x23]\n"
- "st1 { v16.b }[8], [x22]\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[8], [x17]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
"b 79f\n"
"75:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x17, #2, 77f\n"
- "str s8, [x16], #0x4\n"
- "str s12, [x23], #0x4\n"
- "str s16, [x22], #0x4\n"
- "tbz x17, #1, 76f\n"
- "st1 { v8.h }[2], [x16], #0x2\n"
- "st1 { v12.h }[2], [x23], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[6], [x16]\n"
- "st1 { v12.b }[6], [x23]\n"
- "st1 { v16.b }[6], [x22]\n"
+ "tbz x16, #2, 77f\n"
+ "str s8, [x17], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "tbz x16, #1, 76f\n"
+ "st1 { v8.h }[2], [x17], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[6], [x17]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
"b 79f\n"
"76:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[4], [x16]\n"
- "st1 { v12.b }[4], [x23]\n"
- "st1 { v16.b }[4], [x22]\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[4], [x17]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
"b 79f\n"
"77:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x17, #1, 78f\n"
- "str h8, [x16], #0x2\n"
- "str h12, [x23], #0x2\n"
- "str h16, [x22], #0x2\n"
- "tbz x17, #0, 79f\n"
- "st1 { v8.b }[2], [x16]\n"
- "st1 { v12.b }[2], [x23]\n"
- "st1 { v16.b }[2], [x22]\n"
+ "tbz x16, #1, 78f\n"
+ "str h8, [x17], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "tbz x16, #0, 79f\n"
+ "st1 { v8.b }[2], [x17]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
"b 79f\n"
"78:" // Height 3: Partial direct writeback: partial_1_0
- "str b8, [x16, #0x0]\n"
- "str b12, [x23, #0x0]\n"
- "str b16, [x22, #0x0]\n"
+ "str b8, [x17, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
"79:" // Height 3: Partial direct writeback: Done
"b 81f\n"
"80:" // Height 3: Full writeback
- "str q8, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "str q12, [x23, #0x0]\n"
- "str q16, [x22, #0x0]\n"
+ "str q8, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
"81:" // Height 3: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x16, x16, #0x10\n"
"bgt 56b\n"
"b 164f\n"
"82:" // Height 4
- "ldr x6, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x8, %x[col_bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x16, %x[output_ptr]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x6, %x[col_bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"83:" // Height 4: Column loop
"movi v8.4s, #0x0\n"
@@ -1426,391 +1425,391 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"mov x14, #0x0\n"
"85:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 86f\n"
"ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
+ "add x20, x20, x21, LSL #3\n"
"ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
+ "ldr x11, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x9, [x20, #0x18]\n"
"cbnz x14, 87f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "add x9, x9, x20\n"
"b 87f\n"
"86:" // Height 4: setup direct input
"mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
"87:" // Height 4: input setup done
"cmp x13, #0x10\n"
"blt 90f\n"
"ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
"cmp x13, #0x20\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
+ "ldr q1, [x11, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x9, #0x0]\n"
"ldr q6, [x15, #0x0]\n"
+ "ldr q7, [x15, #0x10]\n"
"blt 89f\n"
"88:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x18]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr x10, [x15, #0x28]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x15, #0x20]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x15, #0x38]\n"
"add x12, x12, #0x10\n"
- "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr d25, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "mov v25.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x21, [x15, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr x10, [x15, #0x48]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x9, [x12, #0x8]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr x11, [x15, #0x58]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x27, [x28, #0x8]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x15, #0x68]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr d6, [x15, #0x60]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x11, [x15, #0x78]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr x25, [x26, #0x8]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0x70]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x10, [x15, #0x88]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr d6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr x11, [x15, #0x98]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x23, [x24, #0x8]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr d7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x15, #0xa8]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr d24, [x15, #0x30]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr d25, [x15, #0x40]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ "ldr x21, [x15, #0x68]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr d24, [x15, #0x50]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ "ldr x20, [x15, #0x78]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ "ldr x25, [x12, #0x8]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr d25, [x15, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0x88]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr d24, [x15, #0x70]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ "ldr x20, [x15, #0x98]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr d25, [x15, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0xa8]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ "ldr x22, [x9, #0x8]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr d24, [x15, #0x90]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
"sub x13, x13, #0x10\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr d6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x15, #0xb8]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr d25, [x15, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xc8]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
"cmp x13, #0x20\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr d7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x15, #0xc8]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr d6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x15, #0xd8]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr d7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr x10, [x15, #0xe8]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr d6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr x11, [x15, #0xf8]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr d7, [x15, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr d24, [x15, #0xb0]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xd8]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr d25, [x15, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xe8]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr d24, [x15, #0xd0]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ "ldr x20, [x15, #0xf8]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr d25, [x15, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr d24, [x15, #0xf0]\n"
+ "mov v24.d[1], x20\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x10, [x15, #0x8]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ "ldr x21, [x15, #0x8]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ "ldr x20, [x15, #0x18]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
"ldr d6, [x15, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x28, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "mov v0.d[1], x9\n"
- "mov v1.d[1], x27\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "mov v2.d[1], x25\n"
- "mov v3.d[1], x23\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ "ldr d1, [x11, #0x0]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ "ldr d2, [x10, #0x0]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
+ "ldr d3, [x9, #0x0]\n"
+ "ldr d7, [x15, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x25\n"
+ "mov v1.d[1], x24\n"
+ "mov v2.d[1], x23\n"
+ "mov v3.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 88b\n"
"89:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x12, x12, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr q25, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "sub x13, x13, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x15, #0x40]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q24, [x15, #0x30]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x15, #0x40]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x15, #0x50]\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x15, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x15, #0x70]\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x15, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x15, #0x90]\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x15, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x15, #0xb0]\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x15, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x15, #0xd0]\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x15, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x15, #0xf0]\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
"90:" // Height 4: Multiply loop: Main loop skip
"cbz x13, 95f\n"
"cmp x13, #0x4\n"
"blt 92f\n"
"91:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
+ "ldr s29, [x12], #0x4\n"
"sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
+ "ldr s28, [x11], #0x4\n"
"cmp x13, #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s27, [x10], #0x4\n"
+ "ldr s26, [x9], #0x4\n"
+ "ldr q25, [x15, #0x0]\n"
+ ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n"
+ "ldr q24, [x15, #0x10]\n"
+ ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n"
+ "ldr q25, [x15, #0x20]\n"
+ ".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n"
+ "ldr q24, [x15, #0x30]\n"
+ ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n"
+ ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n"
"bge 91b\n"
- "cbz x13, 95f\n"
"92:" // Height 4: Multiply loop: Skip odd blocks
+ "cbz x13, 95f\n"
"tbz x13, #1, 93f\n"
"ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
+ "ldr h1, [x11], #0x2\n"
+ "ldr h2, [x10], #0x2\n"
+ "ldr h3, [x9], #0x2\n"
"tbz x13, #0, 94f\n"
"ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v1.b }[2], [x11]\n"
+ "ld1 { v2.b }[2], [x10]\n"
+ "ld1 { v3.b }[2], [x9]\n"
"b 94f\n"
"93:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
+ "ldr b1, [x11, #0x0]\n"
+ "ldr b2, [x10, #0x0]\n"
+ "ldr b3, [x9, #0x0]\n"
"94:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q25, [x15, #0x0]\n"
+ ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n"
+ "ldr q24, [x15, #0x10]\n"
+ ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x15, #0x20]\n"
+ ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x15, #0x30]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
"95:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "cmp x14, x20\n"
"bne 85b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x16, #0x0]\n"
+ "ldr q27, [x6, #0x0]\n"
+ "add v8.4s, v8.4s, v27.4s\n"
+ "ldr q26, [x6, #0x10]\n"
+ "add v9.4s, v9.4s, v26.4s\n"
+ "ldr q25, [x6, #0x20]\n"
+ "add v10.4s, v10.4s, v25.4s\n"
+ "ldr q24, [x6, #0x30]\n"
+ "add v11.4s, v11.4s, v24.4s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x17, x20\n"
+ "add x24, x25, x20\n"
+ "add x23, x24, x20\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "add v12.4s, v12.4s, v27.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v13.4s, v13.4s, v26.4s\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add v14.4s, v14.4s, v25.4s\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add v15.4s, v15.4s, v24.4s\n"
+ "add v16.4s, v16.4s, v27.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x6, x6, #0x40\n"
+ "tbz %x[flags], #4, 96f\n"
"ldr q0, [x8, #0x0]\n"
+ "ldr q4, [x7, #0x0]\n"
"ldr q1, [x8, #0x10]\n"
- "add x23, x16, x19\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19\n"
- "prfm pstl1keep, [x21, #0x0]\n"
+ "ldr q5, [x7, #0x10]\n"
"ldr q2, [x8, #0x20]\n"
+ "ldr q6, [x7, #0x20]\n"
"ldr q3, [x8, #0x30]\n"
"add x8, x8, #0x40\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "tbz %x[flags], #4, 96f\n"
- "ldr q0, [x7, #0x0]\n"
- "ldr q4, [x6, #0x0]\n"
- "ldr q1, [x7, #0x10]\n"
- "ldr q5, [x6, #0x10]\n"
- "ldr q2, [x7, #0x20]\n"
- "ldr q6, [x6, #0x20]\n"
- "ldr q3, [x7, #0x30]\n"
+ "ldr q7, [x7, #0x30]\n"
"add x7, x7, #0x40\n"
- "ldr q7, [x6, #0x30]\n"
- "add x6, x6, #0x40\n"
"b 97f\n"
"96:" // Height 4: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x25]\n"
- "ld1r { v4.4s }, [x24]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
- "mov v2.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
"mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
@@ -1832,54 +1831,54 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"sqrdmulh v22.4s, v22.4s, v6.4s\n"
"sqrdmulh v23.4s, v23.4s, v7.4s\n"
"tbz %x[flags], #5, 98f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v6.16b, v22.16b, v2.16b\n"
- "and v7.16b, v23.16b, v3.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
- "sqadd v23.4s, v23.4s, v7.4s\n"
+ "and v27.16b, v8.16b, v0.16b\n"
+ "and v26.16b, v9.16b, v1.16b\n"
+ "and v25.16b, v10.16b, v2.16b\n"
+ "and v24.16b, v11.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v27.4s\n"
+ "sqadd v9.4s, v9.4s, v26.4s\n"
+ "sqadd v10.4s, v10.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v24.4s\n"
+ "and v27.16b, v12.16b, v0.16b\n"
+ "and v26.16b, v13.16b, v1.16b\n"
+ "and v25.16b, v14.16b, v2.16b\n"
+ "and v24.16b, v15.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v27.4s\n"
+ "sqadd v13.4s, v13.4s, v26.4s\n"
+ "sqadd v14.4s, v14.4s, v25.4s\n"
+ "sqadd v15.4s, v15.4s, v24.4s\n"
+ "and v27.16b, v16.16b, v0.16b\n"
+ "and v26.16b, v17.16b, v1.16b\n"
+ "and v25.16b, v18.16b, v2.16b\n"
+ "and v24.16b, v19.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sqadd v17.4s, v17.4s, v26.4s\n"
+ "sqadd v18.4s, v18.4s, v25.4s\n"
+ "sqadd v19.4s, v19.4s, v24.4s\n"
+ "and v27.16b, v20.16b, v0.16b\n"
+ "and v26.16b, v21.16b, v1.16b\n"
+ "and v25.16b, v22.16b, v2.16b\n"
+ "and v24.16b, v23.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sqadd v21.4s, v21.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"98:" // Height 4: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
@@ -1897,180 +1896,180 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v21.4s, v21.4s, v1.4s\n"
"srshl v22.4s, v22.4s, v2.4s\n"
"srshl v23.4s, v23.4s, v3.4s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "cmp x17, #0x10\n"
- "ld1r { v6.4s }, [x24]\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v24.4s\n"
+ "add v9.4s, v9.4s, v24.4s\n"
+ "add v10.4s, v10.4s, v24.4s\n"
+ "add v11.4s, v11.4s, v24.4s\n"
+ "add v12.4s, v12.4s, v24.4s\n"
+ "add v13.4s, v13.4s, v24.4s\n"
+ "add v14.4s, v14.4s, v24.4s\n"
+ "add v15.4s, v15.4s, v24.4s\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v24.4s\n"
+ "add v18.4s, v18.4s, v24.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v24.4s\n"
+ "add v21.4s, v21.4s, v24.4s\n"
+ "add v22.4s, v22.4s, v24.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v24.4s\n"
+ "smin v9.4s, v9.4s, v24.4s\n"
+ "smin v10.4s, v10.4s, v24.4s\n"
+ "smin v11.4s, v11.4s, v24.4s\n"
+ "smin v12.4s, v12.4s, v24.4s\n"
+ "smin v13.4s, v13.4s, v24.4s\n"
+ "smin v14.4s, v14.4s, v24.4s\n"
+ "smin v15.4s, v15.4s, v24.4s\n"
+ "smin v16.4s, v16.4s, v24.4s\n"
+ "smin v17.4s, v17.4s, v24.4s\n"
+ "smin v18.4s, v18.4s, v24.4s\n"
+ "smin v19.4s, v19.4s, v24.4s\n"
+ "smin v20.4s, v20.4s, v24.4s\n"
+ "smin v21.4s, v21.4s, v24.4s\n"
+ "smin v22.4s, v22.4s, v24.4s\n"
+ "smin v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smax v8.4s, v8.4s, v24.4s\n"
+ "smax v9.4s, v9.4s, v24.4s\n"
+ "smax v10.4s, v10.4s, v24.4s\n"
+ "smax v11.4s, v11.4s, v24.4s\n"
+ "smax v12.4s, v12.4s, v24.4s\n"
+ "smax v13.4s, v13.4s, v24.4s\n"
+ "smax v14.4s, v14.4s, v24.4s\n"
+ "smax v15.4s, v15.4s, v24.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v25.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
+ "uzp1 v24.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v17.8h, v22.8h, v23.8h\n"
+ "cmp x16, #0x10\n"
+ "uzp1 v8.16b, v8.16b, v25.16b\n"
+ "uzp1 v12.16b, v12.16b, v24.16b\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 107f\n"
- "tbz x17, #3, 102f\n"
- "str d8, [x16], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "tbz x17, #2, 100f\n"
- "st1 { v8.s }[2], [x16], #0x4\n"
- "st1 { v12.s }[2], [x23], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "tbz x17, #1, 99f\n"
- "st1 { v8.h }[6], [x16], #0x2\n"
- "st1 { v12.h }[6], [x23], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[14], [x16]\n"
- "st1 { v12.b }[14], [x23]\n"
- "st1 { v16.b }[14], [x22]\n"
- "st1 { v20.b }[14], [x21]\n"
+ "tbz x16, #3, 102f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "tbz x16, #2, 100f\n"
+ "st1 { v8.s }[2], [x17], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "tbz x16, #1, 99f\n"
+ "st1 { v8.h }[6], [x17], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[14], [x17]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 106f\n"
"99:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[12], [x16]\n"
- "st1 { v12.b }[12], [x23]\n"
- "st1 { v16.b }[12], [x22]\n"
- "st1 { v20.b }[12], [x21]\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[12], [x17]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 106f\n"
"100:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x17, #1, 101f\n"
- "st1 { v8.h }[4], [x16], #0x2\n"
- "st1 { v12.h }[4], [x23], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[10], [x16]\n"
- "st1 { v12.b }[10], [x23]\n"
- "st1 { v16.b }[10], [x22]\n"
- "st1 { v20.b }[10], [x21]\n"
+ "tbz x16, #1, 101f\n"
+ "st1 { v8.h }[4], [x17], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[10], [x17]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 106f\n"
"101:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[8], [x16]\n"
- "st1 { v12.b }[8], [x23]\n"
- "st1 { v16.b }[8], [x22]\n"
- "st1 { v20.b }[8], [x21]\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[8], [x17]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 106f\n"
"102:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x17, #2, 104f\n"
- "str s8, [x16], #0x4\n"
- "str s12, [x23], #0x4\n"
- "str s16, [x22], #0x4\n"
- "str s20, [x21], #0x4\n"
- "tbz x17, #1, 103f\n"
- "st1 { v8.h }[2], [x16], #0x2\n"
- "st1 { v12.h }[2], [x23], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[6], [x16]\n"
- "st1 { v12.b }[6], [x23]\n"
- "st1 { v16.b }[6], [x22]\n"
- "st1 { v20.b }[6], [x21]\n"
+ "tbz x16, #2, 104f\n"
+ "str s8, [x17], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "tbz x16, #1, 103f\n"
+ "st1 { v8.h }[2], [x17], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[6], [x17]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 106f\n"
"103:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[4], [x16]\n"
- "st1 { v12.b }[4], [x23]\n"
- "st1 { v16.b }[4], [x22]\n"
- "st1 { v20.b }[4], [x21]\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[4], [x17]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 106f\n"
"104:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x17, #1, 105f\n"
- "str h8, [x16], #0x2\n"
- "str h12, [x23], #0x2\n"
- "str h16, [x22], #0x2\n"
- "str h20, [x21], #0x2\n"
- "tbz x17, #0, 106f\n"
- "st1 { v8.b }[2], [x16]\n"
- "st1 { v12.b }[2], [x23]\n"
- "st1 { v16.b }[2], [x22]\n"
- "st1 { v20.b }[2], [x21]\n"
+ "tbz x16, #1, 105f\n"
+ "str h8, [x17], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "tbz x16, #0, 106f\n"
+ "st1 { v8.b }[2], [x17]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 106f\n"
"105:" // Height 4: Partial direct writeback: partial_1_0
- "str b8, [x16, #0x0]\n"
- "str b12, [x23, #0x0]\n"
- "str b16, [x22, #0x0]\n"
- "str b20, [x21, #0x0]\n"
+ "str b8, [x17, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"106:" // Height 4: Partial direct writeback: Done
"b 108f\n"
"107:" // Height 4: Full writeback
- "str q8, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "str q12, [x23, #0x0]\n"
- "str q16, [x22, #0x0]\n"
- "str q20, [x21, #0x0]\n"
+ "str q8, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
"108:" // Height 4: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x16, x16, #0x10\n"
"bgt 83b\n"
"b 164f\n"
"109:" // Height 5
- "ldr x6, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x8, %x[col_bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x16, %x[output_ptr]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x6, %x[col_bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"110:" // Height 5: Column loop
"movi v8.4s, #0x0\n"
@@ -2097,452 +2096,452 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"mov x14, #0x0\n"
"112:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 113f\n"
"ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
+ "add x20, x20, x21, LSL #3\n"
"ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
+ "ldr x11, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x9, [x20, #0x18]\n"
+ "ldr x28, [x20, #0x20]\n"
"cbnz x14, 114f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
"b 114f\n"
"113:" // Height 5: setup direct input
"mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
+ "add x28, x9, x21\n"
"114:" // Height 5: input setup done
"cmp x13, #0x10\n"
"blt 117f\n"
"ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
"cmp x13, #0x20\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
+ "ldr q1, [x11, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x9, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
"ldr q6, [x15, #0x0]\n"
+ "ldr q7, [x15, #0x10]\n"
"blt 116f\n"
"115:" // Height 5: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x18]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr x10, [x15, #0x28]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
"add x12, x12, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "ldr d29, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "mov v29.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr d6, [x15, #0x20]\n"
+ "ldr x21, [x15, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr x11, [x15, #0x38]\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr x9, [x12, #0x8]\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x15, #0x30]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x10, [x15, #0x48]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x11, [x15, #0x58]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr d6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr x27, [x28, #0x8]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x25, [x26, #0x8]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x10, [x15, #0x68]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr d7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr x11, [x15, #0x78]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr d6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr x23, [x24, #0x8]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr d7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x15, #0x88]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x21, [x22, #0x8]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr x11, [x15, #0x98]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr d6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr d28, [x15, #0x30]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ "ldr x26, [x12, #0x8]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr d29, [x15, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ "ldr x21, [x15, #0x68]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ "ldr x25, [x11, #0x8]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ "ldr x24, [x10, #0x8]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr d28, [x15, #0x50]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ "ldr x20, [x15, #0x78]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ "ldr x23, [x9, #0x8]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ "ldr x22, [x28, #0x8]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr d29, [x15, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0x88]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
"sub x13, x13, #0x10\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
"cmp x13, #0x20\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr x10, [x15, #0xa8]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr d7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr x11, [x15, #0xb8]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr d6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr x10, [x15, #0xc8]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr d7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr x11, [x15, #0xd8]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr d6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr x10, [x15, #0xe8]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr d7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr x11, [x15, #0xf8]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr d6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr d7, [x15, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr d28, [x15, #0x70]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ "ldr x20, [x15, #0x98]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr d29, [x15, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0xa8]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr d28, [x15, #0x90]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr d29, [x15, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xc8]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr d28, [x15, #0xb0]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xd8]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr d29, [x15, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xe8]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr d28, [x15, #0xd0]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ "ldr x20, [x15, #0xf8]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr d29, [x15, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr d28, [x15, #0xf0]\n"
+ "mov v28.d[1], x20\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x10, [x15, #0x8]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ "ldr x21, [x15, #0x8]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ "ldr x20, [x15, #0x18]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
"ldr d6, [x15, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x28, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "mov v0.d[1], x9\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "mov v1.d[1], x27\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "ldr d4, [x22, #0x0]\n"
- "mov v2.d[1], x25\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ "ldr d1, [x11, #0x0]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ "ldr d2, [x10, #0x0]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ "ldr d3, [x9, #0x0]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
+ "ldr d4, [x28, #0x0]\n"
+ "ldr d7, [x15, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x26\n"
+ "mov v1.d[1], x25\n"
+ "mov v2.d[1], x24\n"
"mov v3.d[1], x23\n"
- "mov v4.d[1], x21\n"
+ "mov v4.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 115b\n"
"116:" // Height 5: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x12, x12, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q29, [x15, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"add x28, x28, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "sub x13, x13, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x15, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x15, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x15, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x15, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x15, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x15, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x15, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x15, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x15, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x15, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x15, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q28, [x15, #0x30]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x15, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x15, #0x50]\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x15, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x15, #0x70]\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x15, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x15, #0x90]\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x15, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x15, #0xb0]\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x15, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x15, #0xd0]\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x15, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x15, #0xf0]\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
"add x15, x15, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
"117:" // Height 5: Multiply loop: Main loop skip
"cbz x13, 122f\n"
"cmp x13, #0x4\n"
"blt 119f\n"
"118:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
+ "ldr s2, [x12], #0x4\n"
"sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
+ "ldr s1, [x11], #0x4\n"
"cmp x13, #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s31, [x9], #0x4\n"
+ "ldr s30, [x28], #0x4\n"
+ "ldr q29, [x15, #0x0]\n"
+ ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n"
+ "ldr q28, [x15, #0x10]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n"
+ "ldr q29, [x15, #0x20]\n"
+ ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n"
+ "ldr q28, [x15, #0x30]\n"
+ ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n"
+ ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n"
"bge 118b\n"
- "cbz x13, 122f\n"
"119:" // Height 5: Multiply loop: Skip odd blocks
+ "cbz x13, 122f\n"
"tbz x13, #1, 120f\n"
"ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
+ "ldr h1, [x11], #0x2\n"
+ "ldr h2, [x10], #0x2\n"
+ "ldr h3, [x9], #0x2\n"
+ "ldr h4, [x28], #0x2\n"
"tbz x13, #0, 121f\n"
"ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v1.b }[2], [x11]\n"
+ "ld1 { v2.b }[2], [x10]\n"
+ "ld1 { v3.b }[2], [x9]\n"
+ "ld1 { v4.b }[2], [x28]\n"
"b 121f\n"
"120:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
+ "ldr b1, [x11, #0x0]\n"
+ "ldr b2, [x10, #0x0]\n"
+ "ldr b3, [x9, #0x0]\n"
+ "ldr b4, [x28, #0x0]\n"
"121:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q29, [x15, #0x0]\n"
+ ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n"
+ "ldr q28, [x15, #0x10]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x15, #0x20]\n"
+ ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x15, #0x30]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
"122:" // Height 5: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "cmp x14, x20\n"
"bne 112b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "ldr q0, [x8, #0x0]\n"
- "ldr q1, [x8, #0x10]\n"
- "add x23, x16, x19\n"
+ "ldr q31, [x6, #0x0]\n"
+ "add v8.4s, v8.4s, v31.4s\n"
+ "ldr q30, [x6, #0x10]\n"
+ "add v9.4s, v9.4s, v30.4s\n"
+ "ldr q29, [x6, #0x20]\n"
+ "add v10.4s, v10.4s, v29.4s\n"
+ "ldr q28, [x6, #0x30]\n"
+ "add v11.4s, v11.4s, v28.4s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x17, x20\n"
+ "add x24, x25, x20\n"
+ "add x23, x24, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v12.4s, v12.4s, v31.4s\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add v13.4s, v13.4s, v30.4s\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
+ "add v14.4s, v14.4s, v29.4s\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19\n"
- "prfm pstl1keep, [x20, #0x0]\n"
+ "add v15.4s, v15.4s, v28.4s\n"
+ "add v16.4s, v16.4s, v31.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v31.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x6, x6, #0x40\n"
+ "tbz %x[flags], #4, 123f\n"
+ "ldr q0, [x8, #0x0]\n"
+ "ldr q4, [x7, #0x0]\n"
+ "ldr q1, [x8, #0x10]\n"
+ "ldr q5, [x7, #0x10]\n"
"ldr q2, [x8, #0x20]\n"
+ "ldr q6, [x7, #0x20]\n"
"ldr q3, [x8, #0x30]\n"
"add x8, x8, #0x40\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "tbz %x[flags], #4, 123f\n"
- "ldr q0, [x7, #0x0]\n"
- "ldr q4, [x6, #0x0]\n"
- "ldr q1, [x7, #0x10]\n"
- "ldr q5, [x6, #0x10]\n"
- "ldr q2, [x7, #0x20]\n"
- "ldr q6, [x6, #0x20]\n"
- "ldr q3, [x7, #0x30]\n"
+ "ldr q7, [x7, #0x30]\n"
"add x7, x7, #0x40\n"
- "ldr q7, [x6, #0x30]\n"
- "add x6, x6, #0x40\n"
"b 124f\n"
"123:" // Height 5: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x25]\n"
- "ld1r { v4.4s }, [x24]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
- "mov v2.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
"mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
@@ -2568,66 +2567,66 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"sqrdmulh v26.4s, v26.4s, v6.4s\n"
"sqrdmulh v27.4s, v27.4s, v7.4s\n"
"tbz %x[flags], #5, 125f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v6.16b, v22.16b, v2.16b\n"
- "and v7.16b, v23.16b, v3.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v4.16b, v24.16b, v0.16b\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "and v5.16b, v25.16b, v1.16b\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
- "and v6.16b, v26.16b, v2.16b\n"
- "sqadd v23.4s, v23.4s, v7.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v27.16b, v3.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v4.4s\n"
- "sqadd v25.4s, v25.4s, v5.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v6.4s\n"
- "sqadd v27.4s, v27.4s, v7.4s\n"
+ "and v31.16b, v8.16b, v0.16b\n"
+ "and v30.16b, v9.16b, v1.16b\n"
+ "and v29.16b, v10.16b, v2.16b\n"
+ "and v28.16b, v11.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v31.4s\n"
+ "sqadd v9.4s, v9.4s, v30.4s\n"
+ "sqadd v10.4s, v10.4s, v29.4s\n"
+ "sqadd v11.4s, v11.4s, v28.4s\n"
+ "and v31.16b, v12.16b, v0.16b\n"
+ "and v30.16b, v13.16b, v1.16b\n"
+ "and v29.16b, v14.16b, v2.16b\n"
+ "and v28.16b, v15.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v31.4s\n"
+ "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sqadd v14.4s, v14.4s, v29.4s\n"
+ "sqadd v15.4s, v15.4s, v28.4s\n"
+ "and v31.16b, v16.16b, v0.16b\n"
+ "and v30.16b, v17.16b, v1.16b\n"
+ "and v29.16b, v18.16b, v2.16b\n"
+ "and v28.16b, v19.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v31.4s\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v29.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "and v31.16b, v20.16b, v0.16b\n"
+ "and v30.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v22.16b, v2.16b\n"
+ "and v28.16b, v23.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v31.4s\n"
+ "sqadd v21.4s, v21.4s, v30.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "sqadd v23.4s, v23.4s, v28.4s\n"
+ "and v31.16b, v24.16b, v0.16b\n"
+ "and v30.16b, v25.16b, v1.16b\n"
+ "and v29.16b, v26.16b, v2.16b\n"
+ "and v28.16b, v27.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "sqadd v27.4s, v27.4s, v28.4s\n"
"125:" // Height 5: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
@@ -2649,215 +2648,215 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v25.4s, v25.4s, v1.4s\n"
"srshl v26.4s, v26.4s, v2.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "cmp x17, #0x10\n"
- "ld1r { v6.4s }, [x24]\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v28.4s\n"
+ "add v9.4s, v9.4s, v28.4s\n"
+ "add v10.4s, v10.4s, v28.4s\n"
+ "add v11.4s, v11.4s, v28.4s\n"
+ "add v12.4s, v12.4s, v28.4s\n"
+ "add v13.4s, v13.4s, v28.4s\n"
+ "add v14.4s, v14.4s, v28.4s\n"
+ "add v15.4s, v15.4s, v28.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v28.4s\n"
+ "add v18.4s, v18.4s, v28.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v28.4s\n"
+ "add v22.4s, v22.4s, v28.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v28.4s\n"
+ "add v25.4s, v25.4s, v28.4s\n"
+ "add v26.4s, v26.4s, v28.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v28.4s\n"
+ "smin v9.4s, v9.4s, v28.4s\n"
+ "smin v10.4s, v10.4s, v28.4s\n"
+ "smin v11.4s, v11.4s, v28.4s\n"
+ "smin v12.4s, v12.4s, v28.4s\n"
+ "smin v13.4s, v13.4s, v28.4s\n"
+ "smin v14.4s, v14.4s, v28.4s\n"
+ "smin v15.4s, v15.4s, v28.4s\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v23.4s, v23.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smax v8.4s, v8.4s, v28.4s\n"
+ "smax v9.4s, v9.4s, v28.4s\n"
+ "smax v10.4s, v10.4s, v28.4s\n"
+ "smax v11.4s, v11.4s, v28.4s\n"
+ "smax v12.4s, v12.4s, v28.4s\n"
+ "smax v13.4s, v13.4s, v28.4s\n"
+ "smax v14.4s, v14.4s, v28.4s\n"
+ "smax v15.4s, v15.4s, v28.4s\n"
+ "smax v16.4s, v16.4s, v28.4s\n"
+ "smax v17.4s, v17.4s, v28.4s\n"
+ "smax v18.4s, v18.4s, v28.4s\n"
+ "smax v19.4s, v19.4s, v28.4s\n"
+ "smax v20.4s, v20.4s, v28.4s\n"
+ "smax v21.4s, v21.4s, v28.4s\n"
+ "smax v22.4s, v22.4s, v28.4s\n"
+ "smax v23.4s, v23.4s, v28.4s\n"
+ "smax v24.4s, v24.4s, v28.4s\n"
+ "smax v25.4s, v25.4s, v28.4s\n"
+ "smax v26.4s, v26.4s, v28.4s\n"
+ "smax v27.4s, v27.4s, v28.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v29.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v28.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "cmp x16, #0x10\n"
+ "uzp1 v8.16b, v8.16b, v29.16b\n"
+ "uzp1 v12.16b, v12.16b, v28.16b\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 134f\n"
- "tbz x17, #3, 129f\n"
- "str d8, [x16], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "tbz x17, #2, 127f\n"
- "st1 { v8.s }[2], [x16], #0x4\n"
- "st1 { v12.s }[2], [x23], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
- "tbz x17, #1, 126f\n"
- "st1 { v8.h }[6], [x16], #0x2\n"
- "st1 { v12.h }[6], [x23], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[14], [x16]\n"
- "st1 { v12.b }[14], [x23]\n"
- "st1 { v16.b }[14], [x22]\n"
- "st1 { v20.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
+ "tbz x16, #3, 129f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x16, #2, 127f\n"
+ "st1 { v8.s }[2], [x17], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x16, #1, 126f\n"
+ "st1 { v8.h }[6], [x17], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[14], [x17]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 133f\n"
"126:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[12], [x16]\n"
- "st1 { v12.b }[12], [x23]\n"
- "st1 { v16.b }[12], [x22]\n"
- "st1 { v20.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[12], [x17]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 133f\n"
"127:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x17, #1, 128f\n"
- "st1 { v8.h }[4], [x16], #0x2\n"
- "st1 { v12.h }[4], [x23], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[10], [x16]\n"
- "st1 { v12.b }[10], [x23]\n"
- "st1 { v16.b }[10], [x22]\n"
- "st1 { v20.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
+ "tbz x16, #1, 128f\n"
+ "st1 { v8.h }[4], [x17], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[10], [x17]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 133f\n"
"128:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[8], [x16]\n"
- "st1 { v12.b }[8], [x23]\n"
- "st1 { v16.b }[8], [x22]\n"
- "st1 { v20.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[8], [x17]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 133f\n"
"129:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x17, #2, 131f\n"
- "str s8, [x16], #0x4\n"
- "str s12, [x23], #0x4\n"
- "str s16, [x22], #0x4\n"
- "str s20, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
- "tbz x17, #1, 130f\n"
- "st1 { v8.h }[2], [x16], #0x2\n"
- "st1 { v12.h }[2], [x23], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[6], [x16]\n"
- "st1 { v12.b }[6], [x23]\n"
- "st1 { v16.b }[6], [x22]\n"
- "st1 { v20.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
+ "tbz x16, #2, 131f\n"
+ "str s8, [x17], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x16, #1, 130f\n"
+ "st1 { v8.h }[2], [x17], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[6], [x17]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 133f\n"
"130:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[4], [x16]\n"
- "st1 { v12.b }[4], [x23]\n"
- "st1 { v16.b }[4], [x22]\n"
- "st1 { v20.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[4], [x17]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 133f\n"
"131:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x17, #1, 132f\n"
- "str h8, [x16], #0x2\n"
- "str h12, [x23], #0x2\n"
- "str h16, [x22], #0x2\n"
- "str h20, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
- "tbz x17, #0, 133f\n"
- "st1 { v8.b }[2], [x16]\n"
- "st1 { v12.b }[2], [x23]\n"
- "st1 { v16.b }[2], [x22]\n"
- "st1 { v20.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
+ "tbz x16, #1, 132f\n"
+ "str h8, [x17], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "tbz x16, #0, 133f\n"
+ "st1 { v8.b }[2], [x17]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 133f\n"
"132:" // Height 5: Partial direct writeback: partial_1_0
- "str b8, [x16, #0x0]\n"
- "str b12, [x23, #0x0]\n"
- "str b16, [x22, #0x0]\n"
- "str b20, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
+ "str b8, [x17, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"133:" // Height 5: Partial direct writeback: Done
"b 135f\n"
"134:" // Height 5: Full writeback
- "str q8, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "str q12, [x23, #0x0]\n"
- "str q16, [x22, #0x0]\n"
- "str q20, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
+ "str q8, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"135:" // Height 5: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x16, x16, #0x10\n"
"bgt 110b\n"
"b 164f\n"
"136:" // Height 6
- "ldr x6, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x8, %x[col_bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x16, %x[output_ptr]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x6\n"
+ "mov x6, %x[col_bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
"137:" // Height 6: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -2887,257 +2886,257 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"mov x14, #0x0\n"
"139:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
"ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 140f\n"
"ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
+ "add x20, x20, x21, LSL #3\n"
"ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
+ "ldr x11, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x9, [x20, #0x18]\n"
+ "ldr x28, [x20, #0x20]\n"
+ "ldr x27, [x20, #0x28]\n"
"cbnz x14, 141f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
- "add x22, x22, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
+ "add x27, x27, x20\n"
"b 141f\n"
"140:" // Height 6: setup direct input
"mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "add x20, x22, x19\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
+ "add x28, x9, x21\n"
+ "add x27, x28, x21\n"
"141:" // Height 6: input setup done
"cmp x13, #0x10\n"
"blt 144f\n"
"ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
"cmp x13, #0x20\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q1, [x11, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x9, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x27, #0x0]\n"
"ldr q6, [x15, #0x0]\n"
+ "ldr q7, [x15, #0x10]\n"
"blt 143f\n"
"142:" // Height 6: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x18]\n"
+ "ldr x20, [x15, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr x10, [x15, #0x28]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
"add x12, x12, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "mov v7.d[1], x11\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"ldr d6, [x15, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x11, [x15, #0x38]\n"
+ "ldr x21, [x15, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr x9, [x12, #0x8]\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr x10, [x15, #0x48]\n"
+ "add x27, x27, #0x10\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
"ldr d7, [x15, #0x30]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr x20, [x15, #0x58]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "mov v7.d[1], x11\n"
+ "ldr x26, [x12, #0x8]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x11, [x15, #0x58]\n"
+ "ldr x25, [x11, #0x8]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr x27, [x28, #0x8]\n"
+ "ldr x24, [x10, #0x8]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
"ldr d6, [x15, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr x21, [x15, #0x68]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "ldr x23, [x9, #0x8]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x10, [x15, #0x68]\n"
+ "sub x13, x13, #0x10\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr x25, [x26, #0x8]\n"
+ "cmp x13, #0x20\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
"ldr d7, [x15, #0x50]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "add x24, x24, #0x10\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr x20, [x15, #0x78]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr x11, [x15, #0x78]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr x23, [x24, #0x8]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
"ldr d6, [x15, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "add x22, x22, #0x10\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr x21, [x15, #0x88]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr x10, [x15, #0x88]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr x21, [x22, #0x8]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
"ldr d7, [x15, #0x70]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "add x20, x20, #0x10\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr x20, [x15, #0x98]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr x11, [x15, #0x98]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "sub x13, x13, #0x10\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
"ldr d6, [x15, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "cmp x13, #0x20\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr x21, [x15, #0xa8]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr x10, [x15, #0xa8]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
"ldr d7, [x15, #0x90]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xb8]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr x11, [x15, #0xb8]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
"ldr d6, [x15, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xc8]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr x10, [x15, #0xc8]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
"ldr d7, [x15, #0xb0]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr x20, [x15, #0xd8]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr x11, [x15, #0xd8]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
"ldr d6, [x15, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr x21, [x15, #0xe8]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr x10, [x15, #0xe8]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
"ldr d7, [x15, #0xd0]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr x20, [x15, #0xf8]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr x11, [x15, #0xf8]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
"ldr d6, [x15, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr x22, [x28, #0x8]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr x19, [x20, #0x8]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
"ldr d7, [x15, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "mov v7.d[1], x20\n"
"add x15, x15, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr x20, [x15, #0x8]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- "ldr x10, [x15, #0x8]\n"
".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
"ldr d6, [x15, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
"ldr d0, [x12, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x28, #0x0]\n"
+ "ldr d1, [x11, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
+ "ldr d2, [x10, #0x0]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "mov v0.d[1], x9\n"
+ "ldr d3, [x9, #0x0]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "mov v1.d[1], x27\n"
+ "ldr d4, [x28, #0x0]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "ldr d4, [x22, #0x0]\n"
- "mov v2.d[1], x25\n"
- "ldr d5, [x20, #0x0]\n"
+ "ldr d5, [x27, #0x0]\n"
+ "ldr d7, [x15, #0x10]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x21, [x27, #0x8]\n"
+ "mov v0.d[1], x26\n"
+ "ldr x20, [x15, #0x18]\n"
+ "mov v1.d[1], x25\n"
+ "mov v2.d[1], x24\n"
"mov v3.d[1], x23\n"
- "mov v4.d[1], x21\n"
- "mov v5.d[1], x19\n"
+ "mov v4.d[1], x22\n"
+ "mov v5.d[1], x21\n"
+ "mov v7.d[1], x20\n"
"bge 142b\n"
"143:" // Height 6: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x12, x12, #0x10\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
"add x28, x28, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x27, x27, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "sub x13, x13, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
"ldr q7, [x15, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "add x20, x20, #0x10\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
"ldr q6, [x15, #0x40]\n"
@@ -3236,164 +3235,164 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"cmp x13, #0x4\n"
"blt 146f\n"
"145:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
+ "ldr s7, [x12], #0x4\n"
"sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
+ "ldr s6, [x11], #0x4\n"
"cmp x13, #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr s5, [x20], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s5, [x10], #0x4\n"
+ "ldr s4, [x9], #0x4\n"
+ "ldr s3, [x28], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
+ "ldr q1, [x15, #0x0]\n"
+ ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n"
+ "ldr q1, [x15, #0x20]\n"
+ ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n"
+ "ldr q0, [x15, #0x30]\n"
+ ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n"
+ ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n"
"bge 145b\n"
- "cbz x13, 149f\n"
"146:" // Height 6: Multiply loop: Skip odd blocks
+ "cbz x13, 149f\n"
"tbz x13, #1, 147f\n"
"ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "ldr h5, [x20], #0x2\n"
+ "ldr h1, [x11], #0x2\n"
+ "ldr h2, [x10], #0x2\n"
+ "ldr h3, [x9], #0x2\n"
+ "ldr h4, [x28], #0x2\n"
+ "ldr h5, [x27], #0x2\n"
"tbz x13, #0, 148f\n"
"ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x22]\n"
- "ld1 { v5.b }[2], [x20]\n"
+ "ld1 { v1.b }[2], [x11]\n"
+ "ld1 { v2.b }[2], [x10]\n"
+ "ld1 { v3.b }[2], [x9]\n"
+ "ld1 { v4.b }[2], [x28]\n"
+ "ld1 { v5.b }[2], [x27]\n"
"b 148f\n"
"147:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
"ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
- "ldr b5, [x20, #0x0]\n"
+ "ldr b1, [x11, #0x0]\n"
+ "ldr b2, [x10, #0x0]\n"
+ "ldr b3, [x9, #0x0]\n"
+ "ldr b4, [x28, #0x0]\n"
+ "ldr b5, [x27, #0x0]\n"
"148:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
- "ldr q7, [x15, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x15, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x15, #0x0]\n"
+ ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x15, #0x10]\n"
+ ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x15, #0x20]\n"
+ ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x15, #0x30]\n"
+ ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n"
"add x15, x15, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n"
"149:" // Height 6: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "cmp x14, x20\n"
"bne 139b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x16, #0x0]\n"
- "ldr q0, [x8, #0x0]\n"
- "ldr q1, [x8, #0x10]\n"
- "add x23, x16, x19\n"
+ "ldr q3, [x6, #0x0]\n"
+ "add v8.4s, v8.4s, v3.4s\n"
+ "ldr q2, [x6, #0x10]\n"
+ "add v9.4s, v9.4s, v2.4s\n"
+ "ldr q1, [x6, #0x20]\n"
+ "add v10.4s, v10.4s, v1.4s\n"
+ "ldr q0, [x6, #0x30]\n"
+ "add v11.4s, v11.4s, v0.4s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x17, x20\n"
+ "add x24, x25, x20\n"
+ "add x23, x24, x20\n"
+ "add x22, x23, x20\n"
+ "add x21, x22, x20\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "add v12.4s, v12.4s, v3.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v13.4s, v13.4s, v2.4s\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add v14.4s, v14.4s, v1.4s\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v28.4s, v28.4s, v0.4s\n"
- "add v29.4s, v29.4s, v1.4s\n"
+ "add v15.4s, v15.4s, v0.4s\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19\n"
+ "add v16.4s, v16.4s, v3.4s\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19\n"
- "prfm pstl1keep, [x20, #0x0]\n"
- "add x19, x20, x19\n"
- "prfm pstl1keep, [x19, #0x0]\n"
+ "add v17.4s, v17.4s, v2.4s\n"
+ "add v18.4s, v18.4s, v1.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v2.4s\n"
+ "add v26.4s, v26.4s, v1.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v2.4s\n"
+ "add v30.4s, v30.4s, v1.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x6, x6, #0x40\n"
+ "tbz %x[flags], #4, 150f\n"
+ "ldr q0, [x8, #0x0]\n"
+ "ldr q4, [x7, #0x0]\n"
+ "ldr q1, [x8, #0x10]\n"
+ "ldr q5, [x7, #0x10]\n"
"ldr q2, [x8, #0x20]\n"
+ "ldr q6, [x7, #0x20]\n"
"ldr q3, [x8, #0x30]\n"
"add x8, x8, #0x40\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
- "tbz %x[flags], #4, 150f\n"
- "ldr q0, [x7, #0x0]\n"
- "ldr q4, [x6, #0x0]\n"
- "ldr q1, [x7, #0x10]\n"
- "ldr q5, [x6, #0x10]\n"
- "ldr q2, [x7, #0x20]\n"
- "ldr q6, [x6, #0x20]\n"
- "ldr q3, [x7, #0x30]\n"
+ "ldr q7, [x7, #0x30]\n"
"add x7, x7, #0x40\n"
- "ldr q7, [x6, #0x30]\n"
- "add x6, x6, #0x40\n"
"b 151f\n"
"150:" // Height 6: per layer parameters
- "add x25, %x[qp], %[per_layer_right_shift]\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1r { v0.4s }, [x25]\n"
- "ld1r { v4.4s }, [x24]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
- "mov v2.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
"mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
@@ -3423,78 +3422,78 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"sqrdmulh v30.4s, v30.4s, v6.4s\n"
"sqrdmulh v31.4s, v31.4s, v7.4s\n"
"tbz %x[flags], #5, 152f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v8.16b, v0.16b\n"
+ "and v6.16b, v9.16b, v1.16b\n"
+ "and v5.16b, v10.16b, v2.16b\n"
+ "and v4.16b, v11.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "and v6.16b, v22.16b, v2.16b\n"
- "and v7.16b, v23.16b, v3.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v7.4s\n"
+ "sqadd v9.4s, v9.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v5.4s\n"
+ "sqadd v11.4s, v11.4s, v4.4s\n"
+ "and v7.16b, v12.16b, v0.16b\n"
+ "and v6.16b, v13.16b, v1.16b\n"
+ "and v5.16b, v14.16b, v2.16b\n"
+ "and v4.16b, v15.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "and v4.16b, v24.16b, v0.16b\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "and v5.16b, v25.16b, v1.16b\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
- "and v6.16b, v26.16b, v2.16b\n"
- "sqadd v23.4s, v23.4s, v7.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v27.16b, v3.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v4.4s\n"
- "and v4.16b, v28.16b, v0.16b\n"
- "sqadd v25.4s, v25.4s, v5.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v7.4s\n"
+ "sqadd v13.4s, v13.4s, v6.4s\n"
+ "sqadd v14.4s, v14.4s, v5.4s\n"
+ "sqadd v15.4s, v15.4s, v4.4s\n"
+ "and v7.16b, v16.16b, v0.16b\n"
+ "and v6.16b, v17.16b, v1.16b\n"
+ "and v5.16b, v18.16b, v2.16b\n"
+ "and v4.16b, v19.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "and v5.16b, v29.16b, v1.16b\n"
- "sqadd v26.4s, v26.4s, v6.4s\n"
- "and v6.16b, v30.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v27.4s, v27.4s, v7.4s\n"
- "and v7.16b, v31.16b, v3.16b\n"
+ "sqadd v16.4s, v16.4s, v7.4s\n"
+ "sqadd v17.4s, v17.4s, v6.4s\n"
+ "sqadd v18.4s, v18.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "and v7.16b, v20.16b, v0.16b\n"
+ "and v6.16b, v21.16b, v1.16b\n"
+ "and v5.16b, v22.16b, v2.16b\n"
+ "and v4.16b, v23.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v7.4s\n"
+ "sqadd v21.4s, v21.4s, v6.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v7.16b, v24.16b, v0.16b\n"
+ "and v6.16b, v25.16b, v1.16b\n"
+ "and v5.16b, v26.16b, v2.16b\n"
+ "and v4.16b, v27.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v4.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v7.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "and v7.16b, v28.16b, v0.16b\n"
+ "and v6.16b, v29.16b, v1.16b\n"
+ "and v5.16b, v30.16b, v2.16b\n"
+ "and v4.16b, v31.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v29.4s, v29.4s, v5.4s\n"
- "sqadd v30.4s, v30.4s, v6.4s\n"
- "sqadd v31.4s, v31.4s, v7.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v7.4s\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "sqadd v30.4s, v30.4s, v5.4s\n"
+ "sqadd v31.4s, v31.4s, v4.4s\n"
"152:" // Height 6: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
"srshl v9.4s, v9.4s, v1.4s\n"
@@ -3520,251 +3519,250 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"srshl v29.4s, v29.4s, v1.4s\n"
"srshl v30.4s, v30.4s, v2.4s\n"
"srshl v31.4s, v31.4s, v3.4s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "add x25, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x25]\n"
- "cmp x17, #0x10\n"
- "ld1r { v6.4s }, [x24]\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v0.4s\n"
+ "add v10.4s, v10.4s, v0.4s\n"
+ "add v11.4s, v11.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "add v13.4s, v13.4s, v0.4s\n"
+ "add v14.4s, v14.4s, v0.4s\n"
+ "add v15.4s, v15.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v0.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v0.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v0.4s\n"
+ "add v26.4s, v26.4s, v0.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v0.4s\n"
+ "add v30.4s, v30.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smin v8.4s, v8.4s, v0.4s\n"
+ "smin v9.4s, v9.4s, v0.4s\n"
+ "smin v10.4s, v10.4s, v0.4s\n"
+ "smin v11.4s, v11.4s, v0.4s\n"
+ "smin v12.4s, v12.4s, v0.4s\n"
+ "smin v13.4s, v13.4s, v0.4s\n"
+ "smin v14.4s, v14.4s, v0.4s\n"
+ "smin v15.4s, v15.4s, v0.4s\n"
+ "smin v16.4s, v16.4s, v0.4s\n"
+ "smin v17.4s, v17.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v0.4s\n"
+ "smin v19.4s, v19.4s, v0.4s\n"
+ "smin v20.4s, v20.4s, v0.4s\n"
+ "smin v21.4s, v21.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v0.4s\n"
+ "smin v23.4s, v23.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v0.4s\n"
+ "smin v25.4s, v25.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v0.4s\n"
+ "smin v27.4s, v27.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v0.4s\n"
+ "smin v29.4s, v29.4s, v0.4s\n"
+ "smin v30.4s, v30.4s, v0.4s\n"
+ "smin v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smax v8.4s, v8.4s, v0.4s\n"
+ "smax v9.4s, v9.4s, v0.4s\n"
+ "smax v10.4s, v10.4s, v0.4s\n"
+ "smax v11.4s, v11.4s, v0.4s\n"
+ "smax v12.4s, v12.4s, v0.4s\n"
+ "smax v13.4s, v13.4s, v0.4s\n"
+ "smax v14.4s, v14.4s, v0.4s\n"
+ "smax v15.4s, v15.4s, v0.4s\n"
+ "smax v16.4s, v16.4s, v0.4s\n"
+ "smax v17.4s, v17.4s, v0.4s\n"
+ "smax v18.4s, v18.4s, v0.4s\n"
+ "smax v19.4s, v19.4s, v0.4s\n"
+ "smax v20.4s, v20.4s, v0.4s\n"
+ "smax v21.4s, v21.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v0.4s\n"
+ "smax v23.4s, v23.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v0.4s\n"
+ "smax v25.4s, v25.4s, v0.4s\n"
+ "smax v26.4s, v26.4s, v0.4s\n"
+ "smax v27.4s, v27.4s, v0.4s\n"
+ "smax v28.4s, v28.4s, v0.4s\n"
+ "smax v29.4s, v29.4s, v0.4s\n"
+ "smax v30.4s, v30.4s, v0.4s\n"
+ "smax v31.4s, v31.4s, v0.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v2.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v1.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
+ "uzp1 v0.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v19.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
- "uzp1 v29.8h, v30.8h, v31.8h\n"
- "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "uzp1 v17.8h, v30.8h, v31.8h\n"
+ "cmp x16, #0x10\n"
+ "uzp1 v8.16b, v8.16b, v2.16b\n"
+ "uzp1 v12.16b, v12.16b, v1.16b\n"
+ "uzp1 v16.16b, v16.16b, v0.16b\n"
+ "uzp1 v20.16b, v20.16b, v19.16b\n"
+ "uzp1 v24.16b, v24.16b, v18.16b\n"
+ "uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 161f\n"
- "tbz x17, #3, 156f\n"
- "str d8, [x16], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "str d28, [x19], #0x8\n"
- "tbz x17, #2, 154f\n"
- "st1 { v8.s }[2], [x16], #0x4\n"
- "st1 { v12.s }[2], [x23], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
- "st1 { v28.s }[2], [x19], #0x4\n"
- "tbz x17, #1, 153f\n"
- "st1 { v8.h }[6], [x16], #0x2\n"
- "st1 { v12.h }[6], [x23], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
- "st1 { v28.h }[6], [x19], #0x2\n"
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[14], [x16]\n"
- "st1 { v12.b }[14], [x23]\n"
- "st1 { v16.b }[14], [x22]\n"
- "st1 { v20.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
- "st1 { v28.b }[14], [x19]\n"
+ "tbz x16, #3, 156f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x16, #2, 154f\n"
+ "st1 { v8.s }[2], [x17], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x16, #1, 153f\n"
+ "st1 { v8.h }[6], [x17], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[14], [x17]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 160f\n"
"153:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[12], [x16]\n"
- "st1 { v12.b }[12], [x23]\n"
- "st1 { v16.b }[12], [x22]\n"
- "st1 { v20.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
- "st1 { v28.b }[12], [x19]\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[12], [x17]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 160f\n"
"154:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x17, #1, 155f\n"
- "st1 { v8.h }[4], [x16], #0x2\n"
- "st1 { v12.h }[4], [x23], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
- "st1 { v28.h }[4], [x19], #0x2\n"
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[10], [x16]\n"
- "st1 { v12.b }[10], [x23]\n"
- "st1 { v16.b }[10], [x22]\n"
- "st1 { v20.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
- "st1 { v28.b }[10], [x19]\n"
+ "tbz x16, #1, 155f\n"
+ "st1 { v8.h }[4], [x17], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[10], [x17]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 160f\n"
"155:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[8], [x16]\n"
- "st1 { v12.b }[8], [x23]\n"
- "st1 { v16.b }[8], [x22]\n"
- "st1 { v20.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
- "st1 { v28.b }[8], [x19]\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[8], [x17]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 160f\n"
"156:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x17, #2, 158f\n"
- "str s8, [x16], #0x4\n"
- "str s12, [x23], #0x4\n"
- "str s16, [x22], #0x4\n"
- "str s20, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
- "str s28, [x19], #0x4\n"
- "tbz x17, #1, 157f\n"
- "st1 { v8.h }[2], [x16], #0x2\n"
- "st1 { v12.h }[2], [x23], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
- "st1 { v28.h }[2], [x19], #0x2\n"
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[6], [x16]\n"
- "st1 { v12.b }[6], [x23]\n"
- "st1 { v16.b }[6], [x22]\n"
- "st1 { v20.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
- "st1 { v28.b }[6], [x19]\n"
+ "tbz x16, #2, 158f\n"
+ "str s8, [x17], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x16, #1, 157f\n"
+ "st1 { v8.h }[2], [x17], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[6], [x17]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 160f\n"
"157:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[4], [x16]\n"
- "st1 { v12.b }[4], [x23]\n"
- "st1 { v16.b }[4], [x22]\n"
- "st1 { v20.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
- "st1 { v28.b }[4], [x19]\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[4], [x17]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 160f\n"
"158:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x17, #1, 159f\n"
- "str h8, [x16], #0x2\n"
- "str h12, [x23], #0x2\n"
- "str h16, [x22], #0x2\n"
- "str h20, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
- "str h28, [x19], #0x2\n"
- "tbz x17, #0, 160f\n"
- "st1 { v8.b }[2], [x16]\n"
- "st1 { v12.b }[2], [x23]\n"
- "st1 { v16.b }[2], [x22]\n"
- "st1 { v20.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
- "st1 { v28.b }[2], [x19]\n"
+ "tbz x16, #1, 159f\n"
+ "str h8, [x17], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x16, #0, 160f\n"
+ "st1 { v8.b }[2], [x17]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 160f\n"
"159:" // Height 6: Partial direct writeback: partial_1_0
- "str b8, [x16, #0x0]\n"
- "str b12, [x23, #0x0]\n"
- "str b16, [x22, #0x0]\n"
- "str b20, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
- "str b28, [x19, #0x0]\n"
+ "str b8, [x17, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"160:" // Height 6: Partial direct writeback: Done
"b 162f\n"
"161:" // Height 6: Full writeback
- "str q8, [x16, #0x0]\n"
- "add x16, x16, #0x10\n"
- "str q12, [x23, #0x0]\n"
- "str q16, [x22, #0x0]\n"
- "str q20, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
- "str q28, [x19, #0x0]\n"
+ "str q8, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"162:" // Height 6: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x16, x16, #0x10\n"
"bgt 137b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 164f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 163f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"163:" // Update direct input
- "mov x19, #0x6\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"164:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
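Editor's note: a minimal sketch of the math behind the hunks above, added for readers of this diff. The rewrite of the a55 kernel's requantize tail renumbers scratch registers (the three qp loads for c_offset, maxval and minval now reuse a single address register, x20, and x19 is dropped from the clobber list) and regroups the sign-correction block so all four and/sshr pairs complete before their matching sqadds. The per-lane arithmetic is unchanged: sqrdmulh by the multiplier, an optional fixup (taken only when flags bit 5 is set) so the rounding right shift rounds ties away from zero, srshl by the right-shift operand, add the c_offset splat, clamp to [minval, maxval], then narrow with uzp1. The scalar model below is illustrative only; the names are hypothetical and are not arm_gemm's API.

    #include <algorithm>
    #include <cstdint>

    // Saturating rounding doubling multiply-high (NEON sqrdmulh).
    static int32_t sqrdmulh32(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN)
            return INT32_MAX; // the one saturating case
        return (int32_t)(((int64_t)a * b + (1LL << 30)) >> 31);
    }

    // One lane of the requantize tail. 'shift' is the srshl operand:
    // zero or negative, i.e. a rounding right shift by -shift.
    static int8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                                  int32_t c_offset, int32_t minval, int32_t maxval)
    {
        int32_t v = sqrdmulh32(acc, mul);
        if (shift != 0)
        {
            // The and/sshr/sqadd block: subtract 1 from negative values
            // (saturating, hence the INT32_MIN guard) so the rounding shift
            // matches round-to-nearest, ties away from zero.
            if (v < 0 && v > INT32_MIN)
                v--;
            int n = -shift;
            v = (int32_t)(((int64_t)v + (1LL << (n - 1))) >> n); // srshl
        }
        v += c_offset;                             // add the c_offset splat
        v = std::min(std::max(v, minval), maxval); // smin/smax clamp
        return (int8_t)v;                          // uzp1 narrowing before store
    }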
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
index 5a4df161aa..f3942328a6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
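Editor's note: the generic-variant hunks below are a register renumbering of the same kernel (for example, B_ptr moves from x28 to x9, col_bias from x11 to x14, the output pointer from x9 to x11, and the string-loop counters shift up by one so x19 is freed), plus a re-pairing of the weight loads into v16/v17 scratch registers. The sdot work itself is unchanged; as a reminder, each indexed "sdot vD.4s, vW.16b, vA.4b[i]" accumulates, per 32-bit lane, the dot product of four int8 weights against one broadcast 4-byte group of the input. A hypothetical scalar model, illustrative only:

    #include <cstdint>

    // One "sdot vD.4s, vW.16b, vA.4b[i]" instruction:
    // d[l] += sum over k of w[4*l + k] * a[4*i + k], for l = 0..3, k = 0..3.
    static inline void sdot_by_element(int32_t d[4], const int8_t w[16],
                                       const int8_t a[16], int i)
    {
        for (int l = 0; l < 4; ++l)
            for (int k = 0; k < 4; ++k)
                d[l] += (int32_t)w[4 * l + k] * (int32_t)a[4 * i + k];
    }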
@@ -85,7 +85,6 @@ void a64_hybrid_s8qs_dot_6x16 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 136f\n"
@@ -95,168 +94,168 @@ void a64_hybrid_s8qs_dot_6x16 (
"cmp %x[M], #0x2\n"
"bgt 55f\n"
"beq 28f\n"
+ "mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x11, %x[col_bias]\n"
+ "mov x11, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[output_ptr]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
"movi v11.4s, #0x0\n"
"3:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 6f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
"b 6f\n"
"5:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x26, %x[input_ptr]\n"
"6:" // Height 1: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 9f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q6, [x28, #0x0]\n"
- "cmp x26, #0x20\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q7, [x9, #0x10]\n"
"blt 8f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "add x25, x25, #0x10\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "sub x26, x26, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "cmp x26, #0x20\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x40]\n"
- "ldr q7, [x28, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x28, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x28, #0x70]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x28, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x28, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x28, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x28, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x28, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x9, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "add x9, x9, #0x100\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 7b\n"
"8:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x26, x26, #0x10\n"
+ "ldr q17, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "ldr q6, [x28, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x28, #0x60]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x28, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x28, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x28, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x28, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x28, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x9, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x9, x9, #0x100\n"
"9:" // Height 1: Multiply loop: Main loop skip
- "cbz x26, 14f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 14f\n"
+ "cmp x27, #0x4\n"
"blt 11f\n"
"10:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "cmp x26, #0x4\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "ldr q7, [x28, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr q16, [x9, #0x0]\n"
+ ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ "ldr q16, [x9, #0x10]\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n"
+ "cmp x27, #0x4\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n"
+ "add x9, x9, #0x40\n"
"bge 10b\n"
- "cbz x26, 14f\n"
"11:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x26, #1, 12f\n"
- "ldr h0, [x25], #0x2\n"
- "tbz x26, #0, 13f\n"
- "ld1 { v0.b }[2], [x25]\n"
+ "cbz x27, 14f\n"
+ "tbz x27, #1, 12f\n"
+ "ldr h0, [x26], #0x2\n"
+ "tbz x27, #0, 13f\n"
+ "ld1 { v0.b }[2], [x26]\n"
"b 13f\n"
"12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
"13:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "ldr q6, [x28, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
+ ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q17, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "add x9, x9, #0x40\n"
"14:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 4b\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "ldr q0, [x11, #0x0]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "ldr q1, [x11, #0x10]\n"
- "ldr q2, [x11, #0x20]\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q3, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "ldr q17, [x14, #0x0]\n"
+ "ldr q16, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v17.4s\n"
+ "add v9.4s, v9.4s, v16.4s\n"
+ "ldr q17, [x14, #0x20]\n"
+ "ldr q16, [x14, #0x30]\n"
+ "add v10.4s, v10.4s, v17.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "add x14, x14, #0x40\n"
"tbz %x[flags], #4, 15f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -265,20 +264,20 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr q2, [x12, #0x20]\n"
"ldr q6, [x13, #0x20]\n"
"ldr q3, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"ldr q7, [x13, #0x30]\n"
+ "add x12, x12, #0x40\n"
"add x13, x13, #0x40\n"
"b 16f\n"
"15:" // Height 1: per layer parameters
- "add x24, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x24]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x24]\n"
- "mov v2.16b, v0.16b\n"
- "mov v3.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
"16:" // Height 1: parameters loaded
"sqrdmulh v8.4s, v8.4s, v4.4s\n"
@@ -286,105 +285,105 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v10.4s, v10.4s, v6.4s\n"
"sqrdmulh v11.4s, v11.4s, v7.4s\n"
"tbz %x[flags], #5, 17f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v19.16b, v8.16b, v0.16b\n"
+ "and v18.16b, v9.16b, v1.16b\n"
+ "and v17.16b, v10.16b, v2.16b\n"
+ "and v16.16b, v11.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v19.4s\n"
+ "sqadd v9.4s, v9.4s, v18.4s\n"
+ "sqadd v10.4s, v10.4s, v17.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
"17:" // Height 1: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x24]\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x24, %x[qp], %[minval]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
- "ld1r { v5.4s }, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "ld1r { v6.4s }, [x24]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add v8.4s, v8.4s, v18.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v10.4s, v10.4s, v18.4s\n"
+ "add v11.4s, v11.4s, v18.4s\n"
"cmp x10, #0x10\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v8.4s, v8.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v16.8h, v10.8h, v11.8h\n"
+ "uzp1 v8.16b, v8.16b, v16.16b\n"
"bge 26f\n"
"tbz x10, #3, 21f\n"
- "str d8, [x9], #0x8\n"
+ "str d8, [x11], #0x8\n"
"tbz x10, #2, 19f\n"
- "st1 { v8.s }[2], [x9], #0x4\n"
+ "st1 { v8.s }[2], [x11], #0x4\n"
"tbz x10, #1, 18f\n"
- "st1 { v8.h }[6], [x9], #0x2\n"
+ "st1 { v8.h }[6], [x11], #0x2\n"
"tbz x10, #0, 25f\n"
- "st1 { v8.b }[14], [x9]\n"
+ "st1 { v8.b }[14], [x11]\n"
"b 25f\n"
"18:" // Height 1: Partial direct writeback: partial_1_12
"tbz x10, #0, 25f\n"
- "st1 { v8.b }[12], [x9]\n"
+ "st1 { v8.b }[12], [x11]\n"
"b 25f\n"
"19:" // Height 1: Partial direct writeback: partial_2_8
"tbz x10, #1, 20f\n"
- "st1 { v8.h }[4], [x9], #0x2\n"
+ "st1 { v8.h }[4], [x11], #0x2\n"
"tbz x10, #0, 25f\n"
- "st1 { v8.b }[10], [x9]\n"
+ "st1 { v8.b }[10], [x11]\n"
"b 25f\n"
"20:" // Height 1: Partial direct writeback: partial_1_8
"tbz x10, #0, 25f\n"
- "st1 { v8.b }[8], [x9]\n"
+ "st1 { v8.b }[8], [x11]\n"
"b 25f\n"
"21:" // Height 1: Partial direct writeback: partial_4_0
"tbz x10, #2, 23f\n"
- "str s8, [x9], #0x4\n"
+ "str s8, [x11], #0x4\n"
"tbz x10, #1, 22f\n"
- "st1 { v8.h }[2], [x9], #0x2\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
"tbz x10, #0, 25f\n"
- "st1 { v8.b }[6], [x9]\n"
+ "st1 { v8.b }[6], [x11]\n"
"b 25f\n"
"22:" // Height 1: Partial direct writeback: partial_1_4
"tbz x10, #0, 25f\n"
- "st1 { v8.b }[4], [x9]\n"
+ "st1 { v8.b }[4], [x11]\n"
"b 25f\n"
"23:" // Height 1: Partial direct writeback: partial_2_0
"tbz x10, #1, 24f\n"
- "str h8, [x9], #0x2\n"
+ "str h8, [x11], #0x2\n"
"tbz x10, #0, 25f\n"
- "st1 { v8.b }[2], [x9]\n"
+ "st1 { v8.b }[2], [x11]\n"
"b 25f\n"
"24:" // Height 1: Partial direct writeback: partial_1_0
- "str b8, [x9, #0x0]\n"
+ "str b8, [x11, #0x0]\n"
"25:" // Height 1: Partial direct writeback: Done
"b 27f\n"
"26:" // Height 1: Full writeback
- "str q8, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
+ "str q8, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
"27:" // Height 1: Writeback done
"subs x10, x10, #0x10\n"
"bgt 2b\n"
"b 164f\n"
"28:" // Height 2
+ "mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x11, %x[col_bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x9, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"29:" // Height 2: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -395,216 +394,216 @@ void a64_hybrid_s8qs_dot_6x16 (
"movi v14.4s, #0x0\n"
"movi v15.4s, #0x0\n"
"30:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"31:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 32f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x27, 33f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 33f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
"b 33f\n"
"32:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
"33:" // Height 2: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 36f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q6, [x28, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"blt 35f\n"
"34:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x24, x24, #0x10\n"
+ "ldr q17, [x9, #0x20]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "cmp x26, #0x20\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x28, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x28, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x28, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x28, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x28, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x28, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x28, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x9, #0x40]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x9, #0x50]\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x9, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x9, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"bge 34b\n"
"35:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x25, x25, #0x10\n"
+ "ldr q17, [x9, #0x20]\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x28, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x28, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x28, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x28, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x28, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x28, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x28, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x9, #0x40]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x9, #0x50]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x9, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x9, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
"36:" // Height 2: Multiply loop: Main loop skip
- "cbz x26, 41f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 41f\n"
+ "cmp x27, #0x4\n"
"blt 38f\n"
"37:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
+ ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n"
+ ".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n"
+ ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n"
"bge 37b\n"
- "cbz x26, 41f\n"
"38:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x26, #1, 39f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "tbz x26, #0, 40f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
+ "cbz x27, 41f\n"
+ "tbz x27, #1, 39f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "tbz x27, #0, 40f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
"b 40f\n"
"39:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
"40:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
+ ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
"41:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 31b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x23, x9, x19\n"
- "ldr q0, [x11, #0x0]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "ldr q1, [x11, #0x10]\n"
- "ldr q2, [x11, #0x20]\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q3, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
+ "ldr q19, [x14, #0x0]\n"
+ "ldr q18, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "ldr q17, [x14, #0x20]\n"
+ "ldr q16, [x14, #0x30]\n"
+ "add v10.4s, v10.4s, v17.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x11, x20\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "add v12.4s, v12.4s, v19.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add v13.4s, v13.4s, v18.4s\n"
+ "add v14.4s, v14.4s, v17.4s\n"
+ "add x14, x14, #0x40\n"
+ "add v15.4s, v15.4s, v16.4s\n"
"tbz %x[flags], #4, 42f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -613,20 +612,20 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr q2, [x12, #0x20]\n"
"ldr q6, [x13, #0x20]\n"
"ldr q3, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"ldr q7, [x13, #0x30]\n"
+ "add x12, x12, #0x40\n"
"add x13, x13, #0x40\n"
"b 43f\n"
"42:" // Height 2: per layer parameters
- "add x24, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x24]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x24]\n"
- "mov v2.16b, v0.16b\n"
- "mov v3.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
"43:" // Height 2: parameters loaded
"sqrdmulh v8.4s, v8.4s, v4.4s\n"
@@ -638,152 +637,152 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v14.4s, v14.4s, v6.4s\n"
"sqrdmulh v15.4s, v15.4s, v7.4s\n"
"tbz %x[flags], #5, 44f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v19.16b, v8.16b, v0.16b\n"
+ "and v18.16b, v9.16b, v1.16b\n"
+ "and v17.16b, v10.16b, v2.16b\n"
+ "and v16.16b, v11.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v19.4s\n"
+ "sqadd v9.4s, v9.4s, v18.4s\n"
+ "sqadd v10.4s, v10.4s, v17.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
+ "and v19.16b, v12.16b, v0.16b\n"
+ "and v18.16b, v13.16b, v1.16b\n"
+ "and v17.16b, v14.16b, v2.16b\n"
+ "and v16.16b, v15.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v19.4s\n"
+ "sqadd v13.4s, v13.4s, v18.4s\n"
+ "sqadd v14.4s, v14.4s, v17.4s\n"
+ "sqadd v15.4s, v15.4s, v16.4s\n"
"44:" // Height 2: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x24]\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x24, %x[qp], %[minval]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
- "ld1r { v5.4s }, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "ld1r { v6.4s }, [x24]\n"
- "cmp x10, #0x10\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v17.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
+ "cmp x10, #0x10\n"
+ "add v8.4s, v8.4s, v18.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "add v10.4s, v10.4s, v18.4s\n"
+ "add v11.4s, v11.4s, v18.4s\n"
+ "add v12.4s, v12.4s, v18.4s\n"
+ "add v13.4s, v13.4s, v18.4s\n"
+ "add v14.4s, v14.4s, v18.4s\n"
+ "add v15.4s, v15.4s, v18.4s\n"
+ "smin v8.4s, v8.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
+ "uzp1 v17.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v16.8h, v14.8h, v15.8h\n"
+ "uzp1 v8.16b, v8.16b, v17.16b\n"
+ "uzp1 v12.16b, v12.16b, v16.16b\n"
"bge 53f\n"
"tbz x10, #3, 48f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x23], #0x8\n"
+ "str d8, [x11], #0x8\n"
+ "str d12, [x25], #0x8\n"
"tbz x10, #2, 46f\n"
- "st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x23], #0x4\n"
+ "st1 { v8.s }[2], [x11], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
"tbz x10, #1, 45f\n"
- "st1 { v8.h }[6], [x9], #0x2\n"
- "st1 { v12.h }[6], [x23], #0x2\n"
+ "st1 { v8.h }[6], [x11], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
"tbz x10, #0, 52f\n"
- "st1 { v8.b }[14], [x9]\n"
- "st1 { v12.b }[14], [x23]\n"
+ "st1 { v8.b }[14], [x11]\n"
+ "st1 { v12.b }[14], [x25]\n"
"b 52f\n"
"45:" // Height 2: Partial direct writeback: partial_1_12
"tbz x10, #0, 52f\n"
- "st1 { v8.b }[12], [x9]\n"
- "st1 { v12.b }[12], [x23]\n"
+ "st1 { v8.b }[12], [x11]\n"
+ "st1 { v12.b }[12], [x25]\n"
"b 52f\n"
"46:" // Height 2: Partial direct writeback: partial_2_8
"tbz x10, #1, 47f\n"
- "st1 { v8.h }[4], [x9], #0x2\n"
- "st1 { v12.h }[4], [x23], #0x2\n"
+ "st1 { v8.h }[4], [x11], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
"tbz x10, #0, 52f\n"
- "st1 { v8.b }[10], [x9]\n"
- "st1 { v12.b }[10], [x23]\n"
+ "st1 { v8.b }[10], [x11]\n"
+ "st1 { v12.b }[10], [x25]\n"
"b 52f\n"
"47:" // Height 2: Partial direct writeback: partial_1_8
"tbz x10, #0, 52f\n"
- "st1 { v8.b }[8], [x9]\n"
- "st1 { v12.b }[8], [x23]\n"
+ "st1 { v8.b }[8], [x11]\n"
+ "st1 { v12.b }[8], [x25]\n"
"b 52f\n"
"48:" // Height 2: Partial direct writeback: partial_4_0
"tbz x10, #2, 50f\n"
- "str s8, [x9], #0x4\n"
- "str s12, [x23], #0x4\n"
+ "str s8, [x11], #0x4\n"
+ "str s12, [x25], #0x4\n"
"tbz x10, #1, 49f\n"
- "st1 { v8.h }[2], [x9], #0x2\n"
- "st1 { v12.h }[2], [x23], #0x2\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
"tbz x10, #0, 52f\n"
- "st1 { v8.b }[6], [x9]\n"
- "st1 { v12.b }[6], [x23]\n"
+ "st1 { v8.b }[6], [x11]\n"
+ "st1 { v12.b }[6], [x25]\n"
"b 52f\n"
"49:" // Height 2: Partial direct writeback: partial_1_4
"tbz x10, #0, 52f\n"
- "st1 { v8.b }[4], [x9]\n"
- "st1 { v12.b }[4], [x23]\n"
+ "st1 { v8.b }[4], [x11]\n"
+ "st1 { v12.b }[4], [x25]\n"
"b 52f\n"
"50:" // Height 2: Partial direct writeback: partial_2_0
"tbz x10, #1, 51f\n"
- "str h8, [x9], #0x2\n"
- "str h12, [x23], #0x2\n"
+ "str h8, [x11], #0x2\n"
+ "str h12, [x25], #0x2\n"
"tbz x10, #0, 52f\n"
- "st1 { v8.b }[2], [x9]\n"
- "st1 { v12.b }[2], [x23]\n"
+ "st1 { v8.b }[2], [x11]\n"
+ "st1 { v12.b }[2], [x25]\n"
"b 52f\n"
"51:" // Height 2: Partial direct writeback: partial_1_0
- "str b8, [x9, #0x0]\n"
- "str b12, [x23, #0x0]\n"
+ "str b8, [x11, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
"52:" // Height 2: Partial direct writeback: Done
"b 54f\n"
"53:" // Height 2: Full writeback
- "str q8, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q12, [x23, #0x0]\n"
+ "str q8, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q12, [x25, #0x0]\n"
"54:" // Height 2: Writeback done
"subs x10, x10, #0x10\n"
"bgt 29b\n"
"b 164f\n"
"55:" // Height 3
+ "mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x11, %x[col_bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x9, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"56:" // Height 3: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -798,275 +797,275 @@ void a64_hybrid_s8qs_dot_6x16 (
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
"57:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"58:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "cbnz x27, 60f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 60f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
"b 60f\n"
"59:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"60:" // Height 3: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 63f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q6, [x28, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"blt 62f\n"
"61:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "ldr q21, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "cmp x26, #0x20\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x28, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x28, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x28, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x28, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x28, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x28, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr q2, [x23, #0x0]\n"
+ "ldr q20, [x9, #0x30]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x9, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x9, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x9, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x9, #0x70]\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x9, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x9, #0x90]\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x9, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x9, #0xb0]\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x9, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x9, #0xd0]\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x9, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"bge 61b\n"
"62:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
+ "ldr q21, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "ldr q7, [x28, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x28, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x28, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x28, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x28, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x28, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x28, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr q20, [x9, #0x30]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x9, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x9, #0x50]\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x9, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x9, #0x70]\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x9, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x9, #0x90]\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x9, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x9, #0xb0]\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x9, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x9, #0xd0]\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x9, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
"63:" // Height 3: Multiply loop: Main loop skip
- "cbz x26, 68f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 68f\n"
+ "cmp x27, #0x4\n"
"blt 65f\n"
"64:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr q21, [x9, #0x0]\n"
+ ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n"
+ ".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n"
+ "ldr q20, [x9, #0x10]\n"
+ ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n"
+ "ldr q21, [x9, #0x20]\n"
+ ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n"
+ "ldr q20, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n"
+ ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n"
+ ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n"
"bge 64b\n"
- "cbz x26, 68f\n"
"65:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x26, #1, 66f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "tbz x26, #0, 67f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
+ "cbz x27, 68f\n"
+ "tbz x27, #1, 66f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "tbz x27, #0, 67f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
"b 67f\n"
"66:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
"67:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q21, [x9, #0x0]\n"
+ "ldr q20, [x9, #0x10]\n"
+ ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x9, #0x20]\n"
+ ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
"68:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 58b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x23, x9, x19\n"
- "ldr q0, [x11, #0x0]\n"
- "add v8.4s, v8.4s, v0.4s\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "ldr q1, [x11, #0x10]\n"
- "ldr q2, [x11, #0x20]\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q3, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "ldr q23, [x14, #0x0]\n"
+ "ldr q22, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v23.4s\n"
+ "add v9.4s, v9.4s, v22.4s\n"
+ "ldr q21, [x14, #0x20]\n"
+ "ldr q20, [x14, #0x30]\n"
+ "add v10.4s, v10.4s, v21.4s\n"
+ "add v11.4s, v11.4s, v20.4s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x11, x20\n"
+ "add x24, x25, x20\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add v12.4s, v12.4s, v23.4s\n"
+ "add v13.4s, v13.4s, v22.4s\n"
+ "add v14.4s, v14.4s, v21.4s\n"
+ "add v15.4s, v15.4s, v20.4s\n"
+ "add x14, x14, #0x40\n"
+ "add v16.4s, v16.4s, v23.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
"tbz %x[flags], #4, 69f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -1075,20 +1074,20 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr q2, [x12, #0x20]\n"
"ldr q6, [x13, #0x20]\n"
"ldr q3, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"ldr q7, [x13, #0x30]\n"
+ "add x12, x12, #0x40\n"
"add x13, x13, #0x40\n"
"b 70f\n"
"69:" // Height 3: per layer parameters
- "add x24, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x24]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x24]\n"
- "mov v2.16b, v0.16b\n"
- "mov v3.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
"70:" // Height 3: parameters loaded
"sqrdmulh v8.4s, v8.4s, v4.4s\n"
@@ -1104,199 +1103,199 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v18.4s, v18.4s, v6.4s\n"
"sqrdmulh v19.4s, v19.4s, v7.4s\n"
"tbz %x[flags], #5, 71f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v23.16b, v8.16b, v0.16b\n"
+ "and v22.16b, v9.16b, v1.16b\n"
+ "and v21.16b, v10.16b, v2.16b\n"
+ "and v20.16b, v11.16b, v3.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sqadd v9.4s, v9.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sqadd v11.4s, v11.4s, v20.4s\n"
+ "and v23.16b, v12.16b, v0.16b\n"
+ "and v22.16b, v13.16b, v1.16b\n"
+ "and v21.16b, v14.16b, v2.16b\n"
+ "and v20.16b, v15.16b, v3.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v23.4s\n"
+ "sqadd v13.4s, v13.4s, v22.4s\n"
+ "sqadd v14.4s, v14.4s, v21.4s\n"
+ "sqadd v15.4s, v15.4s, v20.4s\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v1.16b\n"
+ "and v21.16b, v18.16b, v2.16b\n"
+ "and v20.16b, v19.16b, v3.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
"71:" // Height 3: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v22.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x24]\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x24, %x[qp], %[minval]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
- "ld1r { v5.4s }, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "ld1r { v6.4s }, [x24]\n"
- "cmp x10, #0x10\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v21.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
+ "cmp x10, #0x10\n"
"srshl v16.4s, v16.4s, v0.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
"srshl v17.4s, v17.4s, v1.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
+ "add v8.4s, v8.4s, v22.4s\n"
+ "add v9.4s, v9.4s, v22.4s\n"
+ "add v10.4s, v10.4s, v22.4s\n"
+ "add v11.4s, v11.4s, v22.4s\n"
+ "add v12.4s, v12.4s, v22.4s\n"
+ "add v13.4s, v13.4s, v22.4s\n"
+ "add v14.4s, v14.4s, v22.4s\n"
+ "add v15.4s, v15.4s, v22.4s\n"
+ "add v16.4s, v16.4s, v22.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v22.4s\n"
+ "add v19.4s, v19.4s, v22.4s\n"
+ "smin v8.4s, v8.4s, v21.4s\n"
+ "smin v9.4s, v9.4s, v21.4s\n"
+ "smin v10.4s, v10.4s, v21.4s\n"
+ "smin v11.4s, v11.4s, v21.4s\n"
+ "smin v12.4s, v12.4s, v21.4s\n"
+ "smin v13.4s, v13.4s, v21.4s\n"
+ "smin v14.4s, v14.4s, v21.4s\n"
+ "smin v15.4s, v15.4s, v21.4s\n"
+ "smin v16.4s, v16.4s, v21.4s\n"
+ "smin v17.4s, v17.4s, v21.4s\n"
+ "smin v18.4s, v18.4s, v21.4s\n"
+ "smin v19.4s, v19.4s, v21.4s\n"
+ "smax v8.4s, v8.4s, v20.4s\n"
+ "smax v9.4s, v9.4s, v20.4s\n"
+ "smax v10.4s, v10.4s, v20.4s\n"
+ "smax v11.4s, v11.4s, v20.4s\n"
+ "smax v12.4s, v12.4s, v20.4s\n"
+ "smax v13.4s, v13.4s, v20.4s\n"
+ "smax v14.4s, v14.4s, v20.4s\n"
+ "smax v15.4s, v15.4s, v20.4s\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v21.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v20.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v8.16b, v8.16b, v21.16b\n"
+ "uzp1 v12.16b, v12.16b, v20.16b\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 80f\n"
"tbz x10, #3, 75f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
+ "str d8, [x11], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
"tbz x10, #2, 73f\n"
- "st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x23], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
+ "st1 { v8.s }[2], [x11], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
"tbz x10, #1, 72f\n"
- "st1 { v8.h }[6], [x9], #0x2\n"
- "st1 { v12.h }[6], [x23], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
+ "st1 { v8.h }[6], [x11], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
"tbz x10, #0, 79f\n"
- "st1 { v8.b }[14], [x9]\n"
- "st1 { v12.b }[14], [x23]\n"
- "st1 { v16.b }[14], [x22]\n"
+ "st1 { v8.b }[14], [x11]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
"b 79f\n"
"72:" // Height 3: Partial direct writeback: partial_1_12
"tbz x10, #0, 79f\n"
- "st1 { v8.b }[12], [x9]\n"
- "st1 { v12.b }[12], [x23]\n"
- "st1 { v16.b }[12], [x22]\n"
+ "st1 { v8.b }[12], [x11]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
"b 79f\n"
"73:" // Height 3: Partial direct writeback: partial_2_8
"tbz x10, #1, 74f\n"
- "st1 { v8.h }[4], [x9], #0x2\n"
- "st1 { v12.h }[4], [x23], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
+ "st1 { v8.h }[4], [x11], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
"tbz x10, #0, 79f\n"
- "st1 { v8.b }[10], [x9]\n"
- "st1 { v12.b }[10], [x23]\n"
- "st1 { v16.b }[10], [x22]\n"
+ "st1 { v8.b }[10], [x11]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
"b 79f\n"
"74:" // Height 3: Partial direct writeback: partial_1_8
"tbz x10, #0, 79f\n"
- "st1 { v8.b }[8], [x9]\n"
- "st1 { v12.b }[8], [x23]\n"
- "st1 { v16.b }[8], [x22]\n"
+ "st1 { v8.b }[8], [x11]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
"b 79f\n"
"75:" // Height 3: Partial direct writeback: partial_4_0
"tbz x10, #2, 77f\n"
- "str s8, [x9], #0x4\n"
- "str s12, [x23], #0x4\n"
- "str s16, [x22], #0x4\n"
+ "str s8, [x11], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
"tbz x10, #1, 76f\n"
- "st1 { v8.h }[2], [x9], #0x2\n"
- "st1 { v12.h }[2], [x23], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
"tbz x10, #0, 79f\n"
- "st1 { v8.b }[6], [x9]\n"
- "st1 { v12.b }[6], [x23]\n"
- "st1 { v16.b }[6], [x22]\n"
+ "st1 { v8.b }[6], [x11]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
"b 79f\n"
"76:" // Height 3: Partial direct writeback: partial_1_4
"tbz x10, #0, 79f\n"
- "st1 { v8.b }[4], [x9]\n"
- "st1 { v12.b }[4], [x23]\n"
- "st1 { v16.b }[4], [x22]\n"
+ "st1 { v8.b }[4], [x11]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
"b 79f\n"
"77:" // Height 3: Partial direct writeback: partial_2_0
"tbz x10, #1, 78f\n"
- "str h8, [x9], #0x2\n"
- "str h12, [x23], #0x2\n"
- "str h16, [x22], #0x2\n"
+ "str h8, [x11], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
"tbz x10, #0, 79f\n"
- "st1 { v8.b }[2], [x9]\n"
- "st1 { v12.b }[2], [x23]\n"
- "st1 { v16.b }[2], [x22]\n"
+ "st1 { v8.b }[2], [x11]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
"b 79f\n"
"78:" // Height 3: Partial direct writeback: partial_1_0
- "str b8, [x9, #0x0]\n"
- "str b12, [x23, #0x0]\n"
- "str b16, [x22, #0x0]\n"
+ "str b8, [x11, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
"79:" // Height 3: Partial direct writeback: Done
"b 81f\n"
"80:" // Height 3: Full writeback
- "str q8, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q12, [x23, #0x0]\n"
- "str q16, [x22, #0x0]\n"
+ "str q8, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
"81:" // Height 3: Writeback done
"subs x10, x10, #0x10\n"
"bgt 56b\n"
"b 164f\n"
"82:" // Height 4
+ "mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x11, %x[col_bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x9, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"83:" // Height 4: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -1315,334 +1314,334 @@ void a64_hybrid_s8qs_dot_6x16 (
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
"84:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"85:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 86f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "cbnz x27, 87f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 87f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
"b 87f\n"
"86:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"87:" // Height 4: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 90f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q6, [x28, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"blt 89f\n"
"88:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x22, x22, #0x10\n"
+ "ldr q25, [x9, #0x20]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x26, #0x20\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x28, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x28, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x28, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x28, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x28, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x28, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x28, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x28, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr q2, [x23, #0x0]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr q3, [x22, #0x0]\n"
+ "ldr q24, [x9, #0x30]\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x9, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x9, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x9, #0x90]\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x9, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x9, #0xb0]\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x9, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x9, #0xd0]\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x9, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"bge 88b\n"
"89:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
+ "ldr q25, [x9, #0x20]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x22, x22, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x28, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x28, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x28, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x28, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x28, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x28, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x28, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x28, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "ldr q24, [x9, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x9, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x9, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x9, #0x90]\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x9, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x9, #0xb0]\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x9, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x9, #0xd0]\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x9, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
"90:" // Height 4: Multiply loop: Main loop skip
- "cbz x26, 95f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 95f\n"
+ "cmp x27, #0x4\n"
"blt 92f\n"
"91:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x10]\n"
+ ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n"
+ "ldr q24, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n"
+ ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n"
"bge 91b\n"
- "cbz x26, 95f\n"
"92:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x26, #1, 93f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "tbz x26, #0, 94f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
- "ld1 { v3.b }[2], [x22]\n"
+ "cbz x27, 95f\n"
+ "tbz x27, #1, 93f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "tbz x27, #0, 94f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
"b 94f\n"
"93:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
- "ldr b3, [x22, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
"94:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x10]\n"
+ ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
"95:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 85b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x23, x9, x19\n"
- "ldr q0, [x11, #0x0]\n"
- "add v8.4s, v8.4s, v0.4s\n"
+ "ldr q27, [x14, #0x0]\n"
+ "ldr q26, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v27.4s\n"
+ "add v9.4s, v9.4s, v26.4s\n"
+ "ldr q25, [x14, #0x20]\n"
+ "ldr q24, [x14, #0x30]\n"
+ "add v10.4s, v10.4s, v25.4s\n"
+ "add v11.4s, v11.4s, v24.4s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x11, x20\n"
+ "add x24, x25, x20\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "add x23, x24, x20\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add v12.4s, v12.4s, v27.4s\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19\n"
- "add v12.4s, v12.4s, v0.4s\n"
- "prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "ldr q1, [x11, #0x10]\n"
- "ldr q2, [x11, #0x20]\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q3, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
+ "add v13.4s, v13.4s, v26.4s\n"
+ "add v14.4s, v14.4s, v25.4s\n"
+ "add x14, x14, #0x40\n"
+ "add v15.4s, v15.4s, v24.4s\n"
+ "add v16.4s, v16.4s, v27.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
"tbz %x[flags], #4, 96f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -1651,20 +1650,20 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr q2, [x12, #0x20]\n"
"ldr q6, [x13, #0x20]\n"
"ldr q3, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"ldr q7, [x13, #0x30]\n"
+ "add x12, x12, #0x40\n"
"add x13, x13, #0x40\n"
"b 97f\n"
"96:" // Height 4: per layer parameters
- "add x24, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x24]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x24]\n"
- "mov v2.16b, v0.16b\n"
- "mov v3.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
"97:" // Height 4: parameters loaded
"sqrdmulh v8.4s, v8.4s, v4.4s\n"
@@ -1684,246 +1683,246 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v22.4s, v22.4s, v6.4s\n"
"sqrdmulh v23.4s, v23.4s, v7.4s\n"
"tbz %x[flags], #5, 98f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "and v6.16b, v22.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "and v7.16b, v23.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
- "sqadd v23.4s, v23.4s, v7.4s\n"
+ "and v27.16b, v8.16b, v0.16b\n"
+ "and v26.16b, v9.16b, v1.16b\n"
+ "and v25.16b, v10.16b, v2.16b\n"
+ "and v24.16b, v11.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v27.4s\n"
+ "sqadd v9.4s, v9.4s, v26.4s\n"
+ "sqadd v10.4s, v10.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v24.4s\n"
+ "and v27.16b, v12.16b, v0.16b\n"
+ "and v26.16b, v13.16b, v1.16b\n"
+ "and v25.16b, v14.16b, v2.16b\n"
+ "and v24.16b, v15.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v27.4s\n"
+ "sqadd v13.4s, v13.4s, v26.4s\n"
+ "sqadd v14.4s, v14.4s, v25.4s\n"
+ "sqadd v15.4s, v15.4s, v24.4s\n"
+ "and v27.16b, v16.16b, v0.16b\n"
+ "and v26.16b, v17.16b, v1.16b\n"
+ "and v25.16b, v18.16b, v2.16b\n"
+ "and v24.16b, v19.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sqadd v17.4s, v17.4s, v26.4s\n"
+ "sqadd v18.4s, v18.4s, v25.4s\n"
+ "sqadd v19.4s, v19.4s, v24.4s\n"
+ "and v27.16b, v20.16b, v0.16b\n"
+ "and v26.16b, v21.16b, v1.16b\n"
+ "and v25.16b, v22.16b, v2.16b\n"
+ "and v24.16b, v23.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sqadd v21.4s, v21.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"98:" // Height 4: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x24]\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x24, %x[qp], %[minval]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
- "ld1r { v5.4s }, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "ld1r { v6.4s }, [x24]\n"
- "cmp x10, #0x10\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
+ "cmp x10, #0x10\n"
"srshl v16.4s, v16.4s, v0.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
"srshl v17.4s, v17.4s, v1.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
"srshl v21.4s, v21.4s, v1.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
"srshl v22.4s, v22.4s, v2.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
"srshl v23.4s, v23.4s, v3.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
+ "add v8.4s, v8.4s, v26.4s\n"
+ "add v9.4s, v9.4s, v26.4s\n"
+ "add v10.4s, v10.4s, v26.4s\n"
+ "add v11.4s, v11.4s, v26.4s\n"
+ "add v12.4s, v12.4s, v26.4s\n"
+ "add v13.4s, v13.4s, v26.4s\n"
+ "add v14.4s, v14.4s, v26.4s\n"
+ "add v15.4s, v15.4s, v26.4s\n"
+ "add v16.4s, v16.4s, v26.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v26.4s\n"
+ "add v20.4s, v20.4s, v26.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v23.4s, v23.4s, v26.4s\n"
+ "smin v8.4s, v8.4s, v25.4s\n"
+ "smin v9.4s, v9.4s, v25.4s\n"
+ "smin v10.4s, v10.4s, v25.4s\n"
+ "smin v11.4s, v11.4s, v25.4s\n"
+ "smin v12.4s, v12.4s, v25.4s\n"
+ "smin v13.4s, v13.4s, v25.4s\n"
+ "smin v14.4s, v14.4s, v25.4s\n"
+ "smin v15.4s, v15.4s, v25.4s\n"
+ "smin v16.4s, v16.4s, v25.4s\n"
+ "smin v17.4s, v17.4s, v25.4s\n"
+ "smin v18.4s, v18.4s, v25.4s\n"
+ "smin v19.4s, v19.4s, v25.4s\n"
+ "smin v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v25.4s\n"
+ "smin v22.4s, v22.4s, v25.4s\n"
+ "smin v23.4s, v23.4s, v25.4s\n"
+ "smax v8.4s, v8.4s, v24.4s\n"
+ "smax v9.4s, v9.4s, v24.4s\n"
+ "smax v10.4s, v10.4s, v24.4s\n"
+ "smax v11.4s, v11.4s, v24.4s\n"
+ "smax v12.4s, v12.4s, v24.4s\n"
+ "smax v13.4s, v13.4s, v24.4s\n"
+ "smax v14.4s, v14.4s, v24.4s\n"
+ "smax v15.4s, v15.4s, v24.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
+ "uzp1 v25.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v24.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v17.8h, v22.8h, v23.8h\n"
+ "uzp1 v8.16b, v8.16b, v25.16b\n"
+ "uzp1 v12.16b, v12.16b, v24.16b\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 107f\n"
"tbz x10, #3, 102f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
+ "str d8, [x11], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x10, #2, 100f\n"
- "st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x23], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v8.s }[2], [x11], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
"tbz x10, #1, 99f\n"
- "st1 { v8.h }[6], [x9], #0x2\n"
- "st1 { v12.h }[6], [x23], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v8.h }[6], [x11], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
"tbz x10, #0, 106f\n"
- "st1 { v8.b }[14], [x9]\n"
- "st1 { v12.b }[14], [x23]\n"
- "st1 { v16.b }[14], [x22]\n"
- "st1 { v20.b }[14], [x21]\n"
+ "st1 { v8.b }[14], [x11]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 106f\n"
"99:" // Height 4: Partial direct writeback: partial_1_12
"tbz x10, #0, 106f\n"
- "st1 { v8.b }[12], [x9]\n"
- "st1 { v12.b }[12], [x23]\n"
- "st1 { v16.b }[12], [x22]\n"
- "st1 { v20.b }[12], [x21]\n"
+ "st1 { v8.b }[12], [x11]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 106f\n"
"100:" // Height 4: Partial direct writeback: partial_2_8
"tbz x10, #1, 101f\n"
- "st1 { v8.h }[4], [x9], #0x2\n"
- "st1 { v12.h }[4], [x23], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v8.h }[4], [x11], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
"tbz x10, #0, 106f\n"
- "st1 { v8.b }[10], [x9]\n"
- "st1 { v12.b }[10], [x23]\n"
- "st1 { v16.b }[10], [x22]\n"
- "st1 { v20.b }[10], [x21]\n"
+ "st1 { v8.b }[10], [x11]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 106f\n"
"101:" // Height 4: Partial direct writeback: partial_1_8
"tbz x10, #0, 106f\n"
- "st1 { v8.b }[8], [x9]\n"
- "st1 { v12.b }[8], [x23]\n"
- "st1 { v16.b }[8], [x22]\n"
- "st1 { v20.b }[8], [x21]\n"
+ "st1 { v8.b }[8], [x11]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 106f\n"
"102:" // Height 4: Partial direct writeback: partial_4_0
"tbz x10, #2, 104f\n"
- "str s8, [x9], #0x4\n"
- "str s12, [x23], #0x4\n"
- "str s16, [x22], #0x4\n"
- "str s20, [x21], #0x4\n"
+ "str s8, [x11], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
"tbz x10, #1, 103f\n"
- "st1 { v8.h }[2], [x9], #0x2\n"
- "st1 { v12.h }[2], [x23], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
"tbz x10, #0, 106f\n"
- "st1 { v8.b }[6], [x9]\n"
- "st1 { v12.b }[6], [x23]\n"
- "st1 { v16.b }[6], [x22]\n"
- "st1 { v20.b }[6], [x21]\n"
+ "st1 { v8.b }[6], [x11]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 106f\n"
"103:" // Height 4: Partial direct writeback: partial_1_4
"tbz x10, #0, 106f\n"
- "st1 { v8.b }[4], [x9]\n"
- "st1 { v12.b }[4], [x23]\n"
- "st1 { v16.b }[4], [x22]\n"
- "st1 { v20.b }[4], [x21]\n"
+ "st1 { v8.b }[4], [x11]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 106f\n"
"104:" // Height 4: Partial direct writeback: partial_2_0
"tbz x10, #1, 105f\n"
- "str h8, [x9], #0x2\n"
- "str h12, [x23], #0x2\n"
- "str h16, [x22], #0x2\n"
- "str h20, [x21], #0x2\n"
+ "str h8, [x11], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
"tbz x10, #0, 106f\n"
- "st1 { v8.b }[2], [x9]\n"
- "st1 { v12.b }[2], [x23]\n"
- "st1 { v16.b }[2], [x22]\n"
- "st1 { v20.b }[2], [x21]\n"
+ "st1 { v8.b }[2], [x11]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 106f\n"
"105:" // Height 4: Partial direct writeback: partial_1_0
- "str b8, [x9, #0x0]\n"
- "str b12, [x23, #0x0]\n"
- "str b16, [x22, #0x0]\n"
- "str b20, [x21, #0x0]\n"
+ "str b8, [x11, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"106:" // Height 4: Partial direct writeback: Done
"b 108f\n"
"107:" // Height 4: Full writeback
- "str q8, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q12, [x23, #0x0]\n"
- "str q16, [x22, #0x0]\n"
- "str q20, [x21, #0x0]\n"
+ "str q8, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
"108:" // Height 4: Writeback done
"subs x10, x10, #0x10\n"
"bgt 83b\n"
"b 164f\n"
"109:" // Height 5
+ "mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x11, %x[col_bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x9, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"110:" // Height 5: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -1946,393 +1945,393 @@ void a64_hybrid_s8qs_dot_6x16 (
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
"111:" // Height 5: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"112:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 113f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "cbnz x27, 114f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 114f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
"b 114f\n"
"113:" // Height 5: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"114:" // Height 5: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 117f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
- "ldr q6, [x28, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"blt 116f\n"
"115:" // Height 5: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x21, x21, #0x10\n"
+ "ldr q29, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x26, x26, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "cmp x26, #0x20\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "cmp x27, #0x20\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x28, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x28, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x28, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x28, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x28, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x28, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x28, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x28, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr q2, [x23, #0x0]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr q3, [x22, #0x0]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "ldr q4, [x21, #0x0]\n"
+ "ldr q28, [x9, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x9, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x9, #0x50]\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x9, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x9, #0x70]\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x9, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x9, #0x90]\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x9, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x9, #0xb0]\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x9, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x9, #0xd0]\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x9, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"bge 115b\n"
"116:" // Height 5: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "ldr q29, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x28, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x28, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x28, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x28, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x28, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x28, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x28, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x28, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x28, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x28, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x28, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "ldr q28, [x9, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x9, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x9, #0x50]\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x9, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x9, #0x70]\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x9, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x9, #0x90]\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x9, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x9, #0xb0]\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x9, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x9, #0xd0]\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x9, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
"117:" // Height 5: Multiply loop: Main loop skip
- "cbz x26, 122f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 122f\n"
+ "cmp x27, #0x4\n"
"blt 119f\n"
"118:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s4, [x21], #0x4\n"
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s1, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q29, [x9, #0x0]\n"
+ ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ "ldr q28, [x9, #0x10]\n"
+ ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n"
+ "ldr q29, [x9, #0x20]\n"
+ ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n"
+ "ldr q28, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n"
+ ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n"
"bge 118b\n"
- "cbz x26, 122f\n"
"119:" // Height 5: Multiply loop: Skip odd blocks
- "tbz x26, #1, 120f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h4, [x21], #0x2\n"
- "tbz x26, #0, 121f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
- "ld1 { v3.b }[2], [x22]\n"
- "ld1 { v4.b }[2], [x21]\n"
+ "cbz x27, 122f\n"
+ "tbz x27, #1, 120f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x27, #0, 121f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
"b 121f\n"
"120:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
- "ldr b3, [x22, #0x0]\n"
- "ldr b4, [x21, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
"121:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q29, [x9, #0x0]\n"
+ "ldr q28, [x9, #0x10]\n"
+ ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x9, #0x20]\n"
+ ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
"122:" // Height 5: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 112b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x23, x9, x19\n"
- "ldr q0, [x11, #0x0]\n"
- "add v8.4s, v8.4s, v0.4s\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q30, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v31.4s\n"
+ "add v9.4s, v9.4s, v30.4s\n"
+ "ldr q29, [x14, #0x20]\n"
+ "ldr q28, [x14, #0x30]\n"
+ "add v10.4s, v10.4s, v29.4s\n"
+ "add v11.4s, v11.4s, v28.4s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x11, x20\n"
+ "add x24, x25, x20\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "add x23, x24, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19\n"
- "add v12.4s, v12.4s, v0.4s\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "prfm pstl1keep, [x20, #0x0]\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "ldr q1, [x11, #0x10]\n"
- "ldr q2, [x11, #0x20]\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q3, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
+ "add v12.4s, v12.4s, v31.4s\n"
+ "add v13.4s, v13.4s, v30.4s\n"
+ "add v14.4s, v14.4s, v29.4s\n"
+ "add v15.4s, v15.4s, v28.4s\n"
+ "add x14, x14, #0x40\n"
+ "add v16.4s, v16.4s, v31.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v31.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
"tbz %x[flags], #4, 123f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -2341,20 +2340,20 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr q2, [x12, #0x20]\n"
"ldr q6, [x13, #0x20]\n"
"ldr q3, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"ldr q7, [x13, #0x30]\n"
+ "add x12, x12, #0x40\n"
"add x13, x13, #0x40\n"
"b 124f\n"
"123:" // Height 5: per layer parameters
- "add x24, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x24]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x24]\n"
- "mov v2.16b, v0.16b\n"
- "mov v3.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
"124:" // Height 5: parameters loaded
"sqrdmulh v8.4s, v8.4s, v4.4s\n"
@@ -2378,296 +2377,296 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v26.4s, v26.4s, v6.4s\n"
"sqrdmulh v27.4s, v27.4s, v7.4s\n"
"tbz %x[flags], #5, 125f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "and v6.16b, v22.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "and v7.16b, v23.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "and v4.16b, v24.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
- "and v5.16b, v25.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v7.4s\n"
- "and v6.16b, v26.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v4.4s\n"
- "and v7.16b, v27.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v5.4s\n"
- "sqadd v26.4s, v26.4s, v6.4s\n"
- "sqadd v27.4s, v27.4s, v7.4s\n"
+ "and v31.16b, v8.16b, v0.16b\n"
+ "and v30.16b, v9.16b, v1.16b\n"
+ "and v29.16b, v10.16b, v2.16b\n"
+ "and v28.16b, v11.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v31.4s\n"
+ "sqadd v9.4s, v9.4s, v30.4s\n"
+ "sqadd v10.4s, v10.4s, v29.4s\n"
+ "sqadd v11.4s, v11.4s, v28.4s\n"
+ "and v31.16b, v12.16b, v0.16b\n"
+ "and v30.16b, v13.16b, v1.16b\n"
+ "and v29.16b, v14.16b, v2.16b\n"
+ "and v28.16b, v15.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v31.4s\n"
+ "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sqadd v14.4s, v14.4s, v29.4s\n"
+ "sqadd v15.4s, v15.4s, v28.4s\n"
+ "and v31.16b, v16.16b, v0.16b\n"
+ "and v30.16b, v17.16b, v1.16b\n"
+ "and v29.16b, v18.16b, v2.16b\n"
+ "and v28.16b, v19.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v31.4s\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v29.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "and v31.16b, v20.16b, v0.16b\n"
+ "and v30.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v22.16b, v2.16b\n"
+ "and v28.16b, v23.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v31.4s\n"
+ "sqadd v21.4s, v21.4s, v30.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "sqadd v23.4s, v23.4s, v28.4s\n"
+ "and v31.16b, v24.16b, v0.16b\n"
+ "and v30.16b, v25.16b, v1.16b\n"
+ "and v29.16b, v26.16b, v2.16b\n"
+ "and v28.16b, v27.16b, v3.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "sqadd v27.4s, v27.4s, v28.4s\n"
"125:" // Height 5: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x24]\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x24, %x[qp], %[minval]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
- "ld1r { v5.4s }, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "ld1r { v6.4s }, [x24]\n"
- "cmp x10, #0x10\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v28.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
+ "cmp x10, #0x10\n"
"srshl v16.4s, v16.4s, v0.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
"srshl v17.4s, v17.4s, v1.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
"srshl v21.4s, v21.4s, v1.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
"srshl v22.4s, v22.4s, v2.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
"srshl v23.4s, v23.4s, v3.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
"srshl v24.4s, v24.4s, v0.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
"srshl v26.4s, v26.4s, v2.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
+ "add v8.4s, v8.4s, v30.4s\n"
+ "add v9.4s, v9.4s, v30.4s\n"
+ "add v10.4s, v10.4s, v30.4s\n"
+ "add v11.4s, v11.4s, v30.4s\n"
+ "add v12.4s, v12.4s, v30.4s\n"
+ "add v13.4s, v13.4s, v30.4s\n"
+ "add v14.4s, v14.4s, v30.4s\n"
+ "add v15.4s, v15.4s, v30.4s\n"
+ "add v16.4s, v16.4s, v30.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v30.4s\n"
+ "add v19.4s, v19.4s, v30.4s\n"
+ "add v20.4s, v20.4s, v30.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v30.4s\n"
+ "add v23.4s, v23.4s, v30.4s\n"
+ "add v24.4s, v24.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v30.4s\n"
+ "add v27.4s, v27.4s, v30.4s\n"
+ "smin v8.4s, v8.4s, v29.4s\n"
+ "smin v9.4s, v9.4s, v29.4s\n"
+ "smin v10.4s, v10.4s, v29.4s\n"
+ "smin v11.4s, v11.4s, v29.4s\n"
+ "smin v12.4s, v12.4s, v29.4s\n"
+ "smin v13.4s, v13.4s, v29.4s\n"
+ "smin v14.4s, v14.4s, v29.4s\n"
+ "smin v15.4s, v15.4s, v29.4s\n"
+ "smin v16.4s, v16.4s, v29.4s\n"
+ "smin v17.4s, v17.4s, v29.4s\n"
+ "smin v18.4s, v18.4s, v29.4s\n"
+ "smin v19.4s, v19.4s, v29.4s\n"
+ "smin v20.4s, v20.4s, v29.4s\n"
+ "smin v21.4s, v21.4s, v29.4s\n"
+ "smin v22.4s, v22.4s, v29.4s\n"
+ "smin v23.4s, v23.4s, v29.4s\n"
+ "smin v24.4s, v24.4s, v29.4s\n"
+ "smin v25.4s, v25.4s, v29.4s\n"
+ "smin v26.4s, v26.4s, v29.4s\n"
+ "smin v27.4s, v27.4s, v29.4s\n"
+ "smax v8.4s, v8.4s, v28.4s\n"
+ "smax v9.4s, v9.4s, v28.4s\n"
+ "smax v10.4s, v10.4s, v28.4s\n"
+ "smax v11.4s, v11.4s, v28.4s\n"
+ "smax v12.4s, v12.4s, v28.4s\n"
+ "smax v13.4s, v13.4s, v28.4s\n"
+ "smax v14.4s, v14.4s, v28.4s\n"
+ "smax v15.4s, v15.4s, v28.4s\n"
+ "smax v16.4s, v16.4s, v28.4s\n"
+ "smax v17.4s, v17.4s, v28.4s\n"
+ "smax v18.4s, v18.4s, v28.4s\n"
+ "smax v19.4s, v19.4s, v28.4s\n"
+ "smax v20.4s, v20.4s, v28.4s\n"
+ "smax v21.4s, v21.4s, v28.4s\n"
+ "smax v22.4s, v22.4s, v28.4s\n"
+ "smax v23.4s, v23.4s, v28.4s\n"
+ "smax v24.4s, v24.4s, v28.4s\n"
+ "smax v25.4s, v25.4s, v28.4s\n"
+ "smax v26.4s, v26.4s, v28.4s\n"
+ "smax v27.4s, v27.4s, v28.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v29.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v28.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v8.16b, v8.16b, v29.16b\n"
+ "uzp1 v12.16b, v12.16b, v28.16b\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 134f\n"
"tbz x10, #3, 129f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
+ "str d8, [x11], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x10, #2, 127f\n"
- "st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x23], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
+ "st1 { v8.s }[2], [x11], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x10, #1, 126f\n"
- "st1 { v8.h }[6], [x9], #0x2\n"
- "st1 { v12.h }[6], [x23], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
+ "st1 { v8.h }[6], [x11], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x10, #0, 133f\n"
- "st1 { v8.b }[14], [x9]\n"
- "st1 { v12.b }[14], [x23]\n"
- "st1 { v16.b }[14], [x22]\n"
- "st1 { v20.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
+ "st1 { v8.b }[14], [x11]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 133f\n"
"126:" // Height 5: Partial direct writeback: partial_1_12
"tbz x10, #0, 133f\n"
- "st1 { v8.b }[12], [x9]\n"
- "st1 { v12.b }[12], [x23]\n"
- "st1 { v16.b }[12], [x22]\n"
- "st1 { v20.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
+ "st1 { v8.b }[12], [x11]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 133f\n"
"127:" // Height 5: Partial direct writeback: partial_2_8
"tbz x10, #1, 128f\n"
- "st1 { v8.h }[4], [x9], #0x2\n"
- "st1 { v12.h }[4], [x23], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
+ "st1 { v8.h }[4], [x11], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x10, #0, 133f\n"
- "st1 { v8.b }[10], [x9]\n"
- "st1 { v12.b }[10], [x23]\n"
- "st1 { v16.b }[10], [x22]\n"
- "st1 { v20.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
+ "st1 { v8.b }[10], [x11]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 133f\n"
"128:" // Height 5: Partial direct writeback: partial_1_8
"tbz x10, #0, 133f\n"
- "st1 { v8.b }[8], [x9]\n"
- "st1 { v12.b }[8], [x23]\n"
- "st1 { v16.b }[8], [x22]\n"
- "st1 { v20.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
+ "st1 { v8.b }[8], [x11]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 133f\n"
"129:" // Height 5: Partial direct writeback: partial_4_0
"tbz x10, #2, 131f\n"
- "str s8, [x9], #0x4\n"
- "str s12, [x23], #0x4\n"
- "str s16, [x22], #0x4\n"
- "str s20, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
+ "str s8, [x11], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x10, #1, 130f\n"
- "st1 { v8.h }[2], [x9], #0x2\n"
- "st1 { v12.h }[2], [x23], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x10, #0, 133f\n"
- "st1 { v8.b }[6], [x9]\n"
- "st1 { v12.b }[6], [x23]\n"
- "st1 { v16.b }[6], [x22]\n"
- "st1 { v20.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
+ "st1 { v8.b }[6], [x11]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 133f\n"
"130:" // Height 5: Partial direct writeback: partial_1_4
"tbz x10, #0, 133f\n"
- "st1 { v8.b }[4], [x9]\n"
- "st1 { v12.b }[4], [x23]\n"
- "st1 { v16.b }[4], [x22]\n"
- "st1 { v20.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
+ "st1 { v8.b }[4], [x11]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 133f\n"
"131:" // Height 5: Partial direct writeback: partial_2_0
"tbz x10, #1, 132f\n"
- "str h8, [x9], #0x2\n"
- "str h12, [x23], #0x2\n"
- "str h16, [x22], #0x2\n"
- "str h20, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
+ "str h8, [x11], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x10, #0, 133f\n"
- "st1 { v8.b }[2], [x9]\n"
- "st1 { v12.b }[2], [x23]\n"
- "st1 { v16.b }[2], [x22]\n"
- "st1 { v20.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
+ "st1 { v8.b }[2], [x11]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 133f\n"
"132:" // Height 5: Partial direct writeback: partial_1_0
- "str b8, [x9, #0x0]\n"
- "str b12, [x23, #0x0]\n"
- "str b16, [x22, #0x0]\n"
- "str b20, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
+ "str b8, [x11, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"133:" // Height 5: Partial direct writeback: Done
"b 135f\n"
"134:" // Height 5: Full writeback
- "str q8, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q12, [x23, #0x0]\n"
- "str q16, [x22, #0x0]\n"
- "str q20, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
+ "str q8, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"135:" // Height 5: Writeback done
"subs x10, x10, #0x10\n"
"bgt 110b\n"
"b 164f\n"
"136:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x6\n"
+ "mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x11, %x[col_bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x9, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x20, #0x6\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"137:" // Height 6: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
@@ -2694,297 +2693,297 @@ void a64_hybrid_s8qs_dot_6x16 (
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
"138:" // Height 6: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"139:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 140f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x27, 141f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 141f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
"b 141f\n"
"140:" // Height 6: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"141:" // Height 6: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 144f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
- "ldr q5, [x20, #0x0]\n"
- "ldr q6, [x28, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"blt 143f\n"
"142:" // Height 6: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x21, x21, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x20, x20, #0x10\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "cmp x26, #0x20\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x28, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x28, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x28, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x28, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x28, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x28, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x28, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x28, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x28, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x28, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x28, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr q2, [x23, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr q3, [x22, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "ldr q4, [x21, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
"bge 142b\n"
"143:" // Height 6: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x21, x21, #0x10\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x20, x20, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x28, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x28, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x28, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x28, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x28, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x28, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x28, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x28, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x28, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x28, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x28, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x28, #0xf0]\n"
- "add x28, x28, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
@@ -2998,148 +2997,148 @@ void a64_hybrid_s8qs_dot_6x16 (
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
"144:" // Height 6: Multiply loop: Main loop skip
- "cbz x26, 149f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 149f\n"
+ "cmp x27, #0x4\n"
"blt 146f\n"
"145:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr s2, [x23], #0x4\n"
+ "ldr s7, [x26], #0x4\n"
+ "ldr s6, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"ldr s3, [x22], #0x4\n"
- "ldr s4, [x21], #0x4\n"
- "ldr s5, [x20], #0x4\n"
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr q1, [x9, #0x0]\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n"
+ ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n"
+ "ldr q1, [x9, #0x20]\n"
+ ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n"
+ "ldr q0, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n"
+ ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n"
+ ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n"
"bge 145b\n"
- "cbz x26, 149f\n"
"146:" // Height 6: Multiply loop: Skip odd blocks
- "tbz x26, #1, 147f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h4, [x21], #0x2\n"
- "ldr h5, [x20], #0x2\n"
- "tbz x26, #0, 148f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
- "ld1 { v3.b }[2], [x22]\n"
- "ld1 { v4.b }[2], [x21]\n"
- "ld1 { v5.b }[2], [x20]\n"
+ "cbz x27, 149f\n"
+ "tbz x27, #1, 147f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x21], #0x2\n"
+ "tbz x27, #0, 148f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x21]\n"
"b 148f\n"
"147:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
- "ldr b3, [x22, #0x0]\n"
- "ldr b4, [x21, #0x0]\n"
- "ldr b5, [x20, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x21, #0x0]\n"
"148:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n"
"149:" // Height 6: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 139b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "add x23, x9, x19\n"
- "ldr q0, [x11, #0x0]\n"
- "add v8.4s, v8.4s, v0.4s\n"
+ "ldr q3, [x14, #0x0]\n"
+ "ldr q2, [x14, #0x10]\n"
+ "add v8.4s, v8.4s, v3.4s\n"
+ "add v9.4s, v9.4s, v2.4s\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q0, [x14, #0x30]\n"
+ "add v10.4s, v10.4s, v1.4s\n"
+ "add v11.4s, v11.4s, v0.4s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x11, x20\n"
+ "add x24, x25, x20\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "add x23, x24, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x21, x22, x20\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19\n"
- "add v12.4s, v12.4s, v0.4s\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19\n"
- "add v16.4s, v16.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v3.4s\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "prfm pstl1keep, [x20, #0x0]\n"
- "add x19, x20, x19\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "prfm pstl1keep, [x19, #0x0]\n"
- "add v28.4s, v28.4s, v0.4s\n"
- "ldr q1, [x11, #0x10]\n"
- "ldr q2, [x11, #0x20]\n"
- "add v9.4s, v9.4s, v1.4s\n"
- "ldr q3, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
- "add v10.4s, v10.4s, v2.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
- "add v14.4s, v14.4s, v2.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "add v29.4s, v29.4s, v1.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
+ "add v13.4s, v13.4s, v2.4s\n"
+ "add v14.4s, v14.4s, v1.4s\n"
+ "add x14, x14, #0x40\n"
+ "add v15.4s, v15.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v2.4s\n"
+ "add v18.4s, v18.4s, v1.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v2.4s\n"
+ "add v26.4s, v26.4s, v1.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v2.4s\n"
+ "add v30.4s, v30.4s, v1.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
"tbz %x[flags], #4, 150f\n"
"ldr q0, [x12, #0x0]\n"
"ldr q4, [x13, #0x0]\n"
@@ -3148,20 +3147,20 @@ void a64_hybrid_s8qs_dot_6x16 (
"ldr q2, [x12, #0x20]\n"
"ldr q6, [x13, #0x20]\n"
"ldr q3, [x12, #0x30]\n"
- "add x12, x12, #0x40\n"
"ldr q7, [x13, #0x30]\n"
+ "add x12, x12, #0x40\n"
"add x13, x13, #0x40\n"
"b 151f\n"
"150:" // Height 6: per layer parameters
- "add x24, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x24]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
"mov v1.16b, v0.16b\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x24]\n"
- "mov v2.16b, v0.16b\n"
- "mov v3.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
"151:" // Height 6: parameters loaded
"sqrdmulh v8.4s, v8.4s, v4.4s\n"
@@ -3189,348 +3188,347 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v30.4s, v30.4s, v6.4s\n"
"sqrdmulh v31.4s, v31.4s, v7.4s\n"
"tbz %x[flags], #5, 152f\n"
- "and v4.16b, v8.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v9.16b, v1.16b\n"
- "and v6.16b, v10.16b, v2.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v11.16b, v3.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v4.4s\n"
+ "and v7.16b, v8.16b, v0.16b\n"
+ "and v6.16b, v9.16b, v1.16b\n"
+ "and v5.16b, v10.16b, v2.16b\n"
+ "and v4.16b, v11.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "and v4.16b, v12.16b, v0.16b\n"
- "sqadd v9.4s, v9.4s, v5.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v6.4s\n"
- "and v5.16b, v13.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v7.4s\n"
- "and v6.16b, v14.16b, v2.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v12.4s, v12.4s, v4.4s\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v14.4s, v14.4s, v6.4s\n"
- "and v5.16b, v17.16b, v1.16b\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v7.4s\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v7.16b, v19.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "and v6.16b, v22.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "and v7.16b, v23.16b, v3.16b\n"
+ "sqadd v8.4s, v8.4s, v7.4s\n"
+ "sqadd v9.4s, v9.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v5.4s\n"
+ "sqadd v11.4s, v11.4s, v4.4s\n"
+ "and v7.16b, v12.16b, v0.16b\n"
+ "and v6.16b, v13.16b, v1.16b\n"
+ "and v5.16b, v14.16b, v2.16b\n"
+ "and v4.16b, v15.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "and v4.16b, v24.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v6.4s\n"
- "and v5.16b, v25.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v7.4s\n"
- "and v6.16b, v26.16b, v2.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v4.4s\n"
- "and v7.16b, v27.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v7.4s\n"
+ "sqadd v13.4s, v13.4s, v6.4s\n"
+ "sqadd v14.4s, v14.4s, v5.4s\n"
+ "sqadd v15.4s, v15.4s, v4.4s\n"
+ "and v7.16b, v16.16b, v0.16b\n"
+ "and v6.16b, v17.16b, v1.16b\n"
+ "and v5.16b, v18.16b, v2.16b\n"
+ "and v4.16b, v19.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v5.4s\n"
- "and v4.16b, v28.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v6.4s\n"
- "and v5.16b, v29.16b, v1.16b\n"
+ "sqadd v16.4s, v16.4s, v7.4s\n"
+ "sqadd v17.4s, v17.4s, v6.4s\n"
+ "sqadd v18.4s, v18.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "and v7.16b, v20.16b, v0.16b\n"
+ "and v6.16b, v21.16b, v1.16b\n"
+ "and v5.16b, v22.16b, v2.16b\n"
+ "and v4.16b, v23.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v27.4s, v27.4s, v7.4s\n"
- "and v6.16b, v30.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v7.4s\n"
+ "sqadd v21.4s, v21.4s, v6.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v7.16b, v24.16b, v0.16b\n"
+ "and v6.16b, v25.16b, v1.16b\n"
+ "and v5.16b, v26.16b, v2.16b\n"
+ "and v4.16b, v27.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v4.4s\n"
- "and v7.16b, v31.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v7.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "and v7.16b, v28.16b, v0.16b\n"
+ "and v6.16b, v29.16b, v1.16b\n"
+ "and v5.16b, v30.16b, v2.16b\n"
+ "and v4.16b, v31.16b, v3.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v29.4s, v29.4s, v5.4s\n"
- "sqadd v30.4s, v30.4s, v6.4s\n"
- "sqadd v31.4s, v31.4s, v7.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v7.4s\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "sqadd v30.4s, v30.4s, v5.4s\n"
+ "sqadd v31.4s, v31.4s, v4.4s\n"
"152:" // Height 6: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v6.4s }, [x20]\n"
"srshl v8.4s, v8.4s, v0.4s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x24]\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x24, %x[qp], %[minval]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
- "ld1r { v5.4s }, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "ld1r { v6.4s }, [x24]\n"
- "cmp x10, #0x10\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x20]\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x20]\n"
"srshl v14.4s, v14.4s, v2.4s\n"
"srshl v15.4s, v15.4s, v3.4s\n"
- "add v8.4s, v8.4s, v4.4s\n"
- "add v9.4s, v9.4s, v4.4s\n"
- "add v10.4s, v10.4s, v4.4s\n"
- "smin v8.4s, v8.4s, v6.4s\n"
- "smin v9.4s, v9.4s, v6.4s\n"
- "smin v10.4s, v10.4s, v6.4s\n"
- "smax v8.4s, v8.4s, v5.4s\n"
- "smax v9.4s, v9.4s, v5.4s\n"
- "smax v10.4s, v10.4s, v5.4s\n"
- "add v11.4s, v11.4s, v4.4s\n"
- "add v12.4s, v12.4s, v4.4s\n"
- "add v13.4s, v13.4s, v4.4s\n"
- "smin v11.4s, v11.4s, v6.4s\n"
- "smin v12.4s, v12.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v6.4s\n"
- "smax v11.4s, v11.4s, v5.4s\n"
- "smax v12.4s, v12.4s, v5.4s\n"
- "smax v13.4s, v13.4s, v5.4s\n"
- "add v14.4s, v14.4s, v4.4s\n"
- "add v15.4s, v15.4s, v4.4s\n"
+ "cmp x10, #0x10\n"
"srshl v16.4s, v16.4s, v0.4s\n"
- "smin v14.4s, v14.4s, v6.4s\n"
- "smin v15.4s, v15.4s, v6.4s\n"
"srshl v17.4s, v17.4s, v1.4s\n"
- "smax v14.4s, v14.4s, v5.4s\n"
- "smax v15.4s, v15.4s, v5.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
"srshl v20.4s, v20.4s, v0.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
"srshl v21.4s, v21.4s, v1.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
"srshl v22.4s, v22.4s, v2.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
"srshl v23.4s, v23.4s, v3.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
"srshl v24.4s, v24.4s, v0.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
"srshl v25.4s, v25.4s, v1.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
"srshl v26.4s, v26.4s, v2.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
"srshl v28.4s, v28.4s, v0.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
"srshl v29.4s, v29.4s, v1.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
"srshl v30.4s, v30.4s, v2.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
"srshl v31.4s, v31.4s, v3.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
+ "add v8.4s, v8.4s, v6.4s\n"
+ "add v9.4s, v9.4s, v6.4s\n"
+ "add v10.4s, v10.4s, v6.4s\n"
+ "add v11.4s, v11.4s, v6.4s\n"
+ "add v12.4s, v12.4s, v6.4s\n"
+ "add v13.4s, v13.4s, v6.4s\n"
+ "add v14.4s, v14.4s, v6.4s\n"
+ "add v15.4s, v15.4s, v6.4s\n"
+ "add v16.4s, v16.4s, v6.4s\n"
+ "add v17.4s, v17.4s, v6.4s\n"
+ "add v18.4s, v18.4s, v6.4s\n"
+ "add v19.4s, v19.4s, v6.4s\n"
+ "add v20.4s, v20.4s, v6.4s\n"
+ "add v21.4s, v21.4s, v6.4s\n"
+ "add v22.4s, v22.4s, v6.4s\n"
+ "add v23.4s, v23.4s, v6.4s\n"
+ "add v24.4s, v24.4s, v6.4s\n"
+ "add v25.4s, v25.4s, v6.4s\n"
+ "add v26.4s, v26.4s, v6.4s\n"
+ "add v27.4s, v27.4s, v6.4s\n"
+ "add v28.4s, v28.4s, v6.4s\n"
+ "add v29.4s, v29.4s, v6.4s\n"
+ "add v30.4s, v30.4s, v6.4s\n"
+ "add v31.4s, v31.4s, v6.4s\n"
+ "smin v8.4s, v8.4s, v5.4s\n"
+ "smin v9.4s, v9.4s, v5.4s\n"
+ "smin v10.4s, v10.4s, v5.4s\n"
+ "smin v11.4s, v11.4s, v5.4s\n"
+ "smin v12.4s, v12.4s, v5.4s\n"
+ "smin v13.4s, v13.4s, v5.4s\n"
+ "smin v14.4s, v14.4s, v5.4s\n"
+ "smin v15.4s, v15.4s, v5.4s\n"
+ "smin v16.4s, v16.4s, v5.4s\n"
+ "smin v17.4s, v17.4s, v5.4s\n"
+ "smin v18.4s, v18.4s, v5.4s\n"
+ "smin v19.4s, v19.4s, v5.4s\n"
+ "smin v20.4s, v20.4s, v5.4s\n"
+ "smin v21.4s, v21.4s, v5.4s\n"
+ "smin v22.4s, v22.4s, v5.4s\n"
+ "smin v23.4s, v23.4s, v5.4s\n"
+ "smin v24.4s, v24.4s, v5.4s\n"
+ "smin v25.4s, v25.4s, v5.4s\n"
+ "smin v26.4s, v26.4s, v5.4s\n"
+ "smin v27.4s, v27.4s, v5.4s\n"
+ "smin v28.4s, v28.4s, v5.4s\n"
+ "smin v29.4s, v29.4s, v5.4s\n"
+ "smin v30.4s, v30.4s, v5.4s\n"
+ "smin v31.4s, v31.4s, v5.4s\n"
+ "smax v8.4s, v8.4s, v4.4s\n"
+ "smax v9.4s, v9.4s, v4.4s\n"
+ "smax v10.4s, v10.4s, v4.4s\n"
+ "smax v11.4s, v11.4s, v4.4s\n"
+ "smax v12.4s, v12.4s, v4.4s\n"
+ "smax v13.4s, v13.4s, v4.4s\n"
+ "smax v14.4s, v14.4s, v4.4s\n"
+ "smax v15.4s, v15.4s, v4.4s\n"
+ "smax v16.4s, v16.4s, v4.4s\n"
+ "smax v17.4s, v17.4s, v4.4s\n"
+ "smax v18.4s, v18.4s, v4.4s\n"
+ "smax v19.4s, v19.4s, v4.4s\n"
+ "smax v20.4s, v20.4s, v4.4s\n"
+ "smax v21.4s, v21.4s, v4.4s\n"
+ "smax v22.4s, v22.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v4.4s\n"
+ "smax v24.4s, v24.4s, v4.4s\n"
+ "smax v25.4s, v25.4s, v4.4s\n"
+ "smax v26.4s, v26.4s, v4.4s\n"
+ "smax v27.4s, v27.4s, v4.4s\n"
+ "smax v28.4s, v28.4s, v4.4s\n"
+ "smax v29.4s, v29.4s, v4.4s\n"
+ "smax v30.4s, v30.4s, v4.4s\n"
+ "smax v31.4s, v31.4s, v4.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
+ "uzp1 v2.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v1.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v0.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v19.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
- "uzp1 v29.8h, v30.8h, v31.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
- "uzp1 v12.16b, v12.16b, v13.16b\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
- "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "uzp1 v17.8h, v30.8h, v31.8h\n"
+ "uzp1 v8.16b, v8.16b, v2.16b\n"
+ "uzp1 v12.16b, v12.16b, v1.16b\n"
+ "uzp1 v16.16b, v16.16b, v0.16b\n"
+ "uzp1 v20.16b, v20.16b, v19.16b\n"
+ "uzp1 v24.16b, v24.16b, v18.16b\n"
+ "uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 161f\n"
"tbz x10, #3, 156f\n"
- "str d8, [x9], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "str d28, [x19], #0x8\n"
+ "str d8, [x11], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x10, #2, 154f\n"
- "st1 { v8.s }[2], [x9], #0x4\n"
- "st1 { v12.s }[2], [x23], #0x4\n"
- "st1 { v16.s }[2], [x22], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
- "st1 { v28.s }[2], [x19], #0x4\n"
+ "st1 { v8.s }[2], [x11], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
"tbz x10, #1, 153f\n"
- "st1 { v8.h }[6], [x9], #0x2\n"
- "st1 { v12.h }[6], [x23], #0x2\n"
- "st1 { v16.h }[6], [x22], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
- "st1 { v28.h }[6], [x19], #0x2\n"
+ "st1 { v8.h }[6], [x11], #0x2\n"
+ "st1 { v12.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
"tbz x10, #0, 160f\n"
- "st1 { v8.b }[14], [x9]\n"
- "st1 { v12.b }[14], [x23]\n"
- "st1 { v16.b }[14], [x22]\n"
- "st1 { v20.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
- "st1 { v28.b }[14], [x19]\n"
+ "st1 { v8.b }[14], [x11]\n"
+ "st1 { v12.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 160f\n"
"153:" // Height 6: Partial direct writeback: partial_1_12
"tbz x10, #0, 160f\n"
- "st1 { v8.b }[12], [x9]\n"
- "st1 { v12.b }[12], [x23]\n"
- "st1 { v16.b }[12], [x22]\n"
- "st1 { v20.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
- "st1 { v28.b }[12], [x19]\n"
+ "st1 { v8.b }[12], [x11]\n"
+ "st1 { v12.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 160f\n"
"154:" // Height 6: Partial direct writeback: partial_2_8
"tbz x10, #1, 155f\n"
- "st1 { v8.h }[4], [x9], #0x2\n"
- "st1 { v12.h }[4], [x23], #0x2\n"
- "st1 { v16.h }[4], [x22], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
- "st1 { v28.h }[4], [x19], #0x2\n"
+ "st1 { v8.h }[4], [x11], #0x2\n"
+ "st1 { v12.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
"tbz x10, #0, 160f\n"
- "st1 { v8.b }[10], [x9]\n"
- "st1 { v12.b }[10], [x23]\n"
- "st1 { v16.b }[10], [x22]\n"
- "st1 { v20.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
- "st1 { v28.b }[10], [x19]\n"
+ "st1 { v8.b }[10], [x11]\n"
+ "st1 { v12.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 160f\n"
"155:" // Height 6: Partial direct writeback: partial_1_8
"tbz x10, #0, 160f\n"
- "st1 { v8.b }[8], [x9]\n"
- "st1 { v12.b }[8], [x23]\n"
- "st1 { v16.b }[8], [x22]\n"
- "st1 { v20.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
- "st1 { v28.b }[8], [x19]\n"
+ "st1 { v8.b }[8], [x11]\n"
+ "st1 { v12.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 160f\n"
"156:" // Height 6: Partial direct writeback: partial_4_0
"tbz x10, #2, 158f\n"
- "str s8, [x9], #0x4\n"
- "str s12, [x23], #0x4\n"
- "str s16, [x22], #0x4\n"
- "str s20, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
- "str s28, [x19], #0x4\n"
+ "str s8, [x11], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
"tbz x10, #1, 157f\n"
- "st1 { v8.h }[2], [x9], #0x2\n"
- "st1 { v12.h }[2], [x23], #0x2\n"
- "st1 { v16.h }[2], [x22], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
- "st1 { v28.h }[2], [x19], #0x2\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v12.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
"tbz x10, #0, 160f\n"
- "st1 { v8.b }[6], [x9]\n"
- "st1 { v12.b }[6], [x23]\n"
- "st1 { v16.b }[6], [x22]\n"
- "st1 { v20.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
- "st1 { v28.b }[6], [x19]\n"
+ "st1 { v8.b }[6], [x11]\n"
+ "st1 { v12.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 160f\n"
"157:" // Height 6: Partial direct writeback: partial_1_4
"tbz x10, #0, 160f\n"
- "st1 { v8.b }[4], [x9]\n"
- "st1 { v12.b }[4], [x23]\n"
- "st1 { v16.b }[4], [x22]\n"
- "st1 { v20.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
- "st1 { v28.b }[4], [x19]\n"
+ "st1 { v8.b }[4], [x11]\n"
+ "st1 { v12.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 160f\n"
"158:" // Height 6: Partial direct writeback: partial_2_0
"tbz x10, #1, 159f\n"
- "str h8, [x9], #0x2\n"
- "str h12, [x23], #0x2\n"
- "str h16, [x22], #0x2\n"
- "str h20, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
- "str h28, [x19], #0x2\n"
+ "str h8, [x11], #0x2\n"
+ "str h12, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
"tbz x10, #0, 160f\n"
- "st1 { v8.b }[2], [x9]\n"
- "st1 { v12.b }[2], [x23]\n"
- "st1 { v16.b }[2], [x22]\n"
- "st1 { v20.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
- "st1 { v28.b }[2], [x19]\n"
+ "st1 { v8.b }[2], [x11]\n"
+ "st1 { v12.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 160f\n"
"159:" // Height 6: Partial direct writeback: partial_1_0
- "str b8, [x9, #0x0]\n"
- "str b12, [x23, #0x0]\n"
- "str b16, [x22, #0x0]\n"
- "str b20, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
- "str b28, [x19, #0x0]\n"
+ "str b8, [x11, #0x0]\n"
+ "str b12, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"160:" // Height 6: Partial direct writeback: Done
"b 162f\n"
"161:" // Height 6: Full writeback
- "str q8, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "str q12, [x23, #0x0]\n"
- "str q16, [x22, #0x0]\n"
- "str q20, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
- "str q28, [x19, #0x0]\n"
+ "str q8, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q12, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"162:" // Height 6: Writeback done
"subs x10, x10, #0x10\n"
"bgt 137b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 164f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 163f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"163:" // Update direct input
- "mov x19, #0x6\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"164:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
new file mode 100644
index 0000000000..d0d5f1b80d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
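+// ARGLIST fixes the signature shared by the kernel declaration and the
+// kern_type pointer below. Positionally, the arguments match the parameters
+// of the definition in generic.cpp: num_strings, string_lengths, A_arg,
+// M, N, B_ptr, output_arg, qp, col_bias, col_base.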
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_s8qs_mmla_6x16( ARGLIST );
+
+class cls_a64_hybrid_s8qs_mmla_6x16
+{
+public:
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static constexpr unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 8> transforms = {};
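+ // Rough per-model throughput estimates consumed by arm_gemm's kernel
+ // selection heuristic; the precise meaning of the single figure is given
+ // by the PerformanceParameters type in performance_parameters.hpp.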
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 50.42 };
+ case CPUModel::A510:
+ return { 28.71 };
+ case CPUModel::V1:
+ return { 77.72 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_s8qs_mmla_6x16;
+ cls_a64_hybrid_s8qs_mmla_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..0771829d37
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
@@ -0,0 +1,3626 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8qs_mmla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+ struct KernelArgs {
+ const int32_t *multiplier_ptr = {};
+ const int32_t *shift_ptr = {};
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
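+ // KernelArgs gathers the parameters that are not passed in registers;
+ // the inline assembly below reads its fields through the
+ // offsetof(KernelArgs, ...) operands in the constraint list.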
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->per_channel_requant) {
+ flags |= 0x10;
+ ka.multiplier_ptr=qp->per_channel_muls + col_base;
+ ka.shift_ptr=qp->per_channel_right_shifts + col_base;
+ }
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
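+ // Flag bits set above and tested by the assembly below (tbz on %x[flags]):
+ // 0x4 = indirect output, 0x8 = indirect input (bit 3),
+ // 0x10 = per-channel requantization (bit 4),
+ // 0x20 = c_offset > minval, enabling the shift-correction path (bit 5).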
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 146f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 117f\n"
+ "beq 88f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 59f\n"
+ "beq 30f\n"
+ "mov x14, %x[col_bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "2:" // Height 1: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "blt 9f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q6, [x9, #0x10]\n"
+ "blt 8f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "trn1 v18.2d, v1.2d, v21.2d\n"
+ ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n"
+ ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ "add x9, x9, #0x100\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "bge 7b\n"
+ "8:" // Height 1: Multiply loop: Single iteration only
+ "trn1 v18.2d, v1.2d, v19.2d\n"
+ ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v19.2d\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n"
+ ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x9, x9, #0x100\n"
+ "9:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 16f\n"
+ "cmp x27, #0x8\n"
+ "blt 11f\n"
+ "10:" // Height 1: Multiply loop: Odd block loop
+ "ldr d18, [x26], #0x8\n"
+ "ldr q17, [x9, #0x0]\n"
+ "trn1 v18.2d, v18.2d, v16.2d\n"
+ "ldr q31, [x9, #0x10]\n"
+ ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e9fa64c // smmla v12.4s, v18.16b, v31.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "add x9, x9, #0x80\n"
+ "bge 10b\n"
+ "11:" // Height 1: Multiply loop: Skip odd blocks
+ "cbz x27, 16f\n"
+ "tbz x27, #2, 13f\n"
+ "ldr s1, [x26], #0x4\n"
+ "tbz x27, #1, 12f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "tbz x27, #0, 15f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "b 15f\n"
+ "12:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 15f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "b 15f\n"
+ "13:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 14f\n"
+ "ldr h1, [x26], #0x2\n"
+ "tbz x27, #0, 15f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "b 15f\n"
+ "14:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "15:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q19, [x9, #0x10]\n"
+ "trn1 v18.2d, v1.2d, v16.2d\n"
+ ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e93a64c // smmla v12.4s, v18.16b, v19.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "add x9, x9, #0x80\n"
+ "16:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 4b\n"
+ "ldr q19, [x14, #0x0]\n"
+ "ldr q18, [x14, #0x10]\n"
+ "uzp1 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v9.2d, v9.2d, v13.2d\n"
+ "ldr q17, [x14, #0x20]\n"
+ "ldr q16, [x14, #0x30]\n"
+ "uzp1 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v11.2d, v11.2d, v15.2d\n"
+ "mov v15.16b, v8.16b\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "add v15.4s, v15.4s, v19.4s\n"
+ "add x14, x14, #0x40\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "add v10.4s, v10.4s, v17.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "tbz %x[flags], #4, 17f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "add x13, x13, #0x40\n"
+ "b 18f\n"
+ "17:" // Height 1: per layer parameters
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "mov v1.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "18:" // Height 1: parameters loaded
+ "sqrdmulh v15.4s, v15.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "tbz %x[flags], #5, 19f\n"
+ "and v17.16b, v15.16b, v0.16b\n"
+ "and v16.16b, v9.16b, v1.16b\n"
+ "and v25.16b, v10.16b, v2.16b\n"
+ "and v18.16b, v11.16b, v3.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v17.4s\n"
+ "sqadd v9.4s, v9.4s, v16.4s\n"
+ "sqadd v10.4s, v10.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v18.4s\n"
+ "19:" // Height 1: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add v15.4s, v15.4s, v18.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v10.4s, v10.4s, v18.4s\n"
+ "add v11.4s, v11.4s, v18.4s\n"
+ "cmp x10, #0x10\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "uzp1 v15.8h, v15.8h, v9.8h\n"
+ "uzp1 v16.8h, v10.8h, v11.8h\n"
+ "uzp1 v15.16b, v15.16b, v16.16b\n"
+ "bge 28f\n"
+ "tbz x10, #3, 23f\n"
+ "str d15, [x11], #0x8\n"
+ "tbz x10, #2, 21f\n"
+ "st1 { v15.s }[2], [x11], #0x4\n"
+ "tbz x10, #1, 20f\n"
+ "st1 { v15.h }[6], [x11], #0x2\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[14], [x11]\n"
+ "b 27f\n"
+ "20:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[12], [x11]\n"
+ "b 27f\n"
+ "21:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 22f\n"
+ "st1 { v15.h }[4], [x11], #0x2\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[10], [x11]\n"
+ "b 27f\n"
+ "22:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[8], [x11]\n"
+ "b 27f\n"
+ "23:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 25f\n"
+ "str s15, [x11], #0x4\n"
+ "tbz x10, #1, 24f\n"
+ "st1 { v15.h }[2], [x11], #0x2\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[6], [x11]\n"
+ "b 27f\n"
+ "24:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[4], [x11]\n"
+ "b 27f\n"
+ "25:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 26f\n"
+ "str h15, [x11], #0x2\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[2], [x11]\n"
+ "b 27f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_0
+ "str b15, [x11, #0x0]\n"
+ "27:" // Height 1: Partial direct writeback: Done
+ "b 29f\n"
+ "28:" // Height 1: Full writeback
+ "str q15, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "29:" // Height 1: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 2b\n"
+ "b 176f\n"
+ "30:" // Height 2
+ "mov x14, %x[col_bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "31:" // Height 2: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "32:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "33:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 34f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 35f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "b 35f\n"
+ "34:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "35:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "blt 38f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ "blt 37f\n"
+ "36:" // Height 2: Multiply loop: Main loop head
+ "trn1 v18.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n"
+ "add x9, x9, #0x100\n"
+ "ldr q7, [x9, #0x0]\n"
+ ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "bge 36b\n"
+ "37:" // Height 2: Multiply loop: Single iteration only
+ "trn1 v18.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x80]\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x90]\n"
+ ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xa0]\n"
+ ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xb0]\n"
+ ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xc0]\n"
+ ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xd0]\n"
+ ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x9, #0xe0]\n"
+ ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n"
+ "ldr q16, [x9, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n"
+ ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x9, x9, #0x100\n"
+ "38:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 45f\n"
+ "cmp x27, #0x8\n"
+ "blt 40f\n"
+ "39:" // Height 2: Multiply loop: Odd block loop
+ "ldr d17, [x26], #0x8\n"
+ "ldr d16, [x25], #0x8\n"
+ "trn1 v18.2d, v17.2d, v16.2d\n"
+ "sub x27, x27, #0x8\n"
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
+ ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64c // smmla v12.4s, v18.16b, v16.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ "ldr q16, [x9, #0x70]\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "add x9, x9, #0x80\n"
+ "bge 39b\n"
+ "40:" // Height 2: Multiply loop: Skip odd blocks
+ "cbz x27, 45f\n"
+ "tbz x27, #2, 42f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "tbz x27, #1, 41f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "tbz x27, #0, 44f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "b 44f\n"
+ "41:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 44f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "b 44f\n"
+ "42:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 43f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "tbz x27, #0, 44f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "b 44f\n"
+ "43:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "44:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q17, [x9, #0x0]\n"
+ "ldr q16, [x9, #0x10]\n"
+ "trn1 v18.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x20]\n"
+ ".inst 0x4e90a64c // smmla v12.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x30]\n"
+ ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x40]\n"
+ ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x50]\n"
+ ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n"
+ "ldr q17, [x9, #0x60]\n"
+ ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n"
+ "ldr q16, [x9, #0x70]\n"
+ ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n"
+ ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n"
+ "add x9, x9, #0x80\n"
+ "45:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 33b\n"
+ "ldr q19, [x14, #0x0]\n"
+ "ldr q18, [x14, #0x10]\n"
+ "uzp1 v17.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "ldr q5, [x14, #0x20]\n"
+ "ldr q16, [x14, #0x30]\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "add x25, x11, x20\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "mov v15.16b, v17.16b\n"
+ "add v15.4s, v15.4s, v19.4s\n"
+ "add x14, x14, #0x40\n"
+ "add v12.4s, v12.4s, v18.4s\n"
+ "add v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v16.4s\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "add v10.4s, v10.4s, v5.4s\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "tbz %x[flags], #4, 46f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "add x13, x13, #0x40\n"
+ "b 47f\n"
+ "46:" // Height 2: per layer parameters
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "mov v1.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "47:" // Height 2: parameters loaded
+ "sqrdmulh v15.4s, v15.4s, v4.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v5.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v6.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v7.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "tbz %x[flags], #5, 48f\n"
+ "and v19.16b, v15.16b, v0.16b\n"
+ "and v18.16b, v12.16b, v1.16b\n"
+ "and v17.16b, v13.16b, v2.16b\n"
+ "and v16.16b, v14.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v19.4s\n"
+ "sqadd v12.4s, v12.4s, v18.4s\n"
+ "sqadd v13.4s, v13.4s, v17.4s\n"
+ "sqadd v14.4s, v14.4s, v16.4s\n"
+ "and v19.16b, v8.16b, v0.16b\n"
+ "and v18.16b, v9.16b, v1.16b\n"
+ "and v17.16b, v10.16b, v2.16b\n"
+ "and v16.16b, v11.16b, v3.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v19.4s\n"
+ "sqadd v9.4s, v9.4s, v18.4s\n"
+ "sqadd v10.4s, v10.4s, v17.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
+ "48:" // Height 2: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "srshl v12.4s, v12.4s, v1.4s\n"
+ "srshl v13.4s, v13.4s, v2.4s\n"
+ "srshl v14.4s, v14.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
+ "add v15.4s, v15.4s, v18.4s\n"
+ "add v12.4s, v12.4s, v18.4s\n"
+ "add v13.4s, v13.4s, v18.4s\n"
+ "add v14.4s, v14.4s, v18.4s\n"
+ "add v8.4s, v8.4s, v18.4s\n"
+ "add v9.4s, v9.4s, v18.4s\n"
+ "add v10.4s, v10.4s, v18.4s\n"
+ "add v11.4s, v11.4s, v18.4s\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v8.4s, v8.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "uzp1 v15.8h, v15.8h, v12.8h\n"
+ "uzp1 v17.8h, v13.8h, v14.8h\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v16.8h, v10.8h, v11.8h\n"
+ "uzp1 v15.16b, v15.16b, v17.16b\n"
+ "uzp1 v8.16b, v8.16b, v16.16b\n"
+ "bge 57f\n"
+ "tbz x10, #3, 52f\n"
+ "str d15, [x11], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "tbz x10, #2, 50f\n"
+ "st1 { v15.s }[2], [x11], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "tbz x10, #1, 49f\n"
+ "st1 { v15.h }[6], [x11], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[14], [x11]\n"
+ "st1 { v8.b }[14], [x25]\n"
+ "b 56f\n"
+ "49:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[12], [x11]\n"
+ "st1 { v8.b }[12], [x25]\n"
+ "b 56f\n"
+ "50:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 51f\n"
+ "st1 { v15.h }[4], [x11], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[10], [x11]\n"
+ "st1 { v8.b }[10], [x25]\n"
+ "b 56f\n"
+ "51:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[8], [x11]\n"
+ "st1 { v8.b }[8], [x25]\n"
+ "b 56f\n"
+ "52:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 54f\n"
+ "str s15, [x11], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "tbz x10, #1, 53f\n"
+ "st1 { v15.h }[2], [x11], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[6], [x11]\n"
+ "st1 { v8.b }[6], [x25]\n"
+ "b 56f\n"
+ "53:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[4], [x11]\n"
+ "st1 { v8.b }[4], [x25]\n"
+ "b 56f\n"
+ "54:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 55f\n"
+ "str h15, [x11], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[2], [x11]\n"
+ "st1 { v8.b }[2], [x25]\n"
+ "b 56f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_0
+ "str b15, [x11, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
+ "56:" // Height 2: Partial direct writeback: Done
+ "b 58f\n"
+ "57:" // Height 2: Full writeback
+ "str q15, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q8, [x25, #0x0]\n"
+ "58:" // Height 2: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 31b\n"
+ "b 176f\n"
+ "59:" // Height 3
+ "mov x14, %x[col_bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "60:" // Height 3: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "61:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "62:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 63f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 64f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "b 64f\n"
+ "63:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "64:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "blt 67f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ "blt 66f\n"
+ "65:" // Height 3: Multiply loop: Main loop head
+ "trn1 v27.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
+ "trn1 v26.2d, v3.2d, v28.2d\n"
+ ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
+ ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ "trn2 v3.2d, v3.2d, v28.2d\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x80]\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x90]\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xa0]\n"
+ ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xb0]\n"
+ ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xc0]\n"
+ ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xd0]\n"
+ ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xe0]\n"
+ ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n"
+ "ldr q7, [x9, #0x0]\n"
+ ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ "bge 65b\n"
+ "66:" // Height 3: Multiply loop: Single iteration only
+ "trn1 v27.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
+ "trn1 v26.2d, v3.2d, v25.2d\n"
+ ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n"
+ "ldr q24, [x9, #0x20]\n"
+ ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
+ ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n"
+ "ldr q0, [x9, #0x30]\n"
+ ".inst 0x4e98a769 // smmla v9.4s, v27.16b, v24.16b\n"
+ "trn2 v3.2d, v3.2d, v25.2d\n"
+ ".inst 0x4e98a751 // smmla v17.4s, v26.16b, v24.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e80a76d // smmla v13.4s, v27.16b, v0.16b\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4e80a755 // smmla v21.4s, v26.16b, v0.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x80]\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x90]\n"
+ ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xa0]\n"
+ ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xb0]\n"
+ ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xc0]\n"
+ ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xd0]\n"
+ ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xe0]\n"
+ ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n"
+ ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n"
+ "67:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 74f\n"
+ "cmp x27, #0x8\n"
+ "blt 69f\n"
+ "68:" // Height 3: Multiply loop: Odd block loop
+ "ldr d25, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "trn1 v27.2d, v25.2d, v24.2d\n"
+ "ldr d24, [x24], #0x8\n"
+ "ldr q25, [x9, #0x0]\n"
+ "trn1 v26.2d, v24.2d, v26.2d\n"
+ ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n"
+ "ldr q24, [x9, #0x10]\n"
+ ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
+ "bge 68b\n"
+ "69:" // Height 3: Multiply loop: Skip odd blocks
+ "cbz x27, 74f\n"
+ "tbz x27, #2, 71f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "tbz x27, #1, 70f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "ld1 { v3.h }[2], [x24], #0x2\n"
+ "tbz x27, #0, 73f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "ld1 { v3.b }[6], [x24]\n"
+ "b 73f\n"
+ "70:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 73f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "ld1 { v3.b }[4], [x24]\n"
+ "b 73f\n"
+ "71:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 72f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "tbz x27, #0, 73f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "b 73f\n"
+ "72:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "73:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q28, [x9, #0x10]\n"
+ "trn1 v27.2d, v1.2d, v2.2d\n"
+ "trn1 v26.2d, v3.2d, v24.2d\n"
+ ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e9ca76c // smmla v12.4s, v27.16b, v28.16b\n"
+ ".inst 0x4e9ca754 // smmla v20.4s, v26.16b, v28.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
+ "74:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 62b\n"
+ "ldr q28, [x14, #0x0]\n"
+ "ldr q27, [x14, #0x10]\n"
+ "uzp1 v26.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "ldr q25, [x14, #0x20]\n"
+ "ldr q24, [x14, #0x30]\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "add x25, x11, x20\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "add x24, x25, x20\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "add x14, x14, #0x40\n"
+ "mov v23.16b, v26.16b\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v12.4s, v12.4s, v27.4s\n"
+ "add v13.4s, v13.4s, v25.4s\n"
+ "add v14.4s, v14.4s, v24.4s\n"
+ "add v8.4s, v8.4s, v28.4s\n"
+ "add v9.4s, v9.4s, v27.4s\n"
+ "add v10.4s, v10.4s, v25.4s\n"
+ "add v11.4s, v11.4s, v24.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v27.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "tbz %x[flags], #4, 75f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "add x13, x13, #0x40\n"
+ "b 76f\n"
+ "75:" // Height 3: per layer parameters
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "mov v1.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "76:" // Height 3: parameters loaded
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v5.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v6.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v7.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "tbz %x[flags], #5, 77f\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "and v22.16b, v12.16b, v1.16b\n"
+ "and v21.16b, v13.16b, v2.16b\n"
+ "and v20.16b, v14.16b, v3.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
+ "sqadd v12.4s, v12.4s, v22.4s\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
+ "sqadd v14.4s, v14.4s, v20.4s\n"
+ "and v24.16b, v8.16b, v0.16b\n"
+ "and v22.16b, v9.16b, v1.16b\n"
+ "and v21.16b, v10.16b, v2.16b\n"
+ "and v20.16b, v11.16b, v3.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v24.4s\n"
+ "sqadd v9.4s, v9.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sqadd v11.4s, v11.4s, v20.4s\n"
+ "and v24.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v1.16b\n"
+ "and v21.16b, v18.16b, v2.16b\n"
+ "and v20.16b, v19.16b, v3.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
+ "77:" // Height 3: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v12.4s, v12.4s, v1.4s\n"
+ "srshl v13.4s, v13.4s, v2.4s\n"
+ "srshl v14.4s, v14.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v22.4s\n"
+ "add v12.4s, v12.4s, v22.4s\n"
+ "add v13.4s, v13.4s, v22.4s\n"
+ "add v14.4s, v14.4s, v22.4s\n"
+ "add v8.4s, v8.4s, v22.4s\n"
+ "add v9.4s, v9.4s, v22.4s\n"
+ "add v10.4s, v10.4s, v22.4s\n"
+ "add v11.4s, v11.4s, v22.4s\n"
+ "add v16.4s, v16.4s, v22.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v22.4s\n"
+ "add v19.4s, v19.4s, v22.4s\n"
+ "smin v23.4s, v23.4s, v21.4s\n"
+ "smin v12.4s, v12.4s, v21.4s\n"
+ "smin v13.4s, v13.4s, v21.4s\n"
+ "smin v14.4s, v14.4s, v21.4s\n"
+ "smin v8.4s, v8.4s, v21.4s\n"
+ "smin v9.4s, v9.4s, v21.4s\n"
+ "smin v10.4s, v10.4s, v21.4s\n"
+ "smin v11.4s, v11.4s, v21.4s\n"
+ "smin v16.4s, v16.4s, v21.4s\n"
+ "smin v17.4s, v17.4s, v21.4s\n"
+ "smin v18.4s, v18.4s, v21.4s\n"
+ "smin v19.4s, v19.4s, v21.4s\n"
+ "smax v23.4s, v23.4s, v20.4s\n"
+ "smax v12.4s, v12.4s, v20.4s\n"
+ "smax v13.4s, v13.4s, v20.4s\n"
+ "smax v14.4s, v14.4s, v20.4s\n"
+ "smax v8.4s, v8.4s, v20.4s\n"
+ "smax v9.4s, v9.4s, v20.4s\n"
+ "smax v10.4s, v10.4s, v20.4s\n"
+ "smax v11.4s, v11.4s, v20.4s\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
+ "uzp1 v23.8h, v23.8h, v12.8h\n"
+ "uzp1 v21.8h, v13.8h, v14.8h\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v20.8h, v10.8h, v11.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.16b, v23.16b, v21.16b\n"
+ "uzp1 v8.16b, v8.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 86f\n"
+ "tbz x10, #3, 81f\n"
+ "str d23, [x11], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "tbz x10, #2, 79f\n"
+ "st1 { v23.s }[2], [x11], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "tbz x10, #1, 78f\n"
+ "st1 { v23.h }[6], [x11], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "st1 { v16.h }[6], [x24], #0x2\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[14], [x11]\n"
+ "st1 { v8.b }[14], [x25]\n"
+ "st1 { v16.b }[14], [x24]\n"
+ "b 85f\n"
+ "78:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[12], [x11]\n"
+ "st1 { v8.b }[12], [x25]\n"
+ "st1 { v16.b }[12], [x24]\n"
+ "b 85f\n"
+ "79:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 80f\n"
+ "st1 { v23.h }[4], [x11], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "st1 { v16.h }[4], [x24], #0x2\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[10], [x11]\n"
+ "st1 { v8.b }[10], [x25]\n"
+ "st1 { v16.b }[10], [x24]\n"
+ "b 85f\n"
+ "80:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[8], [x11]\n"
+ "st1 { v8.b }[8], [x25]\n"
+ "st1 { v16.b }[8], [x24]\n"
+ "b 85f\n"
+ "81:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 83f\n"
+ "str s23, [x11], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "tbz x10, #1, 82f\n"
+ "st1 { v23.h }[2], [x11], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "st1 { v16.h }[2], [x24], #0x2\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[6], [x11]\n"
+ "st1 { v8.b }[6], [x25]\n"
+ "st1 { v16.b }[6], [x24]\n"
+ "b 85f\n"
+ "82:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[4], [x11]\n"
+ "st1 { v8.b }[4], [x25]\n"
+ "st1 { v16.b }[4], [x24]\n"
+ "b 85f\n"
+ "83:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 84f\n"
+ "str h23, [x11], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "str h16, [x24], #0x2\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[2], [x11]\n"
+ "st1 { v8.b }[2], [x25]\n"
+ "st1 { v16.b }[2], [x24]\n"
+ "b 85f\n"
+ "84:" // Height 3: Partial direct writeback: partial_1_0
+ "str b23, [x11, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
+ "str b16, [x24, #0x0]\n"
+ "85:" // Height 3: Partial direct writeback: Done
+ "b 87f\n"
+ "86:" // Height 3: Full writeback
+ "str q23, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q8, [x25, #0x0]\n"
+ "str q16, [x24, #0x0]\n"
+ "87:" // Height 3: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 60b\n"
+ "b 176f\n"
+ "88:" // Height 4
+ "mov x14, %x[col_bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "89:" // Height 4: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "90:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "91:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 92f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 93f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "b 93f\n"
+ "92:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "93:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "blt 96f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ "blt 95f\n"
+ "94:" // Height 4: Multiply loop: Main loop head
+ "trn1 v27.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
+ "sub x27, x27, #0x10\n"
+ "trn1 v26.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
+ ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ "add x23, x23, #0x10\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x80]\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x90]\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xa0]\n"
+ ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xb0]\n"
+ ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xc0]\n"
+ ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xd0]\n"
+ ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xe0]\n"
+ ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n"
+ "ldr q7, [x9, #0x0]\n"
+ ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ "bge 94b\n"
+ "95:" // Height 4: Multiply loop: Single iteration only
+ "trn1 v27.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v26.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n"
+ ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x80]\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x90]\n"
+ ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xa0]\n"
+ ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xb0]\n"
+ ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xc0]\n"
+ ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xd0]\n"
+ ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x9, #0xe0]\n"
+ ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n"
+ "ldr q24, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n"
+ ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n"
+ ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n"
+ "96:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 103f\n"
+ "cmp x27, #0x8\n"
+ "blt 98f\n"
+ "97:" // Height 4: Multiply loop: Odd block loop
+ "ldr d25, [x26], #0x8\n"
+ "ldr d24, [x25], #0x8\n"
+ "trn1 v27.2d, v25.2d, v24.2d\n"
+ "sub x27, x27, #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "trn1 v26.2d, v25.2d, v24.2d\n"
+ "cmp x27, #0x8\n"
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x10]\n"
+ ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
+ "bge 97b\n"
+ "98:" // Height 4: Multiply loop: Skip odd blocks
+ "cbz x27, 103f\n"
+ "tbz x27, #2, 100f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "tbz x27, #1, 99f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "ld1 { v3.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "tbz x27, #0, 102f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "ld1 { v3.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "b 102f\n"
+ "99:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 102f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "ld1 { v3.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "b 102f\n"
+ "100:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 101f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "tbz x27, #0, 102f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "b 102f\n"
+ "101:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x23, #0x0]\n"
+ "102:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q25, [x9, #0x0]\n"
+ "ldr q24, [x9, #0x10]\n"
+ "trn1 v27.2d, v1.2d, v2.2d\n"
+ "trn1 v26.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x20]\n"
+ ".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x30]\n"
+ ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x40]\n"
+ ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x50]\n"
+ ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n"
+ "ldr q25, [x9, #0x60]\n"
+ ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n"
+ "ldr q24, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n"
+ ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n"
+ "103:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 91b\n"
+ "ldr q28, [x14, #0x0]\n"
+ "ldr q27, [x14, #0x10]\n"
+ "uzp1 v26.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "ldr q25, [x14, #0x20]\n"
+ "ldr q24, [x14, #0x30]\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "add x25, x11, x20\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "add x24, x25, x20\n"
+ "add x23, x24, x20\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "add x14, x14, #0x40\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "mov v23.16b, v26.16b\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v12.4s, v12.4s, v27.4s\n"
+ "add v13.4s, v13.4s, v25.4s\n"
+ "add v14.4s, v14.4s, v24.4s\n"
+ "add v8.4s, v8.4s, v28.4s\n"
+ "add v9.4s, v9.4s, v27.4s\n"
+ "add v10.4s, v10.4s, v25.4s\n"
+ "add v11.4s, v11.4s, v24.4s\n"
+ "add v15.4s, v15.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add v21.4s, v21.4s, v25.4s\n"
+ "add v22.4s, v22.4s, v24.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v27.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "tbz %x[flags], #4, 104f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "add x13, x13, #0x40\n"
+ "b 105f\n"
+ "104:" // Height 4: per layer parameters
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "mov v1.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "105:" // Height 4: parameters loaded
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v5.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v6.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v7.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v5.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "tbz %x[flags], #5, 106f\n"
+ "and v27.16b, v23.16b, v0.16b\n"
+ "and v26.16b, v12.16b, v1.16b\n"
+ "and v25.16b, v13.16b, v2.16b\n"
+ "and v24.16b, v14.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v27.4s\n"
+ "sqadd v12.4s, v12.4s, v26.4s\n"
+ "sqadd v13.4s, v13.4s, v25.4s\n"
+ "sqadd v14.4s, v14.4s, v24.4s\n"
+ "and v27.16b, v8.16b, v0.16b\n"
+ "and v26.16b, v9.16b, v1.16b\n"
+ "and v25.16b, v10.16b, v2.16b\n"
+ "and v24.16b, v11.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v27.4s\n"
+ "sqadd v9.4s, v9.4s, v26.4s\n"
+ "sqadd v10.4s, v10.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v24.4s\n"
+ "and v27.16b, v15.16b, v0.16b\n"
+ "and v26.16b, v20.16b, v1.16b\n"
+ "and v25.16b, v21.16b, v2.16b\n"
+ "and v24.16b, v22.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v27.4s\n"
+ "sqadd v20.4s, v20.4s, v26.4s\n"
+ "sqadd v21.4s, v21.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v24.4s\n"
+ "and v27.16b, v16.16b, v0.16b\n"
+ "and v26.16b, v17.16b, v1.16b\n"
+ "and v25.16b, v18.16b, v2.16b\n"
+ "and v24.16b, v19.16b, v3.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sqadd v17.4s, v17.4s, v26.4s\n"
+ "sqadd v18.4s, v18.4s, v25.4s\n"
+ "sqadd v19.4s, v19.4s, v24.4s\n"
+ "106:" // Height 4: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v12.4s, v12.4s, v1.4s\n"
+ "srshl v13.4s, v13.4s, v2.4s\n"
+ "srshl v14.4s, v14.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v2.4s\n"
+ "srshl v22.4s, v22.4s, v3.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v26.4s\n"
+ "add v12.4s, v12.4s, v26.4s\n"
+ "add v13.4s, v13.4s, v26.4s\n"
+ "add v14.4s, v14.4s, v26.4s\n"
+ "add v8.4s, v8.4s, v26.4s\n"
+ "add v9.4s, v9.4s, v26.4s\n"
+ "add v10.4s, v10.4s, v26.4s\n"
+ "add v11.4s, v11.4s, v26.4s\n"
+ "add v15.4s, v15.4s, v26.4s\n"
+ "add v20.4s, v20.4s, v26.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v16.4s, v16.4s, v26.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v26.4s\n"
+ "smin v23.4s, v23.4s, v25.4s\n"
+ "smin v12.4s, v12.4s, v25.4s\n"
+ "smin v13.4s, v13.4s, v25.4s\n"
+ "smin v14.4s, v14.4s, v25.4s\n"
+ "smin v8.4s, v8.4s, v25.4s\n"
+ "smin v9.4s, v9.4s, v25.4s\n"
+ "smin v10.4s, v10.4s, v25.4s\n"
+ "smin v11.4s, v11.4s, v25.4s\n"
+ "smin v15.4s, v15.4s, v25.4s\n"
+ "smin v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v25.4s\n"
+ "smin v22.4s, v22.4s, v25.4s\n"
+ "smin v16.4s, v16.4s, v25.4s\n"
+ "smin v17.4s, v17.4s, v25.4s\n"
+ "smin v18.4s, v18.4s, v25.4s\n"
+ "smin v19.4s, v19.4s, v25.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
+ "smax v12.4s, v12.4s, v24.4s\n"
+ "smax v13.4s, v13.4s, v24.4s\n"
+ "smax v14.4s, v14.4s, v24.4s\n"
+ "smax v8.4s, v8.4s, v24.4s\n"
+ "smax v9.4s, v9.4s, v24.4s\n"
+ "smax v10.4s, v10.4s, v24.4s\n"
+ "smax v11.4s, v11.4s, v24.4s\n"
+ "smax v15.4s, v15.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "uzp1 v23.8h, v23.8h, v12.8h\n"
+ "uzp1 v25.8h, v13.8h, v14.8h\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v24.8h, v10.8h, v11.8h\n"
+ "uzp1 v15.8h, v15.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.16b, v23.16b, v25.16b\n"
+ "uzp1 v8.16b, v8.16b, v24.16b\n"
+ "uzp1 v15.16b, v15.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 115f\n"
+ "tbz x10, #3, 110f\n"
+ "str d23, [x11], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x10, #2, 108f\n"
+ "st1 { v23.s }[2], [x11], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "tbz x10, #1, 107f\n"
+ "st1 { v23.h }[6], [x11], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "st1 { v15.h }[6], [x24], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[14], [x11]\n"
+ "st1 { v8.b }[14], [x25]\n"
+ "st1 { v15.b }[14], [x24]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "b 114f\n"
+ "107:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[12], [x11]\n"
+ "st1 { v8.b }[12], [x25]\n"
+ "st1 { v15.b }[12], [x24]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "b 114f\n"
+ "108:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 109f\n"
+ "st1 { v23.h }[4], [x11], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "st1 { v15.h }[4], [x24], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[10], [x11]\n"
+ "st1 { v8.b }[10], [x25]\n"
+ "st1 { v15.b }[10], [x24]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "b 114f\n"
+ "109:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[8], [x11]\n"
+ "st1 { v8.b }[8], [x25]\n"
+ "st1 { v15.b }[8], [x24]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "b 114f\n"
+ "110:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 112f\n"
+ "str s23, [x11], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "tbz x10, #1, 111f\n"
+ "st1 { v23.h }[2], [x11], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "st1 { v15.h }[2], [x24], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[6], [x11]\n"
+ "st1 { v8.b }[6], [x25]\n"
+ "st1 { v15.b }[6], [x24]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "b 114f\n"
+ "111:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[4], [x11]\n"
+ "st1 { v8.b }[4], [x25]\n"
+ "st1 { v15.b }[4], [x24]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "b 114f\n"
+ "112:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 113f\n"
+ "str h23, [x11], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "str h15, [x24], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[2], [x11]\n"
+ "st1 { v8.b }[2], [x25]\n"
+ "st1 { v15.b }[2], [x24]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "b 114f\n"
+ "113:" // Height 4: Partial direct writeback: partial_1_0
+ "str b23, [x11, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
+ "str b15, [x24, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "114:" // Height 4: Partial direct writeback: Done
+ "b 116f\n"
+ "115:" // Height 4: Full writeback
+ "str q23, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q8, [x25, #0x0]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "116:" // Height 4: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 89b\n"
+ "b 176f\n"
+ "117:" // Height 5
+ "mov x14, %x[col_bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "118:" // Height 5: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "119:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "120:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 121f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 122f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "b 122f\n"
+ "121:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "122:" // Height 5: input setup done
+ "cmp x27, #0x10\n"
+ "blt 125f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "blt 124f\n"
+ "123:" // Height 5: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "sub x27, x27, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q0, [x9, #0x90]\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q6, [x9, #0xa0]\n"
+ ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xb0]\n"
+ ".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xc0]\n"
+ ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xd0]\n"
+ ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xe0]\n"
+ ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n"
+ "ldr q7, [x9, #0x0]\n"
+ ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ "ldr q3, [x24, #0x0]\n"
+ ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n"
+ "ldr q5, [x22, #0x0]\n"
+ "bge 123b\n"
+ "124:" // Height 5: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "add x22, x22, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q0, [x9, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q2, [x9, #0xa0]\n"
+ ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xb0]\n"
+ ".inst 0x4e82a429 // smmla v9.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a471 // smmla v17.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4b9 // smmla v25.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x9, #0xc0]\n"
+ ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xd0]\n"
+ ".inst 0x4e82a42a // smmla v10.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a472 // smmla v18.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4ba // smmla v26.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x9, #0xe0]\n"
+ ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e82a42b // smmla v11.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a473 // smmla v19.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4bb // smmla v27.4s, v5.16b, v2.16b\n"
+ ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n"
+ "125:" // Height 5: Multiply loop: Main loop skip
+ "cbz x27, 132f\n"
+ "cmp x27, #0x8\n"
+ "blt 127f\n"
+ "126:" // Height 5: Multiply loop: Odd block loop
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
+ "sub x27, x27, #0x8\n"
+ "ldr d0, [x22], #0x8\n"
+ "ldr q1, [x9, #0x0]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
+ ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x20]\n"
+ ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x30]\n"
+ ".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x40]\n"
+ ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x50]\n"
+ ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x60]\n"
+ ".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x70]\n"
+ ".inst 0x4e81a48b // smmla v11.4s, v4.16b, v1.16b\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n"
+ ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n"
+ "bge 126b\n"
+ "127:" // Height 5: Multiply loop: Skip odd blocks
+ "cbz x27, 132f\n"
+ "tbz x27, #2, 129f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x22], #0x4\n"
+ "tbz x27, #1, 128f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "ld1 { v3.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x22], #0x2\n"
+ "tbz x27, #0, 131f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "ld1 { v3.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v5.b }[6], [x22]\n"
+ "b 131f\n"
+ "128:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 131f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "ld1 { v3.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v5.b }[4], [x22]\n"
+ "b 131f\n"
+ "129:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 130f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h5, [x22], #0x2\n"
+ "tbz x27, #0, 131f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v5.b }[2], [x22]\n"
+ "b 131f\n"
+ "130:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x23, #0x0]\n"
+ "ldr b5, [x22, #0x0]\n"
+ "131:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q7, [x9, #0x0]\n"
+ "trn1 v6.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ "trn1 v2.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ "ldr q1, [x9, #0x20]\n"
+ ".inst 0x4e80a4cc // smmla v12.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x30]\n"
+ ".inst 0x4e81a4c9 // smmla v9.4s, v6.16b, v1.16b\n"
+ ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x40]\n"
+ ".inst 0x4e80a4cd // smmla v13.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x50]\n"
+ ".inst 0x4e81a4ca // smmla v10.4s, v6.16b, v1.16b\n"
+ ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x60]\n"
+ ".inst 0x4e80a4ce // smmla v14.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e81a4cb // smmla v11.4s, v6.16b, v1.16b\n"
+ ".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n"
+ ".inst 0x4e80a4cf // smmla v15.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n"
+ "132:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 120b\n"
+ "ldr q4, [x14, #0x0]\n"
+ "ldr q3, [x14, #0x10]\n"
+ "uzp1 v2.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q0, [x14, #0x30]\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x11, x20\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "add x24, x25, x20\n"
+ "add x23, x24, x20\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "add x14, x14, #0x40\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "mov v31.16b, v2.16b\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v3.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v14.4s, v14.4s, v0.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v3.4s\n"
+ "add v10.4s, v10.4s, v1.4s\n"
+ "add v11.4s, v11.4s, v0.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v1.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v1.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "tbz %x[flags], #4, 133f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "add x13, x13, #0x40\n"
+ "b 134f\n"
+ "133:" // Height 5: per layer parameters
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "mov v1.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "134:" // Height 5: parameters loaded
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v5.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v6.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v7.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v5.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v7.4s\n"
+ "tbz %x[flags], #5, 135f\n"
+ "and v30.16b, v31.16b, v0.16b\n"
+ "and v29.16b, v12.16b, v1.16b\n"
+ "and v28.16b, v13.16b, v2.16b\n"
+ "and v23.16b, v14.16b, v3.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v30.4s\n"
+ "sqadd v12.4s, v12.4s, v29.4s\n"
+ "sqadd v13.4s, v13.4s, v28.4s\n"
+ "sqadd v14.4s, v14.4s, v23.4s\n"
+ "and v30.16b, v8.16b, v0.16b\n"
+ "and v29.16b, v9.16b, v1.16b\n"
+ "and v28.16b, v10.16b, v2.16b\n"
+ "and v23.16b, v11.16b, v3.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v30.4s\n"
+ "sqadd v9.4s, v9.4s, v29.4s\n"
+ "sqadd v10.4s, v10.4s, v28.4s\n"
+ "sqadd v11.4s, v11.4s, v23.4s\n"
+ "and v30.16b, v15.16b, v0.16b\n"
+ "and v29.16b, v20.16b, v1.16b\n"
+ "and v28.16b, v21.16b, v2.16b\n"
+ "and v23.16b, v22.16b, v3.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sqadd v21.4s, v21.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v23.4s\n"
+ "and v30.16b, v16.16b, v0.16b\n"
+ "and v29.16b, v17.16b, v1.16b\n"
+ "and v28.16b, v18.16b, v2.16b\n"
+ "and v23.16b, v19.16b, v3.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v30.4s\n"
+ "sqadd v17.4s, v17.4s, v29.4s\n"
+ "sqadd v18.4s, v18.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v23.4s\n"
+ "and v30.16b, v24.16b, v0.16b\n"
+ "and v29.16b, v25.16b, v1.16b\n"
+ "and v28.16b, v26.16b, v2.16b\n"
+ "and v23.16b, v27.16b, v3.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v30.4s\n"
+ "sqadd v25.4s, v25.4s, v29.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "sqadd v27.4s, v27.4s, v23.4s\n"
+ "135:" // Height 5: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "srshl v12.4s, v12.4s, v1.4s\n"
+ "srshl v13.4s, v13.4s, v2.4s\n"
+ "srshl v14.4s, v14.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v2.4s\n"
+ "srshl v22.4s, v22.4s, v3.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v2.4s\n"
+ "srshl v27.4s, v27.4s, v3.4s\n"
+ "add v31.4s, v31.4s, v29.4s\n"
+ "add v12.4s, v12.4s, v29.4s\n"
+ "add v13.4s, v13.4s, v29.4s\n"
+ "add v14.4s, v14.4s, v29.4s\n"
+ "add v8.4s, v8.4s, v29.4s\n"
+ "add v9.4s, v9.4s, v29.4s\n"
+ "add v10.4s, v10.4s, v29.4s\n"
+ "add v11.4s, v11.4s, v29.4s\n"
+ "add v15.4s, v15.4s, v29.4s\n"
+ "add v20.4s, v20.4s, v29.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v16.4s, v16.4s, v29.4s\n"
+ "add v17.4s, v17.4s, v29.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "smin v31.4s, v31.4s, v28.4s\n"
+ "smin v12.4s, v12.4s, v28.4s\n"
+ "smin v13.4s, v13.4s, v28.4s\n"
+ "smin v14.4s, v14.4s, v28.4s\n"
+ "smin v8.4s, v8.4s, v28.4s\n"
+ "smin v9.4s, v9.4s, v28.4s\n"
+ "smin v10.4s, v10.4s, v28.4s\n"
+ "smin v11.4s, v11.4s, v28.4s\n"
+ "smin v15.4s, v15.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
+ "smax v31.4s, v31.4s, v23.4s\n"
+ "smax v12.4s, v12.4s, v23.4s\n"
+ "smax v13.4s, v13.4s, v23.4s\n"
+ "smax v14.4s, v14.4s, v23.4s\n"
+ "smax v8.4s, v8.4s, v23.4s\n"
+ "smax v9.4s, v9.4s, v23.4s\n"
+ "smax v10.4s, v10.4s, v23.4s\n"
+ "smax v11.4s, v11.4s, v23.4s\n"
+ "smax v15.4s, v15.4s, v23.4s\n"
+ "smax v20.4s, v20.4s, v23.4s\n"
+ "smax v21.4s, v21.4s, v23.4s\n"
+ "smax v22.4s, v22.4s, v23.4s\n"
+ "smax v16.4s, v16.4s, v23.4s\n"
+ "smax v17.4s, v17.4s, v23.4s\n"
+ "smax v18.4s, v18.4s, v23.4s\n"
+ "smax v19.4s, v19.4s, v23.4s\n"
+ "smax v24.4s, v24.4s, v23.4s\n"
+ "smax v25.4s, v25.4s, v23.4s\n"
+ "smax v26.4s, v26.4s, v23.4s\n"
+ "smax v27.4s, v27.4s, v23.4s\n"
+ "uzp1 v31.8h, v31.8h, v12.8h\n"
+ "uzp1 v28.8h, v13.8h, v14.8h\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v23.8h, v10.8h, v11.8h\n"
+ "uzp1 v15.8h, v15.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v28.16b\n"
+ "uzp1 v8.16b, v8.16b, v23.16b\n"
+ "uzp1 v15.16b, v15.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
+ "bge 144f\n"
+ "tbz x10, #3, 139f\n"
+ "str d31, [x11], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x10, #2, 137f\n"
+ "st1 { v31.s }[2], [x11], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x10, #1, 136f\n"
+ "st1 { v31.h }[6], [x11], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "st1 { v15.h }[6], [x24], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[14], [x11]\n"
+ "st1 { v8.b }[14], [x25]\n"
+ "st1 { v15.b }[14], [x24]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "b 143f\n"
+ "136:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[12], [x11]\n"
+ "st1 { v8.b }[12], [x25]\n"
+ "st1 { v15.b }[12], [x24]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "b 143f\n"
+ "137:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 138f\n"
+ "st1 { v31.h }[4], [x11], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "st1 { v15.h }[4], [x24], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[10], [x11]\n"
+ "st1 { v8.b }[10], [x25]\n"
+ "st1 { v15.b }[10], [x24]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "b 143f\n"
+ "138:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[8], [x11]\n"
+ "st1 { v8.b }[8], [x25]\n"
+ "st1 { v15.b }[8], [x24]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "b 143f\n"
+ "139:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 141f\n"
+ "str s31, [x11], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x10, #1, 140f\n"
+ "st1 { v31.h }[2], [x11], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "st1 { v15.h }[2], [x24], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[6], [x11]\n"
+ "st1 { v8.b }[6], [x25]\n"
+ "st1 { v15.b }[6], [x24]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "b 143f\n"
+ "140:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[4], [x11]\n"
+ "st1 { v8.b }[4], [x25]\n"
+ "st1 { v15.b }[4], [x24]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "b 143f\n"
+ "141:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 142f\n"
+ "str h31, [x11], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "str h15, [x24], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[2], [x11]\n"
+ "st1 { v8.b }[2], [x25]\n"
+ "st1 { v15.b }[2], [x24]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "b 143f\n"
+ "142:" // Height 5: Partial direct writeback: partial_1_0
+ "str b31, [x11, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
+ "str b15, [x24, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "143:" // Height 5: Partial direct writeback: Done
+ "b 145f\n"
+ "144:" // Height 5: Full writeback
+ "str q31, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q8, [x25, #0x0]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "145:" // Height 5: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 118b\n"
+ "b 176f\n"
+ "146:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x6\n"
+ "mov x14, %x[col_bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "147:" // Height 6: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "148:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "149:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 150f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 151f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
+ "b 151f\n"
+ "150:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
+ "151:" // Height 6: input setup done
+ "cmp x27, #0x10\n"
+ "blt 154f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "ldr q7, [x9, #0x0]\n"
+ "blt 153f\n"
+ "152:" // Height 6: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "sub x27, x27, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "ldr q2, [x25, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q0, [x9, #0x90]\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q6, [x9, #0xa0]\n"
+ ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xb0]\n"
+ ".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xc0]\n"
+ ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xd0]\n"
+ ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xe0]\n"
+ ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n"
+ "ldr q7, [x9, #0x0]\n"
+ ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ "ldr q3, [x24, #0x0]\n"
+ ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "bge 152b\n"
+ "153:" // Height 6: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q0, [x9, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q2, [x9, #0xa0]\n"
+ ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xb0]\n"
+ ".inst 0x4e82a429 // smmla v9.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a471 // smmla v17.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4b9 // smmla v25.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x9, #0xc0]\n"
+ ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xd0]\n"
+ ".inst 0x4e82a42a // smmla v10.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a472 // smmla v18.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4ba // smmla v26.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x9, #0xe0]\n"
+ ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e82a42b // smmla v11.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a473 // smmla v19.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4bb // smmla v27.4s, v5.16b, v2.16b\n"
+ ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n"
+ "154:" // Height 6: Multiply loop: Main loop skip
+ "cbz x27, 161f\n"
+ "cmp x27, #0x8\n"
+ "blt 156f\n"
+ "155:" // Height 6: Multiply loop: Odd block loop
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "sub x27, x27, #0x8\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
+ "cmp x27, #0x8\n"
+ "ldr d1, [x22], #0x8\n"
+ "ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
+ "ldr q1, [x9, #0x0]\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x20]\n"
+ ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x30]\n"
+ ".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x40]\n"
+ ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x50]\n"
+ ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x9, #0x60]\n"
+ ".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e81a48b // smmla v11.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n"
+ ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n"
+ "bge 155b\n"
+ "156:" // Height 6: Multiply loop: Skip odd blocks
+ "cbz x27, 161f\n"
+ "tbz x27, #2, 158f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x22], #0x4\n"
+ "ldr s6, [x21], #0x4\n"
+ "tbz x27, #1, 157f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "ld1 { v3.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x22], #0x2\n"
+ "ld1 { v6.h }[2], [x21], #0x2\n"
+ "tbz x27, #0, 160f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "ld1 { v3.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v5.b }[6], [x22]\n"
+ "ld1 { v6.b }[6], [x21]\n"
+ "b 160f\n"
+ "157:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 160f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "ld1 { v3.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v5.b }[4], [x22]\n"
+ "ld1 { v6.b }[4], [x21]\n"
+ "b 160f\n"
+ "158:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 159f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h5, [x22], #0x2\n"
+ "ldr h6, [x21], #0x2\n"
+ "tbz x27, #0, 160f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v5.b }[2], [x22]\n"
+ "ld1 { v6.b }[2], [x21]\n"
+ "b 160f\n"
+ "159:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x23, #0x0]\n"
+ "ldr b5, [x22, #0x0]\n"
+ "ldr b6, [x21, #0x0]\n"
+ "160:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q7, [x9, #0x0]\n"
+ "trn1 v2.2d, v1.2d, v2.2d\n"
+ "trn1 v4.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a448 // smmla v8.4s, v2.16b, v7.16b\n"
+ "trn1 v3.2d, v5.2d, v6.2d\n"
+ "ldr q0, [x9, #0x10]\n"
+ ".inst 0x4e87a490 // smmla v16.4s, v4.16b, v7.16b\n"
+ ".inst 0x4e87a478 // smmla v24.4s, v3.16b, v7.16b\n"
+ "ldr q1, [x9, #0x20]\n"
+ ".inst 0x4e80a44c // smmla v12.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e80a494 // smmla v20.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a47c // smmla v28.4s, v3.16b, v0.16b\n"
+ "ldr q0, [x9, #0x30]\n"
+ ".inst 0x4e81a449 // smmla v9.4s, v2.16b, v1.16b\n"
+ ".inst 0x4e81a491 // smmla v17.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a479 // smmla v25.4s, v3.16b, v1.16b\n"
+ "ldr q1, [x9, #0x40]\n"
+ ".inst 0x4e80a44d // smmla v13.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e80a495 // smmla v21.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a47d // smmla v29.4s, v3.16b, v0.16b\n"
+ "ldr q0, [x9, #0x50]\n"
+ ".inst 0x4e81a44a // smmla v10.4s, v2.16b, v1.16b\n"
+ ".inst 0x4e81a492 // smmla v18.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a47a // smmla v26.4s, v3.16b, v1.16b\n"
+ "ldr q1, [x9, #0x60]\n"
+ ".inst 0x4e80a44e // smmla v14.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e80a496 // smmla v22.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a47e // smmla v30.4s, v3.16b, v0.16b\n"
+ "ldr q0, [x9, #0x70]\n"
+ ".inst 0x4e81a44b // smmla v11.4s, v2.16b, v1.16b\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e81a493 // smmla v19.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a47b // smmla v27.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e80a44f // smmla v15.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e80a497 // smmla v23.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a47f // smmla v31.4s, v3.16b, v0.16b\n"
+ "161:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 149b\n"
+ "ldr q4, [x14, #0x0]\n"
+ "ldr q3, [x14, #0x10]\n"
+ "uzp1 v2.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q0, [x14, #0x30]\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x11, x20\n"
+ "add x24, x25, x20\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "add x23, x24, x20\n"
+ "add x22, x23, x20\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "add x21, x22, x20\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x14, x14, #0x40\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "mov v31.16b, v2.16b\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v3.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v14.4s, v14.4s, v0.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v3.4s\n"
+ "add v10.4s, v10.4s, v1.4s\n"
+ "add v11.4s, v11.4s, v0.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v1.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v1.4s\n"
+ "add v30.4s, v30.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v1.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "tbz %x[flags], #4, 162f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "add x13, x13, #0x40\n"
+ "b 163f\n"
+ "162:" // Height 6: per layer parameters
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "mov v1.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "163:" // Height 6: parameters loaded
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v5.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v6.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v7.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v5.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v5.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v6.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v7.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v7.4s\n"
+ "tbz %x[flags], #5, 164f\n"
+ "and v7.16b, v31.16b, v0.16b\n"
+ "and v6.16b, v12.16b, v1.16b\n"
+ "and v5.16b, v13.16b, v2.16b\n"
+ "and v4.16b, v14.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v7.4s\n"
+ "sqadd v12.4s, v12.4s, v6.4s\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "sqadd v14.4s, v14.4s, v4.4s\n"
+ "and v7.16b, v8.16b, v0.16b\n"
+ "and v6.16b, v9.16b, v1.16b\n"
+ "and v5.16b, v10.16b, v2.16b\n"
+ "and v4.16b, v11.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v7.4s\n"
+ "sqadd v9.4s, v9.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v5.4s\n"
+ "sqadd v11.4s, v11.4s, v4.4s\n"
+ "and v7.16b, v15.16b, v0.16b\n"
+ "and v6.16b, v20.16b, v1.16b\n"
+ "and v5.16b, v21.16b, v2.16b\n"
+ "and v4.16b, v22.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "and v7.16b, v16.16b, v0.16b\n"
+ "and v6.16b, v17.16b, v1.16b\n"
+ "and v5.16b, v18.16b, v2.16b\n"
+ "and v4.16b, v19.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v7.4s\n"
+ "sqadd v17.4s, v17.4s, v6.4s\n"
+ "sqadd v18.4s, v18.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "and v7.16b, v23.16b, v0.16b\n"
+ "and v6.16b, v28.16b, v1.16b\n"
+ "and v5.16b, v29.16b, v2.16b\n"
+ "and v4.16b, v30.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v7.4s\n"
+ "sqadd v28.4s, v28.4s, v6.4s\n"
+ "sqadd v29.4s, v29.4s, v5.4s\n"
+ "sqadd v30.4s, v30.4s, v4.4s\n"
+ "and v7.16b, v24.16b, v0.16b\n"
+ "and v6.16b, v25.16b, v1.16b\n"
+ "and v5.16b, v26.16b, v2.16b\n"
+ "and v4.16b, v27.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v7.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "164:" // Height 6: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v6.4s }, [x20]\n"
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "srshl v12.4s, v12.4s, v1.4s\n"
+ "srshl v13.4s, v13.4s, v2.4s\n"
+ "srshl v14.4s, v14.4s, v3.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x20]\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "cmp x10, #0x10\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v2.4s\n"
+ "srshl v22.4s, v22.4s, v3.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v2.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v2.4s\n"
+ "srshl v27.4s, v27.4s, v3.4s\n"
+ "add v31.4s, v31.4s, v6.4s\n"
+ "add v12.4s, v12.4s, v6.4s\n"
+ "add v13.4s, v13.4s, v6.4s\n"
+ "add v14.4s, v14.4s, v6.4s\n"
+ "add v8.4s, v8.4s, v6.4s\n"
+ "add v9.4s, v9.4s, v6.4s\n"
+ "add v10.4s, v10.4s, v6.4s\n"
+ "add v11.4s, v11.4s, v6.4s\n"
+ "add v15.4s, v15.4s, v6.4s\n"
+ "add v20.4s, v20.4s, v6.4s\n"
+ "add v21.4s, v21.4s, v6.4s\n"
+ "add v22.4s, v22.4s, v6.4s\n"
+ "add v16.4s, v16.4s, v6.4s\n"
+ "add v17.4s, v17.4s, v6.4s\n"
+ "add v18.4s, v18.4s, v6.4s\n"
+ "add v19.4s, v19.4s, v6.4s\n"
+ "add v23.4s, v23.4s, v6.4s\n"
+ "add v28.4s, v28.4s, v6.4s\n"
+ "add v29.4s, v29.4s, v6.4s\n"
+ "add v30.4s, v30.4s, v6.4s\n"
+ "add v24.4s, v24.4s, v6.4s\n"
+ "add v25.4s, v25.4s, v6.4s\n"
+ "add v26.4s, v26.4s, v6.4s\n"
+ "add v27.4s, v27.4s, v6.4s\n"
+ "smin v31.4s, v31.4s, v5.4s\n"
+ "smin v12.4s, v12.4s, v5.4s\n"
+ "smin v13.4s, v13.4s, v5.4s\n"
+ "smin v14.4s, v14.4s, v5.4s\n"
+ "smin v8.4s, v8.4s, v5.4s\n"
+ "smin v9.4s, v9.4s, v5.4s\n"
+ "smin v10.4s, v10.4s, v5.4s\n"
+ "smin v11.4s, v11.4s, v5.4s\n"
+ "smin v15.4s, v15.4s, v5.4s\n"
+ "smin v20.4s, v20.4s, v5.4s\n"
+ "smin v21.4s, v21.4s, v5.4s\n"
+ "smin v22.4s, v22.4s, v5.4s\n"
+ "smin v16.4s, v16.4s, v5.4s\n"
+ "smin v17.4s, v17.4s, v5.4s\n"
+ "smin v18.4s, v18.4s, v5.4s\n"
+ "smin v19.4s, v19.4s, v5.4s\n"
+ "smin v23.4s, v23.4s, v5.4s\n"
+ "smin v28.4s, v28.4s, v5.4s\n"
+ "smin v29.4s, v29.4s, v5.4s\n"
+ "smin v30.4s, v30.4s, v5.4s\n"
+ "smin v24.4s, v24.4s, v5.4s\n"
+ "smin v25.4s, v25.4s, v5.4s\n"
+ "smin v26.4s, v26.4s, v5.4s\n"
+ "smin v27.4s, v27.4s, v5.4s\n"
+ "smax v31.4s, v31.4s, v4.4s\n"
+ "smax v12.4s, v12.4s, v4.4s\n"
+ "smax v13.4s, v13.4s, v4.4s\n"
+ "smax v14.4s, v14.4s, v4.4s\n"
+ "smax v8.4s, v8.4s, v4.4s\n"
+ "smax v9.4s, v9.4s, v4.4s\n"
+ "smax v10.4s, v10.4s, v4.4s\n"
+ "smax v11.4s, v11.4s, v4.4s\n"
+ "smax v15.4s, v15.4s, v4.4s\n"
+ "smax v20.4s, v20.4s, v4.4s\n"
+ "smax v21.4s, v21.4s, v4.4s\n"
+ "smax v22.4s, v22.4s, v4.4s\n"
+ "smax v16.4s, v16.4s, v4.4s\n"
+ "smax v17.4s, v17.4s, v4.4s\n"
+ "smax v18.4s, v18.4s, v4.4s\n"
+ "smax v19.4s, v19.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v4.4s\n"
+ "smax v28.4s, v28.4s, v4.4s\n"
+ "smax v29.4s, v29.4s, v4.4s\n"
+ "smax v30.4s, v30.4s, v4.4s\n"
+ "smax v24.4s, v24.4s, v4.4s\n"
+ "smax v25.4s, v25.4s, v4.4s\n"
+ "smax v26.4s, v26.4s, v4.4s\n"
+ "smax v27.4s, v27.4s, v4.4s\n"
+ "uzp1 v31.8h, v31.8h, v12.8h\n"
+ "uzp1 v1.8h, v13.8h, v14.8h\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v0.8h, v10.8h, v11.8h\n"
+ "uzp1 v15.8h, v15.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.8h, v23.8h, v28.8h\n"
+ "uzp1 v18.8h, v29.8h, v30.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v1.16b\n"
+ "uzp1 v8.16b, v8.16b, v0.16b\n"
+ "uzp1 v15.16b, v15.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v23.16b, v23.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
+ "bge 173f\n"
+ "tbz x10, #3, 168f\n"
+ "str d31, [x11], #0x8\n"
+ "str d8, [x25], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x10, #2, 166f\n"
+ "st1 { v31.s }[2], [x11], #0x4\n"
+ "st1 { v8.s }[2], [x25], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v23.s }[2], [x22], #0x4\n"
+ "st1 { v24.s }[2], [x21], #0x4\n"
+ "tbz x10, #1, 165f\n"
+ "st1 { v31.h }[6], [x11], #0x2\n"
+ "st1 { v8.h }[6], [x25], #0x2\n"
+ "st1 { v15.h }[6], [x24], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v23.h }[6], [x22], #0x2\n"
+ "st1 { v24.h }[6], [x21], #0x2\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[14], [x11]\n"
+ "st1 { v8.b }[14], [x25]\n"
+ "st1 { v15.b }[14], [x24]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v23.b }[14], [x22]\n"
+ "st1 { v24.b }[14], [x21]\n"
+ "b 172f\n"
+ "165:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[12], [x11]\n"
+ "st1 { v8.b }[12], [x25]\n"
+ "st1 { v15.b }[12], [x24]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v23.b }[12], [x22]\n"
+ "st1 { v24.b }[12], [x21]\n"
+ "b 172f\n"
+ "166:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 167f\n"
+ "st1 { v31.h }[4], [x11], #0x2\n"
+ "st1 { v8.h }[4], [x25], #0x2\n"
+ "st1 { v15.h }[4], [x24], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v23.h }[4], [x22], #0x2\n"
+ "st1 { v24.h }[4], [x21], #0x2\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[10], [x11]\n"
+ "st1 { v8.b }[10], [x25]\n"
+ "st1 { v15.b }[10], [x24]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v23.b }[10], [x22]\n"
+ "st1 { v24.b }[10], [x21]\n"
+ "b 172f\n"
+ "167:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[8], [x11]\n"
+ "st1 { v8.b }[8], [x25]\n"
+ "st1 { v15.b }[8], [x24]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v23.b }[8], [x22]\n"
+ "st1 { v24.b }[8], [x21]\n"
+ "b 172f\n"
+ "168:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 170f\n"
+ "str s31, [x11], #0x4\n"
+ "str s8, [x25], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s23, [x22], #0x4\n"
+ "str s24, [x21], #0x4\n"
+ "tbz x10, #1, 169f\n"
+ "st1 { v31.h }[2], [x11], #0x2\n"
+ "st1 { v8.h }[2], [x25], #0x2\n"
+ "st1 { v15.h }[2], [x24], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v23.h }[2], [x22], #0x2\n"
+ "st1 { v24.h }[2], [x21], #0x2\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[6], [x11]\n"
+ "st1 { v8.b }[6], [x25]\n"
+ "st1 { v15.b }[6], [x24]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v23.b }[6], [x22]\n"
+ "st1 { v24.b }[6], [x21]\n"
+ "b 172f\n"
+ "169:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[4], [x11]\n"
+ "st1 { v8.b }[4], [x25]\n"
+ "st1 { v15.b }[4], [x24]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v23.b }[4], [x22]\n"
+ "st1 { v24.b }[4], [x21]\n"
+ "b 172f\n"
+ "170:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 171f\n"
+ "str h31, [x11], #0x2\n"
+ "str h8, [x25], #0x2\n"
+ "str h15, [x24], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h23, [x22], #0x2\n"
+ "str h24, [x21], #0x2\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[2], [x11]\n"
+ "st1 { v8.b }[2], [x25]\n"
+ "st1 { v15.b }[2], [x24]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v23.b }[2], [x22]\n"
+ "st1 { v24.b }[2], [x21]\n"
+ "b 172f\n"
+ "171:" // Height 6: Partial direct writeback: partial_1_0
+ "str b31, [x11, #0x0]\n"
+ "str b8, [x25, #0x0]\n"
+ "str b15, [x24, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b23, [x22, #0x0]\n"
+ "str b24, [x21, #0x0]\n"
+ "172:" // Height 6: Partial direct writeback: Done
+ "b 174f\n"
+ "173:" // Height 6: Full writeback
+ "str q31, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "str q8, [x25, #0x0]\n"
+ "str q15, [x24, #0x0]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q24, [x21, #0x0]\n"
+ "174:" // Height 6: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 147b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 176f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 175f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "175:" // Update direct input
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "176:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
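
A note on the epilogue that closes the kernel above: after the uzp1/uzp2
rearrangement, each accumulator lane gets the column bias added, is scaled
with sqrdmulh against the per-layer or per-channel multiplier, rounding
shifted with srshl (with an optional flags-guarded and/sshr/sqadd pass that
refines rounding for negative lanes), offset by c_offset, clamped to
[minval, maxval], and narrowed to bytes. As a reading aid, here is a minimal
scalar sketch of that per-lane arithmetic, assuming the usual Requantize32
semantics; the field names match the asm operand list, but the helpers
themselves are illustrative, not ACL code.

#include <algorithm>
#include <cstdint>

// Saturating rounding doubling multiply-high, as SQRDMULH computes per lane.
static int32_t sqrdmulh32(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN)
        return INT32_MAX;                                 // the one saturating case
    int64_t p = 2 * (int64_t)a * (int64_t)b + (1LL << 31);
    return (int32_t)(p >> 32);                            // arithmetic shift assumed
}

// One output lane: bias add, fixed-point scale, rounding right shift
// (SRSHL with a negative shift operand), zero-point offset, clamp. The
// flags-guarded correction pass in the asm refines the rounding of
// negative values; the half-up rounding below is the first-order model.
static int8_t requantize_lane(int32_t acc, int32_t bias, int32_t mul,
                              int32_t right_shift, int32_t c_offset,
                              int32_t minval, int32_t maxval)
{
    int32_t v = sqrdmulh32(acc + bias, mul);
    if (right_shift > 0)
        v = (v + (1 << (right_shift - 1))) >> right_shift;
    v += c_offset;
    return (int8_t)std::min(std::max(v, minval), maxval); // uzp1 then packs bytes
}
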
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
index 759a78a413..a02fbe8f28 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
@@ -44,7 +44,8 @@ void a64_hybrid_s8s32_dot_6x16_a55( ARGLIST );
class cls_a64_hybrid_s8s32_dot_6x16
{
public:
- typedef int8_t operand_type;
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
typedef int32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,37 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 12.667, 2.0799, 0.2279 };
- default:
- return { 29.6736, 11.4025, 0.5591 };
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.65 };
+ case CPUModel::A510:
+ return { 15.87 };
+ case CPUModel::V1:
+ return { 54.50 };
+ case CPUModel::A55r1:
+ return { 9.217 };
+ }
}
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 9.5238, 2.0799, 0.2279 };
+ default:
+ return { 29.6736, 11.4025, 0.5591 };
+ case CPUModel::A510:
+ return { 16.66, 3.92, 0.48 };
+ case CPUModel::V1:
+ return { 42.62, 16.32, 0.83 };
+ }
+ }
+
+ return { 1.0 };
}
// Default to the generic kernel
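
Two things worth noting in the hunk above: operand_type is split into
lhs_operand_type/rhs_operand_type, and get_performance_parameters becomes a
template over the result type, so callers can ask for separate tuning
numbers for the plain s8-to-s32 kernel (int32_t) and the requantizing s8
path (int8_t), per CPU model. A hypothetical call site, only to show the
selection; it assumes a CPUInfo populated by the runtime:

// Illustrative sketch, not ACL code: query both tuning sets.
static void query_tuning(const CPUInfo *ci)
{
    auto p_s32 = cls_a64_hybrid_s8s32_dot_6x16::get_performance_parameters<int32_t>(ci);
    auto p_s8  = cls_a64_hybrid_s8s32_dot_6x16::get_performance_parameters<int8_t>(ci);
    (void)p_s32; (void)p_s8; // distinct estimates per result type and CPU model
}
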
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
index 3817785a79..289d38c3b6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -77,7 +77,6 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
ka.N = N;
ka.B_ptr = B_ptr;
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 171f\n"
@@ -87,73 +86,73 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"cmp %x[M], #0x2\n"
"bgt 69f\n"
"beq 35f\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x17, %x[output_ptr]\n"
"ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"tbz %x[flags], #0, 12f\n"
- "cmp x17, #0x10\n"
+ "cmp x8, #0x10\n"
"bge 11f\n"
- "tbz x17, #3, 6f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "tbz x17, #2, 4f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
- "tbz x17, #1, 3f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x15], #0x8\n"
- "tbz x17, #0, 10f\n"
- "ld1 { v11.s }[2], [x15]\n"
+ "tbz x8, #3, 6f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
+ "tbz x8, #2, 4f\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
+ "tbz x8, #1, 3f\n"
+ "ldr d11, [x17], #0x8\n"
+ "mov x25, #0x38\n"
+ "tbz x8, #0, 10f\n"
+ "ld1 { v11.s }[2], [x17]\n"
"b 10f\n"
"3:" // Height 1: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x17, #0, 10f\n"
- "ldr s11, [x15, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x8, #0, 10f\n"
+ "ldr s11, [x17, #0x0]\n"
"b 10f\n"
"4:" // Height 1: Partial accumulate: partial_2_8
- "tbz x17, #1, 5f\n"
- "ldr d10, [x15], #0x8\n"
- "mov x24, #0x28\n"
- "tbz x17, #0, 10f\n"
- "ld1 { v10.s }[2], [x15]\n"
+ "tbz x8, #1, 5f\n"
+ "ldr d10, [x17], #0x8\n"
+ "mov x25, #0x28\n"
+ "tbz x8, #0, 10f\n"
+ "ld1 { v10.s }[2], [x17]\n"
"b 10f\n"
"5:" // Height 1: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x17, #0, 10f\n"
- "ldr s10, [x15, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x8, #0, 10f\n"
+ "ldr s10, [x17, #0x0]\n"
"b 10f\n"
"6:" // Height 1: Partial accumulate: partial_4_0
- "tbz x17, #2, 8f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "tbz x17, #1, 7f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x15], #0x8\n"
- "tbz x17, #0, 10f\n"
- "ld1 { v9.s }[2], [x15]\n"
+ "tbz x8, #2, 8f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "tbz x8, #1, 7f\n"
+ "ldr d9, [x17], #0x8\n"
+ "mov x25, #0x18\n"
+ "tbz x8, #0, 10f\n"
+ "ld1 { v9.s }[2], [x17]\n"
"b 10f\n"
"7:" // Height 1: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x17, #0, 10f\n"
- "ldr s9, [x15, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x8, #0, 10f\n"
+ "ldr s9, [x17, #0x0]\n"
"b 10f\n"
"8:" // Height 1: Partial accumulate: partial_2_0
- "tbz x17, #1, 9f\n"
- "ldr d8, [x15], #0x8\n"
- "mov x24, #0x8\n"
- "tbz x17, #0, 10f\n"
- "ld1 { v8.s }[2], [x15]\n"
+ "tbz x8, #1, 9f\n"
+ "ldr d8, [x17], #0x8\n"
+ "mov x25, #0x8\n"
+ "tbz x8, #0, 10f\n"
+ "ld1 { v8.s }[2], [x17]\n"
"b 10f\n"
"9:" // Height 1: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x24, #0x0\n"
+ "ldr s8, [x17, #0x0]\n"
+ "mov x25, #0x0\n"
"10:" // Height 1: Partial accumulate: Done
- "sub x15, x15, x24\n"
+ "sub x17, x17, x25\n"
"b 13f\n"
"11:" // Height 1: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"b 13f\n"
"12:" // Height 1: no accumulate
"movi v8.4s, #0x0\n"
@@ -161,329 +160,329 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"movi v10.4s, #0x0\n"
"movi v11.4s, #0x0\n"
"13:" // Height 1: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 15f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "cbnz x14, 16f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "cbnz x15, 16f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20\n"
"b 16f\n"
"15:" // Height 1: setup direct input
- "mov x12, %x[input_ptr]\n"
+ "mov x13, %x[input_ptr]\n"
"16:" // Height 1: input setup done
- "cmp x13, #0x10\n"
+ "cmp x14, #0x10\n"
"blt 19f\n"
- "ldr q0, [x12, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x20\n"
"ldr q6, [x16, #0x0]\n"
- "cmp x13, #0x20\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 18f\n"
"17:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x10]\n"
- "ldr x11, [x16, #0x18]\n"
- "add x12, x12, #0x10\n"
- "ldr d6, [x16, #0x20]\n"
- "sub x13, x13, #0x10\n"
- "ldr x10, [x16, #0x28]\n"
- "cmp x13, #0x20\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr d17, [x16, #0x20]\n"
+ "ldr x20, [x16, #0x28]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- "ldr x10, [x16, #0x48]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x58]\n"
- "ldr x9, [x12, #0x8]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- "ldr x10, [x16, #0x68]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- "ldr x10, [x16, #0x88]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- "ldr x10, [x16, #0xa8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- "ldr x10, [x16, #0xc8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- "ldr x10, [x16, #0xe8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0xf0]\n"
+ "ldr d16, [x16, #0x30]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x38]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr d17, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr d16, [x16, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr d17, [x16, #0x60]\n"
+ "ldr x20, [x16, #0x68]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr d16, [x16, #0x70]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x78]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr d17, [x16, #0x80]\n"
+ "ldr x20, [x16, #0x88]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr d16, [x16, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr d17, [x16, #0xa0]\n"
+ "ldr x20, [x16, #0xa8]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr d16, [x16, #0xb0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr d17, [x16, #0xc0]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr d16, [x16, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr d17, [x16, #0xe0]\n"
+ "ldr x20, [x16, #0xe8]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr d16, [x16, #0xf0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "mov v16.d[1], x20\n"
+ "add x13, x13, #0x10\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- "ldr x10, [x16, #0x8]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d0, [x12, #0x0]\n"
- "mov v0.d[1], x9\n"
+ "ldr x20, [x16, #0x8]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ "sub x14, x14, #0x10\n"
+ "ldr d7, [x16, #0x10]\n"
+ "cmp x14, #0x20\n"
+ "ldr x21, [x13, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v0.d[1], x21\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"bge 17b\n"
"18:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- "ldr q6, [x16, #0x20]\n"
- "sub x13, x13, #0x10\n"
- "add x12, x12, #0x10\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x16, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x16, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x16, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x16, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x16, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x16, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x16, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x16, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x16, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x16, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x16, #0xf0]\n"
+ "add x13, x13, #0x10\n"
+ "sub x14, x14, #0x10\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
"19:" // Height 1: Multiply loop: Main loop skip
- "cbz x13, 24f\n"
- "cmp x13, #0x4\n"
+ "cbz x14, 24f\n"
+ "cmp x14, #0x4\n"
"blt 21f\n"
"20:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- "cmp x13, #0x4\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
+ "ldr s18, [x13], #0x4\n"
+ "sub x14, x14, #0x4\n"
+ "ldr q16, [x16, #0x0]\n"
+ ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n"
+ "ldr q17, [x16, #0x20]\n"
+ "cmp x14, #0x4\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
"bge 20b\n"
- "cbz x13, 24f\n"
"21:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x13, #1, 22f\n"
- "ldr h0, [x12], #0x2\n"
- "tbz x13, #0, 23f\n"
- "ld1 { v0.b }[2], [x12]\n"
+ "cbz x14, 24f\n"
+ "tbz x14, #1, 22f\n"
+ "ldr h0, [x13], #0x2\n"
+ "tbz x14, #0, 23f\n"
+ "ld1 { v0.b }[2], [x13]\n"
"b 23f\n"
"22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x12, #0x0]\n"
+ "ldr b0, [x13, #0x0]\n"
"23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
+ "ldr q16, [x16, #0x0]\n"
+ ".inst 0x4f80e208 // sdot v8.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x20]\n"
+ ".inst 0x4f80e20a // sdot v10.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
"24:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 14b\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "cmp x17, #0x10\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
"bge 33f\n"
- "tbz x17, #3, 28f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
- "tbz x17, #2, 26f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
- "tbz x17, #1, 25f\n"
- "str d11, [x15], #0x8\n"
- "tbz x17, #0, 32f\n"
- "st1 { v11.s }[2], [x15]\n"
+ "tbz x8, #3, 28f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
+ "tbz x8, #2, 26f\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
+ "tbz x8, #1, 25f\n"
+ "str d11, [x17], #0x8\n"
+ "tbz x8, #0, 32f\n"
+ "st1 { v11.s }[2], [x17]\n"
"b 32f\n"
"25:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x17, #0, 32f\n"
- "str s11, [x15, #0x0]\n"
+ "tbz x8, #0, 32f\n"
+ "str s11, [x17, #0x0]\n"
"b 32f\n"
"26:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x17, #1, 27f\n"
- "str d10, [x15], #0x8\n"
- "tbz x17, #0, 32f\n"
- "st1 { v10.s }[2], [x15]\n"
+ "tbz x8, #1, 27f\n"
+ "str d10, [x17], #0x8\n"
+ "tbz x8, #0, 32f\n"
+ "st1 { v10.s }[2], [x17]\n"
"b 32f\n"
"27:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x17, #0, 32f\n"
- "str s10, [x15, #0x0]\n"
+ "tbz x8, #0, 32f\n"
+ "str s10, [x17, #0x0]\n"
"b 32f\n"
"28:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x17, #2, 30f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "tbz x17, #1, 29f\n"
- "str d9, [x15], #0x8\n"
- "tbz x17, #0, 32f\n"
- "st1 { v9.s }[2], [x15]\n"
+ "tbz x8, #2, 30f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "tbz x8, #1, 29f\n"
+ "str d9, [x17], #0x8\n"
+ "tbz x8, #0, 32f\n"
+ "st1 { v9.s }[2], [x17]\n"
"b 32f\n"
"29:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x17, #0, 32f\n"
- "str s9, [x15, #0x0]\n"
+ "tbz x8, #0, 32f\n"
+ "str s9, [x17, #0x0]\n"
"b 32f\n"
"30:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x17, #1, 31f\n"
- "str d8, [x15], #0x8\n"
- "tbz x17, #0, 32f\n"
- "st1 { v8.s }[2], [x15]\n"
+ "tbz x8, #1, 31f\n"
+ "str d8, [x17], #0x8\n"
+ "tbz x8, #0, 32f\n"
+ "st1 { v8.s }[2], [x17]\n"
"b 32f\n"
"31:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"32:" // Height 1: Partial direct writeback: Done
"b 34f\n"
"33:" // Height 1: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"34:" // Height 1: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x8, x8, #0x10\n"
"bgt 2b\n"
"b 206f\n"
"35:" // Height 2
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[output_ptr]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x17, %x[output_ptr]\n"
"ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"36:" // Height 2: Column loop
"tbz %x[flags], #0, 46f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x8, #0x10\n"
+ "add x24, x17, x20, LSL #2\n"
"bge 45f\n"
- "tbz x17, #3, 40f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "tbz x17, #2, 38f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "tbz x17, #1, 37f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x15], #0x8\n"
- "ldr d15, [x23], #0x8\n"
- "tbz x17, #0, 44f\n"
- "ld1 { v11.s }[2], [x15]\n"
- "ld1 { v15.s }[2], [x23]\n"
+ "tbz x8, #3, 40f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "tbz x8, #2, 38f\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "tbz x8, #1, 37f\n"
+ "ldr d11, [x17], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "tbz x8, #0, 44f\n"
+ "ld1 { v11.s }[2], [x17]\n"
+ "ld1 { v15.s }[2], [x24]\n"
"b 44f\n"
"37:" // Height 2: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x17, #0, 44f\n"
- "ldr s11, [x15, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x8, #0, 44f\n"
+ "ldr s11, [x17, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
"b 44f\n"
"38:" // Height 2: Partial accumulate: partial_2_8
- "tbz x17, #1, 39f\n"
- "ldr d10, [x15], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "tbz x17, #0, 44f\n"
- "ld1 { v10.s }[2], [x15]\n"
- "ld1 { v14.s }[2], [x23]\n"
+ "tbz x8, #1, 39f\n"
+ "ldr d10, [x17], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d14, [x24], #0x8\n"
+ "tbz x8, #0, 44f\n"
+ "ld1 { v10.s }[2], [x17]\n"
+ "ld1 { v14.s }[2], [x24]\n"
"b 44f\n"
"39:" // Height 2: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x17, #0, 44f\n"
- "ldr s10, [x15, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x8, #0, 44f\n"
+ "ldr s10, [x17, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
"b 44f\n"
"40:" // Height 2: Partial accumulate: partial_4_0
- "tbz x17, #2, 42f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "tbz x17, #1, 41f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x15], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "tbz x17, #0, 44f\n"
- "ld1 { v9.s }[2], [x15]\n"
- "ld1 { v13.s }[2], [x23]\n"
+ "tbz x8, #2, 42f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "tbz x8, #1, 41f\n"
+ "ldr d9, [x17], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "tbz x8, #0, 44f\n"
+ "ld1 { v9.s }[2], [x17]\n"
+ "ld1 { v13.s }[2], [x24]\n"
"b 44f\n"
"41:" // Height 2: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x17, #0, 44f\n"
- "ldr s9, [x15, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x8, #0, 44f\n"
+ "ldr s9, [x17, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
"b 44f\n"
"42:" // Height 2: Partial accumulate: partial_2_0
- "tbz x17, #1, 43f\n"
- "ldr d8, [x15], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "tbz x17, #0, 44f\n"
- "ld1 { v8.s }[2], [x15]\n"
- "ld1 { v12.s }[2], [x23]\n"
+ "tbz x8, #1, 43f\n"
+ "ldr d8, [x17], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "tbz x8, #0, 44f\n"
+ "ld1 { v8.s }[2], [x17]\n"
+ "ld1 { v12.s }[2], [x24]\n"
"b 44f\n"
"43:" // Height 2: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
"44:" // Height 2: Partial accumulate: Done
- "sub x15, x15, x24\n"
+ "sub x17, x17, x25\n"
"b 47f\n"
"45:" // Height 2: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
"b 47f\n"
"46:" // Height 2: no accumulate
"movi v8.4s, #0x0\n"
@@ -495,428 +494,428 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"movi v14.4s, #0x0\n"
"movi v15.4s, #0x0\n"
"47:" // Height 2: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x14, 50f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "cbnz x15, 50f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
"b 50f\n"
"49:" // Height 2: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21\n"
"50:" // Height 2: input setup done
- "cmp x13, #0x10\n"
+ "cmp x14, #0x10\n"
"blt 53f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x20\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x20\n"
+ "ldr q1, [x12, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 52f\n"
"51:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x10]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x18]\n"
- "ldr d6, [x16, #0x20]\n"
- "add x12, x12, #0x10\n"
- "ldr x10, [x16, #0x28]\n"
- "add x28, x28, #0x10\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "sub x13, x13, #0x10\n"
+ "ldr d17, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- "ldr x10, [x16, #0x48]\n"
- "cmp x13, #0x20\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x58]\n"
- "ldr x9, [x12, #0x8]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x16, #0x68]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x78]\n"
- "ldr x27, [x28, #0x8]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x16, #0x88]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x16, #0xa8]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x16, #0xc8]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr x10, [x16, #0xe8]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr d16, [x16, #0x30]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr d17, [x16, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr x20, [x16, #0x48]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr d16, [x16, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr d17, [x16, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr d16, [x16, #0x70]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr d17, [x16, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr x20, [x16, #0x88]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr d16, [x16, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr d17, [x16, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr d16, [x16, #0xb0]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr d17, [x16, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr d16, [x16, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr d17, [x16, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr d16, [x16, #0xf0]\n"
+ "mov v17.d[1], x21\n"
+ "add x13, x13, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- "mov v7.d[1], x11\n"
- "ldr x10, [x16, #0x8]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr d0, [x12, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d1, [x28, #0x0]\n"
- "mov v0.d[1], x9\n"
- "mov v1.d[1], x27\n"
+ "ldr x21, [x16, #0x8]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "sub x14, x14, #0x10\n"
+ "ldr d7, [x16, #0x10]\n"
+ "cmp x14, #0x20\n"
+ "ldr x20, [x13, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v0.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v1.d[1], x21\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "add x13, x13, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- "sub x13, x13, #0x10\n"
- "add x12, x12, #0x10\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "sub x14, x14, #0x10\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x16, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x16, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x16, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x16, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x16, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x16, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x16, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x16, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x16, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x16, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x16, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x16, #0xf0]\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
"53:" // Height 2: Multiply loop: Main loop skip
- "cbz x13, 58f\n"
- "cmp x13, #0x4\n"
+ "cbz x14, 58f\n"
+ "cmp x14, #0x4\n"
"blt 55f\n"
"54:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "cmp x13, #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s19, [x13], #0x4\n"
+ "sub x14, x14, #0x4\n"
+ "ldr s18, [x12], #0x4\n"
+ "cmp x14, #0x4\n"
+ "ldr q17, [x16, #0x0]\n"
+ ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n"
+ "ldr q17, [x16, #0x20]\n"
+ ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n"
"bge 54b\n"
- "cbz x13, 58f\n"
"55:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x13, #1, 56f\n"
- "ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "tbz x13, #0, 57f\n"
- "ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
+ "cbz x14, 58f\n"
+ "tbz x14, #1, 56f\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "tbz x14, #0, 57f\n"
+ "ld1 { v0.b }[2], [x13]\n"
+ "ld1 { v1.b }[2], [x12]\n"
"b 57f\n"
"56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
+ "ldr b0, [x13, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
"57:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q17, [x16, #0x0]\n"
+ ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x16, #0x20]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
"58:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 48b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"bge 67f\n"
- "tbz x17, #3, 62f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "tbz x17, #2, 60f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "tbz x17, #1, 59f\n"
- "str d11, [x15], #0x8\n"
- "str d15, [x23], #0x8\n"
- "tbz x17, #0, 66f\n"
- "st1 { v11.s }[2], [x15]\n"
- "st1 { v15.s }[2], [x23]\n"
+ "tbz x8, #3, 62f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "tbz x8, #2, 60f\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "tbz x8, #1, 59f\n"
+ "str d11, [x17], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "tbz x8, #0, 66f\n"
+ "st1 { v11.s }[2], [x17]\n"
+ "st1 { v15.s }[2], [x24]\n"
"b 66f\n"
"59:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x17, #0, 66f\n"
- "str s11, [x15, #0x0]\n"
- "str s15, [x23, #0x0]\n"
+ "tbz x8, #0, 66f\n"
+ "str s11, [x17, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
"b 66f\n"
"60:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x17, #1, 61f\n"
- "str d10, [x15], #0x8\n"
- "str d14, [x23], #0x8\n"
- "tbz x17, #0, 66f\n"
- "st1 { v10.s }[2], [x15]\n"
- "st1 { v14.s }[2], [x23]\n"
+ "tbz x8, #1, 61f\n"
+ "str d10, [x17], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "tbz x8, #0, 66f\n"
+ "st1 { v10.s }[2], [x17]\n"
+ "st1 { v14.s }[2], [x24]\n"
"b 66f\n"
"61:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x17, #0, 66f\n"
- "str s10, [x15, #0x0]\n"
- "str s14, [x23, #0x0]\n"
+ "tbz x8, #0, 66f\n"
+ "str s10, [x17, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
"b 66f\n"
"62:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x17, #2, 64f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "tbz x17, #1, 63f\n"
- "str d9, [x15], #0x8\n"
- "str d13, [x23], #0x8\n"
- "tbz x17, #0, 66f\n"
- "st1 { v9.s }[2], [x15]\n"
- "st1 { v13.s }[2], [x23]\n"
+ "tbz x8, #2, 64f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "tbz x8, #1, 63f\n"
+ "str d9, [x17], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "tbz x8, #0, 66f\n"
+ "st1 { v9.s }[2], [x17]\n"
+ "st1 { v13.s }[2], [x24]\n"
"b 66f\n"
"63:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x17, #0, 66f\n"
- "str s9, [x15, #0x0]\n"
- "str s13, [x23, #0x0]\n"
+ "tbz x8, #0, 66f\n"
+ "str s9, [x17, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
"b 66f\n"
"64:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x17, #1, 65f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x23], #0x8\n"
- "tbz x17, #0, 66f\n"
- "st1 { v8.s }[2], [x15]\n"
- "st1 { v12.s }[2], [x23]\n"
+ "tbz x8, #1, 65f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "tbz x8, #0, 66f\n"
+ "st1 { v8.s }[2], [x17]\n"
+ "st1 { v12.s }[2], [x24]\n"
"b 66f\n"
"65:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
- "str s12, [x23, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
"66:" // Height 2: Partial direct writeback: Done
"b 68f\n"
"67:" // Height 2: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
"68:" // Height 2: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x8, x8, #0x10\n"
"bgt 36b\n"
"b 206f\n"
"69:" // Height 3
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[output_ptr]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x17, %x[output_ptr]\n"
"ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"70:" // Height 3: Column loop
"tbz %x[flags], #0, 80f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 79f\n"
- "tbz x17, #3, 74f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "tbz x17, #2, 72f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "tbz x17, #1, 71f\n"
- "ldr d11, [x15], #0x8\n"
- "mov x24, #0x38\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "tbz x17, #0, 78f\n"
- "ld1 { v11.s }[2], [x15]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
+ "tbz x8, #3, 74f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x8, #2, 72f\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x8, #1, 71f\n"
+ "ldr d11, [x17], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x8, #0, 78f\n"
+ "ld1 { v11.s }[2], [x17]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
"b 78f\n"
"71:" // Height 3: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x17, #0, 78f\n"
- "ldr s11, [x15, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x8, #0, 78f\n"
+ "ldr s11, [x17, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
"b 78f\n"
"72:" // Height 3: Partial accumulate: partial_2_8
- "tbz x17, #1, 73f\n"
- "ldr d10, [x15], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "tbz x17, #0, 78f\n"
- "ld1 { v10.s }[2], [x15]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
+ "tbz x8, #1, 73f\n"
+ "ldr d10, [x17], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d14, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "tbz x8, #0, 78f\n"
+ "ld1 { v10.s }[2], [x17]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
"b 78f\n"
"73:" // Height 3: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x17, #0, 78f\n"
- "ldr s10, [x15, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x8, #0, 78f\n"
+ "ldr s10, [x17, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
"b 78f\n"
"74:" // Height 3: Partial accumulate: partial_4_0
- "tbz x17, #2, 76f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "tbz x17, #1, 75f\n"
- "ldr d9, [x15], #0x8\n"
- "mov x24, #0x18\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "tbz x17, #0, 78f\n"
- "ld1 { v9.s }[2], [x15]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
+ "tbz x8, #2, 76f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "tbz x8, #1, 75f\n"
+ "ldr d9, [x17], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "tbz x8, #0, 78f\n"
+ "ld1 { v9.s }[2], [x17]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
"b 78f\n"
"75:" // Height 3: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x17, #0, 78f\n"
- "ldr s9, [x15, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x8, #0, 78f\n"
+ "ldr s9, [x17, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
"b 78f\n"
"76:" // Height 3: Partial accumulate: partial_2_0
- "tbz x17, #1, 77f\n"
- "ldr d8, [x15], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "tbz x17, #0, 78f\n"
- "ld1 { v8.s }[2], [x15]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
+ "tbz x8, #1, 77f\n"
+ "ldr d8, [x17], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "tbz x8, #0, 78f\n"
+ "ld1 { v8.s }[2], [x17]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
"b 78f\n"
"77:" // Height 3: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
"78:" // Height 3: Partial accumulate: Done
- "sub x15, x15, x24\n"
+ "sub x17, x17, x25\n"
"b 81f\n"
"79:" // Height 3: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
"b 81f\n"
"80:" // Height 3: no accumulate
"movi v8.4s, #0x0\n"
@@ -932,526 +931,526 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
"81:" // Height 3: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 83f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x14, 84f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "cbnz x15, 84f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
"b 84f\n"
"83:" // Height 3: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
- "add x26, x28, x19\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
"84:" // Height 3: input setup done
- "cmp x13, #0x10\n"
+ "cmp x14, #0x10\n"
"blt 87f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x20\n"
- "ldr q2, [x26, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x20\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 86f\n"
"85:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x10]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x18]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
- "ldr x10, [x16, #0x28]\n"
- "add x12, x12, #0x10\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "ldr x11, [x16, #0x38]\n"
- "add x28, x28, #0x10\n"
+ "ldr d21, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "mov v21.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr x10, [x16, #0x48]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x9, [x12, #0x8]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x16, #0x40]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x50]\n"
- "sub x13, x13, #0x10\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x16, #0x68]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x27, [x28, #0x8]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x16, #0x60]\n"
- "cmp x13, #0x20\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x25, [x26, #0x8]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x16, #0x88]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x16, #0xa8]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x16, #0xc8]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr x10, [x16, #0xe8]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0xf0]\n"
+ "ldr d20, [x16, #0x30]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr d21, [x16, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr d20, [x16, #0x50]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr d21, [x16, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0x88]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr d20, [x16, #0x70]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr d21, [x16, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr d20, [x16, #0x90]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr d21, [x16, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr d20, [x16, #0xb0]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr d21, [x16, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr d20, [x16, #0xd0]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr d21, [x16, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ "add x13, x13, #0x10\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr d20, [x16, #0xf0]\n"
+ "mov v20.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr x10, [x16, #0x8]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
+ "ldr x20, [x16, #0x8]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ "ldr x23, [x13, #0x8]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr d0, [x12, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x28, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- "mov v0.d[1], x9\n"
- "ldr d2, [x26, #0x0]\n"
- "mov v1.d[1], x27\n"
- "mov v2.d[1], x25\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "ldr x22, [x12, #0x8]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ "sub x14, x14, #0x10\n"
+ "ldr d7, [x16, #0x10]\n"
+ "cmp x14, #0x20\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v0.d[1], x23\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v7.d[1], x20\n"
"bge 85b\n"
"86:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "add x13, x13, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x13, x13, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q21, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "sub x14, x14, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q20, [x16, #0x30]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x16, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x16, #0x50]\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x16, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x16, #0x70]\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x16, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x16, #0x90]\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x16, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x16, #0xb0]\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x16, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x16, #0xd0]\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x16, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x16, #0xf0]\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
"87:" // Height 3: Multiply loop: Main loop skip
- "cbz x13, 92f\n"
- "cmp x13, #0x4\n"
+ "cbz x14, 92f\n"
+ "cmp x14, #0x4\n"
"blt 89f\n"
"88:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "cmp x13, #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s24, [x13], #0x4\n"
+ "sub x14, x14, #0x4\n"
+ "ldr s23, [x12], #0x4\n"
+ "cmp x14, #0x4\n"
+ "ldr s22, [x11], #0x4\n"
+ "ldr q21, [x16, #0x0]\n"
+ ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n"
+ "ldr q20, [x16, #0x10]\n"
+ ".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n"
+ "ldr q21, [x16, #0x20]\n"
+ ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n"
+ "ldr q20, [x16, #0x30]\n"
+ ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n"
+ ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n"
"bge 88b\n"
- "cbz x13, 92f\n"
"89:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x13, #1, 90f\n"
- "ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "tbz x13, #0, 91f\n"
- "ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
+ "cbz x14, 92f\n"
+ "tbz x14, #1, 90f\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "tbz x14, #0, 91f\n"
+ "ld1 { v0.b }[2], [x13]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
"b 91f\n"
"90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
+ "ldr b0, [x13, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
"91:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q21, [x16, #0x0]\n"
+ ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n"
+ "ldr q20, [x16, #0x10]\n"
+ ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x16, #0x20]\n"
+ ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x16, #0x30]\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
"92:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 82b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"bge 101f\n"
- "tbz x17, #3, 96f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "tbz x17, #2, 94f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "tbz x17, #1, 93f\n"
- "str d11, [x15], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "tbz x17, #0, 100f\n"
- "st1 { v11.s }[2], [x15]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
+ "tbz x8, #3, 96f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x8, #2, 94f\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x8, #1, 93f\n"
+ "str d11, [x17], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x8, #0, 100f\n"
+ "st1 { v11.s }[2], [x17]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
"b 100f\n"
"93:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x17, #0, 100f\n"
- "str s11, [x15, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
+ "tbz x8, #0, 100f\n"
+ "str s11, [x17, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
"b 100f\n"
"94:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x17, #1, 95f\n"
- "str d10, [x15], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "tbz x17, #0, 100f\n"
- "st1 { v10.s }[2], [x15]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
+ "tbz x8, #1, 95f\n"
+ "str d10, [x17], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x8, #0, 100f\n"
+ "st1 { v10.s }[2], [x17]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
"b 100f\n"
"95:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x17, #0, 100f\n"
- "str s10, [x15, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
+ "tbz x8, #0, 100f\n"
+ "str s10, [x17, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
"b 100f\n"
"96:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x17, #2, 98f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "tbz x17, #1, 97f\n"
- "str d9, [x15], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "tbz x17, #0, 100f\n"
- "st1 { v9.s }[2], [x15]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
+ "tbz x8, #2, 98f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x8, #1, 97f\n"
+ "str d9, [x17], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x8, #0, 100f\n"
+ "st1 { v9.s }[2], [x17]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
"b 100f\n"
"97:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x17, #0, 100f\n"
- "str s9, [x15, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
+ "tbz x8, #0, 100f\n"
+ "str s9, [x17, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
"b 100f\n"
"98:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x17, #1, 99f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "tbz x17, #0, 100f\n"
- "st1 { v8.s }[2], [x15]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
+ "tbz x8, #1, 99f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x8, #0, 100f\n"
+ "st1 { v8.s }[2], [x17]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
"b 100f\n"
"99:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
"100:" // Height 3: Partial direct writeback: Done
"b 102f\n"
"101:" // Height 3: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
"102:" // Height 3: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x8, x8, #0x10\n"
"bgt 70b\n"
"b 206f\n"
"103:" // Height 4
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[output_ptr]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x17, %x[output_ptr]\n"
"ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"104:" // Height 4: Column loop
"tbz %x[flags], #0, 114f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 113f\n"
- "tbz x17, #3, 108f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v21.4s }, [x21], #0x10\n"
- "tbz x17, #2, 106f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "ld1 { v22.4s }, [x21], #0x10\n"
- "tbz x17, #1, 105f\n"
- "ldr d11, [x15], #0x8\n"
- "mov x24, #0x38\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "tbz x17, #0, 112f\n"
- "ld1 { v11.s }[2], [x15]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
- "ld1 { v23.s }[2], [x21]\n"
+ "tbz x8, #3, 108f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x8, #2, 106f\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x8, #1, 105f\n"
+ "ldr d11, [x17], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x8, #0, 112f\n"
+ "ld1 { v11.s }[2], [x17]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
"b 112f\n"
"105:" // Height 4: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x17, #0, 112f\n"
- "ldr s11, [x15, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
- "ldr s23, [x21, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x8, #0, 112f\n"
+ "ldr s11, [x17, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
"b 112f\n"
"106:" // Height 4: Partial accumulate: partial_2_8
- "tbz x17, #1, 107f\n"
- "ldr d10, [x15], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
- "tbz x17, #0, 112f\n"
- "ld1 { v10.s }[2], [x15]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
- "ld1 { v22.s }[2], [x21]\n"
+ "tbz x8, #1, 107f\n"
+ "ldr d10, [x17], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d14, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x8, #0, 112f\n"
+ "ld1 { v10.s }[2], [x17]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
"b 112f\n"
"107:" // Height 4: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x17, #0, 112f\n"
- "ldr s10, [x15, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
- "ldr s22, [x21, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x8, #0, 112f\n"
+ "ldr s10, [x17, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
"b 112f\n"
"108:" // Height 4: Partial accumulate: partial_4_0
- "tbz x17, #2, 110f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "tbz x17, #1, 109f\n"
- "ldr d9, [x15], #0x8\n"
- "mov x24, #0x18\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d21, [x21], #0x8\n"
- "tbz x17, #0, 112f\n"
- "ld1 { v9.s }[2], [x15]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
- "ld1 { v21.s }[2], [x21]\n"
+ "tbz x8, #2, 110f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "tbz x8, #1, 109f\n"
+ "ldr d9, [x17], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x8, #0, 112f\n"
+ "ld1 { v9.s }[2], [x17]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
"b 112f\n"
"109:" // Height 4: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x17, #0, 112f\n"
- "ldr s9, [x15, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
- "ldr s21, [x21, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x8, #0, 112f\n"
+ "ldr s9, [x17, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
"b 112f\n"
"110:" // Height 4: Partial accumulate: partial_2_0
- "tbz x17, #1, 111f\n"
- "ldr d8, [x15], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "tbz x17, #0, 112f\n"
- "ld1 { v8.s }[2], [x15]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
+ "tbz x8, #1, 111f\n"
+ "ldr d8, [x17], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x8, #0, 112f\n"
+ "ld1 { v8.s }[2], [x17]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
"b 112f\n"
"111:" // Height 4: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
"112:" // Height 4: Partial accumulate: Done
- "sub x15, x15, x24\n"
+ "sub x17, x17, x25\n"
"b 115f\n"
"113:" // Height 4: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
- "ldr q20, [x21, #0x0]\n"
- "ldr q21, [x21, #0x10]\n"
- "ldr q22, [x21, #0x20]\n"
- "ldr q23, [x21, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
"b 115f\n"
"114:" // Height 4: no accumulate
"movi v8.4s, #0x0\n"
@@ -1471,624 +1470,624 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
"115:" // Height 4: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 117f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x14, 118f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "cbnz x15, 118f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"b 118f\n"
"117:" // Height 4: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
"118:" // Height 4: input setup done
- "cmp x13, #0x10\n"
+ "cmp x14, #0x10\n"
"blt 121f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x20\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x20\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 120f\n"
"119:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x10]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x18]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr x10, [x16, #0x28]\n"
+ "add x13, x13, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x38]\n"
- "add x12, x12, #0x10\n"
- "add x28, x28, #0x10\n"
+ "ldr d25, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "mov v25.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "add x12, x12, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr x10, [x16, #0x48]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x9, [x12, #0x8]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x27, [x28, #0x8]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x16, #0x68]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr x25, [x26, #0x8]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x70]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x10, [x16, #0x88]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x23, [x24, #0x8]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x16, #0xa8]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "sub x13, x13, #0x10\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "cmp x13, #0x20\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x16, #0xc8]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr x10, [x16, #0xe8]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr d24, [x16, #0x30]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr d25, [x16, #0x40]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr d24, [x16, #0x50]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ "ldr x25, [x13, #0x8]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr d25, [x16, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0x88]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ "ldr x24, [x12, #0x8]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr d24, [x16, #0x70]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ "ldr x23, [x11, #0x8]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr d25, [x16, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ "ldr x22, [x10, #0x8]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr d24, [x16, #0x90]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ "sub x14, x14, #0x10\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr d25, [x16, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
+ "cmp x14, #0x20\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr d24, [x16, #0xb0]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr d25, [x16, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr d24, [x16, #0xd0]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr d25, [x16, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr d24, [x16, #0xf0]\n"
+ "mov v24.d[1], x20\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x10, [x16, #0x8]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0x18]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr d0, [x12, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x28, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "mov v0.d[1], x9\n"
- "mov v1.d[1], x27\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "mov v2.d[1], x25\n"
- "mov v3.d[1], x23\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
+ "ldr d3, [x10, #0x0]\n"
+ "ldr d7, [x16, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x25\n"
+ "mov v1.d[1], x24\n"
+ "mov v2.d[1], x23\n"
+ "mov v3.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 119b\n"
"120:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "add x13, x13, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x13, x13, #0x10\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x12, x12, #0x10\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr q25, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "sub x14, x14, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q24, [x16, #0x30]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x16, #0x40]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x16, #0x50]\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x16, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x16, #0x70]\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x16, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x16, #0x90]\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x16, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x16, #0xb0]\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x16, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x16, #0xd0]\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x16, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x16, #0xf0]\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
"121:" // Height 4: Multiply loop: Main loop skip
- "cbz x13, 126f\n"
- "cmp x13, #0x4\n"
+ "cbz x14, 126f\n"
+ "cmp x14, #0x4\n"
"blt 123f\n"
"122:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "cmp x13, #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s29, [x13], #0x4\n"
+ "sub x14, x14, #0x4\n"
+ "ldr s28, [x12], #0x4\n"
+ "cmp x14, #0x4\n"
+ "ldr s27, [x11], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr q25, [x16, #0x0]\n"
+ ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n"
+ "ldr q24, [x16, #0x10]\n"
+ ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n"
+ "ldr q25, [x16, #0x20]\n"
+ ".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n"
+ "ldr q24, [x16, #0x30]\n"
+ ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n"
+ ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n"
"bge 122b\n"
- "cbz x13, 126f\n"
"123:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x13, #1, 124f\n"
- "ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "tbz x13, #0, 125f\n"
- "ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
+ "cbz x14, 126f\n"
+ "tbz x14, #1, 124f\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "tbz x14, #0, 125f\n"
+ "ld1 { v0.b }[2], [x13]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
+ "ld1 { v3.b }[2], [x10]\n"
"b 125f\n"
"124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
+ "ldr b0, [x13, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
+ "ldr b3, [x10, #0x0]\n"
"125:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q25, [x16, #0x0]\n"
+ ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n"
+ "ldr q24, [x16, #0x10]\n"
+ ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x16, #0x20]\n"
+ ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x16, #0x30]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
"126:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 116b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"bge 135f\n"
- "tbz x17, #3, 130f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v21.4s }, [x21], #0x10\n"
- "tbz x17, #2, 128f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x21], #0x10\n"
- "tbz x17, #1, 127f\n"
- "str d11, [x15], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "tbz x17, #0, 134f\n"
- "st1 { v11.s }[2], [x15]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
+ "tbz x8, #3, 130f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "tbz x8, #2, 128f\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "tbz x8, #1, 127f\n"
+ "str d11, [x17], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "tbz x8, #0, 134f\n"
+ "st1 { v11.s }[2], [x17]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
"b 134f\n"
"127:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x17, #0, 134f\n"
- "str s11, [x15, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
+ "tbz x8, #0, 134f\n"
+ "str s11, [x17, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
"b 134f\n"
"128:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x17, #1, 129f\n"
- "str d10, [x15], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d22, [x21], #0x8\n"
- "tbz x17, #0, 134f\n"
- "st1 { v10.s }[2], [x15]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v22.s }[2], [x21]\n"
+ "tbz x8, #1, 129f\n"
+ "str d10, [x17], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz x8, #0, 134f\n"
+ "st1 { v10.s }[2], [x17]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
"b 134f\n"
"129:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x17, #0, 134f\n"
- "str s10, [x15, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s22, [x21, #0x0]\n"
+ "tbz x8, #0, 134f\n"
+ "str s10, [x17, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
"b 134f\n"
"130:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x17, #2, 132f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "tbz x17, #1, 131f\n"
- "str d9, [x15], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d21, [x21], #0x8\n"
- "tbz x17, #0, 134f\n"
- "st1 { v9.s }[2], [x15]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v21.s }[2], [x21]\n"
+ "tbz x8, #2, 132f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "tbz x8, #1, 131f\n"
+ "str d9, [x17], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz x8, #0, 134f\n"
+ "st1 { v9.s }[2], [x17]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
"b 134f\n"
"131:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x17, #0, 134f\n"
- "str s9, [x15, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s21, [x21, #0x0]\n"
+ "tbz x8, #0, 134f\n"
+ "str s9, [x17, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
"b 134f\n"
"132:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x17, #1, 133f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "tbz x17, #0, 134f\n"
- "st1 { v8.s }[2], [x15]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v20.s }[2], [x21]\n"
+ "tbz x8, #1, 133f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz x8, #0, 134f\n"
+ "st1 { v8.s }[2], [x17]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
"b 134f\n"
"133:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s20, [x21, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
"134:" // Height 4: Partial direct writeback: Done
"b 136f\n"
"135:" // Height 4: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q20, [x21, #0x0]\n"
- "str q21, [x21, #0x10]\n"
- "str q22, [x21, #0x20]\n"
- "str q23, [x21, #0x30]\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
"136:" // Height 4: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x8, x8, #0x10\n"
"bgt 104b\n"
"b 206f\n"
"137:" // Height 5
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[output_ptr]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x17, %x[output_ptr]\n"
"ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"138:" // Height 5: Column loop
"tbz %x[flags], #0, 148f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 147f\n"
- "tbz x17, #3, 142f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v21.4s }, [x21], #0x10\n"
- "ld1 { v25.4s }, [x20], #0x10\n"
- "tbz x17, #2, 140f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "ld1 { v22.4s }, [x21], #0x10\n"
- "ld1 { v26.4s }, [x20], #0x10\n"
- "tbz x17, #1, 139f\n"
- "ldr d11, [x15], #0x8\n"
- "mov x24, #0x38\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "ldr d27, [x20], #0x8\n"
- "tbz x17, #0, 146f\n"
- "ld1 { v11.s }[2], [x15]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
- "ld1 { v23.s }[2], [x21]\n"
- "ld1 { v27.s }[2], [x20]\n"
+ "tbz x8, #3, 142f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "tbz x8, #2, 140f\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x8, #1, 139f\n"
+ "ldr d11, [x17], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x8, #0, 146f\n"
+ "ld1 { v11.s }[2], [x17]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
"b 146f\n"
"139:" // Height 5: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x17, #0, 146f\n"
- "ldr s11, [x15, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
- "ldr s23, [x21, #0x0]\n"
- "ldr s27, [x20, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x8, #0, 146f\n"
+ "ldr s11, [x17, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
"b 146f\n"
"140:" // Height 5: Partial accumulate: partial_2_8
- "tbz x17, #1, 141f\n"
- "ldr d10, [x15], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
- "ldr d26, [x20], #0x8\n"
- "tbz x17, #0, 146f\n"
- "ld1 { v10.s }[2], [x15]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
- "ld1 { v22.s }[2], [x21]\n"
- "ld1 { v26.s }[2], [x20]\n"
+ "tbz x8, #1, 141f\n"
+ "ldr d10, [x17], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d14, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x8, #0, 146f\n"
+ "ld1 { v10.s }[2], [x17]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
"b 146f\n"
"141:" // Height 5: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x17, #0, 146f\n"
- "ldr s10, [x15, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
- "ldr s22, [x21, #0x0]\n"
- "ldr s26, [x20, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x8, #0, 146f\n"
+ "ldr s10, [x17, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
"b 146f\n"
"142:" // Height 5: Partial accumulate: partial_4_0
- "tbz x17, #2, 144f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "tbz x17, #1, 143f\n"
- "ldr d9, [x15], #0x8\n"
- "mov x24, #0x18\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d21, [x21], #0x8\n"
- "ldr d25, [x20], #0x8\n"
- "tbz x17, #0, 146f\n"
- "ld1 { v9.s }[2], [x15]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
- "ld1 { v21.s }[2], [x21]\n"
- "ld1 { v25.s }[2], [x20]\n"
+ "tbz x8, #2, 144f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "tbz x8, #1, 143f\n"
+ "ldr d9, [x17], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "tbz x8, #0, 146f\n"
+ "ld1 { v9.s }[2], [x17]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
"b 146f\n"
"143:" // Height 5: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x17, #0, 146f\n"
- "ldr s9, [x15, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
- "ldr s21, [x21, #0x0]\n"
- "ldr s25, [x20, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x8, #0, 146f\n"
+ "ldr s9, [x17, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
"b 146f\n"
"144:" // Height 5: Partial accumulate: partial_2_0
- "tbz x17, #1, 145f\n"
- "ldr d8, [x15], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "ldr d24, [x20], #0x8\n"
- "tbz x17, #0, 146f\n"
- "ld1 { v8.s }[2], [x15]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
- "ld1 { v24.s }[2], [x20]\n"
+ "tbz x8, #1, 145f\n"
+ "ldr d8, [x17], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "tbz x8, #0, 146f\n"
+ "ld1 { v8.s }[2], [x17]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
"b 146f\n"
"145:" // Height 5: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
- "ldr s24, [x20, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
"146:" // Height 5: Partial accumulate: Done
- "sub x15, x15, x24\n"
+ "sub x17, x17, x25\n"
"b 149f\n"
"147:" // Height 5: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
- "ldr q20, [x21, #0x0]\n"
- "ldr q21, [x21, #0x10]\n"
- "ldr q22, [x21, #0x20]\n"
- "ldr q23, [x21, #0x30]\n"
- "ldr q24, [x20, #0x0]\n"
- "ldr q25, [x20, #0x10]\n"
- "ldr q26, [x20, #0x20]\n"
- "ldr q27, [x20, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
"b 149f\n"
"148:" // Height 5: no accumulate
"movi v8.4s, #0x0\n"
@@ -2112,725 +2111,725 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
"149:" // Height 5: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 151f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x14, 152f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
+ "cbnz x15, 152f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "add x9, x9, x20\n"
"b 152f\n"
"151:" // Height 5: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
"152:" // Height 5: input setup done
- "cmp x13, #0x10\n"
+ "cmp x14, #0x10\n"
"blt 155f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x20\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x20\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 154f\n"
"153:" // Height 5: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x10]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x18]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr x10, [x16, #0x28]\n"
+ "add x13, x13, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "ldr d29, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "mov v29.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr x9, [x12, #0x8]\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x30]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x10, [x16, #0x48]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr x27, [x28, #0x8]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x25, [x26, #0x8]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x10, [x16, #0x68]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr x23, [x24, #0x8]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x16, #0x88]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x21, [x22, #0x8]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "sub x13, x13, #0x10\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "cmp x13, #0x20\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr x10, [x16, #0xa8]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr x10, [x16, #0xc8]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr x10, [x16, #0xe8]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr d28, [x16, #0x30]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr d29, [x16, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ "ldr x25, [x12, #0x8]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr d28, [x16, #0x50]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ "ldr x22, [x9, #0x8]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr d29, [x16, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0x88]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
+ "sub x14, x14, #0x10\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
+ "cmp x14, #0x20\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr d28, [x16, #0x70]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr d29, [x16, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr d28, [x16, #0x90]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr d29, [x16, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr d28, [x16, #0xb0]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr d29, [x16, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr d28, [x16, #0xd0]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr d29, [x16, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr d28, [x16, #0xf0]\n"
+ "mov v28.d[1], x20\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x10, [x16, #0x8]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0x18]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr d0, [x12, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x28, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "mov v0.d[1], x9\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "mov v1.d[1], x27\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "ldr d4, [x22, #0x0]\n"
- "mov v2.d[1], x25\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ "ldr d3, [x10, #0x0]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
+ "ldr d4, [x9, #0x0]\n"
+ "ldr d7, [x16, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x26\n"
+ "mov v1.d[1], x25\n"
+ "mov v2.d[1], x24\n"
"mov v3.d[1], x23\n"
- "mov v4.d[1], x21\n"
+ "mov v4.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 153b\n"
"154:" // Height 5: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "add x13, x13, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x13, x13, #0x10\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x12, x12, #0x10\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q29, [x16, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "sub x14, x14, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q28, [x16, #0x30]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x16, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x16, #0x50]\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x16, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x16, #0x70]\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x16, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x16, #0x90]\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x16, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x16, #0xb0]\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x16, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x16, #0xd0]\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x16, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x16, #0xf0]\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
"155:" // Height 5: Multiply loop: Main loop skip
- "cbz x13, 160f\n"
- "cmp x13, #0x4\n"
+ "cbz x14, 160f\n"
+ "cmp x14, #0x4\n"
"blt 157f\n"
"156:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "cmp x13, #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s2, [x13], #0x4\n"
+ "sub x14, x14, #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "cmp x14, #0x4\n"
+ "ldr s0, [x11], #0x4\n"
+ "ldr s31, [x10], #0x4\n"
+ "ldr s30, [x9], #0x4\n"
+ "ldr q29, [x16, #0x0]\n"
+ ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n"
+ "ldr q28, [x16, #0x10]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n"
+ "ldr q29, [x16, #0x20]\n"
+ ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n"
+ "ldr q28, [x16, #0x30]\n"
+ ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n"
+ ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n"
"bge 156b\n"
- "cbz x13, 160f\n"
"157:" // Height 5: Multiply loop: Skip odd blocks
- "tbz x13, #1, 158f\n"
- "ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "tbz x13, #0, 159f\n"
- "ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x22]\n"
+ "cbz x14, 160f\n"
+ "tbz x14, #1, 158f\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "ldr h4, [x9], #0x2\n"
+ "tbz x14, #0, 159f\n"
+ "ld1 { v0.b }[2], [x13]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
+ "ld1 { v3.b }[2], [x10]\n"
+ "ld1 { v4.b }[2], [x9]\n"
"b 159f\n"
"158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
+ "ldr b0, [x13, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
+ "ldr b3, [x10, #0x0]\n"
+ "ldr b4, [x9, #0x0]\n"
"159:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q29, [x16, #0x0]\n"
+ ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n"
+ "ldr q28, [x16, #0x10]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x16, #0x20]\n"
+ ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x16, #0x30]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
"160:" // Height 5: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 150b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19, LSL #2\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"bge 169f\n"
- "tbz x17, #3, 164f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v21.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "st1 { v25.4s }, [x20], #0x10\n"
- "tbz x17, #2, 162f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x21], #0x10\n"
- "st1 { v26.4s }, [x20], #0x10\n"
- "tbz x17, #1, 161f\n"
- "str d11, [x15], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "str d27, [x20], #0x8\n"
- "tbz x17, #0, 168f\n"
- "st1 { v11.s }[2], [x15]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
- "st1 { v27.s }[2], [x20]\n"
+ "tbz x8, #3, 164f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "tbz x8, #2, 162f\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x8, #1, 161f\n"
+ "str d11, [x17], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x8, #0, 168f\n"
+ "st1 { v11.s }[2], [x17]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
"b 168f\n"
"161:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x17, #0, 168f\n"
- "str s11, [x15, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
- "str s27, [x20, #0x0]\n"
+ "tbz x8, #0, 168f\n"
+ "str s11, [x17, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
"b 168f\n"
"162:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x17, #1, 163f\n"
- "str d10, [x15], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d22, [x21], #0x8\n"
- "str d26, [x20], #0x8\n"
- "tbz x17, #0, 168f\n"
- "st1 { v10.s }[2], [x15]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v22.s }[2], [x21]\n"
- "st1 { v26.s }[2], [x20]\n"
+ "tbz x8, #1, 163f\n"
+ "str d10, [x17], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x8, #0, 168f\n"
+ "st1 { v10.s }[2], [x17]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
"b 168f\n"
"163:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x17, #0, 168f\n"
- "str s10, [x15, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s22, [x21, #0x0]\n"
- "str s26, [x20, #0x0]\n"
+ "tbz x8, #0, 168f\n"
+ "str s10, [x17, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
"b 168f\n"
"164:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x17, #2, 166f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "tbz x17, #1, 165f\n"
- "str d9, [x15], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d21, [x21], #0x8\n"
- "str d25, [x20], #0x8\n"
- "tbz x17, #0, 168f\n"
- "st1 { v9.s }[2], [x15]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v21.s }[2], [x21]\n"
- "st1 { v25.s }[2], [x20]\n"
+ "tbz x8, #2, 166f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x8, #1, 165f\n"
+ "str d9, [x17], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x8, #0, 168f\n"
+ "st1 { v9.s }[2], [x17]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
"b 168f\n"
"165:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x17, #0, 168f\n"
- "str s9, [x15, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s21, [x21, #0x0]\n"
- "str s25, [x20, #0x0]\n"
+ "tbz x8, #0, 168f\n"
+ "str s9, [x17, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
"b 168f\n"
"166:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x17, #1, 167f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "tbz x17, #0, 168f\n"
- "st1 { v8.s }[2], [x15]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v20.s }[2], [x21]\n"
- "st1 { v24.s }[2], [x20]\n"
+ "tbz x8, #1, 167f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x8, #0, 168f\n"
+ "st1 { v8.s }[2], [x17]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
"b 168f\n"
"167:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s20, [x21, #0x0]\n"
- "str s24, [x20, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
"168:" // Height 5: Partial direct writeback: Done
"b 170f\n"
"169:" // Height 5: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q20, [x21, #0x0]\n"
- "str q21, [x21, #0x10]\n"
- "str q22, [x21, #0x20]\n"
- "str q23, [x21, #0x30]\n"
- "str q24, [x20, #0x0]\n"
- "str q25, [x20, #0x10]\n"
- "str q26, [x20, #0x20]\n"
- "str q27, [x20, #0x30]\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
"170:" // Height 5: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x8, x8, #0x10\n"
"bgt 138b\n"
"b 206f\n"
"171:" // Height 6
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[output_ptr]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x18\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"172:" // Height 6: Column loop
"tbz %x[flags], #0, 182f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
- "add x19, x20, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "add x20, x21, x20, LSL #2\n"
"bge 181f\n"
- "tbz x17, #3, 176f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v28.4s }, [x19], #0x10\n"
- "ld1 { v21.4s }, [x21], #0x10\n"
- "ld1 { v25.4s }, [x20], #0x10\n"
- "ld1 { v29.4s }, [x19], #0x10\n"
- "tbz x17, #2, 174f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "ld1 { v22.4s }, [x21], #0x10\n"
- "ld1 { v26.4s }, [x20], #0x10\n"
- "ld1 { v30.4s }, [x19], #0x10\n"
- "tbz x17, #1, 173f\n"
- "ldr d11, [x15], #0x8\n"
- "mov x24, #0x38\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "ldr d27, [x20], #0x8\n"
- "ldr d31, [x19], #0x8\n"
- "tbz x17, #0, 180f\n"
- "ld1 { v11.s }[2], [x15]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
- "ld1 { v23.s }[2], [x21]\n"
- "ld1 { v27.s }[2], [x20]\n"
- "ld1 { v31.s }[2], [x19]\n"
+ "tbz x8, #3, 176f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v29.4s }, [x20], #0x10\n"
+ "tbz x8, #2, 174f\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v30.4s }, [x20], #0x10\n"
+ "tbz x8, #1, 173f\n"
+ "ldr d11, [x17], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x8, #0, 180f\n"
+ "ld1 { v11.s }[2], [x17]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "ld1 { v31.s }[2], [x20]\n"
"b 180f\n"
"173:" // Height 6: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x17, #0, 180f\n"
- "ldr s11, [x15, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
- "ldr s23, [x21, #0x0]\n"
- "ldr s27, [x20, #0x0]\n"
- "ldr s31, [x19, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x8, #0, 180f\n"
+ "ldr s11, [x17, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "ldr s31, [x20, #0x0]\n"
"b 180f\n"
"174:" // Height 6: Partial accumulate: partial_2_8
- "tbz x17, #1, 175f\n"
- "ldr d10, [x15], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
- "ldr d26, [x20], #0x8\n"
- "ldr d30, [x19], #0x8\n"
- "tbz x17, #0, 180f\n"
- "ld1 { v10.s }[2], [x15]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
- "ld1 { v22.s }[2], [x21]\n"
- "ld1 { v26.s }[2], [x20]\n"
- "ld1 { v30.s }[2], [x19]\n"
+ "tbz x8, #1, 175f\n"
+ "ldr d10, [x17], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d14, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
+ "tbz x8, #0, 180f\n"
+ "ld1 { v10.s }[2], [x17]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "ld1 { v30.s }[2], [x20]\n"
"b 180f\n"
"175:" // Height 6: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x17, #0, 180f\n"
- "ldr s10, [x15, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
- "ldr s22, [x21, #0x0]\n"
- "ldr s26, [x20, #0x0]\n"
- "ldr s30, [x19, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x8, #0, 180f\n"
+ "ldr s10, [x17, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "ldr s30, [x20, #0x0]\n"
"b 180f\n"
"176:" // Height 6: Partial accumulate: partial_4_0
- "tbz x17, #2, 178f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v28.4s }, [x19], #0x10\n"
- "tbz x17, #1, 177f\n"
- "ldr d9, [x15], #0x8\n"
- "mov x24, #0x18\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d21, [x21], #0x8\n"
- "ldr d25, [x20], #0x8\n"
- "ldr d29, [x19], #0x8\n"
- "tbz x17, #0, 180f\n"
- "ld1 { v9.s }[2], [x15]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
- "ld1 { v21.s }[2], [x21]\n"
- "ld1 { v25.s }[2], [x20]\n"
- "ld1 { v29.s }[2], [x19]\n"
+ "tbz x8, #2, 178f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x8, #1, 177f\n"
+ "ldr d9, [x17], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "ldr d29, [x20], #0x8\n"
+ "tbz x8, #0, 180f\n"
+ "ld1 { v9.s }[2], [x17]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v29.s }[2], [x20]\n"
"b 180f\n"
"177:" // Height 6: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x17, #0, 180f\n"
- "ldr s9, [x15, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
- "ldr s21, [x21, #0x0]\n"
- "ldr s25, [x20, #0x0]\n"
- "ldr s29, [x19, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x8, #0, 180f\n"
+ "ldr s9, [x17, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "ldr s29, [x20, #0x0]\n"
"b 180f\n"
"178:" // Height 6: Partial accumulate: partial_2_0
- "tbz x17, #1, 179f\n"
- "ldr d8, [x15], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "ldr d24, [x20], #0x8\n"
- "ldr d28, [x19], #0x8\n"
- "tbz x17, #0, 180f\n"
- "ld1 { v8.s }[2], [x15]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
- "ld1 { v24.s }[2], [x20]\n"
- "ld1 { v28.s }[2], [x19]\n"
+ "tbz x8, #1, 179f\n"
+ "ldr d8, [x17], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "tbz x8, #0, 180f\n"
+ "ld1 { v8.s }[2], [x17]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
+ "ld1 { v28.s }[2], [x20]\n"
"b 180f\n"
"179:" // Height 6: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
- "ldr s24, [x20, #0x0]\n"
- "ldr s28, [x19, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
+ "ldr s28, [x20, #0x0]\n"
"180:" // Height 6: Partial accumulate: Done
- "sub x15, x15, x24\n"
+ "sub x17, x17, x25\n"
"b 183f\n"
"181:" // Height 6: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
- "ldr q20, [x21, #0x0]\n"
- "ldr q21, [x21, #0x10]\n"
- "ldr q22, [x21, #0x20]\n"
- "ldr q23, [x21, #0x30]\n"
- "ldr q24, [x20, #0x0]\n"
- "ldr q25, [x20, #0x10]\n"
- "ldr q26, [x20, #0x20]\n"
- "ldr q27, [x20, #0x30]\n"
- "ldr q28, [x19, #0x0]\n"
- "ldr q29, [x19, #0x10]\n"
- "ldr q30, [x19, #0x20]\n"
- "ldr q31, [x19, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q29, [x20, #0x10]\n"
+ "ldr q30, [x20, #0x20]\n"
+ "ldr q31, [x20, #0x30]\n"
"b 183f\n"
"182:" // Height 6: no accumulate
"movi v8.4s, #0x0\n"
@@ -2858,260 +2857,260 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
"183:" // Height 6: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 185f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x14, 186f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
- "add x22, x22, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
+ "ldr x28, [x20, #0x28]\n"
+ "cbnz x15, 186f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
"b 186f\n"
"185:" // Height 6: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "add x20, x22, x19\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
+ "add x28, x9, x21\n"
"186:" // Height 6: input setup done
- "cmp x13, #0x10\n"
+ "cmp x14, #0x10\n"
"blt 189f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x20\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x20\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 188f\n"
"187:" // Height 6: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x10]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x18]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr x10, [x16, #0x28]\n"
+ "add x13, x13, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "mov v7.d[1], x11\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"ldr d6, [x16, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr x9, [x12, #0x8]\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr x10, [x16, #0x48]\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
"ldr d7, [x16, #0x30]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr x20, [x16, #0x58]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "mov v7.d[1], x11\n"
+ "ldr x27, [x13, #0x8]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
+ "ldr x26, [x12, #0x8]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr x27, [x28, #0x8]\n"
+ "ldr x25, [x11, #0x8]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
"ldr d6, [x16, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "ldr x24, [x10, #0x8]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x10, [x16, #0x68]\n"
+ "ldr x23, [x9, #0x8]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr x25, [x26, #0x8]\n"
+ "ldr x22, [x28, #0x8]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
"ldr d7, [x16, #0x50]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "add x24, x24, #0x10\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
+ "sub x14, x14, #0x10\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
+ "cmp x14, #0x20\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr x23, [x24, #0x8]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
"ldr d6, [x16, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "add x22, x22, #0x10\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr x21, [x16, #0x88]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr x10, [x16, #0x88]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr x21, [x22, #0x8]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
"ldr d7, [x16, #0x70]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "add x20, x20, #0x10\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr x20, [x16, #0x98]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr x19, [x20, #0x8]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
"ldr d6, [x16, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "sub x13, x13, #0x10\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "cmp x13, #0x20\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr x10, [x16, #0xa8]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
"ldr d7, [x16, #0x90]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
"ldr d6, [x16, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr x10, [x16, #0xc8]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
"ldr d7, [x16, #0xb0]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
"ldr d6, [x16, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr x10, [x16, #0xe8]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
"ldr d7, [x16, #0xd0]\n"
+ "mov v7.d[1], x20\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
"ldr d6, [x16, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "mov v6.d[1], x21\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
"ldr d7, [x16, #0xf0]\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "mov v7.d[1], x20\n"
"add x16, x16, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x10, [x16, #0x8]\n"
+ "ldr x20, [x16, #0x18]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr d0, [x12, #0x0]\n"
+ "ldr d0, [x13, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x28, #0x0]\n"
+ "ldr d1, [x12, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
+ "ldr d2, [x11, #0x0]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "mov v0.d[1], x9\n"
+ "ldr d3, [x10, #0x0]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "mov v1.d[1], x27\n"
+ "ldr d4, [x9, #0x0]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "ldr d4, [x22, #0x0]\n"
+ "ldr d5, [x28, #0x0]\n"
+ "ldr d7, [x16, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x27\n"
+ "mov v1.d[1], x26\n"
"mov v2.d[1], x25\n"
- "ldr d5, [x20, #0x0]\n"
- "mov v3.d[1], x23\n"
- "mov v4.d[1], x21\n"
- "mov v5.d[1], x19\n"
+ "mov v3.d[1], x24\n"
+ "mov v4.d[1], x23\n"
+ "mov v5.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 187b\n"
"188:" // Height 6: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "add x13, x13, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x13, x13, #0x10\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
"add x12, x12, #0x10\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x11, x11, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
"ldr q6, [x16, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "sub x14, x14, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
"ldr q7, [x16, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "add x20, x20, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
"ldr q6, [x16, #0x40]\n"
@@ -3206,292 +3205,291 @@ void a64_hybrid_s8s32_dot_6x16_a55 (
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
"189:" // Height 6: Multiply loop: Main loop skip
- "cbz x13, 194f\n"
- "cmp x13, #0x4\n"
+ "cbz x14, 194f\n"
+ "cmp x14, #0x4\n"
"blt 191f\n"
"190:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "cmp x13, #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr s5, [x20], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s7, [x13], #0x4\n"
+ "sub x14, x14, #0x4\n"
+ "ldr s6, [x12], #0x4\n"
+ "cmp x14, #0x4\n"
+ "ldr s5, [x11], #0x4\n"
+ "ldr s4, [x10], #0x4\n"
+ "ldr s3, [x9], #0x4\n"
+ "ldr s2, [x28], #0x4\n"
+ "ldr q1, [x16, #0x0]\n"
+ ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n"
+ "ldr q1, [x16, #0x20]\n"
+ ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n"
+ "ldr q0, [x16, #0x30]\n"
+ ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n"
+ ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n"
"bge 190b\n"
- "cbz x13, 194f\n"
"191:" // Height 6: Multiply loop: Skip odd blocks
- "tbz x13, #1, 192f\n"
- "ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "ldr h5, [x20], #0x2\n"
- "tbz x13, #0, 193f\n"
- "ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x22]\n"
- "ld1 { v5.b }[2], [x20]\n"
+ "cbz x14, 194f\n"
+ "tbz x14, #1, 192f\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "ldr h4, [x9], #0x2\n"
+ "ldr h5, [x28], #0x2\n"
+ "tbz x14, #0, 193f\n"
+ "ld1 { v0.b }[2], [x13]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
+ "ld1 { v3.b }[2], [x10]\n"
+ "ld1 { v4.b }[2], [x9]\n"
+ "ld1 { v5.b }[2], [x28]\n"
"b 193f\n"
"192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
- "ldr b5, [x20, #0x0]\n"
+ "ldr b0, [x13, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
+ "ldr b3, [x10, #0x0]\n"
+ "ldr b4, [x9, #0x0]\n"
+ "ldr b5, [x28, #0x0]\n"
"193:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x16, #0x0]\n"
+ ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x16, #0x10]\n"
+ ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x16, #0x20]\n"
+ ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x16, #0x30]\n"
+ ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n"
"194:" // Height 6: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 184b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19, LSL #2\n"
"prfm pstl1keep, [x20, #0x0]\n"
- "add x19, x20, x19, LSL #2\n"
- "prfm pstl1keep, [x19, #0x0]\n"
"bge 203f\n"
- "tbz x17, #3, 198f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v21.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "st1 { v25.4s }, [x20], #0x10\n"
- "st1 { v28.4s }, [x19], #0x10\n"
- "st1 { v29.4s }, [x19], #0x10\n"
- "tbz x17, #2, 196f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x21], #0x10\n"
- "st1 { v26.4s }, [x20], #0x10\n"
- "st1 { v30.4s }, [x19], #0x10\n"
- "tbz x17, #1, 195f\n"
- "str d11, [x15], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "str d27, [x20], #0x8\n"
- "str d31, [x19], #0x8\n"
- "tbz x17, #0, 202f\n"
- "st1 { v11.s }[2], [x15]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
- "st1 { v27.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x19]\n"
+ "tbz x8, #3, 198f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "st1 { v29.4s }, [x20], #0x10\n"
+ "tbz x8, #2, 196f\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v30.4s }, [x20], #0x10\n"
+ "tbz x8, #1, 195f\n"
+ "str d11, [x17], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "str d31, [x20], #0x8\n"
+ "tbz x8, #0, 202f\n"
+ "st1 { v11.s }[2], [x17]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 202f\n"
"195:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x17, #0, 202f\n"
- "str s11, [x15, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
- "str s27, [x20, #0x0]\n"
- "str s31, [x19, #0x0]\n"
+ "tbz x8, #0, 202f\n"
+ "str s11, [x17, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
"b 202f\n"
"196:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x17, #1, 197f\n"
- "str d10, [x15], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d22, [x21], #0x8\n"
- "str d26, [x20], #0x8\n"
- "str d30, [x19], #0x8\n"
- "tbz x17, #0, 202f\n"
- "st1 { v10.s }[2], [x15]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v22.s }[2], [x21]\n"
- "st1 { v26.s }[2], [x20]\n"
- "st1 { v30.s }[2], [x19]\n"
+ "tbz x8, #1, 197f\n"
+ "str d10, [x17], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "str d30, [x20], #0x8\n"
+ "tbz x8, #0, 202f\n"
+ "st1 { v10.s }[2], [x17]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "st1 { v30.s }[2], [x20]\n"
"b 202f\n"
"197:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x17, #0, 202f\n"
- "str s10, [x15, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s22, [x21, #0x0]\n"
- "str s26, [x20, #0x0]\n"
- "str s30, [x19, #0x0]\n"
+ "tbz x8, #0, 202f\n"
+ "str s10, [x17, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "str s30, [x20, #0x0]\n"
"b 202f\n"
"198:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x17, #2, 200f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "st1 { v28.4s }, [x19], #0x10\n"
- "tbz x17, #1, 199f\n"
- "str d9, [x15], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d21, [x21], #0x8\n"
- "str d25, [x20], #0x8\n"
- "str d29, [x19], #0x8\n"
- "tbz x17, #0, 202f\n"
- "st1 { v9.s }[2], [x15]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v21.s }[2], [x21]\n"
- "st1 { v25.s }[2], [x20]\n"
- "st1 { v29.s }[2], [x19]\n"
+ "tbz x8, #2, 200f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "tbz x8, #1, 199f\n"
+ "str d9, [x17], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "str d29, [x20], #0x8\n"
+ "tbz x8, #0, 202f\n"
+ "st1 { v9.s }[2], [x17]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "st1 { v29.s }[2], [x20]\n"
"b 202f\n"
"199:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x17, #0, 202f\n"
- "str s9, [x15, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s21, [x21, #0x0]\n"
- "str s25, [x20, #0x0]\n"
- "str s29, [x19, #0x0]\n"
+ "tbz x8, #0, 202f\n"
+ "str s9, [x17, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "str s29, [x20, #0x0]\n"
"b 202f\n"
"200:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x17, #1, 201f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "str d28, [x19], #0x8\n"
- "tbz x17, #0, 202f\n"
- "st1 { v8.s }[2], [x15]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v20.s }[2], [x21]\n"
- "st1 { v24.s }[2], [x20]\n"
- "st1 { v28.s }[2], [x19]\n"
+ "tbz x8, #1, 201f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "str d28, [x20], #0x8\n"
+ "tbz x8, #0, 202f\n"
+ "st1 { v8.s }[2], [x17]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "st1 { v28.s }[2], [x20]\n"
"b 202f\n"
"201:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s20, [x21, #0x0]\n"
- "str s24, [x20, #0x0]\n"
- "str s28, [x19, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "str s28, [x20, #0x0]\n"
"202:" // Height 6: Partial direct writeback: Done
"b 204f\n"
"203:" // Height 6: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q20, [x21, #0x0]\n"
- "str q21, [x21, #0x10]\n"
- "str q22, [x21, #0x20]\n"
- "str q23, [x21, #0x30]\n"
- "str q24, [x20, #0x0]\n"
- "str q25, [x20, #0x10]\n"
- "str q26, [x20, #0x20]\n"
- "str q27, [x20, #0x30]\n"
- "str q28, [x19, #0x0]\n"
- "str q29, [x19, #0x10]\n"
- "str q30, [x19, #0x20]\n"
- "str q31, [x19, #0x30]\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "str q28, [x20, #0x0]\n"
+ "str q29, [x20, #0x10]\n"
+ "str q30, [x20, #0x20]\n"
+ "str q31, [x20, #0x30]\n"
"204:" // Height 6: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x8, x8, #0x10\n"
"bgt 172b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 206f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 205f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"205:" // Update direct input
- "mov x19, #0x6\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"206:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
index 3566027a50..452d647bb4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -77,7 +77,6 @@ void a64_hybrid_s8s32_dot_6x16 (
ka.N = N;
ka.B_ptr = B_ptr;
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 171f\n"
@@ -87,73 +86,73 @@ void a64_hybrid_s8s32_dot_6x16 (
"cmp %x[M], #0x2\n"
"bgt 69f\n"
"beq 35f\n"
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"tbz %x[flags], #0, 12f\n"
- "cmp x10, #0x10\n"
+ "cmp x11, #0x10\n"
"bge 11f\n"
- "tbz x10, #3, 6f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "tbz x10, #2, 4f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "tbz x10, #1, 3f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "tbz x10, #0, 10f\n"
- "ld1 { v11.s }[2], [x28]\n"
+ "tbz x11, #3, 6f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "tbz x11, #2, 4f\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 3f\n"
+ "ldr d11, [x9], #0x8\n"
+ "mov x25, #0x38\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v11.s }[2], [x9]\n"
"b 10f\n"
"3:" // Height 1: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x10, #0, 10f\n"
- "ldr s11, [x28, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 10f\n"
+ "ldr s11, [x9, #0x0]\n"
"b 10f\n"
"4:" // Height 1: Partial accumulate: partial_2_8
- "tbz x10, #1, 5f\n"
- "ldr d10, [x28], #0x8\n"
- "mov x24, #0x28\n"
- "tbz x10, #0, 10f\n"
- "ld1 { v10.s }[2], [x28]\n"
+ "tbz x11, #1, 5f\n"
+ "ldr d10, [x9], #0x8\n"
+ "mov x25, #0x28\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v10.s }[2], [x9]\n"
"b 10f\n"
"5:" // Height 1: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x10, #0, 10f\n"
- "ldr s10, [x28, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 10f\n"
+ "ldr s10, [x9, #0x0]\n"
"b 10f\n"
"6:" // Height 1: Partial accumulate: partial_4_0
- "tbz x10, #2, 8f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "tbz x10, #1, 7f\n"
- "ldr d9, [x28], #0x8\n"
- "mov x24, #0x18\n"
- "tbz x10, #0, 10f\n"
- "ld1 { v9.s }[2], [x28]\n"
+ "tbz x11, #2, 8f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 7f\n"
+ "ldr d9, [x9], #0x8\n"
+ "mov x25, #0x18\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v9.s }[2], [x9]\n"
"b 10f\n"
"7:" // Height 1: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x10, #0, 10f\n"
- "ldr s9, [x28, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 10f\n"
+ "ldr s9, [x9, #0x0]\n"
"b 10f\n"
"8:" // Height 1: Partial accumulate: partial_2_0
- "tbz x10, #1, 9f\n"
- "ldr d8, [x28], #0x8\n"
- "mov x24, #0x8\n"
- "tbz x10, #0, 10f\n"
- "ld1 { v8.s }[2], [x28]\n"
+ "tbz x11, #1, 9f\n"
+ "ldr d8, [x9], #0x8\n"
+ "mov x25, #0x8\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v8.s }[2], [x9]\n"
"b 10f\n"
"9:" // Height 1: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x24, #0x0\n"
+ "ldr s8, [x9, #0x0]\n"
+ "mov x25, #0x0\n"
"10:" // Height 1: Partial accumulate: Done
- "sub x28, x28, x24\n"
+ "sub x9, x9, x25\n"
"b 13f\n"
"11:" // Height 1: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
"b 13f\n"
"12:" // Height 1: no accumulate
"movi v8.4s, #0x0\n"
@@ -161,295 +160,295 @@ void a64_hybrid_s8s32_dot_6x16 (
"movi v10.4s, #0x0\n"
"movi v11.4s, #0x0\n"
"13:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 15f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 16f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 16f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
"b 16f\n"
"15:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x26, %x[input_ptr]\n"
"16:" // Height 1: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 19f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q6, [x9, #0x0]\n"
- "cmp x26, #0x20\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 18f\n"
"17:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "add x25, x25, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "sub x26, x26, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "cmp x26, #0x20\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "add x10, x10, #0x100\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 17b\n"
"18:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "sub x26, x26, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x100\n"
"19:" // Height 1: Multiply loop: Main loop skip
- "cbz x26, 24f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 24f\n"
+ "cmp x27, #0x4\n"
"blt 21f\n"
"20:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "cmp x26, #0x4\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr q16, [x10, #0x0]\n"
+ ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ "ldr q16, [x10, #0x10]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n"
+ "cmp x27, #0x4\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n"
+ "add x10, x10, #0x40\n"
"bge 20b\n"
- "cbz x26, 24f\n"
"21:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x26, #1, 22f\n"
- "ldr h0, [x25], #0x2\n"
- "tbz x26, #0, 23f\n"
- "ld1 { v0.b }[2], [x25]\n"
+ "cbz x27, 24f\n"
+ "tbz x27, #1, 22f\n"
+ "ldr h0, [x26], #0x2\n"
+ "tbz x27, #0, 23f\n"
+ "ld1 { v0.b }[2], [x26]\n"
"b 23f\n"
"22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
"23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ "add x10, x10, #0x40\n"
"24:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 14b\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "cmp x10, #0x10\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"bge 33f\n"
- "tbz x10, #3, 28f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "tbz x10, #2, 26f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "tbz x10, #1, 25f\n"
- "str d11, [x28], #0x8\n"
- "tbz x10, #0, 32f\n"
- "st1 { v11.s }[2], [x28]\n"
+ "tbz x11, #3, 28f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "tbz x11, #2, 26f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 25f\n"
+ "str d11, [x9], #0x8\n"
+ "tbz x11, #0, 32f\n"
+ "st1 { v11.s }[2], [x9]\n"
"b 32f\n"
"25:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x10, #0, 32f\n"
- "str s11, [x28, #0x0]\n"
+ "tbz x11, #0, 32f\n"
+ "str s11, [x9, #0x0]\n"
"b 32f\n"
"26:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x10, #1, 27f\n"
- "str d10, [x28], #0x8\n"
- "tbz x10, #0, 32f\n"
- "st1 { v10.s }[2], [x28]\n"
+ "tbz x11, #1, 27f\n"
+ "str d10, [x9], #0x8\n"
+ "tbz x11, #0, 32f\n"
+ "st1 { v10.s }[2], [x9]\n"
"b 32f\n"
"27:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x10, #0, 32f\n"
- "str s10, [x28, #0x0]\n"
+ "tbz x11, #0, 32f\n"
+ "str s10, [x9, #0x0]\n"
"b 32f\n"
"28:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x10, #2, 30f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "tbz x10, #1, 29f\n"
- "str d9, [x28], #0x8\n"
- "tbz x10, #0, 32f\n"
- "st1 { v9.s }[2], [x28]\n"
+ "tbz x11, #2, 30f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 29f\n"
+ "str d9, [x9], #0x8\n"
+ "tbz x11, #0, 32f\n"
+ "st1 { v9.s }[2], [x9]\n"
"b 32f\n"
"29:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x10, #0, 32f\n"
- "str s9, [x28, #0x0]\n"
+ "tbz x11, #0, 32f\n"
+ "str s9, [x9, #0x0]\n"
"b 32f\n"
"30:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x10, #1, 31f\n"
- "str d8, [x28], #0x8\n"
- "tbz x10, #0, 32f\n"
- "st1 { v8.s }[2], [x28]\n"
+ "tbz x11, #1, 31f\n"
+ "str d8, [x9], #0x8\n"
+ "tbz x11, #0, 32f\n"
+ "st1 { v8.s }[2], [x9]\n"
"b 32f\n"
"31:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
"32:" // Height 1: Partial direct writeback: Done
"b 34f\n"
"33:" // Height 1: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"34:" // Height 1: Writeback done
- "subs x10, x10, #0x10\n"
+ "subs x11, x11, #0x10\n"
"bgt 2b\n"
"b 206f\n"
"35:" // Height 2
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"36:" // Height 2: Column loop
"tbz %x[flags], #0, 46f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x9, x20, LSL #2\n"
"bge 45f\n"
- "tbz x10, #3, 40f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "tbz x10, #2, 38f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "tbz x10, #1, 37f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x23], #0x8\n"
- "tbz x10, #0, 44f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x23]\n"
+ "tbz x11, #3, 40f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 38f\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 37f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "tbz x11, #0, 44f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
"b 44f\n"
"37:" // Height 2: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x10, #0, 44f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 44f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
"b 44f\n"
"38:" // Height 2: Partial accumulate: partial_2_8
- "tbz x10, #1, 39f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "tbz x10, #0, 44f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x23]\n"
+ "tbz x11, #1, 39f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "tbz x11, #0, 44f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
"b 44f\n"
"39:" // Height 2: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x10, #0, 44f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 44f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
"b 44f\n"
"40:" // Height 2: Partial accumulate: partial_4_0
- "tbz x10, #2, 42f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "tbz x10, #1, 41f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "tbz x10, #0, 44f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x23]\n"
+ "tbz x11, #2, 42f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 41f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "tbz x11, #0, 44f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
"b 44f\n"
"41:" // Height 2: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x10, #0, 44f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 44f\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
"b 44f\n"
"42:" // Height 2: Partial accumulate: partial_2_0
- "tbz x10, #1, 43f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "tbz x10, #0, 44f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x23]\n"
+ "tbz x11, #1, 43f\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "tbz x11, #0, 44f\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
"b 44f\n"
"43:" // Height 2: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
"44:" // Height 2: Partial accumulate: Done
- "sub x28, x28, x24\n"
+ "sub x9, x9, x25\n"
"b 47f\n"
"45:" // Height 2: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
"b 47f\n"
"46:" // Height 2: no accumulate
"movi v8.4s, #0x0\n"
@@ -461,392 +460,392 @@ void a64_hybrid_s8s32_dot_6x16 (
"movi v14.4s, #0x0\n"
"movi v15.4s, #0x0\n"
"47:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x27, 50f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 50f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
"b 50f\n"
"49:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
"50:" // Height 2: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 53f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 52f\n"
"51:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x24, x24, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "cmp x26, #0x20\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x25, x25, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n"
+ ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n"
"53:" // Height 2: Multiply loop: Main loop skip
- "cbz x26, 58f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 58f\n"
+ "cmp x27, #0x4\n"
"blt 55f\n"
"54:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n"
+ ".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n"
+ ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n"
"bge 54b\n"
- "cbz x26, 58f\n"
"55:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x26, #1, 56f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "tbz x26, #0, 57f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
+ "cbz x27, 58f\n"
+ "tbz x27, #1, 56f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "tbz x27, #0, 57f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
"b 57f\n"
"56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
"57:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n"
"58:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 48b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"bge 67f\n"
- "tbz x10, #3, 62f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "tbz x10, #2, 60f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "tbz x10, #1, 59f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x23], #0x8\n"
- "tbz x10, #0, 66f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x23]\n"
+ "tbz x11, #3, 62f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 60f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 59f\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "tbz x11, #0, 66f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x24]\n"
"b 66f\n"
"59:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x10, #0, 66f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x23, #0x0]\n"
+ "tbz x11, #0, 66f\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
"b 66f\n"
"60:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x10, #1, 61f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x23], #0x8\n"
- "tbz x10, #0, 66f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x23]\n"
+ "tbz x11, #1, 61f\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "tbz x11, #0, 66f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x24]\n"
"b 66f\n"
"61:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x10, #0, 66f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x23, #0x0]\n"
+ "tbz x11, #0, 66f\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
"b 66f\n"
"62:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x10, #2, 64f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "tbz x10, #1, 63f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x23], #0x8\n"
- "tbz x10, #0, 66f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x23]\n"
+ "tbz x11, #2, 64f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 63f\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "tbz x11, #0, 66f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x24]\n"
"b 66f\n"
"63:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x10, #0, 66f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x23, #0x0]\n"
+ "tbz x11, #0, 66f\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
"b 66f\n"
"64:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x10, #1, 65f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x23], #0x8\n"
- "tbz x10, #0, 66f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x23]\n"
+ "tbz x11, #1, 65f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "tbz x11, #0, 66f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x24]\n"
"b 66f\n"
"65:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x23, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
"66:" // Height 2: Partial direct writeback: Done
"b 68f\n"
"67:" // Height 2: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
"68:" // Height 2: Writeback done
- "subs x10, x10, #0x10\n"
+ "subs x11, x11, #0x10\n"
"bgt 36b\n"
"b 206f\n"
"69:" // Height 3
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"70:" // Height 3: Column loop
"tbz %x[flags], #0, 80f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 79f\n"
- "tbz x10, #3, 74f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "tbz x10, #2, 72f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "tbz x10, #1, 71f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "tbz x10, #0, 78f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
+ "tbz x11, #3, 74f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 72f\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 71f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x11, #0, 78f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
"b 78f\n"
"71:" // Height 3: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x10, #0, 78f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 78f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
"b 78f\n"
"72:" // Height 3: Partial accumulate: partial_2_8
- "tbz x10, #1, 73f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "tbz x10, #0, 78f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
+ "tbz x11, #1, 73f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d18, [x23], #0x8\n"
+ "tbz x11, #0, 78f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
"b 78f\n"
"73:" // Height 3: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x10, #0, 78f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 78f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
"b 78f\n"
"74:" // Height 3: Partial accumulate: partial_4_0
- "tbz x10, #2, 76f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "tbz x10, #1, 75f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "tbz x10, #0, 78f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
+ "tbz x11, #2, 76f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 75f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d17, [x23], #0x8\n"
+ "tbz x11, #0, 78f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
"b 78f\n"
"75:" // Height 3: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x10, #0, 78f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 78f\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
"b 78f\n"
"76:" // Height 3: Partial accumulate: partial_2_0
- "tbz x10, #1, 77f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "tbz x10, #0, 78f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
+ "tbz x11, #1, 77f\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "tbz x11, #0, 78f\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
"b 78f\n"
"77:" // Height 3: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s16, [x23, #0x0]\n"
"78:" // Height 3: Partial accumulate: Done
- "sub x28, x28, x24\n"
+ "sub x9, x9, x25\n"
"b 81f\n"
"79:" // Height 3: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
"b 81f\n"
"80:" // Height 3: no accumulate
"movi v8.4s, #0x0\n"
@@ -862,488 +861,488 @@ void a64_hybrid_s8s32_dot_6x16 (
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
"81:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 83f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "cbnz x27, 84f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 84f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
"b 84f\n"
"83:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"84:" // Height 3: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 87f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 86f\n"
"85:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "ldr q21, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "cmp x26, #0x20\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr q2, [x23, #0x0]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 85b\n"
"86:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
+ "ldr q21, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n"
"87:" // Height 3: Multiply loop: Main loop skip
- "cbz x26, 92f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 92f\n"
+ "cmp x27, #0x4\n"
"blt 89f\n"
"88:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr q21, [x10, #0x0]\n"
+ ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n"
+ ".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n"
+ ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n"
+ ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n"
"bge 88b\n"
- "cbz x26, 92f\n"
"89:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x26, #1, 90f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "tbz x26, #0, 91f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
+ "cbz x27, 92f\n"
+ "tbz x27, #1, 90f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "tbz x27, #0, 91f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
"b 91f\n"
"90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
"91:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n"
+ ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n"
"92:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 82b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"bge 101f\n"
- "tbz x10, #3, 96f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "tbz x10, #2, 94f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "tbz x10, #1, 93f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "tbz x10, #0, 100f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
+ "tbz x11, #3, 96f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 94f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 93f\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x11, #0, 100f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
"b 100f\n"
"93:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x10, #0, 100f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
+ "tbz x11, #0, 100f\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
"b 100f\n"
"94:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x10, #1, 95f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "tbz x10, #0, 100f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
+ "tbz x11, #1, 95f\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x11, #0, 100f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
"b 100f\n"
"95:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x10, #0, 100f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
+ "tbz x11, #0, 100f\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
"b 100f\n"
"96:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x10, #2, 98f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "tbz x10, #1, 97f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "tbz x10, #0, 100f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
+ "tbz x11, #2, 98f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 97f\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x11, #0, 100f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
"b 100f\n"
"97:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x10, #0, 100f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
+ "tbz x11, #0, 100f\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
"b 100f\n"
"98:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x10, #1, 99f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "tbz x10, #0, 100f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
+ "tbz x11, #1, 99f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x11, #0, 100f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
"b 100f\n"
"99:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
"100:" // Height 3: Partial direct writeback: Done
"b 102f\n"
"101:" // Height 3: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
"102:" // Height 3: Writeback done
- "subs x10, x10, #0x10\n"
+ "subs x11, x11, #0x10\n"
"bgt 70b\n"
"b 206f\n"
"103:" // Height 4
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"104:" // Height 4: Column loop
"tbz %x[flags], #0, 114f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 113f\n"
- "tbz x10, #3, 108f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "ld1 { v21.4s }, [x21], #0x10\n"
- "tbz x10, #2, 106f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "ld1 { v22.4s }, [x21], #0x10\n"
- "tbz x10, #1, 105f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "tbz x10, #0, 112f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
- "ld1 { v23.s }[2], [x21]\n"
+ "tbz x11, #3, 108f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 106f\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 105f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x11, #0, 112f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
"b 112f\n"
"105:" // Height 4: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x10, #0, 112f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
- "ldr s23, [x21, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 112f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
"b 112f\n"
"106:" // Height 4: Partial accumulate: partial_2_8
- "tbz x10, #1, 107f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
- "tbz x10, #0, 112f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
- "ld1 { v22.s }[2], [x21]\n"
+ "tbz x11, #1, 107f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x11, #0, 112f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
"b 112f\n"
"107:" // Height 4: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x10, #0, 112f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
- "ldr s22, [x21, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 112f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
"b 112f\n"
"108:" // Height 4: Partial accumulate: partial_4_0
- "tbz x10, #2, 110f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "tbz x10, #1, 109f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d21, [x21], #0x8\n"
- "tbz x10, #0, 112f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
- "ld1 { v21.s }[2], [x21]\n"
+ "tbz x11, #2, 110f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 109f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x11, #0, 112f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
"b 112f\n"
"109:" // Height 4: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x10, #0, 112f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
- "ldr s21, [x21, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 112f\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
"b 112f\n"
"110:" // Height 4: Partial accumulate: partial_2_0
- "tbz x10, #1, 111f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "tbz x10, #0, 112f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
+ "tbz x11, #1, 111f\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x11, #0, 112f\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
"b 112f\n"
"111:" // Height 4: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
"112:" // Height 4: Partial accumulate: Done
- "sub x28, x28, x24\n"
+ "sub x9, x9, x25\n"
"b 115f\n"
"113:" // Height 4: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
- "ldr q20, [x21, #0x0]\n"
- "ldr q21, [x21, #0x10]\n"
- "ldr q22, [x21, #0x20]\n"
- "ldr q23, [x21, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
"b 115f\n"
"114:" // Height 4: no accumulate
"movi v8.4s, #0x0\n"
@@ -1363,584 +1362,584 @@ void a64_hybrid_s8s32_dot_6x16 (
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
"115:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 117f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "cbnz x27, 118f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 118f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
"b 118f\n"
"117:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"118:" // Height 4: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 121f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 120f\n"
"119:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x22, x22, #0x10\n"
+ "ldr q25, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x26, #0x20\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr q2, [x23, #0x0]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr q3, [x22, #0x0]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 119b\n"
"120:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
+ "ldr q25, [x10, #0x20]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x22, x22, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n"
"121:" // Height 4: Multiply loop: Main loop skip
- "cbz x26, 126f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 126f\n"
+ "cmp x27, #0x4\n"
"blt 123f\n"
"122:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ ".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n"
+ ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n"
"bge 122b\n"
- "cbz x26, 126f\n"
"123:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x26, #1, 124f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "tbz x26, #0, 125f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
- "ld1 { v3.b }[2], [x22]\n"
+ "cbz x27, 126f\n"
+ "tbz x27, #1, 124f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "tbz x27, #0, 125f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
"b 125f\n"
"124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
- "ldr b3, [x22, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
"125:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n"
+ ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n"
"126:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 116b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"bge 135f\n"
- "tbz x10, #3, 130f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v21.4s }, [x21], #0x10\n"
- "tbz x10, #2, 128f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x21], #0x10\n"
- "tbz x10, #1, 127f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "tbz x10, #0, 134f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
+ "tbz x11, #3, 130f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 128f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 127f\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "tbz x11, #0, 134f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
"b 134f\n"
"127:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x10, #0, 134f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
+ "tbz x11, #0, 134f\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
"b 134f\n"
"128:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x10, #1, 129f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d22, [x21], #0x8\n"
- "tbz x10, #0, 134f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v22.s }[2], [x21]\n"
+ "tbz x11, #1, 129f\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz x11, #0, 134f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
"b 134f\n"
"129:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x10, #0, 134f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s22, [x21, #0x0]\n"
+ "tbz x11, #0, 134f\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
"b 134f\n"
"130:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x10, #2, 132f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "tbz x10, #1, 131f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d21, [x21], #0x8\n"
- "tbz x10, #0, 134f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v21.s }[2], [x21]\n"
+ "tbz x11, #2, 132f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 131f\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz x11, #0, 134f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
"b 134f\n"
"131:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x10, #0, 134f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s21, [x21, #0x0]\n"
+ "tbz x11, #0, 134f\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
"b 134f\n"
"132:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x10, #1, 133f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "tbz x10, #0, 134f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v20.s }[2], [x21]\n"
+ "tbz x11, #1, 133f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz x11, #0, 134f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
"b 134f\n"
"133:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s20, [x21, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
"134:" // Height 4: Partial direct writeback: Done
"b 136f\n"
"135:" // Height 4: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q20, [x21, #0x0]\n"
- "str q21, [x21, #0x10]\n"
- "str q22, [x21, #0x20]\n"
- "str q23, [x21, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
"136:" // Height 4: Writeback done
- "subs x10, x10, #0x10\n"
+ "subs x11, x11, #0x10\n"
"bgt 104b\n"
"b 206f\n"
"137:" // Height 5
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"138:" // Height 5: Column loop
"tbz %x[flags], #0, 148f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 147f\n"
- "tbz x10, #3, 142f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "ld1 { v21.4s }, [x21], #0x10\n"
- "ld1 { v25.4s }, [x20], #0x10\n"
- "tbz x10, #2, 140f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "ld1 { v22.4s }, [x21], #0x10\n"
- "ld1 { v26.4s }, [x20], #0x10\n"
- "tbz x10, #1, 139f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "ldr d27, [x20], #0x8\n"
- "tbz x10, #0, 146f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
- "ld1 { v23.s }[2], [x21]\n"
- "ld1 { v27.s }[2], [x20]\n"
+ "tbz x11, #3, 142f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 140f\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 139f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
"b 146f\n"
"139:" // Height 5: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x10, #0, 146f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
- "ldr s23, [x21, #0x0]\n"
- "ldr s27, [x20, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 146f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
"b 146f\n"
"140:" // Height 5: Partial accumulate: partial_2_8
- "tbz x10, #1, 141f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
- "ldr d26, [x20], #0x8\n"
- "tbz x10, #0, 146f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
- "ld1 { v22.s }[2], [x21]\n"
- "ld1 { v26.s }[2], [x20]\n"
+ "tbz x11, #1, 141f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
"b 146f\n"
"141:" // Height 5: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x10, #0, 146f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
- "ldr s22, [x21, #0x0]\n"
- "ldr s26, [x20, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 146f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
"b 146f\n"
"142:" // Height 5: Partial accumulate: partial_4_0
- "tbz x10, #2, 144f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "tbz x10, #1, 143f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d21, [x21], #0x8\n"
- "ldr d25, [x20], #0x8\n"
- "tbz x10, #0, 146f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
- "ld1 { v21.s }[2], [x21]\n"
- "ld1 { v25.s }[2], [x20]\n"
+ "tbz x11, #2, 144f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 143f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
"b 146f\n"
"143:" // Height 5: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x10, #0, 146f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
- "ldr s21, [x21, #0x0]\n"
- "ldr s25, [x20, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 146f\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
"b 146f\n"
"144:" // Height 5: Partial accumulate: partial_2_0
- "tbz x10, #1, 145f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "ldr d24, [x20], #0x8\n"
- "tbz x10, #0, 146f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
- "ld1 { v24.s }[2], [x20]\n"
+ "tbz x11, #1, 145f\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
"b 146f\n"
"145:" // Height 5: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
- "ldr s24, [x20, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
"146:" // Height 5: Partial accumulate: Done
- "sub x28, x28, x24\n"
+ "sub x9, x9, x25\n"
"b 149f\n"
"147:" // Height 5: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
- "ldr q20, [x21, #0x0]\n"
- "ldr q21, [x21, #0x10]\n"
- "ldr q22, [x21, #0x20]\n"
- "ldr q23, [x21, #0x30]\n"
- "ldr q24, [x20, #0x0]\n"
- "ldr q25, [x20, #0x10]\n"
- "ldr q26, [x20, #0x20]\n"
- "ldr q27, [x20, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
"b 149f\n"
"148:" // Height 5: no accumulate
"movi v8.4s, #0x0\n"
@@ -1964,683 +1963,683 @@ void a64_hybrid_s8s32_dot_6x16 (
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
"149:" // Height 5: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 151f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "cbnz x27, 152f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 152f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
"b 152f\n"
"151:" // Height 5: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"152:" // Height 5: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 155f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 154f\n"
"153:" // Height 5: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x21, x21, #0x10\n"
+ "ldr q29, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x26, x26, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "cmp x26, #0x20\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "cmp x27, #0x20\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr q2, [x23, #0x0]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr q3, [x22, #0x0]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "ldr q4, [x21, #0x0]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 153b\n"
"154:" // Height 5: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "ldr q29, [x10, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n"
"155:" // Height 5: Multiply loop: Main loop skip
- "cbz x26, 160f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 160f\n"
+ "cmp x27, #0x4\n"
"blt 157f\n"
"156:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s4, [x21], #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s1, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q29, [x10, #0x0]\n"
+ ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n"
+ ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n"
"bge 156b\n"
- "cbz x26, 160f\n"
"157:" // Height 5: Multiply loop: Skip odd blocks
- "tbz x26, #1, 158f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h4, [x21], #0x2\n"
- "tbz x26, #0, 159f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
- "ld1 { v3.b }[2], [x22]\n"
- "ld1 { v4.b }[2], [x21]\n"
+ "cbz x27, 160f\n"
+ "tbz x27, #1, 158f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x27, #0, 159f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
"b 159f\n"
"158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
- "ldr b3, [x22, #0x0]\n"
- "ldr b4, [x21, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
"159:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n"
+ ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n"
"160:" // Height 5: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 150b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
+ "cmp x11, #0x10\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19, LSL #2\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"bge 169f\n"
- "tbz x10, #3, 164f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v21.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "st1 { v25.4s }, [x20], #0x10\n"
- "tbz x10, #2, 162f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x21], #0x10\n"
- "st1 { v26.4s }, [x20], #0x10\n"
- "tbz x10, #1, 161f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "str d27, [x20], #0x8\n"
- "tbz x10, #0, 168f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
- "st1 { v27.s }[2], [x20]\n"
+ "tbz x11, #3, 164f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 162f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 161f\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x11, #0, 168f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
"b 168f\n"
"161:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x10, #0, 168f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
- "str s27, [x20, #0x0]\n"
+ "tbz x11, #0, 168f\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
"b 168f\n"
"162:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x10, #1, 163f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d22, [x21], #0x8\n"
- "str d26, [x20], #0x8\n"
- "tbz x10, #0, 168f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v22.s }[2], [x21]\n"
- "st1 { v26.s }[2], [x20]\n"
+ "tbz x11, #1, 163f\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x11, #0, 168f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
"b 168f\n"
"163:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x10, #0, 168f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s22, [x21, #0x0]\n"
- "str s26, [x20, #0x0]\n"
+ "tbz x11, #0, 168f\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
"b 168f\n"
"164:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x10, #2, 166f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "tbz x10, #1, 165f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d21, [x21], #0x8\n"
- "str d25, [x20], #0x8\n"
- "tbz x10, #0, 168f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v21.s }[2], [x21]\n"
- "st1 { v25.s }[2], [x20]\n"
+ "tbz x11, #2, 166f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 165f\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x11, #0, 168f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
"b 168f\n"
"165:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x10, #0, 168f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s21, [x21, #0x0]\n"
- "str s25, [x20, #0x0]\n"
+ "tbz x11, #0, 168f\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
"b 168f\n"
"166:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x10, #1, 167f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "tbz x10, #0, 168f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v20.s }[2], [x21]\n"
- "st1 { v24.s }[2], [x20]\n"
+ "tbz x11, #1, 167f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x11, #0, 168f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
"b 168f\n"
"167:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s20, [x21, #0x0]\n"
- "str s24, [x20, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
"168:" // Height 5: Partial direct writeback: Done
"b 170f\n"
"169:" // Height 5: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q20, [x21, #0x0]\n"
- "str q21, [x21, #0x10]\n"
- "str q22, [x21, #0x20]\n"
- "str q23, [x21, #0x30]\n"
- "str q24, [x20, #0x0]\n"
- "str q25, [x20, #0x10]\n"
- "str q26, [x20, #0x20]\n"
- "str q27, [x20, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
"170:" // Height 5: Writeback done
- "subs x10, x10, #0x10\n"
+ "subs x11, x11, #0x10\n"
"bgt 138b\n"
"b 206f\n"
"171:" // Height 6
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x18\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"172:" // Height 6: Column loop
"tbz %x[flags], #0, 182f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
- "add x19, x20, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x20, x21, x20, LSL #2\n"
"bge 181f\n"
- "tbz x10, #3, 176f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v28.4s }, [x19], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "ld1 { v21.4s }, [x21], #0x10\n"
- "ld1 { v25.4s }, [x20], #0x10\n"
- "ld1 { v29.4s }, [x19], #0x10\n"
- "tbz x10, #2, 174f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "ld1 { v22.4s }, [x21], #0x10\n"
- "ld1 { v26.4s }, [x20], #0x10\n"
- "ld1 { v30.4s }, [x19], #0x10\n"
- "tbz x10, #1, 173f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "ldr d27, [x20], #0x8\n"
- "ldr d31, [x19], #0x8\n"
- "tbz x10, #0, 180f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
- "ld1 { v23.s }[2], [x21]\n"
- "ld1 { v27.s }[2], [x20]\n"
- "ld1 { v31.s }[2], [x19]\n"
+ "tbz x11, #3, 176f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v29.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 174f\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v30.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 173f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x11, #0, 180f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "ld1 { v31.s }[2], [x20]\n"
"b 180f\n"
"173:" // Height 6: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x10, #0, 180f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
- "ldr s23, [x21, #0x0]\n"
- "ldr s27, [x20, #0x0]\n"
- "ldr s31, [x19, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 180f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "ldr s31, [x20, #0x0]\n"
"b 180f\n"
"174:" // Height 6: Partial accumulate: partial_2_8
- "tbz x10, #1, 175f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
- "ldr d26, [x20], #0x8\n"
- "ldr d30, [x19], #0x8\n"
- "tbz x10, #0, 180f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
- "ld1 { v22.s }[2], [x21]\n"
- "ld1 { v26.s }[2], [x20]\n"
- "ld1 { v30.s }[2], [x19]\n"
+ "tbz x11, #1, 175f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
+ "tbz x11, #0, 180f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "ld1 { v30.s }[2], [x20]\n"
"b 180f\n"
"175:" // Height 6: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x10, #0, 180f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
- "ldr s22, [x21, #0x0]\n"
- "ldr s26, [x20, #0x0]\n"
- "ldr s30, [x19, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 180f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "ldr s30, [x20, #0x0]\n"
"b 180f\n"
"176:" // Height 6: Partial accumulate: partial_4_0
- "tbz x10, #2, 178f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v28.4s }, [x19], #0x10\n"
- "tbz x10, #1, 177f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d21, [x21], #0x8\n"
- "ldr d25, [x20], #0x8\n"
- "ldr d29, [x19], #0x8\n"
- "tbz x10, #0, 180f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
- "ld1 { v21.s }[2], [x21]\n"
- "ld1 { v25.s }[2], [x20]\n"
- "ld1 { v29.s }[2], [x19]\n"
+ "tbz x11, #2, 178f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 177f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "ldr d29, [x20], #0x8\n"
+ "tbz x11, #0, 180f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v29.s }[2], [x20]\n"
"b 180f\n"
"177:" // Height 6: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x10, #0, 180f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
- "ldr s21, [x21, #0x0]\n"
- "ldr s25, [x20, #0x0]\n"
- "ldr s29, [x19, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 180f\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "ldr s29, [x20, #0x0]\n"
"b 180f\n"
"178:" // Height 6: Partial accumulate: partial_2_0
- "tbz x10, #1, 179f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "ldr d24, [x20], #0x8\n"
- "ldr d28, [x19], #0x8\n"
- "tbz x10, #0, 180f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
- "ld1 { v24.s }[2], [x20]\n"
- "ld1 { v28.s }[2], [x19]\n"
+ "tbz x11, #1, 179f\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "tbz x11, #0, 180f\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
+ "ld1 { v28.s }[2], [x20]\n"
"b 180f\n"
"179:" // Height 6: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
- "ldr s24, [x20, #0x0]\n"
- "ldr s28, [x19, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
+ "ldr s28, [x20, #0x0]\n"
"180:" // Height 6: Partial accumulate: Done
- "sub x28, x28, x24\n"
+ "sub x9, x9, x25\n"
"b 183f\n"
"181:" // Height 6: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
- "ldr q20, [x21, #0x0]\n"
- "ldr q21, [x21, #0x10]\n"
- "ldr q22, [x21, #0x20]\n"
- "ldr q23, [x21, #0x30]\n"
- "ldr q24, [x20, #0x0]\n"
- "ldr q25, [x20, #0x10]\n"
- "ldr q26, [x20, #0x20]\n"
- "ldr q27, [x20, #0x30]\n"
- "ldr q28, [x19, #0x0]\n"
- "ldr q29, [x19, #0x10]\n"
- "ldr q30, [x19, #0x20]\n"
- "ldr q31, [x19, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q29, [x20, #0x10]\n"
+ "ldr q30, [x20, #0x20]\n"
+ "ldr q31, [x20, #0x30]\n"
"b 183f\n"
"182:" // Height 6: no accumulate
"movi v8.4s, #0x0\n"
@@ -2668,297 +2667,297 @@ void a64_hybrid_s8s32_dot_6x16 (
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
"183:" // Height 6: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 185f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x27, 186f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 186f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
"b 186f\n"
"185:" // Height 6: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"186:" // Height 6: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 189f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
- "ldr q5, [x20, #0x0]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 188f\n"
"187:" // Height 6: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x21, x21, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x20, x20, #0x10\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "cmp x26, #0x20\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr q2, [x23, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr q3, [x22, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "ldr q4, [x21, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 187b\n"
"188:" // Height 6: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x21, x21, #0x10\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x20, x20, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
@@ -2972,292 +2971,291 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
"189:" // Height 6: Multiply loop: Main loop skip
- "cbz x26, 194f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 194f\n"
+ "cmp x27, #0x4\n"
"blt 191f\n"
"190:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr s2, [x23], #0x4\n"
+ "ldr s7, [x26], #0x4\n"
+ "ldr s6, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"ldr s3, [x22], #0x4\n"
- "ldr s4, [x21], #0x4\n"
- "ldr s5, [x20], #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n"
+ ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n"
+ "ldr q0, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n"
+ ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n"
+ ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n"
"bge 190b\n"
- "cbz x26, 194f\n"
"191:" // Height 6: Multiply loop: Skip odd blocks
- "tbz x26, #1, 192f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h4, [x21], #0x2\n"
- "ldr h5, [x20], #0x2\n"
- "tbz x26, #0, 193f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
- "ld1 { v3.b }[2], [x22]\n"
- "ld1 { v4.b }[2], [x21]\n"
- "ld1 { v5.b }[2], [x20]\n"
+ "cbz x27, 194f\n"
+ "tbz x27, #1, 192f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x21], #0x2\n"
+ "tbz x27, #0, 193f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x21]\n"
"b 193f\n"
"192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
- "ldr b3, [x22, #0x0]\n"
- "ldr b4, [x21, #0x0]\n"
- "ldr b5, [x20, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x21, #0x0]\n"
"193:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n"
"194:" // Height 6: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 184b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19, LSL #2\n"
"prfm pstl1keep, [x20, #0x0]\n"
- "add x19, x20, x19, LSL #2\n"
- "prfm pstl1keep, [x19, #0x0]\n"
"bge 203f\n"
- "tbz x10, #3, 198f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v21.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "st1 { v25.4s }, [x20], #0x10\n"
- "st1 { v28.4s }, [x19], #0x10\n"
- "st1 { v29.4s }, [x19], #0x10\n"
- "tbz x10, #2, 196f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x21], #0x10\n"
- "st1 { v26.4s }, [x20], #0x10\n"
- "st1 { v30.4s }, [x19], #0x10\n"
- "tbz x10, #1, 195f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "str d27, [x20], #0x8\n"
- "str d31, [x19], #0x8\n"
- "tbz x10, #0, 202f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
- "st1 { v27.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x19]\n"
+ "tbz x11, #3, 198f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "st1 { v29.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 196f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v30.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 195f\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "str d31, [x20], #0x8\n"
+ "tbz x11, #0, 202f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 202f\n"
"195:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x10, #0, 202f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
- "str s27, [x20, #0x0]\n"
- "str s31, [x19, #0x0]\n"
+ "tbz x11, #0, 202f\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
"b 202f\n"
"196:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x10, #1, 197f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d22, [x21], #0x8\n"
- "str d26, [x20], #0x8\n"
- "str d30, [x19], #0x8\n"
- "tbz x10, #0, 202f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v22.s }[2], [x21]\n"
- "st1 { v26.s }[2], [x20]\n"
- "st1 { v30.s }[2], [x19]\n"
+ "tbz x11, #1, 197f\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "str d30, [x20], #0x8\n"
+ "tbz x11, #0, 202f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "st1 { v30.s }[2], [x20]\n"
"b 202f\n"
"197:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x10, #0, 202f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s22, [x21, #0x0]\n"
- "str s26, [x20, #0x0]\n"
- "str s30, [x19, #0x0]\n"
+ "tbz x11, #0, 202f\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "str s30, [x20, #0x0]\n"
"b 202f\n"
"198:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x10, #2, 200f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "st1 { v28.4s }, [x19], #0x10\n"
- "tbz x10, #1, 199f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d21, [x21], #0x8\n"
- "str d25, [x20], #0x8\n"
- "str d29, [x19], #0x8\n"
- "tbz x10, #0, 202f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v21.s }[2], [x21]\n"
- "st1 { v25.s }[2], [x20]\n"
- "st1 { v29.s }[2], [x19]\n"
+ "tbz x11, #2, 200f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 199f\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "str d29, [x20], #0x8\n"
+ "tbz x11, #0, 202f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "st1 { v29.s }[2], [x20]\n"
"b 202f\n"
"199:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x10, #0, 202f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s21, [x21, #0x0]\n"
- "str s25, [x20, #0x0]\n"
- "str s29, [x19, #0x0]\n"
+ "tbz x11, #0, 202f\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "str s29, [x20, #0x0]\n"
"b 202f\n"
"200:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x10, #1, 201f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "str d28, [x19], #0x8\n"
- "tbz x10, #0, 202f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v20.s }[2], [x21]\n"
- "st1 { v24.s }[2], [x20]\n"
- "st1 { v28.s }[2], [x19]\n"
+ "tbz x11, #1, 201f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "str d28, [x20], #0x8\n"
+ "tbz x11, #0, 202f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "st1 { v28.s }[2], [x20]\n"
"b 202f\n"
"201:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s20, [x21, #0x0]\n"
- "str s24, [x20, #0x0]\n"
- "str s28, [x19, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "str s28, [x20, #0x0]\n"
"202:" // Height 6: Partial direct writeback: Done
"b 204f\n"
"203:" // Height 6: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q20, [x21, #0x0]\n"
- "str q21, [x21, #0x10]\n"
- "str q22, [x21, #0x20]\n"
- "str q23, [x21, #0x30]\n"
- "str q24, [x20, #0x0]\n"
- "str q25, [x20, #0x10]\n"
- "str q26, [x20, #0x20]\n"
- "str q27, [x20, #0x30]\n"
- "str q28, [x19, #0x0]\n"
- "str q29, [x19, #0x10]\n"
- "str q30, [x19, #0x20]\n"
- "str q31, [x19, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "str q28, [x20, #0x0]\n"
+ "str q29, [x20, #0x10]\n"
+ "str q30, [x20, #0x20]\n"
+ "str q31, [x20, #0x30]\n"
"204:" // Height 6: Writeback done
- "subs x10, x10, #0x10\n"
+ "subs x11, x11, #0x10\n"
"bgt 172b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 206f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 205f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"205:" // Update direct input
- "mov x19, #0x6\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"206:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
new file mode 100644
index 0000000000..4905ba5656
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int32_t>, \
+ const int32_t *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_s8s32_mmla_6x16( ARGLIST );
+
+class cls_a64_hybrid_s8s32_mmla_6x16
+{
+public:
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 8> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
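+ // Estimated per-CPU throughput figures (hence the per-CPUModel switch
+ // below) consulted when arm_gemm chooses between candidate kernels.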
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 54.98 };
+ case CPUModel::A510:
+ return { 30.30 };
+ case CPUModel::V1:
+ return { 83.71 };
+ }
+ }
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 55.27, 15.25, 0.62 };
+ case CPUModel::A510:
+ return { 33.62, 3.92, 0.48 };
+ case CPUModel::V1:
+ return { 63.94, 16.18, 0.83 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_s8s32_mmla_6x16;
+ cls_a64_hybrid_s8s32_mmla_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
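
As a side note on the blocking parameters above: the short sketch below is not part of the patch (all names are hypothetical) and only illustrates how a caller might size the work grid from out_height() == 6, out_width() == 16 and k_unroll() == 8, assuming plain ceil-division tiling.

#include <cstddef>

struct TileCounts {
    std::size_t m_tiles;   // row blocks of out_height() == 6
    std::size_t n_tiles;   // column blocks of out_width() == 16
    std::size_t k_rounded; // K padded to a multiple of k_unroll() == 8
};

// Hypothetical helper: round each GEMM dimension up to whole kernel tiles.
// Partial row blocks are served by the kernel's Height<6 paths and partial
// column blocks by its partial-writeback paths, so ceil division suffices.
static TileCounts tile_counts(std::size_t M, std::size_t N, std::size_t K)
{
    const std::size_t mh = 6, nw = 16, ku = 8;
    return { (M + mh - 1) / mh,
             (N + nw - 1) / nw,
             ((K + ku - 1) / ku) * ku };
}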
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..f8a76b5244
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
@@ -0,0 +1,3449 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8s32_mmla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+ const int32_t *, Activation, bool accumulate
+)
+{
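+ // All kernel arguments are marshalled into this one struct so the inline
+ // assembly can reach every field through a single base register
+ // ([args_ptr]) using the offsetof() constants bound in the operand list.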
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
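+ // Bits of "flags" consumed by the assembly (tested with tbz on %x[flags]):
+ // bit 0 = accumulate into existing output, bit 2 = indirect output,
+ // bit 3 = indirect (pointer-array) input.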
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
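+ // The kernel body is one inline-assembly blob; the leading compares on M
+ // dispatch to per-height code paths (labels 186/149/112/75/38 for heights
+ // 6/5/4/3/2, falling through to height 1).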
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 186f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 149f\n"
+ "beq 112f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 75f\n"
+ "beq 38f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "tbz %x[flags], #0, 13f\n"
+ "cmp x11, #0x10\n"
+ "bge 11f\n"
+ "tbz x11, #3, 6f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "tbz x11, #2, 4f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 3f\n"
+ "ldr d16, [x9], #0x8\n"
+ "mov x25, #0x38\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "b 10f\n"
+ "3:" // Height 1: Partial accumulate: partial_1_12
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 10f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "b 10f\n"
+ "4:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x11, #1, 5f\n"
+ "ldr d11, [x9], #0x8\n"
+ "mov x25, #0x28\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "b 10f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_8
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 10f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "b 10f\n"
+ "6:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x11, #2, 8f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 7f\n"
+ "ldr d10, [x9], #0x8\n"
+ "mov x25, #0x18\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "b 10f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_4
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 10f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "b 10f\n"
+ "8:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x11, #1, 9f\n"
+ "ldr d9, [x9], #0x8\n"
+ "mov x25, #0x8\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "b 10f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "mov x25, #0x0\n"
+ "10:" // Height 1: Partial accumulate: Done
+ "sub x9, x9, x25\n"
+ "b 12f\n"
+ "11:" // Height 1: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "12:" // Height 1: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 14f\n"
+ "13:" // Height 1: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "14:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "15:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 17f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "b 17f\n"
+ "16:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "17:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "blt 20f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q6, [x10, #0x10]\n"
+ "blt 19f\n"
+ "18:" // Height 1: Multiply loop: Main loop head
+ "trn1 v19.2d, v1.2d, v20.2d\n"
+ ".inst 0x4e87a668 // smmla v8.4s, v19.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x4e86a66c // smmla v12.4s, v19.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x4e92a669 // smmla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v20.2d\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x4e92a428 // smmla v8.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x4e91a42c // smmla v12.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x4e92a429 // smmla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x4e91a42d // smmla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x4e92a42a // smmla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n"
+ ".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ "add x10, x10, #0x100\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "bge 18b\n"
+ "19:" // Height 1: Multiply loop: Single iteration only
+ "trn1 v20.2d, v1.2d, v21.2d\n"
+ ".inst 0x4e87a688 // smmla v8.4s, v20.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x4e86a68c // smmla v12.4s, v20.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x4e92a689 // smmla v9.4s, v20.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x4e91a68d // smmla v13.4s, v20.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a68a // smmla v10.4s, v20.16b, v18.16b\n"
+ "ldr q19, [x10, #0x60]\n"
+ ".inst 0x4e91a68e // smmla v14.4s, v20.16b, v17.16b\n"
+ "ldr q18, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
+ ".inst 0x4e93a68b // smmla v11.4s, v20.16b, v19.16b\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x4e92a68f // smmla v15.4s, v20.16b, v18.16b\n"
+ "ldr q19, [x10, #0x90]\n"
+ ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x4e93a42c // smmla v12.4s, v1.16b, v19.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x4e92a429 // smmla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x4e91a42d // smmla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x4e92a42a // smmla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n"
+ ".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x100\n"
+ "20:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 27f\n"
+ "cmp x27, #0x8\n"
+ "blt 22f\n"
+ "21:" // Height 1: Multiply loop: Odd block loop
+ "ldr d19, [x26], #0x8\n"
+ "ldr q18, [x10, #0x0]\n"
+ "trn1 v19.2d, v19.2d, v17.2d\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x4e92a668 // smmla v8.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x4e91a66c // smmla v12.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x4e92a669 // smmla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
+ "bge 21b\n"
+ "22:" // Height 1: Multiply loop: Skip odd blocks
+ "cbz x27, 27f\n"
+ "tbz x27, #2, 24f\n"
+ "ldr s1, [x26], #0x4\n"
+ "tbz x27, #1, 23f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "tbz x27, #0, 26f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "b 26f\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 26f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "b 26f\n"
+ "24:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 25f\n"
+ "ldr h1, [x26], #0x2\n"
+ "tbz x27, #0, 26f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "b 26f\n"
+ "25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "26:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q23, [x10, #0x0]\n"
+ "ldr q18, [x10, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v17.2d\n"
+ ".inst 0x4e97a668 // smmla v8.4s, v19.16b, v23.16b\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x4e92a66c // smmla v12.4s, v19.16b, v18.16b\n"
+ "ldr q31, [x10, #0x30]\n"
+ ".inst 0x4e91a669 // smmla v9.4s, v19.16b, v17.16b\n"
+ "ldr q20, [x10, #0x40]\n"
+ ".inst 0x4e9fa66d // smmla v13.4s, v19.16b, v31.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e94a66a // smmla v10.4s, v19.16b, v20.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
+ "27:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 15b\n"
+ "cmp x11, #0x10\n"
+ "uzp1 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "uzp1 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v11.2d, v11.2d, v15.2d\n"
+ "bge 36f\n"
+ "tbz x11, #3, 31f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "tbz x11, #2, 29f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 28f\n"
+ "str d11, [x9], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "b 35f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 35f\n"
+ "str s11, [x9, #0x0]\n"
+ "b 35f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 30f\n"
+ "str d10, [x9], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "b 35f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 35f\n"
+ "str s10, [x9, #0x0]\n"
+ "b 35f\n"
+ "31:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 33f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 32f\n"
+ "str d9, [x9], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "b 35f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 35f\n"
+ "str s9, [x9, #0x0]\n"
+ "b 35f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 34f\n"
+ "str d8, [x9], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "b 35f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x9, #0x0]\n"
+ "35:" // Height 1: Partial direct writeback: Done
+ "b 37f\n"
+ "36:" // Height 1: Full writeback
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "37:" // Height 1: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 2b\n"
+ "b 224f\n"
+ "38:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "39:" // Height 2: Column loop
+ "tbz %x[flags], #0, 50f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x9, x20, LSL #2\n"
+ "bge 48f\n"
+ "tbz x11, #3, 43f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 41f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 40f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "tbz x11, #0, 47f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "b 47f\n"
+ "40:" // Height 2: Partial accumulate: partial_1_12
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 47f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "b 47f\n"
+ "41:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x11, #1, 42f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "tbz x11, #0, 47f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "b 47f\n"
+ "42:" // Height 2: Partial accumulate: partial_1_8
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 47f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "b 47f\n"
+ "43:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x11, #2, 45f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 44f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "tbz x11, #0, 47f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "b 47f\n"
+ "44:" // Height 2: Partial accumulate: partial_1_4
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 47f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "b 47f\n"
+ "45:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x11, #1, 46f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "tbz x11, #0, 47f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "b 47f\n"
+ "46:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "47:" // Height 2: Partial accumulate: Done
+ "sub x9, x9, x25\n"
+ "b 49f\n"
+ "48:" // Height 2: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "49:" // Height 2: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 51f\n"
+ "50:" // Height 2: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "51:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "52:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 53f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 54f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "b 54f\n"
+ "53:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "54:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "blt 57f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "blt 56f\n"
+ "55:" // Height 2: Multiply loop: Main loop head
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a668 // smmla v8.4s, v19.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x4e86a66c // smmla v12.4s, v19.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x4e92a669 // smmla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x4e92a428 // smmla v8.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x4e91a42c // smmla v12.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x4e92a429 // smmla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x4e91a42d // smmla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x4e92a42a // smmla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n"
+ "add x10, x10, #0x100\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "bge 55b\n"
+ "56:" // Height 2: Multiply loop: Single iteration only
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a668 // smmla v8.4s, v19.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x4e86a66c // smmla v12.4s, v19.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x4e92a669 // smmla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x4e92a428 // smmla v8.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x4e91a42c // smmla v12.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x4e92a429 // smmla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x4e91a42d // smmla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x4e92a42a // smmla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n"
+ ".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x10, x10, #0x100\n"
+ "57:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 64f\n"
+ "cmp x27, #0x8\n"
+ "blt 59f\n"
+ "58:" // Height 2: Multiply loop: Odd block loop
+ "ldr d18, [x26], #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "trn1 v19.2d, v18.2d, v17.2d\n"
+ "sub x27, x27, #0x8\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q22, [x10, #0x10]\n"
+ ".inst 0x4e91a668 // smmla v8.4s, v19.16b, v17.16b\n"
+ ".inst 0x4e96a66c // smmla v12.4s, v19.16b, v22.16b\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x4e81a669 // smmla v9.4s, v19.16b, v1.16b\n"
+ ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ "ldr q17, [x10, #0x70]\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
+ "bge 58b\n"
+ "59:" // Height 2: Multiply loop: Skip odd blocks
+ "cbz x27, 64f\n"
+ "tbz x27, #2, 61f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "tbz x27, #1, 60f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "tbz x27, #0, 63f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "b 63f\n"
+ "60:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 63f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "b 63f\n"
+ "61:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 62f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "tbz x27, #0, 63f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "b 63f\n"
+ "62:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "63:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e92a668 // smmla v8.4s, v19.16b, v18.16b\n"
+ "ldr q5, [x10, #0x20]\n"
+ ".inst 0x4e91a66c // smmla v12.4s, v19.16b, v17.16b\n"
+ "ldr q21, [x10, #0x30]\n"
+ ".inst 0x4e85a669 // smmla v9.4s, v19.16b, v5.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x4e95a66d // smmla v13.4s, v19.16b, v21.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
+ "64:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 52b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "bge 73f\n"
+ "tbz x11, #3, 68f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 66f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 65f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "b 72f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 72f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "b 72f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 67f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "b 72f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 72f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "b 72f\n"
+ "68:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 70f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 69f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "b 72f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 72f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "b 72f\n"
+ "70:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 71f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "b 72f\n"
+ "71:" // Height 2: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "72:" // Height 2: Partial direct writeback: Done
+ "b 74f\n"
+ "73:" // Height 2: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "74:" // Height 2: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 39b\n"
+ "b 224f\n"
+ "75:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "76:" // Height 3: Column loop
+ "tbz %x[flags], #0, 87f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
+ "bge 85f\n"
+ "tbz x11, #3, 80f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 78f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 77f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d24, [x23], #0x8\n"
+ "tbz x11, #0, 84f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "b 84f\n"
+ "77:" // Height 3: Partial accumulate: partial_1_12
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 84f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "b 84f\n"
+ "78:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x11, #1, 79f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x11, #0, 84f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "b 84f\n"
+ "79:" // Height 3: Partial accumulate: partial_1_8
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 84f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "b 84f\n"
+ "80:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x11, #2, 82f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 81f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d18, [x23], #0x8\n"
+ "tbz x11, #0, 84f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "b 84f\n"
+ "81:" // Height 3: Partial accumulate: partial_1_4
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 84f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "b 84f\n"
+ "82:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x11, #1, 83f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "tbz x11, #0, 84f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "b 84f\n"
+ "83:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s17, [x23, #0x0]\n"
+ "84:" // Height 3: Partial accumulate: Done
+ "sub x9, x9, x25\n"
+ "b 86f\n"
+ "85:" // Height 3: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "86:" // Height 3: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 88f\n"
+ "87:" // Height 3: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "88:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "89:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 90f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 91f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "b 91f\n"
+ "90:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "91:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "blt 94f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "blt 93f\n"
+ "92:" // Height 3: Multiply loop: Main loop head
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa472 // smmla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x4e99a42e // smmla v14.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a476 // smmla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4e9aa42b // smmla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa473 // smmla v19.4s, v3.16b, v26.16b\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x4e99a42f // smmla v15.4s, v1.16b, v25.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x4e99a477 // smmla v23.4s, v3.16b, v25.16b\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "bge 92b\n"
+ "93:" // Height 3: Multiply loop: Single iteration only
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
+ ".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa472 // smmla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x4e99a42e // smmla v14.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a476 // smmla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4e9aa42b // smmla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa473 // smmla v19.4s, v3.16b, v26.16b\n"
+ ".inst 0x4e99a42f // smmla v15.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a477 // smmla v23.4s, v3.16b, v25.16b\n"
+ "94:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 101f\n"
+ "cmp x27, #0x8\n"
+ "blt 96f\n"
+ "95:" // Height 3: Multiply loop: Odd block loop
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr q26, [x10, #0x0]\n"
+ "trn1 v27.2d, v25.2d, v27.2d\n"
+ ".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e99a78c // smmla v12.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a774 // smmla v20.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
+ "bge 95b\n"
+ "96:" // Height 3: Multiply loop: Skip odd blocks
+ "cbz x27, 101f\n"
+ "tbz x27, #2, 98f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "tbz x27, #1, 97f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "ld1 { v3.h }[2], [x24], #0x2\n"
+ "tbz x27, #0, 100f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "ld1 { v3.b }[6], [x24]\n"
+ "b 100f\n"
+ "97:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 100f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "ld1 { v3.b }[4], [x24]\n"
+ "b 100f\n"
+ "98:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 99f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "tbz x27, #0, 100f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "b 100f\n"
+ "99:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "100:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v25.2d\n"
+ ".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e9da78c // smmla v12.4s, v28.16b, v29.16b\n"
+ ".inst 0x4e9da774 // smmla v20.4s, v27.16b, v29.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
+ "101:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 89b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "cmp x11, #0x10\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "bge 110f\n"
+ "tbz x11, #3, 105f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 103f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 102f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "b 109f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 109f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "b 109f\n"
+ "103:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 104f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "b 109f\n"
+ "104:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 109f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "b 109f\n"
+ "105:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 107f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 106f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "b 109f\n"
+ "106:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 109f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "b 109f\n"
+ "107:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 108f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "b 109f\n"
+ "108:" // Height 3: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "109:" // Height 3: Partial direct writeback: Done
+ "b 111f\n"
+ "110:" // Height 3: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "111:" // Height 3: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 76b\n"
+ "b 224f\n"
+ "112:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "113:" // Height 4: Column loop
+ "tbz %x[flags], #0, 124f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
+ "bge 122f\n"
+ "tbz x11, #3, 117f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 115f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 114f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x11, #0, 121f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "b 121f\n"
+ "114:" // Height 4: Partial accumulate: partial_1_12
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 121f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "b 121f\n"
+ "115:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x11, #1, 116f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x11, #0, 121f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "b 121f\n"
+ "116:" // Height 4: Partial accumulate: partial_1_8
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 121f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "b 121f\n"
+ "117:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x11, #2, 119f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 118f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x11, #0, 121f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "b 121f\n"
+ "118:" // Height 4: Partial accumulate: partial_1_4
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 121f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "b 121f\n"
+ "119:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x11, #1, 120f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x11, #0, 121f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "b 121f\n"
+ "120:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "121:" // Height 4: Partial accumulate: Done
+ "sub x9, x9, x25\n"
+ "b 123f\n"
+ "122:" // Height 4: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "123:" // Height 4: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 125f\n"
+ "124:" // Height 4: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "125:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "126:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 127f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 128f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "b 128f\n"
+ "127:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "128:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "blt 131f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "blt 130f\n"
+ "129:" // Height 4: Multiply loop: Main loop head
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
+ "sub x27, x27, #0x10\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ "add x23, x23, #0x10\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e9aa472 // smmla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x4e99a42e // smmla v14.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e99a476 // smmla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4e9aa42b // smmla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa473 // smmla v19.4s, v3.16b, v26.16b\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x4e99a42f // smmla v15.4s, v1.16b, v25.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x4e99a477 // smmla v23.4s, v3.16b, v25.16b\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "bge 129b\n"
+ "130:" // Height 4: Multiply loop: Single iteration only
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
+ ".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa472 // smmla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x4e99a42e // smmla v14.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a476 // smmla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4e9aa42b // smmla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x4e9aa473 // smmla v19.4s, v3.16b, v26.16b\n"
+ ".inst 0x4e99a42f // smmla v15.4s, v1.16b, v25.16b\n"
+ ".inst 0x4e99a477 // smmla v23.4s, v3.16b, v25.16b\n"
+ "131:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 138f\n"
+ "cmp x27, #0x8\n"
+ "blt 133f\n"
+ "132:" // Height 4: Multiply loop: Odd block loop
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "sub x27, x27, #0x8\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "trn1 v27.2d, v26.2d, v25.2d\n"
+ "cmp x27, #0x8\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e99a78c // smmla v12.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a774 // smmla v20.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
+ "bge 132b\n"
+ "133:" // Height 4: Multiply loop: Skip odd blocks
+ "cbz x27, 138f\n"
+ "tbz x27, #2, 135f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "tbz x27, #1, 134f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "ld1 { v3.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "tbz x27, #0, 137f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "ld1 { v3.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "b 137f\n"
+ "134:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 137f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "ld1 { v3.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "b 137f\n"
+ "135:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 136f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "tbz x27, #0, 137f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "b 137f\n"
+ "136:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x23, #0x0]\n"
+ "137:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x4e99a78c // smmla v12.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a774 // smmla v20.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n"
+ ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n"
+ "138:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 126b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "bge 147f\n"
+ "tbz x11, #3, 142f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 140f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 139f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "b 146f\n"
+ "139:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 146f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "b 146f\n"
+ "140:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 141f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "b 146f\n"
+ "141:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 146f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "b 146f\n"
+ "142:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 144f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 143f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "b 146f\n"
+ "143:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 146f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "b 146f\n"
+ "144:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 145f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "b 146f\n"
+ "145:" // Height 4: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "146:" // Height 4: Partial direct writeback: Done
+ "b 148f\n"
+ "147:" // Height 4: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q15, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q22, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "148:" // Height 4: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 113b\n"
+ "b 224f\n"
+ "149:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "150:" // Height 5: Column loop
+ "tbz %x[flags], #0, 161f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
+ "bge 159f\n"
+ "tbz x11, #3, 154f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 152f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 151f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d6, [x21], #0x8\n"
+ "tbz x11, #0, 158f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v6.s }[2], [x21]\n"
+ "b 158f\n"
+ "151:" // Height 5: Partial accumulate: partial_1_12
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 158f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s6, [x21, #0x0]\n"
+ "b 158f\n"
+ "152:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x11, #1, 153f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x11, #0, 158f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "b 158f\n"
+ "153:" // Height 5: Partial accumulate: partial_1_8
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 158f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "b 158f\n"
+ "154:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x11, #2, 156f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 155f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x11, #0, 158f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "b 158f\n"
+ "155:" // Height 5: Partial accumulate: partial_1_4
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 158f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "b 158f\n"
+ "156:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x11, #1, 157f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "tbz x11, #0, 158f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "b 158f\n"
+ "157:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "158:" // Height 5: Partial accumulate: Done
+ "sub x9, x9, x25\n"
+ "b 160f\n"
+ "159:" // Height 5: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q25, [x21, #0x0]\n"
+ "ldr q26, [x21, #0x10]\n"
+ "ldr q27, [x21, #0x20]\n"
+ "ldr q6, [x21, #0x30]\n"
+ "160:" // Height 5: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 162f\n"
+ "161:" // Height 5: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "162:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "163:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 164f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 165f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "b 165f\n"
+ "164:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "165:" // Height 5: input setup done
+ "cmp x27, #0x10\n"
+ "blt 168f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "blt 167f\n"
+ "166:" // Height 5: Multiply loop: Main loop head
+ "trn1 v6.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "sub x27, x27, #0x10\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x4e80a4cc // smmla v12.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a454 // smmla v20.4s, v2.16b, v0.16b\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4e80a49c // smmla v28.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x4e87a4c9 // smmla v9.4s, v6.16b, v7.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x40]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e80a4cd // smmla v13.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a455 // smmla v21.4s, v2.16b, v0.16b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x4e87a4ca // smmla v10.4s, v6.16b, v7.16b\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e80a4ce // smmla v14.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a456 // smmla v22.4s, v2.16b, v0.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e80a49e // smmla v30.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x4e87a4cb // smmla v11.4s, v6.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e80a4cf // smmla v15.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a457 // smmla v23.4s, v2.16b, v0.16b\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x90]\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q6, [x10, #0xa0]\n"
+ ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xb0]\n"
+ ".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xc0]\n"
+ ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xd0]\n"
+ ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xe0]\n"
+ ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ "ldr q3, [x24, #0x0]\n"
+ ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n"
+ "ldr q5, [x22, #0x0]\n"
+ "bge 166b\n"
+ "167:" // Height 5: Multiply loop: Single iteration only
+ "trn1 v6.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x4e80a4cc // smmla v12.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a454 // smmla v20.4s, v2.16b, v0.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e80a49c // smmla v28.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x4e87a4c9 // smmla v9.4s, v6.16b, v7.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x40]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e80a4cd // smmla v13.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a455 // smmla v21.4s, v2.16b, v0.16b\n"
+ "add x22, x22, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x4e87a4ca // smmla v10.4s, v6.16b, v7.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e80a4ce // smmla v14.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a456 // smmla v22.4s, v2.16b, v0.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e80a49e // smmla v30.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x4e87a4cb // smmla v11.4s, v6.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x4e80a4cf // smmla v15.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e80a457 // smmla v23.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n"
+ "ldr q2, [x10, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q0, [x10, #0xa0]\n"
+ ".inst 0x4e82a42c // smmla v12.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a474 // smmla v20.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4bc // smmla v28.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xb0]\n"
+ ".inst 0x4e80a429 // smmla v9.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a471 // smmla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4b9 // smmla v25.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xc0]\n"
+ ".inst 0x4e82a42d // smmla v13.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a475 // smmla v21.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4bd // smmla v29.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xd0]\n"
+ ".inst 0x4e80a42a // smmla v10.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a472 // smmla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4ba // smmla v26.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xe0]\n"
+ ".inst 0x4e82a42e // smmla v14.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a476 // smmla v22.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4be // smmla v30.4s, v5.16b, v2.16b\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4e80a42b // smmla v11.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a473 // smmla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bb // smmla v27.4s, v5.16b, v0.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ "168:" // Height 5: Multiply loop: Main loop skip
+ "cbz x27, 175f\n"
+ "cmp x27, #0x8\n"
+ "blt 170f\n"
+ "169:" // Height 5: Multiply loop: Odd block loop
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
+ "sub x27, x27, #0x8\n"
+ "ldr d0, [x22], #0x8\n"
+ "ldr q1, [x10, #0x0]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
+ ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x40]\n"
+ ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x4e86a48b // smmla v11.4s, v4.16b, v6.16b\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a45b // smmla v27.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n"
+ "bge 169b\n"
+ "170:" // Height 5: Multiply loop: Skip odd blocks
+ "cbz x27, 175f\n"
+ "tbz x27, #2, 172f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x22], #0x4\n"
+ "tbz x27, #1, 171f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "ld1 { v3.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x22], #0x2\n"
+ "tbz x27, #0, 174f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "ld1 { v3.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v5.b }[6], [x22]\n"
+ "b 174f\n"
+ "171:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 174f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "ld1 { v3.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v5.b }[4], [x22]\n"
+ "b 174f\n"
+ "172:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 173f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h5, [x22], #0x2\n"
+ "tbz x27, #0, 174f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v5.b }[2], [x22]\n"
+ "b 174f\n"
+ "173:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x23, #0x0]\n"
+ "ldr b5, [x22, #0x0]\n"
+ "174:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x10, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ "trn1 v2.2d, v5.2d, v0.2d\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x4e86a4e8 // smmla v8.4s, v7.16b, v6.16b\n"
+ ".inst 0x4e86a470 // smmla v16.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a458 // smmla v24.4s, v2.16b, v6.16b\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x4e81a4ec // smmla v12.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e81a474 // smmla v20.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x4e80a4e9 // smmla v9.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e80a471 // smmla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a459 // smmla v25.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x40]\n"
+ ".inst 0x4e81a4ed // smmla v13.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e81a475 // smmla v21.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45d // smmla v29.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x4e80a4ea // smmla v10.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e80a472 // smmla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45a // smmla v26.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x60]\n"
+ ".inst 0x4e81a4ee // smmla v14.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e81a476 // smmla v22.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45e // smmla v30.4s, v2.16b, v1.16b\n"
+ "ldr q6, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e80a473 // smmla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45b // smmla v27.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e86a4ef // smmla v15.4s, v7.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a45f // smmla v31.4s, v2.16b, v6.16b\n"
+ "175:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 163b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "cmp x11, #0x10\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "bge 184f\n"
+ "tbz x11, #3, 179f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 177f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 176f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "b 183f\n"
+ "176:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 183f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "b 183f\n"
+ "177:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 178f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "b 183f\n"
+ "178:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 183f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "b 183f\n"
+ "179:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 181f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 180f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "b 183f\n"
+ "180:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 183f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "b 183f\n"
+ "181:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 182f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "b 183f\n"
+ "182:" // Height 5: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "183:" // Height 5: Partial direct writeback: Done
+ "b 185f\n"
+ "184:" // Height 5: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q15, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q22, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "185:" // Height 5: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 150b\n"
+ "b 224f\n"
+ "186:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "187:" // Height 6: Column loop
+ "tbz %x[flags], #0, 198f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x20, x21, x20, LSL #2\n"
+ "bge 196f\n"
+ "tbz x11, #3, 191f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v29.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 189f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "ld1 { v30.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 188f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d6, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x11, #0, 195f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v6.s }[2], [x21]\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 195f\n"
+ "188:" // Height 6: Partial accumulate: partial_1_12
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 195f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s6, [x21, #0x0]\n"
+ "ldr s31, [x20, #0x0]\n"
+ "b 195f\n"
+ "189:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x11, #1, 190f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
+ "tbz x11, #0, 195f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "ld1 { v30.s }[2], [x20]\n"
+ "b 195f\n"
+ "190:" // Height 6: Partial accumulate: partial_1_8
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 195f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "ldr s30, [x20, #0x0]\n"
+ "b 195f\n"
+ "191:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x11, #2, 193f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 192f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "ldr d29, [x20], #0x8\n"
+ "tbz x11, #0, 195f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "ld1 { v29.s }[2], [x20]\n"
+ "b 195f\n"
+ "192:" // Height 6: Partial accumulate: partial_1_4
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 195f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "ldr s29, [x20, #0x0]\n"
+ "b 195f\n"
+ "193:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x11, #1, 194f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "tbz x11, #0, 195f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v28.s }[2], [x20]\n"
+ "b 195f\n"
+ "194:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "ldr s28, [x20, #0x0]\n"
+ "195:" // Height 6: Partial accumulate: Done
+ "sub x9, x9, x25\n"
+ "b 197f\n"
+ "196:" // Height 6: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q25, [x21, #0x0]\n"
+ "ldr q26, [x21, #0x10]\n"
+ "ldr q27, [x21, #0x20]\n"
+ "ldr q6, [x21, #0x30]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q29, [x20, #0x10]\n"
+ "ldr q30, [x20, #0x20]\n"
+ "ldr q31, [x20, #0x30]\n"
+ "197:" // Height 6: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 199f\n"
+ "198:" // Height 6: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "199:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "200:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 201f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 202f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
+ "b 202f\n"
+ "201:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
+ "202:" // Height 6: input setup done
+ "cmp x27, #0x10\n"
+ "blt 205f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "blt 204f\n"
+ "203:" // Height 6: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "sub x27, x27, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x40]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "ldr q2, [x25, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q0, [x10, #0x90]\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q6, [x10, #0xa0]\n"
+ ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xb0]\n"
+ ".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xc0]\n"
+ ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xd0]\n"
+ ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xe0]\n"
+ ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ "ldr q3, [x24, #0x0]\n"
+ ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "bge 203b\n"
+ "204:" // Height 6: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x40]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q2, [x10, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q0, [x10, #0xa0]\n"
+ ".inst 0x4e82a42c // smmla v12.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a474 // smmla v20.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4bc // smmla v28.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xb0]\n"
+ ".inst 0x4e80a429 // smmla v9.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a471 // smmla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4b9 // smmla v25.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xc0]\n"
+ ".inst 0x4e82a42d // smmla v13.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a475 // smmla v21.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4bd // smmla v29.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xd0]\n"
+ ".inst 0x4e80a42a // smmla v10.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a472 // smmla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4ba // smmla v26.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xe0]\n"
+ ".inst 0x4e82a42e // smmla v14.4s, v1.16b, v2.16b\n"
+ ".inst 0x4e82a476 // smmla v22.4s, v3.16b, v2.16b\n"
+ ".inst 0x4e82a4be // smmla v30.4s, v5.16b, v2.16b\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4e80a42b // smmla v11.4s, v1.16b, v0.16b\n"
+ ".inst 0x4e80a473 // smmla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a4bb // smmla v27.4s, v5.16b, v0.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ "205:" // Height 6: Multiply loop: Main loop skip
+ "cbz x27, 212f\n"
+ "cmp x27, #0x8\n"
+ "blt 207f\n"
+ "206:" // Height 6: Multiply loop: Odd block loop
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "sub x27, x27, #0x8\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
+ "cmp x27, #0x8\n"
+ "ldr d1, [x22], #0x8\n"
+ "ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x40]\n"
+ ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x4e86a48b // smmla v11.4s, v4.16b, v6.16b\n"
+ ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a45b // smmla v27.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n"
+ ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n"
+ "bge 206b\n"
+ "207:" // Height 6: Multiply loop: Skip odd blocks
+ "cbz x27, 212f\n"
+ "tbz x27, #2, 209f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x22], #0x4\n"
+ "ldr s6, [x21], #0x4\n"
+ "tbz x27, #1, 208f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "ld1 { v3.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x22], #0x2\n"
+ "ld1 { v6.h }[2], [x21], #0x2\n"
+ "tbz x27, #0, 211f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "ld1 { v3.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v5.b }[6], [x22]\n"
+ "ld1 { v6.b }[6], [x21]\n"
+ "b 211f\n"
+ "208:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 211f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "ld1 { v3.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v5.b }[4], [x22]\n"
+ "ld1 { v6.b }[4], [x21]\n"
+ "b 211f\n"
+ "209:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 210f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h5, [x22], #0x2\n"
+ "ldr h6, [x21], #0x2\n"
+ "tbz x27, #0, 211f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v5.b }[2], [x22]\n"
+ "ld1 { v6.b }[2], [x21]\n"
+ "b 211f\n"
+ "210:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x23, #0x0]\n"
+ "ldr b5, [x22, #0x0]\n"
+ "ldr b6, [x21, #0x0]\n"
+ "211:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q0, [x10, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e80a4e8 // smmla v8.4s, v7.16b, v0.16b\n"
+ "trn1 v2.2d, v5.2d, v6.2d\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x4e80a470 // smmla v16.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a458 // smmla v24.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x4e81a4ec // smmla v12.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e81a474 // smmla v20.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x4e80a4e9 // smmla v9.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e80a471 // smmla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a459 // smmla v25.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x40]\n"
+ ".inst 0x4e81a4ed // smmla v13.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e81a475 // smmla v21.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45d // smmla v29.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x4e80a4ea // smmla v10.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e80a472 // smmla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45a // smmla v26.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x60]\n"
+ ".inst 0x4e81a4ee // smmla v14.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e81a476 // smmla v22.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45e // smmla v30.4s, v2.16b, v1.16b\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x4e80a473 // smmla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45b // smmla v27.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e86a4ef // smmla v15.4s, v7.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a45f // smmla v31.4s, v2.16b, v6.16b\n"
+ "212:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 200b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x20, x21, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "bge 221f\n"
+ "tbz x11, #3, 216f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 214f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 213f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "b 220f\n"
+ "213:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 220f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "b 220f\n"
+ "214:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 215f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "b 220f\n"
+ "215:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 220f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "b 220f\n"
+ "216:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 218f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 217f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "b 220f\n"
+ "217:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 220f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "b 220f\n"
+ "218:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 219f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "b 220f\n"
+ "219:" // Height 6: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "220:" // Height 6: Partial direct writeback: Done
+ "b 222f\n"
+ "221:" // Height 6: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q15, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q22, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q23, [x21, #0x0]\n"
+ "str q28, [x21, #0x10]\n"
+ "str q29, [x21, #0x20]\n"
+ "str q30, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "222:" // Height 6: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 187b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 224f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 223f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "223:" // Update direct input
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "224:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
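
The hybrid s8s32 MMLA kernel above follows one scheme throughout: trn1/trn2 pair two adjacent A rows into a single 16-byte operand, each SMMLA accumulates a 2x2 int32 tile from that row pair and one B column pair, and uzp1/uzp2 (or zip1/zip2 on the accumulate path) de-interleave the tiles back into per-row result vectors before writeback; odd heights pair the last row with a don't-care register. A minimal scalar sketch of what one SMMLA step computes, assuming only the Armv8.6-A definition (a 2x8 int8 LHS times the transpose of a 2x8 int8 RHS, accumulated into a 2x2 int32 tile); smmla_ref and the test data are illustrative, not taken from the patch:

#include <cstdint>
#include <cstdio>

// One SMMLA: acc (2x2, row-major) += a (2x8, row-major) * transpose(b),
// where b holds the two RHS columns as 8-byte rows.
static void smmla_ref(int32_t acc[4], const int8_t a[16], const int8_t b[16])
{
    for (int i = 0; i < 2; ++i)
    {
        for (int j = 0; j < 2; ++j)
        {
            int32_t sum = 0;
            for (int k = 0; k < 8; ++k)
            {
                sum += (int32_t)a[i * 8 + k] * (int32_t)b[j * 8 + k];
            }
            acc[i * 2 + j] += sum;
        }
    }
}

int main()
{
    int8_t a[16], b[16];
    for (int k = 0; k < 8; ++k)
    {
        a[k]     = (int8_t)(k + 1); // A row 0, as trn1 packs the low half
        a[8 + k] = (int8_t)(-k);    // A row 1, packed into the high half
        b[k]     = 1;               // B column 0
        b[8 + k] = 2;               // B column 1
    }
    int32_t acc[4] = {0, 0, 0, 0};  // {r0c0, r0c1, r1c0, r1c1}; uzp1/uzp2 later split these
    smmla_ref(acc, a, b);
    printf("%d %d / %d %d\n", acc[0], acc[1], acc[2], acc[3]); // 36 72 / -28 -56
    return 0;
}
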
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
index 5d9d84815a..14aba00788 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
@@ -44,7 +44,8 @@ void a64_hybrid_u8qa_dot_4x16_a55( ARGLIST );
class cls_a64_hybrid_u8qa_dot_4x16
{
public:
- typedef uint8_t operand_type;
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
typedef uint8_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,24 @@ public:
return false;
}
- StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 7.5301 };
- default:
- return { 27.5482 };
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 7.5301 };
+ case CPUModel::A510:
+ return { 14.81 };
+ case CPUModel::V1:
+ return { 44.54 };
+ default:
+ return { 27.5482 };
+ }
}
+
+ return { 1.0 };
}
// Default to the generic kernel
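
The header change above templates get_performance_parameters on the operand type and extends the per-CPU MACs-per-cycle table with A510 and V1 entries; types other than uint8_t fall through to a neutral { 1.0 } so the kernel selector can still compare candidates. A self-contained sketch of that dispatch, with CPUInfo, CPUModel and PerformanceParameters mocked for illustration (the real definitions live elsewhere in arm_gemm and carry more fields):

#include <cstdint>
#include <cstdio>
#include <type_traits>

enum class CPUModel { GENERIC, A55r1, A510, V1 };
struct CPUInfo { CPUModel model; CPUModel get_cpu_model() const { return model; } };
struct PerformanceParameters { double kernel_macs_cycle; };

template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
    if (std::is_same<T, std::uint8_t>::value)
    {
        switch (ci->get_cpu_model())
        {
            case CPUModel::A55r1: return { 7.5301 };
            case CPUModel::A510:  return { 14.81 };
            case CPUModel::V1:    return { 44.54 };
            default:              return { 27.5482 };
        }
    }
    return { 1.0 }; // non-u8 instantiations get a neutral estimate
}

int main()
{
    CPUInfo ci{CPUModel::A510};
    printf("u8:   %.2f\n", get_performance_parameters<std::uint8_t>(&ci).kernel_macs_cycle);
    printf("fp32: %.2f\n", get_performance_parameters<float>(&ci).kernel_macs_cycle);
    return 0;
}
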
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
index 954e2891fb..00d063b426 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
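
Two quantisation details run through the u8qa hunk that follows. First, row sums are gathered on the fly with udot against the all-ones byte vector v15 and, after the string loop, negated b_offset scales them into the zero-point correction that is added to every accumulator. Second, the epilogue requantises each int32 accumulator with SQRDMULH (rounding doubling high multiply by the per-layer multiplier) and SRSHL by a negative shift (rounding arithmetic shift right), then adds c_offset and clamps to [minval, maxval] before narrowing with uzp1. A scalar sketch of that requantise arithmetic, following the Armv8 definitions of SQRDMULH and SRSHL; the constants are illustrative only:

#include <cstdint>
#include <cstdio>

static int32_t sqrdmulh(int32_t a, int32_t b)
{
    // sat((2*a*b + 2^31) >> 32); overflows only for a == b == INT32_MIN.
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;
    int64_t p = (int64_t)a * b;
    return (int32_t)((2 * p + ((int64_t)1 << 31)) >> 32);
}

static int32_t srshl(int32_t x, int shift)
{
    // Negative shift amounts perform a rounding arithmetic shift right.
    if (shift >= 0) return x << shift;
    int s = -shift;
    return (int32_t)(((int64_t)x + ((int64_t)1 << (s - 1))) >> s);
}

static uint8_t requantize(int32_t acc, int32_t mul, int shift, int32_t c_offset)
{
    int32_t v = srshl(sqrdmulh(acc, mul), shift) + c_offset;
    if (v < 0)   v = 0;    // minval clamp (smax in the kernel)
    if (v > 255) v = 255;  // maxval clamp (smin in the kernel)
    return (uint8_t)v;
}

int main()
{
    int32_t mul = 1340958802; // illustrative per-layer multiplier
    printf("%u\n", requantize(12345, mul, -7, 128)); // prints 188
    return 0;
}
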
@@ -78,341 +78,328 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x4\n"
"bge 91f\n"
"cmp %x[M], #0x2\n"
"bgt 61f\n"
"beq 31f\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v15.16b, #0x1\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x9, %x[col_bias]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "mov x28, %x[output_ptr]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
"3:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x11, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 6f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "cbnz x11, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x20\n"
"b 6f\n"
"5:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x9, %x[input_ptr]\n"
"6:" // Height 1: input setup done
- "cmp x26, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 11f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q4, [x10, #0x0]\n"
- "cmp x26, #0x20\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr d5, [x10, #0x10]\n"
- "ldr x24, [x10, #0x18]\n"
- "add x25, x25, #0x10\n"
- "ldr d6, [x10, #0x20]\n"
- "ldr x23, [x10, #0x28]\n"
- "mov v5.d[1], x24\n"
- "ldr d7, [x10, #0x30]\n"
- "ldr x19, [x10, #0x38]\n"
+ "ldr d21, [x12, #0x70]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v6.d[1], x23\n"
- "ldr d8, [x10, #0x40]\n"
+ "ldr d20, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "mov v7.d[1], x19\n"
- "ldr x23, [x10, #0x48]\n"
+ "ldr d26, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr d9, [x10, #0x50]\n"
- "ldr x19, [x10, #0x58]\n"
- "mov v8.d[1], x23\n"
- "ldr d10, [x10, #0x60]\n"
- "ldr x23, [x10, #0x68]\n"
+ "ldr d25, [x12, #0xa0]\n"
+ "mov v21.d[1], x20\n"
+ "ldr x20, [x12, #0x88]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "mov v9.d[1], x19\n"
- "ldr d4, [x10, #0x70]\n"
+ "ldr d24, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "mov v10.d[1], x23\n"
- "ldr x19, [x10, #0x78]\n"
+ "ldr d23, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr d5, [x10, #0x80]\n"
- "ldr x24, [x10, #0x88]\n"
- "mov v4.d[1], x19\n"
- "ldr d6, [x10, #0x90]\n"
- "ldr x23, [x10, #0x98]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v5.d[1], x24\n"
- "ldr d7, [x10, #0xa0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- "mov v6.d[1], x23\n"
- "ldr x19, [x10, #0xa8]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "ldr d8, [x10, #0xb0]\n"
- "ldr x23, [x10, #0xb8]\n"
- "mov v7.d[1], x19\n"
- "ldr d9, [x10, #0xc0]\n"
- "ldr x19, [x10, #0xc8]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- "mov v8.d[1], x23\n"
- "ldr d10, [x10, #0xd0]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- "mov v9.d[1], x19\n"
- "ldr x23, [x10, #0xd8]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- "ldr d4, [x10, #0xe0]\n"
- "ldr x19, [x10, #0xe8]\n"
- "mov v10.d[1], x23\n"
- "ldr d5, [x10, #0xf0]\n"
- "ldr x24, [x10, #0xf8]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- "mov v4.d[1], x19\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- "mov v5.d[1], x24\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ "ldr d22, [x12, #0xd0]\n"
+ ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr d21, [x12, #0xe0]\n"
+ "mov v20.d[1], x20\n"
+ "ldr x20, [x12, #0x98]\n"
+ "mov v26.d[1], x20\n"
+ "ldr x20, [x12, #0xa8]\n"
+ "mov v25.d[1], x20\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "mov v24.d[1], x20\n"
+ "ldr x23, [x12, #0xc8]\n"
+ ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr d20, [x12, #0xf0]\n"
+ ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n"
+ "ldr x22, [x12, #0xd8]\n"
+ ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n"
+ "ldr x21, [x12, #0xe8]\n"
+ ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n"
+ "ldr x20, [x12, #0xf8]\n"
+ "mov v23.d[1], x23\n"
+ "mov v22.d[1], x22\n"
+ "add x9, x9, #0x10\n"
+ "mov v21.d[1], x21\n"
+ "add x12, x12, #0x100\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 8f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x10\n"
- "ldr q0, [x25, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q4, [x12, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x10, #0x10]\n"
- "ldr q6, [x10, #0x20]\n"
- "sub x26, x26, #0x10\n"
- "ldr q7, [x10, #0x30]\n"
- "add x25, x25, #0x10\n"
+ "ldr q21, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q8, [x10, #0x40]\n"
+ "ldr q20, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x10, #0x50]\n"
+ "ldr q26, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q10, [x10, #0x60]\n"
+ "ldr q25, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q4, [x10, #0x70]\n"
+ "ldr q24, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q5, [x10, #0x80]\n"
+ "ldr q23, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q6, [x10, #0x90]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q7, [x10, #0xa0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr q8, [x10, #0xb0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "ldr q9, [x10, #0xc0]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- "ldr q10, [x10, #0xd0]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- "ldr q4, [x10, #0xe0]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- "ldr q5, [x10, #0xf0]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ "ldr q22, [x12, #0xd0]\n"
+ ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr q21, [x12, #0xe0]\n"
+ ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr q20, [x12, #0xf0]\n"
+ ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n"
+ "sub x10, x10, #0x10\n"
+ ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 10f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"10:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
"11:" // Height 1: Multiply loop: Main loop skip
- "cbz x26, 18f\n"
- "cmp x26, #0x4\n"
+ "cbz x10, 18f\n"
+ "cmp x10, #0x4\n"
"blt 14f\n"
"12:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
"tbnz %x[flags], #31, 13f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q6, [x10, #0x0]\n"
- "sub x26, x26, #0x4\n"
- "ldr q7, [x10, #0x10]\n"
- "cmp x26, #0x4\n"
- "ldr q8, [x10, #0x20]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x10, #0x30]\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q22, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q21, [x12, #0x20]\n"
+ ".inst 0x6f80e290 // udot v16.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x30]\n"
+ ".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n"
+ ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n"
"bge 12b\n"
- "cbz x26, 18f\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x26, #1, 15f\n"
- "ldr h0, [x25], #0x2\n"
- "tbz x26, #0, 16f\n"
- "ld1 { v0.b }[2], [x25]\n"
+ "cbz x10, 18f\n"
+ "tbz x10, #1, 15f\n"
+ "ldr h0, [x9], #0x2\n"
+ "tbz x10, #0, 16f\n"
+ "ld1 { v0.b }[2], [x9]\n"
"b 16f\n"
"15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
"16:" // Height 1: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 17f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"17:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q10, [x10, #0x0]\n"
- "ldr q4, [x10, #0x10]\n"
- "ldr q5, [x10, #0x20]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x0]\n"
+ ".inst 0x6f80e290 // udot v16.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x10]\n"
+ ".inst 0x6f80e291 // udot v17.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x20]\n"
+ ".inst 0x6f80e292 // udot v18.4s, v20.16b, v0.4b[0]\n"
+ "ldr q20, [x12, #0x30]\n"
+ ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
"18:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 4b\n"
- "prfm pstl1keep, [x28, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
"tbnz %x[flags], #31, 19f\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v1.4s }, [x22]\n"
- "neg v1.4s, v1.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "neg v20.4s, v20.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "mul v11.4s, v11.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v20.4s\n"
"19:" // Height 1: skip row sum fixup
+ "ldr q23, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q22, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
+ "ldr q21, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
+ "ldr q20, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q0, [x9, #0x0]\n"
+ "add v16.4s, v16.4s, v23.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v20.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "ldr q1, [x9, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ldr q2, [x9, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "ldr q3, [x9, #0x30]\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "ld1r { v4.4s }, [x22]\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add x9, x9, #0x40\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v20.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 20f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v0.16b\n"
+ "and v21.16b, v18.16b, v0.16b\n"
+ "and v20.16b, v19.16b, v0.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
"20:" // Height 1: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
"srshl v18.4s, v18.4s, v0.4s\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "cmp x11, #0x10\n"
- "ld1r { v6.4s }, [x22]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v20.4s\n"
+ "add v17.4s, v17.4s, v20.4s\n"
+ "add v18.4s, v18.4s, v20.4s\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
+ "cmp x14, #0x10\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 29f\n"
- "tbz x11, #3, 24f\n"
- "str d16, [x28], #0x8\n"
- "tbz x11, #2, 22f\n"
- "st1 { v16.s }[2], [x28], #0x4\n"
- "tbz x11, #1, 21f\n"
- "st1 { v16.h }[6], [x28], #0x2\n"
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[14], [x28]\n"
+ "tbz x14, #3, 24f\n"
+ "str d16, [x13], #0x8\n"
+ "tbz x14, #2, 22f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "tbz x14, #1, 21f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[14], [x13]\n"
"b 28f\n"
"21:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[12], [x28]\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[12], [x13]\n"
"b 28f\n"
"22:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x11, #1, 23f\n"
- "st1 { v16.h }[4], [x28], #0x2\n"
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[10], [x28]\n"
+ "tbz x14, #1, 23f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[10], [x13]\n"
"b 28f\n"
"23:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[8], [x28]\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[8], [x13]\n"
"b 28f\n"
"24:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x11, #2, 26f\n"
- "str s16, [x28], #0x4\n"
- "tbz x11, #1, 25f\n"
- "st1 { v16.h }[2], [x28], #0x2\n"
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[6], [x28]\n"
+ "tbz x14, #2, 26f\n"
+ "str s16, [x13], #0x4\n"
+ "tbz x14, #1, 25f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[6], [x13]\n"
"b 28f\n"
"25:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[4], [x28]\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[4], [x13]\n"
"b 28f\n"
"26:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x11, #1, 27f\n"
- "str h16, [x28], #0x2\n"
- "tbz x11, #0, 28f\n"
- "st1 { v16.b }[2], [x28]\n"
+ "tbz x14, #1, 27f\n"
+ "str h16, [x13], #0x2\n"
+ "tbz x14, #0, 28f\n"
+ "st1 { v16.b }[2], [x13]\n"
"b 28f\n"
"27:" // Height 1: Partial direct writeback: partial_1_0
- "str b16, [x28, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
"28:" // Height 1: Partial direct writeback: Done
"b 30f\n"
"29:" // Height 1: Full writeback
- "str q16, [x28, #0x0]\n"
- "add x28, x28, #0x10\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
"30:" // Height 1: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 2b\n"
"b 122f\n"
"31:" // Height 2
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "movi v15.16b, #0x1\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x9, %x[col_bias]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "mov x28, %x[output_ptr]\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"32:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -423,319 +410,307 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
"33:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x11, #0x0\n"
"34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 35f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "cbnz x27, 36f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x11, 36f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
"b 36f\n"
"35:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x22, x25, x19\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
"36:" // Height 2: input setup done
- "cmp x26, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 41f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 39f\n"
"37:" // Height 2: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr d5, [x10, #0x10]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x24, [x10, #0x18]\n"
- "ldr d6, [x10, #0x20]\n"
- "add x25, x25, #0x10\n"
- "ldr x23, [x10, #0x28]\n"
- "add x22, x22, #0x10\n"
- "mov v5.d[1], x24\n"
- "ldr d7, [x10, #0x30]\n"
- "ldr x19, [x10, #0x38]\n"
+ "ldr d25, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v6.d[1], x23\n"
+ "mov v25.d[1], x20\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr d8, [x10, #0x40]\n"
+ "ldr d24, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "mov v7.d[1], x19\n"
+ "ldr x23, [x12, #0x88]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr x23, [x10, #0x48]\n"
+ "ldr d30, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr d9, [x10, #0x50]\n"
+ "ldr x22, [x12, #0x98]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr x19, [x10, #0x58]\n"
- "mov v8.d[1], x23\n"
- "ldr d10, [x10, #0x60]\n"
- "ldr x23, [x10, #0x68]\n"
+ "ldr d29, [x12, #0xa0]\n"
+ "ldr x21, [x12, #0xa8]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "mov v9.d[1], x19\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr d4, [x10, #0x70]\n"
+ "ldr d28, [x12, #0xb0]\n"
+ "ldr x20, [x12, #0xb8]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "mov v10.d[1], x23\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr x19, [x10, #0x78]\n"
+ "ldr d27, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr d5, [x10, #0x80]\n"
+ "mov v24.d[1], x23\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr x24, [x10, #0x88]\n"
- "mov v4.d[1], x19\n"
- "ldr d6, [x10, #0x90]\n"
- "ldr x23, [x10, #0x98]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v5.d[1], x24\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr d7, [x10, #0xa0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- "mov v6.d[1], x23\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr x19, [x10, #0xa8]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "ldr d8, [x10, #0xb0]\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- "ldr x23, [x10, #0xb8]\n"
- "mov v7.d[1], x19\n"
- "ldr d9, [x10, #0xc0]\n"
- "ldr x19, [x10, #0xc8]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- "mov v8.d[1], x23\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- "ldr d10, [x10, #0xd0]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- "mov v9.d[1], x19\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- "ldr x23, [x10, #0xd8]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- "ldr d4, [x10, #0xe0]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- "ldr x19, [x10, #0xe8]\n"
- "mov v10.d[1], x23\n"
- "ldr d5, [x10, #0xf0]\n"
- "ldr x24, [x10, #0xf8]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- "mov v4.d[1], x19\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- "mov v5.d[1], x24\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ "ldr d26, [x12, #0xd0]\n"
+ ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n"
+ "mov v30.d[1], x22\n"
+ ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr d25, [x12, #0xe0]\n"
+ "mov v29.d[1], x21\n"
+ "ldr x23, [x12, #0xc8]\n"
+ "mov v28.d[1], x20\n"
+ "ldr x22, [x12, #0xd8]\n"
+ "ldr x21, [x12, #0xe8]\n"
+ ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr d24, [x12, #0xf0]\n"
+ "ldr x20, [x12, #0xf8]\n"
+ ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ "mov v27.d[1], x23\n"
+ ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n"
+ "mov v26.d[1], x22\n"
+ ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ "add x28, x28, #0x10\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 38f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"38:" // Height 2: Multiply loop: unique 5: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x26, #0x20\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"bge 37b\n"
"39:" // Height 2: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x10, #0x10]\n"
+ "sub x10, x10, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
- "ldr q7, [x10, #0x30]\n"
- "sub x26, x26, #0x10\n"
+ "ldr q25, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q8, [x10, #0x40]\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q9, [x10, #0x50]\n"
+ "ldr q24, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q10, [x10, #0x60]\n"
+ "add x28, x28, #0x10\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q4, [x10, #0x70]\n"
+ "ldr q30, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q5, [x10, #0x80]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x90]\n"
+ "ldr q29, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q7, [x10, #0xa0]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q8, [x10, #0xb0]\n"
+ "ldr q28, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "add x25, x25, #0x10\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x10, #0xc0]\n"
+ "ldr q27, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "add x22, x22, #0x10\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x10, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x10, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x10, #0xf0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ "ldr q26, [x12, #0xd0]\n"
+ ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr q25, [x12, #0xe0]\n"
+ ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr q24, [x12, #0xf0]\n"
+ ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 40f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"40:" // Height 2: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
"41:" // Height 2: Multiply loop: Main loop skip
- "cbz x26, 48f\n"
- "cmp x26, #0x4\n"
+ "cbz x10, 48f\n"
+ "cmp x10, #0x4\n"
"blt 44f\n"
"42:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "ldr s1, [x22], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
"tbnz %x[flags], #31, 43f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"43:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q6, [x10, #0x0]\n"
- "sub x26, x26, #0x4\n"
- "ldr q7, [x10, #0x10]\n"
- "cmp x26, #0x4\n"
- "ldr q8, [x10, #0x20]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x10, #0x30]\n"
- ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
+ "ldr q27, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q26, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q25, [x12, #0x20]\n"
+ ".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n"
+ "ldr q24, [x12, #0x30]\n"
+ ".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n"
+ ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n"
+ ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n"
"bge 42b\n"
- "cbz x26, 48f\n"
"44:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x26, #1, 45f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x22], #0x2\n"
- "tbz x26, #0, 46f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x22]\n"
+ "cbz x10, 48f\n"
+ "tbz x10, #1, 45f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "tbz x10, #0, 46f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
"b 46f\n"
"45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x22, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
"46:" // Height 2: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 47f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"47:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q10, [x10, #0x0]\n"
- "ldr q4, [x10, #0x10]\n"
- "ldr q5, [x10, #0x20]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
+ "ldr q24, [x12, #0x0]\n"
+ ".inst 0x6f80e310 // udot v16.4s, v24.16b, v0.4b[0]\n"
+ "ldr q26, [x12, #0x10]\n"
+ ".inst 0x6f81e314 // udot v20.4s, v24.16b, v1.4b[0]\n"
+ "ldr q25, [x12, #0x20]\n"
+ ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n"
+ "ldr q24, [x12, #0x30]\n"
+ ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n"
+ ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n"
"48:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 34b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x21, x28, x19\n"
- "prfm pstl1keep, [x21, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbnz %x[flags], #31, 49f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x22]\n"
- "neg v2.4s, v2.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "neg v24.4s, v24.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "mul v11.4s, v11.4s, v2.4s\n"
- "mul v12.4s, v12.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v24.4s\n"
+ "mul v12.4s, v12.4s, v24.4s\n"
"49:" // Height 2: skip row sum fixup
+ "ldr q27, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q26, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
+ "ldr q25, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
+ "ldr q24, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
- "ldr q0, [x9, #0x0]\n"
+ "add v16.4s, v16.4s, v27.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v25.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v24.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "ldr q1, [x9, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ldr q2, [x9, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "ldr q3, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "ld1r { v0.4s }, [x23]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v24.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 50f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v24.16b, v16.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
+ "and v30.16b, v17.16b, v0.16b\n"
+ "and v29.16b, v18.16b, v0.16b\n"
+ "and v28.16b, v19.16b, v0.16b\n"
+ "and v27.16b, v20.16b, v0.16b\n"
+ "and v26.16b, v21.16b, v0.16b\n"
+ "and v25.16b, v22.16b, v0.16b\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v29.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sqadd v21.4s, v21.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"50:" // Height 2: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
@@ -745,122 +720,122 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "cmp x11, #0x10\n"
- "ld1r { v6.4s }, [x22]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v24.4s\n"
+ "add v18.4s, v18.4s, v24.4s\n"
+ "add v19.4s, v19.4s, v24.4s\n"
+ "add v20.4s, v20.4s, v24.4s\n"
+ "add v21.4s, v21.4s, v24.4s\n"
+ "add v22.4s, v22.4s, v24.4s\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v24.4s\n"
+ "smin v17.4s, v17.4s, v24.4s\n"
+ "smin v18.4s, v18.4s, v24.4s\n"
+ "smin v19.4s, v19.4s, v24.4s\n"
+ "smin v20.4s, v20.4s, v24.4s\n"
+ "smin v21.4s, v21.4s, v24.4s\n"
+ "smin v22.4s, v22.4s, v24.4s\n"
+ "smin v23.4s, v23.4s, v24.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v17.8h, v22.8h, v23.8h\n"
+ "cmp x14, #0x10\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 59f\n"
- "tbz x11, #3, 54f\n"
- "str d16, [x28], #0x8\n"
- "str d20, [x21], #0x8\n"
- "tbz x11, #2, 52f\n"
- "st1 { v16.s }[2], [x28], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "tbz x11, #1, 51f\n"
- "st1 { v16.h }[6], [x28], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[14], [x28]\n"
- "st1 { v20.b }[14], [x21]\n"
+ "tbz x14, #3, 54f\n"
+ "str d16, [x13], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "tbz x14, #2, 52f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "tbz x14, #1, 51f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[14], [x13]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 58f\n"
"51:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[12], [x28]\n"
- "st1 { v20.b }[12], [x21]\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[12], [x13]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 58f\n"
"52:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x11, #1, 53f\n"
- "st1 { v16.h }[4], [x28], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[10], [x28]\n"
- "st1 { v20.b }[10], [x21]\n"
+ "tbz x14, #1, 53f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[10], [x13]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 58f\n"
"53:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[8], [x28]\n"
- "st1 { v20.b }[8], [x21]\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[8], [x13]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 58f\n"
"54:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x11, #2, 56f\n"
- "str s16, [x28], #0x4\n"
- "str s20, [x21], #0x4\n"
- "tbz x11, #1, 55f\n"
- "st1 { v16.h }[2], [x28], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[6], [x28]\n"
- "st1 { v20.b }[6], [x21]\n"
+ "tbz x14, #2, 56f\n"
+ "str s16, [x13], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "tbz x14, #1, 55f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[6], [x13]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 58f\n"
"55:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[4], [x28]\n"
- "st1 { v20.b }[4], [x21]\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[4], [x13]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 58f\n"
"56:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x11, #1, 57f\n"
- "str h16, [x28], #0x2\n"
- "str h20, [x21], #0x2\n"
- "tbz x11, #0, 58f\n"
- "st1 { v16.b }[2], [x28]\n"
- "st1 { v20.b }[2], [x21]\n"
+ "tbz x14, #1, 57f\n"
+ "str h16, [x13], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "tbz x14, #0, 58f\n"
+ "st1 { v16.b }[2], [x13]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 58f\n"
"57:" // Height 2: Partial direct writeback: partial_1_0
- "str b16, [x28, #0x0]\n"
- "str b20, [x21, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
"59:" // Height 2: Full writeback
- "str q16, [x28, #0x0]\n"
- "add x28, x28, #0x10\n"
- "str q20, [x21, #0x0]\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q20, [x23, #0x0]\n"
"60:" // Height 2: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 32b\n"
"b 122f\n"
"61:" // Height 3
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"movi v15.16b, #0x1\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[col_bias]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "mov x28, %x[output_ptr]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
"62:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -875,325 +850,317 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
"63:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x11, #0x0\n"
"64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 65f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "ldr x21, [x20, #0x10]\n"
- "cbnz x27, 66f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "cbnz x11, 66f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
+ "add x27, x27, x20\n"
"b 66f\n"
"65:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x22, x25, x19\n"
- "add x21, x22, x19\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
+ "add x27, x28, x21\n"
"66:" // Height 3: input setup done
- "cmp x26, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 71f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 69f\n"
"67:" // Height 3: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr d5, [x10, #0x10]\n"
+ "ldr x20, [x12, #0x78]\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x24, [x10, #0x18]\n"
+ "ldr x23, [x12, #0x88]\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr d6, [x10, #0x20]\n"
- "ldr x23, [x10, #0x28]\n"
- "add x25, x25, #0x10\n"
- "mov v5.d[1], x24\n"
- "ldr d7, [x10, #0x30]\n"
- "ldr x19, [x10, #0x38]\n"
- "add x22, x22, #0x10\n"
+ "ldr d29, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v6.d[1], x23\n"
+ "mov v29.d[1], x20\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr d8, [x10, #0x40]\n"
+ "ldr x22, [x12, #0x98]\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "mov v7.d[1], x19\n"
+ "ldr d28, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr x23, [x10, #0x48]\n"
+ "ldr x21, [x12, #0xa8]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr d9, [x10, #0x50]\n"
+ "ldr x20, [x12, #0xb8]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr x19, [x10, #0x58]\n"
+ "ldr d5, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "mov v8.d[1], x23\n"
+ "mov v28.d[1], x23\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr d10, [x10, #0x60]\n"
+ "mov v5.d[1], x22\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "mov v9.d[1], x19\n"
+ "ldr d4, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr x23, [x10, #0x68]\n"
+ "mov v4.d[1], x21\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr d4, [x10, #0x70]\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr x19, [x10, #0x78]\n"
+ "ldr d3, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "mov v10.d[1], x23\n"
+ "mov v3.d[1], x20\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr d5, [x10, #0x80]\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "mov v4.d[1], x19\n"
+ "ldr d31, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr x24, [x10, #0x88]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr d6, [x10, #0x90]\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr x23, [x10, #0x98]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v5.d[1], x24\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr d7, [x10, #0xa0]\n"
- ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
- "mov v6.d[1], x23\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr x19, [x10, #0xa8]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr d8, [x10, #0xb0]\n"
- ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr x23, [x10, #0xb8]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "mov v7.d[1], x19\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- "ldr d9, [x10, #0xc0]\n"
- ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
- "mov v8.d[1], x23\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- "ldr x19, [x10, #0xc8]\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- "ldr d10, [x10, #0xd0]\n"
- ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
- "ldr x23, [x10, #0xd8]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- "mov v9.d[1], x19\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- "ldr d4, [x10, #0xe0]\n"
- ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
- "mov v10.d[1], x23\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- "ldr x19, [x10, #0xe8]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- "ldr d5, [x10, #0xf0]\n"
- ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
- "ldr x24, [x10, #0xf8]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- "mov v4.d[1], x19\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- "add x21, x21, #0x10\n"
- ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
- "mov v5.d[1], x24\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ "ldr d30, [x12, #0xd0]\n"
+ ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n"
+ "mov v31.d[1], x23\n"
+ ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n"
+ "mov v30.d[1], x22\n"
+ ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr d29, [x12, #0xe0]\n"
+ ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr d28, [x12, #0xf0]\n"
+ ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n"
+ "add x27, x27, #0x10\n"
+ ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 68f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"68:" // Height 3: Multiply loop: unique 9: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x26, #0x20\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"bge 67b\n"
"69:" // Height 3: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x10, #0x10]\n"
+ "sub x10, x10, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "ldr q29, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q8, [x10, #0x40]\n"
+ "add x28, x28, #0x10\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q9, [x10, #0x50]\n"
+ "add x27, x27, #0x10\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q10, [x10, #0x60]\n"
+ "ldr q28, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q4, [x10, #0x70]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q5, [x10, #0x80]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x10, #0x90]\n"
+ "ldr q5, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "sub x26, x26, #0x10\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "add x25, x25, #0x10\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0xa0]\n"
+ "ldr q4, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "add x22, x22, #0x10\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "add x21, x21, #0x10\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x10, #0xb0]\n"
+ "ldr q3, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x10, #0xc0]\n"
+ "ldr q31, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x10, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x10, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x10, #0xf0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "add x10, x10, #0x100\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ "ldr q30, [x12, #0xd0]\n"
+ ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr q29, [x12, #0xe0]\n"
+ ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr q28, [x12, #0xf0]\n"
+ ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 70f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"70:" // Height 3: Multiply loop: unique 10: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
"71:" // Height 3: Multiply loop: Main loop skip
- "cbz x26, 78f\n"
- "cmp x26, #0x4\n"
+ "cbz x10, 78f\n"
+ "cmp x10, #0x4\n"
"blt 74f\n"
"72:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "ldr s1, [x22], #0x4\n"
- "ldr s2, [x21], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
"tbnz %x[flags], #31, 73f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"73:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q6, [x10, #0x0]\n"
- "sub x26, x26, #0x4\n"
- "ldr q7, [x10, #0x10]\n"
- "cmp x26, #0x4\n"
- "ldr q8, [x10, #0x20]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x10, #0x30]\n"
- ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
+ "ldr q31, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q30, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q29, [x12, #0x20]\n"
+ ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n"
+ "ldr q28, [x12, #0x30]\n"
+ ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n"
+ ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n"
"bge 72b\n"
- "cbz x26, 78f\n"
"74:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x26, #1, 75f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x22], #0x2\n"
- "ldr h2, [x21], #0x2\n"
- "tbz x26, #0, 76f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x22]\n"
- "ld1 { v2.b }[2], [x21]\n"
+ "cbz x10, 78f\n"
+ "tbz x10, #1, 75f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "tbz x10, #0, 76f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x27]\n"
"b 76f\n"
"75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x22, #0x0]\n"
- "ldr b2, [x21, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x27, #0x0]\n"
"76:" // Height 3: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 77f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q10, [x10, #0x0]\n"
- "ldr q4, [x10, #0x10]\n"
- "ldr q5, [x10, #0x20]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n"
+ "ldr q28, [x12, #0x0]\n"
+ ".inst 0x6f80e390 // udot v16.4s, v28.16b, v0.4b[0]\n"
+ "ldr q30, [x12, #0x10]\n"
+ ".inst 0x6f81e394 // udot v20.4s, v28.16b, v1.4b[0]\n"
+ "ldr q29, [x12, #0x20]\n"
+ ".inst 0x6f82e398 // udot v24.4s, v28.16b, v2.4b[0]\n"
+ "ldr q28, [x12, #0x30]\n"
+ ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n"
+ ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n"
"78:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 64b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x21, x28, x19\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19\n"
- "prfm pstl1keep, [x20, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbnz %x[flags], #31, 79f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v3.4s }, [x22]\n"
- "neg v3.4s, v3.4s\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "neg v28.4s, v28.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "mul v11.4s, v11.4s, v3.4s\n"
- "mul v12.4s, v12.4s, v3.4s\n"
- "mul v13.4s, v13.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v28.4s\n"
+ "mul v12.4s, v12.4s, v28.4s\n"
+ "mul v13.4s, v13.4s, v28.4s\n"
"79:" // Height 3: skip row sum fixup
+ "ldr q31, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q30, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
+ "ldr q29, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
+ "ldr q28, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
@@ -1203,77 +1170,73 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"add v25.4s, v25.4s, v13.4s\n"
"add v26.4s, v26.4s, v13.4s\n"
"add v27.4s, v27.4s, v13.4s\n"
- "ldr q0, [x9, #0x0]\n"
+ "add v16.4s, v16.4s, v31.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v31.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v28.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "ldr q1, [x9, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ldr q2, [x9, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "ldr q3, [x9, #0x30]\n"
- "ld1r { v0.4s }, [x23]\n"
- "add x9, x9, #0x40\n"
- "ld1r { v4.4s }, [x22]\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v28.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v28.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v28.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v28.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v28.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v28.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v28.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v28.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v28.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 80f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
+ "and v1.16b, v16.16b, v0.16b\n"
+ "and v31.16b, v17.16b, v0.16b\n"
+ "and v30.16b, v18.16b, v0.16b\n"
+ "and v29.16b, v19.16b, v0.16b\n"
+ "and v28.16b, v20.16b, v0.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "sqadd v17.4s, v17.4s, v31.4s\n"
+ "sqadd v18.4s, v18.4s, v30.4s\n"
+ "sqadd v19.4s, v19.4s, v29.4s\n"
+ "sqadd v20.4s, v20.4s, v28.4s\n"
+ "and v3.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v22.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
+ "and v31.16b, v24.16b, v0.16b\n"
+ "and v30.16b, v25.16b, v0.16b\n"
+ "and v29.16b, v26.16b, v0.16b\n"
+ "and v28.16b, v27.16b, v0.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v3.4s\n"
+ "sqadd v22.4s, v22.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v1.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "sqadd v27.4s, v27.4s, v28.4s\n"
"80:" // Height 3: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
@@ -1287,157 +1250,157 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"srshl v25.4s, v25.4s, v0.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "cmp x11, #0x10\n"
- "ld1r { v6.4s }, [x22]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v28.4s\n"
+ "add v18.4s, v18.4s, v28.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v28.4s\n"
+ "add v22.4s, v22.4s, v28.4s\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v28.4s\n"
+ "add v25.4s, v25.4s, v28.4s\n"
+ "add v26.4s, v26.4s, v28.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v23.4s, v23.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v28.4s\n"
+ "smax v17.4s, v17.4s, v28.4s\n"
+ "smax v18.4s, v18.4s, v28.4s\n"
+ "smax v19.4s, v19.4s, v28.4s\n"
+ "smax v20.4s, v20.4s, v28.4s\n"
+ "smax v21.4s, v21.4s, v28.4s\n"
+ "smax v22.4s, v22.4s, v28.4s\n"
+ "smax v23.4s, v23.4s, v28.4s\n"
+ "smax v24.4s, v24.4s, v28.4s\n"
+ "smax v25.4s, v25.4s, v28.4s\n"
+ "smax v26.4s, v26.4s, v28.4s\n"
+ "smax v27.4s, v27.4s, v28.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "cmp x14, #0x10\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 89f\n"
- "tbz x11, #3, 84f\n"
- "str d16, [x28], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "tbz x11, #2, 82f\n"
- "st1 { v16.s }[2], [x28], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
- "tbz x11, #1, 81f\n"
- "st1 { v16.h }[6], [x28], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[14], [x28]\n"
- "st1 { v20.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
+ "tbz x14, #3, 84f\n"
+ "str d16, [x13], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x14, #2, 82f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x14, #1, 81f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[14], [x13]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 88f\n"
"81:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[12], [x28]\n"
- "st1 { v20.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[12], [x13]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 88f\n"
"82:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x11, #1, 83f\n"
- "st1 { v16.h }[4], [x28], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[10], [x28]\n"
- "st1 { v20.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
+ "tbz x14, #1, 83f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[10], [x13]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 88f\n"
"83:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[8], [x28]\n"
- "st1 { v20.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[8], [x13]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 88f\n"
"84:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x11, #2, 86f\n"
- "str s16, [x28], #0x4\n"
- "str s20, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
- "tbz x11, #1, 85f\n"
- "st1 { v16.h }[2], [x28], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[6], [x28]\n"
- "st1 { v20.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
+ "tbz x14, #2, 86f\n"
+ "str s16, [x13], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x14, #1, 85f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[6], [x13]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 88f\n"
"85:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[4], [x28]\n"
- "st1 { v20.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[4], [x13]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 88f\n"
"86:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x11, #1, 87f\n"
- "str h16, [x28], #0x2\n"
- "str h20, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
- "tbz x11, #0, 88f\n"
- "st1 { v16.b }[2], [x28]\n"
- "st1 { v20.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
+ "tbz x14, #1, 87f\n"
+ "str h16, [x13], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "tbz x14, #0, 88f\n"
+ "st1 { v16.b }[2], [x13]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 88f\n"
"87:" // Height 3: Partial direct writeback: partial_1_0
- "str b16, [x28, #0x0]\n"
- "str b20, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"88:" // Height 3: Partial direct writeback: Done
"b 90f\n"
"89:" // Height 3: Full writeback
- "str q16, [x28, #0x0]\n"
- "add x28, x28, #0x10\n"
- "str q20, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"90:" // Height 3: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 62b\n"
"b 122f\n"
"91:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x4\n"
+ "mov x15, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
"movi v12.4s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
"movi v14.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"movi v15.16b, #0x1\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x9, %x[col_bias]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "mov x28, %x[output_ptr]\n"
- "mov x19, #0x4\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"92:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -1456,137 +1419,125 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
"93:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x11, #0x0\n"
"94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 95f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "ldr x21, [x20, #0x10]\n"
- "ldr x20, [x20, #0x18]\n"
- "cbnz x27, 96f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "ldr x26, [x20, #0x18]\n"
+ "cbnz x11, 96f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
"b 96f\n"
"95:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x22, x25, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "mov x9, %x[input_ptr]\n"
+ "add x28, x9, x21\n"
+ "add x27, x28, x21\n"
+ "add x26, x27, x21\n"
"96:" // Height 4: input setup done
- "cmp x26, #0x10\n"
+ "cmp x10, #0x10\n"
"blt 101f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q3, [x20, #0x0]\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x26, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
"blt 99f\n"
"97:" // Height 4: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr d5, [x10, #0x10]\n"
+ "ldr x22, [x12, #0x78]\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x24, [x10, #0x18]\n"
+ "ldr x21, [x12, #0x88]\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr d6, [x10, #0x20]\n"
+ "ldr x20, [x12, #0x98]\n"
".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr x23, [x10, #0x28]\n"
- "mov v5.d[1], x24\n"
- "ldr d7, [x10, #0x30]\n"
- "ldr x19, [x10, #0x38]\n"
- "add x25, x25, #0x10\n"
+ "ldr d4, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "mov v6.d[1], x23\n"
+ "mov v4.d[1], x22\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr d8, [x10, #0x40]\n"
+ "ldr x25, [x12, #0xa8]\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "mov v7.d[1], x19\n"
+ "ldr x24, [x12, #0xb8]\n"
".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr x23, [x10, #0x48]\n"
+ "ldr d5, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr d9, [x10, #0x50]\n"
+ "mov v5.d[1], x21\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr x19, [x10, #0x58]\n"
+ "ldr x23, [x12, #0xc8]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "mov v8.d[1], x23\n"
+ "ldr x22, [x12, #0xd8]\n"
".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr d10, [x10, #0x60]\n"
+ "ldr d6, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "mov v9.d[1], x19\n"
+ "mov v6.d[1], x20\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr x23, [x10, #0x68]\n"
+ "ldr x21, [x12, #0xe8]\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr d4, [x10, #0x70]\n"
+ "ldr x20, [x12, #0xf8]\n"
".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr x19, [x10, #0x78]\n"
+ "ldr d7, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "mov v10.d[1], x23\n"
+ "mov v7.d[1], x25\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr d5, [x10, #0x80]\n"
+ "add x9, x9, #0x10\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "mov v4.d[1], x19\n"
+ "add x28, x28, #0x10\n"
".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr x24, [x10, #0x88]\n"
+ "ldr d8, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr d6, [x10, #0x90]\n"
+ "mov v8.d[1], x24\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr x23, [x10, #0x98]\n"
+ "add x27, x27, #0x10\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "mov v5.d[1], x24\n"
+ "add x26, x26, #0x10\n"
".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr d7, [x10, #0xa0]\n"
+ "ldr d9, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "mov v6.d[1], x23\n"
+ "mov v9.d[1], x23\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr x19, [x10, #0xa8]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr d8, [x10, #0xb0]\n"
".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr x23, [x10, #0xb8]\n"
+ "ldr d10, [x12, #0xd0]\n"
".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "mov v7.d[1], x19\n"
+ "mov v10.d[1], x22\n"
".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr d9, [x10, #0xc0]\n"
".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
- "mov v8.d[1], x23\n"
".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr x19, [x10, #0xc8]\n"
+ "ldr d4, [x12, #0xe0]\n"
".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr d10, [x10, #0xd0]\n"
+ "mov v4.d[1], x21\n"
".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr x23, [x10, #0xd8]\n"
".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
- "mov v9.d[1], x19\n"
".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr d4, [x10, #0xe0]\n"
+ "ldr d5, [x12, #0xf0]\n"
".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "mov v10.d[1], x23\n"
+ "mov v5.d[1], x20\n"
".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- "ldr x19, [x10, #0xe8]\n"
+ "add x12, x12, #0x100\n"
".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
- "ldr d5, [x10, #0xf0]\n"
".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n"
- "ldr x24, [x10, #0xf8]\n"
".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- "mov v4.d[1], x19\n"
".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- "add x22, x22, #0x10\n"
".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
- "mov v5.d[1], x24\n"
".inst 0x6f83e8fe // udot v30.4s, v7.16b, v3.4b[2]\n"
- "add x21, x21, #0x10\n"
".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- "add x20, x20, #0x10\n"
".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- "add x10, x10, #0x100\n"
".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
@@ -1611,77 +1562,77 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"98:" // Height 4: Multiply loop: unique 13: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x26, #0x20\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q3, [x20, #0x0]\n"
- "ldr q4, [x10, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x26, #0x0]\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 97b\n"
"99:" // Height 4: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x10, #0x10]\n"
+ "sub x10, x10, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x10, #0x20]\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x10, #0x30]\n"
+ "add x28, x28, #0x10\n"
".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q8, [x10, #0x40]\n"
+ "ldr q4, [x12, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q9, [x10, #0x50]\n"
+ "add x27, x27, #0x10\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q10, [x10, #0x60]\n"
+ "add x26, x26, #0x10\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q4, [x10, #0x70]\n"
".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr q5, [x10, #0x80]\n"
+ "ldr q5, [x12, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "sub x26, x26, #0x10\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "add x25, x25, #0x10\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "add x22, x22, #0x10\n"
".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x10, #0x90]\n"
+ "ldr q6, [x12, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "add x21, x21, #0x10\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "add x20, x20, #0x10\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x10, #0xa0]\n"
+ "ldr q7, [x12, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr q8, [x10, #0xb0]\n"
+ "ldr q8, [x12, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr q9, [x10, #0xc0]\n"
+ "ldr q9, [x12, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr q10, [x10, #0xd0]\n"
+ "ldr q10, [x12, #0xd0]\n"
".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr q4, [x10, #0xe0]\n"
+ "ldr q4, [x12, #0xe0]\n"
".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr q5, [x10, #0xf0]\n"
+ "ldr q5, [x12, #0xf0]\n"
".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "add x10, x10, #0x100\n"
+ "add x12, x12, #0x100\n"
".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n"
@@ -1715,67 +1666,67 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"100:" // Height 4: Multiply loop: unique 14: skip row sum
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"101:" // Height 4: Multiply loop: Main loop skip
- "cbz x26, 108f\n"
- "cmp x26, #0x4\n"
+ "cbz x10, 108f\n"
+ "cmp x10, #0x4\n"
"blt 104f\n"
"102:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "ldr s1, [x22], #0x4\n"
- "ldr s2, [x21], #0x4\n"
- "ldr s3, [x20], #0x4\n"
+ "ldr s0, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
"tbnz %x[flags], #31, 103f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"103:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q6, [x10, #0x0]\n"
- "sub x26, x26, #0x4\n"
- "ldr q7, [x10, #0x10]\n"
- "cmp x26, #0x4\n"
- "ldr q8, [x10, #0x20]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x10, #0x30]\n"
- ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n"
+ "ldr q7, [x12, #0x0]\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q6, [x12, #0x10]\n"
+ "cmp x10, #0x4\n"
+ "ldr q5, [x12, #0x20]\n"
+ ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n"
+ "ldr q4, [x12, #0x30]\n"
+ ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n"
"bge 102b\n"
- "cbz x26, 108f\n"
"104:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x26, #1, 105f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x22], #0x2\n"
- "ldr h2, [x21], #0x2\n"
- "ldr h3, [x20], #0x2\n"
- "tbz x26, #0, 106f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x22]\n"
- "ld1 { v2.b }[2], [x21]\n"
- "ld1 { v3.b }[2], [x20]\n"
+ "cbz x10, 108f\n"
+ "tbz x10, #1, 105f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x27]\n"
+ "ld1 { v3.b }[2], [x26]\n"
"b 106f\n"
"105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x22, #0x0]\n"
- "ldr b2, [x21, #0x0]\n"
- "ldr b3, [x20, #0x0]\n"
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x27, #0x0]\n"
+ "ldr b3, [x26, #0x0]\n"
"106:" // Height 4: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 107f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
@@ -1783,60 +1734,64 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"107:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q10, [x10, #0x0]\n"
- "ldr q4, [x10, #0x10]\n"
- "ldr q5, [x10, #0x20]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q6, [x10, #0x30]\n"
- ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
- "add x10, x10, #0x40\n"
- ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x6f83e09d // udot v29.4s, v4.16b, v3.4b[0]\n"
+ "ldr q7, [x12, #0x0]\n"
+ ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x12, #0x10]\n"
+ ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
+ "ldr q5, [x12, #0x20]\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ "ldr q4, [x12, #0x30]\n"
+ ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0df // udot v31.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n"
"108:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x20\n"
"bne 94b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "add x21, x28, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20\n"
+ "add x22, x23, x20\n"
+ "add x21, x22, x20\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19\n"
- "prfm pstl1keep, [x20, #0x0]\n"
- "add x19, x20, x19\n"
- "prfm pstl1keep, [x19, #0x0]\n"
"tbnz %x[flags], #31, 109f\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v4.4s }, [x22]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "neg v0.4s, v0.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "neg v4.4s, v4.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "mul v11.4s, v11.4s, v4.4s\n"
- "mul v12.4s, v12.4s, v4.4s\n"
- "mul v13.4s, v13.4s, v4.4s\n"
- "mul v14.4s, v14.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v0.4s\n"
+ "mul v12.4s, v12.4s, v0.4s\n"
+ "mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
"109:" // Height 4: skip row sum fixup
+ "ldr q3, [x15, #0x0]\n"
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q2, [x15, #0x10]\n"
"add v17.4s, v17.4s, v11.4s\n"
+ "ldr q1, [x15, #0x20]\n"
"add v18.4s, v18.4s, v11.4s\n"
+ "ldr q0, [x15, #0x30]\n"
"add v19.4s, v19.4s, v11.4s\n"
"add v20.4s, v20.4s, v12.4s\n"
"add v21.4s, v21.4s, v12.4s\n"
@@ -1850,97 +1805,93 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"add v29.4s, v29.4s, v14.4s\n"
"add v30.4s, v30.4s, v14.4s\n"
"add v31.4s, v31.4s, v14.4s\n"
- "ldr q0, [x9, #0x0]\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v2.4s\n"
+ "add v18.4s, v18.4s, v1.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v2.4s\n"
+ "add v26.4s, v26.4s, v1.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v2.4s\n"
+ "add v30.4s, v30.4s, v1.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v1.4s }, [x20]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
- "ldr q1, [x9, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
- "ldr q2, [x9, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v28.4s, v28.4s, v0.4s\n"
- "add v29.4s, v29.4s, v1.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "ldr q3, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- "ld1r { v0.4s }, [x23]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+ "add x15, x15, #0x40\n"
"tbz %x[flags], #5, 110f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v2.16b, v16.16b, v0.16b\n"
+ "and v1.16b, v17.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v2.4s\n"
+ "sqadd v17.4s, v17.4s, v1.4s\n"
+ "and v7.16b, v18.16b, v0.16b\n"
+ "and v6.16b, v19.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v4.16b, v21.16b, v0.16b\n"
+ "and v3.16b, v22.16b, v0.16b\n"
+ "and v2.16b, v23.16b, v0.16b\n"
+ "and v1.16b, v24.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "and v9.16b, v28.16b, v0.16b\n"
- "and v10.16b, v29.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "and v4.16b, v30.16b, v0.16b\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "and v5.16b, v31.16b, v0.16b\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
- "sqadd v28.4s, v28.4s, v9.4s\n"
- "sqadd v29.4s, v29.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v7.4s\n"
+ "sqadd v19.4s, v19.4s, v6.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "and v7.16b, v25.16b, v0.16b\n"
+ "and v6.16b, v26.16b, v0.16b\n"
+ "and v5.16b, v27.16b, v0.16b\n"
+ "and v4.16b, v28.16b, v0.16b\n"
+ "and v3.16b, v29.16b, v0.16b\n"
+ "and v2.16b, v30.16b, v0.16b\n"
+ "and v1.16b, v31.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v4.4s\n"
- "sqadd v31.4s, v31.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v7.4s\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v5.4s\n"
+ "sqadd v28.4s, v28.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v3.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
"110:" // Height 4: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
"srshl v17.4s, v17.4s, v0.4s\n"
@@ -1958,189 +1909,188 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"srshl v29.4s, v29.4s, v0.4s\n"
"srshl v30.4s, v30.4s, v0.4s\n"
"srshl v31.4s, v31.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "add x23, %x[qp], %[minval]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
- "ld1r { v5.4s }, [x23]\n"
- "cmp x11, #0x10\n"
- "ld1r { v6.4s }, [x22]\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v0.4s\n"
+ "add v19.4s, v19.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v0.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v0.4s\n"
+ "add v26.4s, v26.4s, v0.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v0.4s\n"
+ "add v30.4s, v30.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smin v16.4s, v16.4s, v0.4s\n"
+ "smin v17.4s, v17.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v0.4s\n"
+ "smin v19.4s, v19.4s, v0.4s\n"
+ "smin v20.4s, v20.4s, v0.4s\n"
+ "smin v21.4s, v21.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v0.4s\n"
+ "smin v23.4s, v23.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v0.4s\n"
+ "smin v25.4s, v25.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v0.4s\n"
+ "smin v27.4s, v27.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v0.4s\n"
+ "smin v29.4s, v29.4s, v0.4s\n"
+ "smin v30.4s, v30.4s, v0.4s\n"
+ "smin v31.4s, v31.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "smax v16.4s, v16.4s, v0.4s\n"
+ "smax v17.4s, v17.4s, v0.4s\n"
+ "smax v18.4s, v18.4s, v0.4s\n"
+ "smax v19.4s, v19.4s, v0.4s\n"
+ "smax v20.4s, v20.4s, v0.4s\n"
+ "smax v21.4s, v21.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v0.4s\n"
+ "smax v23.4s, v23.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v0.4s\n"
+ "smax v25.4s, v25.4s, v0.4s\n"
+ "smax v26.4s, v26.4s, v0.4s\n"
+ "smax v27.4s, v27.4s, v0.4s\n"
+ "smax v28.4s, v28.4s, v0.4s\n"
+ "smax v29.4s, v29.4s, v0.4s\n"
+ "smax v30.4s, v30.4s, v0.4s\n"
+ "smax v31.4s, v31.4s, v0.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v0.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
+ "uzp1 v19.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
- "uzp1 v29.8h, v30.8h, v31.8h\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
- "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "uzp1 v17.8h, v30.8h, v31.8h\n"
+ "cmp x14, #0x10\n"
+ "uzp1 v16.16b, v16.16b, v0.16b\n"
+ "uzp1 v20.16b, v20.16b, v19.16b\n"
+ "uzp1 v24.16b, v24.16b, v18.16b\n"
+ "uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 119f\n"
- "tbz x11, #3, 114f\n"
- "str d16, [x28], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "str d28, [x19], #0x8\n"
- "tbz x11, #2, 112f\n"
- "st1 { v16.s }[2], [x28], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
- "st1 { v28.s }[2], [x19], #0x4\n"
- "tbz x11, #1, 111f\n"
- "st1 { v16.h }[6], [x28], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
- "st1 { v28.h }[6], [x19], #0x2\n"
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[14], [x28]\n"
- "st1 { v20.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
- "st1 { v28.b }[14], [x19]\n"
+ "tbz x14, #3, 114f\n"
+ "str d16, [x13], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x14, #2, 112f\n"
+ "st1 { v16.s }[2], [x13], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x14, #1, 111f\n"
+ "st1 { v16.h }[6], [x13], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[14], [x13]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 118f\n"
"111:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[12], [x28]\n"
- "st1 { v20.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
- "st1 { v28.b }[12], [x19]\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[12], [x13]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 118f\n"
"112:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x11, #1, 113f\n"
- "st1 { v16.h }[4], [x28], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
- "st1 { v28.h }[4], [x19], #0x2\n"
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[10], [x28]\n"
- "st1 { v20.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
- "st1 { v28.b }[10], [x19]\n"
+ "tbz x14, #1, 113f\n"
+ "st1 { v16.h }[4], [x13], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[10], [x13]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 118f\n"
"113:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[8], [x28]\n"
- "st1 { v20.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
- "st1 { v28.b }[8], [x19]\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[8], [x13]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 118f\n"
"114:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x11, #2, 116f\n"
- "str s16, [x28], #0x4\n"
- "str s20, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
- "str s28, [x19], #0x4\n"
- "tbz x11, #1, 115f\n"
- "st1 { v16.h }[2], [x28], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
- "st1 { v28.h }[2], [x19], #0x2\n"
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[6], [x28]\n"
- "st1 { v20.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
- "st1 { v28.b }[6], [x19]\n"
+ "tbz x14, #2, 116f\n"
+ "str s16, [x13], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x14, #1, 115f\n"
+ "st1 { v16.h }[2], [x13], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[6], [x13]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 118f\n"
"115:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[4], [x28]\n"
- "st1 { v20.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
- "st1 { v28.b }[4], [x19]\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[4], [x13]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 118f\n"
"116:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x11, #1, 117f\n"
- "str h16, [x28], #0x2\n"
- "str h20, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
- "str h28, [x19], #0x2\n"
- "tbz x11, #0, 118f\n"
- "st1 { v16.b }[2], [x28]\n"
- "st1 { v20.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
- "st1 { v28.b }[2], [x19]\n"
+ "tbz x14, #1, 117f\n"
+ "str h16, [x13], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x14, #0, 118f\n"
+ "st1 { v16.b }[2], [x13]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 118f\n"
"117:" // Height 4: Partial direct writeback: partial_1_0
- "str b16, [x28, #0x0]\n"
- "str b20, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
- "str b28, [x19, #0x0]\n"
+ "str b16, [x13, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"118:" // Height 4: Partial direct writeback: Done
"b 120f\n"
"119:" // Height 4: Full writeback
- "str q16, [x28, #0x0]\n"
- "add x28, x28, #0x10\n"
- "str q20, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
- "str q28, [x19, #0x0]\n"
+ "str q16, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"120:" // Height 4: Writeback done
- "subs x11, x11, #0x10\n"
+ "subs x14, x14, #0x10\n"
"bgt 92b\n"
"subs %x[M], %x[M], #0x4\n"
"beq 122f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 121f\n"
- "add x20, x20, #0x4\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"121:" // Update direct input
- "mov x19, #0x4\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x4\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"122:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
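
Note on the hunks above: they appear to be regenerated output from arm_gemm's kernel generator. The scratch-register assignments change (the column count moves from x11 to x14, the output pointer from x28 to x13, the B pointer from x10 to x12), the B-panel loads of q4-q10 are hoisted ahead of the udot chain, and the requantization epilogue reloads c_offset/maxval/minval through a single temporary (v28, or v0 in the Height-4 block) instead of pinning them in v4-v6. The arithmetic itself is unchanged. For readers following that epilogue, below is a minimal scalar model of what one output lane goes through; the helper names, the `exact_round` flag, and the standalone C form are assumptions made for this sketch, not arm_gemm APIs.

/*
 * Illustrative scalar model of the requantization epilogue seen in the
 * kernels above: row-sum fixup, column bias, sqrdmulh by per_layer_mul,
 * rounding right shift, c_offset, clamp, narrow.  Sketch only.
 */
#include <stdint.h>

/* sqrdmulh: saturating rounding doubling multiply, returning high half. */
static int32_t sqrdmulh_s32(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN)
        return INT32_MAX; /* the only input pair that saturates */
    return (int32_t)(((int64_t)a * b + (1LL << 30)) >> 31);
}

/* srshl with a negative shift amount: rounding arithmetic shift right,
 * matching how the kernels apply per_layer_right_shift. */
static int32_t srshl_s32(int32_t a, int32_t shift)
{
    if (shift >= 0)
        return (int32_t)((int64_t)a << shift);
    int32_t n = -shift;
    return (int32_t)(((int64_t)a + (1LL << (n - 1))) >> n);
}

/*
 * One output lane, in the order the epilogue applies the steps.
 * row_fixup is -b_offset * (sum of the row's inputs), i.e. the value the
 * kernel accumulates into v11..v14 and scales via the "neg"/"mul" pair.
 * exact_round mirrors the tbz-#5-gated and/sshr/sqadd correction, which
 * subtracts 1 from negative lanes before the rounding shift.
 */
static uint8_t requantize_lane(int32_t acc, int32_t row_fixup, int32_t col_bias,
                               int32_t per_layer_mul, int32_t per_layer_right_shift,
                               int32_t c_offset, int32_t minval, int32_t maxval,
                               int exact_round)
{
    acc += row_fixup;   /* "add v16.4s, v16.4s, v11.4s" */
    acc += col_bias;    /* bias quad loaded from col_bias */
    acc  = sqrdmulh_s32(acc, per_layer_mul);
    if (exact_round && per_layer_right_shift < 0 && acc < 0 && acc > INT32_MIN)
        acc -= 1;       /* sqadd of the sign mask saturates, hence the guard */
    acc  = srshl_s32(acc, per_layer_right_shift);
    acc += c_offset;
    if (acc > maxval) acc = maxval; /* smin against maxval */
    if (acc < minval) acc = minval; /* smax against minval */
    return (uint8_t)acc;            /* uzp1 pairs narrow 32 -> 16 -> 8 bits */
}

Running requantize_lane over the sixteen lanes of an accumulator row should reproduce what the smin/smax/uzp1 tail of each Height block stores, which can be a useful reference when diffing regenerated kernels like these.
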
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
index 6e85eec204..ebe583b5d4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -78,311 +78,310 @@ void a64_hybrid_u8qa_dot_4x16 (
flags |= 0x20;
}
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x4\n"
"bge 91f\n"
"cmp %x[M], #0x2\n"
"bgt 61f\n"
"beq 31f\n"
+ "mov x10, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"movi v15.16b, #0x1\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x27, %x[col_bias]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "mov x26, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
"3:" // Height 1: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "cbnz x25, 6f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "cbnz x26, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
"b 6f\n"
"5:" // Height 1: setup direct input
- "mov x23, %x[input_ptr]\n"
+ "mov x24, %x[input_ptr]\n"
"6:" // Height 1: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"blt 11f\n"
- "ldr q0, [x23, #0x0]\n"
+ "ldr q0, [x24, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
- "cmp x24, #0x20\n"
- "blt 9f\n"
- "7:" // Height 1: Multiply loop: Main loop head
- ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "cmp x25, #0x20\n"
"ldr q5, [x28, #0x10]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
"ldr q6, [x28, #0x20]\n"
"ldr q7, [x28, #0x30]\n"
- ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
"ldr q8, [x28, #0x40]\n"
- ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
"ldr q9, [x28, #0x50]\n"
"ldr q10, [x28, #0x60]\n"
+ "blt 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q21, [x28, #0x70]\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q20, [x28, #0x80]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q26, [x28, #0x90]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q25, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q4, [x28, #0x70]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q24, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr q23, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q6, [x28, #0x90]\n"
- "ldr q7, [x28, #0xa0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr q9, [x28, #0xc0]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- "ldr q4, [x28, #0xe0]\n"
- "ldr q5, [x28, #0xf0]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ "ldr q22, [x28, #0xd0]\n"
+ ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr q21, [x28, #0xe0]\n"
+ ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr q20, [x28, #0xf0]\n"
+ ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n"
"add x28, x28, #0x100\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 8f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
- "ldr q0, [x23, #0x0]\n"
- "cmp x24, #0x20\n"
+ "ldr q0, [x24, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"bge 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "sub x24, x24, #0x10\n"
+ "ldr q21, [x28, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "ldr q20, [x28, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "ldr q8, [x28, #0x40]\n"
+ "ldr q26, [x28, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
+ "ldr q25, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q10, [x28, #0x60]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q24, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q5, [x28, #0x80]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q23, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q7, [x28, #0xa0]\n"
- "ldr q8, [x28, #0xb0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- "ldr q9, [x28, #0xc0]\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "ldr q10, [x28, #0xd0]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q22, [x28, #0xd0]\n"
+ ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n"
+ "ldr q21, [x28, #0xe0]\n"
+ ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n"
+ "ldr q20, [x28, #0xf0]\n"
+ ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n"
+ "sub x25, x25, #0x10\n"
+ ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n"
+ "add x24, x24, #0x10\n"
"add x28, x28, #0x100\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n"
+ ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n"
"tbnz %x[flags], #31, 10f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"10:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"11:" // Height 1: Multiply loop: Main loop skip
- "cbz x24, 18f\n"
- "cmp x24, #0x4\n"
+ "cbz x25, 18f\n"
+ "cmp x25, #0x4\n"
"blt 14f\n"
"12:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x23], #0x4\n"
+ "ldr s0, [x24], #0x4\n"
"tbnz %x[flags], #31, 13f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"13:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x24, x24, #0x4\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- "ldr q8, [x28, #0x20]\n"
- "cmp x24, #0x4\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x30]\n"
+ "ldr q23, [x28, #0x0]\n"
+ "ldr q22, [x28, #0x10]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
+ "ldr q21, [x28, #0x20]\n"
+ "ldr q20, [x28, #0x30]\n"
+ ".inst 0x6f80e2f0 // udot v16.4s, v23.16b, v0.4b[0]\n"
+ ".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n"
+ ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
"bge 12b\n"
- "cbz x24, 18f\n"
"14:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x24, #1, 15f\n"
- "ldr h0, [x23], #0x2\n"
- "tbz x24, #0, 16f\n"
- "ld1 { v0.b }[2], [x23]\n"
+ "cbz x25, 18f\n"
+ "tbz x25, #1, 15f\n"
+ "ldr h0, [x24], #0x2\n"
+ "tbz x25, #0, 16f\n"
+ "ld1 { v0.b }[2], [x24]\n"
"b 16f\n"
"15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x23, #0x0]\n"
+ "ldr b0, [x24, #0x0]\n"
"16:" // Height 1: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 17f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
"17:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q10, [x28, #0x0]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x10]\n"
- "ldr q5, [x28, #0x20]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- "ldr q6, [x28, #0x30]\n"
+ "ldr q21, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ ".inst 0x6f80e2b0 // udot v16.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f80e291 // udot v17.4s, v20.16b, v0.4b[0]\n"
+ "ldr q21, [x28, #0x20]\n"
+ "ldr q20, [x28, #0x30]\n"
+ ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
"18:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x25, x25, #0x1\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 4b\n"
- "prfm pstl1keep, [x26, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
"tbnz %x[flags], #31, 19f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v20.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
+ "neg v20.4s, v20.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "ld1r { v1.4s }, [x22]\n"
- "neg v1.4s, v1.4s\n"
- "mul v11.4s, v11.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v20.4s\n"
"19:" // Height 1: skip row sum fixup
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q23, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q0, [x27, #0x0]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q1, [x27, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ldr q22, [x10, #0x20]\n"
+ "ldr q21, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q2, [x27, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q3, [x27, #0x30]\n"
- "add x27, x27, #0x40\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "ld1r { v4.4s }, [x22]\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v23.4s\n"
+ "add v18.4s, v18.4s, v22.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v19.4s, v19.4s, v21.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v20.4s\n"
+ "add x10, x10, #0x40\n"
+ "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v20.4s\n"
"tbz %x[flags], #5, 20f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "and v22.16b, v17.16b, v0.16b\n"
+ "and v21.16b, v18.16b, v0.16b\n"
+ "and v20.16b, v19.16b, v0.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v20.4s\n"
"20:" // Height 1: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v22.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x22]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "add v16.4s, v16.4s, v22.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v22.4s\n"
+ "add v19.4s, v19.4s, v22.4s\n"
"cmp x9, #0x10\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v16.4s, v16.4s, v21.4s\n"
+ "smin v17.4s, v17.4s, v21.4s\n"
+ "smin v18.4s, v18.4s, v21.4s\n"
+ "smin v19.4s, v19.4s, v21.4s\n"
+ "smax v16.4s, v16.4s, v20.4s\n"
+ "smax v17.4s, v17.4s, v20.4s\n"
+ "smax v18.4s, v18.4s, v20.4s\n"
+ "smax v19.4s, v19.4s, v20.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "smax v19.4s, v19.4s, v5.4s\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"bge 29f\n"
"tbz x9, #3, 24f\n"
- "str d16, [x26], #0x8\n"
+ "str d16, [x27], #0x8\n"
"tbz x9, #2, 22f\n"
- "st1 { v16.s }[2], [x26], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
"tbz x9, #1, 21f\n"
- "st1 { v16.h }[6], [x26], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[14], [x26]\n"
+ "st1 { v16.b }[14], [x27]\n"
"b 28f\n"
"21:" // Height 1: Partial direct writeback: partial_1_12
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[12], [x26]\n"
+ "st1 { v16.b }[12], [x27]\n"
"b 28f\n"
"22:" // Height 1: Partial direct writeback: partial_2_8
"tbz x9, #1, 23f\n"
- "st1 { v16.h }[4], [x26], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[10], [x26]\n"
+ "st1 { v16.b }[10], [x27]\n"
"b 28f\n"
"23:" // Height 1: Partial direct writeback: partial_1_8
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[8], [x26]\n"
+ "st1 { v16.b }[8], [x27]\n"
"b 28f\n"
"24:" // Height 1: Partial direct writeback: partial_4_0
"tbz x9, #2, 26f\n"
- "str s16, [x26], #0x4\n"
+ "str s16, [x27], #0x4\n"
"tbz x9, #1, 25f\n"
- "st1 { v16.h }[2], [x26], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[6], [x26]\n"
+ "st1 { v16.b }[6], [x27]\n"
"b 28f\n"
"25:" // Height 1: Partial direct writeback: partial_1_4
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[4], [x26]\n"
+ "st1 { v16.b }[4], [x27]\n"
"b 28f\n"
"26:" // Height 1: Partial direct writeback: partial_2_0
"tbz x9, #1, 27f\n"
- "str h16, [x26], #0x2\n"
+ "str h16, [x27], #0x2\n"
"tbz x9, #0, 28f\n"
- "st1 { v16.b }[2], [x26]\n"
+ "st1 { v16.b }[2], [x27]\n"
"b 28f\n"
"27:" // Height 1: Partial direct writeback: partial_1_0
- "str b16, [x26, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
"28:" // Height 1: Partial direct writeback: Done
"b 30f\n"
"29:" // Height 1: Full writeback
- "str q16, [x26, #0x0]\n"
- "add x26, x26, #0x10\n"
+ "str q16, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
"30:" // Height 1: Writeback done
"subs x9, x9, #0x10\n"
"bgt 2b\n"
"b 122f\n"
"31:" // Height 2
+ "mov x10, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x27, %x[col_bias]\n"
"movi v12.4s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v15.16b, #0x1\n"
- "mov x26, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"32:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -393,414 +392,414 @@ void a64_hybrid_u8qa_dot_4x16 (
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
"33:" // Height 2: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 35f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "cbnz x25, 36f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "cbnz x26, 36f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
"b 36f\n"
"35:" // Height 2: setup direct input
- "mov x23, %x[input_ptr]\n"
- "add x22, x23, x19\n"
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
"36:" // Height 2: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"blt 41f\n"
- "ldr q0, [x23, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "cmp x24, #0x20\n"
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "cmp x25, #0x20\n"
"ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
"blt 39f\n"
"37:" // Height 2: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "add x23, x23, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x22, x22, #0x10\n"
+ "ldr q25, [x28, #0x70]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q8, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q10, [x28, #0x60]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q30, [x28, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q6, [x28, #0x90]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr q24, [x28, #0xf0]\n"
"add x28, x28, #0x100\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 38f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"38:" // Height 2: Multiply loop: unique 5: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x24, #0x20\n"
- "ldr q0, [x23, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
"ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"bge 37b\n"
"39:" // Height 2: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "sub x24, x24, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "ldr q25, [x28, #0x70]\n"
+ "sub x25, x25, #0x10\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x22, x22, #0x10\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q8, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q10, [x28, #0x60]\n"
- "ldr q4, [x28, #0x70]\n"
+ "ldr q30, [x28, #0x90]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
+ "ldr q29, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q6, [x28, #0x90]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q28, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q27, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n"
+ "ldr q24, [x28, #0xf0]\n"
"add x28, x28, #0x100\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n"
"tbnz %x[flags], #31, 40f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"40:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"41:" // Height 2: Multiply loop: Main loop skip
- "cbz x24, 48f\n"
- "cmp x24, #0x4\n"
+ "cbz x25, 48f\n"
+ "cmp x25, #0x4\n"
"blt 44f\n"
"42:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x23], #0x4\n"
- "ldr s1, [x22], #0x4\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s1, [x23], #0x4\n"
"tbnz %x[flags], #31, 43f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"43:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x24, x24, #0x4\n"
- ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- "ldr q8, [x28, #0x20]\n"
- "cmp x24, #0x4\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x30]\n"
+ "ldr q27, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n"
+ ".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n"
+ ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n"
+ ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n"
"bge 42b\n"
- "cbz x24, 48f\n"
"44:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x24, #1, 45f\n"
- "ldr h0, [x23], #0x2\n"
- "ldr h1, [x22], #0x2\n"
- "tbz x24, #0, 46f\n"
- "ld1 { v0.b }[2], [x23]\n"
- "ld1 { v1.b }[2], [x22]\n"
+ "cbz x25, 48f\n"
+ "tbz x25, #1, 45f\n"
+ "ldr h0, [x24], #0x2\n"
+ "ldr h1, [x23], #0x2\n"
+ "tbz x25, #0, 46f\n"
+ "ld1 { v0.b }[2], [x24]\n"
+ "ld1 { v1.b }[2], [x23]\n"
"b 46f\n"
"45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x23, #0x0]\n"
- "ldr b1, [x22, #0x0]\n"
+ "ldr b0, [x24, #0x0]\n"
+ "ldr b1, [x23, #0x0]\n"
"46:" // Height 2: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 47f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
"47:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q10, [x28, #0x0]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x20]\n"
- "ldr q6, [x28, #0x30]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x6f80e310 // udot v16.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e314 // udot v20.4s, v24.16b, v1.4b[0]\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n"
+ ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n"
+ ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n"
"48:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x25, x25, #0x1\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 34b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x21, x26, x19\n"
- "prfm pstl1keep, [x21, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"tbnz %x[flags], #31, 49f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
+ "neg v24.4s, v24.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
- "neg v2.4s, v2.4s\n"
- "mul v11.4s, v11.4s, v2.4s\n"
- "mul v12.4s, v12.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v24.4s\n"
+ "mul v12.4s, v12.4s, v24.4s\n"
"49:" // Height 2: skip row sum fixup
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q27, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q0, [x27, #0x0]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q1, [x27, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ldr q26, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q2, [x27, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q3, [x27, #0x30]\n"
- "add x27, x27, #0x40\n"
"add v20.4s, v20.4s, v12.4s\n"
- "ld1r { v4.4s }, [x22]\n"
"add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v24.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
- "add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
- "add v20.4s, v20.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "add v17.4s, v17.4s, v27.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v25.4s\n"
+ "add v20.4s, v20.4s, v28.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v21.4s, v21.4s, v27.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v23.4s, v23.4s, v25.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v24.4s\n"
"tbz %x[flags], #5, 50f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v24.16b, v16.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v24.4s\n"
+ "and v30.16b, v17.16b, v0.16b\n"
+ "and v29.16b, v18.16b, v0.16b\n"
+ "and v28.16b, v19.16b, v0.16b\n"
+ "and v27.16b, v20.16b, v0.16b\n"
+ "and v26.16b, v21.16b, v0.16b\n"
+ "and v25.16b, v22.16b, v0.16b\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v29.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sqadd v21.4s, v21.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
"50:" // Height 2: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x22]\n"
- "cmp x9, #0x10\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
+ "cmp x9, #0x10\n"
+ "add v16.4s, v16.4s, v26.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v26.4s\n"
+ "add v20.4s, v20.4s, v26.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v23.4s, v23.4s, v26.4s\n"
+ "smin v16.4s, v16.4s, v25.4s\n"
+ "smin v17.4s, v17.4s, v25.4s\n"
+ "smin v18.4s, v18.4s, v25.4s\n"
+ "smin v19.4s, v19.4s, v25.4s\n"
+ "smin v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v25.4s\n"
+ "smin v22.4s, v22.4s, v25.4s\n"
+ "smin v23.4s, v23.4s, v25.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v17.8h, v22.8h, v23.8h\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v20.16b, v20.16b, v17.16b\n"
"bge 59f\n"
"tbz x9, #3, 54f\n"
- "str d16, [x26], #0x8\n"
- "str d20, [x21], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x23], #0x8\n"
"tbz x9, #2, 52f\n"
- "st1 { v16.s }[2], [x26], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
"tbz x9, #1, 51f\n"
- "st1 { v16.h }[6], [x26], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[14], [x26]\n"
- "st1 { v20.b }[14], [x21]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "st1 { v20.b }[14], [x23]\n"
"b 58f\n"
"51:" // Height 2: Partial direct writeback: partial_1_12
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[12], [x26]\n"
- "st1 { v20.b }[12], [x21]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "st1 { v20.b }[12], [x23]\n"
"b 58f\n"
"52:" // Height 2: Partial direct writeback: partial_2_8
"tbz x9, #1, 53f\n"
- "st1 { v16.h }[4], [x26], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[10], [x26]\n"
- "st1 { v20.b }[10], [x21]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "st1 { v20.b }[10], [x23]\n"
"b 58f\n"
"53:" // Height 2: Partial direct writeback: partial_1_8
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[8], [x26]\n"
- "st1 { v20.b }[8], [x21]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "st1 { v20.b }[8], [x23]\n"
"b 58f\n"
"54:" // Height 2: Partial direct writeback: partial_4_0
"tbz x9, #2, 56f\n"
- "str s16, [x26], #0x4\n"
- "str s20, [x21], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x23], #0x4\n"
"tbz x9, #1, 55f\n"
- "st1 { v16.h }[2], [x26], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[6], [x26]\n"
- "st1 { v20.b }[6], [x21]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "st1 { v20.b }[6], [x23]\n"
"b 58f\n"
"55:" // Height 2: Partial direct writeback: partial_1_4
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[4], [x26]\n"
- "st1 { v20.b }[4], [x21]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "st1 { v20.b }[4], [x23]\n"
"b 58f\n"
"56:" // Height 2: Partial direct writeback: partial_2_0
"tbz x9, #1, 57f\n"
- "str h16, [x26], #0x2\n"
- "str h20, [x21], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "str h20, [x23], #0x2\n"
"tbz x9, #0, 58f\n"
- "st1 { v16.b }[2], [x26]\n"
- "st1 { v20.b }[2], [x21]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "st1 { v20.b }[2], [x23]\n"
"b 58f\n"
"57:" // Height 2: Partial direct writeback: partial_1_0
- "str b16, [x26, #0x0]\n"
- "str b20, [x21, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
"58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
"59:" // Height 2: Full writeback
- "str q16, [x26, #0x0]\n"
- "add x26, x26, #0x10\n"
- "str q20, [x21, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "str q20, [x23, #0x0]\n"
"60:" // Height 2: Writeback done
"subs x9, x9, #0x10\n"
"bgt 32b\n"
"b 122f\n"
"61:" // Height 3
+ "mov x10, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x27, %x[col_bias]\n"
"movi v12.4s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "mov x26, %x[output_ptr]\n"
"movi v15.16b, #0x1\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"62:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -815,539 +814,539 @@ void a64_hybrid_u8qa_dot_4x16 (
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
"63:" // Height 3: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 65f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "ldr x21, [x20, #0x10]\n"
- "cbnz x25, 66f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x26, 66f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
"b 66f\n"
"65:" // Height 3: setup direct input
- "mov x23, %x[input_ptr]\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"66:" // Height 3: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"blt 71f\n"
- "ldr q0, [x23, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "cmp x24, #0x20\n"
- "ldr q2, [x21, #0x0]\n"
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "cmp x25, #0x20\n"
+ "ldr q2, [x22, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
"blt 69f\n"
"67:" // Height 3: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "add x23, x23, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x22, x22, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x21, x21, #0x10\n"
+ "ldr q29, [x28, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q8, [x28, #0x40]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q10, [x28, #0x60]\n"
+ "ldr q28, [x28, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q5, [x28, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q4, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q3, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q31, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q30, [x28, #0xd0]\n"
+ ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr q29, [x28, #0xe0]\n"
+ ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr q28, [x28, #0xf0]\n"
+ ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n"
"add x28, x28, #0x100\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 68f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"68:" // Height 3: Multiply loop: unique 9: skip row sum
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "cmp x24, #0x20\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ldr q0, [x23, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q4, [x28, #0x0]\n"
"bge 67b\n"
"69:" // Height 3: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "sub x24, x24, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "sub x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x22, x22, #0x10\n"
+ "ldr q29, [x28, #0x70]\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q8, [x28, #0x40]\n"
- "add x21, x21, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q10, [x28, #0x60]\n"
+ "ldr q28, [x28, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x80]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x90]\n"
+ "ldr q5, [x28, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0xa0]\n"
+ "ldr q4, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x28, #0xb0]\n"
+ "ldr q3, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x28, #0xc0]\n"
+ "ldr q31, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x28, #0xd0]\n"
- ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x28, #0xe0]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x28, #0xf0]\n"
+ "ldr q30, [x28, #0xd0]\n"
+ ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n"
+ "ldr q29, [x28, #0xe0]\n"
+ ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n"
+ "ldr q28, [x28, #0xf0]\n"
+ ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n"
"add x28, x28, #0x100\n"
- ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
- ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
- ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n"
+ ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n"
"tbnz %x[flags], #31, 70f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"70:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"71:" // Height 3: Multiply loop: Main loop skip
- "cbz x24, 78f\n"
- "cmp x24, #0x4\n"
+ "cbz x25, 78f\n"
+ "cmp x25, #0x4\n"
"blt 74f\n"
"72:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x23], #0x4\n"
- "ldr s1, [x22], #0x4\n"
- "ldr s2, [x21], #0x4\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s1, [x23], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
"tbnz %x[flags], #31, 73f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"73:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x24, x24, #0x4\n"
- ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- "ldr q8, [x28, #0x20]\n"
- "cmp x24, #0x4\n"
- ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
- "ldr q9, [x28, #0x30]\n"
+ "ldr q31, [x28, #0x0]\n"
+ "ldr q30, [x28, #0x10]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
+ "ldr q29, [x28, #0x20]\n"
+ "ldr q28, [x28, #0x30]\n"
+ ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n"
+ ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n"
+ ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n"
"bge 72b\n"
- "cbz x24, 78f\n"
"74:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x24, #1, 75f\n"
- "ldr h0, [x23], #0x2\n"
- "ldr h1, [x22], #0x2\n"
- "ldr h2, [x21], #0x2\n"
- "tbz x24, #0, 76f\n"
- "ld1 { v0.b }[2], [x23]\n"
- "ld1 { v1.b }[2], [x22]\n"
- "ld1 { v2.b }[2], [x21]\n"
+ "cbz x25, 78f\n"
+ "tbz x25, #1, 75f\n"
+ "ldr h0, [x24], #0x2\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "tbz x25, #0, 76f\n"
+ "ld1 { v0.b }[2], [x24]\n"
+ "ld1 { v1.b }[2], [x23]\n"
+ "ld1 { v2.b }[2], [x22]\n"
"b 76f\n"
"75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x23, #0x0]\n"
- "ldr b1, [x22, #0x0]\n"
- "ldr b2, [x21, #0x0]\n"
+ "ldr b0, [x24, #0x0]\n"
+ "ldr b1, [x23, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
"76:" // Height 3: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 77f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
"77:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q10, [x28, #0x0]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
- "ldr q5, [x28, #0x20]\n"
- ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x30]\n"
+ "ldr q31, [x28, #0x0]\n"
+ "ldr q30, [x28, #0x10]\n"
+ ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n"
+ "ldr q29, [x28, #0x20]\n"
+ "ldr q28, [x28, #0x30]\n"
+ ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n"
+ ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n"
"78:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x25, x25, #0x1\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 64b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x21, x26, x19\n"
- "prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19\n"
- "prfm pstl1keep, [x20, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"tbnz %x[flags], #31, 79f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v28.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v3.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
+ "neg v28.4s, v28.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "neg v3.4s, v3.4s\n"
- "mul v11.4s, v11.4s, v3.4s\n"
- "mul v12.4s, v12.4s, v3.4s\n"
- "mul v13.4s, v13.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v28.4s\n"
+ "mul v12.4s, v12.4s, v28.4s\n"
+ "mul v13.4s, v13.4s, v28.4s\n"
"79:" // Height 3: skip row sum fixup
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q31, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q0, [x27, #0x0]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q1, [x27, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ldr q30, [x10, #0x20]\n"
+ "ldr q29, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q2, [x27, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q3, [x27, #0x30]\n"
- "add x27, x27, #0x40\n"
"add v20.4s, v20.4s, v12.4s\n"
- "ld1r { v4.4s }, [x22]\n"
"add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v28.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
+ "add x10, x10, #0x40\n"
"add v26.4s, v26.4s, v13.4s\n"
"add v27.4s, v27.4s, v13.4s\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v31.4s\n"
+ "add v18.4s, v18.4s, v30.4s\n"
+ "add v19.4s, v19.4s, v29.4s\n"
"add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v31.4s\n"
+ "add v22.4s, v22.4s, v30.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v31.4s\n"
+ "add v26.4s, v26.4s, v30.4s\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v28.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v28.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v28.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v28.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v28.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v28.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v28.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v28.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v28.4s\n"
"tbz %x[flags], #5, 80f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "and v9.16b, v21.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
+ "and v1.16b, v16.16b, v0.16b\n"
+ "and v31.16b, v17.16b, v0.16b\n"
+ "and v30.16b, v18.16b, v0.16b\n"
+ "and v29.16b, v19.16b, v0.16b\n"
+ "and v28.16b, v20.16b, v0.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "sqadd v17.4s, v17.4s, v31.4s\n"
+ "sqadd v18.4s, v18.4s, v30.4s\n"
+ "sqadd v19.4s, v19.4s, v29.4s\n"
+ "sqadd v20.4s, v20.4s, v28.4s\n"
+ "and v3.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v22.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
+ "and v31.16b, v24.16b, v0.16b\n"
+ "and v30.16b, v25.16b, v0.16b\n"
+ "and v29.16b, v26.16b, v0.16b\n"
+ "and v28.16b, v27.16b, v0.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v3.4s\n"
+ "sqadd v22.4s, v22.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v1.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "sqadd v27.4s, v27.4s, v28.4s\n"
"80:" // Height 3: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x22]\n"
- "cmp x9, #0x10\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v29.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v28.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
+ "cmp x9, #0x10\n"
"srshl v24.4s, v24.4s, v0.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
"srshl v25.4s, v25.4s, v0.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v30.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v30.4s\n"
+ "add v19.4s, v19.4s, v30.4s\n"
+ "add v20.4s, v20.4s, v30.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "add v22.4s, v22.4s, v30.4s\n"
+ "add v23.4s, v23.4s, v30.4s\n"
+ "add v24.4s, v24.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v30.4s\n"
+ "add v27.4s, v27.4s, v30.4s\n"
+ "smin v16.4s, v16.4s, v29.4s\n"
+ "smin v17.4s, v17.4s, v29.4s\n"
+ "smin v18.4s, v18.4s, v29.4s\n"
+ "smin v19.4s, v19.4s, v29.4s\n"
+ "smin v20.4s, v20.4s, v29.4s\n"
+ "smin v21.4s, v21.4s, v29.4s\n"
+ "smin v22.4s, v22.4s, v29.4s\n"
+ "smin v23.4s, v23.4s, v29.4s\n"
+ "smin v24.4s, v24.4s, v29.4s\n"
+ "smin v25.4s, v25.4s, v29.4s\n"
+ "smin v26.4s, v26.4s, v29.4s\n"
+ "smin v27.4s, v27.4s, v29.4s\n"
+ "smax v16.4s, v16.4s, v28.4s\n"
+ "smax v17.4s, v17.4s, v28.4s\n"
+ "smax v18.4s, v18.4s, v28.4s\n"
+ "smax v19.4s, v19.4s, v28.4s\n"
+ "smax v20.4s, v20.4s, v28.4s\n"
+ "smax v21.4s, v21.4s, v28.4s\n"
+ "smax v22.4s, v22.4s, v28.4s\n"
+ "smax v23.4s, v23.4s, v28.4s\n"
+ "smax v24.4s, v24.4s, v28.4s\n"
+ "smax v25.4s, v25.4s, v28.4s\n"
+ "smax v26.4s, v26.4s, v28.4s\n"
+ "smax v27.4s, v27.4s, v28.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v18.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
"bge 89f\n"
"tbz x9, #3, 84f\n"
- "str d16, [x26], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
"tbz x9, #2, 82f\n"
- "st1 { v16.s }[2], [x26], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
"tbz x9, #1, 81f\n"
- "st1 { v16.h }[6], [x26], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[14], [x26]\n"
- "st1 { v20.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
"b 88f\n"
"81:" // Height 3: Partial direct writeback: partial_1_12
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[12], [x26]\n"
- "st1 { v20.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
"b 88f\n"
"82:" // Height 3: Partial direct writeback: partial_2_8
"tbz x9, #1, 83f\n"
- "st1 { v16.h }[4], [x26], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[10], [x26]\n"
- "st1 { v20.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
"b 88f\n"
"83:" // Height 3: Partial direct writeback: partial_1_8
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[8], [x26]\n"
- "st1 { v20.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
"b 88f\n"
"84:" // Height 3: Partial direct writeback: partial_4_0
"tbz x9, #2, 86f\n"
- "str s16, [x26], #0x4\n"
- "str s20, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
"tbz x9, #1, 85f\n"
- "st1 { v16.h }[2], [x26], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[6], [x26]\n"
- "st1 { v20.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
"b 88f\n"
"85:" // Height 3: Partial direct writeback: partial_1_4
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[4], [x26]\n"
- "st1 { v20.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
"b 88f\n"
"86:" // Height 3: Partial direct writeback: partial_2_0
"tbz x9, #1, 87f\n"
- "str h16, [x26], #0x2\n"
- "str h20, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
"tbz x9, #0, 88f\n"
- "st1 { v16.b }[2], [x26]\n"
- "st1 { v20.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
"b 88f\n"
"87:" // Height 3: Partial direct writeback: partial_1_0
- "str b16, [x26, #0x0]\n"
- "str b20, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
"88:" // Height 3: Partial direct writeback: Done
"b 90f\n"
"89:" // Height 3: Full writeback
- "str q16, [x26, #0x0]\n"
- "add x26, x26, #0x10\n"
- "str q20, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
"90:" // Height 3: Writeback done
"subs x9, x9, #0x10\n"
"bgt 62b\n"
"b 122f\n"
"91:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x4\n"
+ "mov x10, %x[col_bias]\n"
"movi v11.4s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x27, %x[col_bias]\n"
"movi v12.4s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x26, %x[output_ptr]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"movi v14.4s, #0x0\n"
- "mov x19, #0x4\n"
"movi v15.16b, #0x1\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"92:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
@@ -1366,59 +1365,59 @@ void a64_hybrid_u8qa_dot_4x16 (
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
"93:" // Height 4: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 95f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "ldr x21, [x20, #0x10]\n"
- "ldr x20, [x20, #0x18]\n"
- "cbnz x25, 96f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
+ "cbnz x26, 96f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
"b 96f\n"
"95:" // Height 4: setup direct input
- "mov x23, %x[input_ptr]\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"96:" // Height 4: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"blt 101f\n"
- "ldr q0, [x23, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "cmp x24, #0x20\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q3, [x20, #0x0]\n"
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "cmp x25, #0x20\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x21, #0x0]\n"
"ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
"blt 99f\n"
"97:" // Height 4: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "add x23, x23, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x22, x22, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x21, x21, #0x10\n"
".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q8, [x28, #0x40]\n"
- "add x20, x20, #0x10\n"
+ "ldr q4, [x28, #0x70]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q10, [x28, #0x60]\n"
+ "add x21, x21, #0x10\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
"ldr q5, [x28, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
@@ -1491,38 +1490,38 @@ void a64_hybrid_u8qa_dot_4x16 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"98:" // Height 4: Multiply loop: unique 13: skip row sum
+ "ldr q0, [x24, #0x0]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "cmp x24, #0x20\n"
"prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "ldr q0, [x23, #0x0]\n"
- "ldr q1, [x22, #0x0]\n"
- "ldr q2, [x21, #0x0]\n"
- "ldr q3, [x20, #0x0]\n"
- "ldr q4, [x28, #0x0]\n"
"bge 97b\n"
"99:" // Height 4: Multiply loop: Single iteration only
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x28, #0x10]\n"
- "sub x24, x24, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x28, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "sub x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x28, #0x30]\n"
- "add x22, x22, #0x10\n"
".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q8, [x28, #0x40]\n"
- "add x21, x21, #0x10\n"
+ "ldr q4, [x28, #0x70]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q9, [x28, #0x50]\n"
- "add x20, x20, #0x10\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q10, [x28, #0x60]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q4, [x28, #0x70]\n"
".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
"ldr q5, [x28, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
@@ -1595,67 +1594,67 @@ void a64_hybrid_u8qa_dot_4x16 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"100:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
"prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
"prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"101:" // Height 4: Multiply loop: Main loop skip
- "cbz x24, 108f\n"
- "cmp x24, #0x4\n"
+ "cbz x25, 108f\n"
+ "cmp x25, #0x4\n"
"blt 104f\n"
"102:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x23], #0x4\n"
- "ldr s1, [x22], #0x4\n"
- "ldr s2, [x21], #0x4\n"
- "ldr s3, [x20], #0x4\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s1, [x23], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
"tbnz %x[flags], #31, 103f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"103:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q6, [x28, #0x0]\n"
- ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x28, #0x10]\n"
- "sub x24, x24, #0x4\n"
- ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- "ldr q8, [x28, #0x20]\n"
- "cmp x24, #0x4\n"
- ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
- "ldr q9, [x28, #0x30]\n"
+ "ldr q7, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
+ "ldr q5, [x28, #0x20]\n"
+ "ldr q4, [x28, #0x30]\n"
+ ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n"
"bge 102b\n"
- "cbz x24, 108f\n"
"104:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x24, #1, 105f\n"
- "ldr h0, [x23], #0x2\n"
- "ldr h1, [x22], #0x2\n"
- "ldr h2, [x21], #0x2\n"
- "ldr h3, [x20], #0x2\n"
- "tbz x24, #0, 106f\n"
- "ld1 { v0.b }[2], [x23]\n"
- "ld1 { v1.b }[2], [x22]\n"
- "ld1 { v2.b }[2], [x21]\n"
- "ld1 { v3.b }[2], [x20]\n"
+ "cbz x25, 108f\n"
+ "tbz x25, #1, 105f\n"
+ "ldr h0, [x24], #0x2\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "ldr h3, [x21], #0x2\n"
+ "tbz x25, #0, 106f\n"
+ "ld1 { v0.b }[2], [x24]\n"
+ "ld1 { v1.b }[2], [x23]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "ld1 { v3.b }[2], [x21]\n"
"b 106f\n"
"105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x23, #0x0]\n"
- "ldr b1, [x22, #0x0]\n"
- "ldr b2, [x21, #0x0]\n"
- "ldr b3, [x20, #0x0]\n"
+ "ldr b0, [x24, #0x0]\n"
+ "ldr b1, [x23, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "ldr b3, [x21, #0x0]\n"
"106:" // Height 4: Multiply loop: Ragged operand read: Done
"tbnz %x[flags], #31, 107f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
@@ -1663,76 +1662,76 @@ void a64_hybrid_u8qa_dot_4x16 (
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
"107:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q10, [x28, #0x0]\n"
- ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
- "ldr q4, [x28, #0x10]\n"
- ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
+ "ldr q7, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
"ldr q5, [x28, #0x20]\n"
- ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n"
- "ldr q6, [x28, #0x30]\n"
+ "ldr q4, [x28, #0x30]\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
"add x28, x28, #0x40\n"
- ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x6f83e09d // udot v29.4s, v4.16b, v3.4b[0]\n"
+ ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0df // udot v31.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n"
"108:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x25, x25, #0x1\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 94b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x26, #0x0]\n"
- "add x21, x26, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "add x21, x22, x20\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19\n"
- "prfm pstl1keep, [x20, #0x0]\n"
- "add x19, x20, x19\n"
- "prfm pstl1keep, [x19, #0x0]\n"
"tbnz %x[flags], #31, 109f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x22, %x[qp], %[b_offset]\n"
- "ld1r { v4.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
+ "neg v0.4s, v0.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "neg v4.4s, v4.4s\n"
- "mul v11.4s, v11.4s, v4.4s\n"
- "mul v12.4s, v12.4s, v4.4s\n"
- "mul v13.4s, v13.4s, v4.4s\n"
- "mul v14.4s, v14.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v0.4s\n"
+ "mul v12.4s, v12.4s, v0.4s\n"
+ "mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
"109:" // Height 4: skip row sum fixup
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q4, [x10, #0x10]\n"
"add v16.4s, v16.4s, v11.4s\n"
- "ldr q0, [x27, #0x0]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q1, [x27, #0x10]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ldr q3, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q2, [x27, #0x20]\n"
- "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q3, [x27, #0x30]\n"
- "add x27, x27, #0x40\n"
"add v20.4s, v20.4s, v12.4s\n"
- "ld1r { v4.4s }, [x22]\n"
"add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v1.4s }, [x20]\n"
"add v22.4s, v22.4s, v12.4s\n"
"add v23.4s, v23.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
+ "add x10, x10, #0x40\n"
"add v26.4s, v26.4s, v13.4s\n"
"add v27.4s, v27.4s, v13.4s\n"
"add v28.4s, v28.4s, v14.4s\n"
@@ -1740,287 +1739,286 @@ void a64_hybrid_u8qa_dot_4x16 (
"add v30.4s, v30.4s, v14.4s\n"
"add v31.4s, v31.4s, v14.4s\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v18.4s, v18.4s, v2.4s\n"
- "add v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v2.4s\n"
"add v20.4s, v20.4s, v0.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v22.4s, v22.4s, v2.4s\n"
- "add v23.4s, v23.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v2.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v26.4s, v26.4s, v2.4s\n"
- "add v27.4s, v27.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v2.4s\n"
"add v28.4s, v28.4s, v0.4s\n"
- "ld1r { v0.4s }, [x23]\n"
- "add v29.4s, v29.4s, v1.4s\n"
- "add v30.4s, v30.4s, v2.4s\n"
- "add v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v31.4s, v31.4s, v2.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
"tbz %x[flags], #5, 110f\n"
- "and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v17.16b, v0.16b\n"
- "and v6.16b, v18.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v19.16b, v0.16b\n"
- "and v8.16b, v20.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "and v9.16b, v21.16b, v0.16b\n"
+ "and v2.16b, v16.16b, v0.16b\n"
+ "and v1.16b, v17.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v2.4s\n"
+ "sqadd v17.4s, v17.4s, v1.4s\n"
+ "and v7.16b, v18.16b, v0.16b\n"
+ "and v6.16b, v19.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v4.16b, v21.16b, v0.16b\n"
+ "and v3.16b, v22.16b, v0.16b\n"
+ "and v2.16b, v23.16b, v0.16b\n"
+ "and v1.16b, v24.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v10.16b, v22.16b, v0.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "and v4.16b, v23.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v7.4s\n"
- "sqadd v20.4s, v20.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "sqadd v23.4s, v23.4s, v4.4s\n"
- "and v6.16b, v25.16b, v0.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v5.4s\n"
- "and v7.16b, v26.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v8.16b, v27.16b, v0.16b\n"
- "and v9.16b, v28.16b, v0.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "and v10.16b, v29.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "and v4.16b, v30.16b, v0.16b\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "and v5.16b, v31.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v7.4s\n"
+ "sqadd v19.4s, v19.4s, v6.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "and v7.16b, v25.16b, v0.16b\n"
+ "and v6.16b, v26.16b, v0.16b\n"
+ "and v5.16b, v27.16b, v0.16b\n"
+ "and v4.16b, v28.16b, v0.16b\n"
+ "and v3.16b, v29.16b, v0.16b\n"
+ "and v2.16b, v30.16b, v0.16b\n"
+ "and v1.16b, v31.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v9.4s\n"
- "sqadd v29.4s, v29.4s, v10.4s\n"
- "sqadd v30.4s, v30.4s, v4.4s\n"
- "sqadd v31.4s, v31.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v7.4s\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v5.4s\n"
+ "sqadd v28.4s, v28.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v3.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
"110:" // Height 4: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v3.4s }, [x20]\n"
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x22, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x22]\n"
- "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x22]\n"
- "cmp x9, #0x10\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v2.4s }, [x20]\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v1.4s }, [x20]\n"
"srshl v22.4s, v22.4s, v0.4s\n"
"srshl v23.4s, v23.4s, v0.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
- "add v17.4s, v17.4s, v4.4s\n"
- "add v18.4s, v18.4s, v4.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
- "smin v18.4s, v18.4s, v6.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
- "smax v18.4s, v18.4s, v5.4s\n"
- "add v19.4s, v19.4s, v4.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
- "add v22.4s, v22.4s, v4.4s\n"
- "add v23.4s, v23.4s, v4.4s\n"
+ "cmp x9, #0x10\n"
"srshl v24.4s, v24.4s, v0.4s\n"
- "smin v22.4s, v22.4s, v6.4s\n"
- "smin v23.4s, v23.4s, v6.4s\n"
"srshl v25.4s, v25.4s, v0.4s\n"
- "smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
- "add v26.4s, v26.4s, v4.4s\n"
- "add v27.4s, v27.4s, v4.4s\n"
"srshl v28.4s, v28.4s, v0.4s\n"
- "smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
"srshl v29.4s, v29.4s, v0.4s\n"
- "smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
"srshl v30.4s, v30.4s, v0.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
"srshl v31.4s, v31.4s, v0.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
- "add v31.4s, v31.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v3.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "smin v16.4s, v16.4s, v2.4s\n"
+ "smin v17.4s, v17.4s, v2.4s\n"
+ "smin v18.4s, v18.4s, v2.4s\n"
+ "smin v19.4s, v19.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v2.4s\n"
+ "smin v21.4s, v21.4s, v2.4s\n"
+ "smin v22.4s, v22.4s, v2.4s\n"
+ "smin v23.4s, v23.4s, v2.4s\n"
+ "smin v24.4s, v24.4s, v2.4s\n"
+ "smin v25.4s, v25.4s, v2.4s\n"
+ "smin v26.4s, v26.4s, v2.4s\n"
+ "smin v27.4s, v27.4s, v2.4s\n"
+ "smin v28.4s, v28.4s, v2.4s\n"
+ "smin v29.4s, v29.4s, v2.4s\n"
+ "smin v30.4s, v30.4s, v2.4s\n"
+ "smin v31.4s, v31.4s, v2.4s\n"
+ "smax v16.4s, v16.4s, v1.4s\n"
+ "smax v17.4s, v17.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v1.4s\n"
+ "smax v19.4s, v19.4s, v1.4s\n"
+ "smax v20.4s, v20.4s, v1.4s\n"
+ "smax v21.4s, v21.4s, v1.4s\n"
+ "smax v22.4s, v22.4s, v1.4s\n"
+ "smax v23.4s, v23.4s, v1.4s\n"
+ "smax v24.4s, v24.4s, v1.4s\n"
+ "smax v25.4s, v25.4s, v1.4s\n"
+ "smax v26.4s, v26.4s, v1.4s\n"
+ "smax v27.4s, v27.4s, v1.4s\n"
+ "smax v28.4s, v28.4s, v1.4s\n"
+ "smax v29.4s, v29.4s, v1.4s\n"
+ "smax v30.4s, v30.4s, v1.4s\n"
+ "smax v31.4s, v31.4s, v1.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
+ "uzp1 v0.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v19.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
- "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v18.8h, v26.8h, v27.8h\n"
"uzp1 v28.8h, v28.8h, v29.8h\n"
- "uzp1 v29.8h, v30.8h, v31.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
- "uzp1 v20.16b, v20.16b, v21.16b\n"
- "uzp1 v24.16b, v24.16b, v25.16b\n"
- "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "uzp1 v17.8h, v30.8h, v31.8h\n"
+ "uzp1 v16.16b, v16.16b, v0.16b\n"
+ "uzp1 v20.16b, v20.16b, v19.16b\n"
+ "uzp1 v24.16b, v24.16b, v18.16b\n"
+ "uzp1 v28.16b, v28.16b, v17.16b\n"
"bge 119f\n"
"tbz x9, #3, 114f\n"
- "str d16, [x26], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "str d28, [x19], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
"tbz x9, #2, 112f\n"
- "st1 { v16.s }[2], [x26], #0x4\n"
- "st1 { v20.s }[2], [x21], #0x4\n"
- "st1 { v24.s }[2], [x20], #0x4\n"
- "st1 { v28.s }[2], [x19], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
"tbz x9, #1, 111f\n"
- "st1 { v16.h }[6], [x26], #0x2\n"
- "st1 { v20.h }[6], [x21], #0x2\n"
- "st1 { v24.h }[6], [x20], #0x2\n"
- "st1 { v28.h }[6], [x19], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[14], [x26]\n"
- "st1 { v20.b }[14], [x21]\n"
- "st1 { v24.b }[14], [x20]\n"
- "st1 { v28.b }[14], [x19]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "st1 { v20.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "st1 { v28.b }[14], [x21]\n"
"b 118f\n"
"111:" // Height 4: Partial direct writeback: partial_1_12
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[12], [x26]\n"
- "st1 { v20.b }[12], [x21]\n"
- "st1 { v24.b }[12], [x20]\n"
- "st1 { v28.b }[12], [x19]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "st1 { v20.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "st1 { v28.b }[12], [x21]\n"
"b 118f\n"
"112:" // Height 4: Partial direct writeback: partial_2_8
"tbz x9, #1, 113f\n"
- "st1 { v16.h }[4], [x26], #0x2\n"
- "st1 { v20.h }[4], [x21], #0x2\n"
- "st1 { v24.h }[4], [x20], #0x2\n"
- "st1 { v28.h }[4], [x19], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[10], [x26]\n"
- "st1 { v20.b }[10], [x21]\n"
- "st1 { v24.b }[10], [x20]\n"
- "st1 { v28.b }[10], [x19]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "st1 { v20.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "st1 { v28.b }[10], [x21]\n"
"b 118f\n"
"113:" // Height 4: Partial direct writeback: partial_1_8
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[8], [x26]\n"
- "st1 { v20.b }[8], [x21]\n"
- "st1 { v24.b }[8], [x20]\n"
- "st1 { v28.b }[8], [x19]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "st1 { v20.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "st1 { v28.b }[8], [x21]\n"
"b 118f\n"
"114:" // Height 4: Partial direct writeback: partial_4_0
"tbz x9, #2, 116f\n"
- "str s16, [x26], #0x4\n"
- "str s20, [x21], #0x4\n"
- "str s24, [x20], #0x4\n"
- "str s28, [x19], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
"tbz x9, #1, 115f\n"
- "st1 { v16.h }[2], [x26], #0x2\n"
- "st1 { v20.h }[2], [x21], #0x2\n"
- "st1 { v24.h }[2], [x20], #0x2\n"
- "st1 { v28.h }[2], [x19], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[6], [x26]\n"
- "st1 { v20.b }[6], [x21]\n"
- "st1 { v24.b }[6], [x20]\n"
- "st1 { v28.b }[6], [x19]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "st1 { v20.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "st1 { v28.b }[6], [x21]\n"
"b 118f\n"
"115:" // Height 4: Partial direct writeback: partial_1_4
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[4], [x26]\n"
- "st1 { v20.b }[4], [x21]\n"
- "st1 { v24.b }[4], [x20]\n"
- "st1 { v28.b }[4], [x19]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "st1 { v20.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "st1 { v28.b }[4], [x21]\n"
"b 118f\n"
"116:" // Height 4: Partial direct writeback: partial_2_0
"tbz x9, #1, 117f\n"
- "str h16, [x26], #0x2\n"
- "str h20, [x21], #0x2\n"
- "str h24, [x20], #0x2\n"
- "str h28, [x19], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "str h20, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "str h28, [x21], #0x2\n"
"tbz x9, #0, 118f\n"
- "st1 { v16.b }[2], [x26]\n"
- "st1 { v20.b }[2], [x21]\n"
- "st1 { v24.b }[2], [x20]\n"
- "st1 { v28.b }[2], [x19]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "st1 { v28.b }[2], [x21]\n"
"b 118f\n"
"117:" // Height 4: Partial direct writeback: partial_1_0
- "str b16, [x26, #0x0]\n"
- "str b20, [x21, #0x0]\n"
- "str b24, [x20, #0x0]\n"
- "str b28, [x19, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "str b20, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
"118:" // Height 4: Partial direct writeback: Done
"b 120f\n"
"119:" // Height 4: Full writeback
- "str q16, [x26, #0x0]\n"
- "add x26, x26, #0x10\n"
- "str q20, [x21, #0x0]\n"
- "str q24, [x20, #0x0]\n"
- "str q28, [x19, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "str q20, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
"120:" // Height 4: Writeback done
"subs x9, x9, #0x10\n"
"bgt 92b\n"
"subs %x[M], %x[M], #0x4\n"
"beq 122f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 121f\n"
- "add x20, x20, #0x4\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"121:" // Update direct input
- "mov x19, #0x4\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x4\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"122:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
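
For readers tracing the epilogue above: the long sqrdmulh/srshl/add/smin/smax runs implement the Requantize32 per-layer requantization of each 32-bit accumulator before the uzp1 narrowing stores. Below is a minimal scalar sketch, assuming the usual AArch64 semantics of those instructions; the helper names and the standalone function are this annotation's own and are not part of the generated kernel.

#include <algorithm>
#include <climits>
#include <cstdint>

// sqrdmulh: rounding-doubling high half of a 32x32-bit multiply.
static inline int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX; // the one saturating case
    return (int32_t)(((int64_t)a * (int64_t)b + (1 << 30)) >> 31);
}

// srshl with a negative shift amount n: rounding arithmetic shift right.
static inline int32_t rounding_shift_right(int32_t x, int n)
{
    return n == 0 ? x : (int32_t)(((int64_t)x + ((int64_t)1 << (n - 1))) >> n);
}

// One output element, as performed lane-wise by the vector code above.
// "row_sum" is the per-row sum of A already multiplied by -b_offset (v11..v14).
static inline uint8_t requantize(int32_t acc, int32_t row_sum, int32_t col_bias,
                                 int32_t per_layer_mul, int right_shift,
                                 int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = acc + row_sum + col_bias;              // "add v16.4s, v16.4s, v11.4s" etc.
    v = rounding_doubling_high_mul(v, per_layer_mul);  // "sqrdmulh"
    // (When flags bit 5 is set the kernel first applies the and/sshr/sqadd
    //  correction seen above; omitted in this sketch.)
    v = rounding_shift_right(v, right_shift);          // "srshl" by a negative amount
    v += c_offset;                                     // "add ... c_offset"
    v = std::min(v, maxval);                           // "smin ... maxval"
    v = std::max(v, minval);                           // "smax ... minval"
    return (uint8_t)v;                                 // "uzp1" narrowing to bytes
}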
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
new file mode 100644
index 0000000000..17e7405a0a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_u8qa_mmla_4x16( ARGLIST );
+
+class cls_a64_hybrid_u8qa_mmla_4x16
+{
+public:
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static constexpr unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 16, 8> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 47.68 };
+ case CPUModel::A510:
+ return { 28.00 };
+ case CPUModel::V1:
+ return { 62.26 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_u8qa_mmla_4x16;
+ cls_a64_hybrid_u8qa_mmla_4x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
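
Unlike the dot-product kernel above, this class fronts a UMMLA-based kernel: each Armv8.6 I8MM ummla instruction multiplies a 2x8 tile of u8 values by the transpose of another 2x8 tile and accumulates a 2x2 tile of 32-bit results, which is why the generic implementation below pairs two input rows per register with trn1/trn2 and splits the accumulators back out with uzp1. A reference sketch of one instruction follows, assuming the architectural UMMLA semantics; the function name is this annotation's own.

#include <cstdint>

// ummla vd.4s, vn.16b, vm.16b:  D(2x2, u32) += A(2x8, u8) * B(2x8, u8)^T
static void ummla_ref(uint32_t d[4], const uint8_t a[16], const uint8_t b[16])
{
    for (int i = 0; i < 2; ++i)          // two rows of A packed into vn
        for (int j = 0; j < 2; ++j)      // two rows of B packed into vm
            for (int k = 0; k < 8; ++k)  // matches k_unroll() == 8 above
                d[i * 2 + j] += (uint32_t)a[i * 8 + k] * (uint32_t)b[j * 8 + k];
}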
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
new file mode 100644
index 0000000000..1335b355ef
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
@@ -0,0 +1,2098 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8qa_mmla_4x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
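+ // Reader's annotation (inferred from the tbz/tbnz tests in the assembly
+ // below; not produced by the kernel generator):
+ //   bit 3  (0x8)  - indirect input: walk the per-string pointer table.
+ //   bit 5  (0x20) - c_offset > minval: run the sqadd shift-correction pass.
+ //   bit 31        - set after the first column block so the row-sum
+ //                   accumulation and its -b_offset fixup are not redone.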
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 97f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 65f\n"
+ "beq 33f\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v11.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x26, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "cbnz x26, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x25, #0x10\n"
+ "blt 11f\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "cmp x25, #0x20\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ "blt 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v27.2d\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "ldr q25, [x28, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v27.2d\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "ldr q24, [x28, #0x80]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ "ldr q30, [x28, #0x90]\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ "ldr q29, [x28, #0xa0]\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ "ldr q28, [x28, #0xb0]\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ "ldr q27, [x28, #0xc0]\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
+ ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n"
+ ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n"
+ ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n"
+ "tbnz %x[flags], #31, 8f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "bge 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v24.2d\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "ldr q25, [x28, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v24.2d\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "ldr q24, [x28, #0x80]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ "ldr q30, [x28, #0x90]\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ "ldr q29, [x28, #0xa0]\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ "ldr q28, [x28, #0xb0]\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ "ldr q27, [x28, #0xc0]\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
+ "sub x25, x25, #0x10\n"
+ ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n"
+ ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n"
+ "add x24, x24, #0x10\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n"
+ ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n"
+ ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n"
+ "tbnz %x[flags], #31, 10f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ "10:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "11:" // Height 1: Multiply loop: Main loop skip
+ "cbz x25, 20f\n"
+ "cmp x25, #0x8\n"
+ "blt 14f\n"
+ "12:" // Height 1: Multiply loop: Odd block loop
+ "ldr d25, [x24], #0x8\n"
+ "trn1 v0.2d, v25.2d, v24.2d\n"
+ "tbnz %x[flags], #31, 13f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "13:" // Height 1: Multiply loop: unique 3: skip row sum
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n"
+ "sub x25, x25, #0x8\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ "cmp x25, #0x8\n"
+ ".inst 0x6e9aa414 // ummla v20.4s, v0.16b, v26.16b\n"
+ "ldr q27, [x28, #0x40]\n"
+ "ldr q26, [x28, #0x50]\n"
+ ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n"
+ ".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n"
+ ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
+ "bge 12b\n"
+ "14:" // Height 1: Multiply loop: Skip odd blocks
+ "cbz x25, 20f\n"
+ "tbz x25, #2, 16f\n"
+ "ldr s1, [x24], #0x4\n"
+ "tbz x25, #1, 15f\n"
+ "ld1 { v1.h }[2], [x24], #0x2\n"
+ "tbz x25, #0, 18f\n"
+ "ld1 { v1.b }[6], [x24]\n"
+ "b 18f\n"
+ "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x25, #0, 18f\n"
+ "ld1 { v1.b }[4], [x24]\n"
+ "b 18f\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x25, #1, 17f\n"
+ "ldr h1, [x24], #0x2\n"
+ "tbz x25, #0, 18f\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "b 18f\n"
+ "17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x24, #0x0]\n"
+ "18:" // Height 1: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v24.2d\n"
+ "tbnz %x[flags], #31, 19f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "19:" // Height 1: Multiply loop: unique 4: skip row sum
+ "ldr q25, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x10]\n"
+ ".inst 0x6e99a410 // ummla v16.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a414 // ummla v20.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x50]\n"
+ ".inst 0x6e99a412 // ummla v18.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a416 // ummla v22.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
+ "20:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 4b\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "mov v23.16b, v16.16b\n"
+ "tbnz %x[flags], #31, 21f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "neg v16.4s, v16.4s\n"
+ "dup v11.4s, v11.s[0]\n"
+ "mul v11.4s, v11.4s, v16.4s\n"
+ "21:" // Height 1: skip row sum fixup
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q22, [x10, #0x10]\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q21, [x10, #0x20]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v23.4s, v23.4s, v24.4s\n"
+ "add v17.4s, v17.4s, v22.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v19.4s, v19.4s, v20.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v16.4s\n"
+ "add x10, x10, #0x40\n"
+ "sqrdmulh v17.4s, v17.4s, v16.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v16.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v16.4s\n"
+ "tbz %x[flags], #5, 22f\n"
+ "and v22.16b, v23.16b, v0.16b\n"
+ "and v21.16b, v17.16b, v0.16b\n"
+ "and v20.16b, v18.16b, v0.16b\n"
+ "and v16.16b, v19.16b, v0.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v22.4s\n"
+ "sqadd v17.4s, v17.4s, v21.4s\n"
+ "sqadd v18.4s, v18.4s, v20.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "22:" // Height 1: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "add v23.4s, v23.4s, v21.4s\n"
+ "add v17.4s, v17.4s, v21.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v19.4s, v19.4s, v21.4s\n"
+ "cmp x9, #0x10\n"
+ "smin v23.4s, v23.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
+ "smax v23.4s, v23.4s, v16.4s\n"
+ "smax v17.4s, v17.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "uzp1 v23.8h, v23.8h, v17.8h\n"
+ "uzp1 v16.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.16b, v23.16b, v16.16b\n"
+ "bge 31f\n"
+ "tbz x9, #3, 26f\n"
+ "str d23, [x27], #0x8\n"
+ "tbz x9, #2, 24f\n"
+ "st1 { v23.s }[2], [x27], #0x4\n"
+ "tbz x9, #1, 23f\n"
+ "st1 { v23.h }[6], [x27], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[14], [x27]\n"
+ "b 30f\n"
+ "23:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[12], [x27]\n"
+ "b 30f\n"
+ "24:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 25f\n"
+ "st1 { v23.h }[4], [x27], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[10], [x27]\n"
+ "b 30f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[8], [x27]\n"
+ "b 30f\n"
+ "26:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 28f\n"
+ "str s23, [x27], #0x4\n"
+ "tbz x9, #1, 27f\n"
+ "st1 { v23.h }[2], [x27], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[6], [x27]\n"
+ "b 30f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[4], [x27]\n"
+ "b 30f\n"
+ "28:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 29f\n"
+ "str h23, [x27], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[2], [x27]\n"
+ "b 30f\n"
+ "29:" // Height 1: Partial direct writeback: partial_1_0
+ "str b23, [x27, #0x0]\n"
+ "30:" // Height 1: Partial direct writeback: Done
+ "b 32f\n"
+ "31:" // Height 1: Full writeback
+ "str q23, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "32:" // Height 1: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 2b\n"
+ "b 130f\n"
+ "33:" // Height 2
+ "mov x10, %x[col_bias]\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "34:" // Height 2: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "35:" // Height 2: setup done
+ "mov x26, #0x0\n"
+ "36:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 37f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "cbnz x26, 38f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "b 38f\n"
+ "37:" // Height 2: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "38:" // Height 2: input setup done
+ "cmp x25, #0x10\n"
+ "blt 43f\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "cmp x25, #0x20\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ "blt 41f\n"
+ "39:" // Height 2: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "ldr q25, [x28, #0x70]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "ldr q24, [x28, #0x80]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ "ldr q30, [x28, #0x90]\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ "ldr q29, [x28, #0xa0]\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ "ldr q28, [x28, #0xb0]\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ "ldr q27, [x28, #0xc0]\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
+ ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n"
+ ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n"
+ "tbnz %x[flags], #31, 40f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ "40:" // Height 2: Multiply loop: unique 5: skip row sum
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "bge 39b\n"
+ "41:" // Height 2: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "ldr q25, [x28, #0x70]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "ldr q24, [x28, #0x80]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ "ldr q30, [x28, #0x90]\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ "ldr q29, [x28, #0xa0]\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ "ldr q28, [x28, #0xb0]\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ "ldr q27, [x28, #0xc0]\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ "ldr q26, [x28, #0xd0]\n"
+ ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n"
+ "ldr q25, [x28, #0xe0]\n"
+ ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n"
+ "ldr q24, [x28, #0xf0]\n"
+ "sub x25, x25, #0x10\n"
+ ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n"
+ ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n"
+ ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n"
+ "tbnz %x[flags], #31, 42f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ "42:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "43:" // Height 2: Multiply loop: Main loop skip
+ "cbz x25, 52f\n"
+ "cmp x25, #0x8\n"
+ "blt 46f\n"
+ "44:" // Height 2: Multiply loop: Odd block loop
+ "ldr d25, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "trn1 v0.2d, v25.2d, v24.2d\n"
+ "tbnz %x[flags], #31, 45f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "45:" // Height 2: Multiply loop: unique 7: skip row sum
+ "ldr q24, [x28, #0x0]\n"
+ "ldr q26, [x28, #0x10]\n"
+ ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n"
+ "sub x25, x25, #0x8\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ "cmp x25, #0x8\n"
+ ".inst 0x6e9aa414 // ummla v20.4s, v0.16b, v26.16b\n"
+ "ldr q27, [x28, #0x40]\n"
+ "ldr q26, [x28, #0x50]\n"
+ ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n"
+ ".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n"
+ ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
+ "bge 44b\n"
+ "46:" // Height 2: Multiply loop: Skip odd blocks
+ "cbz x25, 52f\n"
+ "tbz x25, #2, 48f\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "tbz x25, #1, 47f\n"
+ "ld1 { v1.h }[2], [x24], #0x2\n"
+ "ld1 { v2.h }[2], [x23], #0x2\n"
+ "tbz x25, #0, 50f\n"
+ "ld1 { v1.b }[6], [x24]\n"
+ "ld1 { v2.b }[6], [x23]\n"
+ "b 50f\n"
+ "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x25, #0, 50f\n"
+ "ld1 { v1.b }[4], [x24]\n"
+ "ld1 { v2.b }[4], [x23]\n"
+ "b 50f\n"
+ "48:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x25, #1, 49f\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "tbz x25, #0, 50f\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "b 50f\n"
+ "49:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "50:" // Height 2: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "tbnz %x[flags], #31, 51f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "51:" // Height 2: Multiply loop: unique 8: skip row sum
+ "ldr q25, [x28, #0x0]\n"
+ "ldr q24, [x28, #0x10]\n"
+ ".inst 0x6e99a410 // ummla v16.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a414 // ummla v20.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x20]\n"
+ "ldr q24, [x28, #0x30]\n"
+ ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x40]\n"
+ "ldr q24, [x28, #0x50]\n"
+ ".inst 0x6e99a412 // ummla v18.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a416 // ummla v22.4s, v0.16b, v24.16b\n"
+ "ldr q25, [x28, #0x60]\n"
+ "ldr q24, [x28, #0x70]\n"
+ ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n"
+ ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n"
+ "add x28, x28, #0x80\n"
+ "52:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 36b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v24.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "mov v23.16b, v24.16b\n"
+ "tbnz %x[flags], #31, 53f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "neg v24.4s, v24.4s\n"
+ "dup v12.4s, v11.s[3]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "mul v11.4s, v11.4s, v24.4s\n"
+ "mul v12.4s, v12.4s, v24.4s\n"
+ "53:" // Height 2: skip row sum fixup
+ "ldr q28, [x10, #0x0]\n"
+ "ldr q27, [x10, #0x10]\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "ldr q26, [x10, #0x20]\n"
+ "ldr q25, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "add v17.4s, v17.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v23.4s, v23.4s, v28.4s\n"
+ "add v20.4s, v20.4s, v27.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v25.4s\n"
+ "add v16.4s, v16.4s, v28.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v17.4s, v17.4s, v27.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v25.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "tbz %x[flags], #5, 54f\n"
+ "and v24.16b, v23.16b, v0.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v24.4s\n"
+ "and v30.16b, v20.16b, v0.16b\n"
+ "and v29.16b, v21.16b, v0.16b\n"
+ "and v28.16b, v22.16b, v0.16b\n"
+ "and v27.16b, v16.16b, v0.16b\n"
+ "and v26.16b, v17.16b, v0.16b\n"
+ "and v25.16b, v18.16b, v0.16b\n"
+ "and v24.16b, v19.16b, v0.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "sqadd v22.4s, v22.4s, v28.4s\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sqadd v17.4s, v17.4s, v26.4s\n"
+ "sqadd v18.4s, v18.4s, v25.4s\n"
+ "sqadd v19.4s, v19.4s, v24.4s\n"
+ "54:" // Height 2: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
+ "add v23.4s, v23.4s, v26.4s\n"
+ "add v20.4s, v20.4s, v26.4s\n"
+ "add v21.4s, v21.4s, v26.4s\n"
+ "add v22.4s, v22.4s, v26.4s\n"
+ "add v16.4s, v16.4s, v26.4s\n"
+ "add v17.4s, v17.4s, v26.4s\n"
+ "add v18.4s, v18.4s, v26.4s\n"
+ "add v19.4s, v19.4s, v26.4s\n"
+ "smin v23.4s, v23.4s, v25.4s\n"
+ "smin v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v25.4s\n"
+ "smin v22.4s, v22.4s, v25.4s\n"
+ "smin v16.4s, v16.4s, v25.4s\n"
+ "smin v17.4s, v17.4s, v25.4s\n"
+ "smin v18.4s, v18.4s, v25.4s\n"
+ "smin v19.4s, v19.4s, v25.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smax v22.4s, v22.4s, v24.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smax v18.4s, v18.4s, v24.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "uzp1 v23.8h, v23.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.16b, v23.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 63f\n"
+ "tbz x9, #3, 58f\n"
+ "str d23, [x27], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x9, #2, 56f\n"
+ "st1 { v23.s }[2], [x27], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "tbz x9, #1, 55f\n"
+ "st1 { v23.h }[6], [x27], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[14], [x27]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "b 62f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[12], [x27]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "b 62f\n"
+ "56:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 57f\n"
+ "st1 { v23.h }[4], [x27], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[10], [x27]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "b 62f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[8], [x27]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "b 62f\n"
+ "58:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 60f\n"
+ "str s23, [x27], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "tbz x9, #1, 59f\n"
+ "st1 { v23.h }[2], [x27], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[6], [x27]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "b 62f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[4], [x27]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "b 62f\n"
+ "60:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 61f\n"
+ "str h23, [x27], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[2], [x27]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "b 62f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_0
+ "str b23, [x27, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "62:" // Height 2: Partial direct writeback: Done
+ "b 64f\n"
+ "63:" // Height 2: Full writeback
+ "str q23, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "str q16, [x23, #0x0]\n"
+ "64:" // Height 2: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 34b\n"
+ "b 130f\n"
+ "65:" // Height 3
+ "mov x10, %x[col_bias]\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "66:" // Height 3: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "67:" // Height 3: setup done
+ "mov x26, #0x0\n"
+ "68:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 69f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x26, 70f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "b 70f\n"
+ "69:" // Height 3: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "70:" // Height 3: input setup done
+ "cmp x25, #0x10\n"
+ "blt 75f\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "cmp x25, #0x20\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "blt 73f\n"
+ "71:" // Height 3: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ "ldr q14, [x28, #0x70]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x28, #0x60]\n"
+ ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x6e85a413 // ummla v19.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45b // ummla v27.4s, v2.16b, v5.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x6e8ea417 // ummla v23.4s, v0.16b, v14.16b\n"
+ ".inst 0x6e8ea45f // ummla v31.4s, v2.16b, v14.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a478 // ummla v24.4s, v3.16b, v4.16b\n"
+ "ldr q4, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e88a479 // ummla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x6e86a436 // ummla v22.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e84a437 // ummla v23.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a47f // ummla v31.4s, v3.16b, v4.16b\n"
+ "tbnz %x[flags], #31, 72f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n"
+ "72:" // Height 3: Multiply loop: unique 9: skip row sum
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "bge 71b\n"
+ "73:" // Height 3: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ "ldr q14, [x28, #0x70]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x28, #0x60]\n"
+ ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ "sub x25, x25, #0x10\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e85a413 // ummla v19.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45b // ummla v27.4s, v2.16b, v5.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x6e8ea417 // ummla v23.4s, v0.16b, v14.16b\n"
+ ".inst 0x6e8ea45f // ummla v31.4s, v2.16b, v14.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a478 // ummla v24.4s, v3.16b, v4.16b\n"
+ "ldr q4, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e88a479 // ummla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x6e86a436 // ummla v22.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e84a437 // ummla v23.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a47f // ummla v31.4s, v3.16b, v4.16b\n"
+ "tbnz %x[flags], #31, 74f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n"
+ "74:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "75:" // Height 3: Multiply loop: Main loop skip
+ "cbz x25, 84f\n"
+ "cmp x25, #0x8\n"
+ "blt 78f\n"
+ "76:" // Height 3: Multiply loop: Odd block loop
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x22], #0x8\n"
+ "trn1 v2.2d, v1.2d, v2.2d\n"
+ "tbnz %x[flags], #31, 77f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "77:" // Height 3: Multiply loop: unique 11: skip row sum
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
+ ".inst 0x6e83a410 // ummla v16.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a458 // ummla v24.4s, v2.16b, v3.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ "sub x25, x25, #0x8\n"
+ "cmp x25, #0x8\n"
+ "ldr q5, [x28, #0x40]\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e81a414 // ummla v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n"
+ "bge 76b\n"
+ "78:" // Height 3: Multiply loop: Skip odd blocks
+ "cbz x25, 84f\n"
+ "tbz x25, #2, 80f\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "tbz x25, #1, 79f\n"
+ "ld1 { v1.h }[2], [x24], #0x2\n"
+ "ld1 { v2.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "tbz x25, #0, 82f\n"
+ "ld1 { v1.b }[6], [x24]\n"
+ "ld1 { v2.b }[6], [x23]\n"
+ "ld1 { v3.b }[6], [x22]\n"
+ "b 82f\n"
+ "79:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x25, #0, 82f\n"
+ "ld1 { v1.b }[4], [x24]\n"
+ "ld1 { v2.b }[4], [x23]\n"
+ "ld1 { v3.b }[4], [x22]\n"
+ "b 82f\n"
+ "80:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x25, #1, 81f\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "tbz x25, #0, 82f\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "ld1 { v3.b }[2], [x22]\n"
+ "b 82f\n"
+ "81:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "ldr b3, [x22, #0x0]\n"
+ "82:" // Height 3: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "tbnz %x[flags], #31, 83f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "83:" // Height 3: Multiply loop: unique 12: skip row sum
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q3, [x28, #0x10]\n"
+ ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x6e83a414 // ummla v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45c // ummla v28.4s, v2.16b, v3.16b\n"
+ "ldr q5, [x28, #0x40]\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e81a411 // ummla v17.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n"
+ "84:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 68b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v0.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "mov v31.16b, v0.16b\n"
+ "tbnz %x[flags], #31, 85f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "neg v23.4s, v23.4s\n"
+ "dup v12.4s, v11.s[3]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "dup v13.4s, v13.s[0]\n"
+ "mul v11.4s, v11.4s, v23.4s\n"
+ "mul v12.4s, v12.4s, v23.4s\n"
+ "mul v13.4s, v13.4s, v23.4s\n"
+ "85:" // Height 3: skip row sum fixup
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q30, [x10, #0x10]\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "ldr q29, [x10, #0x20]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "add v17.4s, v17.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v30.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v28.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v30.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v28.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v23.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v23.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v23.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v23.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v23.4s\n"
+ "tbz %x[flags], #5, 86f\n"
+ "and v1.16b, v31.16b, v0.16b\n"
+ "and v30.16b, v20.16b, v0.16b\n"
+ "and v29.16b, v21.16b, v0.16b\n"
+ "and v28.16b, v22.16b, v0.16b\n"
+ "and v23.16b, v16.16b, v0.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "sqadd v20.4s, v20.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "sqadd v22.4s, v22.4s, v28.4s\n"
+ "sqadd v16.4s, v16.4s, v23.4s\n"
+ "and v3.16b, v17.16b, v0.16b\n"
+ "and v2.16b, v18.16b, v0.16b\n"
+ "and v1.16b, v19.16b, v0.16b\n"
+ "and v30.16b, v24.16b, v0.16b\n"
+ "and v29.16b, v25.16b, v0.16b\n"
+ "and v28.16b, v26.16b, v0.16b\n"
+ "and v23.16b, v27.16b, v0.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v3.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "sqadd v24.4s, v24.4s, v30.4s\n"
+ "sqadd v25.4s, v25.4s, v29.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "sqadd v27.4s, v27.4s, v23.4s\n"
+ "86:" // Height 3: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v29.4s\n"
+ "add v20.4s, v20.4s, v29.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v16.4s, v16.4s, v29.4s\n"
+ "add v17.4s, v17.4s, v29.4s\n"
+ "add v18.4s, v18.4s, v29.4s\n"
+ "add v19.4s, v19.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "smin v31.4s, v31.4s, v28.4s\n"
+ "smin v20.4s, v20.4s, v28.4s\n"
+ "smin v21.4s, v21.4s, v28.4s\n"
+ "smin v22.4s, v22.4s, v28.4s\n"
+ "smin v16.4s, v16.4s, v28.4s\n"
+ "smin v17.4s, v17.4s, v28.4s\n"
+ "smin v18.4s, v18.4s, v28.4s\n"
+ "smin v19.4s, v19.4s, v28.4s\n"
+ "smin v24.4s, v24.4s, v28.4s\n"
+ "smin v25.4s, v25.4s, v28.4s\n"
+ "smin v26.4s, v26.4s, v28.4s\n"
+ "smin v27.4s, v27.4s, v28.4s\n"
+ "smax v31.4s, v31.4s, v23.4s\n"
+ "smax v20.4s, v20.4s, v23.4s\n"
+ "smax v21.4s, v21.4s, v23.4s\n"
+ "smax v22.4s, v22.4s, v23.4s\n"
+ "smax v16.4s, v16.4s, v23.4s\n"
+ "smax v17.4s, v17.4s, v23.4s\n"
+ "smax v18.4s, v18.4s, v23.4s\n"
+ "smax v19.4s, v19.4s, v23.4s\n"
+ "smax v24.4s, v24.4s, v23.4s\n"
+ "smax v25.4s, v25.4s, v23.4s\n"
+ "smax v26.4s, v26.4s, v23.4s\n"
+ "smax v27.4s, v27.4s, v23.4s\n"
+ "uzp1 v31.8h, v31.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v18.8h, v18.8h, v19.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
+ "bge 95f\n"
+ "tbz x9, #3, 90f\n"
+ "str d31, [x27], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x9, #2, 88f\n"
+ "st1 { v31.s }[2], [x27], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x9, #1, 87f\n"
+ "st1 { v31.h }[6], [x27], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v24.h }[6], [x22], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[14], [x27]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v24.b }[14], [x22]\n"
+ "b 94f\n"
+ "87:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[12], [x27]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v24.b }[12], [x22]\n"
+ "b 94f\n"
+ "88:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 89f\n"
+ "st1 { v31.h }[4], [x27], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v24.h }[4], [x22], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[10], [x27]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v24.b }[10], [x22]\n"
+ "b 94f\n"
+ "89:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[8], [x27]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v24.b }[8], [x22]\n"
+ "b 94f\n"
+ "90:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 92f\n"
+ "str s31, [x27], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x9, #1, 91f\n"
+ "st1 { v31.h }[2], [x27], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[6], [x27]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v24.b }[6], [x22]\n"
+ "b 94f\n"
+ "91:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[4], [x27]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v24.b }[4], [x22]\n"
+ "b 94f\n"
+ "92:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 93f\n"
+ "str h31, [x27], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h24, [x22], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[2], [x27]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v24.b }[2], [x22]\n"
+ "b 94f\n"
+ "93:" // Height 3: Partial direct writeback: partial_1_0
+ "str b31, [x27, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b24, [x22, #0x0]\n"
+ "94:" // Height 3: Partial direct writeback: Done
+ "b 96f\n"
+ "95:" // Height 3: Full writeback
+ "str q31, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "str q16, [x23, #0x0]\n"
+ "str q24, [x22, #0x0]\n"
+ "96:" // Height 3: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 66b\n"
+ "b 130f\n"
+ "97:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x4\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "98:" // Height 4: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "99:" // Height 4: setup done
+ "mov x26, #0x0\n"
+ "100:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 101f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
+ "cbnz x26, 102f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
+ "b 102f\n"
+ "101:" // Height 4: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
+ "102:" // Height 4: input setup done
+ "cmp x25, #0x10\n"
+ "blt 107f\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "cmp x25, #0x20\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "blt 105f\n"
+ "103:" // Height 4: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e88a479 // ummla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 104f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n"
+ "104:" // Height 4: Multiply loop: unique 13: skip row sum
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x20\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "bge 103b\n"
+ "105:" // Height 4: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "sub x25, x25, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e88a479 // ummla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 106f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n"
+ "106:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "107:" // Height 4: Multiply loop: Main loop skip
+ "cbz x25, 116f\n"
+ "cmp x25, #0x8\n"
+ "blt 110f\n"
+ "108:" // Height 4: Multiply loop: Odd block loop
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v0.2d\n"
+ "ldr d2, [x22], #0x8\n"
+ "ldr d1, [x21], #0x8\n"
+ "trn1 v2.2d, v2.2d, v1.2d\n"
+ "tbnz %x[flags], #31, 109f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "109:" // Height 4: Multiply loop: unique 15: skip row sum
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q1, [x28, #0x10]\n"
+ ".inst 0x6e83a410 // ummla v16.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a458 // ummla v24.4s, v2.16b, v3.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ "sub x25, x25, #0x8\n"
+ "cmp x25, #0x8\n"
+ "ldr q5, [x28, #0x40]\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e81a414 // ummla v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n"
+ "bge 108b\n"
+ "110:" // Height 4: Multiply loop: Skip odd blocks
+ "cbz x25, 116f\n"
+ "tbz x25, #2, 112f\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s9, [x21], #0x4\n"
+ "tbz x25, #1, 111f\n"
+ "ld1 { v1.h }[2], [x24], #0x2\n"
+ "ld1 { v2.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "tbz x25, #0, 114f\n"
+ "ld1 { v1.b }[6], [x24]\n"
+ "ld1 { v2.b }[6], [x23]\n"
+ "ld1 { v3.b }[6], [x22]\n"
+ "ld1 { v9.b }[6], [x21]\n"
+ "b 114f\n"
+ "111:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x25, #0, 114f\n"
+ "ld1 { v1.b }[4], [x24]\n"
+ "ld1 { v2.b }[4], [x23]\n"
+ "ld1 { v3.b }[4], [x22]\n"
+ "ld1 { v9.b }[4], [x21]\n"
+ "b 114f\n"
+ "112:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x25, #1, 113f\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h9, [x21], #0x2\n"
+ "tbz x25, #0, 114f\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "ld1 { v3.b }[2], [x22]\n"
+ "ld1 { v9.b }[2], [x21]\n"
+ "b 114f\n"
+ "113:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "ldr b3, [x22, #0x0]\n"
+ "ldr b9, [x21, #0x0]\n"
+ "114:" // Height 4: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v2.2d, v3.2d, v9.2d\n"
+ "tbnz %x[flags], #31, 115f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "115:" // Height 4: Multiply loop: unique 16: skip row sum
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q3, [x28, #0x10]\n"
+ ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x6e83a414 // ummla v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45c // ummla v28.4s, v2.16b, v3.16b\n"
+ "ldr q5, [x28, #0x40]\n"
+ "ldr q4, [x28, #0x50]\n"
+ ".inst 0x6e81a411 // ummla v17.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n"
+ "ldr q3, [x28, #0x60]\n"
+ "ldr q1, [x28, #0x70]\n"
+ ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n"
+ "116:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 100b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 v0.2d, v16.2d, v20.2d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "add x21, x22, x20\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "mov v31.16b, v0.16b\n"
+ "tbnz %x[flags], #31, 117f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "neg v0.4s, v0.4s\n"
+ "dup v12.4s, v11.s[3]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "dup v14.4s, v13.s[3]\n"
+ "dup v13.4s, v13.s[0]\n"
+ "mul v11.4s, v11.4s, v0.4s\n"
+ "mul v12.4s, v12.4s, v0.4s\n"
+ "mul v13.4s, v13.4s, v0.4s\n"
+ "mul v14.4s, v14.4s, v0.4s\n"
+ "117:" // Height 4: skip row sum fixup
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q4, [x10, #0x10]\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "ldr q3, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "add v17.4s, v17.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v23.4s, v23.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v3.4s\n"
+ "add v30.4s, v30.4s, v2.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+ "tbz %x[flags], #5, 118f\n"
+ "and v2.16b, v31.16b, v0.16b\n"
+ "and v1.16b, v20.16b, v0.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v2.4s\n"
+ "sqadd v20.4s, v20.4s, v1.4s\n"
+ "and v7.16b, v21.16b, v0.16b\n"
+ "and v6.16b, v22.16b, v0.16b\n"
+ "and v5.16b, v16.16b, v0.16b\n"
+ "and v4.16b, v17.16b, v0.16b\n"
+ "and v3.16b, v18.16b, v0.16b\n"
+ "and v2.16b, v19.16b, v0.16b\n"
+ "and v1.16b, v23.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v7.4s\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v3.4s\n"
+ "sqadd v19.4s, v19.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v1.4s\n"
+ "and v7.16b, v28.16b, v0.16b\n"
+ "and v6.16b, v29.16b, v0.16b\n"
+ "and v5.16b, v30.16b, v0.16b\n"
+ "and v4.16b, v24.16b, v0.16b\n"
+ "and v3.16b, v25.16b, v0.16b\n"
+ "and v2.16b, v26.16b, v0.16b\n"
+ "and v1.16b, v27.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v7.4s\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "sqadd v30.4s, v30.4s, v5.4s\n"
+ "sqadd v24.4s, v24.4s, v4.4s\n"
+ "sqadd v25.4s, v25.4s, v3.4s\n"
+ "sqadd v26.4s, v26.4s, v2.4s\n"
+ "sqadd v27.4s, v27.4s, v1.4s\n"
+ "118:" // Height 4: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "cmp x9, #0x10\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "srshl v29.4s, v29.4s, v0.4s\n"
+ "srshl v30.4s, v30.4s, v0.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v3.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v3.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "smin v31.4s, v31.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v2.4s\n"
+ "smin v21.4s, v21.4s, v2.4s\n"
+ "smin v22.4s, v22.4s, v2.4s\n"
+ "smin v16.4s, v16.4s, v2.4s\n"
+ "smin v17.4s, v17.4s, v2.4s\n"
+ "smin v18.4s, v18.4s, v2.4s\n"
+ "smin v19.4s, v19.4s, v2.4s\n"
+ "smin v23.4s, v23.4s, v2.4s\n"
+ "smin v28.4s, v28.4s, v2.4s\n"
+ "smin v29.4s, v29.4s, v2.4s\n"
+ "smin v30.4s, v30.4s, v2.4s\n"
+ "smin v24.4s, v24.4s, v2.4s\n"
+ "smin v25.4s, v25.4s, v2.4s\n"
+ "smin v26.4s, v26.4s, v2.4s\n"
+ "smin v27.4s, v27.4s, v2.4s\n"
+ "smax v31.4s, v31.4s, v1.4s\n"
+ "smax v20.4s, v20.4s, v1.4s\n"
+ "smax v21.4s, v21.4s, v1.4s\n"
+ "smax v22.4s, v22.4s, v1.4s\n"
+ "smax v16.4s, v16.4s, v1.4s\n"
+ "smax v17.4s, v17.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v1.4s\n"
+ "smax v19.4s, v19.4s, v1.4s\n"
+ "smax v23.4s, v23.4s, v1.4s\n"
+ "smax v28.4s, v28.4s, v1.4s\n"
+ "smax v29.4s, v29.4s, v1.4s\n"
+ "smax v30.4s, v30.4s, v1.4s\n"
+ "smax v24.4s, v24.4s, v1.4s\n"
+ "smax v25.4s, v25.4s, v1.4s\n"
+ "smax v26.4s, v26.4s, v1.4s\n"
+ "smax v27.4s, v27.4s, v1.4s\n"
+ "uzp1 v31.8h, v31.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v19.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.8h, v23.8h, v28.8h\n"
+ "uzp1 v18.8h, v29.8h, v30.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v17.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v19.16b\n"
+ "uzp1 v23.16b, v23.16b, v18.16b\n"
+ "uzp1 v24.16b, v24.16b, v17.16b\n"
+ "bge 127f\n"
+ "tbz x9, #3, 122f\n"
+ "str d31, [x27], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x9, #2, 120f\n"
+ "st1 { v31.s }[2], [x27], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v23.s }[2], [x22], #0x4\n"
+ "st1 { v24.s }[2], [x21], #0x4\n"
+ "tbz x9, #1, 119f\n"
+ "st1 { v31.h }[6], [x27], #0x2\n"
+ "st1 { v16.h }[6], [x23], #0x2\n"
+ "st1 { v23.h }[6], [x22], #0x2\n"
+ "st1 { v24.h }[6], [x21], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[14], [x27]\n"
+ "st1 { v16.b }[14], [x23]\n"
+ "st1 { v23.b }[14], [x22]\n"
+ "st1 { v24.b }[14], [x21]\n"
+ "b 126f\n"
+ "119:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[12], [x27]\n"
+ "st1 { v16.b }[12], [x23]\n"
+ "st1 { v23.b }[12], [x22]\n"
+ "st1 { v24.b }[12], [x21]\n"
+ "b 126f\n"
+ "120:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 121f\n"
+ "st1 { v31.h }[4], [x27], #0x2\n"
+ "st1 { v16.h }[4], [x23], #0x2\n"
+ "st1 { v23.h }[4], [x22], #0x2\n"
+ "st1 { v24.h }[4], [x21], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[10], [x27]\n"
+ "st1 { v16.b }[10], [x23]\n"
+ "st1 { v23.b }[10], [x22]\n"
+ "st1 { v24.b }[10], [x21]\n"
+ "b 126f\n"
+ "121:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[8], [x27]\n"
+ "st1 { v16.b }[8], [x23]\n"
+ "st1 { v23.b }[8], [x22]\n"
+ "st1 { v24.b }[8], [x21]\n"
+ "b 126f\n"
+ "122:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 124f\n"
+ "str s31, [x27], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s23, [x22], #0x4\n"
+ "str s24, [x21], #0x4\n"
+ "tbz x9, #1, 123f\n"
+ "st1 { v31.h }[2], [x27], #0x2\n"
+ "st1 { v16.h }[2], [x23], #0x2\n"
+ "st1 { v23.h }[2], [x22], #0x2\n"
+ "st1 { v24.h }[2], [x21], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[6], [x27]\n"
+ "st1 { v16.b }[6], [x23]\n"
+ "st1 { v23.b }[6], [x22]\n"
+ "st1 { v24.b }[6], [x21]\n"
+ "b 126f\n"
+ "123:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[4], [x27]\n"
+ "st1 { v16.b }[4], [x23]\n"
+ "st1 { v23.b }[4], [x22]\n"
+ "st1 { v24.b }[4], [x21]\n"
+ "b 126f\n"
+ "124:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 125f\n"
+ "str h31, [x27], #0x2\n"
+ "str h16, [x23], #0x2\n"
+ "str h23, [x22], #0x2\n"
+ "str h24, [x21], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[2], [x27]\n"
+ "st1 { v16.b }[2], [x23]\n"
+ "st1 { v23.b }[2], [x22]\n"
+ "st1 { v24.b }[2], [x21]\n"
+ "b 126f\n"
+ "125:" // Height 4: Partial direct writeback: partial_1_0
+ "str b31, [x27, #0x0]\n"
+ "str b16, [x23, #0x0]\n"
+ "str b23, [x22, #0x0]\n"
+ "str b24, [x21, #0x0]\n"
+ "126:" // Height 4: Partial direct writeback: Done
+ "b 128f\n"
+ "127:" // Height 4: Full writeback
+ "str q31, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "str q16, [x23, #0x0]\n"
+ "str q23, [x22, #0x0]\n"
+ "str q24, [x21, #0x0]\n"
+ "128:" // Height 4: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 98b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 130f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 129f\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "129:" // Update direct input
+ "mov x20, #0x4\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "130:" // Exit
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
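
Note on the kernel above: each UMMLA instruction multiplies a 2x8 block of unsigned bytes by a second 2x8 block (transposed) and accumulates a 2x2 int32 tile, which is why the input rows are interleaved in pairs with trn1/trn2 up front and the accumulators are split back into per-row vectors with uzp1/uzp2 before writeback. The epilogue then requantizes: the row sums gathered into v11/v13 are multiplied by -b_offset, the per-column bias is added, and each lane is scaled with sqrdmulh, rounding-shifted with srshl, offset by c_offset and clamped. A minimal scalar sketch of both steps follows, in plain C++; the helper names ummla_2x2 and requantize are illustrative, and the sqrdmulh model omits the saturating INT32_MIN corner case.

    #include <algorithm>
    #include <cstdint>

    // One UMMLA: C(2x2) += A(2x8) * B(2x8)^T, with A and B stored row-major
    // in 16-byte registers and C held as four int32 lanes (row-major 2x2).
    static void ummla_2x2(int32_t c[4], const uint8_t a[16], const uint8_t b[16])
    {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                int32_t acc = 0;
                for (int k = 0; k < 8; k++) {
                    acc += int32_t(a[8 * i + k]) * int32_t(b[8 * j + k]);
                }
                c[2 * i + j] += acc;
            }
        }
    }

    // Epilogue model: row-sum and column-bias correction, fixed-point scale
    // (sqrdmulh), rounding right shift (srshl by a negative amount), output
    // offset, and clamp to the quantized range.
    static uint8_t requantize(int32_t acc, int32_t row_sum, int32_t col_bias,
                              int32_t b_offset, int32_t c_offset, int32_t mul,
                              int right_shift, int32_t minval, int32_t maxval)
    {
        int64_t v = int64_t(acc) - int64_t(row_sum) * b_offset + col_bias;
        v = (2 * v * mul + (int64_t(1) << 31)) >> 32;            // sqrdmulh
        if (right_shift > 0)                                     // srshl by -n
            v = (v + (int64_t(1) << (right_shift - 1))) >> right_shift;
        v += c_offset;
        return uint8_t(std::clamp(int32_t(v), minval, maxval));
    }

The and/sshr/sqadd block guarded by bit 5 of the flags adjusts the rounding of negative values ahead of the shift; the sketch folds rounding into a single add and is a model of the arithmetic, not a drop-in replacement.
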
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
index da07fc17a1..38bb7c646d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
@@ -44,7 +44,8 @@ void a64_hybrid_u8u32_dot_6x16_a55( ARGLIST );
class cls_a64_hybrid_u8u32_dot_6x16
{
public:
- typedef uint8_t operand_type;
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
typedef uint32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,37 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 12.667, 2.0799, 0.2279 };
- default:
- return { 29.6736, 11.4025, 0.5591 };
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.63 };
+ case CPUModel::A510:
+ return { 15.89 };
+ case CPUModel::V1:
+ return { 53.87 };
+ case CPUModel::A55r1:
+ return { 9.217 };
+ }
}
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 9.5238, 2.0799, 0.2279 };
+ default:
+ return { 29.6736, 11.4025, 0.5591 };
+ case CPUModel::A510:
+ return { 16.65, 3.92, 0.48 };
+ case CPUModel::V1:
+ return { 42.62, 16.32, 0.83 };
+ }
+ }
+
+ return { 1.0 };
}
// Default to the generic kernel
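
One reading of the hunk above, hedged: get_performance_parameters is now templated on a type T so the same kernel class can report two cost models — a single MAC-throughput figure when accumulating to uint32_t, and a three-term model (kernel, prepare, merge rates) for the uint8_t path, where requantization adds prepare/merge cost. The sketch below reproduces that selection pattern; the struct layout, member names, and CPUInfo shape are simplified stand-ins, with only the per-CPU numbers taken from the diff itself:

    #include <cstdint>
    #include <type_traits>

    struct PerformanceParameters {
        double kernel_macs_cycle;
        double prepare_bytes_cycle;
        double merge_bytes_cycle;
        PerformanceParameters(double kern, double prep = 0.0, double merge = 0.0)
            : kernel_macs_cycle(kern), prepare_bytes_cycle(prep), merge_bytes_cycle(merge) {}
    };

    enum class CPUModel { GENERIC, A55r1, A510, V1 };
    struct CPUInfo {
        CPUModel model;
        CPUModel get_cpu_model() const { return model; }
    };

    template <typename T>
    PerformanceParameters get_performance_parameters(const CPUInfo *ci)
    {
        if (std::is_same<T, uint32_t>::value) {
            // Plain u8->u32 GEMM: only the MAC-throughput term matters.
            switch (ci->get_cpu_model()) {
                case CPUModel::A55r1: return { 9.217 };
                case CPUModel::A510:  return { 15.89 };
                case CPUModel::V1:    return { 53.87 };
                default:              return { 31.63 };
            }
        }
        if (std::is_same<T, uint8_t>::value) {
            // Requantized path: prepare and merge rates matter too.
            switch (ci->get_cpu_model()) {
                case CPUModel::A55r1: return { 9.5238, 2.0799, 0.2279 };
                case CPUModel::A510:  return { 16.65, 3.92, 0.48 };
                case CPUModel::V1:    return { 42.62, 16.32, 0.83 };
                default:              return { 29.6736, 11.4025, 0.5591 };
            }
        }
        return { 1.0 }; // fallback for other accumulator types
    }

    // Usage sketch: pick the cost model matching the requested output type.
    // CPUInfo ci{CPUModel::A510};
    // auto pp_gemm  = get_performance_parameters<uint32_t>(&ci); // {15.89, 0, 0}
    // auto pp_quant = get_performance_parameters<uint8_t>(&ci);  // {16.65, 3.92, 0.48}

Since the checks are plain std::is_same (not if constexpr), both branches are compiled for every T, which is fine here because every return statement yields a PerformanceParameters.
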
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
index 8833651768..7f0fad7fa7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -77,7 +77,6 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
ka.N = N;
ka.B_ptr = B_ptr;
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 171f\n"
@@ -87,73 +86,73 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"cmp %x[M], #0x2\n"
"bgt 69f\n"
"beq 35f\n"
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x17, %x[output_ptr]\n"
"ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x15, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"tbz %x[flags], #0, 12f\n"
- "cmp x17, #0x10\n"
+ "cmp x8, #0x10\n"
"bge 11f\n"
- "tbz x17, #3, 6f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "tbz x17, #2, 4f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
- "tbz x17, #1, 3f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x15], #0x8\n"
- "tbz x17, #0, 10f\n"
- "ld1 { v11.s }[2], [x15]\n"
+ "tbz x8, #3, 6f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
+ "tbz x8, #2, 4f\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
+ "tbz x8, #1, 3f\n"
+ "ldr d11, [x17], #0x8\n"
+ "mov x25, #0x38\n"
+ "tbz x8, #0, 10f\n"
+ "ld1 { v11.s }[2], [x17]\n"
"b 10f\n"
"3:" // Height 1: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x17, #0, 10f\n"
- "ldr s11, [x15, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x8, #0, 10f\n"
+ "ldr s11, [x17, #0x0]\n"
"b 10f\n"
"4:" // Height 1: Partial accumulate: partial_2_8
- "tbz x17, #1, 5f\n"
- "ldr d10, [x15], #0x8\n"
- "mov x24, #0x28\n"
- "tbz x17, #0, 10f\n"
- "ld1 { v10.s }[2], [x15]\n"
+ "tbz x8, #1, 5f\n"
+ "ldr d10, [x17], #0x8\n"
+ "mov x25, #0x28\n"
+ "tbz x8, #0, 10f\n"
+ "ld1 { v10.s }[2], [x17]\n"
"b 10f\n"
"5:" // Height 1: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x17, #0, 10f\n"
- "ldr s10, [x15, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x8, #0, 10f\n"
+ "ldr s10, [x17, #0x0]\n"
"b 10f\n"
"6:" // Height 1: Partial accumulate: partial_4_0
- "tbz x17, #2, 8f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "tbz x17, #1, 7f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x15], #0x8\n"
- "tbz x17, #0, 10f\n"
- "ld1 { v9.s }[2], [x15]\n"
+ "tbz x8, #2, 8f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "tbz x8, #1, 7f\n"
+ "ldr d9, [x17], #0x8\n"
+ "mov x25, #0x18\n"
+ "tbz x8, #0, 10f\n"
+ "ld1 { v9.s }[2], [x17]\n"
"b 10f\n"
"7:" // Height 1: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x17, #0, 10f\n"
- "ldr s9, [x15, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x8, #0, 10f\n"
+ "ldr s9, [x17, #0x0]\n"
"b 10f\n"
"8:" // Height 1: Partial accumulate: partial_2_0
- "tbz x17, #1, 9f\n"
- "ldr d8, [x15], #0x8\n"
- "mov x24, #0x8\n"
- "tbz x17, #0, 10f\n"
- "ld1 { v8.s }[2], [x15]\n"
+ "tbz x8, #1, 9f\n"
+ "ldr d8, [x17], #0x8\n"
+ "mov x25, #0x8\n"
+ "tbz x8, #0, 10f\n"
+ "ld1 { v8.s }[2], [x17]\n"
"b 10f\n"
"9:" // Height 1: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x24, #0x0\n"
+ "ldr s8, [x17, #0x0]\n"
+ "mov x25, #0x0\n"
"10:" // Height 1: Partial accumulate: Done
- "sub x15, x15, x24\n"
+ "sub x17, x17, x25\n"
"b 13f\n"
"11:" // Height 1: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
"b 13f\n"
"12:" // Height 1: no accumulate
"movi v8.4s, #0x0\n"
@@ -161,329 +160,329 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"movi v10.4s, #0x0\n"
"movi v11.4s, #0x0\n"
"13:" // Height 1: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 15f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "cbnz x14, 16f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "cbnz x15, 16f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20\n"
"b 16f\n"
"15:" // Height 1: setup direct input
- "mov x12, %x[input_ptr]\n"
+ "mov x13, %x[input_ptr]\n"
"16:" // Height 1: input setup done
- "cmp x13, #0x10\n"
+ "cmp x14, #0x10\n"
"blt 19f\n"
- "ldr q0, [x12, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x20\n"
"ldr q6, [x16, #0x0]\n"
- "cmp x13, #0x20\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 18f\n"
"17:" // Height 1: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x10]\n"
- "ldr x11, [x16, #0x18]\n"
- "add x12, x12, #0x10\n"
- "ldr d6, [x16, #0x20]\n"
- "sub x13, x13, #0x10\n"
- "ldr x10, [x16, #0x28]\n"
- "cmp x13, #0x20\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr d17, [x16, #0x20]\n"
+ "ldr x20, [x16, #0x28]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- "ldr x10, [x16, #0x48]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x58]\n"
- "ldr x9, [x12, #0x8]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- "ldr x10, [x16, #0x68]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- "ldr x10, [x16, #0x88]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- "ldr x10, [x16, #0xa8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- "ldr x10, [x16, #0xc8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- "ldr x10, [x16, #0xe8]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0xf0]\n"
+ "ldr d16, [x16, #0x30]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x38]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr d17, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr d16, [x16, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr d17, [x16, #0x60]\n"
+ "ldr x20, [x16, #0x68]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr d16, [x16, #0x70]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x78]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr d17, [x16, #0x80]\n"
+ "ldr x20, [x16, #0x88]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr d16, [x16, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr d17, [x16, #0xa0]\n"
+ "ldr x20, [x16, #0xa8]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr d16, [x16, #0xb0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr d17, [x16, #0xc0]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr d16, [x16, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr d17, [x16, #0xe0]\n"
+ "ldr x20, [x16, #0xe8]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr d16, [x16, #0xf0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "mov v16.d[1], x20\n"
+ "add x13, x13, #0x10\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- "ldr x10, [x16, #0x8]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d0, [x12, #0x0]\n"
- "mov v0.d[1], x9\n"
+ "ldr x20, [x16, #0x8]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ "sub x14, x14, #0x10\n"
+ "ldr d7, [x16, #0x10]\n"
+ "cmp x14, #0x20\n"
+ "ldr x21, [x13, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v0.d[1], x21\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
"bge 17b\n"
"18:" // Height 1: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
- "ldr q6, [x16, #0x20]\n"
- "sub x13, x13, #0x10\n"
- "add x12, x12, #0x10\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x16, #0x40]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x50]\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x16, #0x60]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x16, #0x70]\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x16, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x16, #0x90]\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x16, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x16, #0xb0]\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x16, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x16, #0xd0]\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x16, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x16, #0xf0]\n"
+ "add x13, x13, #0x10\n"
+ "sub x14, x14, #0x10\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
"19:" // Height 1: Multiply loop: Main loop skip
- "cbz x13, 24f\n"
- "cmp x13, #0x4\n"
+ "cbz x14, 24f\n"
+ "cmp x14, #0x4\n"
"blt 21f\n"
"20:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- "cmp x13, #0x4\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
+ "ldr s18, [x13], #0x4\n"
+ "sub x14, x14, #0x4\n"
+ "ldr q16, [x16, #0x0]\n"
+ ".inst 0x6f92e208 // udot v8.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x6f92e209 // udot v9.4s, v16.16b, v18.4b[0]\n"
+ "ldr q17, [x16, #0x20]\n"
+ "cmp x14, #0x4\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x6f92e22a // udot v10.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x6f92e20b // udot v11.4s, v16.16b, v18.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
"bge 20b\n"
- "cbz x13, 24f\n"
"21:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x13, #1, 22f\n"
- "ldr h0, [x12], #0x2\n"
- "tbz x13, #0, 23f\n"
- "ld1 { v0.b }[2], [x12]\n"
+ "cbz x14, 24f\n"
+ "tbz x14, #1, 22f\n"
+ "ldr h0, [x13], #0x2\n"
+ "tbz x14, #0, 23f\n"
+ "ld1 { v0.b }[2], [x13]\n"
"b 23f\n"
"22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x12, #0x0]\n"
+ "ldr b0, [x13, #0x0]\n"
"23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
+ "ldr q16, [x16, #0x0]\n"
+ ".inst 0x6f80e208 // udot v8.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x20]\n"
+ ".inst 0x6f80e20a // udot v10.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
"24:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 14b\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "cmp x17, #0x10\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
"bge 33f\n"
- "tbz x17, #3, 28f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
- "tbz x17, #2, 26f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
- "tbz x17, #1, 25f\n"
- "str d11, [x15], #0x8\n"
- "tbz x17, #0, 32f\n"
- "st1 { v11.s }[2], [x15]\n"
+ "tbz x8, #3, 28f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
+ "tbz x8, #2, 26f\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
+ "tbz x8, #1, 25f\n"
+ "str d11, [x17], #0x8\n"
+ "tbz x8, #0, 32f\n"
+ "st1 { v11.s }[2], [x17]\n"
"b 32f\n"
"25:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x17, #0, 32f\n"
- "str s11, [x15, #0x0]\n"
+ "tbz x8, #0, 32f\n"
+ "str s11, [x17, #0x0]\n"
"b 32f\n"
"26:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x17, #1, 27f\n"
- "str d10, [x15], #0x8\n"
- "tbz x17, #0, 32f\n"
- "st1 { v10.s }[2], [x15]\n"
+ "tbz x8, #1, 27f\n"
+ "str d10, [x17], #0x8\n"
+ "tbz x8, #0, 32f\n"
+ "st1 { v10.s }[2], [x17]\n"
"b 32f\n"
"27:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x17, #0, 32f\n"
- "str s10, [x15, #0x0]\n"
+ "tbz x8, #0, 32f\n"
+ "str s10, [x17, #0x0]\n"
"b 32f\n"
"28:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x17, #2, 30f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "tbz x17, #1, 29f\n"
- "str d9, [x15], #0x8\n"
- "tbz x17, #0, 32f\n"
- "st1 { v9.s }[2], [x15]\n"
+ "tbz x8, #2, 30f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "tbz x8, #1, 29f\n"
+ "str d9, [x17], #0x8\n"
+ "tbz x8, #0, 32f\n"
+ "st1 { v9.s }[2], [x17]\n"
"b 32f\n"
"29:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x17, #0, 32f\n"
- "str s9, [x15, #0x0]\n"
+ "tbz x8, #0, 32f\n"
+ "str s9, [x17, #0x0]\n"
"b 32f\n"
"30:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x17, #1, 31f\n"
- "str d8, [x15], #0x8\n"
- "tbz x17, #0, 32f\n"
- "st1 { v8.s }[2], [x15]\n"
+ "tbz x8, #1, 31f\n"
+ "str d8, [x17], #0x8\n"
+ "tbz x8, #0, 32f\n"
+ "st1 { v8.s }[2], [x17]\n"
"b 32f\n"
"31:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
"32:" // Height 1: Partial direct writeback: Done
"b 34f\n"
"33:" // Height 1: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
"34:" // Height 1: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x8, x8, #0x10\n"
"bgt 2b\n"
"b 206f\n"
"35:" // Height 2
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[output_ptr]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x17, %x[output_ptr]\n"
"ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"36:" // Height 2: Column loop
"tbz %x[flags], #0, 46f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x8, #0x10\n"
+ "add x24, x17, x20, LSL #2\n"
"bge 45f\n"
- "tbz x17, #3, 40f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "tbz x17, #2, 38f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "tbz x17, #1, 37f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x15], #0x8\n"
- "ldr d15, [x23], #0x8\n"
- "tbz x17, #0, 44f\n"
- "ld1 { v11.s }[2], [x15]\n"
- "ld1 { v15.s }[2], [x23]\n"
+ "tbz x8, #3, 40f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "tbz x8, #2, 38f\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "tbz x8, #1, 37f\n"
+ "ldr d11, [x17], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "tbz x8, #0, 44f\n"
+ "ld1 { v11.s }[2], [x17]\n"
+ "ld1 { v15.s }[2], [x24]\n"
"b 44f\n"
"37:" // Height 2: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x17, #0, 44f\n"
- "ldr s11, [x15, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x8, #0, 44f\n"
+ "ldr s11, [x17, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
"b 44f\n"
"38:" // Height 2: Partial accumulate: partial_2_8
- "tbz x17, #1, 39f\n"
- "ldr d10, [x15], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "tbz x17, #0, 44f\n"
- "ld1 { v10.s }[2], [x15]\n"
- "ld1 { v14.s }[2], [x23]\n"
+ "tbz x8, #1, 39f\n"
+ "ldr d10, [x17], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d14, [x24], #0x8\n"
+ "tbz x8, #0, 44f\n"
+ "ld1 { v10.s }[2], [x17]\n"
+ "ld1 { v14.s }[2], [x24]\n"
"b 44f\n"
"39:" // Height 2: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x17, #0, 44f\n"
- "ldr s10, [x15, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x8, #0, 44f\n"
+ "ldr s10, [x17, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
"b 44f\n"
"40:" // Height 2: Partial accumulate: partial_4_0
- "tbz x17, #2, 42f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "tbz x17, #1, 41f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x15], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "tbz x17, #0, 44f\n"
- "ld1 { v9.s }[2], [x15]\n"
- "ld1 { v13.s }[2], [x23]\n"
+ "tbz x8, #2, 42f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "tbz x8, #1, 41f\n"
+ "ldr d9, [x17], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "tbz x8, #0, 44f\n"
+ "ld1 { v9.s }[2], [x17]\n"
+ "ld1 { v13.s }[2], [x24]\n"
"b 44f\n"
"41:" // Height 2: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x17, #0, 44f\n"
- "ldr s9, [x15, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x8, #0, 44f\n"
+ "ldr s9, [x17, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
"b 44f\n"
"42:" // Height 2: Partial accumulate: partial_2_0
- "tbz x17, #1, 43f\n"
- "ldr d8, [x15], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "tbz x17, #0, 44f\n"
- "ld1 { v8.s }[2], [x15]\n"
- "ld1 { v12.s }[2], [x23]\n"
+ "tbz x8, #1, 43f\n"
+ "ldr d8, [x17], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "tbz x8, #0, 44f\n"
+ "ld1 { v8.s }[2], [x17]\n"
+ "ld1 { v12.s }[2], [x24]\n"
"b 44f\n"
"43:" // Height 2: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
"44:" // Height 2: Partial accumulate: Done
- "sub x15, x15, x24\n"
+ "sub x17, x17, x25\n"
"b 47f\n"
"45:" // Height 2: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
"b 47f\n"
"46:" // Height 2: no accumulate
"movi v8.4s, #0x0\n"
@@ -495,428 +494,428 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"movi v14.4s, #0x0\n"
"movi v15.4s, #0x0\n"
"47:" // Height 2: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x14, 50f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "cbnz x15, 50f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
"b 50f\n"
"49:" // Height 2: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21\n"
"50:" // Height 2: input setup done
- "cmp x13, #0x10\n"
+ "cmp x14, #0x10\n"
"blt 53f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x20\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x20\n"
+ "ldr q1, [x12, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 52f\n"
"51:" // Height 2: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x10]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x18]\n"
- "ldr d6, [x16, #0x20]\n"
- "add x12, x12, #0x10\n"
- "ldr x10, [x16, #0x28]\n"
- "add x28, x28, #0x10\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "sub x13, x13, #0x10\n"
+ "ldr d17, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- "ldr x10, [x16, #0x48]\n"
- "cmp x13, #0x20\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x58]\n"
- "ldr x9, [x12, #0x8]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x16, #0x68]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x78]\n"
- "ldr x27, [x28, #0x8]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x16, #0x88]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x16, #0xa8]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x16, #0xc8]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr x10, [x16, #0xe8]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr d16, [x16, #0x30]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr d17, [x16, #0x40]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr x20, [x16, #0x48]\n"
+ ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr d16, [x16, #0x50]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr d17, [x16, #0x60]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr d16, [x16, #0x70]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr d17, [x16, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr x20, [x16, #0x88]\n"
+ ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr d16, [x16, #0x90]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0x98]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr d17, [x16, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr d16, [x16, #0xb0]\n"
+ "mov v17.d[1], x21\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr d17, [x16, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr d16, [x16, #0xd0]\n"
+ "mov v17.d[1], x20\n"
+ "ldr x20, [x16, #0xd8]\n"
+ "mov v16.d[1], x20\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr d17, [x16, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr d16, [x16, #0xf0]\n"
+ "mov v17.d[1], x21\n"
+ "add x13, x13, #0x10\n"
+ "mov v16.d[1], x20\n"
+ "add x12, x12, #0x10\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- "mov v7.d[1], x11\n"
- "ldr x10, [x16, #0x8]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr d0, [x12, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d1, [x28, #0x0]\n"
- "mov v0.d[1], x9\n"
- "mov v1.d[1], x27\n"
+ "ldr x21, [x16, #0x8]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "sub x14, x14, #0x10\n"
+ "ldr d7, [x16, #0x10]\n"
+ "cmp x14, #0x20\n"
+ "ldr x20, [x13, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "ldr x21, [x12, #0x8]\n"
+ "mov v0.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v1.d[1], x21\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v7.d[1], x20\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "add x13, x13, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- "sub x13, x13, #0x10\n"
- "add x12, x12, #0x10\n"
+ "ldr q17, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x12, x12, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ "sub x14, x14, #0x10\n"
+ ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x16, #0x40]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x16, #0x50]\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x16, #0x60]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x16, #0x70]\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x16, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x16, #0x90]\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x16, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x16, #0xb0]\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x16, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x16, #0xd0]\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x16, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x16, #0xf0]\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
"53:" // Height 2: Multiply loop: Main loop skip
- "cbz x13, 58f\n"
- "cmp x13, #0x4\n"
+ "cbz x14, 58f\n"
+ "cmp x14, #0x4\n"
"blt 55f\n"
"54:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "cmp x13, #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s19, [x13], #0x4\n"
+ "sub x14, x14, #0x4\n"
+ "ldr s18, [x12], #0x4\n"
+ "cmp x14, #0x4\n"
+ "ldr q17, [x16, #0x0]\n"
+ ".inst 0x6f93e228 // udot v8.4s, v17.16b, v19.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x6f92e22c // udot v12.4s, v17.16b, v18.4b[0]\n"
+ "ldr q17, [x16, #0x20]\n"
+ ".inst 0x6f93e209 // udot v9.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x6f92e20d // udot v13.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x6f93e22a // udot v10.4s, v17.16b, v19.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f92e22e // udot v14.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x6f93e20b // udot v11.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x6f92e20f // udot v15.4s, v16.16b, v18.4b[0]\n"
"bge 54b\n"
- "cbz x13, 58f\n"
"55:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x13, #1, 56f\n"
- "ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "tbz x13, #0, 57f\n"
- "ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
+ "cbz x14, 58f\n"
+ "tbz x14, #1, 56f\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "tbz x14, #0, 57f\n"
+ "ld1 { v0.b }[2], [x13]\n"
+ "ld1 { v1.b }[2], [x12]\n"
"b 57f\n"
"56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
+ "ldr b0, [x13, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
"57:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q17, [x16, #0x0]\n"
+ ".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n"
+ "ldr q16, [x16, #0x10]\n"
+ ".inst 0x6f81e22c // udot v12.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x16, #0x20]\n"
+ ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e20d // udot v13.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x16, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
"58:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 48b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"bge 67f\n"
- "tbz x17, #3, 62f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "tbz x17, #2, 60f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "tbz x17, #1, 59f\n"
- "str d11, [x15], #0x8\n"
- "str d15, [x23], #0x8\n"
- "tbz x17, #0, 66f\n"
- "st1 { v11.s }[2], [x15]\n"
- "st1 { v15.s }[2], [x23]\n"
+ "tbz x8, #3, 62f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "tbz x8, #2, 60f\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "tbz x8, #1, 59f\n"
+ "str d11, [x17], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "tbz x8, #0, 66f\n"
+ "st1 { v11.s }[2], [x17]\n"
+ "st1 { v15.s }[2], [x24]\n"
"b 66f\n"
"59:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x17, #0, 66f\n"
- "str s11, [x15, #0x0]\n"
- "str s15, [x23, #0x0]\n"
+ "tbz x8, #0, 66f\n"
+ "str s11, [x17, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
"b 66f\n"
"60:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x17, #1, 61f\n"
- "str d10, [x15], #0x8\n"
- "str d14, [x23], #0x8\n"
- "tbz x17, #0, 66f\n"
- "st1 { v10.s }[2], [x15]\n"
- "st1 { v14.s }[2], [x23]\n"
+ "tbz x8, #1, 61f\n"
+ "str d10, [x17], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "tbz x8, #0, 66f\n"
+ "st1 { v10.s }[2], [x17]\n"
+ "st1 { v14.s }[2], [x24]\n"
"b 66f\n"
"61:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x17, #0, 66f\n"
- "str s10, [x15, #0x0]\n"
- "str s14, [x23, #0x0]\n"
+ "tbz x8, #0, 66f\n"
+ "str s10, [x17, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
"b 66f\n"
"62:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x17, #2, 64f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "tbz x17, #1, 63f\n"
- "str d9, [x15], #0x8\n"
- "str d13, [x23], #0x8\n"
- "tbz x17, #0, 66f\n"
- "st1 { v9.s }[2], [x15]\n"
- "st1 { v13.s }[2], [x23]\n"
+ "tbz x8, #2, 64f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "tbz x8, #1, 63f\n"
+ "str d9, [x17], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "tbz x8, #0, 66f\n"
+ "st1 { v9.s }[2], [x17]\n"
+ "st1 { v13.s }[2], [x24]\n"
"b 66f\n"
"63:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x17, #0, 66f\n"
- "str s9, [x15, #0x0]\n"
- "str s13, [x23, #0x0]\n"
+ "tbz x8, #0, 66f\n"
+ "str s9, [x17, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
"b 66f\n"
"64:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x17, #1, 65f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x23], #0x8\n"
- "tbz x17, #0, 66f\n"
- "st1 { v8.s }[2], [x15]\n"
- "st1 { v12.s }[2], [x23]\n"
+ "tbz x8, #1, 65f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "tbz x8, #0, 66f\n"
+ "st1 { v8.s }[2], [x17]\n"
+ "st1 { v12.s }[2], [x24]\n"
"b 66f\n"
"65:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
- "str s12, [x23, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
"66:" // Height 2: Partial direct writeback: Done
"b 68f\n"
"67:" // Height 2: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
"68:" // Height 2: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x8, x8, #0x10\n"
"bgt 36b\n"
"b 206f\n"
"69:" // Height 3
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[output_ptr]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x17, %x[output_ptr]\n"
"ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"70:" // Height 3: Column loop
"tbz %x[flags], #0, 80f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 79f\n"
- "tbz x17, #3, 74f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "tbz x17, #2, 72f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "tbz x17, #1, 71f\n"
- "ldr d11, [x15], #0x8\n"
- "mov x24, #0x38\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "tbz x17, #0, 78f\n"
- "ld1 { v11.s }[2], [x15]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
+ "tbz x8, #3, 74f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x8, #2, 72f\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x8, #1, 71f\n"
+ "ldr d11, [x17], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x8, #0, 78f\n"
+ "ld1 { v11.s }[2], [x17]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
"b 78f\n"
"71:" // Height 3: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x17, #0, 78f\n"
- "ldr s11, [x15, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x8, #0, 78f\n"
+ "ldr s11, [x17, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
"b 78f\n"
"72:" // Height 3: Partial accumulate: partial_2_8
- "tbz x17, #1, 73f\n"
- "ldr d10, [x15], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "tbz x17, #0, 78f\n"
- "ld1 { v10.s }[2], [x15]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
+ "tbz x8, #1, 73f\n"
+ "ldr d10, [x17], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d14, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "tbz x8, #0, 78f\n"
+ "ld1 { v10.s }[2], [x17]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
"b 78f\n"
"73:" // Height 3: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x17, #0, 78f\n"
- "ldr s10, [x15, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x8, #0, 78f\n"
+ "ldr s10, [x17, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
"b 78f\n"
"74:" // Height 3: Partial accumulate: partial_4_0
- "tbz x17, #2, 76f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "tbz x17, #1, 75f\n"
- "ldr d9, [x15], #0x8\n"
- "mov x24, #0x18\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "tbz x17, #0, 78f\n"
- "ld1 { v9.s }[2], [x15]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
+ "tbz x8, #2, 76f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "tbz x8, #1, 75f\n"
+ "ldr d9, [x17], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "tbz x8, #0, 78f\n"
+ "ld1 { v9.s }[2], [x17]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
"b 78f\n"
"75:" // Height 3: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x17, #0, 78f\n"
- "ldr s9, [x15, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x8, #0, 78f\n"
+ "ldr s9, [x17, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
"b 78f\n"
"76:" // Height 3: Partial accumulate: partial_2_0
- "tbz x17, #1, 77f\n"
- "ldr d8, [x15], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "tbz x17, #0, 78f\n"
- "ld1 { v8.s }[2], [x15]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
+ "tbz x8, #1, 77f\n"
+ "ldr d8, [x17], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "tbz x8, #0, 78f\n"
+ "ld1 { v8.s }[2], [x17]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
"b 78f\n"
"77:" // Height 3: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
"78:" // Height 3: Partial accumulate: Done
- "sub x15, x15, x24\n"
+ "sub x17, x17, x25\n"
"b 81f\n"
"79:" // Height 3: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
"b 81f\n"
"80:" // Height 3: no accumulate
"movi v8.4s, #0x0\n"
@@ -932,526 +931,526 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
"81:" // Height 3: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 83f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x14, 84f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "cbnz x15, 84f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
"b 84f\n"
"83:" // Height 3: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
- "add x26, x28, x19\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
"84:" // Height 3: input setup done
- "cmp x13, #0x10\n"
+ "cmp x14, #0x10\n"
"blt 87f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x20\n"
- "ldr q2, [x26, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x20\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 86f\n"
"85:" // Height 3: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x10]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x18]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
- "ldr x10, [x16, #0x28]\n"
- "add x12, x12, #0x10\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "ldr x11, [x16, #0x38]\n"
- "add x28, x28, #0x10\n"
+ "ldr d21, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "mov v21.d[1], x21\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr x10, [x16, #0x48]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x9, [x12, #0x8]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x16, #0x40]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x50]\n"
- "sub x13, x13, #0x10\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x16, #0x68]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr x27, [x28, #0x8]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x16, #0x60]\n"
- "cmp x13, #0x20\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x25, [x26, #0x8]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x16, #0x88]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x16, #0xa8]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x16, #0xc8]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr x10, [x16, #0xe8]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0xf0]\n"
+ "ldr d20, [x16, #0x30]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr d21, [x16, #0x40]\n"
+ ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr d20, [x16, #0x50]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr d21, [x16, #0x60]\n"
+ ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0x88]\n"
+ ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr d20, [x16, #0x70]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr d21, [x16, #0x80]\n"
+ ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr d20, [x16, #0x90]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr d21, [x16, #0xa0]\n"
+ ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr d20, [x16, #0xb0]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
+ ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr d21, [x16, #0xc0]\n"
+ ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr d20, [x16, #0xd0]\n"
+ "mov v20.d[1], x20\n"
+ ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr d21, [x16, #0xe0]\n"
+ ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
+ "mov v21.d[1], x21\n"
+ ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
+ "add x13, x13, #0x10\n"
+ ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr d20, [x16, #0xf0]\n"
+ "mov v20.d[1], x20\n"
+ "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr x10, [x16, #0x8]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
+ ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
+ "ldr x20, [x16, #0x8]\n"
+ ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
+ "ldr x23, [x13, #0x8]\n"
+ ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr d0, [x12, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x28, #0x0]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- "mov v0.d[1], x9\n"
- "ldr d2, [x26, #0x0]\n"
- "mov v1.d[1], x27\n"
- "mov v2.d[1], x25\n"
+ ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ "ldr x22, [x12, #0x8]\n"
+ ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ "sub x14, x14, #0x10\n"
+ "ldr d7, [x16, #0x10]\n"
+ "cmp x14, #0x20\n"
+ "ldr x21, [x11, #0x8]\n"
+ "mov v6.d[1], x20\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v0.d[1], x23\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ "mov v1.d[1], x22\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "mov v2.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v7.d[1], x20\n"
"bge 85b\n"
"86:" // Height 3: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "add x13, x13, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x13, x13, #0x10\n"
+ "add x12, x12, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q21, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x12, x12, #0x10\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "sub x14, x14, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q20, [x16, #0x30]\n"
+ ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x16, #0x40]\n"
+ ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x16, #0x50]\n"
+ ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x16, #0x60]\n"
+ ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x16, #0x70]\n"
+ ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x16, #0x80]\n"
+ ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x16, #0x90]\n"
+ ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x16, #0xa0]\n"
+ ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x16, #0xb0]\n"
+ ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x16, #0xc0]\n"
+ ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x16, #0xd0]\n"
+ ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x16, #0xe0]\n"
+ ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x16, #0xf0]\n"
+ ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n"
"87:" // Height 3: Multiply loop: Main loop skip
- "cbz x13, 92f\n"
- "cmp x13, #0x4\n"
+ "cbz x14, 92f\n"
+ "cmp x14, #0x4\n"
"blt 89f\n"
"88:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "cmp x13, #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s24, [x13], #0x4\n"
+ "sub x14, x14, #0x4\n"
+ "ldr s23, [x12], #0x4\n"
+ "cmp x14, #0x4\n"
+ "ldr s22, [x11], #0x4\n"
+ "ldr q21, [x16, #0x0]\n"
+ ".inst 0x6f98e2a8 // udot v8.4s, v21.16b, v24.4b[0]\n"
+ "ldr q20, [x16, #0x10]\n"
+ ".inst 0x6f97e2ac // udot v12.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x6f96e2b0 // udot v16.4s, v21.16b, v22.4b[0]\n"
+ "ldr q21, [x16, #0x20]\n"
+ ".inst 0x6f98e289 // udot v9.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x6f97e28d // udot v13.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x6f96e291 // udot v17.4s, v20.16b, v22.4b[0]\n"
+ "ldr q20, [x16, #0x30]\n"
+ ".inst 0x6f98e2aa // udot v10.4s, v21.16b, v24.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f97e2ae // udot v14.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x6f96e2b2 // udot v18.4s, v21.16b, v22.4b[0]\n"
+ ".inst 0x6f98e28b // udot v11.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x6f97e28f // udot v15.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x6f96e293 // udot v19.4s, v20.16b, v22.4b[0]\n"
"bge 88b\n"
- "cbz x13, 92f\n"
"89:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x13, #1, 90f\n"
- "ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "tbz x13, #0, 91f\n"
- "ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
+ "cbz x14, 92f\n"
+ "tbz x14, #1, 90f\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "tbz x14, #0, 91f\n"
+ "ld1 { v0.b }[2], [x13]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
"b 91f\n"
"90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
+ "ldr b0, [x13, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
"91:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q21, [x16, #0x0]\n"
+ ".inst 0x6f80e2a8 // udot v8.4s, v21.16b, v0.4b[0]\n"
+ "ldr q20, [x16, #0x10]\n"
+ ".inst 0x6f81e2ac // udot v12.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x6f82e2b0 // udot v16.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x16, #0x20]\n"
+ ".inst 0x6f80e289 // udot v9.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x6f81e28d // udot v13.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e291 // udot v17.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x16, #0x30]\n"
+ ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
+ ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
"92:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 82b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"bge 101f\n"
- "tbz x17, #3, 96f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "tbz x17, #2, 94f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "tbz x17, #1, 93f\n"
- "str d11, [x15], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "tbz x17, #0, 100f\n"
- "st1 { v11.s }[2], [x15]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
+ "tbz x8, #3, 96f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x8, #2, 94f\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x8, #1, 93f\n"
+ "str d11, [x17], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x8, #0, 100f\n"
+ "st1 { v11.s }[2], [x17]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
"b 100f\n"
"93:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x17, #0, 100f\n"
- "str s11, [x15, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
+ "tbz x8, #0, 100f\n"
+ "str s11, [x17, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
"b 100f\n"
"94:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x17, #1, 95f\n"
- "str d10, [x15], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "tbz x17, #0, 100f\n"
- "st1 { v10.s }[2], [x15]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
+ "tbz x8, #1, 95f\n"
+ "str d10, [x17], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x8, #0, 100f\n"
+ "st1 { v10.s }[2], [x17]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
"b 100f\n"
"95:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x17, #0, 100f\n"
- "str s10, [x15, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
+ "tbz x8, #0, 100f\n"
+ "str s10, [x17, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
"b 100f\n"
"96:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x17, #2, 98f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "tbz x17, #1, 97f\n"
- "str d9, [x15], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "tbz x17, #0, 100f\n"
- "st1 { v9.s }[2], [x15]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
+ "tbz x8, #2, 98f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x8, #1, 97f\n"
+ "str d9, [x17], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x8, #0, 100f\n"
+ "st1 { v9.s }[2], [x17]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
"b 100f\n"
"97:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x17, #0, 100f\n"
- "str s9, [x15, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
+ "tbz x8, #0, 100f\n"
+ "str s9, [x17, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
"b 100f\n"
"98:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x17, #1, 99f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "tbz x17, #0, 100f\n"
- "st1 { v8.s }[2], [x15]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
+ "tbz x8, #1, 99f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x8, #0, 100f\n"
+ "st1 { v8.s }[2], [x17]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
"b 100f\n"
"99:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
"100:" // Height 3: Partial direct writeback: Done
"b 102f\n"
"101:" // Height 3: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
"102:" // Height 3: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x8, x8, #0x10\n"
"bgt 70b\n"
"b 206f\n"
"103:" // Height 4
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[output_ptr]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x17, %x[output_ptr]\n"
"ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"104:" // Height 4: Column loop
"tbz %x[flags], #0, 114f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 113f\n"
- "tbz x17, #3, 108f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v21.4s }, [x21], #0x10\n"
- "tbz x17, #2, 106f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "ld1 { v22.4s }, [x21], #0x10\n"
- "tbz x17, #1, 105f\n"
- "ldr d11, [x15], #0x8\n"
- "mov x24, #0x38\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "tbz x17, #0, 112f\n"
- "ld1 { v11.s }[2], [x15]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
- "ld1 { v23.s }[2], [x21]\n"
+ "tbz x8, #3, 108f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x8, #2, 106f\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x8, #1, 105f\n"
+ "ldr d11, [x17], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x8, #0, 112f\n"
+ "ld1 { v11.s }[2], [x17]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
"b 112f\n"
"105:" // Height 4: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x17, #0, 112f\n"
- "ldr s11, [x15, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
- "ldr s23, [x21, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x8, #0, 112f\n"
+ "ldr s11, [x17, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
"b 112f\n"
"106:" // Height 4: Partial accumulate: partial_2_8
- "tbz x17, #1, 107f\n"
- "ldr d10, [x15], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
- "tbz x17, #0, 112f\n"
- "ld1 { v10.s }[2], [x15]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
- "ld1 { v22.s }[2], [x21]\n"
+ "tbz x8, #1, 107f\n"
+ "ldr d10, [x17], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d14, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x8, #0, 112f\n"
+ "ld1 { v10.s }[2], [x17]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
"b 112f\n"
"107:" // Height 4: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x17, #0, 112f\n"
- "ldr s10, [x15, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
- "ldr s22, [x21, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x8, #0, 112f\n"
+ "ldr s10, [x17, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
"b 112f\n"
"108:" // Height 4: Partial accumulate: partial_4_0
- "tbz x17, #2, 110f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "tbz x17, #1, 109f\n"
- "ldr d9, [x15], #0x8\n"
- "mov x24, #0x18\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d21, [x21], #0x8\n"
- "tbz x17, #0, 112f\n"
- "ld1 { v9.s }[2], [x15]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
- "ld1 { v21.s }[2], [x21]\n"
+ "tbz x8, #2, 110f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "tbz x8, #1, 109f\n"
+ "ldr d9, [x17], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x8, #0, 112f\n"
+ "ld1 { v9.s }[2], [x17]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
"b 112f\n"
"109:" // Height 4: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x17, #0, 112f\n"
- "ldr s9, [x15, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
- "ldr s21, [x21, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x8, #0, 112f\n"
+ "ldr s9, [x17, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
"b 112f\n"
"110:" // Height 4: Partial accumulate: partial_2_0
- "tbz x17, #1, 111f\n"
- "ldr d8, [x15], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "tbz x17, #0, 112f\n"
- "ld1 { v8.s }[2], [x15]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
+ "tbz x8, #1, 111f\n"
+ "ldr d8, [x17], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x8, #0, 112f\n"
+ "ld1 { v8.s }[2], [x17]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
"b 112f\n"
"111:" // Height 4: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
"112:" // Height 4: Partial accumulate: Done
- "sub x15, x15, x24\n"
+ "sub x17, x17, x25\n"
"b 115f\n"
"113:" // Height 4: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
- "ldr q20, [x21, #0x0]\n"
- "ldr q21, [x21, #0x10]\n"
- "ldr q22, [x21, #0x20]\n"
- "ldr q23, [x21, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
"b 115f\n"
"114:" // Height 4: no accumulate
"movi v8.4s, #0x0\n"
@@ -1471,624 +1470,624 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
"115:" // Height 4: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 117f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x14, 118f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "cbnz x15, 118f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"b 118f\n"
"117:" // Height 4: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
"118:" // Height 4: input setup done
- "cmp x13, #0x10\n"
+ "cmp x14, #0x10\n"
"blt 121f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x20\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x20\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 120f\n"
"119:" // Height 4: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x10]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x18]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr x10, [x16, #0x28]\n"
+ "add x13, x13, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
- "mov v7.d[1], x11\n"
- "ldr x11, [x16, #0x38]\n"
- "add x12, x12, #0x10\n"
- "add x28, x28, #0x10\n"
+ "ldr d25, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "mov v25.d[1], x21\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "add x12, x12, #0x10\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr x10, [x16, #0x48]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x9, [x12, #0x8]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x27, [x28, #0x8]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x16, #0x68]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr x25, [x26, #0x8]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x70]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x10, [x16, #0x88]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr x23, [x24, #0x8]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x16, #0xa8]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "sub x13, x13, #0x10\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "cmp x13, #0x20\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr x10, [x16, #0xc8]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr x10, [x16, #0xe8]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr d24, [x16, #0x30]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr d25, [x16, #0x40]\n"
+ ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr d24, [x16, #0x50]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
+ "ldr x25, [x13, #0x8]\n"
+ ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr d25, [x16, #0x60]\n"
+ ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0x88]\n"
+ ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
+ "ldr x24, [x12, #0x8]\n"
+ ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr d24, [x16, #0x70]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
+ "ldr x23, [x11, #0x8]\n"
+ ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr d25, [x16, #0x80]\n"
+ ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
+ "ldr x22, [x10, #0x8]\n"
+ ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr d24, [x16, #0x90]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
+ "sub x14, x14, #0x10\n"
+ ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr d25, [x16, #0xa0]\n"
+ ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
+ "cmp x14, #0x20\n"
+ ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr d24, [x16, #0xb0]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
+ ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr d25, [x16, #0xc0]\n"
+ ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr d24, [x16, #0xd0]\n"
+ "mov v24.d[1], x20\n"
+ ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr d25, [x16, #0xe0]\n"
+ ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
+ "mov v25.d[1], x21\n"
+ ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr d24, [x16, #0xf0]\n"
+ "mov v24.d[1], x20\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x10, [x16, #0x8]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
+ ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0x18]\n"
+ ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr d0, [x12, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x28, #0x0]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- "mov v0.d[1], x9\n"
- "mov v1.d[1], x27\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "mov v2.d[1], x25\n"
- "mov v3.d[1], x23\n"
+ ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n"
+ "ldr d3, [x10, #0x0]\n"
+ "ldr d7, [x16, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x25\n"
+ "mov v1.d[1], x24\n"
+ "mov v2.d[1], x23\n"
+ "mov v3.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 119b\n"
"120:" // Height 4: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "add x13, x13, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x13, x13, #0x10\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
"add x12, x12, #0x10\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr q25, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "add x10, x10, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "sub x14, x14, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q24, [x16, #0x30]\n"
+ ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x16, #0x40]\n"
+ ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x16, #0x50]\n"
+ ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x16, #0x60]\n"
+ ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x16, #0x70]\n"
+ ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x16, #0x80]\n"
+ ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x16, #0x90]\n"
+ ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x16, #0xa0]\n"
+ ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x16, #0xb0]\n"
+ ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x16, #0xc0]\n"
+ ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x16, #0xd0]\n"
+ ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x16, #0xe0]\n"
+ ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x16, #0xf0]\n"
+ ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n"
"121:" // Height 4: Multiply loop: Main loop skip
- "cbz x13, 126f\n"
- "cmp x13, #0x4\n"
+ "cbz x14, 126f\n"
+ "cmp x14, #0x4\n"
"blt 123f\n"
"122:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "cmp x13, #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s29, [x13], #0x4\n"
+ "sub x14, x14, #0x4\n"
+ "ldr s28, [x12], #0x4\n"
+ "cmp x14, #0x4\n"
+ "ldr s27, [x11], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr q25, [x16, #0x0]\n"
+ ".inst 0x6f9de328 // udot v8.4s, v25.16b, v29.4b[0]\n"
+ "ldr q24, [x16, #0x10]\n"
+ ".inst 0x6f9ce32c // udot v12.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x6f9be330 // udot v16.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae334 // udot v20.4s, v25.16b, v26.4b[0]\n"
+ "ldr q25, [x16, #0x20]\n"
+ ".inst 0x6f9de309 // udot v9.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x6f9ce30d // udot v13.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x6f9be311 // udot v17.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae315 // udot v21.4s, v24.16b, v26.4b[0]\n"
+ "ldr q24, [x16, #0x30]\n"
+ ".inst 0x6f9de32a // udot v10.4s, v25.16b, v29.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f9ce32e // udot v14.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x6f9be332 // udot v18.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae336 // udot v22.4s, v25.16b, v26.4b[0]\n"
+ ".inst 0x6f9de30b // udot v11.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x6f9ce30f // udot v15.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x6f9be313 // udot v19.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae317 // udot v23.4s, v24.16b, v26.4b[0]\n"
"bge 122b\n"
- "cbz x13, 126f\n"
"123:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x13, #1, 124f\n"
- "ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "tbz x13, #0, 125f\n"
- "ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
+ "cbz x14, 126f\n"
+ "tbz x14, #1, 124f\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "tbz x14, #0, 125f\n"
+ "ld1 { v0.b }[2], [x13]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
+ "ld1 { v3.b }[2], [x10]\n"
"b 125f\n"
"124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
+ "ldr b0, [x13, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
+ "ldr b3, [x10, #0x0]\n"
"125:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q25, [x16, #0x0]\n"
+ ".inst 0x6f80e328 // udot v8.4s, v25.16b, v0.4b[0]\n"
+ "ldr q24, [x16, #0x10]\n"
+ ".inst 0x6f81e32c // udot v12.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f82e330 // udot v16.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x6f83e334 // udot v20.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x16, #0x20]\n"
+ ".inst 0x6f80e309 // udot v9.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30d // udot v13.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x6f82e311 // udot v17.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e315 // udot v21.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x16, #0x30]\n"
+ ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
+ ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
"126:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 116b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"bge 135f\n"
- "tbz x17, #3, 130f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v21.4s }, [x21], #0x10\n"
- "tbz x17, #2, 128f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x21], #0x10\n"
- "tbz x17, #1, 127f\n"
- "str d11, [x15], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "tbz x17, #0, 134f\n"
- "st1 { v11.s }[2], [x15]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
+ "tbz x8, #3, 130f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "tbz x8, #2, 128f\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "tbz x8, #1, 127f\n"
+ "str d11, [x17], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "tbz x8, #0, 134f\n"
+ "st1 { v11.s }[2], [x17]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
"b 134f\n"
"127:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x17, #0, 134f\n"
- "str s11, [x15, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
+ "tbz x8, #0, 134f\n"
+ "str s11, [x17, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
"b 134f\n"
"128:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x17, #1, 129f\n"
- "str d10, [x15], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d22, [x21], #0x8\n"
- "tbz x17, #0, 134f\n"
- "st1 { v10.s }[2], [x15]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v22.s }[2], [x21]\n"
+ "tbz x8, #1, 129f\n"
+ "str d10, [x17], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz x8, #0, 134f\n"
+ "st1 { v10.s }[2], [x17]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
"b 134f\n"
"129:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x17, #0, 134f\n"
- "str s10, [x15, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s22, [x21, #0x0]\n"
+ "tbz x8, #0, 134f\n"
+ "str s10, [x17, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
"b 134f\n"
"130:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x17, #2, 132f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "tbz x17, #1, 131f\n"
- "str d9, [x15], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d21, [x21], #0x8\n"
- "tbz x17, #0, 134f\n"
- "st1 { v9.s }[2], [x15]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v21.s }[2], [x21]\n"
+ "tbz x8, #2, 132f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "tbz x8, #1, 131f\n"
+ "str d9, [x17], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz x8, #0, 134f\n"
+ "st1 { v9.s }[2], [x17]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
"b 134f\n"
"131:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x17, #0, 134f\n"
- "str s9, [x15, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s21, [x21, #0x0]\n"
+ "tbz x8, #0, 134f\n"
+ "str s9, [x17, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
"b 134f\n"
"132:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x17, #1, 133f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "tbz x17, #0, 134f\n"
- "st1 { v8.s }[2], [x15]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v20.s }[2], [x21]\n"
+ "tbz x8, #1, 133f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz x8, #0, 134f\n"
+ "st1 { v8.s }[2], [x17]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
"b 134f\n"
"133:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s20, [x21, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
"134:" // Height 4: Partial direct writeback: Done
"b 136f\n"
"135:" // Height 4: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q20, [x21, #0x0]\n"
- "str q21, [x21, #0x10]\n"
- "str q22, [x21, #0x20]\n"
- "str q23, [x21, #0x30]\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
"136:" // Height 4: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x8, x8, #0x10\n"
"bgt 104b\n"
"b 206f\n"
"137:" // Height 5
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[output_ptr]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x17, %x[output_ptr]\n"
"ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"138:" // Height 5: Column loop
"tbz %x[flags], #0, 148f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 147f\n"
- "tbz x17, #3, 142f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v21.4s }, [x21], #0x10\n"
- "ld1 { v25.4s }, [x20], #0x10\n"
- "tbz x17, #2, 140f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "ld1 { v22.4s }, [x21], #0x10\n"
- "ld1 { v26.4s }, [x20], #0x10\n"
- "tbz x17, #1, 139f\n"
- "ldr d11, [x15], #0x8\n"
- "mov x24, #0x38\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "ldr d27, [x20], #0x8\n"
- "tbz x17, #0, 146f\n"
- "ld1 { v11.s }[2], [x15]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
- "ld1 { v23.s }[2], [x21]\n"
- "ld1 { v27.s }[2], [x20]\n"
+ "tbz x8, #3, 142f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "tbz x8, #2, 140f\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x8, #1, 139f\n"
+ "ldr d11, [x17], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x8, #0, 146f\n"
+ "ld1 { v11.s }[2], [x17]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
"b 146f\n"
"139:" // Height 5: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x17, #0, 146f\n"
- "ldr s11, [x15, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
- "ldr s23, [x21, #0x0]\n"
- "ldr s27, [x20, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x8, #0, 146f\n"
+ "ldr s11, [x17, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
"b 146f\n"
"140:" // Height 5: Partial accumulate: partial_2_8
- "tbz x17, #1, 141f\n"
- "ldr d10, [x15], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
- "ldr d26, [x20], #0x8\n"
- "tbz x17, #0, 146f\n"
- "ld1 { v10.s }[2], [x15]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
- "ld1 { v22.s }[2], [x21]\n"
- "ld1 { v26.s }[2], [x20]\n"
+ "tbz x8, #1, 141f\n"
+ "ldr d10, [x17], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d14, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x8, #0, 146f\n"
+ "ld1 { v10.s }[2], [x17]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
"b 146f\n"
"141:" // Height 5: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x17, #0, 146f\n"
- "ldr s10, [x15, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
- "ldr s22, [x21, #0x0]\n"
- "ldr s26, [x20, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x8, #0, 146f\n"
+ "ldr s10, [x17, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
"b 146f\n"
"142:" // Height 5: Partial accumulate: partial_4_0
- "tbz x17, #2, 144f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "tbz x17, #1, 143f\n"
- "ldr d9, [x15], #0x8\n"
- "mov x24, #0x18\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d21, [x21], #0x8\n"
- "ldr d25, [x20], #0x8\n"
- "tbz x17, #0, 146f\n"
- "ld1 { v9.s }[2], [x15]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
- "ld1 { v21.s }[2], [x21]\n"
- "ld1 { v25.s }[2], [x20]\n"
+ "tbz x8, #2, 144f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "tbz x8, #1, 143f\n"
+ "ldr d9, [x17], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "tbz x8, #0, 146f\n"
+ "ld1 { v9.s }[2], [x17]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
"b 146f\n"
"143:" // Height 5: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x17, #0, 146f\n"
- "ldr s9, [x15, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
- "ldr s21, [x21, #0x0]\n"
- "ldr s25, [x20, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x8, #0, 146f\n"
+ "ldr s9, [x17, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
"b 146f\n"
"144:" // Height 5: Partial accumulate: partial_2_0
- "tbz x17, #1, 145f\n"
- "ldr d8, [x15], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "ldr d24, [x20], #0x8\n"
- "tbz x17, #0, 146f\n"
- "ld1 { v8.s }[2], [x15]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
- "ld1 { v24.s }[2], [x20]\n"
+ "tbz x8, #1, 145f\n"
+ "ldr d8, [x17], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "tbz x8, #0, 146f\n"
+ "ld1 { v8.s }[2], [x17]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
"b 146f\n"
"145:" // Height 5: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
- "ldr s24, [x20, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
"146:" // Height 5: Partial accumulate: Done
- "sub x15, x15, x24\n"
+ "sub x17, x17, x25\n"
"b 149f\n"
"147:" // Height 5: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
- "ldr q20, [x21, #0x0]\n"
- "ldr q21, [x21, #0x10]\n"
- "ldr q22, [x21, #0x20]\n"
- "ldr q23, [x21, #0x30]\n"
- "ldr q24, [x20, #0x0]\n"
- "ldr q25, [x20, #0x10]\n"
- "ldr q26, [x20, #0x20]\n"
- "ldr q27, [x20, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
"b 149f\n"
"148:" // Height 5: no accumulate
"movi v8.4s, #0x0\n"
@@ -2112,725 +2111,725 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
"149:" // Height 5: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 151f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x14, 152f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
+ "cbnz x15, 152f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "add x9, x9, x20\n"
"b 152f\n"
"151:" // Height 5: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
"152:" // Height 5: input setup done
- "cmp x13, #0x10\n"
+ "cmp x14, #0x10\n"
"blt 155f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x20\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x20\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 154f\n"
"153:" // Height 5: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x10]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x18]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr x10, [x16, #0x28]\n"
+ "add x13, x13, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "mov v7.d[1], x11\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "ldr d29, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "mov v29.d[1], x21\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr d6, [x16, #0x20]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr x9, [x12, #0x8]\n"
+ "add x10, x10, #0x10\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "mov v6.d[1], x10\n"
- "ldr d7, [x16, #0x30]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr x10, [x16, #0x48]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr d6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr x27, [x28, #0x8]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr x25, [x26, #0x8]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x10, [x16, #0x68]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr d7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
- ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr d6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr x23, [x24, #0x8]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr d7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr x10, [x16, #0x88]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr x21, [x22, #0x8]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
- ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr d6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "sub x13, x13, #0x10\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "cmp x13, #0x20\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr x10, [x16, #0xa8]\n"
- ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr d7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
- ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr d6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr x10, [x16, #0xc8]\n"
- ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr d7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
- ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr d6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr x10, [x16, #0xe8]\n"
- ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr d7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
- ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr d6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr d7, [x16, #0xf0]\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr d28, [x16, #0x30]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr d29, [x16, #0x40]\n"
+ ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ "ldr x21, [x16, #0x68]\n"
+ ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
+ "ldr x25, [x12, #0x8]\n"
+ ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
+ "ldr x24, [x11, #0x8]\n"
+ ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr d28, [x16, #0x50]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x78]\n"
+ ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
+ "ldr x23, [x10, #0x8]\n"
+ ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
+ "ldr x22, [x9, #0x8]\n"
+ ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr d29, [x16, #0x60]\n"
+ ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0x88]\n"
+ ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
+ "sub x14, x14, #0x10\n"
+ ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
+ "cmp x14, #0x20\n"
+ ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr d28, [x16, #0x70]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
+ ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr d29, [x16, #0x80]\n"
+ ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr d28, [x16, #0x90]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr d29, [x16, #0xa0]\n"
+ ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr d28, [x16, #0xb0]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
+ ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr d29, [x16, #0xc0]\n"
+ ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr d28, [x16, #0xd0]\n"
+ "mov v28.d[1], x20\n"
+ ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr d29, [x16, #0xe0]\n"
+ ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
+ "mov v29.d[1], x21\n"
+ ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr d28, [x16, #0xf0]\n"
+ "mov v28.d[1], x20\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x10, [x16, #0x8]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
+ ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0x18]\n"
+ ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr d0, [x12, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x28, #0x0]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- "mov v0.d[1], x9\n"
- ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
- "mov v1.d[1], x27\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "ldr d4, [x22, #0x0]\n"
- "mov v2.d[1], x25\n"
+ ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n"
+ "ldr d0, [x13, #0x0]\n"
+ ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n"
+ "ldr d1, [x12, #0x0]\n"
+ ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n"
+ "ldr d2, [x11, #0x0]\n"
+ ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n"
+ "ldr d3, [x10, #0x0]\n"
+ ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n"
+ "ldr d4, [x9, #0x0]\n"
+ "ldr d7, [x16, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x26\n"
+ "mov v1.d[1], x25\n"
+ "mov v2.d[1], x24\n"
"mov v3.d[1], x23\n"
- "mov v4.d[1], x21\n"
+ "mov v4.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 153b\n"
"154:" // Height 5: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "add x13, x13, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x13, x13, #0x10\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
"add x12, x12, #0x10\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
+ "ldr q29, [x16, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "sub x14, x14, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x16, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x16, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x16, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x16, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x16, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x16, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x16, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x16, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x16, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x16, #0xf0]\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q28, [x16, #0x30]\n"
+ ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x16, #0x40]\n"
+ ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x16, #0x50]\n"
+ ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x16, #0x60]\n"
+ ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x16, #0x70]\n"
+ ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x16, #0x80]\n"
+ ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x16, #0x90]\n"
+ ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x16, #0xa0]\n"
+ ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x16, #0xb0]\n"
+ ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x16, #0xc0]\n"
+ ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x16, #0xd0]\n"
+ ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x16, #0xe0]\n"
+ ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x16, #0xf0]\n"
+ ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
"add x16, x16, #0x100\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n"
"155:" // Height 5: Multiply loop: Main loop skip
- "cbz x13, 160f\n"
- "cmp x13, #0x4\n"
+ "cbz x14, 160f\n"
+ "cmp x14, #0x4\n"
"blt 157f\n"
"156:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "cmp x13, #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s2, [x13], #0x4\n"
+ "sub x14, x14, #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "cmp x14, #0x4\n"
+ "ldr s0, [x11], #0x4\n"
+ "ldr s31, [x10], #0x4\n"
+ "ldr s30, [x9], #0x4\n"
+ "ldr q29, [x16, #0x0]\n"
+ ".inst 0x6f82e3a8 // udot v8.4s, v29.16b, v2.4b[0]\n"
+ "ldr q28, [x16, #0x10]\n"
+ ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f80e3b0 // udot v16.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe3b4 // udot v20.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee3b8 // udot v24.4s, v29.16b, v30.4b[0]\n"
+ "ldr q29, [x16, #0x20]\n"
+ ".inst 0x6f82e389 // udot v9.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f80e391 // udot v17.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe395 // udot v21.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee399 // udot v25.4s, v28.16b, v30.4b[0]\n"
+ "ldr q28, [x16, #0x30]\n"
+ ".inst 0x6f82e3aa // udot v10.4s, v29.16b, v2.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe3b6 // udot v22.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee3ba // udot v26.4s, v29.16b, v30.4b[0]\n"
+ ".inst 0x6f82e38b // udot v11.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe397 // udot v23.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee39b // udot v27.4s, v28.16b, v30.4b[0]\n"
"bge 156b\n"
- "cbz x13, 160f\n"
"157:" // Height 5: Multiply loop: Skip odd blocks
- "tbz x13, #1, 158f\n"
- "ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "tbz x13, #0, 159f\n"
- "ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x22]\n"
+ "cbz x14, 160f\n"
+ "tbz x14, #1, 158f\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "ldr h4, [x9], #0x2\n"
+ "tbz x14, #0, 159f\n"
+ "ld1 { v0.b }[2], [x13]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
+ "ld1 { v3.b }[2], [x10]\n"
+ "ld1 { v4.b }[2], [x9]\n"
"b 159f\n"
"158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
+ "ldr b0, [x13, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
+ "ldr b3, [x10, #0x0]\n"
+ "ldr b4, [x9, #0x0]\n"
"159:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q29, [x16, #0x0]\n"
+ ".inst 0x6f80e3a8 // udot v8.4s, v29.16b, v0.4b[0]\n"
+ "ldr q28, [x16, #0x10]\n"
+ ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3b0 // udot v16.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f83e3b4 // udot v20.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x6f84e3b8 // udot v24.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x16, #0x20]\n"
+ ".inst 0x6f80e389 // udot v9.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e391 // udot v17.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e395 // udot v21.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e399 // udot v25.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x16, #0x30]\n"
+ ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
+ ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
"160:" // Height 5: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 150b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19, LSL #2\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"bge 169f\n"
- "tbz x17, #3, 164f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v21.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "st1 { v25.4s }, [x20], #0x10\n"
- "tbz x17, #2, 162f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x21], #0x10\n"
- "st1 { v26.4s }, [x20], #0x10\n"
- "tbz x17, #1, 161f\n"
- "str d11, [x15], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "str d27, [x20], #0x8\n"
- "tbz x17, #0, 168f\n"
- "st1 { v11.s }[2], [x15]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
- "st1 { v27.s }[2], [x20]\n"
+ "tbz x8, #3, 164f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "tbz x8, #2, 162f\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x8, #1, 161f\n"
+ "str d11, [x17], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x8, #0, 168f\n"
+ "st1 { v11.s }[2], [x17]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
"b 168f\n"
"161:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x17, #0, 168f\n"
- "str s11, [x15, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
- "str s27, [x20, #0x0]\n"
+ "tbz x8, #0, 168f\n"
+ "str s11, [x17, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
"b 168f\n"
"162:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x17, #1, 163f\n"
- "str d10, [x15], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d22, [x21], #0x8\n"
- "str d26, [x20], #0x8\n"
- "tbz x17, #0, 168f\n"
- "st1 { v10.s }[2], [x15]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v22.s }[2], [x21]\n"
- "st1 { v26.s }[2], [x20]\n"
+ "tbz x8, #1, 163f\n"
+ "str d10, [x17], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x8, #0, 168f\n"
+ "st1 { v10.s }[2], [x17]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
"b 168f\n"
"163:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x17, #0, 168f\n"
- "str s10, [x15, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s22, [x21, #0x0]\n"
- "str s26, [x20, #0x0]\n"
+ "tbz x8, #0, 168f\n"
+ "str s10, [x17, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
"b 168f\n"
"164:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x17, #2, 166f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "tbz x17, #1, 165f\n"
- "str d9, [x15], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d21, [x21], #0x8\n"
- "str d25, [x20], #0x8\n"
- "tbz x17, #0, 168f\n"
- "st1 { v9.s }[2], [x15]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v21.s }[2], [x21]\n"
- "st1 { v25.s }[2], [x20]\n"
+ "tbz x8, #2, 166f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x8, #1, 165f\n"
+ "str d9, [x17], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x8, #0, 168f\n"
+ "st1 { v9.s }[2], [x17]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
"b 168f\n"
"165:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x17, #0, 168f\n"
- "str s9, [x15, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s21, [x21, #0x0]\n"
- "str s25, [x20, #0x0]\n"
+ "tbz x8, #0, 168f\n"
+ "str s9, [x17, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
"b 168f\n"
"166:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x17, #1, 167f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "tbz x17, #0, 168f\n"
- "st1 { v8.s }[2], [x15]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v20.s }[2], [x21]\n"
- "st1 { v24.s }[2], [x20]\n"
+ "tbz x8, #1, 167f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x8, #0, 168f\n"
+ "st1 { v8.s }[2], [x17]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
"b 168f\n"
"167:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s20, [x21, #0x0]\n"
- "str s24, [x20, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
"168:" // Height 5: Partial direct writeback: Done
"b 170f\n"
"169:" // Height 5: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q20, [x21, #0x0]\n"
- "str q21, [x21, #0x10]\n"
- "str q22, [x21, #0x20]\n"
- "str q23, [x21, #0x30]\n"
- "str q24, [x20, #0x0]\n"
- "str q25, [x20, #0x10]\n"
- "str q26, [x20, #0x20]\n"
- "str q27, [x20, #0x30]\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
"170:" // Height 5: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x8, x8, #0x10\n"
"bgt 138b\n"
"b 206f\n"
"171:" // Height 6
- "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x15, %x[output_ptr]\n"
- "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x18\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x17, %x[output_ptr]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"172:" // Height 6: Column loop
"tbz %x[flags], #0, 182f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
- "add x19, x20, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "add x20, x21, x20, LSL #2\n"
"bge 181f\n"
- "tbz x17, #3, 176f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x15], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v28.4s }, [x19], #0x10\n"
- "ld1 { v21.4s }, [x21], #0x10\n"
- "ld1 { v25.4s }, [x20], #0x10\n"
- "ld1 { v29.4s }, [x19], #0x10\n"
- "tbz x17, #2, 174f\n"
- "ld1 { v10.4s }, [x15], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "ld1 { v22.4s }, [x21], #0x10\n"
- "ld1 { v26.4s }, [x20], #0x10\n"
- "ld1 { v30.4s }, [x19], #0x10\n"
- "tbz x17, #1, 173f\n"
- "ldr d11, [x15], #0x8\n"
- "mov x24, #0x38\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "ldr d27, [x20], #0x8\n"
- "ldr d31, [x19], #0x8\n"
- "tbz x17, #0, 180f\n"
- "ld1 { v11.s }[2], [x15]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
- "ld1 { v23.s }[2], [x21]\n"
- "ld1 { v27.s }[2], [x20]\n"
- "ld1 { v31.s }[2], [x19]\n"
+ "tbz x8, #3, 176f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v9.4s }, [x17], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v29.4s }, [x20], #0x10\n"
+ "tbz x8, #2, 174f\n"
+ "ld1 { v10.4s }, [x17], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v30.4s }, [x20], #0x10\n"
+ "tbz x8, #1, 173f\n"
+ "ldr d11, [x17], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x8, #0, 180f\n"
+ "ld1 { v11.s }[2], [x17]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "ld1 { v31.s }[2], [x20]\n"
"b 180f\n"
"173:" // Height 6: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x17, #0, 180f\n"
- "ldr s11, [x15, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
- "ldr s23, [x21, #0x0]\n"
- "ldr s27, [x20, #0x0]\n"
- "ldr s31, [x19, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x8, #0, 180f\n"
+ "ldr s11, [x17, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "ldr s31, [x20, #0x0]\n"
"b 180f\n"
"174:" // Height 6: Partial accumulate: partial_2_8
- "tbz x17, #1, 175f\n"
- "ldr d10, [x15], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
- "ldr d26, [x20], #0x8\n"
- "ldr d30, [x19], #0x8\n"
- "tbz x17, #0, 180f\n"
- "ld1 { v10.s }[2], [x15]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
- "ld1 { v22.s }[2], [x21]\n"
- "ld1 { v26.s }[2], [x20]\n"
- "ld1 { v30.s }[2], [x19]\n"
+ "tbz x8, #1, 175f\n"
+ "ldr d10, [x17], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d14, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
+ "tbz x8, #0, 180f\n"
+ "ld1 { v10.s }[2], [x17]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "ld1 { v30.s }[2], [x20]\n"
"b 180f\n"
"175:" // Height 6: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x17, #0, 180f\n"
- "ldr s10, [x15, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
- "ldr s22, [x21, #0x0]\n"
- "ldr s26, [x20, #0x0]\n"
- "ldr s30, [x19, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x8, #0, 180f\n"
+ "ldr s10, [x17, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "ldr s30, [x20, #0x0]\n"
"b 180f\n"
"176:" // Height 6: Partial accumulate: partial_4_0
- "tbz x17, #2, 178f\n"
- "ld1 { v8.4s }, [x15], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v28.4s }, [x19], #0x10\n"
- "tbz x17, #1, 177f\n"
- "ldr d9, [x15], #0x8\n"
- "mov x24, #0x18\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d21, [x21], #0x8\n"
- "ldr d25, [x20], #0x8\n"
- "ldr d29, [x19], #0x8\n"
- "tbz x17, #0, 180f\n"
- "ld1 { v9.s }[2], [x15]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
- "ld1 { v21.s }[2], [x21]\n"
- "ld1 { v25.s }[2], [x20]\n"
- "ld1 { v29.s }[2], [x19]\n"
+ "tbz x8, #2, 178f\n"
+ "ld1 { v8.4s }, [x17], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x8, #1, 177f\n"
+ "ldr d9, [x17], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "ldr d29, [x20], #0x8\n"
+ "tbz x8, #0, 180f\n"
+ "ld1 { v9.s }[2], [x17]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v29.s }[2], [x20]\n"
"b 180f\n"
"177:" // Height 6: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x17, #0, 180f\n"
- "ldr s9, [x15, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
- "ldr s21, [x21, #0x0]\n"
- "ldr s25, [x20, #0x0]\n"
- "ldr s29, [x19, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x8, #0, 180f\n"
+ "ldr s9, [x17, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "ldr s29, [x20, #0x0]\n"
"b 180f\n"
"178:" // Height 6: Partial accumulate: partial_2_0
- "tbz x17, #1, 179f\n"
- "ldr d8, [x15], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "ldr d24, [x20], #0x8\n"
- "ldr d28, [x19], #0x8\n"
- "tbz x17, #0, 180f\n"
- "ld1 { v8.s }[2], [x15]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
- "ld1 { v24.s }[2], [x20]\n"
- "ld1 { v28.s }[2], [x19]\n"
+ "tbz x8, #1, 179f\n"
+ "ldr d8, [x17], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "tbz x8, #0, 180f\n"
+ "ld1 { v8.s }[2], [x17]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
+ "ld1 { v28.s }[2], [x20]\n"
"b 180f\n"
"179:" // Height 6: Partial accumulate: partial_1_0
- "ldr s8, [x15, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
- "ldr s24, [x20, #0x0]\n"
- "ldr s28, [x19, #0x0]\n"
+ "ldr s8, [x17, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
+ "ldr s28, [x20, #0x0]\n"
"180:" // Height 6: Partial accumulate: Done
- "sub x15, x15, x24\n"
+ "sub x17, x17, x25\n"
"b 183f\n"
"181:" // Height 6: full accumulate
- "ldr q8, [x15, #0x0]\n"
- "ldr q9, [x15, #0x10]\n"
- "ldr q10, [x15, #0x20]\n"
- "ldr q11, [x15, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
- "ldr q20, [x21, #0x0]\n"
- "ldr q21, [x21, #0x10]\n"
- "ldr q22, [x21, #0x20]\n"
- "ldr q23, [x21, #0x30]\n"
- "ldr q24, [x20, #0x0]\n"
- "ldr q25, [x20, #0x10]\n"
- "ldr q26, [x20, #0x20]\n"
- "ldr q27, [x20, #0x30]\n"
- "ldr q28, [x19, #0x0]\n"
- "ldr q29, [x19, #0x10]\n"
- "ldr q30, [x19, #0x20]\n"
- "ldr q31, [x19, #0x30]\n"
+ "ldr q8, [x17, #0x0]\n"
+ "ldr q9, [x17, #0x10]\n"
+ "ldr q10, [x17, #0x20]\n"
+ "ldr q11, [x17, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q29, [x20, #0x10]\n"
+ "ldr q30, [x20, #0x20]\n"
+ "ldr q31, [x20, #0x30]\n"
"b 183f\n"
"182:" // Height 6: no accumulate
"movi v8.4s, #0x0\n"
@@ -2858,260 +2857,260 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
"183:" // Height 6: setup done
- "mov x14, #0x0\n"
+ "mov x15, #0x0\n"
"184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w13, [x20, x14, LSL #0x2]\n"
+ "ldr w14, [x20, x15, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 185f\n"
- "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x12, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x14, 186f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x12, x12, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
- "add x22, x22, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x13, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x11, [x20, #0x10]\n"
+ "ldr x10, [x20, #0x18]\n"
+ "ldr x9, [x20, #0x20]\n"
+ "ldr x28, [x20, #0x28]\n"
+ "cbnz x15, 186f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "add x9, x9, x20\n"
+ "add x28, x28, x20\n"
"b 186f\n"
"185:" // Height 6: setup direct input
- "mov x12, %x[input_ptr]\n"
- "add x28, x12, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "add x20, x22, x19\n"
+ "mov x13, %x[input_ptr]\n"
+ "add x12, x13, x21\n"
+ "add x11, x12, x21\n"
+ "add x10, x11, x21\n"
+ "add x9, x10, x21\n"
+ "add x28, x9, x21\n"
"186:" // Height 6: input setup done
- "cmp x13, #0x10\n"
+ "cmp x14, #0x10\n"
"blt 189f\n"
- "ldr q0, [x12, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "cmp x13, #0x20\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q0, [x13, #0x0]\n"
+ "cmp x14, #0x20\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x11, #0x0]\n"
+ "ldr q3, [x10, #0x0]\n"
+ "ldr q4, [x9, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
"ldr q6, [x16, #0x0]\n"
+ "ldr q7, [x16, #0x10]\n"
"blt 188f\n"
"187:" // Height 6: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr d7, [x16, #0x10]\n"
+ "ldr x21, [x16, #0x28]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x18]\n"
+ "ldr x20, [x16, #0x38]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr x10, [x16, #0x28]\n"
+ "add x13, x13, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
"add x12, x12, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "mov v7.d[1], x11\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
"ldr d6, [x16, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "mov v6.d[1], x21\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr x11, [x16, #0x38]\n"
+ "ldr x21, [x16, #0x48]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr x9, [x12, #0x8]\n"
+ "add x10, x10, #0x10\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr x10, [x16, #0x48]\n"
+ "add x28, x28, #0x10\n"
".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
"ldr d7, [x16, #0x30]\n"
+ "mov v7.d[1], x20\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr x20, [x16, #0x58]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "mov v7.d[1], x11\n"
+ "ldr x27, [x13, #0x8]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr x11, [x16, #0x58]\n"
+ "ldr x26, [x12, #0x8]\n"
".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr x27, [x28, #0x8]\n"
+ "ldr x25, [x11, #0x8]\n"
".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
"ldr d6, [x16, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "mov v6.d[1], x21\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr x21, [x16, #0x68]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "mov v6.d[1], x10\n"
+ "ldr x24, [x10, #0x8]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr x10, [x16, #0x68]\n"
+ "ldr x23, [x9, #0x8]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr x25, [x26, #0x8]\n"
+ "ldr x22, [x28, #0x8]\n"
".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
"ldr d7, [x16, #0x50]\n"
+ "mov v7.d[1], x20\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "add x24, x24, #0x10\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr x20, [x16, #0x78]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
+ "sub x14, x14, #0x10\n"
".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr x11, [x16, #0x78]\n"
+ "cmp x14, #0x20\n"
".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr x23, [x24, #0x8]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
"ldr d6, [x16, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "add x22, x22, #0x10\n"
+ "mov v6.d[1], x21\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr x21, [x16, #0x88]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr x10, [x16, #0x88]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr x21, [x22, #0x8]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
"ldr d7, [x16, #0x70]\n"
+ "mov v7.d[1], x20\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "add x20, x20, #0x10\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr x20, [x16, #0x98]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "mov v7.d[1], x11\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr x11, [x16, #0x98]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr x19, [x20, #0x8]\n"
".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
"ldr d6, [x16, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "sub x13, x13, #0x10\n"
+ "mov v6.d[1], x21\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "cmp x13, #0x20\n"
+ "ldr x21, [x16, #0xa8]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "mov v6.d[1], x10\n"
".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr x10, [x16, #0xa8]\n"
".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
"ldr d7, [x16, #0x90]\n"
+ "mov v7.d[1], x20\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xb8]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr x11, [x16, #0xb8]\n"
".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
"ldr d6, [x16, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ "mov v6.d[1], x21\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xc8]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr x10, [x16, #0xc8]\n"
".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
"ldr d7, [x16, #0xb0]\n"
+ "mov v7.d[1], x20\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr x20, [x16, #0xd8]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- "mov v7.d[1], x11\n"
".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr x11, [x16, #0xd8]\n"
".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
"ldr d6, [x16, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ "mov v6.d[1], x21\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr x21, [x16, #0xe8]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "mov v6.d[1], x10\n"
".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr x10, [x16, #0xe8]\n"
".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
"ldr d7, [x16, #0xd0]\n"
+ "mov v7.d[1], x20\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr x20, [x16, #0xf8]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr x11, [x16, #0xf8]\n"
".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
"ldr d6, [x16, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ "mov v6.d[1], x21\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
"ldr d7, [x16, #0xf0]\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "mov v7.d[1], x20\n"
"add x16, x16, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr x21, [x16, #0x8]\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr x10, [x16, #0x8]\n"
+ "ldr x20, [x16, #0x18]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- "mov v7.d[1], x11\n"
".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
"ldr d6, [x16, #0x0]\n"
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr d0, [x12, #0x0]\n"
+ "ldr d0, [x13, #0x0]\n"
".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr d1, [x28, #0x0]\n"
+ "ldr d1, [x12, #0x0]\n"
".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "mov v6.d[1], x10\n"
+ "ldr d2, [x11, #0x0]\n"
".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- "mov v0.d[1], x9\n"
+ "ldr d3, [x10, #0x0]\n"
".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
- "mov v1.d[1], x27\n"
+ "ldr d4, [x9, #0x0]\n"
".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
- "ldr d2, [x26, #0x0]\n"
- "ldr d3, [x24, #0x0]\n"
- "ldr d4, [x22, #0x0]\n"
+ "ldr d5, [x28, #0x0]\n"
+ "ldr d7, [x16, #0x10]\n"
+ "mov v6.d[1], x21\n"
+ "mov v0.d[1], x27\n"
+ "mov v1.d[1], x26\n"
"mov v2.d[1], x25\n"
- "ldr d5, [x20, #0x0]\n"
- "mov v3.d[1], x23\n"
- "mov v4.d[1], x21\n"
- "mov v5.d[1], x19\n"
+ "mov v3.d[1], x24\n"
+ "mov v4.d[1], x23\n"
+ "mov v5.d[1], x22\n"
+ "mov v7.d[1], x20\n"
"bge 187b\n"
"188:" // Height 6: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x16, #0x10]\n"
+ "add x13, x13, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x13, x13, #0x10\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
"add x12, x12, #0x10\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x11, x11, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
"ldr q6, [x16, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "sub x14, x14, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x13, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
"ldr q7, [x16, #0x30]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "add x20, x20, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
"ldr q6, [x16, #0x40]\n"
@@ -3206,292 +3205,291 @@ void a64_hybrid_u8u32_dot_6x16_a55 (
".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
"189:" // Height 6: Multiply loop: Main loop skip
- "cbz x13, 194f\n"
- "cmp x13, #0x4\n"
+ "cbz x14, 194f\n"
+ "cmp x14, #0x4\n"
"blt 191f\n"
"190:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x12], #0x4\n"
- "sub x13, x13, #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "cmp x13, #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr s5, [x20], #0x4\n"
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr s7, [x13], #0x4\n"
+ "sub x14, x14, #0x4\n"
+ "ldr s6, [x12], #0x4\n"
+ "cmp x14, #0x4\n"
+ "ldr s5, [x11], #0x4\n"
+ "ldr s4, [x10], #0x4\n"
+ "ldr s3, [x9], #0x4\n"
+ "ldr s2, [x28], #0x4\n"
+ "ldr q1, [x16, #0x0]\n"
+ ".inst 0x6f87e028 // udot v8.4s, v1.16b, v7.4b[0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ ".inst 0x6f86e02c // udot v12.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x6f85e030 // udot v16.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x6f84e034 // udot v20.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x6f83e038 // udot v24.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x6f82e03c // udot v28.4s, v1.16b, v2.4b[0]\n"
+ "ldr q1, [x16, #0x20]\n"
+ ".inst 0x6f87e009 // udot v9.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x6f86e00d // udot v13.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x6f85e011 // udot v17.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x6f84e015 // udot v21.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x6f83e019 // udot v25.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x6f82e01d // udot v29.4s, v0.16b, v2.4b[0]\n"
+ "ldr q0, [x16, #0x30]\n"
+ ".inst 0x6f87e02a // udot v10.4s, v1.16b, v7.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6f86e02e // udot v14.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x6f85e032 // udot v18.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x6f84e036 // udot v22.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x6f83e03a // udot v26.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x6f82e03e // udot v30.4s, v1.16b, v2.4b[0]\n"
+ ".inst 0x6f87e00b // udot v11.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x6f86e00f // udot v15.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x6f85e013 // udot v19.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x6f84e017 // udot v23.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x6f83e01b // udot v27.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x6f82e01f // udot v31.4s, v0.16b, v2.4b[0]\n"
"bge 190b\n"
- "cbz x13, 194f\n"
"191:" // Height 6: Multiply loop: Skip odd blocks
- "tbz x13, #1, 192f\n"
- "ldr h0, [x12], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "ldr h5, [x20], #0x2\n"
- "tbz x13, #0, 193f\n"
- "ld1 { v0.b }[2], [x12]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x22]\n"
- "ld1 { v5.b }[2], [x20]\n"
+ "cbz x14, 194f\n"
+ "tbz x14, #1, 192f\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h1, [x12], #0x2\n"
+ "ldr h2, [x11], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "ldr h4, [x9], #0x2\n"
+ "ldr h5, [x28], #0x2\n"
+ "tbz x14, #0, 193f\n"
+ "ld1 { v0.b }[2], [x13]\n"
+ "ld1 { v1.b }[2], [x12]\n"
+ "ld1 { v2.b }[2], [x11]\n"
+ "ld1 { v3.b }[2], [x10]\n"
+ "ld1 { v4.b }[2], [x9]\n"
+ "ld1 { v5.b }[2], [x28]\n"
"b 193f\n"
"192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x12, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
- "ldr b5, [x20, #0x0]\n"
+ "ldr b0, [x13, #0x0]\n"
+ "ldr b1, [x12, #0x0]\n"
+ "ldr b2, [x11, #0x0]\n"
+ "ldr b3, [x10, #0x0]\n"
+ "ldr b4, [x9, #0x0]\n"
+ "ldr b5, [x28, #0x0]\n"
"193:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x16, #0x0]\n"
- "ldr q7, [x16, #0x10]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x16, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x16, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x16, #0x0]\n"
+ ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x16, #0x10]\n"
+ ".inst 0x6f81e0ec // udot v12.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f0 // udot v16.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f4 // udot v20.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f8 // udot v24.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fc // udot v28.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x16, #0x20]\n"
+ ".inst 0x6f80e0c9 // udot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cd // udot v13.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d1 // udot v17.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d5 // udot v21.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d9 // udot v25.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0dd // udot v29.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x16, #0x30]\n"
+ ".inst 0x6f80e0ea // udot v10.4s, v7.16b, v0.4b[0]\n"
"add x16, x16, #0x40\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6f81e0ee // udot v14.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f2 // udot v18.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f6 // udot v22.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fa // udot v26.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fe // udot v30.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0cb // udot v11.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cf // udot v15.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d3 // udot v19.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d7 // udot v23.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0db // udot v27.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0df // udot v31.4s, v6.16b, v5.4b[0]\n"
"194:" // Height 6: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x15, x15, #0x1\n"
+ "cmp x15, x20\n"
"bne 184b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x15, #0x0]\n"
- "cmp x17, #0x10\n"
- "add x23, x15, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x17, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "cmp x8, #0x10\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19, LSL #2\n"
"prfm pstl1keep, [x20, #0x0]\n"
- "add x19, x20, x19, LSL #2\n"
- "prfm pstl1keep, [x19, #0x0]\n"
"bge 203f\n"
- "tbz x17, #3, 198f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v9.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v21.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "st1 { v25.4s }, [x20], #0x10\n"
- "st1 { v28.4s }, [x19], #0x10\n"
- "st1 { v29.4s }, [x19], #0x10\n"
- "tbz x17, #2, 196f\n"
- "st1 { v10.4s }, [x15], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x21], #0x10\n"
- "st1 { v26.4s }, [x20], #0x10\n"
- "st1 { v30.4s }, [x19], #0x10\n"
- "tbz x17, #1, 195f\n"
- "str d11, [x15], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "str d27, [x20], #0x8\n"
- "str d31, [x19], #0x8\n"
- "tbz x17, #0, 202f\n"
- "st1 { v11.s }[2], [x15]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
- "st1 { v27.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x19]\n"
+ "tbz x8, #3, 198f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v9.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "st1 { v29.4s }, [x20], #0x10\n"
+ "tbz x8, #2, 196f\n"
+ "st1 { v10.4s }, [x17], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v30.4s }, [x20], #0x10\n"
+ "tbz x8, #1, 195f\n"
+ "str d11, [x17], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "str d31, [x20], #0x8\n"
+ "tbz x8, #0, 202f\n"
+ "st1 { v11.s }[2], [x17]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 202f\n"
"195:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x17, #0, 202f\n"
- "str s11, [x15, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
- "str s27, [x20, #0x0]\n"
- "str s31, [x19, #0x0]\n"
+ "tbz x8, #0, 202f\n"
+ "str s11, [x17, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
"b 202f\n"
"196:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x17, #1, 197f\n"
- "str d10, [x15], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d22, [x21], #0x8\n"
- "str d26, [x20], #0x8\n"
- "str d30, [x19], #0x8\n"
- "tbz x17, #0, 202f\n"
- "st1 { v10.s }[2], [x15]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v22.s }[2], [x21]\n"
- "st1 { v26.s }[2], [x20]\n"
- "st1 { v30.s }[2], [x19]\n"
+ "tbz x8, #1, 197f\n"
+ "str d10, [x17], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "str d30, [x20], #0x8\n"
+ "tbz x8, #0, 202f\n"
+ "st1 { v10.s }[2], [x17]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "st1 { v30.s }[2], [x20]\n"
"b 202f\n"
"197:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x17, #0, 202f\n"
- "str s10, [x15, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s22, [x21, #0x0]\n"
- "str s26, [x20, #0x0]\n"
- "str s30, [x19, #0x0]\n"
+ "tbz x8, #0, 202f\n"
+ "str s10, [x17, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "str s30, [x20, #0x0]\n"
"b 202f\n"
"198:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x17, #2, 200f\n"
- "st1 { v8.4s }, [x15], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "st1 { v28.4s }, [x19], #0x10\n"
- "tbz x17, #1, 199f\n"
- "str d9, [x15], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d21, [x21], #0x8\n"
- "str d25, [x20], #0x8\n"
- "str d29, [x19], #0x8\n"
- "tbz x17, #0, 202f\n"
- "st1 { v9.s }[2], [x15]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v21.s }[2], [x21]\n"
- "st1 { v25.s }[2], [x20]\n"
- "st1 { v29.s }[2], [x19]\n"
+ "tbz x8, #2, 200f\n"
+ "st1 { v8.4s }, [x17], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "tbz x8, #1, 199f\n"
+ "str d9, [x17], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "str d29, [x20], #0x8\n"
+ "tbz x8, #0, 202f\n"
+ "st1 { v9.s }[2], [x17]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "st1 { v29.s }[2], [x20]\n"
"b 202f\n"
"199:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x17, #0, 202f\n"
- "str s9, [x15, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s21, [x21, #0x0]\n"
- "str s25, [x20, #0x0]\n"
- "str s29, [x19, #0x0]\n"
+ "tbz x8, #0, 202f\n"
+ "str s9, [x17, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "str s29, [x20, #0x0]\n"
"b 202f\n"
"200:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x17, #1, 201f\n"
- "str d8, [x15], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "str d28, [x19], #0x8\n"
- "tbz x17, #0, 202f\n"
- "st1 { v8.s }[2], [x15]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v20.s }[2], [x21]\n"
- "st1 { v24.s }[2], [x20]\n"
- "st1 { v28.s }[2], [x19]\n"
+ "tbz x8, #1, 201f\n"
+ "str d8, [x17], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "str d28, [x20], #0x8\n"
+ "tbz x8, #0, 202f\n"
+ "st1 { v8.s }[2], [x17]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "st1 { v28.s }[2], [x20]\n"
"b 202f\n"
"201:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x15, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s20, [x21, #0x0]\n"
- "str s24, [x20, #0x0]\n"
- "str s28, [x19, #0x0]\n"
+ "str s8, [x17, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "str s28, [x20, #0x0]\n"
"202:" // Height 6: Partial direct writeback: Done
"b 204f\n"
"203:" // Height 6: Full writeback
- "str q8, [x15, #0x0]\n"
- "str q9, [x15, #0x10]\n"
- "str q10, [x15, #0x20]\n"
- "str q11, [x15, #0x30]\n"
- "add x15, x15, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q20, [x21, #0x0]\n"
- "str q21, [x21, #0x10]\n"
- "str q22, [x21, #0x20]\n"
- "str q23, [x21, #0x30]\n"
- "str q24, [x20, #0x0]\n"
- "str q25, [x20, #0x10]\n"
- "str q26, [x20, #0x20]\n"
- "str q27, [x20, #0x30]\n"
- "str q28, [x19, #0x0]\n"
- "str q29, [x19, #0x10]\n"
- "str q30, [x19, #0x20]\n"
- "str q31, [x19, #0x30]\n"
+ "str q8, [x17, #0x0]\n"
+ "str q9, [x17, #0x10]\n"
+ "str q10, [x17, #0x20]\n"
+ "str q11, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "str q28, [x20, #0x0]\n"
+ "str q29, [x20, #0x10]\n"
+ "str q30, [x20, #0x20]\n"
+ "str q31, [x20, #0x30]\n"
"204:" // Height 6: Writeback done
- "subs x17, x17, #0x10\n"
+ "subs x8, x8, #0x10\n"
"bgt 172b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 206f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 205f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"205:" // Update direct input
- "mov x19, #0x6\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"206:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
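
For readers skimming the register-renumbering hunks above: the arithmetic is unchanged by this patch. Every ".inst ... // udot" line encodes a UDOT instruction (unsigned 8-bit dot product, ARMv8.2 DotProd extension) that multiplies four unsigned bytes per 32-bit lane and accumulates into the uint32 accumulators v8-v31, which is where the "u8u32" in the kernel name comes from. A minimal scalar sketch of one such instruction follows; the helper name udot_lane is hypothetical and not part of this patch or the library:

#include <cstdint>
#include <cstddef>

// Scalar model of "udot vD.4s, vN.16b, vM.4b[idx]": for each of the four
// 32-bit lanes of the accumulator, take the dot product of the matching
// 4-byte group of the .16b operand with the idx-th 4-byte group of the
// indexed operand, and add it into that lane.
static inline void udot_lane(uint32_t acc[4], const uint8_t n[16],
                             const uint8_t m[16], std::size_t idx)
{
    for (std::size_t lane = 0; lane < 4; ++lane)
        for (std::size_t k = 0; k < 4; ++k)
            acc[lane] += static_cast<uint32_t>(n[4 * lane + k]) *
                         static_cast<uint32_t>(m[4 * idx + k]);
}

Each UDOT adds at most 4 * 255 * 255 to a lane, so the 32-bit accumulators absorb the products of the u8 inputs without any intermediate widening steps in the inner loop.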
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
index ba57ad493a..849c680843 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -77,7 +77,6 @@ void a64_hybrid_u8u32_dot_6x16 (
ka.N = N;
ka.B_ptr = B_ptr;
__asm__ __volatile__(
-
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 171f\n"
@@ -87,73 +86,73 @@ void a64_hybrid_u8u32_dot_6x16 (
"cmp %x[M], #0x2\n"
"bgt 69f\n"
"beq 35f\n"
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
"tbz %x[flags], #0, 12f\n"
- "cmp x10, #0x10\n"
+ "cmp x11, #0x10\n"
"bge 11f\n"
- "tbz x10, #3, 6f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "tbz x10, #2, 4f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "tbz x10, #1, 3f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "tbz x10, #0, 10f\n"
- "ld1 { v11.s }[2], [x28]\n"
+ "tbz x11, #3, 6f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "tbz x11, #2, 4f\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 3f\n"
+ "ldr d11, [x9], #0x8\n"
+ "mov x25, #0x38\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v11.s }[2], [x9]\n"
"b 10f\n"
"3:" // Height 1: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x10, #0, 10f\n"
- "ldr s11, [x28, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 10f\n"
+ "ldr s11, [x9, #0x0]\n"
"b 10f\n"
"4:" // Height 1: Partial accumulate: partial_2_8
- "tbz x10, #1, 5f\n"
- "ldr d10, [x28], #0x8\n"
- "mov x24, #0x28\n"
- "tbz x10, #0, 10f\n"
- "ld1 { v10.s }[2], [x28]\n"
+ "tbz x11, #1, 5f\n"
+ "ldr d10, [x9], #0x8\n"
+ "mov x25, #0x28\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v10.s }[2], [x9]\n"
"b 10f\n"
"5:" // Height 1: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x10, #0, 10f\n"
- "ldr s10, [x28, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 10f\n"
+ "ldr s10, [x9, #0x0]\n"
"b 10f\n"
"6:" // Height 1: Partial accumulate: partial_4_0
- "tbz x10, #2, 8f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "tbz x10, #1, 7f\n"
- "ldr d9, [x28], #0x8\n"
- "mov x24, #0x18\n"
- "tbz x10, #0, 10f\n"
- "ld1 { v9.s }[2], [x28]\n"
+ "tbz x11, #2, 8f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 7f\n"
+ "ldr d9, [x9], #0x8\n"
+ "mov x25, #0x18\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v9.s }[2], [x9]\n"
"b 10f\n"
"7:" // Height 1: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x10, #0, 10f\n"
- "ldr s9, [x28, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 10f\n"
+ "ldr s9, [x9, #0x0]\n"
"b 10f\n"
"8:" // Height 1: Partial accumulate: partial_2_0
- "tbz x10, #1, 9f\n"
- "ldr d8, [x28], #0x8\n"
- "mov x24, #0x8\n"
- "tbz x10, #0, 10f\n"
- "ld1 { v8.s }[2], [x28]\n"
+ "tbz x11, #1, 9f\n"
+ "ldr d8, [x9], #0x8\n"
+ "mov x25, #0x8\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v8.s }[2], [x9]\n"
"b 10f\n"
"9:" // Height 1: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x24, #0x0\n"
+ "ldr s8, [x9, #0x0]\n"
+ "mov x25, #0x0\n"
"10:" // Height 1: Partial accumulate: Done
- "sub x28, x28, x24\n"
+ "sub x9, x9, x25\n"
"b 13f\n"
"11:" // Height 1: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
"b 13f\n"
"12:" // Height 1: no accumulate
"movi v8.4s, #0x0\n"
@@ -161,295 +160,295 @@ void a64_hybrid_u8u32_dot_6x16 (
"movi v10.4s, #0x0\n"
"movi v11.4s, #0x0\n"
"13:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 15f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 16f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 16f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
"b 16f\n"
"15:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x26, %x[input_ptr]\n"
"16:" // Height 1: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 19f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q6, [x9, #0x0]\n"
- "cmp x26, #0x20\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 18f\n"
"17:" // Height 1: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "add x25, x25, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "sub x26, x26, #0x10\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "cmp x26, #0x20\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "add x10, x10, #0x100\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"bge 17b\n"
"18:" // Height 1: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "sub x26, x26, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x100\n"
"19:" // Height 1: Multiply loop: Main loop skip
- "cbz x26, 24f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 24f\n"
+ "cmp x27, #0x4\n"
"blt 21f\n"
"20:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "cmp x26, #0x4\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr q16, [x10, #0x0]\n"
+ ".inst 0x6f92e208 // udot v8.4s, v16.16b, v18.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ "ldr q16, [x10, #0x10]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6f92e209 // udot v9.4s, v16.16b, v18.4b[0]\n"
+ "cmp x27, #0x4\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x6f92e22a // udot v10.4s, v17.16b, v18.4b[0]\n"
+ ".inst 0x6f92e20b // udot v11.4s, v16.16b, v18.4b[0]\n"
+ "add x10, x10, #0x40\n"
"bge 20b\n"
- "cbz x26, 24f\n"
"21:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x26, #1, 22f\n"
- "ldr h0, [x25], #0x2\n"
- "tbz x26, #0, 23f\n"
- "ld1 { v0.b }[2], [x25]\n"
+ "cbz x27, 24f\n"
+ "tbz x27, #1, 22f\n"
+ "ldr h0, [x26], #0x2\n"
+ "tbz x27, #0, 23f\n"
+ "ld1 { v0.b }[2], [x26]\n"
"b 23f\n"
"22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
"23:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ "add x10, x10, #0x40\n"
"24:" // Height 1: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 14b\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "cmp x10, #0x10\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
"bge 33f\n"
- "tbz x10, #3, 28f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "tbz x10, #2, 26f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "tbz x10, #1, 25f\n"
- "str d11, [x28], #0x8\n"
- "tbz x10, #0, 32f\n"
- "st1 { v11.s }[2], [x28]\n"
+ "tbz x11, #3, 28f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "tbz x11, #2, 26f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 25f\n"
+ "str d11, [x9], #0x8\n"
+ "tbz x11, #0, 32f\n"
+ "st1 { v11.s }[2], [x9]\n"
"b 32f\n"
"25:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x10, #0, 32f\n"
- "str s11, [x28, #0x0]\n"
+ "tbz x11, #0, 32f\n"
+ "str s11, [x9, #0x0]\n"
"b 32f\n"
"26:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x10, #1, 27f\n"
- "str d10, [x28], #0x8\n"
- "tbz x10, #0, 32f\n"
- "st1 { v10.s }[2], [x28]\n"
+ "tbz x11, #1, 27f\n"
+ "str d10, [x9], #0x8\n"
+ "tbz x11, #0, 32f\n"
+ "st1 { v10.s }[2], [x9]\n"
"b 32f\n"
"27:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x10, #0, 32f\n"
- "str s10, [x28, #0x0]\n"
+ "tbz x11, #0, 32f\n"
+ "str s10, [x9, #0x0]\n"
"b 32f\n"
"28:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x10, #2, 30f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "tbz x10, #1, 29f\n"
- "str d9, [x28], #0x8\n"
- "tbz x10, #0, 32f\n"
- "st1 { v9.s }[2], [x28]\n"
+ "tbz x11, #2, 30f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 29f\n"
+ "str d9, [x9], #0x8\n"
+ "tbz x11, #0, 32f\n"
+ "st1 { v9.s }[2], [x9]\n"
"b 32f\n"
"29:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x10, #0, 32f\n"
- "str s9, [x28, #0x0]\n"
+ "tbz x11, #0, 32f\n"
+ "str s9, [x9, #0x0]\n"
"b 32f\n"
"30:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x10, #1, 31f\n"
- "str d8, [x28], #0x8\n"
- "tbz x10, #0, 32f\n"
- "st1 { v8.s }[2], [x28]\n"
+ "tbz x11, #1, 31f\n"
+ "str d8, [x9], #0x8\n"
+ "tbz x11, #0, 32f\n"
+ "st1 { v8.s }[2], [x9]\n"
"b 32f\n"
"31:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
"32:" // Height 1: Partial direct writeback: Done
"b 34f\n"
"33:" // Height 1: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"34:" // Height 1: Writeback done
- "subs x10, x10, #0x10\n"
+ "subs x11, x11, #0x10\n"
"bgt 2b\n"
"b 206f\n"
"35:" // Height 2
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"36:" // Height 2: Column loop
"tbz %x[flags], #0, 46f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x9, x20, LSL #2\n"
"bge 45f\n"
- "tbz x10, #3, 40f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "tbz x10, #2, 38f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "tbz x10, #1, 37f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x23], #0x8\n"
- "tbz x10, #0, 44f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x23]\n"
+ "tbz x11, #3, 40f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 38f\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 37f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "tbz x11, #0, 44f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
"b 44f\n"
"37:" // Height 2: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x10, #0, 44f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 44f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
"b 44f\n"
"38:" // Height 2: Partial accumulate: partial_2_8
- "tbz x10, #1, 39f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "tbz x10, #0, 44f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x23]\n"
+ "tbz x11, #1, 39f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "tbz x11, #0, 44f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
"b 44f\n"
"39:" // Height 2: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x10, #0, 44f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 44f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
"b 44f\n"
"40:" // Height 2: Partial accumulate: partial_4_0
- "tbz x10, #2, 42f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "tbz x10, #1, 41f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "tbz x10, #0, 44f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x23]\n"
+ "tbz x11, #2, 42f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 41f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "tbz x11, #0, 44f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
"b 44f\n"
"41:" // Height 2: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x10, #0, 44f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 44f\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
"b 44f\n"
"42:" // Height 2: Partial accumulate: partial_2_0
- "tbz x10, #1, 43f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "tbz x10, #0, 44f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x23]\n"
+ "tbz x11, #1, 43f\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "tbz x11, #0, 44f\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
"b 44f\n"
"43:" // Height 2: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
"44:" // Height 2: Partial accumulate: Done
- "sub x28, x28, x24\n"
+ "sub x9, x9, x25\n"
"b 47f\n"
"45:" // Height 2: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
"b 47f\n"
"46:" // Height 2: no accumulate
"movi v8.4s, #0x0\n"
@@ -461,392 +460,392 @@ void a64_hybrid_u8u32_dot_6x16 (
"movi v14.4s, #0x0\n"
"movi v15.4s, #0x0\n"
"47:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 49f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x27, 50f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 50f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
"b 50f\n"
"49:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
"50:" // Height 2: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 53f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 52f\n"
"51:" // Height 2: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x24, x24, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
+ "sub x27, x27, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sub x26, x26, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "cmp x26, #0x20\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 51b\n"
"52:" // Height 2: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x25, x25, #0x10\n"
+ "ldr q17, [x10, #0x20]\n"
+ "add x26, x26, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q16, [x10, #0x30]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x40]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x10, #0x50]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x70]\n"
+ ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
+ "ldr q16, [x10, #0x90]\n"
+ ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xa0]\n"
+ ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xb0]\n"
+ ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
+ "ldr q17, [x10, #0xc0]\n"
+ ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
+ "ldr q16, [x10, #0xd0]\n"
+ ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
+ "ldr q17, [x10, #0xe0]\n"
+ ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
+ "ldr q16, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
+ ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
"53:" // Height 2: Multiply loop: Main loop skip
- "cbz x26, 58f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 58f\n"
+ "cmp x27, #0x4\n"
"blt 55f\n"
"54:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr s19, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x6f93e228 // udot v8.4s, v17.16b, v19.4b[0]\n"
+ ".inst 0x6f92e22c // udot v12.4s, v17.16b, v18.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6f93e209 // udot v9.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x6f92e20d // udot v13.4s, v16.16b, v18.4b[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x6f93e22a // udot v10.4s, v17.16b, v19.4b[0]\n"
+ ".inst 0x6f92e22e // udot v14.4s, v17.16b, v18.4b[0]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6f93e20b // udot v11.4s, v16.16b, v19.4b[0]\n"
+ ".inst 0x6f92e20f // udot v15.4s, v16.16b, v18.4b[0]\n"
"bge 54b\n"
- "cbz x26, 58f\n"
"55:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x26, #1, 56f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "tbz x26, #0, 57f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
+ "cbz x27, 58f\n"
+ "tbz x27, #1, 56f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "tbz x27, #0, 57f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
"b 57f\n"
"56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
"57:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q16, [x10, #0x10]\n"
+ ".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x6f81e22c // udot v12.4s, v17.16b, v1.4b[0]\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e20d // udot v13.4s, v16.16b, v1.4b[0]\n"
+ "ldr q16, [x10, #0x30]\n"
+ ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
+ ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
"58:" // Height 2: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 48b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"bge 67f\n"
- "tbz x10, #3, 62f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "tbz x10, #2, 60f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "tbz x10, #1, 59f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x23], #0x8\n"
- "tbz x10, #0, 66f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x23]\n"
+ "tbz x11, #3, 62f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 60f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 59f\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "tbz x11, #0, 66f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x24]\n"
"b 66f\n"
"59:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x10, #0, 66f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x23, #0x0]\n"
+ "tbz x11, #0, 66f\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
"b 66f\n"
"60:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x10, #1, 61f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x23], #0x8\n"
- "tbz x10, #0, 66f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x23]\n"
+ "tbz x11, #1, 61f\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "tbz x11, #0, 66f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x24]\n"
"b 66f\n"
"61:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x10, #0, 66f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x23, #0x0]\n"
+ "tbz x11, #0, 66f\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
"b 66f\n"
"62:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x10, #2, 64f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "tbz x10, #1, 63f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x23], #0x8\n"
- "tbz x10, #0, 66f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x23]\n"
+ "tbz x11, #2, 64f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 63f\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "tbz x11, #0, 66f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x24]\n"
"b 66f\n"
"63:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x10, #0, 66f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x23, #0x0]\n"
+ "tbz x11, #0, 66f\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
"b 66f\n"
"64:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x10, #1, 65f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x23], #0x8\n"
- "tbz x10, #0, 66f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x23]\n"
+ "tbz x11, #1, 65f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "tbz x11, #0, 66f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x24]\n"
"b 66f\n"
"65:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x23, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
"66:" // Height 2: Partial direct writeback: Done
"b 68f\n"
"67:" // Height 2: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
"68:" // Height 2: Writeback done
- "subs x10, x10, #0x10\n"
+ "subs x11, x11, #0x10\n"
"bgt 36b\n"
"b 206f\n"
"69:" // Height 3
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"70:" // Height 3: Column loop
"tbz %x[flags], #0, 80f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
"bge 79f\n"
- "tbz x10, #3, 74f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "tbz x10, #2, 72f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "tbz x10, #1, 71f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "tbz x10, #0, 78f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
+ "tbz x11, #3, 74f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 72f\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 71f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x11, #0, 78f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
"b 78f\n"
"71:" // Height 3: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x10, #0, 78f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 78f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
"b 78f\n"
"72:" // Height 3: Partial accumulate: partial_2_8
- "tbz x10, #1, 73f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "tbz x10, #0, 78f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
+ "tbz x11, #1, 73f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d18, [x23], #0x8\n"
+ "tbz x11, #0, 78f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
"b 78f\n"
"73:" // Height 3: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x10, #0, 78f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 78f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
"b 78f\n"
"74:" // Height 3: Partial accumulate: partial_4_0
- "tbz x10, #2, 76f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "tbz x10, #1, 75f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "tbz x10, #0, 78f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
+ "tbz x11, #2, 76f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 75f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d17, [x23], #0x8\n"
+ "tbz x11, #0, 78f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
"b 78f\n"
"75:" // Height 3: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x10, #0, 78f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 78f\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
"b 78f\n"
"76:" // Height 3: Partial accumulate: partial_2_0
- "tbz x10, #1, 77f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "tbz x10, #0, 78f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
+ "tbz x11, #1, 77f\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "tbz x11, #0, 78f\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
"b 78f\n"
"77:" // Height 3: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s16, [x23, #0x0]\n"
"78:" // Height 3: Partial accumulate: Done
- "sub x28, x28, x24\n"
+ "sub x9, x9, x25\n"
"b 81f\n"
"79:" // Height 3: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
"b 81f\n"
"80:" // Height 3: no accumulate
"movi v8.4s, #0x0\n"
@@ -862,488 +861,488 @@ void a64_hybrid_u8u32_dot_6x16 (
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
"81:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 83f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "cbnz x27, 84f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 84f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
"b 84f\n"
"83:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"84:" // Height 3: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 87f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 86f\n"
"85:" // Height 3: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "ldr q21, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "cmp x26, #0x20\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr q2, [x23, #0x0]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 85b\n"
"86:" // Height 3: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
+ "ldr q21, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x40]\n"
+ ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x10, #0x50]\n"
+ ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x60]\n"
+ ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x70]\n"
+ ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
+ "ldr q21, [x10, #0x80]\n"
+ ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
+ "ldr q20, [x10, #0x90]\n"
+ ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xa0]\n"
+ ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xb0]\n"
+ ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
+ ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
+ ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
+ "ldr q21, [x10, #0xc0]\n"
+ ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
+ "ldr q20, [x10, #0xd0]\n"
+ ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
+ "ldr q21, [x10, #0xe0]\n"
+ ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
+ "ldr q20, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n"
"87:" // Height 3: Multiply loop: Main loop skip
- "cbz x26, 92f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 92f\n"
+ "cmp x27, #0x4\n"
"blt 89f\n"
"88:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr q21, [x10, #0x0]\n"
+ ".inst 0x6f98e2a8 // udot v8.4s, v21.16b, v24.4b[0]\n"
+ ".inst 0x6f97e2ac // udot v12.4s, v21.16b, v23.4b[0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ ".inst 0x6f96e2b0 // udot v16.4s, v21.16b, v22.4b[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ ".inst 0x6f98e289 // udot v9.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x6f97e28d // udot v13.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x6f96e291 // udot v17.4s, v20.16b, v22.4b[0]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6f98e2aa // udot v10.4s, v21.16b, v24.4b[0]\n"
+ ".inst 0x6f97e2ae // udot v14.4s, v21.16b, v23.4b[0]\n"
+ ".inst 0x6f96e2b2 // udot v18.4s, v21.16b, v22.4b[0]\n"
+ ".inst 0x6f98e28b // udot v11.4s, v20.16b, v24.4b[0]\n"
+ ".inst 0x6f97e28f // udot v15.4s, v20.16b, v23.4b[0]\n"
+ ".inst 0x6f96e293 // udot v19.4s, v20.16b, v22.4b[0]\n"
"bge 88b\n"
- "cbz x26, 92f\n"
"89:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x26, #1, 90f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "tbz x26, #0, 91f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
+ "cbz x27, 92f\n"
+ "tbz x27, #1, 90f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "tbz x27, #0, 91f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
"b 91f\n"
"90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
"91:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ ".inst 0x6f80e2a8 // udot v8.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f81e2ac // udot v12.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x6f82e2b0 // udot v16.4s, v21.16b, v2.4b[0]\n"
+ "ldr q21, [x10, #0x20]\n"
+ ".inst 0x6f80e289 // udot v9.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x6f81e28d // udot v13.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e291 // udot v17.4s, v20.16b, v2.4b[0]\n"
+ "ldr q20, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
+ ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
+ ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
+ ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
+ ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
"92:" // Height 3: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 82b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
- "prfm pstl1keep, [x22, #0x0]\n"
"bge 101f\n"
- "tbz x10, #3, 96f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "tbz x10, #2, 94f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "tbz x10, #1, 93f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "tbz x10, #0, 100f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
+ "tbz x11, #3, 96f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 94f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 93f\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x11, #0, 100f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
"b 100f\n"
"93:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x10, #0, 100f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
+ "tbz x11, #0, 100f\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
"b 100f\n"
"94:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x10, #1, 95f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "tbz x10, #0, 100f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
+ "tbz x11, #1, 95f\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x11, #0, 100f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
"b 100f\n"
"95:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x10, #0, 100f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
+ "tbz x11, #0, 100f\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
"b 100f\n"
"96:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x10, #2, 98f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "tbz x10, #1, 97f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "tbz x10, #0, 100f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
+ "tbz x11, #2, 98f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 97f\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x11, #0, 100f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
"b 100f\n"
"97:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x10, #0, 100f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
+ "tbz x11, #0, 100f\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
"b 100f\n"
"98:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x10, #1, 99f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "tbz x10, #0, 100f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
+ "tbz x11, #1, 99f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x11, #0, 100f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
"b 100f\n"
"99:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
"100:" // Height 3: Partial direct writeback: Done
"b 102f\n"
"101:" // Height 3: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
"102:" // Height 3: Writeback done
- "subs x10, x10, #0x10\n"
+ "subs x11, x11, #0x10\n"
"bgt 70b\n"
"b 206f\n"
"103:" // Height 4
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"104:" // Height 4: Column loop
"tbz %x[flags], #0, 114f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
"bge 113f\n"
- "tbz x10, #3, 108f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "ld1 { v21.4s }, [x21], #0x10\n"
- "tbz x10, #2, 106f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "ld1 { v22.4s }, [x21], #0x10\n"
- "tbz x10, #1, 105f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "tbz x10, #0, 112f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
- "ld1 { v23.s }[2], [x21]\n"
+ "tbz x11, #3, 108f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 106f\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 105f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x11, #0, 112f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
"b 112f\n"
"105:" // Height 4: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x10, #0, 112f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
- "ldr s23, [x21, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 112f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
"b 112f\n"
"106:" // Height 4: Partial accumulate: partial_2_8
- "tbz x10, #1, 107f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
- "tbz x10, #0, 112f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
- "ld1 { v22.s }[2], [x21]\n"
+ "tbz x11, #1, 107f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x11, #0, 112f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
"b 112f\n"
"107:" // Height 4: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x10, #0, 112f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
- "ldr s22, [x21, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 112f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
"b 112f\n"
"108:" // Height 4: Partial accumulate: partial_4_0
- "tbz x10, #2, 110f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "tbz x10, #1, 109f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d21, [x21], #0x8\n"
- "tbz x10, #0, 112f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
- "ld1 { v21.s }[2], [x21]\n"
+ "tbz x11, #2, 110f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 109f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x11, #0, 112f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
"b 112f\n"
"109:" // Height 4: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x10, #0, 112f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
- "ldr s21, [x21, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 112f\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
"b 112f\n"
"110:" // Height 4: Partial accumulate: partial_2_0
- "tbz x10, #1, 111f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "tbz x10, #0, 112f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
+ "tbz x11, #1, 111f\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x11, #0, 112f\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
"b 112f\n"
"111:" // Height 4: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
"112:" // Height 4: Partial accumulate: Done
- "sub x28, x28, x24\n"
+ "sub x9, x9, x25\n"
"b 115f\n"
"113:" // Height 4: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
- "ldr q20, [x21, #0x0]\n"
- "ldr q21, [x21, #0x10]\n"
- "ldr q22, [x21, #0x20]\n"
- "ldr q23, [x21, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
"b 115f\n"
"114:" // Height 4: no accumulate
"movi v8.4s, #0x0\n"
@@ -1363,584 +1362,584 @@ void a64_hybrid_u8u32_dot_6x16 (
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
"115:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 117f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "cbnz x27, 118f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 118f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
"b 118f\n"
"117:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"118:" // Height 4: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 121f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 120f\n"
"119:" // Height 4: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x22, x22, #0x10\n"
+ "ldr q25, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x26, x26, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x26, #0x20\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr q2, [x23, #0x0]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr q3, [x22, #0x0]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 119b\n"
"120:" // Height 4: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
+ "ldr q25, [x10, #0x20]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x22, x22, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x10, #0x50]\n"
+ ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x60]\n"
+ ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x70]\n"
+ ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
+ "ldr q25, [x10, #0x80]\n"
+ ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
+ "ldr q24, [x10, #0x90]\n"
+ ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xa0]\n"
+ ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xb0]\n"
+ ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
+ "ldr q25, [x10, #0xc0]\n"
+ ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
+ "ldr q24, [x10, #0xd0]\n"
+ ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
+ "ldr q25, [x10, #0xe0]\n"
+ ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
+ "ldr q24, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n"
"121:" // Height 4: Multiply loop: Main loop skip
- "cbz x26, 126f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 126f\n"
+ "cmp x27, #0x4\n"
"blt 123f\n"
"122:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ ".inst 0x6f9de328 // udot v8.4s, v25.16b, v29.4b[0]\n"
+ ".inst 0x6f9ce32c // udot v12.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x6f9be330 // udot v16.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae334 // udot v20.4s, v25.16b, v26.4b[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ ".inst 0x6f9de309 // udot v9.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x6f9ce30d // udot v13.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x6f9be311 // udot v17.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae315 // udot v21.4s, v24.16b, v26.4b[0]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6f9de32a // udot v10.4s, v25.16b, v29.4b[0]\n"
+ ".inst 0x6f9ce32e // udot v14.4s, v25.16b, v28.4b[0]\n"
+ ".inst 0x6f9be332 // udot v18.4s, v25.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae336 // udot v22.4s, v25.16b, v26.4b[0]\n"
+ ".inst 0x6f9de30b // udot v11.4s, v24.16b, v29.4b[0]\n"
+ ".inst 0x6f9ce30f // udot v15.4s, v24.16b, v28.4b[0]\n"
+ ".inst 0x6f9be313 // udot v19.4s, v24.16b, v27.4b[0]\n"
+ ".inst 0x6f9ae317 // udot v23.4s, v24.16b, v26.4b[0]\n"
"bge 122b\n"
- "cbz x26, 126f\n"
"123:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x26, #1, 124f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "tbz x26, #0, 125f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
- "ld1 { v3.b }[2], [x22]\n"
+ "cbz x27, 126f\n"
+ "tbz x27, #1, 124f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "tbz x27, #0, 125f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
"b 125f\n"
"124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
- "ldr b3, [x22, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
"125:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q24, [x10, #0x10]\n"
+ ".inst 0x6f80e328 // udot v8.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e32c // udot v12.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f82e330 // udot v16.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x6f83e334 // udot v20.4s, v25.16b, v3.4b[0]\n"
+ "ldr q25, [x10, #0x20]\n"
+ ".inst 0x6f80e309 // udot v9.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30d // udot v13.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x6f82e311 // udot v17.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e315 // udot v21.4s, v24.16b, v3.4b[0]\n"
+ "ldr q24, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
+ ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
+ ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
+ ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
+ ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
+ ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
+ ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
"126:" // Height 4: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 116b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
- "prfm pstl1keep, [x21, #0x0]\n"
"bge 135f\n"
- "tbz x10, #3, 130f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v21.4s }, [x21], #0x10\n"
- "tbz x10, #2, 128f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x21], #0x10\n"
- "tbz x10, #1, 127f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "tbz x10, #0, 134f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
+ "tbz x11, #3, 130f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 128f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 127f\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "tbz x11, #0, 134f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
"b 134f\n"
"127:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x10, #0, 134f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
+ "tbz x11, #0, 134f\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
"b 134f\n"
"128:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x10, #1, 129f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d22, [x21], #0x8\n"
- "tbz x10, #0, 134f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v22.s }[2], [x21]\n"
+ "tbz x11, #1, 129f\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz x11, #0, 134f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
"b 134f\n"
"129:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x10, #0, 134f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s22, [x21, #0x0]\n"
+ "tbz x11, #0, 134f\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
"b 134f\n"
"130:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x10, #2, 132f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "tbz x10, #1, 131f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d21, [x21], #0x8\n"
- "tbz x10, #0, 134f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v21.s }[2], [x21]\n"
+ "tbz x11, #2, 132f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 131f\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz x11, #0, 134f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
"b 134f\n"
"131:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x10, #0, 134f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s21, [x21, #0x0]\n"
+ "tbz x11, #0, 134f\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
"b 134f\n"
"132:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x10, #1, 133f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "tbz x10, #0, 134f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v20.s }[2], [x21]\n"
+ "tbz x11, #1, 133f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz x11, #0, 134f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
"b 134f\n"
"133:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s20, [x21, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
"134:" // Height 4: Partial direct writeback: Done
"b 136f\n"
"135:" // Height 4: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q20, [x21, #0x0]\n"
- "str q21, [x21, #0x10]\n"
- "str q22, [x21, #0x20]\n"
- "str q23, [x21, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
"136:" // Height 4: Writeback done
- "subs x10, x10, #0x10\n"
+ "subs x11, x11, #0x10\n"
"bgt 104b\n"
"b 206f\n"
"137:" // Height 5
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"138:" // Height 5: Column loop
"tbz %x[flags], #0, 148f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
"bge 147f\n"
- "tbz x10, #3, 142f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "ld1 { v21.4s }, [x21], #0x10\n"
- "ld1 { v25.4s }, [x20], #0x10\n"
- "tbz x10, #2, 140f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "ld1 { v22.4s }, [x21], #0x10\n"
- "ld1 { v26.4s }, [x20], #0x10\n"
- "tbz x10, #1, 139f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "ldr d27, [x20], #0x8\n"
- "tbz x10, #0, 146f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
- "ld1 { v23.s }[2], [x21]\n"
- "ld1 { v27.s }[2], [x20]\n"
+ "tbz x11, #3, 142f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 140f\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 139f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
"b 146f\n"
"139:" // Height 5: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x10, #0, 146f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
- "ldr s23, [x21, #0x0]\n"
- "ldr s27, [x20, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 146f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
"b 146f\n"
"140:" // Height 5: Partial accumulate: partial_2_8
- "tbz x10, #1, 141f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
- "ldr d26, [x20], #0x8\n"
- "tbz x10, #0, 146f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
- "ld1 { v22.s }[2], [x21]\n"
- "ld1 { v26.s }[2], [x20]\n"
+ "tbz x11, #1, 141f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
"b 146f\n"
"141:" // Height 5: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x10, #0, 146f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
- "ldr s22, [x21, #0x0]\n"
- "ldr s26, [x20, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 146f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
"b 146f\n"
"142:" // Height 5: Partial accumulate: partial_4_0
- "tbz x10, #2, 144f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "tbz x10, #1, 143f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d21, [x21], #0x8\n"
- "ldr d25, [x20], #0x8\n"
- "tbz x10, #0, 146f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
- "ld1 { v21.s }[2], [x21]\n"
- "ld1 { v25.s }[2], [x20]\n"
+ "tbz x11, #2, 144f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 143f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
"b 146f\n"
"143:" // Height 5: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x10, #0, 146f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
- "ldr s21, [x21, #0x0]\n"
- "ldr s25, [x20, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 146f\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
"b 146f\n"
"144:" // Height 5: Partial accumulate: partial_2_0
- "tbz x10, #1, 145f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "ldr d24, [x20], #0x8\n"
- "tbz x10, #0, 146f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
- "ld1 { v24.s }[2], [x20]\n"
+ "tbz x11, #1, 145f\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
"b 146f\n"
"145:" // Height 5: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
- "ldr s24, [x20, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
"146:" // Height 5: Partial accumulate: Done
- "sub x28, x28, x24\n"
+ "sub x9, x9, x25\n"
"b 149f\n"
"147:" // Height 5: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
- "ldr q20, [x21, #0x0]\n"
- "ldr q21, [x21, #0x10]\n"
- "ldr q22, [x21, #0x20]\n"
- "ldr q23, [x21, #0x30]\n"
- "ldr q24, [x20, #0x0]\n"
- "ldr q25, [x20, #0x10]\n"
- "ldr q26, [x20, #0x20]\n"
- "ldr q27, [x20, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
"b 149f\n"
"148:" // Height 5: no accumulate
"movi v8.4s, #0x0\n"
@@ -1964,683 +1963,683 @@ void a64_hybrid_u8u32_dot_6x16 (
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
"149:" // Height 5: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 151f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "cbnz x27, 152f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 152f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
"b 152f\n"
"151:" // Height 5: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"152:" // Height 5: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 155f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 154f\n"
"153:" // Height 5: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x21, x21, #0x10\n"
+ "ldr q29, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x26, x26, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "cmp x26, #0x20\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "cmp x27, #0x20\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr q2, [x23, #0x0]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr q3, [x22, #0x0]\n"
- ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
- "ldr q4, [x21, #0x0]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n"
+ "ldr q0, [x26, #0x0]\n"
+ ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 153b\n"
"154:" // Height 5: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "ldr q29, [x10, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
- ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
- ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
- ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
- ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
- ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
- ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
- ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
- ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
- ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
- ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
- ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x40]\n"
+ ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x10, #0x50]\n"
+ ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x60]\n"
+ ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x70]\n"
+ ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
+ "ldr q29, [x10, #0x80]\n"
+ ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
+ "ldr q28, [x10, #0x90]\n"
+ ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xa0]\n"
+ ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xb0]\n"
+ ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
+ ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
+ ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
+ ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
+ ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
+ "ldr q29, [x10, #0xc0]\n"
+ ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
+ ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
+ ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
+ "ldr q28, [x10, #0xd0]\n"
+ ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
+ "ldr q29, [x10, #0xe0]\n"
+ ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
+ "ldr q28, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n"
+ ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n"
+ ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n"
+ ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n"
"155:" // Height 5: Multiply loop: Main loop skip
- "cbz x26, 160f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 160f\n"
+ "cmp x27, #0x4\n"
"blt 157f\n"
"156:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr s2, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s4, [x21], #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s1, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr s0, [x24], #0x4\n"
+ "ldr s31, [x23], #0x4\n"
+ "ldr s30, [x22], #0x4\n"
+ "ldr q29, [x10, #0x0]\n"
+ ".inst 0x6f82e3a8 // udot v8.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ ".inst 0x6f80e3b0 // udot v16.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe3b4 // udot v20.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee3b8 // udot v24.4s, v29.16b, v30.4b[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ ".inst 0x6f82e389 // udot v9.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f80e391 // udot v17.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe395 // udot v21.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee399 // udot v25.4s, v28.16b, v30.4b[0]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6f82e3aa // udot v10.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe3b6 // udot v22.4s, v29.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee3ba // udot v26.4s, v29.16b, v30.4b[0]\n"
+ ".inst 0x6f82e38b // udot v11.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f9fe397 // udot v23.4s, v28.16b, v31.4b[0]\n"
+ ".inst 0x6f9ee39b // udot v27.4s, v28.16b, v30.4b[0]\n"
"bge 156b\n"
- "cbz x26, 160f\n"
"157:" // Height 5: Multiply loop: Skip odd blocks
- "tbz x26, #1, 158f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h4, [x21], #0x2\n"
- "tbz x26, #0, 159f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
- "ld1 { v3.b }[2], [x22]\n"
- "ld1 { v4.b }[2], [x21]\n"
+ "cbz x27, 160f\n"
+ "tbz x27, #1, 158f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x27, #0, 159f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
"b 159f\n"
"158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
- "ldr b3, [x22, #0x0]\n"
- "ldr b4, [x21, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
"159:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x0]\n"
+ "ldr q28, [x10, #0x10]\n"
+ ".inst 0x6f80e3a8 // udot v8.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3b0 // udot v16.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f83e3b4 // udot v20.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x6f84e3b8 // udot v24.4s, v29.16b, v4.4b[0]\n"
+ "ldr q29, [x10, #0x20]\n"
+ ".inst 0x6f80e389 // udot v9.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e391 // udot v17.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e395 // udot v21.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e399 // udot v25.4s, v28.16b, v4.4b[0]\n"
+ "ldr q28, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
+ ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
+ ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
+ ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
+ ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
+ ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
+ ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
+ ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
+ ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
"160:" // Height 5: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 150b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
+ "cmp x11, #0x10\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19, LSL #2\n"
- "prfm pstl1keep, [x20, #0x0]\n"
"bge 169f\n"
- "tbz x10, #3, 164f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v21.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "st1 { v25.4s }, [x20], #0x10\n"
- "tbz x10, #2, 162f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x21], #0x10\n"
- "st1 { v26.4s }, [x20], #0x10\n"
- "tbz x10, #1, 161f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "str d27, [x20], #0x8\n"
- "tbz x10, #0, 168f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
- "st1 { v27.s }[2], [x20]\n"
+ "tbz x11, #3, 164f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 162f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 161f\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x11, #0, 168f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
"b 168f\n"
"161:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x10, #0, 168f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
- "str s27, [x20, #0x0]\n"
+ "tbz x11, #0, 168f\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
"b 168f\n"
"162:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x10, #1, 163f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d22, [x21], #0x8\n"
- "str d26, [x20], #0x8\n"
- "tbz x10, #0, 168f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v22.s }[2], [x21]\n"
- "st1 { v26.s }[2], [x20]\n"
+ "tbz x11, #1, 163f\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x11, #0, 168f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
"b 168f\n"
"163:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x10, #0, 168f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s22, [x21, #0x0]\n"
- "str s26, [x20, #0x0]\n"
+ "tbz x11, #0, 168f\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
"b 168f\n"
"164:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x10, #2, 166f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "tbz x10, #1, 165f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d21, [x21], #0x8\n"
- "str d25, [x20], #0x8\n"
- "tbz x10, #0, 168f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v21.s }[2], [x21]\n"
- "st1 { v25.s }[2], [x20]\n"
+ "tbz x11, #2, 166f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 165f\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x11, #0, 168f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
"b 168f\n"
"165:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x10, #0, 168f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s21, [x21, #0x0]\n"
- "str s25, [x20, #0x0]\n"
+ "tbz x11, #0, 168f\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
"b 168f\n"
"166:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x10, #1, 167f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "tbz x10, #0, 168f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v20.s }[2], [x21]\n"
- "st1 { v24.s }[2], [x20]\n"
+ "tbz x11, #1, 167f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x11, #0, 168f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
"b 168f\n"
"167:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s20, [x21, #0x0]\n"
- "str s24, [x20, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
"168:" // Height 5: Partial direct writeback: Done
"b 170f\n"
"169:" // Height 5: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q20, [x21, #0x0]\n"
- "str q21, [x21, #0x10]\n"
- "str q22, [x21, #0x20]\n"
- "str q23, [x21, #0x30]\n"
- "str q24, [x20, #0x0]\n"
- "str q25, [x20, #0x10]\n"
- "str q26, [x20, #0x20]\n"
- "str q27, [x20, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
"170:" // Height 5: Writeback done
- "subs x10, x10, #0x10\n"
+ "subs x11, x11, #0x10\n"
"bgt 138b\n"
"b 206f\n"
"171:" // Height 6
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x18\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"172:" // Height 6: Column loop
"tbz %x[flags], #0, 182f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
- "add x19, x20, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x20, x21, x20, LSL #2\n"
"bge 181f\n"
- "tbz x10, #3, 176f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v28.4s }, [x19], #0x10\n"
- "ld1 { v9.4s }, [x28], #0x10\n"
- "ld1 { v13.4s }, [x23], #0x10\n"
- "ld1 { v17.4s }, [x22], #0x10\n"
- "ld1 { v21.4s }, [x21], #0x10\n"
- "ld1 { v25.4s }, [x20], #0x10\n"
- "ld1 { v29.4s }, [x19], #0x10\n"
- "tbz x10, #2, 174f\n"
- "ld1 { v10.4s }, [x28], #0x10\n"
- "ld1 { v14.4s }, [x23], #0x10\n"
- "ld1 { v18.4s }, [x22], #0x10\n"
- "ld1 { v22.4s }, [x21], #0x10\n"
- "ld1 { v26.4s }, [x20], #0x10\n"
- "ld1 { v30.4s }, [x19], #0x10\n"
- "tbz x10, #1, 173f\n"
- "mov x24, #0x38\n"
- "ldr d11, [x28], #0x8\n"
- "ldr d15, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "ldr d27, [x20], #0x8\n"
- "ldr d31, [x19], #0x8\n"
- "tbz x10, #0, 180f\n"
- "ld1 { v11.s }[2], [x28]\n"
- "ld1 { v15.s }[2], [x23]\n"
- "ld1 { v19.s }[2], [x22]\n"
- "ld1 { v23.s }[2], [x21]\n"
- "ld1 { v27.s }[2], [x20]\n"
- "ld1 { v31.s }[2], [x19]\n"
+ "tbz x11, #3, 176f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v29.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 174f\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v30.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 173f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x11, #0, 180f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "ld1 { v31.s }[2], [x20]\n"
"b 180f\n"
"173:" // Height 6: Partial accumulate: partial_1_12
- "mov x24, #0x30\n"
- "tbz x10, #0, 180f\n"
- "ldr s11, [x28, #0x0]\n"
- "ldr s15, [x23, #0x0]\n"
- "ldr s19, [x22, #0x0]\n"
- "ldr s23, [x21, #0x0]\n"
- "ldr s27, [x20, #0x0]\n"
- "ldr s31, [x19, #0x0]\n"
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 180f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "ldr s31, [x20, #0x0]\n"
"b 180f\n"
"174:" // Height 6: Partial accumulate: partial_2_8
- "tbz x10, #1, 175f\n"
- "ldr d10, [x28], #0x8\n"
- "ldr d14, [x23], #0x8\n"
- "mov x24, #0x28\n"
- "ldr d18, [x22], #0x8\n"
- "ldr d22, [x21], #0x8\n"
- "ldr d26, [x20], #0x8\n"
- "ldr d30, [x19], #0x8\n"
- "tbz x10, #0, 180f\n"
- "ld1 { v10.s }[2], [x28]\n"
- "ld1 { v14.s }[2], [x23]\n"
- "ld1 { v18.s }[2], [x22]\n"
- "ld1 { v22.s }[2], [x21]\n"
- "ld1 { v26.s }[2], [x20]\n"
- "ld1 { v30.s }[2], [x19]\n"
+ "tbz x11, #1, 175f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
+ "tbz x11, #0, 180f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "ld1 { v30.s }[2], [x20]\n"
"b 180f\n"
"175:" // Height 6: Partial accumulate: partial_1_8
- "mov x24, #0x20\n"
- "tbz x10, #0, 180f\n"
- "ldr s10, [x28, #0x0]\n"
- "ldr s14, [x23, #0x0]\n"
- "ldr s18, [x22, #0x0]\n"
- "ldr s22, [x21, #0x0]\n"
- "ldr s26, [x20, #0x0]\n"
- "ldr s30, [x19, #0x0]\n"
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 180f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "ldr s30, [x20, #0x0]\n"
"b 180f\n"
"176:" // Height 6: Partial accumulate: partial_4_0
- "tbz x10, #2, 178f\n"
- "ld1 { v8.4s }, [x28], #0x10\n"
- "ld1 { v12.4s }, [x23], #0x10\n"
- "ld1 { v16.4s }, [x22], #0x10\n"
- "ld1 { v20.4s }, [x21], #0x10\n"
- "ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v28.4s }, [x19], #0x10\n"
- "tbz x10, #1, 177f\n"
- "mov x24, #0x18\n"
- "ldr d9, [x28], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d17, [x22], #0x8\n"
- "ldr d21, [x21], #0x8\n"
- "ldr d25, [x20], #0x8\n"
- "ldr d29, [x19], #0x8\n"
- "tbz x10, #0, 180f\n"
- "ld1 { v9.s }[2], [x28]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v17.s }[2], [x22]\n"
- "ld1 { v21.s }[2], [x21]\n"
- "ld1 { v25.s }[2], [x20]\n"
- "ld1 { v29.s }[2], [x19]\n"
+ "tbz x11, #2, 178f\n"
+ "ld1 { v8.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 177f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "ldr d29, [x20], #0x8\n"
+ "tbz x11, #0, 180f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v29.s }[2], [x20]\n"
"b 180f\n"
"177:" // Height 6: Partial accumulate: partial_1_4
- "mov x24, #0x10\n"
- "tbz x10, #0, 180f\n"
- "ldr s9, [x28, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s17, [x22, #0x0]\n"
- "ldr s21, [x21, #0x0]\n"
- "ldr s25, [x20, #0x0]\n"
- "ldr s29, [x19, #0x0]\n"
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 180f\n"
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "ldr s29, [x20, #0x0]\n"
"b 180f\n"
"178:" // Height 6: Partial accumulate: partial_2_0
- "tbz x10, #1, 179f\n"
- "ldr d8, [x28], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "mov x24, #0x8\n"
- "ldr d16, [x22], #0x8\n"
- "ldr d20, [x21], #0x8\n"
- "ldr d24, [x20], #0x8\n"
- "ldr d28, [x19], #0x8\n"
- "tbz x10, #0, 180f\n"
- "ld1 { v8.s }[2], [x28]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v16.s }[2], [x22]\n"
- "ld1 { v20.s }[2], [x21]\n"
- "ld1 { v24.s }[2], [x20]\n"
- "ld1 { v28.s }[2], [x19]\n"
+ "tbz x11, #1, 179f\n"
+ "ldr d8, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "tbz x11, #0, 180f\n"
+ "ld1 { v8.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
+ "ld1 { v28.s }[2], [x20]\n"
"b 180f\n"
"179:" // Height 6: Partial accumulate: partial_1_0
- "ldr s8, [x28, #0x0]\n"
- "mov x24, #0x0\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s16, [x22, #0x0]\n"
- "ldr s20, [x21, #0x0]\n"
- "ldr s24, [x20, #0x0]\n"
- "ldr s28, [x19, #0x0]\n"
+ "ldr s8, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
+ "ldr s28, [x20, #0x0]\n"
"180:" // Height 6: Partial accumulate: Done
- "sub x28, x28, x24\n"
+ "sub x9, x9, x25\n"
"b 183f\n"
"181:" // Height 6: full accumulate
- "ldr q8, [x28, #0x0]\n"
- "ldr q9, [x28, #0x10]\n"
- "ldr q10, [x28, #0x20]\n"
- "ldr q11, [x28, #0x30]\n"
- "ldr q12, [x23, #0x0]\n"
- "ldr q13, [x23, #0x10]\n"
- "ldr q14, [x23, #0x20]\n"
- "ldr q15, [x23, #0x30]\n"
- "ldr q16, [x22, #0x0]\n"
- "ldr q17, [x22, #0x10]\n"
- "ldr q18, [x22, #0x20]\n"
- "ldr q19, [x22, #0x30]\n"
- "ldr q20, [x21, #0x0]\n"
- "ldr q21, [x21, #0x10]\n"
- "ldr q22, [x21, #0x20]\n"
- "ldr q23, [x21, #0x30]\n"
- "ldr q24, [x20, #0x0]\n"
- "ldr q25, [x20, #0x10]\n"
- "ldr q26, [x20, #0x20]\n"
- "ldr q27, [x20, #0x30]\n"
- "ldr q28, [x19, #0x0]\n"
- "ldr q29, [x19, #0x10]\n"
- "ldr q30, [x19, #0x20]\n"
- "ldr q31, [x19, #0x30]\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q29, [x20, #0x10]\n"
+ "ldr q30, [x20, #0x20]\n"
+ "ldr q31, [x20, #0x30]\n"
"b 183f\n"
"182:" // Height 6: no accumulate
"movi v8.4s, #0x0\n"
@@ -2668,297 +2667,297 @@ void a64_hybrid_u8u32_dot_6x16 (
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
"183:" // Height 6: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 185f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x27, 186f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 186f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
"b 186f\n"
"185:" // Height 6: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"186:" // Height 6: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"blt 189f\n"
- "ldr q0, [x25, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "cmp x26, #0x20\n"
- "ldr q2, [x23, #0x0]\n"
- "ldr q3, [x22, #0x0]\n"
- "ldr q4, [x21, #0x0]\n"
- "ldr q5, [x20, #0x0]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"blt 188f\n"
"187:" // Height 6: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "add x25, x25, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x23, x23, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x21, x21, #0x10\n"
".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x20, x20, #0x10\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "sub x26, x26, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "cmp x26, #0x20\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x9, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "ldr q0, [x25, #0x0]\n"
+ "ldr q0, [x26, #0x0]\n"
".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "ldr q1, [x24, #0x0]\n"
+ "ldr q1, [x25, #0x0]\n"
".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "ldr q2, [x23, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- "ldr q3, [x22, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
- "ldr q4, [x21, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
- "ldr q5, [x20, #0x0]\n"
+ "ldr q5, [x21, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
"bge 187b\n"
"188:" // Height 6: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- "sub x26, x26, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x22, x22, #0x10\n"
".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x21, x21, #0x10\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- "add x20, x20, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x10\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x9, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x9, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x9, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x9, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x9, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x9, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x9, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x9, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x9, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x9, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x9, #0xf0]\n"
- "add x9, x9, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
@@ -2972,292 +2971,291 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
"189:" // Height 6: Multiply loop: Main loop skip
- "cbz x26, 194f\n"
- "cmp x26, #0x4\n"
+ "cbz x27, 194f\n"
+ "cmp x27, #0x4\n"
"blt 191f\n"
"190:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x25], #0x4\n"
- "sub x26, x26, #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "cmp x26, #0x4\n"
- "ldr s2, [x23], #0x4\n"
+ "ldr s7, [x26], #0x4\n"
+ "ldr s6, [x25], #0x4\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"ldr s3, [x22], #0x4\n"
- "ldr s4, [x21], #0x4\n"
- "ldr s5, [x20], #0x4\n"
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6f87e028 // udot v8.4s, v1.16b, v7.4b[0]\n"
+ ".inst 0x6f86e02c // udot v12.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x6f85e030 // udot v16.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x6f84e034 // udot v20.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x6f83e038 // udot v24.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x6f82e03c // udot v28.4s, v1.16b, v2.4b[0]\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x6f87e009 // udot v9.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x6f86e00d // udot v13.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x6f85e011 // udot v17.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x6f84e015 // udot v21.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x6f83e019 // udot v25.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x6f82e01d // udot v29.4s, v0.16b, v2.4b[0]\n"
+ "ldr q0, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6f87e02a // udot v10.4s, v1.16b, v7.4b[0]\n"
+ ".inst 0x6f86e02e // udot v14.4s, v1.16b, v6.4b[0]\n"
+ ".inst 0x6f85e032 // udot v18.4s, v1.16b, v5.4b[0]\n"
+ ".inst 0x6f84e036 // udot v22.4s, v1.16b, v4.4b[0]\n"
+ ".inst 0x6f83e03a // udot v26.4s, v1.16b, v3.4b[0]\n"
+ ".inst 0x6f82e03e // udot v30.4s, v1.16b, v2.4b[0]\n"
+ ".inst 0x6f87e00b // udot v11.4s, v0.16b, v7.4b[0]\n"
+ ".inst 0x6f86e00f // udot v15.4s, v0.16b, v6.4b[0]\n"
+ ".inst 0x6f85e013 // udot v19.4s, v0.16b, v5.4b[0]\n"
+ ".inst 0x6f84e017 // udot v23.4s, v0.16b, v4.4b[0]\n"
+ ".inst 0x6f83e01b // udot v27.4s, v0.16b, v3.4b[0]\n"
+ ".inst 0x6f82e01f // udot v31.4s, v0.16b, v2.4b[0]\n"
"bge 190b\n"
- "cbz x26, 194f\n"
"191:" // Height 6: Multiply loop: Skip odd blocks
- "tbz x26, #1, 192f\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h4, [x21], #0x2\n"
- "ldr h5, [x20], #0x2\n"
- "tbz x26, #0, 193f\n"
- "ld1 { v0.b }[2], [x25]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x23]\n"
- "ld1 { v3.b }[2], [x22]\n"
- "ld1 { v4.b }[2], [x21]\n"
- "ld1 { v5.b }[2], [x20]\n"
+ "cbz x27, 194f\n"
+ "tbz x27, #1, 192f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x21], #0x2\n"
+ "tbz x27, #0, 193f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x21]\n"
"b 193f\n"
"192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x25, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x23, #0x0]\n"
- "ldr b3, [x22, #0x0]\n"
- "ldr b4, [x21, #0x0]\n"
- "ldr b5, [x20, #0x0]\n"
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x21, #0x0]\n"
"193:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x9, #0x0]\n"
- ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x9, #0x10]\n"
- ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x9, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x9, #0x30]\n"
- "add x9, x9, #0x40\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ec // udot v12.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f0 // udot v16.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f4 // udot v20.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f8 // udot v24.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fc // udot v28.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6f80e0c9 // udot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cd // udot v13.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d1 // udot v17.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d5 // udot v21.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d9 // udot v25.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0dd // udot v29.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6f80e0ea // udot v10.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ee // udot v14.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f2 // udot v18.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f6 // udot v22.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fa // udot v26.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fe // udot v30.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0cb // udot v11.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cf // udot v15.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d3 // udot v19.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d7 // udot v23.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0db // udot v27.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0df // udot v31.4s, v6.16b, v5.4b[0]\n"
"194:" // Height 6: Multiply loop: No odd multiplies
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x27, x27, #0x1\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 184b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "prfm pstl1keep, [x28, #0x0]\n"
- "cmp x10, #0x10\n"
- "add x23, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "add x22, x23, x19, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
"prfm pstl1keep, [x22, #0x0]\n"
- "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x20, x21, x19, LSL #2\n"
"prfm pstl1keep, [x20, #0x0]\n"
- "add x19, x20, x19, LSL #2\n"
- "prfm pstl1keep, [x19, #0x0]\n"
"bge 203f\n"
- "tbz x10, #3, 198f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v9.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v13.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v17.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v21.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "st1 { v25.4s }, [x20], #0x10\n"
- "st1 { v28.4s }, [x19], #0x10\n"
- "st1 { v29.4s }, [x19], #0x10\n"
- "tbz x10, #2, 196f\n"
- "st1 { v10.4s }, [x28], #0x10\n"
- "st1 { v14.4s }, [x23], #0x10\n"
- "st1 { v18.4s }, [x22], #0x10\n"
- "st1 { v22.4s }, [x21], #0x10\n"
- "st1 { v26.4s }, [x20], #0x10\n"
- "st1 { v30.4s }, [x19], #0x10\n"
- "tbz x10, #1, 195f\n"
- "str d11, [x28], #0x8\n"
- "str d15, [x23], #0x8\n"
- "str d19, [x22], #0x8\n"
- "str d23, [x21], #0x8\n"
- "str d27, [x20], #0x8\n"
- "str d31, [x19], #0x8\n"
- "tbz x10, #0, 202f\n"
- "st1 { v11.s }[2], [x28]\n"
- "st1 { v15.s }[2], [x23]\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
- "st1 { v27.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x19]\n"
+ "tbz x11, #3, 198f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "st1 { v29.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 196f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v30.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 195f\n"
+ "str d11, [x9], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "str d31, [x20], #0x8\n"
+ "tbz x11, #0, 202f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 202f\n"
"195:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x10, #0, 202f\n"
- "str s11, [x28, #0x0]\n"
- "str s15, [x23, #0x0]\n"
- "str s19, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
- "str s27, [x20, #0x0]\n"
- "str s31, [x19, #0x0]\n"
+ "tbz x11, #0, 202f\n"
+ "str s11, [x9, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
"b 202f\n"
"196:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x10, #1, 197f\n"
- "str d10, [x28], #0x8\n"
- "str d14, [x23], #0x8\n"
- "str d18, [x22], #0x8\n"
- "str d22, [x21], #0x8\n"
- "str d26, [x20], #0x8\n"
- "str d30, [x19], #0x8\n"
- "tbz x10, #0, 202f\n"
- "st1 { v10.s }[2], [x28]\n"
- "st1 { v14.s }[2], [x23]\n"
- "st1 { v18.s }[2], [x22]\n"
- "st1 { v22.s }[2], [x21]\n"
- "st1 { v26.s }[2], [x20]\n"
- "st1 { v30.s }[2], [x19]\n"
+ "tbz x11, #1, 197f\n"
+ "str d10, [x9], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "str d30, [x20], #0x8\n"
+ "tbz x11, #0, 202f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "st1 { v30.s }[2], [x20]\n"
"b 202f\n"
"197:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x10, #0, 202f\n"
- "str s10, [x28, #0x0]\n"
- "str s14, [x23, #0x0]\n"
- "str s18, [x22, #0x0]\n"
- "str s22, [x21, #0x0]\n"
- "str s26, [x20, #0x0]\n"
- "str s30, [x19, #0x0]\n"
+ "tbz x11, #0, 202f\n"
+ "str s10, [x9, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "str s30, [x20, #0x0]\n"
"b 202f\n"
"198:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x10, #2, 200f\n"
- "st1 { v8.4s }, [x28], #0x10\n"
- "st1 { v12.4s }, [x23], #0x10\n"
- "st1 { v16.4s }, [x22], #0x10\n"
- "st1 { v20.4s }, [x21], #0x10\n"
- "st1 { v24.4s }, [x20], #0x10\n"
- "st1 { v28.4s }, [x19], #0x10\n"
- "tbz x10, #1, 199f\n"
- "str d9, [x28], #0x8\n"
- "str d13, [x23], #0x8\n"
- "str d17, [x22], #0x8\n"
- "str d21, [x21], #0x8\n"
- "str d25, [x20], #0x8\n"
- "str d29, [x19], #0x8\n"
- "tbz x10, #0, 202f\n"
- "st1 { v9.s }[2], [x28]\n"
- "st1 { v13.s }[2], [x23]\n"
- "st1 { v17.s }[2], [x22]\n"
- "st1 { v21.s }[2], [x21]\n"
- "st1 { v25.s }[2], [x20]\n"
- "st1 { v29.s }[2], [x19]\n"
+ "tbz x11, #2, 200f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 199f\n"
+ "str d9, [x9], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "str d29, [x20], #0x8\n"
+ "tbz x11, #0, 202f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "st1 { v29.s }[2], [x20]\n"
"b 202f\n"
"199:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x10, #0, 202f\n"
- "str s9, [x28, #0x0]\n"
- "str s13, [x23, #0x0]\n"
- "str s17, [x22, #0x0]\n"
- "str s21, [x21, #0x0]\n"
- "str s25, [x20, #0x0]\n"
- "str s29, [x19, #0x0]\n"
+ "tbz x11, #0, 202f\n"
+ "str s9, [x9, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "str s29, [x20, #0x0]\n"
"b 202f\n"
"200:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x10, #1, 201f\n"
- "str d8, [x28], #0x8\n"
- "str d12, [x23], #0x8\n"
- "str d16, [x22], #0x8\n"
- "str d20, [x21], #0x8\n"
- "str d24, [x20], #0x8\n"
- "str d28, [x19], #0x8\n"
- "tbz x10, #0, 202f\n"
- "st1 { v8.s }[2], [x28]\n"
- "st1 { v12.s }[2], [x23]\n"
- "st1 { v16.s }[2], [x22]\n"
- "st1 { v20.s }[2], [x21]\n"
- "st1 { v24.s }[2], [x20]\n"
- "st1 { v28.s }[2], [x19]\n"
+ "tbz x11, #1, 201f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "str d28, [x20], #0x8\n"
+ "tbz x11, #0, 202f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "st1 { v28.s }[2], [x20]\n"
"b 202f\n"
"201:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x28, #0x0]\n"
- "str s12, [x23, #0x0]\n"
- "str s16, [x22, #0x0]\n"
- "str s20, [x21, #0x0]\n"
- "str s24, [x20, #0x0]\n"
- "str s28, [x19, #0x0]\n"
+ "str s8, [x9, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "str s28, [x20, #0x0]\n"
"202:" // Height 6: Partial direct writeback: Done
"b 204f\n"
"203:" // Height 6: Full writeback
- "str q8, [x28, #0x0]\n"
- "str q9, [x28, #0x10]\n"
- "str q10, [x28, #0x20]\n"
- "str q11, [x28, #0x30]\n"
- "add x28, x28, #0x40\n"
- "str q12, [x23, #0x0]\n"
- "str q13, [x23, #0x10]\n"
- "str q14, [x23, #0x20]\n"
- "str q15, [x23, #0x30]\n"
- "str q16, [x22, #0x0]\n"
- "str q17, [x22, #0x10]\n"
- "str q18, [x22, #0x20]\n"
- "str q19, [x22, #0x30]\n"
- "str q20, [x21, #0x0]\n"
- "str q21, [x21, #0x10]\n"
- "str q22, [x21, #0x20]\n"
- "str q23, [x21, #0x30]\n"
- "str q24, [x20, #0x0]\n"
- "str q25, [x20, #0x10]\n"
- "str q26, [x20, #0x20]\n"
- "str q27, [x20, #0x30]\n"
- "str q28, [x19, #0x0]\n"
- "str q29, [x19, #0x10]\n"
- "str q30, [x19, #0x20]\n"
- "str q31, [x19, #0x30]\n"
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "str q28, [x20, #0x0]\n"
+ "str q29, [x20, #0x10]\n"
+ "str q30, [x20, #0x20]\n"
+ "str q31, [x20, #0x30]\n"
"204:" // Height 6: Writeback done
- "subs x10, x10, #0x10\n"
+ "subs x11, x11, #0x10\n"
"bgt 172b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 206f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 205f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"205:" // Update direct input
- "mov x19, #0x6\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"206:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
new file mode 100644
index 0000000000..e360452108
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint32_t>, \
+ const uint32_t *, Activation, bool
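+
+// ARGLIST mirrors the kernel signature in generic.cpp below:
+// (num_strings, string_lengths, A, M, N, B, output, bias, activation,
+// accumulate); the bias and activation parameters are part of the
+// common hybrid interface but are left unnamed (unused) by this kernel.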
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_u8u32_mmla_6x16( ARGLIST );
+
+class cls_a64_hybrid_u8u32_mmla_6x16
+{
+public:
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
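+ // Data rearrangement for the B operand; the 6, 16, 8 template
+ // arguments correspond to out_height(), out_width() and k_unroll()
+ // above.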
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 8> transforms = {};
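+
+ // Rough per-CPU-model throughput figures used by arm_gemm's kernel
+ // selection heuristics; the single value looks like an effective
+ // MACs-per-cycle estimate and the three-value form appears to add
+ // prepare/merge cost estimates.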
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 55.05 };
+ case CPUModel::A510:
+ return { 30.34 };
+ case CPUModel::V1:
+ return { 83.77 };
+ }
+ }
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 55.31, 15.72, 0.62 };
+ case CPUModel::A510:
+ return { 33.64, 3.92, 0.48 };
+ case CPUModel::V1:
+ return { 63.94, 16.18, 0.83 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_u8u32_mmla_6x16;
+ cls_a64_hybrid_u8u32_mmla_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..364f388e79
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
@@ -0,0 +1,3449 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8u32_mmla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+ const uint32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
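+ // ka is handed to the inline assembly by address ([args_ptr]); the
+ // asm block loads each field through the offsetof()-derived
+ // [offsetof_*] immediates in its constraint list.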
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
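+ // Flag bits consumed by the assembly: bit 0 = accumulate into the
+ // existing output, bit 2 = indirect output, bit 3 = indirect input.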
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
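+
+ // The assembly specialises on output height: the row loop at label 1
+ // dispatches to one of six height-1..6 blocks, and each block's
+ // column loop covers up to 16 output columns per pass (x11 counts N
+ // down in steps of 16).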
+ __asm__ __volatile__(
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 186f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 149f\n"
+ "beq 112f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 75f\n"
+ "beq 38f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "tbz %x[flags], #0, 13f\n"
+ "cmp x11, #0x10\n"
+ "bge 11f\n"
+ "tbz x11, #3, 6f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "tbz x11, #2, 4f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 3f\n"
+ "ldr d16, [x9], #0x8\n"
+ "mov x25, #0x38\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "b 10f\n"
+ "3:" // Height 1: Partial accumulate: partial_1_12
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 10f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "b 10f\n"
+ "4:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x11, #1, 5f\n"
+ "ldr d11, [x9], #0x8\n"
+ "mov x25, #0x28\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "b 10f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_8
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 10f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "b 10f\n"
+ "6:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x11, #2, 8f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 7f\n"
+ "ldr d10, [x9], #0x8\n"
+ "mov x25, #0x18\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "b 10f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_4
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 10f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "b 10f\n"
+ "8:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x11, #1, 9f\n"
+ "ldr d9, [x9], #0x8\n"
+ "mov x25, #0x8\n"
+ "tbz x11, #0, 10f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "b 10f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "mov x25, #0x0\n"
+ "10:" // Height 1: Partial accumulate: Done
+ "sub x9, x9, x25\n"
+ "b 12f\n"
+ "11:" // Height 1: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "12:" // Height 1: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 14f\n"
+ "13:" // Height 1: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "14:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "15:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 17f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "b 17f\n"
+ "16:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "17:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "blt 20f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q6, [x10, #0x10]\n"
+ "blt 19f\n"
+ "18:" // Height 1: Multiply loop: Main loop head
+ "trn1 v19.2d, v1.2d, v20.2d\n"
+ ".inst 0x6e87a668 // ummla v8.4s, v19.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e86a66c // ummla v12.4s, v19.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e92a669 // ummla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v20.2d\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x6e92a428 // ummla v8.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e91a42c // ummla v12.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e92a429 // ummla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e91a42d // ummla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e92a42a // ummla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n"
+ ".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ "add x10, x10, #0x100\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "bge 18b\n"
+ "19:" // Height 1: Multiply loop: Single iteration only
+ "trn1 v20.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e87a688 // ummla v8.4s, v20.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e86a68c // ummla v12.4s, v20.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e92a689 // ummla v9.4s, v20.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e91a68d // ummla v13.4s, v20.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a68a // ummla v10.4s, v20.16b, v18.16b\n"
+ "ldr q19, [x10, #0x60]\n"
+ ".inst 0x6e91a68e // ummla v14.4s, v20.16b, v17.16b\n"
+ "ldr q18, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v21.2d\n"
+ ".inst 0x6e93a68b // ummla v11.4s, v20.16b, v19.16b\n"
+ "ldr q17, [x10, #0x80]\n"
+ ".inst 0x6e92a68f // ummla v15.4s, v20.16b, v18.16b\n"
+ "ldr q19, [x10, #0x90]\n"
+ ".inst 0x6e91a428 // ummla v8.4s, v1.16b, v17.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e93a42c // ummla v12.4s, v1.16b, v19.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e92a429 // ummla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e91a42d // ummla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e92a42a // ummla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n"
+ ".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x10, x10, #0x100\n"
+ "20:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 27f\n"
+ "cmp x27, #0x8\n"
+ "blt 22f\n"
+ "21:" // Height 1: Multiply loop: Odd block loop
+ "ldr d19, [x26], #0x8\n"
+ "ldr q18, [x10, #0x0]\n"
+ "trn1 v19.2d, v19.2d, v17.2d\n"
+ "ldr q17, [x10, #0x10]\n"
+ ".inst 0x6e92a668 // ummla v8.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e91a66c // ummla v12.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e92a669 // ummla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
+ "bge 21b\n"
+ "22:" // Height 1: Multiply loop: Skip odd blocks
+ "cbz x27, 27f\n"
+ "tbz x27, #2, 24f\n"
+ "ldr s1, [x26], #0x4\n"
+ "tbz x27, #1, 23f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "tbz x27, #0, 26f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "b 26f\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 26f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "b 26f\n"
+ "24:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 25f\n"
+ "ldr h1, [x26], #0x2\n"
+ "tbz x27, #0, 26f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "b 26f\n"
+ "25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "26:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q23, [x10, #0x0]\n"
+ "ldr q18, [x10, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v17.2d\n"
+ ".inst 0x6e97a668 // ummla v8.4s, v19.16b, v23.16b\n"
+ "ldr q17, [x10, #0x20]\n"
+ ".inst 0x6e92a66c // ummla v12.4s, v19.16b, v18.16b\n"
+ "ldr q31, [x10, #0x30]\n"
+ ".inst 0x6e91a669 // ummla v9.4s, v19.16b, v17.16b\n"
+ "ldr q20, [x10, #0x40]\n"
+ ".inst 0x6e9fa66d // ummla v13.4s, v19.16b, v31.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e94a66a // ummla v10.4s, v19.16b, v20.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
+ "27:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 15b\n"
+ "cmp x11, #0x10\n"
+ "uzp1 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "uzp1 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v11.2d, v11.2d, v15.2d\n"
+ "bge 36f\n"
+ "tbz x11, #3, 31f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "st1 { v9.4s }, [x9], #0x10\n"
+ "tbz x11, #2, 29f\n"
+ "st1 { v10.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 28f\n"
+ "str d11, [x9], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v11.s }[2], [x9]\n"
+ "b 35f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 35f\n"
+ "str s11, [x9, #0x0]\n"
+ "b 35f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 30f\n"
+ "str d10, [x9], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v10.s }[2], [x9]\n"
+ "b 35f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 35f\n"
+ "str s10, [x9, #0x0]\n"
+ "b 35f\n"
+ "31:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 33f\n"
+ "st1 { v8.4s }, [x9], #0x10\n"
+ "tbz x11, #1, 32f\n"
+ "str d9, [x9], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v9.s }[2], [x9]\n"
+ "b 35f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 35f\n"
+ "str s9, [x9, #0x0]\n"
+ "b 35f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 34f\n"
+ "str d8, [x9], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v8.s }[2], [x9]\n"
+ "b 35f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x9, #0x0]\n"
+ "35:" // Height 1: Partial direct writeback: Done
+ "b 37f\n"
+ "36:" // Height 1: Full writeback
+ "str q8, [x9, #0x0]\n"
+ "str q9, [x9, #0x10]\n"
+ "str q10, [x9, #0x20]\n"
+ "str q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "37:" // Height 1: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 2b\n"
+ "b 224f\n"
+ "38:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "39:" // Height 2: Column loop
+ "tbz %x[flags], #0, 50f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x9, x20, LSL #2\n"
+ "bge 48f\n"
+ "tbz x11, #3, 43f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 41f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 40f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "tbz x11, #0, 47f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "b 47f\n"
+ "40:" // Height 2: Partial accumulate: partial_1_12
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 47f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "b 47f\n"
+ "41:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x11, #1, 42f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "tbz x11, #0, 47f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "b 47f\n"
+ "42:" // Height 2: Partial accumulate: partial_1_8
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 47f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "b 47f\n"
+ "43:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x11, #2, 45f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 44f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "tbz x11, #0, 47f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "b 47f\n"
+ "44:" // Height 2: Partial accumulate: partial_1_4
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 47f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "b 47f\n"
+ "45:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x11, #1, 46f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "tbz x11, #0, 47f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "b 47f\n"
+ "46:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "47:" // Height 2: Partial accumulate: Done
+ "sub x9, x9, x25\n"
+ "b 49f\n"
+ "48:" // Height 2: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "49:" // Height 2: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 51f\n"
+ "50:" // Height 2: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "51:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "52:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 53f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 54f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "b 54f\n"
+ "53:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "54:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "blt 57f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "blt 56f\n"
+ "55:" // Height 2: Multiply loop: Main loop head
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a668 // ummla v8.4s, v19.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e86a66c // ummla v12.4s, v19.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e92a669 // ummla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x6e92a428 // ummla v8.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e91a42c // ummla v12.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e92a429 // ummla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e91a42d // ummla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e92a42a // ummla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
+ "sub x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n"
+ "add x10, x10, #0x100\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "bge 55b\n"
+ "56:" // Height 2: Multiply loop: Single iteration only
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a668 // ummla v8.4s, v19.16b, v7.16b\n"
+ "ldr q18, [x10, #0x20]\n"
+ ".inst 0x6e86a66c // ummla v12.4s, v19.16b, v6.16b\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e92a669 // ummla v9.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x80]\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x90]\n"
+ ".inst 0x6e92a428 // ummla v8.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xa0]\n"
+ ".inst 0x6e91a42c // ummla v12.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xb0]\n"
+ ".inst 0x6e92a429 // ummla v9.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xc0]\n"
+ ".inst 0x6e91a42d // ummla v13.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xd0]\n"
+ ".inst 0x6e92a42a // ummla v10.4s, v1.16b, v18.16b\n"
+ "ldr q18, [x10, #0xe0]\n"
+ ".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n"
+ "ldr q17, [x10, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n"
+ ".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x10, x10, #0x100\n"
+ "57:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 64f\n"
+ "cmp x27, #0x8\n"
+ "blt 59f\n"
+ "58:" // Height 2: Multiply loop: Odd block loop
+ "ldr d18, [x26], #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "trn1 v19.2d, v18.2d, v17.2d\n"
+ "sub x27, x27, #0x8\n"
+ "ldr q17, [x10, #0x0]\n"
+ "ldr q22, [x10, #0x10]\n"
+ ".inst 0x6e91a668 // ummla v8.4s, v19.16b, v17.16b\n"
+ ".inst 0x6e96a66c // ummla v12.4s, v19.16b, v22.16b\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q17, [x10, #0x30]\n"
+ ".inst 0x6e81a669 // ummla v9.4s, v19.16b, v1.16b\n"
+ ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ "ldr q17, [x10, #0x70]\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
+ "bge 58b\n"
+ "59:" // Height 2: Multiply loop: Skip odd blocks
+ "cbz x27, 64f\n"
+ "tbz x27, #2, 61f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "tbz x27, #1, 60f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "tbz x27, #0, 63f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "b 63f\n"
+ "60:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 63f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "b 63f\n"
+ "61:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 62f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "tbz x27, #0, 63f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "b 63f\n"
+ "62:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "63:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q18, [x10, #0x0]\n"
+ "ldr q17, [x10, #0x10]\n"
+ "trn1 v19.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e92a668 // ummla v8.4s, v19.16b, v18.16b\n"
+ "ldr q5, [x10, #0x20]\n"
+ ".inst 0x6e91a66c // ummla v12.4s, v19.16b, v17.16b\n"
+ "ldr q21, [x10, #0x30]\n"
+ ".inst 0x6e85a669 // ummla v9.4s, v19.16b, v5.16b\n"
+ "ldr q18, [x10, #0x40]\n"
+ ".inst 0x6e95a66d // ummla v13.4s, v19.16b, v21.16b\n"
+ "ldr q17, [x10, #0x50]\n"
+ ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n"
+ "ldr q18, [x10, #0x60]\n"
+ ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n"
+ "ldr q17, [x10, #0x70]\n"
+ ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n"
+ ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n"
+ "add x10, x10, #0x80\n"
+ "64:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 52b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "bge 73f\n"
+ "tbz x11, #3, 68f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 66f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 65f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "b 72f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 72f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "b 72f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 67f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "b 72f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 72f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "b 72f\n"
+ "68:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 70f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 69f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "b 72f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 72f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "b 72f\n"
+ "70:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 71f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "b 72f\n"
+ "71:" // Height 2: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "72:" // Height 2: Partial direct writeback: Done
+ "b 74f\n"
+ "73:" // Height 2: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "74:" // Height 2: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 39b\n"
+ "b 224f\n"
+ "75:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "76:" // Height 3: Column loop
+ "tbz %x[flags], #0, 87f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x23, x24, x20, LSL #2\n"
+ "bge 85f\n"
+ "tbz x11, #3, 80f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 78f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 77f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d24, [x23], #0x8\n"
+ "tbz x11, #0, 84f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "b 84f\n"
+ "77:" // Height 3: Partial accumulate: partial_1_12
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 84f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "b 84f\n"
+ "78:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x11, #1, 79f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x11, #0, 84f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "b 84f\n"
+ "79:" // Height 3: Partial accumulate: partial_1_8
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 84f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "b 84f\n"
+ "80:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x11, #2, 82f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 81f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d18, [x23], #0x8\n"
+ "tbz x11, #0, 84f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "b 84f\n"
+ "81:" // Height 3: Partial accumulate: partial_1_4
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 84f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "b 84f\n"
+ "82:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x11, #1, 83f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "tbz x11, #0, 84f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "b 84f\n"
+ "83:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s17, [x23, #0x0]\n"
+ "84:" // Height 3: Partial accumulate: Done
+ "sub x9, x9, x25\n"
+ "b 86f\n"
+ "85:" // Height 3: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "86:" // Height 3: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 88f\n"
+ "87:" // Height 3: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "88:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "89:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 90f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 91f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "b 91f\n"
+ "90:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "91:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "blt 94f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "blt 93f\n"
+ "92:" // Height 3: Multiply loop: Main loop head
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa472 // ummla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e99a42e // ummla v14.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a476 // ummla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e9aa42b // ummla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa473 // ummla v19.4s, v3.16b, v26.16b\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x6e99a42f // ummla v15.4s, v1.16b, v25.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x6e99a477 // ummla v23.4s, v3.16b, v25.16b\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "bge 92b\n"
+ "93:" // Height 3: Multiply loop: Single iteration only
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
+ "trn1 v27.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v29.2d\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
+ ".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa472 // ummla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e99a42e // ummla v14.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a476 // ummla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e9aa42b // ummla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa473 // ummla v19.4s, v3.16b, v26.16b\n"
+ ".inst 0x6e99a42f // ummla v15.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a477 // ummla v23.4s, v3.16b, v25.16b\n"
+ "94:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 101f\n"
+ "cmp x27, #0x8\n"
+ "blt 96f\n"
+ "95:" // Height 3: Multiply loop: Odd block loop
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr q26, [x10, #0x0]\n"
+ "trn1 v27.2d, v25.2d, v27.2d\n"
+ ".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e99a78c // ummla v12.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a774 // ummla v20.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
+ "bge 95b\n"
+ "96:" // Height 3: Multiply loop: Skip odd blocks
+ "cbz x27, 101f\n"
+ "tbz x27, #2, 98f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "tbz x27, #1, 97f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "ld1 { v3.h }[2], [x24], #0x2\n"
+ "tbz x27, #0, 100f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "ld1 { v3.b }[6], [x24]\n"
+ "b 100f\n"
+ "97:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 100f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "ld1 { v3.b }[4], [x24]\n"
+ "b 100f\n"
+ "98:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 99f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "tbz x27, #0, 100f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "b 100f\n"
+ "99:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "100:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q29, [x10, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v25.2d\n"
+ ".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e9da78c // ummla v12.4s, v28.16b, v29.16b\n"
+ ".inst 0x6e9da774 // ummla v20.4s, v27.16b, v29.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
+ "101:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 89b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "cmp x11, #0x10\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "bge 110f\n"
+ "tbz x11, #3, 105f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 103f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 102f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "b 109f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 109f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "b 109f\n"
+ "103:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 104f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "b 109f\n"
+ "104:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 109f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "b 109f\n"
+ "105:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 107f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 106f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "b 109f\n"
+ "106:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 109f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "b 109f\n"
+ "107:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 108f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "b 109f\n"
+ "108:" // Height 3: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "109:" // Height 3: Partial direct writeback: Done
+ "b 111f\n"
+ "110:" // Height 3: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "111:" // Height 3: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 76b\n"
+ "b 224f\n"
+ "112:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "113:" // Height 4: Column loop
+ "tbz %x[flags], #0, 124f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x22, x23, x20, LSL #2\n"
+ "bge 122f\n"
+ "tbz x11, #3, 117f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 115f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 114f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x11, #0, 121f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "b 121f\n"
+ "114:" // Height 4: Partial accumulate: partial_1_12
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 121f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "b 121f\n"
+ "115:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x11, #1, 116f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x11, #0, 121f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "b 121f\n"
+ "116:" // Height 4: Partial accumulate: partial_1_8
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 121f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "b 121f\n"
+ "117:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x11, #2, 119f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 118f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x11, #0, 121f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "b 121f\n"
+ "118:" // Height 4: Partial accumulate: partial_1_4
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 121f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "b 121f\n"
+ "119:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x11, #1, 120f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x11, #0, 121f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "b 121f\n"
+ "120:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "121:" // Height 4: Partial accumulate: Done
+ "sub x9, x9, x25\n"
+ "b 123f\n"
+ "122:" // Height 4: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "123:" // Height 4: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 125f\n"
+ "124:" // Height 4: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "125:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "126:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 127f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 128f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "b 128f\n"
+ "127:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "128:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "blt 131f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "blt 130f\n"
+ "129:" // Height 4: Multiply loop: Main loop head
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
+ "sub x27, x27, #0x10\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ "add x23, x23, #0x10\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e9aa472 // ummla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e99a42e // ummla v14.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e99a476 // ummla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e9aa42b // ummla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa473 // ummla v19.4s, v3.16b, v26.16b\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x6e99a42f // ummla v15.4s, v1.16b, v25.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x6e99a477 // ummla v23.4s, v3.16b, v25.16b\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "bge 129b\n"
+ "130:" // Height 4: Multiply loop: Single iteration only
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n"
+ ".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x80]\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x90]\n"
+ ".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xa0]\n"
+ ".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xb0]\n"
+ ".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xc0]\n"
+ ".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xd0]\n"
+ ".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa472 // ummla v18.4s, v3.16b, v26.16b\n"
+ "ldr q26, [x10, #0xe0]\n"
+ ".inst 0x6e99a42e // ummla v14.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a476 // ummla v22.4s, v3.16b, v25.16b\n"
+ "ldr q25, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e9aa42b // ummla v11.4s, v1.16b, v26.16b\n"
+ ".inst 0x6e9aa473 // ummla v19.4s, v3.16b, v26.16b\n"
+ ".inst 0x6e99a42f // ummla v15.4s, v1.16b, v25.16b\n"
+ ".inst 0x6e99a477 // ummla v23.4s, v3.16b, v25.16b\n"
+ "131:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 138f\n"
+ "cmp x27, #0x8\n"
+ "blt 133f\n"
+ "132:" // Height 4: Multiply loop: Odd block loop
+ "ldr d26, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "trn1 v28.2d, v26.2d, v25.2d\n"
+ "sub x27, x27, #0x8\n"
+ "ldr d26, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "trn1 v27.2d, v26.2d, v25.2d\n"
+ "cmp x27, #0x8\n"
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ ".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e99a78c // ummla v12.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a774 // ummla v20.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
+ "bge 132b\n"
+ "133:" // Height 4: Multiply loop: Skip odd blocks
+ "cbz x27, 138f\n"
+ "tbz x27, #2, 135f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "tbz x27, #1, 134f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "ld1 { v3.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "tbz x27, #0, 137f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "ld1 { v3.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "b 137f\n"
+ "134:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 137f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "ld1 { v3.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "b 137f\n"
+ "135:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 136f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "tbz x27, #0, 137f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "b 137f\n"
+ "136:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x23, #0x0]\n"
+ "137:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x10, #0x10]\n"
+ "trn1 v28.2d, v1.2d, v2.2d\n"
+ "trn1 v27.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x20]\n"
+ ".inst 0x6e99a78c // ummla v12.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a774 // ummla v20.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x30]\n"
+ ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x40]\n"
+ ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x50]\n"
+ ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n"
+ "ldr q26, [x10, #0x60]\n"
+ ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n"
+ "ldr q25, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n"
+ ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n"
+ ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n"
+ ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n"
+ "138:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 126b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "bge 147f\n"
+ "tbz x11, #3, 142f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 140f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 139f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "b 146f\n"
+ "139:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 146f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "b 146f\n"
+ "140:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 141f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "b 146f\n"
+ "141:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 146f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "b 146f\n"
+ "142:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 144f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 143f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "b 146f\n"
+ "143:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 146f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "b 146f\n"
+ "144:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 145f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "b 146f\n"
+ "145:" // Height 4: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "146:" // Height 4: Partial direct writeback: Done
+ "b 148f\n"
+ "147:" // Height 4: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q15, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q22, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "148:" // Height 4: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 113b\n"
+ "b 224f\n"
+ "149:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "150:" // Height 5: Column loop
+ "tbz %x[flags], #0, 161f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x21, x22, x20, LSL #2\n"
+ "bge 159f\n"
+ "tbz x11, #3, 154f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 152f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 151f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d6, [x21], #0x8\n"
+ "tbz x11, #0, 158f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v6.s }[2], [x21]\n"
+ "b 158f\n"
+ "151:" // Height 5: Partial accumulate: partial_1_12
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 158f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s6, [x21, #0x0]\n"
+ "b 158f\n"
+ "152:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x11, #1, 153f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x11, #0, 158f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "b 158f\n"
+ "153:" // Height 5: Partial accumulate: partial_1_8
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 158f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "b 158f\n"
+ "154:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x11, #2, 156f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 155f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x11, #0, 158f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "b 158f\n"
+ "155:" // Height 5: Partial accumulate: partial_1_4
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 158f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "b 158f\n"
+ "156:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x11, #1, 157f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "tbz x11, #0, 158f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "b 158f\n"
+ "157:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "158:" // Height 5: Partial accumulate: Done
+ "sub x9, x9, x25\n"
+ "b 160f\n"
+ "159:" // Height 5: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q25, [x21, #0x0]\n"
+ "ldr q26, [x21, #0x10]\n"
+ "ldr q27, [x21, #0x20]\n"
+ "ldr q6, [x21, #0x30]\n"
+ "160:" // Height 5: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 162f\n"
+ "161:" // Height 5: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "162:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "163:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 164f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 165f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "b 165f\n"
+ "164:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "165:" // Height 5: input setup done
+ "cmp x27, #0x10\n"
+ "blt 168f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "blt 167f\n"
+ "166:" // Height 5: Multiply loop: Main loop head
+ "trn1 v6.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a4c8 // ummla v8.4s, v6.16b, v7.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "sub x27, x27, #0x10\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e80a4cc // ummla v12.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a454 // ummla v20.4s, v2.16b, v0.16b\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6e80a49c // ummla v28.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e87a4c9 // ummla v9.4s, v6.16b, v7.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x40]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e80a4cd // ummla v13.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a455 // ummla v21.4s, v2.16b, v0.16b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e87a4ca // ummla v10.4s, v6.16b, v7.16b\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e80a4ce // ummla v14.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a456 // ummla v22.4s, v2.16b, v0.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e80a49e // ummla v30.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x6e87a4cb // ummla v11.4s, v6.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e80a4cf // ummla v15.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a457 // ummla v23.4s, v2.16b, v0.16b\n"
+ "ldr q2, [x25, #0x0]\n"
+ ".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x90]\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n"
+ "ldr q6, [x10, #0xa0]\n"
+ ".inst 0x6e80a42c // ummla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a474 // ummla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4bc // ummla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xb0]\n"
+ ".inst 0x6e86a429 // ummla v9.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a471 // ummla v17.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4b9 // ummla v25.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xc0]\n"
+ ".inst 0x6e80a42d // ummla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a475 // ummla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4bd // ummla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xd0]\n"
+ ".inst 0x6e86a42a // ummla v10.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a472 // ummla v18.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4ba // ummla v26.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xe0]\n"
+ ".inst 0x6e80a42e // ummla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a476 // ummla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4be // ummla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e86a42b // ummla v11.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a473 // ummla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bb // ummla v27.4s, v5.16b, v6.16b\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x6e80a42f // ummla v15.4s, v1.16b, v0.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x6e80a477 // ummla v23.4s, v3.16b, v0.16b\n"
+ "ldr q3, [x24, #0x0]\n"
+ ".inst 0x6e80a4bf // ummla v31.4s, v5.16b, v0.16b\n"
+ "ldr q5, [x22, #0x0]\n"
+ "bge 166b\n"
+ "167:" // Height 5: Multiply loop: Single iteration only
+ "trn1 v6.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a4c8 // ummla v8.4s, v6.16b, v7.16b\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v4.2d, v5.2d, v0.2d\n"
+ "trn2 v5.2d, v5.2d, v0.2d\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e80a4cc // ummla v12.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a454 // ummla v20.4s, v2.16b, v0.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e80a49c // ummla v28.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e87a4c9 // ummla v9.4s, v6.16b, v7.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x40]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e80a4cd // ummla v13.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a455 // ummla v21.4s, v2.16b, v0.16b\n"
+ "add x22, x22, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e87a4ca // ummla v10.4s, v6.16b, v7.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e80a4ce // ummla v14.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a456 // ummla v22.4s, v2.16b, v0.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e80a49e // ummla v30.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x6e87a4cb // ummla v11.4s, v6.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x6e80a4cf // ummla v15.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e80a457 // ummla v23.4s, v2.16b, v0.16b\n"
+ ".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n"
+ "ldr q2, [x10, #0x90]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n"
+ "ldr q0, [x10, #0xa0]\n"
+ ".inst 0x6e82a42c // ummla v12.4s, v1.16b, v2.16b\n"
+ ".inst 0x6e82a474 // ummla v20.4s, v3.16b, v2.16b\n"
+ ".inst 0x6e82a4bc // ummla v28.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xb0]\n"
+ ".inst 0x6e80a429 // ummla v9.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a471 // ummla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4b9 // ummla v25.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xc0]\n"
+ ".inst 0x6e82a42d // ummla v13.4s, v1.16b, v2.16b\n"
+ ".inst 0x6e82a475 // ummla v21.4s, v3.16b, v2.16b\n"
+ ".inst 0x6e82a4bd // ummla v29.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xd0]\n"
+ ".inst 0x6e80a42a // ummla v10.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a472 // ummla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4ba // ummla v26.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xe0]\n"
+ ".inst 0x6e82a42e // ummla v14.4s, v1.16b, v2.16b\n"
+ ".inst 0x6e82a476 // ummla v22.4s, v3.16b, v2.16b\n"
+ ".inst 0x6e82a4be // ummla v30.4s, v5.16b, v2.16b\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e80a42b // ummla v11.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a473 // ummla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4bb // ummla v27.4s, v5.16b, v0.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n"
+ "168:" // Height 5: Multiply loop: Main loop skip
+ "cbz x27, 175f\n"
+ "cmp x27, #0x8\n"
+ "blt 170f\n"
+ "169:" // Height 5: Multiply loop: Odd block loop
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
+ "sub x27, x27, #0x8\n"
+ "ldr d0, [x22], #0x8\n"
+ "ldr q1, [x10, #0x0]\n"
+ "trn1 v2.2d, v0.2d, v2.2d\n"
+ ".inst 0x6e81a488 // ummla v8.4s, v4.16b, v1.16b\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e81a470 // ummla v16.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x6e80a48c // ummla v12.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a474 // ummla v20.4s, v3.16b, v0.16b\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6e80a45c // ummla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e81a489 // ummla v9.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e81a471 // ummla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x40]\n"
+ ".inst 0x6e80a48d // ummla v13.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a475 // ummla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45d // ummla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e81a48a // ummla v10.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e81a472 // ummla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45a // ummla v26.4s, v2.16b, v1.16b\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e80a48e // ummla v14.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a476 // ummla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45e // ummla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ ".inst 0x6e86a48b // ummla v11.4s, v4.16b, v6.16b\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e86a473 // ummla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a45b // ummla v27.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e80a48f // ummla v15.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a477 // ummla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45f // ummla v31.4s, v2.16b, v0.16b\n"
+ "bge 169b\n"
+ "170:" // Height 5: Multiply loop: Skip odd blocks
+ "cbz x27, 175f\n"
+ "tbz x27, #2, 172f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x22], #0x4\n"
+ "tbz x27, #1, 171f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "ld1 { v3.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x22], #0x2\n"
+ "tbz x27, #0, 174f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "ld1 { v3.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v5.b }[6], [x22]\n"
+ "b 174f\n"
+ "171:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 174f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "ld1 { v3.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v5.b }[4], [x22]\n"
+ "b 174f\n"
+ "172:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 173f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h5, [x22], #0x2\n"
+ "tbz x27, #0, 174f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v5.b }[2], [x22]\n"
+ "b 174f\n"
+ "173:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x23, #0x0]\n"
+ "ldr b5, [x22, #0x0]\n"
+ "174:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x10, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ "trn1 v2.2d, v5.2d, v0.2d\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x6e86a4e8 // ummla v8.4s, v7.16b, v6.16b\n"
+ ".inst 0x6e86a470 // ummla v16.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a458 // ummla v24.4s, v2.16b, v6.16b\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x6e81a4ec // ummla v12.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e81a474 // ummla v20.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e80a4e9 // ummla v9.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e80a471 // ummla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a459 // ummla v25.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x40]\n"
+ ".inst 0x6e81a4ed // ummla v13.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e81a475 // ummla v21.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45d // ummla v29.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e80a4ea // ummla v10.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e80a472 // ummla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45a // ummla v26.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x60]\n"
+ ".inst 0x6e81a4ee // ummla v14.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e81a476 // ummla v22.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45e // ummla v30.4s, v2.16b, v1.16b\n"
+ "ldr q6, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e80a473 // ummla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45b // ummla v27.4s, v2.16b, v0.16b\n"
+ ".inst 0x6e86a4ef // ummla v15.4s, v7.16b, v6.16b\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a45f // ummla v31.4s, v2.16b, v6.16b\n"
+ "175:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 163b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "cmp x11, #0x10\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "bge 184f\n"
+ "tbz x11, #3, 179f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 177f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 176f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "b 183f\n"
+ "176:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 183f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "b 183f\n"
+ "177:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 178f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "b 183f\n"
+ "178:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 183f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "b 183f\n"
+ "179:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 181f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 180f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "b 183f\n"
+ "180:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 183f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "b 183f\n"
+ "181:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 182f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "b 183f\n"
+ "182:" // Height 5: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "183:" // Height 5: Partial direct writeback: Done
+ "b 185f\n"
+ "184:" // Height 5: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q15, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q22, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "185:" // Height 5: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 150b\n"
+ "b 224f\n"
+ "186:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "187:" // Height 6: Column loop
+ "tbz %x[flags], #0, 198f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "add x20, x21, x20, LSL #2\n"
+ "bge 196f\n"
+ "tbz x11, #3, 191f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v10.4s }, [x9], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v29.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 189f\n"
+ "ld1 { v11.4s }, [x9], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "ld1 { v30.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 188f\n"
+ "ldr d16, [x9], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "mov x25, #0x38\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d6, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x11, #0, 195f\n"
+ "ld1 { v16.s }[2], [x9]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v6.s }[2], [x21]\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 195f\n"
+ "188:" // Height 6: Partial accumulate: partial_1_12
+ "mov x25, #0x30\n"
+ "tbz x11, #0, 195f\n"
+ "ldr s16, [x9, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s6, [x21, #0x0]\n"
+ "ldr s31, [x20, #0x0]\n"
+ "b 195f\n"
+ "189:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x11, #1, 190f\n"
+ "ldr d11, [x9], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x25, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
+ "tbz x11, #0, 195f\n"
+ "ld1 { v11.s }[2], [x9]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "ld1 { v30.s }[2], [x20]\n"
+ "b 195f\n"
+ "190:" // Height 6: Partial accumulate: partial_1_8
+ "mov x25, #0x20\n"
+ "tbz x11, #0, 195f\n"
+ "ldr s11, [x9, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "ldr s30, [x20, #0x0]\n"
+ "b 195f\n"
+ "191:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x11, #2, 193f\n"
+ "ld1 { v9.4s }, [x9], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 192f\n"
+ "ldr d10, [x9], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "mov x25, #0x18\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "ldr d29, [x20], #0x8\n"
+ "tbz x11, #0, 195f\n"
+ "ld1 { v10.s }[2], [x9]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "ld1 { v29.s }[2], [x20]\n"
+ "b 195f\n"
+ "192:" // Height 6: Partial accumulate: partial_1_4
+ "mov x25, #0x10\n"
+ "tbz x11, #0, 195f\n"
+ "ldr s10, [x9, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "ldr s29, [x20, #0x0]\n"
+ "b 195f\n"
+ "193:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x11, #1, 194f\n"
+ "ldr d9, [x9], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x25, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "tbz x11, #0, 195f\n"
+ "ld1 { v9.s }[2], [x9]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v28.s }[2], [x20]\n"
+ "b 195f\n"
+ "194:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s9, [x9, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "mov x25, #0x0\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "ldr s28, [x20, #0x0]\n"
+ "195:" // Height 6: Partial accumulate: Done
+ "sub x9, x9, x25\n"
+ "b 197f\n"
+ "196:" // Height 6: full accumulate
+ "ldr q9, [x9, #0x0]\n"
+ "ldr q10, [x9, #0x10]\n"
+ "ldr q11, [x9, #0x20]\n"
+ "ldr q16, [x9, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q25, [x21, #0x0]\n"
+ "ldr q26, [x21, #0x10]\n"
+ "ldr q27, [x21, #0x20]\n"
+ "ldr q6, [x21, #0x30]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q29, [x20, #0x10]\n"
+ "ldr q30, [x20, #0x20]\n"
+ "ldr q31, [x20, #0x30]\n"
+ "197:" // Height 6: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 199f\n"
+ "198:" // Height 6: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "199:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "200:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 201f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 202f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
+ "b 202f\n"
+ "201:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
+ "202:" // Height 6: input setup done
+ "cmp x27, #0x10\n"
+ "blt 205f\n"
+ "ldr q1, [x26, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "cmp x27, #0x20\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "ldr q7, [x10, #0x0]\n"
+ "blt 204f\n"
+ "203:" // Height 6: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "sub x27, x27, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x40]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ "cmp x27, #0x20\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ "ldr q2, [x25, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
+ "ldr q0, [x10, #0x90]\n"
+ "ldr q4, [x23, #0x0]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n"
+ "ldr q6, [x10, #0xa0]\n"
+ ".inst 0x6e80a42c // ummla v12.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a474 // ummla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4bc // ummla v28.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xb0]\n"
+ ".inst 0x6e86a429 // ummla v9.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a471 // ummla v17.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4b9 // ummla v25.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xc0]\n"
+ ".inst 0x6e80a42d // ummla v13.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a475 // ummla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4bd // ummla v29.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xd0]\n"
+ ".inst 0x6e86a42a // ummla v10.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a472 // ummla v18.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4ba // ummla v26.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x10, #0xe0]\n"
+ ".inst 0x6e80a42e // ummla v14.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a476 // ummla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4be // ummla v30.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e86a42b // ummla v11.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a473 // ummla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bb // ummla v27.4s, v5.16b, v6.16b\n"
+ "ldr q7, [x10, #0x0]\n"
+ ".inst 0x6e80a42f // ummla v15.4s, v1.16b, v0.16b\n"
+ "ldr q1, [x26, #0x0]\n"
+ ".inst 0x6e80a477 // ummla v23.4s, v3.16b, v0.16b\n"
+ "ldr q3, [x24, #0x0]\n"
+ ".inst 0x6e80a4bf // ummla v31.4s, v5.16b, v0.16b\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "bge 203b\n"
+ "204:" // Height 6: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "add x26, x26, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x40]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ "add x21, x21, #0x10\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x10, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
+ "ldr q2, [x10, #0x90]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n"
+ "ldr q0, [x10, #0xa0]\n"
+ ".inst 0x6e82a42c // ummla v12.4s, v1.16b, v2.16b\n"
+ ".inst 0x6e82a474 // ummla v20.4s, v3.16b, v2.16b\n"
+ ".inst 0x6e82a4bc // ummla v28.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xb0]\n"
+ ".inst 0x6e80a429 // ummla v9.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a471 // ummla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4b9 // ummla v25.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xc0]\n"
+ ".inst 0x6e82a42d // ummla v13.4s, v1.16b, v2.16b\n"
+ ".inst 0x6e82a475 // ummla v21.4s, v3.16b, v2.16b\n"
+ ".inst 0x6e82a4bd // ummla v29.4s, v5.16b, v2.16b\n"
+ "ldr q2, [x10, #0xd0]\n"
+ ".inst 0x6e80a42a // ummla v10.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a472 // ummla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4ba // ummla v26.4s, v5.16b, v0.16b\n"
+ "ldr q0, [x10, #0xe0]\n"
+ ".inst 0x6e82a42e // ummla v14.4s, v1.16b, v2.16b\n"
+ ".inst 0x6e82a476 // ummla v22.4s, v3.16b, v2.16b\n"
+ ".inst 0x6e82a4be // ummla v30.4s, v5.16b, v2.16b\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e80a42b // ummla v11.4s, v1.16b, v0.16b\n"
+ ".inst 0x6e80a473 // ummla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a4bb // ummla v27.4s, v5.16b, v0.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n"
+ "205:" // Height 6: Multiply loop: Main loop skip
+ "cbz x27, 212f\n"
+ "cmp x27, #0x8\n"
+ "blt 207f\n"
+ "206:" // Height 6: Multiply loop: Odd block loop
+ "ldr d1, [x26], #0x8\n"
+ "ldr d0, [x25], #0x8\n"
+ "trn1 v4.2d, v1.2d, v0.2d\n"
+ "sub x27, x27, #0x8\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d0, [x23], #0x8\n"
+ "trn1 v3.2d, v1.2d, v0.2d\n"
+ "cmp x27, #0x8\n"
+ "ldr d1, [x22], #0x8\n"
+ "ldr d0, [x21], #0x8\n"
+ "trn1 v2.2d, v1.2d, v0.2d\n"
+ "ldr q1, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ ".inst 0x6e81a488 // ummla v8.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e81a470 // ummla v16.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x20]\n"
+ ".inst 0x6e80a48c // ummla v12.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a474 // ummla v20.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45c // ummla v28.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x30]\n"
+ ".inst 0x6e81a489 // ummla v9.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e81a471 // ummla v17.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x40]\n"
+ ".inst 0x6e80a48d // ummla v13.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a475 // ummla v21.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45d // ummla v29.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x50]\n"
+ ".inst 0x6e81a48a // ummla v10.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e81a472 // ummla v18.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45a // ummla v26.4s, v2.16b, v1.16b\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e80a48e // ummla v14.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a476 // ummla v22.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45e // ummla v30.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e86a48b // ummla v11.4s, v4.16b, v6.16b\n"
+ ".inst 0x6e86a473 // ummla v19.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a45b // ummla v27.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e80a48f // ummla v15.4s, v4.16b, v0.16b\n"
+ ".inst 0x6e80a477 // ummla v23.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45f // ummla v31.4s, v2.16b, v0.16b\n"
+ "bge 206b\n"
+ "207:" // Height 6: Multiply loop: Skip odd blocks
+ "cbz x27, 212f\n"
+ "tbz x27, #2, 209f\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x22], #0x4\n"
+ "ldr s6, [x21], #0x4\n"
+ "tbz x27, #1, 208f\n"
+ "ld1 { v1.h }[2], [x26], #0x2\n"
+ "ld1 { v2.h }[2], [x25], #0x2\n"
+ "ld1 { v3.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x22], #0x2\n"
+ "ld1 { v6.h }[2], [x21], #0x2\n"
+ "tbz x27, #0, 211f\n"
+ "ld1 { v1.b }[6], [x26]\n"
+ "ld1 { v2.b }[6], [x25]\n"
+ "ld1 { v3.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v5.b }[6], [x22]\n"
+ "ld1 { v6.b }[6], [x21]\n"
+ "b 211f\n"
+ "208:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 211f\n"
+ "ld1 { v1.b }[4], [x26]\n"
+ "ld1 { v2.b }[4], [x25]\n"
+ "ld1 { v3.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v5.b }[4], [x22]\n"
+ "ld1 { v6.b }[4], [x21]\n"
+ "b 211f\n"
+ "209:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 210f\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x25], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h5, [x22], #0x2\n"
+ "ldr h6, [x21], #0x2\n"
+ "tbz x27, #0, 211f\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x25]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v5.b }[2], [x22]\n"
+ "ld1 { v6.b }[2], [x21]\n"
+ "b 211f\n"
+ "210:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x25, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x23, #0x0]\n"
+ "ldr b5, [x22, #0x0]\n"
+ "ldr b6, [x21, #0x0]\n"
+ "211:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q0, [x10, #0x0]\n"
+ "trn1 v7.2d, v1.2d, v2.2d\n"
+ "trn1 v3.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e80a4e8 // ummla v8.4s, v7.16b, v0.16b\n"
+ "trn1 v2.2d, v5.2d, v6.2d\n"
+ "ldr q1, [x10, #0x10]\n"
+ ".inst 0x6e80a470 // ummla v16.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a458 // ummla v24.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x20]\n"
+ ".inst 0x6e81a4ec // ummla v12.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e81a474 // ummla v20.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x30]\n"
+ ".inst 0x6e80a4e9 // ummla v9.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e80a471 // ummla v17.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a459 // ummla v25.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x40]\n"
+ ".inst 0x6e81a4ed // ummla v13.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e81a475 // ummla v21.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45d // ummla v29.4s, v2.16b, v1.16b\n"
+ "ldr q1, [x10, #0x50]\n"
+ ".inst 0x6e80a4ea // ummla v10.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e80a472 // ummla v18.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45a // ummla v26.4s, v2.16b, v0.16b\n"
+ "ldr q0, [x10, #0x60]\n"
+ ".inst 0x6e81a4ee // ummla v14.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e81a476 // ummla v22.4s, v3.16b, v1.16b\n"
+ ".inst 0x6e81a45e // ummla v30.4s, v2.16b, v1.16b\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e80a473 // ummla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x6e80a45b // ummla v27.4s, v2.16b, v0.16b\n"
+ ".inst 0x6e86a4ef // ummla v15.4s, v7.16b, v6.16b\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a45f // ummla v31.4s, v2.16b, v6.16b\n"
+ "212:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 200b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x20, x21, x20, LSL #2\n"
+ "cmp x11, #0x10\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "bge 221f\n"
+ "tbz x11, #3, 216f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 214f\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 213f\n"
+ "str d14, [x9], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "b 220f\n"
+ "213:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 220f\n"
+ "str s14, [x9, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "b 220f\n"
+ "214:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 215f\n"
+ "str d13, [x9], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "b 220f\n"
+ "215:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 220f\n"
+ "str s13, [x9, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "b 220f\n"
+ "216:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 218f\n"
+ "st1 { v7.4s }, [x9], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 217f\n"
+ "str d12, [x9], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "b 220f\n"
+ "217:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 220f\n"
+ "str s12, [x9, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "b 220f\n"
+ "218:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 219f\n"
+ "str d7, [x9], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v7.s }[2], [x9]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "b 220f\n"
+ "219:" // Height 6: Partial direct writeback: partial_1_0
+ "str s7, [x9, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "220:" // Height 6: Partial direct writeback: Done
+ "b 222f\n"
+ "221:" // Height 6: Full writeback
+ "str q7, [x9, #0x0]\n"
+ "str q12, [x9, #0x10]\n"
+ "str q13, [x9, #0x20]\n"
+ "str q14, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q15, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q22, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q23, [x21, #0x0]\n"
+ "str q28, [x21, #0x10]\n"
+ "str q29, [x21, #0x20]\n"
+ "str q30, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "222:" // Height 6: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 187b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 224f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 223f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "223:" // Update direct input
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "224:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
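
Note on the hunk above: each ".inst 0x6e8..a... // ummla vD.4s, vA.16b, vB.16b" line is a raw-encoded UMMLA instruction, which multiplies a 2x8 block of unsigned 8-bit values (two A-panel rows interleaved by the preceding trn1/trn2 pairs) by an 8x2 block from the B panel, accumulating a 2x2 int32 tile into vD; the uzp1/uzp2 sequence before writeback de-interleaves those tiles back into plain rows. A minimal sketch of one such step with the ACLE intrinsic, assuming a toolchain targeting armv8.6-a (or armv8.2-a+i8mm); the function name is illustrative:

    #include <arm_neon.h>

    // One UMMLA step: acc, viewed as a row-major 2x2 matrix of u32, is
    // incremented by the product of a (2x8 of u8) and b (8x2 of u8).
    static inline uint32x4_t ummla_step(uint32x4_t acc, uint8x16_t a, uint8x16_t b)
    {
        return vmmlaq_u32(acc, a, b);
    }
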
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
index 2fea5ad2e7..25c5bf1b44 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,51 +22,74 @@
* SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
-#include "../bfloat.hpp"
#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, \
+ float *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void a64_interleaved_bf16fp32_dot_8x12( ARGLIST );
-class cls_a64_interleaved_bf16fp32_dot_8x12 {
+class cls_a64_interleaved_bf16fp32_dot_8x12
+{
public:
typedef bfloat16 operand_type;
typedef float result_type;
- typedef void (*kern_type)(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
- static unsigned int out_width()
+ static constexpr unsigned int out_height()
{
- return 12;
+ return 8;
}
- static unsigned int out_height()
+ static unsigned int out_width()
{
- return 8;
+ return 12;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 2;
}
- // Use the standard fixed size transforms.
+
StdTransformsFixed<operand_type, result_type, 8, 12, 2> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 2, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=a64_interleaved_bf16fp32_dot_8x12;
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 15.93, 4.16, 7.19 };
+ case CPUModel::V1:
+ return { 20.88, 5.10, 6.57 };
+ case CPUModel::A510:
+ return { 7.77, 3.69, 3.02 };
+ }
+ }
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_interleaved_bf16fp32_dot_8x12;
cls_a64_interleaved_bf16fp32_dot_8x12(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __aarch64__
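
The ARGLIST macro introduced in this header generates both the kernel declaration and the kern_type function-pointer typedef from a single token list, so the two signatures cannot drift apart, and it is #undef'ed at the end of the header to avoid leaking into other translation units. A minimal sketch of the same idiom with a hypothetical kernel name:

    #define ARGLIST const float *, const float *, float *, int, int, int
    void my_kernel( ARGLIST );            // declaration and...
    typedef void (*kern_type)( ARGLIST ); // ...pointer type share one signature
    #undef ARGLIST
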
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
index 92149a5579..5684f464b6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,305 +23,234 @@
*/
#ifdef __aarch64__
+#include <cstddef>
#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const bfloat16 *a_ptr = Apanel;
- float *c_ptr = Cpanel;
+void a64_interleaved_bf16fp32_dot_8x12(
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
- K /= 2;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const bfloat16 *a_ptr0 = a_ptr;
- const bfloat16 *b_ptr = Bpanel;
+ ka.K = (K/2) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "movi v8.4s, #0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v9.4s, #0\n"
- "ldr q4, [%[b_ptr]]\n"
- "movi v10.4s, #0\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- "movi v11.4s, #0\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- "movi v12.4s, #0\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- "movi v13.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "movi v14.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x30\n"
- "movi v15.4s, #0\n"
- "movi v16.4s, #0\n"
- "movi v17.4s, #0\n"
- "movi v18.4s, #0\n"
- "movi v19.4s, #0\n"
- "movi v20.4s, #0\n"
- "movi v21.4s, #0\n"
- "movi v22.4s, #0\n"
- "movi v23.4s, #0\n"
- "movi v24.4s, #0\n"
- "movi v25.4s, #0\n"
- "movi v26.4s, #0\n"
- "movi v27.4s, #0\n"
- "movi v28.4s, #0\n"
- "movi v29.4s, #0\n"
- "movi v30.4s, #0\n"
- "movi v31.4s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
- "ldr q6, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
- ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f897 // bfdot v23.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x4f40f0ac // bfdot v12.4s, v5.8h, v0.h[0]\n"
- ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
- ".inst 0x4f40f8ae // bfdot v14.4s, v5.8h, v0.h[2]\n"
- ".inst 0x4f60f8af // bfdot v15.4s, v5.8h, v0.h[3]\n"
- ".inst 0x4f41f0b8 // bfdot v24.4s, v5.8h, v1.h[0]\n"
- ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
- ".inst 0x4f41f8ba // bfdot v26.4s, v5.8h, v1.h[2]\n"
- ".inst 0x4f61f8bb // bfdot v27.4s, v5.8h, v1.h[3]\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x4f40f0d0 // bfdot v16.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f60f0d1 // bfdot v17.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f40f8d2 // bfdot v18.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x4f41f0dc // bfdot v28.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f61f0dd // bfdot v29.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f41f8de // bfdot v30.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n"
- "add %[b_ptr], %[b_ptr], #0x60\n"
- ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n"
- ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
- ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n"
- ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n"
- ".inst 0x4f63f897 // bfdot v23.4s, v4.8h, v3.h[3]\n"
- "ldr q4, [%[b_ptr], #-0x30]\n"
- ".inst 0x4f42f0ac // bfdot v12.4s, v5.8h, v2.h[0]\n"
- ".inst 0x4f62f0ad // bfdot v13.4s, v5.8h, v2.h[1]\n"
- ".inst 0x4f42f8ae // bfdot v14.4s, v5.8h, v2.h[2]\n"
- ".inst 0x4f62f8af // bfdot v15.4s, v5.8h, v2.h[3]\n"
- ".inst 0x4f43f0b8 // bfdot v24.4s, v5.8h, v3.h[0]\n"
- ".inst 0x4f63f0b9 // bfdot v25.4s, v5.8h, v3.h[1]\n"
- ".inst 0x4f43f8ba // bfdot v26.4s, v5.8h, v3.h[2]\n"
- ".inst 0x4f63f8bb // bfdot v27.4s, v5.8h, v3.h[3]\n"
- "ldr q5, [%[b_ptr], #-0x20]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f62f0d1 // bfdot v17.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f62f8d3 // bfdot v19.4s, v6.8h, v2.h[3]\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x4f43f0dc // bfdot v28.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
- "ldr q6, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
- ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f897 // bfdot v23.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x4f40f0ac // bfdot v12.4s, v5.8h, v0.h[0]\n"
- ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
- ".inst 0x4f40f8ae // bfdot v14.4s, v5.8h, v0.h[2]\n"
- ".inst 0x4f60f8af // bfdot v15.4s, v5.8h, v0.h[3]\n"
- ".inst 0x4f41f0b8 // bfdot v24.4s, v5.8h, v1.h[0]\n"
- ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
- ".inst 0x4f41f8ba // bfdot v26.4s, v5.8h, v1.h[2]\n"
- ".inst 0x4f61f8bb // bfdot v27.4s, v5.8h, v1.h[3]\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x4f40f0d0 // bfdot v16.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f60f0d1 // bfdot v17.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f40f8d2 // bfdot v18.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x4f41f0dc // bfdot v28.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f61f0dd // bfdot v29.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f41f8de // bfdot v30.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n"
- "add %[b_ptr], %[b_ptr], #0x60\n"
- ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n"
- ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
- ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n"
- ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n"
- ".inst 0x4f63f897 // bfdot v23.4s, v4.8h, v3.h[3]\n"
- "ldr q4, [%[b_ptr], #-0x30]\n"
- ".inst 0x4f42f0ac // bfdot v12.4s, v5.8h, v2.h[0]\n"
- ".inst 0x4f62f0ad // bfdot v13.4s, v5.8h, v2.h[1]\n"
- ".inst 0x4f42f8ae // bfdot v14.4s, v5.8h, v2.h[2]\n"
- ".inst 0x4f62f8af // bfdot v15.4s, v5.8h, v2.h[3]\n"
- ".inst 0x4f43f0b8 // bfdot v24.4s, v5.8h, v3.h[0]\n"
- ".inst 0x4f63f0b9 // bfdot v25.4s, v5.8h, v3.h[1]\n"
- ".inst 0x4f43f8ba // bfdot v26.4s, v5.8h, v3.h[2]\n"
- ".inst 0x4f63f8bb // bfdot v27.4s, v5.8h, v3.h[3]\n"
- "ldr q5, [%[b_ptr], #-0x20]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f62f0d1 // bfdot v17.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f62f8d3 // bfdot v19.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f43f0dc // bfdot v28.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
- "ldr q6, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
- "str q8, [%[c_ptr]]\n"
- ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f897 // bfdot v23.4s, v4.8h, v1.h[3]\n"
- ".inst 0x4f40f0ac // bfdot v12.4s, v5.8h, v0.h[0]\n"
- ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
- ".inst 0x4f40f8ae // bfdot v14.4s, v5.8h, v0.h[2]\n"
- ".inst 0x4f60f8af // bfdot v15.4s, v5.8h, v0.h[3]\n"
- "str q12, [%[c_ptr], #0x10]\n"
- ".inst 0x4f41f0b8 // bfdot v24.4s, v5.8h, v1.h[0]\n"
- ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
- ".inst 0x4f41f8ba // bfdot v26.4s, v5.8h, v1.h[2]\n"
- ".inst 0x4f61f8bb // bfdot v27.4s, v5.8h, v1.h[3]\n"
- ".inst 0x4f40f0d0 // bfdot v16.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f60f0d1 // bfdot v17.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f40f8d2 // bfdot v18.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
- "str q16, [%[c_ptr], #0x20]\n"
- ".inst 0x4f41f0dc // bfdot v28.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f61f0dd // bfdot v29.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f41f8de // bfdot v30.4s, v6.8h, v1.h[2]\n"
- "str q9, [%[c_ptr], #0x30]\n"
- ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
- "ldr q6, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
- "add %[b_ptr], %[b_ptr], #0x30\n"
- ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
- ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f897 // bfdot v23.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr], #-0x30]\n"
- ".inst 0x4f40f0ac // bfdot v12.4s, v5.8h, v0.h[0]\n"
- ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
- ".inst 0x4f40f8ae // bfdot v14.4s, v5.8h, v0.h[2]\n"
- ".inst 0x4f60f8af // bfdot v15.4s, v5.8h, v0.h[3]\n"
- ".inst 0x4f41f0b8 // bfdot v24.4s, v5.8h, v1.h[0]\n"
- ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
- ".inst 0x4f41f8ba // bfdot v26.4s, v5.8h, v1.h[2]\n"
- ".inst 0x4f61f8bb // bfdot v27.4s, v5.8h, v1.h[3]\n"
- "ldr q5, [%[b_ptr], #-0x20]\n"
- ".inst 0x4f40f0d0 // bfdot v16.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f60f0d1 // bfdot v17.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f40f8d2 // bfdot v18.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f41f0dc // bfdot v28.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f61f0dd // bfdot v29.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f41f8de // bfdot v30.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n"
- ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n"
- ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n"
- ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n"
- "str q8, [%[c_ptr]]\n"
- ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
- ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n"
- ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n"
- ".inst 0x4f63f897 // bfdot v23.4s, v4.8h, v3.h[3]\n"
- ".inst 0x4f42f0ac // bfdot v12.4s, v5.8h, v2.h[0]\n"
- ".inst 0x4f62f0ad // bfdot v13.4s, v5.8h, v2.h[1]\n"
- ".inst 0x4f42f8ae // bfdot v14.4s, v5.8h, v2.h[2]\n"
- ".inst 0x4f62f8af // bfdot v15.4s, v5.8h, v2.h[3]\n"
- "str q12, [%[c_ptr], #0x10]\n"
- ".inst 0x4f43f0b8 // bfdot v24.4s, v5.8h, v3.h[0]\n"
- ".inst 0x4f63f0b9 // bfdot v25.4s, v5.8h, v3.h[1]\n"
- ".inst 0x4f43f8ba // bfdot v26.4s, v5.8h, v3.h[2]\n"
- ".inst 0x4f63f8bb // bfdot v27.4s, v5.8h, v3.h[3]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f62f0d1 // bfdot v17.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f62f8d3 // bfdot v19.4s, v6.8h, v2.h[3]\n"
- "str q16, [%[c_ptr], #0x20]\n"
- ".inst 0x4f43f0dc // bfdot v28.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n"
- "str q9, [%[c_ptr], #0x30]\n"
- ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
- "4:\n"
- "str q13, [%[c_ptr], #0x40]\n"
- "str q17, [%[c_ptr], #0x50]\n"
- "str q10, [%[c_ptr], #0x60]\n"
- "str q14, [%[c_ptr], #0x70]\n"
- "str q18, [%[c_ptr], #0x80]\n"
- "str q11, [%[c_ptr], #0x90]\n"
- "str q15, [%[c_ptr], #0xa0]\n"
- "str q19, [%[c_ptr], #0xb0]\n"
- "str q20, [%[c_ptr], #0xc0]\n"
- "str q24, [%[c_ptr], #0xd0]\n"
- "str q28, [%[c_ptr], #0xe0]\n"
- "str q21, [%[c_ptr], #0xf0]\n"
- "str q25, [%[c_ptr], #0x100]\n"
- "str q29, [%[c_ptr], #0x110]\n"
- "str q22, [%[c_ptr], #0x120]\n"
- "str q26, [%[c_ptr], #0x130]\n"
- "str q30, [%[c_ptr], #0x140]\n"
- "str q23, [%[c_ptr], #0x150]\n"
- "str q27, [%[c_ptr], #0x160]\n"
- "str q31, [%[c_ptr], #0x170]\n"
- "add %[c_ptr], %[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x22, #0x10]\n"
+ "mov %x[Apanel], x21\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v8.16b, #0x0\n"
+ "ldr q6, [x22, #0x20]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "movi v9.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x40]\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ldr q3, [%x[Apanel], #0x20]\n"
+ "ldr q7, [%x[Apanel], #0x30]\n"
+ ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f08b // bfdot v11.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x4f60f891 // bfdot v17.4s, v4.8h, v0.h[3]\n"
+ ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x4f61f097 // bfdot v23.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89a // bfdot v26.4s, v4.8h, v1.h[2]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ ".inst 0x4f61f89d // bfdot v29.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [x22, #0x30]\n"
+ ".inst 0x4f40f0a9 // bfdot v9.4s, v5.8h, v0.h[0]\n"
+ ".inst 0x4f60f0ac // bfdot v12.4s, v5.8h, v0.h[1]\n"
+ ".inst 0x4f40f8af // bfdot v15.4s, v5.8h, v0.h[2]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x4f60f8b2 // bfdot v18.4s, v5.8h, v0.h[3]\n"
+ ".inst 0x4f41f0b5 // bfdot v21.4s, v5.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x22, #0x100]\n"
+ ".inst 0x4f61f0b8 // bfdot v24.4s, v5.8h, v1.h[1]\n"
+ ".inst 0x4f41f8bb // bfdot v27.4s, v5.8h, v1.h[2]\n"
+ "prfm pldl1keep, [x22, #0x140]\n"
+ ".inst 0x4f61f8be // bfdot v30.4s, v5.8h, v1.h[3]\n"
+ "ldr q5, [x22, #0x40]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f60f0cd // bfdot v13.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f40f8d0 // bfdot v16.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ ".inst 0x4f41f0d6 // bfdot v22.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f61f0d9 // bfdot v25.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
+ "ldr q2, [x22, #0x50]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "add x22, x22, #0x60\n"
+ ".inst 0x4f43f088 // bfdot v8.4s, v4.8h, v3.h[0]\n"
+ ".inst 0x4f63f08b // bfdot v11.4s, v4.8h, v3.h[1]\n"
+ ".inst 0x4f43f88e // bfdot v14.4s, v4.8h, v3.h[2]\n"
+ ".inst 0x4f63f891 // bfdot v17.4s, v4.8h, v3.h[3]\n"
+ ".inst 0x4f47f094 // bfdot v20.4s, v4.8h, v7.h[0]\n"
+ ".inst 0x4f67f097 // bfdot v23.4s, v4.8h, v7.h[1]\n"
+ ".inst 0x4f47f89a // bfdot v26.4s, v4.8h, v7.h[2]\n"
+ ".inst 0x4f67f89d // bfdot v29.4s, v4.8h, v7.h[3]\n"
+ "ldr q4, [x22, #0x0]\n"
+ ".inst 0x4f43f0a9 // bfdot v9.4s, v5.8h, v3.h[0]\n"
+ ".inst 0x4f63f0ac // bfdot v12.4s, v5.8h, v3.h[1]\n"
+ ".inst 0x4f43f8af // bfdot v15.4s, v5.8h, v3.h[2]\n"
+ ".inst 0x4f63f8b2 // bfdot v18.4s, v5.8h, v3.h[3]\n"
+ ".inst 0x4f47f0b5 // bfdot v21.4s, v5.8h, v7.h[0]\n"
+ ".inst 0x4f67f0b8 // bfdot v24.4s, v5.8h, v7.h[1]\n"
+ ".inst 0x4f47f8bb // bfdot v27.4s, v5.8h, v7.h[2]\n"
+ ".inst 0x4f67f8be // bfdot v30.4s, v5.8h, v7.h[3]\n"
+ "ldr q5, [x22, #0x10]\n"
+ ".inst 0x4f43f04a // bfdot v10.4s, v2.8h, v3.h[0]\n"
+ ".inst 0x4f63f04d // bfdot v13.4s, v2.8h, v3.h[1]\n"
+ ".inst 0x4f43f850 // bfdot v16.4s, v2.8h, v3.h[2]\n"
+ ".inst 0x4f63f853 // bfdot v19.4s, v2.8h, v3.h[3]\n"
+ ".inst 0x4f47f056 // bfdot v22.4s, v2.8h, v7.h[0]\n"
+ ".inst 0x4f67f059 // bfdot v25.4s, v2.8h, v7.h[1]\n"
+ ".inst 0x4f47f85c // bfdot v28.4s, v2.8h, v7.h[2]\n"
+ ".inst 0x4f67f85f // bfdot v31.4s, v2.8h, v7.h[3]\n"
+ "ldr q6, [x22, #0x20]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f08b // bfdot v11.4s, v4.8h, v0.h[1]\n"
+ "add x22, x22, #0x30\n"
+ ".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f891 // bfdot v17.4s, v4.8h, v0.h[3]\n"
+ ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f097 // bfdot v23.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89a // bfdot v26.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89d // bfdot v29.4s, v4.8h, v1.h[3]\n"
+ ".inst 0x4f40f0a9 // bfdot v9.4s, v5.8h, v0.h[0]\n"
+ ".inst 0x4f60f0ac // bfdot v12.4s, v5.8h, v0.h[1]\n"
+ ".inst 0x4f40f8af // bfdot v15.4s, v5.8h, v0.h[2]\n"
+ ".inst 0x4f60f8b2 // bfdot v18.4s, v5.8h, v0.h[3]\n"
+ ".inst 0x4f41f0b5 // bfdot v21.4s, v5.8h, v1.h[0]\n"
+ ".inst 0x4f61f0b8 // bfdot v24.4s, v5.8h, v1.h[1]\n"
+ ".inst 0x4f41f8bb // bfdot v27.4s, v5.8h, v1.h[2]\n"
+ ".inst 0x4f61f8be // bfdot v30.4s, v5.8h, v1.h[3]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f60f0cd // bfdot v13.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f40f8d0 // bfdot v16.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f41f0d6 // bfdot v22.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f61f0d9 // bfdot v25.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
+ "cbz x20, 5f\n"
+ "ldr q4, [%x[Apanel], #0x0]\n"
+ "ldr q3, [%x[Apanel], #0x10]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q1, [x22, #0x10]\n"
+ ".inst 0x4f44f048 // bfdot v8.4s, v2.8h, v4.h[0]\n"
+ "ldr q0, [x22, #0x20]\n"
+ ".inst 0x4f64f04b // bfdot v11.4s, v2.8h, v4.h[1]\n"
+ ".inst 0x4f44f84e // bfdot v14.4s, v2.8h, v4.h[2]\n"
+ ".inst 0x4f64f851 // bfdot v17.4s, v2.8h, v4.h[3]\n"
+ ".inst 0x4f43f054 // bfdot v20.4s, v2.8h, v3.h[0]\n"
+ "add x22, x22, #0x30\n"
+ ".inst 0x4f63f057 // bfdot v23.4s, v2.8h, v3.h[1]\n"
+ ".inst 0x4f43f85a // bfdot v26.4s, v2.8h, v3.h[2]\n"
+ ".inst 0x4f63f85d // bfdot v29.4s, v2.8h, v3.h[3]\n"
+ ".inst 0x4f44f029 // bfdot v9.4s, v1.8h, v4.h[0]\n"
+ ".inst 0x4f64f02c // bfdot v12.4s, v1.8h, v4.h[1]\n"
+ ".inst 0x4f44f82f // bfdot v15.4s, v1.8h, v4.h[2]\n"
+ ".inst 0x4f64f832 // bfdot v18.4s, v1.8h, v4.h[3]\n"
+ ".inst 0x4f43f035 // bfdot v21.4s, v1.8h, v3.h[0]\n"
+ ".inst 0x4f63f038 // bfdot v24.4s, v1.8h, v3.h[1]\n"
+ ".inst 0x4f43f83b // bfdot v27.4s, v1.8h, v3.h[2]\n"
+ ".inst 0x4f63f83e // bfdot v30.4s, v1.8h, v3.h[3]\n"
+ ".inst 0x4f44f00a // bfdot v10.4s, v0.8h, v4.h[0]\n"
+ ".inst 0x4f64f00d // bfdot v13.4s, v0.8h, v4.h[1]\n"
+ ".inst 0x4f44f810 // bfdot v16.4s, v0.8h, v4.h[2]\n"
+ ".inst 0x4f64f813 // bfdot v19.4s, v0.8h, v4.h[3]\n"
+ ".inst 0x4f43f016 // bfdot v22.4s, v0.8h, v3.h[0]\n"
+ ".inst 0x4f63f019 // bfdot v25.4s, v0.8h, v3.h[1]\n"
+ ".inst 0x4f43f81c // bfdot v28.4s, v0.8h, v3.h[2]\n"
+ ".inst 0x4f63f81f // bfdot v31.4s, v0.8h, v3.h[3]\n"
+ "5:" // multiply loop done
+ "subs x23, x23, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
+ );
}
} // namespace arm_gemm
-
#endif // __aarch64__
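
The rewrite above also changes how arguments reach the assembly: instead of binding each loop constant to its own asm operand, the constants are packed once into a small KernelArgs struct and reloaded inside the asm block with ldr plus an offsetof() immediate, which frees general-purpose registers for the multiply loop and lets a single asm block drive the whole height/width loop nest. A minimal, aarch64-only sketch of that pattern, assuming GCC/Clang extended asm (the struct layout mirrors the one above; the surrounding function is illustrative):

    #include <cstddef>

    struct KernelArgs {
        size_t K;
        const void *Bpanel;
        size_t bblocks;
    };

    void args_struct_demo(const void *Bpanel, size_t K, size_t bblocks)
    {
        KernelArgs ka{K, Bpanel, bblocks};
        unsigned long k_val;
        __asm__ __volatile__(
            "ldr %x[k], [%x[args], %[off_K]]\n" // fetch ka.K through the struct pointer
            : [k] "=r" (k_val)
            : [args] "r" (&ka), [off_K] "I" (offsetof(KernelArgs, K))
            : "memory");
        (void)k_val; // a real kernel would use this as its loop counter
    }
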
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp
new file mode 100644
index 0000000000..304fb64891
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include "../../bfloat.hpp"
+
+namespace arm_gemm {
+
+void a64_interleaved_bf16fp32_dot_8x12_x1(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+ const bfloat16 *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ K /= 2;
+ const long loops_count = (K / 2) - 1;
+ const long tails_count = K % 2;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const bfloat16 *a_ptr0 = a_ptr;
+ const bfloat16 *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ long loops = loops_count;
+ long tails = tails_count;
+
+ __asm __volatile (
+ "movi v8.4s, #0\n"
+ "ldr q0, [%[a_ptr]]\n"
+ "movi v9.4s, #0\n"
+ "ldr q2, [%[b_ptr]]\n"
+ "movi v10.4s, #0\n"
+ "ldr q3, [%[b_ptr], #0x10]\n"
+ "movi v11.4s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
+ "movi v12.4s, #0\n"
+ "add %[b_ptr], %[b_ptr], #0x30\n"
+ "movi v13.4s, #0\n"
+ "movi v14.4s, #0\n"
+ "movi v15.4s, #0\n"
+ "movi v16.4s, #0\n"
+ "movi v17.4s, #0\n"
+ "movi v18.4s, #0\n"
+ "movi v19.4s, #0\n"
+ "movi v20.4s, #0\n"
+ "movi v21.4s, #0\n"
+ "movi v22.4s, #0\n"
+ "movi v23.4s, #0\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q4, [%[b_ptr], #-0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ "ldr q1, [%[a_ptr], #-0x10]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr]]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #0x10]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr]]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [%[b_ptr], #0x20]\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q1, [%[a_ptr], #0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ "add %[b_ptr], %[b_ptr], #0x60\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr], #-0x30]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #-0x20]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr], #-0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[tails], 3f\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q4, [%[b_ptr], #-0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ "ldr q1, [%[a_ptr], #-0x10]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr]]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #0x10]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr]]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [%[b_ptr], #0x20]\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q1, [%[a_ptr], #0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ "add %[b_ptr], %[b_ptr], #0x60\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr], #-0x30]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #-0x20]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr], #-0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [%[b_ptr], #-0x10]\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q1, [%[a_ptr], #-0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ "str q8, [%[c_ptr]]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ "str q12, [%[c_ptr], #0x10]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "str q16, [%[c_ptr], #0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ "str q9, [%[c_ptr], #0x30]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "b 4f\n"
+ "3:\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q4, [%[b_ptr], #-0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ "ldr q1, [%[a_ptr], #-0x10]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ "add %[b_ptr], %[b_ptr], #0x30\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr], #-0x30]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #-0x20]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr], #-0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [%[b_ptr], #-0x10]\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q1, [%[a_ptr], #-0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ "str q8, [%[c_ptr]]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ "str q12, [%[c_ptr], #0x10]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "str q16, [%[c_ptr], #0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ "str q9, [%[c_ptr], #0x30]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "4:\n"
+ "str q13, [%[c_ptr], #0x40]\n"
+ "str q17, [%[c_ptr], #0x50]\n"
+ "str q10, [%[c_ptr], #0x60]\n"
+ "str q14, [%[c_ptr], #0x70]\n"
+ "str q18, [%[c_ptr], #0x80]\n"
+ "str q11, [%[c_ptr], #0x90]\n"
+ "str q15, [%[c_ptr], #0xa0]\n"
+ "str q19, [%[c_ptr], #0xb0]\n"
+ "str q20, [%[c_ptr], #0xc0]\n"
+ "str q24, [%[c_ptr], #0xd0]\n"
+ "str q28, [%[c_ptr], #0xe0]\n"
+ "str q21, [%[c_ptr], #0xf0]\n"
+ "str q25, [%[c_ptr], #0x100]\n"
+ "str q29, [%[c_ptr], #0x110]\n"
+ "str q22, [%[c_ptr], #0x120]\n"
+ "str q26, [%[c_ptr], #0x130]\n"
+ "str q30, [%[c_ptr], #0x140]\n"
+ "str q23, [%[c_ptr], #0x150]\n"
+ "str q27, [%[c_ptr], #0x160]\n"
+ "str q31, [%[c_ptr], #0x170]\n"
+ "add %[c_ptr], %[c_ptr], #0x180\n"
+ : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [loops] "+r" (loops), [tails] "+r" (tails)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
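
Both dot-product kernels above emit BFDOT as raw ".inst" words so that older assemblers can still build the file; the trailing comment on each line gives the intended mnemonic. Semantically, the indexed form ("bfdot v8.4s, v2.8h, v0.h[0]" in the comments) makes every f32 lane j of the destination accumulate the two-element dot product of the bf16 pair {a[2j], a[2j+1]} with the pair selected by the index from the second source. A sketch with the ACLE bf16 intrinsics, assuming a toolchain with +bf16 support; the function name is illustrative:

    #include <arm_neon.h>

    // Indexed BFDOT: each f32 lane of acc accumulates a two-element bf16 dot
    // product between a pair from 'a' and the pair selected by lane 0 of 'b'.
    static inline float32x4_t bfdot_pair0(float32x4_t acc, bfloat16x8_t a, bfloat16x8_t b)
    {
        return vbfdotq_laneq_f32(acc, a, b, 0);
    }
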
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
index b2c2407b28..66c2b92a34 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,51 +22,94 @@
* SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
-#include "../bfloat.hpp"
#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, \
+ float *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void a64_interleaved_bf16fp32_mmla_8x12( ARGLIST );
+void a64_interleaved_bf16fp32_mmla_8x12_a510( ARGLIST );
-class cls_a64_interleaved_bf16fp32_mmla_8x12 {
+class cls_a64_interleaved_bf16fp32_mmla_8x12
+{
public:
typedef bfloat16 operand_type;
typedef float result_type;
- typedef void (*kern_type)(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
- static unsigned int out_width()
+ static constexpr unsigned int out_height()
{
- return 12;
+ return 8;
}
- static unsigned int out_height()
+ static unsigned int out_width()
{
- return 8;
+ return 12;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 4;
}
- // Use the standard fixed size transforms.
+
StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12;
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.54, 4.30, 7.33 };
+ case CPUModel::V1:
+ return { 59.94, 5.08, 9.83 };
+ case CPUModel::A510:
+ return { 7.82, 4.05, 3.07 };
+ }
+ }
- cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *)
- {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.15, 2.51, 5.25 };
+ case CPUModel::V1:
+ return { 41.44, 5.01, 5.64 };
+ case CPUModel::A510:
+ return { 7.83, 2.53, 2.71 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12;
+ cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A510:
+ kernel=a64_interleaved_bf16fp32_mmla_8x12_a510;
+ break;
+ }
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __aarch64__
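
This header also shows how per-core variants are wired in: get_performance_parameters() returns per-CPU tuning figures that the GEMM heuristics use when choosing a strategy (with { 1.0 } as the fallback for unmatched types), while the constructor retargets the default kernel pointer to the Cortex-A510-tuned entry point when that model is detected. A minimal sketch of the constructor-dispatch idiom, with illustrative names standing in for the library's types:

    using kern_fn = void (*)(const float *, const float *, float *, int, int, int);

    static void generic_impl(const float *, const float *, float *, int, int, int) {}
    static void a510_impl(const float *, const float *, float *, int, int, int) {}

    enum class CpuModel { GENERIC, A510 };

    struct Strategy {
        kern_fn kernel = generic_impl; // default to the generic kernel
        explicit Strategy(CpuModel m)
        {
            if (m == CpuModel::A510) {
                kernel = a510_impl;    // tuned variant for Cortex-A510
            }
        }
    };
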
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
new file mode 100644
index 0000000000..bab687a9b4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include "../../bfloat.hpp"
+
+namespace arm_gemm {
+
+void a64_interleaved_bf16fp32_mmla_8x12_a510(
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
+
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
+
+ __asm__ __volatile__(
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldp q4, q5, [x22], #0x20\n"
+ "mov %x[Apanel], x21\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ "movi v8.16b, #0x0\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1 { v6.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ "ldp q3, q7, [x22], #0x20\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ecda // bfmmla v26.4s, v6.8h, v4.8h\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x6e45ecdd // bfmmla v29.4s, v6.8h, v5.8h\n"
+ "ldp q4, q5, [x22], #0x20\n"
+ ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec2f // bfmmla v15.4s, v1.8h, v3.8h\n"
+ ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec55 // bfmmla v21.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ecdb // bfmmla v27.4s, v6.8h, v3.8h\n"
+ ".inst 0x6e47ecde // bfmmla v30.4s, v6.8h, v7.8h\n"
+ "ldp q7, q3, [x22], #0x20\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ecdc // bfmmla v28.4s, v6.8h, v4.8h\n"
+ ".inst 0x6e45ecdf // bfmmla v31.4s, v6.8h, v5.8h\n"
+ "ld1 { v6.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "ldp q4, q5, [x22], #0x20\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec31 // bfmmla v17.4s, v1.8h, v3.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e47ecda // bfmmla v26.4s, v6.8h, v7.8h\n"
+ ".inst 0x6e43ecdd // bfmmla v29.4s, v6.8h, v3.8h\n"
+ "ldp q7, q3, [x22], #0x20\n"
+ ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ecdb // bfmmla v27.4s, v6.8h, v4.8h\n"
+ ".inst 0x6e45ecde // bfmmla v30.4s, v6.8h, v5.8h\n"
+ "ldp q4, q5, [x22], #0x20\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec33 // bfmmla v19.4s, v1.8h, v3.8h\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e47ecdc // bfmmla v28.4s, v6.8h, v7.8h\n"
+ ".inst 0x6e43ecdf // bfmmla v31.4s, v6.8h, v3.8h\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ "ldp q6, q7, [x22], #0x20\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ "ldp q5, q4, [x22], #0x20\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e44ec0d // bfmmla v13.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec30 // bfmmla v16.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec33 // bfmmla v19.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec59 // bfmmla v25.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec7c // bfmmla v28.4s, v3.8h, v5.8h\n"
+ ".inst 0x6e44ec7f // bfmmla v31.4s, v3.8h, v4.8h\n"
+ "cbz x20, 5f\n"
+ "ldp q1, q0, [x22], #0x20\n"
+ "ld1 { v7.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n"
+ "ld1 { v5.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ "ldp q3, q2, [x22], #0x20\n"
+ ".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n"
+ ".inst 0x6e40ecd1 // bfmmla v17.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e41ecb4 // bfmmla v20.4s, v5.8h, v1.8h\n"
+ ".inst 0x6e40ecb7 // bfmmla v23.4s, v5.8h, v0.8h\n"
+ ".inst 0x6e41ec9a // bfmmla v26.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldp q1, q0, [x22], #0x20\n"
+ ".inst 0x6e43ece9 // bfmmla v9.4s, v7.8h, v3.8h\n"
+ ".inst 0x6e42ecec // bfmmla v12.4s, v7.8h, v2.8h\n"
+ ".inst 0x6e43eccf // bfmmla v15.4s, v6.8h, v3.8h\n"
+ ".inst 0x6e42ecd2 // bfmmla v18.4s, v6.8h, v2.8h\n"
+ ".inst 0x6e43ecb5 // bfmmla v21.4s, v5.8h, v3.8h\n"
+ ".inst 0x6e42ecb8 // bfmmla v24.4s, v5.8h, v2.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e42ec9e // bfmmla v30.4s, v4.8h, v2.8h\n"
+ ".inst 0x6e41ecea // bfmmla v10.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e40eced // bfmmla v13.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e41ecd0 // bfmmla v16.4s, v6.8h, v1.8h\n"
+ ".inst 0x6e40ecd3 // bfmmla v19.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e41ecb6 // bfmmla v22.4s, v5.8h, v1.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ ".inst 0x6e41ec9c // bfmmla v28.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
+ "5:" // multiply loop done
+ "subs x23, x23, #0x1\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
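
[Editor's note] The A510 variant above also shows the new argument-passing scheme: instead of one register operand per parameter, the loop-invariant arguments are packed into a KernelArgs struct whose address is the single args_ptr operand, and field offsets are handed to the asm as "I" immediates computed with offsetof. A trimmed sketch of just that mechanism (hypothetical read_K helper, assuming AArch64):

    #include <cstddef>  // offsetof, size_t

    struct KernelArgs {
        size_t K = {};
    };

    static inline size_t read_K(const KernelArgs &ka) {
        size_t k;
        __asm__ __volatile__(
            "ldr %x[k], [%x[args_ptr], %[offsetof_K]]\n"   // k = ka.K
            : [k] "=r" (k)
            : [args_ptr] "r" (&ka),
              [offsetof_K] "I" (offsetof(KernelArgs, K))   // immediate offset
            : "memory");  // the asm reads memory not named as an operand
        return k;
    }

Keeping only the hot pointer updates (Apanel, Cpanel, ablocks) as "+&r" early-clobber operands while everything else rides in the struct frees general-purpose registers for the asm's own x20-x23 working set.
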
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
index c476fcf171..8485820c7c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,406 +23,272 @@
*/
#ifdef __aarch64__
+#include <cstddef>
#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const bfloat16 *a_ptr = Apanel;
- float *c_ptr = Cpanel;
+void a64_interleaved_bf16fp32_mmla_8x12(
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
- K /= 4;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const bfloat16 *a_ptr0 = a_ptr;
- const bfloat16 *b_ptr = Bpanel;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "movi v8.4s, #0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v9.4s, #0\n"
- "ldr q4, [%[b_ptr]]\n"
- "movi v10.4s, #0\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- "movi v11.4s, #0\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- "movi v12.4s, #0\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- "movi v13.4s, #0\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- "movi v14.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x40]\n"
- "movi v15.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x40]\n"
- "movi v16.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x80]\n"
- "movi v17.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x80]\n"
- "movi v18.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0xc0]\n"
- "movi v19.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0xc0]\n"
- "movi v20.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x100]\n"
- "movi v21.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x100]\n"
- "movi v22.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x140]\n"
- "movi v23.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x140]\n"
- "movi v24.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x180]\n"
- "movi v25.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x180]\n"
- "movi v26.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n"
- "movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x1c0]\n"
- "movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n"
- "movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n"
- "movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x240]\n"
- "movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x280]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x2c0]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x380]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x3c0]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x400]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x440]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x2c0]\n"
- ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x440]\n"
- ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x300]\n"
- ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x480]\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x4c0]\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec32 // bfmmla v18.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7e // bfmmla v30.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #0x40]\n"
- ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #0x50]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "add %[b_ptr], %[b_ptr], #0xc0\n"
- ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #-0x60]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7b // bfmmla v27.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #-0x50]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #-0x40]\n"
- ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
- ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
- ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #-0x30]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec32 // bfmmla v18.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7e // bfmmla v30.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
- ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
- ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec32 // bfmmla v18.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7e // bfmmla v30.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #0x40]\n"
- ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #0x50]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "add %[b_ptr], %[b_ptr], #0xe0\n"
- ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #-0x80]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7b // bfmmla v27.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #-0x70]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #-0x60]\n"
- ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
- ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
- ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #-0x50]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec32 // bfmmla v18.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7e // bfmmla v30.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #-0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #-0x30]\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
- ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
- ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
- ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- "uzp1 v6.2d, v14.2d, v15.2d\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- "uzp1 v7.2d, v16.2d, v17.2d\n"
- ".inst 0x6e44ec32 // bfmmla v18.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7e // bfmmla v30.4s, v3.8h, v4.8h\n"
- "uzp2 v4.2d, v10.2d, v11.2d\n"
- ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
- "uzp1 v0.2d, v8.2d, v9.2d\n"
- ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
- "uzp1 v1.2d, v10.2d, v11.2d\n"
- ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
- "str q0, [%[c_ptr]]\n"
- "uzp1 v2.2d, v12.2d, v13.2d\n"
- "uzp1 v0.2d, v18.2d, v19.2d\n"
- ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
- "str q1, [%[c_ptr], #0x10]\n"
- "uzp2 v3.2d, v8.2d, v9.2d\n"
- "uzp2 v5.2d, v12.2d, v13.2d\n"
- "uzp2 v1.2d, v14.2d, v15.2d\n"
- "str q2, [%[c_ptr], #0x20]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
- ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #-0x80]\n"
- ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
- ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
- ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #-0x70]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #-0x60]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #-0x50]\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec32 // bfmmla v18.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7e // bfmmla v30.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #-0x40]\n"
- ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #-0x30]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7b // bfmmla v27.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
- ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
- ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
- ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
- "uzp2 v4.2d, v10.2d, v11.2d\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec32 // bfmmla v18.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7e // bfmmla v30.4s, v3.8h, v6.8h\n"
- "uzp1 v6.2d, v14.2d, v15.2d\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "uzp1 v0.2d, v8.2d, v9.2d\n"
- ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
- "uzp1 v1.2d, v10.2d, v11.2d\n"
- "uzp2 v5.2d, v12.2d, v13.2d\n"
- "str q0, [%[c_ptr]]\n"
- ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
- "uzp1 v2.2d, v12.2d, v13.2d\n"
- "uzp1 v0.2d, v18.2d, v19.2d\n"
- "str q1, [%[c_ptr], #0x10]\n"
- "uzp2 v1.2d, v14.2d, v15.2d\n"
- ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
- "uzp2 v3.2d, v8.2d, v9.2d\n"
- "str q2, [%[c_ptr], #0x20]\n"
- "uzp1 v7.2d, v16.2d, v17.2d\n"
- "4:\n"
- "uzp2 v2.2d, v16.2d, v17.2d\n"
- "str q3, [%[c_ptr], #0x30]\n"
- "uzp2 v3.2d, v18.2d, v19.2d\n"
- "str q4, [%[c_ptr], #0x40]\n"
- "uzp1 v4.2d, v20.2d, v21.2d\n"
- "str q5, [%[c_ptr], #0x50]\n"
- "uzp1 v5.2d, v22.2d, v23.2d\n"
- "str q6, [%[c_ptr], #0x60]\n"
- "uzp1 v6.2d, v24.2d, v25.2d\n"
- "str q7, [%[c_ptr], #0x70]\n"
- "uzp2 v7.2d, v20.2d, v21.2d\n"
- "str q0, [%[c_ptr], #0x80]\n"
- "uzp2 v0.2d, v22.2d, v23.2d\n"
- "str q1, [%[c_ptr], #0x90]\n"
- "uzp2 v1.2d, v24.2d, v25.2d\n"
- "str q2, [%[c_ptr], #0xa0]\n"
- "uzp1 v2.2d, v26.2d, v27.2d\n"
- "str q3, [%[c_ptr], #0xb0]\n"
- "uzp1 v3.2d, v28.2d, v29.2d\n"
- "str q4, [%[c_ptr], #0xc0]\n"
- "uzp1 v4.2d, v30.2d, v31.2d\n"
- "str q5, [%[c_ptr], #0xd0]\n"
- "uzp2 v5.2d, v26.2d, v27.2d\n"
- "str q6, [%[c_ptr], #0xe0]\n"
- "uzp2 v6.2d, v28.2d, v29.2d\n"
- "str q7, [%[c_ptr], #0xf0]\n"
- "uzp2 v7.2d, v30.2d, v31.2d\n"
- "str q0, [%[c_ptr], #0x100]\n"
- "str q1, [%[c_ptr], #0x110]\n"
- "str q2, [%[c_ptr], #0x120]\n"
- "str q3, [%[c_ptr], #0x130]\n"
- "str q4, [%[c_ptr], #0x140]\n"
- "str q5, [%[c_ptr], #0x150]\n"
- "str q6, [%[c_ptr], #0x160]\n"
- "str q7, [%[c_ptr], #0x170]\n"
- "add %[c_ptr], %[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x22, #0x10]\n"
+ "mov %x[Apanel], x21\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v8.16b, #0x0\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "add x22, x22, #0x20\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ldr q6, [%x[Apanel], #0x0]\n"
+ "ldr q7, [x22, #0x0]\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q3, [x22, #0x10]\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ecda // bfmmla v26.4s, v6.8h, v4.8h\n"
+ "ldr q4, [x22, #0x20]\n"
+ ".inst 0x6e45ecdd // bfmmla v29.4s, v6.8h, v5.8h\n"
+ "ldr q5, [x22, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0c // bfmmla v12.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x6e43ec32 // bfmmla v18.4s, v1.8h, v3.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec58 // bfmmla v24.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e47ecdb // bfmmla v27.4s, v6.8h, v7.8h\n"
+ "ldr q7, [x22, #0x40]\n"
+ ".inst 0x6e43ecde // bfmmla v30.4s, v6.8h, v3.8h\n"
+ "ldr q3, [x22, #0x50]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ "ldr q0, [%x[Apanel], #0x10]\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
+ "ldr q1, [%x[Apanel], #0x20]\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
+ "ldr q2, [%x[Apanel], #0x30]\n"
+ ".inst 0x6e44ecdc // bfmmla v28.4s, v6.8h, v4.8h\n"
+ "ldr q4, [x22, #0x60]\n"
+ ".inst 0x6e45ecdf // bfmmla v31.4s, v6.8h, v5.8h\n"
+ "ldr q6, [%x[Apanel], #0x40]\n"
+ "ldr q5, [x22, #0x70]\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n"
+ ".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec31 // bfmmla v17.4s, v1.8h, v3.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n"
+ ".inst 0x6e47ecda // bfmmla v26.4s, v6.8h, v7.8h\n"
+ "ldr q7, [x22, #0x80]\n"
+ ".inst 0x6e43ecdd // bfmmla v29.4s, v6.8h, v3.8h\n"
+ "ldr q3, [x22, #0x90]\n"
+ ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ecdb // bfmmla v27.4s, v6.8h, v4.8h\n"
+ "ldr q4, [x22, #0xa0]\n"
+ ".inst 0x6e45ecde // bfmmla v30.4s, v6.8h, v5.8h\n"
+ "ldr q5, [x22, #0xb0]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n"
+ "ldr q0, [%x[Apanel], #0x50]\n"
+ ".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e43ec33 // bfmmla v19.4s, v1.8h, v3.8h\n"
+ "ldr q1, [%x[Apanel], #0x60]\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n"
+ "ldr q2, [%x[Apanel], #0x70]\n"
+ ".inst 0x6e47ecdc // bfmmla v28.4s, v6.8h, v7.8h\n"
+ ".inst 0x6e43ecdf // bfmmla v31.4s, v6.8h, v3.8h\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "add x22, x22, #0xc0\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ldr q3, [%x[Apanel], #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q7, [x22, #0x10]\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ "ldr q4, [x22, #0x20]\n"
+ ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ "ldr q5, [x22, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ "add x22, x22, #0x40\n"
+ ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
+ "cbz x20, 5f\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q7, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n"
+ "ldr q6, [%x[Apanel], #0x10]\n"
+ "ldr q0, [x22, #0x10]\n"
+ ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n"
+ "ldr q5, [%x[Apanel], #0x20]\n"
+ "ldr q4, [%x[Apanel], #0x30]\n"
+ ".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n"
+ "ldr q3, [x22, #0x20]\n"
+ "ldr q2, [x22, #0x30]\n"
+ ".inst 0x6e40ecd1 // bfmmla v17.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e41ecb4 // bfmmla v20.4s, v5.8h, v1.8h\n"
+ ".inst 0x6e40ecb7 // bfmmla v23.4s, v5.8h, v0.8h\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x6e41ec9a // bfmmla v26.4s, v4.8h, v1.8h\n"
+ "ldr q1, [x22, #0x40]\n"
+ ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n"
+ "ldr q0, [x22, #0x50]\n"
+ ".inst 0x6e43ece9 // bfmmla v9.4s, v7.8h, v3.8h\n"
+ ".inst 0x6e42ecec // bfmmla v12.4s, v7.8h, v2.8h\n"
+ ".inst 0x6e43eccf // bfmmla v15.4s, v6.8h, v3.8h\n"
+ ".inst 0x6e42ecd2 // bfmmla v18.4s, v6.8h, v2.8h\n"
+ "add x22, x22, #0x60\n"
+ ".inst 0x6e43ecb5 // bfmmla v21.4s, v5.8h, v3.8h\n"
+ ".inst 0x6e42ecb8 // bfmmla v24.4s, v5.8h, v2.8h\n"
+ ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n"
+ ".inst 0x6e42ec9e // bfmmla v30.4s, v4.8h, v2.8h\n"
+ ".inst 0x6e41ecea // bfmmla v10.4s, v7.8h, v1.8h\n"
+ ".inst 0x6e40eced // bfmmla v13.4s, v7.8h, v0.8h\n"
+ ".inst 0x6e41ecd0 // bfmmla v16.4s, v6.8h, v1.8h\n"
+ ".inst 0x6e40ecd3 // bfmmla v19.4s, v6.8h, v0.8h\n"
+ ".inst 0x6e41ecb6 // bfmmla v22.4s, v5.8h, v1.8h\n"
+ ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n"
+ ".inst 0x6e41ec9c // bfmmla v28.4s, v4.8h, v1.8h\n"
+ ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n"
+ "5:" // multiply loop done
+ "subs x23, x23, #0x1\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
+ );
}
} // namespace arm_gemm
-
#endif // __aarch64__
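
[Editor's note] The generic rewrite above is the biggest structural change in this series: the old code kept the ablocks/bblocks loops in C++ and re-entered a one-shot asm block per tile, while the new code hoists both loops into the asm itself (labels 1: and 2:), software-pipelines the K loop two steps per iteration (label 3:), and de-interleaves the 2x2 bfmmla accumulator tiles with uzp1/uzp2 before the 24 q-register stores. Roughly, as C-style pseudocode of the new control flow (a sketch, not library code):

    // ka.K = (K_elements / 4) - 1: four bf16 elements per k_unroll step,
    // minus one step consumed by the software-pipelined prologue/epilogue.
    for (int yb = 0; yb < ablocks; yb++) {        // "1:" Height loop
        const bfloat16 *a_start = Apanel;
        const bfloat16 *b_ptr   = Bpanel;
        for (int xb = 0; xb < bblocks; xb++) {    // "2:" Width loop
            Apanel = a_start;
            long k = ka.K;                        // loaded via args_ptr
            // prologue: load first A/B tiles, zero v8..v31 (movi)
            while (k >= 2) { /* "3:" main loop, two K steps */ k -= 2; }
            /* "4:" drain one pipelined step */
            if (k != 0) { /* final K step ("cbz x20, 5f" not taken) */ }
            /* "5:" uzp1/uzp2 de-interleave, store 384 bytes to Cpanel */
        }
    }
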
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
index b17b76f170..37a54fcfab 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,52 +22,93 @@
* SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
-#include <cstdint>
#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const int8_t *, const int8_t *, \
+ int32_t *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void a64_interleaved_s8s32_mmla_8x12(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_interleaved_s8s32_mmla_8x12( ARGLIST );
+void a64_interleaved_s8s32_mmla_8x12_a510( ARGLIST );
-class cls_a64_interleaved_s8s32_mmla_8x12 {
+class cls_a64_interleaved_s8s32_mmla_8x12
+{
public:
typedef int8_t operand_type;
typedef int32_t result_type;
- typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
- static unsigned int out_width()
+ static constexpr unsigned int out_height()
{
- return 12;
+ return 8;
}
- static unsigned int out_height()
+ static unsigned int out_width()
{
- return 8;
+ return 12;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 8;
}
- // Use the standard fixed size transforms.
+
StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {};
StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=a64_interleaved_s8s32_mmla_8x12;
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 62.57, 4.08, 8.01 };
+ case CPUModel::A510:
+ return { 48.25, 3.53, 3.71 };
+ case CPUModel::V1:
+ return { 117.02, 4.98, 10.87 };
+ }
+ }
- cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *)
- {
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 62.53, 3.70, 0.50 };
+ case CPUModel::A510:
+ return { 48.22, 2.49, 0.29 };
+ case CPUModel::V1:
+ return { 75.54, 8.06, 0.63 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_interleaved_s8s32_mmla_8x12;
+ cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A510:
+ kernel=a64_interleaved_s8s32_mmla_8x12_a510;
+ break;
+ }
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __aarch64__
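
[Editor's note] The { a, b, c } triples above are per-CPU throughput estimates; with performance_parameters.hpp now included, they are returned as PerformanceParameters values consumed by arm_gemm's kernel-selection heuristics when several kernels could serve a problem, with return { 1.0 }; as a neutral fallback for unknown type/CPU combinations. As an illustrative cost model only (not the library's actual selection logic), such estimates let a selector compare candidates like this:

    // Purely illustrative field names; the real struct lives in
    // src/core/NEON/kernels/arm_gemm/performance_parameters.hpp.
    struct PerfEstimate {
        double macs_per_cycle;        // main-loop arithmetic throughput
        double prepare_bytes_cycle;   // panel-preparation bandwidth
        double merge_bytes_cycle;     // result-merge bandwidth
    };

    double estimated_cycles(const PerfEstimate &p, double macs,
                            double prep_bytes, double merge_bytes) {
        return macs / p.macs_per_cycle
             + prep_bytes / p.prepare_bytes_cycle
             + merge_bytes / p.merge_bytes_cycle;
    }
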
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
new file mode 100644
index 0000000000..c1d37383df
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_s8s32_mmla_8x12_a510(
+ const int8_t *Apanel,
+ const int8_t *Bpanel,
+ int32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
+
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
+
+ __asm__ __volatile__(
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldp q4, q5, [x22], #0x20\n"
+ "mov %x[Apanel], x21\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "movi v8.4s, #0x0\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
+ "ldp q3, q7, [x22], #0x20\n"
+ ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a4da // smmla v26.4s, v6.16b, v4.16b\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x4e85a4dd // smmla v29.4s, v6.16b, v5.16b\n"
+ "ldp q4, q5, [x22], #0x20\n"
+ ".inst 0x4e83a409 // smmla v9.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e83a42f // smmla v15.4s, v1.16b, v3.16b\n"
+ ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e83a455 // smmla v21.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e83a4db // smmla v27.4s, v6.16b, v3.16b\n"
+ ".inst 0x4e87a4de // smmla v30.4s, v6.16b, v7.16b\n"
+ "ldp q7, q3, [x22], #0x20\n"
+ ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a4dc // smmla v28.4s, v6.16b, v4.16b\n"
+ ".inst 0x4e85a4df // smmla v31.4s, v6.16b, v5.16b\n"
+ "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "ldp q4, q5, [x22], #0x20\n"
+ ".inst 0x4e83a40b // smmla v11.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e87a42e // smmla v14.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e83a431 // smmla v17.4s, v1.16b, v3.16b\n"
+ ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e83a457 // smmla v23.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e87a4da // smmla v26.4s, v6.16b, v7.16b\n"
+ ".inst 0x4e83a4dd // smmla v29.4s, v6.16b, v3.16b\n"
+ "ldp q7, q3, [x22], #0x20\n"
+ ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a4db // smmla v27.4s, v6.16b, v4.16b\n"
+ ".inst 0x4e85a4de // smmla v30.4s, v6.16b, v5.16b\n"
+ "ldp q4, q5, [x22], #0x20\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e83a40d // smmla v13.4s, v0.16b, v3.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e87a430 // smmla v16.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e83a433 // smmla v19.4s, v1.16b, v3.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e83a459 // smmla v25.4s, v2.16b, v3.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e87a4dc // smmla v28.4s, v6.16b, v7.16b\n"
+ ".inst 0x4e83a4df // smmla v31.4s, v6.16b, v3.16b\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
+ "ldp q6, q7, [x22], #0x20\n"
+ ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
+ "ldp q5, q4, [x22], #0x20\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e85a40a // smmla v10.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e84a40d // smmla v13.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a430 // smmla v16.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a433 // smmla v19.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a456 // smmla v22.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a459 // smmla v25.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a47c // smmla v28.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e84a47f // smmla v31.4s, v3.16b, v4.16b\n"
+ "cbz x20, 5f\n"
+ "ldp q1, q0, [x22], #0x20\n"
+ "ld1 { v7.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e81a4e8 // smmla v8.4s, v7.16b, v1.16b\n"
+ "ld1 { v5.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n"
+ "ldp q3, q2, [x22], #0x20\n"
+ ".inst 0x4e81a4ce // smmla v14.4s, v6.16b, v1.16b\n"
+ ".inst 0x4e80a4d1 // smmla v17.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e81a4b4 // smmla v20.4s, v5.16b, v1.16b\n"
+ ".inst 0x4e80a4b7 // smmla v23.4s, v5.16b, v0.16b\n"
+ ".inst 0x4e81a49a // smmla v26.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n"
+ "ldp q1, q0, [x22], #0x20\n"
+ ".inst 0x4e83a4e9 // smmla v9.4s, v7.16b, v3.16b\n"
+ ".inst 0x4e82a4ec // smmla v12.4s, v7.16b, v2.16b\n"
+ ".inst 0x4e83a4cf // smmla v15.4s, v6.16b, v3.16b\n"
+ ".inst 0x4e82a4d2 // smmla v18.4s, v6.16b, v2.16b\n"
+ ".inst 0x4e83a4b5 // smmla v21.4s, v5.16b, v3.16b\n"
+ ".inst 0x4e82a4b8 // smmla v24.4s, v5.16b, v2.16b\n"
+ ".inst 0x4e83a49b // smmla v27.4s, v4.16b, v3.16b\n"
+ ".inst 0x4e82a49e // smmla v30.4s, v4.16b, v2.16b\n"
+ ".inst 0x4e81a4ea // smmla v10.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e80a4ed // smmla v13.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e81a4d0 // smmla v16.4s, v6.16b, v1.16b\n"
+ ".inst 0x4e80a4d3 // smmla v19.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e81a4b6 // smmla v22.4s, v5.16b, v1.16b\n"
+ ".inst 0x4e80a4b9 // smmla v25.4s, v5.16b, v0.16b\n"
+ ".inst 0x4e81a49c // smmla v28.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n"
+ "5:" // multiply loop done
+ "subs x23, x23, #0x1\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
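
[Editor's note] The int8 A510 kernel above is structurally identical to its bf16 counterpart; only the multiply changes: smmla Vd.4S, Vn.16B, Vm.16B treats each 128-bit source as a 2x8 int8 tile and accumulates their 2x2 int32 product, mirroring what bfmmla does for bf16 pairs into fp32. Both are emitted as raw .inst words so the files assemble even on toolchains that lack I8MM/BF16 support. At the intrinsics level, one smmla step looks like this (assuming a compiler targeting +i8mm):

    #if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
    #include <arm_neon.h>

    // acc (a 2x2 int32 tile) += A (2x8 int8) * B (2x8 int8, transposed).
    int32x4_t smmla_step(int32x4_t acc, int8x16_t a, int8x16_t b) {
        return vmmlaq_s32(acc, a, b);  // compiles to: smmla v.4s, v.16b, v.16b
    }
    #endif
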
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
index 2093e75b8e..a097dc358a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,373 +23,272 @@
*/
#ifdef __aarch64__
+#include <cstddef>
#include <cstdint>
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void a64_interleaved_s8s32_mmla_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
- const int8_t *a_ptr = Apanel;
- int32_t *c_ptr = Cpanel;
+void a64_interleaved_s8s32_mmla_8x12(
+ const int8_t *Apanel,
+ const int8_t *Bpanel,
+ int32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
- K /= 8;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const int8_t *a_ptr0 = a_ptr;
- const int8_t *b_ptr = Bpanel;
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "movi v8.4s, #0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v9.4s, #0\n"
- "ldr q4, [%[b_ptr]]\n"
- "movi v10.4s, #0\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- "movi v11.4s, #0\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- "movi v12.4s, #0\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- "movi v13.4s, #0\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- "movi v14.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "movi v15.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
- "movi v16.4s, #0\n"
- "movi v17.4s, #0\n"
- "movi v18.4s, #0\n"
- "movi v19.4s, #0\n"
- "movi v20.4s, #0\n"
- "movi v21.4s, #0\n"
- "movi v22.4s, #0\n"
- "movi v23.4s, #0\n"
- "movi v24.4s, #0\n"
- "movi v25.4s, #0\n"
- "movi v26.4s, #0\n"
- "movi v27.4s, #0\n"
- "movi v28.4s, #0\n"
- "movi v29.4s, #0\n"
- "movi v30.4s, #0\n"
- "movi v31.4s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
- ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
- ".inst 0x4e84a40c // smmla v12.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a432 // smmla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a458 // smmla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #0x40]\n"
- ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x50]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "add %[b_ptr], %[b_ptr], #0xc0\n"
- ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x60]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a42f // smmla v15.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47b // smmla v27.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x50]\n"
- ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x40]\n"
- ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x30]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a432 // smmla v18.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a458 // smmla v24.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
- ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
- ".inst 0x4e84a40c // smmla v12.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a432 // smmla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a458 // smmla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #0x40]\n"
- ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x50]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "add %[b_ptr], %[b_ptr], #0xe0\n"
- ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x80]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a42f // smmla v15.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47b // smmla v27.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x70]\n"
- ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x60]\n"
- ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x50]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a432 // smmla v18.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a458 // smmla v24.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x40]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x30]\n"
- ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
- ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
- ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x20]\n"
- ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x10]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
- "uzp1 v6.2d, v14.2d, v15.2d\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
- ".inst 0x4e84a40c // smmla v12.4s, v0.16b, v4.16b\n"
- "uzp1 v7.2d, v16.2d, v17.2d\n"
- ".inst 0x4e84a432 // smmla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a458 // smmla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
- "uzp2 v4.2d, v10.2d, v11.2d\n"
- ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
- "uzp1 v0.2d, v8.2d, v9.2d\n"
- ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
- "uzp1 v1.2d, v10.2d, v11.2d\n"
- ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
- "str q0, [%[c_ptr]]\n"
- "uzp1 v2.2d, v12.2d, v13.2d\n"
- "uzp1 v0.2d, v18.2d, v19.2d\n"
- ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
- "str q1, [%[c_ptr], #0x10]\n"
- "uzp2 v3.2d, v8.2d, v9.2d\n"
- "uzp2 v5.2d, v12.2d, v13.2d\n"
- "uzp2 v1.2d, v14.2d, v15.2d\n"
- "str q2, [%[c_ptr], #0x20]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
- ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x80]\n"
- ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x70]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x60]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x50]\n"
- ".inst 0x4e84a40c // smmla v12.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a432 // smmla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a458 // smmla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x40]\n"
- ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x30]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- ".inst 0x4e87a42f // smmla v15.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47b // smmla v27.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
- ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
- "uzp2 v4.2d, v10.2d, v11.2d\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a432 // smmla v18.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a458 // smmla v24.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n"
- "uzp1 v6.2d, v14.2d, v15.2d\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- "uzp1 v0.2d, v8.2d, v9.2d\n"
- ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
- "uzp1 v1.2d, v10.2d, v11.2d\n"
- "uzp2 v5.2d, v12.2d, v13.2d\n"
- "str q0, [%[c_ptr]]\n"
- ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
- "uzp1 v2.2d, v12.2d, v13.2d\n"
- "uzp1 v0.2d, v18.2d, v19.2d\n"
- "str q1, [%[c_ptr], #0x10]\n"
- "uzp2 v1.2d, v14.2d, v15.2d\n"
- ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
- "uzp2 v3.2d, v8.2d, v9.2d\n"
- "str q2, [%[c_ptr], #0x20]\n"
- "uzp1 v7.2d, v16.2d, v17.2d\n"
- "4:\n"
- "uzp2 v2.2d, v16.2d, v17.2d\n"
- "str q3, [%[c_ptr], #0x30]\n"
- "uzp2 v3.2d, v18.2d, v19.2d\n"
- "str q4, [%[c_ptr], #0x40]\n"
- "uzp1 v4.2d, v20.2d, v21.2d\n"
- "str q5, [%[c_ptr], #0x50]\n"
- "uzp1 v5.2d, v22.2d, v23.2d\n"
- "str q6, [%[c_ptr], #0x60]\n"
- "uzp1 v6.2d, v24.2d, v25.2d\n"
- "str q7, [%[c_ptr], #0x70]\n"
- "uzp2 v7.2d, v20.2d, v21.2d\n"
- "str q0, [%[c_ptr], #0x80]\n"
- "uzp2 v0.2d, v22.2d, v23.2d\n"
- "str q1, [%[c_ptr], #0x90]\n"
- "uzp2 v1.2d, v24.2d, v25.2d\n"
- "str q2, [%[c_ptr], #0xa0]\n"
- "uzp1 v2.2d, v26.2d, v27.2d\n"
- "str q3, [%[c_ptr], #0xb0]\n"
- "uzp1 v3.2d, v28.2d, v29.2d\n"
- "str q4, [%[c_ptr], #0xc0]\n"
- "uzp1 v4.2d, v30.2d, v31.2d\n"
- "str q5, [%[c_ptr], #0xd0]\n"
- "uzp2 v5.2d, v26.2d, v27.2d\n"
- "str q6, [%[c_ptr], #0xe0]\n"
- "uzp2 v6.2d, v28.2d, v29.2d\n"
- "str q7, [%[c_ptr], #0xf0]\n"
- "uzp2 v7.2d, v30.2d, v31.2d\n"
- "str q0, [%[c_ptr], #0x100]\n"
- "str q1, [%[c_ptr], #0x110]\n"
- "str q2, [%[c_ptr], #0x120]\n"
- "str q3, [%[c_ptr], #0x130]\n"
- "str q4, [%[c_ptr], #0x140]\n"
- "str q5, [%[c_ptr], #0x150]\n"
- "str q6, [%[c_ptr], #0x160]\n"
- "str q7, [%[c_ptr], #0x170]\n"
- "add %[c_ptr], %[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x22, #0x10]\n"
+ "mov %x[Apanel], x21\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v8.4s, #0x0\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "add x22, x22, #0x20\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ldr q6, [%x[Apanel], #0x0]\n"
+ "ldr q7, [x22, #0x0]\n"
+ ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ "ldr q3, [x22, #0x10]\n"
+ ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a4da // smmla v26.4s, v6.16b, v4.16b\n"
+ "ldr q4, [x22, #0x20]\n"
+ ".inst 0x4e85a4dd // smmla v29.4s, v6.16b, v5.16b\n"
+ "ldr q5, [x22, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e83a40c // smmla v12.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e87a42f // smmla v15.4s, v1.16b, v7.16b\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x4e83a432 // smmla v18.4s, v1.16b, v3.16b\n"
+ ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e83a458 // smmla v24.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e87a4db // smmla v27.4s, v6.16b, v7.16b\n"
+ "ldr q7, [x22, #0x40]\n"
+ ".inst 0x4e83a4de // smmla v30.4s, v6.16b, v3.16b\n"
+ "ldr q3, [x22, #0x50]\n"
+ ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
+ "ldr q0, [%x[Apanel], #0x10]\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ "ldr q1, [%x[Apanel], #0x20]\n"
+ ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ "ldr q2, [%x[Apanel], #0x30]\n"
+ ".inst 0x4e84a4dc // smmla v28.4s, v6.16b, v4.16b\n"
+ "ldr q4, [x22, #0x60]\n"
+ ".inst 0x4e85a4df // smmla v31.4s, v6.16b, v5.16b\n"
+ "ldr q6, [%x[Apanel], #0x40]\n"
+ "ldr q5, [x22, #0x70]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e83a40b // smmla v11.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e87a42e // smmla v14.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e83a431 // smmla v17.4s, v1.16b, v3.16b\n"
+ ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e83a457 // smmla v23.4s, v2.16b, v3.16b\n"
+ ".inst 0x4e87a4da // smmla v26.4s, v6.16b, v7.16b\n"
+ "ldr q7, [x22, #0x80]\n"
+ ".inst 0x4e83a4dd // smmla v29.4s, v6.16b, v3.16b\n"
+ "ldr q3, [x22, #0x90]\n"
+ ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a4db // smmla v27.4s, v6.16b, v4.16b\n"
+ "ldr q4, [x22, #0xa0]\n"
+ ".inst 0x4e85a4de // smmla v30.4s, v6.16b, v5.16b\n"
+ "ldr q5, [x22, #0xb0]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e83a40d // smmla v13.4s, v0.16b, v3.16b\n"
+ "ldr q0, [%x[Apanel], #0x50]\n"
+ ".inst 0x4e87a430 // smmla v16.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e83a433 // smmla v19.4s, v1.16b, v3.16b\n"
+ "ldr q1, [%x[Apanel], #0x60]\n"
+ ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e83a459 // smmla v25.4s, v2.16b, v3.16b\n"
+ "ldr q2, [%x[Apanel], #0x70]\n"
+ ".inst 0x4e87a4dc // smmla v28.4s, v6.16b, v7.16b\n"
+ ".inst 0x4e83a4df // smmla v31.4s, v6.16b, v3.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "add x22, x22, #0xc0\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ldr q3, [%x[Apanel], #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ "ldr q7, [x22, #0x10]\n"
+ ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ "ldr q4, [x22, #0x20]\n"
+ ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
+ "ldr q5, [x22, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "add x22, x22, #0x40\n"
+ ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
+ "cbz x20, 5f\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q7, [%x[Apanel], #0x0]\n"
+ ".inst 0x4e81a4e8 // smmla v8.4s, v7.16b, v1.16b\n"
+ "ldr q6, [%x[Apanel], #0x10]\n"
+ "ldr q0, [x22, #0x10]\n"
+ ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n"
+ "ldr q5, [%x[Apanel], #0x20]\n"
+ "ldr q4, [%x[Apanel], #0x30]\n"
+ ".inst 0x4e81a4ce // smmla v14.4s, v6.16b, v1.16b\n"
+ "ldr q3, [x22, #0x20]\n"
+ "ldr q2, [x22, #0x30]\n"
+ ".inst 0x4e80a4d1 // smmla v17.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e81a4b4 // smmla v20.4s, v5.16b, v1.16b\n"
+ ".inst 0x4e80a4b7 // smmla v23.4s, v5.16b, v0.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x4e81a49a // smmla v26.4s, v4.16b, v1.16b\n"
+ "ldr q1, [x22, #0x40]\n"
+ ".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x22, #0x50]\n"
+ ".inst 0x4e83a4e9 // smmla v9.4s, v7.16b, v3.16b\n"
+ ".inst 0x4e82a4ec // smmla v12.4s, v7.16b, v2.16b\n"
+ ".inst 0x4e83a4cf // smmla v15.4s, v6.16b, v3.16b\n"
+ ".inst 0x4e82a4d2 // smmla v18.4s, v6.16b, v2.16b\n"
+ "add x22, x22, #0x60\n"
+ ".inst 0x4e83a4b5 // smmla v21.4s, v5.16b, v3.16b\n"
+ ".inst 0x4e82a4b8 // smmla v24.4s, v5.16b, v2.16b\n"
+ ".inst 0x4e83a49b // smmla v27.4s, v4.16b, v3.16b\n"
+ ".inst 0x4e82a49e // smmla v30.4s, v4.16b, v2.16b\n"
+ ".inst 0x4e81a4ea // smmla v10.4s, v7.16b, v1.16b\n"
+ ".inst 0x4e80a4ed // smmla v13.4s, v7.16b, v0.16b\n"
+ ".inst 0x4e81a4d0 // smmla v16.4s, v6.16b, v1.16b\n"
+ ".inst 0x4e80a4d3 // smmla v19.4s, v6.16b, v0.16b\n"
+ ".inst 0x4e81a4b6 // smmla v22.4s, v5.16b, v1.16b\n"
+ ".inst 0x4e80a4b9 // smmla v25.4s, v5.16b, v0.16b\n"
+ ".inst 0x4e81a49c // smmla v28.4s, v4.16b, v1.16b\n"
+ ".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n"
+ "5:" // multiply loop done
+ "subs x23, x23, #0x1\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
+ );
}
} // namespace arm_gemm
-
#endif // __aarch64__
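
The rewrite above moves the whole ablocks/bblocks loop nest out of C++ and into a single inline-asm region: instead of per-tile `loops`/`tails` counters computed by the caller, the kernel now reads K, Bpanel and bblocks out of a small KernelArgs struct via `args_ptr` plus `offsetof(...)` immediates. A minimal sketch of that argument-passing pattern follows; the struct fields and the summation loop are simplified stand-ins, not the real kernel:

    #include <cstddef>
    #include <cstdint>

    // Sketch only: pass read-only parameters to inline asm through a struct,
    // loading them inside the asm with args_ptr + offsetof immediates, as the
    // rewritten kernels do.
    static int64_t sum_n(const int64_t *data, size_t n) {
        struct KernelArgs {
            size_t n = {};
            const int64_t *data = {};
        } ka;
        ka.n = n;
        ka.data = data;

        int64_t acc = 0;
        __asm__ __volatile__(
            "ldr x20, [%x[args_ptr], %[offsetof_n]]\n"    // load ka.n
            "ldr x21, [%x[args_ptr], %[offsetof_data]]\n" // load ka.data
            "cbz x20, 2f\n"
            "1:\n"                                        // element loop
            "ldr x22, [x21], #8\n"
            "add %x[acc], %x[acc], x22\n"
            "subs x20, x20, #1\n"
            "b.ne 1b\n"
            "2:\n"
            : [acc] "+&r" (acc)
            : [args_ptr] "r" (&ka),
              [offsetof_n] "I" (offsetof(KernelArgs, n)),
              [offsetof_data] "I" (offsetof(KernelArgs, data))
            : "cc", "memory", "x20", "x21", "x22");
        return acc;
    }

Note also how `ka.K = (K/8) - 1` peels one 8-deep mmla step out of the counter: the block after label 4 always runs once, and the cbz-guarded block mops up the single leftover step when the step count was even.
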
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
index 99dd0be0d9..0088557b8d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,52 +22,93 @@
* SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
-#include <cstdint>
#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const uint8_t *, const uint8_t *, \
+ uint32_t *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void a64_interleaved_u8u32_mmla_8x12(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_interleaved_u8u32_mmla_8x12( ARGLIST );
+void a64_interleaved_u8u32_mmla_8x12_a510( ARGLIST );
-class cls_a64_interleaved_u8u32_mmla_8x12 {
+class cls_a64_interleaved_u8u32_mmla_8x12
+{
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
- typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
- static unsigned int out_width()
+ static constexpr unsigned int out_height()
{
- return 12;
+ return 8;
}
- static unsigned int out_height()
+ static unsigned int out_width()
{
- return 8;
+ return 12;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 8;
}
- // Use the standard fixed size transforms.
+
StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {};
StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=a64_interleaved_u8u32_mmla_8x12;
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 62.58, 4.06, 8.02 };
+ case CPUModel::A510:
+ return { 47.83, 3.59, 3.72 };
+ case CPUModel::V1:
+ return { 111.52, 4.97, 10.80 };
+ }
+ }
- cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *)
- {
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 62.57, 4.10, 0.51 };
+ case CPUModel::A510:
+ return { 47.66, 2.47, 0.29 };
+ case CPUModel::V1:
+ return { 75.54, 8.06, 0.63 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_interleaved_u8u32_mmla_8x12;
+ cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A510:
+ kernel=a64_interleaved_u8u32_mmla_8x12_a510;
+ break;
+ }
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __aarch64__
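
Two things change in this header besides layout: the class now exposes per-CPU PerformanceParameters (rough throughput estimates the GEMM method selector can use to weigh this kernel against alternatives, with separate numbers for uint32 versus quantized uint8 output), and the constructor picks a micro-architecture-specific entry point at runtime. A condensed sketch of that constructor-time dispatch; CPUInfo/CPUModel here are shape-only stand-ins for the real types:

    #include <cstdint>

    enum class CPUModel { GENERIC, A510, V1 };
    struct CPUInfo {
        CPUModel model;
        CPUModel get_cpu_model() const { return model; }
    };

    using kern_type = void (*)(const uint8_t *, const uint8_t *,
                               uint32_t *, int, int, int);

    // Stand-ins; the real variants live in generic.cpp and a510.cpp.
    void kernel_generic(const uint8_t *, const uint8_t *, uint32_t *, int, int, int) {}
    void kernel_a510(const uint8_t *, const uint8_t *, uint32_t *, int, int, int) {}

    struct cls_example {
        kern_type kernel = kernel_generic;   // default to the generic kernel
        cls_example(const CPUInfo *ci) {
            switch (ci->get_cpu_model()) {
                case CPUModel::A510:
                    kernel = kernel_a510;    // swap in the Cortex-A510 tuning
                    break;
                default:
                    break;                   // keep the default
            }
        }
    };

The ARGLIST macro is pure deduplication: the same six-argument signature is declared once for every variant and for the kern_type typedef, then #undef'd at the end of the header.
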
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
new file mode 100644
index 0000000000..54c51954c8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_u8u32_mmla_8x12_a510(
+ const uint8_t *Apanel,
+ const uint8_t *Bpanel,
+ uint32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
+
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
+
+ __asm__ __volatile__(
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldp q4, q5, [x22], #0x20\n"
+ "mov %x[Apanel], x21\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "movi v8.4s, #0x0\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
+ "ldp q3, q7, [x22], #0x20\n"
+ ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a4da // ummla v26.4s, v6.16b, v4.16b\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x6e85a4dd // ummla v29.4s, v6.16b, v5.16b\n"
+ "ldp q4, q5, [x22], #0x20\n"
+ ".inst 0x6e83a409 // ummla v9.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e83a42f // ummla v15.4s, v1.16b, v3.16b\n"
+ ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e83a455 // ummla v21.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e83a4db // ummla v27.4s, v6.16b, v3.16b\n"
+ ".inst 0x6e87a4de // ummla v30.4s, v6.16b, v7.16b\n"
+ "ldp q7, q3, [x22], #0x20\n"
+ ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a4dc // ummla v28.4s, v6.16b, v4.16b\n"
+ ".inst 0x6e85a4df // ummla v31.4s, v6.16b, v5.16b\n"
+ "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "ldp q4, q5, [x22], #0x20\n"
+ ".inst 0x6e83a40b // ummla v11.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e87a42e // ummla v14.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e83a431 // ummla v17.4s, v1.16b, v3.16b\n"
+ ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e83a457 // ummla v23.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e87a4da // ummla v26.4s, v6.16b, v7.16b\n"
+ ".inst 0x6e83a4dd // ummla v29.4s, v6.16b, v3.16b\n"
+ "ldp q7, q3, [x22], #0x20\n"
+ ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a4db // ummla v27.4s, v6.16b, v4.16b\n"
+ ".inst 0x6e85a4de // ummla v30.4s, v6.16b, v5.16b\n"
+ "ldp q4, q5, [x22], #0x20\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e83a40d // ummla v13.4s, v0.16b, v3.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e87a430 // ummla v16.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e83a433 // ummla v19.4s, v1.16b, v3.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e83a459 // ummla v25.4s, v2.16b, v3.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e87a4dc // ummla v28.4s, v6.16b, v7.16b\n"
+ ".inst 0x6e83a4df // ummla v31.4s, v6.16b, v3.16b\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
+ "ldp q6, q7, [x22], #0x20\n"
+ ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
+ "ldp q5, q4, [x22], #0x20\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e85a40a // ummla v10.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e84a40d // ummla v13.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a430 // ummla v16.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a433 // ummla v19.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a456 // ummla v22.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a459 // ummla v25.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a47c // ummla v28.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e84a47f // ummla v31.4s, v3.16b, v4.16b\n"
+ "cbz x20, 5f\n"
+ "ldp q1, q0, [x22], #0x20\n"
+ "ld1 { v7.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e81a4e8 // ummla v8.4s, v7.16b, v1.16b\n"
+ "ld1 { v5.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n"
+ "ldp q3, q2, [x22], #0x20\n"
+ ".inst 0x6e81a4ce // ummla v14.4s, v6.16b, v1.16b\n"
+ ".inst 0x6e80a4d1 // ummla v17.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e81a4b4 // ummla v20.4s, v5.16b, v1.16b\n"
+ ".inst 0x6e80a4b7 // ummla v23.4s, v5.16b, v0.16b\n"
+ ".inst 0x6e81a49a // ummla v26.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n"
+ "ldp q1, q0, [x22], #0x20\n"
+ ".inst 0x6e83a4e9 // ummla v9.4s, v7.16b, v3.16b\n"
+ ".inst 0x6e82a4ec // ummla v12.4s, v7.16b, v2.16b\n"
+ ".inst 0x6e83a4cf // ummla v15.4s, v6.16b, v3.16b\n"
+ ".inst 0x6e82a4d2 // ummla v18.4s, v6.16b, v2.16b\n"
+ ".inst 0x6e83a4b5 // ummla v21.4s, v5.16b, v3.16b\n"
+ ".inst 0x6e82a4b8 // ummla v24.4s, v5.16b, v2.16b\n"
+ ".inst 0x6e83a49b // ummla v27.4s, v4.16b, v3.16b\n"
+ ".inst 0x6e82a49e // ummla v30.4s, v4.16b, v2.16b\n"
+ ".inst 0x6e81a4ea // ummla v10.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e80a4ed // ummla v13.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e81a4d0 // ummla v16.4s, v6.16b, v1.16b\n"
+ ".inst 0x6e80a4d3 // ummla v19.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e81a4b6 // ummla v22.4s, v5.16b, v1.16b\n"
+ ".inst 0x6e80a4b9 // ummla v25.4s, v5.16b, v0.16b\n"
+ ".inst 0x6e81a49c // ummla v28.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n"
+ "5:" // multiply loop done
+ "subs x23, x23, #0x1\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
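
The new a510.cpp above is not a different algorithm: the ummla schedule, the accumulator layout, and the uzp1/uzp2 store sequence are identical to generic.cpp below. What differs is load scheduling, presumably to suit the in-order Cortex-A510 pipeline: B-panel loads become post-incremented ldp pairs and A-panel loads post-incremented ld1, replacing the individually offset ldr loads plus explicit pointer adds of the generic version. Side by side, taken from the two files:

    // generic.cpp: separate offset loads, pointer bumped separately
    "ldr q4, [x22, #0x0]\n"
    "ldr q5, [x22, #0x10]\n"
    "add x22, x22, #0x20\n"

    // a510.cpp: one paired, post-incremented load
    "ldp q4, q5, [x22], #0x20\n"
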
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
index 238a703708..30260b9c29 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,395 +23,272 @@
*/
#ifdef __aarch64__
+#include <cstddef>
#include <cstdint>
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- const uint8_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
-
- K /= 8;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
-
- for (int yb=0; yb<ablocks; yb++) {
- const uint8_t *a_ptr0 = a_ptr;
- const uint8_t *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "movi v8.4s, #0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v9.4s, #0\n"
- "ldr q4, [%[b_ptr]]\n"
- "movi v10.4s, #0\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- "movi v11.4s, #0\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- "movi v12.4s, #0\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- "movi v13.4s, #0\n"
- "movi v14.4s, #0\n"
- "movi v15.4s, #0\n"
- "movi v16.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "movi v17.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
- "movi v18.4s, #0\n"
- "movi v19.4s, #0\n"
- "movi v20.4s, #0\n"
- "movi v21.4s, #0\n"
- "movi v22.4s, #0\n"
- "movi v23.4s, #0\n"
- "movi v24.4s, #0\n"
- "movi v25.4s, #0\n"
- "movi v26.4s, #0\n"
- "movi v27.4s, #0\n"
- "movi v28.4s, #0\n"
- "movi v29.4s, #0\n"
- "movi v30.4s, #0\n"
- "movi v31.4s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
-
- "ldp q6, q7, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
-
- ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
-
- ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
- "subs %[loops], %[loops], #0x1\n"
-
- ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
- ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
-
- "ldp q4, q5, [%[b_ptr]]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
-
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
-
- ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
-
- ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
-
- ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
- "ldp q6, q7, [%[b_ptr], #0x20]\n"
-
- ".inst 0x6e84a40c // ummla v12.4s, v0.16b, v4.16b\n"
- ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
- "ldr q0, [%[a_ptr]]\n"
-
- ".inst 0x6e84a432 // ummla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
-
- ".inst 0x6e84a458 // ummla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
-
- ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
- ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
-
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
-
- "ldp q4, q5, [%[b_ptr], #0x40]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
-
- ".inst 0x6e87a42f // ummla v15.4s, v1.16b, v7.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
-
- ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
-
- ".inst 0x6e87a47b // ummla v27.4s, v3.16b, v7.16b\n"
- "add %[b_ptr], %[b_ptr], #0xc0\n"
- "ldp q6, q7, [%[b_ptr], #-0x60]\n"
-
- ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
-
- ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
-
- ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
-
- ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
-
- "ldp q4, q5, [%[b_ptr], #-0x40]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
-
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x6e86a432 // ummla v18.4s, v1.16b, v6.16b\n"
-
- ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x6e86a458 // ummla v24.4s, v2.16b, v6.16b\n"
-
- ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n"
-
- ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
- "b.ne 2b\n"
-
- "1:\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- "cbz %[tails], 3f\n"
- ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
- ".inst 0x6e84a40c // ummla v12.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a432 // ummla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a458 // ummla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #0x40]\n"
- ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x50]\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- "add %[b_ptr], %[b_ptr], #0xe0\n"
- ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x80]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a42f // ummla v15.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a47b // ummla v27.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x70]\n"
- ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x60]\n"
- ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
- ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x50]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a432 // ummla v18.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a458 // ummla v24.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x40]\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x30]\n"
- ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
- ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
- "uzp1 v6.2d, v14.2d, v15.2d\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
- ".inst 0x6e84a40c // ummla v12.4s, v0.16b, v4.16b\n"
- "uzp1 v7.2d, v16.2d, v17.2d\n"
- ".inst 0x6e84a432 // ummla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a458 // ummla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
- "uzp2 v4.2d, v10.2d, v11.2d\n"
- ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
- "uzp1 v0.2d, v8.2d, v9.2d\n"
- ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
- "uzp1 v1.2d, v10.2d, v11.2d\n"
- ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
- "str q0, [%[c_ptr]]\n"
- "uzp1 v2.2d, v12.2d, v13.2d\n"
- "uzp1 v0.2d, v18.2d, v19.2d\n"
- ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
- "str q1, [%[c_ptr], #0x10]\n"
- "uzp2 v3.2d, v8.2d, v9.2d\n"
- "uzp2 v5.2d, v12.2d, v13.2d\n"
- "uzp2 v1.2d, v14.2d, v15.2d\n"
- "str q2, [%[c_ptr], #0x20]\n"
- "b 4f\n"
- "3:\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
- ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x80]\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x70]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x60]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x50]\n"
- ".inst 0x6e84a40c // ummla v12.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a432 // ummla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a458 // ummla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x40]\n"
- ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x30]\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e87a42f // ummla v15.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a47b // ummla v27.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
- ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
- ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
- "uzp2 v4.2d, v10.2d, v11.2d\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a432 // ummla v18.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a458 // ummla v24.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n"
- "uzp1 v6.2d, v14.2d, v15.2d\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
- "uzp1 v0.2d, v8.2d, v9.2d\n"
- ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
- "uzp1 v1.2d, v10.2d, v11.2d\n"
- "uzp2 v5.2d, v12.2d, v13.2d\n"
- "str q0, [%[c_ptr]]\n"
- ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
- "uzp1 v2.2d, v12.2d, v13.2d\n"
- "uzp1 v0.2d, v18.2d, v19.2d\n"
- "str q1, [%[c_ptr], #0x10]\n"
- "uzp2 v1.2d, v14.2d, v15.2d\n"
- ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
- "uzp2 v3.2d, v8.2d, v9.2d\n"
- "str q2, [%[c_ptr], #0x20]\n"
- "uzp1 v7.2d, v16.2d, v17.2d\n"
- "4:\n"
- "uzp2 v2.2d, v16.2d, v17.2d\n"
- "str q3, [%[c_ptr], #0x30]\n"
- "uzp2 v3.2d, v18.2d, v19.2d\n"
- "str q4, [%[c_ptr], #0x40]\n"
- "uzp1 v4.2d, v20.2d, v21.2d\n"
- "str q5, [%[c_ptr], #0x50]\n"
- "uzp1 v5.2d, v22.2d, v23.2d\n"
- "str q6, [%[c_ptr], #0x60]\n"
- "uzp1 v6.2d, v24.2d, v25.2d\n"
- "str q7, [%[c_ptr], #0x70]\n"
- "uzp2 v7.2d, v20.2d, v21.2d\n"
- "str q0, [%[c_ptr], #0x80]\n"
- "uzp2 v0.2d, v22.2d, v23.2d\n"
- "str q1, [%[c_ptr], #0x90]\n"
- "uzp2 v1.2d, v24.2d, v25.2d\n"
- "str q2, [%[c_ptr], #0xa0]\n"
- "uzp1 v2.2d, v26.2d, v27.2d\n"
- "str q3, [%[c_ptr], #0xb0]\n"
- "uzp1 v3.2d, v28.2d, v29.2d\n"
- "str q4, [%[c_ptr], #0xc0]\n"
- "uzp1 v4.2d, v30.2d, v31.2d\n"
- "str q5, [%[c_ptr], #0xd0]\n"
- "uzp2 v5.2d, v26.2d, v27.2d\n"
- "str q6, [%[c_ptr], #0xe0]\n"
- "uzp2 v6.2d, v28.2d, v29.2d\n"
- "str q7, [%[c_ptr], #0xf0]\n"
- "uzp2 v7.2d, v30.2d, v31.2d\n"
- "str q0, [%[c_ptr], #0x100]\n"
- "str q1, [%[c_ptr], #0x110]\n"
- "str q2, [%[c_ptr], #0x120]\n"
- "str q3, [%[c_ptr], #0x130]\n"
- "str q4, [%[c_ptr], #0x140]\n"
- "str q5, [%[c_ptr], #0x150]\n"
- "str q6, [%[c_ptr], #0x160]\n"
- "str q7, [%[c_ptr], #0x170]\n"
- "add %[c_ptr], %[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
+void a64_interleaved_u8u32_mmla_8x12(
+ const uint8_t *Apanel,
+ const uint8_t *Bpanel,
+ uint32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
+
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
+
+ __asm__ __volatile__(
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x22, #0x10]\n"
+ "mov %x[Apanel], x21\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v8.4s, #0x0\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "add x22, x22, #0x20\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ldr q6, [%x[Apanel], #0x0]\n"
+ "ldr q7, [x22, #0x0]\n"
+ ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ "ldr q3, [x22, #0x10]\n"
+ ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a4da // ummla v26.4s, v6.16b, v4.16b\n"
+ "ldr q4, [x22, #0x20]\n"
+ ".inst 0x6e85a4dd // ummla v29.4s, v6.16b, v5.16b\n"
+ "ldr q5, [x22, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e83a40c // ummla v12.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e87a42f // ummla v15.4s, v1.16b, v7.16b\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x6e83a432 // ummla v18.4s, v1.16b, v3.16b\n"
+ ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e83a458 // ummla v24.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e87a4db // ummla v27.4s, v6.16b, v7.16b\n"
+ "ldr q7, [x22, #0x40]\n"
+ ".inst 0x6e83a4de // ummla v30.4s, v6.16b, v3.16b\n"
+ "ldr q3, [x22, #0x50]\n"
+ ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
+ "ldr q0, [%x[Apanel], #0x10]\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ "ldr q1, [%x[Apanel], #0x20]\n"
+ ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
+ "ldr q2, [%x[Apanel], #0x30]\n"
+ ".inst 0x6e84a4dc // ummla v28.4s, v6.16b, v4.16b\n"
+ "ldr q4, [x22, #0x60]\n"
+ ".inst 0x6e85a4df // ummla v31.4s, v6.16b, v5.16b\n"
+ "ldr q6, [%x[Apanel], #0x40]\n"
+ "ldr q5, [x22, #0x70]\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e83a40b // ummla v11.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e87a42e // ummla v14.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e83a431 // ummla v17.4s, v1.16b, v3.16b\n"
+ ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e83a457 // ummla v23.4s, v2.16b, v3.16b\n"
+ ".inst 0x6e87a4da // ummla v26.4s, v6.16b, v7.16b\n"
+ "ldr q7, [x22, #0x80]\n"
+ ".inst 0x6e83a4dd // ummla v29.4s, v6.16b, v3.16b\n"
+ "ldr q3, [x22, #0x90]\n"
+ ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a4db // ummla v27.4s, v6.16b, v4.16b\n"
+ "ldr q4, [x22, #0xa0]\n"
+ ".inst 0x6e85a4de // ummla v30.4s, v6.16b, v5.16b\n"
+ "ldr q5, [x22, #0xb0]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e83a40d // ummla v13.4s, v0.16b, v3.16b\n"
+ "ldr q0, [%x[Apanel], #0x50]\n"
+ ".inst 0x6e87a430 // ummla v16.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e83a433 // ummla v19.4s, v1.16b, v3.16b\n"
+ "ldr q1, [%x[Apanel], #0x60]\n"
+ ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e83a459 // ummla v25.4s, v2.16b, v3.16b\n"
+ "ldr q2, [%x[Apanel], #0x70]\n"
+ ".inst 0x6e87a4dc // ummla v28.4s, v6.16b, v7.16b\n"
+ ".inst 0x6e83a4df // ummla v31.4s, v6.16b, v3.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "add x22, x22, #0xc0\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ldr q3, [%x[Apanel], #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ "ldr q7, [x22, #0x10]\n"
+ ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ "ldr q4, [x22, #0x20]\n"
+ ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
+ "ldr q5, [x22, #0x30]\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ "add x22, x22, #0x40\n"
+ ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
+ "cbz x20, 5f\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q7, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e81a4e8 // ummla v8.4s, v7.16b, v1.16b\n"
+ "ldr q6, [%x[Apanel], #0x10]\n"
+ "ldr q0, [x22, #0x10]\n"
+ ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n"
+ "ldr q5, [%x[Apanel], #0x20]\n"
+ "ldr q4, [%x[Apanel], #0x30]\n"
+ ".inst 0x6e81a4ce // ummla v14.4s, v6.16b, v1.16b\n"
+ "ldr q3, [x22, #0x20]\n"
+ "ldr q2, [x22, #0x30]\n"
+ ".inst 0x6e80a4d1 // ummla v17.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e81a4b4 // ummla v20.4s, v5.16b, v1.16b\n"
+ ".inst 0x6e80a4b7 // ummla v23.4s, v5.16b, v0.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x6e81a49a // ummla v26.4s, v4.16b, v1.16b\n"
+ "ldr q1, [x22, #0x40]\n"
+ ".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n"
+ "ldr q0, [x22, #0x50]\n"
+ ".inst 0x6e83a4e9 // ummla v9.4s, v7.16b, v3.16b\n"
+ ".inst 0x6e82a4ec // ummla v12.4s, v7.16b, v2.16b\n"
+ ".inst 0x6e83a4cf // ummla v15.4s, v6.16b, v3.16b\n"
+ ".inst 0x6e82a4d2 // ummla v18.4s, v6.16b, v2.16b\n"
+ "add x22, x22, #0x60\n"
+ ".inst 0x6e83a4b5 // ummla v21.4s, v5.16b, v3.16b\n"
+ ".inst 0x6e82a4b8 // ummla v24.4s, v5.16b, v2.16b\n"
+ ".inst 0x6e83a49b // ummla v27.4s, v4.16b, v3.16b\n"
+ ".inst 0x6e82a49e // ummla v30.4s, v4.16b, v2.16b\n"
+ ".inst 0x6e81a4ea // ummla v10.4s, v7.16b, v1.16b\n"
+ ".inst 0x6e80a4ed // ummla v13.4s, v7.16b, v0.16b\n"
+ ".inst 0x6e81a4d0 // ummla v16.4s, v6.16b, v1.16b\n"
+ ".inst 0x6e80a4d3 // ummla v19.4s, v6.16b, v0.16b\n"
+ ".inst 0x6e81a4b6 // ummla v22.4s, v5.16b, v1.16b\n"
+ ".inst 0x6e80a4b9 // ummla v25.4s, v5.16b, v0.16b\n"
+ ".inst 0x6e81a49c // ummla v28.4s, v4.16b, v1.16b\n"
+ ".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n"
+ "5:" // multiply loop done
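+ // Each ummla accumulator holds a 2x2 tile of 32-bit results, so the
+ // uzp1/uzp2 pairs below de-interleave tile rows into contiguous rows
+ // of the C panel before the stores.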
+ "subs x23, x23, #0x1\n"
+ "uzp1 v0.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v1.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q0, [%x[Cpanel], #0x0]\n"
+ "uzp1 v0.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q1, [%x[Cpanel], #0x10]\n"
+ "str q0, [%x[Cpanel], #0x20]\n"
+ "uzp1 v0.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v2.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v17.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v1.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q0, [%x[Cpanel], #0x60]\n"
+ "uzp1 v0.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q2, [%x[Cpanel], #0x70]\n"
+ "uzp1 v23.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q17, [%x[Cpanel], #0x80]\n"
+ "uzp1 v19.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v18.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v17.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q1, [%x[Cpanel], #0xc0]\n"
+ "str q0, [%x[Cpanel], #0xd0]\n"
+ "str q23, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q19, [%x[Cpanel], #0x120]\n"
+ "str q18, [%x[Cpanel], #0x130]\n"
+ "str q17, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
+ );
}
} // namespace arm_gemm
-
-#endif // __aarch64__
\ No newline at end of file
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
index f327e84861..19acfe8ae9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,9 +25,11 @@
#ifdef __aarch64__
-#include "../std_transforms_fixed.hpp"
+#include "../std_transforms_fixed_trB.hpp"
#include "../performance_parameters.hpp"
+#include "../bfloat.hpp"
+
namespace arm_gemm {
// Actual kernel implementations
@@ -66,21 +68,37 @@ public:
}
// Use the standard fixed size transforms.
- StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
+ StdTransformsFixedTRB<operand_type, result_type, 8, 12> transforms = {};
+ template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 3.954, 1.252, 1.141 };
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 3.954, 1.252, 1.141 };
- case CPUModel::A53:
- return { 2.777, 0.987, 0.898 };
+ case CPUModel::A53:
+ return { 2.777, 0.987, 0.898 };
- case CPUModel::A73:
- return { 2.885, 1.429, 1.163 };
+ case CPUModel::A73:
+ return { 2.885, 1.429, 1.163 };
- default:
- return { 7.2307, 3.876, 2.932 };
+ case CPUModel::V1:
+ return { 14.95, 9.95, 5.28 };
+
+ default:
+ return { 7.2307, 3.876, 2.932 };
+ }
+ }
+
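+        // Performance parameters are now selected per operand type T: the
+        // float table is kept, and a bfloat16 table is added (presumably for
+        // fast-mode bf16 reuse of this 8x12 kernel via the transposed-B
+        // transforms included above).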
+ if (std::is_same<T, bfloat16>::value) {
+ switch(ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 4.98, 2.27, 3.05 };
+
+ default:
+ return { 7.99, 5.06, 7.32 };
+ }
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
index 2e87a47036..52548b462c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
@@ -24,7 +24,6 @@
#ifdef __aarch64__
#include <algorithm>
-#include <limits>
#include "arm_gemm.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
index ca4a44a2c7..deaef27ee9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
@@ -24,7 +24,6 @@
#ifdef __aarch64__
#include <algorithm>
-#include <limits>
#include "arm_gemm.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp
new file mode 100644
index 0000000000..76f43f0933
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "../std_transforms_sme.hpp"
+#include "../bfloat.hpp"
+
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, \
+ float *, size_t, size_t, \
+ const float *, Activation, bool
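+
+// ARGLIST mirrors the kernel entry point in generic.cpp: A panel, B panel,
+// output pointer, N, K, then bias, activation and an accumulate flag (the
+// flag is ignored, since supports_accumulate() is false).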
+
+namespace arm_gemm
+{
+void sme2_gemv_bf16fp32_dot_16VL( ARGLIST );
+
+class cls_sme2_gemv_bf16fp32_dot_16VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 2> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_bf16fp32_dot_16VL;
+ cls_sme2_gemv_bf16fp32_dot_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
new file mode 100644
index 0000000000..db29e42ef1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
@@ -0,0 +1,553 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sme2_gemv_bf16fp32_dot_16VL (
+ const bfloat16 *A_ptr, const bfloat16 *B_ptr, float *output_ptr,
+ size_t N, size_t K,
+ const float *bias, Activation act, bool
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
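+    // Bit 1 of flags tells the assembly below to clamp the output: the tbz
+    // tests fall through to load minval/maxval via args_ptr and apply fclamp
+    // before each store.
+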
+ __asm__ __volatile__(
+ "ptrue p8.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x28, ALL, MUL #4\n"
+ "add x27, %x[N], x28\n"
+ "sub x27, x27, #0x1\n"
+ "udiv x27, x27, x28\n"
+ "add x22, x27, #0x3\n"
+ "and x22, x22, #0xfffffffffffffffc\n"
+ "mul x22, x22, x28\n"
+ "mul x22, x22, %x[K]\n"
+ "mov x9, #0x0\n"
+ "mov x26, %x[B_ptr]\n"
+ "mov x25, %x[output_ptr]\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "lsl x22, x22, #0x1\n"
+ "mov x21, #0x1\n"
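+      // The loop below sizes a range prefetch of the whole RHS panel: the
+      // byte count in x22 is halved until it drops below 0x200000, then the
+      // count and reuse fields are packed into x22 for the rprfm pldonce hint.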
+ "1:" // RHS size check loop
+ "cmp x22, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x22, #0, 3f\n"
+ "lsr x22, x22, #0x1\n"
+ "lsl x21, x21, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x20, x22, #0x26\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x21, x21, #0x16\n"
+ "orr x22, x22, x20\n"
+ "orr x22, x22, x21\n"
+ ".inst 0xf8b64b5a // rprfm pldonce, x22, [x26]\n"
+ "3:" // RHS prefetch exit
+ "mov x24, %x[bias]\n"
+ "4:" // Column loop
+ "cmp x27, #0x4\n"
+ "bge 28f\n"
+ "cmp x27, #0x2\n"
+ "bgt 20f\n"
+ "beq 12f\n"
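+      // Column-loop dispatch: up to four 4-vector-wide output blocks per
+      // pass, branching to the Width 4/3/2 paths; Width 1 falls through.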
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "mov x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
+ "cbz x24, 5f\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
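+      // ZA now holds either the bias (loaded above) or zeros; the bfdot
+      // loops that follow accumulate the dot products into it.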
+ "6:" // Width 1: setup done
+ "cmp x22, #0x8\n"
+ "ble 8f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z8.h }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x8\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b298 // bfdot za.s[x9, 0], { z20.h-z23.h }, z8.h[0]\n"
+ "addvl x26, x26, #16\n"
+ "cmp x22, #0x8\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b498 // bfdot za.s[x9, 0], { z4.h-z7.h }, z8.h[1]\n"
+ "addvl x26, x26, #16\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158bb98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z8.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158bf18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z8.h[3]\n"
+ "addvl x26, x26, #16\n"
+ "bgt 7b\n"
+ "8:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc15bb398 // bfdot za.s[x9, 0], { z28.h-z31.h }, z11.h[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bb598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bbc18 // bfdot za.s[x9, 0], { z0.h-z3.h }, z11.h[3]\n"
+ "addvl x26, x26, #16\n"
+ "9:" // Width 1: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 10f\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z3.s }, p1/Z, [x21]\n"
+ "ld1rw { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc1bdc868 // fclamp { z8.s-z11.s }, z3.s, z29.s\n"
+ ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n"
+ "addvl x25, x25, #4\n"
+ "b 11f\n"
+ "10:" // Width 1: No activation
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c32c // st1w { z12.s-z15.s }, p8, [x25]\n"
+ "addvl x25, x25, #4\n"
+ "11:" // Width 1: Output done
+ "b 36f\n"
+ "12:" // Width 2
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "sub x20, %x[N], x28\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
+ "cbz x24, 13f\n"
+ ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ "b 14f\n"
+ "13:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "14:" // Width 2: setup done
+ "cmp x22, #0x8\n"
+ "ble 16f\n"
+ "15:" // Width 2: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z9.h }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x8\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159b198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z9.h[0]\n"
+ "cmp x22, #0x8\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159b099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z9.h[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159b598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z9.h[1]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z9.h[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159bb18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z9.h[2]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159b819 // bfdot za.s[x9, 1], { z0.h-z3.h }, z9.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159bc18 // bfdot za.s[x9, 0], { z0.h-z3.h }, z9.h[3]\n"
+ ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159bf99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z9.h[3]\n"
+ "addvl x26, x26, #16\n"
+ "bgt 15b\n"
+ "16:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc15bb198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[0]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bb718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z11.h[1]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb419 // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bb998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[2]\n"
+ ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bbb99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z11.h[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[3]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bbe99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[3]\n"
+ "addvl x26, x26, #16\n"
+ "17:" // Width 2: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 18f\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ "ld1rw { z9.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ "ld1rw { z8.s }, p1/Z, [x20]\n"
+ ".inst 0xc1a8c920 // fclamp { z0.s-z3.s }, z9.s, z8.s\n"
+ ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+ ".inst 0xc1a8c924 // fclamp { z4.s-z7.s }, z9.s, z8.s\n"
+ ".inst 0xa061c324 // st1w { z4.s-z7.s }, p8, [x25, #0x4, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "b 19f\n"
+ "18:" // Width 2: No activation
+ ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c730 // st1w { z16.s-z19.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "19:" // Width 2: Output done
+ "b 36f\n"
+ "20:" // Width 3
+ "mov x20, #0x2\n"
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
+ "cbz x24, 21f\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
+ ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n"
+ ".inst 0xa042c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
+ "b 22f\n"
+ "21:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "22:" // Width 3: setup done
+ "cmp x22, #0x8\n"
+ "ble 24f\n"
+ "23:" // Width 3: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z15.h }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x8\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fb018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z15.h[0]\n"
+ "cmp x22, #0x8\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[0]\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb01a // bfdot za.s[x9, 2], { z0.h-z3.h }, z15.h[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fb698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z15.h[1]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb699 // bfdot za.s[x9, 1], { z20.h-z23.h }, z15.h[1]\n"
+ ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb51a // bfdot za.s[x9, 2], { z8.h-z11.h }, z15.h[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbb18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[2]\n"
+ ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb919 // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[2]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[3]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fbe19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[3]\n"
+ ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fbd1a // bfdot za.s[x9, 2], { z8.h-z11.h }, z15.h[3]\n"
+ "addvl x26, x26, #16\n"
+ "bgt 23b\n"
+ "24:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc15bb398 // bfdot za.s[x9, 0], { z28.h-z31.h }, z11.h[0]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[0]\n"
+ ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bb29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z11.h[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bb598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[1]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z11.h[1]\n"
+ ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bb79a // bfdot za.s[x9, 2], { z28.h-z31.h }, z11.h[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bb898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z11.h[2]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[2]\n"
+ ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bb99a // bfdot za.s[x9, 2], { z12.h-z15.h }, z11.h[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bbd98 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[3]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bbe99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[3]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bbe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[3]\n"
+ "addvl x26, x26, #16\n"
+ "25:" // Width 3: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 26f\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
+ "ld1rw { z17.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c28 // mova { z8.d-z11.d }, za.d[x9, #1]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b0ca28 // fclamp { z8.s-z11.s }, z17.s, z16.s\n"
+ ".inst 0xa061c728 // st1w { z8.s-z11.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
+ "addvl x25, x25, #12\n"
+ "b 27f\n"
+ "26:" // Width 3: No activation
+ ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c734 // st1w { z20.s-z23.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
+ "addvl x25, x25, #12\n"
+ "27:" // Width 3: Output done
+ "b 36f\n"
+ "28:" // Width 4
+ "mov x20, #0x3\n"
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
+ "cbz x24, 29f\n"
+ ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042d82 // mova za.d[x9, #2], { z12.d-z15.d }\n"
+ ".inst 0xa043c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x24, x24, #16\n"
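+      // Width 4 is the only path that loops back to the column loop, so the
+      // bias pointer advances past the four blocks consumed here.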
+ "b 30f\n"
+ "29:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "30:" // Width 4: setup done
+ "cmp x22, #0x8\n"
+ "ble 32f\n"
+ "31:" // Width 4: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z8.h }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x8\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z8.h[0]\n"
+ "cmp x22, #0x8\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158b199 // bfdot za.s[x9, 1], { z12.h-z15.h }, z8.h[0]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[0]\n"
+ ".inst 0xa043a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158b19b // bfdot za.s[x9, 3], { z12.h-z15.h }, z8.h[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z8.h[1]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158b699 // bfdot za.s[x9, 1], { z20.h-z23.h }, z8.h[1]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158b61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[1]\n"
+ ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158b69b // bfdot za.s[x9, 3], { z20.h-z23.h }, z8.h[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158b898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z8.h[2]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158ba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z8.h[2]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[2]\n"
+ ".inst 0xa043a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158b81b // bfdot za.s[x9, 3], { z0.h-z3.h }, z8.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158be98 // bfdot za.s[x9, 0], { z20.h-z23.h }, z8.h[3]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158be19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z8.h[3]\n"
+ ".inst 0xa042a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158bc9a // bfdot za.s[x9, 2], { z4.h-z7.h }, z8.h[3]\n"
+ ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158be9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z8.h[3]\n"
+ "addvl x26, x26, #16\n"
+ "bgt 31b\n"
+ "32:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x22\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc15bb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[0]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb299 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[0]\n"
+ ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bb39a // bfdot za.s[x9, 2], { z28.h-z31.h }, z11.h[0]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15bb21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bb418 // bfdot za.s[x9, 0], { z0.h-z3.h }, z11.h[1]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bb619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z11.h[1]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bb61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[1]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15bb61b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15bba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[2]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z11.h[2]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[2]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15bba1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[3]\n"
+ ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bbf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z11.h[3]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bbe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[3]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15bbe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[3]\n"
+ "addvl x26, x26, #16\n"
+ "33:" // Width 4: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 34f\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ "ld1rw { z21.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n"
+ "ld1rw { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c70 // mova { z16.d-z19.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c738 // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xa062c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xa063c330 // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n"
+ "addvl x25, x25, #16\n"
+ "b 35f\n"
+ "34:" // Width 4: No activation
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c64 // mova { z4.d-z7.d }, za.d[x9, #3]\n"
+ ".inst 0xa063c324 // st1w { z4.s-z7.s }, p8, [x25, #0xc, MUL VL]\n"
+ "addvl x25, x25, #16\n"
+ "35:" // Width 4: Output done
+ "subs x27, x27, #0x4\n"
+ "sub %x[N], %x[N], x28, LSL #2\n"
+ "bgt 4b\n"
+ "36:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p8.b\n"
+ : [N] "+&r" (N)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL.hpp
new file mode 100644
index 0000000000..50013e581c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "../std_transforms_sme.hpp"
+
+#define ARGLIST \
+ const __fp16 *, const __fp16 *, \
+ __fp16 *, size_t, size_t, \
+ const __fp16 *, Activation, bool
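+
+// Operands and results are __fp16, but as the fp16fp32fp16 name suggests the
+// kernel accumulates in fp32 ZA, converting on load and store (see
+// generic.cpp).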
+
+namespace arm_gemm
+{
+void sme2_gemv_fp16fp32fp16_dot_16VL( ARGLIST );
+
+class cls_sme2_gemv_fp16fp32fp16_dot_16VL
+{
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 2> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_fp16fp32fp16_dot_16VL;
+ cls_sme2_gemv_fp16fp32fp16_dot_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL/generic.cpp
new file mode 100644
index 0000000000..97c2427617
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp16fp32fp16_dot_16VL/generic.cpp
@@ -0,0 +1,666 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sme2_gemv_fp16fp32fp16_dot_16VL (
+ const __fp16 *A_ptr, const __fp16 *B_ptr, __fp16 *output_ptr,
+ size_t N, size_t K,
+ const __fp16 *bias, Activation act, bool
+)
+{
+ struct KernelArgs {
+ __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ const __fp16 *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p8.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntw x28, ALL, MUL #4\n"
+ "mov x27, %x[B_ptr]\n"
+ "add x26, %x[N], x28\n"
+ "mov x25, %x[output_ptr]\n"
+ "sub x26, x26, #0x1\n"
+ "ptrue p1.b\n"
+ "udiv x26, x26, x28\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "add x22, x26, #0x3\n"
+ "mov x21, #0x1\n"
+ "and x22, x22, #0xfffffffffffffffc\n"
+ "mul x22, x22, x28\n"
+ "mul x22, x22, %x[K]\n"
+ "lsl x22, x22, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x22, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x22, #0, 3f\n"
+ "lsr x22, x22, #0x1\n"
+ "lsl x21, x21, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x20, x22, #0x26\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x21, x21, #0x16\n"
+ "orr x22, x22, x20\n"
+ "orr x22, x22, x21\n"
+ ".inst 0xf8b64b7a // rprfm pldonce, x22, [x27]\n"
+ "3:" // RHS prefetch exit
+ "mov x24, %x[bias]\n"
+ "4:" // Column loop
+ "cmp x26, #0x4\n"
+ "bge 28f\n"
+ "cmp x26, #0x2\n"
+ "bgt 20f\n"
+ "beq 12f\n"
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "mov x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x257447f0 // whilelt p8.h, XZR, x20, VLx2\n"
+ "cbz x24, 5f\n"
+ "ld1h { z20.s }, p1/Z, [x24]\n"
+ "addvl x20, x24, #4\n"
+ "ld1h { z21.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z22.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z23.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "fcvt z20.s, p1/m, z20.h\n"
+ "fcvt z21.s, p1/m, z21.h\n"
+ "fcvt z22.s, p1/m, z22.h\n"
+ "fcvt z23.s, p1/m, z23.h\n"
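+      // The __fp16 bias is loaded as unpacked halfwords and widened to fp32
+      // (fcvt above) before seeding the ZA accumulators.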
+ ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x22, #0x8\n"
+ "ble 8f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x22\n"
+ ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
+ "addvl x27, x27, #16\n"
+ "ld1rqh { z0.h }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x8\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
+ "addvl x27, x27, #16\n"
+ "cmp x22, #0x8\n"
+ ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc150b208 // fdot za.s[x9, 0], { z16.h-z19.h }, z0.h[0]\n"
+ ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc150b788 // fdot za.s[x9, 0], { z28.h-z31.h }, z0.h[1]\n"
+ ".inst 0xc150bb08 // fdot za.s[x9, 0], { z24.h-z27.h }, z0.h[2]\n"
+ ".inst 0xc150bc88 // fdot za.s[x9, 0], { z4.h-z7.h }, z0.h[3]\n"
+ "bgt 7b\n"
+ "8:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x22\n"
+ ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x2\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bb308 // fdot za.s[x9, 0], { z24.h-z27.h }, z11.h[0]\n"
+ "ble 9f\n"
+ ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x2\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bb708 // fdot za.s[x9, 0], { z24.h-z27.h }, z11.h[1]\n"
+ "ble 9f\n"
+ ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x2\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bbb08 // fdot za.s[x9, 0], { z24.h-z27.h }, z11.h[2]\n"
+ "ble 9f\n"
+ ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bbf88 // fdot za.s[x9, 0], { z28.h-z31.h }, z11.h[3]\n"
+ "9:" // Width 1: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 10f\n"
+ ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z29.h }, p1/Z, [x21]\n"
+ "ld1rh { z20.h }, p1/Z, [x20]\n"
+ ".inst 0xc120e204 // fcvt z4.h, { z16.s-z17.s }\n"
+ ".inst 0xc120e245 // fcvt z5.h, { z18.s-z19.s }\n"
+ ".inst 0xc174c3a4 // fclamp { z4.h-z5.h }, z29.h, z20.h\n"
+ ".inst 0xa0602324 // st1h { z4.h-z5.h }, p8, [x25]\n"
+ "addvl x25, x25, #2\n"
+ "b 11f\n"
+ "10:" // Width 1: No activation
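+      // fp32 ZA results are narrowed back to __fp16 in register pairs (fcvt)
+      // and stored with st1h.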
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ ".inst 0xc120e012 // fcvt z18.h, { z0.s-z1.s }\n"
+ ".inst 0xc120e05a // fcvt z26.h, { z2.s-z3.s }\n"
+ ".inst 0xa1602332 // st1h { z18.h, z26.h }, p8, [x25]\n"
+ "addvl x25, x25, #2\n"
+ "11:" // Width 1: Output done
+ "b 36f\n"
+ "12:" // Width 2
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "sub x20, %x[N], x28\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x257447f0 // whilelt p8.h, XZR, x20, VLx2\n"
+ "cbz x24, 13f\n"
+ "ld1h { z12.s }, p1/Z, [x24]\n"
+ "addvl x20, x24, #4\n"
+ "ld1h { z13.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "fcvt z12.s, p1/m, z12.h\n"
+ "ld1h { z28.s }, p1/Z, [x24, #4, MUL VL]\n"
+ "fcvt z13.s, p1/m, z13.h\n"
+ "ld1h { z29.s }, p1/Z, [x24, #5, MUL VL]\n"
+ "fcvt z14.s, p1/m, z14.h\n"
+ "ld1h { z30.s }, p1/Z, [x24, #6, MUL VL]\n"
+ "fcvt z15.s, p1/m, z15.h\n"
+ "ld1h { z31.s }, p1/Z, [x24, #7, MUL VL]\n"
+ "fcvt z28.s, p1/m, z28.h\n"
+ "fcvt z29.s, p1/m, z29.h\n"
+ "fcvt z30.s, p1/m, z30.h\n"
+ "fcvt z31.s, p1/m, z31.h\n"
+ ".inst 0xc0042d80 // mova za.d[x9, #0], { z12.d-z15.d }\n"
+ ".inst 0xc0042f81 // mova za.d[x9, #1], { z28.d-z31.d }\n"
+ "b 14f\n"
+ "13:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "14:" // Width 2: setup done
+ "cmp x22, #0x8\n"
+ "ble 16f\n"
+ "15:" // Width 2: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x22\n"
+ ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
+ "sub x22, x22, #0x8\n"
+ "ld1rqh { z8.h }, p0/Z, [x23]\n"
+ "cmp x22, #0x8\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa041a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xc158b088 // fdot za.s[x9, 0], { z4.h-z7.h }, z8.h[0]\n"
+ ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc158b009 // fdot za.s[x9, 1], { z0.h-z3.h }, z8.h[0]\n"
+ ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa041a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xc158b608 // fdot za.s[x9, 0], { z16.h-z19.h }, z8.h[1]\n"
+ ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc158b589 // fdot za.s[x9, 1], { z12.h-z15.h }, z8.h[1]\n"
+ ".inst 0xc158bb08 // fdot za.s[x9, 0], { z24.h-z27.h }, z8.h[2]\n"
+ ".inst 0xc158b809 // fdot za.s[x9, 1], { z0.h-z3.h }, z8.h[2]\n"
+ ".inst 0xc158bc88 // fdot za.s[x9, 0], { z4.h-z7.h }, z8.h[3]\n"
+ ".inst 0xc158bf89 // fdot za.s[x9, 1], { z28.h-z31.h }, z8.h[3]\n"
+ "bgt 15b\n"
+ "16:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x22\n"
+ ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x2\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bb088 // fdot za.s[x9, 0], { z4.h-z7.h }, z11.h[0]\n"
+ ".inst 0xc15bb189 // fdot za.s[x9, 1], { z12.h-z15.h }, z11.h[0]\n"
+ "ble 17f\n"
+ ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bb608 // fdot za.s[x9, 0], { z16.h-z19.h }, z11.h[1]\n"
+ ".inst 0xc15bb689 // fdot za.s[x9, 1], { z20.h-z23.h }, z11.h[1]\n"
+ "ble 17f\n"
+ ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bb988 // fdot za.s[x9, 0], { z12.h-z15.h }, z11.h[2]\n"
+ ".inst 0xc15bba89 // fdot za.s[x9, 1], { z20.h-z23.h }, z11.h[2]\n"
+ "ble 17f\n"
+ ".inst 0xa040a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bbc08 // fdot za.s[x9, 0], { z0.h-z3.h }, z11.h[3]\n"
+ ".inst 0xc15bbf09 // fdot za.s[x9, 1], { z24.h-z27.h }, z11.h[3]\n"
+ "17:" // Width 2: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 18f\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c3c // mova { z28.d-z31.d }, za.d[x9, #1]\n"
+ "ld1rh { z5.h }, p1/Z, [x21]\n"
+ "ld1rh { z21.h }, p1/Z, [x20]\n"
+ ".inst 0xc120e188 // fcvt z8.h, { z12.s-z13.s }\n"
+ ".inst 0xc120e1c9 // fcvt z9.h, { z14.s-z15.s }\n"
+ ".inst 0xc120e39c // fcvt z28.h, { z28.s-z29.s }\n"
+ ".inst 0xc120e3dd // fcvt z29.h, { z30.s-z31.s }\n"
+ ".inst 0xc175c0a8 // fclamp { z8.h-z9.h }, z5.h, z21.h\n"
+ ".inst 0xc175c0bc // fclamp { z28.h-z29.h }, z5.h, z21.h\n"
+ ".inst 0xa0602728 // st1h { z8.h-z9.h }, pn9.b, [x25]\n"
+ ".inst 0xa061233c // st1h { z28.h-z29.h }, p8, [x25, #0x2, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "b 19f\n"
+ "18:" // Width 2: No activation
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ ".inst 0xc120e194 // fcvt z20.h, { z12.s-z13.s }\n"
+ ".inst 0xc120e1dc // fcvt z28.h, { z14.s-z15.s }\n"
+ ".inst 0xa1602734 // st1h { z20.h, z28.h }, pn9.b, [x25]\n"
+ ".inst 0xc120e09a // fcvt z26.h, { z4.s-z5.s }\n"
+ ".inst 0xc120e0db // fcvt z27.h, { z6.s-z7.s }\n"
+ ".inst 0xa061233a // st1h { z26.h-z27.h }, p8, [x25, #0x2, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "19:" // Width 2: Output done
+ "b 36f\n"
+ "20:" // Width 3
+ "mov x20, #0x2\n"
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x257447f0 // whilelt p8.h, XZR, x20, VLx2\n"
+ "cbz x24, 21f\n"
+ "addvl x20, x24, #4\n"
+ "ld1h { z16.s }, p1/Z, [x24]\n"
+ "ld1h { z17.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "fcvt z16.s, p1/m, z16.h\n"
+ "ld1h { z8.s }, p1/Z, [x24, #4, MUL VL]\n"
+ "fcvt z17.s, p1/m, z17.h\n"
+ "ld1h { z9.s }, p1/Z, [x24, #5, MUL VL]\n"
+ "fcvt z18.s, p1/m, z18.h\n"
+ "ld1h { z10.s }, p1/Z, [x24, #6, MUL VL]\n"
+ "fcvt z19.s, p1/m, z19.h\n"
+ "ld1h { z11.s }, p1/Z, [x24, #7, MUL VL]\n"
+ "fcvt z8.s, p1/m, z8.h\n"
+ "ld1h { z24.s }, p1/Z, [x20]\n"
+ "fcvt z9.s, p1/m, z9.h\n"
+ "ld1h { z25.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "fcvt z10.s, p1/m, z10.h\n"
+ "ld1h { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "fcvt z11.s, p1/m, z11.h\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ "ld1h { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "fcvt z24.s, p1/m, z24.h\n"
+ "fcvt z25.s, p1/m, z25.h\n"
+ "fcvt z26.s, p1/m, z26.h\n"
+ "fcvt z27.s, p1/m, z27.h\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ ".inst 0xc0042f02 // mova za.d[x9, #2], { z24.d-z27.d }\n"
+ "b 22f\n"
+ "21:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "22:" // Width 3: setup done
+ "cmp x22, #0x8\n"
+ "ble 24f\n"
+ "23:" // Width 3: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x22\n"
+ ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
+ "sub x22, x22, #0x8\n"
+ "ld1rqh { z6.h }, p0/Z, [x23]\n"
+ "cmp x22, #0x8\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa042a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc156b288 // fdot za.s[x9, 0], { z20.h-z23.h }, z6.h[0]\n"
+ ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xc156b189 // fdot za.s[x9, 1], { z12.h-z15.h }, z6.h[0]\n"
+ ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xc156b00a // fdot za.s[x9, 2], { z0.h-z3.h }, z6.h[0]\n"
+ ".inst 0xa042a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xc156b788 // fdot za.s[x9, 0], { z28.h-z31.h }, z6.h[1]\n"
+ ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xc156b589 // fdot za.s[x9, 1], { z12.h-z15.h }, z6.h[1]\n"
+ ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc156b40a // fdot za.s[x9, 2], { z0.h-z3.h }, z6.h[1]\n"
+ ".inst 0xa040a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xc156ba88 // fdot za.s[x9, 0], { z20.h-z23.h }, z6.h[2]\n"
+ ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc156b909 // fdot za.s[x9, 1], { z8.h-z11.h }, z6.h[2]\n"
+ ".inst 0xc156b98a // fdot za.s[x9, 2], { z12.h-z15.h }, z6.h[2]\n"
+ ".inst 0xc156bc08 // fdot za.s[x9, 0], { z0.h-z3.h }, z6.h[3]\n"
+ ".inst 0xc156be09 // fdot za.s[x9, 1], { z16.h-z19.h }, z6.h[3]\n"
+ ".inst 0xc156be8a // fdot za.s[x9, 2], { z20.h-z23.h }, z6.h[3]\n"
+ "bgt 23b\n"
+ "24:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x22\n"
+ ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x2\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa041a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa042a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bb188 // fdot za.s[x9, 0], { z12.h-z15.h }, z11.h[0]\n"
+ ".inst 0xc15bb009 // fdot za.s[x9, 1], { z0.h-z3.h }, z11.h[0]\n"
+ ".inst 0xc15bb20a // fdot za.s[x9, 2], { z16.h-z19.h }, z11.h[0]\n"
+ "ble 25f\n"
+ ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bb588 // fdot za.s[x9, 0], { z12.h-z15.h }, z11.h[1]\n"
+ ".inst 0xc15bb609 // fdot za.s[x9, 1], { z16.h-z19.h }, z11.h[1]\n"
+ ".inst 0xc15bb68a // fdot za.s[x9, 2], { z20.h-z23.h }, z11.h[1]\n"
+ "ble 25f\n"
+ ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bb888 // fdot za.s[x9, 0], { z4.h-z7.h }, z11.h[2]\n"
+ ".inst 0xc15bbb89 // fdot za.s[x9, 1], { z28.h-z31.h }, z11.h[2]\n"
+ ".inst 0xc15bba8a // fdot za.s[x9, 2], { z20.h-z23.h }, z11.h[2]\n"
+ "ble 25f\n"
+ ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bbc88 // fdot za.s[x9, 0], { z4.h-z7.h }, z11.h[3]\n"
+ ".inst 0xc15bbf89 // fdot za.s[x9, 1], { z28.h-z31.h }, z11.h[3]\n"
+ ".inst 0xc15bbd8a // fdot za.s[x9, 2], { z12.h-z15.h }, z11.h[3]\n"
+ "25:" // Width 3: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 26f\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+ "ld1rh { z17.h }, p1/Z, [x21]\n"
+ ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
+ "ld1rh { z16.h }, p1/Z, [x20]\n"
+ ".inst 0xc120e18c // fcvt z12.h, { z12.s-z13.s }\n"
+ ".inst 0xc120e1cd // fcvt z13.h, { z14.s-z15.s }\n"
+ ".inst 0xc120e00e // fcvt z14.h, { z0.s-z1.s }\n"
+ ".inst 0xc120e04f // fcvt z15.h, { z2.s-z3.s }\n"
+ ".inst 0xc170c22c // fclamp { z12.h-z13.h }, z17.h, z16.h\n"
+ ".inst 0xc120e092 // fcvt z18.h, { z4.s-z5.s }\n"
+ ".inst 0xc120e0d3 // fcvt z19.h, { z6.s-z7.s }\n"
+ ".inst 0xc170c22e // fclamp { z14.h-z15.h }, z17.h, z16.h\n"
+ ".inst 0xc170c232 // fclamp { z18.h-z19.h }, z17.h, z16.h\n"
+ ".inst 0xa060272c // st1h { z12.h-z13.h }, pn9.b, [x25]\n"
+ ".inst 0xa061272e // st1h { z14.h-z15.h }, pn9.b, [x25, #0x2, MUL VL]\n"
+ ".inst 0xa0622332 // st1h { z18.h-z19.h }, p8, [x25, #0x4, MUL VL]\n"
+ "addvl x25, x25, #6\n"
+ "b 27f\n"
+ "26:" // Width 3: No activation
+ ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+ ".inst 0xc0062c28 // mova { z8.d-z11.d }, za.d[x9, #1]\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xc120e311 // fcvt z17.h, { z24.s-z25.s }\n"
+ ".inst 0xc120e359 // fcvt z25.h, { z26.s-z27.s }\n"
+ ".inst 0xa1602731 // st1h { z17.h, z25.h }, pn9.b, [x25]\n"
+ ".inst 0xc120e112 // fcvt z18.h, { z8.s-z9.s }\n"
+ ".inst 0xc120e153 // fcvt z19.h, { z10.s-z11.s }\n"
+ ".inst 0xa0612732 // st1h { z18.h-z19.h }, pn9.b, [x25, #0x2, MUL VL]\n"
+ ".inst 0xc120e191 // fcvt z17.h, { z12.s-z13.s }\n"
+ ".inst 0xc120e1d9 // fcvt z25.h, { z14.s-z15.s }\n"
+ ".inst 0xa1622331 // st1h { z17.h, z25.h }, p8, [x25, #0x4, MUL VL]\n"
+ "addvl x25, x25, #6\n"
+ "27:" // Width 3: Output done
+ "b 36f\n"
+ "28:" // Width 4
+ "mov x20, #0x3\n"
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x257447f0 // whilelt p8.h, XZR, x20, VLx2\n"
+ "cbz x24, 29f\n"
+ "addvl x20, x24, #4\n"
+ "ld1h { z28.s }, p1/Z, [x24]\n"
+ "ld1h { z29.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z30.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z31.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "fcvt z28.s, p1/m, z28.h\n"
+ "ld1h { z8.s }, p1/Z, [x24, #4, MUL VL]\n"
+ "fcvt z29.s, p1/m, z29.h\n"
+ "ld1h { z9.s }, p1/Z, [x24, #5, MUL VL]\n"
+ "fcvt z30.s, p1/m, z30.h\n"
+ "ld1h { z10.s }, p1/Z, [x24, #6, MUL VL]\n"
+ "fcvt z31.s, p1/m, z31.h\n"
+ "ld1h { z11.s }, p1/Z, [x24, #7, MUL VL]\n"
+ "fcvt z8.s, p1/m, z8.h\n"
+ "addvl x24, x24, #8\n"
+ "ld1h { z0.s }, p1/Z, [x20]\n"
+ "fcvt z9.s, p1/m, z9.h\n"
+ "ld1h { z1.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "fcvt z10.s, p1/m, z10.h\n"
+ "ld1h { z2.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "fcvt z11.s, p1/m, z11.h\n"
+ ".inst 0xc0042f80 // mova za.d[x9, #0], { z28.d-z31.d }\n"
+ "ld1h { z3.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "fcvt z0.s, p1/m, z0.h\n"
+ "ld1h { z28.s }, p1/Z, [x20, #4, MUL VL]\n"
+ "fcvt z1.s, p1/m, z1.h\n"
+ "ld1h { z29.s }, p1/Z, [x20, #5, MUL VL]\n"
+ "fcvt z2.s, p1/m, z2.h\n"
+ "ld1h { z30.s }, p1/Z, [x20, #6, MUL VL]\n"
+ "fcvt z3.s, p1/m, z3.h\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ "ld1h { z31.s }, p1/Z, [x20, #7, MUL VL]\n"
+ "fcvt z28.s, p1/m, z28.h\n"
+ "fcvt z29.s, p1/m, z29.h\n"
+ "fcvt z30.s, p1/m, z30.h\n"
+ "fcvt z31.s, p1/m, z31.h\n"
+ ".inst 0xc0042c02 // mova za.d[x9, #2], { z0.d-z3.d }\n"
+ ".inst 0xc0042f83 // mova za.d[x9, #3], { z28.d-z31.d }\n"
+ "b 30f\n"
+ "29:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "30:" // Width 4: setup done
+ "cmp x22, #0x8\n"
+ "ble 32f\n"
+ "31:" // Width 4: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x22\n"
+ ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
+ "sub x22, x22, #0x8\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "cmp x22, #0x8\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa042a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ ".inst 0xa043a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xc153b108 // fdot za.s[x9, 0], { z8.h-z11.h }, z3.h[0]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc153b389 // fdot za.s[x9, 1], { z28.h-z31.h }, z3.h[0]\n"
+ ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xc153b30a // fdot za.s[x9, 2], { z24.h-z27.h }, z3.h[0]\n"
+ ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xc153b08b // fdot za.s[x9, 3], { z4.h-z7.h }, z3.h[0]\n"
+ ".inst 0xa042a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ ".inst 0xa043a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xc153b588 // fdot za.s[x9, 0], { z12.h-z15.h }, z3.h[1]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc153b509 // fdot za.s[x9, 1], { z8.h-z11.h }, z3.h[1]\n"
+ ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xc153b60a // fdot za.s[x9, 2], { z16.h-z19.h }, z3.h[1]\n"
+ ".inst 0xa041a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xc153b70b // fdot za.s[x9, 3], { z24.h-z27.h }, z3.h[1]\n"
+ ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ ".inst 0xa043a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xc153b988 // fdot za.s[x9, 0], { z12.h-z15.h }, z3.h[2]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc153b889 // fdot za.s[x9, 1], { z4.h-z7.h }, z3.h[2]\n"
+ ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xc153ba8a // fdot za.s[x9, 2], { z20.h-z23.h }, z3.h[2]\n"
+ ".inst 0xa041a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xc153b90b // fdot za.s[x9, 3], { z8.h-z11.h }, z3.h[2]\n"
+ ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ ".inst 0xa043a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xc153bd88 // fdot za.s[x9, 0], { z12.h-z15.h }, z3.h[3]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc153bc89 // fdot za.s[x9, 1], { z4.h-z7.h }, z3.h[3]\n"
+ ".inst 0xc153bd0a // fdot za.s[x9, 2], { z8.h-z11.h }, z3.h[3]\n"
+ ".inst 0xc153be0b // fdot za.s[x9, 3], { z16.h-z19.h }, z3.h[3]\n"
+ "bgt 31b\n"
+ "32:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x22\n"
+ ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x2\n"
+ "ld1rqh { z11.h }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa041a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ ".inst 0xa043a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xc15bb208 // fdot za.s[x9, 0], { z16.h-z19.h }, z11.h[0]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bb089 // fdot za.s[x9, 1], { z4.h-z7.h }, z11.h[0]\n"
+ ".inst 0xc15bb18a // fdot za.s[x9, 2], { z12.h-z15.h }, z11.h[0]\n"
+ ".inst 0xc15bb38b // fdot za.s[x9, 3], { z28.h-z31.h }, z11.h[0]\n"
+ "ble 33f\n"
+ ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ ".inst 0xa043a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xc15bb488 // fdot za.s[x9, 0], { z4.h-z7.h }, z11.h[1]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bb609 // fdot za.s[x9, 1], { z16.h-z19.h }, z11.h[1]\n"
+ ".inst 0xc15bb58a // fdot za.s[x9, 2], { z12.h-z15.h }, z11.h[1]\n"
+ ".inst 0xc15bb70b // fdot za.s[x9, 3], { z24.h-z27.h }, z11.h[1]\n"
+ "ble 33f\n"
+ ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa042a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ ".inst 0xa043a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xc15bb988 // fdot za.s[x9, 0], { z12.h-z15.h }, z11.h[2]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bbb89 // fdot za.s[x9, 1], { z28.h-z31.h }, z11.h[2]\n"
+ ".inst 0xc15bb80a // fdot za.s[x9, 2], { z0.h-z3.h }, z11.h[2]\n"
+ ".inst 0xc15bb88b // fdot za.s[x9, 3], { z4.h-z7.h }, z11.h[2]\n"
+ "ble 33f\n"
+ ".inst 0xa040a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa042a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ ".inst 0xa043a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
+ ".inst 0xc15bbc08 // fdot za.s[x9, 0], { z0.h-z3.h }, z11.h[3]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0xc15bbe09 // fdot za.s[x9, 1], { z16.h-z19.h }, z11.h[3]\n"
+ ".inst 0xc15bbc8a // fdot za.s[x9, 2], { z4.h-z7.h }, z11.h[3]\n"
+ ".inst 0xc15bbe8b // fdot za.s[x9, 3], { z20.h-z23.h }, z11.h[3]\n"
+ "33:" // Width 4: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 34f\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ "ld1rh { z19.h }, p1/Z, [x21]\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ "ld1rh { z18.h }, p1/Z, [x20]\n"
+ ".inst 0xc0062c64 // mova { z4.d-z7.d }, za.d[x9, #3]\n"
+ ".inst 0xc120e38a // fcvt z10.h, { z28.s-z29.s }\n"
+ ".inst 0xc120e3cb // fcvt z11.h, { z30.s-z31.s }\n"
+ ".inst 0xc120e18c // fcvt z12.h, { z12.s-z13.s }\n"
+ ".inst 0xc120e1cd // fcvt z13.h, { z14.s-z15.s }\n"
+ ".inst 0xc172c26a // fclamp { z10.h-z11.h }, z19.h, z18.h\n"
+ ".inst 0xc120e00e // fcvt z14.h, { z0.s-z1.s }\n"
+ ".inst 0xc120e04f // fcvt z15.h, { z2.s-z3.s }\n"
+ ".inst 0xc172c26c // fclamp { z12.h-z13.h }, z19.h, z18.h\n"
+ ".inst 0xc120e090 // fcvt z16.h, { z4.s-z5.s }\n"
+ ".inst 0xc120e0d1 // fcvt z17.h, { z6.s-z7.s }\n"
+ ".inst 0xc172c26e // fclamp { z14.h-z15.h }, z19.h, z18.h\n"
+ ".inst 0xc172c270 // fclamp { z16.h-z17.h }, z19.h, z18.h\n"
+ ".inst 0xa060272a // st1h { z10.h-z11.h }, pn9.b, [x25]\n"
+ ".inst 0xa061272c // st1h { z12.h-z13.h }, pn9.b, [x25, #0x2, MUL VL]\n"
+ ".inst 0xa062272e // st1h { z14.h-z15.h }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xa0632330 // st1h { z16.h-z17.h }, p8, [x25, #0x6, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "b 35f\n"
+ "34:" // Width 4: No activation
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xc0062c5c // mova { z28.d-z31.d }, za.d[x9, #2]\n"
+ ".inst 0xc0062c68 // mova { z8.d-z11.d }, za.d[x9, #3]\n"
+ ".inst 0xc120e187 // fcvt z7.h, { z12.s-z13.s }\n"
+ ".inst 0xc120e1cf // fcvt z15.h, { z14.s-z15.s }\n"
+ ".inst 0xa1602727 // st1h { z7.h, z15.h }, pn9.b, [x25]\n"
+ ".inst 0xc120e207 // fcvt z7.h, { z16.s-z17.s }\n"
+ ".inst 0xc120e24f // fcvt z15.h, { z18.s-z19.s }\n"
+ ".inst 0xa1612727 // st1h { z7.h, z15.h }, pn9.b, [x25, #0x2, MUL VL]\n"
+ ".inst 0xc120e38e // fcvt z14.h, { z28.s-z29.s }\n"
+ ".inst 0xc120e3cf // fcvt z15.h, { z30.s-z31.s }\n"
+ ".inst 0xa062272e // st1h { z14.h-z15.h }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc120e112 // fcvt z18.h, { z8.s-z9.s }\n"
+ ".inst 0xc120e15a // fcvt z26.h, { z10.s-z11.s }\n"
+ ".inst 0xa1632332 // st1h { z18.h, z26.h }, p8, [x25, #0x6, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "35:" // Width 4: Output done
+ "subs x26, x26, #0x4\n"
+ "sub %x[N], %x[N], x28, LSL #2\n"
+ "bgt 4b\n"
+ "36:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p8.b\n"
+ : [N] "+&r" (N)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp
new file mode 100644
index 0000000000..7d98d5cb98
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "../std_transforms_sme.hpp"
+
+#define ARGLIST \
+ const float *, const float *, \
+ float *, size_t, size_t, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+void sme2_gemv_fp32_mla_16VL( ARGLIST );
+
+class cls_sme2_gemv_fp32_mla_16VL
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
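+    // "16VL": each call handles up to 16 SVE vector lengths of fp32 output columns.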
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 16;
+ }
+
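+    // Plain fp32 FMLA consumes one element of K per step, so no K unrolling is needed.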
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 1> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_fp32_mla_16VL;
+ cls_sme2_gemv_fp32_mla_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
new file mode 100644
index 0000000000..d2c260536d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
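+// GEMV microkernel: computes output[0..N) = A[0..K) * B (+ bias), where B is a KxN
+// operand pre-packed by the library's SME transforms, with an optional clamp activation.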
+void sme2_gemv_fp32_mla_16VL (
+ const float *A_ptr, const float *B_ptr, float *output_ptr,
+ size_t N, size_t K,
+ const float *bias, Activation act, bool
+)
+{
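+    // Clamp bounds (and related state) read by the inline assembly through
+    // args_ptr + offsetof(KernelArgs, ...).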
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
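+    // Bit 1 of 'flags' enables the fclamp activation path in the assembly below.
+    // Example: Activation{BoundedReLU, 6.0f} yields ka.minval = 0, ka.maxval = 6.0f and
+    // sets the bit; Activation::Type::None leaves it clear, so the clamp is skipped.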
+ __asm__ __volatile__(
+ "ptrue p8.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x28, ALL, MUL #4\n"
+ "add x27, %x[N], x28\n"
+ "sub x27, x27, #0x1\n"
+ "udiv x27, x27, x28\n"
+ "add x22, x27, #0x3\n"
+ "and x22, x22, #0xfffffffffffffffc\n"
+ "mul x22, x22, x28\n"
+ "mul x22, x22, %x[K]\n"
+ "mov x9, #0x0\n"
+ "mov x26, %x[B_ptr]\n"
+ "mov x25, %x[output_ptr]\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "lsl x22, x22, #0x2\n"
+ "mov x21, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x22, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x22, #0, 3f\n"
+ "lsr x22, x22, #0x1\n"
+ "lsl x21, x21, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x20, x22, #0x26\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x21, x21, #0x16\n"
+ "orr x22, x22, x20\n"
+ "orr x22, x22, x21\n"
+ ".inst 0xf8b64b5a // rprfm pldonce, x22, [x26]\n"
+ "3:" // RHS prefetch exit
+ "mov x24, %x[bias]\n"
+ "4:" // Column loop
+ "cmp x27, #0x4\n"
+ "bge 28f\n"
+ "cmp x27, #0x2\n"
+ "bgt 20f\n"
+ "beq 12f\n"
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "mov x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
+ "cbz x24, 5f\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x22, #0x4\n"
+ "ble 8f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z8.s }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x4\n"
+ ".inst 0xa040c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a280 // fmla za.s[x9, 0], { z20.s-z23.s }, z8.s[0]\n"
+ "addvl x26, x26, #16\n"
+ "cmp x22, #0x4\n"
+ ".inst 0xa040c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a480 // fmla za.s[x9, 0], { z4.s-z7.s }, z8.s[1]\n"
+ "addvl x26, x26, #16\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa040c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158ab80 // fmla za.s[x9, 0], { z28.s-z31.s }, z8.s[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158af00 // fmla za.s[x9, 0], { z24.s-z27.s }, z8.s[3]\n"
+ "addvl x26, x26, #16\n"
+ "bgt 7b\n"
+ "8:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z11.s }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa040c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc15ba380 // fmla za.s[x9, 0], { z28.s-z31.s }, z11.s[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15ba580 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15baa00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bac00 // fmla za.s[x9, 0], { z0.s-z3.s }, z11.s[3]\n"
+ "addvl x26, x26, #16\n"
+ "9:" // Width 1: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 10f\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z3.s }, p1/Z, [x21]\n"
+ "ld1rw { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc1bdc868 // fclamp { z8.s-z11.s }, z3.s, z29.s\n"
+ ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n"
+ "addvl x25, x25, #4\n"
+ "b 11f\n"
+ "10:" // Width 1: No activation
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c32c // st1w { z12.s-z15.s }, p8, [x25]\n"
+ "addvl x25, x25, #4\n"
+ "11:" // Width 1: Output done
+ "b 36f\n"
+ "12:" // Width 2
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "sub x20, %x[N], x28\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
+ "cbz x24, 13f\n"
+ ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ "b 14f\n"
+ "13:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "14:" // Width 2: setup done
+ "cmp x22, #0x4\n"
+ "ble 16f\n"
+ "15:" // Width 2: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z9.s }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x4\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159a180 // fmla za.s[x9, 0], { z12.s-z15.s }, z9.s[0]\n"
+ "cmp x22, #0x4\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159a081 // fmla za.s[x9, 1], { z4.s-z7.s }, z9.s[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159a580 // fmla za.s[x9, 0], { z12.s-z15.s }, z9.s[1]\n"
+ ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159a481 // fmla za.s[x9, 1], { z4.s-z7.s }, z9.s[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159ab00 // fmla za.s[x9, 0], { z24.s-z27.s }, z9.s[2]\n"
+ ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159a801 // fmla za.s[x9, 1], { z0.s-z3.s }, z9.s[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc159ac00 // fmla za.s[x9, 0], { z0.s-z3.s }, z9.s[3]\n"
+ ".inst 0xa041c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc159af81 // fmla za.s[x9, 1], { z28.s-z31.s }, z9.s[3]\n"
+ "addvl x26, x26, #16\n"
+ "bgt 15b\n"
+ "16:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z11.s }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc15ba180 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[0]\n"
+ ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba001 // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15ba700 // fmla za.s[x9, 0], { z24.s-z27.s }, z11.s[1]\n"
+ ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba401 // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15ba980 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[2]\n"
+ ".inst 0xa041c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bab81 // fmla za.s[x9, 1], { z28.s-z31.s }, z11.s[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[3]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bae81 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[3]\n"
+ "addvl x26, x26, #16\n"
+ "17:" // Width 2: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 18f\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ "ld1rw { z9.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ "ld1rw { z8.s }, p1/Z, [x20]\n"
+ ".inst 0xc1a8c920 // fclamp { z0.s-z3.s }, z9.s, z8.s\n"
+ ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+ ".inst 0xc1a8c924 // fclamp { z4.s-z7.s }, z9.s, z8.s\n"
+ ".inst 0xa061c324 // st1w { z4.s-z7.s }, p8, [x25, #0x4, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "b 19f\n"
+ "18:" // Width 2: No activation
+ ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c730 // st1w { z16.s-z19.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "19:" // Width 2: Output done
+ "b 36f\n"
+ "20:" // Width 3
+ "mov x20, #0x2\n"
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
+ "cbz x24, 21f\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
+ ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n"
+ ".inst 0xa042c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
+ "b 22f\n"
+ "21:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "22:" // Width 3: setup done
+ "cmp x22, #0x4\n"
+ "ble 24f\n"
+ "23:" // Width 3: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z15.s }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x4\n"
+ ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z15.s[0]\n"
+ "cmp x22, #0x4\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z15.s[0]\n"
+ ".inst 0xa042c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fa002 // fmla za.s[x9, 2], { z0.s-z3.s }, z15.s[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fa680 // fmla za.s[x9, 0], { z20.s-z23.s }, z15.s[1]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fa681 // fmla za.s[x9, 1], { z20.s-z23.s }, z15.s[1]\n"
+ ".inst 0xa042c749 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fa502 // fmla za.s[x9, 2], { z8.s-z11.s }, z15.s[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fab00 // fmla za.s[x9, 0], { z24.s-z27.s }, z15.s[2]\n"
+ ".inst 0xa041c749 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fa901 // fmla za.s[x9, 1], { z8.s-z11.s }, z15.s[2]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15faa02 // fmla za.s[x9, 2], { z16.s-z19.s }, z15.s[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z15.s[3]\n"
+ ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fae01 // fmla za.s[x9, 1], { z16.s-z19.s }, z15.s[3]\n"
+ ".inst 0xa042c749 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fad02 // fmla za.s[x9, 2], { z8.s-z11.s }, z15.s[3]\n"
+ "addvl x26, x26, #16\n"
+ "bgt 23b\n"
+ "24:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z11.s }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa040c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc15ba380 // fmla za.s[x9, 0], { z28.s-z31.s }, z11.s[0]\n"
+ ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba001 // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[0]\n"
+ ".inst 0xa042c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ba282 // fmla za.s[x9, 2], { z20.s-z23.s }, z11.s[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15ba580 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[1]\n"
+ ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba481 // fmla za.s[x9, 1], { z4.s-z7.s }, z11.s[1]\n"
+ ".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ba782 // fmla za.s[x9, 2], { z28.s-z31.s }, z11.s[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15ba880 // fmla za.s[x9, 0], { z4.s-z7.s }, z11.s[2]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15baa81 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[2]\n"
+ ".inst 0xa042c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ba982 // fmla za.s[x9, 2], { z12.s-z15.s }, z11.s[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bad80 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[3]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15bae81 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[3]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[3]\n"
+ "addvl x26, x26, #16\n"
+ "25:" // Width 3: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 26f\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
+ "ld1rw { z17.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c28 // mova { z8.d-z11.d }, za.d[x9, #1]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b0ca28 // fclamp { z8.s-z11.s }, z17.s, z16.s\n"
+ ".inst 0xa061c728 // st1w { z8.s-z11.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
+ "addvl x25, x25, #12\n"
+ "b 27f\n"
+ "26:" // Width 3: No activation
+ ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c734 // st1w { z20.s-z23.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
+ "addvl x25, x25, #12\n"
+ "27:" // Width 3: Output done
+ "b 36f\n"
+ "28:" // Width 4
+ "mov x20, #0x3\n"
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
+ "cbz x24, 29f\n"
+ ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042d82 // mova za.d[x9, #2], { z12.d-z15.d }\n"
+ ".inst 0xa043c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x24, x24, #16\n"
+ "b 30f\n"
+ "29:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "30:" // Width 4: setup done
+ "cmp x22, #0x4\n"
+ "ble 32f\n"
+ "31:" // Width 4: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z8.s }, p0/Z, [x23]\n"
+ "sub x22, x22, #0x4\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a200 // fmla za.s[x9, 0], { z16.s-z19.s }, z8.s[0]\n"
+ "cmp x22, #0x4\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa041c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158a181 // fmla za.s[x9, 1], { z12.s-z15.s }, z8.s[0]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158a202 // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[0]\n"
+ ".inst 0xa043c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158a183 // fmla za.s[x9, 3], { z12.s-z15.s }, z8.s[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a580 // fmla za.s[x9, 0], { z12.s-z15.s }, z8.s[1]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158a681 // fmla za.s[x9, 1], { z20.s-z23.s }, z8.s[1]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158a602 // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[1]\n"
+ ".inst 0xa043c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158a683 // fmla za.s[x9, 3], { z20.s-z23.s }, z8.s[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158a880 // fmla za.s[x9, 0], { z4.s-z7.s }, z8.s[2]\n"
+ ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158aa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z8.s[2]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158aa02 // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[2]\n"
+ ".inst 0xa043c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158a803 // fmla za.s[x9, 3], { z0.s-z3.s }, z8.s[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc158ae80 // fmla za.s[x9, 0], { z20.s-z23.s }, z8.s[3]\n"
+ ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc158ae01 // fmla za.s[x9, 1], { z16.s-z19.s }, z8.s[3]\n"
+ ".inst 0xa042c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc158ac82 // fmla za.s[x9, 2], { z4.s-z7.s }, z8.s[3]\n"
+ ".inst 0xa043c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc158ae83 // fmla za.s[x9, 3], { z20.s-z23.s }, z8.s[3]\n"
+ "addvl x26, x26, #16\n"
+ "bgt 31b\n"
+ "32:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x22\n"
+ "ld1rqw { z11.s }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc15ba200 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[0]\n"
+ ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba281 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[0]\n"
+ ".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ba382 // fmla za.s[x9, 2], { z28.s-z31.s }, z11.s[0]\n"
+ ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15ba203 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15ba400 // fmla za.s[x9, 0], { z0.s-z3.s }, z11.s[1]\n"
+ ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ba601 // fmla za.s[x9, 1], { z16.s-z19.s }, z11.s[1]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ba602 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[1]\n"
+ ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15ba603 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc15baa00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[2]\n"
+ ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15baa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z11.s[2]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15baa02 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[2]\n"
+ ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15baa03 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15bae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[3]\n"
+ ".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15baf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z11.s[3]\n"
+ ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15bae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[3]\n"
+ ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15bae03 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[3]\n"
+ "addvl x26, x26, #16\n"
+ "33:" // Width 4: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 34f\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ "ld1rw { z21.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n"
+ "ld1rw { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c70 // mova { z16.d-z19.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c738 // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xa062c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xa063c330 // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n"
+ "addvl x25, x25, #16\n"
+ "b 35f\n"
+ "34:" // Width 4: No activation
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c64 // mova { z4.d-z7.d }, za.d[x9, #3]\n"
+ ".inst 0xa063c324 // st1w { z4.s-z7.s }, p8, [x25, #0xc, MUL VL]\n"
+ "addvl x25, x25, #16\n"
+ "35:" // Width 4: Output done
+ "subs x27, x27, #0x4\n"
+ "sub %x[N], %x[N], x28, LSL #2\n"
+ "bgt 4b\n"
+ "36:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p8.b\n"
+ : [N] "+&r" (N)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
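+
+// Usage sketch (hypothetical buffers; B must already be packed for this kernel):
+//   sme2_gemv_fp32_mla_16VL(A, B_packed, y, N, K, bias, Activation(Activation::Type::ReLU), false);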
+
+} // namespace arm_gemm
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
new file mode 100644
index 0000000000..76c2bdd71e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "../std_transforms_sme.hpp"
+#include "../bfloat.hpp"
+
+#define ARGLIST \
+ const float *, const bfloat16 *, \
+ float *, size_t, size_t, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+void sme2_gemv_fp32bf16fp32_dot_16VL( ARGLIST );
+
+class cls_sme2_gemv_fp32bf16fp32_dot_16VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
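+    // As in the fp32 kernel, each call covers up to 16 SVE vector lengths of fp32 output.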
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 16;
+ }
+
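+    // BFDOT consumes two bf16 elements of K per lane, hence the unroll factor of 2.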
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 2> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_fp32bf16fp32_dot_16VL;
+ cls_sme2_gemv_fp32bf16fp32_dot_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
new file mode 100644
index 0000000000..c6fa11016f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
@@ -0,0 +1,610 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
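+// Mixed-precision GEMV: the fp32 LHS is narrowed to bf16 on the fly (BFCVT) and multiplied
+// against a pre-packed bf16 RHS with BFDOT; accumulation and the final result stay in fp32.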
+void sme2_gemv_fp32bf16fp32_dot_16VL (
+ const float *A_ptr, const bfloat16 *B_ptr, float *output_ptr,
+ size_t N, size_t K,
+ const float *bias, Activation act, bool
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
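+    // Activation handling mirrors the fp32 kernel: bit 1 of 'flags' gates the fclamp path.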
+ __asm__ __volatile__(
+ "ptrue p8.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x10, ALL, MUL #4\n"
+ "add x28, %x[N], x10\n"
+ "sub x28, x28, #0x1\n"
+ "udiv x28, x28, x10\n"
+ "add x22, x28, #0x3\n"
+ "and x22, x22, #0xfffffffffffffffc\n"
+ "mul x22, x22, x10\n"
+ "mul x22, x22, %x[K]\n"
+ "mov x9, #0x0\n"
+ "mov x27, #0x4\n"
+ "mov x26, %x[B_ptr]\n"
+ "mov x25, %x[output_ptr]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "lsl x22, x22, #0x1\n"
+ "mov x21, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x22, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x22, #0, 3f\n"
+ "lsr x22, x22, #0x1\n"
+ "lsl x21, x21, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x20, x22, #0x26\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x21, x21, #0x16\n"
+ "orr x22, x22, x20\n"
+ "orr x22, x22, x21\n"
+ ".inst 0xf8b64b5a // rprfm pldonce, x22, [x26]\n"
+ "3:" // RHS prefetch exit
+ "mov x24, %x[bias]\n"
+ "4:" // Column loop
+ "cmp x28, #0x4\n"
+ "bge 28f\n"
+ "cmp x28, #0x2\n"
+ "bgt 20f\n"
+ "beq 12f\n"
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "mov x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
+ "cbz x24, 5f\n"
+ ".inst 0xa040c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x22, #0x8\n"
+ "ble 8f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z10.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa94a // bfcvt z10.h, p2/M, z10.s\n"
+ "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "uzp1 z10.h, z10.h, z10.h\n"
+ "sub x22, x22, #0x8\n"
+ "uzp1 z16.h, z16.h, z16.h\n"
+ "trn1 z10.d, z10.d, z16.d\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15ab198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[0]\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #16\n"
+ "cmp x22, #0x8\n"
+ ".inst 0xc15ab598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[1]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #16\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0xc15ab818 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[2]\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15abf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z10.h[3]\n"
+ "bgt 7b\n"
+ "8:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z15.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
+ "ld1rqw { z17.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa31 // bfcvt z17.h, p2/M, z17.s\n"
+ "uzp1 z15.h, z15.h, z15.h\n"
+ "subs x22, x22, #0x2\n"
+ "uzp1 z17.h, z17.h, z17.h\n"
+ "trn1 z15.d, z15.d, z17.d\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0xc15fb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fb418 // bfdot za.s[x9, 0], { z0.h-z3.h }, z15.h[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fb898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z15.h[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbd18 // bfdot za.s[x9, 0], { z8.h-z11.h }, z15.h[3]\n"
+ "addvl x26, x26, #16\n"
+ "9:" // Width 1: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 10f\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ "ld1rw { z8.s }, p2/Z, [x21]\n"
+ "ld1rw { z26.s }, p2/Z, [x20]\n"
+ ".inst 0xc1bac900 // fclamp { z0.s-z3.s }, z8.s, z26.s\n"
+ ".inst 0xa060c320 // st1w { z0.s-z3.s }, p8, [x25]\n"
+ "addvl x25, x25, #4\n"
+ "b 11f\n"
+ "10:" // Width 1: No activation
+ ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c324 // st1w { z4.s-z7.s }, p8, [x25]\n"
+ "addvl x25, x25, #4\n"
+ "11:" // Width 1: Output done
+ "b 36f\n"
+ "12:" // Width 2
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "sub x20, %x[N], x10\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
+ "cbz x24, 13f\n"
+ ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n"
+ ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ "b 14f\n"
+ "13:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "14:" // Width 2: setup done
+ "cmp x22, #0x8\n"
+ "ble 16f\n"
+ "15:" // Width 2: Multiply loop: Main loop head
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z13.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ad // bfcvt z13.h, p2/M, z13.s\n"
+ "ld1rqw { z27.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aab7b // bfcvt z27.h, p2/M, z27.s\n"
+ "uzp1 z13.h, z13.h, z13.h\n"
+ "sub x22, x22, #0x8\n"
+ "uzp1 z27.h, z27.h, z27.h\n"
+ "trn1 z13.d, z13.d, z27.d\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ "cmp x22, #0x8\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15db298 // bfdot za.s[x9, 0], { z20.h-z23.h }, z13.h[0]\n"
+ "addvl x26, x26, #16\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0xc15db019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z13.h[0]\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15db698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z13.h[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15db719 // bfdot za.s[x9, 1], { z24.h-z27.h }, z13.h[1]\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15db918 // bfdot za.s[x9, 0], { z8.h-z11.h }, z13.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15dba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z13.h[2]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15dbc18 // bfdot za.s[x9, 0], { z0.h-z3.h }, z13.h[3]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15dbc99 // bfdot za.s[x9, 1], { z4.h-z7.h }, z13.h[3]\n"
+ "bgt 15b\n"
+ "16:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z15.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aa8a5 // bfcvt z5.h, p2/M, z5.s\n"
+ "uzp1 z15.h, z15.h, z15.h\n"
+ "subs x22, x22, #0x2\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "trn1 z15.d, z15.d, z5.d\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15fb319 // bfdot za.s[x9, 1], { z24.h-z27.h }, z15.h[0]\n"
+ "ble 17f\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fb798 // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[1]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[2]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[3]\n"
+ ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fbd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[3]\n"
+ "addvl x26, x26, #16\n"
+ "17:" // Width 2: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 18f\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n"
+ "ld1rw { z11.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ ".inst 0xc1bcc974 // fclamp { z20.s-z23.s }, z11.s, z28.s\n"
+ ".inst 0xa060c734 // st1w { z20.s-z23.s }, pn9.b, [x25]\n"
+ ".inst 0xc1bcc96c // fclamp { z12.s-z15.s }, z11.s, z28.s\n"
+ ".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "b 19f\n"
+ "18:" // Width 2: No activation
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c320 // st1w { z0.s-z3.s }, p8, [x25, #0x4, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "19:" // Width 2: Output done
+ "b 36f\n"
+ "20:" // Width 3
+ "mov x20, #0x2\n"
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "msub x20, x10, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
+ "cbz x24, 21f\n"
+ ".inst 0xa040c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042f80 // mova za.d[x9, #0], { z28.d-z31.d }\n"
+ ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n"
+ ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ "b 22f\n"
+ "21:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "22:" // Width 3: setup done
+ "cmp x22, #0x8\n"
+ "ble 24f\n"
+ "23:" // Width 3: Multiply loop: Main loop head
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z14.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ce // bfcvt z14.h, p2/M, z14.s\n"
+ "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "uzp1 z14.h, z14.h, z14.h\n"
+ "sub x22, x22, #0x8\n"
+ "uzp1 z16.h, z16.h, z16.h\n"
+ "trn1 z14.d, z14.d, z16.d\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ "cmp x22, #0x8\n"
+ ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15eb098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z14.h[0]\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15eb319 // bfdot za.s[x9, 1], { z24.h-z27.h }, z14.h[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15eb01a // bfdot za.s[x9, 2], { z0.h-z3.h }, z14.h[0]\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15eb518 // bfdot za.s[x9, 0], { z8.h-z11.h }, z14.h[1]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15eb499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z14.h[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15eb61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z14.h[1]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15eb818 // bfdot za.s[x9, 0], { z0.h-z3.h }, z14.h[2]\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ebb99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z14.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15eb81a // bfdot za.s[x9, 2], { z0.h-z3.h }, z14.h[2]\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15ebf18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z14.h[3]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15ebf99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z14.h[3]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15ebe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z14.h[3]\n"
+ "bgt 23b\n"
+ "24:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z15.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
+ "ld1rqw { z31.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aabff // bfcvt z31.h, p2/M, z31.s\n"
+ "uzp1 z15.h, z15.h, z15.h\n"
+ "subs x22, x22, #0x2\n"
+ "uzp1 z31.h, z31.h, z31.h\n"
+ "trn1 z15.d, z15.d, z31.d\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n"
+ ".inst 0xa042a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z15.h[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15fb09a // bfdot za.s[x9, 2], { z4.h-z7.h }, z15.h[0]\n"
+ "ble 25f\n"
+ ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fb698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z15.h[1]\n"
+ ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb699 // bfdot za.s[x9, 1], { z20.h-z23.h }, z15.h[1]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fb898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z15.h[2]\n"
+ ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb819 // bfdot za.s[x9, 1], { z0.h-z3.h }, z15.h[2]\n"
+ ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fbb1a // bfdot za.s[x9, 2], { z24.h-z27.h }, z15.h[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[3]\n"
+ ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fbd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[3]\n"
+ ".inst 0xa042a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fbc9a // bfdot za.s[x9, 2], { z4.h-z7.h }, z15.h[3]\n"
+ "addvl x26, x26, #16\n"
+ "25:" // Width 3: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 26f\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ "ld1rw { z17.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c73c // st1w { z28.s-z31.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+ ".inst 0xa061c724 // st1w { z4.s-z7.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+ ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
+ "addvl x25, x25, #12\n"
+ "b 27f\n"
+ "26:" // Width 3: No activation
+ ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
+ "addvl x25, x25, #12\n"
+ "27:" // Width 3: Output done
+ "b 36f\n"
+ "28:" // Width 4
+ "mov x20, #0x3\n"
+ "mov x23, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "msub x20, x10, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
+ "cbz x24, 29f\n"
+ ".inst 0xa040c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042d80 // mova za.d[x9, #0], { z12.d-z15.d }\n"
+ ".inst 0xa041c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
+ ".inst 0xa042c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
+ ".inst 0xa043c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
+ ".inst 0xc0042e83 // mova za.d[x9, #3], { z20.d-z23.d }\n"
+ "addvl x24, x24, #16\n"
+ "b 30f\n"
+ "29:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "30:" // Width 4: setup done
+ "cmp x22, #0x8\n"
+ "ble 32f\n"
+ "31:" // Width 4: Multiply loop: Main loop head
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z6.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa8c6 // bfcvt z6.h, p2/M, z6.s\n"
+ "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "sub x22, x22, #0x8\n"
+ "uzp1 z16.h, z16.h, z16.h\n"
+ "trn1 z6.d, z6.d, z16.d\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ "cmp x22, #0x8\n"
+ ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc156b198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z6.h[0]\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc156b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z6.h[0]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc156b19a // bfdot za.s[x9, 2], { z12.h-z15.h }, z6.h[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc156b21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z6.h[0]\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc156b518 // bfdot za.s[x9, 0], { z8.h-z11.h }, z6.h[1]\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc156b599 // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[1]\n"
+ ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc156b41a // bfdot za.s[x9, 2], { z0.h-z3.h }, z6.h[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc156b69b // bfdot za.s[x9, 3], { z20.h-z23.h }, z6.h[1]\n"
+ ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc156b918 // bfdot za.s[x9, 0], { z8.h-z11.h }, z6.h[2]\n"
+ ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc156b999 // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[2]\n"
+ ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc156b91a // bfdot za.s[x9, 2], { z8.h-z11.h }, z6.h[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc156ba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z6.h[2]\n"
+ ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc156bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z6.h[3]\n"
+ ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc156bd99 // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[3]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc156bf1a // bfdot za.s[x9, 2], { z24.h-z27.h }, z6.h[3]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc156be1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z6.h[3]\n"
+ "bgt 31b\n"
+ "32:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z15.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
+ "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "uzp1 z15.h, z15.h, z15.h\n"
+ "subs x22, x22, #0x2\n"
+ "uzp1 z16.h, z16.h, z16.h\n"
+ "trn1 z15.d, z15.d, z16.d\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ "add x23, x23, #0x20\n"
+ ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb318 // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[0]\n"
+ ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[0]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15fb01a // bfdot za.s[x9, 2], { z0.h-z3.h }, z15.h[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15fb21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z15.h[0]\n"
+ "ble 33f\n"
+ ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fb718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[1]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fb619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[1]\n"
+ ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fb69a // bfdot za.s[x9, 2], { z20.h-z23.h }, z15.h[1]\n"
+ ".inst 0xa043a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15fb41b // bfdot za.s[x9, 3], { z0.h-z3.h }, z15.h[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ "subs x22, x22, #0x2\n"
+ ".inst 0xc15fba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[2]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[2]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[2]\n"
+ ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15fba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z15.h[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xc15fbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[3]\n"
+ ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc15fbe19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[3]\n"
+ ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc15fbe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[3]\n"
+ ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc15fbe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z15.h[3]\n"
+ "addvl x26, x26, #16\n"
+ "33:" // Width 4: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 34f\n"
+ "add x21, %x[args_ptr], %[offset_min]\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ "ld1rw { z21.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c70 // mova { z16.d-z19.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c738 // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xa062c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xa063c330 // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n"
+ "addvl x25, x25, #16\n"
+ "b 35f\n"
+ "34:" // Width 4: No activation
+ ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c730 // st1w { z16.s-z19.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c54 // mova { z20.d-z23.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
+ ".inst 0xa063c338 // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
+ "addvl x25, x25, #16\n"
+ "35:" // Width 4: Output done
+ "subs x28, x28, #0x4\n"
+ "sub %x[N], %x[N], x10, LSL #2\n"
+ "bgt 4b\n"
+ "36:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p8.b\n"
+ : [N] "+&r" (N)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp
new file mode 100644
index 0000000000..65e4667f88
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "../std_transforms_sme.hpp"
+
+#define ARGLIST \
+ const int8_t *, const int8_t *, \
+ int8_t *, size_t, size_t, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+void sme2_gemv_s8qa_dot_16VL( ARGLIST );
+
+class cls_sme2_gemv_s8qa_dot_16VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+
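+ // Standard SME transform configuration: 1 output row, 16 vector-lengths
+ // wide, K handled in multiples of 4 (consistent with out_width() and
+ // k_unroll() above).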
+ StdTransformsSME<operand_type, result_type, 1, 16, 4> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_s8qa_dot_16VL;
+ cls_sme2_gemv_s8qa_dot_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
new file mode 100644
index 0000000000..86bd8aeb04
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
@@ -0,0 +1,675 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sme2_gemv_s8qa_dot_16VL (
+ const int8_t *A_ptr, const int8_t *B_ptr, int8_t *output_ptr,
+ size_t N, size_t K,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
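+ // Bit 31 of 'flags' gates the row-sum path in the assembly below: it is
+ // cleared before the column loop, the sdot-based row sums into z28 run for
+ // the first column block only, and the bit is set once the b_offset fixup
+ // has been applied so later column blocks reuse the same correction.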
+ __asm__ __volatile__(
+ "ptrue p8.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x28, ALL, MUL #4\n"
+ "add x27, %x[N], x28\n"
+ "sub x27, x27, #0x1\n"
+ "udiv x27, x27, x28\n"
+ "add x22, x27, #0x3\n"
+ "and x22, x22, #0xfffffffffffffffc\n"
+ "mul x22, x22, x28\n"
+ "mov x9, #0x0\n"
+ "mov x26, %x[B_ptr]\n"
+ "mov x25, %x[output_ptr]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mul x22, x22, %x[K]\n"
+ "mov x21, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x22, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x22, #0, 3f\n"
+ "lsr x22, x22, #0x1\n"
+ "lsl x21, x21, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x20, x22, #0x26\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x21, x21, #0x16\n"
+ "orr x22, x22, x20\n"
+ "orr x22, x22, x21\n"
+ ".inst 0xf8b64b5a // rprfm pldonce, x22, [x26]\n"
+ "3:" // RHS prefetch exit
+ "mov x24, %x[col_bias]\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.b, #0x1\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "4:" // Column loop
+ "cmp x27, #0x4\n"
+ "bge 34f\n"
+ "cmp x27, #0x2\n"
+ "bgt 24f\n"
+ "beq 14f\n"
+ "mov x23, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "mov x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ "whilelt p1.b, XZR, x20\n"
+ "cbz x24, 5f\n"
+ ".inst 0xa040c300 // ld1w { z0.s-z3.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x22, #0x10\n"
+ "ble 9f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b2a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b5a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b9a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bda0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "tbnz %x[flags], #31, 8f\n"
+ "sdot z28.s, z1.b, z29.b\n"
+ "8:" // Width 1: Multiply loop: unique 1: skip row sum
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
+ "bgt 7b\n"
+ "9:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc151b1a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b920 // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bd20 // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "10:" // Width 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 11f\n"
+ "sdot z28.s, z1.b, z29.b\n"
+ "11:" // Width 1: Multiply loop: unique 2: skip row sum
+ "tbnz %x[flags], #31, 12f\n"
+ "add x21, %x[qp], %[b_offset]\n"
+ "mov x20, #0x4\n"
+ "ld1rw { z26.s }, p2/Z, [x21]\n"
+ "neg z26.s, p2/M, z26.s\n"
+ "whilelt p0.s, XZR, x20\n"
+ "saddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z26.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "12:" // Width 1: skip row sum fixup
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z7.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "add x21, %x[qp], %[minval]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a1ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ "ld1rw { z30.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a2ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1bece0c // sclamp { z12.s-z15.s }, z16.s, z30.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z19.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z19.b\n"
+ "st1b { z12.b }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "13:" // Width 1: Output done
+ "b 44f\n"
+ "14:" // Width 2
+ "mov x23, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "sub x20, %x[N], x28\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ "whilelt p1.b, XZR, x20\n"
+ "cbz x24, 15f\n"
+ ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ ".inst 0xa041c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042f01 // mova za.d[x9, #1], { z24.d-z27.d }\n"
+ "b 16f\n"
+ "15:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "16:" // Width 2: setup done
+ "cmp x22, #0x10\n"
+ "ble 19f\n"
+ "17:" // Width 2: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b1a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b321 // sdot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b6a1 // sdot za.s[x9, 1], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b9a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bca0 // sdot za.s[x9, 0], { z4.b-z7.b }, z1.b[3]\n"
+ ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bd21 // sdot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "tbnz %x[flags], #31, 18f\n"
+ "sdot z28.s, z1.b, z29.b\n"
+ "18:" // Width 2: Multiply loop: unique 3: skip row sum
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
+ "bgt 17b\n"
+ "19:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc151b320 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 20f\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b9a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bf20 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bd21 // sdot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "20:" // Width 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 21f\n"
+ "sdot z28.s, z1.b, z29.b\n"
+ "21:" // Width 2: Multiply loop: unique 4: skip row sum
+ "tbnz %x[flags], #31, 22f\n"
+ "add x21, %x[qp], %[b_offset]\n"
+ "mov x20, #0x4\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
+ "whilelt p0.s, XZR, x20\n"
+ "saddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "22:" // Width 2: skip row sum fixup
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z6.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z5.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "add x21, %x[qp], %[minval]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z9.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a5aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1a9ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ ".inst 0xc1a9ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ ".inst 0xc1b5ce18 // sclamp { z24.s-z27.s }, z16.s, z21.s\n"
+ ".inst 0xc1b5ce00 // sclamp { z0.s-z3.s }, z16.s, z21.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "uzp1 z9.h, z26.h, z27.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z26.h, z2.h, z3.h\n"
+ "uzp1 z24.b, z24.b, z9.b\n"
+ "st1b { z24.b }, p2, [x25]\n"
+ "uzp1 z0.b, z0.b, z26.b\n"
+ "st1b { z0.b }, p1, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "23:" // Width 2: Output done
+ "b 44f\n"
+ "24:" // Width 3
+ "mov x20, #0x2\n"
+ "mov x23, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ "whilelt p1.b, XZR, x20\n"
+ "cbz x24, 25f\n"
+ ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ ".inst 0xa041c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
+ ".inst 0xa042c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042f02 // mova za.d[x9, #2], { z24.d-z27.d }\n"
+ "b 26f\n"
+ "25:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "26:" // Width 3: setup done
+ "cmp x22, #0x10\n"
+ "ble 29f\n"
+ "27:" // Width 3: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b5a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b6a2 // sdot za.s[x9, 2], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b920 // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
+ ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bf20 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bca1 // sdot za.s[x9, 1], { z4.b-z7.b }, z1.b[3]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "tbnz %x[flags], #31, 28f\n"
+ "sdot z28.s, z1.b, z29.b\n"
+ "28:" // Width 3: Multiply loop: unique 5: skip row sum
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
+ "bgt 27b\n"
+ "29:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc151b2a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b222 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b720 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151ba20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151ba21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151baa2 // sdot za.s[x9, 2], { z20.b-z23.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 30f\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bda0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151be21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bda2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "30:" // Width 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 31f\n"
+ "sdot z28.s, z1.b, z29.b\n"
+ "31:" // Width 3: Multiply loop: unique 6: skip row sum
+ "tbnz %x[flags], #31, 32f\n"
+ "add x21, %x[qp], %[b_offset]\n"
+ "mov x20, #0x4\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
+ "whilelt p0.s, XZR, x20\n"
+ "saddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "32:" // Width 3: skip row sum fixup
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "add x21, %x[qp], %[minval]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z3.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a3ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1a3ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc1a3ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc1a0ce08 // sclamp { z8.s-z11.s }, z16.s, z0.s\n"
+ ".inst 0xc1a0ce04 // sclamp { z4.s-z7.s }, z16.s, z0.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0xc1a0ce0c // sclamp { z12.s-z15.s }, z16.s, z0.s\n"
+ "uzp1 z18.h, z10.h, z11.h\n"
+ "uzp1 z4.h, z4.h, z5.h\n"
+ "uzp1 z17.h, z6.h, z7.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z16.h, z14.h, z15.h\n"
+ "uzp1 z8.b, z8.b, z18.b\n"
+ "st1b { z8.b }, p2, [x25]\n"
+ "uzp1 z4.b, z4.b, z17.b\n"
+ "st1b { z4.b }, p2, [x25, #1, MUL VL]\n"
+ "uzp1 z12.b, z12.b, z16.b\n"
+ "st1b { z12.b }, p1, [x25, #2, MUL VL]\n"
+ "addvl x25, x25, #3\n"
+ "33:" // Width 3: Output done
+ "b 44f\n"
+ "34:" // Width 4
+ "mov x20, #0x3\n"
+ "mov x23, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ "whilelt p1.b, XZR, x20\n"
+ "cbz x24, 35f\n"
+ ".inst 0xa040c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n"
+ ".inst 0xa041c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
+ ".inst 0xa043c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x24, x24, #16\n"
+ "b 36f\n"
+ "35:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "36:" // Width 4: setup done
+ "cmp x22, #0x10\n"
+ "ble 39f\n"
+ "37:" // Width 4: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b1a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b623 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151ba20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151ba23 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bda0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bda1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa0428359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bf22 // sdot za.s[x9, 2], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0438345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151bca3 // sdot za.s[x9, 3], { z4.b-z7.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "tbnz %x[flags], #31, 38f\n"
+ "sdot z28.s, z1.b, z29.b\n"
+ "38:" // Width 4: Multiply loop: unique 7: skip row sum
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
+ "bgt 37b\n"
+ "39:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc151b1a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b321 // sdot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+ ".inst 0xa0428349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b122 // sdot za.s[x9, 2], { z8.b-z11.b }, z1.b[0]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b223 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b621 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b5a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0438355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b6a3 // sdot za.s[x9, 3], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151ba20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151ba21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151ba23 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151be20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151be21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "40:" // Width 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 41f\n"
+ "sdot z28.s, z1.b, z29.b\n"
+ "41:" // Width 4: Multiply loop: unique 8: skip row sum
+ "tbnz %x[flags], #31, 42f\n"
+ "add x21, %x[qp], %[b_offset]\n"
+ "mov x20, #0x4\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
+ "whilelt p0.s, XZR, x20\n"
+ "saddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "42:" // Width 4: skip row sum fixup
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z11.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z7.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "add x21, %x[qp], %[minval]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z6.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z3.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+ ".inst 0xc1abac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z11.s\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xc1abac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc0062c54 // mova { z20.d-z23.d }, za.d[x9, #2]\n"
+ ".inst 0xc1abac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc0062c6c // mova { z12.d-z15.d }, za.d[x9, #3]\n"
+ ".inst 0xc1abac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "ld1rw { z31.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1bfcc78 // sclamp { z24.s-z27.s }, z3.s, z31.s\n"
+ ".inst 0xc1bfcc70 // sclamp { z16.s-z19.s }, z3.s, z31.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ ".inst 0xc1bfcc74 // sclamp { z20.s-z23.s }, z3.s, z31.s\n"
+ ".inst 0xc1bfcc6c // sclamp { z12.s-z15.s }, z3.s, z31.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "uzp1 z18.h, z18.h, z19.h\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "uzp1 z17.h, z22.h, z23.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z30.h, z14.h, z15.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p2, [x25]\n"
+ "uzp1 z16.b, z16.b, z18.b\n"
+ "st1b { z16.b }, p2, [x25, #1, MUL VL]\n"
+ "uzp1 z20.b, z20.b, z17.b\n"
+ "uzp1 z12.b, z12.b, z30.b\n"
+ "st1b { z20.b }, p2, [x25, #2, MUL VL]\n"
+ "st1b { z12.b }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "43:" // Width 4: Output done
+ "subs x27, x27, #0x4\n"
+ "sub %x[N], %x[N], x28, LSL #2\n"
+ "bgt 4b\n"
+ "44:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p8.b\n"
+ : [N] "+&r" (N), [flags] "+&r" (flags)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [output_ptr] "r" (output_ptr), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp
new file mode 100644
index 0000000000..46d8c4439b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "../std_transforms_sme.hpp"
+
+#define ARGLIST \
+ const uint8_t *, const uint8_t *, \
+ uint8_t *, size_t, size_t, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+void sme2_gemv_u8qa_dot_16VL( ARGLIST );
+
+class cls_sme2_gemv_u8qa_dot_16VL
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<uint32_t>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+
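+ // Standard SME transform configuration: 1 output row, 16 vector-lengths
+ // wide, K handled in multiples of 4 (consistent with out_width() and
+ // k_unroll() above).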
+ StdTransformsSME<operand_type, result_type, 1, 16, 4> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_u8qa_dot_16VL;
+ cls_sme2_gemv_u8qa_dot_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
new file mode 100644
index 0000000000..093feee6ce
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
@@ -0,0 +1,675 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sme2_gemv_u8qa_dot_16VL (
+ const uint8_t *A_ptr, const uint8_t *B_ptr, uint8_t *output_ptr,
+ size_t N, size_t K,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
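+ // Bit 31 of 'flags' gates the row-sum path in the assembly below: it is
+ // cleared before the column loop, the udot-based row sums into z28 run for
+ // the first column block only, and the bit is set once the b_offset fixup
+ // has been applied so later column blocks reuse the same correction.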
+ __asm__ __volatile__(
+ "ptrue p8.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x28, ALL, MUL #4\n"
+ "add x27, %x[N], x28\n"
+ "sub x27, x27, #0x1\n"
+ "udiv x27, x27, x28\n"
+ "add x22, x27, #0x3\n"
+ "and x22, x22, #0xfffffffffffffffc\n"
+ "mul x22, x22, x28\n"
+ "mov x9, #0x0\n"
+ "mov x26, %x[B_ptr]\n"
+ "mov x25, %x[output_ptr]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mul x22, x22, %x[K]\n"
+ "mov x21, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x22, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x22, #0, 3f\n"
+ "lsr x22, x22, #0x1\n"
+ "lsl x21, x21, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x20, x22, #0x26\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x21, x21, #0x16\n"
+ "orr x22, x22, x20\n"
+ "orr x22, x22, x21\n"
+ ".inst 0xf8b64b5a // rprfm pldonce, x22, [x26]\n"
+ "3:" // RHS prefetch exit
+ "mov x24, %x[col_bias]\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.b, #0x1\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "4:" // Column loop
+ "cmp x27, #0x4\n"
+ "bge 34f\n"
+ "cmp x27, #0x2\n"
+ "bgt 24f\n"
+ "beq 14f\n"
+ "mov x23, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "mov x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ "whilelt p1.b, XZR, x20\n"
+ "cbz x24, 5f\n"
+ ".inst 0xa040c300 // ld1w { z0.s-z3.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x22, #0x10\n"
+ "ble 9f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b2b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b5b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b9b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bdb0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "tbnz %x[flags], #31, 8f\n"
+ "udot z28.s, z1.b, z29.b\n"
+ "8:" // Width 1: Multiply loop: unique 1: skip row sum
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
+ "bgt 7b\n"
+ "9:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc151b1b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b930 // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bd30 // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "10:" // Width 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 11f\n"
+ "udot z28.s, z1.b, z29.b\n"
+ "11:" // Width 1: Multiply loop: unique 2: skip row sum
+ "tbnz %x[flags], #31, 12f\n"
+ "add x21, %x[qp], %[b_offset]\n"
+ "mov x20, #0x4\n"
+ "ld1rw { z26.s }, p2/Z, [x21]\n"
+ "neg z26.s, p2/M, z26.s\n"
+ "whilelt p0.s, XZR, x20\n"
+ "uaddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z26.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "12:" // Width 1: skip row sum fixup
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z7.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "add x21, %x[qp], %[minval]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a1ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ "ld1rw { z30.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a2ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1bece0c // sclamp { z12.s-z15.s }, z16.s, z30.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z19.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z19.b\n"
+ "st1b { z12.b }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "13:" // Width 1: Output done
+ "b 44f\n"
+ "14:" // Width 2
+ "mov x23, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "sub x20, %x[N], x28\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ "whilelt p1.b, XZR, x20\n"
+ "cbz x24, 15f\n"
+ ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ ".inst 0xa041c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042f01 // mova za.d[x9, #1], { z24.d-z27.d }\n"
+ "b 16f\n"
+ "15:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "16:" // Width 2: setup done
+ "cmp x22, #0x10\n"
+ "ble 19f\n"
+ "17:" // Width 2: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b1b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b331 // udot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b630 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b6b1 // udot za.s[x9, 1], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b9b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bcb0 // udot za.s[x9, 0], { z4.b-z7.b }, z1.b[3]\n"
+ ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bd31 // udot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "tbnz %x[flags], #31, 18f\n"
+ "udot z28.s, z1.b, z29.b\n"
+ "18:" // Width 2: Multiply loop: unique 3: skip row sum
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
+ "bgt 17b\n"
+ "19:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc151b330 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 20f\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b9b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bf30 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bd31 // udot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "20:" // Width 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 21f\n"
+ "udot z28.s, z1.b, z29.b\n"
+ "21:" // Width 2: Multiply loop: unique 4: skip row sum
+ "tbnz %x[flags], #31, 22f\n"
+ "add x21, %x[qp], %[b_offset]\n"
+ "mov x20, #0x4\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
+ "whilelt p0.s, XZR, x20\n"
+ "uaddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "22:" // Width 2: skip row sum fixup
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z6.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z5.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "add x21, %x[qp], %[minval]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z9.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a5aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1a9ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ ".inst 0xc1a9ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ ".inst 0xc1b5ce18 // sclamp { z24.s-z27.s }, z16.s, z21.s\n"
+ ".inst 0xc1b5ce00 // sclamp { z0.s-z3.s }, z16.s, z21.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "uzp1 z9.h, z26.h, z27.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z26.h, z2.h, z3.h\n"
+ "uzp1 z24.b, z24.b, z9.b\n"
+ "st1b { z24.b }, p2, [x25]\n"
+ "uzp1 z0.b, z0.b, z26.b\n"
+ "st1b { z0.b }, p1, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "23:" // Width 2: Output done
+ "b 44f\n"
+ "24:" // Width 3
+ "mov x20, #0x2\n"
+ "mov x23, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ "whilelt p1.b, XZR, x20\n"
+ "cbz x24, 25f\n"
+ ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
+ ".inst 0xa041c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n"
+ ".inst 0xa042c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042f02 // mova za.d[x9, #2], { z24.d-z27.d }\n"
+ "b 26f\n"
+ "25:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "26:" // Width 3: setup done
+ "cmp x22, #0x10\n"
+ "ble 29f\n"
+ "27:" // Width 3: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b230 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b5b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b6b2 // udot za.s[x9, 2], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b930 // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
+ ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bf30 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bcb1 // udot za.s[x9, 1], { z4.b-z7.b }, z1.b[3]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151be32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "tbnz %x[flags], #31, 28f\n"
+ "udot z28.s, z1.b, z29.b\n"
+ "28:" // Width 3: Multiply loop: unique 5: skip row sum
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
+ "bgt 27b\n"
+ "29:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc151b2b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b232 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b730 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b632 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151ba30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151ba31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bab2 // udot za.s[x9, 2], { z20.b-z23.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 30f\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bdb0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151be31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bdb2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "30:" // Width 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 31f\n"
+ "udot z28.s, z1.b, z29.b\n"
+ "31:" // Width 3: Multiply loop: unique 6: skip row sum
+ "tbnz %x[flags], #31, 32f\n"
+ "add x21, %x[qp], %[b_offset]\n"
+ "mov x20, #0x4\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
+ "whilelt p0.s, XZR, x20\n"
+ "uaddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "32:" // Width 3: skip row sum fixup
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "add x21, %x[qp], %[minval]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z3.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a3ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1a3ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc1a3ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc1a0ce08 // sclamp { z8.s-z11.s }, z16.s, z0.s\n"
+ ".inst 0xc1a0ce04 // sclamp { z4.s-z7.s }, z16.s, z0.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0xc1a0ce0c // sclamp { z12.s-z15.s }, z16.s, z0.s\n"
+ "uzp1 z18.h, z10.h, z11.h\n"
+ "uzp1 z4.h, z4.h, z5.h\n"
+ "uzp1 z17.h, z6.h, z7.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z16.h, z14.h, z15.h\n"
+ "uzp1 z8.b, z8.b, z18.b\n"
+ "st1b { z8.b }, p2, [x25]\n"
+ "uzp1 z4.b, z4.b, z17.b\n"
+ "st1b { z4.b }, p2, [x25, #1, MUL VL]\n"
+ "uzp1 z12.b, z12.b, z16.b\n"
+ "st1b { z12.b }, p1, [x25, #2, MUL VL]\n"
+ "addvl x25, x25, #3\n"
+ "33:" // Width 3: Output done
+ "b 44f\n"
+ "34:" // Width 4
+ "mov x20, #0x3\n"
+ "mov x23, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "msub x20, x28, x20, %x[N]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
+ "whilelt p1.b, XZR, x20\n"
+ "cbz x24, 35f\n"
+ ".inst 0xa040c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24]\n"
+ ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n"
+ ".inst 0xa041c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n"
+ ".inst 0xa043c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x24, x24, #16\n"
+ "b 36f\n"
+ "35:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "36:" // Width 4: setup done
+ "cmp x22, #0x10\n"
+ "ble 39f\n"
+ "37:" // Width 4: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b230 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b1b3 // udot za.s[x9, 3], { z12.b-z15.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151b630 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b632 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b633 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151ba30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b9b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151ba33 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151bdb0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151bdb1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[3]\n"
+ ".inst 0xa0428359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151bf32 // udot za.s[x9, 2], { z24.b-z27.b }, z1.b[3]\n"
+ ".inst 0xa0438345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151bcb3 // udot za.s[x9, 3], { z4.b-z7.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "tbnz %x[flags], #31, 38f\n"
+ "udot z28.s, z1.b, z29.b\n"
+ "38:" // Width 4: Multiply loop: unique 7: skip row sum
+ "sub x22, x22, #0x10\n"
+ "cmp x22, #0x10\n"
+ "bgt 37b\n"
+ "39:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x22\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0xc151b1b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+ ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b331 // udot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+ ".inst 0xa0428349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b132 // udot za.s[x9, 2], { z8.b-z11.b }, z1.b[0]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b233 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[0]\n"
+ "addvl x26, x26, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151b630 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151b631 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[1]\n"
+ ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151b5b2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[1]\n"
+ ".inst 0xa0438355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151b6b3 // udot za.s[x9, 3], { z20.b-z23.b }, z1.b[1]\n"
+ "addvl x26, x26, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x22, x22, #0x4\n"
+ ".inst 0xc151ba30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151ba31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151ba32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151ba33 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+ "addvl x26, x26, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xc151be30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xc151be31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ ".inst 0xc151be32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
+ ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ ".inst 0xc151be33 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[3]\n"
+ "addvl x26, x26, #16\n"
+ "40:" // Width 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 41f\n"
+ "udot z28.s, z1.b, z29.b\n"
+ "41:" // Width 4: Multiply loop: unique 8: skip row sum
+ "tbnz %x[flags], #31, 42f\n"
+ "add x21, %x[qp], %[b_offset]\n"
+ "mov x20, #0x4\n"
+ "ld1rw { z16.s }, p2/Z, [x21]\n"
+ "neg z16.s, p2/M, z16.s\n"
+ "whilelt p0.s, XZR, x20\n"
+ "uaddv d28, p0, z28.s\n"
+ "mov z28.s, z28.s[0]\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "42:" // Width 4: skip row sum fixup
+ ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z11.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z7.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[c_offset]\n"
+ "add x21, %x[qp], %[minval]\n"
+ ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z6.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n"
+ "ld1rw { z3.s }, p2/Z, [x21]\n"
+ ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+ ".inst 0xc1abac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z11.s\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xc1abac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc0062c54 // mova { z20.d-z23.d }, za.d[x9, #2]\n"
+ ".inst 0xc1abac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc0062c6c // mova { z12.d-z15.d }, za.d[x9, #3]\n"
+ ".inst 0xc1abac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "ld1rw { z31.s }, p2/Z, [x20]\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1bfcc78 // sclamp { z24.s-z27.s }, z3.s, z31.s\n"
+ ".inst 0xc1bfcc70 // sclamp { z16.s-z19.s }, z3.s, z31.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ ".inst 0xc1bfcc74 // sclamp { z20.s-z23.s }, z3.s, z31.s\n"
+ ".inst 0xc1bfcc6c // sclamp { z12.s-z15.s }, z3.s, z31.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "uzp1 z18.h, z18.h, z19.h\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "uzp1 z17.h, z22.h, z23.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z30.h, z14.h, z15.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p2, [x25]\n"
+ "uzp1 z16.b, z16.b, z18.b\n"
+ "st1b { z16.b }, p2, [x25, #1, MUL VL]\n"
+ "uzp1 z20.b, z20.b, z17.b\n"
+ "uzp1 z12.b, z12.b, z30.b\n"
+ "st1b { z20.b }, p2, [x25, #2, MUL VL]\n"
+ "st1b { z12.b }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "43:" // Width 4: Output done
+ "subs x27, x27, #0x4\n"
+ "sub %x[N], %x[N], x28, LSL #2\n"
+ "bgt 4b\n"
+ "44:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p8.b\n"
+ : [N] "+&r" (N), [flags] "+&r" (flags)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [output_ptr] "r" (output_ptr), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
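
The epilogue of this kernel (the sqdmulh / srshl / add / sclamp sequence after each ZA read-back) is the standard per-layer requantization, using exactly the Requantize32 fields named in the asm operand list. A scalar sketch of what happens to each int32 accumulator, under the assumption of per-layer (not per-channel) parameters and with saturation on the doubling multiply omitted for brevity:

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the fixup. col_bias seeds the ZA accumulators at the
    // top of the kernel; row_sum_term is the row sum already multiplied by
    // -b_offset, exactly as the kernel prepares it before addha broadcasts it.
    inline uint8_t requantize_one(int32_t acc, int32_t row_sum_term,
                                  int32_t col_bias, const Requantize32 &qp)
    {
        acc += row_sum_term + col_bias;
        // sqdmulh: doubling multiply, keep the high 32 bits.
        int64_t prod  = (int64_t)acc * (int64_t)qp.per_layer_mul;
        int32_t mulhi = (int32_t)((prod * 2) >> 32);
        // srshl: the shift is stored as a negative left-shift, so negate it
        // and apply a rounding right shift.
        int32_t shift   = -qp.per_layer_right_shift;
        int32_t shifted = (shift > 0)
            ? (int32_t)(((int64_t)mulhi + (1LL << (shift - 1))) >> shift)
            : mulhi;
        int32_t out = shifted + qp.c_offset;                  // output offset
        out = std::min(qp.maxval, std::max(qp.minval, out));  // sclamp
        return (uint8_t)out;
    }
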
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..edfb362aab
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "../bfloat.hpp"
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 1;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 2> transforms = {};
+
+ cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
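
The blocking accessors in this header scale with the streaming vector length (SVL): sme::get_vector_length<float>() is the number of 32-bit lanes in one streaming vector. As a worked example, assuming a 512-bit SVL: one vector holds 512 / 32 = 16 fp32 lanes, so out_height() = 16 x 1 = 16 rows and out_width() = 16 x 4 = 64 columns per output block, while k_unroll() = 2 matches the two bf16 elements each BFMOPA widens and accumulates per fp32 lane.
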
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..8105300cb7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include "../../bfloat.hpp"
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const bfloat16 *const A,
+ const bfloat16 *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 2) * sizeof(bfloat16)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ n_loops(((K / 2) - 1) / 2), n_tail_iters(((K / 2) - 1) % 2),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const bfloat16 *const A;
+ const bfloat16 *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ ".inst 0x25bc6530 // whilelt pn8.s, x9, x28, VLx4\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ "fmov z6.s, #1.0\n"
+ ".inst 0xa009c29d // ldnt1w { z28.s-z31.s }, p8/Z, [x20, x9, LSL #2]\n"
+ ".inst 0x809c00c0 // fmopa za0.s, p0/M, p0/M, z6.s, z28.s\n"
+ ".inst 0x809d00c1 // fmopa za1.s, p0/M, p0/M, z6.s, z29.s\n"
+ ".inst 0x809e00c2 // fmopa za2.s, p0/M, p0/M, z6.s, z30.s\n"
+ ".inst 0x809f00c3 // fmopa za3.s, p0/M, p0/M, z6.s, z31.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x9\n"
+ "mov x21, x10\n"
+ "incw x20, ALL, MUL #4\n"
+ "incw x21\n"
+ "cmp x20, x28\n"
+ "csel x21, x10, x21, LT\n"
+ "mov x20, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x21, x11\n"
+ "csel x15, x20, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "add x20, x20, #0x1\n"
+ "lsr x20, x20, #0x1\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x9, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ "ld1h { z28.h }, p0/Z, [x26]\n"
+ ".inst 0xa040a6e9 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x23]\n"
+ "ld1h { z22.h }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0xa041a6ed // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1h { z30.h }, p0/Z, [x26, #2, MUL VL]\n"
+ ".inst 0xa042a6e5 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1h { z20.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "addvl x26, x26, #4\n"
+ ".inst 0xa143a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x81880380 // bfmopa za0.s, p0/M, p0/M, z28.h, z8.h\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0x81890381 // bfmopa za1.s, p0/M, p0/M, z28.h, z9.h\n"
+ ".inst 0x818a0382 // bfmopa za2.s, p0/M, p0/M, z28.h, z10.h\n"
+ ".inst 0x818b0383 // bfmopa za3.s, p0/M, p0/M, z28.h, z11.h\n"
+ "ld1h { z28.h }, p0/Z, [x26]\n"
+ ".inst 0x818c02c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z12.h\n"
+ ".inst 0xa040a6e9 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x23]\n"
+ ".inst 0x818d02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z13.h\n"
+ ".inst 0x818e02c2 // bfmopa za2.s, p0/M, p0/M, z22.h, z14.h\n"
+ ".inst 0x818f02c3 // bfmopa za3.s, p0/M, p0/M, z22.h, z15.h\n"
+ "ld1h { z22.h }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x818403c0 // bfmopa za0.s, p0/M, p0/M, z30.h, z4.h\n"
+ ".inst 0xa041a6ed // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x818503c1 // bfmopa za1.s, p0/M, p0/M, z30.h, z5.h\n"
+ ".inst 0x818603c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z6.h\n"
+ ".inst 0x818703c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z7.h\n"
+ "ld1h { z30.h }, p0/Z, [x26, #2, MUL VL]\n"
+ ".inst 0xa042a6e5 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0x81930280 // bfmopa za0.s, p0/M, p0/M, z20.h, z19.h\n"
+ ".inst 0x81970281 // bfmopa za1.s, p0/M, p0/M, z20.h, z23.h\n"
+ ".inst 0x819b0282 // bfmopa za2.s, p0/M, p0/M, z20.h, z27.h\n"
+ ".inst 0x819f0283 // bfmopa za3.s, p0/M, p0/M, z20.h, z31.h\n"
+ "ld1h { z20.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "addvl x26, x26, #4\n"
+ ".inst 0xa143a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x81880380 // bfmopa za0.s, p0/M, p0/M, z28.h, z8.h\n"
+ ".inst 0x81890381 // bfmopa za1.s, p0/M, p0/M, z28.h, z9.h\n"
+ ".inst 0x818a0382 // bfmopa za2.s, p0/M, p0/M, z28.h, z10.h\n"
+ ".inst 0x818b0383 // bfmopa za3.s, p0/M, p0/M, z28.h, z11.h\n"
+ ".inst 0x818c02c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z12.h\n"
+ ".inst 0x818d02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z13.h\n"
+ ".inst 0x818e02c2 // bfmopa za2.s, p0/M, p0/M, z22.h, z14.h\n"
+ ".inst 0x818f02c3 // bfmopa za3.s, p0/M, p0/M, z22.h, z15.h\n"
+ ".inst 0x818403c0 // bfmopa za0.s, p0/M, p0/M, z30.h, z4.h\n"
+ ".inst 0x818503c1 // bfmopa za1.s, p0/M, p0/M, z30.h, z5.h\n"
+ ".inst 0x818603c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z6.h\n"
+ ".inst 0x818703c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z7.h\n"
+ ".inst 0x81930280 // bfmopa za0.s, p0/M, p0/M, z20.h, z19.h\n"
+ ".inst 0x81970281 // bfmopa za1.s, p0/M, p0/M, z20.h, z23.h\n"
+ ".inst 0x819b0282 // bfmopa za2.s, p0/M, p0/M, z20.h, z27.h\n"
+ ".inst 0x819f0283 // bfmopa za3.s, p0/M, p0/M, z20.h, z31.h\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1h { z8.h }, p0/Z, [x26]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x26, x26, #1\n"
+ ".inst 0xa140a6e3 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x23]\n"
+ "addvl x23, x23, #4\n"
+ ".inst 0x81830100 // bfmopa za0.s, p0/M, p0/M, z8.h, z3.h\n"
+ ".inst 0x81870101 // bfmopa za1.s, p0/M, p0/M, z8.h, z7.h\n"
+ ".inst 0x818b0102 // bfmopa za2.s, p0/M, p0/M, z8.h, z11.h\n"
+ ".inst 0x818f0103 // bfmopa za3.s, p0/M, p0/M, z8.h, z15.h\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 24f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 24f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
+ "sub x24, x11, x10\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "tbz x15, #2, 18f\n"
+ "cntw x20\n"
+ "cmp x24, x20\n"
+ "csel x22, x24, x20, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa160c323 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x20, 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x24, x24, x22\n"
+ "beq 18f\n"
+ "b 22f\n"
+ "18:" // Store to output array: Skip activation: End
+ "cntw x20\n"
+ "cmp x24, x20\n"
+ "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x20, x24, x20, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 20f\n"
+ "19:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa160c333 // st1w { z19.s, z23.s, z27.s, z31.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 21f\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
+ "21:" // Store to output array: Accumulator row 0 oddments: End
+ "22:" // Store to output array: End
+ "tbz x15, #0, 24f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "23:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x14, x14, #16\n"
+ "blt 23b\n"
+ "24:" // End block
+ "incw x9, ALL, MUL #4\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
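
Stripped of the interleaving, buffering, and predication, the MOPA loop above computes a bias-seeded widening outer-product accumulation followed by the optional activation clamp. A scalar reference sketch, assuming plain row-major A and B rather than the VL-interleaved panels the real kernel consumes (bf16_to_f32 and the layouts here are illustrative assumptions):

    #include <algorithm>
    #include <cstdint>
    #include <cstring>

    // bf16 -> f32: bf16 is the high half of an IEEE-754 single.
    static float bf16_to_f32(uint16_t h)
    {
        uint32_t bits = (uint32_t)h << 16;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }

    // Scalar model of the bf16 -> fp32 MOPA kernel:
    // C[m][n] = clamp(bias[n] + sum_k A[m][k] * B[k][n]).
    // When the activation is None the kernel leaves min/max at +/-infinity,
    // so the clamp is a no-op, matching the SKIP_ACTIVATION path above.
    void mopa_bf16fp32_reference(const uint16_t *A, const uint16_t *B, float *C,
                                 int M, int N, int K, const float *bias,
                                 float clamp_min, float clamp_max)
    {
        for (int m = 0; m < M; ++m)
        {
            for (int n = 0; n < N; ++n)
            {
                float acc = bias ? bias[n] : 0.0f;  // seeded via fmopa with 1.0
                for (int k = 0; k < K; ++k)
                {
                    acc += bf16_to_f32(A[m * K + k]) * bf16_to_f32(B[k * N + n]);
                }
                C[m * N + n] = std::min(clamp_max, std::max(clamp_min, acc));
            }
        }
    }
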
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..ca7b0573fc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "../bfloat.hpp"
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 2;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 2;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL;
+
+ StdTransformsSME<operand_type, result_type, 2, 2, 2> transforms = {};
+
+ cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
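
Both this 2VLx2VL variant and the 1VLx4VL kernel above use the same four fp32 ZA tiles (za0.s-za3.s); they differ only in how those tiles are laid over the output. Under the same 512-bit SVL assumption as before, 1VLx4VL covers a 16 x 64 block per iteration while 2VLx2VL covers 32 x 32, so the squarer shape trades column coverage for row coverage and tends to suit problems where M and N are comparable.
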
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..20c1de9418
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include "../../bfloat.hpp"
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const bfloat16 *const A,
+ const bfloat16 *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 2) * sizeof(bfloat16)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ n_loops(((K / 2) - 1) / 2), n_tail_iters(((K / 2) - 1) % 2),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const bfloat16 *const A;
+ const bfloat16 *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa042c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ ".inst 0x25a94550 // whilelt pn8.s, x10, x9, VLx2\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ "fmov z12.s, #1.0\n"
+ ".inst 0xa10a4289 // ldnt1w { z1.s, z9.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0x80810180 // fmopa za0.s, p0/M, p0/M, z12.s, z1.s\n"
+ ".inst 0x80890181 // fmopa za1.s, p0/M, p0/M, z12.s, z9.s\n"
+ ".inst 0x80810182 // fmopa za2.s, p0/M, p0/M, z12.s, z1.s\n"
+ ".inst 0x80890183 // fmopa za3.s, p0/M, p0/M, z12.s, z9.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20, ALL, MUL #2\n"
+ "incw x21, ALL, MUL #2\n"
+ "cmp x20, x9\n"
+ "csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "add x20, x20, #0x1\n"
+ "lsr x20, x20, #0x1\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0402772 // ld1h { z18.h-z19.h }, pn9.b/Z, [x27]\n"
+ ".inst 0xa04026e3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23]\n"
+ ".inst 0xa0412764 // ld1h { z4.h-z5.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa04126fb // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa042276a // ld1h { z10.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04226f5 // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0432766 // ld1h { z6.h-z7.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa04326e9 // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x81820240 // bfmopa za0.s, p0/M, p0/M, z18.h, z2.h\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0x81830241 // bfmopa za1.s, p0/M, p0/M, z18.h, z3.h\n"
+ ".inst 0x81820262 // bfmopa za2.s, p0/M, p0/M, z19.h, z2.h\n"
+ ".inst 0x81830263 // bfmopa za3.s, p0/M, p0/M, z19.h, z3.h\n"
+ ".inst 0xa0402772 // ld1h { z18.h-z19.h }, pn9.b/Z, [x27]\n"
+ ".inst 0x819a0080 // bfmopa za0.s, p0/M, p0/M, z4.h, z26.h\n"
+ ".inst 0xa04026e3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23]\n"
+ ".inst 0x819b0081 // bfmopa za1.s, p0/M, p0/M, z4.h, z27.h\n"
+ ".inst 0x819a00a2 // bfmopa za2.s, p0/M, p0/M, z5.h, z26.h\n"
+ ".inst 0x819b00a3 // bfmopa za3.s, p0/M, p0/M, z5.h, z27.h\n"
+ ".inst 0xa0412764 // ld1h { z4.h-z5.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0x81940140 // bfmopa za0.s, p0/M, p0/M, z10.h, z20.h\n"
+ ".inst 0xa04126fb // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0x81950141 // bfmopa za1.s, p0/M, p0/M, z10.h, z21.h\n"
+ ".inst 0x81940162 // bfmopa za2.s, p0/M, p0/M, z11.h, z20.h\n"
+ ".inst 0x81950163 // bfmopa za3.s, p0/M, p0/M, z11.h, z21.h\n"
+ ".inst 0xa042276a // ld1h { z10.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04226f5 // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x818800c0 // bfmopa za0.s, p0/M, p0/M, z6.h, z8.h\n"
+ ".inst 0x818900c1 // bfmopa za1.s, p0/M, p0/M, z6.h, z9.h\n"
+ ".inst 0x818800e2 // bfmopa za2.s, p0/M, p0/M, z7.h, z8.h\n"
+ ".inst 0x818900e3 // bfmopa za3.s, p0/M, p0/M, z7.h, z9.h\n"
+ ".inst 0xa0432766 // ld1h { z6.h-z7.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa04326e9 // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x81820240 // bfmopa za0.s, p0/M, p0/M, z18.h, z2.h\n"
+ ".inst 0x81830241 // bfmopa za1.s, p0/M, p0/M, z18.h, z3.h\n"
+ ".inst 0x81820262 // bfmopa za2.s, p0/M, p0/M, z19.h, z2.h\n"
+ ".inst 0x81830263 // bfmopa za3.s, p0/M, p0/M, z19.h, z3.h\n"
+ ".inst 0x819a0080 // bfmopa za0.s, p0/M, p0/M, z4.h, z26.h\n"
+ ".inst 0x819b0081 // bfmopa za1.s, p0/M, p0/M, z4.h, z27.h\n"
+ ".inst 0x819a00a2 // bfmopa za2.s, p0/M, p0/M, z5.h, z26.h\n"
+ ".inst 0x819b00a3 // bfmopa za3.s, p0/M, p0/M, z5.h, z27.h\n"
+ ".inst 0x81940140 // bfmopa za0.s, p0/M, p0/M, z10.h, z20.h\n"
+ ".inst 0x81950141 // bfmopa za1.s, p0/M, p0/M, z10.h, z21.h\n"
+ ".inst 0x81940162 // bfmopa za2.s, p0/M, p0/M, z11.h, z20.h\n"
+ ".inst 0x81950163 // bfmopa za3.s, p0/M, p0/M, z11.h, z21.h\n"
+ ".inst 0x818800c0 // bfmopa za0.s, p0/M, p0/M, z6.h, z8.h\n"
+ ".inst 0x818900c1 // bfmopa za1.s, p0/M, p0/M, z6.h, z9.h\n"
+ ".inst 0x818800e2 // bfmopa za2.s, p0/M, p0/M, z7.h, z8.h\n"
+ ".inst 0x818900e3 // bfmopa za3.s, p0/M, p0/M, z7.h, z9.h\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa040277e // ld1h { z30.h-z31.h }, pn9.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0xa14026e5 // ld1h { z5.h, z13.h }, pn9.b/Z, [x23]\n"
+ "addvl x23, x23, #2\n"
+ ".inst 0x818503c0 // bfmopa za0.s, p0/M, p0/M, z30.h, z5.h\n"
+ ".inst 0x818d03c1 // bfmopa za1.s, p0/M, p0/M, z30.h, z13.h\n"
+ ".inst 0x818503e2 // bfmopa za2.s, p0/M, p0/M, z31.h, z5.h\n"
+ ".inst 0x818d03e3 // bfmopa za3.s, p0/M, p0/M, z31.h, z13.h\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 30f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 30f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10, LSL #2\n" // C += n
+ "sub x25, x13, x11\n"
+ "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x26, x11, x24, x26\n" // C += m * ldc
+ "tbz x16, #2, 21f\n"
+ "cntw x23\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x20, 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 17f\n"
+ ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 21f\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 19f\n"
+ "18:" // Store to output array: Skip activation: Accumulator row 1 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Skip activation: Accumulator row 1 oddments
+ "cbz x20, 20f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 20f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 20f\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 21f\n"
+ "b 28f\n"
+ "21:" // Store to output array: Skip activation: End
+ "cntw x23\n"
+ "cmp x25, x23\n"
+ "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 23f\n"
+ "22:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604357 // st1w { z23.s, z31.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 22b\n"
+ "23:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 24f\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 24f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 24f\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "24:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 28f\n"
+ "cmp x25, x23\n"
+ "csel x20, x25, x23, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 26f\n"
+ "25:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604353 // st1w { z19.s, z27.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 25b\n"
+ "26:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 27f\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 27f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 27f\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ "27:" // Store to output array: Accumulator row 1 oddments: End
+ "28:" // Store to output array: End
+ "tbz x16, #0, 30f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "29:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 29b\n"
+ "30:" // End block
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #2\n"
+ "cmp x11, x13\n"
+ "mov x10, #0x0\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
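The assembly above walks the output in 2VLx2VL tiles: x11 counts rows against M (outer loop) and x10 counts columns against N (inner loop), both stepped by two vector lengths via "incw ..., ALL, MUL #2" in the end block (label 30). A minimal C++ sketch of that traversal, assuming `vl` stands in for the fp32 vector length (svcntw()) and `process_tile` is a hypothetical placeholder for the MOPA accumulation and store path:

    // Sketch only: traversal order of the 2VLx2VL kernel's M/N loop (label 3).
    // vl = fp32 elements per SVE vector; process_tile is hypothetical.
    #include <cstddef>

    void traverse_2VLx2VL(std::size_t M, std::size_t N, std::size_t vl)
    {
        const std::size_t tile_m = 2 * vl; // rows: {za0,za1} strip + {za2,za3} strip
        const std::size_t tile_n = 2 * vl; // cols: two VL-wide strips per row strip
        for (std::size_t m = 0; m < M; m += tile_m)     // outer loop (x11 vs x13)
        {
            for (std::size_t n = 0; n < N; n += tile_n) // inner loop (x10 vs x9)
            {
                // process_tile(m, n): accumulate a 2VLx2VL block of C in ZA
                // tiles za0..za3, then run the store path (labels 14-28).
            }
        }
    }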
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..7b31d6d2db
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "../bfloat.hpp"
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 1;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL;
+
+ StdTransformsSME<operand_type, result_type, 4, 1, 2> transforms = {};
+
+ cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
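The blocking parameters above are the only geometry the framework needs, and they scale with the runtime vector length rather than being fixed constants. A hedged sketch of how a caller might derive the 4VLx1VL tile shape and the per-tile fp32 accumulator footprint (the buffer path taken when C is null), with `vl_words` standing in for sme::get_vector_length<float>():

    // Sketch only: tile geometry implied by out_height()/out_width() above.
    // vl_words is a stand-in for sme::get_vector_length<float>().
    #include <cstddef>

    struct TileGeometry
    {
        std::size_t rows;       // C rows per tile
        std::size_t cols;       // C columns per tile
        std::size_t acc_floats; // fp32 accumulator slots per tile
    };

    TileGeometry geometry_4VLx1VL(std::size_t vl_words)
    {
        TileGeometry g;
        g.rows       = vl_words * 4; // four ZA tiles stacked vertically
        g.cols       = vl_words * 1; // one ZA tile wide
        g.acc_floats = g.rows * g.cols;
        return g;
    }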
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..70c94d32a3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,616 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include "../../bfloat.hpp"
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const bfloat16 *const A,
+ const bfloat16 *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 2) * sizeof(bfloat16)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ n_loops(((K / 2) - 1) / 2), n_tail_iters(((K / 2) - 1) % 2),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const bfloat16 *const A;
+ const bfloat16 *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ "whilelt p0.s, x10, x9\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ "fmov z11.s, #1.0\n"
+ "ldnt1w { z13.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0x808d2560 // fmopa za0.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2561 // fmopa za1.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2562 // fmopa za2.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2563 // fmopa za3.s, p1/M, p1/M, z11.s, z13.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20\n"
+ "incw x21, ALL, MUL #4\n"
+ "cmp x20, x9\n"
+ "csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "add x20, x20, #0x1\n"
+ "lsr x20, x20, #0x1\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa140a360 // ld1h { z0.h, z4.h, z8.h, z12.h }, pn8.b/Z, [x27]\n"
+ "ldnt1h { z19.h }, p1/Z, [x23]\n"
+ ".inst 0xa141a371 // ld1h { z17.h, z21.h, z25.h, z29.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1h { z22.h }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa142a370 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1h { z23.h }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa143a363 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ldnt1h { z2.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x81932400 // bfmopa za0.s, p1/M, p1/M, z0.h, z19.h\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0x81932481 // bfmopa za1.s, p1/M, p1/M, z4.h, z19.h\n"
+ ".inst 0x81932502 // bfmopa za2.s, p1/M, p1/M, z8.h, z19.h\n"
+ ".inst 0x81932583 // bfmopa za3.s, p1/M, p1/M, z12.h, z19.h\n"
+ ".inst 0xa140a360 // ld1h { z0.h, z4.h, z8.h, z12.h }, pn8.b/Z, [x27]\n"
+ ".inst 0x81962620 // bfmopa za0.s, p1/M, p1/M, z17.h, z22.h\n"
+ "ldnt1h { z19.h }, p1/Z, [x23]\n"
+ ".inst 0x819626a1 // bfmopa za1.s, p1/M, p1/M, z21.h, z22.h\n"
+ ".inst 0x81962722 // bfmopa za2.s, p1/M, p1/M, z25.h, z22.h\n"
+ ".inst 0x819627a3 // bfmopa za3.s, p1/M, p1/M, z29.h, z22.h\n"
+ ".inst 0xa141a371 // ld1h { z17.h, z21.h, z25.h, z29.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0x81972600 // bfmopa za0.s, p1/M, p1/M, z16.h, z23.h\n"
+ "ldnt1h { z22.h }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0x81972681 // bfmopa za1.s, p1/M, p1/M, z20.h, z23.h\n"
+ ".inst 0x81972702 // bfmopa za2.s, p1/M, p1/M, z24.h, z23.h\n"
+ ".inst 0x81972783 // bfmopa za3.s, p1/M, p1/M, z28.h, z23.h\n"
+ ".inst 0xa142a370 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1h { z23.h }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x81822460 // bfmopa za0.s, p1/M, p1/M, z3.h, z2.h\n"
+ ".inst 0x818224e1 // bfmopa za1.s, p1/M, p1/M, z7.h, z2.h\n"
+ ".inst 0x81822562 // bfmopa za2.s, p1/M, p1/M, z11.h, z2.h\n"
+ ".inst 0x818225e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z2.h\n"
+ ".inst 0xa143a363 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ldnt1h { z2.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x81932400 // bfmopa za0.s, p1/M, p1/M, z0.h, z19.h\n"
+ ".inst 0x81932481 // bfmopa za1.s, p1/M, p1/M, z4.h, z19.h\n"
+ ".inst 0x81932502 // bfmopa za2.s, p1/M, p1/M, z8.h, z19.h\n"
+ ".inst 0x81932583 // bfmopa za3.s, p1/M, p1/M, z12.h, z19.h\n"
+ ".inst 0x81962620 // bfmopa za0.s, p1/M, p1/M, z17.h, z22.h\n"
+ ".inst 0x819626a1 // bfmopa za1.s, p1/M, p1/M, z21.h, z22.h\n"
+ ".inst 0x81962722 // bfmopa za2.s, p1/M, p1/M, z25.h, z22.h\n"
+ ".inst 0x819627a3 // bfmopa za3.s, p1/M, p1/M, z29.h, z22.h\n"
+ ".inst 0x81972600 // bfmopa za0.s, p1/M, p1/M, z16.h, z23.h\n"
+ ".inst 0x81972681 // bfmopa za1.s, p1/M, p1/M, z20.h, z23.h\n"
+ ".inst 0x81972702 // bfmopa za2.s, p1/M, p1/M, z24.h, z23.h\n"
+ ".inst 0x81972783 // bfmopa za3.s, p1/M, p1/M, z28.h, z23.h\n"
+ ".inst 0x81822460 // bfmopa za0.s, p1/M, p1/M, z3.h, z2.h\n"
+ ".inst 0x818224e1 // bfmopa za1.s, p1/M, p1/M, z7.h, z2.h\n"
+ ".inst 0x81822562 // bfmopa za2.s, p1/M, p1/M, z11.h, z2.h\n"
+ ".inst 0x818225e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z2.h\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa140a373 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn8.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x27, x27, #4\n"
+ "ld1h { z11.h }, p1/Z, [x23]\n"
+ "addvl x23, x23, #1\n"
+ ".inst 0x818b2660 // bfmopa za0.s, p1/M, p1/M, z19.h, z11.h\n"
+ ".inst 0x818b26e1 // bfmopa za1.s, p1/M, p1/M, z23.h, z11.h\n"
+ ".inst 0x818b2762 // bfmopa za2.s, p1/M, p1/M, z27.h, z11.h\n"
+ ".inst 0x818b27e3 // bfmopa za3.s, p1/M, p1/M, z31.h, z11.h\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 42f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1cc // st1w { z12.s-z15.s }, pn8.b, [x14]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 42f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10, LSL #2\n" // C += n
+ "sub x25, x13, x11\n"
+ "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x26, x11, x24, x26\n" // C += m * ldc
+ "tbz x16, #2, 27f\n"
+ "cntw x23\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z5.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z7.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x20, 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 17f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z5.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 17f\n"
+ "st1w { z6.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 19f\n"
+ "18:" // Store to output array: Skip activation: Accumulator row 1 loop
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ "st1w { z8.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z9.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z10.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z11.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Skip activation: Accumulator row 1 oddments
+ "cbz x20, 20f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ "st1w { z24.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 20f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 20f\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 22f\n"
+ "21:" // Store to output array: Skip activation: Accumulator row 2 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z5.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z7.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 21b\n"
+ "22:" // Store to output array: Skip activation: Accumulator row 2 oddments
+ "cbz x20, 23f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ "st1w { z12.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 23f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z13.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 23f\n"
+ "st1w { z14.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "23:" // Store to output array: Skip activation: Accumulator row 2 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 25f\n"
+ "24:" // Store to output array: Skip activation: Accumulator row 3 loop
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z19.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 24b\n"
+ "25:" // Store to output array: Skip activation: Accumulator row 3 oddments
+ "cbz x20, 26f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 26f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 26f\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "26:" // Store to output array: Skip activation: Accumulator row 3 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "b 40f\n"
+ "27:" // Store to output array: Skip activation: End
+ "cntw x23\n"
+ "cmp x25, x23\n"
+ "ld1rw { z21.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "ld1rw { z20.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 29f\n"
+ "28:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1w { z28.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z29.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z30.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z31.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 28b\n"
+ "29:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 30f\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1w { z28.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 30f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z29.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 30f\n"
+ "st1w { z30.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "30:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 40f\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 32f\n"
+ "31:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1w { z4.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z5.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z7.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 31b\n"
+ "32:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 33f\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 33f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 33f\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "33:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 40f\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 35f\n"
+ "34:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z19.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 34b\n"
+ "35:" // Store to output array: Accumulator row 2 oddments
+ "cbz x20, 36f\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 36f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 36f\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "36:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 40f\n"
+ "cmp x25, x23\n"
+ "csel x20, x25, x23, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 38f\n"
+ "37:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z19.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 37b\n"
+ "38:" // Store to output array: Accumulator row 3 oddments
+ "cbz x20, 39f\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 39f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 39f\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "39:" // Store to output array: Accumulator row 3 oddments: End
+ "40:" // Store to output array: End
+ "tbz x16, #0, 42f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "41:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 41b\n"
+ "42:" // End block
+ "incw x10\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #4\n"
+ "cmp x11, x13\n"
+ "mov x10, #0x0\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
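A note on the K-dimension bookkeeping shared by these bf16 kernels (label 5 in the assembly above): each BFMOPA consumes bf16 elements in pairs (k_unroll() == 2), and the paired blocks are split into a 4x-unrolled main loop plus an oddments loop, with the first block's loads peeled ahead of label 6 so the main loop is software-pipelined. The arithmetic ("add x20, x20, #0x1; lsr x20, x20, #0x1; lsr x22, x20, #0x2; and x21, x20, #0x3") reduces to the following sketch:

    // Sketch only: K split computed at label 5 of the bf16 mopa kernels.
    #include <cstddef>

    struct KSplit
    {
        std::size_t unrolled; // iterations of the 4x-unrolled K loop
        std::size_t oddments; // leftover bf16 pairs, handled one at a time
    };

    KSplit split_k(std::size_t K)
    {
        const std::size_t kblocks = (K + 1) / 2; // ceil(K / 2): bf16 pairs
        return KSplit{kblocks / 4, kblocks % 4};
    }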
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..a9196958c7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const __fp16 *const A, const __fp16 *const B, __fp16 *const C, int ldc, const int M, const int N, const int K, const __fp16 *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL
+{
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)(const __fp16 *const A, const __fp16 *const B, __fp16 *const C, int ldc, const int M, const int N, const int K, const __fp16 *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 1;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 2> transforms = {};
+
+ cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SVE
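Unlike the bf16fp32 kernels above, this fp16fp32fp16 variant accumulates in fp32 ZA tiles but writes fp16 results: the store path in the matching generic.cpp (label 15 below) converts pairs of fp32 vectors to fp16 with FCVT and then applies FCLAMP with the activation bounds before the ST1H. A scalar sketch of that epilogue, under the assumption that plain loops are an adequate stand-in for the SME2 instructions:

    // Sketch only: scalar equivalent of the fcvt + fclamp store epilogue.
    #include <algorithm>
    #include <cstddef>

    void store_row_fp16(const float *acc, __fp16 *c, std::size_t n,
                        __fp16 lo, __fp16 hi)
    {
        for (std::size_t j = 0; j < n; ++j)
        {
            const __fp16 v = static_cast<__fp16>(acc[j]); // fp32 -> fp16 (fcvt)
            c[j] = std::min(hi, std::max(lo, v));         // clamp (fclamp)
        }
    }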
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..ad10ce7993
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL(const __fp16 *const A, const __fp16 *const B, __fp16 *const C, int ldc, const int M, const int N, const int K, const __fp16 *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const __fp16 *const A,
+ const __fp16 *const B,
+ __fp16 *const C, const int ldc,
+ const int M, const int N, const int K,
+ const __fp16 *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 2) * sizeof(__fp16)),
+ C(C), ldcb(ldc * sizeof(__fp16)),
+ M(M), N(N), K(K),
+ min(-static_cast<__fp16>(std::numeric_limits<float>::infinity())),
+ max(static_cast<__fp16>(std::numeric_limits<float>::infinity())),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<__fp16>(0);
+ break;
+ }
+ }
+
+ const __fp16 *const A;
+ const __fp16 *const B;
+ const long kstride_bytes;
+ __fp16 *const C;
+ const long ldcb;
+ const long M, N, K;
+ __fp16 min = -static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ __fp16 max = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+
+ const __fp16 *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x13, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x10, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x13, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c578 // ld1w { z24.s-z27.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa042c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa043c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
+ "addvl x11, x11, #16\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w9, [%x[args], %[offsetof_M]]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldr w26, [%x[args], %[offsetof_N]]\n"
+ "ldr x25, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x24, x25\n"
+ "tbnz x13, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ ".inst 0x257a4770 // whilelt pn8.h, x27, x26, VLx2\n"
+ "fmov z6.h, #0.0\n"
+ "fmov z19.h, #1.0\n"
+ ".inst 0xa01b2295 // ldnt1h { z20.h-z21.h }, p8/Z, [x20, x27, LSL #1]\n"
+ "zip1 z23.h, z20.h, z6.h\n"
+ "zip2 z12.h, z20.h, z6.h\n"
+ "zip1 z16.h, z21.h, z6.h\n"
+ "zip2 z8.h, z21.h, z6.h\n"
+ ".inst 0x81b70260 // fmopa za0.s, p0/M, p0/M, z19.h, z23.h\n"
+ ".inst 0x81ac0261 // fmopa za1.s, p0/M, p0/M, z19.h, z12.h\n"
+ ".inst 0x81b00262 // fmopa za2.s, p0/M, p0/M, z19.h, z16.h\n"
+ ".inst 0x81a80263 // fmopa za3.s, p0/M, p0/M, z19.h, z8.h\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x27\n"
+ "mov x21, x28\n"
+ "incw x20, ALL, MUL #4\n"
+ "incw x21\n"
+ "cmp x20, x26\n"
+ "mov x20, x13\n"
+ "csel x21, x28, x21, LT\n"
+ "bfm x13, XZR, #0x0, #0x0 // bfc x13, #0x0, #0x1\n"
+ "cmp x21, x9\n"
+ "csel x13, x20, x13, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "add x20, x20, #0x1\n"
+ "lsr x20, x20, #0x1\n"
+ "lsr x21, x20, #0x2\n"
+ "and x20, x20, #0x3\n"
+ "madd x23, x27, x22, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ "ld1h { z21.h }, p0/Z, [x24]\n"
+ ".inst 0xa140a6f8 // ldnt1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x23]\n"
+ "ld1h { z29.h }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa041a6ed // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1h { z4.h }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa042a6e1 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1h { z25.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa143a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x81b002a0 // fmopa za0.s, p0/M, p0/M, z21.h, z16.h\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x81b402a1 // fmopa za1.s, p0/M, p0/M, z21.h, z20.h\n"
+ ".inst 0x81b802a2 // fmopa za2.s, p0/M, p0/M, z21.h, z24.h\n"
+ ".inst 0x81bc02a3 // fmopa za3.s, p0/M, p0/M, z21.h, z28.h\n"
+ "ld1h { z21.h }, p0/Z, [x24]\n"
+ ".inst 0x81ac03a0 // fmopa za0.s, p0/M, p0/M, z29.h, z12.h\n"
+ ".inst 0xa140a6f0 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x23]\n"
+ ".inst 0x81ad03a1 // fmopa za1.s, p0/M, p0/M, z29.h, z13.h\n"
+ ".inst 0x81ae03a2 // fmopa za2.s, p0/M, p0/M, z29.h, z14.h\n"
+ ".inst 0x81af03a3 // fmopa za3.s, p0/M, p0/M, z29.h, z15.h\n"
+ "ld1h { z29.h }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x81a00080 // fmopa za0.s, p0/M, p0/M, z4.h, z0.h\n"
+ ".inst 0xa041a6ec // ld1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x81a10081 // fmopa za1.s, p0/M, p0/M, z4.h, z1.h\n"
+ ".inst 0x81a20082 // fmopa za2.s, p0/M, p0/M, z4.h, z2.h\n"
+ ".inst 0x81a30083 // fmopa za3.s, p0/M, p0/M, z4.h, z3.h\n"
+ "ld1h { z4.h }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa042a6e0 // ld1h { z0.h-z3.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0x81b30320 // fmopa za0.s, p0/M, p0/M, z25.h, z19.h\n"
+ ".inst 0x81b70321 // fmopa za1.s, p0/M, p0/M, z25.h, z23.h\n"
+ ".inst 0x81bb0322 // fmopa za2.s, p0/M, p0/M, z25.h, z27.h\n"
+ ".inst 0x81bf0323 // fmopa za3.s, p0/M, p0/M, z25.h, z31.h\n"
+ "ld1h { z25.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa143a6f3 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x81b002a0 // fmopa za0.s, p0/M, p0/M, z21.h, z16.h\n"
+ ".inst 0x81b402a1 // fmopa za1.s, p0/M, p0/M, z21.h, z20.h\n"
+ ".inst 0x81b802a2 // fmopa za2.s, p0/M, p0/M, z21.h, z24.h\n"
+ ".inst 0x81bc02a3 // fmopa za3.s, p0/M, p0/M, z21.h, z28.h\n"
+ ".inst 0x81ac03a0 // fmopa za0.s, p0/M, p0/M, z29.h, z12.h\n"
+ ".inst 0x81ad03a1 // fmopa za1.s, p0/M, p0/M, z29.h, z13.h\n"
+ ".inst 0x81ae03a2 // fmopa za2.s, p0/M, p0/M, z29.h, z14.h\n"
+ ".inst 0x81af03a3 // fmopa za3.s, p0/M, p0/M, z29.h, z15.h\n"
+ ".inst 0x81a00080 // fmopa za0.s, p0/M, p0/M, z4.h, z0.h\n"
+ ".inst 0x81a10081 // fmopa za1.s, p0/M, p0/M, z4.h, z1.h\n"
+ ".inst 0x81a20082 // fmopa za2.s, p0/M, p0/M, z4.h, z2.h\n"
+ ".inst 0x81a30083 // fmopa za3.s, p0/M, p0/M, z4.h, z3.h\n"
+ ".inst 0x81b30320 // fmopa za0.s, p0/M, p0/M, z25.h, z19.h\n"
+ ".inst 0x81b70321 // fmopa za1.s, p0/M, p0/M, z25.h, z23.h\n"
+ ".inst 0x81bb0322 // fmopa za2.s, p0/M, p0/M, z25.h, z27.h\n"
+ ".inst 0x81bf0323 // fmopa za3.s, p0/M, p0/M, z25.h, z31.h\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1h { z21.h }, p0/Z, [x24]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x24, x24, #1\n"
+ ".inst 0xa140a6f0 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x23]\n"
+ "addvl x23, x23, #4\n"
+ ".inst 0x81b002a0 // fmopa za0.s, p0/M, p0/M, z21.h, z16.h\n"
+ ".inst 0x81b402a1 // fmopa za1.s, p0/M, p0/M, z21.h, z20.h\n"
+ ".inst 0x81b802a2 // fmopa za2.s, p0/M, p0/M, z21.h, z24.h\n"
+ ".inst 0x81bc02a3 // fmopa za3.s, p0/M, p0/M, z21.h, z28.h\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x13, #1, 14f\n"
+ "tbz x13, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xa041c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa042c568 // ld1w { z8.s-z11.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa043c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ "addvl x11, x11, #16\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa060c540 // st1w { z0.s-z3.s }, pn9.b, [x10]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa061c558 // st1w { z24.s-z27.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa062c544 // st1w { z4.s-z7.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ "cmp x12, x20\n"
+ ".inst 0xa063c55c // st1w { z28.s-z31.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 11b\n"
+ "b 18f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa060c540 // st1w { z0.s-z3.s }, pn9.b, [x10]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa061c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c544 // st1w { z4.s-z7.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 13b\n"
+ "b 18f\n"
+ "14:" // Store to output array
+ "ldr x23, [%x[args], %[offsetof_C]]\n"
+ "sub x22, x9, x28\n"
+ "cntw x21\n"
+ "ld1rh { z17.h }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "ldr x20, [%x[args], %[offsetof_ldcb]]\n"
+ ".inst 0x257a4770 // whilelt pn8.h, x27, x26, VLx2\n"
+ "cmp x22, x21\n"
+ "ld1rh { z16.h }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "csel x22, x22, x21, LT\n"
+ "add x23, x23, x27, LSL #1\n" // C += n
+ "madd x23, x28, x20, x23\n" // C += m * ldc
+ "15:" // Store to output array: Accumulator loop
+ ".inst 0xc0060414 // mova { z20.b-z23.b }, za0h.b[x12, 0:3]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc120e28e // fcvt z14.h, { z20.s-z21.s }\n"
+ ".inst 0xc120e2cf // fcvt z15.h, { z22.s-z23.s }\n"
+ "cmp x12, x22, LSL #2\n"
+ ".inst 0xc170c22e // fclamp { z14.h-z15.h }, z17.h, z16.h\n"
+ ".inst 0xa06022ee // st1h { z14.h-z15.h }, p8, [x23]\n"
+ "add x23, x23, x20\n"
+ "blt 15b\n"
+ "16:" // Store to output array: End
+ "tbz x13, #0, 18f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "17:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c578 // ld1w { z24.s-z27.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xa041c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa043c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
+ "addvl x11, x11, #16\n"
+ ".inst 0xc0840681 // mova za1h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "blt 17b\n"
+ "18:" // End block
+ "incw x27, ALL, MUL #4\n"
+ "cmp x27, x26\n"
+ "blt 3b\n"
+ "incw x28\n"
+ "mov x27, #0x0\n"
+ "cmp x28, x9\n"
+ "mov x25, x24\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
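
The kernel ending above follows the same nomerge MOPA pattern as the rest of this patch: interleaved FP16 operand panels are streamed through the widening `fmopa` instruction, which accumulates into FP32 ZA tiles. As a minimal sketch of what each `fmopa za.s, ..., zn.h, zm.h` step computes (an editor's illustration, not part of the patch; `tile` stands in for SVL/32, and `_Float16` assumes toolchain support):

```cpp
#include <cstddef>

using fp16 = _Float16;

// One widening FMOPA step: every FP32 ZA element accumulates the widened
// products of a pair of FP16 row elements with a pair of FP16 column elements.
// This pairing is why these kernels unroll K by two and round it up to a
// multiple of two when packing. The bias is applied the same way, via an fmopa
// of an all-ones row vector against a zip-expanded (bias, 0) column vector.
void fmopa_widening_model(float *za, const fp16 *zn, const fp16 *zm, std::size_t tile)
{
    for (std::size_t r = 0; r < tile; ++r)
        for (std::size_t c = 0; c < tile; ++c)
            za[r * tile + c] += static_cast<float>(zn[2 * r])     * static_cast<float>(zm[2 * c])
                              + static_cast<float>(zn[2 * r + 1]) * static_cast<float>(zm[2 * c + 1]);
}
```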
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..5bd34b2ca0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL(const __fp16 *const A, const __fp16 *const B, __fp16 *const C, int ldc, const int M, const int N, const int K, const __fp16 *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL
+{
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)(const __fp16 *const A, const __fp16 *const B, __fp16 *const C, int ldc, const int M, const int N, const int K, const __fp16 *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 2;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 2;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL;
+
+ StdTransformsSME<operand_type, result_type, 2, 2, 2> transforms = {};
+
+ cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SVE
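
The blocking accessors above encode the "2VLx2VL" shape in the kernel's name: each trip through the assembly's outer loop produces a block of C two streaming vector lengths tall and two wide, held in the four FP32 ZA tiles. A small sketch of how a caller might turn these parameters into a block grid (editor's example; `svl_floats` stands in for `sme::get_vector_length<float>()` and the M/N values are arbitrary):

```cpp
#include <cstdio>

int main()
{
    const unsigned svl_floats = 16;             // e.g. a 512-bit streaming VL
    const unsigned out_height = svl_floats * 2; // rows of C per block
    const unsigned out_width  = svl_floats * 2; // columns of C per block
    const unsigned M = 200, N = 96;

    // Ceiling division gives the grid of blocks the driver must schedule;
    // each block is one iteration of the kernel's "M and N loop".
    std::printf("%u x %u blocks of %ux%u\n",
                (M + out_height - 1) / out_height,
                (N + out_width - 1) / out_width, out_height, out_width);
    return 0;
}
```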
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..5c48f953e8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL(const __fp16 *const A, const __fp16 *const B, __fp16 *const C, int ldc, const int M, const int N, const int K, const __fp16 *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const __fp16 *const A,
+ const __fp16 *const B,
+ __fp16 *const C, const int ldc,
+ const int M, const int N, const int K,
+ const __fp16 *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 2) * sizeof(__fp16)),
+ C(C), ldcb(ldc * sizeof(__fp16)),
+ M(M), N(N), K(K),
+ min(-static_cast<__fp16>(std::numeric_limits<float>::infinity())),
+ max(static_cast<__fp16>(std::numeric_limits<float>::infinity())),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<__fp16>(0);
+ break;
+ }
+ }
+
+ const __fp16 *const A;
+ const __fp16 *const B;
+ const long kstride_bytes;
+ __fp16 *const C;
+ const long ldcb;
+ const long M, N, K;
+ __fp16 min = -static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ __fp16 max = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+
+ const __fp16 *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
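
The constructor above folds the activation into a single [min, max] clamp that the store loops later apply with `fclamp`. Restated standalone (an editor's sketch; the enum and its values are illustrative, not the library's `Activation` type):

```cpp
#include <limits>

enum class ActKind { None, ReLU, BoundedReLU };

struct ClampBounds { float lo, hi; };

ClampBounds clamp_for(ActKind kind, float param1)
{
    // Start fully open, as the KernelArgs defaults do.
    ClampBounds c{-std::numeric_limits<float>::infinity(),
                  std::numeric_limits<float>::infinity()};
    switch (kind)
    {
        case ActKind::BoundedReLU:
            c.hi = param1;
            [[fallthrough]]; // mirrors the /* fall through */ above
        case ActKind::ReLU:
            c.lo = 0.0f;
            break;
        default:
            break;
    }
    return c;
}
```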
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xa041c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ "whilelt p0.h, x10, x9\n"
+ "fmov z10.h, #0.0\n"
+ "fmov z11.h, #1.0\n"
+ "ld1h { z18.h }, p0/Z, [x20, x10, LSL #1]\n"
+ "zip1 z2.h, z18.h, z10.h\n"
+ "zip2 z19.h, z18.h, z10.h\n"
+ ".inst 0x81a22560 // fmopa za0.s, p1/M, p1/M, z11.h, z2.h\n"
+ ".inst 0x81b32561 // fmopa za1.s, p1/M, p1/M, z11.h, z19.h\n"
+ ".inst 0x81a22562 // fmopa za2.s, p1/M, p1/M, z11.h, z2.h\n"
+ ".inst 0x81b32563 // fmopa za3.s, p1/M, p1/M, z11.h, z19.h\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20, ALL, MUL #2\n"
+ "incw x21, ALL, MUL #2\n"
+ "cmp x20, x9\n"
+ "mov x20, x16\n"
+ "csel x21, x11, x21, LT\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "add x20, x20, #0x1\n"
+ "lsr x20, x20, #0x1\n"
+ "lsr x21, x20, #0x2\n"
+ "and x20, x20, #0x3\n"
+ "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa0402374 // ld1h { z20.h-z21.h }, pn8.b/Z, [x27]\n"
+ ".inst 0xa14022ed // ldnt1h { z5.h, z13.h }, pn8.b/Z, [x23]\n"
+ ".inst 0xa041236a // ld1h { z10.h-z11.h }, pn8.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa14122ec // ldnt1h { z4.h, z12.h }, pn8.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa0422372 // ld1h { z18.h-z19.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04222fb // ldnt1h { z26.h-z27.h }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1432366 // ld1h { z6.h, z14.h }, pn8.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa04322f9 // ldnt1h { z24.h-z25.h }, pn8.b/Z, [x23, #0x6, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x81a52680 // fmopa za0.s, p1/M, p1/M, z20.h, z5.h\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x81ad2681 // fmopa za1.s, p1/M, p1/M, z20.h, z13.h\n"
+ ".inst 0x81a526a2 // fmopa za2.s, p1/M, p1/M, z21.h, z5.h\n"
+ ".inst 0x81ad26a3 // fmopa za3.s, p1/M, p1/M, z21.h, z13.h\n"
+ ".inst 0xa0402374 // ld1h { z20.h-z21.h }, pn8.b/Z, [x27]\n"
+ ".inst 0x81a42540 // fmopa za0.s, p1/M, p1/M, z10.h, z4.h\n"
+ ".inst 0xa14022e5 // ld1h { z5.h, z13.h }, pn8.b/Z, [x23]\n"
+ ".inst 0x81ac2541 // fmopa za1.s, p1/M, p1/M, z10.h, z12.h\n"
+ ".inst 0x81a42562 // fmopa za2.s, p1/M, p1/M, z11.h, z4.h\n"
+ ".inst 0x81ac2563 // fmopa za3.s, p1/M, p1/M, z11.h, z12.h\n"
+ ".inst 0xa041236a // ld1h { z10.h-z11.h }, pn8.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0x81ba2640 // fmopa za0.s, p1/M, p1/M, z18.h, z26.h\n"
+ ".inst 0xa14122e4 // ld1h { z4.h, z12.h }, pn8.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0x81bb2641 // fmopa za1.s, p1/M, p1/M, z18.h, z27.h\n"
+ ".inst 0x81ba2662 // fmopa za2.s, p1/M, p1/M, z19.h, z26.h\n"
+ ".inst 0x81bb2663 // fmopa za3.s, p1/M, p1/M, z19.h, z27.h\n"
+ ".inst 0xa0422372 // ld1h { z18.h-z19.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04222fa // ld1h { z26.h-z27.h }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x81b824c0 // fmopa za0.s, p1/M, p1/M, z6.h, z24.h\n"
+ ".inst 0x81b924c1 // fmopa za1.s, p1/M, p1/M, z6.h, z25.h\n"
+ ".inst 0x81b825c2 // fmopa za2.s, p1/M, p1/M, z14.h, z24.h\n"
+ ".inst 0x81b925c3 // fmopa za3.s, p1/M, p1/M, z14.h, z25.h\n"
+ ".inst 0xa1432366 // ld1h { z6.h, z14.h }, pn8.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa04322f8 // ld1h { z24.h-z25.h }, pn8.b/Z, [x23, #0x6, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x81a52680 // fmopa za0.s, p1/M, p1/M, z20.h, z5.h\n"
+ ".inst 0x81ad2681 // fmopa za1.s, p1/M, p1/M, z20.h, z13.h\n"
+ ".inst 0x81a526a2 // fmopa za2.s, p1/M, p1/M, z21.h, z5.h\n"
+ ".inst 0x81ad26a3 // fmopa za3.s, p1/M, p1/M, z21.h, z13.h\n"
+ ".inst 0x81a42540 // fmopa za0.s, p1/M, p1/M, z10.h, z4.h\n"
+ ".inst 0x81ac2541 // fmopa za1.s, p1/M, p1/M, z10.h, z12.h\n"
+ ".inst 0x81a42562 // fmopa za2.s, p1/M, p1/M, z11.h, z4.h\n"
+ ".inst 0x81ac2563 // fmopa za3.s, p1/M, p1/M, z11.h, z12.h\n"
+ ".inst 0x81ba2640 // fmopa za0.s, p1/M, p1/M, z18.h, z26.h\n"
+ ".inst 0x81bb2641 // fmopa za1.s, p1/M, p1/M, z18.h, z27.h\n"
+ ".inst 0x81ba2662 // fmopa za2.s, p1/M, p1/M, z19.h, z26.h\n"
+ ".inst 0x81bb2663 // fmopa za3.s, p1/M, p1/M, z19.h, z27.h\n"
+ ".inst 0x81b824c0 // fmopa za0.s, p1/M, p1/M, z6.h, z24.h\n"
+ ".inst 0x81b924c1 // fmopa za1.s, p1/M, p1/M, z6.h, z25.h\n"
+ ".inst 0x81b825c2 // fmopa za2.s, p1/M, p1/M, z14.h, z24.h\n"
+ ".inst 0x81b925c3 // fmopa za3.s, p1/M, p1/M, z14.h, z25.h\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa0402374 // ld1h { z20.h-z21.h }, pn8.b/Z, [x27]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0xa14022e5 // ld1h { z5.h, z13.h }, pn8.b/Z, [x23]\n"
+ "addvl x23, x23, #2\n"
+ ".inst 0x81a52680 // fmopa za0.s, p1/M, p1/M, z20.h, z5.h\n"
+ ".inst 0x81ad2681 // fmopa za1.s, p1/M, p1/M, z20.h, z13.h\n"
+ ".inst 0x81a526a2 // fmopa za2.s, p1/M, p1/M, z21.h, z5.h\n"
+ ".inst 0x81ad26a3 // fmopa za3.s, p1/M, p1/M, z21.h, z13.h\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa060c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa061c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ "cmp x12, x20\n"
+ ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 23f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa060c1dc // st1w { z28.s-z31.s }, pn8.b, [x14]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa061c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 23f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "sub x25, x13, x11\n"
+ "cntw x24\n"
+ "ld1rh { z20.h }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "whilelt p0.h, x10, x9\n"
+ "cmp x25, x24\n"
+ "ld1rh { z19.h }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "csel x22, x25, x24, LT\n"
+ "mov x12, #0x0\n"
+ "add x26, x26, x10, LSL #1\n" // C += n
+ "lsr x21, x22, #0x2\n"
+ "madd x26, x11, x23, x26\n" // C += m * ldc
+ "and x20, x22, #0x3\n"
+ "cbz x21, 16f\n"
+ "15:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ "fcvt z8.h, p1/m, z8.s\n"
+ "fcvt z9.h, p1/m, z9.s\n"
+ "fcvt z10.h, p1/m, z10.s\n"
+ "fcvt z11.h, p1/m, z11.s\n"
+ "add x12, x12, #0x4\n"
+ "fcvt z28.h, p1/m, z28.s\n"
+ "fcvt z29.h, p1/m, z29.s\n"
+ "cmp x12, x21, LSL #2\n"
+ "fcvt z30.h, p1/m, z30.s\n"
+ "fcvt z31.h, p1/m, z31.s\n"
+ ".inst 0xc173ca88 // fclamp { z8.h-z11.h }, z20.h, z19.h\n"
+ ".inst 0xc173ca9c // fclamp { z28.h-z31.h }, z20.h, z19.h\n"
+ "uzp1 z16.h, z8.h, z28.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "uzp1 z18.h, z9.h, z29.h\n"
+ "uzp1 z17.h, z10.h, z30.h\n"
+ "uzp1 z16.h, z11.h, z31.h\n"
+ "st1h { z18.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z17.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 17f\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ "fcvt z8.h, p1/m, z8.s\n"
+ "fcvt z9.h, p1/m, z9.s\n"
+ "fcvt z10.h, p1/m, z10.s\n"
+ "fcvt z11.h, p1/m, z11.s\n"
+ "subs x20, x20, #0x1\n"
+ "fcvt z12.h, p1/m, z12.s\n"
+ "fcvt z13.h, p1/m, z13.s\n"
+ "fcvt z14.h, p1/m, z14.s\n"
+ "fcvt z15.h, p1/m, z15.s\n"
+ ".inst 0xc173ca88 // fclamp { z8.h-z11.h }, z20.h, z19.h\n"
+ ".inst 0xc173ca8c // fclamp { z12.h-z15.h }, z20.h, z19.h\n"
+ "uzp1 z16.h, z8.h, z12.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 17f\n"
+ "subs x20, x20, #0x1\n"
+ "uzp1 z16.h, z9.h, z13.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 17f\n"
+ "uzp1 z16.h, z10.h, z14.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "17:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 21f\n"
+ "whilelt p0.h, x10, x9\n"
+ "cmp x25, x24\n"
+ "csel x20, x25, x24, LT\n"
+ "mov x12, #0x0\n"
+ "lsr x21, x20, #0x2\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 19f\n"
+ "18:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ "fcvt z0.h, p1/m, z0.s\n"
+ "fcvt z1.h, p1/m, z1.s\n"
+ "fcvt z2.h, p1/m, z2.s\n"
+ "fcvt z3.h, p1/m, z3.s\n"
+ "add x12, x12, #0x4\n"
+ "fcvt z28.h, p1/m, z28.s\n"
+ "fcvt z29.h, p1/m, z29.s\n"
+ "cmp x12, x21, LSL #2\n"
+ "fcvt z30.h, p1/m, z30.s\n"
+ "fcvt z31.h, p1/m, z31.s\n"
+ ".inst 0xc173ca80 // fclamp { z0.h-z3.h }, z20.h, z19.h\n"
+ ".inst 0xc173ca9c // fclamp { z28.h-z31.h }, z20.h, z19.h\n"
+ "uzp1 z16.h, z0.h, z28.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "uzp1 z18.h, z1.h, z29.h\n"
+ "uzp1 z17.h, z2.h, z30.h\n"
+ "uzp1 z16.h, z3.h, z31.h\n"
+ "st1h { z18.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z17.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 20f\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ "fcvt z28.h, p1/m, z28.s\n"
+ "fcvt z29.h, p1/m, z29.s\n"
+ "fcvt z30.h, p1/m, z30.s\n"
+ "fcvt z31.h, p1/m, z31.s\n"
+ "subs x20, x20, #0x1\n"
+ "fcvt z12.h, p1/m, z12.s\n"
+ "fcvt z13.h, p1/m, z13.s\n"
+ "fcvt z14.h, p1/m, z14.s\n"
+ "fcvt z15.h, p1/m, z15.s\n"
+ ".inst 0xc173ca9c // fclamp { z28.h-z31.h }, z20.h, z19.h\n"
+ ".inst 0xc173ca8c // fclamp { z12.h-z15.h }, z20.h, z19.h\n"
+ "uzp1 z16.h, z28.h, z12.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 20f\n"
+ "subs x20, x20, #0x1\n"
+ "uzp1 z16.h, z29.h, z13.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 20f\n"
+ "uzp1 z16.h, z30.h, z14.h\n"
+ "st1h { z16.h }, p0, [x26]\n"
+ "20:" // Store to output array: Accumulator row 1 oddments: End
+ "21:" // Store to output array: End
+ "tbz x16, #0, 23f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "22:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xa041c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xa042c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "blt 22b\n"
+ "23:" // End block
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #2\n"
+ "mov x10, #0x0\n"
+ "cmp x11, x13\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
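
The "Store to output array" sections of the kernel above narrow each FP32 accumulator row to FP16 (`fcvt`), clamp it (`fclamp`), and use `uzp1` to pack the two tiles covering adjacent column halves into one contiguous FP16 row. An approximate scalar model (editor's sketch; the real code clamps after the FP16 conversion, so clamping in FP32 first, as here, can differ by one rounding step at the bounds):

```cpp
#include <algorithm>
#include <cstddef>

using fp16 = _Float16;

// tile_lo/tile_hi model one row each of the left and right FP32 ZA tiles
// (za0/za1 or za2/za3); vl is the number of FP32 lanes per vector.
void store_row(fp16 *c_row, const float *tile_lo, const float *tile_hi,
               std::size_t vl, float lo, float hi)
{
    for (std::size_t i = 0; i < vl; ++i)
    {
        c_row[i]      = static_cast<fp16>(std::clamp(tile_lo[i], lo, hi));
        c_row[vl + i] = static_cast<fp16>(std::clamp(tile_hi[i], lo, hi));
    }
}
```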
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..05029f04b0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, const __fp16 *const B, __fp16 *const C, int ldc, const int M, const int N, const int K, const __fp16 *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL
+{
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)(const __fp16 *const A, const __fp16 *const B, __fp16 *const C, int ldc, const int M, const int N, const int K, const __fp16 *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 1;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL;
+
+ StdTransformsSME<operand_type, result_type, 4, 1, 2> transforms = {};
+
+ cls_sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SVE
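
This class describes the tall, narrow counterpart of the 2VLx2VL kernel above: the same four FP32 ZA tiles, arranged four vectors high by one wide rather than two by two. A sketch of the inferred tile-to-block mapping, read off the `fmopa` operand patterns in the two generic.cpp files (editor's illustration):

```cpp
#include <cstdio>

struct TilePos { unsigned row_block, col_block; };

// za0..za3 stack vertically in the 4VLx1VL kernel, but form a 2x2 grid in the
// 2VLx2VL kernel (za0=(0,0), za1=(0,1), za2=(1,0), za3=(1,1)).
TilePos tile_pos(unsigned tile, bool four_by_one)
{
    return four_by_one ? TilePos{tile, 0u} : TilePos{tile / 2, tile % 2};
}

int main()
{
    for (unsigned t = 0; t < 4; ++t)
    {
        const TilePos p = tile_pos(t, /*four_by_one=*/true);
        std::printf("za%u -> row block %u, col block %u\n", t, p.row_block, p.col_block);
    }
    return 0;
}
```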
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..8728cff31d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,506 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL(const __fp16 *const A, const __fp16 *const B, __fp16 *const C, int ldc, const int M, const int N, const int K, const __fp16 *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const __fp16 *const A,
+ const __fp16 *const B,
+ __fp16 *const C, const int ldc,
+ const int M, const int N, const int K,
+ const __fp16 *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 2) * sizeof(__fp16)),
+ C(C), ldcb(ldc * sizeof(__fp16)),
+ M(M), N(N), K(K),
+ min(-static_cast<__fp16>(std::numeric_limits<float>::infinity())),
+ max(static_cast<__fp16>(std::numeric_limits<float>::infinity())),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<__fp16>(0);
+ break;
+ }
+ }
+
+ const __fp16 *const A;
+ const __fp16 *const B;
+ const long kstride_bytes;
+ __fp16 *const C;
+ const long ldcb;
+ const long M, N, K;
+ __fp16 min = -static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ __fp16 max = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+
+ const __fp16 *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xa041c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xa042c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ "whilelt p8.s, x10, x9\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ "whilelt p0.h, x10, x9\n"
+ "fmov z5.h, #0.0\n"
+ "fmov z18.h, #1.0\n"
+ "ld1h { z31.h }, p0/Z, [x20, x10, LSL #1]\n"
+ "zip1 z15.h, z31.h, z5.h\n"
+ ".inst 0x81af2640 // fmopa za0.s, p1/M, p1/M, z18.h, z15.h\n"
+ ".inst 0x81af2641 // fmopa za1.s, p1/M, p1/M, z18.h, z15.h\n"
+ ".inst 0x81af2642 // fmopa za2.s, p1/M, p1/M, z18.h, z15.h\n"
+ ".inst 0x81af2643 // fmopa za3.s, p1/M, p1/M, z18.h, z15.h\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20\n"
+ "incw x21, ALL, MUL #4\n"
+ "cmp x20, x9\n"
+ "mov x20, x16\n"
+ "csel x21, x11, x21, LT\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "add x20, x20, #0x1\n"
+ "lsr x20, x20, #0x1\n"
+ "lsr x21, x20, #0x2\n"
+ "and x20, x20, #0x3\n"
+ "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa140a773 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x27]\n"
+ "ldnt1h { z17.h }, p1/Z, [x23]\n"
+ ".inst 0xa041a76c // ld1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1h { z26.h }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa042a760 // ld1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1h { z30.h }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa143a770 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ldnt1h { z18.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x81b12660 // fmopa za0.s, p1/M, p1/M, z19.h, z17.h\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x81b126e1 // fmopa za1.s, p1/M, p1/M, z23.h, z17.h\n"
+ ".inst 0x81b12762 // fmopa za2.s, p1/M, p1/M, z27.h, z17.h\n"
+ ".inst 0x81b127e3 // fmopa za3.s, p1/M, p1/M, z31.h, z17.h\n"
+ ".inst 0xa140a773 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x27]\n"
+ ".inst 0x81ba2580 // fmopa za0.s, p1/M, p1/M, z12.h, z26.h\n"
+ "ld1h { z17.h }, p1/Z, [x23]\n"
+ ".inst 0x81ba25a1 // fmopa za1.s, p1/M, p1/M, z13.h, z26.h\n"
+ ".inst 0x81ba25c2 // fmopa za2.s, p1/M, p1/M, z14.h, z26.h\n"
+ ".inst 0x81ba25e3 // fmopa za3.s, p1/M, p1/M, z15.h, z26.h\n"
+ ".inst 0xa041a76c // ld1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0x81be2400 // fmopa za0.s, p1/M, p1/M, z0.h, z30.h\n"
+ "ld1h { z26.h }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0x81be2421 // fmopa za1.s, p1/M, p1/M, z1.h, z30.h\n"
+ ".inst 0x81be2442 // fmopa za2.s, p1/M, p1/M, z2.h, z30.h\n"
+ ".inst 0x81be2463 // fmopa za3.s, p1/M, p1/M, z3.h, z30.h\n"
+ ".inst 0xa042a760 // ld1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
+ "ld1h { z30.h }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x81b22600 // fmopa za0.s, p1/M, p1/M, z16.h, z18.h\n"
+ ".inst 0x81b22681 // fmopa za1.s, p1/M, p1/M, z20.h, z18.h\n"
+ ".inst 0x81b22702 // fmopa za2.s, p1/M, p1/M, z24.h, z18.h\n"
+ ".inst 0x81b22783 // fmopa za3.s, p1/M, p1/M, z28.h, z18.h\n"
+ ".inst 0xa143a770 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ld1h { z18.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x81b12660 // fmopa za0.s, p1/M, p1/M, z19.h, z17.h\n"
+ ".inst 0x81b126e1 // fmopa za1.s, p1/M, p1/M, z23.h, z17.h\n"
+ ".inst 0x81b12762 // fmopa za2.s, p1/M, p1/M, z27.h, z17.h\n"
+ ".inst 0x81b127e3 // fmopa za3.s, p1/M, p1/M, z31.h, z17.h\n"
+ ".inst 0x81ba2580 // fmopa za0.s, p1/M, p1/M, z12.h, z26.h\n"
+ ".inst 0x81ba25a1 // fmopa za1.s, p1/M, p1/M, z13.h, z26.h\n"
+ ".inst 0x81ba25c2 // fmopa za2.s, p1/M, p1/M, z14.h, z26.h\n"
+ ".inst 0x81ba25e3 // fmopa za3.s, p1/M, p1/M, z15.h, z26.h\n"
+ ".inst 0x81be2400 // fmopa za0.s, p1/M, p1/M, z0.h, z30.h\n"
+ ".inst 0x81be2421 // fmopa za1.s, p1/M, p1/M, z1.h, z30.h\n"
+ ".inst 0x81be2442 // fmopa za2.s, p1/M, p1/M, z2.h, z30.h\n"
+ ".inst 0x81be2463 // fmopa za3.s, p1/M, p1/M, z3.h, z30.h\n"
+ ".inst 0x81b22600 // fmopa za0.s, p1/M, p1/M, z16.h, z18.h\n"
+ ".inst 0x81b22681 // fmopa za1.s, p1/M, p1/M, z20.h, z18.h\n"
+ ".inst 0x81b22702 // fmopa za2.s, p1/M, p1/M, z24.h, z18.h\n"
+ ".inst 0x81b22783 // fmopa za3.s, p1/M, p1/M, z28.h, z18.h\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa140a773 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x27]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x27, x27, #4\n"
+ "ld1h { z17.h }, p1/Z, [x23]\n"
+ "addvl x23, x23, #1\n"
+ ".inst 0x81b12660 // fmopa za0.s, p1/M, p1/M, z19.h, z17.h\n"
+ ".inst 0x81b126e1 // fmopa za1.s, p1/M, p1/M, z23.h, z17.h\n"
+ ".inst 0x81b12762 // fmopa za2.s, p1/M, p1/M, z27.h, z17.h\n"
+ ".inst 0x81b127e3 // fmopa za3.s, p1/M, p1/M, z31.h, z17.h\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa060c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa061c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ "cmp x12, x20\n"
+ ".inst 0xa063c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 29f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa060c5dc // st1w { z28.s-z31.s }, pn9.b, [x14]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa061c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 29f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "sub x25, x13, x11\n"
+ "cntw x24\n"
+ "ld1rh { z29.h }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "whilelt p0.s, x10, x9\n"
+ "cmp x25, x24\n"
+ "ld1rh { z28.h }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "csel x22, x25, x24, LT\n"
+ "mov x12, #0x0\n"
+ "add x26, x26, x10, LSL #1\n" // C += n
+ "lsr x21, x22, #0x2\n"
+ "madd x26, x11, x23, x26\n" // C += m * ldc
+ "and x20, x22, #0x3\n"
+ "cbz x21, 16f\n"
+ "15:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ "add x12, x12, #0x4\n"
+ "fcvt z0.h, p1/m, z0.s\n"
+ "fcvt z1.h, p1/m, z1.s\n"
+ "fcvt z2.h, p1/m, z2.s\n"
+ "fcvt z3.h, p1/m, z3.s\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc17ccba0 // fclamp { z0.h-z3.h }, z29.h, z28.h\n"
+ "st1h { z0.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z1.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z2.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z3.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 17f\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ "fcvt z16.h, p1/m, z16.s\n"
+ "fcvt z17.h, p1/m, z17.s\n"
+ "fcvt z18.h, p1/m, z18.s\n"
+ "fcvt z19.h, p1/m, z19.s\n"
+ ".inst 0xc17ccbb0 // fclamp { z16.h-z19.h }, z29.h, z28.h\n"
+ "st1h { z16.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 17f\n"
+ "subs x20, x20, #0x1\n"
+ "st1h { z17.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 17f\n"
+ "st1h { z18.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "17:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "whilelt p0.s, x10, x9\n"
+ "cmp x25, x24\n"
+ "csel x22, x25, x24, LT\n"
+ "mov x12, #0x0\n"
+ "lsr x21, x22, #0x2\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 19f\n"
+ "18:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ "add x12, x12, #0x4\n"
+ "fcvt z24.h, p1/m, z24.s\n"
+ "fcvt z25.h, p1/m, z25.s\n"
+ "fcvt z26.h, p1/m, z26.s\n"
+ "fcvt z27.h, p1/m, z27.s\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc17ccbb8 // fclamp { z24.h-z27.h }, z29.h, z28.h\n"
+ "st1h { z24.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z25.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z26.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z27.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 20f\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ "fcvt z0.h, p1/m, z0.s\n"
+ "fcvt z1.h, p1/m, z1.s\n"
+ "fcvt z2.h, p1/m, z2.s\n"
+ "fcvt z3.h, p1/m, z3.s\n"
+ ".inst 0xc17ccba0 // fclamp { z0.h-z3.h }, z29.h, z28.h\n"
+ "st1h { z0.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 20f\n"
+ "subs x20, x20, #0x1\n"
+ "st1h { z1.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 20f\n"
+ "st1h { z2.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "20:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "whilelt p0.s, x10, x9\n"
+ "cmp x25, x24\n"
+ "csel x22, x25, x24, LT\n"
+ "mov x12, #0x0\n"
+ "lsr x21, x22, #0x2\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 22f\n"
+ "21:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ "add x12, x12, #0x4\n"
+ "fcvt z20.h, p1/m, z20.s\n"
+ "fcvt z21.h, p1/m, z21.s\n"
+ "fcvt z22.h, p1/m, z22.s\n"
+ "fcvt z23.h, p1/m, z23.s\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc17ccbb4 // fclamp { z20.h-z23.h }, z29.h, z28.h\n"
+ "st1h { z20.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z21.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z22.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z23.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "blt 21b\n"
+ "22:" // Store to output array: Accumulator row 2 oddments
+ "cbz x20, 23f\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ "fcvt z12.h, p1/m, z12.s\n"
+ "fcvt z13.h, p1/m, z13.s\n"
+ "fcvt z14.h, p1/m, z14.s\n"
+ "fcvt z15.h, p1/m, z15.s\n"
+ ".inst 0xc17ccbac // fclamp { z12.h-z15.h }, z29.h, z28.h\n"
+ "st1h { z12.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 23f\n"
+ "subs x20, x20, #0x1\n"
+ "st1h { z13.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 23f\n"
+ "st1h { z14.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "23:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "whilelt p0.s, x10, x9\n"
+ "cmp x25, x24\n"
+ "csel x20, x25, x24, LT\n"
+ "mov x12, #0x0\n"
+ "lsr x21, x20, #0x2\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 25f\n"
+ "24:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ "add x12, x12, #0x4\n"
+ "fcvt z4.h, p1/m, z4.s\n"
+ "fcvt z5.h, p1/m, z5.s\n"
+ "fcvt z6.h, p1/m, z6.s\n"
+ "fcvt z7.h, p1/m, z7.s\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc17ccba4 // fclamp { z4.h-z7.h }, z29.h, z28.h\n"
+ "st1h { z4.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z5.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z6.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1h { z7.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "blt 24b\n"
+ "25:" // Store to output array: Accumulator row 3 oddments
+ "cbz x20, 26f\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ "fcvt z4.h, p1/m, z4.s\n"
+ "fcvt z5.h, p1/m, z5.s\n"
+ "fcvt z6.h, p1/m, z6.s\n"
+ "fcvt z7.h, p1/m, z7.s\n"
+ ".inst 0xc17ccba4 // fclamp { z4.h-z7.h }, z29.h, z28.h\n"
+ "st1h { z4.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 26f\n"
+ "subs x20, x20, #0x1\n"
+ "st1h { z5.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 26f\n"
+ "st1h { z6.s }, p0, [x26]\n"
+ "26:" // Store to output array: Accumulator row 3 oddments: End
+ "27:" // Store to output array: End
+ "tbz x16, #0, 29f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "28:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xa042c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "blt 28b\n"
+ "29:" // End block
+ "incw x10\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #4\n"
+ "mov x10, #0x0\n"
+ "cmp x11, x13\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
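
All of these kernels share the flag protocol set up in KernelArgs: `accumulate` sets bit 0 (`FILL_ACCUMULATORS_FROM_BUFFER`, reload ZA from `accumulator_buffer` on entry) and a null `C` sets bit 1 (`STORE_ACCUMULATORS_TO_BUFFER`, spill ZA back out instead of writing output). A hedged sketch of how a driver splitting K into panels could use this; the stub kernel and the elided per-panel pointer arithmetic are hypothetical, not the library's dispatch code:

```cpp
#include <cstdint>

using fp16 = _Float16;

// Stand-in with the same flag behaviour as the real kernels' KernelArgs.
void stub_kernel(const fp16 *, const fp16 *, fp16 *C, int, int, int, int,
                 const fp16 *, bool accumulate, float *)
{
    std::uint64_t flags = 0;
    if (accumulate)   flags |= 1u << 0; // FILL_ACCUMULATORS_FROM_BUFFER
    if (C == nullptr) flags |= 1u << 1; // STORE_ACCUMULATORS_TO_BUFFER
    (void)flags;
}

void run_k_panels(const fp16 *A, const fp16 *B, fp16 *C, int ldc, int M, int N,
                  int K, int panel, const fp16 *bias, float *acc_buf)
{
    for (int k0 = 0; k0 < K; k0 += panel)
    {
        const bool first = (k0 == 0);
        const bool last  = (k0 + panel >= K);
        // Middle panels neither apply bias nor write C: they round-trip the
        // FP32 accumulators through acc_buf. (Advancing A and B to each
        // panel's packed data is elided here.)
        stub_kernel(A, B, last ? C : nullptr, ldc, M, N,
                    last ? (K - k0) : panel, bias, /*accumulate=*/!first, acc_buf);
    }
}
```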
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..bf3de2118e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 1;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_fp32_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 1> transforms = {};
+
+ cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
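
Note the contrast with the FP16 kernels above: this FP32 kernel declares a `k_unroll()` of 1 and its KernelArgs (below) computes `kstride_bytes` as plain `K * sizeof(float)`, whereas the widening FP16 kernels consume operands in pairs and therefore pad packed panels with `roundup(K, 2)`. Side by side (editor's restatement of the two expressions):

```cpp
#include <cstddef>

std::size_t roundup(std::size_t v, std::size_t m) { return (v + m - 1) / m * m; }

// FP16 widening kernels (k_unroll == 2): panels padded to an even K.
std::size_t kstride_fp16(std::size_t K) { return roundup(K, 2) * sizeof(_Float16); }

// FP32 kernel (k_unroll == 1): one element per fmopa step, no padding.
std::size_t kstride_fp32(std::size_t K) { return K * sizeof(float); }
```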
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..97be758bd6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const float *const A,
+ const float *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(K * sizeof(float)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ n_loops((K - 1) / 2), n_tail_iters((K - 1) % 2),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const float *const A;
+ const float *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ ".inst 0x25bc6530 // whilelt pn8.s, x9, x28, VLx4\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ "fmov z6.s, #1.0\n"
+ ".inst 0xa009c29d // ldnt1w { z28.s-z31.s }, p8/Z, [x20, x9, LSL #2]\n"
+ ".inst 0x809c00c0 // fmopa za0.s, p0/M, p0/M, z6.s, z28.s\n"
+ ".inst 0x809d00c1 // fmopa za1.s, p0/M, p0/M, z6.s, z29.s\n"
+ ".inst 0x809e00c2 // fmopa za2.s, p0/M, p0/M, z6.s, z30.s\n"
+ ".inst 0x809f00c3 // fmopa za3.s, p0/M, p0/M, z6.s, z31.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x9\n"
+ "mov x21, x10\n"
+ "incw x20, ALL, MUL #4\n"
+ "incw x21\n"
+ "cmp x20, x28\n"
+ "csel x21, x10, x21, LT\n"
+ "mov x20, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x21, x11\n"
+ "csel x15, x20, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "lsr x23, x20, #0x2\n"
+ "and x22, x20, #0x3\n"
+ "ldr x21, [%x[args], %[offsetof_B]]\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x21, x9, x20, x21\n" // bptr = B + n * kstride_bytes
+ "cbz x23, 8f\n"
+ "subs x23, x23, #0x1\n"
+ "ld1w { z28.s }, p0/Z, [x26]\n"
+ ".inst 0xa040c6a9 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x21]\n"
+ "ld1w { z22.s }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0xa041c6ad // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+ "ld1w { z30.s }, p0/Z, [x26, #2, MUL VL]\n"
+ ".inst 0xa042c6a5 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n"
+ "addvl x26, x26, #4\n"
+ ".inst 0xa143c6bb // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n"
+ "addvl x21, x21, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x80880380 // fmopa za0.s, p0/M, p0/M, z28.s, z8.s\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x80890381 // fmopa za1.s, p0/M, p0/M, z28.s, z9.s\n"
+ ".inst 0x808a0382 // fmopa za2.s, p0/M, p0/M, z28.s, z10.s\n"
+ ".inst 0x808b0383 // fmopa za3.s, p0/M, p0/M, z28.s, z11.s\n"
+ "ld1w { z28.s }, p0/Z, [x26]\n"
+ ".inst 0x808c02c0 // fmopa za0.s, p0/M, p0/M, z22.s, z12.s\n"
+ ".inst 0xa040c6a9 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x21]\n"
+ ".inst 0x808d02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z13.s\n"
+ ".inst 0x808e02c2 // fmopa za2.s, p0/M, p0/M, z22.s, z14.s\n"
+ ".inst 0x808f02c3 // fmopa za3.s, p0/M, p0/M, z22.s, z15.s\n"
+ "ld1w { z22.s }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x808403c0 // fmopa za0.s, p0/M, p0/M, z30.s, z4.s\n"
+ ".inst 0xa041c6ad // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0x808503c1 // fmopa za1.s, p0/M, p0/M, z30.s, z5.s\n"
+ ".inst 0x808603c2 // fmopa za2.s, p0/M, p0/M, z30.s, z6.s\n"
+ ".inst 0x808703c3 // fmopa za3.s, p0/M, p0/M, z30.s, z7.s\n"
+ "ld1w { z30.s }, p0/Z, [x26, #2, MUL VL]\n"
+ ".inst 0xa042c6a5 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n"
+ ".inst 0x80930280 // fmopa za0.s, p0/M, p0/M, z20.s, z19.s\n"
+ ".inst 0x80970281 // fmopa za1.s, p0/M, p0/M, z20.s, z23.s\n"
+ ".inst 0x809b0282 // fmopa za2.s, p0/M, p0/M, z20.s, z27.s\n"
+ ".inst 0x809f0283 // fmopa za3.s, p0/M, p0/M, z20.s, z31.s\n"
+ "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n"
+ "addvl x26, x26, #4\n"
+ ".inst 0xa143c6bb // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n"
+ "addvl x21, x21, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x80880380 // fmopa za0.s, p0/M, p0/M, z28.s, z8.s\n"
+ ".inst 0x80890381 // fmopa za1.s, p0/M, p0/M, z28.s, z9.s\n"
+ ".inst 0x808a0382 // fmopa za2.s, p0/M, p0/M, z28.s, z10.s\n"
+ ".inst 0x808b0383 // fmopa za3.s, p0/M, p0/M, z28.s, z11.s\n"
+ ".inst 0x808c02c0 // fmopa za0.s, p0/M, p0/M, z22.s, z12.s\n"
+ ".inst 0x808d02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z13.s\n"
+ ".inst 0x808e02c2 // fmopa za2.s, p0/M, p0/M, z22.s, z14.s\n"
+ ".inst 0x808f02c3 // fmopa za3.s, p0/M, p0/M, z22.s, z15.s\n"
+ ".inst 0x808403c0 // fmopa za0.s, p0/M, p0/M, z30.s, z4.s\n"
+ ".inst 0x808503c1 // fmopa za1.s, p0/M, p0/M, z30.s, z5.s\n"
+ ".inst 0x808603c2 // fmopa za2.s, p0/M, p0/M, z30.s, z6.s\n"
+ ".inst 0x808703c3 // fmopa za3.s, p0/M, p0/M, z30.s, z7.s\n"
+ ".inst 0x80930280 // fmopa za0.s, p0/M, p0/M, z20.s, z19.s\n"
+ ".inst 0x80970281 // fmopa za1.s, p0/M, p0/M, z20.s, z23.s\n"
+ ".inst 0x809b0282 // fmopa za2.s, p0/M, p0/M, z20.s, z27.s\n"
+ ".inst 0x809f0283 // fmopa za3.s, p0/M, p0/M, z20.s, z31.s\n"
+ "8:" // K oddments
+ "cbz x22, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1w { z8.s }, p0/Z, [x26]\n"
+ "subs x22, x22, #0x1\n"
+ "addvl x26, x26, #1\n"
+ ".inst 0xa140c6a3 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x21]\n"
+ "addvl x21, x21, #4\n"
+ ".inst 0x80830100 // fmopa za0.s, p0/M, p0/M, z8.s, z3.s\n"
+ ".inst 0x80870101 // fmopa za1.s, p0/M, p0/M, z8.s, z7.s\n"
+ ".inst 0x808b0102 // fmopa za2.s, p0/M, p0/M, z8.s, z11.s\n"
+ ".inst 0x808f0103 // fmopa za3.s, p0/M, p0/M, z8.s, z15.s\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 24f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 24f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
+ "sub x24, x11, x10\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "tbz x15, #2, 18f\n"
+ "cntw x20\n"
+ "cmp x24, x20\n"
+ "csel x22, x24, x20, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa160c323 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x20, 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x24, x24, x22\n"
+ "beq 18f\n"
+ "b 22f\n"
+ "18:" // Store to output array: Skip activation: End
+ "cntw x20\n"
+ "cmp x24, x20\n"
+ "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x20, x24, x20, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 20f\n"
+ "19:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa160c333 // st1w { z19.s, z23.s, z27.s, z31.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 21f\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
+ "21:" // Store to output array: Accumulator row 0 oddments: End
+ "22:" // Store to output array: End
+ "tbz x15, #0, 24f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "23:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x14, x14, #16\n"
+ "blt 23b\n"
+ "24:" // End block
+ "incw x9, ALL, MUL #4\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
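
Two things happen in the prologue of the assembly above before the K loop starts: bit 0 of the flags word selects whether ZA is refilled from the accumulator buffer; otherwise ZA is zeroed and, when a bias pointer is present, the bias row is broadcast into every accumulator row by an outer product with a vector of ones (fmov z6.s, #1.0 followed by four fmopa instructions). The flag bits themselves are only named in comments in the KernelArgs constructor; a standalone sketch of the same encoding, with a hypothetical enum supplying those names:

    #include <cstdint>

    // Hypothetical names for the three flag bits tested by tbz/tbnz in the kernel.
    enum : uint64_t
    {
        FILL_ACCUMULATORS_FROM_BUFFER = 1u << 0, // accumulate: reload ZA before the K loop
        STORE_ACCUMULATORS_TO_BUFFER  = 1u << 1, // C == nullptr: spill ZA instead of storing C
        SKIP_ACTIVATION               = 1u << 2, // act.type == None: bypass the fclamp path
    };

    uint64_t make_flags(bool accumulate, bool has_output, bool has_activation)
    {
        uint64_t flags = 0;
        if (accumulate)      flags |= FILL_ACCUMULATORS_FROM_BUFFER;
        if (!has_output)     flags |= STORE_ACCUMULATORS_TO_BUFFER;
        if (!has_activation) flags |= SKIP_ACTIVATION;
        return flags;
    }
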
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..9bc1f83100
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 2;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 2;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_fp32_mopa_2VLx2VL;
+
+ StdTransformsSME<operand_type, result_type, 2, 2, 1> transforms = {};
+
+ cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
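
As in the 1VLx4VL variant, the kernel consumes K four elements at a time: lsr x23, x20, #0x2 and and x22, x20, #0x3 split K into a software-pipelined main loop (label 6), a final unrolled pass over already-loaded registers (label 7), and up to three single-step oddments (label 9). A scalar sketch of that accumulation order, with plain arrays standing in for the interleaved A/B panels and the ZA tiles (illustrative only; the real data layout is the interleaved format produced by StdTransformsSME):

    #include <cstddef>

    // Accumulate an H x W tile of C from k-major panels a (H values per k step)
    // and b (W values per k step), walking K in blocks of four plus K % 4 oddments.
    void tile_accumulate(const float *a, const float *b, float *acc,
                         size_t H, size_t W, size_t K)
    {
        const size_t k_main = K / 4; // lsr x23, x20, #0x2
        const size_t k_tail = K % 4; // and x22, x20, #0x3
        size_t k = 0;
        for (size_t blk = 0; blk < k_main; ++blk)  // "K loop" + final pass at label 7
            for (size_t u = 0; u < 4; ++u, ++k)    // four fmopa rounds per pass
                for (size_t i = 0; i < H; ++i)
                    for (size_t j = 0; j < W; ++j)
                        acc[i * W + j] += a[k * H + i] * b[k * W + j];
        for (size_t t = 0; t < k_tail; ++t, ++k)   // "K oddments"
            for (size_t i = 0; i < H; ++i)
                for (size_t j = 0; j < W; ++j)
                    acc[i * W + j] += a[k * H + i] * b[k * W + j];
    }
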
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..3c475044e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,482 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const float *const A,
+ const float *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(K * sizeof(float)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ n_loops((K - 1) / 2), n_tail_iters((K - 1) % 2), // unused here; the assembly re-derives K / 4 and K % 4 itself
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const float *const A;
+ const float *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa042c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ ".inst 0x25a94550 // whilelt pn8.s, x10, x9, VLx2\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ "fmov z12.s, #1.0\n"
+ ".inst 0xa10a4289 // ldnt1w { z1.s, z9.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0x80810180 // fmopa za0.s, p0/M, p0/M, z12.s, z1.s\n"
+ ".inst 0x80890181 // fmopa za1.s, p0/M, p0/M, z12.s, z9.s\n"
+ ".inst 0x80810182 // fmopa za2.s, p0/M, p0/M, z12.s, z1.s\n"
+ ".inst 0x80890183 // fmopa za3.s, p0/M, p0/M, z12.s, z9.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20, ALL, MUL #2\n"
+ "incw x21, ALL, MUL #2\n"
+ "cmp x20, x9\n"
+ "csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "lsr x23, x20, #0x2\n"
+ "and x22, x20, #0x3\n"
+ "ldr x21, [%x[args], %[offsetof_B]]\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x21, x10, x20, x21\n" // bptr = B + n * kstride_bytes
+ "cbz x23, 8f\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xa0404772 // ld1w { z18.s-z19.s }, pn9.b/Z, [x27]\n"
+ ".inst 0xa04046a3 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21]\n"
+ ".inst 0xa0414764 // ld1w { z4.s-z5.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa04146bb // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa042476a // ld1w { z10.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04246b5 // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0434766 // ld1w { z6.s-z7.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa04346a9 // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n"
+ "addvl x21, x21, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x80820240 // fmopa za0.s, p0/M, p0/M, z18.s, z2.s\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x80830241 // fmopa za1.s, p0/M, p0/M, z18.s, z3.s\n"
+ ".inst 0x80820262 // fmopa za2.s, p0/M, p0/M, z19.s, z2.s\n"
+ ".inst 0x80830263 // fmopa za3.s, p0/M, p0/M, z19.s, z3.s\n"
+ ".inst 0xa0404772 // ld1w { z18.s-z19.s }, pn9.b/Z, [x27]\n"
+ ".inst 0x809a0080 // fmopa za0.s, p0/M, p0/M, z4.s, z26.s\n"
+ ".inst 0xa04046a3 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21]\n"
+ ".inst 0x809b0081 // fmopa za1.s, p0/M, p0/M, z4.s, z27.s\n"
+ ".inst 0x809a00a2 // fmopa za2.s, p0/M, p0/M, z5.s, z26.s\n"
+ ".inst 0x809b00a3 // fmopa za3.s, p0/M, p0/M, z5.s, z27.s\n"
+ ".inst 0xa0414764 // ld1w { z4.s-z5.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0x80940140 // fmopa za0.s, p0/M, p0/M, z10.s, z20.s\n"
+ ".inst 0xa04146bb // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0x80950141 // fmopa za1.s, p0/M, p0/M, z10.s, z21.s\n"
+ ".inst 0x80940162 // fmopa za2.s, p0/M, p0/M, z11.s, z20.s\n"
+ ".inst 0x80950163 // fmopa za3.s, p0/M, p0/M, z11.s, z21.s\n"
+ ".inst 0xa042476a // ld1w { z10.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa04246b5 // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0x808800c0 // fmopa za0.s, p0/M, p0/M, z6.s, z8.s\n"
+ ".inst 0x808900c1 // fmopa za1.s, p0/M, p0/M, z6.s, z9.s\n"
+ ".inst 0x808800e2 // fmopa za2.s, p0/M, p0/M, z7.s, z8.s\n"
+ ".inst 0x808900e3 // fmopa za3.s, p0/M, p0/M, z7.s, z9.s\n"
+ ".inst 0xa0434766 // ld1w { z6.s-z7.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa04346a9 // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n"
+ "addvl x21, x21, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x80820240 // fmopa za0.s, p0/M, p0/M, z18.s, z2.s\n"
+ ".inst 0x80830241 // fmopa za1.s, p0/M, p0/M, z18.s, z3.s\n"
+ ".inst 0x80820262 // fmopa za2.s, p0/M, p0/M, z19.s, z2.s\n"
+ ".inst 0x80830263 // fmopa za3.s, p0/M, p0/M, z19.s, z3.s\n"
+ ".inst 0x809a0080 // fmopa za0.s, p0/M, p0/M, z4.s, z26.s\n"
+ ".inst 0x809b0081 // fmopa za1.s, p0/M, p0/M, z4.s, z27.s\n"
+ ".inst 0x809a00a2 // fmopa za2.s, p0/M, p0/M, z5.s, z26.s\n"
+ ".inst 0x809b00a3 // fmopa za3.s, p0/M, p0/M, z5.s, z27.s\n"
+ ".inst 0x80940140 // fmopa za0.s, p0/M, p0/M, z10.s, z20.s\n"
+ ".inst 0x80950141 // fmopa za1.s, p0/M, p0/M, z10.s, z21.s\n"
+ ".inst 0x80940162 // fmopa za2.s, p0/M, p0/M, z11.s, z20.s\n"
+ ".inst 0x80950163 // fmopa za3.s, p0/M, p0/M, z11.s, z21.s\n"
+ ".inst 0x808800c0 // fmopa za0.s, p0/M, p0/M, z6.s, z8.s\n"
+ ".inst 0x808900c1 // fmopa za1.s, p0/M, p0/M, z6.s, z9.s\n"
+ ".inst 0x808800e2 // fmopa za2.s, p0/M, p0/M, z7.s, z8.s\n"
+ ".inst 0x808900e3 // fmopa za3.s, p0/M, p0/M, z7.s, z9.s\n"
+ "8:" // K oddments
+ "cbz x22, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa040477e // ld1w { z30.s-z31.s }, pn9.b/Z, [x27]\n"
+ "subs x22, x22, #0x1\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0xa14046a5 // ld1w { z5.s, z13.s }, pn9.b/Z, [x21]\n"
+ "addvl x21, x21, #2\n"
+ ".inst 0x808503c0 // fmopa za0.s, p0/M, p0/M, z30.s, z5.s\n"
+ ".inst 0x808d03c1 // fmopa za1.s, p0/M, p0/M, z30.s, z13.s\n"
+ ".inst 0x808503e2 // fmopa za2.s, p0/M, p0/M, z31.s, z5.s\n"
+ ".inst 0x808d03e3 // fmopa za3.s, p0/M, p0/M, z31.s, z13.s\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 30f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 30f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10, LSL #2\n" // C += n
+ "sub x25, x13, x11\n"
+ "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x26, x11, x24, x26\n" // C += m * ldc
+ "tbz x16, #2, 21f\n"
+ "cntw x23\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x20, 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 17f\n"
+ ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 21f\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 19f\n"
+ "18:" // Store to output array: Skip activation: Accumulator row 1 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Skip activation: Accumulator row 1 oddments
+ "cbz x20, 20f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 20f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 20f\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 21f\n"
+ "b 28f\n"
+ "21:" // Store to output array: Skip activation: End
+ "cntw x23\n"
+ "cmp x25, x23\n"
+ "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 23f\n"
+ "22:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604357 // st1w { z23.s, z31.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 22b\n"
+ "23:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 24f\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 24f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 24f\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "24:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 28f\n"
+ "cmp x25, x23\n"
+ "csel x20, x25, x23, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 26f\n"
+ "25:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604353 // st1w { z19.s, z27.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 25b\n"
+ "26:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 27f\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 27f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 27f\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ "27:" // Store to output array: Accumulator row 1 oddments: End
+ "28:" // Store to output array: End
+ "tbz x16, #0, 30f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "29:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 29b\n"
+ "30:" // End block
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #2\n"
+ "cmp x11, x13\n"
+ "mov x10, #0x0\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
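
The activation switch in the KernelArgs constructor relies on a deliberate fall-through: BoundedReLU sets max = param1 and then drops into the ReLU case, which sets min = 0, while None leaves both bounds at +/-infinity so the fclamp in the store path clamps nothing. The same logic as a self-contained function (the Act struct is a stand-in for arm_gemm's Activation type):

    #include <limits>

    struct Act
    {
        enum class Type { None, ReLU, BoundedReLU } type;
        float param1; // upper bound for BoundedReLU
    };

    // Compute the {min, max} clamp bounds the kernel loads into z1/z0 (or z21/z20).
    void clamp_bounds(const Act &act, float &mn, float &mx)
    {
        mn = -std::numeric_limits<float>::infinity();
        mx =  std::numeric_limits<float>::infinity();
        switch (act.type)
        {
            case Act::Type::BoundedReLU:
                mx = act.param1;
                [[fallthrough]]; // a bounded ReLU is a ReLU with an upper bound
            case Act::Type::ReLU:
                mn = 0.0f;
                break;
            default: // Act::Type::None: infinite bounds make the later clamp a no-op
                break;
        }
    }
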
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..165e25dd8f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 1;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_fp32_mopa_4VLx1VL;
+
+ StdTransformsSME<operand_type, result_type, 4, 1, 1> transforms = {};
+
+ cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
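
All three fp32 MOPA variants in this patch (1VLx4VL, 2VLx2VL, 4VLx1VL) occupy the same four fp32 ZA tiles, i.e. 4 * VL * VL accumulators; they differ only in how those tiles are arranged over the M and N dimensions, so a driver can match the tile aspect ratio to the problem shape. A hypothetical selection heuristic, purely for illustration (the real choice is made by arm_gemm's kernel lists, not by code in this patch):

    #include <cstdio>

    // Tall problems favour more row tiles, wide problems more column tiles;
    // the thresholds here are arbitrary illustration, not the library's logic.
    static const char *pick_fp32_mopa_variant(unsigned M, unsigned N)
    {
        if (M >= 4 * N) return "sme2_interleaved_nomerge_fp32_mopa_4VLx1VL";
        if (N >= 4 * M) return "sme2_interleaved_nomerge_fp32_mopa_1VLx4VL";
        return "sme2_interleaved_nomerge_fp32_mopa_2VLx2VL";
    }

    int main()
    {
        std::printf("%s\n", pick_fp32_mopa_variant(1024, 64)); // tall-skinny
        std::printf("%s\n", pick_fp32_mopa_variant(64, 1024)); // short-wide
        std::printf("%s\n", pick_fp32_mopa_variant(256, 256)); // balanced
        return 0;
    }
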
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..ae1f812442
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const float *const A,
+ const float *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(K * sizeof(float)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ n_loops((K - 1) / 2), n_tail_iters((K - 1) % 2), // unused here; the assembly re-derives K / 4 and K % 4 itself
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const float *const A;
+ const float *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ "whilelt p0.s, x10, x9\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ "fmov z11.s, #1.0\n"
+ "ldnt1w { z13.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0x808d2560 // fmopa za0.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2561 // fmopa za1.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2562 // fmopa za2.s, p1/M, p1/M, z11.s, z13.s\n"
+ ".inst 0x808d2563 // fmopa za3.s, p1/M, p1/M, z11.s, z13.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20\n"
+ "incw x21, ALL, MUL #4\n"
+ "cmp x20, x9\n"
+ "csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "lsr x23, x20, #0x2\n"
+ "and x22, x20, #0x3\n"
+ "ldr x21, [%x[args], %[offsetof_B]]\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x21, x10, x20, x21\n" // bptr = B + n * kstride_bytes
+ "cbz x23, 8f\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xa140c360 // ld1w { z0.s, z4.s, z8.s, z12.s }, pn8.b/Z, [x27]\n"
+ "ldnt1w { z19.s }, p1/Z, [x21]\n"
+ ".inst 0xa141c371 // ld1w { z17.s, z21.s, z25.s, z29.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1w { z22.s }, p1/Z, [x21, #1, MUL VL]\n"
+ ".inst 0xa142c370 // ld1w { z16.s, z20.s, z24.s, z28.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1w { z23.s }, p1/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xa143c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ldnt1w { z2.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x80932400 // fmopa za0.s, p1/M, p1/M, z0.s, z19.s\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x80932481 // fmopa za1.s, p1/M, p1/M, z4.s, z19.s\n"
+ ".inst 0x80932502 // fmopa za2.s, p1/M, p1/M, z8.s, z19.s\n"
+ ".inst 0x80932583 // fmopa za3.s, p1/M, p1/M, z12.s, z19.s\n"
+ ".inst 0xa140c360 // ld1w { z0.s, z4.s, z8.s, z12.s }, pn8.b/Z, [x27]\n"
+ ".inst 0x80962620 // fmopa za0.s, p1/M, p1/M, z17.s, z22.s\n"
+ "ldnt1w { z19.s }, p1/Z, [x21]\n"
+ ".inst 0x809626a1 // fmopa za1.s, p1/M, p1/M, z21.s, z22.s\n"
+ ".inst 0x80962722 // fmopa za2.s, p1/M, p1/M, z25.s, z22.s\n"
+ ".inst 0x809627a3 // fmopa za3.s, p1/M, p1/M, z29.s, z22.s\n"
+ ".inst 0xa141c371 // ld1w { z17.s, z21.s, z25.s, z29.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0x80972600 // fmopa za0.s, p1/M, p1/M, z16.s, z23.s\n"
+ "ldnt1w { z22.s }, p1/Z, [x21, #1, MUL VL]\n"
+ ".inst 0x80972681 // fmopa za1.s, p1/M, p1/M, z20.s, z23.s\n"
+ ".inst 0x80972702 // fmopa za2.s, p1/M, p1/M, z24.s, z23.s\n"
+ ".inst 0x80972783 // fmopa za3.s, p1/M, p1/M, z28.s, z23.s\n"
+ ".inst 0xa142c370 // ld1w { z16.s, z20.s, z24.s, z28.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1w { z23.s }, p1/Z, [x21, #2, MUL VL]\n"
+ ".inst 0x80822460 // fmopa za0.s, p1/M, p1/M, z3.s, z2.s\n"
+ ".inst 0x808224e1 // fmopa za1.s, p1/M, p1/M, z7.s, z2.s\n"
+ ".inst 0x80822562 // fmopa za2.s, p1/M, p1/M, z11.s, z2.s\n"
+ ".inst 0x808225e3 // fmopa za3.s, p1/M, p1/M, z15.s, z2.s\n"
+ ".inst 0xa143c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ldnt1w { z2.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x80932400 // fmopa za0.s, p1/M, p1/M, z0.s, z19.s\n"
+ ".inst 0x80932481 // fmopa za1.s, p1/M, p1/M, z4.s, z19.s\n"
+ ".inst 0x80932502 // fmopa za2.s, p1/M, p1/M, z8.s, z19.s\n"
+ ".inst 0x80932583 // fmopa za3.s, p1/M, p1/M, z12.s, z19.s\n"
+ ".inst 0x80962620 // fmopa za0.s, p1/M, p1/M, z17.s, z22.s\n"
+ ".inst 0x809626a1 // fmopa za1.s, p1/M, p1/M, z21.s, z22.s\n"
+ ".inst 0x80962722 // fmopa za2.s, p1/M, p1/M, z25.s, z22.s\n"
+ ".inst 0x809627a3 // fmopa za3.s, p1/M, p1/M, z29.s, z22.s\n"
+ ".inst 0x80972600 // fmopa za0.s, p1/M, p1/M, z16.s, z23.s\n"
+ ".inst 0x80972681 // fmopa za1.s, p1/M, p1/M, z20.s, z23.s\n"
+ ".inst 0x80972702 // fmopa za2.s, p1/M, p1/M, z24.s, z23.s\n"
+ ".inst 0x80972783 // fmopa za3.s, p1/M, p1/M, z28.s, z23.s\n"
+ ".inst 0x80822460 // fmopa za0.s, p1/M, p1/M, z3.s, z2.s\n"
+ ".inst 0x808224e1 // fmopa za1.s, p1/M, p1/M, z7.s, z2.s\n"
+ ".inst 0x80822562 // fmopa za2.s, p1/M, p1/M, z11.s, z2.s\n"
+ ".inst 0x808225e3 // fmopa za3.s, p1/M, p1/M, z15.s, z2.s\n"
+ "8:" // K oddments
+ "cbz x22, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa140c373 // ld1w { z19.s, z23.s, z27.s, z31.s }, pn8.b/Z, [x27]\n"
+ "subs x22, x22, #0x1\n"
+ "addvl x27, x27, #4\n"
+ "ld1w { z11.s }, p1/Z, [x21]\n"
+ "addvl x21, x21, #1\n"
+ ".inst 0x808b2660 // fmopa za0.s, p1/M, p1/M, z19.s, z11.s\n"
+ ".inst 0x808b26e1 // fmopa za1.s, p1/M, p1/M, z23.s, z11.s\n"
+ ".inst 0x808b2762 // fmopa za2.s, p1/M, p1/M, z27.s, z11.s\n"
+ ".inst 0x808b27e3 // fmopa za3.s, p1/M, p1/M, z31.s, z11.s\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 42f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1cc // st1w { z12.s-z15.s }, pn8.b, [x14]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 42f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10, LSL #2\n" // C += n
+ "sub x25, x13, x11\n"
+ "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x26, x11, x24, x26\n" // C += m * ldc
+ "tbz x16, #2, 27f\n"
+ "cntw x23\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z5.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z7.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x20, 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 17f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z5.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 17f\n"
+ "st1w { z6.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 19f\n"
+ "18:" // Store to output array: Skip activation: Accumulator row 1 loop
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ "st1w { z8.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z9.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z10.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z11.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Skip activation: Accumulator row 1 oddments
+ "cbz x20, 20f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ "st1w { z24.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 20f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 20f\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 22f\n"
+ "21:" // Store to output array: Skip activation: Accumulator row 2 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z5.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z7.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 21b\n"
+ "22:" // Store to output array: Skip activation: Accumulator row 2 oddments
+ "cbz x20, 23f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ "st1w { z12.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 23f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z13.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 23f\n"
+ "st1w { z14.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "23:" // Store to output array: Skip activation: Accumulator row 2 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 25f\n"
+ "24:" // Store to output array: Skip activation: Accumulator row 3 loop
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z19.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 24b\n"
+ "25:" // Store to output array: Skip activation: Accumulator row 3 oddments
+ "cbz x20, 26f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 26f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 26f\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "26:" // Store to output array: Skip activation: Accumulator row 3 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "b 40f\n"
+ "27:" // Store to output array: Skip activation: End
+ "cntw x23\n"
+ "cmp x25, x23\n"
+ "ld1rw { z21.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "ld1rw { z20.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 29f\n"
+ "28:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1w { z28.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z29.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z30.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z31.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 28b\n"
+ "29:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 30f\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1w { z28.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 30f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z29.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 30f\n"
+ "st1w { z30.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "30:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 40f\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 32f\n"
+ "31:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1w { z4.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z5.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z7.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 31b\n"
+ "32:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 33f\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 33f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 33f\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "33:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 40f\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 35f\n"
+ "34:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z19.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 34b\n"
+ "35:" // Store to output array: Accumulator row 2 oddments
+ "cbz x20, 36f\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 36f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 36f\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "36:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 40f\n"
+ "cmp x25, x23\n"
+ "csel x20, x25, x23, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 38f\n"
+ "37:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z19.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 37b\n"
+ "38:" // Store to output array: Accumulator row 3 oddments
+ "cbz x20, 39f\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 39f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 39f\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "39:" // Store to output array: Accumulator row 3 oddments: End
+ "40:" // Store to output array: End
+ "tbz x16, #0, 42f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "41:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 41b\n"
+ "42:" // End block
+ "incw x10\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #4\n"
+ "cmp x11, x13\n"
+ "mov x10, #0x0\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..7b3cc77867
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 1;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 4;
+ }
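+
+ // One invocation therefore produces a tile of 1 vector of rows by 4
+ // vectors of columns; e.g. a 512-bit SVE implementation (16 int32 lanes
+ // per vector) yields a 16 x 64 output tile.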
+
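+ // K is consumed in blocks of four: the int8 SMOPA outer product
+ // accumulates a 4-way dot product into each int32 ZA element.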
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8q_mopa_1VLx4VL;
+
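+ // The transform parameters mirror the blocking above (1 VL of rows, 4 VLs
+ // of columns, k_unroll of 4); the trailing flag presumably asks the
+ // interleave transforms to append the row sums consumed by this quantised
+ // kernel.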
+ StdTransformsSME<operand_type, result_type, 1, 4, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..aba677b158
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
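+
+ // The assembly tests these bits directly: bit 0 seeds ZA from
+ // accumulator_buffer (and refills it after each block is stored out),
+ // bit 1 writes the raw int32 accumulators to the partial result buffer
+ // instead of C, and bit 2 selects per-channel rather than per-layer
+ // requantisation parameters.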
+
+ const int8_t *const A;
+ const int8_t *const B;
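+ // Stride between consecutive output columns of B's interleaved panel;
+ // K is rounded up to a whole number of 4-byte dot-product blocks.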
+ const long kstride_bytes;
+ int8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<int8_t>::min();
+ int32_t max = std::numeric_limits<int8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
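+ // Shape of the assembly below: SMSTART enables the ZA array, whose four
+ // tiles hold one 1VL x 4VL block of int32 accumulators. Each M/N block
+ // optionally seeds the tiles with the bias (ADDHA), runs ceil(K/4)
+ // 4-byte dot-product blocks through SMOPA, unrolled by four (e.g. K = 57
+ // gives 15 blocks: 3 unrolled iterations plus 3 oddments), folds in the
+ // row-sum correction (ADDVA), then requantises via SQDMULH and SRSHL,
+ // adds the output offset, clamps with SCLAMP and narrows to int8 with
+ // UZP1 before the store.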
+ __asm__ __volatile__(
+ "ldr x14, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x14, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa042c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x13, x13, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w10, [%x[args], %[offsetof_M]]\n"
+ "mov x9, #0x0\n"
+ "mov x28, #0x0\n"
+ "ldr w27, [%x[args], %[offsetof_N]]\n"
+ "ldr x26, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x25, x26\n"
+ ".inst 0x25bb6790 // whilelt pn8.s, x28, x27, VLx4\n"
+ "tbnz x14, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ ".inst 0xa11cc289 // ldnt1w { z1.s, z5.s, z9.s, z13.s }, p8/Z, [x20, x28, LSL #2]\n"
+ ".inst 0xc0902420 // addha za0.s, p1/M, p1/M, z1.s\n"
+ ".inst 0xc09024a1 // addha za1.s, p1/M, p1/M, z5.s\n"
+ ".inst 0xc0902522 // addha za2.s, p1/M, p1/M, z9.s\n"
+ ".inst 0xc09025a3 // addha za3.s, p1/M, p1/M, z13.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x28\n"
+ "mov x21, x9\n"
+ "incw x20, ALL, MUL #4\n"
+ "incw x21\n"
+ "cmp x20, x27\n"
+ "csel x21, x9, x21, LT\n"
+ "mov x20, x14\n"
+ "bfm x14, XZR, #0x0, #0x0 // bfc x14, #0x0, #0x1\n"
+ "cmp x21, x10\n"
+ "csel x14, x20, x14, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "add x20, x20, #0x3\n"
+ "lsr x20, x20, #0x2\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x28, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ "ld1b { z20.b }, p1/Z, [x25]\n"
+ ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa0842680 // smopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa0862682 // smopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+ ".inst 0xa0872683 // smopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+ "ld1b { z20.b }, p1/Z, [x25]\n"
+ ".inst 0xa0982560 // smopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+ ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa0992561 // smopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa09a2562 // smopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+ ".inst 0xa09b2563 // smopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+ "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa09c2440 // smopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa09d2441 // smopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+ ".inst 0xa09e2442 // smopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+ ".inst 0xa09f2443 // smopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+ "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xa09025c0 // smopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+ ".inst 0xa09125c1 // smopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+ ".inst 0xa09225c2 // smopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+ ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+ "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa0842680 // smopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
+ ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa0862682 // smopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+ ".inst 0xa0872683 // smopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+ ".inst 0xa0982560 // smopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+ ".inst 0xa0992561 // smopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa09a2562 // smopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+ ".inst 0xa09b2563 // smopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+ ".inst 0xa09c2440 // smopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+ ".inst 0xa09d2441 // smopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+ ".inst 0xa09e2442 // smopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+ ".inst 0xa09f2443 // smopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+ ".inst 0xa09025c0 // smopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+ ".inst 0xa09125c1 // smopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+ ".inst 0xa09225c2 // smopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+ ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1b { z16.b }, p1/Z, [x25]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x25, x25, #1\n"
+ ".inst 0xa04086e4 // ld1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ "addvl x23, x23, #4\n"
+ ".inst 0xa0842600 // smopa za0.s, p1/M, p1/M, z16.b, z4.b\n"
+ ".inst 0xa0852601 // smopa za1.s, p1/M, p1/M, z16.b, z5.b\n"
+ ".inst 0xa0862602 // smopa za2.s, p1/M, p1/M, z16.b, z6.b\n"
+ ".inst 0xa0872603 // smopa za3.s, p1/M, p1/M, z16.b, z7.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "ld1w { z15.s }, p1/Z, [x25]\n"
+ "addvl x25, x25, #1\n"
+ ".inst 0xc09125e0 // addva za0.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e1 // addva za1.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
+ "tbz x14, #1, 14f\n"
+ "tbz x14, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c578 // st1w { z24.s-z27.s }, pn9.b, [x11]\n"
+ "addvl x13, x13, #16\n"
+ ".inst 0xa061c564 // st1w { z4.s-z7.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa062c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c560 // st1w { z0.s-z3.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "blt 11b\n"
+ "b 21f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa060c564 // st1w { z4.s-z7.s }, pn9.b, [x11]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa061c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c56c // st1w { z12.s-z15.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "blt 13b\n"
+ "b 21f\n"
+ "14:" // Store to output array
+ "ldr x24, [%x[args], %[offsetof_C]]\n"
+ "add x24, x24, x28\n" // C += n
+ "sub x23, x10, x9\n"
+ "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x24, x9, x22, x24\n" // C += m * ldc
+ "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x14, #2, 15f\n"
+ "ldr w21, [%x[args], %[offsetof_n_0]]\n"
+ "add x21, x21, x28\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa040c284 // ld1w { z4.s-z7.s }, p8/Z, [x20]\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa040c28c // ld1w { z12.s-z15.s }, p8/Z, [x20]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x20\n"
+ "whilelt p0.b, x28, x27\n"
+ "cmp x23, x20\n"
+ "csel x20, x23, x20, LT\n"
+ "lsr x21, x20, #0x1\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x1\n"
+ "cbz x21, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086001a // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc086005c // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1a4a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
+ ".inst 0xc0860096 // mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600d0 // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1a5a41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
+ ".inst 0xc1a6a416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x21, LSL #1\n"
+ ".inst 0xc1a7a410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
+ ".inst 0xc1aca23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+ ".inst 0xc1ada23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
+ ".inst 0xc1aea236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+ ".inst 0xc1afa230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
+ ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+ ".inst 0xc1a0a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z0.s\n"
+ ".inst 0xc1a0a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z0.s\n"
+ ".inst 0xc1a0a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z0.s\n"
+ ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6bc // sclamp { z28.s-z29.s }, z21.s, z20.s\n"
+ "uzp1 z19.b, z26.b, z28.b\n"
+ ".inst 0xc1b4c6b6 // sclamp { z22.s-z23.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z22.b, z16.b\n"
+ "uzp1 z18.b, z27.b, z29.b\n"
+ "uzp1 z17.b, z23.b, z17.b\n"
+ "uzp1 z16.b, z19.b, z16.b\n"
+ "st1b { z16.b }, p0, [x24]\n"
+ "add x24, x24, x22\n"
+ "uzp1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p0, [x24]\n"
+ "add x24, x24, x22\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 18f\n"
+ ".inst 0xc086000a // mova { z10.s-z11.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc0860058 // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1a4a40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z4.s\n"
+ ".inst 0xc086009a // mova { z26.s-z27.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600de // mova { z30.s-z31.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1a5a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
+ ".inst 0xc1a6a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z6.s\n"
+ ".inst 0xc1a7a41e // sqdmulh { z30.s-z31.s }, { z30.s-z31.s }, z7.s\n"
+ ".inst 0xc1aca22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z12.s\n"
+ ".inst 0xc1ada238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
+ ".inst 0xc1aea23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z14.s\n"
+ ".inst 0xc1afa23e // srshl { z30.s-z31.s }, { z30.s-z31.s }, z15.s\n"
+ ".inst 0xc1a0a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z0.s\n"
+ ".inst 0xc1a0a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
+ ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+ ".inst 0xc1a0a31e // add { z30.s-z31.s }, { z30.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4c6aa // sclamp { z10.s-z11.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
+ "uzp1 z17.b, z10.b, z24.b\n"
+ ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6be // sclamp { z30.s-z31.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z26.b, z30.b\n"
+ "uzp1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p0, [x24]\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "19:" // Store to output array: End
+ "tbz x14, #0, 21f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "20:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x13, x13, #16\n"
+ "blt 20b\n"
+ "21:" // End block
+ "incw x28, ALL, MUL #4\n"
+ "cmp x28, x27\n"
+ "blt 3b\n"
+ "incw x9\n"
+ "cmp x9, x10\n"
+ "mov x28, #0x0\n"
+ "mov x26, x25\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..79990f72e5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8q_mopa_2VLx2VL;
+
+ StdTransformsSME<operand_type, result_type, 2, 2, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..7033de5fe3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ int8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<int8_t>::min();
+ int32_t max = std::numeric_limits<int8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
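+ // Same overall scheme as the 1VLx4VL variant, but the four ZA tiles are
+ // arranged as a 2 x 2 grid: za0/za1 hold the two column blocks of the
+ // first row block, za2/za3 those of the second. The row-sum correction is
+ // accordingly two vectors (z14 for row block 0, z15 for row block 1), and
+ // paired column tiles are narrowed together with UZP1 before each byte
+ // store.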
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ ".inst 0x25a94550 // whilelt pn8.s, x10, x9, VLx2\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ ".inst 0xa00a4299 // ldnt1w { z24.s-z25.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n"
+ ".inst 0xc0902702 // addha za2.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902723 // addha za3.s, p1/M, p1/M, z25.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20, ALL, MUL #2\n"
+ "incw x21, ALL, MUL #2\n"
+ "cmp x20, x9\n"
+ "csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "add x20, x20, #0x3\n"
+ "lsr x20, x20, #0x2\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa0912460 // smopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0992461 // smopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+ ".inst 0xa0912562 // smopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+ ".inst 0xa0992563 // smopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa0962680 // smopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+ ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa0972681 // smopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+ ".inst 0xa0962782 // smopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+ ".inst 0xa0972783 // smopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+ ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa09026a0 // smopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+ ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa09826a1 // smopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa09027a2 // smopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+ ".inst 0xa09827a3 // smopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+ ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa08724a0 // smopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+ ".inst 0xa08f24a1 // smopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+ ".inst 0xa08725a2 // smopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+ ".inst 0xa08f25a3 // smopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
+ ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa0912460 // smopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
+ ".inst 0xa0992461 // smopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+ ".inst 0xa0912562 // smopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+ ".inst 0xa0992563 // smopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa0962680 // smopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+ ".inst 0xa0972681 // smopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+ ".inst 0xa0962782 // smopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+ ".inst 0xa0972783 // smopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+ ".inst 0xa09026a0 // smopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+ ".inst 0xa09826a1 // smopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa09027a2 // smopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+ ".inst 0xa09827a3 // smopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+ ".inst 0xa08724a0 // smopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+ ".inst 0xa08f24a1 // smopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+ ".inst 0xa08725a2 // smopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+ ".inst 0xa08f25a3 // smopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1400773 // ld1b { z19.b, z27.b }, pn9.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0xa04006f0 // ld1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
+ "addvl x23, x23, #2\n"
+ ".inst 0xa0902660 // smopa za0.s, p1/M, p1/M, z19.b, z16.b\n"
+ ".inst 0xa0912661 // smopa za1.s, p1/M, p1/M, z19.b, z17.b\n"
+ ".inst 0xa0902762 // smopa za2.s, p1/M, p1/M, z27.b, z16.b\n"
+ ".inst 0xa0912763 // smopa za3.s, p1/M, p1/M, z27.b, z17.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ ".inst 0xa040476e // ld1w { z14.s-z15.s }, pn9.b/Z, [x27]\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 24f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 24f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10\n" // C += n
+ "sub x25, x13, x11\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x26, x11, x24, x26\n" // C += m * ldc
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x16, #2, 15f\n"
+ "ldr w21, [%x[args], %[offsetof_n_0]]\n"
+ "add x21, x21, x10\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa0404280 // ld1w { z0.s-z1.s }, p8/Z, [x20]\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa0404282 // ld1w { z2.s-z3.s }, p8/Z, [x20]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x23\n"
+ "whilelt p0.h, x10, x9\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z8.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "uzp1 z16.h, z5.h, z9.h\n"
+ "uzp1 z17.h, z6.h, z10.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "uzp1 z16.h, z7.h, z11.h\n"
+ "st1b { z17.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 18f\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1a2aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ ".inst 0xc1a3aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z8.h, z4.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 18f\n"
+ "subs x20, x20, #0x1\n"
+ "uzp1 z16.h, z9.h, z5.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 18f\n"
+ "uzp1 z16.h, z10.h, z6.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 22f\n"
+ "whilelt p0.h, x10, x9\n"
+ "cmp x25, x23\n"
+ "csel x20, x25, x23, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z1.s\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z20.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "uzp1 z16.h, z5.h, z21.h\n"
+ "uzp1 z17.h, z6.h, z22.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "uzp1 z16.h, z7.h, z23.h\n"
+ "st1b { z17.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 21f\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z16.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 21f\n"
+ "subs x20, x20, #0x1\n"
+ "uzp1 z16.h, z5.h, z17.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 21f\n"
+ "uzp1 z16.h, z6.h, z18.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "22:" // Store to output array: End
+ "tbz x16, #0, 24f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "23:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 23b\n"
+ "24:" // End block
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #2\n"
+ "cmp x11, x13\n"
+ "mov x10, #0x0\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..ef39cbbb28
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 1;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8q_mopa_4VLx1VL;
+
+ StdTransformsSME<operand_type, result_type, 4, 1, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
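
Note: out_height() and out_width() above scale with the runtime SVE vector length; the 4VLx1VL blocking yields an output tile four vectors tall and one vector wide, measured in int32 accumulator lanes. A small sketch of that arithmetic (vl_bytes and lanes_s32 are illustrative names, not library API):

    #include <cstdint>

    // Illustrative: int32 lanes per SVE vector for a given vector length in bytes,
    // standing in for sme::get_vector_length<int32_t>().
    constexpr unsigned lanes_s32(unsigned vl_bytes) { return vl_bytes / sizeof(int32_t); }

    // 4VLx1VL: the ZA tile grid covers (4 * lanes) rows by (1 * lanes) columns.
    constexpr unsigned tile_height(unsigned vl_bytes) { return 4 * lanes_s32(vl_bytes); }
    constexpr unsigned tile_width(unsigned vl_bytes)  { return 1 * lanes_s32(vl_bytes); }

    static_assert(tile_height(64) == 64 && tile_width(64) == 16, "512-bit SVE example");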
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..4601f05501
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2), // derived loop counts kept for reference; the asm recomputes its own counts from K
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ int8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<int8_t>::min();
+ int32_t max = std::numeric_limits<int8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ "whilelt p0.s, x10, x9\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ "ldnt1w { z8.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902500 // addha za0.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902501 // addha za1.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902502 // addha za2.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902503 // addha za3.s, p1/M, p1/M, z8.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20\n"
+ "incw x21, ALL, MUL #4\n"
+ "cmp x20, x9\n"
+ "csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "add x20, x20, #0x3\n"
+ "lsr x20, x20, #0x2\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ "ldnt1b { z14.b }, p1/Z, [x23]\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa08e2480 // smopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa08e24a1 // smopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+ ".inst 0xa08e24c2 // smopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+ ".inst 0xa08e24e3 // smopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+ ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa09f2680 // smopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+ "ldnt1b { z14.b }, p1/Z, [x23]\n"
+ ".inst 0xa09f26a1 // smopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+ ".inst 0xa09f26c2 // smopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+ ".inst 0xa09f26e3 // smopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa08d2700 // smopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+ "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa08d2721 // smopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+ ".inst 0xa08d2742 // smopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+ ".inst 0xa08d2763 // smopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+ ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa09d2500 // smopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+ ".inst 0xa09d2521 // smopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+ ".inst 0xa09d2542 // smopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa09d2563 // smopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
+ ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa08e2480 // smopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
+ ".inst 0xa08e24a1 // smopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+ ".inst 0xa08e24c2 // smopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+ ".inst 0xa08e24e3 // smopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+ ".inst 0xa09f2680 // smopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+ ".inst 0xa09f26a1 // smopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+ ".inst 0xa09f26c2 // smopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+ ".inst 0xa09f26e3 // smopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+ ".inst 0xa08d2700 // smopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+ ".inst 0xa08d2721 // smopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+ ".inst 0xa08d2742 // smopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+ ".inst 0xa08d2763 // smopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+ ".inst 0xa09d2500 // smopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+ ".inst 0xa09d2521 // smopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+ ".inst 0xa09d2542 // smopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa09d2563 // smopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x27, x27, #4\n"
+ "ld1b { z15.b }, p1/Z, [x23]\n"
+ "addvl x23, x23, #1\n"
+ ".inst 0xa08f2640 // smopa za0.s, p1/M, p1/M, z18.b, z15.b\n"
+ ".inst 0xa08f26c1 // smopa za1.s, p1/M, p1/M, z22.b, z15.b\n"
+ ".inst 0xa08f2742 // smopa za2.s, p1/M, p1/M, z26.b, z15.b\n"
+ ".inst 0xa08f27c3 // smopa za3.s, p1/M, p1/M, z30.b, z15.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ ".inst 0xa140c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27]\n"
+ "addvl x27, x27, #4\n"
+ ".inst 0xc0912460 // addva za0.s, p1/M, p1/M, z3.s\n"
+ ".inst 0xc09124e1 // addva za1.s, p1/M, p1/M, z7.s\n"
+ ".inst 0xc0912562 // addva za2.s, p1/M, p1/M, z11.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 30f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 30f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10\n" // C += n
+ "sub x25, x13, x11\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x26, x11, x24, x26\n" // C += m * ldc
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x16, #2, 15f\n"
+ "ldr w21, [%x[args], %[offsetof_n_0]]\n"
+ "add x21, x21, x10\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x23\n"
+ "whilelt p0.s, x10, x9\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+ ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1b { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z19.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 18f\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+ ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1b { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 18f\n"
+ "subs x20, x20, #0x1\n"
+ "st1b { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 18f\n"
+ "st1b { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 28f\n"
+ "whilelt p0.s, x10, x9\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1b { z4.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z5.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z6.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z7.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 21f\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1b { z4.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 21f\n"
+ "subs x20, x20, #0x1\n"
+ "st1b { z5.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 21f\n"
+ "st1b { z6.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 28f\n"
+ "whilelt p0.s, x10, x9\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 23f\n"
+ "22:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc1a0ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+ ".inst 0xc1b4cea8 // sclamp { z8.s-z11.s }, z21.s, z20.s\n"
+ "st1b { z8.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z9.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z10.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z11.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 22b\n"
+ "23:" // Store to output array: Accumulator row 2 oddments
+ "cbz x20, 24f\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a0ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1b4ceac // sclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ "st1b { z12.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 24f\n"
+ "subs x20, x20, #0x1\n"
+ "st1b { z13.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 24f\n"
+ "st1b { z14.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "24:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 28f\n"
+ "whilelt p0.s, x10, x9\n"
+ "cmp x25, x23\n"
+ "csel x20, x25, x23, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 26f\n"
+ "25:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1b { z28.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z29.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z30.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z31.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 25b\n"
+ "26:" // Store to output array: Accumulator row 3 oddments
+ "cbz x20, 27f\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1b { z28.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 27f\n"
+ "subs x20, x20, #0x1\n"
+ "st1b { z29.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 27f\n"
+ "st1b { z30.s }, p0, [x26]\n"
+ "27:" // Store to output array: Accumulator row 3 oddments: End
+ "28:" // Store to output array: End
+ "tbz x16, #0, 30f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "29:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 29b\n"
+ "30:" // End block
+ "incw x10\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #4\n"
+ "cmp x11, x13\n"
+ "mov x10, #0x0\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
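
Note: in the "Store to output array" sections above, each int32 accumulator is requantized to int8 via SQDMULH by the multiplier in z2, SRSHL by the shift in z1, an add of c_offset, and SCLAMP to [minval, maxval]; when flags bit 2 is set, z2/z1 are reloaded per column block from per_channel_muls / per_channel_right_shifts. A hedged scalar model of one lane of that sequence (a sketch, not the kernel itself):

    #include <algorithm>
    #include <cstdint>

    // Scalar model of SQDMULH -> SRSHL -> ADD c_offset -> SCLAMP as used above.
    inline int8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                                  int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // SQDMULH: saturating doubling multiply keeping the high half,
        // equivalent to sat((acc * mul) >> 31) in 64-bit arithmetic.
        const int64_t prod = static_cast<int64_t>(acc) * mul;
        int32_t hi = static_cast<int32_t>(
            std::clamp<int64_t>(prod >> 31, INT32_MIN, INT32_MAX));
        // SRSHL shifts left by its register operand, so a negative value acts as a
        // rounding arithmetic right shift -- how a right shift is expressed here.
        if (shift < 0) {
            const int32_t s = -shift;
            hi = static_cast<int32_t>(
                (static_cast<int64_t>(hi) + (int64_t{1} << (s - 1))) >> s);
        }
        const int64_t out = static_cast<int64_t>(hi) + c_offset;
        return static_cast<int8_t>(std::clamp<int64_t>(out, minval, maxval));
    }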
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..7792192856
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, float *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const DequantizeFloat &dq, const float *const late_bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, float *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const DequantizeFloat &dq, const float *const late_bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 1;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 4> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
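
Note: the class above is metadata plus a function pointer; the entry point itself is callable through the kern_type signature. A minimal invocation sketch, assuming arm_gemm.hpp declares DequantizeFloat and Activation and that A_panel/B_panel are already in the interleaved layout produced by StdTransformsSME (all names here are illustrative):

    #include "arm_gemm.hpp"  // assumed to declare DequantizeFloat and Activation
    #include <cstdint>

    // Illustrative wrapper only: production callers go through arm_gemm's GEMM
    // machinery, which sizes and fills the interleaved A/B panels.
    void run_tile_example(const int8_t *A_panel, const int8_t *B_panel, float *C,
                          int ldc, int M, int N, int K, const int32_t *bias,
                          const arm_gemm::DequantizeFloat &dq, const float *late_bias,
                          arm_gemm::Activation act, int32_t *acc_buf)
    {
        // accumulate = false: ZA is filled from scratch (or from bias) on this pass.
        arm_gemm::sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL(
            A_panel, B_panel, C, ldc, M, N, K, bias, dq, late_bias, act,
            /*accumulate=*/false, acc_buf);
    }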
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..4b26a6578c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, float *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const DequantizeFloat &dq, const float *const late_bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias, const float *const late_bias, const Activation act,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias), late_bias(late_bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+
+ // Initialise the activation clamp bounds (BoundedReLU sets max, then falls through to set min)
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const int32_t *const bias;
+ const float *const late_bias;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, late_bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x13, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x10, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x13, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xa041c560 // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa042c578 // ld1w { z24.s-z27.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa043c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ "addvl x11, x11, #16\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w9, [%x[args], %[offsetof_M]]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldr w26, [%x[args], %[offsetof_N]]\n"
+ "ldr x25, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x24, x25\n"
+ ".inst 0x25ba6770 // whilelt pn8.s, x27, x26, VLx4\n"
+ "tbnz x13, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ ".inst 0xa01bc288 // ld1w { z8.s-z11.s }, p8/Z, [x20, x27, LSL #2]\n"
+ ".inst 0xc0900100 // addha za0.s, p0/M, p0/M, z8.s\n"
+ ".inst 0xc0900121 // addha za1.s, p0/M, p0/M, z9.s\n"
+ ".inst 0xc0900142 // addha za2.s, p0/M, p0/M, z10.s\n"
+ ".inst 0xc0900163 // addha za3.s, p0/M, p0/M, z11.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x27\n"
+ "mov x21, x28\n"
+ "incw x20, ALL, MUL #4\n"
+ "incw x21\n"
+ "cmp x20, x26\n"
+ "mov x20, x13\n"
+ "csel x21, x28, x21, LT\n"
+ "bfm x13, XZR, #0x0, #0x0 // bfc x13, #0x0, #0x1\n"
+ "cmp x21, x9\n"
+ "csel x13, x20, x13, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "add x20, x20, #0x3\n"
+ "lsr x20, x20, #0x2\n"
+ "lsr x21, x20, #0x2\n"
+ "madd x23, x27, x22, x23\n" // bptr = B + n * kstride_bytes
+ "and x20, x20, #0x3\n"
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z31.b }, p0/Z, [x24]\n"
+ ".inst 0xa04086e8 // ld1b { z8.b-z11.b }, pn9.b/Z, [x23]\n"
+ "ld1b { z1.b }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa04186e4 // ld1b { z4.b-z7.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1b { z0.b }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286ec // ld1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1b { z3.b }, p0/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa04386f0 // ld1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa08803e0 // smopa za0.s, p0/M, p0/M, z31.b, z8.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa08903e1 // smopa za1.s, p0/M, p0/M, z31.b, z9.b\n"
+ ".inst 0xa08a03e2 // smopa za2.s, p0/M, p0/M, z31.b, z10.b\n"
+ ".inst 0xa08b03e3 // smopa za3.s, p0/M, p0/M, z31.b, z11.b\n"
+ "ld1b { z31.b }, p0/Z, [x24]\n"
+ ".inst 0xa0840020 // smopa za0.s, p0/M, p0/M, z1.b, z4.b\n"
+ ".inst 0xa04086e8 // ld1b { z8.b-z11.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa0850021 // smopa za1.s, p0/M, p0/M, z1.b, z5.b\n"
+ ".inst 0xa0860022 // smopa za2.s, p0/M, p0/M, z1.b, z6.b\n"
+ ".inst 0xa0870023 // smopa za3.s, p0/M, p0/M, z1.b, z7.b\n"
+ "ld1b { z1.b }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa08c0000 // smopa za0.s, p0/M, p0/M, z0.b, z12.b\n"
+ ".inst 0xa04186e4 // ld1b { z4.b-z7.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa08d0001 // smopa za1.s, p0/M, p0/M, z0.b, z13.b\n"
+ ".inst 0xa08e0002 // smopa za2.s, p0/M, p0/M, z0.b, z14.b\n"
+ ".inst 0xa08f0003 // smopa za3.s, p0/M, p0/M, z0.b, z15.b\n"
+ "ld1b { z0.b }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286ec // ld1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xa0900060 // smopa za0.s, p0/M, p0/M, z3.b, z16.b\n"
+ ".inst 0xa0910061 // smopa za1.s, p0/M, p0/M, z3.b, z17.b\n"
+ ".inst 0xa0920062 // smopa za2.s, p0/M, p0/M, z3.b, z18.b\n"
+ ".inst 0xa0930063 // smopa za3.s, p0/M, p0/M, z3.b, z19.b\n"
+ "ld1b { z3.b }, p0/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa04386f0 // ld1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa08803e0 // smopa za0.s, p0/M, p0/M, z31.b, z8.b\n"
+ ".inst 0xa08903e1 // smopa za1.s, p0/M, p0/M, z31.b, z9.b\n"
+ ".inst 0xa08a03e2 // smopa za2.s, p0/M, p0/M, z31.b, z10.b\n"
+ ".inst 0xa08b03e3 // smopa za3.s, p0/M, p0/M, z31.b, z11.b\n"
+ ".inst 0xa0840020 // smopa za0.s, p0/M, p0/M, z1.b, z4.b\n"
+ ".inst 0xa0850021 // smopa za1.s, p0/M, p0/M, z1.b, z5.b\n"
+ ".inst 0xa0860022 // smopa za2.s, p0/M, p0/M, z1.b, z6.b\n"
+ ".inst 0xa0870023 // smopa za3.s, p0/M, p0/M, z1.b, z7.b\n"
+ ".inst 0xa08c0000 // smopa za0.s, p0/M, p0/M, z0.b, z12.b\n"
+ ".inst 0xa08d0001 // smopa za1.s, p0/M, p0/M, z0.b, z13.b\n"
+ ".inst 0xa08e0002 // smopa za2.s, p0/M, p0/M, z0.b, z14.b\n"
+ ".inst 0xa08f0003 // smopa za3.s, p0/M, p0/M, z0.b, z15.b\n"
+ ".inst 0xa0900060 // smopa za0.s, p0/M, p0/M, z3.b, z16.b\n"
+ ".inst 0xa0910061 // smopa za1.s, p0/M, p0/M, z3.b, z17.b\n"
+ ".inst 0xa0920062 // smopa za2.s, p0/M, p0/M, z3.b, z18.b\n"
+ ".inst 0xa0930063 // smopa za3.s, p0/M, p0/M, z3.b, z19.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1b { z18.b }, p0/Z, [x24]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x24, x24, #1\n"
+ ".inst 0xa04086fc // ld1b { z28.b-z31.b }, pn9.b/Z, [x23]\n"
+ "addvl x23, x23, #4\n"
+ ".inst 0xa09c0240 // smopa za0.s, p0/M, p0/M, z18.b, z28.b\n"
+ ".inst 0xa09d0241 // smopa za1.s, p0/M, p0/M, z18.b, z29.b\n"
+ ".inst 0xa09e0242 // smopa za2.s, p0/M, p0/M, z18.b, z30.b\n"
+ ".inst 0xa09f0243 // smopa za3.s, p0/M, p0/M, z18.b, z31.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x13, #1, 14f\n"
+ "tbz x13, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c560 // ld1w { z0.s-z3.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa041c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c578 // ld1w { z24.s-z27.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa043c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
+ "addvl x11, x11, #16\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa060c548 // st1w { z8.s-z11.s }, pn9.b, [x10]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa061c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa062c544 // st1w { z4.s-z7.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ "cmp x12, x20\n"
+ ".inst 0xa063c550 // st1w { z16.s-z19.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 11b\n"
+ "b 21f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa060c544 // st1w { z4.s-z7.s }, pn9.b, [x10]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa061c550 // st1w { z16.s-z19.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 13b\n"
+ "b 21f\n"
+ "14:" // Store to output array
+ "ldr x23, [%x[args], %[offsetof_C]]\n"
+ "sub x21, x9, x28\n"
+ "ld1rw { z18.s }, p0/Z, [%x[dq], %[offset_DequantizeFloat_scale]]\n"
+ "fmov z20.s, #0x0\n"
+ "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
+ "fmov z21.s, #0x0\n"
+ "fmov z22.s, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_late_bias]]\n"
+ "fmov z23.s, #0x0\n"
+ "add x23, x23, x27, LSL #2\n" // C += n
+ "madd x23, x28, x22, x23\n" // C += m * ldc
+ "cbz x20, 15f\n"
+ "add x20, x20, x27, LSL #2\n"
+ ".inst 0xa040c294 // ld1w { z20.s-z23.s }, p8/Z, [x20]\n"
+ "15:" // Store to output array: no late bias
+ "cntw x20\n"
+ "ld1rw { z17.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "mov x12, #0x0\n"
+ "cmp x21, x20\n"
+ "ld1rw { z16.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "csel x20, x21, x20, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xc132e000 // scvtf { z0.s-z3.s }, { z0.s-z3.s }\n"
+ ".inst 0xc132e084 // scvtf { z4.s-z7.s }, { z4.s-z7.s }\n"
+ ".inst 0xc132e108 // scvtf { z8.s-z11.s }, { z8.s-z11.s }\n"
+ ".inst 0xc132e18c // scvtf { z12.s-z15.s }, { z12.s-z15.s }\n"
+ "fmad z0.s, p0/M, z18.s, z20.s\n"
+ "fmad z1.s, p0/M, z18.s, z20.s\n"
+ "fmad z2.s, p0/M, z18.s, z20.s\n"
+ "fmad z3.s, p0/M, z18.s, z20.s\n"
+ "add x12, x12, #0x4\n"
+ "fmad z4.s, p0/M, z18.s, z21.s\n"
+ "fmad z5.s, p0/M, z18.s, z21.s\n"
+ "cmp x12, x21, LSL #2\n"
+ "fmad z6.s, p0/M, z18.s, z21.s\n"
+ "fmad z7.s, p0/M, z18.s, z21.s\n"
+ "fmad z8.s, p0/M, z18.s, z22.s\n"
+ "fmad z9.s, p0/M, z18.s, z22.s\n"
+ "fmad z10.s, p0/M, z18.s, z22.s\n"
+ "fmad z11.s, p0/M, z18.s, z22.s\n"
+ "fmad z12.s, p0/M, z18.s, z23.s\n"
+ "fmad z13.s, p0/M, z18.s, z23.s\n"
+ "fmad z14.s, p0/M, z18.s, z23.s\n"
+ "fmad z15.s, p0/M, z18.s, z23.s\n"
+ ".inst 0xc1b0ca20 // fclamp { z0.s-z3.s }, z17.s, z16.s\n"
+ ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+ ".inst 0xc1b0ca28 // fclamp { z8.s-z11.s }, z17.s, z16.s\n"
+ ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+ ".inst 0xa160c2e0 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x23]\n"
+ "add x23, x23, x22\n"
+ ".inst 0xa160c2e1 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x23]\n"
+ "add x23, x23, x22\n"
+ ".inst 0xa160c2e2 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x23]\n"
+ "add x23, x23, x22\n"
+ ".inst 0xa160c2e3 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x23]\n"
+ "add x23, x23, x22\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 18f\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xc132e000 // scvtf { z0.s-z3.s }, { z0.s-z3.s }\n"
+ ".inst 0xc132e084 // scvtf { z4.s-z7.s }, { z4.s-z7.s }\n"
+ ".inst 0xc132e108 // scvtf { z8.s-z11.s }, { z8.s-z11.s }\n"
+ ".inst 0xc132e18c // scvtf { z12.s-z15.s }, { z12.s-z15.s }\n"
+ "fmad z0.s, p0/M, z18.s, z20.s\n"
+ "fmad z1.s, p0/M, z18.s, z20.s\n"
+ "fmad z2.s, p0/M, z18.s, z20.s\n"
+ "fmad z3.s, p0/M, z18.s, z20.s\n"
+ "subs x20, x20, #0x1\n"
+ "fmad z4.s, p0/M, z18.s, z21.s\n"
+ "fmad z5.s, p0/M, z18.s, z21.s\n"
+ "fmad z6.s, p0/M, z18.s, z21.s\n"
+ "fmad z7.s, p0/M, z18.s, z21.s\n"
+ "fmad z8.s, p0/M, z18.s, z22.s\n"
+ "fmad z9.s, p0/M, z18.s, z22.s\n"
+ "fmad z10.s, p0/M, z18.s, z22.s\n"
+ "fmad z11.s, p0/M, z18.s, z22.s\n"
+ "fmad z12.s, p0/M, z18.s, z23.s\n"
+ "fmad z13.s, p0/M, z18.s, z23.s\n"
+ "fmad z14.s, p0/M, z18.s, z23.s\n"
+ "fmad z15.s, p0/M, z18.s, z23.s\n"
+ ".inst 0xc1b0ca20 // fclamp { z0.s-z3.s }, z17.s, z16.s\n"
+ ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+ ".inst 0xc1b0ca28 // fclamp { z8.s-z11.s }, z17.s, z16.s\n"
+ ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+ ".inst 0xa160c2e0 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x23]\n"
+ "add x23, x23, x22\n"
+ "beq 18f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa160c2e1 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x23]\n"
+ "add x23, x23, x22\n"
+ "beq 18f\n"
+ ".inst 0xa160c2e2 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x23]\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "19:" // Store to output array: End
+ "tbz x13, #0, 21f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "20:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xa041c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa042c560 // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa043c568 // ld1w { z8.s-z11.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ "addvl x11, x11, #16\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "blt 20b\n"
+ "21:" // End block
+ "incw x27, ALL, MUL #4\n"
+ "cmp x27, x26\n"
+ "blt 3b\n"
+ "incw x28\n"
+ "mov x27, #0x0\n"
+ "cmp x28, x9\n"
+ "mov x25, x24\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [dq] "r" (&dq), [offset_DequantizeFloat_scale] "I" (offsetof(DequantizeFloat, scale)), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_late_bias] "I" (offsetof(KernelArgs, late_bias)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
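
Note: the float store path above converts each int32 accumulator with SCVTF, then applies FMAD with the DequantizeFloat scale (z18) and the per-column late bias (z20-z23, zero when late_bias is null), and finally FCLAMPs against the KernelArgs min/max derived from the Activation. A scalar sketch of one lane, under that same reading:

    #include <algorithm>
    #include <cstdint>

    // Scalar model of SCVTF -> FMAD(scale, late_bias) -> FCLAMP as used above.
    inline float dequantize_lane(int32_t acc, float scale, float late_bias,
                                 float min, float max)
    {
        const float v = static_cast<float>(acc) * scale + late_bias; // scvtf + fmad
        return std::clamp(v, min, max);                              // fclamp
    }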
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..df2c9c0ca3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, float *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const DequantizeFloat &dq, const float *const late_bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, float *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const DequantizeFloat &dq, const float *const late_bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL;
+
+ StdTransformsSME<operand_type, result_type, 2, 2, 4> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
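
Note: three blocking shapes of the same s8-to-fp32 kernel now exist (1VLx4VL, 2VLx2VL, 4VLx1VL), differing only in how the four 32-bit ZA tiles are apportioned between rows and columns. A hypothetical sizing comparison (not part of the patch) illustrates why the shape matters for edge utilisation:

    // Hypothetical: ZA tile-grid invocations needed to cover an M x N output, for
    // a blocking mh vectors tall by nw vectors wide, with L int32 lanes per vector.
    constexpr unsigned tiles_needed(unsigned M, unsigned N,
                                    unsigned mh, unsigned nw, unsigned L)
    {
        const unsigned th = mh * L, tw = nw * L;              // tile height / width
        return ((M + th - 1) / th) * ((N + tw - 1) / tw);     // ceil-div product
    }
    // With L = 16 (512-bit SVE): tiles_needed(16, 256, 1, 4, 16) == 4, whereas
    // tiles_needed(16, 256, 2, 2, 16) == 8 -- the wide shape wastes less on short M.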
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..1631fae8e9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, float *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const DequantizeFloat &dq, const float *const late_bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias, const float *const late_bias, const Activation act,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias), late_bias(late_bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+
+ // Initialise the activation clamp bounds (BoundedReLU sets max, then falls through to set min)
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const int32_t *const bias;
+ const float *const late_bias;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, late_bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xa041c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840681 // mova za1h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ ".inst 0x25a94550 // whilelt pn8.s, x10, x9, VLx2\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ ".inst 0xa10a4286 // ld1w { z6.s, z14.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc09000c0 // addha za0.s, p0/M, p0/M, z6.s\n"
+ ".inst 0xc09001c1 // addha za1.s, p0/M, p0/M, z14.s\n"
+ ".inst 0xc09000c2 // addha za2.s, p0/M, p0/M, z6.s\n"
+ ".inst 0xc09001c3 // addha za3.s, p0/M, p0/M, z14.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20, ALL, MUL #2\n"
+ "incw x21, ALL, MUL #2\n"
+ "cmp x20, x9\n"
+ "mov x20, x16\n"
+ "csel x21, x11, x21, LT\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "add x20, x20, #0x3\n"
+ "lsr x20, x20, #0x2\n"
+ "lsr x21, x20, #0x2\n"
+ "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
+ "and x20, x20, #0x3\n"
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa1400775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa04006f2 // ld1b { z18.b-z19.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa041076a // ld1b { z10.b-z11.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa14106e5 // ld1b { z5.b, z13.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa1420767 // ld1b { z7.b, z15.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f0 // ld1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1430774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa14306f7 // ld1b { z23.b, z31.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa09202a0 // smopa za0.s, p0/M, p0/M, z21.b, z18.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa09302a1 // smopa za1.s, p0/M, p0/M, z21.b, z19.b\n"
+ ".inst 0xa09203a2 // smopa za2.s, p0/M, p0/M, z29.b, z18.b\n"
+ ".inst 0xa09303a3 // smopa za3.s, p0/M, p0/M, z29.b, z19.b\n"
+ ".inst 0xa1400775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa0850140 // smopa za0.s, p0/M, p0/M, z10.b, z5.b\n"
+ ".inst 0xa04006f2 // ld1b { z18.b-z19.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa08d0141 // smopa za1.s, p0/M, p0/M, z10.b, z13.b\n"
+ ".inst 0xa0850162 // smopa za2.s, p0/M, p0/M, z11.b, z5.b\n"
+ ".inst 0xa08d0163 // smopa za3.s, p0/M, p0/M, z11.b, z13.b\n"
+ ".inst 0xa041076a // ld1b { z10.b-z11.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa09000e0 // smopa za0.s, p0/M, p0/M, z7.b, z16.b\n"
+ ".inst 0xa14106e5 // ld1b { z5.b, z13.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa09800e1 // smopa za1.s, p0/M, p0/M, z7.b, z24.b\n"
+ ".inst 0xa09001e2 // smopa za2.s, p0/M, p0/M, z15.b, z16.b\n"
+ ".inst 0xa09801e3 // smopa za3.s, p0/M, p0/M, z15.b, z24.b\n"
+ ".inst 0xa1420767 // ld1b { z7.b, z15.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f0 // ld1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0970280 // smopa za0.s, p0/M, p0/M, z20.b, z23.b\n"
+ ".inst 0xa09f0281 // smopa za1.s, p0/M, p0/M, z20.b, z31.b\n"
+ ".inst 0xa0970382 // smopa za2.s, p0/M, p0/M, z28.b, z23.b\n"
+ ".inst 0xa09f0383 // smopa za3.s, p0/M, p0/M, z28.b, z31.b\n"
+ ".inst 0xa1430774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa14306f7 // ld1b { z23.b, z31.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa09202a0 // smopa za0.s, p0/M, p0/M, z21.b, z18.b\n"
+ ".inst 0xa09302a1 // smopa za1.s, p0/M, p0/M, z21.b, z19.b\n"
+ ".inst 0xa09203a2 // smopa za2.s, p0/M, p0/M, z29.b, z18.b\n"
+ ".inst 0xa09303a3 // smopa za3.s, p0/M, p0/M, z29.b, z19.b\n"
+ ".inst 0xa0850140 // smopa za0.s, p0/M, p0/M, z10.b, z5.b\n"
+ ".inst 0xa08d0141 // smopa za1.s, p0/M, p0/M, z10.b, z13.b\n"
+ ".inst 0xa0850162 // smopa za2.s, p0/M, p0/M, z11.b, z5.b\n"
+ ".inst 0xa08d0163 // smopa za3.s, p0/M, p0/M, z11.b, z13.b\n"
+ ".inst 0xa09000e0 // smopa za0.s, p0/M, p0/M, z7.b, z16.b\n"
+ ".inst 0xa09800e1 // smopa za1.s, p0/M, p0/M, z7.b, z24.b\n"
+ ".inst 0xa09001e2 // smopa za2.s, p0/M, p0/M, z15.b, z16.b\n"
+ ".inst 0xa09801e3 // smopa za3.s, p0/M, p0/M, z15.b, z24.b\n"
+ ".inst 0xa0970280 // smopa za0.s, p0/M, p0/M, z20.b, z23.b\n"
+ ".inst 0xa09f0281 // smopa za1.s, p0/M, p0/M, z20.b, z31.b\n"
+ ".inst 0xa0970382 // smopa za2.s, p0/M, p0/M, z28.b, z23.b\n"
+ ".inst 0xa09f0383 // smopa za3.s, p0/M, p0/M, z28.b, z31.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa040077e // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0xa14006e7 // ld1b { z7.b, z15.b }, pn9.b/Z, [x23]\n"
+ "addvl x23, x23, #2\n"
+ ".inst 0xa08703c0 // smopa za0.s, p0/M, p0/M, z30.b, z7.b\n"
+ ".inst 0xa08f03c1 // smopa za1.s, p0/M, p0/M, z30.b, z15.b\n"
+ ".inst 0xa08703e2 // smopa za2.s, p0/M, p0/M, z31.b, z7.b\n"
+ ".inst 0xa08f03e3 // smopa za3.s, p0/M, p0/M, z31.b, z15.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa060c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa062c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ "cmp x12, x20\n"
+ ".inst 0xa063c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 24f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa061c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 24f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "sub x25, x13, x11\n"
+ "ld1rw { z3.s }, p0/Z, [%x[dq], %[offset_DequantizeFloat_scale]]\n"
+ "fmov z2.s, #0x0\n"
+ "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
+ "fmov z10.s, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_late_bias]]\n"
+ "add x26, x26, x10, LSL #2\n" // C += n
+ "madd x26, x11, x24, x26\n" // C += m * ldc
+ "cbz x20, 15f\n"
+ "add x20, x20, x10, LSL #2\n"
+ ".inst 0xa1404282 // ld1w { z2.s, z10.s }, p8/Z, [x20]\n"
+ "15:" // Store to output array: no late bias
+ "cntw x23\n"
+ "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "mov x12, #0x0\n"
+ "cmp x25, x23\n"
+ "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xc132e084 // scvtf { z4.s-z7.s }, { z4.s-z7.s }\n"
+ ".inst 0xc132e18c // scvtf { z12.s-z15.s }, { z12.s-z15.s }\n"
+ "fmad z4.s, p0/M, z3.s, z2.s\n"
+ "fmad z5.s, p0/M, z3.s, z2.s\n"
+ "add x12, x12, #0x4\n"
+ "fmad z6.s, p0/M, z3.s, z2.s\n"
+ "fmad z7.s, p0/M, z3.s, z2.s\n"
+ "cmp x12, x21, LSL #2\n"
+ "fmad z12.s, p0/M, z3.s, z10.s\n"
+ "fmad z13.s, p0/M, z3.s, z10.s\n"
+ "fmad z14.s, p0/M, z3.s, z10.s\n"
+ "fmad z15.s, p0/M, z3.s, z10.s\n"
+ ".inst 0xc1a0c824 // fclamp { z4.s-z7.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c82c // fclamp { z12.s-z15.s }, z1.s, z0.s\n"
+ ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 18f\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xc132e210 // scvtf { z16.s-z19.s }, { z16.s-z19.s }\n"
+ ".inst 0xc132e318 // scvtf { z24.s-z27.s }, { z24.s-z27.s }\n"
+ "fmad z16.s, p0/M, z3.s, z2.s\n"
+ "fmad z17.s, p0/M, z3.s, z2.s\n"
+ "subs x20, x20, #0x1\n"
+ "fmad z18.s, p0/M, z3.s, z2.s\n"
+ "fmad z19.s, p0/M, z3.s, z2.s\n"
+ "fmad z24.s, p0/M, z3.s, z10.s\n"
+ "fmad z25.s, p0/M, z3.s, z10.s\n"
+ "fmad z26.s, p0/M, z3.s, z10.s\n"
+ "fmad z27.s, p0/M, z3.s, z10.s\n"
+ ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 18f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 18f\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 22f\n"
+ "cmp x25, x23\n"
+ "mov x12, #0x0\n"
+ "csel x20, x25, x23, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc132e294 // scvtf { z20.s-z23.s }, { z20.s-z23.s }\n"
+ ".inst 0xc132e39c // scvtf { z28.s-z31.s }, { z28.s-z31.s }\n"
+ "fmad z20.s, p0/M, z3.s, z2.s\n"
+ "fmad z21.s, p0/M, z3.s, z2.s\n"
+ "add x12, x12, #0x4\n"
+ "fmad z22.s, p0/M, z3.s, z2.s\n"
+ "fmad z23.s, p0/M, z3.s, z2.s\n"
+ "cmp x12, x21, LSL #2\n"
+ "fmad z28.s, p0/M, z3.s, z10.s\n"
+ "fmad z29.s, p0/M, z3.s, z10.s\n"
+ "fmad z30.s, p0/M, z3.s, z10.s\n"
+ "fmad z31.s, p0/M, z3.s, z10.s\n"
+ ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+ ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xa1604357 // st1w { z23.s, z31.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 21f\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xc132e084 // scvtf { z4.s-z7.s }, { z4.s-z7.s }\n"
+ ".inst 0xc132e18c // scvtf { z12.s-z15.s }, { z12.s-z15.s }\n"
+ "fmad z4.s, p0/M, z3.s, z2.s\n"
+ "fmad z5.s, p0/M, z3.s, z2.s\n"
+ "subs x20, x20, #0x1\n"
+ "fmad z6.s, p0/M, z3.s, z2.s\n"
+ "fmad z7.s, p0/M, z3.s, z2.s\n"
+ "fmad z12.s, p0/M, z3.s, z10.s\n"
+ "fmad z13.s, p0/M, z3.s, z10.s\n"
+ "fmad z14.s, p0/M, z3.s, z10.s\n"
+ "fmad z15.s, p0/M, z3.s, z10.s\n"
+ ".inst 0xc1a0c824 // fclamp { z4.s-z7.s }, z1.s, z0.s\n"
+ ".inst 0xc1a0c82c // fclamp { z12.s-z15.s }, z1.s, z0.s\n"
+ ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 21f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 21f\n"
+ ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "22:" // Store to output array: End
+ "tbz x16, #0, 24f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "23:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xa042c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "blt 23b\n"
+ "24:" // End block
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #2\n"
+ "mov x10, #0x0\n"
+ "cmp x11, x13\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [dq] "r" (&dq), [offset_DequantizeFloat_scale] "I" (offsetof(DequantizeFloat, scale)), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_late_bias] "I" (offsetof(KernelArgs, late_bias)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..70952f4f03
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, float *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const DequantizeFloat &dq, const float *const late_bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, float *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const DequantizeFloat &dq, const float *const late_bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 1;
+ }
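+ // For example, with a 512-bit streaming vector length,
+ // sme::get_vector_length<int32_t>() is 16, so this kernel emits
+ // 64-row by 16-column output tiles.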
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL;
+
+ StdTransformsSME<operand_type, result_type, 4, 1, 4> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL(const CPUInfo *)
+ {
+ }
+};
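+
+// A minimal invocation sketch (hypothetical: real call sites reach this kernel
+// through the arm_gemm strategy framework, which packs A and B into the
+// interleaved layouts the assembly expects; "ci" stands in for a CPUInfo*):
+//   cls_sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL strat(ci);
+//   strat.kernel(packed_A, packed_B, C, ldc, M, N, K, bias, dq, late_bias,
+//                act, /*accumulate=*/false, /*accumulator_buffer=*/nullptr);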
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..bafb16bca8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, float *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const DequantizeFloat &dq, const float *const late_bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias, const float *const late_bias, const Activation act,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias), late_bias(late_bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+
+ // Initialise the activation clamping values (min/max)
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
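+ // BoundedReLU sets the upper bound and then falls through to pick up ReLU's
+ // lower bound of zero; plain ReLU leaves max at +infinity.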
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const int32_t *const bias;
+ const float *const late_bias;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
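+ // kstride_bytes is the stride between packed B panels: K rounded up to the
+ // MOPA unroll of 4, in bytes.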
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, late_bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xa041c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xa042c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ "whilelt p0.s, x10, x9\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ "ld1w { z23.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc09026e0 // addha za0.s, p1/M, p1/M, z23.s\n"
+ ".inst 0xc09026e1 // addha za1.s, p1/M, p1/M, z23.s\n"
+ ".inst 0xc09026e2 // addha za2.s, p1/M, p1/M, z23.s\n"
+ ".inst 0xc09026e3 // addha za3.s, p1/M, p1/M, z23.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20\n"
+ "incw x21, ALL, MUL #4\n"
+ "cmp x20, x9\n"
+ "mov x20, x16\n"
+ "csel x21, x11, x21, LT\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "add x20, x20, #0x3\n"
+ "lsr x20, x20, #0x2\n"
+ "lsr x21, x20, #0x2\n"
+ "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes
+ "and x20, x20, #0x3\n"
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa0408378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
+ "ld1b { z4.b }, p1/Z, [x23]\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ld1b { z2.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa042836c // ld1b { z12.b-z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ld1b { z11.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa0438370 // ld1b { z16.b-z19.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ld1b { z28.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa0842700 // smopa za0.s, p1/M, p1/M, z24.b, z4.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa0842721 // smopa za1.s, p1/M, p1/M, z25.b, z4.b\n"
+ ".inst 0xa0842742 // smopa za2.s, p1/M, p1/M, z26.b, z4.b\n"
+ ".inst 0xa0842763 // smopa za3.s, p1/M, p1/M, z27.b, z4.b\n"
+ ".inst 0xa0408378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0822680 // smopa za0.s, p1/M, p1/M, z20.b, z2.b\n"
+ "ld1b { z4.b }, p1/Z, [x23]\n"
+ ".inst 0xa08226a1 // smopa za1.s, p1/M, p1/M, z21.b, z2.b\n"
+ ".inst 0xa08226c2 // smopa za2.s, p1/M, p1/M, z22.b, z2.b\n"
+ ".inst 0xa08226e3 // smopa za3.s, p1/M, p1/M, z23.b, z2.b\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa08b2580 // smopa za0.s, p1/M, p1/M, z12.b, z11.b\n"
+ "ld1b { z2.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa08b25a1 // smopa za1.s, p1/M, p1/M, z13.b, z11.b\n"
+ ".inst 0xa08b25c2 // smopa za2.s, p1/M, p1/M, z14.b, z11.b\n"
+ ".inst 0xa08b25e3 // smopa za3.s, p1/M, p1/M, z15.b, z11.b\n"
+ ".inst 0xa042836c // ld1b { z12.b-z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ld1b { z11.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa09c2600 // smopa za0.s, p1/M, p1/M, z16.b, z28.b\n"
+ ".inst 0xa09c2621 // smopa za1.s, p1/M, p1/M, z17.b, z28.b\n"
+ ".inst 0xa09c2642 // smopa za2.s, p1/M, p1/M, z18.b, z28.b\n"
+ ".inst 0xa09c2663 // smopa za3.s, p1/M, p1/M, z19.b, z28.b\n"
+ ".inst 0xa0438370 // ld1b { z16.b-z19.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ld1b { z28.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa0842700 // smopa za0.s, p1/M, p1/M, z24.b, z4.b\n"
+ ".inst 0xa0842721 // smopa za1.s, p1/M, p1/M, z25.b, z4.b\n"
+ ".inst 0xa0842742 // smopa za2.s, p1/M, p1/M, z26.b, z4.b\n"
+ ".inst 0xa0842763 // smopa za3.s, p1/M, p1/M, z27.b, z4.b\n"
+ ".inst 0xa0822680 // smopa za0.s, p1/M, p1/M, z20.b, z2.b\n"
+ ".inst 0xa08226a1 // smopa za1.s, p1/M, p1/M, z21.b, z2.b\n"
+ ".inst 0xa08226c2 // smopa za2.s, p1/M, p1/M, z22.b, z2.b\n"
+ ".inst 0xa08226e3 // smopa za3.s, p1/M, p1/M, z23.b, z2.b\n"
+ ".inst 0xa08b2580 // smopa za0.s, p1/M, p1/M, z12.b, z11.b\n"
+ ".inst 0xa08b25a1 // smopa za1.s, p1/M, p1/M, z13.b, z11.b\n"
+ ".inst 0xa08b25c2 // smopa za2.s, p1/M, p1/M, z14.b, z11.b\n"
+ ".inst 0xa08b25e3 // smopa za3.s, p1/M, p1/M, z15.b, z11.b\n"
+ ".inst 0xa09c2600 // smopa za0.s, p1/M, p1/M, z16.b, z28.b\n"
+ ".inst 0xa09c2621 // smopa za1.s, p1/M, p1/M, z17.b, z28.b\n"
+ ".inst 0xa09c2642 // smopa za2.s, p1/M, p1/M, z18.b, z28.b\n"
+ ".inst 0xa09c2663 // smopa za3.s, p1/M, p1/M, z19.b, z28.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1408373 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x27, x27, #4\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "addvl x23, x23, #1\n"
+ ".inst 0xa0902660 // smopa za0.s, p1/M, p1/M, z19.b, z16.b\n"
+ ".inst 0xa09026e1 // smopa za1.s, p1/M, p1/M, z23.b, z16.b\n"
+ ".inst 0xa0902762 // smopa za2.s, p1/M, p1/M, z27.b, z16.b\n"
+ ".inst 0xa09027e3 // smopa za3.s, p1/M, p1/M, z31.b, z16.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa061c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa062c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ "cmp x12, x20\n"
+ ".inst 0xa063c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 30f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa060c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa061c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 30f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "sub x25, x13, x11\n"
+ "ld1rw { z23.s }, p1/Z, [%x[dq], %[offset_DequantizeFloat_scale]]\n"
+ "fmov z22.s, #0x0\n"
+ "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
+ "ldr x20, [%x[args], %[offsetof_late_bias]]\n"
+ "add x26, x26, x10, LSL #2\n" // C += n
+ "madd x26, x11, x24, x26\n" // C += m * ldc
+ "cbz x20, 15f\n"
+ "add x20, x20, x10, LSL #2\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "15:" // Store to output array: no late bias
+ "cntw x23\n"
+ "ld1rw { z21.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "mov x12, #0x0\n"
+ "cmp x25, x23\n"
+ "ld1rw { z20.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc132e000 // scvtf { z0.s-z3.s }, { z0.s-z3.s }\n"
+ "cmp x12, x21, LSL #2\n"
+ "fmad z0.s, p1/M, z23.s, z22.s\n"
+ "fmad z1.s, p1/M, z23.s, z22.s\n"
+ "fmad z2.s, p1/M, z23.s, z22.s\n"
+ "fmad z3.s, p1/M, z23.s, z22.s\n"
+ ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+ "st1w { z0.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z1.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z2.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z3.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 18f\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc132e210 // scvtf { z16.s-z19.s }, { z16.s-z19.s }\n"
+ "fmad z16.s, p1/M, z23.s, z22.s\n"
+ "fmad z17.s, p1/M, z23.s, z22.s\n"
+ "fmad z18.s, p1/M, z23.s, z22.s\n"
+ "fmad z19.s, p1/M, z23.s, z22.s\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 18f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 18f\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 28f\n"
+ "cmp x25, x23\n"
+ "mov x12, #0x0\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc132e210 // scvtf { z16.s-z19.s }, { z16.s-z19.s }\n"
+ "cmp x12, x21, LSL #2\n"
+ "fmad z16.s, p1/M, z23.s, z22.s\n"
+ "fmad z17.s, p1/M, z23.s, z22.s\n"
+ "fmad z18.s, p1/M, z23.s, z22.s\n"
+ "fmad z19.s, p1/M, z23.s, z22.s\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z19.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 21f\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc132e39c // scvtf { z28.s-z31.s }, { z28.s-z31.s }\n"
+ "fmad z28.s, p1/M, z23.s, z22.s\n"
+ "fmad z29.s, p1/M, z23.s, z22.s\n"
+ "fmad z30.s, p1/M, z23.s, z22.s\n"
+ "fmad z31.s, p1/M, z23.s, z22.s\n"
+ ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1w { z28.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 21f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z29.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 21f\n"
+ "st1w { z30.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 28f\n"
+ "cmp x25, x23\n"
+ "mov x12, #0x0\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 23f\n"
+ "22:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc132e18c // scvtf { z12.s-z15.s }, { z12.s-z15.s }\n"
+ "cmp x12, x21, LSL #2\n"
+ "fmad z12.s, p1/M, z23.s, z22.s\n"
+ "fmad z13.s, p1/M, z23.s, z22.s\n"
+ "fmad z14.s, p1/M, z23.s, z22.s\n"
+ "fmad z15.s, p1/M, z23.s, z22.s\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ "st1w { z12.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z13.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z14.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z15.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 22b\n"
+ "23:" // Store to output array: Accumulator row 2 oddments
+ "cbz x20, 24f\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc132e210 // scvtf { z16.s-z19.s }, { z16.s-z19.s }\n"
+ "fmad z16.s, p1/M, z23.s, z22.s\n"
+ "fmad z17.s, p1/M, z23.s, z22.s\n"
+ "fmad z18.s, p1/M, z23.s, z22.s\n"
+ "fmad z19.s, p1/M, z23.s, z22.s\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 24f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 24f\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "24:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 28f\n"
+ "cmp x25, x23\n"
+ "mov x12, #0x0\n"
+ "csel x20, x25, x23, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 26f\n"
+ "25:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc132e318 // scvtf { z24.s-z27.s }, { z24.s-z27.s }\n"
+ "cmp x12, x21, LSL #2\n"
+ "fmad z24.s, p1/M, z23.s, z22.s\n"
+ "fmad z25.s, p1/M, z23.s, z22.s\n"
+ "fmad z26.s, p1/M, z23.s, z22.s\n"
+ "fmad z27.s, p1/M, z23.s, z22.s\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ "st1w { z24.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1w { z27.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 25b\n"
+ "26:" // Store to output array: Accumulator row 3 oddments
+ "cbz x20, 27f\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc132e210 // scvtf { z16.s-z19.s }, { z16.s-z19.s }\n"
+ "fmad z16.s, p1/M, z23.s, z22.s\n"
+ "fmad z17.s, p1/M, z23.s, z22.s\n"
+ "fmad z18.s, p1/M, z23.s, z22.s\n"
+ "fmad z19.s, p1/M, z23.s, z22.s\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 27f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 27f\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "27:" // Store to output array: Accumulator row 3 oddments: End
+ "28:" // Store to output array: End
+ "tbz x16, #0, 30f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "29:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xa041c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xa042c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "blt 29b\n"
+ "30:" // End block
+ "incw x10\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #4\n"
+ "mov x10, #0x0\n"
+ "cmp x11, x13\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [dq] "r" (&dq), [offset_DequantizeFloat_scale] "I" (offsetof(DequantizeFloat, scale)), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_late_bias] "I" (offsetof(KernelArgs, late_bias)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..b9d8b60c8d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 1;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
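+ // No fused activation here: the kernel emits raw int32 accumulators, so any
+ // activation/requantization is expected to be applied by a later stage.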
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 4> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..d11faa634d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int32_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int32_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ int32_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+
+ const int32_t *const bias;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
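+ // Note: n_loops and n_tail_iters are computed above but never referenced by
+ // the assembly, which re-derives its K blocking from K directly (label 5).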
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x13, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x10, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x13, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x11, x11, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w9, [%x[args], %[offsetof_M]]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldr w26, [%x[args], %[offsetof_N]]\n"
+ "ldr x25, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x24, x25\n"
+ ".inst 0x25ba6770 // whilelt pn8.s, x27, x26, VLx4\n"
+ "tbnz x13, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ ".inst 0xa11bc29b // ldnt1w { z19.s, z23.s, z27.s, z31.s }, p8/Z, [x20, x27, LSL #2]\n"
+ ".inst 0xc0900260 // addha za0.s, p0/M, p0/M, z19.s\n"
+ ".inst 0xc09002e1 // addha za1.s, p0/M, p0/M, z23.s\n"
+ ".inst 0xc0900362 // addha za2.s, p0/M, p0/M, z27.s\n"
+ ".inst 0xc09003e3 // addha za3.s, p0/M, p0/M, z31.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x27\n"
+ "mov x21, x28\n"
+ "incw x20, ALL, MUL #4\n"
+ "incw x21\n"
+ "cmp x20, x26\n"
+ "csel x21, x28, x21, LT\n"
+ "mov x20, x13\n"
+ "bfm x13, XZR, #0x0, #0x0 // bfc x13, #0x0, #0x1\n"
+ "cmp x21, x9\n"
+ "csel x13, x20, x13, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "add x20, x20, #0x3\n"
+ "lsr x20, x20, #0x2\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x27, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ "ld1b { z30.b }, p0/Z, [x24]\n"
+ ".inst 0xa04086e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23]\n"
+ "ld1b { z21.b }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1b { z28.b }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1b { z11.b }, p0/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa08003c0 // smopa za0.s, p0/M, p0/M, z30.b, z0.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa08103c1 // smopa za1.s, p0/M, p0/M, z30.b, z1.b\n"
+ ".inst 0xa08203c2 // smopa za2.s, p0/M, p0/M, z30.b, z2.b\n"
+ ".inst 0xa08303c3 // smopa za3.s, p0/M, p0/M, z30.b, z3.b\n"
+ "ld1b { z30.b }, p0/Z, [x24]\n"
+ ".inst 0xa09802a0 // smopa za0.s, p0/M, p0/M, z21.b, z24.b\n"
+ ".inst 0xa04086e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa09902a1 // smopa za1.s, p0/M, p0/M, z21.b, z25.b\n"
+ ".inst 0xa09a02a2 // smopa za2.s, p0/M, p0/M, z21.b, z26.b\n"
+ ".inst 0xa09b02a3 // smopa za3.s, p0/M, p0/M, z21.b, z27.b\n"
+ "ld1b { z21.b }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa0840380 // smopa za0.s, p0/M, p0/M, z28.b, z4.b\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0850381 // smopa za1.s, p0/M, p0/M, z28.b, z5.b\n"
+ ".inst 0xa0860382 // smopa za2.s, p0/M, p0/M, z28.b, z6.b\n"
+ ".inst 0xa0870383 // smopa za3.s, p0/M, p0/M, z28.b, z7.b\n"
+ "ld1b { z28.b }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xa0900160 // smopa za0.s, p0/M, p0/M, z11.b, z16.b\n"
+ ".inst 0xa0910161 // smopa za1.s, p0/M, p0/M, z11.b, z17.b\n"
+ ".inst 0xa0920162 // smopa za2.s, p0/M, p0/M, z11.b, z18.b\n"
+ ".inst 0xa0930163 // smopa za3.s, p0/M, p0/M, z11.b, z19.b\n"
+ "ld1b { z11.b }, p0/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa08003c0 // smopa za0.s, p0/M, p0/M, z30.b, z0.b\n"
+ ".inst 0xa08103c1 // smopa za1.s, p0/M, p0/M, z30.b, z1.b\n"
+ ".inst 0xa08203c2 // smopa za2.s, p0/M, p0/M, z30.b, z2.b\n"
+ ".inst 0xa08303c3 // smopa za3.s, p0/M, p0/M, z30.b, z3.b\n"
+ ".inst 0xa09802a0 // smopa za0.s, p0/M, p0/M, z21.b, z24.b\n"
+ ".inst 0xa09902a1 // smopa za1.s, p0/M, p0/M, z21.b, z25.b\n"
+ ".inst 0xa09a02a2 // smopa za2.s, p0/M, p0/M, z21.b, z26.b\n"
+ ".inst 0xa09b02a3 // smopa za3.s, p0/M, p0/M, z21.b, z27.b\n"
+ ".inst 0xa0840380 // smopa za0.s, p0/M, p0/M, z28.b, z4.b\n"
+ ".inst 0xa0850381 // smopa za1.s, p0/M, p0/M, z28.b, z5.b\n"
+ ".inst 0xa0860382 // smopa za2.s, p0/M, p0/M, z28.b, z6.b\n"
+ ".inst 0xa0870383 // smopa za3.s, p0/M, p0/M, z28.b, z7.b\n"
+ ".inst 0xa0900160 // smopa za0.s, p0/M, p0/M, z11.b, z16.b\n"
+ ".inst 0xa0910161 // smopa za1.s, p0/M, p0/M, z11.b, z17.b\n"
+ ".inst 0xa0920162 // smopa za2.s, p0/M, p0/M, z11.b, z18.b\n"
+ ".inst 0xa0930163 // smopa za3.s, p0/M, p0/M, z11.b, z19.b\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1b { z22.b }, p0/Z, [x24]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x24, x24, #1\n"
+ ".inst 0xa14086f1 // ld1b { z17.b, z21.b, z25.b, z29.b }, pn9.b/Z, [x23]\n"
+ "addvl x23, x23, #4\n"
+ ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
+ ".inst 0xa09502c1 // smopa za1.s, p0/M, p0/M, z22.b, z21.b\n"
+ ".inst 0xa09902c2 // smopa za2.s, p0/M, p0/M, z22.b, z25.b\n"
+ ".inst 0xa09d02c3 // smopa za3.s, p0/M, p0/M, z22.b, z29.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x13, #1, 14f\n"
+ "tbz x13, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c560 // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c544 // st1w { z4.s-z7.s }, pn9.b, [x10]\n"
+ "addvl x11, x11, #16\n"
+ ".inst 0xa061c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ ".inst 0xa062c55c // st1w { z28.s-z31.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c550 // st1w { z16.s-z19.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 11b\n"
+ "b 20f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa060c54c // st1w { z12.s-z15.s }, pn9.b, [x10]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa061c544 // st1w { z4.s-z7.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c540 // st1w { z0.s-z3.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c558 // st1w { z24.s-z27.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 13b\n"
+ "b 20f\n"
+ "14:" // Store to output array
+ "ldr x23, [%x[args], %[offsetof_C]]\n"
+ "sub x21, x9, x28\n"
+ "cntw x20\n"
+ "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
+ "cmp x21, x20\n"
+ "csel x20, x21, x20, LT\n"
+ "add x23, x23, x27, LSL #2\n" // C += n
+ "lsr x21, x20, #0x2\n"
+ "madd x23, x28, x22, x23\n" // C += m * ldc
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 16f\n"
+ "15:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c2e0 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x23]\n"
+ "add x23, x23, x22\n"
+ ".inst 0xa160c2e1 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x23]\n"
+ "add x23, x23, x22\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c2e2 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x23]\n"
+ "add x23, x23, x22\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa160c2e3 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x23]\n"
+ "add x23, x23, x22\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa160c2f0 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x23]\n"
+ "add x23, x23, x22\n"
+ "beq 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa160c2f1 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x23]\n"
+ "add x23, x23, x22\n"
+ "beq 17f\n"
+ ".inst 0xa160c2f2 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x23]\n"
+ "17:" // Store to output array: Accumulator row 0 oddments: End
+ "18:" // Store to output array: End
+ "tbz x13, #0, 20f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "19:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c568 // ld1w { z8.s-z11.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c568 // ld1w { z8.s-z11.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x11, x11, #16\n"
+ "blt 19b\n"
+ "20:" // End block
+ "incw x27, ALL, MUL #4\n"
+ "cmp x27, x26\n"
+ "blt 3b\n"
+ "incw x28\n"
+ "cmp x28, x9\n"
+ "mov x27, #0x0\n"
+ "mov x25, x24\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..f05d2cf215
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
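+
+  // Note: "2VLx2VL" names the output tile computed per block: two
+  // vector-lengths of rows by two vector-lengths of int32 columns.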
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL;
+
+ StdTransformsSME<operand_type, result_type, 2, 2, 4> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..47de894306
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int32_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int32_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
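+            // The assembly below tests these two bits with tbz/tbnz to decide
+            // whether to fill ZA from the buffer and/or spill it back out.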
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ int32_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+
+ const int32_t *const bias;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ ".inst 0x25a94550 // whilelt pn8.s, x10, x9, VLx2\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ ".inst 0xa00a4295 // ldnt1w { z20.s-z21.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0900280 // addha za0.s, p0/M, p0/M, z20.s\n"
+ ".inst 0xc09002a1 // addha za1.s, p0/M, p0/M, z21.s\n"
+ ".inst 0xc0900282 // addha za2.s, p0/M, p0/M, z20.s\n"
+ ".inst 0xc09002a3 // addha za3.s, p0/M, p0/M, z21.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20, ALL, MUL #2\n"
+ "incw x21, ALL, MUL #2\n"
+ "cmp x20, x9\n"
+ "csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "add x20, x20, #0x3\n"
+ "lsr x20, x20, #0x2\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa040077c // ld1b { z28.b-z29.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa14006e8 // ldnt1b { z0.b, z8.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa0410762 // ld1b { z2.b-z3.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa14106ff // ldnt1b { z23.b, z31.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa042076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa04306f5 // ldnt1b { z20.b-z21.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa0800380 // smopa za0.s, p0/M, p0/M, z28.b, z0.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0880381 // smopa za1.s, p0/M, p0/M, z28.b, z8.b\n"
+ ".inst 0xa08003a2 // smopa za2.s, p0/M, p0/M, z29.b, z0.b\n"
+ ".inst 0xa08803a3 // smopa za3.s, p0/M, p0/M, z29.b, z8.b\n"
+ ".inst 0xa040077c // ld1b { z28.b-z29.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa0970040 // smopa za0.s, p0/M, p0/M, z2.b, z23.b\n"
+ ".inst 0xa14006e8 // ldnt1b { z0.b, z8.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa09f0041 // smopa za1.s, p0/M, p0/M, z2.b, z31.b\n"
+ ".inst 0xa0970062 // smopa za2.s, p0/M, p0/M, z3.b, z23.b\n"
+ ".inst 0xa09f0063 // smopa za3.s, p0/M, p0/M, z3.b, z31.b\n"
+ ".inst 0xa0410762 // ld1b { z2.b-z3.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa09001c0 // smopa za0.s, p0/M, p0/M, z14.b, z16.b\n"
+ ".inst 0xa14106ff // ldnt1b { z23.b, z31.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa09801c1 // smopa za1.s, p0/M, p0/M, z14.b, z24.b\n"
+ ".inst 0xa09001e2 // smopa za2.s, p0/M, p0/M, z15.b, z16.b\n"
+ ".inst 0xa09801e3 // smopa za3.s, p0/M, p0/M, z15.b, z24.b\n"
+ ".inst 0xa042076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0940080 // smopa za0.s, p0/M, p0/M, z4.b, z20.b\n"
+ ".inst 0xa0950081 // smopa za1.s, p0/M, p0/M, z4.b, z21.b\n"
+ ".inst 0xa09400a2 // smopa za2.s, p0/M, p0/M, z5.b, z20.b\n"
+ ".inst 0xa09500a3 // smopa za3.s, p0/M, p0/M, z5.b, z21.b\n"
+ ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa04306f5 // ldnt1b { z20.b-z21.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa0800380 // smopa za0.s, p0/M, p0/M, z28.b, z0.b\n"
+ ".inst 0xa0880381 // smopa za1.s, p0/M, p0/M, z28.b, z8.b\n"
+ ".inst 0xa08003a2 // smopa za2.s, p0/M, p0/M, z29.b, z0.b\n"
+ ".inst 0xa08803a3 // smopa za3.s, p0/M, p0/M, z29.b, z8.b\n"
+ ".inst 0xa0970040 // smopa za0.s, p0/M, p0/M, z2.b, z23.b\n"
+ ".inst 0xa09f0041 // smopa za1.s, p0/M, p0/M, z2.b, z31.b\n"
+ ".inst 0xa0970062 // smopa za2.s, p0/M, p0/M, z3.b, z23.b\n"
+ ".inst 0xa09f0063 // smopa za3.s, p0/M, p0/M, z3.b, z31.b\n"
+ ".inst 0xa09001c0 // smopa za0.s, p0/M, p0/M, z14.b, z16.b\n"
+ ".inst 0xa09801c1 // smopa za1.s, p0/M, p0/M, z14.b, z24.b\n"
+ ".inst 0xa09001e2 // smopa za2.s, p0/M, p0/M, z15.b, z16.b\n"
+ ".inst 0xa09801e3 // smopa za3.s, p0/M, p0/M, z15.b, z24.b\n"
+ ".inst 0xa0940080 // smopa za0.s, p0/M, p0/M, z4.b, z20.b\n"
+ ".inst 0xa0950081 // smopa za1.s, p0/M, p0/M, z4.b, z21.b\n"
+ ".inst 0xa09400a2 // smopa za2.s, p0/M, p0/M, z5.b, z20.b\n"
+ ".inst 0xa09500a3 // smopa za3.s, p0/M, p0/M, z5.b, z21.b\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1400774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0xa14006e7 // ld1b { z7.b, z15.b }, pn9.b/Z, [x23]\n"
+ "addvl x23, x23, #2\n"
+ ".inst 0xa0870280 // smopa za0.s, p0/M, p0/M, z20.b, z7.b\n"
+ ".inst 0xa08f0281 // smopa za1.s, p0/M, p0/M, z20.b, z15.b\n"
+ ".inst 0xa0870382 // smopa za2.s, p0/M, p0/M, z28.b, z7.b\n"
+ ".inst 0xa08f0383 // smopa za3.s, p0/M, p0/M, z28.b, z15.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 23f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 23f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "sub x25, x13, x11\n"
+ "cntw x24\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "cmp x25, x24\n"
+ "csel x22, x25, x24, LT\n"
+ "add x26, x26, x10, LSL #2\n" // C += n
+ "lsr x21, x22, #0x2\n"
+ "madd x26, x11, x23, x26\n" // C += m * ldc
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 16f\n"
+ "15:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 17f\n"
+ ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ "17:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 21f\n"
+ "cmp x25, x24\n"
+ "csel x20, x25, x24, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 19f\n"
+ "18:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xa1604353 // st1w { z19.s, z27.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 20f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 20f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 20f\n"
+ ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n"
+ "20:" // Store to output array: Accumulator row 1 oddments: End
+ "21:" // Store to output array: End
+ "tbz x16, #0, 23f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "22:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa042c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 22b\n"
+ "23:" // End block
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #2\n"
+ "cmp x11, x13\n"
+ "mov x10, #0x0\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..ce10ab30e7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 1;
+ }
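+
+  // Note: this variant computes a tall 4VLx1VL tile: four vector-lengths of
+  // rows over a single vector-length of int32 columns.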
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL;
+
+ StdTransformsSME<operand_type, result_type, 4, 1, 4> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..a23c44b7da
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int32_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int32_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ int32_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+
+ const int32_t *const bias;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa041c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ "whilelt p0.s, x10, x9\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ "ldnt1w { z17.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902620 // addha za0.s, p1/M, p1/M, z17.s\n"
+ ".inst 0xc0902621 // addha za1.s, p1/M, p1/M, z17.s\n"
+ ".inst 0xc0902622 // addha za2.s, p1/M, p1/M, z17.s\n"
+ ".inst 0xc0902623 // addha za3.s, p1/M, p1/M, z17.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20\n"
+ "incw x21, ALL, MUL #4\n"
+ "cmp x20, x9\n"
+ "csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "add x20, x20, #0x3\n"
+ "lsr x20, x20, #0x2\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
+ "ldnt1b { z12.b }, p1/Z, [x23]\n"
+ ".inst 0xa1418370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1b { z5.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa1428363 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z4.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ldnt1b { z19.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa08c2640 // smopa za0.s, p1/M, p1/M, z18.b, z12.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa08c26c1 // smopa za1.s, p1/M, p1/M, z22.b, z12.b\n"
+ ".inst 0xa08c2742 // smopa za2.s, p1/M, p1/M, z26.b, z12.b\n"
+ ".inst 0xa08c27c3 // smopa za3.s, p1/M, p1/M, z30.b, z12.b\n"
+ ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa0852600 // smopa za0.s, p1/M, p1/M, z16.b, z5.b\n"
+ "ldnt1b { z12.b }, p1/Z, [x23]\n"
+ ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa0852702 // smopa za2.s, p1/M, p1/M, z24.b, z5.b\n"
+ ".inst 0xa0852783 // smopa za3.s, p1/M, p1/M, z28.b, z5.b\n"
+ ".inst 0xa1418370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa0842460 // smopa za0.s, p1/M, p1/M, z3.b, z4.b\n"
+ "ldnt1b { z5.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa08424e1 // smopa za1.s, p1/M, p1/M, z7.b, z4.b\n"
+ ".inst 0xa0842562 // smopa za2.s, p1/M, p1/M, z11.b, z4.b\n"
+ ".inst 0xa08425e3 // smopa za3.s, p1/M, p1/M, z15.b, z4.b\n"
+ ".inst 0xa1428363 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z4.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa0932440 // smopa za0.s, p1/M, p1/M, z2.b, z19.b\n"
+ ".inst 0xa09324c1 // smopa za1.s, p1/M, p1/M, z6.b, z19.b\n"
+ ".inst 0xa0932542 // smopa za2.s, p1/M, p1/M, z10.b, z19.b\n"
+ ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+ ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ldnt1b { z19.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa08c2640 // smopa za0.s, p1/M, p1/M, z18.b, z12.b\n"
+ ".inst 0xa08c26c1 // smopa za1.s, p1/M, p1/M, z22.b, z12.b\n"
+ ".inst 0xa08c2742 // smopa za2.s, p1/M, p1/M, z26.b, z12.b\n"
+ ".inst 0xa08c27c3 // smopa za3.s, p1/M, p1/M, z30.b, z12.b\n"
+ ".inst 0xa0852600 // smopa za0.s, p1/M, p1/M, z16.b, z5.b\n"
+ ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa0852702 // smopa za2.s, p1/M, p1/M, z24.b, z5.b\n"
+ ".inst 0xa0852783 // smopa za3.s, p1/M, p1/M, z28.b, z5.b\n"
+ ".inst 0xa0842460 // smopa za0.s, p1/M, p1/M, z3.b, z4.b\n"
+ ".inst 0xa08424e1 // smopa za1.s, p1/M, p1/M, z7.b, z4.b\n"
+ ".inst 0xa0842562 // smopa za2.s, p1/M, p1/M, z11.b, z4.b\n"
+ ".inst 0xa08425e3 // smopa za3.s, p1/M, p1/M, z15.b, z4.b\n"
+ ".inst 0xa0932440 // smopa za0.s, p1/M, p1/M, z2.b, z19.b\n"
+ ".inst 0xa09324c1 // smopa za1.s, p1/M, p1/M, z6.b, z19.b\n"
+ ".inst 0xa0932542 // smopa za2.s, p1/M, p1/M, z10.b, z19.b\n"
+ ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa0408368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x27, x27, #4\n"
+ "ld1b { z15.b }, p1/Z, [x23]\n"
+ "addvl x23, x23, #1\n"
+ ".inst 0xa08f2500 // smopa za0.s, p1/M, p1/M, z8.b, z15.b\n"
+ ".inst 0xa08f2521 // smopa za1.s, p1/M, p1/M, z9.b, z15.b\n"
+ ".inst 0xa08f2542 // smopa za2.s, p1/M, p1/M, z10.b, z15.b\n"
+ ".inst 0xa08f2563 // smopa za3.s, p1/M, p1/M, z11.b, z15.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 29f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 29f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "sub x25, x13, x11\n"
+ "cntw x24\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "cmp x25, x24\n"
+ "csel x22, x25, x24, LT\n"
+ "add x26, x26, x10, LSL #2\n" // C += n
+ "lsr x21, x22, #0x2\n"
+ "madd x26, x11, x23, x26\n" // C += m * ldc
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 16f\n"
+ "15:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ "st1w { z8.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1w { z9.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z10.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z11.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 17f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 17f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z5.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 17f\n"
+ "st1w { z6.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "17:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "cmp x25, x24\n"
+ "csel x22, x25, x24, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 19f\n"
+ "18:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z19.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 20f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ "st1w { z20.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 20f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z21.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 20f\n"
+ "st1w { z22.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "20:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "cmp x25, x24\n"
+ "csel x22, x25, x24, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 22f\n"
+ "21:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ "st1w { z24.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z27.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "blt 21b\n"
+ "22:" // Store to output array: Accumulator row 2 oddments
+ "cbz x20, 23f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 23f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 23f\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "23:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 27f\n"
+ "cmp x25, x24\n"
+ "csel x20, x25, x24, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 25f\n"
+ "24:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "st1w { z16.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "st1w { z17.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "cmp x12, x21, LSL #2\n"
+ "st1w { z19.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "blt 24b\n"
+ "25:" // Store to output array: Accumulator row 3 oddments
+ "cbz x20, 26f\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ "st1w { z12.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 26f\n"
+ "subs x20, x20, #0x1\n"
+ "st1w { z13.s }, p0, [x26]\n"
+ "add x26, x26, x23\n"
+ "beq 26f\n"
+ "st1w { z14.s }, p0, [x26]\n"
+ "26:" // Store to output array: Accumulator row 3 oddments: End
+ "27:" // Store to output array: End
+ "tbz x16, #0, 29f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "28:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 28b\n"
+ "29:" // End block
+ "incw x10\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #4\n"
+ "cmp x11, x13\n"
+ "mov x10, #0x0\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..fb84883913
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<uint32_t>() * 1;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<uint32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_u8q_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..96247d2db5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const uint8_t *const A,
+ const uint8_t *const B,
+ uint8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(uint8_t)),
+ C(C), ldcb(ldc * sizeof(uint8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const uint8_t *const A;
+ const uint8_t *const B;
+ const long kstride_bytes;
+ uint8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<uint8_t>::min();
+ int32_t max = std::numeric_limits<uint8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x14, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x14, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa042c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x13, x13, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w10, [%x[args], %[offsetof_M]]\n"
+ "mov x9, #0x0\n"
+ "mov x28, #0x0\n"
+ "ldr w27, [%x[args], %[offsetof_N]]\n"
+ "ldr x26, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x25, x26\n"
+ ".inst 0x25bb6790 // whilelt pn8.s, x28, x27, VLx4\n"
+ "tbnz x14, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ ".inst 0xa11cc289 // ldnt1w { z1.s, z5.s, z9.s, z13.s }, p8/Z, [x20, x28, LSL #2]\n"
+ ".inst 0xc0902420 // addha za0.s, p1/M, p1/M, z1.s\n"
+ ".inst 0xc09024a1 // addha za1.s, p1/M, p1/M, z5.s\n"
+ ".inst 0xc0902522 // addha za2.s, p1/M, p1/M, z9.s\n"
+ ".inst 0xc09025a3 // addha za3.s, p1/M, p1/M, z13.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x28\n"
+ "mov x21, x9\n"
+ "incw x20, ALL, MUL #4\n"
+ "incw x21\n"
+ "cmp x20, x27\n"
+ "csel x21, x9, x21, LT\n"
+ "mov x20, x14\n"
+ "bfm x14, XZR, #0x0, #0x0 // bfc x14, #0x0, #0x1\n"
+ "cmp x21, x10\n"
+ "csel x14, x20, x14, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "add x20, x20, #0x3\n"
+ "lsr x20, x20, #0x2\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x28, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ "ld1b { z20.b }, p1/Z, [x25]\n"
+ ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa1a42680 // umopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1a52681 // umopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa1a62682 // umopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+ ".inst 0xa1a72683 // umopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+ "ld1b { z20.b }, p1/Z, [x25]\n"
+ ".inst 0xa1b82560 // umopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+ ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa1b92561 // umopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1ba2562 // umopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+ ".inst 0xa1bb2563 // umopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+ "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa1bc2440 // umopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+ ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1bd2441 // umopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+ ".inst 0xa1be2442 // umopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+ ".inst 0xa1bf2443 // umopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+ "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xa1b025c0 // umopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+ ".inst 0xa1b125c1 // umopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+ ".inst 0xa1b225c2 // umopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+ ".inst 0xa1b325c3 // umopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+ "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa1a42680 // umopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
+ ".inst 0xa1a52681 // umopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+ ".inst 0xa1a62682 // umopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+ ".inst 0xa1a72683 // umopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+ ".inst 0xa1b82560 // umopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+ ".inst 0xa1b92561 // umopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1ba2562 // umopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+ ".inst 0xa1bb2563 // umopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+ ".inst 0xa1bc2440 // umopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+ ".inst 0xa1bd2441 // umopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+ ".inst 0xa1be2442 // umopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+ ".inst 0xa1bf2443 // umopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+ ".inst 0xa1b025c0 // umopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+ ".inst 0xa1b125c1 // umopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+ ".inst 0xa1b225c2 // umopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+ ".inst 0xa1b325c3 // umopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1b { z16.b }, p1/Z, [x25]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x25, x25, #1\n"
+ ".inst 0xa04086e4 // ld1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+ "addvl x23, x23, #4\n"
+ ".inst 0xa1a42600 // umopa za0.s, p1/M, p1/M, z16.b, z4.b\n"
+ ".inst 0xa1a52601 // umopa za1.s, p1/M, p1/M, z16.b, z5.b\n"
+ ".inst 0xa1a62602 // umopa za2.s, p1/M, p1/M, z16.b, z6.b\n"
+ ".inst 0xa1a72603 // umopa za3.s, p1/M, p1/M, z16.b, z7.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "ld1w { z15.s }, p1/Z, [x25]\n"
+ "addvl x25, x25, #1\n"
+ ".inst 0xc09125e0 // addva za0.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e1 // addva za1.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
+ "tbz x14, #1, 14f\n"
+ "tbz x14, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c578 // st1w { z24.s-z27.s }, pn9.b, [x11]\n"
+ "addvl x13, x13, #16\n"
+ ".inst 0xa061c564 // st1w { z4.s-z7.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa062c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c560 // st1w { z0.s-z3.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "blt 11b\n"
+ "b 21f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa060c564 // st1w { z4.s-z7.s }, pn9.b, [x11]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa061c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c56c // st1w { z12.s-z15.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "blt 13b\n"
+ "b 21f\n"
+ "14:" // Store to output array
+ "ldr x24, [%x[args], %[offsetof_C]]\n"
+ "add x24, x24, x28\n" // C += n
+ "sub x23, x10, x9\n"
+ "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x24, x9, x22, x24\n" // C += m * ldc
+ "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x14, #2, 15f\n"
+ "ldr w21, [%x[args], %[offsetof_n_0]]\n"
+ "add x21, x21, x28\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa040c284 // ld1w { z4.s-z7.s }, p8/Z, [x20]\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa040c28c // ld1w { z12.s-z15.s }, p8/Z, [x20]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x20\n"
+ "whilelt p0.b, x28, x27\n"
+ "cmp x23, x20\n"
+ "csel x20, x23, x20, LT\n"
+ "lsr x21, x20, #0x1\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x1\n"
+ "cbz x21, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086001a // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc086005c // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1a4a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
+ ".inst 0xc0860096 // mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600d0 // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1a5a41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
+ ".inst 0xc1a6a416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x21, LSL #1\n"
+ ".inst 0xc1a7a410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
+ ".inst 0xc1aca23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+ ".inst 0xc1ada23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
+ ".inst 0xc1aea236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+ ".inst 0xc1afa230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
+ ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+ ".inst 0xc1a0a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z0.s\n"
+ ".inst 0xc1a0a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z0.s\n"
+ ".inst 0xc1a0a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z0.s\n"
+ ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6bc // sclamp { z28.s-z29.s }, z21.s, z20.s\n"
+ "uzp1 z19.b, z26.b, z28.b\n"
+ ".inst 0xc1b4c6b6 // sclamp { z22.s-z23.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z22.b, z16.b\n"
+ "uzp1 z18.b, z27.b, z29.b\n"
+ "uzp1 z17.b, z23.b, z17.b\n"
+ "uzp1 z16.b, z19.b, z16.b\n"
+ "st1b { z16.b }, p0, [x24]\n"
+ "add x24, x24, x22\n"
+ "uzp1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p0, [x24]\n"
+ "add x24, x24, x22\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 18f\n"
+ ".inst 0xc086000a // mova { z10.s-z11.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc0860058 // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1a4a40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z4.s\n"
+ ".inst 0xc086009a // mova { z26.s-z27.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600de // mova { z30.s-z31.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1a5a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
+ ".inst 0xc1a6a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z6.s\n"
+ ".inst 0xc1a7a41e // sqdmulh { z30.s-z31.s }, { z30.s-z31.s }, z7.s\n"
+ ".inst 0xc1aca22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z12.s\n"
+ ".inst 0xc1ada238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
+ ".inst 0xc1aea23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z14.s\n"
+ ".inst 0xc1afa23e // srshl { z30.s-z31.s }, { z30.s-z31.s }, z15.s\n"
+ ".inst 0xc1a0a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z0.s\n"
+ ".inst 0xc1a0a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
+ ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+ ".inst 0xc1a0a31e // add { z30.s-z31.s }, { z30.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4c6aa // sclamp { z10.s-z11.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
+ "uzp1 z17.b, z10.b, z24.b\n"
+ ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6be // sclamp { z30.s-z31.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z26.b, z30.b\n"
+ "uzp1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p0, [x24]\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "19:" // Store to output array: End
+ "tbz x14, #0, 21f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "20:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x13, x13, #16\n"
+ "blt 20b\n"
+ "21:" // End block
+ "incw x28, ALL, MUL #4\n"
+ "cmp x28, x27\n"
+ "blt 3b\n"
+ "incw x9\n"
+ "cmp x9, x10\n"
+ "mov x28, #0x0\n"
+ "mov x26, x25\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
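
The store-to-output sections above requantize each 32-bit accumulator with a fixed four-step vector sequence: SQDMULH by the multiplier, SRSHL by the (negative) shift, ADD of the output offset, and SCLAMP into the quantized range. As a scalar sketch of what each lane undergoes — illustrative only, with SQDMULH saturation omitted and the helper name invented here:

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the SQDMULH / SRSHL / ADD / SCLAMP requantize step.
    // 'rshift' is the positive shift amount; SRSHL receives it negated.
    static int32_t requantize32(int32_t acc, int32_t mul, int32_t rshift,
                                int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // SQDMULH: high half of the doubled product, i.e. (2*acc*mul) >> 32.
        const int64_t prod = static_cast<int64_t>(acc) * mul;
        const int32_t high = static_cast<int32_t>(prod >> 31);

        // SRSHL with a negative operand: rounding arithmetic right shift.
        const int32_t rounded =
            rshift > 0 ? ((high + (int32_t{1} << (rshift - 1))) >> rshift) : high;

        // ADD the zero point, then SCLAMP into [minval, maxval].
        return std::clamp(rounded + c_offset, minval, maxval);
    }

The uzp1/st1b pairs that follow merely narrow the clamped 32-bit lanes back to bytes and store one output row per iteration.
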
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..f8c375f9f5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementation
+void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<uint32_t>() * 2;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<uint32_t>() * 2;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_u8q_mopa_2VLx2VL;
+
+ StdTransformsSME<operand_type, result_type, 2, 2, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
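
Both blocking dimensions of this kernel scale with the streaming vector length: out_height() and out_width() are each two vectors of 32-bit elements. A small self-contained illustration, where vl_u32 is a hypothetical stand-in for sme::get_vector_length<uint32_t>():

    #include <cstdio>
    #include <initializer_list>

    // Hypothetical stand-in: 32-bit elements per streaming vector.
    constexpr unsigned vl_u32(unsigned svl_bits) { return svl_bits / 32; }

    int main()
    {
        for (unsigned svl : {128u, 256u, 512u}) {
            const unsigned vl = vl_u32(svl);
            // 2VLx2VL: a square output tile of 2*VL by 2*VL elements.
            std::printf("SVL %u bits -> %u x %u tile\n", svl, 2 * vl, 2 * vl);
        }
    }
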
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..9a59799529
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const uint8_t *const A,
+ const uint8_t *const B,
+ uint8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(uint8_t)),
+ C(C), ldcb(ldc * sizeof(uint8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const uint8_t *const A;
+ const uint8_t *const B;
+ const long kstride_bytes;
+ uint8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<uint8_t>::min();
+ int32_t max = std::numeric_limits<uint8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ ".inst 0x25a94550 // whilelt pn8.s, x10, x9, VLx2\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ ".inst 0xa00a4299 // ldnt1w { z24.s-z25.s }, p8/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n"
+ ".inst 0xc0902702 // addha za2.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902723 // addha za3.s, p1/M, p1/M, z25.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20, ALL, MUL #2\n"
+ "incw x21, ALL, MUL #2\n"
+ "cmp x20, x9\n"
+ "csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "add x20, x20, #0x3\n"
+ "lsr x20, x20, #0x2\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa1b12460 // umopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1b92461 // umopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+ ".inst 0xa1b12562 // umopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+ ".inst 0xa1b92563 // umopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+ ".inst 0xa1b62680 // umopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+ ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+ ".inst 0xa1b72681 // umopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+ ".inst 0xa1b62782 // umopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+ ".inst 0xa1b72783 // umopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+ ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+ ".inst 0xa1b026a0 // umopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+ ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa1b826a1 // umopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa1b027a2 // umopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+ ".inst 0xa1b827a3 // umopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+ ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1a724a0 // umopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+ ".inst 0xa1af24a1 // umopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+ ".inst 0xa1a725a2 // umopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+ ".inst 0xa1af25a3 // umopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
+ ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa1b12460 // umopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
+ ".inst 0xa1b92461 // umopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+ ".inst 0xa1b12562 // umopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+ ".inst 0xa1b92563 // umopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+ ".inst 0xa1b62680 // umopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+ ".inst 0xa1b72681 // umopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+ ".inst 0xa1b62782 // umopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+ ".inst 0xa1b72783 // umopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+ ".inst 0xa1b026a0 // umopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+ ".inst 0xa1b826a1 // umopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa1b027a2 // umopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+ ".inst 0xa1b827a3 // umopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+ ".inst 0xa1a724a0 // umopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+ ".inst 0xa1af24a1 // umopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+ ".inst 0xa1a725a2 // umopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+ ".inst 0xa1af25a3 // umopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1400773 // ld1b { z19.b, z27.b }, pn9.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0xa04006f0 // ld1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
+ "addvl x23, x23, #2\n"
+ ".inst 0xa1b02660 // umopa za0.s, p1/M, p1/M, z19.b, z16.b\n"
+ ".inst 0xa1b12661 // umopa za1.s, p1/M, p1/M, z19.b, z17.b\n"
+ ".inst 0xa1b02762 // umopa za2.s, p1/M, p1/M, z27.b, z16.b\n"
+ ".inst 0xa1b12763 // umopa za3.s, p1/M, p1/M, z27.b, z17.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ ".inst 0xa040476e // ld1w { z14.s-z15.s }, pn9.b/Z, [x27]\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 24f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 24f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10\n" // C += n
+ "sub x25, x13, x11\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x26, x11, x24, x26\n" // C += m * ldc
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x16, #2, 15f\n"
+ "ldr w21, [%x[args], %[offsetof_n_0]]\n"
+ "add x21, x21, x10\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa0404280 // ld1w { z0.s-z1.s }, p8/Z, [x20]\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ ".inst 0xa0404282 // ld1w { z2.s-z3.s }, p8/Z, [x20]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x23\n"
+ "whilelt p0.h, x10, x9\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z8.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "uzp1 z16.h, z5.h, z9.h\n"
+ "uzp1 z17.h, z6.h, z10.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "uzp1 z16.h, z7.h, z11.h\n"
+ "st1b { z17.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 18f\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a0ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1a2aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ ".inst 0xc1a3aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z8.h, z4.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 18f\n"
+ "subs x20, x20, #0x1\n"
+ "uzp1 z16.h, z9.h, z5.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 18f\n"
+ "uzp1 z16.h, z10.h, z6.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 22f\n"
+ "whilelt p0.h, x10, x9\n"
+ "cmp x25, x23\n"
+ "csel x20, x25, x23, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z1.s\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z20.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "uzp1 z16.h, z5.h, z21.h\n"
+ "uzp1 z17.h, z6.h, z22.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "uzp1 z16.h, z7.h, z23.h\n"
+ "st1b { z17.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 21f\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc1aeab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z14.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z16.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 21f\n"
+ "subs x20, x20, #0x1\n"
+ "uzp1 z16.h, z5.h, z17.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 21f\n"
+ "uzp1 z16.h, z6.h, z18.h\n"
+ "st1b { z16.h }, p0, [x26]\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "22:" // Store to output array: End
+ "tbz x16, #0, 24f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "23:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 23b\n"
+ "24:" // End block
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #2\n"
+ "cmp x11, x13\n"
+ "mov x10, #0x0\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
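
Each kernel in this diff drives its control flow from the flags word assembled in the KernelArgs constructor and tested with TBZ/TBNZ in the assembly. Restated as an enum for readability — the bit names are taken verbatim from the source comments, but the enum itself is not part of the library:

    #include <cstdint>

    enum KernelFlags : uint64_t {
        FILL_ACCUMULATORS_FROM_BUFFER = uint64_t{1} << 0, // set when 'accumulate': reload ZA from the buffer
        STORE_ACCUMULATORS_TO_BUFFER  = uint64_t{1} << 1, // set when C == nullptr: spill ZA instead of storing C
        PER_CHANNEL_QUANTISATION      = uint64_t{1} << 2, // set from rq.per_channel_requant
    };
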
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..04d19324c5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementation
+void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<uint32_t>() * 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<uint32_t>() * 1;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_u8q_mopa_4VLx1VL;
+
+ StdTransformsSME<operand_type, result_type, 4, 1, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
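
k_unroll() returns 4 because the widening UMOPA these kernels use consumes byte operands four at a time: each 32-bit ZA element accumulates a 4-way dot product, which is also why kstride_bytes rounds K up to whole quads. A scalar model of a single accumulator element — illustrative, not library code:

    #include <cstdint>

    // One 32-bit ZA element under byte-input UMOPA: acc += dot(a[0..3], b[0..3]).
    static int32_t umopa_element(int32_t acc, const uint8_t a[4], const uint8_t b[4])
    {
        for (int i = 0; i < 4; ++i) {
            acc += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
        }
        return acc;
    }
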
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..0f3346e65e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const uint8_t *const A,
+ const uint8_t *const B,
+ uint8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(uint8_t)),
+ C(C), ldcb(ldc * sizeof(uint8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const uint8_t *const A;
+ const uint8_t *const B;
+ const long kstride_bytes;
+ uint8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<uint8_t>::min();
+ int32_t max = std::numeric_limits<uint8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x16, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w13, [%x[args], %[offsetof_M]]\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w9, [%x[args], %[offsetof_N]]\n"
+ "ldr x28, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x27, x28\n"
+ "whilelt p0.s, x10, x9\n"
+ "tbnz x16, #0, 4f\n"
+ "ldr x20, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x20, 5f\n"
+ "ldnt1w { z8.s }, p0/Z, [x20, x10, LSL #2]\n"
+ ".inst 0xc0902500 // addha za0.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902501 // addha za1.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902502 // addha za2.s, p1/M, p1/M, z8.s\n"
+ ".inst 0xc0902503 // addha za3.s, p1/M, p1/M, z8.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x20, x10\n"
+ "mov x21, x11\n"
+ "incw x20\n"
+ "incw x21, ALL, MUL #4\n"
+ "cmp x20, x9\n"
+ "csel x21, x11, x21, LT\n"
+ "mov x20, x16\n"
+ "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n"
+ "cmp x21, x13\n"
+ "csel x16, x20, x16, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x20, [%x[args], %[offsetof_K]]\n"
+ "add x20, x20, #0x3\n"
+ "lsr x20, x20, #0x2\n"
+ "ldr x23, [%x[args], %[offsetof_B]]\n"
+ "lsr x22, x20, #0x2\n"
+ "and x21, x20, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ "ldnt1b { z14.b }, p1/Z, [x23]\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa1ae2480 // umopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1ae24a1 // umopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+ ".inst 0xa1ae24c2 // umopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+ ".inst 0xa1ae24e3 // umopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+ ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+ ".inst 0xa1bf2680 // umopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+ "ldnt1b { z14.b }, p1/Z, [x23]\n"
+ ".inst 0xa1bf26a1 // umopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+ ".inst 0xa1bf26c2 // umopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+ ".inst 0xa1bf26e3 // umopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+ ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+ ".inst 0xa1ad2700 // umopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+ "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa1ad2721 // umopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+ ".inst 0xa1ad2742 // umopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+ ".inst 0xa1ad2763 // umopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+ ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa1bd2500 // umopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+ ".inst 0xa1bd2521 // umopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+ ".inst 0xa1bd2542 // umopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa1bd2563 // umopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
+ ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa1ae2480 // umopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
+ ".inst 0xa1ae24a1 // umopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+ ".inst 0xa1ae24c2 // umopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+ ".inst 0xa1ae24e3 // umopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+ ".inst 0xa1bf2680 // umopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+ ".inst 0xa1bf26a1 // umopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+ ".inst 0xa1bf26c2 // umopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+ ".inst 0xa1bf26e3 // umopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+ ".inst 0xa1ad2700 // umopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+ ".inst 0xa1ad2721 // umopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+ ".inst 0xa1ad2742 // umopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+ ".inst 0xa1ad2763 // umopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+ ".inst 0xa1bd2500 // umopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+ ".inst 0xa1bd2521 // umopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+ ".inst 0xa1bd2542 // umopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa1bd2563 // umopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x27, x27, #4\n"
+ "ld1b { z15.b }, p1/Z, [x23]\n"
+ "addvl x23, x23, #1\n"
+ ".inst 0xa1af2640 // umopa za0.s, p1/M, p1/M, z18.b, z15.b\n"
+ ".inst 0xa1af26c1 // umopa za1.s, p1/M, p1/M, z22.b, z15.b\n"
+ ".inst 0xa1af2742 // umopa za2.s, p1/M, p1/M, z26.b, z15.b\n"
+ ".inst 0xa1af27c3 // umopa za3.s, p1/M, p1/M, z30.b, z15.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ ".inst 0xa140c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27]\n"
+ "addvl x27, x27, #4\n"
+ ".inst 0xc0912460 // addva za0.s, p1/M, p1/M, z3.s\n"
+ ".inst 0xc09124e1 // addva za1.s, p1/M, p1/M, z7.s\n"
+ ".inst 0xc0912562 // addva za2.s, p1/M, p1/M, z11.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
+ "tbz x16, #1, 14f\n"
+ "tbz x16, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 11b\n"
+ "b 30f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ ".inst 0xa062c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+ ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "blt 13b\n"
+ "b 30f\n"
+ "14:" // Store to output array
+ "ldr x26, [%x[args], %[offsetof_C]]\n"
+ "add x26, x26, x10\n" // C += n
+ "sub x25, x13, x11\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x26, x11, x24, x26\n" // C += m * ldc
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x16, #2, 15f\n"
+ "ldr w21, [%x[args], %[offsetof_n_0]]\n"
+ "add x21, x21, x10\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x23\n"
+ "whilelt p0.s, x10, x9\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+ ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1b { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z19.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x20, 18f\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+ ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ "st1b { z16.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 18f\n"
+ "subs x20, x20, #0x1\n"
+ "st1b { z17.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 18f\n"
+ "st1b { z18.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 28f\n"
+ "whilelt p0.s, x10, x9\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1b { z4.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z5.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z6.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z7.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x20, 21f\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ "st1b { z4.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 21f\n"
+ "subs x20, x20, #0x1\n"
+ "st1b { z5.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 21f\n"
+ "st1b { z6.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 28f\n"
+ "whilelt p0.s, x10, x9\n"
+ "cmp x25, x23\n"
+ "csel x22, x25, x23, LT\n"
+ "lsr x21, x22, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x22, #0x3\n"
+ "cbz x21, 23f\n"
+ "22:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc1a0ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+ ".inst 0xc1b4cea8 // sclamp { z8.s-z11.s }, z21.s, z20.s\n"
+ "st1b { z8.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z9.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z10.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z11.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 22b\n"
+ "23:" // Store to output array: Accumulator row 2 oddments
+ "cbz x20, 24f\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a0ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1b4ceac // sclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ "st1b { z12.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 24f\n"
+ "subs x20, x20, #0x1\n"
+ "st1b { z13.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 24f\n"
+ "st1b { z14.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "24:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x25, x25, x22\n"
+ "beq 28f\n"
+ "whilelt p0.s, x10, x9\n"
+ "cmp x25, x23\n"
+ "csel x20, x25, x23, LT\n"
+ "lsr x21, x20, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x20, x20, #0x3\n"
+ "cbz x21, 26f\n"
+ "25:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ "cmp x12, x21, LSL #2\n"
+ ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1b { z28.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z29.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z30.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "st1b { z31.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "blt 25b\n"
+ "26:" // Store to output array: Accumulator row 3 oddments
+ "cbz x20, 27f\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+ "st1b { z28.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 27f\n"
+ "subs x20, x20, #0x1\n"
+ "st1b { z29.s }, p0, [x26]\n"
+ "add x26, x26, x24\n"
+ "beq 27f\n"
+ "st1b { z30.s }, p0, [x26]\n"
+ "27:" // Store to output array: Accumulator row 3 oddments: End
+ "28:" // Store to output array: End
+ "tbz x16, #0, 30f\n"
+ "mov x12, #0x0\n"
+ "cntw x20\n"
+ "29:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+ ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20\n"
+ "addvl x15, x15, #16\n"
+ "blt 29b\n"
+ "30:" // End block
+ "incw x10\n"
+ "cmp x10, x9\n"
+ "blt 3b\n"
+ "incw x11, ALL, MUL #4\n"
+ "cmp x11, x13\n"
+ "mov x10, #0x0\n"
+ "mov x28, x27\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
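
In the store-to-output sections, the per-layer multiplier and right shift are broadcast first and then, when flags bit 2 is set, overwritten from the per-channel arrays indexed by n_0 + n. Sketched in scalar form — Requantize32Sketch is a minimal stand-in listing only the fields this diff references, not the library's definition:

    #include <cstdint>

    struct Requantize32Sketch {
        bool           per_channel_requant;
        int32_t        per_layer_mul;
        int32_t        per_layer_right_shift;
        const int32_t *per_channel_muls;
        const int32_t *per_channel_right_shifts;
    };

    // Mirrors the "tbz x16, #2" branch: per-layer defaults, per-channel override.
    static void select_requant_params(const Requantize32Sketch &rq, int n_0, int n,
                                      int32_t &mul, int32_t &right_shift)
    {
        mul         = rq.per_layer_mul;
        right_shift = rq.per_layer_right_shift;
        if (rq.per_channel_requant) {
            mul         = rq.per_channel_muls[n_0 + n];
            right_shift = rq.per_channel_right_shifts[n_0 + n];
        }
    }
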
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp
new file mode 100644
index 0000000000..1ce169d562
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<bfloat16>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ size_t, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_ffhybrid_bf16fp32_mmla_6x4VL( ARGLIST );
+
+class cls_sve_ffhybrid_bf16fp32_mmla_6x4VL
+{
+public:
+ typedef bfloat16 lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+ static unsigned int stripe_width()
+ {
+ return get_vector_length<float>() * 1;
+ }
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL2VL_BL64;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 49.10 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_ffhybrid_bf16fp32_mmla_6x4VL;
+ cls_sve_ffhybrid_bf16fp32_mmla_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
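
(Editor's aside, not part of the patch.) The class above is the interface the GEMM dispatcher uses for blocking: a 6-row by 4-vector output tile of fp32, bf16 inputs consumed 4-deep per K step, and fixed-format ("ff") weights in the VL2VL_BL64 layout. A rough sketch of how a caller might turn those parameters into tile counts; `vl_f32` is a hypothetical stand-in for arm_gemm's `get_vector_length<float>()`, and rounding K up is an assumption about the caller, not something this header does:

    #include <cstddef>

    // Hypothetical stand-in for get_vector_length<float>():
    // SVE vector length in fp32 lanes (8 on a 256-bit implementation).
    static inline unsigned int vl_f32() { return 8; }

    struct TilePlan { size_t m_tiles; size_t n_tiles; size_t k_steps; };

    static TilePlan plan(size_t M, size_t N, size_t K) {
        const unsigned int out_h = 6;             // out_height()
        const unsigned int out_w = vl_f32() * 4;  // out_width(): four fp32 vectors
        const unsigned int ku    = 4;             // k_unroll(): bf16 elements per K step
        return { (M + out_h - 1) / out_h,         // row tiles of 6
                 (N + out_w - 1) / out_w,         // column tiles of 4*VL
                 (K + ku - 1) / ku };             // K consumed in chunks of 4
    }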
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..9136e32567
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp
@@ -0,0 +1,2227 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_ffhybrid_bf16fp32_mmla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, size_t B_stride, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ const bfloat16 *cur_B_ptr = {};
+ size_t B_stride = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ ka.B_stride = B_stride;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
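+    // (Editor's note) Flag bits consumed by the assembly below, as set above:
+    //   bit 0 (0x1): accumulate into existing output ("tbz %x[flags], #0" paths)
+    //   bit 1 (0x2): apply the min/max activation clamp ("tbz %x[flags], #1" paths)
+    //   bit 2 (0x4): output rows are indirect (array of row pointers)
+    //   bit 3 (0x8): input rows are indirect ("tbz %x[flags], #3" paths)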
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 71f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 57f\n"
+ "beq 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 3f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 3f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 3f\n"
+ "mov x11, x12\n"
+ "3:" // Height 1: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 4f\n"
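+      // (Editor's note) Bias path: the zip1/zip2 below broadcast each pair of
+      // bias values across a row pair, since every BFMMLA accumulator holds two
+      // interleaved output rows that must start from the same bias (the pairs
+      // are uzp'd apart again before writeback).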
+ "ld1w { z8.s }, p5/Z, [x15]\n"
+ "ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x15, x15, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1w { z16.s }, p4/Z, [x13]\n"
+ "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "zip1 z8.d, z16.d, z12.d\n"
+ "zip2 z12.d, z16.d, z12.d\n"
+ "ld1w { z17.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 9f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "cmp x27, #0x8\n"
+ "ble 11f\n"
+ "10:" // Height 1: Multiply loop: Main loop head
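+      // (Editor's note) A-rows are loaded as quadwords and interleaved 64-bit-wise
+      // (trn1/trn2) so each BFMMLA consumes two rows at once; at height 1 the
+      // partner row (z19) is a don't-care whose half of the result is discarded.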
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z20.h }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n"
+ ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n"
+ ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
+ ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n"
+ ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6470e68a // bfmmla z10.s, z20.h, z16.h\n"
+ ".inst 0x6471e68e // bfmmla z14.s, z20.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n"
+ ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n"
+ "add x26, x26, #0x10\n"
+ "addvl x12, x12, #4\n"
+ "addvl x11, x11, #4\n"
+ "addvl x10, x10, #4\n"
+ "addvl x9, x9, #4\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "ble 12f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n"
+ ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n"
+ ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n"
+ ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6471e42b // bfmmla z11.s, z1.h, z17.h\n"
+ ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "12:" // Height 1: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 7b\n"
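+      // (Editor's note) uzp1 pulls the even (real) row out of each accumulator
+      // pair; the odd halves in z12-z15 belong to the dummy row and are dropped.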
+ "uzp1 z8.d, z8.d, z12.d\n"
+ "uzp1 z9.d, z9.d, z13.d\n"
+ "uzp1 z10.d, z10.d, z14.d\n"
+ "uzp1 z11.d, z11.d, z15.d\n"
+ "tbz %x[flags], #1, 13f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z21.s\n"
+ "fmin z9.s, p5/M, z9.s, z21.s\n"
+ "fmin z10.s, p5/M, z10.s, z21.s\n"
+ "fmin z11.s, p5/M, z11.s, z21.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
+ "13:" // Height 1: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "14:" // Height 1: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 2b\n"
+ "b 86f\n"
+ "15:" // Height 2
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "16:" // Height 2: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 17f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 17f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 17f\n"
+ "mov x11, x12\n"
+ "17:" // Height 2: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 18f\n"
+ "ld1w { z8.s }, p5/Z, [x15]\n"
+ "ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x15, x15, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "b 20f\n"
+ "18:" // Height 2: no bias
+ "tbz %x[flags], #0, 19f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x13, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x13]\n"
+ "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 20f\n"
+ "19:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "20:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "21:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 22f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 23f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "b 23f\n"
+ "22:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "23:" // Height 2: input setup done
+ "cmp x27, #0x8\n"
+ "ble 25f\n"
+ "24:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z20.h }, p0/Z, [x26]\n"
+ "ld1rqh { z19.h }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n"
+ ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n"
+ ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
+ ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n"
+ ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e68a // bfmmla z10.s, z20.h, z17.h\n"
+ ".inst 0x6470e68e // bfmmla z14.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n"
+ ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "addvl x12, x12, #4\n"
+ "addvl x11, x11, #4\n"
+ "addvl x10, x10, #4\n"
+ "addvl x9, x9, #4\n"
+ "bgt 24b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z19.h }, p0/Z, [x25]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "ble 26f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n"
+ ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x11]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n"
+ ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n"
+ ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n"
+ "ld1h { z22.h }, p5/Z, [x9]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6476e42b // bfmmla z11.s, z1.h, z22.h\n"
+ ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 21b\n"
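+      // (Editor's note) With two real rows, uzp1/uzp2 split each accumulator pair
+      // into row 0 and row 1; z7 stages row 0 so z8 can be reused to hold row 1.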
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "add x25, x13, x20, LSL #2\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z17.s\n"
+ "fmin z12.s, p5/M, z12.s, z17.s\n"
+ "fmin z13.s, p5/M, z13.s, z17.s\n"
+ "fmin z14.s, p5/M, z14.s, z17.s\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z7.s, p5/M, z7.s, z16.s\n"
+ "fmax z12.s, p5/M, z12.s, z16.s\n"
+ "fmax z13.s, p5/M, z13.s, z16.s\n"
+ "fmax z14.s, p5/M, z14.s, z16.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
+ "27:" // Height 2: No activation
+ "st1w { z7.s }, p4, [x13]\n"
+ "st1w { z12.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "28:" // Height 2: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 16b\n"
+ "b 86f\n"
+ "29:" // Height 3
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "30:" // Height 3: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 31f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 31f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 31f\n"
+ "mov x11, x12\n"
+ "31:" // Height 3: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 32f\n"
+ "ld1w { z8.s }, p5/Z, [x15]\n"
+ "ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x15, x15, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "b 34f\n"
+ "32:" // Height 3: no bias
+ "tbz %x[flags], #0, 33f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x13, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x13]\n"
+ "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 34f\n"
+ "33:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 37f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "37:" // Height 3: input setup done
+ "cmp x27, #0x8\n"
+ "ble 39f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z30.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "ld1rqh { z28.h }, p0/Z, [x24]\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
+ ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9]\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n"
+ ".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n"
+ ".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n"
+ ".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n"
+ ".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n"
+ ".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n"
+ ".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n"
+ ".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n"
+ ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n"
+ ".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n"
+ "bgt 38b\n"
+ "39:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "trn1 z27.d, z1.d, z24.d\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x4\n"
+ ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
+ ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9]\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "ble 40f\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n"
+ ".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n"
+ ".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n"
+ ".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n"
+ ".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n"
+ ".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n"
+ ".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9]\n"
+ "addvl x10, x10, #2\n"
+ ".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n"
+ ".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x6479e42b // bfmmla z11.s, z1.h, z25.h\n"
+ ".inst 0x6479e473 // bfmmla z19.s, z3.h, z25.h\n"
+ ".inst 0x6478e42f // bfmmla z15.s, z1.h, z24.h\n"
+ ".inst 0x6478e477 // bfmmla z23.s, z3.h, z24.h\n"
+ "40:" // Height 3: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 35b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "tbz %x[flags], #1, 41f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z25.s\n"
+ "fmin z12.s, p5/M, z12.s, z25.s\n"
+ "fmin z13.s, p5/M, z13.s, z25.s\n"
+ "fmin z14.s, p5/M, z14.s, z25.s\n"
+ "fmin z8.s, p5/M, z8.s, z25.s\n"
+ "fmin z9.s, p5/M, z9.s, z25.s\n"
+ "fmin z10.s, p5/M, z10.s, z25.s\n"
+ "fmin z11.s, p5/M, z11.s, z25.s\n"
+ "fmin z16.s, p5/M, z16.s, z25.s\n"
+ "fmin z17.s, p5/M, z17.s, z25.s\n"
+ "fmin z18.s, p5/M, z18.s, z25.s\n"
+ "fmin z19.s, p5/M, z19.s, z25.s\n"
+ "fmax z7.s, p5/M, z7.s, z24.s\n"
+ "fmax z12.s, p5/M, z12.s, z24.s\n"
+ "fmax z13.s, p5/M, z13.s, z24.s\n"
+ "fmax z14.s, p5/M, z14.s, z24.s\n"
+ "fmax z8.s, p5/M, z8.s, z24.s\n"
+ "fmax z9.s, p5/M, z9.s, z24.s\n"
+ "fmax z10.s, p5/M, z10.s, z24.s\n"
+ "fmax z11.s, p5/M, z11.s, z24.s\n"
+ "fmax z16.s, p5/M, z16.s, z24.s\n"
+ "fmax z17.s, p5/M, z17.s, z24.s\n"
+ "fmax z18.s, p5/M, z18.s, z24.s\n"
+ "fmax z19.s, p5/M, z19.s, z24.s\n"
+ "41:" // Height 3: No activation
+ "st1w { z7.s }, p4, [x13]\n"
+ "st1w { z12.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "42:" // Height 3: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 30b\n"
+ "b 86f\n"
+ "43:" // Height 4
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "44:" // Height 4: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 45f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 45f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 45f\n"
+ "mov x11, x12\n"
+ "45:" // Height 4: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 46f\n"
+ "ld1w { z8.s }, p5/Z, [x15]\n"
+ "ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x15, x15, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "b 48f\n"
+ "46:" // Height 4: no bias
+ "tbz %x[flags], #0, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x13, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x13]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 48f\n"
+ "47:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "48:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "49:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 51f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "b 51f\n"
+ "50:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "51:" // Height 4: input setup done
+ "cmp x27, #0x8\n"
+ "ble 53f\n"
+ "52:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z30.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
+ "ld1rqh { z28.h }, p0/Z, [x24]\n"
+ "ld1rqh { z27.h }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9]\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n"
+ ".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n"
+ ".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n"
+ ".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n"
+ ".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n"
+ ".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n"
+ ".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n"
+ ".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n"
+ ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n"
+ ".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n"
+ "bgt 52b\n"
+ "53:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "ld1rqh { z27.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x4\n"
+ ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9]\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "ble 54f\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n"
+ ".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n"
+ ".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n"
+ ".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x11]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n"
+ ".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n"
+ ".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n"
+ ".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x9]\n"
+ "addvl x10, x10, #2\n"
+ ".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n"
+ ".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x6479e42b // bfmmla z11.s, z1.h, z25.h\n"
+ ".inst 0x6479e473 // bfmmla z19.s, z3.h, z25.h\n"
+ ".inst 0x6478e42f // bfmmla z15.s, z1.h, z24.h\n"
+ ".inst 0x6478e477 // bfmmla z23.s, z3.h, z24.h\n"
+ "54:" // Height 4: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 49b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "tbz %x[flags], #1, 55f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z23.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z24.s\n"
+ "fmin z12.s, p5/M, z12.s, z24.s\n"
+ "fmin z13.s, p5/M, z13.s, z24.s\n"
+ "fmin z14.s, p5/M, z14.s, z24.s\n"
+ "fmin z8.s, p5/M, z8.s, z24.s\n"
+ "fmin z9.s, p5/M, z9.s, z24.s\n"
+ "fmin z10.s, p5/M, z10.s, z24.s\n"
+ "fmin z11.s, p5/M, z11.s, z24.s\n"
+ "fmin z15.s, p5/M, z15.s, z24.s\n"
+ "fmin z20.s, p5/M, z20.s, z24.s\n"
+ "fmin z21.s, p5/M, z21.s, z24.s\n"
+ "fmin z22.s, p5/M, z22.s, z24.s\n"
+ "fmin z16.s, p5/M, z16.s, z24.s\n"
+ "fmin z17.s, p5/M, z17.s, z24.s\n"
+ "fmin z18.s, p5/M, z18.s, z24.s\n"
+ "fmin z19.s, p5/M, z19.s, z24.s\n"
+ "fmax z7.s, p5/M, z7.s, z23.s\n"
+ "fmax z12.s, p5/M, z12.s, z23.s\n"
+ "fmax z13.s, p5/M, z13.s, z23.s\n"
+ "fmax z14.s, p5/M, z14.s, z23.s\n"
+ "fmax z8.s, p5/M, z8.s, z23.s\n"
+ "fmax z9.s, p5/M, z9.s, z23.s\n"
+ "fmax z10.s, p5/M, z10.s, z23.s\n"
+ "fmax z11.s, p5/M, z11.s, z23.s\n"
+ "fmax z15.s, p5/M, z15.s, z23.s\n"
+ "fmax z20.s, p5/M, z20.s, z23.s\n"
+ "fmax z21.s, p5/M, z21.s, z23.s\n"
+ "fmax z22.s, p5/M, z22.s, z23.s\n"
+ "fmax z16.s, p5/M, z16.s, z23.s\n"
+ "fmax z17.s, p5/M, z17.s, z23.s\n"
+ "fmax z18.s, p5/M, z18.s, z23.s\n"
+ "fmax z19.s, p5/M, z19.s, z23.s\n"
+ "55:" // Height 4: No activation
+ "st1w { z7.s }, p4, [x13]\n"
+ "st1w { z12.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "56:" // Height 4: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 44b\n"
+ "b 86f\n"
+ "57:" // Height 5
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "58:" // Height 5: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 59f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 59f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 59f\n"
+ "mov x11, x12\n"
+ "59:" // Height 5: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 60f\n"
+ "ld1w { z8.s }, p5/Z, [x15]\n"
+ "ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x15, x15, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z28.d, z12.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z29.d, z13.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z30.d, z14.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z31.d, z15.d\n"
+ "b 62f\n"
+ "60:" // Height 5: no bias
+ "tbz %x[flags], #0, 61f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x13]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 62f\n"
+ "61:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "62:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "63:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 64f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 65f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "b 65f\n"
+ "64:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "65:" // Height 5: input setup done
+ "cmp x27, #0x8\n"
+ "ble 67f\n"
+ "66:" // Height 5: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z6.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z7.h }, p0/Z, [x24]\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "trn1 z3.d, z7.d, z2.d\n"
+ "trn2 z7.d, z7.d, z2.d\n"
+ "ld1h { z1.h }, p5/Z, [x12]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
+ "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6461e4a8 // bfmmla z8.s, z5.h, z1.h\n"
+ ".inst 0x6461e470 // bfmmla z16.s, z3.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x11]\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6460e4ac // bfmmla z12.s, z5.h, z0.h\n"
+ ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n"
+ ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e4ad // bfmmla z13.s, z5.h, z0.h\n"
+ ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6461e4aa // bfmmla z10.s, z5.h, z1.h\n"
+ ".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n"
+ ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x9]\n"
+ ".inst 0x6460e4ae // bfmmla z14.s, z5.h, z0.h\n"
+ ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ ".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n"
+ ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
+ ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x12, #3, MUL VL]\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ "addvl x12, x12, #4\n"
+ ".inst 0x6461e4f0 // bfmmla z16.s, z7.h, z1.h\n"
+ ".inst 0x6461e498 // bfmmla z24.s, z4.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x11, #2, MUL VL]\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f4 // bfmmla z20.s, z7.h, z0.h\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
+ "addvl x11, x11, #4\n"
+ ".inst 0x6461e4f1 // bfmmla z17.s, z7.h, z1.h\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ "addvl x10, x10, #4\n"
+ ".inst 0x6461e4f2 // bfmmla z18.s, z7.h, z1.h\n"
+ ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f6 // bfmmla z22.s, z7.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
+ "addvl x9, x9, #4\n"
+ ".inst 0x6461e4f3 // bfmmla z19.s, z7.h, z1.h\n"
+ ".inst 0x6461e49b // bfmmla z27.s, z4.h, z1.h\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f7 // bfmmla z23.s, z7.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ "bgt 66b\n"
+ "67:" // Height 5: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z4.h }, p0/Z, [x25]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
+ "ld1rqh { z5.h }, p0/Z, [x22]\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1h { z2.h }, p5/Z, [x12]\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n"
+ ".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x11]\n"
+ "subs x27, x27, #0x4\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n"
+ ".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n"
+ "addvl x10, x10, #2\n"
+ ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x9]\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n"
+ ".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d7 // bfmmla z23.s, z6.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ "ble 68f\n"
+ "ld1h { z2.h }, p5/Z, [x12]\n"
+ "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n"
+ ".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
+ ".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x11]\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n"
+ ".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n"
+ ".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "addvl x10, x10, #2\n"
+ ".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n"
+ ".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x9]\n"
+ ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x6462e42b // bfmmla z11.s, z1.h, z2.h\n"
+ ".inst 0x6462e473 // bfmmla z19.s, z3.h, z2.h\n"
+ ".inst 0x6462e4bb // bfmmla z27.s, z5.h, z2.h\n"
+ ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n"
+ ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
+ "68:" // Height 5: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 63b\n"
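+      // BFMMLA accumulates 2x2 FP32 tiles, so adjacent output rows are interleaved
+      // across accumulator pairs; the uzp1/uzp2 sequence below de-interleaves them
+      // into per-row vectors before the optional clamp and st1w writeback.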
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "tbz %x[flags], #1, 69f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z23.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z7.s, p5/M, z7.s, z23.s\n"
+ "fmax z12.s, p5/M, z12.s, z23.s\n"
+ "fmax z13.s, p5/M, z13.s, z23.s\n"
+ "fmax z14.s, p5/M, z14.s, z23.s\n"
+ "fmax z8.s, p5/M, z8.s, z23.s\n"
+ "fmax z9.s, p5/M, z9.s, z23.s\n"
+ "fmax z10.s, p5/M, z10.s, z23.s\n"
+ "fmax z11.s, p5/M, z11.s, z23.s\n"
+ "fmax z15.s, p5/M, z15.s, z23.s\n"
+ "fmax z20.s, p5/M, z20.s, z23.s\n"
+ "fmax z21.s, p5/M, z21.s, z23.s\n"
+ "fmax z22.s, p5/M, z22.s, z23.s\n"
+ "fmax z16.s, p5/M, z16.s, z23.s\n"
+ "fmax z17.s, p5/M, z17.s, z23.s\n"
+ "fmax z18.s, p5/M, z18.s, z23.s\n"
+ "fmax z19.s, p5/M, z19.s, z23.s\n"
+ "fmax z24.s, p5/M, z24.s, z23.s\n"
+ "fmax z25.s, p5/M, z25.s, z23.s\n"
+ "fmax z26.s, p5/M, z26.s, z23.s\n"
+ "fmax z27.s, p5/M, z27.s, z23.s\n"
+ "69:" // Height 5: No activation
+ "st1w { z7.s }, p4, [x13]\n"
+ "st1w { z12.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
+ "70:" // Height 5: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 58b\n"
+ "b 86f\n"
+ "71:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x18\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
+ "72:" // Height 6: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 73f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 73f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 73f\n"
+ "mov x11, x12\n"
+ "73:" // Height 6: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 74f\n"
+ "ld1w { z8.s }, p5/Z, [x15]\n"
+ "ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x15, x15, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z28.d, z12.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z29.d, z13.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z30.d, z14.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z31.d, z15.d\n"
+ "b 76f\n"
+ "74:" // Height 6: no bias
+ "tbz %x[flags], #0, 75f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x13, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z17.s }, p4/Z, [x13]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z17.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z20.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 76f\n"
+ "75:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "76:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "77:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 78f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 79f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "add x21, x21, x20, LSL #1\n"
+ "b 79f\n"
+ "78:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
+ "79:" // Height 6: input setup done
+ "cmp x27, #0x8\n"
+ "ble 81f\n"
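+      // trn1/trn2 on .d elements pair two A rows per operand, matching BFMMLA's
+      // (2x4 BF16) x (4x2 BF16) -> 2x2 FP32 tile shape per 128-bit segment.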
+ "80:" // Height 6: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z7.h }, p0/Z, [x26]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqh { z5.h }, p0/Z, [x24]\n"
+ "ld1rqh { z1.h }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "ld1rqh { z0.h }, p0/Z, [x21]\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1h { z1.h }, p5/Z, [x12]\n"
+ "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x11]\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n"
+ ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n"
+ ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x9]\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n"
+ ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
+ ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n"
+ ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n"
+ ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x12, #3, MUL VL]\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
+ "addvl x12, x12, #4\n"
+ ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n"
+ ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x11, #2, MUL VL]\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n"
+ ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ "addvl x11, x11, #4\n"
+ ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n"
+ ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n"
+ ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ "addvl x10, x10, #4\n"
+ ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n"
+ ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n"
+ ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ "addvl x9, x9, #4\n"
+ ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n"
+ ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n"
+ "bgt 80b\n"
+ "81:" // Height 6: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "ld1rqh { z5.h }, p0/Z, [x22]\n"
+ "ld1rqh { z0.h }, p0/Z, [x21]\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1h { z2.h }, p5/Z, [x12]\n"
+ "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n"
+ ".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x11]\n"
+ "subs x27, x27, #0x4\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n"
+ ".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n"
+ "addvl x10, x10, #2\n"
+ ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x9]\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n"
+ ".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d7 // bfmmla z23.s, z6.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ "ble 82f\n"
+ "ld1h { z2.h }, p5/Z, [x12]\n"
+ "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n"
+ ".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
+ ".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x11]\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n"
+ ".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n"
+ ".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "addvl x10, x10, #2\n"
+ ".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n"
+ ".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x9]\n"
+ ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x6462e42b // bfmmla z11.s, z1.h, z2.h\n"
+ ".inst 0x6462e473 // bfmmla z19.s, z3.h, z2.h\n"
+ ".inst 0x6462e4bb // bfmmla z27.s, z5.h, z2.h\n"
+ ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n"
+ ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
+ "82:" // Height 6: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 77b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "tbz %x[flags], #1, 83f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z1.s\n"
+ "fmin z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z1.s\n"
+ "fmin z14.s, p5/M, z14.s, z1.s\n"
+ "fmin z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z1.s\n"
+ "fmin z10.s, p5/M, z10.s, z1.s\n"
+ "fmin z11.s, p5/M, z11.s, z1.s\n"
+ "fmin z15.s, p5/M, z15.s, z1.s\n"
+ "fmin z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z1.s\n"
+ "fmin z22.s, p5/M, z22.s, z1.s\n"
+ "fmin z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z1.s\n"
+ "fmin z18.s, p5/M, z18.s, z1.s\n"
+ "fmin z19.s, p5/M, z19.s, z1.s\n"
+ "fmin z23.s, p5/M, z23.s, z1.s\n"
+ "fmin z28.s, p5/M, z28.s, z1.s\n"
+ "fmin z29.s, p5/M, z29.s, z1.s\n"
+ "fmin z30.s, p5/M, z30.s, z1.s\n"
+ "fmin z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z1.s\n"
+ "fmin z26.s, p5/M, z26.s, z1.s\n"
+ "fmin z27.s, p5/M, z27.s, z1.s\n"
+ "fmax z7.s, p5/M, z7.s, z0.s\n"
+ "fmax z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z0.s\n"
+ "fmax z14.s, p5/M, z14.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z0.s\n"
+ "fmax z10.s, p5/M, z10.s, z0.s\n"
+ "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z0.s\n"
+ "fmax z22.s, p5/M, z22.s, z0.s\n"
+ "fmax z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z0.s\n"
+ "fmax z18.s, p5/M, z18.s, z0.s\n"
+ "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z23.s, p5/M, z23.s, z0.s\n"
+ "fmax z28.s, p5/M, z28.s, z0.s\n"
+ "fmax z29.s, p5/M, z29.s, z0.s\n"
+ "fmax z30.s, p5/M, z30.s, z0.s\n"
+ "fmax z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z0.s\n"
+ "fmax z26.s, p5/M, z26.s, z0.s\n"
+ "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "83:" // Height 6: No activation
+ "st1w { z7.s }, p4, [x13]\n"
+ "st1w { z12.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z23.s }, p4, [x22]\n"
+ "st1w { z28.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z29.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z30.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "84:" // Height 6: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 72b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 86f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "85:" // Update direct input
+ "mov x20, #0xc\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "86:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp
new file mode 100644
index 0000000000..c42ad7e879
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<__fp16>, \
+ size_t, size_t, \
+ const __fp16 *, \
+ size_t, \
+ IndirectOutputArg<__fp16>, \
+ const __fp16 *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_ffhybrid_fp16_mla_6x4VL( ARGLIST );
+void sve_ffhybrid_fp16_mla_6x4VL_a64fx( ARGLIST );
+
+class cls_sve_ffhybrid_fp16_mla_6x4VL
+{
+public:
+ typedef __fp16 lhs_operand_type;
+ typedef __fp16 rhs_operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+ static unsigned int stripe_width()
+ {
+ return get_vector_length<__fp16>() * 1;
+ }
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL1VL_BL16;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<__fp16>() * 4;
+ }
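+    // e.g. with 512-bit SVE vectors, out_width() = 32 * 4 = 128 __fp16 columns
+    // per tile, paired with out_height() = 6 rows.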
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 1> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, __fp16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.51 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_ffhybrid_fp16_mla_6x4VL;
+ cls_sve_ffhybrid_fp16_mla_6x4VL(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_ffhybrid_fp16_mla_6x4VL_a64fx;
+ break;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp
new file mode 100644
index 0000000000..66601bd312
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp
@@ -0,0 +1,1530 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_ffhybrid_fp16_mla_6x4VL_a64fx (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg,
+ size_t M, size_t N, const __fp16 *B_ptr, size_t B_stride, IndirectOutputArg<__fp16> output_arg,
+ const __fp16 *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
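+        // +/-infinity defaults leave the result unclamped unless the activation
+        // switch below tightens them.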
+ __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+        __fp16 minval = -static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const __fp16 *B_ptr = {};
+ const __fp16 *cur_B_ptr = {};
+ size_t B_stride = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
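+    // The assembly reaches these fields via the %[offsetof_*] operands declared
+    // at the bottom, so this layout must stay in sync with that offsetof() list.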
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ ka.B_stride = B_stride;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
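+    // Flag bits consumed by the assembly: bit 0 = accumulate into existing
+    // output, bit 1 = apply min/max clamp, bit 2 = indirect output,
+    // bit 3 = indirect (string-based) input.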
+ __asm__ __volatile__(
+ "ptrue p4.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 66f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 53f\n"
+ "beq 40f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 27f\n"
+ "beq 14f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
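+      // Lay out four B panel pointers (x12, x11, x10, x9) one B_stride apart;
+      // when fewer than four vectors of N remain, trailing pointers fall back
+      // to x12 below so no panel pointer strays past the B buffer.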
+ "add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 3f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 3f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 3f\n"
+ "mov x11, x12\n"
+ "3:" // Height 1: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p3.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p0.h, x20, x14\n"
+ "cbz x15, 4f\n"
+ "ld1h { z8.h }, p4/Z, [x15]\n"
+ "ld1h { z9.h }, p4/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1h { z8.h }, p3/Z, [x13]\n"
+ "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 9f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
+ "ld1h { z7.h }, p4/Z, [x11]\n"
+ "ble 11f\n"
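+      // A64FX variant: each iteration broadcasts one __fp16 per A row (ld1rh)
+      // and multiplies it against four B vectors (fmla), advancing K by a
+      // single element (k_unroll = 1).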
+ "10:" // Height 1: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z17.h }, p4/Z, [x10]\n"
+ "ld1h { z16.h }, p4/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
+ "add x26, x26, #0x2\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ld1h { z7.h }, p4/Z, [x11]\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z17.h }, p4/Z, [x10]\n"
+ "ld1h { z16.h }, p4/Z, [x9]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "bne 7b\n"
+ "tbz %x[flags], #1, 12f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z16.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z17.h\n"
+ "fmin z9.h, p4/M, z9.h, z17.h\n"
+ "fmin z10.h, p4/M, z10.h, z17.h\n"
+ "fmin z11.h, p4/M, z11.h, z17.h\n"
+ "fmax z8.h, p4/M, z8.h, z16.h\n"
+ "fmax z9.h, p4/M, z9.h, z16.h\n"
+ "fmax z10.h, p4/M, z10.h, z16.h\n"
+ "fmax z11.h, p4/M, z11.h, z16.h\n"
+ "12:" // Height 1: No activation
+ "st1h { z8.h }, p3, [x13]\n"
+ "st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "13:" // Height 1: Writeback done
+ "dech x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 2b\n"
+ "b 80f\n"
+ "14:" // Height 2
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "15:" // Height 2: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 16f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 16f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 16f\n"
+ "mov x11, x12\n"
+ "16:" // Height 2: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p3.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p0.h, x20, x14\n"
+ "cbz x15, 17f\n"
+ "ld1h { z8.h }, p4/Z, [x15]\n"
+ "ld1h { z9.h }, p4/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x15, x15, #4\n"
+ "b 19f\n"
+ "17:" // Height 2: no bias
+ "tbz %x[flags], #0, 18f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x13, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x13]\n"
+ "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x20]\n"
+ "ld1h { z13.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 19f\n"
+ "18:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "19:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "20:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 21f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 22f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "b 22f\n"
+ "21:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "22:" // Height 2: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
+ "ld1h { z7.h }, p4/Z, [x11]\n"
+ "ble 24f\n"
+ "23:" // Height 2: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "ld1h { z17.h }, p4/Z, [x10]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "ld1h { z16.h }, p4/Z, [x9]\n"
+ "addvl x11, x11, #1\n"
+ "add x26, x26, #0x2\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z14.h, p4/M, z17.h, z1.h\n"
+ "add x25, x25, #0x2\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
+ "fmla z15.h, p4/M, z16.h, z1.h\n"
+ "addvl x10, x10, #1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "addvl x9, x9, #1\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
+ "ld1h { z7.h }, p4/Z, [x11]\n"
+ "bgt 23b\n"
+ "24:" // Height 2: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "ld1h { z17.h }, p4/Z, [x10]\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "ld1h { z16.h }, p4/Z, [x9]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z14.h, p4/M, z17.h, z1.h\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
+ "fmla z15.h, p4/M, z16.h, z1.h\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "bne 20b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "tbz %x[flags], #1, 25f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z16.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z17.h\n"
+ "fmin z9.h, p4/M, z9.h, z17.h\n"
+ "fmin z10.h, p4/M, z10.h, z17.h\n"
+ "fmin z11.h, p4/M, z11.h, z17.h\n"
+ "fmin z12.h, p4/M, z12.h, z17.h\n"
+ "fmin z13.h, p4/M, z13.h, z17.h\n"
+ "fmin z14.h, p4/M, z14.h, z17.h\n"
+ "fmin z15.h, p4/M, z15.h, z17.h\n"
+ "fmax z8.h, p4/M, z8.h, z16.h\n"
+ "fmax z9.h, p4/M, z9.h, z16.h\n"
+ "fmax z10.h, p4/M, z10.h, z16.h\n"
+ "fmax z11.h, p4/M, z11.h, z16.h\n"
+ "fmax z12.h, p4/M, z12.h, z16.h\n"
+ "fmax z13.h, p4/M, z13.h, z16.h\n"
+ "fmax z14.h, p4/M, z14.h, z16.h\n"
+ "fmax z15.h, p4/M, z15.h, z16.h\n"
+ "25:" // Height 2: No activation
+ "st1h { z8.h }, p3, [x13]\n"
+ "st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "26:" // Height 2: Writeback done
+ "dech x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 15b\n"
+ "b 80f\n"
+ "27:" // Height 3
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "28:" // Height 3: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 29f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 29f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 29f\n"
+ "mov x11, x12\n"
+ "29:" // Height 3: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p3.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p0.h, x20, x14\n"
+ "cbz x15, 30f\n"
+ "ld1h { z8.h }, p4/Z, [x15]\n"
+ "ld1h { z9.h }, p4/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 32f\n"
+ "30:" // Height 3: no bias
+ "tbz %x[flags], #0, 31f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x13, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x13]\n"
+ "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x21]\n"
+ "ld1h { z13.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x20]\n"
+ "ld1h { z17.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 32f\n"
+ "31:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "32:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "33:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 34f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 35f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "b 35f\n"
+ "34:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "35:" // Height 3: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
+ "ld1h { z7.h }, p4/Z, [x11]\n"
+ "ble 37f\n"
+ "36:" // Height 3: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z21.h }, p4/Z, [x10]\n"
+ "add x26, x26, #0x2\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "ld1h { z20.h }, p4/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "add x25, x25, #0x2\n"
+ "add x24, x24, #0x2\n"
+ "fmla z10.h, p4/M, z21.h, z0.h\n"
+ "fmla z14.h, p4/M, z21.h, z1.h\n"
+ "fmla z18.h, p4/M, z21.h, z2.h\n"
+ "fmla z11.h, p4/M, z20.h, z0.h\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.h, p4/M, z20.h, z1.h\n"
+ "fmla z19.h, p4/M, z20.h, z2.h\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
+ "ld1h { z7.h }, p4/Z, [x11]\n"
+ "bgt 36b\n"
+ "37:" // Height 3: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z21.h }, p4/Z, [x10]\n"
+ "cmp x28, x20\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "ld1h { z20.h }, p4/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z10.h, p4/M, z21.h, z0.h\n"
+ "fmla z14.h, p4/M, z21.h, z1.h\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.h, p4/M, z21.h, z2.h\n"
+ "fmla z11.h, p4/M, z20.h, z0.h\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.h, p4/M, z20.h, z1.h\n"
+ "fmla z19.h, p4/M, z20.h, z2.h\n"
+ "bne 33b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "tbz %x[flags], #1, 38f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z21.h }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z20.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z21.h\n"
+ "fmin z9.h, p4/M, z9.h, z21.h\n"
+ "fmin z10.h, p4/M, z10.h, z21.h\n"
+ "fmin z11.h, p4/M, z11.h, z21.h\n"
+ "fmin z12.h, p4/M, z12.h, z21.h\n"
+ "fmin z13.h, p4/M, z13.h, z21.h\n"
+ "fmin z14.h, p4/M, z14.h, z21.h\n"
+ "fmin z15.h, p4/M, z15.h, z21.h\n"
+ "fmin z16.h, p4/M, z16.h, z21.h\n"
+ "fmin z17.h, p4/M, z17.h, z21.h\n"
+ "fmin z18.h, p4/M, z18.h, z21.h\n"
+ "fmin z19.h, p4/M, z19.h, z21.h\n"
+ "fmax z8.h, p4/M, z8.h, z20.h\n"
+ "fmax z9.h, p4/M, z9.h, z20.h\n"
+ "fmax z10.h, p4/M, z10.h, z20.h\n"
+ "fmax z11.h, p4/M, z11.h, z20.h\n"
+ "fmax z12.h, p4/M, z12.h, z20.h\n"
+ "fmax z13.h, p4/M, z13.h, z20.h\n"
+ "fmax z14.h, p4/M, z14.h, z20.h\n"
+ "fmax z15.h, p4/M, z15.h, z20.h\n"
+ "fmax z16.h, p4/M, z16.h, z20.h\n"
+ "fmax z17.h, p4/M, z17.h, z20.h\n"
+ "fmax z18.h, p4/M, z18.h, z20.h\n"
+ "fmax z19.h, p4/M, z19.h, z20.h\n"
+ "38:" // Height 3: No activation
+ "st1h { z8.h }, p3, [x13]\n"
+ "st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "39:" // Height 3: Writeback done
+ "dech x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 28b\n"
+ "b 80f\n"
+ "40:" // Height 4
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "41:" // Height 4: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 42f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 42f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 42f\n"
+ "mov x11, x12\n"
+ "42:" // Height 4: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p3.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p0.h, x20, x14\n"
+ "cbz x15, 43f\n"
+ "ld1h { z8.h }, p4/Z, [x15]\n"
+ "ld1h { z9.h }, p4/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 45f\n"
+ "43:" // Height 4: no bias
+ "tbz %x[flags], #0, 44f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x13, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x13]\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x22]\n"
+ "ld1h { z13.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x21]\n"
+ "ld1h { z17.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x20]\n"
+ "ld1h { z21.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 45f\n"
+ "44:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "45:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "46:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 47f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 48f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "b 48f\n"
+ "47:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "48:" // Height 4: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
+ "ld1h { z7.h }, p4/Z, [x11]\n"
+ "ble 50f\n"
+ "49:" // Height 4: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "ld1h { z25.h }, p4/Z, [x10]\n"
+ "add x26, x26, #0x2\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "subs x27, x27, #0x1\n"
+ "add x25, x25, #0x2\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "ld1h { z24.h }, p4/Z, [x9]\n"
+ "add x24, x24, #0x2\n"
+ "add x23, x23, #0x2\n"
+ "fmla z10.h, p4/M, z25.h, z0.h\n"
+ "fmla z14.h, p4/M, z25.h, z1.h\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.h, p4/M, z25.h, z2.h\n"
+ "fmla z22.h, p4/M, z25.h, z3.h\n"
+ "addvl x9, x9, #1\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
+ "fmla z11.h, p4/M, z24.h, z0.h\n"
+ "fmla z15.h, p4/M, z24.h, z1.h\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "fmla z19.h, p4/M, z24.h, z2.h\n"
+ "fmla z23.h, p4/M, z24.h, z3.h\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
+ "ld1h { z7.h }, p4/Z, [x11]\n"
+ "bgt 49b\n"
+ "50:" // Height 4: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "ld1h { z25.h }, p4/Z, [x10]\n"
+ "cmp x28, x20\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "ld1h { z24.h }, p4/Z, [x9]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z10.h, p4/M, z25.h, z0.h\n"
+ "fmla z14.h, p4/M, z25.h, z1.h\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.h, p4/M, z25.h, z2.h\n"
+ "fmla z22.h, p4/M, z25.h, z3.h\n"
+ "fmla z11.h, p4/M, z24.h, z0.h\n"
+ "fmla z15.h, p4/M, z24.h, z1.h\n"
+ "fmla z19.h, p4/M, z24.h, z2.h\n"
+ "fmla z23.h, p4/M, z24.h, z3.h\n"
+ "bne 46b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "tbz %x[flags], #1, 51f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z25.h }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z24.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z25.h\n"
+ "fmin z9.h, p4/M, z9.h, z25.h\n"
+ "fmin z10.h, p4/M, z10.h, z25.h\n"
+ "fmin z11.h, p4/M, z11.h, z25.h\n"
+ "fmin z12.h, p4/M, z12.h, z25.h\n"
+ "fmin z13.h, p4/M, z13.h, z25.h\n"
+ "fmin z14.h, p4/M, z14.h, z25.h\n"
+ "fmin z15.h, p4/M, z15.h, z25.h\n"
+ "fmin z16.h, p4/M, z16.h, z25.h\n"
+ "fmin z17.h, p4/M, z17.h, z25.h\n"
+ "fmin z18.h, p4/M, z18.h, z25.h\n"
+ "fmin z19.h, p4/M, z19.h, z25.h\n"
+ "fmin z20.h, p4/M, z20.h, z25.h\n"
+ "fmin z21.h, p4/M, z21.h, z25.h\n"
+ "fmin z22.h, p4/M, z22.h, z25.h\n"
+ "fmin z23.h, p4/M, z23.h, z25.h\n"
+ "fmax z8.h, p4/M, z8.h, z24.h\n"
+ "fmax z9.h, p4/M, z9.h, z24.h\n"
+ "fmax z10.h, p4/M, z10.h, z24.h\n"
+ "fmax z11.h, p4/M, z11.h, z24.h\n"
+ "fmax z12.h, p4/M, z12.h, z24.h\n"
+ "fmax z13.h, p4/M, z13.h, z24.h\n"
+ "fmax z14.h, p4/M, z14.h, z24.h\n"
+ "fmax z15.h, p4/M, z15.h, z24.h\n"
+ "fmax z16.h, p4/M, z16.h, z24.h\n"
+ "fmax z17.h, p4/M, z17.h, z24.h\n"
+ "fmax z18.h, p4/M, z18.h, z24.h\n"
+ "fmax z19.h, p4/M, z19.h, z24.h\n"
+ "fmax z20.h, p4/M, z20.h, z24.h\n"
+ "fmax z21.h, p4/M, z21.h, z24.h\n"
+ "fmax z22.h, p4/M, z22.h, z24.h\n"
+ "fmax z23.h, p4/M, z23.h, z24.h\n"
+ "51:" // Height 4: No activation
+ "st1h { z8.h }, p3, [x13]\n"
+ "st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x23]\n"
+ "st1h { z21.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x23, #3, MUL VL]\n"
+ "52:" // Height 4: Writeback done
+ "dech x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 41b\n"
+ "b 80f\n"
+ "53:" // Height 5
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "54:" // Height 5: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 55f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 55f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 55f\n"
+ "mov x11, x12\n"
+ "55:" // Height 5: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p3.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p0.h, x20, x14\n"
+ "cbz x15, 56f\n"
+ "ld1h { z8.h }, p4/Z, [x15]\n"
+ "ld1h { z9.h }, p4/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 58f\n"
+ "56:" // Height 5: no bias
+ "tbz %x[flags], #0, 57f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x13]\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x23]\n"
+ "ld1h { z13.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x22]\n"
+ "ld1h { z17.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x21]\n"
+ "ld1h { z21.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z24.h }, p3/Z, [x20]\n"
+ "ld1h { z25.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z26.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z27.h }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 58f\n"
+ "57:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "58:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "59:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 60f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 61f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "b 61f\n"
+ "60:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "61:" // Height 5: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
+ "ld1rh { z4.h }, p4/Z, [x22]\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
+ "ld1h { z7.h }, p4/Z, [x11]\n"
+ "ble 63f\n"
+ "62:" // Height 5: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "add x26, x26, #0x2\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, p4/M, z6.h, z4.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z29.h }, p4/Z, [x10]\n"
+ "add x25, x25, #0x2\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "add x24, x24, #0x2\n"
+ "add x23, x23, #0x2\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "fmla z25.h, p4/M, z7.h, z4.h\n"
+ "ld1h { z28.h }, p4/Z, [x9]\n"
+ "add x22, x22, #0x2\n"
+ "fmla z10.h, p4/M, z29.h, z0.h\n"
+ "fmla z14.h, p4/M, z29.h, z1.h\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.h, p4/M, z29.h, z2.h\n"
+ "fmla z22.h, p4/M, z29.h, z3.h\n"
+ "fmla z26.h, p4/M, z29.h, z4.h\n"
+ "fmla z11.h, p4/M, z28.h, z0.h\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
+ "fmla z15.h, p4/M, z28.h, z1.h\n"
+ "fmla z19.h, p4/M, z28.h, z2.h\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "fmla z23.h, p4/M, z28.h, z3.h\n"
+ "fmla z27.h, p4/M, z28.h, z4.h\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
+ "ld1rh { z4.h }, p4/Z, [x22]\n"
+ "ld1h { z7.h }, p4/Z, [x11]\n"
+ "bgt 62b\n"
+ "63:" // Height 5: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "cmp x28, x20\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, p4/M, z6.h, z4.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z29.h }, p4/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "fmla z25.h, p4/M, z7.h, z4.h\n"
+ "ld1h { z28.h }, p4/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, p4/M, z29.h, z0.h\n"
+ "fmla z14.h, p4/M, z29.h, z1.h\n"
+ "fmla z18.h, p4/M, z29.h, z2.h\n"
+ "fmla z22.h, p4/M, z29.h, z3.h\n"
+ "fmla z26.h, p4/M, z29.h, z4.h\n"
+ "fmla z11.h, p4/M, z28.h, z0.h\n"
+ "fmla z15.h, p4/M, z28.h, z1.h\n"
+ "fmla z19.h, p4/M, z28.h, z2.h\n"
+ "fmla z23.h, p4/M, z28.h, z3.h\n"
+ "fmla z27.h, p4/M, z28.h, z4.h\n"
+ "bne 59b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "tbz %x[flags], #1, 64f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z29.h }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z28.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z29.h\n"
+ "fmin z9.h, p4/M, z9.h, z29.h\n"
+ "fmin z10.h, p4/M, z10.h, z29.h\n"
+ "fmin z11.h, p4/M, z11.h, z29.h\n"
+ "fmin z12.h, p4/M, z12.h, z29.h\n"
+ "fmin z13.h, p4/M, z13.h, z29.h\n"
+ "fmin z14.h, p4/M, z14.h, z29.h\n"
+ "fmin z15.h, p4/M, z15.h, z29.h\n"
+ "fmin z16.h, p4/M, z16.h, z29.h\n"
+ "fmin z17.h, p4/M, z17.h, z29.h\n"
+ "fmin z18.h, p4/M, z18.h, z29.h\n"
+ "fmin z19.h, p4/M, z19.h, z29.h\n"
+ "fmin z20.h, p4/M, z20.h, z29.h\n"
+ "fmin z21.h, p4/M, z21.h, z29.h\n"
+ "fmin z22.h, p4/M, z22.h, z29.h\n"
+ "fmin z23.h, p4/M, z23.h, z29.h\n"
+ "fmin z24.h, p4/M, z24.h, z29.h\n"
+ "fmin z25.h, p4/M, z25.h, z29.h\n"
+ "fmin z26.h, p4/M, z26.h, z29.h\n"
+ "fmin z27.h, p4/M, z27.h, z29.h\n"
+ "fmax z8.h, p4/M, z8.h, z28.h\n"
+ "fmax z9.h, p4/M, z9.h, z28.h\n"
+ "fmax z10.h, p4/M, z10.h, z28.h\n"
+ "fmax z11.h, p4/M, z11.h, z28.h\n"
+ "fmax z12.h, p4/M, z12.h, z28.h\n"
+ "fmax z13.h, p4/M, z13.h, z28.h\n"
+ "fmax z14.h, p4/M, z14.h, z28.h\n"
+ "fmax z15.h, p4/M, z15.h, z28.h\n"
+ "fmax z16.h, p4/M, z16.h, z28.h\n"
+ "fmax z17.h, p4/M, z17.h, z28.h\n"
+ "fmax z18.h, p4/M, z18.h, z28.h\n"
+ "fmax z19.h, p4/M, z19.h, z28.h\n"
+ "fmax z20.h, p4/M, z20.h, z28.h\n"
+ "fmax z21.h, p4/M, z21.h, z28.h\n"
+ "fmax z22.h, p4/M, z22.h, z28.h\n"
+ "fmax z23.h, p4/M, z23.h, z28.h\n"
+ "fmax z24.h, p4/M, z24.h, z28.h\n"
+ "fmax z25.h, p4/M, z25.h, z28.h\n"
+ "fmax z26.h, p4/M, z26.h, z28.h\n"
+ "fmax z27.h, p4/M, z27.h, z28.h\n"
+ "64:" // Height 5: No activation
+ "st1h { z8.h }, p3, [x13]\n"
+ "st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x23]\n"
+ "st1h { z21.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p3, [x22]\n"
+ "st1h { z25.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [x22, #3, MUL VL]\n"
+ "65:" // Height 5: Writeback done
+ "dech x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 54b\n"
+ "b 80f\n"
+ "66:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0xc\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
+ "67:" // Height 6: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 68f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 68f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 68f\n"
+ "mov x11, x12\n"
+ "68:" // Height 6: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p3.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p0.h, x20, x14\n"
+ "cbz x15, 69f\n"
+ "ld1h { z8.h }, p4/Z, [x15]\n"
+ "ld1h { z9.h }, p4/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 71f\n"
+ "69:" // Height 6: no bias
+ "tbz %x[flags], #0, 70f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x13, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x13]\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x24]\n"
+ "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x23]\n"
+ "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x22]\n"
+ "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z24.h }, p3/Z, [x21]\n"
+ "ld1h { z25.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z26.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z27.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z28.h }, p3/Z, [x20]\n"
+ "ld1h { z29.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z30.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z31.h }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 71f\n"
+ "70:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "71:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "72:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 73f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 74f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "add x21, x21, x20, LSL #1\n"
+ "b 74f\n"
+ "73:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
+ "74:" // Height 6: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
+ "ld1rh { z4.h }, p4/Z, [x22]\n"
+ "ld1rh { z5.h }, p4/Z, [x21]\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
+ "ld1h { z7.h }, p4/Z, [x11]\n"
+ "ble 76f\n"
+ "75:" // Height 6: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "add x26, x26, #0x2\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, p4/M, z6.h, z4.h\n"
+ "fmla z28.h, p4/M, z6.h, z5.h\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "add x25, x25, #0x2\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "add x24, x24, #0x2\n"
+ "add x23, x23, #0x2\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "add x22, x22, #0x2\n"
+ "add x21, x21, #0x2\n"
+ "fmla z25.h, p4/M, z7.h, z4.h\n"
+ "fmla z29.h, p4/M, z7.h, z5.h\n"
+ "ld1h { z7.h }, p4/Z, [x9]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.h, p4/M, z6.h, z2.h\n"
+ "fmla z22.h, p4/M, z6.h, z3.h\n"
+ "fmla z26.h, p4/M, z6.h, z4.h\n"
+ "fmla z30.h, p4/M, z6.h, z5.h\n"
+ "ld1h { z6.h }, p4/Z, [x12]\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
+ "fmla z27.h, p4/M, z7.h, z4.h\n"
+ "fmla z31.h, p4/M, z7.h, z5.h\n"
+ "ld1rh { z4.h }, p4/Z, [x22]\n"
+ "ld1rh { z5.h }, p4/Z, [x21]\n"
+ "ld1h { z7.h }, p4/Z, [x11]\n"
+ "bgt 75b\n"
+ "76:" // Height 6: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "cmp x28, x20\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, p4/M, z6.h, z4.h\n"
+ "fmla z28.h, p4/M, z6.h, z5.h\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "fmla z25.h, p4/M, z7.h, z4.h\n"
+ "fmla z29.h, p4/M, z7.h, z5.h\n"
+ "ld1h { z7.h }, p4/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z18.h, p4/M, z6.h, z2.h\n"
+ "fmla z22.h, p4/M, z6.h, z3.h\n"
+ "fmla z26.h, p4/M, z6.h, z4.h\n"
+ "fmla z30.h, p4/M, z6.h, z5.h\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "fmla z27.h, p4/M, z7.h, z4.h\n"
+ "fmla z31.h, p4/M, z7.h, z5.h\n"
+ "bne 72b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "tbz %x[flags], #1, 77f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z0.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z1.h\n"
+ "fmin z9.h, p4/M, z9.h, z1.h\n"
+ "fmin z10.h, p4/M, z10.h, z1.h\n"
+ "fmin z11.h, p4/M, z11.h, z1.h\n"
+ "fmin z12.h, p4/M, z12.h, z1.h\n"
+ "fmin z13.h, p4/M, z13.h, z1.h\n"
+ "fmin z14.h, p4/M, z14.h, z1.h\n"
+ "fmin z15.h, p4/M, z15.h, z1.h\n"
+ "fmin z16.h, p4/M, z16.h, z1.h\n"
+ "fmin z17.h, p4/M, z17.h, z1.h\n"
+ "fmin z18.h, p4/M, z18.h, z1.h\n"
+ "fmin z19.h, p4/M, z19.h, z1.h\n"
+ "fmin z20.h, p4/M, z20.h, z1.h\n"
+ "fmin z21.h, p4/M, z21.h, z1.h\n"
+ "fmin z22.h, p4/M, z22.h, z1.h\n"
+ "fmin z23.h, p4/M, z23.h, z1.h\n"
+ "fmin z24.h, p4/M, z24.h, z1.h\n"
+ "fmin z25.h, p4/M, z25.h, z1.h\n"
+ "fmin z26.h, p4/M, z26.h, z1.h\n"
+ "fmin z27.h, p4/M, z27.h, z1.h\n"
+ "fmin z28.h, p4/M, z28.h, z1.h\n"
+ "fmin z29.h, p4/M, z29.h, z1.h\n"
+ "fmin z30.h, p4/M, z30.h, z1.h\n"
+ "fmin z31.h, p4/M, z31.h, z1.h\n"
+ "fmax z8.h, p4/M, z8.h, z0.h\n"
+ "fmax z9.h, p4/M, z9.h, z0.h\n"
+ "fmax z10.h, p4/M, z10.h, z0.h\n"
+ "fmax z11.h, p4/M, z11.h, z0.h\n"
+ "fmax z12.h, p4/M, z12.h, z0.h\n"
+ "fmax z13.h, p4/M, z13.h, z0.h\n"
+ "fmax z14.h, p4/M, z14.h, z0.h\n"
+ "fmax z15.h, p4/M, z15.h, z0.h\n"
+ "fmax z16.h, p4/M, z16.h, z0.h\n"
+ "fmax z17.h, p4/M, z17.h, z0.h\n"
+ "fmax z18.h, p4/M, z18.h, z0.h\n"
+ "fmax z19.h, p4/M, z19.h, z0.h\n"
+ "fmax z20.h, p4/M, z20.h, z0.h\n"
+ "fmax z21.h, p4/M, z21.h, z0.h\n"
+ "fmax z22.h, p4/M, z22.h, z0.h\n"
+ "fmax z23.h, p4/M, z23.h, z0.h\n"
+ "fmax z24.h, p4/M, z24.h, z0.h\n"
+ "fmax z25.h, p4/M, z25.h, z0.h\n"
+ "fmax z26.h, p4/M, z26.h, z0.h\n"
+ "fmax z27.h, p4/M, z27.h, z0.h\n"
+ "fmax z28.h, p4/M, z28.h, z0.h\n"
+ "fmax z29.h, p4/M, z29.h, z0.h\n"
+ "fmax z30.h, p4/M, z30.h, z0.h\n"
+ "fmax z31.h, p4/M, z31.h, z0.h\n"
+ "77:" // Height 6: No activation
+ "st1h { z8.h }, p3, [x13]\n"
+ "st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x23]\n"
+ "st1h { z21.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p3, [x22]\n"
+ "st1h { z25.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [x22, #3, MUL VL]\n"
+ "st1h { z28.h }, p3, [x21]\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z30.h }, p1, [x21, #2, MUL VL]\n"
+ "st1h { z31.h }, p0, [x21, #3, MUL VL]\n"
+ "78:" // Height 6: Writeback done
+ "dech x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 67b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 80f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 79f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "79:" // Update direct input
+ "mov x20, #0xc\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "80:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..842db1a4fc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp
@@ -0,0 +1,3318 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_ffhybrid_fp16_mla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg,
+ size_t M, size_t N, const __fp16 *B_ptr, size_t B_stride, IndirectOutputArg<__fp16> output_arg,
+ const __fp16 *bias, Activation act, bool accumulate
+)
+{
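+    // All kernel arguments are gathered into one struct so the inline assembly
+    // below can address each field as an offsetof(KernelArgs, ...) immediate
+    // relative to args_ptr.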
+ struct KernelArgs {
+ __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const __fp16 *B_ptr = {};
+ const __fp16 *cur_B_ptr = {};
+ size_t B_stride = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
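+    // flags bits consumed by the assembly:
+    //   bit 0 (0x1): accumulate into the existing output
+    //   bit 1 (0x2): apply the min/max activation clamp
+    //   bit 2 (0x4): output is indirect
+    //   bit 3 (0x8): input is indirect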
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ ka.B_stride = B_stride;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
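+    // Register roles in the assembly: x13 = output pointer, x14 = columns of N
+    // remaining, x15 = bias pointer, x12/x11/x10/x9 = the four B panel
+    // pointers, x28 = string index, x27 = elements left in the current string,
+    // x26 and below = per-row A pointers; p5 is all-true, p4-p1 predicate the
+    // four output column vectors and p0 guards each A load.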
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 71f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 57f\n"
+ "beq 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 3f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 3f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 3f\n"
+ "mov x11, x12\n"
+ "3:" // Height 1: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p3.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x14\n"
+ "cbz x15, 4f\n"
+ "ld1h { z8.h }, p5/Z, [x15]\n"
+ "ld1h { z9.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 9f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "cmp x27, #0x8\n"
+ "ble 11f\n"
+ "10:" // Height 1: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1h { z16.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z9.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ "fmla z10.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "fmla z11.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #4, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #4, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #5, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #5, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #5, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #6, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #6, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x12, #7, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[7]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #7, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1h { z16.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "cmp x27, #0x8\n"
+ "fmla z10.h, z17.h, z0.h[7]\n"
+ "fmla z11.h, z16.h, z0.h[7]\n"
+ "add x26, x26, #0x10\n"
+ "addvl x12, x12, #8\n"
+ "addvl x11, x11, #8\n"
+ "addvl x10, x10, #8\n"
+ "addvl x9, x9, #8\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1h { z16.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z9.h, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z17.h, z0.h[0]\n"
+ "fmla z11.h, z16.h, z0.h[0]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 12f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[1]\n"
+ "fmla z9.h, z16.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[1]\n"
+ "fmla z11.h, z16.h, z0.h[1]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 12f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[2]\n"
+ "fmla z9.h, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[2]\n"
+ "fmla z11.h, z16.h, z0.h[2]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 12f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[3]\n"
+ "fmla z9.h, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[3]\n"
+ "fmla z11.h, z16.h, z0.h[3]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 12f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[4]\n"
+ "fmla z9.h, z16.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[4]\n"
+ "fmla z11.h, z16.h, z0.h[4]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 12f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[5]\n"
+ "fmla z9.h, z16.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[5]\n"
+ "fmla z11.h, z16.h, z0.h[5]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 12f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[6]\n"
+ "fmla z9.h, z16.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[6]\n"
+ "fmla z11.h, z16.h, z0.h[6]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 12f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[7]\n"
+ "fmla z9.h, z16.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z17.h, z0.h[7]\n"
+ "fmla z11.h, z16.h, z0.h[7]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "12:" // Height 1: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 7b\n"
+ "tbz %x[flags], #1, 13f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z16.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z17.h\n"
+ "fmin z9.h, p5/M, z9.h, z17.h\n"
+ "fmin z10.h, p5/M, z10.h, z17.h\n"
+ "fmin z11.h, p5/M, z11.h, z17.h\n"
+ "fmax z8.h, p5/M, z8.h, z16.h\n"
+ "fmax z9.h, p5/M, z9.h, z16.h\n"
+ "fmax z10.h, p5/M, z10.h, z16.h\n"
+ "fmax z11.h, p5/M, z11.h, z16.h\n"
+ "13:" // Height 1: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "14:" // Height 1: Writeback done
+ "dech x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 2b\n"
+ "b 86f\n"
+ "15:" // Height 2
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "16:" // Height 2: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 17f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 17f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 17f\n"
+ "mov x11, x12\n"
+ "17:" // Height 2: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p3.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x14\n"
+ "cbz x15, 18f\n"
+ "ld1h { z8.h }, p5/Z, [x15]\n"
+ "ld1h { z9.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x15, x15, #4\n"
+ "b 20f\n"
+ "18:" // Height 2: no bias
+ "tbz %x[flags], #0, 19f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x13, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x20]\n"
+ "ld1h { z13.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 20f\n"
+ "19:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "20:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "21:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 22f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 23f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "b 23f\n"
+ "22:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "23:" // Height 2: input setup done
+ "cmp x27, #0x8\n"
+ "ble 25f\n"
+ "24:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z1.h[0]\n"
+ "fmla z12.h, z17.h, z0.h[0]\n"
+ "fmla z9.h, z16.h, z1.h[0]\n"
+ "fmla z13.h, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z17.h, z1.h[0]\n"
+ "fmla z14.h, z17.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "cmp x27, #0x8\n"
+ "fmla z11.h, z16.h, z1.h[0]\n"
+ "fmla z15.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z8.h, z17.h, z1.h[1]\n"
+ "fmla z12.h, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z9.h, z16.h, z1.h[1]\n"
+ "fmla z13.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[1]\n"
+ "fmla z14.h, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[1]\n"
+ "fmla z15.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[2]\n"
+ "fmla z12.h, z17.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[2]\n"
+ "fmla z13.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[2]\n"
+ "fmla z14.h, z17.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[2]\n"
+ "fmla z15.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[3]\n"
+ "fmla z12.h, z17.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[3]\n"
+ "fmla z13.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[3]\n"
+ "fmla z14.h, z17.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #4, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[3]\n"
+ "fmla z15.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #4, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[4]\n"
+ "fmla z12.h, z17.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[4]\n"
+ "fmla z13.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[4]\n"
+ "fmla z14.h, z17.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #5, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[4]\n"
+ "fmla z15.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #5, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[5]\n"
+ "fmla z12.h, z17.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[5]\n"
+ "fmla z13.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #5, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[5]\n"
+ "fmla z14.h, z17.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #6, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[5]\n"
+ "fmla z15.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #6, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[6]\n"
+ "fmla z12.h, z17.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[6]\n"
+ "fmla z13.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[6]\n"
+ "fmla z14.h, z17.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x12, #7, MUL VL]\n"
+ "addvl x12, x12, #8\n"
+ "fmla z11.h, z16.h, z1.h[6]\n"
+ "fmla z15.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #8\n"
+ "fmla z8.h, z17.h, z1.h[7]\n"
+ "fmla z12.h, z17.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ "fmla z9.h, z16.h, z1.h[7]\n"
+ "fmla z13.h, z16.h, z0.h[7]\n"
+ "ld1h { z16.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ "fmla z10.h, z17.h, z1.h[7]\n"
+ "fmla z14.h, z17.h, z0.h[7]\n"
+ "fmla z11.h, z16.h, z1.h[7]\n"
+ "fmla z15.h, z16.h, z0.h[7]\n"
+ "bgt 24b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[0]\n"
+ "fmla z12.h, z17.h, z1.h[0]\n"
+ "fmla z9.h, z16.h, z0.h[0]\n"
+ "fmla z13.h, z16.h, z1.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z17.h, z0.h[0]\n"
+ "fmla z14.h, z17.h, z1.h[0]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z11.h, z16.h, z0.h[0]\n"
+ "fmla z15.h, z16.h, z1.h[0]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 26f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[1]\n"
+ "fmla z12.h, z17.h, z1.h[1]\n"
+ "fmla z9.h, z16.h, z0.h[1]\n"
+ "fmla z13.h, z16.h, z1.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[1]\n"
+ "fmla z14.h, z17.h, z1.h[1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z11.h, z16.h, z0.h[1]\n"
+ "fmla z15.h, z16.h, z1.h[1]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 26f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[2]\n"
+ "fmla z12.h, z17.h, z1.h[2]\n"
+ "fmla z9.h, z16.h, z0.h[2]\n"
+ "fmla z13.h, z16.h, z1.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[2]\n"
+ "fmla z14.h, z17.h, z1.h[2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z11.h, z16.h, z0.h[2]\n"
+ "fmla z15.h, z16.h, z1.h[2]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 26f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[3]\n"
+ "fmla z12.h, z17.h, z1.h[3]\n"
+ "fmla z9.h, z16.h, z0.h[3]\n"
+ "fmla z13.h, z16.h, z1.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[3]\n"
+ "fmla z14.h, z17.h, z1.h[3]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z11.h, z16.h, z0.h[3]\n"
+ "fmla z15.h, z16.h, z1.h[3]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 26f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[4]\n"
+ "fmla z12.h, z17.h, z1.h[4]\n"
+ "fmla z9.h, z16.h, z0.h[4]\n"
+ "fmla z13.h, z16.h, z1.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[4]\n"
+ "fmla z14.h, z17.h, z1.h[4]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z11.h, z16.h, z0.h[4]\n"
+ "fmla z15.h, z16.h, z1.h[4]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 26f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[5]\n"
+ "fmla z12.h, z17.h, z1.h[5]\n"
+ "fmla z9.h, z16.h, z0.h[5]\n"
+ "fmla z13.h, z16.h, z1.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[5]\n"
+ "fmla z14.h, z17.h, z1.h[5]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z11.h, z16.h, z0.h[5]\n"
+ "fmla z15.h, z16.h, z1.h[5]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 26f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[6]\n"
+ "fmla z12.h, z17.h, z1.h[6]\n"
+ "fmla z9.h, z16.h, z0.h[6]\n"
+ "fmla z13.h, z16.h, z1.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[6]\n"
+ "fmla z14.h, z17.h, z1.h[6]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z11.h, z16.h, z0.h[6]\n"
+ "fmla z15.h, z16.h, z1.h[6]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 26f\n"
+ "ld1h { z17.h }, p5/Z, [x12]\n"
+ "ld1h { z16.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z17.h, z0.h[7]\n"
+ "fmla z12.h, z17.h, z1.h[7]\n"
+ "fmla z9.h, z16.h, z0.h[7]\n"
+ "fmla z13.h, z16.h, z1.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z17.h, z0.h[7]\n"
+ "fmla z14.h, z17.h, z1.h[7]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z11.h, z16.h, z0.h[7]\n"
+ "fmla z15.h, z16.h, z1.h[7]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 21b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z16.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z17.h\n"
+ "fmin z9.h, p5/M, z9.h, z17.h\n"
+ "fmin z10.h, p5/M, z10.h, z17.h\n"
+ "fmin z11.h, p5/M, z11.h, z17.h\n"
+ "fmin z12.h, p5/M, z12.h, z17.h\n"
+ "fmin z13.h, p5/M, z13.h, z17.h\n"
+ "fmin z14.h, p5/M, z14.h, z17.h\n"
+ "fmin z15.h, p5/M, z15.h, z17.h\n"
+ "fmax z8.h, p5/M, z8.h, z16.h\n"
+ "fmax z9.h, p5/M, z9.h, z16.h\n"
+ "fmax z10.h, p5/M, z10.h, z16.h\n"
+ "fmax z11.h, p5/M, z11.h, z16.h\n"
+ "fmax z12.h, p5/M, z12.h, z16.h\n"
+ "fmax z13.h, p5/M, z13.h, z16.h\n"
+ "fmax z14.h, p5/M, z14.h, z16.h\n"
+ "fmax z15.h, p5/M, z15.h, z16.h\n"
+ "27:" // Height 2: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "28:" // Height 2: Writeback done
+ "dech x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 16b\n"
+ "b 86f\n"
+ "29:" // Height 3
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "30:" // Height 3: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 31f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 31f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 31f\n"
+ "mov x11, x12\n"
+ "31:" // Height 3: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p3.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x14\n"
+ "cbz x15, 32f\n"
+ "ld1h { z8.h }, p5/Z, [x15]\n"
+ "ld1h { z9.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 34f\n"
+ "32:" // Height 3: no bias
+ "tbz %x[flags], #0, 33f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x13, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x21]\n"
+ "ld1h { z13.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x20]\n"
+ "ld1h { z17.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 34f\n"
+ "33:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 37f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "37:" // Height 3: input setup done
+ "cmp x27, #0x8\n"
+ "ble 39f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x24]\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z21.h, z2.h[0]\n"
+ "fmla z12.h, z21.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z16.h, z21.h, z0.h[0]\n"
+ "fmla z9.h, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "fmla z13.h, z20.h, z1.h[0]\n"
+ "fmla z17.h, z20.h, z0.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
+ "cmp x27, #0x8\n"
+ "fmla z10.h, z21.h, z2.h[0]\n"
+ "fmla z14.h, z21.h, z1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "fmla z18.h, z21.h, z0.h[0]\n"
+ "fmla z11.h, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z15.h, z20.h, z1.h[0]\n"
+ "fmla z19.h, z20.h, z0.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[1]\n"
+ "fmla z12.h, z21.h, z1.h[1]\n"
+ "fmla z16.h, z21.h, z0.h[1]\n"
+ "fmla z9.h, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[1]\n"
+ "fmla z17.h, z20.h, z0.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[1]\n"
+ "fmla z14.h, z21.h, z1.h[1]\n"
+ "fmla z18.h, z21.h, z0.h[1]\n"
+ "fmla z11.h, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[1]\n"
+ "fmla z19.h, z20.h, z0.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[2]\n"
+ "fmla z12.h, z21.h, z1.h[2]\n"
+ "fmla z16.h, z21.h, z0.h[2]\n"
+ "fmla z9.h, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[2]\n"
+ "fmla z17.h, z20.h, z0.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[2]\n"
+ "fmla z14.h, z21.h, z1.h[2]\n"
+ "fmla z18.h, z21.h, z0.h[2]\n"
+ "fmla z11.h, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[2]\n"
+ "fmla z19.h, z20.h, z0.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[3]\n"
+ "fmla z12.h, z21.h, z1.h[3]\n"
+ "fmla z16.h, z21.h, z0.h[3]\n"
+ "fmla z9.h, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[3]\n"
+ "fmla z17.h, z20.h, z0.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[3]\n"
+ "fmla z14.h, z21.h, z1.h[3]\n"
+ "fmla z18.h, z21.h, z0.h[3]\n"
+ "fmla z11.h, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #4, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[3]\n"
+ "fmla z19.h, z20.h, z0.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #4, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[4]\n"
+ "fmla z12.h, z21.h, z1.h[4]\n"
+ "fmla z16.h, z21.h, z0.h[4]\n"
+ "fmla z9.h, z20.h, z2.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[4]\n"
+ "fmla z17.h, z20.h, z0.h[4]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[4]\n"
+ "fmla z14.h, z21.h, z1.h[4]\n"
+ "fmla z18.h, z21.h, z0.h[4]\n"
+ "fmla z11.h, z20.h, z2.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #5, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[4]\n"
+ "fmla z19.h, z20.h, z0.h[4]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #5, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[5]\n"
+ "fmla z12.h, z21.h, z1.h[5]\n"
+ "fmla z16.h, z21.h, z0.h[5]\n"
+ "fmla z9.h, z20.h, z2.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[5]\n"
+ "fmla z17.h, z20.h, z0.h[5]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #5, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[5]\n"
+ "fmla z14.h, z21.h, z1.h[5]\n"
+ "fmla z18.h, z21.h, z0.h[5]\n"
+ "fmla z11.h, z20.h, z2.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #6, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[5]\n"
+ "fmla z19.h, z20.h, z0.h[5]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #6, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[6]\n"
+ "fmla z12.h, z21.h, z1.h[6]\n"
+ "fmla z16.h, z21.h, z0.h[6]\n"
+ "fmla z9.h, z20.h, z2.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[6]\n"
+ "fmla z17.h, z20.h, z0.h[6]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[6]\n"
+ "fmla z14.h, z21.h, z1.h[6]\n"
+ "fmla z18.h, z21.h, z0.h[6]\n"
+ "fmla z11.h, z20.h, z2.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x12, #7, MUL VL]\n"
+ "addvl x12, x12, #8\n"
+ "fmla z15.h, z20.h, z1.h[6]\n"
+ "fmla z19.h, z20.h, z0.h[6]\n"
+ "ld1h { z20.h }, p5/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #8\n"
+ "fmla z8.h, z21.h, z2.h[7]\n"
+ "fmla z12.h, z21.h, z1.h[7]\n"
+ "fmla z16.h, z21.h, z0.h[7]\n"
+ "fmla z9.h, z20.h, z2.h[7]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ "fmla z13.h, z20.h, z1.h[7]\n"
+ "fmla z17.h, z20.h, z0.h[7]\n"
+ "ld1h { z20.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ "fmla z10.h, z21.h, z2.h[7]\n"
+ "fmla z14.h, z21.h, z1.h[7]\n"
+ "fmla z18.h, z21.h, z0.h[7]\n"
+ "fmla z11.h, z20.h, z2.h[7]\n"
+ "fmla z15.h, z20.h, z1.h[7]\n"
+ "fmla z19.h, z20.h, z0.h[7]\n"
+ "bgt 38b\n"
+ "39:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z21.h, z0.h[0]\n"
+ "fmla z12.h, z21.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z16.h, z21.h, z2.h[0]\n"
+ "fmla z9.h, z20.h, z0.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "fmla z13.h, z20.h, z1.h[0]\n"
+ "fmla z17.h, z20.h, z2.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z10.h, z21.h, z0.h[0]\n"
+ "fmla z14.h, z21.h, z1.h[0]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.h, z21.h, z2.h[0]\n"
+ "fmla z11.h, z20.h, z0.h[0]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.h, z20.h, z1.h[0]\n"
+ "fmla z19.h, z20.h, z2.h[0]\n"
+ "ble 40f\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[1]\n"
+ "fmla z12.h, z21.h, z1.h[1]\n"
+ "fmla z16.h, z21.h, z2.h[1]\n"
+ "fmla z9.h, z20.h, z0.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.h, z20.h, z1.h[1]\n"
+ "fmla z17.h, z20.h, z2.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z10.h, z21.h, z0.h[1]\n"
+ "fmla z14.h, z21.h, z1.h[1]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.h, z21.h, z2.h[1]\n"
+ "fmla z11.h, z20.h, z0.h[1]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.h, z20.h, z1.h[1]\n"
+ "fmla z19.h, z20.h, z2.h[1]\n"
+ "ble 40f\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[2]\n"
+ "fmla z12.h, z21.h, z1.h[2]\n"
+ "fmla z16.h, z21.h, z2.h[2]\n"
+ "fmla z9.h, z20.h, z0.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.h, z20.h, z1.h[2]\n"
+ "fmla z17.h, z20.h, z2.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z10.h, z21.h, z0.h[2]\n"
+ "fmla z14.h, z21.h, z1.h[2]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.h, z21.h, z2.h[2]\n"
+ "fmla z11.h, z20.h, z0.h[2]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.h, z20.h, z1.h[2]\n"
+ "fmla z19.h, z20.h, z2.h[2]\n"
+ "ble 40f\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[3]\n"
+ "fmla z12.h, z21.h, z1.h[3]\n"
+ "fmla z16.h, z21.h, z2.h[3]\n"
+ "fmla z9.h, z20.h, z0.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.h, z20.h, z1.h[3]\n"
+ "fmla z17.h, z20.h, z2.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z10.h, z21.h, z0.h[3]\n"
+ "fmla z14.h, z21.h, z1.h[3]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.h, z21.h, z2.h[3]\n"
+ "fmla z11.h, z20.h, z0.h[3]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.h, z20.h, z1.h[3]\n"
+ "fmla z19.h, z20.h, z2.h[3]\n"
+ "ble 40f\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[4]\n"
+ "fmla z12.h, z21.h, z1.h[4]\n"
+ "fmla z16.h, z21.h, z2.h[4]\n"
+ "fmla z9.h, z20.h, z0.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.h, z20.h, z1.h[4]\n"
+ "fmla z17.h, z20.h, z2.h[4]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z10.h, z21.h, z0.h[4]\n"
+ "fmla z14.h, z21.h, z1.h[4]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.h, z21.h, z2.h[4]\n"
+ "fmla z11.h, z20.h, z0.h[4]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.h, z20.h, z1.h[4]\n"
+ "fmla z19.h, z20.h, z2.h[4]\n"
+ "ble 40f\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[5]\n"
+ "fmla z12.h, z21.h, z1.h[5]\n"
+ "fmla z16.h, z21.h, z2.h[5]\n"
+ "fmla z9.h, z20.h, z0.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.h, z20.h, z1.h[5]\n"
+ "fmla z17.h, z20.h, z2.h[5]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z10.h, z21.h, z0.h[5]\n"
+ "fmla z14.h, z21.h, z1.h[5]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.h, z21.h, z2.h[5]\n"
+ "fmla z11.h, z20.h, z0.h[5]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.h, z20.h, z1.h[5]\n"
+ "fmla z19.h, z20.h, z2.h[5]\n"
+ "ble 40f\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[6]\n"
+ "fmla z12.h, z21.h, z1.h[6]\n"
+ "fmla z16.h, z21.h, z2.h[6]\n"
+ "fmla z9.h, z20.h, z0.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.h, z20.h, z1.h[6]\n"
+ "fmla z17.h, z20.h, z2.h[6]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z10.h, z21.h, z0.h[6]\n"
+ "fmla z14.h, z21.h, z1.h[6]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.h, z21.h, z2.h[6]\n"
+ "fmla z11.h, z20.h, z0.h[6]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.h, z20.h, z1.h[6]\n"
+ "fmla z19.h, z20.h, z2.h[6]\n"
+ "ble 40f\n"
+ "ld1h { z21.h }, p5/Z, [x12]\n"
+ "ld1h { z20.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z21.h, z0.h[7]\n"
+ "fmla z12.h, z21.h, z1.h[7]\n"
+ "fmla z16.h, z21.h, z2.h[7]\n"
+ "fmla z9.h, z20.h, z0.h[7]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z13.h, z20.h, z1.h[7]\n"
+ "fmla z17.h, z20.h, z2.h[7]\n"
+ "ld1h { z20.h }, p5/Z, [x9]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z10.h, z21.h, z0.h[7]\n"
+ "fmla z14.h, z21.h, z1.h[7]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.h, z21.h, z2.h[7]\n"
+ "fmla z11.h, z20.h, z0.h[7]\n"
+ "fmla z15.h, z20.h, z1.h[7]\n"
+ "fmla z19.h, z20.h, z2.h[7]\n"
+ "40:" // Height 3: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 35b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "tbz %x[flags], #1, 41f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z21.h }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z20.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z21.h\n"
+ "fmin z9.h, p5/M, z9.h, z21.h\n"
+ "fmin z10.h, p5/M, z10.h, z21.h\n"
+ "fmin z11.h, p5/M, z11.h, z21.h\n"
+ "fmin z12.h, p5/M, z12.h, z21.h\n"
+ "fmin z13.h, p5/M, z13.h, z21.h\n"
+ "fmin z14.h, p5/M, z14.h, z21.h\n"
+ "fmin z15.h, p5/M, z15.h, z21.h\n"
+ "fmin z16.h, p5/M, z16.h, z21.h\n"
+ "fmin z17.h, p5/M, z17.h, z21.h\n"
+ "fmin z18.h, p5/M, z18.h, z21.h\n"
+ "fmin z19.h, p5/M, z19.h, z21.h\n"
+ "fmax z8.h, p5/M, z8.h, z20.h\n"
+ "fmax z9.h, p5/M, z9.h, z20.h\n"
+ "fmax z10.h, p5/M, z10.h, z20.h\n"
+ "fmax z11.h, p5/M, z11.h, z20.h\n"
+ "fmax z12.h, p5/M, z12.h, z20.h\n"
+ "fmax z13.h, p5/M, z13.h, z20.h\n"
+ "fmax z14.h, p5/M, z14.h, z20.h\n"
+ "fmax z15.h, p5/M, z15.h, z20.h\n"
+ "fmax z16.h, p5/M, z16.h, z20.h\n"
+ "fmax z17.h, p5/M, z17.h, z20.h\n"
+ "fmax z18.h, p5/M, z18.h, z20.h\n"
+ "fmax z19.h, p5/M, z19.h, z20.h\n"
+ "41:" // Height 3: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
+ "42:" // Height 3: Writeback done
+ "dech x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 30b\n"
+ "b 86f\n"
+ "43:" // Height 4
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "44:" // Height 4: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 45f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 45f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 45f\n"
+ "mov x11, x12\n"
+ "45:" // Height 4: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p3.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x14\n"
+ "cbz x15, 46f\n"
+ "ld1h { z8.h }, p5/Z, [x15]\n"
+ "ld1h { z9.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 48f\n"
+ "46:" // Height 4: no bias
+ "tbz %x[flags], #0, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x13, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x22]\n"
+ "ld1h { z13.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x21]\n"
+ "ld1h { z17.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x20]\n"
+ "ld1h { z21.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 48f\n"
+ "47:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "48:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "49:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 51f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "b 51f\n"
+ "50:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "51:" // Height 4: input setup done
+ "cmp x27, #0x8\n"
+ "ble 53f\n"
+ "52:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z3.h }, p0/Z, [x26]\n"
+ "ld1rqh { z2.h }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "ld1rqh { z0.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z3.h[0]\n"
+ "fmla z12.h, z25.h, z2.h[0]\n"
+ "fmla z16.h, z25.h, z1.h[0]\n"
+ "fmla z20.h, z25.h, z0.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z9.h, z24.h, z3.h[0]\n"
+ "fmla z13.h, z24.h, z2.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "fmla z17.h, z24.h, z1.h[0]\n"
+ "fmla z21.h, z24.h, z0.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z25.h, z3.h[0]\n"
+ "fmla z14.h, z25.h, z2.h[0]\n"
+ "fmla z18.h, z25.h, z1.h[0]\n"
+ "fmla z22.h, z25.h, z0.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[0]\n"
+ "fmla z15.h, z24.h, z2.h[0]\n"
+ "fmla z19.h, z24.h, z1.h[0]\n"
+ "fmla z23.h, z24.h, z0.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[1]\n"
+ "fmla z12.h, z25.h, z2.h[1]\n"
+ "fmla z16.h, z25.h, z1.h[1]\n"
+ "fmla z20.h, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[1]\n"
+ "fmla z13.h, z24.h, z2.h[1]\n"
+ "fmla z17.h, z24.h, z1.h[1]\n"
+ "fmla z21.h, z24.h, z0.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[1]\n"
+ "fmla z14.h, z25.h, z2.h[1]\n"
+ "fmla z18.h, z25.h, z1.h[1]\n"
+ "fmla z22.h, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[1]\n"
+ "fmla z15.h, z24.h, z2.h[1]\n"
+ "fmla z19.h, z24.h, z1.h[1]\n"
+ "fmla z23.h, z24.h, z0.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[2]\n"
+ "fmla z12.h, z25.h, z2.h[2]\n"
+ "fmla z16.h, z25.h, z1.h[2]\n"
+ "fmla z20.h, z25.h, z0.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[2]\n"
+ "fmla z13.h, z24.h, z2.h[2]\n"
+ "fmla z17.h, z24.h, z1.h[2]\n"
+ "fmla z21.h, z24.h, z0.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[2]\n"
+ "fmla z14.h, z25.h, z2.h[2]\n"
+ "fmla z18.h, z25.h, z1.h[2]\n"
+ "fmla z22.h, z25.h, z0.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[2]\n"
+ "fmla z15.h, z24.h, z2.h[2]\n"
+ "fmla z19.h, z24.h, z1.h[2]\n"
+ "fmla z23.h, z24.h, z0.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[3]\n"
+ "fmla z12.h, z25.h, z2.h[3]\n"
+ "fmla z16.h, z25.h, z1.h[3]\n"
+ "fmla z20.h, z25.h, z0.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[3]\n"
+ "fmla z13.h, z24.h, z2.h[3]\n"
+ "fmla z17.h, z24.h, z1.h[3]\n"
+ "fmla z21.h, z24.h, z0.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[3]\n"
+ "fmla z14.h, z25.h, z2.h[3]\n"
+ "fmla z18.h, z25.h, z1.h[3]\n"
+ "fmla z22.h, z25.h, z0.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #4, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[3]\n"
+ "fmla z15.h, z24.h, z2.h[3]\n"
+ "fmla z19.h, z24.h, z1.h[3]\n"
+ "fmla z23.h, z24.h, z0.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #4, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[4]\n"
+ "fmla z12.h, z25.h, z2.h[4]\n"
+ "fmla z16.h, z25.h, z1.h[4]\n"
+ "fmla z20.h, z25.h, z0.h[4]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[4]\n"
+ "fmla z13.h, z24.h, z2.h[4]\n"
+ "fmla z17.h, z24.h, z1.h[4]\n"
+ "fmla z21.h, z24.h, z0.h[4]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[4]\n"
+ "fmla z14.h, z25.h, z2.h[4]\n"
+ "fmla z18.h, z25.h, z1.h[4]\n"
+ "fmla z22.h, z25.h, z0.h[4]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #5, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[4]\n"
+ "fmla z15.h, z24.h, z2.h[4]\n"
+ "fmla z19.h, z24.h, z1.h[4]\n"
+ "fmla z23.h, z24.h, z0.h[4]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #5, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[5]\n"
+ "fmla z12.h, z25.h, z2.h[5]\n"
+ "fmla z16.h, z25.h, z1.h[5]\n"
+ "fmla z20.h, z25.h, z0.h[5]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[5]\n"
+ "fmla z13.h, z24.h, z2.h[5]\n"
+ "fmla z17.h, z24.h, z1.h[5]\n"
+ "fmla z21.h, z24.h, z0.h[5]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #5, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[5]\n"
+ "fmla z14.h, z25.h, z2.h[5]\n"
+ "fmla z18.h, z25.h, z1.h[5]\n"
+ "fmla z22.h, z25.h, z0.h[5]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #6, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[5]\n"
+ "fmla z15.h, z24.h, z2.h[5]\n"
+ "fmla z19.h, z24.h, z1.h[5]\n"
+ "fmla z23.h, z24.h, z0.h[5]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #6, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[6]\n"
+ "fmla z12.h, z25.h, z2.h[6]\n"
+ "fmla z16.h, z25.h, z1.h[6]\n"
+ "fmla z20.h, z25.h, z0.h[6]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[6]\n"
+ "fmla z13.h, z24.h, z2.h[6]\n"
+ "fmla z17.h, z24.h, z1.h[6]\n"
+ "fmla z21.h, z24.h, z0.h[6]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[6]\n"
+ "fmla z14.h, z25.h, z2.h[6]\n"
+ "fmla z18.h, z25.h, z1.h[6]\n"
+ "fmla z22.h, z25.h, z0.h[6]\n"
+ "ld1h { z25.h }, p5/Z, [x12, #7, MUL VL]\n"
+ "addvl x12, x12, #8\n"
+ "fmla z11.h, z24.h, z3.h[6]\n"
+ "fmla z15.h, z24.h, z2.h[6]\n"
+ "fmla z19.h, z24.h, z1.h[6]\n"
+ "fmla z23.h, z24.h, z0.h[6]\n"
+ "ld1h { z24.h }, p5/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #8\n"
+ "fmla z8.h, z25.h, z3.h[7]\n"
+ "fmla z12.h, z25.h, z2.h[7]\n"
+ "fmla z16.h, z25.h, z1.h[7]\n"
+ "fmla z20.h, z25.h, z0.h[7]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ "fmla z9.h, z24.h, z3.h[7]\n"
+ "fmla z13.h, z24.h, z2.h[7]\n"
+ "fmla z17.h, z24.h, z1.h[7]\n"
+ "fmla z21.h, z24.h, z0.h[7]\n"
+ "ld1h { z24.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ "fmla z10.h, z25.h, z3.h[7]\n"
+ "fmla z14.h, z25.h, z2.h[7]\n"
+ "fmla z18.h, z25.h, z1.h[7]\n"
+ "fmla z22.h, z25.h, z0.h[7]\n"
+ "fmla z11.h, z24.h, z3.h[7]\n"
+ "fmla z15.h, z24.h, z2.h[7]\n"
+ "fmla z19.h, z24.h, z1.h[7]\n"
+ "fmla z23.h, z24.h, z0.h[7]\n"
+ "bgt 52b\n"
+ "53:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[0]\n"
+ "fmla z12.h, z25.h, z1.h[0]\n"
+ "fmla z16.h, z25.h, z2.h[0]\n"
+ "fmla z20.h, z25.h, z3.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z9.h, z24.h, z0.h[0]\n"
+ "fmla z13.h, z24.h, z1.h[0]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.h, z24.h, z2.h[0]\n"
+ "fmla z21.h, z24.h, z3.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z25.h, z0.h[0]\n"
+ "fmla z14.h, z25.h, z1.h[0]\n"
+ "fmla z18.h, z25.h, z2.h[0]\n"
+ "fmla z22.h, z25.h, z3.h[0]\n"
+ "fmla z11.h, z24.h, z0.h[0]\n"
+ "fmla z15.h, z24.h, z1.h[0]\n"
+ "fmla z19.h, z24.h, z2.h[0]\n"
+ "fmla z23.h, z24.h, z3.h[0]\n"
+ "ble 54f\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[1]\n"
+ "fmla z12.h, z25.h, z1.h[1]\n"
+ "fmla z16.h, z25.h, z2.h[1]\n"
+ "fmla z20.h, z25.h, z3.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.h, z24.h, z0.h[1]\n"
+ "fmla z13.h, z24.h, z1.h[1]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z17.h, z24.h, z2.h[1]\n"
+ "fmla z21.h, z24.h, z3.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z10.h, z25.h, z0.h[1]\n"
+ "fmla z14.h, z25.h, z1.h[1]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.h, z25.h, z2.h[1]\n"
+ "fmla z22.h, z25.h, z3.h[1]\n"
+ "fmla z11.h, z24.h, z0.h[1]\n"
+ "fmla z15.h, z24.h, z1.h[1]\n"
+ "fmla z19.h, z24.h, z2.h[1]\n"
+ "fmla z23.h, z24.h, z3.h[1]\n"
+ "ble 54f\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[2]\n"
+ "fmla z12.h, z25.h, z1.h[2]\n"
+ "fmla z16.h, z25.h, z2.h[2]\n"
+ "fmla z20.h, z25.h, z3.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.h, z24.h, z0.h[2]\n"
+ "fmla z13.h, z24.h, z1.h[2]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z17.h, z24.h, z2.h[2]\n"
+ "fmla z21.h, z24.h, z3.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z10.h, z25.h, z0.h[2]\n"
+ "fmla z14.h, z25.h, z1.h[2]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.h, z25.h, z2.h[2]\n"
+ "fmla z22.h, z25.h, z3.h[2]\n"
+ "fmla z11.h, z24.h, z0.h[2]\n"
+ "fmla z15.h, z24.h, z1.h[2]\n"
+ "fmla z19.h, z24.h, z2.h[2]\n"
+ "fmla z23.h, z24.h, z3.h[2]\n"
+ "ble 54f\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[3]\n"
+ "fmla z12.h, z25.h, z1.h[3]\n"
+ "fmla z16.h, z25.h, z2.h[3]\n"
+ "fmla z20.h, z25.h, z3.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.h, z24.h, z0.h[3]\n"
+ "fmla z13.h, z24.h, z1.h[3]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z17.h, z24.h, z2.h[3]\n"
+ "fmla z21.h, z24.h, z3.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z10.h, z25.h, z0.h[3]\n"
+ "fmla z14.h, z25.h, z1.h[3]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.h, z25.h, z2.h[3]\n"
+ "fmla z22.h, z25.h, z3.h[3]\n"
+ "fmla z11.h, z24.h, z0.h[3]\n"
+ "fmla z15.h, z24.h, z1.h[3]\n"
+ "fmla z19.h, z24.h, z2.h[3]\n"
+ "fmla z23.h, z24.h, z3.h[3]\n"
+ "ble 54f\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[4]\n"
+ "fmla z12.h, z25.h, z1.h[4]\n"
+ "fmla z16.h, z25.h, z2.h[4]\n"
+ "fmla z20.h, z25.h, z3.h[4]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.h, z24.h, z0.h[4]\n"
+ "fmla z13.h, z24.h, z1.h[4]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z17.h, z24.h, z2.h[4]\n"
+ "fmla z21.h, z24.h, z3.h[4]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z10.h, z25.h, z0.h[4]\n"
+ "fmla z14.h, z25.h, z1.h[4]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.h, z25.h, z2.h[4]\n"
+ "fmla z22.h, z25.h, z3.h[4]\n"
+ "fmla z11.h, z24.h, z0.h[4]\n"
+ "fmla z15.h, z24.h, z1.h[4]\n"
+ "fmla z19.h, z24.h, z2.h[4]\n"
+ "fmla z23.h, z24.h, z3.h[4]\n"
+ "ble 54f\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[5]\n"
+ "fmla z12.h, z25.h, z1.h[5]\n"
+ "fmla z16.h, z25.h, z2.h[5]\n"
+ "fmla z20.h, z25.h, z3.h[5]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.h, z24.h, z0.h[5]\n"
+ "fmla z13.h, z24.h, z1.h[5]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z17.h, z24.h, z2.h[5]\n"
+ "fmla z21.h, z24.h, z3.h[5]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z10.h, z25.h, z0.h[5]\n"
+ "fmla z14.h, z25.h, z1.h[5]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.h, z25.h, z2.h[5]\n"
+ "fmla z22.h, z25.h, z3.h[5]\n"
+ "fmla z11.h, z24.h, z0.h[5]\n"
+ "fmla z15.h, z24.h, z1.h[5]\n"
+ "fmla z19.h, z24.h, z2.h[5]\n"
+ "fmla z23.h, z24.h, z3.h[5]\n"
+ "ble 54f\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[6]\n"
+ "fmla z12.h, z25.h, z1.h[6]\n"
+ "fmla z16.h, z25.h, z2.h[6]\n"
+ "fmla z20.h, z25.h, z3.h[6]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.h, z24.h, z0.h[6]\n"
+ "fmla z13.h, z24.h, z1.h[6]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z17.h, z24.h, z2.h[6]\n"
+ "fmla z21.h, z24.h, z3.h[6]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z10.h, z25.h, z0.h[6]\n"
+ "fmla z14.h, z25.h, z1.h[6]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.h, z25.h, z2.h[6]\n"
+ "fmla z22.h, z25.h, z3.h[6]\n"
+ "fmla z11.h, z24.h, z0.h[6]\n"
+ "fmla z15.h, z24.h, z1.h[6]\n"
+ "fmla z19.h, z24.h, z2.h[6]\n"
+ "fmla z23.h, z24.h, z3.h[6]\n"
+ "ble 54f\n"
+ "ld1h { z25.h }, p5/Z, [x12]\n"
+ "ld1h { z24.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z25.h, z0.h[7]\n"
+ "fmla z12.h, z25.h, z1.h[7]\n"
+ "fmla z16.h, z25.h, z2.h[7]\n"
+ "fmla z20.h, z25.h, z3.h[7]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z9.h, z24.h, z0.h[7]\n"
+ "fmla z13.h, z24.h, z1.h[7]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.h, z24.h, z2.h[7]\n"
+ "fmla z21.h, z24.h, z3.h[7]\n"
+ "ld1h { z24.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z25.h, z0.h[7]\n"
+ "fmla z14.h, z25.h, z1.h[7]\n"
+ "fmla z18.h, z25.h, z2.h[7]\n"
+ "fmla z22.h, z25.h, z3.h[7]\n"
+ "fmla z11.h, z24.h, z0.h[7]\n"
+ "fmla z15.h, z24.h, z1.h[7]\n"
+ "fmla z19.h, z24.h, z2.h[7]\n"
+ "fmla z23.h, z24.h, z3.h[7]\n"
+ "54:" // Height 4: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 49b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "tbz %x[flags], #1, 55f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z25.h }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z24.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z25.h\n"
+ "fmin z9.h, p5/M, z9.h, z25.h\n"
+ "fmin z10.h, p5/M, z10.h, z25.h\n"
+ "fmin z11.h, p5/M, z11.h, z25.h\n"
+ "fmin z12.h, p5/M, z12.h, z25.h\n"
+ "fmin z13.h, p5/M, z13.h, z25.h\n"
+ "fmin z14.h, p5/M, z14.h, z25.h\n"
+ "fmin z15.h, p5/M, z15.h, z25.h\n"
+ "fmin z16.h, p5/M, z16.h, z25.h\n"
+ "fmin z17.h, p5/M, z17.h, z25.h\n"
+ "fmin z18.h, p5/M, z18.h, z25.h\n"
+ "fmin z19.h, p5/M, z19.h, z25.h\n"
+ "fmin z20.h, p5/M, z20.h, z25.h\n"
+ "fmin z21.h, p5/M, z21.h, z25.h\n"
+ "fmin z22.h, p5/M, z22.h, z25.h\n"
+ "fmin z23.h, p5/M, z23.h, z25.h\n"
+ "fmax z8.h, p5/M, z8.h, z24.h\n"
+ "fmax z9.h, p5/M, z9.h, z24.h\n"
+ "fmax z10.h, p5/M, z10.h, z24.h\n"
+ "fmax z11.h, p5/M, z11.h, z24.h\n"
+ "fmax z12.h, p5/M, z12.h, z24.h\n"
+ "fmax z13.h, p5/M, z13.h, z24.h\n"
+ "fmax z14.h, p5/M, z14.h, z24.h\n"
+ "fmax z15.h, p5/M, z15.h, z24.h\n"
+ "fmax z16.h, p5/M, z16.h, z24.h\n"
+ "fmax z17.h, p5/M, z17.h, z24.h\n"
+ "fmax z18.h, p5/M, z18.h, z24.h\n"
+ "fmax z19.h, p5/M, z19.h, z24.h\n"
+ "fmax z20.h, p5/M, z20.h, z24.h\n"
+ "fmax z21.h, p5/M, z21.h, z24.h\n"
+ "fmax z22.h, p5/M, z22.h, z24.h\n"
+ "fmax z23.h, p5/M, z23.h, z24.h\n"
+ "55:" // Height 4: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x23]\n"
+ "st1h { z21.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x23, #3, MUL VL]\n"
+ "56:" // Height 4: Writeback done
+ "dech x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 44b\n"
+ "b 86f\n"
+ "57:" // Height 5
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "58:" // Height 5: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 59f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 59f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 59f\n"
+ "mov x11, x12\n"
+ "59:" // Height 5: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p3.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x14\n"
+ "cbz x15, 60f\n"
+ "ld1h { z8.h }, p5/Z, [x15]\n"
+ "ld1h { z9.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 62f\n"
+ "60:" // Height 5: no bias
+ "tbz %x[flags], #0, 61f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x23]\n"
+ "ld1h { z13.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x22]\n"
+ "ld1h { z17.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x21]\n"
+ "ld1h { z21.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x20]\n"
+ "ld1h { z25.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 62f\n"
+ "61:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "62:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "63:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 64f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 65f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "b 65f\n"
+ "64:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "65:" // Height 5: input setup done
+ "cmp x27, #0x8\n"
+ "ble 67f\n"
+ "66:" // Height 5: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z4.h }, p0/Z, [x26]\n"
+ "ld1rqh { z3.h }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1rqh { z1.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqh { z0.h }, p0/Z, [x22]\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z29.h, z4.h[0]\n"
+ "fmla z12.h, z29.h, z3.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z16.h, z29.h, z2.h[0]\n"
+ "fmla z20.h, z29.h, z1.h[0]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z24.h, z29.h, z0.h[0]\n"
+ "fmla z9.h, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z28.h, z3.h[0]\n"
+ "fmla z17.h, z28.h, z2.h[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "fmla z21.h, z28.h, z1.h[0]\n"
+ "fmla z25.h, z28.h, z0.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z29.h, z4.h[0]\n"
+ "fmla z14.h, z29.h, z3.h[0]\n"
+ "fmla z18.h, z29.h, z2.h[0]\n"
+ "fmla z22.h, z29.h, z1.h[0]\n"
+ "fmla z26.h, z29.h, z0.h[0]\n"
+ "fmla z11.h, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[0]\n"
+ "fmla z19.h, z28.h, z2.h[0]\n"
+ "fmla z23.h, z28.h, z1.h[0]\n"
+ "fmla z27.h, z28.h, z0.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[1]\n"
+ "fmla z12.h, z29.h, z3.h[1]\n"
+ "fmla z16.h, z29.h, z2.h[1]\n"
+ "fmla z20.h, z29.h, z1.h[1]\n"
+ "fmla z24.h, z29.h, z0.h[1]\n"
+ "fmla z9.h, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[1]\n"
+ "fmla z17.h, z28.h, z2.h[1]\n"
+ "fmla z21.h, z28.h, z1.h[1]\n"
+ "fmla z25.h, z28.h, z0.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[1]\n"
+ "fmla z14.h, z29.h, z3.h[1]\n"
+ "fmla z18.h, z29.h, z2.h[1]\n"
+ "fmla z22.h, z29.h, z1.h[1]\n"
+ "fmla z26.h, z29.h, z0.h[1]\n"
+ "fmla z11.h, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[1]\n"
+ "fmla z19.h, z28.h, z2.h[1]\n"
+ "fmla z23.h, z28.h, z1.h[1]\n"
+ "fmla z27.h, z28.h, z0.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[2]\n"
+ "fmla z12.h, z29.h, z3.h[2]\n"
+ "fmla z16.h, z29.h, z2.h[2]\n"
+ "fmla z20.h, z29.h, z1.h[2]\n"
+ "fmla z24.h, z29.h, z0.h[2]\n"
+ "fmla z9.h, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[2]\n"
+ "fmla z17.h, z28.h, z2.h[2]\n"
+ "fmla z21.h, z28.h, z1.h[2]\n"
+ "fmla z25.h, z28.h, z0.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[2]\n"
+ "fmla z14.h, z29.h, z3.h[2]\n"
+ "fmla z18.h, z29.h, z2.h[2]\n"
+ "fmla z22.h, z29.h, z1.h[2]\n"
+ "fmla z26.h, z29.h, z0.h[2]\n"
+ "fmla z11.h, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[2]\n"
+ "fmla z19.h, z28.h, z2.h[2]\n"
+ "fmla z23.h, z28.h, z1.h[2]\n"
+ "fmla z27.h, z28.h, z0.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[3]\n"
+ "fmla z12.h, z29.h, z3.h[3]\n"
+ "fmla z16.h, z29.h, z2.h[3]\n"
+ "fmla z20.h, z29.h, z1.h[3]\n"
+ "fmla z24.h, z29.h, z0.h[3]\n"
+ "fmla z9.h, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[3]\n"
+ "fmla z17.h, z28.h, z2.h[3]\n"
+ "fmla z21.h, z28.h, z1.h[3]\n"
+ "fmla z25.h, z28.h, z0.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[3]\n"
+ "fmla z14.h, z29.h, z3.h[3]\n"
+ "fmla z18.h, z29.h, z2.h[3]\n"
+ "fmla z22.h, z29.h, z1.h[3]\n"
+ "fmla z26.h, z29.h, z0.h[3]\n"
+ "fmla z11.h, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #4, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[3]\n"
+ "fmla z19.h, z28.h, z2.h[3]\n"
+ "fmla z23.h, z28.h, z1.h[3]\n"
+ "fmla z27.h, z28.h, z0.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #4, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[4]\n"
+ "fmla z12.h, z29.h, z3.h[4]\n"
+ "fmla z16.h, z29.h, z2.h[4]\n"
+ "fmla z20.h, z29.h, z1.h[4]\n"
+ "fmla z24.h, z29.h, z0.h[4]\n"
+ "fmla z9.h, z28.h, z4.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[4]\n"
+ "fmla z17.h, z28.h, z2.h[4]\n"
+ "fmla z21.h, z28.h, z1.h[4]\n"
+ "fmla z25.h, z28.h, z0.h[4]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[4]\n"
+ "fmla z14.h, z29.h, z3.h[4]\n"
+ "fmla z18.h, z29.h, z2.h[4]\n"
+ "fmla z22.h, z29.h, z1.h[4]\n"
+ "fmla z26.h, z29.h, z0.h[4]\n"
+ "fmla z11.h, z28.h, z4.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #5, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[4]\n"
+ "fmla z19.h, z28.h, z2.h[4]\n"
+ "fmla z23.h, z28.h, z1.h[4]\n"
+ "fmla z27.h, z28.h, z0.h[4]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #5, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[5]\n"
+ "fmla z12.h, z29.h, z3.h[5]\n"
+ "fmla z16.h, z29.h, z2.h[5]\n"
+ "fmla z20.h, z29.h, z1.h[5]\n"
+ "fmla z24.h, z29.h, z0.h[5]\n"
+ "fmla z9.h, z28.h, z4.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[5]\n"
+ "fmla z17.h, z28.h, z2.h[5]\n"
+ "fmla z21.h, z28.h, z1.h[5]\n"
+ "fmla z25.h, z28.h, z0.h[5]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #5, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[5]\n"
+ "fmla z14.h, z29.h, z3.h[5]\n"
+ "fmla z18.h, z29.h, z2.h[5]\n"
+ "fmla z22.h, z29.h, z1.h[5]\n"
+ "fmla z26.h, z29.h, z0.h[5]\n"
+ "fmla z11.h, z28.h, z4.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #6, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[5]\n"
+ "fmla z19.h, z28.h, z2.h[5]\n"
+ "fmla z23.h, z28.h, z1.h[5]\n"
+ "fmla z27.h, z28.h, z0.h[5]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #6, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[6]\n"
+ "fmla z12.h, z29.h, z3.h[6]\n"
+ "fmla z16.h, z29.h, z2.h[6]\n"
+ "fmla z20.h, z29.h, z1.h[6]\n"
+ "fmla z24.h, z29.h, z0.h[6]\n"
+ "fmla z9.h, z28.h, z4.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[6]\n"
+ "fmla z17.h, z28.h, z2.h[6]\n"
+ "fmla z21.h, z28.h, z1.h[6]\n"
+ "fmla z25.h, z28.h, z0.h[6]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[6]\n"
+ "fmla z14.h, z29.h, z3.h[6]\n"
+ "fmla z18.h, z29.h, z2.h[6]\n"
+ "fmla z22.h, z29.h, z1.h[6]\n"
+ "fmla z26.h, z29.h, z0.h[6]\n"
+ "fmla z11.h, z28.h, z4.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x12, #7, MUL VL]\n"
+ "addvl x12, x12, #8\n"
+ "fmla z15.h, z28.h, z3.h[6]\n"
+ "fmla z19.h, z28.h, z2.h[6]\n"
+ "fmla z23.h, z28.h, z1.h[6]\n"
+ "fmla z27.h, z28.h, z0.h[6]\n"
+ "ld1h { z28.h }, p5/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #8\n"
+ "fmla z8.h, z29.h, z4.h[7]\n"
+ "fmla z12.h, z29.h, z3.h[7]\n"
+ "fmla z16.h, z29.h, z2.h[7]\n"
+ "fmla z20.h, z29.h, z1.h[7]\n"
+ "fmla z24.h, z29.h, z0.h[7]\n"
+ "fmla z9.h, z28.h, z4.h[7]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ "fmla z13.h, z28.h, z3.h[7]\n"
+ "fmla z17.h, z28.h, z2.h[7]\n"
+ "fmla z21.h, z28.h, z1.h[7]\n"
+ "fmla z25.h, z28.h, z0.h[7]\n"
+ "ld1h { z28.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ "fmla z10.h, z29.h, z4.h[7]\n"
+ "fmla z14.h, z29.h, z3.h[7]\n"
+ "fmla z18.h, z29.h, z2.h[7]\n"
+ "fmla z22.h, z29.h, z1.h[7]\n"
+ "fmla z26.h, z29.h, z0.h[7]\n"
+ "fmla z11.h, z28.h, z4.h[7]\n"
+ "fmla z15.h, z28.h, z3.h[7]\n"
+ "fmla z19.h, z28.h, z2.h[7]\n"
+ "fmla z23.h, z28.h, z1.h[7]\n"
+ "fmla z27.h, z28.h, z0.h[7]\n"
+ "bgt 66b\n"
+ "67:" // Height 5: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "fmla z8.h, z29.h, z0.h[0]\n"
+ "fmla z12.h, z29.h, z1.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z16.h, z29.h, z2.h[0]\n"
+ "fmla z20.h, z29.h, z3.h[0]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, z29.h, z4.h[0]\n"
+ "fmla z9.h, z28.h, z0.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.h, z28.h, z1.h[0]\n"
+ "fmla z17.h, z28.h, z2.h[0]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.h, z28.h, z3.h[0]\n"
+ "fmla z25.h, z28.h, z4.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z29.h, z0.h[0]\n"
+ "fmla z14.h, z29.h, z1.h[0]\n"
+ "fmla z18.h, z29.h, z2.h[0]\n"
+ "fmla z22.h, z29.h, z3.h[0]\n"
+ "fmla z26.h, z29.h, z4.h[0]\n"
+ "fmla z11.h, z28.h, z0.h[0]\n"
+ "fmla z15.h, z28.h, z1.h[0]\n"
+ "fmla z19.h, z28.h, z2.h[0]\n"
+ "fmla z23.h, z28.h, z3.h[0]\n"
+ "fmla z27.h, z28.h, z4.h[0]\n"
+ "ble 68f\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[1]\n"
+ "fmla z12.h, z29.h, z1.h[1]\n"
+ "fmla z16.h, z29.h, z2.h[1]\n"
+ "fmla z20.h, z29.h, z3.h[1]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, z29.h, z4.h[1]\n"
+ "fmla z9.h, z28.h, z0.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.h, z28.h, z1.h[1]\n"
+ "fmla z17.h, z28.h, z2.h[1]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.h, z28.h, z3.h[1]\n"
+ "fmla z25.h, z28.h, z4.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z29.h, z0.h[1]\n"
+ "fmla z14.h, z29.h, z1.h[1]\n"
+ "fmla z18.h, z29.h, z2.h[1]\n"
+ "fmla z22.h, z29.h, z3.h[1]\n"
+ "fmla z26.h, z29.h, z4.h[1]\n"
+ "fmla z11.h, z28.h, z0.h[1]\n"
+ "fmla z15.h, z28.h, z1.h[1]\n"
+ "fmla z19.h, z28.h, z2.h[1]\n"
+ "fmla z23.h, z28.h, z3.h[1]\n"
+ "fmla z27.h, z28.h, z4.h[1]\n"
+ "ble 68f\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[2]\n"
+ "fmla z12.h, z29.h, z1.h[2]\n"
+ "fmla z16.h, z29.h, z2.h[2]\n"
+ "fmla z20.h, z29.h, z3.h[2]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, z29.h, z4.h[2]\n"
+ "fmla z9.h, z28.h, z0.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.h, z28.h, z1.h[2]\n"
+ "fmla z17.h, z28.h, z2.h[2]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.h, z28.h, z3.h[2]\n"
+ "fmla z25.h, z28.h, z4.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z29.h, z0.h[2]\n"
+ "fmla z14.h, z29.h, z1.h[2]\n"
+ "fmla z18.h, z29.h, z2.h[2]\n"
+ "fmla z22.h, z29.h, z3.h[2]\n"
+ "fmla z26.h, z29.h, z4.h[2]\n"
+ "fmla z11.h, z28.h, z0.h[2]\n"
+ "fmla z15.h, z28.h, z1.h[2]\n"
+ "fmla z19.h, z28.h, z2.h[2]\n"
+ "fmla z23.h, z28.h, z3.h[2]\n"
+ "fmla z27.h, z28.h, z4.h[2]\n"
+ "ble 68f\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[3]\n"
+ "fmla z12.h, z29.h, z1.h[3]\n"
+ "fmla z16.h, z29.h, z2.h[3]\n"
+ "fmla z20.h, z29.h, z3.h[3]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, z29.h, z4.h[3]\n"
+ "fmla z9.h, z28.h, z0.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.h, z28.h, z1.h[3]\n"
+ "fmla z17.h, z28.h, z2.h[3]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.h, z28.h, z3.h[3]\n"
+ "fmla z25.h, z28.h, z4.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z29.h, z0.h[3]\n"
+ "fmla z14.h, z29.h, z1.h[3]\n"
+ "fmla z18.h, z29.h, z2.h[3]\n"
+ "fmla z22.h, z29.h, z3.h[3]\n"
+ "fmla z26.h, z29.h, z4.h[3]\n"
+ "fmla z11.h, z28.h, z0.h[3]\n"
+ "fmla z15.h, z28.h, z1.h[3]\n"
+ "fmla z19.h, z28.h, z2.h[3]\n"
+ "fmla z23.h, z28.h, z3.h[3]\n"
+ "fmla z27.h, z28.h, z4.h[3]\n"
+ "ble 68f\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[4]\n"
+ "fmla z12.h, z29.h, z1.h[4]\n"
+ "fmla z16.h, z29.h, z2.h[4]\n"
+ "fmla z20.h, z29.h, z3.h[4]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, z29.h, z4.h[4]\n"
+ "fmla z9.h, z28.h, z0.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.h, z28.h, z1.h[4]\n"
+ "fmla z17.h, z28.h, z2.h[4]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.h, z28.h, z3.h[4]\n"
+ "fmla z25.h, z28.h, z4.h[4]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z29.h, z0.h[4]\n"
+ "fmla z14.h, z29.h, z1.h[4]\n"
+ "fmla z18.h, z29.h, z2.h[4]\n"
+ "fmla z22.h, z29.h, z3.h[4]\n"
+ "fmla z26.h, z29.h, z4.h[4]\n"
+ "fmla z11.h, z28.h, z0.h[4]\n"
+ "fmla z15.h, z28.h, z1.h[4]\n"
+ "fmla z19.h, z28.h, z2.h[4]\n"
+ "fmla z23.h, z28.h, z3.h[4]\n"
+ "fmla z27.h, z28.h, z4.h[4]\n"
+ "ble 68f\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[5]\n"
+ "fmla z12.h, z29.h, z1.h[5]\n"
+ "fmla z16.h, z29.h, z2.h[5]\n"
+ "fmla z20.h, z29.h, z3.h[5]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, z29.h, z4.h[5]\n"
+ "fmla z9.h, z28.h, z0.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.h, z28.h, z1.h[5]\n"
+ "fmla z17.h, z28.h, z2.h[5]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.h, z28.h, z3.h[5]\n"
+ "fmla z25.h, z28.h, z4.h[5]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z29.h, z0.h[5]\n"
+ "fmla z14.h, z29.h, z1.h[5]\n"
+ "fmla z18.h, z29.h, z2.h[5]\n"
+ "fmla z22.h, z29.h, z3.h[5]\n"
+ "fmla z26.h, z29.h, z4.h[5]\n"
+ "fmla z11.h, z28.h, z0.h[5]\n"
+ "fmla z15.h, z28.h, z1.h[5]\n"
+ "fmla z19.h, z28.h, z2.h[5]\n"
+ "fmla z23.h, z28.h, z3.h[5]\n"
+ "fmla z27.h, z28.h, z4.h[5]\n"
+ "ble 68f\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[6]\n"
+ "fmla z12.h, z29.h, z1.h[6]\n"
+ "fmla z16.h, z29.h, z2.h[6]\n"
+ "fmla z20.h, z29.h, z3.h[6]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, z29.h, z4.h[6]\n"
+ "fmla z9.h, z28.h, z0.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.h, z28.h, z1.h[6]\n"
+ "fmla z17.h, z28.h, z2.h[6]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.h, z28.h, z3.h[6]\n"
+ "fmla z25.h, z28.h, z4.h[6]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z29.h, z0.h[6]\n"
+ "fmla z14.h, z29.h, z1.h[6]\n"
+ "fmla z18.h, z29.h, z2.h[6]\n"
+ "fmla z22.h, z29.h, z3.h[6]\n"
+ "fmla z26.h, z29.h, z4.h[6]\n"
+ "fmla z11.h, z28.h, z0.h[6]\n"
+ "fmla z15.h, z28.h, z1.h[6]\n"
+ "fmla z19.h, z28.h, z2.h[6]\n"
+ "fmla z23.h, z28.h, z3.h[6]\n"
+ "fmla z27.h, z28.h, z4.h[6]\n"
+ "ble 68f\n"
+ "ld1h { z29.h }, p5/Z, [x12]\n"
+ "ld1h { z28.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z29.h, z0.h[7]\n"
+ "fmla z12.h, z29.h, z1.h[7]\n"
+ "fmla z16.h, z29.h, z2.h[7]\n"
+ "fmla z20.h, z29.h, z3.h[7]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z24.h, z29.h, z4.h[7]\n"
+ "fmla z9.h, z28.h, z0.h[7]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z13.h, z28.h, z1.h[7]\n"
+ "fmla z17.h, z28.h, z2.h[7]\n"
+ "fmla z21.h, z28.h, z3.h[7]\n"
+ "fmla z25.h, z28.h, z4.h[7]\n"
+ "ld1h { z28.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z29.h, z0.h[7]\n"
+ "fmla z14.h, z29.h, z1.h[7]\n"
+ "fmla z18.h, z29.h, z2.h[7]\n"
+ "fmla z22.h, z29.h, z3.h[7]\n"
+ "fmla z26.h, z29.h, z4.h[7]\n"
+ "fmla z11.h, z28.h, z0.h[7]\n"
+ "fmla z15.h, z28.h, z1.h[7]\n"
+ "fmla z19.h, z28.h, z2.h[7]\n"
+ "fmla z23.h, z28.h, z3.h[7]\n"
+ "fmla z27.h, z28.h, z4.h[7]\n"
+ "68:" // Height 5: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 63b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "tbz %x[flags], #1, 69f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z29.h }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z28.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z29.h\n"
+ "fmin z9.h, p5/M, z9.h, z29.h\n"
+ "fmin z10.h, p5/M, z10.h, z29.h\n"
+ "fmin z11.h, p5/M, z11.h, z29.h\n"
+ "fmin z12.h, p5/M, z12.h, z29.h\n"
+ "fmin z13.h, p5/M, z13.h, z29.h\n"
+ "fmin z14.h, p5/M, z14.h, z29.h\n"
+ "fmin z15.h, p5/M, z15.h, z29.h\n"
+ "fmin z16.h, p5/M, z16.h, z29.h\n"
+ "fmin z17.h, p5/M, z17.h, z29.h\n"
+ "fmin z18.h, p5/M, z18.h, z29.h\n"
+ "fmin z19.h, p5/M, z19.h, z29.h\n"
+ "fmin z20.h, p5/M, z20.h, z29.h\n"
+ "fmin z21.h, p5/M, z21.h, z29.h\n"
+ "fmin z22.h, p5/M, z22.h, z29.h\n"
+ "fmin z23.h, p5/M, z23.h, z29.h\n"
+ "fmin z24.h, p5/M, z24.h, z29.h\n"
+ "fmin z25.h, p5/M, z25.h, z29.h\n"
+ "fmin z26.h, p5/M, z26.h, z29.h\n"
+ "fmin z27.h, p5/M, z27.h, z29.h\n"
+ "fmax z8.h, p5/M, z8.h, z28.h\n"
+ "fmax z9.h, p5/M, z9.h, z28.h\n"
+ "fmax z10.h, p5/M, z10.h, z28.h\n"
+ "fmax z11.h, p5/M, z11.h, z28.h\n"
+ "fmax z12.h, p5/M, z12.h, z28.h\n"
+ "fmax z13.h, p5/M, z13.h, z28.h\n"
+ "fmax z14.h, p5/M, z14.h, z28.h\n"
+ "fmax z15.h, p5/M, z15.h, z28.h\n"
+ "fmax z16.h, p5/M, z16.h, z28.h\n"
+ "fmax z17.h, p5/M, z17.h, z28.h\n"
+ "fmax z18.h, p5/M, z18.h, z28.h\n"
+ "fmax z19.h, p5/M, z19.h, z28.h\n"
+ "fmax z20.h, p5/M, z20.h, z28.h\n"
+ "fmax z21.h, p5/M, z21.h, z28.h\n"
+ "fmax z22.h, p5/M, z22.h, z28.h\n"
+ "fmax z23.h, p5/M, z23.h, z28.h\n"
+ "fmax z24.h, p5/M, z24.h, z28.h\n"
+ "fmax z25.h, p5/M, z25.h, z28.h\n"
+ "fmax z26.h, p5/M, z26.h, z28.h\n"
+ "fmax z27.h, p5/M, z27.h, z28.h\n"
+ "69:" // Height 5: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x23]\n"
+ "st1h { z21.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p4, [x22]\n"
+ "st1h { z25.h }, p3, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x22, #3, MUL VL]\n"
+ "70:" // Height 5: Writeback done
+ "dech x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 58b\n"
+ "b 86f\n"
+ "71:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0xc\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
+ "72:" // Height 6: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "cnth x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "add x20, x9, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 73f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 73f\n"
+ "dech x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 73f\n"
+ "mov x11, x12\n"
+ "73:" // Height 6: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p3.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x14\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x14\n"
+ "cbz x15, 74f\n"
+ "ld1h { z8.h }, p5/Z, [x15]\n"
+ "ld1h { z9.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 76f\n"
+ "74:" // Height 6: no bias
+ "tbz %x[flags], #0, 75f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x13, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x24]\n"
+ "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x23]\n"
+ "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x22]\n"
+ "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x21]\n"
+ "ld1h { z25.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z28.h }, p4/Z, [x20]\n"
+ "ld1h { z29.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z30.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z31.h }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 76f\n"
+ "75:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "76:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "77:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 78f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 79f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "add x21, x21, x20, LSL #1\n"
+ "b 79f\n"
+ "78:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
+ "79:" // Height 6: input setup done
+ "cmp x27, #0x8\n"
+ "ble 81f\n"
+ "80:" // Height 6: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z7.h }, p0/Z, [x26]\n"
+ "ld1rqh { z6.h }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1rqh { z5.h }, p0/Z, [x24]\n"
+ "ld1rqh { z4.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "ld1rqh { z2.h }, p0/Z, [x21]\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "ld1h { z1.h }, p5/Z, [x12]\n"
+ "ld1h { z0.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z1.h, z7.h[0]\n"
+ "fmla z12.h, z1.h, z6.h[0]\n"
+ "fmla z16.h, z1.h, z5.h[0]\n"
+ "fmla z20.h, z1.h, z4.h[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "fmla z24.h, z1.h, z3.h[0]\n"
+ "fmla z28.h, z1.h, z2.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "add x21, x21, #0x10\n"
+ "fmla z9.h, z0.h, z7.h[0]\n"
+ "fmla z13.h, z0.h, z6.h[0]\n"
+ "fmla z17.h, z0.h, z5.h[0]\n"
+ "fmla z21.h, z0.h, z4.h[0]\n"
+ "fmla z25.h, z0.h, z3.h[0]\n"
+ "fmla z29.h, z0.h, z2.h[0]\n"
+ "ld1h { z0.h }, p5/Z, [x9]\n"
+ "fmla z10.h, z1.h, z7.h[0]\n"
+ "fmla z14.h, z1.h, z6.h[0]\n"
+ "fmla z18.h, z1.h, z5.h[0]\n"
+ "fmla z22.h, z1.h, z4.h[0]\n"
+ "fmla z26.h, z1.h, z3.h[0]\n"
+ "fmla z30.h, z1.h, z2.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[0]\n"
+ "fmla z15.h, z0.h, z6.h[0]\n"
+ "fmla z19.h, z0.h, z5.h[0]\n"
+ "fmla z23.h, z0.h, z4.h[0]\n"
+ "fmla z27.h, z0.h, z3.h[0]\n"
+ "fmla z31.h, z0.h, z2.h[0]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[1]\n"
+ "fmla z12.h, z1.h, z6.h[1]\n"
+ "fmla z16.h, z1.h, z5.h[1]\n"
+ "fmla z20.h, z1.h, z4.h[1]\n"
+ "fmla z24.h, z1.h, z3.h[1]\n"
+ "fmla z28.h, z1.h, z2.h[1]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[1]\n"
+ "fmla z13.h, z0.h, z6.h[1]\n"
+ "fmla z17.h, z0.h, z5.h[1]\n"
+ "fmla z21.h, z0.h, z4.h[1]\n"
+ "fmla z25.h, z0.h, z3.h[1]\n"
+ "fmla z29.h, z0.h, z2.h[1]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[1]\n"
+ "fmla z14.h, z1.h, z6.h[1]\n"
+ "fmla z18.h, z1.h, z5.h[1]\n"
+ "fmla z22.h, z1.h, z4.h[1]\n"
+ "fmla z26.h, z1.h, z3.h[1]\n"
+ "fmla z30.h, z1.h, z2.h[1]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[1]\n"
+ "fmla z15.h, z0.h, z6.h[1]\n"
+ "fmla z19.h, z0.h, z5.h[1]\n"
+ "fmla z23.h, z0.h, z4.h[1]\n"
+ "fmla z27.h, z0.h, z3.h[1]\n"
+ "fmla z31.h, z0.h, z2.h[1]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[2]\n"
+ "fmla z12.h, z1.h, z6.h[2]\n"
+ "fmla z16.h, z1.h, z5.h[2]\n"
+ "fmla z20.h, z1.h, z4.h[2]\n"
+ "fmla z24.h, z1.h, z3.h[2]\n"
+ "fmla z28.h, z1.h, z2.h[2]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[2]\n"
+ "fmla z13.h, z0.h, z6.h[2]\n"
+ "fmla z17.h, z0.h, z5.h[2]\n"
+ "fmla z21.h, z0.h, z4.h[2]\n"
+ "fmla z25.h, z0.h, z3.h[2]\n"
+ "fmla z29.h, z0.h, z2.h[2]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[2]\n"
+ "fmla z14.h, z1.h, z6.h[2]\n"
+ "fmla z18.h, z1.h, z5.h[2]\n"
+ "fmla z22.h, z1.h, z4.h[2]\n"
+ "fmla z26.h, z1.h, z3.h[2]\n"
+ "fmla z30.h, z1.h, z2.h[2]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[2]\n"
+ "fmla z15.h, z0.h, z6.h[2]\n"
+ "fmla z19.h, z0.h, z5.h[2]\n"
+ "fmla z23.h, z0.h, z4.h[2]\n"
+ "fmla z27.h, z0.h, z3.h[2]\n"
+ "fmla z31.h, z0.h, z2.h[2]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[3]\n"
+ "fmla z12.h, z1.h, z6.h[3]\n"
+ "fmla z16.h, z1.h, z5.h[3]\n"
+ "fmla z20.h, z1.h, z4.h[3]\n"
+ "fmla z24.h, z1.h, z3.h[3]\n"
+ "fmla z28.h, z1.h, z2.h[3]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[3]\n"
+ "fmla z13.h, z0.h, z6.h[3]\n"
+ "fmla z17.h, z0.h, z5.h[3]\n"
+ "fmla z21.h, z0.h, z4.h[3]\n"
+ "fmla z25.h, z0.h, z3.h[3]\n"
+ "fmla z29.h, z0.h, z2.h[3]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[3]\n"
+ "fmla z14.h, z1.h, z6.h[3]\n"
+ "fmla z18.h, z1.h, z5.h[3]\n"
+ "fmla z22.h, z1.h, z4.h[3]\n"
+ "fmla z26.h, z1.h, z3.h[3]\n"
+ "fmla z30.h, z1.h, z2.h[3]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #4, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[3]\n"
+ "fmla z15.h, z0.h, z6.h[3]\n"
+ "fmla z19.h, z0.h, z5.h[3]\n"
+ "fmla z23.h, z0.h, z4.h[3]\n"
+ "fmla z27.h, z0.h, z3.h[3]\n"
+ "fmla z31.h, z0.h, z2.h[3]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #4, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[4]\n"
+ "fmla z12.h, z1.h, z6.h[4]\n"
+ "fmla z16.h, z1.h, z5.h[4]\n"
+ "fmla z20.h, z1.h, z4.h[4]\n"
+ "fmla z24.h, z1.h, z3.h[4]\n"
+ "fmla z28.h, z1.h, z2.h[4]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[4]\n"
+ "fmla z13.h, z0.h, z6.h[4]\n"
+ "fmla z17.h, z0.h, z5.h[4]\n"
+ "fmla z21.h, z0.h, z4.h[4]\n"
+ "fmla z25.h, z0.h, z3.h[4]\n"
+ "fmla z29.h, z0.h, z2.h[4]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[4]\n"
+ "fmla z14.h, z1.h, z6.h[4]\n"
+ "fmla z18.h, z1.h, z5.h[4]\n"
+ "fmla z22.h, z1.h, z4.h[4]\n"
+ "fmla z26.h, z1.h, z3.h[4]\n"
+ "fmla z30.h, z1.h, z2.h[4]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #5, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[4]\n"
+ "fmla z15.h, z0.h, z6.h[4]\n"
+ "fmla z19.h, z0.h, z5.h[4]\n"
+ "fmla z23.h, z0.h, z4.h[4]\n"
+ "fmla z27.h, z0.h, z3.h[4]\n"
+ "fmla z31.h, z0.h, z2.h[4]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #5, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[5]\n"
+ "fmla z12.h, z1.h, z6.h[5]\n"
+ "fmla z16.h, z1.h, z5.h[5]\n"
+ "fmla z20.h, z1.h, z4.h[5]\n"
+ "fmla z24.h, z1.h, z3.h[5]\n"
+ "fmla z28.h, z1.h, z2.h[5]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[5]\n"
+ "fmla z13.h, z0.h, z6.h[5]\n"
+ "fmla z17.h, z0.h, z5.h[5]\n"
+ "fmla z21.h, z0.h, z4.h[5]\n"
+ "fmla z25.h, z0.h, z3.h[5]\n"
+ "fmla z29.h, z0.h, z2.h[5]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #5, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[5]\n"
+ "fmla z14.h, z1.h, z6.h[5]\n"
+ "fmla z18.h, z1.h, z5.h[5]\n"
+ "fmla z22.h, z1.h, z4.h[5]\n"
+ "fmla z26.h, z1.h, z3.h[5]\n"
+ "fmla z30.h, z1.h, z2.h[5]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #6, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[5]\n"
+ "fmla z15.h, z0.h, z6.h[5]\n"
+ "fmla z19.h, z0.h, z5.h[5]\n"
+ "fmla z23.h, z0.h, z4.h[5]\n"
+ "fmla z27.h, z0.h, z3.h[5]\n"
+ "fmla z31.h, z0.h, z2.h[5]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #6, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[6]\n"
+ "fmla z12.h, z1.h, z6.h[6]\n"
+ "fmla z16.h, z1.h, z5.h[6]\n"
+ "fmla z20.h, z1.h, z4.h[6]\n"
+ "fmla z24.h, z1.h, z3.h[6]\n"
+ "fmla z28.h, z1.h, z2.h[6]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[6]\n"
+ "fmla z13.h, z0.h, z6.h[6]\n"
+ "fmla z17.h, z0.h, z5.h[6]\n"
+ "fmla z21.h, z0.h, z4.h[6]\n"
+ "fmla z25.h, z0.h, z3.h[6]\n"
+ "fmla z29.h, z0.h, z2.h[6]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[6]\n"
+ "fmla z14.h, z1.h, z6.h[6]\n"
+ "fmla z18.h, z1.h, z5.h[6]\n"
+ "fmla z22.h, z1.h, z4.h[6]\n"
+ "fmla z26.h, z1.h, z3.h[6]\n"
+ "fmla z30.h, z1.h, z2.h[6]\n"
+ "ld1h { z1.h }, p5/Z, [x12, #7, MUL VL]\n"
+ "addvl x12, x12, #8\n"
+ "fmla z11.h, z0.h, z7.h[6]\n"
+ "fmla z15.h, z0.h, z6.h[6]\n"
+ "fmla z19.h, z0.h, z5.h[6]\n"
+ "fmla z23.h, z0.h, z4.h[6]\n"
+ "fmla z27.h, z0.h, z3.h[6]\n"
+ "fmla z31.h, z0.h, z2.h[6]\n"
+ "ld1h { z0.h }, p5/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #8\n"
+ "fmla z8.h, z1.h, z7.h[7]\n"
+ "fmla z12.h, z1.h, z6.h[7]\n"
+ "fmla z16.h, z1.h, z5.h[7]\n"
+ "fmla z20.h, z1.h, z4.h[7]\n"
+ "fmla z24.h, z1.h, z3.h[7]\n"
+ "fmla z28.h, z1.h, z2.h[7]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ "fmla z9.h, z0.h, z7.h[7]\n"
+ "fmla z13.h, z0.h, z6.h[7]\n"
+ "fmla z17.h, z0.h, z5.h[7]\n"
+ "fmla z21.h, z0.h, z4.h[7]\n"
+ "fmla z25.h, z0.h, z3.h[7]\n"
+ "fmla z29.h, z0.h, z2.h[7]\n"
+ "ld1h { z0.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ "fmla z10.h, z1.h, z7.h[7]\n"
+ "fmla z14.h, z1.h, z6.h[7]\n"
+ "fmla z18.h, z1.h, z5.h[7]\n"
+ "fmla z22.h, z1.h, z4.h[7]\n"
+ "fmla z26.h, z1.h, z3.h[7]\n"
+ "fmla z30.h, z1.h, z2.h[7]\n"
+ "fmla z11.h, z0.h, z7.h[7]\n"
+ "fmla z15.h, z0.h, z6.h[7]\n"
+ "fmla z19.h, z0.h, z5.h[7]\n"
+ "fmla z23.h, z0.h, z4.h[7]\n"
+ "fmla z27.h, z0.h, z3.h[7]\n"
+ "fmla z31.h, z0.h, z2.h[7]\n"
+ "bgt 80b\n"
+ "81:" // Height 6: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "ld1rqh { z5.h }, p0/Z, [x21]\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[0]\n"
+ "fmla z12.h, z7.h, z1.h[0]\n"
+ "fmla z16.h, z7.h, z2.h[0]\n"
+ "fmla z20.h, z7.h, z3.h[0]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z24.h, z7.h, z4.h[0]\n"
+ "fmla z28.h, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z9.h, z6.h, z0.h[0]\n"
+ "fmla z13.h, z6.h, z1.h[0]\n"
+ "fmla z17.h, z6.h, z2.h[0]\n"
+ "fmla z21.h, z6.h, z3.h[0]\n"
+ "fmla z25.h, z6.h, z4.h[0]\n"
+ "fmla z29.h, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z7.h, z0.h[0]\n"
+ "fmla z14.h, z7.h, z1.h[0]\n"
+ "fmla z18.h, z7.h, z2.h[0]\n"
+ "fmla z22.h, z7.h, z3.h[0]\n"
+ "fmla z26.h, z7.h, z4.h[0]\n"
+ "fmla z30.h, z7.h, z5.h[0]\n"
+ "fmla z11.h, z6.h, z0.h[0]\n"
+ "fmla z15.h, z6.h, z1.h[0]\n"
+ "fmla z19.h, z6.h, z2.h[0]\n"
+ "fmla z23.h, z6.h, z3.h[0]\n"
+ "fmla z27.h, z6.h, z4.h[0]\n"
+ "fmla z31.h, z6.h, z5.h[0]\n"
+ "ble 82f\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[1]\n"
+ "fmla z12.h, z7.h, z1.h[1]\n"
+ "fmla z16.h, z7.h, z2.h[1]\n"
+ "fmla z20.h, z7.h, z3.h[1]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, z7.h, z4.h[1]\n"
+ "fmla z28.h, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z9.h, z6.h, z0.h[1]\n"
+ "fmla z13.h, z6.h, z1.h[1]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.h, z6.h, z2.h[1]\n"
+ "fmla z21.h, z6.h, z3.h[1]\n"
+ "fmla z25.h, z6.h, z4.h[1]\n"
+ "fmla z29.h, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z7.h, z0.h[1]\n"
+ "fmla z14.h, z7.h, z1.h[1]\n"
+ "fmla z18.h, z7.h, z2.h[1]\n"
+ "fmla z22.h, z7.h, z3.h[1]\n"
+ "fmla z26.h, z7.h, z4.h[1]\n"
+ "fmla z30.h, z7.h, z5.h[1]\n"
+ "fmla z11.h, z6.h, z0.h[1]\n"
+ "fmla z15.h, z6.h, z1.h[1]\n"
+ "fmla z19.h, z6.h, z2.h[1]\n"
+ "fmla z23.h, z6.h, z3.h[1]\n"
+ "fmla z27.h, z6.h, z4.h[1]\n"
+ "fmla z31.h, z6.h, z5.h[1]\n"
+ "ble 82f\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[2]\n"
+ "fmla z12.h, z7.h, z1.h[2]\n"
+ "fmla z16.h, z7.h, z2.h[2]\n"
+ "fmla z20.h, z7.h, z3.h[2]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, z7.h, z4.h[2]\n"
+ "fmla z28.h, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z9.h, z6.h, z0.h[2]\n"
+ "fmla z13.h, z6.h, z1.h[2]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.h, z6.h, z2.h[2]\n"
+ "fmla z21.h, z6.h, z3.h[2]\n"
+ "fmla z25.h, z6.h, z4.h[2]\n"
+ "fmla z29.h, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z7.h, z0.h[2]\n"
+ "fmla z14.h, z7.h, z1.h[2]\n"
+ "fmla z18.h, z7.h, z2.h[2]\n"
+ "fmla z22.h, z7.h, z3.h[2]\n"
+ "fmla z26.h, z7.h, z4.h[2]\n"
+ "fmla z30.h, z7.h, z5.h[2]\n"
+ "fmla z11.h, z6.h, z0.h[2]\n"
+ "fmla z15.h, z6.h, z1.h[2]\n"
+ "fmla z19.h, z6.h, z2.h[2]\n"
+ "fmla z23.h, z6.h, z3.h[2]\n"
+ "fmla z27.h, z6.h, z4.h[2]\n"
+ "fmla z31.h, z6.h, z5.h[2]\n"
+ "ble 82f\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[3]\n"
+ "fmla z12.h, z7.h, z1.h[3]\n"
+ "fmla z16.h, z7.h, z2.h[3]\n"
+ "fmla z20.h, z7.h, z3.h[3]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, z7.h, z4.h[3]\n"
+ "fmla z28.h, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z9.h, z6.h, z0.h[3]\n"
+ "fmla z13.h, z6.h, z1.h[3]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.h, z6.h, z2.h[3]\n"
+ "fmla z21.h, z6.h, z3.h[3]\n"
+ "fmla z25.h, z6.h, z4.h[3]\n"
+ "fmla z29.h, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z7.h, z0.h[3]\n"
+ "fmla z14.h, z7.h, z1.h[3]\n"
+ "fmla z18.h, z7.h, z2.h[3]\n"
+ "fmla z22.h, z7.h, z3.h[3]\n"
+ "fmla z26.h, z7.h, z4.h[3]\n"
+ "fmla z30.h, z7.h, z5.h[3]\n"
+ "fmla z11.h, z6.h, z0.h[3]\n"
+ "fmla z15.h, z6.h, z1.h[3]\n"
+ "fmla z19.h, z6.h, z2.h[3]\n"
+ "fmla z23.h, z6.h, z3.h[3]\n"
+ "fmla z27.h, z6.h, z4.h[3]\n"
+ "fmla z31.h, z6.h, z5.h[3]\n"
+ "ble 82f\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[4]\n"
+ "fmla z12.h, z7.h, z1.h[4]\n"
+ "fmla z16.h, z7.h, z2.h[4]\n"
+ "fmla z20.h, z7.h, z3.h[4]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, z7.h, z4.h[4]\n"
+ "fmla z28.h, z7.h, z5.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z9.h, z6.h, z0.h[4]\n"
+ "fmla z13.h, z6.h, z1.h[4]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.h, z6.h, z2.h[4]\n"
+ "fmla z21.h, z6.h, z3.h[4]\n"
+ "fmla z25.h, z6.h, z4.h[4]\n"
+ "fmla z29.h, z6.h, z5.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z7.h, z0.h[4]\n"
+ "fmla z14.h, z7.h, z1.h[4]\n"
+ "fmla z18.h, z7.h, z2.h[4]\n"
+ "fmla z22.h, z7.h, z3.h[4]\n"
+ "fmla z26.h, z7.h, z4.h[4]\n"
+ "fmla z30.h, z7.h, z5.h[4]\n"
+ "fmla z11.h, z6.h, z0.h[4]\n"
+ "fmla z15.h, z6.h, z1.h[4]\n"
+ "fmla z19.h, z6.h, z2.h[4]\n"
+ "fmla z23.h, z6.h, z3.h[4]\n"
+ "fmla z27.h, z6.h, z4.h[4]\n"
+ "fmla z31.h, z6.h, z5.h[4]\n"
+ "ble 82f\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[5]\n"
+ "fmla z12.h, z7.h, z1.h[5]\n"
+ "fmla z16.h, z7.h, z2.h[5]\n"
+ "fmla z20.h, z7.h, z3.h[5]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, z7.h, z4.h[5]\n"
+ "fmla z28.h, z7.h, z5.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z9.h, z6.h, z0.h[5]\n"
+ "fmla z13.h, z6.h, z1.h[5]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.h, z6.h, z2.h[5]\n"
+ "fmla z21.h, z6.h, z3.h[5]\n"
+ "fmla z25.h, z6.h, z4.h[5]\n"
+ "fmla z29.h, z6.h, z5.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z7.h, z0.h[5]\n"
+ "fmla z14.h, z7.h, z1.h[5]\n"
+ "fmla z18.h, z7.h, z2.h[5]\n"
+ "fmla z22.h, z7.h, z3.h[5]\n"
+ "fmla z26.h, z7.h, z4.h[5]\n"
+ "fmla z30.h, z7.h, z5.h[5]\n"
+ "fmla z11.h, z6.h, z0.h[5]\n"
+ "fmla z15.h, z6.h, z1.h[5]\n"
+ "fmla z19.h, z6.h, z2.h[5]\n"
+ "fmla z23.h, z6.h, z3.h[5]\n"
+ "fmla z27.h, z6.h, z4.h[5]\n"
+ "fmla z31.h, z6.h, z5.h[5]\n"
+ "ble 82f\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[6]\n"
+ "fmla z12.h, z7.h, z1.h[6]\n"
+ "fmla z16.h, z7.h, z2.h[6]\n"
+ "fmla z20.h, z7.h, z3.h[6]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.h, z7.h, z4.h[6]\n"
+ "fmla z28.h, z7.h, z5.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z9.h, z6.h, z0.h[6]\n"
+ "fmla z13.h, z6.h, z1.h[6]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.h, z6.h, z2.h[6]\n"
+ "fmla z21.h, z6.h, z3.h[6]\n"
+ "fmla z25.h, z6.h, z4.h[6]\n"
+ "fmla z29.h, z6.h, z5.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z7.h, z0.h[6]\n"
+ "fmla z14.h, z7.h, z1.h[6]\n"
+ "fmla z18.h, z7.h, z2.h[6]\n"
+ "fmla z22.h, z7.h, z3.h[6]\n"
+ "fmla z26.h, z7.h, z4.h[6]\n"
+ "fmla z30.h, z7.h, z5.h[6]\n"
+ "fmla z11.h, z6.h, z0.h[6]\n"
+ "fmla z15.h, z6.h, z1.h[6]\n"
+ "fmla z19.h, z6.h, z2.h[6]\n"
+ "fmla z23.h, z6.h, z3.h[6]\n"
+ "fmla z27.h, z6.h, z4.h[6]\n"
+ "fmla z31.h, z6.h, z5.h[6]\n"
+ "ble 82f\n"
+ "ld1h { z7.h }, p5/Z, [x12]\n"
+ "ld1h { z6.h }, p5/Z, [x11]\n"
+ "fmla z8.h, z7.h, z0.h[7]\n"
+ "fmla z12.h, z7.h, z1.h[7]\n"
+ "fmla z16.h, z7.h, z2.h[7]\n"
+ "fmla z20.h, z7.h, z3.h[7]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z24.h, z7.h, z4.h[7]\n"
+ "fmla z28.h, z7.h, z5.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z9.h, z6.h, z0.h[7]\n"
+ "fmla z13.h, z6.h, z1.h[7]\n"
+ "fmla z17.h, z6.h, z2.h[7]\n"
+ "fmla z21.h, z6.h, z3.h[7]\n"
+ "fmla z25.h, z6.h, z4.h[7]\n"
+ "fmla z29.h, z6.h, z5.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.h, z7.h, z0.h[7]\n"
+ "fmla z14.h, z7.h, z1.h[7]\n"
+ "fmla z18.h, z7.h, z2.h[7]\n"
+ "fmla z22.h, z7.h, z3.h[7]\n"
+ "fmla z26.h, z7.h, z4.h[7]\n"
+ "fmla z30.h, z7.h, z5.h[7]\n"
+ "fmla z11.h, z6.h, z0.h[7]\n"
+ "fmla z15.h, z6.h, z1.h[7]\n"
+ "fmla z19.h, z6.h, z2.h[7]\n"
+ "fmla z23.h, z6.h, z3.h[7]\n"
+ "fmla z27.h, z6.h, z4.h[7]\n"
+ "fmla z31.h, z6.h, z5.h[7]\n"
+ "82:" // Height 6: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 77b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "tbz %x[flags], #1, 83f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z0.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z1.h\n"
+ "fmin z9.h, p5/M, z9.h, z1.h\n"
+ "fmin z10.h, p5/M, z10.h, z1.h\n"
+ "fmin z11.h, p5/M, z11.h, z1.h\n"
+ "fmin z12.h, p5/M, z12.h, z1.h\n"
+ "fmin z13.h, p5/M, z13.h, z1.h\n"
+ "fmin z14.h, p5/M, z14.h, z1.h\n"
+ "fmin z15.h, p5/M, z15.h, z1.h\n"
+ "fmin z16.h, p5/M, z16.h, z1.h\n"
+ "fmin z17.h, p5/M, z17.h, z1.h\n"
+ "fmin z18.h, p5/M, z18.h, z1.h\n"
+ "fmin z19.h, p5/M, z19.h, z1.h\n"
+ "fmin z20.h, p5/M, z20.h, z1.h\n"
+ "fmin z21.h, p5/M, z21.h, z1.h\n"
+ "fmin z22.h, p5/M, z22.h, z1.h\n"
+ "fmin z23.h, p5/M, z23.h, z1.h\n"
+ "fmin z24.h, p5/M, z24.h, z1.h\n"
+ "fmin z25.h, p5/M, z25.h, z1.h\n"
+ "fmin z26.h, p5/M, z26.h, z1.h\n"
+ "fmin z27.h, p5/M, z27.h, z1.h\n"
+ "fmin z28.h, p5/M, z28.h, z1.h\n"
+ "fmin z29.h, p5/M, z29.h, z1.h\n"
+ "fmin z30.h, p5/M, z30.h, z1.h\n"
+ "fmin z31.h, p5/M, z31.h, z1.h\n"
+ "fmax z8.h, p5/M, z8.h, z0.h\n"
+ "fmax z9.h, p5/M, z9.h, z0.h\n"
+ "fmax z10.h, p5/M, z10.h, z0.h\n"
+ "fmax z11.h, p5/M, z11.h, z0.h\n"
+ "fmax z12.h, p5/M, z12.h, z0.h\n"
+ "fmax z13.h, p5/M, z13.h, z0.h\n"
+ "fmax z14.h, p5/M, z14.h, z0.h\n"
+ "fmax z15.h, p5/M, z15.h, z0.h\n"
+ "fmax z16.h, p5/M, z16.h, z0.h\n"
+ "fmax z17.h, p5/M, z17.h, z0.h\n"
+ "fmax z18.h, p5/M, z18.h, z0.h\n"
+ "fmax z19.h, p5/M, z19.h, z0.h\n"
+ "fmax z20.h, p5/M, z20.h, z0.h\n"
+ "fmax z21.h, p5/M, z21.h, z0.h\n"
+ "fmax z22.h, p5/M, z22.h, z0.h\n"
+ "fmax z23.h, p5/M, z23.h, z0.h\n"
+ "fmax z24.h, p5/M, z24.h, z0.h\n"
+ "fmax z25.h, p5/M, z25.h, z0.h\n"
+ "fmax z26.h, p5/M, z26.h, z0.h\n"
+ "fmax z27.h, p5/M, z27.h, z0.h\n"
+ "fmax z28.h, p5/M, z28.h, z0.h\n"
+ "fmax z29.h, p5/M, z29.h, z0.h\n"
+ "fmax z30.h, p5/M, z30.h, z0.h\n"
+ "fmax z31.h, p5/M, z31.h, z0.h\n"
+ "83:" // Height 6: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x23]\n"
+ "st1h { z21.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p4, [x22]\n"
+ "st1h { z25.h }, p3, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x22, #3, MUL VL]\n"
+ "st1h { z28.h }, p4, [x21]\n"
+ "st1h { z29.h }, p3, [x21, #1, MUL VL]\n"
+ "st1h { z30.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z31.h }, p1, [x21, #3, MUL VL]\n"
+ "84:" // Height 6: Writeback done
+ "dech x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 72b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 86f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "85:" // Update direct input
+ "mov x20, #0xc\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "86:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp
new file mode 100644
index 0000000000..3a93a2f7c8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const float *, \
+ size_t, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_ffhybrid_fp32_mla_6x4VL( ARGLIST );
+void sve_ffhybrid_fp32_mla_6x4VL_a64fx( ARGLIST );
+
+class cls_sve_ffhybrid_fp32_mla_6x4VL
+{
+public:
+ typedef float lhs_operand_type;
+ typedef float rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+ static unsigned int stripe_width()
+ {
+ return get_vector_length<float>() * 1;
+ }
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL1VL_BL32;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 1> transforms = {};
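+ // The single figure is an estimated MACs-per-cycle used by the gemm selection heuristics.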
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 15.27 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_ffhybrid_fp32_mla_6x4VL;
+ cls_sve_ffhybrid_fp32_mla_6x4VL(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_ffhybrid_fp32_mla_6x4VL_a64fx;
+ break;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp
new file mode 100644
index 0000000000..8e4fd4388e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp
@@ -0,0 +1,1530 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_ffhybrid_fp32_mla_6x4VL_a64fx (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, size_t B_stride, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
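+ // Everything the assembly needs is marshalled into this struct; the asm block below
+ // reads its fields through [args_ptr] using offsetof-based immediates.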
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ const float *cur_B_ptr = {};
+ size_t B_stride = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ ka.B_stride = B_stride;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
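+ // flags bits read by the assembly: bit 0 = accumulate into the existing output,
+ // bit 1 = apply the min/max clamp, bit 3 = indirect (string) input; bit 2 marks
+ // indirect output.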
+ __asm__ __volatile__(
+ "ptrue p4.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 66f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 53f\n"
+ "beq 40f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 27f\n"
+ "beq 14f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 3f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 3f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 3f\n"
+ "mov x11, x12\n"
+ "3:" // Height 1: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x14\n"
+ "cbz x15, 4f\n"
+ "ld1w { z8.s }, p4/Z, [x15]\n"
+ "ld1w { z9.s }, p4/Z, [x15, #1, MUL VL]\n"
+ "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1w { z8.s }, p3/Z, [x13]\n"
+ "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 9f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
+ "ld1w { z7.s }, p4/Z, [x11]\n"
+ "ble 11f\n"
+ "10:" // Height 1: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z17.s }, p4/Z, [x10]\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ld1w { z7.s }, p4/Z, [x11]\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z17.s }, p4/Z, [x10]\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "bne 7b\n"
+ "tbz %x[flags], #1, 12f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z17.s\n"
+ "fmin z9.s, p4/M, z9.s, z17.s\n"
+ "fmin z10.s, p4/M, z10.s, z17.s\n"
+ "fmin z11.s, p4/M, z11.s, z17.s\n"
+ "fmax z8.s, p4/M, z8.s, z16.s\n"
+ "fmax z9.s, p4/M, z9.s, z16.s\n"
+ "fmax z10.s, p4/M, z10.s, z16.s\n"
+ "fmax z11.s, p4/M, z11.s, z16.s\n"
+ "12:" // Height 1: No activation
+ "st1w { z8.s }, p3, [x13]\n"
+ "st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "13:" // Height 1: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 2b\n"
+ "b 80f\n"
+ "14:" // Height 2
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "15:" // Height 2: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 16f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 16f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 16f\n"
+ "mov x11, x12\n"
+ "16:" // Height 2: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x14\n"
+ "cbz x15, 17f\n"
+ "ld1w { z8.s }, p4/Z, [x15]\n"
+ "ld1w { z9.s }, p4/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x15, x15, #4\n"
+ "b 19f\n"
+ "17:" // Height 2: no bias
+ "tbz %x[flags], #0, 18f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x13, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x13]\n"
+ "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x20]\n"
+ "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 19f\n"
+ "18:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "19:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "20:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 21f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 22f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "b 22f\n"
+ "21:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "22:" // Height 2: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
+ "ld1w { z7.s }, p4/Z, [x11]\n"
+ "ble 24f\n"
+ "23:" // Height 2: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "ld1w { z17.s }, p4/Z, [x10]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
+ "addvl x11, x11, #1\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z14.s, p4/M, z17.s, z1.s\n"
+ "add x25, x25, #0x4\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
+ "fmla z15.s, p4/M, z16.s, z1.s\n"
+ "addvl x10, x10, #1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "addvl x9, x9, #1\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
+ "ld1w { z7.s }, p4/Z, [x11]\n"
+ "bgt 23b\n"
+ "24:" // Height 2: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "ld1w { z17.s }, p4/Z, [x10]\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z14.s, p4/M, z17.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
+ "fmla z15.s, p4/M, z16.s, z1.s\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "bne 20b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "tbz %x[flags], #1, 25f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z17.s\n"
+ "fmin z9.s, p4/M, z9.s, z17.s\n"
+ "fmin z10.s, p4/M, z10.s, z17.s\n"
+ "fmin z11.s, p4/M, z11.s, z17.s\n"
+ "fmin z12.s, p4/M, z12.s, z17.s\n"
+ "fmin z13.s, p4/M, z13.s, z17.s\n"
+ "fmin z14.s, p4/M, z14.s, z17.s\n"
+ "fmin z15.s, p4/M, z15.s, z17.s\n"
+ "fmax z8.s, p4/M, z8.s, z16.s\n"
+ "fmax z9.s, p4/M, z9.s, z16.s\n"
+ "fmax z10.s, p4/M, z10.s, z16.s\n"
+ "fmax z11.s, p4/M, z11.s, z16.s\n"
+ "fmax z12.s, p4/M, z12.s, z16.s\n"
+ "fmax z13.s, p4/M, z13.s, z16.s\n"
+ "fmax z14.s, p4/M, z14.s, z16.s\n"
+ "fmax z15.s, p4/M, z15.s, z16.s\n"
+ "25:" // Height 2: No activation
+ "st1w { z8.s }, p3, [x13]\n"
+ "st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "26:" // Height 2: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 15b\n"
+ "b 80f\n"
+ "27:" // Height 3
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "28:" // Height 3: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 29f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 29f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 29f\n"
+ "mov x11, x12\n"
+ "29:" // Height 3: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x14\n"
+ "cbz x15, 30f\n"
+ "ld1w { z8.s }, p4/Z, [x15]\n"
+ "ld1w { z9.s }, p4/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 32f\n"
+ "30:" // Height 3: no bias
+ "tbz %x[flags], #0, 31f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x13, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x13]\n"
+ "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x21]\n"
+ "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x20]\n"
+ "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 32f\n"
+ "31:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "32:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "33:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 34f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 35f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "b 35f\n"
+ "34:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "35:" // Height 3: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
+ "ld1w { z7.s }, p4/Z, [x11]\n"
+ "ble 37f\n"
+ "36:" // Height 3: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z21.s }, p4/Z, [x10]\n"
+ "add x26, x26, #0x4\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "ld1w { z20.s }, p4/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "add x25, x25, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "fmla z10.s, p4/M, z21.s, z0.s\n"
+ "fmla z14.s, p4/M, z21.s, z1.s\n"
+ "fmla z18.s, p4/M, z21.s, z2.s\n"
+ "fmla z11.s, p4/M, z20.s, z0.s\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.s, p4/M, z20.s, z1.s\n"
+ "fmla z19.s, p4/M, z20.s, z2.s\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
+ "ld1w { z7.s }, p4/Z, [x11]\n"
+ "bgt 36b\n"
+ "37:" // Height 3: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z21.s }, p4/Z, [x10]\n"
+ "cmp x28, x20\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "ld1w { z20.s }, p4/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z10.s, p4/M, z21.s, z0.s\n"
+ "fmla z14.s, p4/M, z21.s, z1.s\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.s, p4/M, z21.s, z2.s\n"
+ "fmla z11.s, p4/M, z20.s, z0.s\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.s, p4/M, z20.s, z1.s\n"
+ "fmla z19.s, p4/M, z20.s, z2.s\n"
+ "bne 33b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "tbz %x[flags], #1, 38f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z20.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z21.s\n"
+ "fmin z9.s, p4/M, z9.s, z21.s\n"
+ "fmin z10.s, p4/M, z10.s, z21.s\n"
+ "fmin z11.s, p4/M, z11.s, z21.s\n"
+ "fmin z12.s, p4/M, z12.s, z21.s\n"
+ "fmin z13.s, p4/M, z13.s, z21.s\n"
+ "fmin z14.s, p4/M, z14.s, z21.s\n"
+ "fmin z15.s, p4/M, z15.s, z21.s\n"
+ "fmin z16.s, p4/M, z16.s, z21.s\n"
+ "fmin z17.s, p4/M, z17.s, z21.s\n"
+ "fmin z18.s, p4/M, z18.s, z21.s\n"
+ "fmin z19.s, p4/M, z19.s, z21.s\n"
+ "fmax z8.s, p4/M, z8.s, z20.s\n"
+ "fmax z9.s, p4/M, z9.s, z20.s\n"
+ "fmax z10.s, p4/M, z10.s, z20.s\n"
+ "fmax z11.s, p4/M, z11.s, z20.s\n"
+ "fmax z12.s, p4/M, z12.s, z20.s\n"
+ "fmax z13.s, p4/M, z13.s, z20.s\n"
+ "fmax z14.s, p4/M, z14.s, z20.s\n"
+ "fmax z15.s, p4/M, z15.s, z20.s\n"
+ "fmax z16.s, p4/M, z16.s, z20.s\n"
+ "fmax z17.s, p4/M, z17.s, z20.s\n"
+ "fmax z18.s, p4/M, z18.s, z20.s\n"
+ "fmax z19.s, p4/M, z19.s, z20.s\n"
+ "38:" // Height 3: No activation
+ "st1w { z8.s }, p3, [x13]\n"
+ "st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "39:" // Height 3: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 28b\n"
+ "b 80f\n"
+ "40:" // Height 4
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "41:" // Height 4: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 42f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 42f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 42f\n"
+ "mov x11, x12\n"
+ "42:" // Height 4: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x14\n"
+ "cbz x15, 43f\n"
+ "ld1w { z8.s }, p4/Z, [x15]\n"
+ "ld1w { z9.s }, p4/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 45f\n"
+ "43:" // Height 4: no bias
+ "tbz %x[flags], #0, 44f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x13, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x13]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x22]\n"
+ "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x21]\n"
+ "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x20]\n"
+ "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 45f\n"
+ "44:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "45:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "46:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 47f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 48f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "b 48f\n"
+ "47:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "48:" // Height 4: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
+ "ld1w { z7.s }, p4/Z, [x11]\n"
+ "ble 50f\n"
+ "49:" // Height 4: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "ld1w { z25.s }, p4/Z, [x10]\n"
+ "add x26, x26, #0x4\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "subs x27, x27, #0x1\n"
+ "add x25, x25, #0x4\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "ld1w { z24.s }, p4/Z, [x9]\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "fmla z10.s, p4/M, z25.s, z0.s\n"
+ "fmla z14.s, p4/M, z25.s, z1.s\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.s, p4/M, z25.s, z2.s\n"
+ "fmla z22.s, p4/M, z25.s, z3.s\n"
+ "addvl x9, x9, #1\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
+ "fmla z11.s, p4/M, z24.s, z0.s\n"
+ "fmla z15.s, p4/M, z24.s, z1.s\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "fmla z19.s, p4/M, z24.s, z2.s\n"
+ "fmla z23.s, p4/M, z24.s, z3.s\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1w { z7.s }, p4/Z, [x11]\n"
+ "bgt 49b\n"
+ "50:" // Height 4: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "ld1w { z25.s }, p4/Z, [x10]\n"
+ "cmp x28, x20\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "ld1w { z24.s }, p4/Z, [x9]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z10.s, p4/M, z25.s, z0.s\n"
+ "fmla z14.s, p4/M, z25.s, z1.s\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.s, p4/M, z25.s, z2.s\n"
+ "fmla z22.s, p4/M, z25.s, z3.s\n"
+ "fmla z11.s, p4/M, z24.s, z0.s\n"
+ "fmla z15.s, p4/M, z24.s, z1.s\n"
+ "fmla z19.s, p4/M, z24.s, z2.s\n"
+ "fmla z23.s, p4/M, z24.s, z3.s\n"
+ "bne 46b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "tbz %x[flags], #1, 51f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z24.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z25.s\n"
+ "fmin z9.s, p4/M, z9.s, z25.s\n"
+ "fmin z10.s, p4/M, z10.s, z25.s\n"
+ "fmin z11.s, p4/M, z11.s, z25.s\n"
+ "fmin z12.s, p4/M, z12.s, z25.s\n"
+ "fmin z13.s, p4/M, z13.s, z25.s\n"
+ "fmin z14.s, p4/M, z14.s, z25.s\n"
+ "fmin z15.s, p4/M, z15.s, z25.s\n"
+ "fmin z16.s, p4/M, z16.s, z25.s\n"
+ "fmin z17.s, p4/M, z17.s, z25.s\n"
+ "fmin z18.s, p4/M, z18.s, z25.s\n"
+ "fmin z19.s, p4/M, z19.s, z25.s\n"
+ "fmin z20.s, p4/M, z20.s, z25.s\n"
+ "fmin z21.s, p4/M, z21.s, z25.s\n"
+ "fmin z22.s, p4/M, z22.s, z25.s\n"
+ "fmin z23.s, p4/M, z23.s, z25.s\n"
+ "fmax z8.s, p4/M, z8.s, z24.s\n"
+ "fmax z9.s, p4/M, z9.s, z24.s\n"
+ "fmax z10.s, p4/M, z10.s, z24.s\n"
+ "fmax z11.s, p4/M, z11.s, z24.s\n"
+ "fmax z12.s, p4/M, z12.s, z24.s\n"
+ "fmax z13.s, p4/M, z13.s, z24.s\n"
+ "fmax z14.s, p4/M, z14.s, z24.s\n"
+ "fmax z15.s, p4/M, z15.s, z24.s\n"
+ "fmax z16.s, p4/M, z16.s, z24.s\n"
+ "fmax z17.s, p4/M, z17.s, z24.s\n"
+ "fmax z18.s, p4/M, z18.s, z24.s\n"
+ "fmax z19.s, p4/M, z19.s, z24.s\n"
+ "fmax z20.s, p4/M, z20.s, z24.s\n"
+ "fmax z21.s, p4/M, z21.s, z24.s\n"
+ "fmax z22.s, p4/M, z22.s, z24.s\n"
+ "fmax z23.s, p4/M, z23.s, z24.s\n"
+ "51:" // Height 4: No activation
+ "st1w { z8.s }, p3, [x13]\n"
+ "st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x23]\n"
+ "st1w { z21.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x23, #3, MUL VL]\n"
+ "52:" // Height 4: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 41b\n"
+ "b 80f\n"
+ "53:" // Height 5
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "54:" // Height 5: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 55f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 55f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 55f\n"
+ "mov x11, x12\n"
+ "55:" // Height 5: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x14\n"
+ "cbz x15, 56f\n"
+ "ld1w { z8.s }, p4/Z, [x15]\n"
+ "ld1w { z9.s }, p4/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 58f\n"
+ "56:" // Height 5: no bias
+ "tbz %x[flags], #0, 57f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x13]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 58f\n"
+ "57:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "58:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "59:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 60f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 61f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "b 61f\n"
+ "60:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "61:" // Height 5: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
+ "ld1w { z7.s }, p4/Z, [x11]\n"
+ "ble 63f\n"
+ "62:" // Height 5: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.s, p4/M, z6.s, z4.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z29.s }, p4/Z, [x10]\n"
+ "add x25, x25, #0x4\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "fmla z25.s, p4/M, z7.s, z4.s\n"
+ "ld1w { z28.s }, p4/Z, [x9]\n"
+ "add x22, x22, #0x4\n"
+ "fmla z10.s, p4/M, z29.s, z0.s\n"
+ "fmla z14.s, p4/M, z29.s, z1.s\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.s, p4/M, z29.s, z2.s\n"
+ "fmla z22.s, p4/M, z29.s, z3.s\n"
+ "fmla z26.s, p4/M, z29.s, z4.s\n"
+ "fmla z11.s, p4/M, z28.s, z0.s\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
+ "fmla z15.s, p4/M, z28.s, z1.s\n"
+ "fmla z19.s, p4/M, z28.s, z2.s\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "fmla z23.s, p4/M, z28.s, z3.s\n"
+ "fmla z27.s, p4/M, z28.s, z4.s\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1w { z7.s }, p4/Z, [x11]\n"
+ "bgt 62b\n"
+ "63:" // Height 5: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "cmp x28, x20\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.s, p4/M, z6.s, z4.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z29.s }, p4/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "fmla z25.s, p4/M, z7.s, z4.s\n"
+ "ld1w { z28.s }, p4/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.s, p4/M, z29.s, z0.s\n"
+ "fmla z14.s, p4/M, z29.s, z1.s\n"
+ "fmla z18.s, p4/M, z29.s, z2.s\n"
+ "fmla z22.s, p4/M, z29.s, z3.s\n"
+ "fmla z26.s, p4/M, z29.s, z4.s\n"
+ "fmla z11.s, p4/M, z28.s, z0.s\n"
+ "fmla z15.s, p4/M, z28.s, z1.s\n"
+ "fmla z19.s, p4/M, z28.s, z2.s\n"
+ "fmla z23.s, p4/M, z28.s, z3.s\n"
+ "fmla z27.s, p4/M, z28.s, z4.s\n"
+ "bne 59b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "tbz %x[flags], #1, 64f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z29.s }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z28.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z29.s\n"
+ "fmin z9.s, p4/M, z9.s, z29.s\n"
+ "fmin z10.s, p4/M, z10.s, z29.s\n"
+ "fmin z11.s, p4/M, z11.s, z29.s\n"
+ "fmin z12.s, p4/M, z12.s, z29.s\n"
+ "fmin z13.s, p4/M, z13.s, z29.s\n"
+ "fmin z14.s, p4/M, z14.s, z29.s\n"
+ "fmin z15.s, p4/M, z15.s, z29.s\n"
+ "fmin z16.s, p4/M, z16.s, z29.s\n"
+ "fmin z17.s, p4/M, z17.s, z29.s\n"
+ "fmin z18.s, p4/M, z18.s, z29.s\n"
+ "fmin z19.s, p4/M, z19.s, z29.s\n"
+ "fmin z20.s, p4/M, z20.s, z29.s\n"
+ "fmin z21.s, p4/M, z21.s, z29.s\n"
+ "fmin z22.s, p4/M, z22.s, z29.s\n"
+ "fmin z23.s, p4/M, z23.s, z29.s\n"
+ "fmin z24.s, p4/M, z24.s, z29.s\n"
+ "fmin z25.s, p4/M, z25.s, z29.s\n"
+ "fmin z26.s, p4/M, z26.s, z29.s\n"
+ "fmin z27.s, p4/M, z27.s, z29.s\n"
+ "fmax z8.s, p4/M, z8.s, z28.s\n"
+ "fmax z9.s, p4/M, z9.s, z28.s\n"
+ "fmax z10.s, p4/M, z10.s, z28.s\n"
+ "fmax z11.s, p4/M, z11.s, z28.s\n"
+ "fmax z12.s, p4/M, z12.s, z28.s\n"
+ "fmax z13.s, p4/M, z13.s, z28.s\n"
+ "fmax z14.s, p4/M, z14.s, z28.s\n"
+ "fmax z15.s, p4/M, z15.s, z28.s\n"
+ "fmax z16.s, p4/M, z16.s, z28.s\n"
+ "fmax z17.s, p4/M, z17.s, z28.s\n"
+ "fmax z18.s, p4/M, z18.s, z28.s\n"
+ "fmax z19.s, p4/M, z19.s, z28.s\n"
+ "fmax z20.s, p4/M, z20.s, z28.s\n"
+ "fmax z21.s, p4/M, z21.s, z28.s\n"
+ "fmax z22.s, p4/M, z22.s, z28.s\n"
+ "fmax z23.s, p4/M, z23.s, z28.s\n"
+ "fmax z24.s, p4/M, z24.s, z28.s\n"
+ "fmax z25.s, p4/M, z25.s, z28.s\n"
+ "fmax z26.s, p4/M, z26.s, z28.s\n"
+ "fmax z27.s, p4/M, z27.s, z28.s\n"
+ "64:" // Height 5: No activation
+ "st1w { z8.s }, p3, [x13]\n"
+ "st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x23]\n"
+ "st1w { z21.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x22]\n"
+ "st1w { z25.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x22, #3, MUL VL]\n"
+ "65:" // Height 5: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 54b\n"
+ "b 80f\n"
+ "66:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x18\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
+ "67:" // Height 6: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 68f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 68f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 68f\n"
+ "mov x11, x12\n"
+ "68:" // Height 6: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x14\n"
+ "cbz x15, 69f\n"
+ "ld1w { z8.s }, p4/Z, [x15]\n"
+ "ld1w { z9.s }, p4/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 71f\n"
+ "69:" // Height 6: no bias
+ "tbz %x[flags], #0, 70f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x13, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x13]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x24]\n"
+ "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x22]\n"
+ "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x21]\n"
+ "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p3/Z, [x20]\n"
+ "ld1w { z29.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 71f\n"
+ "70:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "71:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "72:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 73f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 74f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
+ "b 74f\n"
+ "73:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
+ "74:" // Height 6: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1rw { z5.s }, p4/Z, [x21]\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
+ "ld1w { z7.s }, p4/Z, [x11]\n"
+ "ble 76f\n"
+ "75:" // Height 6: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.s, p4/M, z6.s, z4.s\n"
+ "fmla z28.s, p4/M, z6.s, z5.s\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "add x25, x25, #0x4\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "add x22, x22, #0x4\n"
+ "add x21, x21, #0x4\n"
+ "fmla z25.s, p4/M, z7.s, z4.s\n"
+ "fmla z29.s, p4/M, z7.s, z5.s\n"
+ "ld1w { z7.s }, p4/Z, [x9]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.s, p4/M, z6.s, z2.s\n"
+ "fmla z22.s, p4/M, z6.s, z3.s\n"
+ "fmla z26.s, p4/M, z6.s, z4.s\n"
+ "fmla z30.s, p4/M, z6.s, z5.s\n"
+ "ld1w { z6.s }, p4/Z, [x12]\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "fmla z27.s, p4/M, z7.s, z4.s\n"
+ "fmla z31.s, p4/M, z7.s, z5.s\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1rw { z5.s }, p4/Z, [x21]\n"
+ "ld1w { z7.s }, p4/Z, [x11]\n"
+ "bgt 75b\n"
+ "76:" // Height 6: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "cmp x28, x20\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.s, p4/M, z6.s, z4.s\n"
+ "fmla z28.s, p4/M, z6.s, z5.s\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "fmla z25.s, p4/M, z7.s, z4.s\n"
+ "fmla z29.s, p4/M, z7.s, z5.s\n"
+ "ld1w { z7.s }, p4/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z18.s, p4/M, z6.s, z2.s\n"
+ "fmla z22.s, p4/M, z6.s, z3.s\n"
+ "fmla z26.s, p4/M, z6.s, z4.s\n"
+ "fmla z30.s, p4/M, z6.s, z5.s\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "fmla z27.s, p4/M, z7.s, z4.s\n"
+ "fmla z31.s, p4/M, z7.s, z5.s\n"
+ "bne 72b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "tbz %x[flags], #1, 77f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z1.s\n"
+ "fmin z9.s, p4/M, z9.s, z1.s\n"
+ "fmin z10.s, p4/M, z10.s, z1.s\n"
+ "fmin z11.s, p4/M, z11.s, z1.s\n"
+ "fmin z12.s, p4/M, z12.s, z1.s\n"
+ "fmin z13.s, p4/M, z13.s, z1.s\n"
+ "fmin z14.s, p4/M, z14.s, z1.s\n"
+ "fmin z15.s, p4/M, z15.s, z1.s\n"
+ "fmin z16.s, p4/M, z16.s, z1.s\n"
+ "fmin z17.s, p4/M, z17.s, z1.s\n"
+ "fmin z18.s, p4/M, z18.s, z1.s\n"
+ "fmin z19.s, p4/M, z19.s, z1.s\n"
+ "fmin z20.s, p4/M, z20.s, z1.s\n"
+ "fmin z21.s, p4/M, z21.s, z1.s\n"
+ "fmin z22.s, p4/M, z22.s, z1.s\n"
+ "fmin z23.s, p4/M, z23.s, z1.s\n"
+ "fmin z24.s, p4/M, z24.s, z1.s\n"
+ "fmin z25.s, p4/M, z25.s, z1.s\n"
+ "fmin z26.s, p4/M, z26.s, z1.s\n"
+ "fmin z27.s, p4/M, z27.s, z1.s\n"
+ "fmin z28.s, p4/M, z28.s, z1.s\n"
+ "fmin z29.s, p4/M, z29.s, z1.s\n"
+ "fmin z30.s, p4/M, z30.s, z1.s\n"
+ "fmin z31.s, p4/M, z31.s, z1.s\n"
+ "fmax z8.s, p4/M, z8.s, z0.s\n"
+ "fmax z9.s, p4/M, z9.s, z0.s\n"
+ "fmax z10.s, p4/M, z10.s, z0.s\n"
+ "fmax z11.s, p4/M, z11.s, z0.s\n"
+ "fmax z12.s, p4/M, z12.s, z0.s\n"
+ "fmax z13.s, p4/M, z13.s, z0.s\n"
+ "fmax z14.s, p4/M, z14.s, z0.s\n"
+ "fmax z15.s, p4/M, z15.s, z0.s\n"
+ "fmax z16.s, p4/M, z16.s, z0.s\n"
+ "fmax z17.s, p4/M, z17.s, z0.s\n"
+ "fmax z18.s, p4/M, z18.s, z0.s\n"
+ "fmax z19.s, p4/M, z19.s, z0.s\n"
+ "fmax z20.s, p4/M, z20.s, z0.s\n"
+ "fmax z21.s, p4/M, z21.s, z0.s\n"
+ "fmax z22.s, p4/M, z22.s, z0.s\n"
+ "fmax z23.s, p4/M, z23.s, z0.s\n"
+ "fmax z24.s, p4/M, z24.s, z0.s\n"
+ "fmax z25.s, p4/M, z25.s, z0.s\n"
+ "fmax z26.s, p4/M, z26.s, z0.s\n"
+ "fmax z27.s, p4/M, z27.s, z0.s\n"
+ "fmax z28.s, p4/M, z28.s, z0.s\n"
+ "fmax z29.s, p4/M, z29.s, z0.s\n"
+ "fmax z30.s, p4/M, z30.s, z0.s\n"
+ "fmax z31.s, p4/M, z31.s, z0.s\n"
+ "77:" // Height 6: No activation
+ "st1w { z8.s }, p3, [x13]\n"
+ "st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x23]\n"
+ "st1w { z21.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x22]\n"
+ "st1w { z25.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z28.s }, p3, [x21]\n"
+ "st1w { z29.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p0, [x21, #3, MUL VL]\n"
+ "78:" // Height 6: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 67b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 80f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 79f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "79:" // Update direct input
+ "mov x20, #0x18\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "80:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..b1ab31e618
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp
@@ -0,0 +1,2310 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
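+// Fixed-format ("ff") hybrid FP32 GEMM kernel using predicated SVE FMLA,
+// blocked as up to 6 rows of A against 4 vector-lengths of B columns
+// (hence 6x4VL); row counts are dispatched to specialised Height 1..6 paths.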
+void sve_ffhybrid_fp32_mla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, size_t B_stride, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ const float *cur_B_ptr = {};
+ size_t B_stride = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
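+ // The inline asm below reads these fields through compile-time
+ // offsetof(KernelArgs, ...) operands (the %[offsetof_*] names in its
+ // operand list) rather than by passing each field separately.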
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ ka.B_stride = B_stride;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
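+ // flags bit usage (bits 0, 1 and 3 are tested with tbz in the asm below):
+ // bit 0: accumulate into the existing output
+ // bit 1: apply the min/max activation clamp
+ // bit 2: output is indirect
+ // bit 3: input is indirect (per-string pointer arrays)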
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 71f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 57f\n"
+ "beq 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 3f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 3f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 3f\n"
+ "mov x11, x12\n"
+ "3:" // Height 1: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 4f\n"
+ "ld1w { z8.s }, p5/Z, [x15]\n"
+ "ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 9f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "cmp x27, #0x4\n"
+ "ble 11f\n"
+ "10:" // Height 1: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1w { z16.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z9.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10]\n"
+ "fmla z10.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "fmla z11.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z8.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z9.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z10.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z11.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z8.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z9.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z10.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z11.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "fmla z8.s, z16.s, z0.s[3]\n"
+ "ld1w { z16.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "fmla z9.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "sub x27, x27, #0x4\n"
+ "ld1w { z16.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "cmp x27, #0x4\n"
+ "fmla z10.s, z17.s, z0.s[3]\n"
+ "fmla z11.s, z16.s, z0.s[3]\n"
+ "add x26, x26, #0x10\n"
+ "addvl x12, x12, #4\n"
+ "addvl x11, x11, #4\n"
+ "addvl x10, x10, #4\n"
+ "addvl x9, x9, #4\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1w { z16.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z9.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z17.s, z0.s[0]\n"
+ "fmla z11.s, z16.s, z0.s[0]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 12f\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[1]\n"
+ "fmla z9.s, z16.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.s, z17.s, z0.s[1]\n"
+ "fmla z11.s, z16.s, z0.s[1]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 12f\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[2]\n"
+ "fmla z9.s, z16.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.s, z17.s, z0.s[2]\n"
+ "fmla z11.s, z16.s, z0.s[2]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 12f\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[3]\n"
+ "fmla z9.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z17.s, z0.s[3]\n"
+ "fmla z11.s, z16.s, z0.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "12:" // Height 1: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 7b\n"
+ "tbz %x[flags], #1, 13f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
+ "13:" // Height 1: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "14:" // Height 1: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 2b\n"
+ "b 86f\n"
+ "15:" // Height 2
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "16:" // Height 2: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 17f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 17f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 17f\n"
+ "mov x11, x12\n"
+ "17:" // Height 2: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 18f\n"
+ "ld1w { z8.s }, p5/Z, [x15]\n"
+ "ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x15, x15, #4\n"
+ "b 20f\n"
+ "18:" // Height 2: no bias
+ "tbz %x[flags], #0, 19f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x13, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 20f\n"
+ "19:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "20:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "21:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 22f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 23f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "b 23f\n"
+ "22:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "23:" // Height 2: input setup done
+ "cmp x27, #0x4\n"
+ "ble 25f\n"
+ "24:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x4\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z1.s[0]\n"
+ "fmla z12.s, z17.s, z0.s[0]\n"
+ "fmla z9.s, z16.s, z1.s[0]\n"
+ "fmla z13.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z17.s, z1.s[0]\n"
+ "fmla z14.s, z17.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "cmp x27, #0x4\n"
+ "fmla z11.s, z16.s, z1.s[0]\n"
+ "fmla z15.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z8.s, z17.s, z1.s[1]\n"
+ "fmla z12.s, z17.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z9.s, z16.s, z1.s[1]\n"
+ "fmla z13.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.s, z17.s, z1.s[1]\n"
+ "fmla z14.s, z17.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z11.s, z16.s, z1.s[1]\n"
+ "fmla z15.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.s, z17.s, z1.s[2]\n"
+ "fmla z12.s, z17.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z16.s, z1.s[2]\n"
+ "fmla z13.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.s, z17.s, z1.s[2]\n"
+ "fmla z14.s, z17.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "fmla z11.s, z16.s, z1.s[2]\n"
+ "fmla z15.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "fmla z8.s, z17.s, z1.s[3]\n"
+ "fmla z12.s, z17.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z9.s, z16.s, z1.s[3]\n"
+ "fmla z13.s, z16.s, z0.s[3]\n"
+ "ld1w { z16.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.s, z17.s, z1.s[3]\n"
+ "fmla z14.s, z17.s, z0.s[3]\n"
+ "fmla z11.s, z16.s, z1.s[3]\n"
+ "fmla z15.s, z16.s, z0.s[3]\n"
+ "bgt 24b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[0]\n"
+ "fmla z12.s, z17.s, z1.s[0]\n"
+ "fmla z9.s, z16.s, z0.s[0]\n"
+ "fmla z13.s, z16.s, z1.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z17.s, z0.s[0]\n"
+ "fmla z14.s, z17.s, z1.s[0]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z11.s, z16.s, z0.s[0]\n"
+ "fmla z15.s, z16.s, z1.s[0]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 26f\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[1]\n"
+ "fmla z12.s, z17.s, z1.s[1]\n"
+ "fmla z9.s, z16.s, z0.s[1]\n"
+ "fmla z13.s, z16.s, z1.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.s, z17.s, z0.s[1]\n"
+ "fmla z14.s, z17.s, z1.s[1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z11.s, z16.s, z0.s[1]\n"
+ "fmla z15.s, z16.s, z1.s[1]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 26f\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[2]\n"
+ "fmla z12.s, z17.s, z1.s[2]\n"
+ "fmla z9.s, z16.s, z0.s[2]\n"
+ "fmla z13.s, z16.s, z1.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.s, z17.s, z0.s[2]\n"
+ "fmla z14.s, z17.s, z1.s[2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z11.s, z16.s, z0.s[2]\n"
+ "fmla z15.s, z16.s, z1.s[2]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ble 26f\n"
+ "ld1w { z17.s }, p5/Z, [x12]\n"
+ "ld1w { z16.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z17.s, z0.s[3]\n"
+ "fmla z12.s, z17.s, z1.s[3]\n"
+ "fmla z9.s, z16.s, z0.s[3]\n"
+ "fmla z13.s, z16.s, z1.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z17.s, z0.s[3]\n"
+ "fmla z14.s, z17.s, z1.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z11.s, z16.s, z0.s[3]\n"
+ "fmla z15.s, z16.s, z1.s[3]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 21b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmin z12.s, p5/M, z12.s, z17.s\n"
+ "fmin z13.s, p5/M, z13.s, z17.s\n"
+ "fmin z14.s, p5/M, z14.s, z17.s\n"
+ "fmin z15.s, p5/M, z15.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
+ "fmax z12.s, p5/M, z12.s, z16.s\n"
+ "fmax z13.s, p5/M, z13.s, z16.s\n"
+ "fmax z14.s, p5/M, z14.s, z16.s\n"
+ "fmax z15.s, p5/M, z15.s, z16.s\n"
+ "27:" // Height 2: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "28:" // Height 2: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 16b\n"
+ "b 86f\n"
+ "29:" // Height 3
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "30:" // Height 3: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 31f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 31f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 31f\n"
+ "mov x11, x12\n"
+ "31:" // Height 3: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 32f\n"
+ "ld1w { z8.s }, p5/Z, [x15]\n"
+ "ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 34f\n"
+ "32:" // Height 3: no bias
+ "tbz %x[flags], #0, 33f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x13, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20]\n"
+ "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 34f\n"
+ "33:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 37f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "37:" // Height 3: input setup done
+ "cmp x27, #0x4\n"
+ "ble 39f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x24]\n"
+ "ld1w { z21.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z21.s, z2.s[0]\n"
+ "fmla z12.s, z21.s, z1.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x11]\n"
+ "fmla z16.s, z21.s, z0.s[0]\n"
+ "fmla z9.s, z20.s, z2.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "fmla z13.s, z20.s, z1.s[0]\n"
+ "fmla z17.s, z20.s, z0.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x9]\n"
+ "cmp x27, #0x4\n"
+ "fmla z10.s, z21.s, z2.s[0]\n"
+ "fmla z14.s, z21.s, z1.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "fmla z18.s, z21.s, z0.s[0]\n"
+ "fmla z11.s, z20.s, z2.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z15.s, z20.s, z1.s[0]\n"
+ "fmla z19.s, z20.s, z0.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.s, z21.s, z2.s[1]\n"
+ "fmla z12.s, z21.s, z1.s[1]\n"
+ "fmla z16.s, z21.s, z0.s[1]\n"
+ "fmla z9.s, z20.s, z2.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[1]\n"
+ "ld1w { z20.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.s, z21.s, z2.s[1]\n"
+ "fmla z14.s, z21.s, z1.s[1]\n"
+ "fmla z18.s, z21.s, z0.s[1]\n"
+ "fmla z11.s, z20.s, z2.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z15.s, z20.s, z1.s[1]\n"
+ "fmla z19.s, z20.s, z0.s[1]\n"
+ "ld1w { z20.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.s, z21.s, z2.s[2]\n"
+ "fmla z12.s, z21.s, z1.s[2]\n"
+ "fmla z16.s, z21.s, z0.s[2]\n"
+ "fmla z9.s, z20.s, z2.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[2]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "ld1w { z20.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.s, z21.s, z2.s[2]\n"
+ "fmla z14.s, z21.s, z1.s[2]\n"
+ "fmla z18.s, z21.s, z0.s[2]\n"
+ "fmla z11.s, z20.s, z2.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "fmla z15.s, z20.s, z1.s[2]\n"
+ "fmla z19.s, z20.s, z0.s[2]\n"
+ "ld1w { z20.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "fmla z8.s, z21.s, z2.s[3]\n"
+ "fmla z12.s, z21.s, z1.s[3]\n"
+ "fmla z16.s, z21.s, z0.s[3]\n"
+ "fmla z9.s, z20.s, z2.s[3]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z13.s, z20.s, z1.s[3]\n"
+ "fmla z17.s, z20.s, z0.s[3]\n"
+ "ld1w { z20.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.s, z21.s, z2.s[3]\n"
+ "fmla z14.s, z21.s, z1.s[3]\n"
+ "fmla z18.s, z21.s, z0.s[3]\n"
+ "fmla z11.s, z20.s, z2.s[3]\n"
+ "fmla z15.s, z20.s, z1.s[3]\n"
+ "fmla z19.s, z20.s, z0.s[3]\n"
+ "bgt 38b\n"
+ "39:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1w { z21.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z21.s, z0.s[0]\n"
+ "fmla z12.s, z21.s, z1.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x11]\n"
+ "fmla z16.s, z21.s, z2.s[0]\n"
+ "fmla z9.s, z20.s, z0.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "fmla z13.s, z20.s, z1.s[0]\n"
+ "fmla z17.s, z20.s, z2.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z10.s, z21.s, z0.s[0]\n"
+ "fmla z14.s, z21.s, z1.s[0]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.s, z21.s, z2.s[0]\n"
+ "fmla z11.s, z20.s, z0.s[0]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.s, z20.s, z1.s[0]\n"
+ "fmla z19.s, z20.s, z2.s[0]\n"
+ "ble 40f\n"
+ "ld1w { z21.s }, p5/Z, [x12]\n"
+ "ld1w { z20.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z21.s, z0.s[1]\n"
+ "fmla z12.s, z21.s, z1.s[1]\n"
+ "fmla z16.s, z21.s, z2.s[1]\n"
+ "fmla z9.s, z20.s, z0.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.s, z20.s, z1.s[1]\n"
+ "fmla z17.s, z20.s, z2.s[1]\n"
+ "ld1w { z20.s }, p5/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z10.s, z21.s, z0.s[1]\n"
+ "fmla z14.s, z21.s, z1.s[1]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.s, z21.s, z2.s[1]\n"
+ "fmla z11.s, z20.s, z0.s[1]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.s, z20.s, z1.s[1]\n"
+ "fmla z19.s, z20.s, z2.s[1]\n"
+ "ble 40f\n"
+ "ld1w { z21.s }, p5/Z, [x12]\n"
+ "ld1w { z20.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z21.s, z0.s[2]\n"
+ "fmla z12.s, z21.s, z1.s[2]\n"
+ "fmla z16.s, z21.s, z2.s[2]\n"
+ "fmla z9.s, z20.s, z0.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.s, z20.s, z1.s[2]\n"
+ "fmla z17.s, z20.s, z2.s[2]\n"
+ "ld1w { z20.s }, p5/Z, [x9]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z10.s, z21.s, z0.s[2]\n"
+ "fmla z14.s, z21.s, z1.s[2]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z18.s, z21.s, z2.s[2]\n"
+ "fmla z11.s, z20.s, z0.s[2]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z15.s, z20.s, z1.s[2]\n"
+ "fmla z19.s, z20.s, z2.s[2]\n"
+ "ble 40f\n"
+ "ld1w { z21.s }, p5/Z, [x12]\n"
+ "ld1w { z20.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z21.s, z0.s[3]\n"
+ "fmla z12.s, z21.s, z1.s[3]\n"
+ "fmla z16.s, z21.s, z2.s[3]\n"
+ "fmla z9.s, z20.s, z0.s[3]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z13.s, z20.s, z1.s[3]\n"
+ "fmla z17.s, z20.s, z2.s[3]\n"
+ "ld1w { z20.s }, p5/Z, [x9]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z10.s, z21.s, z0.s[3]\n"
+ "fmla z14.s, z21.s, z1.s[3]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.s, z21.s, z2.s[3]\n"
+ "fmla z11.s, z20.s, z0.s[3]\n"
+ "fmla z15.s, z20.s, z1.s[3]\n"
+ "fmla z19.s, z20.s, z2.s[3]\n"
+ "40:" // Height 3: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 35b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "tbz %x[flags], #1, 41f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z20.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z21.s\n"
+ "fmin z9.s, p5/M, z9.s, z21.s\n"
+ "fmin z10.s, p5/M, z10.s, z21.s\n"
+ "fmin z11.s, p5/M, z11.s, z21.s\n"
+ "fmin z12.s, p5/M, z12.s, z21.s\n"
+ "fmin z13.s, p5/M, z13.s, z21.s\n"
+ "fmin z14.s, p5/M, z14.s, z21.s\n"
+ "fmin z15.s, p5/M, z15.s, z21.s\n"
+ "fmin z16.s, p5/M, z16.s, z21.s\n"
+ "fmin z17.s, p5/M, z17.s, z21.s\n"
+ "fmin z18.s, p5/M, z18.s, z21.s\n"
+ "fmin z19.s, p5/M, z19.s, z21.s\n"
+ "fmax z8.s, p5/M, z8.s, z20.s\n"
+ "fmax z9.s, p5/M, z9.s, z20.s\n"
+ "fmax z10.s, p5/M, z10.s, z20.s\n"
+ "fmax z11.s, p5/M, z11.s, z20.s\n"
+ "fmax z12.s, p5/M, z12.s, z20.s\n"
+ "fmax z13.s, p5/M, z13.s, z20.s\n"
+ "fmax z14.s, p5/M, z14.s, z20.s\n"
+ "fmax z15.s, p5/M, z15.s, z20.s\n"
+ "fmax z16.s, p5/M, z16.s, z20.s\n"
+ "fmax z17.s, p5/M, z17.s, z20.s\n"
+ "fmax z18.s, p5/M, z18.s, z20.s\n"
+ "fmax z19.s, p5/M, z19.s, z20.s\n"
+ "41:" // Height 3: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "42:" // Height 3: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 30b\n"
+ "b 86f\n"
+ "43:" // Height 4
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "44:" // Height 4: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 45f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 45f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 45f\n"
+ "mov x11, x12\n"
+ "45:" // Height 4: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 46f\n"
+ "ld1w { z8.s }, p5/Z, [x15]\n"
+ "ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 48f\n"
+ "46:" // Height 4: no bias
+ "tbz %x[flags], #0, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x13, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21]\n"
+ "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 48f\n"
+ "47:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "48:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "49:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 51f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "b 51f\n"
+ "50:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "51:" // Height 4: input setup done
+ "cmp x27, #0x4\n"
+ "ble 53f\n"
+ "52:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z3.s }, p0/Z, [x26]\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x4\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "ld1w { z25.s }, p5/Z, [x12]\n"
+ "ld1w { z24.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z25.s, z3.s[0]\n"
+ "fmla z12.s, z25.s, z2.s[0]\n"
+ "fmla z16.s, z25.s, z1.s[0]\n"
+ "fmla z20.s, z25.s, z0.s[0]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z9.s, z24.s, z3.s[0]\n"
+ "fmla z13.s, z24.s, z2.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "fmla z17.s, z24.s, z1.s[0]\n"
+ "fmla z21.s, z24.s, z0.s[0]\n"
+ "ld1w { z24.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z25.s, z3.s[0]\n"
+ "fmla z14.s, z25.s, z2.s[0]\n"
+ "fmla z18.s, z25.s, z1.s[0]\n"
+ "fmla z22.s, z25.s, z0.s[0]\n"
+ "ld1w { z25.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z11.s, z24.s, z3.s[0]\n"
+ "fmla z15.s, z24.s, z2.s[0]\n"
+ "fmla z19.s, z24.s, z1.s[0]\n"
+ "fmla z23.s, z24.s, z0.s[0]\n"
+ "ld1w { z24.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.s, z25.s, z3.s[1]\n"
+ "fmla z12.s, z25.s, z2.s[1]\n"
+ "fmla z16.s, z25.s, z1.s[1]\n"
+ "fmla z20.s, z25.s, z0.s[1]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.s, z24.s, z3.s[1]\n"
+ "fmla z13.s, z24.s, z2.s[1]\n"
+ "fmla z17.s, z24.s, z1.s[1]\n"
+ "fmla z21.s, z24.s, z0.s[1]\n"
+ "ld1w { z24.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.s, z25.s, z3.s[1]\n"
+ "fmla z14.s, z25.s, z2.s[1]\n"
+ "fmla z18.s, z25.s, z1.s[1]\n"
+ "fmla z22.s, z25.s, z0.s[1]\n"
+ "ld1w { z25.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z11.s, z24.s, z3.s[1]\n"
+ "fmla z15.s, z24.s, z2.s[1]\n"
+ "fmla z19.s, z24.s, z1.s[1]\n"
+ "fmla z23.s, z24.s, z0.s[1]\n"
+ "ld1w { z24.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.s, z25.s, z3.s[2]\n"
+ "fmla z12.s, z25.s, z2.s[2]\n"
+ "fmla z16.s, z25.s, z1.s[2]\n"
+ "fmla z20.s, z25.s, z0.s[2]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z24.s, z3.s[2]\n"
+ "fmla z13.s, z24.s, z2.s[2]\n"
+ "fmla z17.s, z24.s, z1.s[2]\n"
+ "fmla z21.s, z24.s, z0.s[2]\n"
+ "ld1w { z24.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.s, z25.s, z3.s[2]\n"
+ "fmla z14.s, z25.s, z2.s[2]\n"
+ "fmla z18.s, z25.s, z1.s[2]\n"
+ "fmla z22.s, z25.s, z0.s[2]\n"
+ "ld1w { z25.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "fmla z11.s, z24.s, z3.s[2]\n"
+ "fmla z15.s, z24.s, z2.s[2]\n"
+ "fmla z19.s, z24.s, z1.s[2]\n"
+ "fmla z23.s, z24.s, z0.s[2]\n"
+ "ld1w { z24.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "fmla z8.s, z25.s, z3.s[3]\n"
+ "fmla z12.s, z25.s, z2.s[3]\n"
+ "fmla z16.s, z25.s, z1.s[3]\n"
+ "fmla z20.s, z25.s, z0.s[3]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z9.s, z24.s, z3.s[3]\n"
+ "fmla z13.s, z24.s, z2.s[3]\n"
+ "fmla z17.s, z24.s, z1.s[3]\n"
+ "fmla z21.s, z24.s, z0.s[3]\n"
+ "ld1w { z24.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.s, z25.s, z3.s[3]\n"
+ "fmla z14.s, z25.s, z2.s[3]\n"
+ "fmla z18.s, z25.s, z1.s[3]\n"
+ "fmla z22.s, z25.s, z0.s[3]\n"
+ "fmla z11.s, z24.s, z3.s[3]\n"
+ "fmla z15.s, z24.s, z2.s[3]\n"
+ "fmla z19.s, z24.s, z1.s[3]\n"
+ "fmla z23.s, z24.s, z0.s[3]\n"
+ "bgt 52b\n"
+ "53:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1w { z25.s }, p5/Z, [x12]\n"
+ "ld1w { z24.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z25.s, z0.s[0]\n"
+ "fmla z12.s, z25.s, z1.s[0]\n"
+ "fmla z16.s, z25.s, z2.s[0]\n"
+ "fmla z20.s, z25.s, z3.s[0]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z9.s, z24.s, z0.s[0]\n"
+ "fmla z13.s, z24.s, z1.s[0]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.s, z24.s, z2.s[0]\n"
+ "fmla z21.s, z24.s, z3.s[0]\n"
+ "ld1w { z24.s }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.s, z25.s, z0.s[0]\n"
+ "fmla z14.s, z25.s, z1.s[0]\n"
+ "fmla z18.s, z25.s, z2.s[0]\n"
+ "fmla z22.s, z25.s, z3.s[0]\n"
+ "fmla z11.s, z24.s, z0.s[0]\n"
+ "fmla z15.s, z24.s, z1.s[0]\n"
+ "fmla z19.s, z24.s, z2.s[0]\n"
+ "fmla z23.s, z24.s, z3.s[0]\n"
+ "ble 54f\n"
+ "ld1w { z25.s }, p5/Z, [x12]\n"
+ "ld1w { z24.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z25.s, z0.s[1]\n"
+ "fmla z12.s, z25.s, z1.s[1]\n"
+ "fmla z16.s, z25.s, z2.s[1]\n"
+ "fmla z20.s, z25.s, z3.s[1]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.s, z24.s, z0.s[1]\n"
+ "fmla z13.s, z24.s, z1.s[1]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z17.s, z24.s, z2.s[1]\n"
+ "fmla z21.s, z24.s, z3.s[1]\n"
+ "ld1w { z24.s }, p5/Z, [x9]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z10.s, z25.s, z0.s[1]\n"
+ "fmla z14.s, z25.s, z1.s[1]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.s, z25.s, z2.s[1]\n"
+ "fmla z22.s, z25.s, z3.s[1]\n"
+ "fmla z11.s, z24.s, z0.s[1]\n"
+ "fmla z15.s, z24.s, z1.s[1]\n"
+ "fmla z19.s, z24.s, z2.s[1]\n"
+ "fmla z23.s, z24.s, z3.s[1]\n"
+ "ble 54f\n"
+ "ld1w { z25.s }, p5/Z, [x12]\n"
+ "ld1w { z24.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z25.s, z0.s[2]\n"
+ "fmla z12.s, z25.s, z1.s[2]\n"
+ "fmla z16.s, z25.s, z2.s[2]\n"
+ "fmla z20.s, z25.s, z3.s[2]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.s, z24.s, z0.s[2]\n"
+ "fmla z13.s, z24.s, z1.s[2]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z17.s, z24.s, z2.s[2]\n"
+ "fmla z21.s, z24.s, z3.s[2]\n"
+ "ld1w { z24.s }, p5/Z, [x9]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z10.s, z25.s, z0.s[2]\n"
+ "fmla z14.s, z25.s, z1.s[2]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z18.s, z25.s, z2.s[2]\n"
+ "fmla z22.s, z25.s, z3.s[2]\n"
+ "fmla z11.s, z24.s, z0.s[2]\n"
+ "fmla z15.s, z24.s, z1.s[2]\n"
+ "fmla z19.s, z24.s, z2.s[2]\n"
+ "fmla z23.s, z24.s, z3.s[2]\n"
+ "ble 54f\n"
+ "ld1w { z25.s }, p5/Z, [x12]\n"
+ "ld1w { z24.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z25.s, z0.s[3]\n"
+ "fmla z12.s, z25.s, z1.s[3]\n"
+ "fmla z16.s, z25.s, z2.s[3]\n"
+ "fmla z20.s, z25.s, z3.s[3]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z9.s, z24.s, z0.s[3]\n"
+ "fmla z13.s, z24.s, z1.s[3]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.s, z24.s, z2.s[3]\n"
+ "fmla z21.s, z24.s, z3.s[3]\n"
+ "ld1w { z24.s }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.s, z25.s, z0.s[3]\n"
+ "fmla z14.s, z25.s, z1.s[3]\n"
+ "fmla z18.s, z25.s, z2.s[3]\n"
+ "fmla z22.s, z25.s, z3.s[3]\n"
+ "fmla z11.s, z24.s, z0.s[3]\n"
+ "fmla z15.s, z24.s, z1.s[3]\n"
+ "fmla z19.s, z24.s, z2.s[3]\n"
+ "fmla z23.s, z24.s, z3.s[3]\n"
+ "54:" // Height 4: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 49b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "tbz %x[flags], #1, 55f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z25.s\n"
+ "fmin z9.s, p5/M, z9.s, z25.s\n"
+ "fmin z10.s, p5/M, z10.s, z25.s\n"
+ "fmin z11.s, p5/M, z11.s, z25.s\n"
+ "fmin z12.s, p5/M, z12.s, z25.s\n"
+ "fmin z13.s, p5/M, z13.s, z25.s\n"
+ "fmin z14.s, p5/M, z14.s, z25.s\n"
+ "fmin z15.s, p5/M, z15.s, z25.s\n"
+ "fmin z16.s, p5/M, z16.s, z25.s\n"
+ "fmin z17.s, p5/M, z17.s, z25.s\n"
+ "fmin z18.s, p5/M, z18.s, z25.s\n"
+ "fmin z19.s, p5/M, z19.s, z25.s\n"
+ "fmin z20.s, p5/M, z20.s, z25.s\n"
+ "fmin z21.s, p5/M, z21.s, z25.s\n"
+ "fmin z22.s, p5/M, z22.s, z25.s\n"
+ "fmin z23.s, p5/M, z23.s, z25.s\n"
+ "fmax z8.s, p5/M, z8.s, z24.s\n"
+ "fmax z9.s, p5/M, z9.s, z24.s\n"
+ "fmax z10.s, p5/M, z10.s, z24.s\n"
+ "fmax z11.s, p5/M, z11.s, z24.s\n"
+ "fmax z12.s, p5/M, z12.s, z24.s\n"
+ "fmax z13.s, p5/M, z13.s, z24.s\n"
+ "fmax z14.s, p5/M, z14.s, z24.s\n"
+ "fmax z15.s, p5/M, z15.s, z24.s\n"
+ "fmax z16.s, p5/M, z16.s, z24.s\n"
+ "fmax z17.s, p5/M, z17.s, z24.s\n"
+ "fmax z18.s, p5/M, z18.s, z24.s\n"
+ "fmax z19.s, p5/M, z19.s, z24.s\n"
+ "fmax z20.s, p5/M, z20.s, z24.s\n"
+ "fmax z21.s, p5/M, z21.s, z24.s\n"
+ "fmax z22.s, p5/M, z22.s, z24.s\n"
+ "fmax z23.s, p5/M, z23.s, z24.s\n"
+ "55:" // Height 4: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
+ "56:" // Height 4: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 44b\n"
+ "b 86f\n"
+ "57:" // Height 5
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "58:" // Height 5: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 59f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 59f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 59f\n"
+ "mov x11, x12\n"
+ "59:" // Height 5: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 60f\n"
+ "ld1w { z8.s }, p5/Z, [x15]\n"
+ "ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 62f\n"
+ "60:" // Height 5: no bias
+ "tbz %x[flags], #0, 61f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x20]\n"
+ "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 62f\n"
+ "61:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "62:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "63:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 64f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 65f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "b 65f\n"
+ "64:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "65:" // Height 5: input setup done
+ "cmp x27, #0x4\n"
+ "ble 67f\n"
+ "66:" // Height 5: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "ld1rqw { z3.s }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x4\n"
+ "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1rqw { z1.s }, p0/Z, [x23]\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqw { z0.s }, p0/Z, [x22]\n"
+ "ld1w { z29.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z29.s, z4.s[0]\n"
+ "fmla z12.s, z29.s, z3.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x11]\n"
+ "fmla z16.s, z29.s, z2.s[0]\n"
+ "fmla z20.s, z29.s, z1.s[0]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z24.s, z29.s, z0.s[0]\n"
+ "fmla z9.s, z28.s, z4.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z28.s, z3.s[0]\n"
+ "fmla z17.s, z28.s, z2.s[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "fmla z21.s, z28.s, z1.s[0]\n"
+ "fmla z25.s, z28.s, z0.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z29.s, z4.s[0]\n"
+ "fmla z14.s, z29.s, z3.s[0]\n"
+ "fmla z18.s, z29.s, z2.s[0]\n"
+ "fmla z22.s, z29.s, z1.s[0]\n"
+ "fmla z26.s, z29.s, z0.s[0]\n"
+ "fmla z11.s, z28.s, z4.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z15.s, z28.s, z3.s[0]\n"
+ "fmla z19.s, z28.s, z2.s[0]\n"
+ "fmla z23.s, z28.s, z1.s[0]\n"
+ "fmla z27.s, z28.s, z0.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.s, z29.s, z4.s[1]\n"
+ "fmla z12.s, z29.s, z3.s[1]\n"
+ "fmla z16.s, z29.s, z2.s[1]\n"
+ "fmla z20.s, z29.s, z1.s[1]\n"
+ "fmla z24.s, z29.s, z0.s[1]\n"
+ "fmla z9.s, z28.s, z4.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z13.s, z28.s, z3.s[1]\n"
+ "fmla z17.s, z28.s, z2.s[1]\n"
+ "fmla z21.s, z28.s, z1.s[1]\n"
+ "fmla z25.s, z28.s, z0.s[1]\n"
+ "ld1w { z28.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.s, z29.s, z4.s[1]\n"
+ "fmla z14.s, z29.s, z3.s[1]\n"
+ "fmla z18.s, z29.s, z2.s[1]\n"
+ "fmla z22.s, z29.s, z1.s[1]\n"
+ "fmla z26.s, z29.s, z0.s[1]\n"
+ "fmla z11.s, z28.s, z4.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z15.s, z28.s, z3.s[1]\n"
+ "fmla z19.s, z28.s, z2.s[1]\n"
+ "fmla z23.s, z28.s, z1.s[1]\n"
+ "fmla z27.s, z28.s, z0.s[1]\n"
+ "ld1w { z28.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.s, z29.s, z4.s[2]\n"
+ "fmla z12.s, z29.s, z3.s[2]\n"
+ "fmla z16.s, z29.s, z2.s[2]\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z24.s, z29.s, z0.s[2]\n"
+ "fmla z9.s, z28.s, z4.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z28.s, z3.s[2]\n"
+ "fmla z17.s, z28.s, z2.s[2]\n"
+ "fmla z21.s, z28.s, z1.s[2]\n"
+ "fmla z25.s, z28.s, z0.s[2]\n"
+ "ld1w { z28.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.s, z29.s, z4.s[2]\n"
+ "fmla z14.s, z29.s, z3.s[2]\n"
+ "fmla z18.s, z29.s, z2.s[2]\n"
+ "fmla z22.s, z29.s, z1.s[2]\n"
+ "fmla z26.s, z29.s, z0.s[2]\n"
+ "fmla z11.s, z28.s, z4.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "fmla z15.s, z28.s, z3.s[2]\n"
+ "fmla z19.s, z28.s, z2.s[2]\n"
+ "fmla z23.s, z28.s, z1.s[2]\n"
+ "fmla z27.s, z28.s, z0.s[2]\n"
+ "ld1w { z28.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "fmla z8.s, z29.s, z4.s[3]\n"
+ "fmla z12.s, z29.s, z3.s[3]\n"
+ "fmla z16.s, z29.s, z2.s[3]\n"
+ "fmla z20.s, z29.s, z1.s[3]\n"
+ "fmla z24.s, z29.s, z0.s[3]\n"
+ "fmla z9.s, z28.s, z4.s[3]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z13.s, z28.s, z3.s[3]\n"
+ "fmla z17.s, z28.s, z2.s[3]\n"
+ "fmla z21.s, z28.s, z1.s[3]\n"
+ "fmla z25.s, z28.s, z0.s[3]\n"
+ "ld1w { z28.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.s, z29.s, z4.s[3]\n"
+ "fmla z14.s, z29.s, z3.s[3]\n"
+ "fmla z18.s, z29.s, z2.s[3]\n"
+ "fmla z22.s, z29.s, z1.s[3]\n"
+ "fmla z26.s, z29.s, z0.s[3]\n"
+ "fmla z11.s, z28.s, z4.s[3]\n"
+ "fmla z15.s, z28.s, z3.s[3]\n"
+ "fmla z19.s, z28.s, z2.s[3]\n"
+ "fmla z23.s, z28.s, z1.s[3]\n"
+ "fmla z27.s, z28.s, z0.s[3]\n"
+ "bgt 66b\n"
+ "67:" // Height 5: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "ld1w { z29.s }, p5/Z, [x12]\n"
+ "fmla z8.s, z29.s, z0.s[0]\n"
+ "fmla z12.s, z29.s, z1.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x11]\n"
+ "fmla z16.s, z29.s, z2.s[0]\n"
+ "fmla z20.s, z29.s, z3.s[0]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.s, z29.s, z4.s[0]\n"
+ "fmla z9.s, z28.s, z0.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.s, z28.s, z1.s[0]\n"
+ "fmla z17.s, z28.s, z2.s[0]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.s, z28.s, z3.s[0]\n"
+ "fmla z25.s, z28.s, z4.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.s, z29.s, z0.s[0]\n"
+ "fmla z14.s, z29.s, z1.s[0]\n"
+ "fmla z18.s, z29.s, z2.s[0]\n"
+ "fmla z22.s, z29.s, z3.s[0]\n"
+ "fmla z26.s, z29.s, z4.s[0]\n"
+ "fmla z11.s, z28.s, z0.s[0]\n"
+ "fmla z15.s, z28.s, z1.s[0]\n"
+ "fmla z19.s, z28.s, z2.s[0]\n"
+ "fmla z23.s, z28.s, z3.s[0]\n"
+ "fmla z27.s, z28.s, z4.s[0]\n"
+ "ble 68f\n"
+ "ld1w { z29.s }, p5/Z, [x12]\n"
+ "ld1w { z28.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z29.s, z0.s[1]\n"
+ "fmla z12.s, z29.s, z1.s[1]\n"
+ "fmla z16.s, z29.s, z2.s[1]\n"
+ "fmla z20.s, z29.s, z3.s[1]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.s, z29.s, z4.s[1]\n"
+ "fmla z9.s, z28.s, z0.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.s, z28.s, z1.s[1]\n"
+ "fmla z17.s, z28.s, z2.s[1]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.s, z28.s, z3.s[1]\n"
+ "fmla z25.s, z28.s, z4.s[1]\n"
+ "ld1w { z28.s }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.s, z29.s, z0.s[1]\n"
+ "fmla z14.s, z29.s, z1.s[1]\n"
+ "fmla z18.s, z29.s, z2.s[1]\n"
+ "fmla z22.s, z29.s, z3.s[1]\n"
+ "fmla z26.s, z29.s, z4.s[1]\n"
+ "fmla z11.s, z28.s, z0.s[1]\n"
+ "fmla z15.s, z28.s, z1.s[1]\n"
+ "fmla z19.s, z28.s, z2.s[1]\n"
+ "fmla z23.s, z28.s, z3.s[1]\n"
+ "fmla z27.s, z28.s, z4.s[1]\n"
+ "ble 68f\n"
+ "ld1w { z29.s }, p5/Z, [x12]\n"
+ "ld1w { z28.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z29.s, z0.s[2]\n"
+ "fmla z12.s, z29.s, z1.s[2]\n"
+ "fmla z16.s, z29.s, z2.s[2]\n"
+ "fmla z20.s, z29.s, z3.s[2]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.s, z29.s, z4.s[2]\n"
+ "fmla z9.s, z28.s, z0.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z13.s, z28.s, z1.s[2]\n"
+ "fmla z17.s, z28.s, z2.s[2]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.s, z28.s, z3.s[2]\n"
+ "fmla z25.s, z28.s, z4.s[2]\n"
+ "ld1w { z28.s }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.s, z29.s, z0.s[2]\n"
+ "fmla z14.s, z29.s, z1.s[2]\n"
+ "fmla z18.s, z29.s, z2.s[2]\n"
+ "fmla z22.s, z29.s, z3.s[2]\n"
+ "fmla z26.s, z29.s, z4.s[2]\n"
+ "fmla z11.s, z28.s, z0.s[2]\n"
+ "fmla z15.s, z28.s, z1.s[2]\n"
+ "fmla z19.s, z28.s, z2.s[2]\n"
+ "fmla z23.s, z28.s, z3.s[2]\n"
+ "fmla z27.s, z28.s, z4.s[2]\n"
+ "ble 68f\n"
+ "ld1w { z29.s }, p5/Z, [x12]\n"
+ "ld1w { z28.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z29.s, z0.s[3]\n"
+ "fmla z12.s, z29.s, z1.s[3]\n"
+ "fmla z16.s, z29.s, z2.s[3]\n"
+ "fmla z20.s, z29.s, z3.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z24.s, z29.s, z4.s[3]\n"
+ "fmla z9.s, z28.s, z0.s[3]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z13.s, z28.s, z1.s[3]\n"
+ "fmla z17.s, z28.s, z2.s[3]\n"
+ "fmla z21.s, z28.s, z3.s[3]\n"
+ "fmla z25.s, z28.s, z4.s[3]\n"
+ "ld1w { z28.s }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.s, z29.s, z0.s[3]\n"
+ "fmla z14.s, z29.s, z1.s[3]\n"
+ "fmla z18.s, z29.s, z2.s[3]\n"
+ "fmla z22.s, z29.s, z3.s[3]\n"
+ "fmla z26.s, z29.s, z4.s[3]\n"
+ "fmla z11.s, z28.s, z0.s[3]\n"
+ "fmla z15.s, z28.s, z1.s[3]\n"
+ "fmla z19.s, z28.s, z2.s[3]\n"
+ "fmla z23.s, z28.s, z3.s[3]\n"
+ "fmla z27.s, z28.s, z4.s[3]\n"
+ "68:" // Height 5: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 63b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "tbz %x[flags], #1, 69f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z29.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z28.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z29.s\n"
+ "fmin z9.s, p5/M, z9.s, z29.s\n"
+ "fmin z10.s, p5/M, z10.s, z29.s\n"
+ "fmin z11.s, p5/M, z11.s, z29.s\n"
+ "fmin z12.s, p5/M, z12.s, z29.s\n"
+ "fmin z13.s, p5/M, z13.s, z29.s\n"
+ "fmin z14.s, p5/M, z14.s, z29.s\n"
+ "fmin z15.s, p5/M, z15.s, z29.s\n"
+ "fmin z16.s, p5/M, z16.s, z29.s\n"
+ "fmin z17.s, p5/M, z17.s, z29.s\n"
+ "fmin z18.s, p5/M, z18.s, z29.s\n"
+ "fmin z19.s, p5/M, z19.s, z29.s\n"
+ "fmin z20.s, p5/M, z20.s, z29.s\n"
+ "fmin z21.s, p5/M, z21.s, z29.s\n"
+ "fmin z22.s, p5/M, z22.s, z29.s\n"
+ "fmin z23.s, p5/M, z23.s, z29.s\n"
+ "fmin z24.s, p5/M, z24.s, z29.s\n"
+ "fmin z25.s, p5/M, z25.s, z29.s\n"
+ "fmin z26.s, p5/M, z26.s, z29.s\n"
+ "fmin z27.s, p5/M, z27.s, z29.s\n"
+ "fmax z8.s, p5/M, z8.s, z28.s\n"
+ "fmax z9.s, p5/M, z9.s, z28.s\n"
+ "fmax z10.s, p5/M, z10.s, z28.s\n"
+ "fmax z11.s, p5/M, z11.s, z28.s\n"
+ "fmax z12.s, p5/M, z12.s, z28.s\n"
+ "fmax z13.s, p5/M, z13.s, z28.s\n"
+ "fmax z14.s, p5/M, z14.s, z28.s\n"
+ "fmax z15.s, p5/M, z15.s, z28.s\n"
+ "fmax z16.s, p5/M, z16.s, z28.s\n"
+ "fmax z17.s, p5/M, z17.s, z28.s\n"
+ "fmax z18.s, p5/M, z18.s, z28.s\n"
+ "fmax z19.s, p5/M, z19.s, z28.s\n"
+ "fmax z20.s, p5/M, z20.s, z28.s\n"
+ "fmax z21.s, p5/M, z21.s, z28.s\n"
+ "fmax z22.s, p5/M, z22.s, z28.s\n"
+ "fmax z23.s, p5/M, z23.s, z28.s\n"
+ "fmax z24.s, p5/M, z24.s, z28.s\n"
+ "fmax z25.s, p5/M, z25.s, z28.s\n"
+ "fmax z26.s, p5/M, z26.s, z28.s\n"
+ "fmax z27.s, p5/M, z27.s, z28.s\n"
+ "69:" // Height 5: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
+ "70:" // Height 5: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 58b\n"
+ "b 86f\n"
+ "71:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x18\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
+ "72:" // Height 6: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "cntw x21, ALL, MUL #3\n"
+ "add x10, x11, x20, LSL #2\n"
+ "add x9, x10, x20, LSL #2\n"
+ "add x20, x9, x20, LSL #2\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 73f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 73f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 73f\n"
+ "mov x11, x12\n"
+ "73:" // Height 6: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 74f\n"
+ "ld1w { z8.s }, p5/Z, [x15]\n"
+ "ld1w { z9.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x15, x15, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 76f\n"
+ "74:" // Height 6: no bias
+ "tbz %x[flags], #0, 75f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x13, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x21]\n"
+ "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 76f\n"
+ "75:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "76:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "77:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 78f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 79f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
+ "b 79f\n"
+ "78:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
+ "79:" // Height 6: input setup done
+ "cmp x27, #0x4\n"
+ "ble 81f\n"
+ "80:" // Height 6: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z7.s }, p0/Z, [x26]\n"
+ "ld1rqw { z6.s }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x4\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "ld1w { z1.s }, p5/Z, [x12]\n"
+ "ld1w { z0.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z1.s, z7.s[0]\n"
+ "fmla z12.s, z1.s, z6.s[0]\n"
+ "fmla z16.s, z1.s, z5.s[0]\n"
+ "fmla z20.s, z1.s, z4.s[0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "fmla z24.s, z1.s, z3.s[0]\n"
+ "fmla z28.s, z1.s, z2.s[0]\n"
+ "ld1w { z1.s }, p5/Z, [x10]\n"
+ "add x21, x21, #0x10\n"
+ "fmla z9.s, z0.s, z7.s[0]\n"
+ "fmla z13.s, z0.s, z6.s[0]\n"
+ "fmla z17.s, z0.s, z5.s[0]\n"
+ "fmla z21.s, z0.s, z4.s[0]\n"
+ "fmla z25.s, z0.s, z3.s[0]\n"
+ "fmla z29.s, z0.s, z2.s[0]\n"
+ "ld1w { z0.s }, p5/Z, [x9]\n"
+ "fmla z10.s, z1.s, z7.s[0]\n"
+ "fmla z14.s, z1.s, z6.s[0]\n"
+ "fmla z18.s, z1.s, z5.s[0]\n"
+ "fmla z22.s, z1.s, z4.s[0]\n"
+ "fmla z26.s, z1.s, z3.s[0]\n"
+ "fmla z30.s, z1.s, z2.s[0]\n"
+ "ld1w { z1.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "fmla z11.s, z0.s, z7.s[0]\n"
+ "fmla z15.s, z0.s, z6.s[0]\n"
+ "fmla z19.s, z0.s, z5.s[0]\n"
+ "fmla z23.s, z0.s, z4.s[0]\n"
+ "fmla z27.s, z0.s, z3.s[0]\n"
+ "fmla z31.s, z0.s, z2.s[0]\n"
+ "ld1w { z0.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "fmla z8.s, z1.s, z7.s[1]\n"
+ "fmla z12.s, z1.s, z6.s[1]\n"
+ "fmla z16.s, z1.s, z5.s[1]\n"
+ "fmla z20.s, z1.s, z4.s[1]\n"
+ "fmla z24.s, z1.s, z3.s[1]\n"
+ "fmla z28.s, z1.s, z2.s[1]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[1]\n"
+ "fmla z13.s, z0.s, z6.s[1]\n"
+ "fmla z17.s, z0.s, z5.s[1]\n"
+ "fmla z21.s, z0.s, z4.s[1]\n"
+ "fmla z25.s, z0.s, z3.s[1]\n"
+ "fmla z29.s, z0.s, z2.s[1]\n"
+ "ld1w { z0.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "fmla z10.s, z1.s, z7.s[1]\n"
+ "fmla z14.s, z1.s, z6.s[1]\n"
+ "fmla z18.s, z1.s, z5.s[1]\n"
+ "fmla z22.s, z1.s, z4.s[1]\n"
+ "fmla z26.s, z1.s, z3.s[1]\n"
+ "fmla z30.s, z1.s, z2.s[1]\n"
+ "ld1w { z1.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "fmla z11.s, z0.s, z7.s[1]\n"
+ "fmla z15.s, z0.s, z6.s[1]\n"
+ "fmla z19.s, z0.s, z5.s[1]\n"
+ "fmla z23.s, z0.s, z4.s[1]\n"
+ "fmla z27.s, z0.s, z3.s[1]\n"
+ "fmla z31.s, z0.s, z2.s[1]\n"
+ "ld1w { z0.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "fmla z8.s, z1.s, z7.s[2]\n"
+ "fmla z12.s, z1.s, z6.s[2]\n"
+ "fmla z16.s, z1.s, z5.s[2]\n"
+ "fmla z20.s, z1.s, z4.s[2]\n"
+ "fmla z24.s, z1.s, z3.s[2]\n"
+ "fmla z28.s, z1.s, z2.s[2]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[2]\n"
+ "fmla z13.s, z0.s, z6.s[2]\n"
+ "fmla z17.s, z0.s, z5.s[2]\n"
+ "fmla z21.s, z0.s, z4.s[2]\n"
+ "fmla z25.s, z0.s, z3.s[2]\n"
+ "fmla z29.s, z0.s, z2.s[2]\n"
+ "ld1w { z0.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "fmla z10.s, z1.s, z7.s[2]\n"
+ "fmla z14.s, z1.s, z6.s[2]\n"
+ "fmla z18.s, z1.s, z5.s[2]\n"
+ "fmla z22.s, z1.s, z4.s[2]\n"
+ "fmla z26.s, z1.s, z3.s[2]\n"
+ "fmla z30.s, z1.s, z2.s[2]\n"
+ "ld1w { z1.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "fmla z11.s, z0.s, z7.s[2]\n"
+ "fmla z15.s, z0.s, z6.s[2]\n"
+ "fmla z19.s, z0.s, z5.s[2]\n"
+ "fmla z23.s, z0.s, z4.s[2]\n"
+ "fmla z27.s, z0.s, z3.s[2]\n"
+ "fmla z31.s, z0.s, z2.s[2]\n"
+ "ld1w { z0.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "fmla z8.s, z1.s, z7.s[3]\n"
+ "fmla z12.s, z1.s, z6.s[3]\n"
+ "fmla z16.s, z1.s, z5.s[3]\n"
+ "fmla z20.s, z1.s, z4.s[3]\n"
+ "fmla z24.s, z1.s, z3.s[3]\n"
+ "fmla z28.s, z1.s, z2.s[3]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z9.s, z0.s, z7.s[3]\n"
+ "fmla z13.s, z0.s, z6.s[3]\n"
+ "fmla z17.s, z0.s, z5.s[3]\n"
+ "fmla z21.s, z0.s, z4.s[3]\n"
+ "fmla z25.s, z0.s, z3.s[3]\n"
+ "fmla z29.s, z0.s, z2.s[3]\n"
+ "ld1w { z0.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.s, z1.s, z7.s[3]\n"
+ "fmla z14.s, z1.s, z6.s[3]\n"
+ "fmla z18.s, z1.s, z5.s[3]\n"
+ "fmla z22.s, z1.s, z4.s[3]\n"
+ "fmla z26.s, z1.s, z3.s[3]\n"
+ "fmla z30.s, z1.s, z2.s[3]\n"
+ "fmla z11.s, z0.s, z7.s[3]\n"
+ "fmla z15.s, z0.s, z6.s[3]\n"
+ "fmla z19.s, z0.s, z5.s[3]\n"
+ "fmla z23.s, z0.s, z4.s[3]\n"
+ "fmla z27.s, z0.s, z3.s[3]\n"
+ "fmla z31.s, z0.s, z2.s[3]\n"
+ "bgt 80b\n"
+ "81:" // Height 6: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "ld1rqw { z5.s }, p0/Z, [x21]\n"
+ "ld1w { z7.s }, p5/Z, [x12]\n"
+ "ld1w { z6.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z7.s, z0.s[0]\n"
+ "fmla z12.s, z7.s, z1.s[0]\n"
+ "fmla z16.s, z7.s, z2.s[0]\n"
+ "fmla z20.s, z7.s, z3.s[0]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z24.s, z7.s, z4.s[0]\n"
+ "fmla z28.s, z7.s, z5.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z9.s, z6.s, z0.s[0]\n"
+ "fmla z13.s, z6.s, z1.s[0]\n"
+ "fmla z17.s, z6.s, z2.s[0]\n"
+ "fmla z21.s, z6.s, z3.s[0]\n"
+ "fmla z25.s, z6.s, z4.s[0]\n"
+ "fmla z29.s, z6.s, z5.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.s, z7.s, z0.s[0]\n"
+ "fmla z14.s, z7.s, z1.s[0]\n"
+ "fmla z18.s, z7.s, z2.s[0]\n"
+ "fmla z22.s, z7.s, z3.s[0]\n"
+ "fmla z26.s, z7.s, z4.s[0]\n"
+ "fmla z30.s, z7.s, z5.s[0]\n"
+ "fmla z11.s, z6.s, z0.s[0]\n"
+ "fmla z15.s, z6.s, z1.s[0]\n"
+ "fmla z19.s, z6.s, z2.s[0]\n"
+ "fmla z23.s, z6.s, z3.s[0]\n"
+ "fmla z27.s, z6.s, z4.s[0]\n"
+ "fmla z31.s, z6.s, z5.s[0]\n"
+ "ble 82f\n"
+ "ld1w { z7.s }, p5/Z, [x12]\n"
+ "ld1w { z6.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z7.s, z0.s[1]\n"
+ "fmla z12.s, z7.s, z1.s[1]\n"
+ "fmla z16.s, z7.s, z2.s[1]\n"
+ "fmla z20.s, z7.s, z3.s[1]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.s, z7.s, z4.s[1]\n"
+ "fmla z28.s, z7.s, z5.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z9.s, z6.s, z0.s[1]\n"
+ "fmla z13.s, z6.s, z1.s[1]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.s, z6.s, z2.s[1]\n"
+ "fmla z21.s, z6.s, z3.s[1]\n"
+ "fmla z25.s, z6.s, z4.s[1]\n"
+ "fmla z29.s, z6.s, z5.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.s, z7.s, z0.s[1]\n"
+ "fmla z14.s, z7.s, z1.s[1]\n"
+ "fmla z18.s, z7.s, z2.s[1]\n"
+ "fmla z22.s, z7.s, z3.s[1]\n"
+ "fmla z26.s, z7.s, z4.s[1]\n"
+ "fmla z30.s, z7.s, z5.s[1]\n"
+ "fmla z11.s, z6.s, z0.s[1]\n"
+ "fmla z15.s, z6.s, z1.s[1]\n"
+ "fmla z19.s, z6.s, z2.s[1]\n"
+ "fmla z23.s, z6.s, z3.s[1]\n"
+ "fmla z27.s, z6.s, z4.s[1]\n"
+ "fmla z31.s, z6.s, z5.s[1]\n"
+ "ble 82f\n"
+ "ld1w { z7.s }, p5/Z, [x12]\n"
+ "ld1w { z6.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z7.s, z0.s[2]\n"
+ "fmla z12.s, z7.s, z1.s[2]\n"
+ "fmla z16.s, z7.s, z2.s[2]\n"
+ "fmla z20.s, z7.s, z3.s[2]\n"
+ "subs x27, x27, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "fmla z24.s, z7.s, z4.s[2]\n"
+ "fmla z28.s, z7.s, z5.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z9.s, z6.s, z0.s[2]\n"
+ "fmla z13.s, z6.s, z1.s[2]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z17.s, z6.s, z2.s[2]\n"
+ "fmla z21.s, z6.s, z3.s[2]\n"
+ "fmla z25.s, z6.s, z4.s[2]\n"
+ "fmla z29.s, z6.s, z5.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.s, z7.s, z0.s[2]\n"
+ "fmla z14.s, z7.s, z1.s[2]\n"
+ "fmla z18.s, z7.s, z2.s[2]\n"
+ "fmla z22.s, z7.s, z3.s[2]\n"
+ "fmla z26.s, z7.s, z4.s[2]\n"
+ "fmla z30.s, z7.s, z5.s[2]\n"
+ "fmla z11.s, z6.s, z0.s[2]\n"
+ "fmla z15.s, z6.s, z1.s[2]\n"
+ "fmla z19.s, z6.s, z2.s[2]\n"
+ "fmla z23.s, z6.s, z3.s[2]\n"
+ "fmla z27.s, z6.s, z4.s[2]\n"
+ "fmla z31.s, z6.s, z5.s[2]\n"
+ "ble 82f\n"
+ "ld1w { z7.s }, p5/Z, [x12]\n"
+ "ld1w { z6.s }, p5/Z, [x11]\n"
+ "fmla z8.s, z7.s, z0.s[3]\n"
+ "fmla z12.s, z7.s, z1.s[3]\n"
+ "fmla z16.s, z7.s, z2.s[3]\n"
+ "fmla z20.s, z7.s, z3.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "fmla z24.s, z7.s, z4.s[3]\n"
+ "fmla z28.s, z7.s, z5.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z9.s, z6.s, z0.s[3]\n"
+ "fmla z13.s, z6.s, z1.s[3]\n"
+ "fmla z17.s, z6.s, z2.s[3]\n"
+ "fmla z21.s, z6.s, z3.s[3]\n"
+ "fmla z25.s, z6.s, z4.s[3]\n"
+ "fmla z29.s, z6.s, z5.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z10.s, z7.s, z0.s[3]\n"
+ "fmla z14.s, z7.s, z1.s[3]\n"
+ "fmla z18.s, z7.s, z2.s[3]\n"
+ "fmla z22.s, z7.s, z3.s[3]\n"
+ "fmla z26.s, z7.s, z4.s[3]\n"
+ "fmla z30.s, z7.s, z5.s[3]\n"
+ "fmla z11.s, z6.s, z0.s[3]\n"
+ "fmla z15.s, z6.s, z1.s[3]\n"
+ "fmla z19.s, z6.s, z2.s[3]\n"
+ "fmla z23.s, z6.s, z3.s[3]\n"
+ "fmla z27.s, z6.s, z4.s[3]\n"
+ "fmla z31.s, z6.s, z5.s[3]\n"
+ "82:" // Height 6: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 77b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "tbz %x[flags], #1, 83f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z1.s\n"
+ "fmin z10.s, p5/M, z10.s, z1.s\n"
+ "fmin z11.s, p5/M, z11.s, z1.s\n"
+ "fmin z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z1.s\n"
+ "fmin z14.s, p5/M, z14.s, z1.s\n"
+ "fmin z15.s, p5/M, z15.s, z1.s\n"
+ "fmin z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z1.s\n"
+ "fmin z18.s, p5/M, z18.s, z1.s\n"
+ "fmin z19.s, p5/M, z19.s, z1.s\n"
+ "fmin z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z1.s\n"
+ "fmin z22.s, p5/M, z22.s, z1.s\n"
+ "fmin z23.s, p5/M, z23.s, z1.s\n"
+ "fmin z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z1.s\n"
+ "fmin z26.s, p5/M, z26.s, z1.s\n"
+ "fmin z27.s, p5/M, z27.s, z1.s\n"
+ "fmin z28.s, p5/M, z28.s, z1.s\n"
+ "fmin z29.s, p5/M, z29.s, z1.s\n"
+ "fmin z30.s, p5/M, z30.s, z1.s\n"
+ "fmin z31.s, p5/M, z31.s, z1.s\n"
+ "fmax z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z0.s\n"
+ "fmax z10.s, p5/M, z10.s, z0.s\n"
+ "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z0.s\n"
+ "fmax z14.s, p5/M, z14.s, z0.s\n"
+ "fmax z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z0.s\n"
+ "fmax z18.s, p5/M, z18.s, z0.s\n"
+ "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z0.s\n"
+ "fmax z22.s, p5/M, z22.s, z0.s\n"
+ "fmax z23.s, p5/M, z23.s, z0.s\n"
+ "fmax z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z0.s\n"
+ "fmax z26.s, p5/M, z26.s, z0.s\n"
+ "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z28.s, p5/M, z28.s, z0.s\n"
+ "fmax z29.s, p5/M, z29.s, z0.s\n"
+ "fmax z30.s, p5/M, z30.s, z0.s\n"
+ "fmax z31.s, p5/M, z31.s, z0.s\n"
+ "83:" // Height 6: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
+ "84:" // Height 6: Writeback done
+ "decw x14, ALL, MUL #4\n"
+ "cmp x14, XZR\n"
+ "bgt 72b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 86f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "85:" // Update direct input
+ "mov x20, #0x18\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "86:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp
new file mode 100644
index 0000000000..23f686a902
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ size_t, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
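+// ARGLIST mirrors the kernel entry point defined in generic.cpp:
+// (num_strings, string_lengths, A_arg, M, N, B_ptr, B_stride,
+// output_arg, bias, act, accumulate).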
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL( ARGLIST );
+
+class cls_sve_ffhybrid_fp32bf16fp32_mmla_4x6VL
+{
+public:
+ typedef float lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+ static unsigned int stripe_width()
+ {
+ return get_vector_length<float>() * 1;
+ }
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL2VL_BL64_BF16;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 6;
+ }
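+ // The "4x6VL" suffix in the kernel name reflects this blocking:
+ // 4 output rows per block and 6 vector lengths of output columns.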
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 12, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::V1:
+ return { 28.74 };
+ default:
+ return { 15.27 };
+ }
+ }
+
+ return { 1.0 };
+ }
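+ // (A single per-CPU-model throughput estimate; presumably consumed by
+ // the heuristics that choose between candidate kernels, with
+ // CPUModel::V1 rated well above the generic default.)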
+
+ // Default to the generic kernel
+ kern_type kernel=sve_ffhybrid_fp32bf16fp32_mmla_4x6VL;
+ cls_sve_ffhybrid_fp32bf16fp32_mmla_4x6VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
new file mode 100644
index 0000000000..57f42cce77
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
@@ -0,0 +1,1464 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, size_t B_stride, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ const bfloat16 *cur_B_ptr = {};
+ size_t B_stride = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
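+ // The assembly reads these fields through %[args_ptr] plus the
+ // offsetof() constants bound in its operand list, so only those
+ // offsets matter, not any external layout contract.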
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ ka.B_stride = B_stride;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
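+ // Flag bits consumed by the assembly, as set above: bit 0 (0x1)
+ // accumulate into the existing output, bit 1 (0x2) apply the min/max
+ // clamp, bit 2 (0x4) indirect output, bit 3 (0x8) indirect input.
+ // BoundedReLU deliberately falls through to ReLU so that both set
+ // minval and the clamp flag.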
+ __asm__ __volatile__(
+ "ptrue p7.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #5\n"
+ "add x28, x9, x20, LSL #1\n"
+ "add x27, x28, x20, LSL #1\n"
+ "add x20, x27, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 3f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x27, x12\n"
+ "bgt 3f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x28, x12\n"
+ "bgt 3f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 3f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 3f\n"
+ "mov x11, x12\n"
+ "3:" // Height 1: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p6.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p5.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 4f\n"
+ "ld1w { z8.s }, p7/Z, [x15]\n"
+ "ld1w { z9.s }, p7/Z, [x15, #1, MUL VL]\n"
+ "zip2 z14.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x15, #3, MUL VL]\n"
+ "zip2 z15.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x15, #5, MUL VL]\n"
+ "zip2 z16.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z17.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "addvl x15, x15, #6\n"
+ "zip2 z18.d, z12.d, z12.d\n"
+ "zip1 z12.d, z12.d, z12.d\n"
+ "zip2 z19.d, z13.d, z13.d\n"
+ "zip1 z13.d, z13.d, z13.d\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1w { z21.s }, p6/Z, [x13]\n"
+ "ld1w { z20.s }, p5/Z, [x13, #1, MUL VL]\n"
+ "zip1 z8.d, z21.d, z14.d\n"
+ "zip2 z14.d, z21.d, z14.d\n"
+ "ld1w { z23.s }, p4/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "zip1 z9.d, z20.d, z15.d\n"
+ "zip2 z15.d, z20.d, z15.d\n"
+ "ld1w { z21.s }, p2/Z, [x13, #4, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
+ "zip1 z10.d, z23.d, z16.d\n"
+ "zip2 z16.d, z23.d, z16.d\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "zip1 z12.d, z21.d, z18.d\n"
+ "zip2 z18.d, z21.d, z18.d\n"
+ "zip1 z13.d, z20.d, z19.d\n"
+ "zip2 z19.d, z20.d, z19.d\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x26, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "cbnz x26, 9f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "cmp x25, #0x4\n"
+ "ble 11f\n"
+ "10:" // Height 1: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z24.s }, p0/Z, [x24]\n"
+ ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
+ "uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z21.h }, p7/Z, [x12]\n"
+ "ld1h { z20.h }, p7/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6475e708 // bfmmla z8.s, z24.h, z21.h\n"
+ ".inst 0x6474e70e // bfmmla z14.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x11]\n"
+ "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x10]\n"
+ "ld1h { z20.h }, p7/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6475e70a // bfmmla z10.s, z24.h, z21.h\n"
+ ".inst 0x6474e710 // bfmmla z16.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x9]\n"
+ "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
+ "ld1h { z21.h }, p7/Z, [x27]\n"
+ "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
+ ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
+ ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
+ ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
+ "add x24, x24, #0x10\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "addvl x28, x28, #2\n"
+ "addvl x27, x27, #2\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z22.s }, p0/Z, [x24]\n"
+ ".inst 0x658abed6 // bfcvt z22.h, p7/M, z22.s\n"
+ "uzp1 z22.h, z22.h, z22.h\n"
+ "ld1h { z21.h }, p7/Z, [x12]\n"
+ "ld1h { z20.h }, p7/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x6475e6c8 // bfmmla z8.s, z22.h, z21.h\n"
+ ".inst 0x6474e6ce // bfmmla z14.s, z22.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x11]\n"
+ "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6475e6c9 // bfmmla z9.s, z22.h, z21.h\n"
+ ".inst 0x6474e6cf // bfmmla z15.s, z22.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x10]\n"
+ "ld1h { z20.h }, p7/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6475e6ca // bfmmla z10.s, z22.h, z21.h\n"
+ ".inst 0x6474e6d0 // bfmmla z16.s, z22.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x9]\n"
+ "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6475e6cb // bfmmla z11.s, z22.h, z21.h\n"
+ ".inst 0x6474e6d1 // bfmmla z17.s, z22.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6475e6cc // bfmmla z12.s, z22.h, z21.h\n"
+ ".inst 0x6474e6d2 // bfmmla z18.s, z22.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x27]\n"
+ "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6475e6cd // bfmmla z13.s, z22.h, z21.h\n"
+ ".inst 0x6474e6d3 // bfmmla z19.s, z22.h, z20.h\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "addvl x28, x28, #2\n"
+ "addvl x27, x27, #2\n"
+ "12:" // Height 1: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 7b\n"
+ "uzp1 z8.d, z8.d, z14.d\n"
+ "uzp1 z9.d, z9.d, z15.d\n"
+ "uzp1 z10.d, z10.d, z16.d\n"
+ "uzp1 z11.d, z11.d, z17.d\n"
+ "uzp1 z12.d, z12.d, z18.d\n"
+ "uzp1 z13.d, z13.d, z19.d\n"
+ "tbz %x[flags], #1, 13f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p7/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z20.s }, p7/Z, [x20]\n"
+ "fmin z8.s, p7/M, z8.s, z21.s\n"
+ "fmin z9.s, p7/M, z9.s, z21.s\n"
+ "fmin z10.s, p7/M, z10.s, z21.s\n"
+ "fmin z11.s, p7/M, z11.s, z21.s\n"
+ "fmin z12.s, p7/M, z12.s, z21.s\n"
+ "fmin z13.s, p7/M, z13.s, z21.s\n"
+ "fmax z8.s, p7/M, z8.s, z20.s\n"
+ "fmax z9.s, p7/M, z9.s, z20.s\n"
+ "fmax z10.s, p7/M, z10.s, z20.s\n"
+ "fmax z11.s, p7/M, z11.s, z20.s\n"
+ "fmax z12.s, p7/M, z12.s, z20.s\n"
+ "fmax z13.s, p7/M, z13.s, z20.s\n"
+ "13:" // Height 1: No activation
+ "st1w { z8.s }, p6, [x13]\n"
+ "st1w { z9.s }, p5, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x13, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x13, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x13, #5, MUL VL]\n"
+ "addvl x13, x13, #6\n"
+ "14:" // Height 1: Writeback done
+ "decw x14, ALL, MUL #6\n"
+ "cmp x14, XZR\n"
+ "bgt 2b\n"
+ "b 58f\n"
+ "15:" // Height 2
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "16:" // Height 2: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #5\n"
+ "add x28, x9, x20, LSL #1\n"
+ "add x27, x28, x20, LSL #1\n"
+ "add x20, x27, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 17f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x27, x12\n"
+ "bgt 17f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x28, x12\n"
+ "bgt 17f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 17f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 17f\n"
+ "mov x11, x12\n"
+ "17:" // Height 2: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p6.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p5.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 18f\n"
+ "ld1w { z8.s }, p7/Z, [x15]\n"
+ "ld1w { z9.s }, p7/Z, [x15, #1, MUL VL]\n"
+ "zip2 z14.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x15, #3, MUL VL]\n"
+ "zip2 z15.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x15, #5, MUL VL]\n"
+ "zip2 z16.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z17.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "addvl x15, x15, #6\n"
+ "zip2 z18.d, z12.d, z12.d\n"
+ "zip1 z12.d, z12.d, z12.d\n"
+ "zip2 z19.d, z13.d, z13.d\n"
+ "zip1 z13.d, z13.d, z13.d\n"
+ "b 20f\n"
+ "18:" // Height 2: no bias
+ "tbz %x[flags], #0, 19f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x13, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x13]\n"
+ "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x13, #4, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
+ "ld1w { z14.s }, p6/Z, [x20]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
+ "ld1w { z15.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
+ "ld1w { z17.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
+ "ld1w { z19.s }, p1/Z, [x20, #5, MUL VL]\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "zip1 z12.d, z21.d, z18.d\n"
+ "zip2 z18.d, z21.d, z18.d\n"
+ "zip1 z13.d, z20.d, z19.d\n"
+ "zip2 z19.d, z20.d, z19.d\n"
+ "b 20f\n"
+ "19:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "20:" // Height 2: setup done
+ "mov x26, #0x0\n"
+ "21:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 22f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "cbnz x26, 23f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "b 23f\n"
+ "22:" // Height 2: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "23:" // Height 2: input setup done
+ "cmp x25, #0x4\n"
+ "ble 25f\n"
+ "24:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z24.s }, p0/Z, [x24]\n"
+ "ld1rqw { z20.s }, p0/Z, [x23]\n"
+ ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
+ ".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n"
+ "uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z23.h }, p7/Z, [x12]\n"
+ "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z20.h, z20.h, z20.h\n"
+ "trn1 z24.d, z24.d, z20.d\n"
+ "ld1h { z21.h }, p7/Z, [x11]\n"
+ "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
+ ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x10]\n"
+ "ld1h { z22.h }, p7/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x9]\n"
+ "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
+ ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x27]\n"
+ "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
+ ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
+ ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
+ ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
+ ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "addvl x28, x28, #2\n"
+ "addvl x27, x27, #2\n"
+ "bgt 24b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z24.s }, p0/Z, [x24]\n"
+ "ld1rqw { z20.s }, p0/Z, [x23]\n"
+ ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
+ ".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n"
+ "uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z23.h }, p7/Z, [x12]\n"
+ "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z20.h, z20.h, z20.h\n"
+ "trn1 z24.d, z24.d, z20.d\n"
+ "ld1h { z21.h }, p7/Z, [x11]\n"
+ "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
+ ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x10]\n"
+ "ld1h { z22.h }, p7/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x9]\n"
+ "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
+ ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x27]\n"
+ "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
+ ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
+ ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "addvl x28, x28, #2\n"
+ "addvl x27, x27, #2\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 21b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 z4.d, z8.d, z14.d\n"
+ "uzp2 z8.d, z8.d, z14.d\n"
+ "add x23, x13, x20, LSL #2\n"
+ "uzp1 z14.d, z9.d, z15.d\n"
+ "uzp2 z9.d, z9.d, z15.d\n"
+ "uzp1 z15.d, z10.d, z16.d\n"
+ "uzp2 z10.d, z10.d, z16.d\n"
+ "uzp1 z16.d, z11.d, z17.d\n"
+ "uzp2 z11.d, z11.d, z17.d\n"
+ "uzp1 z17.d, z12.d, z18.d\n"
+ "uzp2 z12.d, z12.d, z18.d\n"
+ "uzp1 z18.d, z13.d, z19.d\n"
+ "uzp2 z13.d, z13.d, z19.d\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z20.s }, p7/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z19.s }, p7/Z, [x20]\n"
+ "fmin z4.s, p7/M, z4.s, z20.s\n"
+ "fmin z14.s, p7/M, z14.s, z20.s\n"
+ "fmin z15.s, p7/M, z15.s, z20.s\n"
+ "fmin z16.s, p7/M, z16.s, z20.s\n"
+ "fmin z17.s, p7/M, z17.s, z20.s\n"
+ "fmin z18.s, p7/M, z18.s, z20.s\n"
+ "fmin z8.s, p7/M, z8.s, z20.s\n"
+ "fmin z9.s, p7/M, z9.s, z20.s\n"
+ "fmin z10.s, p7/M, z10.s, z20.s\n"
+ "fmin z11.s, p7/M, z11.s, z20.s\n"
+ "fmin z12.s, p7/M, z12.s, z20.s\n"
+ "fmin z13.s, p7/M, z13.s, z20.s\n"
+ "fmax z4.s, p7/M, z4.s, z19.s\n"
+ "fmax z14.s, p7/M, z14.s, z19.s\n"
+ "fmax z15.s, p7/M, z15.s, z19.s\n"
+ "fmax z16.s, p7/M, z16.s, z19.s\n"
+ "fmax z17.s, p7/M, z17.s, z19.s\n"
+ "fmax z18.s, p7/M, z18.s, z19.s\n"
+ "fmax z8.s, p7/M, z8.s, z19.s\n"
+ "fmax z9.s, p7/M, z9.s, z19.s\n"
+ "fmax z10.s, p7/M, z10.s, z19.s\n"
+ "fmax z11.s, p7/M, z11.s, z19.s\n"
+ "fmax z12.s, p7/M, z12.s, z19.s\n"
+ "fmax z13.s, p7/M, z13.s, z19.s\n"
+ "27:" // Height 2: No activation
+ "st1w { z4.s }, p6, [x13]\n"
+ "st1w { z14.s }, p5, [x13, #1, MUL VL]\n"
+ "st1w { z15.s }, p4, [x13, #2, MUL VL]\n"
+ "st1w { z16.s }, p3, [x13, #3, MUL VL]\n"
+ "st1w { z17.s }, p2, [x13, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x13, #5, MUL VL]\n"
+ "addvl x13, x13, #6\n"
+ "st1w { z8.s }, p6, [x23]\n"
+ "st1w { z9.s }, p5, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x23, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x23, #5, MUL VL]\n"
+ "28:" // Height 2: Writeback done
+ "decw x14, ALL, MUL #6\n"
+ "cmp x14, XZR\n"
+ "bgt 16b\n"
+ "b 58f\n"
+ "29:" // Height 3
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "30:" // Height 3: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #5\n"
+ "add x28, x9, x20, LSL #1\n"
+ "add x27, x28, x20, LSL #1\n"
+ "add x20, x27, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 31f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x27, x12\n"
+ "bgt 31f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x28, x12\n"
+ "bgt 31f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 31f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 31f\n"
+ "mov x11, x12\n"
+ "31:" // Height 3: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p6.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p5.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 32f\n"
+ "ld1w { z8.s }, p7/Z, [x15]\n"
+ "ld1w { z9.s }, p7/Z, [x15, #1, MUL VL]\n"
+ "zip2 z14.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x15, #3, MUL VL]\n"
+ "zip2 z15.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x15, #5, MUL VL]\n"
+ "zip2 z16.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z17.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "addvl x15, x15, #6\n"
+ "zip2 z18.d, z12.d, z12.d\n"
+ "zip1 z12.d, z12.d, z12.d\n"
+ "zip2 z19.d, z13.d, z13.d\n"
+ "zip1 z13.d, z13.d, z13.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z26.d, z14.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z27.d, z15.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z28.d, z16.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z29.d, z17.d\n"
+ "mov z24.d, z12.d\n"
+ "mov z30.d, z18.d\n"
+ "mov z25.d, z13.d\n"
+ "mov z31.d, z19.d\n"
+ "b 34f\n"
+ "32:" // Height 3: no bias
+ "tbz %x[flags], #0, 33f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x13, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x13]\n"
+ "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x13, #4, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
+ "ld1w { z14.s }, p6/Z, [x21]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
+ "ld1w { z15.s }, p5/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
+ "ld1w { z17.s }, p3/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
+ "ld1w { z19.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "ld1w { z21.s }, p6/Z, [x20]\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "ld1w { z22.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x20, #2, MUL VL]\n"
+ "zip1 z12.d, z24.d, z18.d\n"
+ "zip2 z18.d, z24.d, z18.d\n"
+ "ld1w { z24.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #4, MUL VL]\n"
+ "zip1 z13.d, z20.d, z19.d\n"
+ "zip2 z19.d, z20.d, z19.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #5, MUL VL]\n"
+ "zip1 z20.d, z21.d, z26.d\n"
+ "zip2 z26.d, z21.d, z26.d\n"
+ "zip1 z21.d, z22.d, z27.d\n"
+ "zip2 z27.d, z22.d, z27.d\n"
+ "zip1 z22.d, z23.d, z28.d\n"
+ "zip2 z28.d, z23.d, z28.d\n"
+ "zip1 z23.d, z24.d, z29.d\n"
+ "zip2 z29.d, z24.d, z29.d\n"
+ "zip1 z24.d, z25.d, z30.d\n"
+ "zip2 z30.d, z25.d, z30.d\n"
+ "zip1 z25.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 34f\n"
+ "33:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x26, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x26, 37f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "37:" // Height 3: input setup done
+ "cmp x25, #0x4\n"
+ "ble 39f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z3.h }, p7/Z, [x12]\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z1.h }, p7/Z, [x11]\n"
+ "trn1 z5.d, z5.d, z0.d\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
+ ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x10]\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
+ "cmp x25, #0x4\n"
+ ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x9]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x27]\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
+ "addvl x10, x10, #2\n"
+ ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
+ "addvl x9, x9, #2\n"
+ "addvl x28, x28, #2\n"
+ ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ "bgt 38b\n"
+ "39:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z3.h }, p7/Z, [x12]\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z1.h }, p7/Z, [x11]\n"
+ "trn1 z5.d, z5.d, z0.d\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
+ ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x10]\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x9]\n"
+ "addvl x10, x10, #2\n"
+ ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "addvl x28, x28, #2\n"
+ ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x27]\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
+ ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ "40:" // Height 3: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 35b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
+ "uzp1 z4.d, z8.d, z14.d\n"
+ "uzp2 z8.d, z8.d, z14.d\n"
+ "uzp1 z14.d, z9.d, z15.d\n"
+ "uzp2 z9.d, z9.d, z15.d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp1 z15.d, z10.d, z16.d\n"
+ "uzp2 z10.d, z10.d, z16.d\n"
+ "uzp1 z16.d, z11.d, z17.d\n"
+ "uzp2 z11.d, z11.d, z17.d\n"
+ "uzp1 z17.d, z12.d, z18.d\n"
+ "uzp2 z12.d, z12.d, z18.d\n"
+ "uzp1 z18.d, z13.d, z19.d\n"
+ "uzp2 z13.d, z13.d, z19.d\n"
+ "uzp1 z20.d, z20.d, z26.d\n"
+ "uzp1 z21.d, z21.d, z27.d\n"
+ "uzp1 z22.d, z22.d, z28.d\n"
+ "uzp1 z23.d, z23.d, z29.d\n"
+ "uzp1 z24.d, z24.d, z30.d\n"
+ "uzp1 z25.d, z25.d, z31.d\n"
+ "tbz %x[flags], #1, 41f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p7/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z19.s }, p7/Z, [x20]\n"
+ "fmin z4.s, p7/M, z4.s, z0.s\n"
+ "fmin z14.s, p7/M, z14.s, z0.s\n"
+ "fmin z15.s, p7/M, z15.s, z0.s\n"
+ "fmin z16.s, p7/M, z16.s, z0.s\n"
+ "fmin z17.s, p7/M, z17.s, z0.s\n"
+ "fmin z18.s, p7/M, z18.s, z0.s\n"
+ "fmin z8.s, p7/M, z8.s, z0.s\n"
+ "fmin z9.s, p7/M, z9.s, z0.s\n"
+ "fmin z10.s, p7/M, z10.s, z0.s\n"
+ "fmin z11.s, p7/M, z11.s, z0.s\n"
+ "fmin z12.s, p7/M, z12.s, z0.s\n"
+ "fmin z13.s, p7/M, z13.s, z0.s\n"
+ "fmin z20.s, p7/M, z20.s, z0.s\n"
+ "fmin z21.s, p7/M, z21.s, z0.s\n"
+ "fmin z22.s, p7/M, z22.s, z0.s\n"
+ "fmin z23.s, p7/M, z23.s, z0.s\n"
+ "fmin z24.s, p7/M, z24.s, z0.s\n"
+ "fmin z25.s, p7/M, z25.s, z0.s\n"
+ "fmax z4.s, p7/M, z4.s, z19.s\n"
+ "fmax z14.s, p7/M, z14.s, z19.s\n"
+ "fmax z15.s, p7/M, z15.s, z19.s\n"
+ "fmax z16.s, p7/M, z16.s, z19.s\n"
+ "fmax z17.s, p7/M, z17.s, z19.s\n"
+ "fmax z18.s, p7/M, z18.s, z19.s\n"
+ "fmax z8.s, p7/M, z8.s, z19.s\n"
+ "fmax z9.s, p7/M, z9.s, z19.s\n"
+ "fmax z10.s, p7/M, z10.s, z19.s\n"
+ "fmax z11.s, p7/M, z11.s, z19.s\n"
+ "fmax z12.s, p7/M, z12.s, z19.s\n"
+ "fmax z13.s, p7/M, z13.s, z19.s\n"
+ "fmax z20.s, p7/M, z20.s, z19.s\n"
+ "fmax z21.s, p7/M, z21.s, z19.s\n"
+ "fmax z22.s, p7/M, z22.s, z19.s\n"
+ "fmax z23.s, p7/M, z23.s, z19.s\n"
+ "fmax z24.s, p7/M, z24.s, z19.s\n"
+ "fmax z25.s, p7/M, z25.s, z19.s\n"
+ "41:" // Height 3: No activation
+ "st1w { z4.s }, p6, [x13]\n"
+ "st1w { z14.s }, p5, [x13, #1, MUL VL]\n"
+ "st1w { z15.s }, p4, [x13, #2, MUL VL]\n"
+ "st1w { z16.s }, p3, [x13, #3, MUL VL]\n"
+ "st1w { z17.s }, p2, [x13, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x13, #5, MUL VL]\n"
+ "addvl x13, x13, #6\n"
+ "st1w { z8.s }, p6, [x23]\n"
+ "st1w { z9.s }, p5, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x23, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x23, #5, MUL VL]\n"
+ "st1w { z20.s }, p6, [x22]\n"
+ "st1w { z21.s }, p5, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p4, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p2, [x22, #4, MUL VL]\n"
+ "st1w { z25.s }, p1, [x22, #5, MUL VL]\n"
+ "42:" // Height 3: Writeback done
+ "decw x14, ALL, MUL #6\n"
+ "cmp x14, XZR\n"
+ "bgt 30b\n"
+ "b 58f\n"
+ "43:" // Height 4
+ "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x21, #0x10\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x21, %x[output_ptr]\n"
+ "44:" // Height 4: Column loop
+ "ldr x12, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "add x11, x12, x20, LSL #1\n"
+ "add x10, x11, x20, LSL #1\n"
+ "add x9, x10, x20, LSL #1\n"
+ "cntw x21, ALL, MUL #5\n"
+ "add x28, x9, x20, LSL #1\n"
+ "add x27, x28, x20, LSL #1\n"
+ "add x20, x27, x20, LSL #1\n"
+ "cmp x14, x21\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "bgt 45f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x27, x12\n"
+ "bgt 45f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x28, x12\n"
+ "bgt 45f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x9, x12\n"
+ "bgt 45f\n"
+ "decw x21\n"
+ "cmp x14, x21\n"
+ "mov x10, x12\n"
+ "bgt 45f\n"
+ "mov x11, x12\n"
+ "45:" // Height 4: B setup done
+ "mov x20, #0x0\n"
+ "whilelt p6.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p5.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p4.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x14\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x14\n"
+ "cbz x15, 46f\n"
+ "ld1w { z8.s }, p7/Z, [x15]\n"
+ "ld1w { z9.s }, p7/Z, [x15, #1, MUL VL]\n"
+ "zip2 z14.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x15, #3, MUL VL]\n"
+ "zip2 z15.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x15, #5, MUL VL]\n"
+ "zip2 z16.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z17.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "addvl x15, x15, #6\n"
+ "zip2 z18.d, z12.d, z12.d\n"
+ "zip1 z12.d, z12.d, z12.d\n"
+ "zip2 z19.d, z13.d, z13.d\n"
+ "zip1 z13.d, z13.d, z13.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z26.d, z14.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z27.d, z15.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z28.d, z16.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z29.d, z17.d\n"
+ "mov z24.d, z12.d\n"
+ "mov z30.d, z18.d\n"
+ "mov z25.d, z13.d\n"
+ "mov z31.d, z19.d\n"
+ "b 48f\n"
+ "46:" // Height 4: no bias
+ "tbz %x[flags], #0, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x13, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x13]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x13, #4, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
+ "ld1w { z14.s }, p6/Z, [x22]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
+ "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
+ "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
+ "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n"
+ "ld1w { z21.s }, p6/Z, [x21]\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "zip1 z12.d, z24.d, z18.d\n"
+ "zip2 z18.d, z24.d, z18.d\n"
+ "ld1w { z24.s }, p3/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n"
+ "zip1 z13.d, z20.d, z19.d\n"
+ "zip2 z19.d, z20.d, z19.d\n"
+ "ld1w { z0.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "ld1w { z26.s }, p6/Z, [x20]\n"
+ "zip1 z20.d, z21.d, z26.d\n"
+ "zip2 z26.d, z21.d, z26.d\n"
+ "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n"
+ "zip1 z21.d, z22.d, z27.d\n"
+ "zip2 z27.d, z22.d, z27.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n"
+ "zip1 z22.d, z23.d, z28.d\n"
+ "zip2 z28.d, z23.d, z28.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n"
+ "zip1 z23.d, z24.d, z29.d\n"
+ "zip2 z29.d, z24.d, z29.d\n"
+ "zip1 z24.d, z25.d, z30.d\n"
+ "zip2 z30.d, z25.d, z30.d\n"
+ "zip1 z25.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 48f\n"
+ "47:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "48:" // Height 4: setup done
+ "mov x26, #0x0\n"
+ "49:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
+ "cbnz x26, 51f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
+ "b 51f\n"
+ "50:" // Height 4: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
+ "51:" // Height 4: input setup done
+ "cmp x25, #0x4\n"
+ "ble 53f\n"
+ "52:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z7.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x23]\n"
+ ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z3.h }, p7/Z, [x12]\n"
+ "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z1.h }, p7/Z, [x11]\n"
+ "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n"
+ "sub x25, x25, #0x4\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x10]\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
+ "cmp x25, #0x4\n"
+ ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x9]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x27]\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n"
+ ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n"
+ "addvl x28, x28, #2\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
+ "bgt 52b\n"
+ "53:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z7.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x23]\n"
+ ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z3.h }, p7/Z, [x12]\n"
+ "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z1.h }, p7/Z, [x11]\n"
+ "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n"
+ "addvl x12, x12, #2\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x10]\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x9]\n"
+ "addvl x10, x10, #2\n"
+ ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "addvl x28, x28, #2\n"
+ ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x27]\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n"
+ "addvl x27, x27, #2\n"
+ ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n"
+ ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n"
+ ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
+ "54:" // Height 4: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 49b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x13, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp1 z4.d, z8.d, z14.d\n"
+ "uzp2 z8.d, z8.d, z14.d\n"
+ "uzp1 z14.d, z9.d, z15.d\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 z9.d, z9.d, z15.d\n"
+ "uzp1 z15.d, z10.d, z16.d\n"
+ "uzp2 z10.d, z10.d, z16.d\n"
+ "uzp1 z16.d, z11.d, z17.d\n"
+ "uzp2 z11.d, z11.d, z17.d\n"
+ "uzp1 z17.d, z12.d, z18.d\n"
+ "uzp2 z12.d, z12.d, z18.d\n"
+ "uzp1 z18.d, z13.d, z19.d\n"
+ "uzp2 z13.d, z13.d, z19.d\n"
+ "uzp1 z19.d, z20.d, z26.d\n"
+ "uzp2 z20.d, z20.d, z26.d\n"
+ "uzp1 z26.d, z21.d, z27.d\n"
+ "uzp2 z21.d, z21.d, z27.d\n"
+ "uzp1 z27.d, z22.d, z28.d\n"
+ "uzp2 z22.d, z22.d, z28.d\n"
+ "uzp1 z28.d, z23.d, z29.d\n"
+ "uzp2 z23.d, z23.d, z29.d\n"
+ "uzp1 z29.d, z24.d, z30.d\n"
+ "uzp2 z24.d, z24.d, z30.d\n"
+ "uzp1 z30.d, z25.d, z31.d\n"
+ "uzp2 z25.d, z25.d, z31.d\n"
+ "tbz %x[flags], #1, 55f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p7/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p7/Z, [x20]\n"
+ "fmin z4.s, p7/M, z4.s, z1.s\n"
+ "fmin z14.s, p7/M, z14.s, z1.s\n"
+ "fmin z15.s, p7/M, z15.s, z1.s\n"
+ "fmin z16.s, p7/M, z16.s, z1.s\n"
+ "fmin z17.s, p7/M, z17.s, z1.s\n"
+ "fmin z18.s, p7/M, z18.s, z1.s\n"
+ "fmin z8.s, p7/M, z8.s, z1.s\n"
+ "fmin z9.s, p7/M, z9.s, z1.s\n"
+ "fmin z10.s, p7/M, z10.s, z1.s\n"
+ "fmin z11.s, p7/M, z11.s, z1.s\n"
+ "fmin z12.s, p7/M, z12.s, z1.s\n"
+ "fmin z13.s, p7/M, z13.s, z1.s\n"
+ "fmin z19.s, p7/M, z19.s, z1.s\n"
+ "fmin z26.s, p7/M, z26.s, z1.s\n"
+ "fmin z27.s, p7/M, z27.s, z1.s\n"
+ "fmin z28.s, p7/M, z28.s, z1.s\n"
+ "fmin z29.s, p7/M, z29.s, z1.s\n"
+ "fmin z30.s, p7/M, z30.s, z1.s\n"
+ "fmin z20.s, p7/M, z20.s, z1.s\n"
+ "fmin z21.s, p7/M, z21.s, z1.s\n"
+ "fmin z22.s, p7/M, z22.s, z1.s\n"
+ "fmin z23.s, p7/M, z23.s, z1.s\n"
+ "fmin z24.s, p7/M, z24.s, z1.s\n"
+ "fmin z25.s, p7/M, z25.s, z1.s\n"
+ "fmax z4.s, p7/M, z4.s, z0.s\n"
+ "fmax z14.s, p7/M, z14.s, z0.s\n"
+ "fmax z15.s, p7/M, z15.s, z0.s\n"
+ "fmax z16.s, p7/M, z16.s, z0.s\n"
+ "fmax z17.s, p7/M, z17.s, z0.s\n"
+ "fmax z18.s, p7/M, z18.s, z0.s\n"
+ "fmax z8.s, p7/M, z8.s, z0.s\n"
+ "fmax z9.s, p7/M, z9.s, z0.s\n"
+ "fmax z10.s, p7/M, z10.s, z0.s\n"
+ "fmax z11.s, p7/M, z11.s, z0.s\n"
+ "fmax z12.s, p7/M, z12.s, z0.s\n"
+ "fmax z13.s, p7/M, z13.s, z0.s\n"
+ "fmax z19.s, p7/M, z19.s, z0.s\n"
+ "fmax z26.s, p7/M, z26.s, z0.s\n"
+ "fmax z27.s, p7/M, z27.s, z0.s\n"
+ "fmax z28.s, p7/M, z28.s, z0.s\n"
+ "fmax z29.s, p7/M, z29.s, z0.s\n"
+ "fmax z30.s, p7/M, z30.s, z0.s\n"
+ "fmax z20.s, p7/M, z20.s, z0.s\n"
+ "fmax z21.s, p7/M, z21.s, z0.s\n"
+ "fmax z22.s, p7/M, z22.s, z0.s\n"
+ "fmax z23.s, p7/M, z23.s, z0.s\n"
+ "fmax z24.s, p7/M, z24.s, z0.s\n"
+ "fmax z25.s, p7/M, z25.s, z0.s\n"
+ "55:" // Height 4: No activation
+ "st1w { z4.s }, p6, [x13]\n"
+ "st1w { z14.s }, p5, [x13, #1, MUL VL]\n"
+ "st1w { z15.s }, p4, [x13, #2, MUL VL]\n"
+ "st1w { z16.s }, p3, [x13, #3, MUL VL]\n"
+ "st1w { z17.s }, p2, [x13, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x13, #5, MUL VL]\n"
+ "addvl x13, x13, #6\n"
+ "st1w { z8.s }, p6, [x23]\n"
+ "st1w { z9.s }, p5, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x23, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x23, #5, MUL VL]\n"
+ "st1w { z19.s }, p6, [x22]\n"
+ "st1w { z26.s }, p5, [x22, #1, MUL VL]\n"
+ "st1w { z27.s }, p4, [x22, #2, MUL VL]\n"
+ "st1w { z28.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z29.s }, p2, [x22, #4, MUL VL]\n"
+ "st1w { z30.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z20.s }, p6, [x21]\n"
+ "st1w { z21.s }, p5, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p4, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p3, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p2, [x21, #4, MUL VL]\n"
+ "st1w { z25.s }, p1, [x21, #5, MUL VL]\n"
+ "56:" // Height 4: Writeback done
+ "decw x14, ALL, MUL #6\n"
+ "cmp x14, XZR\n"
+ "bgt 44b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 58f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 57f\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "57:" // Update direct input
+ "mov x20, #0x10\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "58:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp
new file mode 100644
index 0000000000..1fe5f48da6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, size_t, \
+ float *, int, size_t, int
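+// Argument order matches the kernel definition in generic.cpp below:
+// A panel, B panel, B stride (in elements), C panel, A block count, N, K.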
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_ffinterleaved_bf16fp32_mmla_8x3VL( ARGLIST );
+
+class cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 3;
+ }
+ static unsigned int stripe_width()
+ {
+ return get_vector_length<float>();
+ }
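+ // Example, assuming a 256-bit SVE implementation: get_vector_length<float>()
+ // is 8, so each kernel call produces 8 rows by out_width() = 24 fp32
+ // columns, stored as three stripe_width()-wide stripes.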
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL2VL_BL64;
+ }
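+ // As the enum name suggests, weights are expected in fixed format, two
+ // vector-lengths wide and blocked in 64-bit (4 x bf16) groups along K to
+ // match what one BFMMLA instruction consumes.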
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 8, 6, 4, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 4, 2, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 39.90, 8.55, 4.42 };
+ }
+ }
+
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::V1:
+ return { 53.48, 4.23, 6.53 };
+ default:
+ return { 29.07, 2.76, 5.39 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_ffinterleaved_bf16fp32_mmla_8x3VL;
+ cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp
new file mode 100644
index 0000000000..576bd47039
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <cstddef>
+#include "../../bfloat.hpp"
+
+namespace arm_gemm {
+
+void sve_ffinterleaved_bf16fp32_mmla_8x3VL(
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ size_t B_stride,
+ float *Cpanel,
+ int ablocks,
+ size_t N,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ size_t N = {};
+ size_t B_stride = {};
+ const bfloat16 *cur_B_ptr = {};
+ } ka;
+
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+ ka.N = N;
+ ka.B_stride = B_stride;
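+ // K is stored pre-divided by the k_unroll of 4 (one BFMMLA covers four bf16
+ // values along K), minus one block that is peeled off and handled by the
+ // tail code after the main loop.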
+
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "cntw x23, ALL, MUL #2\n"
+ "add x22, x24, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "cmp x26, x23\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov %x[Apanel], x25\n"
+ "bgt 3f\n"
+ "decw x23\n"
+ "cmp x26, x23\n"
+ "mov x21, x24\n"
+ "bgt 3f\n"
+ "mov x22, x24\n"
+ "3:" // B setup done
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "ld1h { z4.h }, p0/Z, [x24]\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "ld1h { z5.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "addvl x24, x24, #2\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 5f\n"
+ "4:" // main loop head
+ "ld1rqh { z6.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
+ ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
+ ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
+ "ld1h { z7.h }, p0/Z, [x22]\n"
+ ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
+ ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6464e4da // bfmmla z26.s, z6.h, z4.h\n"
+ ".inst 0x6465e4dd // bfmmla z29.s, z6.h, z5.h\n"
+ "ld1h { z5.h }, p0/Z, [x21]\n"
+ "ld1h { z4.h }, p0/Z, [x21, #1, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n"
+ ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n"
+ ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x6467e4db // bfmmla z27.s, z6.h, z7.h\n"
+ ".inst 0x6463e4de // bfmmla z30.s, z6.h, z3.h\n"
+ "ld1h { z3.h }, p0/Z, [x24]\n"
+ ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n"
+ ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #16]\n"
+ ".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n"
+ ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #32]\n"
+ ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n"
+ ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n"
+ "ld1h { z7.h }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x6465e4dc // bfmmla z28.s, z6.h, z5.h\n"
+ ".inst 0x6464e4df // bfmmla z31.s, z6.h, z4.h\n"
+ "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #64]\n"
+ ".inst 0x6463e408 // bfmmla z8.s, z0.h, z3.h\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6463e42e // bfmmla z14.s, z1.h, z3.h\n"
+ ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
+ "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6467e4b7 // bfmmla z23.s, z5.h, z7.h\n"
+ "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x6463e4da // bfmmla z26.s, z6.h, z3.h\n"
+ ".inst 0x6467e4dd // bfmmla z29.s, z6.h, z7.h\n"
+ "ld1h { z3.h }, p0/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z7.h }, p0/Z, [x21, #3, MUL VL]\n"
+ ".inst 0x6462e409 // bfmmla z9.s, z0.h, z2.h\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6462e42f // bfmmla z15.s, z1.h, z2.h\n"
+ ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0x6462e4b5 // bfmmla z21.s, z5.h, z2.h\n"
+ ".inst 0x6464e4b8 // bfmmla z24.s, z5.h, z4.h\n"
+ "addvl x21, x21, #4\n"
+ ".inst 0x6462e4db // bfmmla z27.s, z6.h, z2.h\n"
+ ".inst 0x6464e4de // bfmmla z30.s, z6.h, z4.h\n"
+ "ld1h { z4.h }, p0/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x6463e40a // bfmmla z10.s, z0.h, z3.h\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #80]\n"
+ ".inst 0x6463e430 // bfmmla z16.s, z1.h, z3.h\n"
+ ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #96]\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
+ "ld1h { z5.h }, p0/Z, [x24, #3, MUL VL]\n"
+ ".inst 0x6463e4dc // bfmmla z28.s, z6.h, z3.h\n"
+ ".inst 0x6467e4df // bfmmla z31.s, z6.h, z7.h\n"
+ "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #112]\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "addvl x24, x24, #4\n"
+ "bge 4b\n"
+ "5:" // main loop skip
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
+ ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
+ ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
+ "ld1h { z6.h }, p0/Z, [x22]\n"
+ ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
+ ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6464e4fa // bfmmla z26.s, z7.h, z4.h\n"
+ ".inst 0x6465e4fd // bfmmla z29.s, z7.h, z5.h\n"
+ "ld1h { z5.h }, p0/Z, [x21]\n"
+ "ld1h { z4.h }, p0/Z, [x21, #1, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n"
+ "addvl x22, x22, #2\n"
+ ".inst 0x6466e4fb // bfmmla z27.s, z7.h, z6.h\n"
+ ".inst 0x6463e4fe // bfmmla z30.s, z7.h, z3.h\n"
+ "addvl x21, x21, #2\n"
+ ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n"
+ ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n"
+ ".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n"
+ ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n"
+ ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n"
+ ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n"
+ ".inst 0x6465e4fc // bfmmla z28.s, z7.h, z5.h\n"
+ ".inst 0x6464e4ff // bfmmla z31.s, z7.h, z4.h\n"
+ "cbz x20, 6f\n"
+ "ld1h { z1.h }, p0/Z, [x24]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
+ "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1h { z0.h }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x6460e4eb // bfmmla z11.s, z7.h, z0.h\n"
+ "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqh { z4.h }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x6461e4ce // bfmmla z14.s, z6.h, z1.h\n"
+ ".inst 0x6460e4d1 // bfmmla z17.s, z6.h, z0.h\n"
+ ".inst 0x6461e4b4 // bfmmla z20.s, z5.h, z1.h\n"
+ "ld1h { z3.h }, p0/Z, [x22]\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n"
+ "ld1h { z2.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z1.h }, p0/Z, [x21]\n"
+ "ld1h { z0.h }, p0/Z, [x21, #1, MUL VL]\n"
+ ".inst 0x6463e4e9 // bfmmla z9.s, z7.h, z3.h\n"
+ ".inst 0x6462e4ec // bfmmla z12.s, z7.h, z2.h\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x6463e4cf // bfmmla z15.s, z6.h, z3.h\n"
+ ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
+ ".inst 0x6463e4b5 // bfmmla z21.s, z5.h, z3.h\n"
+ ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
+ ".inst 0x6463e49b // bfmmla z27.s, z4.h, z3.h\n"
+ ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6461e4d0 // bfmmla z16.s, z6.h, z1.h\n"
+ ".inst 0x6460e4d3 // bfmmla z19.s, z6.h, z0.h\n"
+ ".inst 0x6461e4b6 // bfmmla z22.s, z5.h, z1.h\n"
+ ".inst 0x6460e4b9 // bfmmla z25.s, z5.h, z0.h\n"
+ ".inst 0x6461e49c // bfmmla z28.s, z4.h, z1.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ "6:" // multiply loop done
+ "decw x26, ALL, MUL #3\n"
+ "uzp1 z0.d, z8.d, z11.d\n"
+ "uzp2 z8.d, z8.d, z11.d\n"
+ "uzp1 z1.d, z9.d, z12.d\n"
+ "uzp2 z9.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z0.d, z10.d, z13.d\n"
+ "uzp2 z10.d, z10.d, z13.d\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "uzp1 z2.d, z14.d, z17.d\n"
+ "uzp2 z14.d, z14.d, z17.d\n"
+ "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "uzp1 z1.d, z15.d, z18.d\n"
+ "cmp x26, XZR\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "uzp2 z15.d, z15.d, z18.d\n"
+ "uzp1 z17.d, z16.d, z19.d\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "uzp2 z16.d, z16.d, z19.d\n"
+ "uzp1 z0.d, z20.d, z23.d\n"
+ "st1w { z2.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "uzp2 z20.d, z20.d, z23.d\n"
+ "uzp1 z23.d, z21.d, z24.d\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "uzp2 z21.d, z21.d, z24.d\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "uzp1 z19.d, z22.d, z25.d\n"
+ "uzp2 z22.d, z22.d, z25.d\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "uzp1 z18.d, z26.d, z29.d\n"
+ "uzp2 z26.d, z26.d, z29.d\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "uzp1 z17.d, z27.d, z30.d\n"
+ "uzp2 z27.d, z27.d, z30.d\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "uzp1 z16.d, z28.d, z31.d\n"
+ "uzp2 z28.d, z28.d, z31.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp
new file mode 100644
index 0000000000..60f1b699c3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ const __fp16 *, const __fp16 *, size_t, \
+ __fp16 *, int, size_t, int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_ffinterleaved_fp16_mla_8x3VL( ARGLIST );
+void sve_ffinterleaved_fp16_mla_8x3VL_a64fx( ARGLIST );
+
+class cls_sve_ffinterleaved_fp16_mla_8x3VL
+{
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<__fp16>() * 3;
+ }
+ static unsigned int stripe_width()
+ {
+ return get_vector_length<__fp16>();
+ }
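+ // Example, assuming a 256-bit SVE implementation: get_vector_length<__fp16>()
+ // is 16, so each kernel call produces 8 rows by out_width() = 48 fp16 columns.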
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL1VL_BL16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, __fp16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 25.53, 7.89, 3.82 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_ffinterleaved_fp16_mla_8x3VL;
+ cls_sve_ffinterleaved_fp16_mla_8x3VL(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_ffinterleaved_fp16_mla_8x3VL_a64fx;
+ break;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp
new file mode 100644
index 0000000000..69ddb21c31
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void sve_ffinterleaved_fp16_mla_8x3VL_a64fx(
+ const __fp16 *Apanel,
+ const __fp16 *Bpanel,
+ size_t B_stride,
+ __fp16 *Cpanel,
+ int ablocks,
+ size_t N,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const __fp16 *Bpanel = {};
+ size_t N = {};
+ size_t B_stride = {};
+ const __fp16 *cur_B_ptr = {};
+ } ka;
+
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+ ka.N = N;
+ ka.B_stride = B_stride;
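+ // This variant has a k_unroll of 1 (scalar-broadcast FMLA rather than
+ // BFMMLA), so K is used directly, again minus one peeled tail iteration.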
+
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "cnth x23, ALL, MUL #2\n"
+ "add x22, x24, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "cmp x26, x23\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov %x[Apanel], x25\n"
+ "bgt 3f\n"
+ "dech x23\n"
+ "cmp x26, x23\n"
+ "mov x21, x24\n"
+ "bgt 3f\n"
+ "mov x22, x24\n"
+ "3:" // B setup done
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "ld1h { z0.h }, p0/Z, [x24]\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "ld1h { z1.h }, p0/Z, [x22]\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "ld1h { z2.h }, p0/Z, [x21]\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 5f\n"
+ "4:" // main loop head
+ "fmla z8.h, p0/M, z0.h, z3.h\n"
+ "fmla z9.h, p0/M, z1.h, z3.h\n"
+ "sub x20, x20, #0x2\n"
+ "fmla z10.h, p0/M, z2.h, z3.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z11.h, p0/M, z0.h, z4.h\n"
+ "fmla z12.h, p0/M, z1.h, z4.h\n"
+ "fmla z13.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #10]\n"
+ "fmla z14.h, p0/M, z0.h, z5.h\n"
+ "fmla z15.h, p0/M, z1.h, z5.h\n"
+ "cmp x20, #0x2\n"
+ "fmla z16.h, p0/M, z2.h, z5.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z17.h, p0/M, z0.h, z6.h\n"
+ "fmla z18.h, p0/M, z1.h, z6.h\n"
+ "fmla z19.h, p0/M, z2.h, z6.h\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z0.h, z3.h\n"
+ "fmla z21.h, p0/M, z1.h, z3.h\n"
+ "fmla z22.h, p0/M, z2.h, z3.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z23.h, p0/M, z0.h, z7.h\n"
+ "fmla z24.h, p0/M, z1.h, z7.h\n"
+ "fmla z25.h, p0/M, z2.h, z7.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #18]\n"
+ "fmla z26.h, p0/M, z0.h, z4.h\n"
+ "fmla z27.h, p0/M, z1.h, z4.h\n"
+ "fmla z28.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z29.h, p0/M, z0.h, z6.h\n"
+ "ld1h { z7.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "fmla z30.h, p0/M, z1.h, z6.h\n"
+ "fmla z31.h, p0/M, z2.h, z6.h\n"
+ "ld1h { z6.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z2.h }, p0/Z, [x21, #1, MUL VL]\n"
+ "fmla z8.h, p0/M, z7.h, z3.h\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #22]\n"
+ "fmla z9.h, p0/M, z6.h, z3.h\n"
+ "fmla z10.h, p0/M, z2.h, z3.h\n"
+ "fmla z11.h, p0/M, z7.h, z5.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z12.h, p0/M, z6.h, z5.h\n"
+ "fmla z13.h, p0/M, z2.h, z5.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #26]\n"
+ "fmla z14.h, p0/M, z7.h, z4.h\n"
+ "fmla z15.h, p0/M, z6.h, z4.h\n"
+ "addvl x24, x24, #2\n"
+ "fmla z16.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z17.h, p0/M, z7.h, z1.h\n"
+ "fmla z18.h, p0/M, z6.h, z1.h\n"
+ "fmla z19.h, p0/M, z2.h, z1.h\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #30]\n"
+ "addvl x22, x22, #2\n"
+ "addvl x21, x21, #2\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z20.h, p0/M, z7.h, z3.h\n"
+ "fmla z21.h, p0/M, z6.h, z3.h\n"
+ "fmla z22.h, p0/M, z2.h, z3.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "fmla z23.h, p0/M, z7.h, z5.h\n"
+ "fmla z24.h, p0/M, z6.h, z5.h\n"
+ "fmla z25.h, p0/M, z2.h, z5.h\n"
+ "fmla z26.h, p0/M, z7.h, z0.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
+ "fmla z27.h, p0/M, z6.h, z0.h\n"
+ "fmla z28.h, p0/M, z2.h, z0.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
+ "fmla z29.h, p0/M, z7.h, z1.h\n"
+ "ld1h { z0.h }, p0/Z, [x24]\n"
+ "fmla z30.h, p0/M, z6.h, z1.h\n"
+ "fmla z31.h, p0/M, z2.h, z1.h\n"
+ "ld1h { z1.h }, p0/Z, [x22]\n"
+ "ld1h { z2.h }, p0/Z, [x21]\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
+ "bge 4b\n"
+ "5:" // main loop skip
+ "fmla z8.h, p0/M, z0.h, z3.h\n"
+ "fmla z9.h, p0/M, z1.h, z3.h\n"
+ "addvl x24, x24, #1\n"
+ "fmla z10.h, p0/M, z2.h, z3.h\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z11.h, p0/M, z0.h, z4.h\n"
+ "fmla z12.h, p0/M, z1.h, z4.h\n"
+ "fmla z13.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n"
+ "fmla z14.h, p0/M, z0.h, z5.h\n"
+ "fmla z15.h, p0/M, z1.h, z5.h\n"
+ "addvl x22, x22, #1\n"
+ "fmla z16.h, p0/M, z2.h, z5.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z17.h, p0/M, z0.h, z6.h\n"
+ "fmla z18.h, p0/M, z1.h, z6.h\n"
+ "fmla z19.h, p0/M, z2.h, z6.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z0.h, z7.h\n"
+ "fmla z21.h, p0/M, z1.h, z7.h\n"
+ "addvl x21, x21, #1\n"
+ "fmla z22.h, p0/M, z2.h, z7.h\n"
+ "fmla z23.h, p0/M, z0.h, z4.h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla z24.h, p0/M, z1.h, z4.h\n"
+ "fmla z25.h, p0/M, z2.h, z4.h\n"
+ "fmla z26.h, p0/M, z0.h, z5.h\n"
+ "fmla z27.h, p0/M, z1.h, z5.h\n"
+ "fmla z28.h, p0/M, z2.h, z5.h\n"
+ "fmla z29.h, p0/M, z0.h, z3.h\n"
+ "fmla z30.h, p0/M, z1.h, z3.h\n"
+ "fmla z31.h, p0/M, z2.h, z3.h\n"
+ "cbz x20, 6f\n"
+ "ld1h { z6.h }, p0/Z, [x24]\n"
+ "ld1h { z5.h }, p0/Z, [x22]\n"
+ "ld1h { z4.h }, p0/Z, [x21]\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "fmla z8.h, p0/M, z6.h, z3.h\n"
+ "ld1rh { z2.h }, p0/Z, [%x[Apanel], #2]\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #4]\n"
+ "fmla z9.h, p0/M, z5.h, z3.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #6]\n"
+ "fmla z10.h, p0/M, z4.h, z3.h\n"
+ "fmla z11.h, p0/M, z6.h, z2.h\n"
+ "fmla z12.h, p0/M, z5.h, z2.h\n"
+ "fmla z13.h, p0/M, z4.h, z2.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z14.h, p0/M, z6.h, z1.h\n"
+ "fmla z15.h, p0/M, z5.h, z1.h\n"
+ "ld1rh { z2.h }, p0/Z, [%x[Apanel], #10]\n"
+ "fmla z16.h, p0/M, z4.h, z1.h\n"
+ "fmla z17.h, p0/M, z6.h, z0.h\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z18.h, p0/M, z5.h, z0.h\n"
+ "fmla z19.h, p0/M, z4.h, z0.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z6.h, z3.h\n"
+ "fmla z21.h, p0/M, z5.h, z3.h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla z22.h, p0/M, z4.h, z3.h\n"
+ "fmla z23.h, p0/M, z6.h, z2.h\n"
+ "fmla z24.h, p0/M, z5.h, z2.h\n"
+ "fmla z25.h, p0/M, z4.h, z2.h\n"
+ "fmla z26.h, p0/M, z6.h, z1.h\n"
+ "fmla z27.h, p0/M, z5.h, z1.h\n"
+ "fmla z28.h, p0/M, z4.h, z1.h\n"
+ "fmla z29.h, p0/M, z6.h, z0.h\n"
+ "fmla z30.h, p0/M, z5.h, z0.h\n"
+ "fmla z31.h, p0/M, z4.h, z0.h\n"
+ "6:" // multiply loop done
+ "dech x26, ALL, MUL #3\n"
+ "st1h { z8.h }, p0, [%x[Cpanel]]\n"
+ "cmp x26, XZR\n"
+ "st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1h { z12.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1h { z13.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1h { z14.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1h { z15.h }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1h { z16.h }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1h { z17.h }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1h { z18.h }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1h { z19.h }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1h { z20.h }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1h { z21.h }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1h { z22.h }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1h { z23.h }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1h { z24.h }, p0, [%x[Cpanel]]\n"
+ "st1h { z25.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1h { z26.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1h { z28.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1h { z29.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1h { z30.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1h { z31.h }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
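
The A64FX variant above broadcasts each A element with ld1rh and uses the predicated vector-by-vector form of FMLA, where the generic kernel that follows uses indexed FMLA on ld1rqh quad loads; numerically both compute the same interleaved panel product: per k step, an outer product of eight A values against a 3*VL-wide row of B is accumulated into the 8x(3*VL) C panel. A minimal scalar sketch of that contract, assuming the k-major panel layouts the load offsets imply (all names here are illustrative, not the library's):

#include <cstddef>

// Scalar model of one 8 x W panel multiply, W = 3 * VL elements. Apanel is
// k-major (K rows of 8 values), Bpanel is k-major (K rows of W values), and
// Cpanel is row-major (8 rows of W accumulators), matching the load/store
// order in the listings above.
template <typename T>
void reference_interleaved_mla(const T *Apanel, const T *Bpanel, T *Cpanel,
                               std::size_t K, std::size_t W) {
    for (std::size_t k = 0; k < K; ++k) {
        for (std::size_t r = 0; r < 8; ++r) {
            const T a = Apanel[k * 8 + r];
            for (std::size_t c = 0; c < W; ++c) {
                Cpanel[r * W + c] += a * Bpanel[k * W + c];
            }
        }
    }
}

With W set to three vector lengths of the element type, a reference like this is a convenient oracle when checking the generated kernels.
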
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp
new file mode 100644
index 0000000000..23503fa108
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void sve_ffinterleaved_fp16_mla_8x3VL(
+ const __fp16 *Apanel,
+ const __fp16 *Bpanel,
+ size_t B_stride,
+ __fp16 *Cpanel,
+ int ablocks,
+ size_t N,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const __fp16 *Bpanel = {};
+ size_t N = {};
+ size_t B_stride = {};
+ const __fp16 *cur_B_ptr = {};
+ } ka;
+
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+ ka.N = N;
+ ka.B_stride = B_stride;
+
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "cnth x23, ALL, MUL #2\n"
+ "add x22, x24, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "cmp x26, x23\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov %x[Apanel], x25\n"
+ "bgt 3f\n"
+ "dech x23\n"
+ "cmp x26, x23\n"
+ "mov x21, x24\n"
+ "bgt 3f\n"
+ "mov x22, x24\n"
+ "3:" // B setup done
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "ld1h { z2.h }, p0/Z, [x24]\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "ld1h { z3.h }, p0/Z, [x22]\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "ld1h { z4.h }, p0/Z, [x21]\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 5f\n"
+ "4:" // main loop head
+ "fmla z8.h, z2.h, z0.h[0]\n"
+ "fmla z11.h, z2.h, z0.h[1]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z14.h, z2.h, z0.h[2]\n"
+ "fmla z17.h, z2.h, z0.h[3]\n"
+ "ld1h { z6.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "fmla z20.h, z2.h, z0.h[4]\n"
+ "fmla z23.h, z2.h, z0.h[5]\n"
+ "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "fmla z26.h, z2.h, z0.h[6]\n"
+ "fmla z29.h, z2.h, z0.h[7]\n"
+ "ld1h { z1.h }, p0/Z, [x21, #1, MUL VL]\n"
+ "fmla z9.h, z3.h, z0.h[0]\n"
+ "fmla z12.h, z3.h, z0.h[1]\n"
+ "addvl x24, x24, #2\n"
+ "fmla z15.h, z3.h, z0.h[2]\n"
+ "fmla z18.h, z3.h, z0.h[3]\n"
+ "addvl x22, x22, #2\n"
+ "fmla z21.h, z3.h, z0.h[4]\n"
+ "fmla z24.h, z3.h, z0.h[5]\n"
+ "addvl x21, x21, #2\n"
+ "fmla z27.h, z3.h, z0.h[6]\n"
+ "fmla z30.h, z3.h, z0.h[7]\n"
+ "sub x20, x20, #0x2\n"
+ "fmla z10.h, z4.h, z0.h[0]\n"
+ "fmla z13.h, z4.h, z0.h[1]\n"
+ "cmp x20, #0x2\n"
+ "fmla z16.h, z4.h, z0.h[2]\n"
+ "fmla z19.h, z4.h, z0.h[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z22.h, z4.h, z0.h[4]\n"
+ "fmla z25.h, z4.h, z0.h[5]\n"
+ "ld1h { z2.h }, p0/Z, [x24]\n"
+ "fmla z28.h, z4.h, z0.h[6]\n"
+ "fmla z31.h, z4.h, z0.h[7]\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "fmla z8.h, z6.h, z7.h[0]\n"
+ "fmla z11.h, z6.h, z7.h[1]\n"
+ "ld1h { z3.h }, p0/Z, [x22]\n"
+ "fmla z14.h, z6.h, z7.h[2]\n"
+ "fmla z17.h, z6.h, z7.h[3]\n"
+ "ld1h { z4.h }, p0/Z, [x21]\n"
+ "fmla z20.h, z6.h, z7.h[4]\n"
+ "fmla z23.h, z6.h, z7.h[5]\n"
+ "fmla z26.h, z6.h, z7.h[6]\n"
+ "fmla z29.h, z6.h, z7.h[7]\n"
+ "fmla z9.h, z5.h, z7.h[0]\n"
+ "fmla z12.h, z5.h, z7.h[1]\n"
+ "fmla z15.h, z5.h, z7.h[2]\n"
+ "fmla z18.h, z5.h, z7.h[3]\n"
+ "fmla z21.h, z5.h, z7.h[4]\n"
+ "fmla z24.h, z5.h, z7.h[5]\n"
+ "fmla z27.h, z5.h, z7.h[6]\n"
+ "fmla z30.h, z5.h, z7.h[7]\n"
+ "fmla z10.h, z1.h, z7.h[0]\n"
+ "fmla z13.h, z1.h, z7.h[1]\n"
+ "fmla z16.h, z1.h, z7.h[2]\n"
+ "fmla z19.h, z1.h, z7.h[3]\n"
+ "fmla z22.h, z1.h, z7.h[4]\n"
+ "fmla z25.h, z1.h, z7.h[5]\n"
+ "fmla z28.h, z1.h, z7.h[6]\n"
+ "fmla z31.h, z1.h, z7.h[7]\n"
+ "bge 4b\n"
+ "5:" // main loop skip
+ "fmla z8.h, z2.h, z0.h[0]\n"
+ "fmla z11.h, z2.h, z0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla z14.h, z2.h, z0.h[2]\n"
+ "fmla z17.h, z2.h, z0.h[3]\n"
+ "addvl x24, x24, #1\n"
+ "fmla z20.h, z2.h, z0.h[4]\n"
+ "fmla z23.h, z2.h, z0.h[5]\n"
+ "addvl x22, x22, #1\n"
+ "fmla z26.h, z2.h, z0.h[6]\n"
+ "fmla z29.h, z2.h, z0.h[7]\n"
+ "addvl x21, x21, #1\n"
+ "fmla z9.h, z3.h, z0.h[0]\n"
+ "fmla z12.h, z3.h, z0.h[1]\n"
+ "fmla z15.h, z3.h, z0.h[2]\n"
+ "fmla z18.h, z3.h, z0.h[3]\n"
+ "fmla z21.h, z3.h, z0.h[4]\n"
+ "fmla z24.h, z3.h, z0.h[5]\n"
+ "fmla z27.h, z3.h, z0.h[6]\n"
+ "fmla z30.h, z3.h, z0.h[7]\n"
+ "fmla z10.h, z4.h, z0.h[0]\n"
+ "fmla z13.h, z4.h, z0.h[1]\n"
+ "fmla z16.h, z4.h, z0.h[2]\n"
+ "fmla z19.h, z4.h, z0.h[3]\n"
+ "fmla z22.h, z4.h, z0.h[4]\n"
+ "fmla z25.h, z4.h, z0.h[5]\n"
+ "fmla z28.h, z4.h, z0.h[6]\n"
+ "fmla z31.h, z4.h, z0.h[7]\n"
+ "cbz x20, 6f\n"
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1h { z2.h }, p0/Z, [x24]\n"
+ "fmla z8.h, z2.h, z3.h[0]\n"
+ "ld1h { z1.h }, p0/Z, [x22]\n"
+ "ld1h { z0.h }, p0/Z, [x21]\n"
+ "fmla z11.h, z2.h, z3.h[1]\n"
+ "fmla z14.h, z2.h, z3.h[2]\n"
+ "fmla z17.h, z2.h, z3.h[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla z20.h, z2.h, z3.h[4]\n"
+ "fmla z23.h, z2.h, z3.h[5]\n"
+ "fmla z26.h, z2.h, z3.h[6]\n"
+ "fmla z29.h, z2.h, z3.h[7]\n"
+ "fmla z9.h, z1.h, z3.h[0]\n"
+ "fmla z12.h, z1.h, z3.h[1]\n"
+ "fmla z15.h, z1.h, z3.h[2]\n"
+ "fmla z18.h, z1.h, z3.h[3]\n"
+ "fmla z21.h, z1.h, z3.h[4]\n"
+ "fmla z24.h, z1.h, z3.h[5]\n"
+ "fmla z27.h, z1.h, z3.h[6]\n"
+ "fmla z30.h, z1.h, z3.h[7]\n"
+ "fmla z10.h, z0.h, z3.h[0]\n"
+ "fmla z13.h, z0.h, z3.h[1]\n"
+ "fmla z16.h, z0.h, z3.h[2]\n"
+ "fmla z19.h, z0.h, z3.h[3]\n"
+ "fmla z22.h, z0.h, z3.h[4]\n"
+ "fmla z25.h, z0.h, z3.h[5]\n"
+ "fmla z28.h, z0.h, z3.h[6]\n"
+ "fmla z31.h, z0.h, z3.h[7]\n"
+ "6:" // multiply loop done
+ "dech x26, ALL, MUL #3\n"
+ "st1h { z8.h }, p0, [%x[Cpanel]]\n"
+ "cmp x26, XZR\n"
+ "st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1h { z12.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1h { z13.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1h { z14.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1h { z15.h }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1h { z16.h }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1h { z17.h }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1h { z18.h }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1h { z19.h }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1h { z20.h }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1h { z21.h }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1h { z22.h }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1h { z23.h }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1h { z24.h }, p0, [%x[Cpanel]]\n"
+ "st1h { z25.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1h { z26.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1h { z28.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1h { z29.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1h { z30.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1h { z31.h }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
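
Both fp16 kernels share one k-loop shape: ka.K is K - 1 on entry, the main loop retires two k steps per pass, one step always runs at the "main loop skip" label, and a final step is guarded by cbz for odd K. A control-flow sketch under the same K >= 1 assumption the generated code makes (k_step() is a placeholder for one full FMLA block, not a library function):

// k_step() stands in for one outer-product accumulation over the 8 x 3VL
// accumulators; it is illustrative only.
static void k_step() { /* one FMLA block */ }

void k_loop_shape(int K) {
    int k = K - 1;      // ka.K = (K/1) - 1: one step is peeled up front
    while (k >= 2) {    // "4:" main loop head, unrolled by two
        k_step();
        k_step();
        k -= 2;
    }
    k_step();           // "5:" main loop skip: unconditional peeled step
    if (k != 0) {       // cbz x20, 6f: extra step for odd K
        k_step();
    }
}
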
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp
new file mode 100644
index 0000000000..ac6986913d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../kernel_weight_format.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ const float *, const float *, size_t, \
+ float *, int, size_t, int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_ffinterleaved_fp32_mla_8x3VL( ARGLIST );
+void sve_ffinterleaved_fp32_mla_8x3VL_a64fx( ARGLIST );
+
+class cls_sve_ffinterleaved_fp32_mla_8x3VL
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 3;
+ }
+ static unsigned int stripe_width()
+ {
+ return get_vector_length<float>();
+ }
+
+ static KernelWeightFormat kernel_weight_format()
+ {
+ return KernelWeightFormat::VL1VL_BL32;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 13.51, 9.27, 3.98 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_ffinterleaved_fp32_mla_8x3VL;
+ cls_sve_ffinterleaved_fp32_mla_8x3VL(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_ffinterleaved_fp32_mla_8x3VL_a64fx;
+ break;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
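
The header above follows the family's descriptor pattern: blocking geometry as static members, a function-pointer member defaulted to the generic entry point, and a constructor that swaps in the A64FX-tuned variant by CPU model. The same pattern restated in isolation, with stand-in type and function names:

#include <cstddef>

// Minimal restatement of the descriptor pattern, independent of the library.
// CpuInfoLike and both kernel functions are stand-ins.
struct CpuInfoLike { bool is_a64fx; };

using KernFn = void (*)(const float *, const float *, std::size_t,
                        float *, int, std::size_t, int);

void generic_kernel(const float *, const float *, std::size_t,
                    float *, int, std::size_t, int) {}
void a64fx_kernel(const float *, const float *, std::size_t,
                  float *, int, std::size_t, int) {}

struct KernelDescriptor {
    KernFn kernel = generic_kernel;   // default to the generic kernel
    explicit KernelDescriptor(const CpuInfoLike *ci) {
        if (ci->is_a64fx) {
            kernel = a64fx_kernel;    // tuned variant selected at runtime
        }
    }
};

The intent is that adding another tuned variant only touches the constructor switch.
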
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp
new file mode 100644
index 0000000000..c65c3a3ce4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void sve_ffinterleaved_fp32_mla_8x3VL_a64fx(
+ const float *Apanel,
+ const float *Bpanel,
+ size_t B_stride,
+ float *Cpanel,
+ int ablocks,
+ size_t N,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const float *Bpanel = {};
+ size_t N = {};
+ size_t B_stride = {};
+ const float *cur_B_ptr = {};
+ } ka;
+
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+ ka.N = N;
+ ka.B_stride = B_stride;
+
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "cntw x23, ALL, MUL #2\n"
+ "add x22, x24, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "cmp x26, x23\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov %x[Apanel], x25\n"
+ "bgt 3f\n"
+ "decw x23\n"
+ "cmp x26, x23\n"
+ "mov x21, x24\n"
+ "bgt 3f\n"
+ "mov x22, x24\n"
+ "3:" // B setup done
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "ld1w { z0.s }, p0/Z, [x24]\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "ld1w { z2.s }, p0/Z, [x21]\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 5f\n"
+ "4:" // main loop head
+ "fmla z8.s, p0/M, z0.s, z3.s\n"
+ "fmla z9.s, p0/M, z1.s, z3.s\n"
+ "sub x20, x20, #0x2\n"
+ "fmla z10.s, p0/M, z2.s, z3.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z11.s, p0/M, z0.s, z4.s\n"
+ "fmla z12.s, p0/M, z1.s, z4.s\n"
+ "fmla z13.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z14.s, p0/M, z0.s, z5.s\n"
+ "fmla z15.s, p0/M, z1.s, z5.s\n"
+ "cmp x20, #0x2\n"
+ "fmla z16.s, p0/M, z2.s, z5.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z17.s, p0/M, z0.s, z6.s\n"
+ "fmla z18.s, p0/M, z1.s, z6.s\n"
+ "fmla z19.s, p0/M, z2.s, z6.s\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z0.s, z3.s\n"
+ "fmla z21.s, p0/M, z1.s, z3.s\n"
+ "fmla z22.s, p0/M, z2.s, z3.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
+ "fmla z23.s, p0/M, z0.s, z7.s\n"
+ "fmla z24.s, p0/M, z1.s, z7.s\n"
+ "fmla z25.s, p0/M, z2.s, z7.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #36]\n"
+ "fmla z26.s, p0/M, z0.s, z4.s\n"
+ "fmla z27.s, p0/M, z1.s, z4.s\n"
+ "fmla z28.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #40]\n"
+ "fmla z29.s, p0/M, z0.s, z6.s\n"
+ "ld1w { z7.s }, p0/Z, [x24, #1, MUL VL]\n"
+ "fmla z30.s, p0/M, z1.s, z6.s\n"
+ "fmla z31.s, p0/M, z2.s, z6.s\n"
+ "ld1w { z6.s }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z2.s }, p0/Z, [x21, #1, MUL VL]\n"
+ "fmla z8.s, p0/M, z7.s, z3.s\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
+ "fmla z9.s, p0/M, z6.s, z3.s\n"
+ "fmla z10.s, p0/M, z2.s, z3.s\n"
+ "fmla z11.s, p0/M, z7.s, z5.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
+ "fmla z12.s, p0/M, z6.s, z5.s\n"
+ "fmla z13.s, p0/M, z2.s, z5.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #52]\n"
+ "fmla z14.s, p0/M, z7.s, z4.s\n"
+ "fmla z15.s, p0/M, z6.s, z4.s\n"
+ "addvl x24, x24, #2\n"
+ "fmla z16.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
+ "fmla z17.s, p0/M, z7.s, z1.s\n"
+ "fmla z18.s, p0/M, z6.s, z1.s\n"
+ "fmla z19.s, p0/M, z2.s, z1.s\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
+ "addvl x22, x22, #2\n"
+ "addvl x21, x21, #2\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "fmla z20.s, p0/M, z7.s, z3.s\n"
+ "fmla z21.s, p0/M, z6.s, z3.s\n"
+ "fmla z22.s, p0/M, z2.s, z3.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "fmla z23.s, p0/M, z7.s, z5.s\n"
+ "fmla z24.s, p0/M, z6.s, z5.s\n"
+ "fmla z25.s, p0/M, z2.s, z5.s\n"
+ "fmla z26.s, p0/M, z7.s, z0.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "fmla z27.s, p0/M, z6.s, z0.s\n"
+ "fmla z28.s, p0/M, z2.s, z0.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z29.s, p0/M, z7.s, z1.s\n"
+ "ld1w { z0.s }, p0/Z, [x24]\n"
+ "fmla z30.s, p0/M, z6.s, z1.s\n"
+ "fmla z31.s, p0/M, z2.s, z1.s\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ "ld1w { z2.s }, p0/Z, [x21]\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "bge 4b\n"
+ "5:" // main loop skip
+ "fmla z8.s, p0/M, z0.s, z3.s\n"
+ "fmla z9.s, p0/M, z1.s, z3.s\n"
+ "addvl x24, x24, #1\n"
+ "fmla z10.s, p0/M, z2.s, z3.s\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z11.s, p0/M, z0.s, z4.s\n"
+ "fmla z12.s, p0/M, z1.s, z4.s\n"
+ "fmla z13.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z14.s, p0/M, z0.s, z5.s\n"
+ "fmla z15.s, p0/M, z1.s, z5.s\n"
+ "addvl x22, x22, #1\n"
+ "fmla z16.s, p0/M, z2.s, z5.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z17.s, p0/M, z0.s, z6.s\n"
+ "fmla z18.s, p0/M, z1.s, z6.s\n"
+ "fmla z19.s, p0/M, z2.s, z6.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z0.s, z7.s\n"
+ "fmla z21.s, p0/M, z1.s, z7.s\n"
+ "addvl x21, x21, #1\n"
+ "fmla z22.s, p0/M, z2.s, z7.s\n"
+ "fmla z23.s, p0/M, z0.s, z4.s\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z24.s, p0/M, z1.s, z4.s\n"
+ "fmla z25.s, p0/M, z2.s, z4.s\n"
+ "fmla z26.s, p0/M, z0.s, z5.s\n"
+ "fmla z27.s, p0/M, z1.s, z5.s\n"
+ "fmla z28.s, p0/M, z2.s, z5.s\n"
+ "fmla z29.s, p0/M, z0.s, z3.s\n"
+ "fmla z30.s, p0/M, z1.s, z3.s\n"
+ "fmla z31.s, p0/M, z2.s, z3.s\n"
+ "cbz x20, 6f\n"
+ "ld1w { z6.s }, p0/Z, [x24]\n"
+ "ld1w { z5.s }, p0/Z, [x22]\n"
+ "ld1w { z4.s }, p0/Z, [x21]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "fmla z8.s, p0/M, z6.s, z3.s\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z9.s, p0/M, z5.s, z3.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z10.s, p0/M, z4.s, z3.s\n"
+ "fmla z11.s, p0/M, z6.s, z2.s\n"
+ "fmla z12.s, p0/M, z5.s, z2.s\n"
+ "fmla z13.s, p0/M, z4.s, z2.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z14.s, p0/M, z6.s, z1.s\n"
+ "fmla z15.s, p0/M, z5.s, z1.s\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z16.s, p0/M, z4.s, z1.s\n"
+ "fmla z17.s, p0/M, z6.s, z0.s\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z18.s, p0/M, z5.s, z0.s\n"
+ "fmla z19.s, p0/M, z4.s, z0.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z6.s, z3.s\n"
+ "fmla z21.s, p0/M, z5.s, z3.s\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z22.s, p0/M, z4.s, z3.s\n"
+ "fmla z23.s, p0/M, z6.s, z2.s\n"
+ "fmla z24.s, p0/M, z5.s, z2.s\n"
+ "fmla z25.s, p0/M, z4.s, z2.s\n"
+ "fmla z26.s, p0/M, z6.s, z1.s\n"
+ "fmla z27.s, p0/M, z5.s, z1.s\n"
+ "fmla z28.s, p0/M, z4.s, z1.s\n"
+ "fmla z29.s, p0/M, z6.s, z0.s\n"
+ "fmla z30.s, p0/M, z5.s, z0.s\n"
+ "fmla z31.s, p0/M, z4.s, z0.s\n"
+ "6:" // multiply loop done
+ "decw x26, ALL, MUL #3\n"
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "cmp x26, XZR\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
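
The "B setup done" block (labels 2: to 3:) in both fp32 kernels derives three B column-block pointers at B_stride intervals from cur_B_ptr, then aliases the unused third (and, if needed, second) block back onto the first when fewer than two, or one, vector lengths of columns remain, keeping their loads in bounds. A sketch of that logic (pointer names follow registers x24/x22/x21; everything here is illustrative):

#include <cstddef>

// B_stride is in elements: the assembly scales it by LSL #2 for fp32 and
// LSL #1 for fp16 to get byte offsets.
void b_setup(const float *cur_B, std::size_t B_stride, std::size_t vl,
             std::size_t n_remaining,
             const float *&b0, const float *&b1, const float *&b2,
             const float *&next_B) {
    b0     = cur_B;                   // x24
    b1     = cur_B + B_stride;        // add x22, x24, x20, LSL #2
    b2     = cur_B + 2 * B_stride;    // add x21, x22, x20, LSL #2
    next_B = cur_B + 3 * B_stride;    // stored back to cur_B_ptr
    if (n_remaining <= 2 * vl) {      // cmp x26, x23; bgt 3f
        b2 = b0;                      // third block unused: alias it safely
        if (n_remaining <= vl) {      // decw x23; cmp x26, x23; bgt 3f
            b1 = b0;                  // second block unused too
        }
    }
}
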
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp
new file mode 100644
index 0000000000..4b20be6f01
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void sve_ffinterleaved_fp32_mla_8x3VL(
+ const float *Apanel,
+ const float *Bpanel,
+ size_t B_stride,
+ float *Cpanel,
+ int ablocks,
+ size_t N,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const float *Bpanel = {};
+ size_t N = {};
+ size_t B_stride = {};
+ const float *cur_B_ptr = {};
+ } ka;
+
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+ ka.N = N;
+ ka.B_stride = B_stride;
+
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov x25, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
+ "cntw x23, ALL, MUL #2\n"
+ "add x22, x24, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "cmp x26, x23\n"
+ "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+ "mov %x[Apanel], x25\n"
+ "bgt 3f\n"
+ "decw x23\n"
+ "cmp x26, x23\n"
+ "mov x21, x24\n"
+ "bgt 3f\n"
+ "mov x22, x24\n"
+ "3:" // B setup done
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "cmp x20, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "ld1w { z4.s }, p0/Z, [x24]\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "ld1w { z5.s }, p0/Z, [x22]\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "ld1w { z6.s }, p0/Z, [x21]\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 5f\n"
+ "4:" // main loop head
+ "fmla z8.s, z4.s, z0.s[0]\n"
+ "fmla z11.s, z4.s, z0.s[1]\n"
+ "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
+ "fmla z14.s, z4.s, z0.s[2]\n"
+ "fmla z17.s, z4.s, z0.s[3]\n"
+ "ld1rqw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+ "fmla z20.s, z4.s, z1.s[0]\n"
+ "fmla z23.s, z4.s, z1.s[1]\n"
+ "sub x20, x20, #0x2\n"
+ "fmla z26.s, z4.s, z1.s[2]\n"
+ "fmla z29.s, z4.s, z1.s[3]\n"
+ "ld1w { z4.s }, p0/Z, [x24, #1, MUL VL]\n"
+ "fmla z9.s, z5.s, z0.s[0]\n"
+ "fmla z12.s, z5.s, z0.s[1]\n"
+ "addvl x24, x24, #2\n"
+ "fmla z15.s, z5.s, z0.s[2]\n"
+ "fmla z18.s, z5.s, z0.s[3]\n"
+ "cmp x20, #0x2\n"
+ "fmla z21.s, z5.s, z1.s[0]\n"
+ "fmla z24.s, z5.s, z1.s[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "fmla z27.s, z5.s, z1.s[2]\n"
+ "fmla z30.s, z5.s, z1.s[3]\n"
+ "ld1w { z5.s }, p0/Z, [x22, #1, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z13.s, z6.s, z0.s[1]\n"
+ "addvl x22, x22, #2\n"
+ "fmla z16.s, z6.s, z0.s[2]\n"
+ "fmla z19.s, z6.s, z0.s[3]\n"
+ "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
+ "fmla z22.s, z6.s, z1.s[0]\n"
+ "fmla z25.s, z6.s, z1.s[1]\n"
+ "fmla z28.s, z6.s, z1.s[2]\n"
+ "fmla z31.s, z6.s, z1.s[3]\n"
+ "ld1w { z2.s }, p0/Z, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #2\n"
+ "fmla z8.s, z4.s, z3.s[0]\n"
+ "fmla z11.s, z4.s, z3.s[1]\n"
+ "fmla z14.s, z4.s, z3.s[2]\n"
+ "fmla z17.s, z4.s, z3.s[3]\n"
+ "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z20.s, z4.s, z7.s[0]\n"
+ "fmla z23.s, z4.s, z7.s[1]\n"
+ "fmla z26.s, z4.s, z7.s[2]\n"
+ "fmla z29.s, z4.s, z7.s[3]\n"
+ "ld1w { z4.s }, p0/Z, [x24]\n"
+ "fmla z9.s, z5.s, z3.s[0]\n"
+ "fmla z12.s, z5.s, z3.s[1]\n"
+ "fmla z15.s, z5.s, z3.s[2]\n"
+ "fmla z18.s, z5.s, z3.s[3]\n"
+ "fmla z21.s, z5.s, z7.s[0]\n"
+ "fmla z24.s, z5.s, z7.s[1]\n"
+ "fmla z27.s, z5.s, z7.s[2]\n"
+ "fmla z30.s, z5.s, z7.s[3]\n"
+ "ld1w { z5.s }, p0/Z, [x22]\n"
+ "fmla z10.s, z2.s, z3.s[0]\n"
+ "fmla z13.s, z2.s, z3.s[1]\n"
+ "fmla z16.s, z2.s, z3.s[2]\n"
+ "fmla z19.s, z2.s, z3.s[3]\n"
+ "fmla z22.s, z2.s, z7.s[0]\n"
+ "fmla z25.s, z2.s, z7.s[1]\n"
+ "fmla z28.s, z2.s, z7.s[2]\n"
+ "fmla z31.s, z2.s, z7.s[3]\n"
+ "ld1w { z6.s }, p0/Z, [x21]\n"
+ "bge 4b\n"
+ "5:" // main loop skip
+ "fmla z8.s, z4.s, z0.s[0]\n"
+ "fmla z11.s, z4.s, z0.s[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z14.s, z4.s, z0.s[2]\n"
+ "fmla z17.s, z4.s, z0.s[3]\n"
+ "addvl x24, x24, #1\n"
+ "fmla z20.s, z4.s, z1.s[0]\n"
+ "fmla z23.s, z4.s, z1.s[1]\n"
+ "addvl x22, x22, #1\n"
+ "fmla z26.s, z4.s, z1.s[2]\n"
+ "fmla z29.s, z4.s, z1.s[3]\n"
+ "addvl x21, x21, #1\n"
+ "fmla z9.s, z5.s, z0.s[0]\n"
+ "fmla z12.s, z5.s, z0.s[1]\n"
+ "fmla z15.s, z5.s, z0.s[2]\n"
+ "fmla z18.s, z5.s, z0.s[3]\n"
+ "fmla z21.s, z5.s, z1.s[0]\n"
+ "fmla z24.s, z5.s, z1.s[1]\n"
+ "fmla z27.s, z5.s, z1.s[2]\n"
+ "fmla z30.s, z5.s, z1.s[3]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z13.s, z6.s, z0.s[1]\n"
+ "fmla z16.s, z6.s, z0.s[2]\n"
+ "fmla z19.s, z6.s, z0.s[3]\n"
+ "fmla z22.s, z6.s, z1.s[0]\n"
+ "fmla z25.s, z6.s, z1.s[1]\n"
+ "fmla z28.s, z6.s, z1.s[2]\n"
+ "fmla z31.s, z6.s, z1.s[3]\n"
+ "cbz x20, 6f\n"
+ "ld1rqw { z4.s }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ld1w { z2.s }, p0/Z, [x24]\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ "fmla z8.s, z2.s, z4.s[0]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
+ "fmla z11.s, z2.s, z4.s[1]\n"
+ "fmla z14.s, z2.s, z4.s[2]\n"
+ "fmla z17.s, z2.s, z4.s[3]\n"
+ "fmla z20.s, z2.s, z3.s[0]\n"
+ "fmla z23.s, z2.s, z3.s[1]\n"
+ "fmla z26.s, z2.s, z3.s[2]\n"
+ "fmla z29.s, z2.s, z3.s[3]\n"
+ "fmla z9.s, z1.s, z4.s[0]\n"
+ "fmla z12.s, z1.s, z4.s[1]\n"
+ "fmla z15.s, z1.s, z4.s[2]\n"
+ "fmla z18.s, z1.s, z4.s[3]\n"
+ "fmla z21.s, z1.s, z3.s[0]\n"
+ "fmla z24.s, z1.s, z3.s[1]\n"
+ "fmla z27.s, z1.s, z3.s[2]\n"
+ "fmla z30.s, z1.s, z3.s[3]\n"
+ "fmla z10.s, z0.s, z4.s[0]\n"
+ "fmla z13.s, z0.s, z4.s[1]\n"
+ "fmla z16.s, z0.s, z4.s[2]\n"
+ "fmla z19.s, z0.s, z4.s[3]\n"
+ "fmla z22.s, z0.s, z3.s[0]\n"
+ "fmla z25.s, z0.s, z3.s[1]\n"
+ "fmla z28.s, z0.s, z3.s[2]\n"
+ "fmla z31.s, z0.s, z3.s[3]\n"
+ "6:" // multiply loop done
+ "decw x26, ALL, MUL #3\n"
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "cmp x26, XZR\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
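
The store block writes the 24 accumulators (z8..z31) in three groups of eight because the st1w immediate offset only spans -8 to 7 vector lengths: eight forward stores, an addvl #16 base bump, eight stores at negative offsets, eight more forward stores, then addvl #8, for a net advance of 24 vectors. The same addressing arithmetic in plain C++ (the helper is a sketch, not library code):

#include <cstddef>

// base mirrors Cpanel; acc holds the 24 accumulator vectors contiguously
// (z8..z31, row-major 8 x 3VL).
void store_24_vectors(float *&base, const float *acc, std::size_t vl) {
    auto store_at = [&](std::ptrdiff_t vec_off, const float *src) {
        float *dst = base + vec_off * static_cast<std::ptrdiff_t>(vl);
        for (std::size_t i = 0; i < vl; ++i) dst[i] = src[i];
    };
    for (int v = 0; v < 8; ++v) store_at(v, acc + v * vl);            // z8..z15
    base += 16 * vl;                                                  // addvl #16
    for (int v = 0; v < 8; ++v) store_at(v - 8, acc + (8 + v) * vl);  // z16..z23
    for (int v = 0; v < 8; ++v) store_at(v, acc + (16 + v) * vl);     // z24..z31
    base += 8 * vl;                                                   // addvl #8
}
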
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp
deleted file mode 100644
index 57fd9c909e..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "../performance_parameters.hpp"
-#include "../std_transforms_sve.hpp"
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_gemv_fp32_mla_8VL(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
-
-class cls_sve_gemv_fp32_mla_8VL
-{
-public:
- typedef float operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
-
- static unsigned int out_width()
- {
- return 8 * get_vector_length<float>();
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 1;
- }
-
- static constexpr bool supports_accumulate()
- {
- return false;
- }
-
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 1, 8, 1> transforms = {};
-
- // Default to the generic kernel
- kern_type kernel=sve_gemv_fp32_mla_8VL;
-
- cls_sve_gemv_fp32_mla_8VL(const CPUInfo *)
- {
- }
-};
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
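
The deleted GEMV descriptor above and its generic.cpp below share a different shape from the interleaved kernels: per-width branches up to 8VL, whilelt predication on the final vector, an optional bias preload, and a min/max clamp taken when bit 1 of the flags word is set (ReLU and BoundedReLU both set it, as the switch at the top of the deleted function shows). The clamp tail as a scalar sketch, assuming those flag semantics:

#include <algorithm>
#include <cstddef>

// Scalar model of the clamp guarded by "tbz %x[flags], #1": bit 1 of flags
// is set for ReLU/BoundedReLU, with minval/maxval taken from KernelArgs.
void apply_activation(float *acc, std::size_t n, unsigned long flags,
                      float minval, float maxval) {
    if (!(flags & 0x2)) return;                    // no activation requested
    for (std::size_t i = 0; i < n; ++i) {
        acc[i] = std::max(minval, std::min(acc[i], maxval)); // fmin then fmax
    }
}
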
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp
deleted file mode 100644
index c62e31936c..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp
+++ /dev/null
@@ -1,1372 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include "arm_gemm.hpp"
-#include "../../utils.hpp"
-
-#include <cassert>
-
-namespace arm_gemm {
-
-void sve_gemv_fp32_mla_8VL (
- const float *A_ptr, const float *B_ptr, float *output_ptr,
- size_t N, size_t K,
- const float *bias, Activation act, bool
-)
-{
- struct KernelArgs {
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- const float *B_ptr = {};
- size_t output_offset = {};
- unsigned int input_initial_col = {};
- } ka;
-
- unsigned long flags=0;
- ka.B_ptr = B_ptr;
- switch(act.type) {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- ka.maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- ka.minval = 0;
- flags |= 0x2;
- break;
- }
- __asm__ __volatile__(
- "ptrue p2.b\n"
- "cntw x24\n"
- "add x23, %x[N], x24\n"
- "sub x23, x23, #0x1\n"
- "udiv x23, x23, x24\n"
- "mov x22, %x[bias]\n"
- "1:" // Column loop
- "cmp x23, #0x8\n"
- "bge 50f\n"
- "cmp x23, #0x6\n"
- "bgt 43f\n"
- "beq 36f\n"
- "cmp x23, #0x4\n"
- "bgt 29f\n"
- "beq 22f\n"
- "cmp x23, #0x2\n"
- "bgt 15f\n"
- "beq 8f\n"
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "whilelt p1.s, XZR, %x[N]\n"
- "cbz x22, 2f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "addvl x22, x22, #1\n"
- "b 3f\n"
- "2:" // Width 1: no bias
- "mov z24.b, #0x0\n"
- "3:" // Width 1: setup done
- "cmp x21, #0x4\n"
- "ble 5f\n"
- "4:" // Width 1: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x20, x20, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x21, x21, #0x4\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z2.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "cmp x21, #0x4\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z3.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z4.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "bgt 4b\n"
- "5:" // Width 1: Multiply loop: Single iteration only
- "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z5.s, z0.s[0]\n"
- "add x20, x20, #0x10\n"
- "subs x21, x21, #0x1\n"
- "ble 6f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z6.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "ble 6f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z7.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "ble 6f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z8.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "6:" // Width 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 7f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "7:" // Width 1: No activation
- "st1w { z24.s }, p1, [%x[output_ptr]]\n"
- "addvl %x[output_ptr], %x[output_ptr], #1\n"
- "b 57f\n"
- "8:" // Width 2
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "sub x19, %x[N], x24\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 9f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "addvl x22, x22, #2\n"
- "b 10f\n"
- "9:" // Width 2: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "10:" // Width 2: setup done
- "cmp x21, #0x4\n"
- "ble 12f\n"
- "11:" // Width 2: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x21, x21, #0x4\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z3.s, z0.s[1]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z25.s, z4.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "cmp x21, #0x4\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z5.s, z0.s[2]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z25.s, z6.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z7.s, z0.s[3]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z25.s, z8.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "bgt 11b\n"
- "12:" // Width 2: Multiply loop: Single iteration only
- "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z9.s, z0.s[0]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z10.s, z0.s[0]\n"
- "subs x21, x21, #0x1\n"
- "ble 13f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z11.s, z0.s[1]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z25.s, z12.s, z0.s[1]\n"
- "ble 13f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z13.s, z0.s[2]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z25.s, z14.s, z0.s[2]\n"
- "ble 13f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z15.s, z0.s[3]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z25.s, z16.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "13:" // Width 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 14f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "14:" // Width 2: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "st1w { z25.s }, p1, [%x[output_ptr], #1, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #2\n"
- "b 57f\n"
- "15:" // Width 3
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "mov x19, #0x2\n"
- "msub x19, x24, x19, %x[N]\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 16f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "addvl x22, x22, #3\n"
- "b 17f\n"
- "16:" // Width 3: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "mov z26.b, #0x0\n"
- "17:" // Width 3: setup done
- "cmp x21, #0x4\n"
- "ble 19f\n"
- "18:" // Width 3: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "sub x21, x21, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z26.s, z3.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "cmp x21, #0x4\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z4.s, z0.s[1]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z5.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z26.s, z6.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z7.s, z0.s[2]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z8.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z26.s, z9.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z10.s, z0.s[3]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z11.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z26.s, z12.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "bgt 18b\n"
- "19:" // Width 3: Multiply loop: Single iteration only
- "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "subs x21, x21, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z13.s, z0.s[0]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z14.s, z0.s[0]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z15.s, z0.s[0]\n"
- "ble 20f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z16.s, z0.s[1]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z17.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z18.s, z0.s[1]\n"
- "ble 20f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z19.s, z0.s[2]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z20.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z21.s, z0.s[2]\n"
- "ble 20f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z22.s, z0.s[3]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z23.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z26.s, z1.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "20:" // Width 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 21f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "21:" // Width 3: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
- "st1w { z26.s }, p1, [%x[output_ptr], #2, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #3\n"
- "b 57f\n"
- "22:" // Width 4
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "mov x19, #0x3\n"
- "msub x19, x24, x19, %x[N]\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 23f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
- "addvl x22, x22, #4\n"
- "b 24f\n"
- "23:" // Width 4: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "mov z26.b, #0x0\n"
- "mov z27.b, #0x0\n"
- "24:" // Width 4: setup done
- "cmp x21, #0x4\n"
- "ble 26f\n"
- "25:" // Width 4: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "sub x21, x21, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z3.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "cmp x21, #0x4\n"
- "fmla z27.s, z4.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z5.s, z0.s[1]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z6.s, z0.s[1]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z7.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z27.s, z8.s, z0.s[1]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z9.s, z0.s[2]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z10.s, z0.s[2]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z11.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z27.s, z12.s, z0.s[2]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z13.s, z0.s[3]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z14.s, z0.s[3]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z15.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z27.s, z16.s, z0.s[3]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "bgt 25b\n"
- "26:" // Width 4: Multiply loop: Single iteration only
- "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "subs x21, x21, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z17.s, z0.s[0]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z18.s, z0.s[0]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z19.s, z0.s[0]\n"
- "fmla z27.s, z20.s, z0.s[0]\n"
- "ble 27f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z22.s, z0.s[1]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z23.s, z0.s[1]\n"
- "fmla z27.s, z1.s, z0.s[1]\n"
- "ble 27f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z2.s, z0.s[2]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z3.s, z0.s[2]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z4.s, z0.s[2]\n"
- "fmla z27.s, z5.s, z0.s[2]\n"
- "ble 27f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z7.s, z0.s[3]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z8.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z27.s, z9.s, z0.s[3]\n"
- "27:" // Width 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 28f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "28:" // Width 4: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
- "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
- "st1w { z27.s }, p1, [%x[output_ptr], #3, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #4\n"
- "b 57f\n"
- "29:" // Width 5
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "mov x19, #0x4\n"
- "msub x19, x24, x19, %x[N]\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 30f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
- "addvl x22, x22, #5\n"
- "b 31f\n"
- "30:" // Width 5: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "mov z26.b, #0x0\n"
- "mov z27.b, #0x0\n"
- "mov z28.b, #0x0\n"
- "31:" // Width 5: setup done
- "cmp x21, #0x4\n"
- "ble 33f\n"
- "32:" // Width 5: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "sub x21, x21, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "cmp x21, #0x4\n"
- "fmla z26.s, z3.s, z0.s[0]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z4.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z28.s, z5.s, z0.s[0]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z6.s, z0.s[1]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z7.s, z0.s[1]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z8.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z27.s, z9.s, z0.s[1]\n"
- "fmla z28.s, z10.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z11.s, z0.s[2]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z12.s, z0.s[2]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z13.s, z0.s[2]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z14.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z28.s, z15.s, z0.s[2]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z16.s, z0.s[3]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z17.s, z0.s[3]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z18.s, z0.s[3]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z19.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z28.s, z20.s, z0.s[3]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "bgt 32b\n"
- "33:" // Width 5: Multiply loop: Single iteration only
- "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "subs x21, x21, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z21.s, z0.s[0]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z22.s, z0.s[0]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z23.s, z0.s[0]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z1.s, z0.s[0]\n"
- "fmla z28.s, z2.s, z0.s[0]\n"
- "ble 34f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z3.s, z0.s[1]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z4.s, z0.s[1]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z5.s, z0.s[1]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z6.s, z0.s[1]\n"
- "fmla z28.s, z7.s, z0.s[1]\n"
- "ble 34f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z8.s, z0.s[2]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z9.s, z0.s[2]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z10.s, z0.s[2]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z11.s, z0.s[2]\n"
- "fmla z28.s, z12.s, z0.s[2]\n"
- "ble 34f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z13.s, z0.s[3]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z14.s, z0.s[3]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z15.s, z0.s[3]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z16.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z28.s, z17.s, z0.s[3]\n"
- "34:" // Width 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 35f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmin z28.s, p2/M, z28.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "fmax z28.s, p2/M, z28.s, z17.s\n"
- "35:" // Width 5: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
- "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
- "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
- "st1w { z28.s }, p1, [%x[output_ptr], #4, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #5\n"
- "b 57f\n"
- "36:" // Width 6
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "mov x19, #0x5\n"
- "msub x19, x24, x19, %x[N]\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 37f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
- "addvl x22, x22, #6\n"
- "b 38f\n"
- "37:" // Width 6: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "mov z26.b, #0x0\n"
- "mov z27.b, #0x0\n"
- "mov z28.b, #0x0\n"
- "mov z29.b, #0x0\n"
- "38:" // Width 6: setup done
- "cmp x21, #0x4\n"
- "ble 40f\n"
- "39:" // Width 6: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "sub x21, x21, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "cmp x21, #0x4\n"
- "fmla z26.s, z3.s, z0.s[0]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z4.s, z0.s[0]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z5.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z29.s, z6.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z7.s, z0.s[1]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z8.s, z0.s[1]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z9.s, z0.s[1]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z10.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z28.s, z11.s, z0.s[1]\n"
- "fmla z29.s, z12.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z13.s, z0.s[2]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z14.s, z0.s[2]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z15.s, z0.s[2]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z16.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z28.s, z17.s, z0.s[2]\n"
- "fmla z29.s, z18.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z20.s, z0.s[3]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z21.s, z0.s[3]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z22.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z28.s, z23.s, z0.s[3]\n"
- "fmla z29.s, z1.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "bgt 39b\n"
- "40:" // Width 6: Multiply loop: Single iteration only
- "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "subs x21, x21, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z2.s, z0.s[0]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z3.s, z0.s[0]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z4.s, z0.s[0]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z5.s, z0.s[0]\n"
- "fmla z28.s, z6.s, z0.s[0]\n"
- "fmla z29.s, z7.s, z0.s[0]\n"
- "ble 41f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z8.s, z0.s[1]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z9.s, z0.s[1]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z10.s, z0.s[1]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z11.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z12.s, z0.s[1]\n"
- "fmla z29.s, z13.s, z0.s[1]\n"
- "ble 41f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z14.s, z0.s[2]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z15.s, z0.s[2]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z16.s, z0.s[2]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z17.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z18.s, z0.s[2]\n"
- "fmla z29.s, z19.s, z0.s[2]\n"
- "ble 41f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z20.s, z0.s[3]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z21.s, z0.s[3]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z22.s, z0.s[3]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z23.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z28.s, z1.s, z0.s[3]\n"
- "fmla z29.s, z2.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "41:" // Width 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 42f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmin z28.s, p2/M, z28.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "fmax z28.s, p2/M, z28.s, z17.s\n"
- "fmin z29.s, p2/M, z29.s, z16.s\n"
- "fmax z29.s, p2/M, z29.s, z17.s\n"
- "42:" // Width 6: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
- "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
- "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
- "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
- "st1w { z29.s }, p1, [%x[output_ptr], #5, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #6\n"
- "b 57f\n"
- "43:" // Width 7
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "mov x19, #0x6\n"
- "msub x19, x24, x19, %x[N]\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 44f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
- "addvl x22, x22, #7\n"
- "b 45f\n"
- "44:" // Width 7: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "mov z26.b, #0x0\n"
- "mov z27.b, #0x0\n"
- "mov z28.b, #0x0\n"
- "mov z29.b, #0x0\n"
- "mov z30.b, #0x0\n"
- "45:" // Width 7: setup done
- "cmp x21, #0x4\n"
- "ble 47f\n"
- "46:" // Width 7: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "sub x21, x21, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "cmp x21, #0x4\n"
- "fmla z26.s, z3.s, z0.s[0]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z4.s, z0.s[0]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z28.s, z5.s, z0.s[0]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z29.s, z6.s, z0.s[0]\n"
- "fmla z30.s, z7.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z8.s, z0.s[1]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z9.s, z0.s[1]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z10.s, z0.s[1]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z11.s, z0.s[1]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z12.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z29.s, z13.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z30.s, z14.s, z0.s[1]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z15.s, z0.s[2]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z25.s, z16.s, z0.s[2]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z17.s, z0.s[2]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z18.s, z0.s[2]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z19.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z29.s, z20.s, z0.s[2]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z30.s, z21.s, z0.s[2]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[3]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z25.s, z23.s, z0.s[3]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z1.s, z0.s[3]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z2.s, z0.s[3]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z3.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z29.s, z4.s, z0.s[3]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "fmla z30.s, z5.s, z0.s[3]\n"
- "bgt 46b\n"
- "47:" // Width 7: Multiply loop: Single iteration only
- "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "subs x21, x21, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z6.s, z0.s[0]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z7.s, z0.s[0]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z8.s, z0.s[0]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z27.s, z9.s, z0.s[0]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z10.s, z0.s[0]\n"
- "fmla z29.s, z11.s, z0.s[0]\n"
- "fmla z30.s, z12.s, z0.s[0]\n"
- "ble 48f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z13.s, z0.s[1]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z14.s, z0.s[1]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z15.s, z0.s[1]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z16.s, z0.s[1]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z17.s, z0.s[1]\n"
- "fmla z29.s, z18.s, z0.s[1]\n"
- "fmla z30.s, z19.s, z0.s[1]\n"
- "ble 48f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z20.s, z0.s[2]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z21.s, z0.s[2]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z22.s, z0.s[2]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z23.s, z0.s[2]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z1.s, z0.s[2]\n"
- "fmla z29.s, z2.s, z0.s[2]\n"
- "fmla z30.s, z3.s, z0.s[2]\n"
- "ble 48f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z4.s, z0.s[3]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z5.s, z0.s[3]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z6.s, z0.s[3]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z7.s, z0.s[3]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z8.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z29.s, z9.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z30.s, z10.s, z0.s[3]\n"
- "48:" // Width 7: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 49f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmin z28.s, p2/M, z28.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "fmax z28.s, p2/M, z28.s, z17.s\n"
- "fmin z29.s, p2/M, z29.s, z16.s\n"
- "fmin z30.s, p2/M, z30.s, z16.s\n"
- "fmax z29.s, p2/M, z29.s, z17.s\n"
- "fmax z30.s, p2/M, z30.s, z17.s\n"
- "49:" // Width 7: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
- "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
- "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
- "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
- "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
- "st1w { z30.s }, p1, [%x[output_ptr], #6, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #7\n"
- "b 57f\n"
- "50:" // Width 8
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "mov x19, #0x7\n"
- "msub x19, x24, x19, %x[N]\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 51f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
- "ld1w { z31.s }, p2/Z, [x22, #7, MUL VL]\n"
- "addvl x22, x22, #8\n"
- "b 52f\n"
- "51:" // Width 8: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "mov z26.b, #0x0\n"
- "mov z27.b, #0x0\n"
- "mov z28.b, #0x0\n"
- "mov z29.b, #0x0\n"
- "mov z30.b, #0x0\n"
- "mov z31.b, #0x0\n"
- "52:" // Width 8: setup done
- "cmp x21, #0x4\n"
- "ble 54f\n"
- "53:" // Width 8: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "sub x21, x21, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "cmp x21, #0x4\n"
- "fmla z26.s, z3.s, z0.s[0]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z4.s, z0.s[0]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z28.s, z5.s, z0.s[0]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z6.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z30.s, z7.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z31.s, z8.s, z0.s[0]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z25.s, z10.s, z0.s[1]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z11.s, z0.s[1]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z12.s, z0.s[1]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z28.s, z13.s, z0.s[1]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z14.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z30.s, z15.s, z0.s[1]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z31.s, z16.s, z0.s[1]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z17.s, z0.s[2]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z25.s, z18.s, z0.s[2]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z19.s, z0.s[2]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z20.s, z0.s[2]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z28.s, z21.s, z0.s[2]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z22.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z30.s, z23.s, z0.s[2]\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z31.s, z1.s, z0.s[2]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z2.s, z0.s[3]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z25.s, z3.s, z0.s[3]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z4.s, z0.s[3]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z5.s, z0.s[3]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z28.s, z6.s, z0.s[3]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z7.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z30.s, z8.s, z0.s[3]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "fmla z31.s, z9.s, z0.s[3]\n"
- "bgt 53b\n"
- "54:" // Width 8: Multiply loop: Single iteration only
- "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "subs x21, x21, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z10.s, z0.s[0]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z11.s, z0.s[0]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z12.s, z0.s[0]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z27.s, z13.s, z0.s[0]\n"
- "fmla z28.s, z14.s, z0.s[0]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z15.s, z0.s[0]\n"
- "fmla z30.s, z16.s, z0.s[0]\n"
- "fmla z31.s, z17.s, z0.s[0]\n"
- "ble 55f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z18.s, z0.s[1]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z19.s, z0.s[1]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z20.s, z0.s[1]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z21.s, z0.s[1]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "fmla z28.s, z22.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z23.s, z0.s[1]\n"
- "fmla z30.s, z1.s, z0.s[1]\n"
- "fmla z31.s, z2.s, z0.s[1]\n"
- "ble 55f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z3.s, z0.s[2]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z4.s, z0.s[2]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z5.s, z0.s[2]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z6.s, z0.s[2]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "fmla z28.s, z7.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z8.s, z0.s[2]\n"
- "fmla z30.s, z9.s, z0.s[2]\n"
- "fmla z31.s, z10.s, z0.s[2]\n"
- "ble 55f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z12.s, z0.s[3]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z13.s, z0.s[3]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z14.s, z0.s[3]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "fmla z28.s, z15.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z16.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z30.s, z17.s, z0.s[3]\n"
- "fmla z31.s, z18.s, z0.s[3]\n"
- "55:" // Width 8: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 56f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmin z28.s, p2/M, z28.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "fmax z28.s, p2/M, z28.s, z17.s\n"
- "fmin z29.s, p2/M, z29.s, z16.s\n"
- "fmin z30.s, p2/M, z30.s, z16.s\n"
- "fmin z31.s, p2/M, z31.s, z16.s\n"
- "fmax z29.s, p2/M, z29.s, z17.s\n"
- "fmax z30.s, p2/M, z30.s, z17.s\n"
- "fmax z31.s, p2/M, z31.s, z17.s\n"
- "56:" // Width 8: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "subs x23, x23, #0x8\n"
- "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
- "sub %x[N], %x[N], x24, LSL #3\n"
- "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
- "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
- "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
- "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
- "st1w { z30.s }, p2, [%x[output_ptr], #6, MUL VL]\n"
- "st1w { z31.s }, p1, [%x[output_ptr], #7, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #8\n"
- "bgt 1b\n"
- "57:" // Exit
-
- : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
- : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
- : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
- );
-}
-
-} // namespace arm_gemm
-
-#endif
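The hunk above deletes the remaining width-3 through width-8 blocks of the hand-written SVE FP32 GEMV inline assembly. Each block follows the same shape: load a row of accumulators z24..z31 from the bias pointer (x22 here) or zero them, run a 4-way unrolled fmla loop over K with a predicated tail, optionally clamp through fmin/fmax when bit 1 of the flags operand is set, and store with the full predicate p2 on all but the last vector and the partial predicate p1 on the last. A minimal scalar sketch of what every one of those blocks computes (gemv_ref and its parameter names are illustrative, not library API):

// Hedged reference sketch, not part of the patch: the deleted assembly
// specialises this recurrence per output width for speed.
#include <algorithm>
#include <cstddef>

void gemv_ref(const float *A, const float *B, const float *bias,
              float *out, std::size_t K, std::size_t N,
              bool clamp, float minval, float maxval)
{
    for (std::size_t n = 0; n < N; ++n)
    {
        float acc = bias ? bias[n] : 0.0f; // "no bias" path zeroes z24..z31
        for (std::size_t k = 0; k < K; ++k)
        {
            acc += A[k] * B[k * N + n]; // fmla z2x.s, zB.s, z0.s[k % 4]
        }
        if (clamp) // guarded by "tbz %x[flags], #1" in the assembly
        {
            acc = std::max(minval, std::min(acc, maxval)); // fmin/fmax pair
        }
        out[n] = acc; // st1w, with p1 covering only the tail lanes
    }
}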
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
index 066bff4602..49ccce342e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,22 +10,23 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -43,7 +44,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL( ARGLIST );
class cls_sve_hybrid_bf16fp32_dot_6x4VL
{
public:
- typedef bfloat16 operand_type;
+ typedef bfloat16 lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -69,7 +71,23 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 6, 4, 2> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 2> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 15.83 };
+ case CPUModel::A510:
+ return { 6.80 };
+ case CPUModel::V1:
+ return { 31.55 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_bf16fp32_dot_6x4VL;
@@ -81,4 +99,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
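The header change above swaps the kernel guard from the compiler-defined __ARM_FEATURE_SVE to the library-level ARM_COMPUTE_ENABLE_SVE define, splits operand_type into lhs/rhs variants, and adds a get_performance_parameters() hook returning a per-CPU throughput estimate (MACs per cycle) that a kernel selector can compare across candidate implementations. A self-contained sketch of that comparison, with the single-field PerformanceParameters layout and pick() assumed for illustration (the real struct lives in performance_parameters.hpp):

#include <cstdio>

// Minimal stand-in so the sketch compiles on its own; treat the layout
// as an assumption inferred from the brace-initialised returns above.
struct PerformanceParameters { double kernel_macs_cycle; };

// Pick the candidate with the higher estimated MACs-per-cycle figure.
static const char *pick(const char *name_a, PerformanceParameters a,
                        const char *name_b, PerformanceParameters b)
{
    return (a.kernel_macs_cycle >= b.kernel_macs_cycle) ? name_a : name_b;
}

int main()
{
    // Figures copied from the switch above: default core vs. Neoverse V1.
    PerformanceParameters generic{15.83}, v1{31.55};
    std::printf("%s\n", pick("generic", generic, "V1-tuned", v1)); // "V1-tuned"
    return 0;
}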
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
index 1233a98531..176f6e0d3a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,18 +10,18 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -103,32 +103,32 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"cmp %x[M], #0x2\n"
"bgt 27f\n"
"beq 14f\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x9, %x[bias]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x11\n"
- "incw x19\n"
- "whilelt p3.s, x19, x11\n"
- "incw x19\n"
- "whilelt p2.s, x19, x11\n"
- "incw x19\n"
- "whilelt p1.s, x19, x11\n"
- "cbz x9, 3f\n"
- "ld1w { z8.s }, p5/Z, [x9]\n"
- "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 3f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
"b 5f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 4f\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
"b 5f\n"
"4:" // Height 1: no accumulate
"mov z8.b, #0x0\n"
@@ -136,178 +136,175 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z10.b, #0x0\n"
"mov z11.b, #0x0\n"
"5:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 8f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 8f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
"b 8f\n"
"7:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x26, %x[input_ptr]\n"
"8:" // Height 1: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "cmp x26, #0x8\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ ".inst 0x64604208 // bfdot z8.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64604209 // bfdot z9.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460420a // bfdot z10.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6460420b // bfdot z11.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x64684208 // bfdot z8.s, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x64684209 // bfdot z9.s, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6468420a // bfdot z10.s, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x6468420b // bfdot z11.s, z16.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x64704228 // bfdot z8.s, z17.h, z0.h[2]\n"
+ ".inst 0x64704209 // bfdot z9.s, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6470422a // bfdot z10.s, z17.h, z0.h[2]\n"
+ ".inst 0x6470420b // bfdot z11.s, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x64784228 // bfdot z8.s, z17.h, z0.h[3]\n"
+ ".inst 0x64784209 // bfdot z9.s, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6478422a // bfdot z10.s, z17.h, z0.h[3]\n"
+ ".inst 0x6478420b // bfdot z11.s, z16.h, z0.h[3]\n"
+ "add x26, x26, #0x10\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ ".inst 0x64604208 // bfdot z8.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64604209 // bfdot z9.s, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6460422a // bfdot z10.s, z17.h, z0.h[0]\n"
+ ".inst 0x6460420b // bfdot z11.s, z16.h, z0.h[0]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64684228 // bfdot z8.s, z17.h, z0.h[1]\n"
+ ".inst 0x64684209 // bfdot z9.s, z16.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
+ ".inst 0x6468422a // bfdot z10.s, z17.h, z0.h[1]\n"
+ ".inst 0x6468420b // bfdot z11.s, z16.h, z0.h[1]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64704228 // bfdot z8.s, z17.h, z0.h[2]\n"
+ ".inst 0x64704209 // bfdot z9.s, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
+ ".inst 0x6470422a // bfdot z10.s, z17.h, z0.h[2]\n"
+ ".inst 0x6470420b // bfdot z11.s, z16.h, z0.h[2]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64784228 // bfdot z8.s, z17.h, z0.h[3]\n"
+ ".inst 0x64784209 // bfdot z9.s, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6478422a // bfdot z10.s, z17.h, z0.h[3]\n"
+ ".inst 0x6478420b // bfdot z11.s, z16.h, z0.h[3]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
"11:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z0.s }, p5/Z, [x19]\n"
- "fmin z8.s, p5/M, z8.s, z0.s\n"
- "fmin z9.s, p5/M, z9.s, z0.s\n"
- "fmin z10.s, p5/M, z10.s, z0.s\n"
- "fmin z11.s, p5/M, z11.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z1.s\n"
- "fmax z9.s, p5/M, z9.s, z1.s\n"
- "fmax z10.s, p5/M, z10.s, z1.s\n"
- "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
"12:" // Height 1: No activation
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"13:" // Height 1: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 2b\n"
"b 80f\n"
"14:" // Height 2
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"15:" // Height 2: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x11\n"
- "incw x19\n"
- "whilelt p3.s, x19, x11\n"
- "incw x19\n"
- "whilelt p2.s, x19, x11\n"
- "incw x19\n"
- "whilelt p1.s, x19, x11\n"
- "cbz x9, 16f\n"
- "ld1w { z8.s }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 16f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
"mov z13.d, z9.d\n"
- "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "addvl x12, x12, #4\n"
"b 18f\n"
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 18f\n"
"17:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -319,242 +316,236 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z14.b, #0x0\n"
"mov z15.b, #0x0\n"
"18:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x27, 21f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 21f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
"b 21f\n"
"20:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
"21:" // Height 2: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x8\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
"ld1rqh { z0.h }, p0/Z, [x25]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64614228 // bfdot z8.s, z17.h, z1.h[0]\n"
+ ".inst 0x6460422c // bfdot z12.s, z17.h, z0.h[0]\n"
+ ".inst 0x64614209 // bfdot z9.s, z16.h, z1.h[0]\n"
+ ".inst 0x6460420d // bfdot z13.s, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461422a // bfdot z10.s, z17.h, z1.h[0]\n"
+ ".inst 0x6460422e // bfdot z14.s, z17.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6461420b // bfdot z11.s, z16.h, z1.h[0]\n"
+ ".inst 0x6460420f // bfdot z15.s, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x64694228 // bfdot z8.s, z17.h, z1.h[1]\n"
+ ".inst 0x6468422c // bfdot z12.s, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x8\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x64694209 // bfdot z9.s, z16.h, z1.h[1]\n"
+ ".inst 0x6468420d // bfdot z13.s, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x6469422a // bfdot z10.s, z17.h, z1.h[1]\n"
+ ".inst 0x6468422e // bfdot z14.s, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6469420b // bfdot z11.s, z16.h, z1.h[1]\n"
+ ".inst 0x6468420f // bfdot z15.s, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x64714228 // bfdot z8.s, z17.h, z1.h[2]\n"
+ ".inst 0x6470422c // bfdot z12.s, z17.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x64714209 // bfdot z9.s, z16.h, z1.h[2]\n"
+ ".inst 0x6470420d // bfdot z13.s, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6471422a // bfdot z10.s, z17.h, z1.h[2]\n"
+ ".inst 0x6470422e // bfdot z14.s, z17.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6471420b // bfdot z11.s, z16.h, z1.h[2]\n"
+ ".inst 0x6470420f // bfdot z15.s, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x64794228 // bfdot z8.s, z17.h, z1.h[3]\n"
+ ".inst 0x6478422c // bfdot z12.s, z17.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x64794209 // bfdot z9.s, z16.h, z1.h[3]\n"
+ ".inst 0x6478420d // bfdot z13.s, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6479422a // bfdot z10.s, z17.h, z1.h[3]\n"
+ ".inst 0x6478422e // bfdot z14.s, z17.h, z0.h[3]\n"
+ ".inst 0x6479420b // bfdot z11.s, z16.h, z1.h[3]\n"
+ ".inst 0x6478420f // bfdot z15.s, z16.h, z0.h[3]\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x2\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64604228 // bfdot z8.s, z17.h, z0.h[0]\n"
+ ".inst 0x6461422c // bfdot z12.s, z17.h, z1.h[0]\n"
+ ".inst 0x64604209 // bfdot z9.s, z16.h, z0.h[0]\n"
+ ".inst 0x6461420d // bfdot z13.s, z16.h, z1.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6460422a // bfdot z10.s, z17.h, z0.h[0]\n"
+ ".inst 0x6461422e // bfdot z14.s, z17.h, z1.h[0]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x6460420b // bfdot z11.s, z16.h, z0.h[0]\n"
+ ".inst 0x6461420f // bfdot z15.s, z16.h, z1.h[0]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64684228 // bfdot z8.s, z17.h, z0.h[1]\n"
+ ".inst 0x6469422c // bfdot z12.s, z17.h, z1.h[1]\n"
+ ".inst 0x64684209 // bfdot z9.s, z16.h, z0.h[1]\n"
+ ".inst 0x6469420d // bfdot z13.s, z16.h, z1.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
+ ".inst 0x6468422a // bfdot z10.s, z17.h, z0.h[1]\n"
+ ".inst 0x6469422e // bfdot z14.s, z17.h, z1.h[1]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x6468420b // bfdot z11.s, z16.h, z0.h[1]\n"
+ ".inst 0x6469420f // bfdot z15.s, z16.h, z1.h[1]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64704228 // bfdot z8.s, z17.h, z0.h[2]\n"
+ ".inst 0x6471422c // bfdot z12.s, z17.h, z1.h[2]\n"
+ ".inst 0x64704209 // bfdot z9.s, z16.h, z0.h[2]\n"
+ ".inst 0x6471420d // bfdot z13.s, z16.h, z1.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
+ ".inst 0x6470422a // bfdot z10.s, z17.h, z0.h[2]\n"
+ ".inst 0x6471422e // bfdot z14.s, z17.h, z1.h[2]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x6470420b // bfdot z11.s, z16.h, z0.h[2]\n"
+ ".inst 0x6471420f // bfdot z15.s, z16.h, z1.h[2]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64784228 // bfdot z8.s, z17.h, z0.h[3]\n"
+ ".inst 0x6479422c // bfdot z12.s, z17.h, z1.h[3]\n"
+ ".inst 0x64784209 // bfdot z9.s, z16.h, z0.h[3]\n"
+ ".inst 0x6479420d // bfdot z13.s, z16.h, z1.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6478422a // bfdot z10.s, z17.h, z0.h[3]\n"
+ ".inst 0x6479422e // bfdot z14.s, z17.h, z1.h[3]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x6478420b // bfdot z11.s, z16.h, z0.h[3]\n"
+ ".inst 0x6479420f // bfdot z15.s, z16.h, z1.h[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 19b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"tbz %x[flags], #1, 25f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z0.s }, p5/Z, [x19]\n"
- "fmin z8.s, p5/M, z8.s, z0.s\n"
- "fmin z9.s, p5/M, z9.s, z0.s\n"
- "fmin z10.s, p5/M, z10.s, z0.s\n"
- "fmin z11.s, p5/M, z11.s, z0.s\n"
- "fmin z12.s, p5/M, z12.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z1.s\n"
- "fmax z9.s, p5/M, z9.s, z1.s\n"
- "fmax z10.s, p5/M, z10.s, z1.s\n"
- "fmax z11.s, p5/M, z11.s, z1.s\n"
- "fmax z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z0.s\n"
- "fmin z14.s, p5/M, z14.s, z0.s\n"
- "fmin z15.s, p5/M, z15.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z1.s\n"
- "fmax z14.s, p5/M, z14.s, z1.s\n"
- "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmin z12.s, p5/M, z12.s, z17.s\n"
+ "fmin z13.s, p5/M, z13.s, z17.s\n"
+ "fmin z14.s, p5/M, z14.s, z17.s\n"
+ "fmin z15.s, p5/M, z15.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
+ "fmax z12.s, p5/M, z12.s, z16.s\n"
+ "fmax z13.s, p5/M, z13.s, z16.s\n"
+ "fmax z14.s, p5/M, z14.s, z16.s\n"
+ "fmax z15.s, p5/M, z15.s, z16.s\n"
"25:" // Height 2: No activation
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
"26:" // Height 2: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 15b\n"
"b 80f\n"
"27:" // Height 3
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"28:" // Height 3: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x11\n"
- "incw x19\n"
- "whilelt p3.s, x19, x11\n"
- "incw x19\n"
- "whilelt p2.s, x19, x11\n"
- "incw x19\n"
- "whilelt p1.s, x19, x11\n"
- "cbz x9, 29f\n"
- "ld1w { z8.s }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 29f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
- "mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
"mov z13.d, z9.d\n"
- "addvl x9, x9, #4\n"
- "mov z17.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
"b 31f\n"
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x24, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23]\n"
- "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20]\n"
+ "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 31f\n"
"30:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -570,305 +561,296 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z18.b, #0x0\n"
"mov z19.b, #0x0\n"
"31:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "cbnz x27, 34f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 34f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
"b 34f\n"
"33:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"34:" // Height 3: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x24]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ ".inst 0x646242a8 // bfdot z8.s, z21.h, z2.h[0]\n"
+ ".inst 0x646142ac // bfdot z12.s, z21.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646042b0 // bfdot z16.s, z21.h, z0.h[0]\n"
+ ".inst 0x64624289 // bfdot z9.s, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6461428d // bfdot z13.s, z20.h, z1.h[0]\n"
+ ".inst 0x64604291 // bfdot z17.s, z20.h, z0.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x646242aa // bfdot z10.s, z21.h, z2.h[0]\n"
+ ".inst 0x646142ae // bfdot z14.s, z21.h, z1.h[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ ".inst 0x646042b2 // bfdot z18.s, z21.h, z0.h[0]\n"
+ ".inst 0x6462428b // bfdot z11.s, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x8\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6461428f // bfdot z15.s, z20.h, z1.h[0]\n"
+ ".inst 0x64604293 // bfdot z19.s, z20.h, z0.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x646a42a8 // bfdot z8.s, z21.h, z2.h[1]\n"
+ ".inst 0x646942ac // bfdot z12.s, z21.h, z1.h[1]\n"
+ ".inst 0x646842b0 // bfdot z16.s, z21.h, z0.h[1]\n"
+ ".inst 0x646a4289 // bfdot z9.s, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6469428d // bfdot z13.s, z20.h, z1.h[1]\n"
+ ".inst 0x64684291 // bfdot z17.s, z20.h, z0.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x646a42aa // bfdot z10.s, z21.h, z2.h[1]\n"
+ ".inst 0x646942ae // bfdot z14.s, z21.h, z1.h[1]\n"
+ ".inst 0x646842b2 // bfdot z18.s, z21.h, z0.h[1]\n"
+ ".inst 0x646a428b // bfdot z11.s, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6469428f // bfdot z15.s, z20.h, z1.h[1]\n"
+ ".inst 0x64684293 // bfdot z19.s, z20.h, z0.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x647242a8 // bfdot z8.s, z21.h, z2.h[2]\n"
+ ".inst 0x647142ac // bfdot z12.s, z21.h, z1.h[2]\n"
+ ".inst 0x647042b0 // bfdot z16.s, z21.h, z0.h[2]\n"
+ ".inst 0x64724289 // bfdot z9.s, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6471428d // bfdot z13.s, z20.h, z1.h[2]\n"
+ ".inst 0x64704291 // bfdot z17.s, z20.h, z0.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x647242aa // bfdot z10.s, z21.h, z2.h[2]\n"
+ ".inst 0x647142ae // bfdot z14.s, z21.h, z1.h[2]\n"
+ ".inst 0x647042b2 // bfdot z18.s, z21.h, z0.h[2]\n"
+ ".inst 0x6472428b // bfdot z11.s, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6471428f // bfdot z15.s, z20.h, z1.h[2]\n"
+ ".inst 0x64704293 // bfdot z19.s, z20.h, z0.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x647a42a8 // bfdot z8.s, z21.h, z2.h[3]\n"
+ ".inst 0x647942ac // bfdot z12.s, z21.h, z1.h[3]\n"
+ ".inst 0x647842b0 // bfdot z16.s, z21.h, z0.h[3]\n"
+ ".inst 0x647a4289 // bfdot z9.s, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6479428d // bfdot z13.s, z20.h, z1.h[3]\n"
+ ".inst 0x64784291 // bfdot z17.s, z20.h, z0.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x647a42aa // bfdot z10.s, z21.h, z2.h[3]\n"
+ ".inst 0x647942ae // bfdot z14.s, z21.h, z1.h[3]\n"
+ ".inst 0x647842b2 // bfdot z18.s, z21.h, z0.h[3]\n"
+ ".inst 0x647a428b // bfdot z11.s, z20.h, z2.h[3]\n"
+ ".inst 0x6479428f // bfdot z15.s, z20.h, z1.h[3]\n"
+ ".inst 0x64784293 // bfdot z19.s, z20.h, z0.h[3]\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x2\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ ".inst 0x646042a8 // bfdot z8.s, z21.h, z0.h[0]\n"
+ ".inst 0x646142ac // bfdot z12.s, z21.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646242b0 // bfdot z16.s, z21.h, z2.h[0]\n"
+ ".inst 0x64604289 // bfdot z9.s, z20.h, z0.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6461428d // bfdot z13.s, z20.h, z1.h[0]\n"
+ ".inst 0x64624291 // bfdot z17.s, z20.h, z2.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646042aa // bfdot z10.s, z21.h, z0.h[0]\n"
+ ".inst 0x646142ae // bfdot z14.s, z21.h, z1.h[0]\n"
+ ".inst 0x646242b2 // bfdot z18.s, z21.h, z2.h[0]\n"
+ ".inst 0x6460428b // bfdot z11.s, z20.h, z0.h[0]\n"
+ ".inst 0x6461428f // bfdot z15.s, z20.h, z1.h[0]\n"
+ ".inst 0x64624293 // bfdot z19.s, z20.h, z2.h[0]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646842a8 // bfdot z8.s, z21.h, z0.h[1]\n"
+ ".inst 0x646942ac // bfdot z12.s, z21.h, z1.h[1]\n"
+ ".inst 0x646a42b0 // bfdot z16.s, z21.h, z2.h[1]\n"
+ ".inst 0x64684289 // bfdot z9.s, z20.h, z0.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
+ ".inst 0x6469428d // bfdot z13.s, z20.h, z1.h[1]\n"
+ ".inst 0x646a4291 // bfdot z17.s, z20.h, z2.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646842aa // bfdot z10.s, z21.h, z0.h[1]\n"
+ ".inst 0x646942ae // bfdot z14.s, z21.h, z1.h[1]\n"
+ ".inst 0x646a42b2 // bfdot z18.s, z21.h, z2.h[1]\n"
+ ".inst 0x6468428b // bfdot z11.s, z20.h, z0.h[1]\n"
+ ".inst 0x6469428f // bfdot z15.s, z20.h, z1.h[1]\n"
+ ".inst 0x646a4293 // bfdot z19.s, z20.h, z2.h[1]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x647042a8 // bfdot z8.s, z21.h, z0.h[2]\n"
+ ".inst 0x647142ac // bfdot z12.s, z21.h, z1.h[2]\n"
+ ".inst 0x647242b0 // bfdot z16.s, z21.h, z2.h[2]\n"
+ ".inst 0x64704289 // bfdot z9.s, z20.h, z0.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
+ ".inst 0x6471428d // bfdot z13.s, z20.h, z1.h[2]\n"
+ ".inst 0x64724291 // bfdot z17.s, z20.h, z2.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647042aa // bfdot z10.s, z21.h, z0.h[2]\n"
+ ".inst 0x647142ae // bfdot z14.s, z21.h, z1.h[2]\n"
+ ".inst 0x647242b2 // bfdot z18.s, z21.h, z2.h[2]\n"
+ ".inst 0x6470428b // bfdot z11.s, z20.h, z0.h[2]\n"
+ ".inst 0x6471428f // bfdot z15.s, z20.h, z1.h[2]\n"
+ ".inst 0x64724293 // bfdot z19.s, z20.h, z2.h[2]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x647842a8 // bfdot z8.s, z21.h, z0.h[3]\n"
+ ".inst 0x647942ac // bfdot z12.s, z21.h, z1.h[3]\n"
+ ".inst 0x647a42b0 // bfdot z16.s, z21.h, z2.h[3]\n"
+ ".inst 0x64784289 // bfdot z9.s, z20.h, z0.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6479428d // bfdot z13.s, z20.h, z1.h[3]\n"
+ ".inst 0x647a4291 // bfdot z17.s, z20.h, z2.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647842aa // bfdot z10.s, z21.h, z0.h[3]\n"
+ ".inst 0x647942ae // bfdot z14.s, z21.h, z1.h[3]\n"
+ ".inst 0x647a42b2 // bfdot z18.s, z21.h, z2.h[3]\n"
+ ".inst 0x6478428b // bfdot z11.s, z20.h, z0.h[3]\n"
+ ".inst 0x6479428f // bfdot z15.s, z20.h, z1.h[3]\n"
+ ".inst 0x647a4293 // bfdot z19.s, z20.h, z2.h[3]\n"
"37:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 32b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 38f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z0.s }, p5/Z, [x19]\n"
- "fmin z8.s, p5/M, z8.s, z0.s\n"
- "fmin z9.s, p5/M, z9.s, z0.s\n"
- "fmin z10.s, p5/M, z10.s, z0.s\n"
- "fmin z11.s, p5/M, z11.s, z0.s\n"
- "fmin z12.s, p5/M, z12.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z1.s\n"
- "fmax z9.s, p5/M, z9.s, z1.s\n"
- "fmax z10.s, p5/M, z10.s, z1.s\n"
- "fmax z11.s, p5/M, z11.s, z1.s\n"
- "fmax z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z0.s\n"
- "fmin z14.s, p5/M, z14.s, z0.s\n"
- "fmin z15.s, p5/M, z15.s, z0.s\n"
- "fmin z16.s, p5/M, z16.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z1.s\n"
- "fmax z14.s, p5/M, z14.s, z1.s\n"
- "fmax z15.s, p5/M, z15.s, z1.s\n"
- "fmax z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z0.s\n"
- "fmin z18.s, p5/M, z18.s, z0.s\n"
- "fmin z19.s, p5/M, z19.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z1.s\n"
- "fmax z18.s, p5/M, z18.s, z1.s\n"
- "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z20.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z21.s\n"
+ "fmin z9.s, p5/M, z9.s, z21.s\n"
+ "fmin z10.s, p5/M, z10.s, z21.s\n"
+ "fmin z11.s, p5/M, z11.s, z21.s\n"
+ "fmin z12.s, p5/M, z12.s, z21.s\n"
+ "fmin z13.s, p5/M, z13.s, z21.s\n"
+ "fmin z14.s, p5/M, z14.s, z21.s\n"
+ "fmin z15.s, p5/M, z15.s, z21.s\n"
+ "fmin z16.s, p5/M, z16.s, z21.s\n"
+ "fmin z17.s, p5/M, z17.s, z21.s\n"
+ "fmin z18.s, p5/M, z18.s, z21.s\n"
+ "fmin z19.s, p5/M, z19.s, z21.s\n"
+ "fmax z8.s, p5/M, z8.s, z20.s\n"
+ "fmax z9.s, p5/M, z9.s, z20.s\n"
+ "fmax z10.s, p5/M, z10.s, z20.s\n"
+ "fmax z11.s, p5/M, z11.s, z20.s\n"
+ "fmax z12.s, p5/M, z12.s, z20.s\n"
+ "fmax z13.s, p5/M, z13.s, z20.s\n"
+ "fmax z14.s, p5/M, z14.s, z20.s\n"
+ "fmax z15.s, p5/M, z15.s, z20.s\n"
+ "fmax z16.s, p5/M, z16.s, z20.s\n"
+ "fmax z17.s, p5/M, z17.s, z20.s\n"
+ "fmax z18.s, p5/M, z18.s, z20.s\n"
+ "fmax z19.s, p5/M, z19.s, z20.s\n"
"38:" // Height 3: No activation
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
"39:" // Height 3: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 28b\n"
"b 80f\n"
"40:" // Height 4
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"41:" // Height 4: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x11\n"
- "incw x19\n"
- "whilelt p3.s, x19, x11\n"
- "incw x19\n"
- "whilelt p2.s, x19, x11\n"
- "incw x19\n"
- "whilelt p1.s, x19, x11\n"
- "cbz x9, 42f\n"
- "ld1w { z8.s }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 42f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
- "mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
- "mov z20.d, z8.d\n"
- "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
- "mov z17.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
"b 44f\n"
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x24, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23]\n"
- "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21]\n"
+ "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 44f\n"
"43:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -888,333 +870,321 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z22.b, #0x0\n"
"mov z23.b, #0x0\n"
"44:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "cbnz x27, 47f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
"b 47f\n"
"46:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"47:" // Height 4: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z3.h }, p0/Z, [x26]\n"
+ "ld1rqh { z2.h }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "ld1rqh { z0.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64634328 // bfdot z8.s, z25.h, z3.h[0]\n"
+ ".inst 0x6462432c // bfdot z12.s, z25.h, z2.h[0]\n"
+ ".inst 0x64614330 // bfdot z16.s, z25.h, z1.h[0]\n"
+ ".inst 0x64604334 // bfdot z20.s, z25.h, z0.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ ".inst 0x64634309 // bfdot z9.s, z24.h, z3.h[0]\n"
+ ".inst 0x6462430d // bfdot z13.s, z24.h, z2.h[0]\n"
"add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x22]\n"
"add x23, x23, #0x10\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
- ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x64614311 // bfdot z17.s, z24.h, z1.h[0]\n"
+ ".inst 0x64604315 // bfdot z21.s, z24.h, z0.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6463432a // bfdot z10.s, z25.h, z3.h[0]\n"
+ ".inst 0x6462432e // bfdot z14.s, z25.h, z2.h[0]\n"
+ ".inst 0x64614332 // bfdot z18.s, z25.h, z1.h[0]\n"
+ ".inst 0x64604336 // bfdot z22.s, z25.h, z0.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6463430b // bfdot z11.s, z24.h, z3.h[0]\n"
+ ".inst 0x6462430f // bfdot z15.s, z24.h, z2.h[0]\n"
+ ".inst 0x64614313 // bfdot z19.s, z24.h, z1.h[0]\n"
+ ".inst 0x64604317 // bfdot z23.s, z24.h, z0.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x646b4328 // bfdot z8.s, z25.h, z3.h[1]\n"
+ ".inst 0x646a432c // bfdot z12.s, z25.h, z2.h[1]\n"
+ ".inst 0x64694330 // bfdot z16.s, z25.h, z1.h[1]\n"
+ ".inst 0x64684334 // bfdot z20.s, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x646b4309 // bfdot z9.s, z24.h, z3.h[1]\n"
+ ".inst 0x646a430d // bfdot z13.s, z24.h, z2.h[1]\n"
+ ".inst 0x64694311 // bfdot z17.s, z24.h, z1.h[1]\n"
+ ".inst 0x64684315 // bfdot z21.s, z24.h, z0.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ ".inst 0x646b432a // bfdot z10.s, z25.h, z3.h[1]\n"
+ ".inst 0x646a432e // bfdot z14.s, z25.h, z2.h[1]\n"
+ ".inst 0x64694332 // bfdot z18.s, z25.h, z1.h[1]\n"
+ ".inst 0x64684336 // bfdot z22.s, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x646b430b // bfdot z11.s, z24.h, z3.h[1]\n"
+ ".inst 0x646a430f // bfdot z15.s, z24.h, z2.h[1]\n"
+ ".inst 0x64694313 // bfdot z19.s, z24.h, z1.h[1]\n"
+ ".inst 0x64684317 // bfdot z23.s, z24.h, z0.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x64734328 // bfdot z8.s, z25.h, z3.h[2]\n"
+ ".inst 0x6472432c // bfdot z12.s, z25.h, z2.h[2]\n"
+ ".inst 0x64714330 // bfdot z16.s, z25.h, z1.h[2]\n"
+ ".inst 0x64704334 // bfdot z20.s, z25.h, z0.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x64734309 // bfdot z9.s, z24.h, z3.h[2]\n"
+ ".inst 0x6472430d // bfdot z13.s, z24.h, z2.h[2]\n"
+ ".inst 0x64714311 // bfdot z17.s, z24.h, z1.h[2]\n"
+ ".inst 0x64704315 // bfdot z21.s, z24.h, z0.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6473432a // bfdot z10.s, z25.h, z3.h[2]\n"
+ ".inst 0x6472432e // bfdot z14.s, z25.h, z2.h[2]\n"
+ ".inst 0x64714332 // bfdot z18.s, z25.h, z1.h[2]\n"
+ ".inst 0x64704336 // bfdot z22.s, z25.h, z0.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6473430b // bfdot z11.s, z24.h, z3.h[2]\n"
+ ".inst 0x6472430f // bfdot z15.s, z24.h, z2.h[2]\n"
+ ".inst 0x64714313 // bfdot z19.s, z24.h, z1.h[2]\n"
+ ".inst 0x64704317 // bfdot z23.s, z24.h, z0.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x647b4328 // bfdot z8.s, z25.h, z3.h[3]\n"
+ ".inst 0x647a432c // bfdot z12.s, z25.h, z2.h[3]\n"
+ ".inst 0x64794330 // bfdot z16.s, z25.h, z1.h[3]\n"
+ ".inst 0x64784334 // bfdot z20.s, z25.h, z0.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x647b4309 // bfdot z9.s, z24.h, z3.h[3]\n"
+ ".inst 0x647a430d // bfdot z13.s, z24.h, z2.h[3]\n"
+ ".inst 0x64794311 // bfdot z17.s, z24.h, z1.h[3]\n"
+ ".inst 0x64784315 // bfdot z21.s, z24.h, z0.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x647b432a // bfdot z10.s, z25.h, z3.h[3]\n"
+ ".inst 0x647a432e // bfdot z14.s, z25.h, z2.h[3]\n"
+ ".inst 0x64794332 // bfdot z18.s, z25.h, z1.h[3]\n"
+ ".inst 0x64784336 // bfdot z22.s, z25.h, z0.h[3]\n"
+ ".inst 0x647b430b // bfdot z11.s, z24.h, z3.h[3]\n"
+ ".inst 0x647a430f // bfdot z15.s, z24.h, z2.h[3]\n"
+ ".inst 0x64794313 // bfdot z19.s, z24.h, z1.h[3]\n"
+ ".inst 0x64784317 // bfdot z23.s, z24.h, z0.h[3]\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x2\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64604328 // bfdot z8.s, z25.h, z0.h[0]\n"
+ ".inst 0x6461432c // bfdot z12.s, z25.h, z1.h[0]\n"
+ ".inst 0x64624330 // bfdot z16.s, z25.h, z2.h[0]\n"
+ ".inst 0x64634334 // bfdot z20.s, z25.h, z3.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x64604309 // bfdot z9.s, z24.h, z0.h[0]\n"
+ ".inst 0x6461430d // bfdot z13.s, z24.h, z1.h[0]\n"
+ ".inst 0x64624311 // bfdot z17.s, z24.h, z2.h[0]\n"
+ ".inst 0x64634315 // bfdot z21.s, z24.h, z3.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ ".inst 0x6460432a // bfdot z10.s, z25.h, z0.h[0]\n"
+ ".inst 0x6461432e // bfdot z14.s, z25.h, z1.h[0]\n"
+ ".inst 0x64624332 // bfdot z18.s, z25.h, z2.h[0]\n"
+ ".inst 0x64634336 // bfdot z22.s, z25.h, z3.h[0]\n"
+ ".inst 0x6460430b // bfdot z11.s, z24.h, z0.h[0]\n"
+ ".inst 0x6461430f // bfdot z15.s, z24.h, z1.h[0]\n"
+ ".inst 0x64624313 // bfdot z19.s, z24.h, z2.h[0]\n"
+ ".inst 0x64634317 // bfdot z23.s, z24.h, z3.h[0]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64684328 // bfdot z8.s, z25.h, z0.h[1]\n"
+ ".inst 0x6469432c // bfdot z12.s, z25.h, z1.h[1]\n"
+ ".inst 0x646a4330 // bfdot z16.s, z25.h, z2.h[1]\n"
+ ".inst 0x646b4334 // bfdot z20.s, z25.h, z3.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
+ ".inst 0x64684309 // bfdot z9.s, z24.h, z0.h[1]\n"
+ ".inst 0x6469430d // bfdot z13.s, z24.h, z1.h[1]\n"
+ ".inst 0x646a4311 // bfdot z17.s, z24.h, z2.h[1]\n"
+ ".inst 0x646b4315 // bfdot z21.s, z24.h, z3.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ ".inst 0x6468432a // bfdot z10.s, z25.h, z0.h[1]\n"
+ ".inst 0x6469432e // bfdot z14.s, z25.h, z1.h[1]\n"
+ ".inst 0x646a4332 // bfdot z18.s, z25.h, z2.h[1]\n"
+ ".inst 0x646b4336 // bfdot z22.s, z25.h, z3.h[1]\n"
+ ".inst 0x6468430b // bfdot z11.s, z24.h, z0.h[1]\n"
+ ".inst 0x6469430f // bfdot z15.s, z24.h, z1.h[1]\n"
+ ".inst 0x646a4313 // bfdot z19.s, z24.h, z2.h[1]\n"
+ ".inst 0x646b4317 // bfdot z23.s, z24.h, z3.h[1]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64704328 // bfdot z8.s, z25.h, z0.h[2]\n"
+ ".inst 0x6471432c // bfdot z12.s, z25.h, z1.h[2]\n"
+ ".inst 0x64724330 // bfdot z16.s, z25.h, z2.h[2]\n"
+ ".inst 0x64734334 // bfdot z20.s, z25.h, z3.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x2\n"
+ ".inst 0x64704309 // bfdot z9.s, z24.h, z0.h[2]\n"
+ ".inst 0x6471430d // bfdot z13.s, z24.h, z1.h[2]\n"
+ ".inst 0x64724311 // bfdot z17.s, z24.h, z2.h[2]\n"
+ ".inst 0x64734315 // bfdot z21.s, z24.h, z3.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ ".inst 0x6470432a // bfdot z10.s, z25.h, z0.h[2]\n"
+ ".inst 0x6471432e // bfdot z14.s, z25.h, z1.h[2]\n"
+ ".inst 0x64724332 // bfdot z18.s, z25.h, z2.h[2]\n"
+ ".inst 0x64734336 // bfdot z22.s, z25.h, z3.h[2]\n"
+ ".inst 0x6470430b // bfdot z11.s, z24.h, z0.h[2]\n"
+ ".inst 0x6471430f // bfdot z15.s, z24.h, z1.h[2]\n"
+ ".inst 0x64724313 // bfdot z19.s, z24.h, z2.h[2]\n"
+ ".inst 0x64734317 // bfdot z23.s, z24.h, z3.h[2]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64784328 // bfdot z8.s, z25.h, z0.h[3]\n"
+ ".inst 0x6479432c // bfdot z12.s, z25.h, z1.h[3]\n"
+ ".inst 0x647a4330 // bfdot z16.s, z25.h, z2.h[3]\n"
+ ".inst 0x647b4334 // bfdot z20.s, z25.h, z3.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x64784309 // bfdot z9.s, z24.h, z0.h[3]\n"
+ ".inst 0x6479430d // bfdot z13.s, z24.h, z1.h[3]\n"
+ ".inst 0x647a4311 // bfdot z17.s, z24.h, z2.h[3]\n"
+ ".inst 0x647b4315 // bfdot z21.s, z24.h, z3.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ ".inst 0x6478432a // bfdot z10.s, z25.h, z0.h[3]\n"
+ ".inst 0x6479432e // bfdot z14.s, z25.h, z1.h[3]\n"
+ ".inst 0x647a4332 // bfdot z18.s, z25.h, z2.h[3]\n"
+ ".inst 0x647b4336 // bfdot z22.s, z25.h, z3.h[3]\n"
+ ".inst 0x6478430b // bfdot z11.s, z24.h, z0.h[3]\n"
+ ".inst 0x6479430f // bfdot z15.s, z24.h, z1.h[3]\n"
+ ".inst 0x647a4313 // bfdot z19.s, z24.h, z2.h[3]\n"
+ ".inst 0x647b4317 // bfdot z23.s, z24.h, z3.h[3]\n"
"50:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 45b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 51f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z0.s }, p5/Z, [x19]\n"
- "fmin z8.s, p5/M, z8.s, z0.s\n"
- "fmin z9.s, p5/M, z9.s, z0.s\n"
- "fmin z10.s, p5/M, z10.s, z0.s\n"
- "fmin z11.s, p5/M, z11.s, z0.s\n"
- "fmin z12.s, p5/M, z12.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z1.s\n"
- "fmax z9.s, p5/M, z9.s, z1.s\n"
- "fmax z10.s, p5/M, z10.s, z1.s\n"
- "fmax z11.s, p5/M, z11.s, z1.s\n"
- "fmax z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z0.s\n"
- "fmin z14.s, p5/M, z14.s, z0.s\n"
- "fmin z15.s, p5/M, z15.s, z0.s\n"
- "fmin z16.s, p5/M, z16.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z1.s\n"
- "fmax z14.s, p5/M, z14.s, z1.s\n"
- "fmax z15.s, p5/M, z15.s, z1.s\n"
- "fmax z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z0.s\n"
- "fmin z18.s, p5/M, z18.s, z0.s\n"
- "fmin z19.s, p5/M, z19.s, z0.s\n"
- "fmin z20.s, p5/M, z20.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z1.s\n"
- "fmax z18.s, p5/M, z18.s, z1.s\n"
- "fmax z19.s, p5/M, z19.s, z1.s\n"
- "fmax z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z0.s\n"
- "fmin z22.s, p5/M, z22.s, z0.s\n"
- "fmin z23.s, p5/M, z23.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z1.s\n"
- "fmax z22.s, p5/M, z22.s, z1.s\n"
- "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z25.s\n"
+ "fmin z9.s, p5/M, z9.s, z25.s\n"
+ "fmin z10.s, p5/M, z10.s, z25.s\n"
+ "fmin z11.s, p5/M, z11.s, z25.s\n"
+ "fmin z12.s, p5/M, z12.s, z25.s\n"
+ "fmin z13.s, p5/M, z13.s, z25.s\n"
+ "fmin z14.s, p5/M, z14.s, z25.s\n"
+ "fmin z15.s, p5/M, z15.s, z25.s\n"
+ "fmin z16.s, p5/M, z16.s, z25.s\n"
+ "fmin z17.s, p5/M, z17.s, z25.s\n"
+ "fmin z18.s, p5/M, z18.s, z25.s\n"
+ "fmin z19.s, p5/M, z19.s, z25.s\n"
+ "fmin z20.s, p5/M, z20.s, z25.s\n"
+ "fmin z21.s, p5/M, z21.s, z25.s\n"
+ "fmin z22.s, p5/M, z22.s, z25.s\n"
+ "fmin z23.s, p5/M, z23.s, z25.s\n"
+ "fmax z8.s, p5/M, z8.s, z24.s\n"
+ "fmax z9.s, p5/M, z9.s, z24.s\n"
+ "fmax z10.s, p5/M, z10.s, z24.s\n"
+ "fmax z11.s, p5/M, z11.s, z24.s\n"
+ "fmax z12.s, p5/M, z12.s, z24.s\n"
+ "fmax z13.s, p5/M, z13.s, z24.s\n"
+ "fmax z14.s, p5/M, z14.s, z24.s\n"
+ "fmax z15.s, p5/M, z15.s, z24.s\n"
+ "fmax z16.s, p5/M, z16.s, z24.s\n"
+ "fmax z17.s, p5/M, z17.s, z24.s\n"
+ "fmax z18.s, p5/M, z18.s, z24.s\n"
+ "fmax z19.s, p5/M, z19.s, z24.s\n"
+ "fmax z20.s, p5/M, z20.s, z24.s\n"
+ "fmax z21.s, p5/M, z21.s, z24.s\n"
+ "fmax z22.s, p5/M, z22.s, z24.s\n"
+ "fmax z23.s, p5/M, z23.s, z24.s\n"
"51:" // Height 4: No activation
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x22]\n"
- "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
"52:" // Height 4: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 41b\n"
"b 80f\n"
"53:" // Height 5
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"54:" // Height 5: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x11\n"
- "incw x19\n"
- "whilelt p3.s, x19, x11\n"
- "incw x19\n"
- "whilelt p2.s, x19, x11\n"
- "incw x19\n"
- "whilelt p1.s, x19, x11\n"
- "cbz x9, 55f\n"
- "ld1w { z8.s }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 55f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
- "mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
- "mov z20.d, z8.d\n"
- "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
- "mov z17.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1225,31 +1195,31 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"b 57f\n"
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x24, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "add x21, x22, x19, LSL #2\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23]\n"
- "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x21]\n"
- "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x20]\n"
+ "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 57f\n"
"56:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -1273,390 +1243,375 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
"57:" // Height 5: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "cbnz x27, 60f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
- "add x21, x21, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 60f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
"b 60f\n"
"59:" // Height 5: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
- "add x21, x22, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"60:" // Height 5: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z4.h }, p0/Z, [x26]\n"
+ "ld1rqh { z3.h }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1rqh { z1.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqh { z0.h }, p0/Z, [x22]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ ".inst 0x646443a8 // bfdot z8.s, z29.h, z4.h[0]\n"
+ ".inst 0x646343ac // bfdot z12.s, z29.h, z3.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646243b0 // bfdot z16.s, z29.h, z2.h[0]\n"
+ ".inst 0x646143b4 // bfdot z20.s, z29.h, z1.h[0]\n"
"add x25, x25, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ ".inst 0x646043b8 // bfdot z24.s, z29.h, z0.h[0]\n"
+ ".inst 0x64644389 // bfdot z9.s, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ ".inst 0x6463438d // bfdot z13.s, z28.h, z3.h[0]\n"
+ ".inst 0x64624391 // bfdot z17.s, z28.h, z2.h[0]\n"
"add x23, x23, #0x10\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x21, x21, #0x10\n"
- ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
- ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
- ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x64614395 // bfdot z21.s, z28.h, z1.h[0]\n"
+ ".inst 0x64604399 // bfdot z25.s, z28.h, z0.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x646443aa // bfdot z10.s, z29.h, z4.h[0]\n"
+ ".inst 0x646343ae // bfdot z14.s, z29.h, z3.h[0]\n"
+ ".inst 0x646243b2 // bfdot z18.s, z29.h, z2.h[0]\n"
+ ".inst 0x646143b6 // bfdot z22.s, z29.h, z1.h[0]\n"
+ ".inst 0x646043ba // bfdot z26.s, z29.h, z0.h[0]\n"
+ ".inst 0x6464438b // bfdot z11.s, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6463438f // bfdot z15.s, z28.h, z3.h[0]\n"
+ ".inst 0x64624393 // bfdot z19.s, z28.h, z2.h[0]\n"
+ ".inst 0x64614397 // bfdot z23.s, z28.h, z1.h[0]\n"
+ ".inst 0x6460439b // bfdot z27.s, z28.h, z0.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x646c43a8 // bfdot z8.s, z29.h, z4.h[1]\n"
+ ".inst 0x646b43ac // bfdot z12.s, z29.h, z3.h[1]\n"
+ ".inst 0x646a43b0 // bfdot z16.s, z29.h, z2.h[1]\n"
+ ".inst 0x646943b4 // bfdot z20.s, z29.h, z1.h[1]\n"
+ ".inst 0x646843b8 // bfdot z24.s, z29.h, z0.h[1]\n"
+ ".inst 0x646c4389 // bfdot z9.s, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x646b438d // bfdot z13.s, z28.h, z3.h[1]\n"
+ ".inst 0x646a4391 // bfdot z17.s, z28.h, z2.h[1]\n"
+ ".inst 0x64694395 // bfdot z21.s, z28.h, z1.h[1]\n"
+ ".inst 0x64684399 // bfdot z25.s, z28.h, z0.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
- ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
- ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
- ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
+ ".inst 0x646c43aa // bfdot z10.s, z29.h, z4.h[1]\n"
+ ".inst 0x646b43ae // bfdot z14.s, z29.h, z3.h[1]\n"
+ ".inst 0x646a43b2 // bfdot z18.s, z29.h, z2.h[1]\n"
+ ".inst 0x646943b6 // bfdot z22.s, z29.h, z1.h[1]\n"
+ ".inst 0x646843ba // bfdot z26.s, z29.h, z0.h[1]\n"
+ ".inst 0x646c438b // bfdot z11.s, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x646b438f // bfdot z15.s, z28.h, z3.h[1]\n"
+ ".inst 0x646a4393 // bfdot z19.s, z28.h, z2.h[1]\n"
+ ".inst 0x64694397 // bfdot z23.s, z28.h, z1.h[1]\n"
+ ".inst 0x6468439b // bfdot z27.s, z28.h, z0.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x647443a8 // bfdot z8.s, z29.h, z4.h[2]\n"
+ ".inst 0x647343ac // bfdot z12.s, z29.h, z3.h[2]\n"
+ ".inst 0x647243b0 // bfdot z16.s, z29.h, z2.h[2]\n"
+ ".inst 0x647143b4 // bfdot z20.s, z29.h, z1.h[2]\n"
+ ".inst 0x647043b8 // bfdot z24.s, z29.h, z0.h[2]\n"
+ ".inst 0x64744389 // bfdot z9.s, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6473438d // bfdot z13.s, z28.h, z3.h[2]\n"
+ ".inst 0x64724391 // bfdot z17.s, z28.h, z2.h[2]\n"
+ ".inst 0x64714395 // bfdot z21.s, z28.h, z1.h[2]\n"
+ ".inst 0x64704399 // bfdot z25.s, z28.h, z0.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x647443aa // bfdot z10.s, z29.h, z4.h[2]\n"
+ ".inst 0x647343ae // bfdot z14.s, z29.h, z3.h[2]\n"
+ ".inst 0x647243b2 // bfdot z18.s, z29.h, z2.h[2]\n"
+ ".inst 0x647143b6 // bfdot z22.s, z29.h, z1.h[2]\n"
+ ".inst 0x647043ba // bfdot z26.s, z29.h, z0.h[2]\n"
+ ".inst 0x6474438b // bfdot z11.s, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6473438f // bfdot z15.s, z28.h, z3.h[2]\n"
+ ".inst 0x64724393 // bfdot z19.s, z28.h, z2.h[2]\n"
+ ".inst 0x64714397 // bfdot z23.s, z28.h, z1.h[2]\n"
+ ".inst 0x6470439b // bfdot z27.s, z28.h, z0.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x647c43a8 // bfdot z8.s, z29.h, z4.h[3]\n"
+ ".inst 0x647b43ac // bfdot z12.s, z29.h, z3.h[3]\n"
+ ".inst 0x647a43b0 // bfdot z16.s, z29.h, z2.h[3]\n"
+ ".inst 0x647943b4 // bfdot z20.s, z29.h, z1.h[3]\n"
+ ".inst 0x647843b8 // bfdot z24.s, z29.h, z0.h[3]\n"
+ ".inst 0x647c4389 // bfdot z9.s, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x647b438d // bfdot z13.s, z28.h, z3.h[3]\n"
+ ".inst 0x647a4391 // bfdot z17.s, z28.h, z2.h[3]\n"
+ ".inst 0x64794395 // bfdot z21.s, z28.h, z1.h[3]\n"
+ ".inst 0x64784399 // bfdot z25.s, z28.h, z0.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x647c43aa // bfdot z10.s, z29.h, z4.h[3]\n"
+ ".inst 0x647b43ae // bfdot z14.s, z29.h, z3.h[3]\n"
+ ".inst 0x647a43b2 // bfdot z18.s, z29.h, z2.h[3]\n"
+ ".inst 0x647943b6 // bfdot z22.s, z29.h, z1.h[3]\n"
+ ".inst 0x647843ba // bfdot z26.s, z29.h, z0.h[3]\n"
+ ".inst 0x647c438b // bfdot z11.s, z28.h, z4.h[3]\n"
+ ".inst 0x647b438f // bfdot z15.s, z28.h, z3.h[3]\n"
+ ".inst 0x647a4393 // bfdot z19.s, z28.h, z2.h[3]\n"
+ ".inst 0x64794397 // bfdot z23.s, z28.h, z1.h[3]\n"
+ ".inst 0x6478439b // bfdot z27.s, z28.h, z0.h[3]\n"
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "add x21, x21, #0x10\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x2\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ ".inst 0x646043a8 // bfdot z8.s, z29.h, z0.h[0]\n"
+ ".inst 0x646143ac // bfdot z12.s, z29.h, z1.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646243b0 // bfdot z16.s, z29.h, z2.h[0]\n"
+ ".inst 0x646343b4 // bfdot z20.s, z29.h, z3.h[0]\n"
+ ".inst 0x646443b8 // bfdot z24.s, z29.h, z4.h[0]\n"
+ ".inst 0x64604389 // bfdot z9.s, z28.h, z0.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6461438d // bfdot z13.s, z28.h, z1.h[0]\n"
+ ".inst 0x64624391 // bfdot z17.s, z28.h, z2.h[0]\n"
+ ".inst 0x64634395 // bfdot z21.s, z28.h, z3.h[0]\n"
+ ".inst 0x64644399 // bfdot z25.s, z28.h, z4.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
- ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
+ ".inst 0x646043aa // bfdot z10.s, z29.h, z0.h[0]\n"
+ ".inst 0x646143ae // bfdot z14.s, z29.h, z1.h[0]\n"
+ ".inst 0x646243b2 // bfdot z18.s, z29.h, z2.h[0]\n"
+ ".inst 0x646343b6 // bfdot z22.s, z29.h, z3.h[0]\n"
+ ".inst 0x646443ba // bfdot z26.s, z29.h, z4.h[0]\n"
+ ".inst 0x6460438b // bfdot z11.s, z28.h, z0.h[0]\n"
+ ".inst 0x6461438f // bfdot z15.s, z28.h, z1.h[0]\n"
+ ".inst 0x64624393 // bfdot z19.s, z28.h, z2.h[0]\n"
+ ".inst 0x64634397 // bfdot z23.s, z28.h, z3.h[0]\n"
+ ".inst 0x6464439b // bfdot z27.s, z28.h, z4.h[0]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646843a8 // bfdot z8.s, z29.h, z0.h[1]\n"
+ ".inst 0x646943ac // bfdot z12.s, z29.h, z1.h[1]\n"
+ ".inst 0x646a43b0 // bfdot z16.s, z29.h, z2.h[1]\n"
+ ".inst 0x646b43b4 // bfdot z20.s, z29.h, z3.h[1]\n"
+ "subs x27, x27, #0x2\n"
+ ".inst 0x646c43b8 // bfdot z24.s, z29.h, z4.h[1]\n"
+ ".inst 0x64684389 // bfdot z9.s, z28.h, z0.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6469438d // bfdot z13.s, z28.h, z1.h[1]\n"
+ ".inst 0x646a4391 // bfdot z17.s, z28.h, z2.h[1]\n"
+ ".inst 0x646b4395 // bfdot z21.s, z28.h, z3.h[1]\n"
+ ".inst 0x646c4399 // bfdot z25.s, z28.h, z4.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
+ ".inst 0x646843aa // bfdot z10.s, z29.h, z0.h[1]\n"
+ ".inst 0x646943ae // bfdot z14.s, z29.h, z1.h[1]\n"
+ ".inst 0x646a43b2 // bfdot z18.s, z29.h, z2.h[1]\n"
+ ".inst 0x646b43b6 // bfdot z22.s, z29.h, z3.h[1]\n"
+ ".inst 0x646c43ba // bfdot z26.s, z29.h, z4.h[1]\n"
+ ".inst 0x6468438b // bfdot z11.s, z28.h, z0.h[1]\n"
+ ".inst 0x6469438f // bfdot z15.s, z28.h, z1.h[1]\n"
+ ".inst 0x646a4393 // bfdot z19.s, z28.h, z2.h[1]\n"
+ ".inst 0x646b4397 // bfdot z23.s, z28.h, z3.h[1]\n"
+ ".inst 0x646c439b // bfdot z27.s, z28.h, z4.h[1]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
- ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x647043a8 // bfdot z8.s, z29.h, z0.h[2]\n"
+ ".inst 0x647143ac // bfdot z12.s, z29.h, z1.h[2]\n"
+ ".inst 0x647243b0 // bfdot z16.s, z29.h, z2.h[2]\n"
+ ".inst 0x647343b4 // bfdot z20.s, z29.h, z3.h[2]\n"
+ "subs x27, x27, #0x2\n"
+ ".inst 0x647443b8 // bfdot z24.s, z29.h, z4.h[2]\n"
+ ".inst 0x64704389 // bfdot z9.s, z28.h, z0.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6471438d // bfdot z13.s, z28.h, z1.h[2]\n"
+ ".inst 0x64724391 // bfdot z17.s, z28.h, z2.h[2]\n"
+ ".inst 0x64734395 // bfdot z21.s, z28.h, z3.h[2]\n"
+ ".inst 0x64744399 // bfdot z25.s, z28.h, z4.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
- ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
- ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
+ ".inst 0x647043aa // bfdot z10.s, z29.h, z0.h[2]\n"
+ ".inst 0x647143ae // bfdot z14.s, z29.h, z1.h[2]\n"
+ ".inst 0x647243b2 // bfdot z18.s, z29.h, z2.h[2]\n"
+ ".inst 0x647343b6 // bfdot z22.s, z29.h, z3.h[2]\n"
+ ".inst 0x647443ba // bfdot z26.s, z29.h, z4.h[2]\n"
+ ".inst 0x6470438b // bfdot z11.s, z28.h, z0.h[2]\n"
+ ".inst 0x6471438f // bfdot z15.s, z28.h, z1.h[2]\n"
+ ".inst 0x64724393 // bfdot z19.s, z28.h, z2.h[2]\n"
+ ".inst 0x64734397 // bfdot z23.s, z28.h, z3.h[2]\n"
+ ".inst 0x6474439b // bfdot z27.s, z28.h, z4.h[2]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x647843a8 // bfdot z8.s, z29.h, z0.h[3]\n"
+ ".inst 0x647943ac // bfdot z12.s, z29.h, z1.h[3]\n"
+ ".inst 0x647a43b0 // bfdot z16.s, z29.h, z2.h[3]\n"
+ ".inst 0x647b43b4 // bfdot z20.s, z29.h, z3.h[3]\n"
+ ".inst 0x647c43b8 // bfdot z24.s, z29.h, z4.h[3]\n"
+ ".inst 0x64784389 // bfdot z9.s, z28.h, z0.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6479438d // bfdot z13.s, z28.h, z1.h[3]\n"
+ ".inst 0x647a4391 // bfdot z17.s, z28.h, z2.h[3]\n"
+ ".inst 0x647b4395 // bfdot z21.s, z28.h, z3.h[3]\n"
+ ".inst 0x647c4399 // bfdot z25.s, z28.h, z4.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
+ ".inst 0x647843aa // bfdot z10.s, z29.h, z0.h[3]\n"
+ ".inst 0x647943ae // bfdot z14.s, z29.h, z1.h[3]\n"
+ ".inst 0x647a43b2 // bfdot z18.s, z29.h, z2.h[3]\n"
+ ".inst 0x647b43b6 // bfdot z22.s, z29.h, z3.h[3]\n"
+ ".inst 0x647c43ba // bfdot z26.s, z29.h, z4.h[3]\n"
+ ".inst 0x6478438b // bfdot z11.s, z28.h, z0.h[3]\n"
+ ".inst 0x6479438f // bfdot z15.s, z28.h, z1.h[3]\n"
+ ".inst 0x647a4393 // bfdot z19.s, z28.h, z2.h[3]\n"
+ ".inst 0x647b4397 // bfdot z23.s, z28.h, z3.h[3]\n"
+ ".inst 0x647c439b // bfdot z27.s, z28.h, z4.h[3]\n"
"63:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 58b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 64f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z0.s }, p5/Z, [x19]\n"
- "fmin z8.s, p5/M, z8.s, z0.s\n"
- "fmin z9.s, p5/M, z9.s, z0.s\n"
- "fmin z10.s, p5/M, z10.s, z0.s\n"
- "fmin z11.s, p5/M, z11.s, z0.s\n"
- "fmin z12.s, p5/M, z12.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z1.s\n"
- "fmax z9.s, p5/M, z9.s, z1.s\n"
- "fmax z10.s, p5/M, z10.s, z1.s\n"
- "fmax z11.s, p5/M, z11.s, z1.s\n"
- "fmax z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z0.s\n"
- "fmin z14.s, p5/M, z14.s, z0.s\n"
- "fmin z15.s, p5/M, z15.s, z0.s\n"
- "fmin z16.s, p5/M, z16.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z1.s\n"
- "fmax z14.s, p5/M, z14.s, z1.s\n"
- "fmax z15.s, p5/M, z15.s, z1.s\n"
- "fmax z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z0.s\n"
- "fmin z18.s, p5/M, z18.s, z0.s\n"
- "fmin z19.s, p5/M, z19.s, z0.s\n"
- "fmin z20.s, p5/M, z20.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z1.s\n"
- "fmax z18.s, p5/M, z18.s, z1.s\n"
- "fmax z19.s, p5/M, z19.s, z1.s\n"
- "fmax z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z0.s\n"
- "fmin z22.s, p5/M, z22.s, z0.s\n"
- "fmin z23.s, p5/M, z23.s, z0.s\n"
- "fmin z24.s, p5/M, z24.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z1.s\n"
- "fmax z22.s, p5/M, z22.s, z1.s\n"
- "fmax z23.s, p5/M, z23.s, z1.s\n"
- "fmax z24.s, p5/M, z24.s, z1.s\n"
- "fmin z25.s, p5/M, z25.s, z0.s\n"
- "fmin z26.s, p5/M, z26.s, z0.s\n"
- "fmin z27.s, p5/M, z27.s, z0.s\n"
- "fmax z25.s, p5/M, z25.s, z1.s\n"
- "fmax z26.s, p5/M, z26.s, z1.s\n"
- "fmax z27.s, p5/M, z27.s, z1.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z29.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z28.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z29.s\n"
+ "fmin z9.s, p5/M, z9.s, z29.s\n"
+ "fmin z10.s, p5/M, z10.s, z29.s\n"
+ "fmin z11.s, p5/M, z11.s, z29.s\n"
+ "fmin z12.s, p5/M, z12.s, z29.s\n"
+ "fmin z13.s, p5/M, z13.s, z29.s\n"
+ "fmin z14.s, p5/M, z14.s, z29.s\n"
+ "fmin z15.s, p5/M, z15.s, z29.s\n"
+ "fmin z16.s, p5/M, z16.s, z29.s\n"
+ "fmin z17.s, p5/M, z17.s, z29.s\n"
+ "fmin z18.s, p5/M, z18.s, z29.s\n"
+ "fmin z19.s, p5/M, z19.s, z29.s\n"
+ "fmin z20.s, p5/M, z20.s, z29.s\n"
+ "fmin z21.s, p5/M, z21.s, z29.s\n"
+ "fmin z22.s, p5/M, z22.s, z29.s\n"
+ "fmin z23.s, p5/M, z23.s, z29.s\n"
+ "fmin z24.s, p5/M, z24.s, z29.s\n"
+ "fmin z25.s, p5/M, z25.s, z29.s\n"
+ "fmin z26.s, p5/M, z26.s, z29.s\n"
+ "fmin z27.s, p5/M, z27.s, z29.s\n"
+ "fmax z8.s, p5/M, z8.s, z28.s\n"
+ "fmax z9.s, p5/M, z9.s, z28.s\n"
+ "fmax z10.s, p5/M, z10.s, z28.s\n"
+ "fmax z11.s, p5/M, z11.s, z28.s\n"
+ "fmax z12.s, p5/M, z12.s, z28.s\n"
+ "fmax z13.s, p5/M, z13.s, z28.s\n"
+ "fmax z14.s, p5/M, z14.s, z28.s\n"
+ "fmax z15.s, p5/M, z15.s, z28.s\n"
+ "fmax z16.s, p5/M, z16.s, z28.s\n"
+ "fmax z17.s, p5/M, z17.s, z28.s\n"
+ "fmax z18.s, p5/M, z18.s, z28.s\n"
+ "fmax z19.s, p5/M, z19.s, z28.s\n"
+ "fmax z20.s, p5/M, z20.s, z28.s\n"
+ "fmax z21.s, p5/M, z21.s, z28.s\n"
+ "fmax z22.s, p5/M, z22.s, z28.s\n"
+ "fmax z23.s, p5/M, z23.s, z28.s\n"
+ "fmax z24.s, p5/M, z24.s, z28.s\n"
+ "fmax z25.s, p5/M, z25.s, z28.s\n"
+ "fmax z26.s, p5/M, z26.s, z28.s\n"
+ "fmax z27.s, p5/M, z27.s, z28.s\n"
"64:" // Height 5: No activation
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x22]\n"
- "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x21]\n"
- "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
"65:" // Height 5: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 54b\n"
"b 80f\n"
"66:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x19, #0x18\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"67:" // Height 6: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x11\n"
- "incw x19\n"
- "whilelt p3.s, x19, x11\n"
- "incw x19\n"
- "whilelt p2.s, x19, x11\n"
- "incw x19\n"
- "whilelt p1.s, x19, x11\n"
- "cbz x9, 68f\n"
- "ld1w { z8.s }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 68f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
- "mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
- "mov z20.d, z8.d\n"
- "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
- "mov z17.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1671,18 +1626,18 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"b 70f\n"
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x24, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x24]\n"
- "add x21, x22, x19, LSL #2\n"
"ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "add x20, x21, x19, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x23]\n"
@@ -1728,429 +1683,410 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z30.b, #0x0\n"
"mov z31.b, #0x0\n"
"70:" // Height 6: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x27, 73f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
- "add x21, x21, x19, LSL #1\n"
- "add x20, x20, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 73f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "add x21, x21, x20, LSL #1\n"
"b 73f\n"
"72:" // Height 6: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
- "add x21, x22, x19, LSL #1\n"
- "add x20, x21, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"73:" // Height 6: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z7.h }, p0/Z, [x26]\n"
+ "ld1rqh { z6.h }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1rqh { z5.h }, p0/Z, [x24]\n"
+ "ld1rqh { z4.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "ld1rqh { z2.h }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x64674028 // bfdot z8.s, z1.h, z7.h[0]\n"
+ ".inst 0x6466402c // bfdot z12.s, z1.h, z6.h[0]\n"
+ ".inst 0x64654030 // bfdot z16.s, z1.h, z5.h[0]\n"
+ ".inst 0x64644034 // bfdot z20.s, z1.h, z4.h[0]\n"
"add x23, x23, #0x10\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "ld1rqh { z5.h }, p0/Z, [x20]\n"
+ ".inst 0x64634038 // bfdot z24.s, z1.h, z3.h[0]\n"
+ ".inst 0x6462403c // bfdot z28.s, z1.h, z2.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x21, x21, #0x10\n"
- ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x20, x20, #0x10\n"
- ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
- ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
- ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
- ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
- ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
- ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
- ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x64674009 // bfdot z9.s, z0.h, z7.h[0]\n"
+ ".inst 0x6466400d // bfdot z13.s, z0.h, z6.h[0]\n"
+ ".inst 0x64654011 // bfdot z17.s, z0.h, z5.h[0]\n"
+ ".inst 0x64644015 // bfdot z21.s, z0.h, z4.h[0]\n"
+ ".inst 0x64634019 // bfdot z25.s, z0.h, z3.h[0]\n"
+ ".inst 0x6462401d // bfdot z29.s, z0.h, z2.h[0]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467402a // bfdot z10.s, z1.h, z7.h[0]\n"
+ ".inst 0x6466402e // bfdot z14.s, z1.h, z6.h[0]\n"
+ ".inst 0x64654032 // bfdot z18.s, z1.h, z5.h[0]\n"
+ ".inst 0x64644036 // bfdot z22.s, z1.h, z4.h[0]\n"
+ ".inst 0x6463403a // bfdot z26.s, z1.h, z3.h[0]\n"
+ ".inst 0x6462403e // bfdot z30.s, z1.h, z2.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6467400b // bfdot z11.s, z0.h, z7.h[0]\n"
+ ".inst 0x6466400f // bfdot z15.s, z0.h, z6.h[0]\n"
+ ".inst 0x64654013 // bfdot z19.s, z0.h, z5.h[0]\n"
+ ".inst 0x64644017 // bfdot z23.s, z0.h, z4.h[0]\n"
+ ".inst 0x6463401b // bfdot z27.s, z0.h, z3.h[0]\n"
+ ".inst 0x6462401f // bfdot z31.s, z0.h, z2.h[0]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x646f4028 // bfdot z8.s, z1.h, z7.h[1]\n"
+ ".inst 0x646e402c // bfdot z12.s, z1.h, z6.h[1]\n"
+ ".inst 0x646d4030 // bfdot z16.s, z1.h, z5.h[1]\n"
+ ".inst 0x646c4034 // bfdot z20.s, z1.h, z4.h[1]\n"
+ ".inst 0x646b4038 // bfdot z24.s, z1.h, z3.h[1]\n"
+ ".inst 0x646a403c // bfdot z28.s, z1.h, z2.h[1]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x646f4009 // bfdot z9.s, z0.h, z7.h[1]\n"
+ ".inst 0x646e400d // bfdot z13.s, z0.h, z6.h[1]\n"
+ ".inst 0x646d4011 // bfdot z17.s, z0.h, z5.h[1]\n"
+ ".inst 0x646c4015 // bfdot z21.s, z0.h, z4.h[1]\n"
+ ".inst 0x646b4019 // bfdot z25.s, z0.h, z3.h[1]\n"
+ ".inst 0x646a401d // bfdot z29.s, z0.h, z2.h[1]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
- ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
- ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
- ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
- ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
- ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
- ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
- ".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
- ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
- ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
- ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
- ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
- ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
- ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n"
+ ".inst 0x646f402a // bfdot z10.s, z1.h, z7.h[1]\n"
+ ".inst 0x646e402e // bfdot z14.s, z1.h, z6.h[1]\n"
+ ".inst 0x646d4032 // bfdot z18.s, z1.h, z5.h[1]\n"
+ ".inst 0x646c4036 // bfdot z22.s, z1.h, z4.h[1]\n"
+ ".inst 0x646b403a // bfdot z26.s, z1.h, z3.h[1]\n"
+ ".inst 0x646a403e // bfdot z30.s, z1.h, z2.h[1]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x646f400b // bfdot z11.s, z0.h, z7.h[1]\n"
+ ".inst 0x646e400f // bfdot z15.s, z0.h, z6.h[1]\n"
+ ".inst 0x646d4013 // bfdot z19.s, z0.h, z5.h[1]\n"
+ ".inst 0x646c4017 // bfdot z23.s, z0.h, z4.h[1]\n"
+ ".inst 0x646b401b // bfdot z27.s, z0.h, z3.h[1]\n"
+ ".inst 0x646a401f // bfdot z31.s, z0.h, z2.h[1]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x64774028 // bfdot z8.s, z1.h, z7.h[2]\n"
+ ".inst 0x6476402c // bfdot z12.s, z1.h, z6.h[2]\n"
+ ".inst 0x64754030 // bfdot z16.s, z1.h, z5.h[2]\n"
+ ".inst 0x64744034 // bfdot z20.s, z1.h, z4.h[2]\n"
+ ".inst 0x64734038 // bfdot z24.s, z1.h, z3.h[2]\n"
+ ".inst 0x6472403c // bfdot z28.s, z1.h, z2.h[2]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x64774009 // bfdot z9.s, z0.h, z7.h[2]\n"
+ ".inst 0x6476400d // bfdot z13.s, z0.h, z6.h[2]\n"
+ ".inst 0x64754011 // bfdot z17.s, z0.h, z5.h[2]\n"
+ ".inst 0x64744015 // bfdot z21.s, z0.h, z4.h[2]\n"
+ ".inst 0x64734019 // bfdot z25.s, z0.h, z3.h[2]\n"
+ ".inst 0x6472401d // bfdot z29.s, z0.h, z2.h[2]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6477402a // bfdot z10.s, z1.h, z7.h[2]\n"
+ ".inst 0x6476402e // bfdot z14.s, z1.h, z6.h[2]\n"
+ ".inst 0x64754032 // bfdot z18.s, z1.h, z5.h[2]\n"
+ ".inst 0x64744036 // bfdot z22.s, z1.h, z4.h[2]\n"
+ ".inst 0x6473403a // bfdot z26.s, z1.h, z3.h[2]\n"
+ ".inst 0x6472403e // bfdot z30.s, z1.h, z2.h[2]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6477400b // bfdot z11.s, z0.h, z7.h[2]\n"
+ ".inst 0x6476400f // bfdot z15.s, z0.h, z6.h[2]\n"
+ ".inst 0x64754013 // bfdot z19.s, z0.h, z5.h[2]\n"
+ ".inst 0x64744017 // bfdot z23.s, z0.h, z4.h[2]\n"
+ ".inst 0x6473401b // bfdot z27.s, z0.h, z3.h[2]\n"
+ ".inst 0x6472401f // bfdot z31.s, z0.h, z2.h[2]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x647f4028 // bfdot z8.s, z1.h, z7.h[3]\n"
+ ".inst 0x647e402c // bfdot z12.s, z1.h, z6.h[3]\n"
+ ".inst 0x647d4030 // bfdot z16.s, z1.h, z5.h[3]\n"
+ ".inst 0x647c4034 // bfdot z20.s, z1.h, z4.h[3]\n"
+ ".inst 0x647b4038 // bfdot z24.s, z1.h, z3.h[3]\n"
+ ".inst 0x647a403c // bfdot z28.s, z1.h, z2.h[3]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x647f4009 // bfdot z9.s, z0.h, z7.h[3]\n"
+ ".inst 0x647e400d // bfdot z13.s, z0.h, z6.h[3]\n"
+ ".inst 0x647d4011 // bfdot z17.s, z0.h, z5.h[3]\n"
+ ".inst 0x647c4015 // bfdot z21.s, z0.h, z4.h[3]\n"
+ ".inst 0x647b4019 // bfdot z25.s, z0.h, z3.h[3]\n"
+ ".inst 0x647a401d // bfdot z29.s, z0.h, z2.h[3]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x647f402a // bfdot z10.s, z1.h, z7.h[3]\n"
+ ".inst 0x647e402e // bfdot z14.s, z1.h, z6.h[3]\n"
+ ".inst 0x647d4032 // bfdot z18.s, z1.h, z5.h[3]\n"
+ ".inst 0x647c4036 // bfdot z22.s, z1.h, z4.h[3]\n"
+ ".inst 0x647b403a // bfdot z26.s, z1.h, z3.h[3]\n"
+ ".inst 0x647a403e // bfdot z30.s, z1.h, z2.h[3]\n"
+ ".inst 0x647f400b // bfdot z11.s, z0.h, z7.h[3]\n"
+ ".inst 0x647e400f // bfdot z15.s, z0.h, z6.h[3]\n"
+ ".inst 0x647d4013 // bfdot z19.s, z0.h, z5.h[3]\n"
+ ".inst 0x647c4017 // bfdot z23.s, z0.h, z4.h[3]\n"
+ ".inst 0x647b401b // bfdot z27.s, z0.h, z3.h[3]\n"
+ ".inst 0x647a401f // bfdot z31.s, z0.h, z2.h[3]\n"
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "ld1rqh { z5.h }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
- ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "add x20, x20, #0x10\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
- ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
- ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x2\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "ld1rqh { z5.h }, p0/Z, [x21]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646040e8 // bfdot z8.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ec // bfdot z12.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f0 // bfdot z16.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f4 // bfdot z20.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440f8 // bfdot z24.s, z7.h, z4.h[0]\n"
+ ".inst 0x646540fc // bfdot z28.s, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x646040c9 // bfdot z9.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140cd // bfdot z13.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d1 // bfdot z17.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d5 // bfdot z21.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440d9 // bfdot z25.s, z6.h, z4.h[0]\n"
+ ".inst 0x646540dd // bfdot z29.s, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
- ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n"
- ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
- ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
- ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n"
+ ".inst 0x646040ea // bfdot z10.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ee // bfdot z14.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f2 // bfdot z18.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f6 // bfdot z22.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440fa // bfdot z26.s, z7.h, z4.h[0]\n"
+ ".inst 0x646540fe // bfdot z30.s, z7.h, z5.h[0]\n"
+ ".inst 0x646040cb // bfdot z11.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140cf // bfdot z15.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d3 // bfdot z19.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d7 // bfdot z23.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440db // bfdot z27.s, z6.h, z4.h[0]\n"
+ ".inst 0x646540df // bfdot z31.s, z6.h, z5.h[0]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
- ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
- ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x646840e8 // bfdot z8.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ec // bfdot z12.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f0 // bfdot z16.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f4 // bfdot z20.s, z7.h, z3.h[1]\n"
+ "subs x27, x27, #0x2\n"
+ ".inst 0x646c40f8 // bfdot z24.s, z7.h, z4.h[1]\n"
+ ".inst 0x646d40fc // bfdot z28.s, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x646840c9 // bfdot z9.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cd // bfdot z13.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d1 // bfdot z17.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d5 // bfdot z21.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40d9 // bfdot z25.s, z6.h, z4.h[1]\n"
+ ".inst 0x646d40dd // bfdot z29.s, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
- ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
- ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n"
- ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
- ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
- ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n"
+ ".inst 0x646840ea // bfdot z10.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ee // bfdot z14.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f2 // bfdot z18.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f6 // bfdot z22.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40fa // bfdot z26.s, z7.h, z4.h[1]\n"
+ ".inst 0x646d40fe // bfdot z30.s, z7.h, z5.h[1]\n"
+ ".inst 0x646840cb // bfdot z11.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cf // bfdot z15.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d3 // bfdot z19.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d7 // bfdot z23.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40db // bfdot z27.s, z6.h, z4.h[1]\n"
+ ".inst 0x646d40df // bfdot z31.s, z6.h, z5.h[1]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x2\n"
- ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
- ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
- ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
- ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x647040e8 // bfdot z8.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ec // bfdot z12.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f0 // bfdot z16.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f4 // bfdot z20.s, z7.h, z3.h[2]\n"
+ "subs x27, x27, #0x2\n"
+ ".inst 0x647440f8 // bfdot z24.s, z7.h, z4.h[2]\n"
+ ".inst 0x647540fc // bfdot z28.s, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x647040c9 // bfdot z9.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cd // bfdot z13.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d1 // bfdot z17.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d5 // bfdot z21.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440d9 // bfdot z25.s, z6.h, z4.h[2]\n"
+ ".inst 0x647540dd // bfdot z29.s, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
- ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
- ".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n"
- ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
- ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
- ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n"
+ ".inst 0x647040ea // bfdot z10.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ee // bfdot z14.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f2 // bfdot z18.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f6 // bfdot z22.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440fa // bfdot z26.s, z7.h, z4.h[2]\n"
+ ".inst 0x647540fe // bfdot z30.s, z7.h, z5.h[2]\n"
+ ".inst 0x647040cb // bfdot z11.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cf // bfdot z15.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d3 // bfdot z19.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d7 // bfdot z23.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440db // bfdot z27.s, z6.h, z4.h[2]\n"
+ ".inst 0x647540df // bfdot z31.s, z6.h, z5.h[2]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
- ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
- ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x647840e8 // bfdot z8.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ec // bfdot z12.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f0 // bfdot z16.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f4 // bfdot z20.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40f8 // bfdot z24.s, z7.h, z4.h[3]\n"
+ ".inst 0x647d40fc // bfdot z28.s, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x647840c9 // bfdot z9.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cd // bfdot z13.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d1 // bfdot z17.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d5 // bfdot z21.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40d9 // bfdot z25.s, z6.h, z4.h[3]\n"
+ ".inst 0x647d40dd // bfdot z29.s, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
- ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
- ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
- ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
- ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
- ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n"
- ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
- ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
- ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n"
+ ".inst 0x647840ea // bfdot z10.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ee // bfdot z14.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f2 // bfdot z18.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f6 // bfdot z22.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40fa // bfdot z26.s, z7.h, z4.h[3]\n"
+ ".inst 0x647d40fe // bfdot z30.s, z7.h, z5.h[3]\n"
+ ".inst 0x647840cb // bfdot z11.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cf // bfdot z15.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d7 // bfdot z23.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40db // bfdot z27.s, z6.h, z4.h[3]\n"
+ ".inst 0x647d40df // bfdot z31.s, z6.h, z5.h[3]\n"
"76:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 71b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"tbz %x[flags], #1, 77f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z0.s }, p5/Z, [x19]\n"
- "fmin z8.s, p5/M, z8.s, z0.s\n"
- "fmin z9.s, p5/M, z9.s, z0.s\n"
- "fmin z10.s, p5/M, z10.s, z0.s\n"
- "fmin z11.s, p5/M, z11.s, z0.s\n"
- "fmin z12.s, p5/M, z12.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z1.s\n"
- "fmax z9.s, p5/M, z9.s, z1.s\n"
- "fmax z10.s, p5/M, z10.s, z1.s\n"
- "fmax z11.s, p5/M, z11.s, z1.s\n"
- "fmax z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z0.s\n"
- "fmin z14.s, p5/M, z14.s, z0.s\n"
- "fmin z15.s, p5/M, z15.s, z0.s\n"
- "fmin z16.s, p5/M, z16.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z1.s\n"
- "fmax z14.s, p5/M, z14.s, z1.s\n"
- "fmax z15.s, p5/M, z15.s, z1.s\n"
- "fmax z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z0.s\n"
- "fmin z18.s, p5/M, z18.s, z0.s\n"
- "fmin z19.s, p5/M, z19.s, z0.s\n"
- "fmin z20.s, p5/M, z20.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z1.s\n"
- "fmax z18.s, p5/M, z18.s, z1.s\n"
- "fmax z19.s, p5/M, z19.s, z1.s\n"
- "fmax z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z0.s\n"
- "fmin z22.s, p5/M, z22.s, z0.s\n"
- "fmin z23.s, p5/M, z23.s, z0.s\n"
- "fmin z24.s, p5/M, z24.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z1.s\n"
- "fmax z22.s, p5/M, z22.s, z1.s\n"
- "fmax z23.s, p5/M, z23.s, z1.s\n"
- "fmax z24.s, p5/M, z24.s, z1.s\n"
- "fmin z25.s, p5/M, z25.s, z0.s\n"
- "fmin z26.s, p5/M, z26.s, z0.s\n"
- "fmin z27.s, p5/M, z27.s, z0.s\n"
- "fmin z28.s, p5/M, z28.s, z0.s\n"
- "fmax z25.s, p5/M, z25.s, z1.s\n"
- "fmax z26.s, p5/M, z26.s, z1.s\n"
- "fmax z27.s, p5/M, z27.s, z1.s\n"
- "fmax z28.s, p5/M, z28.s, z1.s\n"
- "fmin z29.s, p5/M, z29.s, z0.s\n"
- "fmin z30.s, p5/M, z30.s, z0.s\n"
- "fmin z31.s, p5/M, z31.s, z0.s\n"
- "fmax z29.s, p5/M, z29.s, z1.s\n"
- "fmax z30.s, p5/M, z30.s, z1.s\n"
- "fmax z31.s, p5/M, z31.s, z1.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z1.s\n"
+ "fmin z10.s, p5/M, z10.s, z1.s\n"
+ "fmin z11.s, p5/M, z11.s, z1.s\n"
+ "fmin z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z1.s\n"
+ "fmin z14.s, p5/M, z14.s, z1.s\n"
+ "fmin z15.s, p5/M, z15.s, z1.s\n"
+ "fmin z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z1.s\n"
+ "fmin z18.s, p5/M, z18.s, z1.s\n"
+ "fmin z19.s, p5/M, z19.s, z1.s\n"
+ "fmin z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z1.s\n"
+ "fmin z22.s, p5/M, z22.s, z1.s\n"
+ "fmin z23.s, p5/M, z23.s, z1.s\n"
+ "fmin z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z1.s\n"
+ "fmin z26.s, p5/M, z26.s, z1.s\n"
+ "fmin z27.s, p5/M, z27.s, z1.s\n"
+ "fmin z28.s, p5/M, z28.s, z1.s\n"
+ "fmin z29.s, p5/M, z29.s, z1.s\n"
+ "fmin z30.s, p5/M, z30.s, z1.s\n"
+ "fmin z31.s, p5/M, z31.s, z1.s\n"
+ "fmax z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z0.s\n"
+ "fmax z10.s, p5/M, z10.s, z0.s\n"
+ "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z0.s\n"
+ "fmax z14.s, p5/M, z14.s, z0.s\n"
+ "fmax z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z0.s\n"
+ "fmax z18.s, p5/M, z18.s, z0.s\n"
+ "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z0.s\n"
+ "fmax z22.s, p5/M, z22.s, z0.s\n"
+ "fmax z23.s, p5/M, z23.s, z0.s\n"
+ "fmax z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z0.s\n"
+ "fmax z26.s, p5/M, z26.s, z0.s\n"
+ "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z28.s, p5/M, z28.s, z0.s\n"
+ "fmax z29.s, p5/M, z29.s, z0.s\n"
+ "fmax z30.s, p5/M, z30.s, z0.s\n"
+ "fmax z31.s, p5/M, z31.s, z0.s\n"
"77:" // Height 6: No activation
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x22]\n"
- "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x21]\n"
- "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
- "st1w { z28.s }, p4, [x20]\n"
- "st1w { z29.s }, p3, [x20, #1, MUL VL]\n"
- "st1w { z30.s }, p2, [x20, #2, MUL VL]\n"
- "st1w { z31.s }, p1, [x20, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
"78:" // Height 6: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 67b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 80f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 79f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"79:" // Update direct input
- "mov x19, #0xc\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0xc\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
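The hunk above renumbers the kernel's working registers (x19 and up shift to x20 and up, with the output base moving from x28 to x9), drops the prfm software prefetches, and regroups the activation clamp so every fmin precedes every fmax; the `.inst`-encoded BFDOT arithmetic itself is unchanged. As a hedged reference model of the indexed BFDOT those encodings perform — plain C++ with illustrative names, ignoring the instruction's exact internal rounding behaviour, and not code from this patch:

#include <cstdint>
#include <cstring>

// Widen a bfloat16 bit pattern to float: bf16 is the top 16 bits of an
// IEEE binary32 value, so a shift is enough.
static float bf16_to_f32(uint16_t h) {
    uint32_t u = uint32_t(h) << 16;
    float f;
    std::memcpy(&f, &u, sizeof f);
    return f;
}

// acc: n32 float lanes; n: 2*n32 bf16 lanes; m: the indexed operand.
// For lane i, idx (0..3) selects one bf16 pair inside i's 128-bit segment
// of m, and that pair is dotted with n's pair for lane i:
//   acc[i] += n[2i]*pair[0] + n[2i+1]*pair[1]
void bfdot_indexed(float *acc, const uint16_t *n, const uint16_t *m,
                   int n32, int idx) {
    for (int i = 0; i < n32; ++i) {
        int seg = (i / 4) * 4;                      // 128 bits = 4 float lanes
        const uint16_t *pair = &m[2 * (seg + idx)]; // idx-th pair in i's segment
        acc[i] += bf16_to_f32(n[2 * i])     * bf16_to_f32(pair[0])
                + bf16_to_f32(n[2 * i + 1]) * bf16_to_f32(pair[1]);
    }
}

Under this model, "bfdot z10.s, z1.h, z7.h[3]" reads as bfdot_indexed(z10, z1, z7, VL/32, 3): z1 holds a vector of weight pairs, while z7 holds a 128-bit quadword of activations that ld1rqh replicated across the register.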
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
new file mode 100644
index 0000000000..223d8a78de
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<bfloat16>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_bf16fp32_mmla_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_bf16fp32_mmla_6x4VL
+{
+public:
+ typedef bfloat16 lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 24.74 };
+ case CPUModel::A510:
+ return { 6.74 };
+ case CPUModel::V1:
+ return { 53.59 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_bf16fp32_mmla_6x4VL;
+ cls_sve_hybrid_bf16fp32_mmla_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
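This interface header is what the framework actually sees of the new kernel: 6 output rows per tile (out_height), 4 SVE vectors of float output columns (out_width), K consumed 4 bf16 elements at a time (the BFMMLA granule, k_unroll), plus rough per-CPU throughput estimates for kernel selection. A hedged sketch of how a caller might turn the blocking getters into a macro-tile grid — vl_f32 stands in for get_vector_length<float>(), which is only known at run time on SVE hardware, and this is not ACL's actual scheduling code:

#include <cstddef>

static std::size_t ceil_div(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

// rows_per_tile mirrors cls_sve_hybrid_bf16fp32_mmla_6x4VL::out_height();
// cols_per_tile mirrors out_width() == get_vector_length<float>() * 4.
std::size_t macro_tile_count(std::size_t M, std::size_t N, std::size_t vl_f32)
{
    const std::size_t rows_per_tile = 6;
    const std::size_t cols_per_tile = vl_f32 * 4;
    return ceil_div(M, rows_per_tile) * ceil_div(N, cols_per_tile);
}

Rounding up is safe because partial tiles are absorbed inside the kernel: the predicated column handling ("whilelt p4.s, x20, x11" and friends) covers a short final column block, and the per-height dispatch at label 1 covers fewer than six remaining rows.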
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..74e2d267bc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
@@ -0,0 +1,2044 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_hybrid_bf16fp32_mmla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
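+ // Note: ka is handed to the inline assembly below through [args_ptr]; the
+ // %[offsetof_*] operands are offsetof(KernelArgs, ...) constants, so the
+ // asm loads each field at a fixed displacement from that single pointer.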
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
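+ // The assembly branches on individual bits of `flags`, as set above:
+ //   bit 0 (0x1): accumulate into the existing output ("tbz %x[flags], #0, ...")
+ //   bit 1 (0x2): apply the minval/maxval activation clamp ("tbz %x[flags], #1, ...")
+ //   bit 2 (0x4): output rows are addressed indirectly
+ //   bit 3 (0x8): input is indirect/multi-string ("tbz %x[flags], #3, ...")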
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 66f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 53f\n"
+ "beq 40f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 27f\n"
+ "beq 14f\n"
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 3f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z16.d, z12.d\n"
+ "zip2 z12.d, z16.d, z12.d\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 8f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x27, #0x8\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z20.h }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n"
+ ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n"
+ ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6470e68a // bfmmla z10.s, z20.h, z16.h\n"
+ ".inst 0x6471e68e // bfmmla z14.s, z20.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n"
+ ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n"
+ "add x26, x26, #0x10\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "addvl x10, x10, #8\n"
+ "ble 11f\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n"
+ ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n"
+ ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n"
+ ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6471e42b // bfmmla z11.s, z1.h, z17.h\n"
+ ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n"
+ "addvl x10, x10, #8\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 6b\n"
+ "uzp1 z8.d, z8.d, z12.d\n"
+ "uzp1 z9.d, z9.d, z13.d\n"
+ "uzp1 z10.d, z10.d, z14.d\n"
+ "uzp1 z11.d, z11.d, z15.d\n"
+ "tbz %x[flags], #1, 12f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z21.s\n"
+ "fmin z9.s, p5/M, z9.s, z21.s\n"
+ "fmin z10.s, p5/M, z10.s, z21.s\n"
+ "fmin z11.s, p5/M, z11.s, z21.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
+ "12:" // Height 1: No activation
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "13:" // Height 1: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 2b\n"
+ "b 80f\n"
+ "14:" // Height 2
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "15:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 16f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "b 18f\n"
+ "16:" // Height 2: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 18f\n"
+ "17:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "18:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "19:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 21f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "21:" // Height 2: input setup done
+ "cmp x27, #0x8\n"
+ "ble 23f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z20.h }, p0/Z, [x26]\n"
+ "ld1rqh { z19.h }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n"
+ ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n"
+ ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6471e68a // bfmmla z10.s, z20.h, z17.h\n"
+ ".inst 0x6470e68e // bfmmla z14.s, z20.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n"
+ ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "bgt 22b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z19.h }, p0/Z, [x25]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "addvl x10, x10, #8\n"
+ "ble 24f\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n"
+ ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n"
+ ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n"
+ ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n"
+ "ld1h { z22.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6476e42b // bfmmla z11.s, z1.h, z22.h\n"
+ ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n"
+ "addvl x10, x10, #8\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 19b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "add x25, x9, x20, LSL #2\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "tbz %x[flags], #1, 25f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z17.s\n"
+ "fmin z12.s, p5/M, z12.s, z17.s\n"
+ "fmin z13.s, p5/M, z13.s, z17.s\n"
+ "fmin z14.s, p5/M, z14.s, z17.s\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z7.s, p5/M, z7.s, z16.s\n"
+ "fmax z12.s, p5/M, z12.s, z16.s\n"
+ "fmax z13.s, p5/M, z13.s, z16.s\n"
+ "fmax z14.s, p5/M, z14.s, z16.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
+ "25:" // Height 2: No activation
+ "st1w { z7.s }, p4, [x9]\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "26:" // Height 2: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 15b\n"
+ "b 80f\n"
+ "27:" // Height 3
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "28:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 29f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "b 31f\n"
+ "29:" // Height 3: no bias
+ "tbz %x[flags], #0, 30f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 31f\n"
+ "30:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "31:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "32:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 34f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "34:" // Height 3: input setup done
+ "cmp x27, #0x8\n"
+ "ble 36f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z30.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "ld1rqh { z28.h }, p0/Z, [x24]\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
+ ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n"
+ ".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n"
+ ".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n"
+ ".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n"
+ ".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n"
+ ".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n"
+ ".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n"
+ ".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n"
+ ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n"
+ ".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n"
+ "bgt 35b\n"
+ "36:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "trn1 z27.d, z1.d, z24.d\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
+ ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "ble 37f\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n"
+ ".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n"
+ ".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n"
+ ".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n"
+ ".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n"
+ ".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n"
+ ".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n"
+ ".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6479e42b // bfmmla z11.s, z1.h, z25.h\n"
+ ".inst 0x6479e473 // bfmmla z19.s, z3.h, z25.h\n"
+ ".inst 0x6478e42f // bfmmla z15.s, z1.h, z24.h\n"
+ ".inst 0x6478e477 // bfmmla z23.s, z3.h, z24.h\n"
+ "37:" // Height 3: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 32b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "tbz %x[flags], #1, 38f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z25.s\n"
+ "fmin z12.s, p5/M, z12.s, z25.s\n"
+ "fmin z13.s, p5/M, z13.s, z25.s\n"
+ "fmin z14.s, p5/M, z14.s, z25.s\n"
+ "fmin z8.s, p5/M, z8.s, z25.s\n"
+ "fmin z9.s, p5/M, z9.s, z25.s\n"
+ "fmin z10.s, p5/M, z10.s, z25.s\n"
+ "fmin z11.s, p5/M, z11.s, z25.s\n"
+ "fmin z16.s, p5/M, z16.s, z25.s\n"
+ "fmin z17.s, p5/M, z17.s, z25.s\n"
+ "fmin z18.s, p5/M, z18.s, z25.s\n"
+ "fmin z19.s, p5/M, z19.s, z25.s\n"
+ "fmax z7.s, p5/M, z7.s, z24.s\n"
+ "fmax z12.s, p5/M, z12.s, z24.s\n"
+ "fmax z13.s, p5/M, z13.s, z24.s\n"
+ "fmax z14.s, p5/M, z14.s, z24.s\n"
+ "fmax z8.s, p5/M, z8.s, z24.s\n"
+ "fmax z9.s, p5/M, z9.s, z24.s\n"
+ "fmax z10.s, p5/M, z10.s, z24.s\n"
+ "fmax z11.s, p5/M, z11.s, z24.s\n"
+ "fmax z16.s, p5/M, z16.s, z24.s\n"
+ "fmax z17.s, p5/M, z17.s, z24.s\n"
+ "fmax z18.s, p5/M, z18.s, z24.s\n"
+ "fmax z19.s, p5/M, z19.s, z24.s\n"
+ "38:" // Height 3: No activation
+ "st1w { z7.s }, p4, [x9]\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "39:" // Height 3: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 28b\n"
+ "b 80f\n"
+ "40:" // Height 4
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "41:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 42f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "b 44f\n"
+ "42:" // Height 4: no bias
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 44f\n"
+ "43:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "44:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "45:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "b 47f\n"
+ "46:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "47:" // Height 4: input setup done
+ "cmp x27, #0x8\n"
+ "ble 49f\n"
+ "48:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z30.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
+ "ld1rqh { z28.h }, p0/Z, [x24]\n"
+ "ld1rqh { z27.h }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "cmp x27, #0x8\n"
+ ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n"
+ ".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n"
+ ".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n"
+ ".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n"
+ ".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n"
+ ".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n"
+ ".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n"
+ ".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n"
+ ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n"
+ ".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n"
+ "bgt 48b\n"
+ "49:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z24.h }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "ld1rqh { z27.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "ble 50f\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n"
+ ".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n"
+ ".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n"
+ ".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n"
+ ".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n"
+ ".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n"
+ ".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n"
+ ".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6479e42b // bfmmla z11.s, z1.h, z25.h\n"
+ ".inst 0x6479e473 // bfmmla z19.s, z3.h, z25.h\n"
+ ".inst 0x6478e42f // bfmmla z15.s, z1.h, z24.h\n"
+ ".inst 0x6478e477 // bfmmla z23.s, z3.h, z24.h\n"
+ "50:" // Height 4: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 45b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "tbz %x[flags], #1, 51f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z23.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z24.s\n"
+ "fmin z12.s, p5/M, z12.s, z24.s\n"
+ "fmin z13.s, p5/M, z13.s, z24.s\n"
+ "fmin z14.s, p5/M, z14.s, z24.s\n"
+ "fmin z8.s, p5/M, z8.s, z24.s\n"
+ "fmin z9.s, p5/M, z9.s, z24.s\n"
+ "fmin z10.s, p5/M, z10.s, z24.s\n"
+ "fmin z11.s, p5/M, z11.s, z24.s\n"
+ "fmin z15.s, p5/M, z15.s, z24.s\n"
+ "fmin z20.s, p5/M, z20.s, z24.s\n"
+ "fmin z21.s, p5/M, z21.s, z24.s\n"
+ "fmin z22.s, p5/M, z22.s, z24.s\n"
+ "fmin z16.s, p5/M, z16.s, z24.s\n"
+ "fmin z17.s, p5/M, z17.s, z24.s\n"
+ "fmin z18.s, p5/M, z18.s, z24.s\n"
+ "fmin z19.s, p5/M, z19.s, z24.s\n"
+ "fmax z7.s, p5/M, z7.s, z23.s\n"
+ "fmax z12.s, p5/M, z12.s, z23.s\n"
+ "fmax z13.s, p5/M, z13.s, z23.s\n"
+ "fmax z14.s, p5/M, z14.s, z23.s\n"
+ "fmax z8.s, p5/M, z8.s, z23.s\n"
+ "fmax z9.s, p5/M, z9.s, z23.s\n"
+ "fmax z10.s, p5/M, z10.s, z23.s\n"
+ "fmax z11.s, p5/M, z11.s, z23.s\n"
+ "fmax z15.s, p5/M, z15.s, z23.s\n"
+ "fmax z20.s, p5/M, z20.s, z23.s\n"
+ "fmax z21.s, p5/M, z21.s, z23.s\n"
+ "fmax z22.s, p5/M, z22.s, z23.s\n"
+ "fmax z16.s, p5/M, z16.s, z23.s\n"
+ "fmax z17.s, p5/M, z17.s, z23.s\n"
+ "fmax z18.s, p5/M, z18.s, z23.s\n"
+ "fmax z19.s, p5/M, z19.s, z23.s\n"
+ "51:" // Height 4: No activation
+ "st1w { z7.s }, p4, [x9]\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "52:" // Height 4: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 41b\n"
+ "b 80f\n"
+ "53:" // Height 5
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "54:" // Height 5: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 55f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z28.d, z12.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z29.d, z13.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z30.d, z14.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z31.d, z15.d\n"
+ "b 57f\n"
+ "55:" // Height 5: no bias
+ "tbz %x[flags], #0, 56f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 57f\n"
+ "56:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "57:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "58:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 59f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 60f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "b 60f\n"
+ "59:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "60:" // Height 5: input setup done
+ "cmp x27, #0x8\n"
+ "ble 62f\n"
+ "61:" // Height 5: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z6.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z7.h }, p0/Z, [x24]\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "trn1 z3.d, z7.d, z2.d\n"
+ "trn2 z7.d, z7.d, z2.d\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6461e4a8 // bfmmla z8.s, z5.h, z1.h\n"
+ ".inst 0x6461e470 // bfmmla z16.s, z3.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6460e4ac // bfmmla z12.s, z5.h, z0.h\n"
+ ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n"
+ ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e4ad // bfmmla z13.s, z5.h, z0.h\n"
+ ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6461e4aa // bfmmla z10.s, z5.h, z1.h\n"
+ ".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n"
+ ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4ae // bfmmla z14.s, z5.h, z0.h\n"
+ ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ ".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n"
+ ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
+ ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ ".inst 0x6461e4f0 // bfmmla z16.s, z7.h, z1.h\n"
+ ".inst 0x6461e498 // bfmmla z24.s, z4.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f4 // bfmmla z20.s, z7.h, z0.h\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
+ ".inst 0x6461e4f1 // bfmmla z17.s, z7.h, z1.h\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ ".inst 0x6461e4f2 // bfmmla z18.s, z7.h, z1.h\n"
+ ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f6 // bfmmla z22.s, z7.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
+ ".inst 0x6461e4f3 // bfmmla z19.s, z7.h, z1.h\n"
+ ".inst 0x6461e49b // bfmmla z27.s, z4.h, z1.h\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e4f7 // bfmmla z23.s, z7.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ "bgt 61b\n"
+ "62:" // Height 5: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z4.h }, p0/Z, [x25]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
+ "ld1rqh { z5.h }, p0/Z, [x22]\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n"
+ ".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n"
+ ".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n"
+ ".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d7 // bfmmla z23.s, z6.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ "ble 63f\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n"
+ ".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
+ ".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n"
+ ".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n"
+ ".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n"
+ ".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6462e42b // bfmmla z11.s, z1.h, z2.h\n"
+ ".inst 0x6462e473 // bfmmla z19.s, z3.h, z2.h\n"
+ ".inst 0x6462e4bb // bfmmla z27.s, z5.h, z2.h\n"
+ ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n"
+ ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
+ "63:" // Height 5: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 58b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "tbz %x[flags], #1, 64f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z23.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z7.s, p5/M, z7.s, z23.s\n"
+ "fmax z12.s, p5/M, z12.s, z23.s\n"
+ "fmax z13.s, p5/M, z13.s, z23.s\n"
+ "fmax z14.s, p5/M, z14.s, z23.s\n"
+ "fmax z8.s, p5/M, z8.s, z23.s\n"
+ "fmax z9.s, p5/M, z9.s, z23.s\n"
+ "fmax z10.s, p5/M, z10.s, z23.s\n"
+ "fmax z11.s, p5/M, z11.s, z23.s\n"
+ "fmax z15.s, p5/M, z15.s, z23.s\n"
+ "fmax z20.s, p5/M, z20.s, z23.s\n"
+ "fmax z21.s, p5/M, z21.s, z23.s\n"
+ "fmax z22.s, p5/M, z22.s, z23.s\n"
+ "fmax z16.s, p5/M, z16.s, z23.s\n"
+ "fmax z17.s, p5/M, z17.s, z23.s\n"
+ "fmax z18.s, p5/M, z18.s, z23.s\n"
+ "fmax z19.s, p5/M, z19.s, z23.s\n"
+ "fmax z24.s, p5/M, z24.s, z23.s\n"
+ "fmax z25.s, p5/M, z25.s, z23.s\n"
+ "fmax z26.s, p5/M, z26.s, z23.s\n"
+ "fmax z27.s, p5/M, z27.s, z23.s\n"
+ "64:" // Height 5: No activation
+ "st1w { z7.s }, p4, [x9]\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
+ "65:" // Height 5: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 54b\n"
+ "b 80f\n"
+ "66:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "67:" // Height 6: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 68f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z28.d, z12.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z29.d, z13.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z30.d, z14.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z31.d, z15.d\n"
+ "b 70f\n"
+ "68:" // Height 6: no bias
+ "tbz %x[flags], #0, 69f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z17.s }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z17.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z20.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 70f\n"
+ "69:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "70:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "71:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 72f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 73f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "add x21, x21, x20, LSL #1\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
+ "73:" // Height 6: input setup done
+ "cmp x27, #0x8\n"
+ "ble 75f\n"
+ "74:" // Height 6: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z7.h }, p0/Z, [x26]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqh { z5.h }, p0/Z, [x24]\n"
+ "ld1rqh { z1.h }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "ld1rqh { z0.h }, p0/Z, [x21]\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n"
+ ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n"
+ ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n"
+ ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
+ ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n"
+ ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n"
+ ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n"
+ ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n"
+ ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n"
+ ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n"
+ ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n"
+ ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n"
+ ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n"
+ ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n"
+ "bgt 74b\n"
+ "75:" // Height 6: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "ld1rqh { z5.h }, p0/Z, [x22]\n"
+ "ld1rqh { z0.h }, p0/Z, [x21]\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n"
+ ".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
+ ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n"
+ ".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n"
+ ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n"
+ ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n"
+ ".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4d7 // bfmmla z23.s, z6.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ "ble 76f\n"
+ "ld1h { z2.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n"
+ ".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
+ ".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n"
+ ".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n"
+ ".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n"
+ ".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n"
+ ".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n"
+ "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n"
+ ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6462e42b // bfmmla z11.s, z1.h, z2.h\n"
+ ".inst 0x6462e473 // bfmmla z19.s, z3.h, z2.h\n"
+ ".inst 0x6462e4bb // bfmmla z27.s, z5.h, z2.h\n"
+ ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n"
+ ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
+ "76:" // Height 6: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 71b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "tbz %x[flags], #1, 77f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p5/Z, [x20]\n"
+ "fmin z7.s, p5/M, z7.s, z1.s\n"
+ "fmin z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z1.s\n"
+ "fmin z14.s, p5/M, z14.s, z1.s\n"
+ "fmin z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z1.s\n"
+ "fmin z10.s, p5/M, z10.s, z1.s\n"
+ "fmin z11.s, p5/M, z11.s, z1.s\n"
+ "fmin z15.s, p5/M, z15.s, z1.s\n"
+ "fmin z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z1.s\n"
+ "fmin z22.s, p5/M, z22.s, z1.s\n"
+ "fmin z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z1.s\n"
+ "fmin z18.s, p5/M, z18.s, z1.s\n"
+ "fmin z19.s, p5/M, z19.s, z1.s\n"
+ "fmin z23.s, p5/M, z23.s, z1.s\n"
+ "fmin z28.s, p5/M, z28.s, z1.s\n"
+ "fmin z29.s, p5/M, z29.s, z1.s\n"
+ "fmin z30.s, p5/M, z30.s, z1.s\n"
+ "fmin z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z1.s\n"
+ "fmin z26.s, p5/M, z26.s, z1.s\n"
+ "fmin z27.s, p5/M, z27.s, z1.s\n"
+ "fmax z7.s, p5/M, z7.s, z0.s\n"
+ "fmax z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z0.s\n"
+ "fmax z14.s, p5/M, z14.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z0.s\n"
+ "fmax z10.s, p5/M, z10.s, z0.s\n"
+ "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z0.s\n"
+ "fmax z22.s, p5/M, z22.s, z0.s\n"
+ "fmax z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z0.s\n"
+ "fmax z18.s, p5/M, z18.s, z0.s\n"
+ "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z23.s, p5/M, z23.s, z0.s\n"
+ "fmax z28.s, p5/M, z28.s, z0.s\n"
+ "fmax z29.s, p5/M, z29.s, z0.s\n"
+ "fmax z30.s, p5/M, z30.s, z0.s\n"
+ "fmax z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z0.s\n"
+ "fmax z26.s, p5/M, z26.s, z0.s\n"
+ "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "77:" // Height 6: No activation
+ "st1w { z7.s }, p4, [x9]\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z23.s }, p4, [x22]\n"
+ "st1w { z28.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z29.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z30.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "78:" // Height 6: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 67b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 80f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 79f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "79:" // Update direct input
+ "mov x20, #0xc\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "80:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
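
For readers tracing the register choreography above: the bf16 kernel keeps each pair of output rows interleaved while it accumulates. Bias values are duplicated into even/odd 64-bit lane pairs with zip1/zip2, pairs of input rows are interleaved with trn1/trn2, and each BFMMLA instruction then adds a 2x2 tile of fp32 partial sums per 128-bit vector segment; the uzp1/uzp2 sequence before writeback undoes the interleave. In the odd Height-5 case only uzp1 is taken for the final accumulator set, discarding the phantom sixth row. Below is a minimal scalar model of a single BFMMLA segment (bf16 operands modelled as float for brevity); it illustrates the 2x2 tile shape only and is not the generated kernel:

#include <array>
#include <cstdio>

// Scalar model of one 128-bit BFMMLA segment:
//   acc (2x2, f32) += A (2x4) * B^T, with B supplied as a 2x4 tile.
// bf16 inputs are modelled as float here; this is an illustration,
// not a bit-accurate emulation of the instruction.
using Tile2x4 = std::array<std::array<float, 4>, 2>;
using Tile2x2 = std::array<std::array<float, 2>, 2>;

void bfmmla_model(Tile2x2 &acc, const Tile2x4 &a, const Tile2x4 &b)
{
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++)
            for (int k = 0; k < 4; k++)
                acc[i][j] += a[i][k] * b[j][k];
}

int main()
{
    Tile2x4 a = {{{1, 2, 3, 4}, {5, 6, 7, 8}}};   // two interleaved A rows (cf. trn1/trn2)
    Tile2x4 b = {{{1, 0, 0, 0}, {0, 1, 0, 0}}};   // two B columns per segment
    Tile2x2 acc = {};                             // zeroed, as in the "no accumulate" path
    bfmmla_model(acc, a, b);
    std::printf("%g %g / %g %g\n", acc[0][0], acc[0][1], acc[1][0], acc[1][1]);
    return 0;
}

Each accumulator register therefore holds results for two output rows side by side, which is why the kernel zips accumulators in on load and uzps them back out around the multiply loop before storing.
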
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
index 5c8563952f..b930e4c0d5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,21 +10,22 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -38,11 +39,13 @@ namespace arm_gemm
{
// Actual kernel implementations
void sve_hybrid_fp16_mla_6x4VL( ARGLIST );
+void sve_hybrid_fp16_mla_6x4VL_a64fx( ARGLIST );
class cls_sve_hybrid_fp16_mla_6x4VL
{
public:
- typedef __fp16 operand_type;
+ typedef __fp16 lhs_operand_type;
+ typedef __fp16 rhs_operand_type;
typedef __fp16 result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,16 +71,41 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 1> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, __fp16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 15.90 };
+ case CPUModel::A510:
+ return { 12.44 };
+ case CPUModel::V1:
+ return { 31.51 };
+ case CPUModel::A64FX:
+ return { 49.14 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_fp16_mla_6x4VL;
- cls_sve_hybrid_fp16_mla_6x4VL(const CPUInfo *)
+ cls_sve_hybrid_fp16_mla_6x4VL(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_hybrid_fp16_mla_6x4VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
#undef ARGLIST
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
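
The header change above follows the library's constructor-time dispatch idiom: the generic sve_hybrid_fp16_mla_6x4VL stays the default, the constructor swaps in the new _a64fx entry point when CPUInfo reports an A64FX, and get_performance_parameters publishes a per-core throughput estimate (15.90 by default, 49.14 on A64FX) that higher-level kernel-selection heuristics can compare across candidates. A condensed sketch of the same pattern, with hypothetical names standing in for the arm_gemm types:

#include <cstdio>

// Simplified stand-ins for CPUModel and the kernel signature; these
// names are illustrative, not the real arm_gemm declarations.
enum class CpuModel { GENERIC, A510, V1, A64FX };
using kern_type = void (*)(const char *);

void kernel_generic(const char *msg) { std::printf("generic: %s\n", msg); }
void kernel_a64fx(const char *msg)   { std::printf("a64fx:   %s\n", msg); }

struct cls_example_kernel {
    kern_type kernel = kernel_generic;        // default to the generic kernel

    explicit cls_example_kernel(CpuModel model)
    {
        switch (model) {                      // detection happens once, here
            default:
                break;
            case CpuModel::A64FX:
                kernel = kernel_a64fx;        // tuned variant for this core
                break;
        }
    }
};

int main()
{
    cls_example_kernel k(CpuModel::A64FX);
    k.kernel("dispatched at construction time");
    return 0;
}

Keeping every variant behind the single kern_type signature means callers never branch on the CPU model themselves; the choice is made once when the class is constructed.
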
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
new file mode 100644
index 0000000000..d1a9bb4a26
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
@@ -0,0 +1,1365 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp16_mla_6x4VL_a64fx (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg,
+ size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg,
+ const __fp16 *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const __fp16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
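+ // Flag bits consumed by the assembly below (set in the code above):
+ //   bit 0 (0x1): accumulate into existing output   -> tested by tbz %x[flags], #0
+ //   bit 1 (0x2): apply the min/max activation clamp -> tested by tbz %x[flags], #1
+ //   bit 2 (0x4): output argument is indirect
+ //   bit 3 (0x8): input rows are indirect pointers   -> tested by tbz %x[flags], #3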
+ __asm__ __volatile__(
+ "ptrue p4.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 61f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 49f\n"
+ "beq 37f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 25f\n"
+ "beq 13f\n"
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p0.h, x20, x11\n"
+ "cbz x12, 3f\n"
+ "ld1h { z8.h }, p4/Z, [x12]\n"
+ "ld1h { z9.h }, p4/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1h { z8.h }, p3/Z, [x9]\n"
+ "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 8f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add x26, x26, #0x2\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
+ "addvl x10, x10, #4\n"
+ "bne 6b\n"
+ "tbz %x[flags], #1, 11f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z16.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z17.h\n"
+ "fmin z9.h, p4/M, z9.h, z17.h\n"
+ "fmin z10.h, p4/M, z10.h, z17.h\n"
+ "fmin z11.h, p4/M, z11.h, z17.h\n"
+ "fmax z8.h, p4/M, z8.h, z16.h\n"
+ "fmax z9.h, p4/M, z9.h, z16.h\n"
+ "fmax z10.h, p4/M, z10.h, z16.h\n"
+ "fmax z11.h, p4/M, z11.h, z16.h\n"
+ "11:" // Height 1: No activation
+ "st1h { z8.h }, p3, [x9]\n"
+ "st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x9, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "12:" // Height 1: Writeback done
+ "dech x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 2b\n"
+ "b 74f\n"
+ "13:" // Height 2
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "14:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p0.h, x20, x11\n"
+ "cbz x12, 15f\n"
+ "ld1h { z8.h }, p4/Z, [x12]\n"
+ "ld1h { z9.h }, p4/Z, [x12, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x12, x12, #4\n"
+ "b 17f\n"
+ "15:" // Height 2: no bias
+ "tbz %x[flags], #0, 16f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x9]\n"
+ "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x20]\n"
+ "ld1h { z13.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 17f\n"
+ "16:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 20f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "20:" // Height 2: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 22f\n"
+ "21:" // Height 2: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x26, x26, #0x2\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
+ "add x25, x25, #0x2\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z14.h, p4/M, z17.h, z1.h\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
+ "fmla z15.h, p4/M, z16.h, z1.h\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 21b\n"
+ "22:" // Height 2: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "fmla z10.h, p4/M, z17.h, z0.h\n"
+ "fmla z14.h, p4/M, z17.h, z1.h\n"
+ "addvl x10, x10, #4\n"
+ "fmla z11.h, p4/M, z16.h, z0.h\n"
+ "fmla z15.h, p4/M, z16.h, z1.h\n"
+ "bne 18b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "tbz %x[flags], #1, 23f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z16.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z17.h\n"
+ "fmin z9.h, p4/M, z9.h, z17.h\n"
+ "fmin z10.h, p4/M, z10.h, z17.h\n"
+ "fmin z11.h, p4/M, z11.h, z17.h\n"
+ "fmin z12.h, p4/M, z12.h, z17.h\n"
+ "fmin z13.h, p4/M, z13.h, z17.h\n"
+ "fmin z14.h, p4/M, z14.h, z17.h\n"
+ "fmin z15.h, p4/M, z15.h, z17.h\n"
+ "fmax z8.h, p4/M, z8.h, z16.h\n"
+ "fmax z9.h, p4/M, z9.h, z16.h\n"
+ "fmax z10.h, p4/M, z10.h, z16.h\n"
+ "fmax z11.h, p4/M, z11.h, z16.h\n"
+ "fmax z12.h, p4/M, z12.h, z16.h\n"
+ "fmax z13.h, p4/M, z13.h, z16.h\n"
+ "fmax z14.h, p4/M, z14.h, z16.h\n"
+ "fmax z15.h, p4/M, z15.h, z16.h\n"
+ "23:" // Height 2: No activation
+ "st1h { z8.h }, p3, [x9]\n"
+ "st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x9, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "24:" // Height 2: Writeback done
+ "dech x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 14b\n"
+ "b 74f\n"
+ "25:" // Height 3
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "26:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p0.h, x20, x11\n"
+ "cbz x12, 27f\n"
+ "ld1h { z8.h }, p4/Z, [x12]\n"
+ "ld1h { z9.h }, p4/Z, [x12, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 29f\n"
+ "27:" // Height 3: no bias
+ "tbz %x[flags], #0, 28f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x9]\n"
+ "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x21]\n"
+ "ld1h { z13.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x20]\n"
+ "ld1h { z17.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 29f\n"
+ "28:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "29:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "30:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 32f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "b 32f\n"
+ "31:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "32:" // Height 3: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 34f\n"
+ "33:" // Height 3: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x26, x26, #0x2\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z21.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x2\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "ld1h { z20.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add x24, x24, #0x2\n"
+ "fmla z10.h, p4/M, z21.h, z0.h\n"
+ "fmla z14.h, p4/M, z21.h, z1.h\n"
+ "fmla z18.h, p4/M, z21.h, z2.h\n"
+ "fmla z11.h, p4/M, z20.h, z0.h\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "fmla z15.h, p4/M, z20.h, z1.h\n"
+ "fmla z19.h, p4/M, z20.h, z2.h\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 33b\n"
+ "34:" // Height 3: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z21.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "ld1h { z20.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.h, p4/M, z21.h, z0.h\n"
+ "fmla z14.h, p4/M, z21.h, z1.h\n"
+ "fmla z18.h, p4/M, z21.h, z2.h\n"
+ "fmla z11.h, p4/M, z20.h, z0.h\n"
+ "fmla z15.h, p4/M, z20.h, z1.h\n"
+ "fmla z19.h, p4/M, z20.h, z2.h\n"
+ "bne 30b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "tbz %x[flags], #1, 35f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z21.h }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z20.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z21.h\n"
+ "fmin z9.h, p4/M, z9.h, z21.h\n"
+ "fmin z10.h, p4/M, z10.h, z21.h\n"
+ "fmin z11.h, p4/M, z11.h, z21.h\n"
+ "fmin z12.h, p4/M, z12.h, z21.h\n"
+ "fmin z13.h, p4/M, z13.h, z21.h\n"
+ "fmin z14.h, p4/M, z14.h, z21.h\n"
+ "fmin z15.h, p4/M, z15.h, z21.h\n"
+ "fmin z16.h, p4/M, z16.h, z21.h\n"
+ "fmin z17.h, p4/M, z17.h, z21.h\n"
+ "fmin z18.h, p4/M, z18.h, z21.h\n"
+ "fmin z19.h, p4/M, z19.h, z21.h\n"
+ "fmax z8.h, p4/M, z8.h, z20.h\n"
+ "fmax z9.h, p4/M, z9.h, z20.h\n"
+ "fmax z10.h, p4/M, z10.h, z20.h\n"
+ "fmax z11.h, p4/M, z11.h, z20.h\n"
+ "fmax z12.h, p4/M, z12.h, z20.h\n"
+ "fmax z13.h, p4/M, z13.h, z20.h\n"
+ "fmax z14.h, p4/M, z14.h, z20.h\n"
+ "fmax z15.h, p4/M, z15.h, z20.h\n"
+ "fmax z16.h, p4/M, z16.h, z20.h\n"
+ "fmax z17.h, p4/M, z17.h, z20.h\n"
+ "fmax z18.h, p4/M, z18.h, z20.h\n"
+ "fmax z19.h, p4/M, z19.h, z20.h\n"
+ "35:" // Height 3: No activation
+ "st1h { z8.h }, p3, [x9]\n"
+ "st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x9, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "36:" // Height 3: Writeback done
+ "dech x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 26b\n"
+ "b 74f\n"
+ "37:" // Height 4
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "38:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p0.h, x20, x11\n"
+ "cbz x12, 39f\n"
+ "ld1h { z8.h }, p4/Z, [x12]\n"
+ "ld1h { z9.h }, p4/Z, [x12, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 41f\n"
+ "39:" // Height 4: no bias
+ "tbz %x[flags], #0, 40f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x9]\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x22]\n"
+ "ld1h { z13.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x21]\n"
+ "ld1h { z17.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x20]\n"
+ "ld1h { z21.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 41f\n"
+ "40:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "41:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "42:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 43f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 44f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "b 44f\n"
+ "43:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "44:" // Height 4: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 46f\n"
+ "45:" // Height 4: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x26, x26, #0x2\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "ld1h { z25.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x2\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "add x24, x24, #0x2\n"
+ "add x23, x23, #0x2\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "ld1h { z24.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.h, p4/M, z25.h, z0.h\n"
+ "fmla z14.h, p4/M, z25.h, z1.h\n"
+ "fmla z18.h, p4/M, z25.h, z2.h\n"
+ "fmla z22.h, p4/M, z25.h, z3.h\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "fmla z11.h, p4/M, z24.h, z0.h\n"
+ "fmla z15.h, p4/M, z24.h, z1.h\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "fmla z19.h, p4/M, z24.h, z2.h\n"
+ "fmla z23.h, p4/M, z24.h, z3.h\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
+ "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 45b\n"
+ "46:" // Height 4: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "ld1h { z25.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "ld1h { z24.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.h, p4/M, z25.h, z0.h\n"
+ "fmla z14.h, p4/M, z25.h, z1.h\n"
+ "fmla z18.h, p4/M, z25.h, z2.h\n"
+ "fmla z22.h, p4/M, z25.h, z3.h\n"
+ "fmla z11.h, p4/M, z24.h, z0.h\n"
+ "fmla z15.h, p4/M, z24.h, z1.h\n"
+ "fmla z19.h, p4/M, z24.h, z2.h\n"
+ "fmla z23.h, p4/M, z24.h, z3.h\n"
+ "bne 42b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "tbz %x[flags], #1, 47f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z25.h }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z24.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z25.h\n"
+ "fmin z9.h, p4/M, z9.h, z25.h\n"
+ "fmin z10.h, p4/M, z10.h, z25.h\n"
+ "fmin z11.h, p4/M, z11.h, z25.h\n"
+ "fmin z12.h, p4/M, z12.h, z25.h\n"
+ "fmin z13.h, p4/M, z13.h, z25.h\n"
+ "fmin z14.h, p4/M, z14.h, z25.h\n"
+ "fmin z15.h, p4/M, z15.h, z25.h\n"
+ "fmin z16.h, p4/M, z16.h, z25.h\n"
+ "fmin z17.h, p4/M, z17.h, z25.h\n"
+ "fmin z18.h, p4/M, z18.h, z25.h\n"
+ "fmin z19.h, p4/M, z19.h, z25.h\n"
+ "fmin z20.h, p4/M, z20.h, z25.h\n"
+ "fmin z21.h, p4/M, z21.h, z25.h\n"
+ "fmin z22.h, p4/M, z22.h, z25.h\n"
+ "fmin z23.h, p4/M, z23.h, z25.h\n"
+ "fmax z8.h, p4/M, z8.h, z24.h\n"
+ "fmax z9.h, p4/M, z9.h, z24.h\n"
+ "fmax z10.h, p4/M, z10.h, z24.h\n"
+ "fmax z11.h, p4/M, z11.h, z24.h\n"
+ "fmax z12.h, p4/M, z12.h, z24.h\n"
+ "fmax z13.h, p4/M, z13.h, z24.h\n"
+ "fmax z14.h, p4/M, z14.h, z24.h\n"
+ "fmax z15.h, p4/M, z15.h, z24.h\n"
+ "fmax z16.h, p4/M, z16.h, z24.h\n"
+ "fmax z17.h, p4/M, z17.h, z24.h\n"
+ "fmax z18.h, p4/M, z18.h, z24.h\n"
+ "fmax z19.h, p4/M, z19.h, z24.h\n"
+ "fmax z20.h, p4/M, z20.h, z24.h\n"
+ "fmax z21.h, p4/M, z21.h, z24.h\n"
+ "fmax z22.h, p4/M, z22.h, z24.h\n"
+ "fmax z23.h, p4/M, z23.h, z24.h\n"
+ "47:" // Height 4: No activation
+ "st1h { z8.h }, p3, [x9]\n"
+ "st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x9, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x23]\n"
+ "st1h { z21.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x23, #3, MUL VL]\n"
+ "48:" // Height 4: Writeback done
+ "dech x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 38b\n"
+ "b 74f\n"
+ "49:" // Height 5
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "50:" // Height 5: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p0.h, x20, x11\n"
+ "cbz x12, 51f\n"
+ "ld1h { z8.h }, p4/Z, [x12]\n"
+ "ld1h { z9.h }, p4/Z, [x12, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 53f\n"
+ "51:" // Height 5: no bias
+ "tbz %x[flags], #0, 52f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x9]\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x23]\n"
+ "ld1h { z13.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x22]\n"
+ "ld1h { z17.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x21]\n"
+ "ld1h { z21.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z24.h }, p3/Z, [x20]\n"
+ "ld1h { z25.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z26.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z27.h }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 53f\n"
+ "52:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "53:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "54:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 55f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 56f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "b 56f\n"
+ "55:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "56:" // Height 5: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
+ "ld1rh { z4.h }, p4/Z, [x22]\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 58f\n"
+ "57:" // Height 5: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x26, x26, #0x2\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "add x25, x25, #0x2\n"
+ "add x24, x24, #0x2\n"
+ "fmla z24.h, p4/M, z6.h, z4.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z29.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x23, x23, #0x2\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "add x22, x22, #0x2\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "fmla z25.h, p4/M, z7.h, z4.h\n"
+ "ld1h { z28.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.h, p4/M, z29.h, z0.h\n"
+ "fmla z14.h, p4/M, z29.h, z1.h\n"
+ "fmla z18.h, p4/M, z29.h, z2.h\n"
+ "fmla z22.h, p4/M, z29.h, z3.h\n"
+ "fmla z26.h, p4/M, z29.h, z4.h\n"
+ "fmla z11.h, p4/M, z28.h, z0.h\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "fmla z15.h, p4/M, z28.h, z1.h\n"
+ "fmla z19.h, p4/M, z28.h, z2.h\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "fmla z23.h, p4/M, z28.h, z3.h\n"
+ "fmla z27.h, p4/M, z28.h, z4.h\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
+ "ld1rh { z4.h }, p4/Z, [x22]\n"
+ "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 57b\n"
+ "58:" // Height 5: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "cmp x28, x20\n"
+ "fmla z24.h, p4/M, z6.h, z4.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z29.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "fmla z25.h, p4/M, z7.h, z4.h\n"
+ "ld1h { z28.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.h, p4/M, z29.h, z0.h\n"
+ "fmla z14.h, p4/M, z29.h, z1.h\n"
+ "fmla z18.h, p4/M, z29.h, z2.h\n"
+ "fmla z22.h, p4/M, z29.h, z3.h\n"
+ "fmla z26.h, p4/M, z29.h, z4.h\n"
+ "fmla z11.h, p4/M, z28.h, z0.h\n"
+ "fmla z15.h, p4/M, z28.h, z1.h\n"
+ "fmla z19.h, p4/M, z28.h, z2.h\n"
+ "fmla z23.h, p4/M, z28.h, z3.h\n"
+ "fmla z27.h, p4/M, z28.h, z4.h\n"
+ "bne 54b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "tbz %x[flags], #1, 59f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z29.h }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z28.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z29.h\n"
+ "fmin z9.h, p4/M, z9.h, z29.h\n"
+ "fmin z10.h, p4/M, z10.h, z29.h\n"
+ "fmin z11.h, p4/M, z11.h, z29.h\n"
+ "fmin z12.h, p4/M, z12.h, z29.h\n"
+ "fmin z13.h, p4/M, z13.h, z29.h\n"
+ "fmin z14.h, p4/M, z14.h, z29.h\n"
+ "fmin z15.h, p4/M, z15.h, z29.h\n"
+ "fmin z16.h, p4/M, z16.h, z29.h\n"
+ "fmin z17.h, p4/M, z17.h, z29.h\n"
+ "fmin z18.h, p4/M, z18.h, z29.h\n"
+ "fmin z19.h, p4/M, z19.h, z29.h\n"
+ "fmin z20.h, p4/M, z20.h, z29.h\n"
+ "fmin z21.h, p4/M, z21.h, z29.h\n"
+ "fmin z22.h, p4/M, z22.h, z29.h\n"
+ "fmin z23.h, p4/M, z23.h, z29.h\n"
+ "fmin z24.h, p4/M, z24.h, z29.h\n"
+ "fmin z25.h, p4/M, z25.h, z29.h\n"
+ "fmin z26.h, p4/M, z26.h, z29.h\n"
+ "fmin z27.h, p4/M, z27.h, z29.h\n"
+ "fmax z8.h, p4/M, z8.h, z28.h\n"
+ "fmax z9.h, p4/M, z9.h, z28.h\n"
+ "fmax z10.h, p4/M, z10.h, z28.h\n"
+ "fmax z11.h, p4/M, z11.h, z28.h\n"
+ "fmax z12.h, p4/M, z12.h, z28.h\n"
+ "fmax z13.h, p4/M, z13.h, z28.h\n"
+ "fmax z14.h, p4/M, z14.h, z28.h\n"
+ "fmax z15.h, p4/M, z15.h, z28.h\n"
+ "fmax z16.h, p4/M, z16.h, z28.h\n"
+ "fmax z17.h, p4/M, z17.h, z28.h\n"
+ "fmax z18.h, p4/M, z18.h, z28.h\n"
+ "fmax z19.h, p4/M, z19.h, z28.h\n"
+ "fmax z20.h, p4/M, z20.h, z28.h\n"
+ "fmax z21.h, p4/M, z21.h, z28.h\n"
+ "fmax z22.h, p4/M, z22.h, z28.h\n"
+ "fmax z23.h, p4/M, z23.h, z28.h\n"
+ "fmax z24.h, p4/M, z24.h, z28.h\n"
+ "fmax z25.h, p4/M, z25.h, z28.h\n"
+ "fmax z26.h, p4/M, z26.h, z28.h\n"
+ "fmax z27.h, p4/M, z27.h, z28.h\n"
+ "59:" // Height 5: No activation
+ "st1h { z8.h }, p3, [x9]\n"
+ "st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x9, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x23]\n"
+ "st1h { z21.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p3, [x22]\n"
+ "st1h { z25.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [x22, #3, MUL VL]\n"
+ "60:" // Height 5: Writeback done
+ "dech x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 50b\n"
+ "b 74f\n"
+ "61:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0xc\n"
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "62:" // Height 6: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p0.h, x20, x11\n"
+ "cbz x12, 63f\n"
+ "ld1h { z8.h }, p4/Z, [x12]\n"
+ "ld1h { z9.h }, p4/Z, [x12, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z10.h }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x12, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 65f\n"
+ "63:" // Height 6: no bias
+ "tbz %x[flags], #0, 64f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x9]\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x24]\n"
+ "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x23]\n"
+ "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x22]\n"
+ "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z24.h }, p3/Z, [x21]\n"
+ "ld1h { z25.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z26.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z27.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z28.h }, p3/Z, [x20]\n"
+ "ld1h { z29.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z30.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z31.h }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 65f\n"
+ "64:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "65:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "66:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 68f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "add x21, x21, x20, LSL #1\n"
+ "b 68f\n"
+ "67:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
+ "68:" // Height 6: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
+ "ld1rh { z4.h }, p4/Z, [x22]\n"
+ "ld1rh { z5.h }, p4/Z, [x21]\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 70f\n"
+ "69:" // Height 6: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x26, x26, #0x2\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "add x25, x25, #0x2\n"
+ "add x24, x24, #0x2\n"
+ "fmla z24.h, p4/M, z6.h, z4.h\n"
+ "fmla z28.h, p4/M, z6.h, z5.h\n"
+ "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x23, x23, #0x2\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "add x22, x22, #0x2\n"
+ "add x21, x21, #0x2\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "fmla z25.h, p4/M, z7.h, z4.h\n"
+ "fmla z29.h, p4/M, z7.h, z5.h\n"
+ "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z18.h, p4/M, z6.h, z2.h\n"
+ "fmla z22.h, p4/M, z6.h, z3.h\n"
+ "fmla z26.h, p4/M, z6.h, z4.h\n"
+ "fmla z30.h, p4/M, z6.h, z5.h\n"
+ "ld1h { z6.h }, p4/Z, [x10]\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "ld1rh { z0.h }, p4/Z, [x26]\n"
+ "ld1rh { z1.h }, p4/Z, [x25]\n"
+ "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "ld1rh { z2.h }, p4/Z, [x24]\n"
+ "ld1rh { z3.h }, p4/Z, [x23]\n"
+ "fmla z27.h, p4/M, z7.h, z4.h\n"
+ "fmla z31.h, p4/M, z7.h, z5.h\n"
+ "ld1rh { z4.h }, p4/Z, [x22]\n"
+ "ld1rh { z5.h }, p4/Z, [x21]\n"
+ "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 69b\n"
+ "70:" // Height 6: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "cmp x28, x20\n"
+ "fmla z24.h, p4/M, z6.h, z4.h\n"
+ "fmla z28.h, p4/M, z6.h, z5.h\n"
+ "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "fmla z25.h, p4/M, z7.h, z4.h\n"
+ "fmla z29.h, p4/M, z7.h, z5.h\n"
+ "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z18.h, p4/M, z6.h, z2.h\n"
+ "fmla z22.h, p4/M, z6.h, z3.h\n"
+ "fmla z26.h, p4/M, z6.h, z4.h\n"
+ "fmla z30.h, p4/M, z6.h, z5.h\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "fmla z27.h, p4/M, z7.h, z4.h\n"
+ "fmla z31.h, p4/M, z7.h, z5.h\n"
+ "bne 66b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "tbz %x[flags], #1, 71f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z0.h }, p4/Z, [x20]\n"
+ "fmin z8.h, p4/M, z8.h, z1.h\n"
+ "fmin z9.h, p4/M, z9.h, z1.h\n"
+ "fmin z10.h, p4/M, z10.h, z1.h\n"
+ "fmin z11.h, p4/M, z11.h, z1.h\n"
+ "fmin z12.h, p4/M, z12.h, z1.h\n"
+ "fmin z13.h, p4/M, z13.h, z1.h\n"
+ "fmin z14.h, p4/M, z14.h, z1.h\n"
+ "fmin z15.h, p4/M, z15.h, z1.h\n"
+ "fmin z16.h, p4/M, z16.h, z1.h\n"
+ "fmin z17.h, p4/M, z17.h, z1.h\n"
+ "fmin z18.h, p4/M, z18.h, z1.h\n"
+ "fmin z19.h, p4/M, z19.h, z1.h\n"
+ "fmin z20.h, p4/M, z20.h, z1.h\n"
+ "fmin z21.h, p4/M, z21.h, z1.h\n"
+ "fmin z22.h, p4/M, z22.h, z1.h\n"
+ "fmin z23.h, p4/M, z23.h, z1.h\n"
+ "fmin z24.h, p4/M, z24.h, z1.h\n"
+ "fmin z25.h, p4/M, z25.h, z1.h\n"
+ "fmin z26.h, p4/M, z26.h, z1.h\n"
+ "fmin z27.h, p4/M, z27.h, z1.h\n"
+ "fmin z28.h, p4/M, z28.h, z1.h\n"
+ "fmin z29.h, p4/M, z29.h, z1.h\n"
+ "fmin z30.h, p4/M, z30.h, z1.h\n"
+ "fmin z31.h, p4/M, z31.h, z1.h\n"
+ "fmax z8.h, p4/M, z8.h, z0.h\n"
+ "fmax z9.h, p4/M, z9.h, z0.h\n"
+ "fmax z10.h, p4/M, z10.h, z0.h\n"
+ "fmax z11.h, p4/M, z11.h, z0.h\n"
+ "fmax z12.h, p4/M, z12.h, z0.h\n"
+ "fmax z13.h, p4/M, z13.h, z0.h\n"
+ "fmax z14.h, p4/M, z14.h, z0.h\n"
+ "fmax z15.h, p4/M, z15.h, z0.h\n"
+ "fmax z16.h, p4/M, z16.h, z0.h\n"
+ "fmax z17.h, p4/M, z17.h, z0.h\n"
+ "fmax z18.h, p4/M, z18.h, z0.h\n"
+ "fmax z19.h, p4/M, z19.h, z0.h\n"
+ "fmax z20.h, p4/M, z20.h, z0.h\n"
+ "fmax z21.h, p4/M, z21.h, z0.h\n"
+ "fmax z22.h, p4/M, z22.h, z0.h\n"
+ "fmax z23.h, p4/M, z23.h, z0.h\n"
+ "fmax z24.h, p4/M, z24.h, z0.h\n"
+ "fmax z25.h, p4/M, z25.h, z0.h\n"
+ "fmax z26.h, p4/M, z26.h, z0.h\n"
+ "fmax z27.h, p4/M, z27.h, z0.h\n"
+ "fmax z28.h, p4/M, z28.h, z0.h\n"
+ "fmax z29.h, p4/M, z29.h, z0.h\n"
+ "fmax z30.h, p4/M, z30.h, z0.h\n"
+ "fmax z31.h, p4/M, z31.h, z0.h\n"
+ "71:" // Height 6: No activation
+ "st1h { z8.h }, p3, [x9]\n"
+ "st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x9, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z12.h }, p3, [x25]\n"
+ "st1h { z13.h }, p2, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x24]\n"
+ "st1h { z17.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x23]\n"
+ "st1h { z21.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p3, [x22]\n"
+ "st1h { z25.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [x22, #3, MUL VL]\n"
+ "st1h { z28.h }, p3, [x21]\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z30.h }, p1, [x21, #2, MUL VL]\n"
+ "st1h { z31.h }, p0, [x21, #3, MUL VL]\n"
+ "72:" // Height 6: Writeback done
+ "dech x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 62b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 74f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 73f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "73:" // Update direct input
+ "mov x20, #0xc\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "74:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
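
(Reference-only sketch, not part of the patch.) Every height block in the kernel above follows the same contract: each accumulator row starts from the bias, from the previous output when accumulating, or from zero; it gathers fused multiply-adds over K via fmla with broadcast ld1rh loads of A and vector ld1h loads of B; it is optionally clamped between the activation min/max bounds with the fmin/fmax pair; and it is written back with predicated st1h stores. A minimal scalar C++ sketch of that contract follows. The function name and argument layout here are illustrative, not the kernel's real interface; the generated kernel operates on FP16 and keeps up to 6 rows by 4 SVE vectors of accumulators resident in z8..z31.

// Reference-only sketch; assumes row-major A (MxK), B (KxN), C (MxN).
#include <algorithm>
#include <cstddef>

void ref_hybrid_gemm(std::size_t M, std::size_t N, std::size_t K,
                     const float *A, const float *B, float *C,
                     const float *bias,  // may be nullptr ("no bias" path)
                     bool accumulate,    // "no accumulate" path when false
                     bool clamp, float minval, float maxval)
{
    for (std::size_t m = 0; m < M; ++m)
    {
        for (std::size_t n = 0; n < N; ++n)
        {
            // Accumulator setup: bias, previous output, or zero.
            float acc = bias ? bias[n] : (accumulate ? C[m * N + n] : 0.0f);
            for (std::size_t k = 0; k < K; ++k)
            {
                acc += A[m * K + k] * B[k * N + n]; // the fmla step
            }
            if (clamp) // the fmin/fmax activation pair
            {
                acc = std::max(minval, std::min(acc, maxval));
            }
            C[m * N + n] = acc; // predicated st1h in the real kernel
        }
    }
}
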
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
index 7cc03bbfb5..041825df6b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,18 +10,18 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -102,32 +102,32 @@ void sve_hybrid_fp16_mla_6x4VL (
"cmp %x[M], #0x2\n"
"bgt 27f\n"
"beq 14f\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x9, %x[bias]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.h, x19, x11\n"
- "inch x19\n"
- "whilelt p3.h, x19, x11\n"
- "inch x19\n"
- "whilelt p2.h, x19, x11\n"
- "inch x19\n"
- "whilelt p1.h, x19, x11\n"
- "cbz x9, 3f\n"
- "ld1h { z8.h }, p5/Z, [x9]\n"
- "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p3.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x11\n"
+ "cbz x12, 3f\n"
+ "ld1h { z8.h }, p5/Z, [x12]\n"
+ "ld1h { z9.h }, p5/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
"b 5f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 4f\n"
- "ld1h { z8.h }, p4/Z, [x28]\n"
- "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z8.h }, p4/Z, [x9]\n"
+ "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
"b 5f\n"
"4:" // Height 1: no accumulate
"mov z8.b, #0x0\n"
@@ -135,255 +135,252 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z10.b, #0x0\n"
"mov z11.b, #0x0\n"
"5:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 8f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 8f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
"b 8f\n"
"7:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x26, %x[input_ptr]\n"
"8:" // Height 1: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "cmp x26, #0x8\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z8.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z10.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z11.h, z16.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[2]\n"
+ "fmla z9.h, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[2]\n"
+ "fmla z11.h, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[3]\n"
+ "fmla z9.h, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[3]\n"
+ "fmla z11.h, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[4]\n"
+ "fmla z9.h, z16.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[4]\n"
+ "fmla z11.h, z16.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[5]\n"
+ "fmla z9.h, z16.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z10.h, z17.h, z0.h[5]\n"
+ "fmla z11.h, z16.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[6]\n"
+ "fmla z9.h, z16.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[6]\n"
+ "fmla z11.h, z16.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[7]\n"
+ "fmla z9.h, z16.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x8\n"
+ "cmp x27, #0x8\n"
+ "fmla z10.h, z17.h, z0.h[7]\n"
+ "fmla z11.h, z16.h, z0.h[7]\n"
+ "add x26, x26, #0x10\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1h { z16.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.h, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[0]\n"
+ "fmla z11.h, z16.h, z0.h[0]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[1]\n"
+ "fmla z9.h, z16.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[1]\n"
+ "fmla z11.h, z16.h, z0.h[1]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[2]\n"
+ "fmla z9.h, z16.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[2]\n"
+ "fmla z11.h, z16.h, z0.h[2]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[3]\n"
+ "fmla z9.h, z16.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[3]\n"
+ "fmla z11.h, z16.h, z0.h[3]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[4]\n"
+ "fmla z9.h, z16.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[4]\n"
+ "fmla z11.h, z16.h, z0.h[4]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[5]\n"
+ "fmla z9.h, z16.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[5]\n"
+ "fmla z11.h, z16.h, z0.h[5]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[6]\n"
+ "fmla z9.h, z16.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[6]\n"
+ "fmla z11.h, z16.h, z0.h[6]\n"
"addvl x10, x10, #4\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
"ble 11f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[7]\n"
+ "fmla z9.h, z16.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[7]\n"
+ "fmla z11.h, z16.h, z0.h[7]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
"11:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z1.h }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z0.h }, p5/Z, [x19]\n"
- "fmin z8.h, p5/M, z8.h, z0.h\n"
- "fmin z9.h, p5/M, z9.h, z0.h\n"
- "fmin z10.h, p5/M, z10.h, z0.h\n"
- "fmin z11.h, p5/M, z11.h, z0.h\n"
- "fmax z8.h, p5/M, z8.h, z1.h\n"
- "fmax z9.h, p5/M, z9.h, z1.h\n"
- "fmax z10.h, p5/M, z10.h, z1.h\n"
- "fmax z11.h, p5/M, z11.h, z1.h\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z16.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z17.h\n"
+ "fmin z9.h, p5/M, z9.h, z17.h\n"
+ "fmin z10.h, p5/M, z10.h, z17.h\n"
+ "fmin z11.h, p5/M, z11.h, z17.h\n"
+ "fmax z8.h, p5/M, z8.h, z16.h\n"
+ "fmax z9.h, p5/M, z9.h, z16.h\n"
+ "fmax z10.h, p5/M, z10.h, z16.h\n"
+ "fmax z11.h, p5/M, z11.h, z16.h\n"
"12:" // Height 1: No activation
- "st1h { z8.h }, p4, [x28]\n"
- "st1h { z9.h }, p3, [x28, #1, MUL VL]\n"
- "st1h { z10.h }, p2, [x28, #2, MUL VL]\n"
- "st1h { z11.h }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
+ "st1h { z8.h }, p4, [x9]\n"
+ "st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"13:" // Height 1: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 2b\n"
"b 80f\n"
"14:" // Height 2
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"15:" // Height 2: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.h, x19, x11\n"
- "inch x19\n"
- "whilelt p3.h, x19, x11\n"
- "inch x19\n"
- "whilelt p2.h, x19, x11\n"
- "inch x19\n"
- "whilelt p1.h, x19, x11\n"
- "cbz x9, 16f\n"
- "ld1h { z8.h }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p3.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x11\n"
+ "cbz x12, 16f\n"
+ "ld1h { z8.h }, p5/Z, [x12]\n"
+ "ld1h { z9.h }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n"
"mov z13.d, z9.d\n"
- "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
+ "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "addvl x12, x12, #4\n"
"b 18f\n"
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1h { z8.h }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #1\n"
- "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x24]\n"
- "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x9]\n"
+ "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x20]\n"
+ "ld1h { z13.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 18f\n"
"17:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -395,351 +392,345 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z14.b, #0x0\n"
"mov z15.b, #0x0\n"
"18:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x27, 21f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 21f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
"b 21f\n"
"20:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
"21:" // Height 2: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x8\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z1.h }, p0/Z, [x26]\n"
"ld1rqh { z0.h }, p0/Z, [x25]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[0]\n"
+ "fmla z12.h, z17.h, z0.h[0]\n"
+ "fmla z9.h, z16.h, z1.h[0]\n"
+ "fmla z13.h, z16.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[0]\n"
+ "fmla z14.h, z17.h, z0.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "cmp x27, #0x8\n"
+ "fmla z11.h, z16.h, z1.h[0]\n"
+ "fmla z15.h, z16.h, z0.h[0]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z8.h, z17.h, z1.h[1]\n"
+ "fmla z12.h, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
"add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x8\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[1]\n"
+ "fmla z13.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[1]\n"
+ "fmla z14.h, z17.h, z0.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[1]\n"
+ "fmla z15.h, z16.h, z0.h[1]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[2]\n"
+ "fmla z12.h, z17.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[2]\n"
+ "fmla z13.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[2]\n"
+ "fmla z14.h, z17.h, z0.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[2]\n"
+ "fmla z15.h, z16.h, z0.h[2]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[3]\n"
+ "fmla z12.h, z17.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[3]\n"
+ "fmla z13.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[3]\n"
+ "fmla z14.h, z17.h, z0.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "fmla z11.h, z16.h, z1.h[3]\n"
+ "fmla z15.h, z16.h, z0.h[3]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[4]\n"
+ "fmla z12.h, z17.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[4]\n"
+ "fmla z13.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[4]\n"
+ "fmla z14.h, z17.h, z0.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[4]\n"
+ "fmla z15.h, z16.h, z0.h[4]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[5]\n"
+ "fmla z12.h, z17.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[5]\n"
+ "fmla z13.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z10.h, z17.h, z1.h[5]\n"
+ "fmla z14.h, z17.h, z0.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[5]\n"
+ "fmla z15.h, z16.h, z0.h[5]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[6]\n"
+ "fmla z12.h, z17.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[6]\n"
+ "fmla z13.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[6]\n"
+ "fmla z14.h, z17.h, z0.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.h, z16.h, z1.h[6]\n"
+ "fmla z15.h, z16.h, z0.h[6]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z17.h, z1.h[7]\n"
+ "fmla z12.h, z17.h, z0.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.h, z16.h, z1.h[7]\n"
+ "fmla z13.h, z16.h, z0.h[7]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z17.h, z1.h[7]\n"
+ "fmla z14.h, z17.h, z0.h[7]\n"
+ "fmla z11.h, z16.h, z1.h[7]\n"
+ "fmla z15.h, z16.h, z0.h[7]\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[0]\n"
+ "fmla z12.h, z17.h, z1.h[0]\n"
+ "fmla z9.h, z16.h, z0.h[0]\n"
+ "fmla z13.h, z16.h, z1.h[0]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[0]\n"
+ "fmla z14.h, z17.h, z1.h[0]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z11.h, z16.h, z0.h[0]\n"
+ "fmla z15.h, z16.h, z1.h[0]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[1]\n"
+ "fmla z12.h, z17.h, z1.h[1]\n"
+ "fmla z9.h, z16.h, z0.h[1]\n"
+ "fmla z13.h, z16.h, z1.h[1]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[1]\n"
+ "fmla z14.h, z17.h, z1.h[1]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z11.h, z16.h, z0.h[1]\n"
+ "fmla z15.h, z16.h, z1.h[1]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[2]\n"
+ "fmla z12.h, z17.h, z1.h[2]\n"
+ "fmla z9.h, z16.h, z0.h[2]\n"
+ "fmla z13.h, z16.h, z1.h[2]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[2]\n"
+ "fmla z14.h, z17.h, z1.h[2]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z11.h, z16.h, z0.h[2]\n"
+ "fmla z15.h, z16.h, z1.h[2]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[3]\n"
+ "fmla z12.h, z17.h, z1.h[3]\n"
+ "fmla z9.h, z16.h, z0.h[3]\n"
+ "fmla z13.h, z16.h, z1.h[3]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[3]\n"
+ "fmla z14.h, z17.h, z1.h[3]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z11.h, z16.h, z0.h[3]\n"
+ "fmla z15.h, z16.h, z1.h[3]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[4]\n"
+ "fmla z12.h, z17.h, z1.h[4]\n"
+ "fmla z9.h, z16.h, z0.h[4]\n"
+ "fmla z13.h, z16.h, z1.h[4]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[4]\n"
+ "fmla z14.h, z17.h, z1.h[4]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z11.h, z16.h, z0.h[4]\n"
+ "fmla z15.h, z16.h, z1.h[4]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[5]\n"
+ "fmla z12.h, z17.h, z1.h[5]\n"
+ "fmla z9.h, z16.h, z0.h[5]\n"
+ "fmla z13.h, z16.h, z1.h[5]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[5]\n"
+ "fmla z14.h, z17.h, z1.h[5]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z11.h, z16.h, z0.h[5]\n"
+ "fmla z15.h, z16.h, z1.h[5]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[6]\n"
+ "fmla z12.h, z17.h, z1.h[6]\n"
+ "fmla z9.h, z16.h, z0.h[6]\n"
+ "fmla z13.h, z16.h, z1.h[6]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.h, z17.h, z0.h[6]\n"
+ "fmla z14.h, z17.h, z1.h[6]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z11.h, z16.h, z0.h[6]\n"
+ "fmla z15.h, z16.h, z1.h[6]\n"
"ble 24f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z17.h, z0.h[7]\n"
+ "fmla z12.h, z17.h, z1.h[7]\n"
+ "fmla z9.h, z16.h, z0.h[7]\n"
+ "fmla z13.h, z16.h, z1.h[7]\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z17.h, z0.h[7]\n"
+ "fmla z14.h, z17.h, z1.h[7]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z11.h, z16.h, z0.h[7]\n"
+ "fmla z15.h, z16.h, z1.h[7]\n"
"24:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 19b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
"tbz %x[flags], #1, 25f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z1.h }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z0.h }, p5/Z, [x19]\n"
- "fmin z8.h, p5/M, z8.h, z0.h\n"
- "fmin z9.h, p5/M, z9.h, z0.h\n"
- "fmin z10.h, p5/M, z10.h, z0.h\n"
- "fmin z11.h, p5/M, z11.h, z0.h\n"
- "fmin z12.h, p5/M, z12.h, z0.h\n"
- "fmax z8.h, p5/M, z8.h, z1.h\n"
- "fmax z9.h, p5/M, z9.h, z1.h\n"
- "fmax z10.h, p5/M, z10.h, z1.h\n"
- "fmax z11.h, p5/M, z11.h, z1.h\n"
- "fmax z12.h, p5/M, z12.h, z1.h\n"
- "fmin z13.h, p5/M, z13.h, z0.h\n"
- "fmin z14.h, p5/M, z14.h, z0.h\n"
- "fmin z15.h, p5/M, z15.h, z0.h\n"
- "fmax z13.h, p5/M, z13.h, z1.h\n"
- "fmax z14.h, p5/M, z14.h, z1.h\n"
- "fmax z15.h, p5/M, z15.h, z1.h\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z17.h }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z16.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z17.h\n"
+ "fmin z9.h, p5/M, z9.h, z17.h\n"
+ "fmin z10.h, p5/M, z10.h, z17.h\n"
+ "fmin z11.h, p5/M, z11.h, z17.h\n"
+ "fmin z12.h, p5/M, z12.h, z17.h\n"
+ "fmin z13.h, p5/M, z13.h, z17.h\n"
+ "fmin z14.h, p5/M, z14.h, z17.h\n"
+ "fmin z15.h, p5/M, z15.h, z17.h\n"
+ "fmax z8.h, p5/M, z8.h, z16.h\n"
+ "fmax z9.h, p5/M, z9.h, z16.h\n"
+ "fmax z10.h, p5/M, z10.h, z16.h\n"
+ "fmax z11.h, p5/M, z11.h, z16.h\n"
+ "fmax z12.h, p5/M, z12.h, z16.h\n"
+ "fmax z13.h, p5/M, z13.h, z16.h\n"
+ "fmax z14.h, p5/M, z14.h, z16.h\n"
+ "fmax z15.h, p5/M, z15.h, z16.h\n"
"25:" // Height 2: No activation
- "st1h { z8.h }, p4, [x28]\n"
- "st1h { z9.h }, p3, [x28, #1, MUL VL]\n"
- "st1h { z10.h }, p2, [x28, #2, MUL VL]\n"
- "st1h { z11.h }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1h { z12.h }, p4, [x24]\n"
- "st1h { z13.h }, p3, [x24, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z8.h }, p4, [x9]\n"
+ "st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
"26:" // Height 2: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 15b\n"
"b 80f\n"
"27:" // Height 3
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"28:" // Height 3: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.h, x19, x11\n"
- "inch x19\n"
- "whilelt p3.h, x19, x11\n"
- "inch x19\n"
- "whilelt p2.h, x19, x11\n"
- "inch x19\n"
- "whilelt p1.h, x19, x11\n"
- "cbz x9, 29f\n"
- "ld1h { z8.h }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p3.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x11\n"
+ "cbz x12, 29f\n"
+ "ld1h { z8.h }, p5/Z, [x12]\n"
+ "ld1h { z9.h }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n"
- "mov z16.d, z8.d\n"
- "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n"
"mov z13.d, z9.d\n"
- "addvl x9, x9, #4\n"
- "mov z17.d, z9.d\n"
+ "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
"b 31f\n"
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1h { z8.h }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #1\n"
- "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x24, x19, LSL #1\n"
- "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x24]\n"
- "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x23]\n"
- "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x9]\n"
+ "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x21]\n"
+ "ld1h { z13.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x20]\n"
+ "ld1h { z17.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 31f\n"
"30:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -755,446 +746,437 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z18.b, #0x0\n"
"mov z19.b, #0x0\n"
"31:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "cbnz x27, 34f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 34f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
"b 34f\n"
"33:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
"34:" // Height 3: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x24]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z21.h, z2.h[0]\n"
+ "fmla z12.h, z21.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.h, z21.h, z0.h[0]\n"
+ "fmla z9.h, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[0]\n"
+ "fmla z17.h, z20.h, z0.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "cmp x27, #0x8\n"
+ "fmla z10.h, z21.h, z2.h[0]\n"
+ "fmla z14.h, z21.h, z1.h[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "fmla z18.h, z21.h, z0.h[0]\n"
+ "fmla z11.h, z20.h, z2.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x8\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[0]\n"
+ "fmla z19.h, z20.h, z0.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[1]\n"
+ "fmla z12.h, z21.h, z1.h[1]\n"
+ "fmla z16.h, z21.h, z0.h[1]\n"
+ "fmla z9.h, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[1]\n"
+ "fmla z17.h, z20.h, z0.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[1]\n"
+ "fmla z14.h, z21.h, z1.h[1]\n"
+ "fmla z18.h, z21.h, z0.h[1]\n"
+ "fmla z11.h, z20.h, z2.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[1]\n"
+ "fmla z19.h, z20.h, z0.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[2]\n"
+ "fmla z12.h, z21.h, z1.h[2]\n"
+ "fmla z16.h, z21.h, z0.h[2]\n"
+ "fmla z9.h, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[2]\n"
+ "fmla z17.h, z20.h, z0.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[2]\n"
+ "fmla z14.h, z21.h, z1.h[2]\n"
+ "fmla z18.h, z21.h, z0.h[2]\n"
+ "fmla z11.h, z20.h, z2.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[2]\n"
+ "fmla z19.h, z20.h, z0.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[3]\n"
+ "fmla z12.h, z21.h, z1.h[3]\n"
+ "fmla z16.h, z21.h, z0.h[3]\n"
+ "fmla z9.h, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[3]\n"
+ "fmla z17.h, z20.h, z0.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[3]\n"
+ "fmla z14.h, z21.h, z1.h[3]\n"
+ "fmla z18.h, z21.h, z0.h[3]\n"
+ "fmla z11.h, z20.h, z2.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "fmla z15.h, z20.h, z1.h[3]\n"
+ "fmla z19.h, z20.h, z0.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[4]\n"
+ "fmla z12.h, z21.h, z1.h[4]\n"
+ "fmla z16.h, z21.h, z0.h[4]\n"
+ "fmla z9.h, z20.h, z2.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[4]\n"
+ "fmla z17.h, z20.h, z0.h[4]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[4]\n"
+ "fmla z14.h, z21.h, z1.h[4]\n"
+ "fmla z18.h, z21.h, z0.h[4]\n"
+ "fmla z11.h, z20.h, z2.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[4]\n"
+ "fmla z19.h, z20.h, z0.h[4]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[5]\n"
+ "fmla z12.h, z21.h, z1.h[5]\n"
+ "fmla z16.h, z21.h, z0.h[5]\n"
+ "fmla z9.h, z20.h, z2.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[5]\n"
+ "fmla z17.h, z20.h, z0.h[5]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z10.h, z21.h, z2.h[5]\n"
+ "fmla z14.h, z21.h, z1.h[5]\n"
+ "fmla z18.h, z21.h, z0.h[5]\n"
+ "fmla z11.h, z20.h, z2.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[5]\n"
+ "fmla z19.h, z20.h, z0.h[5]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[6]\n"
+ "fmla z12.h, z21.h, z1.h[6]\n"
+ "fmla z16.h, z21.h, z0.h[6]\n"
+ "fmla z9.h, z20.h, z2.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[6]\n"
+ "fmla z17.h, z20.h, z0.h[6]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[6]\n"
+ "fmla z14.h, z21.h, z1.h[6]\n"
+ "fmla z18.h, z21.h, z0.h[6]\n"
+ "fmla z11.h, z20.h, z2.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z15.h, z20.h, z1.h[6]\n"
+ "fmla z19.h, z20.h, z0.h[6]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z21.h, z2.h[7]\n"
+ "fmla z12.h, z21.h, z1.h[7]\n"
+ "fmla z16.h, z21.h, z0.h[7]\n"
+ "fmla z9.h, z20.h, z2.h[7]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[7]\n"
+ "fmla z17.h, z20.h, z0.h[7]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z21.h, z2.h[7]\n"
+ "fmla z14.h, z21.h, z1.h[7]\n"
+ "fmla z18.h, z21.h, z0.h[7]\n"
+ "fmla z11.h, z20.h, z2.h[7]\n"
+ "fmla z15.h, z20.h, z1.h[7]\n"
+ "fmla z19.h, z20.h, z0.h[7]\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "add x23, x23, #0x10\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z21.h, z0.h[0]\n"
+ "fmla z12.h, z21.h, z1.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.h, z21.h, z2.h[0]\n"
+ "fmla z9.h, z20.h, z0.h[0]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[0]\n"
+ "fmla z17.h, z20.h, z2.h[0]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z10.h, z21.h, z0.h[0]\n"
+ "fmla z14.h, z21.h, z1.h[0]\n"
+ "fmla z18.h, z21.h, z2.h[0]\n"
+ "fmla z11.h, z20.h, z0.h[0]\n"
+ "fmla z15.h, z20.h, z1.h[0]\n"
+ "fmla z19.h, z20.h, z2.h[0]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[1]\n"
+ "fmla z12.h, z21.h, z1.h[1]\n"
+ "fmla z16.h, z21.h, z2.h[1]\n"
+ "fmla z9.h, z20.h, z0.h[1]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.h, z20.h, z1.h[1]\n"
+ "fmla z17.h, z20.h, z2.h[1]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z10.h, z21.h, z0.h[1]\n"
+ "fmla z14.h, z21.h, z1.h[1]\n"
+ "fmla z18.h, z21.h, z2.h[1]\n"
+ "fmla z11.h, z20.h, z0.h[1]\n"
+ "fmla z15.h, z20.h, z1.h[1]\n"
+ "fmla z19.h, z20.h, z2.h[1]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[2]\n"
+ "fmla z12.h, z21.h, z1.h[2]\n"
+ "fmla z16.h, z21.h, z2.h[2]\n"
+ "fmla z9.h, z20.h, z0.h[2]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.h, z20.h, z1.h[2]\n"
+ "fmla z17.h, z20.h, z2.h[2]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z10.h, z21.h, z0.h[2]\n"
+ "fmla z14.h, z21.h, z1.h[2]\n"
+ "fmla z18.h, z21.h, z2.h[2]\n"
+ "fmla z11.h, z20.h, z0.h[2]\n"
+ "fmla z15.h, z20.h, z1.h[2]\n"
+ "fmla z19.h, z20.h, z2.h[2]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[3]\n"
+ "fmla z12.h, z21.h, z1.h[3]\n"
+ "fmla z16.h, z21.h, z2.h[3]\n"
+ "fmla z9.h, z20.h, z0.h[3]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.h, z20.h, z1.h[3]\n"
+ "fmla z17.h, z20.h, z2.h[3]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z10.h, z21.h, z0.h[3]\n"
+ "fmla z14.h, z21.h, z1.h[3]\n"
+ "fmla z18.h, z21.h, z2.h[3]\n"
+ "fmla z11.h, z20.h, z0.h[3]\n"
+ "fmla z15.h, z20.h, z1.h[3]\n"
+ "fmla z19.h, z20.h, z2.h[3]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[4]\n"
+ "fmla z12.h, z21.h, z1.h[4]\n"
+ "fmla z16.h, z21.h, z2.h[4]\n"
+ "fmla z9.h, z20.h, z0.h[4]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.h, z20.h, z1.h[4]\n"
+ "fmla z17.h, z20.h, z2.h[4]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z10.h, z21.h, z0.h[4]\n"
+ "fmla z14.h, z21.h, z1.h[4]\n"
+ "fmla z18.h, z21.h, z2.h[4]\n"
+ "fmla z11.h, z20.h, z0.h[4]\n"
+ "fmla z15.h, z20.h, z1.h[4]\n"
+ "fmla z19.h, z20.h, z2.h[4]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[5]\n"
+ "fmla z12.h, z21.h, z1.h[5]\n"
+ "fmla z16.h, z21.h, z2.h[5]\n"
+ "fmla z9.h, z20.h, z0.h[5]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.h, z20.h, z1.h[5]\n"
+ "fmla z17.h, z20.h, z2.h[5]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z10.h, z21.h, z0.h[5]\n"
+ "fmla z14.h, z21.h, z1.h[5]\n"
+ "fmla z18.h, z21.h, z2.h[5]\n"
+ "fmla z11.h, z20.h, z0.h[5]\n"
+ "fmla z15.h, z20.h, z1.h[5]\n"
+ "fmla z19.h, z20.h, z2.h[5]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[6]\n"
+ "fmla z12.h, z21.h, z1.h[6]\n"
+ "fmla z16.h, z21.h, z2.h[6]\n"
+ "fmla z9.h, z20.h, z0.h[6]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.h, z20.h, z1.h[6]\n"
+ "fmla z17.h, z20.h, z2.h[6]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z10.h, z21.h, z0.h[6]\n"
+ "fmla z14.h, z21.h, z1.h[6]\n"
+ "fmla z18.h, z21.h, z2.h[6]\n"
+ "fmla z11.h, z20.h, z0.h[6]\n"
+ "fmla z15.h, z20.h, z1.h[6]\n"
+ "fmla z19.h, z20.h, z2.h[6]\n"
"ble 37f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z21.h }, p5/Z, [x10]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z21.h, z0.h[7]\n"
+ "fmla z12.h, z21.h, z1.h[7]\n"
+ "fmla z16.h, z21.h, z2.h[7]\n"
+ "fmla z9.h, z20.h, z0.h[7]\n"
+ "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z20.h, z1.h[7]\n"
+ "fmla z17.h, z20.h, z2.h[7]\n"
+ "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z10.h, z21.h, z0.h[7]\n"
+ "fmla z14.h, z21.h, z1.h[7]\n"
+ "fmla z18.h, z21.h, z2.h[7]\n"
+ "fmla z11.h, z20.h, z0.h[7]\n"
+ "fmla z15.h, z20.h, z1.h[7]\n"
+ "fmla z19.h, z20.h, z2.h[7]\n"
"37:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 32b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
"tbz %x[flags], #1, 38f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z1.h }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z0.h }, p5/Z, [x19]\n"
- "fmin z8.h, p5/M, z8.h, z0.h\n"
- "fmin z9.h, p5/M, z9.h, z0.h\n"
- "fmin z10.h, p5/M, z10.h, z0.h\n"
- "fmin z11.h, p5/M, z11.h, z0.h\n"
- "fmin z12.h, p5/M, z12.h, z0.h\n"
- "fmax z8.h, p5/M, z8.h, z1.h\n"
- "fmax z9.h, p5/M, z9.h, z1.h\n"
- "fmax z10.h, p5/M, z10.h, z1.h\n"
- "fmax z11.h, p5/M, z11.h, z1.h\n"
- "fmax z12.h, p5/M, z12.h, z1.h\n"
- "fmin z13.h, p5/M, z13.h, z0.h\n"
- "fmin z14.h, p5/M, z14.h, z0.h\n"
- "fmin z15.h, p5/M, z15.h, z0.h\n"
- "fmin z16.h, p5/M, z16.h, z0.h\n"
- "fmax z13.h, p5/M, z13.h, z1.h\n"
- "fmax z14.h, p5/M, z14.h, z1.h\n"
- "fmax z15.h, p5/M, z15.h, z1.h\n"
- "fmax z16.h, p5/M, z16.h, z1.h\n"
- "fmin z17.h, p5/M, z17.h, z0.h\n"
- "fmin z18.h, p5/M, z18.h, z0.h\n"
- "fmin z19.h, p5/M, z19.h, z0.h\n"
- "fmax z17.h, p5/M, z17.h, z1.h\n"
- "fmax z18.h, p5/M, z18.h, z1.h\n"
- "fmax z19.h, p5/M, z19.h, z1.h\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z21.h }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z20.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z21.h\n"
+ "fmin z9.h, p5/M, z9.h, z21.h\n"
+ "fmin z10.h, p5/M, z10.h, z21.h\n"
+ "fmin z11.h, p5/M, z11.h, z21.h\n"
+ "fmin z12.h, p5/M, z12.h, z21.h\n"
+ "fmin z13.h, p5/M, z13.h, z21.h\n"
+ "fmin z14.h, p5/M, z14.h, z21.h\n"
+ "fmin z15.h, p5/M, z15.h, z21.h\n"
+ "fmin z16.h, p5/M, z16.h, z21.h\n"
+ "fmin z17.h, p5/M, z17.h, z21.h\n"
+ "fmin z18.h, p5/M, z18.h, z21.h\n"
+ "fmin z19.h, p5/M, z19.h, z21.h\n"
+ "fmax z8.h, p5/M, z8.h, z20.h\n"
+ "fmax z9.h, p5/M, z9.h, z20.h\n"
+ "fmax z10.h, p5/M, z10.h, z20.h\n"
+ "fmax z11.h, p5/M, z11.h, z20.h\n"
+ "fmax z12.h, p5/M, z12.h, z20.h\n"
+ "fmax z13.h, p5/M, z13.h, z20.h\n"
+ "fmax z14.h, p5/M, z14.h, z20.h\n"
+ "fmax z15.h, p5/M, z15.h, z20.h\n"
+ "fmax z16.h, p5/M, z16.h, z20.h\n"
+ "fmax z17.h, p5/M, z17.h, z20.h\n"
+ "fmax z18.h, p5/M, z18.h, z20.h\n"
+ "fmax z19.h, p5/M, z19.h, z20.h\n"
"38:" // Height 3: No activation
- "st1h { z8.h }, p4, [x28]\n"
- "st1h { z9.h }, p3, [x28, #1, MUL VL]\n"
- "st1h { z10.h }, p2, [x28, #2, MUL VL]\n"
- "st1h { z11.h }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1h { z12.h }, p4, [x24]\n"
- "st1h { z13.h }, p3, [x24, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x24, #3, MUL VL]\n"
- "st1h { z16.h }, p4, [x23]\n"
- "st1h { z17.h }, p3, [x23, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x23, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z8.h }, p4, [x9]\n"
+ "st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
"39:" // Height 3: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 28b\n"
"b 80f\n"
"40:" // Height 4
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"41:" // Height 4: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.h, x19, x11\n"
- "inch x19\n"
- "whilelt p3.h, x19, x11\n"
- "inch x19\n"
- "whilelt p2.h, x19, x11\n"
- "inch x19\n"
- "whilelt p1.h, x19, x11\n"
- "cbz x9, 42f\n"
- "ld1h { z8.h }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p3.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x11\n"
+ "cbz x12, 42f\n"
+ "ld1h { z8.h }, p5/Z, [x12]\n"
+ "ld1h { z9.h }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n"
- "mov z16.d, z8.d\n"
- "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n"
- "mov z20.d, z8.d\n"
- "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
- "mov z17.d, z9.d\n"
+ "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
"b 44f\n"
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1h { z8.h }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #1\n"
- "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x24, x19, LSL #1\n"
- "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n"
- "add x22, x23, x19, LSL #1\n"
- "ld1h { z12.h }, p4/Z, [x24]\n"
- "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x23]\n"
- "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1h { z20.h }, p4/Z, [x22]\n"
- "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x22]\n"
+ "ld1h { z13.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x21]\n"
+ "ld1h { z17.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x20]\n"
+ "ld1h { z21.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 44f\n"
"43:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -1214,506 +1196,494 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z22.b, #0x0\n"
"mov z23.b, #0x0\n"
"44:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "cbnz x27, 47f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
"b 47f\n"
"46:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
"47:" // Height 4: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z3.h }, p0/Z, [x26]\n"
+ "ld1rqh { z2.h }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x8\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "ld1rqh { z0.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[0]\n"
+ "fmla z12.h, z25.h, z2.h[0]\n"
+ "fmla z16.h, z25.h, z1.h[0]\n"
+ "fmla z20.h, z25.h, z0.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "fmla z9.h, z24.h, z3.h[0]\n"
+ "fmla z13.h, z24.h, z2.h[0]\n"
"add x24, x24, #0x10\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x22]\n"
"add x23, x23, #0x10\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z17.h, z24.h, z1.h[0]\n"
+ "fmla z21.h, z24.h, z0.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[0]\n"
+ "fmla z14.h, z25.h, z2.h[0]\n"
+ "fmla z18.h, z25.h, z1.h[0]\n"
+ "fmla z22.h, z25.h, z0.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[0]\n"
+ "fmla z15.h, z24.h, z2.h[0]\n"
+ "fmla z19.h, z24.h, z1.h[0]\n"
+ "fmla z23.h, z24.h, z0.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[1]\n"
+ "fmla z12.h, z25.h, z2.h[1]\n"
+ "fmla z16.h, z25.h, z1.h[1]\n"
+ "fmla z20.h, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[1]\n"
+ "fmla z13.h, z24.h, z2.h[1]\n"
+ "fmla z17.h, z24.h, z1.h[1]\n"
+ "fmla z21.h, z24.h, z0.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[1]\n"
+ "fmla z14.h, z25.h, z2.h[1]\n"
+ "fmla z18.h, z25.h, z1.h[1]\n"
+ "fmla z22.h, z25.h, z0.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[1]\n"
+ "fmla z15.h, z24.h, z2.h[1]\n"
+ "fmla z19.h, z24.h, z1.h[1]\n"
+ "fmla z23.h, z24.h, z0.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[2]\n"
+ "fmla z12.h, z25.h, z2.h[2]\n"
+ "fmla z16.h, z25.h, z1.h[2]\n"
+ "fmla z20.h, z25.h, z0.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[2]\n"
+ "fmla z13.h, z24.h, z2.h[2]\n"
+ "fmla z17.h, z24.h, z1.h[2]\n"
+ "fmla z21.h, z24.h, z0.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[2]\n"
+ "fmla z14.h, z25.h, z2.h[2]\n"
+ "fmla z18.h, z25.h, z1.h[2]\n"
+ "fmla z22.h, z25.h, z0.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[2]\n"
+ "fmla z15.h, z24.h, z2.h[2]\n"
+ "fmla z19.h, z24.h, z1.h[2]\n"
+ "fmla z23.h, z24.h, z0.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[3]\n"
+ "fmla z12.h, z25.h, z2.h[3]\n"
+ "fmla z16.h, z25.h, z1.h[3]\n"
+ "fmla z20.h, z25.h, z0.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[3]\n"
+ "fmla z13.h, z24.h, z2.h[3]\n"
+ "fmla z17.h, z24.h, z1.h[3]\n"
+ "fmla z21.h, z24.h, z0.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[3]\n"
+ "fmla z14.h, z25.h, z2.h[3]\n"
+ "fmla z18.h, z25.h, z1.h[3]\n"
+ "fmla z22.h, z25.h, z0.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "fmla z11.h, z24.h, z3.h[3]\n"
+ "fmla z15.h, z24.h, z2.h[3]\n"
+ "fmla z19.h, z24.h, z1.h[3]\n"
+ "fmla z23.h, z24.h, z0.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[4]\n"
+ "fmla z12.h, z25.h, z2.h[4]\n"
+ "fmla z16.h, z25.h, z1.h[4]\n"
+ "fmla z20.h, z25.h, z0.h[4]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[4]\n"
+ "fmla z13.h, z24.h, z2.h[4]\n"
+ "fmla z17.h, z24.h, z1.h[4]\n"
+ "fmla z21.h, z24.h, z0.h[4]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[4]\n"
+ "fmla z14.h, z25.h, z2.h[4]\n"
+ "fmla z18.h, z25.h, z1.h[4]\n"
+ "fmla z22.h, z25.h, z0.h[4]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[4]\n"
+ "fmla z15.h, z24.h, z2.h[4]\n"
+ "fmla z19.h, z24.h, z1.h[4]\n"
+ "fmla z23.h, z24.h, z0.h[4]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[5]\n"
+ "fmla z12.h, z25.h, z2.h[5]\n"
+ "fmla z16.h, z25.h, z1.h[5]\n"
+ "fmla z20.h, z25.h, z0.h[5]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[5]\n"
+ "fmla z13.h, z24.h, z2.h[5]\n"
+ "fmla z17.h, z24.h, z1.h[5]\n"
+ "fmla z21.h, z24.h, z0.h[5]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z10.h, z25.h, z3.h[5]\n"
+ "fmla z14.h, z25.h, z2.h[5]\n"
+ "fmla z18.h, z25.h, z1.h[5]\n"
+ "fmla z22.h, z25.h, z0.h[5]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[5]\n"
+ "fmla z15.h, z24.h, z2.h[5]\n"
+ "fmla z19.h, z24.h, z1.h[5]\n"
+ "fmla z23.h, z24.h, z0.h[5]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[6]\n"
+ "fmla z12.h, z25.h, z2.h[6]\n"
+ "fmla z16.h, z25.h, z1.h[6]\n"
+ "fmla z20.h, z25.h, z0.h[6]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[6]\n"
+ "fmla z13.h, z24.h, z2.h[6]\n"
+ "fmla z17.h, z24.h, z1.h[6]\n"
+ "fmla z21.h, z24.h, z0.h[6]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[6]\n"
+ "fmla z14.h, z25.h, z2.h[6]\n"
+ "fmla z18.h, z25.h, z1.h[6]\n"
+ "fmla z22.h, z25.h, z0.h[6]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.h, z24.h, z3.h[6]\n"
+ "fmla z15.h, z24.h, z2.h[6]\n"
+ "fmla z19.h, z24.h, z1.h[6]\n"
+ "fmla z23.h, z24.h, z0.h[6]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z25.h, z3.h[7]\n"
+ "fmla z12.h, z25.h, z2.h[7]\n"
+ "fmla z16.h, z25.h, z1.h[7]\n"
+ "fmla z20.h, z25.h, z0.h[7]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.h, z24.h, z3.h[7]\n"
+ "fmla z13.h, z24.h, z2.h[7]\n"
+ "fmla z17.h, z24.h, z1.h[7]\n"
+ "fmla z21.h, z24.h, z0.h[7]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z25.h, z3.h[7]\n"
+ "fmla z14.h, z25.h, z2.h[7]\n"
+ "fmla z18.h, z25.h, z1.h[7]\n"
+ "fmla z22.h, z25.h, z0.h[7]\n"
+ "fmla z11.h, z24.h, z3.h[7]\n"
+ "fmla z15.h, z24.h, z2.h[7]\n"
+ "fmla z19.h, z24.h, z1.h[7]\n"
+ "fmla z23.h, z24.h, z0.h[7]\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "add x22, x22, #0x10\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[0]\n"
+ "fmla z12.h, z25.h, z1.h[0]\n"
+ "fmla z16.h, z25.h, z2.h[0]\n"
+ "fmla z20.h, z25.h, z3.h[0]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z24.h, z0.h[0]\n"
+ "fmla z13.h, z24.h, z1.h[0]\n"
+ "fmla z17.h, z24.h, z2.h[0]\n"
+ "fmla z21.h, z24.h, z3.h[0]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
+ "fmla z10.h, z25.h, z0.h[0]\n"
+ "fmla z14.h, z25.h, z1.h[0]\n"
+ "fmla z18.h, z25.h, z2.h[0]\n"
+ "fmla z22.h, z25.h, z3.h[0]\n"
+ "fmla z11.h, z24.h, z0.h[0]\n"
+ "fmla z15.h, z24.h, z1.h[0]\n"
+ "fmla z19.h, z24.h, z2.h[0]\n"
+ "fmla z23.h, z24.h, z3.h[0]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[1]\n"
+ "fmla z12.h, z25.h, z1.h[1]\n"
+ "fmla z16.h, z25.h, z2.h[1]\n"
+ "fmla z20.h, z25.h, z3.h[1]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.h, z24.h, z0.h[1]\n"
+ "fmla z13.h, z24.h, z1.h[1]\n"
+ "fmla z17.h, z24.h, z2.h[1]\n"
+ "fmla z21.h, z24.h, z3.h[1]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
+ "fmla z10.h, z25.h, z0.h[1]\n"
+ "fmla z14.h, z25.h, z1.h[1]\n"
+ "fmla z18.h, z25.h, z2.h[1]\n"
+ "fmla z22.h, z25.h, z3.h[1]\n"
+ "fmla z11.h, z24.h, z0.h[1]\n"
+ "fmla z15.h, z24.h, z1.h[1]\n"
+ "fmla z19.h, z24.h, z2.h[1]\n"
+ "fmla z23.h, z24.h, z3.h[1]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[2]\n"
+ "fmla z12.h, z25.h, z1.h[2]\n"
+ "fmla z16.h, z25.h, z2.h[2]\n"
+ "fmla z20.h, z25.h, z3.h[2]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.h, z24.h, z0.h[2]\n"
+ "fmla z13.h, z24.h, z1.h[2]\n"
+ "fmla z17.h, z24.h, z2.h[2]\n"
+ "fmla z21.h, z24.h, z3.h[2]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
+ "fmla z10.h, z25.h, z0.h[2]\n"
+ "fmla z14.h, z25.h, z1.h[2]\n"
+ "fmla z18.h, z25.h, z2.h[2]\n"
+ "fmla z22.h, z25.h, z3.h[2]\n"
+ "fmla z11.h, z24.h, z0.h[2]\n"
+ "fmla z15.h, z24.h, z1.h[2]\n"
+ "fmla z19.h, z24.h, z2.h[2]\n"
+ "fmla z23.h, z24.h, z3.h[2]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[3]\n"
+ "fmla z12.h, z25.h, z1.h[3]\n"
+ "fmla z16.h, z25.h, z2.h[3]\n"
+ "fmla z20.h, z25.h, z3.h[3]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.h, z24.h, z0.h[3]\n"
+ "fmla z13.h, z24.h, z1.h[3]\n"
+ "fmla z17.h, z24.h, z2.h[3]\n"
+ "fmla z21.h, z24.h, z3.h[3]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
+ "fmla z10.h, z25.h, z0.h[3]\n"
+ "fmla z14.h, z25.h, z1.h[3]\n"
+ "fmla z18.h, z25.h, z2.h[3]\n"
+ "fmla z22.h, z25.h, z3.h[3]\n"
+ "fmla z11.h, z24.h, z0.h[3]\n"
+ "fmla z15.h, z24.h, z1.h[3]\n"
+ "fmla z19.h, z24.h, z2.h[3]\n"
+ "fmla z23.h, z24.h, z3.h[3]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[4]\n"
+ "fmla z12.h, z25.h, z1.h[4]\n"
+ "fmla z16.h, z25.h, z2.h[4]\n"
+ "fmla z20.h, z25.h, z3.h[4]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.h, z24.h, z0.h[4]\n"
+ "fmla z13.h, z24.h, z1.h[4]\n"
+ "fmla z17.h, z24.h, z2.h[4]\n"
+ "fmla z21.h, z24.h, z3.h[4]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
+ "fmla z10.h, z25.h, z0.h[4]\n"
+ "fmla z14.h, z25.h, z1.h[4]\n"
+ "fmla z18.h, z25.h, z2.h[4]\n"
+ "fmla z22.h, z25.h, z3.h[4]\n"
+ "fmla z11.h, z24.h, z0.h[4]\n"
+ "fmla z15.h, z24.h, z1.h[4]\n"
+ "fmla z19.h, z24.h, z2.h[4]\n"
+ "fmla z23.h, z24.h, z3.h[4]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[5]\n"
+ "fmla z12.h, z25.h, z1.h[5]\n"
+ "fmla z16.h, z25.h, z2.h[5]\n"
+ "fmla z20.h, z25.h, z3.h[5]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.h, z24.h, z0.h[5]\n"
+ "fmla z13.h, z24.h, z1.h[5]\n"
+ "fmla z17.h, z24.h, z2.h[5]\n"
+ "fmla z21.h, z24.h, z3.h[5]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
+ "fmla z10.h, z25.h, z0.h[5]\n"
+ "fmla z14.h, z25.h, z1.h[5]\n"
+ "fmla z18.h, z25.h, z2.h[5]\n"
+ "fmla z22.h, z25.h, z3.h[5]\n"
+ "fmla z11.h, z24.h, z0.h[5]\n"
+ "fmla z15.h, z24.h, z1.h[5]\n"
+ "fmla z19.h, z24.h, z2.h[5]\n"
+ "fmla z23.h, z24.h, z3.h[5]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[6]\n"
+ "fmla z12.h, z25.h, z1.h[6]\n"
+ "fmla z16.h, z25.h, z2.h[6]\n"
+ "fmla z20.h, z25.h, z3.h[6]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.h, z24.h, z0.h[6]\n"
+ "fmla z13.h, z24.h, z1.h[6]\n"
+ "fmla z17.h, z24.h, z2.h[6]\n"
+ "fmla z21.h, z24.h, z3.h[6]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
+ "fmla z10.h, z25.h, z0.h[6]\n"
+ "fmla z14.h, z25.h, z1.h[6]\n"
+ "fmla z18.h, z25.h, z2.h[6]\n"
+ "fmla z22.h, z25.h, z3.h[6]\n"
+ "fmla z11.h, z24.h, z0.h[6]\n"
+ "fmla z15.h, z24.h, z1.h[6]\n"
+ "fmla z19.h, z24.h, z2.h[6]\n"
+ "fmla z23.h, z24.h, z3.h[6]\n"
"ble 50f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z25.h, z0.h[7]\n"
+ "fmla z12.h, z25.h, z1.h[7]\n"
+ "fmla z16.h, z25.h, z2.h[7]\n"
+ "fmla z20.h, z25.h, z3.h[7]\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z24.h, z0.h[7]\n"
+ "fmla z13.h, z24.h, z1.h[7]\n"
+ "fmla z17.h, z24.h, z2.h[7]\n"
+ "fmla z21.h, z24.h, z3.h[7]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z10.h, z25.h, z0.h[7]\n"
+ "fmla z14.h, z25.h, z1.h[7]\n"
+ "fmla z18.h, z25.h, z2.h[7]\n"
+ "fmla z22.h, z25.h, z3.h[7]\n"
+ "fmla z11.h, z24.h, z0.h[7]\n"
+ "fmla z15.h, z24.h, z1.h[7]\n"
+ "fmla z19.h, z24.h, z2.h[7]\n"
+ "fmla z23.h, z24.h, z3.h[7]\n"
"50:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 45b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
"tbz %x[flags], #1, 51f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z1.h }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z0.h }, p5/Z, [x19]\n"
- "fmin z8.h, p5/M, z8.h, z0.h\n"
- "fmin z9.h, p5/M, z9.h, z0.h\n"
- "fmin z10.h, p5/M, z10.h, z0.h\n"
- "fmin z11.h, p5/M, z11.h, z0.h\n"
- "fmin z12.h, p5/M, z12.h, z0.h\n"
- "fmax z8.h, p5/M, z8.h, z1.h\n"
- "fmax z9.h, p5/M, z9.h, z1.h\n"
- "fmax z10.h, p5/M, z10.h, z1.h\n"
- "fmax z11.h, p5/M, z11.h, z1.h\n"
- "fmax z12.h, p5/M, z12.h, z1.h\n"
- "fmin z13.h, p5/M, z13.h, z0.h\n"
- "fmin z14.h, p5/M, z14.h, z0.h\n"
- "fmin z15.h, p5/M, z15.h, z0.h\n"
- "fmin z16.h, p5/M, z16.h, z0.h\n"
- "fmax z13.h, p5/M, z13.h, z1.h\n"
- "fmax z14.h, p5/M, z14.h, z1.h\n"
- "fmax z15.h, p5/M, z15.h, z1.h\n"
- "fmax z16.h, p5/M, z16.h, z1.h\n"
- "fmin z17.h, p5/M, z17.h, z0.h\n"
- "fmin z18.h, p5/M, z18.h, z0.h\n"
- "fmin z19.h, p5/M, z19.h, z0.h\n"
- "fmin z20.h, p5/M, z20.h, z0.h\n"
- "fmax z17.h, p5/M, z17.h, z1.h\n"
- "fmax z18.h, p5/M, z18.h, z1.h\n"
- "fmax z19.h, p5/M, z19.h, z1.h\n"
- "fmax z20.h, p5/M, z20.h, z1.h\n"
- "fmin z21.h, p5/M, z21.h, z0.h\n"
- "fmin z22.h, p5/M, z22.h, z0.h\n"
- "fmin z23.h, p5/M, z23.h, z0.h\n"
- "fmax z21.h, p5/M, z21.h, z1.h\n"
- "fmax z22.h, p5/M, z22.h, z1.h\n"
- "fmax z23.h, p5/M, z23.h, z1.h\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z25.h }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z24.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z25.h\n"
+ "fmin z9.h, p5/M, z9.h, z25.h\n"
+ "fmin z10.h, p5/M, z10.h, z25.h\n"
+ "fmin z11.h, p5/M, z11.h, z25.h\n"
+ "fmin z12.h, p5/M, z12.h, z25.h\n"
+ "fmin z13.h, p5/M, z13.h, z25.h\n"
+ "fmin z14.h, p5/M, z14.h, z25.h\n"
+ "fmin z15.h, p5/M, z15.h, z25.h\n"
+ "fmin z16.h, p5/M, z16.h, z25.h\n"
+ "fmin z17.h, p5/M, z17.h, z25.h\n"
+ "fmin z18.h, p5/M, z18.h, z25.h\n"
+ "fmin z19.h, p5/M, z19.h, z25.h\n"
+ "fmin z20.h, p5/M, z20.h, z25.h\n"
+ "fmin z21.h, p5/M, z21.h, z25.h\n"
+ "fmin z22.h, p5/M, z22.h, z25.h\n"
+ "fmin z23.h, p5/M, z23.h, z25.h\n"
+ "fmax z8.h, p5/M, z8.h, z24.h\n"
+ "fmax z9.h, p5/M, z9.h, z24.h\n"
+ "fmax z10.h, p5/M, z10.h, z24.h\n"
+ "fmax z11.h, p5/M, z11.h, z24.h\n"
+ "fmax z12.h, p5/M, z12.h, z24.h\n"
+ "fmax z13.h, p5/M, z13.h, z24.h\n"
+ "fmax z14.h, p5/M, z14.h, z24.h\n"
+ "fmax z15.h, p5/M, z15.h, z24.h\n"
+ "fmax z16.h, p5/M, z16.h, z24.h\n"
+ "fmax z17.h, p5/M, z17.h, z24.h\n"
+ "fmax z18.h, p5/M, z18.h, z24.h\n"
+ "fmax z19.h, p5/M, z19.h, z24.h\n"
+ "fmax z20.h, p5/M, z20.h, z24.h\n"
+ "fmax z21.h, p5/M, z21.h, z24.h\n"
+ "fmax z22.h, p5/M, z22.h, z24.h\n"
+ "fmax z23.h, p5/M, z23.h, z24.h\n"
"51:" // Height 4: No activation
- "st1h { z8.h }, p4, [x28]\n"
- "st1h { z9.h }, p3, [x28, #1, MUL VL]\n"
- "st1h { z10.h }, p2, [x28, #2, MUL VL]\n"
- "st1h { z11.h }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1h { z12.h }, p4, [x24]\n"
- "st1h { z13.h }, p3, [x24, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x24, #3, MUL VL]\n"
- "st1h { z16.h }, p4, [x23]\n"
- "st1h { z17.h }, p3, [x23, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x23, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x23, #3, MUL VL]\n"
- "st1h { z20.h }, p4, [x22]\n"
- "st1h { z21.h }, p3, [x22, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
- "st1h { z23.h }, p1, [x22, #3, MUL VL]\n"
+ "st1h { z8.h }, p4, [x9]\n"
+ "st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x23]\n"
+ "st1h { z21.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x23, #3, MUL VL]\n"
"52:" // Height 4: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 41b\n"
"b 80f\n"
"53:" // Height 5
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"54:" // Height 5: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.h, x19, x11\n"
- "inch x19\n"
- "whilelt p3.h, x19, x11\n"
- "inch x19\n"
- "whilelt p2.h, x19, x11\n"
- "inch x19\n"
- "whilelt p1.h, x19, x11\n"
- "cbz x9, 55f\n"
- "ld1h { z8.h }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p3.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x11\n"
+ "cbz x12, 55f\n"
+ "ld1h { z8.h }, p5/Z, [x12]\n"
+ "ld1h { z9.h }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n"
- "mov z16.d, z8.d\n"
- "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n"
- "mov z20.d, z8.d\n"
- "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
- "mov z17.d, z9.d\n"
+ "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1724,31 +1694,31 @@ void sve_hybrid_fp16_mla_6x4VL (
"b 57f\n"
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1h { z8.h }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #1\n"
- "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x24, x19, LSL #1\n"
- "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n"
- "add x22, x23, x19, LSL #1\n"
- "ld1h { z12.h }, p4/Z, [x24]\n"
- "add x21, x22, x19, LSL #1\n"
- "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x23]\n"
- "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1h { z20.h }, p4/Z, [x22]\n"
- "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1h { z24.h }, p4/Z, [x21]\n"
- "ld1h { z25.h }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1h { z26.h }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1h { z27.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #1\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x23]\n"
+ "ld1h { z13.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x22]\n"
+ "ld1h { z17.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x21]\n"
+ "ld1h { z21.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x20]\n"
+ "ld1h { z25.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x20, #3, MUL VL]\n"
"b 57f\n"
"56:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -1772,595 +1742,580 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
"57:" // Height 5: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "cbnz x27, 60f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
- "add x21, x21, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 60f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
"b 60f\n"
"59:" // Height 5: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
- "add x21, x22, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
"60:" // Height 5: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z4.h }, p0/Z, [x26]\n"
+ "ld1rqh { z3.h }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1rqh { z1.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqh { z0.h }, p0/Z, [x22]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z29.h, z4.h[0]\n"
+ "fmla z12.h, z29.h, z3.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.h, z29.h, z2.h[0]\n"
+ "fmla z20.h, z29.h, z1.h[0]\n"
"add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "fmla z24.h, z29.h, z0.h[0]\n"
+ "fmla z9.h, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x24, x24, #0x10\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "fmla z13.h, z28.h, z3.h[0]\n"
+ "fmla z17.h, z28.h, z2.h[0]\n"
"add x23, x23, #0x10\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x21, x21, #0x10\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
- "fmla z24.h, z6.h, z4.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "fmla z25.h, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z26.h, z6.h, z4.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "fmla z27.h, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "fmla z24.h, z6.h, z4.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "fmla z25.h, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z21.h, z28.h, z1.h[0]\n"
+ "fmla z25.h, z28.h, z0.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[0]\n"
+ "fmla z14.h, z29.h, z3.h[0]\n"
+ "fmla z18.h, z29.h, z2.h[0]\n"
+ "fmla z22.h, z29.h, z1.h[0]\n"
+ "fmla z26.h, z29.h, z0.h[0]\n"
+ "fmla z11.h, z28.h, z4.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[0]\n"
+ "fmla z19.h, z28.h, z2.h[0]\n"
+ "fmla z23.h, z28.h, z1.h[0]\n"
+ "fmla z27.h, z28.h, z0.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[1]\n"
+ "fmla z12.h, z29.h, z3.h[1]\n"
+ "fmla z16.h, z29.h, z2.h[1]\n"
+ "fmla z20.h, z29.h, z1.h[1]\n"
+ "fmla z24.h, z29.h, z0.h[1]\n"
+ "fmla z9.h, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[1]\n"
+ "fmla z17.h, z28.h, z2.h[1]\n"
+ "fmla z21.h, z28.h, z1.h[1]\n"
+ "fmla z25.h, z28.h, z0.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z26.h, z6.h, z4.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "fmla z27.h, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "fmla z24.h, z6.h, z4.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "fmla z25.h, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z26.h, z6.h, z4.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "fmla z27.h, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "fmla z24.h, z6.h, z4.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "fmla z25.h, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z26.h, z6.h, z4.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "fmla z27.h, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "fmla z24.h, z6.h, z4.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "fmla z25.h, z7.h, z4.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z26.h, z6.h, z4.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "fmla z27.h, z7.h, z4.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "fmla z24.h, z6.h, z4.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "fmla z25.h, z7.h, z4.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[1]\n"
+ "fmla z14.h, z29.h, z3.h[1]\n"
+ "fmla z18.h, z29.h, z2.h[1]\n"
+ "fmla z22.h, z29.h, z1.h[1]\n"
+ "fmla z26.h, z29.h, z0.h[1]\n"
+ "fmla z11.h, z28.h, z4.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[1]\n"
+ "fmla z19.h, z28.h, z2.h[1]\n"
+ "fmla z23.h, z28.h, z1.h[1]\n"
+ "fmla z27.h, z28.h, z0.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[2]\n"
+ "fmla z12.h, z29.h, z3.h[2]\n"
+ "fmla z16.h, z29.h, z2.h[2]\n"
+ "fmla z20.h, z29.h, z1.h[2]\n"
+ "fmla z24.h, z29.h, z0.h[2]\n"
+ "fmla z9.h, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[2]\n"
+ "fmla z17.h, z28.h, z2.h[2]\n"
+ "fmla z21.h, z28.h, z1.h[2]\n"
+ "fmla z25.h, z28.h, z0.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[2]\n"
+ "fmla z14.h, z29.h, z3.h[2]\n"
+ "fmla z18.h, z29.h, z2.h[2]\n"
+ "fmla z22.h, z29.h, z1.h[2]\n"
+ "fmla z26.h, z29.h, z0.h[2]\n"
+ "fmla z11.h, z28.h, z4.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[2]\n"
+ "fmla z19.h, z28.h, z2.h[2]\n"
+ "fmla z23.h, z28.h, z1.h[2]\n"
+ "fmla z27.h, z28.h, z0.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[3]\n"
+ "fmla z12.h, z29.h, z3.h[3]\n"
+ "fmla z16.h, z29.h, z2.h[3]\n"
+ "fmla z20.h, z29.h, z1.h[3]\n"
+ "fmla z24.h, z29.h, z0.h[3]\n"
+ "fmla z9.h, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[3]\n"
+ "fmla z17.h, z28.h, z2.h[3]\n"
+ "fmla z21.h, z28.h, z1.h[3]\n"
+ "fmla z25.h, z28.h, z0.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[3]\n"
+ "fmla z14.h, z29.h, z3.h[3]\n"
+ "fmla z18.h, z29.h, z2.h[3]\n"
+ "fmla z22.h, z29.h, z1.h[3]\n"
+ "fmla z26.h, z29.h, z0.h[3]\n"
+ "fmla z11.h, z28.h, z4.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "fmla z15.h, z28.h, z3.h[3]\n"
+ "fmla z19.h, z28.h, z2.h[3]\n"
+ "fmla z23.h, z28.h, z1.h[3]\n"
+ "fmla z27.h, z28.h, z0.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[4]\n"
+ "fmla z12.h, z29.h, z3.h[4]\n"
+ "fmla z16.h, z29.h, z2.h[4]\n"
+ "fmla z20.h, z29.h, z1.h[4]\n"
+ "fmla z24.h, z29.h, z0.h[4]\n"
+ "fmla z9.h, z28.h, z4.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[4]\n"
+ "fmla z17.h, z28.h, z2.h[4]\n"
+ "fmla z21.h, z28.h, z1.h[4]\n"
+ "fmla z25.h, z28.h, z0.h[4]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[4]\n"
+ "fmla z14.h, z29.h, z3.h[4]\n"
+ "fmla z18.h, z29.h, z2.h[4]\n"
+ "fmla z22.h, z29.h, z1.h[4]\n"
+ "fmla z26.h, z29.h, z0.h[4]\n"
+ "fmla z11.h, z28.h, z4.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[4]\n"
+ "fmla z19.h, z28.h, z2.h[4]\n"
+ "fmla z23.h, z28.h, z1.h[4]\n"
+ "fmla z27.h, z28.h, z0.h[4]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[5]\n"
+ "fmla z12.h, z29.h, z3.h[5]\n"
+ "fmla z16.h, z29.h, z2.h[5]\n"
+ "fmla z20.h, z29.h, z1.h[5]\n"
+ "fmla z24.h, z29.h, z0.h[5]\n"
+ "fmla z9.h, z28.h, z4.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[5]\n"
+ "fmla z17.h, z28.h, z2.h[5]\n"
+ "fmla z21.h, z28.h, z1.h[5]\n"
+ "fmla z25.h, z28.h, z0.h[5]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z26.h, z6.h, z4.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "fmla z27.h, z7.h, z4.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "fmla z24.h, z6.h, z4.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "fmla z25.h, z7.h, z4.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z26.h, z6.h, z4.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "fmla z27.h, z7.h, z4.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "fmla z24.h, z6.h, z4.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "fmla z25.h, z7.h, z4.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z26.h, z6.h, z4.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
- "fmla z27.h, z7.h, z4.h[7]\n"
+ "fmla z10.h, z29.h, z4.h[5]\n"
+ "fmla z14.h, z29.h, z3.h[5]\n"
+ "fmla z18.h, z29.h, z2.h[5]\n"
+ "fmla z22.h, z29.h, z1.h[5]\n"
+ "fmla z26.h, z29.h, z0.h[5]\n"
+ "fmla z11.h, z28.h, z4.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[5]\n"
+ "fmla z19.h, z28.h, z2.h[5]\n"
+ "fmla z23.h, z28.h, z1.h[5]\n"
+ "fmla z27.h, z28.h, z0.h[5]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[6]\n"
+ "fmla z12.h, z29.h, z3.h[6]\n"
+ "fmla z16.h, z29.h, z2.h[6]\n"
+ "fmla z20.h, z29.h, z1.h[6]\n"
+ "fmla z24.h, z29.h, z0.h[6]\n"
+ "fmla z9.h, z28.h, z4.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[6]\n"
+ "fmla z17.h, z28.h, z2.h[6]\n"
+ "fmla z21.h, z28.h, z1.h[6]\n"
+ "fmla z25.h, z28.h, z0.h[6]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[6]\n"
+ "fmla z14.h, z29.h, z3.h[6]\n"
+ "fmla z18.h, z29.h, z2.h[6]\n"
+ "fmla z22.h, z29.h, z1.h[6]\n"
+ "fmla z26.h, z29.h, z0.h[6]\n"
+ "fmla z11.h, z28.h, z4.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z15.h, z28.h, z3.h[6]\n"
+ "fmla z19.h, z28.h, z2.h[6]\n"
+ "fmla z23.h, z28.h, z1.h[6]\n"
+ "fmla z27.h, z28.h, z0.h[6]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z29.h, z4.h[7]\n"
+ "fmla z12.h, z29.h, z3.h[7]\n"
+ "fmla z16.h, z29.h, z2.h[7]\n"
+ "fmla z20.h, z29.h, z1.h[7]\n"
+ "fmla z24.h, z29.h, z0.h[7]\n"
+ "fmla z9.h, z28.h, z4.h[7]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z13.h, z28.h, z3.h[7]\n"
+ "fmla z17.h, z28.h, z2.h[7]\n"
+ "fmla z21.h, z28.h, z1.h[7]\n"
+ "fmla z25.h, z28.h, z0.h[7]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z29.h, z4.h[7]\n"
+ "fmla z14.h, z29.h, z3.h[7]\n"
+ "fmla z18.h, z29.h, z2.h[7]\n"
+ "fmla z22.h, z29.h, z1.h[7]\n"
+ "fmla z26.h, z29.h, z0.h[7]\n"
+ "fmla z11.h, z28.h, z4.h[7]\n"
+ "fmla z15.h, z28.h, z3.h[7]\n"
+ "fmla z19.h, z28.h, z2.h[7]\n"
+ "fmla z23.h, z28.h, z1.h[7]\n"
+ "fmla z27.h, z28.h, z0.h[7]\n"
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "add x21, x21, #0x10\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
- "fmla z24.h, z6.h, z4.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "fmla z25.h, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "fmla z8.h, z29.h, z0.h[0]\n"
+ "fmla z12.h, z29.h, z1.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.h, z29.h, z2.h[0]\n"
+ "fmla z20.h, z29.h, z3.h[0]\n"
+ "fmla z24.h, z29.h, z4.h[0]\n"
+ "fmla z9.h, z28.h, z0.h[0]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[0]\n"
+ "fmla z17.h, z28.h, z2.h[0]\n"
+ "fmla z21.h, z28.h, z3.h[0]\n"
+ "fmla z25.h, z28.h, z4.h[0]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z26.h, z6.h, z4.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "fmla z27.h, z7.h, z4.h[0]\n"
+ "fmla z10.h, z29.h, z0.h[0]\n"
+ "fmla z14.h, z29.h, z1.h[0]\n"
+ "fmla z18.h, z29.h, z2.h[0]\n"
+ "fmla z22.h, z29.h, z3.h[0]\n"
+ "fmla z26.h, z29.h, z4.h[0]\n"
+ "fmla z11.h, z28.h, z0.h[0]\n"
+ "fmla z15.h, z28.h, z1.h[0]\n"
+ "fmla z19.h, z28.h, z2.h[0]\n"
+ "fmla z23.h, z28.h, z3.h[0]\n"
+ "fmla z27.h, z28.h, z4.h[0]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "fmla z24.h, z6.h, z4.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "fmla z25.h, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[1]\n"
+ "fmla z12.h, z29.h, z1.h[1]\n"
+ "fmla z16.h, z29.h, z2.h[1]\n"
+ "fmla z20.h, z29.h, z3.h[1]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, z29.h, z4.h[1]\n"
+ "fmla z9.h, z28.h, z0.h[1]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[1]\n"
+ "fmla z17.h, z28.h, z2.h[1]\n"
+ "fmla z21.h, z28.h, z3.h[1]\n"
+ "fmla z25.h, z28.h, z4.h[1]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z26.h, z6.h, z4.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "fmla z27.h, z7.h, z4.h[1]\n"
+ "fmla z10.h, z29.h, z0.h[1]\n"
+ "fmla z14.h, z29.h, z1.h[1]\n"
+ "fmla z18.h, z29.h, z2.h[1]\n"
+ "fmla z22.h, z29.h, z3.h[1]\n"
+ "fmla z26.h, z29.h, z4.h[1]\n"
+ "fmla z11.h, z28.h, z0.h[1]\n"
+ "fmla z15.h, z28.h, z1.h[1]\n"
+ "fmla z19.h, z28.h, z2.h[1]\n"
+ "fmla z23.h, z28.h, z3.h[1]\n"
+ "fmla z27.h, z28.h, z4.h[1]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "fmla z24.h, z6.h, z4.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "fmla z25.h, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[2]\n"
+ "fmla z12.h, z29.h, z1.h[2]\n"
+ "fmla z16.h, z29.h, z2.h[2]\n"
+ "fmla z20.h, z29.h, z3.h[2]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, z29.h, z4.h[2]\n"
+ "fmla z9.h, z28.h, z0.h[2]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[2]\n"
+ "fmla z17.h, z28.h, z2.h[2]\n"
+ "fmla z21.h, z28.h, z3.h[2]\n"
+ "fmla z25.h, z28.h, z4.h[2]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z26.h, z6.h, z4.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "fmla z27.h, z7.h, z4.h[2]\n"
+ "fmla z10.h, z29.h, z0.h[2]\n"
+ "fmla z14.h, z29.h, z1.h[2]\n"
+ "fmla z18.h, z29.h, z2.h[2]\n"
+ "fmla z22.h, z29.h, z3.h[2]\n"
+ "fmla z26.h, z29.h, z4.h[2]\n"
+ "fmla z11.h, z28.h, z0.h[2]\n"
+ "fmla z15.h, z28.h, z1.h[2]\n"
+ "fmla z19.h, z28.h, z2.h[2]\n"
+ "fmla z23.h, z28.h, z3.h[2]\n"
+ "fmla z27.h, z28.h, z4.h[2]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "fmla z24.h, z6.h, z4.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "fmla z25.h, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[3]\n"
+ "fmla z12.h, z29.h, z1.h[3]\n"
+ "fmla z16.h, z29.h, z2.h[3]\n"
+ "fmla z20.h, z29.h, z3.h[3]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, z29.h, z4.h[3]\n"
+ "fmla z9.h, z28.h, z0.h[3]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[3]\n"
+ "fmla z17.h, z28.h, z2.h[3]\n"
+ "fmla z21.h, z28.h, z3.h[3]\n"
+ "fmla z25.h, z28.h, z4.h[3]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z26.h, z6.h, z4.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "fmla z27.h, z7.h, z4.h[3]\n"
+ "fmla z10.h, z29.h, z0.h[3]\n"
+ "fmla z14.h, z29.h, z1.h[3]\n"
+ "fmla z18.h, z29.h, z2.h[3]\n"
+ "fmla z22.h, z29.h, z3.h[3]\n"
+ "fmla z26.h, z29.h, z4.h[3]\n"
+ "fmla z11.h, z28.h, z0.h[3]\n"
+ "fmla z15.h, z28.h, z1.h[3]\n"
+ "fmla z19.h, z28.h, z2.h[3]\n"
+ "fmla z23.h, z28.h, z3.h[3]\n"
+ "fmla z27.h, z28.h, z4.h[3]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "fmla z24.h, z6.h, z4.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "fmla z25.h, z7.h, z4.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[4]\n"
+ "fmla z12.h, z29.h, z1.h[4]\n"
+ "fmla z16.h, z29.h, z2.h[4]\n"
+ "fmla z20.h, z29.h, z3.h[4]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, z29.h, z4.h[4]\n"
+ "fmla z9.h, z28.h, z0.h[4]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[4]\n"
+ "fmla z17.h, z28.h, z2.h[4]\n"
+ "fmla z21.h, z28.h, z3.h[4]\n"
+ "fmla z25.h, z28.h, z4.h[4]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z26.h, z6.h, z4.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "fmla z27.h, z7.h, z4.h[4]\n"
+ "fmla z10.h, z29.h, z0.h[4]\n"
+ "fmla z14.h, z29.h, z1.h[4]\n"
+ "fmla z18.h, z29.h, z2.h[4]\n"
+ "fmla z22.h, z29.h, z3.h[4]\n"
+ "fmla z26.h, z29.h, z4.h[4]\n"
+ "fmla z11.h, z28.h, z0.h[4]\n"
+ "fmla z15.h, z28.h, z1.h[4]\n"
+ "fmla z19.h, z28.h, z2.h[4]\n"
+ "fmla z23.h, z28.h, z3.h[4]\n"
+ "fmla z27.h, z28.h, z4.h[4]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "fmla z24.h, z6.h, z4.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "fmla z25.h, z7.h, z4.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[5]\n"
+ "fmla z12.h, z29.h, z1.h[5]\n"
+ "fmla z16.h, z29.h, z2.h[5]\n"
+ "fmla z20.h, z29.h, z3.h[5]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, z29.h, z4.h[5]\n"
+ "fmla z9.h, z28.h, z0.h[5]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[5]\n"
+ "fmla z17.h, z28.h, z2.h[5]\n"
+ "fmla z21.h, z28.h, z3.h[5]\n"
+ "fmla z25.h, z28.h, z4.h[5]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z26.h, z6.h, z4.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "fmla z27.h, z7.h, z4.h[5]\n"
+ "fmla z10.h, z29.h, z0.h[5]\n"
+ "fmla z14.h, z29.h, z1.h[5]\n"
+ "fmla z18.h, z29.h, z2.h[5]\n"
+ "fmla z22.h, z29.h, z3.h[5]\n"
+ "fmla z26.h, z29.h, z4.h[5]\n"
+ "fmla z11.h, z28.h, z0.h[5]\n"
+ "fmla z15.h, z28.h, z1.h[5]\n"
+ "fmla z19.h, z28.h, z2.h[5]\n"
+ "fmla z23.h, z28.h, z3.h[5]\n"
+ "fmla z27.h, z28.h, z4.h[5]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "fmla z24.h, z6.h, z4.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "fmla z25.h, z7.h, z4.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[6]\n"
+ "fmla z12.h, z29.h, z1.h[6]\n"
+ "fmla z16.h, z29.h, z2.h[6]\n"
+ "fmla z20.h, z29.h, z3.h[6]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, z29.h, z4.h[6]\n"
+ "fmla z9.h, z28.h, z0.h[6]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[6]\n"
+ "fmla z17.h, z28.h, z2.h[6]\n"
+ "fmla z21.h, z28.h, z3.h[6]\n"
+ "fmla z25.h, z28.h, z4.h[6]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z26.h, z6.h, z4.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "fmla z27.h, z7.h, z4.h[6]\n"
+ "fmla z10.h, z29.h, z0.h[6]\n"
+ "fmla z14.h, z29.h, z1.h[6]\n"
+ "fmla z18.h, z29.h, z2.h[6]\n"
+ "fmla z22.h, z29.h, z3.h[6]\n"
+ "fmla z26.h, z29.h, z4.h[6]\n"
+ "fmla z11.h, z28.h, z0.h[6]\n"
+ "fmla z15.h, z28.h, z1.h[6]\n"
+ "fmla z19.h, z28.h, z2.h[6]\n"
+ "fmla z23.h, z28.h, z3.h[6]\n"
+ "fmla z27.h, z28.h, z4.h[6]\n"
"ble 63f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "fmla z24.h, z6.h, z4.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "fmla z25.h, z7.h, z4.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z29.h }, p5/Z, [x10]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z29.h, z0.h[7]\n"
+ "fmla z12.h, z29.h, z1.h[7]\n"
+ "fmla z16.h, z29.h, z2.h[7]\n"
+ "fmla z20.h, z29.h, z3.h[7]\n"
+ "fmla z24.h, z29.h, z4.h[7]\n"
+ "fmla z9.h, z28.h, z0.h[7]\n"
+ "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.h, z28.h, z1.h[7]\n"
+ "fmla z17.h, z28.h, z2.h[7]\n"
+ "fmla z21.h, z28.h, z3.h[7]\n"
+ "fmla z25.h, z28.h, z4.h[7]\n"
+ "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z26.h, z6.h, z4.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
- "fmla z27.h, z7.h, z4.h[7]\n"
+ "fmla z10.h, z29.h, z0.h[7]\n"
+ "fmla z14.h, z29.h, z1.h[7]\n"
+ "fmla z18.h, z29.h, z2.h[7]\n"
+ "fmla z22.h, z29.h, z3.h[7]\n"
+ "fmla z26.h, z29.h, z4.h[7]\n"
+ "fmla z11.h, z28.h, z0.h[7]\n"
+ "fmla z15.h, z28.h, z1.h[7]\n"
+ "fmla z19.h, z28.h, z2.h[7]\n"
+ "fmla z23.h, z28.h, z3.h[7]\n"
+ "fmla z27.h, z28.h, z4.h[7]\n"
"63:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 58b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
- "add x21, x22, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
"tbz %x[flags], #1, 64f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z1.h }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z0.h }, p5/Z, [x19]\n"
- "fmin z8.h, p5/M, z8.h, z0.h\n"
- "fmin z9.h, p5/M, z9.h, z0.h\n"
- "fmin z10.h, p5/M, z10.h, z0.h\n"
- "fmin z11.h, p5/M, z11.h, z0.h\n"
- "fmin z12.h, p5/M, z12.h, z0.h\n"
- "fmax z8.h, p5/M, z8.h, z1.h\n"
- "fmax z9.h, p5/M, z9.h, z1.h\n"
- "fmax z10.h, p5/M, z10.h, z1.h\n"
- "fmax z11.h, p5/M, z11.h, z1.h\n"
- "fmax z12.h, p5/M, z12.h, z1.h\n"
- "fmin z13.h, p5/M, z13.h, z0.h\n"
- "fmin z14.h, p5/M, z14.h, z0.h\n"
- "fmin z15.h, p5/M, z15.h, z0.h\n"
- "fmin z16.h, p5/M, z16.h, z0.h\n"
- "fmax z13.h, p5/M, z13.h, z1.h\n"
- "fmax z14.h, p5/M, z14.h, z1.h\n"
- "fmax z15.h, p5/M, z15.h, z1.h\n"
- "fmax z16.h, p5/M, z16.h, z1.h\n"
- "fmin z17.h, p5/M, z17.h, z0.h\n"
- "fmin z18.h, p5/M, z18.h, z0.h\n"
- "fmin z19.h, p5/M, z19.h, z0.h\n"
- "fmin z20.h, p5/M, z20.h, z0.h\n"
- "fmax z17.h, p5/M, z17.h, z1.h\n"
- "fmax z18.h, p5/M, z18.h, z1.h\n"
- "fmax z19.h, p5/M, z19.h, z1.h\n"
- "fmax z20.h, p5/M, z20.h, z1.h\n"
- "fmin z21.h, p5/M, z21.h, z0.h\n"
- "fmin z22.h, p5/M, z22.h, z0.h\n"
- "fmin z23.h, p5/M, z23.h, z0.h\n"
- "fmin z24.h, p5/M, z24.h, z0.h\n"
- "fmax z21.h, p5/M, z21.h, z1.h\n"
- "fmax z22.h, p5/M, z22.h, z1.h\n"
- "fmax z23.h, p5/M, z23.h, z1.h\n"
- "fmax z24.h, p5/M, z24.h, z1.h\n"
- "fmin z25.h, p5/M, z25.h, z0.h\n"
- "fmin z26.h, p5/M, z26.h, z0.h\n"
- "fmin z27.h, p5/M, z27.h, z0.h\n"
- "fmax z25.h, p5/M, z25.h, z1.h\n"
- "fmax z26.h, p5/M, z26.h, z1.h\n"
- "fmax z27.h, p5/M, z27.h, z1.h\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z29.h }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z28.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z29.h\n"
+ "fmin z9.h, p5/M, z9.h, z29.h\n"
+ "fmin z10.h, p5/M, z10.h, z29.h\n"
+ "fmin z11.h, p5/M, z11.h, z29.h\n"
+ "fmin z12.h, p5/M, z12.h, z29.h\n"
+ "fmin z13.h, p5/M, z13.h, z29.h\n"
+ "fmin z14.h, p5/M, z14.h, z29.h\n"
+ "fmin z15.h, p5/M, z15.h, z29.h\n"
+ "fmin z16.h, p5/M, z16.h, z29.h\n"
+ "fmin z17.h, p5/M, z17.h, z29.h\n"
+ "fmin z18.h, p5/M, z18.h, z29.h\n"
+ "fmin z19.h, p5/M, z19.h, z29.h\n"
+ "fmin z20.h, p5/M, z20.h, z29.h\n"
+ "fmin z21.h, p5/M, z21.h, z29.h\n"
+ "fmin z22.h, p5/M, z22.h, z29.h\n"
+ "fmin z23.h, p5/M, z23.h, z29.h\n"
+ "fmin z24.h, p5/M, z24.h, z29.h\n"
+ "fmin z25.h, p5/M, z25.h, z29.h\n"
+ "fmin z26.h, p5/M, z26.h, z29.h\n"
+ "fmin z27.h, p5/M, z27.h, z29.h\n"
+ "fmax z8.h, p5/M, z8.h, z28.h\n"
+ "fmax z9.h, p5/M, z9.h, z28.h\n"
+ "fmax z10.h, p5/M, z10.h, z28.h\n"
+ "fmax z11.h, p5/M, z11.h, z28.h\n"
+ "fmax z12.h, p5/M, z12.h, z28.h\n"
+ "fmax z13.h, p5/M, z13.h, z28.h\n"
+ "fmax z14.h, p5/M, z14.h, z28.h\n"
+ "fmax z15.h, p5/M, z15.h, z28.h\n"
+ "fmax z16.h, p5/M, z16.h, z28.h\n"
+ "fmax z17.h, p5/M, z17.h, z28.h\n"
+ "fmax z18.h, p5/M, z18.h, z28.h\n"
+ "fmax z19.h, p5/M, z19.h, z28.h\n"
+ "fmax z20.h, p5/M, z20.h, z28.h\n"
+ "fmax z21.h, p5/M, z21.h, z28.h\n"
+ "fmax z22.h, p5/M, z22.h, z28.h\n"
+ "fmax z23.h, p5/M, z23.h, z28.h\n"
+ "fmax z24.h, p5/M, z24.h, z28.h\n"
+ "fmax z25.h, p5/M, z25.h, z28.h\n"
+ "fmax z26.h, p5/M, z26.h, z28.h\n"
+ "fmax z27.h, p5/M, z27.h, z28.h\n"
"64:" // Height 5: No activation
- "st1h { z8.h }, p4, [x28]\n"
- "st1h { z9.h }, p3, [x28, #1, MUL VL]\n"
- "st1h { z10.h }, p2, [x28, #2, MUL VL]\n"
- "st1h { z11.h }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1h { z12.h }, p4, [x24]\n"
- "st1h { z13.h }, p3, [x24, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x24, #3, MUL VL]\n"
- "st1h { z16.h }, p4, [x23]\n"
- "st1h { z17.h }, p3, [x23, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x23, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x23, #3, MUL VL]\n"
- "st1h { z20.h }, p4, [x22]\n"
- "st1h { z21.h }, p3, [x22, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
- "st1h { z23.h }, p1, [x22, #3, MUL VL]\n"
- "st1h { z24.h }, p4, [x21]\n"
- "st1h { z25.h }, p3, [x21, #1, MUL VL]\n"
- "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z27.h }, p1, [x21, #3, MUL VL]\n"
+ "st1h { z8.h }, p4, [x9]\n"
+ "st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x23]\n"
+ "st1h { z21.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p4, [x22]\n"
+ "st1h { z25.h }, p3, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x22, #3, MUL VL]\n"
"65:" // Height 5: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 54b\n"
"b 80f\n"
"66:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0xc\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x19, #0xc\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"67:" // Height 6: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.h, x19, x11\n"
- "inch x19\n"
- "whilelt p3.h, x19, x11\n"
- "inch x19\n"
- "whilelt p2.h, x19, x11\n"
- "inch x19\n"
- "whilelt p1.h, x19, x11\n"
- "cbz x9, 68f\n"
- "ld1h { z8.h }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p3.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p2.h, x20, x11\n"
+ "inch x20\n"
+ "whilelt p1.h, x20, x11\n"
+ "cbz x12, 68f\n"
+ "ld1h { z8.h }, p5/Z, [x12]\n"
+ "ld1h { z9.h }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n"
- "mov z16.d, z8.d\n"
- "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n"
- "mov z20.d, z8.d\n"
- "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
- "mov z17.d, z9.d\n"
+ "ld1h { z10.h }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -2375,18 +2330,18 @@ void sve_hybrid_fp16_mla_6x4VL (
"b 70f\n"
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1h { z8.h }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #1\n"
- "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x24, x19, LSL #1\n"
- "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n"
- "add x22, x23, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "ld1h { z8.h }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
+ "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #1\n"
+ "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
"ld1h { z12.h }, p4/Z, [x24]\n"
- "add x21, x22, x19, LSL #1\n"
"ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
- "add x20, x21, x19, LSL #1\n"
"ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
"ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
"ld1h { z16.h }, p4/Z, [x23]\n"
@@ -2432,666 +2387,647 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z30.b, #0x0\n"
"mov z31.b, #0x0\n"
"70:" // Height 6: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x27, 73f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #1\n"
- "add x24, x24, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x22, x22, x19, LSL #1\n"
- "add x21, x21, x19, LSL #1\n"
- "add x20, x20, x19, LSL #1\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 73f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #1\n"
+ "add x25, x25, x20, LSL #1\n"
+ "add x24, x24, x20, LSL #1\n"
+ "add x23, x23, x20, LSL #1\n"
+ "add x22, x22, x20, LSL #1\n"
+ "add x21, x21, x20, LSL #1\n"
"b 73f\n"
"72:" // Height 6: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
- "add x21, x22, x19, LSL #1\n"
- "add x20, x21, x19, LSL #1\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #1\n"
+ "add x24, x25, x21, LSL #1\n"
+ "add x23, x24, x21, LSL #1\n"
+ "add x22, x23, x21, LSL #1\n"
+ "add x21, x22, x21, LSL #1\n"
"73:" // Height 6: input setup done
- "cmp x26, #0x8\n"
+ "cmp x27, #0x8\n"
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z7.h }, p0/Z, [x26]\n"
+ "ld1rqh { z6.h }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x8\n"
+ "ld1rqh { z5.h }, p0/Z, [x24]\n"
+ "ld1rqh { z4.h }, p0/Z, [x23]\n"
+ "cmp x27, #0x8\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "ld1rqh { z2.h }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[0]\n"
+ "fmla z12.h, z1.h, z6.h[0]\n"
+ "fmla z16.h, z1.h, z5.h[0]\n"
+ "fmla z20.h, z1.h, z4.h[0]\n"
"add x23, x23, #0x10\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "ld1rqh { z5.h }, p0/Z, [x20]\n"
+ "fmla z24.h, z1.h, z3.h[0]\n"
+ "fmla z28.h, z1.h, z2.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
"add x21, x21, #0x10\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x20, x20, #0x10\n"
- "fmla z24.h, z6.h, z4.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
- "fmla z28.h, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "fmla z25.h, z7.h, z4.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "fmla z29.h, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z26.h, z6.h, z4.h[0]\n"
- "fmla z30.h, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "fmla z27.h, z7.h, z4.h[0]\n"
- "fmla z31.h, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "fmla z24.h, z6.h, z4.h[1]\n"
- "fmla z28.h, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "fmla z25.h, z7.h, z4.h[1]\n"
- "fmla z29.h, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[0]\n"
+ "fmla z13.h, z0.h, z6.h[0]\n"
+ "fmla z17.h, z0.h, z5.h[0]\n"
+ "fmla z21.h, z0.h, z4.h[0]\n"
+ "fmla z25.h, z0.h, z3.h[0]\n"
+ "fmla z29.h, z0.h, z2.h[0]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[0]\n"
+ "fmla z14.h, z1.h, z6.h[0]\n"
+ "fmla z18.h, z1.h, z5.h[0]\n"
+ "fmla z22.h, z1.h, z4.h[0]\n"
+ "fmla z26.h, z1.h, z3.h[0]\n"
+ "fmla z30.h, z1.h, z2.h[0]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[0]\n"
+ "fmla z15.h, z0.h, z6.h[0]\n"
+ "fmla z19.h, z0.h, z5.h[0]\n"
+ "fmla z23.h, z0.h, z4.h[0]\n"
+ "fmla z27.h, z0.h, z3.h[0]\n"
+ "fmla z31.h, z0.h, z2.h[0]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[1]\n"
+ "fmla z12.h, z1.h, z6.h[1]\n"
+ "fmla z16.h, z1.h, z5.h[1]\n"
+ "fmla z20.h, z1.h, z4.h[1]\n"
+ "fmla z24.h, z1.h, z3.h[1]\n"
+ "fmla z28.h, z1.h, z2.h[1]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[1]\n"
+ "fmla z13.h, z0.h, z6.h[1]\n"
+ "fmla z17.h, z0.h, z5.h[1]\n"
+ "fmla z21.h, z0.h, z4.h[1]\n"
+ "fmla z25.h, z0.h, z3.h[1]\n"
+ "fmla z29.h, z0.h, z2.h[1]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z26.h, z6.h, z4.h[1]\n"
- "fmla z30.h, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "fmla z27.h, z7.h, z4.h[1]\n"
- "fmla z31.h, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "fmla z24.h, z6.h, z4.h[2]\n"
- "fmla z28.h, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "fmla z25.h, z7.h, z4.h[2]\n"
- "fmla z29.h, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z26.h, z6.h, z4.h[2]\n"
- "fmla z30.h, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "fmla z27.h, z7.h, z4.h[2]\n"
- "fmla z31.h, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "fmla z24.h, z6.h, z4.h[3]\n"
- "fmla z28.h, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "fmla z25.h, z7.h, z4.h[3]\n"
- "fmla z29.h, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z26.h, z6.h, z4.h[3]\n"
- "fmla z30.h, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "fmla z27.h, z7.h, z4.h[3]\n"
- "fmla z31.h, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "fmla z24.h, z6.h, z4.h[4]\n"
- "fmla z28.h, z6.h, z5.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "fmla z25.h, z7.h, z4.h[4]\n"
- "fmla z29.h, z7.h, z5.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z26.h, z6.h, z4.h[4]\n"
- "fmla z30.h, z6.h, z5.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "fmla z27.h, z7.h, z4.h[4]\n"
- "fmla z31.h, z7.h, z5.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "fmla z24.h, z6.h, z4.h[5]\n"
- "fmla z28.h, z6.h, z5.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "fmla z25.h, z7.h, z4.h[5]\n"
- "fmla z29.h, z7.h, z5.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[1]\n"
+ "fmla z14.h, z1.h, z6.h[1]\n"
+ "fmla z18.h, z1.h, z5.h[1]\n"
+ "fmla z22.h, z1.h, z4.h[1]\n"
+ "fmla z26.h, z1.h, z3.h[1]\n"
+ "fmla z30.h, z1.h, z2.h[1]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[1]\n"
+ "fmla z15.h, z0.h, z6.h[1]\n"
+ "fmla z19.h, z0.h, z5.h[1]\n"
+ "fmla z23.h, z0.h, z4.h[1]\n"
+ "fmla z27.h, z0.h, z3.h[1]\n"
+ "fmla z31.h, z0.h, z2.h[1]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[2]\n"
+ "fmla z12.h, z1.h, z6.h[2]\n"
+ "fmla z16.h, z1.h, z5.h[2]\n"
+ "fmla z20.h, z1.h, z4.h[2]\n"
+ "fmla z24.h, z1.h, z3.h[2]\n"
+ "fmla z28.h, z1.h, z2.h[2]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[2]\n"
+ "fmla z13.h, z0.h, z6.h[2]\n"
+ "fmla z17.h, z0.h, z5.h[2]\n"
+ "fmla z21.h, z0.h, z4.h[2]\n"
+ "fmla z25.h, z0.h, z3.h[2]\n"
+ "fmla z29.h, z0.h, z2.h[2]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[2]\n"
+ "fmla z14.h, z1.h, z6.h[2]\n"
+ "fmla z18.h, z1.h, z5.h[2]\n"
+ "fmla z22.h, z1.h, z4.h[2]\n"
+ "fmla z26.h, z1.h, z3.h[2]\n"
+ "fmla z30.h, z1.h, z2.h[2]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[2]\n"
+ "fmla z15.h, z0.h, z6.h[2]\n"
+ "fmla z19.h, z0.h, z5.h[2]\n"
+ "fmla z23.h, z0.h, z4.h[2]\n"
+ "fmla z27.h, z0.h, z3.h[2]\n"
+ "fmla z31.h, z0.h, z2.h[2]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[3]\n"
+ "fmla z12.h, z1.h, z6.h[3]\n"
+ "fmla z16.h, z1.h, z5.h[3]\n"
+ "fmla z20.h, z1.h, z4.h[3]\n"
+ "fmla z24.h, z1.h, z3.h[3]\n"
+ "fmla z28.h, z1.h, z2.h[3]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[3]\n"
+ "fmla z13.h, z0.h, z6.h[3]\n"
+ "fmla z17.h, z0.h, z5.h[3]\n"
+ "fmla z21.h, z0.h, z4.h[3]\n"
+ "fmla z25.h, z0.h, z3.h[3]\n"
+ "fmla z29.h, z0.h, z2.h[3]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[3]\n"
+ "fmla z14.h, z1.h, z6.h[3]\n"
+ "fmla z18.h, z1.h, z5.h[3]\n"
+ "fmla z22.h, z1.h, z4.h[3]\n"
+ "fmla z26.h, z1.h, z3.h[3]\n"
+ "fmla z30.h, z1.h, z2.h[3]\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "fmla z11.h, z0.h, z7.h[3]\n"
+ "fmla z15.h, z0.h, z6.h[3]\n"
+ "fmla z19.h, z0.h, z5.h[3]\n"
+ "fmla z23.h, z0.h, z4.h[3]\n"
+ "fmla z27.h, z0.h, z3.h[3]\n"
+ "fmla z31.h, z0.h, z2.h[3]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[4]\n"
+ "fmla z12.h, z1.h, z6.h[4]\n"
+ "fmla z16.h, z1.h, z5.h[4]\n"
+ "fmla z20.h, z1.h, z4.h[4]\n"
+ "fmla z24.h, z1.h, z3.h[4]\n"
+ "fmla z28.h, z1.h, z2.h[4]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[4]\n"
+ "fmla z13.h, z0.h, z6.h[4]\n"
+ "fmla z17.h, z0.h, z5.h[4]\n"
+ "fmla z21.h, z0.h, z4.h[4]\n"
+ "fmla z25.h, z0.h, z3.h[4]\n"
+ "fmla z29.h, z0.h, z2.h[4]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[4]\n"
+ "fmla z14.h, z1.h, z6.h[4]\n"
+ "fmla z18.h, z1.h, z5.h[4]\n"
+ "fmla z22.h, z1.h, z4.h[4]\n"
+ "fmla z26.h, z1.h, z3.h[4]\n"
+ "fmla z30.h, z1.h, z2.h[4]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[4]\n"
+ "fmla z15.h, z0.h, z6.h[4]\n"
+ "fmla z19.h, z0.h, z5.h[4]\n"
+ "fmla z23.h, z0.h, z4.h[4]\n"
+ "fmla z27.h, z0.h, z3.h[4]\n"
+ "fmla z31.h, z0.h, z2.h[4]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[5]\n"
+ "fmla z12.h, z1.h, z6.h[5]\n"
+ "fmla z16.h, z1.h, z5.h[5]\n"
+ "fmla z20.h, z1.h, z4.h[5]\n"
+ "fmla z24.h, z1.h, z3.h[5]\n"
+ "fmla z28.h, z1.h, z2.h[5]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[5]\n"
+ "fmla z13.h, z0.h, z6.h[5]\n"
+ "fmla z17.h, z0.h, z5.h[5]\n"
+ "fmla z21.h, z0.h, z4.h[5]\n"
+ "fmla z25.h, z0.h, z3.h[5]\n"
+ "fmla z29.h, z0.h, z2.h[5]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z26.h, z6.h, z4.h[5]\n"
- "fmla z30.h, z6.h, z5.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "fmla z27.h, z7.h, z4.h[5]\n"
- "fmla z31.h, z7.h, z5.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "fmla z24.h, z6.h, z4.h[6]\n"
- "fmla z28.h, z6.h, z5.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "fmla z25.h, z7.h, z4.h[6]\n"
- "fmla z29.h, z7.h, z5.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z26.h, z6.h, z4.h[6]\n"
- "fmla z30.h, z6.h, z5.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "fmla z27.h, z7.h, z4.h[6]\n"
- "fmla z31.h, z7.h, z5.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "fmla z24.h, z6.h, z4.h[7]\n"
- "fmla z28.h, z6.h, z5.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "fmla z25.h, z7.h, z4.h[7]\n"
- "fmla z29.h, z7.h, z5.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z26.h, z6.h, z4.h[7]\n"
- "fmla z30.h, z6.h, z5.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
- "fmla z27.h, z7.h, z4.h[7]\n"
- "fmla z31.h, z7.h, z5.h[7]\n"
+ "fmla z10.h, z1.h, z7.h[5]\n"
+ "fmla z14.h, z1.h, z6.h[5]\n"
+ "fmla z18.h, z1.h, z5.h[5]\n"
+ "fmla z22.h, z1.h, z4.h[5]\n"
+ "fmla z26.h, z1.h, z3.h[5]\n"
+ "fmla z30.h, z1.h, z2.h[5]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[5]\n"
+ "fmla z15.h, z0.h, z6.h[5]\n"
+ "fmla z19.h, z0.h, z5.h[5]\n"
+ "fmla z23.h, z0.h, z4.h[5]\n"
+ "fmla z27.h, z0.h, z3.h[5]\n"
+ "fmla z31.h, z0.h, z2.h[5]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[6]\n"
+ "fmla z12.h, z1.h, z6.h[6]\n"
+ "fmla z16.h, z1.h, z5.h[6]\n"
+ "fmla z20.h, z1.h, z4.h[6]\n"
+ "fmla z24.h, z1.h, z3.h[6]\n"
+ "fmla z28.h, z1.h, z2.h[6]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[6]\n"
+ "fmla z13.h, z0.h, z6.h[6]\n"
+ "fmla z17.h, z0.h, z5.h[6]\n"
+ "fmla z21.h, z0.h, z4.h[6]\n"
+ "fmla z25.h, z0.h, z3.h[6]\n"
+ "fmla z29.h, z0.h, z2.h[6]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[6]\n"
+ "fmla z14.h, z1.h, z6.h[6]\n"
+ "fmla z18.h, z1.h, z5.h[6]\n"
+ "fmla z22.h, z1.h, z4.h[6]\n"
+ "fmla z26.h, z1.h, z3.h[6]\n"
+ "fmla z30.h, z1.h, z2.h[6]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.h, z0.h, z7.h[6]\n"
+ "fmla z15.h, z0.h, z6.h[6]\n"
+ "fmla z19.h, z0.h, z5.h[6]\n"
+ "fmla z23.h, z0.h, z4.h[6]\n"
+ "fmla z27.h, z0.h, z3.h[6]\n"
+ "fmla z31.h, z0.h, z2.h[6]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.h, z1.h, z7.h[7]\n"
+ "fmla z12.h, z1.h, z6.h[7]\n"
+ "fmla z16.h, z1.h, z5.h[7]\n"
+ "fmla z20.h, z1.h, z4.h[7]\n"
+ "fmla z24.h, z1.h, z3.h[7]\n"
+ "fmla z28.h, z1.h, z2.h[7]\n"
+ "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.h, z0.h, z7.h[7]\n"
+ "fmla z13.h, z0.h, z6.h[7]\n"
+ "fmla z17.h, z0.h, z5.h[7]\n"
+ "fmla z21.h, z0.h, z4.h[7]\n"
+ "fmla z25.h, z0.h, z3.h[7]\n"
+ "fmla z29.h, z0.h, z2.h[7]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.h, z1.h, z7.h[7]\n"
+ "fmla z14.h, z1.h, z6.h[7]\n"
+ "fmla z18.h, z1.h, z5.h[7]\n"
+ "fmla z22.h, z1.h, z4.h[7]\n"
+ "fmla z26.h, z1.h, z3.h[7]\n"
+ "fmla z30.h, z1.h, z2.h[7]\n"
+ "fmla z11.h, z0.h, z7.h[7]\n"
+ "fmla z15.h, z0.h, z6.h[7]\n"
+ "fmla z19.h, z0.h, z5.h[7]\n"
+ "fmla z23.h, z0.h, z4.h[7]\n"
+ "fmla z27.h, z0.h, z3.h[7]\n"
+ "fmla z31.h, z0.h, z2.h[7]\n"
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "whilelt p0.h, XZR, x26\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "ld1rqh { z0.h }, p0/Z, [x25]\n"
- "fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "ld1rqh { z5.h }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
- "fmla z20.h, z6.h, z3.h[0]\n"
- "add x20, x20, #0x10\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "fmla z24.h, z6.h, z4.h[0]\n"
- "fmla z28.h, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z21.h, z7.h, z3.h[0]\n"
- "fmla z25.h, z7.h, z4.h[0]\n"
- "fmla z29.h, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.h, XZR, x27\n"
+ "ld1rqh { z0.h }, p0/Z, [x26]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "ld1rqh { z5.h }, p0/Z, [x21]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[0]\n"
+ "fmla z12.h, z7.h, z1.h[0]\n"
+ "fmla z16.h, z7.h, z2.h[0]\n"
+ "fmla z20.h, z7.h, z3.h[0]\n"
+ "fmla z24.h, z7.h, z4.h[0]\n"
+ "fmla z28.h, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[0]\n"
+ "fmla z13.h, z6.h, z1.h[0]\n"
+ "fmla z17.h, z6.h, z2.h[0]\n"
+ "fmla z21.h, z6.h, z3.h[0]\n"
+ "fmla z25.h, z6.h, z4.h[0]\n"
+ "fmla z29.h, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[0]\n"
- "fmla z14.h, z6.h, z1.h[0]\n"
- "fmla z18.h, z6.h, z2.h[0]\n"
- "fmla z22.h, z6.h, z3.h[0]\n"
- "fmla z26.h, z6.h, z4.h[0]\n"
- "fmla z30.h, z6.h, z5.h[0]\n"
- "fmla z11.h, z7.h, z0.h[0]\n"
- "fmla z15.h, z7.h, z1.h[0]\n"
- "fmla z19.h, z7.h, z2.h[0]\n"
- "fmla z23.h, z7.h, z3.h[0]\n"
- "fmla z27.h, z7.h, z4.h[0]\n"
- "fmla z31.h, z7.h, z5.h[0]\n"
+ "fmla z10.h, z7.h, z0.h[0]\n"
+ "fmla z14.h, z7.h, z1.h[0]\n"
+ "fmla z18.h, z7.h, z2.h[0]\n"
+ "fmla z22.h, z7.h, z3.h[0]\n"
+ "fmla z26.h, z7.h, z4.h[0]\n"
+ "fmla z30.h, z7.h, z5.h[0]\n"
+ "fmla z11.h, z6.h, z0.h[0]\n"
+ "fmla z15.h, z6.h, z1.h[0]\n"
+ "fmla z19.h, z6.h, z2.h[0]\n"
+ "fmla z23.h, z6.h, z3.h[0]\n"
+ "fmla z27.h, z6.h, z4.h[0]\n"
+ "fmla z31.h, z6.h, z5.h[0]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[1]\n"
- "fmla z16.h, z6.h, z2.h[1]\n"
- "fmla z20.h, z6.h, z3.h[1]\n"
- "fmla z24.h, z6.h, z4.h[1]\n"
- "fmla z28.h, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[1]\n"
- "fmla z13.h, z7.h, z1.h[1]\n"
- "fmla z17.h, z7.h, z2.h[1]\n"
- "fmla z21.h, z7.h, z3.h[1]\n"
- "fmla z25.h, z7.h, z4.h[1]\n"
- "fmla z29.h, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[1]\n"
+ "fmla z12.h, z7.h, z1.h[1]\n"
+ "fmla z16.h, z7.h, z2.h[1]\n"
+ "fmla z20.h, z7.h, z3.h[1]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, z7.h, z4.h[1]\n"
+ "fmla z28.h, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[1]\n"
+ "fmla z13.h, z6.h, z1.h[1]\n"
+ "fmla z17.h, z6.h, z2.h[1]\n"
+ "fmla z21.h, z6.h, z3.h[1]\n"
+ "fmla z25.h, z6.h, z4.h[1]\n"
+ "fmla z29.h, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[1]\n"
- "fmla z14.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z2.h[1]\n"
- "fmla z22.h, z6.h, z3.h[1]\n"
- "fmla z26.h, z6.h, z4.h[1]\n"
- "fmla z30.h, z6.h, z5.h[1]\n"
- "fmla z11.h, z7.h, z0.h[1]\n"
- "fmla z15.h, z7.h, z1.h[1]\n"
- "fmla z19.h, z7.h, z2.h[1]\n"
- "fmla z23.h, z7.h, z3.h[1]\n"
- "fmla z27.h, z7.h, z4.h[1]\n"
- "fmla z31.h, z7.h, z5.h[1]\n"
+ "fmla z10.h, z7.h, z0.h[1]\n"
+ "fmla z14.h, z7.h, z1.h[1]\n"
+ "fmla z18.h, z7.h, z2.h[1]\n"
+ "fmla z22.h, z7.h, z3.h[1]\n"
+ "fmla z26.h, z7.h, z4.h[1]\n"
+ "fmla z30.h, z7.h, z5.h[1]\n"
+ "fmla z11.h, z6.h, z0.h[1]\n"
+ "fmla z15.h, z6.h, z1.h[1]\n"
+ "fmla z19.h, z6.h, z2.h[1]\n"
+ "fmla z23.h, z6.h, z3.h[1]\n"
+ "fmla z27.h, z6.h, z4.h[1]\n"
+ "fmla z31.h, z6.h, z5.h[1]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[2]\n"
- "fmla z16.h, z6.h, z2.h[2]\n"
- "fmla z20.h, z6.h, z3.h[2]\n"
- "fmla z24.h, z6.h, z4.h[2]\n"
- "fmla z28.h, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[2]\n"
- "fmla z13.h, z7.h, z1.h[2]\n"
- "fmla z17.h, z7.h, z2.h[2]\n"
- "fmla z21.h, z7.h, z3.h[2]\n"
- "fmla z25.h, z7.h, z4.h[2]\n"
- "fmla z29.h, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[2]\n"
+ "fmla z12.h, z7.h, z1.h[2]\n"
+ "fmla z16.h, z7.h, z2.h[2]\n"
+ "fmla z20.h, z7.h, z3.h[2]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, z7.h, z4.h[2]\n"
+ "fmla z28.h, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[2]\n"
+ "fmla z13.h, z6.h, z1.h[2]\n"
+ "fmla z17.h, z6.h, z2.h[2]\n"
+ "fmla z21.h, z6.h, z3.h[2]\n"
+ "fmla z25.h, z6.h, z4.h[2]\n"
+ "fmla z29.h, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[2]\n"
- "fmla z14.h, z6.h, z1.h[2]\n"
- "fmla z18.h, z6.h, z2.h[2]\n"
- "fmla z22.h, z6.h, z3.h[2]\n"
- "fmla z26.h, z6.h, z4.h[2]\n"
- "fmla z30.h, z6.h, z5.h[2]\n"
- "fmla z11.h, z7.h, z0.h[2]\n"
- "fmla z15.h, z7.h, z1.h[2]\n"
- "fmla z19.h, z7.h, z2.h[2]\n"
- "fmla z23.h, z7.h, z3.h[2]\n"
- "fmla z27.h, z7.h, z4.h[2]\n"
- "fmla z31.h, z7.h, z5.h[2]\n"
+ "fmla z10.h, z7.h, z0.h[2]\n"
+ "fmla z14.h, z7.h, z1.h[2]\n"
+ "fmla z18.h, z7.h, z2.h[2]\n"
+ "fmla z22.h, z7.h, z3.h[2]\n"
+ "fmla z26.h, z7.h, z4.h[2]\n"
+ "fmla z30.h, z7.h, z5.h[2]\n"
+ "fmla z11.h, z6.h, z0.h[2]\n"
+ "fmla z15.h, z6.h, z1.h[2]\n"
+ "fmla z19.h, z6.h, z2.h[2]\n"
+ "fmla z23.h, z6.h, z3.h[2]\n"
+ "fmla z27.h, z6.h, z4.h[2]\n"
+ "fmla z31.h, z6.h, z5.h[2]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[3]\n"
- "fmla z16.h, z6.h, z2.h[3]\n"
- "fmla z20.h, z6.h, z3.h[3]\n"
- "fmla z24.h, z6.h, z4.h[3]\n"
- "fmla z28.h, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[3]\n"
- "fmla z13.h, z7.h, z1.h[3]\n"
- "fmla z17.h, z7.h, z2.h[3]\n"
- "fmla z21.h, z7.h, z3.h[3]\n"
- "fmla z25.h, z7.h, z4.h[3]\n"
- "fmla z29.h, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[3]\n"
+ "fmla z12.h, z7.h, z1.h[3]\n"
+ "fmla z16.h, z7.h, z2.h[3]\n"
+ "fmla z20.h, z7.h, z3.h[3]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, z7.h, z4.h[3]\n"
+ "fmla z28.h, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[3]\n"
+ "fmla z13.h, z6.h, z1.h[3]\n"
+ "fmla z17.h, z6.h, z2.h[3]\n"
+ "fmla z21.h, z6.h, z3.h[3]\n"
+ "fmla z25.h, z6.h, z4.h[3]\n"
+ "fmla z29.h, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[3]\n"
- "fmla z14.h, z6.h, z1.h[3]\n"
- "fmla z18.h, z6.h, z2.h[3]\n"
- "fmla z22.h, z6.h, z3.h[3]\n"
- "fmla z26.h, z6.h, z4.h[3]\n"
- "fmla z30.h, z6.h, z5.h[3]\n"
- "fmla z11.h, z7.h, z0.h[3]\n"
- "fmla z15.h, z7.h, z1.h[3]\n"
- "fmla z19.h, z7.h, z2.h[3]\n"
- "fmla z23.h, z7.h, z3.h[3]\n"
- "fmla z27.h, z7.h, z4.h[3]\n"
- "fmla z31.h, z7.h, z5.h[3]\n"
+ "fmla z10.h, z7.h, z0.h[3]\n"
+ "fmla z14.h, z7.h, z1.h[3]\n"
+ "fmla z18.h, z7.h, z2.h[3]\n"
+ "fmla z22.h, z7.h, z3.h[3]\n"
+ "fmla z26.h, z7.h, z4.h[3]\n"
+ "fmla z30.h, z7.h, z5.h[3]\n"
+ "fmla z11.h, z6.h, z0.h[3]\n"
+ "fmla z15.h, z6.h, z1.h[3]\n"
+ "fmla z19.h, z6.h, z2.h[3]\n"
+ "fmla z23.h, z6.h, z3.h[3]\n"
+ "fmla z27.h, z6.h, z4.h[3]\n"
+ "fmla z31.h, z6.h, z5.h[3]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[4]\n"
- "fmla z16.h, z6.h, z2.h[4]\n"
- "fmla z20.h, z6.h, z3.h[4]\n"
- "fmla z24.h, z6.h, z4.h[4]\n"
- "fmla z28.h, z6.h, z5.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[4]\n"
- "fmla z13.h, z7.h, z1.h[4]\n"
- "fmla z17.h, z7.h, z2.h[4]\n"
- "fmla z21.h, z7.h, z3.h[4]\n"
- "fmla z25.h, z7.h, z4.h[4]\n"
- "fmla z29.h, z7.h, z5.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[4]\n"
+ "fmla z12.h, z7.h, z1.h[4]\n"
+ "fmla z16.h, z7.h, z2.h[4]\n"
+ "fmla z20.h, z7.h, z3.h[4]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, z7.h, z4.h[4]\n"
+ "fmla z28.h, z7.h, z5.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[4]\n"
+ "fmla z13.h, z6.h, z1.h[4]\n"
+ "fmla z17.h, z6.h, z2.h[4]\n"
+ "fmla z21.h, z6.h, z3.h[4]\n"
+ "fmla z25.h, z6.h, z4.h[4]\n"
+ "fmla z29.h, z6.h, z5.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[4]\n"
- "fmla z14.h, z6.h, z1.h[4]\n"
- "fmla z18.h, z6.h, z2.h[4]\n"
- "fmla z22.h, z6.h, z3.h[4]\n"
- "fmla z26.h, z6.h, z4.h[4]\n"
- "fmla z30.h, z6.h, z5.h[4]\n"
- "fmla z11.h, z7.h, z0.h[4]\n"
- "fmla z15.h, z7.h, z1.h[4]\n"
- "fmla z19.h, z7.h, z2.h[4]\n"
- "fmla z23.h, z7.h, z3.h[4]\n"
- "fmla z27.h, z7.h, z4.h[4]\n"
- "fmla z31.h, z7.h, z5.h[4]\n"
+ "fmla z10.h, z7.h, z0.h[4]\n"
+ "fmla z14.h, z7.h, z1.h[4]\n"
+ "fmla z18.h, z7.h, z2.h[4]\n"
+ "fmla z22.h, z7.h, z3.h[4]\n"
+ "fmla z26.h, z7.h, z4.h[4]\n"
+ "fmla z30.h, z7.h, z5.h[4]\n"
+ "fmla z11.h, z6.h, z0.h[4]\n"
+ "fmla z15.h, z6.h, z1.h[4]\n"
+ "fmla z19.h, z6.h, z2.h[4]\n"
+ "fmla z23.h, z6.h, z3.h[4]\n"
+ "fmla z27.h, z6.h, z4.h[4]\n"
+ "fmla z31.h, z6.h, z5.h[4]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[5]\n"
- "fmla z16.h, z6.h, z2.h[5]\n"
- "fmla z20.h, z6.h, z3.h[5]\n"
- "fmla z24.h, z6.h, z4.h[5]\n"
- "fmla z28.h, z6.h, z5.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[5]\n"
- "fmla z13.h, z7.h, z1.h[5]\n"
- "fmla z17.h, z7.h, z2.h[5]\n"
- "fmla z21.h, z7.h, z3.h[5]\n"
- "fmla z25.h, z7.h, z4.h[5]\n"
- "fmla z29.h, z7.h, z5.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[5]\n"
+ "fmla z12.h, z7.h, z1.h[5]\n"
+ "fmla z16.h, z7.h, z2.h[5]\n"
+ "fmla z20.h, z7.h, z3.h[5]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, z7.h, z4.h[5]\n"
+ "fmla z28.h, z7.h, z5.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[5]\n"
+ "fmla z13.h, z6.h, z1.h[5]\n"
+ "fmla z17.h, z6.h, z2.h[5]\n"
+ "fmla z21.h, z6.h, z3.h[5]\n"
+ "fmla z25.h, z6.h, z4.h[5]\n"
+ "fmla z29.h, z6.h, z5.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[5]\n"
- "fmla z14.h, z6.h, z1.h[5]\n"
- "fmla z18.h, z6.h, z2.h[5]\n"
- "fmla z22.h, z6.h, z3.h[5]\n"
- "fmla z26.h, z6.h, z4.h[5]\n"
- "fmla z30.h, z6.h, z5.h[5]\n"
- "fmla z11.h, z7.h, z0.h[5]\n"
- "fmla z15.h, z7.h, z1.h[5]\n"
- "fmla z19.h, z7.h, z2.h[5]\n"
- "fmla z23.h, z7.h, z3.h[5]\n"
- "fmla z27.h, z7.h, z4.h[5]\n"
- "fmla z31.h, z7.h, z5.h[5]\n"
+ "fmla z10.h, z7.h, z0.h[5]\n"
+ "fmla z14.h, z7.h, z1.h[5]\n"
+ "fmla z18.h, z7.h, z2.h[5]\n"
+ "fmla z22.h, z7.h, z3.h[5]\n"
+ "fmla z26.h, z7.h, z4.h[5]\n"
+ "fmla z30.h, z7.h, z5.h[5]\n"
+ "fmla z11.h, z6.h, z0.h[5]\n"
+ "fmla z15.h, z6.h, z1.h[5]\n"
+ "fmla z19.h, z6.h, z2.h[5]\n"
+ "fmla z23.h, z6.h, z3.h[5]\n"
+ "fmla z27.h, z6.h, z4.h[5]\n"
+ "fmla z31.h, z6.h, z5.h[5]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.h, z6.h, z1.h[6]\n"
- "fmla z16.h, z6.h, z2.h[6]\n"
- "fmla z20.h, z6.h, z3.h[6]\n"
- "fmla z24.h, z6.h, z4.h[6]\n"
- "fmla z28.h, z6.h, z5.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[6]\n"
- "fmla z13.h, z7.h, z1.h[6]\n"
- "fmla z17.h, z7.h, z2.h[6]\n"
- "fmla z21.h, z7.h, z3.h[6]\n"
- "fmla z25.h, z7.h, z4.h[6]\n"
- "fmla z29.h, z7.h, z5.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[6]\n"
+ "fmla z12.h, z7.h, z1.h[6]\n"
+ "fmla z16.h, z7.h, z2.h[6]\n"
+ "fmla z20.h, z7.h, z3.h[6]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.h, z7.h, z4.h[6]\n"
+ "fmla z28.h, z7.h, z5.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[6]\n"
+ "fmla z13.h, z6.h, z1.h[6]\n"
+ "fmla z17.h, z6.h, z2.h[6]\n"
+ "fmla z21.h, z6.h, z3.h[6]\n"
+ "fmla z25.h, z6.h, z4.h[6]\n"
+ "fmla z29.h, z6.h, z5.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[6]\n"
- "fmla z14.h, z6.h, z1.h[6]\n"
- "fmla z18.h, z6.h, z2.h[6]\n"
- "fmla z22.h, z6.h, z3.h[6]\n"
- "fmla z26.h, z6.h, z4.h[6]\n"
- "fmla z30.h, z6.h, z5.h[6]\n"
- "fmla z11.h, z7.h, z0.h[6]\n"
- "fmla z15.h, z7.h, z1.h[6]\n"
- "fmla z19.h, z7.h, z2.h[6]\n"
- "fmla z23.h, z7.h, z3.h[6]\n"
- "fmla z27.h, z7.h, z4.h[6]\n"
- "fmla z31.h, z7.h, z5.h[6]\n"
+ "fmla z10.h, z7.h, z0.h[6]\n"
+ "fmla z14.h, z7.h, z1.h[6]\n"
+ "fmla z18.h, z7.h, z2.h[6]\n"
+ "fmla z22.h, z7.h, z3.h[6]\n"
+ "fmla z26.h, z7.h, z4.h[6]\n"
+ "fmla z30.h, z7.h, z5.h[6]\n"
+ "fmla z11.h, z6.h, z0.h[6]\n"
+ "fmla z15.h, z6.h, z1.h[6]\n"
+ "fmla z19.h, z6.h, z2.h[6]\n"
+ "fmla z23.h, z6.h, z3.h[6]\n"
+ "fmla z27.h, z6.h, z4.h[6]\n"
+ "fmla z31.h, z6.h, z5.h[6]\n"
"ble 76f\n"
- "ld1h { z6.h }, p5/Z, [x10]\n"
- "fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z12.h, z6.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z2.h[7]\n"
- "fmla z20.h, z6.h, z3.h[7]\n"
- "fmla z24.h, z6.h, z4.h[7]\n"
- "fmla z28.h, z6.h, z5.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.h, z7.h, z0.h[7]\n"
- "fmla z13.h, z7.h, z1.h[7]\n"
- "fmla z17.h, z7.h, z2.h[7]\n"
- "fmla z21.h, z7.h, z3.h[7]\n"
- "fmla z25.h, z7.h, z4.h[7]\n"
- "fmla z29.h, z7.h, z5.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.h, z7.h, z0.h[7]\n"
+ "fmla z12.h, z7.h, z1.h[7]\n"
+ "fmla z16.h, z7.h, z2.h[7]\n"
+ "fmla z20.h, z7.h, z3.h[7]\n"
+ "fmla z24.h, z7.h, z4.h[7]\n"
+ "fmla z28.h, z7.h, z5.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.h, z6.h, z0.h[7]\n"
+ "fmla z13.h, z6.h, z1.h[7]\n"
+ "fmla z17.h, z6.h, z2.h[7]\n"
+ "fmla z21.h, z6.h, z3.h[7]\n"
+ "fmla z25.h, z6.h, z4.h[7]\n"
+ "fmla z29.h, z6.h, z5.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.h, z6.h, z0.h[7]\n"
- "fmla z14.h, z6.h, z1.h[7]\n"
- "fmla z18.h, z6.h, z2.h[7]\n"
- "fmla z22.h, z6.h, z3.h[7]\n"
- "fmla z26.h, z6.h, z4.h[7]\n"
- "fmla z30.h, z6.h, z5.h[7]\n"
- "fmla z11.h, z7.h, z0.h[7]\n"
- "fmla z15.h, z7.h, z1.h[7]\n"
- "fmla z19.h, z7.h, z2.h[7]\n"
- "fmla z23.h, z7.h, z3.h[7]\n"
- "fmla z27.h, z7.h, z4.h[7]\n"
- "fmla z31.h, z7.h, z5.h[7]\n"
+ "fmla z10.h, z7.h, z0.h[7]\n"
+ "fmla z14.h, z7.h, z1.h[7]\n"
+ "fmla z18.h, z7.h, z2.h[7]\n"
+ "fmla z22.h, z7.h, z3.h[7]\n"
+ "fmla z26.h, z7.h, z4.h[7]\n"
+ "fmla z30.h, z7.h, z5.h[7]\n"
+ "fmla z11.h, z6.h, z0.h[7]\n"
+ "fmla z15.h, z6.h, z1.h[7]\n"
+ "fmla z19.h, z6.h, z2.h[7]\n"
+ "fmla z23.h, z6.h, z3.h[7]\n"
+ "fmla z27.h, z6.h, z4.h[7]\n"
+ "fmla z31.h, z6.h, z5.h[7]\n"
"76:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 71b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #1\n"
- "add x23, x24, x19, LSL #1\n"
- "add x22, x23, x19, LSL #1\n"
- "add x21, x22, x19, LSL #1\n"
- "add x20, x21, x19, LSL #1\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "add x23, x24, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "add x21, x22, x20, LSL #1\n"
"tbz %x[flags], #1, 77f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rh { z1.h }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rh { z0.h }, p5/Z, [x19]\n"
- "fmin z8.h, p5/M, z8.h, z0.h\n"
- "fmin z9.h, p5/M, z9.h, z0.h\n"
- "fmin z10.h, p5/M, z10.h, z0.h\n"
- "fmin z11.h, p5/M, z11.h, z0.h\n"
- "fmin z12.h, p5/M, z12.h, z0.h\n"
- "fmax z8.h, p5/M, z8.h, z1.h\n"
- "fmax z9.h, p5/M, z9.h, z1.h\n"
- "fmax z10.h, p5/M, z10.h, z1.h\n"
- "fmax z11.h, p5/M, z11.h, z1.h\n"
- "fmax z12.h, p5/M, z12.h, z1.h\n"
- "fmin z13.h, p5/M, z13.h, z0.h\n"
- "fmin z14.h, p5/M, z14.h, z0.h\n"
- "fmin z15.h, p5/M, z15.h, z0.h\n"
- "fmin z16.h, p5/M, z16.h, z0.h\n"
- "fmax z13.h, p5/M, z13.h, z1.h\n"
- "fmax z14.h, p5/M, z14.h, z1.h\n"
- "fmax z15.h, p5/M, z15.h, z1.h\n"
- "fmax z16.h, p5/M, z16.h, z1.h\n"
- "fmin z17.h, p5/M, z17.h, z0.h\n"
- "fmin z18.h, p5/M, z18.h, z0.h\n"
- "fmin z19.h, p5/M, z19.h, z0.h\n"
- "fmin z20.h, p5/M, z20.h, z0.h\n"
- "fmax z17.h, p5/M, z17.h, z1.h\n"
- "fmax z18.h, p5/M, z18.h, z1.h\n"
- "fmax z19.h, p5/M, z19.h, z1.h\n"
- "fmax z20.h, p5/M, z20.h, z1.h\n"
- "fmin z21.h, p5/M, z21.h, z0.h\n"
- "fmin z22.h, p5/M, z22.h, z0.h\n"
- "fmin z23.h, p5/M, z23.h, z0.h\n"
- "fmin z24.h, p5/M, z24.h, z0.h\n"
- "fmax z21.h, p5/M, z21.h, z1.h\n"
- "fmax z22.h, p5/M, z22.h, z1.h\n"
- "fmax z23.h, p5/M, z23.h, z1.h\n"
- "fmax z24.h, p5/M, z24.h, z1.h\n"
- "fmin z25.h, p5/M, z25.h, z0.h\n"
- "fmin z26.h, p5/M, z26.h, z0.h\n"
- "fmin z27.h, p5/M, z27.h, z0.h\n"
- "fmin z28.h, p5/M, z28.h, z0.h\n"
- "fmax z25.h, p5/M, z25.h, z1.h\n"
- "fmax z26.h, p5/M, z26.h, z1.h\n"
- "fmax z27.h, p5/M, z27.h, z1.h\n"
- "fmax z28.h, p5/M, z28.h, z1.h\n"
- "fmin z29.h, p5/M, z29.h, z0.h\n"
- "fmin z30.h, p5/M, z30.h, z0.h\n"
- "fmin z31.h, p5/M, z31.h, z0.h\n"
- "fmax z29.h, p5/M, z29.h, z1.h\n"
- "fmax z30.h, p5/M, z30.h, z1.h\n"
- "fmax z31.h, p5/M, z31.h, z1.h\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z0.h }, p5/Z, [x20]\n"
+ "fmin z8.h, p5/M, z8.h, z1.h\n"
+ "fmin z9.h, p5/M, z9.h, z1.h\n"
+ "fmin z10.h, p5/M, z10.h, z1.h\n"
+ "fmin z11.h, p5/M, z11.h, z1.h\n"
+ "fmin z12.h, p5/M, z12.h, z1.h\n"
+ "fmin z13.h, p5/M, z13.h, z1.h\n"
+ "fmin z14.h, p5/M, z14.h, z1.h\n"
+ "fmin z15.h, p5/M, z15.h, z1.h\n"
+ "fmin z16.h, p5/M, z16.h, z1.h\n"
+ "fmin z17.h, p5/M, z17.h, z1.h\n"
+ "fmin z18.h, p5/M, z18.h, z1.h\n"
+ "fmin z19.h, p5/M, z19.h, z1.h\n"
+ "fmin z20.h, p5/M, z20.h, z1.h\n"
+ "fmin z21.h, p5/M, z21.h, z1.h\n"
+ "fmin z22.h, p5/M, z22.h, z1.h\n"
+ "fmin z23.h, p5/M, z23.h, z1.h\n"
+ "fmin z24.h, p5/M, z24.h, z1.h\n"
+ "fmin z25.h, p5/M, z25.h, z1.h\n"
+ "fmin z26.h, p5/M, z26.h, z1.h\n"
+ "fmin z27.h, p5/M, z27.h, z1.h\n"
+ "fmin z28.h, p5/M, z28.h, z1.h\n"
+ "fmin z29.h, p5/M, z29.h, z1.h\n"
+ "fmin z30.h, p5/M, z30.h, z1.h\n"
+ "fmin z31.h, p5/M, z31.h, z1.h\n"
+ "fmax z8.h, p5/M, z8.h, z0.h\n"
+ "fmax z9.h, p5/M, z9.h, z0.h\n"
+ "fmax z10.h, p5/M, z10.h, z0.h\n"
+ "fmax z11.h, p5/M, z11.h, z0.h\n"
+ "fmax z12.h, p5/M, z12.h, z0.h\n"
+ "fmax z13.h, p5/M, z13.h, z0.h\n"
+ "fmax z14.h, p5/M, z14.h, z0.h\n"
+ "fmax z15.h, p5/M, z15.h, z0.h\n"
+ "fmax z16.h, p5/M, z16.h, z0.h\n"
+ "fmax z17.h, p5/M, z17.h, z0.h\n"
+ "fmax z18.h, p5/M, z18.h, z0.h\n"
+ "fmax z19.h, p5/M, z19.h, z0.h\n"
+ "fmax z20.h, p5/M, z20.h, z0.h\n"
+ "fmax z21.h, p5/M, z21.h, z0.h\n"
+ "fmax z22.h, p5/M, z22.h, z0.h\n"
+ "fmax z23.h, p5/M, z23.h, z0.h\n"
+ "fmax z24.h, p5/M, z24.h, z0.h\n"
+ "fmax z25.h, p5/M, z25.h, z0.h\n"
+ "fmax z26.h, p5/M, z26.h, z0.h\n"
+ "fmax z27.h, p5/M, z27.h, z0.h\n"
+ "fmax z28.h, p5/M, z28.h, z0.h\n"
+ "fmax z29.h, p5/M, z29.h, z0.h\n"
+ "fmax z30.h, p5/M, z30.h, z0.h\n"
+ "fmax z31.h, p5/M, z31.h, z0.h\n"
"77:" // Height 6: No activation
- "st1h { z8.h }, p4, [x28]\n"
- "st1h { z9.h }, p3, [x28, #1, MUL VL]\n"
- "st1h { z10.h }, p2, [x28, #2, MUL VL]\n"
- "st1h { z11.h }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1h { z12.h }, p4, [x24]\n"
- "st1h { z13.h }, p3, [x24, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x24, #3, MUL VL]\n"
- "st1h { z16.h }, p4, [x23]\n"
- "st1h { z17.h }, p3, [x23, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x23, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x23, #3, MUL VL]\n"
- "st1h { z20.h }, p4, [x22]\n"
- "st1h { z21.h }, p3, [x22, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
- "st1h { z23.h }, p1, [x22, #3, MUL VL]\n"
- "st1h { z24.h }, p4, [x21]\n"
- "st1h { z25.h }, p3, [x21, #1, MUL VL]\n"
- "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z27.h }, p1, [x21, #3, MUL VL]\n"
- "st1h { z28.h }, p4, [x20]\n"
- "st1h { z29.h }, p3, [x20, #1, MUL VL]\n"
- "st1h { z30.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z31.h }, p1, [x20, #3, MUL VL]\n"
+ "st1h { z8.h }, p4, [x9]\n"
+ "st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z12.h }, p4, [x25]\n"
+ "st1h { z13.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x25, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x24]\n"
+ "st1h { z17.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x23]\n"
+ "st1h { z21.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z24.h }, p4, [x22]\n"
+ "st1h { z25.h }, p3, [x22, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x22, #3, MUL VL]\n"
+ "st1h { z28.h }, p4, [x21]\n"
+ "st1h { z29.h }, p3, [x21, #1, MUL VL]\n"
+ "st1h { z30.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z31.h }, p1, [x21, #3, MUL VL]\n"
"78:" // Height 6: Writeback done
"dech x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 67b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 80f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 79f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"79:" // Update direct input
- "mov x19, #0xc\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0xc\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
index b696e73637..880f9d1a27 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,21 +10,22 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -38,11 +39,13 @@ namespace arm_gemm
{
// Actual kernel implementations
void sve_hybrid_fp32_mla_6x4VL( ARGLIST );
+void sve_hybrid_fp32_mla_6x4VL_a64fx( ARGLIST );
class cls_sve_hybrid_fp32_mla_6x4VL
{
public:
- typedef float operand_type;
+ typedef float lhs_operand_type;
+ typedef float rhs_operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,16 +71,41 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 1> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 6.667 };
+ case CPUModel::A510:
+ return { 5.41 };
+ case CPUModel::V1:
+ return { 15.65 };
+ case CPUModel::A64FX:
+ return { 25.55 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_fp32_mla_6x4VL;
- cls_sve_hybrid_fp32_mla_6x4VL(const CPUInfo *)
+ cls_sve_hybrid_fp32_mla_6x4VL(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_hybrid_fp32_mla_6x4VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
#undef ARGLIST
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
new file mode 100644
index 0000000000..66481f04f9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
@@ -0,0 +1,1365 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_6x4VL_a64fx (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p4.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 61f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 49f\n"
+ "beq 37f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 25f\n"
+ "beq 13f\n"
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "cbz x12, 3f\n"
+ "ld1w { z8.s }, p4/Z, [x12]\n"
+ "ld1w { z9.s }, p4/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 8f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add x26, x26, #0x4\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
+ "addvl x10, x10, #4\n"
+ "bne 6b\n"
+ "tbz %x[flags], #1, 11f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z17.s\n"
+ "fmin z9.s, p4/M, z9.s, z17.s\n"
+ "fmin z10.s, p4/M, z10.s, z17.s\n"
+ "fmin z11.s, p4/M, z11.s, z17.s\n"
+ "fmax z8.s, p4/M, z8.s, z16.s\n"
+ "fmax z9.s, p4/M, z9.s, z16.s\n"
+ "fmax z10.s, p4/M, z10.s, z16.s\n"
+ "fmax z11.s, p4/M, z11.s, z16.s\n"
+ "11:" // Height 1: No activation
+ "st1w { z8.s }, p3, [x9]\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "12:" // Height 1: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 2b\n"
+ "b 74f\n"
+ "13:" // Height 2
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "14:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "cbz x12, 15f\n"
+ "ld1w { z8.s }, p4/Z, [x12]\n"
+ "ld1w { z9.s }, p4/Z, [x12, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x12, x12, #4\n"
+ "b 17f\n"
+ "15:" // Height 2: no bias
+ "tbz %x[flags], #0, 16f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x20]\n"
+ "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 17f\n"
+ "16:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 20f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "20:" // Height 2: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 22f\n"
+ "21:" // Height 2: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x26, x26, #0x4\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x1\n"
+ "add x25, x25, #0x4\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z14.s, p4/M, z17.s, z1.s\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
+ "fmla z15.s, p4/M, z16.s, z1.s\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 21b\n"
+ "22:" // Height 2: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "fmla z10.s, p4/M, z17.s, z0.s\n"
+ "fmla z14.s, p4/M, z17.s, z1.s\n"
+ "addvl x10, x10, #4\n"
+ "fmla z11.s, p4/M, z16.s, z0.s\n"
+ "fmla z15.s, p4/M, z16.s, z1.s\n"
+ "bne 18b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "tbz %x[flags], #1, 23f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z17.s\n"
+ "fmin z9.s, p4/M, z9.s, z17.s\n"
+ "fmin z10.s, p4/M, z10.s, z17.s\n"
+ "fmin z11.s, p4/M, z11.s, z17.s\n"
+ "fmin z12.s, p4/M, z12.s, z17.s\n"
+ "fmin z13.s, p4/M, z13.s, z17.s\n"
+ "fmin z14.s, p4/M, z14.s, z17.s\n"
+ "fmin z15.s, p4/M, z15.s, z17.s\n"
+ "fmax z8.s, p4/M, z8.s, z16.s\n"
+ "fmax z9.s, p4/M, z9.s, z16.s\n"
+ "fmax z10.s, p4/M, z10.s, z16.s\n"
+ "fmax z11.s, p4/M, z11.s, z16.s\n"
+ "fmax z12.s, p4/M, z12.s, z16.s\n"
+ "fmax z13.s, p4/M, z13.s, z16.s\n"
+ "fmax z14.s, p4/M, z14.s, z16.s\n"
+ "fmax z15.s, p4/M, z15.s, z16.s\n"
+ "23:" // Height 2: No activation
+ "st1w { z8.s }, p3, [x9]\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "24:" // Height 2: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 14b\n"
+ "b 74f\n"
+ "25:" // Height 3
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "26:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "cbz x12, 27f\n"
+ "ld1w { z8.s }, p4/Z, [x12]\n"
+ "ld1w { z9.s }, p4/Z, [x12, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 29f\n"
+ "27:" // Height 3: no bias
+ "tbz %x[flags], #0, 28f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x21]\n"
+ "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x20]\n"
+ "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 29f\n"
+ "28:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "29:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "30:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 32f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "b 32f\n"
+ "31:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "32:" // Height 3: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 34f\n"
+ "33:" // Height 3: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z21.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x4\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "ld1w { z20.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add x24, x24, #0x4\n"
+ "fmla z10.s, p4/M, z21.s, z0.s\n"
+ "fmla z14.s, p4/M, z21.s, z1.s\n"
+ "fmla z18.s, p4/M, z21.s, z2.s\n"
+ "fmla z11.s, p4/M, z20.s, z0.s\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "fmla z15.s, p4/M, z20.s, z1.s\n"
+ "fmla z19.s, p4/M, z20.s, z2.s\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 33b\n"
+ "34:" // Height 3: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z21.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "ld1w { z20.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.s, p4/M, z21.s, z0.s\n"
+ "fmla z14.s, p4/M, z21.s, z1.s\n"
+ "fmla z18.s, p4/M, z21.s, z2.s\n"
+ "fmla z11.s, p4/M, z20.s, z0.s\n"
+ "fmla z15.s, p4/M, z20.s, z1.s\n"
+ "fmla z19.s, p4/M, z20.s, z2.s\n"
+ "bne 30b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "tbz %x[flags], #1, 35f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z20.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z21.s\n"
+ "fmin z9.s, p4/M, z9.s, z21.s\n"
+ "fmin z10.s, p4/M, z10.s, z21.s\n"
+ "fmin z11.s, p4/M, z11.s, z21.s\n"
+ "fmin z12.s, p4/M, z12.s, z21.s\n"
+ "fmin z13.s, p4/M, z13.s, z21.s\n"
+ "fmin z14.s, p4/M, z14.s, z21.s\n"
+ "fmin z15.s, p4/M, z15.s, z21.s\n"
+ "fmin z16.s, p4/M, z16.s, z21.s\n"
+ "fmin z17.s, p4/M, z17.s, z21.s\n"
+ "fmin z18.s, p4/M, z18.s, z21.s\n"
+ "fmin z19.s, p4/M, z19.s, z21.s\n"
+ "fmax z8.s, p4/M, z8.s, z20.s\n"
+ "fmax z9.s, p4/M, z9.s, z20.s\n"
+ "fmax z10.s, p4/M, z10.s, z20.s\n"
+ "fmax z11.s, p4/M, z11.s, z20.s\n"
+ "fmax z12.s, p4/M, z12.s, z20.s\n"
+ "fmax z13.s, p4/M, z13.s, z20.s\n"
+ "fmax z14.s, p4/M, z14.s, z20.s\n"
+ "fmax z15.s, p4/M, z15.s, z20.s\n"
+ "fmax z16.s, p4/M, z16.s, z20.s\n"
+ "fmax z17.s, p4/M, z17.s, z20.s\n"
+ "fmax z18.s, p4/M, z18.s, z20.s\n"
+ "fmax z19.s, p4/M, z19.s, z20.s\n"
+ "35:" // Height 3: No activation
+ "st1w { z8.s }, p3, [x9]\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "36:" // Height 3: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 26b\n"
+ "b 74f\n"
+ "37:" // Height 4
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "38:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "cbz x12, 39f\n"
+ "ld1w { z8.s }, p4/Z, [x12]\n"
+ "ld1w { z9.s }, p4/Z, [x12, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 41f\n"
+ "39:" // Height 4: no bias
+ "tbz %x[flags], #0, 40f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x22]\n"
+ "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x21]\n"
+ "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x20]\n"
+ "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 41f\n"
+ "40:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "41:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "42:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 43f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 44f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "b 44f\n"
+ "43:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "44:" // Height 4: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 46f\n"
+ "45:" // Height 4: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "ld1w { z25.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x4\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "ld1w { z24.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.s, p4/M, z25.s, z0.s\n"
+ "fmla z14.s, p4/M, z25.s, z1.s\n"
+ "fmla z18.s, p4/M, z25.s, z2.s\n"
+ "fmla z22.s, p4/M, z25.s, z3.s\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "fmla z11.s, p4/M, z24.s, z0.s\n"
+ "fmla z15.s, p4/M, z24.s, z1.s\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "fmla z19.s, p4/M, z24.s, z2.s\n"
+ "fmla z23.s, p4/M, z24.s, z3.s\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 45b\n"
+ "46:" // Height 4: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "ld1w { z25.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "ld1w { z24.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.s, p4/M, z25.s, z0.s\n"
+ "fmla z14.s, p4/M, z25.s, z1.s\n"
+ "fmla z18.s, p4/M, z25.s, z2.s\n"
+ "fmla z22.s, p4/M, z25.s, z3.s\n"
+ "fmla z11.s, p4/M, z24.s, z0.s\n"
+ "fmla z15.s, p4/M, z24.s, z1.s\n"
+ "fmla z19.s, p4/M, z24.s, z2.s\n"
+ "fmla z23.s, p4/M, z24.s, z3.s\n"
+ "bne 42b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "tbz %x[flags], #1, 47f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z24.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z25.s\n"
+ "fmin z9.s, p4/M, z9.s, z25.s\n"
+ "fmin z10.s, p4/M, z10.s, z25.s\n"
+ "fmin z11.s, p4/M, z11.s, z25.s\n"
+ "fmin z12.s, p4/M, z12.s, z25.s\n"
+ "fmin z13.s, p4/M, z13.s, z25.s\n"
+ "fmin z14.s, p4/M, z14.s, z25.s\n"
+ "fmin z15.s, p4/M, z15.s, z25.s\n"
+ "fmin z16.s, p4/M, z16.s, z25.s\n"
+ "fmin z17.s, p4/M, z17.s, z25.s\n"
+ "fmin z18.s, p4/M, z18.s, z25.s\n"
+ "fmin z19.s, p4/M, z19.s, z25.s\n"
+ "fmin z20.s, p4/M, z20.s, z25.s\n"
+ "fmin z21.s, p4/M, z21.s, z25.s\n"
+ "fmin z22.s, p4/M, z22.s, z25.s\n"
+ "fmin z23.s, p4/M, z23.s, z25.s\n"
+ "fmax z8.s, p4/M, z8.s, z24.s\n"
+ "fmax z9.s, p4/M, z9.s, z24.s\n"
+ "fmax z10.s, p4/M, z10.s, z24.s\n"
+ "fmax z11.s, p4/M, z11.s, z24.s\n"
+ "fmax z12.s, p4/M, z12.s, z24.s\n"
+ "fmax z13.s, p4/M, z13.s, z24.s\n"
+ "fmax z14.s, p4/M, z14.s, z24.s\n"
+ "fmax z15.s, p4/M, z15.s, z24.s\n"
+ "fmax z16.s, p4/M, z16.s, z24.s\n"
+ "fmax z17.s, p4/M, z17.s, z24.s\n"
+ "fmax z18.s, p4/M, z18.s, z24.s\n"
+ "fmax z19.s, p4/M, z19.s, z24.s\n"
+ "fmax z20.s, p4/M, z20.s, z24.s\n"
+ "fmax z21.s, p4/M, z21.s, z24.s\n"
+ "fmax z22.s, p4/M, z22.s, z24.s\n"
+ "fmax z23.s, p4/M, z23.s, z24.s\n"
+ "47:" // Height 4: No activation
+ "st1w { z8.s }, p3, [x9]\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x23]\n"
+ "st1w { z21.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x23, #3, MUL VL]\n"
+ "48:" // Height 4: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 38b\n"
+ "b 74f\n"
+ "49:" // Height 5
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "50:" // Height 5: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "cbz x12, 51f\n"
+ "ld1w { z8.s }, p4/Z, [x12]\n"
+ "ld1w { z9.s }, p4/Z, [x12, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 53f\n"
+ "51:" // Height 5: no bias
+ "tbz %x[flags], #0, 52f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 53f\n"
+ "52:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "53:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "54:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 55f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 56f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "b 56f\n"
+ "55:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "56:" // Height 5: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 58f\n"
+ "57:" // Height 5: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "add x25, x25, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "fmla z24.s, p4/M, z6.s, z4.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z29.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x23, x23, #0x4\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "add x22, x22, #0x4\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "fmla z25.s, p4/M, z7.s, z4.s\n"
+ "ld1w { z28.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.s, p4/M, z29.s, z0.s\n"
+ "fmla z14.s, p4/M, z29.s, z1.s\n"
+ "fmla z18.s, p4/M, z29.s, z2.s\n"
+ "fmla z22.s, p4/M, z29.s, z3.s\n"
+ "fmla z26.s, p4/M, z29.s, z4.s\n"
+ "fmla z11.s, p4/M, z28.s, z0.s\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "fmla z15.s, p4/M, z28.s, z1.s\n"
+ "fmla z19.s, p4/M, z28.s, z2.s\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "fmla z23.s, p4/M, z28.s, z3.s\n"
+ "fmla z27.s, p4/M, z28.s, z4.s\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 57b\n"
+ "58:" // Height 5: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "cmp x28, x20\n"
+ "fmla z24.s, p4/M, z6.s, z4.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z29.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "fmla z25.s, p4/M, z7.s, z4.s\n"
+ "ld1w { z28.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.s, p4/M, z29.s, z0.s\n"
+ "fmla z14.s, p4/M, z29.s, z1.s\n"
+ "fmla z18.s, p4/M, z29.s, z2.s\n"
+ "fmla z22.s, p4/M, z29.s, z3.s\n"
+ "fmla z26.s, p4/M, z29.s, z4.s\n"
+ "fmla z11.s, p4/M, z28.s, z0.s\n"
+ "fmla z15.s, p4/M, z28.s, z1.s\n"
+ "fmla z19.s, p4/M, z28.s, z2.s\n"
+ "fmla z23.s, p4/M, z28.s, z3.s\n"
+ "fmla z27.s, p4/M, z28.s, z4.s\n"
+ "bne 54b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "tbz %x[flags], #1, 59f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z29.s }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z28.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z29.s\n"
+ "fmin z9.s, p4/M, z9.s, z29.s\n"
+ "fmin z10.s, p4/M, z10.s, z29.s\n"
+ "fmin z11.s, p4/M, z11.s, z29.s\n"
+ "fmin z12.s, p4/M, z12.s, z29.s\n"
+ "fmin z13.s, p4/M, z13.s, z29.s\n"
+ "fmin z14.s, p4/M, z14.s, z29.s\n"
+ "fmin z15.s, p4/M, z15.s, z29.s\n"
+ "fmin z16.s, p4/M, z16.s, z29.s\n"
+ "fmin z17.s, p4/M, z17.s, z29.s\n"
+ "fmin z18.s, p4/M, z18.s, z29.s\n"
+ "fmin z19.s, p4/M, z19.s, z29.s\n"
+ "fmin z20.s, p4/M, z20.s, z29.s\n"
+ "fmin z21.s, p4/M, z21.s, z29.s\n"
+ "fmin z22.s, p4/M, z22.s, z29.s\n"
+ "fmin z23.s, p4/M, z23.s, z29.s\n"
+ "fmin z24.s, p4/M, z24.s, z29.s\n"
+ "fmin z25.s, p4/M, z25.s, z29.s\n"
+ "fmin z26.s, p4/M, z26.s, z29.s\n"
+ "fmin z27.s, p4/M, z27.s, z29.s\n"
+ "fmax z8.s, p4/M, z8.s, z28.s\n"
+ "fmax z9.s, p4/M, z9.s, z28.s\n"
+ "fmax z10.s, p4/M, z10.s, z28.s\n"
+ "fmax z11.s, p4/M, z11.s, z28.s\n"
+ "fmax z12.s, p4/M, z12.s, z28.s\n"
+ "fmax z13.s, p4/M, z13.s, z28.s\n"
+ "fmax z14.s, p4/M, z14.s, z28.s\n"
+ "fmax z15.s, p4/M, z15.s, z28.s\n"
+ "fmax z16.s, p4/M, z16.s, z28.s\n"
+ "fmax z17.s, p4/M, z17.s, z28.s\n"
+ "fmax z18.s, p4/M, z18.s, z28.s\n"
+ "fmax z19.s, p4/M, z19.s, z28.s\n"
+ "fmax z20.s, p4/M, z20.s, z28.s\n"
+ "fmax z21.s, p4/M, z21.s, z28.s\n"
+ "fmax z22.s, p4/M, z22.s, z28.s\n"
+ "fmax z23.s, p4/M, z23.s, z28.s\n"
+ "fmax z24.s, p4/M, z24.s, z28.s\n"
+ "fmax z25.s, p4/M, z25.s, z28.s\n"
+ "fmax z26.s, p4/M, z26.s, z28.s\n"
+ "fmax z27.s, p4/M, z27.s, z28.s\n"
+ "59:" // Height 5: No activation
+ "st1w { z8.s }, p3, [x9]\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x23]\n"
+ "st1w { z21.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x22]\n"
+ "st1w { z25.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x22, #3, MUL VL]\n"
+ "60:" // Height 5: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 50b\n"
+ "b 74f\n"
+ "61:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "62:" // Height 6: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "cbz x12, 63f\n"
+ "ld1w { z8.s }, p4/Z, [x12]\n"
+ "ld1w { z9.s }, p4/Z, [x12, #1, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z10.s }, p4/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x12, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 65f\n"
+ "63:" // Height 6: no bias
+ "tbz %x[flags], #0, 64f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x24]\n"
+ "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x22]\n"
+ "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x21]\n"
+ "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p3/Z, [x20]\n"
+ "ld1w { z29.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 65f\n"
+ "64:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "65:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "66:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 68f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
+ "b 68f\n"
+ "67:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
+ "68:" // Height 6: input setup done
+ "subs x27, x27, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1rw { z5.s }, p4/Z, [x21]\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 70f\n"
+ "69:" // Height 6: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "add x25, x25, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "fmla z24.s, p4/M, z6.s, z4.s\n"
+ "fmla z28.s, p4/M, z6.s, z5.s\n"
+ "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x23, x23, #0x4\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "add x22, x22, #0x4\n"
+ "add x21, x21, #0x4\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "fmla z25.s, p4/M, z7.s, z4.s\n"
+ "fmla z29.s, p4/M, z7.s, z5.s\n"
+ "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z18.s, p4/M, z6.s, z2.s\n"
+ "fmla z22.s, p4/M, z6.s, z3.s\n"
+ "fmla z26.s, p4/M, z6.s, z4.s\n"
+ "fmla z30.s, p4/M, z6.s, z5.s\n"
+ "ld1w { z6.s }, p4/Z, [x10]\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "fmla z27.s, p4/M, z7.s, z4.s\n"
+ "fmla z31.s, p4/M, z7.s, z5.s\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1rw { z5.s }, p4/Z, [x21]\n"
+ "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 69b\n"
+ "70:" // Height 6: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x28, x28, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "cmp x28, x20\n"
+ "fmla z24.s, p4/M, z6.s, z4.s\n"
+ "fmla z28.s, p4/M, z6.s, z5.s\n"
+ "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "fmla z25.s, p4/M, z7.s, z4.s\n"
+ "fmla z29.s, p4/M, z7.s, z5.s\n"
+ "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z18.s, p4/M, z6.s, z2.s\n"
+ "fmla z22.s, p4/M, z6.s, z3.s\n"
+ "fmla z26.s, p4/M, z6.s, z4.s\n"
+ "fmla z30.s, p4/M, z6.s, z5.s\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "fmla z27.s, p4/M, z7.s, z4.s\n"
+ "fmla z31.s, p4/M, z7.s, z5.s\n"
+ "bne 66b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "tbz %x[flags], #1, 71f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p4/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p4/Z, [x20]\n"
+ "fmin z8.s, p4/M, z8.s, z1.s\n"
+ "fmin z9.s, p4/M, z9.s, z1.s\n"
+ "fmin z10.s, p4/M, z10.s, z1.s\n"
+ "fmin z11.s, p4/M, z11.s, z1.s\n"
+ "fmin z12.s, p4/M, z12.s, z1.s\n"
+ "fmin z13.s, p4/M, z13.s, z1.s\n"
+ "fmin z14.s, p4/M, z14.s, z1.s\n"
+ "fmin z15.s, p4/M, z15.s, z1.s\n"
+ "fmin z16.s, p4/M, z16.s, z1.s\n"
+ "fmin z17.s, p4/M, z17.s, z1.s\n"
+ "fmin z18.s, p4/M, z18.s, z1.s\n"
+ "fmin z19.s, p4/M, z19.s, z1.s\n"
+ "fmin z20.s, p4/M, z20.s, z1.s\n"
+ "fmin z21.s, p4/M, z21.s, z1.s\n"
+ "fmin z22.s, p4/M, z22.s, z1.s\n"
+ "fmin z23.s, p4/M, z23.s, z1.s\n"
+ "fmin z24.s, p4/M, z24.s, z1.s\n"
+ "fmin z25.s, p4/M, z25.s, z1.s\n"
+ "fmin z26.s, p4/M, z26.s, z1.s\n"
+ "fmin z27.s, p4/M, z27.s, z1.s\n"
+ "fmin z28.s, p4/M, z28.s, z1.s\n"
+ "fmin z29.s, p4/M, z29.s, z1.s\n"
+ "fmin z30.s, p4/M, z30.s, z1.s\n"
+ "fmin z31.s, p4/M, z31.s, z1.s\n"
+ "fmax z8.s, p4/M, z8.s, z0.s\n"
+ "fmax z9.s, p4/M, z9.s, z0.s\n"
+ "fmax z10.s, p4/M, z10.s, z0.s\n"
+ "fmax z11.s, p4/M, z11.s, z0.s\n"
+ "fmax z12.s, p4/M, z12.s, z0.s\n"
+ "fmax z13.s, p4/M, z13.s, z0.s\n"
+ "fmax z14.s, p4/M, z14.s, z0.s\n"
+ "fmax z15.s, p4/M, z15.s, z0.s\n"
+ "fmax z16.s, p4/M, z16.s, z0.s\n"
+ "fmax z17.s, p4/M, z17.s, z0.s\n"
+ "fmax z18.s, p4/M, z18.s, z0.s\n"
+ "fmax z19.s, p4/M, z19.s, z0.s\n"
+ "fmax z20.s, p4/M, z20.s, z0.s\n"
+ "fmax z21.s, p4/M, z21.s, z0.s\n"
+ "fmax z22.s, p4/M, z22.s, z0.s\n"
+ "fmax z23.s, p4/M, z23.s, z0.s\n"
+ "fmax z24.s, p4/M, z24.s, z0.s\n"
+ "fmax z25.s, p4/M, z25.s, z0.s\n"
+ "fmax z26.s, p4/M, z26.s, z0.s\n"
+ "fmax z27.s, p4/M, z27.s, z0.s\n"
+ "fmax z28.s, p4/M, z28.s, z0.s\n"
+ "fmax z29.s, p4/M, z29.s, z0.s\n"
+ "fmax z30.s, p4/M, z30.s, z0.s\n"
+ "fmax z31.s, p4/M, z31.s, z0.s\n"
+ "71:" // Height 6: No activation
+ "st1w { z8.s }, p3, [x9]\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x25]\n"
+ "st1w { z13.s }, p2, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x24]\n"
+ "st1w { z17.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x23]\n"
+ "st1w { z21.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x22]\n"
+ "st1w { z25.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z28.s }, p3, [x21]\n"
+ "st1w { z29.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p0, [x21, #3, MUL VL]\n"
+ "72:" // Height 6: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 62b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 74f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 73f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "73:" // Update direct input
+ "mov x20, #0x18\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "74:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
index dee9a107ff..e1581f2026 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,18 +10,18 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -102,32 +102,32 @@ void sve_hybrid_fp32_mla_6x4VL (
"cmp %x[M], #0x2\n"
"bgt 27f\n"
"beq 14f\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x9, %x[bias]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x11\n"
- "incw x19\n"
- "whilelt p3.s, x19, x11\n"
- "incw x19\n"
- "whilelt p2.s, x19, x11\n"
- "incw x19\n"
- "whilelt p1.s, x19, x11\n"
- "cbz x9, 3f\n"
- "ld1w { z8.s }, p5/Z, [x9]\n"
- "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 3f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
"b 5f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 4f\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
"b 5f\n"
"4:" // Height 1: no accumulate
"mov z8.b, #0x0\n"
@@ -135,178 +135,175 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z10.b, #0x0\n"
"mov z11.b, #0x0\n"
"5:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 8f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 8f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
"b 8f\n"
"7:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x26, %x[input_ptr]\n"
"8:" // Height 1: input setup done
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "whilelt p0.s, XZR, x26\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x25]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "cmp x26, #0x4\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1w { z16.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z10.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z11.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z8.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z9.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z10.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z11.s, z16.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[2]\n"
+ "fmla z9.s, z16.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.s, z17.s, z0.s[2]\n"
+ "fmla z11.s, z16.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[3]\n"
+ "fmla z9.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "fmla z10.s, z17.s, z0.s[3]\n"
+ "fmla z11.s, z16.s, z0.s[3]\n"
+ "add x26, x26, #0x10\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "whilelt p0.s, XZR, x26\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x25]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1w { z16.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z9.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z17.s, z0.s[0]\n"
+ "fmla z11.s, z16.s, z0.s[0]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
"ble 11f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[1]\n"
+ "fmla z9.s, z16.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.s, z17.s, z0.s[1]\n"
+ "fmla z11.s, z16.s, z0.s[1]\n"
"addvl x10, x10, #4\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
"ble 11f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[2]\n"
+ "fmla z9.s, z16.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.s, z17.s, z0.s[2]\n"
+ "fmla z11.s, z16.s, z0.s[2]\n"
"addvl x10, x10, #4\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
"ble 11f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[3]\n"
+ "fmla z9.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z17.s, z0.s[3]\n"
+ "fmla z11.s, z16.s, z0.s[3]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
"11:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z0.s }, p5/Z, [x19]\n"
- "fmin z8.s, p5/M, z8.s, z0.s\n"
- "fmin z9.s, p5/M, z9.s, z0.s\n"
- "fmin z10.s, p5/M, z10.s, z0.s\n"
- "fmin z11.s, p5/M, z11.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z1.s\n"
- "fmax z9.s, p5/M, z9.s, z1.s\n"
- "fmax z10.s, p5/M, z10.s, z1.s\n"
- "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
"12:" // Height 1: No activation
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"13:" // Height 1: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 2b\n"
"b 80f\n"
"14:" // Height 2
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"15:" // Height 2: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x11\n"
- "incw x19\n"
- "whilelt p3.s, x19, x11\n"
- "incw x19\n"
- "whilelt p2.s, x19, x11\n"
- "incw x19\n"
- "whilelt p1.s, x19, x11\n"
- "cbz x9, 16f\n"
- "ld1w { z8.s }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 16f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
"mov z13.d, z9.d\n"
- "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "addvl x12, x12, #4\n"
"b 18f\n"
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 18f\n"
"17:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
@@ -318,242 +315,236 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z14.b, #0x0\n"
"mov z15.b, #0x0\n"
"18:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x27, 21f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 21f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
"b 21f\n"
"20:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #2\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
"21:" // Height 2: input setup done
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "whilelt p0.s, XZR, x26\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x4\n"
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
"ld1rqw { z0.s }, p0/Z, [x25]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "sub x27, x27, #0x4\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z1.s[0]\n"
+ "fmla z12.s, z17.s, z0.s[0]\n"
+ "fmla z9.s, z16.s, z1.s[0]\n"
+ "fmla z13.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z17.s, z1.s[0]\n"
+ "fmla z14.s, z17.s, z0.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "cmp x27, #0x4\n"
+ "fmla z11.s, z16.s, z1.s[0]\n"
+ "fmla z15.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #5, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z8.s, z17.s, z1.s[1]\n"
+ "fmla z12.s, z17.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #6, MUL VL]\n"
"add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x4\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z9.s, z16.s, z1.s[1]\n"
+ "fmla z13.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z10.s, z17.s, z1.s[1]\n"
+ "fmla z14.s, z17.s, z0.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.s, z16.s, z1.s[1]\n"
+ "fmla z15.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.s, z17.s, z1.s[2]\n"
+ "fmla z12.s, z17.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.s, z16.s, z1.s[2]\n"
+ "fmla z13.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.s, z17.s, z1.s[2]\n"
+ "fmla z14.s, z17.s, z0.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.s, z16.s, z1.s[2]\n"
+ "fmla z15.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.s, z17.s, z1.s[3]\n"
+ "fmla z12.s, z17.s, z0.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.s, z16.s, z1.s[3]\n"
+ "fmla z13.s, z16.s, z0.s[3]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.s, z17.s, z1.s[3]\n"
+ "fmla z14.s, z17.s, z0.s[3]\n"
+ "fmla z11.s, z16.s, z1.s[3]\n"
+ "fmla z15.s, z16.s, z0.s[3]\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "whilelt p0.s, XZR, x26\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x25]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[0]\n"
+ "fmla z12.s, z17.s, z1.s[0]\n"
+ "fmla z9.s, z16.s, z0.s[0]\n"
+ "fmla z13.s, z16.s, z1.s[0]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z17.s, z0.s[0]\n"
+ "fmla z14.s, z17.s, z1.s[0]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z11.s, z16.s, z0.s[0]\n"
+ "fmla z15.s, z16.s, z1.s[0]\n"
"ble 24f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[1]\n"
+ "fmla z12.s, z17.s, z1.s[1]\n"
+ "fmla z9.s, z16.s, z0.s[1]\n"
+ "fmla z13.s, z16.s, z1.s[1]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.s, z17.s, z0.s[1]\n"
+ "fmla z14.s, z17.s, z1.s[1]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z11.s, z16.s, z0.s[1]\n"
+ "fmla z15.s, z16.s, z1.s[1]\n"
"ble 24f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[2]\n"
+ "fmla z12.s, z17.s, z1.s[2]\n"
+ "fmla z9.s, z16.s, z0.s[2]\n"
+ "fmla z13.s, z16.s, z1.s[2]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z10.s, z17.s, z0.s[2]\n"
+ "fmla z14.s, z17.s, z1.s[2]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z11.s, z16.s, z0.s[2]\n"
+ "fmla z15.s, z16.s, z1.s[2]\n"
"ble 24f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z17.s }, p5/Z, [x10]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z17.s, z0.s[3]\n"
+ "fmla z12.s, z17.s, z1.s[3]\n"
+ "fmla z9.s, z16.s, z0.s[3]\n"
+ "fmla z13.s, z16.s, z1.s[3]\n"
+ "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z17.s, z0.s[3]\n"
+ "fmla z14.s, z17.s, z1.s[3]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z11.s, z16.s, z0.s[3]\n"
+ "fmla z15.s, z16.s, z1.s[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 19b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
"tbz %x[flags], #1, 25f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z0.s }, p5/Z, [x19]\n"
- "fmin z8.s, p5/M, z8.s, z0.s\n"
- "fmin z9.s, p5/M, z9.s, z0.s\n"
- "fmin z10.s, p5/M, z10.s, z0.s\n"
- "fmin z11.s, p5/M, z11.s, z0.s\n"
- "fmin z12.s, p5/M, z12.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z1.s\n"
- "fmax z9.s, p5/M, z9.s, z1.s\n"
- "fmax z10.s, p5/M, z10.s, z1.s\n"
- "fmax z11.s, p5/M, z11.s, z1.s\n"
- "fmax z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z0.s\n"
- "fmin z14.s, p5/M, z14.s, z0.s\n"
- "fmin z15.s, p5/M, z15.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z1.s\n"
- "fmax z14.s, p5/M, z14.s, z1.s\n"
- "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmin z12.s, p5/M, z12.s, z17.s\n"
+ "fmin z13.s, p5/M, z13.s, z17.s\n"
+ "fmin z14.s, p5/M, z14.s, z17.s\n"
+ "fmin z15.s, p5/M, z15.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
+ "fmax z12.s, p5/M, z12.s, z16.s\n"
+ "fmax z13.s, p5/M, z13.s, z16.s\n"
+ "fmax z14.s, p5/M, z14.s, z16.s\n"
+ "fmax z15.s, p5/M, z15.s, z16.s\n"
"25:" // Height 2: No activation
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
"26:" // Height 2: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 15b\n"
"b 80f\n"
"27:" // Height 3
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"28:" // Height 3: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x11\n"
- "incw x19\n"
- "whilelt p3.s, x19, x11\n"
- "incw x19\n"
- "whilelt p2.s, x19, x11\n"
- "incw x19\n"
- "whilelt p1.s, x19, x11\n"
- "cbz x9, 29f\n"
- "ld1w { z8.s }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 29f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
- "mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
"mov z13.d, z9.d\n"
- "addvl x9, x9, #4\n"
- "mov z17.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
"b 31f\n"
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x24, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23]\n"
- "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20]\n"
+ "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 31f\n"
"30:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
@@ -569,305 +560,296 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z18.b, #0x0\n"
"mov z19.b, #0x0\n"
"31:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "cbnz x27, 34f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 34f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
"b 34f\n"
"33:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"34:" // Height 3: input setup done
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "whilelt p0.s, XZR, x26\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x25]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x24]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z21.s, z2.s[0]\n"
+ "fmla z12.s, z21.s, z1.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.s, z21.s, z0.s[0]\n"
+ "fmla z9.s, z20.s, z2.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[0]\n"
+ "fmla z17.s, z20.s, z0.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "cmp x27, #0x4\n"
+ "fmla z10.s, z21.s, z2.s[0]\n"
+ "fmla z14.s, z21.s, z1.s[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "fmla z18.s, z21.s, z0.s[0]\n"
+ "fmla z11.s, z20.s, z2.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x4\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z15.s, z20.s, z1.s[0]\n"
+ "fmla z19.s, z20.s, z0.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.s, z21.s, z2.s[1]\n"
+ "fmla z12.s, z21.s, z1.s[1]\n"
+ "fmla z16.s, z21.s, z0.s[1]\n"
+ "fmla z9.s, z20.s, z2.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[1]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z10.s, z21.s, z2.s[1]\n"
+ "fmla z14.s, z21.s, z1.s[1]\n"
+ "fmla z18.s, z21.s, z0.s[1]\n"
+ "fmla z11.s, z20.s, z2.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.s, z20.s, z1.s[1]\n"
+ "fmla z19.s, z20.s, z0.s[1]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.s, z21.s, z2.s[2]\n"
+ "fmla z12.s, z21.s, z1.s[2]\n"
+ "fmla z16.s, z21.s, z0.s[2]\n"
+ "fmla z9.s, z20.s, z2.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[2]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.s, z21.s, z2.s[2]\n"
+ "fmla z14.s, z21.s, z1.s[2]\n"
+ "fmla z18.s, z21.s, z0.s[2]\n"
+ "fmla z11.s, z20.s, z2.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z15.s, z20.s, z1.s[2]\n"
+ "fmla z19.s, z20.s, z0.s[2]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.s, z21.s, z2.s[3]\n"
+ "fmla z12.s, z21.s, z1.s[3]\n"
+ "fmla z16.s, z21.s, z0.s[3]\n"
+ "fmla z9.s, z20.s, z2.s[3]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[3]\n"
+ "fmla z17.s, z20.s, z0.s[3]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.s, z21.s, z2.s[3]\n"
+ "fmla z14.s, z21.s, z1.s[3]\n"
+ "fmla z18.s, z21.s, z0.s[3]\n"
+ "fmla z11.s, z20.s, z2.s[3]\n"
+ "fmla z15.s, z20.s, z1.s[3]\n"
+ "fmla z19.s, z20.s, z0.s[3]\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "whilelt p0.s, XZR, x26\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x25]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "add x23, x23, #0x10\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z21.s, z0.s[0]\n"
+ "fmla z12.s, z21.s, z1.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.s, z21.s, z2.s[0]\n"
+ "fmla z9.s, z20.s, z0.s[0]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[0]\n"
+ "fmla z17.s, z20.s, z2.s[0]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z10.s, z21.s, z0.s[0]\n"
+ "fmla z14.s, z21.s, z1.s[0]\n"
+ "fmla z18.s, z21.s, z2.s[0]\n"
+ "fmla z11.s, z20.s, z0.s[0]\n"
+ "fmla z15.s, z20.s, z1.s[0]\n"
+ "fmla z19.s, z20.s, z2.s[0]\n"
"ble 37f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z21.s, z0.s[1]\n"
+ "fmla z12.s, z21.s, z1.s[1]\n"
+ "fmla z16.s, z21.s, z2.s[1]\n"
+ "fmla z9.s, z20.s, z0.s[1]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.s, z20.s, z1.s[1]\n"
+ "fmla z17.s, z20.s, z2.s[1]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z10.s, z21.s, z0.s[1]\n"
+ "fmla z14.s, z21.s, z1.s[1]\n"
+ "fmla z18.s, z21.s, z2.s[1]\n"
+ "fmla z11.s, z20.s, z0.s[1]\n"
+ "fmla z15.s, z20.s, z1.s[1]\n"
+ "fmla z19.s, z20.s, z2.s[1]\n"
"ble 37f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z21.s, z0.s[2]\n"
+ "fmla z12.s, z21.s, z1.s[2]\n"
+ "fmla z16.s, z21.s, z2.s[2]\n"
+ "fmla z9.s, z20.s, z0.s[2]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z13.s, z20.s, z1.s[2]\n"
+ "fmla z17.s, z20.s, z2.s[2]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z10.s, z21.s, z0.s[2]\n"
+ "fmla z14.s, z21.s, z1.s[2]\n"
+ "fmla z18.s, z21.s, z2.s[2]\n"
+ "fmla z11.s, z20.s, z0.s[2]\n"
+ "fmla z15.s, z20.s, z1.s[2]\n"
+ "fmla z19.s, z20.s, z2.s[2]\n"
"ble 37f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z21.s }, p5/Z, [x10]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z21.s, z0.s[3]\n"
+ "fmla z12.s, z21.s, z1.s[3]\n"
+ "fmla z16.s, z21.s, z2.s[3]\n"
+ "fmla z9.s, z20.s, z0.s[3]\n"
+ "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z20.s, z1.s[3]\n"
+ "fmla z17.s, z20.s, z2.s[3]\n"
+ "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z10.s, z21.s, z0.s[3]\n"
+ "fmla z14.s, z21.s, z1.s[3]\n"
+ "fmla z18.s, z21.s, z2.s[3]\n"
+ "fmla z11.s, z20.s, z0.s[3]\n"
+ "fmla z15.s, z20.s, z1.s[3]\n"
+ "fmla z19.s, z20.s, z2.s[3]\n"
"37:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 32b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 38f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z0.s }, p5/Z, [x19]\n"
- "fmin z8.s, p5/M, z8.s, z0.s\n"
- "fmin z9.s, p5/M, z9.s, z0.s\n"
- "fmin z10.s, p5/M, z10.s, z0.s\n"
- "fmin z11.s, p5/M, z11.s, z0.s\n"
- "fmin z12.s, p5/M, z12.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z1.s\n"
- "fmax z9.s, p5/M, z9.s, z1.s\n"
- "fmax z10.s, p5/M, z10.s, z1.s\n"
- "fmax z11.s, p5/M, z11.s, z1.s\n"
- "fmax z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z0.s\n"
- "fmin z14.s, p5/M, z14.s, z0.s\n"
- "fmin z15.s, p5/M, z15.s, z0.s\n"
- "fmin z16.s, p5/M, z16.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z1.s\n"
- "fmax z14.s, p5/M, z14.s, z1.s\n"
- "fmax z15.s, p5/M, z15.s, z1.s\n"
- "fmax z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z0.s\n"
- "fmin z18.s, p5/M, z18.s, z0.s\n"
- "fmin z19.s, p5/M, z19.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z1.s\n"
- "fmax z18.s, p5/M, z18.s, z1.s\n"
- "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z20.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z21.s\n"
+ "fmin z9.s, p5/M, z9.s, z21.s\n"
+ "fmin z10.s, p5/M, z10.s, z21.s\n"
+ "fmin z11.s, p5/M, z11.s, z21.s\n"
+ "fmin z12.s, p5/M, z12.s, z21.s\n"
+ "fmin z13.s, p5/M, z13.s, z21.s\n"
+ "fmin z14.s, p5/M, z14.s, z21.s\n"
+ "fmin z15.s, p5/M, z15.s, z21.s\n"
+ "fmin z16.s, p5/M, z16.s, z21.s\n"
+ "fmin z17.s, p5/M, z17.s, z21.s\n"
+ "fmin z18.s, p5/M, z18.s, z21.s\n"
+ "fmin z19.s, p5/M, z19.s, z21.s\n"
+ "fmax z8.s, p5/M, z8.s, z20.s\n"
+ "fmax z9.s, p5/M, z9.s, z20.s\n"
+ "fmax z10.s, p5/M, z10.s, z20.s\n"
+ "fmax z11.s, p5/M, z11.s, z20.s\n"
+ "fmax z12.s, p5/M, z12.s, z20.s\n"
+ "fmax z13.s, p5/M, z13.s, z20.s\n"
+ "fmax z14.s, p5/M, z14.s, z20.s\n"
+ "fmax z15.s, p5/M, z15.s, z20.s\n"
+ "fmax z16.s, p5/M, z16.s, z20.s\n"
+ "fmax z17.s, p5/M, z17.s, z20.s\n"
+ "fmax z18.s, p5/M, z18.s, z20.s\n"
+ "fmax z19.s, p5/M, z19.s, z20.s\n"
"38:" // Height 3: No activation
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
"39:" // Height 3: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 28b\n"
"b 80f\n"
"40:" // Height 4
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"41:" // Height 4: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x11\n"
- "incw x19\n"
- "whilelt p3.s, x19, x11\n"
- "incw x19\n"
- "whilelt p2.s, x19, x11\n"
- "incw x19\n"
- "whilelt p1.s, x19, x11\n"
- "cbz x9, 42f\n"
- "ld1w { z8.s }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 42f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
- "mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
- "mov z20.d, z8.d\n"
- "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
- "mov z17.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
"b 44f\n"
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x24, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23]\n"
- "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21]\n"
+ "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 44f\n"
"43:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
@@ -887,333 +869,321 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z22.b, #0x0\n"
"mov z23.b, #0x0\n"
"44:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "cbnz x27, 47f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
"b 47f\n"
"46:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"47:" // Height 4: input setup done
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "whilelt p0.s, XZR, x26\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x25]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z3.s }, p0/Z, [x26]\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x4\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z25.s, z3.s[0]\n"
+ "fmla z12.s, z25.s, z2.s[0]\n"
+ "fmla z16.s, z25.s, z1.s[0]\n"
+ "fmla z20.s, z25.s, z0.s[0]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "fmla z9.s, z24.s, z3.s[0]\n"
+ "fmla z13.s, z24.s, z2.s[0]\n"
"add x24, x24, #0x10\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x22]\n"
"add x23, x23, #0x10\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x4\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z17.s, z24.s, z1.s[0]\n"
+ "fmla z21.s, z24.s, z0.s[0]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z25.s, z3.s[0]\n"
+ "fmla z14.s, z25.s, z2.s[0]\n"
+ "fmla z18.s, z25.s, z1.s[0]\n"
+ "fmla z22.s, z25.s, z0.s[0]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.s, z24.s, z3.s[0]\n"
+ "fmla z15.s, z24.s, z2.s[0]\n"
+ "fmla z19.s, z24.s, z1.s[0]\n"
+ "fmla z23.s, z24.s, z0.s[0]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.s, z25.s, z3.s[1]\n"
+ "fmla z12.s, z25.s, z2.s[1]\n"
+ "fmla z16.s, z25.s, z1.s[1]\n"
+ "fmla z20.s, z25.s, z0.s[1]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.s, z24.s, z3.s[1]\n"
+ "fmla z13.s, z24.s, z2.s[1]\n"
+ "fmla z17.s, z24.s, z1.s[1]\n"
+ "fmla z21.s, z24.s, z0.s[1]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z10.s, z25.s, z3.s[1]\n"
+ "fmla z14.s, z25.s, z2.s[1]\n"
+ "fmla z18.s, z25.s, z1.s[1]\n"
+ "fmla z22.s, z25.s, z0.s[1]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.s, z24.s, z3.s[1]\n"
+ "fmla z15.s, z24.s, z2.s[1]\n"
+ "fmla z19.s, z24.s, z1.s[1]\n"
+ "fmla z23.s, z24.s, z0.s[1]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.s, z25.s, z3.s[2]\n"
+ "fmla z12.s, z25.s, z2.s[2]\n"
+ "fmla z16.s, z25.s, z1.s[2]\n"
+ "fmla z20.s, z25.s, z0.s[2]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.s, z24.s, z3.s[2]\n"
+ "fmla z13.s, z24.s, z2.s[2]\n"
+ "fmla z17.s, z24.s, z1.s[2]\n"
+ "fmla z21.s, z24.s, z0.s[2]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.s, z25.s, z3.s[2]\n"
+ "fmla z14.s, z25.s, z2.s[2]\n"
+ "fmla z18.s, z25.s, z1.s[2]\n"
+ "fmla z22.s, z25.s, z0.s[2]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.s, z24.s, z3.s[2]\n"
+ "fmla z15.s, z24.s, z2.s[2]\n"
+ "fmla z19.s, z24.s, z1.s[2]\n"
+ "fmla z23.s, z24.s, z0.s[2]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.s, z25.s, z3.s[3]\n"
+ "fmla z12.s, z25.s, z2.s[3]\n"
+ "fmla z16.s, z25.s, z1.s[3]\n"
+ "fmla z20.s, z25.s, z0.s[3]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.s, z24.s, z3.s[3]\n"
+ "fmla z13.s, z24.s, z2.s[3]\n"
+ "fmla z17.s, z24.s, z1.s[3]\n"
+ "fmla z21.s, z24.s, z0.s[3]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.s, z25.s, z3.s[3]\n"
+ "fmla z14.s, z25.s, z2.s[3]\n"
+ "fmla z18.s, z25.s, z1.s[3]\n"
+ "fmla z22.s, z25.s, z0.s[3]\n"
+ "fmla z11.s, z24.s, z3.s[3]\n"
+ "fmla z15.s, z24.s, z2.s[3]\n"
+ "fmla z19.s, z24.s, z1.s[3]\n"
+ "fmla z23.s, z24.s, z0.s[3]\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "whilelt p0.s, XZR, x26\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x25]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "add x22, x22, #0x10\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z25.s, z0.s[0]\n"
+ "fmla z12.s, z25.s, z1.s[0]\n"
+ "fmla z16.s, z25.s, z2.s[0]\n"
+ "fmla z20.s, z25.s, z3.s[0]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z24.s, z0.s[0]\n"
+ "fmla z13.s, z24.s, z1.s[0]\n"
+ "fmla z17.s, z24.s, z2.s[0]\n"
+ "fmla z21.s, z24.s, z3.s[0]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
+ "fmla z10.s, z25.s, z0.s[0]\n"
+ "fmla z14.s, z25.s, z1.s[0]\n"
+ "fmla z18.s, z25.s, z2.s[0]\n"
+ "fmla z22.s, z25.s, z3.s[0]\n"
+ "fmla z11.s, z24.s, z0.s[0]\n"
+ "fmla z15.s, z24.s, z1.s[0]\n"
+ "fmla z19.s, z24.s, z2.s[0]\n"
+ "fmla z23.s, z24.s, z3.s[0]\n"
"ble 50f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z25.s, z0.s[1]\n"
+ "fmla z12.s, z25.s, z1.s[1]\n"
+ "fmla z16.s, z25.s, z2.s[1]\n"
+ "fmla z20.s, z25.s, z3.s[1]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.s, z24.s, z0.s[1]\n"
+ "fmla z13.s, z24.s, z1.s[1]\n"
+ "fmla z17.s, z24.s, z2.s[1]\n"
+ "fmla z21.s, z24.s, z3.s[1]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
+ "fmla z10.s, z25.s, z0.s[1]\n"
+ "fmla z14.s, z25.s, z1.s[1]\n"
+ "fmla z18.s, z25.s, z2.s[1]\n"
+ "fmla z22.s, z25.s, z3.s[1]\n"
+ "fmla z11.s, z24.s, z0.s[1]\n"
+ "fmla z15.s, z24.s, z1.s[1]\n"
+ "fmla z19.s, z24.s, z2.s[1]\n"
+ "fmla z23.s, z24.s, z3.s[1]\n"
"ble 50f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z25.s, z0.s[2]\n"
+ "fmla z12.s, z25.s, z1.s[2]\n"
+ "fmla z16.s, z25.s, z2.s[2]\n"
+ "fmla z20.s, z25.s, z3.s[2]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z9.s, z24.s, z0.s[2]\n"
+ "fmla z13.s, z24.s, z1.s[2]\n"
+ "fmla z17.s, z24.s, z2.s[2]\n"
+ "fmla z21.s, z24.s, z3.s[2]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
+ "fmla z10.s, z25.s, z0.s[2]\n"
+ "fmla z14.s, z25.s, z1.s[2]\n"
+ "fmla z18.s, z25.s, z2.s[2]\n"
+ "fmla z22.s, z25.s, z3.s[2]\n"
+ "fmla z11.s, z24.s, z0.s[2]\n"
+ "fmla z15.s, z24.s, z1.s[2]\n"
+ "fmla z19.s, z24.s, z2.s[2]\n"
+ "fmla z23.s, z24.s, z3.s[2]\n"
"ble 50f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z25.s }, p5/Z, [x10]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z25.s, z0.s[3]\n"
+ "fmla z12.s, z25.s, z1.s[3]\n"
+ "fmla z16.s, z25.s, z2.s[3]\n"
+ "fmla z20.s, z25.s, z3.s[3]\n"
+ "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z24.s, z0.s[3]\n"
+ "fmla z13.s, z24.s, z1.s[3]\n"
+ "fmla z17.s, z24.s, z2.s[3]\n"
+ "fmla z21.s, z24.s, z3.s[3]\n"
+ "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z10.s, z25.s, z0.s[3]\n"
+ "fmla z14.s, z25.s, z1.s[3]\n"
+ "fmla z18.s, z25.s, z2.s[3]\n"
+ "fmla z22.s, z25.s, z3.s[3]\n"
+ "fmla z11.s, z24.s, z0.s[3]\n"
+ "fmla z15.s, z24.s, z1.s[3]\n"
+ "fmla z19.s, z24.s, z2.s[3]\n"
+ "fmla z23.s, z24.s, z3.s[3]\n"
"50:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 45b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 51f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z0.s }, p5/Z, [x19]\n"
- "fmin z8.s, p5/M, z8.s, z0.s\n"
- "fmin z9.s, p5/M, z9.s, z0.s\n"
- "fmin z10.s, p5/M, z10.s, z0.s\n"
- "fmin z11.s, p5/M, z11.s, z0.s\n"
- "fmin z12.s, p5/M, z12.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z1.s\n"
- "fmax z9.s, p5/M, z9.s, z1.s\n"
- "fmax z10.s, p5/M, z10.s, z1.s\n"
- "fmax z11.s, p5/M, z11.s, z1.s\n"
- "fmax z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z0.s\n"
- "fmin z14.s, p5/M, z14.s, z0.s\n"
- "fmin z15.s, p5/M, z15.s, z0.s\n"
- "fmin z16.s, p5/M, z16.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z1.s\n"
- "fmax z14.s, p5/M, z14.s, z1.s\n"
- "fmax z15.s, p5/M, z15.s, z1.s\n"
- "fmax z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z0.s\n"
- "fmin z18.s, p5/M, z18.s, z0.s\n"
- "fmin z19.s, p5/M, z19.s, z0.s\n"
- "fmin z20.s, p5/M, z20.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z1.s\n"
- "fmax z18.s, p5/M, z18.s, z1.s\n"
- "fmax z19.s, p5/M, z19.s, z1.s\n"
- "fmax z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z0.s\n"
- "fmin z22.s, p5/M, z22.s, z0.s\n"
- "fmin z23.s, p5/M, z23.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z1.s\n"
- "fmax z22.s, p5/M, z22.s, z1.s\n"
- "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z25.s\n"
+ "fmin z9.s, p5/M, z9.s, z25.s\n"
+ "fmin z10.s, p5/M, z10.s, z25.s\n"
+ "fmin z11.s, p5/M, z11.s, z25.s\n"
+ "fmin z12.s, p5/M, z12.s, z25.s\n"
+ "fmin z13.s, p5/M, z13.s, z25.s\n"
+ "fmin z14.s, p5/M, z14.s, z25.s\n"
+ "fmin z15.s, p5/M, z15.s, z25.s\n"
+ "fmin z16.s, p5/M, z16.s, z25.s\n"
+ "fmin z17.s, p5/M, z17.s, z25.s\n"
+ "fmin z18.s, p5/M, z18.s, z25.s\n"
+ "fmin z19.s, p5/M, z19.s, z25.s\n"
+ "fmin z20.s, p5/M, z20.s, z25.s\n"
+ "fmin z21.s, p5/M, z21.s, z25.s\n"
+ "fmin z22.s, p5/M, z22.s, z25.s\n"
+ "fmin z23.s, p5/M, z23.s, z25.s\n"
+ "fmax z8.s, p5/M, z8.s, z24.s\n"
+ "fmax z9.s, p5/M, z9.s, z24.s\n"
+ "fmax z10.s, p5/M, z10.s, z24.s\n"
+ "fmax z11.s, p5/M, z11.s, z24.s\n"
+ "fmax z12.s, p5/M, z12.s, z24.s\n"
+ "fmax z13.s, p5/M, z13.s, z24.s\n"
+ "fmax z14.s, p5/M, z14.s, z24.s\n"
+ "fmax z15.s, p5/M, z15.s, z24.s\n"
+ "fmax z16.s, p5/M, z16.s, z24.s\n"
+ "fmax z17.s, p5/M, z17.s, z24.s\n"
+ "fmax z18.s, p5/M, z18.s, z24.s\n"
+ "fmax z19.s, p5/M, z19.s, z24.s\n"
+ "fmax z20.s, p5/M, z20.s, z24.s\n"
+ "fmax z21.s, p5/M, z21.s, z24.s\n"
+ "fmax z22.s, p5/M, z22.s, z24.s\n"
+ "fmax z23.s, p5/M, z23.s, z24.s\n"
"51:" // Height 4: No activation
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x22]\n"
- "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
"52:" // Height 4: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 41b\n"
"b 80f\n"
"53:" // Height 5
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
"54:" // Height 5: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x11\n"
- "incw x19\n"
- "whilelt p3.s, x19, x11\n"
- "incw x19\n"
- "whilelt p2.s, x19, x11\n"
- "incw x19\n"
- "whilelt p1.s, x19, x11\n"
- "cbz x9, 55f\n"
- "ld1w { z8.s }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 55f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
- "mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
- "mov z20.d, z8.d\n"
- "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
- "mov z17.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1224,31 +1194,31 @@ void sve_hybrid_fp32_mla_6x4VL (
"b 57f\n"
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x24, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x24]\n"
- "add x21, x22, x19, LSL #2\n"
- "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x23]\n"
- "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x22]\n"
- "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x21]\n"
- "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x20]\n"
+ "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 57f\n"
"56:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
@@ -1272,390 +1242,375 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
"57:" // Height 5: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "cbnz x27, 60f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 60f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
"b 60f\n"
"59:" // Height 5: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"60:" // Height 5: input setup done
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "whilelt p0.s, XZR, x26\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x25]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "ld1rqw { z3.s }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x4\n"
+ "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1rqw { z1.s }, p0/Z, [x23]\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqw { z0.s }, p0/Z, [x22]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z29.s, z4.s[0]\n"
+ "fmla z12.s, z29.s, z3.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.s, z29.s, z2.s[0]\n"
+ "fmla z20.s, z29.s, z1.s[0]\n"
"add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "fmla z24.s, z29.s, z0.s[0]\n"
+ "fmla z9.s, z28.s, z4.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
"add x24, x24, #0x10\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "fmla z13.s, z28.s, z3.s[0]\n"
+ "fmla z17.s, z28.s, z2.s[0]\n"
"add x23, x23, #0x10\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x21, x21, #0x10\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x4\n"
- "fmla z24.s, z6.s, z4.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "fmla z25.s, z7.s, z4.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z26.s, z6.s, z4.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "fmla z27.s, z7.s, z4.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "fmla z24.s, z6.s, z4.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "fmla z25.s, z7.s, z4.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z21.s, z28.s, z1.s[0]\n"
+ "fmla z25.s, z28.s, z0.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z29.s, z4.s[0]\n"
+ "fmla z14.s, z29.s, z3.s[0]\n"
+ "fmla z18.s, z29.s, z2.s[0]\n"
+ "fmla z22.s, z29.s, z1.s[0]\n"
+ "fmla z26.s, z29.s, z0.s[0]\n"
+ "fmla z11.s, z28.s, z4.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z15.s, z28.s, z3.s[0]\n"
+ "fmla z19.s, z28.s, z2.s[0]\n"
+ "fmla z23.s, z28.s, z1.s[0]\n"
+ "fmla z27.s, z28.s, z0.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.s, z29.s, z4.s[1]\n"
+ "fmla z12.s, z29.s, z3.s[1]\n"
+ "fmla z16.s, z29.s, z2.s[1]\n"
+ "fmla z20.s, z29.s, z1.s[1]\n"
+ "fmla z24.s, z29.s, z0.s[1]\n"
+ "fmla z9.s, z28.s, z4.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z13.s, z28.s, z3.s[1]\n"
+ "fmla z17.s, z28.s, z2.s[1]\n"
+ "fmla z21.s, z28.s, z1.s[1]\n"
+ "fmla z25.s, z28.s, z0.s[1]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z26.s, z6.s, z4.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "fmla z27.s, z7.s, z4.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "fmla z24.s, z6.s, z4.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "fmla z25.s, z7.s, z4.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z26.s, z6.s, z4.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "fmla z27.s, z7.s, z4.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "fmla z24.s, z6.s, z4.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "fmla z25.s, z7.s, z4.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z26.s, z6.s, z4.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
- "fmla z27.s, z7.s, z4.s[3]\n"
+ "fmla z10.s, z29.s, z4.s[1]\n"
+ "fmla z14.s, z29.s, z3.s[1]\n"
+ "fmla z18.s, z29.s, z2.s[1]\n"
+ "fmla z22.s, z29.s, z1.s[1]\n"
+ "fmla z26.s, z29.s, z0.s[1]\n"
+ "fmla z11.s, z28.s, z4.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z15.s, z28.s, z3.s[1]\n"
+ "fmla z19.s, z28.s, z2.s[1]\n"
+ "fmla z23.s, z28.s, z1.s[1]\n"
+ "fmla z27.s, z28.s, z0.s[1]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.s, z29.s, z4.s[2]\n"
+ "fmla z12.s, z29.s, z3.s[2]\n"
+ "fmla z16.s, z29.s, z2.s[2]\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z24.s, z29.s, z0.s[2]\n"
+ "fmla z9.s, z28.s, z4.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z13.s, z28.s, z3.s[2]\n"
+ "fmla z17.s, z28.s, z2.s[2]\n"
+ "fmla z21.s, z28.s, z1.s[2]\n"
+ "fmla z25.s, z28.s, z0.s[2]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.s, z29.s, z4.s[2]\n"
+ "fmla z14.s, z29.s, z3.s[2]\n"
+ "fmla z18.s, z29.s, z2.s[2]\n"
+ "fmla z22.s, z29.s, z1.s[2]\n"
+ "fmla z26.s, z29.s, z0.s[2]\n"
+ "fmla z11.s, z28.s, z4.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z15.s, z28.s, z3.s[2]\n"
+ "fmla z19.s, z28.s, z2.s[2]\n"
+ "fmla z23.s, z28.s, z1.s[2]\n"
+ "fmla z27.s, z28.s, z0.s[2]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.s, z29.s, z4.s[3]\n"
+ "fmla z12.s, z29.s, z3.s[3]\n"
+ "fmla z16.s, z29.s, z2.s[3]\n"
+ "fmla z20.s, z29.s, z1.s[3]\n"
+ "fmla z24.s, z29.s, z0.s[3]\n"
+ "fmla z9.s, z28.s, z4.s[3]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z13.s, z28.s, z3.s[3]\n"
+ "fmla z17.s, z28.s, z2.s[3]\n"
+ "fmla z21.s, z28.s, z1.s[3]\n"
+ "fmla z25.s, z28.s, z0.s[3]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.s, z29.s, z4.s[3]\n"
+ "fmla z14.s, z29.s, z3.s[3]\n"
+ "fmla z18.s, z29.s, z2.s[3]\n"
+ "fmla z22.s, z29.s, z1.s[3]\n"
+ "fmla z26.s, z29.s, z0.s[3]\n"
+ "fmla z11.s, z28.s, z4.s[3]\n"
+ "fmla z15.s, z28.s, z3.s[3]\n"
+ "fmla z19.s, z28.s, z2.s[3]\n"
+ "fmla z23.s, z28.s, z1.s[3]\n"
+ "fmla z27.s, z28.s, z0.s[3]\n"
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "whilelt p0.s, XZR, x26\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x25]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "add x21, x21, #0x10\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
- "fmla z24.s, z6.s, z4.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "fmla z25.s, z7.s, z4.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "fmla z8.s, z29.s, z0.s[0]\n"
+ "fmla z12.s, z29.s, z1.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z16.s, z29.s, z2.s[0]\n"
+ "fmla z20.s, z29.s, z3.s[0]\n"
+ "fmla z24.s, z29.s, z4.s[0]\n"
+ "fmla z9.s, z28.s, z0.s[0]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z28.s, z1.s[0]\n"
+ "fmla z17.s, z28.s, z2.s[0]\n"
+ "fmla z21.s, z28.s, z3.s[0]\n"
+ "fmla z25.s, z28.s, z4.s[0]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z26.s, z6.s, z4.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "fmla z27.s, z7.s, z4.s[0]\n"
+ "fmla z10.s, z29.s, z0.s[0]\n"
+ "fmla z14.s, z29.s, z1.s[0]\n"
+ "fmla z18.s, z29.s, z2.s[0]\n"
+ "fmla z22.s, z29.s, z3.s[0]\n"
+ "fmla z26.s, z29.s, z4.s[0]\n"
+ "fmla z11.s, z28.s, z0.s[0]\n"
+ "fmla z15.s, z28.s, z1.s[0]\n"
+ "fmla z19.s, z28.s, z2.s[0]\n"
+ "fmla z23.s, z28.s, z3.s[0]\n"
+ "fmla z27.s, z28.s, z4.s[0]\n"
"ble 63f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "fmla z24.s, z6.s, z4.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "fmla z25.s, z7.s, z4.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z29.s, z0.s[1]\n"
+ "fmla z12.s, z29.s, z1.s[1]\n"
+ "fmla z16.s, z29.s, z2.s[1]\n"
+ "fmla z20.s, z29.s, z3.s[1]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.s, z29.s, z4.s[1]\n"
+ "fmla z9.s, z28.s, z0.s[1]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z28.s, z1.s[1]\n"
+ "fmla z17.s, z28.s, z2.s[1]\n"
+ "fmla z21.s, z28.s, z3.s[1]\n"
+ "fmla z25.s, z28.s, z4.s[1]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z26.s, z6.s, z4.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "fmla z27.s, z7.s, z4.s[1]\n"
+ "fmla z10.s, z29.s, z0.s[1]\n"
+ "fmla z14.s, z29.s, z1.s[1]\n"
+ "fmla z18.s, z29.s, z2.s[1]\n"
+ "fmla z22.s, z29.s, z3.s[1]\n"
+ "fmla z26.s, z29.s, z4.s[1]\n"
+ "fmla z11.s, z28.s, z0.s[1]\n"
+ "fmla z15.s, z28.s, z1.s[1]\n"
+ "fmla z19.s, z28.s, z2.s[1]\n"
+ "fmla z23.s, z28.s, z3.s[1]\n"
+ "fmla z27.s, z28.s, z4.s[1]\n"
"ble 63f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "fmla z24.s, z6.s, z4.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "fmla z25.s, z7.s, z4.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z29.s, z0.s[2]\n"
+ "fmla z12.s, z29.s, z1.s[2]\n"
+ "fmla z16.s, z29.s, z2.s[2]\n"
+ "fmla z20.s, z29.s, z3.s[2]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.s, z29.s, z4.s[2]\n"
+ "fmla z9.s, z28.s, z0.s[2]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z28.s, z1.s[2]\n"
+ "fmla z17.s, z28.s, z2.s[2]\n"
+ "fmla z21.s, z28.s, z3.s[2]\n"
+ "fmla z25.s, z28.s, z4.s[2]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z26.s, z6.s, z4.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "fmla z27.s, z7.s, z4.s[2]\n"
+ "fmla z10.s, z29.s, z0.s[2]\n"
+ "fmla z14.s, z29.s, z1.s[2]\n"
+ "fmla z18.s, z29.s, z2.s[2]\n"
+ "fmla z22.s, z29.s, z3.s[2]\n"
+ "fmla z26.s, z29.s, z4.s[2]\n"
+ "fmla z11.s, z28.s, z0.s[2]\n"
+ "fmla z15.s, z28.s, z1.s[2]\n"
+ "fmla z19.s, z28.s, z2.s[2]\n"
+ "fmla z23.s, z28.s, z3.s[2]\n"
+ "fmla z27.s, z28.s, z4.s[2]\n"
"ble 63f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "fmla z24.s, z6.s, z4.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "fmla z25.s, z7.s, z4.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z29.s }, p5/Z, [x10]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z29.s, z0.s[3]\n"
+ "fmla z12.s, z29.s, z1.s[3]\n"
+ "fmla z16.s, z29.s, z2.s[3]\n"
+ "fmla z20.s, z29.s, z3.s[3]\n"
+ "fmla z24.s, z29.s, z4.s[3]\n"
+ "fmla z9.s, z28.s, z0.s[3]\n"
+ "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z13.s, z28.s, z1.s[3]\n"
+ "fmla z17.s, z28.s, z2.s[3]\n"
+ "fmla z21.s, z28.s, z3.s[3]\n"
+ "fmla z25.s, z28.s, z4.s[3]\n"
+ "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z26.s, z6.s, z4.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
- "fmla z27.s, z7.s, z4.s[3]\n"
+ "fmla z10.s, z29.s, z0.s[3]\n"
+ "fmla z14.s, z29.s, z1.s[3]\n"
+ "fmla z18.s, z29.s, z2.s[3]\n"
+ "fmla z22.s, z29.s, z3.s[3]\n"
+ "fmla z26.s, z29.s, z4.s[3]\n"
+ "fmla z11.s, z28.s, z0.s[3]\n"
+ "fmla z15.s, z28.s, z1.s[3]\n"
+ "fmla z19.s, z28.s, z2.s[3]\n"
+ "fmla z23.s, z28.s, z3.s[3]\n"
+ "fmla z27.s, z28.s, z4.s[3]\n"
"63:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 58b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 64f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z0.s }, p5/Z, [x19]\n"
- "fmin z8.s, p5/M, z8.s, z0.s\n"
- "fmin z9.s, p5/M, z9.s, z0.s\n"
- "fmin z10.s, p5/M, z10.s, z0.s\n"
- "fmin z11.s, p5/M, z11.s, z0.s\n"
- "fmin z12.s, p5/M, z12.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z1.s\n"
- "fmax z9.s, p5/M, z9.s, z1.s\n"
- "fmax z10.s, p5/M, z10.s, z1.s\n"
- "fmax z11.s, p5/M, z11.s, z1.s\n"
- "fmax z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z0.s\n"
- "fmin z14.s, p5/M, z14.s, z0.s\n"
- "fmin z15.s, p5/M, z15.s, z0.s\n"
- "fmin z16.s, p5/M, z16.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z1.s\n"
- "fmax z14.s, p5/M, z14.s, z1.s\n"
- "fmax z15.s, p5/M, z15.s, z1.s\n"
- "fmax z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z0.s\n"
- "fmin z18.s, p5/M, z18.s, z0.s\n"
- "fmin z19.s, p5/M, z19.s, z0.s\n"
- "fmin z20.s, p5/M, z20.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z1.s\n"
- "fmax z18.s, p5/M, z18.s, z1.s\n"
- "fmax z19.s, p5/M, z19.s, z1.s\n"
- "fmax z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z0.s\n"
- "fmin z22.s, p5/M, z22.s, z0.s\n"
- "fmin z23.s, p5/M, z23.s, z0.s\n"
- "fmin z24.s, p5/M, z24.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z1.s\n"
- "fmax z22.s, p5/M, z22.s, z1.s\n"
- "fmax z23.s, p5/M, z23.s, z1.s\n"
- "fmax z24.s, p5/M, z24.s, z1.s\n"
- "fmin z25.s, p5/M, z25.s, z0.s\n"
- "fmin z26.s, p5/M, z26.s, z0.s\n"
- "fmin z27.s, p5/M, z27.s, z0.s\n"
- "fmax z25.s, p5/M, z25.s, z1.s\n"
- "fmax z26.s, p5/M, z26.s, z1.s\n"
- "fmax z27.s, p5/M, z27.s, z1.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z29.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z28.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z29.s\n"
+ "fmin z9.s, p5/M, z9.s, z29.s\n"
+ "fmin z10.s, p5/M, z10.s, z29.s\n"
+ "fmin z11.s, p5/M, z11.s, z29.s\n"
+ "fmin z12.s, p5/M, z12.s, z29.s\n"
+ "fmin z13.s, p5/M, z13.s, z29.s\n"
+ "fmin z14.s, p5/M, z14.s, z29.s\n"
+ "fmin z15.s, p5/M, z15.s, z29.s\n"
+ "fmin z16.s, p5/M, z16.s, z29.s\n"
+ "fmin z17.s, p5/M, z17.s, z29.s\n"
+ "fmin z18.s, p5/M, z18.s, z29.s\n"
+ "fmin z19.s, p5/M, z19.s, z29.s\n"
+ "fmin z20.s, p5/M, z20.s, z29.s\n"
+ "fmin z21.s, p5/M, z21.s, z29.s\n"
+ "fmin z22.s, p5/M, z22.s, z29.s\n"
+ "fmin z23.s, p5/M, z23.s, z29.s\n"
+ "fmin z24.s, p5/M, z24.s, z29.s\n"
+ "fmin z25.s, p5/M, z25.s, z29.s\n"
+ "fmin z26.s, p5/M, z26.s, z29.s\n"
+ "fmin z27.s, p5/M, z27.s, z29.s\n"
+ "fmax z8.s, p5/M, z8.s, z28.s\n"
+ "fmax z9.s, p5/M, z9.s, z28.s\n"
+ "fmax z10.s, p5/M, z10.s, z28.s\n"
+ "fmax z11.s, p5/M, z11.s, z28.s\n"
+ "fmax z12.s, p5/M, z12.s, z28.s\n"
+ "fmax z13.s, p5/M, z13.s, z28.s\n"
+ "fmax z14.s, p5/M, z14.s, z28.s\n"
+ "fmax z15.s, p5/M, z15.s, z28.s\n"
+ "fmax z16.s, p5/M, z16.s, z28.s\n"
+ "fmax z17.s, p5/M, z17.s, z28.s\n"
+ "fmax z18.s, p5/M, z18.s, z28.s\n"
+ "fmax z19.s, p5/M, z19.s, z28.s\n"
+ "fmax z20.s, p5/M, z20.s, z28.s\n"
+ "fmax z21.s, p5/M, z21.s, z28.s\n"
+ "fmax z22.s, p5/M, z22.s, z28.s\n"
+ "fmax z23.s, p5/M, z23.s, z28.s\n"
+ "fmax z24.s, p5/M, z24.s, z28.s\n"
+ "fmax z25.s, p5/M, z25.s, z28.s\n"
+ "fmax z26.s, p5/M, z26.s, z28.s\n"
+ "fmax z27.s, p5/M, z27.s, z28.s\n"
"64:" // Height 5: No activation
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x22]\n"
- "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x21]\n"
- "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
"65:" // Height 5: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 54b\n"
"b 80f\n"
"66:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "mov x12, %x[bias]\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[bias]\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x19, #0x18\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"67:" // Height 6: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x11\n"
- "incw x19\n"
- "whilelt p3.s, x19, x11\n"
- "incw x19\n"
- "whilelt p2.s, x19, x11\n"
- "incw x19\n"
- "whilelt p1.s, x19, x11\n"
- "cbz x9, 68f\n"
- "ld1w { z8.s }, p5/Z, [x9]\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 68f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
- "mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
- "mov z20.d, z8.d\n"
- "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
- "mov z17.d, z9.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "addvl x12, x12, #4\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
@@ -1670,18 +1625,18 @@ void sve_hybrid_fp32_mla_6x4VL (
"b 70f\n"
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x24, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x24, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x24]\n"
- "add x21, x22, x19, LSL #2\n"
"ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
- "add x20, x21, x19, LSL #2\n"
"ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
"ld1w { z16.s }, p4/Z, [x23]\n"
@@ -1727,429 +1682,410 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z30.b, #0x0\n"
"mov z31.b, #0x0\n"
"70:" // Height 6: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x27, 73f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "add x20, x20, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 73f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
"b 73f\n"
"72:" // Height 6: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"73:" // Height 6: input setup done
- "cmp x26, #0x4\n"
+ "cmp x27, #0x4\n"
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "whilelt p0.s, XZR, x26\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "sub x26, x26, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x25]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z7.s }, p0/Z, [x26]\n"
+ "ld1rqw { z6.s }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x4\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "ld1w { z1.s }, p5/Z, [x10]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z1.s, z7.s[0]\n"
+ "fmla z12.s, z1.s, z6.s[0]\n"
+ "fmla z16.s, z1.s, z5.s[0]\n"
+ "fmla z20.s, z1.s, z4.s[0]\n"
"add x23, x23, #0x10\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z3.s[0]\n"
+ "fmla z28.s, z1.s, z2.s[0]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #2, MUL VL]\n"
"add x21, x21, #0x10\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x20, x20, #0x10\n"
- "fmla z24.s, z6.s, z4.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x4\n"
- "fmla z28.s, z6.s, z5.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "fmla z25.s, z7.s, z4.s[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "fmla z29.s, z7.s, z5.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z26.s, z6.s, z4.s[0]\n"
- "fmla z30.s, z6.s, z5.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "fmla z27.s, z7.s, z4.s[0]\n"
- "fmla z31.s, z7.s, z5.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "fmla z24.s, z6.s, z4.s[1]\n"
- "fmla z28.s, z6.s, z5.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "fmla z25.s, z7.s, z4.s[1]\n"
- "fmla z29.s, z7.s, z5.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[0]\n"
+ "fmla z13.s, z0.s, z6.s[0]\n"
+ "fmla z17.s, z0.s, z5.s[0]\n"
+ "fmla z21.s, z0.s, z4.s[0]\n"
+ "fmla z25.s, z0.s, z3.s[0]\n"
+ "fmla z29.s, z0.s, z2.s[0]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "fmla z10.s, z1.s, z7.s[0]\n"
+ "fmla z14.s, z1.s, z6.s[0]\n"
+ "fmla z18.s, z1.s, z5.s[0]\n"
+ "fmla z22.s, z1.s, z4.s[0]\n"
+ "fmla z26.s, z1.s, z3.s[0]\n"
+ "fmla z30.s, z1.s, z2.s[0]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "fmla z11.s, z0.s, z7.s[0]\n"
+ "fmla z15.s, z0.s, z6.s[0]\n"
+ "fmla z19.s, z0.s, z5.s[0]\n"
+ "fmla z23.s, z0.s, z4.s[0]\n"
+ "fmla z27.s, z0.s, z3.s[0]\n"
+ "fmla z31.s, z0.s, z2.s[0]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #5, MUL VL]\n"
+ "fmla z8.s, z1.s, z7.s[1]\n"
+ "fmla z12.s, z1.s, z6.s[1]\n"
+ "fmla z16.s, z1.s, z5.s[1]\n"
+ "fmla z20.s, z1.s, z4.s[1]\n"
+ "fmla z24.s, z1.s, z3.s[1]\n"
+ "fmla z28.s, z1.s, z2.s[1]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #6, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[1]\n"
+ "fmla z13.s, z0.s, z6.s[1]\n"
+ "fmla z17.s, z0.s, z5.s[1]\n"
+ "fmla z21.s, z0.s, z4.s[1]\n"
+ "fmla z25.s, z0.s, z3.s[1]\n"
+ "fmla z29.s, z0.s, z2.s[1]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #7, MUL VL]\n"
"addvl x10, x10, #16\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z26.s, z6.s, z4.s[1]\n"
- "fmla z30.s, z6.s, z5.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "fmla z27.s, z7.s, z4.s[1]\n"
- "fmla z31.s, z7.s, z5.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "fmla z24.s, z6.s, z4.s[2]\n"
- "fmla z28.s, z6.s, z5.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "fmla z25.s, z7.s, z4.s[2]\n"
- "fmla z29.s, z7.s, z5.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z26.s, z6.s, z4.s[2]\n"
- "fmla z30.s, z6.s, z5.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "fmla z27.s, z7.s, z4.s[2]\n"
- "fmla z31.s, z7.s, z5.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "fmla z24.s, z6.s, z4.s[3]\n"
- "fmla z28.s, z6.s, z5.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "fmla z25.s, z7.s, z4.s[3]\n"
- "fmla z29.s, z7.s, z5.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z26.s, z6.s, z4.s[3]\n"
- "fmla z30.s, z6.s, z5.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
- "fmla z27.s, z7.s, z4.s[3]\n"
- "fmla z31.s, z7.s, z5.s[3]\n"
+ "fmla z10.s, z1.s, z7.s[1]\n"
+ "fmla z14.s, z1.s, z6.s[1]\n"
+ "fmla z18.s, z1.s, z5.s[1]\n"
+ "fmla z22.s, z1.s, z4.s[1]\n"
+ "fmla z26.s, z1.s, z3.s[1]\n"
+ "fmla z30.s, z1.s, z2.s[1]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #-8, MUL VL]\n"
+ "fmla z11.s, z0.s, z7.s[1]\n"
+ "fmla z15.s, z0.s, z6.s[1]\n"
+ "fmla z19.s, z0.s, z5.s[1]\n"
+ "fmla z23.s, z0.s, z4.s[1]\n"
+ "fmla z27.s, z0.s, z3.s[1]\n"
+ "fmla z31.s, z0.s, z2.s[1]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #-7, MUL VL]\n"
+ "fmla z8.s, z1.s, z7.s[2]\n"
+ "fmla z12.s, z1.s, z6.s[2]\n"
+ "fmla z16.s, z1.s, z5.s[2]\n"
+ "fmla z20.s, z1.s, z4.s[2]\n"
+ "fmla z24.s, z1.s, z3.s[2]\n"
+ "fmla z28.s, z1.s, z2.s[2]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #-6, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[2]\n"
+ "fmla z13.s, z0.s, z6.s[2]\n"
+ "fmla z17.s, z0.s, z5.s[2]\n"
+ "fmla z21.s, z0.s, z4.s[2]\n"
+ "fmla z25.s, z0.s, z3.s[2]\n"
+ "fmla z29.s, z0.s, z2.s[2]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #-5, MUL VL]\n"
+ "fmla z10.s, z1.s, z7.s[2]\n"
+ "fmla z14.s, z1.s, z6.s[2]\n"
+ "fmla z18.s, z1.s, z5.s[2]\n"
+ "fmla z22.s, z1.s, z4.s[2]\n"
+ "fmla z26.s, z1.s, z3.s[2]\n"
+ "fmla z30.s, z1.s, z2.s[2]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #-4, MUL VL]\n"
+ "fmla z11.s, z0.s, z7.s[2]\n"
+ "fmla z15.s, z0.s, z6.s[2]\n"
+ "fmla z19.s, z0.s, z5.s[2]\n"
+ "fmla z23.s, z0.s, z4.s[2]\n"
+ "fmla z27.s, z0.s, z3.s[2]\n"
+ "fmla z31.s, z0.s, z2.s[2]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #-3, MUL VL]\n"
+ "fmla z8.s, z1.s, z7.s[3]\n"
+ "fmla z12.s, z1.s, z6.s[3]\n"
+ "fmla z16.s, z1.s, z5.s[3]\n"
+ "fmla z20.s, z1.s, z4.s[3]\n"
+ "fmla z24.s, z1.s, z3.s[3]\n"
+ "fmla z28.s, z1.s, z2.s[3]\n"
+ "ld1w { z1.s }, p5/Z, [x10, #-2, MUL VL]\n"
+ "fmla z9.s, z0.s, z7.s[3]\n"
+ "fmla z13.s, z0.s, z6.s[3]\n"
+ "fmla z17.s, z0.s, z5.s[3]\n"
+ "fmla z21.s, z0.s, z4.s[3]\n"
+ "fmla z25.s, z0.s, z3.s[3]\n"
+ "fmla z29.s, z0.s, z2.s[3]\n"
+ "ld1w { z0.s }, p5/Z, [x10, #-1, MUL VL]\n"
+ "fmla z10.s, z1.s, z7.s[3]\n"
+ "fmla z14.s, z1.s, z6.s[3]\n"
+ "fmla z18.s, z1.s, z5.s[3]\n"
+ "fmla z22.s, z1.s, z4.s[3]\n"
+ "fmla z26.s, z1.s, z3.s[3]\n"
+ "fmla z30.s, z1.s, z2.s[3]\n"
+ "fmla z11.s, z0.s, z7.s[3]\n"
+ "fmla z15.s, z0.s, z6.s[3]\n"
+ "fmla z19.s, z0.s, z5.s[3]\n"
+ "fmla z23.s, z0.s, z4.s[3]\n"
+ "fmla z27.s, z0.s, z3.s[3]\n"
+ "fmla z31.s, z0.s, z2.s[3]\n"
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "whilelt p0.s, XZR, x26\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x25]\n"
- "fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
- "fmla z20.s, z6.s, z3.s[0]\n"
- "add x20, x20, #0x10\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "fmla z24.s, z6.s, z4.s[0]\n"
- "fmla z28.s, z6.s, z5.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z21.s, z7.s, z3.s[0]\n"
- "fmla z25.s, z7.s, z4.s[0]\n"
- "fmla z29.s, z7.s, z5.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "ld1rqw { z5.s }, p0/Z, [x21]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z7.s, z0.s[0]\n"
+ "fmla z12.s, z7.s, z1.s[0]\n"
+ "fmla z16.s, z7.s, z2.s[0]\n"
+ "fmla z20.s, z7.s, z3.s[0]\n"
+ "fmla z24.s, z7.s, z4.s[0]\n"
+ "fmla z28.s, z7.s, z5.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z6.s, z0.s[0]\n"
+ "fmla z13.s, z6.s, z1.s[0]\n"
+ "fmla z17.s, z6.s, z2.s[0]\n"
+ "fmla z21.s, z6.s, z3.s[0]\n"
+ "fmla z25.s, z6.s, z4.s[0]\n"
+ "fmla z29.s, z6.s, z5.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[0]\n"
- "fmla z14.s, z6.s, z1.s[0]\n"
- "fmla z18.s, z6.s, z2.s[0]\n"
- "fmla z22.s, z6.s, z3.s[0]\n"
- "fmla z26.s, z6.s, z4.s[0]\n"
- "fmla z30.s, z6.s, z5.s[0]\n"
- "fmla z11.s, z7.s, z0.s[0]\n"
- "fmla z15.s, z7.s, z1.s[0]\n"
- "fmla z19.s, z7.s, z2.s[0]\n"
- "fmla z23.s, z7.s, z3.s[0]\n"
- "fmla z27.s, z7.s, z4.s[0]\n"
- "fmla z31.s, z7.s, z5.s[0]\n"
+ "fmla z10.s, z7.s, z0.s[0]\n"
+ "fmla z14.s, z7.s, z1.s[0]\n"
+ "fmla z18.s, z7.s, z2.s[0]\n"
+ "fmla z22.s, z7.s, z3.s[0]\n"
+ "fmla z26.s, z7.s, z4.s[0]\n"
+ "fmla z30.s, z7.s, z5.s[0]\n"
+ "fmla z11.s, z6.s, z0.s[0]\n"
+ "fmla z15.s, z6.s, z1.s[0]\n"
+ "fmla z19.s, z6.s, z2.s[0]\n"
+ "fmla z23.s, z6.s, z3.s[0]\n"
+ "fmla z27.s, z6.s, z4.s[0]\n"
+ "fmla z31.s, z6.s, z5.s[0]\n"
"ble 76f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.s, z6.s, z1.s[1]\n"
- "fmla z16.s, z6.s, z2.s[1]\n"
- "fmla z20.s, z6.s, z3.s[1]\n"
- "fmla z24.s, z6.s, z4.s[1]\n"
- "fmla z28.s, z6.s, z5.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[1]\n"
- "fmla z13.s, z7.s, z1.s[1]\n"
- "fmla z17.s, z7.s, z2.s[1]\n"
- "fmla z21.s, z7.s, z3.s[1]\n"
- "fmla z25.s, z7.s, z4.s[1]\n"
- "fmla z29.s, z7.s, z5.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z7.s, z0.s[1]\n"
+ "fmla z12.s, z7.s, z1.s[1]\n"
+ "fmla z16.s, z7.s, z2.s[1]\n"
+ "fmla z20.s, z7.s, z3.s[1]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.s, z7.s, z4.s[1]\n"
+ "fmla z28.s, z7.s, z5.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z6.s, z0.s[1]\n"
+ "fmla z13.s, z6.s, z1.s[1]\n"
+ "fmla z17.s, z6.s, z2.s[1]\n"
+ "fmla z21.s, z6.s, z3.s[1]\n"
+ "fmla z25.s, z6.s, z4.s[1]\n"
+ "fmla z29.s, z6.s, z5.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[1]\n"
- "fmla z14.s, z6.s, z1.s[1]\n"
- "fmla z18.s, z6.s, z2.s[1]\n"
- "fmla z22.s, z6.s, z3.s[1]\n"
- "fmla z26.s, z6.s, z4.s[1]\n"
- "fmla z30.s, z6.s, z5.s[1]\n"
- "fmla z11.s, z7.s, z0.s[1]\n"
- "fmla z15.s, z7.s, z1.s[1]\n"
- "fmla z19.s, z7.s, z2.s[1]\n"
- "fmla z23.s, z7.s, z3.s[1]\n"
- "fmla z27.s, z7.s, z4.s[1]\n"
- "fmla z31.s, z7.s, z5.s[1]\n"
+ "fmla z10.s, z7.s, z0.s[1]\n"
+ "fmla z14.s, z7.s, z1.s[1]\n"
+ "fmla z18.s, z7.s, z2.s[1]\n"
+ "fmla z22.s, z7.s, z3.s[1]\n"
+ "fmla z26.s, z7.s, z4.s[1]\n"
+ "fmla z30.s, z7.s, z5.s[1]\n"
+ "fmla z11.s, z6.s, z0.s[1]\n"
+ "fmla z15.s, z6.s, z1.s[1]\n"
+ "fmla z19.s, z6.s, z2.s[1]\n"
+ "fmla z23.s, z6.s, z3.s[1]\n"
+ "fmla z27.s, z6.s, z4.s[1]\n"
+ "fmla z31.s, z6.s, z5.s[1]\n"
"ble 76f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "subs x26, x26, #0x1\n"
- "fmla z12.s, z6.s, z1.s[2]\n"
- "fmla z16.s, z6.s, z2.s[2]\n"
- "fmla z20.s, z6.s, z3.s[2]\n"
- "fmla z24.s, z6.s, z4.s[2]\n"
- "fmla z28.s, z6.s, z5.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[2]\n"
- "fmla z13.s, z7.s, z1.s[2]\n"
- "fmla z17.s, z7.s, z2.s[2]\n"
- "fmla z21.s, z7.s, z3.s[2]\n"
- "fmla z25.s, z7.s, z4.s[2]\n"
- "fmla z29.s, z7.s, z5.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z7.s, z0.s[2]\n"
+ "fmla z12.s, z7.s, z1.s[2]\n"
+ "fmla z16.s, z7.s, z2.s[2]\n"
+ "fmla z20.s, z7.s, z3.s[2]\n"
+ "subs x27, x27, #0x1\n"
+ "fmla z24.s, z7.s, z4.s[2]\n"
+ "fmla z28.s, z7.s, z5.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z6.s, z0.s[2]\n"
+ "fmla z13.s, z6.s, z1.s[2]\n"
+ "fmla z17.s, z6.s, z2.s[2]\n"
+ "fmla z21.s, z6.s, z3.s[2]\n"
+ "fmla z25.s, z6.s, z4.s[2]\n"
+ "fmla z29.s, z6.s, z5.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[2]\n"
- "fmla z14.s, z6.s, z1.s[2]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z22.s, z6.s, z3.s[2]\n"
- "fmla z26.s, z6.s, z4.s[2]\n"
- "fmla z30.s, z6.s, z5.s[2]\n"
- "fmla z11.s, z7.s, z0.s[2]\n"
- "fmla z15.s, z7.s, z1.s[2]\n"
- "fmla z19.s, z7.s, z2.s[2]\n"
- "fmla z23.s, z7.s, z3.s[2]\n"
- "fmla z27.s, z7.s, z4.s[2]\n"
- "fmla z31.s, z7.s, z5.s[2]\n"
+ "fmla z10.s, z7.s, z0.s[2]\n"
+ "fmla z14.s, z7.s, z1.s[2]\n"
+ "fmla z18.s, z7.s, z2.s[2]\n"
+ "fmla z22.s, z7.s, z3.s[2]\n"
+ "fmla z26.s, z7.s, z4.s[2]\n"
+ "fmla z30.s, z7.s, z5.s[2]\n"
+ "fmla z11.s, z6.s, z0.s[2]\n"
+ "fmla z15.s, z6.s, z1.s[2]\n"
+ "fmla z19.s, z6.s, z2.s[2]\n"
+ "fmla z23.s, z6.s, z3.s[2]\n"
+ "fmla z27.s, z6.s, z4.s[2]\n"
+ "fmla z31.s, z6.s, z5.s[2]\n"
"ble 76f\n"
- "ld1w { z6.s }, p5/Z, [x10]\n"
- "fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
- "fmla z12.s, z6.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z2.s[3]\n"
- "fmla z20.s, z6.s, z3.s[3]\n"
- "fmla z24.s, z6.s, z4.s[3]\n"
- "fmla z28.s, z6.s, z5.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z9.s, z7.s, z0.s[3]\n"
- "fmla z13.s, z7.s, z1.s[3]\n"
- "fmla z17.s, z7.s, z2.s[3]\n"
- "fmla z21.s, z7.s, z3.s[3]\n"
- "fmla z25.s, z7.s, z4.s[3]\n"
- "fmla z29.s, z7.s, z5.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "fmla z8.s, z7.s, z0.s[3]\n"
+ "fmla z12.s, z7.s, z1.s[3]\n"
+ "fmla z16.s, z7.s, z2.s[3]\n"
+ "fmla z20.s, z7.s, z3.s[3]\n"
+ "fmla z24.s, z7.s, z4.s[3]\n"
+ "fmla z28.s, z7.s, z5.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z9.s, z6.s, z0.s[3]\n"
+ "fmla z13.s, z6.s, z1.s[3]\n"
+ "fmla z17.s, z6.s, z2.s[3]\n"
+ "fmla z21.s, z6.s, z3.s[3]\n"
+ "fmla z25.s, z6.s, z4.s[3]\n"
+ "fmla z29.s, z6.s, z5.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
- "fmla z10.s, z6.s, z0.s[3]\n"
- "fmla z14.s, z6.s, z1.s[3]\n"
- "fmla z18.s, z6.s, z2.s[3]\n"
- "fmla z22.s, z6.s, z3.s[3]\n"
- "fmla z26.s, z6.s, z4.s[3]\n"
- "fmla z30.s, z6.s, z5.s[3]\n"
- "fmla z11.s, z7.s, z0.s[3]\n"
- "fmla z15.s, z7.s, z1.s[3]\n"
- "fmla z19.s, z7.s, z2.s[3]\n"
- "fmla z23.s, z7.s, z3.s[3]\n"
- "fmla z27.s, z7.s, z4.s[3]\n"
- "fmla z31.s, z7.s, z5.s[3]\n"
+ "fmla z10.s, z7.s, z0.s[3]\n"
+ "fmla z14.s, z7.s, z1.s[3]\n"
+ "fmla z18.s, z7.s, z2.s[3]\n"
+ "fmla z22.s, z7.s, z3.s[3]\n"
+ "fmla z26.s, z7.s, z4.s[3]\n"
+ "fmla z30.s, z7.s, z5.s[3]\n"
+ "fmla z11.s, z6.s, z0.s[3]\n"
+ "fmla z15.s, z6.s, z1.s[3]\n"
+ "fmla z19.s, z6.s, z2.s[3]\n"
+ "fmla z23.s, z6.s, z3.s[3]\n"
+ "fmla z27.s, z6.s, z4.s[3]\n"
+ "fmla z31.s, z6.s, z5.s[3]\n"
"76:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 71b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x24, x28, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"tbz %x[flags], #1, 77f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z1.s }, p5/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z0.s }, p5/Z, [x19]\n"
- "fmin z8.s, p5/M, z8.s, z0.s\n"
- "fmin z9.s, p5/M, z9.s, z0.s\n"
- "fmin z10.s, p5/M, z10.s, z0.s\n"
- "fmin z11.s, p5/M, z11.s, z0.s\n"
- "fmin z12.s, p5/M, z12.s, z0.s\n"
- "fmax z8.s, p5/M, z8.s, z1.s\n"
- "fmax z9.s, p5/M, z9.s, z1.s\n"
- "fmax z10.s, p5/M, z10.s, z1.s\n"
- "fmax z11.s, p5/M, z11.s, z1.s\n"
- "fmax z12.s, p5/M, z12.s, z1.s\n"
- "fmin z13.s, p5/M, z13.s, z0.s\n"
- "fmin z14.s, p5/M, z14.s, z0.s\n"
- "fmin z15.s, p5/M, z15.s, z0.s\n"
- "fmin z16.s, p5/M, z16.s, z0.s\n"
- "fmax z13.s, p5/M, z13.s, z1.s\n"
- "fmax z14.s, p5/M, z14.s, z1.s\n"
- "fmax z15.s, p5/M, z15.s, z1.s\n"
- "fmax z16.s, p5/M, z16.s, z1.s\n"
- "fmin z17.s, p5/M, z17.s, z0.s\n"
- "fmin z18.s, p5/M, z18.s, z0.s\n"
- "fmin z19.s, p5/M, z19.s, z0.s\n"
- "fmin z20.s, p5/M, z20.s, z0.s\n"
- "fmax z17.s, p5/M, z17.s, z1.s\n"
- "fmax z18.s, p5/M, z18.s, z1.s\n"
- "fmax z19.s, p5/M, z19.s, z1.s\n"
- "fmax z20.s, p5/M, z20.s, z1.s\n"
- "fmin z21.s, p5/M, z21.s, z0.s\n"
- "fmin z22.s, p5/M, z22.s, z0.s\n"
- "fmin z23.s, p5/M, z23.s, z0.s\n"
- "fmin z24.s, p5/M, z24.s, z0.s\n"
- "fmax z21.s, p5/M, z21.s, z1.s\n"
- "fmax z22.s, p5/M, z22.s, z1.s\n"
- "fmax z23.s, p5/M, z23.s, z1.s\n"
- "fmax z24.s, p5/M, z24.s, z1.s\n"
- "fmin z25.s, p5/M, z25.s, z0.s\n"
- "fmin z26.s, p5/M, z26.s, z0.s\n"
- "fmin z27.s, p5/M, z27.s, z0.s\n"
- "fmin z28.s, p5/M, z28.s, z0.s\n"
- "fmax z25.s, p5/M, z25.s, z1.s\n"
- "fmax z26.s, p5/M, z26.s, z1.s\n"
- "fmax z27.s, p5/M, z27.s, z1.s\n"
- "fmax z28.s, p5/M, z28.s, z1.s\n"
- "fmin z29.s, p5/M, z29.s, z0.s\n"
- "fmin z30.s, p5/M, z30.s, z0.s\n"
- "fmin z31.s, p5/M, z31.s, z0.s\n"
- "fmax z29.s, p5/M, z29.s, z1.s\n"
- "fmax z30.s, p5/M, z30.s, z1.s\n"
- "fmax z31.s, p5/M, z31.s, z1.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z1.s\n"
+ "fmin z10.s, p5/M, z10.s, z1.s\n"
+ "fmin z11.s, p5/M, z11.s, z1.s\n"
+ "fmin z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z1.s\n"
+ "fmin z14.s, p5/M, z14.s, z1.s\n"
+ "fmin z15.s, p5/M, z15.s, z1.s\n"
+ "fmin z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z1.s\n"
+ "fmin z18.s, p5/M, z18.s, z1.s\n"
+ "fmin z19.s, p5/M, z19.s, z1.s\n"
+ "fmin z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z1.s\n"
+ "fmin z22.s, p5/M, z22.s, z1.s\n"
+ "fmin z23.s, p5/M, z23.s, z1.s\n"
+ "fmin z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z1.s\n"
+ "fmin z26.s, p5/M, z26.s, z1.s\n"
+ "fmin z27.s, p5/M, z27.s, z1.s\n"
+ "fmin z28.s, p5/M, z28.s, z1.s\n"
+ "fmin z29.s, p5/M, z29.s, z1.s\n"
+ "fmin z30.s, p5/M, z30.s, z1.s\n"
+ "fmin z31.s, p5/M, z31.s, z1.s\n"
+ "fmax z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z0.s\n"
+ "fmax z10.s, p5/M, z10.s, z0.s\n"
+ "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z0.s\n"
+ "fmax z14.s, p5/M, z14.s, z0.s\n"
+ "fmax z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z0.s\n"
+ "fmax z18.s, p5/M, z18.s, z0.s\n"
+ "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z0.s\n"
+ "fmax z22.s, p5/M, z22.s, z0.s\n"
+ "fmax z23.s, p5/M, z23.s, z0.s\n"
+ "fmax z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z0.s\n"
+ "fmax z26.s, p5/M, z26.s, z0.s\n"
+ "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z28.s, p5/M, z28.s, z0.s\n"
+ "fmax z29.s, p5/M, z29.s, z0.s\n"
+ "fmax z30.s, p5/M, z30.s, z0.s\n"
+ "fmax z31.s, p5/M, z31.s, z0.s\n"
"77:" // Height 6: No activation
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x24]\n"
- "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x23]\n"
- "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x22]\n"
- "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x21]\n"
- "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
- "st1w { z28.s }, p4, [x20]\n"
- "st1w { z29.s }, p3, [x20, #1, MUL VL]\n"
- "st1w { z30.s }, p2, [x20, #2, MUL VL]\n"
- "st1w { z31.s }, p1, [x20, #3, MUL VL]\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x25]\n"
+ "st1w { z13.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x23]\n"
+ "st1w { z21.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
"78:" // Height 6: Writeback done
"decw x11, ALL, MUL #4\n"
"cmp x11, XZR\n"
"bgt 67b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 80f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 79f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"79:" // Update direct input
- "mov x19, #0x18\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x18\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
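
Note on the activation epilogue above: the fmin/fmax runs guarded by bit 1 of %x[flags] clamp every accumulator between the minval and maxval values loaded from KernelArgs through %[offset_min] and %[offset_max]. A minimal scalar model of that per-element operation, assuming only what the asm operands show (this helper is illustrative, not part of the library):

static inline float clamp_activation(float x, float minval, float maxval)
{
    x = (x < maxval) ? x : maxval; // matches the fmin against the broadcast maxval
    x = (x > minval) ? x : minval; // matches the fmax against the broadcast minval
    return x;
}

With Activation::Type::ReLU this reduces to max(x, 0) since only minval is set; BoundedReLU sets maxval as well, giving the full two-sided clamp seen in the Height 5 and Height 6 epilogues.
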
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
index 2273d97d5f..a353c9d660 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,19 +10,19 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
@@ -38,11 +38,13 @@ namespace arm_gemm
{
// Actual kernel implementations
void sve_hybrid_fp32_mla_8x1VL( ARGLIST );
+void sve_hybrid_fp32_mla_8x1VL_a64fx( ARGLIST );
class cls_sve_hybrid_fp32_mla_8x1VL
{
public:
- typedef float operand_type;
+ typedef float lhs_operand_type;
+ typedef float rhs_operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,16 +70,23 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 8, 1, 1> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 8, 1, 1> transforms = {};
// Default to the generic kernel
kern_type kernel=sve_hybrid_fp32_mla_8x1VL;
- cls_sve_hybrid_fp32_mla_8x1VL(const CPUInfo *)
+ cls_sve_hybrid_fp32_mla_8x1VL(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_hybrid_fp32_mla_8x1VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
#undef ARGLIST
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
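
The constructor change above makes the strategy class select the A64FX-tuned kernel at runtime via get_cpu_model(). A hedged sketch of how a caller might exercise that selection — the variable names are assumptions; the argument list mirrors the ARGLIST signature of sve_hybrid_fp32_mla_8x1VL shown in the new a64fx.cpp below:

// Illustrative only: construct the strategy for the detected CPU, then
// invoke whichever kernel pointer the constructor selected.
cls_sve_hybrid_fp32_mla_8x1VL strat(ci);          // ci: const CPUInfo *
strat.kernel(num_strings, string_lengths, A_arg,  // kern_type follows ARGLIST
             M, N, B_ptr, output_arg, bias, act, accumulate);
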
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
new file mode 100644
index 0000000000..344341205b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
@@ -0,0 +1,1142 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_8x1VL_a64fx (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x8\n"
+ "bge 85f\n"
+ "cmp %x[M], #0x6\n"
+ "bgt 73f\n"
+ "beq 61f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 49f\n"
+ "beq 37f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 25f\n"
+ "beq 13f\n"
+ "mov x14, %x[bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p0.s, x20, x13\n"
+ "cbz x14, 3f\n"
+ "ld1w { z24.s }, p1/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z24.s }, p0/Z, [x11]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z24.b, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x10, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "cbnz x10, 8f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x28, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "subs x9, x9, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x28, x28, #0x4\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "addvl x12, x12, #1\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "addvl x12, x12, #1\n"
+ "bne 6b\n"
+ "tbz %x[flags], #1, 11f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "11:" // Height 1: No activation
+ "st1w { z24.s }, p0, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "12:" // Height 1: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 2b\n"
+ "b 98f\n"
+ "13:" // Height 2
+ "mov x14, %x[bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "14:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p0.s, x20, x13\n"
+ "cbz x14, 15f\n"
+ "ld1w { z24.s }, p1/Z, [x14]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x14, x14, #1\n"
+ "b 17f\n"
+ "15:" // Height 2: no bias
+ "tbz %x[flags], #0, 16f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x11, x20, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x11]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "b 17f\n"
+ "16:" // Height 2: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x10, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "cbnz x10, 20f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "20:" // Height 2: input setup done
+ "subs x9, x9, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "ble 22f\n"
+ "21:" // Height 2: Multiply loop: Main loop
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x28, x28, #0x4\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "add x27, x27, #0x4\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "bgt 21b\n"
+ "22:" // Height 2: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "bne 18b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "tbz %x[flags], #1, 23f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "23:" // Height 2: No activation
+ "st1w { z24.s }, p0, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p0, [x27]\n"
+ "24:" // Height 2: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 14b\n"
+ "b 98f\n"
+ "25:" // Height 3
+ "mov x14, %x[bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "26:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p0.s, x20, x13\n"
+ "cbz x14, 27f\n"
+ "ld1w { z24.s }, p1/Z, [x14]\n"
+ "mov z25.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
+ "b 29f\n"
+ "27:" // Height 3: no bias
+ "tbz %x[flags], #0, 28f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x11, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x11]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "b 29f\n"
+ "28:" // Height 3: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "29:" // Height 3: setup done
+ "mov x10, #0x0\n"
+ "30:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x10, 32f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "b 32f\n"
+ "31:" // Height 3: setup direct input
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "32:" // Height 3: input setup done
+ "subs x9, x9, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "ld1rw { z2.s }, p1/Z, [x26]\n"
+ "ble 34f\n"
+ "33:" // Height 3: Multiply loop: Main loop
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x28, x28, #0x4\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "add x27, x27, #0x4\n"
+ "add x26, x26, #0x4\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "addvl x12, x12, #1\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "ld1rw { z2.s }, p1/Z, [x26]\n"
+ "bgt 33b\n"
+ "34:" // Height 3: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "bne 30b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "tbz %x[flags], #1, 35f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmin z26.s, p1/M, z26.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "fmax z26.s, p1/M, z26.s, z16.s\n"
+ "35:" // Height 3: No activation
+ "st1w { z24.s }, p0, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p0, [x27]\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "36:" // Height 3: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 26b\n"
+ "b 98f\n"
+ "37:" // Height 4
+ "mov x14, %x[bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "38:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p0.s, x20, x13\n"
+ "cbz x14, 39f\n"
+ "ld1w { z24.s }, p1/Z, [x14]\n"
+ "mov z25.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
+ "mov z27.d, z24.d\n"
+ "b 41f\n"
+ "39:" // Height 4: no bias
+ "tbz %x[flags], #0, 40f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x11, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x11]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "b 41f\n"
+ "40:" // Height 4: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "41:" // Height 4: setup done
+ "mov x10, #0x0\n"
+ "42:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 43f\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "cbnz x10, 44f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "b 44f\n"
+ "43:" // Height 4: setup direct input
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "44:" // Height 4: input setup done
+ "subs x9, x9, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "ld1rw { z2.s }, p1/Z, [x26]\n"
+ "ld1rw { z3.s }, p1/Z, [x25]\n"
+ "ble 46f\n"
+ "45:" // Height 4: Multiply loop: Main loop
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x28, x28, #0x4\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "add x27, x27, #0x4\n"
+ "add x26, x26, #0x4\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "add x25, x25, #0x4\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "addvl x12, x12, #1\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "ld1rw { z2.s }, p1/Z, [x26]\n"
+ "ld1rw { z3.s }, p1/Z, [x25]\n"
+ "bgt 45b\n"
+ "46:" // Height 4: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "bne 42b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "tbz %x[flags], #1, 47f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmin z26.s, p1/M, z26.s, z17.s\n"
+ "fmin z27.s, p1/M, z27.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "fmax z26.s, p1/M, z26.s, z16.s\n"
+ "fmax z27.s, p1/M, z27.s, z16.s\n"
+ "47:" // Height 4: No activation
+ "st1w { z24.s }, p0, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p0, [x27]\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z27.s }, p0, [x25]\n"
+ "48:" // Height 4: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 38b\n"
+ "b 98f\n"
+ "49:" // Height 5
+ "mov x14, %x[bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "50:" // Height 5: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p0.s, x20, x13\n"
+ "cbz x14, 51f\n"
+ "ld1w { z24.s }, p1/Z, [x14]\n"
+ "mov z25.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "b 53f\n"
+ "51:" // Height 5: no bias
+ "tbz %x[flags], #0, 52f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x11, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x11]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x23]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "b 53f\n"
+ "52:" // Height 5: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "53:" // Height 5: setup done
+ "mov x10, #0x0\n"
+ "54:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 55f\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "cbnz x10, 56f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "b 56f\n"
+ "55:" // Height 5: setup direct input
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "56:" // Height 5: input setup done
+ "subs x9, x9, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "ld1rw { z2.s }, p1/Z, [x26]\n"
+ "ld1rw { z3.s }, p1/Z, [x25]\n"
+ "ld1rw { z4.s }, p1/Z, [x24]\n"
+ "ble 58f\n"
+ "57:" // Height 5: Multiply loop: Main loop
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x28, x28, #0x4\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "add x27, x27, #0x4\n"
+ "add x26, x26, #0x4\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "add x25, x25, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "ld1rw { z2.s }, p1/Z, [x26]\n"
+ "ld1rw { z3.s }, p1/Z, [x25]\n"
+ "ld1rw { z4.s }, p1/Z, [x24]\n"
+ "bgt 57b\n"
+ "58:" // Height 5: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
+ "bne 54b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "tbz %x[flags], #1, 59f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmin z26.s, p1/M, z26.s, z17.s\n"
+ "fmin z27.s, p1/M, z27.s, z17.s\n"
+ "fmin z28.s, p1/M, z28.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "fmax z26.s, p1/M, z26.s, z16.s\n"
+ "fmax z27.s, p1/M, z27.s, z16.s\n"
+ "fmax z28.s, p1/M, z28.s, z16.s\n"
+ "59:" // Height 5: No activation
+ "st1w { z24.s }, p0, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p0, [x27]\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z27.s }, p0, [x25]\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "60:" // Height 5: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 50b\n"
+ "b 98f\n"
+ "61:" // Height 6
+ "mov x14, %x[bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "62:" // Height 6: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p0.s, x20, x13\n"
+ "cbz x14, 63f\n"
+ "ld1w { z24.s }, p1/Z, [x14]\n"
+ "mov z25.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z29.d, z24.d\n"
+ "b 65f\n"
+ "63:" // Height 6: no bias
+ "tbz %x[flags], #0, 64f\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x11, x24, LSL #2\n"
+ "add x20, x23, x24, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x11]\n"
+ "add x22, x20, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x23]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x22]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "b 65f\n"
+ "64:" // Height 6: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "65:" // Height 6: setup done
+ "mov x10, #0x0\n"
+ "66:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "cbnz x10, 68f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "b 68f\n"
+ "67:" // Height 6: setup direct input
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "68:" // Height 6: input setup done
+ "subs x9, x9, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "ld1rw { z2.s }, p1/Z, [x26]\n"
+ "ld1rw { z3.s }, p1/Z, [x25]\n"
+ "ld1rw { z4.s }, p1/Z, [x24]\n"
+ "ld1rw { z5.s }, p1/Z, [x23]\n"
+ "ble 70f\n"
+ "69:" // Height 6: Multiply loop: Main loop
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x28, x28, #0x4\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "add x27, x27, #0x4\n"
+ "add x26, x26, #0x4\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "add x25, x25, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
+ "add x23, x23, #0x4\n"
+ "addvl x12, x12, #1\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "ld1rw { z2.s }, p1/Z, [x26]\n"
+ "ld1rw { z3.s }, p1/Z, [x25]\n"
+ "ld1rw { z4.s }, p1/Z, [x24]\n"
+ "ld1rw { z5.s }, p1/Z, [x23]\n"
+ "bgt 69b\n"
+ "70:" // Height 6: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
+ "bne 66b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "tbz %x[flags], #1, 71f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmin z26.s, p1/M, z26.s, z17.s\n"
+ "fmin z27.s, p1/M, z27.s, z17.s\n"
+ "fmin z28.s, p1/M, z28.s, z17.s\n"
+ "fmin z29.s, p1/M, z29.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "fmax z26.s, p1/M, z26.s, z16.s\n"
+ "fmax z27.s, p1/M, z27.s, z16.s\n"
+ "fmax z28.s, p1/M, z28.s, z16.s\n"
+ "fmax z29.s, p1/M, z29.s, z16.s\n"
+ "71:" // Height 6: No activation
+ "st1w { z24.s }, p0, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p0, [x27]\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z27.s }, p0, [x25]\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "st1w { z29.s }, p0, [x23]\n"
+ "72:" // Height 6: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 62b\n"
+ "b 98f\n"
+ "73:" // Height 7
+ "mov x14, %x[bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "74:" // Height 7: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p0.s, x20, x13\n"
+ "cbz x14, 75f\n"
+ "ld1w { z24.s }, p1/Z, [x14]\n"
+ "mov z25.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z29.d, z24.d\n"
+ "mov z30.d, z24.d\n"
+ "b 77f\n"
+ "75:" // Height 7: no bias
+ "tbz %x[flags], #0, 76f\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x11, x24, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x11]\n"
+ "add x23, x20, x24, LSL #2\n"
+ "add x22, x23, x24, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x21, x22, x24, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x23]\n"
+ "ld1w { z28.s }, p0/Z, [x22]\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "b 77f\n"
+ "76:" // Height 7: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "77:" // Height 7: setup done
+ "mov x10, #0x0\n"
+ "78:" // Height 7: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 79f\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "cbnz x10, 80f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "b 80f\n"
+ "79:" // Height 7: setup direct input
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "80:" // Height 7: input setup done
+ "subs x9, x9, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "ld1rw { z2.s }, p1/Z, [x26]\n"
+ "ld1rw { z3.s }, p1/Z, [x25]\n"
+ "ld1rw { z4.s }, p1/Z, [x24]\n"
+ "ld1rw { z5.s }, p1/Z, [x23]\n"
+ "ld1rw { z6.s }, p1/Z, [x22]\n"
+ "ble 82f\n"
+ "81:" // Height 7: Multiply loop: Main loop
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x28, x28, #0x4\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "add x27, x27, #0x4\n"
+ "add x26, x26, #0x4\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "add x25, x25, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "add x23, x23, #0x4\n"
+ "add x22, x22, #0x4\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
+ "addvl x12, x12, #1\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "fmla z30.s, p1/M, z16.s, z6.s\n"
+ "ld1rw { z2.s }, p1/Z, [x26]\n"
+ "ld1rw { z3.s }, p1/Z, [x25]\n"
+ "ld1rw { z4.s }, p1/Z, [x24]\n"
+ "ld1rw { z5.s }, p1/Z, [x23]\n"
+ "ld1rw { z6.s }, p1/Z, [x22]\n"
+ "bgt 81b\n"
+ "82:" // Height 7: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
+ "fmla z30.s, p1/M, z16.s, z6.s\n"
+ "bne 78b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "tbz %x[flags], #1, 83f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmin z26.s, p1/M, z26.s, z17.s\n"
+ "fmin z27.s, p1/M, z27.s, z17.s\n"
+ "fmin z28.s, p1/M, z28.s, z17.s\n"
+ "fmin z29.s, p1/M, z29.s, z17.s\n"
+ "fmin z30.s, p1/M, z30.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "fmax z26.s, p1/M, z26.s, z16.s\n"
+ "fmax z27.s, p1/M, z27.s, z16.s\n"
+ "fmax z28.s, p1/M, z28.s, z16.s\n"
+ "fmax z29.s, p1/M, z29.s, z16.s\n"
+ "fmax z30.s, p1/M, z30.s, z16.s\n"
+ "83:" // Height 7: No activation
+ "st1w { z24.s }, p0, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p0, [x27]\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z27.s }, p0, [x25]\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "st1w { z29.s }, p0, [x23]\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "84:" // Height 7: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 74b\n"
+ "b 98f\n"
+ "85:" // Height 8
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x20\n"
+ "mov x14, %x[bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "86:" // Height 8: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p0.s, x20, x13\n"
+ "cbz x14, 87f\n"
+ "ld1w { z24.s }, p1/Z, [x14]\n"
+ "mov z25.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z29.d, z24.d\n"
+ "mov z30.d, z24.d\n"
+ "mov z31.d, z24.d\n"
+ "b 89f\n"
+ "87:" // Height 8: no bias
+ "tbz %x[flags], #0, 88f\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x11, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x11]\n"
+ "add x23, x21, x24, LSL #2\n"
+ "add x20, x23, x24, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x22, x20, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x23]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "b 89f\n"
+ "88:" // Height 8: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "89:" // Height 8: setup done
+ "mov x10, #0x0\n"
+ "90:" // Height 8: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 91f\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x38]\n"
+ "cbnz x10, 92f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
+ "b 92f\n"
+ "91:" // Height 8: setup direct input
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
+ "92:" // Height 8: input setup done
+ "subs x9, x9, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "ld1rw { z2.s }, p1/Z, [x26]\n"
+ "ld1rw { z3.s }, p1/Z, [x25]\n"
+ "ld1rw { z4.s }, p1/Z, [x24]\n"
+ "ld1rw { z5.s }, p1/Z, [x23]\n"
+ "ld1rw { z6.s }, p1/Z, [x22]\n"
+ "ld1rw { z7.s }, p1/Z, [x21]\n"
+ "ble 94f\n"
+ "93:" // Height 8: Multiply loop: Main loop
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x28, x28, #0x4\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "add x27, x27, #0x4\n"
+ "add x26, x26, #0x4\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "add x25, x25, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
+ "add x23, x23, #0x4\n"
+ "add x22, x22, #0x4\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
+ "ld1rw { z0.s }, p1/Z, [x28]\n"
+ "add x21, x21, #0x4\n"
+ "addvl x12, x12, #1\n"
+ "ld1rw { z1.s }, p1/Z, [x27]\n"
+ "fmla z30.s, p1/M, z16.s, z6.s\n"
+ "fmla z31.s, p1/M, z16.s, z7.s\n"
+ "ld1rw { z2.s }, p1/Z, [x26]\n"
+ "ld1rw { z3.s }, p1/Z, [x25]\n"
+ "ld1rw { z4.s }, p1/Z, [x24]\n"
+ "ld1rw { z5.s }, p1/Z, [x23]\n"
+ "ld1rw { z6.s }, p1/Z, [x22]\n"
+ "ld1rw { z7.s }, p1/Z, [x21]\n"
+ "bgt 93b\n"
+ "94:" // Height 8: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z16.s }, p1/Z, [x12]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
+ "fmla z24.s, p1/M, z16.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z1.s\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, p1/M, z16.s, z2.s\n"
+ "fmla z27.s, p1/M, z16.s, z3.s\n"
+ "fmla z28.s, p1/M, z16.s, z4.s\n"
+ "fmla z29.s, p1/M, z16.s, z5.s\n"
+ "fmla z30.s, p1/M, z16.s, z6.s\n"
+ "fmla z31.s, p1/M, z16.s, z7.s\n"
+ "bne 90b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "tbz %x[flags], #1, 95f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x20]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmin z26.s, p1/M, z26.s, z17.s\n"
+ "fmin z27.s, p1/M, z27.s, z17.s\n"
+ "fmin z28.s, p1/M, z28.s, z17.s\n"
+ "fmin z29.s, p1/M, z29.s, z17.s\n"
+ "fmin z30.s, p1/M, z30.s, z17.s\n"
+ "fmin z31.s, p1/M, z31.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "fmax z26.s, p1/M, z26.s, z16.s\n"
+ "fmax z27.s, p1/M, z27.s, z16.s\n"
+ "fmax z28.s, p1/M, z28.s, z16.s\n"
+ "fmax z29.s, p1/M, z29.s, z16.s\n"
+ "fmax z30.s, p1/M, z30.s, z16.s\n"
+ "fmax z31.s, p1/M, z31.s, z16.s\n"
+ "95:" // Height 8: No activation
+ "st1w { z24.s }, p0, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p0, [x27]\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z27.s }, p0, [x25]\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "st1w { z29.s }, p0, [x23]\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z31.s }, p0, [x21]\n"
+ "96:" // Height 8: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 86b\n"
+ "subs %x[M], %x[M], #0x8\n"
+ "beq 98f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 97f\n"
+ "add x21, x21, #0x8\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "97:" // Update direct input
+ "mov x20, #0x20\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "98:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
index 863325f7f5..161c85e5f3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,18 +10,18 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -105,400 +105,382 @@ void sve_hybrid_fp32_mla_8x1VL (
"cmp %x[M], #0x2\n"
"bgt 27f\n"
"beq 14f\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x11, %x[bias]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "mov x19, #0x0\n"
- "whilelt p1.s, x19, x13\n"
- "cbz x11, 3f\n"
- "ld1w { z24.s }, p2/Z, [x11]\n"
- "addvl x11, x11, #1\n"
+ "mov x20, #0x0\n"
+ "whilelt p1.s, x20, x13\n"
+ "cbz x14, 3f\n"
+ "ld1w { z24.s }, p2/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
"b 5f\n"
"3:" // Height 1: no bias
"tbz %x[flags], #0, 4f\n"
- "ld1w { z24.s }, p1/Z, [x10]\n"
+ "ld1w { z24.s }, p1/Z, [x11]\n"
"b 5f\n"
"4:" // Height 1: no accumulate
"mov z24.b, #0x0\n"
"5:" // Height 1: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 7f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "cbnz x9, 8f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "cbnz x10, 8f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
"b 8f\n"
"7:" // Height 1: setup direct input
- "mov x27, %x[input_ptr]\n"
+ "mov x28, %x[input_ptr]\n"
"8:" // Height 1: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"ble 10f\n"
"9:" // Height 1: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x28, x28, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x27, x27, #0x10\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z0.s }, p0/Z, [x28]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "sub x9, x9, #0x4\n"
+ "cmp x9, #0x4\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "add x28, x28, #0x10\n"
"addvl x12, x12, #4\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
"bgt 9b\n"
"10:" // Height 1: Multiply loop: Single iteration only
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "subs x28, x28, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "add x27, x27, #0x10\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z0.s }, p0/Z, [x28]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
"addvl x12, x12, #1\n"
"ble 11f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "subs x28, x28, #0x1\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
"addvl x12, x12, #1\n"
"ble 11f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "subs x28, x28, #0x1\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
"addvl x12, x12, #1\n"
"ble 11f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
"addvl x12, x12, #1\n"
"11:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "fmin z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z24.s, p2/M, z24.s, z16.s\n"
"12:" // Height 1: No activation
- "st1w { z24.s }, p1, [x10]\n"
- "addvl x10, x10, #1\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
"13:" // Height 1: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 2b\n"
"b 106f\n"
"14:" // Height 2
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"15:" // Height 2: Column loop
- "mov x19, #0x0\n"
- "whilelt p1.s, x19, x13\n"
- "cbz x11, 16f\n"
- "ld1w { z24.s }, p2/Z, [x11]\n"
+ "mov x20, #0x0\n"
+ "whilelt p1.s, x20, x13\n"
+ "cbz x14, 16f\n"
+ "ld1w { z24.s }, p2/Z, [x14]\n"
"mov z25.d, z24.d\n"
- "addvl x11, x11, #1\n"
+ "addvl x14, x14, #1\n"
"b 18f\n"
"16:" // Height 2: no bias
"tbz %x[flags], #0, 17f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z24.s }, p1/Z, [x10]\n"
- "add x26, x10, x19, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x26]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x11, x20, LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x11]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"b 18f\n"
"17:" // Height 2: no accumulate
"mov z24.b, #0x0\n"
"mov z25.b, #0x0\n"
"18:" // Height 2: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "cbnz x9, 21f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "cbnz x10, 21f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
"b 21f\n"
"20:" // Height 2: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
"21:" // Height 2: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"ble 23f\n"
"22:" // Height 2: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x28, x28, #0x4\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
"ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "sub x9, x9, #0x4\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z1.s[0]\n"
+ "fmla z25.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z24.s, z16.s, z1.s[1]\n"
+ "fmla z25.s, z16.s, z0.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "fmla z24.s, z17.s, z1.s[2]\n"
+ "fmla z25.s, z17.s, z0.s[2]\n"
+ "cmp x9, #0x4\n"
+ "add x28, x28, #0x10\n"
+ "fmla z24.s, z16.s, z1.s[3]\n"
+ "fmla z25.s, z16.s, z0.s[3]\n"
"add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x26, x26, #0x10\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
"bgt 22b\n"
"23:" // Height 2: Multiply loop: Single iteration only
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "subs x28, x28, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "add x26, x26, #0x10\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z0.s }, p0/Z, [x28]\n"
+ "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
"addvl x12, x12, #1\n"
"ble 24f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
"addvl x12, x12, #1\n"
"ble 24f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
"addvl x12, x12, #1\n"
"ble 24f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 19b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x10, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
"tbz %x[flags], #1, 25f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "fmin z24.s, p2/M, z24.s, z17.s\n"
+ "fmin z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z24.s, p2/M, z24.s, z16.s\n"
+ "fmax z25.s, p2/M, z25.s, z16.s\n"
"25:" // Height 2: No activation
- "st1w { z24.s }, p1, [x10]\n"
- "addvl x10, x10, #1\n"
- "st1w { z25.s }, p1, [x26]\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p1, [x27]\n"
"26:" // Height 2: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 15b\n"
"b 106f\n"
"27:" // Height 3
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"28:" // Height 3: Column loop
- "mov x19, #0x0\n"
- "whilelt p1.s, x19, x13\n"
- "cbz x11, 29f\n"
- "ld1w { z24.s }, p2/Z, [x11]\n"
+ "mov x20, #0x0\n"
+ "whilelt p1.s, x20, x13\n"
+ "cbz x14, 29f\n"
+ "ld1w { z24.s }, p2/Z, [x14]\n"
"mov z25.d, z24.d\n"
- "addvl x11, x11, #1\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"b 31f\n"
"29:" // Height 3: no bias
"tbz %x[flags], #0, 30f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z24.s }, p1/Z, [x10]\n"
- "add x26, x10, x19, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x26]\n"
- "add x25, x26, x19, LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x25]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x11, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x11]\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"b 31f\n"
"30:" // Height 3: no accumulate
"mov z24.b, #0x0\n"
"mov z25.b, #0x0\n"
"mov z26.b, #0x0\n"
"31:" // Height 3: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "ldr x25, [x20, #0x10]\n"
- "cbnz x9, 34f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x10, 34f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
"b 34f\n"
"33:" // Height 3: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
"34:" // Height 3: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"ble 36f\n"
"35:" // Height 3: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x28, x28, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z2.s }, p0/Z, [x28]\n"
+ "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "sub x9, x9, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x26]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z2.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
+ "fmla z26.s, z16.s, z0.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z24.s, z16.s, z2.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z0.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "cmp x9, #0x4\n"
+ "fmla z24.s, z17.s, z2.s[2]\n"
+ "fmla z25.s, z17.s, z1.s[2]\n"
+ "add x28, x28, #0x10\n"
"add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x25]\n"
+ "fmla z26.s, z17.s, z0.s[2]\n"
+ "fmla z24.s, z16.s, z2.s[3]\n"
"add x26, x26, #0x10\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
+ "fmla z26.s, z16.s, z0.s[3]\n"
"bgt 35b\n"
"36:" // Height 3: Multiply loop: Single iteration only
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "subs x28, x28, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "add x25, x25, #0x10\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z0.s }, p0/Z, [x28]\n"
+ "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
"addvl x12, x12, #1\n"
"ble 37f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[1]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
"ble 37f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
"ble 37f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
"37:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 32b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x10, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
"tbz %x[flags], #1, 38f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "fmin z24.s, p2/M, z24.s, z17.s\n"
+ "fmin z25.s, p2/M, z25.s, z17.s\n"
+ "fmin z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z24.s, p2/M, z24.s, z16.s\n"
+ "fmax z25.s, p2/M, z25.s, z16.s\n"
+ "fmax z26.s, p2/M, z26.s, z16.s\n"
"38:" // Height 3: No activation
- "st1w { z24.s }, p1, [x10]\n"
- "addvl x10, x10, #1\n"
- "st1w { z25.s }, p1, [x26]\n"
- "st1w { z26.s }, p1, [x25]\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p1, [x27]\n"
+ "st1w { z26.s }, p1, [x26]\n"
"39:" // Height 3: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 28b\n"
"b 106f\n"
"40:" // Height 4
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"41:" // Height 4: Column loop
- "mov x19, #0x0\n"
- "whilelt p1.s, x19, x13\n"
- "cbz x11, 42f\n"
- "ld1w { z24.s }, p2/Z, [x11]\n"
+ "mov x20, #0x0\n"
+ "whilelt p1.s, x20, x13\n"
+ "cbz x14, 42f\n"
+ "ld1w { z24.s }, p2/Z, [x14]\n"
"mov z25.d, z24.d\n"
- "addvl x11, x11, #1\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"b 44f\n"
"42:" // Height 4: no bias
"tbz %x[flags], #0, 43f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z24.s }, p1/Z, [x10]\n"
- "add x26, x10, x19, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x26]\n"
- "add x25, x26, x19, LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x25]\n"
- "add x24, x25, x19, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x24]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x11, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x11]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"b 44f\n"
"43:" // Height 4: no accumulate
"mov z24.b, #0x0\n"
@@ -506,176 +488,164 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
"44:" // Height 4: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 46f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "ldr x25, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x9, 47f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "cbnz x10, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
"b 47f\n"
"46:" // Height 4: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
"47:" // Height 4: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"ble 49f\n"
"48:" // Height 4: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x28, x28, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "ld1rqw { z2.s }, p0/Z, [x27]\n"
+ "sub x9, x9, #0x4\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ "cmp x9, #0x4\n"
+ "add x28, x28, #0x10\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z3.s[0]\n"
+ "fmla z25.s, z16.s, z2.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z26.s, z16.s, z1.s[0]\n"
+ "fmla z27.s, z16.s, z0.s[0]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "fmla z24.s, z18.s, z3.s[1]\n"
+ "fmla z25.s, z18.s, z2.s[1]\n"
"add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x25]\n"
"add x26, x26, #0x10\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "fmla z26.s, z18.s, z1.s[1]\n"
+ "fmla z27.s, z18.s, z0.s[1]\n"
"add x25, x25, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x24, x24, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z24.s, z17.s, z3.s[2]\n"
+ "fmla z25.s, z17.s, z2.s[2]\n"
+ "fmla z26.s, z17.s, z1.s[2]\n"
+ "fmla z27.s, z17.s, z0.s[2]\n"
+ "fmla z24.s, z16.s, z3.s[3]\n"
+ "fmla z25.s, z16.s, z2.s[3]\n"
+ "fmla z26.s, z16.s, z1.s[3]\n"
+ "fmla z27.s, z16.s, z0.s[3]\n"
"bgt 48b\n"
"49:" // Height 4: Multiply loop: Single iteration only
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "subs x28, x28, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "add x24, x24, #0x10\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z0.s }, p0/Z, [x28]\n"
+ "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "ld1rqw { z3.s }, p0/Z, [x25]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
"addvl x12, x12, #1\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z3.s[0]\n"
"ble 50f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[1]\n"
+ "fmla z27.s, z16.s, z3.s[1]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
"ble 50f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
+ "fmla z27.s, z16.s, z3.s[2]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
"ble 50f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
+ "fmla z27.s, z16.s, z3.s[3]\n"
"50:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 45b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x10, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
"tbz %x[flags], #1, 51f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "fmin z24.s, p2/M, z24.s, z17.s\n"
+ "fmin z25.s, p2/M, z25.s, z17.s\n"
+ "fmin z26.s, p2/M, z26.s, z17.s\n"
+ "fmin z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z24.s, p2/M, z24.s, z16.s\n"
+ "fmax z25.s, p2/M, z25.s, z16.s\n"
+ "fmax z26.s, p2/M, z26.s, z16.s\n"
+ "fmax z27.s, p2/M, z27.s, z16.s\n"
"51:" // Height 4: No activation
- "st1w { z24.s }, p1, [x10]\n"
- "addvl x10, x10, #1\n"
- "st1w { z25.s }, p1, [x26]\n"
- "st1w { z26.s }, p1, [x25]\n"
- "st1w { z27.s }, p1, [x24]\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p1, [x27]\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z27.s }, p1, [x25]\n"
"52:" // Height 4: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 41b\n"
"b 106f\n"
"53:" // Height 5
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"54:" // Height 5: Column loop
- "mov x19, #0x0\n"
- "whilelt p1.s, x19, x13\n"
- "cbz x11, 55f\n"
- "ld1w { z24.s }, p2/Z, [x11]\n"
+ "mov x20, #0x0\n"
+ "whilelt p1.s, x20, x13\n"
+ "cbz x14, 55f\n"
+ "ld1w { z24.s }, p2/Z, [x14]\n"
"mov z25.d, z24.d\n"
- "addvl x11, x11, #1\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"b 57f\n"
"55:" // Height 5: no bias
"tbz %x[flags], #0, 56f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z24.s }, p1/Z, [x10]\n"
- "add x26, x10, x19, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x26]\n"
- "add x25, x26, x19, LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x25]\n"
- "add x24, x25, x19, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x24]\n"
- "add x23, x24, x19, LSL #2\n"
- "ld1w { z28.s }, p1/Z, [x23]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x11, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x11]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x23]\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"b 57f\n"
"56:" // Height 5: no accumulate
"mov z24.b, #0x0\n"
@@ -684,200 +654,185 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov z27.b, #0x0\n"
"mov z28.b, #0x0\n"
"57:" // Height 5: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 59f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "ldr x25, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x23, [x20, #0x20]\n"
- "cbnz x9, 60f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "cbnz x10, 60f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
"b 60f\n"
"59:" // Height 5: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
"60:" // Height 5: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"ble 62f\n"
"61:" // Height 5: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x28, x28, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z4.s }, p0/Z, [x28]\n"
+ "ld1rqw { z3.s }, p0/Z, [x27]\n"
+ "sub x9, x9, #0x4\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "ld1rqw { z1.s }, p0/Z, [x25]\n"
+ "cmp x9, #0x4\n"
+ "add x28, x28, #0x10\n"
+ "ld1rqw { z0.s }, p0/Z, [x24]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z4.s[0]\n"
+ "fmla z25.s, z16.s, z3.s[0]\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z1.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z4.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
"add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x25]\n"
+ "fmla z25.s, z18.s, z3.s[1]\n"
+ "fmla z26.s, z18.s, z2.s[1]\n"
"add x26, x26, #0x10\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
"add x25, x25, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ "fmla z27.s, z18.s, z1.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[1]\n"
"add x24, x24, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x23, x23, #0x10\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
+ "fmla z24.s, z17.s, z4.s[2]\n"
+ "fmla z25.s, z17.s, z3.s[2]\n"
+ "fmla z26.s, z17.s, z2.s[2]\n"
+ "fmla z27.s, z17.s, z1.s[2]\n"
+ "fmla z28.s, z17.s, z0.s[2]\n"
+ "fmla z24.s, z16.s, z4.s[3]\n"
+ "fmla z25.s, z16.s, z3.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
+ "fmla z27.s, z16.s, z1.s[3]\n"
+ "fmla z28.s, z16.s, z0.s[3]\n"
"bgt 61b\n"
"62:" // Height 5: Multiply loop: Single iteration only
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "subs x28, x28, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "add x23, x23, #0x10\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z0.s }, p0/Z, [x28]\n"
+ "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "ld1rqw { z3.s }, p0/Z, [x25]\n"
+ "ld1rqw { z4.s }, p0/Z, [x24]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z3.s[0]\n"
"addvl x12, x12, #1\n"
+ "fmla z28.s, z16.s, z4.s[0]\n"
"ble 63f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[1]\n"
+ "fmla z27.s, z16.s, z3.s[1]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
+ "fmla z28.s, z16.s, z4.s[1]\n"
"ble 63f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
+ "fmla z27.s, z16.s, z3.s[2]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
+ "fmla z28.s, z16.s, z4.s[2]\n"
"ble 63f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
+ "fmla z27.s, z16.s, z3.s[3]\n"
+ "fmla z28.s, z16.s, z4.s[3]\n"
"63:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 58b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x10, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
"tbz %x[flags], #1, 64f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmin z28.s, p2/M, z28.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "fmin z24.s, p2/M, z24.s, z17.s\n"
+ "fmin z25.s, p2/M, z25.s, z17.s\n"
+ "fmin z26.s, p2/M, z26.s, z17.s\n"
+ "fmin z27.s, p2/M, z27.s, z17.s\n"
+ "fmin z28.s, p2/M, z28.s, z17.s\n"
+ "fmax z24.s, p2/M, z24.s, z16.s\n"
+ "fmax z25.s, p2/M, z25.s, z16.s\n"
+ "fmax z26.s, p2/M, z26.s, z16.s\n"
+ "fmax z27.s, p2/M, z27.s, z16.s\n"
+ "fmax z28.s, p2/M, z28.s, z16.s\n"
"64:" // Height 5: No activation
- "st1w { z24.s }, p1, [x10]\n"
- "addvl x10, x10, #1\n"
- "st1w { z25.s }, p1, [x26]\n"
- "st1w { z26.s }, p1, [x25]\n"
- "st1w { z27.s }, p1, [x24]\n"
- "st1w { z28.s }, p1, [x23]\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p1, [x27]\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z27.s }, p1, [x25]\n"
+ "st1w { z28.s }, p1, [x24]\n"
"65:" // Height 5: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 54b\n"
"b 106f\n"
"66:" // Height 6
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"67:" // Height 6: Column loop
- "mov x19, #0x0\n"
- "whilelt p1.s, x19, x13\n"
- "cbz x11, 68f\n"
- "ld1w { z24.s }, p2/Z, [x11]\n"
+ "mov x20, #0x0\n"
+ "whilelt p1.s, x20, x13\n"
+ "cbz x14, 68f\n"
+ "ld1w { z24.s }, p2/Z, [x14]\n"
"mov z25.d, z24.d\n"
- "addvl x11, x11, #1\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"mov z29.d, z24.d\n"
"b 70f\n"
"68:" // Height 6: no bias
"tbz %x[flags], #0, 69f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z24.s }, p1/Z, [x10]\n"
- "add x26, x10, x19, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x26]\n"
- "add x25, x26, x19, LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x25]\n"
- "add x24, x25, x19, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x24]\n"
- "add x23, x24, x19, LSL #2\n"
- "ld1w { z28.s }, p1/Z, [x23]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x22]\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x11, x24, LSL #2\n"
+ "add x20, x23, x24, LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x11]\n"
+ "add x22, x20, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x23]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x22]\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"b 70f\n"
"69:" // Height 6: no accumulate
"mov z24.b, #0x0\n"
@@ -887,203 +842,185 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov z28.b, #0x0\n"
"mov z29.b, #0x0\n"
"70:" // Height 6: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 72f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "ldr x25, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x23, [x20, #0x20]\n"
- "ldr x22, [x20, #0x28]\n"
- "cbnz x9, 73f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "cbnz x10, 73f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
"b 73f\n"
"72:" // Height 6: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
"73:" // Height 6: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"ble 75f\n"
"74:" // Height 6: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x28, x28, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z5.s }, p0/Z, [x28]\n"
+ "ld1rqw { z4.s }, p0/Z, [x27]\n"
+ "sub x9, x9, #0x4\n"
+ "ld1rqw { z3.s }, p0/Z, [x26]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
+ "cmp x9, #0x4\n"
+ "add x28, x28, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ "add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z19.s, z5.s[0]\n"
+ "fmla z25.s, z19.s, z4.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z26.s, z19.s, z3.s[0]\n"
+ "fmla z27.s, z19.s, z2.s[0]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "fmla z28.s, z19.s, z1.s[0]\n"
+ "fmla z29.s, z19.s, z0.s[0]\n"
"add x25, x25, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "fmla z24.s, z18.s, z5.s[1]\n"
+ "fmla z25.s, z18.s, z4.s[1]\n"
"add x23, x23, #0x10\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x22, x22, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla z29.s, z9.s, z5.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z29.s, z10.s, z5.s[2]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
- "fmla z29.s, z11.s, z5.s[3]\n"
+ "fmla z26.s, z18.s, z3.s[1]\n"
+ "fmla z27.s, z18.s, z2.s[1]\n"
+ "fmla z28.s, z18.s, z1.s[1]\n"
+ "fmla z29.s, z18.s, z0.s[1]\n"
+ "fmla z24.s, z17.s, z5.s[2]\n"
+ "fmla z25.s, z17.s, z4.s[2]\n"
+ "fmla z26.s, z17.s, z3.s[2]\n"
+ "fmla z27.s, z17.s, z2.s[2]\n"
+ "fmla z28.s, z17.s, z1.s[2]\n"
+ "fmla z29.s, z17.s, z0.s[2]\n"
+ "fmla z24.s, z16.s, z5.s[3]\n"
+ "fmla z25.s, z16.s, z4.s[3]\n"
+ "fmla z26.s, z16.s, z3.s[3]\n"
+ "fmla z27.s, z16.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z1.s[3]\n"
+ "fmla z29.s, z16.s, z0.s[3]\n"
"bgt 74b\n"
"75:" // Height 6: Multiply loop: Single iteration only
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "subs x28, x28, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
- "add x22, x22, #0x10\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z0.s }, p0/Z, [x28]\n"
+ "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "ld1rqw { z3.s }, p0/Z, [x25]\n"
+ "ld1rqw { z4.s }, p0/Z, [x24]\n"
+ "ld1rqw { z5.s }, p0/Z, [x23]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
"addvl x12, x12, #1\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z16.s, z4.s[0]\n"
+ "fmla z29.s, z16.s, z5.s[0]\n"
"ble 76f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[1]\n"
+ "fmla z27.s, z16.s, z3.s[1]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
- "fmla z29.s, z9.s, z5.s[1]\n"
+ "fmla z28.s, z16.s, z4.s[1]\n"
+ "fmla z29.s, z16.s, z5.s[1]\n"
"ble 76f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
+ "fmla z27.s, z16.s, z3.s[2]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z29.s, z10.s, z5.s[2]\n"
+ "fmla z28.s, z16.s, z4.s[2]\n"
+ "fmla z29.s, z16.s, z5.s[2]\n"
"ble 76f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
- "fmla z29.s, z11.s, z5.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
+ "fmla z27.s, z16.s, z3.s[3]\n"
+ "fmla z28.s, z16.s, z4.s[3]\n"
+ "fmla z29.s, z16.s, z5.s[3]\n"
"76:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 71b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x10, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
"tbz %x[flags], #1, 77f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmin z28.s, p2/M, z28.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "fmax z28.s, p2/M, z28.s, z17.s\n"
- "fmin z29.s, p2/M, z29.s, z16.s\n"
- "fmax z29.s, p2/M, z29.s, z17.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "fmin z24.s, p2/M, z24.s, z17.s\n"
+ "fmin z25.s, p2/M, z25.s, z17.s\n"
+ "fmin z26.s, p2/M, z26.s, z17.s\n"
+ "fmin z27.s, p2/M, z27.s, z17.s\n"
+ "fmin z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z17.s\n"
+ "fmax z24.s, p2/M, z24.s, z16.s\n"
+ "fmax z25.s, p2/M, z25.s, z16.s\n"
+ "fmax z26.s, p2/M, z26.s, z16.s\n"
+ "fmax z27.s, p2/M, z27.s, z16.s\n"
+ "fmax z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z16.s\n"
"77:" // Height 6: No activation
- "st1w { z24.s }, p1, [x10]\n"
- "addvl x10, x10, #1\n"
- "st1w { z25.s }, p1, [x26]\n"
- "st1w { z26.s }, p1, [x25]\n"
- "st1w { z27.s }, p1, [x24]\n"
- "st1w { z28.s }, p1, [x23]\n"
- "st1w { z29.s }, p1, [x22]\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p1, [x27]\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z27.s }, p1, [x25]\n"
+ "st1w { z28.s }, p1, [x24]\n"
+ "st1w { z29.s }, p1, [x23]\n"
"78:" // Height 6: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 67b\n"
"b 106f\n"
"79:" // Height 7
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"80:" // Height 7: Column loop
- "mov x19, #0x0\n"
- "whilelt p1.s, x19, x13\n"
- "cbz x11, 81f\n"
- "ld1w { z24.s }, p2/Z, [x11]\n"
+ "mov x20, #0x0\n"
+ "whilelt p1.s, x20, x13\n"
+ "cbz x14, 81f\n"
+ "ld1w { z24.s }, p2/Z, [x14]\n"
"mov z25.d, z24.d\n"
- "addvl x11, x11, #1\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"mov z29.d, z24.d\n"
@@ -1091,20 +1028,20 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 83f\n"
"81:" // Height 7: no bias
"tbz %x[flags], #0, 82f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z24.s }, p1/Z, [x10]\n"
- "add x26, x10, x19, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x26]\n"
- "add x25, x26, x19, LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x25]\n"
- "add x24, x25, x19, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x24]\n"
- "add x23, x24, x19, LSL #2\n"
- "ld1w { z28.s }, p1/Z, [x23]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x22]\n"
- "add x21, x22, x19, LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x21]\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x11, x24, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x11]\n"
+ "add x23, x20, x24, LSL #2\n"
+ "add x22, x23, x24, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x21, x22, x24, LSL #2\n"
+ "add x20, x21, x24, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x23]\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
"b 83f\n"
"82:" // Height 7: no accumulate
"mov z24.b, #0x0\n"
@@ -1115,227 +1052,206 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov z29.b, #0x0\n"
"mov z30.b, #0x0\n"
"83:" // Height 7: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"84:" // Height 7: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 85f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "ldr x25, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x23, [x20, #0x20]\n"
- "ldr x22, [x20, #0x28]\n"
- "ldr x21, [x20, #0x30]\n"
- "cbnz x9, 86f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "cbnz x10, 86f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
"b 86f\n"
"85:" // Height 7: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
"86:" // Height 7: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"ble 88f\n"
"87:" // Height 7: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x28, x28, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z6.s }, p0/Z, [x28]\n"
+ "ld1rqw { z5.s }, p0/Z, [x27]\n"
+ "sub x9, x9, #0x4\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "ld1rqw { z3.s }, p0/Z, [x25]\n"
+ "cmp x9, #0x4\n"
+ "add x28, x28, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x24]\n"
+ "ld1rqw { z1.s }, p0/Z, [x23]\n"
"add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x25]\n"
"add x26, x26, #0x10\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x22]\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z19.s, z6.s[0]\n"
+ "fmla z25.s, z19.s, z5.s[0]\n"
+ "fmla z26.s, z19.s, z4.s[0]\n"
+ "fmla z27.s, z19.s, z3.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "fmla z28.s, z19.s, z2.s[0]\n"
+ "fmla z29.s, z19.s, z1.s[0]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
"add x25, x25, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ "fmla z30.s, z19.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z6.s[1]\n"
"add x24, x24, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x22]\n"
"add x23, x23, #0x10\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "ld1rqw { z6.s }, p0/Z, [x21]\n"
+ "fmla z25.s, z18.s, z5.s[1]\n"
+ "fmla z26.s, z18.s, z4.s[1]\n"
"add x22, x22, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x21, x21, #0x10\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
- "fmla z30.s, z8.s, z6.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla z29.s, z9.s, z5.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla z30.s, z9.s, z6.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z29.s, z10.s, z5.s[2]\n"
- "fmla z30.s, z10.s, z6.s[2]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
- "fmla z29.s, z11.s, z5.s[3]\n"
- "fmla z30.s, z11.s, z6.s[3]\n"
+ "fmla z27.s, z18.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z2.s[1]\n"
+ "fmla z29.s, z18.s, z1.s[1]\n"
+ "fmla z30.s, z18.s, z0.s[1]\n"
+ "fmla z24.s, z17.s, z6.s[2]\n"
+ "fmla z25.s, z17.s, z5.s[2]\n"
+ "fmla z26.s, z17.s, z4.s[2]\n"
+ "fmla z27.s, z17.s, z3.s[2]\n"
+ "fmla z28.s, z17.s, z2.s[2]\n"
+ "fmla z29.s, z17.s, z1.s[2]\n"
+ "fmla z30.s, z17.s, z0.s[2]\n"
+ "fmla z24.s, z16.s, z6.s[3]\n"
+ "fmla z25.s, z16.s, z5.s[3]\n"
+ "fmla z26.s, z16.s, z4.s[3]\n"
+ "fmla z27.s, z16.s, z3.s[3]\n"
+ "fmla z28.s, z16.s, z2.s[3]\n"
+ "fmla z29.s, z16.s, z1.s[3]\n"
+ "fmla z30.s, z16.s, z0.s[3]\n"
"bgt 87b\n"
"88:" // Height 7: Multiply loop: Single iteration only
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "subs x28, x28, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
- "ld1rqw { z6.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "fmla z30.s, z8.s, z6.s[0]\n"
- "add x21, x21, #0x10\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z0.s }, p0/Z, [x28]\n"
+ "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "ld1rqw { z3.s }, p0/Z, [x25]\n"
+ "ld1rqw { z4.s }, p0/Z, [x24]\n"
+ "ld1rqw { z5.s }, p0/Z, [x23]\n"
+ "ld1rqw { z6.s }, p0/Z, [x22]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z3.s[0]\n"
"addvl x12, x12, #1\n"
+ "fmla z28.s, z16.s, z4.s[0]\n"
+ "fmla z29.s, z16.s, z5.s[0]\n"
+ "fmla z30.s, z16.s, z6.s[0]\n"
"ble 89f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[1]\n"
+ "fmla z27.s, z16.s, z3.s[1]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
- "fmla z29.s, z9.s, z5.s[1]\n"
- "fmla z30.s, z9.s, z6.s[1]\n"
+ "fmla z28.s, z16.s, z4.s[1]\n"
+ "fmla z29.s, z16.s, z5.s[1]\n"
+ "fmla z30.s, z16.s, z6.s[1]\n"
"ble 89f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
+ "fmla z27.s, z16.s, z3.s[2]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z29.s, z10.s, z5.s[2]\n"
- "fmla z30.s, z10.s, z6.s[2]\n"
+ "fmla z28.s, z16.s, z4.s[2]\n"
+ "fmla z29.s, z16.s, z5.s[2]\n"
+ "fmla z30.s, z16.s, z6.s[2]\n"
"ble 89f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
- "fmla z29.s, z11.s, z5.s[3]\n"
- "fmla z30.s, z11.s, z6.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
+ "fmla z27.s, z16.s, z3.s[3]\n"
+ "fmla z28.s, z16.s, z4.s[3]\n"
+ "fmla z29.s, z16.s, z5.s[3]\n"
+ "fmla z30.s, z16.s, z6.s[3]\n"
"89:" // Height 7: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 84b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x10, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
"tbz %x[flags], #1, 90f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmin z28.s, p2/M, z28.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "fmax z28.s, p2/M, z28.s, z17.s\n"
- "fmin z29.s, p2/M, z29.s, z16.s\n"
- "fmin z30.s, p2/M, z30.s, z16.s\n"
- "fmax z29.s, p2/M, z29.s, z17.s\n"
- "fmax z30.s, p2/M, z30.s, z17.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "fmin z24.s, p2/M, z24.s, z17.s\n"
+ "fmin z25.s, p2/M, z25.s, z17.s\n"
+ "fmin z26.s, p2/M, z26.s, z17.s\n"
+ "fmin z27.s, p2/M, z27.s, z17.s\n"
+ "fmin z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z17.s\n"
+ "fmin z30.s, p2/M, z30.s, z17.s\n"
+ "fmax z24.s, p2/M, z24.s, z16.s\n"
+ "fmax z25.s, p2/M, z25.s, z16.s\n"
+ "fmax z26.s, p2/M, z26.s, z16.s\n"
+ "fmax z27.s, p2/M, z27.s, z16.s\n"
+ "fmax z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z16.s\n"
+ "fmax z30.s, p2/M, z30.s, z16.s\n"
"90:" // Height 7: No activation
- "st1w { z24.s }, p1, [x10]\n"
- "addvl x10, x10, #1\n"
- "st1w { z25.s }, p1, [x26]\n"
- "st1w { z26.s }, p1, [x25]\n"
- "st1w { z27.s }, p1, [x24]\n"
- "st1w { z28.s }, p1, [x23]\n"
- "st1w { z29.s }, p1, [x22]\n"
- "st1w { z30.s }, p1, [x21]\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p1, [x27]\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z27.s }, p1, [x25]\n"
+ "st1w { z28.s }, p1, [x24]\n"
+ "st1w { z29.s }, p1, [x23]\n"
+ "st1w { z30.s }, p1, [x22]\n"
"91:" // Height 7: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 80b\n"
"b 106f\n"
"92:" // Height 8
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x20\n"
+ "mov x14, %x[bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x11, %x[bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[output_ptr]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x19, #0x20\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"93:" // Height 8: Column loop
- "mov x19, #0x0\n"
- "whilelt p1.s, x19, x13\n"
- "cbz x11, 94f\n"
- "ld1w { z24.s }, p2/Z, [x11]\n"
+ "mov x20, #0x0\n"
+ "whilelt p1.s, x20, x13\n"
+ "cbz x14, 94f\n"
+ "ld1w { z24.s }, p2/Z, [x14]\n"
"mov z25.d, z24.d\n"
- "addvl x11, x11, #1\n"
"mov z26.d, z24.d\n"
+ "addvl x14, x14, #1\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"mov z29.d, z24.d\n"
@@ -1344,21 +1260,21 @@ void sve_hybrid_fp32_mla_8x1VL (
"b 96f\n"
"94:" // Height 8: no bias
"tbz %x[flags], #0, 95f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z24.s }, p1/Z, [x10]\n"
- "add x26, x10, x19, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x26]\n"
- "add x25, x26, x19, LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x25]\n"
- "add x24, x25, x19, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x24]\n"
- "add x23, x24, x19, LSL #2\n"
- "ld1w { z28.s }, p1/Z, [x23]\n"
- "add x22, x23, x19, LSL #2\n"
+ "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x11, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x11]\n"
+ "add x23, x21, x24, LSL #2\n"
+ "add x20, x23, x24, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x22, x20, x24, LSL #2\n"
+ "add x21, x22, x24, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x23]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x21, x24, LSL #2\n"
"ld1w { z29.s }, p1/Z, [x22]\n"
- "add x21, x22, x19, LSL #2\n"
"ld1w { z30.s }, p1/Z, [x21]\n"
- "add x20, x21, x19, LSL #2\n"
"ld1w { z31.s }, p1/Z, [x20]\n"
"b 96f\n"
"95:" // Height 8: no accumulate
@@ -1371,249 +1287,224 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov z30.b, #0x0\n"
"mov z31.b, #0x0\n"
"96:" // Height 8: setup done
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"97:" // Height 8: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w28, [x20, x9, LSL #0x2]\n"
+ "ldr w9, [x20, x10, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 98f\n"
- "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x27, [x20, #0x0]\n"
- "ldr x26, [x20, #0x8]\n"
- "ldr x25, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x23, [x20, #0x20]\n"
- "ldr x22, [x20, #0x28]\n"
- "ldr x21, [x20, #0x30]\n"
- "ldr x20, [x20, #0x38]\n"
- "cbnz x9, 99f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x24, x24, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x22, x22, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "add x20, x20, x19, LSL #2\n"
+ "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x28, [x20, #0x0]\n"
+ "ldr x27, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x38]\n"
+ "cbnz x10, 99f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x28, x28, x20, LSL #2\n"
+ "add x27, x27, x20, LSL #2\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
"b 99f\n"
"98:" // Height 8: setup direct input
- "mov x27, %x[input_ptr]\n"
- "add x26, x27, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "mov x28, %x[input_ptr]\n"
+ "add x27, x28, x21, LSL #2\n"
+ "add x26, x27, x21, LSL #2\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
"99:" // Height 8: input setup done
- "cmp x28, #0x4\n"
+ "cmp x9, #0x4\n"
"ble 101f\n"
"100:" // Height 8: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
- "sub x28, x28, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z7.s }, p0/Z, [x28]\n"
+ "ld1rqw { z6.s }, p0/Z, [x27]\n"
+ "sub x9, x9, #0x4\n"
+ "ld1rqw { z5.s }, p0/Z, [x26]\n"
+ "ld1rqw { z4.s }, p0/Z, [x25]\n"
+ "cmp x9, #0x4\n"
+ "add x28, x28, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
"add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x25]\n"
"add x26, x26, #0x10\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "ld1rqw { z1.s }, p0/Z, [x22]\n"
+ "ld1rqw { z0.s }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z19.s, z7.s[0]\n"
+ "fmla z25.s, z19.s, z6.s[0]\n"
+ "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "fmla z26.s, z19.s, z5.s[0]\n"
+ "fmla z27.s, z19.s, z4.s[0]\n"
+ "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "fmla z28.s, z19.s, z3.s[0]\n"
+ "fmla z29.s, z19.s, z2.s[0]\n"
"add x23, x23, #0x10\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
- "ld1rqw { z6.s }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "ld1rqw { z7.s }, p0/Z, [x20]\n"
+ "fmla z30.s, z19.s, z1.s[0]\n"
+ "fmla z31.s, z19.s, z0.s[0]\n"
"add x21, x21, #0x10\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z30.s, z8.s, z6.s[0]\n"
- "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
- "fmla z31.s, z8.s, z7.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla z29.s, z9.s, z5.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "fmla z30.s, z9.s, z6.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "fmla z31.s, z9.s, z7.s[1]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z29.s, z10.s, z5.s[2]\n"
- "fmla z30.s, z10.s, z6.s[2]\n"
- "fmla z31.s, z10.s, z7.s[2]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
- "fmla z29.s, z11.s, z5.s[3]\n"
- "fmla z30.s, z11.s, z6.s[3]\n"
- "fmla z31.s, z11.s, z7.s[3]\n"
+ "fmla z24.s, z18.s, z7.s[1]\n"
+ "fmla z25.s, z18.s, z6.s[1]\n"
+ "fmla z26.s, z18.s, z5.s[1]\n"
+ "fmla z27.s, z18.s, z4.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[1]\n"
+ "fmla z29.s, z18.s, z2.s[1]\n"
+ "fmla z30.s, z18.s, z1.s[1]\n"
+ "fmla z31.s, z18.s, z0.s[1]\n"
+ "fmla z24.s, z17.s, z7.s[2]\n"
+ "fmla z25.s, z17.s, z6.s[2]\n"
+ "fmla z26.s, z17.s, z5.s[2]\n"
+ "fmla z27.s, z17.s, z4.s[2]\n"
+ "fmla z28.s, z17.s, z3.s[2]\n"
+ "fmla z29.s, z17.s, z2.s[2]\n"
+ "fmla z30.s, z17.s, z1.s[2]\n"
+ "fmla z31.s, z17.s, z0.s[2]\n"
+ "fmla z24.s, z16.s, z7.s[3]\n"
+ "fmla z25.s, z16.s, z6.s[3]\n"
+ "fmla z26.s, z16.s, z5.s[3]\n"
+ "fmla z27.s, z16.s, z4.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[3]\n"
+ "fmla z29.s, z16.s, z2.s[3]\n"
+ "fmla z30.s, z16.s, z1.s[3]\n"
+ "fmla z31.s, z16.s, z0.s[3]\n"
"bgt 100b\n"
"101:" // Height 8: Multiply loop: Single iteration only
- "ld1w { z8.s }, p2/Z, [x12]\n"
- "whilelt p0.s, XZR, x28\n"
- "subs x28, x28, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x27]\n"
- "fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
- "ld1rqw { z6.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "fmla z30.s, z8.s, z6.s[0]\n"
- "ld1rqw { z7.s }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
- "fmla z31.s, z8.s, z7.s[0]\n"
- "add x20, x20, #0x10\n"
+ "whilelt p0.s, XZR, x9\n"
+ "ld1rqw { z0.s }, p0/Z, [x28]\n"
+ "ld1rqw { z1.s }, p0/Z, [x27]\n"
+ "subs x9, x9, #0x1\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "ld1rqw { z3.s }, p0/Z, [x25]\n"
+ "ld1rqw { z4.s }, p0/Z, [x24]\n"
+ "ld1rqw { z5.s }, p0/Z, [x23]\n"
+ "ld1rqw { z6.s }, p0/Z, [x22]\n"
+ "ld1rqw { z7.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[0]\n"
+ "fmla z25.s, z16.s, z1.s[0]\n"
"addvl x12, x12, #1\n"
+ "fmla z26.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z16.s, z4.s[0]\n"
+ "fmla z29.s, z16.s, z5.s[0]\n"
+ "fmla z30.s, z16.s, z6.s[0]\n"
+ "fmla z31.s, z16.s, z7.s[0]\n"
"ble 102f\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "fmla z25.s, z16.s, z1.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[1]\n"
+ "fmla z27.s, z16.s, z3.s[1]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z9.s, z2.s[1]\n"
- "fmla z27.s, z9.s, z3.s[1]\n"
- "fmla z28.s, z9.s, z4.s[1]\n"
- "fmla z29.s, z9.s, z5.s[1]\n"
- "fmla z30.s, z9.s, z6.s[1]\n"
- "fmla z31.s, z9.s, z7.s[1]\n"
+ "fmla z28.s, z16.s, z4.s[1]\n"
+ "fmla z29.s, z16.s, z5.s[1]\n"
+ "fmla z30.s, z16.s, z6.s[1]\n"
+ "fmla z31.s, z16.s, z7.s[1]\n"
"ble 102f\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
- "subs x28, x28, #0x1\n"
- "fmla z25.s, z10.s, z1.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z24.s, z16.s, z0.s[2]\n"
+ "fmla z25.s, z16.s, z1.s[2]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
+ "fmla z27.s, z16.s, z3.s[2]\n"
"addvl x12, x12, #1\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z27.s, z10.s, z3.s[2]\n"
- "fmla z28.s, z10.s, z4.s[2]\n"
- "fmla z29.s, z10.s, z5.s[2]\n"
- "fmla z30.s, z10.s, z6.s[2]\n"
- "fmla z31.s, z10.s, z7.s[2]\n"
+ "fmla z28.s, z16.s, z4.s[2]\n"
+ "fmla z29.s, z16.s, z5.s[2]\n"
+ "fmla z30.s, z16.s, z6.s[2]\n"
+ "fmla z31.s, z16.s, z7.s[2]\n"
"ble 102f\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "fmla z25.s, z16.s, z1.s[3]\n"
"addvl x12, x12, #1\n"
- "fmla z25.s, z11.s, z1.s[3]\n"
- "fmla z26.s, z11.s, z2.s[3]\n"
- "fmla z27.s, z11.s, z3.s[3]\n"
- "fmla z28.s, z11.s, z4.s[3]\n"
- "fmla z29.s, z11.s, z5.s[3]\n"
- "fmla z30.s, z11.s, z6.s[3]\n"
- "fmla z31.s, z11.s, z7.s[3]\n"
+ "fmla z26.s, z16.s, z2.s[3]\n"
+ "fmla z27.s, z16.s, z3.s[3]\n"
+ "fmla z28.s, z16.s, z4.s[3]\n"
+ "fmla z29.s, z16.s, z5.s[3]\n"
+ "fmla z30.s, z16.s, z6.s[3]\n"
+ "fmla z31.s, z16.s, z7.s[3]\n"
"102:" // Height 8: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x9, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x10, x10, #0x1\n"
+ "cmp x10, x20\n"
"bne 97b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x26, x10, x19, LSL #2\n"
- "add x25, x26, x19, LSL #2\n"
- "add x24, x25, x19, LSL #2\n"
- "add x23, x24, x19, LSL #2\n"
- "add x22, x23, x19, LSL #2\n"
- "add x21, x22, x19, LSL #2\n"
- "add x20, x21, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x27, x11, x20, LSL #2\n"
+ "add x26, x27, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
"tbz %x[flags], #1, 103f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmin z28.s, p2/M, z28.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "fmax z28.s, p2/M, z28.s, z17.s\n"
- "fmin z29.s, p2/M, z29.s, z16.s\n"
- "fmin z30.s, p2/M, z30.s, z16.s\n"
- "fmin z31.s, p2/M, z31.s, z16.s\n"
- "fmax z29.s, p2/M, z29.s, z17.s\n"
- "fmax z30.s, p2/M, z30.s, z17.s\n"
- "fmax z31.s, p2/M, z31.s, z17.s\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "fmin z24.s, p2/M, z24.s, z17.s\n"
+ "fmin z25.s, p2/M, z25.s, z17.s\n"
+ "fmin z26.s, p2/M, z26.s, z17.s\n"
+ "fmin z27.s, p2/M, z27.s, z17.s\n"
+ "fmin z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z17.s\n"
+ "fmin z30.s, p2/M, z30.s, z17.s\n"
+ "fmin z31.s, p2/M, z31.s, z17.s\n"
+ "fmax z24.s, p2/M, z24.s, z16.s\n"
+ "fmax z25.s, p2/M, z25.s, z16.s\n"
+ "fmax z26.s, p2/M, z26.s, z16.s\n"
+ "fmax z27.s, p2/M, z27.s, z16.s\n"
+ "fmax z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z16.s\n"
+ "fmax z30.s, p2/M, z30.s, z16.s\n"
+ "fmax z31.s, p2/M, z31.s, z16.s\n"
"103:" // Height 8: No activation
- "st1w { z24.s }, p1, [x10]\n"
- "addvl x10, x10, #1\n"
- "st1w { z25.s }, p1, [x26]\n"
- "st1w { z26.s }, p1, [x25]\n"
- "st1w { z27.s }, p1, [x24]\n"
- "st1w { z28.s }, p1, [x23]\n"
- "st1w { z29.s }, p1, [x22]\n"
- "st1w { z30.s }, p1, [x21]\n"
- "st1w { z31.s }, p1, [x20]\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z25.s }, p1, [x27]\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "st1w { z27.s }, p1, [x25]\n"
+ "st1w { z28.s }, p1, [x24]\n"
+ "st1w { z29.s }, p1, [x23]\n"
+ "st1w { z30.s }, p1, [x22]\n"
+ "st1w { z31.s }, p1, [x21]\n"
"104:" // Height 8: Writeback done
"decw x13\n"
"cmp x13, XZR\n"
"bgt 93b\n"
"subs %x[M], %x[M], #0x8\n"
"beq 106f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 105f\n"
- "add x20, x20, #0x8\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x8\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"105:" // Update direct input
- "mov x19, #0x20\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x20\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"106:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
new file mode 100644
index 0000000000..66c106d2eb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
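+
+// ARGLIST expands to the kernel signature used below and defined in
+// generic.cpp: (num_strings, string_lengths, A_arg, M, N, B_ptr,
+// output_arg, bias, act, accumulate).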
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_fp32bf16fp32_mmla_4x6VL( ARGLIST );
+
+class cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL
+{
+public:
+ typedef float lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 6;
+ }
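+
+    // "6VL" = six SVE vectors of fp32 per row; e.g. 6 x 8 = 48 output
+    // columns per tile on a 256-bit SVE implementation.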
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
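+
+    // Each main-loop iteration consumes four K-elements per row (one 128-bit
+    // ld1rqw of fp32), matching the four-deep bf16 reduction of BFMMLA.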
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 12, 4> transforms = {};
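+
+    // Per-CPU throughput estimates used when choosing between candidate
+    // kernels; higher values favour this kernel on that core.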
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 16.63 };
+ case CPUModel::A510:
+ return { 5.42 };
+ case CPUModel::V1:
+ return { 20.83 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_fp32bf16fp32_mmla_4x6VL;
+ cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
new file mode 100644
index 0000000000..2b2a0684f9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
@@ -0,0 +1,1305 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
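+    // Flag bits tested by the assembly below:
+    //   bit 0 (0x1): accumulate into existing output
+    //   bit 1 (0x2): apply the min/max activation clamp
+    //   bit 2 (0x4): output is indirect
+    //   bit 3 (0x8): input is indirect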
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
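+    // Reference semantics, as a sketch only (the kernel below computes this
+    // in 2x2 BFMMLA tiles): C[m][n] = bias[n] + sum_k bf16(A[m][k]) * B[k][n],
+    // clamped to [ka.minval, ka.maxval] when the activation flag is set.
+    // Only A is converted on the fly; B is supplied pre-converted to bfloat16.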
+ __asm__ __volatile__(
+ "ptrue p7.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 40f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 27f\n"
+ "beq 14f\n"
+ "mov x10, %x[bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p6.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p5.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p4.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x9\n"
+ "cbz x10, 3f\n"
+ "ld1w { z8.s }, p7/Z, [x10]\n"
+ "ld1w { z9.s }, p7/Z, [x10, #1, MUL VL]\n"
+ "zip2 z14.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x10, #3, MUL VL]\n"
+ "zip2 z15.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x10, #5, MUL VL]\n"
+ "zip2 z16.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z17.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "addvl x10, x10, #6\n"
+ "zip2 z18.d, z12.d, z12.d\n"
+ "zip1 z12.d, z12.d, z12.d\n"
+ "zip2 z19.d, z13.d, z13.d\n"
+ "zip1 z13.d, z13.d, z13.d\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z21.s }, p6/Z, [x27]\n"
+ "ld1w { z20.s }, p5/Z, [x27, #1, MUL VL]\n"
+ "zip1 z8.d, z21.d, z14.d\n"
+ "zip2 z14.d, z21.d, z14.d\n"
+ "ld1w { z23.s }, p4/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+ "zip1 z9.d, z20.d, z15.d\n"
+ "zip2 z15.d, z20.d, z15.d\n"
+ "ld1w { z21.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
+ "zip1 z10.d, z23.d, z16.d\n"
+ "zip2 z16.d, z23.d, z16.d\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "zip1 z12.d, z21.d, z18.d\n"
+ "zip2 z18.d, z21.d, z18.d\n"
+ "zip1 z13.d, z20.d, z19.d\n"
+ "zip2 z19.d, z20.d, z19.d\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x26, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "cbnz x26, 8f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x25, #0x4\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z24.s }, p0/Z, [x24]\n"
+ ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
+ "uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z21.h }, p7/Z, [x28]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6475e708 // bfmmla z8.s, z24.h, z21.h\n"
+ ".inst 0x6474e70e // bfmmla z14.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6475e70a // bfmmla z10.s, z24.h, z21.h\n"
+ ".inst 0x6474e710 // bfmmla z16.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ "sub x25, x25, #0x4\n"
+ "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ "cmp x25, #0x4\n"
+ ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
+ ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
+ ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
+ "add x24, x24, #0x10\n"
+ "addvl x28, x28, #-4\n"
+ ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z23.s }, p0/Z, [x24]\n"
+ ".inst 0x658abef7 // bfcvt z23.h, p7/M, z23.s\n"
+ "uzp1 z23.h, z23.h, z23.h\n"
+ "ld1h { z21.h }, p7/Z, [x28]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x6475e6e8 // bfmmla z8.s, z23.h, z21.h\n"
+ ".inst 0x6474e6ee // bfmmla z14.s, z23.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x6475e6e9 // bfmmla z9.s, z23.h, z21.h\n"
+ ".inst 0x6474e6ef // bfmmla z15.s, z23.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6475e6ea // bfmmla z10.s, z23.h, z21.h\n"
+ ".inst 0x6474e6f0 // bfmmla z16.s, z23.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x6475e6eb // bfmmla z11.s, z23.h, z21.h\n"
+ ".inst 0x6474e6f1 // bfmmla z17.s, z23.h, z20.h\n"
+ "ld1h { z20.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x6474e6ec // bfmmla z12.s, z23.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6476e6f2 // bfmmla z18.s, z23.h, z22.h\n"
+ ".inst 0x6475e6ed // bfmmla z13.s, z23.h, z21.h\n"
+ ".inst 0x6474e6f3 // bfmmla z19.s, z23.h, z20.h\n"
+ "addvl x28, x28, #-4\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 6b\n"
+ "uzp1 z8.d, z8.d, z14.d\n"
+ "uzp1 z9.d, z9.d, z15.d\n"
+ "uzp1 z10.d, z10.d, z16.d\n"
+ "uzp1 z11.d, z11.d, z17.d\n"
+ "uzp1 z12.d, z12.d, z18.d\n"
+ "uzp1 z13.d, z13.d, z19.d\n"
+ "tbz %x[flags], #1, 12f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z21.s }, p7/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z20.s }, p7/Z, [x20]\n"
+ "fmin z8.s, p7/M, z8.s, z21.s\n"
+ "fmin z9.s, p7/M, z9.s, z21.s\n"
+ "fmin z10.s, p7/M, z10.s, z21.s\n"
+ "fmin z11.s, p7/M, z11.s, z21.s\n"
+ "fmin z12.s, p7/M, z12.s, z21.s\n"
+ "fmin z13.s, p7/M, z13.s, z21.s\n"
+ "fmax z8.s, p7/M, z8.s, z20.s\n"
+ "fmax z9.s, p7/M, z9.s, z20.s\n"
+ "fmax z10.s, p7/M, z10.s, z20.s\n"
+ "fmax z11.s, p7/M, z11.s, z20.s\n"
+ "fmax z12.s, p7/M, z12.s, z20.s\n"
+ "fmax z13.s, p7/M, z13.s, z20.s\n"
+ "12:" // Height 1: No activation
+ "st1w { z8.s }, p6, [x27]\n"
+ "st1w { z9.s }, p5, [x27, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x27, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x27, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x27, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x27, #5, MUL VL]\n"
+ "addvl x27, x27, #6\n"
+ "13:" // Height 1: Writeback done
+ "decw x9, ALL, MUL #6\n"
+ "cmp x9, XZR\n"
+ "bgt 2b\n"
+ "b 54f\n"
+ "14:" // Height 2
+ "mov x10, %x[bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "15:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p6.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p5.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p4.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x9\n"
+ "cbz x10, 16f\n"
+ "ld1w { z8.s }, p7/Z, [x10]\n"
+ "ld1w { z9.s }, p7/Z, [x10, #1, MUL VL]\n"
+ "zip2 z14.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x10, #3, MUL VL]\n"
+ "zip2 z15.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x10, #5, MUL VL]\n"
+ "zip2 z16.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z17.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "addvl x10, x10, #6\n"
+ "zip2 z18.d, z12.d, z12.d\n"
+ "zip1 z12.d, z12.d, z12.d\n"
+ "zip2 z19.d, z13.d, z13.d\n"
+ "zip1 z13.d, z13.d, z13.d\n"
+ "b 18f\n"
+ "16:" // Height 2: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x27, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x27]\n"
+ "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
+ "ld1w { z14.s }, p6/Z, [x20]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
+ "ld1w { z15.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
+ "ld1w { z17.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
+ "ld1w { z19.s }, p1/Z, [x20, #5, MUL VL]\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "zip1 z12.d, z21.d, z18.d\n"
+ "zip2 z18.d, z21.d, z18.d\n"
+ "zip1 z13.d, z20.d, z19.d\n"
+ "zip2 z19.d, z20.d, z19.d\n"
+ "b 18f\n"
+ "17:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "18:" // Height 2: setup done
+ "mov x26, #0x0\n"
+ "19:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "cbnz x26, 21f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "21:" // Height 2: input setup done
+ "cmp x25, #0x4\n"
+ "ble 23f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z24.s }, p0/Z, [x24]\n"
+ "ld1rqw { z20.s }, p0/Z, [x23]\n"
+ ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
+ ".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n"
+ "uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "uzp1 z20.h, z20.h, z20.h\n"
+ "trn1 z24.d, z24.d, z20.d\n"
+ "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
+ ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
+ ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ "sub x25, x25, #0x4\n"
+ "cmp x25, #0x4\n"
+ ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
+ ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
+ ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
+ "addvl x28, x28, #-4\n"
+ "bgt 22b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z24.s }, p0/Z, [x24]\n"
+ "ld1rqw { z20.s }, p0/Z, [x23]\n"
+ ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n"
+ ".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n"
+ "uzp1 z24.h, z24.h, z24.h\n"
+ "ld1h { z23.h }, p7/Z, [x28]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "uzp1 z20.h, z20.h, z20.h\n"
+ "trn1 z24.d, z24.d, z20.d\n"
+ "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n"
+ ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z22.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n"
+ ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n"
+ "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n"
+ ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n"
+ "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n"
+ ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n"
+ "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n"
+ ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n"
+ "addvl x28, x28, #-4\n"
+ ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n"
+ ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 19b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 z4.d, z8.d, z14.d\n"
+ "uzp2 z8.d, z8.d, z14.d\n"
+ "add x23, x27, x20, LSL #2\n"
+ "uzp1 z14.d, z9.d, z15.d\n"
+ "uzp2 z9.d, z9.d, z15.d\n"
+ "uzp1 z15.d, z10.d, z16.d\n"
+ "uzp2 z10.d, z10.d, z16.d\n"
+ "uzp1 z16.d, z11.d, z17.d\n"
+ "uzp2 z11.d, z11.d, z17.d\n"
+ "uzp1 z17.d, z12.d, z18.d\n"
+ "uzp2 z12.d, z12.d, z18.d\n"
+ "uzp1 z18.d, z13.d, z19.d\n"
+ "uzp2 z13.d, z13.d, z19.d\n"
+ "tbz %x[flags], #1, 25f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z20.s }, p7/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z19.s }, p7/Z, [x20]\n"
+ "fmin z4.s, p7/M, z4.s, z20.s\n"
+ "fmin z14.s, p7/M, z14.s, z20.s\n"
+ "fmin z15.s, p7/M, z15.s, z20.s\n"
+ "fmin z16.s, p7/M, z16.s, z20.s\n"
+ "fmin z17.s, p7/M, z17.s, z20.s\n"
+ "fmin z18.s, p7/M, z18.s, z20.s\n"
+ "fmin z8.s, p7/M, z8.s, z20.s\n"
+ "fmin z9.s, p7/M, z9.s, z20.s\n"
+ "fmin z10.s, p7/M, z10.s, z20.s\n"
+ "fmin z11.s, p7/M, z11.s, z20.s\n"
+ "fmin z12.s, p7/M, z12.s, z20.s\n"
+ "fmin z13.s, p7/M, z13.s, z20.s\n"
+ "fmax z4.s, p7/M, z4.s, z19.s\n"
+ "fmax z14.s, p7/M, z14.s, z19.s\n"
+ "fmax z15.s, p7/M, z15.s, z19.s\n"
+ "fmax z16.s, p7/M, z16.s, z19.s\n"
+ "fmax z17.s, p7/M, z17.s, z19.s\n"
+ "fmax z18.s, p7/M, z18.s, z19.s\n"
+ "fmax z8.s, p7/M, z8.s, z19.s\n"
+ "fmax z9.s, p7/M, z9.s, z19.s\n"
+ "fmax z10.s, p7/M, z10.s, z19.s\n"
+ "fmax z11.s, p7/M, z11.s, z19.s\n"
+ "fmax z12.s, p7/M, z12.s, z19.s\n"
+ "fmax z13.s, p7/M, z13.s, z19.s\n"
+ "25:" // Height 2: No activation
+ "st1w { z4.s }, p6, [x27]\n"
+ "st1w { z14.s }, p5, [x27, #1, MUL VL]\n"
+ "st1w { z15.s }, p4, [x27, #2, MUL VL]\n"
+ "st1w { z16.s }, p3, [x27, #3, MUL VL]\n"
+ "st1w { z17.s }, p2, [x27, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x27, #5, MUL VL]\n"
+ "addvl x27, x27, #6\n"
+ "st1w { z8.s }, p6, [x23]\n"
+ "st1w { z9.s }, p5, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x23, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x23, #5, MUL VL]\n"
+ "26:" // Height 2: Writeback done
+ "decw x9, ALL, MUL #6\n"
+ "cmp x9, XZR\n"
+ "bgt 15b\n"
+ "b 54f\n"
+ "27:" // Height 3
+ "mov x10, %x[bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "28:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p6.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p5.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p4.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x9\n"
+ "cbz x10, 29f\n"
+ "ld1w { z8.s }, p7/Z, [x10]\n"
+ "ld1w { z9.s }, p7/Z, [x10, #1, MUL VL]\n"
+ "zip2 z14.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x10, #3, MUL VL]\n"
+ "zip2 z15.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x10, #5, MUL VL]\n"
+ "zip2 z16.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z17.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "addvl x10, x10, #6\n"
+ "zip2 z18.d, z12.d, z12.d\n"
+ "zip1 z12.d, z12.d, z12.d\n"
+ "zip2 z19.d, z13.d, z13.d\n"
+ "zip1 z13.d, z13.d, z13.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z26.d, z14.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z27.d, z15.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z28.d, z16.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z29.d, z17.d\n"
+ "mov z24.d, z12.d\n"
+ "mov z30.d, z18.d\n"
+ "mov z25.d, z13.d\n"
+ "mov z31.d, z19.d\n"
+ "b 31f\n"
+ "29:" // Height 3: no bias
+ "tbz %x[flags], #0, 30f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x27, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x27]\n"
+ "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
+ "ld1w { z14.s }, p6/Z, [x21]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
+ "ld1w { z15.s }, p5/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
+ "ld1w { z17.s }, p3/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
+ "ld1w { z19.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "ld1w { z21.s }, p6/Z, [x20]\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "ld1w { z22.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x20, #2, MUL VL]\n"
+ "zip1 z12.d, z24.d, z18.d\n"
+ "zip2 z18.d, z24.d, z18.d\n"
+ "ld1w { z24.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #4, MUL VL]\n"
+ "zip1 z13.d, z20.d, z19.d\n"
+ "zip2 z19.d, z20.d, z19.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #5, MUL VL]\n"
+ "zip1 z20.d, z21.d, z26.d\n"
+ "zip2 z26.d, z21.d, z26.d\n"
+ "zip1 z21.d, z22.d, z27.d\n"
+ "zip2 z27.d, z22.d, z27.d\n"
+ "zip1 z22.d, z23.d, z28.d\n"
+ "zip2 z28.d, z23.d, z28.d\n"
+ "zip1 z23.d, z24.d, z29.d\n"
+ "zip2 z29.d, z24.d, z29.d\n"
+ "zip1 z24.d, z25.d, z30.d\n"
+ "zip2 z30.d, z25.d, z30.d\n"
+ "zip1 z25.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 31f\n"
+ "30:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "31:" // Height 3: setup done
+ "mov x26, #0x0\n"
+ "32:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x26, 34f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "34:" // Height 3: input setup done
+ "cmp x25, #0x4\n"
+ "ble 36f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "trn1 z5.d, z5.d, z0.d\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
+ ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
+ "sub x25, x25, #0x4\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
+ "cmp x25, #0x4\n"
+ ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
+ ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
+ "addvl x28, x28, #-4\n"
+ ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
+ ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ "bgt 35b\n"
+ "36:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "trn1 z5.d, z5.d, z0.d\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n"
+ ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n"
+ ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n"
+ ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n"
+ ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n"
+ ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n"
+ "addvl x28, x28, #-4\n"
+ ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n"
+ ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n"
+ ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n"
+ ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n"
+ ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ "37:" // Height 3: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 32b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "uzp1 z4.d, z8.d, z14.d\n"
+ "uzp2 z8.d, z8.d, z14.d\n"
+ "uzp1 z14.d, z9.d, z15.d\n"
+ "uzp2 z9.d, z9.d, z15.d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp1 z15.d, z10.d, z16.d\n"
+ "uzp2 z10.d, z10.d, z16.d\n"
+ "uzp1 z16.d, z11.d, z17.d\n"
+ "uzp2 z11.d, z11.d, z17.d\n"
+ "uzp1 z17.d, z12.d, z18.d\n"
+ "uzp2 z12.d, z12.d, z18.d\n"
+ "uzp1 z18.d, z13.d, z19.d\n"
+ "uzp2 z13.d, z13.d, z19.d\n"
+ "uzp1 z20.d, z20.d, z26.d\n"
+ "uzp1 z21.d, z21.d, z27.d\n"
+ "uzp1 z22.d, z22.d, z28.d\n"
+ "uzp1 z23.d, z23.d, z29.d\n"
+ "uzp1 z24.d, z24.d, z30.d\n"
+ "uzp1 z25.d, z25.d, z31.d\n"
+ "tbz %x[flags], #1, 38f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p7/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z19.s }, p7/Z, [x20]\n"
+ "fmin z4.s, p7/M, z4.s, z0.s\n"
+ "fmin z14.s, p7/M, z14.s, z0.s\n"
+ "fmin z15.s, p7/M, z15.s, z0.s\n"
+ "fmin z16.s, p7/M, z16.s, z0.s\n"
+ "fmin z17.s, p7/M, z17.s, z0.s\n"
+ "fmin z18.s, p7/M, z18.s, z0.s\n"
+ "fmin z8.s, p7/M, z8.s, z0.s\n"
+ "fmin z9.s, p7/M, z9.s, z0.s\n"
+ "fmin z10.s, p7/M, z10.s, z0.s\n"
+ "fmin z11.s, p7/M, z11.s, z0.s\n"
+ "fmin z12.s, p7/M, z12.s, z0.s\n"
+ "fmin z13.s, p7/M, z13.s, z0.s\n"
+ "fmin z20.s, p7/M, z20.s, z0.s\n"
+ "fmin z21.s, p7/M, z21.s, z0.s\n"
+ "fmin z22.s, p7/M, z22.s, z0.s\n"
+ "fmin z23.s, p7/M, z23.s, z0.s\n"
+ "fmin z24.s, p7/M, z24.s, z0.s\n"
+ "fmin z25.s, p7/M, z25.s, z0.s\n"
+ "fmax z4.s, p7/M, z4.s, z19.s\n"
+ "fmax z14.s, p7/M, z14.s, z19.s\n"
+ "fmax z15.s, p7/M, z15.s, z19.s\n"
+ "fmax z16.s, p7/M, z16.s, z19.s\n"
+ "fmax z17.s, p7/M, z17.s, z19.s\n"
+ "fmax z18.s, p7/M, z18.s, z19.s\n"
+ "fmax z8.s, p7/M, z8.s, z19.s\n"
+ "fmax z9.s, p7/M, z9.s, z19.s\n"
+ "fmax z10.s, p7/M, z10.s, z19.s\n"
+ "fmax z11.s, p7/M, z11.s, z19.s\n"
+ "fmax z12.s, p7/M, z12.s, z19.s\n"
+ "fmax z13.s, p7/M, z13.s, z19.s\n"
+ "fmax z20.s, p7/M, z20.s, z19.s\n"
+ "fmax z21.s, p7/M, z21.s, z19.s\n"
+ "fmax z22.s, p7/M, z22.s, z19.s\n"
+ "fmax z23.s, p7/M, z23.s, z19.s\n"
+ "fmax z24.s, p7/M, z24.s, z19.s\n"
+ "fmax z25.s, p7/M, z25.s, z19.s\n"
+ "38:" // Height 3: No activation
+ "st1w { z4.s }, p6, [x27]\n"
+ "st1w { z14.s }, p5, [x27, #1, MUL VL]\n"
+ "st1w { z15.s }, p4, [x27, #2, MUL VL]\n"
+ "st1w { z16.s }, p3, [x27, #3, MUL VL]\n"
+ "st1w { z17.s }, p2, [x27, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x27, #5, MUL VL]\n"
+ "addvl x27, x27, #6\n"
+ "st1w { z8.s }, p6, [x23]\n"
+ "st1w { z9.s }, p5, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x23, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x23, #5, MUL VL]\n"
+ "st1w { z20.s }, p6, [x22]\n"
+ "st1w { z21.s }, p5, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p4, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p2, [x22, #4, MUL VL]\n"
+ "st1w { z25.s }, p1, [x22, #5, MUL VL]\n"
+ "39:" // Height 3: Writeback done
+ "decw x9, ALL, MUL #6\n"
+ "cmp x9, XZR\n"
+ "bgt 28b\n"
+ "b 54f\n"
+ "40:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x10\n"
+ "mov x10, %x[bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "41:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p6.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p5.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p4.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x9\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x9\n"
+ "cbz x10, 42f\n"
+ "ld1w { z8.s }, p7/Z, [x10]\n"
+ "ld1w { z9.s }, p7/Z, [x10, #1, MUL VL]\n"
+ "zip2 z14.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p7/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z11.s }, p7/Z, [x10, #3, MUL VL]\n"
+ "zip2 z15.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z12.s }, p7/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z13.s }, p7/Z, [x10, #5, MUL VL]\n"
+ "zip2 z16.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z17.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "addvl x10, x10, #6\n"
+ "zip2 z18.d, z12.d, z12.d\n"
+ "zip1 z12.d, z12.d, z12.d\n"
+ "zip2 z19.d, z13.d, z13.d\n"
+ "zip1 z13.d, z13.d, z13.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z26.d, z14.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z27.d, z15.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z28.d, z16.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z29.d, z17.d\n"
+ "mov z24.d, z12.d\n"
+ "mov z30.d, z18.d\n"
+ "mov z25.d, z13.d\n"
+ "mov z31.d, z19.d\n"
+ "b 44f\n"
+ "42:" // Height 4: no bias
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x27, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z16.s }, p6/Z, [x27]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
+ "ld1w { z14.s }, p6/Z, [x22]\n"
+ "zip1 z8.d, z16.d, z14.d\n"
+ "zip2 z14.d, z16.d, z14.d\n"
+ "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z15.d\n"
+ "zip2 z15.d, z17.d, z15.d\n"
+ "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "zip1 z10.d, z19.d, z16.d\n"
+ "zip2 z16.d, z19.d, z16.d\n"
+ "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n"
+ "ld1w { z21.s }, p6/Z, [x21]\n"
+ "zip1 z11.d, z22.d, z17.d\n"
+ "zip2 z17.d, z22.d, z17.d\n"
+ "ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "zip1 z12.d, z24.d, z18.d\n"
+ "zip2 z18.d, z24.d, z18.d\n"
+ "ld1w { z24.s }, p3/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n"
+ "zip1 z13.d, z20.d, z19.d\n"
+ "zip2 z19.d, z20.d, z19.d\n"
+ "ld1w { z0.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "ld1w { z26.s }, p6/Z, [x20]\n"
+ "zip1 z20.d, z21.d, z26.d\n"
+ "zip2 z26.d, z21.d, z26.d\n"
+ "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n"
+ "zip1 z21.d, z22.d, z27.d\n"
+ "zip2 z27.d, z22.d, z27.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n"
+ "zip1 z22.d, z23.d, z28.d\n"
+ "zip2 z28.d, z23.d, z28.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n"
+ "zip1 z23.d, z24.d, z29.d\n"
+ "zip2 z29.d, z24.d, z29.d\n"
+ "zip1 z24.d, z25.d, z30.d\n"
+ "zip2 z30.d, z25.d, z30.d\n"
+ "zip1 z25.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 44f\n"
+ "43:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "44:" // Height 4: setup done
+ "mov x26, #0x0\n"
+ "45:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
+ "cbnz x26, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
+ "b 47f\n"
+ "46:" // Height 4: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
+ "47:" // Height 4: input setup done
+ "cmp x25, #0x4\n"
+ "ble 49f\n"
+ "48:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z7.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x23]\n"
+ ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n"
+ "sub x25, x25, #0x4\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
+ "cmp x25, #0x4\n"
+ ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n"
+ "addvl x28, x28, #-4\n"
+ ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n"
+ ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n"
+ ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
+ "bgt 48b\n"
+ "49:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x25\n"
+ "ld1rqw { z7.s }, p0/Z, [x24]\n"
+ "ld1rqw { z6.s }, p0/Z, [x23]\n"
+ ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n"
+ ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n"
+ ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z3.h }, p7/Z, [x28]\n"
+ "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n"
+ "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n"
+ "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n"
+ "addvl x28, x28, #-4\n"
+ ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n"
+ ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n"
+ ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n"
+ ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n"
+ ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n"
+ ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n"
+ "50:" // Height 4: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 45b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp1 z4.d, z8.d, z14.d\n"
+ "uzp2 z8.d, z8.d, z14.d\n"
+ "uzp1 z14.d, z9.d, z15.d\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 z9.d, z9.d, z15.d\n"
+ "uzp1 z15.d, z10.d, z16.d\n"
+ "uzp2 z10.d, z10.d, z16.d\n"
+ "uzp1 z16.d, z11.d, z17.d\n"
+ "uzp2 z11.d, z11.d, z17.d\n"
+ "uzp1 z17.d, z12.d, z18.d\n"
+ "uzp2 z12.d, z12.d, z18.d\n"
+ "uzp1 z18.d, z13.d, z19.d\n"
+ "uzp2 z13.d, z13.d, z19.d\n"
+ "uzp1 z19.d, z20.d, z26.d\n"
+ "uzp2 z20.d, z20.d, z26.d\n"
+ "uzp1 z26.d, z21.d, z27.d\n"
+ "uzp2 z21.d, z21.d, z27.d\n"
+ "uzp1 z27.d, z22.d, z28.d\n"
+ "uzp2 z22.d, z22.d, z28.d\n"
+ "uzp1 z28.d, z23.d, z29.d\n"
+ "uzp2 z23.d, z23.d, z29.d\n"
+ "uzp1 z29.d, z24.d, z30.d\n"
+ "uzp2 z24.d, z24.d, z30.d\n"
+ "uzp1 z30.d, z25.d, z31.d\n"
+ "uzp2 z25.d, z25.d, z31.d\n"
+ "tbz %x[flags], #1, 51f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p7/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p7/Z, [x20]\n"
+ "fmin z4.s, p7/M, z4.s, z1.s\n"
+ "fmin z14.s, p7/M, z14.s, z1.s\n"
+ "fmin z15.s, p7/M, z15.s, z1.s\n"
+ "fmin z16.s, p7/M, z16.s, z1.s\n"
+ "fmin z17.s, p7/M, z17.s, z1.s\n"
+ "fmin z18.s, p7/M, z18.s, z1.s\n"
+ "fmin z8.s, p7/M, z8.s, z1.s\n"
+ "fmin z9.s, p7/M, z9.s, z1.s\n"
+ "fmin z10.s, p7/M, z10.s, z1.s\n"
+ "fmin z11.s, p7/M, z11.s, z1.s\n"
+ "fmin z12.s, p7/M, z12.s, z1.s\n"
+ "fmin z13.s, p7/M, z13.s, z1.s\n"
+ "fmin z19.s, p7/M, z19.s, z1.s\n"
+ "fmin z26.s, p7/M, z26.s, z1.s\n"
+ "fmin z27.s, p7/M, z27.s, z1.s\n"
+ "fmin z28.s, p7/M, z28.s, z1.s\n"
+ "fmin z29.s, p7/M, z29.s, z1.s\n"
+ "fmin z30.s, p7/M, z30.s, z1.s\n"
+ "fmin z20.s, p7/M, z20.s, z1.s\n"
+ "fmin z21.s, p7/M, z21.s, z1.s\n"
+ "fmin z22.s, p7/M, z22.s, z1.s\n"
+ "fmin z23.s, p7/M, z23.s, z1.s\n"
+ "fmin z24.s, p7/M, z24.s, z1.s\n"
+ "fmin z25.s, p7/M, z25.s, z1.s\n"
+ "fmax z4.s, p7/M, z4.s, z0.s\n"
+ "fmax z14.s, p7/M, z14.s, z0.s\n"
+ "fmax z15.s, p7/M, z15.s, z0.s\n"
+ "fmax z16.s, p7/M, z16.s, z0.s\n"
+ "fmax z17.s, p7/M, z17.s, z0.s\n"
+ "fmax z18.s, p7/M, z18.s, z0.s\n"
+ "fmax z8.s, p7/M, z8.s, z0.s\n"
+ "fmax z9.s, p7/M, z9.s, z0.s\n"
+ "fmax z10.s, p7/M, z10.s, z0.s\n"
+ "fmax z11.s, p7/M, z11.s, z0.s\n"
+ "fmax z12.s, p7/M, z12.s, z0.s\n"
+ "fmax z13.s, p7/M, z13.s, z0.s\n"
+ "fmax z19.s, p7/M, z19.s, z0.s\n"
+ "fmax z26.s, p7/M, z26.s, z0.s\n"
+ "fmax z27.s, p7/M, z27.s, z0.s\n"
+ "fmax z28.s, p7/M, z28.s, z0.s\n"
+ "fmax z29.s, p7/M, z29.s, z0.s\n"
+ "fmax z30.s, p7/M, z30.s, z0.s\n"
+ "fmax z20.s, p7/M, z20.s, z0.s\n"
+ "fmax z21.s, p7/M, z21.s, z0.s\n"
+ "fmax z22.s, p7/M, z22.s, z0.s\n"
+ "fmax z23.s, p7/M, z23.s, z0.s\n"
+ "fmax z24.s, p7/M, z24.s, z0.s\n"
+ "fmax z25.s, p7/M, z25.s, z0.s\n"
+ "51:" // Height 4: No activation
+ "st1w { z4.s }, p6, [x27]\n"
+ "st1w { z14.s }, p5, [x27, #1, MUL VL]\n"
+ "st1w { z15.s }, p4, [x27, #2, MUL VL]\n"
+ "st1w { z16.s }, p3, [x27, #3, MUL VL]\n"
+ "st1w { z17.s }, p2, [x27, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x27, #5, MUL VL]\n"
+ "addvl x27, x27, #6\n"
+ "st1w { z8.s }, p6, [x23]\n"
+ "st1w { z9.s }, p5, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x23, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x23, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x23, #5, MUL VL]\n"
+ "st1w { z19.s }, p6, [x22]\n"
+ "st1w { z26.s }, p5, [x22, #1, MUL VL]\n"
+ "st1w { z27.s }, p4, [x22, #2, MUL VL]\n"
+ "st1w { z28.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z29.s }, p2, [x22, #4, MUL VL]\n"
+ "st1w { z30.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z20.s }, p6, [x21]\n"
+ "st1w { z21.s }, p5, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p4, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p3, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p2, [x21, #4, MUL VL]\n"
+ "st1w { z25.s }, p1, [x21, #5, MUL VL]\n"
+ "52:" // Height 4: Writeback done
+ "decw x9, ALL, MUL #6\n"
+ "cmp x9, XZR\n"
+ "bgt 41b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 54f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 53f\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "53:" // Update direct input
+ "mov x20, #0x10\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "54:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
new file mode 100644
index 0000000000..15b7dd721c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_fp32bf16fp32_mmla_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL
+{
+public:
+ typedef float lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 4;
+ }
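+
+    // This variant trades width for height versus the 4x6VL kernel: six rows
+    // of four fp32 vectors each.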
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 14.06 };
+ case CPUModel::A510:
+ return { 5.31 };
+ case CPUModel::V1:
+ return { 17.32 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_fp32bf16fp32_mmla_6x4VL;
+ cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..0d2b47ec39
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
@@ -0,0 +1,1792 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = std::numeric_limits<float>::infinity();
+ float minval = -std::numeric_limits<float>::infinity();
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
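+ // Flag bits handed to the assembly block (set below):
+ //   bit 0 (0x1): accumulate into the existing output
+ //   bit 1 (0x2): apply the min/max clamp (ReLU / BoundedReLU)
+ //   bit 2 (0x4): indirect output (array of row pointers)
+ //   bit 3 (0x8): indirect input (array of row pointers)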
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
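+ // BoundedReLU writes maxval and deliberately falls through to the ReLU
+ // case; plain ReLU leaves maxval at its +inf default, making the fmin
+ // half of the clamp sequence a no-op.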
+ __asm__ __volatile__(
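+ // BFMMLA accumulates a 2x4 by 4x2 bf16 product into a 2x2 fp32 block per
+ // 128-bit segment, so each accumulator interleaves a pair of output rows:
+ // hence the zip1/zip2 duplication of the bias on entry and the uzp1/uzp2
+ // de-interleave of the results before writeback.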
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 66f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 53f\n"
+ "beq 40f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 27f\n"
+ "beq 14f\n"
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 3f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z16.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z16.d, z12.d\n"
+ "zip2 z12.d, z16.d, z12.d\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 8f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x27, #0x4\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z18.s }, p0/Z, [x26]\n"
+ ".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n"
+ "uzp1 z18.h, z18.h, z18.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "add x26, x26, #0x10\n"
+ "addvl x10, x10, #8\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z18.s }, p0/Z, [x26]\n"
+ ".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n"
+ "uzp1 z18.h, z18.h, z18.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n"
+ ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n"
+ ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n"
+ ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n"
+ ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n"
+ "addvl x10, x10, #8\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 6b\n"
+ "uzp1 z8.d, z8.d, z12.d\n"
+ "uzp1 z9.d, z9.d, z13.d\n"
+ "uzp1 z10.d, z10.d, z14.d\n"
+ "uzp1 z11.d, z11.d, z15.d\n"
+ "tbz %x[flags], #1, 12f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
+ "12:" // Height 1: No activation
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "13:" // Height 1: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 2b\n"
+ "b 80f\n"
+ "14:" // Height 2
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "15:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 16f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "b 18f\n"
+ "16:" // Height 2: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 18f\n"
+ "17:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "18:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "19:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 21f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "21:" // Height 2: input setup done
+ "cmp x27, #0x4\n"
+ "ble 23f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z19.s }, p0/Z, [x26]\n"
+ "ld1rqw { z18.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab673 // bfcvt z19.h, p5/M, z19.s\n"
+ ".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n"
+ "uzp1 z19.h, z19.h, z19.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z18.h, z18.h, z18.h\n"
+ "trn1 z19.d, z19.d, z18.d\n"
+ ".inst 0x6471e668 // bfmmla z8.s, z19.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6470e66c // bfmmla z12.s, z19.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e669 // bfmmla z9.s, z19.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6470e66d // bfmmla z13.s, z19.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e66a // bfmmla z10.s, z19.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6470e66e // bfmmla z14.s, z19.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6471e66b // bfmmla z11.s, z19.h, z17.h\n"
+ ".inst 0x6470e66f // bfmmla z15.s, z19.h, z16.h\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "addvl x10, x10, #8\n"
+ "bgt 22b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z19.s }, p0/Z, [x26]\n"
+ "ld1rqw { z18.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab673 // bfcvt z19.h, p5/M, z19.s\n"
+ ".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n"
+ "uzp1 z19.h, z19.h, z19.h\n"
+ "ld1h { z17.h }, p5/Z, [x10]\n"
+ "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z18.h, z18.h, z18.h\n"
+ "trn1 z19.d, z19.d, z18.d\n"
+ ".inst 0x6471e668 // bfmmla z8.s, z19.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6470e66c // bfmmla z12.s, z19.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6471e669 // bfmmla z9.s, z19.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6470e66d // bfmmla z13.s, z19.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6471e66a // bfmmla z10.s, z19.h, z17.h\n"
+ "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6470e66e // bfmmla z14.s, z19.h, z16.h\n"
+ "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6471e66b // bfmmla z11.s, z19.h, z17.h\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6470e66f // bfmmla z15.s, z19.h, z16.h\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 19b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 z6.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "add x25, x9, x20, LSL #2\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "tbz %x[flags], #1, 25f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p5/Z, [x20]\n"
+ "fmin z6.s, p5/M, z6.s, z17.s\n"
+ "fmin z12.s, p5/M, z12.s, z17.s\n"
+ "fmin z13.s, p5/M, z13.s, z17.s\n"
+ "fmin z14.s, p5/M, z14.s, z17.s\n"
+ "fmin z8.s, p5/M, z8.s, z17.s\n"
+ "fmin z9.s, p5/M, z9.s, z17.s\n"
+ "fmin z10.s, p5/M, z10.s, z17.s\n"
+ "fmin z11.s, p5/M, z11.s, z17.s\n"
+ "fmax z6.s, p5/M, z6.s, z16.s\n"
+ "fmax z12.s, p5/M, z12.s, z16.s\n"
+ "fmax z13.s, p5/M, z13.s, z16.s\n"
+ "fmax z14.s, p5/M, z14.s, z16.s\n"
+ "fmax z8.s, p5/M, z8.s, z16.s\n"
+ "fmax z9.s, p5/M, z9.s, z16.s\n"
+ "fmax z10.s, p5/M, z10.s, z16.s\n"
+ "fmax z11.s, p5/M, z11.s, z16.s\n"
+ "25:" // Height 2: No activation
+ "st1w { z6.s }, p4, [x9]\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "26:" // Height 2: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 15b\n"
+ "b 80f\n"
+ "27:" // Height 3
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "28:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 29f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "b 31f\n"
+ "29:" // Height 3: no bias
+ "tbz %x[flags], #0, 30f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 31f\n"
+ "30:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "31:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "32:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 34f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "34:" // Height 3: input setup done
+ "cmp x27, #0x4\n"
+ "ble 36f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z28.s }, p0/Z, [x26]\n"
+ "ld1rqw { z27.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n"
+ "ld1rqw { z26.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n"
+ "uzp1 z28.h, z28.h, z28.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "uzp1 z27.h, z27.h, z27.h\n"
+ ".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x27, x27, #0x4\n"
+ "trn1 z28.d, z28.d, z27.d\n"
+ "uzp1 z26.h, z26.h, z26.h\n"
+ ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "bgt 35b\n"
+ "36:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z28.s }, p0/Z, [x26]\n"
+ "ld1rqw { z27.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n"
+ "ld1rqw { z26.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n"
+ "uzp1 z28.h, z28.h, z28.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "uzp1 z27.h, z27.h, z27.h\n"
+ ".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "trn1 z28.d, z28.d, z27.d\n"
+ "uzp1 z26.h, z26.h, z26.h\n"
+ ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n"
+ ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n"
+ ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n"
+ ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n"
+ ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n"
+ ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n"
+ ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n"
+ ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n"
+ ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n"
+ "37:" // Height 3: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 32b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "uzp1 z6.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "tbz %x[flags], #1, 38f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z25.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "fmin z6.s, p5/M, z6.s, z25.s\n"
+ "fmin z12.s, p5/M, z12.s, z25.s\n"
+ "fmin z13.s, p5/M, z13.s, z25.s\n"
+ "fmin z14.s, p5/M, z14.s, z25.s\n"
+ "fmin z8.s, p5/M, z8.s, z25.s\n"
+ "fmin z9.s, p5/M, z9.s, z25.s\n"
+ "fmin z10.s, p5/M, z10.s, z25.s\n"
+ "fmin z11.s, p5/M, z11.s, z25.s\n"
+ "fmin z16.s, p5/M, z16.s, z25.s\n"
+ "fmin z17.s, p5/M, z17.s, z25.s\n"
+ "fmin z18.s, p5/M, z18.s, z25.s\n"
+ "fmin z19.s, p5/M, z19.s, z25.s\n"
+ "fmax z6.s, p5/M, z6.s, z24.s\n"
+ "fmax z12.s, p5/M, z12.s, z24.s\n"
+ "fmax z13.s, p5/M, z13.s, z24.s\n"
+ "fmax z14.s, p5/M, z14.s, z24.s\n"
+ "fmax z8.s, p5/M, z8.s, z24.s\n"
+ "fmax z9.s, p5/M, z9.s, z24.s\n"
+ "fmax z10.s, p5/M, z10.s, z24.s\n"
+ "fmax z11.s, p5/M, z11.s, z24.s\n"
+ "fmax z16.s, p5/M, z16.s, z24.s\n"
+ "fmax z17.s, p5/M, z17.s, z24.s\n"
+ "fmax z18.s, p5/M, z18.s, z24.s\n"
+ "fmax z19.s, p5/M, z19.s, z24.s\n"
+ "38:" // Height 3: No activation
+ "st1w { z6.s }, p4, [x9]\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x24]\n"
+ "st1w { z17.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #3, MUL VL]\n"
+ "39:" // Height 3: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 28b\n"
+ "b 80f\n"
+ "40:" // Height 4
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "41:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 42f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "b 44f\n"
+ "42:" // Height 4: no bias
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 44f\n"
+ "43:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "44:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "45:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "b 47f\n"
+ "46:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "47:" // Height 4: input setup done
+ "cmp x27, #0x4\n"
+ "ble 49f\n"
+ "48:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z29.s }, p0/Z, [x26]\n"
+ "ld1rqw { z28.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab7bd // bfcvt z29.h, p5/M, z29.s\n"
+ "ld1rqw { z27.s }, p0/Z, [x24]\n"
+ "ld1rqw { z26.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n"
+ ".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n"
+ ".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n"
+ "uzp1 z29.h, z29.h, z29.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z28.h, z28.h, z28.h\n"
+ "uzp1 z27.h, z27.h, z27.h\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "uzp1 z26.h, z26.h, z26.h\n"
+ "trn1 z29.d, z29.d, z28.d\n"
+ ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ "add x26, x26, #0x10\n"
+ "trn1 z27.d, z27.d, z26.d\n"
+ ".inst 0x6479e770 // bfmmla z16.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6478e774 // bfmmla z20.s, z27.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6479e771 // bfmmla z17.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6478e775 // bfmmla z21.s, z27.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6479e772 // bfmmla z18.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e776 // bfmmla z22.s, z27.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6479e773 // bfmmla z19.s, z27.h, z25.h\n"
+ ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e777 // bfmmla z23.s, z27.h, z24.h\n"
+ "bgt 48b\n"
+ "49:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z29.s }, p0/Z, [x26]\n"
+ "ld1rqw { z28.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab7bd // bfcvt z29.h, p5/M, z29.s\n"
+ "ld1rqw { z27.s }, p0/Z, [x24]\n"
+ "ld1rqw { z26.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n"
+ ".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n"
+ ".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n"
+ "uzp1 z29.h, z29.h, z29.h\n"
+ "ld1h { z25.h }, p5/Z, [x10]\n"
+ "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z28.h, z28.h, z28.h\n"
+ "uzp1 z27.h, z27.h, z27.h\n"
+ "uzp1 z26.h, z26.h, z26.h\n"
+ "trn1 z29.d, z29.d, z28.d\n"
+ ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n"
+ "trn1 z27.d, z27.d, z26.d\n"
+ ".inst 0x6479e770 // bfmmla z16.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6478e774 // bfmmla z20.s, z27.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n"
+ ".inst 0x6479e771 // bfmmla z17.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6478e775 // bfmmla z21.s, z27.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n"
+ ".inst 0x6479e772 // bfmmla z18.s, z27.h, z25.h\n"
+ ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n"
+ "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6478e776 // bfmmla z22.s, z27.h, z24.h\n"
+ "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6479e773 // bfmmla z19.s, z27.h, z25.h\n"
+ ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n"
+ ".inst 0x6478e777 // bfmmla z23.s, z27.h, z24.h\n"
+ "50:" // Height 4: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 45b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 z6.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "tbz %x[flags], #1, 51f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z24.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z23.s }, p5/Z, [x20]\n"
+ "fmin z6.s, p5/M, z6.s, z24.s\n"
+ "fmin z12.s, p5/M, z12.s, z24.s\n"
+ "fmin z13.s, p5/M, z13.s, z24.s\n"
+ "fmin z14.s, p5/M, z14.s, z24.s\n"
+ "fmin z8.s, p5/M, z8.s, z24.s\n"
+ "fmin z9.s, p5/M, z9.s, z24.s\n"
+ "fmin z10.s, p5/M, z10.s, z24.s\n"
+ "fmin z11.s, p5/M, z11.s, z24.s\n"
+ "fmin z15.s, p5/M, z15.s, z24.s\n"
+ "fmin z20.s, p5/M, z20.s, z24.s\n"
+ "fmin z21.s, p5/M, z21.s, z24.s\n"
+ "fmin z22.s, p5/M, z22.s, z24.s\n"
+ "fmin z16.s, p5/M, z16.s, z24.s\n"
+ "fmin z17.s, p5/M, z17.s, z24.s\n"
+ "fmin z18.s, p5/M, z18.s, z24.s\n"
+ "fmin z19.s, p5/M, z19.s, z24.s\n"
+ "fmax z6.s, p5/M, z6.s, z23.s\n"
+ "fmax z12.s, p5/M, z12.s, z23.s\n"
+ "fmax z13.s, p5/M, z13.s, z23.s\n"
+ "fmax z14.s, p5/M, z14.s, z23.s\n"
+ "fmax z8.s, p5/M, z8.s, z23.s\n"
+ "fmax z9.s, p5/M, z9.s, z23.s\n"
+ "fmax z10.s, p5/M, z10.s, z23.s\n"
+ "fmax z11.s, p5/M, z11.s, z23.s\n"
+ "fmax z15.s, p5/M, z15.s, z23.s\n"
+ "fmax z20.s, p5/M, z20.s, z23.s\n"
+ "fmax z21.s, p5/M, z21.s, z23.s\n"
+ "fmax z22.s, p5/M, z22.s, z23.s\n"
+ "fmax z16.s, p5/M, z16.s, z23.s\n"
+ "fmax z17.s, p5/M, z17.s, z23.s\n"
+ "fmax z18.s, p5/M, z18.s, z23.s\n"
+ "fmax z19.s, p5/M, z19.s, z23.s\n"
+ "51:" // Height 4: No activation
+ "st1w { z6.s }, p4, [x9]\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "52:" // Height 4: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 41b\n"
+ "b 80f\n"
+ "53:" // Height 5
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "54:" // Height 5: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 55f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z28.d, z12.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z29.d, z13.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z30.d, z14.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z31.d, z15.d\n"
+ "b 57f\n"
+ "55:" // Height 5: no bias
+ "tbz %x[flags], #0, 56f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 57f\n"
+ "56:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "57:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "58:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 59f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 60f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "b 60f\n"
+ "59:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "60:" // Height 5: input setup done
+ "cmp x27, #0x4\n"
+ "ble 62f\n"
+ "61:" // Height 5: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z6.s }, p0/Z, [x26]\n"
+ "ld1rqw { z5.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x24]\n"
+ "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
+ ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x22]\n"
+ ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x27, x27, #0x4\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "cmp x27, #0x4\n"
+ "add x26, x26, #0x10\n"
+ "trn1 z6.d, z6.d, z5.d\n"
+ "trn1 z4.d, z4.d, z3.d\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ "add x25, x25, #0x10\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n"
+ ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n"
+ ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n"
+ ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n"
+ ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n"
+ ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n"
+ ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
+ "bgt 61b\n"
+ "62:" // Height 5: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z6.s }, p0/Z, [x26]\n"
+ "ld1rqw { z5.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x24]\n"
+ "ld1rqw { z3.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
+ ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x22]\n"
+ ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "trn1 z6.d, z6.d, z5.d\n"
+ "trn1 z4.d, z4.d, z3.d\n"
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n"
+ ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n"
+ ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n"
+ ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n"
+ ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n"
+ ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n"
+ ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n"
+ ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n"
+ ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n"
+ ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n"
+ ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n"
+ ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n"
+ ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n"
+ ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n"
+ ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n"
+ ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n"
+ "63:" // Height 5: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 58b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 z6.d, z8.d, z12.d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "tbz %x[flags], #1, 64f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z23.s }, p5/Z, [x20]\n"
+ "fmin z6.s, p5/M, z6.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z6.s, p5/M, z6.s, z23.s\n"
+ "fmax z12.s, p5/M, z12.s, z23.s\n"
+ "fmax z13.s, p5/M, z13.s, z23.s\n"
+ "fmax z14.s, p5/M, z14.s, z23.s\n"
+ "fmax z8.s, p5/M, z8.s, z23.s\n"
+ "fmax z9.s, p5/M, z9.s, z23.s\n"
+ "fmax z10.s, p5/M, z10.s, z23.s\n"
+ "fmax z11.s, p5/M, z11.s, z23.s\n"
+ "fmax z15.s, p5/M, z15.s, z23.s\n"
+ "fmax z20.s, p5/M, z20.s, z23.s\n"
+ "fmax z21.s, p5/M, z21.s, z23.s\n"
+ "fmax z22.s, p5/M, z22.s, z23.s\n"
+ "fmax z16.s, p5/M, z16.s, z23.s\n"
+ "fmax z17.s, p5/M, z17.s, z23.s\n"
+ "fmax z18.s, p5/M, z18.s, z23.s\n"
+ "fmax z19.s, p5/M, z19.s, z23.s\n"
+ "fmax z24.s, p5/M, z24.s, z23.s\n"
+ "fmax z25.s, p5/M, z25.s, z23.s\n"
+ "fmax z26.s, p5/M, z26.s, z23.s\n"
+ "fmax z27.s, p5/M, z27.s, z23.s\n"
+ "64:" // Height 5: No activation
+ "st1w { z6.s }, p4, [x9]\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x22]\n"
+ "st1w { z25.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #3, MUL VL]\n"
+ "65:" // Height 5: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 54b\n"
+ "b 80f\n"
+ "66:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "mov x12, %x[bias]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "67:" // Height 6: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "cbz x12, 68f\n"
+ "ld1w { z8.s }, p5/Z, [x12]\n"
+ "ld1w { z9.s }, p5/Z, [x12, #1, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x12, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x12, x12, #4\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z28.d, z12.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z29.d, z13.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z30.d, z14.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z31.d, z15.d\n"
+ "b 70f\n"
+ "68:" // Height 6: no bias
+ "tbz %x[flags], #0, 69f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z17.s }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z17.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z20.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 70f\n"
+ "69:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "70:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "71:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 72f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 73f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20, LSL #2\n"
+ "add x25, x25, x20, LSL #2\n"
+ "add x24, x24, x20, LSL #2\n"
+ "add x23, x23, x20, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "add x21, x21, x20, LSL #2\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21, LSL #2\n"
+ "add x24, x25, x21, LSL #2\n"
+ "add x23, x24, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "add x21, x22, x21, LSL #2\n"
+ "73:" // Height 6: input setup done
+ "cmp x27, #0x4\n"
+ "ble 75f\n"
+ "74:" // Height 6: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z7.s }, p0/Z, [x26]\n"
+ "ld1rqw { z6.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab4e7 // bfcvt z7.h, p5/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n"
+ ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
+ ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "sub x27, x27, #0x4\n"
+ "cmp x27, #0x4\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
+ "add x24, x24, #0x10\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ "trn1 z3.d, z3.d, z2.d\n"
+ ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n"
+ ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n"
+ ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n"
+ ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n"
+ ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n"
+ ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n"
+ ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n"
+ "bgt 74b\n"
+ "75:" // Height 6: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x27\n"
+ "ld1rqw { z7.s }, p0/Z, [x26]\n"
+ "ld1rqw { z6.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab4e7 // bfcvt z7.h, p5/M, z7.s\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n"
+ ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
+ ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "uzp1 z7.h, z7.h, z7.h\n"
+ "ld1h { z1.h }, p5/Z, [x10]\n"
+ "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z6.h, z6.h, z6.h\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "trn1 z7.d, z7.d, z6.d\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
+ "trn1 z5.d, z5.d, z4.d\n"
+ "trn1 z3.d, z3.d, z2.d\n"
+ ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n"
+ ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n"
+ ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n"
+ ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n"
+ ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n"
+ ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n"
+ "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n"
+ ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n"
+ "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n"
+ ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n"
+ ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n"
+ ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n"
+ "76:" // Height 6: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 71b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x25, x9, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "uzp1 z6.d, z8.d, z12.d\n"
+ "add x23, x24, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "tbz %x[flags], #1, 77f\n"
+ "add x20, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x20]\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p5/Z, [x20]\n"
+ "fmin z6.s, p5/M, z6.s, z1.s\n"
+ "fmin z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z1.s\n"
+ "fmin z14.s, p5/M, z14.s, z1.s\n"
+ "fmin z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z1.s\n"
+ "fmin z10.s, p5/M, z10.s, z1.s\n"
+ "fmin z11.s, p5/M, z11.s, z1.s\n"
+ "fmin z15.s, p5/M, z15.s, z1.s\n"
+ "fmin z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z1.s\n"
+ "fmin z22.s, p5/M, z22.s, z1.s\n"
+ "fmin z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z1.s\n"
+ "fmin z18.s, p5/M, z18.s, z1.s\n"
+ "fmin z19.s, p5/M, z19.s, z1.s\n"
+ "fmin z23.s, p5/M, z23.s, z1.s\n"
+ "fmin z28.s, p5/M, z28.s, z1.s\n"
+ "fmin z29.s, p5/M, z29.s, z1.s\n"
+ "fmin z30.s, p5/M, z30.s, z1.s\n"
+ "fmin z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z1.s\n"
+ "fmin z26.s, p5/M, z26.s, z1.s\n"
+ "fmin z27.s, p5/M, z27.s, z1.s\n"
+ "fmax z6.s, p5/M, z6.s, z0.s\n"
+ "fmax z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z0.s\n"
+ "fmax z14.s, p5/M, z14.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z0.s\n"
+ "fmax z10.s, p5/M, z10.s, z0.s\n"
+ "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z0.s\n"
+ "fmax z22.s, p5/M, z22.s, z0.s\n"
+ "fmax z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z0.s\n"
+ "fmax z18.s, p5/M, z18.s, z0.s\n"
+ "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z23.s, p5/M, z23.s, z0.s\n"
+ "fmax z28.s, p5/M, z28.s, z0.s\n"
+ "fmax z29.s, p5/M, z29.s, z0.s\n"
+ "fmax z30.s, p5/M, z30.s, z0.s\n"
+ "fmax z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z0.s\n"
+ "fmax z26.s, p5/M, z26.s, z0.s\n"
+ "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "77:" // Height 6: No activation
+ "st1w { z6.s }, p4, [x9]\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z8.s }, p4, [x25]\n"
+ "st1w { z9.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x25, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x24]\n"
+ "st1w { z20.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z23.s }, p4, [x22]\n"
+ "st1w { z28.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z29.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z30.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "78:" // Height 6: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 67b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 80f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 79f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "79:" // Update direct input
+ "mov x20, #0x18\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "80:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
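
The kernel body above leans on the SVE BFMMLA instruction (emitted as raw `.inst` words so older assemblers can still build it). Per 128-bit vector segment, BFMMLA reads each source as a row-major 2x4 BF16 tile and accumulates A * B^T into a 2x2 FP32 tile, which is why output rows arrive interleaved in pairs and the epilogue de-interleaves them with `uzp1`/`uzp2` before the clamp and `st1w` writeback. A minimal scalar sketch of that semantics follows; it is illustrative only, not library code, and BFMMLA's non-IEEE intermediate rounding behaviour is ignored.

    #include <cstdint>
    #include <cstring>

    // Widen a BF16 value (stored as uint16_t) to float: BF16 is the top
    // 16 bits of an IEEE binary32.
    static float bf16_to_f32(uint16_t h)
    {
        uint32_t bits = static_cast<uint32_t>(h) << 16;
        float    f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }

    // acc[2][2] += a[2][4] * transpose(b[2][4]): the per-128-bit-segment
    // operation of BFMMLA (both sources read as row-major 2x4 tiles).
    static void bfmmla_ref(float acc[2][2], const uint16_t a[2][4], const uint16_t b[2][4])
    {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                for (int k = 0; k < 4; k++) {
                    acc[i][j] += bf16_to_f32(a[i][k]) * bf16_to_f32(b[j][k]);
                }
            }
        }
    }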
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
index bc93ced25b..ffc1606b3f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,21 +10,22 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -42,7 +43,8 @@ void sve_hybrid_s8qa_dot_4x4VL( ARGLIST );
class cls_sve_hybrid_s8qa_dot_4x4VL
{
public:
- typedef int8_t operand_type;
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
typedef int8_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,7 +70,21 @@ public:
return false;
}
- StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 4, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 29.89 };
+ case CPUModel::A510:
+ return { 17.12 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_s8qa_dot_4x4VL;
@@ -80,4 +96,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
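
Beyond the guard rename from the compiler-defined `__ARM_FEATURE_SVE` to the build-time `ARM_COMPUTE_ENABLE_SVE`, the header change above attaches per-CPU throughput estimates to the kernel (29.89 generically, 17.12 on Cortex-A510). Assuming the surrounding framework compares such figures when several candidate kernels match a problem — an assumption about usage, not something this diff itself shows — a selection step might look like the following sketch (`CandidateKernel` and `pick_kernel` are hypothetical names, not arm_gemm API):

    #include <vector>

    struct CandidateKernel {
        const char *name;
        double      est_macs_per_cycle; // e.g. 29.89 generic, 17.12 on A510
    };

    // Pick the candidate with the highest estimated throughput.
    static const CandidateKernel *pick_kernel(const std::vector<CandidateKernel> &cands)
    {
        const CandidateKernel *best = nullptr;
        for (const auto &c : cands) {
            if (best == nullptr || c.est_macs_per_cycle > best->est_macs_per_cycle) {
                best = &c;
            }
        }
        return best;
    }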
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
index 50b9ba524d..b7c523466e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,18 +10,18 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -85,230 +85,227 @@ void sve_hybrid_s8qa_dot_4x4VL (
"cmp %x[M], #0x2\n"
"bgt 29f\n"
"beq 15f\n"
+ "mov x10, %x[col_bias]\n"
"mov z11.s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"mov z15.b, #0x1\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x27, %x[col_bias]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "mov x26, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"3:" // Height 1: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "cbnz x25, 6f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "cbnz x26, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
"b 6f\n"
"5:" // Height 1: setup direct input
- "mov x23, %x[input_ptr]\n"
+ "mov x24, %x[input_ptr]\n"
"6:" // Height 1: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"ble 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x23, #0x10\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z20.b, z0.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z17.s, z21.b, z0.b[0]\n"
+ "sdot z18.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z19.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "sdot z16.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "sdot z17.s, z21.b, z0.b[1]\n"
+ "sdot z18.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
+ "sdot z19.s, z20.b, z0.b[1]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "sdot z16.s, z22.b, z0.b[2]\n"
+ "sdot z17.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "sdot z18.s, z21.b, z0.b[2]\n"
+ "sdot z19.s, z20.b, z0.b[2]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "sdot z16.s, z22.b, z0.b[3]\n"
+ "sdot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "sdot z18.s, z21.b, z0.b[3]\n"
+ "sdot z19.s, z20.b, z0.b[3]\n"
+ "add x24, x24, #0x10\n"
"tbnz %x[flags], #31, 8f\n"
"sdot z11.s, z0.b, z15.b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
- "cmp x24, #0x10\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
"bgt 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x23, #0x10\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1b { z22.b }, p2/Z, [x28]\n"
+ "subs x25, x25, #0x4\n"
+ "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z16.s, z22.b, z0.b[0]\n"
+ "sdot z17.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z18.s, z21.b, z0.b[0]\n"
+ "sdot z19.s, z20.b, z0.b[0]\n"
"addvl x28, x28, #4\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
"ble 10f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "sdot z16.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z22.b, z0.b[1]\n"
+ "sdot z18.s, z21.b, z0.b[1]\n"
+ "sdot z19.s, z20.b, z0.b[1]\n"
"addvl x28, x28, #4\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
"ble 10f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "sdot z16.s, z20.b, z0.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z22.b, z0.b[2]\n"
+ "sdot z18.s, z21.b, z0.b[2]\n"
+ "sdot z19.s, z20.b, z0.b[2]\n"
"addvl x28, x28, #4\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
"ble 10f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z21.b, z0.b[3]\n"
+ "sdot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z18.s, z21.b, z0.b[3]\n"
+ "sdot z19.s, z20.b, z0.b[3]\n"
"addvl x28, x28, #4\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
"10:" // Height 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
"sdot z11.s, z0.b, z15.b\n"
"11:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 4b\n"
"tbnz %x[flags], #31, 12f\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1rw { z1.s }, p2/Z, [x19]\n"
- "neg z1.s, p2/M, z1.s\n"
- "mov x19, #0x4\n"
- "whilelt p0.s, XZR, x19\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
"saddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z1.s\n"
+ "neg z20.s, p2/M, z20.s\n"
+ "mul z11.s, p2/M, z11.s, z20.s\n"
"12:" // Height 1: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x27]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z23.s }, p2/Z, [x10]\n"
+ "ld1w { z22.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "add z16.s, z16.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z17.s, z17.s, z1.s\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z16.s, z16.s, z23.s\n"
+ "add z17.s, z17.s, z22.s\n"
+ "add z18.s, z18.s, z21.s\n"
+ "add z19.s, z19.s, z20.s\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n"
+ ".inst 0x04b47631 // sqrdmulh z17.s, z17.s, z20.s\n"
+ "addvl x10, x10, #4\n"
+ ".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n"
+ ".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n"
"tbz %x[flags], #5, 13f\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z19.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
+ "and z23.d, z16.d, z0.d\n"
+ "and z22.d, z17.d, z0.d\n"
+ "and z21.d, z18.d, z0.d\n"
+ "and z20.d, z19.d, z0.d\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z23.s\n"
+ "sqadd z17.s, z17.s, z22.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z19.s, z19.s, z20.s\n"
"13:" // Height 1: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z16.s, z16.s, z20.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- "add x19, %x[qp], %[minval]\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "add z17.s, z17.s, z20.s\n"
+ "add z18.s, z18.s, z20.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
- "add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
- "add z19.s, z19.s, z4.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z20.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z21.s\n"
+ "smin z17.s, p2/M, z17.s, z21.s\n"
+ "smin z18.s, p2/M, z18.s, z21.s\n"
+ "smin z19.s, p2/M, z19.s, z21.s\n"
+ "smax z16.s, p2/M, z16.s, z20.s\n"
+ "smax z17.s, p2/M, z17.s, z20.s\n"
+ "smax z18.s, p2/M, z18.s, z20.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z20.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x26]\n"
- "addvl x26, x26, #1\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
"14:" // Height 1: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
"bgt 2b\n"
"b 58f\n"
"15:" // Height 2
+ "mov x10, %x[col_bias]\n"
"mov z11.s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x27, %x[col_bias]\n"
"mov z12.s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov z15.b, #0x1\n"
- "mov x26, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"16:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
@@ -316,308 +313,302 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"17:" // Height 2: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "cbnz x25, 20f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "cbnz x26, 20f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
"b 20f\n"
"19:" // Height 2: setup direct input
- "mov x23, %x[input_ptr]\n"
- "add x22, x23, x19\n"
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
"20:" // Height 2: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"ble 23f\n"
"21:" // Height 2: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x22, #0x10\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "sdot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z24.b, z0.b[0]\n"
+ "sdot z20.s, z24.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z26.b, z0.b[0]\n"
+ "sdot z21.s, z26.b, z1.b[0]\n"
+ "sdot z18.s, z24.b, z0.b[0]\n"
+ "sdot z22.s, z24.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "sdot z19.s, z25.b, z0.b[0]\n"
+ "sdot z23.s, z25.b, z1.b[0]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "sdot z23.s, z7.b, z1.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "sdot z21.s, z9.b, z1.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "sdot z22.s, z10.b, z1.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
- "sdot z23.s, z4.b, z1.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "sdot z20.s, z5.b, z1.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
- "sdot z21.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "sdot z22.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
- "sdot z23.s, z8.b, z1.b[2]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "sdot z20.s, z9.b, z1.b[3]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "sdot z21.s, z10.b, z1.b[3]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z22.s, z4.b, z1.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
- "sdot z23.s, z5.b, z1.b[3]\n"
+ "sdot z16.s, z24.b, z0.b[1]\n"
+ "sdot z20.s, z24.b, z1.b[1]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "sdot z17.s, z27.b, z0.b[1]\n"
+ "sdot z21.s, z27.b, z1.b[1]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "sdot z18.s, z26.b, z0.b[1]\n"
+ "sdot z22.s, z26.b, z1.b[1]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "sdot z19.s, z25.b, z0.b[1]\n"
+ "sdot z23.s, z25.b, z1.b[1]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "sdot z16.s, z24.b, z0.b[2]\n"
+ "sdot z20.s, z24.b, z1.b[2]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ "sdot z17.s, z30.b, z0.b[2]\n"
+ "sdot z21.s, z30.b, z1.b[2]\n"
+ "sdot z18.s, z29.b, z0.b[2]\n"
+ "sdot z22.s, z29.b, z1.b[2]\n"
+ "sdot z19.s, z28.b, z0.b[2]\n"
+ "sdot z23.s, z28.b, z1.b[2]\n"
+ "sdot z16.s, z27.b, z0.b[3]\n"
+ "sdot z20.s, z27.b, z1.b[3]\n"
+ "sdot z17.s, z26.b, z0.b[3]\n"
+ "sdot z21.s, z26.b, z1.b[3]\n"
+ "sdot z18.s, z25.b, z0.b[3]\n"
+ "sdot z22.s, z25.b, z1.b[3]\n"
+ "sdot z19.s, z24.b, z0.b[3]\n"
+ "sdot z23.s, z24.b, z1.b[3]\n"
"tbnz %x[flags], #31, 22f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
"22:" // Height 2: Multiply loop: unique 3: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x24, #0x10\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
"bgt 21b\n"
"23:" // Height 2: Multiply loop: Single iteration only
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x22, #0x10\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x25, x25, #0x4\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z24.b, z0.b[0]\n"
+ "sdot z20.s, z24.b, z1.b[0]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z26.b, z0.b[0]\n"
+ "sdot z21.s, z26.b, z1.b[0]\n"
+ "sdot z18.s, z25.b, z0.b[0]\n"
+ "sdot z22.s, z25.b, z1.b[0]\n"
"addvl x28, x28, #4\n"
- "sdot z21.s, z5.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "sdot z22.s, z6.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "sdot z23.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z24.b, z0.b[0]\n"
+ "sdot z23.s, z24.b, z1.b[0]\n"
"ble 24f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "sdot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
+ "ld1b { z27.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "sdot z16.s, z27.b, z0.b[1]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z20.s, z27.b, z1.b[1]\n"
+ "sdot z17.s, z26.b, z0.b[1]\n"
+ "sdot z21.s, z26.b, z1.b[1]\n"
+ "sdot z18.s, z25.b, z0.b[1]\n"
"addvl x28, x28, #4\n"
- "sdot z21.s, z9.b, z1.b[1]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "sdot z22.s, z10.b, z1.b[1]\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
- "sdot z23.s, z4.b, z1.b[1]\n"
+ "sdot z22.s, z25.b, z1.b[1]\n"
+ "sdot z19.s, z24.b, z0.b[1]\n"
+ "sdot z23.s, z24.b, z1.b[1]\n"
"ble 24f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "sdot z20.s, z5.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
+ "ld1b { z27.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "sdot z16.s, z27.b, z0.b[2]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z20.s, z27.b, z1.b[2]\n"
+ "sdot z17.s, z26.b, z0.b[2]\n"
+ "sdot z21.s, z26.b, z1.b[2]\n"
+ "sdot z18.s, z25.b, z0.b[2]\n"
"addvl x28, x28, #4\n"
- "sdot z21.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "sdot z22.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
- "sdot z23.s, z8.b, z1.b[2]\n"
+ "sdot z22.s, z25.b, z1.b[2]\n"
+ "sdot z19.s, z24.b, z0.b[2]\n"
+ "sdot z23.s, z24.b, z1.b[2]\n"
"ble 24f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z20.s, z9.b, z1.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z24.b, z0.b[3]\n"
+ "sdot z20.s, z24.b, z1.b[3]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z26.b, z0.b[3]\n"
+ "sdot z21.s, z26.b, z1.b[3]\n"
+ "sdot z18.s, z25.b, z0.b[3]\n"
+ "sdot z22.s, z25.b, z1.b[3]\n"
"addvl x28, x28, #4\n"
- "sdot z21.s, z10.b, z1.b[3]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z22.s, z4.b, z1.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
- "sdot z23.s, z5.b, z1.b[3]\n"
+ "sdot z19.s, z24.b, z0.b[3]\n"
+ "sdot z23.s, z24.b, z1.b[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
"tbnz %x[flags], #31, 25f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
"25:" // Height 2: Multiply loop: unique 4: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 18b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x26, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
"tbnz %x[flags], #31, 26f\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1rw { z2.s }, p2/Z, [x19]\n"
- "neg z2.s, p2/M, z2.s\n"
- "mov x19, #0x4\n"
- "whilelt p0.s, XZR, x19\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
"saddv d11, p0, z11.s\n"
- "saddv d12, p0, z12.s\n"
"mov z11.s, z11.s[0]\n"
+ "saddv d12, p0, z12.s\n"
+ "neg z24.s, p2/M, z24.s\n"
"mov z12.s, z12.s[0]\n"
- "mul z11.s, p2/M, z11.s, z2.s\n"
- "mul z12.s, p2/M, z12.s, z2.s\n"
+ "mul z11.s, p2/M, z11.s, z24.s\n"
+ "mul z12.s, p2/M, z12.s, z24.s\n"
"26:" // Height 2: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x27]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z28.s }, p2/Z, [x10]\n"
+ "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
+ "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z20.s, z20.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
"add z21.s, z21.s, z12.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add z22.s, z22.s, z12.s\n"
"add z23.s, z23.s, z12.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z16.s, z16.s, z28.s\n"
+ "add z17.s, z17.s, z27.s\n"
+ "addvl x10, x10, #4\n"
+ "add z18.s, z18.s, z26.s\n"
+ "add z19.s, z19.s, z25.s\n"
+ "add z20.s, z20.s, z28.s\n"
+ "add z21.s, z21.s, z27.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z22.s, z22.s, z26.s\n"
+ "add z23.s, z23.s, z25.s\n"
+ ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
+ ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n"
+ ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n"
+ ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n"
+ ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n"
+ ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n"
+ ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n"
+ ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n"
"tbz %x[flags], #5, 27f\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z19.d, z0.d\n"
- "and z8.d, z20.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "and z9.d, z21.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z10.d, z22.d, z0.d\n"
- "asr z8.s, z8.s, #0x1f\n"
- "and z4.d, z23.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "sqadd z20.s, z20.s, z8.s\n"
- "sqadd z21.s, z21.s, z9.s\n"
- "sqadd z22.s, z22.s, z10.s\n"
- "sqadd z23.s, z23.s, z4.s\n"
+ "and z24.d, z16.d, z0.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z24.s\n"
+ "and z30.d, z17.d, z0.d\n"
+ "and z29.d, z18.d, z0.d\n"
+ "and z28.d, z19.d, z0.d\n"
+ "and z27.d, z20.d, z0.d\n"
+ "and z26.d, z21.d, z0.d\n"
+ "and z25.d, z22.d, z0.d\n"
+ "and z24.d, z23.d, z0.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z30.s\n"
+ "sqadd z18.s, z18.s, z29.s\n"
+ "sqadd z19.s, z19.s, z28.s\n"
+ "sqadd z20.s, z20.s, z27.s\n"
+ "sqadd z21.s, z21.s, z26.s\n"
+ "sqadd z22.s, z22.s, z25.s\n"
+ "sqadd z23.s, z23.s, z24.s\n"
"27:" // Height 2: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z16.s, z16.s, z24.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- "add x19, %x[qp], %[minval]\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "add z17.s, z17.s, z24.s\n"
+ "add z18.s, z18.s, z24.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
+ "add z19.s, z19.s, z24.s\n"
+ "add z20.s, z20.s, z24.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x26]\n"
- "add z21.s, z21.s, z4.s\n"
- "addvl x26, x26, #1\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ "add z21.s, z21.s, z24.s\n"
+ "add z22.s, z22.s, z24.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z23.s, z23.s, z4.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
+ "add z23.s, z23.s, z24.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z25.s\n"
+ "smin z17.s, p2/M, z17.s, z25.s\n"
+ "smin z18.s, p2/M, z18.s, z25.s\n"
+ "smin z19.s, p2/M, z19.s, z25.s\n"
+ "smin z20.s, p2/M, z20.s, z25.s\n"
+ "smin z21.s, p2/M, z21.s, z25.s\n"
+ "smin z22.s, p2/M, z22.s, z25.s\n"
+ "smin z23.s, p2/M, z23.s, z25.s\n"
+ "smax z16.s, p2/M, z16.s, z24.s\n"
+ "smax z17.s, p2/M, z17.s, z24.s\n"
+ "smax z18.s, p2/M, z18.s, z24.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z24.s\n"
+ "smax z20.s, p2/M, z20.s, z24.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z21.s, p2/M, z21.s, z24.s\n"
+ "smax z22.s, p2/M, z22.s, z24.s\n"
"uzp1 z20.h, z20.h, z21.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x22]\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "smax z23.s, p2/M, z23.s, z24.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "st1b { z20.b }, p1, [x23]\n"
+ "addvl x27, x27, #1\n"
"28:" // Height 2: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
"bgt 16b\n"
"b 58f\n"
"29:" // Height 3
+ "mov x10, %x[col_bias]\n"
"mov z11.s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x27, %x[col_bias]\n"
"mov z12.s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov z13.s, #0x0\n"
- "mov x26, %x[output_ptr]\n"
"mov z15.b, #0x1\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"30:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
@@ -629,403 +620,394 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
"31:" // Height 3: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "ldr x21, [x20, #0x10]\n"
- "cbnz x25, 34f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x26, 34f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
"b 34f\n"
"33:" // Height 3: setup direct input
- "mov x23, %x[input_ptr]\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"34:" // Height 3: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"ble 37f\n"
"35:" // Height 3: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "ld1b { z28.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z28.b, z0.b[0]\n"
+ "sdot z20.s, z28.b, z1.b[0]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z24.s, z28.b, z2.b[0]\n"
+ "sdot z17.s, z30.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z21.s, z30.b, z1.b[0]\n"
+ "sdot z25.s, z30.b, z2.b[0]\n"
+ "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "sdot z18.s, z29.b, z0.b[0]\n"
+ "sdot z22.s, z29.b, z1.b[0]\n"
+ "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "sdot z26.s, z29.b, z2.b[0]\n"
+ "sdot z19.s, z28.b, z0.b[0]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ "sdot z23.s, z28.b, z1.b[0]\n"
+ "sdot z27.s, z28.b, z2.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "sdot z16.s, z3.b, z0.b[1]\n"
+ "sdot z20.s, z3.b, z1.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n"
"add x23, x23, #0x10\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x21]\n"
+ "sdot z24.s, z3.b, z2.b[1]\n"
+ "sdot z17.s, z31.b, z0.b[1]\n"
+ "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n"
"add x22, x22, #0x10\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x21, x21, #0x10\n"
- "sdot z24.s, z4.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "sdot z25.s, z5.b, z2.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "sdot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- "sdot z26.s, z6.b, z2.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "sdot z23.s, z7.b, z1.b[0]\n"
- "sdot z27.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "sdot z20.s, z8.b, z1.b[1]\n"
- "sdot z24.s, z8.b, z2.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "sdot z21.s, z9.b, z1.b[1]\n"
- "sdot z25.s, z9.b, z2.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "sdot z22.s, z10.b, z1.b[1]\n"
- "sdot z26.s, z10.b, z2.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
- "sdot z23.s, z4.b, z1.b[1]\n"
- "sdot z27.s, z4.b, z2.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "sdot z20.s, z5.b, z1.b[2]\n"
- "sdot z24.s, z5.b, z2.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
- "sdot z21.s, z6.b, z1.b[2]\n"
- "sdot z25.s, z6.b, z2.b[2]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "sdot z22.s, z7.b, z1.b[2]\n"
- "sdot z26.s, z7.b, z2.b[2]\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
- "sdot z23.s, z8.b, z1.b[2]\n"
- "sdot z27.s, z8.b, z2.b[2]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "sdot z20.s, z9.b, z1.b[3]\n"
- "sdot z24.s, z9.b, z2.b[3]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "sdot z21.s, z10.b, z1.b[3]\n"
- "sdot z25.s, z10.b, z2.b[3]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z22.s, z4.b, z1.b[3]\n"
- "sdot z26.s, z4.b, z2.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
- "sdot z23.s, z5.b, z1.b[3]\n"
- "sdot z27.s, z5.b, z2.b[3]\n"
+ "sdot z21.s, z31.b, z1.b[1]\n"
+ "sdot z25.s, z31.b, z2.b[1]\n"
+ "ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "sdot z18.s, z30.b, z0.b[1]\n"
+ "sdot z22.s, z30.b, z1.b[1]\n"
+ "sdot z26.s, z30.b, z2.b[1]\n"
+ "sdot z19.s, z29.b, z0.b[1]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "sdot z23.s, z29.b, z1.b[1]\n"
+ "sdot z27.s, z29.b, z2.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "sdot z16.s, z28.b, z0.b[2]\n"
+ "sdot z20.s, z28.b, z1.b[2]\n"
+ "sdot z24.s, z28.b, z2.b[2]\n"
+ "sdot z17.s, z5.b, z0.b[2]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "sdot z21.s, z5.b, z1.b[2]\n"
+ "sdot z25.s, z5.b, z2.b[2]\n"
+ "sdot z18.s, z4.b, z0.b[2]\n"
+ "sdot z22.s, z4.b, z1.b[2]\n"
+ "sdot z26.s, z4.b, z2.b[2]\n"
+ "sdot z19.s, z3.b, z0.b[2]\n"
+ "sdot z23.s, z3.b, z1.b[2]\n"
+ "sdot z27.s, z3.b, z2.b[2]\n"
+ "sdot z16.s, z31.b, z0.b[3]\n"
+ "sdot z20.s, z31.b, z1.b[3]\n"
+ "sdot z24.s, z31.b, z2.b[3]\n"
+ "sdot z17.s, z30.b, z0.b[3]\n"
+ "sdot z21.s, z30.b, z1.b[3]\n"
+ "sdot z25.s, z30.b, z2.b[3]\n"
+ "sdot z18.s, z29.b, z0.b[3]\n"
+ "sdot z22.s, z29.b, z1.b[3]\n"
+ "sdot z26.s, z29.b, z2.b[3]\n"
+ "sdot z19.s, z28.b, z0.b[3]\n"
+ "sdot z23.s, z28.b, z1.b[3]\n"
+ "sdot z27.s, z28.b, z2.b[3]\n"
"tbnz %x[flags], #31, 36f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
"sdot z13.s, z2.b, z15.b\n"
"36:" // Height 3: Multiply loop: unique 5: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x24, #0x10\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
"bgt 35b\n"
"37:" // Height 3: Multiply loop: Single iteration only
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x21, x21, #0x10\n"
- "sdot z24.s, z4.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x25, x25, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "ld1b { z28.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z28.b, z0.b[0]\n"
+ "sdot z20.s, z28.b, z1.b[0]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z24.s, z28.b, z2.b[0]\n"
+ "sdot z17.s, z30.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z21.s, z30.b, z1.b[0]\n"
+ "sdot z25.s, z30.b, z2.b[0]\n"
"addvl x28, x28, #4\n"
- "sdot z21.s, z5.b, z1.b[0]\n"
- "sdot z25.s, z5.b, z2.b[0]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "sdot z22.s, z6.b, z1.b[0]\n"
- "sdot z26.s, z6.b, z2.b[0]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "sdot z23.s, z7.b, z1.b[0]\n"
- "sdot z27.s, z7.b, z2.b[0]\n"
+ "sdot z18.s, z29.b, z0.b[0]\n"
+ "sdot z22.s, z29.b, z1.b[0]\n"
+ "sdot z26.s, z29.b, z2.b[0]\n"
+ "sdot z19.s, z28.b, z0.b[0]\n"
+ "sdot z23.s, z28.b, z1.b[0]\n"
+ "sdot z27.s, z28.b, z2.b[0]\n"
"ble 38f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "sdot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z24.s, z8.b, z2.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z31.b }, p2/Z, [x28]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "sdot z16.s, z31.b, z0.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z20.s, z31.b, z1.b[1]\n"
+ "sdot z24.s, z31.b, z2.b[1]\n"
+ "sdot z17.s, z30.b, z0.b[1]\n"
+ "sdot z21.s, z30.b, z1.b[1]\n"
"addvl x28, x28, #4\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "sdot z21.s, z9.b, z1.b[1]\n"
- "sdot z25.s, z9.b, z2.b[1]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "sdot z22.s, z10.b, z1.b[1]\n"
- "sdot z26.s, z10.b, z2.b[1]\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
- "sdot z23.s, z4.b, z1.b[1]\n"
- "sdot z27.s, z4.b, z2.b[1]\n"
+ "sdot z25.s, z30.b, z2.b[1]\n"
+ "sdot z18.s, z29.b, z0.b[1]\n"
+ "sdot z22.s, z29.b, z1.b[1]\n"
+ "sdot z26.s, z29.b, z2.b[1]\n"
+ "sdot z19.s, z28.b, z0.b[1]\n"
+ "sdot z23.s, z28.b, z1.b[1]\n"
+ "sdot z27.s, z28.b, z2.b[1]\n"
"ble 38f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "sdot z20.s, z5.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z24.s, z5.b, z2.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z31.b }, p2/Z, [x28]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "sdot z16.s, z31.b, z0.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z20.s, z31.b, z1.b[2]\n"
+ "sdot z24.s, z31.b, z2.b[2]\n"
+ "sdot z17.s, z30.b, z0.b[2]\n"
+ "sdot z21.s, z30.b, z1.b[2]\n"
"addvl x28, x28, #4\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
- "sdot z21.s, z6.b, z1.b[2]\n"
- "sdot z25.s, z6.b, z2.b[2]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "sdot z22.s, z7.b, z1.b[2]\n"
- "sdot z26.s, z7.b, z2.b[2]\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
- "sdot z23.s, z8.b, z1.b[2]\n"
- "sdot z27.s, z8.b, z2.b[2]\n"
+ "sdot z25.s, z30.b, z2.b[2]\n"
+ "sdot z18.s, z29.b, z0.b[2]\n"
+ "sdot z22.s, z29.b, z1.b[2]\n"
+ "sdot z26.s, z29.b, z2.b[2]\n"
+ "sdot z19.s, z28.b, z0.b[2]\n"
+ "sdot z23.s, z28.b, z1.b[2]\n"
+ "sdot z27.s, z28.b, z2.b[2]\n"
"ble 38f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z20.s, z9.b, z1.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z24.s, z9.b, z2.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z31.b }, p2/Z, [x28]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z31.b, z0.b[3]\n"
+ "sdot z20.s, z31.b, z1.b[3]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z24.s, z31.b, z2.b[3]\n"
+ "sdot z17.s, z30.b, z0.b[3]\n"
+ "sdot z21.s, z30.b, z1.b[3]\n"
+ "sdot z25.s, z30.b, z2.b[3]\n"
"addvl x28, x28, #4\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "sdot z21.s, z10.b, z1.b[3]\n"
- "sdot z25.s, z10.b, z2.b[3]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z22.s, z4.b, z1.b[3]\n"
- "sdot z26.s, z4.b, z2.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
- "sdot z23.s, z5.b, z1.b[3]\n"
- "sdot z27.s, z5.b, z2.b[3]\n"
+ "sdot z18.s, z29.b, z0.b[3]\n"
+ "sdot z22.s, z29.b, z1.b[3]\n"
+ "sdot z26.s, z29.b, z2.b[3]\n"
+ "sdot z19.s, z28.b, z0.b[3]\n"
+ "sdot z23.s, z28.b, z1.b[3]\n"
+ "sdot z27.s, z28.b, z2.b[3]\n"
"38:" // Height 3: Multiply loop: multiply skip
"tbnz %x[flags], #31, 39f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
"sdot z13.s, z2.b, z15.b\n"
"39:" // Height 3: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 32b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x26, x19\n"
- "add x21, x22, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"tbnz %x[flags], #31, 40f\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1rw { z3.s }, p2/Z, [x19]\n"
- "neg z3.s, p2/M, z3.s\n"
- "mov x19, #0x4\n"
- "whilelt p0.s, XZR, x19\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
"saddv d11, p0, z11.s\n"
+ "mov z11.s, z11.s[0]\n"
"saddv d12, p0, z12.s\n"
"saddv d13, p0, z13.s\n"
- "mov z11.s, z11.s[0]\n"
"mov z12.s, z12.s[0]\n"
"mov z13.s, z13.s[0]\n"
- "mul z11.s, p2/M, z11.s, z3.s\n"
- "mul z12.s, p2/M, z12.s, z3.s\n"
- "mul z13.s, p2/M, z13.s, z3.s\n"
+ "neg z28.s, p2/M, z28.s\n"
+ "mul z11.s, p2/M, z11.s, z28.s\n"
+ "mul z12.s, p2/M, z12.s, z28.s\n"
+ "mul z13.s, p2/M, z13.s, z28.s\n"
"40:" // Height 3: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x27]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "ld1w { z31.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
+ "ld1w { z30.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z20.s, z20.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
"add z21.s, z21.s, z12.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add z22.s, z22.s, z12.s\n"
"add z23.s, z23.s, z12.s\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add z24.s, z24.s, z13.s\n"
"add z25.s, z25.s, z13.s\n"
+ "addvl x10, x10, #4\n"
"add z26.s, z26.s, z13.s\n"
"add z27.s, z27.s, z13.s\n"
"add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
+ "add z17.s, z17.s, z31.s\n"
+ "add z18.s, z18.s, z30.s\n"
+ "add z19.s, z19.s, z29.s\n"
"add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z22.s, z22.s, z30.s\n"
+ "add z23.s, z23.s, z29.s\n"
"add z24.s, z24.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z25.s, z25.s, z1.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
- ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
- ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
- ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "add z25.s, z25.s, z31.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z26.s, z26.s, z30.s\n"
+ "add z27.s, z27.s, z29.s\n"
+ ".inst 0x04bc7610 // sqrdmulh z16.s, z16.s, z28.s\n"
+ ".inst 0x04bc7631 // sqrdmulh z17.s, z17.s, z28.s\n"
+ ".inst 0x04bc7652 // sqrdmulh z18.s, z18.s, z28.s\n"
+ ".inst 0x04bc7673 // sqrdmulh z19.s, z19.s, z28.s\n"
+ ".inst 0x04bc7694 // sqrdmulh z20.s, z20.s, z28.s\n"
+ ".inst 0x04bc76b5 // sqrdmulh z21.s, z21.s, z28.s\n"
+ ".inst 0x04bc76d6 // sqrdmulh z22.s, z22.s, z28.s\n"
+ ".inst 0x04bc76f7 // sqrdmulh z23.s, z23.s, z28.s\n"
+ ".inst 0x04bc7718 // sqrdmulh z24.s, z24.s, z28.s\n"
+ ".inst 0x04bc7739 // sqrdmulh z25.s, z25.s, z28.s\n"
+ ".inst 0x04bc775a // sqrdmulh z26.s, z26.s, z28.s\n"
+ ".inst 0x04bc777b // sqrdmulh z27.s, z27.s, z28.s\n"
"tbz %x[flags], #5, 41f\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z19.d, z0.d\n"
- "and z8.d, z20.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "and z9.d, z21.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z10.d, z22.d, z0.d\n"
- "asr z8.s, z8.s, #0x1f\n"
- "and z4.d, z23.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z24.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "sqadd z20.s, z20.s, z8.s\n"
- "sqadd z21.s, z21.s, z9.s\n"
- "sqadd z22.s, z22.s, z10.s\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "and z6.d, z25.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z24.s, z24.s, z5.s\n"
- "and z7.d, z26.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "and z8.d, z27.d, z0.d\n"
- "sqadd z25.s, z25.s, z6.s\n"
- "asr z8.s, z8.s, #0x1f\n"
- "sqadd z26.s, z26.s, z7.s\n"
- "sqadd z27.s, z27.s, z8.s\n"
+ "and z1.d, z16.d, z0.d\n"
+ "and z31.d, z17.d, z0.d\n"
+ "and z30.d, z18.d, z0.d\n"
+ "and z29.d, z19.d, z0.d\n"
+ "and z28.d, z20.d, z0.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z1.s\n"
+ "sqadd z17.s, z17.s, z31.s\n"
+ "sqadd z18.s, z18.s, z30.s\n"
+ "sqadd z19.s, z19.s, z29.s\n"
+ "sqadd z20.s, z20.s, z28.s\n"
+ "and z3.d, z21.d, z0.d\n"
+ "and z2.d, z22.d, z0.d\n"
+ "and z1.d, z23.d, z0.d\n"
+ "and z31.d, z24.d, z0.d\n"
+ "and z30.d, z25.d, z0.d\n"
+ "and z29.d, z26.d, z0.d\n"
+ "and z28.d, z27.d, z0.d\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z3.s\n"
+ "sqadd z22.s, z22.s, z2.s\n"
+ "sqadd z23.s, z23.s, z1.s\n"
+ "sqadd z24.s, z24.s, z31.s\n"
+ "sqadd z25.s, z25.s, z30.s\n"
+ "sqadd z26.s, z26.s, z29.s\n"
+ "sqadd z27.s, z27.s, z28.s\n"
"41:" // Height 3: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z16.s, z16.s, z28.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- "add x19, %x[qp], %[minval]\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "add z17.s, z17.s, z28.s\n"
+ "add z18.s, z18.s, z28.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
+ "add z19.s, z19.s, z28.s\n"
+ "add z20.s, z20.s, z28.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x26]\n"
- "add z21.s, z21.s, z4.s\n"
- "addvl x26, x26, #1\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ "add z21.s, z21.s, z28.s\n"
+ "add z22.s, z22.s, z28.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
+ "add z23.s, z23.s, z28.s\n"
+ "add z24.s, z24.s, z28.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z23.s, z23.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
- "add z25.s, z25.s, z4.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
+ "add z25.s, z25.s, z28.s\n"
+ "add z26.s, z26.s, z28.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x22]\n"
- "add z26.s, z26.s, z4.s\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "add z27.s, z27.s, z4.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z28.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z29.s\n"
+ "smin z17.s, p2/M, z17.s, z29.s\n"
+ "smin z18.s, p2/M, z18.s, z29.s\n"
+ "smin z19.s, p2/M, z19.s, z29.s\n"
+ "smin z20.s, p2/M, z20.s, z29.s\n"
+ "smin z21.s, p2/M, z21.s, z29.s\n"
+ "smin z22.s, p2/M, z22.s, z29.s\n"
+ "smin z23.s, p2/M, z23.s, z29.s\n"
+ "smin z24.s, p2/M, z24.s, z29.s\n"
+ "smin z25.s, p2/M, z25.s, z29.s\n"
+ "smin z26.s, p2/M, z26.s, z29.s\n"
+ "smin z27.s, p2/M, z27.s, z29.s\n"
+ "smax z16.s, p2/M, z16.s, z28.s\n"
+ "smax z17.s, p2/M, z17.s, z28.s\n"
+ "smax z18.s, p2/M, z18.s, z28.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z28.s\n"
+ "smax z20.s, p2/M, z20.s, z28.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z21.s, p2/M, z21.s, z28.s\n"
+ "smax z22.s, p2/M, z22.s, z28.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "smax z23.s, p2/M, z23.s, z28.s\n"
+ "smax z24.s, p2/M, z24.s, z28.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z28.s\n"
+ "smax z26.s, p2/M, z26.s, z28.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x21]\n"
+ "st1b { z20.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z28.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x22]\n"
+ "addvl x27, x27, #1\n"
"42:" // Height 3: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
"bgt 30b\n"
"b 58f\n"
"43:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x4\n"
+ "mov x10, %x[col_bias]\n"
"mov z11.s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x27, %x[col_bias]\n"
"mov z12.s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
"mov z13.s, #0x0\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x26, %x[output_ptr]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"mov z14.s, #0x0\n"
- "mov x19, #0x4\n"
"mov z15.b, #0x1\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"44:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
@@ -1041,237 +1023,229 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov z30.s, #0x0\n"
"mov z31.s, #0x0\n"
"45:" // Height 4: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "ldr x21, [x20, #0x10]\n"
- "ldr x20, [x20, #0x18]\n"
- "cbnz x25, 48f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
+ "cbnz x26, 48f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
"b 48f\n"
"47:" // Height 4: setup direct input
- "mov x23, %x[input_ptr]\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"48:" // Height 4: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"ble 51f\n"
"49:" // Height 4: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "ld1rqb { z3.b }, p0/Z, [x21]\n"
"add x23, x23, #0x10\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
- "sdot z24.s, z4.b, z2.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "sdot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z25.s, z5.b, z2.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z5.b, z0.b[0]\n"
+ "sdot z20.s, z5.b, z1.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z24.s, z5.b, z2.b[0]\n"
+ "sdot z28.s, z5.b, z3.b[0]\n"
+ "sdot z17.s, z4.b, z0.b[0]\n"
+ "sdot z21.s, z4.b, z1.b[0]\n"
"ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "sdot z28.s, z4.b, z3.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "sdot z29.s, z5.b, z3.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "sdot z25.s, z4.b, z2.b[0]\n"
+ "sdot z29.s, z4.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
"addvl x28, x28, #16\n"
- "sdot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "sdot z26.s, z6.b, z2.b[0]\n"
- "sdot z30.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "sdot z23.s, z7.b, z1.b[0]\n"
- "sdot z27.s, z7.b, z2.b[0]\n"
- "sdot z31.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ "sdot z19.s, z9.b, z0.b[0]\n"
+ "sdot z23.s, z9.b, z1.b[0]\n"
+ "sdot z27.s, z9.b, z2.b[0]\n"
+ "sdot z31.s, z9.b, z3.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
"sdot z16.s, z8.b, z0.b[1]\n"
"sdot z20.s, z8.b, z1.b[1]\n"
"sdot z24.s, z8.b, z2.b[1]\n"
"sdot z28.s, z8.b, z3.b[1]\n"
"ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "sdot z21.s, z9.b, z1.b[1]\n"
- "sdot z25.s, z9.b, z2.b[1]\n"
- "sdot z29.s, z9.b, z3.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "sdot z22.s, z10.b, z1.b[1]\n"
- "sdot z26.s, z10.b, z2.b[1]\n"
- "sdot z30.s, z10.b, z3.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "sdot z19.s, z4.b, z0.b[1]\n"
- "sdot z23.s, z4.b, z1.b[1]\n"
- "sdot z27.s, z4.b, z2.b[1]\n"
- "sdot z31.s, z4.b, z3.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
- "sdot z20.s, z5.b, z1.b[2]\n"
- "sdot z24.s, z5.b, z2.b[2]\n"
- "sdot z28.s, z5.b, z3.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z17.s, z6.b, z0.b[2]\n"
- "sdot z21.s, z6.b, z1.b[2]\n"
- "sdot z25.s, z6.b, z2.b[2]\n"
- "sdot z29.s, z6.b, z3.b[2]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "sdot z22.s, z7.b, z1.b[2]\n"
- "sdot z26.s, z7.b, z2.b[2]\n"
- "sdot z30.s, z7.b, z3.b[2]\n"
+ "sdot z17.s, z7.b, z0.b[1]\n"
+ "sdot z21.s, z7.b, z1.b[1]\n"
+ "sdot z25.s, z7.b, z2.b[1]\n"
+ "sdot z29.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "sdot z18.s, z6.b, z0.b[1]\n"
+ "sdot z22.s, z6.b, z1.b[1]\n"
+ "sdot z26.s, z6.b, z2.b[1]\n"
+ "sdot z30.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "sdot z19.s, z5.b, z0.b[1]\n"
+ "sdot z23.s, z5.b, z1.b[1]\n"
+ "sdot z27.s, z5.b, z2.b[1]\n"
+ "sdot z31.s, z5.b, z3.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "sdot z16.s, z4.b, z0.b[2]\n"
+ "sdot z20.s, z4.b, z1.b[2]\n"
+ "sdot z24.s, z4.b, z2.b[2]\n"
+ "sdot z28.s, z4.b, z3.b[2]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "sdot z17.s, z10.b, z0.b[2]\n"
+ "sdot z21.s, z10.b, z1.b[2]\n"
+ "sdot z25.s, z10.b, z2.b[2]\n"
+ "sdot z29.s, z10.b, z3.b[2]\n"
+ "sdot z18.s, z9.b, z0.b[2]\n"
+ "sdot z22.s, z9.b, z1.b[2]\n"
+ "sdot z26.s, z9.b, z2.b[2]\n"
+ "sdot z30.s, z9.b, z3.b[2]\n"
"sdot z19.s, z8.b, z0.b[2]\n"
"sdot z23.s, z8.b, z1.b[2]\n"
"sdot z27.s, z8.b, z2.b[2]\n"
"sdot z31.s, z8.b, z3.b[2]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "sdot z20.s, z9.b, z1.b[3]\n"
- "sdot z24.s, z9.b, z2.b[3]\n"
- "sdot z28.s, z9.b, z3.b[3]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "sdot z21.s, z10.b, z1.b[3]\n"
- "sdot z25.s, z10.b, z2.b[3]\n"
- "sdot z29.s, z10.b, z3.b[3]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z22.s, z4.b, z1.b[3]\n"
- "sdot z26.s, z4.b, z2.b[3]\n"
- "sdot z30.s, z4.b, z3.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
- "sdot z23.s, z5.b, z1.b[3]\n"
- "sdot z27.s, z5.b, z2.b[3]\n"
- "sdot z31.s, z5.b, z3.b[3]\n"
+ "sdot z16.s, z7.b, z0.b[3]\n"
+ "sdot z20.s, z7.b, z1.b[3]\n"
+ "sdot z24.s, z7.b, z2.b[3]\n"
+ "sdot z28.s, z7.b, z3.b[3]\n"
+ "sdot z17.s, z6.b, z0.b[3]\n"
+ "sdot z21.s, z6.b, z1.b[3]\n"
+ "sdot z25.s, z6.b, z2.b[3]\n"
+ "sdot z29.s, z6.b, z3.b[3]\n"
+ "sdot z18.s, z5.b, z0.b[3]\n"
+ "sdot z22.s, z5.b, z1.b[3]\n"
+ "sdot z26.s, z5.b, z2.b[3]\n"
+ "sdot z30.s, z5.b, z3.b[3]\n"
+ "sdot z19.s, z4.b, z0.b[3]\n"
+ "sdot z23.s, z4.b, z1.b[3]\n"
+ "sdot z27.s, z4.b, z2.b[3]\n"
+ "sdot z31.s, z4.b, z3.b[3]\n"
"tbnz %x[flags], #31, 50f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
"sdot z13.s, z2.b, z15.b\n"
"sdot z14.s, z3.b, z15.b\n"
"50:" // Height 4: Multiply loop: unique 7: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x24, #0x10\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
"bgt 49b\n"
"51:" // Height 4: Multiply loop: Single iteration only
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "sdot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z17.s, z5.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
- "sdot z24.s, z4.b, z2.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "sdot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x25, x25, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z7.b, z0.b[0]\n"
+ "sdot z20.s, z7.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z24.s, z7.b, z2.b[0]\n"
+ "sdot z28.s, z7.b, z3.b[0]\n"
+ "sdot z17.s, z6.b, z0.b[0]\n"
+ "sdot z21.s, z6.b, z1.b[0]\n"
"addvl x28, x28, #4\n"
- "sdot z28.s, z4.b, z3.b[0]\n"
- "sdot z25.s, z5.b, z2.b[0]\n"
- "sdot z29.s, z5.b, z3.b[0]\n"
- "sdot z18.s, z6.b, z0.b[0]\n"
- "sdot z22.s, z6.b, z1.b[0]\n"
- "sdot z26.s, z6.b, z2.b[0]\n"
- "sdot z30.s, z6.b, z3.b[0]\n"
- "sdot z19.s, z7.b, z0.b[0]\n"
- "sdot z23.s, z7.b, z1.b[0]\n"
- "sdot z27.s, z7.b, z2.b[0]\n"
- "sdot z31.s, z7.b, z3.b[0]\n"
+ "sdot z25.s, z6.b, z2.b[0]\n"
+ "sdot z29.s, z6.b, z3.b[0]\n"
+ "sdot z18.s, z5.b, z0.b[0]\n"
+ "sdot z22.s, z5.b, z1.b[0]\n"
+ "sdot z26.s, z5.b, z2.b[0]\n"
+ "sdot z30.s, z5.b, z3.b[0]\n"
+ "sdot z19.s, z4.b, z0.b[0]\n"
+ "sdot z23.s, z4.b, z1.b[0]\n"
+ "sdot z27.s, z4.b, z2.b[0]\n"
+ "sdot z31.s, z4.b, z3.b[0]\n"
"ble 52f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "sdot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "sdot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z24.s, z8.b, z2.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "sdot z16.s, z7.b, z0.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z20.s, z7.b, z1.b[1]\n"
+ "sdot z24.s, z7.b, z2.b[1]\n"
+ "sdot z28.s, z7.b, z3.b[1]\n"
+ "sdot z17.s, z6.b, z0.b[1]\n"
"addvl x28, x28, #4\n"
- "sdot z28.s, z8.b, z3.b[1]\n"
- "sdot z17.s, z9.b, z0.b[1]\n"
- "sdot z21.s, z9.b, z1.b[1]\n"
- "sdot z25.s, z9.b, z2.b[1]\n"
- "sdot z29.s, z9.b, z3.b[1]\n"
- "sdot z18.s, z10.b, z0.b[1]\n"
- "sdot z22.s, z10.b, z1.b[1]\n"
- "sdot z26.s, z10.b, z2.b[1]\n"
- "sdot z30.s, z10.b, z3.b[1]\n"
+ "sdot z21.s, z6.b, z1.b[1]\n"
+ "sdot z25.s, z6.b, z2.b[1]\n"
+ "sdot z29.s, z6.b, z3.b[1]\n"
+ "sdot z18.s, z5.b, z0.b[1]\n"
+ "sdot z22.s, z5.b, z1.b[1]\n"
+ "sdot z26.s, z5.b, z2.b[1]\n"
+ "sdot z30.s, z5.b, z3.b[1]\n"
"sdot z19.s, z4.b, z0.b[1]\n"
"sdot z23.s, z4.b, z1.b[1]\n"
"sdot z27.s, z4.b, z2.b[1]\n"
"sdot z31.s, z4.b, z3.b[1]\n"
"ble 52f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "sdot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
"ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "sdot z20.s, z5.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z24.s, z5.b, z2.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z28.s, z5.b, z3.b[2]\n"
+ "subs x25, x25, #0x4\n"
+ "sdot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z20.s, z7.b, z1.b[2]\n"
+ "sdot z24.s, z7.b, z2.b[2]\n"
+ "sdot z28.s, z7.b, z3.b[2]\n"
"sdot z17.s, z6.b, z0.b[2]\n"
+ "addvl x28, x28, #4\n"
"sdot z21.s, z6.b, z1.b[2]\n"
"sdot z25.s, z6.b, z2.b[2]\n"
"sdot z29.s, z6.b, z3.b[2]\n"
- "sdot z18.s, z7.b, z0.b[2]\n"
- "sdot z22.s, z7.b, z1.b[2]\n"
- "sdot z26.s, z7.b, z2.b[2]\n"
- "sdot z30.s, z7.b, z3.b[2]\n"
- "sdot z19.s, z8.b, z0.b[2]\n"
- "sdot z23.s, z8.b, z1.b[2]\n"
- "sdot z27.s, z8.b, z2.b[2]\n"
- "sdot z31.s, z8.b, z3.b[2]\n"
+ "sdot z18.s, z5.b, z0.b[2]\n"
+ "sdot z22.s, z5.b, z1.b[2]\n"
+ "sdot z26.s, z5.b, z2.b[2]\n"
+ "sdot z30.s, z5.b, z3.b[2]\n"
+ "sdot z19.s, z4.b, z0.b[2]\n"
+ "sdot z23.s, z4.b, z1.b[2]\n"
+ "sdot z27.s, z4.b, z2.b[2]\n"
+ "sdot z31.s, z4.b, z3.b[2]\n"
"ble 52f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "sdot z16.s, z9.b, z0.b[3]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z20.s, z9.b, z1.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z24.s, z9.b, z2.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z16.s, z7.b, z0.b[3]\n"
+ "sdot z20.s, z7.b, z1.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z24.s, z7.b, z2.b[3]\n"
+ "sdot z28.s, z7.b, z3.b[3]\n"
+ "sdot z17.s, z6.b, z0.b[3]\n"
+ "sdot z21.s, z6.b, z1.b[3]\n"
"addvl x28, x28, #4\n"
- "sdot z28.s, z9.b, z3.b[3]\n"
- "sdot z17.s, z10.b, z0.b[3]\n"
- "sdot z21.s, z10.b, z1.b[3]\n"
- "sdot z25.s, z10.b, z2.b[3]\n"
- "sdot z29.s, z10.b, z3.b[3]\n"
- "sdot z18.s, z4.b, z0.b[3]\n"
- "sdot z22.s, z4.b, z1.b[3]\n"
- "sdot z26.s, z4.b, z2.b[3]\n"
- "sdot z30.s, z4.b, z3.b[3]\n"
- "sdot z19.s, z5.b, z0.b[3]\n"
- "sdot z23.s, z5.b, z1.b[3]\n"
- "sdot z27.s, z5.b, z2.b[3]\n"
- "sdot z31.s, z5.b, z3.b[3]\n"
+ "sdot z25.s, z6.b, z2.b[3]\n"
+ "sdot z29.s, z6.b, z3.b[3]\n"
+ "sdot z18.s, z5.b, z0.b[3]\n"
+ "sdot z22.s, z5.b, z1.b[3]\n"
+ "sdot z26.s, z5.b, z2.b[3]\n"
+ "sdot z30.s, z5.b, z3.b[3]\n"
+ "sdot z19.s, z4.b, z0.b[3]\n"
+ "sdot z23.s, z4.b, z1.b[3]\n"
+ "sdot z27.s, z4.b, z2.b[3]\n"
+ "sdot z31.s, z4.b, z3.b[3]\n"
"52:" // Height 4: Multiply loop: multiply skip
"tbnz %x[flags], #31, 53f\n"
"sdot z11.s, z0.b, z15.b\n"
@@ -1279,254 +1253,249 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sdot z13.s, z2.b, z15.b\n"
"sdot z14.s, z3.b, z15.b\n"
"53:" // Height 4: Multiply loop: unique 8: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 46b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x26, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "add x21, x22, x20\n"
"tbnz %x[flags], #31, 54f\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
- "neg z4.s, p2/M, z4.s\n"
- "mov x19, #0x4\n"
- "whilelt p0.s, XZR, x19\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
"saddv d11, p0, z11.s\n"
+ "mov z11.s, z11.s[0]\n"
"saddv d12, p0, z12.s\n"
"saddv d13, p0, z13.s\n"
- "saddv d14, p0, z14.s\n"
- "mov z11.s, z11.s[0]\n"
"mov z12.s, z12.s[0]\n"
"mov z13.s, z13.s[0]\n"
+ "saddv d14, p0, z14.s\n"
+ "neg z0.s, p2/M, z0.s\n"
"mov z14.s, z14.s[0]\n"
- "mul z11.s, p2/M, z11.s, z4.s\n"
- "mul z12.s, p2/M, z12.s, z4.s\n"
- "mul z13.s, p2/M, z13.s, z4.s\n"
- "mul z14.s, p2/M, z14.s, z4.s\n"
+ "mul z11.s, p2/M, z11.s, z0.s\n"
+ "mul z12.s, p2/M, z12.s, z0.s\n"
+ "mul z13.s, p2/M, z13.s, z0.s\n"
+ "mul z14.s, p2/M, z14.s, z0.s\n"
"54:" // Height 4: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x27]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z4.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
+ "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z20.s, z20.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
"add z21.s, z21.s, z12.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add z22.s, z22.s, z12.s\n"
"add z23.s, z23.s, z12.s\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add z24.s, z24.s, z13.s\n"
"add z25.s, z25.s, z13.s\n"
+ "addvl x10, x10, #4\n"
"add z26.s, z26.s, z13.s\n"
"add z27.s, z27.s, z13.s\n"
"add z28.s, z28.s, z14.s\n"
"add z29.s, z29.s, z14.s\n"
"add z30.s, z30.s, z14.s\n"
"add z31.s, z31.s, z14.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- "add z28.s, z28.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z29.s, z29.s, z1.s\n"
- "add z30.s, z30.s, z2.s\n"
- "add z31.s, z31.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
- ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
- ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
- ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
- ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
- ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
- ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
- ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z0.s\n"
+ "add z18.s, z18.s, z3.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "add z21.s, z21.s, z0.s\n"
+ "add z22.s, z22.s, z3.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z0.s\n"
+ "add z26.s, z26.s, z3.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add z28.s, z28.s, z4.s\n"
+ "add z29.s, z29.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z30.s, z30.s, z3.s\n"
+ "add z31.s, z31.s, z2.s\n"
+ ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n"
+ ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n"
+ ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n"
+ ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n"
+ ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n"
+ ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n"
+ ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n"
+ ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n"
+ ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n"
+ ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n"
"tbz %x[flags], #5, 55f\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z19.d, z0.d\n"
- "and z8.d, z20.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "and z9.d, z21.d, z0.d\n"
+ "and z2.d, z16.d, z0.d\n"
+ "and z1.d, z17.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z2.s\n"
+ "sqadd z17.s, z17.s, z1.s\n"
+ "and z7.d, z18.d, z0.d\n"
+ "and z6.d, z19.d, z0.d\n"
+ "and z5.d, z20.d, z0.d\n"
+ "and z4.d, z21.d, z0.d\n"
+ "and z3.d, z22.d, z0.d\n"
+ "and z2.d, z23.d, z0.d\n"
+ "and z1.d, z24.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z10.d, z22.d, z0.d\n"
- "asr z8.s, z8.s, #0x1f\n"
- "and z4.d, z23.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z24.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "sqadd z20.s, z20.s, z8.s\n"
- "sqadd z21.s, z21.s, z9.s\n"
- "sqadd z22.s, z22.s, z10.s\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "and z6.d, z25.d, z0.d\n"
"asr z6.s, z6.s, #0x1f\n"
- "sqadd z24.s, z24.s, z5.s\n"
- "and z7.d, z26.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "and z8.d, z27.d, z0.d\n"
- "and z9.d, z28.d, z0.d\n"
- "asr z8.s, z8.s, #0x1f\n"
- "sqadd z25.s, z25.s, z6.s\n"
- "and z10.d, z29.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
- "and z4.d, z30.d, z0.d\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z26.s, z26.s, z7.s\n"
- "and z5.d, z31.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
- "sqadd z27.s, z27.s, z8.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z7.s\n"
+ "sqadd z19.s, z19.s, z6.s\n"
+ "sqadd z20.s, z20.s, z5.s\n"
+ "sqadd z21.s, z21.s, z4.s\n"
+ "sqadd z22.s, z22.s, z3.s\n"
+ "sqadd z23.s, z23.s, z2.s\n"
+ "sqadd z24.s, z24.s, z1.s\n"
+ "and z7.d, z25.d, z0.d\n"
+ "and z6.d, z26.d, z0.d\n"
+ "and z5.d, z27.d, z0.d\n"
+ "and z4.d, z28.d, z0.d\n"
+ "and z3.d, z29.d, z0.d\n"
+ "and z2.d, z30.d, z0.d\n"
+ "and z1.d, z31.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "sqadd z28.s, z28.s, z9.s\n"
- "sqadd z29.s, z29.s, z10.s\n"
- "sqadd z30.s, z30.s, z4.s\n"
- "sqadd z31.s, z31.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z26.s, z26.s, z6.s\n"
+ "sqadd z27.s, z27.s, z5.s\n"
+ "sqadd z28.s, z28.s, z4.s\n"
+ "sqadd z29.s, z29.s, z3.s\n"
+ "sqadd z30.s, z30.s, z2.s\n"
+ "sqadd z31.s, z31.s, z1.s\n"
"55:" // Height 4: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z16.s, z16.s, z2.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- "add x19, %x[qp], %[minval]\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z2.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z20.s, z20.s, z2.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x26]\n"
- "add z21.s, z21.s, z4.s\n"
- "addvl x26, x26, #1\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ "add z24.s, z24.s, z2.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z23.s, z23.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
- "add z25.s, z25.s, z4.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z2.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x22]\n"
- "add z26.s, z26.s, z4.s\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "add z27.s, z27.s, z4.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "uzp1 z24.h, z24.h, z25.h\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "add z28.s, z28.s, z4.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add z28.s, z28.s, z2.s\n"
".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "smin z28.s, p2/M, z28.s, z6.s\n"
".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
- "add z29.s, z29.s, z4.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "smax z28.s, p2/M, z28.s, z5.s\n"
- "add z30.s, z30.s, z4.s\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x21]\n"
- "smin z29.s, p2/M, z29.s, z6.s\n"
- "smin z30.s, p2/M, z30.s, z6.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z30.s, z30.s, z2.s\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
- "smax z29.s, p2/M, z29.s, z5.s\n"
- "smax z30.s, p2/M, z30.s, z5.s\n"
- "add z31.s, z31.s, z4.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add z31.s, z31.s, z2.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z1.s\n"
+ "smin z17.s, p2/M, z17.s, z1.s\n"
+ "smin z18.s, p2/M, z18.s, z1.s\n"
+ "smin z19.s, p2/M, z19.s, z1.s\n"
+ "smin z20.s, p2/M, z20.s, z1.s\n"
+ "smin z21.s, p2/M, z21.s, z1.s\n"
+ "smin z22.s, p2/M, z22.s, z1.s\n"
+ "smin z23.s, p2/M, z23.s, z1.s\n"
+ "smin z24.s, p2/M, z24.s, z1.s\n"
+ "smin z25.s, p2/M, z25.s, z1.s\n"
+ "smin z26.s, p2/M, z26.s, z1.s\n"
+ "smin z27.s, p2/M, z27.s, z1.s\n"
+ "smin z28.s, p2/M, z28.s, z1.s\n"
+ "smin z29.s, p2/M, z29.s, z1.s\n"
+ "smin z30.s, p2/M, z30.s, z1.s\n"
+ "smin z31.s, p2/M, z31.s, z1.s\n"
+ "smax z16.s, p2/M, z16.s, z0.s\n"
+ "smax z17.s, p2/M, z17.s, z0.s\n"
+ "smax z18.s, p2/M, z18.s, z0.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z0.s\n"
+ "smax z20.s, p2/M, z20.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z21.s, p2/M, z21.s, z0.s\n"
+ "smax z22.s, p2/M, z22.s, z0.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "smax z23.s, p2/M, z23.s, z0.s\n"
+ "smax z24.s, p2/M, z24.s, z0.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z0.s\n"
+ "smax z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "st1b { z20.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z0.s\n"
+ "smax z28.s, p2/M, z28.s, z0.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "smax z29.s, p2/M, z29.s, z0.s\n"
+ "smax z30.s, p2/M, z30.s, z0.s\n"
"uzp1 z28.h, z28.h, z29.h\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smax z31.s, p2/M, z31.s, z5.s\n"
- "uzp1 z29.h, z30.h, z31.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p1, [x20]\n"
+ "st1b { z24.b }, p1, [x22]\n"
+ "smax z31.s, p2/M, z31.s, z0.s\n"
+ "uzp1 z16.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z16.b\n"
+ "st1b { z28.b }, p1, [x21]\n"
+ "addvl x27, x27, #1\n"
"56:" // Height 4: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
"bgt 44b\n"
"subs %x[M], %x[M], #0x4\n"
"beq 58f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 57f\n"
- "add x20, x20, #0x4\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"57:" // Update direct input
- "mov x19, #0x4\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x4\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"58:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
new file mode 100644
index 0000000000..ae922e9743
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_s8qa_mmla_4x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8qa_mmla_4x4VL
+{
+public:
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 8, 8> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 47.37 };
+ case CPUModel::A510:
+ return { 20.88 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_s8qa_mmla_4x4VL;
+ cls_sve_hybrid_s8qa_mmla_4x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
new file mode 100644
index 0000000000..e0628364f4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
@@ -0,0 +1,1417 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8qa_mmla_4x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z11.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x26, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "cbnz x26, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x25, #0x10\n"
+ "ble 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn2 z1.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45199814 // smmla z20.s, z0.b, z25.b\n"
+ ".inst 0x45189811 // smmla z17.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x451a9815 // smmla z21.s, z0.b, z26.b\n"
+ ".inst 0x45199812 // smmla z18.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x45189816 // smmla z22.s, z0.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x451a9813 // smmla z19.s, z0.b, z26.b\n"
+ ".inst 0x45199817 // smmla z23.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x451a9834 // smmla z20.s, z1.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n"
+ ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n"
+ ".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n"
+ ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n"
+ ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n"
+ "add x24, x24, #0x10\n"
+ "tbnz %x[flags], #31, 8f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
+ "bgt 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z27.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "trn2 z1.d, z1.d, z27.d\n"
+ ".inst 0x451a9814 // smmla z20.s, z0.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45199811 // smmla z17.s, z0.b, z25.b\n"
+ ".inst 0x45189815 // smmla z21.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x451b9812 // smmla z18.s, z0.b, z27.b\n"
+ ".inst 0x451a9816 // smmla z22.s, z0.b, z26.b\n"
+ ".inst 0x45199813 // smmla z19.s, z0.b, z25.b\n"
+ ".inst 0x45189817 // smmla z23.s, z0.b, z24.b\n"
+ "addvl x28, x28, #8\n"
+ "ble 10f\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45189834 // smmla z20.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n"
+ ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45199832 // smmla z18.s, z1.b, z25.b\n"
+ ".inst 0x45189836 // smmla z22.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n"
+ ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n"
+ "addvl x28, x28, #8\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 11f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "11:" // Height 1: Multiply loop: unique 2: skip row sum
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 4b\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "mov z23.d, z16.d\n"
+ "tbnz %x[flags], #31, 12f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ "neg z16.s, p2/M, z16.s\n"
+ "mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z16.s\n"
+ "12:" // Height 1: skip row sum fixup
+ "add z23.s, z23.s, z11.s\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z22.s }, p2/Z, [x10]\n"
+ "ld1w { z21.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z20.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z23.s, z23.s, z22.s\n"
+ "add z17.s, z17.s, z21.s\n"
+ "add z18.s, z18.s, z20.s\n"
+ "add z19.s, z19.s, z16.s\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x04b07631 // sqrdmulh z17.s, z17.s, z16.s\n"
+ "addvl x10, x10, #4\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
+ "tbz %x[flags], #5, 13f\n"
+ "and z22.d, z23.d, z0.d\n"
+ "and z21.d, z17.d, z0.d\n"
+ "and z20.d, z18.d, z0.d\n"
+ "and z16.d, z19.d, z0.d\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z22.s\n"
+ "sqadd z17.s, z17.s, z21.s\n"
+ "sqadd z18.s, z18.s, z20.s\n"
+ "sqadd z19.s, z19.s, z16.s\n"
+ "13:" // Height 1: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z16.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "smin z23.s, p2/M, z23.s, z20.s\n"
+ "smin z17.s, p2/M, z17.s, z20.s\n"
+ "smin z18.s, p2/M, z18.s, z20.s\n"
+ "smin z19.s, p2/M, z19.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z16.s\n"
+ "smax z17.s, p2/M, z17.s, z16.s\n"
+ "smax z18.s, p2/M, z18.s, z16.s\n"
+ "uzp1 z23.h, z23.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z16.s\n"
+ "uzp1 z16.h, z18.h, z19.h\n"
+ "uzp1 z23.b, z23.b, z16.b\n"
+ "st1b { z23.b }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "14:" // Height 1: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 2b\n"
+ "b 58f\n"
+ "15:" // Height 2
+ "mov x10, %x[col_bias]\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z15.b, #0x1\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "16:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x26, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "cbnz x26, 20f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "20:" // Height 2: input setup done
+ "cmp x25, #0x10\n"
+ "ble 23f\n"
+ "21:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z26.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn2 z1.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45199814 // smmla z20.s, z0.b, z25.b\n"
+ ".inst 0x45189811 // smmla z17.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x451a9815 // smmla z21.s, z0.b, z26.b\n"
+ ".inst 0x45199812 // smmla z18.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x45189816 // smmla z22.s, z0.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x451a9813 // smmla z19.s, z0.b, z26.b\n"
+ ".inst 0x45199817 // smmla z23.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x451a9834 // smmla z20.s, z1.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n"
+ ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n"
+ ".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n"
+ ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n"
+ ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "tbnz %x[flags], #31, 22f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "22:" // Height 2: Multiply loop: unique 3: skip row sum
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
+ "bgt 21b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z27.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "trn2 z1.d, z1.d, z27.d\n"
+ ".inst 0x451a9814 // smmla z20.s, z0.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45199811 // smmla z17.s, z0.b, z25.b\n"
+ ".inst 0x45189815 // smmla z21.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x451b9812 // smmla z18.s, z0.b, z27.b\n"
+ ".inst 0x451a9816 // smmla z22.s, z0.b, z26.b\n"
+ ".inst 0x45199813 // smmla z19.s, z0.b, z25.b\n"
+ ".inst 0x45189817 // smmla z23.s, z0.b, z24.b\n"
+ "addvl x28, x28, #8\n"
+ "ble 24f\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45189834 // smmla z20.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n"
+ ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45199832 // smmla z18.s, z1.b, z25.b\n"
+ ".inst 0x45189836 // smmla z22.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n"
+ ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n"
+ "addvl x28, x28, #8\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 25f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "25:" // Height 2: Multiply loop: unique 4: skip row sum
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 18b\n"
+ "uzp1 z24.d, z16.d, z20.d\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "mov z23.d, z24.d\n"
+ "tbnz %x[flags], #31, 26f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ "neg z24.s, p2/M, z24.s\n"
+ "mov z12.s, z11.s[3]\n"
+ "mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z24.s\n"
+ "mul z12.s, p2/M, z12.s, z24.s\n"
+ "26:" // Height 2: skip row sum fixup
+ "add z23.s, z23.s, z11.s\n"
+ "add z20.s, z20.s, z11.s\n"
+ "ld1w { z28.s }, p2/Z, [x10]\n"
+ "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add z21.s, z21.s, z11.s\n"
+ "add z22.s, z22.s, z11.s\n"
+ "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "add z16.s, z16.s, z12.s\n"
+ "add z17.s, z17.s, z12.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z18.s, z18.s, z12.s\n"
+ "add z19.s, z19.s, z12.s\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z23.s, z23.s, z28.s\n"
+ "add z20.s, z20.s, z27.s\n"
+ "addvl x10, x10, #4\n"
+ "add z21.s, z21.s, z26.s\n"
+ "add z22.s, z22.s, z25.s\n"
+ "add z16.s, z16.s, z28.s\n"
+ "add z17.s, z17.s, z27.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z18.s, z18.s, z26.s\n"
+ "add z19.s, z19.s, z25.s\n"
+ ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n"
+ ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n"
+ ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n"
+ ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n"
+ ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
+ ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n"
+ ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n"
+ ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n"
+ "tbz %x[flags], #5, 27f\n"
+ "and z24.d, z23.d, z0.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z24.s\n"
+ "and z30.d, z20.d, z0.d\n"
+ "and z29.d, z21.d, z0.d\n"
+ "and z28.d, z22.d, z0.d\n"
+ "and z27.d, z16.d, z0.d\n"
+ "and z26.d, z17.d, z0.d\n"
+ "and z25.d, z18.d, z0.d\n"
+ "and z24.d, z19.d, z0.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z30.s\n"
+ "sqadd z21.s, z21.s, z29.s\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z16.s, z16.s, z27.s\n"
+ "sqadd z17.s, z17.s, z26.s\n"
+ "sqadd z18.s, z18.s, z25.s\n"
+ "sqadd z19.s, z19.s, z24.s\n"
+ "27:" // Height 2: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add z23.s, z23.s, z24.s\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "add z20.s, z20.s, z24.s\n"
+ "add z21.s, z21.s, z24.s\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z24.s\n"
+ "add z16.s, z16.s, z24.s\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z24.s\n"
+ "add z18.s, z18.s, z24.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z24.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "smin z23.s, p2/M, z23.s, z25.s\n"
+ "smin z20.s, p2/M, z20.s, z25.s\n"
+ "smin z21.s, p2/M, z21.s, z25.s\n"
+ "smin z22.s, p2/M, z22.s, z25.s\n"
+ "smin z16.s, p2/M, z16.s, z25.s\n"
+ "smin z17.s, p2/M, z17.s, z25.s\n"
+ "smin z18.s, p2/M, z18.s, z25.s\n"
+ "smin z19.s, p2/M, z19.s, z25.s\n"
+ "smax z23.s, p2/M, z23.s, z24.s\n"
+ "smax z20.s, p2/M, z20.s, z24.s\n"
+ "smax z21.s, p2/M, z21.s, z24.s\n"
+ "uzp1 z23.h, z23.h, z20.h\n"
+ "smax z22.s, p2/M, z22.s, z24.s\n"
+ "smax z16.s, p2/M, z16.s, z24.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z23.b, z23.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z24.s\n"
+ "smax z18.s, p2/M, z18.s, z24.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z23.b }, p1, [x27]\n"
+ "smax z19.s, p2/M, z19.s, z24.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x23]\n"
+ "addvl x27, x27, #1\n"
+ "28:" // Height 2: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 16b\n"
+ "b 58f\n"
+ "29:" // Height 3
+ "mov x10, %x[col_bias]\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "30:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "31:" // Height 3: setup done
+ "mov x26, #0x0\n"
+ "32:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x26, 34f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "34:" // Height 3: input setup done
+ "cmp x25, #0x10\n"
+ "ble 37f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45059814 // smmla z20.s, z0.b, z5.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4505985c // smmla z28.s, z2.b, z5.b\n"
+ ".inst 0x45049811 // smmla z17.s, z0.b, z4.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45099815 // smmla z21.s, z0.b, z9.b\n"
+ ".inst 0x4509985d // smmla z29.s, z2.b, z9.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45089812 // smmla z18.s, z0.b, z8.b\n"
+ ".inst 0x4508985a // smmla z26.s, z2.b, z8.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45079816 // smmla z22.s, z0.b, z7.b\n"
+ ".inst 0x4507985e // smmla z30.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45069813 // smmla z19.s, z0.b, z6.b\n"
+ ".inst 0x4506985b // smmla z27.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
+ ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n"
+ ".inst 0x450a987c // smmla z28.s, z3.b, z10.b\n"
+ ".inst 0x45099831 // smmla z17.s, z1.b, z9.b\n"
+ ".inst 0x45099879 // smmla z25.s, z3.b, z9.b\n"
+ ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n"
+ ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n"
+ ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
+ ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n"
+ ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n"
+ ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
+ ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n"
+ ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n"
+ "tbnz %x[flags], #31, 36f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "sdot z13.s, z3.b, z15.b\n"
+ "36:" // Height 3: Multiply loop: unique 5: skip row sum
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
+ "bgt 35b\n"
+ "37:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45049814 // smmla z20.s, z0.b, z4.b\n"
+ ".inst 0x4504985c // smmla z28.s, z2.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45099811 // smmla z17.s, z0.b, z9.b\n"
+ ".inst 0x45099859 // smmla z25.s, z2.b, z9.b\n"
+ ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
+ ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45079812 // smmla z18.s, z0.b, z7.b\n"
+ ".inst 0x4507985a // smmla z26.s, z2.b, z7.b\n"
+ ".inst 0x45069816 // smmla z22.s, z0.b, z6.b\n"
+ ".inst 0x4506985e // smmla z30.s, z2.b, z6.b\n"
+ ".inst 0x45059813 // smmla z19.s, z0.b, z5.b\n"
+ ".inst 0x4505985b // smmla z27.s, z2.b, z5.b\n"
+ ".inst 0x45049817 // smmla z23.s, z0.b, z4.b\n"
+ ".inst 0x4504985f // smmla z31.s, z2.b, z4.b\n"
+ "ble 38f\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
+ ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45059834 // smmla z20.s, z1.b, z5.b\n"
+ ".inst 0x4505987c // smmla z28.s, z3.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45049831 // smmla z17.s, z1.b, z4.b\n"
+ ".inst 0x45049879 // smmla z25.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n"
+ ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n"
+ ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
+ ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n"
+ ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
+ ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n"
+ ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n"
+ "38:" // Height 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 39f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "sdot z13.s, z3.b, z15.b\n"
+ "39:" // Height 3: Multiply loop: unique 6: skip row sum
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 32b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 z0.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "add x22, x23, x20\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "mov z31.d, z0.d\n"
+ "tbnz %x[flags], #31, 40f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
+ "neg z23.s, p2/M, z23.s\n"
+ "mov z12.s, z11.s[3]\n"
+ "mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z23.s\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z12.s, p2/M, z12.s, z23.s\n"
+ "mul z13.s, p2/M, z13.s, z23.s\n"
+ "40:" // Height 3: skip row sum fixup
+ "add z31.s, z31.s, z11.s\n"
+ "add z20.s, z20.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "ld1w { z30.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add z21.s, z21.s, z11.s\n"
+ "add z22.s, z22.s, z11.s\n"
+ "ld1w { z29.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "add z16.s, z16.s, z12.s\n"
+ "add z17.s, z17.s, z12.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z18.s, z18.s, z12.s\n"
+ "add z19.s, z19.s, z12.s\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "addvl x10, x10, #4\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z31.s, z31.s, z0.s\n"
+ "add z20.s, z20.s, z30.s\n"
+ "add z21.s, z21.s, z29.s\n"
+ "add z22.s, z22.s, z28.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z30.s\n"
+ "add z18.s, z18.s, z29.s\n"
+ "add z19.s, z19.s, z28.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z26.s, z26.s, z29.s\n"
+ "add z27.s, z27.s, z28.s\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
+ ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ ".inst 0x04b77610 // sqrdmulh z16.s, z16.s, z23.s\n"
+ ".inst 0x04b77631 // sqrdmulh z17.s, z17.s, z23.s\n"
+ ".inst 0x04b77652 // sqrdmulh z18.s, z18.s, z23.s\n"
+ ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
+ ".inst 0x04b77718 // sqrdmulh z24.s, z24.s, z23.s\n"
+ ".inst 0x04b77739 // sqrdmulh z25.s, z25.s, z23.s\n"
+ ".inst 0x04b7775a // sqrdmulh z26.s, z26.s, z23.s\n"
+ ".inst 0x04b7777b // sqrdmulh z27.s, z27.s, z23.s\n"
+ "tbz %x[flags], #5, 41f\n"
+ "and z1.d, z31.d, z0.d\n"
+ "and z30.d, z20.d, z0.d\n"
+ "and z29.d, z21.d, z0.d\n"
+ "and z28.d, z22.d, z0.d\n"
+ "and z23.d, z16.d, z0.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z1.s\n"
+ "sqadd z20.s, z20.s, z30.s\n"
+ "sqadd z21.s, z21.s, z29.s\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z16.s, z16.s, z23.s\n"
+ "and z3.d, z17.d, z0.d\n"
+ "and z2.d, z18.d, z0.d\n"
+ "and z1.d, z19.d, z0.d\n"
+ "and z30.d, z24.d, z0.d\n"
+ "and z29.d, z25.d, z0.d\n"
+ "and z28.d, z26.d, z0.d\n"
+ "and z23.d, z27.d, z0.d\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z3.s\n"
+ "sqadd z18.s, z18.s, z2.s\n"
+ "sqadd z19.s, z19.s, z1.s\n"
+ "sqadd z24.s, z24.s, z30.s\n"
+ "sqadd z25.s, z25.s, z29.s\n"
+ "sqadd z26.s, z26.s, z28.s\n"
+ "sqadd z27.s, z27.s, z23.s\n"
+ "41:" // Height 3: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add z31.s, z31.s, z23.s\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "add z20.s, z20.s, z23.s\n"
+ "add z21.s, z21.s, z23.s\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z23.s\n"
+ "add z16.s, z16.s, z23.s\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z23.s\n"
+ "add z18.s, z18.s, z23.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "add z19.s, z19.s, z23.s\n"
+ "add z24.s, z24.s, z23.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "add z25.s, z25.s, z23.s\n"
+ "add z26.s, z26.s, z23.s\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z23.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ "smin z31.s, p2/M, z31.s, z28.s\n"
+ "smin z20.s, p2/M, z20.s, z28.s\n"
+ "smin z21.s, p2/M, z21.s, z28.s\n"
+ "smin z22.s, p2/M, z22.s, z28.s\n"
+ "smin z16.s, p2/M, z16.s, z28.s\n"
+ "smin z17.s, p2/M, z17.s, z28.s\n"
+ "smin z18.s, p2/M, z18.s, z28.s\n"
+ "smin z19.s, p2/M, z19.s, z28.s\n"
+ "smin z24.s, p2/M, z24.s, z28.s\n"
+ "smin z25.s, p2/M, z25.s, z28.s\n"
+ "smin z26.s, p2/M, z26.s, z28.s\n"
+ "smin z27.s, p2/M, z27.s, z28.s\n"
+ "smax z31.s, p2/M, z31.s, z23.s\n"
+ "smax z20.s, p2/M, z20.s, z23.s\n"
+ "smax z21.s, p2/M, z21.s, z23.s\n"
+ "uzp1 z31.h, z31.h, z20.h\n"
+ "smax z22.s, p2/M, z22.s, z23.s\n"
+ "smax z16.s, p2/M, z16.s, z23.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z31.b, z31.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z23.s\n"
+ "smax z18.s, p2/M, z18.s, z23.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z31.b }, p1, [x27]\n"
+ "smax z19.s, p2/M, z19.s, z23.s\n"
+ "smax z24.s, p2/M, z24.s, z23.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z25.s, p2/M, z25.s, z23.s\n"
+ "smax z26.s, p2/M, z26.s, z23.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "st1b { z16.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z23.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x22]\n"
+ "addvl x27, x27, #1\n"
+ "42:" // Height 3: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 30b\n"
+ "b 58f\n"
+ "43:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x4\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "44:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "45:" // Height 4: setup done
+ "mov x26, #0x0\n"
+ "46:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 47f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
+ "cbnz x26, 48f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
+ "b 48f\n"
+ "47:" // Height 4: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
+ "48:" // Height 4: input setup done
+ "cmp x25, #0x10\n"
+ "ble 51f\n"
+ "49:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n"
+ ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45049814 // smmla z20.s, z0.b, z4.b\n"
+ ".inst 0x4504985c // smmla z28.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45059811 // smmla z17.s, z0.b, z5.b\n"
+ ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45049815 // smmla z21.s, z0.b, z4.b\n"
+ ".inst 0x4504985d // smmla z29.s, z2.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45089812 // smmla z18.s, z0.b, z8.b\n"
+ ".inst 0x4508985a // smmla z26.s, z2.b, z8.b\n"
+ ".inst 0x45079816 // smmla z22.s, z0.b, z7.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x4507985e // smmla z30.s, z2.b, z7.b\n"
+ ".inst 0x45069813 // smmla z19.s, z0.b, z6.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x4506985b // smmla z27.s, z2.b, z6.b\n"
+ ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n"
+ ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x450a987c // smmla z28.s, z3.b, z10.b\n"
+ ".inst 0x45099831 // smmla z17.s, z1.b, z9.b\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45099879 // smmla z25.s, z3.b, z9.b\n"
+ ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n"
+ ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n"
+ ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
+ ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n"
+ ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n"
+ ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
+ ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n"
+ ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n"
+ "tbnz %x[flags], #31, 50f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "sdot z13.s, z3.b, z15.b\n"
+ "50:" // Height 4: Multiply loop: unique 7: skip row sum
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
+ "bgt 49b\n"
+ "51:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n"
+ ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ ".inst 0x45059814 // smmla z20.s, z0.b, z5.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4505985c // smmla z28.s, z2.b, z5.b\n"
+ ".inst 0x45049811 // smmla z17.s, z0.b, z4.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n"
+ ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n"
+ ".inst 0x45079812 // smmla z18.s, z0.b, z7.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4507985a // smmla z26.s, z2.b, z7.b\n"
+ ".inst 0x45069816 // smmla z22.s, z0.b, z6.b\n"
+ ".inst 0x4506985e // smmla z30.s, z2.b, z6.b\n"
+ ".inst 0x45059813 // smmla z19.s, z0.b, z5.b\n"
+ ".inst 0x4505985b // smmla z27.s, z2.b, z5.b\n"
+ ".inst 0x45049817 // smmla z23.s, z0.b, z4.b\n"
+ ".inst 0x4504985f // smmla z31.s, z2.b, z4.b\n"
+ "ble 52f\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
+ ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45059834 // smmla z20.s, z1.b, z5.b\n"
+ ".inst 0x4505987c // smmla z28.s, z3.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45049831 // smmla z17.s, z1.b, z4.b\n"
+ ".inst 0x45049879 // smmla z25.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n"
+ ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n"
+ ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
+ ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n"
+ ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
+ ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n"
+ ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n"
+ "52:" // Height 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 53f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "sdot z13.s, z3.b, z15.b\n"
+ "53:" // Height 4: Multiply loop: unique 8: skip row sum
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 46b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 z0.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "add x21, x22, x20\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "mov z31.d, z0.d\n"
+ "tbnz %x[flags], #31, 54f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
+ "neg z0.s, p2/M, z0.s\n"
+ "mov z12.s, z11.s[3]\n"
+ "mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z0.s\n"
+ "mov z14.s, z13.s[3]\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z12.s, p2/M, z12.s, z0.s\n"
+ "mul z13.s, p2/M, z13.s, z0.s\n"
+ "mul z14.s, p2/M, z14.s, z0.s\n"
+ "54:" // Height 4: skip row sum fixup
+ "add z31.s, z31.s, z11.s\n"
+ "add z20.s, z20.s, z11.s\n"
+ "ld1w { z4.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add z21.s, z21.s, z11.s\n"
+ "add z22.s, z22.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "add z16.s, z16.s, z12.s\n"
+ "add z17.s, z17.s, z12.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z18.s, z18.s, z12.s\n"
+ "add z19.s, z19.s, z12.s\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z23.s, z23.s, z13.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "addvl x10, x10, #4\n"
+ "add z29.s, z29.s, z13.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z3.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z0.s\n"
+ "add z18.s, z18.s, z3.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z28.s, z28.s, z0.s\n"
+ "add z29.s, z29.s, z3.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z26.s, z26.s, z3.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n"
+ ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n"
+ ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n"
+ ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n"
+ ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n"
+ ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n"
+ ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n"
+ ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n"
+ ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n"
+ ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n"
+ ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ "tbz %x[flags], #5, 55f\n"
+ "and z2.d, z31.d, z0.d\n"
+ "and z1.d, z20.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z2.s\n"
+ "sqadd z20.s, z20.s, z1.s\n"
+ "and z7.d, z21.d, z0.d\n"
+ "and z6.d, z22.d, z0.d\n"
+ "and z5.d, z16.d, z0.d\n"
+ "and z4.d, z17.d, z0.d\n"
+ "and z3.d, z18.d, z0.d\n"
+ "and z2.d, z19.d, z0.d\n"
+ "and z1.d, z23.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z7.s\n"
+ "sqadd z22.s, z22.s, z6.s\n"
+ "sqadd z16.s, z16.s, z5.s\n"
+ "sqadd z17.s, z17.s, z4.s\n"
+ "sqadd z18.s, z18.s, z3.s\n"
+ "sqadd z19.s, z19.s, z2.s\n"
+ "sqadd z23.s, z23.s, z1.s\n"
+ "and z7.d, z28.d, z0.d\n"
+ "and z6.d, z29.d, z0.d\n"
+ "and z5.d, z30.d, z0.d\n"
+ "and z4.d, z24.d, z0.d\n"
+ "and z3.d, z25.d, z0.d\n"
+ "and z2.d, z26.d, z0.d\n"
+ "and z1.d, z27.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z7.s\n"
+ "sqadd z29.s, z29.s, z6.s\n"
+ "sqadd z30.s, z30.s, z5.s\n"
+ "sqadd z24.s, z24.s, z4.s\n"
+ "sqadd z25.s, z25.s, z3.s\n"
+ "sqadd z26.s, z26.s, z2.s\n"
+ "sqadd z27.s, z27.s, z1.s\n"
+ "55:" // Height 4: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add z31.s, z31.s, z2.s\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "add z20.s, z20.s, z2.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z16.s, z16.s, z2.s\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
+ ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
+ "add z28.s, z28.s, z2.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "smin z31.s, p2/M, z31.s, z1.s\n"
+ "smin z20.s, p2/M, z20.s, z1.s\n"
+ "smin z21.s, p2/M, z21.s, z1.s\n"
+ "smin z22.s, p2/M, z22.s, z1.s\n"
+ "smin z16.s, p2/M, z16.s, z1.s\n"
+ "smin z17.s, p2/M, z17.s, z1.s\n"
+ "smin z18.s, p2/M, z18.s, z1.s\n"
+ "smin z19.s, p2/M, z19.s, z1.s\n"
+ "smin z23.s, p2/M, z23.s, z1.s\n"
+ "smin z28.s, p2/M, z28.s, z1.s\n"
+ "smin z29.s, p2/M, z29.s, z1.s\n"
+ "smin z30.s, p2/M, z30.s, z1.s\n"
+ "smin z24.s, p2/M, z24.s, z1.s\n"
+ "smin z25.s, p2/M, z25.s, z1.s\n"
+ "smin z26.s, p2/M, z26.s, z1.s\n"
+ "smin z27.s, p2/M, z27.s, z1.s\n"
+ "smax z31.s, p2/M, z31.s, z0.s\n"
+ "smax z20.s, p2/M, z20.s, z0.s\n"
+ "smax z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z31.h, z31.h, z20.h\n"
+ "smax z22.s, p2/M, z22.s, z0.s\n"
+ "smax z16.s, p2/M, z16.s, z0.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z31.b, z31.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z0.s\n"
+ "smax z18.s, p2/M, z18.s, z0.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z31.b }, p1, [x27]\n"
+ "smax z19.s, p2/M, z19.s, z0.s\n"
+ "smax z23.s, p2/M, z23.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z28.s, p2/M, z28.s, z0.s\n"
+ "smax z29.s, p2/M, z29.s, z0.s\n"
+ "uzp1 z23.h, z23.h, z28.h\n"
+ "st1b { z16.b }, p1, [x23]\n"
+ "smax z30.s, p2/M, z30.s, z0.s\n"
+ "smax z24.s, p2/M, z24.s, z0.s\n"
+ "uzp1 z16.h, z29.h, z30.h\n"
+ "uzp1 z23.b, z23.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z0.s\n"
+ "smax z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "st1b { z23.b }, p1, [x22]\n"
+ "smax z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x21]\n"
+ "addvl x27, x27, #1\n"
+ "56:" // Height 4: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 44b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 58f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 57f\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "57:" // Update direct input
+ "mov x20, #0x4\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "58:" // Exit
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
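The kernel above pairs input rows with trn1/trn2 so that each SMMLA instruction accumulates a 2x2 tile of int32 dot products, then requantizes the accumulators on the way out: add the column bias, apply the optional row-sum fixup derived from b_offset, scale with sqrdmulh by per_layer_mul, rounding-shift with srshl by per_layer_right_shift, add c_offset, and clamp to [minval, maxval] before narrowing to int8 with uzp1. A minimal scalar sketch of that epilogue follows, assuming a standalone mirror of the Requantize32 fields named in the asm operand list; the helper names are illustrative, not arm_gemm declarations.

    #include <algorithm>
    #include <cstdint>

    // Hypothetical mirror of the Requantize32 fields referenced via
    // offsetof() in the asm constraints above; the real struct lives
    // elsewhere in arm_gemm.
    struct Requantize32Model {
        int32_t b_offset;               // negated, multiplied into row sums
        int32_t c_offset;               // added after the rounding shift
        int32_t per_layer_mul;          // fixed-point multiplier for sqrdmulh
        int32_t per_layer_right_shift;  // negative count passed to srshl
        int32_t minval, maxval;         // saturation bounds for int8 output
    };

    // Scalar model of SQRDMULH: saturating, rounding, doubling high half.
    static int32_t sqrdmulh(int32_t a, int32_t b) {
        int64_t p = static_cast<int64_t>(a) * b;
        int64_t r = (p + (1LL << 29)) >> 30;  // == (2*a*b + 2^30) >> 31
        if (r > INT32_MAX) return INT32_MAX;
        if (r < INT32_MIN) return INT32_MIN;
        return static_cast<int32_t>(r);
    }

    // Scalar model of SRSHL: a negative count is a rounding right shift.
    static int32_t srshl(int32_t v, int32_t shift) {
        if (shift >= 0) return static_cast<int32_t>(static_cast<int64_t>(v) << shift);
        int32_t s = -shift;
        return static_cast<int32_t>((static_cast<int64_t>(v) + (1LL << (s - 1))) >> s);
    }

    // One output element: acc is the raw smmla accumulator; row_fixup is
    // (-b_offset * row_sum), matching the "skip row sum fixup" blocks.
    static int8_t requantize_one(int32_t acc, int32_t row_fixup, int32_t col_bias,
                                 const Requantize32Model& qp) {
        int32_t v = acc + row_fixup + col_bias;
        v = sqrdmulh(v, qp.per_layer_mul);
        // The "tbz %x[flags], #5" branch adds a sign-based correction here
        // (and/asr/sqadd) before the shift; omitted in this sketch.
        v = srshl(v, qp.per_layer_right_shift);
        v += qp.c_offset;
        v = std::min(std::max(v, qp.minval), qp.maxval);
        return static_cast<int8_t>(v);
    }

The sketch covers the common path only; the flag-gated shift-correction step exists so the assembly rounds identically to the reference implementation when the shift direction requires it.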
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
index 61927236ad..056ae7a616 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,21 +10,22 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -42,7 +43,8 @@ void sve_hybrid_s8qs_dot_6x4VL( ARGLIST );
class cls_sve_hybrid_s8qs_dot_6x4VL
{
public:
- typedef int8_t operand_type;
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
typedef int8_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,7 +70,21 @@ public:
return false;
}
- StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 30.13 };
+ case CPUModel::A510:
+ return { 19.77 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_s8qs_dot_6x4VL;
@@ -80,4 +96,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
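Besides the guard and license touch-ups, the header diff above splits operand_type into lhs/rhs variants and adds a get_performance_parameters() hook returning per-CPU-model throughput estimates (30.13 generic, 19.77 on A510). Presumably the single brace initializer fills an estimated MACs-per-cycle field from performance_parameters.hpp, which the GEMM method selector can compare across candidate kernels. A self-contained sketch of that shape follows; every type name here is a stand-in, not the real arm_gemm declaration.

    #include <cstdint>
    #include <type_traits>

    // Hypothetical stand-ins for the types the header pulls in from
    // performance_parameters.hpp and the CPU info layer.
    enum class CPUModel { GENERIC, A510 };
    struct CPUInfo {
        CPUModel model;
        CPUModel get_cpu_model() const { return model; }
    };
    struct PerformanceParameters {
        float kernel_macs_cycle;  // assumed: estimated int8 MACs per cycle
    };

    // Shape of the added hook: a per-CPU-model throughput table.
    struct HypotheticalDotKernel {
        template <typename T>
        static PerformanceParameters get_performance_parameters(const CPUInfo* ci) {
            if (std::is_same<T, int8_t>::value) {
                switch (ci->get_cpu_model()) {
                    default:             return {30.13f};
                    case CPUModel::A510: return {19.77f};
                }
            }
            return {1.0f};
        }
    };

    // Usage: auto pp = HypotheticalDotKernel::get_performance_parameters<int8_t>(&ci);

Ranking candidate kernels by estimates like these, rather than by a fixed preference order, is presumably what lets the dot- and mmla-based variants coexist for the same data types.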
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
index f901a814f9..c28717a37e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,18 +10,18 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -95,140 +95,137 @@ void sve_hybrid_s8qs_dot_6x4VL (
"cmp %x[M], #0x2\n"
"bgt 27f\n"
"beq 14f\n"
+ "mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x11, %x[col_bias]\n"
+ "mov x11, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x9, %x[output_ptr]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z9.s, #0x0\n"
- "whilelt p1.b, x19, x10\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"3:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 6f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
"b 6f\n"
"5:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x26, %x[input_ptr]\n"
"6:" // Height 1: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 8f\n"
"7:" // Height 1: Multiply loop: Main loop head
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "cmp x26, #0x10\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1b { z16.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z11.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "sdot z8.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "sdot z11.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[2]\n"
+ "sdot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[2]\n"
+ "sdot z11.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[3]\n"
+ "sdot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ "sdot z10.s, z17.b, z0.b[3]\n"
+ "sdot z11.s, z16.b, z0.b[3]\n"
+ "add x26, x26, #0x10\n"
"bgt 7b\n"
"8:" // Height 1: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1b { z16.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[0]\n"
+ "sdot z11.s, z16.b, z0.b[0]\n"
+ "addvl x9, x9, #4\n"
"ble 9f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "addvl x28, x28, #4\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[1]\n"
+ "sdot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z10.s, z17.b, z0.b[1]\n"
+ "sdot z11.s, z16.b, z0.b[1]\n"
+ "addvl x9, x9, #4\n"
"ble 9f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "addvl x28, x28, #4\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[2]\n"
+ "sdot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z10.s, z17.b, z0.b[2]\n"
+ "sdot z11.s, z16.b, z0.b[2]\n"
+ "addvl x9, x9, #4\n"
"ble 9f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[3]\n"
+ "sdot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[3]\n"
+ "sdot z11.s, z16.b, z0.b[3]\n"
+ "addvl x9, x9, #4\n"
"9:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 4b\n"
- "ld1w { z0.s }, p2/Z, [x11]\n"
- "add z8.s, z8.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
- "add z9.s, z9.s, z1.s\n"
- "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
+ "ld1w { z17.s }, p2/Z, [x14]\n"
+ "ld1w { z16.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "add z8.s, z8.s, z17.s\n"
+ "add z9.s, z9.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add z10.s, z10.s, z17.s\n"
+ "add z11.s, z11.s, z16.s\n"
+ "addvl x14, x14, #4\n"
"tbz %x[flags], #4, 10f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -237,20 +234,20 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
"ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
"addvl x13, x13, #4\n"
"b 11f\n"
"10:" // Height 1: per layer parameters
- "add x24, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x24]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z4.s }, p2/Z, [x24]\n"
- "mov z2.d, z0.d\n"
- "mov z3.d, z0.d\n"
"mov z5.d, z4.d\n"
+ "mov z2.d, z0.d\n"
"mov z6.d, z4.d\n"
+ "mov z3.d, z0.d\n"
"mov z7.d, z4.d\n"
"11:" // Height 1: parameters loaded
".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
@@ -258,63 +255,63 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
"tbz %x[flags], #5, 12f\n"
- "and z4.d, z8.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "sqadd z11.s, z11.s, z7.s\n"
+ "and z19.d, z8.d, z0.d\n"
+ "and z18.d, z9.d, z1.d\n"
+ "and z17.d, z10.d, z2.d\n"
+ "and z16.d, z11.d, z3.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z19.s\n"
+ "sqadd z9.s, z9.s, z18.s\n"
+ "sqadd z10.s, z10.s, z17.s\n"
+ "sqadd z11.s, z11.s, z16.s\n"
"12:" // Height 1: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x24]\n"
+ "add z8.s, z8.s, z16.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
- "add x24, %x[qp], %[minval]\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "ld1rw { z5.s }, p2/Z, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
+ "add z9.s, z9.s, z16.s\n"
+ "add z10.s, z10.s, z16.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "ld1rw { z6.s }, p2/Z, [x24]\n"
- "add z8.s, z8.s, z4.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
- "add z11.s, z11.s, z4.s\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "add z11.s, z11.s, z16.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z17.s\n"
+ "smin z9.s, p2/M, z9.s, z17.s\n"
+ "smin z10.s, p2/M, z10.s, z17.s\n"
+ "smin z11.s, p2/M, z11.s, z17.s\n"
+ "smax z8.s, p2/M, z8.s, z16.s\n"
+ "smax z9.s, p2/M, z9.s, z16.s\n"
+ "smax z10.s, p2/M, z10.s, z16.s\n"
"uzp1 z8.h, z8.h, z9.h\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z8.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
+ "smax z11.s, p2/M, z11.s, z16.s\n"
+ "uzp1 z16.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z16.b\n"
+ "st1b { z8.b }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
"13:" // Height 1: Writeback done
"decw x10, ALL, MUL #4\n"
"cmp x10, XZR\n"
"bgt 2b\n"
"b 80f\n"
"14:" // Height 2
+ "mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x11, %x[col_bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x9, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"15:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z9.s, #0x0\n"
- "whilelt p1.b, x19, x10\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
@@ -322,173 +319,167 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"16:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"17:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 18f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x27, 19f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 19f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
"b 19f\n"
"18:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
"19:" // Height 2: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 21f\n"
"20:" // Height 2: Multiply loop: Main loop head
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
"ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "sub x27, x27, #0x10\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[0]\n"
+ "sdot z12.s, z17.b, z0.b[0]\n"
+ "sdot z9.s, z16.b, z1.b[0]\n"
+ "sdot z13.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z1.b[0]\n"
+ "sdot z14.s, z17.b, z0.b[0]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "cmp x27, #0x10\n"
+ "sdot z11.s, z16.b, z1.b[0]\n"
+ "sdot z15.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z8.s, z17.b, z1.b[1]\n"
+ "sdot z12.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z9.s, z16.b, z1.b[1]\n"
+ "sdot z13.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "sdot z10.s, z17.b, z1.b[1]\n"
+ "sdot z14.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "sdot z11.s, z16.b, z1.b[1]\n"
+ "sdot z15.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[2]\n"
+ "sdot z12.s, z17.b, z0.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "sdot z9.s, z16.b, z1.b[2]\n"
+ "sdot z13.s, z16.b, z0.b[2]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "sdot z10.s, z17.b, z1.b[2]\n"
+ "sdot z14.s, z17.b, z0.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "sdot z11.s, z16.b, z1.b[2]\n"
+ "sdot z15.s, z16.b, z0.b[2]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[3]\n"
+ "sdot z12.s, z17.b, z0.b[3]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "sdot z9.s, z16.b, z1.b[3]\n"
+ "sdot z13.s, z16.b, z0.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sdot z10.s, z17.b, z1.b[3]\n"
+ "sdot z14.s, z17.b, z0.b[3]\n"
+ "sdot z11.s, z16.b, z1.b[3]\n"
+ "sdot z15.s, z16.b, z0.b[3]\n"
"bgt 20b\n"
"21:" // Height 2: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[0]\n"
+ "sdot z12.s, z17.b, z1.b[0]\n"
+ "sdot z9.s, z16.b, z0.b[0]\n"
+ "sdot z13.s, z16.b, z1.b[0]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[0]\n"
+ "sdot z14.s, z17.b, z1.b[0]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z11.s, z16.b, z0.b[0]\n"
+ "sdot z15.s, z16.b, z1.b[0]\n"
"ble 22f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[1]\n"
+ "sdot z12.s, z17.b, z1.b[1]\n"
+ "sdot z9.s, z16.b, z0.b[1]\n"
+ "sdot z13.s, z16.b, z1.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z10.s, z17.b, z0.b[1]\n"
+ "sdot z14.s, z17.b, z1.b[1]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z11.s, z16.b, z0.b[1]\n"
+ "sdot z15.s, z16.b, z1.b[1]\n"
"ble 22f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[2]\n"
+ "sdot z12.s, z17.b, z1.b[2]\n"
+ "sdot z9.s, z16.b, z0.b[2]\n"
+ "sdot z13.s, z16.b, z1.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z10.s, z17.b, z0.b[2]\n"
+ "sdot z14.s, z17.b, z1.b[2]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z11.s, z16.b, z0.b[2]\n"
+ "sdot z15.s, z16.b, z1.b[2]\n"
"ble 22f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[3]\n"
+ "sdot z12.s, z17.b, z1.b[3]\n"
+ "sdot z9.s, z16.b, z0.b[3]\n"
+ "sdot z13.s, z16.b, z1.b[3]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[3]\n"
+ "sdot z14.s, z17.b, z1.b[3]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z11.s, z16.b, z0.b[3]\n"
+ "sdot z15.s, z16.b, z1.b[3]\n"
"22:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 17b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z0.s }, p2/Z, [x11]\n"
- "add z8.s, z8.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
- "add x23, x9, x19\n"
- "add z12.s, z12.s, z0.s\n"
- "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
- "add z9.s, z9.s, z1.s\n"
- "addvl x11, x11, #4\n"
- "add z13.s, z13.s, z1.s\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
- "add z14.s, z14.s, z2.s\n"
- "add z15.s, z15.s, z3.s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z19.s }, p2/Z, [x14]\n"
+ "add x26, x11, x20\n"
+ "add z8.s, z8.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "add z9.s, z9.s, z18.s\n"
+ "add z10.s, z10.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add z11.s, z11.s, z16.s\n"
+ "add z12.s, z12.s, z19.s\n"
+ "addvl x14, x14, #4\n"
+ "add z13.s, z13.s, z18.s\n"
+ "add z14.s, z14.s, z17.s\n"
+ "add z15.s, z15.s, z16.s\n"
"tbz %x[flags], #4, 23f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -497,20 +488,20 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
"ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
"addvl x13, x13, #4\n"
"b 24f\n"
"23:" // Height 2: per layer parameters
- "add x24, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x24]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z4.s }, p2/Z, [x24]\n"
- "mov z2.d, z0.d\n"
- "mov z3.d, z0.d\n"
"mov z5.d, z4.d\n"
+ "mov z2.d, z0.d\n"
"mov z6.d, z4.d\n"
+ "mov z3.d, z0.d\n"
"mov z7.d, z4.d\n"
"24:" // Height 2: parameters loaded
".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
@@ -522,95 +513,95 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n"
".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
"tbz %x[flags], #5, 25f\n"
- "and z4.d, z8.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- "and z4.d, z12.d, z0.d\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "and z5.d, z13.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z6.d, z14.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z12.s, z12.s, z4.s\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z13.s, z13.s, z5.s\n"
- "sqadd z14.s, z14.s, z6.s\n"
- "sqadd z15.s, z15.s, z7.s\n"
+ "and z19.d, z8.d, z0.d\n"
+ "and z18.d, z9.d, z1.d\n"
+ "and z17.d, z10.d, z2.d\n"
+ "and z16.d, z11.d, z3.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z19.s\n"
+ "sqadd z9.s, z9.s, z18.s\n"
+ "sqadd z10.s, z10.s, z17.s\n"
+ "sqadd z11.s, z11.s, z16.s\n"
+ "and z19.d, z12.d, z0.d\n"
+ "and z18.d, z13.d, z1.d\n"
+ "and z17.d, z14.d, z2.d\n"
+ "and z16.d, z15.d, z3.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z19.s\n"
+ "sqadd z13.s, z13.s, z18.s\n"
+ "sqadd z14.s, z14.s, z17.s\n"
+ "sqadd z15.s, z15.s, z16.s\n"
"25:" // Height 2: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x24]\n"
+ "add z8.s, z8.s, z17.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
- "add x24, %x[qp], %[minval]\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "ld1rw { z5.s }, p2/Z, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
+ "add z9.s, z9.s, z17.s\n"
+ "add z10.s, z10.s, z17.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "ld1rw { z6.s }, p2/Z, [x24]\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
- "add z8.s, z8.s, z4.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
- "add z11.s, z11.s, z4.s\n"
- "add z12.s, z12.s, z4.s\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "uzp1 z8.h, z8.h, z9.h\n"
+ "add z11.s, z11.s, z17.s\n"
+ "add z12.s, z12.s, z17.s\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z8.b }, p1, [x9]\n"
- "add z13.s, z13.s, z4.s\n"
- "addvl x9, x9, #1\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ "add z13.s, z13.s, z17.s\n"
+ "add z14.s, z14.s, z17.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
- "add z14.s, z14.s, z4.s\n"
- "add z15.s, z15.s, z4.s\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "add z15.s, z15.s, z17.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z16.s\n"
+ "smin z9.s, p2/M, z9.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z16.s\n"
+ "smin z11.s, p2/M, z11.s, z16.s\n"
+ "smin z12.s, p2/M, z12.s, z16.s\n"
+ "smin z13.s, p2/M, z13.s, z16.s\n"
+ "smin z14.s, p2/M, z14.s, z16.s\n"
+ "smin z15.s, p2/M, z15.s, z16.s\n"
+ "smax z8.s, p2/M, z8.s, z17.s\n"
+ "smax z9.s, p2/M, z9.s, z17.s\n"
+ "smax z10.s, p2/M, z10.s, z17.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "smax z11.s, p2/M, z11.s, z17.s\n"
+ "smax z12.s, p2/M, z12.s, z17.s\n"
+ "uzp1 z16.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z16.b\n"
+ "smax z13.s, p2/M, z13.s, z17.s\n"
+ "smax z14.s, p2/M, z14.s, z17.s\n"
"uzp1 z12.h, z12.h, z13.h\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "uzp1 z13.h, z14.h, z15.h\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p1, [x23]\n"
+ "st1b { z8.b }, p1, [x11]\n"
+ "smax z15.s, p2/M, z15.s, z17.s\n"
+ "uzp1 z16.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z16.b\n"
+ "st1b { z12.b }, p1, [x26]\n"
+ "addvl x11, x11, #1\n"
"26:" // Height 2: Writeback done
"decw x10, ALL, MUL #4\n"
"cmp x10, XZR\n"
"bgt 15b\n"
"b 80f\n"
"27:" // Height 3
+ "mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x11, %x[col_bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x9, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"28:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z9.s, #0x0\n"
- "whilelt p1.b, x19, x10\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
@@ -622,219 +613,210 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"29:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"30:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 31f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "cbnz x27, 32f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 32f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
"b 32f\n"
"31:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"32:" // Height 3: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 34f\n"
"33:" // Height 3: Multiply loop: Main loop head
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z21.b, z2.b[0]\n"
+ "sdot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z16.s, z21.b, z0.b[0]\n"
+ "sdot z9.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[0]\n"
+ "sdot z17.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "cmp x27, #0x10\n"
+ "sdot z10.s, z21.b, z2.b[0]\n"
+ "sdot z14.s, z21.b, z1.b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "sdot z18.s, z21.b, z0.b[0]\n"
+ "sdot z11.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z15.s, z20.b, z1.b[0]\n"
+ "sdot z19.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sdot z8.s, z21.b, z2.b[1]\n"
+ "sdot z12.s, z21.b, z1.b[1]\n"
+ "sdot z16.s, z21.b, z0.b[1]\n"
+ "sdot z9.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[1]\n"
+ "sdot z17.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "sdot z10.s, z21.b, z2.b[1]\n"
+ "sdot z14.s, z21.b, z1.b[1]\n"
+ "sdot z18.s, z21.b, z0.b[1]\n"
+ "sdot z11.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "sdot z15.s, z20.b, z1.b[1]\n"
+ "sdot z19.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "sdot z8.s, z21.b, z2.b[2]\n"
+ "sdot z12.s, z21.b, z1.b[2]\n"
+ "sdot z16.s, z21.b, z0.b[2]\n"
+ "sdot z9.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[2]\n"
+ "sdot z17.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "sdot z10.s, z21.b, z2.b[2]\n"
+ "sdot z14.s, z21.b, z1.b[2]\n"
+ "sdot z18.s, z21.b, z0.b[2]\n"
+ "sdot z11.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "sdot z15.s, z20.b, z1.b[2]\n"
+ "sdot z19.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ "sdot z8.s, z21.b, z2.b[3]\n"
+ "sdot z12.s, z21.b, z1.b[3]\n"
+ "sdot z16.s, z21.b, z0.b[3]\n"
+ "sdot z9.s, z20.b, z2.b[3]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[3]\n"
+ "sdot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sdot z10.s, z21.b, z2.b[3]\n"
+ "sdot z14.s, z21.b, z1.b[3]\n"
+ "sdot z18.s, z21.b, z0.b[3]\n"
+ "sdot z11.s, z20.b, z2.b[3]\n"
+ "sdot z15.s, z20.b, z1.b[3]\n"
+ "sdot z19.s, z20.b, z0.b[3]\n"
"bgt 33b\n"
"34:" // Height 3: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "add x23, x23, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z21.b, z0.b[0]\n"
+ "sdot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z16.s, z21.b, z2.b[0]\n"
+ "sdot z9.s, z20.b, z0.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[0]\n"
+ "sdot z17.s, z20.b, z2.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z21.b, z0.b[0]\n"
+ "sdot z14.s, z21.b, z1.b[0]\n"
+ "sdot z18.s, z21.b, z2.b[0]\n"
+ "sdot z11.s, z20.b, z0.b[0]\n"
+ "sdot z15.s, z20.b, z1.b[0]\n"
+ "sdot z19.s, z20.b, z2.b[0]\n"
"ble 35f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z21.b, z0.b[1]\n"
+ "sdot z12.s, z21.b, z1.b[1]\n"
+ "sdot z16.s, z21.b, z2.b[1]\n"
+ "sdot z9.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z13.s, z20.b, z1.b[1]\n"
+ "sdot z17.s, z20.b, z2.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z21.b, z0.b[1]\n"
+ "sdot z14.s, z21.b, z1.b[1]\n"
+ "sdot z18.s, z21.b, z2.b[1]\n"
+ "sdot z11.s, z20.b, z0.b[1]\n"
+ "sdot z15.s, z20.b, z1.b[1]\n"
+ "sdot z19.s, z20.b, z2.b[1]\n"
"ble 35f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z21.b, z0.b[2]\n"
+ "sdot z12.s, z21.b, z1.b[2]\n"
+ "sdot z16.s, z21.b, z2.b[2]\n"
+ "sdot z9.s, z20.b, z0.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z13.s, z20.b, z1.b[2]\n"
+ "sdot z17.s, z20.b, z2.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z21.b, z0.b[2]\n"
+ "sdot z14.s, z21.b, z1.b[2]\n"
+ "sdot z18.s, z21.b, z2.b[2]\n"
+ "sdot z11.s, z20.b, z0.b[2]\n"
+ "sdot z15.s, z20.b, z1.b[2]\n"
+ "sdot z19.s, z20.b, z2.b[2]\n"
"ble 35f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
+ "ld1b { z21.b }, p2/Z, [x9]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z21.b, z0.b[3]\n"
+ "sdot z12.s, z21.b, z1.b[3]\n"
+ "sdot z16.s, z21.b, z2.b[3]\n"
+ "sdot z9.s, z20.b, z0.b[3]\n"
+ "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[3]\n"
+ "sdot z17.s, z20.b, z2.b[3]\n"
+ "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z21.b, z0.b[3]\n"
+ "sdot z14.s, z21.b, z1.b[3]\n"
+ "sdot z18.s, z21.b, z2.b[3]\n"
+ "sdot z11.s, z20.b, z0.b[3]\n"
+ "sdot z15.s, z20.b, z1.b[3]\n"
+ "sdot z19.s, z20.b, z2.b[3]\n"
"35:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 30b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z0.s }, p2/Z, [x11]\n"
- "add z8.s, z8.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
- "add x23, x9, x19\n"
- "add z12.s, z12.s, z0.s\n"
- "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
- "add x22, x23, x19\n"
- "add z16.s, z16.s, z0.s\n"
- "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "add z9.s, z9.s, z1.s\n"
- "add z13.s, z13.s, z1.s\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
- "add z14.s, z14.s, z2.s\n"
- "add z15.s, z15.s, z3.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z23.s }, p2/Z, [x14]\n"
+ "add x26, x11, x20\n"
+ "add x25, x26, x20\n"
+ "ld1w { z22.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "add z8.s, z8.s, z23.s\n"
+ "add z9.s, z9.s, z22.s\n"
+ "ld1w { z20.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add z10.s, z10.s, z21.s\n"
+ "add z11.s, z11.s, z20.s\n"
+ "addvl x14, x14, #4\n"
+ "add z12.s, z12.s, z23.s\n"
+ "add z13.s, z13.s, z22.s\n"
+ "add z14.s, z14.s, z21.s\n"
+ "add z15.s, z15.s, z20.s\n"
+ "add z16.s, z16.s, z23.s\n"
+ "add z17.s, z17.s, z22.s\n"
+ "add z18.s, z18.s, z21.s\n"
+ "add z19.s, z19.s, z20.s\n"
"tbz %x[flags], #4, 36f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -843,20 +825,20 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
"ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
"addvl x13, x13, #4\n"
"b 37f\n"
"36:" // Height 3: per layer parameters
- "add x24, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x24]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z4.s }, p2/Z, [x24]\n"
- "mov z2.d, z0.d\n"
- "mov z3.d, z0.d\n"
"mov z5.d, z4.d\n"
+ "mov z2.d, z0.d\n"
"mov z6.d, z4.d\n"
+ "mov z3.d, z0.d\n"
"mov z7.d, z4.d\n"
"37:" // Height 3: parameters loaded
".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
@@ -872,127 +854,127 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
"tbz %x[flags], #5, 38f\n"
- "and z4.d, z8.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- "and z4.d, z12.d, z0.d\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "and z5.d, z13.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z6.d, z14.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z12.s, z12.s, z4.s\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z13.s, z13.s, z5.s\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z14.s, z14.s, z6.s\n"
- "and z5.d, z17.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z15.s, z15.s, z7.s\n"
- "and z6.d, z18.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z7.d, z19.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
+ "and z23.d, z8.d, z0.d\n"
+ "and z22.d, z9.d, z1.d\n"
+ "and z21.d, z10.d, z2.d\n"
+ "and z20.d, z11.d, z3.d\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z23.s\n"
+ "sqadd z9.s, z9.s, z22.s\n"
+ "sqadd z10.s, z10.s, z21.s\n"
+ "sqadd z11.s, z11.s, z20.s\n"
+ "and z23.d, z12.d, z0.d\n"
+ "and z22.d, z13.d, z1.d\n"
+ "and z21.d, z14.d, z2.d\n"
+ "and z20.d, z15.d, z3.d\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z23.s\n"
+ "sqadd z13.s, z13.s, z22.s\n"
+ "sqadd z14.s, z14.s, z21.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "and z23.d, z16.d, z0.d\n"
+ "and z22.d, z17.d, z1.d\n"
+ "and z21.d, z18.d, z2.d\n"
+ "and z20.d, z19.d, z3.d\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z23.s\n"
+ "sqadd z17.s, z17.s, z22.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z19.s, z19.s, z20.s\n"
"38:" // Height 3: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x24]\n"
+ "add z8.s, z8.s, z21.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
- "add x24, %x[qp], %[minval]\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "ld1rw { z5.s }, p2/Z, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
+ "add z9.s, z9.s, z21.s\n"
+ "add z10.s, z10.s, z21.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "ld1rw { z6.s }, p2/Z, [x24]\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
- "add z8.s, z8.s, z4.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
- "add z11.s, z11.s, z4.s\n"
- "add z12.s, z12.s, z4.s\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "uzp1 z8.h, z8.h, z9.h\n"
+ "add z11.s, z11.s, z21.s\n"
+ "add z12.s, z12.s, z21.s\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z8.b }, p1, [x9]\n"
- "add z13.s, z13.s, z4.s\n"
- "addvl x9, x9, #1\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ "add z13.s, z13.s, z21.s\n"
+ "add z14.s, z14.s, z21.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
+ "add z15.s, z15.s, z21.s\n"
+ "add z16.s, z16.s, z21.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
- "add z14.s, z14.s, z4.s\n"
- "add z15.s, z15.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z4.s\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "uzp1 z12.h, z12.h, z13.h\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "uzp1 z13.h, z14.h, z15.h\n"
+ "add z17.s, z17.s, z21.s\n"
+ "add z18.s, z18.s, z21.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p1, [x23]\n"
- "add z18.s, z18.s, z4.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "add z19.s, z19.s, z4.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z21.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z20.s\n"
+ "smin z9.s, p2/M, z9.s, z20.s\n"
+ "smin z10.s, p2/M, z10.s, z20.s\n"
+ "smin z11.s, p2/M, z11.s, z20.s\n"
+ "smin z12.s, p2/M, z12.s, z20.s\n"
+ "smin z13.s, p2/M, z13.s, z20.s\n"
+ "smin z14.s, p2/M, z14.s, z20.s\n"
+ "smin z15.s, p2/M, z15.s, z20.s\n"
+ "smin z16.s, p2/M, z16.s, z20.s\n"
+ "smin z17.s, p2/M, z17.s, z20.s\n"
+ "smin z18.s, p2/M, z18.s, z20.s\n"
+ "smin z19.s, p2/M, z19.s, z20.s\n"
+ "smax z8.s, p2/M, z8.s, z21.s\n"
+ "smax z9.s, p2/M, z9.s, z21.s\n"
+ "smax z10.s, p2/M, z10.s, z21.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "smax z11.s, p2/M, z11.s, z21.s\n"
+ "smax z12.s, p2/M, z12.s, z21.s\n"
+ "uzp1 z20.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z20.b\n"
+ "smax z13.s, p2/M, z13.s, z21.s\n"
+ "smax z14.s, p2/M, z14.s, z21.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "st1b { z8.b }, p1, [x11]\n"
+ "smax z15.s, p2/M, z15.s, z21.s\n"
+ "smax z16.s, p2/M, z16.s, z21.s\n"
+ "uzp1 z20.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z21.s\n"
+ "smax z18.s, p2/M, z18.s, z21.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
+ "st1b { z12.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z21.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x22]\n"
+ "st1b { z16.b }, p1, [x25]\n"
+ "addvl x11, x11, #1\n"
"39:" // Height 3: Writeback done
"decw x10, ALL, MUL #4\n"
"cmp x10, XZR\n"
"bgt 28b\n"
"b 80f\n"
"40:" // Height 4
+ "mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x11, %x[col_bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x9, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"41:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z9.s, #0x0\n"
- "whilelt p1.b, x19, x10\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
@@ -1008,265 +990,253 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"42:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"43:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 44f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "cbnz x27, 45f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 45f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
"b 45f\n"
"44:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"45:" // Height 4: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 47f\n"
"46:" // Height 4: Multiply loop: Main loop head
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z3.b }, p0/Z, [x26]\n"
+ "ld1rqb { z2.b }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[0]\n"
+ "sdot z12.s, z25.b, z2.b[0]\n"
+ "sdot z16.s, z25.b, z1.b[0]\n"
+ "sdot z20.s, z25.b, z0.b[0]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "sdot z9.s, z24.b, z3.b[0]\n"
+ "sdot z13.s, z24.b, z2.b[0]\n"
"add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
"add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z17.s, z24.b, z1.b[0]\n"
+ "sdot z21.s, z24.b, z0.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z25.b, z3.b[0]\n"
+ "sdot z14.s, z25.b, z2.b[0]\n"
+ "sdot z18.s, z25.b, z1.b[0]\n"
+ "sdot z22.s, z25.b, z0.b[0]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "sdot z11.s, z24.b, z3.b[0]\n"
+ "sdot z15.s, z24.b, z2.b[0]\n"
+ "sdot z19.s, z24.b, z1.b[0]\n"
+ "sdot z23.s, z24.b, z0.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[1]\n"
+ "sdot z12.s, z25.b, z2.b[1]\n"
+ "sdot z16.s, z25.b, z1.b[1]\n"
+ "sdot z20.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "sdot z9.s, z24.b, z3.b[1]\n"
+ "sdot z13.s, z24.b, z2.b[1]\n"
+ "sdot z17.s, z24.b, z1.b[1]\n"
+ "sdot z21.s, z24.b, z0.b[1]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "sdot z10.s, z25.b, z3.b[1]\n"
+ "sdot z14.s, z25.b, z2.b[1]\n"
+ "sdot z18.s, z25.b, z1.b[1]\n"
+ "sdot z22.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "sdot z11.s, z24.b, z3.b[1]\n"
+ "sdot z15.s, z24.b, z2.b[1]\n"
+ "sdot z19.s, z24.b, z1.b[1]\n"
+ "sdot z23.s, z24.b, z0.b[1]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[2]\n"
+ "sdot z12.s, z25.b, z2.b[2]\n"
+ "sdot z16.s, z25.b, z1.b[2]\n"
+ "sdot z20.s, z25.b, z0.b[2]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "sdot z9.s, z24.b, z3.b[2]\n"
+ "sdot z13.s, z24.b, z2.b[2]\n"
+ "sdot z17.s, z24.b, z1.b[2]\n"
+ "sdot z21.s, z24.b, z0.b[2]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "sdot z10.s, z25.b, z3.b[2]\n"
+ "sdot z14.s, z25.b, z2.b[2]\n"
+ "sdot z18.s, z25.b, z1.b[2]\n"
+ "sdot z22.s, z25.b, z0.b[2]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "sdot z11.s, z24.b, z3.b[2]\n"
+ "sdot z15.s, z24.b, z2.b[2]\n"
+ "sdot z19.s, z24.b, z1.b[2]\n"
+ "sdot z23.s, z24.b, z0.b[2]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[3]\n"
+ "sdot z12.s, z25.b, z2.b[3]\n"
+ "sdot z16.s, z25.b, z1.b[3]\n"
+ "sdot z20.s, z25.b, z0.b[3]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "sdot z9.s, z24.b, z3.b[3]\n"
+ "sdot z13.s, z24.b, z2.b[3]\n"
+ "sdot z17.s, z24.b, z1.b[3]\n"
+ "sdot z21.s, z24.b, z0.b[3]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sdot z10.s, z25.b, z3.b[3]\n"
+ "sdot z14.s, z25.b, z2.b[3]\n"
+ "sdot z18.s, z25.b, z1.b[3]\n"
+ "sdot z22.s, z25.b, z0.b[3]\n"
+ "sdot z11.s, z24.b, z3.b[3]\n"
+ "sdot z15.s, z24.b, z2.b[3]\n"
+ "sdot z19.s, z24.b, z1.b[3]\n"
+ "sdot z23.s, z24.b, z0.b[3]\n"
"bgt 46b\n"
"47:" // Height 4: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "add x22, x22, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[0]\n"
+ "sdot z12.s, z25.b, z1.b[0]\n"
+ "sdot z16.s, z25.b, z2.b[0]\n"
+ "sdot z20.s, z25.b, z3.b[0]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[0]\n"
+ "sdot z13.s, z24.b, z1.b[0]\n"
+ "sdot z17.s, z24.b, z2.b[0]\n"
+ "sdot z21.s, z24.b, z3.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z25.b, z0.b[0]\n"
+ "sdot z14.s, z25.b, z1.b[0]\n"
+ "sdot z18.s, z25.b, z2.b[0]\n"
+ "sdot z22.s, z25.b, z3.b[0]\n"
+ "sdot z11.s, z24.b, z0.b[0]\n"
+ "sdot z15.s, z24.b, z1.b[0]\n"
+ "sdot z19.s, z24.b, z2.b[0]\n"
+ "sdot z23.s, z24.b, z3.b[0]\n"
"ble 48f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[1]\n"
+ "sdot z12.s, z25.b, z1.b[1]\n"
+ "sdot z16.s, z25.b, z2.b[1]\n"
+ "sdot z20.s, z25.b, z3.b[1]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z9.s, z24.b, z0.b[1]\n"
+ "sdot z13.s, z24.b, z1.b[1]\n"
+ "sdot z17.s, z24.b, z2.b[1]\n"
+ "sdot z21.s, z24.b, z3.b[1]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z25.b, z0.b[1]\n"
+ "sdot z14.s, z25.b, z1.b[1]\n"
+ "sdot z18.s, z25.b, z2.b[1]\n"
+ "sdot z22.s, z25.b, z3.b[1]\n"
+ "sdot z11.s, z24.b, z0.b[1]\n"
+ "sdot z15.s, z24.b, z1.b[1]\n"
+ "sdot z19.s, z24.b, z2.b[1]\n"
+ "sdot z23.s, z24.b, z3.b[1]\n"
"ble 48f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[2]\n"
+ "sdot z12.s, z25.b, z1.b[2]\n"
+ "sdot z16.s, z25.b, z2.b[2]\n"
+ "sdot z20.s, z25.b, z3.b[2]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z9.s, z24.b, z0.b[2]\n"
+ "sdot z13.s, z24.b, z1.b[2]\n"
+ "sdot z17.s, z24.b, z2.b[2]\n"
+ "sdot z21.s, z24.b, z3.b[2]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z25.b, z0.b[2]\n"
+ "sdot z14.s, z25.b, z1.b[2]\n"
+ "sdot z18.s, z25.b, z2.b[2]\n"
+ "sdot z22.s, z25.b, z3.b[2]\n"
+ "sdot z11.s, z24.b, z0.b[2]\n"
+ "sdot z15.s, z24.b, z1.b[2]\n"
+ "sdot z19.s, z24.b, z2.b[2]\n"
+ "sdot z23.s, z24.b, z3.b[2]\n"
"ble 48f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[3]\n"
+ "sdot z12.s, z25.b, z1.b[3]\n"
+ "sdot z16.s, z25.b, z2.b[3]\n"
+ "sdot z20.s, z25.b, z3.b[3]\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[3]\n"
+ "sdot z13.s, z24.b, z1.b[3]\n"
+ "sdot z17.s, z24.b, z2.b[3]\n"
+ "sdot z21.s, z24.b, z3.b[3]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z25.b, z0.b[3]\n"
+ "sdot z14.s, z25.b, z1.b[3]\n"
+ "sdot z18.s, z25.b, z2.b[3]\n"
+ "sdot z22.s, z25.b, z3.b[3]\n"
+ "sdot z11.s, z24.b, z0.b[3]\n"
+ "sdot z15.s, z24.b, z1.b[3]\n"
+ "sdot z19.s, z24.b, z2.b[3]\n"
+ "sdot z23.s, z24.b, z3.b[3]\n"
"48:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 43b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z0.s }, p2/Z, [x11]\n"
- "add z8.s, z8.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
- "add x23, x9, x19\n"
- "add z12.s, z12.s, z0.s\n"
- "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
- "add x22, x23, x19\n"
- "add z16.s, z16.s, z0.s\n"
- "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
- "add x21, x22, x19\n"
- "add z9.s, z9.s, z1.s\n"
- "addvl x11, x11, #4\n"
- "add z13.s, z13.s, z1.s\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
- "add z14.s, z14.s, z2.s\n"
- "add z15.s, z15.s, z3.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z27.s }, p2/Z, [x14]\n"
+ "add x26, x11, x20\n"
+ "add x25, x26, x20\n"
+ "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "add x24, x25, x20\n"
+ "add z8.s, z8.s, z27.s\n"
+ "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add z9.s, z9.s, z26.s\n"
+ "add z10.s, z10.s, z25.s\n"
+ "addvl x14, x14, #4\n"
+ "add z11.s, z11.s, z24.s\n"
+ "add z12.s, z12.s, z27.s\n"
+ "add z13.s, z13.s, z26.s\n"
+ "add z14.s, z14.s, z25.s\n"
+ "add z15.s, z15.s, z24.s\n"
+ "add z16.s, z16.s, z27.s\n"
+ "add z17.s, z17.s, z26.s\n"
+ "add z18.s, z18.s, z25.s\n"
+ "add z19.s, z19.s, z24.s\n"
+ "add z20.s, z20.s, z27.s\n"
+ "add z21.s, z21.s, z26.s\n"
+ "add z22.s, z22.s, z25.s\n"
+ "add z23.s, z23.s, z24.s\n"
"tbz %x[flags], #4, 49f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -1275,20 +1245,20 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
"ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
"addvl x13, x13, #4\n"
"b 50f\n"
"49:" // Height 4: per layer parameters
- "add x24, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x24]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z4.s }, p2/Z, [x24]\n"
- "mov z2.d, z0.d\n"
- "mov z3.d, z0.d\n"
"mov z5.d, z4.d\n"
+ "mov z2.d, z0.d\n"
"mov z6.d, z4.d\n"
+ "mov z3.d, z0.d\n"
"mov z7.d, z4.d\n"
"50:" // Height 4: parameters loaded
".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
@@ -1308,159 +1278,159 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n"
".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n"
"tbz %x[flags], #5, 51f\n"
- "and z4.d, z8.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- "and z4.d, z12.d, z0.d\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "and z5.d, z13.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z6.d, z14.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z12.s, z12.s, z4.s\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z13.s, z13.s, z5.s\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z14.s, z14.s, z6.s\n"
- "and z5.d, z17.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z15.s, z15.s, z7.s\n"
- "and z6.d, z18.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z7.d, z19.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "and z4.d, z20.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "and z5.d, z21.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "and z6.d, z22.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z20.s, z20.s, z4.s\n"
- "and z7.d, z23.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z21.s, z21.s, z5.s\n"
- "sqadd z22.s, z22.s, z6.s\n"
- "sqadd z23.s, z23.s, z7.s\n"
+ "and z27.d, z8.d, z0.d\n"
+ "and z26.d, z9.d, z1.d\n"
+ "and z25.d, z10.d, z2.d\n"
+ "and z24.d, z11.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z27.s\n"
+ "sqadd z9.s, z9.s, z26.s\n"
+ "sqadd z10.s, z10.s, z25.s\n"
+ "sqadd z11.s, z11.s, z24.s\n"
+ "and z27.d, z12.d, z0.d\n"
+ "and z26.d, z13.d, z1.d\n"
+ "and z25.d, z14.d, z2.d\n"
+ "and z24.d, z15.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z27.s\n"
+ "sqadd z13.s, z13.s, z26.s\n"
+ "sqadd z14.s, z14.s, z25.s\n"
+ "sqadd z15.s, z15.s, z24.s\n"
+ "and z27.d, z16.d, z0.d\n"
+ "and z26.d, z17.d, z1.d\n"
+ "and z25.d, z18.d, z2.d\n"
+ "and z24.d, z19.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z27.s\n"
+ "sqadd z17.s, z17.s, z26.s\n"
+ "sqadd z18.s, z18.s, z25.s\n"
+ "sqadd z19.s, z19.s, z24.s\n"
+ "and z27.d, z20.d, z0.d\n"
+ "and z26.d, z21.d, z1.d\n"
+ "and z25.d, z22.d, z2.d\n"
+ "and z24.d, z23.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z27.s\n"
+ "sqadd z21.s, z21.s, z26.s\n"
+ "sqadd z22.s, z22.s, z25.s\n"
+ "sqadd z23.s, z23.s, z24.s\n"
"51:" // Height 4: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x24]\n"
+ "add z8.s, z8.s, z25.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
- "add x24, %x[qp], %[minval]\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "ld1rw { z5.s }, p2/Z, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
+ "add z9.s, z9.s, z25.s\n"
+ "add z10.s, z10.s, z25.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "ld1rw { z6.s }, p2/Z, [x24]\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
- "add z8.s, z8.s, z4.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
- "add z11.s, z11.s, z4.s\n"
- "add z12.s, z12.s, z4.s\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "uzp1 z8.h, z8.h, z9.h\n"
+ "add z11.s, z11.s, z25.s\n"
+ "add z12.s, z12.s, z25.s\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z8.b }, p1, [x9]\n"
- "add z13.s, z13.s, z4.s\n"
- "addvl x9, x9, #1\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ "add z13.s, z13.s, z25.s\n"
+ "add z14.s, z14.s, z25.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
+ "add z15.s, z15.s, z25.s\n"
+ "add z16.s, z16.s, z25.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
- "add z14.s, z14.s, z4.s\n"
- "add z15.s, z15.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z4.s\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "uzp1 z12.h, z12.h, z13.h\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "uzp1 z13.h, z14.h, z15.h\n"
+ "add z17.s, z17.s, z25.s\n"
+ "add z18.s, z18.s, z25.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p1, [x23]\n"
- "add z18.s, z18.s, z4.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "add z19.s, z19.s, z4.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "add z20.s, z20.s, z4.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "add z19.s, z19.s, z25.s\n"
+ "add z20.s, z20.s, z25.s\n"
".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
- "add z21.s, z21.s, z4.s\n"
+ "add z21.s, z21.s, z25.s\n"
+ "add z22.s, z22.s, z25.s\n"
+ ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "add z23.s, z23.s, z25.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z24.s\n"
+ "smin z9.s, p2/M, z9.s, z24.s\n"
+ "smin z10.s, p2/M, z10.s, z24.s\n"
+ "smin z11.s, p2/M, z11.s, z24.s\n"
+ "smin z12.s, p2/M, z12.s, z24.s\n"
+ "smin z13.s, p2/M, z13.s, z24.s\n"
+ "smin z14.s, p2/M, z14.s, z24.s\n"
+ "smin z15.s, p2/M, z15.s, z24.s\n"
+ "smin z16.s, p2/M, z16.s, z24.s\n"
+ "smin z17.s, p2/M, z17.s, z24.s\n"
+ "smin z18.s, p2/M, z18.s, z24.s\n"
+ "smin z19.s, p2/M, z19.s, z24.s\n"
+ "smin z20.s, p2/M, z20.s, z24.s\n"
+ "smin z21.s, p2/M, z21.s, z24.s\n"
+ "smin z22.s, p2/M, z22.s, z24.s\n"
+ "smin z23.s, p2/M, z23.s, z24.s\n"
+ "smax z8.s, p2/M, z8.s, z25.s\n"
+ "smax z9.s, p2/M, z9.s, z25.s\n"
+ "smax z10.s, p2/M, z10.s, z25.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "smax z11.s, p2/M, z11.s, z25.s\n"
+ "smax z12.s, p2/M, z12.s, z25.s\n"
+ "uzp1 z24.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z24.b\n"
+ "smax z13.s, p2/M, z13.s, z25.s\n"
+ "smax z14.s, p2/M, z14.s, z25.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "st1b { z8.b }, p1, [x11]\n"
+ "smax z15.s, p2/M, z15.s, z25.s\n"
+ "smax z16.s, p2/M, z16.s, z25.s\n"
+ "uzp1 z24.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z24.b\n"
+ "smax z17.s, p2/M, z17.s, z25.s\n"
+ "smax z18.s, p2/M, z18.s, z25.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z12.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z25.s\n"
+ "smax z20.s, p2/M, z20.s, z25.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "add z22.s, z22.s, z4.s\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x22]\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "add z23.s, z23.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z25.s\n"
+ "smax z22.s, p2/M, z22.s, z25.s\n"
"uzp1 z20.h, z20.h, z21.h\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x21]\n"
+ "st1b { z16.b }, p1, [x25]\n"
+ "smax z23.s, p2/M, z23.s, z25.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "st1b { z20.b }, p1, [x24]\n"
+ "addvl x11, x11, #1\n"
"52:" // Height 4: Writeback done
"decw x10, ALL, MUL #4\n"
"cmp x10, XZR\n"
"bgt 41b\n"
"b 80f\n"
"53:" // Height 5
+ "mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x11, %x[col_bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x9, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"54:" // Height 5: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z9.s, #0x0\n"
- "whilelt p1.b, x19, x10\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
@@ -1480,311 +1450,296 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
"55:" // Height 5: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"56:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 57f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "cbnz x27, 58f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 58f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
"b 58f\n"
"57:" // Height 5: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"58:" // Height 5: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 60f\n"
"59:" // Height 5: Multiply loop: Main loop head
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z4.b }, p0/Z, [x26]\n"
+ "ld1rqb { z3.b }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x22]\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z29.b, z4.b[0]\n"
+ "sdot z12.s, z29.b, z3.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z16.s, z29.b, z2.b[0]\n"
+ "sdot z20.s, z29.b, z1.b[0]\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "sdot z24.s, z29.b, z0.b[0]\n"
+ "sdot z9.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
"add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "sdot z13.s, z28.b, z3.b[0]\n"
+ "sdot z17.s, z28.b, z2.b[0]\n"
"add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x21, x21, #0x10\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z21.s, z28.b, z1.b[0]\n"
+ "sdot z25.s, z28.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z29.b, z4.b[0]\n"
+ "sdot z14.s, z29.b, z3.b[0]\n"
+ "sdot z18.s, z29.b, z2.b[0]\n"
+ "sdot z22.s, z29.b, z1.b[0]\n"
+ "sdot z26.s, z29.b, z0.b[0]\n"
+ "sdot z11.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "sdot z15.s, z28.b, z3.b[0]\n"
+ "sdot z19.s, z28.b, z2.b[0]\n"
+ "sdot z23.s, z28.b, z1.b[0]\n"
+ "sdot z27.s, z28.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sdot z8.s, z29.b, z4.b[1]\n"
+ "sdot z12.s, z29.b, z3.b[1]\n"
+ "sdot z16.s, z29.b, z2.b[1]\n"
+ "sdot z20.s, z29.b, z1.b[1]\n"
+ "sdot z24.s, z29.b, z0.b[1]\n"
+ "sdot z9.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "sdot z13.s, z28.b, z3.b[1]\n"
+ "sdot z17.s, z28.b, z2.b[1]\n"
+ "sdot z21.s, z28.b, z1.b[1]\n"
+ "sdot z25.s, z28.b, z0.b[1]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "sdot z10.s, z29.b, z4.b[1]\n"
+ "sdot z14.s, z29.b, z3.b[1]\n"
+ "sdot z18.s, z29.b, z2.b[1]\n"
+ "sdot z22.s, z29.b, z1.b[1]\n"
+ "sdot z26.s, z29.b, z0.b[1]\n"
+ "sdot z11.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "sdot z15.s, z28.b, z3.b[1]\n"
+ "sdot z19.s, z28.b, z2.b[1]\n"
+ "sdot z23.s, z28.b, z1.b[1]\n"
+ "sdot z27.s, z28.b, z0.b[1]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "sdot z8.s, z29.b, z4.b[2]\n"
+ "sdot z12.s, z29.b, z3.b[2]\n"
+ "sdot z16.s, z29.b, z2.b[2]\n"
+ "sdot z20.s, z29.b, z1.b[2]\n"
+ "sdot z24.s, z29.b, z0.b[2]\n"
+ "sdot z9.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "sdot z13.s, z28.b, z3.b[2]\n"
+ "sdot z17.s, z28.b, z2.b[2]\n"
+ "sdot z21.s, z28.b, z1.b[2]\n"
+ "sdot z25.s, z28.b, z0.b[2]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "sdot z10.s, z29.b, z4.b[2]\n"
+ "sdot z14.s, z29.b, z3.b[2]\n"
+ "sdot z18.s, z29.b, z2.b[2]\n"
+ "sdot z22.s, z29.b, z1.b[2]\n"
+ "sdot z26.s, z29.b, z0.b[2]\n"
+ "sdot z11.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "sdot z15.s, z28.b, z3.b[2]\n"
+ "sdot z19.s, z28.b, z2.b[2]\n"
+ "sdot z23.s, z28.b, z1.b[2]\n"
+ "sdot z27.s, z28.b, z0.b[2]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ "sdot z8.s, z29.b, z4.b[3]\n"
+ "sdot z12.s, z29.b, z3.b[3]\n"
+ "sdot z16.s, z29.b, z2.b[3]\n"
+ "sdot z20.s, z29.b, z1.b[3]\n"
+ "sdot z24.s, z29.b, z0.b[3]\n"
+ "sdot z9.s, z28.b, z4.b[3]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "sdot z13.s, z28.b, z3.b[3]\n"
+ "sdot z17.s, z28.b, z2.b[3]\n"
+ "sdot z21.s, z28.b, z1.b[3]\n"
+ "sdot z25.s, z28.b, z0.b[3]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sdot z10.s, z29.b, z4.b[3]\n"
+ "sdot z14.s, z29.b, z3.b[3]\n"
+ "sdot z18.s, z29.b, z2.b[3]\n"
+ "sdot z22.s, z29.b, z1.b[3]\n"
+ "sdot z26.s, z29.b, z0.b[3]\n"
+ "sdot z11.s, z28.b, z4.b[3]\n"
+ "sdot z15.s, z28.b, z3.b[3]\n"
+ "sdot z19.s, z28.b, z2.b[3]\n"
+ "sdot z23.s, z28.b, z1.b[3]\n"
+ "sdot z27.s, z28.b, z0.b[3]\n"
"bgt 59b\n"
"60:" // Height 5: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "add x21, x21, #0x10\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z29.b, z0.b[0]\n"
+ "sdot z12.s, z29.b, z1.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z16.s, z29.b, z2.b[0]\n"
+ "sdot z20.s, z29.b, z3.b[0]\n"
+ "sdot z24.s, z29.b, z4.b[0]\n"
+ "sdot z9.s, z28.b, z0.b[0]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[0]\n"
+ "sdot z17.s, z28.b, z2.b[0]\n"
+ "sdot z21.s, z28.b, z3.b[0]\n"
+ "sdot z25.s, z28.b, z4.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z29.b, z0.b[0]\n"
+ "sdot z14.s, z29.b, z1.b[0]\n"
+ "sdot z18.s, z29.b, z2.b[0]\n"
+ "sdot z22.s, z29.b, z3.b[0]\n"
+ "sdot z26.s, z29.b, z4.b[0]\n"
+ "sdot z11.s, z28.b, z0.b[0]\n"
+ "sdot z15.s, z28.b, z1.b[0]\n"
+ "sdot z19.s, z28.b, z2.b[0]\n"
+ "sdot z23.s, z28.b, z3.b[0]\n"
+ "sdot z27.s, z28.b, z4.b[0]\n"
"ble 61f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z29.b, z0.b[1]\n"
+ "sdot z12.s, z29.b, z1.b[1]\n"
+ "sdot z16.s, z29.b, z2.b[1]\n"
+ "sdot z20.s, z29.b, z3.b[1]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z24.s, z29.b, z4.b[1]\n"
+ "sdot z9.s, z28.b, z0.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[1]\n"
+ "sdot z17.s, z28.b, z2.b[1]\n"
+ "sdot z21.s, z28.b, z3.b[1]\n"
+ "sdot z25.s, z28.b, z4.b[1]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z29.b, z0.b[1]\n"
+ "sdot z14.s, z29.b, z1.b[1]\n"
+ "sdot z18.s, z29.b, z2.b[1]\n"
+ "sdot z22.s, z29.b, z3.b[1]\n"
+ "sdot z26.s, z29.b, z4.b[1]\n"
+ "sdot z11.s, z28.b, z0.b[1]\n"
+ "sdot z15.s, z28.b, z1.b[1]\n"
+ "sdot z19.s, z28.b, z2.b[1]\n"
+ "sdot z23.s, z28.b, z3.b[1]\n"
+ "sdot z27.s, z28.b, z4.b[1]\n"
"ble 61f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z29.b, z0.b[2]\n"
+ "sdot z12.s, z29.b, z1.b[2]\n"
+ "sdot z16.s, z29.b, z2.b[2]\n"
+ "sdot z20.s, z29.b, z3.b[2]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z24.s, z29.b, z4.b[2]\n"
+ "sdot z9.s, z28.b, z0.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[2]\n"
+ "sdot z17.s, z28.b, z2.b[2]\n"
+ "sdot z21.s, z28.b, z3.b[2]\n"
+ "sdot z25.s, z28.b, z4.b[2]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z29.b, z0.b[2]\n"
+ "sdot z14.s, z29.b, z1.b[2]\n"
+ "sdot z18.s, z29.b, z2.b[2]\n"
+ "sdot z22.s, z29.b, z3.b[2]\n"
+ "sdot z26.s, z29.b, z4.b[2]\n"
+ "sdot z11.s, z28.b, z0.b[2]\n"
+ "sdot z15.s, z28.b, z1.b[2]\n"
+ "sdot z19.s, z28.b, z2.b[2]\n"
+ "sdot z23.s, z28.b, z3.b[2]\n"
+ "sdot z27.s, z28.b, z4.b[2]\n"
"ble 61f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
+ "ld1b { z29.b }, p2/Z, [x9]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z29.b, z0.b[3]\n"
+ "sdot z12.s, z29.b, z1.b[3]\n"
+ "sdot z16.s, z29.b, z2.b[3]\n"
+ "sdot z20.s, z29.b, z3.b[3]\n"
+ "sdot z24.s, z29.b, z4.b[3]\n"
+ "sdot z9.s, z28.b, z0.b[3]\n"
+ "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[3]\n"
+ "sdot z17.s, z28.b, z2.b[3]\n"
+ "sdot z21.s, z28.b, z3.b[3]\n"
+ "sdot z25.s, z28.b, z4.b[3]\n"
+ "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z29.b, z0.b[3]\n"
+ "sdot z14.s, z29.b, z1.b[3]\n"
+ "sdot z18.s, z29.b, z2.b[3]\n"
+ "sdot z22.s, z29.b, z3.b[3]\n"
+ "sdot z26.s, z29.b, z4.b[3]\n"
+ "sdot z11.s, z28.b, z0.b[3]\n"
+ "sdot z15.s, z28.b, z1.b[3]\n"
+ "sdot z19.s, z28.b, z2.b[3]\n"
+ "sdot z23.s, z28.b, z3.b[3]\n"
+ "sdot z27.s, z28.b, z4.b[3]\n"
"61:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 56b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z0.s }, p2/Z, [x11]\n"
- "add z8.s, z8.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
- "add x23, x9, x19\n"
- "add z12.s, z12.s, z0.s\n"
- "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
- "add x22, x23, x19\n"
- "add z16.s, z16.s, z0.s\n"
- "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
- "add x21, x22, x19\n"
- "add z9.s, z9.s, z1.s\n"
- "add x20, x21, x19\n"
- "add z13.s, z13.s, z1.s\n"
- "addvl x11, x11, #4\n"
- "add z10.s, z10.s, z2.s\n"
- "add z11.s, z11.s, z3.s\n"
- "add z14.s, z14.s, z2.s\n"
- "add z15.s, z15.s, z3.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x11, x20\n"
+ "ld1w { z31.s }, p2/Z, [x14]\n"
+ "add x25, x26, x20\n"
+ "ld1w { z30.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "add x24, x25, x20\n"
+ "add x23, x24, x20\n"
+ "ld1w { z28.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add z8.s, z8.s, z31.s\n"
+ "add z9.s, z9.s, z30.s\n"
+ "addvl x14, x14, #4\n"
+ "add z10.s, z10.s, z29.s\n"
+ "add z11.s, z11.s, z28.s\n"
+ "add z12.s, z12.s, z31.s\n"
+ "add z13.s, z13.s, z30.s\n"
+ "add z14.s, z14.s, z29.s\n"
+ "add z15.s, z15.s, z28.s\n"
+ "add z16.s, z16.s, z31.s\n"
+ "add z17.s, z17.s, z30.s\n"
+ "add z18.s, z18.s, z29.s\n"
+ "add z19.s, z19.s, z28.s\n"
+ "add z20.s, z20.s, z31.s\n"
+ "add z21.s, z21.s, z30.s\n"
+ "add z22.s, z22.s, z29.s\n"
+ "add z23.s, z23.s, z28.s\n"
+ "add z24.s, z24.s, z31.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "add z26.s, z26.s, z29.s\n"
+ "add z27.s, z27.s, z28.s\n"
"tbz %x[flags], #4, 62f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -1793,20 +1748,20 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
"ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
"addvl x13, x13, #4\n"
"b 63f\n"
"62:" // Height 5: per layer parameters
- "add x24, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x24]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z4.s }, p2/Z, [x24]\n"
- "mov z2.d, z0.d\n"
- "mov z3.d, z0.d\n"
"mov z5.d, z4.d\n"
+ "mov z2.d, z0.d\n"
"mov z6.d, z4.d\n"
+ "mov z3.d, z0.d\n"
"mov z7.d, z4.d\n"
"63:" // Height 5: parameters loaded
".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
@@ -1830,194 +1785,194 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n"
".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
"tbz %x[flags], #5, 64f\n"
- "and z4.d, z8.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- "and z4.d, z12.d, z0.d\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "and z5.d, z13.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z6.d, z14.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z12.s, z12.s, z4.s\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z13.s, z13.s, z5.s\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z14.s, z14.s, z6.s\n"
- "and z5.d, z17.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z15.s, z15.s, z7.s\n"
- "and z6.d, z18.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z7.d, z19.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "and z4.d, z20.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "and z5.d, z21.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "and z6.d, z22.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z20.s, z20.s, z4.s\n"
- "and z7.d, z23.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z21.s, z21.s, z5.s\n"
- "and z4.d, z24.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z22.s, z22.s, z6.s\n"
- "and z5.d, z25.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z23.s, z23.s, z7.s\n"
- "and z6.d, z26.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z24.s, z24.s, z4.s\n"
- "and z7.d, z27.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z25.s, z25.s, z5.s\n"
- "sqadd z26.s, z26.s, z6.s\n"
- "sqadd z27.s, z27.s, z7.s\n"
+ "and z31.d, z8.d, z0.d\n"
+ "and z30.d, z9.d, z1.d\n"
+ "and z29.d, z10.d, z2.d\n"
+ "and z28.d, z11.d, z3.d\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z31.s\n"
+ "sqadd z9.s, z9.s, z30.s\n"
+ "sqadd z10.s, z10.s, z29.s\n"
+ "sqadd z11.s, z11.s, z28.s\n"
+ "and z31.d, z12.d, z0.d\n"
+ "and z30.d, z13.d, z1.d\n"
+ "and z29.d, z14.d, z2.d\n"
+ "and z28.d, z15.d, z3.d\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z31.s\n"
+ "sqadd z13.s, z13.s, z30.s\n"
+ "sqadd z14.s, z14.s, z29.s\n"
+ "sqadd z15.s, z15.s, z28.s\n"
+ "and z31.d, z16.d, z0.d\n"
+ "and z30.d, z17.d, z1.d\n"
+ "and z29.d, z18.d, z2.d\n"
+ "and z28.d, z19.d, z3.d\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z31.s\n"
+ "sqadd z17.s, z17.s, z30.s\n"
+ "sqadd z18.s, z18.s, z29.s\n"
+ "sqadd z19.s, z19.s, z28.s\n"
+ "and z31.d, z20.d, z0.d\n"
+ "and z30.d, z21.d, z1.d\n"
+ "and z29.d, z22.d, z2.d\n"
+ "and z28.d, z23.d, z3.d\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z31.s\n"
+ "sqadd z21.s, z21.s, z30.s\n"
+ "sqadd z22.s, z22.s, z29.s\n"
+ "sqadd z23.s, z23.s, z28.s\n"
+ "and z31.d, z24.d, z0.d\n"
+ "and z30.d, z25.d, z1.d\n"
+ "and z29.d, z26.d, z2.d\n"
+ "and z28.d, z27.d, z3.d\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z31.s\n"
+ "sqadd z25.s, z25.s, z30.s\n"
+ "sqadd z26.s, z26.s, z29.s\n"
+ "sqadd z27.s, z27.s, z28.s\n"
"64:" // Height 5: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x24]\n"
+ "add z8.s, z8.s, z29.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
- "add x24, %x[qp], %[minval]\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "ld1rw { z5.s }, p2/Z, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
+ "add z9.s, z9.s, z29.s\n"
+ "add z10.s, z10.s, z29.s\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "ld1rw { z6.s }, p2/Z, [x24]\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
- "add z8.s, z8.s, z4.s\n"
- "add z9.s, z9.s, z4.s\n"
- "add z10.s, z10.s, z4.s\n"
- "add z11.s, z11.s, z4.s\n"
- "add z12.s, z12.s, z4.s\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "uzp1 z8.h, z8.h, z9.h\n"
+ "add z11.s, z11.s, z29.s\n"
+ "add z12.s, z12.s, z29.s\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z8.b }, p1, [x9]\n"
- "add z13.s, z13.s, z4.s\n"
- "addvl x9, x9, #1\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ "add z13.s, z13.s, z29.s\n"
+ "add z14.s, z14.s, z29.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
+ "add z15.s, z15.s, z29.s\n"
+ "add z16.s, z16.s, z29.s\n"
".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
- "add z14.s, z14.s, z4.s\n"
- "add z15.s, z15.s, z4.s\n"
- "add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z4.s\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "uzp1 z12.h, z12.h, z13.h\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "uzp1 z13.h, z14.h, z15.h\n"
+ "add z17.s, z17.s, z29.s\n"
+ "add z18.s, z18.s, z29.s\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p1, [x23]\n"
- "add z18.s, z18.s, z4.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "add z19.s, z19.s, z4.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "add z20.s, z20.s, z4.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
+ "add z19.s, z19.s, z29.s\n"
+ "add z20.s, z20.s, z29.s\n"
".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
- "add z21.s, z21.s, z4.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "add z22.s, z22.s, z4.s\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x22]\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
+ "add z21.s, z21.s, z29.s\n"
+ "add z22.s, z22.s, z29.s\n"
".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "add z23.s, z23.s, z29.s\n"
+ "add z24.s, z24.s, z29.s\n"
".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "add z23.s, z23.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
- "add z25.s, z25.s, z4.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "add z26.s, z26.s, z4.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
+ "add z25.s, z25.s, z29.s\n"
+ "add z26.s, z26.s, z29.s\n"
".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z29.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z28.s\n"
+ "smin z9.s, p2/M, z9.s, z28.s\n"
+ "smin z10.s, p2/M, z10.s, z28.s\n"
+ "smin z11.s, p2/M, z11.s, z28.s\n"
+ "smin z12.s, p2/M, z12.s, z28.s\n"
+ "smin z13.s, p2/M, z13.s, z28.s\n"
+ "smin z14.s, p2/M, z14.s, z28.s\n"
+ "smin z15.s, p2/M, z15.s, z28.s\n"
+ "smin z16.s, p2/M, z16.s, z28.s\n"
+ "smin z17.s, p2/M, z17.s, z28.s\n"
+ "smin z18.s, p2/M, z18.s, z28.s\n"
+ "smin z19.s, p2/M, z19.s, z28.s\n"
+ "smin z20.s, p2/M, z20.s, z28.s\n"
+ "smin z21.s, p2/M, z21.s, z28.s\n"
+ "smin z22.s, p2/M, z22.s, z28.s\n"
+ "smin z23.s, p2/M, z23.s, z28.s\n"
+ "smin z24.s, p2/M, z24.s, z28.s\n"
+ "smin z25.s, p2/M, z25.s, z28.s\n"
+ "smin z26.s, p2/M, z26.s, z28.s\n"
+ "smin z27.s, p2/M, z27.s, z28.s\n"
+ "smax z8.s, p2/M, z8.s, z29.s\n"
+ "smax z9.s, p2/M, z9.s, z29.s\n"
+ "smax z10.s, p2/M, z10.s, z29.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "smax z11.s, p2/M, z11.s, z29.s\n"
+ "smax z12.s, p2/M, z12.s, z29.s\n"
+ "uzp1 z28.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z28.b\n"
+ "smax z13.s, p2/M, z13.s, z29.s\n"
+ "smax z14.s, p2/M, z14.s, z29.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "st1b { z8.b }, p1, [x11]\n"
+ "smax z15.s, p2/M, z15.s, z29.s\n"
+ "smax z16.s, p2/M, z16.s, z29.s\n"
+ "uzp1 z28.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z28.b\n"
+ "smax z17.s, p2/M, z17.s, z29.s\n"
+ "smax z18.s, p2/M, z18.s, z29.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z12.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z29.s\n"
+ "smax z20.s, p2/M, z20.s, z29.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z21.s, p2/M, z21.s, z29.s\n"
+ "smax z22.s, p2/M, z22.s, z29.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "st1b { z16.b }, p1, [x25]\n"
+ "smax z23.s, p2/M, z23.s, z29.s\n"
+ "smax z24.s, p2/M, z24.s, z29.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z29.s\n"
+ "smax z26.s, p2/M, z26.s, z29.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x21]\n"
- "add z27.s, z27.s, z4.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x20]\n"
+ "st1b { z20.b }, p1, [x24]\n"
+ "smax z27.s, p2/M, z27.s, z29.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "addvl x11, x11, #1\n"
"65:" // Height 5: Writeback done
"decw x10, ALL, MUL #4\n"
"cmp x10, XZR\n"
"bgt 54b\n"
"b 80f\n"
"66:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x6\n"
+ "mov x14, %x[col_bias]\n"
"ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x11, %x[col_bias]\n"
"ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x9, %x[output_ptr]\n"
"ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x20, #0x6\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"67:" // Height 6: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
"mov z8.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z9.s, #0x0\n"
- "whilelt p1.b, x19, x10\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
@@ -2041,357 +1996,339 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov z30.s, #0x0\n"
"mov z31.s, #0x0\n"
"68:" // Height 6: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"69:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 70f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x27, 71f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 71f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
"b 71f\n"
"70:" // Height 6: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"71:" // Height 6: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 73f\n"
"72:" // Height 6: Multiply loop: Main loop head
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z6.b }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x10\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z4.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1b { z1.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[0]\n"
+ "sdot z12.s, z1.b, z6.b[0]\n"
+ "sdot z16.s, z1.b, z5.b[0]\n"
+ "sdot z20.s, z1.b, z4.b[0]\n"
"add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "sdot z24.s, z1.b, z3.b[0]\n"
+ "sdot z28.s, z1.b, z2.b[0]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n"
"add x21, x21, #0x10\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x20, x20, #0x10\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
- "sdot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "sdot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z30.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
- "sdot z31.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "sdot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "sdot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z30.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
- "sdot z31.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "sdot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "sdot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z30.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
- "sdot z31.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "sdot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "sdot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z30.s, z6.b, z5.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
- "sdot z31.s, z7.b, z5.b[3]\n"
+ "sdot z9.s, z0.b, z7.b[0]\n"
+ "sdot z13.s, z0.b, z6.b[0]\n"
+ "sdot z17.s, z0.b, z5.b[0]\n"
+ "sdot z21.s, z0.b, z4.b[0]\n"
+ "sdot z25.s, z0.b, z3.b[0]\n"
+ "sdot z29.s, z0.b, z2.b[0]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z10.s, z1.b, z7.b[0]\n"
+ "sdot z14.s, z1.b, z6.b[0]\n"
+ "sdot z18.s, z1.b, z5.b[0]\n"
+ "sdot z22.s, z1.b, z4.b[0]\n"
+ "sdot z26.s, z1.b, z3.b[0]\n"
+ "sdot z30.s, z1.b, z2.b[0]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "sdot z11.s, z0.b, z7.b[0]\n"
+ "sdot z15.s, z0.b, z6.b[0]\n"
+ "sdot z19.s, z0.b, z5.b[0]\n"
+ "sdot z23.s, z0.b, z4.b[0]\n"
+ "sdot z27.s, z0.b, z3.b[0]\n"
+ "sdot z31.s, z0.b, z2.b[0]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[1]\n"
+ "sdot z12.s, z1.b, z6.b[1]\n"
+ "sdot z16.s, z1.b, z5.b[1]\n"
+ "sdot z20.s, z1.b, z4.b[1]\n"
+ "sdot z24.s, z1.b, z3.b[1]\n"
+ "sdot z28.s, z1.b, z2.b[1]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[1]\n"
+ "sdot z13.s, z0.b, z6.b[1]\n"
+ "sdot z17.s, z0.b, z5.b[1]\n"
+ "sdot z21.s, z0.b, z4.b[1]\n"
+ "sdot z25.s, z0.b, z3.b[1]\n"
+ "sdot z29.s, z0.b, z2.b[1]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "sdot z10.s, z1.b, z7.b[1]\n"
+ "sdot z14.s, z1.b, z6.b[1]\n"
+ "sdot z18.s, z1.b, z5.b[1]\n"
+ "sdot z22.s, z1.b, z4.b[1]\n"
+ "sdot z26.s, z1.b, z3.b[1]\n"
+ "sdot z30.s, z1.b, z2.b[1]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "sdot z11.s, z0.b, z7.b[1]\n"
+ "sdot z15.s, z0.b, z6.b[1]\n"
+ "sdot z19.s, z0.b, z5.b[1]\n"
+ "sdot z23.s, z0.b, z4.b[1]\n"
+ "sdot z27.s, z0.b, z3.b[1]\n"
+ "sdot z31.s, z0.b, z2.b[1]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[2]\n"
+ "sdot z12.s, z1.b, z6.b[2]\n"
+ "sdot z16.s, z1.b, z5.b[2]\n"
+ "sdot z20.s, z1.b, z4.b[2]\n"
+ "sdot z24.s, z1.b, z3.b[2]\n"
+ "sdot z28.s, z1.b, z2.b[2]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[2]\n"
+ "sdot z13.s, z0.b, z6.b[2]\n"
+ "sdot z17.s, z0.b, z5.b[2]\n"
+ "sdot z21.s, z0.b, z4.b[2]\n"
+ "sdot z25.s, z0.b, z3.b[2]\n"
+ "sdot z29.s, z0.b, z2.b[2]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "sdot z10.s, z1.b, z7.b[2]\n"
+ "sdot z14.s, z1.b, z6.b[2]\n"
+ "sdot z18.s, z1.b, z5.b[2]\n"
+ "sdot z22.s, z1.b, z4.b[2]\n"
+ "sdot z26.s, z1.b, z3.b[2]\n"
+ "sdot z30.s, z1.b, z2.b[2]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "sdot z11.s, z0.b, z7.b[2]\n"
+ "sdot z15.s, z0.b, z6.b[2]\n"
+ "sdot z19.s, z0.b, z5.b[2]\n"
+ "sdot z23.s, z0.b, z4.b[2]\n"
+ "sdot z27.s, z0.b, z3.b[2]\n"
+ "sdot z31.s, z0.b, z2.b[2]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[3]\n"
+ "sdot z12.s, z1.b, z6.b[3]\n"
+ "sdot z16.s, z1.b, z5.b[3]\n"
+ "sdot z20.s, z1.b, z4.b[3]\n"
+ "sdot z24.s, z1.b, z3.b[3]\n"
+ "sdot z28.s, z1.b, z2.b[3]\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[3]\n"
+ "sdot z13.s, z0.b, z6.b[3]\n"
+ "sdot z17.s, z0.b, z5.b[3]\n"
+ "sdot z21.s, z0.b, z4.b[3]\n"
+ "sdot z25.s, z0.b, z3.b[3]\n"
+ "sdot z29.s, z0.b, z2.b[3]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sdot z10.s, z1.b, z7.b[3]\n"
+ "sdot z14.s, z1.b, z6.b[3]\n"
+ "sdot z18.s, z1.b, z5.b[3]\n"
+ "sdot z22.s, z1.b, z4.b[3]\n"
+ "sdot z26.s, z1.b, z3.b[3]\n"
+ "sdot z30.s, z1.b, z2.b[3]\n"
+ "sdot z11.s, z0.b, z7.b[3]\n"
+ "sdot z15.s, z0.b, z6.b[3]\n"
+ "sdot z19.s, z0.b, z5.b[3]\n"
+ "sdot z23.s, z0.b, z4.b[3]\n"
+ "sdot z27.s, z0.b, z3.b[3]\n"
+ "sdot z31.s, z0.b, z2.b[3]\n"
"bgt 72b\n"
"73:" // Height 6: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "add x20, x20, #0x10\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "sdot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "sdot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z30.s, z6.b, z5.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
- "sdot z31.s, z7.b, z5.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "ld1b { z7.b }, p2/Z, [x9]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[0]\n"
+ "sdot z12.s, z7.b, z1.b[0]\n"
+ "sdot z16.s, z7.b, z2.b[0]\n"
+ "sdot z20.s, z7.b, z3.b[0]\n"
+ "sdot z24.s, z7.b, z4.b[0]\n"
+ "sdot z28.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[0]\n"
+ "sdot z13.s, z6.b, z1.b[0]\n"
+ "sdot z17.s, z6.b, z2.b[0]\n"
+ "sdot z21.s, z6.b, z3.b[0]\n"
+ "sdot z25.s, z6.b, z4.b[0]\n"
+ "sdot z29.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z7.b, z0.b[0]\n"
+ "sdot z14.s, z7.b, z1.b[0]\n"
+ "sdot z18.s, z7.b, z2.b[0]\n"
+ "sdot z22.s, z7.b, z3.b[0]\n"
+ "sdot z26.s, z7.b, z4.b[0]\n"
+ "sdot z30.s, z7.b, z5.b[0]\n"
+ "sdot z11.s, z6.b, z0.b[0]\n"
+ "sdot z15.s, z6.b, z1.b[0]\n"
+ "sdot z19.s, z6.b, z2.b[0]\n"
+ "sdot z23.s, z6.b, z3.b[0]\n"
+ "sdot z27.s, z6.b, z4.b[0]\n"
+ "sdot z31.s, z6.b, z5.b[0]\n"
"ble 74f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "sdot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "sdot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z30.s, z6.b, z5.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
- "sdot z31.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x9]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[1]\n"
+ "sdot z12.s, z7.b, z1.b[1]\n"
+ "sdot z16.s, z7.b, z2.b[1]\n"
+ "sdot z20.s, z7.b, z3.b[1]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z24.s, z7.b, z4.b[1]\n"
+ "sdot z28.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[1]\n"
+ "sdot z13.s, z6.b, z1.b[1]\n"
+ "sdot z17.s, z6.b, z2.b[1]\n"
+ "sdot z21.s, z6.b, z3.b[1]\n"
+ "sdot z25.s, z6.b, z4.b[1]\n"
+ "sdot z29.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z7.b, z0.b[1]\n"
+ "sdot z14.s, z7.b, z1.b[1]\n"
+ "sdot z18.s, z7.b, z2.b[1]\n"
+ "sdot z22.s, z7.b, z3.b[1]\n"
+ "sdot z26.s, z7.b, z4.b[1]\n"
+ "sdot z30.s, z7.b, z5.b[1]\n"
+ "sdot z11.s, z6.b, z0.b[1]\n"
+ "sdot z15.s, z6.b, z1.b[1]\n"
+ "sdot z19.s, z6.b, z2.b[1]\n"
+ "sdot z23.s, z6.b, z3.b[1]\n"
+ "sdot z27.s, z6.b, z4.b[1]\n"
+ "sdot z31.s, z6.b, z5.b[1]\n"
"ble 74f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "sdot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "sdot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z30.s, z6.b, z5.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
- "sdot z31.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x9]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[2]\n"
+ "sdot z12.s, z7.b, z1.b[2]\n"
+ "sdot z16.s, z7.b, z2.b[2]\n"
+ "sdot z20.s, z7.b, z3.b[2]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z24.s, z7.b, z4.b[2]\n"
+ "sdot z28.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[2]\n"
+ "sdot z13.s, z6.b, z1.b[2]\n"
+ "sdot z17.s, z6.b, z2.b[2]\n"
+ "sdot z21.s, z6.b, z3.b[2]\n"
+ "sdot z25.s, z6.b, z4.b[2]\n"
+ "sdot z29.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z7.b, z0.b[2]\n"
+ "sdot z14.s, z7.b, z1.b[2]\n"
+ "sdot z18.s, z7.b, z2.b[2]\n"
+ "sdot z22.s, z7.b, z3.b[2]\n"
+ "sdot z26.s, z7.b, z4.b[2]\n"
+ "sdot z30.s, z7.b, z5.b[2]\n"
+ "sdot z11.s, z6.b, z0.b[2]\n"
+ "sdot z15.s, z6.b, z1.b[2]\n"
+ "sdot z19.s, z6.b, z2.b[2]\n"
+ "sdot z23.s, z6.b, z3.b[2]\n"
+ "sdot z27.s, z6.b, z4.b[2]\n"
+ "sdot z31.s, z6.b, z5.b[2]\n"
"ble 74f\n"
- "ld1b { z6.b }, p2/Z, [x28]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "sdot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "sdot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z30.s, z6.b, z5.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
- "sdot z31.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x9]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[3]\n"
+ "sdot z12.s, z7.b, z1.b[3]\n"
+ "sdot z16.s, z7.b, z2.b[3]\n"
+ "sdot z20.s, z7.b, z3.b[3]\n"
+ "sdot z24.s, z7.b, z4.b[3]\n"
+ "sdot z28.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[3]\n"
+ "sdot z13.s, z6.b, z1.b[3]\n"
+ "sdot z17.s, z6.b, z2.b[3]\n"
+ "sdot z21.s, z6.b, z3.b[3]\n"
+ "sdot z25.s, z6.b, z4.b[3]\n"
+ "sdot z29.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z7.b, z0.b[3]\n"
+ "sdot z14.s, z7.b, z1.b[3]\n"
+ "sdot z18.s, z7.b, z2.b[3]\n"
+ "sdot z22.s, z7.b, z3.b[3]\n"
+ "sdot z26.s, z7.b, z4.b[3]\n"
+ "sdot z30.s, z7.b, z5.b[3]\n"
+ "sdot z11.s, z6.b, z0.b[3]\n"
+ "sdot z15.s, z6.b, z1.b[3]\n"
+ "sdot z19.s, z6.b, z2.b[3]\n"
+ "sdot z23.s, z6.b, z3.b[3]\n"
+ "sdot z27.s, z6.b, z4.b[3]\n"
+ "sdot z31.s, z6.b, z5.b[3]\n"
"74:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 69b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z0.s }, p2/Z, [x11]\n"
- "add z8.s, z8.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
- "add x23, x9, x19\n"
- "add z12.s, z12.s, z0.s\n"
- "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
- "add x22, x23, x19\n"
- "add z16.s, z16.s, z0.s\n"
- "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
- "add x21, x22, x19\n"
- "add z9.s, z9.s, z1.s\n"
- "add x20, x21, x19\n"
- "add z13.s, z13.s, z1.s\n"
- "add x19, x20, x19\n"
- "add z10.s, z10.s, z2.s\n"
- "addvl x11, x11, #4\n"
- "add z11.s, z11.s, z3.s\n"
- "add z14.s, z14.s, z2.s\n"
- "add z15.s, z15.s, z3.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- "add z28.s, z28.s, z0.s\n"
- "add z29.s, z29.s, z1.s\n"
- "add z30.s, z30.s, z2.s\n"
- "add z31.s, z31.s, z3.s\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x11, x20\n"
+ "add x25, x26, x20\n"
+ "ld1w { z3.s }, p2/Z, [x14]\n"
+ "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "add x24, x25, x20\n"
+ "add x23, x24, x20\n"
+ "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add x22, x23, x20\n"
+ "add z8.s, z8.s, z3.s\n"
+ "add z9.s, z9.s, z2.s\n"
+ "add z10.s, z10.s, z1.s\n"
+ "add z11.s, z11.s, z0.s\n"
+ "addvl x14, x14, #4\n"
+ "add z12.s, z12.s, z3.s\n"
+ "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z1.s\n"
+ "add z15.s, z15.s, z0.s\n"
+ "add z16.s, z16.s, z3.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z1.s\n"
+ "add z19.s, z19.s, z0.s\n"
+ "add z20.s, z20.s, z3.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z1.s\n"
+ "add z23.s, z23.s, z0.s\n"
+ "add z24.s, z24.s, z3.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z1.s\n"
+ "add z27.s, z27.s, z0.s\n"
+ "add z28.s, z28.s, z3.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z30.s, z30.s, z1.s\n"
+ "add z31.s, z31.s, z0.s\n"
"tbz %x[flags], #4, 75f\n"
"ld1w { z0.s }, p2/Z, [x12]\n"
"ld1w { z4.s }, p2/Z, [x13]\n"
@@ -2400,20 +2337,20 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
"ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
"ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
- "addvl x12, x12, #4\n"
"ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
"addvl x13, x13, #4\n"
"b 76f\n"
"75:" // Height 6: per layer parameters
- "add x24, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x24]\n"
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
"mov z1.d, z0.d\n"
- "add x24, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z4.s }, p2/Z, [x24]\n"
- "mov z2.d, z0.d\n"
- "mov z3.d, z0.d\n"
"mov z5.d, z4.d\n"
+ "mov z2.d, z0.d\n"
"mov z6.d, z4.d\n"
+ "mov z3.d, z0.d\n"
"mov z7.d, z4.d\n"
"76:" // Height 6: parameters loaded
".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
@@ -2441,228 +2378,227 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a677de // sqrdmulh z30.s, z30.s, z6.s\n"
".inst 0x04a777ff // sqrdmulh z31.s, z31.s, z7.s\n"
"tbz %x[flags], #5, 77f\n"
- "and z4.d, z8.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z9.d, z1.d\n"
- "and z6.d, z10.d, z2.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z11.d, z3.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z8.s, z8.s, z4.s\n"
+ "and z7.d, z8.d, z0.d\n"
+ "and z6.d, z9.d, z1.d\n"
+ "and z5.d, z10.d, z2.d\n"
+ "and z4.d, z11.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "and z4.d, z12.d, z0.d\n"
- "sqadd z9.s, z9.s, z5.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z10.s, z10.s, z6.s\n"
- "and z5.d, z13.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z11.s, z11.s, z7.s\n"
- "and z6.d, z14.d, z2.d\n"
"asr z6.s, z6.s, #0x1f\n"
- "sqadd z12.s, z12.s, z4.s\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z13.s, z13.s, z5.s\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z14.s, z14.s, z6.s\n"
- "and z5.d, z17.d, z1.d\n"
"asr z5.s, z5.s, #0x1f\n"
- "sqadd z15.s, z15.s, z7.s\n"
- "and z6.d, z18.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z7.d, z19.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "and z4.d, z20.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "and z5.d, z21.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "and z6.d, z22.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z20.s, z20.s, z4.s\n"
- "and z7.d, z23.d, z3.d\n"
+ "sqadd z8.s, z8.s, z7.s\n"
+ "sqadd z9.s, z9.s, z6.s\n"
+ "sqadd z10.s, z10.s, z5.s\n"
+ "sqadd z11.s, z11.s, z4.s\n"
+ "and z7.d, z12.d, z0.d\n"
+ "and z6.d, z13.d, z1.d\n"
+ "and z5.d, z14.d, z2.d\n"
+ "and z4.d, z15.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z21.s, z21.s, z5.s\n"
- "and z4.d, z24.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z22.s, z22.s, z6.s\n"
- "and z5.d, z25.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z23.s, z23.s, z7.s\n"
- "and z6.d, z26.d, z2.d\n"
"asr z6.s, z6.s, #0x1f\n"
- "sqadd z24.s, z24.s, z4.s\n"
- "and z7.d, z27.d, z3.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z7.s\n"
+ "sqadd z13.s, z13.s, z6.s\n"
+ "sqadd z14.s, z14.s, z5.s\n"
+ "sqadd z15.s, z15.s, z4.s\n"
+ "and z7.d, z16.d, z0.d\n"
+ "and z6.d, z17.d, z1.d\n"
+ "and z5.d, z18.d, z2.d\n"
+ "and z4.d, z19.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z25.s, z25.s, z5.s\n"
- "and z4.d, z28.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
- "sqadd z26.s, z26.s, z6.s\n"
- "and z5.d, z29.d, z1.d\n"
+ "sqadd z16.s, z16.s, z7.s\n"
+ "sqadd z17.s, z17.s, z6.s\n"
+ "sqadd z18.s, z18.s, z5.s\n"
+ "sqadd z19.s, z19.s, z4.s\n"
+ "and z7.d, z20.d, z0.d\n"
+ "and z6.d, z21.d, z1.d\n"
+ "and z5.d, z22.d, z2.d\n"
+ "and z4.d, z23.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "sqadd z27.s, z27.s, z7.s\n"
- "and z6.d, z30.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z7.s\n"
+ "sqadd z21.s, z21.s, z6.s\n"
+ "sqadd z22.s, z22.s, z5.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "and z7.d, z24.d, z0.d\n"
+ "and z6.d, z25.d, z1.d\n"
+ "and z5.d, z26.d, z2.d\n"
+ "and z4.d, z27.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
- "sqadd z28.s, z28.s, z4.s\n"
- "and z7.d, z31.d, z3.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z7.s\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "sqadd z26.s, z26.s, z5.s\n"
+ "sqadd z27.s, z27.s, z4.s\n"
+ "and z7.d, z28.d, z0.d\n"
+ "and z6.d, z29.d, z1.d\n"
+ "and z5.d, z30.d, z2.d\n"
+ "and z4.d, z31.d, z3.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z29.s, z29.s, z5.s\n"
- "sqadd z30.s, z30.s, z6.s\n"
- "sqadd z31.s, z31.s, z7.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z7.s\n"
+ "sqadd z29.s, z29.s, z6.s\n"
+ "sqadd z30.s, z30.s, z5.s\n"
+ "sqadd z31.s, z31.s, z4.s\n"
"77:" // Height 6: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add x24, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x24]\n"
+ "add z8.s, z8.s, z4.s\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
- "add x24, %x[qp], %[minval]\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "ld1rw { z5.s }, p2/Z, [x24]\n"
- "add x24, %x[qp], %[maxval]\n"
- ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "ld1rw { z6.s }, p2/Z, [x24]\n"
- ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
- "add z8.s, z8.s, z4.s\n"
"add z9.s, z9.s, z4.s\n"
"add z10.s, z10.s, z4.s\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
"add z11.s, z11.s, z4.s\n"
"add z12.s, z12.s, z4.s\n"
- "smin z8.s, p2/M, z8.s, z6.s\n"
- "smin z9.s, p2/M, z9.s, z6.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smin z11.s, p2/M, z11.s, z6.s\n"
- "smax z8.s, p2/M, z8.s, z5.s\n"
- "smax z9.s, p2/M, z9.s, z5.s\n"
- "smax z10.s, p2/M, z10.s, z5.s\n"
- "smax z11.s, p2/M, z11.s, z5.s\n"
- "smin z12.s, p2/M, z12.s, z6.s\n"
- "uzp1 z8.h, z8.h, z9.h\n"
".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
- "uzp1 z9.h, z10.h, z11.h\n"
- "smax z12.s, p2/M, z12.s, z5.s\n"
- "uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z8.b }, p1, [x9]\n"
- "add z13.s, z13.s, z4.s\n"
- "addvl x9, x9, #1\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ "add z13.s, z13.s, z4.s\n"
+ "add z14.s, z14.s, z4.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "smin z13.s, p2/M, z13.s, z6.s\n"
- ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
- "add z14.s, z14.s, z4.s\n"
"add z15.s, z15.s, z4.s\n"
"add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z4.s\n"
- "smax z13.s, p2/M, z13.s, z5.s\n"
- "smin z14.s, p2/M, z14.s, z6.s\n"
- "smin z15.s, p2/M, z15.s, z6.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "uzp1 z12.h, z12.h, z13.h\n"
- "smax z14.s, p2/M, z14.s, z5.s\n"
- "smax z15.s, p2/M, z15.s, z5.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
- "uzp1 z13.h, z14.h, z15.h\n"
- ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
- "uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p1, [x23]\n"
+ "add z17.s, z17.s, z4.s\n"
"add z18.s, z18.s, z4.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "add z19.s, z19.s, z4.s\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
+ "add z19.s, z19.s, z4.s\n"
"add z20.s, z20.s, z4.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
"add z21.s, z21.s, z4.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
"add z22.s, z22.s, z4.s\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x22]\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
"add z23.s, z23.s, z4.s\n"
"add z24.s, z24.s, z4.s\n"
- "add z25.s, z25.s, z4.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
+ "add z25.s, z25.s, z4.s\n"
"add z26.s, z26.s, z4.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
- "uzp1 z24.h, z24.h, z25.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x21]\n"
- "add z27.s, z27.s, z4.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
- ".inst 0x4482883d // srshl z29.s, p2/M, z29.s, z1.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "add z27.s, z27.s, z4.s\n"
"add z28.s, z28.s, z4.s\n"
- "add z29.s, z29.s, z4.s\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "smin z28.s, p2/M, z28.s, z6.s\n"
- "smin z29.s, p2/M, z29.s, z6.s\n"
+ ".inst 0x4482883d // srshl z29.s, p2/M, z29.s, z1.s\n"
".inst 0x4482885e // srshl z30.s, p2/M, z30.s, z2.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "smax z28.s, p2/M, z28.s, z5.s\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x20]\n"
+ "add z29.s, z29.s, z4.s\n"
"add z30.s, z30.s, z4.s\n"
- "smax z29.s, p2/M, z29.s, z5.s\n"
".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
- "smin z30.s, p2/M, z30.s, z6.s\n"
- "uzp1 z28.h, z28.h, z29.h\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
"add z31.s, z31.s, z4.s\n"
- "smax z30.s, p2/M, z30.s, z5.s\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smax z31.s, p2/M, z31.s, z5.s\n"
- "uzp1 z29.h, z30.h, z31.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p1, [x19]\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "smin z8.s, p2/M, z8.s, z0.s\n"
+ "smin z9.s, p2/M, z9.s, z0.s\n"
+ "smin z10.s, p2/M, z10.s, z0.s\n"
+ "smin z11.s, p2/M, z11.s, z0.s\n"
+ "smin z12.s, p2/M, z12.s, z0.s\n"
+ "smin z13.s, p2/M, z13.s, z0.s\n"
+ "smin z14.s, p2/M, z14.s, z0.s\n"
+ "smin z15.s, p2/M, z15.s, z0.s\n"
+ "smin z16.s, p2/M, z16.s, z0.s\n"
+ "smin z17.s, p2/M, z17.s, z0.s\n"
+ "smin z18.s, p2/M, z18.s, z0.s\n"
+ "smin z19.s, p2/M, z19.s, z0.s\n"
+ "smin z20.s, p2/M, z20.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z0.s\n"
+ "smin z22.s, p2/M, z22.s, z0.s\n"
+ "smin z23.s, p2/M, z23.s, z0.s\n"
+ "smin z24.s, p2/M, z24.s, z0.s\n"
+ "smin z25.s, p2/M, z25.s, z0.s\n"
+ "smin z26.s, p2/M, z26.s, z0.s\n"
+ "smin z27.s, p2/M, z27.s, z0.s\n"
+ "smin z28.s, p2/M, z28.s, z0.s\n"
+ "smin z29.s, p2/M, z29.s, z0.s\n"
+ "smin z30.s, p2/M, z30.s, z0.s\n"
+ "smin z31.s, p2/M, z31.s, z0.s\n"
+ "smax z8.s, p2/M, z8.s, z1.s\n"
+ "smax z9.s, p2/M, z9.s, z1.s\n"
+ "smax z10.s, p2/M, z10.s, z1.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "smax z11.s, p2/M, z11.s, z1.s\n"
+ "smax z12.s, p2/M, z12.s, z1.s\n"
+ "uzp1 z0.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z0.b\n"
+ "smax z13.s, p2/M, z13.s, z1.s\n"
+ "smax z14.s, p2/M, z14.s, z1.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "st1b { z8.b }, p1, [x11]\n"
+ "smax z15.s, p2/M, z15.s, z1.s\n"
+ "smax z16.s, p2/M, z16.s, z1.s\n"
+ "uzp1 z0.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z0.b\n"
+ "smax z17.s, p2/M, z17.s, z1.s\n"
+ "smax z18.s, p2/M, z18.s, z1.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z12.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z1.s\n"
+ "smax z20.s, p2/M, z20.s, z1.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z21.s, p2/M, z21.s, z1.s\n"
+ "smax z22.s, p2/M, z22.s, z1.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "st1b { z16.b }, p1, [x25]\n"
+ "smax z23.s, p2/M, z23.s, z1.s\n"
+ "smax z24.s, p2/M, z24.s, z1.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z1.s\n"
+ "smax z26.s, p2/M, z26.s, z1.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "st1b { z20.b }, p1, [x24]\n"
+ "smax z27.s, p2/M, z27.s, z1.s\n"
+ "smax z28.s, p2/M, z28.s, z1.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "smax z29.s, p2/M, z29.s, z1.s\n"
+ "smax z30.s, p2/M, z30.s, z1.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "smax z31.s, p2/M, z31.s, z1.s\n"
+ "uzp1 z16.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z16.b\n"
+ "st1b { z28.b }, p1, [x22]\n"
+ "addvl x11, x11, #1\n"
"78:" // Height 6: Writeback done
"decw x10, ALL, MUL #4\n"
"cmp x10, XZR\n"
"bgt 67b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 80f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 79f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"79:" // Update direct input
- "mov x19, #0x6\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"80:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
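
Note: the hunks above for sve_hybrid_s8qs_dot_6x4VL leave the arithmetic untouched and only reassign registers (the B panel pointer moves from x28 to x9, the output base from x9 to x11, col_bias from x11 to x14, the string-loop counters up by one so x19 is no longer used), drop the pldl1keep prefetches, and regroup the shift-correction and clamp code into batches of four registers; the clobber list and the ARM_COMPUTE_ENABLE_SVE guard change accordingly. As a reading aid, here is a minimal scalar C++ model of the per-lane requantization sequence those hunks reorder — sqrdmulh, the and/asr/sqadd fixup behind flag bit 5, the rounding shift, c_offset, clamp, and narrowing. The function and parameter names are illustrative only; just the instruction semantics are taken from the assembly.

#include <algorithm>
#include <cstdint>

int8_t requantize_lane(int32_t acc, int32_t mul, int32_t shl, // shl < 0 encodes a right shift
                       int32_t c_offset, int32_t minval, int32_t maxval)
{
    // sqrdmulh: saturating rounding doubling multiply returning the high half.
    int32_t v;
    if (acc == INT32_MIN && mul == INT32_MIN) {
        v = INT32_MAX;                                  // the only saturating case
    } else {
        v = (int32_t)(((int64_t)acc * mul + (1LL << 30)) >> 31);
    }
    // Shift-correction block (and/asr/sqadd), gated by "tbz %x[flags], #5":
    // nudge negative results down by one so the rounding shift below rounds
    // symmetrically about zero. (v & shl) is negative exactly when v is,
    // because shl is negative and so has its sign bit set.
    if ((v & shl) < 0 && v > INT32_MIN) {
        v -= 1;
    }
    // srshl with a negative shift operand acts as a rounding arithmetic shift right.
    int n = -shl;
    if (n > 0) {
        v = (int32_t)(((int64_t)v + (1LL << (n - 1))) >> n);
    }
    v += c_offset;                                      // add the output zero point
    v = std::min(std::max(v, minval), maxval);          // the smin/smax clamp
    return (int8_t)v;                                   // the uzp1 .h/.b narrowing
}
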
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
new file mode 100644
index 0000000000..b1b1135c73
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_s8qs_mmla_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8qs_mmla_6x4VL
+{
+public:
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 8> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 49.98 };
+ case CPUModel::A510:
+ return { 22.62 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_s8qs_mmla_6x4VL;
+ cls_sve_hybrid_s8qs_mmla_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
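
Note: the new header declares the MMLA variant with the same 6-row blocking as the dot kernel but an 8-way k unroll, and out_width() scales with the SVE vector length (four vectors of int32 lanes per row, hence "4VL"). A small illustration, not library code, of how a caller might derive tile counts from these blocking parameters; svcntw() (the number of 32-bit lanes per SVE vector) stands in for get_vector_length<int32_t>() here.

#include <arm_sve.h>
#include <cstddef>

size_t kernel_calls(size_t M, size_t N)
{
    const size_t out_height = 6;              // cls_sve_hybrid_s8qs_mmla_6x4VL::out_height()
    const size_t out_width  = svcntw() * 4;   // four int32 vectors per row: "4VL"
    return ((M + out_height - 1) / out_height) *  // row tiles
           ((N + out_width  - 1) / out_width);    // column tiles
}
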
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..cd5f85411c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
@@ -0,0 +1,2430 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8qs_mmla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+ struct KernelArgs {
+ const int32_t *multiplier_ptr = {};
+ const int32_t *shift_ptr = {};
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->per_channel_requant) {
+ flags |= 0x10;
+ ka.multiplier_ptr=qp->per_channel_muls + col_base;
+ ka.shift_ptr=qp->per_channel_right_shifts + col_base;
+ }
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
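
Note: the setup above packs dispatch information into the flag word that the assembly below branches on with tbz. A hedged summary of the bit assignments as this file uses them; the enumerator names are illustrative, not arm_gemm identifiers.

enum KernelFlagBits : unsigned long {
    kIndirectOutput  = 0x4,   // set when output_arg.is_indirect
    kIndirectInput   = 0x8,   // tested with "tbz %x[flags], #3" (per-string pointer tables)
    kPerChannelQuant = 0x10,  // tested with "tbz %x[flags], #4" (load multiplier_ptr/shift_ptr)
    kShiftCorrection = 0x20,  // tested with "tbz %x[flags], #5" (and/asr/sqadd rounding fixup)
};
static_assert(kPerChannelQuant == (1ul << 4), "asm tests bit 4 with tbz");
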
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 66f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 53f\n"
+ "beq 40f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 27f\n"
+ "beq 14f\n"
+ "mov x14, %x[col_bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "ble 8f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n"
+ ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n"
+ ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45119a8a // smmla z10.s, z20.b, z17.b\n"
+ ".inst 0x45109a8e // smmla z14.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n"
+ ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n"
+ "add x26, x26, #0x10\n"
+ "bgt 7b\n"
+ "8:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "addvl x9, x9, #8\n"
+ "ble 9f\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n"
+ ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n"
+ ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n"
+ ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n"
+ ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n"
+ "addvl x9, x9, #8\n"
+ "9:" // Height 1: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 4b\n"
+ "uzp1 z8.d, z8.d, z12.d\n"
+ "uzp1 z9.d, z9.d, z13.d\n"
+ "ld1w { z19.s }, p2/Z, [x14]\n"
+ "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z10.d, z10.d, z14.d\n"
+ "uzp1 z11.d, z11.d, z15.d\n"
+ "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "mov z15.d, z8.d\n"
+ "add z15.s, z15.s, z19.s\n"
+ "addvl x14, x14, #4\n"
+ "add z9.s, z9.s, z18.s\n"
+ "add z10.s, z10.s, z17.s\n"
+ "add z11.s, z11.s, z16.s\n"
+ "tbz %x[flags], #4, 10f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "addvl x13, x13, #4\n"
+ "b 11f\n"
+ "10:" // Height 1: per layer parameters
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "mov z1.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z2.d, z0.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z7.d, z4.d\n"
+ "11:" // Height 1: parameters loaded
+ ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ "tbz %x[flags], #5, 12f\n"
+ "and z19.d, z15.d, z0.d\n"
+ "and z18.d, z9.d, z1.d\n"
+ "and z17.d, z10.d, z2.d\n"
+ "and z16.d, z11.d, z3.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z19.s\n"
+ "sqadd z9.s, z9.s, z18.s\n"
+ "sqadd z10.s, z10.s, z17.s\n"
+ "sqadd z11.s, z11.s, z16.s\n"
+ "12:" // Height 1: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
+ "add z15.s, z15.s, z17.s\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "add z9.s, z9.s, z17.s\n"
+ "add z10.s, z10.s, z17.s\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "add z11.s, z11.s, z17.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z31.s }, p2/Z, [x20]\n"
+ "smin z15.s, p2/M, z15.s, z16.s\n"
+ "smin z9.s, p2/M, z9.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z16.s\n"
+ "smin z11.s, p2/M, z11.s, z16.s\n"
+ "smax z15.s, p2/M, z15.s, z31.s\n"
+ "smax z9.s, p2/M, z9.s, z31.s\n"
+ "smax z10.s, p2/M, z10.s, z31.s\n"
+ "uzp1 z15.h, z15.h, z9.h\n"
+ "smax z11.s, p2/M, z11.s, z31.s\n"
+ "uzp1 z16.h, z10.h, z11.h\n"
+ "uzp1 z15.b, z15.b, z16.b\n"
+ "st1b { z15.b }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "13:" // Height 1: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 2b\n"
+ "b 80f\n"
+ "14:" // Height 2
+ "mov x14, %x[col_bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "15:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "16:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "17:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 18f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 19f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "b 19f\n"
+ "18:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "19:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "ble 21f\n"
+ "20:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n"
+ ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n"
+ ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45119a8a // smmla z10.s, z20.b, z17.b\n"
+ ".inst 0x45109a8e // smmla z14.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n"
+ ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "bgt 20b\n"
+ "21:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "addvl x9, x9, #8\n"
+ "ble 22f\n"
+ "ld1b { z17.b }, p2/Z, [x9]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n"
+ ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n"
+ ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n"
+ ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n"
+ ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n"
+ "addvl x9, x9, #8\n"
+ "22:" // Height 2: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 17b\n"
+ "uzp1 z20.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z19.s }, p2/Z, [x14]\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add x26, x11, x20\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "addvl x14, x14, #4\n"
+ "mov z15.d, z20.d\n"
+ "add z15.s, z15.s, z19.s\n"
+ "add z12.s, z12.s, z18.s\n"
+ "add z13.s, z13.s, z17.s\n"
+ "add z14.s, z14.s, z16.s\n"
+ "add z8.s, z8.s, z19.s\n"
+ "add z9.s, z9.s, z18.s\n"
+ "add z10.s, z10.s, z17.s\n"
+ "add z11.s, z11.s, z16.s\n"
+ "tbz %x[flags], #4, 23f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "addvl x13, x13, #4\n"
+ "b 24f\n"
+ "23:" // Height 2: per layer parameters
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "mov z1.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z2.d, z0.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z7.d, z4.d\n"
+ "24:" // Height 2: parameters loaded
+ ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n"
+ ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n"
+ ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n"
+ ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n"
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ "tbz %x[flags], #5, 25f\n"
+ "and z19.d, z15.d, z0.d\n"
+ "and z18.d, z12.d, z1.d\n"
+ "and z17.d, z13.d, z2.d\n"
+ "and z16.d, z14.d, z3.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z19.s\n"
+ "sqadd z12.s, z12.s, z18.s\n"
+ "sqadd z13.s, z13.s, z17.s\n"
+ "sqadd z14.s, z14.s, z16.s\n"
+ "and z18.d, z8.d, z0.d\n"
+ "and z24.d, z9.d, z1.d\n"
+ "and z17.d, z10.d, z2.d\n"
+ "and z16.d, z11.d, z3.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z18.s\n"
+ "sqadd z9.s, z9.s, z24.s\n"
+ "sqadd z10.s, z10.s, z17.s\n"
+ "sqadd z11.s, z11.s, z16.s\n"
+ "25:" // Height 2: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
+ "add z15.s, z15.s, z17.s\n"
+ ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
+ ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "add z12.s, z12.s, z17.s\n"
+ "add z13.s, z13.s, z17.s\n"
+ ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z14.s, z14.s, z17.s\n"
+ "add z8.s, z8.s, z17.s\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "add z9.s, z9.s, z17.s\n"
+ "add z10.s, z10.s, z17.s\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "add z11.s, z11.s, z17.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z17.s }, p2/Z, [x20]\n"
+ "smin z15.s, p2/M, z15.s, z16.s\n"
+ "smin z12.s, p2/M, z12.s, z16.s\n"
+ "smin z13.s, p2/M, z13.s, z16.s\n"
+ "smin z14.s, p2/M, z14.s, z16.s\n"
+ "smin z8.s, p2/M, z8.s, z16.s\n"
+ "smin z9.s, p2/M, z9.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z16.s\n"
+ "smin z11.s, p2/M, z11.s, z16.s\n"
+ "smax z15.s, p2/M, z15.s, z17.s\n"
+ "smax z12.s, p2/M, z12.s, z17.s\n"
+ "smax z13.s, p2/M, z13.s, z17.s\n"
+ "uzp1 z15.h, z15.h, z12.h\n"
+ "smax z14.s, p2/M, z14.s, z17.s\n"
+ "smax z8.s, p2/M, z8.s, z17.s\n"
+ "uzp1 z16.h, z13.h, z14.h\n"
+ "uzp1 z15.b, z15.b, z16.b\n"
+ "smax z9.s, p2/M, z9.s, z17.s\n"
+ "smax z10.s, p2/M, z10.s, z17.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "st1b { z15.b }, p1, [x11]\n"
+ "smax z11.s, p2/M, z11.s, z17.s\n"
+ "uzp1 z16.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z16.b\n"
+ "st1b { z8.b }, p1, [x26]\n"
+ "addvl x11, x11, #1\n"
+ "26:" // Height 2: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 15b\n"
+ "b 80f\n"
+ "27:" // Height 3
+ "mov x14, %x[col_bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "28:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "29:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "30:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 32f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "b 32f\n"
+ "31:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "32:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "ble 34f\n"
+ "33:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
+ ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n"
+ ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n"
+ ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n"
+ ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n"
+ ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n"
+ ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n"
+ ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n"
+ ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n"
+ ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n"
+ ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n"
+ "bgt 33b\n"
+ "34:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "trn1 z27.d, z1.d, z24.d\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
+ ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
+ "ble 35f\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n"
+ ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n"
+ ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n"
+ ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n"
+ ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n"
+ ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n"
+ ".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n"
+ ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n"
+ ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n"
+ ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n"
+ ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n"
+ "35:" // Height 3: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 30b\n"
+ "uzp1 z28.d, z8.d, z12.d\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "ld1w { z27.s }, p2/Z, [x14]\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add x26, x11, x20\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "add x25, x26, x20\n"
+ "addvl x14, x14, #4\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "mov z23.d, z28.d\n"
+ "add z23.s, z23.s, z27.s\n"
+ "add z12.s, z12.s, z26.s\n"
+ "add z13.s, z13.s, z25.s\n"
+ "add z14.s, z14.s, z24.s\n"
+ "add z8.s, z8.s, z27.s\n"
+ "add z9.s, z9.s, z26.s\n"
+ "add z10.s, z10.s, z25.s\n"
+ "add z11.s, z11.s, z24.s\n"
+ "add z16.s, z16.s, z27.s\n"
+ "add z17.s, z17.s, z26.s\n"
+ "add z18.s, z18.s, z25.s\n"
+ "add z19.s, z19.s, z24.s\n"
+ "tbz %x[flags], #4, 36f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "addvl x13, x13, #4\n"
+ "b 37f\n"
+ "36:" // Height 3: per layer parameters
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "mov z1.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z2.d, z0.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z7.d, z4.d\n"
+ "37:" // Height 3: parameters loaded
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n"
+ ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n"
+ ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n"
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ "tbz %x[flags], #5, 38f\n"
+ "and z24.d, z23.d, z0.d\n"
+ "and z22.d, z12.d, z1.d\n"
+ "and z21.d, z13.d, z2.d\n"
+ "and z20.d, z14.d, z3.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z24.s\n"
+ "sqadd z12.s, z12.s, z22.s\n"
+ "sqadd z13.s, z13.s, z21.s\n"
+ "sqadd z14.s, z14.s, z20.s\n"
+ "and z24.d, z8.d, z0.d\n"
+ "and z22.d, z9.d, z1.d\n"
+ "and z21.d, z10.d, z2.d\n"
+ "and z20.d, z11.d, z3.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z24.s\n"
+ "sqadd z9.s, z9.s, z22.s\n"
+ "sqadd z10.s, z10.s, z21.s\n"
+ "sqadd z11.s, z11.s, z20.s\n"
+ "and z24.d, z16.d, z0.d\n"
+ "and z22.d, z17.d, z1.d\n"
+ "and z21.d, z18.d, z2.d\n"
+ "and z20.d, z19.d, z3.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z24.s\n"
+ "sqadd z17.s, z17.s, z22.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z19.s, z19.s, z20.s\n"
+ "38:" // Height 3: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add z23.s, z23.s, z21.s\n"
+ ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
+ ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "add z12.s, z12.s, z21.s\n"
+ "add z13.s, z13.s, z21.s\n"
+ ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z14.s, z14.s, z21.s\n"
+ "add z8.s, z8.s, z21.s\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "add z9.s, z9.s, z21.s\n"
+ "add z10.s, z10.s, z21.s\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z11.s, z11.s, z21.s\n"
+ "add z16.s, z16.s, z21.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "add z17.s, z17.s, z21.s\n"
+ "add z18.s, z18.s, z21.s\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z21.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ "smin z23.s, p2/M, z23.s, z20.s\n"
+ "smin z12.s, p2/M, z12.s, z20.s\n"
+ "smin z13.s, p2/M, z13.s, z20.s\n"
+ "smin z14.s, p2/M, z14.s, z20.s\n"
+ "smin z8.s, p2/M, z8.s, z20.s\n"
+ "smin z9.s, p2/M, z9.s, z20.s\n"
+ "smin z10.s, p2/M, z10.s, z20.s\n"
+ "smin z11.s, p2/M, z11.s, z20.s\n"
+ "smin z16.s, p2/M, z16.s, z20.s\n"
+ "smin z17.s, p2/M, z17.s, z20.s\n"
+ "smin z18.s, p2/M, z18.s, z20.s\n"
+ "smin z19.s, p2/M, z19.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z21.s\n"
+ "smax z12.s, p2/M, z12.s, z21.s\n"
+ "smax z13.s, p2/M, z13.s, z21.s\n"
+ "uzp1 z23.h, z23.h, z12.h\n"
+ "smax z14.s, p2/M, z14.s, z21.s\n"
+ "smax z8.s, p2/M, z8.s, z21.s\n"
+ "uzp1 z20.h, z13.h, z14.h\n"
+ "uzp1 z23.b, z23.b, z20.b\n"
+ "smax z9.s, p2/M, z9.s, z21.s\n"
+ "smax z10.s, p2/M, z10.s, z21.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "st1b { z23.b }, p1, [x11]\n"
+ "smax z11.s, p2/M, z11.s, z21.s\n"
+ "smax z16.s, p2/M, z16.s, z21.s\n"
+ "uzp1 z20.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z21.s\n"
+ "smax z18.s, p2/M, z18.s, z21.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z8.b }, p1, [x26]\n"
+ "smax z19.s, p2/M, z19.s, z21.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x25]\n"
+ "addvl x11, x11, #1\n"
+ "39:" // Height 3: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 28b\n"
+ "b 80f\n"
+ "40:" // Height 4
+ "mov x14, %x[col_bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "41:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "42:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "43:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 44f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 45f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "b 45f\n"
+ "44:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "45:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "ble 47f\n"
+ "46:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199ba8 // smmla z8.s, z29.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189bac // smmla z12.s, z29.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199ba9 // smmla z9.s, z29.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x45189bad // smmla z13.s, z29.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x45199baa // smmla z10.s, z29.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45189bae // smmla z14.s, z29.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x45199bab // smmla z11.s, z29.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x45189baf // smmla z15.s, z29.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n"
+ ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n"
+ ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n"
+ ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n"
+ ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n"
+ ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n"
+ ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n"
+ ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n"
+ ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n"
+ ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n"
+ "bgt 46b\n"
+ "47:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199b88 // smmla z8.s, z28.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b8c // smmla z12.s, z28.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199b89 // smmla z9.s, z28.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x45189b8d // smmla z13.s, z28.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x45199b8a // smmla z10.s, z28.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45189b8e // smmla z14.s, z28.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45199b8b // smmla z11.s, z28.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ ".inst 0x45189b8f // smmla z15.s, z28.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
+ "ble 48f\n"
+ "ld1b { z25.b }, p2/Z, [x9]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n"
+ ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n"
+ ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n"
+ ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n"
+ ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n"
+ ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n"
+ ".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n"
+ ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n"
+ ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n"
+ ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n"
+ ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n"
+ "48:" // Height 4: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 43b\n"
+ "uzp1 z28.d, z8.d, z12.d\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "ld1w { z27.s }, p2/Z, [x14]\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add x26, x11, x20\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "add x25, x26, x20\n"
+ "add x24, x25, x20\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "addvl x14, x14, #4\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "mov z23.d, z28.d\n"
+ "add z23.s, z23.s, z27.s\n"
+ "add z12.s, z12.s, z26.s\n"
+ "add z13.s, z13.s, z25.s\n"
+ "add z14.s, z14.s, z24.s\n"
+ "add z8.s, z8.s, z27.s\n"
+ "add z9.s, z9.s, z26.s\n"
+ "add z10.s, z10.s, z25.s\n"
+ "add z11.s, z11.s, z24.s\n"
+ "add z15.s, z15.s, z27.s\n"
+ "add z20.s, z20.s, z26.s\n"
+ "add z21.s, z21.s, z25.s\n"
+ "add z22.s, z22.s, z24.s\n"
+ "add z16.s, z16.s, z27.s\n"
+ "add z17.s, z17.s, z26.s\n"
+ "add z18.s, z18.s, z25.s\n"
+ "add z19.s, z19.s, z24.s\n"
+ "tbz %x[flags], #4, 49f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "addvl x13, x13, #4\n"
+ "b 50f\n"
+ "49:" // Height 4: per layer parameters
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "mov z1.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z2.d, z0.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z7.d, z4.d\n"
+ "50:" // Height 4: parameters loaded
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n"
+ ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n"
+ ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n"
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n"
+ ".inst 0x04a57694 // sqrdmulh z20.s, z20.s, z5.s\n"
+ ".inst 0x04a676b5 // sqrdmulh z21.s, z21.s, z6.s\n"
+ ".inst 0x04a776d6 // sqrdmulh z22.s, z22.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ "tbz %x[flags], #5, 51f\n"
+ "and z27.d, z23.d, z0.d\n"
+ "and z26.d, z12.d, z1.d\n"
+ "and z25.d, z13.d, z2.d\n"
+ "and z24.d, z14.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z27.s\n"
+ "sqadd z12.s, z12.s, z26.s\n"
+ "sqadd z13.s, z13.s, z25.s\n"
+ "sqadd z14.s, z14.s, z24.s\n"
+ "and z27.d, z8.d, z0.d\n"
+ "and z26.d, z9.d, z1.d\n"
+ "and z25.d, z10.d, z2.d\n"
+ "and z24.d, z11.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z27.s\n"
+ "sqadd z9.s, z9.s, z26.s\n"
+ "sqadd z10.s, z10.s, z25.s\n"
+ "sqadd z11.s, z11.s, z24.s\n"
+ "and z27.d, z15.d, z0.d\n"
+ "and z26.d, z20.d, z1.d\n"
+ "and z25.d, z21.d, z2.d\n"
+ "and z24.d, z22.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z27.s\n"
+ "sqadd z20.s, z20.s, z26.s\n"
+ "sqadd z21.s, z21.s, z25.s\n"
+ "sqadd z22.s, z22.s, z24.s\n"
+ "and z27.d, z16.d, z0.d\n"
+ "and z26.d, z17.d, z1.d\n"
+ "and z25.d, z18.d, z2.d\n"
+ "and z24.d, z19.d, z3.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z27.s\n"
+ "sqadd z17.s, z17.s, z26.s\n"
+ "sqadd z18.s, z18.s, z25.s\n"
+ "sqadd z19.s, z19.s, z24.s\n"
+ "51:" // Height 4: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add z23.s, z23.s, z25.s\n"
+ ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
+ ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "add z12.s, z12.s, z25.s\n"
+ "add z13.s, z13.s, z25.s\n"
+ ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z14.s, z14.s, z25.s\n"
+ "add z8.s, z8.s, z25.s\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "add z9.s, z9.s, z25.s\n"
+ "add z10.s, z10.s, z25.s\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
+ "add z11.s, z11.s, z25.s\n"
+ "add z15.s, z15.s, z25.s\n"
+ ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n"
+ ".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n"
+ "add z20.s, z20.s, z25.s\n"
+ "add z21.s, z21.s, z25.s\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z25.s\n"
+ "add z16.s, z16.s, z25.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "add z17.s, z17.s, z25.s\n"
+ "add z18.s, z18.s, z25.s\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z25.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
+ "smin z23.s, p2/M, z23.s, z24.s\n"
+ "smin z12.s, p2/M, z12.s, z24.s\n"
+ "smin z13.s, p2/M, z13.s, z24.s\n"
+ "smin z14.s, p2/M, z14.s, z24.s\n"
+ "smin z8.s, p2/M, z8.s, z24.s\n"
+ "smin z9.s, p2/M, z9.s, z24.s\n"
+ "smin z10.s, p2/M, z10.s, z24.s\n"
+ "smin z11.s, p2/M, z11.s, z24.s\n"
+ "smin z15.s, p2/M, z15.s, z24.s\n"
+ "smin z20.s, p2/M, z20.s, z24.s\n"
+ "smin z21.s, p2/M, z21.s, z24.s\n"
+ "smin z22.s, p2/M, z22.s, z24.s\n"
+ "smin z16.s, p2/M, z16.s, z24.s\n"
+ "smin z17.s, p2/M, z17.s, z24.s\n"
+ "smin z18.s, p2/M, z18.s, z24.s\n"
+ "smin z19.s, p2/M, z19.s, z24.s\n"
+ "smax z23.s, p2/M, z23.s, z25.s\n"
+ "smax z12.s, p2/M, z12.s, z25.s\n"
+ "smax z13.s, p2/M, z13.s, z25.s\n"
+ "uzp1 z23.h, z23.h, z12.h\n"
+ "smax z14.s, p2/M, z14.s, z25.s\n"
+ "smax z8.s, p2/M, z8.s, z25.s\n"
+ "uzp1 z24.h, z13.h, z14.h\n"
+ "uzp1 z23.b, z23.b, z24.b\n"
+ "smax z9.s, p2/M, z9.s, z25.s\n"
+ "smax z10.s, p2/M, z10.s, z25.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "st1b { z23.b }, p1, [x11]\n"
+ "smax z11.s, p2/M, z11.s, z25.s\n"
+ "smax z15.s, p2/M, z15.s, z25.s\n"
+ "uzp1 z23.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z23.b\n"
+ "smax z20.s, p2/M, z20.s, z25.s\n"
+ "smax z21.s, p2/M, z21.s, z25.s\n"
+ "uzp1 z15.h, z15.h, z20.h\n"
+ "st1b { z8.b }, p1, [x26]\n"
+ "smax z22.s, p2/M, z22.s, z25.s\n"
+ "smax z16.s, p2/M, z16.s, z25.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z15.b, z15.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z25.s\n"
+ "smax z18.s, p2/M, z18.s, z25.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z15.b }, p1, [x25]\n"
+ "smax z19.s, p2/M, z19.s, z25.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24]\n"
+ "addvl x11, x11, #1\n"
+ "52:" // Height 4: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 41b\n"
+ "b 80f\n"
+ "53:" // Height 5
+ "mov x14, %x[col_bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "54:" // Height 5: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "55:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "56:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 57f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 58f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "b 58f\n"
+ "57:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "58:" // Height 5: input setup done
+ "cmp x27, #0x10\n"
+ "ble 60f\n"
+ "59:" // Height 5: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z6.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z7.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "trn1 z3.d, z7.d, z2.d\n"
+ "trn2 z7.d, z7.d, z2.d\n"
+ "ld1b { z1.b }, p2/Z, [x9]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x450198a8 // smmla z8.s, z5.b, z1.b\n"
+ ".inst 0x45019870 // smmla z16.s, z3.b, z1.b\n"
+ ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x450098ac // smmla z12.s, z5.b, z0.b\n"
+ ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x450198a9 // smmla z9.s, z5.b, z1.b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45019871 // smmla z17.s, z3.b, z1.b\n"
+ ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x450098ad // smmla z13.s, z5.b, z0.b\n"
+ ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x450198aa // smmla z10.s, z5.b, z1.b\n"
+ ".inst 0x45019872 // smmla z18.s, z3.b, z1.b\n"
+ ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x450098ae // smmla z14.s, z5.b, z0.b\n"
+ ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
+ ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x450198ab // smmla z11.s, z5.b, z1.b\n"
+ ".inst 0x45019873 // smmla z19.s, z3.b, z1.b\n"
+ ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x450098af // smmla z15.s, z5.b, z0.b\n"
+ ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
+ ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
+ ".inst 0x450198f0 // smmla z16.s, z7.b, z1.b\n"
+ ".inst 0x45019898 // smmla z24.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
+ ".inst 0x450098f4 // smmla z20.s, z7.b, z0.b\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
+ ".inst 0x450198f1 // smmla z17.s, z7.b, z1.b\n"
+ ".inst 0x45019899 // smmla z25.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
+ ".inst 0x450098f5 // smmla z21.s, z7.b, z0.b\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n"
+ ".inst 0x450198f2 // smmla z18.s, z7.b, z1.b\n"
+ ".inst 0x4501989a // smmla z26.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n"
+ ".inst 0x450098f6 // smmla z22.s, z7.b, z0.b\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n"
+ ".inst 0x450198f3 // smmla z19.s, z7.b, z1.b\n"
+ ".inst 0x4501989b // smmla z27.s, z4.b, z1.b\n"
+ ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n"
+ ".inst 0x450098f7 // smmla z23.s, z7.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
+ "bgt 59b\n"
+ "60:" // Height 5: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z4.b }, p0/Z, [x25]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x22]\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1b { z2.b }, p2/Z, [x9]\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n"
+ ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n"
+ ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n"
+ ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n"
+ ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n"
+ ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n"
+ ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n"
+ ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
+ "ble 61f\n"
+ "ld1b { z2.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n"
+ ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n"
+ ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
+ ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
+ ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n"
+ ".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n"
+ ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n"
+ ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
+ ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n"
+ ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n"
+ ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n"
+ ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
+ ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n"
+ ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n"
+ ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n"
+ ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n"
+ ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
+ ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n"
+ "61:" // Height 5: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 56b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 z4.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "add x26, x11, x20\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "ld1w { z3.s }, p2/Z, [x14]\n"
+ "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "add x25, x26, x20\n"
+ "add x24, x25, x20\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "add x23, x24, x20\n"
+ "addvl x14, x14, #4\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "mov z31.d, z4.d\n"
+ "add z31.s, z31.s, z3.s\n"
+ "add z12.s, z12.s, z2.s\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z14.s, z14.s, z0.s\n"
+ "add z8.s, z8.s, z3.s\n"
+ "add z9.s, z9.s, z2.s\n"
+ "add z10.s, z10.s, z1.s\n"
+ "add z11.s, z11.s, z0.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "add z20.s, z20.s, z2.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z0.s\n"
+ "add z16.s, z16.s, z3.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z1.s\n"
+ "add z19.s, z19.s, z0.s\n"
+ "add z24.s, z24.s, z3.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z1.s\n"
+ "add z27.s, z27.s, z0.s\n"
+ "tbz %x[flags], #4, 62f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "addvl x13, x13, #4\n"
+ "b 63f\n"
+ "62:" // Height 5: per layer parameters
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "mov z1.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z2.d, z0.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z7.d, z4.d\n"
+ "63:" // Height 5: parameters loaded
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n"
+ ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n"
+ ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n"
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n"
+ ".inst 0x04a57694 // sqrdmulh z20.s, z20.s, z5.s\n"
+ ".inst 0x04a676b5 // sqrdmulh z21.s, z21.s, z6.s\n"
+ ".inst 0x04a776d6 // sqrdmulh z22.s, z22.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n"
+ ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n"
+ ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
+ "tbz %x[flags], #5, 64f\n"
+ "and z30.d, z31.d, z0.d\n"
+ "and z29.d, z12.d, z1.d\n"
+ "and z28.d, z13.d, z2.d\n"
+ "and z23.d, z14.d, z3.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z30.s\n"
+ "sqadd z12.s, z12.s, z29.s\n"
+ "sqadd z13.s, z13.s, z28.s\n"
+ "sqadd z14.s, z14.s, z23.s\n"
+ "and z30.d, z8.d, z0.d\n"
+ "and z29.d, z9.d, z1.d\n"
+ "and z28.d, z10.d, z2.d\n"
+ "and z23.d, z11.d, z3.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z30.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ "sqadd z10.s, z10.s, z28.s\n"
+ "sqadd z11.s, z11.s, z23.s\n"
+ "and z30.d, z15.d, z0.d\n"
+ "and z29.d, z20.d, z1.d\n"
+ "and z28.d, z21.d, z2.d\n"
+ "and z23.d, z22.d, z3.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z30.s\n"
+ "sqadd z20.s, z20.s, z29.s\n"
+ "sqadd z21.s, z21.s, z28.s\n"
+ "sqadd z22.s, z22.s, z23.s\n"
+ "and z30.d, z16.d, z0.d\n"
+ "and z29.d, z17.d, z1.d\n"
+ "and z28.d, z18.d, z2.d\n"
+ "and z23.d, z19.d, z3.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z30.s\n"
+ "sqadd z17.s, z17.s, z29.s\n"
+ "sqadd z18.s, z18.s, z28.s\n"
+ "sqadd z19.s, z19.s, z23.s\n"
+ "and z30.d, z24.d, z0.d\n"
+ "and z29.d, z25.d, z1.d\n"
+ "and z28.d, z26.d, z2.d\n"
+ "and z23.d, z27.d, z3.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z30.s\n"
+ "sqadd z25.s, z25.s, z29.s\n"
+ "sqadd z26.s, z26.s, z28.s\n"
+ "sqadd z27.s, z27.s, z23.s\n"
+ "64:" // Height 5: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add z31.s, z31.s, z28.s\n"
+ ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
+ ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "add z12.s, z12.s, z28.s\n"
+ "add z13.s, z13.s, z28.s\n"
+ ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z14.s, z14.s, z28.s\n"
+ "add z8.s, z8.s, z28.s\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "add z9.s, z9.s, z28.s\n"
+ "add z10.s, z10.s, z28.s\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
+ "add z11.s, z11.s, z28.s\n"
+ "add z15.s, z15.s, z28.s\n"
+ ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n"
+ ".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n"
+ "add z20.s, z20.s, z28.s\n"
+ "add z21.s, z21.s, z28.s\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z28.s\n"
+ "add z16.s, z16.s, z28.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "add z17.s, z17.s, z28.s\n"
+ "add z18.s, z18.s, z28.s\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "add z19.s, z19.s, z28.s\n"
+ "add z24.s, z24.s, z28.s\n"
+ ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
+ ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
+ "add z25.s, z25.s, z28.s\n"
+ "add z26.s, z26.s, z28.s\n"
+ ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z28.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "smin z31.s, p2/M, z31.s, z23.s\n"
+ "smin z12.s, p2/M, z12.s, z23.s\n"
+ "smin z13.s, p2/M, z13.s, z23.s\n"
+ "smin z14.s, p2/M, z14.s, z23.s\n"
+ "smin z8.s, p2/M, z8.s, z23.s\n"
+ "smin z9.s, p2/M, z9.s, z23.s\n"
+ "smin z10.s, p2/M, z10.s, z23.s\n"
+ "smin z11.s, p2/M, z11.s, z23.s\n"
+ "smin z15.s, p2/M, z15.s, z23.s\n"
+ "smin z20.s, p2/M, z20.s, z23.s\n"
+ "smin z21.s, p2/M, z21.s, z23.s\n"
+ "smin z22.s, p2/M, z22.s, z23.s\n"
+ "smin z16.s, p2/M, z16.s, z23.s\n"
+ "smin z17.s, p2/M, z17.s, z23.s\n"
+ "smin z18.s, p2/M, z18.s, z23.s\n"
+ "smin z19.s, p2/M, z19.s, z23.s\n"
+ "smin z24.s, p2/M, z24.s, z23.s\n"
+ "smin z25.s, p2/M, z25.s, z23.s\n"
+ "smin z26.s, p2/M, z26.s, z23.s\n"
+ "smin z27.s, p2/M, z27.s, z23.s\n"
+ "smax z31.s, p2/M, z31.s, z28.s\n"
+ "smax z12.s, p2/M, z12.s, z28.s\n"
+ "smax z13.s, p2/M, z13.s, z28.s\n"
+ "uzp1 z31.h, z31.h, z12.h\n"
+ "smax z14.s, p2/M, z14.s, z28.s\n"
+ "smax z8.s, p2/M, z8.s, z28.s\n"
+ "uzp1 z23.h, z13.h, z14.h\n"
+ "uzp1 z31.b, z31.b, z23.b\n"
+ "smax z9.s, p2/M, z9.s, z28.s\n"
+ "smax z10.s, p2/M, z10.s, z28.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "st1b { z31.b }, p1, [x11]\n"
+ "smax z11.s, p2/M, z11.s, z28.s\n"
+ "smax z15.s, p2/M, z15.s, z28.s\n"
+ "uzp1 z23.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z23.b\n"
+ "smax z20.s, p2/M, z20.s, z28.s\n"
+ "smax z21.s, p2/M, z21.s, z28.s\n"
+ "uzp1 z15.h, z15.h, z20.h\n"
+ "st1b { z8.b }, p1, [x26]\n"
+ "smax z22.s, p2/M, z22.s, z28.s\n"
+ "smax z16.s, p2/M, z16.s, z28.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z15.b, z15.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z28.s\n"
+ "smax z18.s, p2/M, z18.s, z28.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z15.b }, p1, [x25]\n"
+ "smax z19.s, p2/M, z19.s, z28.s\n"
+ "smax z24.s, p2/M, z24.s, z28.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z25.s, p2/M, z25.s, z28.s\n"
+ "smax z26.s, p2/M, z26.s, z28.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "st1b { z16.b }, p1, [x24]\n"
+ "smax z27.s, p2/M, z27.s, z28.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "addvl x11, x11, #1\n"
+ "65:" // Height 5: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 54b\n"
+ "b 80f\n"
+ "66:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x6\n"
+ "mov x14, %x[col_bias]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "67:" // Height 6: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x10\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "68:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "69:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 70f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 71f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
+ "b 71f\n"
+ "70:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
+ "71:" // Height 6: input setup done
+ "cmp x27, #0x10\n"
+ "ble 73f\n"
+ "72:" // Height 6: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1b { z1.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
+ ".inst 0x45019890 // smmla z16.s, z4.b, z1.b\n"
+ ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
+ ".inst 0x45009894 // smmla z20.s, z4.b, z0.b\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45019891 // smmla z17.s, z4.b, z1.b\n"
+ ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
+ ".inst 0x45009895 // smmla z21.s, z4.b, z0.b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45019892 // smmla z18.s, z4.b, z1.b\n"
+ ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n"
+ ".inst 0x45009896 // smmla z22.s, z4.b, z0.b\n"
+ ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n"
+ ".inst 0x45019893 // smmla z19.s, z4.b, z1.b\n"
+ ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n"
+ ".inst 0x45009897 // smmla z23.s, z4.b, z0.b\n"
+ ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x450198e8 // smmla z8.s, z7.b, z1.b\n"
+ ".inst 0x450198b0 // smmla z16.s, z5.b, z1.b\n"
+ ".inst 0x45019878 // smmla z24.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098b4 // smmla z20.s, z5.b, z0.b\n"
+ ".inst 0x4500987c // smmla z28.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x450198e9 // smmla z9.s, z7.b, z1.b\n"
+ ".inst 0x450198b1 // smmla z17.s, z5.b, z1.b\n"
+ ".inst 0x45019879 // smmla z25.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098b5 // smmla z21.s, z5.b, z0.b\n"
+ ".inst 0x4500987d // smmla z29.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x450198ea // smmla z10.s, z7.b, z1.b\n"
+ ".inst 0x450198b2 // smmla z18.s, z5.b, z1.b\n"
+ ".inst 0x4501987a // smmla z26.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098b6 // smmla z22.s, z5.b, z0.b\n"
+ ".inst 0x4500987e // smmla z30.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x450198eb // smmla z11.s, z7.b, z1.b\n"
+ ".inst 0x450198b3 // smmla z19.s, z5.b, z1.b\n"
+ ".inst 0x4501987b // smmla z27.s, z3.b, z1.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098b7 // smmla z23.s, z5.b, z0.b\n"
+ ".inst 0x4500987f // smmla z31.s, z3.b, z0.b\n"
+ "bgt 72b\n"
+ "73:" // Height 6: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x22]\n"
+ "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z2.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n"
+ ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n"
+ ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n"
+ ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n"
+ ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n"
+ ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n"
+ ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n"
+ ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
+ "ble 74f\n"
+ "ld1b { z2.b }, p2/Z, [x9]\n"
+ "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n"
+ ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n"
+ ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
+ ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
+ ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n"
+ ".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n"
+ ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n"
+ ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
+ ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n"
+ ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n"
+ ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n"
+ ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
+ ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n"
+ ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n"
+ ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n"
+ ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n"
+ ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
+ ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n"
+ "74:" // Height 6: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 69b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 z4.d, z8.d, z12.d\n"
+ "add x26, x11, x20\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "add x25, x26, x20\n"
+ "ld1w { z3.s }, p2/Z, [x14]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
+ "add x24, x25, x20\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "add x23, x24, x20\n"
+ "add x22, x23, x20\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "addvl x14, x14, #4\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "mov z31.d, z4.d\n"
+ "add z31.s, z31.s, z3.s\n"
+ "add z12.s, z12.s, z2.s\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z14.s, z14.s, z0.s\n"
+ "add z8.s, z8.s, z3.s\n"
+ "add z9.s, z9.s, z2.s\n"
+ "add z10.s, z10.s, z1.s\n"
+ "add z11.s, z11.s, z0.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "add z20.s, z20.s, z2.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z0.s\n"
+ "add z16.s, z16.s, z3.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z1.s\n"
+ "add z19.s, z19.s, z0.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z28.s, z28.s, z2.s\n"
+ "add z29.s, z29.s, z1.s\n"
+ "add z30.s, z30.s, z0.s\n"
+ "add z24.s, z24.s, z3.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z1.s\n"
+ "add z27.s, z27.s, z0.s\n"
+ "tbz %x[flags], #4, 75f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "addvl x13, x13, #4\n"
+ "b 76f\n"
+ "75:" // Height 6: per layer parameters
+ "add x21, %x[qp], %[per_layer_right_shift]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z0.s }, p2/Z, [x21]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
+ "mov z1.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z2.d, z0.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z7.d, z4.d\n"
+ "76:" // Height 6: parameters loaded
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n"
+ ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n"
+ ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n"
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n"
+ ".inst 0x04a57694 // sqrdmulh z20.s, z20.s, z5.s\n"
+ ".inst 0x04a676b5 // sqrdmulh z21.s, z21.s, z6.s\n"
+ ".inst 0x04a776d6 // sqrdmulh z22.s, z22.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a5779c // sqrdmulh z28.s, z28.s, z5.s\n"
+ ".inst 0x04a677bd // sqrdmulh z29.s, z29.s, z6.s\n"
+ ".inst 0x04a777de // sqrdmulh z30.s, z30.s, z7.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n"
+ ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n"
+ ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
+ "tbz %x[flags], #5, 77f\n"
+ "and z7.d, z31.d, z0.d\n"
+ "and z6.d, z12.d, z1.d\n"
+ "and z5.d, z13.d, z2.d\n"
+ "and z4.d, z14.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z7.s\n"
+ "sqadd z12.s, z12.s, z6.s\n"
+ "sqadd z13.s, z13.s, z5.s\n"
+ "sqadd z14.s, z14.s, z4.s\n"
+ "and z7.d, z8.d, z0.d\n"
+ "and z6.d, z9.d, z1.d\n"
+ "and z5.d, z10.d, z2.d\n"
+ "and z4.d, z11.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z7.s\n"
+ "sqadd z9.s, z9.s, z6.s\n"
+ "sqadd z10.s, z10.s, z5.s\n"
+ "sqadd z11.s, z11.s, z4.s\n"
+ "and z7.d, z15.d, z0.d\n"
+ "and z6.d, z20.d, z1.d\n"
+ "and z5.d, z21.d, z2.d\n"
+ "and z4.d, z22.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z7.s\n"
+ "sqadd z20.s, z20.s, z6.s\n"
+ "sqadd z21.s, z21.s, z5.s\n"
+ "sqadd z22.s, z22.s, z4.s\n"
+ "and z7.d, z16.d, z0.d\n"
+ "and z6.d, z17.d, z1.d\n"
+ "and z5.d, z18.d, z2.d\n"
+ "and z4.d, z19.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z7.s\n"
+ "sqadd z17.s, z17.s, z6.s\n"
+ "sqadd z18.s, z18.s, z5.s\n"
+ "sqadd z19.s, z19.s, z4.s\n"
+ "and z7.d, z23.d, z0.d\n"
+ "and z6.d, z28.d, z1.d\n"
+ "and z5.d, z29.d, z2.d\n"
+ "and z4.d, z30.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z7.s\n"
+ "sqadd z28.s, z28.s, z6.s\n"
+ "sqadd z29.s, z29.s, z5.s\n"
+ "sqadd z30.s, z30.s, z4.s\n"
+ "and z7.d, z24.d, z0.d\n"
+ "and z6.d, z25.d, z1.d\n"
+ "and z5.d, z26.d, z2.d\n"
+ "and z4.d, z27.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z7.s\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "sqadd z26.s, z26.s, z5.s\n"
+ "sqadd z27.s, z27.s, z4.s\n"
+ "77:" // Height 6: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x20]\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
+ ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "add z13.s, z13.s, z4.s\n"
+ ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n"
+ ".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ ".inst 0x4482883c // srshl z28.s, p2/M, z28.s, z1.s\n"
+ ".inst 0x4482885d // srshl z29.s, p2/M, z29.s, z2.s\n"
+ "add z28.s, z28.s, z4.s\n"
+ "add z29.s, z29.s, z4.s\n"
+ ".inst 0x4482887e // srshl z30.s, p2/M, z30.s, z3.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "add z30.s, z30.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
+ ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "add z26.s, z26.s, z4.s\n"
+ ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z4.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "smin z31.s, p2/M, z31.s, z0.s\n"
+ "smin z12.s, p2/M, z12.s, z0.s\n"
+ "smin z13.s, p2/M, z13.s, z0.s\n"
+ "smin z14.s, p2/M, z14.s, z0.s\n"
+ "smin z8.s, p2/M, z8.s, z0.s\n"
+ "smin z9.s, p2/M, z9.s, z0.s\n"
+ "smin z10.s, p2/M, z10.s, z0.s\n"
+ "smin z11.s, p2/M, z11.s, z0.s\n"
+ "smin z15.s, p2/M, z15.s, z0.s\n"
+ "smin z20.s, p2/M, z20.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z0.s\n"
+ "smin z22.s, p2/M, z22.s, z0.s\n"
+ "smin z16.s, p2/M, z16.s, z0.s\n"
+ "smin z17.s, p2/M, z17.s, z0.s\n"
+ "smin z18.s, p2/M, z18.s, z0.s\n"
+ "smin z19.s, p2/M, z19.s, z0.s\n"
+ "smin z23.s, p2/M, z23.s, z0.s\n"
+ "smin z28.s, p2/M, z28.s, z0.s\n"
+ "smin z29.s, p2/M, z29.s, z0.s\n"
+ "smin z30.s, p2/M, z30.s, z0.s\n"
+ "smin z24.s, p2/M, z24.s, z0.s\n"
+ "smin z25.s, p2/M, z25.s, z0.s\n"
+ "smin z26.s, p2/M, z26.s, z0.s\n"
+ "smin z27.s, p2/M, z27.s, z0.s\n"
+ "smax z31.s, p2/M, z31.s, z1.s\n"
+ "smax z12.s, p2/M, z12.s, z1.s\n"
+ "smax z13.s, p2/M, z13.s, z1.s\n"
+ "uzp1 z31.h, z31.h, z12.h\n"
+ "smax z14.s, p2/M, z14.s, z1.s\n"
+ "smax z8.s, p2/M, z8.s, z1.s\n"
+ "uzp1 z0.h, z13.h, z14.h\n"
+ "uzp1 z31.b, z31.b, z0.b\n"
+ "smax z9.s, p2/M, z9.s, z1.s\n"
+ "smax z10.s, p2/M, z10.s, z1.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "st1b { z31.b }, p1, [x11]\n"
+ "smax z11.s, p2/M, z11.s, z1.s\n"
+ "smax z15.s, p2/M, z15.s, z1.s\n"
+ "uzp1 z31.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z31.b\n"
+ "smax z20.s, p2/M, z20.s, z1.s\n"
+ "smax z21.s, p2/M, z21.s, z1.s\n"
+ "uzp1 z15.h, z15.h, z20.h\n"
+ "st1b { z8.b }, p1, [x26]\n"
+ "smax z22.s, p2/M, z22.s, z1.s\n"
+ "smax z16.s, p2/M, z16.s, z1.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z15.b, z15.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z1.s\n"
+ "smax z18.s, p2/M, z18.s, z1.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z15.b }, p1, [x25]\n"
+ "smax z19.s, p2/M, z19.s, z1.s\n"
+ "smax z23.s, p2/M, z23.s, z1.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z28.s, p2/M, z28.s, z1.s\n"
+ "smax z29.s, p2/M, z29.s, z1.s\n"
+ "uzp1 z23.h, z23.h, z28.h\n"
+ "st1b { z16.b }, p1, [x24]\n"
+ "smax z30.s, p2/M, z30.s, z1.s\n"
+ "smax z24.s, p2/M, z24.s, z1.s\n"
+ "uzp1 z16.h, z29.h, z30.h\n"
+ "uzp1 z23.b, z23.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z1.s\n"
+ "smax z26.s, p2/M, z26.s, z1.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "st1b { z23.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z1.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x22]\n"
+ "addvl x11, x11, #1\n"
+ "78:" // Height 6: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 67b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 80f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 79f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "79:" // Update direct input
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "80:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
index b2c376196f..cfa349f3aa 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,21 +10,22 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -38,11 +39,13 @@ namespace arm_gemm
{
// Actual kernel implementations
void sve_hybrid_s8s32_dot_6x4VL( ARGLIST );
+void sve_hybrid_s8s32_dot_6x4VL_a64fx( ARGLIST );
class cls_sve_hybrid_s8s32_dot_6x4VL
{
public:
- typedef int8_t operand_type;
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
typedef int32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,16 +71,54 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.56 };
+ case CPUModel::A510:
+ return { 20.92 };
+ case CPUModel::V1:
+ return { 62.24 };
+ case CPUModel::A64FX:
+ return { 94.32 };
+ }
+ }
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.60, 15.53, 0.62 };
+ case CPUModel::A510:
+ return { 22.77, 3.90, 0.47 };
+ case CPUModel::V1:
+ return { 48.09, 16.24, 0.83 };
+ case CPUModel::A64FX:
+ return { 100.19, 3.13, 0.43 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_s8s32_dot_6x4VL;
- cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *)
+ cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_hybrid_s8s32_dot_6x4VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
#undef ARGLIST
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
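
The header change above does two things: get_performance_parameters exposes per-CPU throughput estimates used by arm_gemm's kernel-selection heuristics, and the constructor now swaps the default kernel pointer for the A64FX-specific variant when the detected CPU model matches. A minimal sketch of the same dispatch pattern, with hypothetical stand-ins for arm_gemm's CPUInfo/CPUModel types (illustration only, not part of the patch):

    #include <cstdio>

    // Hypothetical stand-ins for arm_gemm's CPUInfo/CPUModel, for illustration.
    enum class CPUModel { GENERIC, A510, V1, A64FX };
    struct CPUInfo
    {
        CPUModel model;
        CPUModel get_cpu_model() const { return model; }
    };

    using kern_type = void (*)();
    void kernel_generic() { std::puts("generic SVE dot kernel"); }
    void kernel_a64fx()   { std::puts("A64FX-tuned SVE dot kernel"); }

    // Mirrors the constructor pattern added in the diff: default to the
    // generic kernel, then override for CPU models with a dedicated variant.
    struct cls_example
    {
        kern_type kernel = kernel_generic;
        explicit cls_example(const CPUInfo *ci)
        {
            switch (ci->get_cpu_model()) {
                default:
                    break;
                case CPUModel::A64FX:
                    kernel = kernel_a64fx;
                    break;
            }
        }
    };

    int main()
    {
        CPUInfo ci{CPUModel::A64FX};
        cls_example k(&ci);
        k.kernel(); // dispatches to the A64FX variant
    }

Keeping the generic kernel as the member initializer means unknown CPU models fall through the switch's default case and still get a working implementation.
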
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
new file mode 100644
index 0000000000..1a483210f3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
@@ -0,0 +1,1032 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8s32_dot_6x4VL_a64fx (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+ const int32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+ "ptrue p4.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 51f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 41f\n"
+ "beq 31f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 21f\n"
+ "beq 11f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "tbz %x[flags], #0, 3f\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "b 4f\n"
+ "3:" // Height 1: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add x26, x26, #0x4\n"
+ "sdot z10.s, z17.b, z0.b\n"
+ "sdot z11.s, z16.b, z0.b\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 8b\n"
+ "9:" // Height 1: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "sdot z10.s, z17.b, z0.b\n"
+ "sdot z11.s, z16.b, z0.b\n"
+ "addvl x10, x10, #4\n"
+ "bne 5b\n"
+ "st1w { z8.s }, p3, [x9]\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "10:" // Height 1: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 2b\n"
+ "b 62f\n"
+ "11:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "12:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "tbz %x[flags], #0, 13f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x20]\n"
+ "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 14f\n"
+ "13:" // Height 2: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "14:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "15:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 17f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "17:" // Height 2: input setup done
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 19f\n"
+ "18:" // Height 2: Multiply loop: Main loop
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x26, x26, #0x4\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "sdot z10.s, z17.b, z0.b\n"
+ "sdot z14.s, z17.b, z1.b\n"
+ "sdot z11.s, z16.b, z0.b\n"
+ "sdot z15.s, z16.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 18b\n"
+ "19:" // Height 2: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "sdot z10.s, z17.b, z0.b\n"
+ "sdot z14.s, z17.b, z1.b\n"
+ "addvl x10, x10, #4\n"
+ "sdot z11.s, z16.b, z0.b\n"
+ "sdot z15.s, z16.b, z1.b\n"
+ "bne 15b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x20]\n"
+ "st1w { z13.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x20, #3, MUL VL]\n"
+ "20:" // Height 2: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 12b\n"
+ "b 62f\n"
+ "21:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "22:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "tbz %x[flags], #0, 23f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x21]\n"
+ "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x20]\n"
+ "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 24f\n"
+ "23:" // Height 3: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "24:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "25:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 26f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 27f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "b 27f\n"
+ "26:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "27:" // Height 3: input setup done
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 29f\n"
+ "28:" // Height 3: Multiply loop: Main loop
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x4\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add x24, x24, #0x4\n"
+ "sdot z10.s, z21.b, z0.b\n"
+ "sdot z14.s, z21.b, z1.b\n"
+ "sdot z18.s, z21.b, z2.b\n"
+ "sdot z11.s, z20.b, z0.b\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "sdot z15.s, z20.b, z1.b\n"
+ "sdot z19.s, z20.b, z2.b\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 28b\n"
+ "29:" // Height 3: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x28, x28, #0x1\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z21.b, z0.b\n"
+ "sdot z14.s, z21.b, z1.b\n"
+ "sdot z18.s, z21.b, z2.b\n"
+ "sdot z11.s, z20.b, z0.b\n"
+ "sdot z15.s, z20.b, z1.b\n"
+ "sdot z19.s, z20.b, z2.b\n"
+ "bne 25b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x21]\n"
+ "st1w { z13.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x20]\n"
+ "st1w { z17.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x20, #3, MUL VL]\n"
+ "30:" // Height 3: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 22b\n"
+ "b 62f\n"
+ "31:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "32:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "tbz %x[flags], #0, 33f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x22]\n"
+ "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x21]\n"
+ "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x20]\n"
+ "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 34f\n"
+ "33:" // Height 4: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "34:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "35:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 37f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "b 37f\n"
+ "36:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "37:" // Height 4: input setup done
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 39f\n"
+ "38:" // Height 4: Multiply loop: Main loop
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x4\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "sdot z21.s, z7.b, z3.b\n"
+ "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z25.b, z0.b\n"
+ "sdot z14.s, z25.b, z1.b\n"
+ "sdot z18.s, z25.b, z2.b\n"
+ "sdot z22.s, z25.b, z3.b\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "sdot z11.s, z24.b, z0.b\n"
+ "sdot z15.s, z24.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "sdot z19.s, z24.b, z2.b\n"
+ "sdot z23.s, z24.b, z3.b\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 38b\n"
+ "39:" // Height 4: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x28, x28, #0x1\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "sdot z21.s, z7.b, z3.b\n"
+ "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z25.b, z0.b\n"
+ "sdot z14.s, z25.b, z1.b\n"
+ "sdot z18.s, z25.b, z2.b\n"
+ "sdot z22.s, z25.b, z3.b\n"
+ "sdot z11.s, z24.b, z0.b\n"
+ "sdot z15.s, z24.b, z1.b\n"
+ "sdot z19.s, z24.b, z2.b\n"
+ "sdot z23.s, z24.b, z3.b\n"
+ "bne 35b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x22]\n"
+ "st1w { z13.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x21]\n"
+ "st1w { z17.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x20]\n"
+ "st1w { z21.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x20, #3, MUL VL]\n"
+ "40:" // Height 4: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 32b\n"
+ "b 62f\n"
+ "41:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "42:" // Height 5: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 44f\n"
+ "43:" // Height 5: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "44:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "45:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "b 47f\n"
+ "46:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "47:" // Height 5: input setup done
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 49f\n"
+ "48:" // Height 5: Multiply loop: Main loop
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "add x25, x25, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "sdot z24.s, z6.b, z4.b\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x23, x23, #0x4\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "add x22, x22, #0x4\n"
+ "sdot z21.s, z7.b, z3.b\n"
+ "sdot z25.s, z7.b, z4.b\n"
+ "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z29.b, z0.b\n"
+ "sdot z14.s, z29.b, z1.b\n"
+ "sdot z18.s, z29.b, z2.b\n"
+ "sdot z22.s, z29.b, z3.b\n"
+ "sdot z26.s, z29.b, z4.b\n"
+ "sdot z11.s, z28.b, z0.b\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "sdot z15.s, z28.b, z1.b\n"
+ "sdot z19.s, z28.b, z2.b\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "sdot z23.s, z28.b, z3.b\n"
+ "sdot z27.s, z28.b, z4.b\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 48b\n"
+ "49:" // Height 5: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x28, x28, #0x1\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "cmp x28, x20\n"
+ "sdot z24.s, z6.b, z4.b\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "sdot z21.s, z7.b, z3.b\n"
+ "sdot z25.s, z7.b, z4.b\n"
+ "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z29.b, z0.b\n"
+ "sdot z14.s, z29.b, z1.b\n"
+ "sdot z18.s, z29.b, z2.b\n"
+ "sdot z22.s, z29.b, z3.b\n"
+ "sdot z26.s, z29.b, z4.b\n"
+ "sdot z11.s, z28.b, z0.b\n"
+ "sdot z15.s, z28.b, z1.b\n"
+ "sdot z19.s, z28.b, z2.b\n"
+ "sdot z23.s, z28.b, z3.b\n"
+ "sdot z27.s, z28.b, z4.b\n"
+ "bne 45b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22]\n"
+ "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x21]\n"
+ "st1w { z21.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x20]\n"
+ "st1w { z25.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x20, #3, MUL VL]\n"
+ "50:" // Height 5: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 42b\n"
+ "b 62f\n"
+ "51:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "52:" // Height 6: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "tbz %x[flags], #0, 53f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x24]\n"
+ "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x22]\n"
+ "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x21]\n"
+ "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p3/Z, [x20]\n"
+ "ld1w { z29.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 54f\n"
+ "53:" // Height 6: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "54:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "55:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 56f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 57f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
+ "b 57f\n"
+ "56:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
+ "57:" // Height 6: input setup done
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1rw { z5.s }, p4/Z, [x21]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 59f\n"
+ "58:" // Height 6: Multiply loop: Main loop
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "add x25, x25, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "sdot z24.s, z6.b, z4.b\n"
+ "sdot z28.s, z6.b, z5.b\n"
+ "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x23, x23, #0x4\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "add x22, x22, #0x4\n"
+ "add x21, x21, #0x4\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "sdot z21.s, z7.b, z3.b\n"
+ "sdot z25.s, z7.b, z4.b\n"
+ "sdot z29.s, z7.b, z5.b\n"
+ "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "sdot z18.s, z6.b, z2.b\n"
+ "sdot z22.s, z6.b, z3.b\n"
+ "sdot z26.s, z6.b, z4.b\n"
+ "sdot z30.s, z6.b, z5.b\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "sdot z15.s, z7.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "sdot z19.s, z7.b, z2.b\n"
+ "sdot z23.s, z7.b, z3.b\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "sdot z27.s, z7.b, z4.b\n"
+ "sdot z31.s, z7.b, z5.b\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1rw { z5.s }, p4/Z, [x21]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 58b\n"
+ "59:" // Height 6: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x28, x28, #0x1\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "cmp x28, x20\n"
+ "sdot z24.s, z6.b, z4.b\n"
+ "sdot z28.s, z6.b, z5.b\n"
+ "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "sdot z21.s, z7.b, z3.b\n"
+ "sdot z25.s, z7.b, z4.b\n"
+ "sdot z29.s, z7.b, z5.b\n"
+ "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "sdot z18.s, z6.b, z2.b\n"
+ "sdot z22.s, z6.b, z3.b\n"
+ "sdot z26.s, z6.b, z4.b\n"
+ "sdot z30.s, z6.b, z5.b\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "sdot z15.s, z7.b, z1.b\n"
+ "sdot z19.s, z7.b, z2.b\n"
+ "sdot z23.s, z7.b, z3.b\n"
+ "sdot z27.s, z7.b, z4.b\n"
+ "sdot z31.s, z7.b, z5.b\n"
+ "bne 55b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x24]\n"
+ "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x23]\n"
+ "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x22]\n"
+ "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x21]\n"
+ "st1w { z25.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z28.s }, p3, [x20]\n"
+ "st1w { z29.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z30.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z31.s }, p0, [x20, #3, MUL VL]\n"
+ "60:" // Height 6: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 52b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 62f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "61:" // Update direct input
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "62:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
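
Like the other hybrid kernels in this patch, the C++ prologue of the new a64fx.cpp packs its booleans into the flags word the asm probes with tbz: bit 0 (0x1) selects accumulation into existing output ("tbz %x[flags], #0, ..." picks between loading and zeroing the accumulators), bit 2 (0x4) marks an indirect output argument, and bit 3 (0x8) an indirect input argument ("tbz %x[flags], #3, ..." chooses between the pointer-array and strided input paths). A small stand-alone sketch of that encoding, using hypothetical constant names where the patch writes the literals inline:

    #include <cassert>

    // Flag bits as set in the kernel wrapper above; named here for clarity,
    // the patch itself uses the raw literals 0x1, 0x4 and 0x8.
    constexpr unsigned long FLAG_ACCUMULATE      = 0x1; // bit 0: add into existing output
    constexpr unsigned long FLAG_INDIRECT_OUTPUT = 0x4; // bit 2: output_arg.is_indirect
    constexpr unsigned long FLAG_INDIRECT_INPUT  = 0x8; // bit 3: A_arg.is_indirect

    unsigned long encode_flags(bool accumulate, bool indirect_out, bool indirect_in)
    {
        unsigned long flags = 0;
        if (accumulate)   flags |= FLAG_ACCUMULATE;
        if (indirect_out) flags |= FLAG_INDIRECT_OUTPUT;
        if (indirect_in)  flags |= FLAG_INDIRECT_INPUT;
        return flags;
    }

    int main()
    {
        // accumulate + indirect input, direct output
        assert(encode_flags(true, false, true) == 0x9);
    }
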
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
index 8862b3665a..eeef192b66 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,18 +10,18 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -87,23 +87,23 @@ void sve_hybrid_s8s32_dot_6x4VL (
"cmp %x[M], #0x2\n"
"bgt 23f\n"
"beq 12f\n"
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x10\n"
- "incw x19\n"
- "whilelt p3.s, x19, x10\n"
- "incw x19\n"
- "whilelt p2.s, x19, x10\n"
- "incw x19\n"
- "whilelt p1.s, x19, x10\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 3f\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
"b 4f\n"
"3:" // Height 1: no accumulate
"mov z8.s, #0x0\n"
@@ -111,151 +111,148 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"4:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 7f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
"b 7f\n"
"6:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x26, %x[input_ptr]\n"
"7:" // Height 1: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 9f\n"
"8:" // Height 1: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "cmp x26, #0x10\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1b { z16.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z11.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "sdot z8.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "sdot z11.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[2]\n"
+ "sdot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[2]\n"
+ "sdot z11.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[3]\n"
+ "sdot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ "sdot z10.s, z17.b, z0.b[3]\n"
+ "sdot z11.s, z16.b, z0.b[3]\n"
+ "add x26, x26, #0x10\n"
"bgt 8b\n"
"9:" // Height 1: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1b { z16.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[0]\n"
+ "sdot z11.s, z16.b, z0.b[0]\n"
+ "addvl x10, x10, #4\n"
"ble 10f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "addvl x9, x9, #4\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[1]\n"
+ "sdot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z10.s, z17.b, z0.b[1]\n"
+ "sdot z11.s, z16.b, z0.b[1]\n"
+ "addvl x10, x10, #4\n"
"ble 10f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "addvl x9, x9, #4\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[2]\n"
+ "sdot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z10.s, z17.b, z0.b[2]\n"
+ "sdot z11.s, z16.b, z0.b[2]\n"
+ "addvl x10, x10, #4\n"
"ble 10f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[3]\n"
+ "sdot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[3]\n"
+ "sdot z11.s, z16.b, z0.b[3]\n"
+ "addvl x10, x10, #4\n"
"10:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 5b\n"
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"11:" // Height 1: Writeback done
- "decw x10, ALL, MUL #4\n"
- "cmp x10, XZR\n"
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
"bgt 2b\n"
"b 68f\n"
"12:" // Height 2
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"13:" // Height 2: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x10\n"
- "incw x19\n"
- "whilelt p3.s, x19, x10\n"
- "incw x19\n"
- "whilelt p2.s, x19, x10\n"
- "incw x19\n"
- "whilelt p1.s, x19, x10\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 14f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x23]\n"
- "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 15f\n"
"14:" // Height 2: no accumulate
"mov z8.s, #0x0\n"
@@ -267,203 +264,197 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"15:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"16:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x27, 18f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 18f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
"b 18f\n"
"17:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
"18:" // Height 2: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 20f\n"
"19:" // Height 2: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
"ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "sub x27, x27, #0x10\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[0]\n"
+ "sdot z12.s, z17.b, z0.b[0]\n"
+ "sdot z9.s, z16.b, z1.b[0]\n"
+ "sdot z13.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z1.b[0]\n"
+ "sdot z14.s, z17.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "cmp x27, #0x10\n"
+ "sdot z11.s, z16.b, z1.b[0]\n"
+ "sdot z15.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z8.s, z17.b, z1.b[1]\n"
+ "sdot z12.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z9.s, z16.b, z1.b[1]\n"
+ "sdot z13.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "sdot z10.s, z17.b, z1.b[1]\n"
+ "sdot z14.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "sdot z11.s, z16.b, z1.b[1]\n"
+ "sdot z15.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[2]\n"
+ "sdot z12.s, z17.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "sdot z9.s, z16.b, z1.b[2]\n"
+ "sdot z13.s, z16.b, z0.b[2]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "sdot z10.s, z17.b, z1.b[2]\n"
+ "sdot z14.s, z17.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "sdot z11.s, z16.b, z1.b[2]\n"
+ "sdot z15.s, z16.b, z0.b[2]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "sdot z8.s, z17.b, z1.b[3]\n"
+ "sdot z12.s, z17.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "sdot z9.s, z16.b, z1.b[3]\n"
+ "sdot z13.s, z16.b, z0.b[3]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sdot z10.s, z17.b, z1.b[3]\n"
+ "sdot z14.s, z17.b, z0.b[3]\n"
+ "sdot z11.s, z16.b, z1.b[3]\n"
+ "sdot z15.s, z16.b, z0.b[3]\n"
"bgt 19b\n"
"20:" // Height 2: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[0]\n"
+ "sdot z12.s, z17.b, z1.b[0]\n"
+ "sdot z9.s, z16.b, z0.b[0]\n"
+ "sdot z13.s, z16.b, z1.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[0]\n"
+ "sdot z14.s, z17.b, z1.b[0]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z11.s, z16.b, z0.b[0]\n"
+ "sdot z15.s, z16.b, z1.b[0]\n"
"ble 21f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[1]\n"
+ "sdot z12.s, z17.b, z1.b[1]\n"
+ "sdot z9.s, z16.b, z0.b[1]\n"
+ "sdot z13.s, z16.b, z1.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z10.s, z17.b, z0.b[1]\n"
+ "sdot z14.s, z17.b, z1.b[1]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z11.s, z16.b, z0.b[1]\n"
+ "sdot z15.s, z16.b, z1.b[1]\n"
"ble 21f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[2]\n"
+ "sdot z12.s, z17.b, z1.b[2]\n"
+ "sdot z9.s, z16.b, z0.b[2]\n"
+ "sdot z13.s, z16.b, z1.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z10.s, z17.b, z0.b[2]\n"
+ "sdot z14.s, z17.b, z1.b[2]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z11.s, z16.b, z0.b[2]\n"
+ "sdot z15.s, z16.b, z1.b[2]\n"
"ble 21f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[3]\n"
+ "sdot z12.s, z17.b, z1.b[3]\n"
+ "sdot z9.s, z16.b, z0.b[3]\n"
+ "sdot z13.s, z16.b, z1.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z17.b, z0.b[3]\n"
+ "sdot z14.s, z17.b, z1.b[3]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z11.s, z16.b, z0.b[3]\n"
+ "sdot z15.s, z16.b, z1.b[3]\n"
"21:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 16b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x23]\n"
- "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x20]\n"
+ "st1w { z13.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x20, #3, MUL VL]\n"
"22:" // Height 2: Writeback done
- "decw x10, ALL, MUL #4\n"
- "cmp x10, XZR\n"
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
"bgt 13b\n"
"b 68f\n"
"23:" // Height 3
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"24:" // Height 3: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x10\n"
- "incw x19\n"
- "whilelt p3.s, x19, x10\n"
- "incw x19\n"
- "whilelt p2.s, x19, x10\n"
- "incw x19\n"
- "whilelt p1.s, x19, x10\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 25f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x23]\n"
- "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x22]\n"
- "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20]\n"
+ "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 26f\n"
"25:" // Height 3: no accumulate
"mov z8.s, #0x0\n"
@@ -479,254 +470,245 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"26:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"27:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 28f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "cbnz x27, 29f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 29f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
"b 29f\n"
"28:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"29:" // Height 3: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 31f\n"
"30:" // Height 3: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z21.b, z2.b[0]\n"
+ "sdot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z16.s, z21.b, z0.b[0]\n"
+ "sdot z9.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[0]\n"
+ "sdot z17.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "cmp x27, #0x10\n"
+ "sdot z10.s, z21.b, z2.b[0]\n"
+ "sdot z14.s, z21.b, z1.b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "sdot z18.s, z21.b, z0.b[0]\n"
+ "sdot z11.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z15.s, z20.b, z1.b[0]\n"
+ "sdot z19.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sdot z8.s, z21.b, z2.b[1]\n"
+ "sdot z12.s, z21.b, z1.b[1]\n"
+ "sdot z16.s, z21.b, z0.b[1]\n"
+ "sdot z9.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[1]\n"
+ "sdot z17.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "sdot z10.s, z21.b, z2.b[1]\n"
+ "sdot z14.s, z21.b, z1.b[1]\n"
+ "sdot z18.s, z21.b, z0.b[1]\n"
+ "sdot z11.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "sdot z15.s, z20.b, z1.b[1]\n"
+ "sdot z19.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "sdot z8.s, z21.b, z2.b[2]\n"
+ "sdot z12.s, z21.b, z1.b[2]\n"
+ "sdot z16.s, z21.b, z0.b[2]\n"
+ "sdot z9.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[2]\n"
+ "sdot z17.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "sdot z10.s, z21.b, z2.b[2]\n"
+ "sdot z14.s, z21.b, z1.b[2]\n"
+ "sdot z18.s, z21.b, z0.b[2]\n"
+ "sdot z11.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "sdot z15.s, z20.b, z1.b[2]\n"
+ "sdot z19.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "sdot z8.s, z21.b, z2.b[3]\n"
+ "sdot z12.s, z21.b, z1.b[3]\n"
+ "sdot z16.s, z21.b, z0.b[3]\n"
+ "sdot z9.s, z20.b, z2.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[3]\n"
+ "sdot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sdot z10.s, z21.b, z2.b[3]\n"
+ "sdot z14.s, z21.b, z1.b[3]\n"
+ "sdot z18.s, z21.b, z0.b[3]\n"
+ "sdot z11.s, z20.b, z2.b[3]\n"
+ "sdot z15.s, z20.b, z1.b[3]\n"
+ "sdot z19.s, z20.b, z0.b[3]\n"
"bgt 30b\n"
"31:" // Height 3: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "add x23, x23, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z21.b, z0.b[0]\n"
+ "sdot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z16.s, z21.b, z2.b[0]\n"
+ "sdot z9.s, z20.b, z0.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[0]\n"
+ "sdot z17.s, z20.b, z2.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z21.b, z0.b[0]\n"
+ "sdot z14.s, z21.b, z1.b[0]\n"
+ "sdot z18.s, z21.b, z2.b[0]\n"
+ "sdot z11.s, z20.b, z0.b[0]\n"
+ "sdot z15.s, z20.b, z1.b[0]\n"
+ "sdot z19.s, z20.b, z2.b[0]\n"
"ble 32f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z21.b, z0.b[1]\n"
+ "sdot z12.s, z21.b, z1.b[1]\n"
+ "sdot z16.s, z21.b, z2.b[1]\n"
+ "sdot z9.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z13.s, z20.b, z1.b[1]\n"
+ "sdot z17.s, z20.b, z2.b[1]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z21.b, z0.b[1]\n"
+ "sdot z14.s, z21.b, z1.b[1]\n"
+ "sdot z18.s, z21.b, z2.b[1]\n"
+ "sdot z11.s, z20.b, z0.b[1]\n"
+ "sdot z15.s, z20.b, z1.b[1]\n"
+ "sdot z19.s, z20.b, z2.b[1]\n"
"ble 32f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z21.b, z0.b[2]\n"
+ "sdot z12.s, z21.b, z1.b[2]\n"
+ "sdot z16.s, z21.b, z2.b[2]\n"
+ "sdot z9.s, z20.b, z0.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z13.s, z20.b, z1.b[2]\n"
+ "sdot z17.s, z20.b, z2.b[2]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z21.b, z0.b[2]\n"
+ "sdot z14.s, z21.b, z1.b[2]\n"
+ "sdot z18.s, z21.b, z2.b[2]\n"
+ "sdot z11.s, z20.b, z0.b[2]\n"
+ "sdot z15.s, z20.b, z1.b[2]\n"
+ "sdot z19.s, z20.b, z2.b[2]\n"
"ble 32f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z21.b, z0.b[3]\n"
+ "sdot z12.s, z21.b, z1.b[3]\n"
+ "sdot z16.s, z21.b, z2.b[3]\n"
+ "sdot z9.s, z20.b, z0.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z20.b, z1.b[3]\n"
+ "sdot z17.s, z20.b, z2.b[3]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z21.b, z0.b[3]\n"
+ "sdot z14.s, z21.b, z1.b[3]\n"
+ "sdot z18.s, z21.b, z2.b[3]\n"
+ "sdot z11.s, z20.b, z0.b[3]\n"
+ "sdot z15.s, z20.b, z1.b[3]\n"
+ "sdot z19.s, z20.b, z2.b[3]\n"
"32:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 27b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x23]\n"
- "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x22]\n"
- "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x21]\n"
+ "st1w { z13.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x20]\n"
+ "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
"33:" // Height 3: Writeback done
- "decw x10, ALL, MUL #4\n"
- "cmp x10, XZR\n"
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
"bgt 24b\n"
"b 68f\n"
"34:" // Height 4
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"35:" // Height 4: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x10\n"
- "incw x19\n"
- "whilelt p3.s, x19, x10\n"
- "incw x19\n"
- "whilelt p2.s, x19, x10\n"
- "incw x19\n"
- "whilelt p1.s, x19, x10\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 36f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "add x21, x22, x19, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x23]\n"
- "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x22]\n"
- "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x21]\n"
- "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21]\n"
+ "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 37f\n"
"36:" // Height 4: no accumulate
"mov z8.s, #0x0\n"
@@ -746,290 +728,278 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"37:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"38:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 39f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "cbnz x27, 40f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 40f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
"b 40f\n"
"39:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"40:" // Height 4: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 42f\n"
"41:" // Height 4: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z3.b }, p0/Z, [x26]\n"
+ "ld1rqb { z2.b }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[0]\n"
+ "sdot z12.s, z25.b, z2.b[0]\n"
+ "sdot z16.s, z25.b, z1.b[0]\n"
+ "sdot z20.s, z25.b, z0.b[0]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "sdot z9.s, z24.b, z3.b[0]\n"
+ "sdot z13.s, z24.b, z2.b[0]\n"
"add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
"add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z17.s, z24.b, z1.b[0]\n"
+ "sdot z21.s, z24.b, z0.b[0]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z25.b, z3.b[0]\n"
+ "sdot z14.s, z25.b, z2.b[0]\n"
+ "sdot z18.s, z25.b, z1.b[0]\n"
+ "sdot z22.s, z25.b, z0.b[0]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "sdot z11.s, z24.b, z3.b[0]\n"
+ "sdot z15.s, z24.b, z2.b[0]\n"
+ "sdot z19.s, z24.b, z1.b[0]\n"
+ "sdot z23.s, z24.b, z0.b[0]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[1]\n"
+ "sdot z12.s, z25.b, z2.b[1]\n"
+ "sdot z16.s, z25.b, z1.b[1]\n"
+ "sdot z20.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "sdot z9.s, z24.b, z3.b[1]\n"
+ "sdot z13.s, z24.b, z2.b[1]\n"
+ "sdot z17.s, z24.b, z1.b[1]\n"
+ "sdot z21.s, z24.b, z0.b[1]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "sdot z10.s, z25.b, z3.b[1]\n"
+ "sdot z14.s, z25.b, z2.b[1]\n"
+ "sdot z18.s, z25.b, z1.b[1]\n"
+ "sdot z22.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "sdot z11.s, z24.b, z3.b[1]\n"
+ "sdot z15.s, z24.b, z2.b[1]\n"
+ "sdot z19.s, z24.b, z1.b[1]\n"
+ "sdot z23.s, z24.b, z0.b[1]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[2]\n"
+ "sdot z12.s, z25.b, z2.b[2]\n"
+ "sdot z16.s, z25.b, z1.b[2]\n"
+ "sdot z20.s, z25.b, z0.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "sdot z9.s, z24.b, z3.b[2]\n"
+ "sdot z13.s, z24.b, z2.b[2]\n"
+ "sdot z17.s, z24.b, z1.b[2]\n"
+ "sdot z21.s, z24.b, z0.b[2]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "sdot z10.s, z25.b, z3.b[2]\n"
+ "sdot z14.s, z25.b, z2.b[2]\n"
+ "sdot z18.s, z25.b, z1.b[2]\n"
+ "sdot z22.s, z25.b, z0.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "sdot z11.s, z24.b, z3.b[2]\n"
+ "sdot z15.s, z24.b, z2.b[2]\n"
+ "sdot z19.s, z24.b, z1.b[2]\n"
+ "sdot z23.s, z24.b, z0.b[2]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "sdot z8.s, z25.b, z3.b[3]\n"
+ "sdot z12.s, z25.b, z2.b[3]\n"
+ "sdot z16.s, z25.b, z1.b[3]\n"
+ "sdot z20.s, z25.b, z0.b[3]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "sdot z9.s, z24.b, z3.b[3]\n"
+ "sdot z13.s, z24.b, z2.b[3]\n"
+ "sdot z17.s, z24.b, z1.b[3]\n"
+ "sdot z21.s, z24.b, z0.b[3]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sdot z10.s, z25.b, z3.b[3]\n"
+ "sdot z14.s, z25.b, z2.b[3]\n"
+ "sdot z18.s, z25.b, z1.b[3]\n"
+ "sdot z22.s, z25.b, z0.b[3]\n"
+ "sdot z11.s, z24.b, z3.b[3]\n"
+ "sdot z15.s, z24.b, z2.b[3]\n"
+ "sdot z19.s, z24.b, z1.b[3]\n"
+ "sdot z23.s, z24.b, z0.b[3]\n"
"bgt 41b\n"
"42:" // Height 4: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "add x22, x22, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[0]\n"
+ "sdot z12.s, z25.b, z1.b[0]\n"
+ "sdot z16.s, z25.b, z2.b[0]\n"
+ "sdot z20.s, z25.b, z3.b[0]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[0]\n"
+ "sdot z13.s, z24.b, z1.b[0]\n"
+ "sdot z17.s, z24.b, z2.b[0]\n"
+ "sdot z21.s, z24.b, z3.b[0]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z25.b, z0.b[0]\n"
+ "sdot z14.s, z25.b, z1.b[0]\n"
+ "sdot z18.s, z25.b, z2.b[0]\n"
+ "sdot z22.s, z25.b, z3.b[0]\n"
+ "sdot z11.s, z24.b, z0.b[0]\n"
+ "sdot z15.s, z24.b, z1.b[0]\n"
+ "sdot z19.s, z24.b, z2.b[0]\n"
+ "sdot z23.s, z24.b, z3.b[0]\n"
"ble 43f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[1]\n"
+ "sdot z12.s, z25.b, z1.b[1]\n"
+ "sdot z16.s, z25.b, z2.b[1]\n"
+ "sdot z20.s, z25.b, z3.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z9.s, z24.b, z0.b[1]\n"
+ "sdot z13.s, z24.b, z1.b[1]\n"
+ "sdot z17.s, z24.b, z2.b[1]\n"
+ "sdot z21.s, z24.b, z3.b[1]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z25.b, z0.b[1]\n"
+ "sdot z14.s, z25.b, z1.b[1]\n"
+ "sdot z18.s, z25.b, z2.b[1]\n"
+ "sdot z22.s, z25.b, z3.b[1]\n"
+ "sdot z11.s, z24.b, z0.b[1]\n"
+ "sdot z15.s, z24.b, z1.b[1]\n"
+ "sdot z19.s, z24.b, z2.b[1]\n"
+ "sdot z23.s, z24.b, z3.b[1]\n"
"ble 43f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[2]\n"
+ "sdot z12.s, z25.b, z1.b[2]\n"
+ "sdot z16.s, z25.b, z2.b[2]\n"
+ "sdot z20.s, z25.b, z3.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z9.s, z24.b, z0.b[2]\n"
+ "sdot z13.s, z24.b, z1.b[2]\n"
+ "sdot z17.s, z24.b, z2.b[2]\n"
+ "sdot z21.s, z24.b, z3.b[2]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z25.b, z0.b[2]\n"
+ "sdot z14.s, z25.b, z1.b[2]\n"
+ "sdot z18.s, z25.b, z2.b[2]\n"
+ "sdot z22.s, z25.b, z3.b[2]\n"
+ "sdot z11.s, z24.b, z0.b[2]\n"
+ "sdot z15.s, z24.b, z1.b[2]\n"
+ "sdot z19.s, z24.b, z2.b[2]\n"
+ "sdot z23.s, z24.b, z3.b[2]\n"
"ble 43f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z25.b, z0.b[3]\n"
+ "sdot z12.s, z25.b, z1.b[3]\n"
+ "sdot z16.s, z25.b, z2.b[3]\n"
+ "sdot z20.s, z25.b, z3.b[3]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z24.b, z0.b[3]\n"
+ "sdot z13.s, z24.b, z1.b[3]\n"
+ "sdot z17.s, z24.b, z2.b[3]\n"
+ "sdot z21.s, z24.b, z3.b[3]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z25.b, z0.b[3]\n"
+ "sdot z14.s, z25.b, z1.b[3]\n"
+ "sdot z18.s, z25.b, z2.b[3]\n"
+ "sdot z22.s, z25.b, z3.b[3]\n"
+ "sdot z11.s, z24.b, z0.b[3]\n"
+ "sdot z15.s, z24.b, z1.b[3]\n"
+ "sdot z19.s, z24.b, z2.b[3]\n"
+ "sdot z23.s, z24.b, z3.b[3]\n"
"43:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 38b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "add x21, x22, x19, LSL #2\n"
- "st1w { z12.s }, p4, [x23]\n"
- "addvl x28, x28, #4\n"
- "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x22]\n"
- "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x21]\n"
- "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x22]\n"
+ "st1w { z13.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x20]\n"
+ "st1w { z21.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x20, #3, MUL VL]\n"
"44:" // Height 4: Writeback done
- "decw x10, ALL, MUL #4\n"
- "cmp x10, XZR\n"
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
"bgt 35b\n"
"b 68f\n"
"45:" // Height 5
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"46:" // Height 5: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x10\n"
- "incw x19\n"
- "whilelt p3.s, x19, x10\n"
- "incw x19\n"
- "whilelt p2.s, x19, x10\n"
- "incw x19\n"
- "whilelt p1.s, x19, x10\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 47f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "add x21, x22, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x23]\n"
- "add x20, x21, x19, LSL #2\n"
"ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
@@ -1068,293 +1038,278 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
"48:" // Height 5: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"49:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "cbnz x27, 51f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 51f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
"b 51f\n"
"50:" // Height 5: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"51:" // Height 5: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 53f\n"
"52:" // Height 5: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z4.b }, p0/Z, [x26]\n"
+ "ld1rqb { z3.b }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x22]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z29.b, z4.b[0]\n"
+ "sdot z12.s, z29.b, z3.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z16.s, z29.b, z2.b[0]\n"
+ "sdot z20.s, z29.b, z1.b[0]\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "sdot z24.s, z29.b, z0.b[0]\n"
+ "sdot z9.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "sdot z13.s, z28.b, z3.b[0]\n"
+ "sdot z17.s, z28.b, z2.b[0]\n"
"add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x21, x21, #0x10\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z21.s, z28.b, z1.b[0]\n"
+ "sdot z25.s, z28.b, z0.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z29.b, z4.b[0]\n"
+ "sdot z14.s, z29.b, z3.b[0]\n"
+ "sdot z18.s, z29.b, z2.b[0]\n"
+ "sdot z22.s, z29.b, z1.b[0]\n"
+ "sdot z26.s, z29.b, z0.b[0]\n"
+ "sdot z11.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "sdot z15.s, z28.b, z3.b[0]\n"
+ "sdot z19.s, z28.b, z2.b[0]\n"
+ "sdot z23.s, z28.b, z1.b[0]\n"
+ "sdot z27.s, z28.b, z0.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sdot z8.s, z29.b, z4.b[1]\n"
+ "sdot z12.s, z29.b, z3.b[1]\n"
+ "sdot z16.s, z29.b, z2.b[1]\n"
+ "sdot z20.s, z29.b, z1.b[1]\n"
+ "sdot z24.s, z29.b, z0.b[1]\n"
+ "sdot z9.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "sdot z13.s, z28.b, z3.b[1]\n"
+ "sdot z17.s, z28.b, z2.b[1]\n"
+ "sdot z21.s, z28.b, z1.b[1]\n"
+ "sdot z25.s, z28.b, z0.b[1]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "sdot z10.s, z29.b, z4.b[1]\n"
+ "sdot z14.s, z29.b, z3.b[1]\n"
+ "sdot z18.s, z29.b, z2.b[1]\n"
+ "sdot z22.s, z29.b, z1.b[1]\n"
+ "sdot z26.s, z29.b, z0.b[1]\n"
+ "sdot z11.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "sdot z15.s, z28.b, z3.b[1]\n"
+ "sdot z19.s, z28.b, z2.b[1]\n"
+ "sdot z23.s, z28.b, z1.b[1]\n"
+ "sdot z27.s, z28.b, z0.b[1]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "sdot z8.s, z29.b, z4.b[2]\n"
+ "sdot z12.s, z29.b, z3.b[2]\n"
+ "sdot z16.s, z29.b, z2.b[2]\n"
+ "sdot z20.s, z29.b, z1.b[2]\n"
+ "sdot z24.s, z29.b, z0.b[2]\n"
+ "sdot z9.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "sdot z13.s, z28.b, z3.b[2]\n"
+ "sdot z17.s, z28.b, z2.b[2]\n"
+ "sdot z21.s, z28.b, z1.b[2]\n"
+ "sdot z25.s, z28.b, z0.b[2]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "sdot z10.s, z29.b, z4.b[2]\n"
+ "sdot z14.s, z29.b, z3.b[2]\n"
+ "sdot z18.s, z29.b, z2.b[2]\n"
+ "sdot z22.s, z29.b, z1.b[2]\n"
+ "sdot z26.s, z29.b, z0.b[2]\n"
+ "sdot z11.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "sdot z15.s, z28.b, z3.b[2]\n"
+ "sdot z19.s, z28.b, z2.b[2]\n"
+ "sdot z23.s, z28.b, z1.b[2]\n"
+ "sdot z27.s, z28.b, z0.b[2]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "sdot z8.s, z29.b, z4.b[3]\n"
+ "sdot z12.s, z29.b, z3.b[3]\n"
+ "sdot z16.s, z29.b, z2.b[3]\n"
+ "sdot z20.s, z29.b, z1.b[3]\n"
+ "sdot z24.s, z29.b, z0.b[3]\n"
+ "sdot z9.s, z28.b, z4.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "sdot z13.s, z28.b, z3.b[3]\n"
+ "sdot z17.s, z28.b, z2.b[3]\n"
+ "sdot z21.s, z28.b, z1.b[3]\n"
+ "sdot z25.s, z28.b, z0.b[3]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sdot z10.s, z29.b, z4.b[3]\n"
+ "sdot z14.s, z29.b, z3.b[3]\n"
+ "sdot z18.s, z29.b, z2.b[3]\n"
+ "sdot z22.s, z29.b, z1.b[3]\n"
+ "sdot z26.s, z29.b, z0.b[3]\n"
+ "sdot z11.s, z28.b, z4.b[3]\n"
+ "sdot z15.s, z28.b, z3.b[3]\n"
+ "sdot z19.s, z28.b, z2.b[3]\n"
+ "sdot z23.s, z28.b, z1.b[3]\n"
+ "sdot z27.s, z28.b, z0.b[3]\n"
"bgt 52b\n"
"53:" // Height 5: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "add x21, x21, #0x10\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "sdot z8.s, z29.b, z0.b[0]\n"
+ "sdot z12.s, z29.b, z1.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z16.s, z29.b, z2.b[0]\n"
+ "sdot z20.s, z29.b, z3.b[0]\n"
+ "sdot z24.s, z29.b, z4.b[0]\n"
+ "sdot z9.s, z28.b, z0.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[0]\n"
+ "sdot z17.s, z28.b, z2.b[0]\n"
+ "sdot z21.s, z28.b, z3.b[0]\n"
+ "sdot z25.s, z28.b, z4.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z29.b, z0.b[0]\n"
+ "sdot z14.s, z29.b, z1.b[0]\n"
+ "sdot z18.s, z29.b, z2.b[0]\n"
+ "sdot z22.s, z29.b, z3.b[0]\n"
+ "sdot z26.s, z29.b, z4.b[0]\n"
+ "sdot z11.s, z28.b, z0.b[0]\n"
+ "sdot z15.s, z28.b, z1.b[0]\n"
+ "sdot z19.s, z28.b, z2.b[0]\n"
+ "sdot z23.s, z28.b, z3.b[0]\n"
+ "sdot z27.s, z28.b, z4.b[0]\n"
"ble 54f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z29.b, z0.b[1]\n"
+ "sdot z12.s, z29.b, z1.b[1]\n"
+ "sdot z16.s, z29.b, z2.b[1]\n"
+ "sdot z20.s, z29.b, z3.b[1]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z24.s, z29.b, z4.b[1]\n"
+ "sdot z9.s, z28.b, z0.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[1]\n"
+ "sdot z17.s, z28.b, z2.b[1]\n"
+ "sdot z21.s, z28.b, z3.b[1]\n"
+ "sdot z25.s, z28.b, z4.b[1]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z29.b, z0.b[1]\n"
+ "sdot z14.s, z29.b, z1.b[1]\n"
+ "sdot z18.s, z29.b, z2.b[1]\n"
+ "sdot z22.s, z29.b, z3.b[1]\n"
+ "sdot z26.s, z29.b, z4.b[1]\n"
+ "sdot z11.s, z28.b, z0.b[1]\n"
+ "sdot z15.s, z28.b, z1.b[1]\n"
+ "sdot z19.s, z28.b, z2.b[1]\n"
+ "sdot z23.s, z28.b, z3.b[1]\n"
+ "sdot z27.s, z28.b, z4.b[1]\n"
"ble 54f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z29.b, z0.b[2]\n"
+ "sdot z12.s, z29.b, z1.b[2]\n"
+ "sdot z16.s, z29.b, z2.b[2]\n"
+ "sdot z20.s, z29.b, z3.b[2]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z24.s, z29.b, z4.b[2]\n"
+ "sdot z9.s, z28.b, z0.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[2]\n"
+ "sdot z17.s, z28.b, z2.b[2]\n"
+ "sdot z21.s, z28.b, z3.b[2]\n"
+ "sdot z25.s, z28.b, z4.b[2]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z29.b, z0.b[2]\n"
+ "sdot z14.s, z29.b, z1.b[2]\n"
+ "sdot z18.s, z29.b, z2.b[2]\n"
+ "sdot z22.s, z29.b, z3.b[2]\n"
+ "sdot z26.s, z29.b, z4.b[2]\n"
+ "sdot z11.s, z28.b, z0.b[2]\n"
+ "sdot z15.s, z28.b, z1.b[2]\n"
+ "sdot z19.s, z28.b, z2.b[2]\n"
+ "sdot z23.s, z28.b, z3.b[2]\n"
+ "sdot z27.s, z28.b, z4.b[2]\n"
"ble 54f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z29.b, z0.b[3]\n"
+ "sdot z12.s, z29.b, z1.b[3]\n"
+ "sdot z16.s, z29.b, z2.b[3]\n"
+ "sdot z20.s, z29.b, z3.b[3]\n"
+ "sdot z24.s, z29.b, z4.b[3]\n"
+ "sdot z9.s, z28.b, z0.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z13.s, z28.b, z1.b[3]\n"
+ "sdot z17.s, z28.b, z2.b[3]\n"
+ "sdot z21.s, z28.b, z3.b[3]\n"
+ "sdot z25.s, z28.b, z4.b[3]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z29.b, z0.b[3]\n"
+ "sdot z14.s, z29.b, z1.b[3]\n"
+ "sdot z18.s, z29.b, z2.b[3]\n"
+ "sdot z22.s, z29.b, z3.b[3]\n"
+ "sdot z26.s, z29.b, z4.b[3]\n"
+ "sdot z11.s, z28.b, z0.b[3]\n"
+ "sdot z15.s, z28.b, z1.b[3]\n"
+ "sdot z19.s, z28.b, z2.b[3]\n"
+ "sdot z23.s, z28.b, z3.b[3]\n"
+ "sdot z27.s, z28.b, z4.b[3]\n"
"54:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 49b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "add x21, x22, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"st1w { z12.s }, p4, [x23]\n"
- "add x20, x21, x19, LSL #2\n"
"st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
- "addvl x28, x28, #4\n"
"st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
"st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
"st1w { z16.s }, p4, [x22]\n"
@@ -1370,57 +1325,57 @@ void sve_hybrid_s8s32_dot_6x4VL (
"st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
"st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
"55:" // Height 5: Writeback done
- "decw x10, ALL, MUL #4\n"
- "cmp x10, XZR\n"
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
"bgt 46b\n"
"b 68f\n"
"56:" // Height 6
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x18\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"57:" // Height 6: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x10\n"
- "incw x19\n"
- "whilelt p3.s, x19, x10\n"
- "incw x19\n"
- "whilelt p2.s, x19, x10\n"
- "incw x19\n"
- "whilelt p1.s, x19, x10\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 58f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "add x21, x22, x19, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x23]\n"
- "add x20, x21, x19, LSL #2\n"
- "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "add x19, x20, x19, LSL #2\n"
- "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x22]\n"
- "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x21]\n"
- "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x20]\n"
- "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x19]\n"
- "ld1w { z29.s }, p3/Z, [x19, #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x19, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x19, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x21]\n"
+ "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 59f\n"
"58:" // Height 6: no accumulate
"mov z8.s, #0x0\n"
@@ -1448,375 +1403,356 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov z30.s, #0x0\n"
"mov z31.s, #0x0\n"
"59:" // Height 6: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"60:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x27, 62f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 62f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
"b 62f\n"
"61:" // Height 6: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"62:" // Height 6: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 64f\n"
"63:" // Height 6: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z6.b }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x10\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z4.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[0]\n"
+ "sdot z12.s, z1.b, z6.b[0]\n"
+ "sdot z16.s, z1.b, z5.b[0]\n"
+ "sdot z20.s, z1.b, z4.b[0]\n"
"add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "sdot z24.s, z1.b, z3.b[0]\n"
+ "sdot z28.s, z1.b, z2.b[0]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
"add x21, x21, #0x10\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x20, x20, #0x10\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
- "sdot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "sdot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z30.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
- "sdot z31.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "sdot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "sdot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z30.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
- "sdot z31.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "sdot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "sdot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z30.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
- "sdot z31.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "sdot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "sdot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z30.s, z6.b, z5.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
- "sdot z31.s, z7.b, z5.b[3]\n"
+ "sdot z9.s, z0.b, z7.b[0]\n"
+ "sdot z13.s, z0.b, z6.b[0]\n"
+ "sdot z17.s, z0.b, z5.b[0]\n"
+ "sdot z21.s, z0.b, z4.b[0]\n"
+ "sdot z25.s, z0.b, z3.b[0]\n"
+ "sdot z29.s, z0.b, z2.b[0]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "sdot z10.s, z1.b, z7.b[0]\n"
+ "sdot z14.s, z1.b, z6.b[0]\n"
+ "sdot z18.s, z1.b, z5.b[0]\n"
+ "sdot z22.s, z1.b, z4.b[0]\n"
+ "sdot z26.s, z1.b, z3.b[0]\n"
+ "sdot z30.s, z1.b, z2.b[0]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "sdot z11.s, z0.b, z7.b[0]\n"
+ "sdot z15.s, z0.b, z6.b[0]\n"
+ "sdot z19.s, z0.b, z5.b[0]\n"
+ "sdot z23.s, z0.b, z4.b[0]\n"
+ "sdot z27.s, z0.b, z3.b[0]\n"
+ "sdot z31.s, z0.b, z2.b[0]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[1]\n"
+ "sdot z12.s, z1.b, z6.b[1]\n"
+ "sdot z16.s, z1.b, z5.b[1]\n"
+ "sdot z20.s, z1.b, z4.b[1]\n"
+ "sdot z24.s, z1.b, z3.b[1]\n"
+ "sdot z28.s, z1.b, z2.b[1]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[1]\n"
+ "sdot z13.s, z0.b, z6.b[1]\n"
+ "sdot z17.s, z0.b, z5.b[1]\n"
+ "sdot z21.s, z0.b, z4.b[1]\n"
+ "sdot z25.s, z0.b, z3.b[1]\n"
+ "sdot z29.s, z0.b, z2.b[1]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "sdot z10.s, z1.b, z7.b[1]\n"
+ "sdot z14.s, z1.b, z6.b[1]\n"
+ "sdot z18.s, z1.b, z5.b[1]\n"
+ "sdot z22.s, z1.b, z4.b[1]\n"
+ "sdot z26.s, z1.b, z3.b[1]\n"
+ "sdot z30.s, z1.b, z2.b[1]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "sdot z11.s, z0.b, z7.b[1]\n"
+ "sdot z15.s, z0.b, z6.b[1]\n"
+ "sdot z19.s, z0.b, z5.b[1]\n"
+ "sdot z23.s, z0.b, z4.b[1]\n"
+ "sdot z27.s, z0.b, z3.b[1]\n"
+ "sdot z31.s, z0.b, z2.b[1]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[2]\n"
+ "sdot z12.s, z1.b, z6.b[2]\n"
+ "sdot z16.s, z1.b, z5.b[2]\n"
+ "sdot z20.s, z1.b, z4.b[2]\n"
+ "sdot z24.s, z1.b, z3.b[2]\n"
+ "sdot z28.s, z1.b, z2.b[2]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[2]\n"
+ "sdot z13.s, z0.b, z6.b[2]\n"
+ "sdot z17.s, z0.b, z5.b[2]\n"
+ "sdot z21.s, z0.b, z4.b[2]\n"
+ "sdot z25.s, z0.b, z3.b[2]\n"
+ "sdot z29.s, z0.b, z2.b[2]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "sdot z10.s, z1.b, z7.b[2]\n"
+ "sdot z14.s, z1.b, z6.b[2]\n"
+ "sdot z18.s, z1.b, z5.b[2]\n"
+ "sdot z22.s, z1.b, z4.b[2]\n"
+ "sdot z26.s, z1.b, z3.b[2]\n"
+ "sdot z30.s, z1.b, z2.b[2]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "sdot z11.s, z0.b, z7.b[2]\n"
+ "sdot z15.s, z0.b, z6.b[2]\n"
+ "sdot z19.s, z0.b, z5.b[2]\n"
+ "sdot z23.s, z0.b, z4.b[2]\n"
+ "sdot z27.s, z0.b, z3.b[2]\n"
+ "sdot z31.s, z0.b, z2.b[2]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "sdot z8.s, z1.b, z7.b[3]\n"
+ "sdot z12.s, z1.b, z6.b[3]\n"
+ "sdot z16.s, z1.b, z5.b[3]\n"
+ "sdot z20.s, z1.b, z4.b[3]\n"
+ "sdot z24.s, z1.b, z3.b[3]\n"
+ "sdot z28.s, z1.b, z2.b[3]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "sdot z9.s, z0.b, z7.b[3]\n"
+ "sdot z13.s, z0.b, z6.b[3]\n"
+ "sdot z17.s, z0.b, z5.b[3]\n"
+ "sdot z21.s, z0.b, z4.b[3]\n"
+ "sdot z25.s, z0.b, z3.b[3]\n"
+ "sdot z29.s, z0.b, z2.b[3]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sdot z10.s, z1.b, z7.b[3]\n"
+ "sdot z14.s, z1.b, z6.b[3]\n"
+ "sdot z18.s, z1.b, z5.b[3]\n"
+ "sdot z22.s, z1.b, z4.b[3]\n"
+ "sdot z26.s, z1.b, z3.b[3]\n"
+ "sdot z30.s, z1.b, z2.b[3]\n"
+ "sdot z11.s, z0.b, z7.b[3]\n"
+ "sdot z15.s, z0.b, z6.b[3]\n"
+ "sdot z19.s, z0.b, z5.b[3]\n"
+ "sdot z23.s, z0.b, z4.b[3]\n"
+ "sdot z27.s, z0.b, z3.b[3]\n"
+ "sdot z31.s, z0.b, z2.b[3]\n"
"bgt 63b\n"
"64:" // Height 6: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
- "sdot z20.s, z6.b, z3.b[0]\n"
- "add x20, x20, #0x10\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "sdot z24.s, z6.b, z4.b[0]\n"
- "sdot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z21.s, z7.b, z3.b[0]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
- "sdot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "sdot z14.s, z6.b, z1.b[0]\n"
- "sdot z18.s, z6.b, z2.b[0]\n"
- "sdot z22.s, z6.b, z3.b[0]\n"
- "sdot z26.s, z6.b, z4.b[0]\n"
- "sdot z30.s, z6.b, z5.b[0]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "sdot z15.s, z7.b, z1.b[0]\n"
- "sdot z19.s, z7.b, z2.b[0]\n"
- "sdot z23.s, z7.b, z3.b[0]\n"
- "sdot z27.s, z7.b, z4.b[0]\n"
- "sdot z31.s, z7.b, z5.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[0]\n"
+ "sdot z12.s, z7.b, z1.b[0]\n"
+ "sdot z16.s, z7.b, z2.b[0]\n"
+ "sdot z20.s, z7.b, z3.b[0]\n"
+ "sdot z24.s, z7.b, z4.b[0]\n"
+ "sdot z28.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[0]\n"
+ "sdot z13.s, z6.b, z1.b[0]\n"
+ "sdot z17.s, z6.b, z2.b[0]\n"
+ "sdot z21.s, z6.b, z3.b[0]\n"
+ "sdot z25.s, z6.b, z4.b[0]\n"
+ "sdot z29.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z7.b, z0.b[0]\n"
+ "sdot z14.s, z7.b, z1.b[0]\n"
+ "sdot z18.s, z7.b, z2.b[0]\n"
+ "sdot z22.s, z7.b, z3.b[0]\n"
+ "sdot z26.s, z7.b, z4.b[0]\n"
+ "sdot z30.s, z7.b, z5.b[0]\n"
+ "sdot z11.s, z6.b, z0.b[0]\n"
+ "sdot z15.s, z6.b, z1.b[0]\n"
+ "sdot z19.s, z6.b, z2.b[0]\n"
+ "sdot z23.s, z6.b, z3.b[0]\n"
+ "sdot z27.s, z6.b, z4.b[0]\n"
+ "sdot z31.s, z6.b, z5.b[0]\n"
"ble 65f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[1]\n"
- "sdot z16.s, z6.b, z2.b[1]\n"
- "sdot z20.s, z6.b, z3.b[1]\n"
- "sdot z24.s, z6.b, z4.b[1]\n"
- "sdot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "sdot z13.s, z7.b, z1.b[1]\n"
- "sdot z17.s, z7.b, z2.b[1]\n"
- "sdot z21.s, z7.b, z3.b[1]\n"
- "sdot z25.s, z7.b, z4.b[1]\n"
- "sdot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "sdot z14.s, z6.b, z1.b[1]\n"
- "sdot z18.s, z6.b, z2.b[1]\n"
- "sdot z22.s, z6.b, z3.b[1]\n"
- "sdot z26.s, z6.b, z4.b[1]\n"
- "sdot z30.s, z6.b, z5.b[1]\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "sdot z15.s, z7.b, z1.b[1]\n"
- "sdot z19.s, z7.b, z2.b[1]\n"
- "sdot z23.s, z7.b, z3.b[1]\n"
- "sdot z27.s, z7.b, z4.b[1]\n"
- "sdot z31.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[1]\n"
+ "sdot z12.s, z7.b, z1.b[1]\n"
+ "sdot z16.s, z7.b, z2.b[1]\n"
+ "sdot z20.s, z7.b, z3.b[1]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z24.s, z7.b, z4.b[1]\n"
+ "sdot z28.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[1]\n"
+ "sdot z13.s, z6.b, z1.b[1]\n"
+ "sdot z17.s, z6.b, z2.b[1]\n"
+ "sdot z21.s, z6.b, z3.b[1]\n"
+ "sdot z25.s, z6.b, z4.b[1]\n"
+ "sdot z29.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z7.b, z0.b[1]\n"
+ "sdot z14.s, z7.b, z1.b[1]\n"
+ "sdot z18.s, z7.b, z2.b[1]\n"
+ "sdot z22.s, z7.b, z3.b[1]\n"
+ "sdot z26.s, z7.b, z4.b[1]\n"
+ "sdot z30.s, z7.b, z5.b[1]\n"
+ "sdot z11.s, z6.b, z0.b[1]\n"
+ "sdot z15.s, z6.b, z1.b[1]\n"
+ "sdot z19.s, z6.b, z2.b[1]\n"
+ "sdot z23.s, z6.b, z3.b[1]\n"
+ "sdot z27.s, z6.b, z4.b[1]\n"
+ "sdot z31.s, z6.b, z5.b[1]\n"
"ble 65f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "sdot z12.s, z6.b, z1.b[2]\n"
- "sdot z16.s, z6.b, z2.b[2]\n"
- "sdot z20.s, z6.b, z3.b[2]\n"
- "sdot z24.s, z6.b, z4.b[2]\n"
- "sdot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "sdot z13.s, z7.b, z1.b[2]\n"
- "sdot z17.s, z7.b, z2.b[2]\n"
- "sdot z21.s, z7.b, z3.b[2]\n"
- "sdot z25.s, z7.b, z4.b[2]\n"
- "sdot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z14.s, z6.b, z1.b[2]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z22.s, z6.b, z3.b[2]\n"
- "sdot z26.s, z6.b, z4.b[2]\n"
- "sdot z30.s, z6.b, z5.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "sdot z15.s, z7.b, z1.b[2]\n"
- "sdot z19.s, z7.b, z2.b[2]\n"
- "sdot z23.s, z7.b, z3.b[2]\n"
- "sdot z27.s, z7.b, z4.b[2]\n"
- "sdot z31.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[2]\n"
+ "sdot z12.s, z7.b, z1.b[2]\n"
+ "sdot z16.s, z7.b, z2.b[2]\n"
+ "sdot z20.s, z7.b, z3.b[2]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z24.s, z7.b, z4.b[2]\n"
+ "sdot z28.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[2]\n"
+ "sdot z13.s, z6.b, z1.b[2]\n"
+ "sdot z17.s, z6.b, z2.b[2]\n"
+ "sdot z21.s, z6.b, z3.b[2]\n"
+ "sdot z25.s, z6.b, z4.b[2]\n"
+ "sdot z29.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z7.b, z0.b[2]\n"
+ "sdot z14.s, z7.b, z1.b[2]\n"
+ "sdot z18.s, z7.b, z2.b[2]\n"
+ "sdot z22.s, z7.b, z3.b[2]\n"
+ "sdot z26.s, z7.b, z4.b[2]\n"
+ "sdot z30.s, z7.b, z5.b[2]\n"
+ "sdot z11.s, z6.b, z0.b[2]\n"
+ "sdot z15.s, z6.b, z1.b[2]\n"
+ "sdot z19.s, z6.b, z2.b[2]\n"
+ "sdot z23.s, z6.b, z3.b[2]\n"
+ "sdot z27.s, z6.b, z4.b[2]\n"
+ "sdot z31.s, z6.b, z5.b[2]\n"
"ble 65f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sdot z12.s, z6.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z2.b[3]\n"
- "sdot z20.s, z6.b, z3.b[3]\n"
- "sdot z24.s, z6.b, z4.b[3]\n"
- "sdot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "sdot z13.s, z7.b, z1.b[3]\n"
- "sdot z17.s, z7.b, z2.b[3]\n"
- "sdot z21.s, z7.b, z3.b[3]\n"
- "sdot z25.s, z7.b, z4.b[3]\n"
- "sdot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "sdot z10.s, z6.b, z0.b[3]\n"
- "sdot z14.s, z6.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z2.b[3]\n"
- "sdot z22.s, z6.b, z3.b[3]\n"
- "sdot z26.s, z6.b, z4.b[3]\n"
- "sdot z30.s, z6.b, z5.b[3]\n"
- "sdot z11.s, z7.b, z0.b[3]\n"
- "sdot z15.s, z7.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z2.b[3]\n"
- "sdot z23.s, z7.b, z3.b[3]\n"
- "sdot z27.s, z7.b, z4.b[3]\n"
- "sdot z31.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "sdot z8.s, z7.b, z0.b[3]\n"
+ "sdot z12.s, z7.b, z1.b[3]\n"
+ "sdot z16.s, z7.b, z2.b[3]\n"
+ "sdot z20.s, z7.b, z3.b[3]\n"
+ "sdot z24.s, z7.b, z4.b[3]\n"
+ "sdot z28.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sdot z9.s, z6.b, z0.b[3]\n"
+ "sdot z13.s, z6.b, z1.b[3]\n"
+ "sdot z17.s, z6.b, z2.b[3]\n"
+ "sdot z21.s, z6.b, z3.b[3]\n"
+ "sdot z25.s, z6.b, z4.b[3]\n"
+ "sdot z29.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "sdot z10.s, z7.b, z0.b[3]\n"
+ "sdot z14.s, z7.b, z1.b[3]\n"
+ "sdot z18.s, z7.b, z2.b[3]\n"
+ "sdot z22.s, z7.b, z3.b[3]\n"
+ "sdot z26.s, z7.b, z4.b[3]\n"
+ "sdot z30.s, z7.b, z5.b[3]\n"
+ "sdot z11.s, z6.b, z0.b[3]\n"
+ "sdot z15.s, z6.b, z1.b[3]\n"
+ "sdot z19.s, z6.b, z2.b[3]\n"
+ "sdot z23.s, z6.b, z3.b[3]\n"
+ "sdot z27.s, z6.b, z4.b[3]\n"
+ "sdot z31.s, z6.b, z5.b[3]\n"
"65:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 60b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "add x21, x22, x19, LSL #2\n"
- "st1w { z12.s }, p4, [x23]\n"
- "add x20, x21, x19, LSL #2\n"
- "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
- "add x19, x20, x19, LSL #2\n"
- "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x22]\n"
- "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x21]\n"
- "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x20]\n"
- "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
- "st1w { z28.s }, p4, [x19]\n"
- "st1w { z29.s }, p3, [x19, #1, MUL VL]\n"
- "st1w { z30.s }, p2, [x19, #2, MUL VL]\n"
- "st1w { z31.s }, p1, [x19, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x24]\n"
+ "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x22]\n"
+ "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z28.s }, p4, [x20]\n"
+ "st1w { z29.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x20, #3, MUL VL]\n"
"66:" // Height 6: Writeback done
- "decw x10, ALL, MUL #4\n"
- "cmp x10, XZR\n"
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
"bgt 57b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 68f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 67f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"67:" // Update direct input
- "mov x19, #0x6\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"68:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
new file mode 100644
index 0000000000..686295496e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int32_t>, \
+ const int32_t *, Activation, bool
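+// ARGLIST spells out the kernel entry point's parameters: the string count
+// and per-string lengths (sections of the K dimension), the possibly-indirect
+// A argument, M, N, the pre-packed B pointer, the output argument, and the
+// trailing bias/activation/accumulate parameters (bias and activation are
+// unused by this s8s32 variant).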
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_s8s32_mmla_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8s32_mmla_6x4VL
+{
+public:
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
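+ // Together these describe the output tile: 6 rows by 4 SVE vectors of
+ // int32 columns, with K consumed 8 int8 values at a time to match the
+ // 8-deep smmla accumulation step.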
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 8> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 54.42 };
+ case CPUModel::A510:
+ return { 24.21 };
+ case CPUModel::V1:
+ return { 104.92 };
+ }
+ }
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 54.99, 15.37, 0.62 };
+ case CPUModel::A510:
+ return { 23.87, 3.89, 0.37 };
+ case CPUModel::V1:
+ return { 75.14, 15.87, 0.83 };
+ }
+ }
+
+ return { 1.0 };
+ }
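+ // Rough per-CPU-model throughput estimates consumed by arm_gemm's kernel
+ // selection heuristics; the three-value form appears to model additional
+ // merge/prepare costs (see performance_parameters.hpp).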
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_s8s32_mmla_6x4VL;
+ cls_sve_hybrid_s8s32_mmla_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..f66b6345ea
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
@@ -0,0 +1,1674 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8s32_mmla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+ const int32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
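+ // KernelArgs packs everything the inline assembly reads through the
+ // offsetof_* operands, so one args_ptr register can address all of it.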
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
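+ // flags bit 0 requests accumulation, bit 2 marks an indirect output and
+ // bit 3 an indirect input; the assembly tests bits 0 and 3 with tbz.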
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 56f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 45f\n"
+ "beq 34f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 23f\n"
+ "beq 12f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "tbz %x[flags], #0, 3f\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 4f\n"
+ "3:" // Height 1: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "ble 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n"
+ ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n"
+ ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45109a8a // smmla z10.s, z20.b, z16.b\n"
+ ".inst 0x45079a8e // smmla z14.s, z20.b, z7.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n"
+ ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n"
+ "add x26, x26, #0x10\n"
+ "bgt 8b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "addvl x10, x10, #8\n"
+ "ble 10f\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n"
+ ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n"
+ ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n"
+ ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n"
+ ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n"
+ "addvl x10, x10, #8\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 5b\n"
+ "uzp1 z8.d, z8.d, z12.d\n"
+ "uzp1 z9.d, z9.d, z13.d\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "uzp1 z10.d, z10.d, z14.d\n"
+ "uzp1 z11.d, z11.d, z15.d\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "11:" // Height 1: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 2b\n"
+ "b 68f\n"
+ "12:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "13:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "tbz %x[flags], #0, 14f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z18.s }, p4/Z, [x9]\n"
+ "ld1w { z2.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z18.d, z12.d\n"
+ "zip2 z12.d, z18.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z2.d, z13.d\n"
+ "zip2 z13.d, z2.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 15f\n"
+ "14:" // Height 2: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "15:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "16:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 18f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "b 18f\n"
+ "17:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "18:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "ble 20f\n"
+ "19:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n"
+ ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n"
+ ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45119a8a // smmla z10.s, z20.b, z17.b\n"
+ ".inst 0x45109a8e // smmla z14.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n"
+ ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "bgt 19b\n"
+ "20:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n"
+ ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n"
+ ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n"
+ ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n"
+ ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n"
+ "addvl x10, x10, #8\n"
+ "ble 21f\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n"
+ ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n"
+ ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n"
+ ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n"
+ ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n"
+ "addvl x10, x10, #8\n"
+ "21:" // Height 2: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 16b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "uzp1 z16.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z17.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "st1w { z16.s }, p4, [x9]\n"
+ "uzp1 z16.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "st1w { z17.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp1 z2.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "st1w { z16.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z8.s }, p4, [x20]\n"
+ "st1w { z9.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x20, #3, MUL VL]\n"
+ "22:" // Height 2: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 13b\n"
+ "b 68f\n"
+ "23:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "24:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "tbz %x[flags], #0, 25f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 26f\n"
+ "25:" // Height 3: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "26:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "27:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 28f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 29f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "b 29f\n"
+ "28:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "29:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "ble 31f\n"
+ "30:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
+ ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n"
+ ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n"
+ ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n"
+ ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n"
+ ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n"
+ ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n"
+ ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n"
+ ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n"
+ ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n"
+ ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n"
+ "bgt 30b\n"
+ "31:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "trn1 z27.d, z1.d, z24.d\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
+ ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
+ "ble 32f\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n"
+ ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n"
+ ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n"
+ ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n"
+ ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n"
+ ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n"
+ ".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n"
+ ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n"
+ ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n"
+ ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n"
+ ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n"
+ "32:" // Height 3: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 27b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp1 z25.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z24.d, z9.d, z13.d\n"
+ "st1w { z25.s }, p4, [x9]\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z25.d, z10.d, z14.d\n"
+ "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z24.d, z11.d, z15.d\n"
+ "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "st1w { z8.s }, p4, [x21]\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "st1w { z9.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x20]\n"
+ "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
+ "33:" // Height 3: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 24b\n"
+ "b 68f\n"
+ "34:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "35:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "tbz %x[flags], #0, 36f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 37f\n"
+ "36:" // Height 4: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "37:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "38:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 39f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 40f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "b 40f\n"
+ "39:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "40:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "ble 42f\n"
+ "41:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199ba8 // smmla z8.s, z29.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189bac // smmla z12.s, z29.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45199ba9 // smmla z9.s, z29.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x45189bad // smmla z13.s, z29.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x45199baa // smmla z10.s, z29.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45189bae // smmla z14.s, z29.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x45199bab // smmla z11.s, z29.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x45189baf // smmla z15.s, z29.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n"
+ ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n"
+ ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n"
+ ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n"
+ ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n"
+ ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n"
+ ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n"
+ ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n"
+ ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n"
+ ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n"
+ "bgt 41b\n"
+ "42:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199b88 // smmla z8.s, z28.b, z25.b\n"
+ ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n"
+ ".inst 0x45189b8c // smmla z12.s, z28.b, z24.b\n"
+ ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45199b89 // smmla z9.s, z28.b, z25.b\n"
+ ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x45189b8d // smmla z13.s, z28.b, z24.b\n"
+ ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x45199b8a // smmla z10.s, z28.b, z25.b\n"
+ ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45189b8e // smmla z14.s, z28.b, z24.b\n"
+ ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x45199b8b // smmla z11.s, z28.b, z25.b\n"
+ ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n"
+ ".inst 0x45189b8f // smmla z15.s, z28.b, z24.b\n"
+ ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n"
+ "ble 43f\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n"
+ ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n"
+ ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n"
+ ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n"
+ ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n"
+ ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n"
+ ".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n"
+ ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n"
+ ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n"
+ ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n"
+ ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n"
+ "43:" // Height 4: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 38b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp1 z25.d, z8.d, z12.d\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z24.d, z9.d, z13.d\n"
+ "st1w { z25.s }, p4, [x9]\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z25.d, z10.d, z14.d\n"
+ "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z24.d, z11.d, z15.d\n"
+ "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z25.d, z16.d, z20.d\n"
+ "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z24.d, z17.d, z21.d\n"
+ "st1w { z8.s }, p4, [x22]\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z9.s }, p3, [x22, #1, MUL VL]\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z20.d, z19.d, z23.d\n"
+ "st1w { z10.s }, p2, [x22, #2, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "st1w { z11.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z25.s }, p4, [x21]\n"
+ "st1w { z24.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x20]\n"
+ "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
+ "44:" // Height 4: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 35b\n"
+ "b 68f\n"
+ "45:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "46:" // Height 5: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "tbz %x[flags], #0, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 48f\n"
+ "47:" // Height 5: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "48:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "49:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 51f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "b 51f\n"
+ "50:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "51:" // Height 5: input setup done
+ "cmp x27, #0x10\n"
+ "ble 53f\n"
+ "52:" // Height 5: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z6.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z7.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "trn1 z3.d, z7.d, z2.d\n"
+ "trn2 z7.d, z7.d, z2.d\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x450198a8 // smmla z8.s, z5.b, z1.b\n"
+ ".inst 0x45019870 // smmla z16.s, z3.b, z1.b\n"
+ ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x450098ac // smmla z12.s, z5.b, z0.b\n"
+ ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x450198a9 // smmla z9.s, z5.b, z1.b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45019871 // smmla z17.s, z3.b, z1.b\n"
+ ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x450098ad // smmla z13.s, z5.b, z0.b\n"
+ ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x450198aa // smmla z10.s, z5.b, z1.b\n"
+ ".inst 0x45019872 // smmla z18.s, z3.b, z1.b\n"
+ ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x450098ae // smmla z14.s, z5.b, z0.b\n"
+ ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
+ ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x450198ab // smmla z11.s, z5.b, z1.b\n"
+ ".inst 0x45019873 // smmla z19.s, z3.b, z1.b\n"
+ ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x450098af // smmla z15.s, z5.b, z0.b\n"
+ ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
+ ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
+ ".inst 0x450198f0 // smmla z16.s, z7.b, z1.b\n"
+ ".inst 0x45019898 // smmla z24.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
+ ".inst 0x450098f4 // smmla z20.s, z7.b, z0.b\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
+ ".inst 0x450198f1 // smmla z17.s, z7.b, z1.b\n"
+ ".inst 0x45019899 // smmla z25.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
+ ".inst 0x450098f5 // smmla z21.s, z7.b, z0.b\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n"
+ ".inst 0x450198f2 // smmla z18.s, z7.b, z1.b\n"
+ ".inst 0x4501989a // smmla z26.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n"
+ ".inst 0x450098f6 // smmla z22.s, z7.b, z0.b\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n"
+ ".inst 0x450198f3 // smmla z19.s, z7.b, z1.b\n"
+ ".inst 0x4501989b // smmla z27.s, z4.b, z1.b\n"
+ ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n"
+ ".inst 0x450098f7 // smmla z23.s, z7.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
+ "bgt 52b\n"
+ "53:" // Height 5: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z4.b }, p0/Z, [x25]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x22]\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n"
+ ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n"
+ ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n"
+ ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n"
+ ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n"
+ ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n"
+ ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n"
+ ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
+ "ble 54f\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n"
+ ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n"
+ ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
+ ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
+ ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n"
+ ".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n"
+ ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n"
+ ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
+ ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n"
+ ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n"
+ ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n"
+ ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
+ ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n"
+ ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n"
+ ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n"
+ ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n"
+ ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
+ ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n"
+ "54:" // Height 5: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 49b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp1 z2.d, z8.d, z12.d\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z1.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z0.d, z10.d, z14.d\n"
+ "st1w { z2.s }, p4, [x9]\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z2.d, z11.d, z15.d\n"
+ "st1w { z1.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z1.d, z16.d, z20.d\n"
+ "st1w { z0.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z0.d, z17.d, z21.d\n"
+ "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z8.s }, p4, [x23]\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z20.d, z19.d, z23.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "st1w { z1.s }, p4, [x22]\n"
+ "st1w { z0.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
+ "55:" // Height 5: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 46b\n"
+ "b 68f\n"
+ "56:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "57:" // Height 6: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "tbz %x[flags], #0, 58f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z17.s }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z17.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z20.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 59f\n"
+ "58:" // Height 6: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "59:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "60:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 62f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
+ "b 62f\n"
+ "61:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
+ "62:" // Height 6: input setup done
+ "cmp x27, #0x10\n"
+ "ble 64f\n"
+ "63:" // Height 6: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n"
+ ".inst 0x45019890 // smmla z16.s, z4.b, z1.b\n"
+ ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n"
+ ".inst 0x45009894 // smmla z20.s, z4.b, z0.b\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45019891 // smmla z17.s, z4.b, z1.b\n"
+ ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n"
+ ".inst 0x45009895 // smmla z21.s, z4.b, z0.b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45019892 // smmla z18.s, z4.b, z1.b\n"
+ ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n"
+ ".inst 0x45009896 // smmla z22.s, z4.b, z0.b\n"
+ ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n"
+ ".inst 0x45019893 // smmla z19.s, z4.b, z1.b\n"
+ ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n"
+ ".inst 0x45009897 // smmla z23.s, z4.b, z0.b\n"
+ ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x450198e8 // smmla z8.s, z7.b, z1.b\n"
+ ".inst 0x450198b0 // smmla z16.s, z5.b, z1.b\n"
+ ".inst 0x45019878 // smmla z24.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098b4 // smmla z20.s, z5.b, z0.b\n"
+ ".inst 0x4500987c // smmla z28.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x450198e9 // smmla z9.s, z7.b, z1.b\n"
+ ".inst 0x450198b1 // smmla z17.s, z5.b, z1.b\n"
+ ".inst 0x45019879 // smmla z25.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098b5 // smmla z21.s, z5.b, z0.b\n"
+ ".inst 0x4500987d // smmla z29.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x450198ea // smmla z10.s, z7.b, z1.b\n"
+ ".inst 0x450198b2 // smmla z18.s, z5.b, z1.b\n"
+ ".inst 0x4501987a // smmla z26.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098b6 // smmla z22.s, z5.b, z0.b\n"
+ ".inst 0x4500987e // smmla z30.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x450198eb // smmla z11.s, z7.b, z1.b\n"
+ ".inst 0x450198b3 // smmla z19.s, z5.b, z1.b\n"
+ ".inst 0x4501987b // smmla z27.s, z3.b, z1.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098b7 // smmla z23.s, z5.b, z0.b\n"
+ ".inst 0x4500987f // smmla z31.s, z3.b, z0.b\n"
+ "bgt 63b\n"
+ "64:" // Height 6: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x22]\n"
+ "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n"
+ ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n"
+ ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n"
+ ".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n"
+ ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n"
+ ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n"
+ ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n"
+ ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n"
+ ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n"
+ ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n"
+ ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n"
+ ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n"
+ ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n"
+ ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
+ "ble 65f\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n"
+ ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n"
+ ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
+ ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n"
+ ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n"
+ ".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n"
+ ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n"
+ ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n"
+ ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n"
+ ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n"
+ ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n"
+ ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n"
+ ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n"
+ ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n"
+ ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n"
+ ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n"
+ ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n"
+ ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n"
+ "65:" // Height 6: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 60b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp1 z0.d, z8.d, z12.d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "st1w { z0.s }, p4, [x9]\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z8.s }, p4, [x24]\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "st1w { z15.s }, p4, [x23]\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z23.s }, p4, [x21]\n"
+ "st1w { z28.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z29.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z30.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
+ "66:" // Height 6: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 57b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 68f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "67:" // Update direct input
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "68:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
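Context for the interleaved accumulator handling in the kernel above: SMMLA treats each 128-bit vector segment as a 2x2 int32 tile, accumulating the product of a 2x8 int8 tile from one operand with the transpose of a 2x8 int8 tile from the other. That is why the kernel pairs input rows with trn1/trn2, widens previously stored results into tile form with zip1/zip2 on load, and de-interleaves the tiles back into rows with uzp1/uzp2 before storing. The following standalone C++ sketch models a single SMMLA segment; every name in it is illustrative and none of it is library code.

    #include <stdint.h>
    #include <stdio.h>

    // One 128-bit SMMLA segment: acc is a 2x2 int32 tile (row-major), a holds
    // two rows of 8 int8 values, and b holds two rows of 8 int8 values that
    // the instruction uses transposed, so acc[i][j] += dot(a[i], b[j]).
    static void smmla_segment(int32_t acc[2][2],
                              const int8_t a[2][8],
                              const int8_t b[2][8])
    {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                int32_t sum = 0;
                for (int k = 0; k < 8; k++) {
                    sum += (int32_t)a[i][k] * (int32_t)b[j][k];
                }
                acc[i][j] += sum;
            }
        }
    }

    int main()
    {
        int8_t a[2][8], b[2][8];
        int32_t acc[2][2] = {}; // zeroed, like the "no accumulate" path
        for (int k = 0; k < 8; k++) {
            a[0][k] = (int8_t)(k + 1); // row 0: 1..8
            a[1][k] = (int8_t)(-k);    // row 1: 0..-7
            b[0][k] = 1;
            b[1][k] = 2;
        }
        smmla_segment(acc, a, b);
        printf("%d %d / %d %d\n", acc[0][0], acc[0][1], acc[1][0], acc[1][1]);
        return 0; // prints 36 72 / -28 -56
    }

For odd heights the partner row of the last tile is a dummy lane, which is why the Height 5 writeback above stores only the uzp1 halves of the z24-z27 accumulators.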
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
index cfb8adfc87..11fe5ce7e3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,21 +10,22 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -42,7 +43,8 @@ void sve_hybrid_u8qa_dot_4x4VL( ARGLIST );
class cls_sve_hybrid_u8qa_dot_4x4VL
{
public:
- typedef uint8_t operand_type;
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
typedef uint8_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,7 +70,21 @@ public:
return false;
}
- StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 4, 4> transforms = {};
+    template<typename T>
+    static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+    {
+        if (std::is_same<T, uint8_t>::value) {
+            switch (ci->get_cpu_model()) {
+                default:
+                    return { 29.89 };
+                case CPUModel::A510:
+                    return { 17.12 };
+            }
+        }
+
+        return { 1.0 };
+    }
// Default to the generic kernel
kern_type kernel=sve_hybrid_u8qa_dot_4x4VL;
@@ -80,4 +96,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
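The get_performance_parameters hook added above gives the GEMM method-selection code a rough per-CPU throughput figure for this kernel. A minimal standalone model of that dispatch is sketched below; only the dispatch shape and the two figures come from the code above, while the enum values, struct definition and field name are assumptions, not the library's real headers.

    #include <stdint.h>
    #include <type_traits>

    // Assumed stand-ins for the library's CPUModel / PerformanceParameters.
    enum class CPUModel { GENERIC, A510 };

    struct PerformanceParameters {
        float kernel_macs_cycle; // rough throughput estimate used for ranking
    };

    template <typename T>
    PerformanceParameters get_performance_parameters(CPUModel model)
    {
        if (std::is_same<T, uint8_t>::value) {
            switch (model) {
                case CPUModel::A510:
                    return { 17.12f }; // in-order little core: lower estimate
                default:
                    return { 29.89f }; // big and unrecognised cores
            }
        }
        return { 1.0f }; // unexpected operand type: neutral fallback
    }

The figures are presumably estimated MACs per cycle, letting the selection heuristics prefer a different kernel shape on cores where dot-product throughput is lower.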
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
index 373d82930b..e74b424888 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,18 +10,18 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -85,230 +85,227 @@ void sve_hybrid_u8qa_dot_4x4VL (
"cmp %x[M], #0x2\n"
"bgt 29f\n"
"beq 15f\n"
+ "mov x10, %x[col_bias]\n"
"mov z11.s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"mov z15.b, #0x1\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x27, %x[col_bias]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "mov x26, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"3:" // Height 1: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "cbnz x25, 6f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "cbnz x26, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
"b 6f\n"
"5:" // Height 1: setup direct input
- "mov x23, %x[input_ptr]\n"
+ "mov x24, %x[input_ptr]\n"
"6:" // Height 1: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"ble 9f\n"
"7:" // Height 1: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x23, #0x10\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "udot z16.s, z20.b, z0.b[0]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z17.s, z21.b, z0.b[0]\n"
+ "udot z18.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z19.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "udot z16.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "udot z17.s, z21.b, z0.b[1]\n"
+ "udot z18.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "udot z19.s, z4.b, z0.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "udot z17.s, z6.b, z0.b[2]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "udot z19.s, z8.b, z0.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
+ "udot z19.s, z20.b, z0.b[1]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "udot z16.s, z22.b, z0.b[2]\n"
+ "udot z17.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "udot z18.s, z21.b, z0.b[2]\n"
+ "udot z19.s, z20.b, z0.b[2]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "udot z16.s, z22.b, z0.b[3]\n"
+ "udot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "udot z18.s, z21.b, z0.b[3]\n"
+ "udot z19.s, z20.b, z0.b[3]\n"
+ "add x24, x24, #0x10\n"
"tbnz %x[flags], #31, 8f\n"
"udot z11.s, z0.b, z15.b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
- "cmp x24, #0x10\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
"bgt 7b\n"
"9:" // Height 1: Multiply loop: Single iteration only
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x23, #0x10\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1b { z22.b }, p2/Z, [x28]\n"
+ "subs x25, x25, #0x4\n"
+ "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z16.s, z22.b, z0.b[0]\n"
+ "udot z17.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z18.s, z21.b, z0.b[0]\n"
+ "udot z19.s, z20.b, z0.b[0]\n"
"addvl x28, x28, #4\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
"ble 10f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "udot z16.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z22.b, z0.b[1]\n"
+ "udot z18.s, z21.b, z0.b[1]\n"
+ "udot z19.s, z20.b, z0.b[1]\n"
"addvl x28, x28, #4\n"
- "udot z19.s, z4.b, z0.b[1]\n"
"ble 10f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "udot z17.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
+ "ld1b { z20.b }, p2/Z, [x28]\n"
+ "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "udot z16.s, z20.b, z0.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z22.b, z0.b[2]\n"
+ "udot z18.s, z21.b, z0.b[2]\n"
+ "udot z19.s, z20.b, z0.b[2]\n"
"addvl x28, x28, #4\n"
- "udot z19.s, z8.b, z0.b[2]\n"
"ble 10f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x28]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z21.b, z0.b[3]\n"
+ "udot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z18.s, z21.b, z0.b[3]\n"
+ "udot z19.s, z20.b, z0.b[3]\n"
"addvl x28, x28, #4\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
"10:" // Height 1: Multiply loop: multiply skip
"tbnz %x[flags], #31, 11f\n"
"udot z11.s, z0.b, z15.b\n"
"11:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 4b\n"
"tbnz %x[flags], #31, 12f\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1rw { z1.s }, p2/Z, [x19]\n"
- "neg z1.s, p2/M, z1.s\n"
- "mov x19, #0x4\n"
- "whilelt p0.s, XZR, x19\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
"uaddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
- "mul z11.s, p2/M, z11.s, z1.s\n"
+ "neg z20.s, p2/M, z20.s\n"
+ "mul z11.s, p2/M, z11.s, z20.s\n"
"12:" // Height 1: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x27]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z23.s }, p2/Z, [x10]\n"
+ "ld1w { z22.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "add z16.s, z16.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z17.s, z17.s, z1.s\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z16.s, z16.s, z23.s\n"
+ "add z17.s, z17.s, z22.s\n"
+ "add z18.s, z18.s, z21.s\n"
+ "add z19.s, z19.s, z20.s\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n"
+ ".inst 0x04b47631 // sqrdmulh z17.s, z17.s, z20.s\n"
+ "addvl x10, x10, #4\n"
+ ".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n"
+ ".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n"
"tbz %x[flags], #5, 13f\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z19.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
+ "and z23.d, z16.d, z0.d\n"
+ "and z22.d, z17.d, z0.d\n"
+ "and z21.d, z18.d, z0.d\n"
+ "and z20.d, z19.d, z0.d\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z23.s\n"
+ "sqadd z17.s, z17.s, z22.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z19.s, z19.s, z20.s\n"
"13:" // Height 1: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z16.s, z16.s, z20.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- "add x19, %x[qp], %[minval]\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "add z17.s, z17.s, z20.s\n"
+ "add z18.s, z18.s, z20.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
- "add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
- "add z19.s, z19.s, z4.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z20.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z21.s\n"
+ "smin z17.s, p2/M, z17.s, z21.s\n"
+ "smin z18.s, p2/M, z18.s, z21.s\n"
+ "smin z19.s, p2/M, z19.s, z21.s\n"
+ "smax z16.s, p2/M, z16.s, z20.s\n"
+ "smax z17.s, p2/M, z17.s, z20.s\n"
+ "smax z18.s, p2/M, z18.s, z20.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z20.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x26]\n"
- "addvl x26, x26, #1\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
"14:" // Height 1: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
"bgt 2b\n"
"b 58f\n"
"15:" // Height 2
+ "mov x10, %x[col_bias]\n"
"mov z11.s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x27, %x[col_bias]\n"
"mov z12.s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov z15.b, #0x1\n"
- "mov x26, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"16:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
@@ -316,308 +313,302 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"17:" // Height 2: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 19f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "cbnz x25, 20f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "cbnz x26, 20f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
"b 20f\n"
"19:" // Height 2: setup direct input
- "mov x23, %x[input_ptr]\n"
- "add x22, x23, x19\n"
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
"20:" // Height 2: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"ble 23f\n"
"21:" // Height 2: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x22, #0x10\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "udot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z24.b, z0.b[0]\n"
+ "udot z20.s, z24.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z26.b, z0.b[0]\n"
+ "udot z21.s, z26.b, z1.b[0]\n"
+ "udot z18.s, z24.b, z0.b[0]\n"
+ "udot z22.s, z24.b, z1.b[0]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "udot z19.s, z25.b, z0.b[0]\n"
+ "udot z23.s, z25.b, z1.b[0]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
"addvl x28, x28, #16\n"
- "udot z23.s, z7.b, z1.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "udot z21.s, z9.b, z1.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "udot z22.s, z10.b, z1.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "udot z19.s, z4.b, z0.b[1]\n"
- "udot z23.s, z4.b, z1.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "udot z20.s, z5.b, z1.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "udot z17.s, z6.b, z0.b[2]\n"
- "udot z21.s, z6.b, z1.b[2]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "udot z22.s, z7.b, z1.b[2]\n"
- "udot z19.s, z8.b, z0.b[2]\n"
- "udot z23.s, z8.b, z1.b[2]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "udot z20.s, z9.b, z1.b[3]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "udot z21.s, z10.b, z1.b[3]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z22.s, z4.b, z1.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
- "udot z23.s, z5.b, z1.b[3]\n"
+ "udot z16.s, z24.b, z0.b[1]\n"
+ "udot z20.s, z24.b, z1.b[1]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "udot z17.s, z27.b, z0.b[1]\n"
+ "udot z21.s, z27.b, z1.b[1]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "udot z18.s, z26.b, z0.b[1]\n"
+ "udot z22.s, z26.b, z1.b[1]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "udot z19.s, z25.b, z0.b[1]\n"
+ "udot z23.s, z25.b, z1.b[1]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "udot z16.s, z24.b, z0.b[2]\n"
+ "udot z20.s, z24.b, z1.b[2]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ "udot z17.s, z30.b, z0.b[2]\n"
+ "udot z21.s, z30.b, z1.b[2]\n"
+ "udot z18.s, z29.b, z0.b[2]\n"
+ "udot z22.s, z29.b, z1.b[2]\n"
+ "udot z19.s, z28.b, z0.b[2]\n"
+ "udot z23.s, z28.b, z1.b[2]\n"
+ "udot z16.s, z27.b, z0.b[3]\n"
+ "udot z20.s, z27.b, z1.b[3]\n"
+ "udot z17.s, z26.b, z0.b[3]\n"
+ "udot z21.s, z26.b, z1.b[3]\n"
+ "udot z18.s, z25.b, z0.b[3]\n"
+ "udot z22.s, z25.b, z1.b[3]\n"
+ "udot z19.s, z24.b, z0.b[3]\n"
+ "udot z23.s, z24.b, z1.b[3]\n"
"tbnz %x[flags], #31, 22f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
"22:" // Height 2: Multiply loop: unique 3: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x24, #0x10\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
"bgt 21b\n"
"23:" // Height 2: Multiply loop: Single iteration only
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x22, #0x10\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x25, x25, #0x4\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z24.b, z0.b[0]\n"
+ "udot z20.s, z24.b, z1.b[0]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z26.b, z0.b[0]\n"
+ "udot z21.s, z26.b, z1.b[0]\n"
+ "udot z18.s, z25.b, z0.b[0]\n"
+ "udot z22.s, z25.b, z1.b[0]\n"
"addvl x28, x28, #4\n"
- "udot z21.s, z5.b, z1.b[0]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "udot z22.s, z6.b, z1.b[0]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "udot z23.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z24.b, z0.b[0]\n"
+ "udot z23.s, z24.b, z1.b[0]\n"
"ble 24f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "udot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[1]\n"
+ "ld1b { z27.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "udot z16.s, z27.b, z0.b[1]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z20.s, z27.b, z1.b[1]\n"
+ "udot z17.s, z26.b, z0.b[1]\n"
+ "udot z21.s, z26.b, z1.b[1]\n"
+ "udot z18.s, z25.b, z0.b[1]\n"
"addvl x28, x28, #4\n"
- "udot z21.s, z9.b, z1.b[1]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "udot z22.s, z10.b, z1.b[1]\n"
- "udot z19.s, z4.b, z0.b[1]\n"
- "udot z23.s, z4.b, z1.b[1]\n"
+ "udot z22.s, z25.b, z1.b[1]\n"
+ "udot z19.s, z24.b, z0.b[1]\n"
+ "udot z23.s, z24.b, z1.b[1]\n"
"ble 24f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "udot z20.s, z5.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z17.s, z6.b, z0.b[2]\n"
+ "ld1b { z27.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "udot z16.s, z27.b, z0.b[2]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z20.s, z27.b, z1.b[2]\n"
+ "udot z17.s, z26.b, z0.b[2]\n"
+ "udot z21.s, z26.b, z1.b[2]\n"
+ "udot z18.s, z25.b, z0.b[2]\n"
"addvl x28, x28, #4\n"
- "udot z21.s, z6.b, z1.b[2]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "udot z22.s, z7.b, z1.b[2]\n"
- "udot z19.s, z8.b, z0.b[2]\n"
- "udot z23.s, z8.b, z1.b[2]\n"
+ "udot z22.s, z25.b, z1.b[2]\n"
+ "udot z19.s, z24.b, z0.b[2]\n"
+ "udot z23.s, z24.b, z1.b[2]\n"
"ble 24f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "udot z20.s, z9.b, z1.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z24.b, z0.b[3]\n"
+ "udot z20.s, z24.b, z1.b[3]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z26.b, z0.b[3]\n"
+ "udot z21.s, z26.b, z1.b[3]\n"
+ "udot z18.s, z25.b, z0.b[3]\n"
+ "udot z22.s, z25.b, z1.b[3]\n"
"addvl x28, x28, #4\n"
- "udot z21.s, z10.b, z1.b[3]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z22.s, z4.b, z1.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
- "udot z23.s, z5.b, z1.b[3]\n"
+ "udot z19.s, z24.b, z0.b[3]\n"
+ "udot z23.s, z24.b, z1.b[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
"tbnz %x[flags], #31, 25f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
"25:" // Height 2: Multiply loop: unique 4: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 18b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x26, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
"tbnz %x[flags], #31, 26f\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1rw { z2.s }, p2/Z, [x19]\n"
- "neg z2.s, p2/M, z2.s\n"
- "mov x19, #0x4\n"
- "whilelt p0.s, XZR, x19\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
"uaddv d11, p0, z11.s\n"
- "uaddv d12, p0, z12.s\n"
"mov z11.s, z11.s[0]\n"
+ "uaddv d12, p0, z12.s\n"
+ "neg z24.s, p2/M, z24.s\n"
"mov z12.s, z12.s[0]\n"
- "mul z11.s, p2/M, z11.s, z2.s\n"
- "mul z12.s, p2/M, z12.s, z2.s\n"
+ "mul z11.s, p2/M, z11.s, z24.s\n"
+ "mul z12.s, p2/M, z12.s, z24.s\n"
"26:" // Height 2: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x27]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z28.s }, p2/Z, [x10]\n"
+ "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
+ "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z20.s, z20.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
"add z21.s, z21.s, z12.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add z22.s, z22.s, z12.s\n"
"add z23.s, z23.s, z12.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z16.s, z16.s, z28.s\n"
+ "add z17.s, z17.s, z27.s\n"
+ "addvl x10, x10, #4\n"
+ "add z18.s, z18.s, z26.s\n"
+ "add z19.s, z19.s, z25.s\n"
+ "add z20.s, z20.s, z28.s\n"
+ "add z21.s, z21.s, z27.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z22.s, z22.s, z26.s\n"
+ "add z23.s, z23.s, z25.s\n"
+ ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
+ ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n"
+ ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n"
+ ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n"
+ ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n"
+ ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n"
+ ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n"
+ ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n"
"tbz %x[flags], #5, 27f\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z19.d, z0.d\n"
- "and z8.d, z20.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "and z9.d, z21.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z10.d, z22.d, z0.d\n"
- "asr z8.s, z8.s, #0x1f\n"
- "and z4.d, z23.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "sqadd z20.s, z20.s, z8.s\n"
- "sqadd z21.s, z21.s, z9.s\n"
- "sqadd z22.s, z22.s, z10.s\n"
- "sqadd z23.s, z23.s, z4.s\n"
+ "and z24.d, z16.d, z0.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z24.s\n"
+ "and z30.d, z17.d, z0.d\n"
+ "and z29.d, z18.d, z0.d\n"
+ "and z28.d, z19.d, z0.d\n"
+ "and z27.d, z20.d, z0.d\n"
+ "and z26.d, z21.d, z0.d\n"
+ "and z25.d, z22.d, z0.d\n"
+ "and z24.d, z23.d, z0.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z30.s\n"
+ "sqadd z18.s, z18.s, z29.s\n"
+ "sqadd z19.s, z19.s, z28.s\n"
+ "sqadd z20.s, z20.s, z27.s\n"
+ "sqadd z21.s, z21.s, z26.s\n"
+ "sqadd z22.s, z22.s, z25.s\n"
+ "sqadd z23.s, z23.s, z24.s\n"
"27:" // Height 2: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z16.s, z16.s, z24.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- "add x19, %x[qp], %[minval]\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "add z17.s, z17.s, z24.s\n"
+ "add z18.s, z18.s, z24.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
+ "add z19.s, z19.s, z24.s\n"
+ "add z20.s, z20.s, z24.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x26]\n"
- "add z21.s, z21.s, z4.s\n"
- "addvl x26, x26, #1\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ "add z21.s, z21.s, z24.s\n"
+ "add z22.s, z22.s, z24.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z23.s, z23.s, z4.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
+ "add z23.s, z23.s, z24.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z25.s\n"
+ "smin z17.s, p2/M, z17.s, z25.s\n"
+ "smin z18.s, p2/M, z18.s, z25.s\n"
+ "smin z19.s, p2/M, z19.s, z25.s\n"
+ "smin z20.s, p2/M, z20.s, z25.s\n"
+ "smin z21.s, p2/M, z21.s, z25.s\n"
+ "smin z22.s, p2/M, z22.s, z25.s\n"
+ "smin z23.s, p2/M, z23.s, z25.s\n"
+ "smax z16.s, p2/M, z16.s, z24.s\n"
+ "smax z17.s, p2/M, z17.s, z24.s\n"
+ "smax z18.s, p2/M, z18.s, z24.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z24.s\n"
+ "smax z20.s, p2/M, z20.s, z24.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z21.s, p2/M, z21.s, z24.s\n"
+ "smax z22.s, p2/M, z22.s, z24.s\n"
"uzp1 z20.h, z20.h, z21.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x22]\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "smax z23.s, p2/M, z23.s, z24.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "st1b { z20.b }, p1, [x23]\n"
+ "addvl x27, x27, #1\n"
"28:" // Height 2: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
"bgt 16b\n"
"b 58f\n"
"29:" // Height 3
+ "mov x10, %x[col_bias]\n"
"mov z11.s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x27, %x[col_bias]\n"
"mov z12.s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov z13.s, #0x0\n"
- "mov x26, %x[output_ptr]\n"
"mov z15.b, #0x1\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
"30:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
@@ -629,403 +620,394 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
"31:" // Height 3: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 33f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "ldr x21, [x20, #0x10]\n"
- "cbnz x25, 34f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x26, 34f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
"b 34f\n"
"33:" // Height 3: setup direct input
- "mov x23, %x[input_ptr]\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"34:" // Height 3: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"ble 37f\n"
"35:" // Height 3: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "ld1b { z28.b }, p2/Z, [x28]\n"
+ "udot z16.s, z28.b, z0.b[0]\n"
+ "udot z20.s, z28.b, z1.b[0]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z24.s, z28.b, z2.b[0]\n"
+ "udot z17.s, z30.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z21.s, z30.b, z1.b[0]\n"
+ "udot z25.s, z30.b, z2.b[0]\n"
+ "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "udot z18.s, z29.b, z0.b[0]\n"
+ "udot z22.s, z29.b, z1.b[0]\n"
+ "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "udot z26.s, z29.b, z2.b[0]\n"
+ "udot z19.s, z28.b, z0.b[0]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ "udot z23.s, z28.b, z1.b[0]\n"
+ "udot z27.s, z28.b, z2.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "udot z16.s, z3.b, z0.b[1]\n"
+ "udot z20.s, z3.b, z1.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n"
"add x23, x23, #0x10\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x21]\n"
+ "udot z24.s, z3.b, z2.b[1]\n"
+ "udot z17.s, z31.b, z0.b[1]\n"
+ "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n"
"add x22, x22, #0x10\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x21, x21, #0x10\n"
- "udot z24.s, z4.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "udot z25.s, z5.b, z2.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "udot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
- "addvl x28, x28, #16\n"
- "udot z26.s, z6.b, z2.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "udot z23.s, z7.b, z1.b[0]\n"
- "udot z27.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "udot z20.s, z8.b, z1.b[1]\n"
- "udot z24.s, z8.b, z2.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "udot z21.s, z9.b, z1.b[1]\n"
- "udot z25.s, z9.b, z2.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "udot z22.s, z10.b, z1.b[1]\n"
- "udot z26.s, z10.b, z2.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "udot z19.s, z4.b, z0.b[1]\n"
- "udot z23.s, z4.b, z1.b[1]\n"
- "udot z27.s, z4.b, z2.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "udot z20.s, z5.b, z1.b[2]\n"
- "udot z24.s, z5.b, z2.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "udot z17.s, z6.b, z0.b[2]\n"
- "udot z21.s, z6.b, z1.b[2]\n"
- "udot z25.s, z6.b, z2.b[2]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "udot z22.s, z7.b, z1.b[2]\n"
- "udot z26.s, z7.b, z2.b[2]\n"
- "udot z19.s, z8.b, z0.b[2]\n"
- "udot z23.s, z8.b, z1.b[2]\n"
- "udot z27.s, z8.b, z2.b[2]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "udot z20.s, z9.b, z1.b[3]\n"
- "udot z24.s, z9.b, z2.b[3]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "udot z21.s, z10.b, z1.b[3]\n"
- "udot z25.s, z10.b, z2.b[3]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z22.s, z4.b, z1.b[3]\n"
- "udot z26.s, z4.b, z2.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
- "udot z23.s, z5.b, z1.b[3]\n"
- "udot z27.s, z5.b, z2.b[3]\n"
+ "udot z21.s, z31.b, z1.b[1]\n"
+ "udot z25.s, z31.b, z2.b[1]\n"
+ "ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "udot z18.s, z30.b, z0.b[1]\n"
+ "udot z22.s, z30.b, z1.b[1]\n"
+ "udot z26.s, z30.b, z2.b[1]\n"
+ "udot z19.s, z29.b, z0.b[1]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "udot z23.s, z29.b, z1.b[1]\n"
+ "udot z27.s, z29.b, z2.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "udot z16.s, z28.b, z0.b[2]\n"
+ "udot z20.s, z28.b, z1.b[2]\n"
+ "udot z24.s, z28.b, z2.b[2]\n"
+ "udot z17.s, z5.b, z0.b[2]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "udot z21.s, z5.b, z1.b[2]\n"
+ "udot z25.s, z5.b, z2.b[2]\n"
+ "udot z18.s, z4.b, z0.b[2]\n"
+ "udot z22.s, z4.b, z1.b[2]\n"
+ "udot z26.s, z4.b, z2.b[2]\n"
+ "udot z19.s, z3.b, z0.b[2]\n"
+ "udot z23.s, z3.b, z1.b[2]\n"
+ "udot z27.s, z3.b, z2.b[2]\n"
+ "udot z16.s, z31.b, z0.b[3]\n"
+ "udot z20.s, z31.b, z1.b[3]\n"
+ "udot z24.s, z31.b, z2.b[3]\n"
+ "udot z17.s, z30.b, z0.b[3]\n"
+ "udot z21.s, z30.b, z1.b[3]\n"
+ "udot z25.s, z30.b, z2.b[3]\n"
+ "udot z18.s, z29.b, z0.b[3]\n"
+ "udot z22.s, z29.b, z1.b[3]\n"
+ "udot z26.s, z29.b, z2.b[3]\n"
+ "udot z19.s, z28.b, z0.b[3]\n"
+ "udot z23.s, z28.b, z1.b[3]\n"
+ "udot z27.s, z28.b, z2.b[3]\n"
"tbnz %x[flags], #31, 36f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
"udot z13.s, z2.b, z15.b\n"
"36:" // Height 3: Multiply loop: unique 5: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x24, #0x10\n"
- "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
"bgt 35b\n"
"37:" // Height 3: Multiply loop: Single iteration only
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x21, x21, #0x10\n"
- "udot z24.s, z4.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x25, x25, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "ld1b { z28.b }, p2/Z, [x28]\n"
+ "udot z16.s, z28.b, z0.b[0]\n"
+ "udot z20.s, z28.b, z1.b[0]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z24.s, z28.b, z2.b[0]\n"
+ "udot z17.s, z30.b, z0.b[0]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z21.s, z30.b, z1.b[0]\n"
+ "udot z25.s, z30.b, z2.b[0]\n"
"addvl x28, x28, #4\n"
- "udot z21.s, z5.b, z1.b[0]\n"
- "udot z25.s, z5.b, z2.b[0]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "udot z22.s, z6.b, z1.b[0]\n"
- "udot z26.s, z6.b, z2.b[0]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "udot z23.s, z7.b, z1.b[0]\n"
- "udot z27.s, z7.b, z2.b[0]\n"
+ "udot z18.s, z29.b, z0.b[0]\n"
+ "udot z22.s, z29.b, z1.b[0]\n"
+ "udot z26.s, z29.b, z2.b[0]\n"
+ "udot z19.s, z28.b, z0.b[0]\n"
+ "udot z23.s, z28.b, z1.b[0]\n"
+ "udot z27.s, z28.b, z2.b[0]\n"
"ble 38f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "udot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "udot z24.s, z8.b, z2.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z31.b }, p2/Z, [x28]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "udot z16.s, z31.b, z0.b[1]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z20.s, z31.b, z1.b[1]\n"
+ "udot z24.s, z31.b, z2.b[1]\n"
+ "udot z17.s, z30.b, z0.b[1]\n"
+ "udot z21.s, z30.b, z1.b[1]\n"
"addvl x28, x28, #4\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "udot z21.s, z9.b, z1.b[1]\n"
- "udot z25.s, z9.b, z2.b[1]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "udot z22.s, z10.b, z1.b[1]\n"
- "udot z26.s, z10.b, z2.b[1]\n"
- "udot z19.s, z4.b, z0.b[1]\n"
- "udot z23.s, z4.b, z1.b[1]\n"
- "udot z27.s, z4.b, z2.b[1]\n"
+ "udot z25.s, z30.b, z2.b[1]\n"
+ "udot z18.s, z29.b, z0.b[1]\n"
+ "udot z22.s, z29.b, z1.b[1]\n"
+ "udot z26.s, z29.b, z2.b[1]\n"
+ "udot z19.s, z28.b, z0.b[1]\n"
+ "udot z23.s, z28.b, z1.b[1]\n"
+ "udot z27.s, z28.b, z2.b[1]\n"
"ble 38f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "udot z20.s, z5.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "udot z24.s, z5.b, z2.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z31.b }, p2/Z, [x28]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "udot z16.s, z31.b, z0.b[2]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z20.s, z31.b, z1.b[2]\n"
+ "udot z24.s, z31.b, z2.b[2]\n"
+ "udot z17.s, z30.b, z0.b[2]\n"
+ "udot z21.s, z30.b, z1.b[2]\n"
"addvl x28, x28, #4\n"
- "udot z17.s, z6.b, z0.b[2]\n"
- "udot z21.s, z6.b, z1.b[2]\n"
- "udot z25.s, z6.b, z2.b[2]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "udot z22.s, z7.b, z1.b[2]\n"
- "udot z26.s, z7.b, z2.b[2]\n"
- "udot z19.s, z8.b, z0.b[2]\n"
- "udot z23.s, z8.b, z1.b[2]\n"
- "udot z27.s, z8.b, z2.b[2]\n"
+ "udot z25.s, z30.b, z2.b[2]\n"
+ "udot z18.s, z29.b, z0.b[2]\n"
+ "udot z22.s, z29.b, z1.b[2]\n"
+ "udot z26.s, z29.b, z2.b[2]\n"
+ "udot z19.s, z28.b, z0.b[2]\n"
+ "udot z23.s, z28.b, z1.b[2]\n"
+ "udot z27.s, z28.b, z2.b[2]\n"
"ble 38f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "udot z20.s, z9.b, z1.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "udot z24.s, z9.b, z2.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z31.b }, p2/Z, [x28]\n"
+ "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z31.b, z0.b[3]\n"
+ "udot z20.s, z31.b, z1.b[3]\n"
+ "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z24.s, z31.b, z2.b[3]\n"
+ "udot z17.s, z30.b, z0.b[3]\n"
+ "udot z21.s, z30.b, z1.b[3]\n"
+ "udot z25.s, z30.b, z2.b[3]\n"
"addvl x28, x28, #4\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "udot z21.s, z10.b, z1.b[3]\n"
- "udot z25.s, z10.b, z2.b[3]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z22.s, z4.b, z1.b[3]\n"
- "udot z26.s, z4.b, z2.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
- "udot z23.s, z5.b, z1.b[3]\n"
- "udot z27.s, z5.b, z2.b[3]\n"
+ "udot z18.s, z29.b, z0.b[3]\n"
+ "udot z22.s, z29.b, z1.b[3]\n"
+ "udot z26.s, z29.b, z2.b[3]\n"
+ "udot z19.s, z28.b, z0.b[3]\n"
+ "udot z23.s, z28.b, z1.b[3]\n"
+ "udot z27.s, z28.b, z2.b[3]\n"
"38:" // Height 3: Multiply loop: multiply skip
"tbnz %x[flags], #31, 39f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
"udot z13.s, z2.b, z15.b\n"
"39:" // Height 3: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 32b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x26, x19\n"
- "add x21, x22, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
"tbnz %x[flags], #31, 40f\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1rw { z3.s }, p2/Z, [x19]\n"
- "neg z3.s, p2/M, z3.s\n"
- "mov x19, #0x4\n"
- "whilelt p0.s, XZR, x19\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
"uaddv d11, p0, z11.s\n"
+ "mov z11.s, z11.s[0]\n"
"uaddv d12, p0, z12.s\n"
"uaddv d13, p0, z13.s\n"
- "mov z11.s, z11.s[0]\n"
"mov z12.s, z12.s[0]\n"
"mov z13.s, z13.s[0]\n"
- "mul z11.s, p2/M, z11.s, z3.s\n"
- "mul z12.s, p2/M, z12.s, z3.s\n"
- "mul z13.s, p2/M, z13.s, z3.s\n"
+ "neg z28.s, p2/M, z28.s\n"
+ "mul z11.s, p2/M, z11.s, z28.s\n"
+ "mul z12.s, p2/M, z12.s, z28.s\n"
+ "mul z13.s, p2/M, z13.s, z28.s\n"
"40:" // Height 3: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x27]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "ld1w { z31.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
+ "ld1w { z30.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z20.s, z20.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
"add z21.s, z21.s, z12.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add z22.s, z22.s, z12.s\n"
"add z23.s, z23.s, z12.s\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add z24.s, z24.s, z13.s\n"
"add z25.s, z25.s, z13.s\n"
+ "addvl x10, x10, #4\n"
"add z26.s, z26.s, z13.s\n"
"add z27.s, z27.s, z13.s\n"
"add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
+ "add z17.s, z17.s, z31.s\n"
+ "add z18.s, z18.s, z30.s\n"
+ "add z19.s, z19.s, z29.s\n"
"add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z22.s, z22.s, z30.s\n"
+ "add z23.s, z23.s, z29.s\n"
"add z24.s, z24.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z25.s, z25.s, z1.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
- ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
- ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
- ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "add z25.s, z25.s, z31.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z26.s, z26.s, z30.s\n"
+ "add z27.s, z27.s, z29.s\n"
+ ".inst 0x04bc7610 // sqrdmulh z16.s, z16.s, z28.s\n"
+ ".inst 0x04bc7631 // sqrdmulh z17.s, z17.s, z28.s\n"
+ ".inst 0x04bc7652 // sqrdmulh z18.s, z18.s, z28.s\n"
+ ".inst 0x04bc7673 // sqrdmulh z19.s, z19.s, z28.s\n"
+ ".inst 0x04bc7694 // sqrdmulh z20.s, z20.s, z28.s\n"
+ ".inst 0x04bc76b5 // sqrdmulh z21.s, z21.s, z28.s\n"
+ ".inst 0x04bc76d6 // sqrdmulh z22.s, z22.s, z28.s\n"
+ ".inst 0x04bc76f7 // sqrdmulh z23.s, z23.s, z28.s\n"
+ ".inst 0x04bc7718 // sqrdmulh z24.s, z24.s, z28.s\n"
+ ".inst 0x04bc7739 // sqrdmulh z25.s, z25.s, z28.s\n"
+ ".inst 0x04bc775a // sqrdmulh z26.s, z26.s, z28.s\n"
+ ".inst 0x04bc777b // sqrdmulh z27.s, z27.s, z28.s\n"
"tbz %x[flags], #5, 41f\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z19.d, z0.d\n"
- "and z8.d, z20.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "and z9.d, z21.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z10.d, z22.d, z0.d\n"
- "asr z8.s, z8.s, #0x1f\n"
- "and z4.d, z23.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z24.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "sqadd z20.s, z20.s, z8.s\n"
- "sqadd z21.s, z21.s, z9.s\n"
- "sqadd z22.s, z22.s, z10.s\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "and z6.d, z25.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "sqadd z24.s, z24.s, z5.s\n"
- "and z7.d, z26.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "and z8.d, z27.d, z0.d\n"
- "sqadd z25.s, z25.s, z6.s\n"
- "asr z8.s, z8.s, #0x1f\n"
- "sqadd z26.s, z26.s, z7.s\n"
- "sqadd z27.s, z27.s, z8.s\n"
+ "and z1.d, z16.d, z0.d\n"
+ "and z31.d, z17.d, z0.d\n"
+ "and z30.d, z18.d, z0.d\n"
+ "and z29.d, z19.d, z0.d\n"
+ "and z28.d, z20.d, z0.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z1.s\n"
+ "sqadd z17.s, z17.s, z31.s\n"
+ "sqadd z18.s, z18.s, z30.s\n"
+ "sqadd z19.s, z19.s, z29.s\n"
+ "sqadd z20.s, z20.s, z28.s\n"
+ "and z3.d, z21.d, z0.d\n"
+ "and z2.d, z22.d, z0.d\n"
+ "and z1.d, z23.d, z0.d\n"
+ "and z31.d, z24.d, z0.d\n"
+ "and z30.d, z25.d, z0.d\n"
+ "and z29.d, z26.d, z0.d\n"
+ "and z28.d, z27.d, z0.d\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z3.s\n"
+ "sqadd z22.s, z22.s, z2.s\n"
+ "sqadd z23.s, z23.s, z1.s\n"
+ "sqadd z24.s, z24.s, z31.s\n"
+ "sqadd z25.s, z25.s, z30.s\n"
+ "sqadd z26.s, z26.s, z29.s\n"
+ "sqadd z27.s, z27.s, z28.s\n"
"41:" // Height 3: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z16.s, z16.s, z28.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- "add x19, %x[qp], %[minval]\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "add z17.s, z17.s, z28.s\n"
+ "add z18.s, z18.s, z28.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
+ "add z19.s, z19.s, z28.s\n"
+ "add z20.s, z20.s, z28.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x26]\n"
- "add z21.s, z21.s, z4.s\n"
- "addvl x26, x26, #1\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ "add z21.s, z21.s, z28.s\n"
+ "add z22.s, z22.s, z28.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
+ "add z23.s, z23.s, z28.s\n"
+ "add z24.s, z24.s, z28.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z23.s, z23.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
- "add z25.s, z25.s, z4.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
+ "add z25.s, z25.s, z28.s\n"
+ "add z26.s, z26.s, z28.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x22]\n"
- "add z26.s, z26.s, z4.s\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "add z27.s, z27.s, z4.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z28.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z29.s\n"
+ "smin z17.s, p2/M, z17.s, z29.s\n"
+ "smin z18.s, p2/M, z18.s, z29.s\n"
+ "smin z19.s, p2/M, z19.s, z29.s\n"
+ "smin z20.s, p2/M, z20.s, z29.s\n"
+ "smin z21.s, p2/M, z21.s, z29.s\n"
+ "smin z22.s, p2/M, z22.s, z29.s\n"
+ "smin z23.s, p2/M, z23.s, z29.s\n"
+ "smin z24.s, p2/M, z24.s, z29.s\n"
+ "smin z25.s, p2/M, z25.s, z29.s\n"
+ "smin z26.s, p2/M, z26.s, z29.s\n"
+ "smin z27.s, p2/M, z27.s, z29.s\n"
+ "smax z16.s, p2/M, z16.s, z28.s\n"
+ "smax z17.s, p2/M, z17.s, z28.s\n"
+ "smax z18.s, p2/M, z18.s, z28.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z28.s\n"
+ "smax z20.s, p2/M, z20.s, z28.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z21.s, p2/M, z21.s, z28.s\n"
+ "smax z22.s, p2/M, z22.s, z28.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "smax z23.s, p2/M, z23.s, z28.s\n"
+ "smax z24.s, p2/M, z24.s, z28.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z28.s\n"
+ "smax z26.s, p2/M, z26.s, z28.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x21]\n"
+ "st1b { z20.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z28.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x22]\n"
+ "addvl x27, x27, #1\n"
"42:" // Height 3: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
"bgt 30b\n"
"b 58f\n"
"43:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x4\n"
+ "mov x10, %x[col_bias]\n"
"mov z11.s, #0x0\n"
- "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x27, %x[col_bias]\n"
"mov z12.s, #0x0\n"
- "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
"mov z13.s, #0x0\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov x26, %x[output_ptr]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"mov z14.s, #0x0\n"
- "mov x19, #0x4\n"
"mov z15.b, #0x1\n"
- "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"44:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
"mov z16.s, #0x0\n"
- "mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
@@ -1041,237 +1023,229 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov z30.s, #0x0\n"
"mov z31.s, #0x0\n"
"45:" // Height 4: setup done
- "mov x25, #0x0\n"
+ "mov x26, #0x0\n"
"46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w24, [x20, x25, LSL #0x2]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 47f\n"
- "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x23, [x20, #0x0]\n"
- "ldr x22, [x20, #0x8]\n"
- "ldr x21, [x20, #0x10]\n"
- "ldr x20, [x20, #0x18]\n"
- "cbnz x25, 48f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
+ "cbnz x26, 48f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
"b 48f\n"
"47:" // Height 4: setup direct input
- "mov x23, %x[input_ptr]\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"48:" // Height 4: input setup done
- "cmp x24, #0x10\n"
+ "cmp x25, #0x10\n"
"ble 51f\n"
"49:" // Height 4: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "ld1rqb { z3.b }, p0/Z, [x21]\n"
"add x23, x23, #0x10\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
- "udot z24.s, z4.b, z2.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "udot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "udot z25.s, z5.b, z2.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z5.b, z0.b[0]\n"
+ "udot z20.s, z5.b, z1.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z24.s, z5.b, z2.b[0]\n"
+ "udot z28.s, z5.b, z3.b[0]\n"
+ "udot z17.s, z4.b, z0.b[0]\n"
+ "udot z21.s, z4.b, z1.b[0]\n"
"ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
- "udot z28.s, z4.b, z3.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
- "udot z29.s, z5.b, z3.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "udot z25.s, z4.b, z2.b[0]\n"
+ "udot z29.s, z4.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
"addvl x28, x28, #16\n"
- "udot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
- "udot z26.s, z6.b, z2.b[0]\n"
- "udot z30.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "udot z23.s, z7.b, z1.b[0]\n"
- "udot z27.s, z7.b, z2.b[0]\n"
- "udot z31.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ "udot z19.s, z9.b, z0.b[0]\n"
+ "udot z23.s, z9.b, z1.b[0]\n"
+ "udot z27.s, z9.b, z2.b[0]\n"
+ "udot z31.s, z9.b, z3.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
"udot z16.s, z8.b, z0.b[1]\n"
"udot z20.s, z8.b, z1.b[1]\n"
"udot z24.s, z8.b, z2.b[1]\n"
"udot z28.s, z8.b, z3.b[1]\n"
"ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "udot z21.s, z9.b, z1.b[1]\n"
- "udot z25.s, z9.b, z2.b[1]\n"
- "udot z29.s, z9.b, z3.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "udot z22.s, z10.b, z1.b[1]\n"
- "udot z26.s, z10.b, z2.b[1]\n"
- "udot z30.s, z10.b, z3.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
- "udot z19.s, z4.b, z0.b[1]\n"
- "udot z23.s, z4.b, z1.b[1]\n"
- "udot z27.s, z4.b, z2.b[1]\n"
- "udot z31.s, z4.b, z3.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
- "udot z16.s, z5.b, z0.b[2]\n"
- "udot z20.s, z5.b, z1.b[2]\n"
- "udot z24.s, z5.b, z2.b[2]\n"
- "udot z28.s, z5.b, z3.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
- "udot z17.s, z6.b, z0.b[2]\n"
- "udot z21.s, z6.b, z1.b[2]\n"
- "udot z25.s, z6.b, z2.b[2]\n"
- "udot z29.s, z6.b, z3.b[2]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "udot z22.s, z7.b, z1.b[2]\n"
- "udot z26.s, z7.b, z2.b[2]\n"
- "udot z30.s, z7.b, z3.b[2]\n"
+ "udot z17.s, z7.b, z0.b[1]\n"
+ "udot z21.s, z7.b, z1.b[1]\n"
+ "udot z25.s, z7.b, z2.b[1]\n"
+ "udot z29.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "udot z18.s, z6.b, z0.b[1]\n"
+ "udot z22.s, z6.b, z1.b[1]\n"
+ "udot z26.s, z6.b, z2.b[1]\n"
+ "udot z30.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "udot z19.s, z5.b, z0.b[1]\n"
+ "udot z23.s, z5.b, z1.b[1]\n"
+ "udot z27.s, z5.b, z2.b[1]\n"
+ "udot z31.s, z5.b, z3.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "udot z16.s, z4.b, z0.b[2]\n"
+ "udot z20.s, z4.b, z1.b[2]\n"
+ "udot z24.s, z4.b, z2.b[2]\n"
+ "udot z28.s, z4.b, z3.b[2]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ "udot z17.s, z10.b, z0.b[2]\n"
+ "udot z21.s, z10.b, z1.b[2]\n"
+ "udot z25.s, z10.b, z2.b[2]\n"
+ "udot z29.s, z10.b, z3.b[2]\n"
+ "udot z18.s, z9.b, z0.b[2]\n"
+ "udot z22.s, z9.b, z1.b[2]\n"
+ "udot z26.s, z9.b, z2.b[2]\n"
+ "udot z30.s, z9.b, z3.b[2]\n"
"udot z19.s, z8.b, z0.b[2]\n"
"udot z23.s, z8.b, z1.b[2]\n"
"udot z27.s, z8.b, z2.b[2]\n"
"udot z31.s, z8.b, z3.b[2]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "udot z20.s, z9.b, z1.b[3]\n"
- "udot z24.s, z9.b, z2.b[3]\n"
- "udot z28.s, z9.b, z3.b[3]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "udot z21.s, z10.b, z1.b[3]\n"
- "udot z25.s, z10.b, z2.b[3]\n"
- "udot z29.s, z10.b, z3.b[3]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z22.s, z4.b, z1.b[3]\n"
- "udot z26.s, z4.b, z2.b[3]\n"
- "udot z30.s, z4.b, z3.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
- "udot z23.s, z5.b, z1.b[3]\n"
- "udot z27.s, z5.b, z2.b[3]\n"
- "udot z31.s, z5.b, z3.b[3]\n"
+ "udot z16.s, z7.b, z0.b[3]\n"
+ "udot z20.s, z7.b, z1.b[3]\n"
+ "udot z24.s, z7.b, z2.b[3]\n"
+ "udot z28.s, z7.b, z3.b[3]\n"
+ "udot z17.s, z6.b, z0.b[3]\n"
+ "udot z21.s, z6.b, z1.b[3]\n"
+ "udot z25.s, z6.b, z2.b[3]\n"
+ "udot z29.s, z6.b, z3.b[3]\n"
+ "udot z18.s, z5.b, z0.b[3]\n"
+ "udot z22.s, z5.b, z1.b[3]\n"
+ "udot z26.s, z5.b, z2.b[3]\n"
+ "udot z30.s, z5.b, z3.b[3]\n"
+ "udot z19.s, z4.b, z0.b[3]\n"
+ "udot z23.s, z4.b, z1.b[3]\n"
+ "udot z27.s, z4.b, z2.b[3]\n"
+ "udot z31.s, z4.b, z3.b[3]\n"
"tbnz %x[flags], #31, 50f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
"udot z13.s, z2.b, z15.b\n"
"udot z14.s, z3.b, z15.b\n"
"50:" // Height 4: Multiply loop: unique 7: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "cmp x24, #0x10\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
"bgt 49b\n"
"51:" // Height 4: Multiply loop: Single iteration only
- "ld1b { z4.b }, p2/Z, [x28]\n"
- "whilelt p0.b, XZR, x24\n"
- "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x23]\n"
- "udot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "udot z17.s, z5.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
- "udot z24.s, z4.b, z2.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "udot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x25, x25, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z7.b, z0.b[0]\n"
+ "udot z20.s, z7.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z24.s, z7.b, z2.b[0]\n"
+ "udot z28.s, z7.b, z3.b[0]\n"
+ "udot z17.s, z6.b, z0.b[0]\n"
+ "udot z21.s, z6.b, z1.b[0]\n"
"addvl x28, x28, #4\n"
- "udot z28.s, z4.b, z3.b[0]\n"
- "udot z25.s, z5.b, z2.b[0]\n"
- "udot z29.s, z5.b, z3.b[0]\n"
- "udot z18.s, z6.b, z0.b[0]\n"
- "udot z22.s, z6.b, z1.b[0]\n"
- "udot z26.s, z6.b, z2.b[0]\n"
- "udot z30.s, z6.b, z3.b[0]\n"
- "udot z19.s, z7.b, z0.b[0]\n"
- "udot z23.s, z7.b, z1.b[0]\n"
- "udot z27.s, z7.b, z2.b[0]\n"
- "udot z31.s, z7.b, z3.b[0]\n"
+ "udot z25.s, z6.b, z2.b[0]\n"
+ "udot z29.s, z6.b, z3.b[0]\n"
+ "udot z18.s, z5.b, z0.b[0]\n"
+ "udot z22.s, z5.b, z1.b[0]\n"
+ "udot z26.s, z5.b, z2.b[0]\n"
+ "udot z30.s, z5.b, z3.b[0]\n"
+ "udot z19.s, z4.b, z0.b[0]\n"
+ "udot z23.s, z4.b, z1.b[0]\n"
+ "udot z27.s, z4.b, z2.b[0]\n"
+ "udot z31.s, z4.b, z3.b[0]\n"
"ble 52f\n"
- "ld1b { z8.b }, p2/Z, [x28]\n"
- "udot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "udot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
- "udot z24.s, z8.b, z2.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x4\n"
+ "udot z16.s, z7.b, z0.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
"ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z20.s, z7.b, z1.b[1]\n"
+ "udot z24.s, z7.b, z2.b[1]\n"
+ "udot z28.s, z7.b, z3.b[1]\n"
+ "udot z17.s, z6.b, z0.b[1]\n"
"addvl x28, x28, #4\n"
- "udot z28.s, z8.b, z3.b[1]\n"
- "udot z17.s, z9.b, z0.b[1]\n"
- "udot z21.s, z9.b, z1.b[1]\n"
- "udot z25.s, z9.b, z2.b[1]\n"
- "udot z29.s, z9.b, z3.b[1]\n"
- "udot z18.s, z10.b, z0.b[1]\n"
- "udot z22.s, z10.b, z1.b[1]\n"
- "udot z26.s, z10.b, z2.b[1]\n"
- "udot z30.s, z10.b, z3.b[1]\n"
+ "udot z21.s, z6.b, z1.b[1]\n"
+ "udot z25.s, z6.b, z2.b[1]\n"
+ "udot z29.s, z6.b, z3.b[1]\n"
+ "udot z18.s, z5.b, z0.b[1]\n"
+ "udot z22.s, z5.b, z1.b[1]\n"
+ "udot z26.s, z5.b, z2.b[1]\n"
+ "udot z30.s, z5.b, z3.b[1]\n"
"udot z19.s, z4.b, z0.b[1]\n"
"udot z23.s, z4.b, z1.b[1]\n"
"udot z27.s, z4.b, z2.b[1]\n"
"udot z31.s, z4.b, z3.b[1]\n"
"ble 52f\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- "udot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
"ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "subs x24, x24, #0x4\n"
- "udot z20.s, z5.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "udot z24.s, z5.b, z2.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "udot z28.s, z5.b, z3.b[2]\n"
+ "subs x25, x25, #0x4\n"
+ "udot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z20.s, z7.b, z1.b[2]\n"
+ "udot z24.s, z7.b, z2.b[2]\n"
+ "udot z28.s, z7.b, z3.b[2]\n"
"udot z17.s, z6.b, z0.b[2]\n"
+ "addvl x28, x28, #4\n"
"udot z21.s, z6.b, z1.b[2]\n"
"udot z25.s, z6.b, z2.b[2]\n"
"udot z29.s, z6.b, z3.b[2]\n"
- "udot z18.s, z7.b, z0.b[2]\n"
- "udot z22.s, z7.b, z1.b[2]\n"
- "udot z26.s, z7.b, z2.b[2]\n"
- "udot z30.s, z7.b, z3.b[2]\n"
- "udot z19.s, z8.b, z0.b[2]\n"
- "udot z23.s, z8.b, z1.b[2]\n"
- "udot z27.s, z8.b, z2.b[2]\n"
- "udot z31.s, z8.b, z3.b[2]\n"
+ "udot z18.s, z5.b, z0.b[2]\n"
+ "udot z22.s, z5.b, z1.b[2]\n"
+ "udot z26.s, z5.b, z2.b[2]\n"
+ "udot z30.s, z5.b, z3.b[2]\n"
+ "udot z19.s, z4.b, z0.b[2]\n"
+ "udot z23.s, z4.b, z1.b[2]\n"
+ "udot z27.s, z4.b, z2.b[2]\n"
+ "udot z31.s, z4.b, z3.b[2]\n"
"ble 52f\n"
- "ld1b { z9.b }, p2/Z, [x28]\n"
- "udot z16.s, z9.b, z0.b[3]\n"
- "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
- "udot z20.s, z9.b, z1.b[3]\n"
- "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
- "udot z24.s, z9.b, z2.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z16.s, z7.b, z0.b[3]\n"
+ "udot z20.s, z7.b, z1.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z24.s, z7.b, z2.b[3]\n"
+ "udot z28.s, z7.b, z3.b[3]\n"
+ "udot z17.s, z6.b, z0.b[3]\n"
+ "udot z21.s, z6.b, z1.b[3]\n"
"addvl x28, x28, #4\n"
- "udot z28.s, z9.b, z3.b[3]\n"
- "udot z17.s, z10.b, z0.b[3]\n"
- "udot z21.s, z10.b, z1.b[3]\n"
- "udot z25.s, z10.b, z2.b[3]\n"
- "udot z29.s, z10.b, z3.b[3]\n"
- "udot z18.s, z4.b, z0.b[3]\n"
- "udot z22.s, z4.b, z1.b[3]\n"
- "udot z26.s, z4.b, z2.b[3]\n"
- "udot z30.s, z4.b, z3.b[3]\n"
- "udot z19.s, z5.b, z0.b[3]\n"
- "udot z23.s, z5.b, z1.b[3]\n"
- "udot z27.s, z5.b, z2.b[3]\n"
- "udot z31.s, z5.b, z3.b[3]\n"
+ "udot z25.s, z6.b, z2.b[3]\n"
+ "udot z29.s, z6.b, z3.b[3]\n"
+ "udot z18.s, z5.b, z0.b[3]\n"
+ "udot z22.s, z5.b, z1.b[3]\n"
+ "udot z26.s, z5.b, z2.b[3]\n"
+ "udot z30.s, z5.b, z3.b[3]\n"
+ "udot z19.s, z4.b, z0.b[3]\n"
+ "udot z23.s, z4.b, z1.b[3]\n"
+ "udot z27.s, z4.b, z2.b[3]\n"
+ "udot z31.s, z4.b, z3.b[3]\n"
"52:" // Height 4: Multiply loop: multiply skip
"tbnz %x[flags], #31, 53f\n"
"udot z11.s, z0.b, z15.b\n"
@@ -1279,254 +1253,249 @@ void sve_hybrid_u8qa_dot_4x4VL (
"udot z13.s, z2.b, z15.b\n"
"udot z14.s, z3.b, z15.b\n"
"53:" // Height 4: Multiply loop: unique 8: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x25, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
"bne 46b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "add x22, x26, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "add x21, x22, x20\n"
"tbnz %x[flags], #31, 54f\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
- "neg z4.s, p2/M, z4.s\n"
- "mov x19, #0x4\n"
- "whilelt p0.s, XZR, x19\n"
+ "mov x20, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
"uaddv d11, p0, z11.s\n"
+ "mov z11.s, z11.s[0]\n"
"uaddv d12, p0, z12.s\n"
"uaddv d13, p0, z13.s\n"
- "uaddv d14, p0, z14.s\n"
- "mov z11.s, z11.s[0]\n"
"mov z12.s, z12.s[0]\n"
"mov z13.s, z13.s[0]\n"
+ "uaddv d14, p0, z14.s\n"
+ "neg z0.s, p2/M, z0.s\n"
"mov z14.s, z14.s[0]\n"
- "mul z11.s, p2/M, z11.s, z4.s\n"
- "mul z12.s, p2/M, z12.s, z4.s\n"
- "mul z13.s, p2/M, z13.s, z4.s\n"
- "mul z14.s, p2/M, z14.s, z4.s\n"
+ "mul z11.s, p2/M, z11.s, z0.s\n"
+ "mul z12.s, p2/M, z12.s, z0.s\n"
+ "mul z13.s, p2/M, z13.s, z0.s\n"
+ "mul z14.s, p2/M, z14.s, z0.s\n"
"54:" // Height 4: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x27]\n"
- "orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
- "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z4.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
+ "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
"add z20.s, z20.s, z12.s\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
"add z21.s, z21.s, z12.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
"add z22.s, z22.s, z12.s\n"
"add z23.s, z23.s, z12.s\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
"add z24.s, z24.s, z13.s\n"
"add z25.s, z25.s, z13.s\n"
+ "addvl x10, x10, #4\n"
"add z26.s, z26.s, z13.s\n"
"add z27.s, z27.s, z13.s\n"
"add z28.s, z28.s, z14.s\n"
"add z29.s, z29.s, z14.s\n"
"add z30.s, z30.s, z14.s\n"
"add z31.s, z31.s, z14.s\n"
- "add z16.s, z16.s, z0.s\n"
- "add z17.s, z17.s, z1.s\n"
- "add z18.s, z18.s, z2.s\n"
- "add z19.s, z19.s, z3.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z21.s, z21.s, z1.s\n"
- "add z22.s, z22.s, z2.s\n"
- "add z23.s, z23.s, z3.s\n"
- "add z24.s, z24.s, z0.s\n"
- "add z25.s, z25.s, z1.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z27.s, z27.s, z3.s\n"
- "add z28.s, z28.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x23]\n"
- "add z29.s, z29.s, z1.s\n"
- "add z30.s, z30.s, z2.s\n"
- "add z31.s, z31.s, z3.s\n"
- ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
- ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
- ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
- ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
- ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
- ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
- ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
- ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
- ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
- ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
- ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
- ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
- ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
- ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z0.s\n"
+ "add z18.s, z18.s, z3.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "add z21.s, z21.s, z0.s\n"
+ "add z22.s, z22.s, z3.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z0.s\n"
+ "add z26.s, z26.s, z3.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add z28.s, z28.s, z4.s\n"
+ "add z29.s, z29.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z30.s, z30.s, z3.s\n"
+ "add z31.s, z31.s, z2.s\n"
+ ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n"
+ ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n"
+ ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n"
+ ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n"
+ ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n"
+ ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n"
+ ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n"
+ ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n"
+ ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n"
+ ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n"
"tbz %x[flags], #5, 55f\n"
- "and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z17.d, z0.d\n"
- "and z6.d, z18.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z19.d, z0.d\n"
- "and z8.d, z20.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
- "and z9.d, z21.d, z0.d\n"
+ "and z2.d, z16.d, z0.d\n"
+ "and z1.d, z17.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z2.s\n"
+ "sqadd z17.s, z17.s, z1.s\n"
+ "and z7.d, z18.d, z0.d\n"
+ "and z6.d, z19.d, z0.d\n"
+ "and z5.d, z20.d, z0.d\n"
+ "and z4.d, z21.d, z0.d\n"
+ "and z3.d, z22.d, z0.d\n"
+ "and z2.d, z23.d, z0.d\n"
+ "and z1.d, z24.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z10.d, z22.d, z0.d\n"
- "asr z8.s, z8.s, #0x1f\n"
- "and z4.d, z23.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z24.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "sqadd z20.s, z20.s, z8.s\n"
- "sqadd z21.s, z21.s, z9.s\n"
- "sqadd z22.s, z22.s, z10.s\n"
- "sqadd z23.s, z23.s, z4.s\n"
- "and z6.d, z25.d, z0.d\n"
"asr z6.s, z6.s, #0x1f\n"
- "sqadd z24.s, z24.s, z5.s\n"
- "and z7.d, z26.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- "and z8.d, z27.d, z0.d\n"
- "and z9.d, z28.d, z0.d\n"
- "asr z8.s, z8.s, #0x1f\n"
- "sqadd z25.s, z25.s, z6.s\n"
- "and z10.d, z29.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
- "and z4.d, z30.d, z0.d\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z26.s, z26.s, z7.s\n"
- "and z5.d, z31.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
- "sqadd z27.s, z27.s, z8.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z7.s\n"
+ "sqadd z19.s, z19.s, z6.s\n"
+ "sqadd z20.s, z20.s, z5.s\n"
+ "sqadd z21.s, z21.s, z4.s\n"
+ "sqadd z22.s, z22.s, z3.s\n"
+ "sqadd z23.s, z23.s, z2.s\n"
+ "sqadd z24.s, z24.s, z1.s\n"
+ "and z7.d, z25.d, z0.d\n"
+ "and z6.d, z26.d, z0.d\n"
+ "and z5.d, z27.d, z0.d\n"
+ "and z4.d, z28.d, z0.d\n"
+ "and z3.d, z29.d, z0.d\n"
+ "and z2.d, z30.d, z0.d\n"
+ "and z1.d, z31.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "sqadd z28.s, z28.s, z9.s\n"
- "sqadd z29.s, z29.s, z10.s\n"
- "sqadd z30.s, z30.s, z4.s\n"
- "sqadd z31.s, z31.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z26.s, z26.s, z6.s\n"
+ "sqadd z27.s, z27.s, z5.s\n"
+ "sqadd z28.s, z28.s, z4.s\n"
+ "sqadd z29.s, z29.s, z3.s\n"
+ "sqadd z30.s, z30.s, z2.s\n"
+ "sqadd z31.s, z31.s, z1.s\n"
"55:" // Height 4: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z16.s, z16.s, z2.s\n"
".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
- "add x19, %x[qp], %[minval]\n"
".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z2.s\n"
".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
- "add z16.s, z16.s, z4.s\n"
- "add z17.s, z17.s, z4.s\n"
- "add z18.s, z18.s, z4.s\n"
- "add z19.s, z19.s, z4.s\n"
- "add z20.s, z20.s, z4.s\n"
- "smin z16.s, p2/M, z16.s, z6.s\n"
- "smin z17.s, p2/M, z17.s, z6.s\n"
- "smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
- "smax z16.s, p2/M, z16.s, z5.s\n"
- "smax z17.s, p2/M, z17.s, z5.s\n"
- "smax z18.s, p2/M, z18.s, z5.s\n"
- "smax z19.s, p2/M, z19.s, z5.s\n"
- "smin z20.s, p2/M, z20.s, z6.s\n"
- "uzp1 z16.h, z16.h, z17.h\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z20.s, z20.s, z2.s\n"
".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
- "uzp1 z17.h, z18.h, z19.h\n"
- "smax z20.s, p2/M, z20.s, z5.s\n"
- "uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x26]\n"
- "add z21.s, z21.s, z4.s\n"
- "addvl x26, x26, #1\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ "add z24.s, z24.s, z2.s\n"
".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
- "add z22.s, z22.s, z4.s\n"
- "add z23.s, z23.s, z4.s\n"
- "add z24.s, z24.s, z4.s\n"
- "add z25.s, z25.s, z4.s\n"
- "smax z21.s, p2/M, z21.s, z5.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "smin z24.s, p2/M, z24.s, z6.s\n"
- "uzp1 z20.h, z20.h, z21.h\n"
- "smax z22.s, p2/M, z22.s, z5.s\n"
- "smax z23.s, p2/M, z23.s, z5.s\n"
- "smax z24.s, p2/M, z24.s, z5.s\n"
- "smin z25.s, p2/M, z25.s, z6.s\n"
".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
- "uzp1 z21.h, z22.h, z23.h\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z2.s\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
- "uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x22]\n"
- "add z26.s, z26.s, z4.s\n"
- "smax z25.s, p2/M, z25.s, z5.s\n"
- "add z27.s, z27.s, z4.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "uzp1 z24.h, z24.h, z25.h\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "add z28.s, z28.s, z4.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add z28.s, z28.s, z2.s\n"
".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
- "smax z27.s, p2/M, z27.s, z5.s\n"
- "smin z28.s, p2/M, z28.s, z6.s\n"
".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
- "add z29.s, z29.s, z4.s\n"
- "uzp1 z25.h, z26.h, z27.h\n"
- "smax z28.s, p2/M, z28.s, z5.s\n"
- "add z30.s, z30.s, z4.s\n"
- "uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x21]\n"
- "smin z29.s, p2/M, z29.s, z6.s\n"
- "smin z30.s, p2/M, z30.s, z6.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z30.s, z30.s, z2.s\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
- "smax z29.s, p2/M, z29.s, z5.s\n"
- "smax z30.s, p2/M, z30.s, z5.s\n"
- "add z31.s, z31.s, z4.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add z31.s, z31.s, z2.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "smin z16.s, p2/M, z16.s, z1.s\n"
+ "smin z17.s, p2/M, z17.s, z1.s\n"
+ "smin z18.s, p2/M, z18.s, z1.s\n"
+ "smin z19.s, p2/M, z19.s, z1.s\n"
+ "smin z20.s, p2/M, z20.s, z1.s\n"
+ "smin z21.s, p2/M, z21.s, z1.s\n"
+ "smin z22.s, p2/M, z22.s, z1.s\n"
+ "smin z23.s, p2/M, z23.s, z1.s\n"
+ "smin z24.s, p2/M, z24.s, z1.s\n"
+ "smin z25.s, p2/M, z25.s, z1.s\n"
+ "smin z26.s, p2/M, z26.s, z1.s\n"
+ "smin z27.s, p2/M, z27.s, z1.s\n"
+ "smin z28.s, p2/M, z28.s, z1.s\n"
+ "smin z29.s, p2/M, z29.s, z1.s\n"
+ "smin z30.s, p2/M, z30.s, z1.s\n"
+ "smin z31.s, p2/M, z31.s, z1.s\n"
+ "smax z16.s, p2/M, z16.s, z0.s\n"
+ "smax z17.s, p2/M, z17.s, z0.s\n"
+ "smax z18.s, p2/M, z18.s, z0.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z0.s\n"
+ "smax z20.s, p2/M, z20.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z21.s, p2/M, z21.s, z0.s\n"
+ "smax z22.s, p2/M, z22.s, z0.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "smax z23.s, p2/M, z23.s, z0.s\n"
+ "smax z24.s, p2/M, z24.s, z0.s\n"
+ "uzp1 z16.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z0.s\n"
+ "smax z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "st1b { z20.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z0.s\n"
+ "smax z28.s, p2/M, z28.s, z0.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "smax z29.s, p2/M, z29.s, z0.s\n"
+ "smax z30.s, p2/M, z30.s, z0.s\n"
"uzp1 z28.h, z28.h, z29.h\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smax z31.s, p2/M, z31.s, z5.s\n"
- "uzp1 z29.h, z30.h, z31.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p1, [x20]\n"
+ "st1b { z24.b }, p1, [x22]\n"
+ "smax z31.s, p2/M, z31.s, z0.s\n"
+ "uzp1 z16.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z16.b\n"
+ "st1b { z28.b }, p1, [x21]\n"
+ "addvl x27, x27, #1\n"
"56:" // Height 4: Writeback done
"decw x9, ALL, MUL #4\n"
"cmp x9, XZR\n"
"bgt 44b\n"
"subs %x[M], %x[M], #0x4\n"
"beq 58f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 57f\n"
- "add x20, x20, #0x4\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"57:" // Update direct input
- "mov x19, #0x4\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x4\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"58:" // Exit
-
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
new file mode 100644
index 0000000000..5de68cc738
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_u8qa_mmla_4x4VL( ARGLIST );
+
+class cls_sve_hybrid_u8qa_mmla_4x4VL
+{
+public:
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<uint32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 8, 8> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 47.30 };
+ case CPUModel::A510:
+ return { 20.91 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_u8qa_mmla_4x4VL;
+ cls_sve_hybrid_u8qa_mmla_4x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
new file mode 100644
index 0000000000..69894bec41
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
@@ -0,0 +1,1417 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_u8qa_mmla_4x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z11.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x26, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "cbnz x26, 6f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x25, #0x10\n"
+ "ble 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn2 z1.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45d99814 // ummla z20.s, z0.b, z25.b\n"
+ ".inst 0x45d89811 // ummla z17.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45da9815 // ummla z21.s, z0.b, z26.b\n"
+ ".inst 0x45d99812 // ummla z18.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x45d89816 // ummla z22.s, z0.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45da9813 // ummla z19.s, z0.b, z26.b\n"
+ ".inst 0x45d99817 // ummla z23.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45da9834 // ummla z20.s, z1.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n"
+ ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n"
+ ".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n"
+ ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n"
+ ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n"
+ "add x24, x24, #0x10\n"
+ "tbnz %x[flags], #31, 8f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
+ "bgt 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z27.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "trn2 z1.d, z1.d, z27.d\n"
+ ".inst 0x45da9814 // ummla z20.s, z0.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45d99811 // ummla z17.s, z0.b, z25.b\n"
+ ".inst 0x45d89815 // ummla z21.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45db9812 // ummla z18.s, z0.b, z27.b\n"
+ ".inst 0x45da9816 // ummla z22.s, z0.b, z26.b\n"
+ ".inst 0x45d99813 // ummla z19.s, z0.b, z25.b\n"
+ ".inst 0x45d89817 // ummla z23.s, z0.b, z24.b\n"
+ "addvl x28, x28, #8\n"
+ "ble 10f\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45d89834 // ummla z20.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n"
+ ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45d99832 // ummla z18.s, z1.b, z25.b\n"
+ ".inst 0x45d89836 // ummla z22.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n"
+ ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n"
+ "addvl x28, x28, #8\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 11f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "11:" // Height 1: Multiply loop: unique 2: skip row sum
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 4b\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "mov z23.d, z16.d\n"
+ "tbnz %x[flags], #31, 12f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ "neg z16.s, p2/M, z16.s\n"
+ "mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z16.s\n"
+ "12:" // Height 1: skip row sum fixup
+ "add z23.s, z23.s, z11.s\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z22.s }, p2/Z, [x10]\n"
+ "ld1w { z21.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z20.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z23.s, z23.s, z22.s\n"
+ "add z17.s, z17.s, z21.s\n"
+ "add z18.s, z18.s, z20.s\n"
+ "add z19.s, z19.s, z16.s\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x04b07631 // sqrdmulh z17.s, z17.s, z16.s\n"
+ "addvl x10, x10, #4\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
+ "tbz %x[flags], #5, 13f\n"
+ "and z22.d, z23.d, z0.d\n"
+ "and z21.d, z17.d, z0.d\n"
+ "and z20.d, z18.d, z0.d\n"
+ "and z16.d, z19.d, z0.d\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z22.s\n"
+ "sqadd z17.s, z17.s, z21.s\n"
+ "sqadd z18.s, z18.s, z20.s\n"
+ "sqadd z19.s, z19.s, z16.s\n"
+ "13:" // Height 1: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z20.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z16.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z16.s }, p2/Z, [x20]\n"
+ "smin z23.s, p2/M, z23.s, z20.s\n"
+ "smin z17.s, p2/M, z17.s, z20.s\n"
+ "smin z18.s, p2/M, z18.s, z20.s\n"
+ "smin z19.s, p2/M, z19.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z16.s\n"
+ "smax z17.s, p2/M, z17.s, z16.s\n"
+ "smax z18.s, p2/M, z18.s, z16.s\n"
+ "uzp1 z23.h, z23.h, z17.h\n"
+ "smax z19.s, p2/M, z19.s, z16.s\n"
+ "uzp1 z16.h, z18.h, z19.h\n"
+ "uzp1 z23.b, z23.b, z16.b\n"
+ "st1b { z23.b }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "14:" // Height 1: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 2b\n"
+ "b 58f\n"
+ "15:" // Height 2
+ "mov x10, %x[col_bias]\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z15.b, #0x1\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "16:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x26, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "cbnz x26, 20f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "20:" // Height 2: input setup done
+ "cmp x25, #0x10\n"
+ "ble 23f\n"
+ "21:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z26.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "trn2 z1.d, z1.d, z26.d\n"
+ "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45d99814 // ummla z20.s, z0.b, z25.b\n"
+ ".inst 0x45d89811 // ummla z17.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45da9815 // ummla z21.s, z0.b, z26.b\n"
+ ".inst 0x45d99812 // ummla z18.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x45d89816 // ummla z22.s, z0.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45da9813 // ummla z19.s, z0.b, z26.b\n"
+ ".inst 0x45d99817 // ummla z23.s, z0.b, z25.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45da9834 // ummla z20.s, z1.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n"
+ ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n"
+ ".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n"
+ ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n"
+ ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "tbnz %x[flags], #31, 22f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "22:" // Height 2: Multiply loop: unique 3: skip row sum
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
+ "bgt 21b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z27.d\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "trn2 z1.d, z1.d, z27.d\n"
+ ".inst 0x45da9814 // ummla z20.s, z0.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45d99811 // ummla z17.s, z0.b, z25.b\n"
+ ".inst 0x45d89815 // ummla z21.s, z0.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45db9812 // ummla z18.s, z0.b, z27.b\n"
+ ".inst 0x45da9816 // ummla z22.s, z0.b, z26.b\n"
+ ".inst 0x45d99813 // ummla z19.s, z0.b, z25.b\n"
+ ".inst 0x45d89817 // ummla z23.s, z0.b, z24.b\n"
+ "addvl x28, x28, #8\n"
+ "ble 24f\n"
+ "ld1b { z24.b }, p2/Z, [x28]\n"
+ ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n"
+ "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45d89834 // ummla z20.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n"
+ ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45d99832 // ummla z18.s, z1.b, z25.b\n"
+ ".inst 0x45d89836 // ummla z22.s, z1.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n"
+ ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n"
+ "addvl x28, x28, #8\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 25f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "25:" // Height 2: Multiply loop: unique 4: skip row sum
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 18b\n"
+ "uzp1 z24.d, z16.d, z20.d\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "mov z23.d, z24.d\n"
+ "tbnz %x[flags], #31, 26f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ "neg z24.s, p2/M, z24.s\n"
+ "mov z12.s, z11.s[3]\n"
+ "mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z24.s\n"
+ "mul z12.s, p2/M, z12.s, z24.s\n"
+ "26:" // Height 2: skip row sum fixup
+ "add z23.s, z23.s, z11.s\n"
+ "add z20.s, z20.s, z11.s\n"
+ "ld1w { z28.s }, p2/Z, [x10]\n"
+ "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add z21.s, z21.s, z11.s\n"
+ "add z22.s, z22.s, z11.s\n"
+ "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "add z16.s, z16.s, z12.s\n"
+ "add z17.s, z17.s, z12.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z18.s, z18.s, z12.s\n"
+ "add z19.s, z19.s, z12.s\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z23.s, z23.s, z28.s\n"
+ "add z20.s, z20.s, z27.s\n"
+ "addvl x10, x10, #4\n"
+ "add z21.s, z21.s, z26.s\n"
+ "add z22.s, z22.s, z25.s\n"
+ "add z16.s, z16.s, z28.s\n"
+ "add z17.s, z17.s, z27.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z18.s, z18.s, z26.s\n"
+ "add z19.s, z19.s, z25.s\n"
+ ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n"
+ ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n"
+ ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n"
+ ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n"
+ ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
+ ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n"
+ ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n"
+ ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n"
+ "tbz %x[flags], #5, 27f\n"
+ "and z24.d, z23.d, z0.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z24.s\n"
+ "and z30.d, z20.d, z0.d\n"
+ "and z29.d, z21.d, z0.d\n"
+ "and z28.d, z22.d, z0.d\n"
+ "and z27.d, z16.d, z0.d\n"
+ "and z26.d, z17.d, z0.d\n"
+ "and z25.d, z18.d, z0.d\n"
+ "and z24.d, z19.d, z0.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z30.s\n"
+ "sqadd z21.s, z21.s, z29.s\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z16.s, z16.s, z27.s\n"
+ "sqadd z17.s, z17.s, z26.s\n"
+ "sqadd z18.s, z18.s, z25.s\n"
+ "sqadd z19.s, z19.s, z24.s\n"
+ "27:" // Height 2: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add z23.s, z23.s, z24.s\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "add z20.s, z20.s, z24.s\n"
+ "add z21.s, z21.s, z24.s\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z24.s\n"
+ "add z16.s, z16.s, z24.s\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z24.s\n"
+ "add z18.s, z18.s, z24.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z25.s }, p2/Z, [x20]\n"
+ "add z19.s, z19.s, z24.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z24.s }, p2/Z, [x20]\n"
+ "smin z23.s, p2/M, z23.s, z25.s\n"
+ "smin z20.s, p2/M, z20.s, z25.s\n"
+ "smin z21.s, p2/M, z21.s, z25.s\n"
+ "smin z22.s, p2/M, z22.s, z25.s\n"
+ "smin z16.s, p2/M, z16.s, z25.s\n"
+ "smin z17.s, p2/M, z17.s, z25.s\n"
+ "smin z18.s, p2/M, z18.s, z25.s\n"
+ "smin z19.s, p2/M, z19.s, z25.s\n"
+ "smax z23.s, p2/M, z23.s, z24.s\n"
+ "smax z20.s, p2/M, z20.s, z24.s\n"
+ "smax z21.s, p2/M, z21.s, z24.s\n"
+ "uzp1 z23.h, z23.h, z20.h\n"
+ "smax z22.s, p2/M, z22.s, z24.s\n"
+ "smax z16.s, p2/M, z16.s, z24.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z23.b, z23.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z24.s\n"
+ "smax z18.s, p2/M, z18.s, z24.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z23.b }, p1, [x27]\n"
+ "smax z19.s, p2/M, z19.s, z24.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x23]\n"
+ "addvl x27, x27, #1\n"
+ "28:" // Height 2: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 16b\n"
+ "b 58f\n"
+ "29:" // Height 3
+ "mov x10, %x[col_bias]\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "30:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "31:" // Height 3: setup done
+ "mov x26, #0x0\n"
+ "32:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x26, 34f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "34:" // Height 3: input setup done
+ "cmp x25, #0x10\n"
+ "ble 37f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c59814 // ummla z20.s, z0.b, z5.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c5985c // ummla z28.s, z2.b, z5.b\n"
+ ".inst 0x45c49811 // ummla z17.s, z0.b, z4.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45c99815 // ummla z21.s, z0.b, z9.b\n"
+ ".inst 0x45c9985d // ummla z29.s, z2.b, z9.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45c89812 // ummla z18.s, z0.b, z8.b\n"
+ ".inst 0x45c8985a // ummla z26.s, z2.b, z8.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45c79816 // ummla z22.s, z0.b, z7.b\n"
+ ".inst 0x45c7985e // ummla z30.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45c69813 // ummla z19.s, z0.b, z6.b\n"
+ ".inst 0x45c6985b // ummla z27.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
+ ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n"
+ ".inst 0x45ca987c // ummla z28.s, z3.b, z10.b\n"
+ ".inst 0x45c99831 // ummla z17.s, z1.b, z9.b\n"
+ ".inst 0x45c99879 // ummla z25.s, z3.b, z9.b\n"
+ ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n"
+ ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n"
+ ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
+ ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n"
+ ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n"
+ ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
+ ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n"
+ ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n"
+ "tbnz %x[flags], #31, 36f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "udot z13.s, z3.b, z15.b\n"
+ "36:" // Height 3: Multiply loop: unique 5: skip row sum
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
+ "bgt 35b\n"
+ "37:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c49814 // ummla z20.s, z0.b, z4.b\n"
+ ".inst 0x45c4985c // ummla z28.s, z2.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45c99811 // ummla z17.s, z0.b, z9.b\n"
+ ".inst 0x45c99859 // ummla z25.s, z2.b, z9.b\n"
+ ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
+ ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45c79812 // ummla z18.s, z0.b, z7.b\n"
+ ".inst 0x45c7985a // ummla z26.s, z2.b, z7.b\n"
+ ".inst 0x45c69816 // ummla z22.s, z0.b, z6.b\n"
+ ".inst 0x45c6985e // ummla z30.s, z2.b, z6.b\n"
+ ".inst 0x45c59813 // ummla z19.s, z0.b, z5.b\n"
+ ".inst 0x45c5985b // ummla z27.s, z2.b, z5.b\n"
+ ".inst 0x45c49817 // ummla z23.s, z0.b, z4.b\n"
+ ".inst 0x45c4985f // ummla z31.s, z2.b, z4.b\n"
+ "ble 38f\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
+ ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c59834 // ummla z20.s, z1.b, z5.b\n"
+ ".inst 0x45c5987c // ummla z28.s, z3.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c49831 // ummla z17.s, z1.b, z4.b\n"
+ ".inst 0x45c49879 // ummla z25.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n"
+ ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n"
+ ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
+ ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n"
+ ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
+ ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n"
+ ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n"
+ "38:" // Height 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 39f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "udot z13.s, z3.b, z15.b\n"
+ "39:" // Height 3: Multiply loop: unique 6: skip row sum
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 32b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 z0.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "add x22, x23, x20\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "mov z31.d, z0.d\n"
+ "tbnz %x[flags], #31, 40f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
+ "neg z23.s, p2/M, z23.s\n"
+ "mov z12.s, z11.s[3]\n"
+ "mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z23.s\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z12.s, p2/M, z12.s, z23.s\n"
+ "mul z13.s, p2/M, z13.s, z23.s\n"
+ "40:" // Height 3: skip row sum fixup
+ "add z31.s, z31.s, z11.s\n"
+ "add z20.s, z20.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "ld1w { z30.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add z21.s, z21.s, z11.s\n"
+ "add z22.s, z22.s, z11.s\n"
+ "ld1w { z29.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "add z16.s, z16.s, z12.s\n"
+ "add z17.s, z17.s, z12.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z18.s, z18.s, z12.s\n"
+ "add z19.s, z19.s, z12.s\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "addvl x10, x10, #4\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z31.s, z31.s, z0.s\n"
+ "add z20.s, z20.s, z30.s\n"
+ "add z21.s, z21.s, z29.s\n"
+ "add z22.s, z22.s, z28.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z30.s\n"
+ "add z18.s, z18.s, z29.s\n"
+ "add z19.s, z19.s, z28.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z26.s, z26.s, z29.s\n"
+ "add z27.s, z27.s, z28.s\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
+ ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ ".inst 0x04b77610 // sqrdmulh z16.s, z16.s, z23.s\n"
+ ".inst 0x04b77631 // sqrdmulh z17.s, z17.s, z23.s\n"
+ ".inst 0x04b77652 // sqrdmulh z18.s, z18.s, z23.s\n"
+ ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
+ ".inst 0x04b77718 // sqrdmulh z24.s, z24.s, z23.s\n"
+ ".inst 0x04b77739 // sqrdmulh z25.s, z25.s, z23.s\n"
+ ".inst 0x04b7775a // sqrdmulh z26.s, z26.s, z23.s\n"
+ ".inst 0x04b7777b // sqrdmulh z27.s, z27.s, z23.s\n"
+ "tbz %x[flags], #5, 41f\n"
+ "and z1.d, z31.d, z0.d\n"
+ "and z30.d, z20.d, z0.d\n"
+ "and z29.d, z21.d, z0.d\n"
+ "and z28.d, z22.d, z0.d\n"
+ "and z23.d, z16.d, z0.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z1.s\n"
+ "sqadd z20.s, z20.s, z30.s\n"
+ "sqadd z21.s, z21.s, z29.s\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z16.s, z16.s, z23.s\n"
+ "and z3.d, z17.d, z0.d\n"
+ "and z2.d, z18.d, z0.d\n"
+ "and z1.d, z19.d, z0.d\n"
+ "and z30.d, z24.d, z0.d\n"
+ "and z29.d, z25.d, z0.d\n"
+ "and z28.d, z26.d, z0.d\n"
+ "and z23.d, z27.d, z0.d\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z3.s\n"
+ "sqadd z18.s, z18.s, z2.s\n"
+ "sqadd z19.s, z19.s, z1.s\n"
+ "sqadd z24.s, z24.s, z30.s\n"
+ "sqadd z25.s, z25.s, z29.s\n"
+ "sqadd z26.s, z26.s, z28.s\n"
+ "sqadd z27.s, z27.s, z23.s\n"
+ "41:" // Height 3: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add z31.s, z31.s, z23.s\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "add z20.s, z20.s, z23.s\n"
+ "add z21.s, z21.s, z23.s\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z23.s\n"
+ "add z16.s, z16.s, z23.s\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z23.s\n"
+ "add z18.s, z18.s, z23.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "add z19.s, z19.s, z23.s\n"
+ "add z24.s, z24.s, z23.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "add z25.s, z25.s, z23.s\n"
+ "add z26.s, z26.s, z23.s\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z28.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z23.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z23.s }, p2/Z, [x20]\n"
+ "smin z31.s, p2/M, z31.s, z28.s\n"
+ "smin z20.s, p2/M, z20.s, z28.s\n"
+ "smin z21.s, p2/M, z21.s, z28.s\n"
+ "smin z22.s, p2/M, z22.s, z28.s\n"
+ "smin z16.s, p2/M, z16.s, z28.s\n"
+ "smin z17.s, p2/M, z17.s, z28.s\n"
+ "smin z18.s, p2/M, z18.s, z28.s\n"
+ "smin z19.s, p2/M, z19.s, z28.s\n"
+ "smin z24.s, p2/M, z24.s, z28.s\n"
+ "smin z25.s, p2/M, z25.s, z28.s\n"
+ "smin z26.s, p2/M, z26.s, z28.s\n"
+ "smin z27.s, p2/M, z27.s, z28.s\n"
+ "smax z31.s, p2/M, z31.s, z23.s\n"
+ "smax z20.s, p2/M, z20.s, z23.s\n"
+ "smax z21.s, p2/M, z21.s, z23.s\n"
+ "uzp1 z31.h, z31.h, z20.h\n"
+ "smax z22.s, p2/M, z22.s, z23.s\n"
+ "smax z16.s, p2/M, z16.s, z23.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z31.b, z31.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z23.s\n"
+ "smax z18.s, p2/M, z18.s, z23.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z31.b }, p1, [x27]\n"
+ "smax z19.s, p2/M, z19.s, z23.s\n"
+ "smax z24.s, p2/M, z24.s, z23.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z25.s, p2/M, z25.s, z23.s\n"
+ "smax z26.s, p2/M, z26.s, z23.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "st1b { z16.b }, p1, [x23]\n"
+ "smax z27.s, p2/M, z27.s, z23.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x22]\n"
+ "addvl x27, x27, #1\n"
+ "42:" // Height 3: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 30b\n"
+ "b 58f\n"
+ "43:" // Height 4
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x4\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[output_ptr]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "44:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p1.b, x20, x9\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "45:" // Height 4: setup done
+ "mov x26, #0x0\n"
+ "46:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w25, [x20, x26, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 47f\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
+ "ldr x23, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x21, [x20, #0x18]\n"
+ "cbnz x26, 48f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
+ "b 48f\n"
+ "47:" // Height 4: setup direct input
+ "mov x24, %x[input_ptr]\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
+ "48:" // Height 4: input setup done
+ "cmp x25, #0x10\n"
+ "ble 51f\n"
+ "49:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n"
+ ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45c49814 // ummla z20.s, z0.b, z4.b\n"
+ ".inst 0x45c4985c // ummla z28.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45c59811 // ummla z17.s, z0.b, z5.b\n"
+ ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c49815 // ummla z21.s, z0.b, z4.b\n"
+ ".inst 0x45c4985d // ummla z29.s, z2.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45c89812 // ummla z18.s, z0.b, z8.b\n"
+ ".inst 0x45c8985a // ummla z26.s, z2.b, z8.b\n"
+ ".inst 0x45c79816 // ummla z22.s, z0.b, z7.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x45c7985e // ummla z30.s, z2.b, z7.b\n"
+ ".inst 0x45c69813 // ummla z19.s, z0.b, z6.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x45c6985b // ummla z27.s, z2.b, z6.b\n"
+ ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n"
+ ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x45ca987c // ummla z28.s, z3.b, z10.b\n"
+ ".inst 0x45c99831 // ummla z17.s, z1.b, z9.b\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45c99879 // ummla z25.s, z3.b, z9.b\n"
+ ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n"
+ ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n"
+ ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
+ ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n"
+ ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n"
+ ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
+ ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n"
+ ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n"
+ "tbnz %x[flags], #31, 50f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "udot z13.s, z3.b, z15.b\n"
+ "50:" // Height 4: Multiply loop: unique 7: skip row sum
+ "sub x25, x25, #0x10\n"
+ "cmp x25, #0x10\n"
+ "bgt 49b\n"
+ "51:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x25\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "trn1 z2.d, z3.d, z5.d\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "trn2 z3.d, z3.d, z5.d\n"
+ ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n"
+ ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "subs x25, x25, #0x8\n"
+ ".inst 0x45c59814 // ummla z20.s, z0.b, z5.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45c5985c // ummla z28.s, z2.b, z5.b\n"
+ ".inst 0x45c49811 // ummla z17.s, z0.b, z4.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n"
+ ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n"
+ ".inst 0x45c79812 // ummla z18.s, z0.b, z7.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45c7985a // ummla z26.s, z2.b, z7.b\n"
+ ".inst 0x45c69816 // ummla z22.s, z0.b, z6.b\n"
+ ".inst 0x45c6985e // ummla z30.s, z2.b, z6.b\n"
+ ".inst 0x45c59813 // ummla z19.s, z0.b, z5.b\n"
+ ".inst 0x45c5985b // ummla z27.s, z2.b, z5.b\n"
+ ".inst 0x45c49817 // ummla z23.s, z0.b, z4.b\n"
+ ".inst 0x45c4985f // ummla z31.s, z2.b, z4.b\n"
+ "ble 52f\n"
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
+ ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c59834 // ummla z20.s, z1.b, z5.b\n"
+ ".inst 0x45c5987c // ummla z28.s, z3.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c49831 // ummla z17.s, z1.b, z4.b\n"
+ ".inst 0x45c49879 // ummla z25.s, z3.b, z4.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n"
+ ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n"
+ ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
+ ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n"
+ ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
+ ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n"
+ ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n"
+ "52:" // Height 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 53f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "udot z13.s, z3.b, z15.b\n"
+ "53:" // Height 4: Multiply loop: unique 8: skip row sum
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x26, x26, #0x1\n"
+ "cmp x26, x20\n"
+ "bne 46b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 z0.d, z16.d, z20.d\n"
+ "add x23, x27, x20\n"
+ "add x22, x23, x20\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "add x21, x22, x20\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "mov z31.d, z0.d\n"
+ "tbnz %x[flags], #31, 54f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
+ "neg z0.s, p2/M, z0.s\n"
+ "mov z12.s, z11.s[3]\n"
+ "mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z0.s\n"
+ "mov z14.s, z13.s[3]\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z12.s, p2/M, z12.s, z0.s\n"
+ "mul z13.s, p2/M, z13.s, z0.s\n"
+ "mul z14.s, p2/M, z14.s, z0.s\n"
+ "54:" // Height 4: skip row sum fixup
+ "add z31.s, z31.s, z11.s\n"
+ "add z20.s, z20.s, z11.s\n"
+ "ld1w { z4.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add z21.s, z21.s, z11.s\n"
+ "add z22.s, z22.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "add z16.s, z16.s, z12.s\n"
+ "add z17.s, z17.s, z12.s\n"
+ "add x20, %x[qp], %[per_layer_mul]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z18.s, z18.s, z12.s\n"
+ "add z19.s, z19.s, z12.s\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z23.s, z23.s, z13.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "addvl x10, x10, #4\n"
+ "add z29.s, z29.s, z13.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z3.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z0.s\n"
+ "add z18.s, z18.s, z3.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z28.s, z28.s, z0.s\n"
+ "add z29.s, z29.s, z3.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z26.s, z26.s, z3.s\n"
+ "add z27.s, z27.s, z2.s\n"
+ ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n"
+ ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n"
+ ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n"
+ ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n"
+ ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n"
+ ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n"
+ ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n"
+ ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n"
+ ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n"
+ ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n"
+ ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ "tbz %x[flags], #5, 55f\n"
+ "and z2.d, z31.d, z0.d\n"
+ "and z1.d, z20.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z2.s\n"
+ "sqadd z20.s, z20.s, z1.s\n"
+ "and z7.d, z21.d, z0.d\n"
+ "and z6.d, z22.d, z0.d\n"
+ "and z5.d, z16.d, z0.d\n"
+ "and z4.d, z17.d, z0.d\n"
+ "and z3.d, z18.d, z0.d\n"
+ "and z2.d, z19.d, z0.d\n"
+ "and z1.d, z23.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z7.s\n"
+ "sqadd z22.s, z22.s, z6.s\n"
+ "sqadd z16.s, z16.s, z5.s\n"
+ "sqadd z17.s, z17.s, z4.s\n"
+ "sqadd z18.s, z18.s, z3.s\n"
+ "sqadd z19.s, z19.s, z2.s\n"
+ "sqadd z23.s, z23.s, z1.s\n"
+ "and z7.d, z28.d, z0.d\n"
+ "and z6.d, z29.d, z0.d\n"
+ "and z5.d, z30.d, z0.d\n"
+ "and z4.d, z24.d, z0.d\n"
+ "and z3.d, z25.d, z0.d\n"
+ "and z2.d, z26.d, z0.d\n"
+ "and z1.d, z27.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z7.s\n"
+ "sqadd z29.s, z29.s, z6.s\n"
+ "sqadd z30.s, z30.s, z5.s\n"
+ "sqadd z24.s, z24.s, z4.s\n"
+ "sqadd z25.s, z25.s, z3.s\n"
+ "sqadd z26.s, z26.s, z2.s\n"
+ "sqadd z27.s, z27.s, z1.s\n"
+ "55:" // Height 4: no shift correction
+ "add x20, %x[qp], %[c_offset]\n"
+ "ld1rw { z2.s }, p2/Z, [x20]\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add z31.s, z31.s, z2.s\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "add z20.s, z20.s, z2.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z16.s, z16.s, z2.s\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "add z17.s, z17.s, z2.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add z19.s, z19.s, z2.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
+ ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
+ "add z28.s, z28.s, z2.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "add z25.s, z25.s, z2.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "add x20, %x[qp], %[maxval]\n"
+ "ld1rw { z1.s }, p2/Z, [x20]\n"
+ "add z27.s, z27.s, z2.s\n"
+ "add x20, %x[qp], %[minval]\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "smin z31.s, p2/M, z31.s, z1.s\n"
+ "smin z20.s, p2/M, z20.s, z1.s\n"
+ "smin z21.s, p2/M, z21.s, z1.s\n"
+ "smin z22.s, p2/M, z22.s, z1.s\n"
+ "smin z16.s, p2/M, z16.s, z1.s\n"
+ "smin z17.s, p2/M, z17.s, z1.s\n"
+ "smin z18.s, p2/M, z18.s, z1.s\n"
+ "smin z19.s, p2/M, z19.s, z1.s\n"
+ "smin z23.s, p2/M, z23.s, z1.s\n"
+ "smin z28.s, p2/M, z28.s, z1.s\n"
+ "smin z29.s, p2/M, z29.s, z1.s\n"
+ "smin z30.s, p2/M, z30.s, z1.s\n"
+ "smin z24.s, p2/M, z24.s, z1.s\n"
+ "smin z25.s, p2/M, z25.s, z1.s\n"
+ "smin z26.s, p2/M, z26.s, z1.s\n"
+ "smin z27.s, p2/M, z27.s, z1.s\n"
+ "smax z31.s, p2/M, z31.s, z0.s\n"
+ "smax z20.s, p2/M, z20.s, z0.s\n"
+ "smax z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z31.h, z31.h, z20.h\n"
+ "smax z22.s, p2/M, z22.s, z0.s\n"
+ "smax z16.s, p2/M, z16.s, z0.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "uzp1 z31.b, z31.b, z20.b\n"
+ "smax z17.s, p2/M, z17.s, z0.s\n"
+ "smax z18.s, p2/M, z18.s, z0.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "st1b { z31.b }, p1, [x27]\n"
+ "smax z19.s, p2/M, z19.s, z0.s\n"
+ "smax z23.s, p2/M, z23.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "smax z28.s, p2/M, z28.s, z0.s\n"
+ "smax z29.s, p2/M, z29.s, z0.s\n"
+ "uzp1 z23.h, z23.h, z28.h\n"
+ "st1b { z16.b }, p1, [x23]\n"
+ "smax z30.s, p2/M, z30.s, z0.s\n"
+ "smax z24.s, p2/M, z24.s, z0.s\n"
+ "uzp1 z16.h, z29.h, z30.h\n"
+ "uzp1 z23.b, z23.b, z16.b\n"
+ "smax z25.s, p2/M, z25.s, z0.s\n"
+ "smax z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "st1b { z23.b }, p1, [x22]\n"
+ "smax z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z16.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z16.b\n"
+ "st1b { z24.b }, p1, [x21]\n"
+ "addvl x27, x27, #1\n"
+ "56:" // Height 4: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 44b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 58f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 57f\n"
+ "add x21, x21, #0x4\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "57:" // Update direct input
+ "mov x20, #0x4\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "58:" // Exit
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
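
The operand lists closing the kernel above follow GCC's extended-asm contract: pointers the assembly advances are read-write early-clobber outputs ("+&r"), structure fields are reached through compile-time immediates ("I" with offsetof, substituted directly as %[offsetof_N] and friends), and every predicate and vector register the body touches is declared as a clobber so the compiler stays out of its way. A minimal, self-contained sketch of the same contract — hypothetical Args/consume names, AArch64-only, not library code, and assuming args->n >= 1:

// Minimal sketch of the extended-asm contract used by the kernel above.
#include <cstddef>

struct Args { long n; };

long consume(long *p, const Args *args) {
    long total = 0;
    __asm__ __volatile__(
        // Struct fields are reached through compile-time immediates,
        // exactly like %[offsetof_N] etc. in the kernel above.
        "ldr x20, [%x[args], %[off_n]]\n"
        "1:\n"
        "ldr x21, [%x[p]], #8\n"          // post-increment walk advances the
        "add %x[total], %x[total], x21\n" // read-write ("+&r") pointer operand
        "subs x20, x20, #1\n"
        "bne 1b\n"
        : [p] "+&r" (p), [total] "+&r" (total)
        : [args] "r" (args), [off_n] "I" (offsetof(Args, n))
        : "cc", "memory", "x20", "x21");
    return total;
}
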
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
index 4ea1d17c4e..e9197e8ec5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,21 +10,22 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -38,11 +39,13 @@ namespace arm_gemm
{
// Actual kernel implementations
void sve_hybrid_u8u32_dot_6x4VL( ARGLIST );
+void sve_hybrid_u8u32_dot_6x4VL_a64fx( ARGLIST );
class cls_sve_hybrid_u8u32_dot_6x4VL
{
public:
- typedef uint8_t operand_type;
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
typedef uint32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,16 +71,54 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.56 };
+ case CPUModel::A510:
+ return { 20.98 };
+ case CPUModel::V1:
+ return { 62.19 };
+ case CPUModel::A64FX:
+ return { 91.23 };
+ }
+ }
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.59, 15.67, 0.61 };
+ case CPUModel::A510:
+ return { 22.75, 3.90, 0.47 };
+ case CPUModel::V1:
+ return { 48.09, 16.24, 0.83 };
+ case CPUModel::A64FX:
+ return { 101.62, 3.15, 0.42 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_u8u32_dot_6x4VL;
- cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *)
+ cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_hybrid_u8u32_dot_6x4VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
#undef ARGLIST
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
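
The header hunk above wires model-specific dispatch into the kernel class in two pieces: get_performance_parameters() publishes tuned throughput estimates per CPU model, and the constructor swaps in the _a64fx entry point when an A64FX core is detected. Below is a minimal sketch of that pattern, reusing the hunk's own uint32_t-case numbers but with hypothetical names (CpuModel, PerfParams, dot_6x4VL_params) rather than the library's CPUInfo/PerformanceParameters API:

#include <cstdio>

enum class CpuModel { GENERIC, A510, V1, A64FX };

// Throughput estimates; the uint8_t case in the hunk also carries prepare
// and merge costs, modelled here as extra fields defaulting to zero.
struct PerfParams {
    double kernel_macs_cycle;
    double prepare_bytes_cycle = 0.0;
    double merge_bytes_cycle = 0.0;
};

// Same shape as get_performance_parameters() above: a switch on the CPU
// model returning tuned constants.
static PerfParams dot_6x4VL_params(CpuModel m) {
    switch (m) {
        case CpuModel::A510:  return { 20.98 };
        case CpuModel::V1:    return { 62.19 };
        case CpuModel::A64FX: return { 91.23 };
        default:              return { 31.56 };
    }
}

int main() {
    CpuModel detected = CpuModel::A64FX; // stand-in for CPUInfo::get_cpu_model()
    std::printf("estimated MACs/cycle: %.2f\n",
                dot_6x4VL_params(detected).kernel_macs_cycle);
    return 0;
}

A chooser can compare such per-model estimates across candidate kernels before committing to one; the sketch only prints the value.
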
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
new file mode 100644
index 0000000000..4d0f44982a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
@@ -0,0 +1,1032 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_u8u32_dot_6x4VL_a64fx (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+ const uint32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+ "ptrue p4.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 51f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 41f\n"
+ "beq 31f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 21f\n"
+ "beq 11f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "tbz %x[flags], #0, 3f\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "b 4f\n"
+ "3:" // Height 1: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add x26, x26, #0x4\n"
+ "udot z10.s, z17.b, z0.b\n"
+ "udot z11.s, z16.b, z0.b\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 8b\n"
+ "9:" // Height 1: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "udot z10.s, z17.b, z0.b\n"
+ "udot z11.s, z16.b, z0.b\n"
+ "addvl x10, x10, #4\n"
+ "bne 5b\n"
+ "st1w { z8.s }, p3, [x9]\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "10:" // Height 1: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 2b\n"
+ "b 62f\n"
+ "11:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "12:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "tbz %x[flags], #0, 13f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x20]\n"
+ "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 14f\n"
+ "13:" // Height 2: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "14:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "15:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 17f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "17:" // Height 2: input setup done
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 19f\n"
+ "18:" // Height 2: Multiply loop: Main loop
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x26, x26, #0x4\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "subs x27, x27, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "udot z10.s, z17.b, z0.b\n"
+ "udot z14.s, z17.b, z1.b\n"
+ "udot z11.s, z16.b, z0.b\n"
+ "udot z15.s, z16.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 18b\n"
+ "19:" // Height 2: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "udot z10.s, z17.b, z0.b\n"
+ "udot z14.s, z17.b, z1.b\n"
+ "addvl x10, x10, #4\n"
+ "udot z11.s, z16.b, z0.b\n"
+ "udot z15.s, z16.b, z1.b\n"
+ "bne 15b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x20]\n"
+ "st1w { z13.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x20, #3, MUL VL]\n"
+ "20:" // Height 2: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 12b\n"
+ "b 62f\n"
+ "21:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "22:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "tbz %x[flags], #0, 23f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x21]\n"
+ "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x20]\n"
+ "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 24f\n"
+ "23:" // Height 3: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "24:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "25:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 26f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 27f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "b 27f\n"
+ "26:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "27:" // Height 3: input setup done
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 29f\n"
+ "28:" // Height 3: Multiply loop: Main loop
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x4\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x4\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add x24, x24, #0x4\n"
+ "udot z10.s, z21.b, z0.b\n"
+ "udot z14.s, z21.b, z1.b\n"
+ "udot z18.s, z21.b, z2.b\n"
+ "udot z11.s, z20.b, z0.b\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "udot z15.s, z20.b, z1.b\n"
+ "udot z19.s, z20.b, z2.b\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 28b\n"
+ "29:" // Height 3: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x28, x28, #0x1\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z21.b, z0.b\n"
+ "udot z14.s, z21.b, z1.b\n"
+ "udot z18.s, z21.b, z2.b\n"
+ "udot z11.s, z20.b, z0.b\n"
+ "udot z15.s, z20.b, z1.b\n"
+ "udot z19.s, z20.b, z2.b\n"
+ "bne 25b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x21]\n"
+ "st1w { z13.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x20]\n"
+ "st1w { z17.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x20, #3, MUL VL]\n"
+ "30:" // Height 3: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 22b\n"
+ "b 62f\n"
+ "31:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "32:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "tbz %x[flags], #0, 33f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x22]\n"
+ "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x21]\n"
+ "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x20]\n"
+ "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 34f\n"
+ "33:" // Height 4: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "34:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "35:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 37f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "b 37f\n"
+ "36:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "37:" // Height 4: input setup done
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 39f\n"
+ "38:" // Height 4: Multiply loop: Main loop
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x4\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x4\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "udot z21.s, z7.b, z3.b\n"
+ "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z25.b, z0.b\n"
+ "udot z14.s, z25.b, z1.b\n"
+ "udot z18.s, z25.b, z2.b\n"
+ "udot z22.s, z25.b, z3.b\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "udot z11.s, z24.b, z0.b\n"
+ "udot z15.s, z24.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "udot z19.s, z24.b, z2.b\n"
+ "udot z23.s, z24.b, z3.b\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 38b\n"
+ "39:" // Height 4: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x28, x28, #0x1\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "cmp x28, x20\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "udot z21.s, z7.b, z3.b\n"
+ "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z25.b, z0.b\n"
+ "udot z14.s, z25.b, z1.b\n"
+ "udot z18.s, z25.b, z2.b\n"
+ "udot z22.s, z25.b, z3.b\n"
+ "udot z11.s, z24.b, z0.b\n"
+ "udot z15.s, z24.b, z1.b\n"
+ "udot z19.s, z24.b, z2.b\n"
+ "udot z23.s, z24.b, z3.b\n"
+ "bne 35b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x22]\n"
+ "st1w { z13.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x21]\n"
+ "st1w { z17.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x20]\n"
+ "st1w { z21.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x20, #3, MUL VL]\n"
+ "40:" // Height 4: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 32b\n"
+ "b 62f\n"
+ "41:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "42:" // Height 5: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 44f\n"
+ "43:" // Height 5: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "44:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "45:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "b 47f\n"
+ "46:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "47:" // Height 5: input setup done
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 49f\n"
+ "48:" // Height 5: Multiply loop: Main loop
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x4\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "add x25, x25, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "udot z24.s, z6.b, z4.b\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x23, x23, #0x4\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "add x22, x22, #0x4\n"
+ "udot z21.s, z7.b, z3.b\n"
+ "udot z25.s, z7.b, z4.b\n"
+ "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z29.b, z0.b\n"
+ "udot z14.s, z29.b, z1.b\n"
+ "udot z18.s, z29.b, z2.b\n"
+ "udot z22.s, z29.b, z3.b\n"
+ "udot z26.s, z29.b, z4.b\n"
+ "udot z11.s, z28.b, z0.b\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "udot z15.s, z28.b, z1.b\n"
+ "udot z19.s, z28.b, z2.b\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "udot z23.s, z28.b, z3.b\n"
+ "udot z27.s, z28.b, z4.b\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 48b\n"
+ "49:" // Height 5: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x28, x28, #0x1\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "cmp x28, x20\n"
+ "udot z24.s, z6.b, z4.b\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "udot z21.s, z7.b, z3.b\n"
+ "udot z25.s, z7.b, z4.b\n"
+ "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z29.b, z0.b\n"
+ "udot z14.s, z29.b, z1.b\n"
+ "udot z18.s, z29.b, z2.b\n"
+ "udot z22.s, z29.b, z3.b\n"
+ "udot z26.s, z29.b, z4.b\n"
+ "udot z11.s, z28.b, z0.b\n"
+ "udot z15.s, z28.b, z1.b\n"
+ "udot z19.s, z28.b, z2.b\n"
+ "udot z23.s, z28.b, z3.b\n"
+ "udot z27.s, z28.b, z4.b\n"
+ "bne 45b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22]\n"
+ "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x21]\n"
+ "st1w { z21.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x20]\n"
+ "st1w { z25.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x20, #3, MUL VL]\n"
+ "50:" // Height 5: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 42b\n"
+ "b 62f\n"
+ "51:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "52:" // Height 6: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, x11\n"
+ "tbz %x[flags], #0, 53f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x24]\n"
+ "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x22]\n"
+ "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x21]\n"
+ "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p3/Z, [x20]\n"
+ "ld1w { z29.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 54f\n"
+ "53:" // Height 6: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "54:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "55:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 56f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 57f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
+ "b 57f\n"
+ "56:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
+ "57:" // Height 6: input setup done
+ "subs x27, x27, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1rw { z5.s }, p4/Z, [x21]\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "ble 59f\n"
+ "58:" // Height 6: Multiply loop: Main loop
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x26, x26, #0x4\n"
+ "subs x27, x27, #0x4\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "add x25, x25, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "udot z24.s, z6.b, z4.b\n"
+ "udot z28.s, z6.b, z5.b\n"
+ "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "add x23, x23, #0x4\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "add x22, x22, #0x4\n"
+ "add x21, x21, #0x4\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "udot z21.s, z7.b, z3.b\n"
+ "udot z25.s, z7.b, z4.b\n"
+ "udot z29.s, z7.b, z5.b\n"
+ "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "udot z18.s, z6.b, z2.b\n"
+ "udot z22.s, z6.b, z3.b\n"
+ "udot z26.s, z6.b, z4.b\n"
+ "udot z30.s, z6.b, z5.b\n"
+ "ld1b { z6.b }, p4/Z, [x10]\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "udot z15.s, z7.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x26]\n"
+ "ld1rw { z1.s }, p4/Z, [x25]\n"
+ "udot z19.s, z7.b, z2.b\n"
+ "udot z23.s, z7.b, z3.b\n"
+ "ld1rw { z2.s }, p4/Z, [x24]\n"
+ "ld1rw { z3.s }, p4/Z, [x23]\n"
+ "udot z27.s, z7.b, z4.b\n"
+ "udot z31.s, z7.b, z5.b\n"
+ "ld1rw { z4.s }, p4/Z, [x22]\n"
+ "ld1rw { z5.s }, p4/Z, [x21]\n"
+ "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
+ "bgt 58b\n"
+ "59:" // Height 6: Multiply loop: Main loop skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x28, x28, #0x1\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "cmp x28, x20\n"
+ "udot z24.s, z6.b, z4.b\n"
+ "udot z28.s, z6.b, z5.b\n"
+ "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "udot z21.s, z7.b, z3.b\n"
+ "udot z25.s, z7.b, z4.b\n"
+ "udot z29.s, z7.b, z5.b\n"
+ "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "udot z18.s, z6.b, z2.b\n"
+ "udot z22.s, z6.b, z3.b\n"
+ "udot z26.s, z6.b, z4.b\n"
+ "udot z30.s, z6.b, z5.b\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "udot z15.s, z7.b, z1.b\n"
+ "udot z19.s, z7.b, z2.b\n"
+ "udot z23.s, z7.b, z3.b\n"
+ "udot z27.s, z7.b, z4.b\n"
+ "udot z31.s, z7.b, z5.b\n"
+ "bne 55b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "st1w { z8.s }, p3, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p3, [x24]\n"
+ "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x23]\n"
+ "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x22]\n"
+ "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x21]\n"
+ "st1w { z25.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z28.s }, p3, [x20]\n"
+ "st1w { z29.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z30.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z31.s }, p0, [x20, #3, MUL VL]\n"
+ "60:" // Height 6: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 52b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 62f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "61:" // Update direct input
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "62:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
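
Unlike the generic path in the next file, which indexes quadword-broadcast bytes (udot z8.s, z16.b, z0.b[0]), the A64FX body above broadcasts one 32-bit word of A per row with ld1rw and issues plain udot against four vector-length columns of B. A scalar model of what one such udot step accumulates — illustrative only, udot_lane_model is a hypothetical name:

#include <cstdint>

// acc: one uint32_t accumulator per vector lane; b: 4 bytes of B per lane;
// a4: the four A bytes that ld1rw replicated into every 32-bit lane.
void udot_lane_model(uint32_t acc[], const uint8_t b[],
                     const uint8_t a4[4], int lanes) {
    for (int l = 0; l < lanes; ++l) {
        uint32_t sum = 0;
        for (int k = 0; k < 4; ++k) // four u8 products per 32-bit lane
            sum += uint32_t(b[4 * l + k]) * uint32_t(a4[k]);
        acc[l] += sum;              // udot accumulates into the lane
    }
}
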
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
index 97f6665d85..7871c0b003 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,18 +10,18 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "arm_gemm.hpp"
#include "../../utils.hpp"
@@ -87,23 +87,23 @@ void sve_hybrid_u8u32_dot_6x4VL (
"cmp %x[M], #0x2\n"
"bgt 23f\n"
"beq 12f\n"
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x28, %x[output_ptr]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"2:" // Height 1: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x10\n"
- "incw x19\n"
- "whilelt p3.s, x19, x10\n"
- "incw x19\n"
- "whilelt p2.s, x19, x10\n"
- "incw x19\n"
- "whilelt p1.s, x19, x10\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 3f\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
"b 4f\n"
"3:" // Height 1: no accumulate
"mov z8.s, #0x0\n"
@@ -111,151 +111,148 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"4:" // Height 1: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 6f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "cbnz x27, 7f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
"b 7f\n"
"6:" // Height 1: setup direct input
- "mov x25, %x[input_ptr]\n"
+ "mov x26, %x[input_ptr]\n"
"7:" // Height 1: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 9f\n"
"8:" // Height 1: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "cmp x26, #0x10\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1b { z16.b }, p5/Z, [x10]\n"
+ "udot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z10.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z11.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "udot z8.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "udot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "udot z10.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "udot z11.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[2]\n"
+ "udot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "udot z10.s, z17.b, z0.b[2]\n"
+ "udot z11.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[3]\n"
+ "udot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ "udot z10.s, z17.b, z0.b[3]\n"
+ "udot z11.s, z16.b, z0.b[3]\n"
+ "add x26, x26, #0x10\n"
"bgt 8b\n"
"9:" // Height 1: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1b { z16.b }, p5/Z, [x10]\n"
+ "udot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z17.b, z0.b[0]\n"
+ "udot z11.s, z16.b, z0.b[0]\n"
+ "addvl x10, x10, #4\n"
"ble 10f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "addvl x9, x9, #4\n"
- "udot z11.s, z7.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[1]\n"
+ "udot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z10.s, z17.b, z0.b[1]\n"
+ "udot z11.s, z16.b, z0.b[1]\n"
+ "addvl x10, x10, #4\n"
"ble 10f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "addvl x9, x9, #4\n"
- "udot z11.s, z7.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[2]\n"
+ "udot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z10.s, z17.b, z0.b[2]\n"
+ "udot z11.s, z16.b, z0.b[2]\n"
+ "addvl x10, x10, #4\n"
"ble 10f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[3]\n"
+ "udot z9.s, z16.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z17.b, z0.b[3]\n"
+ "udot z11.s, z16.b, z0.b[3]\n"
+ "addvl x10, x10, #4\n"
"10:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 5b\n"
- "st1w { z8.s }, p4, [x28]\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"11:" // Height 1: Writeback done
- "decw x10, ALL, MUL #4\n"
- "cmp x10, XZR\n"
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
"bgt 2b\n"
"b 68f\n"
"12:" // Height 2
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"13:" // Height 2: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x10\n"
- "incw x19\n"
- "whilelt p3.s, x19, x10\n"
- "incw x19\n"
- "whilelt p2.s, x19, x10\n"
- "incw x19\n"
- "whilelt p1.s, x19, x10\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 14f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x23]\n"
- "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 15f\n"
"14:" // Height 2: no accumulate
"mov z8.s, #0x0\n"
@@ -267,203 +264,197 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
"15:" // Height 2: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"16:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 17f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x27, 18f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 18f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
"b 18f\n"
"17:" // Height 2: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
"18:" // Height 2: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 20f\n"
"19:" // Height 2: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
"ld1rqb { z0.b }, p0/Z, [x25]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "sub x27, x27, #0x10\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z1.b[0]\n"
+ "udot z12.s, z17.b, z0.b[0]\n"
+ "udot z9.s, z16.b, z1.b[0]\n"
+ "udot z13.s, z16.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z17.b, z1.b[0]\n"
+ "udot z14.s, z17.b, z0.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "cmp x27, #0x10\n"
+ "udot z11.s, z16.b, z1.b[0]\n"
+ "udot z15.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "udot z8.s, z17.b, z1.b[1]\n"
+ "udot z12.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
"add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z9.s, z16.b, z1.b[1]\n"
+ "udot z13.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "udot z10.s, z17.b, z1.b[1]\n"
+ "udot z14.s, z17.b, z0.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "udot z11.s, z16.b, z1.b[1]\n"
+ "udot z15.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "udot z8.s, z17.b, z1.b[2]\n"
+ "udot z12.s, z17.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "udot z9.s, z16.b, z1.b[2]\n"
+ "udot z13.s, z16.b, z0.b[2]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "udot z10.s, z17.b, z1.b[2]\n"
+ "udot z14.s, z17.b, z0.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "udot z11.s, z16.b, z1.b[2]\n"
+ "udot z15.s, z16.b, z0.b[2]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "udot z8.s, z17.b, z1.b[3]\n"
+ "udot z12.s, z17.b, z0.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "udot z9.s, z16.b, z1.b[3]\n"
+ "udot z13.s, z16.b, z0.b[3]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "udot z10.s, z17.b, z1.b[3]\n"
+ "udot z14.s, z17.b, z0.b[3]\n"
+ "udot z11.s, z16.b, z1.b[3]\n"
+ "udot z15.s, z16.b, z0.b[3]\n"
"bgt 19b\n"
"20:" // Height 2: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[0]\n"
+ "udot z12.s, z17.b, z1.b[0]\n"
+ "udot z9.s, z16.b, z0.b[0]\n"
+ "udot z13.s, z16.b, z1.b[0]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z17.b, z0.b[0]\n"
+ "udot z14.s, z17.b, z1.b[0]\n"
+ "addvl x10, x10, #4\n"
+ "udot z11.s, z16.b, z0.b[0]\n"
+ "udot z15.s, z16.b, z1.b[0]\n"
"ble 21f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[1]\n"
+ "udot z12.s, z17.b, z1.b[1]\n"
+ "udot z9.s, z16.b, z0.b[1]\n"
+ "udot z13.s, z16.b, z1.b[1]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z10.s, z17.b, z0.b[1]\n"
+ "udot z14.s, z17.b, z1.b[1]\n"
+ "addvl x10, x10, #4\n"
+ "udot z11.s, z16.b, z0.b[1]\n"
+ "udot z15.s, z16.b, z1.b[1]\n"
"ble 21f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[2]\n"
+ "udot z12.s, z17.b, z1.b[2]\n"
+ "udot z9.s, z16.b, z0.b[2]\n"
+ "udot z13.s, z16.b, z1.b[2]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z10.s, z17.b, z0.b[2]\n"
+ "udot z14.s, z17.b, z1.b[2]\n"
+ "addvl x10, x10, #4\n"
+ "udot z11.s, z16.b, z0.b[2]\n"
+ "udot z15.s, z16.b, z1.b[2]\n"
"ble 21f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z17.b, z0.b[3]\n"
+ "udot z12.s, z17.b, z1.b[3]\n"
+ "udot z9.s, z16.b, z0.b[3]\n"
+ "udot z13.s, z16.b, z1.b[3]\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z17.b, z0.b[3]\n"
+ "udot z14.s, z17.b, z1.b[3]\n"
+ "addvl x10, x10, #4\n"
+ "udot z11.s, z16.b, z0.b[3]\n"
+ "udot z15.s, z16.b, z1.b[3]\n"
"21:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 16b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x23]\n"
- "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x20]\n"
+ "st1w { z13.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x20, #3, MUL VL]\n"
"22:" // Height 2: Writeback done
- "decw x10, ALL, MUL #4\n"
- "cmp x10, XZR\n"
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
"bgt 13b\n"
"b 68f\n"
"23:" // Height 3
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"24:" // Height 3: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x10\n"
- "incw x19\n"
- "whilelt p3.s, x19, x10\n"
- "incw x19\n"
- "whilelt p2.s, x19, x10\n"
- "incw x19\n"
- "whilelt p1.s, x19, x10\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 25f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x23]\n"
- "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x22]\n"
- "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x20]\n"
+ "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 26f\n"
"25:" // Height 3: no accumulate
"mov z8.s, #0x0\n"
@@ -479,254 +470,245 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"26:" // Height 3: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"27:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 28f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "cbnz x27, 29f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 29f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
"b 29f\n"
"28:" // Height 3: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
"29:" // Height 3: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 31f\n"
"30:" // Height 3: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x24]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "udot z8.s, z21.b, z2.b[0]\n"
+ "udot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z16.s, z21.b, z0.b[0]\n"
+ "udot z9.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[0]\n"
+ "udot z17.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "cmp x27, #0x10\n"
+ "udot z10.s, z21.b, z2.b[0]\n"
+ "udot z14.s, z21.b, z1.b[0]\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "udot z18.s, z21.b, z0.b[0]\n"
+ "udot z11.s, z20.b, z2.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #4, MUL VL]\n"
"add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x23, x23, #0x10\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z15.s, z20.b, z1.b[0]\n"
+ "udot z19.s, z20.b, z0.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "udot z8.s, z21.b, z2.b[1]\n"
+ "udot z12.s, z21.b, z1.b[1]\n"
+ "udot z16.s, z21.b, z0.b[1]\n"
+ "udot z9.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[1]\n"
+ "udot z17.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "udot z10.s, z21.b, z2.b[1]\n"
+ "udot z14.s, z21.b, z1.b[1]\n"
+ "udot z18.s, z21.b, z0.b[1]\n"
+ "udot z11.s, z20.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "udot z15.s, z20.b, z1.b[1]\n"
+ "udot z19.s, z20.b, z0.b[1]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "udot z8.s, z21.b, z2.b[2]\n"
+ "udot z12.s, z21.b, z1.b[2]\n"
+ "udot z16.s, z21.b, z0.b[2]\n"
+ "udot z9.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[2]\n"
+ "udot z17.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "udot z10.s, z21.b, z2.b[2]\n"
+ "udot z14.s, z21.b, z1.b[2]\n"
+ "udot z18.s, z21.b, z0.b[2]\n"
+ "udot z11.s, z20.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "udot z15.s, z20.b, z1.b[2]\n"
+ "udot z19.s, z20.b, z0.b[2]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "udot z8.s, z21.b, z2.b[3]\n"
+ "udot z12.s, z21.b, z1.b[3]\n"
+ "udot z16.s, z21.b, z0.b[3]\n"
+ "udot z9.s, z20.b, z2.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[3]\n"
+ "udot z17.s, z20.b, z0.b[3]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "udot z10.s, z21.b, z2.b[3]\n"
+ "udot z14.s, z21.b, z1.b[3]\n"
+ "udot z18.s, z21.b, z0.b[3]\n"
+ "udot z11.s, z20.b, z2.b[3]\n"
+ "udot z15.s, z20.b, z1.b[3]\n"
+ "udot z19.s, z20.b, z0.b[3]\n"
"bgt 30b\n"
"31:" // Height 3: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "add x23, x23, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "udot z8.s, z21.b, z0.b[0]\n"
+ "udot z12.s, z21.b, z1.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z16.s, z21.b, z2.b[0]\n"
+ "udot z9.s, z20.b, z0.b[0]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[0]\n"
+ "udot z17.s, z20.b, z2.b[0]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z21.b, z0.b[0]\n"
+ "udot z14.s, z21.b, z1.b[0]\n"
+ "udot z18.s, z21.b, z2.b[0]\n"
+ "udot z11.s, z20.b, z0.b[0]\n"
+ "udot z15.s, z20.b, z1.b[0]\n"
+ "udot z19.s, z20.b, z2.b[0]\n"
"ble 32f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z21.b, z0.b[1]\n"
+ "udot z12.s, z21.b, z1.b[1]\n"
+ "udot z16.s, z21.b, z2.b[1]\n"
+ "udot z9.s, z20.b, z0.b[1]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z13.s, z20.b, z1.b[1]\n"
+ "udot z17.s, z20.b, z2.b[1]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z21.b, z0.b[1]\n"
+ "udot z14.s, z21.b, z1.b[1]\n"
+ "udot z18.s, z21.b, z2.b[1]\n"
+ "udot z11.s, z20.b, z0.b[1]\n"
+ "udot z15.s, z20.b, z1.b[1]\n"
+ "udot z19.s, z20.b, z2.b[1]\n"
"ble 32f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z21.b, z0.b[2]\n"
+ "udot z12.s, z21.b, z1.b[2]\n"
+ "udot z16.s, z21.b, z2.b[2]\n"
+ "udot z9.s, z20.b, z0.b[2]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z13.s, z20.b, z1.b[2]\n"
+ "udot z17.s, z20.b, z2.b[2]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z21.b, z0.b[2]\n"
+ "udot z14.s, z21.b, z1.b[2]\n"
+ "udot z18.s, z21.b, z2.b[2]\n"
+ "udot z11.s, z20.b, z0.b[2]\n"
+ "udot z15.s, z20.b, z1.b[2]\n"
+ "udot z19.s, z20.b, z2.b[2]\n"
"ble 32f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z21.b, z0.b[3]\n"
+ "udot z12.s, z21.b, z1.b[3]\n"
+ "udot z16.s, z21.b, z2.b[3]\n"
+ "udot z9.s, z20.b, z0.b[3]\n"
+ "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z20.b, z1.b[3]\n"
+ "udot z17.s, z20.b, z2.b[3]\n"
+ "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z21.b, z0.b[3]\n"
+ "udot z14.s, z21.b, z1.b[3]\n"
+ "udot z18.s, z21.b, z2.b[3]\n"
+ "udot z11.s, z20.b, z0.b[3]\n"
+ "udot z15.s, z20.b, z1.b[3]\n"
+ "udot z19.s, z20.b, z2.b[3]\n"
"32:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 27b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z12.s }, p4, [x23]\n"
- "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x22]\n"
- "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x21]\n"
+ "st1w { z13.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x20]\n"
+ "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
"33:" // Height 3: Writeback done
- "decw x10, ALL, MUL #4\n"
- "cmp x10, XZR\n"
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
"bgt 24b\n"
"b 68f\n"
"34:" // Height 4
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"35:" // Height 4: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x10\n"
- "incw x19\n"
- "whilelt p3.s, x19, x10\n"
- "incw x19\n"
- "whilelt p2.s, x19, x10\n"
- "incw x19\n"
- "whilelt p1.s, x19, x10\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 36f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "add x21, x22, x19, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x23]\n"
- "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x22]\n"
- "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x21]\n"
- "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x21]\n"
+ "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 37f\n"
"36:" // Height 4: no accumulate
"mov z8.s, #0x0\n"
@@ -746,290 +728,278 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
"37:" // Height 4: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"38:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 39f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "cbnz x27, 40f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 40f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
"b 40f\n"
"39:" // Height 4: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
"40:" // Height 4: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 42f\n"
"41:" // Height 4: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z3.b }, p0/Z, [x26]\n"
+ "ld1rqb { z2.b }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x10\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z25.b, z3.b[0]\n"
+ "udot z12.s, z25.b, z2.b[0]\n"
+ "udot z16.s, z25.b, z1.b[0]\n"
+ "udot z20.s, z25.b, z0.b[0]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
"add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "udot z9.s, z24.b, z3.b[0]\n"
+ "udot z13.s, z24.b, z2.b[0]\n"
"add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
"add x23, x23, #0x10\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
- "udot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "udot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z22.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "udot z23.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "udot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z22.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "udot z23.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "udot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z22.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "udot z23.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "udot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z22.s, z6.b, z3.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
- "udot z23.s, z7.b, z3.b[3]\n"
+ "udot z17.s, z24.b, z1.b[0]\n"
+ "udot z21.s, z24.b, z0.b[0]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z25.b, z3.b[0]\n"
+ "udot z14.s, z25.b, z2.b[0]\n"
+ "udot z18.s, z25.b, z1.b[0]\n"
+ "udot z22.s, z25.b, z0.b[0]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "udot z11.s, z24.b, z3.b[0]\n"
+ "udot z15.s, z24.b, z2.b[0]\n"
+ "udot z19.s, z24.b, z1.b[0]\n"
+ "udot z23.s, z24.b, z0.b[0]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "udot z8.s, z25.b, z3.b[1]\n"
+ "udot z12.s, z25.b, z2.b[1]\n"
+ "udot z16.s, z25.b, z1.b[1]\n"
+ "udot z20.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "udot z9.s, z24.b, z3.b[1]\n"
+ "udot z13.s, z24.b, z2.b[1]\n"
+ "udot z17.s, z24.b, z1.b[1]\n"
+ "udot z21.s, z24.b, z0.b[1]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "udot z10.s, z25.b, z3.b[1]\n"
+ "udot z14.s, z25.b, z2.b[1]\n"
+ "udot z18.s, z25.b, z1.b[1]\n"
+ "udot z22.s, z25.b, z0.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "udot z11.s, z24.b, z3.b[1]\n"
+ "udot z15.s, z24.b, z2.b[1]\n"
+ "udot z19.s, z24.b, z1.b[1]\n"
+ "udot z23.s, z24.b, z0.b[1]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "udot z8.s, z25.b, z3.b[2]\n"
+ "udot z12.s, z25.b, z2.b[2]\n"
+ "udot z16.s, z25.b, z1.b[2]\n"
+ "udot z20.s, z25.b, z0.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "udot z9.s, z24.b, z3.b[2]\n"
+ "udot z13.s, z24.b, z2.b[2]\n"
+ "udot z17.s, z24.b, z1.b[2]\n"
+ "udot z21.s, z24.b, z0.b[2]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "udot z10.s, z25.b, z3.b[2]\n"
+ "udot z14.s, z25.b, z2.b[2]\n"
+ "udot z18.s, z25.b, z1.b[2]\n"
+ "udot z22.s, z25.b, z0.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "udot z11.s, z24.b, z3.b[2]\n"
+ "udot z15.s, z24.b, z2.b[2]\n"
+ "udot z19.s, z24.b, z1.b[2]\n"
+ "udot z23.s, z24.b, z0.b[2]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "udot z8.s, z25.b, z3.b[3]\n"
+ "udot z12.s, z25.b, z2.b[3]\n"
+ "udot z16.s, z25.b, z1.b[3]\n"
+ "udot z20.s, z25.b, z0.b[3]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "udot z9.s, z24.b, z3.b[3]\n"
+ "udot z13.s, z24.b, z2.b[3]\n"
+ "udot z17.s, z24.b, z1.b[3]\n"
+ "udot z21.s, z24.b, z0.b[3]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "udot z10.s, z25.b, z3.b[3]\n"
+ "udot z14.s, z25.b, z2.b[3]\n"
+ "udot z18.s, z25.b, z1.b[3]\n"
+ "udot z22.s, z25.b, z0.b[3]\n"
+ "udot z11.s, z24.b, z3.b[3]\n"
+ "udot z15.s, z24.b, z2.b[3]\n"
+ "udot z19.s, z24.b, z1.b[3]\n"
+ "udot z23.s, z24.b, z0.b[3]\n"
"bgt 41b\n"
"42:" // Height 4: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "add x22, x22, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "udot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z22.s, z6.b, z3.b[0]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "udot z23.s, z7.b, z3.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z25.b, z0.b[0]\n"
+ "udot z12.s, z25.b, z1.b[0]\n"
+ "udot z16.s, z25.b, z2.b[0]\n"
+ "udot z20.s, z25.b, z3.b[0]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z24.b, z0.b[0]\n"
+ "udot z13.s, z24.b, z1.b[0]\n"
+ "udot z17.s, z24.b, z2.b[0]\n"
+ "udot z21.s, z24.b, z3.b[0]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z25.b, z0.b[0]\n"
+ "udot z14.s, z25.b, z1.b[0]\n"
+ "udot z18.s, z25.b, z2.b[0]\n"
+ "udot z22.s, z25.b, z3.b[0]\n"
+ "udot z11.s, z24.b, z0.b[0]\n"
+ "udot z15.s, z24.b, z1.b[0]\n"
+ "udot z19.s, z24.b, z2.b[0]\n"
+ "udot z23.s, z24.b, z3.b[0]\n"
"ble 43f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "udot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z22.s, z6.b, z3.b[1]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "udot z23.s, z7.b, z3.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z25.b, z0.b[1]\n"
+ "udot z12.s, z25.b, z1.b[1]\n"
+ "udot z16.s, z25.b, z2.b[1]\n"
+ "udot z20.s, z25.b, z3.b[1]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z9.s, z24.b, z0.b[1]\n"
+ "udot z13.s, z24.b, z1.b[1]\n"
+ "udot z17.s, z24.b, z2.b[1]\n"
+ "udot z21.s, z24.b, z3.b[1]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z25.b, z0.b[1]\n"
+ "udot z14.s, z25.b, z1.b[1]\n"
+ "udot z18.s, z25.b, z2.b[1]\n"
+ "udot z22.s, z25.b, z3.b[1]\n"
+ "udot z11.s, z24.b, z0.b[1]\n"
+ "udot z15.s, z24.b, z1.b[1]\n"
+ "udot z19.s, z24.b, z2.b[1]\n"
+ "udot z23.s, z24.b, z3.b[1]\n"
"ble 43f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "udot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z22.s, z6.b, z3.b[2]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "udot z23.s, z7.b, z3.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z25.b, z0.b[2]\n"
+ "udot z12.s, z25.b, z1.b[2]\n"
+ "udot z16.s, z25.b, z2.b[2]\n"
+ "udot z20.s, z25.b, z3.b[2]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z9.s, z24.b, z0.b[2]\n"
+ "udot z13.s, z24.b, z1.b[2]\n"
+ "udot z17.s, z24.b, z2.b[2]\n"
+ "udot z21.s, z24.b, z3.b[2]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z25.b, z0.b[2]\n"
+ "udot z14.s, z25.b, z1.b[2]\n"
+ "udot z18.s, z25.b, z2.b[2]\n"
+ "udot z22.s, z25.b, z3.b[2]\n"
+ "udot z11.s, z24.b, z0.b[2]\n"
+ "udot z15.s, z24.b, z1.b[2]\n"
+ "udot z19.s, z24.b, z2.b[2]\n"
+ "udot z23.s, z24.b, z3.b[2]\n"
"ble 43f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "udot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z22.s, z6.b, z3.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
- "udot z23.s, z7.b, z3.b[3]\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z25.b, z0.b[3]\n"
+ "udot z12.s, z25.b, z1.b[3]\n"
+ "udot z16.s, z25.b, z2.b[3]\n"
+ "udot z20.s, z25.b, z3.b[3]\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z24.b, z0.b[3]\n"
+ "udot z13.s, z24.b, z1.b[3]\n"
+ "udot z17.s, z24.b, z2.b[3]\n"
+ "udot z21.s, z24.b, z3.b[3]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z25.b, z0.b[3]\n"
+ "udot z14.s, z25.b, z1.b[3]\n"
+ "udot z18.s, z25.b, z2.b[3]\n"
+ "udot z22.s, z25.b, z3.b[3]\n"
+ "udot z11.s, z24.b, z0.b[3]\n"
+ "udot z15.s, z24.b, z1.b[3]\n"
+ "udot z19.s, z24.b, z2.b[3]\n"
+ "udot z23.s, z24.b, z3.b[3]\n"
"43:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 38b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "add x21, x22, x19, LSL #2\n"
- "st1w { z12.s }, p4, [x23]\n"
- "addvl x28, x28, #4\n"
- "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x22]\n"
- "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x21]\n"
- "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x22]\n"
+ "st1w { z13.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x20]\n"
+ "st1w { z21.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x20, #3, MUL VL]\n"
"44:" // Height 4: Writeback done
- "decw x10, ALL, MUL #4\n"
- "cmp x10, XZR\n"
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
"bgt 35b\n"
"b 68f\n"
"45:" // Height 5
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
"46:" // Height 5: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x10\n"
- "incw x19\n"
- "whilelt p3.s, x19, x10\n"
- "incw x19\n"
- "whilelt p2.s, x19, x10\n"
- "incw x19\n"
- "whilelt p1.s, x19, x10\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 47f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "add x21, x22, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
"ld1w { z12.s }, p4/Z, [x23]\n"
- "add x20, x21, x19, LSL #2\n"
"ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
"ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
"ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
@@ -1068,293 +1038,278 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
"48:" // Height 5: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"49:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 50f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "cbnz x27, 51f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 51f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
"b 51f\n"
"50:" // Height 5: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
"51:" // Height 5: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 53f\n"
"52:" // Height 5: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z4.b }, p0/Z, [x26]\n"
+ "ld1rqb { z3.b }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x22]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "udot z8.s, z29.b, z4.b[0]\n"
+ "udot z12.s, z29.b, z3.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z16.s, z29.b, z2.b[0]\n"
+ "udot z20.s, z29.b, z1.b[0]\n"
"add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "udot z24.s, z29.b, z0.b[0]\n"
+ "udot z9.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
"add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "udot z13.s, z28.b, z3.b[0]\n"
+ "udot z17.s, z28.b, z2.b[0]\n"
"add x23, x23, #0x10\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x21, x21, #0x10\n"
- "udot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
- "udot z24.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "udot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "udot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z22.s, z6.b, z3.b[0]\n"
- "udot z26.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "udot z23.s, z7.b, z3.b[0]\n"
- "udot z27.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z20.s, z6.b, z3.b[1]\n"
- "udot z24.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "udot z21.s, z7.b, z3.b[1]\n"
- "udot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z22.s, z6.b, z3.b[1]\n"
- "udot z26.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "udot z23.s, z7.b, z3.b[1]\n"
- "udot z27.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z20.s, z6.b, z3.b[2]\n"
- "udot z24.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "udot z21.s, z7.b, z3.b[2]\n"
- "udot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z22.s, z6.b, z3.b[2]\n"
- "udot z26.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "udot z23.s, z7.b, z3.b[2]\n"
- "udot z27.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z20.s, z6.b, z3.b[3]\n"
- "udot z24.s, z6.b, z4.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "udot z21.s, z7.b, z3.b[3]\n"
- "udot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z22.s, z6.b, z3.b[3]\n"
- "udot z26.s, z6.b, z4.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
- "udot z23.s, z7.b, z3.b[3]\n"
- "udot z27.s, z7.b, z4.b[3]\n"
+ "udot z21.s, z28.b, z1.b[0]\n"
+ "udot z25.s, z28.b, z0.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z29.b, z4.b[0]\n"
+ "udot z14.s, z29.b, z3.b[0]\n"
+ "udot z18.s, z29.b, z2.b[0]\n"
+ "udot z22.s, z29.b, z1.b[0]\n"
+ "udot z26.s, z29.b, z0.b[0]\n"
+ "udot z11.s, z28.b, z4.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "udot z15.s, z28.b, z3.b[0]\n"
+ "udot z19.s, z28.b, z2.b[0]\n"
+ "udot z23.s, z28.b, z1.b[0]\n"
+ "udot z27.s, z28.b, z0.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "udot z8.s, z29.b, z4.b[1]\n"
+ "udot z12.s, z29.b, z3.b[1]\n"
+ "udot z16.s, z29.b, z2.b[1]\n"
+ "udot z20.s, z29.b, z1.b[1]\n"
+ "udot z24.s, z29.b, z0.b[1]\n"
+ "udot z9.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "udot z13.s, z28.b, z3.b[1]\n"
+ "udot z17.s, z28.b, z2.b[1]\n"
+ "udot z21.s, z28.b, z1.b[1]\n"
+ "udot z25.s, z28.b, z0.b[1]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "udot z10.s, z29.b, z4.b[1]\n"
+ "udot z14.s, z29.b, z3.b[1]\n"
+ "udot z18.s, z29.b, z2.b[1]\n"
+ "udot z22.s, z29.b, z1.b[1]\n"
+ "udot z26.s, z29.b, z0.b[1]\n"
+ "udot z11.s, z28.b, z4.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "udot z15.s, z28.b, z3.b[1]\n"
+ "udot z19.s, z28.b, z2.b[1]\n"
+ "udot z23.s, z28.b, z1.b[1]\n"
+ "udot z27.s, z28.b, z0.b[1]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "udot z8.s, z29.b, z4.b[2]\n"
+ "udot z12.s, z29.b, z3.b[2]\n"
+ "udot z16.s, z29.b, z2.b[2]\n"
+ "udot z20.s, z29.b, z1.b[2]\n"
+ "udot z24.s, z29.b, z0.b[2]\n"
+ "udot z9.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "udot z13.s, z28.b, z3.b[2]\n"
+ "udot z17.s, z28.b, z2.b[2]\n"
+ "udot z21.s, z28.b, z1.b[2]\n"
+ "udot z25.s, z28.b, z0.b[2]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "udot z10.s, z29.b, z4.b[2]\n"
+ "udot z14.s, z29.b, z3.b[2]\n"
+ "udot z18.s, z29.b, z2.b[2]\n"
+ "udot z22.s, z29.b, z1.b[2]\n"
+ "udot z26.s, z29.b, z0.b[2]\n"
+ "udot z11.s, z28.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "udot z15.s, z28.b, z3.b[2]\n"
+ "udot z19.s, z28.b, z2.b[2]\n"
+ "udot z23.s, z28.b, z1.b[2]\n"
+ "udot z27.s, z28.b, z0.b[2]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "udot z8.s, z29.b, z4.b[3]\n"
+ "udot z12.s, z29.b, z3.b[3]\n"
+ "udot z16.s, z29.b, z2.b[3]\n"
+ "udot z20.s, z29.b, z1.b[3]\n"
+ "udot z24.s, z29.b, z0.b[3]\n"
+ "udot z9.s, z28.b, z4.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "udot z13.s, z28.b, z3.b[3]\n"
+ "udot z17.s, z28.b, z2.b[3]\n"
+ "udot z21.s, z28.b, z1.b[3]\n"
+ "udot z25.s, z28.b, z0.b[3]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "udot z10.s, z29.b, z4.b[3]\n"
+ "udot z14.s, z29.b, z3.b[3]\n"
+ "udot z18.s, z29.b, z2.b[3]\n"
+ "udot z22.s, z29.b, z1.b[3]\n"
+ "udot z26.s, z29.b, z0.b[3]\n"
+ "udot z11.s, z28.b, z4.b[3]\n"
+ "udot z15.s, z28.b, z3.b[3]\n"
+ "udot z19.s, z28.b, z2.b[3]\n"
+ "udot z23.s, z28.b, z1.b[3]\n"
+ "udot z27.s, z28.b, z0.b[3]\n"
"bgt 52b\n"
"53:" // Height 5: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "add x21, x21, #0x10\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "udot z20.s, z6.b, z3.b[0]\n"
- "udot z24.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z21.s, z7.b, z3.b[0]\n"
- "udot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z22.s, z6.b, z3.b[0]\n"
- "udot z26.s, z6.b, z4.b[0]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "udot z23.s, z7.b, z3.b[0]\n"
- "udot z27.s, z7.b, z4.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "udot z8.s, z29.b, z0.b[0]\n"
+ "udot z12.s, z29.b, z1.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z16.s, z29.b, z2.b[0]\n"
+ "udot z20.s, z29.b, z3.b[0]\n"
+ "udot z24.s, z29.b, z4.b[0]\n"
+ "udot z9.s, z28.b, z0.b[0]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z28.b, z1.b[0]\n"
+ "udot z17.s, z28.b, z2.b[0]\n"
+ "udot z21.s, z28.b, z3.b[0]\n"
+ "udot z25.s, z28.b, z4.b[0]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z29.b, z0.b[0]\n"
+ "udot z14.s, z29.b, z1.b[0]\n"
+ "udot z18.s, z29.b, z2.b[0]\n"
+ "udot z22.s, z29.b, z3.b[0]\n"
+ "udot z26.s, z29.b, z4.b[0]\n"
+ "udot z11.s, z28.b, z0.b[0]\n"
+ "udot z15.s, z28.b, z1.b[0]\n"
+ "udot z19.s, z28.b, z2.b[0]\n"
+ "udot z23.s, z28.b, z3.b[0]\n"
+ "udot z27.s, z28.b, z4.b[0]\n"
"ble 54f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z20.s, z6.b, z3.b[1]\n"
- "udot z24.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "udot z21.s, z7.b, z3.b[1]\n"
- "udot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z22.s, z6.b, z3.b[1]\n"
- "udot z26.s, z6.b, z4.b[1]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "udot z23.s, z7.b, z3.b[1]\n"
- "udot z27.s, z7.b, z4.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z29.b, z0.b[1]\n"
+ "udot z12.s, z29.b, z1.b[1]\n"
+ "udot z16.s, z29.b, z2.b[1]\n"
+ "udot z20.s, z29.b, z3.b[1]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z24.s, z29.b, z4.b[1]\n"
+ "udot z9.s, z28.b, z0.b[1]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z28.b, z1.b[1]\n"
+ "udot z17.s, z28.b, z2.b[1]\n"
+ "udot z21.s, z28.b, z3.b[1]\n"
+ "udot z25.s, z28.b, z4.b[1]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z29.b, z0.b[1]\n"
+ "udot z14.s, z29.b, z1.b[1]\n"
+ "udot z18.s, z29.b, z2.b[1]\n"
+ "udot z22.s, z29.b, z3.b[1]\n"
+ "udot z26.s, z29.b, z4.b[1]\n"
+ "udot z11.s, z28.b, z0.b[1]\n"
+ "udot z15.s, z28.b, z1.b[1]\n"
+ "udot z19.s, z28.b, z2.b[1]\n"
+ "udot z23.s, z28.b, z3.b[1]\n"
+ "udot z27.s, z28.b, z4.b[1]\n"
"ble 54f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z20.s, z6.b, z3.b[2]\n"
- "udot z24.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "udot z21.s, z7.b, z3.b[2]\n"
- "udot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z22.s, z6.b, z3.b[2]\n"
- "udot z26.s, z6.b, z4.b[2]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "udot z23.s, z7.b, z3.b[2]\n"
- "udot z27.s, z7.b, z4.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z29.b, z0.b[2]\n"
+ "udot z12.s, z29.b, z1.b[2]\n"
+ "udot z16.s, z29.b, z2.b[2]\n"
+ "udot z20.s, z29.b, z3.b[2]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z24.s, z29.b, z4.b[2]\n"
+ "udot z9.s, z28.b, z0.b[2]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z28.b, z1.b[2]\n"
+ "udot z17.s, z28.b, z2.b[2]\n"
+ "udot z21.s, z28.b, z3.b[2]\n"
+ "udot z25.s, z28.b, z4.b[2]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z29.b, z0.b[2]\n"
+ "udot z14.s, z29.b, z1.b[2]\n"
+ "udot z18.s, z29.b, z2.b[2]\n"
+ "udot z22.s, z29.b, z3.b[2]\n"
+ "udot z26.s, z29.b, z4.b[2]\n"
+ "udot z11.s, z28.b, z0.b[2]\n"
+ "udot z15.s, z28.b, z1.b[2]\n"
+ "udot z19.s, z28.b, z2.b[2]\n"
+ "udot z23.s, z28.b, z3.b[2]\n"
+ "udot z27.s, z28.b, z4.b[2]\n"
"ble 54f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z20.s, z6.b, z3.b[3]\n"
- "udot z24.s, z6.b, z4.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "udot z21.s, z7.b, z3.b[3]\n"
- "udot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z22.s, z6.b, z3.b[3]\n"
- "udot z26.s, z6.b, z4.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
- "udot z23.s, z7.b, z3.b[3]\n"
- "udot z27.s, z7.b, z4.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z29.b, z0.b[3]\n"
+ "udot z12.s, z29.b, z1.b[3]\n"
+ "udot z16.s, z29.b, z2.b[3]\n"
+ "udot z20.s, z29.b, z3.b[3]\n"
+ "udot z24.s, z29.b, z4.b[3]\n"
+ "udot z9.s, z28.b, z0.b[3]\n"
+ "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z13.s, z28.b, z1.b[3]\n"
+ "udot z17.s, z28.b, z2.b[3]\n"
+ "udot z21.s, z28.b, z3.b[3]\n"
+ "udot z25.s, z28.b, z4.b[3]\n"
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z29.b, z0.b[3]\n"
+ "udot z14.s, z29.b, z1.b[3]\n"
+ "udot z18.s, z29.b, z2.b[3]\n"
+ "udot z22.s, z29.b, z3.b[3]\n"
+ "udot z26.s, z29.b, z4.b[3]\n"
+ "udot z11.s, z28.b, z0.b[3]\n"
+ "udot z15.s, z28.b, z1.b[3]\n"
+ "udot z19.s, z28.b, z2.b[3]\n"
+ "udot z23.s, z28.b, z3.b[3]\n"
+ "udot z27.s, z28.b, z4.b[3]\n"
"54:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 49b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "add x21, x22, x19, LSL #2\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"st1w { z12.s }, p4, [x23]\n"
- "add x20, x21, x19, LSL #2\n"
"st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
- "addvl x28, x28, #4\n"
"st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
"st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
"st1w { z16.s }, p4, [x22]\n"
@@ -1370,57 +1325,57 @@ void sve_hybrid_u8u32_dot_6x4VL (
"st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
"st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
"55:" // Height 5: Writeback done
- "decw x10, ALL, MUL #4\n"
- "cmp x10, XZR\n"
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
"bgt 46b\n"
"b 68f\n"
"56:" // Height 6
- "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x28, %x[output_ptr]\n"
- "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
"mov x20, #0x18\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
"57:" // Height 6: Column loop
- "mov x19, #0x0\n"
- "whilelt p4.s, x19, x10\n"
- "incw x19\n"
- "whilelt p3.s, x19, x10\n"
- "incw x19\n"
- "whilelt p2.s, x19, x10\n"
- "incw x19\n"
- "whilelt p1.s, x19, x10\n"
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
"tbz %x[flags], #0, 58f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "ld1w { z8.s }, p4/Z, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
- "add x21, x22, x19, LSL #2\n"
- "ld1w { z12.s }, p4/Z, [x23]\n"
- "add x20, x21, x19, LSL #2\n"
- "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
- "add x19, x20, x19, LSL #2\n"
- "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x22]\n"
- "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x21]\n"
- "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x20]\n"
- "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x19]\n"
- "ld1w { z29.s }, p3/Z, [x19, #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x19, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x19, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z8.s }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x21]\n"
+ "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
"b 59f\n"
"58:" // Height 6: no accumulate
"mov z8.s, #0x0\n"
@@ -1448,375 +1403,356 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov z30.s, #0x0\n"
"mov z31.s, #0x0\n"
"59:" // Height 6: setup done
- "mov x27, #0x0\n"
+ "mov x28, #0x0\n"
"60:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w26, [x20, x27, LSL #0x2]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 61f\n"
- "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
- "add x20, x20, x19, LSL #3\n"
- "ldr x25, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x23, [x20, #0x10]\n"
- "ldr x22, [x20, #0x18]\n"
- "ldr x21, [x20, #0x20]\n"
- "ldr x20, [x20, #0x28]\n"
- "cbnz x27, 62f\n"
- "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x21, x21, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 62f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
"b 62f\n"
"61:" // Height 6: setup direct input
- "mov x25, %x[input_ptr]\n"
- "add x24, x25, x19\n"
- "add x23, x24, x19\n"
- "add x22, x23, x19\n"
- "add x21, x22, x19\n"
- "add x20, x21, x19\n"
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
"62:" // Height 6: input setup done
- "cmp x26, #0x10\n"
+ "cmp x27, #0x10\n"
"ble 64f\n"
"63:" // Height 6: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "sub x26, x26, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z6.b }, p0/Z, [x25]\n"
+ "sub x27, x27, #0x10\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z4.b }, p0/Z, [x23]\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z1.b, z7.b[0]\n"
+ "udot z12.s, z1.b, z6.b[0]\n"
+ "udot z16.s, z1.b, z5.b[0]\n"
+ "udot z20.s, z1.b, z4.b[0]\n"
"add x23, x23, #0x10\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "udot z24.s, z1.b, z3.b[0]\n"
+ "udot z28.s, z1.b, z2.b[0]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
"add x21, x21, #0x10\n"
- "udot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x20, x20, #0x10\n"
- "udot z24.s, z6.b, z4.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
- "udot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "udot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "udot z25.s, z7.b, z4.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "udot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z22.s, z6.b, z3.b[0]\n"
- "udot z26.s, z6.b, z4.b[0]\n"
- "udot z30.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "udot z23.s, z7.b, z3.b[0]\n"
- "udot z27.s, z7.b, z4.b[0]\n"
- "udot z31.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z20.s, z6.b, z3.b[1]\n"
- "udot z24.s, z6.b, z4.b[1]\n"
- "udot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "udot z21.s, z7.b, z3.b[1]\n"
- "udot z25.s, z7.b, z4.b[1]\n"
- "udot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z22.s, z6.b, z3.b[1]\n"
- "udot z26.s, z6.b, z4.b[1]\n"
- "udot z30.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "udot z23.s, z7.b, z3.b[1]\n"
- "udot z27.s, z7.b, z4.b[1]\n"
- "udot z31.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z20.s, z6.b, z3.b[2]\n"
- "udot z24.s, z6.b, z4.b[2]\n"
- "udot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "udot z21.s, z7.b, z3.b[2]\n"
- "udot z25.s, z7.b, z4.b[2]\n"
- "udot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z22.s, z6.b, z3.b[2]\n"
- "udot z26.s, z6.b, z4.b[2]\n"
- "udot z30.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "udot z23.s, z7.b, z3.b[2]\n"
- "udot z27.s, z7.b, z4.b[2]\n"
- "udot z31.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z20.s, z6.b, z3.b[3]\n"
- "udot z24.s, z6.b, z4.b[3]\n"
- "udot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "udot z21.s, z7.b, z3.b[3]\n"
- "udot z25.s, z7.b, z4.b[3]\n"
- "udot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z22.s, z6.b, z3.b[3]\n"
- "udot z26.s, z6.b, z4.b[3]\n"
- "udot z30.s, z6.b, z5.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
- "udot z23.s, z7.b, z3.b[3]\n"
- "udot z27.s, z7.b, z4.b[3]\n"
- "udot z31.s, z7.b, z5.b[3]\n"
+ "udot z9.s, z0.b, z7.b[0]\n"
+ "udot z13.s, z0.b, z6.b[0]\n"
+ "udot z17.s, z0.b, z5.b[0]\n"
+ "udot z21.s, z0.b, z4.b[0]\n"
+ "udot z25.s, z0.b, z3.b[0]\n"
+ "udot z29.s, z0.b, z2.b[0]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "udot z10.s, z1.b, z7.b[0]\n"
+ "udot z14.s, z1.b, z6.b[0]\n"
+ "udot z18.s, z1.b, z5.b[0]\n"
+ "udot z22.s, z1.b, z4.b[0]\n"
+ "udot z26.s, z1.b, z3.b[0]\n"
+ "udot z30.s, z1.b, z2.b[0]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "udot z11.s, z0.b, z7.b[0]\n"
+ "udot z15.s, z0.b, z6.b[0]\n"
+ "udot z19.s, z0.b, z5.b[0]\n"
+ "udot z23.s, z0.b, z4.b[0]\n"
+ "udot z27.s, z0.b, z3.b[0]\n"
+ "udot z31.s, z0.b, z2.b[0]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "udot z8.s, z1.b, z7.b[1]\n"
+ "udot z12.s, z1.b, z6.b[1]\n"
+ "udot z16.s, z1.b, z5.b[1]\n"
+ "udot z20.s, z1.b, z4.b[1]\n"
+ "udot z24.s, z1.b, z3.b[1]\n"
+ "udot z28.s, z1.b, z2.b[1]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "udot z9.s, z0.b, z7.b[1]\n"
+ "udot z13.s, z0.b, z6.b[1]\n"
+ "udot z17.s, z0.b, z5.b[1]\n"
+ "udot z21.s, z0.b, z4.b[1]\n"
+ "udot z25.s, z0.b, z3.b[1]\n"
+ "udot z29.s, z0.b, z2.b[1]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "udot z10.s, z1.b, z7.b[1]\n"
+ "udot z14.s, z1.b, z6.b[1]\n"
+ "udot z18.s, z1.b, z5.b[1]\n"
+ "udot z22.s, z1.b, z4.b[1]\n"
+ "udot z26.s, z1.b, z3.b[1]\n"
+ "udot z30.s, z1.b, z2.b[1]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "udot z11.s, z0.b, z7.b[1]\n"
+ "udot z15.s, z0.b, z6.b[1]\n"
+ "udot z19.s, z0.b, z5.b[1]\n"
+ "udot z23.s, z0.b, z4.b[1]\n"
+ "udot z27.s, z0.b, z3.b[1]\n"
+ "udot z31.s, z0.b, z2.b[1]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "udot z8.s, z1.b, z7.b[2]\n"
+ "udot z12.s, z1.b, z6.b[2]\n"
+ "udot z16.s, z1.b, z5.b[2]\n"
+ "udot z20.s, z1.b, z4.b[2]\n"
+ "udot z24.s, z1.b, z3.b[2]\n"
+ "udot z28.s, z1.b, z2.b[2]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "udot z9.s, z0.b, z7.b[2]\n"
+ "udot z13.s, z0.b, z6.b[2]\n"
+ "udot z17.s, z0.b, z5.b[2]\n"
+ "udot z21.s, z0.b, z4.b[2]\n"
+ "udot z25.s, z0.b, z3.b[2]\n"
+ "udot z29.s, z0.b, z2.b[2]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "udot z10.s, z1.b, z7.b[2]\n"
+ "udot z14.s, z1.b, z6.b[2]\n"
+ "udot z18.s, z1.b, z5.b[2]\n"
+ "udot z22.s, z1.b, z4.b[2]\n"
+ "udot z26.s, z1.b, z3.b[2]\n"
+ "udot z30.s, z1.b, z2.b[2]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "udot z11.s, z0.b, z7.b[2]\n"
+ "udot z15.s, z0.b, z6.b[2]\n"
+ "udot z19.s, z0.b, z5.b[2]\n"
+ "udot z23.s, z0.b, z4.b[2]\n"
+ "udot z27.s, z0.b, z3.b[2]\n"
+ "udot z31.s, z0.b, z2.b[2]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ "udot z8.s, z1.b, z7.b[3]\n"
+ "udot z12.s, z1.b, z6.b[3]\n"
+ "udot z16.s, z1.b, z5.b[3]\n"
+ "udot z20.s, z1.b, z4.b[3]\n"
+ "udot z24.s, z1.b, z3.b[3]\n"
+ "udot z28.s, z1.b, z2.b[3]\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "udot z9.s, z0.b, z7.b[3]\n"
+ "udot z13.s, z0.b, z6.b[3]\n"
+ "udot z17.s, z0.b, z5.b[3]\n"
+ "udot z21.s, z0.b, z4.b[3]\n"
+ "udot z25.s, z0.b, z3.b[3]\n"
+ "udot z29.s, z0.b, z2.b[3]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "udot z10.s, z1.b, z7.b[3]\n"
+ "udot z14.s, z1.b, z6.b[3]\n"
+ "udot z18.s, z1.b, z5.b[3]\n"
+ "udot z22.s, z1.b, z4.b[3]\n"
+ "udot z26.s, z1.b, z3.b[3]\n"
+ "udot z30.s, z1.b, z2.b[3]\n"
+ "udot z11.s, z0.b, z7.b[3]\n"
+ "udot z15.s, z0.b, z6.b[3]\n"
+ "udot z19.s, z0.b, z5.b[3]\n"
+ "udot z23.s, z0.b, z4.b[3]\n"
+ "udot z27.s, z0.b, z3.b[3]\n"
+ "udot z31.s, z0.b, z2.b[3]\n"
"bgt 63b\n"
"64:" // Height 6: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "whilelt p0.b, XZR, x26\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x25]\n"
- "udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
- "udot z20.s, z6.b, z3.b[0]\n"
- "add x20, x20, #0x10\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "udot z24.s, z6.b, z4.b[0]\n"
- "udot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z21.s, z7.b, z3.b[0]\n"
- "udot z25.s, z7.b, z4.b[0]\n"
- "udot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[0]\n"
- "udot z14.s, z6.b, z1.b[0]\n"
- "udot z18.s, z6.b, z2.b[0]\n"
- "udot z22.s, z6.b, z3.b[0]\n"
- "udot z26.s, z6.b, z4.b[0]\n"
- "udot z30.s, z6.b, z5.b[0]\n"
- "udot z11.s, z7.b, z0.b[0]\n"
- "udot z15.s, z7.b, z1.b[0]\n"
- "udot z19.s, z7.b, z2.b[0]\n"
- "udot z23.s, z7.b, z3.b[0]\n"
- "udot z27.s, z7.b, z4.b[0]\n"
- "udot z31.s, z7.b, z5.b[0]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z7.b, z0.b[0]\n"
+ "udot z12.s, z7.b, z1.b[0]\n"
+ "udot z16.s, z7.b, z2.b[0]\n"
+ "udot z20.s, z7.b, z3.b[0]\n"
+ "udot z24.s, z7.b, z4.b[0]\n"
+ "udot z28.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z6.b, z0.b[0]\n"
+ "udot z13.s, z6.b, z1.b[0]\n"
+ "udot z17.s, z6.b, z2.b[0]\n"
+ "udot z21.s, z6.b, z3.b[0]\n"
+ "udot z25.s, z6.b, z4.b[0]\n"
+ "udot z29.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z7.b, z0.b[0]\n"
+ "udot z14.s, z7.b, z1.b[0]\n"
+ "udot z18.s, z7.b, z2.b[0]\n"
+ "udot z22.s, z7.b, z3.b[0]\n"
+ "udot z26.s, z7.b, z4.b[0]\n"
+ "udot z30.s, z7.b, z5.b[0]\n"
+ "udot z11.s, z6.b, z0.b[0]\n"
+ "udot z15.s, z6.b, z1.b[0]\n"
+ "udot z19.s, z6.b, z2.b[0]\n"
+ "udot z23.s, z6.b, z3.b[0]\n"
+ "udot z27.s, z6.b, z4.b[0]\n"
+ "udot z31.s, z6.b, z5.b[0]\n"
"ble 65f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "udot z12.s, z6.b, z1.b[1]\n"
- "udot z16.s, z6.b, z2.b[1]\n"
- "udot z20.s, z6.b, z3.b[1]\n"
- "udot z24.s, z6.b, z4.b[1]\n"
- "udot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[1]\n"
- "udot z13.s, z7.b, z1.b[1]\n"
- "udot z17.s, z7.b, z2.b[1]\n"
- "udot z21.s, z7.b, z3.b[1]\n"
- "udot z25.s, z7.b, z4.b[1]\n"
- "udot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[1]\n"
- "udot z14.s, z6.b, z1.b[1]\n"
- "udot z18.s, z6.b, z2.b[1]\n"
- "udot z22.s, z6.b, z3.b[1]\n"
- "udot z26.s, z6.b, z4.b[1]\n"
- "udot z30.s, z6.b, z5.b[1]\n"
- "udot z11.s, z7.b, z0.b[1]\n"
- "udot z15.s, z7.b, z1.b[1]\n"
- "udot z19.s, z7.b, z2.b[1]\n"
- "udot z23.s, z7.b, z3.b[1]\n"
- "udot z27.s, z7.b, z4.b[1]\n"
- "udot z31.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z7.b, z0.b[1]\n"
+ "udot z12.s, z7.b, z1.b[1]\n"
+ "udot z16.s, z7.b, z2.b[1]\n"
+ "udot z20.s, z7.b, z3.b[1]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z24.s, z7.b, z4.b[1]\n"
+ "udot z28.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z6.b, z0.b[1]\n"
+ "udot z13.s, z6.b, z1.b[1]\n"
+ "udot z17.s, z6.b, z2.b[1]\n"
+ "udot z21.s, z6.b, z3.b[1]\n"
+ "udot z25.s, z6.b, z4.b[1]\n"
+ "udot z29.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z7.b, z0.b[1]\n"
+ "udot z14.s, z7.b, z1.b[1]\n"
+ "udot z18.s, z7.b, z2.b[1]\n"
+ "udot z22.s, z7.b, z3.b[1]\n"
+ "udot z26.s, z7.b, z4.b[1]\n"
+ "udot z30.s, z7.b, z5.b[1]\n"
+ "udot z11.s, z6.b, z0.b[1]\n"
+ "udot z15.s, z6.b, z1.b[1]\n"
+ "udot z19.s, z6.b, z2.b[1]\n"
+ "udot z23.s, z6.b, z3.b[1]\n"
+ "udot z27.s, z6.b, z4.b[1]\n"
+ "udot z31.s, z6.b, z5.b[1]\n"
"ble 65f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "subs x26, x26, #0x4\n"
- "udot z12.s, z6.b, z1.b[2]\n"
- "udot z16.s, z6.b, z2.b[2]\n"
- "udot z20.s, z6.b, z3.b[2]\n"
- "udot z24.s, z6.b, z4.b[2]\n"
- "udot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[2]\n"
- "udot z13.s, z7.b, z1.b[2]\n"
- "udot z17.s, z7.b, z2.b[2]\n"
- "udot z21.s, z7.b, z3.b[2]\n"
- "udot z25.s, z7.b, z4.b[2]\n"
- "udot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[2]\n"
- "udot z14.s, z6.b, z1.b[2]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z22.s, z6.b, z3.b[2]\n"
- "udot z26.s, z6.b, z4.b[2]\n"
- "udot z30.s, z6.b, z5.b[2]\n"
- "udot z11.s, z7.b, z0.b[2]\n"
- "udot z15.s, z7.b, z1.b[2]\n"
- "udot z19.s, z7.b, z2.b[2]\n"
- "udot z23.s, z7.b, z3.b[2]\n"
- "udot z27.s, z7.b, z4.b[2]\n"
- "udot z31.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z7.b, z0.b[2]\n"
+ "udot z12.s, z7.b, z1.b[2]\n"
+ "udot z16.s, z7.b, z2.b[2]\n"
+ "udot z20.s, z7.b, z3.b[2]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z24.s, z7.b, z4.b[2]\n"
+ "udot z28.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z6.b, z0.b[2]\n"
+ "udot z13.s, z6.b, z1.b[2]\n"
+ "udot z17.s, z6.b, z2.b[2]\n"
+ "udot z21.s, z6.b, z3.b[2]\n"
+ "udot z25.s, z6.b, z4.b[2]\n"
+ "udot z29.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z7.b, z0.b[2]\n"
+ "udot z14.s, z7.b, z1.b[2]\n"
+ "udot z18.s, z7.b, z2.b[2]\n"
+ "udot z22.s, z7.b, z3.b[2]\n"
+ "udot z26.s, z7.b, z4.b[2]\n"
+ "udot z30.s, z7.b, z5.b[2]\n"
+ "udot z11.s, z6.b, z0.b[2]\n"
+ "udot z15.s, z6.b, z1.b[2]\n"
+ "udot z19.s, z6.b, z2.b[2]\n"
+ "udot z23.s, z6.b, z3.b[2]\n"
+ "udot z27.s, z6.b, z4.b[2]\n"
+ "udot z31.s, z6.b, z5.b[2]\n"
"ble 65f\n"
- "ld1b { z6.b }, p5/Z, [x9]\n"
- "udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
- "udot z12.s, z6.b, z1.b[3]\n"
- "udot z16.s, z6.b, z2.b[3]\n"
- "udot z20.s, z6.b, z3.b[3]\n"
- "udot z24.s, z6.b, z4.b[3]\n"
- "udot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z9.s, z7.b, z0.b[3]\n"
- "udot z13.s, z7.b, z1.b[3]\n"
- "udot z17.s, z7.b, z2.b[3]\n"
- "udot z21.s, z7.b, z3.b[3]\n"
- "udot z25.s, z7.b, z4.b[3]\n"
- "udot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "udot z10.s, z6.b, z0.b[3]\n"
- "udot z14.s, z6.b, z1.b[3]\n"
- "udot z18.s, z6.b, z2.b[3]\n"
- "udot z22.s, z6.b, z3.b[3]\n"
- "udot z26.s, z6.b, z4.b[3]\n"
- "udot z30.s, z6.b, z5.b[3]\n"
- "udot z11.s, z7.b, z0.b[3]\n"
- "udot z15.s, z7.b, z1.b[3]\n"
- "udot z19.s, z7.b, z2.b[3]\n"
- "udot z23.s, z7.b, z3.b[3]\n"
- "udot z27.s, z7.b, z4.b[3]\n"
- "udot z31.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x10]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+ "udot z8.s, z7.b, z0.b[3]\n"
+ "udot z12.s, z7.b, z1.b[3]\n"
+ "udot z16.s, z7.b, z2.b[3]\n"
+ "udot z20.s, z7.b, z3.b[3]\n"
+ "udot z24.s, z7.b, z4.b[3]\n"
+ "udot z28.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "udot z9.s, z6.b, z0.b[3]\n"
+ "udot z13.s, z6.b, z1.b[3]\n"
+ "udot z17.s, z6.b, z2.b[3]\n"
+ "udot z21.s, z6.b, z3.b[3]\n"
+ "udot z25.s, z6.b, z4.b[3]\n"
+ "udot z29.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "udot z10.s, z7.b, z0.b[3]\n"
+ "udot z14.s, z7.b, z1.b[3]\n"
+ "udot z18.s, z7.b, z2.b[3]\n"
+ "udot z22.s, z7.b, z3.b[3]\n"
+ "udot z26.s, z7.b, z4.b[3]\n"
+ "udot z30.s, z7.b, z5.b[3]\n"
+ "udot z11.s, z6.b, z0.b[3]\n"
+ "udot z15.s, z6.b, z1.b[3]\n"
+ "udot z19.s, z6.b, z2.b[3]\n"
+ "udot z23.s, z6.b, z3.b[3]\n"
+ "udot z27.s, z6.b, z4.b[3]\n"
+ "udot z31.s, z6.b, z5.b[3]\n"
"65:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x27, x19\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
"bne 60b\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "st1w { z8.s }, p4, [x28]\n"
- "add x23, x28, x19, LSL #2\n"
- "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
- "add x22, x23, x19, LSL #2\n"
- "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
- "add x21, x22, x19, LSL #2\n"
- "st1w { z12.s }, p4, [x23]\n"
- "add x20, x21, x19, LSL #2\n"
- "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
- "add x19, x20, x19, LSL #2\n"
- "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
- "addvl x28, x28, #4\n"
- "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
- "st1w { z16.s }, p4, [x22]\n"
- "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
- "st1w { z20.s }, p4, [x21]\n"
- "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
- "st1w { z24.s }, p4, [x20]\n"
- "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
- "st1w { z28.s }, p4, [x19]\n"
- "st1w { z29.s }, p3, [x19, #1, MUL VL]\n"
- "st1w { z30.s }, p2, [x19, #2, MUL VL]\n"
- "st1w { z31.s }, p1, [x19, #3, MUL VL]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z12.s }, p4, [x24]\n"
+ "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x22]\n"
+ "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z28.s }, p4, [x20]\n"
+ "st1w { z29.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x20, #3, MUL VL]\n"
"66:" // Height 6: Writeback done
- "decw x10, ALL, MUL #4\n"
- "cmp x10, XZR\n"
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
"bgt 57b\n"
"subs %x[M], %x[M], #0x6\n"
"beq 68f\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 67f\n"
- "add x20, x20, #0x6\n"
- "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
"67:" // Update direct input
- "mov x19, #0x6\n"
- "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
"b 1b\n"
"68:" // Exit
-
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
new file mode 100644
index 0000000000..8c6a3dba7d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
+
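+// Argument list shared by the kernel declaration and the function-pointer
+// typedef below. Matching the definition in generic.cpp, the parameters are:
+// num_strings, string_lengths, A (indirect input), M, N, B, output
+// (indirect output), bias pointer (unused here), activation, accumulate.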
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint32_t>, \
+ const uint32_t *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementation
+void sve_hybrid_u8u32_mmla_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_u8u32_mmla_6x4VL
+{
+public:
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
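+ // Four SVE vectors of 32-bit results per tile row: the "4VL" in the
+ // kernel name. Together with out_height() this gives the 6x4VL tile.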
+ static unsigned int out_width()
+ {
+ return get_vector_length<uint32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 8> transforms = {};
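+
+ // Per-CPU throughput estimates consumed by arm_gemm's kernel selection
+ // heuristics when ranking candidate implementations.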
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 54.45 };
+ case CPUModel::A510:
+ return { 24.22 };
+ case CPUModel::V1:
+ return { 105.16 };
+ }
+ }
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 54.90, 15.69, 0.62 };
+ case CPUModel::A510:
+ return { 26.80, 3.89, 0.47 };
+ case CPUModel::V1:
+ return { 75.14, 15.87, 0.83 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_u8u32_mmla_6x4VL;
+ cls_sve_hybrid_u8u32_mmla_6x4VL(const CPUInfo *)
+ {
+ }
+};
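+
+// Illustrative sketch (not part of the upstream file) of how the wrapper
+// above is typically queried, assuming a CPUInfo pointer `ci` supplied by
+// the caller:
+//
+//   cls_sve_hybrid_u8u32_mmla_6x4VL strat(ci);
+//   unsigned int mr = strat.out_height(); // 6 rows per tile
+//   unsigned int nr = strat.out_width();  // 4 SVE vectors of uint32_t
+//   strat.kernel(/* ARGLIST arguments */); // generic assembly kernel by default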
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..9269576d90
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
@@ -0,0 +1,1674 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_u8u32_mmla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+ const uint32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
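+ // Field offsets of KernelArgs are handed to the assembly as
+ // %[offsetof_*] immediate operands in the constraint list below.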
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
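+ // Summary of the flag bits tested by the assembly: bit 0 = accumulate
+ // ("tbz %x[flags], #0"), bit 2 = indirect output, bit 3 = indirect
+ // input ("tbz %x[flags], #3").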
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 56f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 45f\n"
+ "beq 34f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 23f\n"
+ "beq 12f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "tbz %x[flags], #0, 3f\n"
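+ // Accumulate path: reload the stored C tile and zip row pairs back into
+ // the interleaved 2x2-tile layout that UMMLA accumulates into.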
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 4f\n"
+ "3:" // Height 1: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "ble 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
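+ // UMMLA operates on two rows at once; with a single real row, trn1/trn2
+ // pair it with z19 as a dummy partner whose results are discarded later.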
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45d19a88 // ummla z8.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8c // ummla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45d19a89 // ummla z9.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8d // ummla z13.s, z20.b, z16.b\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45d09a8a // ummla z10.s, z20.b, z16.b\n"
+ ".inst 0x45c79a8e // ummla z14.s, z20.b, z7.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45d19a8b // ummla z11.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8f // ummla z15.s, z20.b, z16.b\n"
+ "add x26, x26, #0x10\n"
+ "bgt 8b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n"
+ "addvl x10, x10, #8\n"
+ "ble 10f\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d19828 // ummla z8.s, z1.b, z17.b\n"
+ ".inst 0x45d0982c // ummla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d19829 // ummla z9.s, z1.b, z17.b\n"
+ ".inst 0x45d0982d // ummla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d1982a // ummla z10.s, z1.b, z17.b\n"
+ ".inst 0x45d0982e // ummla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45d1982b // ummla z11.s, z1.b, z17.b\n"
+ ".inst 0x45d0982f // ummla z15.s, z1.b, z16.b\n"
+ "addvl x10, x10, #8\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 5b\n"
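+ // Writeback: uzp1 extracts the even (real) rows from each interleaved
+ // accumulator pair; the odd halves belong to the dummy row and are dropped.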
+ "uzp1 z8.d, z8.d, z12.d\n"
+ "uzp1 z9.d, z9.d, z13.d\n"
+ "st1w { z8.s }, p4, [x9]\n"
+ "uzp1 z10.d, z10.d, z14.d\n"
+ "uzp1 z11.d, z11.d, z15.d\n"
+ "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "11:" // Height 1: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 2b\n"
+ "b 68f\n"
+ "12:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "13:" // Height 2: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "tbz %x[flags], #0, 14f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "ld1w { z18.s }, p4/Z, [x9]\n"
+ "ld1w { z2.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x20]\n"
+ "zip1 z8.d, z18.d, z12.d\n"
+ "zip2 z12.d, z18.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z9.d, z2.d, z13.d\n"
+ "zip2 z13.d, z2.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z10.d, z17.d, z14.d\n"
+ "zip2 z14.d, z17.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 15f\n"
+ "14:" // Height 2: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "15:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "16:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "cbnz x28, 18f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "b 18f\n"
+ "17:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "18:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "ble 20f\n"
+ "19:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z20.b }, p0/Z, [x26]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z20.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "trn2 z20.d, z20.d, z19.d\n"
+ ".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45d19a88 // ummla z8.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8c // ummla z12.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45d19a89 // ummla z9.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8d // ummla z13.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45d19a8a // ummla z10.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8e // ummla z14.s, z20.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45d19a8b // ummla z11.s, z20.b, z17.b\n"
+ ".inst 0x45d09a8f // ummla z15.s, z20.b, z16.b\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "bgt 19b\n"
+ "20:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z19.b }, p0/Z, [x25]\n"
+ "trn1 z18.d, z1.d, z19.d\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ "trn2 z1.d, z1.d, z19.d\n"
+ ".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n"
+ ".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n"
+ "addvl x10, x10, #8\n"
+ "ble 21f\n"
+ "ld1b { z17.b }, p5/Z, [x10]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d19828 // ummla z8.s, z1.b, z17.b\n"
+ ".inst 0x45d0982c // ummla z12.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d19829 // ummla z9.s, z1.b, z17.b\n"
+ ".inst 0x45d0982d // ummla z13.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d1982a // ummla z10.s, z1.b, z17.b\n"
+ ".inst 0x45d0982e // ummla z14.s, z1.b, z16.b\n"
+ "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45d1982b // ummla z11.s, z1.b, z17.b\n"
+ ".inst 0x45d0982f // ummla z15.s, z1.b, z16.b\n"
+ "addvl x10, x10, #8\n"
+ "21:" // Height 2: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 16b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x20, x9, x20, LSL #2\n"
+ "uzp1 z16.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z17.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "st1w { z16.s }, p4, [x9]\n"
+ "uzp1 z16.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "st1w { z17.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp1 z2.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "st1w { z16.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z8.s }, p4, [x20]\n"
+ "st1w { z9.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x20, #3, MUL VL]\n"
+ "22:" // Height 2: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 13b\n"
+ "b 68f\n"
+ "23:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "24:" // Height 3: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "tbz %x[flags], #0, 25f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x21]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x20]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 26f\n"
+ "25:" // Height 3: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "26:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "27:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 28f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x28, 29f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "b 29f\n"
+ "28:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "29:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "ble 31f\n"
+ "30:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "trn1 z27.d, z30.d, z24.d\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "trn1 z26.d, z28.d, z29.d\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99b68 // ummla z8.s, z27.b, z25.b\n"
+ ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n"
+ ".inst 0x45d89b6c // ummla z12.s, z27.b, z24.b\n"
+ ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d99b69 // ummla z9.s, z27.b, z25.b\n"
+ ".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z29.d\n"
+ ".inst 0x45d89b6d // ummla z13.s, z27.b, z24.b\n"
+ ".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x45d99b6a // ummla z10.s, z27.b, z25.b\n"
+ ".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45d89b6e // ummla z14.s, z27.b, z24.b\n"
+ ".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x45d99b6b // ummla z11.s, z27.b, z25.b\n"
+ ".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x45d89b6f // ummla z15.s, z27.b, z24.b\n"
+ ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45d99bc8 // ummla z8.s, z30.b, z25.b\n"
+ ".inst 0x45d99b90 // ummla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45d89bcc // ummla z12.s, z30.b, z24.b\n"
+ ".inst 0x45d89b94 // ummla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45d99bc9 // ummla z9.s, z30.b, z25.b\n"
+ ".inst 0x45d99b91 // ummla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45d89bcd // ummla z13.s, z30.b, z24.b\n"
+ ".inst 0x45d89b95 // ummla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45d99bca // ummla z10.s, z30.b, z25.b\n"
+ ".inst 0x45d99b92 // ummla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45d89bce // ummla z14.s, z30.b, z24.b\n"
+ ".inst 0x45d89b96 // ummla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45d99bcb // ummla z11.s, z30.b, z25.b\n"
+ ".inst 0x45d99b93 // ummla z19.s, z28.b, z25.b\n"
+ ".inst 0x45d89bcf // ummla z15.s, z30.b, z24.b\n"
+ ".inst 0x45d89b97 // ummla z23.s, z28.b, z24.b\n"
+ "bgt 30b\n"
+ "31:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "trn1 z27.d, z1.d, z24.d\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "trn1 z26.d, z3.d, z28.d\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99b68 // ummla z8.s, z27.b, z25.b\n"
+ ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n"
+ ".inst 0x45d89b6c // ummla z12.s, z27.b, z24.b\n"
+ ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d99b69 // ummla z9.s, z27.b, z25.b\n"
+ ".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x45d89b6d // ummla z13.s, z27.b, z24.b\n"
+ ".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z28.d\n"
+ ".inst 0x45d99b6a // ummla z10.s, z27.b, z25.b\n"
+ ".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45d89b6e // ummla z14.s, z27.b, z24.b\n"
+ ".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x45d99b6b // ummla z11.s, z27.b, z25.b\n"
+ ".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n"
+ ".inst 0x45d89b6f // ummla z15.s, z27.b, z24.b\n"
+ ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n"
+ "ble 32f\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99828 // ummla z8.s, z1.b, z25.b\n"
+ ".inst 0x45d99870 // ummla z16.s, z3.b, z25.b\n"
+ ".inst 0x45d8982c // ummla z12.s, z1.b, z24.b\n"
+ ".inst 0x45d89874 // ummla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d99829 // ummla z9.s, z1.b, z25.b\n"
+ ".inst 0x45d99871 // ummla z17.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45d8982d // ummla z13.s, z1.b, z24.b\n"
+ ".inst 0x45d89875 // ummla z21.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d9982a // ummla z10.s, z1.b, z25.b\n"
+ ".inst 0x45d99872 // ummla z18.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45d8982e // ummla z14.s, z1.b, z24.b\n"
+ ".inst 0x45d89876 // ummla z22.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x45d9982b // ummla z11.s, z1.b, z25.b\n"
+ ".inst 0x45d99873 // ummla z19.s, z3.b, z25.b\n"
+ ".inst 0x45d8982f // ummla z15.s, z1.b, z24.b\n"
+ ".inst 0x45d89877 // ummla z23.s, z3.b, z24.b\n"
+ "32:" // Height 3: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 27b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x21, x9, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp1 z25.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z24.d, z9.d, z13.d\n"
+ "st1w { z25.s }, p4, [x9]\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z25.d, z10.d, z14.d\n"
+ "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z24.d, z11.d, z15.d\n"
+ "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "st1w { z8.s }, p4, [x21]\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "st1w { z9.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x20]\n"
+ "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
+ "33:" // Height 3: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 24b\n"
+ "b 68f\n"
+ "34:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "35:" // Height 4: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "tbz %x[flags], #0, 36f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x22]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x21]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x20]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 37f\n"
+ "36:" // Height 4: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "37:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "38:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 39f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x28, 40f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "b 40f\n"
+ "39:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "40:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "ble 42f\n"
+ "41:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z30.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z29.d, z30.d, z24.d\n"
+ "ld1rqb { z28.b }, p0/Z, [x24]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z30.d, z30.d, z24.d\n"
+ "trn1 z26.d, z28.d, z27.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99ba8 // ummla z8.s, z29.b, z25.b\n"
+ ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n"
+ ".inst 0x45d89bac // ummla z12.s, z29.b, z24.b\n"
+ ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d99ba9 // ummla z9.s, z29.b, z25.b\n"
+ ".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "trn2 z28.d, z28.d, z27.d\n"
+ ".inst 0x45d89bad // ummla z13.s, z29.b, z24.b\n"
+ ".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x45d99baa // ummla z10.s, z29.b, z25.b\n"
+ ".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ "cmp x27, #0x10\n"
+ ".inst 0x45d89bae // ummla z14.s, z29.b, z24.b\n"
+ ".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x45d99bab // ummla z11.s, z29.b, z25.b\n"
+ ".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x45d89baf // ummla z15.s, z29.b, z24.b\n"
+ ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45d99bc8 // ummla z8.s, z30.b, z25.b\n"
+ ".inst 0x45d99b90 // ummla z16.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45d89bcc // ummla z12.s, z30.b, z24.b\n"
+ ".inst 0x45d89b94 // ummla z20.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x45d99bc9 // ummla z9.s, z30.b, z25.b\n"
+ ".inst 0x45d99b91 // ummla z17.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45d89bcd // ummla z13.s, z30.b, z24.b\n"
+ ".inst 0x45d89b95 // ummla z21.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45d99bca // ummla z10.s, z30.b, z25.b\n"
+ ".inst 0x45d99b92 // ummla z18.s, z28.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45d89bce // ummla z14.s, z30.b, z24.b\n"
+ ".inst 0x45d89b96 // ummla z22.s, z28.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45d99bcb // ummla z11.s, z30.b, z25.b\n"
+ ".inst 0x45d99b93 // ummla z19.s, z28.b, z25.b\n"
+ ".inst 0x45d89bcf // ummla z15.s, z30.b, z24.b\n"
+ ".inst 0x45d89b97 // ummla z23.s, z28.b, z24.b\n"
+ "bgt 41b\n"
+ "42:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z24.b }, p0/Z, [x25]\n"
+ "trn1 z28.d, z1.d, z24.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "ld1rqb { z27.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z24.d\n"
+ "trn1 z26.d, z3.d, z27.d\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99b88 // ummla z8.s, z28.b, z25.b\n"
+ ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n"
+ ".inst 0x45d89b8c // ummla z12.s, z28.b, z24.b\n"
+ ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d99b89 // ummla z9.s, z28.b, z25.b\n"
+ ".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x45d89b8d // ummla z13.s, z28.b, z24.b\n"
+ ".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z27.d\n"
+ ".inst 0x45d99b8a // ummla z10.s, z28.b, z25.b\n"
+ ".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45d89b8e // ummla z14.s, z28.b, z24.b\n"
+ ".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x45d99b8b // ummla z11.s, z28.b, z25.b\n"
+ ".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n"
+ ".inst 0x45d89b8f // ummla z15.s, z28.b, z24.b\n"
+ ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n"
+ "ble 43f\n"
+ "ld1b { z25.b }, p5/Z, [x10]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45d99828 // ummla z8.s, z1.b, z25.b\n"
+ ".inst 0x45d99870 // ummla z16.s, z3.b, z25.b\n"
+ ".inst 0x45d8982c // ummla z12.s, z1.b, z24.b\n"
+ ".inst 0x45d89874 // ummla z20.s, z3.b, z24.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45d99829 // ummla z9.s, z1.b, z25.b\n"
+ ".inst 0x45d99871 // ummla z17.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45d8982d // ummla z13.s, z1.b, z24.b\n"
+ ".inst 0x45d89875 // ummla z21.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45d9982a // ummla z10.s, z1.b, z25.b\n"
+ ".inst 0x45d99872 // ummla z18.s, z3.b, z25.b\n"
+ "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45d8982e // ummla z14.s, z1.b, z24.b\n"
+ ".inst 0x45d89876 // ummla z22.s, z3.b, z24.b\n"
+ "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x45d9982b // ummla z11.s, z1.b, z25.b\n"
+ ".inst 0x45d99873 // ummla z19.s, z3.b, z25.b\n"
+ ".inst 0x45d8982f // ummla z15.s, z1.b, z24.b\n"
+ ".inst 0x45d89877 // ummla z23.s, z3.b, z24.b\n"
+ "43:" // Height 4: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 38b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x9, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp1 z25.d, z8.d, z12.d\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z24.d, z9.d, z13.d\n"
+ "st1w { z25.s }, p4, [x9]\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z25.d, z10.d, z14.d\n"
+ "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z24.d, z11.d, z15.d\n"
+ "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z25.d, z16.d, z20.d\n"
+ "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z24.d, z17.d, z21.d\n"
+ "st1w { z8.s }, p4, [x22]\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z9.s }, p3, [x22, #1, MUL VL]\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z20.d, z19.d, z23.d\n"
+ "st1w { z10.s }, p2, [x22, #2, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "st1w { z11.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z25.s }, p4, [x21]\n"
+ "st1w { z24.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x20]\n"
+ "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
+ "44:" // Height 4: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 35b\n"
+ "b 68f\n"
+ "45:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "46:" // Height 5: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "tbz %x[flags], #0, 47f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x9]\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z19.d, z12.d\n"
+ "zip2 z12.d, z19.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z9.d, z17.d, z13.d\n"
+ "zip2 z13.d, z17.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z18.d, z14.d\n"
+ "zip2 z14.d, z18.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 48f\n"
+ "47:" // Height 5: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "48:" // Height 5: setup done
+ "mov x28, #0x0\n"
+ "49:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x28, 51f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "b 51f\n"
+ "50:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "51:" // Height 5: input setup done
+ "cmp x27, #0x10\n"
+ "ble 53f\n"
+ "52:" // Height 5: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z6.b }, p0/Z, [x26]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z7.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z5.d, z6.d, z1.d\n"
+ "trn2 z6.d, z6.d, z1.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "trn1 z3.d, z7.d, z2.d\n"
+ "trn2 z7.d, z7.d, z2.d\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "trn1 z2.d, z4.d, z0.d\n"
+ "trn2 z4.d, z4.d, z0.d\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45c198a8 // ummla z8.s, z5.b, z1.b\n"
+ ".inst 0x45c19870 // ummla z16.s, z3.b, z1.b\n"
+ ".inst 0x45c19858 // ummla z24.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x45c098ac // ummla z12.s, z5.b, z0.b\n"
+ ".inst 0x45c09874 // ummla z20.s, z3.b, z0.b\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x45c0985c // ummla z28.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45c198a9 // ummla z9.s, z5.b, z1.b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45c19871 // ummla z17.s, z3.b, z1.b\n"
+ ".inst 0x45c19859 // ummla z25.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45c098ad // ummla z13.s, z5.b, z0.b\n"
+ ".inst 0x45c09875 // ummla z21.s, z3.b, z0.b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x45c0985d // ummla z29.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45c198aa // ummla z10.s, z5.b, z1.b\n"
+ ".inst 0x45c19872 // ummla z18.s, z3.b, z1.b\n"
+ ".inst 0x45c1985a // ummla z26.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45c098ae // ummla z14.s, z5.b, z0.b\n"
+ ".inst 0x45c09876 // ummla z22.s, z3.b, z0.b\n"
+ ".inst 0x45c0985e // ummla z30.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x45c198ab // ummla z11.s, z5.b, z1.b\n"
+ ".inst 0x45c19873 // ummla z19.s, z3.b, z1.b\n"
+ ".inst 0x45c1985b // ummla z27.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x45c098af // ummla z15.s, z5.b, z0.b\n"
+ ".inst 0x45c09877 // ummla z23.s, z3.b, z0.b\n"
+ ".inst 0x45c0985f // ummla z31.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45c198c8 // ummla z8.s, z6.b, z1.b\n"
+ ".inst 0x45c198f0 // ummla z16.s, z7.b, z1.b\n"
+ ".inst 0x45c19898 // ummla z24.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x45c098cc // ummla z12.s, z6.b, z0.b\n"
+ ".inst 0x45c098f4 // ummla z20.s, z7.b, z0.b\n"
+ ".inst 0x45c0989c // ummla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45c198c9 // ummla z9.s, z6.b, z1.b\n"
+ ".inst 0x45c198f1 // ummla z17.s, z7.b, z1.b\n"
+ ".inst 0x45c19899 // ummla z25.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45c098cd // ummla z13.s, z6.b, z0.b\n"
+ ".inst 0x45c098f5 // ummla z21.s, z7.b, z0.b\n"
+ ".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45c198ca // ummla z10.s, z6.b, z1.b\n"
+ ".inst 0x45c198f2 // ummla z18.s, z7.b, z1.b\n"
+ ".inst 0x45c1989a // ummla z26.s, z4.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45c098ce // ummla z14.s, z6.b, z0.b\n"
+ ".inst 0x45c098f6 // ummla z22.s, z7.b, z0.b\n"
+ ".inst 0x45c0989e // ummla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45c198cb // ummla z11.s, z6.b, z1.b\n"
+ ".inst 0x45c198f3 // ummla z19.s, z7.b, z1.b\n"
+ ".inst 0x45c1989b // ummla z27.s, z4.b, z1.b\n"
+ ".inst 0x45c098cf // ummla z15.s, z6.b, z0.b\n"
+ ".inst 0x45c098f7 // ummla z23.s, z7.b, z0.b\n"
+ ".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n"
+ "bgt 52b\n"
+ "53:" // Height 5: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z4.b }, p0/Z, [x25]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn1 z7.d, z1.d, z4.d\n"
+ "trn2 z1.d, z1.d, z4.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x22]\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45c298e8 // ummla z8.s, z7.b, z2.b\n"
+ ".inst 0x45c298d0 // ummla z16.s, z6.b, z2.b\n"
+ ".inst 0x45c29898 // ummla z24.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x45c098ec // ummla z12.s, z7.b, z0.b\n"
+ ".inst 0x45c098d4 // ummla z20.s, z6.b, z0.b\n"
+ ".inst 0x45c0989c // ummla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45c298e9 // ummla z9.s, z7.b, z2.b\n"
+ ".inst 0x45c298d1 // ummla z17.s, z6.b, z2.b\n"
+ ".inst 0x45c29899 // ummla z25.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45c098ed // ummla z13.s, z7.b, z0.b\n"
+ ".inst 0x45c098d5 // ummla z21.s, z6.b, z0.b\n"
+ ".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45c298ea // ummla z10.s, z7.b, z2.b\n"
+ ".inst 0x45c298d2 // ummla z18.s, z6.b, z2.b\n"
+ ".inst 0x45c2989a // ummla z26.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45c098ee // ummla z14.s, z7.b, z0.b\n"
+ ".inst 0x45c098d6 // ummla z22.s, z6.b, z0.b\n"
+ ".inst 0x45c0989e // ummla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45c298eb // ummla z11.s, z7.b, z2.b\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x45c298d3 // ummla z19.s, z6.b, z2.b\n"
+ ".inst 0x45c2989b // ummla z27.s, z4.b, z2.b\n"
+ ".inst 0x45c098ef // ummla z15.s, z7.b, z0.b\n"
+ ".inst 0x45c098d7 // ummla z23.s, z6.b, z0.b\n"
+ ".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n"
+ "ble 54f\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45c29828 // ummla z8.s, z1.b, z2.b\n"
+ ".inst 0x45c29870 // ummla z16.s, z3.b, z2.b\n"
+ ".inst 0x45c298b8 // ummla z24.s, z5.b, z2.b\n"
+ ".inst 0x45c0982c // ummla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x45c09874 // ummla z20.s, z3.b, z0.b\n"
+ ".inst 0x45c098bc // ummla z28.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45c29829 // ummla z9.s, z1.b, z2.b\n"
+ ".inst 0x45c29871 // ummla z17.s, z3.b, z2.b\n"
+ ".inst 0x45c298b9 // ummla z25.s, z5.b, z2.b\n"
+ ".inst 0x45c0982d // ummla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45c09875 // ummla z21.s, z3.b, z0.b\n"
+ ".inst 0x45c098bd // ummla z29.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45c2982a // ummla z10.s, z1.b, z2.b\n"
+ ".inst 0x45c29872 // ummla z18.s, z3.b, z2.b\n"
+ ".inst 0x45c298ba // ummla z26.s, z5.b, z2.b\n"
+ ".inst 0x45c0982e // ummla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45c09876 // ummla z22.s, z3.b, z0.b\n"
+ ".inst 0x45c098be // ummla z30.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x45c2982b // ummla z11.s, z1.b, z2.b\n"
+ ".inst 0x45c29873 // ummla z19.s, z3.b, z2.b\n"
+ ".inst 0x45c298bb // ummla z27.s, z5.b, z2.b\n"
+ ".inst 0x45c0982f // ummla z15.s, z1.b, z0.b\n"
+ ".inst 0x45c09877 // ummla z23.s, z3.b, z0.b\n"
+ ".inst 0x45c098bf // ummla z31.s, z5.b, z0.b\n"
+ "54:" // Height 5: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 49b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x9, x20, LSL #2\n"
+ "add x22, x23, x20, LSL #2\n"
+ "uzp1 z2.d, z8.d, z12.d\n"
+ "add x21, x22, x20, LSL #2\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z1.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z0.d, z10.d, z14.d\n"
+ "st1w { z2.s }, p4, [x9]\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z2.d, z11.d, z15.d\n"
+ "st1w { z1.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z1.d, z16.d, z20.d\n"
+ "st1w { z0.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z0.d, z17.d, z21.d\n"
+ "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z8.s }, p4, [x23]\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z20.d, z19.d, z23.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "st1w { z1.s }, p4, [x22]\n"
+ "st1w { z0.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
+ "55:" // Height 5: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 46b\n"
+ "b 68f\n"
+ "56:" // Height 6
+ "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x20, #0x18\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
+ "57:" // Height 6: Column loop
+ "mov x20, #0x0\n"
+ "whilelt p4.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p3.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p2.s, x20, x11\n"
+ "incw x20\n"
+ "whilelt p1.s, x20, x11\n"
+ "tbz %x[flags], #0, 58f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "ld1w { z17.s }, p4/Z, [x9]\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "add x20, x21, x20, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z17.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z12.d, z17.d, z12.d\n"
+ "zip1 z9.d, z18.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z18.d, z13.d\n"
+ "zip1 z10.d, z20.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z20.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z0.d, z31.d\n"
+ "zip2 z31.d, z0.d, z31.d\n"
+ "b 59f\n"
+ "58:" // Height 6: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "59:" // Height 6: setup done
+ "mov x28, #0x0\n"
+ "60:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x25, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x28, 62f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x22, x22, x20\n"
+ "add x21, x21, x20\n"
+ "b 62f\n"
+ "61:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, x21\n"
+ "add x24, x25, x21\n"
+ "add x23, x24, x21\n"
+ "add x22, x23, x21\n"
+ "add x21, x22, x21\n"
+ "62:" // Height 6: input setup done
+ "cmp x27, #0x10\n"
+ "ble 64f\n"
+ "63:" // Height 6: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z7.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z6.d, z7.d, z0.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x24]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "trn2 z7.d, z7.d, z0.d\n"
+ "trn1 z4.d, z5.d, z1.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ "trn2 z5.d, z5.d, z1.d\n"
+ "trn1 z2.d, z3.d, z0.d\n"
+ "trn2 z3.d, z3.d, z0.d\n"
+ "ld1b { z1.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45c198c8 // ummla z8.s, z6.b, z1.b\n"
+ ".inst 0x45c19890 // ummla z16.s, z4.b, z1.b\n"
+ ".inst 0x45c19858 // ummla z24.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "sub x27, x27, #0x10\n"
+ ".inst 0x45c098cc // ummla z12.s, z6.b, z0.b\n"
+ ".inst 0x45c09894 // ummla z20.s, z4.b, z0.b\n"
+ "cmp x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x45c0985c // ummla z28.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45c198c9 // ummla z9.s, z6.b, z1.b\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45c19891 // ummla z17.s, z4.b, z1.b\n"
+ ".inst 0x45c19859 // ummla z25.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45c098cd // ummla z13.s, z6.b, z0.b\n"
+ ".inst 0x45c09895 // ummla z21.s, z4.b, z0.b\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x45c0985d // ummla z29.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45c198ca // ummla z10.s, z6.b, z1.b\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45c19892 // ummla z18.s, z4.b, z1.b\n"
+ ".inst 0x45c1985a // ummla z26.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45c098ce // ummla z14.s, z6.b, z0.b\n"
+ ".inst 0x45c09896 // ummla z22.s, z4.b, z0.b\n"
+ ".inst 0x45c0985e // ummla z30.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x45c198cb // ummla z11.s, z6.b, z1.b\n"
+ ".inst 0x45c19893 // ummla z19.s, z4.b, z1.b\n"
+ ".inst 0x45c1985b // ummla z27.s, z2.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x45c098cf // ummla z15.s, z6.b, z0.b\n"
+ ".inst 0x45c09897 // ummla z23.s, z4.b, z0.b\n"
+ ".inst 0x45c0985f // ummla z31.s, z2.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x45c198e8 // ummla z8.s, z7.b, z1.b\n"
+ ".inst 0x45c198b0 // ummla z16.s, z5.b, z1.b\n"
+ ".inst 0x45c19878 // ummla z24.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x45c098ec // ummla z12.s, z7.b, z0.b\n"
+ ".inst 0x45c098b4 // ummla z20.s, z5.b, z0.b\n"
+ ".inst 0x45c0987c // ummla z28.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x45c198e9 // ummla z9.s, z7.b, z1.b\n"
+ ".inst 0x45c198b1 // ummla z17.s, z5.b, z1.b\n"
+ ".inst 0x45c19879 // ummla z25.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x45c098ed // ummla z13.s, z7.b, z0.b\n"
+ ".inst 0x45c098b5 // ummla z21.s, z5.b, z0.b\n"
+ ".inst 0x45c0987d // ummla z29.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x45c198ea // ummla z10.s, z7.b, z1.b\n"
+ ".inst 0x45c198b2 // ummla z18.s, z5.b, z1.b\n"
+ ".inst 0x45c1987a // ummla z26.s, z3.b, z1.b\n"
+ "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x45c098ee // ummla z14.s, z7.b, z0.b\n"
+ ".inst 0x45c098b6 // ummla z22.s, z5.b, z0.b\n"
+ ".inst 0x45c0987e // ummla z30.s, z3.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x45c198eb // ummla z11.s, z7.b, z1.b\n"
+ ".inst 0x45c198b3 // ummla z19.s, z5.b, z1.b\n"
+ ".inst 0x45c1987b // ummla z27.s, z3.b, z1.b\n"
+ ".inst 0x45c098ef // ummla z15.s, z7.b, z0.b\n"
+ ".inst 0x45c098b7 // ummla z23.s, z5.b, z0.b\n"
+ ".inst 0x45c0987f // ummla z31.s, z3.b, z0.b\n"
+ "bgt 63b\n"
+ "64:" // Height 6: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x27\n"
+ "ld1rqb { z1.b }, p0/Z, [x26]\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
+ "trn1 z7.d, z1.d, z0.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z0.d\n"
+ "trn1 z6.d, z3.d, z2.d\n"
+ "ld1rqb { z5.b }, p0/Z, [x22]\n"
+ "ld1rqb { z0.b }, p0/Z, [x21]\n"
+ "trn2 z3.d, z3.d, z2.d\n"
+ "trn1 z4.d, z5.d, z0.d\n"
+ "trn2 z5.d, z5.d, z0.d\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45c298e8 // ummla z8.s, z7.b, z2.b\n"
+ ".inst 0x45c298d0 // ummla z16.s, z6.b, z2.b\n"
+ ".inst 0x45c29898 // ummla z24.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ "subs x27, x27, #0x8\n"
+ ".inst 0x45c098ec // ummla z12.s, z7.b, z0.b\n"
+ ".inst 0x45c098d4 // ummla z20.s, z6.b, z0.b\n"
+ ".inst 0x45c0989c // ummla z28.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45c298e9 // ummla z9.s, z7.b, z2.b\n"
+ ".inst 0x45c298d1 // ummla z17.s, z6.b, z2.b\n"
+ ".inst 0x45c29899 // ummla z25.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45c098ed // ummla z13.s, z7.b, z0.b\n"
+ ".inst 0x45c098d5 // ummla z21.s, z6.b, z0.b\n"
+ ".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45c298ea // ummla z10.s, z7.b, z2.b\n"
+ ".inst 0x45c298d2 // ummla z18.s, z6.b, z2.b\n"
+ ".inst 0x45c2989a // ummla z26.s, z4.b, z2.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45c098ee // ummla z14.s, z7.b, z0.b\n"
+ ".inst 0x45c098d6 // ummla z22.s, z6.b, z0.b\n"
+ ".inst 0x45c0989e // ummla z30.s, z4.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ ".inst 0x45c298eb // ummla z11.s, z7.b, z2.b\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x45c298d3 // ummla z19.s, z6.b, z2.b\n"
+ ".inst 0x45c2989b // ummla z27.s, z4.b, z2.b\n"
+ ".inst 0x45c098ef // ummla z15.s, z7.b, z0.b\n"
+ ".inst 0x45c098d7 // ummla z23.s, z6.b, z0.b\n"
+ ".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n"
+ "ble 65f\n"
+ "ld1b { z2.b }, p5/Z, [x10]\n"
+ "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x45c29828 // ummla z8.s, z1.b, z2.b\n"
+ ".inst 0x45c29870 // ummla z16.s, z3.b, z2.b\n"
+ ".inst 0x45c298b8 // ummla z24.s, z5.b, z2.b\n"
+ ".inst 0x45c0982c // ummla z12.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x45c09874 // ummla z20.s, z3.b, z0.b\n"
+ ".inst 0x45c098bc // ummla z28.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x45c29829 // ummla z9.s, z1.b, z2.b\n"
+ ".inst 0x45c29871 // ummla z17.s, z3.b, z2.b\n"
+ ".inst 0x45c298b9 // ummla z25.s, z5.b, z2.b\n"
+ ".inst 0x45c0982d // ummla z13.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x45c09875 // ummla z21.s, z3.b, z0.b\n"
+ ".inst 0x45c098bd // ummla z29.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x45c2982a // ummla z10.s, z1.b, z2.b\n"
+ ".inst 0x45c29872 // ummla z18.s, z3.b, z2.b\n"
+ ".inst 0x45c298ba // ummla z26.s, z5.b, z2.b\n"
+ ".inst 0x45c0982e // ummla z14.s, z1.b, z0.b\n"
+ "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x45c09876 // ummla z22.s, z3.b, z0.b\n"
+ ".inst 0x45c098be // ummla z30.s, z5.b, z0.b\n"
+ "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x45c2982b // ummla z11.s, z1.b, z2.b\n"
+ ".inst 0x45c29873 // ummla z19.s, z3.b, z2.b\n"
+ ".inst 0x45c298bb // ummla z27.s, z5.b, z2.b\n"
+ ".inst 0x45c0982f // ummla z15.s, z1.b, z0.b\n"
+ ".inst 0x45c09877 // ummla z23.s, z3.b, z0.b\n"
+ ".inst 0x45c098bf // ummla z31.s, z5.b, z0.b\n"
+ "65:" // Height 6: Multiply loop: multiply skip
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 60b\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x9, x20, LSL #2\n"
+ "add x23, x24, x20, LSL #2\n"
+ "uzp1 z0.d, z8.d, z12.d\n"
+ "add x22, x23, x20, LSL #2\n"
+ "add x21, x22, x20, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x20, x21, x20, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "st1w { z0.s }, p4, [x9]\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z8.s }, p4, [x24]\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "st1w { z15.s }, p4, [x23]\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z23.s }, p4, [x21]\n"
+ "st1w { z28.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z29.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z30.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
+ "66:" // Height 6: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 57b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 68f\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "add x21, x21, #0x6\n"
+ "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "67:" // Update direct input
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
+ "b 1b\n"
+ "68:" // Exit
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
index 12bb758b68..1ae035c614 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,51 +22,74 @@
* SOFTWARE.
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "../bfloat.hpp"
#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, \
+ float *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void sve_interleaved_bf16fp32_dot_8x3VL( ARGLIST );
-class cls_sve_interleaved_bf16fp32_dot_8x3VL {
+class cls_sve_interleaved_bf16fp32_dot_8x3VL
+{
public:
typedef bfloat16 operand_type;
typedef float result_type;
- typedef void (*kern_type)(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
- static unsigned int out_width()
+ static constexpr unsigned int out_height()
{
- return get_vector_length<float>() * 3;
+ return 8;
}
- static unsigned int out_height()
+ static unsigned int out_width()
{
- return 8;
+ return get_vector_length<float>() * 3;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 2;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 3, 2, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 2, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+        if (std::is_same<T, bfloat16>::value) {
+            switch (ci->get_cpu_model()) {
+                default:
+                    return { 15.92, 3.74, 7.14 };
+                case CPUModel::A510:
+                    return { 7.54, 3.77, 2.43 };
+                case CPUModel::V1:
+                    return { 31.82, 5.11, 11.20 };
+            }
+        }
+        return { 1.0 };
+    }
+
-    kern_type kernel=sve_interleaved_bf16fp32_dot_8x3VL;
+    // Default to the generic kernel
+    kern_type kernel=sve_interleaved_bf16fp32_dot_8x3VL;
cls_sve_interleaved_bf16fp32_dot_8x3VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
index adee900337..e507bc5551 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,309 +21,230 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include <cstddef>
#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const bfloat16 *a_ptr = Apanel;
- float *c_ptr = Cpanel;
+void sve_interleaved_bf16fp32_dot_8x3VL(
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
- K /= 2;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
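+    // Gather the loop bounds and B panel pointer; the assembly reads them from this struct.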
+ struct KernelArgs {
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const bfloat16 *a_ptr0 = a_ptr;
- const bfloat16 *b_ptr = Bpanel;
+ ka.K = (K/2) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.h\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1h z4.h, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z17.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "mov z18.s, #0\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
- ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
- ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n"
- ".inst 0x64714096 // bfdot z22.s, z4.h, z1.h[2]\n"
- ".inst 0x64794097 // bfdot z23.s, z4.h, z1.h[3]\n"
- "ld1h z4.h, p0/z, [%[b_ptr]]\n"
- ".inst 0x646040ac // bfdot z12.s, z5.h, z0.h[0]\n"
- ".inst 0x646840ad // bfdot z13.s, z5.h, z0.h[1]\n"
- ".inst 0x647040ae // bfdot z14.s, z5.h, z0.h[2]\n"
- ".inst 0x647840af // bfdot z15.s, z5.h, z0.h[3]\n"
- ".inst 0x646140b8 // bfdot z24.s, z5.h, z1.h[0]\n"
- ".inst 0x646940b9 // bfdot z25.s, z5.h, z1.h[1]\n"
- ".inst 0x647140ba // bfdot z26.s, z5.h, z1.h[2]\n"
- ".inst 0x647940bb // bfdot z27.s, z5.h, z1.h[3]\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x646040d0 // bfdot z16.s, z6.h, z0.h[0]\n"
- ".inst 0x646840d1 // bfdot z17.s, z6.h, z0.h[1]\n"
- ".inst 0x647040d2 // bfdot z18.s, z6.h, z0.h[2]\n"
- ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- ".inst 0x646140dc // bfdot z28.s, z6.h, z1.h[0]\n"
- ".inst 0x646940dd // bfdot z29.s, z6.h, z1.h[1]\n"
- ".inst 0x647140de // bfdot z30.s, z6.h, z1.h[2]\n"
- ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x64624088 // bfdot z8.s, z4.h, z2.h[0]\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x646a4089 // bfdot z9.s, z4.h, z2.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x6472408a // bfdot z10.s, z4.h, z2.h[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- ".inst 0x647a408b // bfdot z11.s, z4.h, z2.h[3]\n"
- ".inst 0x64634094 // bfdot z20.s, z4.h, z3.h[0]\n"
- ".inst 0x646b4095 // bfdot z21.s, z4.h, z3.h[1]\n"
- ".inst 0x64734096 // bfdot z22.s, z4.h, z3.h[2]\n"
- ".inst 0x647b4097 // bfdot z23.s, z4.h, z3.h[3]\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x646240ac // bfdot z12.s, z5.h, z2.h[0]\n"
- ".inst 0x646a40ad // bfdot z13.s, z5.h, z2.h[1]\n"
- ".inst 0x647240ae // bfdot z14.s, z5.h, z2.h[2]\n"
- ".inst 0x647a40af // bfdot z15.s, z5.h, z2.h[3]\n"
- ".inst 0x646340b8 // bfdot z24.s, z5.h, z3.h[0]\n"
- ".inst 0x646b40b9 // bfdot z25.s, z5.h, z3.h[1]\n"
- ".inst 0x647340ba // bfdot z26.s, z5.h, z3.h[2]\n"
- ".inst 0x647b40bb // bfdot z27.s, z5.h, z3.h[3]\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646a40d1 // bfdot z17.s, z6.h, z2.h[1]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x646340dc // bfdot z28.s, z6.h, z3.h[0]\n"
- ".inst 0x646b40dd // bfdot z29.s, z6.h, z3.h[1]\n"
- ".inst 0x647340de // bfdot z30.s, z6.h, z3.h[2]\n"
- ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
- ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
- ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
- ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n"
- ".inst 0x64714096 // bfdot z22.s, z4.h, z1.h[2]\n"
- ".inst 0x64794097 // bfdot z23.s, z4.h, z1.h[3]\n"
- "ld1h z4.h, p0/z, [%[b_ptr]]\n"
- ".inst 0x646040ac // bfdot z12.s, z5.h, z0.h[0]\n"
- ".inst 0x646840ad // bfdot z13.s, z5.h, z0.h[1]\n"
- ".inst 0x647040ae // bfdot z14.s, z5.h, z0.h[2]\n"
- ".inst 0x647840af // bfdot z15.s, z5.h, z0.h[3]\n"
- ".inst 0x646140b8 // bfdot z24.s, z5.h, z1.h[0]\n"
- ".inst 0x646940b9 // bfdot z25.s, z5.h, z1.h[1]\n"
- ".inst 0x647140ba // bfdot z26.s, z5.h, z1.h[2]\n"
- ".inst 0x647940bb // bfdot z27.s, z5.h, z1.h[3]\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x646040d0 // bfdot z16.s, z6.h, z0.h[0]\n"
- ".inst 0x646840d1 // bfdot z17.s, z6.h, z0.h[1]\n"
- ".inst 0x647040d2 // bfdot z18.s, z6.h, z0.h[2]\n"
- ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- ".inst 0x646140dc // bfdot z28.s, z6.h, z1.h[0]\n"
- ".inst 0x646940dd // bfdot z29.s, z6.h, z1.h[1]\n"
- ".inst 0x647140de // bfdot z30.s, z6.h, z1.h[2]\n"
- ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x64624088 // bfdot z8.s, z4.h, z2.h[0]\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x646a4089 // bfdot z9.s, z4.h, z2.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- ".inst 0x6472408a // bfdot z10.s, z4.h, z2.h[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- ".inst 0x647a408b // bfdot z11.s, z4.h, z2.h[3]\n"
- ".inst 0x64634094 // bfdot z20.s, z4.h, z3.h[0]\n"
- ".inst 0x646b4095 // bfdot z21.s, z4.h, z3.h[1]\n"
- ".inst 0x64734096 // bfdot z22.s, z4.h, z3.h[2]\n"
- ".inst 0x647b4097 // bfdot z23.s, z4.h, z3.h[3]\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x646240ac // bfdot z12.s, z5.h, z2.h[0]\n"
- ".inst 0x646a40ad // bfdot z13.s, z5.h, z2.h[1]\n"
- ".inst 0x647240ae // bfdot z14.s, z5.h, z2.h[2]\n"
- ".inst 0x647a40af // bfdot z15.s, z5.h, z2.h[3]\n"
- ".inst 0x646340b8 // bfdot z24.s, z5.h, z3.h[0]\n"
- ".inst 0x646b40b9 // bfdot z25.s, z5.h, z3.h[1]\n"
- ".inst 0x647340ba // bfdot z26.s, z5.h, z3.h[2]\n"
- ".inst 0x647b40bb // bfdot z27.s, z5.h, z3.h[3]\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646a40d1 // bfdot z17.s, z6.h, z2.h[1]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n"
- ".inst 0x646340dc // bfdot z28.s, z6.h, z3.h[0]\n"
- ".inst 0x646b40dd // bfdot z29.s, z6.h, z3.h[1]\n"
- ".inst 0x647340de // bfdot z30.s, z6.h, z3.h[2]\n"
- ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
- ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
- ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
- ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
- ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n"
- ".inst 0x64714096 // bfdot z22.s, z4.h, z1.h[2]\n"
- ".inst 0x64794097 // bfdot z23.s, z4.h, z1.h[3]\n"
- ".inst 0x646040ac // bfdot z12.s, z5.h, z0.h[0]\n"
- ".inst 0x646840ad // bfdot z13.s, z5.h, z0.h[1]\n"
- ".inst 0x647040ae // bfdot z14.s, z5.h, z0.h[2]\n"
- ".inst 0x647840af // bfdot z15.s, z5.h, z0.h[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- ".inst 0x646140b8 // bfdot z24.s, z5.h, z1.h[0]\n"
- ".inst 0x646940b9 // bfdot z25.s, z5.h, z1.h[1]\n"
- ".inst 0x647140ba // bfdot z26.s, z5.h, z1.h[2]\n"
- ".inst 0x647940bb // bfdot z27.s, z5.h, z1.h[3]\n"
- ".inst 0x646040d0 // bfdot z16.s, z6.h, z0.h[0]\n"
- ".inst 0x646840d1 // bfdot z17.s, z6.h, z0.h[1]\n"
- ".inst 0x647040d2 // bfdot z18.s, z6.h, z0.h[2]\n"
- ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- ".inst 0x646140dc // bfdot z28.s, z6.h, z1.h[0]\n"
- ".inst 0x646940dd // bfdot z29.s, z6.h, z1.h[1]\n"
- ".inst 0x647140de // bfdot z30.s, z6.h, z1.h[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
- ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
- ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n"
- ".inst 0x64714096 // bfdot z22.s, z4.h, z1.h[2]\n"
- ".inst 0x64794097 // bfdot z23.s, z4.h, z1.h[3]\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x646040ac // bfdot z12.s, z5.h, z0.h[0]\n"
- ".inst 0x646840ad // bfdot z13.s, z5.h, z0.h[1]\n"
- ".inst 0x647040ae // bfdot z14.s, z5.h, z0.h[2]\n"
- ".inst 0x647840af // bfdot z15.s, z5.h, z0.h[3]\n"
- ".inst 0x646140b8 // bfdot z24.s, z5.h, z1.h[0]\n"
- ".inst 0x646940b9 // bfdot z25.s, z5.h, z1.h[1]\n"
- ".inst 0x647140ba // bfdot z26.s, z5.h, z1.h[2]\n"
- ".inst 0x647940bb // bfdot z27.s, z5.h, z1.h[3]\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x646040d0 // bfdot z16.s, z6.h, z0.h[0]\n"
- ".inst 0x646840d1 // bfdot z17.s, z6.h, z0.h[1]\n"
- ".inst 0x647040d2 // bfdot z18.s, z6.h, z0.h[2]\n"
- ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
- ".inst 0x646140dc // bfdot z28.s, z6.h, z1.h[0]\n"
- ".inst 0x646940dd // bfdot z29.s, z6.h, z1.h[1]\n"
- ".inst 0x647140de // bfdot z30.s, z6.h, z1.h[2]\n"
- ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x64624088 // bfdot z8.s, z4.h, z2.h[0]\n"
- ".inst 0x646a4089 // bfdot z9.s, z4.h, z2.h[1]\n"
- ".inst 0x6472408a // bfdot z10.s, z4.h, z2.h[2]\n"
- ".inst 0x647a408b // bfdot z11.s, z4.h, z2.h[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- ".inst 0x64634094 // bfdot z20.s, z4.h, z3.h[0]\n"
- ".inst 0x646b4095 // bfdot z21.s, z4.h, z3.h[1]\n"
- ".inst 0x64734096 // bfdot z22.s, z4.h, z3.h[2]\n"
- ".inst 0x647b4097 // bfdot z23.s, z4.h, z3.h[3]\n"
- ".inst 0x646240ac // bfdot z12.s, z5.h, z2.h[0]\n"
- ".inst 0x646a40ad // bfdot z13.s, z5.h, z2.h[1]\n"
- ".inst 0x647240ae // bfdot z14.s, z5.h, z2.h[2]\n"
- ".inst 0x647a40af // bfdot z15.s, z5.h, z2.h[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- ".inst 0x646340b8 // bfdot z24.s, z5.h, z3.h[0]\n"
- ".inst 0x646b40b9 // bfdot z25.s, z5.h, z3.h[1]\n"
- ".inst 0x647340ba // bfdot z26.s, z5.h, z3.h[2]\n"
- ".inst 0x647b40bb // bfdot z27.s, z5.h, z3.h[3]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646a40d1 // bfdot z17.s, z6.h, z2.h[1]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- ".inst 0x646340dc // bfdot z28.s, z6.h, z3.h[0]\n"
- ".inst 0x646b40dd // bfdot z29.s, z6.h, z3.h[1]\n"
- ".inst 0x647340de // bfdot z30.s, z6.h, z3.h[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n"
- "4:\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "st1w z25.s, p0, [%[c_ptr]]\n"
- "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "ld1h { z4.h }, p0/Z, [x22]\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "ld1h { z6.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
+ ".inst 0x6468408b // bfdot z11.s, z4.h, z0.h[1]\n"
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #32]\n"
+ ".inst 0x6470408e // bfdot z14.s, z4.h, z0.h[2]\n"
+ ".inst 0x64784091 // bfdot z17.s, z4.h, z0.h[3]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
+ ".inst 0x64694097 // bfdot z23.s, z4.h, z1.h[1]\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x6471409a // bfdot z26.s, z4.h, z1.h[2]\n"
+ ".inst 0x6479409d // bfdot z29.s, z4.h, z1.h[3]\n"
+ "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x646040a9 // bfdot z9.s, z5.h, z0.h[0]\n"
+ ".inst 0x646840ac // bfdot z12.s, z5.h, z0.h[1]\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x647040af // bfdot z15.s, z5.h, z0.h[2]\n"
+ ".inst 0x647840b2 // bfdot z18.s, z5.h, z0.h[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x646140b5 // bfdot z21.s, z5.h, z1.h[0]\n"
+ ".inst 0x646940b8 // bfdot z24.s, z5.h, z1.h[1]\n"
+ ".inst 0x647140bb // bfdot z27.s, z5.h, z1.h[2]\n"
+ ".inst 0x647940be // bfdot z30.s, z5.h, z1.h[3]\n"
+ "ld1h { z5.h }, p0/Z, [x22, #4, MUL VL]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646840cd // bfdot z13.s, z6.h, z0.h[1]\n"
+ ".inst 0x647040d0 // bfdot z16.s, z6.h, z0.h[2]\n"
+ ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x646140d6 // bfdot z22.s, z6.h, z1.h[0]\n"
+ ".inst 0x646940d9 // bfdot z25.s, z6.h, z1.h[1]\n"
+ ".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n"
+ ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
+ "ld1h { z2.h }, p0/Z, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #6\n"
+ ".inst 0x64634088 // bfdot z8.s, z4.h, z3.h[0]\n"
+ ".inst 0x646b408b // bfdot z11.s, z4.h, z3.h[1]\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ ".inst 0x6473408e // bfdot z14.s, z4.h, z3.h[2]\n"
+ ".inst 0x647b4091 // bfdot z17.s, z4.h, z3.h[3]\n"
+ ".inst 0x64674094 // bfdot z20.s, z4.h, z7.h[0]\n"
+ ".inst 0x646f4097 // bfdot z23.s, z4.h, z7.h[1]\n"
+ ".inst 0x6477409a // bfdot z26.s, z4.h, z7.h[2]\n"
+ ".inst 0x647f409d // bfdot z29.s, z4.h, z7.h[3]\n"
+ "ld1h { z4.h }, p0/Z, [x22]\n"
+ ".inst 0x646340a9 // bfdot z9.s, z5.h, z3.h[0]\n"
+ ".inst 0x646b40ac // bfdot z12.s, z5.h, z3.h[1]\n"
+ ".inst 0x647340af // bfdot z15.s, z5.h, z3.h[2]\n"
+ ".inst 0x647b40b2 // bfdot z18.s, z5.h, z3.h[3]\n"
+ ".inst 0x646740b5 // bfdot z21.s, z5.h, z7.h[0]\n"
+ ".inst 0x646f40b8 // bfdot z24.s, z5.h, z7.h[1]\n"
+ ".inst 0x647740bb // bfdot z27.s, z5.h, z7.h[2]\n"
+ ".inst 0x647f40be // bfdot z30.s, z5.h, z7.h[3]\n"
+ "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6463404a // bfdot z10.s, z2.h, z3.h[0]\n"
+ ".inst 0x646b404d // bfdot z13.s, z2.h, z3.h[1]\n"
+ ".inst 0x64734050 // bfdot z16.s, z2.h, z3.h[2]\n"
+ ".inst 0x647b4053 // bfdot z19.s, z2.h, z3.h[3]\n"
+ ".inst 0x64674056 // bfdot z22.s, z2.h, z7.h[0]\n"
+ ".inst 0x646f4059 // bfdot z25.s, z2.h, z7.h[1]\n"
+ ".inst 0x6477405c // bfdot z28.s, z2.h, z7.h[2]\n"
+ ".inst 0x647f405f // bfdot z31.s, z2.h, z7.h[3]\n"
+ "ld1h { z6.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
+ ".inst 0x6468408b // bfdot z11.s, z4.h, z0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ ".inst 0x6470408e // bfdot z14.s, z4.h, z0.h[2]\n"
+ ".inst 0x64784091 // bfdot z17.s, z4.h, z0.h[3]\n"
+ "addvl x22, x22, #3\n"
+ ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
+ ".inst 0x64694097 // bfdot z23.s, z4.h, z1.h[1]\n"
+ ".inst 0x6471409a // bfdot z26.s, z4.h, z1.h[2]\n"
+ ".inst 0x6479409d // bfdot z29.s, z4.h, z1.h[3]\n"
+ ".inst 0x646040a9 // bfdot z9.s, z5.h, z0.h[0]\n"
+ ".inst 0x646840ac // bfdot z12.s, z5.h, z0.h[1]\n"
+ ".inst 0x647040af // bfdot z15.s, z5.h, z0.h[2]\n"
+ ".inst 0x647840b2 // bfdot z18.s, z5.h, z0.h[3]\n"
+ ".inst 0x646140b5 // bfdot z21.s, z5.h, z1.h[0]\n"
+ ".inst 0x646940b8 // bfdot z24.s, z5.h, z1.h[1]\n"
+ ".inst 0x647140bb // bfdot z27.s, z5.h, z1.h[2]\n"
+ ".inst 0x647940be // bfdot z30.s, z5.h, z1.h[3]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646840cd // bfdot z13.s, z6.h, z0.h[1]\n"
+ ".inst 0x647040d0 // bfdot z16.s, z6.h, z0.h[2]\n"
+ ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
+ ".inst 0x646140d6 // bfdot z22.s, z6.h, z1.h[0]\n"
+ ".inst 0x646940d9 // bfdot z25.s, z6.h, z1.h[1]\n"
+ ".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n"
+ ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
+ "cbz x20, 5f\n"
+ "ld1rqh { z4.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #16]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ld1h { z2.h }, p0/Z, [x22]\n"
+ "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x64644048 // bfdot z8.s, z2.h, z4.h[0]\n"
+ "ld1h { z0.h }, p0/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x646c404b // bfdot z11.s, z2.h, z4.h[1]\n"
+ ".inst 0x6474404e // bfdot z14.s, z2.h, z4.h[2]\n"
+ ".inst 0x647c4051 // bfdot z17.s, z2.h, z4.h[3]\n"
+ ".inst 0x64634054 // bfdot z20.s, z2.h, z3.h[0]\n"
+ "addvl x22, x22, #3\n"
+ ".inst 0x646b4057 // bfdot z23.s, z2.h, z3.h[1]\n"
+ ".inst 0x6473405a // bfdot z26.s, z2.h, z3.h[2]\n"
+ ".inst 0x647b405d // bfdot z29.s, z2.h, z3.h[3]\n"
+ ".inst 0x64644029 // bfdot z9.s, z1.h, z4.h[0]\n"
+ ".inst 0x646c402c // bfdot z12.s, z1.h, z4.h[1]\n"
+ ".inst 0x6474402f // bfdot z15.s, z1.h, z4.h[2]\n"
+ ".inst 0x647c4032 // bfdot z18.s, z1.h, z4.h[3]\n"
+ ".inst 0x64634035 // bfdot z21.s, z1.h, z3.h[0]\n"
+ ".inst 0x646b4038 // bfdot z24.s, z1.h, z3.h[1]\n"
+ ".inst 0x6473403b // bfdot z27.s, z1.h, z3.h[2]\n"
+ ".inst 0x647b403e // bfdot z30.s, z1.h, z3.h[3]\n"
+ ".inst 0x6464400a // bfdot z10.s, z0.h, z4.h[0]\n"
+ ".inst 0x646c400d // bfdot z13.s, z0.h, z4.h[1]\n"
+ ".inst 0x64744010 // bfdot z16.s, z0.h, z4.h[2]\n"
+ ".inst 0x647c4013 // bfdot z19.s, z0.h, z4.h[3]\n"
+ ".inst 0x64634016 // bfdot z22.s, z0.h, z3.h[0]\n"
+ ".inst 0x646b4019 // bfdot z25.s, z0.h, z3.h[1]\n"
+ ".inst 0x6473401c // bfdot z28.s, z0.h, z3.h[2]\n"
+ ".inst 0x647b401f // bfdot z31.s, z0.h, z3.h[3]\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x23, x23, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
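
The rewritten generic.cpp above replaces the old per-block C++ loops with a single __asm__ block that runs both the height (ablocks) and width (bblocks) loops itself. Loop-invariant parameters are packed into a local KernelArgs struct and their offsets are handed to the asm as immediate operands, so the code can reload them with a plain ldr at each loop head (e.g. "ldr x20, [%x[args_ptr], %[offsetof_K]]"). The stored K is pre-biased to (K/2) - 1 because the main loop consumes two K-steps per iteration and any leftover step is handled after the "main loop skip" label. A reduced, AArch64-only sketch of the argument-passing mechanism (function and register names here are illustrative, not from the patch):

    #include <cstddef>

    struct KernelArgs {
        size_t K = {};
        const void *Bpanel = {};
        size_t bblocks = {};
    };

    void load_args_example(const void *Bpanel, size_t bblocks, size_t K) {
        KernelArgs ka;
        ka.K = K;
        ka.Bpanel = Bpanel;
        ka.bblocks = bblocks;

        __asm__ __volatile__(
            // offsetof() yields compile-time constants, so the offsets can be
            // passed with the "I" (immediate) constraint and folded into ldr.
            "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
            "ldr x21, [%x[args_ptr], %[offsetof_Bpanel]]\n"
            "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
            :
            : [args_ptr] "r" (&ka),
              [offsetof_K] "I" (offsetof(KernelArgs, K)),
              [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)),
              [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
            : "cc", "memory", "x20", "x21", "x22");
    }
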
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
index 2889dd7f0f..c5096ff4ba 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,51 +22,86 @@
* SOFTWARE.
*/
#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
-#ifdef __ARM_FEATURE_SVE
-
-#include "../bfloat.hpp"
#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, \
+ float *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void sve_interleaved_bf16fp32_mmla_8x3VL( ARGLIST );
-class cls_sve_interleaved_bf16fp32_mmla_8x3VL {
+class cls_sve_interleaved_bf16fp32_mmla_8x3VL
+{
public:
typedef bfloat16 operand_type;
typedef float result_type;
- typedef void (*kern_type)(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
- static unsigned int out_width()
+ static constexpr unsigned int out_height()
{
- return get_vector_length<float>() * 3;
+ return 8;
}
- static unsigned int out_height()
+ static unsigned int out_width()
{
- return 8;
+ return get_vector_length<float>() * 3;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 4;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 6, 4, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 4, 2, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.41, 4.30, 7.14 };
+ case CPUModel::A510:
+ return { 7.78, 4.01, 2.43 };
+ case CPUModel::V1:
+ return { 62.50, 5.09, 11.32 };
+ }
+ }
+
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 30.86, 2.36, 5.28 };
+ case CPUModel::A510:
+ return { 7.75, 2.47, 2.39 };
+ case CPUModel::V1:
+ return { 47.63, 5.11, 6.80 };
+ }
+ }
+ return { 1.0 };
+ }
+
- kern_type kernel=sve_interleaved_bf16fp32_mmla_8x3VL;
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_bf16fp32_mmla_8x3VL;
cls_sve_interleaved_bf16fp32_mmla_8x3VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
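
get_performance_parameters(), added to each of these headers, is what lets the arm_gemm method selection compare candidate kernels on a given CPU instead of taking the first one whose guard matches. Going by ../performance_parameters.hpp (now included at the top of the hunk), the braced triples are throughput estimates and the single-value "return { 1.0 };" is the no-information fallback. The field names below are an assumption taken from that header, not something this hunk shows:

    // Assumed layout of PerformanceParameters, per performance_parameters.hpp.
    struct PerformanceParameters {
        float kernel_macs_cycle;            // MACs per cycle of the inner kernel
        float prepare_bytes_cycle = 0.0f;   // panel-preparation bandwidth
        float merge_bytes_cycle   = 0.0f;   // result-merge bandwidth

        PerformanceParameters(float k) : kernel_macs_cycle(k) {}
        PerformanceParameters(float k, float p, float m)
            : kernel_macs_cycle(k), prepare_bytes_cycle(p), merge_bytes_cycle(m) {}
    };

Under that reading, "return { 62.50, 5.09, 11.32 };" for the V1 says this bfloat16 MMLA kernel is worth roughly twice the generic estimate on that core, which is what tips selection in its favour.
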
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
index e43404e608..ba7185752a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,377 +21,277 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include <cstddef>
#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const bfloat16 *a_ptr = Apanel;
- float *c_ptr = Cpanel;
+void sve_interleaved_bf16fp32_mmla_8x3VL(
+ const bfloat16 *Apanel,
+ const bfloat16 *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
- K /= 4;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const bfloat16 *a_ptr0 = a_ptr;
- const bfloat16 *b_ptr = Bpanel;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.h\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1h z4.h, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "mov z17.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z18.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #4\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr]]\n"
- ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
- ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
- ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6464e47e // bfmmla z30.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #0x20]\n"
- ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #5, MUL VL]\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #0x30]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "addvl %[b_ptr], %[b_ptr], #12\n"
- ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- ".inst 0x6467e47b // bfmmla z27.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
- ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
- ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e432 // bfmmla z18.s, z1.h, z6.h\n"
- ".inst 0x6466e458 // bfmmla z24.s, z2.h, z6.h\n"
- ".inst 0x6466e47e // bfmmla z30.s, z3.h, z6.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
- ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr]]\n"
- ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
- ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6464e47e // bfmmla z30.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #0x20]\n"
- ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #5, MUL VL]\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #0x30]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "addvl %[b_ptr], %[b_ptr], #14\n"
- ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-8, MUL VL]\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- ".inst 0x6467e47b // bfmmla z27.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-7, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
- ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
- ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e432 // bfmmla z18.s, z1.h, z6.h\n"
- ".inst 0x6466e458 // bfmmla z24.s, z2.h, z6.h\n"
- ".inst 0x6466e47e // bfmmla z30.s, z3.h, z6.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
- ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
- ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
- "uzp1 z6.d, z14.d, z15.d\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- "uzp1 z7.d, z16.d, z17.d\n"
- ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6464e47e // bfmmla z30.s, z3.h, z4.h\n"
- "uzp2 z4.d, z10.d, z11.d\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
- "uzp1 z0.d, z8.d, z9.d\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
- "uzp1 z1.d, z10.d, z11.d\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- "uzp1 z2.d, z12.d, z13.d\n"
- "uzp1 z0.d, z18.d, z19.d\n"
- ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "uzp2 z3.d, z8.d, z9.d\n"
- "uzp2 z5.d, z12.d, z13.d\n"
- "uzp2 z1.d, z14.d, z15.d\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
- "addvl %[b_ptr], %[b_ptr], #8\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
- ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-8, MUL VL]\n"
- ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-7, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6464e47e // bfmmla z30.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- ".inst 0x6467e47b // bfmmla z27.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n"
- ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
- ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
- ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
- "uzp2 z4.d, z10.d, z11.d\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e432 // bfmmla z18.s, z1.h, z6.h\n"
- ".inst 0x6466e458 // bfmmla z24.s, z2.h, z6.h\n"
- ".inst 0x6466e47e // bfmmla z30.s, z3.h, z6.h\n"
- "uzp1 z6.d, z14.d, z15.d\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "uzp1 z0.d, z8.d, z9.d\n"
- ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
- "uzp1 z1.d, z10.d, z11.d\n"
- "uzp2 z5.d, z12.d, z13.d\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
- "uzp1 z2.d, z12.d, z13.d\n"
- "uzp1 z0.d, z18.d, z19.d\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "uzp2 z1.d, z14.d, z15.d\n"
- ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
- "uzp2 z3.d, z8.d, z9.d\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "uzp1 z7.d, z16.d, z17.d\n"
- "4:\n"
- "uzp2 z2.d, z16.d, z17.d\n"
- "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "uzp2 z3.d, z18.d, z19.d\n"
- "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "uzp1 z4.d, z20.d, z21.d\n"
- "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "uzp1 z5.d, z22.d, z23.d\n"
- "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "uzp1 z6.d, z24.d, z25.d\n"
- "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "uzp2 z7.d, z20.d, z21.d\n"
- "st1w z0.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "uzp2 z0.d, z22.d, z23.d\n"
- "st1w z1.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "uzp2 z1.d, z24.d, z25.d\n"
- "st1w z2.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "uzp1 z2.d, z26.d, z27.d\n"
- "st1w z3.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "uzp1 z3.d, z28.d, z29.d\n"
- "st1w z4.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "uzp1 z4.d, z30.d, z31.d\n"
- "st1w z5.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "uzp2 z5.d, z26.d, z27.d\n"
- "st1w z6.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "uzp2 z6.d, z28.d, z29.d\n"
- "st1w z7.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "uzp2 z7.d, z30.d, z31.d\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "ld1h { z4.h }, p0/Z, [x22]\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "addvl x22, x22, #2\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1rqh { z6.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
+ ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
+ ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
+ "ld1h { z7.h }, p0/Z, [x22]\n"
+ ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
+ ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6464e4da // bfmmla z26.s, z6.h, z4.h\n"
+ ".inst 0x6465e4dd // bfmmla z29.s, z6.h, z5.h\n"
+ "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n"
+ ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n"
+ ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x6467e4db // bfmmla z27.s, z6.h, z7.h\n"
+ ".inst 0x6463e4de // bfmmla z30.s, z6.h, z3.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #4, MUL VL]\n"
+ ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n"
+ ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #16]\n"
+ ".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n"
+ ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #32]\n"
+ ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n"
+ ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n"
+ "ld1h { z7.h }, p0/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x6465e4dc // bfmmla z28.s, z6.h, z5.h\n"
+ ".inst 0x6464e4df // bfmmla z31.s, z6.h, z4.h\n"
+ "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #64]\n"
+ "ld1h { z2.h }, p0/Z, [x22, #6, MUL VL]\n"
+ ".inst 0x6463e408 // bfmmla z8.s, z0.h, z3.h\n"
+ "ld1h { z4.h }, p0/Z, [x22, #7, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6463e42e // bfmmla z14.s, z1.h, z3.h\n"
+ ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
+ ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n"
+ ".inst 0x6467e4b7 // bfmmla z23.s, z5.h, z7.h\n"
+ ".inst 0x6463e4da // bfmmla z26.s, z6.h, z3.h\n"
+ ".inst 0x6467e4dd // bfmmla z29.s, z6.h, z7.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #-8, MUL VL]\n"
+ "ld1h { z7.h }, p0/Z, [x22, #-7, MUL VL]\n"
+ ".inst 0x6462e409 // bfmmla z9.s, z0.h, z2.h\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6462e42f // bfmmla z15.s, z1.h, z2.h\n"
+ ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
+ ".inst 0x6462e4b5 // bfmmla z21.s, z5.h, z2.h\n"
+ ".inst 0x6464e4b8 // bfmmla z24.s, z5.h, z4.h\n"
+ ".inst 0x6462e4db // bfmmla z27.s, z6.h, z2.h\n"
+ ".inst 0x6464e4de // bfmmla z30.s, z6.h, z4.h\n"
+ "ld1h { z4.h }, p0/Z, [x22, #-6, MUL VL]\n"
+ ".inst 0x6463e40a // bfmmla z10.s, z0.h, z3.h\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #80]\n"
+ ".inst 0x6463e430 // bfmmla z16.s, z1.h, z3.h\n"
+ ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #96]\n"
+ ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n"
+ ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
+ "ld1h { z5.h }, p0/Z, [x22, #-5, MUL VL]\n"
+ ".inst 0x6463e4dc // bfmmla z28.s, z6.h, z3.h\n"
+ ".inst 0x6467e4df // bfmmla z31.s, z6.h, z7.h\n"
+ "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #112]\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "addvl x22, x22, #-4\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
+ ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
+ ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
+ "ld1h { z6.h }, p0/Z, [x22]\n"
+ ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
+ ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6464e4fa // bfmmla z26.s, z7.h, z4.h\n"
+ ".inst 0x6465e4fd // bfmmla z29.s, z7.h, z5.h\n"
+ "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0x6466e4fb // bfmmla z27.s, z7.h, z6.h\n"
+ ".inst 0x6463e4fe // bfmmla z30.s, z7.h, z3.h\n"
+ ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n"
+ ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n"
+ ".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n"
+ ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n"
+ ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n"
+ ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n"
+ ".inst 0x6465e4fc // bfmmla z28.s, z7.h, z5.h\n"
+ ".inst 0x6464e4ff // bfmmla z31.s, z7.h, z4.h\n"
+ "cbz x20, 5f\n"
+ "ld1h { z1.h }, p0/Z, [x22]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n"
+ "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1h { z0.h }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x6460e4eb // bfmmla z11.s, z7.h, z0.h\n"
+ "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqh { z4.h }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x6461e4ce // bfmmla z14.s, z6.h, z1.h\n"
+ ".inst 0x6460e4d1 // bfmmla z17.s, z6.h, z0.h\n"
+ ".inst 0x6461e4b4 // bfmmla z20.s, z5.h, z1.h\n"
+ "ld1h { z3.h }, p0/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n"
+ ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n"
+ "ld1h { z2.h }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n"
+ "ld1h { z1.h }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1h { z0.h }, p0/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x6463e4e9 // bfmmla z9.s, z7.h, z3.h\n"
+ ".inst 0x6462e4ec // bfmmla z12.s, z7.h, z2.h\n"
+ "addvl x22, x22, #6\n"
+ ".inst 0x6463e4cf // bfmmla z15.s, z6.h, z3.h\n"
+ ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x6463e4b5 // bfmmla z21.s, z5.h, z3.h\n"
+ ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n"
+ ".inst 0x6463e49b // bfmmla z27.s, z4.h, z3.h\n"
+ ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n"
+ ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n"
+ ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n"
+ ".inst 0x6461e4d0 // bfmmla z16.s, z6.h, z1.h\n"
+ ".inst 0x6460e4d3 // bfmmla z19.s, z6.h, z0.h\n"
+ ".inst 0x6461e4b6 // bfmmla z22.s, z5.h, z1.h\n"
+ ".inst 0x6460e4b9 // bfmmla z25.s, z5.h, z0.h\n"
+ ".inst 0x6461e49c // bfmmla z28.s, z4.h, z1.h\n"
+ ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n"
+ "5:" // multiply loop done
+ "uzp1 z0.d, z8.d, z11.d\n"
+ "uzp2 z8.d, z8.d, z11.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z0.d, z9.d, z12.d\n"
+ "uzp2 z9.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "uzp1 z0.d, z10.d, z13.d\n"
+ "uzp2 z10.d, z10.d, z13.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "uzp1 z0.d, z14.d, z17.d\n"
+ "uzp2 z14.d, z14.d, z17.d\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "uzp1 z1.d, z15.d, z18.d\n"
+ "subs x23, x23, #0x1\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "uzp2 z15.d, z15.d, z18.d\n"
+ "uzp1 z17.d, z16.d, z19.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "uzp2 z16.d, z16.d, z19.d\n"
+ "uzp1 z0.d, z20.d, z23.d\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "uzp2 z20.d, z20.d, z23.d\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "uzp1 z23.d, z21.d, z24.d\n"
+ "uzp2 z21.d, z21.d, z24.d\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "uzp1 z19.d, z22.d, z25.d\n"
+ "uzp2 z22.d, z22.d, z25.d\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "uzp1 z18.d, z26.d, z29.d\n"
+ "uzp2 z26.d, z26.d, z29.d\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "uzp1 z17.d, z27.d, z30.d\n"
+ "uzp2 z27.d, z27.d, z30.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "uzp1 z16.d, z28.d, z31.d\n"
+ "uzp2 z28.d, z28.d, z31.d\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
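
One structural difference from the dot-product kernel above: BFMMLA accumulates 2x2 fp32 tiles per 128-bit segment, so a pair of accumulator registers holds the two output rows interleaved in alternating 64-bit lanes. That is why the "multiply loop done" block now runs every accumulator pair through uzp1/uzp2 on .d elements before the st1w stores: uzp1 concatenates the even-numbered lanes of its two inputs, uzp2 the odd-numbered ones. A scalar model of the two instructions for a vector of N 64-bit lanes (illustrative only; the kernel does this entirely in registers):

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Even-indexed lanes of a, then even-indexed lanes of b.
    template <size_t N>
    std::array<uint64_t, N> uzp1(const std::array<uint64_t, N> &a,
                                 const std::array<uint64_t, N> &b) {
        std::array<uint64_t, N> r{};
        for (size_t i = 0; i < N / 2; ++i) {
            r[i]         = a[2 * i];
            r[N / 2 + i] = b[2 * i];
        }
        return r;
    }

    // Odd-indexed lanes of a, then odd-indexed lanes of b.
    template <size_t N>
    std::array<uint64_t, N> uzp2(const std::array<uint64_t, N> &a,
                                 const std::array<uint64_t, N> &b) {
        std::array<uint64_t, N> r{};
        for (size_t i = 0; i < N / 2; ++i) {
            r[i]         = a[2 * i + 1];
            r[N / 2 + i] = b[2 * i + 1];
        }
        return r;
    }

So "uzp1 z0.d, z8.d, z11.d" / "uzp2 z8.d, z8.d, z11.d" in the new store sequence splits the z8/z11 accumulator pair back into its first and second output rows before they are written out.
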
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
index eb946d9dfa..6c54167763 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,51 +22,83 @@
* SOFTWARE.
*/
#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const __fp16 *, const __fp16 *, \
+ __fp16 *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_fp16_mla_8x3VL(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void sve_interleaved_fp16_mla_8x3VL( ARGLIST );
+void sve_interleaved_fp16_mla_8x3VL_a64fx( ARGLIST );
-class cls_sve_interleaved_fp16_mla_8x3VL {
+class cls_sve_interleaved_fp16_mla_8x3VL
+{
public:
typedef __fp16 operand_type;
typedef __fp16 result_type;
- typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
- static unsigned int out_width()
+ static constexpr unsigned int out_height()
{
- return get_vector_length<__fp16>() * 3;
+ return 8;
}
- static unsigned int out_height()
+ static unsigned int out_width()
{
- return 8;
+ return get_vector_length<__fp16>() * 3;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 1;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, __fp16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 15.96, 3.85, 6.91 };
+ case CPUModel::A510:
+ return { 13.84, 2.07, 2.52 };
+ case CPUModel::V1:
+ return { 31.90, 5.15, 10.34 };
+ case CPUModel::A64FX:
+ return { 44.34, 3.23, 7.06 };
+ }
+ }
+
+ return { 1.0 };
+ }
- kern_type kernel=sve_interleaved_fp16_mla_8x3VL;
- cls_sve_interleaved_fp16_mla_8x3VL(const CPUInfo *)
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_fp16_mla_8x3VL;
+ cls_sve_interleaved_fp16_mla_8x3VL(const CPUInfo *ci)
{
-
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_interleaved_fp16_mla_8x3VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
new file mode 100644
index 0000000000..609277d889
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void sve_interleaved_fp16_mla_8x3VL_a64fx(
+ const __fp16 *Apanel,
+ const __fp16 *Bpanel,
+ __fp16 *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const __fp16 *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
+
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
+
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "ld1h { z0.h }, p0/Z, [x22]\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "fmla z8.h, p0/M, z0.h, z3.h\n"
+ "fmla z9.h, p0/M, z1.h, z3.h\n"
+ "sub x20, x20, #0x2\n"
+ "fmla z10.h, p0/M, z2.h, z3.h\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z11.h, p0/M, z0.h, z4.h\n"
+ "fmla z12.h, p0/M, z1.h, z4.h\n"
+ "fmla z13.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n"
+ "fmla z14.h, p0/M, z0.h, z5.h\n"
+ "fmla z15.h, p0/M, z1.h, z5.h\n"
+ "cmp x20, #0x2\n"
+ "fmla z16.h, p0/M, z2.h, z5.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z17.h, p0/M, z0.h, z6.h\n"
+ "fmla z18.h, p0/M, z1.h, z6.h\n"
+ "fmla z19.h, p0/M, z2.h, z6.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z0.h, z7.h\n"
+ "fmla z21.h, p0/M, z1.h, z7.h\n"
+ "fmla z22.h, p0/M, z2.h, z7.h\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z23.h, p0/M, z0.h, z4.h\n"
+ "fmla z24.h, p0/M, z1.h, z4.h\n"
+ "fmla z25.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #18]\n"
+ "fmla z26.h, p0/M, z0.h, z3.h\n"
+ "fmla z27.h, p0/M, z1.h, z3.h\n"
+ "fmla z28.h, p0/M, z2.h, z3.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z29.h, p0/M, z0.h, z5.h\n"
+ "ld1h { z6.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "fmla z30.h, p0/M, z1.h, z5.h\n"
+ "fmla z31.h, p0/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1h { z5.h }, p0/Z, [x22, #5, MUL VL]\n"
+ "fmla z8.h, p0/M, z6.h, z7.h\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #22]\n"
+ "fmla z9.h, p0/M, z2.h, z7.h\n"
+ "fmla z10.h, p0/M, z5.h, z7.h\n"
+ "fmla z11.h, p0/M, z6.h, z4.h\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z12.h, p0/M, z2.h, z4.h\n"
+ "fmla z13.h, p0/M, z5.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #26]\n"
+ "fmla z14.h, p0/M, z6.h, z3.h\n"
+ "fmla z15.h, p0/M, z2.h, z3.h\n"
+ "addvl x22, x22, #6\n"
+ "fmla z16.h, p0/M, z5.h, z3.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z17.h, p0/M, z6.h, z1.h\n"
+ "fmla z18.h, p0/M, z2.h, z1.h\n"
+ "fmla z19.h, p0/M, z5.h, z1.h\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #30]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z20.h, p0/M, z6.h, z7.h\n"
+ "fmla z21.h, p0/M, z2.h, z7.h\n"
+ "fmla z22.h, p0/M, z5.h, z7.h\n"
+ "fmla z23.h, p0/M, z6.h, z4.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "fmla z24.h, p0/M, z2.h, z4.h\n"
+ "fmla z25.h, p0/M, z5.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
+ "fmla z26.h, p0/M, z6.h, z0.h\n"
+ "fmla z27.h, p0/M, z2.h, z0.h\n"
+ "fmla z28.h, p0/M, z5.h, z0.h\n"
+ "fmla z29.h, p0/M, z6.h, z1.h\n"
+ "ld1h { z0.h }, p0/Z, [x22]\n"
+ "fmla z30.h, p0/M, z2.h, z1.h\n"
+ "fmla z31.h, p0/M, z5.h, z1.h\n"
+ "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "fmla z8.h, p0/M, z0.h, z3.h\n"
+ "fmla z9.h, p0/M, z1.h, z3.h\n"
+ "addvl x22, x22, #3\n"
+ "fmla z10.h, p0/M, z2.h, z3.h\n"
+ "ld1rh { z7.h }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z11.h, p0/M, z0.h, z4.h\n"
+ "fmla z12.h, p0/M, z1.h, z4.h\n"
+ "fmla z13.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n"
+ "fmla z14.h, p0/M, z0.h, z5.h\n"
+ "fmla z15.h, p0/M, z1.h, z5.h\n"
+ "fmla z16.h, p0/M, z2.h, z5.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z17.h, p0/M, z0.h, z6.h\n"
+ "fmla z18.h, p0/M, z1.h, z6.h\n"
+ "fmla z19.h, p0/M, z2.h, z6.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z0.h, z7.h\n"
+ "fmla z21.h, p0/M, z1.h, z7.h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla z22.h, p0/M, z2.h, z7.h\n"
+ "fmla z23.h, p0/M, z0.h, z4.h\n"
+ "fmla z24.h, p0/M, z1.h, z4.h\n"
+ "fmla z25.h, p0/M, z2.h, z4.h\n"
+ "fmla z26.h, p0/M, z0.h, z5.h\n"
+ "fmla z27.h, p0/M, z1.h, z5.h\n"
+ "fmla z28.h, p0/M, z2.h, z5.h\n"
+ "fmla z29.h, p0/M, z0.h, z3.h\n"
+ "fmla z30.h, p0/M, z1.h, z3.h\n"
+ "fmla z31.h, p0/M, z2.h, z3.h\n"
+ "cbz x20, 5f\n"
+ "ld1h { z6.h }, p0/Z, [x22]\n"
+ "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "fmla z8.h, p0/M, z6.h, z3.h\n"
+ "ld1rh { z2.h }, p0/Z, [%x[Apanel], #2]\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #4]\n"
+ "fmla z9.h, p0/M, z5.h, z3.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #6]\n"
+ "fmla z10.h, p0/M, z4.h, z3.h\n"
+ "fmla z11.h, p0/M, z6.h, z2.h\n"
+ "fmla z12.h, p0/M, z5.h, z2.h\n"
+ "fmla z13.h, p0/M, z4.h, z2.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z14.h, p0/M, z6.h, z1.h\n"
+ "fmla z15.h, p0/M, z5.h, z1.h\n"
+ "ld1rh { z2.h }, p0/Z, [%x[Apanel], #10]\n"
+ "fmla z16.h, p0/M, z4.h, z1.h\n"
+ "fmla z17.h, p0/M, z6.h, z0.h\n"
+ "ld1rh { z1.h }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z18.h, p0/M, z5.h, z0.h\n"
+ "fmla z19.h, p0/M, z4.h, z0.h\n"
+ "ld1rh { z0.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z6.h, z3.h\n"
+ "fmla z21.h, p0/M, z5.h, z3.h\n"
+ "addvl x22, x22, #3\n"
+ "fmla z22.h, p0/M, z4.h, z3.h\n"
+ "fmla z23.h, p0/M, z6.h, z2.h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla z24.h, p0/M, z5.h, z2.h\n"
+ "fmla z25.h, p0/M, z4.h, z2.h\n"
+ "fmla z26.h, p0/M, z6.h, z1.h\n"
+ "fmla z27.h, p0/M, z5.h, z1.h\n"
+ "fmla z28.h, p0/M, z4.h, z1.h\n"
+ "fmla z29.h, p0/M, z6.h, z0.h\n"
+ "fmla z30.h, p0/M, z5.h, z0.h\n"
+ "fmla z31.h, p0/M, z4.h, z0.h\n"
+ "5:" // multiply loop done
+ "st1h { z8.h }, p0, [%x[Cpanel]]\n"
+ "subs x23, x23, #0x1\n"
+ "st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1h { z12.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1h { z13.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1h { z14.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1h { z15.h }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1h { z16.h }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1h { z17.h }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1h { z18.h }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1h { z19.h }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1h { z20.h }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1h { z21.h }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1h { z22.h }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1h { z23.h }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1h { z24.h }, p0, [%x[Cpanel]]\n"
+ "st1h { z25.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1h { z26.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1h { z28.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1h { z29.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1h { z30.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1h { z31.h }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
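
The A64FX variant above avoids indexed FMLA entirely: each A element is broadcast with ld1rh and multiplied with the plain predicated form (fmla z8.h, p0/M, z0.h, z3.h), whereas the generic kernel below keeps ld1rqh quadword loads plus the indexed form (fmla z8.h, z2.h, z0.h[0]). Arithmetically both entry points compute the same 8 x (3*VL) output tile per (ablock, bblock) pair, with the 24 accumulators z8..z31 stored row-major. A minimal scalar sketch of that tile, assuming VL fp16 lanes per SVE vector (illustrative only, AArch64-only because of __fp16, and accumulating in float purely for readability; the hardware FMLA accumulates in fp16):

    // Scalar model of one 8 x (3*VL) tile of
    // sve_interleaved_fp16_mla_8x3VL_a64fx: Apanel supplies 8 broadcast
    // values per k-step, Bpanel supplies 3 vectors per k-step, and the
    // tile overwrites (does not accumulate into) Cpanel.
    void interleaved_fp16_mla_8x3VL_ref(const __fp16 *Apanel,
                                        const __fp16 *Bpanel,
                                        __fp16 *Cpanel, int K, int VL) {
        const int W = 3 * VL;
        for (int i = 0; i < 8; i++) {
            for (int j = 0; j < W; j++) {
                float acc = 0.0f;
                for (int k = 0; k < K; k++) {
                    acc += float(Apanel[k * 8 + i]) * float(Bpanel[k * W + j]);
                }
                Cpanel[i * W + j] = static_cast<__fp16>(acc);
            }
        }
    }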
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
index 46b8770409..3b16c97e2c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,299 +21,225 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "../../asmlib.hpp"
+#include <cstddef>
namespace arm_gemm {
-void sve_interleaved_fp16_mla_8x3VL(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
- const __fp16 *a_ptr = Apanel;
- __fp16 *c_ptr = Cpanel;
-
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+void sve_interleaved_fp16_mla_8x3VL(
+ const __fp16 *Apanel,
+ const __fp16 *Bpanel,
+ __fp16 *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
- for (int yb=0; yb<ablocks; yb++) {
- const __fp16 *a_ptr0 = a_ptr;
- const __fp16 *b_ptr = Bpanel;
+ struct KernelArgs {
+ size_t K = {};
+ const __fp16 *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
- __asm __volatile (
- "mov z8.h, #0\n"
- "ptrue p0.h\n"
- "mov z9.h, #0\n"
- "mov z10.h, #0\n"
- "mov z11.h, #0\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- "mov z12.h, #0\n"
- "ld1h z2.h, p0/z, [%[b_ptr]]\n"
- "mov z13.h, #0\n"
- "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z14.h, #0\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "mov z15.h, #0\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
- "mov z16.h, #0\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
- "mov z17.h, #0\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- "mov z18.h, #0\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "mov z19.h, #0\n"
- "mov z20.h, #0\n"
- "mov z21.h, #0\n"
- "mov z22.h, #0\n"
- "mov z23.h, #0\n"
- "mov z24.h, #0\n"
- "mov z25.h, #0\n"
- "mov z26.h, #0\n"
- "mov z27.h, #0\n"
- "mov z28.h, #0\n"
- "mov z29.h, #0\n"
- "mov z30.h, #0\n"
- "mov z31.h, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla z8.h, z2.h, z0.h[0]\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z9.h, z2.h, z0.h[1]\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x10]\n"
- "fmla z10.h, z2.h, z0.h[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z11.h, z2.h, z0.h[3]\n"
- "fmla z12.h, z2.h, z0.h[4]\n"
- "fmla z13.h, z2.h, z0.h[5]\n"
- "fmla z14.h, z2.h, z0.h[6]\n"
- "fmla z15.h, z2.h, z0.h[7]\n"
- "ld1h z2.h, p0/z, [%[b_ptr]]\n"
- "fmla z16.h, z3.h, z0.h[0]\n"
- "fmla z17.h, z3.h, z0.h[1]\n"
- "fmla z18.h, z3.h, z0.h[2]\n"
- "fmla z19.h, z3.h, z0.h[3]\n"
- "fmla z20.h, z3.h, z0.h[4]\n"
- "fmla z21.h, z3.h, z0.h[5]\n"
- "fmla z22.h, z3.h, z0.h[6]\n"
- "fmla z23.h, z3.h, z0.h[7]\n"
- "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "fmla z24.h, z4.h, z0.h[0]\n"
- "fmla z25.h, z4.h, z0.h[1]\n"
- "fmla z26.h, z4.h, z0.h[2]\n"
- "fmla z27.h, z4.h, z0.h[3]\n"
- "fmla z28.h, z4.h, z0.h[4]\n"
- "fmla z29.h, z4.h, z0.h[5]\n"
- "fmla z30.h, z4.h, z0.h[6]\n"
- "fmla z31.h, z4.h, z0.h[7]\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "fmla z8.h, z5.h, z1.h[0]\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- "fmla z9.h, z5.h, z1.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- "fmla z10.h, z5.h, z1.h[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "fmla z11.h, z5.h, z1.h[3]\n"
- "fmla z12.h, z5.h, z1.h[4]\n"
- "fmla z13.h, z5.h, z1.h[5]\n"
- "fmla z14.h, z5.h, z1.h[6]\n"
- "fmla z15.h, z5.h, z1.h[7]\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "fmla z16.h, z6.h, z1.h[0]\n"
- "fmla z17.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z1.h[2]\n"
- "fmla z19.h, z6.h, z1.h[3]\n"
- "fmla z20.h, z6.h, z1.h[4]\n"
- "fmla z21.h, z6.h, z1.h[5]\n"
- "fmla z22.h, z6.h, z1.h[6]\n"
- "fmla z23.h, z6.h, z1.h[7]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "fmla z24.h, z7.h, z1.h[0]\n"
- "fmla z25.h, z7.h, z1.h[1]\n"
- "fmla z26.h, z7.h, z1.h[2]\n"
- "fmla z27.h, z7.h, z1.h[3]\n"
- "fmla z28.h, z7.h, z1.h[4]\n"
- "fmla z29.h, z7.h, z1.h[5]\n"
- "fmla z30.h, z7.h, z1.h[6]\n"
- "fmla z31.h, z7.h, z1.h[7]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- "fmla z8.h, z2.h, z0.h[0]\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z9.h, z2.h, z0.h[1]\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x10]\n"
- "fmla z10.h, z2.h, z0.h[2]\n"
- "fmla z11.h, z2.h, z0.h[3]\n"
- "fmla z12.h, z2.h, z0.h[4]\n"
- "fmla z13.h, z2.h, z0.h[5]\n"
- "fmla z14.h, z2.h, z0.h[6]\n"
- "fmla z15.h, z2.h, z0.h[7]\n"
- "ld1h z2.h, p0/z, [%[b_ptr]]\n"
- "fmla z16.h, z3.h, z0.h[0]\n"
- "fmla z17.h, z3.h, z0.h[1]\n"
- "fmla z18.h, z3.h, z0.h[2]\n"
- "fmla z19.h, z3.h, z0.h[3]\n"
- "fmla z20.h, z3.h, z0.h[4]\n"
- "fmla z21.h, z3.h, z0.h[5]\n"
- "fmla z22.h, z3.h, z0.h[6]\n"
- "fmla z23.h, z3.h, z0.h[7]\n"
- "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "fmla z24.h, z4.h, z0.h[0]\n"
- "fmla z25.h, z4.h, z0.h[1]\n"
- "fmla z26.h, z4.h, z0.h[2]\n"
- "fmla z27.h, z4.h, z0.h[3]\n"
- "fmla z28.h, z4.h, z0.h[4]\n"
- "fmla z29.h, z4.h, z0.h[5]\n"
- "fmla z30.h, z4.h, z0.h[6]\n"
- "fmla z31.h, z4.h, z0.h[7]\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "fmla z8.h, z5.h, z1.h[0]\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- "fmla z9.h, z5.h, z1.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x10\n"
- "fmla z10.h, z5.h, z1.h[2]\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "fmla z11.h, z5.h, z1.h[3]\n"
- "fmla z12.h, z5.h, z1.h[4]\n"
- "fmla z13.h, z5.h, z1.h[5]\n"
- "fmla z14.h, z5.h, z1.h[6]\n"
- "fmla z15.h, z5.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z1.h[0]\n"
- "fmla z17.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z1.h[2]\n"
- "fmla z19.h, z6.h, z1.h[3]\n"
- "fmla z20.h, z6.h, z1.h[4]\n"
- "fmla z21.h, z6.h, z1.h[5]\n"
- "fmla z22.h, z6.h, z1.h[6]\n"
- "fmla z23.h, z6.h, z1.h[7]\n"
- "fmla z24.h, z7.h, z1.h[0]\n"
- "fmla z25.h, z7.h, z1.h[1]\n"
- "fmla z26.h, z7.h, z1.h[2]\n"
- "fmla z27.h, z7.h, z1.h[3]\n"
- "fmla z28.h, z7.h, z1.h[4]\n"
- "fmla z29.h, z7.h, z1.h[5]\n"
- "fmla z30.h, z7.h, z1.h[6]\n"
- "fmla z31.h, z7.h, z1.h[7]\n"
- "fmla z8.h, z2.h, z0.h[0]\n"
- "fmla z9.h, z2.h, z0.h[1]\n"
- "fmla z10.h, z2.h, z0.h[2]\n"
- "fmla z11.h, z2.h, z0.h[3]\n"
- "st1h z8.h, p0, [%[c_ptr]]\n"
- "fmla z12.h, z2.h, z0.h[4]\n"
- "fmla z13.h, z2.h, z0.h[5]\n"
- "fmla z14.h, z2.h, z0.h[6]\n"
- "fmla z15.h, z2.h, z0.h[7]\n"
- "fmla z16.h, z3.h, z0.h[0]\n"
- "fmla z17.h, z3.h, z0.h[1]\n"
- "fmla z18.h, z3.h, z0.h[2]\n"
- "fmla z19.h, z3.h, z0.h[3]\n"
- "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
- "fmla z20.h, z3.h, z0.h[4]\n"
- "fmla z21.h, z3.h, z0.h[5]\n"
- "fmla z22.h, z3.h, z0.h[6]\n"
- "fmla z23.h, z3.h, z0.h[7]\n"
- "fmla z24.h, z4.h, z0.h[0]\n"
- "fmla z25.h, z4.h, z0.h[1]\n"
- "fmla z26.h, z4.h, z0.h[2]\n"
- "fmla z27.h, z4.h, z0.h[3]\n"
- "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
- "fmla z28.h, z4.h, z0.h[4]\n"
- "fmla z29.h, z4.h, z0.h[5]\n"
- "fmla z30.h, z4.h, z0.h[6]\n"
- "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
- "fmla z31.h, z4.h, z0.h[7]\n"
- "b 4f\n"
- "3:\n"
- "fmla z8.h, z2.h, z0.h[0]\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z9.h, z2.h, z0.h[1]\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x10]\n"
- "fmla z10.h, z2.h, z0.h[2]\n"
- "fmla z11.h, z2.h, z0.h[3]\n"
- "fmla z12.h, z2.h, z0.h[4]\n"
- "fmla z13.h, z2.h, z0.h[5]\n"
- "fmla z14.h, z2.h, z0.h[6]\n"
- "fmla z15.h, z2.h, z0.h[7]\n"
- "fmla z16.h, z3.h, z0.h[0]\n"
- "fmla z17.h, z3.h, z0.h[1]\n"
- "fmla z18.h, z3.h, z0.h[2]\n"
- "fmla z19.h, z3.h, z0.h[3]\n"
- "fmla z20.h, z3.h, z0.h[4]\n"
- "fmla z21.h, z3.h, z0.h[5]\n"
- "fmla z22.h, z3.h, z0.h[6]\n"
- "fmla z23.h, z3.h, z0.h[7]\n"
- "fmla z24.h, z4.h, z0.h[0]\n"
- "fmla z25.h, z4.h, z0.h[1]\n"
- "fmla z26.h, z4.h, z0.h[2]\n"
- "fmla z27.h, z4.h, z0.h[3]\n"
- "fmla z28.h, z4.h, z0.h[4]\n"
- "fmla z29.h, z4.h, z0.h[5]\n"
- "fmla z30.h, z4.h, z0.h[6]\n"
- "fmla z31.h, z4.h, z0.h[7]\n"
- "fmla z8.h, z5.h, z1.h[0]\n"
- "fmla z9.h, z5.h, z1.h[1]\n"
- "fmla z10.h, z5.h, z1.h[2]\n"
- "fmla z11.h, z5.h, z1.h[3]\n"
- "st1h z8.h, p0, [%[c_ptr]]\n"
- "fmla z12.h, z5.h, z1.h[4]\n"
- "fmla z13.h, z5.h, z1.h[5]\n"
- "fmla z14.h, z5.h, z1.h[6]\n"
- "fmla z15.h, z5.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z1.h[0]\n"
- "fmla z17.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z1.h[2]\n"
- "fmla z19.h, z6.h, z1.h[3]\n"
- "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
- "fmla z20.h, z6.h, z1.h[4]\n"
- "fmla z21.h, z6.h, z1.h[5]\n"
- "fmla z22.h, z6.h, z1.h[6]\n"
- "fmla z23.h, z6.h, z1.h[7]\n"
- "fmla z24.h, z7.h, z1.h[0]\n"
- "fmla z25.h, z7.h, z1.h[1]\n"
- "fmla z26.h, z7.h, z1.h[2]\n"
- "fmla z27.h, z7.h, z1.h[3]\n"
- "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
- "fmla z28.h, z7.h, z1.h[4]\n"
- "fmla z29.h, z7.h, z1.h[5]\n"
- "fmla z30.h, z7.h, z1.h[6]\n"
- "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
- "fmla z31.h, z7.h, z1.h[7]\n"
- "4:\n"
- "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "st1h z26.h, p0, [%[c_ptr], #-8, MUL VL]\n"
- "st1h z11.h, p0, [%[c_ptr], #-7, MUL VL]\n"
- "st1h z19.h, p0, [%[c_ptr], #-6, MUL VL]\n"
- "st1h z27.h, p0, [%[c_ptr], #-5, MUL VL]\n"
- "st1h z12.h, p0, [%[c_ptr], #-4, MUL VL]\n"
- "st1h z20.h, p0, [%[c_ptr], #-3, MUL VL]\n"
- "st1h z28.h, p0, [%[c_ptr], #-2, MUL VL]\n"
- "st1h z13.h, p0, [%[c_ptr], #-1, MUL VL]\n"
- "st1h z21.h, p0, [%[c_ptr]]\n"
- "st1h z29.h, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1h z14.h, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1h z22.h, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1h z30.h, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1h z15.h, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1h z23.h, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1h z31.h, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "ld1h { z2.h }, p0/Z, [x22]\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "fmla z8.h, z2.h, z0.h[0]\n"
+ "fmla z11.h, z2.h, z0.h[1]\n"
+ "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z14.h, z2.h, z0.h[2]\n"
+ "fmla z17.h, z2.h, z0.h[3]\n"
+ "ld1h { z6.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "fmla z20.h, z2.h, z0.h[4]\n"
+ "fmla z23.h, z2.h, z0.h[5]\n"
+ "ld1h { z5.h }, p0/Z, [x22, #4, MUL VL]\n"
+ "fmla z26.h, z2.h, z0.h[6]\n"
+ "fmla z29.h, z2.h, z0.h[7]\n"
+ "ld1h { z1.h }, p0/Z, [x22, #5, MUL VL]\n"
+ "fmla z9.h, z3.h, z0.h[0]\n"
+ "fmla z12.h, z3.h, z0.h[1]\n"
+ "addvl x22, x22, #6\n"
+ "fmla z15.h, z3.h, z0.h[2]\n"
+ "fmla z18.h, z3.h, z0.h[3]\n"
+ "sub x20, x20, #0x2\n"
+ "fmla z21.h, z3.h, z0.h[4]\n"
+ "fmla z24.h, z3.h, z0.h[5]\n"
+ "cmp x20, #0x2\n"
+ "fmla z27.h, z3.h, z0.h[6]\n"
+ "fmla z30.h, z3.h, z0.h[7]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z10.h, z4.h, z0.h[0]\n"
+ "fmla z13.h, z4.h, z0.h[1]\n"
+ "ld1h { z2.h }, p0/Z, [x22]\n"
+ "fmla z16.h, z4.h, z0.h[2]\n"
+ "fmla z19.h, z4.h, z0.h[3]\n"
+ "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "fmla z22.h, z4.h, z0.h[4]\n"
+ "fmla z25.h, z4.h, z0.h[5]\n"
+ "fmla z28.h, z4.h, z0.h[6]\n"
+ "fmla z31.h, z4.h, z0.h[7]\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "fmla z8.h, z6.h, z7.h[0]\n"
+ "fmla z11.h, z6.h, z7.h[1]\n"
+ "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "fmla z14.h, z6.h, z7.h[2]\n"
+ "fmla z17.h, z6.h, z7.h[3]\n"
+ "fmla z20.h, z6.h, z7.h[4]\n"
+ "fmla z23.h, z6.h, z7.h[5]\n"
+ "fmla z26.h, z6.h, z7.h[6]\n"
+ "fmla z29.h, z6.h, z7.h[7]\n"
+ "fmla z9.h, z5.h, z7.h[0]\n"
+ "fmla z12.h, z5.h, z7.h[1]\n"
+ "fmla z15.h, z5.h, z7.h[2]\n"
+ "fmla z18.h, z5.h, z7.h[3]\n"
+ "fmla z21.h, z5.h, z7.h[4]\n"
+ "fmla z24.h, z5.h, z7.h[5]\n"
+ "fmla z27.h, z5.h, z7.h[6]\n"
+ "fmla z30.h, z5.h, z7.h[7]\n"
+ "fmla z10.h, z1.h, z7.h[0]\n"
+ "fmla z13.h, z1.h, z7.h[1]\n"
+ "fmla z16.h, z1.h, z7.h[2]\n"
+ "fmla z19.h, z1.h, z7.h[3]\n"
+ "fmla z22.h, z1.h, z7.h[4]\n"
+ "fmla z25.h, z1.h, z7.h[5]\n"
+ "fmla z28.h, z1.h, z7.h[6]\n"
+ "fmla z31.h, z1.h, z7.h[7]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "fmla z8.h, z2.h, z0.h[0]\n"
+ "fmla z11.h, z2.h, z0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla z14.h, z2.h, z0.h[2]\n"
+ "fmla z17.h, z2.h, z0.h[3]\n"
+ "addvl x22, x22, #3\n"
+ "fmla z20.h, z2.h, z0.h[4]\n"
+ "fmla z23.h, z2.h, z0.h[5]\n"
+ "fmla z26.h, z2.h, z0.h[6]\n"
+ "fmla z29.h, z2.h, z0.h[7]\n"
+ "fmla z9.h, z3.h, z0.h[0]\n"
+ "fmla z12.h, z3.h, z0.h[1]\n"
+ "fmla z15.h, z3.h, z0.h[2]\n"
+ "fmla z18.h, z3.h, z0.h[3]\n"
+ "fmla z21.h, z3.h, z0.h[4]\n"
+ "fmla z24.h, z3.h, z0.h[5]\n"
+ "fmla z27.h, z3.h, z0.h[6]\n"
+ "fmla z30.h, z3.h, z0.h[7]\n"
+ "fmla z10.h, z4.h, z0.h[0]\n"
+ "fmla z13.h, z4.h, z0.h[1]\n"
+ "fmla z16.h, z4.h, z0.h[2]\n"
+ "fmla z19.h, z4.h, z0.h[3]\n"
+ "fmla z22.h, z4.h, z0.h[4]\n"
+ "fmla z25.h, z4.h, z0.h[5]\n"
+ "fmla z28.h, z4.h, z0.h[6]\n"
+ "fmla z31.h, z4.h, z0.h[7]\n"
+ "cbz x20, 5f\n"
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1h { z2.h }, p0/Z, [x22]\n"
+ "fmla z8.h, z2.h, z3.h[0]\n"
+ "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z0.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "fmla z11.h, z2.h, z3.h[1]\n"
+ "fmla z14.h, z2.h, z3.h[2]\n"
+ "fmla z17.h, z2.h, z3.h[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla z20.h, z2.h, z3.h[4]\n"
+ "fmla z23.h, z2.h, z3.h[5]\n"
+ "addvl x22, x22, #3\n"
+ "fmla z26.h, z2.h, z3.h[6]\n"
+ "fmla z29.h, z2.h, z3.h[7]\n"
+ "fmla z9.h, z1.h, z3.h[0]\n"
+ "fmla z12.h, z1.h, z3.h[1]\n"
+ "fmla z15.h, z1.h, z3.h[2]\n"
+ "fmla z18.h, z1.h, z3.h[3]\n"
+ "fmla z21.h, z1.h, z3.h[4]\n"
+ "fmla z24.h, z1.h, z3.h[5]\n"
+ "fmla z27.h, z1.h, z3.h[6]\n"
+ "fmla z30.h, z1.h, z3.h[7]\n"
+ "fmla z10.h, z0.h, z3.h[0]\n"
+ "fmla z13.h, z0.h, z3.h[1]\n"
+ "fmla z16.h, z0.h, z3.h[2]\n"
+ "fmla z19.h, z0.h, z3.h[3]\n"
+ "fmla z22.h, z0.h, z3.h[4]\n"
+ "fmla z25.h, z0.h, z3.h[5]\n"
+ "fmla z28.h, z0.h, z3.h[6]\n"
+ "fmla z31.h, z0.h, z3.h[7]\n"
+ "5:" // multiply loop done
+ "st1h { z8.h }, p0, [%x[Cpanel]]\n"
+ "subs x23, x23, #0x1\n"
+ "st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1h { z12.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1h { z13.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1h { z14.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1h { z15.h }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1h { z16.h }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1h { z17.h }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1h { z18.h }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1h { z19.h }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1h { z20.h }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1h { z21.h }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1h { z22.h }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1h { z23.h }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1h { z24.h }, p0, [%x[Cpanel]]\n"
+ "st1h { z25.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1h { z26.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1h { z28.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1h { z29.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1h { z30.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1h { z31.h }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
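
The rewrite also changes how K is counted. The old kernel precomputed loops = (K/2) - 1 plus a K % 2 tail in C++; the new one stores K - 1 in the args struct and does all trip accounting in the asm: a two-k-step main loop while at least two steps remain, one unconditional step at label 4, and one extra step when "cbz x20, 5f" is not taken. In outline (a control-flow sketch, not shipped code; it applies equally to the a64fx and generic bodies, which both unroll by two):

    // K >= 1 assumed; k_unroll() == 1 for this kernel.
    int remaining = K - 1;      // ka.K, loaded into x20
    while (remaining >= 2) {    // "3:" main loop head: two k-steps
        remaining -= 2;
    }
    // "4:" main loop skip: one unconditional k-step
    if (remaining != 0) {
        // final k-step ("cbz x20, 5f" not taken)
    }
    // total k-steps executed == K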
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
index b84ba83b6a..23ab7ce10a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,51 +22,83 @@
* SOFTWARE.
*/
#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const float *, const float *, \
+ float *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_fp32_mla_8x3VL(const float *, const float *, float *, int, int, int);
+void sve_interleaved_fp32_mla_8x3VL( ARGLIST );
+void sve_interleaved_fp32_mla_8x3VL_a64fx( ARGLIST );
-class cls_sve_interleaved_fp32_mla_8x3VL {
+class cls_sve_interleaved_fp32_mla_8x3VL
+{
public:
typedef float operand_type;
typedef float result_type;
- typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
- static unsigned int out_width()
+ static constexpr unsigned int out_height()
{
- return get_vector_length<float>() * 3;
+ return 8;
}
- static unsigned int out_height()
+ static unsigned int out_width()
{
- return 8;
+ return get_vector_length<float>() * 3;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 1;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=sve_interleaved_fp32_mla_8x3VL;
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 7.2307, 3.876, 2.932 };
+ case CPUModel::A64FX:
+ return { 26.52, 3.42, 4.59 };
+ case CPUModel::A510:
+ return { 6.25, 3.84, 2.47 };
+ case CPUModel::V1:
+ return { 15.15, 9.24, 6.42 };
+ }
+ }
+
+ return { 1.0 };
+ }
- cls_sve_interleaved_fp32_mla_8x3VL(const CPUInfo *)
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_fp32_mla_8x3VL;
+ cls_sve_interleaved_fp32_mla_8x3VL(const CPUInfo *ci)
{
-
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_interleaved_fp32_mla_8x3VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
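
The header now selects the kernel entry point per CPU model at construction time, and exposes per-model throughput triples via get_performance_parameters() for use when choosing between GEMM strategies. A self-contained mock of the dispatch idiom (CPUModel, CPUInfo and the kernel names below are stand-ins for the library's real types, kept only to show the shape of the pattern):

    #include <cstdio>

    enum class CPUModel { GENERIC, A64FX };
    struct CPUInfo {
        CPUModel model;
        CPUModel get_cpu_model() const { return model; }
    };

    using kern_type = void (*)(int);
    static void kernel_generic(int) { std::puts("generic"); }
    static void kernel_a64fx(int)   { std::puts("a64fx"); }

    struct cls_kernel {
        kern_type kernel = kernel_generic;  // default to the generic kernel
        explicit cls_kernel(const CPUInfo *ci) {
            switch (ci->get_cpu_model()) {
                default: break;
                case CPUModel::A64FX: kernel = kernel_a64fx; break;
            }
        }
    };

    int main() {
        CPUInfo ci{CPUModel::A64FX};
        cls_kernel k(&ci);
        k.kernel(0);  // prints "a64fx": the _a64fx entry point was selected
    }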
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
new file mode 100644
index 0000000000..0b13913717
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void sve_interleaved_fp32_mla_8x3VL_a64fx(
+ const float *Apanel,
+ const float *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const float *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
+
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
+
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "ld1w { z0.s }, p0/Z, [x22]\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "ld1w { z2.s }, p0/Z, [x22, #2, MUL VL]\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "fmla z8.s, p0/M, z0.s, z3.s\n"
+ "fmla z9.s, p0/M, z1.s, z3.s\n"
+ "sub x20, x20, #0x2\n"
+ "fmla z10.s, p0/M, z2.s, z3.s\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z11.s, p0/M, z0.s, z4.s\n"
+ "fmla z12.s, p0/M, z1.s, z4.s\n"
+ "fmla z13.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z14.s, p0/M, z0.s, z5.s\n"
+ "fmla z15.s, p0/M, z1.s, z5.s\n"
+ "cmp x20, #0x2\n"
+ "fmla z16.s, p0/M, z2.s, z5.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z17.s, p0/M, z0.s, z6.s\n"
+ "fmla z18.s, p0/M, z1.s, z6.s\n"
+ "fmla z19.s, p0/M, z2.s, z6.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z0.s, z7.s\n"
+ "fmla z21.s, p0/M, z1.s, z7.s\n"
+ "fmla z22.s, p0/M, z2.s, z7.s\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n"
+ "fmla z23.s, p0/M, z0.s, z4.s\n"
+ "fmla z24.s, p0/M, z1.s, z4.s\n"
+ "fmla z25.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
+ "fmla z26.s, p0/M, z0.s, z3.s\n"
+ "fmla z27.s, p0/M, z1.s, z3.s\n"
+ "fmla z28.s, p0/M, z2.s, z3.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n"
+ "fmla z29.s, p0/M, z0.s, z5.s\n"
+ "ld1w { z6.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "fmla z30.s, p0/M, z1.s, z5.s\n"
+ "fmla z31.s, p0/M, z2.s, z5.s\n"
+ "ld1w { z2.s }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z5.s }, p0/Z, [x22, #5, MUL VL]\n"
+ "fmla z8.s, p0/M, z6.s, z7.s\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
+ "fmla z9.s, p0/M, z2.s, z7.s\n"
+ "fmla z10.s, p0/M, z5.s, z7.s\n"
+ "fmla z11.s, p0/M, z6.s, z4.s\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+ "fmla z12.s, p0/M, z2.s, z4.s\n"
+ "fmla z13.s, p0/M, z5.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
+ "fmla z14.s, p0/M, z6.s, z3.s\n"
+ "fmla z15.s, p0/M, z2.s, z3.s\n"
+ "addvl x22, x22, #6\n"
+ "fmla z16.s, p0/M, z5.s, z3.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
+ "fmla z17.s, p0/M, z6.s, z1.s\n"
+ "fmla z18.s, p0/M, z2.s, z1.s\n"
+ "fmla z19.s, p0/M, z5.s, z1.s\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "fmla z20.s, p0/M, z6.s, z7.s\n"
+ "fmla z21.s, p0/M, z2.s, z7.s\n"
+ "fmla z22.s, p0/M, z5.s, z7.s\n"
+ "fmla z23.s, p0/M, z6.s, z4.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "fmla z24.s, p0/M, z2.s, z4.s\n"
+ "fmla z25.s, p0/M, z5.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "fmla z26.s, p0/M, z6.s, z0.s\n"
+ "fmla z27.s, p0/M, z2.s, z0.s\n"
+ "fmla z28.s, p0/M, z5.s, z0.s\n"
+ "fmla z29.s, p0/M, z6.s, z1.s\n"
+ "ld1w { z0.s }, p0/Z, [x22]\n"
+ "fmla z30.s, p0/M, z2.s, z1.s\n"
+ "fmla z31.s, p0/M, z5.s, z1.s\n"
+ "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z2.s }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "fmla z8.s, p0/M, z0.s, z3.s\n"
+ "fmla z9.s, p0/M, z1.s, z3.s\n"
+ "addvl x22, x22, #3\n"
+ "fmla z10.s, p0/M, z2.s, z3.s\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z11.s, p0/M, z0.s, z4.s\n"
+ "fmla z12.s, p0/M, z1.s, z4.s\n"
+ "fmla z13.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z14.s, p0/M, z0.s, z5.s\n"
+ "fmla z15.s, p0/M, z1.s, z5.s\n"
+ "fmla z16.s, p0/M, z2.s, z5.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z17.s, p0/M, z0.s, z6.s\n"
+ "fmla z18.s, p0/M, z1.s, z6.s\n"
+ "fmla z19.s, p0/M, z2.s, z6.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z0.s, z7.s\n"
+ "fmla z21.s, p0/M, z1.s, z7.s\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z22.s, p0/M, z2.s, z7.s\n"
+ "fmla z23.s, p0/M, z0.s, z4.s\n"
+ "fmla z24.s, p0/M, z1.s, z4.s\n"
+ "fmla z25.s, p0/M, z2.s, z4.s\n"
+ "fmla z26.s, p0/M, z0.s, z5.s\n"
+ "fmla z27.s, p0/M, z1.s, z5.s\n"
+ "fmla z28.s, p0/M, z2.s, z5.s\n"
+ "fmla z29.s, p0/M, z0.s, z3.s\n"
+ "fmla z30.s, p0/M, z1.s, z3.s\n"
+ "fmla z31.s, p0/M, z2.s, z3.s\n"
+ "cbz x20, 5f\n"
+ "ld1w { z6.s }, p0/Z, [x22]\n"
+ "ld1w { z5.s }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z4.s }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "fmla z8.s, p0/M, z6.s, z3.s\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z9.s, p0/M, z5.s, z3.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z10.s, p0/M, z4.s, z3.s\n"
+ "fmla z11.s, p0/M, z6.s, z2.s\n"
+ "fmla z12.s, p0/M, z5.s, z2.s\n"
+ "fmla z13.s, p0/M, z4.s, z2.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z14.s, p0/M, z6.s, z1.s\n"
+ "fmla z15.s, p0/M, z5.s, z1.s\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z16.s, p0/M, z4.s, z1.s\n"
+ "fmla z17.s, p0/M, z6.s, z0.s\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z18.s, p0/M, z5.s, z0.s\n"
+ "fmla z19.s, p0/M, z4.s, z0.s\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z6.s, z3.s\n"
+ "fmla z21.s, p0/M, z5.s, z3.s\n"
+ "addvl x22, x22, #3\n"
+ "fmla z22.s, p0/M, z4.s, z3.s\n"
+ "fmla z23.s, p0/M, z6.s, z2.s\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z24.s, p0/M, z5.s, z2.s\n"
+ "fmla z25.s, p0/M, z4.s, z2.s\n"
+ "fmla z26.s, p0/M, z6.s, z1.s\n"
+ "fmla z27.s, p0/M, z5.s, z1.s\n"
+ "fmla z28.s, p0/M, z4.s, z1.s\n"
+ "fmla z29.s, p0/M, z6.s, z0.s\n"
+ "fmla z30.s, p0/M, z5.s, z0.s\n"
+ "fmla z31.s, p0/M, z4.s, z0.s\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x23, x23, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
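
The fp32 A64FX kernel is the fp16 one transposed to word elements: ld1rw/ld1w replace ld1rh/ld1h, broadcast offsets step by 4 bytes instead of 2, and the A-panel pointer advances 0x40 bytes per main-loop pass. That constant is just the two unrolled k-steps of eight float broadcasts each, as the following check (an illustrative assertion, not part of the patch) spells out:

    static_assert(2 * 8 * sizeof(float) == 0x40,
                  "two k-steps x eight rows of fp32 per main-loop pass");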
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
index 1e05a308b5..c7f32ff7a9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,308 +21,229 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "../../asmlib.hpp"
+#include <cstddef>
namespace arm_gemm {
-void sve_interleaved_fp32_mla_8x3VL(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+void sve_interleaved_fp32_mla_8x3VL(
+ const float *Apanel,
+ const float *Bpanel,
+ float *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
+ struct KernelArgs {
+ size_t K = {};
+ const float *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.s\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1w z4.s, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z17.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "mov z18.s, #0\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla z8.s, z4.s, z0.s[0]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z9.s, z4.s, z0.s[1]\n"
- "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
- "fmla z10.s, z4.s, z0.s[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z11.s, z4.s, z0.s[3]\n"
- "fmla z20.s, z4.s, z1.s[0]\n"
- "fmla z21.s, z4.s, z1.s[1]\n"
- "fmla z22.s, z4.s, z1.s[2]\n"
- "fmla z23.s, z4.s, z1.s[3]\n"
- "ld1w z4.s, p0/z, [%[b_ptr]]\n"
- "fmla z12.s, z5.s, z0.s[0]\n"
- "fmla z13.s, z5.s, z0.s[1]\n"
- "fmla z14.s, z5.s, z0.s[2]\n"
- "fmla z15.s, z5.s, z0.s[3]\n"
- "fmla z24.s, z5.s, z1.s[0]\n"
- "fmla z25.s, z5.s, z1.s[1]\n"
- "fmla z26.s, z5.s, z1.s[2]\n"
- "fmla z27.s, z5.s, z1.s[3]\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "fmla z16.s, z6.s, z0.s[0]\n"
- "fmla z17.s, z6.s, z0.s[1]\n"
- "fmla z18.s, z6.s, z0.s[2]\n"
- "fmla z19.s, z6.s, z0.s[3]\n"
- "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
- "fmla z28.s, z6.s, z1.s[0]\n"
- "fmla z29.s, z6.s, z1.s[1]\n"
- "fmla z30.s, z6.s, z1.s[2]\n"
- "fmla z31.s, z6.s, z1.s[3]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "fmla z8.s, z4.s, z2.s[0]\n"
- "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
- "fmla z9.s, z4.s, z2.s[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "fmla z10.s, z4.s, z2.s[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "fmla z11.s, z4.s, z2.s[3]\n"
- "fmla z20.s, z4.s, z3.s[0]\n"
- "fmla z21.s, z4.s, z3.s[1]\n"
- "fmla z22.s, z4.s, z3.s[2]\n"
- "fmla z23.s, z4.s, z3.s[3]\n"
- "ld1w z4.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "fmla z12.s, z5.s, z2.s[0]\n"
- "fmla z13.s, z5.s, z2.s[1]\n"
- "fmla z14.s, z5.s, z2.s[2]\n"
- "fmla z15.s, z5.s, z2.s[3]\n"
- "fmla z24.s, z5.s, z3.s[0]\n"
- "fmla z25.s, z5.s, z3.s[1]\n"
- "fmla z26.s, z5.s, z3.s[2]\n"
- "fmla z27.s, z5.s, z3.s[3]\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z17.s, z6.s, z2.s[1]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z19.s, z6.s, z2.s[3]\n"
- "ld1rqw z2.s, p0/z, [%[a_ptr], #-0x20]\n"
- "fmla z28.s, z6.s, z3.s[0]\n"
- "fmla z29.s, z6.s, z3.s[1]\n"
- "fmla z30.s, z6.s, z3.s[2]\n"
- "fmla z31.s, z6.s, z3.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- "fmla z8.s, z4.s, z0.s[0]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z9.s, z4.s, z0.s[1]\n"
- "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
- "fmla z10.s, z4.s, z0.s[2]\n"
- "fmla z11.s, z4.s, z0.s[3]\n"
- "fmla z20.s, z4.s, z1.s[0]\n"
- "fmla z21.s, z4.s, z1.s[1]\n"
- "fmla z22.s, z4.s, z1.s[2]\n"
- "fmla z23.s, z4.s, z1.s[3]\n"
- "ld1w z4.s, p0/z, [%[b_ptr]]\n"
- "fmla z12.s, z5.s, z0.s[0]\n"
- "fmla z13.s, z5.s, z0.s[1]\n"
- "fmla z14.s, z5.s, z0.s[2]\n"
- "fmla z15.s, z5.s, z0.s[3]\n"
- "fmla z24.s, z5.s, z1.s[0]\n"
- "fmla z25.s, z5.s, z1.s[1]\n"
- "fmla z26.s, z5.s, z1.s[2]\n"
- "fmla z27.s, z5.s, z1.s[3]\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "fmla z16.s, z6.s, z0.s[0]\n"
- "fmla z17.s, z6.s, z0.s[1]\n"
- "fmla z18.s, z6.s, z0.s[2]\n"
- "fmla z19.s, z6.s, z0.s[3]\n"
- "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
- "fmla z28.s, z6.s, z1.s[0]\n"
- "fmla z29.s, z6.s, z1.s[1]\n"
- "fmla z30.s, z6.s, z1.s[2]\n"
- "fmla z31.s, z6.s, z1.s[3]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "fmla z8.s, z4.s, z2.s[0]\n"
- "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
- "fmla z9.s, z4.s, z2.s[1]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- "fmla z10.s, z4.s, z2.s[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "fmla z11.s, z4.s, z2.s[3]\n"
- "fmla z20.s, z4.s, z3.s[0]\n"
- "fmla z21.s, z4.s, z3.s[1]\n"
- "fmla z22.s, z4.s, z3.s[2]\n"
- "fmla z23.s, z4.s, z3.s[3]\n"
- "ld1w z4.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "fmla z12.s, z5.s, z2.s[0]\n"
- "fmla z13.s, z5.s, z2.s[1]\n"
- "fmla z14.s, z5.s, z2.s[2]\n"
- "fmla z15.s, z5.s, z2.s[3]\n"
- "fmla z24.s, z5.s, z3.s[0]\n"
- "fmla z25.s, z5.s, z3.s[1]\n"
- "fmla z26.s, z5.s, z3.s[2]\n"
- "fmla z27.s, z5.s, z3.s[3]\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z17.s, z6.s, z2.s[1]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z19.s, z6.s, z2.s[3]\n"
- "fmla z28.s, z6.s, z3.s[0]\n"
- "fmla z29.s, z6.s, z3.s[1]\n"
- "fmla z30.s, z6.s, z3.s[2]\n"
- "fmla z31.s, z6.s, z3.s[3]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z8.s, z4.s, z0.s[0]\n"
- "fmla z9.s, z4.s, z0.s[1]\n"
- "fmla z10.s, z4.s, z0.s[2]\n"
- "fmla z11.s, z4.s, z0.s[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- "fmla z20.s, z4.s, z1.s[0]\n"
- "fmla z21.s, z4.s, z1.s[1]\n"
- "fmla z22.s, z4.s, z1.s[2]\n"
- "fmla z23.s, z4.s, z1.s[3]\n"
- "fmla z12.s, z5.s, z0.s[0]\n"
- "fmla z13.s, z5.s, z0.s[1]\n"
- "fmla z14.s, z5.s, z0.s[2]\n"
- "fmla z15.s, z5.s, z0.s[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "fmla z24.s, z5.s, z1.s[0]\n"
- "fmla z25.s, z5.s, z1.s[1]\n"
- "fmla z26.s, z5.s, z1.s[2]\n"
- "fmla z27.s, z5.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z0.s[0]\n"
- "fmla z17.s, z6.s, z0.s[1]\n"
- "fmla z18.s, z6.s, z0.s[2]\n"
- "fmla z19.s, z6.s, z0.s[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "fmla z28.s, z6.s, z1.s[0]\n"
- "fmla z29.s, z6.s, z1.s[1]\n"
- "fmla z30.s, z6.s, z1.s[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "fmla z31.s, z6.s, z1.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla z8.s, z4.s, z0.s[0]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z9.s, z4.s, z0.s[1]\n"
- "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
- "fmla z10.s, z4.s, z0.s[2]\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "fmla z11.s, z4.s, z0.s[3]\n"
- "fmla z20.s, z4.s, z1.s[0]\n"
- "fmla z21.s, z4.s, z1.s[1]\n"
- "fmla z22.s, z4.s, z1.s[2]\n"
- "fmla z23.s, z4.s, z1.s[3]\n"
- "ld1w z4.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "fmla z12.s, z5.s, z0.s[0]\n"
- "fmla z13.s, z5.s, z0.s[1]\n"
- "fmla z14.s, z5.s, z0.s[2]\n"
- "fmla z15.s, z5.s, z0.s[3]\n"
- "fmla z24.s, z5.s, z1.s[0]\n"
- "fmla z25.s, z5.s, z1.s[1]\n"
- "fmla z26.s, z5.s, z1.s[2]\n"
- "fmla z27.s, z5.s, z1.s[3]\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "fmla z16.s, z6.s, z0.s[0]\n"
- "fmla z17.s, z6.s, z0.s[1]\n"
- "fmla z18.s, z6.s, z0.s[2]\n"
- "fmla z19.s, z6.s, z0.s[3]\n"
- "fmla z28.s, z6.s, z1.s[0]\n"
- "fmla z29.s, z6.s, z1.s[1]\n"
- "fmla z30.s, z6.s, z1.s[2]\n"
- "fmla z31.s, z6.s, z1.s[3]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z8.s, z4.s, z2.s[0]\n"
- "fmla z9.s, z4.s, z2.s[1]\n"
- "fmla z10.s, z4.s, z2.s[2]\n"
- "fmla z11.s, z4.s, z2.s[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- "fmla z20.s, z4.s, z3.s[0]\n"
- "fmla z21.s, z4.s, z3.s[1]\n"
- "fmla z22.s, z4.s, z3.s[2]\n"
- "fmla z23.s, z4.s, z3.s[3]\n"
- "fmla z12.s, z5.s, z2.s[0]\n"
- "fmla z13.s, z5.s, z2.s[1]\n"
- "fmla z14.s, z5.s, z2.s[2]\n"
- "fmla z15.s, z5.s, z2.s[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "fmla z24.s, z5.s, z3.s[0]\n"
- "fmla z25.s, z5.s, z3.s[1]\n"
- "fmla z26.s, z5.s, z3.s[2]\n"
- "fmla z27.s, z5.s, z3.s[3]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z17.s, z6.s, z2.s[1]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z19.s, z6.s, z2.s[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "fmla z28.s, z6.s, z3.s[0]\n"
- "fmla z29.s, z6.s, z3.s[1]\n"
- "fmla z30.s, z6.s, z3.s[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "fmla z31.s, z6.s, z3.s[3]\n"
- "4:\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "st1w z25.s, p0, [%[c_ptr]]\n"
- "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "ld1w { z4.s }, p0/Z, [x22]\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "ld1w { z5.s }, p0/Z, [x22, #1, MUL VL]\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "ld1w { z6.s }, p0/Z, [x22, #2, MUL VL]\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "fmla z8.s, z4.s, z0.s[0]\n"
+ "fmla z11.s, z4.s, z0.s[1]\n"
+ "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
+ "fmla z14.s, z4.s, z0.s[2]\n"
+ "fmla z17.s, z4.s, z0.s[3]\n"
+ "ld1rqw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+ "fmla z20.s, z4.s, z1.s[0]\n"
+ "fmla z23.s, z4.s, z1.s[1]\n"
+ "sub x20, x20, #0x2\n"
+ "fmla z26.s, z4.s, z1.s[2]\n"
+ "fmla z29.s, z4.s, z1.s[3]\n"
+ "ld1w { z4.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "fmla z9.s, z5.s, z0.s[0]\n"
+ "fmla z12.s, z5.s, z0.s[1]\n"
+ "cmp x20, #0x2\n"
+ "fmla z15.s, z5.s, z0.s[2]\n"
+ "fmla z18.s, z5.s, z0.s[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "fmla z21.s, z5.s, z1.s[0]\n"
+ "fmla z24.s, z5.s, z1.s[1]\n"
+ "fmla z27.s, z5.s, z1.s[2]\n"
+ "fmla z30.s, z5.s, z1.s[3]\n"
+ "ld1w { z5.s }, p0/Z, [x22, #4, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z13.s, z6.s, z0.s[1]\n"
+ "fmla z16.s, z6.s, z0.s[2]\n"
+ "fmla z19.s, z6.s, z0.s[3]\n"
+ "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
+ "fmla z22.s, z6.s, z1.s[0]\n"
+ "fmla z25.s, z6.s, z1.s[1]\n"
+ "fmla z28.s, z6.s, z1.s[2]\n"
+ "fmla z31.s, z6.s, z1.s[3]\n"
+ "ld1w { z2.s }, p0/Z, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #6\n"
+ "fmla z8.s, z4.s, z3.s[0]\n"
+ "fmla z11.s, z4.s, z3.s[1]\n"
+ "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z14.s, z4.s, z3.s[2]\n"
+ "fmla z17.s, z4.s, z3.s[3]\n"
+ "fmla z20.s, z4.s, z7.s[0]\n"
+ "fmla z23.s, z4.s, z7.s[1]\n"
+ "fmla z26.s, z4.s, z7.s[2]\n"
+ "fmla z29.s, z4.s, z7.s[3]\n"
+ "ld1w { z4.s }, p0/Z, [x22]\n"
+ "fmla z9.s, z5.s, z3.s[0]\n"
+ "fmla z12.s, z5.s, z3.s[1]\n"
+ "fmla z15.s, z5.s, z3.s[2]\n"
+ "fmla z18.s, z5.s, z3.s[3]\n"
+ "fmla z21.s, z5.s, z7.s[0]\n"
+ "fmla z24.s, z5.s, z7.s[1]\n"
+ "fmla z27.s, z5.s, z7.s[2]\n"
+ "fmla z30.s, z5.s, z7.s[3]\n"
+ "ld1w { z5.s }, p0/Z, [x22, #1, MUL VL]\n"
+ "fmla z10.s, z2.s, z3.s[0]\n"
+ "fmla z13.s, z2.s, z3.s[1]\n"
+ "fmla z16.s, z2.s, z3.s[2]\n"
+ "fmla z19.s, z2.s, z3.s[3]\n"
+ "fmla z22.s, z2.s, z7.s[0]\n"
+ "fmla z25.s, z2.s, z7.s[1]\n"
+ "fmla z28.s, z2.s, z7.s[2]\n"
+ "fmla z31.s, z2.s, z7.s[3]\n"
+ "ld1w { z6.s }, p0/Z, [x22, #2, MUL VL]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "fmla z8.s, z4.s, z0.s[0]\n"
+ "fmla z11.s, z4.s, z0.s[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z14.s, z4.s, z0.s[2]\n"
+ "fmla z17.s, z4.s, z0.s[3]\n"
+ "addvl x22, x22, #3\n"
+ "fmla z20.s, z4.s, z1.s[0]\n"
+ "fmla z23.s, z4.s, z1.s[1]\n"
+ "fmla z26.s, z4.s, z1.s[2]\n"
+ "fmla z29.s, z4.s, z1.s[3]\n"
+ "fmla z9.s, z5.s, z0.s[0]\n"
+ "fmla z12.s, z5.s, z0.s[1]\n"
+ "fmla z15.s, z5.s, z0.s[2]\n"
+ "fmla z18.s, z5.s, z0.s[3]\n"
+ "fmla z21.s, z5.s, z1.s[0]\n"
+ "fmla z24.s, z5.s, z1.s[1]\n"
+ "fmla z27.s, z5.s, z1.s[2]\n"
+ "fmla z30.s, z5.s, z1.s[3]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z13.s, z6.s, z0.s[1]\n"
+ "fmla z16.s, z6.s, z0.s[2]\n"
+ "fmla z19.s, z6.s, z0.s[3]\n"
+ "fmla z22.s, z6.s, z1.s[0]\n"
+ "fmla z25.s, z6.s, z1.s[1]\n"
+ "fmla z28.s, z6.s, z1.s[2]\n"
+ "fmla z31.s, z6.s, z1.s[3]\n"
+ "cbz x20, 5f\n"
+ "ld1rqw { z4.s }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ld1w { z2.s }, p0/Z, [x22]\n"
+ "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
+ "fmla z8.s, z2.s, z4.s[0]\n"
+ "ld1w { z0.s }, p0/Z, [x22, #2, MUL VL]\n"
+ "fmla z11.s, z2.s, z4.s[1]\n"
+ "fmla z14.s, z2.s, z4.s[2]\n"
+ "fmla z17.s, z2.s, z4.s[3]\n"
+ "fmla z20.s, z2.s, z3.s[0]\n"
+ "addvl x22, x22, #3\n"
+ "fmla z23.s, z2.s, z3.s[1]\n"
+ "fmla z26.s, z2.s, z3.s[2]\n"
+ "fmla z29.s, z2.s, z3.s[3]\n"
+ "fmla z9.s, z1.s, z4.s[0]\n"
+ "fmla z12.s, z1.s, z4.s[1]\n"
+ "fmla z15.s, z1.s, z4.s[2]\n"
+ "fmla z18.s, z1.s, z4.s[3]\n"
+ "fmla z21.s, z1.s, z3.s[0]\n"
+ "fmla z24.s, z1.s, z3.s[1]\n"
+ "fmla z27.s, z1.s, z3.s[2]\n"
+ "fmla z30.s, z1.s, z3.s[3]\n"
+ "fmla z10.s, z0.s, z4.s[0]\n"
+ "fmla z13.s, z0.s, z4.s[1]\n"
+ "fmla z16.s, z0.s, z4.s[2]\n"
+ "fmla z19.s, z0.s, z4.s[3]\n"
+ "fmla z22.s, z0.s, z3.s[0]\n"
+ "fmla z25.s, z0.s, z3.s[1]\n"
+ "fmla z28.s, z0.s, z3.s[2]\n"
+ "fmla z31.s, z0.s, z3.s[3]\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x23, x23, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
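
Common to all of these rewrites is the KernelArgs block: instead of five "+r" pointer/counter operands, the asm receives one args pointer plus compile-time field offsets through "I" constraints, and reloads x20-x23 itself at each loop level, which is what lets the single asm region drive the Height and Width loops that used to be C++ for-loops. A minimal standalone illustration of that idiom (AArch64-only; load_K is a made-up name for this sketch):

    #include <cstddef>
    #include <cstdio>

    struct KernelArgs {
        size_t K;
        const float *Bpanel;
        size_t bblocks;
    };

    // Load one field through the same args_ptr/offsetof pattern the
    // kernels use, so the asm body binds only a single register.
    static size_t load_K(const KernelArgs *ka) {
        size_t out;
        __asm__ __volatile__(
            "ldr %x[out], [%x[args_ptr], %[offsetof_K]]\n"
            : [out] "=r"(out)
            : [args_ptr] "r"(ka), [offsetof_K] "I"(offsetof(KernelArgs, K))
            : "memory");
        return out;
    }

    int main() {
        KernelArgs ka{42, nullptr, 3};
        std::printf("%zu\n", load_K(&ka));  // prints 42
    }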
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp
index 96216960ff..a355262fe2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
@@ -69,4 +69,4 @@ public:
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp
index 39daf0ff20..a50cd95157 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../../asmlib.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
index 3e16915cd4..cf3069f828 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,52 +22,97 @@
* SOFTWARE.
*/
#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
-#ifdef __ARM_FEATURE_SVE
-
-#include <cstdint>
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const int8_t *, const int8_t *, \
+ int32_t *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_s8s32_dot_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void sve_interleaved_s8s32_dot_8x3VL( ARGLIST );
+void sve_interleaved_s8s32_dot_8x3VL_a64fx( ARGLIST );
-class cls_sve_interleaved_s8s32_dot_8x3VL {
+class cls_sve_interleaved_s8s32_dot_8x3VL
+{
public:
typedef int8_t operand_type;
typedef int32_t result_type;
- typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
- static unsigned int out_width()
+ static constexpr unsigned int out_height()
{
- return get_vector_length<int32_t>() * 3;
+ return 8;
}
- static unsigned int out_height()
+ static unsigned int out_width()
{
- return 8;
+ return get_vector_length<int32_t>() * 3;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 4;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=sve_interleaved_s8s32_dot_8x3VL;
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.66, 4.10, 7.99 };
+ case CPUModel::V1:
+ return { 63.30, 4.97, 11.35 };
+ case CPUModel::A510:
+ return { 27.42, 3.47, 2.88 };
+ case CPUModel::A64FX:
+ return { 109.18, 3.88, 7.85 };
+ }
+ }
- cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *)
- {
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.67, 3.57, 0.50 };
+ case CPUModel::V1:
+ return { 52.24, 7.49, 0.80 };
+ case CPUModel::A510:
+ return { 27.47, 1.70, 0.28 };
+ case CPUModel::A64FX:
+ return { 109.92, 2.36, 0.41 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_s8s32_dot_8x3VL;
+ cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_interleaved_s8s32_dot_8x3VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
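The reworked header declares both a generic and an A64FX-specific kernel, defaults `kernel` to the generic implementation, and switches it in the constructor on CPUInfo::get_cpu_model(); get_performance_parameters() additionally returns per-model heuristic cost figures for the GEMM method selector (the meaning of the three numbers is not spelled out in this patch). A simplified, self-contained sketch of that dispatch pattern, with stand-in CPUModel/CPUInfo types and trivial functions replacing the real kernels:

    #include <cstdio>

    enum class CPUModel { GENERIC, V1, A510, A64FX };  // GENERIC is a stand-in

    struct CPUInfo {
        CPUModel model;
        CPUModel get_cpu_model() const { return model; }
    };

    using kern_type = void (*)();

    void kernel_generic() { std::puts("generic SVE dot kernel"); }
    void kernel_a64fx()   { std::puts("A64FX-tuned SVE dot kernel"); }

    struct cls_kernel {
        kern_type kernel = kernel_generic;   // default to the generic kernel
        explicit cls_kernel(const CPUInfo *ci)
        {
            switch (ci->get_cpu_model()) {
                case CPUModel::A64FX: kernel = kernel_a64fx; break;
                default: break;              // keep the generic implementation
            }
        }
    };

    int main()
    {
        CPUInfo ci{CPUModel::A64FX};
        cls_kernel k(&ci);
        k.kernel();                          // invokes the A64FX variant
        return 0;
    }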
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
new file mode 100644
index 0000000000..c668a7b746
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_interleaved_s8s32_dot_8x3VL_a64fx(
+ const int8_t *Apanel,
+ const int8_t *Bpanel,
+ int32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
+
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
+
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "ld1b { z0.b }, p0/Z, [x22]\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "sdot z8.s, z0.b, z3.b\n"
+ "sdot z9.s, z1.b, z3.b\n"
+ "sub x20, x20, #0x2\n"
+ "sdot z10.s, z2.b, z3.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
+ "sdot z11.s, z0.b, z4.b\n"
+ "sdot z12.s, z1.b, z4.b\n"
+ "sdot z13.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "sdot z14.s, z0.b, z5.b\n"
+ "sdot z15.s, z1.b, z5.b\n"
+ "cmp x20, #0x2\n"
+ "sdot z16.s, z2.b, z5.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n"
+ "sdot z17.s, z0.b, z6.b\n"
+ "sdot z18.s, z1.b, z6.b\n"
+ "sdot z19.s, z2.b, z6.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n"
+ "sdot z20.s, z0.b, z7.b\n"
+ "sdot z21.s, z1.b, z7.b\n"
+ "sdot z22.s, z2.b, z7.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n"
+ "sdot z23.s, z0.b, z4.b\n"
+ "sdot z24.s, z1.b, z4.b\n"
+ "sdot z25.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
+ "sdot z26.s, z0.b, z3.b\n"
+ "sdot z27.s, z1.b, z3.b\n"
+ "sdot z28.s, z2.b, z3.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n"
+ "sdot z29.s, z0.b, z5.b\n"
+ "ld1b { z6.b }, p0/Z, [x22, #3, MUL VL]\n"
+ "sdot z30.s, z1.b, z5.b\n"
+ "sdot z31.s, z2.b, z5.b\n"
+ "ld1b { z2.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z7.b\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
+ "sdot z9.s, z2.b, z7.b\n"
+ "sdot z10.s, z5.b, z7.b\n"
+ "sdot z11.s, z6.b, z4.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+ "sdot z12.s, z2.b, z4.b\n"
+ "sdot z13.s, z5.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
+ "sdot z14.s, z6.b, z3.b\n"
+ "sdot z15.s, z2.b, z3.b\n"
+ "addvl x22, x22, #6\n"
+ "sdot z16.s, z5.b, z3.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
+ "sdot z17.s, z6.b, z1.b\n"
+ "sdot z18.s, z2.b, z1.b\n"
+ "sdot z19.s, z5.b, z1.b\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "sdot z20.s, z6.b, z7.b\n"
+ "sdot z21.s, z2.b, z7.b\n"
+ "sdot z22.s, z5.b, z7.b\n"
+ "sdot z23.s, z6.b, z4.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "sdot z24.s, z2.b, z4.b\n"
+ "sdot z25.s, z5.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "sdot z26.s, z6.b, z0.b\n"
+ "sdot z27.s, z2.b, z0.b\n"
+ "sdot z28.s, z5.b, z0.b\n"
+ "sdot z29.s, z6.b, z1.b\n"
+ "ld1b { z0.b }, p0/Z, [x22]\n"
+ "sdot z30.s, z2.b, z1.b\n"
+ "sdot z31.s, z5.b, z1.b\n"
+ "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "sdot z8.s, z0.b, z3.b\n"
+ "sdot z9.s, z1.b, z3.b\n"
+ "addvl x22, x22, #3\n"
+ "sdot z10.s, z2.b, z3.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
+ "sdot z11.s, z0.b, z4.b\n"
+ "sdot z12.s, z1.b, z4.b\n"
+ "sdot z13.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "sdot z14.s, z0.b, z5.b\n"
+ "sdot z15.s, z1.b, z5.b\n"
+ "sdot z16.s, z2.b, z5.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "sdot z17.s, z0.b, z6.b\n"
+ "sdot z18.s, z1.b, z6.b\n"
+ "sdot z19.s, z2.b, z6.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n"
+ "sdot z20.s, z0.b, z7.b\n"
+ "sdot z21.s, z1.b, z7.b\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "sdot z22.s, z2.b, z7.b\n"
+ "sdot z23.s, z0.b, z4.b\n"
+ "sdot z24.s, z1.b, z4.b\n"
+ "sdot z25.s, z2.b, z4.b\n"
+ "sdot z26.s, z0.b, z5.b\n"
+ "sdot z27.s, z1.b, z5.b\n"
+ "sdot z28.s, z2.b, z5.b\n"
+ "sdot z29.s, z0.b, z3.b\n"
+ "sdot z30.s, z1.b, z3.b\n"
+ "sdot z31.s, z2.b, z3.b\n"
+ "cbz x20, 5f\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "sdot z8.s, z6.b, z3.b\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
+ "sdot z9.s, z5.b, z3.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
+ "sdot z10.s, z4.b, z3.b\n"
+ "sdot z11.s, z6.b, z2.b\n"
+ "sdot z12.s, z5.b, z2.b\n"
+ "sdot z13.s, z4.b, z2.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "sdot z15.s, z5.b, z1.b\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+ "sdot z16.s, z4.b, z1.b\n"
+ "sdot z17.s, z6.b, z0.b\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
+ "sdot z18.s, z5.b, z0.b\n"
+ "sdot z19.s, z4.b, z0.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "sdot z21.s, z5.b, z3.b\n"
+ "addvl x22, x22, #3\n"
+ "sdot z22.s, z4.b, z3.b\n"
+ "sdot z23.s, z6.b, z2.b\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "sdot z24.s, z5.b, z2.b\n"
+ "sdot z25.s, z4.b, z2.b\n"
+ "sdot z26.s, z6.b, z1.b\n"
+ "sdot z27.s, z5.b, z1.b\n"
+ "sdot z28.s, z4.b, z1.b\n"
+ "sdot z29.s, z6.b, z0.b\n"
+ "sdot z30.s, z5.b, z0.b\n"
+ "sdot z31.s, z4.b, z0.b\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x23, x23, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
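The rewritten kernels no longer drive the block loops from C++; instead K, Bpanel and bblocks are packed into a local KernelArgs struct whose address is passed to the inline asm as [args_ptr], and the asm loads each field with ldr using offsetof(...) constants. K is stored pre-decremented as (K/4) - 1 because this dot-product kernel consumes four K values per step. A plain C++ sketch of the argument-struct pattern, printing the offsets instead of feeding them to asm:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    struct KernelArgs {
        size_t K = {};
        const int8_t *Bpanel = {};
        size_t bblocks = {};
    };

    int main()
    {
        int K = 32;                 // total accumulation depth
        static const int8_t B[4] = {};
        KernelArgs ka;
        ka.K = (K / 4) - 1;         // dot kernels unroll K by 4; the asm loop
                                    // counts the remaining iterations
        ka.Bpanel = B;
        ka.bblocks = 3;

        // The inline asm receives &ka plus these offsets as "I" constraints
        // and loads each field with ldr, e.g. [args_ptr, #offsetof_K].
        std::printf("offsetof K=%zu Bpanel=%zu bblocks=%zu, ka.K=%zu\n",
                    offsetof(KernelArgs, K), offsetof(KernelArgs, Bpanel),
                    offsetof(KernelArgs, bblocks), ka.K);
        return 0;
    }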
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
index 674c2400bf..f6e1a75c15 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,309 +21,230 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include <cstddef>
#include <cstdint>
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void sve_interleaved_s8s32_dot_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
- const int8_t *a_ptr = Apanel;
- int32_t *c_ptr = Cpanel;
+void sve_interleaved_s8s32_dot_8x3VL(
+ const int8_t *Apanel,
+ const int8_t *Bpanel,
+ int32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
- K /= 4;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const int8_t *a_ptr0 = a_ptr;
- const int8_t *b_ptr = Bpanel;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.b\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z17.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "mov z18.s, #0\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "sdot z8.s, z4.b, z0.b[0]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "sdot z9.s, z4.b, z0.b[1]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- "sdot z10.s, z4.b, z0.b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z11.s, z4.b, z0.b[3]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "sdot z21.s, z4.b, z1.b[1]\n"
- "sdot z22.s, z4.b, z1.b[2]\n"
- "sdot z23.s, z4.b, z1.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "sdot z12.s, z5.b, z0.b[0]\n"
- "sdot z13.s, z5.b, z0.b[1]\n"
- "sdot z14.s, z5.b, z0.b[2]\n"
- "sdot z15.s, z5.b, z0.b[3]\n"
- "sdot z24.s, z5.b, z1.b[0]\n"
- "sdot z25.s, z5.b, z1.b[1]\n"
- "sdot z26.s, z5.b, z1.b[2]\n"
- "sdot z27.s, z5.b, z1.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "sdot z16.s, z6.b, z0.b[0]\n"
- "sdot z17.s, z6.b, z0.b[1]\n"
- "sdot z18.s, z6.b, z0.b[2]\n"
- "sdot z19.s, z6.b, z0.b[3]\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "sdot z28.s, z6.b, z1.b[0]\n"
- "sdot z29.s, z6.b, z1.b[1]\n"
- "sdot z30.s, z6.b, z1.b[2]\n"
- "sdot z31.s, z6.b, z1.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "sdot z8.s, z4.b, z2.b[0]\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "sdot z9.s, z4.b, z2.b[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "sdot z10.s, z4.b, z2.b[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "sdot z11.s, z4.b, z2.b[3]\n"
- "sdot z20.s, z4.b, z3.b[0]\n"
- "sdot z21.s, z4.b, z3.b[1]\n"
- "sdot z22.s, z4.b, z3.b[2]\n"
- "sdot z23.s, z4.b, z3.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "sdot z12.s, z5.b, z2.b[0]\n"
- "sdot z13.s, z5.b, z2.b[1]\n"
- "sdot z14.s, z5.b, z2.b[2]\n"
- "sdot z15.s, z5.b, z2.b[3]\n"
- "sdot z24.s, z5.b, z3.b[0]\n"
- "sdot z25.s, z5.b, z3.b[1]\n"
- "sdot z26.s, z5.b, z3.b[2]\n"
- "sdot z27.s, z5.b, z3.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z17.s, z6.b, z2.b[1]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z19.s, z6.b, z2.b[3]\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- "sdot z28.s, z6.b, z3.b[0]\n"
- "sdot z29.s, z6.b, z3.b[1]\n"
- "sdot z30.s, z6.b, z3.b[2]\n"
- "sdot z31.s, z6.b, z3.b[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- "sdot z8.s, z4.b, z0.b[0]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "sdot z9.s, z4.b, z0.b[1]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- "sdot z10.s, z4.b, z0.b[2]\n"
- "sdot z11.s, z4.b, z0.b[3]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "sdot z21.s, z4.b, z1.b[1]\n"
- "sdot z22.s, z4.b, z1.b[2]\n"
- "sdot z23.s, z4.b, z1.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "sdot z12.s, z5.b, z0.b[0]\n"
- "sdot z13.s, z5.b, z0.b[1]\n"
- "sdot z14.s, z5.b, z0.b[2]\n"
- "sdot z15.s, z5.b, z0.b[3]\n"
- "sdot z24.s, z5.b, z1.b[0]\n"
- "sdot z25.s, z5.b, z1.b[1]\n"
- "sdot z26.s, z5.b, z1.b[2]\n"
- "sdot z27.s, z5.b, z1.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "sdot z16.s, z6.b, z0.b[0]\n"
- "sdot z17.s, z6.b, z0.b[1]\n"
- "sdot z18.s, z6.b, z0.b[2]\n"
- "sdot z19.s, z6.b, z0.b[3]\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "sdot z28.s, z6.b, z1.b[0]\n"
- "sdot z29.s, z6.b, z1.b[1]\n"
- "sdot z30.s, z6.b, z1.b[2]\n"
- "sdot z31.s, z6.b, z1.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "sdot z8.s, z4.b, z2.b[0]\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "sdot z9.s, z4.b, z2.b[1]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- "sdot z10.s, z4.b, z2.b[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "sdot z11.s, z4.b, z2.b[3]\n"
- "sdot z20.s, z4.b, z3.b[0]\n"
- "sdot z21.s, z4.b, z3.b[1]\n"
- "sdot z22.s, z4.b, z3.b[2]\n"
- "sdot z23.s, z4.b, z3.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "sdot z12.s, z5.b, z2.b[0]\n"
- "sdot z13.s, z5.b, z2.b[1]\n"
- "sdot z14.s, z5.b, z2.b[2]\n"
- "sdot z15.s, z5.b, z2.b[3]\n"
- "sdot z24.s, z5.b, z3.b[0]\n"
- "sdot z25.s, z5.b, z3.b[1]\n"
- "sdot z26.s, z5.b, z3.b[2]\n"
- "sdot z27.s, z5.b, z3.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z17.s, z6.b, z2.b[1]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z19.s, z6.b, z2.b[3]\n"
- "sdot z28.s, z6.b, z3.b[0]\n"
- "sdot z29.s, z6.b, z3.b[1]\n"
- "sdot z30.s, z6.b, z3.b[2]\n"
- "sdot z31.s, z6.b, z3.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "sdot z8.s, z4.b, z0.b[0]\n"
- "sdot z9.s, z4.b, z0.b[1]\n"
- "sdot z10.s, z4.b, z0.b[2]\n"
- "sdot z11.s, z4.b, z0.b[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "sdot z21.s, z4.b, z1.b[1]\n"
- "sdot z22.s, z4.b, z1.b[2]\n"
- "sdot z23.s, z4.b, z1.b[3]\n"
- "sdot z12.s, z5.b, z0.b[0]\n"
- "sdot z13.s, z5.b, z0.b[1]\n"
- "sdot z14.s, z5.b, z0.b[2]\n"
- "sdot z15.s, z5.b, z0.b[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "sdot z24.s, z5.b, z1.b[0]\n"
- "sdot z25.s, z5.b, z1.b[1]\n"
- "sdot z26.s, z5.b, z1.b[2]\n"
- "sdot z27.s, z5.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z0.b[0]\n"
- "sdot z17.s, z6.b, z0.b[1]\n"
- "sdot z18.s, z6.b, z0.b[2]\n"
- "sdot z19.s, z6.b, z0.b[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "sdot z28.s, z6.b, z1.b[0]\n"
- "sdot z29.s, z6.b, z1.b[1]\n"
- "sdot z30.s, z6.b, z1.b[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "sdot z31.s, z6.b, z1.b[3]\n"
- "b 4f\n"
- "3:\n"
- "sdot z8.s, z4.b, z0.b[0]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "sdot z9.s, z4.b, z0.b[1]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- "sdot z10.s, z4.b, z0.b[2]\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "sdot z11.s, z4.b, z0.b[3]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "sdot z21.s, z4.b, z1.b[1]\n"
- "sdot z22.s, z4.b, z1.b[2]\n"
- "sdot z23.s, z4.b, z1.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "sdot z12.s, z5.b, z0.b[0]\n"
- "sdot z13.s, z5.b, z0.b[1]\n"
- "sdot z14.s, z5.b, z0.b[2]\n"
- "sdot z15.s, z5.b, z0.b[3]\n"
- "sdot z24.s, z5.b, z1.b[0]\n"
- "sdot z25.s, z5.b, z1.b[1]\n"
- "sdot z26.s, z5.b, z1.b[2]\n"
- "sdot z27.s, z5.b, z1.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "sdot z16.s, z6.b, z0.b[0]\n"
- "sdot z17.s, z6.b, z0.b[1]\n"
- "sdot z18.s, z6.b, z0.b[2]\n"
- "sdot z19.s, z6.b, z0.b[3]\n"
- "sdot z28.s, z6.b, z1.b[0]\n"
- "sdot z29.s, z6.b, z1.b[1]\n"
- "sdot z30.s, z6.b, z1.b[2]\n"
- "sdot z31.s, z6.b, z1.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "sdot z8.s, z4.b, z2.b[0]\n"
- "sdot z9.s, z4.b, z2.b[1]\n"
- "sdot z10.s, z4.b, z2.b[2]\n"
- "sdot z11.s, z4.b, z2.b[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- "sdot z20.s, z4.b, z3.b[0]\n"
- "sdot z21.s, z4.b, z3.b[1]\n"
- "sdot z22.s, z4.b, z3.b[2]\n"
- "sdot z23.s, z4.b, z3.b[3]\n"
- "sdot z12.s, z5.b, z2.b[0]\n"
- "sdot z13.s, z5.b, z2.b[1]\n"
- "sdot z14.s, z5.b, z2.b[2]\n"
- "sdot z15.s, z5.b, z2.b[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "sdot z24.s, z5.b, z3.b[0]\n"
- "sdot z25.s, z5.b, z3.b[1]\n"
- "sdot z26.s, z5.b, z3.b[2]\n"
- "sdot z27.s, z5.b, z3.b[3]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z17.s, z6.b, z2.b[1]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z19.s, z6.b, z2.b[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "sdot z28.s, z6.b, z3.b[0]\n"
- "sdot z29.s, z6.b, z3.b[1]\n"
- "sdot z30.s, z6.b, z3.b[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "sdot z31.s, z6.b, z3.b[3]\n"
- "4:\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "st1w z25.s, p0, [%[c_ptr]]\n"
- "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "sdot z8.s, z4.b, z0.b[0]\n"
+ "sdot z11.s, z4.b, z0.b[1]\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #32]\n"
+ "sdot z14.s, z4.b, z0.b[2]\n"
+ "sdot z17.s, z4.b, z0.b[3]\n"
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel], #48]\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "sdot z23.s, z4.b, z1.b[1]\n"
+ "sub x20, x20, #0x2\n"
+ "sdot z26.s, z4.b, z1.b[2]\n"
+ "sdot z29.s, z4.b, z1.b[3]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
+ "sdot z9.s, z5.b, z0.b[0]\n"
+ "sdot z12.s, z5.b, z0.b[1]\n"
+ "cmp x20, #0x2\n"
+ "sdot z15.s, z5.b, z0.b[2]\n"
+ "sdot z18.s, z5.b, z0.b[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "sdot z21.s, z5.b, z1.b[0]\n"
+ "sdot z24.s, z5.b, z1.b[1]\n"
+ "sdot z27.s, z5.b, z1.b[2]\n"
+ "sdot z30.s, z5.b, z1.b[3]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z13.s, z6.b, z0.b[1]\n"
+ "sdot z16.s, z6.b, z0.b[2]\n"
+ "sdot z19.s, z6.b, z0.b[3]\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "sdot z22.s, z6.b, z1.b[0]\n"
+ "sdot z25.s, z6.b, z1.b[1]\n"
+ "sdot z28.s, z6.b, z1.b[2]\n"
+ "sdot z31.s, z6.b, z1.b[3]\n"
+ "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #6\n"
+ "sdot z8.s, z4.b, z3.b[0]\n"
+ "sdot z11.s, z4.b, z3.b[1]\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "sdot z14.s, z4.b, z3.b[2]\n"
+ "sdot z17.s, z4.b, z3.b[3]\n"
+ "sdot z20.s, z4.b, z7.b[0]\n"
+ "sdot z23.s, z4.b, z7.b[1]\n"
+ "sdot z26.s, z4.b, z7.b[2]\n"
+ "sdot z29.s, z4.b, z7.b[3]\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "sdot z9.s, z5.b, z3.b[0]\n"
+ "sdot z12.s, z5.b, z3.b[1]\n"
+ "sdot z15.s, z5.b, z3.b[2]\n"
+ "sdot z18.s, z5.b, z3.b[3]\n"
+ "sdot z21.s, z5.b, z7.b[0]\n"
+ "sdot z24.s, z5.b, z7.b[1]\n"
+ "sdot z27.s, z5.b, z7.b[2]\n"
+ "sdot z30.s, z5.b, z7.b[3]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "sdot z10.s, z2.b, z3.b[0]\n"
+ "sdot z13.s, z2.b, z3.b[1]\n"
+ "sdot z16.s, z2.b, z3.b[2]\n"
+ "sdot z19.s, z2.b, z3.b[3]\n"
+ "sdot z22.s, z2.b, z7.b[0]\n"
+ "sdot z25.s, z2.b, z7.b[1]\n"
+ "sdot z28.s, z2.b, z7.b[2]\n"
+ "sdot z31.s, z2.b, z7.b[3]\n"
+ "ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "sdot z8.s, z4.b, z0.b[0]\n"
+ "sdot z11.s, z4.b, z0.b[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "sdot z14.s, z4.b, z0.b[2]\n"
+ "sdot z17.s, z4.b, z0.b[3]\n"
+ "addvl x22, x22, #3\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "sdot z23.s, z4.b, z1.b[1]\n"
+ "sdot z26.s, z4.b, z1.b[2]\n"
+ "sdot z29.s, z4.b, z1.b[3]\n"
+ "sdot z9.s, z5.b, z0.b[0]\n"
+ "sdot z12.s, z5.b, z0.b[1]\n"
+ "sdot z15.s, z5.b, z0.b[2]\n"
+ "sdot z18.s, z5.b, z0.b[3]\n"
+ "sdot z21.s, z5.b, z1.b[0]\n"
+ "sdot z24.s, z5.b, z1.b[1]\n"
+ "sdot z27.s, z5.b, z1.b[2]\n"
+ "sdot z30.s, z5.b, z1.b[3]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z13.s, z6.b, z0.b[1]\n"
+ "sdot z16.s, z6.b, z0.b[2]\n"
+ "sdot z19.s, z6.b, z0.b[3]\n"
+ "sdot z22.s, z6.b, z1.b[0]\n"
+ "sdot z25.s, z6.b, z1.b[1]\n"
+ "sdot z28.s, z6.b, z1.b[2]\n"
+ "sdot z31.s, z6.b, z1.b[3]\n"
+ "cbz x20, 5f\n"
+ "ld1rqb { z4.b }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #16]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ld1b { z2.b }, p0/Z, [x22]\n"
+ "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "sdot z8.s, z2.b, z4.b[0]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "sdot z11.s, z2.b, z4.b[1]\n"
+ "sdot z14.s, z2.b, z4.b[2]\n"
+ "sdot z17.s, z2.b, z4.b[3]\n"
+ "sdot z20.s, z2.b, z3.b[0]\n"
+ "addvl x22, x22, #3\n"
+ "sdot z23.s, z2.b, z3.b[1]\n"
+ "sdot z26.s, z2.b, z3.b[2]\n"
+ "sdot z29.s, z2.b, z3.b[3]\n"
+ "sdot z9.s, z1.b, z4.b[0]\n"
+ "sdot z12.s, z1.b, z4.b[1]\n"
+ "sdot z15.s, z1.b, z4.b[2]\n"
+ "sdot z18.s, z1.b, z4.b[3]\n"
+ "sdot z21.s, z1.b, z3.b[0]\n"
+ "sdot z24.s, z1.b, z3.b[1]\n"
+ "sdot z27.s, z1.b, z3.b[2]\n"
+ "sdot z30.s, z1.b, z3.b[3]\n"
+ "sdot z10.s, z0.b, z4.b[0]\n"
+ "sdot z13.s, z0.b, z4.b[1]\n"
+ "sdot z16.s, z0.b, z4.b[2]\n"
+ "sdot z19.s, z0.b, z4.b[3]\n"
+ "sdot z22.s, z0.b, z3.b[0]\n"
+ "sdot z25.s, z0.b, z3.b[1]\n"
+ "sdot z28.s, z0.b, z3.b[2]\n"
+ "sdot z31.s, z0.b, z3.b[3]\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x23, x23, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
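Structurally, the old generic.cpp iterated ablocks and bblocks in C++ and re-entered a per-block asm body driven by precomputed loops/tails counters, while the new version moves the entire loop nest into one asm block whose numeric labels match the comments: 1: height loop, 2: width loop, 3: double-unrolled main body, 4: tail, 5: store of z8..z31. A rough C++ rendering of that control flow, offered as orientation only (the exact iteration counts follow the asm's cmp/sub/cbz sequence):

    #include <cstddef>

    void kernel_shape(std::size_t ablocks, std::size_t bblocks, std::size_t K2)
    {
        for (std::size_t a = 0; a < ablocks; ++a) {        // "1:" height loop
            for (std::size_t b = 0; b < bblocks; ++b) {    // "2:" width loop
                std::size_t k = K2;                        // (K/4) - 1 on entry
                // zero the 24 accumulators z8..z31 here
                while (k >= 2) { /* "3:" double-unrolled sdot body */ k -= 2; }
                /* "4:" one tail iteration, plus one more guarded by cbz */
                /* "5:" store z8..z31 to Cpanel and advance */
            }
        }
    }

    int main() { kernel_shape(1, 1, 7); return 0; }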
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
index 02b3451c54..82734abfbe 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,52 +22,85 @@
* SOFTWARE.
*/
#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
-#ifdef __ARM_FEATURE_SVE
-
-#include <cstdint>
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const int8_t *, const int8_t *, \
+ int32_t *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void sve_interleaved_s8s32_mmla_8x3VL( ARGLIST );
-class cls_sve_interleaved_s8s32_mmla_8x3VL {
+class cls_sve_interleaved_s8s32_mmla_8x3VL
+{
public:
typedef int8_t operand_type;
typedef int32_t result_type;
- typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
- static unsigned int out_width()
+ static constexpr unsigned int out_height()
{
- return get_vector_length<int32_t>() * 3;
+ return 8;
}
- static unsigned int out_height()
+ static unsigned int out_width()
{
- return 8;
+ return get_vector_length<int32_t>() * 3;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 8;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2> transforms = {};
StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=sve_interleaved_s8s32_mmla_8x3VL;
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 61.98, 3.90, 7.94 };
+ case CPUModel::V1:
+ return { 123.42, 5.00, 11.52 };
+ case CPUModel::A510:
+ return { 43.14, 3.62, 2.90 };
+ }
+ }
+
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 61.97, 3.64, 0.50 };
+ case CPUModel::V1:
+ return { 95.28, 7.99, 0.79 };
+ case CPUModel::A510:
+ return { 43.36, 1.86, 0.28 };
+ }
+ }
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_s8s32_mmla_8x3VL;
cls_sve_interleaved_s8s32_mmla_8x3VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
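The headers also adopt an ARGLIST macro so the kernel signature is written once and reused by both the free-function declarations and the kern_type typedef, then #undef'd before the header ends so the macro cannot leak into including code. A self-contained sketch of the convention (my_kernel is a placeholder name):

    #include <cstdint>

    #define ARGLIST \
        const int8_t *, const int8_t *, \
        int32_t *, int, int, int

    void my_kernel( ARGLIST );                 // declaration reuses the macro
    typedef void (*kern_type)( ARGLIST );      // so does the pointer typedef

    #undef ARGLIST                             // does not leak to includers

    void my_kernel(const int8_t *, const int8_t *, int32_t *, int, int, int) {}

    int main()
    {
        kern_type k = my_kernel;
        k(nullptr, nullptr, nullptr, 0, 0, 0); // trivially callable
        return 0;
    }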
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
index 578aa01732..bfed5000fc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,377 +21,277 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include <cstddef>
#include <cstdint>
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
- const int8_t *a_ptr = Apanel;
- int32_t *c_ptr = Cpanel;
+void sve_interleaved_s8s32_mmla_8x3VL(
+ const int8_t *Apanel,
+ const int8_t *Bpanel,
+ int32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
- K /= 8;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const int8_t *a_ptr0 = a_ptr;
- const int8_t *b_ptr = Bpanel;
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.b\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "mov z17.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z18.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #4\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
- ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
- ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
- ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
- ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n"
- ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n"
- ".inst 0x4504980c // smmla z12.s, z0.b, z4.b\n"
- ".inst 0x45049832 // smmla z18.s, z1.b, z4.b\n"
- ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
- ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #4, MUL VL]\n"
- ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #5, MUL VL]\n"
- ".inst 0x45069808 // smmla z8.s, z0.b, z6.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- "addvl %[b_ptr], %[b_ptr], #12\n"
- ".inst 0x4506987a // smmla z26.s, z3.b, z6.b\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4507982f // smmla z15.s, z1.b, z7.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x45079855 // smmla z21.s, z2.b, z7.b\n"
- ".inst 0x4507987b // smmla z27.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x4504980a // smmla z10.s, z0.b, z4.b\n"
- ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
- ".inst 0x45049856 // smmla z22.s, z2.b, z4.b\n"
- ".inst 0x4504987c // smmla z28.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
- ".inst 0x45059831 // smmla z17.s, z1.b, z5.b\n"
- ".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
- ".inst 0x4505987d // smmla z29.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069832 // smmla z18.s, z1.b, z6.b\n"
- ".inst 0x45069858 // smmla z24.s, z2.b, z6.b\n"
- ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
- ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
- ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
- ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
- ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
- ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n"
- ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n"
- ".inst 0x4504980c // smmla z12.s, z0.b, z4.b\n"
- ".inst 0x45049832 // smmla z18.s, z1.b, z4.b\n"
- ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
- ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #4, MUL VL]\n"
- ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #5, MUL VL]\n"
- ".inst 0x45069808 // smmla z8.s, z0.b, z6.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- "addvl %[b_ptr], %[b_ptr], #14\n"
- ".inst 0x4506987a // smmla z26.s, z3.b, z6.b\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4507982f // smmla z15.s, z1.b, z7.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
- ".inst 0x45079855 // smmla z21.s, z2.b, z7.b\n"
- ".inst 0x4507987b // smmla z27.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-7, MUL VL]\n"
- ".inst 0x4504980a // smmla z10.s, z0.b, z4.b\n"
- ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
- ".inst 0x45049856 // smmla z22.s, z2.b, z4.b\n"
- ".inst 0x4504987c // smmla z28.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
- ".inst 0x45059831 // smmla z17.s, z1.b, z5.b\n"
- ".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
- ".inst 0x4505987d // smmla z29.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069832 // smmla z18.s, z1.b, z6.b\n"
- ".inst 0x45069858 // smmla z24.s, z2.b, z6.b\n"
- ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
- ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
- ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
- ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
- ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
- ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n"
- "uzp1 z6.d, z14.d, z15.d\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
- ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n"
- ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n"
- ".inst 0x4504980c // smmla z12.s, z0.b, z4.b\n"
- "uzp1 z7.d, z16.d, z17.d\n"
- ".inst 0x45049832 // smmla z18.s, z1.b, z4.b\n"
- ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
- ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
- "uzp2 z4.d, z10.d, z11.d\n"
- ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n"
- "uzp1 z0.d, z8.d, z9.d\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- "uzp1 z1.d, z10.d, z11.d\n"
- ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- "uzp1 z2.d, z12.d, z13.d\n"
- "uzp1 z0.d, z18.d, z19.d\n"
- ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "uzp2 z3.d, z8.d, z9.d\n"
- "uzp2 z5.d, z12.d, z13.d\n"
- "uzp2 z1.d, z14.d, z15.d\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
- "addvl %[b_ptr], %[b_ptr], #8\n"
- ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
- ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
- ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
- ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-7, MUL VL]\n"
- ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
- ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n"
- ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x4504980c // smmla z12.s, z0.b, z4.b\n"
- ".inst 0x45049832 // smmla z18.s, z1.b, z4.b\n"
- ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
- ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x45069808 // smmla z8.s, z0.b, z6.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4506987a // smmla z26.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x4507982f // smmla z15.s, z1.b, z7.b\n"
- ".inst 0x45079855 // smmla z21.s, z2.b, z7.b\n"
- ".inst 0x4507987b // smmla z27.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x4504980a // smmla z10.s, z0.b, z4.b\n"
- ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
- ".inst 0x45049856 // smmla z22.s, z2.b, z4.b\n"
- ".inst 0x4504987c // smmla z28.s, z3.b, z4.b\n"
- ".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
- ".inst 0x45059831 // smmla z17.s, z1.b, z5.b\n"
- ".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
- ".inst 0x4505987d // smmla z29.s, z3.b, z5.b\n"
- "uzp2 z4.d, z10.d, z11.d\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069832 // smmla z18.s, z1.b, z6.b\n"
- ".inst 0x45069858 // smmla z24.s, z2.b, z6.b\n"
- ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
- "uzp1 z6.d, z14.d, z15.d\n"
- ".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n"
- "uzp1 z0.d, z8.d, z9.d\n"
- ".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n"
- "uzp1 z1.d, z10.d, z11.d\n"
- "uzp2 z5.d, z12.d, z13.d\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
- "uzp1 z2.d, z12.d, z13.d\n"
- "uzp1 z0.d, z18.d, z19.d\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "uzp2 z1.d, z14.d, z15.d\n"
- ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n"
- "uzp2 z3.d, z8.d, z9.d\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "uzp1 z7.d, z16.d, z17.d\n"
- "4:\n"
- "uzp2 z2.d, z16.d, z17.d\n"
- "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "uzp2 z3.d, z18.d, z19.d\n"
- "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "uzp1 z4.d, z20.d, z21.d\n"
- "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "uzp1 z5.d, z22.d, z23.d\n"
- "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "uzp1 z6.d, z24.d, z25.d\n"
- "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "uzp2 z7.d, z20.d, z21.d\n"
- "st1w z0.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "uzp2 z0.d, z22.d, z23.d\n"
- "st1w z1.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "uzp2 z1.d, z24.d, z25.d\n"
- "st1w z2.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "uzp1 z2.d, z26.d, z27.d\n"
- "st1w z3.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "uzp1 z3.d, z28.d, z29.d\n"
- "st1w z4.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "uzp1 z4.d, z30.d, z31.d\n"
- "st1w z5.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "uzp2 z5.d, z26.d, z27.d\n"
- "st1w z6.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "uzp2 z6.d, z28.d, z29.d\n"
- "st1w z7.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "uzp2 z7.d, z30.d, z31.d\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "addvl x22, x22, #2\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1rqb { z6.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
+ ".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
+ ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
+ ".inst 0x45059831 // smmla z17.s, z1.b, z5.b\n"
+ "ld1b { z7.b }, p0/Z, [x22]\n"
+ ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
+ ".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x450498da // smmla z26.s, z6.b, z4.b\n"
+ ".inst 0x450598dd // smmla z29.s, z6.b, z5.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x4503980c // smmla z12.s, z0.b, z3.b\n"
+ ".inst 0x4507982f // smmla z15.s, z1.b, z7.b\n"
+ ".inst 0x45039832 // smmla z18.s, z1.b, z3.b\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x45079855 // smmla z21.s, z2.b, z7.b\n"
+ ".inst 0x45039858 // smmla z24.s, z2.b, z3.b\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x450798db // smmla z27.s, z6.b, z7.b\n"
+ ".inst 0x450398de // smmla z30.s, z6.b, z3.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #4, MUL VL]\n"
+ ".inst 0x4505980a // smmla z10.s, z0.b, z5.b\n"
+ ".inst 0x4504980d // smmla z13.s, z0.b, z4.b\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #16]\n"
+ ".inst 0x45059830 // smmla z16.s, z1.b, z5.b\n"
+ ".inst 0x45049833 // smmla z19.s, z1.b, z4.b\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #32]\n"
+ ".inst 0x45059856 // smmla z22.s, z2.b, z5.b\n"
+ ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n"
+ "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x450598dc // smmla z28.s, z6.b, z5.b\n"
+ ".inst 0x450498df // smmla z31.s, z6.b, z4.b\n"
+ "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #64]\n"
+ "ld1b { z2.b }, p0/Z, [x22, #6, MUL VL]\n"
+ ".inst 0x45039808 // smmla z8.s, z0.b, z3.b\n"
+ "ld1b { z4.b }, p0/Z, [x22, #7, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x4503982e // smmla z14.s, z1.b, z3.b\n"
+ ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
+ ".inst 0x450398b4 // smmla z20.s, z5.b, z3.b\n"
+ ".inst 0x450798b7 // smmla z23.s, z5.b, z7.b\n"
+ ".inst 0x450398da // smmla z26.s, z6.b, z3.b\n"
+ ".inst 0x450798dd // smmla z29.s, z6.b, z7.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #-8, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [x22, #-7, MUL VL]\n"
+ ".inst 0x45029809 // smmla z9.s, z0.b, z2.b\n"
+ ".inst 0x4504980c // smmla z12.s, z0.b, z4.b\n"
+ ".inst 0x4502982f // smmla z15.s, z1.b, z2.b\n"
+ ".inst 0x45049832 // smmla z18.s, z1.b, z4.b\n"
+ ".inst 0x450298b5 // smmla z21.s, z5.b, z2.b\n"
+ ".inst 0x450498b8 // smmla z24.s, z5.b, z4.b\n"
+ ".inst 0x450298db // smmla z27.s, z6.b, z2.b\n"
+ ".inst 0x450498de // smmla z30.s, z6.b, z4.b\n"
+ "ld1b { z4.b }, p0/Z, [x22, #-6, MUL VL]\n"
+ ".inst 0x4503980a // smmla z10.s, z0.b, z3.b\n"
+ ".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #80]\n"
+ ".inst 0x45039830 // smmla z16.s, z1.b, z3.b\n"
+ ".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #96]\n"
+ ".inst 0x450398b6 // smmla z22.s, z5.b, z3.b\n"
+ ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #-5, MUL VL]\n"
+ ".inst 0x450398dc // smmla z28.s, z6.b, z3.b\n"
+ ".inst 0x450798df // smmla z31.s, z6.b, z7.b\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "addvl x22, x22, #-4\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
+ ".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
+ ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
+ ".inst 0x45059831 // smmla z17.s, z1.b, z5.b\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
+ ".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x450498fa // smmla z26.s, z7.b, z4.b\n"
+ ".inst 0x450598fd // smmla z29.s, z7.b, z5.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x45069809 // smmla z9.s, z0.b, z6.b\n"
+ ".inst 0x4503980c // smmla z12.s, z0.b, z3.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45039832 // smmla z18.s, z1.b, z3.b\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x45039858 // smmla z24.s, z2.b, z3.b\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0x450698fb // smmla z27.s, z7.b, z6.b\n"
+ ".inst 0x450398fe // smmla z30.s, z7.b, z3.b\n"
+ ".inst 0x4505980a // smmla z10.s, z0.b, z5.b\n"
+ ".inst 0x4504980d // smmla z13.s, z0.b, z4.b\n"
+ ".inst 0x45059830 // smmla z16.s, z1.b, z5.b\n"
+ ".inst 0x45049833 // smmla z19.s, z1.b, z4.b\n"
+ ".inst 0x45059856 // smmla z22.s, z2.b, z5.b\n"
+ ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n"
+ ".inst 0x450598fc // smmla z28.s, z7.b, z5.b\n"
+ ".inst 0x450498ff // smmla z31.s, z7.b, z4.b\n"
+ "cbz x20, 5f\n"
+ "ld1b { z1.b }, p0/Z, [x22]\n"
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x450198e8 // smmla z8.s, z7.b, z1.b\n"
+ "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x450098eb // smmla z11.s, z7.b, z0.b\n"
+ "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqb { z4.b }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x450198ce // smmla z14.s, z6.b, z1.b\n"
+ ".inst 0x450098d1 // smmla z17.s, z6.b, z0.b\n"
+ ".inst 0x450198b4 // smmla z20.s, z5.b, z1.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x450098b7 // smmla z23.s, z5.b, z0.b\n"
+ ".inst 0x4501989a // smmla z26.s, z4.b, z1.b\n"
+ "ld1b { z2.b }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n"
+ "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x450398e9 // smmla z9.s, z7.b, z3.b\n"
+ ".inst 0x450298ec // smmla z12.s, z7.b, z2.b\n"
+ "addvl x22, x22, #6\n"
+ ".inst 0x450398cf // smmla z15.s, z6.b, z3.b\n"
+ ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x450398b5 // smmla z21.s, z5.b, z3.b\n"
+ ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n"
+ ".inst 0x4503989b // smmla z27.s, z4.b, z3.b\n"
+ ".inst 0x4502989e // smmla z30.s, z4.b, z2.b\n"
+ ".inst 0x450198ea // smmla z10.s, z7.b, z1.b\n"
+ ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n"
+ ".inst 0x450198d0 // smmla z16.s, z6.b, z1.b\n"
+ ".inst 0x450098d3 // smmla z19.s, z6.b, z0.b\n"
+ ".inst 0x450198b6 // smmla z22.s, z5.b, z1.b\n"
+ ".inst 0x450098b9 // smmla z25.s, z5.b, z0.b\n"
+ ".inst 0x4501989c // smmla z28.s, z4.b, z1.b\n"
+ ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n"
+ "5:" // multiply loop done
+ "uzp1 z0.d, z8.d, z11.d\n"
+ "uzp2 z8.d, z8.d, z11.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z0.d, z9.d, z12.d\n"
+ "uzp2 z9.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "uzp1 z0.d, z10.d, z13.d\n"
+ "uzp2 z10.d, z10.d, z13.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "uzp1 z0.d, z14.d, z17.d\n"
+ "uzp2 z14.d, z14.d, z17.d\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "uzp1 z1.d, z15.d, z18.d\n"
+ "subs x23, x23, #0x1\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "uzp2 z15.d, z15.d, z18.d\n"
+ "uzp1 z17.d, z16.d, z19.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "uzp2 z16.d, z16.d, z19.d\n"
+ "uzp1 z0.d, z20.d, z23.d\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "uzp2 z20.d, z20.d, z23.d\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "uzp1 z23.d, z21.d, z24.d\n"
+ "uzp2 z21.d, z21.d, z24.d\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "uzp1 z19.d, z22.d, z25.d\n"
+ "uzp2 z22.d, z22.d, z25.d\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "uzp1 z18.d, z26.d, z29.d\n"
+ "uzp2 z26.d, z26.d, z29.d\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "uzp1 z17.d, z27.d, z30.d\n"
+ "uzp2 z27.d, z27.d, z30.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "uzp1 z16.d, z28.d, z31.d\n"
+ "uzp2 z28.d, z28.d, z31.d\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
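
The rewritten kernel above swaps the old multi-operand register list for a small KernelArgs struct: the loop bounds and the B panel pointer travel behind a single pointer, and the assembly fetches each field with ldr plus a compile-time offsetof() immediate. A minimal stand-alone sketch of that pattern (AArch64-only; the struct layout mirrors the kernels in this diff, everything else is illustrative):

#include <cstddef>
#include <cstdint>
#include <cstdio>

struct KernelArgs {
    size_t K = {};
    const uint8_t *Bpanel = {};
    size_t bblocks = {};
};

int main() {
    KernelArgs ka;
    ka.K = 42;
    ka.bblocks = 7;

    uint64_t k_out = 0, bb_out = 0;
    __asm__ __volatile__(
        // Same idiom as the kernels above: one pointer in, fields read
        // through "I"-constrained offsetof() immediates.
        "ldr %x[k], [%x[args_ptr], %[offsetof_K]]\n"
        "ldr %x[bb], [%x[args_ptr], %[offsetof_bblocks]]\n"
        : [k] "=&r" (k_out), [bb] "=&r" (bb_out)
        : [args_ptr] "r" (&ka),
          [offsetof_K] "I" (offsetof(KernelArgs, K)),
          [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
        : "memory");
    printf("K=%llu bblocks=%llu\n",
           (unsigned long long)k_out, (unsigned long long)bb_out);
    return 0;
}

Keeping the bounds behind one pointer shrinks the input operand list and frees general-purpose registers for the kernel body.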
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
index 832a224199..c0b215ccb4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,52 +22,97 @@
* SOFTWARE.
*/
#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
-#ifdef __ARM_FEATURE_SVE
-
-#include <cstdint>
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const uint8_t *, const uint8_t *, \
+ uint32_t *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void sve_interleaved_u8u32_dot_8x3VL( ARGLIST );
+void sve_interleaved_u8u32_dot_8x3VL_a64fx( ARGLIST );
-class cls_sve_interleaved_u8u32_dot_8x3VL {
+class cls_sve_interleaved_u8u32_dot_8x3VL
+{
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
- typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
- static unsigned int out_width()
+ static constexpr unsigned int out_height()
{
- return get_vector_length<uint32_t>() * 3;
+ return 8;
}
- static unsigned int out_height()
+ static unsigned int out_width()
{
- return 8;
+ return get_vector_length<uint32_t>() * 3;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 4;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=sve_interleaved_u8u32_dot_8x3VL;
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.66, 4.11, 7.94 };
+ case CPUModel::A510:
+ return { 27.44, 3.41, 2.90 };
+ case CPUModel::V1:
+ return { 63.30, 4.97, 11.52 };
+ case CPUModel::A64FX:
+ return { 109.76, 3.88, 6.76 };
+ }
+ }
- cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *)
- {
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.67, 4.04, 0.50 };
+ case CPUModel::A510:
+ return { 27.45, 1.65, 0.28 };
+ case CPUModel::V1:
+ return { 52.24, 7.49, 0.80 };
+ case CPUModel::A64FX:
+ return { 110.18, 2.34, 0.40 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_u8u32_dot_8x3VL;
+ cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_interleaved_u8u32_dot_8x3VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
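
The header above now carries two implementations and resolves between them at construction time from the CPU model, defaulting to the generic kernel. A stripped-down sketch of that dispatch idiom (CPUModel, the kernel signature and the bodies here are stand-ins, not the library's definitions):

#include <cstdio>

enum class CPUModel { GENERIC, A510, V1, A64FX };

using kern_type = void (*)(int);

void kernel_generic(int n) { printf("generic kernel, n=%d\n", n); }
void kernel_a64fx(int n)   { printf("A64FX kernel, n=%d\n", n); }

struct cls_example_kernel {
    // Default to the generic kernel; override for known models.
    kern_type kernel = kernel_generic;

    explicit cls_example_kernel(CPUModel model) {
        switch (model) {
        default:
            break;
        case CPUModel::A64FX:
            kernel = kernel_a64fx;
            break;
        }
    }
};

int main() {
    cls_example_kernel k(CPUModel::A64FX);
    k.kernel(8); // dispatches to the A64FX variant
    return 0;
}

The per-model tables in get_performance_parameters() feed the same choice one level up, presumably letting the GEMM method selection weigh this kernel against alternatives per CPU.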
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
new file mode 100644
index 0000000000..79e794a834
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_interleaved_u8u32_dot_8x3VL_a64fx(
+ const uint8_t *Apanel,
+ const uint8_t *Bpanel,
+ uint32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
+
+ struct KernelArgs {
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
+
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
+
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "ld1b { z0.b }, p0/Z, [x22]\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "udot z8.s, z0.b, z3.b\n"
+ "udot z9.s, z1.b, z3.b\n"
+ "sub x20, x20, #0x2\n"
+ "udot z10.s, z2.b, z3.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
+ "udot z11.s, z0.b, z4.b\n"
+ "udot z12.s, z1.b, z4.b\n"
+ "udot z13.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "udot z14.s, z0.b, z5.b\n"
+ "udot z15.s, z1.b, z5.b\n"
+ "cmp x20, #0x2\n"
+ "udot z16.s, z2.b, z5.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n"
+ "udot z17.s, z0.b, z6.b\n"
+ "udot z18.s, z1.b, z6.b\n"
+ "udot z19.s, z2.b, z6.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n"
+ "udot z20.s, z0.b, z7.b\n"
+ "udot z21.s, z1.b, z7.b\n"
+ "udot z22.s, z2.b, z7.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n"
+ "udot z23.s, z0.b, z4.b\n"
+ "udot z24.s, z1.b, z4.b\n"
+ "udot z25.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
+ "udot z26.s, z0.b, z3.b\n"
+ "udot z27.s, z1.b, z3.b\n"
+ "udot z28.s, z2.b, z3.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n"
+ "udot z29.s, z0.b, z5.b\n"
+ "ld1b { z6.b }, p0/Z, [x22, #3, MUL VL]\n"
+ "udot z30.s, z1.b, z5.b\n"
+ "udot z31.s, z2.b, z5.b\n"
+ "ld1b { z2.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z7.b\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
+ "udot z9.s, z2.b, z7.b\n"
+ "udot z10.s, z5.b, z7.b\n"
+ "udot z11.s, z6.b, z4.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+ "udot z12.s, z2.b, z4.b\n"
+ "udot z13.s, z5.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
+ "udot z14.s, z6.b, z3.b\n"
+ "udot z15.s, z2.b, z3.b\n"
+ "addvl x22, x22, #6\n"
+ "udot z16.s, z5.b, z3.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
+ "udot z17.s, z6.b, z1.b\n"
+ "udot z18.s, z2.b, z1.b\n"
+ "udot z19.s, z5.b, z1.b\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "udot z20.s, z6.b, z7.b\n"
+ "udot z21.s, z2.b, z7.b\n"
+ "udot z22.s, z5.b, z7.b\n"
+ "udot z23.s, z6.b, z4.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "udot z24.s, z2.b, z4.b\n"
+ "udot z25.s, z5.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "udot z26.s, z6.b, z0.b\n"
+ "udot z27.s, z2.b, z0.b\n"
+ "udot z28.s, z5.b, z0.b\n"
+ "udot z29.s, z6.b, z1.b\n"
+ "ld1b { z0.b }, p0/Z, [x22]\n"
+ "udot z30.s, z2.b, z1.b\n"
+ "udot z31.s, z5.b, z1.b\n"
+ "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "udot z8.s, z0.b, z3.b\n"
+ "udot z9.s, z1.b, z3.b\n"
+ "addvl x22, x22, #3\n"
+ "udot z10.s, z2.b, z3.b\n"
+ "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
+ "udot z11.s, z0.b, z4.b\n"
+ "udot z12.s, z1.b, z4.b\n"
+ "udot z13.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "udot z14.s, z0.b, z5.b\n"
+ "udot z15.s, z1.b, z5.b\n"
+ "udot z16.s, z2.b, z5.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "udot z17.s, z0.b, z6.b\n"
+ "udot z18.s, z1.b, z6.b\n"
+ "udot z19.s, z2.b, z6.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n"
+ "udot z20.s, z0.b, z7.b\n"
+ "udot z21.s, z1.b, z7.b\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "udot z22.s, z2.b, z7.b\n"
+ "udot z23.s, z0.b, z4.b\n"
+ "udot z24.s, z1.b, z4.b\n"
+ "udot z25.s, z2.b, z4.b\n"
+ "udot z26.s, z0.b, z5.b\n"
+ "udot z27.s, z1.b, z5.b\n"
+ "udot z28.s, z2.b, z5.b\n"
+ "udot z29.s, z0.b, z3.b\n"
+ "udot z30.s, z1.b, z3.b\n"
+ "udot z31.s, z2.b, z3.b\n"
+ "cbz x20, 5f\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "udot z8.s, z6.b, z3.b\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
+ "udot z9.s, z5.b, z3.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
+ "udot z10.s, z4.b, z3.b\n"
+ "udot z11.s, z6.b, z2.b\n"
+ "udot z12.s, z5.b, z2.b\n"
+ "udot z13.s, z4.b, z2.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "udot z15.s, z5.b, z1.b\n"
+ "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+ "udot z16.s, z4.b, z1.b\n"
+ "udot z17.s, z6.b, z0.b\n"
+ "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
+ "udot z18.s, z5.b, z0.b\n"
+ "udot z19.s, z4.b, z0.b\n"
+ "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "udot z21.s, z5.b, z3.b\n"
+ "addvl x22, x22, #3\n"
+ "udot z22.s, z4.b, z3.b\n"
+ "udot z23.s, z6.b, z2.b\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "udot z24.s, z5.b, z2.b\n"
+ "udot z25.s, z4.b, z2.b\n"
+ "udot z26.s, z6.b, z1.b\n"
+ "udot z27.s, z5.b, z1.b\n"
+ "udot z28.s, z4.b, z1.b\n"
+ "udot z29.s, z6.b, z0.b\n"
+ "udot z30.s, z5.b, z0.b\n"
+ "udot z31.s, z4.b, z0.b\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x23, x23, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
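
The A64FX variant above is built entirely from the non-indexed udot form, broadcasting four A-panel bytes to every lane with ld1rw. Each "udot zD.s, zB.b, zA.b" then adds a four-way u8 dot product into every 32-bit accumulator lane. A scalar reference for one lane (a sketch, not library code):

#include <cstdint>
#include <cstdio>

// One 32-bit lane of "udot zD.s, zB.b, zA.b". In the a64fx kernel the
// same four A bytes are replicated to all lanes via ld1rw, so a_quad is
// lane-invariant while b_quad differs per lane.
uint32_t udot_lane(uint32_t acc,
                   const uint8_t b_quad[4], const uint8_t a_quad[4]) {
    for (int i = 0; i < 4; i++) {
        acc += (uint32_t)b_quad[i] * (uint32_t)a_quad[i];
    }
    return acc;
}

int main() {
    const uint8_t b[4] = {1, 2, 3, 4};
    const uint8_t a[4] = {10, 20, 30, 40};
    printf("%u\n", udot_lane(0, b, a)); // 1*10 + 2*20 + 3*30 + 4*40 = 300
    return 0;
}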
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
index 891869c767..1c88336c2d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,309 +21,230 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include <cstddef>
#include <cstdint>
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- const uint8_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
+void sve_interleaved_u8u32_dot_8x3VL(
+ const uint8_t *Apanel,
+ const uint8_t *Bpanel,
+ uint32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
- K /= 4;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const uint8_t *a_ptr0 = a_ptr;
- const uint8_t *b_ptr = Bpanel;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.b\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z17.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "mov z18.s, #0\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "udot z8.s, z4.b, z0.b[0]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "udot z9.s, z4.b, z0.b[1]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- "udot z10.s, z4.b, z0.b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z11.s, z4.b, z0.b[3]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "udot z21.s, z4.b, z1.b[1]\n"
- "udot z22.s, z4.b, z1.b[2]\n"
- "udot z23.s, z4.b, z1.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "udot z12.s, z5.b, z0.b[0]\n"
- "udot z13.s, z5.b, z0.b[1]\n"
- "udot z14.s, z5.b, z0.b[2]\n"
- "udot z15.s, z5.b, z0.b[3]\n"
- "udot z24.s, z5.b, z1.b[0]\n"
- "udot z25.s, z5.b, z1.b[1]\n"
- "udot z26.s, z5.b, z1.b[2]\n"
- "udot z27.s, z5.b, z1.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "udot z16.s, z6.b, z0.b[0]\n"
- "udot z17.s, z6.b, z0.b[1]\n"
- "udot z18.s, z6.b, z0.b[2]\n"
- "udot z19.s, z6.b, z0.b[3]\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "udot z28.s, z6.b, z1.b[0]\n"
- "udot z29.s, z6.b, z1.b[1]\n"
- "udot z30.s, z6.b, z1.b[2]\n"
- "udot z31.s, z6.b, z1.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "udot z8.s, z4.b, z2.b[0]\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "udot z9.s, z4.b, z2.b[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "udot z10.s, z4.b, z2.b[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "udot z11.s, z4.b, z2.b[3]\n"
- "udot z20.s, z4.b, z3.b[0]\n"
- "udot z21.s, z4.b, z3.b[1]\n"
- "udot z22.s, z4.b, z3.b[2]\n"
- "udot z23.s, z4.b, z3.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "udot z12.s, z5.b, z2.b[0]\n"
- "udot z13.s, z5.b, z2.b[1]\n"
- "udot z14.s, z5.b, z2.b[2]\n"
- "udot z15.s, z5.b, z2.b[3]\n"
- "udot z24.s, z5.b, z3.b[0]\n"
- "udot z25.s, z5.b, z3.b[1]\n"
- "udot z26.s, z5.b, z3.b[2]\n"
- "udot z27.s, z5.b, z3.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z17.s, z6.b, z2.b[1]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z19.s, z6.b, z2.b[3]\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- "udot z28.s, z6.b, z3.b[0]\n"
- "udot z29.s, z6.b, z3.b[1]\n"
- "udot z30.s, z6.b, z3.b[2]\n"
- "udot z31.s, z6.b, z3.b[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- "udot z8.s, z4.b, z0.b[0]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "udot z9.s, z4.b, z0.b[1]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- "udot z10.s, z4.b, z0.b[2]\n"
- "udot z11.s, z4.b, z0.b[3]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "udot z21.s, z4.b, z1.b[1]\n"
- "udot z22.s, z4.b, z1.b[2]\n"
- "udot z23.s, z4.b, z1.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "udot z12.s, z5.b, z0.b[0]\n"
- "udot z13.s, z5.b, z0.b[1]\n"
- "udot z14.s, z5.b, z0.b[2]\n"
- "udot z15.s, z5.b, z0.b[3]\n"
- "udot z24.s, z5.b, z1.b[0]\n"
- "udot z25.s, z5.b, z1.b[1]\n"
- "udot z26.s, z5.b, z1.b[2]\n"
- "udot z27.s, z5.b, z1.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "udot z16.s, z6.b, z0.b[0]\n"
- "udot z17.s, z6.b, z0.b[1]\n"
- "udot z18.s, z6.b, z0.b[2]\n"
- "udot z19.s, z6.b, z0.b[3]\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "udot z28.s, z6.b, z1.b[0]\n"
- "udot z29.s, z6.b, z1.b[1]\n"
- "udot z30.s, z6.b, z1.b[2]\n"
- "udot z31.s, z6.b, z1.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "udot z8.s, z4.b, z2.b[0]\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "udot z9.s, z4.b, z2.b[1]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- "udot z10.s, z4.b, z2.b[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "udot z11.s, z4.b, z2.b[3]\n"
- "udot z20.s, z4.b, z3.b[0]\n"
- "udot z21.s, z4.b, z3.b[1]\n"
- "udot z22.s, z4.b, z3.b[2]\n"
- "udot z23.s, z4.b, z3.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "udot z12.s, z5.b, z2.b[0]\n"
- "udot z13.s, z5.b, z2.b[1]\n"
- "udot z14.s, z5.b, z2.b[2]\n"
- "udot z15.s, z5.b, z2.b[3]\n"
- "udot z24.s, z5.b, z3.b[0]\n"
- "udot z25.s, z5.b, z3.b[1]\n"
- "udot z26.s, z5.b, z3.b[2]\n"
- "udot z27.s, z5.b, z3.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z17.s, z6.b, z2.b[1]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z19.s, z6.b, z2.b[3]\n"
- "udot z28.s, z6.b, z3.b[0]\n"
- "udot z29.s, z6.b, z3.b[1]\n"
- "udot z30.s, z6.b, z3.b[2]\n"
- "udot z31.s, z6.b, z3.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "udot z8.s, z4.b, z0.b[0]\n"
- "udot z9.s, z4.b, z0.b[1]\n"
- "udot z10.s, z4.b, z0.b[2]\n"
- "udot z11.s, z4.b, z0.b[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "udot z21.s, z4.b, z1.b[1]\n"
- "udot z22.s, z4.b, z1.b[2]\n"
- "udot z23.s, z4.b, z1.b[3]\n"
- "udot z12.s, z5.b, z0.b[0]\n"
- "udot z13.s, z5.b, z0.b[1]\n"
- "udot z14.s, z5.b, z0.b[2]\n"
- "udot z15.s, z5.b, z0.b[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "udot z24.s, z5.b, z1.b[0]\n"
- "udot z25.s, z5.b, z1.b[1]\n"
- "udot z26.s, z5.b, z1.b[2]\n"
- "udot z27.s, z5.b, z1.b[3]\n"
- "udot z16.s, z6.b, z0.b[0]\n"
- "udot z17.s, z6.b, z0.b[1]\n"
- "udot z18.s, z6.b, z0.b[2]\n"
- "udot z19.s, z6.b, z0.b[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "udot z28.s, z6.b, z1.b[0]\n"
- "udot z29.s, z6.b, z1.b[1]\n"
- "udot z30.s, z6.b, z1.b[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "udot z31.s, z6.b, z1.b[3]\n"
- "b 4f\n"
- "3:\n"
- "udot z8.s, z4.b, z0.b[0]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "udot z9.s, z4.b, z0.b[1]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- "udot z10.s, z4.b, z0.b[2]\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "udot z11.s, z4.b, z0.b[3]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "udot z21.s, z4.b, z1.b[1]\n"
- "udot z22.s, z4.b, z1.b[2]\n"
- "udot z23.s, z4.b, z1.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "udot z12.s, z5.b, z0.b[0]\n"
- "udot z13.s, z5.b, z0.b[1]\n"
- "udot z14.s, z5.b, z0.b[2]\n"
- "udot z15.s, z5.b, z0.b[3]\n"
- "udot z24.s, z5.b, z1.b[0]\n"
- "udot z25.s, z5.b, z1.b[1]\n"
- "udot z26.s, z5.b, z1.b[2]\n"
- "udot z27.s, z5.b, z1.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "udot z16.s, z6.b, z0.b[0]\n"
- "udot z17.s, z6.b, z0.b[1]\n"
- "udot z18.s, z6.b, z0.b[2]\n"
- "udot z19.s, z6.b, z0.b[3]\n"
- "udot z28.s, z6.b, z1.b[0]\n"
- "udot z29.s, z6.b, z1.b[1]\n"
- "udot z30.s, z6.b, z1.b[2]\n"
- "udot z31.s, z6.b, z1.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "udot z8.s, z4.b, z2.b[0]\n"
- "udot z9.s, z4.b, z2.b[1]\n"
- "udot z10.s, z4.b, z2.b[2]\n"
- "udot z11.s, z4.b, z2.b[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- "udot z20.s, z4.b, z3.b[0]\n"
- "udot z21.s, z4.b, z3.b[1]\n"
- "udot z22.s, z4.b, z3.b[2]\n"
- "udot z23.s, z4.b, z3.b[3]\n"
- "udot z12.s, z5.b, z2.b[0]\n"
- "udot z13.s, z5.b, z2.b[1]\n"
- "udot z14.s, z5.b, z2.b[2]\n"
- "udot z15.s, z5.b, z2.b[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "udot z24.s, z5.b, z3.b[0]\n"
- "udot z25.s, z5.b, z3.b[1]\n"
- "udot z26.s, z5.b, z3.b[2]\n"
- "udot z27.s, z5.b, z3.b[3]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z17.s, z6.b, z2.b[1]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z19.s, z6.b, z2.b[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "udot z28.s, z6.b, z3.b[0]\n"
- "udot z29.s, z6.b, z3.b[1]\n"
- "udot z30.s, z6.b, z3.b[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "udot z31.s, z6.b, z3.b[3]\n"
- "4:\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "st1w z25.s, p0, [%[c_ptr]]\n"
- "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "udot z8.s, z4.b, z0.b[0]\n"
+ "udot z11.s, z4.b, z0.b[1]\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #32]\n"
+ "udot z14.s, z4.b, z0.b[2]\n"
+ "udot z17.s, z4.b, z0.b[3]\n"
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel], #48]\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "udot z23.s, z4.b, z1.b[1]\n"
+ "sub x20, x20, #0x2\n"
+ "udot z26.s, z4.b, z1.b[2]\n"
+ "udot z29.s, z4.b, z1.b[3]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
+ "udot z9.s, z5.b, z0.b[0]\n"
+ "udot z12.s, z5.b, z0.b[1]\n"
+ "cmp x20, #0x2\n"
+ "udot z15.s, z5.b, z0.b[2]\n"
+ "udot z18.s, z5.b, z0.b[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "udot z21.s, z5.b, z1.b[0]\n"
+ "udot z24.s, z5.b, z1.b[1]\n"
+ "udot z27.s, z5.b, z1.b[2]\n"
+ "udot z30.s, z5.b, z1.b[3]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z13.s, z6.b, z0.b[1]\n"
+ "udot z16.s, z6.b, z0.b[2]\n"
+ "udot z19.s, z6.b, z0.b[3]\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "udot z22.s, z6.b, z1.b[0]\n"
+ "udot z25.s, z6.b, z1.b[1]\n"
+ "udot z28.s, z6.b, z1.b[2]\n"
+ "udot z31.s, z6.b, z1.b[3]\n"
+ "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #6\n"
+ "udot z8.s, z4.b, z3.b[0]\n"
+ "udot z11.s, z4.b, z3.b[1]\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "udot z14.s, z4.b, z3.b[2]\n"
+ "udot z17.s, z4.b, z3.b[3]\n"
+ "udot z20.s, z4.b, z7.b[0]\n"
+ "udot z23.s, z4.b, z7.b[1]\n"
+ "udot z26.s, z4.b, z7.b[2]\n"
+ "udot z29.s, z4.b, z7.b[3]\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "udot z9.s, z5.b, z3.b[0]\n"
+ "udot z12.s, z5.b, z3.b[1]\n"
+ "udot z15.s, z5.b, z3.b[2]\n"
+ "udot z18.s, z5.b, z3.b[3]\n"
+ "udot z21.s, z5.b, z7.b[0]\n"
+ "udot z24.s, z5.b, z7.b[1]\n"
+ "udot z27.s, z5.b, z7.b[2]\n"
+ "udot z30.s, z5.b, z7.b[3]\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "udot z10.s, z2.b, z3.b[0]\n"
+ "udot z13.s, z2.b, z3.b[1]\n"
+ "udot z16.s, z2.b, z3.b[2]\n"
+ "udot z19.s, z2.b, z3.b[3]\n"
+ "udot z22.s, z2.b, z7.b[0]\n"
+ "udot z25.s, z2.b, z7.b[1]\n"
+ "udot z28.s, z2.b, z7.b[2]\n"
+ "udot z31.s, z2.b, z7.b[3]\n"
+ "ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "udot z8.s, z4.b, z0.b[0]\n"
+ "udot z11.s, z4.b, z0.b[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "udot z14.s, z4.b, z0.b[2]\n"
+ "udot z17.s, z4.b, z0.b[3]\n"
+ "addvl x22, x22, #3\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "udot z23.s, z4.b, z1.b[1]\n"
+ "udot z26.s, z4.b, z1.b[2]\n"
+ "udot z29.s, z4.b, z1.b[3]\n"
+ "udot z9.s, z5.b, z0.b[0]\n"
+ "udot z12.s, z5.b, z0.b[1]\n"
+ "udot z15.s, z5.b, z0.b[2]\n"
+ "udot z18.s, z5.b, z0.b[3]\n"
+ "udot z21.s, z5.b, z1.b[0]\n"
+ "udot z24.s, z5.b, z1.b[1]\n"
+ "udot z27.s, z5.b, z1.b[2]\n"
+ "udot z30.s, z5.b, z1.b[3]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z13.s, z6.b, z0.b[1]\n"
+ "udot z16.s, z6.b, z0.b[2]\n"
+ "udot z19.s, z6.b, z0.b[3]\n"
+ "udot z22.s, z6.b, z1.b[0]\n"
+ "udot z25.s, z6.b, z1.b[1]\n"
+ "udot z28.s, z6.b, z1.b[2]\n"
+ "udot z31.s, z6.b, z1.b[3]\n"
+ "cbz x20, 5f\n"
+ "ld1rqb { z4.b }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #16]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ld1b { z2.b }, p0/Z, [x22]\n"
+ "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "udot z8.s, z2.b, z4.b[0]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "udot z11.s, z2.b, z4.b[1]\n"
+ "udot z14.s, z2.b, z4.b[2]\n"
+ "udot z17.s, z2.b, z4.b[3]\n"
+ "udot z20.s, z2.b, z3.b[0]\n"
+ "addvl x22, x22, #3\n"
+ "udot z23.s, z2.b, z3.b[1]\n"
+ "udot z26.s, z2.b, z3.b[2]\n"
+ "udot z29.s, z2.b, z3.b[3]\n"
+ "udot z9.s, z1.b, z4.b[0]\n"
+ "udot z12.s, z1.b, z4.b[1]\n"
+ "udot z15.s, z1.b, z4.b[2]\n"
+ "udot z18.s, z1.b, z4.b[3]\n"
+ "udot z21.s, z1.b, z3.b[0]\n"
+ "udot z24.s, z1.b, z3.b[1]\n"
+ "udot z27.s, z1.b, z3.b[2]\n"
+ "udot z30.s, z1.b, z3.b[3]\n"
+ "udot z10.s, z0.b, z4.b[0]\n"
+ "udot z13.s, z0.b, z4.b[1]\n"
+ "udot z16.s, z0.b, z4.b[2]\n"
+ "udot z19.s, z0.b, z4.b[3]\n"
+ "udot z22.s, z0.b, z3.b[0]\n"
+ "udot z25.s, z0.b, z3.b[1]\n"
+ "udot z28.s, z0.b, z3.b[2]\n"
+ "udot z31.s, z0.b, z3.b[3]\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x23, x23, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
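
The generic kernel above uses the indexed udot form instead, e.g. "udot z8.s, z4.b, z0.b[0]": the index picks one 4-byte group of the A operand within each 128-bit segment, so a single ld1rqb of 16 A bytes feeds four dot instructions. A scalar model of the indexed form (a sketch, assuming a 256-bit vector length for concreteness):

#include <cstdint>
#include <cstdio>

constexpr int VL_BYTES  = 32; // assumed 256-bit SVE vector length
constexpr int SEG_BYTES = 16; // indexing works per 128-bit segment

// udot zACC.s, zB.b, zA.b[idx], idx in [0, 3].
void udot_indexed(uint32_t acc[VL_BYTES / 4],
                  const uint8_t b[VL_BYTES],
                  const uint8_t a[VL_BYTES], int idx) {
    for (int lane = 0; lane < VL_BYTES / 4; lane++) {
        int seg = (lane * 4) / SEG_BYTES; // segment this lane sits in
        const uint8_t *a_quad = a + seg * SEG_BYTES + idx * 4;
        for (int i = 0; i < 4; i++) {
            acc[lane] += (uint32_t)b[lane * 4 + i] * (uint32_t)a_quad[i];
        }
    }
}

int main() {
    uint8_t a[VL_BYTES], b[VL_BYTES];
    uint32_t acc[VL_BYTES / 4] = {};
    for (int i = 0; i < VL_BYTES; i++) { a[i] = (uint8_t)i; b[i] = 1; }
    udot_indexed(acc, b, a, 0);
    // Lane 0 uses a[0..3] (sum 6); lane 4 sits in segment 1 and uses
    // a[16..19] (sum 70).
    printf("acc[0]=%u acc[4]=%u\n", acc[0], acc[4]);
    return 0;
}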
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
index 4fdaab84bd..067d0bf258 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,52 +22,85 @@
* SOFTWARE.
*/
#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
-#ifdef __ARM_FEATURE_SVE
-
-#include <cstdint>
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const uint8_t *, const uint8_t *, \
+ uint32_t *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void sve_interleaved_u8u32_mmla_8x3VL( ARGLIST );
-class cls_sve_interleaved_u8u32_mmla_8x3VL {
+class cls_sve_interleaved_u8u32_mmla_8x3VL
+{
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
- typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
- static unsigned int out_width()
+ static constexpr unsigned int out_height()
{
- return get_vector_length<uint32_t>() * 3;
+ return 8;
}
- static unsigned int out_height()
+ static unsigned int out_width()
{
- return 8;
+ return get_vector_length<uint32_t>() * 3;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 8;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2> transforms = {};
StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=sve_interleaved_u8u32_mmla_8x3VL;
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 61.97, 4.11, 7.93 };
+ case CPUModel::A510:
+ return { 43.18, 3.57, 2.89 };
+ case CPUModel::V1:
+ return { 123.47, 5.03, 11.76 };
+ }
+ }
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 62.00, 4.08, 0.51 };
+ case CPUModel::A510:
+ return { 38.02, 1.85, 0.28 };
+ case CPUModel::V1:
+ return { 95.28, 7.99, 0.79 };
+ }
+ }
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_u8u32_mmla_8x3VL;
cls_sve_interleaved_u8u32_mmla_8x3VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#undef ARGLIST
+#endif // ARM_COMPUTE_ENABLE_SVE
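
The mmla kernels in this diff accumulate with smmla/ummla, which treat each 128-bit segment of the destination as a 2x2 tile of 32-bit sums and add the product of a 2x8 (row-major) and an 8x2 (column-major) matrix of bytes taken from the two source segments; that tiled output layout is why the store sequences de-interleave rows with uzp1/uzp2 before writing Cpanel. A scalar model of one segment of ummla (a sketch, following the architectural definition rather than library code):

#include <cstdint>
#include <cstdio>

// One 128-bit segment of "ummla zC.s, zA.b, zB.b":
// C(2x2, u32, row-major) += A(2x8, u8, row-major) * B(8x2, u8, column-major).
void ummla_segment(uint32_t c[4], const uint8_t a[16], const uint8_t b[16]) {
    for (int row = 0; row < 2; row++) {
        for (int col = 0; col < 2; col++) {
            uint32_t sum = 0;
            for (int k = 0; k < 8; k++) {
                sum += (uint32_t)a[row * 8 + k] * (uint32_t)b[col * 8 + k];
            }
            c[row * 2 + col] += sum;
        }
    }
}

int main() {
    uint8_t a[16], b[16];
    uint32_t c[4] = {};
    for (int i = 0; i < 16; i++) { a[i] = 1; b[i] = (uint8_t)i; }
    ummla_segment(c, a, b);
    // With all-ones A, each C column is the byte sum of the matching
    // B column: bytes 0..7 -> 28, bytes 8..15 -> 92.
    printf("%u %u %u %u\n", c[0], c[1], c[2], c[3]); // 28 92 28 92
    return 0;
}

Consuming 8 bytes of K per instruction is also why this kernel's k_unroll() is 8, against 4 for the dot-product variant.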
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
index fa08a9d091..28449ea99b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,377 +21,277 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include <cstddef>
#include <cstdint>
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- const uint8_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
+void sve_interleaved_u8u32_mmla_8x3VL(
+ const uint8_t *Apanel,
+ const uint8_t *Bpanel,
+ uint32_t *Cpanel,
+ int ablocks,
+ int bblocks,
+ int K) {
- K /= 8;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ size_t bblocks = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const uint8_t *a_ptr0 = a_ptr;
- const uint8_t *b_ptr = Bpanel;
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
+ ka.bblocks = bblocks;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.b\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "mov z17.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z18.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #4\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
- ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
- ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
- ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
- ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n"
- ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n"
- ".inst 0x45c4980c // ummla z12.s, z0.b, z4.b\n"
- ".inst 0x45c49832 // ummla z18.s, z1.b, z4.b\n"
- ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
- ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #4, MUL VL]\n"
- ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #5, MUL VL]\n"
- ".inst 0x45c69808 // ummla z8.s, z0.b, z6.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- "addvl %[b_ptr], %[b_ptr], #12\n"
- ".inst 0x45c6987a // ummla z26.s, z3.b, z6.b\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c7982f // ummla z15.s, z1.b, z7.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x45c79855 // ummla z21.s, z2.b, z7.b\n"
- ".inst 0x45c7987b // ummla z27.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x45c4980a // ummla z10.s, z0.b, z4.b\n"
- ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
- ".inst 0x45c49856 // ummla z22.s, z2.b, z4.b\n"
- ".inst 0x45c4987c // ummla z28.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
- ".inst 0x45c59831 // ummla z17.s, z1.b, z5.b\n"
- ".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
- ".inst 0x45c5987d // ummla z29.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69832 // ummla z18.s, z1.b, z6.b\n"
- ".inst 0x45c69858 // ummla z24.s, z2.b, z6.b\n"
- ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
- ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
- ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
- ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
- ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
- ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n"
- ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n"
- ".inst 0x45c4980c // ummla z12.s, z0.b, z4.b\n"
- ".inst 0x45c49832 // ummla z18.s, z1.b, z4.b\n"
- ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
- ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #4, MUL VL]\n"
- ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #5, MUL VL]\n"
- ".inst 0x45c69808 // ummla z8.s, z0.b, z6.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- "addvl %[b_ptr], %[b_ptr], #14\n"
- ".inst 0x45c6987a // ummla z26.s, z3.b, z6.b\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c7982f // ummla z15.s, z1.b, z7.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
- ".inst 0x45c79855 // ummla z21.s, z2.b, z7.b\n"
- ".inst 0x45c7987b // ummla z27.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-7, MUL VL]\n"
- ".inst 0x45c4980a // ummla z10.s, z0.b, z4.b\n"
- ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
- ".inst 0x45c49856 // ummla z22.s, z2.b, z4.b\n"
- ".inst 0x45c4987c // ummla z28.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
- ".inst 0x45c59831 // ummla z17.s, z1.b, z5.b\n"
- ".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
- ".inst 0x45c5987d // ummla z29.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69832 // ummla z18.s, z1.b, z6.b\n"
- ".inst 0x45c69858 // ummla z24.s, z2.b, z6.b\n"
- ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
- ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
- ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
- ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
- ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
- ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n"
- "uzp1 z6.d, z14.d, z15.d\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
- ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n"
- ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n"
- ".inst 0x45c4980c // ummla z12.s, z0.b, z4.b\n"
- "uzp1 z7.d, z16.d, z17.d\n"
- ".inst 0x45c49832 // ummla z18.s, z1.b, z4.b\n"
- ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
- ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
- "uzp2 z4.d, z10.d, z11.d\n"
- ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n"
- "uzp1 z0.d, z8.d, z9.d\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- "uzp1 z1.d, z10.d, z11.d\n"
- ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- "uzp1 z2.d, z12.d, z13.d\n"
- "uzp1 z0.d, z18.d, z19.d\n"
- ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "uzp2 z3.d, z8.d, z9.d\n"
- "uzp2 z5.d, z12.d, z13.d\n"
- "uzp2 z1.d, z14.d, z15.d\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
- "addvl %[b_ptr], %[b_ptr], #8\n"
- ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
- ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
- ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
- ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-7, MUL VL]\n"
- ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
- ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n"
- ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x45c4980c // ummla z12.s, z0.b, z4.b\n"
- ".inst 0x45c49832 // ummla z18.s, z1.b, z4.b\n"
- ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
- ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x45c69808 // ummla z8.s, z0.b, z6.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c6987a // ummla z26.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x45c7982f // ummla z15.s, z1.b, z7.b\n"
- ".inst 0x45c79855 // ummla z21.s, z2.b, z7.b\n"
- ".inst 0x45c7987b // ummla z27.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x45c4980a // ummla z10.s, z0.b, z4.b\n"
- ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
- ".inst 0x45c49856 // ummla z22.s, z2.b, z4.b\n"
- ".inst 0x45c4987c // ummla z28.s, z3.b, z4.b\n"
- ".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
- ".inst 0x45c59831 // ummla z17.s, z1.b, z5.b\n"
- ".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
- ".inst 0x45c5987d // ummla z29.s, z3.b, z5.b\n"
- "uzp2 z4.d, z10.d, z11.d\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69832 // ummla z18.s, z1.b, z6.b\n"
- ".inst 0x45c69858 // ummla z24.s, z2.b, z6.b\n"
- ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
- "uzp1 z6.d, z14.d, z15.d\n"
- ".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n"
- "uzp1 z0.d, z8.d, z9.d\n"
- ".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n"
- "uzp1 z1.d, z10.d, z11.d\n"
- "uzp2 z5.d, z12.d, z13.d\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
- "uzp1 z2.d, z12.d, z13.d\n"
- "uzp1 z0.d, z18.d, z19.d\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "uzp2 z1.d, z14.d, z15.d\n"
- ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n"
- "uzp2 z3.d, z8.d, z9.d\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "uzp1 z7.d, z16.d, z17.d\n"
- "4:\n"
- "uzp2 z2.d, z16.d, z17.d\n"
- "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "uzp2 z3.d, z18.d, z19.d\n"
- "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "uzp1 z4.d, z20.d, z21.d\n"
- "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "uzp1 z5.d, z22.d, z23.d\n"
- "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "uzp1 z6.d, z24.d, z25.d\n"
- "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "uzp2 z7.d, z20.d, z21.d\n"
- "st1w z0.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "uzp2 z0.d, z22.d, z23.d\n"
- "st1w z1.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "uzp2 z1.d, z24.d, z25.d\n"
- "st1w z2.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "uzp1 z2.d, z26.d, z27.d\n"
- "st1w z3.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "uzp1 z3.d, z28.d, z29.d\n"
- "st1w z4.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "uzp1 z4.d, z30.d, z31.d\n"
- "st1w z5.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "uzp2 z5.d, z26.d, z27.d\n"
- "st1w z6.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "uzp2 z6.d, z28.d, z29.d\n"
- "st1w z7.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "uzp2 z7.d, z30.d, z31.d\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "mov x21, %x[Apanel]\n"
+ "2:" // Width loop
+ "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x20, #0x2\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "addvl x22, x22, #2\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1rqb { z6.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
+ ".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
+ ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
+ ".inst 0x45c59831 // ummla z17.s, z1.b, z5.b\n"
+ "ld1b { z7.b }, p0/Z, [x22]\n"
+ ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
+ ".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x45c498da // ummla z26.s, z6.b, z4.b\n"
+ ".inst 0x45c598dd // ummla z29.s, z6.b, z5.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ ".inst 0x45c3980c // ummla z12.s, z0.b, z3.b\n"
+ ".inst 0x45c7982f // ummla z15.s, z1.b, z7.b\n"
+ ".inst 0x45c39832 // ummla z18.s, z1.b, z3.b\n"
+ "sub x20, x20, #0x2\n"
+ ".inst 0x45c79855 // ummla z21.s, z2.b, z7.b\n"
+ ".inst 0x45c39858 // ummla z24.s, z2.b, z3.b\n"
+ "cmp x20, #0x2\n"
+ ".inst 0x45c798db // ummla z27.s, z6.b, z7.b\n"
+ ".inst 0x45c398de // ummla z30.s, z6.b, z3.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #4, MUL VL]\n"
+ ".inst 0x45c5980a // ummla z10.s, z0.b, z5.b\n"
+ ".inst 0x45c4980d // ummla z13.s, z0.b, z4.b\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #16]\n"
+ ".inst 0x45c59830 // ummla z16.s, z1.b, z5.b\n"
+ ".inst 0x45c49833 // ummla z19.s, z1.b, z4.b\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #32]\n"
+ ".inst 0x45c59856 // ummla z22.s, z2.b, z5.b\n"
+ ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n"
+ "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x45c598dc // ummla z28.s, z6.b, z5.b\n"
+ ".inst 0x45c498df // ummla z31.s, z6.b, z4.b\n"
+ "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #48]\n"
+ "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #64]\n"
+ "ld1b { z2.b }, p0/Z, [x22, #6, MUL VL]\n"
+ ".inst 0x45c39808 // ummla z8.s, z0.b, z3.b\n"
+ "ld1b { z4.b }, p0/Z, [x22, #7, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c3982e // ummla z14.s, z1.b, z3.b\n"
+ ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
+ ".inst 0x45c398b4 // ummla z20.s, z5.b, z3.b\n"
+ ".inst 0x45c798b7 // ummla z23.s, z5.b, z7.b\n"
+ ".inst 0x45c398da // ummla z26.s, z6.b, z3.b\n"
+ ".inst 0x45c798dd // ummla z29.s, z6.b, z7.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #-8, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [x22, #-7, MUL VL]\n"
+ ".inst 0x45c29809 // ummla z9.s, z0.b, z2.b\n"
+ ".inst 0x45c4980c // ummla z12.s, z0.b, z4.b\n"
+ ".inst 0x45c2982f // ummla z15.s, z1.b, z2.b\n"
+ ".inst 0x45c49832 // ummla z18.s, z1.b, z4.b\n"
+ ".inst 0x45c298b5 // ummla z21.s, z5.b, z2.b\n"
+ ".inst 0x45c498b8 // ummla z24.s, z5.b, z4.b\n"
+ ".inst 0x45c298db // ummla z27.s, z6.b, z2.b\n"
+ ".inst 0x45c498de // ummla z30.s, z6.b, z4.b\n"
+ "ld1b { z4.b }, p0/Z, [x22, #-6, MUL VL]\n"
+ ".inst 0x45c3980a // ummla z10.s, z0.b, z3.b\n"
+ ".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #80]\n"
+ ".inst 0x45c39830 // ummla z16.s, z1.b, z3.b\n"
+ ".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #96]\n"
+ ".inst 0x45c398b6 // ummla z22.s, z5.b, z3.b\n"
+ ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #-5, MUL VL]\n"
+ ".inst 0x45c398dc // ummla z28.s, z6.b, z3.b\n"
+ ".inst 0x45c798df // ummla z31.s, z6.b, z7.b\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "addvl x22, x22, #-4\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
+ ".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
+ ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
+ ".inst 0x45c59831 // ummla z17.s, z1.b, z5.b\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
+ ".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x45c498fa // ummla z26.s, z7.b, z4.b\n"
+ ".inst 0x45c598fd // ummla z29.s, z7.b, z5.b\n"
+ "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x45c69809 // ummla z9.s, z0.b, z6.b\n"
+ ".inst 0x45c3980c // ummla z12.s, z0.b, z3.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ ".inst 0x45c39832 // ummla z18.s, z1.b, z3.b\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
+ ".inst 0x45c39858 // ummla z24.s, z2.b, z3.b\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0x45c698fb // ummla z27.s, z7.b, z6.b\n"
+ ".inst 0x45c398fe // ummla z30.s, z7.b, z3.b\n"
+ ".inst 0x45c5980a // ummla z10.s, z0.b, z5.b\n"
+ ".inst 0x45c4980d // ummla z13.s, z0.b, z4.b\n"
+ ".inst 0x45c59830 // ummla z16.s, z1.b, z5.b\n"
+ ".inst 0x45c49833 // ummla z19.s, z1.b, z4.b\n"
+ ".inst 0x45c59856 // ummla z22.s, z2.b, z5.b\n"
+ ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n"
+ ".inst 0x45c598fc // ummla z28.s, z7.b, z5.b\n"
+ ".inst 0x45c498ff // ummla z31.s, z7.b, z4.b\n"
+ "cbz x20, 5f\n"
+ "ld1b { z1.b }, p0/Z, [x22]\n"
+ "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x45c198e8 // ummla z8.s, z7.b, z1.b\n"
+ "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x45c098eb // ummla z11.s, z7.b, z0.b\n"
+ "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqb { z4.b }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x45c198ce // ummla z14.s, z6.b, z1.b\n"
+ ".inst 0x45c098d1 // ummla z17.s, z6.b, z0.b\n"
+ ".inst 0x45c198b4 // ummla z20.s, z5.b, z1.b\n"
+ "ld1b { z3.b }, p0/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x45c098b7 // ummla z23.s, z5.b, z0.b\n"
+ ".inst 0x45c1989a // ummla z26.s, z4.b, z1.b\n"
+ "ld1b { z2.b }, p0/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n"
+ "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z0.b }, p0/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x45c398e9 // ummla z9.s, z7.b, z3.b\n"
+ ".inst 0x45c298ec // ummla z12.s, z7.b, z2.b\n"
+ "addvl x22, x22, #6\n"
+ ".inst 0x45c398cf // ummla z15.s, z6.b, z3.b\n"
+ ".inst 0x45c298d2 // ummla z18.s, z6.b, z2.b\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x45c398b5 // ummla z21.s, z5.b, z3.b\n"
+ ".inst 0x45c298b8 // ummla z24.s, z5.b, z2.b\n"
+ ".inst 0x45c3989b // ummla z27.s, z4.b, z3.b\n"
+ ".inst 0x45c2989e // ummla z30.s, z4.b, z2.b\n"
+ ".inst 0x45c198ea // ummla z10.s, z7.b, z1.b\n"
+ ".inst 0x45c098ed // ummla z13.s, z7.b, z0.b\n"
+ ".inst 0x45c198d0 // ummla z16.s, z6.b, z1.b\n"
+ ".inst 0x45c098d3 // ummla z19.s, z6.b, z0.b\n"
+ ".inst 0x45c198b6 // ummla z22.s, z5.b, z1.b\n"
+ ".inst 0x45c098b9 // ummla z25.s, z5.b, z0.b\n"
+ ".inst 0x45c1989c // ummla z28.s, z4.b, z1.b\n"
+ ".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n"
+ "5:" // multiply loop done
+ "uzp1 z0.d, z8.d, z11.d\n"
+ "uzp2 z8.d, z8.d, z11.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z0.d, z9.d, z12.d\n"
+ "uzp2 z9.d, z9.d, z12.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "uzp1 z0.d, z10.d, z13.d\n"
+ "uzp2 z10.d, z10.d, z13.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "uzp1 z0.d, z14.d, z17.d\n"
+ "uzp2 z14.d, z14.d, z17.d\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "uzp1 z1.d, z15.d, z18.d\n"
+ "subs x23, x23, #0x1\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "uzp2 z15.d, z15.d, z18.d\n"
+ "uzp1 z17.d, z16.d, z19.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "uzp2 z16.d, z16.d, z19.d\n"
+ "uzp1 z0.d, z20.d, z23.d\n"
+ "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "uzp2 z20.d, z20.d, z23.d\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "uzp1 z23.d, z21.d, z24.d\n"
+ "uzp2 z21.d, z21.d, z24.d\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "uzp1 z19.d, z22.d, z25.d\n"
+ "uzp2 z22.d, z22.d, z25.d\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "uzp1 z18.d, z26.d, z29.d\n"
+ "uzp2 z26.d, z26.d, z29.d\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "uzp1 z17.d, z27.d, z30.d\n"
+ "uzp2 z27.d, z27.d, z30.d\n"
+ "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "uzp1 z16.d, z28.d, z31.d\n"
+ "uzp2 z28.d, z28.d, z31.d\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
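
The replacement kernel above reaches the UMMLA matrix-multiply instructions through hand-encoded .inst words. Per 128-bit vector segment, each ummla multiplies a 2x8 matrix of unsigned bytes from its first source by an 8x2 matrix from its second source and accumulates the widened 2x2 int32 product into the destination; the uzp1.d/uzp2.d epilogue then de-interleaves the paired tile rows before the st1w stores. A minimal scalar sketch of the per-segment semantics (an illustrative helper, not library code):

#include <cstdint>

// One UMMLA, one 128-bit segment. The destination holds a row-major
// 2x2 int32 tile; the first source holds a row-major 2x8 u8 matrix;
// the second source holds the 8x2 u8 matrix transposed, i.e. bytes
// 0-7 are column 0 and bytes 8-15 are column 1.
void ummla_segment(uint32_t acc[2][2], const uint8_t a[2][8], const uint8_t bT[2][8])
{
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
            for (int k = 0; k < 8; ++k)
                acc[i][j] += static_cast<uint32_t>(a[i][k]) * static_cast<uint32_t>(bT[j][k]);
}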
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp
deleted file mode 100644
index 2097d76a54..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_smallK_hybrid_fp32_mla_8x1VL(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-
-class cls_sve_smallK_hybrid_fp32_mla_8x1VL
-{
-public:
- typedef float operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
-
- static unsigned int out_width()
- {
- return get_vector_length<float>() * 1;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 1;
- }
-
- static constexpr bool supports_accumulate()
- {
- return false;
- }
-
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 8, 1, 1> transforms = {};
-
- // Default to the generic kernel
- kern_type kernel=sve_smallK_hybrid_fp32_mla_8x1VL;
-
- cls_sve_smallK_hybrid_fp32_mla_8x1VL(const CPUInfo *)
- {
-
- }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
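
The deleted header above declared an 8-row by one-vector-of-floats blocking (out_height() == 8, out_width() == one SVE vector of floats), with bias and activation support but no accumulation. For orientation while reading the deleted assembly below, a plain C++ sketch of the per-element computation it performs, inferred from the deleted signature (a hypothetical reference, not the removed code; the real kernel reads B pre-packed one vector per k step, a row-major B is assumed here for clarity):

#include <algorithm>

// C[m][n] = clamp(bias[n] + sum_k A[m][k] * B[k][n], minval, maxval)
void smallK_hybrid_fp32_reference(const float *A, int lda, const float *B,
                                  float *C, int ldc, int M, int N, int K,
                                  const float *bias, float minval, float maxval)
{
    for (int m = 0; m < M; ++m)
        for (int n = 0; n < N; ++n)
        {
            float acc = bias ? bias[n] : 0.0f;
            for (int k = 0; k < K; ++k)
                acc += A[m * lda + k] * B[k * N + n];
            C[m * ldc + n] = std::min(std::max(acc, minval), maxval);
        }
}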
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp
deleted file mode 100644
index e07cfa8218..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp
+++ /dev/null
@@ -1,18807 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_smallK_hybrid_fp32_mla_8x1VL(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
- const long loops_count = iceildiv(N, (int)get_vector_length<float>()) - 1;
- const long ldab = lda * sizeof(float);
- const long ldcb = ldc * sizeof(float);
- const long odd_depth = (K % 4) ? (K % 4) : 4;
- const long last_width = N - (loops_count * get_vector_length<float>());
- float nullbias[64];
- if (!bias) {
- memset(nullbias, 0, (1 * get_vector_length<float>() * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- for (int y0=0; y0<M; y0+=8) {
- long loops = loops_count;
- long oob_rows = std::max(8 - (M-y0), 0);
- long temp = 0;
- const float *b_ptr0 = B;
- const float *biasptr = bias ? bias : nullbias;
- const uint64_t biasinc = bias ? get_vector_length<float>() * 1*sizeof(float) : 0;
- const float *a_ptr0 = A + (y0 * lda);
-
- float *c_ptr0 = C + (y0 * ldc);
-
- switch(K) {
- case 1:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7]\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "mov z26.d, z24.d\n"
- "mov z27.d, z24.d\n"
- "mov z28.d, z24.d\n"
- "mov z29.d, z24.d\n"
- "mov z30.d, z24.d\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z25.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z26.d, z24.d\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z27.d, z24.d\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.d, z24.d\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z25.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z26.d, z24.d\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z27.d, z24.d\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.d, z24.d\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "mov z25.d, z24.d\n"
- "mov z26.d, z24.d\n"
- "mov z27.d, z24.d\n"
- "mov z28.d, z24.d\n"
- "mov z29.d, z24.d\n"
- "mov z30.d, z24.d\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
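
The K=1 case above handles ragged edges purely with predication: whilelt p6.s masks the partial depth (odd_depth) when loading the A quadwords, and whilelt p0.s masks the final partial vector of output columns (last_width) on the edge loads and stores, so there is no scalar tail loop. The same idiom in ACLE intrinsics, applied to the fmax/fmin clamp seen in the epilogue (a sketch, assuming <arm_sve.h> and SVE codegen such as -march=armv8-a+sve):

#include <arm_sve.h>

// Clamp n floats in place with no scalar tail: the whilelt predicate
// simply goes partial on the last vector.
void clamp_inplace(float *x, int64_t n, float lo, float hi)
{
    for (int64_t i = 0; i < n; i += static_cast<int64_t>(svcntw()))
    {
        svbool_t pg = svwhilelt_b32(i, n);  // active lanes: i + lane < n
        svfloat32_t v = svld1(pg, &x[i]);   // inactive lanes read as zero
        v = svmax_m(pg, v, svdup_f32(lo));  // like the fmax against z22 above
        v = svmin_m(pg, v, svdup_f32(hi));  // like the fmin against z23 above
        svst1(pg, &x[i], v);                // inactive lanes left untouched
    }
}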
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "mov z26.d, z24.d\n"
- "mov z27.d, z24.d\n"
- "mov z28.d, z24.d\n"
- "mov z29.d, z24.d\n"
- "mov z30.d, z24.d\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z25.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z26.d, z24.d\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z27.d, z24.d\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z28.d, z24.d\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "mov z30.d, z24.d\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z25.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z26.d, z24.d\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z27.d, z24.d\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.d, z24.d\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "mov z25.d, z24.d\n"
- "mov z26.d, z24.d\n"
- "mov z27.d, z24.d\n"
- "mov z28.d, z24.d\n"
- "mov z29.d, z24.d\n"
- "mov z30.d, z24.d\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
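
The higher-K cases differ from K=1 only in how many packed B vectors are loaded per step and how many indexed fmla instructions follow: fmla z24.s, z17.s, z0.s[1] multiplies an entire B vector by the single broadcast element z0.s[1] of the quadword-replicated A row loaded by ld1rqw. The matching ACLE idiom is svmla_lane, which indexes within 128-bit segments just as the assembly does (a sketch under those assumptions, not the removed kernel):

#include <arm_sve.h>

// Two k-steps of one output row: each packed B vector is scaled by one
// lane of the ld1rq-replicated A quadword and accumulated.
svfloat32_t two_k_steps(svfloat32_t acc, svfloat32_t a_quad, const float *b_packed, svbool_t pg)
{
    svfloat32_t b0 = svld1(pg, b_packed);         // like "ld1w z16.s, ... [%[b_ptr0]]"
    acc = svmla_lane(acc, b0, a_quad, 0);         // like "fmla z24.s, z16.s, z0.s[0]"
    svfloat32_t b1 = svld1_vnum(pg, b_packed, 1); // like "... [%[b_ptr0], #1, MUL VL]"
    acc = svmla_lane(acc, b1, a_quad, 1);         // like "fmla z24.s, z17.s, z0.s[1]"
    return acc;
}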
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "mov z26.d, z24.d\n"
- "mov z27.d, z24.d\n"
- "mov z28.d, z24.d\n"
- "mov z29.d, z24.d\n"
- "mov z30.d, z24.d\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z25.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z26.d, z24.d\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z27.d, z24.d\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z28.d, z24.d\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "mov z30.d, z24.d\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z25.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z26.d, z24.d\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z27.d, z24.d\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z28.d, z24.d\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "mov z30.d, z24.d\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "mov z25.d, z24.d\n"
- "mov z26.d, z24.d\n"
- "mov z27.d, z24.d\n"
- "mov z28.d, z24.d\n"
- "mov z29.d, z24.d\n"
- "mov z30.d, z24.d\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "mov z26.d, z24.d\n"
- "mov z27.d, z24.d\n"
- "mov z28.d, z24.d\n"
- "mov z29.d, z24.d\n"
- "mov z30.d, z24.d\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z25.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z26.d, z24.d\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z27.d, z24.d\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.d, z24.d\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z25.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z26.d, z24.d\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z27.d, z24.d\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z28.d, z24.d\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "mov z30.d, z24.d\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "mov z25.d, z24.d\n"
- "mov z26.d, z24.d\n"
- "mov z27.d, z24.d\n"
- "mov z28.d, z24.d\n"
- "mov z29.d, z24.d\n"
- "mov z30.d, z24.d\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 5:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z27.d, z24.d\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z27.d, z24.d\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
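Each block's prologue handles a short final row strip the same way: the `cbz`/`subs`/`b.eq` chain re-points out-of-bound row pointers at row 0, so an 8-row tile never reads or writes past the end of A or C; the aliased rows simply recompute and restore row 0's output. A minimal sketch of that aliasing, with hypothetical array names:

#include <cstddef>

// Equivalent of the "cbz %[oob_rows], 1f" / "subs ... b.eq 1f" chain:
// alias the bottom `oob_rows` pointers back to row 0.
static void alias_oob_rows(const float *a_ptr[8], float *c_ptr[8], size_t oob_rows)
{
    for (int r = 7; r >= 1 && oob_rows > 0; r--, oob_rows--)
    {
        a_ptr[r] = a_ptr[0];   // "add a_ptrN, %[a_ptr0], #0x0"
        c_ptr[r] = c_ptr[0];   // "add c_ptrN, %[c_ptr0], #0x0"
    }
}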
- case 6:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z27.d, z24.d\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z27.d, z24.d\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
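Within every removed block, the main loop at label `4:` is software-pipelined: the clamp-and-store of one tile is interleaved with the B reloads, bias re-broadcast, and first fmla steps of the next, so stores never stall the multiply chain. A compilable sketch of the control shape only — the callables stand in for the fmla chain and the fmax/fmin+st1w sequence, and the zero-iteration path via label `2:` is omitted:

#include <cstddef>

template <typename Compute, typename ClampStore, typename Reinit>
static void pipelined_main_loop(size_t loops, Compute compute,
                                ClampStore clamp_store, Reinit reinit)
{
    compute();            // warm-up tile issued before "b.eq 3f"
    while (loops--)       // "subs %[loops], %[loops], #0x1" ... "b.ne 4b"
    {
        clamp_store();    // drain the previous tile (full-width, p7 stores)
        reinit();         // ld1w z24 from [biasptr] + mov z25..z31
        compute();        // next tile's fmla chain
    }
    clamp_store();        // "3:" drains the last full tile,
    reinit();             // then issues the tail tile
    compute();
    clamp_store();        // "5:" final clamp + partial-width (p0) stores
}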
- case 7:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z27.d, z24.d\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
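The tails are handled with governing predicates rather than scalar remainder loops: p7 is all-true (`ptrue p7.s`), p6 masks the odd-depth reload of the A quadwords, and p0 masks the last partial column block. A sketch of the p0-predicated `st1w`, with assumed names:

#include <cstddef>

// Equivalent of "whilelt p0.s, %[temp], %[last_width]" followed by
// "st1w zN.s, p0, [c_ptrN]": only lanes below last_width are written.
static void predicated_store(float *c_row, const float *acc_row,
                             size_t vector_lanes, size_t last_width)
{
    for (size_t j = 0; j < vector_lanes; j++)
        if (j < last_width)        // p0 active lanes
            c_row[j] = acc_row[j];
}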
- case 8:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 9:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 10:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 11:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 12:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 13:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
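For reference, a minimal scalar sketch of what each of these deleted micro-kernel bodies computes. The 8-row blocking, the shared bias vector, and the min/max clamp are taken from the asm above; every other name (the width-major B panel layout, lda/ldc, depth) is illustrative only, not from the diff:

    #include <algorithm>
    #include <cstddef>

    // Scalar reference for one 8-row output block of the deleted SVE kernel:
    // all accumulator rows start from the same bias vector (ld1w + mov z25..z31),
    // each depth step adds a B vector times a broadcast A element
    // (fmla z, zB, zA.s[i]), and the result is clamped against *minptr/*maxptr
    // (fmax/fmin) before the predicated st1w stores.
    void block_reference(const float *A, std::size_t lda, const float *B,
                         float *C, std::size_t ldc, const float *bias,
                         std::size_t depth, std::size_t width,
                         float minv, float maxv)
    {
        for (std::size_t r = 0; r < 8; ++r)
            for (std::size_t w = 0; w < width; ++w)
            {
                float acc = bias[w];
                for (std::size_t k = 0; k < depth; ++k)
                    acc += B[k * width + w] * A[r * lda + k];
                C[r * ldc + w] = std::min(std::max(acc, minv), maxv);
            }
    }

The asm unrolls the k loop sixteen deep per iteration, double-buffering the B vectors through z16..z23 while the A quadwords cycle through z0..z7.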
- case 14:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
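Two notes on the per-case preamble visible in the block above. First, the `oob_rows` chain re-points the a/c pointers of each out-of-range trailing row at row 0, so every load and store stays in bounds; those rows then recompute row 0's result and rewrite the same values, which is harmless. Second, each case builds three predicates before the main loop; a minimal ACLE intrinsics sketch of that setup, assuming %[temp] holds zero on entry (the asm only ever compares it against the two lengths):

    #include <arm_sve.h>
    #include <cstdint>

    // p7: all lanes active, used for the full-vector loads/stores.
    // p6: guards the final partial quadword of A along depth (the #0x30 loads).
    // p0: guards the final partial vector along width (bias load, last st1w).
    void make_predicates(std::int32_t odd_depth, std::int32_t last_width,
                         svbool_t &p7, svbool_t &p6, svbool_t &p0)
    {
        p7 = svptrue_b32();
        p6 = svwhilelt_b32_s32(0, odd_depth);
        p0 = svwhilelt_b32_s32(0, last_width);
    }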
- case 15:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
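
Each deleted case body above follows the same shape: eight accumulator rows (z24-z31) are seeded from a shared bias vector, the inner loop broadcasts 128-bit quads of A with ld1rqw and multiplies them lane-by-lane against B vectors with indexed fmla, and the epilogue clamps against minptr/maxptr with fmax/fmin before the st1w stores. As rough orientation only — this is an illustrative sketch, not the deleted kernel — the same computation for a single output row, stepping K four lanes at a time, looks like this in SVE ACLE intrinsics (all function and variable names here are invented for illustration):

    #include <arm_sve.h>

    /* Sketch of one accumulator row of the kernels above; the real code
     * unrolls this across eight A rows and several depth quads, rotating
     * the B loads between fmla groups so they hide behind the arithmetic. */
    void sgemm_row_sketch(const float *a_row, const float *b_panel,
                          const float *bias, float *c_row,
                          int K, float clamp_min, float clamp_max)
    {
        svbool_t pg = svptrue_b32();
        svfloat32_t acc = svld1_f32(pg, bias);      /* seed from the bias row */
        for (int k = 0; k + 4 <= K; k += 4) {
            /* ld1rqw: replicate a 128-bit quad of A into every quadword lane */
            svfloat32_t a = svld1rq_f32(pg, a_row + k);
            /* one indexed fmla per A lane against consecutive B vectors */
            acc = svmla_lane_f32(acc, svld1_f32(pg, b_panel + 0 * svcntw()), a, 0);
            acc = svmla_lane_f32(acc, svld1_f32(pg, b_panel + 1 * svcntw()), a, 1);
            acc = svmla_lane_f32(acc, svld1_f32(pg, b_panel + 2 * svcntw()), a, 2);
            acc = svmla_lane_f32(acc, svld1_f32(pg, b_panel + 3 * svcntw()), a, 3);
            b_panel += 4 * svcntw();
        }
        /* activation clamp, matching the fmax/fmin pair before each st1w;
         * a ragged K tail would use a whilelt predicate like p6 in the asm */
        acc = svmax_f32_m(pg, acc, svdup_n_f32(clamp_min));
        acc = svmin_f32_m(pg, acc, svdup_n_f32(clamp_max));
        svst1_f32(pg, c_row, acc);
    }
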
- case 16:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
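
Before each kernel body runs, the prologue at label 1 handles partial tiles: "cbz %[oob_rows], 1f" skips the fix-up entirely, otherwise each surviving subs/b.eq step redirects the highest remaining a/c pointer pair back to row 0, and "whilelt p6.s"/"whilelt p0.s" then mask the ragged depth (odd_depth) and width (last_width) tails. Because an aliased row reads row 0's A data and starts from the same bias vector, it recomputes row 0's result, so its duplicate store appears harmless by construction. A minimal C sketch of the aliasing, with hypothetical names:

    /* Sketch of the oob_rows prologue: rows past the end of the output
     * block point back at row 0, so their loads and stores stay in
     * bounds while the redundant work reproduces row 0's result. */
    void alias_oob_rows(const float *a_ptr[8], float *c_ptr[8], int oob_rows)
    {
        for (int r = 7; r >= 1 && oob_rows > 0; --r, --oob_rows) {
            a_ptr[r] = a_ptr[0];
            c_ptr[r] = c_ptr[0];
        }
    }
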
- case 17:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 18:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 19:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 20:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 21:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
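(Editor's reading aid, not part of the diff: each deleted `case` above is one hand-interleaved SVE inline-asm GEMM variant. It seeds eight row accumulators z24-z31 from the bias vector (ld1w from %[biasptr], then mov z25..z31 from z24), runs an fmla chain of broadcast A elements against vector B columns, and clamps the result to the [%[minptr], %[maxptr]] range before the st1w stores. The following is a minimal scalar sketch in C of what one variant computes, with the vectorisation and software pipelining stripped out; every name in it — sgemm_bias_clamp_ref, rows, width, depth, bias, minval, maxval — is an illustrative assumption, not a symbol from this source.)

#include <math.h>

/* Hedged reference model, not part of the diff: the computation one
 * kernel variant performs, written as plain scalar C. */
static void sgemm_bias_clamp_ref(const float *A, int lda,
                                 const float *B, int ldb,
                                 float *C, int ldc,
                                 const float *bias,
                                 int rows, int width, int depth,
                                 float minval, float maxval)
{
    for (int r = 0; r < rows; r++)          /* the asm unrolls 8 rows at once */
    {
        for (int c = 0; c < width; c++)     /* one SVE vector of columns per step */
        {
            float acc = bias[c];            /* accumulators seeded from the bias row */
            for (int k = 0; k < depth; k++)
            {
                acc += A[r * lda + k] * B[k * ldb + c];  /* the fmla chain */
            }
            acc = fmaxf(acc, minval);       /* "fmax ..." against [%[minptr]] */
            acc = fminf(acc, maxval);       /* "fmin ..." against [%[maxptr]] */
            C[r * ldc + c] = acc;
        }
    }
}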
- case 22:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 23:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- default:
- case 24:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.s\n"
- "whilelt p6.s, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z31.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p7/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z25.d, z24.d\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.d, z24.d\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.d, z24.d\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.d, z24.d\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.d, z24.d\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "b 5f\n"
- "2:\n"
- "ld1w z24.s, p0/z, [%[biasptr]]\n"
- "add %[biasptr], %[biasptr], %[biasinc]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z25.d, z24.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z26.d, z24.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z27.d, z24.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "mov z28.d, z24.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "mov z29.d, z24.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "mov z30.d, z24.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z31.d, z24.d\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x10]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x10]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x10]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x10]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x20]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x20]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x20]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x20]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "ld1w z16.s, p7/z, [%[b_ptr0]]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "ld1w z17.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "ld1w z18.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x30]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x30]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x30]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x30]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "ld1w z19.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "ld1w z20.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "ld1w z21.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "ld1w z22.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #0x40]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #0x40]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #0x40]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "ld1w z23.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z24.s, z16.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #0x40]\n"
- "fmla z25.s, z16.s, z1.s[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z26.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z3.s[0]\n"
- "fmla z28.s, z16.s, z4.s[0]\n"
- "fmla z29.s, z16.s, z5.s[0]\n"
- "fmla z30.s, z16.s, z6.s[0]\n"
- "fmla z31.s, z16.s, z7.s[0]\n"
- "fmla z24.s, z17.s, z0.s[1]\n"
- "fmla z25.s, z17.s, z1.s[1]\n"
- "fmla z26.s, z17.s, z2.s[1]\n"
- "fmla z27.s, z17.s, z3.s[1]\n"
- "fmla z28.s, z17.s, z4.s[1]\n"
- "fmla z29.s, z17.s, z5.s[1]\n"
- "fmla z30.s, z17.s, z6.s[1]\n"
- "fmla z31.s, z17.s, z7.s[1]\n"
- "fmla z24.s, z18.s, z0.s[2]\n"
- "fmla z25.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z2.s[2]\n"
- "fmla z27.s, z18.s, z3.s[2]\n"
- "fmla z28.s, z18.s, z4.s[2]\n"
- "fmla z29.s, z18.s, z5.s[2]\n"
- "fmla z30.s, z18.s, z6.s[2]\n"
- "fmla z31.s, z18.s, z7.s[2]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
- "fmla z25.s, z19.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
- "fmla z26.s, z19.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
- "fmla z27.s, z19.s, z3.s[3]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
- "fmla z28.s, z19.s, z4.s[3]\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x50]\n"
- "fmla z29.s, z19.s, z5.s[3]\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x50]\n"
- "fmla z30.s, z19.s, z6.s[3]\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x50]\n"
- "fmla z31.s, z19.s, z7.s[3]\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x50]\n"
- "fmla z24.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z1.s[0]\n"
- "fmla z26.s, z20.s, z2.s[0]\n"
- "fmla z27.s, z20.s, z3.s[0]\n"
- "fmla z28.s, z20.s, z4.s[0]\n"
- "fmla z29.s, z20.s, z5.s[0]\n"
- "fmla z30.s, z20.s, z6.s[0]\n"
- "fmla z31.s, z20.s, z7.s[0]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "fmla z25.s, z21.s, z1.s[1]\n"
- "fmla z26.s, z21.s, z2.s[1]\n"
- "fmla z27.s, z21.s, z3.s[1]\n"
- "fmla z28.s, z21.s, z4.s[1]\n"
- "fmla z29.s, z21.s, z5.s[1]\n"
- "fmla z30.s, z21.s, z6.s[1]\n"
- "fmla z31.s, z21.s, z7.s[1]\n"
- "fmla z24.s, z22.s, z0.s[2]\n"
- "fmla z25.s, z22.s, z1.s[2]\n"
- "fmla z26.s, z22.s, z2.s[2]\n"
- "fmla z27.s, z22.s, z3.s[2]\n"
- "fmla z28.s, z22.s, z4.s[2]\n"
- "fmla z29.s, z22.s, z5.s[2]\n"
- "fmla z30.s, z22.s, z6.s[2]\n"
- "fmla z31.s, z22.s, z7.s[2]\n"
- "fmla z24.s, z23.s, z0.s[3]\n"
- "fmla z25.s, z23.s, z1.s[3]\n"
- "fmla z26.s, z23.s, z2.s[3]\n"
- "fmla z27.s, z23.s, z3.s[3]\n"
- "fmla z28.s, z23.s, z4.s[3]\n"
- "fmla z29.s, z23.s, z5.s[3]\n"
- "fmla z30.s, z23.s, z6.s[3]\n"
- "fmla z31.s, z23.s, z7.s[3]\n"
- "5:\n"
- "ld1rw z22.s, p7/z, [%[minptr]]\n"
- "ld1rw z23.s, p7/z, [%[maxptr]]\n"
- "fmax z24.s, p7/m, z24.s, z22.s\n"
- "fmax z25.s, p7/m, z25.s, z22.s\n"
- "fmax z26.s, p7/m, z26.s, z22.s\n"
- "fmax z27.s, p7/m, z27.s, z22.s\n"
- "fmin z24.s, p7/m, z24.s, z23.s\n"
- "fmin z25.s, p7/m, z25.s, z23.s\n"
- "fmin z26.s, p7/m, z26.s, z23.s\n"
- "fmin z27.s, p7/m, z27.s, z23.s\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "fmax z28.s, p7/m, z28.s, z22.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "fmax z29.s, p7/m, z29.s, z22.s\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "fmax z30.s, p7/m, z30.s, z22.s\n"
- "fmin z28.s, p7/m, z28.s, z23.s\n"
- "fmax z31.s, p7/m, z31.s, z22.s\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "fmin z29.s, p7/m, z29.s, z23.s\n"
- "fmin z30.s, p7/m, z30.s, z23.s\n"
- "fmin z31.s, p7/m, z31.s, z23.s\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [biasptr] "+r" (biasptr)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
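The hunk above removes the tail of the SVE smallK hybrid fp32 MLA 8x1VL kernel: eight accumulators (z24-z31) each hold one output row of one vector length, the bias is broadcast into them, each "fmla" folds one A element times one B vector into a row, and the fused activation clamps against minptr/maxptr before "st1w" writes the tile out. A minimal reference sketch of that computation follows, assuming a plain row-major layout for the B panel and illustrative parameter names (the real kernel consumes a pre-packed panel and dispatches on K so each depth gets a fully unrolled body):

#include <algorithm>

// Reference for one 8-row x 1-vector-length fp32 tile of the deleted kernel.
// 'width' stands in for the SVE vector length; the B layout here is an
// illustrative assumption, not the kernel's packed format.
static void smallk_hybrid_fp32_reference(const float *A, int lda, const float *B,
                                         float *C, int ldc, int rows, int width,
                                         int K, const float *bias,
                                         float minval, float maxval)
{
    for (int r = 0; r < rows; r++)      // up to 8 rows, like z24..z31
    {
        for (int c = 0; c < width; c++) // one vector of output columns
        {
            float acc = bias[c];        // accumulators start from the bias
            for (int k = 0; k < K; k++)
            {
                acc += A[r * lda + k] * B[k * width + c]; // the fmla chain
            }
            // fused min/max activation, matching the fmax/fmin pair
            C[r * ldc + c] = std::min(std::max(acc, minval), maxval);
        }
    }
}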
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp
deleted file mode 100644
index e50c05ba39..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include <cstdint>
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-
-class cls_sve_smallK_hybrid_s8s32_dot_8x1VL
-{
-public:
- typedef int8_t operand_type;
- typedef int32_t result_type;
-
- typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
-
- static unsigned int out_width()
- {
- return get_vector_length<int32_t>() * 1;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 4;
- }
-
- static constexpr bool supports_accumulate()
- {
- return false;
- }
-
- static constexpr bool supports_bias()
- {
- return false;
- }
-
- static constexpr bool supports_activation()
- {
- return false;
- }
-
- StdTransformsSVE<operand_type, result_type, 8, 1, 4> transforms = {};
-
- // Default to the generic kernel
- kern_type kernel=sve_smallK_hybrid_s8s32_dot_8x1VL;
-
- cls_sve_smallK_hybrid_s8s32_dot_8x1VL(const CPUInfo *)
- {
-
- }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
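The interface deleted above advertises an 8-row by one-vector-length tile with k_unroll() == 4. That unroll factor comes from SDOT, which folds four signed-byte products into each 32-bit accumulator lane, which is also why the implementation file that follows rounds the depth with K = (K + 3) / 4; and since supports_bias() and supports_activation() both return false, this s8s32 variant stores raw int32 accumulators. A per-lane sketch of what one indexed "sdot" contributes, with illustrative names:

#include <cstdint>

// One 32-bit lane of "sdot z24.s, z16.b, z0.b[i]": four s8 x s8 products
// summed into the running accumulator.
static int32_t sdot_lane(const int8_t a[4], const int8_t b[4], int32_t acc)
{
    for (int i = 0; i < 4; i++)
    {
        acc += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
    }
    return acc;
}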
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
deleted file mode 100644
index 98004e98a5..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
+++ /dev/null
@@ -1,8971 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
- const long loops_count = iceildiv(N, (int)get_vector_length<int32_t>()) - 1;
- const long ldab = lda * sizeof(int8_t);
- const long ldcb = ldc * sizeof(int32_t);
- const long odd_depth = (K % 16) ? (K % 16) : 16;
- const long last_width = N - (loops_count * get_vector_length<int32_t>());
- const long odds_count = K % 4;
- K = (K + 3) / 4;
-
- for (int y0=0; y0<M; y0+=8) {
- long loops = loops_count;
- long oob_rows = std::max(8 - (M-y0), 0);
- long odds = odds_count;
- long temp = 0;
- const int8_t *b_ptr0 = B;
- const int8_t *a_ptr0 = A + (y0 * lda);
-
- int32_t *c_ptr0 = C + (y0 * ldc);
-
- switch(K) {
- case 1:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
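For readers unpacking these generated kernels: each `sdot z24.s, z16.b, z0.b[0]` step above multiplies four consecutive signed bytes per 32-bit lane of the B-panel vector (z16) by a broadcast group of four A bytes selected by the index, accumulating into the 32-bit lanes of the accumulator. A minimal scalar model of one such step, with hypothetical names and ignoring SVE's per-128-bit-segment broadcast, might look like:

    #include <cstdint>
    #include <cstddef>

    // Hypothetical scalar reference for one SDOT step: every 32-bit
    // accumulator lane gains the dot product of four signed B bytes
    // with the same four broadcast A bytes.
    static void sdot_step_ref(int32_t *acc, const int8_t *b_panel,
                              const int8_t *a_group, size_t lanes)
    {
        for (size_t lane = 0; lane < lanes; ++lane)
        {
            for (size_t k = 0; k < 4; ++k)
            {
                acc[lane] += int32_t(b_panel[4 * lane + k]) * int32_t(a_group[k]);
            }
        }
    }

Three such steps per iteration in the case-3 kernel above advance K by twelve bytes; case 4 below adds a fourth B vector (z19) and a fourth step against `z*.b[3]`.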
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
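The `cbz %[oob_rows]` ladder at the top of every case aliases out-of-range row pointers back to row 0 (highest row first) so the kernel can always run its full eight-row body; the aliased rows simply overwrite one another in C. A hypothetical scalar illustration of that ladder, not library code:

    #include <cstdint>

    // Hypothetical sketch of the oob_rows aliasing: rows past the end
    // of the matrix are redirected to row 0, so their loads and stores
    // stay in bounds (the stores become harmless overwrites).
    static void alias_oob_rows(const int8_t *a_row[8], int32_t *c_row[8], unsigned oob_rows)
    {
        for (unsigned i = 0; i < oob_rows && i < 7; ++i)
        {
            a_row[7 - i] = a_row[0];
            c_row[7 - i] = c_row[0];
        }
    }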
- case 5:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
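Two predicates handle the ragged edges in these kernels: `whilelt p6.b, %[temp], %[odd_depth]` masks the final partial block of A bytes along K, and `whilelt p0.s, %[temp], %[last_width]` masks the final partial block of C columns at label 5. From case 5 onward the A row no longer fits a single 16-byte `ld1rqb`, so each kernel adds a second, p6-predicated load at offset `#0x10`. A small sketch of the p0-guarded tail store, under the assumption that `vl_words` is the vector length in 32-bit lanes:

    #include <cstdint>
    #include <cstddef>

    // Hypothetical model of the predicated tail store (st1w ..., p0):
    // only lanes whose index is below last_width reach memory.
    static void store_tail(int32_t *dst, const int32_t *acc,
                           size_t last_width, size_t vl_words)
    {
        for (size_t lane = 0; lane < vl_words; ++lane)
        {
            if (lane < last_width)
            {
                dst[lane] = acc[lane];
            }
        }
    }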
- case 6:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 7:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
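[Editor's note: each `case` in this switch is the same SVE smallK hybrid s8/s32 dot-product kernel unrolled for a different depth; eight A-row pointers feed eight int32 accumulators (z24-z31), and each indexed `sdot` folds four signed bytes per 32-bit lane into an accumulator lane. A scalar reference for one such step, assuming `ld1rqb` has already replicated a 16-byte quadword of A across the vector; all names below are illustrative, not library API.]

    #include <cstdint>
    #include <cstddef>

    // acc:    vl int32 lanes of one accumulator (e.g. z24)
    // b:      4*vl signed bytes of one B vector (e.g. z16)
    // a_quad: the replicated 16-byte quadword of A
    // idx:    the .b[idx] selector of the sdot instruction
    static void sdot_indexed(int32_t *acc, const int8_t *b,
                             const int8_t *a_quad, int idx, std::size_t vl)
    {
        const int8_t *a = a_quad + 4 * idx; // same 4 bytes for every lane
        for (std::size_t lane = 0; lane < vl; ++lane)
            for (int k = 0; k < 4; ++k)
                acc[lane] += int32_t(b[4 * lane + k]) * int32_t(a[k]);
    }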
- case 8:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
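[Editor's note: the `cbz %[oob_rows]` cascade at the top of each block handles row counts that are not a multiple of eight: surplus a/c pointers are aliased back to row 0, so the out-of-range lanes merely recompute row 0's result and their stores overwrite it with identical data. A minimal sketch of that setup, with hypothetical array names; the kernel expresses it with the subs/b.eq chain above.]

    // Rows 8-oob_rows .. 7 fall outside the matrix; point them at row 0 so
    // every load and store stays in bounds and the duplicate stores are benign.
    for (unsigned r = 0; r < oob_rows; ++r)
    {
        a_ptr[7 - r] = a_ptr[0];
        c_ptr[7 - r] = c_ptr[0];
    }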
- case 9:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
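[Editor's note: the two `whilelt` predicates built before label 1 gate the ragged edges: p6 masks the last partial 16-byte block of A along the depth (K) axis so the trailing `sdot`s see zeros, and p0 masks the last partial vector of int32 outputs along the width (N) axis. A simplified scalar analogue, assuming `temp` starts at zero; a_tail, a_src, c_dst, acc and vl are hypothetical names.]

    // ld1rqb z0.b, p6/z, [...]: bytes at or past odd_depth load as zero.
    for (std::size_t i = 0; i < 16; ++i)
        a_tail[i] = (i < odd_depth) ? a_src[i] : int8_t(0);

    // st1w z24.s, p0, [...]: only the first last_width lanes are stored.
    for (std::size_t lane = 0; lane < vl; ++lane)
        if (lane < last_width)
            c_dst[lane] = acc[lane];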
- case 10:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 11:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 12:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 13:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 14:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 15:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- default:
- case 16:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
- "sdot z31.s, z17.b, z7.b[1]\n"
- "sdot z24.s, z18.b, z0.b[2]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
- "sdot z26.s, z18.b, z2.b[2]\n"
- "sdot z27.s, z18.b, z3.b[2]\n"
- "sdot z28.s, z18.b, z4.b[2]\n"
- "sdot z29.s, z18.b, z5.b[2]\n"
- "sdot z30.s, z18.b, z6.b[2]\n"
- "sdot z31.s, z18.b, z7.b[2]\n"
- "sdot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "sdot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "sdot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "sdot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "sdot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "sdot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "sdot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "sdot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "sdot z24.s, z20.b, z0.b[0]\n"
- "sdot z25.s, z20.b, z1.b[0]\n"
- "sdot z26.s, z20.b, z2.b[0]\n"
- "sdot z27.s, z20.b, z3.b[0]\n"
- "sdot z28.s, z20.b, z4.b[0]\n"
- "sdot z29.s, z20.b, z5.b[0]\n"
- "sdot z30.s, z20.b, z6.b[0]\n"
- "sdot z31.s, z20.b, z7.b[0]\n"
- "sdot z24.s, z21.b, z0.b[1]\n"
- "sdot z25.s, z21.b, z1.b[1]\n"
- "sdot z26.s, z21.b, z2.b[1]\n"
- "sdot z27.s, z21.b, z3.b[1]\n"
- "sdot z28.s, z21.b, z4.b[1]\n"
- "sdot z29.s, z21.b, z5.b[1]\n"
- "sdot z30.s, z21.b, z6.b[1]\n"
- "sdot z31.s, z21.b, z7.b[1]\n"
- "sdot z24.s, z22.b, z0.b[2]\n"
- "sdot z25.s, z22.b, z1.b[2]\n"
- "sdot z26.s, z22.b, z2.b[2]\n"
- "sdot z27.s, z22.b, z3.b[2]\n"
- "sdot z28.s, z22.b, z4.b[2]\n"
- "sdot z29.s, z22.b, z5.b[2]\n"
- "sdot z30.s, z22.b, z6.b[2]\n"
- "sdot z31.s, z22.b, z7.b[2]\n"
- "sdot z24.s, z23.b, z0.b[3]\n"
- "sdot z25.s, z23.b, z1.b[3]\n"
- "sdot z26.s, z23.b, z2.b[3]\n"
- "sdot z27.s, z23.b, z3.b[3]\n"
- "sdot z28.s, z23.b, z4.b[3]\n"
- "sdot z29.s, z23.b, z5.b[3]\n"
- "sdot z30.s, z23.b, z6.b[3]\n"
- "sdot z31.s, z23.b, z7.b[3]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp
deleted file mode 100644
index 60184be043..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include <cstdint>
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-
-class cls_sve_smallK_hybrid_u8u32_dot_8x1VL
-{
-public:
- typedef uint8_t operand_type;
- typedef uint32_t result_type;
-
- typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
-
- static unsigned int out_width()
- {
- return get_vector_length<uint32_t>() * 1;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 4;
- }
-
- static constexpr bool supports_accumulate()
- {
- return false;
- }
-
- static constexpr bool supports_bias()
- {
- return false;
- }
-
- static constexpr bool supports_activation()
- {
- return false;
- }
-
- StdTransformsSVE<operand_type, result_type, 8, 1, 4> transforms = {};
-
- // Default to the generic kernel
- kern_type kernel=sve_smallK_hybrid_u8u32_dot_8x1VL;
-
- cls_sve_smallK_hybrid_u8u32_dot_8x1VL(const CPUInfo *)
- {
-
- }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp
deleted file mode 100644
index 6a8553216b..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp
+++ /dev/null
@@ -1,8971 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool) {
- const long loops_count = iceildiv(N, (int)get_vector_length<uint32_t>()) - 1;
- const long ldab = lda * sizeof(uint8_t);
- const long ldcb = ldc * sizeof(uint32_t);
- const long odd_depth = (K % 16) ? (K % 16) : 16;
- const long last_width = N - (loops_count * get_vector_length<uint32_t>());
- const long odds_count = K % 4;
- K = (K + 3) / 4;
-
- for (int y0=0; y0<M; y0+=8) {
- long loops = loops_count;
- long oob_rows = std::max(8 - (M-y0), 0);
- long odds = odds_count;
- long temp = 0;
- const uint8_t *b_ptr0 = B;
- const uint8_t *a_ptr0 = A + (y0 * lda);
-
- uint32_t *c_ptr0 = C + (y0 * ldc);
-
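- // K now counts 4-byte dot-product blocks ((K+3)/4 above); each case is a
- // fully unrolled variant of the inner loop for that block count.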
- switch(K) {
- case 1:
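- // Local label layout shared by every case: 1: after the oob fix-up; 4:
- // main loop over full-width column tiles; 3: peeled final full iteration;
- // 2: single-tile path when loops == 0; 5: predicated store of the last,
- // possibly partial, column tile.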
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 5:
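- // From five blocks onwards each row spans more than 16 bytes of K, so the
- // A operands are loaded as two 16-byte ld1rqb chunks per row.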
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 6:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
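Each of the deleted case blocks is one row-height variant of the same SVE micro-kernel: unsigned 8-bit inputs are accumulated into 32-bit lanes with UDOT, which dots four consecutive bytes of a B-panel vector against four bytes taken from a quadword lane of the A row (loaded with ld1rq, so that quadword is replicated across the whole vector). A minimal ACLE-intrinsics sketch of that recurring step, assuming a compiler with arm_sve.h; the names acc, panelB and rowA are illustrative, not from this patch:

#include <arm_sve.h>

// Meaning of one "udot z24.s, z16.b, z0.b[0]" from the deleted asm:
// every 32-bit lane of acc gains the dot product of four u8 values of
// panelB with the four u8 values in byte group 0 of rowA's quadword.
static inline svuint32_t dot_step(svuint32_t acc, svuint8_t panelB, svuint8_t rowA)
{
    return svdot_lane_u32(acc, panelB, rowA, 0);
}

The z0.b[1] .. z0.b[3] forms map to lane indices 1..3, and the #0x10 offsets on the ld1rqb loads step to the next quadword of the A row.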
- case 7:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
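The whilelt pair set up at label 1 of every variant is what makes the ragged edges safe: p6 masks the ld1rqb loads of the final partial quadword of K (inactive bytes load as zero under /z, so they add nothing to the dot products), and p0 masks the closing st1w so only the output columns that really exist are written. A sketch under the same assumptions (depth_left and cols_left mirror the %[odd_depth] and %[last_width] operands; the names are illustrative):

#include <arm_sve.h>
#include <stdint.h>

// Build the two tail predicates used by the deleted kernels.
void make_tail_predicates(int64_t depth_left, int64_t cols_left,
                          svbool_t *p_k_bytes, svbool_t *p_n_words)
{
    *p_k_bytes = svwhilelt_b8((int64_t)0, depth_left);  // "whilelt p6.b, %[temp], %[odd_depth]"
    *p_n_words = svwhilelt_b32((int64_t)0, cols_left);  // "whilelt p0.s, %[temp], %[last_width]"
}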
- case 8:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
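Ahead of that, the cbz %[oob_rows] ladder redirects any row pointers that fall past the end of the block back to row 0 (the add c_ptrN, %[c_ptr0], #0x0 / add a_ptrN, %[a_ptr0], #0x0 pairs). Those rows then read the same A data, compute identical accumulators, and store row 0's result over itself, so the fixed 8-row shape never touches memory past the matrix and the inner loop needs no per-row bounds checks. The equivalent fix-up in plain C++ (valid_rows, aRow and cRow are illustrative names):

#include <cstdint>

// Alias out-of-range row pointers to row 0: surplus rows recompute row 0
// and their stores harmlessly rewrite it instead of overrunning C.
static void alias_oob_rows(const uint8_t *aRow[8], uint32_t *cRow[8], int valid_rows)
{
    for (int r = valid_rows; r < 8; ++r)
    {
        aRow[r] = aRow[0];
        cRow[r] = cRow[0];
    }
}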
- case 9:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
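[Editor's note on the block above: each `case` in this deleted file is one row-count specialization of a generated SVE u8→u32 GEMM micro-kernel. The core pattern is "udot z.s, zB.b, zA.b[i]": a 4-way unsigned 8-bit dot product accumulated into each 32-bit lane, with "ld1rqb" replicating one 16-byte quadword of A across every 128-bit vector segment. The sketch below is a minimal single-row illustration of that pattern using ACLE intrinsics; the helper name dot_row_u8u32 and the B-packing layout (groups of four u8 input channels interleaved across output lanes, depth a multiple of 16) are assumptions for illustration, not the library's API.]

#include <arm_sve.h>
#include <stdint.h>

/* Hypothetical helper, not part of the library: one row of C against one
 * packed column panel of B, mirroring the udot/ld1rqb pattern above.
 * Assumes K is a multiple of 16 and B is pre-packed as described. */
void dot_row_u8u32(const uint8_t *a, const uint8_t *b, uint32_t *c, int K)
{
    svbool_t pb = svptrue_b8();
    svuint32_t acc = svdup_n_u32(0);                 /* mov z24.s, #0 */
    for (int k = 0; k < K; k += 16) {
        svuint8_t a_q = svld1rq_u8(pb, a + k);       /* ld1rqb: replicate quadword */
        svuint8_t b0 = svld1_u8(pb, b); b += svcntb();
        svuint8_t b1 = svld1_u8(pb, b); b += svcntb();
        svuint8_t b2 = svld1_u8(pb, b); b += svcntb();
        svuint8_t b3 = svld1_u8(pb, b); b += svcntb();
        acc = svdot_lane_u32(acc, b0, a_q, 0);       /* udot z.s, zB.b, zA.b[0] */
        acc = svdot_lane_u32(acc, b1, a_q, 1);
        acc = svdot_lane_u32(acc, b2, a_q, 2);
        acc = svdot_lane_u32(acc, b3, a_q, 3);
    }
    svst1_u32(svptrue_b32(), c, acc);                /* st1w */
}

The real kernel unrolls this over eight A rows (z24..z31 accumulators, a_ptr0..a_ptr7) to amortize each B load across all rows.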
- case 10:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
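[Editor's note: the prologue of each case builds three predicates: "ptrue p7.b" (all lanes, full-depth loads), "whilelt p6.b, temp, odd_depth" (partial-depth u8 tail loads), and "whilelt p0.s, temp, last_width" (partial-width u32 stores). The sketch below illustrates only the tail predication, under the same hedges as before; tail_predicates is a hypothetical name and the snippet is not the library's API.]

#include <arm_sve.h>
#include <stdint.h>

/* Illustrative only: how whilelt-built predicates guard the ragged edges. */
void tail_predicates(const uint8_t *a, uint32_t *c,
                     svuint32_t acc, int odd_depth, int last_width)
{
    svbool_t p6 = svwhilelt_b8_s32(0, odd_depth);    /* byte lanes [0, odd_depth) active */
    svuint8_t a_tail = svld1_u8(p6, a);              /* inactive lanes load as zero */
    (void)a_tail;                                    /* would feed the final udot group */
    svbool_t p0 = svwhilelt_b32_s32(0, last_width);  /* word lanes [0, last_width) active */
    svst1_u32(p0, c, acc);                           /* st1w z, p0, [c]: partial store */
}

Zeroing loads under p6 make the tail safe: out-of-range bytes contribute 0 to the dot products, so no scalar remainder loop is needed.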
- case 11:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
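[Editor's note: the "cbz %[oob_rows], 1f" prologue repeated in every case clamps row pointers when fewer than eight valid rows remain: each out-of-bounds a_ptrN/c_ptrN is aliased back to row 0, so the fixed 8-row kernel never touches memory past the matrix (the duplicate rows just recompute and re-store row 0's result). A scalar sketch of that idea, with the hypothetical helper name clamp_row_ptrs and strides given in elements rather than the bytes the assembly uses:]

#include <stdint.h>

/* Illustrative row clamping, mirroring the oob_rows prologue above. */
static void clamp_row_ptrs(const uint8_t *a0, uint32_t *c0,
                           const uint8_t *a[8], uint32_t *c[8],
                           long lda, long ldc, int valid_rows)
{
    for (int r = 0; r < 8; ++r) {
        int ok = r < valid_rows;
        a[r] = ok ? a0 + (long)r * lda : a0;   /* a_ptrN = a_ptr0 for oob rows */
        c[r] = ok ? c0 + (long)r * ldc : c0;   /* c_ptrN = c_ptr0 for oob rows */
    }
}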
- case 12:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 13:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 14:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 15:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- default:
- case 16:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "cbz %[oob_rows], 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr7, %[c_ptr0], #0x0\n"
- "add a_ptr7, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr6, %[c_ptr0], #0x0\n"
- "add a_ptr6, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr5, %[c_ptr0], #0x0\n"
- "add a_ptr5, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr4, %[c_ptr0], #0x0\n"
- "add a_ptr4, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr3, %[c_ptr0], #0x0\n"
- "add a_ptr3, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr2, %[c_ptr0], #0x0\n"
- "add a_ptr2, %[a_ptr0], #0x0\n"
- "b.eq 1f\n"
- "subs %[oob_rows], %[oob_rows], #0x1\n"
- "add c_ptr1, %[c_ptr0], #0x0\n"
- "add a_ptr1, %[a_ptr0], #0x0\n"
- "1:\n"
- "ptrue p7.b\n"
- "whilelt p6.b, %[temp], %[odd_depth]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z24.s, #0\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "mov z25.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "mov z26.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "mov z27.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "mov z28.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "mov z29.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "b 5f\n"
- "2:\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z27.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z28.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "mov z29.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "mov z30.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "mov z31.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "udot z30.s, z16.b, z6.b[0]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
- "udot z31.s, z17.b, z7.b[1]\n"
- "udot z24.s, z18.b, z0.b[2]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
- "udot z26.s, z18.b, z2.b[2]\n"
- "udot z27.s, z18.b, z3.b[2]\n"
- "udot z28.s, z18.b, z4.b[2]\n"
- "udot z29.s, z18.b, z5.b[2]\n"
- "udot z30.s, z18.b, z6.b[2]\n"
- "udot z31.s, z18.b, z7.b[2]\n"
- "udot z24.s, z19.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
- "udot z25.s, z19.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
- "udot z26.s, z19.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
- "udot z27.s, z19.b, z3.b[3]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
- "udot z28.s, z19.b, z4.b[3]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
- "udot z29.s, z19.b, z5.b[3]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
- "udot z30.s, z19.b, z6.b[3]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
- "udot z31.s, z19.b, z7.b[3]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
- "udot z24.s, z20.b, z0.b[0]\n"
- "udot z25.s, z20.b, z1.b[0]\n"
- "udot z26.s, z20.b, z2.b[0]\n"
- "udot z27.s, z20.b, z3.b[0]\n"
- "udot z28.s, z20.b, z4.b[0]\n"
- "udot z29.s, z20.b, z5.b[0]\n"
- "udot z30.s, z20.b, z6.b[0]\n"
- "udot z31.s, z20.b, z7.b[0]\n"
- "udot z24.s, z21.b, z0.b[1]\n"
- "udot z25.s, z21.b, z1.b[1]\n"
- "udot z26.s, z21.b, z2.b[1]\n"
- "udot z27.s, z21.b, z3.b[1]\n"
- "udot z28.s, z21.b, z4.b[1]\n"
- "udot z29.s, z21.b, z5.b[1]\n"
- "udot z30.s, z21.b, z6.b[1]\n"
- "udot z31.s, z21.b, z7.b[1]\n"
- "udot z24.s, z22.b, z0.b[2]\n"
- "udot z25.s, z22.b, z1.b[2]\n"
- "udot z26.s, z22.b, z2.b[2]\n"
- "udot z27.s, z22.b, z3.b[2]\n"
- "udot z28.s, z22.b, z4.b[2]\n"
- "udot z29.s, z22.b, z5.b[2]\n"
- "udot z30.s, z22.b, z6.b[2]\n"
- "udot z31.s, z22.b, z7.b[2]\n"
- "udot z24.s, z23.b, z0.b[3]\n"
- "udot z25.s, z23.b, z1.b[3]\n"
- "udot z26.s, z23.b, z2.b[3]\n"
- "udot z27.s, z23.b, z3.b[3]\n"
- "udot z28.s, z23.b, z4.b[3]\n"
- "udot z29.s, z23.b, z5.b[3]\n"
- "udot z30.s, z23.b, z6.b[3]\n"
- "udot z31.s, z23.b, z7.b[3]\n"
- "5:\n"
- "st1w z24.s, p0, [%[c_ptr0]]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "st1w z25.s, p0, [c_ptr1]\n"
- "st1w z26.s, p0, [c_ptr2]\n"
- "st1w z27.s, p0, [c_ptr3]\n"
- "st1w z28.s, p0, [c_ptr4]\n"
- "st1w z29.s, p0, [c_ptr5]\n"
- "st1w z30.s, p0, [c_ptr6]\n"
- "st1w z31.s, p0, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds)
- : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
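
The file removed above is a hand-scheduled 8-row SVE micro-kernel: each "ld1rqb" broadcasts a 128-bit quadword of the A operand, and each "udot z24.s, z16.b, z0.b[0]" accumulates a 4-way u8 dot product of a B vector against one 4-byte lane of that quadword. A minimal single-row sketch of the same pattern in ACLE SVE intrinsics (an illustration under the assumption of an SVE-capable toolchain, e.g. -march=armv8.2-a+sve; the p6 tail predication and the 8-row unrolling of the real kernel are omitted):

    #include <arm_sve.h>
    #include <stdint.h>

    // One row, one accumulator: c[0 .. VL/4) += dot4(a[0 .. 16), B panel).
    void udot_row(uint32_t *c, const uint8_t *a, const uint8_t *b)
    {
        svbool_t   pb  = svptrue_b8();
        svuint8_t  a0  = svld1rq_u8(pb, a);           // ld1rqb: replicate 16 bytes of A
        svuint32_t acc = svld1_u32(svptrue_b32(), c); // running accumulator
        svuint8_t  b0  = svld1_u8(pb, b + 0 * svcntb());
        svuint8_t  b1  = svld1_u8(pb, b + 1 * svcntb());
        svuint8_t  b2  = svld1_u8(pb, b + 2 * svcntb());
        svuint8_t  b3  = svld1_u8(pb, b + 3 * svcntb());
        acc = svdot_lane_u32(acc, b0, a0, 0);         // udot acc.s, b0.b, a0.b[0]
        acc = svdot_lane_u32(acc, b1, a0, 1);
        acc = svdot_lane_u32(acc, b2, a0, 2);
        acc = svdot_lane_u32(acc, b3, a0, 3);
        svst1_u32(svptrue_b32(), c, acc);
    }

The removed assembly interleaves eight such accumulator chains (z24-z31) with the next round of ld1rqb/ld1b loads, so the loads hide behind the udot latency.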
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp b/src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp
new file mode 100644
index 0000000000..a7525e5ec1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Some of the merges need these headers, but the merges themselves are all
+ * included inside the arm_gemm namespace, so pull the headers in here at
+ * global scope first. */
+#include <algorithm>
+
+#include <arm_neon.h>
+
+#include "arm_gemm.hpp"
+#include "asmlib.hpp"
+#include "utils.hpp"
+
+#include "mergeresults.hpp"
+
+namespace arm_gemm {
+
+#include "merges/list-fp16.hpp"
+
+} // namespace arm_gemm
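
The include order in this new translation unit is deliberate: merges/list-fp16.hpp is textually included inside the arm_gemm namespace, so any system header it depends on must already be visible at global scope. A hypothetical toy (the namespace and function names are mine, not the library's) showing the failure mode the comment guards against:

    #include <arm_neon.h>   // global scope: float32x4_t is ::float32x4_t

    namespace demo {
    // If <arm_neon.h> were first included from *inside* the namespace,
    // its types and intrinsics would be declared as demo::float32x4_t
    // and so on, breaking any code that expects the global declarations.
    inline float32x4_t zeros() { return vdupq_n_f32(0.0f); }
    } // namespace demo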
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp b/src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp
new file mode 100644
index 0000000000..d3665534a5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Some of the merges need these headers, but the merges themselves are all
+ * included inside the arm_gemm namespace, so pull the headers in here at
+ * global scope first. */
+#include <algorithm>
+
+#include <arm_neon.h>
+
+#include "arm_gemm.hpp"
+#include "asmlib.hpp"
+#include "bfloat.hpp"
+#include "utils.hpp"
+
+#include "mergeresults.hpp"
+
+namespace arm_gemm {
+
+#include "merges/list-sve.hpp"
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults.cpp b/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
index 17566db375..e100d9fe46 100644
--- a/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018, 2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,21 +25,25 @@
/* Some of the merges need these headers, but the merges themselves are all
 * included inside the arm_gemm namespace, so pull the headers in here at
 * global scope first. */
#include <algorithm>
-#include <limits>
#include <arm_neon.h>
#include "arm_gemm.hpp"
#include "asmlib.hpp"
+#include "bfloat.hpp"
#include "utils.hpp"
namespace arm_gemm {
template<unsigned int twidth, unsigned int height, bool sve=false, typename Tin, typename Tout>
void MergeResults(Tout * out, const Tin * in, int ldc, int y0, int ymax, int x0, int xmax, const Tout *bias, Activation act, bool append) {
+ // NOTE: The following code is disabled to avoid calling get_vector_length(), so templated MergeResults will not
+ // be correct for SVE cases. This is OK as we have specialisations for all needed SVE cases anyway.
+ //
// For SVE cases, multiply the width up by the vector length.
// Use the *input* type to determine this, since this will be what the kernel operated on.
- const int width = twidth * (sve ? get_vector_length<Tin>() : 1);
+ // const int width = twidth * (sve ? get_vector_length<Tin>() : 1);
+ const int width = twidth;
const int full_y_blocks = (ymax - y0) / height;
const int y_remainder = (ymax - y0) % height;
@@ -111,4 +115,8 @@ template void MergeResults<12u, 8u, false, float, __fp16>(__fp16*, float const*,
template void MergeResults<8u, 6u, false, float, __fp16>(__fp16*, float const*, int, int, int, int, int, __fp16 const*, Activation, bool);
#endif
+#if defined(__arm__) && defined(ARM_COMPUTE_ENABLE_BF16)
+template void MergeResults<8u, 6u, false, float, bfloat16>(bfloat16*, float const*, int, int, int, int, int, bfloat16 const*, Activation, bool);
+#endif
+
} // namespace arm_gemm
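
To make the disabled scaling concrete: for SVE kernels the template parameter twidth counts vectors rather than elements, so the element width used to be twidth scaled by the runtime vector length. A hedged sketch of that arithmetic (get_vector_length is the library helper the NOTE refers to; the numbers are illustrative):

    // On a 512-bit SVE machine, get_vector_length<float>() == 16, so an
    // SVE merge with twidth == 3 covered 3 * 16 = 48 columns per block.
    // With the scaling disabled, the generic path returns just twidth,
    // which is only correct for fixed-width (NEON) kernels; SVE shapes
    // are expected to resolve to an explicit specialisation instead.
    template <unsigned twidth, bool sve>
    int merge_width(int vector_elems /* get_vector_length<Tin>() in the real code */)
    {
        return static_cast<int>(twidth) * (sve ? vector_elems : 1);
    }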
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
index bea455ca67..989bb17dfb 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -274,7 +274,13 @@ void MergeResults<8, 6, false>(float *out, const float *in, const int ldout, con
"VMIN.f32 q6, q6, %q[maxv]\n"
"VMIN.f32 q7, q7, %q[maxv]\n"
"VST1.32 {d12-d15}, [%[outptr3]]!\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+ [inptr] "+r" (inptr)
+ : [minv] "w" (minv), [maxv] "w" (maxv), [biasptr] "r" (biasptr)
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory"
+ );
+ __asm __volatile (
// Rows 4-5
"VLD1.32 {d8-d11}, [%[inptr]]!\n"
"VLD1.32 {d12-d15}, [%[inptr]]!\n"
@@ -296,7 +302,7 @@ void MergeResults<8, 6, false>(float *out, const float *in, const int ldout, con
"VMIN.f32 q6, q6, %q[maxv]\n"
"VMIN.f32 q7, q7, %q[maxv]\n"
"VST1.32 {d12-d15}, [%[outptr5]]!\n"
- : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+ : [outptr3] "+r" (outptr3),
[outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [inptr] "+r" (inptr)
: [minv] "w" (minv), [maxv] "w" (maxv), [biasptr] "r" (biasptr)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory"
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
index a81d4504ae..ba47e0aa54 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(ARM_COMPUTE_ENABLE_FP16))
template<>
void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const __fp16 *bias, Activation act, bool append)
@@ -86,7 +86,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -140,7 +140,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -217,7 +217,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -317,7 +317,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -439,7 +439,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -584,7 +584,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -752,7 +752,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -944,7 +944,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -1150,7 +1150,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -1204,7 +1204,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -1278,7 +1278,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -1372,7 +1372,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -1485,7 +1485,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -1618,7 +1618,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -1771,7 +1771,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -1945,7 +1945,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifndef ARM_COMPUTE_ENABLE_FP16
".arch armv8.2-a+fp16\n"
#endif
"dup v0.8h, %[maxval].h[0]\n"
@@ -2112,4 +2112,4 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
}
}
-#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // __aarch64__ && (FP16_KERNELS || ARM_COMPUTE_ENABLE_FP16)
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_bf16_8x12.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_bf16_8x12.hpp
new file mode 100644
index 0000000000..a57a855e31
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_bf16_8x12.hpp
@@ -0,0 +1,2809 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+template<>
+void MergeResults<12, 8, false>(
+ bfloat16 *out_ptr,
+ const float * in_ptr,
+ const int ldout,
+ const int y0, const int ymax,
+ const int x0, const int xmax,
+ const bfloat16 *bias,
+ Activation act,
+ bool accumulate)
+{
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ minval = 0;
+ break;
+ }
+
+ size_t rows = ymax-y0;
+ size_t cols = xmax-x0;
+
+ out_ptr += (y0 * ldout) + x0;
+ bias = (bias == nullptr) ? nullptr : bias + x0;
+
+ __asm__ __volatile__(
+ "cbz %x[cols], 108f\n"
+ "cbz %x[rows], 108f\n"
+ "mov x11, #0x20\n"
+ "dup v13.4s, %w[maxval]\n"
+ "dup v12.4s, %w[minval]\n"
+ "mul x11, %x[ldout], x11\n"
+ "cbnz %x[accumulate], 66f\n"
+ "1:" // Initial: Row loop
+ "cmp %x[rows], #0x7\n"
+ "bgt 58f\n"
+ "beq 50f\n"
+ "cmp %x[rows], #0x5\n"
+ "bgt 42f\n"
+ "beq 34f\n"
+ "cmp %x[rows], #0x3\n"
+ "bgt 26f\n"
+ "beq 18f\n"
+ "cmp %x[rows], #0x1\n"
+ "bgt 10f\n"
+ "2:" // Initial: Height 1
+ "mov x10, %x[cols]\n"
+ "mov x9, %x[out_ptr]\n"
+ "mov x28, %x[bias]\n"
+ "cmp x10, #0xc\n"
+ "blt 6f\n"
+ "3:" // Initial: Height 1: Block loop
+ "cbnz %x[bias], 4f\n"
+ "movi v21.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "b 5f\n"
+ "4:" // Initial: Height 1: Width 3: bias
+ "ldr d18, [x28, #0x0]\n"
+ "ldr d17, [x28, #0x8]\n"
+ "ldr d16, [x28, #0x10]\n"
+ "shll v21.4s, v18.4h, #0x10\n"
+ "shll v20.4s, v17.4h, #0x10\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "5:" // Initial: Height 1: Width 3: init done
+ "ldr q18, [%x[in_ptr], #0x0]\n"
+ "ldr q17, [%x[in_ptr], #0x10]\n"
+ "sub x10, x10, #0xc\n"
+ "add x28, x28, #0x18\n"
+ "ldr q16, [%x[in_ptr], #0x20]\n"
+ "cmp x10, #0xc\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fadd v18.4s, v18.4s, v21.4s\n"
+ "fadd v17.4s, v17.4s, v20.4s\n"
+ "fadd v16.4s, v16.4s, v19.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d18, [x9, #0x0]\n"
+ "str d17, [x9, #0x8]\n"
+ "str d16, [x9, #0x10]\n"
+ "add x9, x9, #0x18\n"
+ "bge 3b\n"
+ "6:" // Initial: Height 1: no full blocks
+ "cbz x10, 9f\n"
+ "mov x20, %x[in_ptr]\n"
+ "7:" // Initial: Height 1: Single loop
+ "movi v17.16b, #0x0\n"
+ "cbz %x[bias], 8f\n"
+ "ldr h16, [x28, #0x0]\n"
+ "shll v17.4s, v16.4h, #0x10\n"
+ "8:" // Initial: Height 1: Scalar: no bias
+ "ldr s16, [%x[in_ptr], #0x0]\n"
+ "subs x10, x10, #0x1\n"
+ "add x28, x28, #0x2\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "fadd v16.4s, v16.4s, v17.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str h16, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ "bne 7b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "9:" // Initial: Height 1: no oddments
+ "b 108f\n"
+ "10:" // Initial: Height 2
+ "mov x10, %x[cols]\n"
+ "mov x9, %x[out_ptr]\n"
+ "mov x28, %x[bias]\n"
+ "cmp x10, #0xc\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "blt 14f\n"
+ "11:" // Initial: Height 2: Block loop
+ "cbnz %x[bias], 12f\n"
+ "movi v24.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "b 13f\n"
+ "12:" // Initial: Height 2: Width 3: bias
+ "ldr d18, [x28, #0x0]\n"
+ "ldr d17, [x28, #0x8]\n"
+ "ldr d16, [x28, #0x10]\n"
+ "shll v24.4s, v18.4h, #0x10\n"
+ "shll v23.4s, v17.4h, #0x10\n"
+ "shll v22.4s, v16.4h, #0x10\n"
+ "13:" // Initial: Height 2: Width 3: init done
+ "ldr q16, [%x[in_ptr], #0x0]\n"
+ "ldr q20, [%x[in_ptr], #0x10]\n"
+ "sub x10, x10, #0xc\n"
+ "add x28, x28, #0x18\n"
+ "ldr q19, [%x[in_ptr], #0x20]\n"
+ "ldr q18, [%x[in_ptr], #0x30]\n"
+ "cmp x10, #0xc\n"
+ "ldr q17, [%x[in_ptr], #0x40]\n"
+ "ldr q21, [%x[in_ptr], #0x50]\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fadd v16.4s, v16.4s, v24.4s\n"
+ "fadd v20.4s, v20.4s, v23.4s\n"
+ "fadd v19.4s, v19.4s, v22.4s\n"
+ "fadd v18.4s, v18.4s, v24.4s\n"
+ "fadd v17.4s, v17.4s, v23.4s\n"
+ "fadd v21.4s, v21.4s, v22.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "str d16, [x9, #0x0]\n"
+ ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n"
+ "str d20, [x9, #0x8]\n"
+ "str d19, [x9, #0x10]\n"
+ "add x9, x9, #0x18\n"
+ "str d18, [x27, #0x0]\n"
+ "str d17, [x27, #0x8]\n"
+ "str d16, [x27, #0x10]\n"
+ "add x27, x27, #0x18\n"
+ "bge 11b\n"
+ "14:" // Initial: Height 2: no full blocks
+ "cbz x10, 17f\n"
+ "mov x20, %x[in_ptr]\n"
+ "15:" // Initial: Height 2: Single loop
+ "movi v18.16b, #0x0\n"
+ "cbz %x[bias], 16f\n"
+ "ldr h16, [x28, #0x0]\n"
+ "shll v18.4s, v16.4h, #0x10\n"
+ "16:" // Initial: Height 2: Scalar: no bias
+ "ldr s17, [%x[in_ptr], #0x0]\n"
+ "ldr s16, [%x[in_ptr], #0x30]\n"
+ "subs x10, x10, #0x1\n"
+ "add x28, x28, #0x2\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "fadd v17.4s, v17.4s, v18.4s\n"
+ "fadd v16.4s, v16.4s, v18.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str h17, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ "str h16, [x27, #0x0]\n"
+ "add x27, x27, #0x2\n"
+ "bne 15b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "17:" // Initial: Height 2: no oddments
+ "b 108f\n"
+ "18:" // Initial: Height 3
+ "mov x10, %x[cols]\n"
+ "mov x9, %x[out_ptr]\n"
+ "mov x28, %x[bias]\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "cmp x10, #0xc\n"
+ "blt 22f\n"
+ "19:" // Initial: Height 3: Block loop
+ "cbnz %x[bias], 20f\n"
+ "movi v27.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "b 21f\n"
+ "20:" // Initial: Height 3: Width 3: bias
+ "ldr d18, [x28, #0x0]\n"
+ "ldr d17, [x28, #0x8]\n"
+ "ldr d16, [x28, #0x10]\n"
+ "shll v27.4s, v18.4h, #0x10\n"
+ "shll v26.4s, v17.4h, #0x10\n"
+ "shll v25.4s, v16.4h, #0x10\n"
+ "21:" // Initial: Height 3: Width 3: init done
+ "ldr q18, [%x[in_ptr], #0x0]\n"
+ "ldr q17, [%x[in_ptr], #0x10]\n"
+ "sub x10, x10, #0xc\n"
+ "add x28, x28, #0x18\n"
+ "ldr q16, [%x[in_ptr], #0x20]\n"
+ "ldr q21, [%x[in_ptr], #0x30]\n"
+ "cmp x10, #0xc\n"
+ "ldr q20, [%x[in_ptr], #0x40]\n"
+ "ldr q19, [%x[in_ptr], #0x50]\n"
+ "ldr q24, [%x[in_ptr], #0x60]\n"
+ "ldr q23, [%x[in_ptr], #0x70]\n"
+ "fadd v18.4s, v18.4s, v27.4s\n"
+ "fadd v17.4s, v17.4s, v26.4s\n"
+ "ldr q22, [%x[in_ptr], #0x80]\n"
+ "fadd v16.4s, v16.4s, v25.4s\n"
+ "fadd v21.4s, v21.4s, v27.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fadd v20.4s, v20.4s, v26.4s\n"
+ "fadd v19.4s, v19.4s, v25.4s\n"
+ "fadd v24.4s, v24.4s, v27.4s\n"
+ "fadd v23.4s, v23.4s, v26.4s\n"
+ "fadd v22.4s, v22.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ "str d18, [x9, #0x0]\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea16b12 // bfcvtn v18.4h, v24.4s\n"
+ "str d17, [x9, #0x8]\n"
+ "str d16, [x9, #0x10]\n"
+ ".inst 0x0ea16af1 // bfcvtn v17.4h, v23.4s\n"
+ ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n"
+ "add x9, x9, #0x18\n"
+ "str d21, [x27, #0x0]\n"
+ "str d20, [x27, #0x8]\n"
+ "str d19, [x27, #0x10]\n"
+ "add x27, x27, #0x18\n"
+ "str d18, [x26, #0x0]\n"
+ "str d17, [x26, #0x8]\n"
+ "str d16, [x26, #0x10]\n"
+ "add x26, x26, #0x18\n"
+ "bge 19b\n"
+ "22:" // Initial: Height 3: no full blocks
+ "cbz x10, 25f\n"
+ "mov x20, %x[in_ptr]\n"
+ "23:" // Initial: Height 3: Single loop
+ "movi v19.16b, #0x0\n"
+ "cbz %x[bias], 24f\n"
+ "ldr h16, [x28, #0x0]\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "24:" // Initial: Height 3: Scalar: no bias
+ "ldr s16, [%x[in_ptr], #0x0]\n"
+ "ldr s17, [%x[in_ptr], #0x30]\n"
+ "subs x10, x10, #0x1\n"
+ "add x28, x28, #0x2\n"
+ "ldr s18, [%x[in_ptr], #0x60]\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "fadd v16.4s, v16.4s, v19.4s\n"
+ "fadd v17.4s, v17.4s, v19.4s\n"
+ "fadd v18.4s, v18.4s, v19.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "str h16, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ ".inst 0x0ea16a50 // bfcvtn v16.4h, v18.4s\n"
+ "str h17, [x27, #0x0]\n"
+ "add x27, x27, #0x2\n"
+ "str h16, [x26, #0x0]\n"
+ "add x26, x26, #0x2\n"
+ "bne 23b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "25:" // Initial: Height 3: no oddments
+ "b 108f\n"
+ "26:" // Initial: Height 4
+ "mov x9, %x[out_ptr]\n"
+ "mov x10, %x[cols]\n"
+ "mov x28, %x[bias]\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "cmp x10, #0xc\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "blt 30f\n"
+ "27:" // Initial: Height 4: Block loop
+ "cbnz %x[bias], 28f\n"
+ "movi v30.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "b 29f\n"
+ "28:" // Initial: Height 4: Width 3: bias
+ "ldr d18, [x28, #0x0]\n"
+ "ldr d17, [x28, #0x8]\n"
+ "ldr d16, [x28, #0x10]\n"
+ "shll v30.4s, v18.4h, #0x10\n"
+ "shll v29.4s, v17.4h, #0x10\n"
+ "shll v28.4s, v16.4h, #0x10\n"
+ "29:" // Initial: Height 4: Width 3: init done
+ "ldr q19, [%x[in_ptr], #0x0]\n"
+ "ldr q18, [%x[in_ptr], #0x10]\n"
+ "sub x10, x10, #0xc\n"
+ "add x28, x28, #0x18\n"
+ "ldr q17, [%x[in_ptr], #0x20]\n"
+ "ldr q16, [%x[in_ptr], #0x30]\n"
+ "cmp x10, #0xc\n"
+ "ldr q23, [%x[in_ptr], #0x40]\n"
+ "ldr q22, [%x[in_ptr], #0x50]\n"
+ "ldr q21, [%x[in_ptr], #0x60]\n"
+ "ldr q20, [%x[in_ptr], #0x70]\n"
+ "fadd v19.4s, v19.4s, v30.4s\n"
+ "fadd v18.4s, v18.4s, v29.4s\n"
+ "ldr q27, [%x[in_ptr], #0x80]\n"
+ "ldr q26, [%x[in_ptr], #0x90]\n"
+ "fadd v17.4s, v17.4s, v28.4s\n"
+ "fadd v16.4s, v16.4s, v30.4s\n"
+ "ldr q25, [%x[in_ptr], #0xa0]\n"
+ "ldr q24, [%x[in_ptr], #0xb0]\n"
+ "fadd v23.4s, v23.4s, v29.4s\n"
+ "fadd v22.4s, v22.4s, v28.4s\n"
+ "fadd v21.4s, v21.4s, v30.4s\n"
+ "fadd v20.4s, v20.4s, v29.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fadd v27.4s, v27.4s, v28.4s\n"
+ "fadd v26.4s, v26.4s, v30.4s\n"
+ "fadd v25.4s, v25.4s, v29.4s\n"
+ "fadd v24.4s, v24.4s, v28.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmin v27.4s, v27.4s, v13.4s\n"
+ "fmin v26.4s, v26.4s, v13.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
+ ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
+ "str d19, [x9, #0x0]\n"
+ "str d18, [x9, #0x8]\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ "str d17, [x9, #0x10]\n"
+ ".inst 0x0ea16b73 // bfcvtn v19.4h, v27.4s\n"
+ ".inst 0x0ea16b52 // bfcvtn v18.4h, v26.4s\n"
+ "add x9, x9, #0x18\n"
+ "str d16, [x27, #0x0]\n"
+ ".inst 0x0ea16b31 // bfcvtn v17.4h, v25.4s\n"
+ ".inst 0x0ea16b10 // bfcvtn v16.4h, v24.4s\n"
+ "str d23, [x27, #0x8]\n"
+ "str d22, [x27, #0x10]\n"
+ "add x27, x27, #0x18\n"
+ "str d21, [x26, #0x0]\n"
+ "str d20, [x26, #0x8]\n"
+ "str d19, [x26, #0x10]\n"
+ "add x26, x26, #0x18\n"
+ "str d18, [x25, #0x0]\n"
+ "str d17, [x25, #0x8]\n"
+ "str d16, [x25, #0x10]\n"
+ "add x25, x25, #0x18\n"
+ "bge 27b\n"
+ "30:" // Initial: Height 4: no full blocks
+ "cbz x10, 33f\n"
+ "mov x20, %x[in_ptr]\n"
+ "31:" // Initial: Height 4: Single loop
+ "movi v20.16b, #0x0\n"
+ "cbz %x[bias], 32f\n"
+ "ldr h16, [x28, #0x0]\n"
+ "shll v20.4s, v16.4h, #0x10\n"
+ "32:" // Initial: Height 4: Scalar: no bias
+ "ldr s16, [%x[in_ptr], #0x0]\n"
+ "ldr s18, [%x[in_ptr], #0x30]\n"
+ "subs x10, x10, #0x1\n"
+ "add x28, x28, #0x2\n"
+ "ldr s17, [%x[in_ptr], #0x60]\n"
+ "ldr s19, [%x[in_ptr], #0x90]\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "fadd v16.4s, v16.4s, v20.4s\n"
+ "fadd v18.4s, v18.4s, v20.4s\n"
+ "fadd v17.4s, v17.4s, v20.4s\n"
+ "fadd v19.4s, v19.4s, v20.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "str h16, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ ".inst 0x0ea16a70 // bfcvtn v16.4h, v19.4s\n"
+ "str h18, [x27, #0x0]\n"
+ "add x27, x27, #0x2\n"
+ "str h17, [x26, #0x0]\n"
+ "add x26, x26, #0x2\n"
+ "str h16, [x25, #0x0]\n"
+ "add x25, x25, #0x2\n"
+ "bne 31b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "33:" // Initial: Height 4: no oddments
+ "b 108f\n"
+ "34:" // Initial: Height 5
+ "mov x9, %x[out_ptr]\n"
+ "mov x10, %x[cols]\n"
+ "mov x28, %x[bias]\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "cmp x10, #0xc\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "blt 38f\n"
+ "35:" // Initial: Height 5: Block loop
+ "cbnz %x[bias], 36f\n"
+ "movi v1.16b, #0x0\n"
+ "movi v0.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "b 37f\n"
+ "36:" // Initial: Height 5: Width 3: bias
+ "ldr d18, [x28, #0x0]\n"
+ "ldr d17, [x28, #0x8]\n"
+ "ldr d16, [x28, #0x10]\n"
+ "shll v1.4s, v18.4h, #0x10\n"
+ "shll v0.4s, v17.4h, #0x10\n"
+ "shll v31.4s, v16.4h, #0x10\n"
+ "37:" // Initial: Height 5: Width 3: init done
+ "ldr q16, [%x[in_ptr], #0x0]\n"
+ "ldr q20, [%x[in_ptr], #0x10]\n"
+ "sub x10, x10, #0xc\n"
+ "add x28, x28, #0x18\n"
+ "ldr q19, [%x[in_ptr], #0x20]\n"
+ "ldr q18, [%x[in_ptr], #0x30]\n"
+ "cmp x10, #0xc\n"
+ "ldr q17, [%x[in_ptr], #0x40]\n"
+ "ldr q30, [%x[in_ptr], #0x50]\n"
+ "ldr q24, [%x[in_ptr], #0x60]\n"
+ "ldr q23, [%x[in_ptr], #0x70]\n"
+ "fadd v16.4s, v16.4s, v1.4s\n"
+ "fadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q22, [%x[in_ptr], #0x80]\n"
+ "ldr q21, [%x[in_ptr], #0x90]\n"
+ "fadd v19.4s, v19.4s, v31.4s\n"
+ "fadd v18.4s, v18.4s, v1.4s\n"
+ "ldr q29, [%x[in_ptr], #0xa0]\n"
+ "ldr q28, [%x[in_ptr], #0xb0]\n"
+ "fadd v17.4s, v17.4s, v0.4s\n"
+ "fadd v30.4s, v30.4s, v31.4s\n"
+ "ldr q27, [%x[in_ptr], #0xc0]\n"
+ "ldr q26, [%x[in_ptr], #0xd0]\n"
+ "fadd v24.4s, v24.4s, v1.4s\n"
+ "fadd v23.4s, v23.4s, v0.4s\n"
+ "ldr q25, [%x[in_ptr], #0xe0]\n"
+ "fadd v22.4s, v22.4s, v31.4s\n"
+ "fadd v21.4s, v21.4s, v1.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fadd v29.4s, v29.4s, v0.4s\n"
+ "fadd v28.4s, v28.4s, v31.4s\n"
+ "fadd v27.4s, v27.4s, v1.4s\n"
+ "fadd v26.4s, v26.4s, v0.4s\n"
+ "fadd v25.4s, v25.4s, v31.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v30.4s, v30.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmin v29.4s, v29.4s, v13.4s\n"
+ "fmin v28.4s, v28.4s, v13.4s\n"
+ "fmin v27.4s, v27.4s, v13.4s\n"
+ "fmin v26.4s, v26.4s, v13.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "str d16, [x9, #0x0]\n"
+ ".inst 0x0ea16bd0 // bfcvtn v16.4h, v30.4s\n"
+ ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n"
+ "str d20, [x9, #0x8]\n"
+ "str d19, [x9, #0x10]\n"
+ ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
+ ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
+ "add x9, x9, #0x18\n"
+ "str d18, [x27, #0x0]\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ ".inst 0x0ea16bb4 // bfcvtn v20.4h, v29.4s\n"
+ "str d17, [x27, #0x8]\n"
+ ".inst 0x0ea16b93 // bfcvtn v19.4h, v28.4s\n"
+ ".inst 0x0ea16b72 // bfcvtn v18.4h, v27.4s\n"
+ "str d16, [x27, #0x10]\n"
+ ".inst 0x0ea16b51 // bfcvtn v17.4h, v26.4s\n"
+ ".inst 0x0ea16b30 // bfcvtn v16.4h, v25.4s\n"
+ "add x27, x27, #0x18\n"
+ "str d24, [x26, #0x0]\n"
+ "str d23, [x26, #0x8]\n"
+ "str d22, [x26, #0x10]\n"
+ "add x26, x26, #0x18\n"
+ "str d21, [x25, #0x0]\n"
+ "str d20, [x25, #0x8]\n"
+ "str d19, [x25, #0x10]\n"
+ "add x25, x25, #0x18\n"
+ "str d18, [x24, #0x0]\n"
+ "str d17, [x24, #0x8]\n"
+ "str d16, [x24, #0x10]\n"
+ "add x24, x24, #0x18\n"
+ "bge 35b\n"
+ "38:" // Initial: Height 5: no full blocks
+ "cbz x10, 41f\n"
+ "mov x20, %x[in_ptr]\n"
+ "39:" // Initial: Height 5: Single loop
+ "movi v21.16b, #0x0\n"
+ "cbz %x[bias], 40f\n"
+ "ldr h16, [x28, #0x0]\n"
+ "shll v21.4s, v16.4h, #0x10\n"
+ "40:" // Initial: Height 5: Scalar: no bias
+ "ldr s16, [%x[in_ptr], #0x0]\n"
+ "ldr s19, [%x[in_ptr], #0x30]\n"
+ "subs x10, x10, #0x1\n"
+ "add x28, x28, #0x2\n"
+ "ldr s18, [%x[in_ptr], #0x60]\n"
+ "ldr s17, [%x[in_ptr], #0x90]\n"
+ "ldr s20, [%x[in_ptr], #0xc0]\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "fadd v16.4s, v16.4s, v21.4s\n"
+ "fadd v19.4s, v19.4s, v21.4s\n"
+ "fadd v18.4s, v18.4s, v21.4s\n"
+ "fadd v17.4s, v17.4s, v21.4s\n"
+ "fadd v20.4s, v20.4s, v21.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "str h16, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ ".inst 0x0ea16a90 // bfcvtn v16.4h, v20.4s\n"
+ "str h19, [x27, #0x0]\n"
+ "add x27, x27, #0x2\n"
+ "str h18, [x26, #0x0]\n"
+ "add x26, x26, #0x2\n"
+ "str h17, [x25, #0x0]\n"
+ "add x25, x25, #0x2\n"
+ "str h16, [x24, #0x0]\n"
+ "add x24, x24, #0x2\n"
+ "bne 39b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "41:" // Initial: Height 5: no oddments
+ "b 108f\n"
+ "42:" // Initial: Height 6
+ "mov x9, %x[out_ptr]\n"
+ "mov x10, %x[cols]\n"
+ "mov x28, %x[bias]\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "cmp x10, #0xc\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "blt 46f\n"
+ "43:" // Initial: Height 6: Block loop
+ "cbnz %x[bias], 44f\n"
+ "movi v4.16b, #0x0\n"
+ "movi v3.16b, #0x0\n"
+ "movi v2.16b, #0x0\n"
+ "b 45f\n"
+ "44:" // Initial: Height 6: Width 3: bias
+ "ldr d18, [x28, #0x0]\n"
+ "ldr d17, [x28, #0x8]\n"
+ "ldr d16, [x28, #0x10]\n"
+ "shll v4.4s, v18.4h, #0x10\n"
+ "shll v3.4s, v17.4h, #0x10\n"
+ "shll v2.4s, v16.4h, #0x10\n"
+ "45:" // Initial: Height 6: Width 3: init done
+ "ldr q21, [%x[in_ptr], #0x0]\n"
+ "ldr q16, [%x[in_ptr], #0x10]\n"
+ "sub x10, x10, #0xc\n"
+ "add x28, x28, #0x18\n"
+ "ldr q20, [%x[in_ptr], #0x20]\n"
+ "ldr q19, [%x[in_ptr], #0x30]\n"
+ "cmp x10, #0xc\n"
+ "ldr q18, [%x[in_ptr], #0x40]\n"
+ "ldr q17, [%x[in_ptr], #0x50]\n"
+ "ldr q1, [%x[in_ptr], #0x60]\n"
+ "ldr q26, [%x[in_ptr], #0x70]\n"
+ "fadd v21.4s, v21.4s, v4.4s\n"
+ "fadd v16.4s, v16.4s, v3.4s\n"
+ "ldr q25, [%x[in_ptr], #0x80]\n"
+ "ldr q24, [%x[in_ptr], #0x90]\n"
+ "fadd v20.4s, v20.4s, v2.4s\n"
+ "fadd v19.4s, v19.4s, v4.4s\n"
+ "ldr q23, [%x[in_ptr], #0xa0]\n"
+ "ldr q22, [%x[in_ptr], #0xb0]\n"
+ "fadd v18.4s, v18.4s, v3.4s\n"
+ "fadd v17.4s, v17.4s, v2.4s\n"
+ "ldr q0, [%x[in_ptr], #0xc0]\n"
+ "ldr q31, [%x[in_ptr], #0xd0]\n"
+ "fadd v1.4s, v1.4s, v4.4s\n"
+ "fadd v26.4s, v26.4s, v3.4s\n"
+ "ldr q30, [%x[in_ptr], #0xe0]\n"
+ "ldr q29, [%x[in_ptr], #0xf0]\n"
+ "fadd v25.4s, v25.4s, v2.4s\n"
+ "fadd v24.4s, v24.4s, v4.4s\n"
+ "ldr q28, [%x[in_ptr], #0x100]\n"
+ "ldr q27, [%x[in_ptr], #0x110]\n"
+ "fadd v23.4s, v23.4s, v3.4s\n"
+ "fadd v22.4s, v22.4s, v2.4s\n"
+ "fadd v0.4s, v0.4s, v4.4s\n"
+ "fadd v31.4s, v31.4s, v3.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fadd v30.4s, v30.4s, v2.4s\n"
+ "fadd v29.4s, v29.4s, v4.4s\n"
+ "fadd v28.4s, v28.4s, v3.4s\n"
+ "fadd v27.4s, v27.4s, v2.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v1.4s, v1.4s, v13.4s\n"
+ "fmin v26.4s, v26.4s, v13.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmin v0.4s, v0.4s, v13.4s\n"
+ "fmin v31.4s, v31.4s, v13.4s\n"
+ "fmin v30.4s, v30.4s, v13.4s\n"
+ "fmin v29.4s, v29.4s, v13.4s\n"
+ "fmin v28.4s, v28.4s, v13.4s\n"
+ "fmin v27.4s, v27.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v1.4s, v1.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v0.4s, v0.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "str d21, [x9, #0x0]\n"
+ "str d16, [x9, #0x8]\n"
+ ".inst 0x0ea16830 // bfcvtn v16.4h, v1.4s\n"
+ ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n"
+ "str d20, [x9, #0x10]\n"
+ ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
+ ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n"
+ "add x9, x9, #0x18\n"
+ "str d19, [x27, #0x0]\n"
+ ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
+ ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
+ "str d18, [x27, #0x8]\n"
+ ".inst 0x0ea16815 // bfcvtn v21.4h, v0.4s\n"
+ ".inst 0x0ea16bf4 // bfcvtn v20.4h, v31.4s\n"
+ "str d17, [x27, #0x10]\n"
+ ".inst 0x0ea16bd3 // bfcvtn v19.4h, v30.4s\n"
+ ".inst 0x0ea16bb2 // bfcvtn v18.4h, v29.4s\n"
+ "add x27, x27, #0x18\n"
+ "str d16, [x26, #0x0]\n"
+ ".inst 0x0ea16b91 // bfcvtn v17.4h, v28.4s\n"
+ ".inst 0x0ea16b70 // bfcvtn v16.4h, v27.4s\n"
+ "str d26, [x26, #0x8]\n"
+ "str d25, [x26, #0x10]\n"
+ "add x26, x26, #0x18\n"
+ "str d24, [x25, #0x0]\n"
+ "str d23, [x25, #0x8]\n"
+ "str d22, [x25, #0x10]\n"
+ "add x25, x25, #0x18\n"
+ "str d21, [x24, #0x0]\n"
+ "str d20, [x24, #0x8]\n"
+ "str d19, [x24, #0x10]\n"
+ "add x24, x24, #0x18\n"
+ "str d18, [x23, #0x0]\n"
+ "str d17, [x23, #0x8]\n"
+ "str d16, [x23, #0x10]\n"
+ "add x23, x23, #0x18\n"
+ "bge 43b\n"
+ "46:" // Initial: Height 6: no full blocks
+ "cbz x10, 49f\n"
+ "mov x20, %x[in_ptr]\n"
+ "47:" // Initial: Height 6: Single loop
+ "movi v22.16b, #0x0\n"
+ "cbz %x[bias], 48f\n"
+ "ldr h16, [x28, #0x0]\n"
+ "shll v22.4s, v16.4h, #0x10\n"
+ "48:" // Initial: Height 6: Scalar: no bias
+ "ldr s16, [%x[in_ptr], #0x0]\n"
+ "ldr s20, [%x[in_ptr], #0x30]\n"
+ "subs x10, x10, #0x1\n"
+ "add x28, x28, #0x2\n"
+ "ldr s19, [%x[in_ptr], #0x60]\n"
+ "ldr s18, [%x[in_ptr], #0x90]\n"
+ "ldr s17, [%x[in_ptr], #0xc0]\n"
+ "ldr s21, [%x[in_ptr], #0xf0]\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "fadd v16.4s, v16.4s, v22.4s\n"
+ "fadd v20.4s, v20.4s, v22.4s\n"
+ "fadd v19.4s, v19.4s, v22.4s\n"
+ "fadd v18.4s, v18.4s, v22.4s\n"
+ "fadd v17.4s, v17.4s, v22.4s\n"
+ "fadd v21.4s, v21.4s, v22.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "str h16, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n"
+ "str h20, [x27, #0x0]\n"
+ "add x27, x27, #0x2\n"
+ "str h19, [x26, #0x0]\n"
+ "add x26, x26, #0x2\n"
+ "str h18, [x25, #0x0]\n"
+ "add x25, x25, #0x2\n"
+ "str h17, [x24, #0x0]\n"
+ "add x24, x24, #0x2\n"
+ "str h16, [x23, #0x0]\n"
+ "add x23, x23, #0x2\n"
+ "bne 47b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "49:" // Initial: Height 6: no oddments
+ "b 108f\n"
+ "50:" // Initial: Height 7
+ "mov x9, %x[out_ptr]\n"
+ "mov x10, %x[cols]\n"
+ "mov x28, %x[bias]\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "cmp x10, #0xc\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "add x22, x23, %x[ldout], LSL #1\n"
+ "blt 54f\n"
+ "51:" // Initial: Height 7: Block loop
+ "cbnz %x[bias], 52f\n"
+ "movi v7.16b, #0x0\n"
+ "movi v6.16b, #0x0\n"
+ "movi v5.16b, #0x0\n"
+ "b 53f\n"
+ "52:" // Initial: Height 7: Width 3: bias
+ "ldr d18, [x28, #0x0]\n"
+ "ldr d17, [x28, #0x8]\n"
+ "ldr d16, [x28, #0x10]\n"
+ "shll v7.4s, v18.4h, #0x10\n"
+ "shll v6.4s, v17.4h, #0x10\n"
+ "shll v5.4s, v16.4h, #0x10\n"
+ "53:" // Initial: Height 7: Width 3: init done
+ "ldr q18, [%x[in_ptr], #0x0]\n"
+ "ldr q17, [%x[in_ptr], #0x10]\n"
+ "sub x10, x10, #0xc\n"
+ "add x28, x28, #0x18\n"
+ "ldr q16, [%x[in_ptr], #0x20]\n"
+ "ldr q21, [%x[in_ptr], #0x30]\n"
+ "cmp x10, #0xc\n"
+ "ldr q20, [%x[in_ptr], #0x40]\n"
+ "ldr q19, [%x[in_ptr], #0x50]\n"
+ "ldr q4, [%x[in_ptr], #0x60]\n"
+ "ldr q3, [%x[in_ptr], #0x70]\n"
+ "fadd v18.4s, v18.4s, v7.4s\n"
+ "fadd v17.4s, v17.4s, v6.4s\n"
+ "ldr q2, [%x[in_ptr], #0x80]\n"
+ "ldr q27, [%x[in_ptr], #0x90]\n"
+ "fadd v16.4s, v16.4s, v5.4s\n"
+ "fadd v21.4s, v21.4s, v7.4s\n"
+ "ldr q26, [%x[in_ptr], #0xa0]\n"
+ "ldr q25, [%x[in_ptr], #0xb0]\n"
+ "fadd v20.4s, v20.4s, v6.4s\n"
+ "fadd v19.4s, v19.4s, v5.4s\n"
+ "ldr q24, [%x[in_ptr], #0xc0]\n"
+ "ldr q23, [%x[in_ptr], #0xd0]\n"
+ "fadd v4.4s, v4.4s, v7.4s\n"
+ "fadd v3.4s, v3.4s, v6.4s\n"
+ "ldr q22, [%x[in_ptr], #0xe0]\n"
+ "ldr q1, [%x[in_ptr], #0xf0]\n"
+ "fadd v2.4s, v2.4s, v5.4s\n"
+ "fadd v27.4s, v27.4s, v7.4s\n"
+ "ldr q0, [%x[in_ptr], #0x100]\n"
+ "ldr q31, [%x[in_ptr], #0x110]\n"
+ "fadd v26.4s, v26.4s, v6.4s\n"
+ "fadd v25.4s, v25.4s, v5.4s\n"
+ "ldr q30, [%x[in_ptr], #0x120]\n"
+ "ldr q29, [%x[in_ptr], #0x130]\n"
+ "fadd v24.4s, v24.4s, v7.4s\n"
+ "fadd v23.4s, v23.4s, v6.4s\n"
+ "ldr q28, [%x[in_ptr], #0x140]\n"
+ "fadd v22.4s, v22.4s, v5.4s\n"
+ "fadd v1.4s, v1.4s, v7.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fadd v0.4s, v0.4s, v6.4s\n"
+ "fadd v31.4s, v31.4s, v5.4s\n"
+ "fadd v30.4s, v30.4s, v7.4s\n"
+ "fadd v29.4s, v29.4s, v6.4s\n"
+ "fadd v28.4s, v28.4s, v5.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v4.4s, v4.4s, v13.4s\n"
+ "fmin v3.4s, v3.4s, v13.4s\n"
+ "fmin v2.4s, v2.4s, v13.4s\n"
+ "fmin v27.4s, v27.4s, v13.4s\n"
+ "fmin v26.4s, v26.4s, v13.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmin v1.4s, v1.4s, v13.4s\n"
+ "fmin v0.4s, v0.4s, v13.4s\n"
+ "fmin v31.4s, v31.4s, v13.4s\n"
+ "fmin v30.4s, v30.4s, v13.4s\n"
+ "fmin v29.4s, v29.4s, v13.4s\n"
+ "fmin v28.4s, v28.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v4.4s, v4.4s, v12.4s\n"
+ "fmax v3.4s, v3.4s, v12.4s\n"
+ "fmax v2.4s, v2.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v1.4s, v1.4s, v12.4s\n"
+ "fmax v0.4s, v0.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ "str d18, [x9, #0x0]\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea16892 // bfcvtn v18.4h, v4.4s\n"
+ "str d17, [x9, #0x8]\n"
+ "str d16, [x9, #0x10]\n"
+ ".inst 0x0ea16871 // bfcvtn v17.4h, v3.4s\n"
+ ".inst 0x0ea16850 // bfcvtn v16.4h, v2.4s\n"
+ "add x9, x9, #0x18\n"
+ "str d21, [x27, #0x0]\n"
+ ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n"
+ ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n"
+ "str d20, [x27, #0x8]\n"
+ ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
+ ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n"
+ "str d19, [x27, #0x10]\n"
+ ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
+ ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
+ "add x27, x27, #0x18\n"
+ "str d18, [x26, #0x0]\n"
+ ".inst 0x0ea16835 // bfcvtn v21.4h, v1.4s\n"
+ ".inst 0x0ea16814 // bfcvtn v20.4h, v0.4s\n"
+ "str d17, [x26, #0x8]\n"
+ ".inst 0x0ea16bf3 // bfcvtn v19.4h, v31.4s\n"
+ ".inst 0x0ea16bd2 // bfcvtn v18.4h, v30.4s\n"
+ "str d16, [x26, #0x10]\n"
+ ".inst 0x0ea16bb1 // bfcvtn v17.4h, v29.4s\n"
+ ".inst 0x0ea16b90 // bfcvtn v16.4h, v28.4s\n"
+ "add x26, x26, #0x18\n"
+ "str d27, [x25, #0x0]\n"
+ "str d26, [x25, #0x8]\n"
+ "str d25, [x25, #0x10]\n"
+ "add x25, x25, #0x18\n"
+ "str d24, [x24, #0x0]\n"
+ "str d23, [x24, #0x8]\n"
+ "str d22, [x24, #0x10]\n"
+ "add x24, x24, #0x18\n"
+ "str d21, [x23, #0x0]\n"
+ "str d20, [x23, #0x8]\n"
+ "str d19, [x23, #0x10]\n"
+ "add x23, x23, #0x18\n"
+ "str d18, [x22, #0x0]\n"
+ "str d17, [x22, #0x8]\n"
+ "str d16, [x22, #0x10]\n"
+ "add x22, x22, #0x18\n"
+ "bge 51b\n"
+ "54:" // Initial: Height 7: no full blocks
+ "cbz x10, 57f\n"
+ "mov x20, %x[in_ptr]\n"
+ "55:" // Initial: Height 7: Single loop
+ "movi v23.16b, #0x0\n"
+ "cbz %x[bias], 56f\n"
+ "ldr h16, [x28, #0x0]\n"
+ "shll v23.4s, v16.4h, #0x10\n"
+ "56:" // Initial: Height 7: Scalar: no bias
+ "ldr s16, [%x[in_ptr], #0x0]\n"
+ "ldr s21, [%x[in_ptr], #0x30]\n"
+ "subs x10, x10, #0x1\n"
+ "add x28, x28, #0x2\n"
+ "ldr s20, [%x[in_ptr], #0x60]\n"
+ "ldr s19, [%x[in_ptr], #0x90]\n"
+ "ldr s18, [%x[in_ptr], #0xc0]\n"
+ "ldr s17, [%x[in_ptr], #0xf0]\n"
+ "ldr s22, [%x[in_ptr], #0x120]\n"
+ "fadd v16.4s, v16.4s, v23.4s\n"
+ "fadd v21.4s, v21.4s, v23.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "fadd v20.4s, v20.4s, v23.4s\n"
+ "fadd v19.4s, v19.4s, v23.4s\n"
+ "fadd v18.4s, v18.4s, v23.4s\n"
+ "fadd v17.4s, v17.4s, v23.4s\n"
+ "fadd v22.4s, v22.4s, v23.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ "str h16, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n"
+ "str h21, [x27, #0x0]\n"
+ "add x27, x27, #0x2\n"
+ "str h20, [x26, #0x0]\n"
+ "add x26, x26, #0x2\n"
+ "str h19, [x25, #0x0]\n"
+ "add x25, x25, #0x2\n"
+ "str h18, [x24, #0x0]\n"
+ "add x24, x24, #0x2\n"
+ "str h17, [x23, #0x0]\n"
+ "add x23, x23, #0x2\n"
+ "str h16, [x22, #0x0]\n"
+ "add x22, x22, #0x2\n"
+ "bne 55b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "57:" // Initial: Height 7: no oddments
+ "b 108f\n"
+ "58:" // Initial: Height 8
+ "mov x9, %x[out_ptr]\n"
+ "mov x10, %x[cols]\n"
+ "mov x28, %x[bias]\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "cmp x10, #0xc\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "add x22, x23, %x[ldout], LSL #1\n"
+ "add x21, x22, %x[ldout], LSL #1\n"
+ "blt 62f\n"
+ "59:" // Initial: Height 8: Block loop
+ "cbnz %x[bias], 60f\n"
+ "movi v10.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v8.16b, #0x0\n"
+ "b 61f\n"
+ "60:" // Initial: Height 8: Width 3: bias
+ "ldr d18, [x28, #0x0]\n"
+ "ldr d17, [x28, #0x8]\n"
+ "ldr d16, [x28, #0x10]\n"
+ "shll v10.4s, v18.4h, #0x10\n"
+ "shll v9.4s, v17.4h, #0x10\n"
+ "shll v8.4s, v16.4h, #0x10\n"
+ "61:" // Initial: Height 8: Width 3: init done
+ "ldr q18, [%x[in_ptr], #0x0]\n"
+ "ldr q17, [%x[in_ptr], #0x10]\n"
+ "sub x10, x10, #0xc\n"
+ "add x28, x28, #0x18\n"
+ "ldr q16, [%x[in_ptr], #0x20]\n"
+ "ldr q22, [%x[in_ptr], #0x30]\n"
+ "cmp x10, #0xc\n"
+ "ldr q21, [%x[in_ptr], #0x40]\n"
+ "ldr q20, [%x[in_ptr], #0x50]\n"
+ "ldr q19, [%x[in_ptr], #0x60]\n"
+ "ldr q7, [%x[in_ptr], #0x70]\n"
+ "fadd v18.4s, v18.4s, v10.4s\n"
+ "fadd v17.4s, v17.4s, v9.4s\n"
+ "ldr q6, [%x[in_ptr], #0x80]\n"
+ "ldr q5, [%x[in_ptr], #0x90]\n"
+ "fadd v16.4s, v16.4s, v8.4s\n"
+ "fadd v22.4s, v22.4s, v10.4s\n"
+ "ldr q29, [%x[in_ptr], #0xa0]\n"
+ "ldr q28, [%x[in_ptr], #0xb0]\n"
+ "fadd v21.4s, v21.4s, v9.4s\n"
+ "fadd v20.4s, v20.4s, v8.4s\n"
+ "ldr q27, [%x[in_ptr], #0xc0]\n"
+ "ldr q26, [%x[in_ptr], #0xd0]\n"
+ "fadd v19.4s, v19.4s, v10.4s\n"
+ "fadd v7.4s, v7.4s, v9.4s\n"
+ "ldr q25, [%x[in_ptr], #0xe0]\n"
+ "ldr q24, [%x[in_ptr], #0xf0]\n"
+ "fadd v6.4s, v6.4s, v8.4s\n"
+ "fadd v5.4s, v5.4s, v10.4s\n"
+ "ldr q23, [%x[in_ptr], #0x100]\n"
+ "ldr q4, [%x[in_ptr], #0x110]\n"
+ "fadd v29.4s, v29.4s, v9.4s\n"
+ "fadd v28.4s, v28.4s, v8.4s\n"
+ "ldr q3, [%x[in_ptr], #0x120]\n"
+ "ldr q2, [%x[in_ptr], #0x130]\n"
+ "fadd v27.4s, v27.4s, v10.4s\n"
+ "fadd v26.4s, v26.4s, v9.4s\n"
+ "ldr q1, [%x[in_ptr], #0x140]\n"
+ "ldr q0, [%x[in_ptr], #0x150]\n"
+ "fadd v25.4s, v25.4s, v8.4s\n"
+ "fadd v24.4s, v24.4s, v10.4s\n"
+ "ldr q31, [%x[in_ptr], #0x160]\n"
+ "ldr q30, [%x[in_ptr], #0x170]\n"
+ "fadd v23.4s, v23.4s, v9.4s\n"
+ "fadd v4.4s, v4.4s, v8.4s\n"
+ "fadd v3.4s, v3.4s, v10.4s\n"
+ "fadd v2.4s, v2.4s, v9.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fadd v1.4s, v1.4s, v8.4s\n"
+ "fadd v0.4s, v0.4s, v10.4s\n"
+ "fadd v31.4s, v31.4s, v9.4s\n"
+ "fadd v30.4s, v30.4s, v8.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v7.4s, v7.4s, v13.4s\n"
+ "fmin v6.4s, v6.4s, v13.4s\n"
+ "fmin v5.4s, v5.4s, v13.4s\n"
+ "fmin v29.4s, v29.4s, v13.4s\n"
+ "fmin v28.4s, v28.4s, v13.4s\n"
+ "fmin v27.4s, v27.4s, v13.4s\n"
+ "fmin v26.4s, v26.4s, v13.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v4.4s, v4.4s, v13.4s\n"
+ "fmin v3.4s, v3.4s, v13.4s\n"
+ "fmin v2.4s, v2.4s, v13.4s\n"
+ "fmin v1.4s, v1.4s, v13.4s\n"
+ "fmin v0.4s, v0.4s, v13.4s\n"
+ "fmin v31.4s, v31.4s, v13.4s\n"
+ "fmin v30.4s, v30.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v7.4s, v7.4s, v12.4s\n"
+ "fmax v6.4s, v6.4s, v12.4s\n"
+ "fmax v5.4s, v5.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v4.4s, v4.4s, v12.4s\n"
+ "fmax v3.4s, v3.4s, v12.4s\n"
+ "fmax v2.4s, v2.4s, v12.4s\n"
+ "fmax v1.4s, v1.4s, v12.4s\n"
+ "fmax v0.4s, v0.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ "str d18, [x9, #0x0]\n"
+ "str d17, [x9, #0x8]\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea168f2 // bfcvtn v18.4h, v7.4s\n"
+ "str d16, [x9, #0x10]\n"
+ ".inst 0x0ea168d1 // bfcvtn v17.4h, v6.4s\n"
+ ".inst 0x0ea168b0 // bfcvtn v16.4h, v5.4s\n"
+ "add x9, x9, #0x18\n"
+ "str d22, [x27, #0x0]\n"
+ ".inst 0x0ea16bbd // bfcvtn v29.4h, v29.4s\n"
+ ".inst 0x0ea16b9c // bfcvtn v28.4h, v28.4s\n"
+ "str d21, [x27, #0x8]\n"
+ ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n"
+ ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n"
+ "str d20, [x27, #0x10]\n"
+ ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
+ ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n"
+ "add x27, x27, #0x18\n"
+ "str d19, [x26, #0x0]\n"
+ ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
+ ".inst 0x0ea16896 // bfcvtn v22.4h, v4.4s\n"
+ "str d18, [x26, #0x8]\n"
+ ".inst 0x0ea16875 // bfcvtn v21.4h, v3.4s\n"
+ ".inst 0x0ea16854 // bfcvtn v20.4h, v2.4s\n"
+ "str d17, [x26, #0x10]\n"
+ ".inst 0x0ea16833 // bfcvtn v19.4h, v1.4s\n"
+ ".inst 0x0ea16812 // bfcvtn v18.4h, v0.4s\n"
+ "add x26, x26, #0x18\n"
+ "str d16, [x25, #0x0]\n"
+ ".inst 0x0ea16bf1 // bfcvtn v17.4h, v31.4s\n"
+ ".inst 0x0ea16bd0 // bfcvtn v16.4h, v30.4s\n"
+ "str d29, [x25, #0x8]\n"
+ "str d28, [x25, #0x10]\n"
+ "add x25, x25, #0x18\n"
+ "str d27, [x24, #0x0]\n"
+ "str d26, [x24, #0x8]\n"
+ "str d25, [x24, #0x10]\n"
+ "add x24, x24, #0x18\n"
+ "str d24, [x23, #0x0]\n"
+ "str d23, [x23, #0x8]\n"
+ "str d22, [x23, #0x10]\n"
+ "add x23, x23, #0x18\n"
+ "str d21, [x22, #0x0]\n"
+ "str d20, [x22, #0x8]\n"
+ "str d19, [x22, #0x10]\n"
+ "add x22, x22, #0x18\n"
+ "str d18, [x21, #0x0]\n"
+ "str d17, [x21, #0x8]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, #0x18\n"
+ "bge 59b\n"
+ "62:" // Initial: Height 8: no full blocks
+ "cbz x10, 65f\n"
+ "mov x20, %x[in_ptr]\n"
+ "63:" // Initial: Height 8: Single loop
+ "movi v24.16b, #0x0\n"
+ "cbz %x[bias], 64f\n"
+ "ldr h16, [x28, #0x0]\n"
+ "shll v24.4s, v16.4h, #0x10\n"
+ "64:" // Initial: Height 8: Scalar: no bias
+ "ldr s17, [%x[in_ptr], #0x0]\n"
+ "ldr s16, [%x[in_ptr], #0x30]\n"
+ "subs x10, x10, #0x1\n"
+ "add x28, x28, #0x2\n"
+ "ldr s21, [%x[in_ptr], #0x60]\n"
+ "ldr s20, [%x[in_ptr], #0x90]\n"
+ "ldr s19, [%x[in_ptr], #0xc0]\n"
+ "ldr s18, [%x[in_ptr], #0xf0]\n"
+ "ldr s23, [%x[in_ptr], #0x120]\n"
+ "ldr s22, [%x[in_ptr], #0x150]\n"
+ "fadd v17.4s, v17.4s, v24.4s\n"
+ "fadd v16.4s, v16.4s, v24.4s\n"
+ "fadd v21.4s, v21.4s, v24.4s\n"
+ "fadd v20.4s, v20.4s, v24.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "fadd v19.4s, v19.4s, v24.4s\n"
+ "fadd v18.4s, v18.4s, v24.4s\n"
+ "fadd v23.4s, v23.4s, v24.4s\n"
+ "fadd v22.4s, v22.4s, v24.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v13.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ "str h17, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ "str h16, [x27, #0x0]\n"
+ ".inst 0x0ea16af1 // bfcvtn v17.4h, v23.4s\n"
+ ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n"
+ "add x27, x27, #0x2\n"
+ "str h21, [x26, #0x0]\n"
+ "add x26, x26, #0x2\n"
+ "str h20, [x25, #0x0]\n"
+ "add x25, x25, #0x2\n"
+ "str h19, [x24, #0x0]\n"
+ "add x24, x24, #0x2\n"
+ "str h18, [x23, #0x0]\n"
+ "add x23, x23, #0x2\n"
+ "str h17, [x22, #0x0]\n"
+ "add x22, x22, #0x2\n"
+ "str h16, [x21, #0x0]\n"
+ "add x21, x21, #0x2\n"
+ "bne 63b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "65:" // Initial: Height 8: no oddments
+ "subs %x[rows], %x[rows], #0x8\n"
+ "add %x[out_ptr], %x[out_ptr], x11\n"
+ "bgt 1b\n"
+ "b 108f\n"
+ "66:" // Accumulate
+ "67:" // Accumulate: Row loop
+ "cmp %x[rows], #0x7\n"
+ "bgt 103f\n"
+ "beq 98f\n"
+ "cmp %x[rows], #0x5\n"
+ "bgt 93f\n"
+ "beq 88f\n"
+ "cmp %x[rows], #0x3\n"
+ "bgt 83f\n"
+ "beq 78f\n"
+ "cmp %x[rows], #0x1\n"
+ "bgt 73f\n"
+ "68:" // Accumulate: Height 1
+ "mov x10, %x[cols]\n"
+ "mov x9, %x[out_ptr]\n"
+ "cmp x10, #0xc\n"
+ "blt 70f\n"
+ "69:" // Accumulate: Height 1: Block loop
+ "ldr d16, [x9, #0x0]\n"
+ "ldr q19, [%x[in_ptr], #0x0]\n"
+ "sub x10, x10, #0xc\n"
+ "ldr q18, [%x[in_ptr], #0x10]\n"
+ "ldr q17, [%x[in_ptr], #0x20]\n"
+ "cmp x10, #0xc\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v19.4s, v19.4s, v16.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ ".inst 0x0ea16a70 // bfcvtn v16.4h, v19.4s\n"
+ "str d16, [x9, #0x0]\n"
+ "ldr d16, [x9, #0x8]\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v18.4s, v18.4s, v16.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ ".inst 0x0ea16a50 // bfcvtn v16.4h, v18.4s\n"
+ "str d16, [x9, #0x8]\n"
+ "ldr d16, [x9, #0x10]\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v17.4s, v17.4s, v16.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ ".inst 0x0ea16a30 // bfcvtn v16.4h, v17.4s\n"
+ "str d16, [x9, #0x10]\n"
+ "add x9, x9, #0x18\n"
+ "bge 69b\n"
+ "70:" // Accumulate: Height 1: no full blocks
+ "cbz x10, 72f\n"
+ "mov x20, %x[in_ptr]\n"
+ "71:" // Accumulate: Height 1: Single loop
+ "ldr h16, [x9, #0x0]\n"
+ "ldr s17, [%x[in_ptr], #0x0]\n"
+ "subs x10, x10, #0x1\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v17.4s, v17.4s, v16.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ ".inst 0x0ea16a30 // bfcvtn v16.4h, v17.4s\n"
+ "str h16, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ "bne 71b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "72:" // Accumulate: Height 1: no oddments
+ "b 108f\n"
+ "73:" // Accumulate: Height 2
+ "mov x10, %x[cols]\n"
+ "mov x9, %x[out_ptr]\n"
+ "cmp x10, #0xc\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "blt 75f\n"
+ "74:" // Accumulate: Height 2: Block loop
+ "ldr d17, [x9, #0x0]\n"
+ "ldr d16, [x27, #0x0]\n"
+ "sub x10, x10, #0xc\n"
+ "ldr q23, [%x[in_ptr], #0x0]\n"
+ "ldr q22, [%x[in_ptr], #0x30]\n"
+ "cmp x10, #0xc\n"
+ "ldr q21, [%x[in_ptr], #0x10]\n"
+ "ldr q20, [%x[in_ptr], #0x40]\n"
+ "ldr q19, [%x[in_ptr], #0x20]\n"
+ "ldr q18, [%x[in_ptr], #0x50]\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fadd v23.4s, v23.4s, v17.4s\n"
+ "fadd v22.4s, v22.4s, v16.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ ".inst 0x0ea16af0 // bfcvtn v16.4h, v23.4s\n"
+ ".inst 0x0ea16ad1 // bfcvtn v17.4h, v22.4s\n"
+ "str d16, [x9, #0x0]\n"
+ "ldr d16, [x9, #0x8]\n"
+ "str d17, [x27, #0x0]\n"
+ "shll v17.4s, v16.4h, #0x10\n"
+ "ldr d16, [x27, #0x8]\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v21.4s, v21.4s, v17.4s\n"
+ "fadd v20.4s, v20.4s, v16.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n"
+ ".inst 0x0ea16a91 // bfcvtn v17.4h, v20.4s\n"
+ "str d16, [x9, #0x8]\n"
+ "ldr d16, [x9, #0x10]\n"
+ "str d17, [x27, #0x8]\n"
+ "shll v17.4s, v16.4h, #0x10\n"
+ "ldr d16, [x27, #0x10]\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v19.4s, v19.4s, v17.4s\n"
+ "fadd v18.4s, v18.4s, v16.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ ".inst 0x0ea16a71 // bfcvtn v17.4h, v19.4s\n"
+ ".inst 0x0ea16a50 // bfcvtn v16.4h, v18.4s\n"
+ "str d17, [x9, #0x10]\n"
+ "add x9, x9, #0x18\n"
+ "str d16, [x27, #0x10]\n"
+ "add x27, x27, #0x18\n"
+ "bge 74b\n"
+ "75:" // Accumulate: Height 2: no full blocks
+ "cbz x10, 77f\n"
+ "mov x20, %x[in_ptr]\n"
+ "76:" // Accumulate: Height 2: Single loop
+ "ldr h17, [x9, #0x0]\n"
+ "ldr h16, [x27, #0x0]\n"
+ "subs x10, x10, #0x1\n"
+ "ldr s19, [%x[in_ptr], #0x0]\n"
+ "ldr s18, [%x[in_ptr], #0x30]\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v19.4s, v19.4s, v17.4s\n"
+ "fadd v18.4s, v18.4s, v16.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ ".inst 0x0ea16a70 // bfcvtn v16.4h, v19.4s\n"
+ "str h16, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ ".inst 0x0ea16a50 // bfcvtn v16.4h, v18.4s\n"
+ "str h16, [x27, #0x0]\n"
+ "add x27, x27, #0x2\n"
+ "bne 76b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "77:" // Accumulate: Height 2: no oddments
+ "b 108f\n"
+ "78:" // Accumulate: Height 3
+ "mov x10, %x[cols]\n"
+ "mov x9, %x[out_ptr]\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "cmp x10, #0xc\n"
+ "blt 80f\n"
+ "79:" // Accumulate: Height 3: Block loop
+ "ldr d18, [x9, #0x0]\n"
+ "ldr d17, [x27, #0x0]\n"
+ "sub x10, x10, #0xc\n"
+ "ldr d16, [x26, #0x0]\n"
+ "ldr q27, [%x[in_ptr], #0x0]\n"
+ "cmp x10, #0xc\n"
+ "ldr q26, [%x[in_ptr], #0x30]\n"
+ "ldr q25, [%x[in_ptr], #0x60]\n"
+ "ldr q24, [%x[in_ptr], #0x10]\n"
+ "ldr q23, [%x[in_ptr], #0x40]\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "ldr q22, [%x[in_ptr], #0x70]\n"
+ "ldr q21, [%x[in_ptr], #0x20]\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "ldr q20, [%x[in_ptr], #0x50]\n"
+ "ldr q19, [%x[in_ptr], #0x80]\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fadd v27.4s, v27.4s, v18.4s\n"
+ "fadd v26.4s, v26.4s, v17.4s\n"
+ "fadd v25.4s, v25.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v13.4s\n"
+ "fmin v26.4s, v26.4s, v13.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ ".inst 0x0ea16b72 // bfcvtn v18.4h, v27.4s\n"
+ ".inst 0x0ea16b50 // bfcvtn v16.4h, v26.4s\n"
+ ".inst 0x0ea16b31 // bfcvtn v17.4h, v25.4s\n"
+ "str d18, [x9, #0x0]\n"
+ "str d16, [x27, #0x0]\n"
+ "ldr d16, [x9, #0x8]\n"
+ "str d17, [x26, #0x0]\n"
+ "ldr d17, [x27, #0x8]\n"
+ "shll v18.4s, v16.4h, #0x10\n"
+ "ldr d16, [x26, #0x8]\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v24.4s, v24.4s, v18.4s\n"
+ "fadd v23.4s, v23.4s, v17.4s\n"
+ "fadd v22.4s, v22.4s, v16.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ ".inst 0x0ea16b10 // bfcvtn v16.4h, v24.4s\n"
+ ".inst 0x0ea16af2 // bfcvtn v18.4h, v23.4s\n"
+ "str d16, [x9, #0x8]\n"
+ ".inst 0x0ea16ad1 // bfcvtn v17.4h, v22.4s\n"
+ "ldr d16, [x9, #0x10]\n"
+ "str d18, [x27, #0x8]\n"
+ "str d17, [x26, #0x8]\n"
+ "shll v18.4s, v16.4h, #0x10\n"
+ "ldr d17, [x27, #0x10]\n"
+ "ldr d16, [x26, #0x10]\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v21.4s, v21.4s, v18.4s\n"
+ "fadd v20.4s, v20.4s, v17.4s\n"
+ "fadd v19.4s, v19.4s, v16.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n"
+ ".inst 0x0ea16a91 // bfcvtn v17.4h, v20.4s\n"
+ "str d16, [x9, #0x10]\n"
+ "add x9, x9, #0x18\n"
+ ".inst 0x0ea16a70 // bfcvtn v16.4h, v19.4s\n"
+ "str d17, [x27, #0x10]\n"
+ "add x27, x27, #0x18\n"
+ "str d16, [x26, #0x10]\n"
+ "add x26, x26, #0x18\n"
+ "bge 79b\n"
+ "80:" // Accumulate: Height 3: no full blocks
+ "cbz x10, 82f\n"
+ "mov x20, %x[in_ptr]\n"
+ "81:" // Accumulate: Height 3: Single loop
+ "ldr h18, [x9, #0x0]\n"
+ "ldr h17, [x27, #0x0]\n"
+ "subs x10, x10, #0x1\n"
+ "ldr h16, [x26, #0x0]\n"
+ "ldr s21, [%x[in_ptr], #0x0]\n"
+ "ldr s20, [%x[in_ptr], #0x30]\n"
+ "ldr s19, [%x[in_ptr], #0x60]\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v21.4s, v21.4s, v18.4s\n"
+ "fadd v20.4s, v20.4s, v17.4s\n"
+ "fadd v19.4s, v19.4s, v16.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n"
+ ".inst 0x0ea16a91 // bfcvtn v17.4h, v20.4s\n"
+ "str h16, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ ".inst 0x0ea16a70 // bfcvtn v16.4h, v19.4s\n"
+ "str h17, [x27, #0x0]\n"
+ "add x27, x27, #0x2\n"
+ "str h16, [x26, #0x0]\n"
+ "add x26, x26, #0x2\n"
+ "bne 81b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "82:" // Accumulate: Height 3: no oddments
+ "b 108f\n"
+ "83:" // Accumulate: Height 4
+ "mov x9, %x[out_ptr]\n"
+ "mov x10, %x[cols]\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "cmp x10, #0xc\n"
+ "blt 85f\n"
+ "84:" // Accumulate: Height 4: Block loop
+ "ldr d19, [x9, #0x0]\n"
+ "ldr d18, [x27, #0x0]\n"
+ "sub x10, x10, #0xc\n"
+ "ldr d17, [x26, #0x0]\n"
+ "ldr d16, [x25, #0x0]\n"
+ "cmp x10, #0xc\n"
+ "ldr q31, [%x[in_ptr], #0x0]\n"
+ "ldr q30, [%x[in_ptr], #0x30]\n"
+ "ldr q29, [%x[in_ptr], #0x60]\n"
+ "ldr q28, [%x[in_ptr], #0x90]\n"
+ "shll v19.4s, v19.4h, #0x10\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "ldr q27, [%x[in_ptr], #0x10]\n"
+ "ldr q26, [%x[in_ptr], #0x40]\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "ldr q25, [%x[in_ptr], #0x70]\n"
+ "ldr q24, [%x[in_ptr], #0xa0]\n"
+ "ldr q23, [%x[in_ptr], #0x20]\n"
+ "ldr q22, [%x[in_ptr], #0x50]\n"
+ "fadd v31.4s, v31.4s, v19.4s\n"
+ "fadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q21, [%x[in_ptr], #0x80]\n"
+ "ldr q20, [%x[in_ptr], #0xb0]\n"
+ "fadd v29.4s, v29.4s, v17.4s\n"
+ "fadd v28.4s, v28.4s, v16.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fmin v31.4s, v31.4s, v13.4s\n"
+ "fmin v30.4s, v30.4s, v13.4s\n"
+ "fmin v29.4s, v29.4s, v13.4s\n"
+ "fmin v28.4s, v28.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ ".inst 0x0ea16bf3 // bfcvtn v19.4h, v31.4s\n"
+ ".inst 0x0ea16bd0 // bfcvtn v16.4h, v30.4s\n"
+ ".inst 0x0ea16bb2 // bfcvtn v18.4h, v29.4s\n"
+ ".inst 0x0ea16b91 // bfcvtn v17.4h, v28.4s\n"
+ "str d19, [x9, #0x0]\n"
+ "str d16, [x27, #0x0]\n"
+ "ldr d16, [x9, #0x8]\n"
+ "str d18, [x26, #0x0]\n"
+ "str d17, [x25, #0x0]\n"
+ "ldr d18, [x27, #0x8]\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "ldr d17, [x26, #0x8]\n"
+ "ldr d16, [x25, #0x8]\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "fadd v27.4s, v27.4s, v19.4s\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v26.4s, v26.4s, v18.4s\n"
+ "fadd v25.4s, v25.4s, v17.4s\n"
+ "fadd v24.4s, v24.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v13.4s\n"
+ "fmin v26.4s, v26.4s, v13.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ ".inst 0x0ea16b71 // bfcvtn v17.4h, v27.4s\n"
+ ".inst 0x0ea16b53 // bfcvtn v19.4h, v26.4s\n"
+ ".inst 0x0ea16b30 // bfcvtn v16.4h, v25.4s\n"
+ "str d17, [x9, #0x8]\n"
+ ".inst 0x0ea16b12 // bfcvtn v18.4h, v24.4s\n"
+ "ldr d17, [x9, #0x10]\n"
+ "str d19, [x27, #0x8]\n"
+ "str d16, [x26, #0x8]\n"
+ "ldr d16, [x27, #0x10]\n"
+ "str d18, [x25, #0x8]\n"
+ "shll v19.4s, v17.4h, #0x10\n"
+ "ldr d17, [x26, #0x10]\n"
+ "shll v18.4s, v16.4h, #0x10\n"
+ "ldr d16, [x25, #0x10]\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "fadd v23.4s, v23.4s, v19.4s\n"
+ "fadd v22.4s, v22.4s, v18.4s\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v21.4s, v21.4s, v17.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fadd v20.4s, v20.4s, v16.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ ".inst 0x0ea16af1 // bfcvtn v17.4h, v23.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n"
+ "str d17, [x9, #0x10]\n"
+ "add x9, x9, #0x18\n"
+ ".inst 0x0ea16ab1 // bfcvtn v17.4h, v21.4s\n"
+ "str d16, [x27, #0x10]\n"
+ "add x27, x27, #0x18\n"
+ ".inst 0x0ea16a90 // bfcvtn v16.4h, v20.4s\n"
+ "str d17, [x26, #0x10]\n"
+ "add x26, x26, #0x18\n"
+ "str d16, [x25, #0x10]\n"
+ "add x25, x25, #0x18\n"
+ "bge 84b\n"
+ "85:" // Accumulate: Height 4: no full blocks
+ "cbz x10, 87f\n"
+ "mov x20, %x[in_ptr]\n"
+ "86:" // Accumulate: Height 4: Single loop
+ "ldr h19, [x9, #0x0]\n"
+ "ldr h18, [x27, #0x0]\n"
+ "subs x10, x10, #0x1\n"
+ "ldr h17, [x26, #0x0]\n"
+ "ldr h16, [x25, #0x0]\n"
+ "ldr s23, [%x[in_ptr], #0x0]\n"
+ "ldr s22, [%x[in_ptr], #0x30]\n"
+ "ldr s21, [%x[in_ptr], #0x60]\n"
+ "ldr s20, [%x[in_ptr], #0x90]\n"
+ "shll v19.4s, v19.4h, #0x10\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "fadd v23.4s, v23.4s, v19.4s\n"
+ "fadd v22.4s, v22.4s, v18.4s\n"
+ "fadd v21.4s, v21.4s, v17.4s\n"
+ "fadd v20.4s, v20.4s, v16.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ ".inst 0x0ea16af3 // bfcvtn v19.4h, v23.4s\n"
+ ".inst 0x0ea16ad2 // bfcvtn v18.4h, v22.4s\n"
+ ".inst 0x0ea16ab1 // bfcvtn v17.4h, v21.4s\n"
+ ".inst 0x0ea16a90 // bfcvtn v16.4h, v20.4s\n"
+ "str h19, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ "str h18, [x27, #0x0]\n"
+ "add x27, x27, #0x2\n"
+ "str h17, [x26, #0x0]\n"
+ "add x26, x26, #0x2\n"
+ "str h16, [x25, #0x0]\n"
+ "add x25, x25, #0x2\n"
+ "bne 86b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "87:" // Accumulate: Height 4: no oddments
+ "b 108f\n"
+ "88:" // Accumulate: Height 5
+ "mov x9, %x[out_ptr]\n"
+ "mov x10, %x[cols]\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "cmp x10, #0xc\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "blt 90f\n"
+ "89:" // Accumulate: Height 5: Block loop
+ "ldr d20, [x9, #0x0]\n"
+ "ldr d19, [x27, #0x0]\n"
+ "sub x10, x10, #0xc\n"
+ "ldr d18, [x26, #0x0]\n"
+ "ldr d17, [x25, #0x0]\n"
+ "cmp x10, #0xc\n"
+ "ldr d16, [x24, #0x0]\n"
+ "ldr q3, [%x[in_ptr], #0x0]\n"
+ "ldr q2, [%x[in_ptr], #0x30]\n"
+ "ldr q1, [%x[in_ptr], #0x60]\n"
+ "shll v20.4s, v20.4h, #0x10\n"
+ "shll v19.4s, v19.4h, #0x10\n"
+ "ldr q0, [%x[in_ptr], #0x90]\n"
+ "ldr q31, [%x[in_ptr], #0xc0]\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "ldr q30, [%x[in_ptr], #0x10]\n"
+ "ldr q29, [%x[in_ptr], #0x40]\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "ldr q28, [%x[in_ptr], #0x70]\n"
+ "ldr q27, [%x[in_ptr], #0xa0]\n"
+ "fadd v3.4s, v3.4s, v20.4s\n"
+ "fadd v2.4s, v2.4s, v19.4s\n"
+ "ldr q26, [%x[in_ptr], #0xd0]\n"
+ "ldr q25, [%x[in_ptr], #0x20]\n"
+ "fadd v1.4s, v1.4s, v18.4s\n"
+ "fadd v0.4s, v0.4s, v17.4s\n"
+ "ldr q24, [%x[in_ptr], #0x50]\n"
+ "ldr q23, [%x[in_ptr], #0x80]\n"
+ "fadd v31.4s, v31.4s, v16.4s\n"
+ "ldr q22, [%x[in_ptr], #0xb0]\n"
+ "ldr q21, [%x[in_ptr], #0xe0]\n"
+ "fmin v3.4s, v3.4s, v13.4s\n"
+ "fmin v2.4s, v2.4s, v13.4s\n"
+ "fmin v1.4s, v1.4s, v13.4s\n"
+ "fmin v0.4s, v0.4s, v13.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fmin v31.4s, v31.4s, v13.4s\n"
+ "fmax v3.4s, v3.4s, v12.4s\n"
+ "fmax v2.4s, v2.4s, v12.4s\n"
+ "fmax v1.4s, v1.4s, v12.4s\n"
+ "fmax v0.4s, v0.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ ".inst 0x0ea16874 // bfcvtn v20.4h, v3.4s\n"
+ ".inst 0x0ea16853 // bfcvtn v19.4h, v2.4s\n"
+ ".inst 0x0ea16831 // bfcvtn v17.4h, v1.4s\n"
+ ".inst 0x0ea16810 // bfcvtn v16.4h, v0.4s\n"
+ ".inst 0x0ea16bf2 // bfcvtn v18.4h, v31.4s\n"
+ "str d20, [x9, #0x0]\n"
+ "str d19, [x27, #0x0]\n"
+ "str d17, [x26, #0x0]\n"
+ "ldr d17, [x9, #0x8]\n"
+ "str d16, [x25, #0x0]\n"
+ "ldr d16, [x27, #0x8]\n"
+ "str d18, [x24, #0x0]\n"
+ "ldr d18, [x26, #0x8]\n"
+ "shll v20.4s, v17.4h, #0x10\n"
+ "ldr d17, [x25, #0x8]\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "ldr d16, [x24, #0x8]\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "fadd v30.4s, v30.4s, v20.4s\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v29.4s, v29.4s, v19.4s\n"
+ "fadd v28.4s, v28.4s, v18.4s\n"
+ "fadd v27.4s, v27.4s, v17.4s\n"
+ "fmin v30.4s, v30.4s, v13.4s\n"
+ "fadd v26.4s, v26.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v13.4s\n"
+ "fmin v28.4s, v28.4s, v13.4s\n"
+ "fmin v27.4s, v27.4s, v13.4s\n"
+ "fmin v26.4s, v26.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ ".inst 0x0ea16bd2 // bfcvtn v18.4h, v30.4s\n"
+ ".inst 0x0ea16bb3 // bfcvtn v19.4h, v29.4s\n"
+ ".inst 0x0ea16b91 // bfcvtn v17.4h, v28.4s\n"
+ ".inst 0x0ea16b70 // bfcvtn v16.4h, v27.4s\n"
+ "str d18, [x9, #0x8]\n"
+ ".inst 0x0ea16b52 // bfcvtn v18.4h, v26.4s\n"
+ "str d19, [x27, #0x8]\n"
+ "str d17, [x26, #0x8]\n"
+ "ldr d17, [x9, #0x10]\n"
+ "str d16, [x25, #0x8]\n"
+ "ldr d16, [x27, #0x10]\n"
+ "str d18, [x24, #0x8]\n"
+ "ldr d18, [x26, #0x10]\n"
+ "shll v20.4s, v17.4h, #0x10\n"
+ "ldr d17, [x25, #0x10]\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "ldr d16, [x24, #0x10]\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "fadd v25.4s, v25.4s, v20.4s\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v24.4s, v24.4s, v19.4s\n"
+ "fadd v23.4s, v23.4s, v18.4s\n"
+ "fadd v22.4s, v22.4s, v17.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fadd v21.4s, v21.4s, v16.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ ".inst 0x0ea16b30 // bfcvtn v16.4h, v25.4s\n"
+ ".inst 0x0ea16b13 // bfcvtn v19.4h, v24.4s\n"
+ ".inst 0x0ea16af2 // bfcvtn v18.4h, v23.4s\n"
+ ".inst 0x0ea16ad1 // bfcvtn v17.4h, v22.4s\n"
+ "str d16, [x9, #0x10]\n"
+ "add x9, x9, #0x18\n"
+ ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n"
+ "str d19, [x27, #0x10]\n"
+ "add x27, x27, #0x18\n"
+ "str d18, [x26, #0x10]\n"
+ "add x26, x26, #0x18\n"
+ "str d17, [x25, #0x10]\n"
+ "add x25, x25, #0x18\n"
+ "str d16, [x24, #0x10]\n"
+ "add x24, x24, #0x18\n"
+ "bge 89b\n"
+ "90:" // Accumulate: Height 5: no full blocks
+ "cbz x10, 92f\n"
+ "mov x20, %x[in_ptr]\n"
+ "91:" // Accumulate: Height 5: Single loop
+ "ldr h20, [x9, #0x0]\n"
+ "ldr h19, [x27, #0x0]\n"
+ "subs x10, x10, #0x1\n"
+ "ldr h18, [x26, #0x0]\n"
+ "ldr h17, [x25, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr s25, [%x[in_ptr], #0x0]\n"
+ "ldr s24, [%x[in_ptr], #0x30]\n"
+ "ldr s23, [%x[in_ptr], #0x60]\n"
+ "shll v20.4s, v20.4h, #0x10\n"
+ "shll v19.4s, v19.4h, #0x10\n"
+ "ldr s22, [%x[in_ptr], #0x90]\n"
+ "ldr s21, [%x[in_ptr], #0xc0]\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "fadd v25.4s, v25.4s, v20.4s\n"
+ "fadd v24.4s, v24.4s, v19.4s\n"
+ "fadd v23.4s, v23.4s, v18.4s\n"
+ "fadd v22.4s, v22.4s, v17.4s\n"
+ "fadd v21.4s, v21.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmin v21.4s, v21.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ ".inst 0x0ea16b34 // bfcvtn v20.4h, v25.4s\n"
+ ".inst 0x0ea16b13 // bfcvtn v19.4h, v24.4s\n"
+ ".inst 0x0ea16af2 // bfcvtn v18.4h, v23.4s\n"
+ ".inst 0x0ea16ad1 // bfcvtn v17.4h, v22.4s\n"
+ ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n"
+ "str h20, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ "str h19, [x27, #0x0]\n"
+ "add x27, x27, #0x2\n"
+ "str h18, [x26, #0x0]\n"
+ "add x26, x26, #0x2\n"
+ "str h17, [x25, #0x0]\n"
+ "add x25, x25, #0x2\n"
+ "str h16, [x24, #0x0]\n"
+ "add x24, x24, #0x2\n"
+ "bne 91b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "92:" // Accumulate: Height 5: no oddments
+ "b 108f\n"
+ "93:" // Accumulate: Height 6
+ "mov x9, %x[out_ptr]\n"
+ "mov x10, %x[cols]\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "cmp x10, #0xc\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "blt 95f\n"
+ "94:" // Accumulate: Height 6: Block loop
+ "ldr d21, [x9, #0x0]\n"
+ "ldr d20, [x27, #0x0]\n"
+ "sub x10, x10, #0xc\n"
+ "ldr d19, [x26, #0x0]\n"
+ "ldr d18, [x25, #0x0]\n"
+ "cmp x10, #0xc\n"
+ "ldr d17, [x24, #0x0]\n"
+ "ldr d16, [x23, #0x0]\n"
+ "ldr q6, [%x[in_ptr], #0x0]\n"
+ "ldr q5, [%x[in_ptr], #0x30]\n"
+ "shll v22.4s, v21.4h, #0x10\n"
+ "shll v21.4s, v20.4h, #0x10\n"
+ "ldr q4, [%x[in_ptr], #0x60]\n"
+ "ldr q3, [%x[in_ptr], #0x90]\n"
+ "shll v20.4s, v19.4h, #0x10\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "ldr q2, [%x[in_ptr], #0xc0]\n"
+ "ldr q19, [%x[in_ptr], #0xf0]\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "ldr q1, [%x[in_ptr], #0x10]\n"
+ "ldr q0, [%x[in_ptr], #0x40]\n"
+ "fadd v6.4s, v6.4s, v22.4s\n"
+ "fadd v5.4s, v5.4s, v21.4s\n"
+ "ldr q31, [%x[in_ptr], #0x70]\n"
+ "ldr q30, [%x[in_ptr], #0xa0]\n"
+ "fadd v4.4s, v4.4s, v20.4s\n"
+ "fadd v3.4s, v3.4s, v18.4s\n"
+ "ldr q29, [%x[in_ptr], #0xd0]\n"
+ "ldr q28, [%x[in_ptr], #0x100]\n"
+ "fadd v2.4s, v2.4s, v17.4s\n"
+ "fadd v19.4s, v19.4s, v16.4s\n"
+ "ldr q27, [%x[in_ptr], #0x20]\n"
+ "ldr q26, [%x[in_ptr], #0x50]\n"
+ "fmin v6.4s, v6.4s, v13.4s\n"
+ "fmin v5.4s, v5.4s, v13.4s\n"
+ "ldr q25, [%x[in_ptr], #0x80]\n"
+ "ldr q24, [%x[in_ptr], #0xb0]\n"
+ "fmin v4.4s, v4.4s, v13.4s\n"
+ "fmin v3.4s, v3.4s, v13.4s\n"
+ "ldr q23, [%x[in_ptr], #0xe0]\n"
+ "ldr q22, [%x[in_ptr], #0x110]\n"
+ "fmin v2.4s, v2.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmax v6.4s, v6.4s, v12.4s\n"
+ "fmax v5.4s, v5.4s, v12.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fmax v4.4s, v4.4s, v12.4s\n"
+ "fmax v3.4s, v3.4s, v12.4s\n"
+ "fmax v2.4s, v2.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ ".inst 0x0ea168d5 // bfcvtn v21.4h, v6.4s\n"
+ ".inst 0x0ea168b4 // bfcvtn v20.4h, v5.4s\n"
+ ".inst 0x0ea16892 // bfcvtn v18.4h, v4.4s\n"
+ ".inst 0x0ea16871 // bfcvtn v17.4h, v3.4s\n"
+ ".inst 0x0ea16850 // bfcvtn v16.4h, v2.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ "str d21, [x9, #0x0]\n"
+ "str d20, [x27, #0x0]\n"
+ "str d18, [x26, #0x0]\n"
+ "ldr d18, [x9, #0x8]\n"
+ "str d17, [x25, #0x0]\n"
+ "ldr d17, [x27, #0x8]\n"
+ "str d16, [x24, #0x0]\n"
+ "ldr d16, [x26, #0x8]\n"
+ "str d19, [x23, #0x0]\n"
+ "shll v21.4s, v18.4h, #0x10\n"
+ "ldr d18, [x25, #0x8]\n"
+ "shll v20.4s, v17.4h, #0x10\n"
+ "ldr d17, [x24, #0x8]\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "ldr d16, [x23, #0x8]\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "fadd v1.4s, v1.4s, v21.4s\n"
+ "fadd v0.4s, v0.4s, v20.4s\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v31.4s, v31.4s, v19.4s\n"
+ "fadd v30.4s, v30.4s, v18.4s\n"
+ "fmin v1.4s, v1.4s, v13.4s\n"
+ "fmin v0.4s, v0.4s, v13.4s\n"
+ "fadd v29.4s, v29.4s, v17.4s\n"
+ "fadd v28.4s, v28.4s, v16.4s\n"
+ "fmin v31.4s, v31.4s, v13.4s\n"
+ "fmin v30.4s, v30.4s, v13.4s\n"
+ "fmax v1.4s, v1.4s, v12.4s\n"
+ "fmax v0.4s, v0.4s, v12.4s\n"
+ "fmin v29.4s, v29.4s, v13.4s\n"
+ "fmin v28.4s, v28.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ ".inst 0x0ea16832 // bfcvtn v18.4h, v1.4s\n"
+ ".inst 0x0ea16810 // bfcvtn v16.4h, v0.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ ".inst 0x0ea16bf4 // bfcvtn v20.4h, v31.4s\n"
+ ".inst 0x0ea16bd1 // bfcvtn v17.4h, v30.4s\n"
+ "str d18, [x9, #0x8]\n"
+ "str d16, [x27, #0x8]\n"
+ ".inst 0x0ea16bb3 // bfcvtn v19.4h, v29.4s\n"
+ ".inst 0x0ea16b92 // bfcvtn v18.4h, v28.4s\n"
+ "ldr d16, [x9, #0x10]\n"
+ "str d20, [x26, #0x8]\n"
+ "str d17, [x25, #0x8]\n"
+ "ldr d17, [x27, #0x10]\n"
+ "str d19, [x24, #0x8]\n"
+ "shll v21.4s, v16.4h, #0x10\n"
+ "ldr d16, [x26, #0x10]\n"
+ "str d18, [x23, #0x8]\n"
+ "ldr d18, [x25, #0x10]\n"
+ "shll v20.4s, v17.4h, #0x10\n"
+ "ldr d17, [x24, #0x10]\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "fadd v27.4s, v27.4s, v21.4s\n"
+ "ldr d16, [x23, #0x10]\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "fadd v26.4s, v26.4s, v20.4s\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v25.4s, v25.4s, v19.4s\n"
+ "fmin v27.4s, v27.4s, v13.4s\n"
+ "fadd v24.4s, v24.4s, v18.4s\n"
+ "fadd v23.4s, v23.4s, v17.4s\n"
+ "fadd v22.4s, v22.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v13.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ ".inst 0x0ea16b71 // bfcvtn v17.4h, v27.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ ".inst 0x0ea16b50 // bfcvtn v16.4h, v26.4s\n"
+ "str d17, [x9, #0x10]\n"
+ "add x9, x9, #0x18\n"
+ ".inst 0x0ea16b33 // bfcvtn v19.4h, v25.4s\n"
+ ".inst 0x0ea16b12 // bfcvtn v18.4h, v24.4s\n"
+ ".inst 0x0ea16af1 // bfcvtn v17.4h, v23.4s\n"
+ "str d16, [x27, #0x10]\n"
+ "add x27, x27, #0x18\n"
+ ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n"
+ "str d19, [x26, #0x10]\n"
+ "add x26, x26, #0x18\n"
+ "str d18, [x25, #0x10]\n"
+ "add x25, x25, #0x18\n"
+ "str d17, [x24, #0x10]\n"
+ "add x24, x24, #0x18\n"
+ "str d16, [x23, #0x10]\n"
+ "add x23, x23, #0x18\n"
+ "bge 94b\n"
+ "95:" // Accumulate: Height 6: no full blocks
+ "cbz x10, 97f\n"
+ "mov x20, %x[in_ptr]\n"
+ "96:" // Accumulate: Height 6: Single loop
+ "ldr h21, [x9, #0x0]\n"
+ "ldr h20, [x27, #0x0]\n"
+ "subs x10, x10, #0x1\n"
+ "ldr h19, [x26, #0x0]\n"
+ "ldr h18, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h16, [x23, #0x0]\n"
+ "ldr s27, [%x[in_ptr], #0x0]\n"
+ "ldr s26, [%x[in_ptr], #0x30]\n"
+ "shll v21.4s, v21.4h, #0x10\n"
+ "shll v20.4s, v20.4h, #0x10\n"
+ "ldr s25, [%x[in_ptr], #0x60]\n"
+ "ldr s24, [%x[in_ptr], #0x90]\n"
+ "shll v19.4s, v19.4h, #0x10\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "ldr s23, [%x[in_ptr], #0xc0]\n"
+ "ldr s22, [%x[in_ptr], #0xf0]\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v27.4s, v27.4s, v21.4s\n"
+ "fadd v26.4s, v26.4s, v20.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "fadd v25.4s, v25.4s, v19.4s\n"
+ "fadd v24.4s, v24.4s, v18.4s\n"
+ "fadd v23.4s, v23.4s, v17.4s\n"
+ "fadd v22.4s, v22.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v13.4s\n"
+ "fmin v26.4s, v26.4s, v13.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ ".inst 0x0ea16b75 // bfcvtn v21.4h, v27.4s\n"
+ ".inst 0x0ea16b54 // bfcvtn v20.4h, v26.4s\n"
+ ".inst 0x0ea16b33 // bfcvtn v19.4h, v25.4s\n"
+ ".inst 0x0ea16b12 // bfcvtn v18.4h, v24.4s\n"
+ ".inst 0x0ea16af1 // bfcvtn v17.4h, v23.4s\n"
+ ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n"
+ "str h21, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ "str h20, [x27, #0x0]\n"
+ "add x27, x27, #0x2\n"
+ "str h19, [x26, #0x0]\n"
+ "add x26, x26, #0x2\n"
+ "str h18, [x25, #0x0]\n"
+ "add x25, x25, #0x2\n"
+ "str h17, [x24, #0x0]\n"
+ "add x24, x24, #0x2\n"
+ "str h16, [x23, #0x0]\n"
+ "add x23, x23, #0x2\n"
+ "bne 96b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "97:" // Accumulate: Height 6: no oddments
+ "b 108f\n"
+ "98:" // Accumulate: Height 7
+ "mov x9, %x[out_ptr]\n"
+ "mov x10, %x[cols]\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "cmp x10, #0xc\n"
+ "add x22, x23, %x[ldout], LSL #1\n"
+ "blt 100f\n"
+ "99:" // Accumulate: Height 7: Block loop
+ "ldr d22, [x9, #0x0]\n"
+ "ldr d21, [x27, #0x0]\n"
+ "sub x10, x10, #0xc\n"
+ "ldr d20, [x26, #0x0]\n"
+ "ldr d19, [x25, #0x0]\n"
+ "cmp x10, #0xc\n"
+ "ldr d18, [x24, #0x0]\n"
+ "ldr d17, [x23, #0x0]\n"
+ "ldr d16, [x22, #0x0]\n"
+ "ldr q9, [%x[in_ptr], #0x0]\n"
+ "shll v24.4s, v22.4h, #0x10\n"
+ "shll v23.4s, v21.4h, #0x10\n"
+ "ldr q8, [%x[in_ptr], #0x30]\n"
+ "ldr q7, [%x[in_ptr], #0x60]\n"
+ "shll v21.4s, v20.4h, #0x10\n"
+ "shll v19.4s, v19.4h, #0x10\n"
+ "ldr q6, [%x[in_ptr], #0x90]\n"
+ "ldr q5, [%x[in_ptr], #0xc0]\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "ldr q20, [%x[in_ptr], #0xf0]\n"
+ "ldr q22, [%x[in_ptr], #0x120]\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v9.4s, v9.4s, v24.4s\n"
+ "ldr q4, [%x[in_ptr], #0x10]\n"
+ "ldr q3, [%x[in_ptr], #0x40]\n"
+ "fadd v8.4s, v8.4s, v23.4s\n"
+ "fadd v7.4s, v7.4s, v21.4s\n"
+ "ldr q2, [%x[in_ptr], #0x70]\n"
+ "ldr q1, [%x[in_ptr], #0xa0]\n"
+ "fadd v6.4s, v6.4s, v19.4s\n"
+ "fadd v5.4s, v5.4s, v18.4s\n"
+ "ldr q0, [%x[in_ptr], #0xd0]\n"
+ "ldr q31, [%x[in_ptr], #0x100]\n"
+ "fadd v20.4s, v20.4s, v17.4s\n"
+ "fadd v22.4s, v22.4s, v16.4s\n"
+ "ldr q30, [%x[in_ptr], #0x130]\n"
+ "ldr q29, [%x[in_ptr], #0x20]\n"
+ "fmin v9.4s, v9.4s, v13.4s\n"
+ "fmin v8.4s, v8.4s, v13.4s\n"
+ "ldr q28, [%x[in_ptr], #0x50]\n"
+ "ldr q27, [%x[in_ptr], #0x80]\n"
+ "fmin v7.4s, v7.4s, v13.4s\n"
+ "fmin v6.4s, v6.4s, v13.4s\n"
+ "ldr q26, [%x[in_ptr], #0xb0]\n"
+ "ldr q25, [%x[in_ptr], #0xe0]\n"
+ "fmin v5.4s, v5.4s, v13.4s\n"
+ "fmin v20.4s, v20.4s, v13.4s\n"
+ "ldr q24, [%x[in_ptr], #0x110]\n"
+ "ldr q23, [%x[in_ptr], #0x140]\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmax v9.4s, v9.4s, v12.4s\n"
+ "fmax v8.4s, v8.4s, v12.4s\n"
+ "fmax v7.4s, v7.4s, v12.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fmax v6.4s, v6.4s, v12.4s\n"
+ "fmax v5.4s, v5.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ ".inst 0x0ea16935 // bfcvtn v21.4h, v9.4s\n"
+ ".inst 0x0ea16913 // bfcvtn v19.4h, v8.4s\n"
+ ".inst 0x0ea168f0 // bfcvtn v16.4h, v7.4s\n"
+ ".inst 0x0ea168d2 // bfcvtn v18.4h, v6.4s\n"
+ ".inst 0x0ea168b1 // bfcvtn v17.4h, v5.4s\n"
+ ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ "str d21, [x9, #0x0]\n"
+ "str d19, [x27, #0x0]\n"
+ ".inst 0x0ea16ad3 // bfcvtn v19.4h, v22.4s\n"
+ "str d16, [x26, #0x0]\n"
+ "ldr d16, [x9, #0x8]\n"
+ "str d18, [x25, #0x0]\n"
+ "ldr d18, [x27, #0x8]\n"
+ "str d17, [x24, #0x0]\n"
+ "ldr d17, [x26, #0x8]\n"
+ "str d20, [x23, #0x0]\n"
+ "shll v22.4s, v16.4h, #0x10\n"
+ "ldr d16, [x25, #0x8]\n"
+ "str d19, [x22, #0x0]\n"
+ "shll v21.4s, v18.4h, #0x10\n"
+ "ldr d18, [x24, #0x8]\n"
+ "shll v20.4s, v17.4h, #0x10\n"
+ "ldr d17, [x23, #0x8]\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "fadd v4.4s, v4.4s, v22.4s\n"
+ "ldr d16, [x22, #0x8]\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "fadd v3.4s, v3.4s, v21.4s\n"
+ "fadd v2.4s, v2.4s, v20.4s\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v1.4s, v1.4s, v19.4s\n"
+ "fadd v0.4s, v0.4s, v18.4s\n"
+ "fmin v4.4s, v4.4s, v13.4s\n"
+ "fadd v31.4s, v31.4s, v17.4s\n"
+ "fmin v3.4s, v3.4s, v13.4s\n"
+ "fadd v30.4s, v30.4s, v16.4s\n"
+ "fmin v2.4s, v2.4s, v13.4s\n"
+ "fmin v1.4s, v1.4s, v13.4s\n"
+ "fmin v0.4s, v0.4s, v13.4s\n"
+ "fmin v31.4s, v31.4s, v13.4s\n"
+ "fmax v4.4s, v4.4s, v12.4s\n"
+ "fmin v30.4s, v30.4s, v13.4s\n"
+ "fmax v3.4s, v3.4s, v12.4s\n"
+ "fmax v2.4s, v2.4s, v12.4s\n"
+ "fmax v1.4s, v1.4s, v12.4s\n"
+ "fmax v0.4s, v0.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ ".inst 0x0ea16893 // bfcvtn v19.4h, v4.4s\n"
+ ".inst 0x0ea16875 // bfcvtn v21.4h, v3.4s\n"
+ ".inst 0x0ea16850 // bfcvtn v16.4h, v2.4s\n"
+ ".inst 0x0ea16832 // bfcvtn v18.4h, v1.4s\n"
+ ".inst 0x0ea16811 // bfcvtn v17.4h, v0.4s\n"
+ "str d19, [x9, #0x8]\n"
+ ".inst 0x0ea16bf4 // bfcvtn v20.4h, v31.4s\n"
+ ".inst 0x0ea16bd3 // bfcvtn v19.4h, v30.4s\n"
+ "str d21, [x27, #0x8]\n"
+ "str d16, [x26, #0x8]\n"
+ "ldr d16, [x9, #0x10]\n"
+ "str d18, [x25, #0x8]\n"
+ "ldr d18, [x27, #0x10]\n"
+ "str d17, [x24, #0x8]\n"
+ "ldr d17, [x26, #0x10]\n"
+ "str d20, [x23, #0x8]\n"
+ "shll v22.4s, v16.4h, #0x10\n"
+ "ldr d16, [x25, #0x10]\n"
+ "str d19, [x22, #0x8]\n"
+ "shll v21.4s, v18.4h, #0x10\n"
+ "ldr d18, [x24, #0x10]\n"
+ "shll v20.4s, v17.4h, #0x10\n"
+ "ldr d17, [x23, #0x10]\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "fadd v29.4s, v29.4s, v22.4s\n"
+ "ldr d16, [x22, #0x10]\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "fadd v28.4s, v28.4s, v21.4s\n"
+ "fadd v27.4s, v27.4s, v20.4s\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v26.4s, v26.4s, v19.4s\n"
+ "fadd v25.4s, v25.4s, v18.4s\n"
+ "fmin v29.4s, v29.4s, v13.4s\n"
+ "fadd v24.4s, v24.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v13.4s\n"
+ "fadd v23.4s, v23.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v13.4s\n"
+ "fmin v26.4s, v26.4s, v13.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ ".inst 0x0ea16bb0 // bfcvtn v16.4h, v29.4s\n"
+ ".inst 0x0ea16b95 // bfcvtn v21.4h, v28.4s\n"
+ ".inst 0x0ea16b74 // bfcvtn v20.4h, v27.4s\n"
+ ".inst 0x0ea16b53 // bfcvtn v19.4h, v26.4s\n"
+ ".inst 0x0ea16b32 // bfcvtn v18.4h, v25.4s\n"
+ "str d16, [x9, #0x10]\n"
+ "add x9, x9, #0x18\n"
+ ".inst 0x0ea16b11 // bfcvtn v17.4h, v24.4s\n"
+ ".inst 0x0ea16af0 // bfcvtn v16.4h, v23.4s\n"
+ "str d21, [x27, #0x10]\n"
+ "add x27, x27, #0x18\n"
+ "str d20, [x26, #0x10]\n"
+ "add x26, x26, #0x18\n"
+ "str d19, [x25, #0x10]\n"
+ "add x25, x25, #0x18\n"
+ "str d18, [x24, #0x10]\n"
+ "add x24, x24, #0x18\n"
+ "str d17, [x23, #0x10]\n"
+ "add x23, x23, #0x18\n"
+ "str d16, [x22, #0x10]\n"
+ "add x22, x22, #0x18\n"
+ "bge 99b\n"
+ "100:" // Accumulate: Height 7: no full blocks
+ "cbz x10, 102f\n"
+ "mov x20, %x[in_ptr]\n"
+ "101:" // Accumulate: Height 7: Single loop
+ "ldr h22, [x9, #0x0]\n"
+ "ldr h21, [x27, #0x0]\n"
+ "subs x10, x10, #0x1\n"
+ "ldr h20, [x26, #0x0]\n"
+ "ldr h19, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h17, [x23, #0x0]\n"
+ "ldr h16, [x22, #0x0]\n"
+ "ldr s29, [%x[in_ptr], #0x0]\n"
+ "shll v28.4s, v22.4h, #0x10\n"
+ "shll v27.4s, v21.4h, #0x10\n"
+ "ldr s26, [%x[in_ptr], #0x30]\n"
+ "ldr s25, [%x[in_ptr], #0x60]\n"
+ "shll v21.4s, v20.4h, #0x10\n"
+ "shll v20.4s, v19.4h, #0x10\n"
+ "ldr s24, [%x[in_ptr], #0x90]\n"
+ "ldr s23, [%x[in_ptr], #0xc0]\n"
+ "shll v19.4s, v18.4h, #0x10\n"
+ "shll v18.4s, v17.4h, #0x10\n"
+ "ldr s17, [%x[in_ptr], #0xf0]\n"
+ "ldr s22, [%x[in_ptr], #0x120]\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v29.4s, v29.4s, v28.4s\n"
+ "fadd v26.4s, v26.4s, v27.4s\n"
+ "fadd v25.4s, v25.4s, v21.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "fadd v24.4s, v24.4s, v20.4s\n"
+ "fadd v23.4s, v23.4s, v19.4s\n"
+ "fadd v17.4s, v17.4s, v18.4s\n"
+ "fadd v22.4s, v22.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v13.4s\n"
+ "fmin v26.4s, v26.4s, v13.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v17.4s, v17.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ ".inst 0x0ea16bb5 // bfcvtn v21.4h, v29.4s\n"
+ ".inst 0x0ea16b50 // bfcvtn v16.4h, v26.4s\n"
+ ".inst 0x0ea16b34 // bfcvtn v20.4h, v25.4s\n"
+ ".inst 0x0ea16b13 // bfcvtn v19.4h, v24.4s\n"
+ ".inst 0x0ea16af2 // bfcvtn v18.4h, v23.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "str h21, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ "str h16, [x27, #0x0]\n"
+ ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n"
+ "add x27, x27, #0x2\n"
+ "str h20, [x26, #0x0]\n"
+ "add x26, x26, #0x2\n"
+ "str h19, [x25, #0x0]\n"
+ "add x25, x25, #0x2\n"
+ "str h18, [x24, #0x0]\n"
+ "add x24, x24, #0x2\n"
+ "str h17, [x23, #0x0]\n"
+ "add x23, x23, #0x2\n"
+ "str h16, [x22, #0x0]\n"
+ "add x22, x22, #0x2\n"
+ "bne 101b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "102:" // Accumulate: Height 7: no oddments
+ "b 108f\n"
+ "103:" // Accumulate: Height 8
+ "mov x9, %x[out_ptr]\n"
+ "mov x10, %x[cols]\n"
+ "add x27, x9, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "cmp x10, #0xc\n"
+ "add x22, x23, %x[ldout], LSL #1\n"
+ "add x21, x22, %x[ldout], LSL #1\n"
+ "blt 105f\n"
+ "104:" // Accumulate: Height 8: Block loop
+ "ldr d23, [x9, #0x0]\n"
+ "ldr d22, [x27, #0x0]\n"
+ "sub x10, x10, #0xc\n"
+ "ldr d21, [x26, #0x0]\n"
+ "ldr d20, [x25, #0x0]\n"
+ "cmp x10, #0xc\n"
+ "ldr d19, [x24, #0x0]\n"
+ "ldr d18, [x23, #0x0]\n"
+ "ldr d17, [x22, #0x0]\n"
+ "ldr d16, [x21, #0x0]\n"
+ "shll v26.4s, v23.4h, #0x10\n"
+ "shll v25.4s, v22.4h, #0x10\n"
+ "ldr q11, [%x[in_ptr], #0x0]\n"
+ "ldr q10, [%x[in_ptr], #0x30]\n"
+ "shll v24.4s, v21.4h, #0x10\n"
+ "shll v23.4s, v20.4h, #0x10\n"
+ "ldr q9, [%x[in_ptr], #0x60]\n"
+ "ldr q8, [%x[in_ptr], #0x90]\n"
+ "shll v21.4s, v19.4h, #0x10\n"
+ "shll v20.4s, v18.4h, #0x10\n"
+ "ldr q18, [%x[in_ptr], #0xc0]\n"
+ "ldr q19, [%x[in_ptr], #0xf0]\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "ldr q7, [%x[in_ptr], #0x120]\n"
+ "ldr q22, [%x[in_ptr], #0x150]\n"
+ "fadd v11.4s, v11.4s, v26.4s\n"
+ "fadd v10.4s, v10.4s, v25.4s\n"
+ "ldr q6, [%x[in_ptr], #0x10]\n"
+ "ldr q5, [%x[in_ptr], #0x40]\n"
+ "fadd v9.4s, v9.4s, v24.4s\n"
+ "fadd v8.4s, v8.4s, v23.4s\n"
+ "ldr q4, [%x[in_ptr], #0x70]\n"
+ "ldr q3, [%x[in_ptr], #0xa0]\n"
+ "fadd v18.4s, v18.4s, v21.4s\n"
+ "fadd v19.4s, v19.4s, v20.4s\n"
+ "ldr q2, [%x[in_ptr], #0xd0]\n"
+ "ldr q1, [%x[in_ptr], #0x100]\n"
+ "fadd v7.4s, v7.4s, v17.4s\n"
+ "fadd v22.4s, v22.4s, v16.4s\n"
+ "ldr q0, [%x[in_ptr], #0x130]\n"
+ "ldr q31, [%x[in_ptr], #0x160]\n"
+ "fmin v11.4s, v11.4s, v13.4s\n"
+ "fmin v10.4s, v10.4s, v13.4s\n"
+ "ldr q30, [%x[in_ptr], #0x20]\n"
+ "ldr q29, [%x[in_ptr], #0x50]\n"
+ "fmin v9.4s, v9.4s, v13.4s\n"
+ "fmin v8.4s, v8.4s, v13.4s\n"
+ "ldr q28, [%x[in_ptr], #0x80]\n"
+ "ldr q27, [%x[in_ptr], #0xb0]\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "ldr q26, [%x[in_ptr], #0xe0]\n"
+ "ldr q25, [%x[in_ptr], #0x110]\n"
+ "fmin v7.4s, v7.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "ldr q24, [%x[in_ptr], #0x140]\n"
+ "ldr q23, [%x[in_ptr], #0x170]\n"
+ "fmax v11.4s, v11.4s, v12.4s\n"
+ "fmax v10.4s, v10.4s, v12.4s\n"
+ "fmax v9.4s, v9.4s, v12.4s\n"
+ "fmax v8.4s, v8.4s, v12.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x180\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v7.4s, v7.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ ".inst 0x0ea16975 // bfcvtn v21.4h, v11.4s\n"
+ ".inst 0x0ea16954 // bfcvtn v20.4h, v10.4s\n"
+ ".inst 0x0ea16931 // bfcvtn v17.4h, v9.4s\n"
+ ".inst 0x0ea16910 // bfcvtn v16.4h, v8.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ "str d21, [x9, #0x0]\n"
+ "str d20, [x27, #0x0]\n"
+ ".inst 0x0ea168f5 // bfcvtn v21.4h, v7.4s\n"
+ ".inst 0x0ea16ad4 // bfcvtn v20.4h, v22.4s\n"
+ "str d17, [x26, #0x0]\n"
+ "ldr d17, [x9, #0x8]\n"
+ "str d16, [x25, #0x0]\n"
+ "ldr d16, [x27, #0x8]\n"
+ "str d18, [x24, #0x0]\n"
+ "ldr d18, [x26, #0x8]\n"
+ "str d19, [x23, #0x0]\n"
+ "shll v19.4s, v17.4h, #0x10\n"
+ "ldr d17, [x25, #0x8]\n"
+ "str d21, [x22, #0x0]\n"
+ "shll v22.4s, v16.4h, #0x10\n"
+ "ldr d16, [x24, #0x8]\n"
+ "str d20, [x21, #0x0]\n"
+ "shll v21.4s, v18.4h, #0x10\n"
+ "ldr d18, [x23, #0x8]\n"
+ "shll v20.4s, v17.4h, #0x10\n"
+ "fadd v6.4s, v6.4s, v19.4s\n"
+ "ldr d17, [x22, #0x8]\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "fadd v5.4s, v5.4s, v22.4s\n"
+ "ldr d16, [x21, #0x8]\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "fadd v4.4s, v4.4s, v21.4s\n"
+ "fadd v3.4s, v3.4s, v20.4s\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "fadd v2.4s, v2.4s, v19.4s\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v1.4s, v1.4s, v18.4s\n"
+ "fmin v6.4s, v6.4s, v13.4s\n"
+ "fmin v5.4s, v5.4s, v13.4s\n"
+ "fadd v0.4s, v0.4s, v17.4s\n"
+ "fmin v4.4s, v4.4s, v13.4s\n"
+ "fadd v31.4s, v31.4s, v16.4s\n"
+ "fmin v3.4s, v3.4s, v13.4s\n"
+ "fmin v2.4s, v2.4s, v13.4s\n"
+ "fmin v1.4s, v1.4s, v13.4s\n"
+ "fmin v0.4s, v0.4s, v13.4s\n"
+ "fmax v6.4s, v6.4s, v12.4s\n"
+ "fmin v31.4s, v31.4s, v13.4s\n"
+ "fmax v5.4s, v5.4s, v12.4s\n"
+ "fmax v4.4s, v4.4s, v12.4s\n"
+ "fmax v3.4s, v3.4s, v12.4s\n"
+ "fmax v2.4s, v2.4s, v12.4s\n"
+ "fmax v1.4s, v1.4s, v12.4s\n"
+ "fmax v0.4s, v0.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ ".inst 0x0ea168d5 // bfcvtn v21.4h, v6.4s\n"
+ ".inst 0x0ea168b4 // bfcvtn v20.4h, v5.4s\n"
+ ".inst 0x0ea16891 // bfcvtn v17.4h, v4.4s\n"
+ ".inst 0x0ea16870 // bfcvtn v16.4h, v3.4s\n"
+ ".inst 0x0ea16852 // bfcvtn v18.4h, v2.4s\n"
+ ".inst 0x0ea16833 // bfcvtn v19.4h, v1.4s\n"
+ "str d21, [x9, #0x8]\n"
+ "str d20, [x27, #0x8]\n"
+ ".inst 0x0ea16815 // bfcvtn v21.4h, v0.4s\n"
+ ".inst 0x0ea16bf4 // bfcvtn v20.4h, v31.4s\n"
+ "str d17, [x26, #0x8]\n"
+ "ldr d17, [x9, #0x10]\n"
+ "str d16, [x25, #0x8]\n"
+ "ldr d16, [x27, #0x10]\n"
+ "str d18, [x24, #0x8]\n"
+ "ldr d18, [x26, #0x10]\n"
+ "str d19, [x23, #0x8]\n"
+ "shll v19.4s, v17.4h, #0x10\n"
+ "ldr d17, [x25, #0x10]\n"
+ "str d21, [x22, #0x8]\n"
+ "shll v22.4s, v16.4h, #0x10\n"
+ "ldr d16, [x24, #0x10]\n"
+ "str d20, [x21, #0x8]\n"
+ "shll v21.4s, v18.4h, #0x10\n"
+ "ldr d18, [x23, #0x10]\n"
+ "shll v20.4s, v17.4h, #0x10\n"
+ "fadd v30.4s, v30.4s, v19.4s\n"
+ "ldr d17, [x22, #0x10]\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "fadd v29.4s, v29.4s, v22.4s\n"
+ "ldr d16, [x21, #0x10]\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "fadd v28.4s, v28.4s, v21.4s\n"
+ "fadd v27.4s, v27.4s, v20.4s\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "fadd v26.4s, v26.4s, v19.4s\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "fadd v25.4s, v25.4s, v18.4s\n"
+ "fmin v30.4s, v30.4s, v13.4s\n"
+ "fmin v29.4s, v29.4s, v13.4s\n"
+ "fadd v24.4s, v24.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v13.4s\n"
+ "fadd v23.4s, v23.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v13.4s\n"
+ "fmin v26.4s, v26.4s, v13.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ ".inst 0x0ea16bd1 // bfcvtn v17.4h, v30.4s\n"
+ ".inst 0x0ea16bb0 // bfcvtn v16.4h, v29.4s\n"
+ ".inst 0x0ea16b95 // bfcvtn v21.4h, v28.4s\n"
+ ".inst 0x0ea16b74 // bfcvtn v20.4h, v27.4s\n"
+ ".inst 0x0ea16b53 // bfcvtn v19.4h, v26.4s\n"
+ ".inst 0x0ea16b32 // bfcvtn v18.4h, v25.4s\n"
+ "str d17, [x9, #0x10]\n"
+ "add x9, x9, #0x18\n"
+ "str d16, [x27, #0x10]\n"
+ ".inst 0x0ea16b11 // bfcvtn v17.4h, v24.4s\n"
+ ".inst 0x0ea16af0 // bfcvtn v16.4h, v23.4s\n"
+ "add x27, x27, #0x18\n"
+ "str d21, [x26, #0x10]\n"
+ "add x26, x26, #0x18\n"
+ "str d20, [x25, #0x10]\n"
+ "add x25, x25, #0x18\n"
+ "str d19, [x24, #0x10]\n"
+ "add x24, x24, #0x18\n"
+ "str d18, [x23, #0x10]\n"
+ "add x23, x23, #0x18\n"
+ "str d17, [x22, #0x10]\n"
+ "add x22, x22, #0x18\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, #0x18\n"
+ "bge 104b\n"
+ "105:" // Accumulate: Height 8: no full blocks
+ "cbz x10, 107f\n"
+ "mov x20, %x[in_ptr]\n"
+ "106:" // Accumulate: Height 8: Single loop
+ "ldr h23, [x9, #0x0]\n"
+ "ldr h22, [x27, #0x0]\n"
+ "subs x10, x10, #0x1\n"
+ "ldr h21, [x26, #0x0]\n"
+ "ldr h20, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h18, [x23, #0x0]\n"
+ "ldr h17, [x22, #0x0]\n"
+ "ldr h16, [x21, #0x0]\n"
+ "shll v31.4s, v23.4h, #0x10\n"
+ "shll v30.4s, v22.4h, #0x10\n"
+ "ldr s29, [%x[in_ptr], #0x0]\n"
+ "ldr s28, [%x[in_ptr], #0x30]\n"
+ "shll v27.4s, v21.4h, #0x10\n"
+ "shll v26.4s, v20.4h, #0x10\n"
+ "ldr s25, [%x[in_ptr], #0x60]\n"
+ "ldr s24, [%x[in_ptr], #0x90]\n"
+ "shll v21.4s, v19.4h, #0x10\n"
+ "shll v20.4s, v18.4h, #0x10\n"
+ "ldr s19, [%x[in_ptr], #0xc0]\n"
+ "ldr s18, [%x[in_ptr], #0xf0]\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "ldr s23, [%x[in_ptr], #0x120]\n"
+ "ldr s22, [%x[in_ptr], #0x150]\n"
+ "fadd v29.4s, v29.4s, v31.4s\n"
+ "fadd v28.4s, v28.4s, v30.4s\n"
+ "fadd v25.4s, v25.4s, v27.4s\n"
+ "fadd v24.4s, v24.4s, v26.4s\n"
+ "add %x[in_ptr], %x[in_ptr], #0x4\n"
+ "fadd v19.4s, v19.4s, v21.4s\n"
+ "fadd v18.4s, v18.4s, v20.4s\n"
+ "fadd v23.4s, v23.4s, v17.4s\n"
+ "fadd v22.4s, v22.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v13.4s\n"
+ "fmin v28.4s, v28.4s, v13.4s\n"
+ "fmin v25.4s, v25.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v13.4s\n"
+ "fmin v19.4s, v19.4s, v13.4s\n"
+ "fmin v18.4s, v18.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v13.4s\n"
+ "fmin v22.4s, v22.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ ".inst 0x0ea16bb1 // bfcvtn v17.4h, v29.4s\n"
+ ".inst 0x0ea16b90 // bfcvtn v16.4h, v28.4s\n"
+ ".inst 0x0ea16b35 // bfcvtn v21.4h, v25.4s\n"
+ ".inst 0x0ea16b14 // bfcvtn v20.4h, v24.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ "str h17, [x9, #0x0]\n"
+ "add x9, x9, #0x2\n"
+ "str h16, [x27, #0x0]\n"
+ ".inst 0x0ea16af1 // bfcvtn v17.4h, v23.4s\n"
+ ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n"
+ "add x27, x27, #0x2\n"
+ "str h21, [x26, #0x0]\n"
+ "add x26, x26, #0x2\n"
+ "str h20, [x25, #0x0]\n"
+ "add x25, x25, #0x2\n"
+ "str h19, [x24, #0x0]\n"
+ "add x24, x24, #0x2\n"
+ "str h18, [x23, #0x0]\n"
+ "add x23, x23, #0x2\n"
+ "str h17, [x22, #0x0]\n"
+ "add x22, x22, #0x2\n"
+ "str h16, [x21, #0x0]\n"
+ "add x21, x21, #0x2\n"
+ "bne 106b\n"
+ "add %x[in_ptr], x20, #0x180\n"
+ "107:" // Accumulate: Height 8: no oddments
+ "subs %x[rows], %x[rows], #0x8\n"
+ "add %x[out_ptr], %x[out_ptr], x11\n"
+ "bgt 67b\n"
+ "108:" // Exit
+ : [in_ptr] "+&r" (in_ptr), [out_ptr] "+&r" (out_ptr), [rows] "+&r" (rows)
+ : [accumulate] "r" (accumulate), [bias] "r" (bias), [cols] "r" (cols), [ldout] "r" (ldout), [maxval] "r" (maxval), [minval] "r" (minval)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+
+#endif // __aarch64__
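
For readers tracing the "Accumulate" blocks above: each "shll v.4s, v.4h, #0x10" widens a previously stored bf16 value back to fp32 by shifting it into the high half of a 32-bit lane, the fadd/fmin/fmax trio adds the fp32 partial results and clamps them to the activation range splatted into v13 (maxval) and v12 (minval), and "bfcvtn" narrows back to bf16 for the store. A minimal scalar sketch of one element, assuming round-to-nearest-even narrowing and with all names illustrative rather than taken from the kernel:

#include <cstdint>
#include <cstring>

// Widen bf16 -> fp32: a 16-bit shift into the high half of the word,
// which is exactly what "shll v.4s, v.4h, #0x10" does per lane.
static inline float bf16_to_f32(uint16_t h)
{
    uint32_t bits = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Narrow fp32 -> bf16, modelling BFCVTN with round-to-nearest-even
// (NaN handling omitted for brevity).
static inline uint16_t f32_to_bf16(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    uint32_t rounding = 0x7fffu + ((bits >> 16) & 1u);
    return static_cast<uint16_t>((bits + rounding) >> 16);
}

// One element of the accumulate epilogue: out += in, clamped to
// [minval, maxval] (the v12/v13 splat registers in the asm).
static inline void accumulate_clamp(uint16_t *out, float in, float minval, float maxval)
{
    float v = bf16_to_f32(*out) + in;
    v = (v > maxval) ? maxval : v; // fmin against v13
    v = (v < minval) ? minval : v; // fmax against v12
    *out = f32_to_bf16(v);
}
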
diff --git a/src/core/NEON/kernels/arm_gemm/merges/list-fp16.hpp b/src/core/NEON/kernels/arm_gemm/merges/list-fp16.hpp
new file mode 100644
index 0000000000..c1356347df
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/list-fp16.hpp
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "a64_merge_fp16_24x8.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/merges/list-sve.hpp b/src/core/NEON/kernels/arm_gemm/merges/list-sve.hpp
new file mode 100644
index 0000000000..d11740e5c8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/list-sve.hpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "sve_merge_fp16_3VLx8.hpp"
+#include "sve_merge_fp32_3VLx8.hpp"
+#include "sve_merge_fp32_bf16_8x3VL.hpp"
+#include "sve_merge_s32_3VLx8.hpp"
+#include "sve_merge_u32_3VLx8.hpp" \ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/merges/list.hpp b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
index 825c2fd020..fd6be5b69b 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,13 +22,9 @@
* SOFTWARE.
*/
#include "a32_merge_float_8x6.hpp"
-#include "a64_merge_fp16_24x8.hpp"
#include "a64_merge_fp32_12x8.hpp"
+#include "a64_merge_fp32_bf16_8x12.hpp"
#include "a64_merge_s32_12x8.hpp"
#include "a64_merge_s32_4x4.hpp"
#include "a64_merge_u32_12x8.hpp"
-#include "a64_merge_u32_4x4.hpp"
-#include "sve_merge_fp16_3VLx8.hpp"
-#include "sve_merge_fp32_3VLx8.hpp"
-#include "sve_merge_s32_3VLx8.hpp"
-#include "sve_merge_u32_3VLx8.hpp"
+#include "a64_merge_u32_4x4.hpp" \ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp
index cf1d10329b..a211a03697 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020,2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
template<>
void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const __fp16 *bias, Activation act, bool append)
@@ -1029,25 +1029,25 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
- "inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
"ld1h z13.h, p0/z, [%[inptr]]\n"
+ "inch %[p], all, mul #1\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "fadd z13.h, z13.h, z2.h\n"
"ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
"whilelt p2.h, %[p], %[w]\n"
- "fadd z13.h, z13.h, z2.h\n"
+ "fmin z13.h, p0/m, z13.h, z0.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
"fadd z14.h, z14.h, z3.h\n"
"ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
- "fmin z13.h, p0/m, z13.h, z0.h\n"
+ "fmax z13.h, p0/m, z13.h, z1.h\n"
"fmin z14.h, p1/m, z14.h, z0.h\n"
"fadd z15.h, z15.h, z4.h\n"
- "fmax z13.h, p0/m, z13.h, z1.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
"fmax z14.h, p1/m, z14.h, z1.h\n"
"fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
"st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
: [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
@@ -1073,42 +1073,42 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
+ "fadd z13.h, z13.h, z2.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
+ "ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
+ "fmin z13.h, p0/m, z13.h, z0.h\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"fadd z14.h, z14.h, z3.h\n"
- "ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "fmin z16.h, p0/m, z16.h, z0.h\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "fmax z13.h, p0/m, z13.h, z1.h\n"
"ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fadd z17.h, z17.h, z3.h\n"
"addvl %[inptr], %[inptr], #24\n"
- "fmin z13.h, p0/m, z13.h, z0.h\n"
"fmin z14.h, p1/m, z14.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
"fadd z15.h, z15.h, z4.h\n"
- "fadd z16.h, z16.h, z2.h\n"
- "fmax z13.h, p0/m, z13.h, z1.h\n"
+ "fmax z16.h, p0/m, z16.h, z1.h\n"
+ "fmin z17.h, p1/m, z17.h, z0.h\n"
"fmax z14.h, p1/m, z14.h, z1.h\n"
"fmin z15.h, p2/m, z15.h, z0.h\n"
- "fmin z16.h, p0/m, z16.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
- "fadd z17.h, z17.h, z3.h\n"
"fadd z18.h, z18.h, z4.h\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
- "fmax z16.h, p0/m, z16.h, z1.h\n"
- "fmin z17.h, p1/m, z17.h, z0.h\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
"fmin z18.h, p2/m, z18.h, z0.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
- "fmax z17.h, p1/m, z17.h, z1.h\n"
- "st1h z16.h, p0, [%[outptr1]]\n"
"fmax z18.h, p2/m, z18.h, z1.h\n"
+ "st1h z16.h, p0, [%[outptr1]]\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
@@ -1135,60 +1135,60 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
- "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
"ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.h, p0/m, z13.h, z0.h\n"
- "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.h, z15.h, z4.h\n"
"ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.h, z16.h, z2.h\n"
- "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.h, p1/m, z14.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.h, p0/m, z13.h, z1.h\n"
- "addvl %[inptr], %[inptr], #24\n"
- "fmax z14.h, p1/m, z14.h, z1.h\n"
- "fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
+ "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.h, p1/m, z14.h, z0.h\n"
+ "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.h, z15.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.h, p0/m, z16.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
+ "fmax z14.h, p1/m, z14.h, z1.h\n"
"ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.h, z17.h, z3.h\n"
- "fadd z18.h, z18.h, z4.h\n"
+ "fmin z15.h, p2/m, z15.h, z0.h\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmax z16.h, p0/m, z16.h, z1.h\n"
"st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.h, z17.h, z3.h\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmax z15.h, p2/m, z15.h, z1.h\n"
- "fmax z16.h, p0/m, z16.h, z1.h\n"
+ "fadd z18.h, z18.h, z4.h\n"
"fmin z17.h, p1/m, z17.h, z0.h\n"
- "fmin z18.h, p2/m, z18.h, z0.h\n"
- "st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.h, z19.h, z2.h\n"
+ "st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
+ "fadd z20.h, z20.h, z3.h\n"
"addvl %[outptr0], %[outptr0], #3\n"
"fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z16.h, p0, [%[outptr1]]\n"
- "fmax z18.h, p2/m, z18.h, z1.h\n"
+ "fmin z18.h, p2/m, z18.h, z0.h\n"
"fmin z19.h, p0/m, z19.h, z0.h\n"
- "fadd z20.h, z20.h, z3.h\n"
+ "fmin z20.h, p1/m, z20.h, z0.h\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
"fadd z13.h, z13.h, z4.h\n"
+ "fmax z18.h, p2/m, z18.h, z1.h\n"
"fmax z19.h, p0/m, z19.h, z1.h\n"
+ "fmax z20.h, p1/m, z20.h, z1.h\n"
+ "fmin z13.h, p2/m, z13.h, z0.h\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
- "fmin z20.h, p1/m, z20.h, z0.h\n"
"addvl %[outptr1], %[outptr1], #3\n"
- "fmin z13.h, p2/m, z13.h, z0.h\n"
- "st1h z19.h, p0, [%[outptr2]]\n"
- "fmax z20.h, p1/m, z20.h, z1.h\n"
"fmax z13.h, p2/m, z13.h, z1.h\n"
+ "st1h z19.h, p0, [%[outptr2]]\n"
"st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
"st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
@@ -1215,75 +1215,75 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
- "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
"ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.h, p0/m, z13.h, z0.h\n"
- "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.h, z15.h, z4.h\n"
"ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.h, z16.h, z2.h\n"
- "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.h, p1/m, z14.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.h, p0/m, z13.h, z1.h\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.h, p1/m, z14.h, z1.h\n"
- "addvl %[inptr], %[inptr], #24\n"
- "fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
+ "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.h, p1/m, z14.h, z0.h\n"
+ "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.h, z15.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.h, p0/m, z16.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
+ "fmax z14.h, p1/m, z14.h, z1.h\n"
"ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.h, z17.h, z3.h\n"
- "fadd z18.h, z18.h, z4.h\n"
+ "fmin z15.h, p2/m, z15.h, z0.h\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmax z16.h, p0/m, z16.h, z1.h\n"
"st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "fadd z17.h, z17.h, z3.h\n"
"ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
- "fmax z16.h, p0/m, z16.h, z1.h\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fadd z18.h, z18.h, z4.h\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z17.h, p1/m, z17.h, z0.h\n"
- "fmin z18.h, p2/m, z18.h, z0.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.h, z19.h, z2.h\n"
"ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
- "fadd z20.h, z20.h, z3.h\n"
+ "fmin z18.h, p2/m, z18.h, z0.h\n"
"addvl %[outptr0], %[outptr0], #3\n"
"fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z16.h, p0, [%[outptr1]]\n"
- "fmax z18.h, p2/m, z18.h, z1.h\n"
- "ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
"fmin z19.h, p0/m, z19.h, z0.h\n"
- "fmin z20.h, p1/m, z20.h, z0.h\n"
+ "ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
+ "fmax z18.h, p2/m, z18.h, z1.h\n"
+ "fadd z20.h, z20.h, z3.h\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
"fadd z13.h, z13.h, z4.h\n"
- "fadd z14.h, z14.h, z2.h\n"
"fmax z19.h, p0/m, z19.h, z1.h\n"
+ "fadd z14.h, z14.h, z2.h\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
- "fmax z20.h, p1/m, z20.h, z1.h\n"
+ "fmin z20.h, p1/m, z20.h, z0.h\n"
"addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.h, p2/m, z13.h, z0.h\n"
"st1h z19.h, p0, [%[outptr2]]\n"
"fmin z14.h, p0/m, z14.h, z0.h\n"
+ "fmax z20.h, p1/m, z20.h, z1.h\n"
"fadd z15.h, z15.h, z3.h\n"
- "fadd z16.h, z16.h, z4.h\n"
- "st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
"fmax z13.h, p2/m, z13.h, z1.h\n"
"fmax z14.h, p0/m, z14.h, z1.h\n"
+ "st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
+ "fadd z16.h, z16.h, z4.h\n"
"fmin z15.h, p1/m, z15.h, z0.h\n"
- "fmin z16.h, p2/m, z16.h, z0.h\n"
"st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"fmax z15.h, p1/m, z15.h, z1.h\n"
+ "fmin z16.h, p2/m, z16.h, z0.h\n"
"st1h z14.h, p0, [%[outptr3]]\n"
"fmax z16.h, p2/m, z16.h, z1.h\n"
"st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
@@ -1312,93 +1312,93 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
- "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
"ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.h, p0/m, z13.h, z0.h\n"
- "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.h, z15.h, z4.h\n"
"ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.h, z16.h, z2.h\n"
- "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.h, p1/m, z14.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.h, p0/m, z13.h, z1.h\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.h, p1/m, z14.h, z1.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
+ "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.h, p1/m, z14.h, z0.h\n"
+ "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.h, z15.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.h, p0/m, z16.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
+ "fmax z14.h, p1/m, z14.h, z1.h\n"
"ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.h, z17.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
- "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.h, p2/m, z15.h, z0.h\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.h, p0/m, z16.h, z1.h\n"
+ "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.h, z17.h, z3.h\n"
"ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.h, p1/m, z17.h, z0.h\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.h, z18.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.h, p1/m, z17.h, z0.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.h, z19.h, z2.h\n"
"ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.h, p1/m, z17.h, z1.h\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.h, p2/m, z18.h, z0.h\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z16.h, p0, [%[outptr1]]\n"
"fmin z19.h, p0/m, z19.h, z0.h\n"
"ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
+ "fmax z18.h, p2/m, z18.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"fadd z20.h, z20.h, z3.h\n"
- "fadd z13.h, z13.h, z4.h\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
- "fmax z18.h, p2/m, z18.h, z1.h\n"
- "ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
"fmax z19.h, p0/m, z19.h, z1.h\n"
+ "ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
+ "fadd z13.h, z13.h, z4.h\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z20.h, p1/m, z20.h, z0.h\n"
- "fmin z13.h, p2/m, z13.h, z0.h\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.h, z14.h, z2.h\n"
"ld1h z18.h, p1/z, [x8, #-3, MUL VL]\n"
- "fadd z15.h, z15.h, z3.h\n"
+ "fmin z13.h, p2/m, z13.h, z0.h\n"
"addvl %[outptr1], %[outptr1], #3\n"
"fmax z20.h, p1/m, z20.h, z1.h\n"
"st1h z19.h, p0, [%[outptr2]]\n"
- "fmax z13.h, p2/m, z13.h, z1.h\n"
- "ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
"fmin z14.h, p0/m, z14.h, z0.h\n"
- "fmin z15.h, p1/m, z15.h, z0.h\n"
+ "ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
+ "fmax z13.h, p2/m, z13.h, z1.h\n"
+ "fadd z15.h, z15.h, z3.h\n"
"st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
"fadd z16.h, z16.h, z4.h\n"
- "fadd z17.h, z17.h, z2.h\n"
"fmax z14.h, p0/m, z14.h, z1.h\n"
+ "fadd z17.h, z17.h, z2.h\n"
"st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
- "fmax z15.h, p1/m, z15.h, z1.h\n"
+ "fmin z15.h, p1/m, z15.h, z0.h\n"
"addvl %[outptr2], %[outptr2], #3\n"
"fmin z16.h, p2/m, z16.h, z0.h\n"
"st1h z14.h, p0, [%[outptr3]]\n"
"fmin z17.h, p0/m, z17.h, z0.h\n"
+ "fmax z15.h, p1/m, z15.h, z1.h\n"
"fadd z18.h, z18.h, z3.h\n"
- "fadd z19.h, z19.h, z4.h\n"
- "st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
"fmax z16.h, p2/m, z16.h, z1.h\n"
"fmax z17.h, p0/m, z17.h, z1.h\n"
+ "st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
+ "fadd z19.h, z19.h, z4.h\n"
"fmin z18.h, p1/m, z18.h, z0.h\n"
- "fmin z19.h, p2/m, z19.h, z0.h\n"
"st1h z16.h, p2, [%[outptr3], #2, MUL VL]\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmax z18.h, p1/m, z18.h, z1.h\n"
+ "fmin z19.h, p2/m, z19.h, z0.h\n"
"st1h z17.h, p0, [%[outptr4]]\n"
"fmax z19.h, p2/m, z19.h, z1.h\n"
"st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
@@ -1427,111 +1427,111 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
- "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
"ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.h, p0/m, z13.h, z0.h\n"
- "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.h, z15.h, z4.h\n"
"ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.h, z16.h, z2.h\n"
- "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.h, p1/m, z14.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.h, p0/m, z13.h, z1.h\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.h, p1/m, z14.h, z1.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
+ "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.h, p1/m, z14.h, z0.h\n"
+ "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.h, z15.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.h, p0/m, z16.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
+ "fmax z14.h, p1/m, z14.h, z1.h\n"
"ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.h, z17.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
- "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.h, p2/m, z15.h, z0.h\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.h, p0/m, z16.h, z1.h\n"
+ "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.h, z17.h, z3.h\n"
"ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.h, p1/m, z17.h, z0.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.h, z18.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.h, p1/m, z17.h, z0.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.h, z19.h, z2.h\n"
"ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.h, p1/m, z17.h, z1.h\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.h, p2/m, z18.h, z0.h\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z16.h, p0, [%[outptr1]]\n"
"fmin z19.h, p0/m, z19.h, z0.h\n"
"ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
- "fadd z20.h, z20.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fmax z18.h, p2/m, z18.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fadd z20.h, z20.h, z3.h\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
"fmax z19.h, p0/m, z19.h, z1.h\n"
"ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
- "fmin z20.h, p1/m, z20.h, z0.h\n"
- "addvl %[inptr], %[inptr], #24\n"
"fadd z13.h, z13.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmin z20.h, p1/m, z20.h, z0.h\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.h, z14.h, z2.h\n"
"ld1h z18.h, p1/z, [x8, #-3, MUL VL]\n"
- "fmax z20.h, p1/m, z20.h, z1.h\n"
- "addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.h, p2/m, z13.h, z0.h\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmax z20.h, p1/m, z20.h, z1.h\n"
"st1h z19.h, p0, [%[outptr2]]\n"
"fmin z14.h, p0/m, z14.h, z0.h\n"
"ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
+ "fmax z13.h, p2/m, z13.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fadd z15.h, z15.h, z3.h\n"
- "fadd z16.h, z16.h, z4.h\n"
"st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
- "fmax z13.h, p2/m, z13.h, z1.h\n"
- "ld1h z20.h, p0/z, [x8, #-1, MUL VL]\n"
"fmax z14.h, p0/m, z14.h, z1.h\n"
+ "ld1h z20.h, p0/z, [x8, #-1, MUL VL]\n"
+ "fadd z16.h, z16.h, z4.h\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z15.h, p1/m, z15.h, z0.h\n"
- "fmin z16.h, p2/m, z16.h, z0.h\n"
"st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
"fadd z17.h, z17.h, z2.h\n"
"ld1h z13.h, p1/z, [x8]\n"
- "fadd z18.h, z18.h, z3.h\n"
+ "fmin z16.h, p2/m, z16.h, z0.h\n"
"addvl %[outptr2], %[outptr2], #3\n"
"fmax z15.h, p1/m, z15.h, z1.h\n"
"st1h z14.h, p0, [%[outptr3]]\n"
- "fmax z16.h, p2/m, z16.h, z1.h\n"
- "ld1h z14.h, p2/z, [x8, #1, MUL VL]\n"
"fmin z17.h, p0/m, z17.h, z0.h\n"
- "fmin z18.h, p1/m, z18.h, z0.h\n"
+ "ld1h z14.h, p2/z, [x8, #1, MUL VL]\n"
+ "fmax z16.h, p2/m, z16.h, z1.h\n"
+ "fadd z18.h, z18.h, z3.h\n"
"st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
"fadd z19.h, z19.h, z4.h\n"
- "fadd z20.h, z20.h, z2.h\n"
"fmax z17.h, p0/m, z17.h, z1.h\n"
+ "fadd z20.h, z20.h, z2.h\n"
"st1h z16.h, p2, [%[outptr3], #2, MUL VL]\n"
- "fmax z18.h, p1/m, z18.h, z1.h\n"
+ "fmin z18.h, p1/m, z18.h, z0.h\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmin z19.h, p2/m, z19.h, z0.h\n"
"st1h z17.h, p0, [%[outptr4]]\n"
"fmin z20.h, p0/m, z20.h, z0.h\n"
+ "fmax z18.h, p1/m, z18.h, z1.h\n"
"fadd z13.h, z13.h, z3.h\n"
- "fadd z14.h, z14.h, z4.h\n"
- "st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
"fmax z19.h, p2/m, z19.h, z1.h\n"
"fmax z20.h, p0/m, z20.h, z1.h\n"
+ "st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
+ "fadd z14.h, z14.h, z4.h\n"
"fmin z13.h, p1/m, z13.h, z0.h\n"
- "fmin z14.h, p2/m, z14.h, z0.h\n"
"st1h z19.h, p2, [%[outptr4], #2, MUL VL]\n"
"addvl %[outptr4], %[outptr4], #3\n"
"fmax z13.h, p1/m, z13.h, z1.h\n"
+ "fmin z14.h, p2/m, z14.h, z0.h\n"
"st1h z20.h, p0, [%[outptr5]]\n"
"fmax z14.h, p2/m, z14.h, z1.h\n"
"st1h z13.h, p1, [%[outptr5], #1, MUL VL]\n"
@@ -1560,129 +1560,129 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
- "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
"ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.h, p0/m, z13.h, z0.h\n"
- "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.h, z15.h, z4.h\n"
"ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.h, z16.h, z2.h\n"
- "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.h, p1/m, z14.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.h, p0/m, z13.h, z1.h\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.h, p1/m, z14.h, z1.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
+ "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.h, p1/m, z14.h, z0.h\n"
+ "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.h, z15.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.h, p0/m, z16.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
+ "fmax z14.h, p1/m, z14.h, z1.h\n"
"ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.h, z17.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
- "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.h, p2/m, z15.h, z0.h\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.h, p0/m, z16.h, z1.h\n"
+ "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.h, z17.h, z3.h\n"
"ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.h, p1/m, z17.h, z0.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.h, z18.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.h, p1/m, z17.h, z0.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.h, z19.h, z2.h\n"
"ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.h, p1/m, z17.h, z1.h\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.h, p2/m, z18.h, z0.h\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z16.h, p0, [%[outptr1]]\n"
"fmin z19.h, p0/m, z19.h, z0.h\n"
"ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
- "fadd z20.h, z20.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fmax z18.h, p2/m, z18.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fadd z20.h, z20.h, z3.h\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
"fmax z19.h, p0/m, z19.h, z1.h\n"
"ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
- "fmin z20.h, p1/m, z20.h, z0.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"fadd z13.h, z13.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmin z20.h, p1/m, z20.h, z0.h\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.h, z14.h, z2.h\n"
"ld1h z18.h, p1/z, [x8, #-3, MUL VL]\n"
- "fmax z20.h, p1/m, z20.h, z1.h\n"
- "addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.h, p2/m, z13.h, z0.h\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmax z20.h, p1/m, z20.h, z1.h\n"
"st1h z19.h, p0, [%[outptr2]]\n"
"fmin z14.h, p0/m, z14.h, z0.h\n"
"ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
- "fadd z15.h, z15.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"fmax z13.h, p2/m, z13.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fadd z15.h, z15.h, z3.h\n"
"st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
"fmax z14.h, p0/m, z14.h, z1.h\n"
"ld1h z20.h, p0/z, [x8, #-1, MUL VL]\n"
- "fmin z15.h, p1/m, z15.h, z0.h\n"
- "addvl %[inptr], %[inptr], #24\n"
"fadd z16.h, z16.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmin z15.h, p1/m, z15.h, z0.h\n"
"st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
"fadd z17.h, z17.h, z2.h\n"
"ld1h z13.h, p1/z, [x8]\n"
- "fmax z15.h, p1/m, z15.h, z1.h\n"
- "addvl %[outptr2], %[outptr2], #3\n"
"fmin z16.h, p2/m, z16.h, z0.h\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "fmax z15.h, p1/m, z15.h, z1.h\n"
"st1h z14.h, p0, [%[outptr3]]\n"
"fmin z17.h, p0/m, z17.h, z0.h\n"
"ld1h z14.h, p2/z, [x8, #1, MUL VL]\n"
+ "fmax z16.h, p2/m, z16.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"fadd z18.h, z18.h, z3.h\n"
- "fadd z19.h, z19.h, z4.h\n"
"st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
- "fmax z16.h, p2/m, z16.h, z1.h\n"
- "ld1h z15.h, p0/z, [x8, #2, MUL VL]\n"
"fmax z17.h, p0/m, z17.h, z1.h\n"
+ "ld1h z15.h, p0/z, [x8, #2, MUL VL]\n"
+ "fadd z19.h, z19.h, z4.h\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z18.h, p1/m, z18.h, z0.h\n"
- "fmin z19.h, p2/m, z19.h, z0.h\n"
"st1h z16.h, p2, [%[outptr3], #2, MUL VL]\n"
"fadd z20.h, z20.h, z2.h\n"
"ld1h z16.h, p1/z, [x8, #3, MUL VL]\n"
- "fadd z13.h, z13.h, z3.h\n"
+ "fmin z19.h, p2/m, z19.h, z0.h\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmax z18.h, p1/m, z18.h, z1.h\n"
"st1h z17.h, p0, [%[outptr4]]\n"
- "fmax z19.h, p2/m, z19.h, z1.h\n"
- "ld1h z17.h, p2/z, [x8, #4, MUL VL]\n"
"fmin z20.h, p0/m, z20.h, z0.h\n"
- "fmin z13.h, p1/m, z13.h, z0.h\n"
+ "ld1h z17.h, p2/z, [x8, #4, MUL VL]\n"
+ "fmax z19.h, p2/m, z19.h, z1.h\n"
+ "fadd z13.h, z13.h, z3.h\n"
"st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
"fadd z14.h, z14.h, z4.h\n"
- "fadd z15.h, z15.h, z2.h\n"
"fmax z20.h, p0/m, z20.h, z1.h\n"
+ "fadd z15.h, z15.h, z2.h\n"
"st1h z19.h, p2, [%[outptr4], #2, MUL VL]\n"
- "fmax z13.h, p1/m, z13.h, z1.h\n"
+ "fmin z13.h, p1/m, z13.h, z0.h\n"
"addvl %[outptr4], %[outptr4], #3\n"
"fmin z14.h, p2/m, z14.h, z0.h\n"
"st1h z20.h, p0, [%[outptr5]]\n"
"fmin z15.h, p0/m, z15.h, z0.h\n"
+ "fmax z13.h, p1/m, z13.h, z1.h\n"
"fadd z16.h, z16.h, z3.h\n"
- "fadd z17.h, z17.h, z4.h\n"
- "st1h z13.h, p1, [%[outptr5], #1, MUL VL]\n"
"fmax z14.h, p2/m, z14.h, z1.h\n"
"fmax z15.h, p0/m, z15.h, z1.h\n"
+ "st1h z13.h, p1, [%[outptr5], #1, MUL VL]\n"
+ "fadd z17.h, z17.h, z4.h\n"
"fmin z16.h, p1/m, z16.h, z0.h\n"
- "fmin z17.h, p2/m, z17.h, z0.h\n"
"st1h z14.h, p2, [%[outptr5], #2, MUL VL]\n"
"addvl %[outptr5], %[outptr5], #3\n"
"fmax z16.h, p1/m, z16.h, z1.h\n"
+ "fmin z17.h, p2/m, z17.h, z0.h\n"
"st1h z15.h, p0, [%[outptr6]]\n"
"fmax z17.h, p2/m, z17.h, z1.h\n"
"st1h z16.h, p1, [%[outptr6], #1, MUL VL]\n"
@@ -1712,142 +1712,142 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1h z2.h, p0/z, [%[biasptr]]\n"
"whilelt p1.h, %[p], %[w]\n"
- "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z13.h, p0/z, [%[inptr]]\n"
"inch %[p], all, mul #1\n"
- "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1h z13.h, p0/z, [%[inptr]]\n"
- "whilelt p2.h, %[p], %[w]\n"
- "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.h, z13.h, z2.h\n"
- "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z3.h, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.h, %[p], %[w]\n"
+ "fadd z16.h, z16.h, z2.h\n"
"ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.h, p0/m, z13.h, z0.h\n"
- "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.h, z15.h, z4.h\n"
"ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.h, z16.h, z2.h\n"
- "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.h, p1/m, z14.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.h, z14.h, z3.h\n"
+ "ld1h z4.h, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.h, p0/m, z13.h, z1.h\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.h, p1/m, z14.h, z1.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.h, p2/m, z15.h, z0.h\n"
- "st1h z13.h, p0, [%[outptr0]]\n"
+ "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.h, p1/m, z14.h, z0.h\n"
+ "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.h, z15.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.h, p0/m, z16.h, z0.h\n"
+ "st1h z13.h, p0, [%[outptr0]]\n"
+ "fmax z14.h, p1/m, z14.h, z1.h\n"
"ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.h, z17.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.h, p2/m, z15.h, z1.h\n"
- "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.h, p2/m, z15.h, z0.h\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.h, p0/m, z16.h, z1.h\n"
+ "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.h, z17.h, z3.h\n"
"ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.h, p1/m, z17.h, z0.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmax z15.h, p2/m, z15.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.h, z18.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.h, p1/m, z17.h, z0.h\n"
"st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.h, z19.h, z2.h\n"
"ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.h, p1/m, z17.h, z1.h\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.h, p2/m, z18.h, z0.h\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.h, p1/m, z17.h, z1.h\n"
"st1h z16.h, p0, [%[outptr1]]\n"
"fmin z19.h, p0/m, z19.h, z0.h\n"
"ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
- "fadd z20.h, z20.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fmax z18.h, p2/m, z18.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fadd z20.h, z20.h, z3.h\n"
"st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
"fmax z19.h, p0/m, z19.h, z1.h\n"
"ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
- "fmin z20.h, p1/m, z20.h, z0.h\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"fadd z13.h, z13.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmin z20.h, p1/m, z20.h, z0.h\n"
"st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.h, z14.h, z2.h\n"
"ld1h z18.h, p1/z, [x8, #-3, MUL VL]\n"
- "fmax z20.h, p1/m, z20.h, z1.h\n"
- "addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.h, p2/m, z13.h, z0.h\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmax z20.h, p1/m, z20.h, z1.h\n"
"st1h z19.h, p0, [%[outptr2]]\n"
"fmin z14.h, p0/m, z14.h, z0.h\n"
"ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
- "fadd z15.h, z15.h, z3.h\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"fmax z13.h, p2/m, z13.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fadd z15.h, z15.h, z3.h\n"
"st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
"fmax z14.h, p0/m, z14.h, z1.h\n"
"ld1h z20.h, p0/z, [x8, #-1, MUL VL]\n"
- "fmin z15.h, p1/m, z15.h, z0.h\n"
- "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
"fadd z16.h, z16.h, z4.h\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmin z15.h, p1/m, z15.h, z0.h\n"
"st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
"fadd z17.h, z17.h, z2.h\n"
"ld1h z13.h, p1/z, [x8]\n"
- "fmax z15.h, p1/m, z15.h, z1.h\n"
- "addvl %[outptr2], %[outptr2], #3\n"
"fmin z16.h, p2/m, z16.h, z0.h\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "fmax z15.h, p1/m, z15.h, z1.h\n"
"st1h z14.h, p0, [%[outptr3]]\n"
"fmin z17.h, p0/m, z17.h, z0.h\n"
"ld1h z14.h, p2/z, [x8, #1, MUL VL]\n"
- "fadd z18.h, z18.h, z3.h\n"
- "addvl %[inptr], %[inptr], #24\n"
"fmax z16.h, p2/m, z16.h, z1.h\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "fadd z18.h, z18.h, z3.h\n"
"st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
"fmax z17.h, p0/m, z17.h, z1.h\n"
"ld1h z15.h, p0/z, [x8, #2, MUL VL]\n"
- "fmin z18.h, p1/m, z18.h, z0.h\n"
"fadd z19.h, z19.h, z4.h\n"
+ "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+ "fmin z18.h, p1/m, z18.h, z0.h\n"
"st1h z16.h, p2, [%[outptr3], #2, MUL VL]\n"
"fadd z20.h, z20.h, z2.h\n"
"ld1h z16.h, p1/z, [x8, #3, MUL VL]\n"
- "fadd z13.h, z13.h, z3.h\n"
+ "fmin z19.h, p2/m, z19.h, z0.h\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmax z18.h, p1/m, z18.h, z1.h\n"
"st1h z17.h, p0, [%[outptr4]]\n"
- "fmin z19.h, p2/m, z19.h, z0.h\n"
- "ld1h z17.h, p2/z, [x8, #4, MUL VL]\n"
"fmin z20.h, p0/m, z20.h, z0.h\n"
- "fmin z13.h, p1/m, z13.h, z0.h\n"
- "st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
- "fadd z14.h, z14.h, z4.h\n"
- "ld1h z18.h, p0/z, [x8, #5, MUL VL]\n"
+ "ld1h z17.h, p2/z, [x8, #4, MUL VL]\n"
"fmax z19.h, p2/m, z19.h, z1.h\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ "fadd z13.h, z13.h, z3.h\n"
+ "st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
"fmax z20.h, p0/m, z20.h, z1.h\n"
- "fmax z13.h, p1/m, z13.h, z1.h\n"
- "fmin z14.h, p2/m, z14.h, z0.h\n"
- "st1h z19.h, p2, [%[outptr4], #2, MUL VL]\n"
+ "ld1h z18.h, p0/z, [x8, #5, MUL VL]\n"
+ "fadd z14.h, z14.h, z4.h\n"
"fadd z15.h, z15.h, z2.h\n"
+ "st1h z19.h, p2, [%[outptr4], #2, MUL VL]\n"
+ "fmin z13.h, p1/m, z13.h, z0.h\n"
"ld1h z19.h, p1/z, [x8, #6, MUL VL]\n"
"fadd z16.h, z16.h, z3.h\n"
"addvl %[outptr4], %[outptr4], #3\n"
- "fmax z14.h, p2/m, z14.h, z1.h\n"
+ "fmin z14.h, p2/m, z14.h, z0.h\n"
"st1h z20.h, p0, [%[outptr5]]\n"
- "fmin z15.h, p0/m, z15.h, z0.h\n"
+ "fmax z13.h, p1/m, z13.h, z1.h\n"
"ld1h z20.h, p2/z, [x8, #7, MUL VL]\n"
+ "fmin z15.h, p0/m, z15.h, z0.h\n"
"fmin z16.h, p1/m, z16.h, z0.h\n"
- "fadd z17.h, z17.h, z4.h\n"
+ "fmax z14.h, p2/m, z14.h, z1.h\n"
"st1h z13.h, p1, [%[outptr5], #1, MUL VL]\n"
- "fadd z18.h, z18.h, z2.h\n"
+ "fadd z17.h, z17.h, z4.h\n"
"fmax z15.h, p0/m, z15.h, z1.h\n"
"fmax z16.h, p1/m, z16.h, z1.h\n"
"st1h z14.h, p2, [%[outptr5], #2, MUL VL]\n"
- "fmin z17.h, p2/m, z17.h, z0.h\n"
+ "fadd z18.h, z18.h, z2.h\n"
"addvl %[outptr5], %[outptr5], #3\n"
- "fmin z18.h, p0/m, z18.h, z0.h\n"
+ "fmin z17.h, p2/m, z17.h, z0.h\n"
"st1h z15.h, p0, [%[outptr6]]\n"
"fadd z19.h, z19.h, z3.h\n"
- "fmax z17.h, p2/m, z17.h, z1.h\n"
+ "fmin z18.h, p0/m, z18.h, z0.h\n"
"fadd z20.h, z20.h, z4.h\n"
"st1h z16.h, p1, [%[outptr6], #1, MUL VL]\n"
- "fmax z18.h, p0/m, z18.h, z1.h\n"
+ "fmax z17.h, p2/m, z17.h, z1.h\n"
"fmin z19.h, p1/m, z19.h, z0.h\n"
+ "fmax z18.h, p0/m, z18.h, z1.h\n"
"fmin z20.h, p2/m, z20.h, z0.h\n"
"st1h z17.h, p2, [%[outptr6], #2, MUL VL]\n"
"addvl %[outptr6], %[outptr6], #3\n"
@@ -1872,4 +1872,4 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
}
}
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
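
The guard rename in this file (here and at the #pragma once hunk above) moves the SVE merges off the compiler-defined __ARM_FEATURE_SVE, which is only set when the whole translation unit targets an SVE baseline, onto the library-controlled ARM_COMPUTE_ENABLE_SVE, so the build system can compile the kernels in and leave the choice to run time. A minimal sketch of the resulting behaviour; sve_supported() stands in for whatever runtime CPU detection the caller performs and is not an API from this diff:

// Illustrative dispatch shim, not code from this patch.
static bool use_sve_merge(bool sve_supported)
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
    // Kernels are compiled in whenever the library build enables SVE;
    // whether to run them is decided per-CPU at run time.
    return sve_supported;
#else
    // Under the old __ARM_FEATURE_SVE guard this path only existed when
    // the compiler baseline itself targeted SVE.
    return false;
#endif
}
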
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
index b0d10c085d..2da48922e3 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020,2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
template<>
void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float *bias, Activation act, bool append)
@@ -1029,25 +1029,25 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
- "incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z13.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "fadd z13.s, z13.s, z2.s\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
"whilelt p2.s, %[p], %[w]\n"
- "fadd z13.s, z13.s, z2.s\n"
+ "fmin z13.s, p0/m, z13.s, z0.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"fadd z14.s, z14.s, z3.s\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
- "fmin z13.s, p0/m, z13.s, z0.s\n"
+ "fmax z13.s, p0/m, z13.s, z1.s\n"
"fmin z14.s, p1/m, z14.s, z0.s\n"
"fadd z15.s, z15.s, z4.s\n"
- "fmax z13.s, p0/m, z13.s, z1.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
"fmax z14.s, p1/m, z14.s, z1.s\n"
"fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
: [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
@@ -1073,42 +1073,42 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "fadd z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
+ "fmin z13.s, p0/m, z13.s, z0.s\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"fadd z14.s, z14.s, z3.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "fmin z16.s, p0/m, z16.s, z0.s\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "fmax z13.s, p0/m, z13.s, z1.s\n"
"ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fadd z17.s, z17.s, z3.s\n"
"addvl %[inptr], %[inptr], #24\n"
- "fmin z13.s, p0/m, z13.s, z0.s\n"
"fmin z14.s, p1/m, z14.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
"fadd z15.s, z15.s, z4.s\n"
- "fadd z16.s, z16.s, z2.s\n"
- "fmax z13.s, p0/m, z13.s, z1.s\n"
+ "fmax z16.s, p0/m, z16.s, z1.s\n"
+ "fmin z17.s, p1/m, z17.s, z0.s\n"
"fmax z14.s, p1/m, z14.s, z1.s\n"
"fmin z15.s, p2/m, z15.s, z0.s\n"
- "fmin z16.s, p0/m, z16.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "fadd z17.s, z17.s, z3.s\n"
"fadd z18.s, z18.s, z4.s\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "fmax z16.s, p0/m, z16.s, z1.s\n"
- "fmin z17.s, p1/m, z17.s, z0.s\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
"fmin z18.s, p2/m, z18.s, z0.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
- "fmax z17.s, p1/m, z17.s, z1.s\n"
- "st1w z16.s, p0, [%[outptr1]]\n"
"fmax z18.s, p2/m, z18.s, z1.s\n"
+ "st1w z16.s, p0, [%[outptr1]]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
@@ -1135,60 +1135,60 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
- "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
"ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.s, p0/m, z13.s, z0.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.s, z15.s, z4.s\n"
"ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.s, z16.s, z2.s\n"
- "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.s, p1/m, z14.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.s, p0/m, z13.s, z1.s\n"
- "addvl %[inptr], %[inptr], #24\n"
- "fmax z14.s, p1/m, z14.s, z1.s\n"
- "fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.s, p1/m, z14.s, z0.s\n"
+ "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.s, z15.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.s, p0/m, z16.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "fmax z14.s, p1/m, z14.s, z1.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.s, z17.s, z3.s\n"
- "fadd z18.s, z18.s, z4.s\n"
+ "fmin z15.s, p2/m, z15.s, z0.s\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmax z16.s, p0/m, z16.s, z1.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.s, z17.s, z3.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmax z15.s, p2/m, z15.s, z1.s\n"
- "fmax z16.s, p0/m, z16.s, z1.s\n"
+ "fadd z18.s, z18.s, z4.s\n"
"fmin z17.s, p1/m, z17.s, z0.s\n"
- "fmin z18.s, p2/m, z18.s, z0.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.s, z19.s, z2.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
+ "fadd z20.s, z20.s, z3.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
"fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "fmax z18.s, p2/m, z18.s, z1.s\n"
+ "fmin z18.s, p2/m, z18.s, z0.s\n"
"fmin z19.s, p0/m, z19.s, z0.s\n"
- "fadd z20.s, z20.s, z3.s\n"
+ "fmin z20.s, p1/m, z20.s, z0.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"fadd z13.s, z13.s, z4.s\n"
+ "fmax z18.s, p2/m, z18.s, z1.s\n"
"fmax z19.s, p0/m, z19.s, z1.s\n"
+ "fmax z20.s, p1/m, z20.s, z1.s\n"
+ "fmin z13.s, p2/m, z13.s, z0.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
- "fmin z20.s, p1/m, z20.s, z0.s\n"
"addvl %[outptr1], %[outptr1], #3\n"
- "fmin z13.s, p2/m, z13.s, z0.s\n"
- "st1w z19.s, p0, [%[outptr2]]\n"
- "fmax z20.s, p1/m, z20.s, z1.s\n"
"fmax z13.s, p2/m, z13.s, z1.s\n"
+ "st1w z19.s, p0, [%[outptr2]]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
@@ -1215,75 +1215,75 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
- "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
"ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.s, p0/m, z13.s, z0.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.s, z15.s, z4.s\n"
"ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.s, z16.s, z2.s\n"
- "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.s, p1/m, z14.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.s, p0/m, z13.s, z1.s\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.s, p1/m, z14.s, z1.s\n"
- "addvl %[inptr], %[inptr], #24\n"
- "fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.s, p1/m, z14.s, z0.s\n"
+ "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.s, z15.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.s, p0/m, z16.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "fmax z14.s, p1/m, z14.s, z1.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.s, z17.s, z3.s\n"
- "fadd z18.s, z18.s, z4.s\n"
+ "fmin z15.s, p2/m, z15.s, z0.s\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmax z16.s, p0/m, z16.s, z1.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "fadd z17.s, z17.s, z3.s\n"
"ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "fmax z16.s, p0/m, z16.s, z1.s\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fadd z18.s, z18.s, z4.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z17.s, p1/m, z17.s, z0.s\n"
- "fmin z18.s, p2/m, z18.s, z0.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.s, z19.s, z2.s\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "fadd z20.s, z20.s, z3.s\n"
+ "fmin z18.s, p2/m, z18.s, z0.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
"fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "fmax z18.s, p2/m, z18.s, z1.s\n"
- "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
"fmin z19.s, p0/m, z19.s, z0.s\n"
- "fmin z20.s, p1/m, z20.s, z0.s\n"
+ "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
+ "fmax z18.s, p2/m, z18.s, z1.s\n"
+ "fadd z20.s, z20.s, z3.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"fadd z13.s, z13.s, z4.s\n"
- "fadd z14.s, z14.s, z2.s\n"
"fmax z19.s, p0/m, z19.s, z1.s\n"
+ "fadd z14.s, z14.s, z2.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
- "fmax z20.s, p1/m, z20.s, z1.s\n"
+ "fmin z20.s, p1/m, z20.s, z0.s\n"
"addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.s, p2/m, z13.s, z0.s\n"
"st1w z19.s, p0, [%[outptr2]]\n"
"fmin z14.s, p0/m, z14.s, z0.s\n"
+ "fmax z20.s, p1/m, z20.s, z1.s\n"
"fadd z15.s, z15.s, z3.s\n"
- "fadd z16.s, z16.s, z4.s\n"
- "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
"fmax z13.s, p2/m, z13.s, z1.s\n"
"fmax z14.s, p0/m, z14.s, z1.s\n"
+ "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
+ "fadd z16.s, z16.s, z4.s\n"
"fmin z15.s, p1/m, z15.s, z0.s\n"
- "fmin z16.s, p2/m, z16.s, z0.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"fmax z15.s, p1/m, z15.s, z1.s\n"
+ "fmin z16.s, p2/m, z16.s, z0.s\n"
"st1w z14.s, p0, [%[outptr3]]\n"
"fmax z16.s, p2/m, z16.s, z1.s\n"
"st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
@@ -1312,93 +1312,93 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
- "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
"ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.s, p0/m, z13.s, z0.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.s, z15.s, z4.s\n"
"ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.s, z16.s, z2.s\n"
- "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.s, p1/m, z14.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.s, p0/m, z13.s, z1.s\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.s, p1/m, z14.s, z1.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.s, p1/m, z14.s, z0.s\n"
+ "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.s, z15.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.s, p0/m, z16.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "fmax z14.s, p1/m, z14.s, z1.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.s, z17.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
- "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.s, p2/m, z15.s, z0.s\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.s, p0/m, z16.s, z1.s\n"
+ "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.s, z17.s, z3.s\n"
"ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.s, p1/m, z17.s, z0.s\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.s, z18.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.s, p1/m, z17.s, z0.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.s, z19.s, z2.s\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.s, p1/m, z17.s, z1.s\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.s, p2/m, z18.s, z0.s\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
"fmin z19.s, p0/m, z19.s, z0.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
+ "fmax z18.s, p2/m, z18.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"fadd z20.s, z20.s, z3.s\n"
- "fadd z13.s, z13.s, z4.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "fmax z18.s, p2/m, z18.s, z1.s\n"
- "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
"fmax z19.s, p0/m, z19.s, z1.s\n"
+ "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fadd z13.s, z13.s, z4.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z20.s, p1/m, z20.s, z0.s\n"
- "fmin z13.s, p2/m, z13.s, z0.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.s, z14.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "fadd z15.s, z15.s, z3.s\n"
+ "fmin z13.s, p2/m, z13.s, z0.s\n"
"addvl %[outptr1], %[outptr1], #3\n"
"fmax z20.s, p1/m, z20.s, z1.s\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "fmax z13.s, p2/m, z13.s, z1.s\n"
- "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
"fmin z14.s, p0/m, z14.s, z0.s\n"
- "fmin z15.s, p1/m, z15.s, z0.s\n"
+ "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
+ "fmax z13.s, p2/m, z13.s, z1.s\n"
+ "fadd z15.s, z15.s, z3.s\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
"fadd z16.s, z16.s, z4.s\n"
- "fadd z17.s, z17.s, z2.s\n"
"fmax z14.s, p0/m, z14.s, z1.s\n"
+ "fadd z17.s, z17.s, z2.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
- "fmax z15.s, p1/m, z15.s, z1.s\n"
+ "fmin z15.s, p1/m, z15.s, z0.s\n"
"addvl %[outptr2], %[outptr2], #3\n"
"fmin z16.s, p2/m, z16.s, z0.s\n"
"st1w z14.s, p0, [%[outptr3]]\n"
"fmin z17.s, p0/m, z17.s, z0.s\n"
+ "fmax z15.s, p1/m, z15.s, z1.s\n"
"fadd z18.s, z18.s, z3.s\n"
- "fadd z19.s, z19.s, z4.s\n"
- "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
"fmax z16.s, p2/m, z16.s, z1.s\n"
"fmax z17.s, p0/m, z17.s, z1.s\n"
+ "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
+ "fadd z19.s, z19.s, z4.s\n"
"fmin z18.s, p1/m, z18.s, z0.s\n"
- "fmin z19.s, p2/m, z19.s, z0.s\n"
"st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmax z18.s, p1/m, z18.s, z1.s\n"
+ "fmin z19.s, p2/m, z19.s, z0.s\n"
"st1w z17.s, p0, [%[outptr4]]\n"
"fmax z19.s, p2/m, z19.s, z1.s\n"
"st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
@@ -1427,111 +1427,111 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
- "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
"ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.s, p0/m, z13.s, z0.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.s, z15.s, z4.s\n"
"ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.s, z16.s, z2.s\n"
- "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.s, p1/m, z14.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.s, p0/m, z13.s, z1.s\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.s, p1/m, z14.s, z1.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.s, p1/m, z14.s, z0.s\n"
+ "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.s, z15.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.s, p0/m, z16.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "fmax z14.s, p1/m, z14.s, z1.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.s, z17.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
- "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.s, p2/m, z15.s, z0.s\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.s, p0/m, z16.s, z1.s\n"
+ "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.s, z17.s, z3.s\n"
"ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.s, p1/m, z17.s, z0.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.s, z18.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.s, p1/m, z17.s, z0.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.s, z19.s, z2.s\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.s, p1/m, z17.s, z1.s\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.s, p2/m, z18.s, z0.s\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
"fmin z19.s, p0/m, z19.s, z0.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "fadd z20.s, z20.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fmax z18.s, p2/m, z18.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fadd z20.s, z20.s, z3.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"fmax z19.s, p0/m, z19.s, z1.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "fmin z20.s, p1/m, z20.s, z0.s\n"
- "addvl %[inptr], %[inptr], #24\n"
"fadd z13.s, z13.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmin z20.s, p1/m, z20.s, z0.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.s, z14.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "fmax z20.s, p1/m, z20.s, z1.s\n"
- "addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.s, p2/m, z13.s, z0.s\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmax z20.s, p1/m, z20.s, z1.s\n"
"st1w z19.s, p0, [%[outptr2]]\n"
"fmin z14.s, p0/m, z14.s, z0.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
+ "fmax z13.s, p2/m, z13.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fadd z15.s, z15.s, z3.s\n"
- "fadd z16.s, z16.s, z4.s\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "fmax z13.s, p2/m, z13.s, z1.s\n"
- "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
"fmax z14.s, p0/m, z14.s, z1.s\n"
+ "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "fadd z16.s, z16.s, z4.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z15.s, p1/m, z15.s, z0.s\n"
- "fmin z16.s, p2/m, z16.s, z0.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"fadd z17.s, z17.s, z2.s\n"
"ld1w z13.s, p1/z, [x8]\n"
- "fadd z18.s, z18.s, z3.s\n"
+ "fmin z16.s, p2/m, z16.s, z0.s\n"
"addvl %[outptr2], %[outptr2], #3\n"
"fmax z15.s, p1/m, z15.s, z1.s\n"
"st1w z14.s, p0, [%[outptr3]]\n"
- "fmax z16.s, p2/m, z16.s, z1.s\n"
- "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
"fmin z17.s, p0/m, z17.s, z0.s\n"
- "fmin z18.s, p1/m, z18.s, z0.s\n"
+ "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
+ "fmax z16.s, p2/m, z16.s, z1.s\n"
+ "fadd z18.s, z18.s, z3.s\n"
"st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
"fadd z19.s, z19.s, z4.s\n"
- "fadd z20.s, z20.s, z2.s\n"
"fmax z17.s, p0/m, z17.s, z1.s\n"
+ "fadd z20.s, z20.s, z2.s\n"
"st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
- "fmax z18.s, p1/m, z18.s, z1.s\n"
+ "fmin z18.s, p1/m, z18.s, z0.s\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmin z19.s, p2/m, z19.s, z0.s\n"
"st1w z17.s, p0, [%[outptr4]]\n"
"fmin z20.s, p0/m, z20.s, z0.s\n"
+ "fmax z18.s, p1/m, z18.s, z1.s\n"
"fadd z13.s, z13.s, z3.s\n"
- "fadd z14.s, z14.s, z4.s\n"
- "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
"fmax z19.s, p2/m, z19.s, z1.s\n"
"fmax z20.s, p0/m, z20.s, z1.s\n"
+ "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
+ "fadd z14.s, z14.s, z4.s\n"
"fmin z13.s, p1/m, z13.s, z0.s\n"
- "fmin z14.s, p2/m, z14.s, z0.s\n"
"st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
"addvl %[outptr4], %[outptr4], #3\n"
"fmax z13.s, p1/m, z13.s, z1.s\n"
+ "fmin z14.s, p2/m, z14.s, z0.s\n"
"st1w z20.s, p0, [%[outptr5]]\n"
"fmax z14.s, p2/m, z14.s, z1.s\n"
"st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
@@ -1560,129 +1560,129 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
- "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
"ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.s, p0/m, z13.s, z0.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.s, z15.s, z4.s\n"
"ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.s, z16.s, z2.s\n"
- "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.s, p1/m, z14.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.s, p0/m, z13.s, z1.s\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.s, p1/m, z14.s, z1.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.s, p1/m, z14.s, z0.s\n"
+ "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.s, z15.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.s, p0/m, z16.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "fmax z14.s, p1/m, z14.s, z1.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.s, z17.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
- "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.s, p2/m, z15.s, z0.s\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.s, p0/m, z16.s, z1.s\n"
+ "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.s, z17.s, z3.s\n"
"ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.s, p1/m, z17.s, z0.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.s, z18.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.s, p1/m, z17.s, z0.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.s, z19.s, z2.s\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.s, p1/m, z17.s, z1.s\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.s, p2/m, z18.s, z0.s\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
"fmin z19.s, p0/m, z19.s, z0.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "fadd z20.s, z20.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fmax z18.s, p2/m, z18.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fadd z20.s, z20.s, z3.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"fmax z19.s, p0/m, z19.s, z1.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "fmin z20.s, p1/m, z20.s, z0.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"fadd z13.s, z13.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmin z20.s, p1/m, z20.s, z0.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.s, z14.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "fmax z20.s, p1/m, z20.s, z1.s\n"
- "addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.s, p2/m, z13.s, z0.s\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmax z20.s, p1/m, z20.s, z1.s\n"
"st1w z19.s, p0, [%[outptr2]]\n"
"fmin z14.s, p0/m, z14.s, z0.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "fadd z15.s, z15.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"fmax z13.s, p2/m, z13.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fadd z15.s, z15.s, z3.s\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
"fmax z14.s, p0/m, z14.s, z1.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
- "fmin z15.s, p1/m, z15.s, z0.s\n"
- "addvl %[inptr], %[inptr], #24\n"
"fadd z16.s, z16.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmin z15.s, p1/m, z15.s, z0.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"fadd z17.s, z17.s, z2.s\n"
"ld1w z13.s, p1/z, [x8]\n"
- "fmax z15.s, p1/m, z15.s, z1.s\n"
- "addvl %[outptr2], %[outptr2], #3\n"
"fmin z16.s, p2/m, z16.s, z0.s\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "fmax z15.s, p1/m, z15.s, z1.s\n"
"st1w z14.s, p0, [%[outptr3]]\n"
"fmin z17.s, p0/m, z17.s, z0.s\n"
"ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
+ "fmax z16.s, p2/m, z16.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"fadd z18.s, z18.s, z3.s\n"
- "fadd z19.s, z19.s, z4.s\n"
"st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
- "fmax z16.s, p2/m, z16.s, z1.s\n"
- "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
"fmax z17.s, p0/m, z17.s, z1.s\n"
+ "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
+ "fadd z19.s, z19.s, z4.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"fmin z18.s, p1/m, z18.s, z0.s\n"
- "fmin z19.s, p2/m, z19.s, z0.s\n"
"st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
"fadd z20.s, z20.s, z2.s\n"
"ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
- "fadd z13.s, z13.s, z3.s\n"
+ "fmin z19.s, p2/m, z19.s, z0.s\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmax z18.s, p1/m, z18.s, z1.s\n"
"st1w z17.s, p0, [%[outptr4]]\n"
- "fmax z19.s, p2/m, z19.s, z1.s\n"
- "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
"fmin z20.s, p0/m, z20.s, z0.s\n"
- "fmin z13.s, p1/m, z13.s, z0.s\n"
+ "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
+ "fmax z19.s, p2/m, z19.s, z1.s\n"
+ "fadd z13.s, z13.s, z3.s\n"
"st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
"fadd z14.s, z14.s, z4.s\n"
- "fadd z15.s, z15.s, z2.s\n"
"fmax z20.s, p0/m, z20.s, z1.s\n"
+ "fadd z15.s, z15.s, z2.s\n"
"st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
- "fmax z13.s, p1/m, z13.s, z1.s\n"
+ "fmin z13.s, p1/m, z13.s, z0.s\n"
"addvl %[outptr4], %[outptr4], #3\n"
"fmin z14.s, p2/m, z14.s, z0.s\n"
"st1w z20.s, p0, [%[outptr5]]\n"
"fmin z15.s, p0/m, z15.s, z0.s\n"
+ "fmax z13.s, p1/m, z13.s, z1.s\n"
"fadd z16.s, z16.s, z3.s\n"
- "fadd z17.s, z17.s, z4.s\n"
- "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
"fmax z14.s, p2/m, z14.s, z1.s\n"
"fmax z15.s, p0/m, z15.s, z1.s\n"
+ "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
+ "fadd z17.s, z17.s, z4.s\n"
"fmin z16.s, p1/m, z16.s, z0.s\n"
- "fmin z17.s, p2/m, z17.s, z0.s\n"
"st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
"addvl %[outptr5], %[outptr5], #3\n"
"fmax z16.s, p1/m, z16.s, z1.s\n"
+ "fmin z17.s, p2/m, z17.s, z0.s\n"
"st1w z15.s, p0, [%[outptr6]]\n"
"fmax z17.s, p2/m, z17.s, z1.s\n"
"st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
@@ -1712,142 +1712,142 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
- "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fadd z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "fadd z16.s, z16.s, z2.s\n"
"ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"fmin z13.s, p0/m, z13.s, z0.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "fadd z15.s, z15.s, z4.s\n"
"ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "fadd z16.s, z16.s, z2.s\n"
- "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "fmin z14.s, p1/m, z14.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fadd z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
"fmax z13.s, p0/m, z13.s, z1.s\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
- "fmax z14.s, p1/m, z14.s, z1.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "fmin z15.s, p2/m, z15.s, z0.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "fmin z14.s, p1/m, z14.s, z0.s\n"
+ "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "fadd z15.s, z15.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
"fmin z16.s, p0/m, z16.s, z0.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "fmax z14.s, p1/m, z14.s, z1.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "fadd z17.s, z17.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
- "fmax z15.s, p2/m, z15.s, z1.s\n"
- "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fmin z15.s, p2/m, z15.s, z0.s\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"fmax z16.s, p0/m, z16.s, z1.s\n"
+ "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "fadd z17.s, z17.s, z3.s\n"
"ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "fmin z17.s, p1/m, z17.s, z0.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmax z15.s, p2/m, z15.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"fadd z18.s, z18.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmin z17.s, p1/m, z17.s, z0.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"fadd z19.s, z19.s, z2.s\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "fmax z17.s, p1/m, z17.s, z1.s\n"
- "addvl %[outptr0], %[outptr0], #3\n"
"fmin z18.s, p2/m, z18.s, z0.s\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmax z17.s, p1/m, z17.s, z1.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
"fmin z19.s, p0/m, z19.s, z0.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "fadd z20.s, z20.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"fmax z18.s, p2/m, z18.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fadd z20.s, z20.s, z3.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"fmax z19.s, p0/m, z19.s, z1.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "fmin z20.s, p1/m, z20.s, z0.s\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"fadd z13.s, z13.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmin z20.s, p1/m, z20.s, z0.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"fadd z14.s, z14.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "fmax z20.s, p1/m, z20.s, z1.s\n"
- "addvl %[outptr1], %[outptr1], #3\n"
"fmin z13.s, p2/m, z13.s, z0.s\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmax z20.s, p1/m, z20.s, z1.s\n"
"st1w z19.s, p0, [%[outptr2]]\n"
"fmin z14.s, p0/m, z14.s, z0.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "fadd z15.s, z15.s, z3.s\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"fmax z13.s, p2/m, z13.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fadd z15.s, z15.s, z3.s\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
"fmax z14.s, p0/m, z14.s, z1.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
- "fmin z15.s, p1/m, z15.s, z0.s\n"
- "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
"fadd z16.s, z16.s, z4.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmin z15.s, p1/m, z15.s, z0.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"fadd z17.s, z17.s, z2.s\n"
"ld1w z13.s, p1/z, [x8]\n"
- "fmax z15.s, p1/m, z15.s, z1.s\n"
- "addvl %[outptr2], %[outptr2], #3\n"
"fmin z16.s, p2/m, z16.s, z0.s\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "fmax z15.s, p1/m, z15.s, z1.s\n"
"st1w z14.s, p0, [%[outptr3]]\n"
"fmin z17.s, p0/m, z17.s, z0.s\n"
"ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
- "fadd z18.s, z18.s, z3.s\n"
- "addvl %[inptr], %[inptr], #24\n"
"fmax z16.s, p2/m, z16.s, z1.s\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "fadd z18.s, z18.s, z3.s\n"
"st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
"fmax z17.s, p0/m, z17.s, z1.s\n"
"ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
- "fmin z18.s, p1/m, z18.s, z0.s\n"
"fadd z19.s, z19.s, z4.s\n"
+ "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+ "fmin z18.s, p1/m, z18.s, z0.s\n"
"st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
"fadd z20.s, z20.s, z2.s\n"
"ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
- "fadd z13.s, z13.s, z3.s\n"
+ "fmin z19.s, p2/m, z19.s, z0.s\n"
"addvl %[outptr3], %[outptr3], #3\n"
"fmax z18.s, p1/m, z18.s, z1.s\n"
"st1w z17.s, p0, [%[outptr4]]\n"
- "fmin z19.s, p2/m, z19.s, z0.s\n"
- "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
"fmin z20.s, p0/m, z20.s, z0.s\n"
- "fmin z13.s, p1/m, z13.s, z0.s\n"
- "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
- "fadd z14.s, z14.s, z4.s\n"
- "ld1w z18.s, p0/z, [x8, #5, MUL VL]\n"
+ "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
"fmax z19.s, p2/m, z19.s, z1.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ "fadd z13.s, z13.s, z3.s\n"
+ "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
"fmax z20.s, p0/m, z20.s, z1.s\n"
- "fmax z13.s, p1/m, z13.s, z1.s\n"
- "fmin z14.s, p2/m, z14.s, z0.s\n"
- "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
+ "ld1w z18.s, p0/z, [x8, #5, MUL VL]\n"
+ "fadd z14.s, z14.s, z4.s\n"
"fadd z15.s, z15.s, z2.s\n"
+ "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
+ "fmin z13.s, p1/m, z13.s, z0.s\n"
"ld1w z19.s, p1/z, [x8, #6, MUL VL]\n"
"fadd z16.s, z16.s, z3.s\n"
"addvl %[outptr4], %[outptr4], #3\n"
- "fmax z14.s, p2/m, z14.s, z1.s\n"
+ "fmin z14.s, p2/m, z14.s, z0.s\n"
"st1w z20.s, p0, [%[outptr5]]\n"
- "fmin z15.s, p0/m, z15.s, z0.s\n"
+ "fmax z13.s, p1/m, z13.s, z1.s\n"
"ld1w z20.s, p2/z, [x8, #7, MUL VL]\n"
+ "fmin z15.s, p0/m, z15.s, z0.s\n"
"fmin z16.s, p1/m, z16.s, z0.s\n"
- "fadd z17.s, z17.s, z4.s\n"
+ "fmax z14.s, p2/m, z14.s, z1.s\n"
"st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
- "fadd z18.s, z18.s, z2.s\n"
+ "fadd z17.s, z17.s, z4.s\n"
"fmax z15.s, p0/m, z15.s, z1.s\n"
"fmax z16.s, p1/m, z16.s, z1.s\n"
"st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
- "fmin z17.s, p2/m, z17.s, z0.s\n"
+ "fadd z18.s, z18.s, z2.s\n"
"addvl %[outptr5], %[outptr5], #3\n"
- "fmin z18.s, p0/m, z18.s, z0.s\n"
+ "fmin z17.s, p2/m, z17.s, z0.s\n"
"st1w z15.s, p0, [%[outptr6]]\n"
"fadd z19.s, z19.s, z3.s\n"
- "fmax z17.s, p2/m, z17.s, z1.s\n"
+ "fmin z18.s, p0/m, z18.s, z0.s\n"
"fadd z20.s, z20.s, z4.s\n"
"st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
- "fmax z18.s, p0/m, z18.s, z1.s\n"
+ "fmax z17.s, p2/m, z17.s, z1.s\n"
"fmin z19.s, p1/m, z19.s, z0.s\n"
+ "fmax z18.s, p0/m, z18.s, z1.s\n"
"fmin z20.s, p2/m, z20.s, z0.s\n"
"st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
"addvl %[outptr6], %[outptr6], #3\n"
@@ -1872,4 +1872,4 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
}
}
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_bf16_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_bf16_8x3VL.hpp
new file mode 100644
index 0000000000..5d4a8bf347
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_bf16_8x3VL.hpp
@@ -0,0 +1,2137 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+template<>
+void MergeResults<3, 8, true>(
+ bfloat16 *out_ptr,
+ const float * in_ptr,
+ const int ldout,
+ const int y0, const int ymax,
+ const int x0, const int xmax,
+ const bfloat16 *bias,
+ Activation act,
+ bool accumulate)
+{
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+
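+    // Activation is applied as a clamp to [minval, maxval]; the bounds default
+    // to +/-infinity (no clamping) and are tightened by the cases below.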
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ minval = 0;
+ break;
+ }
+
+ size_t rows = ymax-y0;
+ size_t cols = xmax-x0;
+
+ out_ptr += (y0 * ldout) + x0;
+ bias = (bias == nullptr) ? nullptr : bias + x0;
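+    // Point out_ptr at the (y0, x0) block of the output; the bias, when
+    // present, is advanced to the starting column (a null bias is materialised
+    // as zeros inside the kernel).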
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "cbz %x[cols], 52f\n"
+ "cbz %x[rows], 52f\n"
+ "mov x12, #0x20\n"
+ "dup z12.s, %w[maxval]\n"
+ "dup z11.s, %w[minval]\n"
+ "mul x12, %x[ldout], x12\n"
+ "cbnz %x[accumulate], 34f\n"
+ "1:" // Initial: Row loop
+ "cmp %x[rows], #0x7\n"
+ "bgt 30f\n"
+ "beq 26f\n"
+ "cmp %x[rows], #0x5\n"
+ "bgt 22f\n"
+ "beq 18f\n"
+ "cmp %x[rows], #0x3\n"
+ "bgt 14f\n"
+ "beq 10f\n"
+ "cmp %x[rows], #0x1\n"
+ "bgt 6f\n"
+ "2:" // Initial: Height 1
+ "mov x11, %x[cols]\n"
+ "mov x10, %x[out_ptr]\n"
+ "mov x9, %x[bias]\n"
+ "3:" // Initial: Height 1: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 4f\n"
+ "mov z21.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "b 5f\n"
+ "4:" // Initial: Height 1: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z21.s, z18.s, #0x10\n"
+ "lsl z20.s, z17.s, #0x10\n"
+ "lsl z19.s, z16.s, #0x10\n"
+ "5:" // Initial: Height 1: Width 3: init done
+ "ld1w { z17.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z16.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z18.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "fadd z17.s, z17.s, z21.s\n"
+ "fadd z16.s, z16.s, z20.s\n"
+ "cmp x11, XZR\n"
+ "fadd z18.s, z18.s, z19.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ "st1h { z17.s }, p2, [x10]\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aae50 // bfcvt z16.h, p3/M, z18.s\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ "bgt 3b\n"
+ "b 52f\n"
+ "6:" // Initial: Height 2
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "7:" // Initial: Height 2: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 8f\n"
+ "mov z24.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "b 9f\n"
+ "8:" // Initial: Height 2: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z24.s, z18.s, #0x10\n"
+ "lsl z23.s, z17.s, #0x10\n"
+ "lsl z22.s, z16.s, #0x10\n"
+ "9:" // Initial: Height 2: Width 3: init done
+ "ld1w { z17.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z16.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z21.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "fadd z17.s, z17.s, z24.s\n"
+ "fadd z16.s, z16.s, z23.s\n"
+ "cmp x11, XZR\n"
+ "fadd z19.s, z19.s, z22.s\n"
+ "fadd z18.s, z18.s, z24.s\n"
+ "fadd z21.s, z21.s, z23.s\n"
+ "fadd z20.s, z20.s, z22.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "st1h { z17.s }, p2, [x10]\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aaeb1 // bfcvt z17.h, p3/M, z21.s\n"
+ ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n"
+ "st1h { z19.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x28]\n"
+ "st1h { z17.s }, p1, [x28, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ "bgt 7b\n"
+ "b 52f\n"
+ "10:" // Initial: Height 3
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "11:" // Initial: Height 3: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 12f\n"
+ "mov z27.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "b 13f\n"
+ "12:" // Initial: Height 3: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z27.s, z18.s, #0x10\n"
+ "lsl z26.s, z17.s, #0x10\n"
+ "lsl z25.s, z16.s, #0x10\n"
+ "13:" // Initial: Height 3: Width 3: init done
+ "ld1w { z18.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z17.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z16.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z18.s, z18.s, z27.s\n"
+ "fadd z17.s, z17.s, z26.s\n"
+ "ld1w { z22.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "fadd z16.s, z16.s, z25.s\n"
+ "fadd z21.s, z21.s, z27.s\n"
+ "cmp x11, XZR\n"
+ "fadd z20.s, z20.s, z26.s\n"
+ "fadd z19.s, z19.s, z25.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "fadd z24.s, z24.s, z27.s\n"
+ "fadd z23.s, z23.s, z26.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fadd z22.s, z22.s, z25.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n"
+ "st1h { z18.s }, p2, [x10]\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aaf12 // bfcvt z18.h, p3/M, z24.s\n"
+ "st1h { z17.s }, p1, [x10, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n"
+ ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z21.s }, p2, [x28]\n"
+ "st1h { z20.s }, p1, [x28, #1, MUL VL]\n"
+ "st1h { z19.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x27]\n"
+ "st1h { z17.s }, p1, [x27, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "bgt 11b\n"
+ "b 52f\n"
+ "14:" // Initial: Height 4
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "15:" // Initial: Height 4: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 16f\n"
+ "mov z30.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "b 17f\n"
+ "16:" // Initial: Height 4: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z30.s, z18.s, #0x10\n"
+ "lsl z29.s, z17.s, #0x10\n"
+ "lsl z28.s, z16.s, #0x10\n"
+ "17:" // Initial: Height 4: Width 3: init done
+ "ld1w { z18.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z17.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z16.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z22.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z18.s, z18.s, z30.s\n"
+ "fadd z17.s, z17.s, z29.s\n"
+ "ld1w { z19.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "fadd z16.s, z16.s, z28.s\n"
+ "fadd z24.s, z24.s, z30.s\n"
+ "ld1w { z26.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fadd z23.s, z23.s, z29.s\n"
+ "fadd z22.s, z22.s, z28.s\n"
+ "fadd z21.s, z21.s, z30.s\n"
+ "fadd z20.s, z20.s, z29.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fadd z19.s, z19.s, z28.s\n"
+ "fadd z27.s, z27.s, z30.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fadd z26.s, z26.s, z29.s\n"
+ "fadd z25.s, z25.s, z28.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n"
+ ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n"
+ "cmp x11, XZR\n"
+ "st1h { z18.s }, p2, [x10]\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n"
+ "st1h { z17.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aaf72 // bfcvt z18.h, p3/M, z27.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n"
+ ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z24.s }, p2, [x28]\n"
+ "st1h { z23.s }, p1, [x28, #1, MUL VL]\n"
+ "st1h { z22.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z21.s }, p2, [x27]\n"
+ "st1h { z20.s }, p1, [x27, #1, MUL VL]\n"
+ "st1h { z19.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x26]\n"
+ "st1h { z17.s }, p1, [x26, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "bgt 15b\n"
+ "b 52f\n"
+ "18:" // Initial: Height 5
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "19:" // Initial: Height 5: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 20f\n"
+ "mov z1.b, #0x0\n"
+ "mov z0.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "b 21f\n"
+ "20:" // Initial: Height 5: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z1.s, z18.s, #0x10\n"
+ "lsl z0.s, z17.s, #0x10\n"
+ "lsl z31.s, z16.s, #0x10\n"
+ "21:" // Initial: Height 5: Width 3: init done
+ "ld1w { z21.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z20.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z21.s, z21.s, z1.s\n"
+ "fadd z20.s, z20.s, z0.s\n"
+ "ld1w { z22.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "fadd z19.s, z19.s, z31.s\n"
+ "fadd z18.s, z18.s, z1.s\n"
+ "ld1w { z29.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "ld1w { z28.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fadd z17.s, z17.s, z0.s\n"
+ "fadd z16.s, z16.s, z31.s\n"
+ "ld1w { z27.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "fadd z24.s, z24.s, z1.s\n"
+ "fadd z23.s, z23.s, z0.s\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "fadd z22.s, z22.s, z31.s\n"
+ "fadd z30.s, z30.s, z1.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fadd z29.s, z29.s, z0.s\n"
+ "fadd z28.s, z28.s, z31.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fadd z27.s, z27.s, z1.s\n"
+ "fadd z26.s, z26.s, z0.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fadd z25.s, z25.s, z31.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "st1h { z21.s }, p2, [x10]\n"
+ ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "cmp x11, XZR\n"
+ "st1h { z20.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n"
+ "st1h { z19.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n"
+ ".inst 0x658aafd5 // bfcvt z21.h, p3/M, z30.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x28]\n"
+ ".inst 0x658aafb4 // bfcvt z20.h, p3/M, z29.s\n"
+ ".inst 0x658aaf93 // bfcvt z19.h, p3/M, z28.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z17.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aaf72 // bfcvt z18.h, p3/M, z27.s\n"
+ ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z24.s }, p2, [x27]\n"
+ "st1h { z23.s }, p1, [x27, #1, MUL VL]\n"
+ "st1h { z22.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z21.s }, p2, [x26]\n"
+ "st1h { z20.s }, p1, [x26, #1, MUL VL]\n"
+ "st1h { z19.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x25]\n"
+ "st1h { z17.s }, p1, [x25, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "bgt 19b\n"
+ "b 52f\n"
+ "22:" // Initial: Height 6
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "23:" // Initial: Height 6: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 24f\n"
+ "mov z4.b, #0x0\n"
+ "mov z3.b, #0x0\n"
+ "mov z2.b, #0x0\n"
+ "b 25f\n"
+ "24:" // Initial: Height 6: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z4.s, z18.s, #0x10\n"
+ "lsl z3.s, z17.s, #0x10\n"
+ "lsl z2.s, z16.s, #0x10\n"
+ "25:" // Initial: Height 6: Width 3: init done
+ "ld1w { z17.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z16.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z21.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z18.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z17.s, z17.s, z4.s\n"
+ "fadd z16.s, z16.s, z3.s\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "fadd z21.s, z21.s, z2.s\n"
+ "fadd z20.s, z20.s, z4.s\n"
+ "ld1w { z23.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "ld1w { z22.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fadd z19.s, z19.s, z3.s\n"
+ "fadd z18.s, z18.s, z2.s\n"
+ "ld1w { z31.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "fadd z1.s, z1.s, z4.s\n"
+ "fadd z0.s, z0.s, z3.s\n"
+ "ld1w { z29.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x20, #-1, MUL VL]\n"
+ "fadd z25.s, z25.s, z2.s\n"
+ "fadd z24.s, z24.s, z4.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "fadd z23.s, z23.s, z3.s\n"
+ "fadd z22.s, z22.s, z2.s\n"
+ "fadd z31.s, z31.s, z4.s\n"
+ "fadd z30.s, z30.s, z3.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fadd z29.s, z29.s, z2.s\n"
+ "fadd z28.s, z28.s, z4.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fadd z27.s, z27.s, z3.s\n"
+ "fadd z26.s, z26.s, z2.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "st1h { z17.s }, p2, [x10]\n"
+ ".inst 0x658aac31 // bfcvt z17.h, p3/M, z1.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aac10 // bfcvt z16.h, p3/M, z0.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "st1h { z21.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aaf39 // bfcvt z25.h, p3/M, z25.s\n"
+ "cmp x11, XZR\n"
+ "st1h { z20.s }, p2, [x28]\n"
+ ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n"
+ ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n"
+ "st1h { z19.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n"
+ ".inst 0x658aaff5 // bfcvt z21.h, p3/M, z31.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z18.s }, p0, [x28, #2, MUL VL]\n"
+ ".inst 0x658aafd4 // bfcvt z20.h, p3/M, z30.s\n"
+ ".inst 0x658aafb3 // bfcvt z19.h, p3/M, z29.s\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z17.s }, p2, [x27]\n"
+ ".inst 0x658aaf92 // bfcvt z18.h, p3/M, z28.s\n"
+ ".inst 0x658aaf71 // bfcvt z17.h, p3/M, z27.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z16.s }, p1, [x27, #1, MUL VL]\n"
+ ".inst 0x658aaf50 // bfcvt z16.h, p3/M, z26.s\n"
+ "st1h { z25.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z24.s }, p2, [x26]\n"
+ "st1h { z23.s }, p1, [x26, #1, MUL VL]\n"
+ "st1h { z22.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z21.s }, p2, [x25]\n"
+ "st1h { z20.s }, p1, [x25, #1, MUL VL]\n"
+ "st1h { z19.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x24]\n"
+ "st1h { z17.s }, p1, [x24, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x24, #2, MUL VL]\n"
+ "inch x24, ALL, MUL #3\n"
+ "bgt 23b\n"
+ "b 52f\n"
+ "26:" // Initial: Height 7
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "27:" // Initial: Height 7: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 28f\n"
+ "mov z7.b, #0x0\n"
+ "mov z6.b, #0x0\n"
+ "mov z5.b, #0x0\n"
+ "b 29f\n"
+ "28:" // Initial: Height 7: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z7.s, z18.s, #0x10\n"
+ "lsl z6.s, z17.s, #0x10\n"
+ "lsl z5.s, z16.s, #0x10\n"
+ "29:" // Initial: Height 7: Width 3: init done
+ "ld1w { z19.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z18.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z17.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z21.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z3.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z19.s, z19.s, z7.s\n"
+ "fadd z18.s, z18.s, z6.s\n"
+ "ld1w { z2.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "fadd z17.s, z17.s, z5.s\n"
+ "fadd z16.s, z16.s, z7.s\n"
+ "ld1w { z26.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fadd z21.s, z21.s, z6.s\n"
+ "fadd z20.s, z20.s, z5.s\n"
+ "ld1w { z24.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "fadd z4.s, z4.s, z7.s\n"
+ "fadd z3.s, z3.s, z6.s\n"
+ "ld1w { z22.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "ld1w { z0.s }, p2/Z, [x20, #-1, MUL VL]\n"
+ "fadd z2.s, z2.s, z5.s\n"
+ "fadd z1.s, z1.s, z7.s\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ "ld1w { z30.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "fadd z26.s, z26.s, z6.s\n"
+ "fadd z25.s, z25.s, z5.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z28.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "fadd z24.s, z24.s, z7.s\n"
+ "fadd z23.s, z23.s, z6.s\n"
+ "ld1w { z27.s }, p0/Z, [x20, #4, MUL VL]\n"
+ "fadd z22.s, z22.s, z5.s\n"
+ "fadd z0.s, z0.s, z7.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fadd z31.s, z31.s, z6.s\n"
+ "fadd z30.s, z30.s, z5.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fadd z29.s, z29.s, z7.s\n"
+ "fadd z28.s, z28.s, z6.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fadd z27.s, z27.s, z5.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmin z4.s, p3/M, z4.s, z12.s\n"
+ "fmin z3.s, p3/M, z3.s, z12.s\n"
+ "fmin z2.s, p3/M, z2.s, z12.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z4.s, p3/M, z4.s, z11.s\n"
+ "fmax z3.s, p3/M, z3.s, z11.s\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "fmax z2.s, p3/M, z2.s, z11.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "st1h { z19.s }, p2, [x10]\n"
+ ".inst 0x658aac93 // bfcvt z19.h, p3/M, z4.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ "st1h { z18.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aac72 // bfcvt z18.h, p3/M, z3.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "st1h { z17.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aac51 // bfcvt z17.h, p3/M, z2.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "st1h { z16.s }, p2, [x28]\n"
+ ".inst 0x658aac30 // bfcvt z16.h, p3/M, z1.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "cmp x11, XZR\n"
+ "st1h { z21.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aaf5a // bfcvt z26.h, p3/M, z26.s\n"
+ "st1h { z20.s }, p0, [x28, #2, MUL VL]\n"
+ ".inst 0x658aaf39 // bfcvt z25.h, p3/M, z25.s\n"
+ ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z19.s }, p2, [x27]\n"
+ ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n"
+ ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z18.s }, p1, [x27, #1, MUL VL]\n"
+ ".inst 0x658aac15 // bfcvt z21.h, p3/M, z0.s\n"
+ ".inst 0x658aaff4 // bfcvt z20.h, p3/M, z31.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z17.s }, p0, [x27, #2, MUL VL]\n"
+ ".inst 0x658aafd3 // bfcvt z19.h, p3/M, z30.s\n"
+ ".inst 0x658aafb2 // bfcvt z18.h, p3/M, z29.s\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z16.s }, p2, [x26]\n"
+ ".inst 0x658aaf91 // bfcvt z17.h, p3/M, z28.s\n"
+ ".inst 0x658aaf70 // bfcvt z16.h, p3/M, z27.s\n"
+ "st1h { z26.s }, p1, [x26, #1, MUL VL]\n"
+ "st1h { z25.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z24.s }, p2, [x25]\n"
+ "st1h { z23.s }, p1, [x25, #1, MUL VL]\n"
+ "st1h { z22.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "st1h { z21.s }, p2, [x24]\n"
+ "st1h { z20.s }, p1, [x24, #1, MUL VL]\n"
+ "st1h { z19.s }, p0, [x24, #2, MUL VL]\n"
+ "inch x24, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x23]\n"
+ "st1h { z17.s }, p1, [x23, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x23, #2, MUL VL]\n"
+ "inch x23, ALL, MUL #3\n"
+ "bgt 27b\n"
+ "b 52f\n"
+ "30:" // Initial: Height 8
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "add x22, x23, %x[ldout], LSL #1\n"
+ "31:" // Initial: Height 8: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 32f\n"
+ "mov z10.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z8.b, #0x0\n"
+ "b 33f\n"
+ "32:" // Initial: Height 8: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z10.s, z18.s, #0x10\n"
+ "lsl z9.s, z17.s, #0x10\n"
+ "lsl z8.s, z16.s, #0x10\n"
+ "33:" // Initial: Height 8: Width 3: init done
+ "ld1w { z21.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z20.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z7.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z6.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z21.s, z21.s, z10.s\n"
+ "fadd z20.s, z20.s, z9.s\n"
+ "ld1w { z5.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "fadd z19.s, z19.s, z8.s\n"
+ "fadd z18.s, z18.s, z10.s\n"
+ "ld1w { z3.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "ld1w { z2.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fadd z17.s, z17.s, z9.s\n"
+ "fadd z16.s, z16.s, z8.s\n"
+ "ld1w { z27.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "fadd z7.s, z7.s, z10.s\n"
+ "fadd z6.s, z6.s, z9.s\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x20, #-1, MUL VL]\n"
+ "fadd z5.s, z5.s, z8.s\n"
+ "fadd z4.s, z4.s, z10.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "fadd z3.s, z3.s, z9.s\n"
+ "fadd z2.s, z2.s, z8.s\n"
+ "ld1w { z1.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "fadd z27.s, z27.s, z10.s\n"
+ "fadd z26.s, z26.s, z9.s\n"
+ "ld1w { z31.s }, p0/Z, [x20, #4, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #5, MUL VL]\n"
+ "fadd z25.s, z25.s, z8.s\n"
+ "fadd z24.s, z24.s, z10.s\n"
+ "ld1w { z29.s }, p1/Z, [x20, #6, MUL VL]\n"
+ "ld1w { z28.s }, p0/Z, [x20, #7, MUL VL]\n"
+ "fadd z23.s, z23.s, z9.s\n"
+ "fadd z22.s, z22.s, z8.s\n"
+ "fadd z1.s, z1.s, z10.s\n"
+ "fadd z0.s, z0.s, z9.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fadd z31.s, z31.s, z8.s\n"
+ "fadd z30.s, z30.s, z10.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fadd z29.s, z29.s, z9.s\n"
+ "fadd z28.s, z28.s, z8.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z7.s, p3/M, z7.s, z12.s\n"
+ "fmin z6.s, p3/M, z6.s, z12.s\n"
+ "fmin z5.s, p3/M, z5.s, z12.s\n"
+ "fmin z4.s, p3/M, z4.s, z12.s\n"
+ "fmin z3.s, p3/M, z3.s, z12.s\n"
+ "fmin z2.s, p3/M, z2.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z7.s, p3/M, z7.s, z11.s\n"
+ "fmax z6.s, p3/M, z6.s, z11.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n"
+ "fmax z5.s, p3/M, z5.s, z11.s\n"
+ "fmax z4.s, p3/M, z4.s, z11.s\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "fmax z3.s, p3/M, z3.s, z11.s\n"
+ "fmax z2.s, p3/M, z2.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "st1h { z21.s }, p2, [x10]\n"
+ ".inst 0x658aacf5 // bfcvt z21.h, p3/M, z7.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "st1h { z20.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aacd4 // bfcvt z20.h, p3/M, z6.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ "st1h { z19.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aacb3 // bfcvt z19.h, p3/M, z5.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ "st1h { z18.s }, p2, [x28]\n"
+ ".inst 0x658aac92 // bfcvt z18.h, p3/M, z4.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "st1h { z17.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aac71 // bfcvt z17.h, p3/M, z3.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ ".inst 0x658aac50 // bfcvt z16.h, p3/M, z2.s\n"
+ "cmp x11, XZR\n"
+ "st1h { z21.s }, p2, [x27]\n"
+ ".inst 0x658aaf7b // bfcvt z27.h, p3/M, z27.s\n"
+ ".inst 0x658aaf5a // bfcvt z26.h, p3/M, z26.s\n"
+ "st1h { z20.s }, p1, [x27, #1, MUL VL]\n"
+ ".inst 0x658aaf39 // bfcvt z25.h, p3/M, z25.s\n"
+ ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z19.s }, p0, [x27, #2, MUL VL]\n"
+ ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n"
+ ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x26]\n"
+ ".inst 0x658aac35 // bfcvt z21.h, p3/M, z1.s\n"
+ ".inst 0x658aac14 // bfcvt z20.h, p3/M, z0.s\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z17.s }, p1, [x26, #1, MUL VL]\n"
+ ".inst 0x658aaff3 // bfcvt z19.h, p3/M, z31.s\n"
+ ".inst 0x658aafd2 // bfcvt z18.h, p3/M, z30.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z16.s }, p0, [x26, #2, MUL VL]\n"
+ ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n"
+ ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z27.s }, p2, [x25]\n"
+ "st1h { z26.s }, p1, [x25, #1, MUL VL]\n"
+ "st1h { z25.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "st1h { z24.s }, p2, [x24]\n"
+ "st1h { z23.s }, p1, [x24, #1, MUL VL]\n"
+ "st1h { z22.s }, p0, [x24, #2, MUL VL]\n"
+ "inch x24, ALL, MUL #3\n"
+ "st1h { z21.s }, p2, [x23]\n"
+ "st1h { z20.s }, p1, [x23, #1, MUL VL]\n"
+ "st1h { z19.s }, p0, [x23, #2, MUL VL]\n"
+ "inch x23, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x22]\n"
+ "st1h { z17.s }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x22, #2, MUL VL]\n"
+ "inch x22, ALL, MUL #3\n"
+ "bgt 31b\n"
+ "subs %x[rows], %x[rows], #0x8\n"
+ "add %x[out_ptr], %x[out_ptr], x12\n"
+ "bgt 1b\n"
+ "b 52f\n"
+ "34:" // Accumulate
+ "35:" // Accumulate: Row loop
+ "cmp %x[rows], #0x7\n"
+ "bgt 50f\n"
+ "beq 48f\n"
+ "cmp %x[rows], #0x5\n"
+ "bgt 46f\n"
+ "beq 44f\n"
+ "cmp %x[rows], #0x3\n"
+ "bgt 42f\n"
+ "beq 40f\n"
+ "cmp %x[rows], #0x1\n"
+ "bgt 38f\n"
+ "36:" // Accumulate: Height 1
+ "mov x11, %x[cols]\n"
+ "mov x10, %x[out_ptr]\n"
+ "37:" // Accumulate: Height 1: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z16.s }, p2/Z, [x10]\n"
+ "ld1w { z19.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "fadd z19.s, z19.s, z16.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "ld1w { z18.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "whilelt p0.s, x21, x11\n"
+ "decw x11, ALL, MUL #3\n"
+ "incw x21\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "ld1w { z17.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "cmp x11, XZR\n"
+ ".inst 0x658aae70 // bfcvt z16.h, p3/M, z19.s\n"
+ "st1h { z16.s }, p2, [x10]\n"
+ "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z18.s, z18.s, z16.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ ".inst 0x658aae50 // bfcvt z16.h, p3/M, z18.s\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z17.s, z17.s, z16.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ ".inst 0x658aae30 // bfcvt z16.h, p3/M, z17.s\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ "bgt 37b\n"
+ "b 52f\n"
+ "38:" // Accumulate: Height 2
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "39:" // Accumulate: Height 2: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z17.s }, p2/Z, [x10]\n"
+ "ld1h { z16.s }, p2/Z, [x28]\n"
+ "ld1w { z23.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "ld1w { z22.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "fadd z23.s, z23.s, z17.s\n"
+ "fadd z22.s, z22.s, z16.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "ld1w { z21.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "whilelt p0.s, x21, x11\n"
+ "decw x11, ALL, MUL #3\n"
+ "incw x21\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z18.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "cmp x11, XZR\n"
+ ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n"
+ ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n"
+ "st1h { z17.s }, p2, [x10]\n"
+ "st1h { z16.s }, p2, [x28]\n"
+ "ld1h { z17.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z21.s, z21.s, z17.s\n"
+ "fadd z20.s, z20.s, z16.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ ".inst 0x658aaeb0 // bfcvt z16.h, p3/M, z21.s\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n"
+ "ld1h { z17.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z16.s }, p1, [x28, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z19.s, z19.s, z17.s\n"
+ "fadd z18.s, z18.s, z16.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ ".inst 0x658aae70 // bfcvt z16.h, p3/M, z19.s\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ ".inst 0x658aae50 // bfcvt z16.h, p3/M, z18.s\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ "bgt 39b\n"
+ "b 52f\n"
+ "40:" // Accumulate: Height 3
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "41:" // Accumulate: Height 3: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z18.s }, p2/Z, [x10]\n"
+ "ld1h { z17.s }, p2/Z, [x28]\n"
+ "ld1h { z16.s }, p2/Z, [x27]\n"
+ "ld1w { z26.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z19.s, z18.s, #0x10\n"
+ "ld1w { z25.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "ld1w { z18.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z26.s, z26.s, z19.s\n"
+ "fadd z25.s, z25.s, z17.s\n"
+ "ld1w { z24.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z18.s, z18.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "whilelt p0.s, x21, x11\n"
+ "decw x11, ALL, MUL #3\n"
+ "incw x21\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "ld1w { z21.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "ld1w { z19.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "cmp x11, XZR\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n"
+ ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n"
+ "st1h { z17.s }, p2, [x10]\n"
+ "st1h { z16.s }, p2, [x28]\n"
+ ".inst 0x658aae51 // bfcvt z17.h, p3/M, z18.s\n"
+ "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "st1h { z17.s }, p2, [x27]\n"
+ "ld1h { z17.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "lsl z18.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z24.s, z24.s, z18.s\n"
+ "fadd z23.s, z23.s, z17.s\n"
+ "fadd z22.s, z22.s, z16.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aaf10 // bfcvt z16.h, p3/M, z24.s\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aaef2 // bfcvt z18.h, p3/M, z23.s\n"
+ ".inst 0x658aaed1 // bfcvt z17.h, p3/M, z22.s\n"
+ "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z18.s }, p1, [x28, #1, MUL VL]\n"
+ "st1h { z17.s }, p1, [x27, #1, MUL VL]\n"
+ "ld1h { z17.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "lsl z18.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z21.s, z21.s, z18.s\n"
+ "fadd z20.s, z20.s, z17.s\n"
+ "fadd z19.s, z19.s, z16.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ ".inst 0x658aaeb0 // bfcvt z16.h, p3/M, z21.s\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ ".inst 0x658aae91 // bfcvt z17.h, p3/M, z20.s\n"
+ ".inst 0x658aae70 // bfcvt z16.h, p3/M, z19.s\n"
+ "st1h { z17.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "bgt 41b\n"
+ "b 52f\n"
+ "42:" // Accumulate: Height 4
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "43:" // Accumulate: Height 4: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z19.s }, p2/Z, [x10]\n"
+ "ld1h { z18.s }, p2/Z, [x28]\n"
+ "ld1h { z17.s }, p2/Z, [x27]\n"
+ "ld1h { z16.s }, p2/Z, [x26]\n"
+ "ld1w { z30.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z20.s, z19.s, #0x10\n"
+ "ld1w { z29.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "ld1w { z28.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z30.s, z30.s, z20.s\n"
+ "fadd z29.s, z29.s, z18.s\n"
+ "ld1w { z27.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z25.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "whilelt p0.s, x21, x11\n"
+ "decw x11, ALL, MUL #3\n"
+ "fadd z28.s, z28.s, z17.s\n"
+ "fadd z19.s, z19.s, z16.s\n"
+ "incw x21\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "ld1w { z23.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z22.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "ld1w { z21.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "cmp x11, XZR\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ ".inst 0x658aafd2 // bfcvt z18.h, p3/M, z30.s\n"
+ ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n"
+ ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n"
+ "st1h { z18.s }, p2, [x10]\n"
+ "st1h { z17.s }, p2, [x28]\n"
+ ".inst 0x658aae71 // bfcvt z17.h, p3/M, z19.s\n"
+ "st1h { z16.s }, p2, [x27]\n"
+ "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "st1h { z17.s }, p2, [x26]\n"
+ "ld1h { z18.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z17.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "lsl z19.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "fadd z27.s, z27.s, z19.s\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z26.s, z26.s, z18.s\n"
+ "fadd z25.s, z25.s, z17.s\n"
+ "fadd z24.s, z24.s, z16.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ ".inst 0x658aaf71 // bfcvt z17.h, p3/M, z27.s\n"
+ ".inst 0x658aaf50 // bfcvt z16.h, p3/M, z26.s\n"
+ "st1h { z17.s }, p1, [x10, #1, MUL VL]\n"
+ "st1h { z16.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aaf32 // bfcvt z18.h, p3/M, z25.s\n"
+ ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n"
+ "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z18.s }, p1, [x27, #1, MUL VL]\n"
+ "st1h { z17.s }, p1, [x26, #1, MUL VL]\n"
+ "ld1h { z18.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "lsl z19.s, z16.s, #0x10\n"
+ "ld1h { z17.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "fadd z23.s, z23.s, z19.s\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z22.s, z22.s, z18.s\n"
+ "fadd z21.s, z21.s, z17.s\n"
+ "fadd z20.s, z20.s, z16.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n"
+ ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n"
+ "st1h { z17.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ ".inst 0x658aaeb1 // bfcvt z17.h, p3/M, z21.s\n"
+ ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n"
+ "st1h { z17.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "bgt 43b\n"
+ "b 52f\n"
+ "44:" // Accumulate: Height 5
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "45:" // Accumulate: Height 5: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z20.s }, p2/Z, [x10]\n"
+ "ld1h { z19.s }, p2/Z, [x28]\n"
+ "ld1h { z18.s }, p2/Z, [x27]\n"
+ "ld1h { z17.s }, p2/Z, [x26]\n"
+ "ld1h { z16.s }, p2/Z, [x25]\n"
+ "ld1w { z1.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z22.s, z20.s, #0x10\n"
+ "ld1w { z0.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "lsl z21.s, z19.s, #0x10\n"
+ "ld1w { z31.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "whilelt p1.s, x21, x11\n"
+ "lsl z19.s, z18.s, #0x10\n"
+ "ld1w { z20.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "lsl z18.s, z17.s, #0x10\n"
+ "ld1w { z17.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z1.s, z1.s, z22.s\n"
+ "incw x21\n"
+ "fadd z0.s, z0.s, z21.s\n"
+ "ld1w { z30.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z29.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "fadd z31.s, z31.s, z19.s\n"
+ "fadd z20.s, z20.s, z18.s\n"
+ "ld1w { z28.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "fadd z17.s, z17.s, z16.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z26.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "whilelt p0.s, x21, x11\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z25.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z24.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ "ld1w { z23.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z22.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "ld1w { z21.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ ".inst 0x658aac30 // bfcvt z16.h, p3/M, z1.s\n"
+ "cmp x11, XZR\n"
+ "incw x21\n"
+ ".inst 0x658aac13 // bfcvt z19.h, p3/M, z0.s\n"
+ ".inst 0x658aaff2 // bfcvt z18.h, p3/M, z31.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z16.s }, p2, [x10]\n"
+ ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ "st1h { z19.s }, p2, [x28]\n"
+ "st1h { z18.s }, p2, [x27]\n"
+ "st1h { z16.s }, p2, [x26]\n"
+ "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "st1h { z17.s }, p2, [x25]\n"
+ "ld1h { z19.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z18.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z17.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "lsl z20.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z30.s, z30.s, z20.s\n"
+ "fadd z29.s, z29.s, z19.s\n"
+ "fadd z28.s, z28.s, z18.s\n"
+ "fadd z27.s, z27.s, z17.s\n"
+ "fadd z26.s, z26.s, z16.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ ".inst 0x658aafd2 // bfcvt z18.h, p3/M, z30.s\n"
+ ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n"
+ ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n"
+ "st1h { z18.s }, p1, [x10, #1, MUL VL]\n"
+ "st1h { z17.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aaf72 // bfcvt z18.h, p3/M, z27.s\n"
+ ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n"
+ "st1h { z16.s }, p1, [x27, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z18.s }, p1, [x26, #1, MUL VL]\n"
+ "st1h { z17.s }, p1, [x25, #1, MUL VL]\n"
+ "ld1h { z19.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z18.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "lsl z20.s, z16.s, #0x10\n"
+ "ld1h { z17.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z25.s, z25.s, z20.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z24.s, z24.s, z19.s\n"
+ "fadd z23.s, z23.s, z18.s\n"
+ "fadd z22.s, z22.s, z17.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fadd z21.s, z21.s, z16.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aaf31 // bfcvt z17.h, p3/M, z25.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ ".inst 0x658aaf10 // bfcvt z16.h, p3/M, z24.s\n"
+ "st1h { z17.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ ".inst 0x658aaef2 // bfcvt z18.h, p3/M, z23.s\n"
+ ".inst 0x658aaed1 // bfcvt z17.h, p3/M, z22.s\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ ".inst 0x658aaeb0 // bfcvt z16.h, p3/M, z21.s\n"
+ "st1h { z18.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z17.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "bgt 45b\n"
+ "b 52f\n"
+ "46:" // Accumulate: Height 6
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "47:" // Accumulate: Height 6: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z21.s }, p2/Z, [x10]\n"
+ "ld1h { z20.s }, p2/Z, [x28]\n"
+ "ld1h { z19.s }, p2/Z, [x27]\n"
+ "ld1h { z18.s }, p2/Z, [x26]\n"
+ "ld1h { z17.s }, p2/Z, [x25]\n"
+ "ld1h { z16.s }, p2/Z, [x24]\n"
+ "ld1w { z6.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z22.s, z21.s, #0x10\n"
+ "ld1w { z5.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "lsl z21.s, z20.s, #0x10\n"
+ "ld1w { z4.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "lsl z20.s, z19.s, #0x10\n"
+ "ld1w { z3.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "lsl z19.s, z18.s, #0x10\n"
+ "ld1w { z2.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "ld1w { z18.s }, p2/Z, [x20, #-1, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z6.s, z6.s, z22.s\n"
+ "fadd z5.s, z5.s, z21.s\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "fadd z4.s, z4.s, z20.s\n"
+ "fadd z3.s, z3.s, z19.s\n"
+ "fadd z2.s, z2.s, z17.s\n"
+ "fadd z18.s, z18.s, z16.s\n"
+ "fmin z6.s, p3/M, z6.s, z12.s\n"
+ "fmin z5.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z1.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "whilelt p0.s, x21, x11\n"
+ "decw x11, ALL, MUL #3\n"
+ "fmin z4.s, p3/M, z4.s, z12.s\n"
+ "fmin z3.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z31.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "fmin z2.s, p3/M, z2.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "ld1w { z29.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "fmax z6.s, p3/M, z6.s, z11.s\n"
+ "fmax z5.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z27.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z26.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "fmax z4.s, p3/M, z4.s, z11.s\n"
+ "fmax z3.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z24.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fmax z2.s, p3/M, z2.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "ld1w { z23.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "ld1w { z22.s }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x658aacd5 // bfcvt z21.h, p3/M, z6.s\n"
+ ".inst 0x658aacb4 // bfcvt z20.h, p3/M, z5.s\n"
+ "cmp x11, XZR\n"
+ "incw x21\n"
+ ".inst 0x658aac93 // bfcvt z19.h, p3/M, z4.s\n"
+ ".inst 0x658aac71 // bfcvt z17.h, p3/M, z3.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ ".inst 0x658aac50 // bfcvt z16.h, p3/M, z2.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "st1h { z21.s }, p2, [x10]\n"
+ "st1h { z20.s }, p2, [x28]\n"
+ "st1h { z19.s }, p2, [x27]\n"
+ "st1h { z17.s }, p2, [x26]\n"
+ "ld1h { z17.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "st1h { z16.s }, p2, [x25]\n"
+ "ld1h { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "st1h { z18.s }, p2, [x24]\n"
+ "ld1h { z19.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z18.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "lsl z21.s, z17.s, #0x10\n"
+ "ld1h { z17.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "lsl z20.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z1.s, z1.s, z21.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z0.s, z0.s, z20.s\n"
+ "fadd z31.s, z31.s, z19.s\n"
+ "fadd z30.s, z30.s, z18.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "fadd z29.s, z29.s, z17.s\n"
+ "fadd z28.s, z28.s, z16.s\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ ".inst 0x658aac34 // bfcvt z20.h, p3/M, z1.s\n"
+ ".inst 0x658aac12 // bfcvt z18.h, p3/M, z0.s\n"
+ ".inst 0x658aaff3 // bfcvt z19.h, p3/M, z31.s\n"
+ ".inst 0x658aafd1 // bfcvt z17.h, p3/M, z30.s\n"
+ ".inst 0x658aafb0 // bfcvt z16.h, p3/M, z29.s\n"
+ "st1h { z20.s }, p1, [x10, #1, MUL VL]\n"
+ "st1h { z18.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aaf92 // bfcvt z18.h, p3/M, z28.s\n"
+ "st1h { z19.s }, p1, [x27, #1, MUL VL]\n"
+ "st1h { z17.s }, p1, [x26, #1, MUL VL]\n"
+ "ld1h { z17.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z16.s }, p1, [x25, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "st1h { z18.s }, p1, [x24, #1, MUL VL]\n"
+ "ld1h { z19.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z18.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "lsl z21.s, z17.s, #0x10\n"
+ "ld1h { z17.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "lsl z20.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p0/Z, [x24, #2, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z27.s, z27.s, z21.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z26.s, z26.s, z20.s\n"
+ "fadd z25.s, z25.s, z19.s\n"
+ "fadd z24.s, z24.s, z18.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fadd z23.s, z23.s, z17.s\n"
+ "fadd z22.s, z22.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aaf74 // bfcvt z20.h, p3/M, z27.s\n"
+ ".inst 0x658aaf50 // bfcvt z16.h, p3/M, z26.s\n"
+ ".inst 0x658aaf33 // bfcvt z19.h, p3/M, z25.s\n"
+ ".inst 0x658aaf12 // bfcvt z18.h, p3/M, z24.s\n"
+ ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n"
+ "st1h { z20.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z19.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z18.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z17.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x24, #2, MUL VL]\n"
+ "inch x24, ALL, MUL #3\n"
+ "bgt 47b\n"
+ "b 52f\n"
+ "48:" // Accumulate: Height 7
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "49:" // Accumulate: Height 7: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z22.s }, p2/Z, [x10]\n"
+ "ld1h { z21.s }, p2/Z, [x28]\n"
+ "ld1h { z20.s }, p2/Z, [x27]\n"
+ "ld1h { z19.s }, p2/Z, [x26]\n"
+ "ld1h { z18.s }, p2/Z, [x25]\n"
+ "ld1h { z17.s }, p2/Z, [x24]\n"
+ "ld1h { z16.s }, p2/Z, [x23]\n"
+ "ld1w { z8.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z25.s, z22.s, #0x10\n"
+ "lsl z24.s, z21.s, #0x10\n"
+ "ld1w { z21.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z7.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "lsl z20.s, z20.s, #0x10\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "ld1w { z23.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "ld1w { z5.s }, p2/Z, [x20, #-1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z8.s, z8.s, z25.s\n"
+ "fadd z21.s, z21.s, z24.s\n"
+ "fadd z7.s, z7.s, z20.s\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "fadd z23.s, z23.s, z19.s\n"
+ "fadd z6.s, z6.s, z18.s\n"
+ "fadd z5.s, z5.s, z17.s\n"
+ "fadd z22.s, z22.s, z16.s\n"
+ "fmin z8.s, p3/M, z8.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z7.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z4.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z3.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "whilelt p0.s, x21, x11\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z6.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z2.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "ld1w { z1.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "fmin z5.s, p3/M, z5.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "ld1w { z0.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ "fmax z8.s, p3/M, z8.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "ld1w { z30.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z29.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "fmax z7.s, p3/M, z7.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "ld1w { z28.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "fmax z6.s, p3/M, z6.s, z11.s\n"
+ "fmax z5.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z26.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aad13 // bfcvt z19.h, p3/M, z8.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ "ld1w { z24.s }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x658aacf4 // bfcvt z20.h, p3/M, z7.s\n"
+ ".inst 0x658aaef2 // bfcvt z18.h, p3/M, z23.s\n"
+ "ld1w { z23.s }, p0/Z, [x20, #4, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ ".inst 0x658aacd1 // bfcvt z17.h, p3/M, z6.s\n"
+ ".inst 0x658aacb0 // bfcvt z16.h, p3/M, z5.s\n"
+ "incw x21\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z19.s }, p2, [x10]\n"
+ ".inst 0x658aaed3 // bfcvt z19.h, p3/M, z22.s\n"
+ "st1h { z21.s }, p2, [x28]\n"
+ "cmp x11, XZR\n"
+ "st1h { z20.s }, p2, [x27]\n"
+ "st1h { z18.s }, p2, [x26]\n"
+ "ld1h { z18.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "st1h { z17.s }, p2, [x25]\n"
+ "ld1h { z17.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "st1h { z16.s }, p2, [x24]\n"
+ "ld1h { z16.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "st1h { z19.s }, p2, [x23]\n"
+ "ld1h { z19.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "lsl z22.s, z18.s, #0x10\n"
+ "ld1h { z18.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "lsl z21.s, z17.s, #0x10\n"
+ "ld1h { z17.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "lsl z20.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p1/Z, [x23, #1, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z4.s, z4.s, z22.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z3.s, z3.s, z21.s\n"
+ "fadd z2.s, z2.s, z20.s\n"
+ "fadd z1.s, z1.s, z19.s\n"
+ "fadd z0.s, z0.s, z18.s\n"
+ "fadd z31.s, z31.s, z17.s\n"
+ "fmin z4.s, p3/M, z4.s, z12.s\n"
+ "fadd z30.s, z30.s, z16.s\n"
+ "fmin z3.s, p3/M, z3.s, z12.s\n"
+ "fmin z2.s, p3/M, z2.s, z12.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmax z4.s, p3/M, z4.s, z11.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmax z3.s, p3/M, z3.s, z11.s\n"
+ "fmax z2.s, p3/M, z2.s, z11.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ ".inst 0x658aac90 // bfcvt z16.h, p3/M, z4.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ ".inst 0x658aac74 // bfcvt z20.h, p3/M, z3.s\n"
+ ".inst 0x658aac53 // bfcvt z19.h, p3/M, z2.s\n"
+ ".inst 0x658aac32 // bfcvt z18.h, p3/M, z1.s\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aac11 // bfcvt z17.h, p3/M, z0.s\n"
+ ".inst 0x658aaff0 // bfcvt z16.h, p3/M, z31.s\n"
+ "st1h { z20.s }, p1, [x28, #1, MUL VL]\n"
+ "st1h { z19.s }, p1, [x27, #1, MUL VL]\n"
+ ".inst 0x658aafd3 // bfcvt z19.h, p3/M, z30.s\n"
+ "st1h { z18.s }, p1, [x26, #1, MUL VL]\n"
+ "ld1h { z18.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z17.s }, p1, [x25, #1, MUL VL]\n"
+ "ld1h { z17.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "st1h { z16.s }, p1, [x24, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "st1h { z19.s }, p1, [x23, #1, MUL VL]\n"
+ "ld1h { z19.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "lsl z22.s, z18.s, #0x10\n"
+ "ld1h { z18.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "lsl z21.s, z17.s, #0x10\n"
+ "ld1h { z17.s }, p0/Z, [x24, #2, MUL VL]\n"
+ "lsl z20.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p0/Z, [x23, #2, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z29.s, z29.s, z22.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z28.s, z28.s, z21.s\n"
+ "fadd z27.s, z27.s, z20.s\n"
+ "fadd z26.s, z26.s, z19.s\n"
+ "fadd z25.s, z25.s, z18.s\n"
+ "fadd z24.s, z24.s, z17.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fadd z23.s, z23.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ ".inst 0x658aaf94 // bfcvt z20.h, p3/M, z28.s\n"
+ ".inst 0x658aaf70 // bfcvt z16.h, p3/M, z27.s\n"
+ ".inst 0x658aaf53 // bfcvt z19.h, p3/M, z26.s\n"
+ "st1h { z17.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ ".inst 0x658aaf32 // bfcvt z18.h, p3/M, z25.s\n"
+ ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n"
+ "st1h { z20.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x27, #2, MUL VL]\n"
+ ".inst 0x658aaef0 // bfcvt z16.h, p3/M, z23.s\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z19.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z18.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "st1h { z17.s }, p0, [x24, #2, MUL VL]\n"
+ "inch x24, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x23, #2, MUL VL]\n"
+ "inch x23, ALL, MUL #3\n"
+ "bgt 49b\n"
+ "b 52f\n"
+ "50:" // Accumulate: Height 8
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "add x22, x23, %x[ldout], LSL #1\n"
+ "51:" // Accumulate: Height 8: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z23.s }, p2/Z, [x10]\n"
+ "ld1h { z22.s }, p2/Z, [x28]\n"
+ "ld1h { z21.s }, p2/Z, [x27]\n"
+ "ld1h { z20.s }, p2/Z, [x26]\n"
+ "ld1h { z19.s }, p2/Z, [x25]\n"
+ "ld1h { z18.s }, p2/Z, [x24]\n"
+ "ld1h { z17.s }, p2/Z, [x23]\n"
+ "ld1h { z16.s }, p2/Z, [x22]\n"
+ "lsl z31.s, z23.s, #0x10\n"
+ "lsl z30.s, z22.s, #0x10\n"
+ "ld1w { z29.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z28.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "lsl z27.s, z21.s, #0x10\n"
+ "lsl z26.s, z20.s, #0x10\n"
+ "ld1w { z21.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "lsl z20.s, z19.s, #0x10\n"
+ "lsl z19.s, z18.s, #0x10\n"
+ "ld1w { z18.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x20, #-1, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "ld1w { z23.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #5, MUL VL]\n"
+ "fadd z29.s, z29.s, z31.s\n"
+ "fadd z28.s, z28.s, z30.s\n"
+ "fadd z21.s, z21.s, z27.s\n"
+ "fadd z25.s, z25.s, z26.s\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "fadd z18.s, z18.s, z20.s\n"
+ "fadd z24.s, z24.s, z19.s\n"
+ "fadd z23.s, z23.s, z17.s\n"
+ "fadd z22.s, z22.s, z16.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "ld1w { z6.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z5.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "ld1w { z4.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "ld1w { z3.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "ld1w { z2.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "ld1w { z1.s }, p1/Z, [x20]\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #6, MUL VL]\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aafb4 // bfcvt z20.h, p3/M, z29.s\n"
+ ".inst 0x658aaf93 // bfcvt z19.h, p3/M, z28.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n"
+ "whilelt p0.s, x21, x11\n"
+ "decw x11, ALL, MUL #3\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n"
+ "incw x21\n"
+ "st1h { z20.s }, p2, [x10]\n"
+ "st1h { z19.s }, p2, [x28]\n"
+ ".inst 0x658aaef4 // bfcvt z20.h, p3/M, z23.s\n"
+ ".inst 0x658aaed3 // bfcvt z19.h, p3/M, z22.s\n"
+ "st1h { z21.s }, p2, [x27]\n"
+ "ld1w { z30.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z29.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "cmp x11, XZR\n"
+ "st1h { z16.s }, p2, [x26]\n"
+ "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z28.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z18.s }, p2, [x25]\n"
+ "ld1h { z18.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "st1h { z17.s }, p2, [x24]\n"
+ "ld1h { z17.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z26.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "st1h { z20.s }, p2, [x23]\n"
+ "ld1h { z20.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "ld1w { z25.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "st1h { z19.s }, p2, [x22]\n"
+ "ld1h { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "lsl z22.s, z18.s, #0x10\n"
+ "ld1w { z24.s }, p0/Z, [x20, #4, MUL VL]\n"
+ "ld1h { z18.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "lsl z21.s, z17.s, #0x10\n"
+ "ld1w { z23.s }, p0/Z, [x20, #7, MUL VL]\n"
+ "ld1h { z17.s }, p1/Z, [x23, #1, MUL VL]\n"
+ "lsl z20.s, z20.s, #0x10\n"
+ "fadd z6.s, z6.s, z16.s\n"
+ "ld1h { z16.s }, p1/Z, [x22, #1, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "fadd z5.s, z5.s, z22.s\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z4.s, z4.s, z21.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fmin z6.s, p3/M, z6.s, z12.s\n"
+ "fadd z3.s, z3.s, z20.s\n"
+ "fadd z2.s, z2.s, z19.s\n"
+ "fmin z5.s, p3/M, z5.s, z12.s\n"
+ "fadd z1.s, z1.s, z18.s\n"
+ "fmin z4.s, p3/M, z4.s, z12.s\n"
+ "fadd z0.s, z0.s, z17.s\n"
+ "fadd z31.s, z31.s, z16.s\n"
+ "fmax z6.s, p3/M, z6.s, z11.s\n"
+ "fmin z3.s, p3/M, z3.s, z12.s\n"
+ "fmin z2.s, p3/M, z2.s, z12.s\n"
+ "fmax z5.s, p3/M, z5.s, z11.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmax z4.s, p3/M, z4.s, z11.s\n"
+ ".inst 0x658aacd0 // bfcvt z16.h, p3/M, z6.s\n"
+ "fmax z3.s, p3/M, z3.s, z11.s\n"
+ "fmax z2.s, p3/M, z2.s, z11.s\n"
+ ".inst 0x658aacb1 // bfcvt z17.h, p3/M, z5.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aac90 // bfcvt z16.h, p3/M, z4.s\n"
+ "st1h { z17.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aac75 // bfcvt z21.h, p3/M, z3.s\n"
+ ".inst 0x658aac52 // bfcvt z18.h, p3/M, z2.s\n"
+ ".inst 0x658aac31 // bfcvt z17.h, p3/M, z1.s\n"
+ ".inst 0x658aac14 // bfcvt z20.h, p3/M, z0.s\n"
+ "st1h { z16.s }, p1, [x27, #1, MUL VL]\n"
+ ".inst 0x658aaff3 // bfcvt z19.h, p3/M, z31.s\n"
+ "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z21.s }, p1, [x26, #1, MUL VL]\n"
+ "st1h { z18.s }, p1, [x25, #1, MUL VL]\n"
+ "ld1h { z18.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "st1h { z17.s }, p1, [x24, #1, MUL VL]\n"
+ "ld1h { z17.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "st1h { z20.s }, p1, [x23, #1, MUL VL]\n"
+ "ld1h { z20.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "st1h { z19.s }, p1, [x22, #1, MUL VL]\n"
+ "ld1h { z19.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "lsl z22.s, z18.s, #0x10\n"
+ "ld1h { z18.s }, p0/Z, [x24, #2, MUL VL]\n"
+ "lsl z21.s, z17.s, #0x10\n"
+ "ld1h { z17.s }, p0/Z, [x23, #2, MUL VL]\n"
+ "lsl z20.s, z20.s, #0x10\n"
+ "fadd z30.s, z30.s, z16.s\n"
+ "ld1h { z16.s }, p0/Z, [x22, #2, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z29.s, z29.s, z22.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "fadd z28.s, z28.s, z21.s\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z27.s, z27.s, z20.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fadd z26.s, z26.s, z19.s\n"
+ "fadd z25.s, z25.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fadd z24.s, z24.s, z17.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fadd z23.s, z23.s, z16.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ ".inst 0x658aafd0 // bfcvt z16.h, p3/M, z30.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n"
+ ".inst 0x658aaf74 // bfcvt z20.h, p3/M, z27.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z17.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ ".inst 0x658aaf53 // bfcvt z19.h, p3/M, z26.s\n"
+ ".inst 0x658aaf32 // bfcvt z18.h, p3/M, z25.s\n"
+ "st1h { z16.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n"
+ ".inst 0x658aaef0 // bfcvt z16.h, p3/M, z23.s\n"
+ "st1h { z20.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z19.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "st1h { z18.s }, p0, [x24, #2, MUL VL]\n"
+ "inch x24, ALL, MUL #3\n"
+ "st1h { z17.s }, p0, [x23, #2, MUL VL]\n"
+ "inch x23, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x22, #2, MUL VL]\n"
+ "inch x22, ALL, MUL #3\n"
+ "bgt 51b\n"
+ "subs %x[rows], %x[rows], #0x8\n"
+ "add %x[out_ptr], %x[out_ptr], x12\n"
+ "bgt 35b\n"
+ "52:" // Exit
+ : [in_ptr] "+&r" (in_ptr), [out_ptr] "+&r" (out_ptr), [rows] "+&r" (rows)
+ : [accumulate] "r" (accumulate), [bias] "r" (bias), [cols] "r" (cols), [ldout] "r" (ldout), [maxval] "r" (maxval), [minval] "r" (minval)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // ARM_COMPUTE_ENABLE_SVE
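
Annotation (not part of the patch): the generated SVE kernel above merges f32 accumulator blocks into a bf16 result array, in two variants selected by the `accumulate` operand. The "Initial" paths add an optional bf16 bias (zeroed via `cbnz %x[bias]` when absent); the "Accumulate" paths add the existing bf16 output instead. In both, bf16 values are widened to f32 by `lsl #16`, summed, clamped with `fmin`/`fmax` against what appear to be the `maxval`/`minval` operands duplicated into z12/z11, and narrowed back with `bfcvt` before the `st1h` stores. A minimal scalar sketch of the per-element computation follows; names are illustrative, and truncation stands in for `bfcvt`'s rounding, so it is not bit-exact:

// Scalar sketch of one element of the SVE merge above. bf16 widening and
// narrowing are modelled with bit shifts, matching the 'lsl #16'/'bfcvt'
// pairs in the kernel.
#include <cstdint>
#include <cstring>
#include <algorithm>

static inline float bf16_to_f32(uint16_t h) {
    uint32_t bits = uint32_t(h) << 16;   // 'lsl z.s, z.s, #0x10'
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

static inline uint16_t f32_to_bf16(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return uint16_t(bits >> 16);         // truncating stand-in for 'bfcvt'
}

// One output element: add bias (Initial) or the previous output
// (Accumulate), then clamp to [minval, maxval].
uint16_t merge_element(float in, uint16_t prev_out, uint16_t bias_v,
                       bool accumulate, bool has_bias,
                       float minval, float maxval) {
    float acc = in + (accumulate ? bf16_to_f32(prev_out)
                                 : (has_bias ? bf16_to_f32(bias_v) : 0.0f));
    acc = std::min(acc, maxval);         // 'fmin ..., z12.s'
    acc = std::max(acc, minval);         // 'fmax ..., z11.s'
    return f32_to_bf16(acc);
}

The per-height register tiling (Heights 1 through 8) and the three-vector column blocks in the assembly are a vectorised unrolling of exactly this element function.
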
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
index 34b6fe3ef5..115ba59459 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020,2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
template<>
void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation , bool append)
@@ -781,19 +781,19 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
- "incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z13.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "add z13.s, z13.s, z2.s\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
"whilelt p2.s, %[p], %[w]\n"
- "add z13.s, z13.s, z2.s\n"
"add z14.s, z14.s, z3.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "add z15.s, z15.s, z4.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
: [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
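
Annotation (not part of the patch): the rescheduled hunks in this file make two changes that repeat across the height cases. Loads and stores are interleaved with the adds to hide load latency, and each bias chunk is now loaded under the same `whilelt` predicate as the data chunk it is added to (`p1/z` and `p2/z` instead of `p0/z` for the `#1, MUL VL` and `#2, MUL VL` bias loads), so the loads no longer touch bias lanes beyond the active columns when the remaining width is shorter than a full three vectors. A hypothetical ACLE intrinsics sketch of the predication pattern (function and parameter names are illustrative, not the library's API):

#include <arm_sve.h>
#include <cstdint>

// Add one 3-vector-wide block of bias to 'in' and store to 'out';
// 'w' is the number of remaining columns (may be < 3 vectors).
void add_bias_3vl(int32_t *out, const int32_t *in, const int32_t *bias, int w)
{
    const int vl = (int)svcntw();             // 32-bit lanes per vector
    svbool_t p0 = svwhilelt_b32(0,      w);
    svbool_t p1 = svwhilelt_b32(vl,     w);   // predicate for chunk 1
    svbool_t p2 = svwhilelt_b32(2 * vl, w);   // predicate for chunk 2
    // Chunk n's bias load uses pn, not p0, mirroring the change from
    // "ld1w z3.s, p0/z" to "ld1w z3.s, p1/z" in the hunks here.
    svst1(p0, out,          svadd_x(p0, svld1(p0, in),          svld1(p0, bias)));
    svst1(p1, out + vl,     svadd_x(p1, svld1(p1, in + vl),     svld1(p1, bias + vl)));
    svst1(p2, out + 2 * vl, svadd_x(p2, svld1(p2, in + 2 * vl), svld1(p2, bias + 2 * vl)));
}

Under the old scheme all three bias loads used p0, which is all-true whenever at least one full vector of columns remains, so a short tail could read bias elements past the active columns.
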
@@ -817,27 +817,27 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"add z14.s, z14.s, z3.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "add z17.s, z17.s, z3.s\n"
"ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
+ "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
"add z15.s, z15.s, z4.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z16.s, z16.s, z2.s\n"
- "add z17.s, z17.s, z3.s\n"
"add z18.s, z18.s, z4.s\n"
- "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
"st1w z16.s, p0, [%[outptr1]]\n"
@@ -865,38 +865,38 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "add z19.s, z19.s, z2.s\n"
- "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
+ "add z18.s, z18.s, z4.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "addvl %[outptr0], %[outptr0], #3\n"
+ "add z19.s, z19.s, z2.s\n"
+ "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
"add z20.s, z20.s, z3.s\n"
- "add z13.s, z13.s, z4.s\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
"st1w z16.s, p0, [%[outptr1]]\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
@@ -925,44 +925,44 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "addvl %[inptr], %[inptr], #24\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"add z16.s, z16.s, z4.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
@@ -996,49 +996,49 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
@@ -1079,61 +1079,61 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"add z18.s, z18.s, z3.s\n"
+ "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "add z19.s, z19.s, z4.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "add z19.s, z19.s, z4.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"
@@ -1174,63 +1174,63 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"add z18.s, z18.s, z3.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"add z19.s, z19.s, z4.s\n"
+ "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"
@@ -1282,64 +1282,64 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"add z18.s, z18.s, z3.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
"add z19.s, z19.s, z4.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"
@@ -1394,4 +1394,4 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
}
}
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
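The rescheduling above also changes which governing predicate each bias load uses: the second and third bias vectors are now loaded under p1 and p2 (the whilelt tail predicates for their column blocks) instead of p0, so a bias load never touches lanes beyond the block it is paired with. A minimal sketch of the same predication pattern using SVE ACLE intrinsics follows; it is illustrative only, assumes an SVE-enabled compiler, and is not the generated kernel above:

    #include <arm_sve.h>

    // Add a per-column bias to one row, one vector-length block at a time.
    // Each block derives its own predicate from whilelt, and that predicate
    // governs the bias load, the input load, and the store alike.
    void bias_add_row(int32_t *out, const int32_t *in, const int32_t *bias, int w) {
        for (int x = 0; x < w; x += (int)svcntw()) {
            svbool_t  pg = svwhilelt_b32(x, w);      // active lanes for this block
            svint32_t v  = svld1_s32(pg, in + x);    // predicated column load
            svint32_t b  = svld1_s32(pg, bias + x);  // bias under the same predicate
            svst1_s32(pg, out + x, svadd_s32_x(pg, v, b));
        }
    }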
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp
index c4b2bb56d6..358ed79989 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020,2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
template<>
void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation , bool append)
@@ -781,19 +781,19 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
- "incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z13.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "add z13.s, z13.s, z2.s\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
"whilelt p2.s, %[p], %[w]\n"
- "add z13.s, z13.s, z2.s\n"
"add z14.s, z14.s, z3.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
+ "add z15.s, z15.s, z4.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
: [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
@@ -817,27 +817,27 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
- "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"add z14.s, z14.s, z3.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
+ "add z17.s, z17.s, z3.s\n"
"ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
+ "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
"add z15.s, z15.s, z4.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z16.s, z16.s, z2.s\n"
- "add z17.s, z17.s, z3.s\n"
"add z18.s, z18.s, z4.s\n"
- "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"addvl %[outptr0], %[outptr0], #3\n"
"st1w z16.s, p0, [%[outptr1]]\n"
@@ -865,38 +865,38 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
- "add z19.s, z19.s, z2.s\n"
- "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
"addvl %[inptr], %[inptr], #24\n"
+ "add z18.s, z18.s, z4.s\n"
"st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "addvl %[outptr0], %[outptr0], #3\n"
+ "add z19.s, z19.s, z2.s\n"
+ "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
"add z20.s, z20.s, z3.s\n"
- "add z13.s, z13.s, z4.s\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
"st1w z16.s, p0, [%[outptr1]]\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
@@ -925,44 +925,44 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "addvl %[inptr], %[inptr], #24\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
"add z16.s, z16.s, z4.s\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
@@ -996,49 +996,49 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
@@ -1079,61 +1079,61 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"add z18.s, z18.s, z3.s\n"
+ "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "add z19.s, z19.s, z4.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "add z19.s, z19.s, z4.s\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"
@@ -1174,63 +1174,63 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"add z18.s, z18.s, z3.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"add z19.s, z19.s, z4.s\n"
+ "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"
@@ -1282,64 +1282,64 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
"prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
"ld1w z2.s, p0/z, [%[biasptr]]\n"
"whilelt p1.s, %[p], %[w]\n"
- "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
+ "ld1w z13.s, p0/z, [%[inptr]]\n"
"incw %[p], all, mul #1\n"
- "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
+ "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
- "ld1w z13.s, p0/z, [%[inptr]]\n"
- "whilelt p2.s, %[p], %[w]\n"
+ "add z13.s, z13.s, z2.s\n"
+ "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
"ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
+ "whilelt p2.s, %[p], %[w]\n"
+ "add z16.s, z16.s, z2.s\n"
+ "st1w z13.s, p0, [%[outptr0]]\n"
+ "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
"prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
- "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
"ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
- "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
"prfm PLDL1KEEP, [%[inptr], #0x200]\n"
- "add z14.s, z14.s, z3.s\n"
- "st1w z13.s, p0, [%[outptr0]]\n"
- "add z15.s, z15.s, z4.s\n"
- "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
- "add z16.s, z16.s, z2.s\n"
- "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
- "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
"add z17.s, z17.s, z3.s\n"
"st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
- "add z18.s, z18.s, z4.s\n"
+ "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
"ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
"add z19.s, z19.s, z2.s\n"
"ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
- "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
"add z20.s, z20.s, z3.s\n"
- "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
- "add z13.s, z13.s, z4.s\n"
+ "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
"ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
- "add z14.s, z14.s, z2.s\n"
"addvl %[outptr0], %[outptr0], #3\n"
+ "add z13.s, z13.s, z4.s\n"
"st1w z16.s, p0, [%[outptr1]]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
- "add z15.s, z15.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
"ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "add z15.s, z15.s, z3.s\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
"st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
"add z16.s, z16.s, z4.s\n"
"ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
"st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
"addvl %[outptr1], %[outptr1], #3\n"
"add z17.s, z17.s, z2.s\n"
"ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
- "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
"st1w z19.s, p0, [%[outptr2]]\n"
- "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
"add z18.s, z18.s, z3.s\n"
"ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
- "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
"st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
- "addvl %[inptr], %[inptr], #24\n"
+ "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
"add z19.s, z19.s, z4.s\n"
"ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
+ "addvl %[inptr], %[inptr], #24\n"
"st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
"addvl %[outptr2], %[outptr2], #3\n"
"add z20.s, z20.s, z2.s\n"
@@ -1394,4 +1394,4 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
}
}
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
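Both merge files swap the compile-time feature test __ARM_FEATURE_SVE (defined by the compiler only when the whole translation unit is built for SVE) for the build option ARM_COMPUTE_ENABLE_SVE, so these kernels can be compiled into a multi-ISA binary and chosen at run time. A hypothetical dispatch sketch; the capability check is invented for illustration and is not the library's API:

    #ifdef ARM_COMPUTE_ENABLE_SVE
    if (cpu_supports_sve()) {   // hypothetical runtime capability query
        MergeResults<3, 8, true>(out, in, ldout, y0, ymax, x0, xmax, bias, act, append);
        return;
    }
    #endif
    // otherwise fall back to a NEON merge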
diff --git a/src/core/NEON/kernels/arm_gemm/misc-sve.cpp b/src/core/NEON/kernels/arm_gemm/misc-sve.cpp
new file mode 100644
index 0000000000..ffe098109f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/misc-sve.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+namespace arm_gemm {
+
+namespace utils {
+
+#ifdef ARM_COMPUTE_ENABLE_SME
+namespace sme {
+
+unsigned long raw_vector_length() {
+ static unsigned long res=0;
+
+ if (!res) {
+ __asm __volatile(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntb %0\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : "=r" (res)
+ :
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
+ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+ }
+
+ return res;
+}
+
+} // namespace sme
+#endif // ARM_COMPUTE_ENABLE_SME
+
+} // namespace utils
+
+} // namespace arm_gemm
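raw_vector_length() reads the streaming vector length (SVL): SMSTART ZA enters streaming mode, cntb then counts the bytes per vector at the streaming length, and SMSTOP leaves streaming mode again. The instructions are emitted as raw .inst encodings so the file assembles even with toolchains that lack SME mnemonics, and all Z registers are listed as clobbered because entering and leaving streaming mode invalidates their contents. A small usage sketch, assuming SME hardware and a build with ARM_COMPUTE_ENABLE_SME:

    #include <cstdio>

    namespace arm_gemm { namespace utils { namespace sme {
    unsigned long raw_vector_length();   // defined in misc-sve.cpp above
    } } }

    int main() {
        unsigned long svl = arm_gemm::utils::sme::raw_vector_length();
        std::printf("SVL = %lu bytes (%lu fp32 lanes)\n", svl, svl / 4);
        return 0;
    }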
diff --git a/src/core/NEON/kernels/arm_gemm/misc.cpp b/src/core/NEON/kernels/arm_gemm/misc.cpp
index 229e6b56f9..87310d996d 100644
--- a/src/core/NEON/kernels/arm_gemm/misc.cpp
+++ b/src/core/NEON/kernels/arm_gemm/misc.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,11 @@
#ifndef NO_MULTI_THREADING
#include <mutex>
#endif
+#include <cstdint>
+
+#include "arm_gemm.hpp"
+#include "kernel_weight_format.hpp"
+#include "utils.hpp"
namespace arm_gemm {
@@ -32,4 +37,43 @@ namespace arm_gemm {
std::mutex report_mutex;
#endif
-} // namespace arm_gemm
\ No newline at end of file
+WeightFormat get_weight_format(const KernelWeightFormat kwf, size_t element_size) {
+ if (kwf==KernelWeightFormat::NON_FIXED) {
+ return WeightFormat::UNSPECIFIED;
+ }
+
+ uint32_t kwf_i = static_cast<uint32_t>(kwf);
+ uint32_t wf_i = 0;
+
+ const auto block_bytes = (kwf_i >> 8) & 0xf;
+ const auto vector_count = (kwf_i >> 12) & 0xf;
+
+ uint32_t vector_bytes;
+
+ // For fast mode BF16 kernels set the appropriate bit and override element size to 2.
+ if (kwf_i & 0x10) {
+ element_size = 2;
+ wf_i |= 0x10;
+ }
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+ // Get total bytes in vector output
+ if (kwf_i & 0x1) {
+ vector_bytes = vector_count * get_vector_length<uint8_t>();
+ } else {
+#else
+ if (1) {
+#endif
+ vector_bytes = vector_count * 16;
+ }
+
+ auto input_blocking = block_bytes / element_size;
+ auto output_blocking = vector_bytes / block_bytes;
+
+ wf_i |= (input_blocking << 20);
+ wf_i |= (output_blocking << 8);
+
+ return static_cast<WeightFormat>(wf_i);
+}
+
+} // namespace arm_gemm
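get_weight_format() decodes the bit-packed KernelWeightFormat: bits [11:8] hold the block size in bytes, bits [15:12] the number of vectors per output block, bit 0 marks SVE-sized (vector-length dependent) vectors, and bit 4 marks BF16 fast mode, which also forces a 2-byte element size. The returned WeightFormat re-packs the derived input blocking at bit 20 and the output blocking at bit 8. A worked decode with hypothetical values, following the same layout:

    #include <cstdint>

    uint32_t kwf_i = (1u << 12) | (4u << 8);                  // 1 vector, 4-byte blocks, no SVE/BF16 bits
    uint32_t block_bytes     = (kwf_i >> 8) & 0xf;            // 4
    uint32_t vector_count    = (kwf_i >> 12) & 0xf;           // 1
    uint32_t vector_bytes    = vector_count * 16;             // fixed 16-byte (NEON) vectors
    uint32_t input_blocking  = block_bytes / 2;               // 2, for a 2-byte element type
    uint32_t output_blocking = vector_bytes / block_bytes;    // 4
    uint32_t wf_i = (input_blocking << 20) | (output_blocking << 8);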
diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
index fdb4f584d8..d35825c428 100644
--- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -179,13 +179,18 @@ public:
return _subgemm->get_B_pretransposed_array_size() + col_sum_size();
}
- void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
- uintptr_t buffer_int = reinterpret_cast<uintptr_t>(buffer);
- _subgemm->pretranspose_B_array(reinterpret_cast<void *>(buffer_int + col_sum_size()), B, ldb, B_multi_stride);
+ void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ _col_sums = reinterpret_cast<int32_t *>(in_buffer);
+ col_sums_pretransposed(B, ldb, B_multi_stride);
+ }
- _col_sums = reinterpret_cast<int32_t *>(buffer);
+ void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override {
+ assert(!transposed);
- col_sums_pretransposed(B, ldb, B_multi_stride);
+ uintptr_t buffer_int = reinterpret_cast<uintptr_t>(buffer);
+ _subgemm->pretranspose_B_array(reinterpret_cast<void *>(buffer_int + col_sum_size()), B, ldb, B_multi_stride, transposed);
+
+ requantize_bias(buffer, B, ldb, B_multi_stride);
}
void set_pretransposed_B_data(void *buffer) override {
@@ -198,6 +203,19 @@ public:
_params.bias = bias;
_params.bias_multi_stride = bias_multi_stride;
}
+
+ GemmConfig get_config() override {
+ GemmConfig c = _subgemm->get_config();
+
+ std::string n = "quantize_wrapper[";
+ n.append(c.filter);
+ n.append("]");
+
+ c.method = GemmMethod::QUANTIZE_WRAPPER;
+ c.filter = n;
+
+ return c;
+ }
};
} // namespace arm_gemm
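The wrapper's pretranspose path is now split in two: requantize_bias() computes only the column sums into the head of the buffer, while pretranspose_B_array() delegates the transposed B to the subgemm (offset past the column-sum area) and then calls requantize_bias() for the sums; the new transposed argument is asserted false because the wrapper does not accept pre-transposed B. A call-order sketch with a hypothetical allocator:

    // Allocate one buffer sized for the column sums plus the subgemm's B.
    void *buf = allocate(gemm->get_B_pretransposed_array_size());   // hypothetical allocator
    gemm->pretranspose_B_array(buf, B, ldb, B_multi_stride, /*transposed=*/false);
    // Later, if only the quantization-related data changed, refresh the sums alone:
    gemm->requantize_bias(buf, B, ldb, B_multi_stride);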
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp
index 111d01ed3a..6da9f4be0e 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.cpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -1142,6 +1142,64 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const int8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);
template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const uint8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);
+void dequantize_block_32(const DequantizeFloat &qp, unsigned int width, unsigned int height,
+ const int32_t* in_ptr, unsigned int in_stride, float *out_ptr, unsigned int out_stride,
+ const float* bias_ptr, bool accumulate, const Activation &act)
+{
+ const float32x4_t vscale = vdupq_n_f32(qp.scale);
+ float maxval = std::numeric_limits<float>::infinity();
+ float minval = -std::numeric_limits<float>::infinity();
+
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ minval = 0;
+ break;
+ }
+
+ const float32x4_t vmin = vdupq_n_f32(minval);
+ const float32x4_t vmax = vdupq_n_f32(maxval);
+
+ for(unsigned int row=0; row<height; row++) {
+ auto row_in_ptr = in_ptr + (row * in_stride);
+ auto row_out_ptr = out_ptr + (row * out_stride);
+ unsigned int col=0;
+ if (width >= 4) {
+ for(; col <= (width - 4); col+= 4) {
+ const int32x4_t vin = vld1q_s32(row_in_ptr + col);
+ float32x4_t vdeq = vmulq_f32(vcvtq_f32_s32(vin), vscale);
+ if(bias_ptr) {
+ const float32x4_t bin = vld1q_f32(bias_ptr + col);
+ vdeq = vaddq_f32(vdeq, bin);
+ }
+ if(accumulate) {
+ vdeq = vaddq_f32(vdeq, vld1q_f32(row_out_ptr + col));
+ }
+ vdeq = vminq_f32(vmaxq_f32(vdeq, vmin), vmax);
+ vst1q_f32(reinterpret_cast<float *>(row_out_ptr + col), vdeq);
+ }
+ }
+ // left-over elements
+ for(; col < width; ++col) {
+ const int32_t val = *(row_in_ptr + col);
+ float res = static_cast<float>(val * qp.scale);
+ if(bias_ptr) {
+ res += static_cast<float>(*(bias_ptr + col));
+ }
+ if(accumulate) {
+ res += *(row_out_ptr + col);
+ }
+ res = std::min(std::max(res, minval), maxval);
+ *(row_out_ptr + col) = res;
+ }
+ }
+}
+
} // namespace arm_gemm
#endif // __aarch64__
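dequantize_block_32() converts a block of int32 GEMM accumulators to float: each value is scaled by qp.scale, optionally gets a per-column float bias, optionally accumulates into the existing output, and is clamped to the activation's [minval, maxval] range, with a NEON path for groups of four columns and a scalar tail. A usage sketch with hypothetical values, assuming DequantizeFloat and Activation expose the members used above:

    int32_t acc[2 * 6] = { /* GEMM accumulators */ };
    float   out[2 * 6] = {};
    float   bias[6]    = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};

    arm_gemm::DequantizeFloat qp;
    qp.scale = 0.5f;                                 // dequantization scale
    arm_gemm::Activation act;
    act.type = arm_gemm::Activation::Type::ReLU;     // clamp below at zero

    arm_gemm::dequantize_block_32(qp, /*width=*/6, /*height=*/2,
                                  acc, /*in_stride=*/6, out, /*out_stride=*/6,
                                  bias, /*accumulate=*/false, act);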
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.hpp b/src/core/NEON/kernels/arm_gemm/quantized.hpp
index 3f3443025c..bc64fd967b 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,7 +42,11 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
unsigned int multi, unsigned int first_col);
template<typename T>
-void row_sums_indirect(unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg,
+void row_sums_indirect(size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg,
size_t M, int32_t *output_ptr, const Requantize32 *qp);
+void dequantize_block_32(const DequantizeFloat &qp, unsigned int width, unsigned int height,
+ const int32_t* input, unsigned int in_stride, float *output, unsigned int out_stride,
+ const float *row_bias, bool not_first_pass, const Activation &act);
+
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
index 5433676558..94cd7ddfeb 100644
--- a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -34,7 +34,7 @@ namespace arm_gemm {
template<>
void row_sums_indirect(
- unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
size_t M, int32_t *out_ptr, const Requantize32 *qp
)
{
@@ -63,8 +63,8 @@ void row_sums_indirect(
ka.string_lengths = string_lengths;
__asm__ __volatile__(
- "add x19, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x19]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x20]\n"
"neg v2.4s, v2.4s\n"
"1:" // Row loop
"cmp %x[M], #0x6\n"
@@ -76,97 +76,97 @@ void row_sums_indirect(
"bgt 35f\n"
"beq 18f\n"
"movi v1.8h, #0x0\n"
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"movi v0.4s, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
"mov x9, #0x0\n"
- "mov x28, #0x0\n"
"2:" // Height 1: String loop
- "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w27, [x19, x28, LSL #0x2]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
"tbz %x[flags], #3, 3f\n"
- "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x19, x19, %x[input_offset], LSL #3\n"
- "ldr x26, [x19, #0x0]\n"
- "cbnz x28, 4f\n"
- "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, %x[input_offset], LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "cbnz x9, 4f\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x20\n"
"b 4f\n"
"3:" // Height 1: setup direct input
- "mov x26, %x[input_ptr]\n"
+ "mov x27, %x[input_ptr]\n"
"4:" // Height 1: input setup done
- "cmp x27, #0x10\n"
+ "cmp x28, #0x10\n"
"blt 8f\n"
- "cmp x27, #0x20\n"
+ "cmp x28, #0x20\n"
"blt 7f\n"
"5:" // Height 1: Multiply loop: Main loop head
- "ldr q31, [x26, #0x0]\n"
- "cmp x9, #0x7e\n"
- "add x26, x26, #0x10\n"
+ "ldr q31, [x27, #0x0]\n"
+ "cmp x10, #0x7e\n"
+ "add x27, x27, #0x10\n"
"blt 6f\n"
"sadalp v0.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"6:" // Height 1: Multiply loop: unique 1: no collapse
+ "sub x28, x28, #0x10\n"
+ "cmp x28, #0x20\n"
"sadalp v1.8h, v31.16b\n"
- "add x9, x9, #0x1\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
+ "add x10, x10, #0x1\n"
"bge 5b\n"
"7:" // Height 1: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q31, [x26, #0x0]\n"
- "add x26, x26, #0x10\n"
+ "ldr q31, [x27, #0x0]\n"
+ "sub x28, x28, #0x10\n"
"sadalp v1.8h, v31.16b\n"
+ "add x27, x27, #0x10\n"
"8:" // Height 1: Multiply loop: Main loop skip
- "cbz x27, 17f\n"
- "tbz x27, #3, 12f\n"
- "ldr d31, [x26], #0x8\n"
- "tbz x27, #2, 10f\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "tbz x27, #1, 9f\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[14], [x26]\n"
+ "cbz x28, 17f\n"
+ "tbz x28, #3, 12f\n"
+ "ldr d31, [x27], #0x8\n"
+ "tbz x28, #2, 10f\n"
+ "ld1 { v31.s }[2], [x27], #0x4\n"
+ "tbz x28, #1, 9f\n"
+ "ld1 { v31.h }[6], [x27], #0x2\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[14], [x27]\n"
"b 16f\n"
"9:" // Height 1: Multiply loop: Ragged operand read: partial_1_12
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[12], [x26]\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[12], [x27]\n"
"b 16f\n"
"10:" // Height 1: Multiply loop: Ragged operand read: partial_2_8
- "tbz x27, #1, 11f\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[10], [x26]\n"
+ "tbz x28, #1, 11f\n"
+ "ld1 { v31.h }[4], [x27], #0x2\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[10], [x27]\n"
"b 16f\n"
"11:" // Height 1: Multiply loop: Ragged operand read: partial_1_8
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[8], [x26]\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[8], [x27]\n"
"b 16f\n"
"12:" // Height 1: Multiply loop: Ragged operand read: partial_4_0
- "tbz x27, #2, 14f\n"
- "ldr s31, [x26], #0x4\n"
- "tbz x27, #1, 13f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[6], [x26]\n"
+ "tbz x28, #2, 14f\n"
+ "ldr s31, [x27], #0x4\n"
+ "tbz x28, #1, 13f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[6], [x27]\n"
"b 16f\n"
"13:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[4], [x26]\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[4], [x27]\n"
"b 16f\n"
"14:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 15f\n"
- "ldr h31, [x26], #0x2\n"
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[2], [x26]\n"
+ "tbz x28, #1, 15f\n"
+ "ldr h31, [x27], #0x2\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[2], [x27]\n"
"b 16f\n"
"15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b31, [x26, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
"16:" // Height 1: Multiply loop: Ragged operand read: Done
"sadalp v1.8h, v31.16b\n"
"17:" // Height 1: Multiply loop: No odd multiplies
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x21\n"
"bne 2b\n"
"sadalp v0.4s, v1.8h\n"
"addp v0.4s, v0.4s, v0.4s\n"
@@ -176,126 +176,126 @@ void row_sums_indirect(
"b 104f\n"
"18:" // Height 2
"movi v1.8h, #0x0\n"
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "mov x9, #0x0\n"
"movi v0.4s, #0x0\n"
- "mov x28, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
"movi v30.8h, #0x0\n"
"movi v29.4s, #0x0\n"
+ "mov x9, #0x0\n"
"19:" // Height 2: String loop
- "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w27, [x19, x28, LSL #0x2]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x19, x19, %x[input_offset], LSL #3\n"
- "ldr x26, [x19, #0x0]\n"
- "ldr x25, [x19, #0x8]\n"
- "cbnz x28, 21f\n"
- "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x25, x25, x19\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, %x[input_offset], LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "cbnz x9, 21f\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
"b 21f\n"
"20:" // Height 2: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, %x[input_offset]\n"
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, %x[input_offset]\n"
"21:" // Height 2: input setup done
- "cmp x27, #0x10\n"
+ "cmp x28, #0x10\n"
"blt 25f\n"
- "cmp x27, #0x20\n"
+ "cmp x28, #0x20\n"
"blt 24f\n"
"22:" // Height 2: Multiply loop: Main loop head
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "cmp x9, #0x7e\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "cmp x10, #0x7e\n"
+ "add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
"blt 23f\n"
"sadalp v0.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
+ "mov x10, #0x0\n"
"sadalp v29.4s, v30.8h\n"
"movi v30.8h, #0x0\n"
- "mov x9, #0x0\n"
"23:" // Height 2: Multiply loop: unique 2: no collapse
+ "sub x28, x28, #0x10\n"
+ "cmp x28, #0x20\n"
"sadalp v1.8h, v31.16b\n"
"sadalp v30.8h, v28.16b\n"
- "add x9, x9, #0x1\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
+ "add x10, x10, #0x1\n"
"bge 22b\n"
"24:" // Height 2: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "sub x28, x28, #0x10\n"
"sadalp v1.8h, v31.16b\n"
"sadalp v30.8h, v28.16b\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"25:" // Height 2: Multiply loop: Main loop skip
- "cbz x27, 34f\n"
- "tbz x27, #3, 29f\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "tbz x27, #2, 27f\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "tbz x27, #1, 26f\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v28.h }[6], [x25], #0x2\n"
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[14], [x26]\n"
- "ld1 { v28.b }[14], [x25]\n"
+ "cbz x28, 34f\n"
+ "tbz x28, #3, 29f\n"
+ "ldr d31, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "tbz x28, #2, 27f\n"
+ "ld1 { v31.s }[2], [x27], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "tbz x28, #1, 26f\n"
+ "ld1 { v31.h }[6], [x27], #0x2\n"
+ "ld1 { v28.h }[6], [x26], #0x2\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[14], [x27]\n"
+ "ld1 { v28.b }[14], [x26]\n"
"b 33f\n"
"26:" // Height 2: Multiply loop: Ragged operand read: partial_1_12
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[12], [x26]\n"
- "ld1 { v28.b }[12], [x25]\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[12], [x27]\n"
+ "ld1 { v28.b }[12], [x26]\n"
"b 33f\n"
"27:" // Height 2: Multiply loop: Ragged operand read: partial_2_8
- "tbz x27, #1, 28f\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v28.h }[4], [x25], #0x2\n"
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[10], [x26]\n"
- "ld1 { v28.b }[10], [x25]\n"
+ "tbz x28, #1, 28f\n"
+ "ld1 { v31.h }[4], [x27], #0x2\n"
+ "ld1 { v28.h }[4], [x26], #0x2\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[10], [x27]\n"
+ "ld1 { v28.b }[10], [x26]\n"
"b 33f\n"
"28:" // Height 2: Multiply loop: Ragged operand read: partial_1_8
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[8], [x26]\n"
- "ld1 { v28.b }[8], [x25]\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[8], [x27]\n"
+ "ld1 { v28.b }[8], [x26]\n"
"b 33f\n"
"29:" // Height 2: Multiply loop: Ragged operand read: partial_4_0
- "tbz x27, #2, 31f\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "tbz x27, #1, 30f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
+ "tbz x28, #2, 31f\n"
+ "ldr s31, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "tbz x28, #1, 30f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
"b 33f\n"
"30:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
"b 33f\n"
"31:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 32f\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
+ "tbz x28, #1, 32f\n"
+ "ldr h31, [x27], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
"b 33f\n"
"32:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b31, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
"33:" // Height 2: Multiply loop: Ragged operand read: Done
"sadalp v1.8h, v31.16b\n"
"sadalp v30.8h, v28.16b\n"
"34:" // Height 2: Multiply loop: No odd multiplies
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x21\n"
"bne 19b\n"
"sadalp v0.4s, v1.8h\n"
"sadalp v29.4s, v30.8h\n"
@@ -306,354 +306,354 @@ void row_sums_indirect(
"b 104f\n"
"35:" // Height 3
"movi v1.8h, #0x0\n"
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "mov x9, #0x0\n"
"movi v0.4s, #0x0\n"
- "mov x28, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
"movi v30.8h, #0x0\n"
"movi v29.4s, #0x0\n"
+ "mov x9, #0x0\n"
"movi v27.8h, #0x0\n"
"movi v26.4s, #0x0\n"
"36:" // Height 3: String loop
- "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w27, [x19, x28, LSL #0x2]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
"tbz %x[flags], #3, 37f\n"
- "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x19, x19, %x[input_offset], LSL #3\n"
- "ldr x26, [x19, #0x0]\n"
- "ldr x25, [x19, #0x8]\n"
- "ldr x24, [x19, #0x10]\n"
- "cbnz x28, 38f\n"
- "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, %x[input_offset], LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "cbnz x9, 38f\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
"b 38f\n"
"37:" // Height 3: setup direct input
- "mov x26, %x[input_ptr]\n"
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, %x[input_offset]\n"
"add x25, x26, %x[input_offset]\n"
- "add x24, x25, %x[input_offset]\n"
"38:" // Height 3: input setup done
- "cmp x27, #0x10\n"
+ "cmp x28, #0x10\n"
"blt 42f\n"
- "cmp x27, #0x20\n"
+ "cmp x28, #0x20\n"
"blt 41f\n"
"39:" // Height 3: Multiply loop: Main loop head
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "cmp x9, #0x7e\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "cmp x10, #0x7e\n"
+ "add x27, x27, #0x10\n"
+ "ldr q25, [x25, #0x0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
"blt 40f\n"
"sadalp v0.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
+ "mov x10, #0x0\n"
"sadalp v29.4s, v30.8h\n"
"movi v30.8h, #0x0\n"
"sadalp v26.4s, v27.8h\n"
"movi v27.8h, #0x0\n"
- "mov x9, #0x0\n"
"40:" // Height 3: Multiply loop: unique 3: no collapse
+ "sub x28, x28, #0x10\n"
+ "cmp x28, #0x20\n"
"sadalp v1.8h, v31.16b\n"
"sadalp v30.8h, v28.16b\n"
"sadalp v27.8h, v25.16b\n"
- "add x9, x9, #0x1\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
+ "add x10, x10, #0x1\n"
"bge 39b\n"
"41:" // Height 3: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "sub x28, x28, #0x10\n"
"sadalp v1.8h, v31.16b\n"
+ "ldr q25, [x25, #0x0]\n"
"sadalp v30.8h, v28.16b\n"
"sadalp v27.8h, v25.16b\n"
- "add x24, x24, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"42:" // Height 3: Multiply loop: Main loop skip
- "cbz x27, 51f\n"
- "tbz x27, #3, 46f\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d25, [x24], #0x8\n"
- "tbz x27, #2, 44f\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v25.s }[2], [x24], #0x4\n"
- "tbz x27, #1, 43f\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v28.h }[6], [x25], #0x2\n"
- "ld1 { v25.h }[6], [x24], #0x2\n"
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[14], [x26]\n"
- "ld1 { v28.b }[14], [x25]\n"
- "ld1 { v25.b }[14], [x24]\n"
+ "cbz x28, 51f\n"
+ "tbz x28, #3, 46f\n"
+ "ldr d31, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "tbz x28, #2, 44f\n"
+ "ld1 { v31.s }[2], [x27], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "tbz x28, #1, 43f\n"
+ "ld1 { v31.h }[6], [x27], #0x2\n"
+ "ld1 { v28.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[14], [x27]\n"
+ "ld1 { v28.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
"b 50f\n"
"43:" // Height 3: Multiply loop: Ragged operand read: partial_1_12
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[12], [x26]\n"
- "ld1 { v28.b }[12], [x25]\n"
- "ld1 { v25.b }[12], [x24]\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[12], [x27]\n"
+ "ld1 { v28.b }[12], [x26]\n"
+ "ld1 { v25.b }[12], [x25]\n"
"b 50f\n"
"44:" // Height 3: Multiply loop: Ragged operand read: partial_2_8
- "tbz x27, #1, 45f\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v28.h }[4], [x25], #0x2\n"
- "ld1 { v25.h }[4], [x24], #0x2\n"
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[10], [x26]\n"
- "ld1 { v28.b }[10], [x25]\n"
- "ld1 { v25.b }[10], [x24]\n"
+ "tbz x28, #1, 45f\n"
+ "ld1 { v31.h }[4], [x27], #0x2\n"
+ "ld1 { v28.h }[4], [x26], #0x2\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[10], [x27]\n"
+ "ld1 { v28.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
"b 50f\n"
"45:" // Height 3: Multiply loop: Ragged operand read: partial_1_8
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[8], [x26]\n"
- "ld1 { v28.b }[8], [x25]\n"
- "ld1 { v25.b }[8], [x24]\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[8], [x27]\n"
+ "ld1 { v28.b }[8], [x26]\n"
+ "ld1 { v25.b }[8], [x25]\n"
"b 50f\n"
"46:" // Height 3: Multiply loop: Ragged operand read: partial_4_0
- "tbz x27, #2, 48f\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s25, [x24], #0x4\n"
- "tbz x27, #1, 47f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v25.b }[6], [x24]\n"
+ "tbz x28, #2, 48f\n"
+ "ldr s31, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "tbz x28, #1, 47f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
"b 50f\n"
"47:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v25.b }[4], [x24]\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v25.b }[4], [x25]\n"
"b 50f\n"
"48:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 49f\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h25, [x24], #0x2\n"
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v25.b }[2], [x24]\n"
+ "tbz x28, #1, 49f\n"
+ "ldr h31, [x27], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h25, [x25], #0x2\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
"b 50f\n"
"49:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b31, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b25, [x24, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
"50:" // Height 3: Multiply loop: Ragged operand read: Done
"sadalp v1.8h, v31.16b\n"
"sadalp v30.8h, v28.16b\n"
"sadalp v27.8h, v25.16b\n"
"51:" // Height 3: Multiply loop: No odd multiplies
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x21\n"
"bne 36b\n"
"sadalp v0.4s, v1.8h\n"
"sadalp v29.4s, v30.8h\n"
- "addp v0.4s, v0.4s, v29.4s\n"
"sadalp v26.4s, v27.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "addp v26.4s, v26.4s, v26.4s\n"
"addp v0.4s, v0.4s, v0.4s\n"
"addp v26.4s, v26.4s, v26.4s\n"
"mul v0.4s, v0.4s, v2.4s\n"
"str d0, [%x[out_ptr]], #0x8\n"
- "addp v26.4s, v26.4s, v26.4s\n"
"mul v26.4s, v26.4s, v2.4s\n"
"str s26, [%x[out_ptr]], #0x4\n"
"b 104f\n"
"52:" // Height 4
"movi v1.8h, #0x0\n"
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "mov x9, #0x0\n"
"movi v0.4s, #0x0\n"
- "mov x28, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
"movi v30.8h, #0x0\n"
"movi v29.4s, #0x0\n"
+ "mov x9, #0x0\n"
"movi v27.8h, #0x0\n"
"movi v26.4s, #0x0\n"
"movi v24.8h, #0x0\n"
"movi v23.4s, #0x0\n"
"53:" // Height 4: String loop
- "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w27, [x19, x28, LSL #0x2]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
"tbz %x[flags], #3, 54f\n"
- "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x19, x19, %x[input_offset], LSL #3\n"
- "ldr x26, [x19, #0x0]\n"
- "ldr x25, [x19, #0x8]\n"
- "ldr x24, [x19, #0x10]\n"
- "ldr x23, [x19, #0x18]\n"
- "cbnz x28, 55f\n"
- "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, %x[input_offset], LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x9, 55f\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
"b 55f\n"
"54:" // Height 4: setup direct input
- "mov x26, %x[input_ptr]\n"
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, %x[input_offset]\n"
"add x25, x26, %x[input_offset]\n"
"add x24, x25, %x[input_offset]\n"
- "add x23, x24, %x[input_offset]\n"
"55:" // Height 4: input setup done
- "cmp x27, #0x10\n"
+ "cmp x28, #0x10\n"
"blt 59f\n"
- "cmp x27, #0x20\n"
+ "cmp x28, #0x20\n"
"blt 58f\n"
"56:" // Height 4: Multiply loop: Main loop head
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "ldr q22, [x23, #0x0]\n"
- "cmp x9, #0x7e\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "cmp x10, #0x7e\n"
+ "add x27, x27, #0x10\n"
+ "ldr q25, [x25, #0x0]\n"
+ "ldr q22, [x24, #0x0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
"blt 57f\n"
"sadalp v0.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
+ "mov x10, #0x0\n"
"sadalp v29.4s, v30.8h\n"
"movi v30.8h, #0x0\n"
"sadalp v26.4s, v27.8h\n"
"movi v27.8h, #0x0\n"
"sadalp v23.4s, v24.8h\n"
"movi v24.8h, #0x0\n"
- "mov x9, #0x0\n"
"57:" // Height 4: Multiply loop: unique 4: no collapse
+ "sub x28, x28, #0x10\n"
+ "cmp x28, #0x20\n"
"sadalp v1.8h, v31.16b\n"
"sadalp v30.8h, v28.16b\n"
"sadalp v27.8h, v25.16b\n"
"sadalp v24.8h, v22.16b\n"
- "add x9, x9, #0x1\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
+ "add x10, x10, #0x1\n"
"bge 56b\n"
"58:" // Height 4: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "ldr q22, [x23, #0x0]\n"
- "add x26, x26, #0x10\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "sub x28, x28, #0x10\n"
"sadalp v1.8h, v31.16b\n"
+ "ldr q25, [x25, #0x0]\n"
+ "ldr q22, [x24, #0x0]\n"
"sadalp v30.8h, v28.16b\n"
"sadalp v27.8h, v25.16b\n"
"sadalp v24.8h, v22.16b\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
"59:" // Height 4: Multiply loop: Main loop skip
- "cbz x27, 68f\n"
- "tbz x27, #3, 63f\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d25, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "tbz x27, #2, 61f\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v25.s }[2], [x24], #0x4\n"
- "ld1 { v22.s }[2], [x23], #0x4\n"
- "tbz x27, #1, 60f\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v28.h }[6], [x25], #0x2\n"
- "ld1 { v25.h }[6], [x24], #0x2\n"
- "ld1 { v22.h }[6], [x23], #0x2\n"
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[14], [x26]\n"
- "ld1 { v28.b }[14], [x25]\n"
- "ld1 { v25.b }[14], [x24]\n"
- "ld1 { v22.b }[14], [x23]\n"
+ "cbz x28, 68f\n"
+ "tbz x28, #3, 63f\n"
+ "ldr d31, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d22, [x24], #0x8\n"
+ "tbz x28, #2, 61f\n"
+ "ld1 { v31.s }[2], [x27], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v22.s }[2], [x24], #0x4\n"
+ "tbz x28, #1, 60f\n"
+ "ld1 { v31.h }[6], [x27], #0x2\n"
+ "ld1 { v28.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
+ "ld1 { v22.h }[6], [x24], #0x2\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[14], [x27]\n"
+ "ld1 { v28.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
+ "ld1 { v22.b }[14], [x24]\n"
"b 67f\n"
"60:" // Height 4: Multiply loop: Ragged operand read: partial_1_12
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[12], [x26]\n"
- "ld1 { v28.b }[12], [x25]\n"
- "ld1 { v25.b }[12], [x24]\n"
- "ld1 { v22.b }[12], [x23]\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[12], [x27]\n"
+ "ld1 { v28.b }[12], [x26]\n"
+ "ld1 { v25.b }[12], [x25]\n"
+ "ld1 { v22.b }[12], [x24]\n"
"b 67f\n"
"61:" // Height 4: Multiply loop: Ragged operand read: partial_2_8
- "tbz x27, #1, 62f\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v28.h }[4], [x25], #0x2\n"
- "ld1 { v25.h }[4], [x24], #0x2\n"
- "ld1 { v22.h }[4], [x23], #0x2\n"
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[10], [x26]\n"
- "ld1 { v28.b }[10], [x25]\n"
- "ld1 { v25.b }[10], [x24]\n"
- "ld1 { v22.b }[10], [x23]\n"
+ "tbz x28, #1, 62f\n"
+ "ld1 { v31.h }[4], [x27], #0x2\n"
+ "ld1 { v28.h }[4], [x26], #0x2\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
+ "ld1 { v22.h }[4], [x24], #0x2\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[10], [x27]\n"
+ "ld1 { v28.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
+ "ld1 { v22.b }[10], [x24]\n"
"b 67f\n"
"62:" // Height 4: Multiply loop: Ragged operand read: partial_1_8
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[8], [x26]\n"
- "ld1 { v28.b }[8], [x25]\n"
- "ld1 { v25.b }[8], [x24]\n"
- "ld1 { v22.b }[8], [x23]\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[8], [x27]\n"
+ "ld1 { v28.b }[8], [x26]\n"
+ "ld1 { v25.b }[8], [x25]\n"
+ "ld1 { v22.b }[8], [x24]\n"
"b 67f\n"
"63:" // Height 4: Multiply loop: Ragged operand read: partial_4_0
- "tbz x27, #2, 65f\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s25, [x24], #0x4\n"
- "ldr s22, [x23], #0x4\n"
- "tbz x27, #1, 64f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
- "ld1 { v22.h }[2], [x23], #0x2\n"
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v25.b }[6], [x24]\n"
- "ld1 { v22.b }[6], [x23]\n"
+ "tbz x28, #2, 65f\n"
+ "ldr s31, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s22, [x24], #0x4\n"
+ "tbz x28, #1, 64f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v22.b }[6], [x24]\n"
"b 67f\n"
"64:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v25.b }[4], [x24]\n"
- "ld1 { v22.b }[4], [x23]\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v22.b }[4], [x24]\n"
"b 67f\n"
"65:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 66f\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h25, [x24], #0x2\n"
- "ldr h22, [x23], #0x2\n"
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v25.b }[2], [x24]\n"
- "ld1 { v22.b }[2], [x23]\n"
+ "tbz x28, #1, 66f\n"
+ "ldr h31, [x27], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h25, [x25], #0x2\n"
+ "ldr h22, [x24], #0x2\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v22.b }[2], [x24]\n"
"b 67f\n"
"66:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b31, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b25, [x24, #0x0]\n"
- "ldr b22, [x23, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
+ "ldr b22, [x24, #0x0]\n"
"67:" // Height 4: Multiply loop: Ragged operand read: Done
"sadalp v1.8h, v31.16b\n"
"sadalp v30.8h, v28.16b\n"
"sadalp v27.8h, v25.16b\n"
"sadalp v24.8h, v22.16b\n"
"68:" // Height 4: Multiply loop: No odd multiplies
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x21\n"
"bne 53b\n"
"sadalp v0.4s, v1.8h\n"
"sadalp v29.4s, v30.8h\n"
- "addp v0.4s, v0.4s, v29.4s\n"
"sadalp v26.4s, v27.8h\n"
"sadalp v23.4s, v24.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
"addp v29.4s, v26.4s, v23.4s\n"
"addp v0.4s, v0.4s, v29.4s\n"
"mul v0.4s, v0.4s, v2.4s\n"
@@ -661,12 +661,12 @@ void row_sums_indirect(
"b 104f\n"
"69:" // Height 5
"movi v1.8h, #0x0\n"
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "mov x9, #0x0\n"
"movi v0.4s, #0x0\n"
- "mov x28, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
"movi v30.8h, #0x0\n"
"movi v29.4s, #0x0\n"
+ "mov x9, #0x0\n"
"movi v27.8h, #0x0\n"
"movi v26.4s, #0x0\n"
"movi v24.8h, #0x0\n"
@@ -674,50 +674,51 @@ void row_sums_indirect(
"movi v21.8h, #0x0\n"
"movi v20.4s, #0x0\n"
"70:" // Height 5: String loop
- "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w27, [x19, x28, LSL #0x2]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
"tbz %x[flags], #3, 71f\n"
- "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x19, x19, %x[input_offset], LSL #3\n"
- "ldr x26, [x19, #0x0]\n"
- "ldr x25, [x19, #0x8]\n"
- "ldr x24, [x19, #0x10]\n"
- "ldr x23, [x19, #0x18]\n"
- "ldr x22, [x19, #0x20]\n"
- "cbnz x28, 72f\n"
- "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, %x[input_offset], LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "cbnz x9, 72f\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
"b 72f\n"
"71:" // Height 5: setup direct input
- "mov x26, %x[input_ptr]\n"
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, %x[input_offset]\n"
"add x25, x26, %x[input_offset]\n"
"add x24, x25, %x[input_offset]\n"
"add x23, x24, %x[input_offset]\n"
- "add x22, x23, %x[input_offset]\n"
"72:" // Height 5: input setup done
- "cmp x27, #0x10\n"
+ "cmp x28, #0x10\n"
"blt 76f\n"
- "cmp x27, #0x20\n"
+ "cmp x28, #0x20\n"
"blt 75f\n"
"73:" // Height 5: Multiply loop: Main loop head
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "ldr q22, [x23, #0x0]\n"
- "ldr q19, [x22, #0x0]\n"
- "cmp x9, #0x7e\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "cmp x10, #0x7e\n"
+ "add x27, x27, #0x10\n"
+ "ldr q25, [x25, #0x0]\n"
+ "ldr q22, [x24, #0x0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
+ "ldr q19, [x23, #0x0]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
"blt 74f\n"
"sadalp v0.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
+ "mov x10, #0x0\n"
"sadalp v29.4s, v30.8h\n"
"movi v30.8h, #0x0\n"
"sadalp v26.4s, v27.8h\n"
@@ -726,139 +727,138 @@ void row_sums_indirect(
"movi v24.8h, #0x0\n"
"sadalp v20.4s, v21.8h\n"
"movi v21.8h, #0x0\n"
- "mov x9, #0x0\n"
"74:" // Height 5: Multiply loop: unique 5: no collapse
+ "sub x28, x28, #0x10\n"
+ "cmp x28, #0x20\n"
"sadalp v1.8h, v31.16b\n"
"sadalp v30.8h, v28.16b\n"
"sadalp v27.8h, v25.16b\n"
"sadalp v24.8h, v22.16b\n"
+ "add x10, x10, #0x1\n"
"sadalp v21.8h, v19.16b\n"
- "add x9, x9, #0x1\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
"bge 73b\n"
"75:" // Height 5: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "ldr q22, [x23, #0x0]\n"
- "ldr q19, [x22, #0x0]\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "sub x28, x28, #0x10\n"
"sadalp v1.8h, v31.16b\n"
+ "ldr q25, [x25, #0x0]\n"
+ "ldr q22, [x24, #0x0]\n"
"sadalp v30.8h, v28.16b\n"
"sadalp v27.8h, v25.16b\n"
+ "ldr q19, [x23, #0x0]\n"
"sadalp v24.8h, v22.16b\n"
"sadalp v21.8h, v19.16b\n"
+ "add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
"76:" // Height 5: Multiply loop: Main loop skip
- "cbz x27, 85f\n"
- "tbz x27, #3, 80f\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d25, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "tbz x27, #2, 78f\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v25.s }[2], [x24], #0x4\n"
- "ld1 { v22.s }[2], [x23], #0x4\n"
- "ld1 { v19.s }[2], [x22], #0x4\n"
- "tbz x27, #1, 77f\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v28.h }[6], [x25], #0x2\n"
- "ld1 { v25.h }[6], [x24], #0x2\n"
- "ld1 { v22.h }[6], [x23], #0x2\n"
- "ld1 { v19.h }[6], [x22], #0x2\n"
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[14], [x26]\n"
- "ld1 { v28.b }[14], [x25]\n"
- "ld1 { v25.b }[14], [x24]\n"
- "ld1 { v22.b }[14], [x23]\n"
- "ld1 { v19.b }[14], [x22]\n"
+ "cbz x28, 85f\n"
+ "tbz x28, #3, 80f\n"
+ "ldr d31, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d22, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x28, #2, 78f\n"
+ "ld1 { v31.s }[2], [x27], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v22.s }[2], [x24], #0x4\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n"
+ "tbz x28, #1, 77f\n"
+ "ld1 { v31.h }[6], [x27], #0x2\n"
+ "ld1 { v28.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
+ "ld1 { v22.h }[6], [x24], #0x2\n"
+ "ld1 { v19.h }[6], [x23], #0x2\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[14], [x27]\n"
+ "ld1 { v28.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
+ "ld1 { v22.b }[14], [x24]\n"
+ "ld1 { v19.b }[14], [x23]\n"
"b 84f\n"
"77:" // Height 5: Multiply loop: Ragged operand read: partial_1_12
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[12], [x26]\n"
- "ld1 { v28.b }[12], [x25]\n"
- "ld1 { v25.b }[12], [x24]\n"
- "ld1 { v22.b }[12], [x23]\n"
- "ld1 { v19.b }[12], [x22]\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[12], [x27]\n"
+ "ld1 { v28.b }[12], [x26]\n"
+ "ld1 { v25.b }[12], [x25]\n"
+ "ld1 { v22.b }[12], [x24]\n"
+ "ld1 { v19.b }[12], [x23]\n"
"b 84f\n"
"78:" // Height 5: Multiply loop: Ragged operand read: partial_2_8
- "tbz x27, #1, 79f\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v28.h }[4], [x25], #0x2\n"
- "ld1 { v25.h }[4], [x24], #0x2\n"
- "ld1 { v22.h }[4], [x23], #0x2\n"
- "ld1 { v19.h }[4], [x22], #0x2\n"
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[10], [x26]\n"
- "ld1 { v28.b }[10], [x25]\n"
- "ld1 { v25.b }[10], [x24]\n"
- "ld1 { v22.b }[10], [x23]\n"
- "ld1 { v19.b }[10], [x22]\n"
+ "tbz x28, #1, 79f\n"
+ "ld1 { v31.h }[4], [x27], #0x2\n"
+ "ld1 { v28.h }[4], [x26], #0x2\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
+ "ld1 { v22.h }[4], [x24], #0x2\n"
+ "ld1 { v19.h }[4], [x23], #0x2\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[10], [x27]\n"
+ "ld1 { v28.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
+ "ld1 { v22.b }[10], [x24]\n"
+ "ld1 { v19.b }[10], [x23]\n"
"b 84f\n"
"79:" // Height 5: Multiply loop: Ragged operand read: partial_1_8
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[8], [x26]\n"
- "ld1 { v28.b }[8], [x25]\n"
- "ld1 { v25.b }[8], [x24]\n"
- "ld1 { v22.b }[8], [x23]\n"
- "ld1 { v19.b }[8], [x22]\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[8], [x27]\n"
+ "ld1 { v28.b }[8], [x26]\n"
+ "ld1 { v25.b }[8], [x25]\n"
+ "ld1 { v22.b }[8], [x24]\n"
+ "ld1 { v19.b }[8], [x23]\n"
"b 84f\n"
"80:" // Height 5: Multiply loop: Ragged operand read: partial_4_0
- "tbz x27, #2, 82f\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s25, [x24], #0x4\n"
- "ldr s22, [x23], #0x4\n"
- "ldr s19, [x22], #0x4\n"
- "tbz x27, #1, 81f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
- "ld1 { v22.h }[2], [x23], #0x2\n"
- "ld1 { v19.h }[2], [x22], #0x2\n"
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v25.b }[6], [x24]\n"
- "ld1 { v22.b }[6], [x23]\n"
- "ld1 { v19.b }[6], [x22]\n"
+ "tbz x28, #2, 82f\n"
+ "ldr s31, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "tbz x28, #1, 81f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v19.h }[2], [x23], #0x2\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v19.b }[6], [x23]\n"
"b 84f\n"
"81:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v25.b }[4], [x24]\n"
- "ld1 { v22.b }[4], [x23]\n"
- "ld1 { v19.b }[4], [x22]\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v19.b }[4], [x23]\n"
"b 84f\n"
"82:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 83f\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h25, [x24], #0x2\n"
- "ldr h22, [x23], #0x2\n"
- "ldr h19, [x22], #0x2\n"
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v25.b }[2], [x24]\n"
- "ld1 { v22.b }[2], [x23]\n"
- "ld1 { v19.b }[2], [x22]\n"
+ "tbz x28, #1, 83f\n"
+ "ldr h31, [x27], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h25, [x25], #0x2\n"
+ "ldr h22, [x24], #0x2\n"
+ "ldr h19, [x23], #0x2\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v19.b }[2], [x23]\n"
"b 84f\n"
"83:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
- "ldr b31, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b25, [x24, #0x0]\n"
- "ldr b22, [x23, #0x0]\n"
- "ldr b19, [x22, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
+ "ldr b22, [x24, #0x0]\n"
+ "ldr b19, [x23, #0x0]\n"
"84:" // Height 5: Multiply loop: Ragged operand read: Done
"sadalp v1.8h, v31.16b\n"
"sadalp v30.8h, v28.16b\n"
@@ -866,32 +866,32 @@ void row_sums_indirect(
"sadalp v24.8h, v22.16b\n"
"sadalp v21.8h, v19.16b\n"
"85:" // Height 5: Multiply loop: No odd multiplies
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x21\n"
"bne 70b\n"
"sadalp v0.4s, v1.8h\n"
"sadalp v29.4s, v30.8h\n"
- "addp v0.4s, v0.4s, v29.4s\n"
"sadalp v26.4s, v27.8h\n"
"sadalp v23.4s, v24.8h\n"
- "addp v29.4s, v26.4s, v23.4s\n"
"sadalp v20.4s, v21.8h\n"
"addp v0.4s, v0.4s, v29.4s\n"
+ "addp v29.4s, v26.4s, v23.4s\n"
+ "addp v20.4s, v20.4s, v20.4s\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
"addp v20.4s, v20.4s, v20.4s\n"
"mul v0.4s, v0.4s, v2.4s\n"
"st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
- "addp v20.4s, v20.4s, v20.4s\n"
"mul v20.4s, v20.4s, v2.4s\n"
"str s20, [%x[out_ptr]], #0x4\n"
"b 104f\n"
"86:" // Height 6
"movi v1.8h, #0x0\n"
- "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "mov x9, #0x0\n"
"movi v0.4s, #0x0\n"
- "mov x28, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w22, [%x[args_ptr], %[offsetof_num_strings]]\n"
"movi v30.8h, #0x0\n"
"movi v29.4s, #0x0\n"
+ "mov x9, #0x0\n"
"movi v27.8h, #0x0\n"
"movi v26.4s, #0x0\n"
"movi v24.8h, #0x0\n"
@@ -901,55 +901,56 @@ void row_sums_indirect(
"movi v18.8h, #0x0\n"
"movi v17.4s, #0x0\n"
"87:" // Height 6: String loop
- "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w27, [x19, x28, LSL #0x2]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
"tbz %x[flags], #3, 88f\n"
- "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x19, x19, %x[input_offset], LSL #3\n"
- "ldr x26, [x19, #0x0]\n"
- "ldr x25, [x19, #0x8]\n"
- "ldr x24, [x19, #0x10]\n"
- "ldr x23, [x19, #0x18]\n"
- "ldr x22, [x19, #0x20]\n"
- "ldr x20, [x19, #0x28]\n"
- "cbnz x28, 89f\n"
- "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, %x[input_offset], LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x9, 89f\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x21, x21, x20\n"
"b 89f\n"
"88:" // Height 6: setup direct input
- "mov x26, %x[input_ptr]\n"
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, %x[input_offset]\n"
"add x25, x26, %x[input_offset]\n"
"add x24, x25, %x[input_offset]\n"
"add x23, x24, %x[input_offset]\n"
- "add x22, x23, %x[input_offset]\n"
- "add x20, x22, %x[input_offset]\n"
+ "add x21, x23, %x[input_offset]\n"
"89:" // Height 6: input setup done
- "cmp x27, #0x10\n"
+ "cmp x28, #0x10\n"
"blt 93f\n"
- "cmp x27, #0x20\n"
+ "cmp x28, #0x20\n"
"blt 92f\n"
"90:" // Height 6: Multiply loop: Main loop head
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "ldr q22, [x23, #0x0]\n"
- "ldr q19, [x22, #0x0]\n"
- "ldr q16, [x20, #0x0]\n"
- "cmp x9, #0x7e\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "cmp x10, #0x7e\n"
+ "add x27, x27, #0x10\n"
+ "ldr q25, [x25, #0x0]\n"
+ "ldr q22, [x24, #0x0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
+ "ldr q19, [x23, #0x0]\n"
+ "ldr q16, [x21, #0x0]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
"blt 91f\n"
"sadalp v0.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
+ "mov x10, #0x0\n"
"sadalp v29.4s, v30.8h\n"
"movi v30.8h, #0x0\n"
"sadalp v26.4s, v27.8h\n"
@@ -960,158 +961,157 @@ void row_sums_indirect(
"movi v21.8h, #0x0\n"
"sadalp v17.4s, v18.8h\n"
"movi v18.8h, #0x0\n"
- "mov x9, #0x0\n"
"91:" // Height 6: Multiply loop: unique 6: no collapse
+ "sub x28, x28, #0x10\n"
+ "cmp x28, #0x20\n"
"sadalp v1.8h, v31.16b\n"
"sadalp v30.8h, v28.16b\n"
"sadalp v27.8h, v25.16b\n"
"sadalp v24.8h, v22.16b\n"
+ "add x10, x10, #0x1\n"
"sadalp v21.8h, v19.16b\n"
"sadalp v18.8h, v16.16b\n"
- "add x9, x9, #0x1\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
"bge 90b\n"
"92:" // Height 6: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "ldr q22, [x23, #0x0]\n"
- "ldr q19, [x22, #0x0]\n"
- "ldr q16, [x20, #0x0]\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "sub x28, x28, #0x10\n"
"sadalp v1.8h, v31.16b\n"
+ "ldr q25, [x25, #0x0]\n"
+ "ldr q22, [x24, #0x0]\n"
"sadalp v30.8h, v28.16b\n"
"sadalp v27.8h, v25.16b\n"
+ "ldr q19, [x23, #0x0]\n"
+ "ldr q16, [x21, #0x0]\n"
"sadalp v24.8h, v22.16b\n"
"sadalp v21.8h, v19.16b\n"
"sadalp v18.8h, v16.16b\n"
+ "add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
"93:" // Height 6: Multiply loop: Main loop skip
- "cbz x27, 102f\n"
- "tbz x27, #3, 97f\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d25, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d16, [x20], #0x8\n"
- "tbz x27, #2, 95f\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v25.s }[2], [x24], #0x4\n"
- "ld1 { v22.s }[2], [x23], #0x4\n"
- "ld1 { v19.s }[2], [x22], #0x4\n"
- "ld1 { v16.s }[2], [x20], #0x4\n"
- "tbz x27, #1, 94f\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v28.h }[6], [x25], #0x2\n"
- "ld1 { v25.h }[6], [x24], #0x2\n"
- "ld1 { v22.h }[6], [x23], #0x2\n"
- "ld1 { v19.h }[6], [x22], #0x2\n"
- "ld1 { v16.h }[6], [x20], #0x2\n"
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[14], [x26]\n"
- "ld1 { v28.b }[14], [x25]\n"
- "ld1 { v25.b }[14], [x24]\n"
- "ld1 { v22.b }[14], [x23]\n"
- "ld1 { v19.b }[14], [x22]\n"
- "ld1 { v16.b }[14], [x20]\n"
+ "cbz x28, 102f\n"
+ "tbz x28, #3, 97f\n"
+ "ldr d31, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d22, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
+ "tbz x28, #2, 95f\n"
+ "ld1 { v31.s }[2], [x27], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v22.s }[2], [x24], #0x4\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n"
+ "ld1 { v16.s }[2], [x21], #0x4\n"
+ "tbz x28, #1, 94f\n"
+ "ld1 { v31.h }[6], [x27], #0x2\n"
+ "ld1 { v28.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
+ "ld1 { v22.h }[6], [x24], #0x2\n"
+ "ld1 { v19.h }[6], [x23], #0x2\n"
+ "ld1 { v16.h }[6], [x21], #0x2\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[14], [x27]\n"
+ "ld1 { v28.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
+ "ld1 { v22.b }[14], [x24]\n"
+ "ld1 { v19.b }[14], [x23]\n"
+ "ld1 { v16.b }[14], [x21]\n"
"b 101f\n"
"94:" // Height 6: Multiply loop: Ragged operand read: partial_1_12
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[12], [x26]\n"
- "ld1 { v28.b }[12], [x25]\n"
- "ld1 { v25.b }[12], [x24]\n"
- "ld1 { v22.b }[12], [x23]\n"
- "ld1 { v19.b }[12], [x22]\n"
- "ld1 { v16.b }[12], [x20]\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[12], [x27]\n"
+ "ld1 { v28.b }[12], [x26]\n"
+ "ld1 { v25.b }[12], [x25]\n"
+ "ld1 { v22.b }[12], [x24]\n"
+ "ld1 { v19.b }[12], [x23]\n"
+ "ld1 { v16.b }[12], [x21]\n"
"b 101f\n"
"95:" // Height 6: Multiply loop: Ragged operand read: partial_2_8
- "tbz x27, #1, 96f\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v28.h }[4], [x25], #0x2\n"
- "ld1 { v25.h }[4], [x24], #0x2\n"
- "ld1 { v22.h }[4], [x23], #0x2\n"
- "ld1 { v19.h }[4], [x22], #0x2\n"
- "ld1 { v16.h }[4], [x20], #0x2\n"
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[10], [x26]\n"
- "ld1 { v28.b }[10], [x25]\n"
- "ld1 { v25.b }[10], [x24]\n"
- "ld1 { v22.b }[10], [x23]\n"
- "ld1 { v19.b }[10], [x22]\n"
- "ld1 { v16.b }[10], [x20]\n"
+ "tbz x28, #1, 96f\n"
+ "ld1 { v31.h }[4], [x27], #0x2\n"
+ "ld1 { v28.h }[4], [x26], #0x2\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
+ "ld1 { v22.h }[4], [x24], #0x2\n"
+ "ld1 { v19.h }[4], [x23], #0x2\n"
+ "ld1 { v16.h }[4], [x21], #0x2\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[10], [x27]\n"
+ "ld1 { v28.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
+ "ld1 { v22.b }[10], [x24]\n"
+ "ld1 { v19.b }[10], [x23]\n"
+ "ld1 { v16.b }[10], [x21]\n"
"b 101f\n"
"96:" // Height 6: Multiply loop: Ragged operand read: partial_1_8
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[8], [x26]\n"
- "ld1 { v28.b }[8], [x25]\n"
- "ld1 { v25.b }[8], [x24]\n"
- "ld1 { v22.b }[8], [x23]\n"
- "ld1 { v19.b }[8], [x22]\n"
- "ld1 { v16.b }[8], [x20]\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[8], [x27]\n"
+ "ld1 { v28.b }[8], [x26]\n"
+ "ld1 { v25.b }[8], [x25]\n"
+ "ld1 { v22.b }[8], [x24]\n"
+ "ld1 { v19.b }[8], [x23]\n"
+ "ld1 { v16.b }[8], [x21]\n"
"b 101f\n"
"97:" // Height 6: Multiply loop: Ragged operand read: partial_4_0
- "tbz x27, #2, 99f\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s25, [x24], #0x4\n"
- "ldr s22, [x23], #0x4\n"
- "ldr s19, [x22], #0x4\n"
- "ldr s16, [x20], #0x4\n"
- "tbz x27, #1, 98f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
- "ld1 { v22.h }[2], [x23], #0x2\n"
- "ld1 { v19.h }[2], [x22], #0x2\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v25.b }[6], [x24]\n"
- "ld1 { v22.b }[6], [x23]\n"
- "ld1 { v19.b }[6], [x22]\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "tbz x28, #2, 99f\n"
+ "ldr s31, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "ldr s16, [x21], #0x4\n"
+ "tbz x28, #1, 98f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v19.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x21], #0x2\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v19.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x21]\n"
"b 101f\n"
"98:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v25.b }[4], [x24]\n"
- "ld1 { v22.b }[4], [x23]\n"
- "ld1 { v19.b }[4], [x22]\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v19.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x21]\n"
"b 101f\n"
"99:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 100f\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h25, [x24], #0x2\n"
- "ldr h22, [x23], #0x2\n"
- "ldr h19, [x22], #0x2\n"
- "ldr h16, [x20], #0x2\n"
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v25.b }[2], [x24]\n"
- "ld1 { v22.b }[2], [x23]\n"
- "ld1 { v19.b }[2], [x22]\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x28, #1, 100f\n"
+ "ldr h31, [x27], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h25, [x25], #0x2\n"
+ "ldr h22, [x24], #0x2\n"
+ "ldr h19, [x23], #0x2\n"
+ "ldr h16, [x21], #0x2\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v19.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x21]\n"
"b 101f\n"
"100:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
- "ldr b31, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b25, [x24, #0x0]\n"
- "ldr b22, [x23, #0x0]\n"
- "ldr b19, [x22, #0x0]\n"
- "ldr b16, [x20, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
+ "ldr b22, [x24, #0x0]\n"
+ "ldr b19, [x23, #0x0]\n"
+ "ldr b16, [x21, #0x0]\n"
"101:" // Height 6: Multiply loop: Ragged operand read: Done
"sadalp v1.8h, v31.16b\n"
"sadalp v30.8h, v28.16b\n"
@@ -1120,23 +1120,23 @@ void row_sums_indirect(
"sadalp v21.8h, v19.16b\n"
"sadalp v18.8h, v16.16b\n"
"102:" // Height 6: Multiply loop: No odd multiplies
- "add x28, x28, #0x1\n"
- "cmp x28, x21\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x22\n"
"bne 87b\n"
"sadalp v0.4s, v1.8h\n"
"sadalp v29.4s, v30.8h\n"
- "addp v0.4s, v0.4s, v29.4s\n"
+ "subs %x[M], %x[M], #0x6\n"
"sadalp v26.4s, v27.8h\n"
"sadalp v23.4s, v24.8h\n"
- "addp v29.4s, v26.4s, v23.4s\n"
"sadalp v20.4s, v21.8h\n"
"sadalp v17.4s, v18.8h\n"
"addp v0.4s, v0.4s, v29.4s\n"
- "subs %x[M], %x[M], #0x6\n"
+ "addp v29.4s, v26.4s, v23.4s\n"
"addp v20.4s, v20.4s, v17.4s\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "addp v20.4s, v20.4s, v20.4s\n"
"mul v0.4s, v0.4s, v2.4s\n"
"st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
- "addp v20.4s, v20.4s, v20.4s\n"
"mul v20.4s, v20.4s, v2.4s\n"
"str d20, [%x[out_ptr]], #0x8\n"
"beq 104f\n"
@@ -1144,14 +1144,14 @@ void row_sums_indirect(
"add %x[input_offset], %x[input_offset], #0x6\n"
"b 1b\n"
"103:" // Update direct input
- "mov x19, #0x6\n"
- "madd %x[input_ptr], x19, %x[input_offset], %x[input_ptr]\n"
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, %x[input_offset], %x[input_ptr]\n"
"b 1b\n"
"104:" // Exit
- : [M] "+r" (M), [input_offset] "+r" (input_offset), [input_ptr] "+r" (input_ptr), [out_ptr] "+r" (out_ptr)
+ : [M] "+&r" (M), [input_offset] "+&r" (input_offset), [input_ptr] "+&r" (input_ptr), [out_ptr] "+&r" (out_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
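
// For readers of this diff: a minimal scalar sketch (not part of the patch,
// and not the generated kernel) of the two-level accumulation the assembly
// above performs per row. Bytes are pairwise-accumulated into int16 lanes
// (sadalp v1.8h, v31.16b — uadalp in the unsigned variant below), and the
// int16 accumulator is drained into int32 lanes (sadalp v0.4s, v1.8h) every
// 126 sixteen-byte vectors, before any int16 lane can overflow; that is the
// "cmp x10, #0x7e" test in each Multiply-loop head. The final row sum is
// scaled by the negated bias offset (v2 is ld1r of qp->b_offset followed by
// neg) before being stored through out_ptr. The function name and the flat
// per-row addressing here are illustrative only; the real kernel walks
// num_strings segments via IndirectInputArg.
//
// static inline int32_t row_sum_ref(const int8_t *src, size_t len,
//                                   int32_t b_offset)
// {
//     int32_t sum32       = 0;
//     int16_t sum16       = 0; // stands in for one lane of v1.8h
//     size_t  since_flush = 0;
//     for (size_t i = 0; i < len; i++) {
//         sum16 = static_cast<int16_t>(sum16 + src[i]);
//         // A vector lane absorbs 2 bytes per sadalp and is flushed after
//         // 126 vectors; the scalar analogue flushes after 252 bytes, so
//         // |sum16| stays below 252 * 128 = 32256 < INT16_MAX.
//         if (++since_flush == 126 * 2) {
//             sum32 += sum16;
//             sum16       = 0;
//             since_flush = 0;
//         }
//     }
//     sum32 += sum16;
//     return sum32 * -b_offset; // mul v0.4s, v0.4s, v2.4s with v2 = -b_offset
// }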
diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp
index f5709d92ac..2ab0397fda 100644
--- a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,16 +10,16 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#ifdef __aarch64__
@@ -34,7 +34,7 @@ namespace arm_gemm {
template<>
void row_sums_indirect(
- unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
size_t M, int32_t *out_ptr, const Requantize32 *qp
)
{
@@ -63,8 +63,8 @@ void row_sums_indirect(
ka.string_lengths = string_lengths;
__asm__ __volatile__(
- "add x19, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x19]\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x20]\n"
"neg v2.4s, v2.4s\n"
"1:" // Row loop
"cmp %x[M], #0x6\n"
@@ -76,97 +76,97 @@ void row_sums_indirect(
"bgt 35f\n"
"beq 18f\n"
"movi v1.8h, #0x0\n"
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
"movi v0.4s, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
"mov x9, #0x0\n"
- "mov x28, #0x0\n"
"2:" // Height 1: String loop
- "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w27, [x19, x28, LSL #0x2]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
"tbz %x[flags], #3, 3f\n"
- "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x19, x19, %x[input_offset], LSL #3\n"
- "ldr x26, [x19, #0x0]\n"
- "cbnz x28, 4f\n"
- "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, %x[input_offset], LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "cbnz x9, 4f\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x20\n"
"b 4f\n"
"3:" // Height 1: setup direct input
- "mov x26, %x[input_ptr]\n"
+ "mov x27, %x[input_ptr]\n"
"4:" // Height 1: input setup done
- "cmp x27, #0x10\n"
+ "cmp x28, #0x10\n"
"blt 8f\n"
- "cmp x27, #0x20\n"
+ "cmp x28, #0x20\n"
"blt 7f\n"
"5:" // Height 1: Multiply loop: Main loop head
- "ldr q31, [x26, #0x0]\n"
- "cmp x9, #0x7e\n"
- "add x26, x26, #0x10\n"
+ "ldr q31, [x27, #0x0]\n"
+ "cmp x10, #0x7e\n"
+ "add x27, x27, #0x10\n"
"blt 6f\n"
"uadalp v0.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
- "mov x9, #0x0\n"
+ "mov x10, #0x0\n"
"6:" // Height 1: Multiply loop: unique 1: no collapse
+ "sub x28, x28, #0x10\n"
+ "cmp x28, #0x20\n"
"uadalp v1.8h, v31.16b\n"
- "add x9, x9, #0x1\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
+ "add x10, x10, #0x1\n"
"bge 5b\n"
"7:" // Height 1: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q31, [x26, #0x0]\n"
- "add x26, x26, #0x10\n"
+ "ldr q31, [x27, #0x0]\n"
+ "sub x28, x28, #0x10\n"
"uadalp v1.8h, v31.16b\n"
+ "add x27, x27, #0x10\n"
"8:" // Height 1: Multiply loop: Main loop skip
- "cbz x27, 17f\n"
- "tbz x27, #3, 12f\n"
- "ldr d31, [x26], #0x8\n"
- "tbz x27, #2, 10f\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "tbz x27, #1, 9f\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[14], [x26]\n"
+ "cbz x28, 17f\n"
+ "tbz x28, #3, 12f\n"
+ "ldr d31, [x27], #0x8\n"
+ "tbz x28, #2, 10f\n"
+ "ld1 { v31.s }[2], [x27], #0x4\n"
+ "tbz x28, #1, 9f\n"
+ "ld1 { v31.h }[6], [x27], #0x2\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[14], [x27]\n"
"b 16f\n"
"9:" // Height 1: Multiply loop: Ragged operand read: partial_1_12
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[12], [x26]\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[12], [x27]\n"
"b 16f\n"
"10:" // Height 1: Multiply loop: Ragged operand read: partial_2_8
- "tbz x27, #1, 11f\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[10], [x26]\n"
+ "tbz x28, #1, 11f\n"
+ "ld1 { v31.h }[4], [x27], #0x2\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[10], [x27]\n"
"b 16f\n"
"11:" // Height 1: Multiply loop: Ragged operand read: partial_1_8
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[8], [x26]\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[8], [x27]\n"
"b 16f\n"
"12:" // Height 1: Multiply loop: Ragged operand read: partial_4_0
- "tbz x27, #2, 14f\n"
- "ldr s31, [x26], #0x4\n"
- "tbz x27, #1, 13f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[6], [x26]\n"
+ "tbz x28, #2, 14f\n"
+ "ldr s31, [x27], #0x4\n"
+ "tbz x28, #1, 13f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[6], [x27]\n"
"b 16f\n"
"13:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[4], [x26]\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[4], [x27]\n"
"b 16f\n"
"14:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 15f\n"
- "ldr h31, [x26], #0x2\n"
- "tbz x27, #0, 16f\n"
- "ld1 { v31.b }[2], [x26]\n"
+ "tbz x28, #1, 15f\n"
+ "ldr h31, [x27], #0x2\n"
+ "tbz x28, #0, 16f\n"
+ "ld1 { v31.b }[2], [x27]\n"
"b 16f\n"
"15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b31, [x26, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
"16:" // Height 1: Multiply loop: Ragged operand read: Done
"uadalp v1.8h, v31.16b\n"
"17:" // Height 1: Multiply loop: No odd multiplies
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x21\n"
"bne 2b\n"
"uadalp v0.4s, v1.8h\n"
"addp v0.4s, v0.4s, v0.4s\n"
@@ -176,126 +176,126 @@ void row_sums_indirect(
"b 104f\n"
"18:" // Height 2
"movi v1.8h, #0x0\n"
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "mov x9, #0x0\n"
"movi v0.4s, #0x0\n"
- "mov x28, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
"movi v30.8h, #0x0\n"
"movi v29.4s, #0x0\n"
+ "mov x9, #0x0\n"
"19:" // Height 2: String loop
- "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w27, [x19, x28, LSL #0x2]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
"tbz %x[flags], #3, 20f\n"
- "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x19, x19, %x[input_offset], LSL #3\n"
- "ldr x26, [x19, #0x0]\n"
- "ldr x25, [x19, #0x8]\n"
- "cbnz x28, 21f\n"
- "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x25, x25, x19\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, %x[input_offset], LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "cbnz x9, 21f\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
"b 21f\n"
"20:" // Height 2: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x25, x26, %x[input_offset]\n"
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, %x[input_offset]\n"
"21:" // Height 2: input setup done
- "cmp x27, #0x10\n"
+ "cmp x28, #0x10\n"
"blt 25f\n"
- "cmp x27, #0x20\n"
+ "cmp x28, #0x20\n"
"blt 24f\n"
"22:" // Height 2: Multiply loop: Main loop head
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "cmp x9, #0x7e\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "cmp x10, #0x7e\n"
+ "add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
"blt 23f\n"
"uadalp v0.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
+ "mov x10, #0x0\n"
"uadalp v29.4s, v30.8h\n"
"movi v30.8h, #0x0\n"
- "mov x9, #0x0\n"
"23:" // Height 2: Multiply loop: unique 2: no collapse
+ "sub x28, x28, #0x10\n"
+ "cmp x28, #0x20\n"
"uadalp v1.8h, v31.16b\n"
"uadalp v30.8h, v28.16b\n"
- "add x9, x9, #0x1\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
+ "add x10, x10, #0x1\n"
"bge 22b\n"
"24:" // Height 2: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "sub x28, x28, #0x10\n"
"uadalp v1.8h, v31.16b\n"
"uadalp v30.8h, v28.16b\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"25:" // Height 2: Multiply loop: Main loop skip
- "cbz x27, 34f\n"
- "tbz x27, #3, 29f\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "tbz x27, #2, 27f\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "tbz x27, #1, 26f\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v28.h }[6], [x25], #0x2\n"
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[14], [x26]\n"
- "ld1 { v28.b }[14], [x25]\n"
+ "cbz x28, 34f\n"
+ "tbz x28, #3, 29f\n"
+ "ldr d31, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "tbz x28, #2, 27f\n"
+ "ld1 { v31.s }[2], [x27], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "tbz x28, #1, 26f\n"
+ "ld1 { v31.h }[6], [x27], #0x2\n"
+ "ld1 { v28.h }[6], [x26], #0x2\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[14], [x27]\n"
+ "ld1 { v28.b }[14], [x26]\n"
"b 33f\n"
"26:" // Height 2: Multiply loop: Ragged operand read: partial_1_12
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[12], [x26]\n"
- "ld1 { v28.b }[12], [x25]\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[12], [x27]\n"
+ "ld1 { v28.b }[12], [x26]\n"
"b 33f\n"
"27:" // Height 2: Multiply loop: Ragged operand read: partial_2_8
- "tbz x27, #1, 28f\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v28.h }[4], [x25], #0x2\n"
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[10], [x26]\n"
- "ld1 { v28.b }[10], [x25]\n"
+ "tbz x28, #1, 28f\n"
+ "ld1 { v31.h }[4], [x27], #0x2\n"
+ "ld1 { v28.h }[4], [x26], #0x2\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[10], [x27]\n"
+ "ld1 { v28.b }[10], [x26]\n"
"b 33f\n"
"28:" // Height 2: Multiply loop: Ragged operand read: partial_1_8
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[8], [x26]\n"
- "ld1 { v28.b }[8], [x25]\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[8], [x27]\n"
+ "ld1 { v28.b }[8], [x26]\n"
"b 33f\n"
"29:" // Height 2: Multiply loop: Ragged operand read: partial_4_0
- "tbz x27, #2, 31f\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "tbz x27, #1, 30f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
+ "tbz x28, #2, 31f\n"
+ "ldr s31, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "tbz x28, #1, 30f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
"b 33f\n"
"30:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
"b 33f\n"
"31:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 32f\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "tbz x27, #0, 33f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
+ "tbz x28, #1, 32f\n"
+ "ldr h31, [x27], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "tbz x28, #0, 33f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
"b 33f\n"
"32:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b31, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
"33:" // Height 2: Multiply loop: Ragged operand read: Done
"uadalp v1.8h, v31.16b\n"
"uadalp v30.8h, v28.16b\n"
"34:" // Height 2: Multiply loop: No odd multiplies
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x21\n"
"bne 19b\n"
"uadalp v0.4s, v1.8h\n"
"uadalp v29.4s, v30.8h\n"
@@ -306,354 +306,354 @@ void row_sums_indirect(
"b 104f\n"
"35:" // Height 3
"movi v1.8h, #0x0\n"
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "mov x9, #0x0\n"
"movi v0.4s, #0x0\n"
- "mov x28, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
"movi v30.8h, #0x0\n"
"movi v29.4s, #0x0\n"
+ "mov x9, #0x0\n"
"movi v27.8h, #0x0\n"
"movi v26.4s, #0x0\n"
"36:" // Height 3: String loop
- "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w27, [x19, x28, LSL #0x2]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
"tbz %x[flags], #3, 37f\n"
- "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x19, x19, %x[input_offset], LSL #3\n"
- "ldr x26, [x19, #0x0]\n"
- "ldr x25, [x19, #0x8]\n"
- "ldr x24, [x19, #0x10]\n"
- "cbnz x28, 38f\n"
- "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, %x[input_offset], LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "cbnz x9, 38f\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
"b 38f\n"
"37:" // Height 3: setup direct input
- "mov x26, %x[input_ptr]\n"
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, %x[input_offset]\n"
"add x25, x26, %x[input_offset]\n"
- "add x24, x25, %x[input_offset]\n"
"38:" // Height 3: input setup done
- "cmp x27, #0x10\n"
+ "cmp x28, #0x10\n"
"blt 42f\n"
- "cmp x27, #0x20\n"
+ "cmp x28, #0x20\n"
"blt 41f\n"
"39:" // Height 3: Multiply loop: Main loop head
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "cmp x9, #0x7e\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "cmp x10, #0x7e\n"
+ "add x27, x27, #0x10\n"
+ "ldr q25, [x25, #0x0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
- "add x24, x24, #0x10\n"
"blt 40f\n"
"uadalp v0.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
+ "mov x10, #0x0\n"
"uadalp v29.4s, v30.8h\n"
"movi v30.8h, #0x0\n"
"uadalp v26.4s, v27.8h\n"
"movi v27.8h, #0x0\n"
- "mov x9, #0x0\n"
"40:" // Height 3: Multiply loop: unique 3: no collapse
+ "sub x28, x28, #0x10\n"
+ "cmp x28, #0x20\n"
"uadalp v1.8h, v31.16b\n"
"uadalp v30.8h, v28.16b\n"
"uadalp v27.8h, v25.16b\n"
- "add x9, x9, #0x1\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
+ "add x10, x10, #0x1\n"
"bge 39b\n"
"41:" // Height 3: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "add x26, x26, #0x10\n"
- "add x25, x25, #0x10\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "sub x28, x28, #0x10\n"
"uadalp v1.8h, v31.16b\n"
+ "ldr q25, [x25, #0x0]\n"
"uadalp v30.8h, v28.16b\n"
"uadalp v27.8h, v25.16b\n"
- "add x24, x24, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"42:" // Height 3: Multiply loop: Main loop skip
- "cbz x27, 51f\n"
- "tbz x27, #3, 46f\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d25, [x24], #0x8\n"
- "tbz x27, #2, 44f\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v25.s }[2], [x24], #0x4\n"
- "tbz x27, #1, 43f\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v28.h }[6], [x25], #0x2\n"
- "ld1 { v25.h }[6], [x24], #0x2\n"
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[14], [x26]\n"
- "ld1 { v28.b }[14], [x25]\n"
- "ld1 { v25.b }[14], [x24]\n"
+ "cbz x28, 51f\n"
+ "tbz x28, #3, 46f\n"
+ "ldr d31, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "tbz x28, #2, 44f\n"
+ "ld1 { v31.s }[2], [x27], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "tbz x28, #1, 43f\n"
+ "ld1 { v31.h }[6], [x27], #0x2\n"
+ "ld1 { v28.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[14], [x27]\n"
+ "ld1 { v28.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
"b 50f\n"
"43:" // Height 3: Multiply loop: Ragged operand read: partial_1_12
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[12], [x26]\n"
- "ld1 { v28.b }[12], [x25]\n"
- "ld1 { v25.b }[12], [x24]\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[12], [x27]\n"
+ "ld1 { v28.b }[12], [x26]\n"
+ "ld1 { v25.b }[12], [x25]\n"
"b 50f\n"
"44:" // Height 3: Multiply loop: Ragged operand read: partial_2_8
- "tbz x27, #1, 45f\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v28.h }[4], [x25], #0x2\n"
- "ld1 { v25.h }[4], [x24], #0x2\n"
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[10], [x26]\n"
- "ld1 { v28.b }[10], [x25]\n"
- "ld1 { v25.b }[10], [x24]\n"
+ "tbz x28, #1, 45f\n"
+ "ld1 { v31.h }[4], [x27], #0x2\n"
+ "ld1 { v28.h }[4], [x26], #0x2\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[10], [x27]\n"
+ "ld1 { v28.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
"b 50f\n"
"45:" // Height 3: Multiply loop: Ragged operand read: partial_1_8
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[8], [x26]\n"
- "ld1 { v28.b }[8], [x25]\n"
- "ld1 { v25.b }[8], [x24]\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[8], [x27]\n"
+ "ld1 { v28.b }[8], [x26]\n"
+ "ld1 { v25.b }[8], [x25]\n"
"b 50f\n"
"46:" // Height 3: Multiply loop: Ragged operand read: partial_4_0
- "tbz x27, #2, 48f\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s25, [x24], #0x4\n"
- "tbz x27, #1, 47f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v25.b }[6], [x24]\n"
+ "tbz x28, #2, 48f\n"
+ "ldr s31, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "tbz x28, #1, 47f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
"b 50f\n"
"47:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v25.b }[4], [x24]\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v25.b }[4], [x25]\n"
"b 50f\n"
"48:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 49f\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h25, [x24], #0x2\n"
- "tbz x27, #0, 50f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v25.b }[2], [x24]\n"
+ "tbz x28, #1, 49f\n"
+ "ldr h31, [x27], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h25, [x25], #0x2\n"
+ "tbz x28, #0, 50f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
"b 50f\n"
"49:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b31, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b25, [x24, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
"50:" // Height 3: Multiply loop: Ragged operand read: Done
"uadalp v1.8h, v31.16b\n"
"uadalp v30.8h, v28.16b\n"
"uadalp v27.8h, v25.16b\n"
"51:" // Height 3: Multiply loop: No odd multiplies
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x21\n"
"bne 36b\n"
"uadalp v0.4s, v1.8h\n"
"uadalp v29.4s, v30.8h\n"
- "addp v0.4s, v0.4s, v29.4s\n"
"uadalp v26.4s, v27.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "addp v26.4s, v26.4s, v26.4s\n"
"addp v0.4s, v0.4s, v0.4s\n"
"addp v26.4s, v26.4s, v26.4s\n"
"mul v0.4s, v0.4s, v2.4s\n"
"str d0, [%x[out_ptr]], #0x8\n"
- "addp v26.4s, v26.4s, v26.4s\n"
"mul v26.4s, v26.4s, v2.4s\n"
"str s26, [%x[out_ptr]], #0x4\n"
"b 104f\n"
"52:" // Height 4
"movi v1.8h, #0x0\n"
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "mov x9, #0x0\n"
"movi v0.4s, #0x0\n"
- "mov x28, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
"movi v30.8h, #0x0\n"
"movi v29.4s, #0x0\n"
+ "mov x9, #0x0\n"
"movi v27.8h, #0x0\n"
"movi v26.4s, #0x0\n"
"movi v24.8h, #0x0\n"
"movi v23.4s, #0x0\n"
"53:" // Height 4: String loop
- "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w27, [x19, x28, LSL #0x2]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
"tbz %x[flags], #3, 54f\n"
- "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x19, x19, %x[input_offset], LSL #3\n"
- "ldr x26, [x19, #0x0]\n"
- "ldr x25, [x19, #0x8]\n"
- "ldr x24, [x19, #0x10]\n"
- "ldr x23, [x19, #0x18]\n"
- "cbnz x28, 55f\n"
- "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, %x[input_offset], LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x9, 55f\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
"b 55f\n"
"54:" // Height 4: setup direct input
- "mov x26, %x[input_ptr]\n"
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, %x[input_offset]\n"
"add x25, x26, %x[input_offset]\n"
"add x24, x25, %x[input_offset]\n"
- "add x23, x24, %x[input_offset]\n"
"55:" // Height 4: input setup done
- "cmp x27, #0x10\n"
+ "cmp x28, #0x10\n"
"blt 59f\n"
- "cmp x27, #0x20\n"
+ "cmp x28, #0x20\n"
"blt 58f\n"
"56:" // Height 4: Multiply loop: Main loop head
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "ldr q22, [x23, #0x0]\n"
- "cmp x9, #0x7e\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "cmp x10, #0x7e\n"
+ "add x27, x27, #0x10\n"
+ "ldr q25, [x25, #0x0]\n"
+ "ldr q22, [x24, #0x0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
"blt 57f\n"
"uadalp v0.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
+ "mov x10, #0x0\n"
"uadalp v29.4s, v30.8h\n"
"movi v30.8h, #0x0\n"
"uadalp v26.4s, v27.8h\n"
"movi v27.8h, #0x0\n"
"uadalp v23.4s, v24.8h\n"
"movi v24.8h, #0x0\n"
- "mov x9, #0x0\n"
"57:" // Height 4: Multiply loop: unique 4: no collapse
+ "sub x28, x28, #0x10\n"
+ "cmp x28, #0x20\n"
"uadalp v1.8h, v31.16b\n"
"uadalp v30.8h, v28.16b\n"
"uadalp v27.8h, v25.16b\n"
"uadalp v24.8h, v22.16b\n"
- "add x9, x9, #0x1\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
+ "add x10, x10, #0x1\n"
"bge 56b\n"
"58:" // Height 4: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "ldr q22, [x23, #0x0]\n"
- "add x26, x26, #0x10\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "sub x28, x28, #0x10\n"
"uadalp v1.8h, v31.16b\n"
+ "ldr q25, [x25, #0x0]\n"
+ "ldr q22, [x24, #0x0]\n"
"uadalp v30.8h, v28.16b\n"
"uadalp v27.8h, v25.16b\n"
"uadalp v24.8h, v22.16b\n"
+ "add x27, x27, #0x10\n"
+ "add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
- "add x23, x23, #0x10\n"
"59:" // Height 4: Multiply loop: Main loop skip
- "cbz x27, 68f\n"
- "tbz x27, #3, 63f\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d25, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "tbz x27, #2, 61f\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v25.s }[2], [x24], #0x4\n"
- "ld1 { v22.s }[2], [x23], #0x4\n"
- "tbz x27, #1, 60f\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v28.h }[6], [x25], #0x2\n"
- "ld1 { v25.h }[6], [x24], #0x2\n"
- "ld1 { v22.h }[6], [x23], #0x2\n"
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[14], [x26]\n"
- "ld1 { v28.b }[14], [x25]\n"
- "ld1 { v25.b }[14], [x24]\n"
- "ld1 { v22.b }[14], [x23]\n"
+ "cbz x28, 68f\n"
+ "tbz x28, #3, 63f\n"
+ "ldr d31, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d22, [x24], #0x8\n"
+ "tbz x28, #2, 61f\n"
+ "ld1 { v31.s }[2], [x27], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v22.s }[2], [x24], #0x4\n"
+ "tbz x28, #1, 60f\n"
+ "ld1 { v31.h }[6], [x27], #0x2\n"
+ "ld1 { v28.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
+ "ld1 { v22.h }[6], [x24], #0x2\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[14], [x27]\n"
+ "ld1 { v28.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
+ "ld1 { v22.b }[14], [x24]\n"
"b 67f\n"
"60:" // Height 4: Multiply loop: Ragged operand read: partial_1_12
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[12], [x26]\n"
- "ld1 { v28.b }[12], [x25]\n"
- "ld1 { v25.b }[12], [x24]\n"
- "ld1 { v22.b }[12], [x23]\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[12], [x27]\n"
+ "ld1 { v28.b }[12], [x26]\n"
+ "ld1 { v25.b }[12], [x25]\n"
+ "ld1 { v22.b }[12], [x24]\n"
"b 67f\n"
"61:" // Height 4: Multiply loop: Ragged operand read: partial_2_8
- "tbz x27, #1, 62f\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v28.h }[4], [x25], #0x2\n"
- "ld1 { v25.h }[4], [x24], #0x2\n"
- "ld1 { v22.h }[4], [x23], #0x2\n"
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[10], [x26]\n"
- "ld1 { v28.b }[10], [x25]\n"
- "ld1 { v25.b }[10], [x24]\n"
- "ld1 { v22.b }[10], [x23]\n"
+ "tbz x28, #1, 62f\n"
+ "ld1 { v31.h }[4], [x27], #0x2\n"
+ "ld1 { v28.h }[4], [x26], #0x2\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
+ "ld1 { v22.h }[4], [x24], #0x2\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[10], [x27]\n"
+ "ld1 { v28.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
+ "ld1 { v22.b }[10], [x24]\n"
"b 67f\n"
"62:" // Height 4: Multiply loop: Ragged operand read: partial_1_8
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[8], [x26]\n"
- "ld1 { v28.b }[8], [x25]\n"
- "ld1 { v25.b }[8], [x24]\n"
- "ld1 { v22.b }[8], [x23]\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[8], [x27]\n"
+ "ld1 { v28.b }[8], [x26]\n"
+ "ld1 { v25.b }[8], [x25]\n"
+ "ld1 { v22.b }[8], [x24]\n"
"b 67f\n"
"63:" // Height 4: Multiply loop: Ragged operand read: partial_4_0
- "tbz x27, #2, 65f\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s25, [x24], #0x4\n"
- "ldr s22, [x23], #0x4\n"
- "tbz x27, #1, 64f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
- "ld1 { v22.h }[2], [x23], #0x2\n"
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v25.b }[6], [x24]\n"
- "ld1 { v22.b }[6], [x23]\n"
+ "tbz x28, #2, 65f\n"
+ "ldr s31, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s22, [x24], #0x4\n"
+ "tbz x28, #1, 64f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v22.b }[6], [x24]\n"
"b 67f\n"
"64:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v25.b }[4], [x24]\n"
- "ld1 { v22.b }[4], [x23]\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v22.b }[4], [x24]\n"
"b 67f\n"
"65:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 66f\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h25, [x24], #0x2\n"
- "ldr h22, [x23], #0x2\n"
- "tbz x27, #0, 67f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v25.b }[2], [x24]\n"
- "ld1 { v22.b }[2], [x23]\n"
+ "tbz x28, #1, 66f\n"
+ "ldr h31, [x27], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h25, [x25], #0x2\n"
+ "ldr h22, [x24], #0x2\n"
+ "tbz x28, #0, 67f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v22.b }[2], [x24]\n"
"b 67f\n"
"66:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b31, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b25, [x24, #0x0]\n"
- "ldr b22, [x23, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
+ "ldr b22, [x24, #0x0]\n"
"67:" // Height 4: Multiply loop: Ragged operand read: Done
"uadalp v1.8h, v31.16b\n"
"uadalp v30.8h, v28.16b\n"
"uadalp v27.8h, v25.16b\n"
"uadalp v24.8h, v22.16b\n"
"68:" // Height 4: Multiply loop: No odd multiplies
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x21\n"
"bne 53b\n"
"uadalp v0.4s, v1.8h\n"
"uadalp v29.4s, v30.8h\n"
- "addp v0.4s, v0.4s, v29.4s\n"
"uadalp v26.4s, v27.8h\n"
"uadalp v23.4s, v24.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
"addp v29.4s, v26.4s, v23.4s\n"
"addp v0.4s, v0.4s, v29.4s\n"
"mul v0.4s, v0.4s, v2.4s\n"
@@ -661,12 +661,12 @@ void row_sums_indirect(
"b 104f\n"
"69:" // Height 5
"movi v1.8h, #0x0\n"
- "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "mov x9, #0x0\n"
"movi v0.4s, #0x0\n"
- "mov x28, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
"movi v30.8h, #0x0\n"
"movi v29.4s, #0x0\n"
+ "mov x9, #0x0\n"
"movi v27.8h, #0x0\n"
"movi v26.4s, #0x0\n"
"movi v24.8h, #0x0\n"
@@ -674,50 +674,51 @@ void row_sums_indirect(
"movi v21.8h, #0x0\n"
"movi v20.4s, #0x0\n"
"70:" // Height 5: String loop
- "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w27, [x19, x28, LSL #0x2]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
"tbz %x[flags], #3, 71f\n"
- "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x19, x19, %x[input_offset], LSL #3\n"
- "ldr x26, [x19, #0x0]\n"
- "ldr x25, [x19, #0x8]\n"
- "ldr x24, [x19, #0x10]\n"
- "ldr x23, [x19, #0x18]\n"
- "ldr x22, [x19, #0x20]\n"
- "cbnz x28, 72f\n"
- "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, %x[input_offset], LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "cbnz x9, 72f\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
"b 72f\n"
"71:" // Height 5: setup direct input
- "mov x26, %x[input_ptr]\n"
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, %x[input_offset]\n"
"add x25, x26, %x[input_offset]\n"
"add x24, x25, %x[input_offset]\n"
"add x23, x24, %x[input_offset]\n"
- "add x22, x23, %x[input_offset]\n"
"72:" // Height 5: input setup done
- "cmp x27, #0x10\n"
+ "cmp x28, #0x10\n"
"blt 76f\n"
- "cmp x27, #0x20\n"
+ "cmp x28, #0x20\n"
"blt 75f\n"
"73:" // Height 5: Multiply loop: Main loop head
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "ldr q22, [x23, #0x0]\n"
- "ldr q19, [x22, #0x0]\n"
- "cmp x9, #0x7e\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "cmp x10, #0x7e\n"
+ "add x27, x27, #0x10\n"
+ "ldr q25, [x25, #0x0]\n"
+ "ldr q22, [x24, #0x0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
+ "ldr q19, [x23, #0x0]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
"blt 74f\n"
"uadalp v0.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
+ "mov x10, #0x0\n"
"uadalp v29.4s, v30.8h\n"
"movi v30.8h, #0x0\n"
"uadalp v26.4s, v27.8h\n"
@@ -726,139 +727,138 @@ void row_sums_indirect(
"movi v24.8h, #0x0\n"
"uadalp v20.4s, v21.8h\n"
"movi v21.8h, #0x0\n"
- "mov x9, #0x0\n"
"74:" // Height 5: Multiply loop: unique 5: no collapse
+ "sub x28, x28, #0x10\n"
+ "cmp x28, #0x20\n"
"uadalp v1.8h, v31.16b\n"
"uadalp v30.8h, v28.16b\n"
"uadalp v27.8h, v25.16b\n"
"uadalp v24.8h, v22.16b\n"
+ "add x10, x10, #0x1\n"
"uadalp v21.8h, v19.16b\n"
- "add x9, x9, #0x1\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
"bge 73b\n"
"75:" // Height 5: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "ldr q22, [x23, #0x0]\n"
- "ldr q19, [x22, #0x0]\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "sub x28, x28, #0x10\n"
"uadalp v1.8h, v31.16b\n"
+ "ldr q25, [x25, #0x0]\n"
+ "ldr q22, [x24, #0x0]\n"
"uadalp v30.8h, v28.16b\n"
"uadalp v27.8h, v25.16b\n"
+ "ldr q19, [x23, #0x0]\n"
"uadalp v24.8h, v22.16b\n"
"uadalp v21.8h, v19.16b\n"
+ "add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
"76:" // Height 5: Multiply loop: Main loop skip
- "cbz x27, 85f\n"
- "tbz x27, #3, 80f\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d25, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "tbz x27, #2, 78f\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v25.s }[2], [x24], #0x4\n"
- "ld1 { v22.s }[2], [x23], #0x4\n"
- "ld1 { v19.s }[2], [x22], #0x4\n"
- "tbz x27, #1, 77f\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v28.h }[6], [x25], #0x2\n"
- "ld1 { v25.h }[6], [x24], #0x2\n"
- "ld1 { v22.h }[6], [x23], #0x2\n"
- "ld1 { v19.h }[6], [x22], #0x2\n"
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[14], [x26]\n"
- "ld1 { v28.b }[14], [x25]\n"
- "ld1 { v25.b }[14], [x24]\n"
- "ld1 { v22.b }[14], [x23]\n"
- "ld1 { v19.b }[14], [x22]\n"
+ "cbz x28, 85f\n"
+ "tbz x28, #3, 80f\n"
+ "ldr d31, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d22, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x28, #2, 78f\n"
+ "ld1 { v31.s }[2], [x27], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v22.s }[2], [x24], #0x4\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n"
+ "tbz x28, #1, 77f\n"
+ "ld1 { v31.h }[6], [x27], #0x2\n"
+ "ld1 { v28.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
+ "ld1 { v22.h }[6], [x24], #0x2\n"
+ "ld1 { v19.h }[6], [x23], #0x2\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[14], [x27]\n"
+ "ld1 { v28.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
+ "ld1 { v22.b }[14], [x24]\n"
+ "ld1 { v19.b }[14], [x23]\n"
"b 84f\n"
"77:" // Height 5: Multiply loop: Ragged operand read: partial_1_12
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[12], [x26]\n"
- "ld1 { v28.b }[12], [x25]\n"
- "ld1 { v25.b }[12], [x24]\n"
- "ld1 { v22.b }[12], [x23]\n"
- "ld1 { v19.b }[12], [x22]\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[12], [x27]\n"
+ "ld1 { v28.b }[12], [x26]\n"
+ "ld1 { v25.b }[12], [x25]\n"
+ "ld1 { v22.b }[12], [x24]\n"
+ "ld1 { v19.b }[12], [x23]\n"
"b 84f\n"
"78:" // Height 5: Multiply loop: Ragged operand read: partial_2_8
- "tbz x27, #1, 79f\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v28.h }[4], [x25], #0x2\n"
- "ld1 { v25.h }[4], [x24], #0x2\n"
- "ld1 { v22.h }[4], [x23], #0x2\n"
- "ld1 { v19.h }[4], [x22], #0x2\n"
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[10], [x26]\n"
- "ld1 { v28.b }[10], [x25]\n"
- "ld1 { v25.b }[10], [x24]\n"
- "ld1 { v22.b }[10], [x23]\n"
- "ld1 { v19.b }[10], [x22]\n"
+ "tbz x28, #1, 79f\n"
+ "ld1 { v31.h }[4], [x27], #0x2\n"
+ "ld1 { v28.h }[4], [x26], #0x2\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
+ "ld1 { v22.h }[4], [x24], #0x2\n"
+ "ld1 { v19.h }[4], [x23], #0x2\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[10], [x27]\n"
+ "ld1 { v28.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
+ "ld1 { v22.b }[10], [x24]\n"
+ "ld1 { v19.b }[10], [x23]\n"
"b 84f\n"
"79:" // Height 5: Multiply loop: Ragged operand read: partial_1_8
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[8], [x26]\n"
- "ld1 { v28.b }[8], [x25]\n"
- "ld1 { v25.b }[8], [x24]\n"
- "ld1 { v22.b }[8], [x23]\n"
- "ld1 { v19.b }[8], [x22]\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[8], [x27]\n"
+ "ld1 { v28.b }[8], [x26]\n"
+ "ld1 { v25.b }[8], [x25]\n"
+ "ld1 { v22.b }[8], [x24]\n"
+ "ld1 { v19.b }[8], [x23]\n"
"b 84f\n"
"80:" // Height 5: Multiply loop: Ragged operand read: partial_4_0
- "tbz x27, #2, 82f\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s25, [x24], #0x4\n"
- "ldr s22, [x23], #0x4\n"
- "ldr s19, [x22], #0x4\n"
- "tbz x27, #1, 81f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
- "ld1 { v22.h }[2], [x23], #0x2\n"
- "ld1 { v19.h }[2], [x22], #0x2\n"
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v25.b }[6], [x24]\n"
- "ld1 { v22.b }[6], [x23]\n"
- "ld1 { v19.b }[6], [x22]\n"
+ "tbz x28, #2, 82f\n"
+ "ldr s31, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "tbz x28, #1, 81f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v19.h }[2], [x23], #0x2\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v19.b }[6], [x23]\n"
"b 84f\n"
"81:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v25.b }[4], [x24]\n"
- "ld1 { v22.b }[4], [x23]\n"
- "ld1 { v19.b }[4], [x22]\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v19.b }[4], [x23]\n"
"b 84f\n"
"82:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 83f\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h25, [x24], #0x2\n"
- "ldr h22, [x23], #0x2\n"
- "ldr h19, [x22], #0x2\n"
- "tbz x27, #0, 84f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v25.b }[2], [x24]\n"
- "ld1 { v22.b }[2], [x23]\n"
- "ld1 { v19.b }[2], [x22]\n"
+ "tbz x28, #1, 83f\n"
+ "ldr h31, [x27], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h25, [x25], #0x2\n"
+ "ldr h22, [x24], #0x2\n"
+ "ldr h19, [x23], #0x2\n"
+ "tbz x28, #0, 84f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v19.b }[2], [x23]\n"
"b 84f\n"
"83:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
- "ldr b31, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b25, [x24, #0x0]\n"
- "ldr b22, [x23, #0x0]\n"
- "ldr b19, [x22, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
+ "ldr b22, [x24, #0x0]\n"
+ "ldr b19, [x23, #0x0]\n"
"84:" // Height 5: Multiply loop: Ragged operand read: Done
"uadalp v1.8h, v31.16b\n"
"uadalp v30.8h, v28.16b\n"
@@ -866,32 +866,32 @@ void row_sums_indirect(
"uadalp v24.8h, v22.16b\n"
"uadalp v21.8h, v19.16b\n"
"85:" // Height 5: Multiply loop: No odd multiplies
- "add x28, x28, #0x1\n"
- "cmp x28, x20\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x21\n"
"bne 70b\n"
"uadalp v0.4s, v1.8h\n"
"uadalp v29.4s, v30.8h\n"
- "addp v0.4s, v0.4s, v29.4s\n"
"uadalp v26.4s, v27.8h\n"
"uadalp v23.4s, v24.8h\n"
- "addp v29.4s, v26.4s, v23.4s\n"
"uadalp v20.4s, v21.8h\n"
"addp v0.4s, v0.4s, v29.4s\n"
+ "addp v29.4s, v26.4s, v23.4s\n"
+ "addp v20.4s, v20.4s, v20.4s\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
"addp v20.4s, v20.4s, v20.4s\n"
"mul v0.4s, v0.4s, v2.4s\n"
"st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
- "addp v20.4s, v20.4s, v20.4s\n"
"mul v20.4s, v20.4s, v2.4s\n"
"str s20, [%x[out_ptr]], #0x4\n"
"b 104f\n"
"86:" // Height 6
"movi v1.8h, #0x0\n"
- "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "mov x9, #0x0\n"
"movi v0.4s, #0x0\n"
- "mov x28, #0x0\n"
+ "mov x10, #0x0\n"
+ "ldr w22, [%x[args_ptr], %[offsetof_num_strings]]\n"
"movi v30.8h, #0x0\n"
"movi v29.4s, #0x0\n"
+ "mov x9, #0x0\n"
"movi v27.8h, #0x0\n"
"movi v26.4s, #0x0\n"
"movi v24.8h, #0x0\n"
@@ -901,55 +901,56 @@ void row_sums_indirect(
"movi v18.8h, #0x0\n"
"movi v17.4s, #0x0\n"
"87:" // Height 6: String loop
- "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w27, [x19, x28, LSL #0x2]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
"tbz %x[flags], #3, 88f\n"
- "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x19, x19, %x[input_offset], LSL #3\n"
- "ldr x26, [x19, #0x0]\n"
- "ldr x25, [x19, #0x8]\n"
- "ldr x24, [x19, #0x10]\n"
- "ldr x23, [x19, #0x18]\n"
- "ldr x22, [x19, #0x20]\n"
- "ldr x20, [x19, #0x28]\n"
- "cbnz x28, 89f\n"
- "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x25, x25, x19\n"
- "add x24, x24, x19\n"
- "add x23, x23, x19\n"
- "add x22, x22, x19\n"
- "add x20, x20, x19\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, %x[input_offset], LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "ldr x21, [x20, #0x28]\n"
+ "cbnz x9, 89f\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x20\n"
+ "add x26, x26, x20\n"
+ "add x25, x25, x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add x21, x21, x20\n"
"b 89f\n"
"88:" // Height 6: setup direct input
- "mov x26, %x[input_ptr]\n"
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, %x[input_offset]\n"
"add x25, x26, %x[input_offset]\n"
"add x24, x25, %x[input_offset]\n"
"add x23, x24, %x[input_offset]\n"
- "add x22, x23, %x[input_offset]\n"
- "add x20, x22, %x[input_offset]\n"
+ "add x21, x23, %x[input_offset]\n"
"89:" // Height 6: input setup done
- "cmp x27, #0x10\n"
+ "cmp x28, #0x10\n"
"blt 93f\n"
- "cmp x27, #0x20\n"
+ "cmp x28, #0x20\n"
"blt 92f\n"
"90:" // Height 6: Multiply loop: Main loop head
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "ldr q22, [x23, #0x0]\n"
- "ldr q19, [x22, #0x0]\n"
- "ldr q16, [x20, #0x0]\n"
- "cmp x9, #0x7e\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "cmp x10, #0x7e\n"
+ "add x27, x27, #0x10\n"
+ "ldr q25, [x25, #0x0]\n"
+ "ldr q22, [x24, #0x0]\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
+ "ldr q19, [x23, #0x0]\n"
+ "ldr q16, [x21, #0x0]\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
"blt 91f\n"
"uadalp v0.4s, v1.8h\n"
"movi v1.8h, #0x0\n"
+ "mov x10, #0x0\n"
"uadalp v29.4s, v30.8h\n"
"movi v30.8h, #0x0\n"
"uadalp v26.4s, v27.8h\n"
@@ -960,158 +961,157 @@ void row_sums_indirect(
"movi v21.8h, #0x0\n"
"uadalp v17.4s, v18.8h\n"
"movi v18.8h, #0x0\n"
- "mov x9, #0x0\n"
"91:" // Height 6: Multiply loop: unique 6: no collapse
+ "sub x28, x28, #0x10\n"
+ "cmp x28, #0x20\n"
"uadalp v1.8h, v31.16b\n"
"uadalp v30.8h, v28.16b\n"
"uadalp v27.8h, v25.16b\n"
"uadalp v24.8h, v22.16b\n"
+ "add x10, x10, #0x1\n"
"uadalp v21.8h, v19.16b\n"
"uadalp v18.8h, v16.16b\n"
- "add x9, x9, #0x1\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
"bge 90b\n"
"92:" // Height 6: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q31, [x26, #0x0]\n"
- "ldr q28, [x25, #0x0]\n"
- "ldr q25, [x24, #0x0]\n"
- "ldr q22, [x23, #0x0]\n"
- "ldr q19, [x22, #0x0]\n"
- "ldr q16, [x20, #0x0]\n"
+ "ldr q31, [x27, #0x0]\n"
+ "ldr q28, [x26, #0x0]\n"
+ "sub x28, x28, #0x10\n"
"uadalp v1.8h, v31.16b\n"
+ "ldr q25, [x25, #0x0]\n"
+ "ldr q22, [x24, #0x0]\n"
"uadalp v30.8h, v28.16b\n"
"uadalp v27.8h, v25.16b\n"
+ "ldr q19, [x23, #0x0]\n"
+ "ldr q16, [x21, #0x0]\n"
"uadalp v24.8h, v22.16b\n"
"uadalp v21.8h, v19.16b\n"
"uadalp v18.8h, v16.16b\n"
+ "add x27, x27, #0x10\n"
"add x26, x26, #0x10\n"
"add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"add x23, x23, #0x10\n"
- "add x22, x22, #0x10\n"
- "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
"93:" // Height 6: Multiply loop: Main loop skip
- "cbz x27, 102f\n"
- "tbz x27, #3, 97f\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
- "ldr d25, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
- "ldr d19, [x22], #0x8\n"
- "ldr d16, [x20], #0x8\n"
- "tbz x27, #2, 95f\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v28.s }[2], [x25], #0x4\n"
- "ld1 { v25.s }[2], [x24], #0x4\n"
- "ld1 { v22.s }[2], [x23], #0x4\n"
- "ld1 { v19.s }[2], [x22], #0x4\n"
- "ld1 { v16.s }[2], [x20], #0x4\n"
- "tbz x27, #1, 94f\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v28.h }[6], [x25], #0x2\n"
- "ld1 { v25.h }[6], [x24], #0x2\n"
- "ld1 { v22.h }[6], [x23], #0x2\n"
- "ld1 { v19.h }[6], [x22], #0x2\n"
- "ld1 { v16.h }[6], [x20], #0x2\n"
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[14], [x26]\n"
- "ld1 { v28.b }[14], [x25]\n"
- "ld1 { v25.b }[14], [x24]\n"
- "ld1 { v22.b }[14], [x23]\n"
- "ld1 { v19.b }[14], [x22]\n"
- "ld1 { v16.b }[14], [x20]\n"
+ "cbz x28, 102f\n"
+ "tbz x28, #3, 97f\n"
+ "ldr d31, [x27], #0x8\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d22, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
+ "tbz x28, #2, 95f\n"
+ "ld1 { v31.s }[2], [x27], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v22.s }[2], [x24], #0x4\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n"
+ "ld1 { v16.s }[2], [x21], #0x4\n"
+ "tbz x28, #1, 94f\n"
+ "ld1 { v31.h }[6], [x27], #0x2\n"
+ "ld1 { v28.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
+ "ld1 { v22.h }[6], [x24], #0x2\n"
+ "ld1 { v19.h }[6], [x23], #0x2\n"
+ "ld1 { v16.h }[6], [x21], #0x2\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[14], [x27]\n"
+ "ld1 { v28.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
+ "ld1 { v22.b }[14], [x24]\n"
+ "ld1 { v19.b }[14], [x23]\n"
+ "ld1 { v16.b }[14], [x21]\n"
"b 101f\n"
"94:" // Height 6: Multiply loop: Ragged operand read: partial_1_12
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[12], [x26]\n"
- "ld1 { v28.b }[12], [x25]\n"
- "ld1 { v25.b }[12], [x24]\n"
- "ld1 { v22.b }[12], [x23]\n"
- "ld1 { v19.b }[12], [x22]\n"
- "ld1 { v16.b }[12], [x20]\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[12], [x27]\n"
+ "ld1 { v28.b }[12], [x26]\n"
+ "ld1 { v25.b }[12], [x25]\n"
+ "ld1 { v22.b }[12], [x24]\n"
+ "ld1 { v19.b }[12], [x23]\n"
+ "ld1 { v16.b }[12], [x21]\n"
"b 101f\n"
"95:" // Height 6: Multiply loop: Ragged operand read: partial_2_8
- "tbz x27, #1, 96f\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v28.h }[4], [x25], #0x2\n"
- "ld1 { v25.h }[4], [x24], #0x2\n"
- "ld1 { v22.h }[4], [x23], #0x2\n"
- "ld1 { v19.h }[4], [x22], #0x2\n"
- "ld1 { v16.h }[4], [x20], #0x2\n"
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[10], [x26]\n"
- "ld1 { v28.b }[10], [x25]\n"
- "ld1 { v25.b }[10], [x24]\n"
- "ld1 { v22.b }[10], [x23]\n"
- "ld1 { v19.b }[10], [x22]\n"
- "ld1 { v16.b }[10], [x20]\n"
+ "tbz x28, #1, 96f\n"
+ "ld1 { v31.h }[4], [x27], #0x2\n"
+ "ld1 { v28.h }[4], [x26], #0x2\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
+ "ld1 { v22.h }[4], [x24], #0x2\n"
+ "ld1 { v19.h }[4], [x23], #0x2\n"
+ "ld1 { v16.h }[4], [x21], #0x2\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[10], [x27]\n"
+ "ld1 { v28.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
+ "ld1 { v22.b }[10], [x24]\n"
+ "ld1 { v19.b }[10], [x23]\n"
+ "ld1 { v16.b }[10], [x21]\n"
"b 101f\n"
"96:" // Height 6: Multiply loop: Ragged operand read: partial_1_8
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[8], [x26]\n"
- "ld1 { v28.b }[8], [x25]\n"
- "ld1 { v25.b }[8], [x24]\n"
- "ld1 { v22.b }[8], [x23]\n"
- "ld1 { v19.b }[8], [x22]\n"
- "ld1 { v16.b }[8], [x20]\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[8], [x27]\n"
+ "ld1 { v28.b }[8], [x26]\n"
+ "ld1 { v25.b }[8], [x25]\n"
+ "ld1 { v22.b }[8], [x24]\n"
+ "ld1 { v19.b }[8], [x23]\n"
+ "ld1 { v16.b }[8], [x21]\n"
"b 101f\n"
"97:" // Height 6: Multiply loop: Ragged operand read: partial_4_0
- "tbz x27, #2, 99f\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s25, [x24], #0x4\n"
- "ldr s22, [x23], #0x4\n"
- "ldr s19, [x22], #0x4\n"
- "ldr s16, [x20], #0x4\n"
- "tbz x27, #1, 98f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
- "ld1 { v22.h }[2], [x23], #0x2\n"
- "ld1 { v19.h }[2], [x22], #0x2\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v25.b }[6], [x24]\n"
- "ld1 { v22.b }[6], [x23]\n"
- "ld1 { v19.b }[6], [x22]\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "tbz x28, #2, 99f\n"
+ "ldr s31, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "ldr s16, [x21], #0x4\n"
+ "tbz x28, #1, 98f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v19.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x21], #0x2\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v19.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x21]\n"
"b 101f\n"
"98:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v25.b }[4], [x24]\n"
- "ld1 { v22.b }[4], [x23]\n"
- "ld1 { v19.b }[4], [x22]\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v19.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x21]\n"
"b 101f\n"
"99:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
- "tbz x27, #1, 100f\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h25, [x24], #0x2\n"
- "ldr h22, [x23], #0x2\n"
- "ldr h19, [x22], #0x2\n"
- "ldr h16, [x20], #0x2\n"
- "tbz x27, #0, 101f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v25.b }[2], [x24]\n"
- "ld1 { v22.b }[2], [x23]\n"
- "ld1 { v19.b }[2], [x22]\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x28, #1, 100f\n"
+ "ldr h31, [x27], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h25, [x25], #0x2\n"
+ "ldr h22, [x24], #0x2\n"
+ "ldr h19, [x23], #0x2\n"
+ "ldr h16, [x21], #0x2\n"
+ "tbz x28, #0, 101f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v19.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x21]\n"
"b 101f\n"
"100:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
- "ldr b31, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b25, [x24, #0x0]\n"
- "ldr b22, [x23, #0x0]\n"
- "ldr b19, [x22, #0x0]\n"
- "ldr b16, [x20, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
+ "ldr b22, [x24, #0x0]\n"
+ "ldr b19, [x23, #0x0]\n"
+ "ldr b16, [x21, #0x0]\n"
"101:" // Height 6: Multiply loop: Ragged operand read: Done
"uadalp v1.8h, v31.16b\n"
"uadalp v30.8h, v28.16b\n"
@@ -1120,23 +1120,23 @@ void row_sums_indirect(
"uadalp v21.8h, v19.16b\n"
"uadalp v18.8h, v16.16b\n"
"102:" // Height 6: Multiply loop: No odd multiplies
- "add x28, x28, #0x1\n"
- "cmp x28, x21\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x22\n"
"bne 87b\n"
"uadalp v0.4s, v1.8h\n"
"uadalp v29.4s, v30.8h\n"
- "addp v0.4s, v0.4s, v29.4s\n"
+ "subs %x[M], %x[M], #0x6\n"
"uadalp v26.4s, v27.8h\n"
"uadalp v23.4s, v24.8h\n"
- "addp v29.4s, v26.4s, v23.4s\n"
"uadalp v20.4s, v21.8h\n"
"uadalp v17.4s, v18.8h\n"
"addp v0.4s, v0.4s, v29.4s\n"
- "subs %x[M], %x[M], #0x6\n"
+ "addp v29.4s, v26.4s, v23.4s\n"
"addp v20.4s, v20.4s, v17.4s\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "addp v20.4s, v20.4s, v20.4s\n"
"mul v0.4s, v0.4s, v2.4s\n"
"st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
- "addp v20.4s, v20.4s, v20.4s\n"
"mul v20.4s, v20.4s, v2.4s\n"
"str d20, [%x[out_ptr]], #0x8\n"
"beq 104f\n"
@@ -1144,14 +1144,14 @@ void row_sums_indirect(
"add %x[input_offset], %x[input_offset], #0x6\n"
"b 1b\n"
"103:" // Update direct input
- "mov x19, #0x6\n"
- "madd %x[input_ptr], x19, %x[input_offset], %x[input_ptr]\n"
+ "mov x20, #0x6\n"
+ "madd %x[input_ptr], x20, %x[input_offset], %x[input_ptr]\n"
"b 1b\n"
"104:" // Exit
- : [M] "+r" (M), [input_offset] "+r" (input_offset), [input_ptr] "+r" (input_ptr), [out_ptr] "+r" (out_ptr)
+ : [M] "+&r" (M), [input_offset] "+&r" (input_offset), [input_ptr] "+&r" (input_ptr), [out_ptr] "+&r" (out_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
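
The renumbered inline assembly above computes per-row byte sums: `uadalp` accumulates pairs of uint8 lanes into 16-bit lanes, the `cmp x10, #0x7e` guard forces a collapse into the 32-bit accumulators before the halfword lanes can overflow, and the `tbz` chains split a ragged row length into 8-, 4-, 2- and 1-byte tail loads. As a reference for what the kernel computes, here is a minimal scalar sketch in C++; the function name and signature are illustrative, not part of the library.

#include <cstddef>
#include <cstdint>

// Scalar model of one row of the kernel above (illustrative only): sum the
// row's uint8_t elements into a 32-bit accumulator and scale by the
// quantization multiplier, mirroring the uadalp 16b->8h->4s chains and the
// final "mul v0.4s, v0.4s, v2.4s" / store sequence.
static int32_t row_sum_ref(const uint8_t *row, size_t len, int32_t multiplier)
{
    int32_t sum = 0;
    for (size_t i = 0; i < len; i++) {
        sum += row[i]; // widening add, as uadalp does pairwise
    }
    return sum * multiplier;
}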
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
index 4669be9993..a9cbf4ec8d 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2018-2020, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,9 +63,14 @@ public:
ConvolutionInterleave<height, block, VLType::None>(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
}
+ bool PrepareB_supports_transpose() const {
+ return false;
+ }
+
template<typename TIn>
void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
- const int xmax, const int k0, const int kmax) const {
+ const int xmax, const int k0, const int kmax, bool transposed) const {
+ assert(!transposed);
Transform<width, block, true>(out, in, stride, x0, xmax, k0, kmax);
}
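
The widened `PrepareB()` interface pairs with the new `PrepareB_supports_transpose()` query, so a caller holding B in transposed form can check the strategy before packing. A hedged sketch of such a caller (the wrapper name is hypothetical; only the two member functions come from the headers in this patch):

#include <cassert>

template<typename Strategy, typename TOperand, typename TIn>
void prepare_b_checked(const Strategy &strat, TOperand *out, const TIn *in,
                       int stride, int x0, int xmax, int k0, int kmax,
                       bool b_is_transposed)
{
    // Strategies that return false accept B only in its natural layout,
    // which is why the non-TRB variants assert(!transposed).
    assert(!b_is_transposed || strat.PrepareB_supports_transpose());
    strat.PrepareB(out, in, stride, x0, xmax, k0, kmax, b_is_transposed);
}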
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed_trB.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed_trB.hpp
new file mode 100644
index 0000000000..1db716455f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed_trB.hpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018-2020, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "convolver.hpp"
+#include "mergeresults.hpp"
+#include "transform.hpp"
+#include "interleave_indirect.hpp"
+
+namespace arm_gemm {
+
+/*
+ * Define "standard" transforms for the blocked GEMMs with fixed vector
+ * length. This version supports accepting the RHS/B matrix in transposed
+ * format.
+ *
+ * This assumes that A is interleaved 'height' ways, B is interleaved
+ * 'width' ways and transposed, and that the merge needs to work in 'height'
+ * x 'width' blocks.
+ *
+ * The optional 'block' parameter is for kernels using dot-product type
+ * instructions like UDOT and SDOT.
+ */
+template<typename TOperand, typename TResult, unsigned int height, unsigned int width, unsigned int block=1, bool integrate_sums=false>
+class StdTransformsFixedTRB
+{
+public:
+ template<typename TIn>
+ void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
+ const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) const {
+ Interleave<height, block, VLType::None>(out, in, stride, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
+ }
+
+ template<typename TIn>
+ void PrepareA_indirect(TOperand *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0,
+ const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
+ IndirectInterleave<height, block, VLType::None>(out, ptr, stringlen, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
+ }
+
+ template<typename TIn>
+ void PrepareA_convolution(TOperand *out, const TIn *ptr, size_t stride, const convolver<TIn> &conv, size_t rounded_stringlen,
+ const int y0, const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
+ ConvolutionInterleave<height, block, VLType::None>(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
+ }
+
+ bool PrepareB_supports_transpose() const {
+ return true;
+ }
+
+ template<typename TIn>
+ void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
+ const int xmax, const int k0, const int kmax, bool transposed) const {
+ if (transposed) {
+ Transform<width, block, false>(out, in, stride, x0, xmax, k0, kmax);
+ } else {
+ Transform<width, block, true>(out, in, stride, x0, xmax, k0, kmax);
+ }
+ }
+
+ template<typename TOut>
+ void Merge(TOut *out, const TResult *in, int stride, int y0, int ymax, int x0, int xmax, const TOut *bias, const Activation act, bool append) const {
+ MergeResults<width, height>(out, in, stride, y0, ymax, x0, xmax, bias, act, append);
+ }
+};
+
+} // namespace arm_gemm
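
The only behavioural difference from the fixed-format class is in `PrepareB()`: a pre-transposed B is walked with the `Transposed` flag cleared, so reading natural B with a transposing walk and reading Bt with a straight walk emit the same packed buffer. A self-contained sketch of that equivalence (`pack_block` is illustrative, not the library template):

#include <cassert>
#include <vector>

// Walk the source either with a transposing read (natural B) or a straight
// read (pre-transposed Bt); both emit element B(k, x) at the same position.
static std::vector<int> pack_block(const int *in, int stride,
                                   bool transposed_read, int rows, int cols)
{
    std::vector<int> out;
    for (int r = 0; r < rows; r++)
        for (int c = 0; c < cols; c++)
            out.push_back(transposed_read ? in[c * stride + r]
                                          : in[r * stride + c]);
    return out;
}

int main()
{
    const int K = 3, X = 4;
    std::vector<int> B(K * X), Bt(X * K);
    for (int k = 0; k < K; k++)
        for (int x = 0; x < X; x++)
            Bt[x * K + k] = B[k * X + x] = 10 * k + x;

    // Natural B, transposing walk == pre-transposed Bt, straight walk.
    assert(pack_block(B.data(), X, true, X, K) ==
           pack_block(Bt.data(), K, false, X, K));
    return 0;
}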
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp
new file mode 100644
index 0000000000..40f61626a1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "interleave_indirect.hpp"
+#include "transform.hpp"
+
+namespace arm_gemm {
+
+/*
+ * Define "standard" transforms for the blocked GEMMs for SVE.
+ *
+ * This assumes that A is interleaved 'height' ways, B is interleaved
+ * 'width'xVL ways and transposed, and that the merge needs to work in
+ * 'height' x 'width'xVL blocks.
+ *
+ * The optional 'block' parameter is for kernels using dot-product type
+ * instructions like UDOT and SDOT.
+ */
+template<typename TOperand, typename TResult, unsigned int height_vectors, unsigned int width_vectors, unsigned int block=1, bool integrate_sums=false>
+class StdTransformsSME
+{
+public:
+ template<typename TIn>
+ void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
+ const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
+ Interleave<height_vectors, block, VLType::SME>(out, in, stride, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
+ }
+
+ template<typename TIn>
+ void PrepareA_indirect(TOperand *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0,
+ const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
+ IndirectInterleave<height_vectors, block, VLType::SME>(out, ptr, stringlen, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
+ }
+
+ template<typename TIn>
+ void PrepareA_convolution(TOperand *out, const TIn *ptr, size_t stride, const convolver<TIn> &conv, size_t rounded_stringlen,
+ const int y0, const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
+ ConvolutionInterleave<height_vectors, block, VLType::SME>(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
+ }
+
+ bool PrepareB_supports_transpose() const {
+ return false;
+ }
+
+ template<typename TIn>
+ void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
+ const int xmax, const int k0, const int kmax, bool transposed) {
+ assert(!transposed);
+ Transform<width_vectors, block, true, VLType::SME>(out, in, stride, x0, xmax, k0, kmax);
+ }
+
+ template<typename TOut>
+ void Merge(TOut *, const TResult *, int, int, int, int, int, const TOut *, const Activation, bool) {
+ // Separate merge not supported for SME.
+ }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
index 3256d919ea..c516bfc456 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018,2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,9 +61,14 @@ public:
ConvolutionInterleave<height, block, VLType::None>(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
}
+ bool PrepareB_supports_transpose() const {
+ return false;
+ }
+
template<typename TIn>
void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
- const int xmax, const int k0, const int kmax) {
+ const int xmax, const int k0, const int kmax, bool transposed) {
+ assert(!transposed);
Transform<width_vectors, block, true, VLType::SVE>(out, in, stride, x0, xmax, k0, kmax);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transform-sve.cpp b/src/core/NEON/kernels/arm_gemm/transform-sve.cpp
new file mode 100644
index 0000000000..d01a9b0fd0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transform-sve.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "utils.hpp"
+
+#include "bfloat.hpp"
+#include "transform.hpp"
+
+#if !defined(_WIN64) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
+
+
+namespace arm_gemm {
+
+#include "transforms/list-sve.hpp"
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/transform.cpp b/src/core/NEON/kernels/arm_gemm/transform.cpp
new file mode 100644
index 0000000000..06d9e2416c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transform.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "utils.hpp"
+
+#include "bfloat.hpp"
+
+#if !defined(_WIN64) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
+
+namespace arm_gemm {
+
+/*
+ * Generic transform.
+ *
+ * Assuming the untransposed case, this works by first reading <BlockBy>
+ * consecutive values from the first input row. This same number of values
+ * are then read from the next <IntBy-1> rows. Now return to the first
+ * input row and repeat.
+ *
+ * Need to cope with the work requested in either dimension not actually
+ * being a multiple of the block sizes.
+ */
+template <unsigned int tIntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize, VLType vlt>
+struct TransformImpl {
+ template <typename TOut, typename TIn>
+ static void Transform(TOut* out, const TIn* const in, const int stride,
+ const int y0, const int ymax, const int x0, const int xmax) {
+ // NOTE: This code is disabled to avoid the call to get_vector_length(), so templated transforms will not be
+ // correct for SVE. This is not an issue as we have specializations for all SVE cases.
+ // For SVE cases we multiply the interleave factor by the vector length.
+ // const unsigned int IntBy = tIntBy * (vlt == VLType::SVE ? get_vector_length<TOut>() / BlockBy : 1);
+ const unsigned int IntBy = tIntBy;
+
+ const int n_whole_y_blocks = (ymax - y0) / IntBy;
+ const int y_remainders = (ymax - y0) % IntBy;
+ const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0);
+
+ const int n_whole_x_blocks = (xmax - x0) / BlockBy;
+ const int x_remainders = (xmax - x0) % BlockBy;
+ const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0);
+
+ // "Y" loop: advance down the rows of the source IntBy rows at a time.
+ // Set up fill_rows to show the number of rows to copy from, and blank_rows
+ // for the number of blank rows to add.
+ for (int y_block=0 ; y_block < n_y_blocks; y_block++) {
+ int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
+ int blank_rows = IntBy - fill_rows;
+
+ int y_base = y0 + (y_block * IntBy);
+
+ // So now advance along this block of rows, BlockBy columns at a time.
+ for (int x_block=0 ; x_block < n_x_blocks; x_block++) {
+ int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
+ int blank_cols = BlockBy - fill_cols;
+
+ int x_base = x0 + (x_block * BlockBy);
+
+ for (int row = 0; row < fill_rows; row++) {
+ for (int col = 0; col < fill_cols; col++) {
+ // In-range copy. If it's transposed, we reverse the sense of rows and columns here.
+ if (Transposed) {
+ *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]);
+ } else {
+ *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]);
+ }
+ }
+ // "col" tail - row is in range but column is out of range.
+ for (int col=0; col < blank_cols; col++) {
+ *out++ = static_cast<TOut>(0);
+ }
+ }
+ // "row" tail - row is out of range so fill with zeros always.
+ TOut zeroval = static_cast<TOut>(0);
+ int pads = blank_rows * (fill_cols + blank_cols);
+
+ for (int i=0; i<pads; i++) {
+ out[i] = zeroval;
+ }
+
+ out += pads;
+ }
+ }
+ }
+
+ template <typename T>
+ static void Transform(T* out, const T* const in, const int stride,
+ const int k0, const int kmax, const int x0, const int xmax) {
+ Transform<T, T>(out, in, stride, k0, kmax, x0, xmax);
+ }
+};
+
+/*****************************************************************************/
+template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, VLType vlt=VLType::None, typename TOut, typename TIn>
+void Transform(
+ TOut* out, const TIn* const in, const int stride,
+ const int k0, const int kmax, const int x0, const int xmax
+) {
+ // Redirect to a specialised implementation predicated on argument size.
+ TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn), vlt>::Transform(
+ out, in, stride, k0, kmax, x0, xmax
+ );
+}
+/*****************************************************************************/
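+
+// Example call (an editor's illustration, not from the original source):
+// pack an 8-row interleaved block of a float matrix with leading
+// dimension ldm, as a GEMM packing step might. TOut/TIn are deduced and
+// vlt defaults to VLType::None:
+//
+//   float *packed = ...;     // destination buffer, written sequentially
+//   const float *src = ...;  // source matrix
+//   Transform<8, 1, false>(packed, src, ldm, k0, kmax, x0, xmax);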
+
+#include "transforms/list.hpp"
+
+// We don't have assembler transforms for AArch32, so we generate templated ones here.
+#ifdef __arm__
+template void Transform<8, 1, true, VLType::None>(float *, const float *, int, int, int, int, int);
+#if defined(ARM_COMPUTE_ENABLE_FP16)
+template void Transform<8, 1, true, VLType::None>(float *, const __fp16 *, int, int, int, int, int);
+#endif // defined(ARM_COMPUTE_ENABLE_FP16)
+#ifdef ARM_COMPUTE_ENABLE_BF16
+template void Transform<8, 1, true, VLType::None>(float *, const bfloat16 *, int, int, int, int, int);
+#endif // ARM_COMPUTE_ENABLE_BF16
+#endif // __arm__
+
+#if defined(ARM_COMPUTE_ENABLE_FP16)
+template void Transform<12, 1, false, VLType::None>(float *, const __fp16 *, int, int, int, int, int);
+#endif // defined(ARM_COMPUTE_ENABLE_FP16)
+#ifdef ARM_COMPUTE_ENABLE_BF16
+template void Transform<12, 1, false, VLType::None>(float *, const bfloat16 *, int, int, int, int, int);
+#endif // ARM_COMPUTE_ENABLE_BF16
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
index 5efeee5d35..f46e6c5fa3 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,96 +27,10 @@
namespace arm_gemm {
-/*
- * Generic transform.
- *
- * Assuming the untransposed case, this works by first reading <BlockBy>
- * consecutive values from the first input row. This same number of values
- * are then read from the next <IntBy-1> rows. Now return to the first
- * input row and repeat.
- *
- * Need to cope with the work requested in either dimension not actually
- * being a multiple of the block sizes.
- */
-template <unsigned int tIntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize, VLType vlt>
-struct TransformImpl {
- template <typename TOut, typename TIn>
- static void Transform(TOut* out, const TIn* const in, const int stride,
- const int y0, const int ymax, const int x0, const int xmax) {
- // For SVE cases we multiply the interleave factor by the vector length.
- const unsigned int IntBy = tIntBy * (vlt == VLType::SVE ? get_vector_length<TOut>() / BlockBy : 1);
-
- const int n_whole_y_blocks = (ymax - y0) / IntBy;
- const int y_remainders = (ymax - y0) % IntBy;
- const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0);
-
- const int n_whole_x_blocks = (xmax - x0) / BlockBy;
- const int x_remainders = (xmax - x0) % BlockBy;
- const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0);
-
- // "Y" loop: advance down the rows of the source IntBy rows at a time.
- // Set up fill_rows to show the number rows to copy from, and blank_rows
- // for the number of blank rows to add.
- for (int y_block=0 ; y_block < n_y_blocks; y_block++) {
- int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
- int blank_rows = IntBy - fill_rows;
-
- int y_base = y0 + (y_block * IntBy);
-
- // So now advance along this block of rows, BlockBy columns at a time.
- for (int x_block=0 ; x_block < n_x_blocks; x_block++) {
- int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
- int blank_cols = BlockBy - fill_cols;
-
- int x_base = x0 + (x_block * BlockBy);
-
- for (int row = 0; row < fill_rows; row++) {
- for (int col = 0; col < fill_cols; col++) {
- // In-range copy. If it's transposed, we reverse the sense of rows and columns here.
- if (Transposed) {
- *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]);
- } else {
- *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]);
- }
- }
- // "col" tail - row is in range but column is out of range.
- for (int col=0; col < blank_cols; col++) {
- *out++ = static_cast<TOut>(0);
- }
- }
- // "row" tail - row is out of range so fill with zeros always.
- TOut zeroval = static_cast<TOut>(0);
- int pads = blank_rows * (fill_cols + blank_cols);
-
- for (int i=0; i<pads; i++) {
- out[i] = zeroval;
- }
-
- out += pads;
- }
- }
- }
-
- template <typename T>
- static inline void Transform(T* out, const T* const in, const int stride,
- const int k0, const int kmax, const int x0, const int xmax) {
- Transform<T, T>(out, in, stride, k0, kmax, x0, xmax);
- }
-};
-
-/*****************************************************************************/
template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, VLType vlt=VLType::None, typename TOut, typename TIn>
void Transform(
TOut* out, const TIn* const in, const int stride,
const int k0, const int kmax, const int x0, const int xmax
-) {
- // Redirect to a specialised implementation predicated on argument size.
- TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn), vlt>::Transform(
- out, in, stride, k0, kmax, x0, xmax
- );
-}
-/*****************************************************************************/
-
-#include "transforms/list.hpp"
+);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
index 3ce1d328a7..b50c240a3a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,7 @@
// Generic unblocked transposed 8x32-bit sized specialisation
template <>
template <typename T>
-inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform(
+void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform(
T* out, const T* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
@@ -45,7 +45,7 @@ inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform(
// Generic 16x16-bit sized specialisation
template <>
template <typename T>
-inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
+void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
T* out, const T* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
@@ -59,7 +59,7 @@ inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
// Specialised 16 x uint16_t version
template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
+void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
__asm volatile (
"VLD1.32 {d0-d3}, [%[in0]]!\n"
"VST1.32 {d0-d3}, [%[out]]\n"
@@ -72,7 +72,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(con
}
template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
+void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
__asm volatile (
"VLD1.32 {d0-d3}, [%[in0]]!\n"
"VST1.32 {d0-d3}, [%[out]]!\n"
@@ -90,7 +90,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(con
}
template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
+void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
__asm __volatile (
"VLD1.32 {d0-d3}, [%[in0]]!\n"
"VST1.32 {d0-d3}, [%[out]]!\n"
@@ -117,7 +117,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(con
template <>
template <>
-inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
+void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
uint16_t* out, const uint16_t* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
new file mode 100644
index 0000000000..8574d89226
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_128(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 32 * height * sizeof(uint32_t);
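+    // Each 32-element column block is written out for every one of the
+    // 'height' rows before the next block starts, hence the stride above.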
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x20\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q15, [x25], #0x10\n"
+ "ldr q14, [x23], #0x10\n"
+ "sub x24, x24, #0x20\n"
+ "cmp x24, #0x20\n"
+ "ldr q13, [x22], #0x10\n"
+ "ldr q12, [x20], #0x10\n"
+ "ldr q11, [x25], #0x10\n"
+ "ldr q10, [x23], #0x10\n"
+ "ldr q9, [x22], #0x10\n"
+ "ldr q8, [x20], #0x10\n"
+ "ldr q7, [x25], #0x10\n"
+ "ldr q6, [x23], #0x10\n"
+ "ldr q5, [x22], #0x10\n"
+ "ldr q4, [x20], #0x10\n"
+ "ldr q3, [x25], #0x10\n"
+ "ldr q2, [x23], #0x10\n"
+ "ldr q1, [x22], #0x10\n"
+ "ldr q0, [x20], #0x10\n"
+ "ldr q31, [x25], #0x10\n"
+ "ldr q30, [x23], #0x10\n"
+ "ldr q29, [x22], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
+ "ldr q27, [x25], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
+ "ldr q25, [x22], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q15, [x21, #0x0]\n"
+ "str q11, [x21, #0x10]\n"
+ "str q7, [x21, #0x20]\n"
+ "str q3, [x21, #0x30]\n"
+ "str q31, [x21, #0x40]\n"
+ "str q27, [x21, #0x50]\n"
+ "str q23, [x21, #0x60]\n"
+ "str q19, [x21, #0x70]\n"
+ "str q14, [x21, #0x80]\n"
+ "str q10, [x21, #0x90]\n"
+ "str q6, [x21, #0xa0]\n"
+ "str q2, [x21, #0xb0]\n"
+ "str q30, [x21, #0xc0]\n"
+ "str q26, [x21, #0xd0]\n"
+ "str q22, [x21, #0xe0]\n"
+ "str q18, [x21, #0xf0]\n"
+ "str q13, [x21, #0x100]\n"
+ "str q9, [x21, #0x110]\n"
+ "str q5, [x21, #0x120]\n"
+ "str q1, [x21, #0x130]\n"
+ "str q29, [x21, #0x140]\n"
+ "str q25, [x21, #0x150]\n"
+ "str q21, [x21, #0x160]\n"
+ "str q17, [x21, #0x170]\n"
+ "str q12, [x21, #0x180]\n"
+ "str q8, [x21, #0x190]\n"
+ "str q4, [x21, #0x1a0]\n"
+ "str q0, [x21, #0x1b0]\n"
+ "str q28, [x21, #0x1c0]\n"
+ "str q24, [x21, #0x1d0]\n"
+ "str q20, [x21, #0x1e0]\n"
+ "str q16, [x21, #0x1f0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x24, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 16 loop: loop
+ "ldr q31, [x25], #0x10\n"
+ "ldr q30, [x23], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q29, [x22], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
+ "ldr q27, [x25], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
+ "ldr q25, [x22], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q31, [x21, #0x0]\n"
+ "str q27, [x21, #0x10]\n"
+ "str q23, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q30, [x21, #0x80]\n"
+ "str q26, [x21, #0x90]\n"
+ "str q22, [x21, #0xa0]\n"
+ "str q18, [x21, #0xb0]\n"
+ "str q29, [x21, #0x100]\n"
+ "str q25, [x21, #0x110]\n"
+ "str q21, [x21, #0x120]\n"
+ "str q17, [x21, #0x130]\n"
+ "str q28, [x21, #0x180]\n"
+ "str q24, [x21, #0x190]\n"
+ "str q20, [x21, #0x1a0]\n"
+ "str q16, [x21, #0x1b0]\n"
+ "add x21, x21, #0x40\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 16 loop: skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x80]\n"
+ "str q17, [x21, #0x100]\n"
+ "str q16, [x21, #0x180]\n"
+ "add x21, x21, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr s19, [x25], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "str s19, [x21, #0x0]\n"
+ "str s18, [x21, #0x80]\n"
+ "str s17, [x21, #0x100]\n"
+ "str s16, [x21, #0x180]\n"
+ "add x21, x21, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x4\n"
+ "add %x[out], %x[out], #0x200\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x20, %x[width]\n"
+ "mov x25, %x[in]\n"
+ "cmp x20, #0x20\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x25], #0x10\n"
+ "sub x20, x20, #0x20\n"
+ "cmp x20, #0x20\n"
+ "ldr q21, [x25], #0x10\n"
+ "ldr q20, [x25], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "str q23, [x21, #0x0]\n"
+ "str q22, [x21, #0x10]\n"
+ "str q21, [x21, #0x20]\n"
+ "str q20, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x20, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr q16, [x25], #0x10\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr s16, [x25], #0x4\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x80\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // anonymous namespace
+
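+// The kernel above shuffles raw 32-bit words, so this wrapper passes the
+// width in 32-bit elements and the row stride in bytes.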
+template<>
+void Transform<32, 1, true, VLType::None>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_128(
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
new file mode 100644
index 0000000000..cdf1f98608
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_12_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
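+
+    // When height is not a multiple of 4, the kernel reads the missing
+    // source rows from this zeroed pad_row, so the packed output is
+    // zero padded.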
+
+ size_t out_stride = 12 * roundup<size_t>(height, 4) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x30\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x24, x24, #0x30\n"
+ "cmp x24, #0x30\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v31.16b, v21.16b, v17.16b\n"
+ "zip1 v22.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v14.16b, v21.16b, v17.16b\n"
+ "zip2 v13.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v30.16b, v19.16b, v17.16b\n"
+ "zip1 v29.16b, v18.16b, v16.16b\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v12.16b, v19.16b, v17.16b\n"
+ "zip2 v11.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v10.16b, v21.16b, v17.16b\n"
+ "zip1 v9.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v8.16b, v21.16b, v17.16b\n"
+ "zip2 v7.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v6.16b, v19.16b, v17.16b\n"
+ "zip1 v5.16b, v18.16b, v16.16b\n"
+ "ldr q28, [x9], #0x10\n"
+ "ldr q27, [x28], #0x10\n"
+ "zip2 v4.16b, v19.16b, v17.16b\n"
+ "zip2 v3.16b, v18.16b, v16.16b\n"
+ "ldr q26, [x27], #0x10\n"
+ "ldr q25, [x26], #0x10\n"
+ "zip1 v2.16b, v28.16b, v26.16b\n"
+ "zip1 v1.16b, v27.16b, v25.16b\n"
+ "ldr q24, [x25], #0x10\n"
+ "ldr q23, [x23], #0x10\n"
+ "zip1 v16.16b, v31.16b, v22.16b\n"
+ "zip2 v22.16b, v31.16b, v22.16b\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "zip1 v0.16b, v24.16b, v21.16b\n"
+ "zip1 v31.16b, v23.16b, v20.16b\n"
+ "zip1 v19.16b, v14.16b, v13.16b\n"
+ "zip1 v18.16b, v30.16b, v29.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip2 v16.16b, v30.16b, v29.16b\n"
+ "zip1 v17.16b, v12.16b, v11.16b\n"
+ "str q22, [x21, #0x10]\n"
+ "str q19, [x21, #0x20]\n"
+ "zip2 v30.16b, v28.16b, v26.16b\n"
+ "zip2 v29.16b, v27.16b, v25.16b\n"
+ "str q18, [x21, #0x30]\n"
+ "zip2 v28.16b, v24.16b, v21.16b\n"
+ "zip2 v27.16b, v23.16b, v20.16b\n"
+ "str q16, [x21, #0x40]\n"
+ "zip2 v21.16b, v14.16b, v13.16b\n"
+ "zip1 v16.16b, v10.16b, v9.16b\n"
+ "str q17, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 v20.16b, v10.16b, v9.16b\n"
+ "zip2 v19.16b, v12.16b, v11.16b\n"
+ "zip1 v18.16b, v6.16b, v5.16b\n"
+ "zip2 v17.16b, v6.16b, v5.16b\n"
+ "str q21, [x21, #0x0]\n"
+ "str q16, [x21, #0x10]\n"
+ "zip1 v16.16b, v8.16b, v7.16b\n"
+ "zip2 v26.16b, v8.16b, v7.16b\n"
+ "str q20, [x21, #0x20]\n"
+ "zip1 v25.16b, v2.16b, v1.16b\n"
+ "zip1 v24.16b, v4.16b, v3.16b\n"
+ "str q19, [x21, #0x30]\n"
+ "zip2 v23.16b, v4.16b, v3.16b\n"
+ "zip1 v22.16b, v0.16b, v31.16b\n"
+ "str q18, [x21, #0x40]\n"
+ "zip2 v21.16b, v2.16b, v1.16b\n"
+ "zip1 v20.16b, v30.16b, v29.16b\n"
+ "str q17, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 v19.16b, v30.16b, v29.16b\n"
+ "zip2 v18.16b, v0.16b, v31.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v17.16b, v28.16b, v27.16b\n"
+ "zip2 v16.16b, v28.16b, v27.16b\n"
+ "str q26, [x21, #0x10]\n"
+ "str q25, [x21, #0x20]\n"
+ "str q24, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q21, [x21, #0x0]\n"
+ "str q20, [x21, #0x10]\n"
+ "str q19, [x21, #0x20]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x24, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr d23, [x9], #0x8\n"
+ "ldr d22, [x28], #0x8\n"
+ "sub x24, x24, #0xc\n"
+ "cmp x24, #0xc\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d17, [x20], #0x8\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v22.s }[2], [x28], #0x4\n"
+ "ld1 { v19.s }[2], [x27], #0x4\n"
+ "ld1 { v18.s }[2], [x26], #0x4\n"
+ "zip1 v24.16b, v23.16b, v19.16b\n"
+ "zip1 v16.16b, v22.16b, v18.16b\n"
+ "ld1 { v21.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "zip2 v19.16b, v23.16b, v19.16b\n"
+ "zip2 v18.16b, v22.16b, v18.16b\n"
+ "ld1 { v20.s }[2], [x22], #0x4\n"
+ "ld1 { v17.s }[2], [x20], #0x4\n"
+ "zip1 v23.16b, v21.16b, v20.16b\n"
+ "zip1 v22.16b, v25.16b, v17.16b\n"
+ "zip2 v21.16b, v21.16b, v20.16b\n"
+ "zip2 v20.16b, v25.16b, v17.16b\n"
+ "zip1 v17.16b, v24.16b, v16.16b\n"
+ "zip2 v16.16b, v24.16b, v16.16b\n"
+ "str q17, [x21, #0x0]\n"
+ "zip1 v19.16b, v19.16b, v18.16b\n"
+ "zip1 v18.16b, v23.16b, v22.16b\n"
+ "str q16, [x21, #0x10]\n"
+ "zip2 v17.16b, v23.16b, v22.16b\n"
+ "zip1 v16.16b, v21.16b, v20.16b\n"
+ "str q19, [x21, #0x20]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str q18, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr b19, [x9], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b16, [x26], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b20, [x25], #0x1\n"
+ "ldr b19, [x23], #0x1\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x22], #0x1\n"
+ "ldr b16, [x20], #0x1\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str s18, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x21, #0x30]\n"
+ "add x21, x21, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x8\n"
+ "add %x[out], %x[out], #0x60\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "mov x20, %x[width]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "cmp x20, #0x30\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x20, x20, #0x30\n"
+ "cmp x20, #0x30\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v31.16b, v21.16b, v17.16b\n"
+ "zip1 v30.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v29.16b, v21.16b, v17.16b\n"
+ "zip2 v28.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v27.16b, v19.16b, v17.16b\n"
+ "zip1 v26.16b, v18.16b, v16.16b\n"
+ "ldr q22, [x9], #0x10\n"
+ "ldr q21, [x28], #0x10\n"
+ "zip2 v25.16b, v19.16b, v17.16b\n"
+ "zip2 v20.16b, v18.16b, v16.16b\n"
+ "ldr q19, [x27], #0x10\n"
+ "ldr q18, [x26], #0x10\n"
+ "zip1 v24.16b, v22.16b, v19.16b\n"
+ "zip1 v23.16b, v21.16b, v18.16b\n"
+ "zip1 v16.16b, v31.16b, v30.16b\n"
+ "zip2 v17.16b, v31.16b, v30.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.16b, v29.16b, v28.16b\n"
+ "str q17, [x21, #0x10]\n"
+ "zip2 v22.16b, v22.16b, v19.16b\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 v21.16b, v21.16b, v18.16b\n"
+ "zip2 v18.16b, v29.16b, v28.16b\n"
+ "zip1 v16.16b, v27.16b, v26.16b\n"
+ "zip2 v17.16b, v27.16b, v26.16b\n"
+ "str q18, [x21, #0x0]\n"
+ "str q16, [x21, #0x10]\n"
+ "zip1 v16.16b, v25.16b, v20.16b\n"
+ "zip2 v20.16b, v25.16b, v20.16b\n"
+ "str q17, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip1 v19.16b, v24.16b, v23.16b\n"
+ "zip2 v18.16b, v24.16b, v23.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v17.16b, v22.16b, v21.16b\n"
+ "zip2 v16.16b, v22.16b, v21.16b\n"
+ "str q20, [x21, #0x10]\n"
+ "str q19, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr d19, [x9], #0x8\n"
+ "ldr d21, [x28], #0x8\n"
+ "sub x20, x20, #0xc\n"
+ "cmp x20, #0xc\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "ld1 { v19.s }[2], [x9], #0x4\n"
+ "ld1 { v21.s }[2], [x28], #0x4\n"
+ "ld1 { v18.s }[2], [x27], #0x4\n"
+ "ld1 { v16.s }[2], [x26], #0x4\n"
+ "zip1 v20.16b, v19.16b, v18.16b\n"
+ "zip1 v17.16b, v21.16b, v16.16b\n"
+ "zip2 v19.16b, v19.16b, v18.16b\n"
+ "zip2 v18.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v17.16b\n"
+ "zip2 v17.16b, v20.16b, v17.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr b19, [x9], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b16, [x26], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x30\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
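+// Signed and unsigned 8-bit data are bit-identical under a pure shuffle,
+// so both wrappers below reinterpret to uint8_t and share the kernel
+// above; width and stride are passed in bytes.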
+template<>
+void Transform<12, 4, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<12, 4, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
new file mode 100644
index 0000000000..da0809d4d6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_12_1x8(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 8) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 12 * roundup<size_t>(height, 8) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "cmp %x[height], #0x7\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x23, x23, %x[pad_row], GE\n"
+ "cmp %x[height], #0x5\n"
+ "mov x21, %x[width]\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "cmp x21, #0x30\n"
+ "mov x20, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q25, [x28], #0x10\n"
+ "sub x21, x21, #0x30\n"
+ "cmp x21, #0x30\n"
+ "ldr q20, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x24], #0x10\n"
+ "zip1 v7.16b, v21.16b, v19.16b\n"
+ "zip1 v6.16b, v25.16b, v18.16b\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v28.16b, v20.16b, v17.16b\n"
+ "zip1 v27.16b, v24.16b, v16.16b\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q22, [x28], #0x10\n"
+ "zip2 v5.16b, v21.16b, v19.16b\n"
+ "zip2 v4.16b, v20.16b, v17.16b\n"
+ "ldr q21, [x27], #0x10\n"
+ "ldr q20, [x26], #0x10\n"
+ "zip2 v3.16b, v25.16b, v18.16b\n"
+ "zip2 v2.16b, v24.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x24], #0x10\n"
+ "zip1 v1.16b, v23.16b, v19.16b\n"
+ "zip1 v15.16b, v22.16b, v18.16b\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v0.16b, v21.16b, v17.16b\n"
+ "zip1 v31.16b, v20.16b, v16.16b\n"
+ "ldr q26, [x9], #0x10\n"
+ "ldr q30, [x28], #0x10\n"
+ "zip2 v14.16b, v23.16b, v19.16b\n"
+ "zip2 v13.16b, v21.16b, v17.16b\n"
+ "ldr q25, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip2 v12.16b, v22.16b, v18.16b\n"
+ "zip2 v11.16b, v20.16b, v16.16b\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x24], #0x10\n"
+ "zip1 v10.16b, v26.16b, v23.16b\n"
+ "zip1 v9.16b, v30.16b, v22.16b\n"
+ "ldr q21, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "zip1 v29.16b, v25.16b, v21.16b\n"
+ "zip1 v8.16b, v24.16b, v17.16b\n"
+ "zip1 v19.16b, v7.16b, v28.16b\n"
+ "zip1 v16.16b, v6.16b, v27.16b\n"
+ "zip2 v28.16b, v7.16b, v28.16b\n"
+ "zip2 v18.16b, v6.16b, v27.16b\n"
+ "zip1 v27.16b, v5.16b, v4.16b\n"
+ "zip1 v20.16b, v3.16b, v2.16b\n"
+ "zip2 v7.16b, v26.16b, v23.16b\n"
+ "zip2 v26.16b, v25.16b, v21.16b\n"
+ "zip2 v6.16b, v30.16b, v22.16b\n"
+ "zip2 v25.16b, v24.16b, v17.16b\n"
+ "zip2 v5.16b, v5.16b, v4.16b\n"
+ "zip2 v4.16b, v3.16b, v2.16b\n"
+ "zip1 v3.16b, v1.16b, v0.16b\n"
+ "zip1 v2.16b, v15.16b, v31.16b\n"
+ "zip2 v1.16b, v1.16b, v0.16b\n"
+ "zip2 v0.16b, v15.16b, v31.16b\n"
+ "zip1 v31.16b, v14.16b, v13.16b\n"
+ "zip1 v30.16b, v12.16b, v11.16b\n"
+ "zip2 v24.16b, v14.16b, v13.16b\n"
+ "zip2 v23.16b, v12.16b, v11.16b\n"
+ "zip1 v22.16b, v10.16b, v29.16b\n"
+ "zip1 v21.16b, v9.16b, v8.16b\n"
+ "zip1 v17.16b, v19.16b, v16.16b\n"
+ "zip2 v16.16b, v19.16b, v16.16b\n"
+ "str q17, [x20, #0x0]\n"
+ "zip1 v19.16b, v28.16b, v18.16b\n"
+ "zip2 v18.16b, v28.16b, v18.16b\n"
+ "str q16, [x20, #0x10]\n"
+ "zip1 v17.16b, v27.16b, v20.16b\n"
+ "zip2 v16.16b, v27.16b, v20.16b\n"
+ "str q19, [x20, #0x20]\n"
+ "str q18, [x20, #0x30]\n"
+ "zip2 v29.16b, v10.16b, v29.16b\n"
+ "zip2 v20.16b, v9.16b, v8.16b\n"
+ "str q17, [x20, #0x40]\n"
+ "zip1 v28.16b, v7.16b, v26.16b\n"
+ "zip1 v27.16b, v6.16b, v25.16b\n"
+ "str q16, [x20, #0x50]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip2 v26.16b, v7.16b, v26.16b\n"
+ "zip2 v25.16b, v6.16b, v25.16b\n"
+ "zip1 v17.16b, v5.16b, v4.16b\n"
+ "zip2 v16.16b, v5.16b, v4.16b\n"
+ "str q17, [x20, #0x0]\n"
+ "zip1 v18.16b, v3.16b, v2.16b\n"
+ "zip2 v17.16b, v3.16b, v2.16b\n"
+ "str q16, [x20, #0x10]\n"
+ "zip1 v16.16b, v1.16b, v0.16b\n"
+ "zip2 v19.16b, v1.16b, v0.16b\n"
+ "str q18, [x20, #0x20]\n"
+ "str q17, [x20, #0x30]\n"
+ "zip1 v18.16b, v31.16b, v30.16b\n"
+ "zip2 v17.16b, v31.16b, v30.16b\n"
+ "str q16, [x20, #0x40]\n"
+ "zip1 v16.16b, v24.16b, v23.16b\n"
+ "zip2 v24.16b, v24.16b, v23.16b\n"
+ "str q19, [x20, #0x50]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip1 v23.16b, v22.16b, v21.16b\n"
+ "zip2 v22.16b, v22.16b, v21.16b\n"
+ "str q18, [x20, #0x0]\n"
+ "zip1 v21.16b, v29.16b, v20.16b\n"
+ "zip2 v20.16b, v29.16b, v20.16b\n"
+ "str q17, [x20, #0x10]\n"
+ "zip1 v19.16b, v28.16b, v27.16b\n"
+ "zip2 v18.16b, v28.16b, v27.16b\n"
+ "str q16, [x20, #0x20]\n"
+ "zip1 v17.16b, v26.16b, v25.16b\n"
+ "zip2 v16.16b, v26.16b, v25.16b\n"
+ "str q24, [x20, #0x30]\n"
+ "str q23, [x20, #0x40]\n"
+ "str q22, [x20, #0x50]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "str q21, [x20, #0x0]\n"
+ "str q20, [x20, #0x10]\n"
+ "str q19, [x20, #0x20]\n"
+ "str q18, [x20, #0x30]\n"
+ "str q17, [x20, #0x40]\n"
+ "str q16, [x20, #0x50]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x21, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr d23, [x9], #0x8\n"
+ "ldr d27, [x28], #0x8\n"
+ "sub x21, x21, #0xc\n"
+ "cmp x21, #0xc\n"
+ "ldr d21, [x27], #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v27.s }[2], [x28], #0x4\n"
+ "ld1 { v21.s }[2], [x27], #0x4\n"
+ "ld1 { v26.s }[2], [x26], #0x4\n"
+ "ld1 { v20.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "zip1 v25.16b, v23.16b, v20.16b\n"
+ "zip1 v24.16b, v27.16b, v19.16b\n"
+ "ld1 { v17.s }[2], [x23], #0x4\n"
+ "ld1 { v16.s }[2], [x22], #0x4\n"
+ "zip1 v22.16b, v21.16b, v17.16b\n"
+ "zip1 v18.16b, v26.16b, v16.16b\n"
+ "zip2 v23.16b, v23.16b, v20.16b\n"
+ "zip2 v21.16b, v21.16b, v17.16b\n"
+ "zip2 v20.16b, v27.16b, v19.16b\n"
+ "zip2 v17.16b, v26.16b, v16.16b\n"
+ "zip1 v19.16b, v25.16b, v22.16b\n"
+ "zip1 v16.16b, v24.16b, v18.16b\n"
+ "zip2 v22.16b, v25.16b, v22.16b\n"
+ "zip2 v18.16b, v24.16b, v18.16b\n"
+ "zip1 v21.16b, v23.16b, v21.16b\n"
+ "zip1 v20.16b, v20.16b, v17.16b\n"
+ "zip1 v17.16b, v19.16b, v16.16b\n"
+ "zip2 v16.16b, v19.16b, v16.16b\n"
+ "str q17, [x20, #0x0]\n"
+ "zip1 v19.16b, v22.16b, v18.16b\n"
+ "zip2 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [x20, #0x10]\n"
+ "zip1 v17.16b, v21.16b, v20.16b\n"
+ "zip2 v16.16b, v21.16b, v20.16b\n"
+ "str q19, [x20, #0x20]\n"
+ "str q18, [x20, #0x30]\n"
+ "str q17, [x20, #0x40]\n"
+ "str q16, [x20, #0x50]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x21, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr s18, [x9], #0x4\n"
+ "ldr s19, [x28], #0x4\n"
+ "sub x21, x21, #0x4\n"
+ "cmp x21, #0x4\n"
+ "ldr s21, [x27], #0x4\n"
+ "ldr s20, [x26], #0x4\n"
+ "ldr s17, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "zip1 v18.16b, v18.16b, v17.16b\n"
+ "zip1 v19.16b, v19.16b, v16.16b\n"
+ "ldr s17, [x23], #0x4\n"
+ "ldr s16, [x22], #0x4\n"
+ "zip1 v17.16b, v21.16b, v17.16b\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "zip1 v18.16b, v18.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v17.16b, v18.16b, v16.16b\n"
+ "zip2 v16.16b, v18.16b, v16.16b\n"
+ "str q17, [x20, #0x0]\n"
+ "str q16, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x21, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr b19, [x9], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
+ "sub x21, x21, #0x1\n"
+ "cmp x21, #0x1\n"
+ "ldr b21, [x27], #0x1\n"
+ "ldr b20, [x26], #0x1\n"
+ "ldr b17, [x25], #0x1\n"
+ "ldr b16, [x24], #0x1\n"
+ "zip1 v19.16b, v19.16b, v17.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr b17, [x23], #0x1\n"
+ "ldr b16, [x22], #0x1\n"
+ "zip1 v17.16b, v21.16b, v17.16b\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str d16, [x20, #0x0]\n"
+ "add x20, x20, #0x8\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x60\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
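+// As in the 1x4 variant, the int8_t wrapper reinterprets its data to
+// uint8_t and reuses the same byte-shuffling kernel.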
+template<>
+void Transform<12, 8, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<12, 8, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
new file mode 100644
index 0000000000..cef468e9cc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_12_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 12 * roundup<size_t>(height, 2) * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "sub x24, x24, #0x18\n"
+ "zip1 v10.8h, v19.8h, v18.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip2 v9.8h, v19.8h, v18.8h\n"
+ "zip1 v8.8h, v17.8h, v16.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v7.8h, v17.8h, v16.8h\n"
+ "zip1 v6.8h, v19.8h, v18.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip2 v5.8h, v19.8h, v18.8h\n"
+ "zip1 v4.8h, v17.8h, v16.8h\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip1 v3.8h, v21.8h, v18.8h\n"
+ "zip2 v2.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v1.8h, v17.8h, v16.8h\n"
+ "cmp x24, #0x18\n"
+ "ldr q20, [x25], #0x10\n"
+ "ldr q19, [x23], #0x10\n"
+ "zip1 v0.8h, v20.8h, v19.8h\n"
+ "zip2 v31.8h, v21.8h, v18.8h\n"
+ "ldr q30, [x22], #0x10\n"
+ "ldr q29, [x20], #0x10\n"
+ "zip1 v28.8h, v30.8h, v29.8h\n"
+ "zip2 v27.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
+ "zip1 v26.8h, v17.8h, v16.8h\n"
+ "zip2 v25.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v24.8h, v17.8h, v16.8h\n"
+ "zip2 v23.8h, v17.8h, v16.8h\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip2 v22.8h, v20.8h, v19.8h\n"
+ "zip1 v21.8h, v18.8h, v17.8h\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q10, [x21, #0x0]\n"
+ "zip2 v19.8h, v18.8h, v17.8h\n"
+ "str q9, [x21, #0x10]\n"
+ "zip2 v18.8h, v30.8h, v29.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q3, [x21, #0x20]\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q8, [x21, #0x30]\n"
+ "str q7, [x21, #0x40]\n"
+ "str q1, [x21, #0x50]\n"
+ "str q6, [x21, #0x60]\n"
+ "str q5, [x21, #0x70]\n"
+ "str q0, [x21, #0x80]\n"
+ "str q4, [x21, #0x90]\n"
+ "str q2, [x21, #0xa0]\n"
+ "str q28, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q31, [x21, #0x0]\n"
+ "str q26, [x21, #0x10]\n"
+ "str q25, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "str q24, [x21, #0x40]\n"
+ "str q23, [x21, #0x50]\n"
+ "str q22, [x21, #0x60]\n"
+ "str q21, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x24, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q17, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
+ "sub x24, x24, #0xc\n"
+ "cmp x24, #0xc\n"
+ "ldr q19, [x27], #0x10\n"
+ "ldr q18, [x26], #0x10\n"
+ "zip1 v28.8h, v17.8h, v16.8h\n"
+ "zip2 v27.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v26.8h, v19.8h, v18.8h\n"
+ "zip2 v25.8h, v19.8h, v18.8h\n"
+ "ldr q19, [x22], #0x10\n"
+ "ldr q18, [x20], #0x10\n"
+ "zip1 v24.8h, v17.8h, v16.8h\n"
+ "zip2 v23.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
+ "zip1 v21.8h, v19.8h, v18.8h\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "zip2 v19.8h, v19.8h, v18.8h\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q28, [x21, #0x0]\n"
+ "str q27, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q26, [x21, #0x30]\n"
+ "str q25, [x21, #0x40]\n"
+ "str q20, [x21, #0x50]\n"
+ "str q24, [x21, #0x60]\n"
+ "str q23, [x21, #0x70]\n"
+ "str q18, [x21, #0x80]\n"
+ "str q21, [x21, #0x90]\n"
+ "str q19, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x9], #0x8\n"
+ "ldr d18, [x28], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v20.8h, v19.8h, v18.8h\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "str q20, [x21, #0x0]\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q19, [x21, #0x30]\n"
+ "str q18, [x21, #0x60]\n"
+ "str q16, [x21, #0x90]\n"
+ "add x21, x21, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x9], #0x2\n"
+ "ldr h18, [x28], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h16, [x26], #0x2\n"
+ "zip1 v20.8h, v19.8h, v18.8h\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x25], #0x2\n"
+ "ldr h16, [x23], #0x2\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "str s20, [x21, #0x0]\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str s19, [x21, #0x30]\n"
+ "str s18, [x21, #0x60]\n"
+ "str s16, [x21, #0x90]\n"
+ "add x21, x21, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x8\n"
+ "add %x[out], %x[out], #0xc0\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x20, %x[width]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x28, %x[in_stride]\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "cmp x20, #0x18\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q17, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
+ "sub x20, x20, #0x18\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v17.8h, v17.8h, v16.8h\n"
+ "zip1 v20.8h, v21.8h, v18.8h\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
+ "str q22, [x21, #0x0]\n"
+ "cmp x20, #0x18\n"
+ "str q17, [x21, #0x10]\n"
+ "zip2 v18.8h, v21.8h, v18.8h\n"
+ "zip1 v17.8h, v19.8h, v16.8h\n"
+ "str q20, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 v16.8h, v19.8h, v16.8h\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q20, [x9], #0x10\n"
+ "ldr q17, [x28], #0x10\n"
+ "sub x20, x20, #0xc\n"
+ "cmp x20, #0xc\n"
+ "ldr d19, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "zip1 v18.8h, v20.8h, v17.8h\n"
+ "zip2 v17.8h, v20.8h, v17.8h\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d17, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h17, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x30\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
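+// bfloat16 is moved as opaque 16-bit data here: the kernel only shuffles
+// bits, so reinterpreting to uint16_t is safe. Width is passed in 16-bit
+// elements and the stride in bytes.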
+template<>
+void Transform<12, 2, true, VLType::None>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
new file mode 100644
index 0000000000..4c02d0534d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_12_2x4(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 12 * roundup<size_t>(height, 4) * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x24, x24, #0x18\n"
+ "cmp x24, #0x18\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v13.8h, v21.8h, v17.8h\n"
+ "zip1 v12.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v11.8h, v21.8h, v17.8h\n"
+ "zip2 v10.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v9.8h, v19.8h, v17.8h\n"
+ "zip1 v8.8h, v18.8h, v16.8h\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v7.8h, v19.8h, v17.8h\n"
+ "zip2 v6.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v27.8h, v21.8h, v17.8h\n"
+ "zip1 v22.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v5.8h, v21.8h, v17.8h\n"
+ "zip2 v4.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v26.8h, v19.8h, v17.8h\n"
+ "zip1 v25.8h, v18.8h, v16.8h\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v3.8h, v19.8h, v17.8h\n"
+ "zip2 v2.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v24.8h, v21.8h, v17.8h\n"
+ "zip1 v23.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v1.8h, v21.8h, v17.8h\n"
+ "zip2 v0.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v31.8h, v19.8h, v17.8h\n"
+ "zip1 v30.8h, v18.8h, v16.8h\n"
+ "zip2 v29.8h, v19.8h, v17.8h\n"
+ "zip2 v28.8h, v18.8h, v16.8h\n"
+ "zip1 v17.8h, v13.8h, v12.8h\n"
+ "zip2 v16.8h, v13.8h, v12.8h\n"
+ "str q17, [x21, #0x0]\n"
+ "zip1 v18.8h, v11.8h, v10.8h\n"
+ "zip2 v17.8h, v11.8h, v10.8h\n"
+ "str q16, [x21, #0x10]\n"
+ "zip1 v16.8h, v27.8h, v22.8h\n"
+ "zip2 v22.8h, v27.8h, v22.8h\n"
+ "str q18, [x21, #0x20]\n"
+ "zip1 v21.8h, v9.8h, v8.8h\n"
+ "zip2 v20.8h, v9.8h, v8.8h\n"
+ "str q17, [x21, #0x30]\n"
+ "zip1 v19.8h, v7.8h, v6.8h\n"
+ "zip2 v18.8h, v7.8h, v6.8h\n"
+ "str q16, [x21, #0x40]\n"
+ "zip1 v17.8h, v26.8h, v25.8h\n"
+ "zip2 v16.8h, v26.8h, v25.8h\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "zip1 v27.8h, v5.8h, v4.8h\n"
+ "zip2 v26.8h, v5.8h, v4.8h\n"
+ "str q20, [x21, #0x70]\n"
+ "zip1 v25.8h, v24.8h, v23.8h\n"
+ "zip2 v24.8h, v24.8h, v23.8h\n"
+ "str q19, [x21, #0x80]\n"
+ "zip1 v23.8h, v1.8h, v0.8h\n"
+ "zip2 v22.8h, v1.8h, v0.8h\n"
+ "str q18, [x21, #0x90]\n"
+ "zip1 v21.8h, v3.8h, v2.8h\n"
+ "zip2 v20.8h, v3.8h, v2.8h\n"
+ "str q17, [x21, #0xa0]\n"
+ "zip1 v19.8h, v31.8h, v30.8h\n"
+ "zip2 v18.8h, v31.8h, v30.8h\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip1 v17.8h, v29.8h, v28.8h\n"
+ "zip2 v16.8h, v29.8h, v28.8h\n"
+ "str q27, [x21, #0x0]\n"
+ "str q26, [x21, #0x10]\n"
+ "str q25, [x21, #0x20]\n"
+ "str q24, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q20, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x24, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "sub x24, x24, #0xc\n"
+ "cmp x24, #0xc\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v25.8h, v19.8h, v17.8h\n"
+ "zip1 v24.8h, v18.8h, v16.8h\n"
+ "ldr q21, [x25], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip2 v31.8h, v19.8h, v17.8h\n"
+ "zip2 v23.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v30.8h, v21.8h, v17.8h\n"
+ "zip1 v29.8h, v20.8h, v16.8h\n"
+ "ldr d19, [x9], #0x8\n"
+ "ldr d18, [x28], #0x8\n"
+ "zip2 v28.8h, v21.8h, v17.8h\n"
+ "zip2 v27.8h, v20.8h, v16.8h\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v26.8h, v19.8h, v17.8h\n"
+ "zip1 v22.8h, v18.8h, v16.8h\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "zip1 v19.8h, v25.8h, v24.8h\n"
+ "zip2 v18.8h, v25.8h, v24.8h\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "zip1 v25.8h, v21.8h, v17.8h\n"
+ "zip1 v24.8h, v20.8h, v16.8h\n"
+ "zip1 v17.8h, v31.8h, v23.8h\n"
+ "zip2 v16.8h, v31.8h, v23.8h\n"
+ "str q19, [x21, #0x0]\n"
+ "zip1 v23.8h, v26.8h, v22.8h\n"
+ "zip2 v22.8h, v26.8h, v22.8h\n"
+ "str q18, [x21, #0x10]\n"
+ "zip1 v21.8h, v30.8h, v29.8h\n"
+ "zip2 v20.8h, v30.8h, v29.8h\n"
+ "str q17, [x21, #0x20]\n"
+ "zip1 v19.8h, v28.8h, v27.8h\n"
+ "zip2 v18.8h, v28.8h, v27.8h\n"
+ "str q16, [x21, #0x30]\n"
+ "zip1 v17.8h, v25.8h, v24.8h\n"
+ "zip2 v16.8h, v25.8h, v24.8h\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q20, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x9], #0x8\n"
+ "ldr d18, [x28], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v17.8h, v19.8h, v17.8h\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "zip2 v19.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "zip1 v18.8h, v18.8h, v17.8h\n"
+ "zip1 v16.8h, v21.8h, v16.8h\n"
+ "str q20, [x21, #0x0]\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q19, [x21, #0x10]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, #0x20\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x9], #0x2\n"
+ "ldr h18, [x28], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h16, [x26], #0x2\n"
+ "zip1 v17.8h, v19.8h, v17.8h\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
+ "ldr h20, [x25], #0x2\n"
+ "ldr h19, [x23], #0x2\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "zip1 v17.8h, v20.8h, v17.8h\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "str d18, [x21, #0x0]\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str d16, [x21, #0x60]\n"
+ "add x21, x21, #0x8\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x8\n"
+ "add %x[out], %x[out], #0xc0\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "mov x20, %x[width]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "cmp x20, #0x18\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "sub x20, x20, #0x18\n"
+ "cmp x20, #0x18\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v31.8h, v19.8h, v17.8h\n"
+ "zip1 v30.8h, v18.8h, v16.8h\n"
+ "ldr q22, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v29.8h, v19.8h, v17.8h\n"
+ "zip2 v28.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v27.8h, v22.8h, v17.8h\n"
+ "zip1 v21.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v26.8h, v22.8h, v17.8h\n"
+ "zip2 v20.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v25.8h, v19.8h, v17.8h\n"
+ "zip1 v24.8h, v18.8h, v16.8h\n"
+ "zip2 v23.8h, v19.8h, v17.8h\n"
+ "zip2 v22.8h, v18.8h, v16.8h\n"
+ "zip1 v17.8h, v31.8h, v30.8h\n"
+ "zip2 v16.8h, v31.8h, v30.8h\n"
+ "str q17, [x21, #0x0]\n"
+ "zip1 v19.8h, v29.8h, v28.8h\n"
+ "zip2 v18.8h, v29.8h, v28.8h\n"
+ "str q16, [x21, #0x10]\n"
+ "zip1 v17.8h, v27.8h, v21.8h\n"
+ "zip2 v16.8h, v27.8h, v21.8h\n"
+ "str q19, [x21, #0x20]\n"
+ "str q18, [x21, #0x30]\n"
+ "zip1 v21.8h, v26.8h, v20.8h\n"
+ "zip2 v20.8h, v26.8h, v20.8h\n"
+ "str q17, [x21, #0x40]\n"
+ "zip1 v19.8h, v25.8h, v24.8h\n"
+ "zip2 v18.8h, v25.8h, v24.8h\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip1 v17.8h, v23.8h, v22.8h\n"
+ "zip2 v16.8h, v23.8h, v22.8h\n"
+ "str q21, [x21, #0x0]\n"
+ "str q20, [x21, #0x10]\n"
+ "str q19, [x21, #0x20]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "sub x20, x20, #0xc\n"
+ "cmp x20, #0xc\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v24.8h, v21.8h, v17.8h\n"
+ "zip1 v23.8h, v18.8h, v16.8h\n"
+ "ldr d20, [x9], #0x8\n"
+ "ldr d19, [x28], #0x8\n"
+ "zip2 v22.8h, v21.8h, v17.8h\n"
+ "zip2 v18.8h, v18.8h, v16.8h\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v21.8h, v20.8h, v17.8h\n"
+ "zip1 v20.8h, v19.8h, v16.8h\n"
+ "zip1 v17.8h, v24.8h, v23.8h\n"
+ "zip2 v16.8h, v24.8h, v23.8h\n"
+ "str q17, [x21, #0x0]\n"
+ "zip1 v19.8h, v22.8h, v18.8h\n"
+ "zip2 v18.8h, v22.8h, v18.8h\n"
+ "str q16, [x21, #0x10]\n"
+ "zip1 v17.8h, v21.8h, v20.8h\n"
+ "zip2 v16.8h, v21.8h, v20.8h\n"
+ "str q19, [x21, #0x20]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d18, [x9], #0x8\n"
+ "ldr d19, [x28], #0x8\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v18.8h, v18.8h, v17.8h\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q17, [x21, #0x0]\n"
+ "str q16, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h19, [x9], #0x2\n"
+ "ldr h18, [x28], #0x2\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h16, [x26], #0x2\n"
+ "zip1 v17.8h, v19.8h, v17.8h\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x60\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<12, 4, true, VLType::None>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_2x4(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
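
A note on the tail handling above: when height is not a multiple of 4, the csel instructions redirect out-of-range row pointers to the zero-filled pad_row, so the kernel can always read four rows; out_stride is rounded up to match (height = 10 rounds to 12, giving 12 * 12 * sizeof(uint16_t) = 288 bytes per 12-column panel). A scalar analogue of that selection, illustrative only (roundup mirrors arm_gemm's helper; row_or_pad is my own name):

    #include <cstddef>
    #include <cstdint>

    // Smallest multiple of 'mod' that is >= 'val', as used for out_stride.
    template <typename T>
    static T roundup(T val, T mod) { return ((val + mod - 1) / mod) * mod; }

    // Equivalent of the csel sequence: pick the i-th source row, or the
    // zeroed scratch row once i runs past the true height.
    static const uint16_t *row_or_pad(const uint16_t *in, size_t in_stride_bytes,
                                      size_t i, size_t height, const uint16_t *pad_row)
    {
        const auto *base = reinterpret_cast<const uint8_t *>(in);
        return i < height ? reinterpret_cast<const uint16_t *>(base + i * in_stride_bytes)
                          : pad_row; // zero-filled when height % 4 != 0
    }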
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..2a3208d18d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_12_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 12 * roundup<size_t>(height, 4) * sizeof(bfloat16);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q15, [x9], #0x10\n"
+ "ldr q17, [x28], #0x10\n"
+ "sub x24, x24, #0x18\n"
+ "cmp x24, #0x18\n"
+ "ldr q16, [x27], #0x10\n"
+ "ldr q20, [x26], #0x10\n"
+ "zip1 v6.4s, v15.4s, v16.4s\n"
+ "zip1 v11.4s, v17.4s, v20.4s\n"
+ "ldr q2, [x25], #0x10\n"
+ "ldr q4, [x23], #0x10\n"
+ "zip2 v22.4s, v15.4s, v16.4s\n"
+ "zip2 v18.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q26, [x20], #0x10\n"
+ "zip1 v9.4s, v2.4s, v17.4s\n"
+ "zip1 v10.4s, v4.4s, v26.4s\n"
+ "ldr q16, [x9], #0x10\n"
+ "ldr q27, [x28], #0x10\n"
+ "zip2 v3.4s, v2.4s, v17.4s\n"
+ "zip2 v30.4s, v4.4s, v26.4s\n"
+ "ldr q13, [x27], #0x10\n"
+ "ldr q1, [x26], #0x10\n"
+ "zip1 v23.4s, v16.4s, v13.4s\n"
+ "zip1 v5.4s, v27.4s, v1.4s\n"
+ "ldr q26, [x25], #0x10\n"
+ "ldr q14, [x23], #0x10\n"
+ "zip2 v0.4s, v16.4s, v13.4s\n"
+ "zip2 v2.4s, v27.4s, v1.4s\n"
+ "ldr q15, [x22], #0x10\n"
+ "ldr q8, [x20], #0x10\n"
+ "zip1 v31.4s, v26.4s, v15.4s\n"
+ "zip1 v4.4s, v14.4s, v8.4s\n"
+ "ldr q28, [x9], #0x10\n"
+ "ldr q19, [x28], #0x10\n"
+ "zip2 v21.4s, v26.4s, v15.4s\n"
+ "zip2 v16.4s, v14.4s, v8.4s\n"
+ "ldr q15, [x27], #0x10\n"
+ "ldr q1, [x26], #0x10\n"
+ "zip1 v17.4s, v28.4s, v15.4s\n"
+ "zip1 v8.4s, v19.4s, v1.4s\n"
+ "ldr q27, [x25], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip2 v7.4s, v28.4s, v15.4s\n"
+ "zip2 v15.4s, v19.4s, v1.4s\n"
+ "ldr q12, [x22], #0x10\n"
+ "ldr q25, [x20], #0x10\n"
+ "zip1 v14.4s, v27.4s, v12.4s\n"
+ "zip1 v26.4s, v20.4s, v25.4s\n"
+ "ldr q13, [x9], #0x10\n"
+ "ldr q29, [x28], #0x10\n"
+ "zip2 v28.4s, v27.4s, v12.4s\n"
+ "zip2 v12.4s, v20.4s, v25.4s\n"
+ "ldr q27, [x27], #0x10\n"
+ "ldr q20, [x26], #0x10\n"
+ "zip1 v19.4s, v13.4s, v27.4s\n"
+ "zip1 v25.4s, v29.4s, v20.4s\n"
+ "ldr q24, [x25], #0x10\n"
+ "ldr q1, [x23], #0x10\n"
+ "zip2 v27.4s, v13.4s, v27.4s\n"
+ "zip2 v13.4s, v29.4s, v20.4s\n"
+ "ldr q20, [x22], #0x10\n"
+ "zip1 v29.4s, v24.4s, v20.4s\n"
+ "zip2 v20.4s, v24.4s, v20.4s\n"
+ "zip1 v24.4s, v6.4s, v11.4s\n"
+ ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n"
+ "zip2 v11.4s, v6.4s, v11.4s\n"
+ "ldr q6, [x20], #0x10\n"
+ ".inst 0x4ea16978 // bfcvtn2 v24.8h, v11.4s\n"
+ "zip1 v11.4s, v1.4s, v6.4s\n"
+ "zip2 v6.4s, v1.4s, v6.4s\n"
+ "zip1 v1.4s, v22.4s, v18.4s\n"
+ ".inst 0x0ea16821 // bfcvtn v1.4h, v1.4s\n"
+ "zip2 v18.4s, v22.4s, v18.4s\n"
+ "ldr q22, [x9], #0x10\n"
+ ".inst 0x4ea16a41 // bfcvtn2 v1.8h, v18.4s\n"
+ "zip1 v18.4s, v23.4s, v5.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ "zip2 v5.4s, v23.4s, v5.4s\n"
+ "ldr q23, [x28], #0x10\n"
+ ".inst 0x4ea168b2 // bfcvtn2 v18.8h, v5.4s\n"
+ "zip1 v5.4s, v0.4s, v2.4s\n"
+ ".inst 0x0ea168a5 // bfcvtn v5.4h, v5.4s\n"
+ "zip2 v0.4s, v0.4s, v2.4s\n"
+ "ldr q2, [x27], #0x10\n"
+ ".inst 0x4ea16805 // bfcvtn2 v5.8h, v0.4s\n"
+ "zip1 v0.4s, v22.4s, v2.4s\n"
+ "zip2 v2.4s, v22.4s, v2.4s\n"
+ "zip1 v22.4s, v17.4s, v8.4s\n"
+ ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
+ "zip2 v8.4s, v17.4s, v8.4s\n"
+ "ldr q17, [x26], #0x10\n"
+ ".inst 0x4ea16916 // bfcvtn2 v22.8h, v8.4s\n"
+ "zip1 v8.4s, v23.4s, v17.4s\n"
+ "zip2 v23.4s, v23.4s, v17.4s\n"
+ "zip1 v17.4s, v7.4s, v15.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "zip2 v7.4s, v7.4s, v15.4s\n"
+ "ldr q15, [x25], #0x10\n"
+ ".inst 0x4ea168f1 // bfcvtn2 v17.8h, v7.4s\n"
+ "zip1 v7.4s, v9.4s, v10.4s\n"
+ ".inst 0x0ea168e7 // bfcvtn v7.4h, v7.4s\n"
+ "zip2 v10.4s, v9.4s, v10.4s\n"
+ "ldr q9, [x23], #0x10\n"
+ ".inst 0x4ea16947 // bfcvtn2 v7.8h, v10.4s\n"
+ "zip1 v10.4s, v3.4s, v30.4s\n"
+ ".inst 0x0ea1694a // bfcvtn v10.4h, v10.4s\n"
+ "zip2 v30.4s, v3.4s, v30.4s\n"
+ "ldr q3, [x22], #0x10\n"
+ ".inst 0x4ea16bca // bfcvtn2 v10.8h, v30.4s\n"
+ "zip1 v30.4s, v15.4s, v3.4s\n"
+ "zip2 v15.4s, v15.4s, v3.4s\n"
+ "zip1 v3.4s, v31.4s, v4.4s\n"
+ ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
+ "zip2 v31.4s, v31.4s, v4.4s\n"
+ "ldr q4, [x20], #0x10\n"
+ ".inst 0x4ea16be3 // bfcvtn2 v3.8h, v31.4s\n"
+ "zip1 v31.4s, v9.4s, v4.4s\n"
+ "zip2 v4.4s, v9.4s, v4.4s\n"
+ "zip1 v9.4s, v21.4s, v16.4s\n"
+ ".inst 0x0ea16929 // bfcvtn v9.4h, v9.4s\n"
+ "zip2 v16.4s, v21.4s, v16.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ ".inst 0x4ea16a09 // bfcvtn2 v9.8h, v16.4s\n"
+ "zip1 v16.4s, v14.4s, v26.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v14.4s, v14.4s, v26.4s\n"
+ "ldr q26, [x28], #0x10\n"
+ ".inst 0x4ea169d0 // bfcvtn2 v16.8h, v14.4s\n"
+ "zip1 v14.4s, v28.4s, v12.4s\n"
+ ".inst 0x0ea169ce // bfcvtn v14.4h, v14.4s\n"
+ "zip2 v12.4s, v28.4s, v12.4s\n"
+ "ldr q28, [x27], #0x10\n"
+ ".inst 0x4ea1698e // bfcvtn2 v14.8h, v12.4s\n"
+ "zip1 v12.4s, v21.4s, v28.4s\n"
+ "zip2 v28.4s, v21.4s, v28.4s\n"
+ "zip1 v21.4s, v19.4s, v25.4s\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ "zip2 v19.4s, v19.4s, v25.4s\n"
+ "ldr q25, [x26], #0x10\n"
+ ".inst 0x4ea16a75 // bfcvtn2 v21.8h, v19.4s\n"
+ "zip1 v19.4s, v26.4s, v25.4s\n"
+ "zip2 v25.4s, v26.4s, v25.4s\n"
+ "zip1 v26.4s, v27.4s, v13.4s\n"
+ ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n"
+ "zip2 v13.4s, v27.4s, v13.4s\n"
+ "ldr q27, [x25], #0x10\n"
+ ".inst 0x4ea169ba // bfcvtn2 v26.8h, v13.4s\n"
+ "zip1 v13.4s, v0.4s, v8.4s\n"
+ ".inst 0x0ea169ad // bfcvtn v13.4h, v13.4s\n"
+ "zip2 v8.4s, v0.4s, v8.4s\n"
+ "ldr q0, [x23], #0x10\n"
+ ".inst 0x4ea1690d // bfcvtn2 v13.8h, v8.4s\n"
+ "zip1 v8.4s, v2.4s, v23.4s\n"
+ ".inst 0x0ea16908 // bfcvtn v8.4h, v8.4s\n"
+ "zip2 v23.4s, v2.4s, v23.4s\n"
+ "ldr q2, [x22], #0x10\n"
+ ".inst 0x4ea16ae8 // bfcvtn2 v8.8h, v23.4s\n"
+ "ldr q23, [x20], #0x10\n"
+ "str q24, [x21, #0x0]\n"
+ "zip1 v24.4s, v27.4s, v2.4s\n"
+ "zip2 v27.4s, v27.4s, v2.4s\n"
+ "zip1 v2.4s, v0.4s, v23.4s\n"
+ "zip2 v23.4s, v0.4s, v23.4s\n"
+ "str q1, [x21, #0x10]\n"
+ "zip1 v0.4s, v12.4s, v19.4s\n"
+ "zip1 v1.4s, v28.4s, v25.4s\n"
+ "str q18, [x21, #0x20]\n"
+ "zip1 v18.4s, v29.4s, v11.4s\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "str q5, [x21, #0x30]\n"
+ "zip1 v5.4s, v20.4s, v6.4s\n"
+ "zip2 v19.4s, v12.4s, v19.4s\n"
+ "str q22, [x21, #0x40]\n"
+ "zip1 v12.4s, v30.4s, v31.4s\n"
+ "zip1 v22.4s, v15.4s, v4.4s\n"
+ "str q17, [x21, #0x50]\n"
+ "zip1 v17.4s, v24.4s, v2.4s\n"
+ ".inst 0x0ea16821 // bfcvtn v1.4h, v1.4s\n"
+ "str q7, [x21, #0x60]\n"
+ "zip1 v7.4s, v27.4s, v23.4s\n"
+ "zip2 v25.4s, v28.4s, v25.4s\n"
+ "str q10, [x21, #0x70]\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ "zip2 v29.4s, v29.4s, v11.4s\n"
+ "str q3, [x21, #0x80]\n"
+ ".inst 0x0ea168ab // bfcvtn v11.4h, v5.4s\n"
+ "zip2 v10.4s, v20.4s, v6.4s\n"
+ "str q9, [x21, #0x90]\n"
+ ".inst 0x0ea16986 // bfcvtn v6.4h, v12.4s\n"
+ "zip2 v12.4s, v30.4s, v31.4s\n"
+ "str q16, [x21, #0xa0]\n"
+ ".inst 0x0ea16ac5 // bfcvtn v5.4h, v22.4s\n"
+ "zip2 v4.4s, v15.4s, v4.4s\n"
+ "str q14, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ ".inst 0x0ea16a2f // bfcvtn v15.4h, v17.4s\n"
+ "zip2 v20.4s, v24.4s, v2.4s\n"
+ "str q21, [x21, #0x0]\n"
+ ".inst 0x0ea168fc // bfcvtn v28.4h, v7.4s\n"
+ "zip2 v30.4s, v27.4s, v23.4s\n"
+ "str q26, [x21, #0x10]\n"
+ ".inst 0x4ea16a60 // bfcvtn2 v0.8h, v19.4s\n"
+ ".inst 0x4ea16b21 // bfcvtn2 v1.8h, v25.4s\n"
+ "str q13, [x21, #0x20]\n"
+ ".inst 0x4ea16bb2 // bfcvtn2 v18.8h, v29.4s\n"
+ ".inst 0x4ea1694b // bfcvtn2 v11.8h, v10.4s\n"
+ "str q8, [x21, #0x30]\n"
+ ".inst 0x4ea16986 // bfcvtn2 v6.8h, v12.4s\n"
+ ".inst 0x4ea16885 // bfcvtn2 v5.8h, v4.4s\n"
+ "str q0, [x21, #0x40]\n"
+ ".inst 0x4ea16a8f // bfcvtn2 v15.8h, v20.4s\n"
+ ".inst 0x4ea16bdc // bfcvtn2 v28.8h, v30.4s\n"
+ "str q1, [x21, #0x50]\n"
+ "str q18, [x21, #0x60]\n"
+ "str q11, [x21, #0x70]\n"
+ "str q6, [x21, #0x80]\n"
+ "str q5, [x21, #0x90]\n"
+ "str q15, [x21, #0xa0]\n"
+ "str q28, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x24, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q20, [x9], #0x10\n"
+ "ldr q9, [x28], #0x10\n"
+ "sub x24, x24, #0xc\n"
+ "cmp x24, #0xc\n"
+ "ldr q8, [x27], #0x10\n"
+ "ldr q1, [x26], #0x10\n"
+ "zip1 v7.4s, v20.4s, v8.4s\n"
+ "zip1 v19.4s, v9.4s, v1.4s\n"
+ "ldr q6, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip2 v5.4s, v20.4s, v8.4s\n"
+ "zip2 v18.4s, v9.4s, v1.4s\n"
+ "ldr q27, [x22], #0x10\n"
+ "ldr q14, [x20], #0x10\n"
+ "zip1 v26.4s, v6.4s, v27.4s\n"
+ "zip1 v15.4s, v16.4s, v14.4s\n"
+ "ldr q1, [x9], #0x10\n"
+ "ldr q30, [x28], #0x10\n"
+ "zip2 v24.4s, v6.4s, v27.4s\n"
+ "zip2 v25.4s, v16.4s, v14.4s\n"
+ "ldr q13, [x27], #0x10\n"
+ "ldr q17, [x26], #0x10\n"
+ "zip1 v10.4s, v1.4s, v13.4s\n"
+ "zip1 v16.4s, v30.4s, v17.4s\n"
+ "ldr q4, [x25], #0x10\n"
+ "ldr q11, [x23], #0x10\n"
+ "zip2 v0.4s, v1.4s, v13.4s\n"
+ "zip2 v27.4s, v30.4s, v17.4s\n"
+ "ldr q28, [x22], #0x10\n"
+ "ldr q12, [x20], #0x10\n"
+ "zip1 v22.4s, v4.4s, v28.4s\n"
+ "zip1 v13.4s, v11.4s, v12.4s\n"
+ "ldr q31, [x9], #0x10\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v14.4s, v4.4s, v28.4s\n"
+ "zip2 v12.4s, v11.4s, v12.4s\n"
+ "ldr q2, [x27], #0x10\n"
+ "ldr q3, [x26], #0x10\n"
+ "zip1 v8.4s, v31.4s, v2.4s\n"
+ "zip1 v4.4s, v17.4s, v3.4s\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q1, [x23], #0x10\n"
+ "zip2 v28.4s, v31.4s, v2.4s\n"
+ "zip2 v29.4s, v17.4s, v3.4s\n"
+ "ldr q11, [x22], #0x10\n"
+ "ldr q17, [x20], #0x10\n"
+ "zip1 v9.4s, v23.4s, v11.4s\n"
+ "zip1 v21.4s, v1.4s, v17.4s\n"
+ "zip2 v11.4s, v23.4s, v11.4s\n"
+ "zip2 v17.4s, v1.4s, v17.4s\n"
+ "zip1 v2.4s, v7.4s, v19.4s\n"
+ "zip1 v31.4s, v5.4s, v18.4s\n"
+ "zip1 v3.4s, v10.4s, v16.4s\n"
+ "zip1 v6.4s, v0.4s, v27.4s\n"
+ "zip1 v1.4s, v8.4s, v4.4s\n"
+ "zip1 v30.4s, v28.4s, v29.4s\n"
+ "zip1 v20.4s, v26.4s, v15.4s\n"
+ "zip1 v23.4s, v24.4s, v25.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "zip2 v7.4s, v7.4s, v19.4s\n"
+ "zip1 v19.4s, v22.4s, v13.4s\n"
+ ".inst 0x0ea16bff // bfcvtn v31.4h, v31.4s\n"
+ "zip2 v18.4s, v5.4s, v18.4s\n"
+ "zip1 v5.4s, v14.4s, v12.4s\n"
+ ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
+ "zip2 v16.4s, v10.4s, v16.4s\n"
+ "zip1 v10.4s, v9.4s, v21.4s\n"
+ ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n"
+ "zip2 v0.4s, v0.4s, v27.4s\n"
+ "zip1 v27.4s, v11.4s, v17.4s\n"
+ ".inst 0x0ea16821 // bfcvtn v1.4h, v1.4s\n"
+ "zip2 v4.4s, v8.4s, v4.4s\n"
+ ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
+ "zip2 v29.4s, v28.4s, v29.4s\n"
+ ".inst 0x0ea16a9c // bfcvtn v28.4h, v20.4s\n"
+ "zip2 v15.4s, v26.4s, v15.4s\n"
+ ".inst 0x0ea16ae8 // bfcvtn v8.4h, v23.4s\n"
+ "zip2 v26.4s, v24.4s, v25.4s\n"
+ ".inst 0x0ea16a79 // bfcvtn v25.4h, v19.4s\n"
+ "zip2 v24.4s, v22.4s, v13.4s\n"
+ ".inst 0x0ea168b7 // bfcvtn v23.4h, v5.4s\n"
+ "zip2 v22.4s, v14.4s, v12.4s\n"
+ ".inst 0x0ea16945 // bfcvtn v5.4h, v10.4s\n"
+ "zip2 v20.4s, v9.4s, v21.4s\n"
+ ".inst 0x0ea16b73 // bfcvtn v19.4h, v27.4s\n"
+ "zip2 v17.4s, v11.4s, v17.4s\n"
+ ".inst 0x4ea168e2 // bfcvtn2 v2.8h, v7.4s\n"
+ ".inst 0x4ea16a5f // bfcvtn2 v31.8h, v18.4s\n"
+ "str q2, [x21, #0x0]\n"
+ ".inst 0x4ea16a03 // bfcvtn2 v3.8h, v16.4s\n"
+ ".inst 0x4ea16806 // bfcvtn2 v6.8h, v0.4s\n"
+ "str q31, [x21, #0x10]\n"
+ ".inst 0x4ea16881 // bfcvtn2 v1.8h, v4.4s\n"
+ ".inst 0x4ea16bbe // bfcvtn2 v30.8h, v29.4s\n"
+ "str q3, [x21, #0x20]\n"
+ ".inst 0x4ea169fc // bfcvtn2 v28.8h, v15.4s\n"
+ ".inst 0x4ea16b48 // bfcvtn2 v8.8h, v26.4s\n"
+ "str q6, [x21, #0x30]\n"
+ ".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ "str q1, [x21, #0x40]\n"
+ ".inst 0x4ea16a85 // bfcvtn2 v5.8h, v20.4s\n"
+ ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n"
+ "str q30, [x21, #0x50]\n"
+ "str q28, [x21, #0x60]\n"
+ "str q8, [x21, #0x70]\n"
+ "str q25, [x21, #0x80]\n"
+ "str q23, [x21, #0x90]\n"
+ "str q5, [x21, #0xa0]\n"
+ "str q19, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr q23, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v23.4s, v17.4s\n"
+ "zip1 v21.4s, v20.4s, v16.4s\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v28.4s, v23.4s, v17.4s\n"
+ "zip2 v20.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v27.4s, v19.4s, v17.4s\n"
+ "zip1 v26.4s, v18.4s, v16.4s\n"
+ "zip2 v25.4s, v19.4s, v17.4s\n"
+ "zip2 v24.4s, v18.4s, v16.4s\n"
+ "zip1 v19.4s, v22.4s, v21.4s\n"
+ "zip1 v18.4s, v28.4s, v20.4s\n"
+ "zip1 v17.4s, v27.4s, v26.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
+ ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
+ "zip2 v22.4s, v22.4s, v21.4s\n"
+ ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
+ "zip2 v20.4s, v28.4s, v20.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v27.4s, v26.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q23, [x21, #0x0]\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q21, [x21, #0x10]\n"
+ "str q19, [x21, #0x60]\n"
+ "str q17, [x21, #0x70]\n"
+ "add x21, x21, #0x20\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.4s, v20.4s, v17.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d18, [x21, #0x0]\n"
+ "str d16, [x21, #0x60]\n"
+ "add x21, x21, #0x8\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x8\n"
+ "add %x[out], %x[out], #0xc0\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "mov x20, %x[width]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "cmp x20, #0x18\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q22, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "sub x20, x20, #0x18\n"
+ "cmp x20, #0x18\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v19.4s, v22.4s, v17.4s\n"
+ "zip1 v21.4s, v18.4s, v16.4s\n"
+ "ldr q24, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v10.4s, v22.4s, v17.4s\n"
+ "zip2 v2.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v24.4s, v17.4s\n"
+ "zip1 v4.4s, v20.4s, v16.4s\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v29.4s, v24.4s, v17.4s\n"
+ "zip2 v1.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v30.4s, v23.4s, v17.4s\n"
+ "zip1 v31.4s, v18.4s, v16.4s\n"
+ "ldr q24, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v23.4s, v23.4s, v17.4s\n"
+ "zip2 v28.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v25.4s, v24.4s, v17.4s\n"
+ "zip1 v26.4s, v20.4s, v16.4s\n"
+ "ldr q14, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v24.4s, v24.4s, v17.4s\n"
+ "zip2 v15.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v12.4s, v14.4s, v17.4s\n"
+ "zip1 v13.4s, v18.4s, v16.4s\n"
+ "ldr q7, [x9], #0x10\n"
+ "ldr q3, [x28], #0x10\n"
+ "zip2 v0.4s, v14.4s, v17.4s\n"
+ "zip2 v9.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v14.4s, v7.4s, v17.4s\n"
+ "zip1 v8.4s, v3.4s, v16.4s\n"
+ "zip2 v7.4s, v7.4s, v17.4s\n"
+ "zip2 v11.4s, v3.4s, v16.4s\n"
+ "zip1 v18.4s, v19.4s, v21.4s\n"
+ "zip1 v6.4s, v10.4s, v2.4s\n"
+ "zip1 v5.4s, v22.4s, v4.4s\n"
+ "zip1 v16.4s, v29.4s, v1.4s\n"
+ "zip1 v27.4s, v30.4s, v31.4s\n"
+ "zip1 v3.4s, v23.4s, v28.4s\n"
+ "zip1 v17.4s, v25.4s, v26.4s\n"
+ "zip1 v20.4s, v24.4s, v15.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ "zip2 v19.4s, v19.4s, v21.4s\n"
+ "zip1 v21.4s, v12.4s, v13.4s\n"
+ ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n"
+ "zip2 v10.4s, v10.4s, v2.4s\n"
+ "zip1 v2.4s, v0.4s, v9.4s\n"
+ ".inst 0x0ea168a5 // bfcvtn v5.4h, v5.4s\n"
+ "zip2 v4.4s, v22.4s, v4.4s\n"
+ "zip1 v22.4s, v14.4s, v8.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v1.4s, v29.4s, v1.4s\n"
+ "zip1 v29.4s, v7.4s, v11.4s\n"
+ ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n"
+ "zip2 v30.4s, v30.4s, v31.4s\n"
+ ".inst 0x0ea1687f // bfcvtn v31.4h, v3.4s\n"
+ "zip2 v23.4s, v23.4s, v28.4s\n"
+ ".inst 0x0ea16a23 // bfcvtn v3.4h, v17.4s\n"
+ "zip2 v28.4s, v25.4s, v26.4s\n"
+ ".inst 0x0ea16a9a // bfcvtn v26.4h, v20.4s\n"
+ "zip2 v25.4s, v24.4s, v15.4s\n"
+ ".inst 0x0ea16ab8 // bfcvtn v24.4h, v21.4s\n"
+ "zip2 v12.4s, v12.4s, v13.4s\n"
+ ".inst 0x0ea16855 // bfcvtn v21.4h, v2.4s\n"
+ "zip2 v13.4s, v0.4s, v9.4s\n"
+ ".inst 0x0ea16ac2 // bfcvtn v2.4h, v22.4s\n"
+ "zip2 v0.4s, v14.4s, v8.4s\n"
+ ".inst 0x0ea16ba9 // bfcvtn v9.4h, v29.4s\n"
+ "zip2 v17.4s, v7.4s, v11.4s\n"
+ ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
+ ".inst 0x4ea16946 // bfcvtn2 v6.8h, v10.4s\n"
+ "str q18, [x21, #0x0]\n"
+ ".inst 0x4ea16885 // bfcvtn2 v5.8h, v4.4s\n"
+ ".inst 0x4ea16830 // bfcvtn2 v16.8h, v1.4s\n"
+ "str q6, [x21, #0x10]\n"
+ ".inst 0x4ea16bdb // bfcvtn2 v27.8h, v30.4s\n"
+ ".inst 0x4ea16aff // bfcvtn2 v31.8h, v23.4s\n"
+ "str q5, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ ".inst 0x4ea16b83 // bfcvtn2 v3.8h, v28.4s\n"
+ ".inst 0x4ea16b3a // bfcvtn2 v26.8h, v25.4s\n"
+ "str q27, [x21, #0x40]\n"
+ ".inst 0x4ea16998 // bfcvtn2 v24.8h, v12.4s\n"
+ ".inst 0x4ea169b5 // bfcvtn2 v21.8h, v13.4s\n"
+ "str q31, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ ".inst 0x4ea16802 // bfcvtn2 v2.8h, v0.4s\n"
+ ".inst 0x4ea16a29 // bfcvtn2 v9.8h, v17.4s\n"
+ "str q3, [x21, #0x0]\n"
+ "str q26, [x21, #0x10]\n"
+ "str q24, [x21, #0x20]\n"
+ "str q21, [x21, #0x30]\n"
+ "str q2, [x21, #0x40]\n"
+ "str q9, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "sub x20, x20, #0xc\n"
+ "cmp x20, #0xc\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v26.4s, v19.4s, v17.4s\n"
+ "zip1 v25.4s, v18.4s, v16.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v24.4s, v19.4s, v17.4s\n"
+ "zip2 v23.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v2.4s, v21.4s, v17.4s\n"
+ "zip1 v22.4s, v20.4s, v16.4s\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v1.4s, v21.4s, v17.4s\n"
+ "zip2 v0.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v31.4s, v19.4s, v17.4s\n"
+ "zip1 v30.4s, v18.4s, v16.4s\n"
+ "zip2 v29.4s, v19.4s, v17.4s\n"
+ "zip2 v28.4s, v18.4s, v16.4s\n"
+ "zip1 v21.4s, v26.4s, v25.4s\n"
+ "zip1 v20.4s, v24.4s, v23.4s\n"
+ "zip1 v19.4s, v2.4s, v22.4s\n"
+ "zip1 v18.4s, v1.4s, v0.4s\n"
+ "zip1 v17.4s, v31.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v28.4s\n"
+ ".inst 0x0ea16abb // bfcvtn v27.4h, v21.4s\n"
+ "zip2 v26.4s, v26.4s, v25.4s\n"
+ ".inst 0x0ea16a99 // bfcvtn v25.4h, v20.4s\n"
+ "zip2 v24.4s, v24.4s, v23.4s\n"
+ ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
+ "zip2 v22.4s, v2.4s, v22.4s\n"
+ ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
+ "zip2 v20.4s, v1.4s, v0.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v31.4s, v30.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v29.4s, v28.4s\n"
+ ".inst 0x4ea16b5b // bfcvtn2 v27.8h, v26.4s\n"
+ ".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n"
+ "str q27, [x21, #0x0]\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q25, [x21, #0x10]\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q23, [x21, #0x20]\n"
+ "str q21, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q17, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr q20, [x9], #0x10\n"
+ "ldr q19, [x28], #0x10\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v20.4s, v17.4s\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
+ "zip2 v21.4s, v20.4s, v17.4s\n"
+ "zip2 v20.4s, v19.4s, v16.4s\n"
+ "zip1 v17.4s, v22.4s, v18.4s\n"
+ "zip1 v16.4s, v21.4s, v20.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v22.4s, v18.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v21.4s, v20.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q19, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x60\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+template<>
+void Transform<12, 4, true, VLType::None>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
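
The raw ".inst 0x0ea168xx / 0x4ea168xx" words in this hunk are hand-encoded BFCVTN/BFCVTN2 instructions (FP32-to-BF16 narrowing), emitted as literal opcodes so the file still assembles on toolchains whose assembler lacks the BF16 mnemonics; the decoded form is kept in each trailing comment. An illustrative intrinsic sketch of the same low/high pairing, not part of the change, assuming a compiler that exposes the ACLE BF16 intrinsics:

    #include <arm_neon.h>

    // Narrow eight floats to eight bfloat16 lanes the way the asm does:
    // BFCVTN fills the low four lanes, BFCVTN2 fills the high four.
    static bfloat16x8_t narrow_f32x8_to_bf16x8(float32x4_t lo, float32x4_t hi)
    {
        bfloat16x8_t out = vcvtq_low_bf16_f32(lo); // bfcvtn  out.4h, lo.4s
        return vcvtq_high_bf16_f32(out, hi);       // bfcvtn2 out.8h, hi.4s
    }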
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
new file mode 100644
index 0000000000..4d9d5e7f43
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_12_s8s16(int16_t *out, const int8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 12 * height * sizeof(int16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q3, [x25], #0x10\n"
+ "ldr q21, [x23], #0x10\n"
+ "sshll2 v20.8h, v3.16b, #0x0\n"
+ "sshll v2.8h, v21.8b, #0x0\n"
+ "ldr q1, [x22], #0x10\n"
+ "ldr q19, [x20], #0x10\n"
+ "sshll2 v18.8h, v1.16b, #0x0\n"
+ "sshll v0.8h, v19.8b, #0x0\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "sshll v31.8h, v17.8b, #0x0\n"
+ "sshll v30.8h, v16.8b, #0x0\n"
+ "ldr d29, [x23], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "sshll2 v27.8h, v21.16b, #0x0\n"
+ "sshll2 v26.8h, v19.16b, #0x0\n"
+ "dup v25.2d, v20.d[0]\n"
+ "dup v24.2d, v2.d[1]\n"
+ "sub x24, x24, #0x18\n"
+ "cmp x24, #0x18\n"
+ "dup v23.2d, v18.d[0]\n"
+ "dup v22.2d, v0.d[1]\n"
+ "dup v21.2d, v20.d[1]\n"
+ "dup v20.2d, v31.d[1]\n"
+ "dup v19.2d, v18.d[1]\n"
+ "dup v18.2d, v30.d[1]\n"
+ "sshll v17.8h, v3.8b, #0x0\n"
+ "sshll v16.8h, v1.8b, #0x0\n"
+ "str q17, [x21, #0x0]\n"
+ "mov v25.d[1], v2.d[0]\n"
+ "mov v24.d[1], v27.d[0]\n"
+ "str q25, [x21, #0x10]\n"
+ "mov v23.d[1], v0.d[0]\n"
+ "mov v22.d[1], v26.d[0]\n"
+ "str q24, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "sshll v17.8h, v29.8b, #0x0\n"
+ "sshll v16.8h, v28.8b, #0x0\n"
+ "str q23, [x21, #0x40]\n"
+ "mov v21.d[1], v31.d[0]\n"
+ "mov v20.d[1], v27.d[1]\n"
+ "str q22, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "mov v19.d[1], v30.d[0]\n"
+ "mov v18.d[1], v26.d[1]\n"
+ "str q21, [x21, #0x0]\n"
+ "str q20, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q18, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x24, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr d19, [x23], #0x8\n"
+ "ldr d18, [x20], #0x8\n"
+ "sub x24, x24, #0xc\n"
+ "cmp x24, #0xc\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n"
+ "ld1 { v18.s }[2], [x20], #0x4\n"
+ "sshll v25.8h, v19.8b, #0x0\n"
+ "sshll v24.8h, v18.8b, #0x0\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "sshll2 v23.8h, v19.16b, #0x0\n"
+ "sshll2 v22.8h, v18.16b, #0x0\n"
+ "ld1 { v17.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x22], #0x4\n"
+ "sshll2 v21.8h, v17.16b, #0x0\n"
+ "sshll2 v20.8h, v16.16b, #0x0\n"
+ "dup v19.2d, v25.d[1]\n"
+ "dup v18.2d, v24.d[1]\n"
+ "sshll v17.8h, v17.8b, #0x0\n"
+ "sshll v16.8h, v16.8b, #0x0\n"
+ "str q17, [x21, #0x0]\n"
+ "mov v21.d[1], v25.d[0]\n"
+ "mov v19.d[1], v23.d[0]\n"
+ "str q21, [x21, #0x10]\n"
+ "mov v20.d[1], v24.d[0]\n"
+ "mov v18.d[1], v22.d[0]\n"
+ "str q19, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "str q20, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr s19, [x25], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "sshll v19.8h, v19.8b, #0x0\n"
+ "sshll v18.8h, v18.8b, #0x0\n"
+ "sshll v17.8h, v17.8b, #0x0\n"
+ "sshll v16.8h, v16.8b, #0x0\n"
+ "str d19, [x21, #0x0]\n"
+ "str d18, [x21, #0x18]\n"
+ "str d17, [x21, #0x30]\n"
+ "str d16, [x21, #0x48]\n"
+ "add x21, x21, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr b19, [x25], #0x1\n"
+ "ldr b18, [x23], #0x1\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr b17, [x22], #0x1\n"
+ "ldr b16, [x20], #0x1\n"
+ "sshll v19.8h, v19.8b, #0x0\n"
+ "sshll v18.8h, v18.8b, #0x0\n"
+ "sshll v17.8h, v17.8b, #0x0\n"
+ "sshll v16.8h, v16.8b, #0x0\n"
+ "str h19, [x21, #0x0]\n"
+ "str h18, [x21, #0x18]\n"
+ "str h17, [x21, #0x30]\n"
+ "str h16, [x21, #0x48]\n"
+ "add x21, x21, #0x2\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x4\n"
+ "add %x[out], %x[out], #0x60\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x20, %x[width]\n"
+ "mov x25, %x[in]\n"
+ "cmp x20, #0x18\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q20, [x25], #0x10\n"
+ "ldr d16, [x25], #0x8\n"
+ "sshll2 v19.8h, v20.16b, #0x0\n"
+ "sshll v18.8h, v16.8b, #0x0\n"
+ "dup v17.2d, v19.d[1]\n"
+ "sub x20, x20, #0x18\n"
+ "sshll v16.8h, v20.8b, #0x0\n"
+ "str q16, [x21, #0x0]\n"
+ "dup v16.2d, v19.d[0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "cmp x20, #0x18\n"
+ "mov v17.d[1], v18.d[0]\n"
+ "dup v16.2d, v18.d[1]\n"
+ "str q17, [x21, #0x0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr d16, [x25], #0x8\n"
+ "ld1 { v16.s }[2], [x25], #0x4\n"
+ "sub x20, x20, #0xc\n"
+ "cmp x20, #0xc\n"
+ "sshll v17.8h, v16.8b, #0x0\n"
+ "sshll2 v16.8h, v16.16b, #0x0\n"
+ "str q17, [x21, #0x0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr s16, [x25], #0x4\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "sshll v16.8h, v16.8b, #0x0\n"
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr b16, [x25], #0x1\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "sshll v16.8h, v16.8b, #0x0\n"
+ "str h16, [x21, #0x0]\n"
+ "add x21, x21, #0x2\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x18\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // anonymous namespace
+template<>
+void Transform<12, 1, true, VLType::None>(
+ int16_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_s8s16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
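
In the kernel above, "sshll vN.8h, vM.8b, #0" is a sign-extending widen (the SXTL alias): each int8 lane becomes an int16 lane before the 12-wide interleave. The "add %x[out], %x[out], #0x60" bump after each four-row pass is that pass's share of the output panel: 12 * 4 * sizeof(int16_t) = 96 = 0x60 bytes. An intrinsic equivalent of the widening step, for illustration only:

    #include <arm_neon.h>

    // Same effect as "sshll v.8h, v.8b, #0": widen signed bytes to halfwords.
    static int16x8_t widen_s8_to_s16(int8x8_t v)
    {
        return vmovl_s8(v); // SSHLL/SXTL with zero shift
    }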
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
new file mode 100644
index 0000000000..b0cd7e4ef7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_12_u8u16(uint16_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 12 * height * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q3, [x25], #0x10\n"
+ "ldr q21, [x23], #0x10\n"
+ "ushll2 v20.8h, v3.16b, #0x0\n"
+ "ushll v2.8h, v21.8b, #0x0\n"
+ "ldr q1, [x22], #0x10\n"
+ "ldr q19, [x20], #0x10\n"
+ "ushll2 v18.8h, v1.16b, #0x0\n"
+ "ushll v0.8h, v19.8b, #0x0\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ushll v31.8h, v17.8b, #0x0\n"
+ "ushll v30.8h, v16.8b, #0x0\n"
+ "ldr d29, [x23], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "ushll2 v27.8h, v21.16b, #0x0\n"
+ "ushll2 v26.8h, v19.16b, #0x0\n"
+ "dup v25.2d, v20.d[0]\n"
+ "dup v24.2d, v2.d[1]\n"
+ "sub x24, x24, #0x18\n"
+ "cmp x24, #0x18\n"
+ "dup v23.2d, v18.d[0]\n"
+ "dup v22.2d, v0.d[1]\n"
+ "dup v21.2d, v20.d[1]\n"
+ "dup v20.2d, v31.d[1]\n"
+ "dup v19.2d, v18.d[1]\n"
+ "dup v18.2d, v30.d[1]\n"
+ "ushll v17.8h, v3.8b, #0x0\n"
+ "ushll v16.8h, v1.8b, #0x0\n"
+ "str q17, [x21, #0x0]\n"
+ "mov v25.d[1], v2.d[0]\n"
+ "mov v24.d[1], v27.d[0]\n"
+ "str q25, [x21, #0x10]\n"
+ "mov v23.d[1], v0.d[0]\n"
+ "mov v22.d[1], v26.d[0]\n"
+ "str q24, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "ushll v17.8h, v29.8b, #0x0\n"
+ "ushll v16.8h, v28.8b, #0x0\n"
+ "str q23, [x21, #0x40]\n"
+ "mov v21.d[1], v31.d[0]\n"
+ "mov v20.d[1], v27.d[1]\n"
+ "str q22, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "mov v19.d[1], v30.d[0]\n"
+ "mov v18.d[1], v26.d[1]\n"
+ "str q21, [x21, #0x0]\n"
+ "str q20, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q18, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x24, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr d19, [x23], #0x8\n"
+ "ldr d18, [x20], #0x8\n"
+ "sub x24, x24, #0xc\n"
+ "cmp x24, #0xc\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n"
+ "ld1 { v18.s }[2], [x20], #0x4\n"
+ "ushll v25.8h, v19.8b, #0x0\n"
+ "ushll v24.8h, v18.8b, #0x0\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ushll2 v23.8h, v19.16b, #0x0\n"
+ "ushll2 v22.8h, v18.16b, #0x0\n"
+ "ld1 { v17.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x22], #0x4\n"
+ "ushll2 v21.8h, v17.16b, #0x0\n"
+ "ushll2 v20.8h, v16.16b, #0x0\n"
+ "dup v19.2d, v25.d[1]\n"
+ "dup v18.2d, v24.d[1]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "str q17, [x21, #0x0]\n"
+ "mov v21.d[1], v25.d[0]\n"
+ "mov v19.d[1], v23.d[0]\n"
+ "str q21, [x21, #0x10]\n"
+ "mov v20.d[1], v24.d[0]\n"
+ "mov v18.d[1], v22.d[0]\n"
+ "str q19, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "str q20, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr s19, [x25], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "str d19, [x21, #0x0]\n"
+ "str d18, [x21, #0x18]\n"
+ "str d17, [x21, #0x30]\n"
+ "str d16, [x21, #0x48]\n"
+ "add x21, x21, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr b19, [x25], #0x1\n"
+ "ldr b18, [x23], #0x1\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr b17, [x22], #0x1\n"
+ "ldr b16, [x20], #0x1\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "str h19, [x21, #0x0]\n"
+ "str h18, [x21, #0x18]\n"
+ "str h17, [x21, #0x30]\n"
+ "str h16, [x21, #0x48]\n"
+ "add x21, x21, #0x2\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x4\n"
+ "add %x[out], %x[out], #0x60\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x20, %x[width]\n"
+ "mov x25, %x[in]\n"
+ "cmp x20, #0x18\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q20, [x25], #0x10\n"
+ "ldr d16, [x25], #0x8\n"
+ "ushll2 v19.8h, v20.16b, #0x0\n"
+ "ushll v18.8h, v16.8b, #0x0\n"
+ "dup v17.2d, v19.d[1]\n"
+ "sub x20, x20, #0x18\n"
+ "ushll v16.8h, v20.8b, #0x0\n"
+ "str q16, [x21, #0x0]\n"
+ "dup v16.2d, v19.d[0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "cmp x20, #0x18\n"
+ "mov v17.d[1], v18.d[0]\n"
+ "dup v16.2d, v18.d[1]\n"
+ "str q17, [x21, #0x0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr d16, [x25], #0x8\n"
+ "ld1 { v16.s }[2], [x25], #0x4\n"
+ "sub x20, x20, #0xc\n"
+ "cmp x20, #0xc\n"
+ "ushll v17.8h, v16.8b, #0x0\n"
+ "ushll2 v16.8h, v16.16b, #0x0\n"
+ "str q17, [x21, #0x0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr s16, [x25], #0x4\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr b16, [x25], #0x1\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "str h16, [x21, #0x0]\n"
+ "add x21, x21, #0x2\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x18\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // anonymous namespace
+template<>
+void Transform<12, 1, true, VLType::None>(
+ uint16_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_u8u16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
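
This file is the unsigned twin of the s8s16 kernel above: the control flow is identical, with "ushll" (UXTL, zero-extend) replacing "sshll". The matching intrinsic, for symmetry with the previous sketch:

    #include <arm_neon.h>

    // Same effect as "ushll v.8h, v.8b, #0": widen unsigned bytes to halfwords.
    static uint16x8_t widen_u8_to_u16(uint8x8_t v)
    {
        return vmovl_u8(v); // USHLL/UXTL with zero shift
    }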
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
deleted file mode 100644
index f6233ef503..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "transpose_interleave_common.hpp"
-
-// Generic unblocked transposed 6x32-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<6, 1, true, 4, 4, VLType::None>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a 12 x uint16_t specialisation
- TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t *>(in),
- stride*2, x0*2, xmax*2, k0, kmax
- );
-}
-
-// Generic 12x16-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a uint16_t specialisation
- Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t *>(in),
- stride, x0, xmax, k0, kmax
- );
-}
-
-// Specialised 12 x uint16_t version
-template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
- __asm volatile (
- "LDR q0, [%[in0]]\n"
- "STR q0, [%[out]]\n"
- "LDR d1, [%[in0], #0x10]\n"
- "STR d1, [%[out], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x18\n"
- ASM_PREFETCH("[%[in0], #192]")
- : [in0] "+r" (in0),
- [out] "+r" (out)
- :
- : "v0", "v1", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
- __asm volatile (
- "LDR q0, [%[in0]]\n"
- "LDR d1, [%[in0], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x18\n"
- ASM_PREFETCH("[%[in0], #192]")
-
- "LDR x21, [%[in1]]\n"
- "LDR q2, [%[in1], #0x08]\n"
- "INS v1.d[1], x21\n"
- "ADD %x[in1], %x[in1], #0x18\n"
- "STP q0, q1, [%[out]]\n"
- "STR q2, [%x[out], #0x20]\n"
- ASM_PREFETCH("[%[in1], #192]")
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [out] "+r" (out)
- :
- : "x21", "v0", "v1", "v2", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
- __asm __volatile (
- "LDR q0, [%x[in0]], #0x10\n"
- "STR q0, [%x[out]]\n"
- "LDR d1, [%x[in0]], #0x08\n"
- ASM_PREFETCH("[%[in0], #192]")
- "STR d1, [%x[out], #0x10]\n"
-
- "LDR q0, [%x[in1]], #0x10\n"
- "STR q0, [%x[out], #0x18]\n"
- "LDR d1, [%x[in1]], #0x08\n"
- ASM_PREFETCH("[%[in1], #192]")
- "STR d1, [%x[out], #0x28]\n"
-
- "LDR q0, [%x[in2]], #0x10\n"
- "STR q0, [%x[out], #0x30]\n"
- "LDR d1, [%x[in2]], #0x08\n"
- ASM_PREFETCH("[%[in2], #192]")
- "STR d1, [%x[out], #0x40]\n"
-
- "LDR q0, [%x[in3]], #0x10\n"
- "STR q0, [%x[out], #0x48]\n"
- "LDR d1, [%x[in3]], #0x08\n"
- ASM_PREFETCH("[%[in3], #192]")
- "STR d1, [%x[out], #0x58]\n"
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [in2] "+r" (in2),
- [in3] "+r" (in3),
- [out] "+r" (out)
- :
- : "v0", "v1", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform(
- uint16_t* out, const uint16_t* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
deleted file mode 100644
index c0f3e17d31..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
-
-#include "transpose_interleave_common.hpp"
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR d2, [%[in0]], #8\n"
- "FCVTL v2.4s, v2.4h\n"
- "STR q2, [%[out], #32]\n"
- : [in0] "+r" (in0), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR d2, [%[in0]], #8\n"
- "FCVTL v2.4s, v2.4h\n"
- "LDR q3, [%[in1]], #16\n"
- "FCVTL2 v4.4s, v3.8h\n"
- "FCVTL v3.4s, v3.4h\n"
- "STP q2, q3, [%[out], #32]\n"
- ASM_PREFETCH("[%[in1], #192]")
- "LDR d5, [%[in1]], #8\n"
- "FCVTL v5.4s, v5.4h\n"
- "STP q4, q5, [%[out], #64]\n"
- : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- "LDR d2, [%[in0]], #8\n"
- ASM_PREFETCH("[%[in0], #192]")
- "FCVTL v2.4s, v2.4h\n"
- "LDR q3, [%[in1]], #16\n"
- "FCVTL2 v4.4s, v3.8h\n"
- "FCVTL v3.4s, v3.4h\n"
- "STP q2, q3, [%[out], #32]\n"
- "LDR d5, [%[in1]], #8\n"
- "FCVTL v5.4s, v5.4h\n"
- ASM_PREFETCH("[%[in1], #192]")
- "STP q4, q5, [%[out], #64]\n"
- "LDR q6, [%[in2]], #16\n"
- "FCVTL2 v7.4s, v6.8h\n"
- "FCVTL v6.4s, v6.4h\n"
- "STP q6, q7, [%[out], #96]\n"
- "LDR d8, [%[in2]], #8\n"
- "FCVTL v8.4s, v8.4h\n"
- ASM_PREFETCH("[%[in2], #192]")
- "LDR q9, [%[in3]], #16\n"
- "FCVTL2 v10.4s, v9.8h\n"
- "FCVTL v9.4s, v9.4h\n"
- "STP q8, q9, [%[out], #128]\n"
- "LDR d11, [%[in3]], #8\n"
- "FCVTL v11.4s, v11.4h\n"
- "STP q10, q11, [%[out], #160]\n"
- ASM_PREFETCH("[%[in3], #192]")
-
- : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<12, 1, true, 4, 2, VLType::None>::Transform(
- float* out, const __fp16* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
new file mode 100644
index 0000000000..0399f8becc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_16(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 4 * height * sizeof(uint32_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[width]\n"
+ "mov x23, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "add x22, x25, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "cmp x24, #0x4\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "cmp x24, #0x4\n"
+ "str q19, [x23, #0x0]\n"
+ "str q18, [x23, #0x10]\n"
+ "str q17, [x23, #0x20]\n"
+ "str q16, [x23, #0x30]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cbz x24, 5f\n"
+ "movi v16.4s, #0x0\n"
+ "str q16, [x23, #0x0]\n"
+ "str q16, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q16, [x23, #0x30]\n"
+ "4:" // Main row loop: width 1 loop: loop
+ "ldr s19, [x25], #0x4\n"
+ "ldr s18, [x22], #0x4\n"
+ "sub x24, x24, #0x1\n"
+ "ldr s17, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "cmp x24, #0x1\n"
+ "str s19, [x23, #0x0]\n"
+ "str s18, [x23, #0x10]\n"
+ "str s17, [x23, #0x20]\n"
+ "str s16, [x23, #0x30]\n"
+ "add x23, x23, #0x4\n"
+ "bge 4b\n"
+ "5:" // Main row loop: odd col skip
+ "cmp %x[height], #0x4\n"
+ "add %x[out], %x[out], #0x40\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x20, %x[width]\n"
+ "mov x25, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "cmp x20, #0x4\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Column loop
+ "ldr q16, [x25], #0x10\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "str q16, [x23, #0x0]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Column loop skip
+ "cbz x20, 11f\n"
+ "movi v16.4s, #0x0\n"
+ "str q16, [x23, #0x0]\n"
+ "10:" // Tail row loop: width 1 loop: loop
+ "ldr s16, [x25], #0x4\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "str s16, [x23, #0x0]\n"
+ "add x23, x23, #0x4\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: odd col skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x10\n"
+ "bge 7b\n"
+ "12:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<4, 1, true, VLType::None>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16(
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4, // width in 32-bit kernel elements (sizeof(float)/4 == 1)
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
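+
+// Note: the float specialisation simply reinterprets its pointers and reuses
+// the uint32_t kernel above; a pure 32-bit copy/transpose has no need to know
+// the element type, only its size.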
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
new file mode 100644
index 0000000000..f3a1dde73f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_16_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
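+ // pad_row is a stack buffer of zeroes that the assembly substitutes (via
+ // the CSEL instructions in the tail loop) for the missing input rows when
+ // height is not a multiple of 4, so every packed block is a full 4 rows.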
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 16 * roundup<size_t>(height, 4) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x10\n"
+ "blt 8f\n"
+ "1:" // Main row loop: Head
+ "mov x17, %x[in]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
+ "add x14, x15, %x[in_stride]\n"
+ "add x13, x14, %x[in_stride]\n"
+ "add x12, x13, %x[in_stride]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x10\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x10\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q21, [x17], #0x10\n"
+ "ldr q20, [x16], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q17, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v3.16b, v21.16b, v17.16b\n"
+ "zip1 v2.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x13], #0x10\n"
+ "ldr q18, [x12], #0x10\n"
+ "zip2 v1.16b, v21.16b, v17.16b\n"
+ "zip2 v0.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x11], #0x10\n"
+ "ldr q16, [x10], #0x10\n"
+ "zip1 v31.16b, v19.16b, v17.16b\n"
+ "zip1 v30.16b, v18.16b, v16.16b\n"
+ "ldr q25, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v24.16b, v19.16b, v17.16b\n"
+ "zip2 v23.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.16b, v25.16b, v17.16b\n"
+ "zip1 v21.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v29.16b, v25.16b, v17.16b\n"
+ "zip2 v20.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v28.16b, v19.16b, v17.16b\n"
+ "zip1 v27.16b, v18.16b, v16.16b\n"
+ "zip2 v26.16b, v19.16b, v17.16b\n"
+ "zip2 v25.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v3.16b, v2.16b\n"
+ "zip2 v17.16b, v3.16b, v2.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.16b, v1.16b, v0.16b\n"
+ "zip2 v19.16b, v1.16b, v0.16b\n"
+ "str q17, [x21, #0x10]\n"
+ "zip1 v18.16b, v31.16b, v30.16b\n"
+ "zip2 v17.16b, v31.16b, v30.16b\n"
+ "str q16, [x21, #0x20]\n"
+ "zip1 v16.16b, v24.16b, v23.16b\n"
+ "zip2 v24.16b, v24.16b, v23.16b\n"
+ "str q19, [x21, #0x30]\n"
+ "zip1 v23.16b, v22.16b, v21.16b\n"
+ "zip2 v22.16b, v22.16b, v21.16b\n"
+ "str q18, [x21, #0x40]\n"
+ "zip1 v21.16b, v29.16b, v20.16b\n"
+ "zip2 v20.16b, v29.16b, v20.16b\n"
+ "str q17, [x21, #0x50]\n"
+ "zip1 v19.16b, v28.16b, v27.16b\n"
+ "zip2 v18.16b, v28.16b, v27.16b\n"
+ "str q16, [x21, #0x60]\n"
+ "zip1 v17.16b, v26.16b, v25.16b\n"
+ "zip2 v16.16b, v26.16b, v25.16b\n"
+ "str q24, [x21, #0x70]\n"
+ "str q23, [x21, #0x80]\n"
+ "str q22, [x21, #0x90]\n"
+ "str q21, [x21, #0xa0]\n"
+ "str q20, [x21, #0xb0]\n"
+ "str q19, [x21, #0xc0]\n"
+ "str q18, [x21, #0xd0]\n"
+ "str q17, [x21, #0xe0]\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x24, #0x4\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 4 loop: loop
+ "ldr s19, [x17], #0x4\n"
+ "ldr s18, [x16], #0x4\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr s16, [x14], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s19, [x13], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
+ "zip1 v22.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x11], #0x4\n"
+ "ldr s16, [x10], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str q22, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q21, [x21, #0x40]\n"
+ "str q18, [x21, #0x80]\n"
+ "str q16, [x21, #0xc0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 1 loop: loop
+ "ldr b19, [x17], #0x1\n"
+ "ldr b18, [x16], #0x1\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr b17, [x15], #0x1\n"
+ "ldr b16, [x14], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b19, [x13], #0x1\n"
+ "ldr b18, [x12], #0x1\n"
+ "zip1 v22.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x11], #0x1\n"
+ "ldr b16, [x10], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b19, [x9], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b16, [x26], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b20, [x25], #0x1\n"
+ "ldr b19, [x23], #0x1\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x22], #0x1\n"
+ "ldr b16, [x20], #0x1\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str s22, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s21, [x21, #0x40]\n"
+ "str s18, [x21, #0x80]\n"
+ "str s16, [x21, #0xc0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x10\n"
+ "add %x[out], %x[out], #0x100\n"
+ "bge 1b\n"
+ "cbz %x[height], 16f\n"
+ "8:" // Main loop skip
+ "9:" // Tail row loop: Head
+ "mov x17, %x[in]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
+ "mov x20, %x[width]\n"
+ "add x14, x15, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x14, %x[in_stride]\n"
+ "csel x14, x14, %x[pad_row], GT\n"
+ "csel x15, x15, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x16, x16, %x[pad_row], GT\n"
+ "cmp x20, #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: Column loop
+ "ldr q20, [x17], #0x10\n"
+ "ldr q21, [x16], #0x10\n"
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "ldr q19, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v18.16b, v20.16b, v19.16b\n"
+ "zip1 v17.16b, v21.16b, v16.16b\n"
+ "zip2 v20.16b, v20.16b, v19.16b\n"
+ "zip2 v19.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "zip2 v18.16b, v18.16b, v17.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip2 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp x20, #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: width 4 loop: loop
+ "ldr s19, [x17], #0x4\n"
+ "ldr s18, [x16], #0x4\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr s16, [x14], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 1 loop: loop
+ "ldr b19, [x17], #0x1\n"
+ "ldr b18, [x16], #0x1\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "ldr b17, [x15], #0x1\n"
+ "ldr b16, [x14], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x40\n"
+ "bge 9b\n"
+ "16:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 4, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1, // byte width; equals the element count for 8-bit data
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<16, 4, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
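+
+// Note: the int8_t specialisation is byte-identical to the uint8_t one above;
+// both delegate to the same unsigned kernel, since pure data movement does
+// not interpret the values.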
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
new file mode 100644
index 0000000000..7c7e91e666
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_16_1x8(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 8) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
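+ // Each 16-element-wide output block spans the full height, rounded up to
+ // the interleave depth of 8, so successive column blocks start out_stride
+ // bytes apart.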
+
+ size_t out_stride = 16 * roundup<size_t>(height, 8) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "cmp %x[height], #0x7\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x23, x23, %x[pad_row], GE\n"
+ "cmp %x[height], #0x5\n"
+ "mov x21, %x[width]\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "cmp x21, #0x20\n"
+ "mov x20, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q23, [x9], #0x10\n"
+ "ldr q22, [x28], #0x10\n"
+ "sub x21, x21, #0x20\n"
+ "cmp x21, #0x20\n"
+ "ldr q20, [x27], #0x10\n"
+ "ldr q21, [x26], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x24], #0x10\n"
+ "zip1 v5.16b, v23.16b, v19.16b\n"
+ "zip1 v4.16b, v22.16b, v18.16b\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v3.16b, v20.16b, v17.16b\n"
+ "zip1 v31.16b, v21.16b, v16.16b\n"
+ "ldr q25, [x9], #0x10\n"
+ "ldr q24, [x28], #0x10\n"
+ "zip2 v2.16b, v23.16b, v19.16b\n"
+ "zip2 v30.16b, v20.16b, v17.16b\n"
+ "ldr q23, [x27], #0x10\n"
+ "ldr q20, [x26], #0x10\n"
+ "zip2 v22.16b, v22.16b, v18.16b\n"
+ "zip2 v21.16b, v21.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x24], #0x10\n"
+ "zip1 v29.16b, v25.16b, v19.16b\n"
+ "zip1 v28.16b, v24.16b, v18.16b\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v27.16b, v23.16b, v17.16b\n"
+ "zip1 v26.16b, v20.16b, v16.16b\n"
+ "zip2 v1.16b, v25.16b, v19.16b\n"
+ "zip2 v25.16b, v23.16b, v17.16b\n"
+ "zip2 v24.16b, v24.16b, v18.16b\n"
+ "zip2 v16.16b, v20.16b, v16.16b\n"
+ "zip1 v0.16b, v5.16b, v3.16b\n"
+ "zip1 v17.16b, v4.16b, v31.16b\n"
+ "zip2 v20.16b, v5.16b, v3.16b\n"
+ "zip2 v19.16b, v4.16b, v31.16b\n"
+ "zip1 v31.16b, v2.16b, v30.16b\n"
+ "zip1 v18.16b, v22.16b, v21.16b\n"
+ "zip2 v30.16b, v2.16b, v30.16b\n"
+ "zip2 v23.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v29.16b, v27.16b\n"
+ "zip1 v21.16b, v28.16b, v26.16b\n"
+ "zip2 v29.16b, v29.16b, v27.16b\n"
+ "zip2 v28.16b, v28.16b, v26.16b\n"
+ "zip1 v27.16b, v1.16b, v25.16b\n"
+ "zip1 v26.16b, v24.16b, v16.16b\n"
+ "zip2 v25.16b, v1.16b, v25.16b\n"
+ "zip2 v24.16b, v24.16b, v16.16b\n"
+ "zip1 v16.16b, v0.16b, v17.16b\n"
+ "zip2 v17.16b, v0.16b, v17.16b\n"
+ "str q16, [x20, #0x0]\n"
+ "zip1 v16.16b, v20.16b, v19.16b\n"
+ "zip2 v20.16b, v20.16b, v19.16b\n"
+ "str q17, [x20, #0x10]\n"
+ "zip1 v19.16b, v31.16b, v18.16b\n"
+ "zip2 v18.16b, v31.16b, v18.16b\n"
+ "str q16, [x20, #0x20]\n"
+ "zip1 v17.16b, v30.16b, v23.16b\n"
+ "zip2 v16.16b, v30.16b, v23.16b\n"
+ "str q20, [x20, #0x30]\n"
+ "str q19, [x20, #0x40]\n"
+ "zip1 v23.16b, v22.16b, v21.16b\n"
+ "zip2 v22.16b, v22.16b, v21.16b\n"
+ "str q18, [x20, #0x50]\n"
+ "zip1 v21.16b, v29.16b, v28.16b\n"
+ "zip2 v20.16b, v29.16b, v28.16b\n"
+ "str q17, [x20, #0x60]\n"
+ "zip1 v19.16b, v27.16b, v26.16b\n"
+ "zip2 v18.16b, v27.16b, v26.16b\n"
+ "str q16, [x20, #0x70]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip1 v17.16b, v25.16b, v24.16b\n"
+ "zip2 v16.16b, v25.16b, v24.16b\n"
+ "str q23, [x20, #0x0]\n"
+ "str q22, [x20, #0x10]\n"
+ "str q21, [x20, #0x20]\n"
+ "str q20, [x20, #0x30]\n"
+ "str q19, [x20, #0x40]\n"
+ "str q18, [x20, #0x50]\n"
+ "str q17, [x20, #0x60]\n"
+ "str q16, [x20, #0x70]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x21, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q25, [x9], #0x10\n"
+ "ldr q27, [x28], #0x10\n"
+ "sub x21, x21, #0x10\n"
+ "cmp x21, #0x10\n"
+ "ldr q26, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "ldr q22, [x25], #0x10\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip1 v20.16b, v25.16b, v22.16b\n"
+ "zip1 v23.16b, v27.16b, v21.16b\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v19.16b, v26.16b, v17.16b\n"
+ "zip1 v18.16b, v24.16b, v16.16b\n"
+ "zip2 v25.16b, v25.16b, v22.16b\n"
+ "zip2 v22.16b, v26.16b, v17.16b\n"
+ "zip2 v21.16b, v27.16b, v21.16b\n"
+ "zip2 v16.16b, v24.16b, v16.16b\n"
+ "zip1 v24.16b, v20.16b, v19.16b\n"
+ "zip1 v17.16b, v23.16b, v18.16b\n"
+ "zip2 v20.16b, v20.16b, v19.16b\n"
+ "zip2 v19.16b, v23.16b, v18.16b\n"
+ "zip1 v23.16b, v25.16b, v22.16b\n"
+ "zip1 v18.16b, v21.16b, v16.16b\n"
+ "zip2 v22.16b, v25.16b, v22.16b\n"
+ "zip2 v21.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v24.16b, v17.16b\n"
+ "zip2 v17.16b, v24.16b, v17.16b\n"
+ "str q16, [x20, #0x0]\n"
+ "zip1 v16.16b, v20.16b, v19.16b\n"
+ "zip2 v20.16b, v20.16b, v19.16b\n"
+ "str q17, [x20, #0x10]\n"
+ "zip1 v19.16b, v23.16b, v18.16b\n"
+ "zip2 v18.16b, v23.16b, v18.16b\n"
+ "str q16, [x20, #0x20]\n"
+ "zip1 v17.16b, v22.16b, v21.16b\n"
+ "zip2 v16.16b, v22.16b, v21.16b\n"
+ "str q20, [x20, #0x30]\n"
+ "str q19, [x20, #0x40]\n"
+ "str q18, [x20, #0x50]\n"
+ "str q17, [x20, #0x60]\n"
+ "str q16, [x20, #0x70]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x21, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr s18, [x9], #0x4\n"
+ "ldr s19, [x28], #0x4\n"
+ "sub x21, x21, #0x4\n"
+ "cmp x21, #0x4\n"
+ "ldr s21, [x27], #0x4\n"
+ "ldr s20, [x26], #0x4\n"
+ "ldr s17, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "zip1 v18.16b, v18.16b, v17.16b\n"
+ "zip1 v19.16b, v19.16b, v16.16b\n"
+ "ldr s17, [x23], #0x4\n"
+ "ldr s16, [x22], #0x4\n"
+ "zip1 v17.16b, v21.16b, v17.16b\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "zip1 v18.16b, v18.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v17.16b, v18.16b, v16.16b\n"
+ "zip2 v16.16b, v18.16b, v16.16b\n"
+ "str q17, [x20, #0x0]\n"
+ "str q16, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x21, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr b19, [x9], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
+ "sub x21, x21, #0x1\n"
+ "cmp x21, #0x1\n"
+ "ldr b21, [x27], #0x1\n"
+ "ldr b20, [x26], #0x1\n"
+ "ldr b17, [x25], #0x1\n"
+ "ldr b16, [x24], #0x1\n"
+ "zip1 v19.16b, v19.16b, v17.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr b17, [x23], #0x1\n"
+ "ldr b16, [x22], #0x1\n"
+ "zip1 v17.16b, v21.16b, v17.16b\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str d16, [x20, #0x0]\n"
+ "add x20, x20, #0x8\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x80\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 8, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<16, 8, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
new file mode 100644
index 0000000000..b4515cbfd4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_16_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 16 * roundup<size_t>(height, 2) * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 8f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x10\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q17, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q19, [x27], #0x10\n"
+ "ldr q18, [x26], #0x10\n"
+ "zip1 v1.8h, v17.8h, v16.8h\n"
+ "zip2 v0.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v31.8h, v19.8h, v18.8h\n"
+ "zip2 v30.8h, v19.8h, v18.8h\n"
+ "ldr q29, [x22], #0x10\n"
+ "ldr q18, [x20], #0x10\n"
+ "zip1 v28.8h, v17.8h, v16.8h\n"
+ "zip2 v27.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
+ "zip1 v26.8h, v17.8h, v16.8h\n"
+ "zip2 v25.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v24.8h, v17.8h, v16.8h\n"
+ "zip2 v23.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
+ "zip2 v21.8h, v17.8h, v16.8h\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v19.8h, v29.8h, v18.8h\n"
+ "zip2 v18.8h, v29.8h, v18.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q1, [x21, #0x0]\n"
+ "str q0, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q25, [x21, #0x30]\n"
+ "str q31, [x21, #0x40]\n"
+ "str q30, [x21, #0x50]\n"
+ "str q24, [x21, #0x60]\n"
+ "str q23, [x21, #0x70]\n"
+ "str q28, [x21, #0x80]\n"
+ "str q27, [x21, #0x90]\n"
+ "str q22, [x21, #0xa0]\n"
+ "str q21, [x21, #0xb0]\n"
+ "str q19, [x21, #0xc0]\n"
+ "str q18, [x21, #0xd0]\n"
+ "str q17, [x21, #0xe0]\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x24, #0x4\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x9], #0x8\n"
+ "ldr d18, [x28], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v20.8h, v19.8h, v18.8h\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "str q20, [x21, #0x0]\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x80]\n"
+ "str q16, [x21, #0xc0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x9], #0x2\n"
+ "ldr h18, [x28], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h16, [x26], #0x2\n"
+ "zip1 v20.8h, v19.8h, v18.8h\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x25], #0x2\n"
+ "ldr h16, [x23], #0x2\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "str s20, [x21, #0x0]\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str s19, [x21, #0x40]\n"
+ "str s18, [x21, #0x80]\n"
+ "str s16, [x21, #0xc0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x8\n"
+ "add %x[out], %x[out], #0x100\n"
+ "bge 1b\n"
+ "cbz %x[height], 16f\n"
+ "8:" // Main loop skip
+ "9:" // Tail row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x20, %x[width]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x28, %x[in_stride]\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "cmp x20, #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: Column loop
+ "ldr q18, [x9], #0x10\n"
+ "ldr q17, [x28], #0x10\n"
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "ldr q20, [x9], #0x10\n"
+ "ldr q16, [x28], #0x10\n"
+ "zip1 v19.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v18.8h, v17.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp x20, #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: width 4 loop: loop
+ "ldr d17, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 1 loop: loop
+ "ldr h17, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x40\n"
+ "bge 9b\n"
+ "16:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 2, true, VLType::None>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
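+
+// Note: bfloat16 is bit-compatible with uint16_t for the purposes of a copy,
+// so the wrapper reinterprets the pointers; multiplying by sizeof(bfloat16)
+// and dividing by the kernel's 2-byte element size converts the column count
+// through bytes into 16-bit kernel elements.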
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
new file mode 100644
index 0000000000..ac67467240
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_16_2x4(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 16 * roundup<size_t>(height, 4) * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x20\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q23, [x9], #0x10\n"
+ "ldr q29, [x28], #0x10\n"
+ "sub x24, x24, #0x20\n"
+ "cmp x24, #0x20\n"
+ "ldr q13, [x27], #0x10\n"
+ "ldr q12, [x26], #0x10\n"
+ "zip1 v20.8h, v23.8h, v13.8h\n"
+ "zip1 v28.8h, v29.8h, v12.8h\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q9, [x23], #0x10\n"
+ "zip2 v22.8h, v23.8h, v13.8h\n"
+ "zip2 v1.8h, v29.8h, v12.8h\n"
+ "ldr q27, [x22], #0x10\n"
+ "ldr q3, [x20], #0x10\n"
+ "zip1 v4.8h, v18.8h, v27.8h\n"
+ "zip1 v26.8h, v9.8h, v3.8h\n"
+ "ldr q17, [x9], #0x10\n"
+ "ldr q2, [x28], #0x10\n"
+ "zip2 v15.8h, v18.8h, v27.8h\n"
+ "zip2 v12.8h, v9.8h, v3.8h\n"
+ "ldr q23, [x27], #0x10\n"
+ "ldr q14, [x26], #0x10\n"
+ "zip1 v19.8h, v17.8h, v23.8h\n"
+ "zip1 v21.8h, v2.8h, v14.8h\n"
+ "ldr q6, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v27.8h, v17.8h, v23.8h\n"
+ "zip2 v17.8h, v2.8h, v14.8h\n"
+ "ldr q0, [x22], #0x10\n"
+ "ldr q3, [x20], #0x10\n"
+ "zip1 v16.8h, v6.8h, v0.8h\n"
+ "zip1 v30.8h, v18.8h, v3.8h\n"
+ "ldr q2, [x9], #0x10\n"
+ "ldr q13, [x28], #0x10\n"
+ "zip2 v31.8h, v6.8h, v0.8h\n"
+ "zip2 v8.8h, v18.8h, v3.8h\n"
+ "ldr q14, [x27], #0x10\n"
+ "ldr q3, [x26], #0x10\n"
+ "zip1 v11.8h, v2.8h, v14.8h\n"
+ "zip1 v29.8h, v13.8h, v3.8h\n"
+ "ldr q25, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v23.8h, v2.8h, v14.8h\n"
+ "zip2 v10.8h, v13.8h, v3.8h\n"
+ "ldr q7, [x22], #0x10\n"
+ "ldr q6, [x20], #0x10\n"
+ "zip1 v14.8h, v25.8h, v7.8h\n"
+ "zip1 v13.8h, v18.8h, v6.8h\n"
+ "ldr q2, [x9], #0x10\n"
+ "ldr q5, [x28], #0x10\n"
+ "zip2 v9.8h, v25.8h, v7.8h\n"
+ "zip2 v7.8h, v18.8h, v6.8h\n"
+ "ldr q6, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v25.8h, v2.8h, v6.8h\n"
+ "zip1 v3.8h, v5.8h, v24.8h\n"
+ "ldr q0, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v2.8h, v2.8h, v6.8h\n"
+ "zip2 v24.8h, v5.8h, v24.8h\n"
+ "ldr q5, [x22], #0x10\n"
+ "zip1 v6.8h, v0.8h, v5.8h\n"
+ "zip2 v5.8h, v0.8h, v5.8h\n"
+ "zip1 v0.8h, v20.8h, v28.8h\n"
+ "zip2 v28.8h, v20.8h, v28.8h\n"
+ "ldr q20, [x20], #0x10\n"
+ "str q0, [x21, #0x0]\n"
+ "zip1 v0.8h, v18.8h, v20.8h\n"
+ "zip2 v20.8h, v18.8h, v20.8h\n"
+ "str q28, [x21, #0x10]\n"
+ "zip1 v18.8h, v22.8h, v1.8h\n"
+ "zip2 v28.8h, v22.8h, v1.8h\n"
+ "str q18, [x21, #0x20]\n"
+ "zip1 v22.8h, v19.8h, v21.8h\n"
+ "zip2 v19.8h, v19.8h, v21.8h\n"
+ "str q28, [x21, #0x30]\n"
+ "zip1 v18.8h, v27.8h, v17.8h\n"
+ "zip2 v17.8h, v27.8h, v17.8h\n"
+ "str q22, [x21, #0x40]\n"
+ "zip1 v27.8h, v4.8h, v26.8h\n"
+ "zip2 v26.8h, v4.8h, v26.8h\n"
+ "str q19, [x21, #0x50]\n"
+ "zip1 v22.8h, v15.8h, v12.8h\n"
+ "zip2 v21.8h, v15.8h, v12.8h\n"
+ "str q18, [x21, #0x60]\n"
+ "zip1 v19.8h, v16.8h, v30.8h\n"
+ "zip2 v18.8h, v16.8h, v30.8h\n"
+ "str q17, [x21, #0x70]\n"
+ "zip1 v17.8h, v31.8h, v8.8h\n"
+ "zip2 v16.8h, v31.8h, v8.8h\n"
+ "str q27, [x21, #0x80]\n"
+ "str q26, [x21, #0x90]\n"
+ "zip1 v31.8h, v11.8h, v29.8h\n"
+ "zip2 v30.8h, v11.8h, v29.8h\n"
+ "str q22, [x21, #0xa0]\n"
+ "zip1 v29.8h, v23.8h, v10.8h\n"
+ "zip2 v28.8h, v23.8h, v10.8h\n"
+ "str q21, [x21, #0xb0]\n"
+ "zip1 v27.8h, v25.8h, v3.8h\n"
+ "zip2 v26.8h, v25.8h, v3.8h\n"
+ "str q19, [x21, #0xc0]\n"
+ "zip1 v25.8h, v2.8h, v24.8h\n"
+ "zip2 v24.8h, v2.8h, v24.8h\n"
+ "str q18, [x21, #0xd0]\n"
+ "zip1 v23.8h, v14.8h, v13.8h\n"
+ "zip2 v22.8h, v14.8h, v13.8h\n"
+ "str q17, [x21, #0xe0]\n"
+ "zip1 v21.8h, v9.8h, v7.8h\n"
+ "zip2 v19.8h, v9.8h, v7.8h\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip1 v2.8h, v6.8h, v0.8h\n"
+ "zip2 v18.8h, v6.8h, v0.8h\n"
+ "zip1 v17.8h, v5.8h, v20.8h\n"
+ "zip2 v16.8h, v5.8h, v20.8h\n"
+ "str q31, [x21, #0x0]\n"
+ "str q30, [x21, #0x10]\n"
+ "str q29, [x21, #0x20]\n"
+ "str q28, [x21, #0x30]\n"
+ "str q27, [x21, #0x40]\n"
+ "str q26, [x21, #0x50]\n"
+ "str q25, [x21, #0x60]\n"
+ "str q24, [x21, #0x70]\n"
+ "str q23, [x21, #0x80]\n"
+ "str q22, [x21, #0x90]\n"
+ "str q21, [x21, #0xa0]\n"
+ "str q19, [x21, #0xb0]\n"
+ "str q2, [x21, #0xc0]\n"
+ "str q18, [x21, #0xd0]\n"
+ "str q17, [x21, #0xe0]\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x24, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v3.8h, v21.8h, v17.8h\n"
+ "zip1 v2.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v1.8h, v21.8h, v17.8h\n"
+ "zip2 v24.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v0.8h, v19.8h, v17.8h\n"
+ "zip1 v31.8h, v18.8h, v16.8h\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v30.8h, v19.8h, v17.8h\n"
+ "zip2 v29.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v23.8h, v21.8h, v17.8h\n"
+ "zip1 v22.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v21.8h, v21.8h, v17.8h\n"
+ "zip2 v20.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v28.8h, v19.8h, v17.8h\n"
+ "zip1 v27.8h, v18.8h, v16.8h\n"
+ "zip2 v26.8h, v19.8h, v17.8h\n"
+ "zip2 v25.8h, v18.8h, v16.8h\n"
+ "zip1 v16.8h, v3.8h, v2.8h\n"
+ "zip2 v17.8h, v3.8h, v2.8h\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.8h, v1.8h, v24.8h\n"
+ "zip2 v19.8h, v1.8h, v24.8h\n"
+ "str q17, [x21, #0x10]\n"
+ "zip1 v18.8h, v23.8h, v22.8h\n"
+ "zip2 v17.8h, v23.8h, v22.8h\n"
+ "str q16, [x21, #0x20]\n"
+ "zip1 v16.8h, v21.8h, v20.8h\n"
+ "zip2 v24.8h, v21.8h, v20.8h\n"
+ "str q19, [x21, #0x30]\n"
+ "zip1 v23.8h, v0.8h, v31.8h\n"
+ "zip2 v22.8h, v0.8h, v31.8h\n"
+ "str q18, [x21, #0x40]\n"
+ "zip1 v21.8h, v30.8h, v29.8h\n"
+ "zip2 v20.8h, v30.8h, v29.8h\n"
+ "str q17, [x21, #0x50]\n"
+ "zip1 v19.8h, v28.8h, v27.8h\n"
+ "zip2 v18.8h, v28.8h, v27.8h\n"
+ "str q16, [x21, #0x60]\n"
+ "zip1 v17.8h, v26.8h, v25.8h\n"
+ "zip2 v16.8h, v26.8h, v25.8h\n"
+ "str q24, [x21, #0x70]\n"
+ "str q23, [x21, #0x80]\n"
+ "str q22, [x21, #0x90]\n"
+ "str q21, [x21, #0xa0]\n"
+ "str q20, [x21, #0xb0]\n"
+ "str q19, [x21, #0xc0]\n"
+ "str q18, [x21, #0xd0]\n"
+ "str q17, [x21, #0xe0]\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x9], #0x8\n"
+ "ldr d18, [x28], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v17.8h, v19.8h, v17.8h\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "zip2 v19.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "zip1 v18.8h, v18.8h, v17.8h\n"
+ "zip1 v16.8h, v21.8h, v16.8h\n"
+ "str q20, [x21, #0x0]\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q19, [x21, #0x10]\n"
+ "str q17, [x21, #0x80]\n"
+ "str q16, [x21, #0x90]\n"
+ "add x21, x21, #0x20\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x9], #0x2\n"
+ "ldr h18, [x28], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h16, [x26], #0x2\n"
+ "zip1 v17.8h, v19.8h, v17.8h\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
+ "ldr h20, [x25], #0x2\n"
+ "ldr h19, [x23], #0x2\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "zip1 v17.8h, v20.8h, v17.8h\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "str d18, [x21, #0x0]\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str d16, [x21, #0x80]\n"
+ "add x21, x21, #0x8\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x8\n"
+ "add %x[out], %x[out], #0x100\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "mov x20, %x[width]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "cmp x20, #0x20\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x20, x20, #0x20\n"
+ "cmp x20, #0x20\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v4.8h, v21.8h, v17.8h\n"
+ "zip1 v3.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v2.8h, v21.8h, v17.8h\n"
+ "zip2 v1.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v0.8h, v19.8h, v17.8h\n"
+ "zip1 v31.8h, v18.8h, v16.8h\n"
+ "ldr q24, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v30.8h, v19.8h, v17.8h\n"
+ "zip2 v23.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.8h, v24.8h, v17.8h\n"
+ "zip1 v21.8h, v20.8h, v16.8h\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v29.8h, v24.8h, v17.8h\n"
+ "zip2 v28.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v27.8h, v19.8h, v17.8h\n"
+ "zip1 v26.8h, v18.8h, v16.8h\n"
+ "zip2 v25.8h, v19.8h, v17.8h\n"
+ "zip2 v24.8h, v18.8h, v16.8h\n"
+ "zip1 v16.8h, v4.8h, v3.8h\n"
+ "zip2 v17.8h, v4.8h, v3.8h\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.8h, v2.8h, v1.8h\n"
+ "zip2 v20.8h, v2.8h, v1.8h\n"
+ "str q17, [x21, #0x10]\n"
+ "zip1 v19.8h, v0.8h, v31.8h\n"
+ "zip2 v18.8h, v0.8h, v31.8h\n"
+ "str q16, [x21, #0x20]\n"
+ "zip1 v17.8h, v30.8h, v23.8h\n"
+ "zip2 v16.8h, v30.8h, v23.8h\n"
+ "str q20, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "zip1 v23.8h, v22.8h, v21.8h\n"
+ "zip2 v22.8h, v22.8h, v21.8h\n"
+ "str q18, [x21, #0x50]\n"
+ "zip1 v21.8h, v29.8h, v28.8h\n"
+ "zip2 v20.8h, v29.8h, v28.8h\n"
+ "str q17, [x21, #0x60]\n"
+ "zip1 v19.8h, v27.8h, v26.8h\n"
+ "zip2 v18.8h, v27.8h, v26.8h\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip1 v17.8h, v25.8h, v24.8h\n"
+ "zip2 v16.8h, v25.8h, v24.8h\n"
+ "str q23, [x21, #0x0]\n"
+ "str q22, [x21, #0x10]\n"
+ "str q21, [x21, #0x20]\n"
+ "str q20, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v25.8h, v19.8h, v17.8h\n"
+ "zip1 v24.8h, v18.8h, v16.8h\n"
+ "ldr q22, [x9], #0x10\n"
+ "ldr q21, [x28], #0x10\n"
+ "zip2 v20.8h, v19.8h, v17.8h\n"
+ "zip2 v19.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v23.8h, v22.8h, v17.8h\n"
+ "zip1 v18.8h, v21.8h, v16.8h\n"
+ "zip2 v22.8h, v22.8h, v17.8h\n"
+ "zip2 v21.8h, v21.8h, v16.8h\n"
+ "zip1 v16.8h, v25.8h, v24.8h\n"
+ "zip2 v17.8h, v25.8h, v24.8h\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.8h, v20.8h, v19.8h\n"
+ "zip2 v20.8h, v20.8h, v19.8h\n"
+ "str q17, [x21, #0x10]\n"
+ "zip1 v19.8h, v23.8h, v18.8h\n"
+ "zip2 v18.8h, v23.8h, v18.8h\n"
+ "str q16, [x21, #0x20]\n"
+ "zip1 v17.8h, v22.8h, v21.8h\n"
+ "zip2 v16.8h, v22.8h, v21.8h\n"
+ "str q20, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d18, [x9], #0x8\n"
+ "ldr d19, [x28], #0x8\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v18.8h, v18.8h, v17.8h\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q17, [x21, #0x0]\n"
+ "str q16, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h19, [x9], #0x2\n"
+ "ldr h18, [x28], #0x2\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h16, [x26], #0x2\n"
+ "zip1 v17.8h, v19.8h, v17.8h\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x80\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 4, true, VLType::None>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_2x4(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..b9fe8b126a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
@@ -0,0 +1,446 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
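+// fp32 -> bf16 transpose-interleave: four input rows at a time are zipped
+// together 16 columns wide and narrowed with bfcvtn/bfcvtn2. When height is
+// not a multiple of 4, the missing rows are read from the zero-filled
+// pad_row scratch buffer.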
+void a64_transpose_interleave_16_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 16 * roundup<size_t>(height, 4) * sizeof(bfloat16);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 8f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x10\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q13, [x9], #0x10\n"
+ "ldr q12, [x28], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q1, [x27], #0x10\n"
+ "ldr q9, [x26], #0x10\n"
+ "zip1 v19.4s, v13.4s, v1.4s\n"
+ "zip1 v14.4s, v12.4s, v9.4s\n"
+ "ldr q15, [x25], #0x10\n"
+ "ldr q4, [x23], #0x10\n"
+ "zip2 v8.4s, v13.4s, v1.4s\n"
+ "zip2 v28.4s, v12.4s, v9.4s\n"
+ "ldr q0, [x22], #0x10\n"
+ "ldr q1, [x20], #0x10\n"
+ "zip1 v16.4s, v15.4s, v0.4s\n"
+ "zip1 v5.4s, v4.4s, v1.4s\n"
+ "ldr q25, [x9], #0x10\n"
+ "ldr q24, [x28], #0x10\n"
+ "zip2 v3.4s, v15.4s, v0.4s\n"
+ "zip2 v2.4s, v4.4s, v1.4s\n"
+ "ldr q21, [x27], #0x10\n"
+ "ldr q30, [x26], #0x10\n"
+ "zip1 v18.4s, v25.4s, v21.4s\n"
+ "zip1 v27.4s, v24.4s, v30.4s\n"
+ "ldr q22, [x25], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip2 v9.4s, v25.4s, v21.4s\n"
+ "zip2 v10.4s, v24.4s, v30.4s\n"
+ "ldr q1, [x22], #0x10\n"
+ "ldr q21, [x20], #0x10\n"
+ "zip1 v25.4s, v22.4s, v1.4s\n"
+ "zip1 v7.4s, v20.4s, v21.4s\n"
+ "ldr q31, [x9], #0x10\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v30.4s, v22.4s, v1.4s\n"
+ "zip2 v20.4s, v20.4s, v21.4s\n"
+ "ldr q15, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v6.4s, v31.4s, v15.4s\n"
+ "zip1 v4.4s, v17.4s, v24.4s\n"
+ "ldr q12, [x25], #0x10\n"
+ "ldr q29, [x23], #0x10\n"
+ "zip2 v22.4s, v31.4s, v15.4s\n"
+ "zip2 v26.4s, v17.4s, v24.4s\n"
+ "ldr q0, [x22], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "zip1 v17.4s, v12.4s, v0.4s\n"
+ "zip1 v31.4s, v29.4s, v24.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q1, [x28], #0x10\n"
+ "zip2 v23.4s, v12.4s, v0.4s\n"
+ "zip2 v24.4s, v29.4s, v24.4s\n"
+ "ldr q11, [x27], #0x10\n"
+ "ldr q29, [x26], #0x10\n"
+ "zip1 v0.4s, v21.4s, v11.4s\n"
+ "zip1 v13.4s, v1.4s, v29.4s\n"
+ "ldr q15, [x25], #0x10\n"
+ "ldr q12, [x23], #0x10\n"
+ "zip2 v21.4s, v21.4s, v11.4s\n"
+ "zip2 v29.4s, v1.4s, v29.4s\n"
+ "ldr q1, [x22], #0x10\n"
+ "zip1 v11.4s, v15.4s, v1.4s\n"
+ "zip2 v1.4s, v15.4s, v1.4s\n"
+ "zip1 v15.4s, v19.4s, v14.4s\n"
+ ".inst 0x0ea169ef // bfcvtn v15.4h, v15.4s\n"
+ "zip2 v14.4s, v19.4s, v14.4s\n"
+ "ldr q19, [x20], #0x10\n"
+ ".inst 0x4ea169cf // bfcvtn2 v15.8h, v14.4s\n"
+ "str q15, [x21, #0x0]\n"
+ "zip1 v14.4s, v12.4s, v19.4s\n"
+ "zip2 v15.4s, v12.4s, v19.4s\n"
+ "zip1 v12.4s, v8.4s, v28.4s\n"
+ "zip1 v19.4s, v18.4s, v27.4s\n"
+ ".inst 0x0ea1698c // bfcvtn v12.4h, v12.4s\n"
+ "zip2 v28.4s, v8.4s, v28.4s\n"
+ "zip1 v8.4s, v9.4s, v10.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ "zip2 v18.4s, v18.4s, v27.4s\n"
+ "zip1 v27.4s, v6.4s, v4.4s\n"
+ ".inst 0x0ea16908 // bfcvtn v8.4h, v8.4s\n"
+ "zip2 v10.4s, v9.4s, v10.4s\n"
+ "zip1 v9.4s, v22.4s, v26.4s\n"
+ ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n"
+ "zip2 v6.4s, v6.4s, v4.4s\n"
+ "zip1 v4.4s, v0.4s, v13.4s\n"
+ ".inst 0x0ea16929 // bfcvtn v9.4h, v9.4s\n"
+ "zip2 v22.4s, v22.4s, v26.4s\n"
+ "zip1 v26.4s, v21.4s, v29.4s\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ "zip2 v13.4s, v0.4s, v13.4s\n"
+ "zip1 v0.4s, v16.4s, v5.4s\n"
+ ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n"
+ "zip2 v21.4s, v21.4s, v29.4s\n"
+ "zip1 v29.4s, v3.4s, v2.4s\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "zip2 v5.4s, v16.4s, v5.4s\n"
+ "zip1 v16.4s, v25.4s, v7.4s\n"
+ ".inst 0x0ea16bbd // bfcvtn v29.4h, v29.4s\n"
+ "zip2 v2.4s, v3.4s, v2.4s\n"
+ "zip1 v3.4s, v30.4s, v20.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v7.4s, v25.4s, v7.4s\n"
+ "zip1 v25.4s, v17.4s, v31.4s\n"
+ ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
+ "zip2 v30.4s, v30.4s, v20.4s\n"
+ "zip1 v20.4s, v23.4s, v24.4s\n"
+ ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
+ "zip2 v17.4s, v17.4s, v31.4s\n"
+ "zip1 v31.4s, v11.4s, v14.4s\n"
+ ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
+ "zip2 v24.4s, v23.4s, v24.4s\n"
+ "zip1 v23.4s, v1.4s, v15.4s\n"
+ ".inst 0x0ea16bff // bfcvtn v31.4h, v31.4s\n"
+ "zip2 v14.4s, v11.4s, v14.4s\n"
+ ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
+ "zip2 v1.4s, v1.4s, v15.4s\n"
+ ".inst 0x4ea16b8c // bfcvtn2 v12.8h, v28.4s\n"
+ "str q12, [x21, #0x10]\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16948 // bfcvtn2 v8.8h, v10.4s\n"
+ "str q19, [x21, #0x20]\n"
+ ".inst 0x4ea168db // bfcvtn2 v27.8h, v6.4s\n"
+ ".inst 0x4ea16ac9 // bfcvtn2 v9.8h, v22.4s\n"
+ "str q8, [x21, #0x30]\n"
+ ".inst 0x4ea169a4 // bfcvtn2 v4.8h, v13.4s\n"
+ ".inst 0x4ea16aba // bfcvtn2 v26.8h, v21.4s\n"
+ "str q27, [x21, #0x40]\n"
+ ".inst 0x4ea168a0 // bfcvtn2 v0.8h, v5.4s\n"
+ ".inst 0x4ea1685d // bfcvtn2 v29.8h, v2.4s\n"
+ "str q9, [x21, #0x50]\n"
+ ".inst 0x4ea168f0 // bfcvtn2 v16.8h, v7.4s\n"
+ ".inst 0x4ea16bc3 // bfcvtn2 v3.8h, v30.4s\n"
+ "str q4, [x21, #0x60]\n"
+ ".inst 0x4ea16a39 // bfcvtn2 v25.8h, v17.4s\n"
+ ".inst 0x4ea16b14 // bfcvtn2 v20.8h, v24.4s\n"
+ "str q26, [x21, #0x70]\n"
+ ".inst 0x4ea169df // bfcvtn2 v31.8h, v14.4s\n"
+ ".inst 0x4ea16837 // bfcvtn2 v23.8h, v1.4s\n"
+ "str q0, [x21, #0x80]\n"
+ "str q29, [x21, #0x90]\n"
+ "str q16, [x21, #0xa0]\n"
+ "str q3, [x21, #0xb0]\n"
+ "str q25, [x21, #0xc0]\n"
+ "str q20, [x21, #0xd0]\n"
+ "str q31, [x21, #0xe0]\n"
+ "str q23, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x24, #0x4\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 4 loop: loop
+ "ldr q23, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v23.4s, v17.4s\n"
+ "zip1 v21.4s, v20.4s, v16.4s\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v28.4s, v23.4s, v17.4s\n"
+ "zip2 v20.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v27.4s, v19.4s, v17.4s\n"
+ "zip1 v26.4s, v18.4s, v16.4s\n"
+ "zip2 v25.4s, v19.4s, v17.4s\n"
+ "zip2 v24.4s, v18.4s, v16.4s\n"
+ "zip1 v19.4s, v22.4s, v21.4s\n"
+ "zip1 v18.4s, v28.4s, v20.4s\n"
+ "zip1 v17.4s, v27.4s, v26.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
+ ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
+ "zip2 v22.4s, v22.4s, v21.4s\n"
+ ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
+ "zip2 v20.4s, v28.4s, v20.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v27.4s, v26.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q23, [x21, #0x0]\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q21, [x21, #0x10]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q17, [x21, #0x90]\n"
+ "add x21, x21, #0x20\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 1 loop: loop
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.4s, v20.4s, v17.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d18, [x21, #0x0]\n"
+ "str d16, [x21, #0x80]\n"
+ "add x21, x21, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x8\n"
+ "add %x[out], %x[out], #0x100\n"
+ "bge 1b\n"
+ "cbz %x[height], 16f\n"
+ "8:" // Main loop skip
+ "9:" // Tail row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "mov x20, %x[width]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "cmp x20, #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: Column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v30.4s, v21.4s, v17.4s\n"
+ "zip1 v29.4s, v20.4s, v16.4s\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v28.4s, v21.4s, v17.4s\n"
+ "zip2 v27.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v26.4s, v19.4s, v17.4s\n"
+ "zip1 v25.4s, v18.4s, v16.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v8.4s, v19.4s, v17.4s\n"
+ "zip2 v24.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v7.4s, v21.4s, v17.4s\n"
+ "zip1 v6.4s, v20.4s, v16.4s\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v5.4s, v21.4s, v17.4s\n"
+ "zip2 v4.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v3.4s, v19.4s, v17.4s\n"
+ "zip1 v2.4s, v18.4s, v16.4s\n"
+ "zip2 v1.4s, v19.4s, v17.4s\n"
+ "zip2 v0.4s, v18.4s, v16.4s\n"
+ "zip1 v23.4s, v30.4s, v29.4s\n"
+ "zip1 v22.4s, v28.4s, v27.4s\n"
+ "zip1 v21.4s, v26.4s, v25.4s\n"
+ "zip1 v20.4s, v8.4s, v24.4s\n"
+ "zip1 v19.4s, v7.4s, v6.4s\n"
+ "zip1 v18.4s, v5.4s, v4.4s\n"
+ "zip1 v17.4s, v3.4s, v2.4s\n"
+ "zip1 v16.4s, v1.4s, v0.4s\n"
+ ".inst 0x0ea16aff // bfcvtn v31.4h, v23.4s\n"
+ "zip2 v30.4s, v30.4s, v29.4s\n"
+ ".inst 0x0ea16add // bfcvtn v29.4h, v22.4s\n"
+ "zip2 v28.4s, v28.4s, v27.4s\n"
+ ".inst 0x0ea16abb // bfcvtn v27.4h, v21.4s\n"
+ "zip2 v26.4s, v26.4s, v25.4s\n"
+ ".inst 0x0ea16a99 // bfcvtn v25.4h, v20.4s\n"
+ "zip2 v24.4s, v8.4s, v24.4s\n"
+ ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
+ "zip2 v22.4s, v7.4s, v6.4s\n"
+ ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
+ "zip2 v20.4s, v5.4s, v4.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v3.4s, v2.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v1.4s, v0.4s\n"
+ ".inst 0x4ea16bdf // bfcvtn2 v31.8h, v30.4s\n"
+ ".inst 0x4ea16b9d // bfcvtn2 v29.8h, v28.4s\n"
+ "str q31, [x21, #0x0]\n"
+ ".inst 0x4ea16b5b // bfcvtn2 v27.8h, v26.4s\n"
+ ".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n"
+ "str q29, [x21, #0x10]\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q27, [x21, #0x20]\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q25, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q21, [x21, #0x50]\n"
+ "str q19, [x21, #0x60]\n"
+ "str q17, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp x20, #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: width 4 loop: loop
+ "ldr q20, [x9], #0x10\n"
+ "ldr q19, [x28], #0x10\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v20.4s, v17.4s\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
+ "zip2 v21.4s, v20.4s, v17.4s\n"
+ "zip2 v20.4s, v19.4s, v16.4s\n"
+ "zip1 v17.4s, v22.4s, v18.4s\n"
+ "zip1 v16.4s, v21.4s, v20.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v22.4s, v18.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v21.4s, v20.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q19, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 1 loop: loop
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x80\n"
+ "bge 9b\n"
+ "16:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
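+
+// Note that width is passed in float elements here, with no /2 scaling:
+// the kernel consumes 32-bit inputs and performs the bf16 narrowing itself.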
+template<>
+void Transform<16, 4, true, VLType::None>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
new file mode 100644
index 0000000000..46211ad4e4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
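+// Plain 16-bit copy/transpose: each output block takes 12 consecutive
+// 16-bit elements (24 bytes) from every input row, so out_stride is
+// 12 * height * sizeof(uint16_t); the unrolled loop consumes 24 elements
+// per row and fills two output blocks per iteration.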
+void a64_transpose_interleave_24(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 12 * height * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q1, [x25], #0x10\n"
+ "ldr q0, [x22], #0x10\n"
+ "sub x24, x24, #0x18\n"
+ "cmp x24, #0x18\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q31, [x23], #0x10\n"
+ "dup v30.2d, v17.d[0]\n"
+ "dup v29.2d, v31.d[1]\n"
+ "ldr q16, [x22], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
+ "dup v27.2d, v16.d[0]\n"
+ "dup v26.2d, v28.d[1]\n"
+ "ldr q25, [x25], #0x10\n"
+ "ldr q24, [x22], #0x10\n"
+ "dup v23.2d, v17.d[1]\n"
+ "dup v22.2d, v25.d[1]\n"
+ "ldr q21, [x23], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "dup v19.2d, v16.d[1]\n"
+ "dup v18.2d, v24.d[1]\n"
+ "ldr q17, [x23], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "mov v30.d[1], v31.d[0]\n"
+ "mov v29.d[1], v21.d[0]\n"
+ "mov v27.d[1], v28.d[0]\n"
+ "mov v26.d[1], v20.d[0]\n"
+ "str q1, [x21, #0x0]\n"
+ "str q30, [x21, #0x10]\n"
+ "mov v23.d[1], v25.d[0]\n"
+ "mov v22.d[1], v21.d[1]\n"
+ "str q29, [x21, #0x20]\n"
+ "mov v19.d[1], v24.d[0]\n"
+ "mov v18.d[1], v20.d[1]\n"
+ "str q0, [x21, #0x30]\n"
+ "str q27, [x21, #0x40]\n"
+ "str q26, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q23, [x21, #0x0]\n"
+ "str q22, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q18, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x24, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q17, [x23], #0x10\n"
+ "ldr q23, [x20], #0x10\n"
+ "dup v22.2d, v17.d[1]\n"
+ "dup v21.2d, v23.d[1]\n"
+ "ldr q20, [x25], #0x10\n"
+ "ldr q19, [x22], #0x10\n"
+ "sub x24, x24, #0xc\n"
+ "cmp x24, #0xc\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov v18.d[1], v17.d[0]\n"
+ "mov v22.d[1], v16.d[0]\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "mov v17.d[1], v23.d[0]\n"
+ "mov v21.d[1], v16.d[0]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q17, [x21, #0x40]\n"
+ "str q21, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x25], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "str d19, [x21, #0x0]\n"
+ "str d18, [x21, #0x18]\n"
+ "str d17, [x21, #0x30]\n"
+ "str d16, [x21, #0x48]\n"
+ "add x21, x21, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x25], #0x2\n"
+ "ldr h18, [x23], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "str h19, [x21, #0x0]\n"
+ "str h18, [x21, #0x18]\n"
+ "str h17, [x21, #0x30]\n"
+ "str h16, [x21, #0x48]\n"
+ "add x21, x21, #0x2\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x4\n"
+ "add %x[out], %x[out], #0x60\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x20, %x[width]\n"
+ "mov x25, %x[in]\n"
+ "cmp x20, #0x18\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q19, [x25], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "dup v18.2d, v16.d[1]\n"
+ "sub x20, x20, #0x18\n"
+ "ldr q17, [x25], #0x10\n"
+ "dup v16.2d, v16.d[0]\n"
+ "str q19, [x21, #0x0]\n"
+ "cmp x20, #0x18\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "mov v18.d[1], v17.d[0]\n"
+ "dup v16.2d, v17.d[1]\n"
+ "str q18, [x21, #0x0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q17, [x25], #0x10\n"
+ "ldr d16, [x25], #0x8\n"
+ "sub x20, x20, #0xc\n"
+ "cmp x20, #0xc\n"
+ "str q17, [x21, #0x0]\n"
+ "str d16, [x21, #0x10]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d16, [x25], #0x8\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h16, [x25], #0x2\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "str h16, [x21, #0x0]\n"
+ "add x21, x21, #0x2\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x18\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // anonymous namespace
+
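+// The three specializations below treat their element type as raw 16-bit
+// units: a float splits into two uint16_t halves (hence block width 6
+// floats), while int16_t and uint16_t map one-to-one (block width 12).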
+template<>
+void Transform<6, 1, true, VLType::None>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_24(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<12, 1, true, VLType::None>(
+ int16_t *out, const int16_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_24(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int16_t) / 2,
+ stride * sizeof(int16_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<12, 1, true, VLType::None>(
+ uint16_t *out, const uint16_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_24(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint16_t) / 2,
+ stride * sizeof(uint16_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..1cb7bc4445
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
@@ -0,0 +1,786 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
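+// 24-column variant of the fp32 -> bf16 transpose above: rows are zipped in
+// groups of four and narrowed with bfcvtn/bfcvtn2, with short row groups
+// padded from the zeroed pad_row.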
+void a64_transpose_interleave_24_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 24 * roundup<size_t>(height, 4) * sizeof(bfloat16);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q15, [x9], #0x10\n"
+ "ldr q1, [x28], #0x10\n"
+ "sub x24, x24, #0x18\n"
+ "cmp x24, #0x18\n"
+ "ldr q0, [x27], #0x10\n"
+ "ldr q27, [x26], #0x10\n"
+ "zip1 v18.4s, v15.4s, v0.4s\n"
+ "zip1 v20.4s, v1.4s, v27.4s\n"
+ "ldr q13, [x25], #0x10\n"
+ "ldr q14, [x23], #0x10\n"
+ "zip2 v16.4s, v15.4s, v0.4s\n"
+ "zip2 v3.4s, v1.4s, v27.4s\n"
+ "ldr q12, [x22], #0x10\n"
+ "ldr q11, [x20], #0x10\n"
+ "zip1 v4.4s, v13.4s, v12.4s\n"
+ "zip1 v28.4s, v14.4s, v11.4s\n"
+ "ldr q5, [x9], #0x10\n"
+ "ldr q30, [x28], #0x10\n"
+ "zip2 v23.4s, v13.4s, v12.4s\n"
+ "zip2 v19.4s, v14.4s, v11.4s\n"
+ "ldr q25, [x27], #0x10\n"
+ "ldr q11, [x26], #0x10\n"
+ "zip1 v21.4s, v5.4s, v25.4s\n"
+ "zip1 v14.4s, v30.4s, v11.4s\n"
+ "ldr q6, [x25], #0x10\n"
+ "ldr q27, [x23], #0x10\n"
+ "zip2 v29.4s, v5.4s, v25.4s\n"
+ "zip2 v17.4s, v30.4s, v11.4s\n"
+ "ldr q2, [x22], #0x10\n"
+ "ldr q10, [x20], #0x10\n"
+ "zip1 v11.4s, v6.4s, v2.4s\n"
+ "zip1 v1.4s, v27.4s, v10.4s\n"
+ "ldr q8, [x9], #0x10\n"
+ "ldr q5, [x28], #0x10\n"
+ "zip2 v24.4s, v6.4s, v2.4s\n"
+ "zip2 v0.4s, v27.4s, v10.4s\n"
+ "ldr q6, [x27], #0x10\n"
+ "ldr q31, [x26], #0x10\n"
+ "zip1 v12.4s, v8.4s, v6.4s\n"
+ "zip1 v10.4s, v5.4s, v31.4s\n"
+ "ldr q30, [x25], #0x10\n"
+ "ldr q2, [x23], #0x10\n"
+ "zip2 v9.4s, v8.4s, v6.4s\n"
+ "zip2 v13.4s, v5.4s, v31.4s\n"
+ "ldr q7, [x22], #0x10\n"
+ "ldr q8, [x20], #0x10\n"
+ "zip1 v27.4s, v30.4s, v7.4s\n"
+ "zip1 v31.4s, v2.4s, v8.4s\n"
+ "ldr q5, [x9], #0x10\n"
+ "ldr q26, [x28], #0x10\n"
+ "zip2 v22.4s, v30.4s, v7.4s\n"
+ "zip2 v8.4s, v2.4s, v8.4s\n"
+ "ldr q2, [x27], #0x10\n"
+ "ldr q6, [x26], #0x10\n"
+ "zip1 v25.4s, v5.4s, v2.4s\n"
+ "zip1 v15.4s, v26.4s, v6.4s\n"
+ "ldr q7, [x25], #0x10\n"
+ "ldr q30, [x23], #0x10\n"
+ "zip2 v5.4s, v5.4s, v2.4s\n"
+ "zip2 v26.4s, v26.4s, v6.4s\n"
+ "ldr q2, [x22], #0x10\n"
+ "zip1 v6.4s, v7.4s, v2.4s\n"
+ "zip2 v7.4s, v7.4s, v2.4s\n"
+ "zip1 v2.4s, v18.4s, v20.4s\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "zip2 v20.4s, v18.4s, v20.4s\n"
+ "ldr q18, [x20], #0x10\n"
+ ".inst 0x4ea16a82 // bfcvtn2 v2.8h, v20.4s\n"
+ "zip1 v20.4s, v30.4s, v18.4s\n"
+ "zip2 v18.4s, v30.4s, v18.4s\n"
+ "zip1 v30.4s, v16.4s, v3.4s\n"
+ ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
+ "zip2 v3.4s, v16.4s, v3.4s\n"
+ "ldr q16, [x9], #0x10\n"
+ ".inst 0x4ea1687e // bfcvtn2 v30.8h, v3.4s\n"
+ "zip1 v3.4s, v21.4s, v14.4s\n"
+ ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
+ "zip2 v21.4s, v21.4s, v14.4s\n"
+ "ldr q14, [x28], #0x10\n"
+ ".inst 0x4ea16aa3 // bfcvtn2 v3.8h, v21.4s\n"
+ "zip1 v21.4s, v29.4s, v17.4s\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ "zip2 v29.4s, v29.4s, v17.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ ".inst 0x4ea16bb5 // bfcvtn2 v21.8h, v29.4s\n"
+ "zip1 v29.4s, v16.4s, v17.4s\n"
+ "zip2 v16.4s, v16.4s, v17.4s\n"
+ "zip1 v17.4s, v12.4s, v10.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "zip2 v10.4s, v12.4s, v10.4s\n"
+ "ldr q12, [x26], #0x10\n"
+ ".inst 0x4ea16951 // bfcvtn2 v17.8h, v10.4s\n"
+ "zip1 v10.4s, v14.4s, v12.4s\n"
+ "zip2 v14.4s, v14.4s, v12.4s\n"
+ "zip1 v12.4s, v9.4s, v13.4s\n"
+ ".inst 0x0ea1698c // bfcvtn v12.4h, v12.4s\n"
+ "zip2 v13.4s, v9.4s, v13.4s\n"
+ "ldr q9, [x25], #0x10\n"
+ ".inst 0x4ea169ac // bfcvtn2 v12.8h, v13.4s\n"
+ "zip1 v13.4s, v25.4s, v15.4s\n"
+ ".inst 0x0ea169ad // bfcvtn v13.4h, v13.4s\n"
+ "zip2 v25.4s, v25.4s, v15.4s\n"
+ "ldr q15, [x23], #0x10\n"
+ ".inst 0x4ea16b2d // bfcvtn2 v13.8h, v25.4s\n"
+ "zip1 v25.4s, v5.4s, v26.4s\n"
+ ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
+ "zip2 v5.4s, v5.4s, v26.4s\n"
+ "ldr q26, [x22], #0x10\n"
+ ".inst 0x4ea168b9 // bfcvtn2 v25.8h, v5.4s\n"
+ "zip1 v5.4s, v9.4s, v26.4s\n"
+ "zip2 v9.4s, v9.4s, v26.4s\n"
+ "zip1 v26.4s, v29.4s, v10.4s\n"
+ ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n"
+ "zip2 v10.4s, v29.4s, v10.4s\n"
+ "ldr q29, [x20], #0x10\n"
+ ".inst 0x4ea1695a // bfcvtn2 v26.8h, v10.4s\n"
+ "zip1 v10.4s, v15.4s, v29.4s\n"
+ "zip2 v15.4s, v15.4s, v29.4s\n"
+ "zip1 v29.4s, v16.4s, v14.4s\n"
+ ".inst 0x0ea16bbd // bfcvtn v29.4h, v29.4s\n"
+ "zip2 v14.4s, v16.4s, v14.4s\n"
+ "ldr q16, [x9], #0x10\n"
+ ".inst 0x4ea169dd // bfcvtn2 v29.8h, v14.4s\n"
+ "zip1 v14.4s, v4.4s, v28.4s\n"
+ ".inst 0x0ea169ce // bfcvtn v14.4h, v14.4s\n"
+ "zip2 v4.4s, v4.4s, v28.4s\n"
+ "ldr q28, [x28], #0x10\n"
+ ".inst 0x4ea1688e // bfcvtn2 v14.8h, v4.4s\n"
+ "zip1 v4.4s, v23.4s, v19.4s\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ "zip2 v19.4s, v23.4s, v19.4s\n"
+ "ldr q23, [x27], #0x10\n"
+ ".inst 0x4ea16a64 // bfcvtn2 v4.8h, v19.4s\n"
+ "zip1 v19.4s, v16.4s, v23.4s\n"
+ "zip2 v16.4s, v16.4s, v23.4s\n"
+ "zip1 v23.4s, v11.4s, v1.4s\n"
+ ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
+ "zip2 v1.4s, v11.4s, v1.4s\n"
+ "ldr q11, [x26], #0x10\n"
+ ".inst 0x4ea16837 // bfcvtn2 v23.8h, v1.4s\n"
+ "zip1 v1.4s, v28.4s, v11.4s\n"
+ "zip2 v28.4s, v28.4s, v11.4s\n"
+ "zip1 v11.4s, v19.4s, v1.4s\n"
+ ".inst 0x0ea1696b // bfcvtn v11.4h, v11.4s\n"
+ "zip2 v19.4s, v19.4s, v1.4s\n"
+ "ldr q1, [x25], #0x10\n"
+ ".inst 0x4ea16a6b // bfcvtn2 v11.8h, v19.4s\n"
+ "zip1 v19.4s, v16.4s, v28.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ "zip2 v16.4s, v16.4s, v28.4s\n"
+ "ldr q28, [x23], #0x10\n"
+ ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n"
+ "zip1 v16.4s, v24.4s, v0.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v24.4s, v24.4s, v0.4s\n"
+ "ldr q0, [x22], #0x10\n"
+ ".inst 0x4ea16b10 // bfcvtn2 v16.8h, v24.4s\n"
+ "ldr q24, [x20], #0x10\n"
+ "str q2, [x21, #0x0]\n"
+ "zip1 v2.4s, v1.4s, v0.4s\n"
+ "zip2 v0.4s, v1.4s, v0.4s\n"
+ "zip1 v1.4s, v28.4s, v24.4s\n"
+ "zip2 v28.4s, v28.4s, v24.4s\n"
+ "str q30, [x21, #0x10]\n"
+ "zip1 v24.4s, v27.4s, v31.4s\n"
+ "zip1 v30.4s, v22.4s, v8.4s\n"
+ "str q3, [x21, #0x20]\n"
+ "zip1 v3.4s, v6.4s, v20.4s\n"
+ ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n"
+ "str q21, [x21, #0x30]\n"
+ "zip1 v21.4s, v7.4s, v18.4s\n"
+ "zip2 v31.4s, v27.4s, v31.4s\n"
+ "str q17, [x21, #0x40]\n"
+ "zip1 v17.4s, v5.4s, v10.4s\n"
+ "zip1 v27.4s, v9.4s, v15.4s\n"
+ "str q12, [x21, #0x50]\n"
+ "zip1 v12.4s, v2.4s, v1.4s\n"
+ ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
+ "str q13, [x21, #0x60]\n"
+ "zip1 v13.4s, v0.4s, v28.4s\n"
+ "zip2 v22.4s, v22.4s, v8.4s\n"
+ "str q25, [x21, #0x70]\n"
+ ".inst 0x0ea16879 // bfcvtn v25.4h, v3.4s\n"
+ "zip2 v8.4s, v6.4s, v20.4s\n"
+ "str q26, [x21, #0x80]\n"
+ ".inst 0x0ea16aa3 // bfcvtn v3.4h, v21.4s\n"
+ "zip2 v18.4s, v7.4s, v18.4s\n"
+ "str q29, [x21, #0x90]\n"
+ ".inst 0x0ea16a27 // bfcvtn v7.4h, v17.4s\n"
+ "zip2 v21.4s, v5.4s, v10.4s\n"
+ "str q11, [x21, #0xa0]\n"
+ ".inst 0x0ea16b65 // bfcvtn v5.4h, v27.4s\n"
+ "zip2 v15.4s, v9.4s, v15.4s\n"
+ "str q19, [x21, #0xb0]\n"
+ ".inst 0x0ea16991 // bfcvtn v17.4h, v12.4s\n"
+ "zip2 v20.4s, v2.4s, v1.4s\n"
+ "str q14, [x21, #0xc0]\n"
+ ".inst 0x0ea169bb // bfcvtn v27.4h, v13.4s\n"
+ "zip2 v29.4s, v0.4s, v28.4s\n"
+ "str q4, [x21, #0xd0]\n"
+ ".inst 0x4ea16bf8 // bfcvtn2 v24.8h, v31.4s\n"
+ ".inst 0x4ea16ade // bfcvtn2 v30.8h, v22.4s\n"
+ "str q23, [x21, #0xe0]\n"
+ ".inst 0x4ea16919 // bfcvtn2 v25.8h, v8.4s\n"
+ ".inst 0x4ea16a43 // bfcvtn2 v3.8h, v18.4s\n"
+ "str q16, [x21, #0xf0]\n"
+ ".inst 0x4ea16aa7 // bfcvtn2 v7.8h, v21.4s\n"
+ ".inst 0x4ea169e5 // bfcvtn2 v5.8h, v15.4s\n"
+ "str q24, [x21, #0x100]\n"
+ ".inst 0x4ea16a91 // bfcvtn2 v17.8h, v20.4s\n"
+ ".inst 0x4ea16bbb // bfcvtn2 v27.8h, v29.4s\n"
+ "str q30, [x21, #0x110]\n"
+ "str q25, [x21, #0x120]\n"
+ "str q3, [x21, #0x130]\n"
+ "str q7, [x21, #0x140]\n"
+ "str q5, [x21, #0x150]\n"
+ "str q17, [x21, #0x160]\n"
+ "str q27, [x21, #0x170]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x24, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 16 loop: loop
+ "ldr q9, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q15, [x27], #0x10\n"
+ "ldr q17, [x26], #0x10\n"
+ "zip1 v14.4s, v9.4s, v15.4s\n"
+ "zip1 v11.4s, v18.4s, v17.4s\n"
+ "ldr q7, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip2 v12.4s, v9.4s, v15.4s\n"
+ "zip2 v6.4s, v18.4s, v17.4s\n"
+ "ldr q15, [x22], #0x10\n"
+ "ldr q3, [x20], #0x10\n"
+ "zip1 v30.4s, v7.4s, v15.4s\n"
+ "zip1 v20.4s, v16.4s, v3.4s\n"
+ "ldr q17, [x9], #0x10\n"
+ "ldr q9, [x28], #0x10\n"
+ "zip2 v1.4s, v7.4s, v15.4s\n"
+ "zip2 v24.4s, v16.4s, v3.4s\n"
+ "ldr q10, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v0.4s, v17.4s, v10.4s\n"
+ "zip1 v8.4s, v9.4s, v16.4s\n"
+ "ldr q7, [x25], #0x10\n"
+ "ldr q2, [x23], #0x10\n"
+ "zip2 v17.4s, v17.4s, v10.4s\n"
+ "zip2 v3.4s, v9.4s, v16.4s\n"
+ "ldr q9, [x22], #0x10\n"
+ "ldr q10, [x20], #0x10\n"
+ "zip1 v25.4s, v7.4s, v9.4s\n"
+ "zip1 v23.4s, v2.4s, v10.4s\n"
+ "ldr q31, [x9], #0x10\n"
+ "ldr q21, [x28], #0x10\n"
+ "zip2 v16.4s, v7.4s, v9.4s\n"
+ "zip2 v27.4s, v2.4s, v10.4s\n"
+ "ldr q26, [x27], #0x10\n"
+ "ldr q19, [x26], #0x10\n"
+ "zip1 v2.4s, v31.4s, v26.4s\n"
+ "zip1 v7.4s, v21.4s, v19.4s\n"
+ "ldr q29, [x25], #0x10\n"
+ "ldr q13, [x23], #0x10\n"
+ "zip2 v31.4s, v31.4s, v26.4s\n"
+ "zip2 v19.4s, v21.4s, v19.4s\n"
+ "ldr q4, [x22], #0x10\n"
+ "ldr q18, [x20], #0x10\n"
+ "zip1 v26.4s, v29.4s, v4.4s\n"
+ "zip1 v15.4s, v13.4s, v18.4s\n"
+ "ldr q9, [x9], #0x10\n"
+ "ldr q22, [x28], #0x10\n"
+ "zip2 v4.4s, v29.4s, v4.4s\n"
+ "zip2 v18.4s, v13.4s, v18.4s\n"
+ "ldr q29, [x27], #0x10\n"
+ "ldr q10, [x26], #0x10\n"
+ "zip1 v21.4s, v9.4s, v29.4s\n"
+ "zip1 v5.4s, v22.4s, v10.4s\n"
+ "ldr q28, [x25], #0x10\n"
+ "ldr q13, [x23], #0x10\n"
+ "zip2 v29.4s, v9.4s, v29.4s\n"
+ "zip2 v9.4s, v22.4s, v10.4s\n"
+ "ldr q22, [x22], #0x10\n"
+ "zip1 v10.4s, v28.4s, v22.4s\n"
+ "zip2 v28.4s, v28.4s, v22.4s\n"
+ "zip1 v22.4s, v14.4s, v11.4s\n"
+ ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
+ "zip2 v11.4s, v14.4s, v11.4s\n"
+ "ldr q14, [x20], #0x10\n"
+ ".inst 0x4ea16976 // bfcvtn2 v22.8h, v11.4s\n"
+ "str q22, [x21, #0x0]\n"
+ "zip1 v22.4s, v13.4s, v14.4s\n"
+ "zip2 v14.4s, v13.4s, v14.4s\n"
+ "zip1 v13.4s, v12.4s, v6.4s\n"
+ "zip1 v11.4s, v0.4s, v8.4s\n"
+ ".inst 0x0ea169ad // bfcvtn v13.4h, v13.4s\n"
+ "zip2 v12.4s, v12.4s, v6.4s\n"
+ "zip1 v6.4s, v17.4s, v3.4s\n"
+ ".inst 0x0ea1696b // bfcvtn v11.4h, v11.4s\n"
+ "zip2 v0.4s, v0.4s, v8.4s\n"
+ "zip1 v8.4s, v2.4s, v7.4s\n"
+ ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n"
+ "zip2 v3.4s, v17.4s, v3.4s\n"
+ "zip1 v17.4s, v31.4s, v19.4s\n"
+ ".inst 0x0ea16908 // bfcvtn v8.4h, v8.4s\n"
+ "zip2 v2.4s, v2.4s, v7.4s\n"
+ "zip1 v7.4s, v21.4s, v5.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ "zip2 v31.4s, v31.4s, v19.4s\n"
+ "zip1 v19.4s, v29.4s, v9.4s\n"
+ ".inst 0x0ea168e7 // bfcvtn v7.4h, v7.4s\n"
+ "zip2 v21.4s, v21.4s, v5.4s\n"
+ "zip1 v5.4s, v30.4s, v20.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ "zip2 v29.4s, v29.4s, v9.4s\n"
+ "zip1 v9.4s, v1.4s, v24.4s\n"
+ ".inst 0x0ea168a5 // bfcvtn v5.4h, v5.4s\n"
+ "zip2 v20.4s, v30.4s, v20.4s\n"
+ "zip1 v30.4s, v25.4s, v23.4s\n"
+ ".inst 0x0ea16929 // bfcvtn v9.4h, v9.4s\n"
+ "zip2 v1.4s, v1.4s, v24.4s\n"
+ "zip1 v24.4s, v16.4s, v27.4s\n"
+ ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
+ "zip2 v23.4s, v25.4s, v23.4s\n"
+ "zip1 v25.4s, v26.4s, v15.4s\n"
+ ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n"
+ "zip2 v27.4s, v16.4s, v27.4s\n"
+ "zip1 v16.4s, v4.4s, v18.4s\n"
+ ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
+ "zip2 v15.4s, v26.4s, v15.4s\n"
+ "zip1 v26.4s, v10.4s, v22.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v4.4s, v18.4s\n"
+ "zip1 v4.4s, v28.4s, v14.4s\n"
+ ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n"
+ "zip2 v10.4s, v10.4s, v22.4s\n"
+ ".inst 0x0ea16896 // bfcvtn v22.4h, v4.4s\n"
+ "zip2 v4.4s, v28.4s, v14.4s\n"
+ ".inst 0x4ea1698d // bfcvtn2 v13.8h, v12.4s\n"
+ "str q13, [x21, #0x10]\n"
+ ".inst 0x4ea1680b // bfcvtn2 v11.8h, v0.4s\n"
+ ".inst 0x4ea16866 // bfcvtn2 v6.8h, v3.4s\n"
+ "str q11, [x21, #0x20]\n"
+ ".inst 0x4ea16848 // bfcvtn2 v8.8h, v2.4s\n"
+ ".inst 0x4ea16bf1 // bfcvtn2 v17.8h, v31.4s\n"
+ "str q6, [x21, #0x30]\n"
+ ".inst 0x4ea16aa7 // bfcvtn2 v7.8h, v21.4s\n"
+ ".inst 0x4ea16bb3 // bfcvtn2 v19.8h, v29.4s\n"
+ "str q8, [x21, #0x40]\n"
+ ".inst 0x4ea16a85 // bfcvtn2 v5.8h, v20.4s\n"
+ ".inst 0x4ea16829 // bfcvtn2 v9.8h, v1.4s\n"
+ "str q17, [x21, #0x50]\n"
+ ".inst 0x4ea16afe // bfcvtn2 v30.8h, v23.4s\n"
+ ".inst 0x4ea16b78 // bfcvtn2 v24.8h, v27.4s\n"
+ "str q7, [x21, #0x60]\n"
+ ".inst 0x4ea169f9 // bfcvtn2 v25.8h, v15.4s\n"
+ ".inst 0x4ea16a50 // bfcvtn2 v16.8h, v18.4s\n"
+ "str q19, [x21, #0x70]\n"
+ ".inst 0x4ea1695a // bfcvtn2 v26.8h, v10.4s\n"
+ ".inst 0x4ea16896 // bfcvtn2 v22.8h, v4.4s\n"
+ "str q5, [x21, #0xc0]\n"
+ "str q9, [x21, #0xd0]\n"
+ "str q30, [x21, #0xe0]\n"
+ "str q24, [x21, #0xf0]\n"
+ "str q25, [x21, #0x100]\n"
+ "str q16, [x21, #0x110]\n"
+ "str q26, [x21, #0x120]\n"
+ "str q22, [x21, #0x130]\n"
+ "add x21, x21, #0x80\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 16 loop: skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr q23, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v23.4s, v17.4s\n"
+ "zip1 v21.4s, v20.4s, v16.4s\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v28.4s, v23.4s, v17.4s\n"
+ "zip2 v20.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v27.4s, v19.4s, v17.4s\n"
+ "zip1 v26.4s, v18.4s, v16.4s\n"
+ "zip2 v25.4s, v19.4s, v17.4s\n"
+ "zip2 v24.4s, v18.4s, v16.4s\n"
+ "zip1 v19.4s, v22.4s, v21.4s\n"
+ "zip1 v18.4s, v28.4s, v20.4s\n"
+ "zip1 v17.4s, v27.4s, v26.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
+ ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
+ "zip2 v22.4s, v22.4s, v21.4s\n"
+ ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
+ "zip2 v20.4s, v28.4s, v20.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v27.4s, v26.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q23, [x21, #0x0]\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q21, [x21, #0x10]\n"
+ "str q19, [x21, #0xc0]\n"
+ "str q17, [x21, #0xd0]\n"
+ "add x21, x21, #0x20\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.4s, v20.4s, v17.4s\n"
+ "zip1 v16.4s, v19.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d18, [x21, #0x0]\n"
+ "str d16, [x21, #0xc0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x8\n"
+ "add %x[out], %x[out], #0x180\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x9, %x[in]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "mov x20, %x[width]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "cmp x20, #0x18\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q20, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "sub x20, x20, #0x18\n"
+ "cmp x20, #0x18\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v9.4s, v20.4s, v17.4s\n"
+ "zip1 v30.4s, v18.4s, v16.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q19, [x28], #0x10\n"
+ "zip2 v17.4s, v20.4s, v17.4s\n"
+ "zip2 v5.4s, v18.4s, v16.4s\n"
+ "ldr q18, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v0.4s, v21.4s, v18.4s\n"
+ "zip1 v3.4s, v19.4s, v16.4s\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v1.4s, v21.4s, v18.4s\n"
+ "zip2 v16.4s, v19.4s, v16.4s\n"
+ "ldr q19, [x27], #0x10\n"
+ "ldr q18, [x26], #0x10\n"
+ "zip1 v4.4s, v23.4s, v19.4s\n"
+ "zip1 v2.4s, v20.4s, v18.4s\n"
+ "ldr q22, [x9], #0x10\n"
+ "ldr q21, [x28], #0x10\n"
+ "zip2 v27.4s, v23.4s, v19.4s\n"
+ "zip2 v28.4s, v20.4s, v18.4s\n"
+ "ldr q20, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v25.4s, v22.4s, v20.4s\n"
+ "zip1 v26.4s, v21.4s, v24.4s\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v14.4s, v22.4s, v20.4s\n"
+ "zip2 v12.4s, v21.4s, v24.4s\n"
+ "ldr q31, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v15.4s, v19.4s, v31.4s\n"
+ "zip1 v13.4s, v18.4s, v24.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q11, [x28], #0x10\n"
+ "zip2 v20.4s, v19.4s, v31.4s\n"
+ "zip2 v10.4s, v18.4s, v24.4s\n"
+ "ldr q22, [x27], #0x10\n"
+ "ldr q23, [x26], #0x10\n"
+ "zip1 v19.4s, v21.4s, v22.4s\n"
+ "zip1 v18.4s, v11.4s, v23.4s\n"
+ "zip2 v6.4s, v21.4s, v22.4s\n"
+ "zip2 v11.4s, v11.4s, v23.4s\n"
+ "zip1 v8.4s, v9.4s, v30.4s\n"
+ "zip1 v21.4s, v17.4s, v5.4s\n"
+ "zip1 v7.4s, v0.4s, v3.4s\n"
+ "zip1 v31.4s, v1.4s, v16.4s\n"
+ "zip1 v29.4s, v4.4s, v2.4s\n"
+ "zip1 v22.4s, v27.4s, v28.4s\n"
+ "zip1 v24.4s, v25.4s, v26.4s\n"
+ "zip1 v23.4s, v14.4s, v12.4s\n"
+ ".inst 0x0ea16908 // bfcvtn v8.4h, v8.4s\n"
+ "zip2 v9.4s, v9.4s, v30.4s\n"
+ "zip1 v30.4s, v15.4s, v13.4s\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ "zip2 v5.4s, v17.4s, v5.4s\n"
+ "zip1 v17.4s, v20.4s, v10.4s\n"
+ ".inst 0x0ea168e7 // bfcvtn v7.4h, v7.4s\n"
+ "zip2 v0.4s, v0.4s, v3.4s\n"
+ "zip1 v3.4s, v19.4s, v18.4s\n"
+ ".inst 0x0ea16bff // bfcvtn v31.4h, v31.4s\n"
+ "zip2 v16.4s, v1.4s, v16.4s\n"
+ "zip1 v1.4s, v6.4s, v11.4s\n"
+ ".inst 0x0ea16bbd // bfcvtn v29.4h, v29.4s\n"
+ "zip2 v2.4s, v4.4s, v2.4s\n"
+ ".inst 0x0ea16ac4 // bfcvtn v4.4h, v22.4s\n"
+ "zip2 v27.4s, v27.4s, v28.4s\n"
+ ".inst 0x0ea16b1c // bfcvtn v28.4h, v24.4s\n"
+ "zip2 v25.4s, v25.4s, v26.4s\n"
+ ".inst 0x0ea16afa // bfcvtn v26.4h, v23.4s\n"
+ "zip2 v14.4s, v14.4s, v12.4s\n"
+ ".inst 0x0ea16bd8 // bfcvtn v24.4h, v30.4s\n"
+ "zip2 v13.4s, v15.4s, v13.4s\n"
+ ".inst 0x0ea16a2f // bfcvtn v15.4h, v17.4s\n"
+ "zip2 v12.4s, v20.4s, v10.4s\n"
+ ".inst 0x0ea16874 // bfcvtn v20.4h, v3.4s\n"
+ "zip2 v10.4s, v19.4s, v18.4s\n"
+ ".inst 0x0ea16831 // bfcvtn v17.4h, v1.4s\n"
+ "zip2 v18.4s, v6.4s, v11.4s\n"
+ ".inst 0x4ea16928 // bfcvtn2 v8.8h, v9.4s\n"
+ ".inst 0x4ea168b5 // bfcvtn2 v21.8h, v5.4s\n"
+ "str q8, [x21, #0x0]\n"
+ ".inst 0x4ea16807 // bfcvtn2 v7.8h, v0.4s\n"
+ ".inst 0x4ea16a1f // bfcvtn2 v31.8h, v16.4s\n"
+ "str q21, [x21, #0x10]\n"
+ ".inst 0x4ea1685d // bfcvtn2 v29.8h, v2.4s\n"
+ ".inst 0x4ea16b64 // bfcvtn2 v4.8h, v27.4s\n"
+ "str q7, [x21, #0x20]\n"
+ ".inst 0x4ea16b3c // bfcvtn2 v28.8h, v25.4s\n"
+ ".inst 0x4ea169da // bfcvtn2 v26.8h, v14.4s\n"
+ "str q31, [x21, #0x30]\n"
+ ".inst 0x4ea169b8 // bfcvtn2 v24.8h, v13.4s\n"
+ ".inst 0x4ea1698f // bfcvtn2 v15.8h, v12.4s\n"
+ "str q29, [x21, #0x40]\n"
+ ".inst 0x4ea16954 // bfcvtn2 v20.8h, v10.4s\n"
+ ".inst 0x4ea16a51 // bfcvtn2 v17.8h, v18.4s\n"
+ "str q4, [x21, #0x50]\n"
+ "str q28, [x21, #0x60]\n"
+ "str q26, [x21, #0x70]\n"
+ "str q24, [x21, #0x80]\n"
+ "str q15, [x21, #0x90]\n"
+ "str q20, [x21, #0xa0]\n"
+ "str q17, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x20, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v30.4s, v21.4s, v17.4s\n"
+ "zip1 v29.4s, v20.4s, v16.4s\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v28.4s, v21.4s, v17.4s\n"
+ "zip2 v27.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v26.4s, v19.4s, v17.4s\n"
+ "zip1 v25.4s, v18.4s, v16.4s\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v8.4s, v19.4s, v17.4s\n"
+ "zip2 v24.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v7.4s, v21.4s, v17.4s\n"
+ "zip1 v6.4s, v20.4s, v16.4s\n"
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v5.4s, v21.4s, v17.4s\n"
+ "zip2 v4.4s, v20.4s, v16.4s\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v3.4s, v19.4s, v17.4s\n"
+ "zip1 v2.4s, v18.4s, v16.4s\n"
+ "zip2 v1.4s, v19.4s, v17.4s\n"
+ "zip2 v0.4s, v18.4s, v16.4s\n"
+ "zip1 v23.4s, v30.4s, v29.4s\n"
+ "zip1 v22.4s, v28.4s, v27.4s\n"
+ "zip1 v21.4s, v26.4s, v25.4s\n"
+ "zip1 v20.4s, v8.4s, v24.4s\n"
+ "zip1 v19.4s, v7.4s, v6.4s\n"
+ "zip1 v18.4s, v5.4s, v4.4s\n"
+ "zip1 v17.4s, v3.4s, v2.4s\n"
+ "zip1 v16.4s, v1.4s, v0.4s\n"
+ ".inst 0x0ea16aff // bfcvtn v31.4h, v23.4s\n"
+ "zip2 v30.4s, v30.4s, v29.4s\n"
+ ".inst 0x0ea16add // bfcvtn v29.4h, v22.4s\n"
+ "zip2 v28.4s, v28.4s, v27.4s\n"
+ ".inst 0x0ea16abb // bfcvtn v27.4h, v21.4s\n"
+ "zip2 v26.4s, v26.4s, v25.4s\n"
+ ".inst 0x0ea16a99 // bfcvtn v25.4h, v20.4s\n"
+ "zip2 v24.4s, v8.4s, v24.4s\n"
+ ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
+ "zip2 v22.4s, v7.4s, v6.4s\n"
+ ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
+ "zip2 v20.4s, v5.4s, v4.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v3.4s, v2.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v1.4s, v0.4s\n"
+ ".inst 0x4ea16bdf // bfcvtn2 v31.8h, v30.4s\n"
+ ".inst 0x4ea16b9d // bfcvtn2 v29.8h, v28.4s\n"
+ "str q31, [x21, #0x0]\n"
+ ".inst 0x4ea16b5b // bfcvtn2 v27.8h, v26.4s\n"
+ ".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n"
+ "str q29, [x21, #0x10]\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q27, [x21, #0x20]\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q25, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q21, [x21, #0x50]\n"
+ "str q19, [x21, #0x60]\n"
+ "str q17, [x21, #0x70]\n"
+ "add x21, x21, #0x80\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr q20, [x9], #0x10\n"
+ "ldr q19, [x28], #0x10\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.4s, v20.4s, v17.4s\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
+ "zip2 v21.4s, v20.4s, v17.4s\n"
+ "zip2 v20.4s, v19.4s, v16.4s\n"
+ "zip1 v17.4s, v22.4s, v18.4s\n"
+ "zip1 v16.4s, v21.4s, v20.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v22.4s, v18.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v21.4s, v20.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q19, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0xc0\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
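+
+// As above, width is in float elements; the fp32 -> bf16 narrowing is done
+// inside the kernel.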
+template<>
+void Transform<24, 4, true, VLType::None>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_24_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
new file mode 100644
index 0000000000..dcaf69d2a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
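+// Widening bf16 -> fp32 transpose: the conversion is a pure bit move, done
+// with shll/shll2 (#0x10), which shifts each 16-bit bf16 pattern into the
+// top half of a 32-bit IEEE float lane.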
+void a64_transpose_interleave_24_bf16fp32(float *out, const bfloat16 *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 12 * height * sizeof(float);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "sub x24, x24, #0x18\n"
+ "shll v26.4s, v18.4h, #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "ldr q25, [x20], #0x10\n"
+ "shll2 v24.4s, v18.8h, #0x10\n"
+ "shll v5.4s, v17.4h, #0x10\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "shll v21.4s, v23.4h, #0x10\n"
+ "shll2 v4.4s, v17.8h, #0x10\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q3, [x20], #0x10\n"
+ "shll v2.4s, v22.4h, #0x10\n"
+ "shll v1.4s, v16.4h, #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "shll2 v0.4s, v16.8h, #0x10\n"
+ "shll v31.4s, v20.4h, #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "shll v30.4s, v25.4h, #0x10\n"
+ "shll2 v29.4s, v25.8h, #0x10\n"
+ "shll v28.4s, v3.4h, #0x10\n"
+ "str q26, [x21, #0x0]\n"
+ "cmp x24, #0x18\n"
+ "shll2 v27.4s, v23.8h, #0x10\n"
+ "str q24, [x21, #0x10]\n"
+ "shll v26.4s, v19.4h, #0x10\n"
+ "shll2 v25.4s, v19.8h, #0x10\n"
+ "str q21, [x21, #0x20]\n"
+ "shll2 v24.4s, v22.8h, #0x10\n"
+ "shll v23.4s, v18.4h, #0x10\n"
+ "str q5, [x21, #0x30]\n"
+ "shll2 v22.4s, v18.8h, #0x10\n"
+ "shll2 v21.4s, v20.8h, #0x10\n"
+ "str q4, [x21, #0x40]\n"
+ "shll v20.4s, v17.4h, #0x10\n"
+ "shll2 v19.4s, v17.8h, #0x10\n"
+ "str q2, [x21, #0x50]\n"
+ "shll2 v18.4s, v3.8h, #0x10\n"
+ "shll v17.4s, v16.4h, #0x10\n"
+ "str q1, [x21, #0x60]\n"
+ "shll2 v16.4s, v16.8h, #0x10\n"
+ "str q0, [x21, #0x70]\n"
+ "str q31, [x21, #0x80]\n"
+ "str q30, [x21, #0x90]\n"
+ "str q29, [x21, #0xa0]\n"
+ "str q28, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q27, [x21, #0x0]\n"
+ "str q26, [x21, #0x10]\n"
+ "str q25, [x21, #0x20]\n"
+ "str q24, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q20, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x24, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q16, [x25], #0x10\n"
+ "ldr q21, [x23], #0x10\n"
+ "sub x24, x24, #0xc\n"
+ "cmp x24, #0xc\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q27, [x20], #0x10\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "shll2 v26.4s, v16.8h, #0x10\n"
+ "ldr d16, [x25], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "shll v25.4s, v16.4h, #0x10\n"
+ "shll v24.4s, v21.4h, #0x10\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "shll2 v23.4s, v21.8h, #0x10\n"
+ "shll v22.4s, v18.4h, #0x10\n"
+ "shll v21.4s, v20.4h, #0x10\n"
+ "shll2 v20.4s, v20.8h, #0x10\n"
+ "str q19, [x21, #0x0]\n"
+ "shll v19.4s, v17.4h, #0x10\n"
+ "shll v18.4s, v27.4h, #0x10\n"
+ "str q26, [x21, #0x10]\n"
+ "shll2 v17.4s, v27.8h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "str q25, [x21, #0x20]\n"
+ "str q24, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q20, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x25], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "shll v19.4s, v19.4h, #0x10\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x90]\n"
+ "add x21, x21, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x25], #0x2\n"
+ "ldr h18, [x23], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "shll v19.4s, v19.4h, #0x10\n"
+ "shll v18.4s, v18.4h, #0x10\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "str s19, [x21, #0x0]\n"
+ "str s18, [x21, #0x30]\n"
+ "str s17, [x21, #0x60]\n"
+ "str s16, [x21, #0x90]\n"
+ "add x21, x21, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x4\n"
+ "add %x[out], %x[out], #0xc0\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x20, %x[width]\n"
+ "mov x25, %x[in]\n"
+ "cmp x20, #0x18\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q16, [x25], #0x10\n"
+ "ldr q20, [x25], #0x10\n"
+ "sub x20, x20, #0x18\n"
+ "shll v18.4s, v16.4h, #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "shll2 v17.4s, v16.8h, #0x10\n"
+ "shll v16.4s, v20.4h, #0x10\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "cmp x20, #0x18\n"
+ "shll2 v18.4s, v20.8h, #0x10\n"
+ "shll v17.4s, v19.4h, #0x10\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "shll2 v16.4s, v19.8h, #0x10\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q17, [x25], #0x10\n"
+ "ldr d18, [x25], #0x8\n"
+ "sub x20, x20, #0xc\n"
+ "cmp x20, #0xc\n"
+ "shll v16.4s, v17.4h, #0x10\n"
+ "shll2 v17.4s, v17.8h, #0x10\n"
+ "str q16, [x21, #0x0]\n"
+ "shll v16.4s, v18.4h, #0x10\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d16, [x25], #0x8\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h16, [x25], #0x2\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x30\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // anonymous namespace
+template<>
+void Transform<12, 1, true, VLType::None>(
+ float *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_24_bf16fp32(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
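
For reference, the generated kernel above walks the input four rows at a time, widening blocks of 24 bfloat16 values per row to fp32 with SHLL/SHLL2 #16 (bfloat16 is the high half of an IEEE-754 single, so widening is a plain left shift) and emitting them in column chunks of 12 floats per row. A minimal scalar sketch of that layout, reconstructed from the wrapper's strides and not part of the patch itself:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar model of a64_transpose_interleave_24_bf16fp32: for every chunk of 12
// columns, each of the `height` rows contributes 12 consecutive fp32 values,
// so one chunk occupies 12 * height floats (the kernel's out_stride).
static void reference_transpose_interleave_24_bf16fp32(
    float *out, const uint16_t *in, size_t width, size_t in_stride_bytes, size_t height)
{
    const size_t chunk = 12;
    for (size_t x0 = 0; x0 < width; x0 += chunk) {
        const size_t n = std::min(chunk, width - x0);
        for (size_t y = 0; y < height; y++) {
            const uint16_t *row = reinterpret_cast<const uint16_t *>(
                reinterpret_cast<const char *>(in) + y * in_stride_bytes);
            for (size_t i = 0; i < n; i++) {
                // bf16 -> fp32 widening: shift the 16 payload bits into the
                // high half of the word, exactly what SHLL/SHLL2 #16 do above.
                uint32_t bits = static_cast<uint32_t>(row[x0 + i]) << 16;
                float f;
                std::memcpy(&f, &bits, sizeof(f));
                out[y * chunk + i] = f;
            }
        }
        out += chunk * height;
    }
}
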
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
new file mode 100644
index 0000000000..966b75664e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_24_fp16fp32(float *out, const __fp16 *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 12 * height * sizeof(float);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "sub x24, x24, #0x18\n"
+ "fcvtl v26.4s, v18.4h\n"
+ "ldr q16, [x22], #0x10\n"
+ "ldr q25, [x20], #0x10\n"
+ "fcvtl2 v24.4s, v18.8h\n"
+ "fcvtl v5.4s, v17.4h\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "fcvtl v21.4s, v23.4h\n"
+ "fcvtl2 v4.4s, v17.8h\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q3, [x20], #0x10\n"
+ "fcvtl v2.4s, v22.4h\n"
+ "fcvtl v1.4s, v16.4h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "fcvtl2 v0.4s, v16.8h\n"
+ "fcvtl v31.4s, v20.4h\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "fcvtl v30.4s, v25.4h\n"
+ "fcvtl2 v29.4s, v25.8h\n"
+ "fcvtl v28.4s, v3.4h\n"
+ "str q26, [x21, #0x0]\n"
+ "cmp x24, #0x18\n"
+ "fcvtl2 v27.4s, v23.8h\n"
+ "str q24, [x21, #0x10]\n"
+ "fcvtl v26.4s, v19.4h\n"
+ "fcvtl2 v25.4s, v19.8h\n"
+ "str q21, [x21, #0x20]\n"
+ "fcvtl2 v24.4s, v22.8h\n"
+ "fcvtl v23.4s, v18.4h\n"
+ "str q5, [x21, #0x30]\n"
+ "fcvtl2 v22.4s, v18.8h\n"
+ "fcvtl2 v21.4s, v20.8h\n"
+ "str q4, [x21, #0x40]\n"
+ "fcvtl v20.4s, v17.4h\n"
+ "fcvtl2 v19.4s, v17.8h\n"
+ "str q2, [x21, #0x50]\n"
+ "fcvtl2 v18.4s, v3.8h\n"
+ "fcvtl v17.4s, v16.4h\n"
+ "str q1, [x21, #0x60]\n"
+ "fcvtl2 v16.4s, v16.8h\n"
+ "str q0, [x21, #0x70]\n"
+ "str q31, [x21, #0x80]\n"
+ "str q30, [x21, #0x90]\n"
+ "str q29, [x21, #0xa0]\n"
+ "str q28, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q27, [x21, #0x0]\n"
+ "str q26, [x21, #0x10]\n"
+ "str q25, [x21, #0x20]\n"
+ "str q24, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q20, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x24, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q16, [x25], #0x10\n"
+ "ldr q21, [x23], #0x10\n"
+ "sub x24, x24, #0xc\n"
+ "cmp x24, #0xc\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q27, [x20], #0x10\n"
+ "fcvtl v19.4s, v16.4h\n"
+ "fcvtl2 v26.4s, v16.8h\n"
+ "ldr d16, [x25], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "fcvtl v25.4s, v16.4h\n"
+ "fcvtl v24.4s, v21.4h\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "fcvtl2 v23.4s, v21.8h\n"
+ "fcvtl v22.4s, v18.4h\n"
+ "fcvtl v21.4s, v20.4h\n"
+ "fcvtl2 v20.4s, v20.8h\n"
+ "str q19, [x21, #0x0]\n"
+ "fcvtl v19.4s, v17.4h\n"
+ "fcvtl v18.4s, v27.4h\n"
+ "str q26, [x21, #0x10]\n"
+ "fcvtl2 v17.4s, v27.8h\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "str q25, [x21, #0x20]\n"
+ "str q24, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q22, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q20, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x25], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "fcvtl v19.4s, v19.4h\n"
+ "fcvtl v18.4s, v18.4h\n"
+ "fcvtl v17.4s, v17.4h\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x90]\n"
+ "add x21, x21, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x25], #0x2\n"
+ "ldr h18, [x23], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "fcvtl v19.4s, v19.4h\n"
+ "fcvtl v18.4s, v18.4h\n"
+ "fcvtl v17.4s, v17.4h\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "str s19, [x21, #0x0]\n"
+ "str s18, [x21, #0x30]\n"
+ "str s17, [x21, #0x60]\n"
+ "str s16, [x21, #0x90]\n"
+ "add x21, x21, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x4\n"
+ "add %x[out], %x[out], #0xc0\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x20, %x[width]\n"
+ "mov x25, %x[in]\n"
+ "cmp x20, #0x18\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q16, [x25], #0x10\n"
+ "ldr q20, [x25], #0x10\n"
+ "sub x20, x20, #0x18\n"
+ "fcvtl v18.4s, v16.4h\n"
+ "ldr q19, [x25], #0x10\n"
+ "fcvtl2 v17.4s, v16.8h\n"
+ "fcvtl v16.4s, v20.4h\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "cmp x20, #0x18\n"
+ "fcvtl2 v18.4s, v20.8h\n"
+ "fcvtl v17.4s, v19.4h\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "fcvtl2 v16.4s, v19.8h\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q17, [x25], #0x10\n"
+ "ldr d18, [x25], #0x8\n"
+ "sub x20, x20, #0xc\n"
+ "cmp x20, #0xc\n"
+ "fcvtl v16.4s, v17.4h\n"
+ "fcvtl2 v17.4s, v17.8h\n"
+ "str q16, [x21, #0x0]\n"
+ "fcvtl v16.4s, v18.4h\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d16, [x25], #0x8\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h16, [x25], #0x2\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x30\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // anonymous namespace
+template<>
+void Transform<12, 1, true, VLType::None>(
+ float *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_24_fp16fp32(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
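
The fp16 variant above is structurally identical to the bfloat16 kernel but substitutes FCVTL/FCVTL2 for SHLL/SHLL2, because IEEE half precision needs a genuine format conversion rather than a bit shift. An illustrative per-element comparison (sketch only; AArch64 __fp16 support assumed):

#include <cstdint>
#include <cstring>

static inline float bf16_to_f32(uint16_t h)
{
    // What SHLL v.4s, v.4h, #16 does per lane in the bfloat16 kernel.
    uint32_t bits = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

static inline float fp16_to_f32(__fp16 h)
{
    // What FCVTL v.4s, v.4h does per lane in the kernel above: exponent and
    // mantissa are re-encoded, so a plain shift would give the wrong value.
    return static_cast<float>(h);
}
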
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
deleted file mode 100644
index bcbe2b84d8..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "transpose_interleave_common.hpp"
-
-// Generic unblocked transposed 12x32-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<12, 1, true, 4, 4, VLType::None>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a 24 x uint16_t specialisation
- TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride*2, x0*2, xmax*2, k0, kmax
- );
-}
-
-// Generic 24x16-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a uint16_t specialisation
- Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride, x0, xmax, k0, kmax
- );
-}
-
-// Specialised 24 x uint16_t version
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
- __asm __volatile (
- "LDP q0, q1, [%[in0]], #32\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR q2, [%[in0]], #16\n"
- "STR q2, [%[out], #32]\n"
- : [in0] "+r" (in0), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1,uint16_t *out) {
- __asm __volatile (
- "LDP q0, q1, [%[in0]], #32\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR q2, [%[in0]], #16\n"
- "LDP q3, q4, [%[in1]], #32\n"
- "STP q2, q3, [%[out], #32]\n"
- ASM_PREFETCH("[%[in1], #192]")
- "LDR q5, [%[in1]], #16\n"
- "STP q4, q5, [%[out], #64]\n"
- : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
- __asm __volatile (
- "LDP q0, q1, [%[in0]], #32\n"
- "STP q0, q1, [%[out]]\n"
- "LDR q2, [%[in0]], #16\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDP q3, q4, [%[in1]], #32\n"
- "STP q2, q3, [%[out], #32]\n"
- "LDR q5, [%[in1]], #16\n"
- ASM_PREFETCH("[%[in1], #192]")
- "STP q4, q5, [%[out], #64]\n"
- "LDP q6, q7, [%[in2]], #32\n"
- "STP q6, q7, [%[out], #96]\n"
- "LDR q8, [%[in2]], #16\n"
- ASM_PREFETCH("[%[in2], #192]")
- "LDP q9, q10, [%[in3]], #32\n"
- "STP q8, q9, [%[out], #128]\n"
- "LDR q11, [%[in3]], #16\n"
- "STP q10, q11, [%[out], #160]\n"
- ASM_PREFETCH("[%[in3], #192]")
-
- : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
- uint16_t* out, const uint16_t* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
new file mode 100644
index 0000000000..4a22675028
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_32_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 32 * roundup<size_t>(height, 4) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x10\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x17, %x[in]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
+ "add x14, x15, %x[in_stride]\n"
+ "add x13, x14, %x[in_stride]\n"
+ "add x12, x13, %x[in_stride]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x20\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x10\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q6, [x17], #0x10\n"
+ "ldr q31, [x16], #0x10\n"
+ "sub x24, x24, #0x20\n"
+ "cmp x24, #0x20\n"
+ "ldr q7, [x15], #0x10\n"
+ "ldr q0, [x14], #0x10\n"
+ "zip1 v9.16b, v6.16b, v7.16b\n"
+ "zip1 v20.16b, v31.16b, v0.16b\n"
+ "ldr q24, [x13], #0x10\n"
+ "ldr q19, [x12], #0x10\n"
+ "zip2 v30.16b, v6.16b, v7.16b\n"
+ "zip2 v12.16b, v31.16b, v0.16b\n"
+ "ldr q23, [x11], #0x10\n"
+ "ldr q17, [x10], #0x10\n"
+ "zip1 v13.16b, v24.16b, v23.16b\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "ldr q0, [x9], #0x10\n"
+ "ldr q31, [x28], #0x10\n"
+ "zip2 v15.16b, v24.16b, v23.16b\n"
+ "zip2 v11.16b, v19.16b, v17.16b\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q4, [x26], #0x10\n"
+ "zip1 v1.16b, v0.16b, v17.16b\n"
+ "zip1 v21.16b, v31.16b, v4.16b\n"
+ "ldr q28, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v0.16b, v0.16b, v17.16b\n"
+ "zip2 v26.16b, v31.16b, v4.16b\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q19, [x20], #0x10\n"
+ "zip1 v23.16b, v28.16b, v17.16b\n"
+ "zip1 v25.16b, v18.16b, v19.16b\n"
+ "ldr q2, [x17], #0x10\n"
+ "ldr q3, [x16], #0x10\n"
+ "zip2 v7.16b, v28.16b, v17.16b\n"
+ "zip2 v8.16b, v18.16b, v19.16b\n"
+ "ldr q22, [x15], #0x10\n"
+ "ldr q27, [x14], #0x10\n"
+ "zip1 v19.16b, v2.16b, v22.16b\n"
+ "zip1 v17.16b, v3.16b, v27.16b\n"
+ "ldr q6, [x13], #0x10\n"
+ "ldr q4, [x12], #0x10\n"
+ "zip2 v24.16b, v2.16b, v22.16b\n"
+ "zip2 v22.16b, v3.16b, v27.16b\n"
+ "ldr q14, [x11], #0x10\n"
+ "ldr q18, [x10], #0x10\n"
+ "zip1 v29.16b, v6.16b, v14.16b\n"
+ "zip1 v31.16b, v4.16b, v18.16b\n"
+ "ldr q2, [x9], #0x10\n"
+ "ldr q10, [x28], #0x10\n"
+ "zip2 v28.16b, v6.16b, v14.16b\n"
+ "zip2 v27.16b, v4.16b, v18.16b\n"
+ "ldr q6, [x27], #0x10\n"
+ "ldr q5, [x26], #0x10\n"
+ "zip1 v14.16b, v2.16b, v6.16b\n"
+ "zip1 v4.16b, v10.16b, v5.16b\n"
+ "ldr q3, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v6.16b, v2.16b, v6.16b\n"
+ "zip2 v10.16b, v10.16b, v5.16b\n"
+ "ldr q5, [x22], #0x10\n"
+ "zip1 v2.16b, v3.16b, v5.16b\n"
+ "zip2 v3.16b, v3.16b, v5.16b\n"
+ "zip1 v5.16b, v9.16b, v20.16b\n"
+ "zip2 v20.16b, v9.16b, v20.16b\n"
+ "ldr q9, [x20], #0x10\n"
+ "str q5, [x21, #0x0]\n"
+ "zip1 v5.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "str q20, [x21, #0x10]\n"
+ "zip1 v18.16b, v30.16b, v12.16b\n"
+ "zip2 v30.16b, v30.16b, v12.16b\n"
+ "str q18, [x21, #0x20]\n"
+ "zip1 v20.16b, v19.16b, v17.16b\n"
+ "zip2 v12.16b, v19.16b, v17.16b\n"
+ "str q30, [x21, #0x30]\n"
+ "zip1 v18.16b, v24.16b, v22.16b\n"
+ "zip2 v17.16b, v24.16b, v22.16b\n"
+ "str q20, [x21, #0x40]\n"
+ "zip1 v30.16b, v13.16b, v16.16b\n"
+ "zip2 v24.16b, v13.16b, v16.16b\n"
+ "str q12, [x21, #0x50]\n"
+ "zip1 v22.16b, v15.16b, v11.16b\n"
+ "zip2 v20.16b, v15.16b, v11.16b\n"
+ "str q18, [x21, #0x60]\n"
+ "zip1 v19.16b, v29.16b, v31.16b\n"
+ "zip2 v18.16b, v29.16b, v31.16b\n"
+ "str q17, [x21, #0x70]\n"
+ "zip1 v17.16b, v28.16b, v27.16b\n"
+ "zip2 v16.16b, v28.16b, v27.16b\n"
+ "str q30, [x21, #0x80]\n"
+ "zip1 v31.16b, v1.16b, v21.16b\n"
+ "zip2 v1.16b, v1.16b, v21.16b\n"
+ "str q24, [x21, #0x90]\n"
+ "zip1 v30.16b, v0.16b, v26.16b\n"
+ "zip2 v29.16b, v0.16b, v26.16b\n"
+ "str q22, [x21, #0xa0]\n"
+ "zip1 v28.16b, v14.16b, v4.16b\n"
+ "zip2 v27.16b, v14.16b, v4.16b\n"
+ "str q20, [x21, #0xb0]\n"
+ "zip1 v26.16b, v6.16b, v10.16b\n"
+ "zip2 v24.16b, v6.16b, v10.16b\n"
+ "str q19, [x21, #0xc0]\n"
+ "zip1 v14.16b, v23.16b, v25.16b\n"
+ "zip2 v22.16b, v23.16b, v25.16b\n"
+ "str q18, [x21, #0xd0]\n"
+ "zip1 v21.16b, v7.16b, v8.16b\n"
+ "zip2 v20.16b, v7.16b, v8.16b\n"
+ "str q17, [x21, #0xe0]\n"
+ "zip1 v19.16b, v2.16b, v5.16b\n"
+ "zip2 v18.16b, v2.16b, v5.16b\n"
+ "str q16, [x21, #0xf0]\n"
+ "zip1 v17.16b, v3.16b, v9.16b\n"
+ "zip2 v16.16b, v3.16b, v9.16b\n"
+ "str q31, [x21, #0x100]\n"
+ "str q1, [x21, #0x110]\n"
+ "str q30, [x21, #0x120]\n"
+ "str q29, [x21, #0x130]\n"
+ "str q28, [x21, #0x140]\n"
+ "str q27, [x21, #0x150]\n"
+ "str q26, [x21, #0x160]\n"
+ "str q24, [x21, #0x170]\n"
+ "str q14, [x21, #0x180]\n"
+ "str q22, [x21, #0x190]\n"
+ "str q21, [x21, #0x1a0]\n"
+ "str q20, [x21, #0x1b0]\n"
+ "str q19, [x21, #0x1c0]\n"
+ "str q18, [x21, #0x1d0]\n"
+ "str q17, [x21, #0x1e0]\n"
+ "str q16, [x21, #0x1f0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x24, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 16 loop: loop
+ "ldr q21, [x17], #0x10\n"
+ "ldr q20, [x16], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q17, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v3.16b, v21.16b, v17.16b\n"
+ "zip1 v2.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x13], #0x10\n"
+ "ldr q18, [x12], #0x10\n"
+ "zip2 v1.16b, v21.16b, v17.16b\n"
+ "zip2 v0.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x11], #0x10\n"
+ "ldr q16, [x10], #0x10\n"
+ "zip1 v31.16b, v19.16b, v17.16b\n"
+ "zip1 v30.16b, v18.16b, v16.16b\n"
+ "ldr q24, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v29.16b, v19.16b, v17.16b\n"
+ "zip2 v23.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.16b, v24.16b, v17.16b\n"
+ "zip1 v21.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v28.16b, v24.16b, v17.16b\n"
+ "zip2 v20.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v27.16b, v19.16b, v17.16b\n"
+ "zip1 v26.16b, v18.16b, v16.16b\n"
+ "zip2 v25.16b, v19.16b, v17.16b\n"
+ "zip2 v24.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v3.16b, v2.16b\n"
+ "zip2 v18.16b, v3.16b, v2.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v17.16b, v1.16b, v0.16b\n"
+ "zip2 v16.16b, v1.16b, v0.16b\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "zip1 v19.16b, v31.16b, v30.16b\n"
+ "zip2 v18.16b, v31.16b, v30.16b\n"
+ "str q16, [x21, #0x30]\n"
+ "zip1 v17.16b, v29.16b, v23.16b\n"
+ "zip2 v16.16b, v29.16b, v23.16b\n"
+ "str q19, [x21, #0x80]\n"
+ "zip1 v23.16b, v22.16b, v21.16b\n"
+ "zip2 v22.16b, v22.16b, v21.16b\n"
+ "str q18, [x21, #0x90]\n"
+ "zip1 v21.16b, v28.16b, v20.16b\n"
+ "zip2 v20.16b, v28.16b, v20.16b\n"
+ "str q17, [x21, #0xa0]\n"
+ "zip1 v19.16b, v27.16b, v26.16b\n"
+ "zip2 v18.16b, v27.16b, v26.16b\n"
+ "str q16, [x21, #0xb0]\n"
+ "zip1 v17.16b, v25.16b, v24.16b\n"
+ "zip2 v16.16b, v25.16b, v24.16b\n"
+ "str q23, [x21, #0x100]\n"
+ "str q22, [x21, #0x110]\n"
+ "str q21, [x21, #0x120]\n"
+ "str q20, [x21, #0x130]\n"
+ "str q19, [x21, #0x180]\n"
+ "str q18, [x21, #0x190]\n"
+ "str q17, [x21, #0x1a0]\n"
+ "str q16, [x21, #0x1b0]\n"
+ "add x21, x21, #0x40\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 16 loop: skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr s19, [x17], #0x4\n"
+ "ldr s18, [x16], #0x4\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr s16, [x14], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s19, [x13], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
+ "zip1 v22.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x11], #0x4\n"
+ "ldr s16, [x10], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str q22, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q21, [x21, #0x80]\n"
+ "str q18, [x21, #0x100]\n"
+ "str q16, [x21, #0x180]\n"
+ "add x21, x21, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr b19, [x17], #0x1\n"
+ "ldr b18, [x16], #0x1\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr b17, [x15], #0x1\n"
+ "ldr b16, [x14], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b19, [x13], #0x1\n"
+ "ldr b18, [x12], #0x1\n"
+ "zip1 v22.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x11], #0x1\n"
+ "ldr b16, [x10], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b19, [x9], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b16, [x26], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b20, [x25], #0x1\n"
+ "ldr b19, [x23], #0x1\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x22], #0x1\n"
+ "ldr b16, [x20], #0x1\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str s22, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s21, [x21, #0x80]\n"
+ "str s18, [x21, #0x100]\n"
+ "str s16, [x21, #0x180]\n"
+ "add x21, x21, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x10\n"
+ "add %x[out], %x[out], #0x200\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x17, %x[in]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
+ "mov x20, %x[width]\n"
+ "add x14, x15, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x14, %x[in_stride]\n"
+ "csel x14, x14, %x[pad_row], GT\n"
+ "csel x15, x15, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x16, x16, %x[pad_row], GT\n"
+ "cmp x20, #0x20\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q19, [x17], #0x10\n"
+ "ldr q18, [x16], #0x10\n"
+ "sub x20, x20, #0x20\n"
+ "cmp x20, #0x20\n"
+ "ldr q17, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v25.16b, v19.16b, v17.16b\n"
+ "zip1 v24.16b, v18.16b, v16.16b\n"
+ "ldr q22, [x17], #0x10\n"
+ "ldr q21, [x16], #0x10\n"
+ "zip2 v20.16b, v19.16b, v17.16b\n"
+ "zip2 v19.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v23.16b, v22.16b, v17.16b\n"
+ "zip1 v18.16b, v21.16b, v16.16b\n"
+ "zip2 v22.16b, v22.16b, v17.16b\n"
+ "zip2 v21.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v25.16b, v24.16b\n"
+ "zip2 v17.16b, v25.16b, v24.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.16b, v20.16b, v19.16b\n"
+ "zip2 v20.16b, v20.16b, v19.16b\n"
+ "str q17, [x21, #0x10]\n"
+ "zip1 v19.16b, v23.16b, v18.16b\n"
+ "zip2 v18.16b, v23.16b, v18.16b\n"
+ "str q16, [x21, #0x20]\n"
+ "zip1 v17.16b, v22.16b, v21.16b\n"
+ "zip2 v16.16b, v22.16b, v21.16b\n"
+ "str q20, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x20, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
+ "ldr q20, [x17], #0x10\n"
+ "ldr q21, [x16], #0x10\n"
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "ldr q19, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v18.16b, v20.16b, v19.16b\n"
+ "zip1 v17.16b, v21.16b, v16.16b\n"
+ "zip2 v20.16b, v20.16b, v19.16b\n"
+ "zip2 v19.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "zip2 v18.16b, v18.16b, v17.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip2 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr s19, [x17], #0x4\n"
+ "ldr s18, [x16], #0x4\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr s16, [x14], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr b19, [x17], #0x1\n"
+ "ldr b18, [x16], #0x1\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "ldr b17, [x15], #0x1\n"
+ "ldr b16, [x14], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x80\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<32, 4, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_32_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<32, 4, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_32_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
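
The zip1/zip2 trees in the kernel above implement a byte-level 1x4 block transpose: columns are processed in chunks of 32 and, within a chunk, each column stores its four source rows contiguously, with pad_row supplying zeros when height is not a multiple of 4. A scalar sketch of that layout as read from the store offsets (not part of the patch):

#include <algorithm>
#include <cstddef>
#include <cstdint>

static void reference_transpose_interleave_32_1x4(
    uint8_t *out, const uint8_t *in, size_t width, size_t in_stride_bytes, size_t height)
{
    const size_t kw = 32, kh = 4;                    // chunk width, rows per group
    const size_t padded_h = (height + kh - 1) / kh * kh;
    for (size_t x0 = 0; x0 < width; x0 += kw) {
        const size_t n = std::min(kw, width - x0);
        for (size_t y0 = 0; y0 < padded_h; y0 += kh) {
            for (size_t x = 0; x < n; x++) {
                for (size_t r = 0; r < kh; r++) {
                    const size_t y = y0 + r;
                    // Four rows of one column land back to back; rows past
                    // `height` read from the zeroed pad_row in the kernel.
                    out[y0 * kw + x * kh + r] =
                        (y < height) ? in[y * in_stride_bytes + x0 + x] : 0;
                }
            }
        }
        out += kw * padded_h;                        // one out_stride step
    }
}
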
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
new file mode 100644
index 0000000000..237536697c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_32_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 32 * roundup<size_t>(height, 2) * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 12f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x40\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q14, [x25], #0x10\n"
+ "ldr q10, [x23], #0x10\n"
+ "sub x24, x24, #0x40\n"
+ "zip1 v12.8h, v14.8h, v10.8h\n"
+ "ldr q5, [x22], #0x10\n"
+ "ldr q3, [x20], #0x10\n"
+ "zip2 v31.8h, v14.8h, v10.8h\n"
+ "zip1 v19.8h, v5.8h, v3.8h\n"
+ "ldr q27, [x25], #0x10\n"
+ "ldr q25, [x23], #0x10\n"
+ "zip1 v11.8h, v27.8h, v25.8h\n"
+ "zip2 v24.8h, v27.8h, v25.8h\n"
+ "ldr q6, [x22], #0x10\n"
+ "ldr q29, [x20], #0x10\n"
+ "zip2 v15.8h, v5.8h, v3.8h\n"
+ "zip1 v18.8h, v6.8h, v29.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q9, [x23], #0x10\n"
+ "zip1 v0.8h, v17.8h, v9.8h\n"
+ "zip2 v9.8h, v17.8h, v9.8h\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "zip2 v8.8h, v6.8h, v29.8h\n"
+ "zip1 v30.8h, v21.8h, v20.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q5, [x23], #0x10\n"
+ "zip1 v13.8h, v17.8h, v5.8h\n"
+ "zip2 v25.8h, v17.8h, v5.8h\n"
+ "ldr q7, [x22], #0x10\n"
+ "ldr q29, [x20], #0x10\n"
+ "zip2 v27.8h, v21.8h, v20.8h\n"
+ "zip1 v14.8h, v7.8h, v29.8h\n"
+ "ldr q28, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip2 v1.8h, v7.8h, v29.8h\n"
+ "cmp x24, #0x40\n"
+ "ldr q10, [x22], #0x10\n"
+ "ldr q21, [x20], #0x10\n"
+ "zip1 v16.8h, v28.8h, v17.8h\n"
+ "zip2 v17.8h, v28.8h, v17.8h\n"
+ "ldr q5, [x25], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip1 v3.8h, v5.8h, v20.8h\n"
+ "zip2 v7.8h, v5.8h, v20.8h\n"
+ "ldr q22, [x22], #0x10\n"
+ "ldr q29, [x20], #0x10\n"
+ "zip1 v2.8h, v10.8h, v21.8h\n"
+ "zip2 v5.8h, v10.8h, v21.8h\n"
+ "ldr q21, [x25], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip1 v4.8h, v21.8h, v20.8h\n"
+ "zip2 v28.8h, v21.8h, v20.8h\n"
+ "ldr q6, [x22], #0x10\n"
+ "ldr q10, [x20], #0x10\n"
+ "zip1 v26.8h, v22.8h, v29.8h\n"
+ "zip2 v20.8h, v22.8h, v29.8h\n"
+ "ldr q29, [x25], #0x10\n"
+ "ldr q23, [x23], #0x10\n"
+ "zip1 v21.8h, v29.8h, v23.8h\n"
+ "zip2 v23.8h, v29.8h, v23.8h\n"
+ "ldr q22, [x22], #0x10\n"
+ "ldr q29, [x20], #0x10\n"
+ "str q12, [x21, #0x0]\n"
+ "zip1 v12.8h, v6.8h, v10.8h\n"
+ "str q31, [x21, #0x10]\n"
+ "zip2 v6.8h, v6.8h, v10.8h\n"
+ "zip1 v31.8h, v22.8h, v29.8h\n"
+ "str q11, [x21, #0x20]\n"
+ "zip2 v11.8h, v22.8h, v29.8h\n"
+ "str q24, [x21, #0x30]\n"
+ "str q0, [x21, #0x40]\n"
+ "str q9, [x21, #0x50]\n"
+ "str q13, [x21, #0x60]\n"
+ "str q25, [x21, #0x70]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q15, [x21, #0x90]\n"
+ "str q18, [x21, #0xa0]\n"
+ "str q8, [x21, #0xb0]\n"
+ "str q30, [x21, #0xc0]\n"
+ "str q27, [x21, #0xd0]\n"
+ "str q14, [x21, #0xe0]\n"
+ "str q1, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q16, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q3, [x21, #0x20]\n"
+ "str q7, [x21, #0x30]\n"
+ "str q4, [x21, #0x40]\n"
+ "str q28, [x21, #0x50]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q23, [x21, #0x70]\n"
+ "str q2, [x21, #0x80]\n"
+ "str q5, [x21, #0x90]\n"
+ "str q26, [x21, #0xa0]\n"
+ "str q20, [x21, #0xb0]\n"
+ "str q12, [x21, #0xc0]\n"
+ "str q6, [x21, #0xd0]\n"
+ "str q31, [x21, #0xe0]\n"
+ "str q11, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x24, #0x20\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "sub x24, x24, #0x20\n"
+ "cmp x24, #0x20\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q18, [x20], #0x10\n"
+ "zip1 v1.8h, v17.8h, v16.8h\n"
+ "zip2 v0.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v31.8h, v17.8h, v16.8h\n"
+ "zip2 v30.8h, v17.8h, v16.8h\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q19, [x20], #0x10\n"
+ "zip1 v29.8h, v21.8h, v18.8h\n"
+ "zip2 v28.8h, v21.8h, v18.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v27.8h, v17.8h, v16.8h\n"
+ "zip2 v26.8h, v17.8h, v16.8h\n"
+ "ldr q25, [x22], #0x10\n"
+ "ldr q18, [x20], #0x10\n"
+ "zip1 v24.8h, v20.8h, v19.8h\n"
+ "zip2 v23.8h, v20.8h, v19.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
+ "zip2 v21.8h, v17.8h, v16.8h\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v19.8h, v25.8h, v18.8h\n"
+ "zip2 v18.8h, v25.8h, v18.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q1, [x21, #0x0]\n"
+ "str q0, [x21, #0x10]\n"
+ "str q31, [x21, #0x20]\n"
+ "str q30, [x21, #0x30]\n"
+ "str q27, [x21, #0x40]\n"
+ "str q26, [x21, #0x50]\n"
+ "str q22, [x21, #0x60]\n"
+ "str q21, [x21, #0x70]\n"
+ "str q29, [x21, #0x80]\n"
+ "str q28, [x21, #0x90]\n"
+ "str q24, [x21, #0xa0]\n"
+ "str q23, [x21, #0xb0]\n"
+ "str q19, [x21, #0xc0]\n"
+ "str q18, [x21, #0xd0]\n"
+ "str q17, [x21, #0xe0]\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x24, #0x10\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 16 loop: loop
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q24, [x22], #0x10\n"
+ "ldr q23, [x20], #0x10\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "zip2 v18.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
+ "zip2 v21.8h, v17.8h, v16.8h\n"
+ "ldr q20, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q19, [x21, #0x0]\n"
+ "zip1 v19.8h, v24.8h, v23.8h\n"
+ "str q18, [x21, #0x10]\n"
+ "zip2 v18.8h, v24.8h, v23.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q22, [x21, #0x20]\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q21, [x21, #0x30]\n"
+ "str q19, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q17, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, #0x40\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 16 loop: skip
+ "cmp x24, #0x4\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x25], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d17, [x20], #0x8\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x21, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 11f\n"
+ "10:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x25], #0x2\n"
+ "ldr h16, [x23], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h18, [x22], #0x2\n"
+ "ldr h17, [x20], #0x2\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "str s16, [x21, #0x0]\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str s16, [x21, #0x80]\n"
+ "add x21, x21, #0x4\n"
+ "bge 10b\n"
+ "11:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x4\n"
+ "add %x[out], %x[out], #0x100\n"
+ "bge 1b\n"
+ "cbz %x[height], 24f\n"
+ "12:" // Main loop skip
+ "13:" // Tail row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x20, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "cmp x20, #0x40\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Unroll column loop
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "sub x20, x20, #0x40\n"
+ "zip1 v0.8h, v18.8h, v17.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip2 v31.8h, v18.8h, v17.8h\n"
+ "zip1 v30.8h, v19.8h, v16.8h\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip2 v29.8h, v19.8h, v16.8h\n"
+ "zip1 v28.8h, v18.8h, v17.8h\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip2 v27.8h, v18.8h, v17.8h\n"
+ "zip1 v26.8h, v19.8h, v16.8h\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip2 v25.8h, v19.8h, v16.8h\n"
+ "cmp x20, #0x40\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v24.8h, v18.8h, v17.8h\n"
+ "zip2 v23.8h, v18.8h, v17.8h\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip1 v22.8h, v19.8h, v16.8h\n"
+ "zip2 v21.8h, v19.8h, v16.8h\n"
+ "ldr q20, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "str q0, [x21, #0x0]\n"
+ "zip1 v19.8h, v18.8h, v17.8h\n"
+ "str q31, [x21, #0x10]\n"
+ "zip2 v18.8h, v18.8h, v17.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q30, [x21, #0x20]\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q29, [x21, #0x30]\n"
+ "str q28, [x21, #0x40]\n"
+ "str q27, [x21, #0x50]\n"
+ "str q26, [x21, #0x60]\n"
+ "str q25, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q23, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q21, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0x20\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: Column loop
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "sub x20, x20, #0x20\n"
+ "cmp x20, #0x20\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v24.8h, v18.8h, v17.8h\n"
+ "zip2 v23.8h, v18.8h, v17.8h\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip1 v22.8h, v19.8h, v16.8h\n"
+ "zip2 v21.8h, v19.8h, v16.8h\n"
+ "ldr q20, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v19.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v18.8h, v17.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q24, [x21, #0x0]\n"
+ "str q23, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q21, [x21, #0x30]\n"
+ "str q19, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q17, [x21, #0x60]\n"
+ "str q16, [x21, #0x70]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: Column loop skip
+ "cmp x20, #0x10\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 16 loop: loop
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "ldr q20, [x25], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v19.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v18.8h, v17.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 16 loop: skip
+ "cmp x20, #0x4\n"
+ "blt 21f\n"
+ "20:" // Tail row loop: width 4 loop: loop
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 20b\n"
+ "21:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 23f\n"
+ "22:" // Tail row loop: width 1 loop: loop
+ "ldr h17, [x25], #0x2\n"
+ "ldr h16, [x23], #0x2\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 22b\n"
+ "23:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x80\n"
+ "bge 13b\n"
+ "24:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<32, 2, true, VLType::None>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_32_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
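
The 2x2 kernel above applies the same blocking scheme to 16-bit data with row pairs instead of row quads: each zip1/zip2 on .8h lanes interleaves one pair of rows across a 32-column chunk. A generic sketch covering both kernels (an assumed helper, not in the patch), where the previous kernel is the T = uint8_t, KH = 4 instance and this one is T = uint16_t, KH = 2:

#include <algorithm>
#include <cstddef>

template <typename T, size_t KW, size_t KH>
static void reference_block_transpose(
    T *out, const T *in, size_t width, size_t in_stride_elems, size_t height)
{
    const size_t padded_h = (height + KH - 1) / KH * KH;
    for (size_t x0 = 0; x0 < width; x0 += KW) {
        const size_t n = std::min(KW, width - x0);
        for (size_t y0 = 0; y0 < padded_h; y0 += KH) {
            for (size_t x = 0; x < n; x++) {
                for (size_t r = 0; r < KH; r++) {
                    // KH rows of one column are stored contiguously; missing
                    // rows (height not a multiple of KH) come out as zero.
                    out[y0 * KW + x * KH + r] =
                        (y0 + r < height) ? in[(y0 + r) * in_stride_elems + x0 + x] : T(0);
                }
            }
        }
        out += KW * padded_h;
    }
}

Note the stride here is in elements, whereas the generated kernels take a byte stride.
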
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
new file mode 100644
index 0000000000..f35752d5a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_48(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 24 * height * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q27, [x25], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
+ "sub x24, x24, #0x18\n"
+ "cmp x24, #0x18\n"
+ "ldr q25, [x22], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q27, [x21, #0x0]\n"
+ "str q23, [x21, #0x10]\n"
+ "str q19, [x21, #0x20]\n"
+ "str q26, [x21, #0x30]\n"
+ "str q22, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q25, [x21, #0x60]\n"
+ "str q21, [x21, #0x70]\n"
+ "str q17, [x21, #0x80]\n"
+ "str q24, [x21, #0x90]\n"
+ "str q20, [x21, #0xa0]\n"
+ "str q16, [x21, #0xb0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x24, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 16 loop: loop
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q23, [x21, #0x0]\n"
+ "str q19, [x21, #0x10]\n"
+ "str q22, [x21, #0x30]\n"
+ "str q18, [x21, #0x40]\n"
+ "str q21, [x21, #0x60]\n"
+ "str q17, [x21, #0x70]\n"
+ "str q20, [x21, #0x90]\n"
+ "str q16, [x21, #0xa0]\n"
+ "add x21, x21, #0x20\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 16 loop: skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x25], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "str d19, [x21, #0x0]\n"
+ "str d18, [x21, #0x30]\n"
+ "str d17, [x21, #0x60]\n"
+ "str d16, [x21, #0x90]\n"
+ "add x21, x21, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x25], #0x2\n"
+ "ldr h18, [x23], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "str h19, [x21, #0x0]\n"
+ "str h18, [x21, #0x30]\n"
+ "str h17, [x21, #0x60]\n"
+ "str h16, [x21, #0x90]\n"
+ "add x21, x21, #0x2\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x4\n"
+ "add %x[out], %x[out], #0xc0\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x20, %x[width]\n"
+ "mov x25, %x[in]\n"
+ "cmp x20, #0x18\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "sub x20, x20, #0x18\n"
+ "cmp x20, #0x18\n"
+ "ldr q16, [x25], #0x10\n"
+ "str q18, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q16, [x21, #0x20]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x20, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "str q17, [x21, #0x0]\n"
+ "str q16, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d16, [x25], #0x8\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h16, [x25], #0x2\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "str h16, [x21, #0x0]\n"
+ "add x21, x21, #0x2\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x30\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<12, 1, true, VLType::None>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_48(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<24, 1, true, VLType::None>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_48(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<6, 1, true, VLType::None>(
+ double *out, const double *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_48(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(double) / 2,
+ stride * sizeof(double),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
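
Unlike the converting kernels earlier in the patch, a64_transpose_interleave_48 only rearranges memory: each chunk copies 24 consecutive 16-bit units per row, which is why the same body can back the float (12 x 32-bit), __fp16 (24 x 16-bit) and double (6 x 64-bit) Transform specialisations by reinterpreting the element size. A minimal scalar sketch, not part of the patch:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>

static void reference_transpose_interleave_48(
    uint16_t *out, const uint16_t *in, size_t width, size_t in_stride_bytes, size_t height)
{
    const size_t chunk = 24;                         // 24 uint16s == 48 bytes per row
    for (size_t x0 = 0; x0 < width; x0 += chunk) {
        const size_t n = std::min(chunk, width - x0);
        for (size_t y = 0; y < height; y++) {
            const uint16_t *row = reinterpret_cast<const uint16_t *>(
                reinterpret_cast<const char *>(in) + y * in_stride_bytes);
            // Pure copy: no widening or zipping, just row-major within a chunk.
            std::memcpy(out + y * chunk, row + x0, n * sizeof(uint16_t));
        }
        out += chunk * height;                       // one out_stride step
    }
}
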
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
new file mode 100644
index 0000000000..6ef02ac044
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_4_1x16(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 16) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 16) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+ "1:" // Main row loop: Head
+ "mov x17, %x[in]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
+ "add x14, x15, %x[in_stride]\n"
+ "add x13, x14, %x[in_stride]\n"
+ "add x12, x13, %x[in_stride]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "cmp %x[height], #0xf\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x23, x23, %x[pad_row], GE\n"
+ "cmp %x[height], #0xd\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0xb\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x9\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "csel x9, x9, %x[pad_row], GE\n"
+ "cmp %x[height], #0x7\n"
+ "csel x10, x10, %x[pad_row], GT\n"
+ "csel x11, x11, %x[pad_row], GE\n"
+ "cmp %x[height], #0x5\n"
+ "mov x21, %x[width]\n"
+ "csel x12, x12, %x[pad_row], GT\n"
+ "csel x13, x13, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x14, x14, %x[pad_row], GT\n"
+ "csel x15, x15, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x16, x16, %x[pad_row], GT\n"
+ "cmp x21, #0x10\n"
+ "mov x20, %x[out]\n"
+ "sub %x[height], %x[height], #0x10\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q3, [x17], #0x10\n"
+ "ldr q9, [x16], #0x10\n"
+ "sub x21, x21, #0x10\n"
+ "cmp x21, #0x10\n"
+ "ldr q2, [x15], #0x10\n"
+ "ldr q8, [x14], #0x10\n"
+ "ldr q0, [x13], #0x10\n"
+ "ldr q31, [x12], #0x10\n"
+ "ldr q30, [x11], #0x10\n"
+ "ldr q7, [x10], #0x10\n"
+ "ldr q29, [x9], #0x10\n"
+ "ldr q28, [x28], #0x10\n"
+ "zip1 v27.16b, v3.16b, v29.16b\n"
+ "zip1 v6.16b, v9.16b, v28.16b\n"
+ "ldr q25, [x27], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v26.16b, v2.16b, v25.16b\n"
+ "zip1 v1.16b, v8.16b, v24.16b\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x24], #0x10\n"
+ "zip1 v21.16b, v0.16b, v23.16b\n"
+ "zip1 v20.16b, v31.16b, v22.16b\n"
+ "ldr q19, [x23], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "zip1 v17.16b, v30.16b, v19.16b\n"
+ "zip1 v16.16b, v7.16b, v18.16b\n"
+ "zip2 v5.16b, v3.16b, v29.16b\n"
+ "zip2 v0.16b, v0.16b, v23.16b\n"
+ "zip2 v4.16b, v2.16b, v25.16b\n"
+ "zip2 v3.16b, v30.16b, v19.16b\n"
+ "zip2 v2.16b, v9.16b, v28.16b\n"
+ "zip2 v31.16b, v31.16b, v22.16b\n"
+ "zip2 v30.16b, v8.16b, v24.16b\n"
+ "zip2 v29.16b, v7.16b, v18.16b\n"
+ "zip1 v25.16b, v27.16b, v21.16b\n"
+ "zip1 v24.16b, v26.16b, v17.16b\n"
+ "zip1 v23.16b, v6.16b, v20.16b\n"
+ "zip1 v22.16b, v1.16b, v16.16b\n"
+ "zip2 v28.16b, v27.16b, v21.16b\n"
+ "zip2 v27.16b, v26.16b, v17.16b\n"
+ "zip2 v26.16b, v6.16b, v20.16b\n"
+ "zip2 v21.16b, v1.16b, v16.16b\n"
+ "zip1 v1.16b, v5.16b, v0.16b\n"
+ "zip1 v20.16b, v4.16b, v3.16b\n"
+ "zip1 v19.16b, v2.16b, v31.16b\n"
+ "zip1 v16.16b, v30.16b, v29.16b\n"
+ "zip1 v18.16b, v25.16b, v24.16b\n"
+ "zip1 v17.16b, v23.16b, v22.16b\n"
+ "zip2 v25.16b, v25.16b, v24.16b\n"
+ "zip2 v24.16b, v23.16b, v22.16b\n"
+ "zip2 v0.16b, v5.16b, v0.16b\n"
+ "zip2 v23.16b, v4.16b, v3.16b\n"
+ "zip2 v31.16b, v2.16b, v31.16b\n"
+ "zip2 v22.16b, v30.16b, v29.16b\n"
+ "zip1 v30.16b, v28.16b, v27.16b\n"
+ "zip1 v29.16b, v26.16b, v21.16b\n"
+ "zip2 v28.16b, v28.16b, v27.16b\n"
+ "zip2 v27.16b, v26.16b, v21.16b\n"
+ "zip1 v26.16b, v1.16b, v20.16b\n"
+ "zip1 v21.16b, v19.16b, v16.16b\n"
+ "zip2 v20.16b, v1.16b, v20.16b\n"
+ "zip2 v19.16b, v19.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "zip2 v18.16b, v18.16b, v17.16b\n"
+ "str q16, [x20, #0x0]\n"
+ "zip1 v17.16b, v25.16b, v24.16b\n"
+ "zip2 v16.16b, v25.16b, v24.16b\n"
+ "str q18, [x20, #0x10]\n"
+ "str q17, [x20, #0x20]\n"
+ "zip1 v25.16b, v0.16b, v23.16b\n"
+ "zip1 v24.16b, v31.16b, v22.16b\n"
+ "str q16, [x20, #0x30]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip2 v23.16b, v0.16b, v23.16b\n"
+ "zip2 v22.16b, v31.16b, v22.16b\n"
+ "zip1 v16.16b, v30.16b, v29.16b\n"
+ "zip2 v17.16b, v30.16b, v29.16b\n"
+ "str q16, [x20, #0x0]\n"
+ "zip1 v16.16b, v28.16b, v27.16b\n"
+ "zip2 v18.16b, v28.16b, v27.16b\n"
+ "str q17, [x20, #0x10]\n"
+ "str q16, [x20, #0x20]\n"
+ "zip1 v17.16b, v26.16b, v21.16b\n"
+ "zip2 v16.16b, v26.16b, v21.16b\n"
+ "str q18, [x20, #0x30]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip1 v21.16b, v20.16b, v19.16b\n"
+ "zip2 v20.16b, v20.16b, v19.16b\n"
+ "str q17, [x20, #0x0]\n"
+ "zip1 v19.16b, v25.16b, v24.16b\n"
+ "zip2 v18.16b, v25.16b, v24.16b\n"
+ "str q16, [x20, #0x10]\n"
+ "zip1 v17.16b, v23.16b, v22.16b\n"
+ "zip2 v16.16b, v23.16b, v22.16b\n"
+ "str q21, [x20, #0x20]\n"
+ "str q20, [x20, #0x30]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "str q19, [x20, #0x0]\n"
+ "str q18, [x20, #0x10]\n"
+ "str q17, [x20, #0x20]\n"
+ "str q16, [x20, #0x30]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x21, #0x4\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr s21, [x17], #0x4\n"
+ "ldr s23, [x16], #0x4\n"
+ "sub x21, x21, #0x4\n"
+ "cmp x21, #0x4\n"
+ "ldr s20, [x15], #0x4\n"
+ "ldr s22, [x14], #0x4\n"
+ "ldr s19, [x13], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
+ "ldr s25, [x11], #0x4\n"
+ "ldr s24, [x10], #0x4\n"
+ "ldr s17, [x9], #0x4\n"
+ "ldr s16, [x28], #0x4\n"
+ "zip1 v21.16b, v21.16b, v17.16b\n"
+ "zip1 v23.16b, v23.16b, v16.16b\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v20.16b, v20.16b, v17.16b\n"
+ "zip1 v22.16b, v22.16b, v16.16b\n"
+ "ldr s17, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "zip1 v19.16b, v19.16b, v17.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s17, [x23], #0x4\n"
+ "ldr s16, [x22], #0x4\n"
+ "zip1 v17.16b, v25.16b, v17.16b\n"
+ "zip1 v16.16b, v24.16b, v16.16b\n"
+ "zip1 v21.16b, v21.16b, v19.16b\n"
+ "zip1 v20.16b, v20.16b, v17.16b\n"
+ "zip1 v19.16b, v23.16b, v18.16b\n"
+ "zip1 v16.16b, v22.16b, v16.16b\n"
+ "zip1 v18.16b, v21.16b, v20.16b\n"
+ "zip1 v17.16b, v19.16b, v16.16b\n"
+ "zip2 v20.16b, v21.16b, v20.16b\n"
+ "zip2 v19.16b, v19.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "zip2 v18.16b, v18.16b, v17.16b\n"
+ "str q16, [x20, #0x0]\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip2 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [x20, #0x10]\n"
+ "str q17, [x20, #0x20]\n"
+ "str q16, [x20, #0x30]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x21, #0x1\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 1 loop: loop
+ "ldr b23, [x17], #0x1\n"
+ "ldr b22, [x16], #0x1\n"
+ "sub x21, x21, #0x1\n"
+ "cmp x21, #0x1\n"
+ "ldr b21, [x15], #0x1\n"
+ "ldr b20, [x14], #0x1\n"
+ "ldr b19, [x13], #0x1\n"
+ "ldr b18, [x12], #0x1\n"
+ "ldr b25, [x11], #0x1\n"
+ "ldr b24, [x10], #0x1\n"
+ "ldr b17, [x9], #0x1\n"
+ "ldr b16, [x28], #0x1\n"
+ "zip1 v23.16b, v23.16b, v17.16b\n"
+ "zip1 v22.16b, v22.16b, v16.16b\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b16, [x26], #0x1\n"
+ "zip1 v21.16b, v21.16b, v17.16b\n"
+ "zip1 v20.16b, v20.16b, v16.16b\n"
+ "ldr b17, [x25], #0x1\n"
+ "ldr b16, [x24], #0x1\n"
+ "zip1 v19.16b, v19.16b, v17.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr b17, [x23], #0x1\n"
+ "ldr b16, [x22], #0x1\n"
+ "zip1 v17.16b, v25.16b, v17.16b\n"
+ "zip1 v16.16b, v24.16b, v16.16b\n"
+ "zip1 v19.16b, v23.16b, v19.16b\n"
+ "zip1 v17.16b, v21.16b, v17.16b\n"
+ "zip1 v18.16b, v22.16b, v18.16b\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x20, #0x0]\n"
+ "add x20, x20, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x40\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
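+// uint8_t and int8_t share the same byte-wise kernel; a transpose only moves
+// bytes, so signedness is irrelevant and both specialisations cast to uint8_t.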
+template<>
+void Transform<4, 16, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_4_1x16(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 16, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_4_1x16(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
new file mode 100644
index 0000000000..5667820865
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
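+// As in the 1x16 variant, but interleaving groups of 4 rows; the main loop
+// still consumes 16 rows per pass, with a padded 4-row tail loop for the
+// remaining rows.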
+void a64_transpose_interleave_4_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 4) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x10\n"
+ "blt 8f\n"
+ "1:" // Main row loop: Head
+ "mov x17, %x[in]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
+ "add x14, x15, %x[in_stride]\n"
+ "add x13, x14, %x[in_stride]\n"
+ "add x12, x13, %x[in_stride]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x10\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x10\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q21, [x17], #0x10\n"
+ "ldr q20, [x16], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q17, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v3.16b, v21.16b, v17.16b\n"
+ "zip1 v2.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x13], #0x10\n"
+ "ldr q18, [x12], #0x10\n"
+ "zip2 v1.16b, v21.16b, v17.16b\n"
+ "zip2 v0.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x11], #0x10\n"
+ "ldr q16, [x10], #0x10\n"
+ "zip1 v31.16b, v19.16b, v17.16b\n"
+ "zip1 v30.16b, v18.16b, v16.16b\n"
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip2 v29.16b, v19.16b, v17.16b\n"
+ "zip2 v28.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x27], #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v23.16b, v21.16b, v17.16b\n"
+ "zip1 v22.16b, v20.16b, v16.16b\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "zip2 v27.16b, v21.16b, v17.16b\n"
+ "zip2 v26.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v21.16b, v19.16b, v17.16b\n"
+ "zip1 v20.16b, v18.16b, v16.16b\n"
+ "zip2 v25.16b, v19.16b, v17.16b\n"
+ "zip2 v24.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v3.16b, v2.16b\n"
+ "zip1 v18.16b, v31.16b, v30.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "zip1 v17.16b, v23.16b, v22.16b\n"
+ "zip1 v16.16b, v21.16b, v20.16b\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "zip2 v19.16b, v3.16b, v2.16b\n"
+ "zip2 v18.16b, v31.16b, v30.16b\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 v17.16b, v23.16b, v22.16b\n"
+ "zip2 v16.16b, v21.16b, v20.16b\n"
+ "str q19, [x21, #0x0]\n"
+ "zip1 v23.16b, v1.16b, v0.16b\n"
+ "zip1 v22.16b, v29.16b, v28.16b\n"
+ "str q18, [x21, #0x10]\n"
+ "zip1 v21.16b, v27.16b, v26.16b\n"
+ "zip1 v20.16b, v25.16b, v24.16b\n"
+ "str q17, [x21, #0x20]\n"
+ "zip2 v19.16b, v1.16b, v0.16b\n"
+ "zip2 v18.16b, v29.16b, v28.16b\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 v17.16b, v27.16b, v26.16b\n"
+ "zip2 v16.16b, v25.16b, v24.16b\n"
+ "str q23, [x21, #0x0]\n"
+ "str q22, [x21, #0x10]\n"
+ "str q21, [x21, #0x20]\n"
+ "str q20, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x24, #0x4\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr s19, [x17], #0x4\n"
+ "ldr s18, [x16], #0x4\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr s16, [x14], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s19, [x13], #0x4\n"
+ "ldr s18, [x12], #0x4\n"
+ "zip1 v22.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x11], #0x4\n"
+ "ldr s16, [x10], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x28], #0x4\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s16, [x26], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s19, [x23], #0x4\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q22, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q18, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x24, #0x1\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 1 loop: loop
+ "ldr b19, [x17], #0x1\n"
+ "ldr b18, [x16], #0x1\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr b17, [x15], #0x1\n"
+ "ldr b16, [x14], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b19, [x13], #0x1\n"
+ "ldr b18, [x12], #0x1\n"
+ "zip1 v22.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x11], #0x1\n"
+ "ldr b16, [x10], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b19, [x9], #0x1\n"
+ "ldr b18, [x28], #0x1\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b16, [x26], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "ldr b20, [x25], #0x1\n"
+ "ldr b19, [x23], #0x1\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x22], #0x1\n"
+ "ldr b16, [x20], #0x1\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str s22, [x21, #0x0]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s21, [x21, #0x10]\n"
+ "str s18, [x21, #0x20]\n"
+ "str s16, [x21, #0x30]\n"
+ "add x21, x21, #0x4\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x10\n"
+ "add %x[out], %x[out], #0x40\n"
+ "bge 1b\n"
+ "cbz %x[height], 16f\n"
+ "8:" // Main loop skip
+ "9:" // Tail row loop: Head
+ "mov x17, %x[in]\n"
+ "add x16, x17, %x[in_stride]\n"
+ "add x15, x16, %x[in_stride]\n"
+ "mov x20, %x[width]\n"
+ "add x14, x15, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x14, %x[in_stride]\n"
+ "csel x14, x14, %x[pad_row], GT\n"
+ "csel x15, x15, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x16, x16, %x[pad_row], GT\n"
+ "cmp x20, #0x10\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: Unroll column loop
+ "ldr q19, [x17], #0x10\n"
+ "ldr q21, [x16], #0x10\n"
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "ldr q18, [x15], #0x10\n"
+ "ldr q16, [x14], #0x10\n"
+ "zip1 v20.16b, v19.16b, v18.16b\n"
+ "zip1 v17.16b, v21.16b, v16.16b\n"
+ "zip2 v19.16b, v19.16b, v18.16b\n"
+ "zip2 v18.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v17.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 v16.16b, v20.16b, v17.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip1 v17.16b, v19.16b, v18.16b\n"
+ "zip2 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [x21, #0x0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr s19, [x17], #0x4\n"
+ "ldr s18, [x16], #0x4\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "ldr s17, [x15], #0x4\n"
+ "ldr s16, [x14], #0x4\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x20, #0x1\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 1 loop: loop
+ "ldr b19, [x17], #0x1\n"
+ "ldr b18, [x16], #0x1\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "ldr b17, [x15], #0x1\n"
+ "ldr b16, [x14], #0x1\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x10\n"
+ "bge 9b\n"
+ "16:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
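+// Signedness-agnostic, as for the 1x16 kernel: both 8-bit types reuse the
+// unsigned implementation unchanged.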
+template<>
+void Transform<4, 4, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_4_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 4, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_4_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..98200c50c5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_2x4_fp32bf16.hpp
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
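+// Interleaves 4 rows of fp32 input and narrows to bf16 on the fly. The
+// conversions are emitted as raw ".inst" words (BFCVTN/BFCVTN2), presumably so
+// the file assembles even without BF16 support in the assembler.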
+void a64_transpose_interleave_4_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 4) * sizeof(bfloat16);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 8f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x28, %x[width]\n"
+ "mov x27, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "add x26, x9, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "cmp x28, #0x8\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x26], #0x10\n"
+ "sub x28, x28, #0x8\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x24], #0x10\n"
+ "cmp x28, #0x8\n"
+ "ldr q1, [x23], #0x10\n"
+ "ldr q0, [x22], #0x10\n"
+ "ldr q31, [x21], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q22, [x26], #0x10\n"
+ "zip1 v30.4s, v19.4s, v17.4s\n"
+ "zip1 v29.4s, v18.4s, v16.4s\n"
+ "ldr q21, [x25], #0x10\n"
+ "ldr q20, [x24], #0x10\n"
+ "zip2 v28.4s, v19.4s, v17.4s\n"
+ "zip2 v27.4s, v18.4s, v16.4s\n"
+ "ldr q19, [x23], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "zip1 v26.4s, v1.4s, v31.4s\n"
+ "zip1 v25.4s, v0.4s, v24.4s\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip2 v8.4s, v1.4s, v31.4s\n"
+ "zip2 v24.4s, v0.4s, v24.4s\n"
+ "zip1 v7.4s, v23.4s, v21.4s\n"
+ "zip1 v6.4s, v22.4s, v20.4s\n"
+ "zip2 v5.4s, v23.4s, v21.4s\n"
+ "zip2 v4.4s, v22.4s, v20.4s\n"
+ "zip1 v3.4s, v19.4s, v17.4s\n"
+ "zip1 v2.4s, v18.4s, v16.4s\n"
+ "zip2 v1.4s, v19.4s, v17.4s\n"
+ "zip2 v0.4s, v18.4s, v16.4s\n"
+ "zip1 v23.4s, v30.4s, v29.4s\n"
+ "zip1 v22.4s, v28.4s, v27.4s\n"
+ "zip1 v21.4s, v26.4s, v25.4s\n"
+ "zip1 v20.4s, v8.4s, v24.4s\n"
+ "zip1 v19.4s, v7.4s, v6.4s\n"
+ "zip1 v18.4s, v5.4s, v4.4s\n"
+ "zip1 v17.4s, v3.4s, v2.4s\n"
+ "zip1 v16.4s, v1.4s, v0.4s\n"
+ ".inst 0x0ea16aff // bfcvtn v31.4h, v23.4s\n"
+ "zip2 v30.4s, v30.4s, v29.4s\n"
+ ".inst 0x0ea16add // bfcvtn v29.4h, v22.4s\n"
+ "zip2 v28.4s, v28.4s, v27.4s\n"
+ ".inst 0x0ea16abb // bfcvtn v27.4h, v21.4s\n"
+ "zip2 v26.4s, v26.4s, v25.4s\n"
+ ".inst 0x0ea16a99 // bfcvtn v25.4h, v20.4s\n"
+ "zip2 v24.4s, v8.4s, v24.4s\n"
+ ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
+ "zip2 v22.4s, v7.4s, v6.4s\n"
+ ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
+ "zip2 v20.4s, v5.4s, v4.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v3.4s, v2.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v1.4s, v0.4s\n"
+ ".inst 0x4ea16bdf // bfcvtn2 v31.8h, v30.4s\n"
+ ".inst 0x4ea16b9d // bfcvtn2 v29.8h, v28.4s\n"
+ ".inst 0x4ea16b5b // bfcvtn2 v27.8h, v26.4s\n"
+ ".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q31, [x27, #0x0]\n"
+ "str q29, [x27, #0x10]\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q27, [x27, #0x20]\n"
+ "str q25, [x27, #0x30]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "str q23, [x27, #0x0]\n"
+ "str q21, [x27, #0x10]\n"
+ "str q19, [x27, #0x20]\n"
+ "str q17, [x27, #0x30]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x28, #0x4\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q25, [x9], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "sub x28, x28, #0x4\n"
+ "ldr q21, [x25], #0x10\n"
+ "ldr q20, [x24], #0x10\n"
+ "cmp x28, #0x4\n"
+ "ldr q23, [x23], #0x10\n"
+ "ldr q19, [x22], #0x10\n"
+ "ldr q18, [x21], #0x10\n"
+ "ldr q17, [x20], #0x10\n"
+ "zip1 v22.4s, v25.4s, v21.4s\n"
+ "zip1 v16.4s, v24.4s, v20.4s\n"
+ "zip2 v21.4s, v25.4s, v21.4s\n"
+ "zip2 v20.4s, v24.4s, v20.4s\n"
+ "zip1 v27.4s, v23.4s, v18.4s\n"
+ "zip1 v26.4s, v19.4s, v17.4s\n"
+ "zip2 v25.4s, v23.4s, v18.4s\n"
+ "zip2 v24.4s, v19.4s, v17.4s\n"
+ "zip1 v19.4s, v22.4s, v16.4s\n"
+ "zip1 v18.4s, v21.4s, v20.4s\n"
+ "zip1 v17.4s, v27.4s, v26.4s\n"
+ "zip2 v23.4s, v22.4s, v16.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
+ "zip2 v22.4s, v21.4s, v20.4s\n"
+ ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n"
+ ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v27.4s, v26.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
+ ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n"
+ ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q21, [x27, #0x0]\n"
+ "str q20, [x27, #0x10]\n"
+ "str q19, [x27, #0x20]\n"
+ "str q17, [x27, #0x30]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cbz x28, 7f\n"
+ "movi v16.16b, #0x0\n"
+ "str q16, [x27, #0x0]\n"
+ "str q16, [x27, #0x10]\n"
+ "str q16, [x27, #0x20]\n"
+ "str q16, [x27, #0x30]\n"
+ "6:" // Main row loop: width 1 loop: loop
+ "ldr s23, [x9], #0x4\n"
+ "ldr s22, [x26], #0x4\n"
+ "sub x28, x28, #0x1\n"
+ "ldr s19, [x25], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "cmp x28, #0x1\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s20, [x22], #0x4\n"
+ "ldr s18, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v19.4s, v23.4s, v19.4s\n"
+ "zip1 v17.4s, v22.4s, v17.4s\n"
+ "zip1 v18.4s, v21.4s, v18.4s\n"
+ "zip1 v16.4s, v20.4s, v16.4s\n"
+ "zip1 v17.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d17, [x27, #0x0]\n"
+ "str d16, [x27, #0x20]\n"
+ "add x27, x27, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: odd col skip
+ "cmp %x[height], #0x8\n"
+ "add %x[out], %x[out], #0x40\n"
+ "bge 1b\n"
+ "cbz %x[height], 16f\n"
+ "8:" // Main loop skip
+ "9:" // Tail row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x20, %x[width]\n"
+ "cmp %x[height], #0x3\n"
+ "mov x27, %x[out]\n"
+ "add x26, x9, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "cmp %x[height], #0x1\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "cmp x20, #0x8\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: Unroll column loop
+ "ldr q25, [x9], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "sub x20, x20, #0x8\n"
+ "ldr q21, [x25], #0x10\n"
+ "ldr q20, [x24], #0x10\n"
+ "cmp x20, #0x8\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q19, [x26], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x24], #0x10\n"
+ "zip1 v22.4s, v25.4s, v21.4s\n"
+ "zip1 v16.4s, v24.4s, v20.4s\n"
+ "zip2 v21.4s, v25.4s, v21.4s\n"
+ "zip2 v20.4s, v24.4s, v20.4s\n"
+ "zip1 v27.4s, v23.4s, v18.4s\n"
+ "zip1 v26.4s, v19.4s, v17.4s\n"
+ "zip2 v25.4s, v23.4s, v18.4s\n"
+ "zip2 v24.4s, v19.4s, v17.4s\n"
+ "zip1 v19.4s, v22.4s, v16.4s\n"
+ "zip1 v18.4s, v21.4s, v20.4s\n"
+ "zip1 v17.4s, v27.4s, v26.4s\n"
+ "zip2 v23.4s, v22.4s, v16.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
+ "zip2 v22.4s, v21.4s, v20.4s\n"
+ ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n"
+ ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v27.4s, v26.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
+ ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n"
+ ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q21, [x27, #0x0]\n"
+ "str q20, [x27, #0x10]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "str q19, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x26], #0x10\n"
+ "sub x20, x20, #0x4\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q17, [x24], #0x10\n"
+ "cmp x20, #0x4\n"
+ "zip1 v18.4s, v21.4s, v19.4s\n"
+ "zip1 v16.4s, v20.4s, v17.4s\n"
+ "zip2 v21.4s, v21.4s, v19.4s\n"
+ "zip2 v20.4s, v20.4s, v17.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "zip2 v19.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v21.4s, v20.4s\n"
+ ".inst 0x0ea16a32 // bfcvtn v18.4h, v17.4s\n"
+ "zip2 v17.4s, v21.4s, v20.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
+ ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
+ "str q18, [x27, #0x0]\n"
+ "str q16, [x27, #0x10]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cbz x20, 15f\n"
+ "movi v16.16b, #0x0\n"
+ "str q16, [x27, #0x0]\n"
+ "str q16, [x27, #0x10]\n"
+ "14:" // Tail row loop: width 1 loop: loop
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x26], #0x4\n"
+ "sub x20, x20, #0x1\n"
+ "ldr s17, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "cmp x20, #0x1\n"
+ "zip1 v17.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d16, [x27, #0x0]\n"
+ "add x27, x27, #0x8\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: odd col skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x20\n"
+ "bge 9b\n"
+ "16:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
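+// Unlike the byte-reinterpreting wrappers elsewhere, this specialisation
+// passes typed pointers straight through: the kernel itself performs the
+// fp32 -> bf16 narrowing, and width is counted in fp32 elements.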
+template<>
+void Transform<4, 4, true, VLType::None>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_4_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
new file mode 100644
index 0000000000..328274a488
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
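+// Plain (non-interleaved) transpose of 16-bit data in 64-byte panels: each of
+// the four rows handled per pass contributes 32 uint16_t elements, copied
+// straight through with no zip/unzip work since the block structure is 1x1.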
+void a64_transpose_interleave_64(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 32 * height * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x20\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q31, [x25], #0x10\n"
+ "ldr q30, [x23], #0x10\n"
+ "sub x24, x24, #0x20\n"
+ "cmp x24, #0x20\n"
+ "ldr q29, [x22], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
+ "ldr q27, [x25], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
+ "ldr q25, [x22], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q31, [x21, #0x0]\n"
+ "str q27, [x21, #0x10]\n"
+ "str q23, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q30, [x21, #0x40]\n"
+ "str q26, [x21, #0x50]\n"
+ "str q22, [x21, #0x60]\n"
+ "str q18, [x21, #0x70]\n"
+ "str q29, [x21, #0x80]\n"
+ "str q25, [x21, #0x90]\n"
+ "str q21, [x21, #0xa0]\n"
+ "str q17, [x21, #0xb0]\n"
+ "str q28, [x21, #0xc0]\n"
+ "str q24, [x21, #0xd0]\n"
+ "str q20, [x21, #0xe0]\n"
+ "str q16, [x21, #0xf0]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x24, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 16 loop: loop
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q23, [x21, #0x0]\n"
+ "str q19, [x21, #0x10]\n"
+ "str q22, [x21, #0x40]\n"
+ "str q18, [x21, #0x50]\n"
+ "str q21, [x21, #0x80]\n"
+ "str q17, [x21, #0x90]\n"
+ "str q20, [x21, #0xc0]\n"
+ "str q16, [x21, #0xd0]\n"
+ "add x21, x21, #0x20\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 16 loop: skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x25], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "str d19, [x21, #0x0]\n"
+ "str d18, [x21, #0x40]\n"
+ "str d17, [x21, #0x80]\n"
+ "str d16, [x21, #0xc0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x25], #0x2\n"
+ "ldr h18, [x23], #0x2\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr h17, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "str h19, [x21, #0x0]\n"
+ "str h18, [x21, #0x40]\n"
+ "str h17, [x21, #0x80]\n"
+ "str h16, [x21, #0xc0]\n"
+ "add x21, x21, #0x2\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x4\n"
+ "add %x[out], %x[out], #0x100\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x20, %x[width]\n"
+ "mov x25, %x[in]\n"
+ "cmp x20, #0x20\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "sub x20, x20, #0x20\n"
+ "cmp x20, #0x20\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x20, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "str q17, [x21, #0x0]\n"
+ "str q16, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d16, [x25], #0x8\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "str d16, [x21, #0x0]\n"
+ "add x21, x21, #0x8\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h16, [x25], #0x2\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "str h16, [x21, #0x0]\n"
+ "add x21, x21, #0x2\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x40\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // anonymous namespace
+
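+// All three element types below map onto the same 16-bit kernel; the width
+// scaling (sizeof(T) / 2) converts element counts into uint16_t columns, so
+// 16 floats and 32 16-bit values both describe a 64-byte panel.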
+template<>
+void Transform<16, 1, true, VLType::None>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_64(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<32, 1, true, VLType::None>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_64(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<32, 1, true, VLType::None>(
+ uint16_t *out, const uint16_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_64(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint16_t) / 2,
+ stride * sizeof(uint16_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
deleted file mode 100644
index df68740bb4..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "transpose_interleave_common.hpp"
-
-// Generic unblocked transposed 8x32-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a 16 x uint16_t specialisation
- TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t *>(in),
- stride*2, x0*2, xmax*2, k0, kmax
- );
-}
-
-// Generic 16x16-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a uint16_t specialisation
- Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t *>(in),
- stride, x0, xmax, k0, kmax
- );
-}
-
-// Specialised 16 x uint16_t version
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *const out) {
- __asm volatile (
- "LDR q0, [%[in0]]\n"
- "STR q0, [%[out]]\n"
- "LDR q1, [%[in0], #0x10]\n"
- "STR q1, [%[out], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x20\n"
- ASM_PREFETCH("[%[in0], #192]")
- : [in0] "+r" (in0)
- : [out] "r" (out)
- : "v0", "v1", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *const out) {
- __asm volatile (
- "LDR q0, [%[in0]]\n"
- "STR q0, [%[out]]\n"
- "LDR q1, [%[in0], #0x10]\n"
- "STR q1, [%[out], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x20\n"
- ASM_PREFETCH("[%[in0], #192]")
-
- "LDR q2, [%[in1]]\n"
- "STR q2, [%[out], #0x20]\n"
- "LDR q3, [%[in1], #0x10]\n"
- "STR q3, [%[out], #0x30]\n"
- "ADD %x[in1], %x[in1], #0x20\n"
- ASM_PREFETCH("[%[in1], #192]")
- : [in0] "+r" (in0),
- [in1] "+r" (in1)
- : [out] "r" (out)
- : "v0", "v1", "v2", "v3", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *const out) {
- __asm __volatile (
- "LDR q0, [%[in0]]\n"
- "STR q0, [%[out]]\n"
- "LDR q1, [%[in0], #0x10]\n"
- "STR q1, [%[out], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x20\n"
- ASM_PREFETCH("[%[in0], #192]")
-
- "LDR q2, [%[in1]]\n"
- "STR q2, [%[out], #0x20]\n"
- "LDR q3, [%[in1], #0x10]\n"
- "STR q3, [%[out], #0x30]\n"
- "ADD %x[in1], %x[in1], #0x20\n"
- ASM_PREFETCH("[%[in1], #192]")
-
- "LDR q0, [%[in2]]\n"
- "STR q0, [%[out], #0x40]\n"
- "LDR q1, [%[in2], #0x10]\n"
- "STR q1, [%[out], #0x50]\n"
- "ADD %x[in2], %x[in2], #0x20\n"
- ASM_PREFETCH("[%[in2], #192]")
-
- "LDR q2, [%[in3]]\n"
- "STR q2, [%[out], #0x60]\n"
- "LDR q3, [%[in3], #0x10]\n"
- "STR q3, [%[out], #0x70]\n"
- "ADD %x[in3], %x[in3], #0x20\n"
- ASM_PREFETCH("[%[in3], #192]")
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [in2] "+r" (in2),
- [in3] "+r" (in3)
- : [out] "r" (out)
- : "v0", "v1", "v2", "v3", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
- uint16_t* out, const uint16_t* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
new file mode 100644
index 0000000000..feb469ab0e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
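+// 32-bit counterpart of the 64-byte copy transpose above: 24 uint32_t
+// (96 bytes) per row per panel, again with no sub-block interleaving.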
+void a64_transpose_interleave_96(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 24 * height * sizeof(uint32_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[width]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x20, x22, %x[in_stride]\n"
+ "cmp x24, #0x18\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q7, [x25], #0x10\n"
+ "ldr q6, [x23], #0x10\n"
+ "sub x24, x24, #0x18\n"
+ "cmp x24, #0x18\n"
+ "ldr q5, [x22], #0x10\n"
+ "ldr q4, [x20], #0x10\n"
+ "ldr q3, [x25], #0x10\n"
+ "ldr q2, [x23], #0x10\n"
+ "ldr q1, [x22], #0x10\n"
+ "ldr q0, [x20], #0x10\n"
+ "ldr q31, [x25], #0x10\n"
+ "ldr q30, [x23], #0x10\n"
+ "ldr q29, [x22], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
+ "ldr q27, [x25], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
+ "ldr q25, [x22], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q7, [x21, #0x0]\n"
+ "str q3, [x21, #0x10]\n"
+ "str q31, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "str q23, [x21, #0x40]\n"
+ "str q19, [x21, #0x50]\n"
+ "str q6, [x21, #0x60]\n"
+ "str q2, [x21, #0x70]\n"
+ "str q30, [x21, #0x80]\n"
+ "str q26, [x21, #0x90]\n"
+ "str q22, [x21, #0xa0]\n"
+ "str q18, [x21, #0xb0]\n"
+ "str q5, [x21, #0xc0]\n"
+ "str q1, [x21, #0xd0]\n"
+ "str q29, [x21, #0xe0]\n"
+ "str q25, [x21, #0xf0]\n"
+ "str q21, [x21, #0x100]\n"
+ "str q17, [x21, #0x110]\n"
+ "str q4, [x21, #0x120]\n"
+ "str q0, [x21, #0x130]\n"
+ "str q28, [x21, #0x140]\n"
+ "str q24, [x21, #0x150]\n"
+ "str q20, [x21, #0x160]\n"
+ "str q16, [x21, #0x170]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x24, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 16 loop: loop
+ "ldr q31, [x25], #0x10\n"
+ "ldr q30, [x23], #0x10\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "ldr q29, [x22], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
+ "ldr q27, [x25], #0x10\n"
+ "ldr q26, [x23], #0x10\n"
+ "ldr q25, [x22], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x25], #0x10\n"
+ "ldr q22, [x23], #0x10\n"
+ "ldr q21, [x22], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q31, [x21, #0x0]\n"
+ "str q27, [x21, #0x10]\n"
+ "str q23, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q30, [x21, #0x60]\n"
+ "str q26, [x21, #0x70]\n"
+ "str q22, [x21, #0x80]\n"
+ "str q18, [x21, #0x90]\n"
+ "str q29, [x21, #0xc0]\n"
+ "str q25, [x21, #0xd0]\n"
+ "str q21, [x21, #0xe0]\n"
+ "str q17, [x21, #0xf0]\n"
+ "str q28, [x21, #0x120]\n"
+ "str q24, [x21, #0x130]\n"
+ "str q20, [x21, #0x140]\n"
+ "str q16, [x21, #0x150]\n"
+ "add x21, x21, #0x40\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 16 loop: skip
+ "cmp x24, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ "ldr q17, [x22], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x60]\n"
+ "str q17, [x21, #0xc0]\n"
+ "str q16, [x21, #0x120]\n"
+ "add x21, x21, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x24, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr s19, [x25], #0x4\n"
+ "ldr s18, [x23], #0x4\n"
+ "sub x24, x24, #0x1\n"
+ "cmp x24, #0x1\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "str s19, [x21, #0x0]\n"
+ "str s18, [x21, #0x60]\n"
+ "str s17, [x21, #0xc0]\n"
+ "str s16, [x21, #0x120]\n"
+ "add x21, x21, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x4\n"
+ "add %x[out], %x[out], #0x180\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+ "11:" // Tail row loop: Head
+ "mov x20, %x[width]\n"
+ "mov x25, %x[in]\n"
+ "cmp x20, #0x18\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q21, [x25], #0x10\n"
+ "ldr q20, [x25], #0x10\n"
+ "sub x20, x20, #0x18\n"
+ "cmp x20, #0x18\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "str q21, [x21, #0x0]\n"
+ "str q20, [x21, #0x10]\n"
+ "str q19, [x21, #0x20]\n"
+ "str q18, [x21, #0x30]\n"
+ "str q17, [x21, #0x40]\n"
+ "str q16, [x21, #0x50]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x20, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
+ "ldr q19, [x25], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "str q19, [x21, #0x0]\n"
+ "str q18, [x21, #0x10]\n"
+ "str q17, [x21, #0x20]\n"
+ "str q16, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
+ "cmp x20, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr q16, [x25], #0x10\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x4\n"
+ "str q16, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x20, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr s16, [x25], #0x4\n"
+ "sub x20, x20, #0x1\n"
+ "cmp x20, #0x1\n"
+ "str s16, [x21, #0x0]\n"
+ "add x21, x21, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x60\n"
+ "bge 11b\n"
+ "20:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // anonymous namespace
+
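+// Here sizeof(float) / 4 is 1, so the width passes through unchanged; the
+// expression is presumably kept for uniformity with the generator's other
+// width-in-32-bit-units wrappers.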
+template<>
+void Transform<24, 1, true, VLType::None>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_96(
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp
new file mode 100644
index 0000000000..1e6c3d35f4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023,2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SME
+#include "sme_transpose_interleave_16VL_1x4.hpp"
+#include "sme_transpose_interleave_16VL_2x2_fp32bf16.hpp"
+#include "sme_transpose_interleave_16VL_2x2.hpp"
+#include "sme_transpose_interleave_16VL.hpp"
+#include "sme_transpose_interleave_1VL_1x4.hpp"
+#include "sme_transpose_interleave_1VL_2x2_fp32bf16.hpp"
+#include "sme_transpose_interleave_1VL_2x2.hpp"
+#include "sme_transpose_interleave_1VL.hpp"
+#include "sme_transpose_interleave_2VL_1x4.hpp"
+#include "sme_transpose_interleave_2VL_2x2.hpp"
+#include "sme_transpose_interleave_2VL_2x2_fp32bf16.hpp"
+#include "sme_transpose_interleave_2VL.hpp"
+#include "sme_transpose_interleave_4VL_1x4.hpp"
+#include "sme_transpose_interleave_4VL_2x2.hpp"
+#include "sme_transpose_interleave_4VL_2x2_fp32bf16.hpp"
+#include "sme_transpose_interleave_4VL.hpp"
+#endif // ARM_COMPUTE_ENABLE_SME
+#include "sve_transpose_interleave_12VL_2x4_fp32bf16.hpp"
+#include "sve_transpose_interleave_1VL_1x4.hpp"
+#include "sve_transpose_interleave_1VL.hpp"
+#include "sve_transpose_interleave_2VL_2x4_fp32bf16.hpp"
+#include "sve_transpose_interleave_3VL_1x4.hpp"
+#include "sve_transpose_interleave_3VL_2x2.hpp"
+#include "sve_transpose_interleave_3VL.hpp"
+#include "sve_transpose_interleave_4VL_1x4.hpp"
+#include "sve_transpose_interleave_4VL_2x2.hpp"
+#include "sve_transpose_interleave_4VL.hpp"
+#include "sve_transpose_interleave_6VL_1x8.hpp"
+#include "sve_transpose_interleave_6VL_2x4_fp32bf16.hpp"
+#include "sve_transpose_interleave_6VL_2x4.hpp"
+#include "sve_transpose_interleave_6VL_4x2.hpp"
+#include "sve_transpose_interleave_8VL_1x4.hpp"
+#include "sve_transpose_interleave_8VL_1x8.hpp"
+#include "sve_transpose_interleave_8VL_2x2.hpp"
+#include "sve_transpose_interleave_8VL_2x4.hpp"
+#include "sve_transpose_interleave_8VL_2x4_fp32bf16.hpp"
+#include "sve_transpose_interleave_8VL.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index e092c729ba..1ce319efee 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020,2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,29 @@
* SOFTWARE.
*/
#include "a32_transpose_interleave_8way_32bit.hpp"
-#include "a64_transpose_interleave_12way_16bit.hpp"
-#include "a64_transpose_interleave_12way_half_to_float.hpp"
-#include "a64_transpose_interleave_24way_16bit.hpp"
-#include "a64_transpose_interleave_8way_32bit.hpp"
+#include "a64_transpose_interleave_12_1x4.hpp"
+#include "a64_transpose_interleave_12_1x8.hpp"
+#include "a64_transpose_interleave_12_2x2.hpp"
+#include "a64_transpose_interleave_12_2x4_fp32bf16.hpp"
+#include "a64_transpose_interleave_12_2x4.hpp"
+#include "a64_transpose_interleave_128.hpp"
+#include "a64_transpose_interleave_12_s8s16.hpp"
+#include "a64_transpose_interleave_12_u8u16.hpp"
+#include "a64_transpose_interleave_16_1x4.hpp"
+#include "a64_transpose_interleave_16_1x8.hpp"
+#include "a64_transpose_interleave_16_2x2.hpp"
+#include "a64_transpose_interleave_16_2x4.hpp"
+#include "a64_transpose_interleave_16_2x4_fp32bf16.hpp"
+#include "a64_transpose_interleave_16.hpp"
+#include "a64_transpose_interleave_24_bf16fp32.hpp"
+#include "a64_transpose_interleave_24_fp16fp32.hpp"
+#include "a64_transpose_interleave_24_2x4_fp32bf16.hpp"
+#include "a64_transpose_interleave_24.hpp"
+#include "a64_transpose_interleave_32_1x4.hpp"
+#include "a64_transpose_interleave_32_2x2.hpp"
+#include "a64_transpose_interleave_4_1x16.hpp"
+#include "a64_transpose_interleave_4_1x4.hpp"
+#include "a64_transpose_interleave_4_2x4_fp32bf16.hpp"
+#include "a64_transpose_interleave_48.hpp"
+#include "a64_transpose_interleave_64.hpp"
+#include "a64_transpose_interleave_96.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
new file mode 100644
index 0000000000..a4d480c405
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_16VL(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 16 * height * sme::get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p7.b\n"
+ "1:" // Main row loop: Head
+ "mov x23, %x[in]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z31.s }, p0/Z, [x23]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z30.s }, p0/Z, [x23, #1, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z29.s }, p0/Z, [x23, #2, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z28.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z27.s }, p0/Z, [x23, #4, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z26.s }, p0/Z, [x23, #5, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z25.s }, p0/Z, [x23, #6, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z24.s }, p0/Z, [x23, #7, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "decw x20\n"
+ "whilelt p6.s, XZR, x20\n"
+ "decw x20\n"
+ "whilelt p5.s, XZR, x20\n"
+ "decw x20\n"
+ "whilelt p4.s, XZR, x20\n"
+ "decw x20\n"
+ "whilelt p3.s, XZR, x20\n"
+ "decw x20\n"
+ "whilelt p2.s, XZR, x20\n"
+ "decw x20\n"
+ "whilelt p1.s, XZR, x20\n"
+ "decw x20\n"
+ "addvl x23, x23, #16\n"
+ "ld1w { z23.s }, p0/Z, [x23, #-8, MUL VL]\n"
+ "whilelt p0.s, XZR, x20\n"
+ "mov x20, x22\n"
+ "ld1w { z22.s }, p6/Z, [x23, #-7, MUL VL]\n"
+ "decw x21, ALL, MUL #16\n"
+ "ld1w { z21.s }, p5/Z, [x23, #-6, MUL VL]\n"
+ "cmp x21, #0x0\n"
+ "ld1w { z20.s }, p4/Z, [x23, #-5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1w { z19.s }, p3/Z, [x23, #-4, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #-3, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x23, #-2, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x23, #-1, MUL VL]\n"
+ "st1w { z31.s }, p7, [x20]\n"
+ "st1w { z30.s }, p7, [x20, #1, MUL VL]\n"
+ "st1w { z29.s }, p7, [x20, #2, MUL VL]\n"
+ "st1w { z28.s }, p7, [x20, #3, MUL VL]\n"
+ "st1w { z27.s }, p7, [x20, #4, MUL VL]\n"
+ "st1w { z26.s }, p7, [x20, #5, MUL VL]\n"
+ "st1w { z25.s }, p7, [x20, #6, MUL VL]\n"
+ "st1w { z24.s }, p7, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1w { z23.s }, p7, [x20, #-8, MUL VL]\n"
+ "st1w { z22.s }, p7, [x20, #-7, MUL VL]\n"
+ "st1w { z21.s }, p7, [x20, #-6, MUL VL]\n"
+ "st1w { z20.s }, p7, [x20, #-5, MUL VL]\n"
+ "st1w { z19.s }, p7, [x20, #-4, MUL VL]\n"
+ "st1w { z18.s }, p7, [x20, #-3, MUL VL]\n"
+ "st1w { z17.s }, p7, [x20, #-2, MUL VL]\n"
+ "st1w { z16.s }, p7, [x20, #-1, MUL VL]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #16\n"
+ "bge 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 1, true, VLType::SME>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_16VL(
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
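The kernel above walks the input a row at a time (label 1) and, within each row, copies successive strips of sixteen vectors of 32-bit words (label 2). out_stride is 16 * height vector-lengths of bytes, i.e. one full strip of output across every row, so strip s of row r lands at out + s*out_stride + r*16*VL bytes. Tail lanes come from the zeroing predicated loads (ld1w { ... }, p0/Z) while the stores always write whole vectors under the all-true p7, so padding past width is written as zeros. A scalar model of that data movement, a sketch only, with strides in words instead of bytes and vl_words standing in for the runtime vector length:

    #include <cstddef>
    #include <cstdint>

    void transpose_interleave_16VL_ref(uint32_t *out, const uint32_t *in,
                                       size_t width, size_t in_stride_words,
                                       size_t height, size_t vl_words)
    {
        const size_t strip = 16 * vl_words;              // words per output strip
        for (size_t c0 = 0; c0 < width; c0 += strip)     // column strips, left to right
            for (size_t r = 0; r < height; ++r)          // all rows of a strip, back to back
                for (size_t c = c0; c < c0 + strip; ++c) // full strip width, even in the tail
                    *out++ = (c < width) ? in[r * in_stride_words + c]
                                         : 0;            // inactive lanes load as zero
    }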
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
new file mode 100644
index 0000000000..552abfc1c6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_16VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 16 * roundup<size_t>(height, 4) * sme::get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p4.b\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "mov x22, %x[out]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p3.b, XZR, x20\n"
+ "ld1b { z20.b }, p3/Z, [x26]\n"
+ "decb x20\n"
+ "whilelt p2.b, XZR, x20\n"
+ "ld1b { z18.b }, p2/Z, [x26, #1, MUL VL]\n"
+ "decb x20\n"
+ "whilelt p1.b, XZR, x20\n"
+ "ld1b { z17.b }, p3/Z, [x25]\n"
+ "decb x20\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z19.b }, p2/Z, [x25, #1, MUL VL]\n"
+ "ld1b { z16.b }, p3/Z, [x24]\n"
+ "zip1 z25.b, z20.b, z16.b\n"
+ "zip2 z24.b, z20.b, z16.b\n"
+ "mov x20, x22\n"
+ "ld1b { z16.b }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip1 z22.b, z18.b, z16.b\n"
+ "zip2 z21.b, z18.b, z16.b\n"
+ "decw x21, ALL, MUL #16\n"
+ "ld1b { z16.b }, p3/Z, [x23]\n"
+ "zip1 z18.b, z17.b, z16.b\n"
+ "zip2 z17.b, z17.b, z16.b\n"
+ "cmp x21, #0x0\n"
+ "ld1b { z16.b }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip1 z20.b, z19.b, z16.b\n"
+ "zip2 z16.b, z19.b, z16.b\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1b { z19.b }, p1/Z, [x26, #2, MUL VL]\n"
+ "zip1 z23.b, z25.b, z18.b\n"
+ "zip2 z0.b, z25.b, z18.b\n"
+ "ld1b { z18.b }, p0/Z, [x26, #3, MUL VL]\n"
+ "zip1 z31.b, z24.b, z17.b\n"
+ "zip2 z30.b, z24.b, z17.b\n"
+ "addvl x26, x26, #4\n"
+ "ld1b { z17.b }, p1/Z, [x25, #2, MUL VL]\n"
+ "zip1 z29.b, z22.b, z20.b\n"
+ "zip2 z28.b, z22.b, z20.b\n"
+ "ld1b { z22.b }, p0/Z, [x25, #3, MUL VL]\n"
+ "zip1 z27.b, z21.b, z16.b\n"
+ "zip2 z26.b, z21.b, z16.b\n"
+ "addvl x25, x25, #4\n"
+ "ld1b { z16.b }, p1/Z, [x24, #2, MUL VL]\n"
+ "zip1 z21.b, z19.b, z16.b\n"
+ "zip2 z20.b, z19.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x24, #3, MUL VL]\n"
+ "zip1 z25.b, z18.b, z16.b\n"
+ "zip2 z24.b, z18.b, z16.b\n"
+ "addvl x24, x24, #4\n"
+ "ld1b { z16.b }, p1/Z, [x23, #2, MUL VL]\n"
+ "zip1 z19.b, z17.b, z16.b\n"
+ "zip2 z18.b, z17.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x23, #3, MUL VL]\n"
+ "zip1 z17.b, z22.b, z16.b\n"
+ "zip2 z16.b, z22.b, z16.b\n"
+ "addvl x23, x23, #4\n"
+ "st1b { z23.b }, p4, [x20]\n"
+ "zip1 z23.b, z21.b, z19.b\n"
+ "zip2 z22.b, z21.b, z19.b\n"
+ "st1b { z0.b }, p4, [x20, #1, MUL VL]\n"
+ "zip1 z21.b, z20.b, z18.b\n"
+ "zip2 z20.b, z20.b, z18.b\n"
+ "st1b { z31.b }, p4, [x20, #2, MUL VL]\n"
+ "zip1 z19.b, z25.b, z17.b\n"
+ "zip2 z18.b, z25.b, z17.b\n"
+ "st1b { z30.b }, p4, [x20, #3, MUL VL]\n"
+ "zip1 z17.b, z24.b, z16.b\n"
+ "zip2 z16.b, z24.b, z16.b\n"
+ "st1b { z29.b }, p4, [x20, #4, MUL VL]\n"
+ "st1b { z28.b }, p4, [x20, #5, MUL VL]\n"
+ "st1b { z27.b }, p4, [x20, #6, MUL VL]\n"
+ "st1b { z26.b }, p4, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1b { z23.b }, p4, [x20, #-8, MUL VL]\n"
+ "st1b { z22.b }, p4, [x20, #-7, MUL VL]\n"
+ "st1b { z21.b }, p4, [x20, #-6, MUL VL]\n"
+ "st1b { z20.b }, p4, [x20, #-5, MUL VL]\n"
+ "st1b { z19.b }, p4, [x20, #-4, MUL VL]\n"
+ "st1b { z18.b }, p4, [x20, #-3, MUL VL]\n"
+ "st1b { z17.b }, p4, [x20, #-2, MUL VL]\n"
+ "st1b { z16.b }, p4, [x20, #-1, MUL VL]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #16\n"
+ "bge 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 4, true, VLType::SME>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_16VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<16, 4, true, VLType::SME>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_16VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
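The zip1/zip2 cascade above interleaves four consecutive input rows at byte granularity: each group of four output bytes holds one byte from rows r, r+1, r+2 and r+3 of the same column. When height is not a multiple of four, the csel instructions point the missing row pointers at the zero-filled pad_row (note the alloca'd buffer is only memset when padding will actually be read). A scalar model of the core reordering, with the strip traversal of the 16VL sketch above omitted:

    #include <cstddef>
    #include <cstdint>

    // rows[] holds the four source row pointers; entries at or beyond
    // valid_rows stand for pad_row and must reference zeroed storage,
    // mirroring the kernel above.
    void interleave_1x4_ref(uint8_t *out, const uint8_t *rows[4],
                            size_t width, size_t valid_rows)
    {
        for (size_t c = 0; c < width; ++c)  // one 4-byte group per column
            for (size_t r = 0; r < 4; ++r)  // bytes of 4 rows become adjacent
                *out++ = (r < valid_rows) ? rows[r][c] : 0;
    }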
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp
new file mode 100644
index 0000000000..dac6b06f1e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_16VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 16 * roundup<size_t>(height, 2) * sme::get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p6.b\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "cmp %x[height], #0x1\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[out]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x22, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x21, x22\n"
+ "mov x20, x23\n"
+ "whilelt p1.h, XZR, x21\n"
+ "dech x21\n"
+ "whilelt p0.h, XZR, x21\n"
+ "dech x21\n"
+ "ld1h { z21.h }, p1/Z, [x25]\n"
+ "whilelt p5.h, XZR, x21\n"
+ "dech x21\n"
+ "ld1h { z20.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "whilelt p4.h, XZR, x21\n"
+ "dech x21\n"
+ "ld1h { z25.h }, p5/Z, [x25, #2, MUL VL]\n"
+ "whilelt p3.h, XZR, x21\n"
+ "dech x21\n"
+ "ld1h { z24.h }, p4/Z, [x25, #3, MUL VL]\n"
+ "whilelt p2.h, XZR, x21\n"
+ "dech x21\n"
+ "ld1h { z19.h }, p1/Z, [x24]\n"
+ "whilelt p1.h, XZR, x21\n"
+ "dech x21\n"
+ "ld1h { z18.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "whilelt p0.h, XZR, x21\n"
+ "ld1h { z17.h }, p5/Z, [x24, #2, MUL VL]\n"
+ "decw x22, ALL, MUL #16\n"
+ "ld1h { z16.h }, p4/Z, [x24, #3, MUL VL]\n"
+ "zip1 z23.h, z21.h, z19.h\n"
+ "zip2 z22.h, z21.h, z19.h\n"
+ "cmp x22, #0x0\n"
+ "ld1h { z21.h }, p3/Z, [x25, #4, MUL VL]\n"
+ "zip1 z31.h, z20.h, z18.h\n"
+ "zip2 z30.h, z20.h, z18.h\n"
+ "add x23, x23, %x[out_stride]\n"
+ "ld1h { z20.h }, p2/Z, [x25, #5, MUL VL]\n"
+ "zip1 z29.h, z25.h, z17.h\n"
+ "zip2 z28.h, z25.h, z17.h\n"
+ "ld1h { z27.h }, p1/Z, [x25, #6, MUL VL]\n"
+ "zip1 z26.h, z24.h, z16.h\n"
+ "zip2 z25.h, z24.h, z16.h\n"
+ "ld1h { z24.h }, p0/Z, [x25, #7, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "ld1h { z19.h }, p3/Z, [x24, #4, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x24, #5, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x24, #6, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x24, #7, MUL VL]\n"
+ "st1h { z23.h }, p6, [x20]\n"
+ "addvl x24, x24, #8\n"
+ "zip1 z23.h, z21.h, z19.h\n"
+ "st1h { z22.h }, p6, [x20, #1, MUL VL]\n"
+ "zip2 z22.h, z21.h, z19.h\n"
+ "zip1 z21.h, z20.h, z18.h\n"
+ "st1h { z31.h }, p6, [x20, #2, MUL VL]\n"
+ "zip2 z20.h, z20.h, z18.h\n"
+ "zip1 z19.h, z27.h, z17.h\n"
+ "st1h { z30.h }, p6, [x20, #3, MUL VL]\n"
+ "zip2 z18.h, z27.h, z17.h\n"
+ "zip1 z17.h, z24.h, z16.h\n"
+ "st1h { z29.h }, p6, [x20, #4, MUL VL]\n"
+ "zip2 z16.h, z24.h, z16.h\n"
+ "st1h { z28.h }, p6, [x20, #5, MUL VL]\n"
+ "st1h { z26.h }, p6, [x20, #6, MUL VL]\n"
+ "st1h { z25.h }, p6, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1h { z23.h }, p6, [x20, #-8, MUL VL]\n"
+ "st1h { z22.h }, p6, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p6, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p6, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p6, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p6, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p6, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p6, [x20, #-1, MUL VL]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #16\n"
+ "bge 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 2, true, VLType::SME>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_16VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<16, 2, true, VLType::SME>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_16VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
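The same scheme at halfword granularity with two rows per group: each zip1/zip2 pair alternates elements of row r and row r+1, and pad_row substitutes for the second row when height is odd. In scalar form, a sketch:

    #include <cstddef>
    #include <cstdint>

    // row1 is pad_row (zeroed) when the pair has no second source row.
    void interleave_2x2_ref(uint16_t *out, const uint16_t *row0,
                            const uint16_t *row1, size_t width)
    {
        for (size_t c = 0; c < width; ++c) { // halfwords of the two rows alternate
            *out++ = row0[c];
            *out++ = row1[c];
        }
    }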
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
new file mode 100644
index 0000000000..2756327815
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_16VL_2x2_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 16 * roundup<size_t>(height, 2) * sme::get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p7.b\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "mov x23, %x[out]\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x22, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x21, x22\n"
+ "whilelt p1.s, XZR, x21\n"
+ "ld1w { z16.s }, p1/Z, [x25]\n"
+ ".inst 0x658abe00 // bfcvt z0.h, p7/M, z16.s\n"
+ "decw x21\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z16.s }, p0/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x658abe1f // bfcvt z31.h, p7/M, z16.s\n"
+ "decw x21\n"
+ "whilelt p6.s, XZR, x21\n"
+ "ld1w { z16.s }, p6/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x658abe1e // bfcvt z30.h, p7/M, z16.s\n"
+ "decw x21\n"
+ "whilelt p5.s, XZR, x21\n"
+ "ld1w { z16.s }, p5/Z, [x25, #3, MUL VL]\n"
+ ".inst 0x658abe1d // bfcvt z29.h, p7/M, z16.s\n"
+ "decw x21\n"
+ "whilelt p4.s, XZR, x21\n"
+ "ld1w { z16.s }, p4/Z, [x25, #4, MUL VL]\n"
+ ".inst 0x658abe1c // bfcvt z28.h, p7/M, z16.s\n"
+ "decw x21\n"
+ "whilelt p3.s, XZR, x21\n"
+ "ld1w { z16.s }, p3/Z, [x25, #5, MUL VL]\n"
+ ".inst 0x658abe1b // bfcvt z27.h, p7/M, z16.s\n"
+ "decw x21\n"
+ "whilelt p2.s, XZR, x21\n"
+ "ld1w { z16.s }, p2/Z, [x25, #6, MUL VL]\n"
+ ".inst 0x658abe1a // bfcvt z26.h, p7/M, z16.s\n"
+ "decw x21\n"
+ "ld1w { z16.s }, p1/Z, [x24]\n"
+ "whilelt p1.s, XZR, x21\n"
+ ".inst 0x648abe00 // bfcvtnt z0.h, p7/M, z16.s\n"
+ "decw x21\n"
+ "ld1w { z16.s }, p1/Z, [x25, #7, MUL VL]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0x658abe19 // bfcvt z25.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x24, #1, MUL VL]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "decw x21\n"
+ ".inst 0x648abe1f // bfcvtnt z31.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x25, #-8, MUL VL]\n"
+ ".inst 0x658abe18 // bfcvt z24.h, p7/M, z16.s\n"
+ "mov x20, x23\n"
+ "decw x22, ALL, MUL #16\n"
+ "ld1w { z16.s }, p6/Z, [x24, #2, MUL VL]\n"
+ "whilelt p6.s, XZR, x21\n"
+ "decw x21\n"
+ ".inst 0x648abe1e // bfcvtnt z30.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p6/Z, [x25, #-7, MUL VL]\n"
+ ".inst 0x658abe17 // bfcvt z23.h, p7/M, z16.s\n"
+ "add x23, x23, %x[out_stride]\n"
+ "ld1w { z16.s }, p5/Z, [x24, #3, MUL VL]\n"
+ "whilelt p5.s, XZR, x21\n"
+ "decw x21\n"
+ ".inst 0x648abe1d // bfcvtnt z29.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p5/Z, [x25, #-6, MUL VL]\n"
+ ".inst 0x658abe16 // bfcvt z22.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x24, #4, MUL VL]\n"
+ "whilelt p4.s, XZR, x21\n"
+ "decw x21\n"
+ ".inst 0x648abe1c // bfcvtnt z28.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x25, #-5, MUL VL]\n"
+ ".inst 0x658abe15 // bfcvt z21.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x24, #5, MUL VL]\n"
+ "whilelt p3.s, XZR, x21\n"
+ "decw x21\n"
+ ".inst 0x648abe1b // bfcvtnt z27.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x25, #-4, MUL VL]\n"
+ ".inst 0x658abe14 // bfcvt z20.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, #6, MUL VL]\n"
+ "whilelt p2.s, XZR, x21\n"
+ "decw x21\n"
+ ".inst 0x648abe1a // bfcvtnt z26.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #-3, MUL VL]\n"
+ ".inst 0x658abe13 // bfcvt z19.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x24, #7, MUL VL]\n"
+ "whilelt p1.s, XZR, x21\n"
+ "decw x21\n"
+ ".inst 0x648abe19 // bfcvtnt z25.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #-2, MUL VL]\n"
+ "addvl x24, x24, #16\n"
+ ".inst 0x658abe12 // bfcvt z18.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x24, #-8, MUL VL]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "cmp x22, #0x0\n"
+ ".inst 0x648abe18 // bfcvtnt z24.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x25, #-1, MUL VL]\n"
+ ".inst 0x658abe11 // bfcvt z17.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p6/Z, [x24, #-7, MUL VL]\n"
+ ".inst 0x648abe17 // bfcvtnt z23.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p5/Z, [x24, #-6, MUL VL]\n"
+ ".inst 0x648abe16 // bfcvtnt z22.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x24, #-5, MUL VL]\n"
+ ".inst 0x648abe15 // bfcvtnt z21.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x24, #-4, MUL VL]\n"
+ ".inst 0x648abe14 // bfcvtnt z20.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, #-3, MUL VL]\n"
+ ".inst 0x648abe13 // bfcvtnt z19.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x24, #-2, MUL VL]\n"
+ ".inst 0x648abe12 // bfcvtnt z18.h, p7/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x24, #-1, MUL VL]\n"
+ "st1h { z0.h }, p7, [x20]\n"
+ ".inst 0x648abe11 // bfcvtnt z17.h, p7/M, z16.s\n"
+ "st1h { z31.h }, p7, [x20, #1, MUL VL]\n"
+ "st1h { z30.h }, p7, [x20, #2, MUL VL]\n"
+ "st1h { z29.h }, p7, [x20, #3, MUL VL]\n"
+ "st1h { z28.h }, p7, [x20, #4, MUL VL]\n"
+ "st1h { z27.h }, p7, [x20, #5, MUL VL]\n"
+ "st1h { z26.h }, p7, [x20, #6, MUL VL]\n"
+ "st1h { z25.h }, p7, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1h { z24.h }, p7, [x20, #-8, MUL VL]\n"
+ "st1h { z23.h }, p7, [x20, #-7, MUL VL]\n"
+ "st1h { z22.h }, p7, [x20, #-6, MUL VL]\n"
+ "st1h { z21.h }, p7, [x20, #-5, MUL VL]\n"
+ "st1h { z20.h }, p7, [x20, #-4, MUL VL]\n"
+ "st1h { z19.h }, p7, [x20, #-3, MUL VL]\n"
+ "st1h { z18.h }, p7, [x20, #-2, MUL VL]\n"
+ "st1h { z17.h }, p7, [x20, #-1, MUL VL]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #16\n"
+ "bge 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+template<>
+void Transform<16, 2, true, VLType::SME>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_16VL_2x2_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
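This variant fuses the 2x2 interleave with narrowing: BFCVT writes the converted halfwords of the first row into the even (bottom) lanes of the destination, and BFCVTNT fills the odd (top) lanes from the second row, so no separate zip instructions are needed. That is also why the wrapper passes width in float elements and forwards the pointers without reinterpret_cast. A scalar stand-in for the narrowing step, assuming round-to-nearest-even (the instructions' exact behaviour also depends on FPCR state, so treat this as an approximation):

    #include <cstdint>
    #include <cstring>

    // fp32 -> bf16 by rounding the top 16 bits to nearest, ties to even.
    // NaN handling is omitted for brevity.
    static uint16_t fp32_to_bf16(float f)
    {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof bits);
        bits += 0x7FFFu + ((bits >> 16) & 1u);
        return static_cast<uint16_t>(bits >> 16);
    }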
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
new file mode 100644
index 0000000000..a6ddb8fec0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_1VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 1 * height * sme::get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cmp %x[height], #0x4\n"
+ "ptrue p1.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
+ "cnth x21, ALL, MUL #4\n"
+ "add x20, x24, %x[in_stride]\n"
+ "cmp x23, x21\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z31.h }, p1/Z, [x26]\n"
+ "sub x23, x23, x21\n"
+ "cmp x23, x21\n"
+ "ld1h { z30.h }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z29.h }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1h { z28.h }, p1/Z, [x26, #3, MUL VL]\n"
+ "addvl x26, x26, #4\n"
+ "ld1h { z27.h }, p1/Z, [x25]\n"
+ "ld1h { z26.h }, p1/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z25.h }, p1/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z24.h }, p1/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "ld1h { z23.h }, p1/Z, [x24]\n"
+ "ld1h { z22.h }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z21.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z20.h }, p1/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "ld1h { z19.h }, p1/Z, [x20]\n"
+ "ld1h { z18.h }, p1/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z16.h }, p1/Z, [x20, #3, MUL VL]\n"
+ "st1h { z31.h }, p1, [x22]\n"
+ "addvl x20, x20, #4\n"
+ "st1h { z27.h }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z23.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z30.h }, p1, [x22]\n"
+ "st1h { z26.h }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z18.h }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z29.h }, p1, [x22]\n"
+ "st1h { z25.h }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z21.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z17.h }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z28.h }, p1, [x22]\n"
+ "st1h { z24.h }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z20.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x23, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.h, XZR, x23\n"
+ "dech x23\n"
+ "ld1h { z19.h }, p0/Z, [x26]\n"
+ "cmp x23, #0x0\n"
+ "addvl x26, x26, #1\n"
+ "ld1h { z18.h }, p0/Z, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "ld1h { z17.h }, p0/Z, [x24]\n"
+ "addvl x24, x24, #1\n"
+ "ld1h { z16.h }, p0/Z, [x20]\n"
+ "addvl x20, x20, #1\n"
+ "st1h { z19.h }, p1, [x22]\n"
+ "st1h { z18.h }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #4\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x21, %x[width]\n"
+ "cnth x20, ALL, MUL #4\n"
+ "mov x26, %x[in]\n"
+ "cmp x21, x20\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1h { z19.h }, p1/Z, [x26]\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1h { z18.h }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1h { z16.h }, p1/Z, [x26, #3, MUL VL]\n"
+ "st1h { z19.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "addvl x26, x26, #4\n"
+ "st1h { z18.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z17.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z16.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "whilelt p0.h, XZR, x21\n"
+ "dech x21\n"
+ "ld1h { z16.h }, p0/Z, [x26]\n"
+ "st1h { z16.h }, p1, [x22]\n"
+ "cmp x21, #0x0\n"
+ "addvl x26, x26, #1\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #1\n"
+ "bge 7b\n"
+ "12:" // Done
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<1, 1, true, VLType::SME>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_1VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<1, 1, true, VLType::SME>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_1VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<1, 1, true, VLType::SME>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_1VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
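All three specialisations above funnel into the single uint16_t kernel by reinterpreting each row as halfwords and rescaling the element count, which is what the sizeof(T) / 2 factors express. Roughly:

    #include <cstddef>
    #include <cstdint>

    // Width rescaling used by the wrappers above (a sketch): a float row holds
    // twice as many halfwords as elements; the 16-bit types map one-to-one.
    template <typename T>
    size_t width_in_halfwords(int x0, int xmax)
    {
        return static_cast<size_t>(xmax - x0) * sizeof(T) / sizeof(uint16_t);
    }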
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
new file mode 100644
index 0000000000..399a52e233
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_1VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 1 * roundup<size_t>(height, 4) * sme::get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "mov x22, %x[width]\n"
+ "cntb x21\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x22, x21\n"
+ "mov x20, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z17.b }, p1/Z, [x26]\n"
+ "sub x22, x22, x21\n"
+ "cmp x22, x21\n"
+ "ld1b { z18.b }, p1/Z, [x25]\n"
+ "addvl x26, x26, #1\n"
+ "addvl x25, x25, #1\n"
+ "ld1b { z16.b }, p1/Z, [x24]\n"
+ "zip1 z20.b, z17.b, z16.b\n"
+ "zip2 z19.b, z17.b, z16.b\n"
+ "addvl x24, x24, #1\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "zip1 z17.b, z18.b, z16.b\n"
+ "zip2 z18.b, z18.b, z16.b\n"
+ "addvl x23, x23, #1\n"
+ "zip1 z16.b, z20.b, z17.b\n"
+ "st1b { z16.b }, p1, [x20]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip2 z16.b, z20.b, z17.b\n"
+ "st1b { z16.b }, p1, [x20]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip1 z17.b, z19.b, z18.b\n"
+ "zip2 z16.b, z19.b, z18.b\n"
+ "st1b { z17.b }, p1, [x20]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "st1b { z16.b }, p1, [x20]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x22, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.b, XZR, x22\n"
+ "ld1b { z17.b }, p0/Z, [x26]\n"
+ "decw x22\n"
+ "ld1b { z18.b }, p0/Z, [x25]\n"
+ "cmp x22, #0x0\n"
+ "incd x26, ALL, MUL #2\n"
+ "ld1b { z16.b }, p0/Z, [x24]\n"
+ "zip1 z17.b, z17.b, z16.b\n"
+ "incd x25, ALL, MUL #2\n"
+ "incd x24, ALL, MUL #2\n"
+ "ld1b { z16.b }, p0/Z, [x23]\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "incd x23, ALL, MUL #2\n"
+ "zip1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p1, [x20]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #1\n"
+ "bge 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<1, 4, true, VLType::SME>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_1VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<1, 4, true, VLType::SME>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_1VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
new file mode 100644
index 0000000000..6318e29a79
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_1VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 1 * roundup<size_t>(height, 2) * sme::get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cmp %x[height], #0x4\n"
+ "ptrue p1.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
+ "cnth x21, ALL, MUL #2\n"
+ "add x20, x24, %x[in_stride]\n"
+ "cmp x23, x21\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z17.h }, p1/Z, [x26]\n"
+ "sub x23, x23, x21\n"
+ "cmp x23, x21\n"
+ "ld1h { z16.h }, p1/Z, [x25]\n"
+ "zip1 z24.h, z17.h, z16.h\n"
+ "zip2 z23.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x24]\n"
+ "ld1h { z16.h }, p1/Z, [x20]\n"
+ "zip1 z22.h, z17.h, z16.h\n"
+ "zip2 z21.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x26, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "ld1h { z16.h }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z20.h, z17.h, z16.h\n"
+ "addvl x25, x25, #2\n"
+ "zip2 z19.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "ld1h { z16.h }, p1/Z, [x20, #1, MUL VL]\n"
+ "st1h { z24.h }, p1, [x22]\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "addvl x20, x20, #2\n"
+ "st1h { z22.h }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z16.h, z18.h, z16.h\n"
+ "st1h { z23.h }, p1, [x22]\n"
+ "st1h { z21.h }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z20.h }, p1, [x22]\n"
+ "st1h { z17.h }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z19.h }, p1, [x22]\n"
+ "st1h { z16.h }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x23, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.h, XZR, x23\n"
+ "ld1h { z17.h }, p0/Z, [x26]\n"
+ "decw x23\n"
+ "ld1h { z16.h }, p0/Z, [x25]\n"
+ "cmp x23, #0x0\n"
+ "incd x26, ALL, MUL #4\n"
+ "zip1 z18.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p0/Z, [x24]\n"
+ "incd x25, ALL, MUL #4\n"
+ "incd x24, ALL, MUL #4\n"
+ "ld1h { z16.h }, p0/Z, [x20]\n"
+ "incd x20, ALL, MUL #4\n"
+ "zip1 z16.h, z17.h, z16.h\n"
+ "st1h { z18.h }, p1, [x22]\n"
+ "st1h { z16.h }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #2\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "mov x21, %x[width]\n"
+ "cnth x20, ALL, MUL #2\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1h { z18.h }, p1/Z, [x26]\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1h { z16.h }, p1/Z, [x25]\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "zip2 z19.h, z18.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x26, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "ld1h { z16.h }, p1/Z, [x25, #1, MUL VL]\n"
+ "st1h { z17.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "st1h { z19.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "addvl x25, x25, #2\n"
+ "zip2 z16.h, z18.h, z16.h\n"
+ "st1h { z17.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z16.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "whilelt p0.h, XZR, x21\n"
+ "ld1h { z17.h }, p0/Z, [x26]\n"
+ "decw x21\n"
+ "ld1h { z16.h }, p0/Z, [x25]\n"
+ "cmp x21, #0x0\n"
+ "incd x26, ALL, MUL #4\n"
+ "zip1 z16.h, z17.h, z16.h\n"
+ "incd x25, ALL, MUL #4\n"
+ "st1h { z16.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #1\n"
+ "bge 7b\n"
+ "12:" // Done
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<1, 2, true, VLType::SME>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_1VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<1, 2, true, VLType::SME>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_1VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
new file mode 100644
index 0000000000..b90063028d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_1VL_2x2_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 1 * roundup<size_t>(height, 2) * sme::get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cmp %x[height], #0x4\n"
+ "ptrue p1.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
+ "cnth x21, ALL, MUL #2\n"
+ "add x20, x24, %x[in_stride]\n"
+ "cmp x23, x21\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z16.s }, p1/Z, [x26]\n"
+ ".inst 0x658aa618 // bfcvt z24.h, p1/M, z16.s\n"
+ "sub x23, x23, x21\n"
+ "cmp x23, x21\n"
+ "ld1w { z16.s }, p1/Z, [x24]\n"
+ ".inst 0x658aa617 // bfcvt z23.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658aa616 // bfcvt z22.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x658aa615 // bfcvt z21.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x26, #2, MUL VL]\n"
+ ".inst 0x658aa614 // bfcvt z20.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x658aa613 // bfcvt z19.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x26, #3, MUL VL]\n"
+ ".inst 0x658aa612 // bfcvt z18.h, p1/M, z16.s\n"
+ "addvl x26, x26, #4\n"
+ "ld1w { z16.s }, p1/Z, [x24, #3, MUL VL]\n"
+ ".inst 0x658aa611 // bfcvt z17.h, p1/M, z16.s\n"
+ "addvl x24, x24, #4\n"
+ "ld1w { z16.s }, p1/Z, [x25]\n"
+ ".inst 0x648aa618 // bfcvtnt z24.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa617 // bfcvtnt z23.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x648aa616 // bfcvtnt z22.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x648aa615 // bfcvtnt z21.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648aa614 // bfcvtnt z20.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, #2, MUL VL]\n"
+ ".inst 0x648aa613 // bfcvtnt z19.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0x648aa612 // bfcvtnt z18.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "st1h { z24.h }, p1, [x22]\n"
+ "addvl x20, x20, #4\n"
+ ".inst 0x648aa611 // bfcvtnt z17.h, p1/M, z16.s\n"
+ "st1h { z23.h }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z22.h }, p1, [x22]\n"
+ "st1h { z21.h }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z20.h }, p1, [x22]\n"
+ "st1h { z19.h }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z18.h }, p1, [x22]\n"
+ "st1h { z17.h }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x23, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.s, XZR, x23\n"
+ "ld1w { z16.s }, p0/Z, [x26]\n"
+ ".inst 0x658aa612 // bfcvt z18.h, p1/M, z16.s\n"
+ "decw x23\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x658aa611 // bfcvt z17.h, p1/M, z16.s\n"
+ "cmp x23, #0x0\n"
+ "addvl x26, x26, #1\n"
+ "ld1w { z16.s }, p0/Z, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "addvl x24, x24, #1\n"
+ ".inst 0x648aa612 // bfcvtnt z18.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, x20, #1\n"
+ ".inst 0x648aa611 // bfcvtnt z17.h, p1/M, z16.s\n"
+ "st1h { z18.h }, p1, [x22]\n"
+ "st1h { z17.h }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #2\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "mov x21, %x[width]\n"
+ "cnth x20, ALL, MUL #2\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1w { z16.s }, p1/Z, [x26]\n"
+ ".inst 0x658aa614 // bfcvt z20.h, p1/M, z16.s\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658aa613 // bfcvt z19.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x26, #2, MUL VL]\n"
+ ".inst 0x658aa612 // bfcvt z18.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x26, #3, MUL VL]\n"
+ ".inst 0x658aa611 // bfcvt z17.h, p1/M, z16.s\n"
+ "addvl x26, x26, #4\n"
+ "ld1w { z16.s }, p1/Z, [x25]\n"
+ ".inst 0x648aa614 // bfcvtnt z20.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x648aa613 // bfcvtnt z19.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648aa612 // bfcvtnt z18.h, p1/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "st1h { z20.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "addvl x25, x25, #4\n"
+ "st1h { z19.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ ".inst 0x648aa611 // bfcvtnt z17.h, p1/M, z16.s\n"
+ "st1h { z18.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z17.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z16.s }, p0/Z, [x26]\n"
+ ".inst 0x658aa611 // bfcvt z17.h, p1/M, z16.s\n"
+ "decw x21\n"
+ "ld1w { z16.s }, p0/Z, [x25]\n"
+ "cmp x21, #0x0\n"
+ "addvl x26, x26, #1\n"
+ ".inst 0x648aa611 // bfcvtnt z17.h, p1/M, z16.s\n"
+ "addvl x25, x25, #1\n"
+ "st1h { z17.h }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #1\n"
+ "bge 7b\n"
+ "12:" // Done
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+template<>
+void Transform<1, 2, true, VLType::SME>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_1VL_2x2_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
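This specialisation shares its template arguments <1, 2, true, VLType::SME> with the bfloat16-to-bfloat16 variant earlier in the patch; the two coexist as overloads distinguished only by the element types, with the fused-convert path taking float input directly. Illustrative call shapes (the buffer names are hypothetical, not from this patch):

    // Both calls resolve to Transform<1, 2, true, VLType::SME>:
    Transform<1, 2, true, VLType::SME>(out_bf16, in_bf16, ldin, x0, xmax, k0, kmax); // plain 2x2 interleave
    Transform<1, 2, true, VLType::SME>(out_bf16, in_fp32, ldin, x0, xmax, k0, kmax); // fused fp32 -> bf16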
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
new file mode 100644
index 0000000000..f827197ab7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_2VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 2 * height * sme::get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cmp %x[height], #0x4\n"
+ "ptrue p2.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
+ "cnth x20, ALL, MUL #4\n"
+ "add x21, x24, %x[in_stride]\n"
+ "cmp x23, x20\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "sub x23, x23, x20\n"
+ "ld1h { z31.h }, p2/Z, [x26]\n"
+ "cmp x23, x20\n"
+ "ld1h { z30.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z29.h }, p2/Z, [x26, #2, MUL VL]\n"
+ "ld1h { z28.h }, p2/Z, [x26, #3, MUL VL]\n"
+ "addvl x26, x26, #4\n"
+ "ld1h { z27.h }, p2/Z, [x25]\n"
+ "ld1h { z26.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z25.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z24.h }, p2/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "ld1h { z23.h }, p2/Z, [x24]\n"
+ "ld1h { z22.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z21.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z20.h }, p2/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "ld1h { z19.h }, p2/Z, [x21]\n"
+ "ld1h { z18.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z17.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x21, #3, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22]\n"
+ "addvl x21, x21, #4\n"
+ "st1h { z30.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z27.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z23.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #5, MUL VL]\n"
+ "st1h { z19.h }, p2, [x22, #6, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z29.h }, p2, [x22]\n"
+ "st1h { z28.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z25.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z24.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z20.h }, p2, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x23, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x23\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z23.h }, p1/Z, [x26]\n"
+ "dech x20\n"
+ "dech x23, ALL, MUL #2\n"
+ "ld1h { z22.h }, p1/Z, [x25]\n"
+ "whilelt p0.h, XZR, x20\n"
+ "cmp x23, #0x0\n"
+ "ld1h { z21.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "ld1h { z20.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "ld1h { z19.h }, p1/Z, [x24]\n"
+ "ld1h { z18.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "ld1h { z17.h }, p1/Z, [x21]\n"
+ "ld1h { z16.h }, p0/Z, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #2\n"
+ "st1h { z23.h }, p2, [x22]\n"
+ "st1h { z21.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z20.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x21, %x[width]\n"
+ "cnth x20, ALL, MUL #4\n"
+ "mov x26, %x[in]\n"
+ "cmp x21, x20\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "sub x21, x21, x20\n"
+ "ld1h { z19.h }, p2/Z, [x26]\n"
+ "cmp x21, x20\n"
+ "ld1h { z18.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z17.h }, p2/Z, [x26, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x26, #3, MUL VL]\n"
+ "st1h { z19.h }, p2, [x22]\n"
+ "addvl x26, x26, #4\n"
+ "st1h { z18.h }, p2, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z17.h }, p2, [x22]\n"
+ "st1h { z16.h }, p2, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z17.h }, p0/Z, [x26]\n"
+ "dech x20\n"
+ "dech x21, ALL, MUL #2\n"
+ "whilelt p0.h, XZR, x20\n"
+ "cmp x21, #0x0\n"
+ "ld1h { z16.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22]\n"
+ "addvl x26, x26, #2\n"
+ "st1h { z16.h }, p2, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #2\n"
+ "bge 7b\n"
+ "12:" // Done
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<2, 1, true, VLType::SME>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_2VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<2, 1, true, VLType::SME>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_2VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<2, 1, true, VLType::SME>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_2VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
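
Note that the plain 2VL kernel never inspects element values; it only re-blocks whole rows, so a single uint16_t body serves float, bfloat16 and __fp16. The wrappers reinterpret_cast the pointers and rescale the column count into 16-bit lanes, which is where the "* sizeof(float) / 2" in the specializations above comes from. A hedged restatement of that width arithmetic (kernel_width_u16 is an illustrative helper, not library API):

#include <cstddef>
#include <cstdint>

// Column count handed to the u16 kernel, in 16-bit lanes: a float column
// contributes two lanes, a bfloat16 or __fp16 column one lane.
template <typename T>
static size_t kernel_width_u16(int x0, int xmax)
{
    return static_cast<size_t>(xmax - x0) * sizeof(T) / sizeof(uint16_t);
}
// e.g. kernel_width_u16<float>(0, 8) == 16; for 16-bit types it is 8.
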
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
new file mode 100644
index 0000000000..c471d66e17
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_2VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 2 * roundup<size_t>(height, 4) * sme::get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "mov x22, %x[width]\n"
+ "cntb x21\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x22, x21\n"
+ "mov x20, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z17.b }, p1/Z, [x26]\n"
+ "sub x22, x22, x21\n"
+ "cmp x22, x21\n"
+ "ld1b { z18.b }, p1/Z, [x25]\n"
+ "addvl x26, x26, #1\n"
+ "addvl x25, x25, #1\n"
+ "ld1b { z16.b }, p1/Z, [x24]\n"
+ "zip1 z20.b, z17.b, z16.b\n"
+ "zip2 z19.b, z17.b, z16.b\n"
+ "addvl x24, x24, #1\n"
+ "ld1b { z17.b }, p1/Z, [x23]\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "zip2 z18.b, z18.b, z17.b\n"
+ "addvl x23, x23, #1\n"
+ "zip1 z17.b, z20.b, z16.b\n"
+ "zip2 z16.b, z20.b, z16.b\n"
+ "st1b { z17.b }, p1, [x20]\n"
+ "st1b { z16.b }, p1, [x20, #1, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip1 z17.b, z19.b, z18.b\n"
+ "zip2 z16.b, z19.b, z18.b\n"
+ "st1b { z17.b }, p1, [x20]\n"
+ "st1b { z16.b }, p1, [x20, #1, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x22, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.b, XZR, x22\n"
+ "ld1b { z18.b }, p0/Z, [x26]\n"
+ "decw x22, ALL, MUL #2\n"
+ "ld1b { z17.b }, p0/Z, [x25]\n"
+ "cmp x22, #0x0\n"
+ "incd x26, ALL, MUL #4\n"
+ "ld1b { z16.b }, p0/Z, [x24]\n"
+ "zip1 z18.b, z18.b, z16.b\n"
+ "incd x25, ALL, MUL #4\n"
+ "incd x24, ALL, MUL #4\n"
+ "ld1b { z16.b }, p0/Z, [x23]\n"
+ "zip1 z16.b, z17.b, z16.b\n"
+ "incd x23, ALL, MUL #4\n"
+ "zip1 z17.b, z18.b, z16.b\n"
+ "zip2 z16.b, z18.b, z16.b\n"
+ "st1b { z17.b }, p1, [x20]\n"
+ "st1b { z16.b }, p1, [x20, #1, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #2\n"
+ "bge 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<2, 4, true, VLType::SME>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_2VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<2, 4, true, VLType::SME>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_2VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
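
The 1x4 variant interleaves four byte rows per output column with a two-level zip tree: zip1/zip2 first pairs rows (0,2) and (1,3), then zips those results, which collapses to emitting r0[j], r1[j], r2[j], r3[j] for every column j. A scalar sketch of that equivalence (interleave_4rows is an illustrative name; rows past the matrix height would be the zeroed pad_row):

#include <cstddef>
#include <cstdint>
#include <vector>

// zip(zip(r0, r2), zip(r1, r3)) at byte granularity is a 4-way interleave.
static std::vector<uint8_t> interleave_4rows(const uint8_t *r0, const uint8_t *r1,
                                             const uint8_t *r2, const uint8_t *r3,
                                             size_t width)
{
    std::vector<uint8_t> out;
    out.reserve(4 * width);
    for (size_t j = 0; j < width; ++j)
    {
        out.push_back(r0[j]);
        out.push_back(r1[j]);
        out.push_back(r2[j]);
        out.push_back(r3[j]);
    }
    return out;
}
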
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
new file mode 100644
index 0000000000..5f967fa615
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_2VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 2 * roundup<size_t>(height, 2) * sme::get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cmp %x[height], #0x4\n"
+ "ptrue p1.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
+ "cnth x21, ALL, MUL #2\n"
+ "add x20, x24, %x[in_stride]\n"
+ "cmp x23, x21\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z17.h }, p1/Z, [x26]\n"
+ "sub x23, x23, x21\n"
+ "cmp x23, x21\n"
+ "ld1h { z16.h }, p1/Z, [x25]\n"
+ "zip1 z24.h, z17.h, z16.h\n"
+ "zip2 z23.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x24]\n"
+ "ld1h { z16.h }, p1/Z, [x20]\n"
+ "zip1 z22.h, z17.h, z16.h\n"
+ "zip2 z21.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x26, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "ld1h { z16.h }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "zip1 z20.h, z17.h, z16.h\n"
+ "zip2 z19.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "ld1h { z16.h }, p1/Z, [x20, #1, MUL VL]\n"
+ "st1h { z24.h }, p1, [x22]\n"
+ "addvl x20, x20, #2\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "st1h { z23.h }, p1, [x22, #1, MUL VL]\n"
+ "zip2 z16.h, z18.h, z16.h\n"
+ "st1h { z22.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z21.h }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z20.h }, p1, [x22]\n"
+ "st1h { z19.h }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x23, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.h, XZR, x23\n"
+ "ld1h { z17.h }, p0/Z, [x26]\n"
+ "decw x23, ALL, MUL #2\n"
+ "ld1h { z16.h }, p0/Z, [x25]\n"
+ "cmp x23, #0x0\n"
+ "addvl x26, x26, #1\n"
+ "zip1 z20.h, z17.h, z16.h\n"
+ "ld1h { z19.h }, p0/Z, [x24]\n"
+ "addvl x25, x25, #1\n"
+ "addvl x24, x24, #1\n"
+ "zip2 z18.h, z17.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x20]\n"
+ "addvl x20, x20, #1\n"
+ "zip1 z17.h, z19.h, z16.h\n"
+ "zip2 z16.h, z19.h, z16.h\n"
+ "st1h { z20.h }, p1, [x22]\n"
+ "st1h { z18.h }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #4\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "mov x21, %x[width]\n"
+ "cnth x20, ALL, MUL #2\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1h { z18.h }, p1/Z, [x26]\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1h { z16.h }, p1/Z, [x25]\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "zip2 z19.h, z18.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x26, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "ld1h { z16.h }, p1/Z, [x25, #1, MUL VL]\n"
+ "st1h { z17.h }, p1, [x22]\n"
+ "addvl x25, x25, #2\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "st1h { z19.h }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z16.h, z18.h, z16.h\n"
+ "st1h { z17.h }, p1, [x22]\n"
+ "st1h { z16.h }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "whilelt p0.h, XZR, x21\n"
+ "ld1h { z18.h }, p0/Z, [x26]\n"
+ "decw x21, ALL, MUL #2\n"
+ "ld1h { z16.h }, p0/Z, [x25]\n"
+ "cmp x21, #0x0\n"
+ "addvl x26, x26, #1\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "addvl x25, x25, #1\n"
+ "zip2 z16.h, z18.h, z16.h\n"
+ "st1h { z17.h }, p1, [x22]\n"
+ "st1h { z16.h }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #2\n"
+ "bge 7b\n"
+ "12:" // Done
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<2, 2, true, VLType::SME>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_2VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<2, 2, true, VLType::SME>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_2VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
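
For odd heights the 2x2 kernels redirect the second row pointer to a zeroed scratch row (the "csel x25, x25, %x[pad_row], GT") so the zip pairing always reads valid memory. A scalar sketch of pairing plus padding, assuming element-count strides and ignoring the VL-blocked output layout (transpose_interleave_2x2_scalar is an illustrative name):

#include <cstddef>
#include <cstdint>
#include <vector>

// Pairs rows (r, r+1) column by column; an odd final row pairs with
// zeros, standing in for the alloca'd pad_row.
static std::vector<uint16_t> transpose_interleave_2x2_scalar(
    const uint16_t *in, size_t stride_elems, size_t width, size_t height)
{
    std::vector<uint16_t> pad_row(width, 0);
    std::vector<uint16_t> out;
    out.reserve(((height + 1) / 2) * 2 * width);
    for (size_t r = 0; r < height; r += 2)
    {
        const uint16_t *row0 = in + r * stride_elems;
        const uint16_t *row1 = (r + 1 < height) ? row0 + stride_elems : pad_row.data();
        for (size_t j = 0; j < width; ++j) // zip1/zip2, scalar form
        {
            out.push_back(row0[j]);
            out.push_back(row1[j]);
        }
    }
    return out;
}
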
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
new file mode 100644
index 0000000000..f22b833821
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_2VL_2x2_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 2 * roundup<size_t>(height, 2) * sme::get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cmp %x[height], #0x4\n"
+ "ptrue p2.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
+ "cnth x20, ALL, MUL #2\n"
+ "add x21, x24, %x[in_stride]\n"
+ "cmp x23, x20\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z16.s }, p2/Z, [x26]\n"
+ ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
+ "sub x23, x23, x20\n"
+ "cmp x23, x20\n"
+ "ld1w { z16.s }, p2/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24]\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, #2, MUL VL]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, #3, MUL VL]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "addvl x26, x26, #4\n"
+ "ld1w { z16.s }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, #3, MUL VL]\n"
+ ".inst 0x658aaa11 // bfcvt z17.h, p2/M, z16.s\n"
+ "addvl x24, x24, #4\n"
+ "ld1w { z16.s }, p2/Z, [x25]\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21]\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, #1, MUL VL]\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, #3, MUL VL]\n"
+ "st1h { z24.h }, p2, [x22]\n"
+ "addvl x21, x21, #4\n"
+ ".inst 0x648aaa11 // bfcvtnt z17.h, p2/M, z16.s\n"
+ "st1h { z23.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "st1h { z19.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x23, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x23\n"
+ "whilelt p1.s, XZR, x20\n"
+ "ld1w { z16.s }, p1/Z, [x26]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z16.s }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x24]\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
+ "decw x23, ALL, MUL #2\n"
+ "cmp x23, #0x0\n"
+ "ld1w { z16.s }, p0/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x658aaa11 // bfcvt z17.h, p2/M, z16.s\n"
+ "addvl x26, x26, #2\n"
+ "addvl x24, x24, #2\n"
+ "ld1w { z16.s }, p1/Z, [x25]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #2\n"
+ ".inst 0x648aaa11 // bfcvtnt z17.h, p2/M, z16.s\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "st1h { z19.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #4\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "mov x21, %x[width]\n"
+ "cnth x20, ALL, MUL #2\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1w { z16.s }, p2/Z, [x26]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1w { z16.s }, p2/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, #2, MUL VL]\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, #3, MUL VL]\n"
+ ".inst 0x658aaa11 // bfcvt z17.h, p2/M, z16.s\n"
+ "addvl x26, x26, #4\n"
+ "ld1w { z16.s }, p2/Z, [x25]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #3, MUL VL]\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0x648aaa11 // bfcvtnt z17.h, p2/M, z16.s\n"
+ "st1h { z19.h }, p2, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z18.h }, p2, [x22]\n"
+ "st1h { z17.h }, p2, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p1.s, XZR, x20\n"
+ "ld1w { z16.s }, p1/Z, [x26]\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z16.s }, p0/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658aaa11 // bfcvt z17.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25]\n"
+ "decw x21, ALL, MUL #2\n"
+ "cmp x21, #0x0\n"
+ ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x648aaa11 // bfcvtnt z17.h, p2/M, z16.s\n"
+ "st1h { z18.h }, p2, [x22]\n"
+ "st1h { z17.h }, p2, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #2\n"
+ "bge 7b\n"
+ "12:" // Done
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+template<>
+void Transform<2, 2, true, VLType::SME>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_2VL_2x2_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
new file mode 100644
index 0000000000..14636e3218
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 4 * height * sme::get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cmp %x[height], #0x4\n"
+ "ptrue p4.b\n"
+ "blt 4f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p3.h, XZR, x20\n"
+ "ld1h { z31.h }, p3/Z, [x26]\n"
+ "dech x20\n"
+ "whilelt p2.h, XZR, x20\n"
+ "ld1h { z30.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z29.h }, p1/Z, [x26, #2, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z28.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "mov x20, x22\n"
+ "dech x21, ALL, MUL #4\n"
+ "ld1h { z27.h }, p3/Z, [x25]\n"
+ "ld1h { z26.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "cmp x21, #0x0\n"
+ "addvl x26, x26, #4\n"
+ "ld1h { z25.h }, p1/Z, [x25, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1h { z24.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "ld1h { z23.h }, p3/Z, [x24]\n"
+ "ld1h { z22.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z21.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z20.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "ld1h { z19.h }, p3/Z, [x23]\n"
+ "ld1h { z18.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "st1h { z31.h }, p4, [x20]\n"
+ "addvl x23, x23, #4\n"
+ "st1h { z30.h }, p4, [x20, #1, MUL VL]\n"
+ "st1h { z29.h }, p4, [x20, #2, MUL VL]\n"
+ "st1h { z28.h }, p4, [x20, #3, MUL VL]\n"
+ "st1h { z27.h }, p4, [x20, #4, MUL VL]\n"
+ "st1h { z26.h }, p4, [x20, #5, MUL VL]\n"
+ "st1h { z25.h }, p4, [x20, #6, MUL VL]\n"
+ "st1h { z24.h }, p4, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1h { z23.h }, p4, [x20, #-8, MUL VL]\n"
+ "st1h { z22.h }, p4, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p4, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p4, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p4, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p4, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p4, [x20, #-1, MUL VL]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #16\n"
+ "bge 1b\n"
+ "cbz %x[height], 8f\n"
+ "4:" // Main loop skip
+ "5:" // Tail row loop: Head
+ "mov x26, %x[in]\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x21, %x[width]\n"
+ "6:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z19.h }, p0/Z, [x26]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z17.h }, p0/Z, [x26, #2, MUL VL]\n"
+ "dech x20\n"
+ "dech x21, ALL, MUL #4\n"
+ "whilelt p0.h, XZR, x20\n"
+ "cmp x21, #0x0\n"
+ "ld1h { z16.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "st1h { z19.h }, p4, [x22]\n"
+ "addvl x26, x26, #4\n"
+ "st1h { z18.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p4, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 6b\n"
+ "7:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #4\n"
+ "bge 5b\n"
+ "8:" // Done
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<4, 1, true, VLType::SME>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_4VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 1, true, VLType::SME>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_4VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 1, true, VLType::SME>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_4VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
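
Each kernel precomputes out_stride, the byte distance between consecutive column blocks of the packed output: the kernel's VL multiple, times the height rounded up to the interleave block, times the vector length. The varying get_vector_length<uint8_t/uint16_t/uint32_t>() template argument scales the vector length in bytes by 1, 1/2 or 1/4 so each product lands in bytes. A hedged restatement under that reading (illustrative helpers, not library API):

#include <cstddef>

static size_t roundup_sz(size_t v, size_t m)
{
    return (v + m - 1) / m * m;
}

// Bytes between successive column blocks; 'scaled_vl' is the SVE vector
// length in bytes, pre-scaled as the kernels do via get_vector_length<T>().
// e.g. plain 4VL u16 copy:  out_stride_bytes(4, h, 1, vl_bytes)
//      2VL 2x2 interleave:  out_stride_bytes(2, h, 2, vl_bytes / 2)
static size_t out_stride_bytes(size_t vl_count, size_t height, size_t block, size_t scaled_vl)
{
    return vl_count * roundup_sz(height, block) * scaled_vl;
}
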
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
new file mode 100644
index 0000000000..2d46a481f3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_4VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 4) * sme::get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x23, x23, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "mov x21, %x[out]\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x20, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z17.b }, p0/Z, [x25]\n"
+ "decw x20, ALL, MUL #4\n"
+ "ld1b { z19.b }, p0/Z, [x24]\n"
+ "cmp x20, #0x0\n"
+ "addvl x25, x25, #1\n"
+ "ld1b { z16.b }, p0/Z, [x23]\n"
+ "zip1 z18.b, z17.b, z16.b\n"
+ "zip2 z20.b, z17.b, z16.b\n"
+ "addvl x24, x24, #1\n"
+ "ld1b { z16.b }, p0/Z, [x22]\n"
+ "zip1 z17.b, z19.b, z16.b\n"
+ "zip2 z19.b, z19.b, z16.b\n"
+ "addvl x23, x23, #1\n"
+ "addvl x22, x22, #1\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "zip2 z18.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x21]\n"
+ "zip1 z17.b, z20.b, z19.b\n"
+ "zip2 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p1, [x21, #1, MUL VL]\n"
+ "st1b { z17.b }, p1, [x21, #2, MUL VL]\n"
+ "st1b { z16.b }, p1, [x21, #3, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #4\n"
+ "bge 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<4, 4, true, VLType::SME>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_4VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 4, true, VLType::SME>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_4VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
new file mode 100644
index 0000000000..002a12479a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 2) * sme::get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cmp %x[height], #0x4\n"
+ "ptrue p2.b\n"
+ "blt 4f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z19.h }, p1/Z, [x26]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x25]\n"
+ "decw x21, ALL, MUL #4\n"
+ "cmp x21, #0x0\n"
+ "zip1 z24.h, z19.h, z17.h\n"
+ "ld1h { z16.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "addvl x25, x25, #2\n"
+ "zip2 z23.h, z19.h, z17.h\n"
+ "ld1h { z17.h }, p1/Z, [x24]\n"
+ "zip1 z22.h, z18.h, z16.h\n"
+ "zip2 z21.h, z18.h, z16.h\n"
+ "ld1h { z20.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "ld1h { z16.h }, p1/Z, [x23]\n"
+ "zip1 z19.h, z17.h, z16.h\n"
+ "zip2 z18.h, z17.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x23, #1, MUL VL]\n"
+ "addvl x23, x23, #2\n"
+ "zip1 z17.h, z20.h, z16.h\n"
+ "zip2 z16.h, z20.h, z16.h\n"
+ "st1h { z24.h }, p2, [x22]\n"
+ "st1h { z23.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ "cbz %x[height], 8f\n"
+ "4:" // Main loop skip
+ "5:" // Tail row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x21, %x[width]\n"
+ "6:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z18.h }, p1/Z, [x26]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z20.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x25]\n"
+ "decw x21, ALL, MUL #4\n"
+ "cmp x21, #0x0\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "ld1h { z16.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "addvl x25, x25, #2\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "zip1 z17.h, z20.h, z16.h\n"
+ "zip2 z16.h, z20.h, z16.h\n"
+ "st1h { z19.h }, p2, [x22]\n"
+ "st1h { z18.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 6b\n"
+ "7:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #4\n"
+ "bge 5b\n"
+ "8:" // Done
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<4, 2, true, VLType::SME>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_4VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 2, true, VLType::SME>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_4VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
new file mode 100644
index 0000000000..2a43f34f71
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_4VL_2x2_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 2) * sme::get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cmp %x[height], #0x4\n"
+ "ptrue p4.b\n"
+ "blt 4f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p3.s, XZR, x20\n"
+ "ld1w { z16.s }, p3/Z, [x26]\n"
+ ".inst 0x658ab218 // bfcvt z24.h, p4/M, z16.s\n"
+ "decw x20\n"
+ "whilelt p2.s, XZR, x20\n"
+ "ld1w { z16.s }, p2/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658ab217 // bfcvt z23.h, p4/M, z16.s\n"
+ "decw x20\n"
+ "whilelt p1.s, XZR, x20\n"
+ "ld1w { z16.s }, p1/Z, [x26, #2, MUL VL]\n"
+ ".inst 0x658ab216 // bfcvt z22.h, p4/M, z16.s\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z16.s }, p0/Z, [x26, #3, MUL VL]\n"
+ ".inst 0x658ab215 // bfcvt z21.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x24]\n"
+ ".inst 0x658ab214 // bfcvt z20.h, p4/M, z16.s\n"
+ "decw x21, ALL, MUL #4\n"
+ "cmp x21, #0x0\n"
+ "ld1w { z16.s }, p2/Z, [x24, #1, MUL VL]\n"
+ ".inst 0x658ab213 // bfcvt z19.h, p4/M, z16.s\n"
+ "addvl x26, x26, #4\n"
+ "ld1w { z16.s }, p1/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x658ab212 // bfcvt z18.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x24, #3, MUL VL]\n"
+ ".inst 0x658ab211 // bfcvt z17.h, p4/M, z16.s\n"
+ "addvl x24, x24, #4\n"
+ "ld1w { z16.s }, p3/Z, [x25]\n"
+ ".inst 0x648ab218 // bfcvtnt z24.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x648ab217 // bfcvtnt z23.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648ab216 // bfcvtnt z22.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0x648ab215 // bfcvtnt z21.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ ".inst 0x648ab214 // bfcvtnt z20.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, #1, MUL VL]\n"
+ ".inst 0x648ab213 // bfcvtnt z19.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x648ab212 // bfcvtnt z18.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ ".inst 0x648ab211 // bfcvtnt z17.h, p4/M, z16.s\n"
+ "st1h { z24.h }, p4, [x22]\n"
+ "st1h { z23.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z21.h }, p4, [x22, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x22, #4, MUL VL]\n"
+ "st1h { z19.h }, p4, [x22, #5, MUL VL]\n"
+ "st1h { z18.h }, p4, [x22, #6, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ "cbz %x[height], 8f\n"
+ "4:" // Main loop skip
+ "5:" // Tail row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x21, %x[width]\n"
+ "6:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p3.s, XZR, x20\n"
+ "ld1w { z16.s }, p3/Z, [x26]\n"
+ ".inst 0x658ab214 // bfcvt z20.h, p4/M, z16.s\n"
+ "decw x20\n"
+ "whilelt p2.s, XZR, x20\n"
+ "ld1w { z16.s }, p2/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658ab213 // bfcvt z19.h, p4/M, z16.s\n"
+ "decw x20\n"
+ "whilelt p1.s, XZR, x20\n"
+ "ld1w { z16.s }, p1/Z, [x26, #2, MUL VL]\n"
+ ".inst 0x658ab212 // bfcvt z18.h, p4/M, z16.s\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z16.s }, p0/Z, [x26, #3, MUL VL]\n"
+ ".inst 0x658ab211 // bfcvt z17.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x25]\n"
+ "decw x21, ALL, MUL #4\n"
+ "cmp x21, #0x0\n"
+ ".inst 0x648ab214 // bfcvtnt z20.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #1, MUL VL]\n"
+ "addvl x26, x26, #4\n"
+ ".inst 0x648ab213 // bfcvtnt z19.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648ab212 // bfcvtnt z18.h, p4/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0x648ab211 // bfcvtnt z17.h, p4/M, z16.s\n"
+ "st1h { z20.h }, p4, [x22]\n"
+ "st1h { z19.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z18.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 6b\n"
+ "7:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #4\n"
+ "bge 5b\n"
+ "8:" // Done
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+template<>
+void Transform<4, 2, true, VLType::SME>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_4VL_2x2_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
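
Throughout these kernels the column tails rely on whilelt predicates with zeroing loads (/Z): lanes at or beyond the remaining width read back as zero, so the matching stores can use an all-true predicate and out_stride already reserves the full block. A scalar analogue of one such masked load (load_whilelt_z is an illustrative name):

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>

// Emulates "whilelt p0.h, XZR, remaining" plus a p0/Z load of N halfword
// lanes: inactive lanes are zero-filled, like the /Z qualifier.
template <size_t N>
static std::array<uint16_t, N> load_whilelt_z(const uint16_t *src, size_t remaining)
{
    std::array<uint16_t, N> v{}; // zero-initialised
    std::copy_n(src, std::min(remaining, N), v.begin());
    return v;
}
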
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp
new file mode 100644
index 0000000000..be9ad666a9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 8 * height * sme::get_vector_length<uint8_t>();
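+    // out_stride is the byte distance between consecutive column blocks of
+    // the packed output: every source row contributes 8 vectors (8VL) of
+    // 16-bit data per block, taking sme::get_vector_length<uint8_t>() as
+    // the streaming vector length in bytes. For example, with a 512-bit
+    // SVL this works out to 8 * height * 64 bytes per block.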
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cmp %x[height], #0x2\n"
+ "ptrue p7.b\n"
+ "blt 4f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "mov x23, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x22, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x21, x22\n"
+ "whilelt p0.h, XZR, x21\n"
+ "ld1h { z31.h }, p0/Z, [x25]\n"
+ "dech x21\n"
+ "whilelt p6.h, XZR, x21\n"
+ "ld1h { z30.h }, p6/Z, [x25, #1, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p5.h, XZR, x21\n"
+ "ld1h { z29.h }, p5/Z, [x25, #2, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p4.h, XZR, x21\n"
+ "ld1h { z28.h }, p4/Z, [x25, #3, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p3.h, XZR, x21\n"
+ "ld1h { z27.h }, p3/Z, [x25, #4, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p2.h, XZR, x21\n"
+ "ld1h { z26.h }, p2/Z, [x25, #5, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p1.h, XZR, x21\n"
+ "ld1h { z25.h }, p1/Z, [x25, #6, MUL VL]\n"
+ "dech x21\n"
+ "mov x20, x23\n"
+ "ld1h { z24.h }, p0/Z, [x24]\n"
+ "whilelt p0.h, XZR, x21\n"
+ "dech x22, ALL, MUL #8\n"
+ "ld1h { z23.h }, p0/Z, [x25, #7, MUL VL]\n"
+ "ld1h { z22.h }, p6/Z, [x24, #1, MUL VL]\n"
+ "cmp x22, #0x0\n"
+ "addvl x25, x25, #8\n"
+ "ld1h { z21.h }, p5/Z, [x24, #2, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "ld1h { z20.h }, p4/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z19.h }, p3/Z, [x24, #4, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x24, #5, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x24, #6, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x24, #7, MUL VL]\n"
+ "st1h { z31.h }, p7, [x20]\n"
+ "addvl x24, x24, #8\n"
+ "st1h { z30.h }, p7, [x20, #1, MUL VL]\n"
+ "st1h { z29.h }, p7, [x20, #2, MUL VL]\n"
+ "st1h { z28.h }, p7, [x20, #3, MUL VL]\n"
+ "st1h { z27.h }, p7, [x20, #4, MUL VL]\n"
+ "st1h { z26.h }, p7, [x20, #5, MUL VL]\n"
+ "st1h { z25.h }, p7, [x20, #6, MUL VL]\n"
+ "st1h { z23.h }, p7, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1h { z24.h }, p7, [x20, #-8, MUL VL]\n"
+ "st1h { z22.h }, p7, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p7, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p7, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p7, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p7, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p7, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p7, [x20, #-1, MUL VL]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x2\n"
+ "addvl %x[out], %x[out], #16\n"
+ "bge 1b\n"
+ "cbz %x[height], 8f\n"
+ "4:" // Main loop skip
+ "5:" // Tail row loop: Head
+ "mov x25, %x[in]\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x23, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x21, %x[width]\n"
+ "6:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z23.h }, p0/Z, [x25]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z22.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z21.h }, p0/Z, [x25, #2, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z20.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z19.h }, p0/Z, [x25, #4, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x25, #5, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z17.h }, p0/Z, [x25, #6, MUL VL]\n"
+ "dech x20\n"
+ "dech x21, ALL, MUL #8\n"
+ "whilelt p0.h, XZR, x20\n"
+ "cmp x21, #0x0\n"
+ "ld1h { z16.h }, p0/Z, [x25, #7, MUL VL]\n"
+ "st1h { z23.h }, p7, [x23]\n"
+ "addvl x25, x25, #8\n"
+ "st1h { z22.h }, p7, [x23, #1, MUL VL]\n"
+ "st1h { z21.h }, p7, [x23, #2, MUL VL]\n"
+ "st1h { z20.h }, p7, [x23, #3, MUL VL]\n"
+ "st1h { z19.h }, p7, [x23, #4, MUL VL]\n"
+ "st1h { z18.h }, p7, [x23, #5, MUL VL]\n"
+ "st1h { z17.h }, p7, [x23, #6, MUL VL]\n"
+ "st1h { z16.h }, p7, [x23, #7, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bgt 6b\n"
+ "7:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 5b\n"
+ "8:" // Done
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
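+
+// The asm body only moves raw 16-bit lanes, so the fp32 specialisation can
+// reuse it by viewing each float as two uint16_t columns: hence the
+// (xmax-x0) * sizeof(float) / 2 width above. The bfloat16 and __fp16
+// variants below use the same formula with a factor of 1, and the double
+// overloads elsewhere in these transforms scale by sizeof(double) / 2,
+// i.e. four 16-bit lanes per element.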
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp
new file mode 100644
index 0000000000..45d2e24258
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 4) * sme::get_vector_length<uint32_t>();
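+    // When height is not a multiple of 4, the missing source rows are
+    // redirected (via the csel instructions below) to pad_row, a
+    // stack-allocated buffer of zeroes, and out_stride rounds height up
+    // to 4 so that every interleaved group of four rows occupies a full
+    // output block.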
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p2.b\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "mov x22, %x[out]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p1.b, XZR, x20\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
+ "decb x20\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z17.b }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x25]\n"
+ "decw x21, ALL, MUL #8\n"
+ "cmp x21, #0x0\n"
+ "ld1b { z21.b }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "addvl x25, x25, #2\n"
+ "ld1b { z16.b }, p1/Z, [x24]\n"
+ "zip1 z24.b, z19.b, z16.b\n"
+ "zip2 z20.b, z19.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x24, #1, MUL VL]\n"
+ "zip1 z23.b, z17.b, z16.b\n"
+ "zip2 z22.b, z17.b, z16.b\n"
+ "addvl x24, x24, #2\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "zip1 z17.b, z18.b, z16.b\n"
+ "zip2 z19.b, z18.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x23, #1, MUL VL]\n"
+ "zip1 z18.b, z21.b, z16.b\n"
+ "zip2 z21.b, z21.b, z16.b\n"
+ "addvl x23, x23, #2\n"
+ "zip1 z16.b, z24.b, z17.b\n"
+ "zip2 z17.b, z24.b, z17.b\n"
+ "st1b { z16.b }, p2, [x22]\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "zip2 z20.b, z20.b, z19.b\n"
+ "st1b { z17.b }, p2, [x22, #1, MUL VL]\n"
+ "zip1 z19.b, z23.b, z18.b\n"
+ "zip2 z18.b, z23.b, z18.b\n"
+ "st1b { z16.b }, p2, [x22, #2, MUL VL]\n"
+ "zip1 z17.b, z22.b, z21.b\n"
+ "zip2 z16.b, z22.b, z21.b\n"
+ "st1b { z20.b }, p2, [x22, #3, MUL VL]\n"
+ "st1b { z19.b }, p2, [x22, #4, MUL VL]\n"
+ "st1b { z18.b }, p2, [x22, #5, MUL VL]\n"
+ "st1b { z17.b }, p2, [x22, #6, MUL VL]\n"
+ "st1b { z16.b }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 4, true, VLType::SME>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 4, true, VLType::SME>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
new file mode 100644
index 0000000000..ec7c415e27
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 2) * sme::get_vector_length<uint16_t>();
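+    // Same padding idea as the 1x4 variants, but for pairs of rows: an odd
+    // height reads its second row from the zeroed pad_row, and the
+    // zip1/zip2 pairs below interleave the two 16-bit rows element by
+    // element.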
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p4.b\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p3.h, XZR, x20\n"
+ "ld1h { z20.h }, p3/Z, [x24]\n"
+ "dech x20\n"
+ "whilelt p2.h, XZR, x20\n"
+ "ld1h { z19.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z24.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z17.h }, p3/Z, [x23]\n"
+ "decw x21, ALL, MUL #8\n"
+ "cmp x21, #0x0\n"
+ "zip1 z23.h, z20.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "zip2 z22.h, z20.h, z17.h\n"
+ "zip1 z21.h, z19.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "zip2 z20.h, z19.h, z16.h\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "zip1 z17.h, z24.h, z16.h\n"
+ "zip2 z16.h, z24.h, z16.h\n"
+ "st1h { z23.h }, p4, [x22]\n"
+ "st1h { z22.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z21.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z20.h }, p4, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p4, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p4, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 2, true, VLType::SME>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 2, true, VLType::SME>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..f627fe575f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_12VL_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 12 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
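+    // Each column block packs a group of four fp32 rows into 12 bf16
+    // vectors. The unrolled main loop below steps the column counter by
+    // "cnth x24, ALL, MUL #6" -- six halfword-vector-lengths of fp32
+    // columns, which appears to correspond to two complete 12VL output
+    // blocks per pass.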
+
+ __asm__ __volatile__(
+ "ptrue p6.b\n"
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "mov x25, %x[width]\n"
+ "cnth x24, ALL, MUL #6\n"
+ "add x23, x26, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x26, x26, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "cmp x25, x24\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z22.s }, p6/Z, [x28]\n"
+ "ld1w { z7.s }, p6/Z, [x28, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1w { z19.s }, p6/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z18.s }, p6/Z, [x28, #3, MUL VL]\n"
+ "mov x20, x22\n"
+ "sub x25, x25, x24\n"
+ "ld1w { z5.s }, p6/Z, [x28, #4, MUL VL]\n"
+ "ld1w { z25.s }, p6/Z, [x28, #5, MUL VL]\n"
+ "cmp x25, x24\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1w { z20.s }, p6/Z, [x28, #6, MUL VL]\n"
+ "ld1w { z23.s }, p6/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #12\n"
+ "ld1w { z4.s }, p6/Z, [x26]\n"
+ "ld1w { z10.s }, p6/Z, [x26, #1, MUL VL]\n"
+ "zip1 z14.s, z22.s, z4.s\n"
+ "zip2 z22.s, z22.s, z4.s\n"
+ "ld1w { z28.s }, p6/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z27.s }, p6/Z, [x26, #3, MUL VL]\n"
+ "zip1 z24.s, z7.s, z10.s\n"
+ "zip2 z15.s, z7.s, z10.s\n"
+ "ld1w { z7.s }, p6/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z2.s }, p6/Z, [x26, #5, MUL VL]\n"
+ "zip1 z9.s, z19.s, z28.s\n"
+ "zip2 z0.s, z19.s, z28.s\n"
+ "ld1w { z19.s }, p6/Z, [x26, #6, MUL VL]\n"
+ "ld1w { z16.s }, p6/Z, [x26, #7, MUL VL]\n"
+ "addvl x26, x26, #12\n"
+ "zip1 z1.s, z18.s, z27.s\n"
+ "ld1w { z30.s }, p6/Z, [x28, #-4, MUL VL]\n"
+ "ld1w { z29.s }, p6/Z, [x28, #-3, MUL VL]\n"
+ "zip2 z17.s, z18.s, z27.s\n"
+ ".inst 0x658ab9d5 // bfcvt z21.h, p6/M, z14.s\n"
+ "ld1w { z31.s }, p6/Z, [x27]\n"
+ "ld1w { z8.s }, p6/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x658abacc // bfcvt z12.h, p6/M, z22.s\n"
+ ".inst 0x658abb0e // bfcvt z14.h, p6/M, z24.s\n"
+ "ld1w { z22.s }, p6/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z28.s }, p6/Z, [x27, #3, MUL VL]\n"
+ ".inst 0x658ab9ea // bfcvt z10.h, p6/M, z15.s\n"
+ ".inst 0x658ab92f // bfcvt z15.h, p6/M, z9.s\n"
+ "ld1w { z27.s }, p6/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z13.s }, p6/Z, [x27, #5, MUL VL]\n"
+ ".inst 0x658ab803 // bfcvt z3.h, p6/M, z0.s\n"
+ ".inst 0x658ab832 // bfcvt z18.h, p6/M, z1.s\n"
+ "ld1w { z26.s }, p6/Z, [x27, #6, MUL VL]\n"
+ "ld1w { z9.s }, p6/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #12\n"
+ ".inst 0x658aba26 // bfcvt z6.h, p6/M, z17.s\n"
+ "ld1w { z1.s }, p6/Z, [x26, #-4, MUL VL]\n"
+ "ld1w { z0.s }, p6/Z, [x26, #-3, MUL VL]\n"
+ "zip1 z17.s, z5.s, z7.s\n"
+ "zip2 z5.s, z5.s, z7.s\n"
+ "ld1w { z24.s }, p6/Z, [x23]\n"
+ "ld1w { z11.s }, p6/Z, [x23, #1, MUL VL]\n"
+ "zip1 z7.s, z31.s, z24.s\n"
+ "zip2 z31.s, z31.s, z24.s\n"
+ "ld1w { z4.s }, p6/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z24.s }, p6/Z, [x23, #3, MUL VL]\n"
+ ".inst 0x648ab8f5 // bfcvtnt z21.h, p6/M, z7.s\n"
+ "zip1 z7.s, z8.s, z11.s\n"
+ "zip2 z11.s, z8.s, z11.s\n"
+ "ld1w { z8.s }, p6/Z, [x23, #4, MUL VL]\n"
+ ".inst 0x648abbec // bfcvtnt z12.h, p6/M, z31.s\n"
+ "ld1w { z31.s }, p6/Z, [x23, #5, MUL VL]\n"
+ ".inst 0x648ab8ee // bfcvtnt z14.h, p6/M, z7.s\n"
+ "ld1w { z7.s }, p6/Z, [x23, #6, MUL VL]\n"
+ ".inst 0x648ab96a // bfcvtnt z10.h, p6/M, z11.s\n"
+ "zip1 z11.s, z22.s, z4.s\n"
+ "zip2 z4.s, z22.s, z4.s\n"
+ "ld1w { z22.s }, p6/Z, [x23, #7, MUL VL]\n"
+ "addvl x23, x23, #12\n"
+ ".inst 0x648ab96f // bfcvtnt z15.h, p6/M, z11.s\n"
+ "ld1w { z11.s }, p6/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x648ab883 // bfcvtnt z3.h, p6/M, z4.s\n"
+ "zip1 z4.s, z28.s, z24.s\n"
+ "zip2 z24.s, z28.s, z24.s\n"
+ "ld1w { z28.s }, p6/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x648ab892 // bfcvtnt z18.h, p6/M, z4.s\n"
+ "ld1w { z4.s }, p6/Z, [x27, #-4, MUL VL]\n"
+ ".inst 0x648abb06 // bfcvtnt z6.h, p6/M, z24.s\n"
+ "zip1 z24.s, z25.s, z2.s\n"
+ "zip2 z25.s, z25.s, z2.s\n"
+ "zip1 z2.s, z20.s, z19.s\n"
+ "zip2 z20.s, z20.s, z19.s\n"
+ "zip1 z19.s, z23.s, z16.s\n"
+ "zip2 z16.s, z23.s, z16.s\n"
+ "zip1 z23.s, z30.s, z1.s\n"
+ "zip2 z30.s, z30.s, z1.s\n"
+ "zip1 z1.s, z29.s, z0.s\n"
+ "zip2 z0.s, z29.s, z0.s\n"
+ ".inst 0x658aba31 // bfcvt z17.h, p6/M, z17.s\n"
+ "zip1 z29.s, z27.s, z8.s\n"
+ ".inst 0x658ab8a5 // bfcvt z5.h, p6/M, z5.s\n"
+ "zip2 z27.s, z27.s, z8.s\n"
+ "ld1w { z8.s }, p6/Z, [x27, #-3, MUL VL]\n"
+ ".inst 0x658abb18 // bfcvt z24.h, p6/M, z24.s\n"
+ ".inst 0x658abb39 // bfcvt z25.h, p6/M, z25.s\n"
+ ".inst 0x658ab842 // bfcvt z2.h, p6/M, z2.s\n"
+ ".inst 0x658aba94 // bfcvt z20.h, p6/M, z20.s\n"
+ ".inst 0x658aba73 // bfcvt z19.h, p6/M, z19.s\n"
+ ".inst 0x658aba10 // bfcvt z16.h, p6/M, z16.s\n"
+ ".inst 0x658abaf7 // bfcvt z23.h, p6/M, z23.s\n"
+ ".inst 0x658abbde // bfcvt z30.h, p6/M, z30.s\n"
+ ".inst 0x658ab821 // bfcvt z1.h, p6/M, z1.s\n"
+ ".inst 0x658ab800 // bfcvt z0.h, p6/M, z0.s\n"
+ ".inst 0x648abbb1 // bfcvtnt z17.h, p6/M, z29.s\n"
+ "ld1w { z29.s }, p6/Z, [x26, #-2, MUL VL]\n"
+ ".inst 0x648abb65 // bfcvtnt z5.h, p6/M, z27.s\n"
+ "zip1 z27.s, z13.s, z31.s\n"
+ "zip2 z31.s, z13.s, z31.s\n"
+ "ld1w { z13.s }, p6/Z, [x26, #-1, MUL VL]\n"
+ ".inst 0x648abb78 // bfcvtnt z24.h, p6/M, z27.s\n"
+ "ld1w { z27.s }, p6/Z, [x23, #-4, MUL VL]\n"
+ ".inst 0x648abbf9 // bfcvtnt z25.h, p6/M, z31.s\n"
+ "zip1 z31.s, z26.s, z7.s\n"
+ "zip2 z26.s, z26.s, z7.s\n"
+ "ld1w { z7.s }, p6/Z, [x23, #-3, MUL VL]\n"
+ ".inst 0x648abbe2 // bfcvtnt z2.h, p6/M, z31.s\n"
+ "ld1w { z31.s }, p6/Z, [x27, #-2, MUL VL]\n"
+ ".inst 0x648abb54 // bfcvtnt z20.h, p6/M, z26.s\n"
+ "zip1 z26.s, z9.s, z22.s\n"
+ "zip2 z9.s, z9.s, z22.s\n"
+ "ld1w { z22.s }, p6/Z, [x27, #-1, MUL VL]\n"
+ ".inst 0x648abb53 // bfcvtnt z19.h, p6/M, z26.s\n"
+ "ld1w { z26.s }, p6/Z, [x23, #-2, MUL VL]\n"
+ ".inst 0x648ab930 // bfcvtnt z16.h, p6/M, z9.s\n"
+ "ld1w { z9.s }, p6/Z, [x23, #-1, MUL VL]\n"
+ "st1h { z21.h }, p6, [x21]\n"
+ "zip1 z21.s, z4.s, z27.s\n"
+ "zip2 z27.s, z4.s, z27.s\n"
+ "zip1 z4.s, z8.s, z7.s\n"
+ "zip2 z8.s, z8.s, z7.s\n"
+ "st1h { z12.h }, p6, [x21, #1, MUL VL]\n"
+ "zip1 z7.s, z11.s, z29.s\n"
+ "zip2 z11.s, z11.s, z29.s\n"
+ "st1h { z14.h }, p6, [x21, #2, MUL VL]\n"
+ "zip1 z29.s, z28.s, z13.s\n"
+ "zip2 z12.s, z28.s, z13.s\n"
+ "st1h { z10.h }, p6, [x21, #3, MUL VL]\n"
+ "st1h { z15.h }, p6, [x21, #4, MUL VL]\n"
+ ".inst 0x648abab7 // bfcvtnt z23.h, p6/M, z21.s\n"
+ ".inst 0x648abb7e // bfcvtnt z30.h, p6/M, z27.s\n"
+ "st1h { z3.h }, p6, [x21, #5, MUL VL]\n"
+ ".inst 0x648ab881 // bfcvtnt z1.h, p6/M, z4.s\n"
+ ".inst 0x648ab900 // bfcvtnt z0.h, p6/M, z8.s\n"
+ "st1h { z18.h }, p6, [x21, #6, MUL VL]\n"
+ ".inst 0x658ab8e8 // bfcvt z8.h, p6/M, z7.s\n"
+ "zip1 z27.s, z31.s, z26.s\n"
+ "st1h { z6.h }, p6, [x21, #7, MUL VL]\n"
+ "addvl x21, x21, #12\n"
+ ".inst 0x658ab96e // bfcvt z14.h, p6/M, z11.s\n"
+ "zip2 z28.s, z31.s, z26.s\n"
+ ".inst 0x658abbbd // bfcvt z29.h, p6/M, z29.s\n"
+ "zip1 z21.s, z22.s, z9.s\n"
+ "st1h { z17.h }, p6, [x21, #-4, MUL VL]\n"
+ ".inst 0x658ab992 // bfcvt z18.h, p6/M, z12.s\n"
+ "zip2 z17.s, z22.s, z9.s\n"
+ "st1h { z5.h }, p6, [x21, #-3, MUL VL]\n"
+ "st1h { z24.h }, p6, [x21, #-2, MUL VL]\n"
+ ".inst 0x648abb68 // bfcvtnt z8.h, p6/M, z27.s\n"
+ ".inst 0x648abb8e // bfcvtnt z14.h, p6/M, z28.s\n"
+ "st1h { z25.h }, p6, [x21, #-1, MUL VL]\n"
+ ".inst 0x648ababd // bfcvtnt z29.h, p6/M, z21.s\n"
+ ".inst 0x648aba32 // bfcvtnt z18.h, p6/M, z17.s\n"
+ "st1h { z2.h }, p6, [x20]\n"
+ "st1h { z20.h }, p6, [x20, #1, MUL VL]\n"
+ "st1h { z19.h }, p6, [x20, #2, MUL VL]\n"
+ "st1h { z16.h }, p6, [x20, #3, MUL VL]\n"
+ "st1h { z23.h }, p6, [x20, #4, MUL VL]\n"
+ "st1h { z30.h }, p6, [x20, #5, MUL VL]\n"
+ "st1h { z1.h }, p6, [x20, #6, MUL VL]\n"
+ "st1h { z0.h }, p6, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ "st1h { z8.h }, p6, [x20, #-4, MUL VL]\n"
+ "st1h { z14.h }, p6, [x20, #-3, MUL VL]\n"
+ "st1h { z29.h }, p6, [x20, #-2, MUL VL]\n"
+ "st1h { z18.h }, p6, [x20, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x25, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x25\n"
+ "whilelt p5.s, XZR, x20\n"
+ "ld1w { z22.s }, p5/Z, [x28]\n"
+ "ld1w { z21.s }, p5/Z, [x26]\n"
+ "decw x20\n"
+ "whilelt p4.s, XZR, x20\n"
+ "ld1w { z20.s }, p4/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x26, #1, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p3.s, XZR, x20\n"
+ "ld1w { z18.s }, p3/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z17.s }, p3/Z, [x26, #2, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p2.s, XZR, x20\n"
+ "ld1w { z30.s }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x26, #3, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p1.s, XZR, x20\n"
+ "ld1w { z13.s }, p1/Z, [x28, #4, MUL VL]\n"
+ "ld1w { z29.s }, p5/Z, [x27]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z12.s }, p0/Z, [x28, #5, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z11.s }, p3/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "zip1 z27.s, z22.s, z21.s\n"
+ "zip2 z26.s, z22.s, z21.s\n"
+ "ld1w { z9.s }, p1/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z8.s }, p0/Z, [x26, #5, MUL VL]\n"
+ "zip1 z25.s, z20.s, z19.s\n"
+ "zip2 z24.s, z20.s, z19.s\n"
+ "ld1w { z23.s }, p5/Z, [x23]\n"
+ "ld1w { z22.s }, p4/Z, [x23, #1, MUL VL]\n"
+ "zip1 z21.s, z18.s, z17.s\n"
+ "zip2 z20.s, z18.s, z17.s\n"
+ "ld1w { z19.s }, p3/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #3, MUL VL]\n"
+ "zip1 z17.s, z30.s, z16.s\n"
+ "zip2 z16.s, z30.s, z16.s\n"
+ "ld1w { z7.s }, p1/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z6.s }, p0/Z, [x27, #5, MUL VL]\n"
+ ".inst 0x658abb65 // bfcvt z5.h, p6/M, z27.s\n"
+ "zip1 z4.s, z29.s, z23.s\n"
+ "ld1w { z3.s }, p1/Z, [x23, #4, MUL VL]\n"
+ "ld1w { z2.s }, p0/Z, [x23, #5, MUL VL]\n"
+ ".inst 0x658abb41 // bfcvt z1.h, p6/M, z26.s\n"
+ "zip2 z0.s, z29.s, z23.s\n"
+ ".inst 0x658abb3f // bfcvt z31.h, p6/M, z25.s\n"
+ "zip1 z30.s, z28.s, z22.s\n"
+ "mov x20, x22\n"
+ "decd x25, ALL, MUL #12\n"
+ ".inst 0x658abb1d // bfcvt z29.h, p6/M, z24.s\n"
+ "zip2 z28.s, z28.s, z22.s\n"
+ "cmp x25, #0x0\n"
+ "addvl x28, x28, #6\n"
+ ".inst 0x658ababb // bfcvt z27.h, p6/M, z21.s\n"
+ "zip1 z23.s, z11.s, z19.s\n"
+ "addvl x27, x27, #6\n"
+ "addvl x26, x26, #6\n"
+ ".inst 0x658aba9a // bfcvt z26.h, p6/M, z20.s\n"
+ "zip2 z22.s, z11.s, z19.s\n"
+ "addvl x23, x23, #6\n"
+ "add x22, x22, %x[out_stride]\n"
+ ".inst 0x658aba39 // bfcvt z25.h, p6/M, z17.s\n"
+ "zip1 z21.s, z10.s, z18.s\n"
+ ".inst 0x658aba18 // bfcvt z24.h, p6/M, z16.s\n"
+ "zip2 z20.s, z10.s, z18.s\n"
+ "zip1 z19.s, z13.s, z9.s\n"
+ "zip2 z18.s, z13.s, z9.s\n"
+ "zip1 z17.s, z12.s, z8.s\n"
+ "zip2 z16.s, z12.s, z8.s\n"
+ ".inst 0x648ab885 // bfcvtnt z5.h, p6/M, z4.s\n"
+ ".inst 0x648ab801 // bfcvtnt z1.h, p6/M, z0.s\n"
+ "st1h { z5.h }, p6, [x20]\n"
+ ".inst 0x648abbdf // bfcvtnt z31.h, p6/M, z30.s\n"
+ ".inst 0x648abb9d // bfcvtnt z29.h, p6/M, z28.s\n"
+ "st1h { z1.h }, p6, [x20, #1, MUL VL]\n"
+ ".inst 0x648abafb // bfcvtnt z27.h, p6/M, z23.s\n"
+ ".inst 0x648abada // bfcvtnt z26.h, p6/M, z22.s\n"
+ "st1h { z31.h }, p6, [x20, #2, MUL VL]\n"
+ ".inst 0x648abab9 // bfcvtnt z25.h, p6/M, z21.s\n"
+ ".inst 0x648aba98 // bfcvtnt z24.h, p6/M, z20.s\n"
+ "st1h { z29.h }, p6, [x20, #3, MUL VL]\n"
+ ".inst 0x658aba77 // bfcvt z23.h, p6/M, z19.s\n"
+ "zip1 z22.s, z7.s, z3.s\n"
+ "st1h { z27.h }, p6, [x20, #4, MUL VL]\n"
+ ".inst 0x658aba55 // bfcvt z21.h, p6/M, z18.s\n"
+ "zip2 z20.s, z7.s, z3.s\n"
+ "st1h { z26.h }, p6, [x20, #5, MUL VL]\n"
+ ".inst 0x658aba33 // bfcvt z19.h, p6/M, z17.s\n"
+ "zip1 z18.s, z6.s, z2.s\n"
+ "st1h { z25.h }, p6, [x20, #6, MUL VL]\n"
+ ".inst 0x658aba11 // bfcvt z17.h, p6/M, z16.s\n"
+ "zip2 z16.s, z6.s, z2.s\n"
+ "st1h { z24.h }, p6, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ ".inst 0x648abad7 // bfcvtnt z23.h, p6/M, z22.s\n"
+ ".inst 0x648aba95 // bfcvtnt z21.h, p6/M, z20.s\n"
+ "st1h { z23.h }, p6, [x20, #-4, MUL VL]\n"
+ ".inst 0x648aba53 // bfcvtnt z19.h, p6/M, z18.s\n"
+ ".inst 0x648aba11 // bfcvtnt z17.h, p6/M, z16.s\n"
+ "st1h { z21.h }, p6, [x20, #-3, MUL VL]\n"
+ "st1h { z19.h }, p6, [x20, #-2, MUL VL]\n"
+ "st1h { z17.h }, p6, [x20, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #12\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<12, 4, true, VLType::SVE>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_12VL_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
new file mode 100644
index 0000000000..b33c4f6c2d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_1VL(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 1 * height * get_vector_length<uint8_t>();
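+    // The simplest layout in this family: each source row contributes
+    // exactly one 32-bit vector (1VL) per column block, so consecutive
+    // blocks sit height * vector-length-in-bytes apart in the output.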
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "ptrue p1.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "mov x25, %x[width]\n"
+ "cntw x24, ALL, MUL #2\n"
+ "add x23, x26, %x[in_stride]\n"
+ "add x21, x23, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "cmp x25, x24\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "sub x25, x25, x24\n"
+ "ld1w { z23.s }, p1/Z, [x26]\n"
+ "ld1w { z22.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "cmp x25, x24\n"
+ "ld1w { z21.s }, p1/Z, [x23]\n"
+ "ld1w { z20.s }, p1/Z, [x23, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "addvl x23, x23, #2\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "ld1w { z18.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "st1w { z23.s }, p1, [x22]\n"
+ "addvl x20, x20, #2\n"
+ "st1w { z21.s }, p1, [x22, #1, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z17.s }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1w { z22.s }, p1, [x22]\n"
+ "st1w { z20.s }, p1, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z16.s }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x25, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.s, XZR, x25\n"
+ "decw x25\n"
+ "ld1w { z19.s }, p0/Z, [x26]\n"
+ "ld1w { z18.s }, p0/Z, [x23]\n"
+ "cmp x25, #0x0\n"
+ "addvl x26, x26, #1\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "addvl x23, x23, #1\n"
+ "addvl x21, x21, #1\n"
+ "st1w { z19.s }, p1, [x22]\n"
+ "addvl x20, x20, #1\n"
+ "st1w { z18.s }, p1, [x22, #1, MUL VL]\n"
+ "st1w { z17.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z16.s }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #4\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x21, %x[width]\n"
+ "cntw x20, ALL, MUL #2\n"
+ "mov x26, %x[in]\n"
+ "cmp x21, x20\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "sub x21, x21, x20\n"
+ "ld1w { z17.s }, p1/Z, [x26]\n"
+ "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "st1w { z17.s }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "cmp x21, x20\n"
+ "st1w { z16.s }, p1, [x22]\n"
+ "addvl x26, x26, #2\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "whilelt p0.s, XZR, x21\n"
+ "decw x21\n"
+ "ld1w { z16.s }, p0/Z, [x26]\n"
+ "st1w { z16.s }, p1, [x22]\n"
+ "cmp x21, #0x0\n"
+ "addvl x26, x26, #1\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #1\n"
+ "bge 7b\n"
+ "12:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<1, 1, true, VLType::SVE>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_1VL(
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
new file mode 100644
index 0000000000..e468787815
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_1VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 1 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "ptrue p1.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x10, %x[in]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "mov x25, %x[width]\n"
+ "cntb x24, ALL, MUL #2\n"
+ "add x23, x26, %x[in_stride]\n"
+ "add x21, x23, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "cmp x25, x24\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z20.b }, p1/Z, [x10]\n"
+ "ld1b { z18.b }, p1/Z, [x9]\n"
+ "sub x25, x25, x24\n"
+ "cmp x25, x24\n"
+ "ld1b { z17.b }, p1/Z, [x28]\n"
+ "ld1b { z16.b }, p1/Z, [x27]\n"
+ "zip1 z25.b, z20.b, z17.b\n"
+ "zip1 z24.b, z18.b, z16.b\n"
+ "ld1b { z21.b }, p1/Z, [x26]\n"
+ "ld1b { z19.b }, p1/Z, [x23]\n"
+ "zip2 z2.b, z20.b, z17.b\n"
+ "zip2 z1.b, z18.b, z16.b\n"
+ "ld1b { z18.b }, p1/Z, [x21]\n"
+ "ld1b { z17.b }, p1/Z, [x20]\n"
+ "zip1 z20.b, z21.b, z18.b\n"
+ "zip1 z16.b, z19.b, z17.b\n"
+ "ld1b { z0.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z31.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z30.b, z21.b, z18.b\n"
+ "zip2 z29.b, z19.b, z17.b\n"
+ "ld1b { z23.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z22.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip1 z19.b, z25.b, z24.b\n"
+ "zip1 z18.b, z20.b, z16.b\n"
+ "ld1b { z28.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z27.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip2 z17.b, z25.b, z24.b\n"
+ "zip2 z16.b, z20.b, z16.b\n"
+ "ld1b { z21.b }, p1/Z, [x21, #1, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x20, #1, MUL VL]\n"
+ "st1b { z19.b }, p1, [x22]\n"
+ "zip1 z26.b, z0.b, z23.b\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z25.b, z31.b, z22.b\n"
+ "zip1 z24.b, z28.b, z21.b\n"
+ "st1b { z17.b }, p1, [x22]\n"
+ "zip1 z19.b, z27.b, z20.b\n"
+ "zip1 z17.b, z2.b, z1.b\n"
+ "addvl x10, x10, #2\n"
+ "st1b { z16.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z18.b, z30.b, z29.b\n"
+ "zip2 z16.b, z2.b, z1.b\n"
+ "st1b { z17.b }, p1, [x22]\n"
+ "zip2 z17.b, z30.b, z29.b\n"
+ "zip2 z23.b, z0.b, z23.b\n"
+ "addvl x9, x9, #2\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z22.b, z31.b, z22.b\n"
+ "zip2 z21.b, z28.b, z21.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "zip2 z20.b, z27.b, z20.b\n"
+ "zip1 z16.b, z26.b, z25.b\n"
+ "addvl x28, x28, #2\n"
+ "st1b { z17.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z18.b, z24.b, z19.b\n"
+ "zip2 z17.b, z26.b, z25.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "zip2 z16.b, z24.b, z19.b\n"
+ "zip1 z19.b, z23.b, z22.b\n"
+ "addvl x27, x27, #2\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z18.b, z21.b, z20.b\n"
+ "addvl x26, x26, #2\n"
+ "st1b { z17.b }, p1, [x22]\n"
+ "addvl x23, x23, #2\n"
+ "addvl x21, x21, #2\n"
+ "zip2 z17.b, z23.b, z22.b\n"
+ "st1b { z16.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "addvl x20, x20, #2\n"
+ "zip2 z16.b, z21.b, z20.b\n"
+ "st1b { z19.b }, p1, [x22]\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1b { z17.b }, p1, [x22]\n"
+ "st1b { z16.b }, p1, [x22, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x25, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.b, XZR, x25\n"
+ "ld1b { z19.b }, p0/Z, [x10]\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "decw x25\n"
+ "ld1b { z17.b }, p0/Z, [x28]\n"
+ "ld1b { z16.b }, p0/Z, [x27]\n"
+ "zip1 z21.b, z19.b, z17.b\n"
+ "zip1 z20.b, z18.b, z16.b\n"
+ "ld1b { z18.b }, p0/Z, [x26]\n"
+ "ld1b { z19.b }, p0/Z, [x23]\n"
+ "cmp x25, #0x0\n"
+ "incd x10, ALL, MUL #2\n"
+ "ld1b { z17.b }, p0/Z, [x21]\n"
+ "ld1b { z16.b }, p0/Z, [x20]\n"
+ "zip1 z18.b, z18.b, z17.b\n"
+ "zip1 z16.b, z19.b, z16.b\n"
+ "incd x9, ALL, MUL #2\n"
+ "incd x28, ALL, MUL #2\n"
+ "zip1 z17.b, z21.b, z20.b\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "incd x27, ALL, MUL #2\n"
+ "incd x26, ALL, MUL #2\n"
+ "st1b { z17.b }, p1, [x22]\n"
+ "incd x23, ALL, MUL #2\n"
+ "incd x21, ALL, MUL #2\n"
+ "st1b { z16.b }, p1, [x22, #1, MUL VL]\n"
+ "incd x20, ALL, MUL #2\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x8\n"
+ "addvl %x[out], %x[out], #2\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x10, %x[in]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "mov x21, %x[width]\n"
+ "cntb x20, ALL, MUL #2\n"
+ "add x27, x28, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x27, %x[in_stride]\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "csel x28, x28, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1b { z21.b }, p1/Z, [x10]\n"
+ "ld1b { z18.b }, p1/Z, [x9]\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1b { z17.b }, p1/Z, [x28]\n"
+ "ld1b { z16.b }, p1/Z, [x27]\n"
+ "zip1 z20.b, z21.b, z17.b\n"
+ "zip1 z19.b, z18.b, z16.b\n"
+ "ld1b { z24.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z23.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z22.b, z21.b, z17.b\n"
+ "zip2 z21.b, z18.b, z16.b\n"
+ "ld1b { z18.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z17.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z16.b, z20.b, z19.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z20.b, z24.b, z18.b\n"
+ "zip1 z19.b, z23.b, z17.b\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "zip1 z16.b, z22.b, z21.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z16.b, z22.b, z21.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z18.b, z24.b, z18.b\n"
+ "zip2 z17.b, z23.b, z17.b\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z16.b, z20.b, z19.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "addvl x28, x28, #2\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "addvl x27, x27, #2\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "whilelt p0.b, XZR, x21\n"
+ "ld1b { z19.b }, p0/Z, [x10]\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "decw x21\n"
+ "ld1b { z17.b }, p0/Z, [x28]\n"
+ "ld1b { z16.b }, p0/Z, [x27]\n"
+ "zip1 z17.b, z19.b, z17.b\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "cmp x21, #0x0\n"
+ "incd x10, ALL, MUL #2\n"
+ "zip1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "incd x9, ALL, MUL #2\n"
+ "incd x28, ALL, MUL #2\n"
+ "incd x27, ALL, MUL #2\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #1\n"
+ "bge 7b\n"
+ "12:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
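+
+// In the column loops above, the row pointers advance with
+// "incd ..., ALL, MUL #2" -- two doubleword counts, i.e. a quarter of a
+// vector of bytes -- which appears to match the zip1-selected quarter of
+// each loaded byte vector that is actually stored per iteration, while
+// decw steps the column counter down by the same amount.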
+
+} // anonymous namespace
+
+template<>
+void Transform<1, 4, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_1VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<1, 4, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_1VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_2VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_2VL_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..f66fcdc994
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_2VL_2x4_fp32bf16.hpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_2VL_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 2 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "mov x25, %x[width]\n"
+ "cnth x24\n"
+ "cmp %x[height], #0x3\n"
+ "mov x23, %x[out]\n"
+ "add x22, x26, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "csel x20, x20, %x[pad_row], GT\n"
+ "csel x21, x21, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "cmp x25, x24\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z18.s }, p1/Z, [x26]\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
+ "sub x25, x25, x24\n"
+ "ld1w { z21.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "cmp x25, x24\n"
+ "addvl x26, x26, #2\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "addvl x21, x21, #2\n"
+ "zip1 z19.s, z18.s, z17.s\n"
+ "zip2 z18.s, z18.s, z17.s\n"
+ "ld1w { z25.s }, p1/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "addvl x22, x22, #2\n"
+ "zip1 z17.s, z21.s, z16.s\n"
+ "zip2 z16.s, z21.s, z16.s\n"
+ "addvl x20, x20, #2\n"
+ ".inst 0x658aa677 // bfcvt z23.h, p1/M, z19.s\n"
+ "zip1 z22.s, z26.s, z20.s\n"
+ ".inst 0x658aa655 // bfcvt z21.h, p1/M, z18.s\n"
+ "zip2 z20.s, z26.s, z20.s\n"
+ ".inst 0x658aa633 // bfcvt z19.h, p1/M, z17.s\n"
+ "zip1 z18.s, z25.s, z24.s\n"
+ ".inst 0x658aa611 // bfcvt z17.h, p1/M, z16.s\n"
+ "zip2 z16.s, z25.s, z24.s\n"
+ ".inst 0x648aa6d7 // bfcvtnt z23.h, p1/M, z22.s\n"
+ ".inst 0x648aa695 // bfcvtnt z21.h, p1/M, z20.s\n"
+ ".inst 0x648aa653 // bfcvtnt z19.h, p1/M, z18.s\n"
+ ".inst 0x648aa611 // bfcvtnt z17.h, p1/M, z16.s\n"
+ "st1h { z23.h }, p1, [x23]\n"
+ "st1h { z21.h }, p1, [x23, #1, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "st1h { z19.h }, p1, [x23]\n"
+ "st1h { z17.h }, p1, [x23, #1, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x25, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.s, XZR, x25\n"
+ "decd x25, ALL, MUL #2\n"
+ "ld1w { z19.s }, p0/Z, [x26]\n"
+ "addvl x26, x26, #1\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "addvl x21, x21, #1\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ "addvl x22, x22, #1\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "addvl x20, x20, #1\n"
+ "cmp x25, #0x0\n"
+ "zip1 z17.s, z19.s, z16.s\n"
+ "zip2 z16.s, z19.s, z16.s\n"
+ "zip1 z19.s, z20.s, z18.s\n"
+ "zip2 z18.s, z20.s, z18.s\n"
+ ".inst 0x658aa631 // bfcvt z17.h, p1/M, z17.s\n"
+ ".inst 0x658aa610 // bfcvt z16.h, p1/M, z16.s\n"
+ ".inst 0x648aa671 // bfcvtnt z17.h, p1/M, z19.s\n"
+ ".inst 0x648aa650 // bfcvtnt z16.h, p1/M, z18.s\n"
+ "st1h { z17.h }, p1, [x23]\n"
+ "st1h { z16.h }, p1, [x23, #1, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #2\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<2, 4, true, VLType::SVE>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_2VL_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
new file mode 100644
index 0000000000..546800fa69
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 3 * height * get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "ptrue p3.b\n"
+ "blt 4f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p2.h, XZR, x20\n"
+ "ld1h { z27.h }, p2/Z, [x26]\n"
+ "ld1h { z26.h }, p2/Z, [x25]\n"
+ "dech x20\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z25.h }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z24.h }, p1/Z, [x25, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z23.h }, p0/Z, [x26, #2, MUL VL]\n"
+ "ld1h { z22.h }, p0/Z, [x25, #2, MUL VL]\n"
+ "mov x20, x22\n"
+ "dech x21, ALL, MUL #3\n"
+ "ld1h { z21.h }, p2/Z, [x24]\n"
+ "ld1h { z20.h }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "cmp x21, #0x0\n"
+ "addvl x26, x26, #3\n"
+ "ld1h { z17.h }, p1/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x23, #2, MUL VL]\n"
+ "st1h { z27.h }, p3, [x20]\n"
+ "addvl x25, x25, #3\n"
+ "st1h { z25.h }, p3, [x20, #1, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "addvl x23, x23, #3\n"
+ "st1h { z23.h }, p3, [x20, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z26.h }, p3, [x20, #3, MUL VL]\n"
+ "st1h { z24.h }, p3, [x20, #4, MUL VL]\n"
+ "st1h { z22.h }, p3, [x20, #5, MUL VL]\n"
+ "st1h { z21.h }, p3, [x20, #6, MUL VL]\n"
+ "st1h { z20.h }, p3, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ "st1h { z19.h }, p3, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p3, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p3, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p3, [x20, #-1, MUL VL]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #12\n"
+ "bge 1b\n"
+ "cbz %x[height], 8f\n"
+ "4:" // Main loop skip
+ "5:" // Tail row loop: Head
+ "mov x26, %x[in]\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x21, %x[width]\n"
+ "6:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x26]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z17.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "dech x20\n"
+ "dech x21, ALL, MUL #3\n"
+ "whilelt p0.h, XZR, x20\n"
+ "cmp x21, #0x0\n"
+ "ld1h { z16.h }, p0/Z, [x26, #2, MUL VL]\n"
+ "st1h { z18.h }, p3, [x22]\n"
+ "addvl x26, x26, #3\n"
+ "st1h { z17.h }, p3, [x22, #1, MUL VL]\n"
+ "st1h { z16.h }, p3, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 6b\n"
+ "7:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #3\n"
+ "bge 5b\n"
+ "8:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<3, 1, true, VLType::SVE>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<3, 1, true, VLType::SVE>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<3, 1, true, VLType::SVE>(
+ double *out, const double *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(double) / 2,
+ stride * sizeof(double),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
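
Note on the specializations above: float, __fp16 and double all funnel into the same uint16_t kernel, with width expressed in 16-bit lanes rather than elements. A minimal sketch of that convention (the helper name lanes16 is illustrative, not part of the patch):

    #include <cstddef>

    // Each element of type T is carried as sizeof(T)/2 contiguous 16-bit
    // lanes, so a row of (xmax - x0) elements occupies this many lanes.
    template <typename T>
    constexpr size_t lanes16(int x0, int xmax)
    {
        return static_cast<size_t>(xmax - x0) * sizeof(T) / 2;
    }
    // lanes16<float>(0, 8) == 16; lanes16<double>(0, 8) == 32.

This is why the double variant needs no separate assembly: a double row is simply twice as many 16-bit lanes as a float row with the same element count.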
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
new file mode 100644
index 0000000000..a44141c109
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_3VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 3 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "ptrue p1.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x10, %x[in]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "mov x25, %x[width]\n"
+ "cntb x24, ALL, MUL #3\n"
+ "add x23, x26, %x[in_stride]\n"
+ "add x21, x23, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "cmp x25, x24\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z21.b }, p1/Z, [x10]\n"
+ "ld1b { z20.b }, p1/Z, [x9]\n"
+ "sub x25, x25, x24\n"
+ "cmp x25, x24\n"
+ "ld1b { z17.b }, p1/Z, [x28]\n"
+ "ld1b { z16.b }, p1/Z, [x27]\n"
+ "zip1 z31.b, z21.b, z17.b\n"
+ "zip1 z22.b, z20.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
+ "ld1b { z18.b }, p1/Z, [x23]\n"
+ "zip2 z14.b, z21.b, z17.b\n"
+ "zip2 z13.b, z20.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x21]\n"
+ "ld1b { z16.b }, p1/Z, [x20]\n"
+ "zip1 z30.b, z19.b, z17.b\n"
+ "zip1 z29.b, z18.b, z16.b\n"
+ "ld1b { z21.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z12.b, z19.b, z17.b\n"
+ "zip2 z11.b, z18.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip1 z10.b, z21.b, z17.b\n"
+ "zip1 z9.b, z20.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip2 z8.b, z21.b, z17.b\n"
+ "zip2 z7.b, z20.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x21, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x20, #1, MUL VL]\n"
+ "zip1 z6.b, z19.b, z17.b\n"
+ "zip1 z5.b, z18.b, z16.b\n"
+ "ld1b { z28.b }, p1/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z27.b }, p1/Z, [x9, #2, MUL VL]\n"
+ "zip2 z4.b, z19.b, z17.b\n"
+ "zip2 z3.b, z18.b, z16.b\n"
+ "ld1b { z26.b }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z25.b }, p1/Z, [x27, #2, MUL VL]\n"
+ "zip1 z2.b, z28.b, z26.b\n"
+ "zip1 z1.b, z27.b, z25.b\n"
+ "ld1b { z24.b }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1b { z23.b }, p1/Z, [x23, #2, MUL VL]\n"
+ "zip1 z16.b, z31.b, z22.b\n"
+ "zip2 z22.b, z31.b, z22.b\n"
+ "ld1b { z21.b }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x20, #2, MUL VL]\n"
+ "zip1 z0.b, z24.b, z21.b\n"
+ "zip1 z31.b, z23.b, z20.b\n"
+ "zip1 z19.b, z14.b, z13.b\n"
+ "zip1 z18.b, z30.b, z29.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "addvl x10, x10, #3\n"
+ "zip2 z16.b, z30.b, z29.b\n"
+ "zip1 z17.b, z12.b, z11.b\n"
+ "st1b { z22.b }, p1, [x22, #1, MUL VL]\n"
+ "addvl x9, x9, #3\n"
+ "st1b { z19.b }, p1, [x22, #2, MUL VL]\n"
+ "zip2 z30.b, z28.b, z26.b\n"
+ "zip2 z29.b, z27.b, z25.b\n"
+ "addvl x28, x28, #3\n"
+ "st1b { z18.b }, p1, [x22, #3, MUL VL]\n"
+ "zip2 z28.b, z24.b, z21.b\n"
+ "zip2 z27.b, z23.b, z20.b\n"
+ "addvl x27, x27, #3\n"
+ "st1b { z16.b }, p1, [x22, #4, MUL VL]\n"
+ "zip2 z21.b, z14.b, z13.b\n"
+ "zip1 z16.b, z10.b, z9.b\n"
+ "addvl x26, x26, #3\n"
+ "st1b { z17.b }, p1, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z20.b, z10.b, z9.b\n"
+ "zip2 z19.b, z12.b, z11.b\n"
+ "zip1 z18.b, z6.b, z5.b\n"
+ "zip2 z17.b, z6.b, z5.b\n"
+ "st1b { z21.b }, p1, [x22]\n"
+ "addvl x23, x23, #3\n"
+ "st1b { z16.b }, p1, [x22, #1, MUL VL]\n"
+ "zip1 z16.b, z8.b, z7.b\n"
+ "zip2 z26.b, z8.b, z7.b\n"
+ "addvl x21, x21, #3\n"
+ "st1b { z20.b }, p1, [x22, #2, MUL VL]\n"
+ "zip1 z25.b, z2.b, z1.b\n"
+ "zip1 z24.b, z4.b, z3.b\n"
+ "addvl x20, x20, #3\n"
+ "st1b { z19.b }, p1, [x22, #3, MUL VL]\n"
+ "zip2 z23.b, z4.b, z3.b\n"
+ "zip1 z22.b, z0.b, z31.b\n"
+ "st1b { z18.b }, p1, [x22, #4, MUL VL]\n"
+ "zip2 z21.b, z2.b, z1.b\n"
+ "zip1 z20.b, z30.b, z29.b\n"
+ "st1b { z17.b }, p1, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z19.b, z30.b, z29.b\n"
+ "zip2 z18.b, z0.b, z31.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "zip1 z17.b, z28.b, z27.b\n"
+ "zip2 z16.b, z28.b, z27.b\n"
+ "st1b { z26.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z25.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z24.b }, p1, [x22, #3, MUL VL]\n"
+ "st1b { z23.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z22.b }, p1, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1b { z21.b }, p1, [x22]\n"
+ "st1b { z20.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z19.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z18.b }, p1, [x22, #3, MUL VL]\n"
+ "st1b { z17.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x25, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.b, XZR, x25\n"
+ "ld1b { z19.b }, p0/Z, [x10]\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "decw x25, ALL, MUL #3\n"
+ "ld1b { z17.b }, p0/Z, [x28]\n"
+ "ld1b { z16.b }, p0/Z, [x27]\n"
+ "zip1 z26.b, z19.b, z17.b\n"
+ "zip1 z25.b, z18.b, z16.b\n"
+ "ld1b { z21.b }, p0/Z, [x26]\n"
+ "ld1b { z20.b }, p0/Z, [x23]\n"
+ "zip2 z24.b, z19.b, z17.b\n"
+ "zip2 z19.b, z18.b, z16.b\n"
+ "ld1b { z18.b }, p0/Z, [x21]\n"
+ "ld1b { z16.b }, p0/Z, [x20]\n"
+ "zip1 z23.b, z21.b, z18.b\n"
+ "zip1 z17.b, z20.b, z16.b\n"
+ "zip2 z22.b, z21.b, z18.b\n"
+ "zip2 z16.b, z20.b, z16.b\n"
+ "cmp x25, #0x0\n"
+ "incd x10, ALL, MUL #6\n"
+ "incd x9, ALL, MUL #6\n"
+ "incd x28, ALL, MUL #6\n"
+ "zip1 z21.b, z26.b, z25.b\n"
+ "zip2 z20.b, z26.b, z25.b\n"
+ "incd x27, ALL, MUL #6\n"
+ "incd x26, ALL, MUL #6\n"
+ "zip1 z19.b, z24.b, z19.b\n"
+ "zip1 z18.b, z23.b, z17.b\n"
+ "incd x23, ALL, MUL #6\n"
+ "incd x21, ALL, MUL #6\n"
+ "zip2 z17.b, z23.b, z17.b\n"
+ "zip1 z16.b, z22.b, z16.b\n"
+ "incd x20, ALL, MUL #6\n"
+ "st1b { z21.b }, p1, [x22]\n"
+ "st1b { z20.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z19.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z18.b }, p1, [x22, #3, MUL VL]\n"
+ "st1b { z17.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x8\n"
+ "addvl %x[out], %x[out], #6\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x10, %x[in]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "mov x21, %x[width]\n"
+ "cntb x20, ALL, MUL #3\n"
+ "add x27, x28, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x27, %x[in_stride]\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "csel x28, x28, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1b { z21.b }, p1/Z, [x10]\n"
+ "ld1b { z20.b }, p1/Z, [x9]\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1b { z17.b }, p1/Z, [x28]\n"
+ "ld1b { z16.b }, p1/Z, [x27]\n"
+ "zip1 z31.b, z21.b, z17.b\n"
+ "zip1 z30.b, z20.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z29.b, z21.b, z17.b\n"
+ "zip2 z28.b, z20.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip1 z27.b, z19.b, z17.b\n"
+ "zip1 z26.b, z18.b, z16.b\n"
+ "ld1b { z22.b }, p1/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z21.b }, p1/Z, [x9, #2, MUL VL]\n"
+ "zip2 z25.b, z19.b, z17.b\n"
+ "zip2 z20.b, z18.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x27, #2, MUL VL]\n"
+ "zip1 z24.b, z22.b, z19.b\n"
+ "zip1 z23.b, z21.b, z18.b\n"
+ "zip1 z16.b, z31.b, z30.b\n"
+ "zip2 z17.b, z31.b, z30.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "addvl x10, x10, #3\n"
+ "zip1 z16.b, z29.b, z28.b\n"
+ "st1b { z17.b }, p1, [x22, #1, MUL VL]\n"
+ "zip2 z22.b, z22.b, z19.b\n"
+ "addvl x9, x9, #3\n"
+ "st1b { z16.b }, p1, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z21.b, z21.b, z18.b\n"
+ "zip2 z18.b, z29.b, z28.b\n"
+ "zip1 z16.b, z27.b, z26.b\n"
+ "zip2 z17.b, z27.b, z26.b\n"
+ "st1b { z18.b }, p1, [x22]\n"
+ "addvl x28, x28, #3\n"
+ "st1b { z16.b }, p1, [x22, #1, MUL VL]\n"
+ "zip1 z16.b, z25.b, z20.b\n"
+ "zip2 z20.b, z25.b, z20.b\n"
+ "addvl x27, x27, #3\n"
+ "st1b { z17.b }, p1, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z19.b, z24.b, z23.b\n"
+ "zip2 z18.b, z24.b, z23.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "zip1 z17.b, z22.b, z21.b\n"
+ "zip2 z16.b, z22.b, z21.b\n"
+ "st1b { z20.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z19.b }, p1, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1b { z18.b }, p1, [x22]\n"
+ "st1b { z17.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "whilelt p0.b, XZR, x21\n"
+ "ld1b { z19.b }, p0/Z, [x10]\n"
+ "ld1b { z21.b }, p0/Z, [x9]\n"
+ "decw x21, ALL, MUL #3\n"
+ "ld1b { z18.b }, p0/Z, [x28]\n"
+ "ld1b { z16.b }, p0/Z, [x27]\n"
+ "zip1 z20.b, z19.b, z18.b\n"
+ "zip1 z17.b, z21.b, z16.b\n"
+ "zip2 z19.b, z19.b, z18.b\n"
+ "zip2 z16.b, z21.b, z16.b\n"
+ "cmp x21, #0x0\n"
+ "incd x10, ALL, MUL #6\n"
+ "incd x9, ALL, MUL #6\n"
+ "incd x28, ALL, MUL #6\n"
+ "zip1 z18.b, z20.b, z17.b\n"
+ "zip2 z17.b, z20.b, z17.b\n"
+ "incd x27, ALL, MUL #6\n"
+ "zip1 z16.b, z19.b, z16.b\n"
+ "st1b { z18.b }, p1, [x22]\n"
+ "st1b { z17.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #3\n"
+ "bge 7b\n"
+ "12:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<3, 4, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<3, 4, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
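
The _1x4 variant always reads four source rows per interleave step; when height is not a multiple of 4, the csel GT/GE sequence in the tail loop redirects the missing row pointers at the zeroed pad_row buffer. A scalar sketch of that selection, under the same zero-padding assumption (select_rows is a hypothetical helper, not part of the patch):

    #include <cstdint>
    #include <cstddef>

    // Choose four row pointers for one step; rows beyond the real height
    // read zeros from pad_row, mirroring the csel GT/GE logic above.
    static void select_rows(const uint8_t *base, size_t in_stride,
                            size_t rows_left, const uint8_t *pad_row,
                            const uint8_t *row[4])
    {
        for (size_t r = 0; r < 4; r++) {
            row[r] = (r < rows_left) ? base + r * in_stride : pad_row;
        }
    }

Because the substituted rows contribute only zeros to the interleaved panel, the kernel can keep its load/store pattern fixed regardless of the tail height.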
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
new file mode 100644
index 0000000000..36a15a16b3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_3VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 3 * roundup<size_t>(height, 2) * get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "ptrue p2.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x12, %x[in]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "mov x27, %x[width]\n"
+ "cnth x26, ALL, MUL #3\n"
+ "add x25, x28, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp x27, x26\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z17.h }, p2/Z, [x12]\n"
+ "ld1h { z23.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1h { z16.h }, p2/Z, [x11]\n"
+ "ld1h { z20.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "zip1 z9.h, z17.h, z16.h\n"
+ "zip2 z8.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x10]\n"
+ "ld1h { z22.h }, p2/Z, [x10, #1, MUL VL]\n"
+ "zip1 z7.h, z23.h, z20.h\n"
+ "mov x20, x22\n"
+ "ld1h { z16.h }, p2/Z, [x9]\n"
+ "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip1 z6.h, z17.h, z16.h\n"
+ "zip2 z5.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x28]\n"
+ "ld1h { z17.h }, p2/Z, [x25]\n"
+ "zip1 z4.h, z22.h, z21.h\n"
+ "zip1 z3.h, z18.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "zip2 z2.h, z18.h, z17.h\n"
+ "zip2 z1.h, z23.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z17.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "zip1 z0.h, z19.h, z16.h\n"
+ "zip2 z31.h, z19.h, z16.h\n"
+ "ld1h { z20.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z30.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "zip2 z29.h, z22.h, z21.h\n"
+ "zip1 z28.h, z18.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z19.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "zip1 z27.h, z20.h, z16.h\n"
+ "zip2 z26.h, z18.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x24]\n"
+ "ld1h { z18.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip2 z25.h, z20.h, z16.h\n"
+ "zip1 z24.h, z30.h, z19.h\n"
+ "ld1h { z23.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x23]\n"
+ "zip1 z22.h, z17.h, z16.h\n"
+ "zip2 z21.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "st1h { z9.h }, p2, [x21]\n"
+ "zip1 z20.h, z18.h, z17.h\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
+ "sub x27, x27, x26\n"
+ "cmp x27, x26\n"
+ "zip2 z19.h, z30.h, z19.h\n"
+ "st1h { z7.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x12, x12, #3\n"
+ "addvl x11, x11, #3\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "st1h { z6.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x10, x10, #3\n"
+ "addvl x9, x9, #3\n"
+ "zip1 z17.h, z23.h, z16.h\n"
+ "st1h { z5.h }, p2, [x21, #4, MUL VL]\n"
+ "addvl x28, x28, #3\n"
+ "addvl x25, x25, #3\n"
+ "zip2 z16.h, z23.h, z16.h\n"
+ "st1h { z4.h }, p2, [x21, #5, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "addvl x23, x23, #3\n"
+ "st1h { z3.h }, p2, [x21, #6, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z2.h }, p2, [x21, #7, MUL VL]\n"
+ "addvl x21, x21, #12\n"
+ "st1h { z27.h }, p2, [x21, #-4, MUL VL]\n"
+ "st1h { z22.h }, p2, [x21, #-3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x21, #-2, MUL VL]\n"
+ "st1h { z20.h }, p2, [x21, #-1, MUL VL]\n"
+ "st1h { z1.h }, p2, [x20]\n"
+ "st1h { z0.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z29.h }, p2, [x20, #3, MUL VL]\n"
+ "st1h { z28.h }, p2, [x20, #4, MUL VL]\n"
+ "st1h { z26.h }, p2, [x20, #5, MUL VL]\n"
+ "st1h { z25.h }, p2, [x20, #6, MUL VL]\n"
+ "st1h { z24.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ "st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x27, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x27\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z0.h }, p1/Z, [x12]\n"
+ "ld1h { z16.h }, p1/Z, [x11]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z21.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z31.h }, p1/Z, [x10]\n"
+ "ld1h { z30.h }, p0/Z, [x10, #1, MUL VL]\n"
+ "mov x20, x22\n"
+ "decw x27, ALL, MUL #3\n"
+ "ld1h { z18.h }, p1/Z, [x9]\n"
+ "ld1h { z29.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "ld1h { z28.h }, p1/Z, [x28]\n"
+ "ld1h { z20.h }, p1/Z, [x25]\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "ld1h { z27.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "addvl x28, x28, #1\n"
+ "ld1h { z26.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #1\n"
+ "ld1h { z25.h }, p1/Z, [x24]\n"
+ "ld1h { z24.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #1\n"
+ "zip1 z17.h, z0.h, z16.h\n"
+ "ld1h { z23.h }, p1/Z, [x23]\n"
+ "ld1h { z22.h }, p0/Z, [x23, #1, MUL VL]\n"
+ "addvl x23, x23, #1\n"
+ "zip2 z16.h, z0.h, z16.h\n"
+ "zip1 z21.h, z21.h, z19.h\n"
+ "zip1 z19.h, z31.h, z18.h\n"
+ "st1h { z17.h }, p2, [x20]\n"
+ "cmp x27, #0x0\n"
+ "zip2 z18.h, z31.h, z18.h\n"
+ "zip1 z17.h, z30.h, z29.h\n"
+ "st1h { z16.h }, p2, [x20, #1, MUL VL]\n"
+ "incd x12, ALL, MUL #4\n"
+ "zip1 z16.h, z28.h, z20.h\n"
+ "zip2 z20.h, z28.h, z20.h\n"
+ "st1h { z21.h }, p2, [x20, #2, MUL VL]\n"
+ "incd x11, ALL, MUL #4\n"
+ "st1h { z19.h }, p2, [x20, #3, MUL VL]\n"
+ "incd x10, ALL, MUL #4\n"
+ "incd x9, ALL, MUL #4\n"
+ "zip1 z19.h, z27.h, z26.h\n"
+ "st1h { z18.h }, p2, [x20, #4, MUL VL]\n"
+ "incd x28, ALL, MUL #4\n"
+ "incd x25, ALL, MUL #4\n"
+ "zip1 z18.h, z25.h, z23.h\n"
+ "st1h { z17.h }, p2, [x20, #5, MUL VL]\n"
+ "incd x24, ALL, MUL #4\n"
+ "incd x23, ALL, MUL #4\n"
+ "zip2 z17.h, z25.h, z23.h\n"
+ "st1h { z16.h }, p2, [x20, #6, MUL VL]\n"
+ "zip1 z16.h, z24.h, z22.h\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z20.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ "st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x8\n"
+ "addvl %x[out], %x[out], #12\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x12, %x[in]\n"
+ "mov x21, %x[width]\n"
+ "cnth x20, ALL, MUL #3\n"
+ "add x11, x12, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x11, %x[in_stride]\n"
+ "csel x11, x11, %x[pad_row], GT\n"
+ "cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1h { z17.h }, p2/Z, [x12]\n"
+ "ld1h { z22.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1h { z16.h }, p2/Z, [x11]\n"
+ "ld1h { z21.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "zip1 z18.h, z17.h, z16.h\n"
+ "zip2 z17.h, z17.h, z16.h\n"
+ "ld1h { z20.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z19.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "zip1 z16.h, z22.h, z21.h\n"
+ "st1h { z18.h }, p2, [x22]\n"
+ "st1h { z17.h }, p2, [x22, #1, MUL VL]\n"
+ "addvl x12, x12, #3\n"
+ "addvl x11, x11, #3\n"
+ "zip2 z18.h, z22.h, z21.h\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z17.h, z20.h, z19.h\n"
+ "zip2 z16.h, z20.h, z19.h\n"
+ "st1h { z18.h }, p2, [x22]\n"
+ "st1h { z17.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z20.h }, p0/Z, [x12]\n"
+ "ld1h { z17.h }, p0/Z, [x11]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z19.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "decw x21, ALL, MUL #3\n"
+ "addvl x12, x12, #1\n"
+ "zip1 z18.h, z20.h, z17.h\n"
+ "zip2 z17.h, z20.h, z17.h\n"
+ "addvl x11, x11, #1\n"
+ "cmp x21, #0x0\n"
+ "zip1 z16.h, z19.h, z16.h\n"
+ "st1h { z18.h }, p2, [x22]\n"
+ "incd x12, ALL, MUL #4\n"
+ "incd x11, ALL, MUL #4\n"
+ "st1h { z17.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #3\n"
+ "bge 7b\n"
+ "12:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<3, 2, true, VLType::SVE>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
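
The 2x2 kernel pairs rows with zip1/zip2 so that consecutive 16-bit output lanes alternate between two source rows (zip1 producing the low half of each vector pair, zip2 the high half). A scalar model of that pairing, offered purely as a reference for what the stores above emit:

    #include <cstdint>
    #include <cstddef>

    // Scalar equivalent of a zip1+zip2 pair over two rows of n lanes:
    // output lane 2i comes from row 0, lane 2i+1 from row 1.
    static void zip_rows_u16(uint16_t *dst, const uint16_t *row0,
                             const uint16_t *row1, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            dst[2 * i + 0] = row0[i];
            dst[2 * i + 1] = row1[i];
        }
    }

For bfloat16 this places the two halves of each 2x2 block in adjacent lanes, which matches the pairwise operand layout that BFDOT-style instructions expect.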
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
new file mode 100644
index 0000000000..e661e2698a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 4 * height * get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "ptrue p4.b\n"
+ "blt 4f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p3.h, XZR, x20\n"
+ "ld1h { z31.h }, p3/Z, [x26]\n"
+ "ld1h { z30.h }, p3/Z, [x25]\n"
+ "dech x20\n"
+ "whilelt p2.h, XZR, x20\n"
+ "ld1h { z29.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "ld1h { z28.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z27.h }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1h { z26.h }, p1/Z, [x25, #2, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z25.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "ld1h { z24.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "mov x20, x22\n"
+ "dech x21, ALL, MUL #4\n"
+ "ld1h { z23.h }, p3/Z, [x24]\n"
+ "ld1h { z22.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z21.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z20.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "cmp x21, #0x0\n"
+ "addvl x26, x26, #4\n"
+ "ld1h { z19.h }, p3/Z, [x23]\n"
+ "ld1h { z18.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "addvl x24, x24, #4\n"
+ "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "st1h { z31.h }, p4, [x20]\n"
+ "addvl x23, x23, #4\n"
+ "st1h { z29.h }, p4, [x20, #1, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z27.h }, p4, [x20, #2, MUL VL]\n"
+ "st1h { z25.h }, p4, [x20, #3, MUL VL]\n"
+ "st1h { z30.h }, p4, [x20, #4, MUL VL]\n"
+ "st1h { z28.h }, p4, [x20, #5, MUL VL]\n"
+ "st1h { z26.h }, p4, [x20, #6, MUL VL]\n"
+ "st1h { z24.h }, p4, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1h { z23.h }, p4, [x20, #-8, MUL VL]\n"
+ "st1h { z22.h }, p4, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p4, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p4, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p4, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p4, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p4, [x20, #-1, MUL VL]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #16\n"
+ "bge 1b\n"
+ "cbz %x[height], 8f\n"
+ "4:" // Main loop skip
+ "5:" // Tail row loop: Head
+ "mov x26, %x[in]\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x21, %x[width]\n"
+ "6:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z19.h }, p0/Z, [x26]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z17.h }, p0/Z, [x26, #2, MUL VL]\n"
+ "dech x20\n"
+ "dech x21, ALL, MUL #4\n"
+ "whilelt p0.h, XZR, x20\n"
+ "cmp x21, #0x0\n"
+ "ld1h { z16.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "st1h { z19.h }, p4, [x22]\n"
+ "addvl x26, x26, #4\n"
+ "st1h { z18.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p4, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 6b\n"
+ "7:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #4\n"
+ "bge 5b\n"
+ "8:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<4, 1, true, VLType::SVE>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 1, true, VLType::SVE>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 1, true, VLType::SVE>(
+ double *out, const double *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(double) / 2,
+ stride * sizeof(double),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
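
As in the 3VL case, out_stride here is a fixed panel size in bytes: each of the height rows contributes 4 vector-lengths of data per column step. A sketch of that computation, assuming get_vector_length<uint8_t>() reports the SVE vector length in bytes (helper name is illustrative):

    #include <cstddef>

    // Byte stride between consecutive column panels of an n_vl x height
    // block; vl_bytes is the SVE vector length in bytes.
    static size_t panel_stride_bytes(size_t n_vl, size_t height,
                                     size_t vl_bytes)
    {
        return n_vl * height * vl_bytes; // n_vl == 4 for the kernel above
    }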
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
new file mode 100644
index 0000000000..03a78f72f1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_4VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "ptrue p1.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x10, %x[in]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "mov x25, %x[width]\n"
+ "cntb x24, ALL, MUL #2\n"
+ "add x23, x26, %x[in_stride]\n"
+ "add x21, x23, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "cmp x25, x24\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z21.b }, p1/Z, [x10]\n"
+ "ld1b { z20.b }, p1/Z, [x9]\n"
+ "sub x25, x25, x24\n"
+ "cmp x25, x24\n"
+ "ld1b { z17.b }, p1/Z, [x28]\n"
+ "ld1b { z16.b }, p1/Z, [x27]\n"
+ "zip1 z4.b, z21.b, z17.b\n"
+ "zip1 z3.b, z20.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
+ "ld1b { z18.b }, p1/Z, [x23]\n"
+ "zip2 z2.b, z21.b, z17.b\n"
+ "zip2 z1.b, z20.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x21]\n"
+ "ld1b { z16.b }, p1/Z, [x20]\n"
+ "zip1 z0.b, z19.b, z17.b\n"
+ "zip1 z31.b, z18.b, z16.b\n"
+ "ld1b { z24.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z30.b, z19.b, z17.b\n"
+ "zip2 z23.b, z18.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip1 z22.b, z24.b, z17.b\n"
+ "zip1 z21.b, z20.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip2 z29.b, z24.b, z17.b\n"
+ "zip2 z28.b, z20.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x21, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x20, #1, MUL VL]\n"
+ "zip1 z27.b, z19.b, z17.b\n"
+ "zip1 z26.b, z18.b, z16.b\n"
+ "zip2 z25.b, z19.b, z17.b\n"
+ "zip2 z24.b, z18.b, z16.b\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "zip1 z16.b, z4.b, z3.b\n"
+ "zip2 z17.b, z4.b, z3.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "addvl x28, x28, #2\n"
+ "zip1 z16.b, z2.b, z1.b\n"
+ "zip2 z20.b, z2.b, z1.b\n"
+ "st1b { z17.b }, p1, [x22, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "zip1 z19.b, z0.b, z31.b\n"
+ "zip2 z18.b, z0.b, z31.b\n"
+ "st1b { z16.b }, p1, [x22, #2, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "zip1 z17.b, z30.b, z23.b\n"
+ "zip2 z16.b, z30.b, z23.b\n"
+ "st1b { z20.b }, p1, [x22, #3, MUL VL]\n"
+ "addvl x23, x23, #2\n"
+ "st1b { z19.b }, p1, [x22, #4, MUL VL]\n"
+ "addvl x21, x21, #2\n"
+ "addvl x20, x20, #2\n"
+ "zip1 z23.b, z22.b, z21.b\n"
+ "st1b { z18.b }, p1, [x22, #5, MUL VL]\n"
+ "zip2 z22.b, z22.b, z21.b\n"
+ "zip1 z21.b, z29.b, z28.b\n"
+ "st1b { z17.b }, p1, [x22, #6, MUL VL]\n"
+ "zip2 z20.b, z29.b, z28.b\n"
+ "zip1 z19.b, z27.b, z26.b\n"
+ "st1b { z16.b }, p1, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z18.b, z27.b, z26.b\n"
+ "zip1 z17.b, z25.b, z24.b\n"
+ "zip2 z16.b, z25.b, z24.b\n"
+ "st1b { z23.b }, p1, [x22]\n"
+ "st1b { z22.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z21.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z20.b }, p1, [x22, #3, MUL VL]\n"
+ "st1b { z19.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z18.b }, p1, [x22, #5, MUL VL]\n"
+ "st1b { z17.b }, p1, [x22, #6, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x25, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.b, XZR, x25\n"
+ "ld1b { z19.b }, p0/Z, [x10]\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "decw x25, ALL, MUL #4\n"
+ "ld1b { z17.b }, p0/Z, [x28]\n"
+ "ld1b { z16.b }, p0/Z, [x27]\n"
+ "zip1 z27.b, z19.b, z17.b\n"
+ "zip1 z26.b, z18.b, z16.b\n"
+ "ld1b { z22.b }, p0/Z, [x26]\n"
+ "ld1b { z21.b }, p0/Z, [x23]\n"
+ "zip2 z25.b, z19.b, z17.b\n"
+ "zip2 z20.b, z18.b, z16.b\n"
+ "ld1b { z19.b }, p0/Z, [x21]\n"
+ "ld1b { z16.b }, p0/Z, [x20]\n"
+ "zip1 z18.b, z22.b, z19.b\n"
+ "zip1 z17.b, z21.b, z16.b\n"
+ "zip2 z24.b, z22.b, z19.b\n"
+ "zip2 z16.b, z21.b, z16.b\n"
+ "cmp x25, #0x0\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "addvl x28, x28, #1\n"
+ "zip1 z23.b, z27.b, z26.b\n"
+ "zip2 z22.b, z27.b, z26.b\n"
+ "addvl x27, x27, #1\n"
+ "addvl x26, x26, #1\n"
+ "zip1 z21.b, z25.b, z20.b\n"
+ "zip2 z20.b, z25.b, z20.b\n"
+ "addvl x23, x23, #1\n"
+ "addvl x21, x21, #1\n"
+ "zip1 z19.b, z18.b, z17.b\n"
+ "zip2 z18.b, z18.b, z17.b\n"
+ "addvl x20, x20, #1\n"
+ "zip1 z17.b, z24.b, z16.b\n"
+ "zip2 z16.b, z24.b, z16.b\n"
+ "st1b { z23.b }, p1, [x22]\n"
+ "st1b { z22.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z21.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z20.b }, p1, [x22, #3, MUL VL]\n"
+ "st1b { z19.b }, p1, [x22, #4, MUL VL]\n"
+ "st1b { z18.b }, p1, [x22, #5, MUL VL]\n"
+ "st1b { z17.b }, p1, [x22, #6, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x8\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x10, %x[in]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "mov x21, %x[width]\n"
+ "cntb x20, ALL, MUL #2\n"
+ "add x27, x28, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x27, %x[in_stride]\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "csel x28, x28, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1b { z21.b }, p1/Z, [x10]\n"
+ "ld1b { z19.b }, p1/Z, [x9]\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1b { z17.b }, p1/Z, [x28]\n"
+ "ld1b { z16.b }, p1/Z, [x27]\n"
+ "zip1 z26.b, z21.b, z17.b\n"
+ "zip1 z25.b, z19.b, z16.b\n"
+ "ld1b { z20.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z24.b, z21.b, z17.b\n"
+ "zip2 z19.b, z19.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip1 z23.b, z20.b, z17.b\n"
+ "zip1 z22.b, z18.b, z16.b\n"
+ "zip2 z21.b, z20.b, z17.b\n"
+ "zip2 z20.b, z18.b, z16.b\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "zip1 z16.b, z26.b, z25.b\n"
+ "zip2 z18.b, z26.b, z25.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "addvl x28, x28, #2\n"
+ "zip1 z17.b, z24.b, z19.b\n"
+ "zip2 z16.b, z24.b, z19.b\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "st1b { z17.b }, p1, [x22, #2, MUL VL]\n"
+ "zip1 z19.b, z23.b, z22.b\n"
+ "zip2 z18.b, z23.b, z22.b\n"
+ "st1b { z16.b }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z17.b, z21.b, z20.b\n"
+ "zip2 z16.b, z21.b, z20.b\n"
+ "st1b { z19.b }, p1, [x22]\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z17.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "whilelt p0.b, XZR, x21\n"
+ "ld1b { z20.b }, p0/Z, [x10]\n"
+ "ld1b { z21.b }, p0/Z, [x9]\n"
+ "decw x21, ALL, MUL #4\n"
+ "ld1b { z19.b }, p0/Z, [x28]\n"
+ "ld1b { z16.b }, p0/Z, [x27]\n"
+ "zip1 z18.b, z20.b, z19.b\n"
+ "zip1 z17.b, z21.b, z16.b\n"
+ "zip2 z20.b, z20.b, z19.b\n"
+ "zip2 z16.b, z21.b, z16.b\n"
+ "cmp x21, #0x0\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "addvl x28, x28, #1\n"
+ "zip1 z19.b, z18.b, z17.b\n"
+ "zip2 z18.b, z18.b, z17.b\n"
+ "addvl x27, x27, #1\n"
+ "zip1 z17.b, z20.b, z16.b\n"
+ "zip2 z16.b, z20.b, z16.b\n"
+ "st1b { z19.b }, p1, [x22]\n"
+ "st1b { z18.b }, p1, [x22, #1, MUL VL]\n"
+ "st1b { z17.b }, p1, [x22, #2, MUL VL]\n"
+ "st1b { z16.b }, p1, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #4\n"
+ "bge 7b\n"
+ "12:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<4, 4, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 4, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
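
Unlike the plain 4VL kernel, the blocked variants size their panels with roundup<size_t>(height, 4), so padded rows still occupy space in the output. A minimal sketch of the assumed round-up-to-multiple semantics:

    #include <cstddef>

    // Round v up to the next multiple of m (m > 0).
    static size_t round_up(size_t v, size_t m)
    {
        return ((v + m - 1) / m) * m;
    }
    // round_up(6, 4) == 8: a height-6 input is laid out as 8 interleaved
    // rows, two of them zero-filled from pad_row.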
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
new file mode 100644
index 0000000000..b196799cfe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 2) * get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "ptrue p2.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x12, %x[in]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "mov x27, %x[width]\n"
+ "cnth x26, ALL, MUL #4\n"
+ "add x25, x28, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp x27, x26\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z18.h }, p2/Z, [x12]\n"
+ "ld1h { z20.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1h { z17.h }, p2/Z, [x11]\n"
+ "ld1h { z16.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "zip1 z25.h, z18.h, z17.h\n"
+ "zip2 z24.h, z18.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x10]\n"
+ "ld1h { z18.h }, p2/Z, [x10, #1, MUL VL]\n"
+ "zip1 z23.h, z20.h, z16.h\n"
+ "zip2 z15.h, z20.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9]\n"
+ "ld1h { z16.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip1 z14.h, z19.h, z17.h\n"
+ "zip2 z13.h, z19.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z19.h }, p2/Z, [x12, #3, MUL VL]\n"
+ "zip1 z12.h, z18.h, z16.h\n"
+ "zip2 z11.h, z18.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x11, #3, MUL VL]\n"
+ "mov x20, x22\n"
+ "zip1 z10.h, z17.h, z16.h\n"
+ "ld1h { z21.h }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z20.h }, p2/Z, [x10, #3, MUL VL]\n"
+ "zip2 z9.h, z17.h, z16.h\n"
+ "zip1 z8.h, z19.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "zip2 z7.h, z19.h, z18.h\n"
+ "zip1 z6.h, z21.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x28]\n"
+ "ld1h { z18.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "zip2 z5.h, z21.h, z17.h\n"
+ "zip1 z4.h, z20.h, z16.h\n"
+ "ld1h { z22.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x28, #3, MUL VL]\n"
+ "zip2 z2.h, z20.h, z16.h\n"
+ "sub x27, x27, x26\n"
+ "ld1h { z17.h }, p2/Z, [x25]\n"
+ "ld1h { z16.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "zip1 z1.h, z19.h, z17.h\n"
+ "zip2 z0.h, z19.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z20.h }, p2/Z, [x25, #3, MUL VL]\n"
+ "zip1 z31.h, z18.h, z16.h\n"
+ "zip2 z30.h, z18.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x24]\n"
+ "ld1h { z19.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "cmp x27, x26\n"
+ "addvl x12, x12, #4\n"
+ "ld1h { z29.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z28.h }, p2/Z, [x24, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "addvl x10, x10, #4\n"
+ "ld1h { z16.h }, p2/Z, [x23]\n"
+ "ld1h { z18.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip1 z27.h, z17.h, z16.h\n"
+ "zip2 z26.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x23, #3, MUL VL]\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "zip1 z25.h, z19.h, z18.h\n"
+ "st1h { z24.h }, p2, [x21, #1, MUL VL]\n"
+ "zip2 z24.h, z19.h, z18.h\n"
+ "addvl x9, x9, #4\n"
+ "addvl x28, x28, #4\n"
+ "st1h { z23.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "addvl x24, x24, #4\n"
+ "zip1 z23.h, z22.h, z21.h\n"
+ "st1h { z15.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "zip2 z22.h, z22.h, z21.h\n"
+ "zip1 z21.h, z3.h, z20.h\n"
+ "st1h { z14.h }, p2, [x21, #4, MUL VL]\n"
+ "zip2 z20.h, z3.h, z20.h\n"
+ "zip1 z19.h, z29.h, z17.h\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z13.h }, p2, [x21, #5, MUL VL]\n"
+ "zip2 z18.h, z29.h, z17.h\n"
+ "zip1 z17.h, z28.h, z16.h\n"
+ "st1h { z12.h }, p2, [x21, #6, MUL VL]\n"
+ "zip2 z16.h, z28.h, z16.h\n"
+ "st1h { z11.h }, p2, [x21, #7, MUL VL]\n"
+ "addvl x21, x21, #16\n"
+ "st1h { z1.h }, p2, [x21, #-8, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #-7, MUL VL]\n"
+ "st1h { z31.h }, p2, [x21, #-6, MUL VL]\n"
+ "st1h { z30.h }, p2, [x21, #-5, MUL VL]\n"
+ "st1h { z27.h }, p2, [x21, #-4, MUL VL]\n"
+ "st1h { z26.h }, p2, [x21, #-3, MUL VL]\n"
+ "st1h { z25.h }, p2, [x21, #-2, MUL VL]\n"
+ "st1h { z24.h }, p2, [x21, #-1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x20]\n"
+ "st1h { z9.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z8.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z7.h }, p2, [x20, #3, MUL VL]\n"
+ "st1h { z6.h }, p2, [x20, #4, MUL VL]\n"
+ "st1h { z5.h }, p2, [x20, #5, MUL VL]\n"
+ "st1h { z4.h }, p2, [x20, #6, MUL VL]\n"
+ "st1h { z2.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1h { z23.h }, p2, [x20, #-8, MUL VL]\n"
+ "st1h { z22.h }, p2, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p2, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p2, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x27, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x27\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z22.h }, p1/Z, [x12]\n"
+ "ld1h { z21.h }, p1/Z, [x11]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z20.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x10]\n"
+ "ld1h { z24.h }, p0/Z, [x10, #1, MUL VL]\n"
+ "mov x20, x22\n"
+ "decw x27, ALL, MUL #4\n"
+ "ld1h { z17.h }, p1/Z, [x9]\n"
+ "ld1h { z16.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "zip1 z31.h, z22.h, z21.h\n"
+ "zip2 z23.h, z22.h, z21.h\n"
+ "ld1h { z30.h }, p1/Z, [x28]\n"
+ "ld1h { z29.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "zip1 z22.h, z20.h, z19.h\n"
+ "zip2 z28.h, z20.h, z19.h\n"
+ "ld1h { z21.h }, p1/Z, [x25]\n"
+ "ld1h { z27.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "zip1 z20.h, z18.h, z17.h\n"
+ "zip2 z19.h, z18.h, z17.h\n"
+ "ld1h { z18.h }, p1/Z, [x24]\n"
+ "ld1h { z26.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "zip1 z25.h, z24.h, z16.h\n"
+ "zip2 z24.h, z24.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x23]\n"
+ "ld1h { z16.h }, p0/Z, [x23, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x20]\n"
+ "cmp x27, #0x0\n"
+ "st1h { z23.h }, p2, [x20, #1, MUL VL]\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "zip1 z23.h, z30.h, z21.h\n"
+ "st1h { z22.h }, p2, [x20, #2, MUL VL]\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "zip2 z22.h, z30.h, z21.h\n"
+ "st1h { z28.h }, p2, [x20, #3, MUL VL]\n"
+ "addvl x28, x28, #2\n"
+ "addvl x25, x25, #2\n"
+ "zip1 z21.h, z29.h, z27.h\n"
+ "st1h { z20.h }, p2, [x20, #4, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "addvl x23, x23, #2\n"
+ "zip2 z20.h, z29.h, z27.h\n"
+ "st1h { z19.h }, p2, [x20, #5, MUL VL]\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z25.h }, p2, [x20, #6, MUL VL]\n"
+ "zip1 z17.h, z26.h, z16.h\n"
+ "zip2 z16.h, z26.h, z16.h\n"
+ "st1h { z24.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1h { z23.h }, p2, [x20, #-8, MUL VL]\n"
+ "st1h { z22.h }, p2, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p2, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p2, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x8\n"
+ "addvl %x[out], %x[out], #16\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x12, %x[in]\n"
+ "mov x21, %x[width]\n"
+ "cnth x20, ALL, MUL #4\n"
+ "add x11, x12, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x11, %x[in_stride]\n"
+ "csel x11, x11, %x[pad_row], GT\n"
+ "cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1h { z18.h }, p2/Z, [x12]\n"
+ "ld1h { z20.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1h { z17.h }, p2/Z, [x11]\n"
+ "ld1h { z16.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "zip1 z23.h, z18.h, z17.h\n"
+ "zip2 z19.h, z18.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x12, #3, MUL VL]\n"
+ "zip1 z21.h, z20.h, z16.h\n"
+ "zip2 z20.h, z20.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x11, #3, MUL VL]\n"
+ "st1h { z23.h }, p2, [x22]\n"
+ "addvl x12, x12, #4\n"
+ "st1h { z19.h }, p2, [x22, #1, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "st1h { z21.h }, p2, [x22, #2, MUL VL]\n"
+ "zip1 z17.h, z22.h, z16.h\n"
+ "zip2 z16.h, z22.h, z16.h\n"
+ "st1h { z20.h }, p2, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z19.h }, p2, [x22]\n"
+ "st1h { z18.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x12]\n"
+ "ld1h { z17.h }, p0/Z, [x11]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z20.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "decw x21, ALL, MUL #4\n"
+ "cmp x21, #0x0\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "addvl x12, x12, #2\n"
+ "addvl x11, x11, #2\n"
+ "zip1 z17.h, z20.h, z16.h\n"
+ "zip2 z16.h, z20.h, z16.h\n"
+ "st1h { z19.h }, p2, [x22]\n"
+ "st1h { z18.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #3, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #4\n"
+ "bge 7b\n"
+ "12:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<4, 2, true, VLType::SVE>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
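
The tail loop above handles an odd trailing row by substituting pad_row for the second source (the single csel GT) and then zipping as usual. A scalar reference for one such tail step, combining the padding and the pairing in element units (the assembly works in byte offsets; this helper is illustrative only, not part of the patch):

    #include <cstdint>
    #include <cstddef>
    #include <vector>

    // Interleave rows k and k+1 of a width-element strip; when only one
    // row remains, the partner row reads zeros, as with pad_row above.
    static void tail_pair_u16(uint16_t *dst, const uint16_t *in,
                              size_t in_stride_elems, size_t width,
                              size_t rows_left)
    {
        std::vector<uint16_t> pad(width, 0);
        const uint16_t *row0 = in;
        const uint16_t *row1 = (rows_left > 1) ? in + in_stride_elems
                                               : pad.data();
        for (size_t i = 0; i < width; i++) {
            dst[2 * i + 0] = row0[i];
            dst[2 * i + 1] = row1[i];
        }
    }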
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
new file mode 100644
index 0000000000..68fe2d0cbe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_6VL_1x8(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 8) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 6 * roundup<size_t>(height, 8) * get_vector_length<uint64_t>();
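+ // A minimal reading of the stride: each group of 8 interleaved byte rows
+ // emits 6 vectors (6 * VL bytes), so one column block spans
+ // 6 * VL * roundup(height, 8) / 8 bytes, which is what the expression
+ // above computes (get_vector_length<uint64_t>() == VL / 8).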
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "1:" // Main row loop: Head
+ "mov x10, %x[in]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x7\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x5\n"
+ "mov x22, %x[width]\n"
+ "cntb x21, ALL, MUL #3\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "csel x26, x26, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "csel x28, x28, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "cmp x22, x21\n"
+ "mov x20, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z21.b }, p1/Z, [x10]\n"
+ "ld1b { z25.b }, p1/Z, [x9]\n"
+ "sub x22, x22, x21\n"
+ "cmp x22, x21\n"
+ "ld1b { z20.b }, p1/Z, [x28]\n"
+ "ld1b { z24.b }, p1/Z, [x27]\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
+ "ld1b { z18.b }, p1/Z, [x25]\n"
+ "zip1 z7.b, z21.b, z19.b\n"
+ "zip1 z6.b, z25.b, z18.b\n"
+ "ld1b { z17.b }, p1/Z, [x24]\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "zip1 z28.b, z20.b, z17.b\n"
+ "zip1 z27.b, z24.b, z16.b\n"
+ "ld1b { z23.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z22.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z5.b, z21.b, z19.b\n"
+ "zip2 z4.b, z20.b, z17.b\n"
+ "ld1b { z21.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip2 z3.b, z25.b, z18.b\n"
+ "zip2 z2.b, z24.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z1.b, z23.b, z19.b\n"
+ "zip1 z15.b, z22.b, z18.b\n"
+ "ld1b { z17.b }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip1 z0.b, z21.b, z17.b\n"
+ "zip1 z31.b, z20.b, z16.b\n"
+ "ld1b { z26.b }, p1/Z, [x10, #2, MUL VL]\n"
+ "ld1b { z30.b }, p1/Z, [x9, #2, MUL VL]\n"
+ "zip2 z14.b, z23.b, z19.b\n"
+ "zip2 z13.b, z21.b, z17.b\n"
+ "ld1b { z25.b }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z24.b }, p1/Z, [x27, #2, MUL VL]\n"
+ "zip2 z12.b, z22.b, z18.b\n"
+ "zip2 z11.b, z20.b, z16.b\n"
+ "ld1b { z23.b }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1b { z22.b }, p1/Z, [x25, #2, MUL VL]\n"
+ "zip1 z10.b, z26.b, z23.b\n"
+ "zip1 z9.b, z30.b, z22.b\n"
+ "ld1b { z21.b }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1b { z17.b }, p1/Z, [x23, #2, MUL VL]\n"
+ "zip1 z29.b, z25.b, z21.b\n"
+ "zip1 z8.b, z24.b, z17.b\n"
+ "zip1 z19.b, z7.b, z28.b\n"
+ "zip1 z16.b, z6.b, z27.b\n"
+ "addvl x10, x10, #3\n"
+ "addvl x9, x9, #3\n"
+ "zip2 z28.b, z7.b, z28.b\n"
+ "zip2 z18.b, z6.b, z27.b\n"
+ "addvl x28, x28, #3\n"
+ "addvl x27, x27, #3\n"
+ "zip1 z27.b, z5.b, z4.b\n"
+ "zip1 z20.b, z3.b, z2.b\n"
+ "addvl x26, x26, #3\n"
+ "addvl x25, x25, #3\n"
+ "zip2 z7.b, z26.b, z23.b\n"
+ "zip2 z26.b, z25.b, z21.b\n"
+ "addvl x24, x24, #3\n"
+ "addvl x23, x23, #3\n"
+ "zip2 z6.b, z30.b, z22.b\n"
+ "zip2 z25.b, z24.b, z17.b\n"
+ "zip2 z5.b, z5.b, z4.b\n"
+ "zip2 z4.b, z3.b, z2.b\n"
+ "zip1 z3.b, z1.b, z0.b\n"
+ "zip1 z2.b, z15.b, z31.b\n"
+ "zip2 z1.b, z1.b, z0.b\n"
+ "zip2 z0.b, z15.b, z31.b\n"
+ "zip1 z31.b, z14.b, z13.b\n"
+ "zip1 z30.b, z12.b, z11.b\n"
+ "zip2 z24.b, z14.b, z13.b\n"
+ "zip2 z23.b, z12.b, z11.b\n"
+ "zip1 z22.b, z10.b, z29.b\n"
+ "zip1 z21.b, z9.b, z8.b\n"
+ "zip1 z17.b, z19.b, z16.b\n"
+ "zip2 z16.b, z19.b, z16.b\n"
+ "st1b { z17.b }, p1, [x20]\n"
+ "zip1 z19.b, z28.b, z18.b\n"
+ "zip2 z18.b, z28.b, z18.b\n"
+ "st1b { z16.b }, p1, [x20, #1, MUL VL]\n"
+ "zip1 z17.b, z27.b, z20.b\n"
+ "zip2 z16.b, z27.b, z20.b\n"
+ "st1b { z19.b }, p1, [x20, #2, MUL VL]\n"
+ "st1b { z18.b }, p1, [x20, #3, MUL VL]\n"
+ "zip2 z29.b, z10.b, z29.b\n"
+ "zip2 z20.b, z9.b, z8.b\n"
+ "st1b { z17.b }, p1, [x20, #4, MUL VL]\n"
+ "zip1 z28.b, z7.b, z26.b\n"
+ "zip1 z27.b, z6.b, z25.b\n"
+ "st1b { z16.b }, p1, [x20, #5, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip2 z26.b, z7.b, z26.b\n"
+ "zip2 z25.b, z6.b, z25.b\n"
+ "zip1 z17.b, z5.b, z4.b\n"
+ "zip2 z16.b, z5.b, z4.b\n"
+ "st1b { z17.b }, p1, [x20]\n"
+ "zip1 z18.b, z3.b, z2.b\n"
+ "zip2 z17.b, z3.b, z2.b\n"
+ "st1b { z16.b }, p1, [x20, #1, MUL VL]\n"
+ "zip1 z16.b, z1.b, z0.b\n"
+ "zip2 z19.b, z1.b, z0.b\n"
+ "st1b { z18.b }, p1, [x20, #2, MUL VL]\n"
+ "st1b { z17.b }, p1, [x20, #3, MUL VL]\n"
+ "zip1 z18.b, z31.b, z30.b\n"
+ "zip2 z17.b, z31.b, z30.b\n"
+ "st1b { z16.b }, p1, [x20, #4, MUL VL]\n"
+ "zip1 z16.b, z24.b, z23.b\n"
+ "zip2 z24.b, z24.b, z23.b\n"
+ "st1b { z19.b }, p1, [x20, #5, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip1 z23.b, z22.b, z21.b\n"
+ "zip2 z22.b, z22.b, z21.b\n"
+ "st1b { z18.b }, p1, [x20]\n"
+ "zip1 z21.b, z29.b, z20.b\n"
+ "zip2 z20.b, z29.b, z20.b\n"
+ "st1b { z17.b }, p1, [x20, #1, MUL VL]\n"
+ "zip1 z19.b, z28.b, z27.b\n"
+ "zip2 z18.b, z28.b, z27.b\n"
+ "st1b { z16.b }, p1, [x20, #2, MUL VL]\n"
+ "zip1 z17.b, z26.b, z25.b\n"
+ "zip2 z16.b, z26.b, z25.b\n"
+ "st1b { z24.b }, p1, [x20, #3, MUL VL]\n"
+ "st1b { z23.b }, p1, [x20, #4, MUL VL]\n"
+ "st1b { z22.b }, p1, [x20, #5, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "st1b { z21.b }, p1, [x20]\n"
+ "st1b { z20.b }, p1, [x20, #1, MUL VL]\n"
+ "st1b { z19.b }, p1, [x20, #2, MUL VL]\n"
+ "st1b { z18.b }, p1, [x20, #3, MUL VL]\n"
+ "st1b { z17.b }, p1, [x20, #4, MUL VL]\n"
+ "st1b { z16.b }, p1, [x20, #5, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x22, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.b, XZR, x22\n"
+ "ld1b { z23.b }, p0/Z, [x10]\n"
+ "ld1b { z27.b }, p0/Z, [x9]\n"
+ "decd x22, ALL, MUL #6\n"
+ "ld1b { z21.b }, p0/Z, [x28]\n"
+ "ld1b { z26.b }, p0/Z, [x27]\n"
+ "cmp x22, #0x0\n"
+ "incd x10, ALL, MUL #6\n"
+ "ld1b { z20.b }, p0/Z, [x26]\n"
+ "ld1b { z19.b }, p0/Z, [x25]\n"
+ "zip1 z25.b, z23.b, z20.b\n"
+ "zip1 z24.b, z27.b, z19.b\n"
+ "ld1b { z17.b }, p0/Z, [x24]\n"
+ "ld1b { z16.b }, p0/Z, [x23]\n"
+ "zip1 z22.b, z21.b, z17.b\n"
+ "zip1 z18.b, z26.b, z16.b\n"
+ "zip2 z23.b, z23.b, z20.b\n"
+ "zip2 z21.b, z21.b, z17.b\n"
+ "incd x9, ALL, MUL #6\n"
+ "incd x28, ALL, MUL #6\n"
+ "zip2 z20.b, z27.b, z19.b\n"
+ "zip2 z17.b, z26.b, z16.b\n"
+ "incd x27, ALL, MUL #6\n"
+ "incd x26, ALL, MUL #6\n"
+ "zip1 z19.b, z25.b, z22.b\n"
+ "zip1 z16.b, z24.b, z18.b\n"
+ "incd x25, ALL, MUL #6\n"
+ "incd x24, ALL, MUL #6\n"
+ "zip2 z22.b, z25.b, z22.b\n"
+ "zip2 z18.b, z24.b, z18.b\n"
+ "incd x23, ALL, MUL #6\n"
+ "zip1 z21.b, z23.b, z21.b\n"
+ "zip1 z20.b, z20.b, z17.b\n"
+ "zip1 z17.b, z19.b, z16.b\n"
+ "zip2 z16.b, z19.b, z16.b\n"
+ "st1b { z17.b }, p1, [x20]\n"
+ "zip1 z19.b, z22.b, z18.b\n"
+ "zip2 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p1, [x20, #1, MUL VL]\n"
+ "zip1 z17.b, z21.b, z20.b\n"
+ "zip2 z16.b, z21.b, z20.b\n"
+ "st1b { z19.b }, p1, [x20, #2, MUL VL]\n"
+ "st1b { z18.b }, p1, [x20, #3, MUL VL]\n"
+ "st1b { z17.b }, p1, [x20, #4, MUL VL]\n"
+ "st1b { z16.b }, p1, [x20, #5, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #6\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<6, 8, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_6VL_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
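+// The signed variant below simply reinterprets to the unsigned kernel:
+// the interleave is a pure byte shuffle, so the bit patterns of int8_t
+// values pass through unchanged.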
+template<>
+void Transform<6, 8, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_6VL_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
new file mode 100644
index 0000000000..910fc6cb02
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 6 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
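+ // Same stride pattern as the other blocked variants, with height padded
+ // to a multiple of 4 here: 6 vectors of halfwords (6 * VL bytes) per
+ // 4-row group gives 6 * VL * roundup(height, 4) / 4 bytes per column
+ // block (get_vector_length<uint32_t>() == VL / 4).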
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "ptrue p2.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x12, %x[in]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "mov x27, %x[width]\n"
+ "cnth x26, ALL, MUL #3\n"
+ "add x25, x28, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp x27, x26\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z18.h }, p2/Z, [x12]\n"
+ "ld1h { z13.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1h { z17.h }, p2/Z, [x11]\n"
+ "ld1h { z12.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "mov x20, x22\n"
+ "sub x27, x27, x26\n"
+ "ld1h { z16.h }, p2/Z, [x10]\n"
+ "ld1h { z11.h }, p2/Z, [x10, #1, MUL VL]\n"
+ "zip1 z23.h, z18.h, z16.h\n"
+ "zip2 z29.h, z18.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x9]\n"
+ "ld1h { z10.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip1 z22.h, z17.h, z16.h\n"
+ "zip2 z28.h, z17.h, z16.h\n"
+ "ld1h { z27.h }, p2/Z, [x28]\n"
+ "ld1h { z26.h }, p2/Z, [x25]\n"
+ "zip1 z21.h, z13.h, z11.h\n"
+ "zip1 z20.h, z12.h, z10.h\n"
+ "ld1h { z18.h }, p2/Z, [x24]\n"
+ "ld1h { z19.h }, p2/Z, [x23]\n"
+ "zip1 z17.h, z27.h, z18.h\n"
+ "zip1 z16.h, z26.h, z19.h\n"
+ "ld1h { z9.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z8.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "zip1 z25.h, z23.h, z22.h\n"
+ "zip2 z24.h, z23.h, z22.h\n"
+ "ld1h { z23.h }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z7.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "zip1 z22.h, z29.h, z28.h\n"
+ "zip2 z6.h, z29.h, z28.h\n"
+ "ld1h { z28.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z5.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "zip1 z4.h, z21.h, z20.h\n"
+ "zip2 z3.h, z21.h, z20.h\n"
+ "ld1h { z21.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z20.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip1 z2.h, z17.h, z16.h\n"
+ "zip2 z1.h, z17.h, z16.h\n"
+ "ld1h { z0.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z31.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "zip2 z18.h, z27.h, z18.h\n"
+ "zip2 z17.h, z26.h, z19.h\n"
+ "ld1h { z30.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z29.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z19.h, z28.h, z21.h\n"
+ "zip1 z16.h, z5.h, z20.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "zip2 z27.h, z13.h, z11.h\n"
+ "zip2 z26.h, z12.h, z10.h\n"
+ "cmp x27, x26\n"
+ "st1h { z24.h }, p2, [x21, #1, MUL VL]\n"
+ "zip1 z25.h, z9.h, z23.h\n"
+ "zip1 z24.h, z8.h, z7.h\n"
+ "addvl x12, x12, #3\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "zip2 z23.h, z9.h, z23.h\n"
+ "zip2 z22.h, z8.h, z7.h\n"
+ "addvl x11, x11, #3\n"
+ "st1h { z6.h }, p2, [x21, #3, MUL VL]\n"
+ "zip2 z28.h, z28.h, z21.h\n"
+ "zip2 z21.h, z5.h, z20.h\n"
+ "addvl x10, x10, #3\n"
+ "st1h { z4.h }, p2, [x21, #4, MUL VL]\n"
+ "zip1 z20.h, z18.h, z17.h\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "addvl x9, x9, #3\n"
+ "st1h { z3.h }, p2, [x21, #5, MUL VL]\n"
+ "zip1 z17.h, z19.h, z16.h\n"
+ "zip2 z16.h, z19.h, z16.h\n"
+ "addvl x28, x28, #3\n"
+ "st1h { z2.h }, p2, [x21, #6, MUL VL]\n"
+ "zip1 z19.h, z27.h, z26.h\n"
+ "zip2 z27.h, z27.h, z26.h\n"
+ "addvl x25, x25, #3\n"
+ "st1h { z1.h }, p2, [x21, #7, MUL VL]\n"
+ "addvl x21, x21, #12\n"
+ "zip1 z26.h, z25.h, z24.h\n"
+ "zip2 z25.h, z25.h, z24.h\n"
+ "st1h { z20.h }, p2, [x21, #-4, MUL VL]\n"
+ "zip1 z24.h, z23.h, z22.h\n"
+ "zip2 z23.h, z23.h, z22.h\n"
+ "addvl x24, x24, #3\n"
+ "st1h { z18.h }, p2, [x21, #-3, MUL VL]\n"
+ "zip1 z22.h, z28.h, z21.h\n"
+ "zip2 z21.h, z28.h, z21.h\n"
+ "addvl x23, x23, #3\n"
+ "st1h { z17.h }, p2, [x21, #-2, MUL VL]\n"
+ "zip1 z18.h, z0.h, z30.h\n"
+ "zip1 z17.h, z31.h, z29.h\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z16.h }, p2, [x21, #-1, MUL VL]\n"
+ "zip2 z20.h, z0.h, z30.h\n"
+ "zip2 z16.h, z31.h, z29.h\n"
+ "st1h { z19.h }, p2, [x20]\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "st1h { z27.h }, p2, [x20, #1, MUL VL]\n"
+ "zip1 z17.h, z20.h, z16.h\n"
+ "zip2 z16.h, z20.h, z16.h\n"
+ "st1h { z26.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z25.h }, p2, [x20, #3, MUL VL]\n"
+ "st1h { z24.h }, p2, [x20, #4, MUL VL]\n"
+ "st1h { z23.h }, p2, [x20, #5, MUL VL]\n"
+ "st1h { z22.h }, p2, [x20, #6, MUL VL]\n"
+ "st1h { z21.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ "st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x27, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x27\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z17.h }, p1/Z, [x12]\n"
+ "ld1h { z19.h }, p1/Z, [x11]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z22.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z21.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z16.h }, p1/Z, [x10]\n"
+ "ld1h { z20.h }, p0/Z, [x10, #1, MUL VL]\n"
+ "zip1 z25.h, z17.h, z16.h\n"
+ "zip2 z24.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x9]\n"
+ "ld1h { z17.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "zip1 z16.h, z19.h, z18.h\n"
+ "zip2 z19.h, z19.h, z18.h\n"
+ "ld1h { z0.h }, p1/Z, [x28]\n"
+ "ld1h { z31.h }, p1/Z, [x25]\n"
+ "zip1 z23.h, z22.h, z20.h\n"
+ "zip1 z22.h, z21.h, z17.h\n"
+ "ld1h { z30.h }, p1/Z, [x24]\n"
+ "ld1h { z29.h }, p1/Z, [x23]\n"
+ "zip1 z21.h, z0.h, z30.h\n"
+ "zip1 z18.h, z31.h, z29.h\n"
+ "ld1h { z28.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z27.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "mov x20, x22\n"
+ "decd x27, ALL, MUL #6\n"
+ "ld1h { z20.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z26.h }, p0/Z, [x23, #1, MUL VL]\n"
+ "addvl x12, x12, #1\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "zip1 z17.h, z25.h, z16.h\n"
+ "zip2 z16.h, z25.h, z16.h\n"
+ "addvl x28, x28, #1\n"
+ "addvl x25, x25, #1\n"
+ "zip1 z25.h, z24.h, z19.h\n"
+ "zip2 z19.h, z24.h, z19.h\n"
+ "addvl x24, x24, #1\n"
+ "addvl x23, x23, #1\n"
+ "zip1 z24.h, z23.h, z22.h\n"
+ "zip2 z23.h, z23.h, z22.h\n"
+ "zip1 z22.h, z21.h, z18.h\n"
+ "zip2 z21.h, z21.h, z18.h\n"
+ "st1h { z17.h }, p2, [x20]\n"
+ "cmp x27, #0x0\n"
+ "zip2 z18.h, z0.h, z30.h\n"
+ "zip2 z17.h, z31.h, z29.h\n"
+ "st1h { z16.h }, p2, [x20, #1, MUL VL]\n"
+ "incd x12, ALL, MUL #4\n"
+ "zip1 z20.h, z28.h, z20.h\n"
+ "zip1 z16.h, z27.h, z26.h\n"
+ "st1h { z25.h }, p2, [x20, #2, MUL VL]\n"
+ "incd x11, ALL, MUL #4\n"
+ "st1h { z19.h }, p2, [x20, #3, MUL VL]\n"
+ "incd x10, ALL, MUL #4\n"
+ "incd x9, ALL, MUL #4\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "st1h { z24.h }, p2, [x20, #4, MUL VL]\n"
+ "incd x28, ALL, MUL #4\n"
+ "incd x25, ALL, MUL #4\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "st1h { z23.h }, p2, [x20, #5, MUL VL]\n"
+ "incd x24, ALL, MUL #4\n"
+ "incd x23, ALL, MUL #4\n"
+ "zip1 z17.h, z20.h, z16.h\n"
+ "st1h { z22.h }, p2, [x20, #6, MUL VL]\n"
+ "zip2 z16.h, z20.h, z16.h\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z21.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ "st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x8\n"
+ "addvl %x[out], %x[out], #12\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x12, %x[in]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "mov x21, %x[width]\n"
+ "cnth x20, ALL, MUL #3\n"
+ "add x9, x10, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x9, %x[in_stride]\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "csel x10, x10, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x11, x11, %x[pad_row], GT\n"
+ "cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1h { z18.h }, p2/Z, [x12]\n"
+ "ld1h { z24.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1h { z17.h }, p2/Z, [x11]\n"
+ "ld1h { z23.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x10]\n"
+ "ld1h { z22.h }, p2/Z, [x10, #1, MUL VL]\n"
+ "zip1 z31.h, z18.h, z16.h\n"
+ "zip2 z30.h, z18.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x9]\n"
+ "ld1h { z20.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip1 z29.h, z17.h, z16.h\n"
+ "zip2 z28.h, z17.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "zip1 z27.h, z24.h, z22.h\n"
+ "zip1 z21.h, z23.h, z20.h\n"
+ "ld1h { z17.h }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "zip2 z26.h, z24.h, z22.h\n"
+ "zip2 z20.h, z23.h, z20.h\n"
+ "zip1 z25.h, z19.h, z17.h\n"
+ "zip1 z24.h, z18.h, z16.h\n"
+ "addvl x12, x12, #3\n"
+ "addvl x11, x11, #3\n"
+ "zip2 z23.h, z19.h, z17.h\n"
+ "zip2 z22.h, z18.h, z16.h\n"
+ "addvl x10, x10, #3\n"
+ "addvl x9, x9, #3\n"
+ "zip1 z17.h, z31.h, z29.h\n"
+ "zip2 z16.h, z31.h, z29.h\n"
+ "st1h { z17.h }, p2, [x22]\n"
+ "zip1 z19.h, z30.h, z28.h\n"
+ "zip2 z18.h, z30.h, z28.h\n"
+ "st1h { z16.h }, p2, [x22, #1, MUL VL]\n"
+ "zip1 z17.h, z27.h, z21.h\n"
+ "zip2 z16.h, z27.h, z21.h\n"
+ "st1h { z19.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #3, MUL VL]\n"
+ "zip1 z21.h, z26.h, z20.h\n"
+ "zip2 z20.h, z26.h, z20.h\n"
+ "st1h { z17.h }, p2, [x22, #4, MUL VL]\n"
+ "zip1 z19.h, z25.h, z24.h\n"
+ "zip2 z18.h, z25.h, z24.h\n"
+ "st1h { z16.h }, p2, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z17.h, z23.h, z22.h\n"
+ "zip2 z16.h, z23.h, z22.h\n"
+ "st1h { z21.h }, p2, [x22]\n"
+ "st1h { z20.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z19.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z22.h }, p1/Z, [x12]\n"
+ "ld1h { z25.h }, p1/Z, [x11]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z24.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z21.h }, p1/Z, [x10]\n"
+ "ld1h { z20.h }, p0/Z, [x10, #1, MUL VL]\n"
+ "decd x21, ALL, MUL #6\n"
+ "addvl x12, x12, #1\n"
+ "ld1h { z18.h }, p1/Z, [x9]\n"
+ "ld1h { z17.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "addvl x10, x10, #1\n"
+ "addvl x9, x9, #1\n"
+ "zip1 z19.h, z22.h, z21.h\n"
+ "zip1 z16.h, z25.h, z18.h\n"
+ "cmp x21, #0x0\n"
+ "zip2 z22.h, z22.h, z21.h\n"
+ "zip2 z18.h, z25.h, z18.h\n"
+ "incd x12, ALL, MUL #4\n"
+ "incd x11, ALL, MUL #4\n"
+ "zip1 z21.h, z24.h, z20.h\n"
+ "zip1 z20.h, z23.h, z17.h\n"
+ "incd x10, ALL, MUL #4\n"
+ "incd x9, ALL, MUL #4\n"
+ "zip1 z17.h, z19.h, z16.h\n"
+ "zip2 z16.h, z19.h, z16.h\n"
+ "st1h { z17.h }, p2, [x22]\n"
+ "zip1 z19.h, z22.h, z18.h\n"
+ "zip2 z18.h, z22.h, z18.h\n"
+ "st1h { z16.h }, p2, [x22, #1, MUL VL]\n"
+ "zip1 z17.h, z21.h, z20.h\n"
+ "zip2 z16.h, z21.h, z20.h\n"
+ "st1h { z19.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #6\n"
+ "bge 7b\n"
+ "12:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<6, 4, true, VLType::SVE>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_6VL_2x4(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..f0f10d2f43
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_6VL_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 6 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
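+ // This variant narrows as it stores: the `.inst` words in the assembly
+ // below are raw encodings of BFCVT/BFCVTNT (annotated inline), presumably
+ // emitted as literals so the file still assembles on toolchains without
+ // BF16 instruction support.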
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
+ "cnth x20, ALL, MUL #3\n"
+ "add x22, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x23, x20\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z17.s }, p3/Z, [x26]\n"
+ "ld1w { z18.s }, p3/Z, [x26, #1, MUL VL]\n"
+ "sub x23, x23, x20\n"
+ "cmp x23, x20\n"
+ "ld1w { z19.s }, p3/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x24]\n"
+ "zip1 z21.s, z17.s, z16.s\n"
+ "zip2 z20.s, z17.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x24, #2, MUL VL]\n"
+ "zip1 z29.s, z18.s, z17.s\n"
+ "zip2 z28.s, z18.s, z17.s\n"
+ "ld1w { z17.s }, p3/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x26, #4, MUL VL]\n"
+ "zip1 z27.s, z19.s, z16.s\n"
+ "zip2 z26.s, z19.s, z16.s\n"
+ "ld1w { z19.s }, p3/Z, [x26, #5, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x24, #3, MUL VL]\n"
+ "zip1 z25.s, z17.s, z16.s\n"
+ "zip2 z24.s, z17.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x24, #4, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x24, #5, MUL VL]\n"
+ "zip1 z12.s, z18.s, z17.s\n"
+ "zip2 z11.s, z18.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x25]\n"
+ "ld1w { z23.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "zip1 z10.s, z19.s, z16.s\n"
+ "zip2 z9.s, z19.s, z16.s\n"
+ "ld1w { z22.s }, p3/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z17.s }, p3/Z, [x22]\n"
+ ".inst 0x658aaea8 // bfcvt z8.h, p3/M, z21.s\n"
+ "zip1 z7.s, z18.s, z17.s\n"
+ "ld1w { z16.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x658aae86 // bfcvt z6.h, p3/M, z20.s\n"
+ "zip2 z5.s, z18.s, z17.s\n"
+ "ld1w { z20.s }, p3/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z19.s }, p3/Z, [x25, #4, MUL VL]\n"
+ ".inst 0x658aafa4 // bfcvt z4.h, p3/M, z29.s\n"
+ "zip1 z3.s, z23.s, z16.s\n"
+ "ld1w { z2.s }, p3/Z, [x25, #5, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x658aaf81 // bfcvt z1.h, p3/M, z28.s\n"
+ "zip2 z0.s, z23.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x658aaf7f // bfcvt z31.h, p3/M, z27.s\n"
+ "zip1 z30.s, z22.s, z21.s\n"
+ ".inst 0x658aaf5d // bfcvt z29.h, p3/M, z26.s\n"
+ "zip2 z28.s, z22.s, z21.s\n"
+ "addvl x26, x26, #6\n"
+ "addvl x25, x25, #6\n"
+ ".inst 0x658aaf3b // bfcvt z27.h, p3/M, z25.s\n"
+ "zip1 z26.s, z20.s, z18.s\n"
+ "addvl x24, x24, #6\n"
+ "addvl x22, x22, #6\n"
+ ".inst 0x658aaf19 // bfcvt z25.h, p3/M, z24.s\n"
+ "zip2 z24.s, z20.s, z18.s\n"
+ ".inst 0x658aad97 // bfcvt z23.h, p3/M, z12.s\n"
+ "zip1 z22.s, z19.s, z17.s\n"
+ ".inst 0x658aad75 // bfcvt z21.h, p3/M, z11.s\n"
+ "zip2 z20.s, z19.s, z17.s\n"
+ ".inst 0x658aad53 // bfcvt z19.h, p3/M, z10.s\n"
+ "zip1 z18.s, z2.s, z16.s\n"
+ ".inst 0x658aad31 // bfcvt z17.h, p3/M, z9.s\n"
+ "zip2 z16.s, z2.s, z16.s\n"
+ ".inst 0x648aace8 // bfcvtnt z8.h, p3/M, z7.s\n"
+ ".inst 0x648aaca6 // bfcvtnt z6.h, p3/M, z5.s\n"
+ "st1h { z8.h }, p3, [x21]\n"
+ ".inst 0x648aac64 // bfcvtnt z4.h, p3/M, z3.s\n"
+ ".inst 0x648aac01 // bfcvtnt z1.h, p3/M, z0.s\n"
+ "st1h { z6.h }, p3, [x21, #1, MUL VL]\n"
+ ".inst 0x648aafdf // bfcvtnt z31.h, p3/M, z30.s\n"
+ ".inst 0x648aaf9d // bfcvtnt z29.h, p3/M, z28.s\n"
+ "st1h { z4.h }, p3, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p3, [x21, #3, MUL VL]\n"
+ ".inst 0x648aaf5b // bfcvtnt z27.h, p3/M, z26.s\n"
+ ".inst 0x648aaf19 // bfcvtnt z25.h, p3/M, z24.s\n"
+ "st1h { z31.h }, p3, [x21, #4, MUL VL]\n"
+ ".inst 0x648aaed7 // bfcvtnt z23.h, p3/M, z22.s\n"
+ ".inst 0x648aae95 // bfcvtnt z21.h, p3/M, z20.s\n"
+ "st1h { z29.h }, p3, [x21, #5, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ ".inst 0x648aae53 // bfcvtnt z19.h, p3/M, z18.s\n"
+ ".inst 0x648aae11 // bfcvtnt z17.h, p3/M, z16.s\n"
+ "st1h { z27.h }, p3, [x21]\n"
+ "st1h { z25.h }, p3, [x21, #1, MUL VL]\n"
+ "st1h { z23.h }, p3, [x21, #2, MUL VL]\n"
+ "st1h { z21.h }, p3, [x21, #3, MUL VL]\n"
+ "st1h { z19.h }, p3, [x21, #4, MUL VL]\n"
+ "st1h { z17.h }, p3, [x21, #5, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x23, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x23\n"
+ "whilelt p2.s, XZR, x20\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "ld1w { z19.s }, p2/Z, [x24]\n"
+ "decw x20\n"
+ "whilelt p1.s, XZR, x20\n"
+ "ld1w { z18.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z25.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x25]\n"
+ "ld1w { z30.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z23.s, z20.s, z19.s\n"
+ "zip2 z22.s, z20.s, z19.s\n"
+ "ld1w { z29.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x22]\n"
+ "zip1 z20.s, z18.s, z17.s\n"
+ "zip2 z19.s, z18.s, z17.s\n"
+ "ld1w { z18.s }, p1/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z28.s }, p0/Z, [x22, #2, MUL VL]\n"
+ "zip1 z17.s, z25.s, z16.s\n"
+ "zip2 z16.s, z25.s, z16.s\n"
+ "decd x23, ALL, MUL #6\n"
+ ".inst 0x658aaefb // bfcvt z27.h, p3/M, z23.s\n"
+ "zip1 z26.s, z24.s, z21.s\n"
+ "cmp x23, #0x0\n"
+ ".inst 0x658aaed9 // bfcvt z25.h, p3/M, z22.s\n"
+ "zip2 z24.s, z24.s, z21.s\n"
+ "addvl x26, x26, #3\n"
+ "addvl x25, x25, #3\n"
+ ".inst 0x658aae97 // bfcvt z23.h, p3/M, z20.s\n"
+ "zip1 z22.s, z30.s, z18.s\n"
+ "addvl x24, x24, #3\n"
+ "addvl x22, x22, #3\n"
+ ".inst 0x658aae75 // bfcvt z21.h, p3/M, z19.s\n"
+ "zip2 z20.s, z30.s, z18.s\n"
+ ".inst 0x658aae33 // bfcvt z19.h, p3/M, z17.s\n"
+ "zip1 z18.s, z29.s, z28.s\n"
+ ".inst 0x658aae11 // bfcvt z17.h, p3/M, z16.s\n"
+ "zip2 z16.s, z29.s, z28.s\n"
+ ".inst 0x648aaf5b // bfcvtnt z27.h, p3/M, z26.s\n"
+ ".inst 0x648aaf19 // bfcvtnt z25.h, p3/M, z24.s\n"
+ "st1h { z27.h }, p3, [x21]\n"
+ ".inst 0x648aaed7 // bfcvtnt z23.h, p3/M, z22.s\n"
+ ".inst 0x648aae95 // bfcvtnt z21.h, p3/M, z20.s\n"
+ "st1h { z25.h }, p3, [x21, #1, MUL VL]\n"
+ ".inst 0x648aae53 // bfcvtnt z19.h, p3/M, z18.s\n"
+ ".inst 0x648aae11 // bfcvtnt z17.h, p3/M, z16.s\n"
+ "st1h { z23.h }, p3, [x21, #2, MUL VL]\n"
+ "st1h { z21.h }, p3, [x21, #3, MUL VL]\n"
+ "st1h { z19.h }, p3, [x21, #4, MUL VL]\n"
+ "st1h { z17.h }, p3, [x21, #5, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #6\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
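+
+// Unlike the integer wrappers, this one passes the typed pointers straight
+// through: the kernel itself performs the fp32 -> bf16 conversion, so the
+// width argument is a plain fp32 element count with no byte scaling.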
+template<>
+void Transform<6, 4, true, VLType::SVE>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_6VL_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
new file mode 100644
index 0000000000..c638eaacde
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_6VL_4x2(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint32_t *pad_row = reinterpret_cast<uint32_t *>(alloca(width * sizeof(uint32_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint32_t));
+ }
+
+ size_t out_stride = 6 * roundup<size_t>(height, 2) * get_vector_length<uint16_t>();
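+ // Here rows are interleaved in pairs (height rounded up to 2); each pair
+ // emits 6 vectors of 32-bit words, i.e. 6 * VL bytes, giving
+ // 6 * VL * roundup(height, 2) / 2 bytes per column block
+ // (get_vector_length<uint16_t>() == VL / 2).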
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "ptrue p3.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[width]\n"
+ "cntw x26, ALL, MUL #6\n"
+ "add x25, x28, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp x27, x26\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z18.s }, p3/Z, [x28]\n"
+ "ld1w { z17.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1w { z19.s }, p3/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x25]\n"
+ "zip1 z9.s, z18.s, z16.s\n"
+ "zip2 z8.s, z18.s, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x25, #2, MUL VL]\n"
+ "zip1 z7.s, z17.s, z16.s\n"
+ "zip2 z6.s, z17.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x24]\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ "zip1 z5.s, z19.s, z18.s\n"
+ "zip2 z4.s, z19.s, z18.s\n"
+ "ld1w { z18.s }, p3/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x28, #4, MUL VL]\n"
+ "zip1 z3.s, z17.s, z16.s\n"
+ "zip2 z2.s, z17.s, z16.s\n"
+ "ld1w { z20.s }, p3/Z, [x28, #5, MUL VL]\n"
+ "ld1w { z17.s }, p3/Z, [x25, #3, MUL VL]\n"
+ "mov x20, x22\n"
+ "zip1 z1.s, z18.s, z17.s\n"
+ "ld1w { z19.s }, p3/Z, [x25, #4, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x25, #5, MUL VL]\n"
+ "zip2 z0.s, z18.s, z17.s\n"
+ "zip1 z31.s, z21.s, z19.s\n"
+ "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z17.s }, p3/Z, [x24, #2, MUL VL]\n"
+ "zip2 z30.s, z21.s, z19.s\n"
+ "zip1 z29.s, z20.s, z16.s\n"
+ "ld1w { z19.s }, p3/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z28.s }, p3/Z, [x24, #4, MUL VL]\n"
+ "zip2 z27.s, z20.s, z16.s\n"
+ "sub x27, x27, x26\n"
+ "ld1w { z26.s }, p3/Z, [x24, #5, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip1 z25.s, z18.s, z16.s\n"
+ "zip2 z24.s, z18.s, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x23, #3, MUL VL]\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "zip2 z22.s, z17.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x23, #4, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23, #5, MUL VL]\n"
+ "st1w { z9.s }, p3, [x21]\n"
+ "zip1 z21.s, z19.s, z18.s\n"
+ "st1w { z8.s }, p3, [x21, #1, MUL VL]\n"
+ "zip2 z20.s, z19.s, z18.s\n"
+ "cmp x27, x26\n"
+ "addvl x28, x28, #6\n"
+ "st1w { z7.s }, p3, [x21, #2, MUL VL]\n"
+ "addvl x25, x25, #6\n"
+ "addvl x24, x24, #6\n"
+ "zip1 z19.s, z28.s, z17.s\n"
+ "st1w { z6.s }, p3, [x21, #3, MUL VL]\n"
+ "addvl x23, x23, #6\n"
+ "zip2 z18.s, z28.s, z17.s\n"
+ "zip1 z17.s, z26.s, z16.s\n"
+ "st1w { z5.s }, p3, [x21, #4, MUL VL]\n"
+ "zip2 z16.s, z26.s, z16.s\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1w { z4.s }, p3, [x21, #5, MUL VL]\n"
+ "st1w { z3.s }, p3, [x21, #6, MUL VL]\n"
+ "st1w { z2.s }, p3, [x21, #7, MUL VL]\n"
+ "addvl x21, x21, #12\n"
+ "st1w { z25.s }, p3, [x21, #-4, MUL VL]\n"
+ "st1w { z24.s }, p3, [x21, #-3, MUL VL]\n"
+ "st1w { z23.s }, p3, [x21, #-2, MUL VL]\n"
+ "st1w { z22.s }, p3, [x21, #-1, MUL VL]\n"
+ "st1w { z1.s }, p3, [x20]\n"
+ "st1w { z0.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z31.s }, p3, [x20, #2, MUL VL]\n"
+ "st1w { z30.s }, p3, [x20, #3, MUL VL]\n"
+ "st1w { z29.s }, p3, [x20, #4, MUL VL]\n"
+ "st1w { z27.s }, p3, [x20, #5, MUL VL]\n"
+ "st1w { z21.s }, p3, [x20, #6, MUL VL]\n"
+ "st1w { z20.s }, p3, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ "st1w { z19.s }, p3, [x20, #-4, MUL VL]\n"
+ "st1w { z18.s }, p3, [x20, #-3, MUL VL]\n"
+ "st1w { z17.s }, p3, [x20, #-2, MUL VL]\n"
+ "st1w { z16.s }, p3, [x20, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x27, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x27\n"
+ "whilelt p2.s, XZR, x20\n"
+ "ld1w { z19.s }, p2/Z, [x28]\n"
+ "ld1w { z18.s }, p2/Z, [x25]\n"
+ "decw x20\n"
+ "whilelt p1.s, XZR, x20\n"
+ "ld1w { z17.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z22.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z21.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x24]\n"
+ "ld1w { z27.s }, p2/Z, [x23]\n"
+ "mov x20, x22\n"
+ "decd x27, ALL, MUL #6\n"
+ "ld1w { z26.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z25.s }, p0/Z, [x24, #2, MUL VL]\n"
+ "zip1 z20.s, z19.s, z18.s\n"
+ "zip2 z19.s, z19.s, z18.s\n"
+ "ld1w { z24.s }, p1/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x23, #2, MUL VL]\n"
+ "zip1 z18.s, z17.s, z16.s\n"
+ "zip2 z17.s, z17.s, z16.s\n"
+ "zip1 z16.s, z22.s, z21.s\n"
+ "zip2 z22.s, z22.s, z21.s\n"
+ "st1w { z20.s }, p3, [x20]\n"
+ "cmp x27, #0x0\n"
+ "zip1 z21.s, z28.s, z27.s\n"
+ "zip2 z20.s, z28.s, z27.s\n"
+ "st1w { z19.s }, p3, [x20, #1, MUL VL]\n"
+ "addvl x28, x28, #3\n"
+ "st1w { z18.s }, p3, [x20, #2, MUL VL]\n"
+ "addvl x25, x25, #3\n"
+ "addvl x24, x24, #3\n"
+ "zip1 z19.s, z26.s, z24.s\n"
+ "st1w { z17.s }, p3, [x20, #3, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "zip2 z18.s, z26.s, z24.s\n"
+ "zip1 z17.s, z25.s, z23.s\n"
+ "st1w { z16.s }, p3, [x20, #4, MUL VL]\n"
+ "zip2 z16.s, z25.s, z23.s\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1w { z22.s }, p3, [x20, #5, MUL VL]\n"
+ "st1w { z21.s }, p3, [x20, #6, MUL VL]\n"
+ "st1w { z20.s }, p3, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ "st1w { z19.s }, p3, [x20, #-4, MUL VL]\n"
+ "st1w { z18.s }, p3, [x20, #-3, MUL VL]\n"
+ "st1w { z17.s }, p3, [x20, #-2, MUL VL]\n"
+ "st1w { z16.s }, p3, [x20, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #12\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x21, %x[width]\n"
+ "cntw x20, ALL, MUL #6\n"
+ "add x25, x28, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1w { z17.s }, p3/Z, [x28]\n"
+ "ld1w { z19.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1w { z18.s }, p3/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x25]\n"
+ "zip1 z28.s, z17.s, z16.s\n"
+ "zip2 z20.s, z17.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x25, #2, MUL VL]\n"
+ "zip1 z27.s, z19.s, z17.s\n"
+ "zip2 z26.s, z19.s, z17.s\n"
+ "ld1w { z19.s }, p3/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z25.s }, p3/Z, [x28, #4, MUL VL]\n"
+ "zip1 z24.s, z18.s, z16.s\n"
+ "zip2 z23.s, z18.s, z16.s\n"
+ "ld1w { z22.s }, p3/Z, [x28, #5, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x25, #3, MUL VL]\n"
+ "addvl x28, x28, #6\n"
+ "zip1 z21.s, z19.s, z18.s\n"
+ "ld1w { z17.s }, p3/Z, [x25, #4, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x25, #5, MUL VL]\n"
+ "st1w { z28.s }, p3, [x22]\n"
+ "addvl x25, x25, #6\n"
+ "st1w { z20.s }, p3, [x22, #1, MUL VL]\n"
+ "zip2 z20.s, z19.s, z18.s\n"
+ "zip1 z19.s, z25.s, z17.s\n"
+ "st1w { z27.s }, p3, [x22, #2, MUL VL]\n"
+ "zip2 z18.s, z25.s, z17.s\n"
+ "zip1 z17.s, z22.s, z16.s\n"
+ "st1w { z26.s }, p3, [x22, #3, MUL VL]\n"
+ "zip2 z16.s, z22.s, z16.s\n"
+ "st1w { z24.s }, p3, [x22, #4, MUL VL]\n"
+ "st1w { z23.s }, p3, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1w { z21.s }, p3, [x22]\n"
+ "st1w { z20.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z19.s }, p3, [x22, #2, MUL VL]\n"
+ "st1w { z18.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z17.s }, p3, [x22, #4, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z20.s }, p0/Z, [x28]\n"
+ "ld1w { z19.s }, p0/Z, [x25]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z18.s }, p0/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z17.s }, p0/Z, [x25, #1, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z22.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "decd x21, ALL, MUL #6\n"
+ "cmp x21, #0x0\n"
+ "zip1 z21.s, z20.s, z19.s\n"
+ "zip2 z20.s, z20.s, z19.s\n"
+ "addvl x28, x28, #3\n"
+ "addvl x25, x25, #3\n"
+ "zip1 z19.s, z18.s, z17.s\n"
+ "zip2 z18.s, z18.s, z17.s\n"
+ "zip1 z17.s, z22.s, z16.s\n"
+ "zip2 z16.s, z22.s, z16.s\n"
+ "st1w { z21.s }, p3, [x22]\n"
+ "st1w { z20.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z19.s }, p3, [x22, #2, MUL VL]\n"
+ "st1w { z18.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z17.s }, p3, [x22, #4, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22, #5, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #6\n"
+ "bge 7b\n"
+ "12:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<6, 2, true, VLType::SVE>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_6VL_4x2(
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
new file mode 100644
index 0000000000..0526bd0596
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_8VL(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 8 * height * get_vector_length<uint8_t>();
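+ // No pad_row here: this kernel is a plain transpose (block size 1), so an
+ // odd height just falls through to the single-row tail loop. Each row
+ // contributes 8 vectors of words (8 * VL bytes) per column block, matching
+ // 8 * height * get_vector_length<uint8_t>().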
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x2\n"
+ "ptrue p1.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "mov x25, %x[width]\n"
+ "cntw x24, ALL, MUL #16\n"
+ "add x23, x26, %x[in_stride]\n"
+ "cmp x25, x24\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z15.s }, p1/Z, [x26]\n"
+ "ld1w { z14.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1w { z13.s }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z12.s }, p1/Z, [x26, #3, MUL VL]\n"
+ "mov x20, x22\n"
+ "sub x25, x25, x24\n"
+ "ld1w { z11.s }, p1/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x26, #5, MUL VL]\n"
+ "cmp x25, x24\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1w { z9.s }, p1/Z, [x26, #6, MUL VL]\n"
+ "ld1w { z8.s }, p1/Z, [x26, #7, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ld1w { z7.s }, p1/Z, [x23]\n"
+ "ld1w { z6.s }, p1/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z5.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z4.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z3.s }, p1/Z, [x23, #4, MUL VL]\n"
+ "ld1w { z2.s }, p1/Z, [x23, #5, MUL VL]\n"
+ "ld1w { z1.s }, p1/Z, [x23, #6, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x23, #7, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "ld1w { z31.s }, p1/Z, [x26, #-8, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x26, #-7, MUL VL]\n"
+ "ld1w { z29.s }, p1/Z, [x26, #-6, MUL VL]\n"
+ "ld1w { z28.s }, p1/Z, [x26, #-5, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x26, #-4, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x26, #-3, MUL VL]\n"
+ "ld1w { z25.s }, p1/Z, [x26, #-2, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x26, #-1, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x23, #-8, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x23, #-7, MUL VL]\n"
+ "ld1w { z21.s }, p1/Z, [x23, #-6, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x23, #-5, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #-4, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x23, #-3, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x23, #-2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x23, #-1, MUL VL]\n"
+ "st1w { z15.s }, p1, [x21]\n"
+ "st1w { z14.s }, p1, [x21, #1, MUL VL]\n"
+ "st1w { z13.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z12.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z11.s }, p1, [x21, #4, MUL VL]\n"
+ "st1w { z10.s }, p1, [x21, #5, MUL VL]\n"
+ "st1w { z9.s }, p1, [x21, #6, MUL VL]\n"
+ "st1w { z8.s }, p1, [x21, #7, MUL VL]\n"
+ "addvl x21, x21, #16\n"
+ "st1w { z7.s }, p1, [x21, #-8, MUL VL]\n"
+ "st1w { z6.s }, p1, [x21, #-7, MUL VL]\n"
+ "st1w { z5.s }, p1, [x21, #-6, MUL VL]\n"
+ "st1w { z4.s }, p1, [x21, #-5, MUL VL]\n"
+ "st1w { z3.s }, p1, [x21, #-4, MUL VL]\n"
+ "st1w { z2.s }, p1, [x21, #-3, MUL VL]\n"
+ "st1w { z1.s }, p1, [x21, #-2, MUL VL]\n"
+ "st1w { z0.s }, p1, [x21, #-1, MUL VL]\n"
+ "st1w { z31.s }, p1, [x20]\n"
+ "st1w { z30.s }, p1, [x20, #1, MUL VL]\n"
+ "st1w { z29.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z28.s }, p1, [x20, #3, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #4, MUL VL]\n"
+ "st1w { z26.s }, p1, [x20, #5, MUL VL]\n"
+ "st1w { z25.s }, p1, [x20, #6, MUL VL]\n"
+ "st1w { z24.s }, p1, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1w { z23.s }, p1, [x20, #-8, MUL VL]\n"
+ "st1w { z22.s }, p1, [x20, #-7, MUL VL]\n"
+ "st1w { z21.s }, p1, [x20, #-6, MUL VL]\n"
+ "st1w { z20.s }, p1, [x20, #-5, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #-4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x20, #-3, MUL VL]\n"
+ "st1w { z17.s }, p1, [x20, #-2, MUL VL]\n"
+ "st1w { z16.s }, p1, [x20, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x25, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x25\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z31.s }, p0/Z, [x26]\n"
+ "ld1w { z30.s }, p0/Z, [x23]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z29.s }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z28.s }, p0/Z, [x23, #1, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z27.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z26.s }, p0/Z, [x23, #2, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z25.s }, p0/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z24.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z23.s }, p0/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z22.s }, p0/Z, [x23, #4, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z21.s }, p0/Z, [x26, #5, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [x23, #5, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z19.s }, p0/Z, [x26, #6, MUL VL]\n"
+ "ld1w { z18.s }, p0/Z, [x23, #6, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z17.s }, p0/Z, [x26, #7, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x23, #7, MUL VL]\n"
+ "mov x20, x22\n"
+ "decw x25, ALL, MUL #8\n"
+ "st1w { z31.s }, p1, [x20]\n"
+ "st1w { z29.s }, p1, [x20, #1, MUL VL]\n"
+ "cmp x25, #0x0\n"
+ "addvl x26, x26, #8\n"
+ "st1w { z27.s }, p1, [x20, #2, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1w { z25.s }, p1, [x20, #3, MUL VL]\n"
+ "st1w { z23.s }, p1, [x20, #4, MUL VL]\n"
+ "st1w { z21.s }, p1, [x20, #5, MUL VL]\n"
+ "st1w { z19.s }, p1, [x20, #6, MUL VL]\n"
+ "st1w { z17.s }, p1, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1w { z30.s }, p1, [x20, #-8, MUL VL]\n"
+ "st1w { z28.s }, p1, [x20, #-7, MUL VL]\n"
+ "st1w { z26.s }, p1, [x20, #-6, MUL VL]\n"
+ "st1w { z24.s }, p1, [x20, #-5, MUL VL]\n"
+ "st1w { z22.s }, p1, [x20, #-4, MUL VL]\n"
+ "st1w { z20.s }, p1, [x20, #-3, MUL VL]\n"
+ "st1w { z18.s }, p1, [x20, #-2, MUL VL]\n"
+ "st1w { z16.s }, p1, [x20, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x2\n"
+ "addvl %x[out], %x[out], #16\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x21, %x[width]\n"
+ "cntw x20, ALL, MUL #16\n"
+ "mov x26, %x[in]\n"
+ "cmp x21, x20\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1w { z31.s }, p1/Z, [x26]\n"
+ "ld1w { z30.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1w { z29.s }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z28.s }, p1/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x26, #5, MUL VL]\n"
+ "ld1w { z25.s }, p1/Z, [x26, #6, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x26, #7, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ld1w { z23.s }, p1/Z, [x26, #-8, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x26, #-7, MUL VL]\n"
+ "ld1w { z21.s }, p1/Z, [x26, #-6, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x26, #-5, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x26, #-4, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x26, #-3, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x26, #-2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x26, #-1, MUL VL]\n"
+ "st1w { z31.s }, p1, [x22]\n"
+ "st1w { z30.s }, p1, [x22, #1, MUL VL]\n"
+ "st1w { z29.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z28.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z27.s }, p1, [x22, #4, MUL VL]\n"
+ "st1w { z26.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z25.s }, p1, [x22, #6, MUL VL]\n"
+ "st1w { z24.s }, p1, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1w { z23.s }, p1, [x22]\n"
+ "st1w { z22.s }, p1, [x22, #1, MUL VL]\n"
+ "st1w { z21.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z17.s }, p1, [x22, #6, MUL VL]\n"
+ "st1w { z16.s }, p1, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z23.s }, p0/Z, [x26]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z22.s }, p0/Z, [x26, #1, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z21.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z19.s }, p0/Z, [x26, #4, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z18.s }, p0/Z, [x26, #5, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z17.s }, p0/Z, [x26, #6, MUL VL]\n"
+ "decw x20\n"
+ "decw x21, ALL, MUL #8\n"
+ "whilelt p0.s, XZR, x20\n"
+ "cmp x21, #0x0\n"
+ "ld1w { z16.s }, p0/Z, [x26, #7, MUL VL]\n"
+ "st1w { z23.s }, p1, [x22]\n"
+ "addvl x26, x26, #8\n"
+ "st1w { z22.s }, p1, [x22, #1, MUL VL]\n"
+ "st1w { z21.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z17.s }, p1, [x22, #6, MUL VL]\n"
+ "st1w { z16.s }, p1, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 7b\n"
+ "12:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
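+// Generic transform interface for the raw kernel above: the (x0..xmax) column
+// range is converted to a width in 32-bit elements and the row stride to
+// bytes before dispatching on reinterpreted uint32_t pointers.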
+template<>
+void Transform<8, 1, true, VLType::SVE>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL(
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
new file mode 100644
index 0000000000..98f0770d77
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
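+// Transposes 8VL-wide blocks from groups of 4 input rows of bytes: two stages
+// of zip1/zip2 leave each set of 4 vertically adjacent bytes contiguous in the
+// output. When height is not a multiple of 4, the csel sequence substitutes
+// the zeroed pad_row for the missing row pointers.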
+void sve_transpose_interleave_8VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
+ "cntb x20, ALL, MUL #8\n"
+ "add x22, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x23, x20\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z7.b }, p2/Z, [x26]\n"
+ "ld1b { z24.b }, p2/Z, [x26, #1, MUL VL]\n"
+ "sub x23, x23, x20\n"
+ "cmp x23, x20\n"
+ "ld1b { z31.b }, p2/Z, [x25]\n"
+ "ld1b { z18.b }, p2/Z, [x25, #1, MUL VL]\n"
+ "ld1b { z19.b }, p2/Z, [x24]\n"
+ "ld1b { z25.b }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip1 z23.b, z7.b, z19.b\n"
+ "zip2 z20.b, z7.b, z19.b\n"
+ "ld1b { z30.b }, p2/Z, [x22]\n"
+ "ld1b { z3.b }, p2/Z, [x22, #1, MUL VL]\n"
+ "zip1 z21.b, z31.b, z30.b\n"
+ "zip2 z19.b, z31.b, z30.b\n"
+ "ld1b { z16.b }, p2/Z, [x26, #2, MUL VL]\n"
+ "ld1b { z30.b }, p2/Z, [x26, #3, MUL VL]\n"
+ "zip1 z2.b, z24.b, z25.b\n"
+ "zip1 z17.b, z18.b, z3.b\n"
+ "ld1b { z29.b }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x25, #3, MUL VL]\n"
+ "zip2 z22.b, z24.b, z25.b\n"
+ "zip2 z4.b, z18.b, z3.b\n"
+ "ld1b { z0.b }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1b { z3.b }, p2/Z, [x24, #3, MUL VL]\n"
+ "zip1 z9.b, z16.b, z0.b\n"
+ "zip2 z14.b, z16.b, z0.b\n"
+ "ld1b { z18.b }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x22, #3, MUL VL]\n"
+ "zip1 z24.b, z29.b, z18.b\n"
+ "zip2 z11.b, z29.b, z18.b\n"
+ "ld1b { z1.b }, p2/Z, [x26, #4, MUL VL]\n"
+ "ld1b { z12.b }, p2/Z, [x26, #5, MUL VL]\n"
+ "zip1 z13.b, z30.b, z3.b\n"
+ "zip1 z15.b, z8.b, z16.b\n"
+ "ld1b { z5.b }, p2/Z, [x25, #4, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x25, #5, MUL VL]\n"
+ "zip2 z31.b, z30.b, z3.b\n"
+ "zip2 z30.b, z8.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x24, #4, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [x24, #5, MUL VL]\n"
+ "zip1 z27.b, z1.b, z16.b\n"
+ "zip2 z10.b, z1.b, z16.b\n"
+ "ld1b { z7.b }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x22, #5, MUL VL]\n"
+ "zip1 z8.b, z5.b, z7.b\n"
+ "zip2 z26.b, z5.b, z7.b\n"
+ "ld1b { z3.b }, p2/Z, [x26, #6, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [x26, #7, MUL VL]\n"
+ "zip1 z6.b, z12.b, z18.b\n"
+ "zip1 z5.b, z29.b, z16.b\n"
+ "ld1b { z0.b }, p2/Z, [x25, #6, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [x25, #7, MUL VL]\n"
+ "zip2 z12.b, z12.b, z18.b\n"
+ "zip2 z7.b, z29.b, z16.b\n"
+ "ld1b { z1.b }, p2/Z, [x24, #6, MUL VL]\n"
+ "ld1b { z29.b }, p2/Z, [x24, #7, MUL VL]\n"
+ "zip1 z16.b, z23.b, z21.b\n"
+ "zip2 z18.b, z23.b, z21.b\n"
+ "ld1b { z23.b }, p2/Z, [x22, #6, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [x22, #7, MUL VL]\n"
+ "st1b { z16.b }, p2, [x21]\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "zip2 z20.b, z20.b, z19.b\n"
+ "zip1 z19.b, z2.b, z17.b\n"
+ "st1b { z18.b }, p2, [x21, #1, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ "zip2 z18.b, z2.b, z17.b\n"
+ "zip1 z17.b, z22.b, z4.b\n"
+ "st1b { z16.b }, p2, [x21, #2, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "zip2 z16.b, z22.b, z4.b\n"
+ "st1b { z20.b }, p2, [x21, #3, MUL VL]\n"
+ "zip1 z4.b, z3.b, z1.b\n"
+ "addvl x24, x24, #8\n"
+ "st1b { z19.b }, p2, [x21, #4, MUL VL]\n"
+ "zip1 z22.b, z0.b, z23.b\n"
+ "zip2 z3.b, z3.b, z1.b\n"
+ "addvl x22, x22, #8\n"
+ "st1b { z18.b }, p2, [x21, #5, MUL VL]\n"
+ "zip2 z2.b, z0.b, z23.b\n"
+ "zip1 z1.b, z25.b, z29.b\n"
+ "st1b { z17.b }, p2, [x21, #6, MUL VL]\n"
+ "zip1 z0.b, z28.b, z21.b\n"
+ "zip2 z29.b, z25.b, z29.b\n"
+ "st1b { z16.b }, p2, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 z28.b, z28.b, z21.b\n"
+ "zip1 z17.b, z9.b, z24.b\n"
+ "zip2 z16.b, z9.b, z24.b\n"
+ "zip1 z19.b, z14.b, z11.b\n"
+ "st1b { z17.b }, p2, [x21]\n"
+ "zip2 z18.b, z14.b, z11.b\n"
+ "zip1 z17.b, z13.b, z15.b\n"
+ "st1b { z16.b }, p2, [x21, #1, MUL VL]\n"
+ "zip2 z16.b, z13.b, z15.b\n"
+ "zip1 z21.b, z31.b, z30.b\n"
+ "st1b { z19.b }, p2, [x21, #2, MUL VL]\n"
+ "zip2 z20.b, z31.b, z30.b\n"
+ "st1b { z18.b }, p2, [x21, #3, MUL VL]\n"
+ "zip1 z19.b, z27.b, z8.b\n"
+ "st1b { z17.b }, p2, [x21, #4, MUL VL]\n"
+ "zip2 z18.b, z27.b, z8.b\n"
+ "zip1 z17.b, z10.b, z26.b\n"
+ "st1b { z16.b }, p2, [x21, #5, MUL VL]\n"
+ "zip2 z16.b, z10.b, z26.b\n"
+ "zip1 z27.b, z6.b, z5.b\n"
+ "st1b { z21.b }, p2, [x21, #6, MUL VL]\n"
+ "zip2 z26.b, z6.b, z5.b\n"
+ "zip1 z25.b, z12.b, z7.b\n"
+ "st1b { z20.b }, p2, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "zip2 z24.b, z12.b, z7.b\n"
+ "zip1 z23.b, z4.b, z22.b\n"
+ "st1b { z19.b }, p2, [x21]\n"
+ "zip2 z22.b, z4.b, z22.b\n"
+ "zip1 z21.b, z3.b, z2.b\n"
+ "st1b { z18.b }, p2, [x21, #1, MUL VL]\n"
+ "zip2 z20.b, z3.b, z2.b\n"
+ "zip1 z19.b, z1.b, z0.b\n"
+ "st1b { z17.b }, p2, [x21, #2, MUL VL]\n"
+ "zip2 z18.b, z1.b, z0.b\n"
+ "zip1 z17.b, z29.b, z28.b\n"
+ "st1b { z16.b }, p2, [x21, #3, MUL VL]\n"
+ "zip2 z16.b, z29.b, z28.b\n"
+ "st1b { z27.b }, p2, [x21, #4, MUL VL]\n"
+ "st1b { z26.b }, p2, [x21, #5, MUL VL]\n"
+ "st1b { z25.b }, p2, [x21, #6, MUL VL]\n"
+ "st1b { z24.b }, p2, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "st1b { z23.b }, p2, [x21]\n"
+ "st1b { z22.b }, p2, [x21, #1, MUL VL]\n"
+ "st1b { z21.b }, p2, [x21, #2, MUL VL]\n"
+ "st1b { z20.b }, p2, [x21, #3, MUL VL]\n"
+ "st1b { z19.b }, p2, [x21, #4, MUL VL]\n"
+ "st1b { z18.b }, p2, [x21, #5, MUL VL]\n"
+ "st1b { z17.b }, p2, [x21, #6, MUL VL]\n"
+ "st1b { z16.b }, p2, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x23, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x23\n"
+ "whilelt p1.b, XZR, x20\n"
+ "ld1b { z23.b }, p1/Z, [x26]\n"
+ "ld1b { z22.b }, p1/Z, [x25]\n"
+ "decb x20\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z21.b }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z25.b }, p0/Z, [x25, #1, MUL VL]\n"
+ "ld1b { z19.b }, p1/Z, [x24]\n"
+ "ld1b { z20.b }, p0/Z, [x24, #1, MUL VL]\n"
+ "decw x23, ALL, MUL #8\n"
+ "zip1 z24.b, z23.b, z19.b\n"
+ "ld1b { z18.b }, p1/Z, [x22]\n"
+ "ld1b { z16.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "zip1 z17.b, z22.b, z18.b\n"
+ "zip2 z23.b, z23.b, z19.b\n"
+ "zip2 z19.b, z22.b, z18.b\n"
+ "zip1 z22.b, z21.b, z20.b\n"
+ "cmp x23, #0x0\n"
+ "addvl x26, x26, #2\n"
+ "zip1 z18.b, z25.b, z16.b\n"
+ "zip2 z21.b, z21.b, z20.b\n"
+ "addvl x25, x25, #2\n"
+ "addvl x24, x24, #2\n"
+ "zip2 z20.b, z25.b, z16.b\n"
+ "addvl x22, x22, #2\n"
+ "zip1 z16.b, z24.b, z17.b\n"
+ "st1b { z16.b }, p2, [x21]\n"
+ "zip2 z16.b, z24.b, z17.b\n"
+ "zip1 z17.b, z23.b, z19.b\n"
+ "st1b { z16.b }, p2, [x21, #1, MUL VL]\n"
+ "zip2 z16.b, z23.b, z19.b\n"
+ "zip1 z19.b, z22.b, z18.b\n"
+ "st1b { z17.b }, p2, [x21, #2, MUL VL]\n"
+ "zip2 z18.b, z22.b, z18.b\n"
+ "zip1 z17.b, z21.b, z20.b\n"
+ "st1b { z16.b }, p2, [x21, #3, MUL VL]\n"
+ "zip2 z16.b, z21.b, z20.b\n"
+ "st1b { z19.b }, p2, [x21, #4, MUL VL]\n"
+ "st1b { z18.b }, p2, [x21, #5, MUL VL]\n"
+ "st1b { z17.b }, p2, [x21, #6, MUL VL]\n"
+ "st1b { z16.b }, p2, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
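+// The unsigned and signed 8-bit specialisations dispatch to the same
+// byte-level kernel; the interleave pattern does not depend on signedness.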
+template<>
+void Transform<8, 4, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 4, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
new file mode 100644
index 0000000000..3fa5292143
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
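+// 8-way row interleave at byte granularity: three zip stages merge 8 input
+// rows into 8VL-wide output blocks, with pad_row substituted for rows beyond
+// 'height' so partial groups store zeros instead of reading out of bounds.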
+void sve_transpose_interleave_8VL_1x8(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 8) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 8) * get_vector_length<uint64_t>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "1:" // Main row loop: Head
+ "mov x10, %x[in]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x7\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x5\n"
+ "mov x22, %x[width]\n"
+ "cntb x21, ALL, MUL #2\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "csel x26, x26, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "csel x28, x28, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "cmp x22, x21\n"
+ "mov x20, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z23.b }, p1/Z, [x10]\n"
+ "ld1b { z22.b }, p1/Z, [x9]\n"
+ "sub x22, x22, x21\n"
+ "cmp x22, x21\n"
+ "ld1b { z20.b }, p1/Z, [x28]\n"
+ "ld1b { z21.b }, p1/Z, [x27]\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
+ "ld1b { z18.b }, p1/Z, [x25]\n"
+ "zip1 z5.b, z23.b, z19.b\n"
+ "zip1 z4.b, z22.b, z18.b\n"
+ "ld1b { z17.b }, p1/Z, [x24]\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "zip1 z3.b, z20.b, z17.b\n"
+ "zip1 z31.b, z21.b, z16.b\n"
+ "ld1b { z25.b }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1b { z24.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "zip2 z2.b, z23.b, z19.b\n"
+ "zip2 z30.b, z20.b, z17.b\n"
+ "ld1b { z23.b }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "zip2 z22.b, z22.b, z18.b\n"
+ "zip2 z21.b, z21.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z29.b, z25.b, z19.b\n"
+ "zip1 z28.b, z24.b, z18.b\n"
+ "ld1b { z17.b }, p1/Z, [x24, #1, MUL VL]\n"
+ "ld1b { z16.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip1 z27.b, z23.b, z17.b\n"
+ "zip1 z26.b, z20.b, z16.b\n"
+ "zip2 z1.b, z25.b, z19.b\n"
+ "zip2 z25.b, z23.b, z17.b\n"
+ "addvl x10, x10, #2\n"
+ "addvl x9, x9, #2\n"
+ "zip2 z24.b, z24.b, z18.b\n"
+ "zip2 z16.b, z20.b, z16.b\n"
+ "addvl x28, x28, #2\n"
+ "addvl x27, x27, #2\n"
+ "zip1 z0.b, z5.b, z3.b\n"
+ "zip1 z17.b, z4.b, z31.b\n"
+ "addvl x26, x26, #2\n"
+ "addvl x25, x25, #2\n"
+ "zip2 z20.b, z5.b, z3.b\n"
+ "zip2 z19.b, z4.b, z31.b\n"
+ "addvl x24, x24, #2\n"
+ "addvl x23, x23, #2\n"
+ "zip1 z31.b, z2.b, z30.b\n"
+ "zip1 z18.b, z22.b, z21.b\n"
+ "zip2 z30.b, z2.b, z30.b\n"
+ "zip2 z23.b, z22.b, z21.b\n"
+ "zip1 z22.b, z29.b, z27.b\n"
+ "zip1 z21.b, z28.b, z26.b\n"
+ "zip2 z29.b, z29.b, z27.b\n"
+ "zip2 z28.b, z28.b, z26.b\n"
+ "zip1 z27.b, z1.b, z25.b\n"
+ "zip1 z26.b, z24.b, z16.b\n"
+ "zip2 z25.b, z1.b, z25.b\n"
+ "zip2 z24.b, z24.b, z16.b\n"
+ "zip1 z16.b, z0.b, z17.b\n"
+ "zip2 z17.b, z0.b, z17.b\n"
+ "st1b { z16.b }, p1, [x20]\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "zip2 z20.b, z20.b, z19.b\n"
+ "st1b { z17.b }, p1, [x20, #1, MUL VL]\n"
+ "zip1 z19.b, z31.b, z18.b\n"
+ "zip2 z18.b, z31.b, z18.b\n"
+ "st1b { z16.b }, p1, [x20, #2, MUL VL]\n"
+ "zip1 z17.b, z30.b, z23.b\n"
+ "zip2 z16.b, z30.b, z23.b\n"
+ "st1b { z20.b }, p1, [x20, #3, MUL VL]\n"
+ "st1b { z19.b }, p1, [x20, #4, MUL VL]\n"
+ "zip1 z23.b, z22.b, z21.b\n"
+ "zip2 z22.b, z22.b, z21.b\n"
+ "st1b { z18.b }, p1, [x20, #5, MUL VL]\n"
+ "zip1 z21.b, z29.b, z28.b\n"
+ "zip2 z20.b, z29.b, z28.b\n"
+ "st1b { z17.b }, p1, [x20, #6, MUL VL]\n"
+ "zip1 z19.b, z27.b, z26.b\n"
+ "zip2 z18.b, z27.b, z26.b\n"
+ "st1b { z16.b }, p1, [x20, #7, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "zip1 z17.b, z25.b, z24.b\n"
+ "zip2 z16.b, z25.b, z24.b\n"
+ "st1b { z23.b }, p1, [x20]\n"
+ "st1b { z22.b }, p1, [x20, #1, MUL VL]\n"
+ "st1b { z21.b }, p1, [x20, #2, MUL VL]\n"
+ "st1b { z20.b }, p1, [x20, #3, MUL VL]\n"
+ "st1b { z19.b }, p1, [x20, #4, MUL VL]\n"
+ "st1b { z18.b }, p1, [x20, #5, MUL VL]\n"
+ "st1b { z17.b }, p1, [x20, #6, MUL VL]\n"
+ "st1b { z16.b }, p1, [x20, #7, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x22, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.b, XZR, x22\n"
+ "ld1b { z25.b }, p0/Z, [x10]\n"
+ "ld1b { z27.b }, p0/Z, [x9]\n"
+ "decd x22, ALL, MUL #8\n"
+ "ld1b { z26.b }, p0/Z, [x28]\n"
+ "ld1b { z24.b }, p0/Z, [x27]\n"
+ "cmp x22, #0x0\n"
+ "addvl x10, x10, #1\n"
+ "ld1b { z22.b }, p0/Z, [x26]\n"
+ "ld1b { z21.b }, p0/Z, [x25]\n"
+ "zip1 z20.b, z25.b, z22.b\n"
+ "zip1 z23.b, z27.b, z21.b\n"
+ "ld1b { z17.b }, p0/Z, [x24]\n"
+ "ld1b { z16.b }, p0/Z, [x23]\n"
+ "zip1 z19.b, z26.b, z17.b\n"
+ "zip1 z18.b, z24.b, z16.b\n"
+ "zip2 z25.b, z25.b, z22.b\n"
+ "zip2 z22.b, z26.b, z17.b\n"
+ "addvl x9, x9, #1\n"
+ "addvl x28, x28, #1\n"
+ "zip2 z21.b, z27.b, z21.b\n"
+ "zip2 z16.b, z24.b, z16.b\n"
+ "addvl x27, x27, #1\n"
+ "addvl x26, x26, #1\n"
+ "zip1 z24.b, z20.b, z19.b\n"
+ "zip1 z17.b, z23.b, z18.b\n"
+ "addvl x25, x25, #1\n"
+ "addvl x24, x24, #1\n"
+ "zip2 z20.b, z20.b, z19.b\n"
+ "zip2 z19.b, z23.b, z18.b\n"
+ "addvl x23, x23, #1\n"
+ "zip1 z23.b, z25.b, z22.b\n"
+ "zip1 z18.b, z21.b, z16.b\n"
+ "zip2 z22.b, z25.b, z22.b\n"
+ "zip2 z21.b, z21.b, z16.b\n"
+ "zip1 z16.b, z24.b, z17.b\n"
+ "zip2 z17.b, z24.b, z17.b\n"
+ "st1b { z16.b }, p1, [x20]\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "zip2 z20.b, z20.b, z19.b\n"
+ "st1b { z17.b }, p1, [x20, #1, MUL VL]\n"
+ "zip1 z19.b, z23.b, z18.b\n"
+ "zip2 z18.b, z23.b, z18.b\n"
+ "st1b { z16.b }, p1, [x20, #2, MUL VL]\n"
+ "zip1 z17.b, z22.b, z21.b\n"
+ "zip2 z16.b, z22.b, z21.b\n"
+ "st1b { z20.b }, p1, [x20, #3, MUL VL]\n"
+ "st1b { z19.b }, p1, [x20, #4, MUL VL]\n"
+ "st1b { z18.b }, p1, [x20, #5, MUL VL]\n"
+ "st1b { z17.b }, p1, [x20, #6, MUL VL]\n"
+ "st1b { z16.b }, p1, [x20, #7, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
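+// As with the 1x4 variant, both 8-bit signednesses reuse one byte-level kernel.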
+template<>
+void Transform<8, 8, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 8, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
new file mode 100644
index 0000000000..02977ecf1e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
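+// 2x2 variant for 16-bit data: a single zip stage pairs each element with the
+// matching element of the row below, so the output holds element pairs drawn
+// from vertically adjacent rows.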
+void sve_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 2) * get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "ptrue p4.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[width]\n"
+ "cnth x26, ALL, MUL #8\n"
+ "add x25, x28, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp x27, x26\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z30.h }, p4/Z, [x28]\n"
+ "ld1h { z12.h }, p4/Z, [x28, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1h { z31.h }, p4/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z18.h }, p4/Z, [x28, #3, MUL VL]\n"
+ "mov x20, x22\n"
+ "sub x27, x27, x26\n"
+ "ld1h { z20.h }, p4/Z, [x25]\n"
+ "ld1h { z17.h }, p4/Z, [x25, #1, MUL VL]\n"
+ "zip1 z3.h, z30.h, z20.h\n"
+ "zip2 z21.h, z30.h, z20.h\n"
+ "ld1h { z26.h }, p4/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z23.h }, p4/Z, [x25, #3, MUL VL]\n"
+ "zip1 z13.h, z12.h, z17.h\n"
+ "zip2 z0.h, z12.h, z17.h\n"
+ "ld1h { z2.h }, p4/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x28, #5, MUL VL]\n"
+ "zip1 z12.h, z31.h, z26.h\n"
+ "zip2 z14.h, z31.h, z26.h\n"
+ "ld1h { z17.h }, p4/Z, [x28, #6, MUL VL]\n"
+ "ld1h { z29.h }, p4/Z, [x28, #7, MUL VL]\n"
+ "zip1 z16.h, z18.h, z23.h\n"
+ "zip2 z15.h, z18.h, z23.h\n"
+ "ld1h { z9.h }, p4/Z, [x25, #4, MUL VL]\n"
+ "ld1h { z18.h }, p4/Z, [x25, #5, MUL VL]\n"
+ "zip1 z11.h, z2.h, z9.h\n"
+ "zip2 z5.h, z2.h, z9.h\n"
+ "ld1h { z7.h }, p4/Z, [x25, #6, MUL VL]\n"
+ "ld1h { z2.h }, p4/Z, [x25, #7, MUL VL]\n"
+ "zip1 z10.h, z24.h, z18.h\n"
+ "zip2 z6.h, z24.h, z18.h\n"
+ "ld1h { z19.h }, p4/Z, [x24]\n"
+ "ld1h { z18.h }, p4/Z, [x24, #1, MUL VL]\n"
+ "zip1 z9.h, z17.h, z7.h\n"
+ "zip2 z4.h, z17.h, z7.h\n"
+ "ld1h { z24.h }, p4/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z22.h }, p4/Z, [x24, #3, MUL VL]\n"
+ "zip1 z7.h, z29.h, z2.h\n"
+ "zip2 z8.h, z29.h, z2.h\n"
+ "ld1h { z25.h }, p4/Z, [x24, #4, MUL VL]\n"
+ "ld1h { z17.h }, p4/Z, [x24, #5, MUL VL]\n"
+ "cmp x27, x26\n"
+ "addvl x28, x28, #8\n"
+ "ld1h { z2.h }, p4/Z, [x24, #6, MUL VL]\n"
+ "ld1h { z30.h }, p4/Z, [x24, #7, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "addvl x24, x24, #8\n"
+ "ld1h { z20.h }, p4/Z, [x23]\n"
+ "ld1h { z27.h }, p4/Z, [x23, #1, MUL VL]\n"
+ "zip1 z31.h, z19.h, z20.h\n"
+ "zip2 z29.h, z19.h, z20.h\n"
+ "ld1h { z26.h }, p4/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z23.h }, p4/Z, [x23, #3, MUL VL]\n"
+ "zip1 z28.h, z18.h, z27.h\n"
+ "zip2 z1.h, z18.h, z27.h\n"
+ "ld1h { z20.h }, p4/Z, [x23, #4, MUL VL]\n"
+ "ld1h { z19.h }, p4/Z, [x23, #5, MUL VL]\n"
+ "zip1 z27.h, z24.h, z26.h\n"
+ "zip2 z26.h, z24.h, z26.h\n"
+ "ld1h { z18.h }, p4/Z, [x23, #6, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x23, #7, MUL VL]\n"
+ "st1h { z3.h }, p4, [x21]\n"
+ "zip1 z3.h, z22.h, z23.h\n"
+ "st1h { z21.h }, p4, [x21, #1, MUL VL]\n"
+ "zip2 z22.h, z22.h, z23.h\n"
+ "addvl x23, x23, #8\n"
+ "zip1 z23.h, z25.h, z20.h\n"
+ "st1h { z13.h }, p4, [x21, #2, MUL VL]\n"
+ "zip2 z25.h, z25.h, z20.h\n"
+ "zip1 z21.h, z17.h, z19.h\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z0.h }, p4, [x21, #3, MUL VL]\n"
+ "zip2 z20.h, z17.h, z19.h\n"
+ "zip1 z19.h, z2.h, z18.h\n"
+ "st1h { z12.h }, p4, [x21, #4, MUL VL]\n"
+ "zip2 z18.h, z2.h, z18.h\n"
+ "zip1 z17.h, z30.h, z24.h\n"
+ "st1h { z14.h }, p4, [x21, #5, MUL VL]\n"
+ "zip2 z13.h, z30.h, z24.h\n"
+ "st1h { z16.h }, p4, [x21, #6, MUL VL]\n"
+ "st1h { z15.h }, p4, [x21, #7, MUL VL]\n"
+ "addvl x21, x21, #16\n"
+ "st1h { z31.h }, p4, [x21, #-8, MUL VL]\n"
+ "st1h { z29.h }, p4, [x21, #-7, MUL VL]\n"
+ "st1h { z28.h }, p4, [x21, #-6, MUL VL]\n"
+ "st1h { z1.h }, p4, [x21, #-5, MUL VL]\n"
+ "st1h { z27.h }, p4, [x21, #-4, MUL VL]\n"
+ "st1h { z26.h }, p4, [x21, #-3, MUL VL]\n"
+ "st1h { z3.h }, p4, [x21, #-2, MUL VL]\n"
+ "st1h { z22.h }, p4, [x21, #-1, MUL VL]\n"
+ "st1h { z11.h }, p4, [x20]\n"
+ "st1h { z5.h }, p4, [x20, #1, MUL VL]\n"
+ "st1h { z10.h }, p4, [x20, #2, MUL VL]\n"
+ "st1h { z6.h }, p4, [x20, #3, MUL VL]\n"
+ "st1h { z9.h }, p4, [x20, #4, MUL VL]\n"
+ "st1h { z4.h }, p4, [x20, #5, MUL VL]\n"
+ "st1h { z7.h }, p4, [x20, #6, MUL VL]\n"
+ "st1h { z8.h }, p4, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1h { z23.h }, p4, [x20, #-8, MUL VL]\n"
+ "st1h { z25.h }, p4, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p4, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p4, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p4, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p4, [x20, #-2, MUL VL]\n"
+ "st1h { z13.h }, p4, [x20, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x27, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x27\n"
+ "whilelt p3.h, XZR, x20\n"
+ "ld1h { z20.h }, p3/Z, [x28]\n"
+ "ld1h { z19.h }, p3/Z, [x25]\n"
+ "dech x20\n"
+ "whilelt p2.h, XZR, x20\n"
+ "ld1h { z18.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z17.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z25.h }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z16.h }, p1/Z, [x25, #2, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z0.h }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z24.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "mov x20, x22\n"
+ "decw x27, ALL, MUL #8\n"
+ "ld1h { z31.h }, p3/Z, [x24]\n"
+ "ld1h { z30.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z29.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z28.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "zip1 z23.h, z20.h, z19.h\n"
+ "zip2 z22.h, z20.h, z19.h\n"
+ "ld1h { z21.h }, p3/Z, [x23]\n"
+ "ld1h { z27.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip1 z20.h, z18.h, z17.h\n"
+ "zip2 z19.h, z18.h, z17.h\n"
+ "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z26.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "zip1 z17.h, z25.h, z16.h\n"
+ "zip2 z16.h, z25.h, z16.h\n"
+ "zip1 z25.h, z0.h, z24.h\n"
+ "zip2 z24.h, z0.h, z24.h\n"
+ "st1h { z23.h }, p4, [x20]\n"
+ "cmp x27, #0x0\n"
+ "st1h { z22.h }, p4, [x20, #1, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "addvl x25, x25, #4\n"
+ "zip1 z23.h, z31.h, z21.h\n"
+ "st1h { z20.h }, p4, [x20, #2, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "addvl x23, x23, #4\n"
+ "zip2 z22.h, z31.h, z21.h\n"
+ "st1h { z19.h }, p4, [x20, #3, MUL VL]\n"
+ "zip1 z21.h, z30.h, z27.h\n"
+ "zip2 z20.h, z30.h, z27.h\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z17.h }, p4, [x20, #4, MUL VL]\n"
+ "zip1 z19.h, z29.h, z18.h\n"
+ "zip2 z18.h, z29.h, z18.h\n"
+ "st1h { z16.h }, p4, [x20, #5, MUL VL]\n"
+ "zip1 z17.h, z28.h, z26.h\n"
+ "zip2 z16.h, z28.h, z26.h\n"
+ "st1h { z25.h }, p4, [x20, #6, MUL VL]\n"
+ "st1h { z24.h }, p4, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1h { z23.h }, p4, [x20, #-8, MUL VL]\n"
+ "st1h { z22.h }, p4, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p4, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p4, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p4, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p4, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p4, [x20, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x4\n"
+ "addvl %x[out], %x[out], #16\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x21, %x[width]\n"
+ "cnth x20, ALL, MUL #8\n"
+ "add x25, x28, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1h { z17.h }, p4/Z, [x28]\n"
+ "ld1h { z20.h }, p4/Z, [x28, #1, MUL VL]\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1h { z23.h }, p4/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z19.h }, p4/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x25]\n"
+ "ld1h { z18.h }, p4/Z, [x25, #1, MUL VL]\n"
+ "zip1 z0.h, z17.h, z16.h\n"
+ "zip2 z22.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p4/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x25, #3, MUL VL]\n"
+ "zip1 z31.h, z20.h, z18.h\n"
+ "zip2 z30.h, z20.h, z18.h\n"
+ "ld1h { z21.h }, p4/Z, [x28, #4, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x28, #5, MUL VL]\n"
+ "zip1 z29.h, z23.h, z17.h\n"
+ "zip2 z28.h, z23.h, z17.h\n"
+ "ld1h { z27.h }, p4/Z, [x28, #6, MUL VL]\n"
+ "ld1h { z26.h }, p4/Z, [x28, #7, MUL VL]\n"
+ "zip1 z25.h, z19.h, z16.h\n"
+ "zip2 z24.h, z19.h, z16.h\n"
+ "ld1h { z19.h }, p4/Z, [x25, #4, MUL VL]\n"
+ "ld1h { z18.h }, p4/Z, [x25, #5, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ "zip1 z23.h, z21.h, z19.h\n"
+ "ld1h { z17.h }, p4/Z, [x25, #6, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x25, #7, MUL VL]\n"
+ "st1h { z0.h }, p4, [x22]\n"
+ "addvl x25, x25, #8\n"
+ "st1h { z22.h }, p4, [x22, #1, MUL VL]\n"
+ "zip2 z22.h, z21.h, z19.h\n"
+ "zip1 z21.h, z20.h, z18.h\n"
+ "st1h { z31.h }, p4, [x22, #2, MUL VL]\n"
+ "zip2 z20.h, z20.h, z18.h\n"
+ "zip1 z19.h, z27.h, z17.h\n"
+ "st1h { z30.h }, p4, [x22, #3, MUL VL]\n"
+ "zip2 z18.h, z27.h, z17.h\n"
+ "zip1 z17.h, z26.h, z16.h\n"
+ "st1h { z29.h }, p4, [x22, #4, MUL VL]\n"
+ "zip2 z16.h, z26.h, z16.h\n"
+ "st1h { z28.h }, p4, [x22, #5, MUL VL]\n"
+ "st1h { z25.h }, p4, [x22, #6, MUL VL]\n"
+ "st1h { z24.h }, p4, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z23.h }, p4, [x22]\n"
+ "st1h { z22.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z21.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z20.h }, p4, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p4, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p4, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z22.h }, p0/Z, [x28]\n"
+ "ld1h { z21.h }, p0/Z, [x25]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z20.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z17.h }, p0/Z, [x25, #2, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z24.h }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "decw x21, ALL, MUL #8\n"
+ "cmp x21, #0x0\n"
+ "zip1 z16.h, z22.h, z21.h\n"
+ "zip2 z22.h, z22.h, z21.h\n"
+ "addvl x28, x28, #4\n"
+ "addvl x25, x25, #4\n"
+ "zip1 z21.h, z20.h, z19.h\n"
+ "zip2 z20.h, z20.h, z19.h\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p4, [x22]\n"
+ "zip1 z17.h, z24.h, z23.h\n"
+ "zip2 z16.h, z24.h, z23.h\n"
+ "st1h { z22.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z21.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z20.h }, p4, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p4, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p4, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 7b\n"
+ "12:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
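+// bfloat16 is handled as opaque 16-bit data: the byte width is divided by
+// sizeof(bfloat16) to recover the element count expected by the kernel.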
+template<>
+void Transform<8, 2, true, VLType::SVE>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
new file mode 100644
index 0000000000..34799c60a6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
@@ -0,0 +1,463 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
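+// 2x4 variant for 16-bit data: two zip stages interleave groups of 4 rows,
+// with pad_row standing in for rows past 'height' so that incomplete groups
+// of rows are zero-filled rather than read out of bounds.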
+void sve_transpose_interleave_8VL_2x4(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "ptrue p2.b\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x12, %x[in]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "mov x27, %x[width]\n"
+ "cnth x26, ALL, MUL #4\n"
+ "add x25, x28, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp x27, x26\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z21.h }, p2/Z, [x12]\n"
+ "ld1h { z17.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "mov x21, x22\n"
+ "add x22, x22, %x[out_stride]\n"
+ "ld1h { z31.h }, p2/Z, [x11]\n"
+ "ld1h { z5.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "mov x20, x22\n"
+ "sub x27, x27, x26\n"
+ "ld1h { z15.h }, p2/Z, [x10]\n"
+ "ld1h { z28.h }, p2/Z, [x10, #1, MUL VL]\n"
+ "zip1 z24.h, z21.h, z15.h\n"
+ "zip2 z29.h, z21.h, z15.h\n"
+ "ld1h { z6.h }, p2/Z, [x9]\n"
+ "ld1h { z4.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip1 z16.h, z31.h, z6.h\n"
+ "zip2 z18.h, z31.h, z6.h\n"
+ "ld1h { z3.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z25.h }, p2/Z, [x12, #3, MUL VL]\n"
+ "zip1 z20.h, z17.h, z28.h\n"
+ "zip1 z7.h, z5.h, z4.h\n"
+ "ld1h { z27.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x11, #3, MUL VL]\n"
+ "zip2 z2.h, z17.h, z28.h\n"
+ "zip2 z19.h, z5.h, z4.h\n"
+ "ld1h { z28.h }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z17.h }, p2/Z, [x10, #3, MUL VL]\n"
+ "zip1 z21.h, z24.h, z16.h\n"
+ "zip2 z24.h, z24.h, z16.h\n"
+ "ld1h { z5.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z1.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "zip1 z14.h, z29.h, z18.h\n"
+ "zip2 z12.h, z29.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x28]\n"
+ "ld1h { z31.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "zip1 z11.h, z20.h, z7.h\n"
+ "zip2 z13.h, z20.h, z7.h\n"
+ "ld1h { z4.h }, p2/Z, [x25]\n"
+ "ld1h { z26.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "zip1 z15.h, z2.h, z19.h\n"
+ "zip2 z10.h, z2.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x24]\n"
+ "ld1h { z30.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip1 z19.h, z18.h, z16.h\n"
+ "zip2 z18.h, z18.h, z16.h\n"
+ "ld1h { z8.h }, p2/Z, [x23]\n"
+ "ld1h { z29.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip1 z20.h, z4.h, z8.h\n"
+ "zip2 z0.h, z4.h, z8.h\n"
+ "ld1h { z6.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z8.h }, p2/Z, [x28, #3, MUL VL]\n"
+ "zip1 z23.h, z31.h, z30.h\n"
+ "zip1 z16.h, z26.h, z29.h\n"
+ "ld1h { z9.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z7.h }, p2/Z, [x25, #3, MUL VL]\n"
+ "zip2 z31.h, z31.h, z30.h\n"
+ "zip2 z30.h, z26.h, z29.h\n"
+ "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x24, #3, MUL VL]\n"
+ "zip1 z29.h, z3.h, z28.h\n"
+ "zip1 z4.h, z27.h, z5.h\n"
+ "zip2 z28.h, z3.h, z28.h\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z27.h, z27.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x23, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x21]\n"
+ "zip1 z21.h, z25.h, z17.h\n"
+ "zip2 z25.h, z25.h, z17.h\n"
+ "cmp x27, x26\n"
+ "st1h { z24.h }, p2, [x21, #1, MUL VL]\n"
+ "zip1 z24.h, z22.h, z1.h\n"
+ "zip2 z22.h, z22.h, z1.h\n"
+ "addvl x12, x12, #4\n"
+ "st1h { z14.h }, p2, [x21, #2, MUL VL]\n"
+ "zip1 z17.h, z19.h, z20.h\n"
+ "zip2 z20.h, z19.h, z20.h\n"
+ "addvl x11, x11, #4\n"
+ "st1h { z12.h }, p2, [x21, #3, MUL VL]\n"
+ "zip1 z19.h, z18.h, z0.h\n"
+ "zip2 z18.h, z18.h, z0.h\n"
+ "addvl x10, x10, #4\n"
+ "st1h { z11.h }, p2, [x21, #4, MUL VL]\n"
+ "zip1 z14.h, z23.h, z16.h\n"
+ "zip2 z16.h, z23.h, z16.h\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z13.h }, p2, [x21, #5, MUL VL]\n"
+ "zip1 z23.h, z31.h, z30.h\n"
+ "zip2 z1.h, z31.h, z30.h\n"
+ "addvl x28, x28, #4\n"
+ "st1h { z15.h }, p2, [x21, #6, MUL VL]\n"
+ "zip1 z0.h, z29.h, z4.h\n"
+ "zip2 z31.h, z29.h, z4.h\n"
+ "addvl x25, x25, #4\n"
+ "st1h { z10.h }, p2, [x21, #7, MUL VL]\n"
+ "addvl x21, x21, #16\n"
+ "zip1 z30.h, z28.h, z27.h\n"
+ "zip2 z29.h, z28.h, z27.h\n"
+ "st1h { z17.h }, p2, [x21, #-8, MUL VL]\n"
+ "zip1 z13.h, z21.h, z24.h\n"
+ "zip2 z27.h, z21.h, z24.h\n"
+ "addvl x24, x24, #4\n"
+ "st1h { z20.h }, p2, [x21, #-7, MUL VL]\n"
+ "zip1 z28.h, z25.h, z22.h\n"
+ "zip2 z25.h, z25.h, z22.h\n"
+ "addvl x23, x23, #4\n"
+ "st1h { z19.h }, p2, [x21, #-6, MUL VL]\n"
+ "zip1 z22.h, z6.h, z2.h\n"
+ "zip1 z21.h, z9.h, z3.h\n"
+ "add x22, x22, %x[out_stride]\n"
+ "st1h { z18.h }, p2, [x21, #-5, MUL VL]\n"
+ "zip2 z20.h, z6.h, z2.h\n"
+ "zip2 z19.h, z9.h, z3.h\n"
+ "st1h { z14.h }, p2, [x21, #-4, MUL VL]\n"
+ "zip1 z18.h, z8.h, z26.h\n"
+ "zip1 z17.h, z7.h, z5.h\n"
+ "st1h { z16.h }, p2, [x21, #-3, MUL VL]\n"
+ "zip2 z24.h, z8.h, z26.h\n"
+ "zip2 z16.h, z7.h, z5.h\n"
+ "st1h { z23.h }, p2, [x21, #-2, MUL VL]\n"
+ "zip1 z23.h, z22.h, z21.h\n"
+ "zip2 z22.h, z22.h, z21.h\n"
+ "st1h { z1.h }, p2, [x21, #-1, MUL VL]\n"
+ "zip1 z21.h, z20.h, z19.h\n"
+ "zip2 z20.h, z20.h, z19.h\n"
+ "st1h { z0.h }, p2, [x20]\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "st1h { z31.h }, p2, [x20, #1, MUL VL]\n"
+ "zip1 z17.h, z24.h, z16.h\n"
+ "zip2 z16.h, z24.h, z16.h\n"
+ "st1h { z30.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z29.h }, p2, [x20, #3, MUL VL]\n"
+ "st1h { z13.h }, p2, [x20, #4, MUL VL]\n"
+ "st1h { z27.h }, p2, [x20, #5, MUL VL]\n"
+ "st1h { z28.h }, p2, [x20, #6, MUL VL]\n"
+ "st1h { z25.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1h { z23.h }, p2, [x20, #-8, MUL VL]\n"
+ "st1h { z22.h }, p2, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p2, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p2, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x27, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x27\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z17.h }, p1/Z, [x12]\n"
+ "ld1h { z19.h }, p1/Z, [x11]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z24.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z16.h }, p1/Z, [x10]\n"
+ "ld1h { z20.h }, p0/Z, [x10, #1, MUL VL]\n"
+ "zip1 z1.h, z17.h, z16.h\n"
+ "zip2 z22.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x9]\n"
+ "ld1h { z17.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "zip1 z16.h, z19.h, z18.h\n"
+ "zip2 z19.h, z19.h, z18.h\n"
+ "ld1h { z0.h }, p1/Z, [x28]\n"
+ "ld1h { z31.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "zip1 z25.h, z24.h, z20.h\n"
+ "zip1 z21.h, z23.h, z17.h\n"
+ "ld1h { z30.h }, p1/Z, [x25]\n"
+ "ld1h { z29.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "zip2 z28.h, z24.h, z20.h\n"
+ "zip2 z24.h, z23.h, z17.h\n"
+ "ld1h { z20.h }, p1/Z, [x24]\n"
+ "ld1h { z27.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "mov x20, x22\n"
+ "decd x27, ALL, MUL #8\n"
+ "ld1h { z23.h }, p1/Z, [x23]\n"
+ "ld1h { z26.h }, p0/Z, [x23, #1, MUL VL]\n"
+ "zip1 z18.h, z1.h, z16.h\n"
+ "zip2 z17.h, z1.h, z16.h\n"
+ "zip1 z16.h, z22.h, z19.h\n"
+ "zip2 z19.h, z22.h, z19.h\n"
+ "st1h { z18.h }, p2, [x20]\n"
+ "cmp x27, #0x0\n"
+ "zip1 z22.h, z25.h, z21.h\n"
+ "zip2 z21.h, z25.h, z21.h\n"
+ "st1h { z17.h }, p2, [x20, #1, MUL VL]\n"
+ "addvl x12, x12, #2\n"
+ "zip1 z25.h, z28.h, z24.h\n"
+ "zip2 z18.h, z28.h, z24.h\n"
+ "st1h { z16.h }, p2, [x20, #2, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ "zip1 z17.h, z0.h, z20.h\n"
+ "zip1 z16.h, z30.h, z23.h\n"
+ "st1h { z19.h }, p2, [x20, #3, MUL VL]\n"
+ "addvl x10, x10, #2\n"
+ "zip2 z20.h, z0.h, z20.h\n"
+ "zip2 z19.h, z30.h, z23.h\n"
+ "st1h { z22.h }, p2, [x20, #4, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ "zip1 z24.h, z31.h, z27.h\n"
+ "zip1 z23.h, z29.h, z26.h\n"
+ "st1h { z21.h }, p2, [x20, #5, MUL VL]\n"
+ "addvl x28, x28, #2\n"
+ "zip2 z22.h, z31.h, z27.h\n"
+ "zip2 z21.h, z29.h, z26.h\n"
+ "st1h { z25.h }, p2, [x20, #6, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "st1h { z18.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "addvl x24, x24, #2\n"
+ "zip1 z18.h, z17.h, z16.h\n"
+ "addvl x23, x23, #2\n"
+ "zip2 z17.h, z17.h, z16.h\n"
+ "zip1 z16.h, z20.h, z19.h\n"
+ "st1h { z18.h }, p2, [x20, #-8, MUL VL]\n"
+ "zip2 z20.h, z20.h, z19.h\n"
+ "zip1 z19.h, z24.h, z23.h\n"
+ "st1h { z17.h }, p2, [x20, #-7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip2 z18.h, z24.h, z23.h\n"
+ "zip1 z17.h, z22.h, z21.h\n"
+ "st1h { z16.h }, p2, [x20, #-6, MUL VL]\n"
+ "zip2 z16.h, z22.h, z21.h\n"
+ "st1h { z20.h }, p2, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x8\n"
+ "addvl %x[out], %x[out], #16\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+ "7:" // Tail row loop: Head
+ "mov x12, %x[in]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "mov x21, %x[width]\n"
+ "cnth x20, ALL, MUL #4\n"
+ "add x9, x10, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x9, %x[in_stride]\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "csel x10, x10, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x11, x11, %x[pad_row], GT\n"
+ "cmp x21, x20\n"
+ "mov x22, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1h { z17.h }, p2/Z, [x12]\n"
+ "ld1h { z22.h }, p2/Z, [x12, #1, MUL VL]\n"
+ "sub x21, x21, x20\n"
+ "cmp x21, x20\n"
+ "ld1h { z19.h }, p2/Z, [x11]\n"
+ "ld1h { z21.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x10]\n"
+ "ld1h { z18.h }, p2/Z, [x10, #1, MUL VL]\n"
+ "zip1 z4.h, z17.h, z16.h\n"
+ "zip2 z3.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9]\n"
+ "ld1h { z16.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip1 z2.h, z19.h, z17.h\n"
+ "zip2 z1.h, z19.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z24.h }, p2/Z, [x12, #3, MUL VL]\n"
+ "zip1 z0.h, z22.h, z18.h\n"
+ "zip1 z31.h, z21.h, z16.h\n"
+ "ld1h { z20.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z19.h }, p2/Z, [x11, #3, MUL VL]\n"
+ "zip2 z30.h, z22.h, z18.h\n"
+ "zip2 z23.h, z21.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x10, #3, MUL VL]\n"
+ "zip1 z22.h, z17.h, z16.h\n"
+ "zip2 z29.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "zip1 z21.h, z20.h, z17.h\n"
+ "zip2 z28.h, z20.h, z17.h\n"
+ "zip1 z27.h, z24.h, z18.h\n"
+ "zip1 z26.h, z19.h, z16.h\n"
+ "addvl x12, x12, #4\n"
+ "addvl x11, x11, #4\n"
+ "zip2 z25.h, z24.h, z18.h\n"
+ "zip2 z24.h, z19.h, z16.h\n"
+ "addvl x10, x10, #4\n"
+ "addvl x9, x9, #4\n"
+ "zip1 z16.h, z4.h, z2.h\n"
+ "zip2 z17.h, z4.h, z2.h\n"
+ "st1h { z16.h }, p2, [x22]\n"
+ "zip1 z16.h, z3.h, z1.h\n"
+ "zip2 z20.h, z3.h, z1.h\n"
+ "st1h { z17.h }, p2, [x22, #1, MUL VL]\n"
+ "zip1 z19.h, z0.h, z31.h\n"
+ "zip2 z18.h, z0.h, z31.h\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "zip1 z17.h, z30.h, z23.h\n"
+ "zip2 z16.h, z30.h, z23.h\n"
+ "st1h { z20.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "zip1 z23.h, z22.h, z21.h\n"
+ "zip2 z22.h, z22.h, z21.h\n"
+ "st1h { z18.h }, p2, [x22, #5, MUL VL]\n"
+ "zip1 z21.h, z29.h, z28.h\n"
+ "zip2 z20.h, z29.h, z28.h\n"
+ "st1h { z17.h }, p2, [x22, #6, MUL VL]\n"
+ "zip1 z19.h, z27.h, z26.h\n"
+ "zip2 z18.h, z27.h, z26.h\n"
+ "st1h { z16.h }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "zip1 z17.h, z25.h, z24.h\n"
+ "zip2 z16.h, z25.h, z24.h\n"
+ "st1h { z23.h }, p2, [x22]\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z20.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x21, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z23.h }, p1/Z, [x12]\n"
+ "ld1h { z22.h }, p1/Z, [x11]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z21.h }, p0/Z, [x12, #1, MUL VL]\n"
+ "ld1h { z25.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x10]\n"
+ "ld1h { z20.h }, p0/Z, [x10, #1, MUL VL]\n"
+ "decd x21, ALL, MUL #8\n"
+ "zip1 z24.h, z23.h, z19.h\n"
+ "ld1h { z18.h }, p1/Z, [x9]\n"
+ "ld1h { z16.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "zip1 z17.h, z22.h, z18.h\n"
+ "zip2 z23.h, z23.h, z19.h\n"
+ "zip2 z19.h, z22.h, z18.h\n"
+ "zip1 z22.h, z21.h, z20.h\n"
+ "cmp x21, #0x0\n"
+ "addvl x12, x12, #2\n"
+ "zip1 z18.h, z25.h, z16.h\n"
+ "zip2 z21.h, z21.h, z20.h\n"
+ "addvl x11, x11, #2\n"
+ "addvl x10, x10, #2\n"
+ "zip2 z20.h, z25.h, z16.h\n"
+ "addvl x9, x9, #2\n"
+ "zip1 z16.h, z24.h, z17.h\n"
+ "st1h { z16.h }, p2, [x22]\n"
+ "zip2 z16.h, z24.h, z17.h\n"
+ "zip1 z17.h, z23.h, z19.h\n"
+ "st1h { z16.h }, p2, [x22, #1, MUL VL]\n"
+ "zip2 z16.h, z23.h, z19.h\n"
+ "zip1 z19.h, z22.h, z18.h\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "zip2 z18.h, z22.h, z18.h\n"
+ "zip1 z17.h, z21.h, z20.h\n"
+ "st1h { z16.h }, p2, [x22, #3, MUL VL]\n"
+ "zip2 z16.h, z21.h, z20.h\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p2, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 7b\n"
+ "12:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
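+// bfloat16 reuses the 16-bit kernel unchanged; only the pointer types differ.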
+template<>
+void Transform<8, 4, true, VLType::SVE>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_2x4(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..5a48e579ae
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
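+// Fused transpose and narrowing: fp32 rows are zipped pairwise, then bfcvt
+// fills the even bf16 lanes from one zipped source and bfcvtnt the odd lanes
+// from the other. The conversions are emitted as raw .inst encodings,
+// presumably to support assemblers without BF16 mnemonics.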
+void sve_transpose_interleave_8VL_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "ptrue p4.b\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "mov x23, %x[width]\n"
+ "cnth x20, ALL, MUL #4\n"
+ "add x22, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "cmp x23, x20\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z19.s }, p4/Z, [x26]\n"
+ "ld1w { z18.s }, p4/Z, [x26, #1, MUL VL]\n"
+ "sub x23, x23, x20\n"
+ "cmp x23, x20\n"
+ "ld1w { z20.s }, p4/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x24]\n"
+ "ld1w { z17.s }, p4/Z, [x24, #1, MUL VL]\n"
+ "zip1 z22.s, z19.s, z23.s\n"
+ "zip2 z21.s, z19.s, z23.s\n"
+ "ld1w { z31.s }, p4/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x24, #3, MUL VL]\n"
+ "zip1 z9.s, z18.s, z17.s\n"
+ "zip2 z7.s, z18.s, z17.s\n"
+ "ld1w { z19.s }, p4/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z18.s }, p4/Z, [x26, #5, MUL VL]\n"
+ "zip1 z6.s, z20.s, z31.s\n"
+ "zip2 z5.s, z20.s, z31.s\n"
+ "ld1w { z15.s }, p4/Z, [x26, #6, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x26, #7, MUL VL]\n"
+ "zip1 z3.s, z24.s, z16.s\n"
+ "zip2 z2.s, z24.s, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x24, #4, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x24, #5, MUL VL]\n"
+ "zip1 z1.s, z19.s, z16.s\n"
+ "zip2 z0.s, z19.s, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x24, #6, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x24, #7, MUL VL]\n"
+ "zip1 z31.s, z18.s, z17.s\n"
+ "zip2 z30.s, z18.s, z17.s\n"
+ "ld1w { z18.s }, p4/Z, [x25]\n"
+ "ld1w { z17.s }, p4/Z, [x25, #1, MUL VL]\n"
+ "zip1 z29.s, z15.s, z16.s\n"
+ "zip2 z28.s, z15.s, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x25, #3, MUL VL]\n"
+ "zip1 z27.s, z20.s, z19.s\n"
+ "zip2 z26.s, z20.s, z19.s\n"
+ "ld1w { z11.s }, p4/Z, [x22]\n"
+ "ld1w { z8.s }, p4/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x658ab2d8 // bfcvt z24.h, p4/M, z22.s\n"
+ "zip1 z25.s, z18.s, z11.s\n"
+ "ld1w { z4.s }, p4/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z22.s }, p4/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x658ab2af // bfcvt z15.h, p4/M, z21.s\n"
+ "zip2 z14.s, z18.s, z11.s\n"
+ "ld1w { z21.s }, p4/Z, [x25, #4, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25, #5, MUL VL]\n"
+ ".inst 0x658ab12d // bfcvt z13.h, p4/M, z9.s\n"
+ "zip1 z12.s, z17.s, z8.s\n"
+ "ld1w { z11.s }, p4/Z, [x25, #6, MUL VL]\n"
+ "ld1w { z10.s }, p4/Z, [x25, #7, MUL VL]\n"
+ ".inst 0x658ab0e9 // bfcvt z9.h, p4/M, z7.s\n"
+ "zip2 z8.s, z17.s, z8.s\n"
+ "ld1w { z19.s }, p4/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z18.s }, p4/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x658ab0c7 // bfcvt z7.h, p4/M, z6.s\n"
+ "zip1 z6.s, z16.s, z4.s\n"
+ "ld1w { z17.s }, p4/Z, [x22, #6, MUL VL]\n"
+ ".inst 0x658ab0a5 // bfcvt z5.h, p4/M, z5.s\n"
+ "zip2 z4.s, z16.s, z4.s\n"
+ "ld1w { z16.s }, p4/Z, [x22, #7, MUL VL]\n"
+ ".inst 0x658ab063 // bfcvt z3.h, p4/M, z3.s\n"
+ ".inst 0x658ab042 // bfcvt z2.h, p4/M, z2.s\n"
+ "addvl x26, x26, #8\n"
+ "addvl x25, x25, #8\n"
+ ".inst 0x658ab021 // bfcvt z1.h, p4/M, z1.s\n"
+ ".inst 0x658ab000 // bfcvt z0.h, p4/M, z0.s\n"
+ "addvl x24, x24, #8\n"
+ "addvl x22, x22, #8\n"
+ ".inst 0x658ab3ff // bfcvt z31.h, p4/M, z31.s\n"
+ ".inst 0x658ab3de // bfcvt z30.h, p4/M, z30.s\n"
+ ".inst 0x658ab3bd // bfcvt z29.h, p4/M, z29.s\n"
+ ".inst 0x658ab39c // bfcvt z28.h, p4/M, z28.s\n"
+ ".inst 0x658ab37b // bfcvt z27.h, p4/M, z27.s\n"
+ ".inst 0x658ab35a // bfcvt z26.h, p4/M, z26.s\n"
+ ".inst 0x648ab338 // bfcvtnt z24.h, p4/M, z25.s\n"
+ "zip1 z25.s, z23.s, z22.s\n"
+ "st1h { z24.h }, p4, [x21]\n"
+ "zip2 z24.s, z23.s, z22.s\n"
+ "zip1 z23.s, z21.s, z19.s\n"
+ "zip2 z22.s, z21.s, z19.s\n"
+ "zip1 z21.s, z20.s, z18.s\n"
+ "zip2 z20.s, z20.s, z18.s\n"
+ "zip1 z19.s, z11.s, z17.s\n"
+ "zip2 z18.s, z11.s, z17.s\n"
+ "zip1 z17.s, z10.s, z16.s\n"
+ "zip2 z16.s, z10.s, z16.s\n"
+ ".inst 0x648ab1cf // bfcvtnt z15.h, p4/M, z14.s\n"
+ "st1h { z15.h }, p4, [x21, #1, MUL VL]\n"
+ ".inst 0x648ab18d // bfcvtnt z13.h, p4/M, z12.s\n"
+ ".inst 0x648ab109 // bfcvtnt z9.h, p4/M, z8.s\n"
+ "st1h { z13.h }, p4, [x21, #2, MUL VL]\n"
+ ".inst 0x648ab0c7 // bfcvtnt z7.h, p4/M, z6.s\n"
+ ".inst 0x648ab085 // bfcvtnt z5.h, p4/M, z4.s\n"
+ "st1h { z9.h }, p4, [x21, #3, MUL VL]\n"
+ ".inst 0x648ab323 // bfcvtnt z3.h, p4/M, z25.s\n"
+ ".inst 0x648ab302 // bfcvtnt z2.h, p4/M, z24.s\n"
+ "st1h { z7.h }, p4, [x21, #4, MUL VL]\n"
+ "st1h { z5.h }, p4, [x21, #5, MUL VL]\n"
+ ".inst 0x648ab2e1 // bfcvtnt z1.h, p4/M, z23.s\n"
+ ".inst 0x648ab2c0 // bfcvtnt z0.h, p4/M, z22.s\n"
+ "st1h { z3.h }, p4, [x21, #6, MUL VL]\n"
+ ".inst 0x648ab2bf // bfcvtnt z31.h, p4/M, z21.s\n"
+ ".inst 0x648ab29e // bfcvtnt z30.h, p4/M, z20.s\n"
+ "st1h { z2.h }, p4, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ ".inst 0x648ab27d // bfcvtnt z29.h, p4/M, z19.s\n"
+ ".inst 0x648ab25c // bfcvtnt z28.h, p4/M, z18.s\n"
+ ".inst 0x648ab23b // bfcvtnt z27.h, p4/M, z17.s\n"
+ ".inst 0x648ab21a // bfcvtnt z26.h, p4/M, z16.s\n"
+ "st1h { z1.h }, p4, [x21]\n"
+ "st1h { z0.h }, p4, [x21, #1, MUL VL]\n"
+ "st1h { z31.h }, p4, [x21, #2, MUL VL]\n"
+ "st1h { z30.h }, p4, [x21, #3, MUL VL]\n"
+ "st1h { z29.h }, p4, [x21, #4, MUL VL]\n"
+ "st1h { z28.h }, p4, [x21, #5, MUL VL]\n"
+ "st1h { z27.h }, p4, [x21, #6, MUL VL]\n"
+ "st1h { z26.h }, p4, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x23, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x23\n"
+ "whilelt p3.s, XZR, x20\n"
+ "ld1w { z22.s }, p3/Z, [x26]\n"
+ "ld1w { z21.s }, p3/Z, [x24]\n"
+ "decw x20\n"
+ "whilelt p2.s, XZR, x20\n"
+ "ld1w { z20.s }, p2/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p1.s, XZR, x20\n"
+ "ld1w { z18.s }, p1/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z28.s }, p0/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z27.s }, p3/Z, [x25]\n"
+ "ld1w { z3.s }, p2/Z, [x25, #1, MUL VL]\n"
+ "zip1 z26.s, z22.s, z21.s\n"
+ "zip2 z25.s, z22.s, z21.s\n"
+ "ld1w { z2.s }, p1/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z1.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "zip1 z24.s, z20.s, z19.s\n"
+ "zip2 z23.s, z20.s, z19.s\n"
+ "ld1w { z22.s }, p3/Z, [x22]\n"
+ "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "zip1 z20.s, z18.s, z17.s\n"
+ "zip2 z19.s, z18.s, z17.s\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z0.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "zip1 z17.s, z28.s, z16.s\n"
+ "zip2 z16.s, z28.s, z16.s\n"
+ "decd x23, ALL, MUL #8\n"
+ ".inst 0x658ab35f // bfcvt z31.h, p4/M, z26.s\n"
+ "zip1 z30.s, z27.s, z22.s\n"
+ "cmp x23, #0x0\n"
+ ".inst 0x658ab33d // bfcvt z29.h, p4/M, z25.s\n"
+ "zip2 z28.s, z27.s, z22.s\n"
+ "addvl x26, x26, #4\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0x658ab31b // bfcvt z27.h, p4/M, z24.s\n"
+ "zip1 z26.s, z3.s, z21.s\n"
+ "addvl x24, x24, #4\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0x658ab2f9 // bfcvt z25.h, p4/M, z23.s\n"
+ "zip2 z24.s, z3.s, z21.s\n"
+ ".inst 0x658ab297 // bfcvt z23.h, p4/M, z20.s\n"
+ "zip1 z22.s, z2.s, z18.s\n"
+ ".inst 0x658ab275 // bfcvt z21.h, p4/M, z19.s\n"
+ "zip2 z20.s, z2.s, z18.s\n"
+ ".inst 0x658ab233 // bfcvt z19.h, p4/M, z17.s\n"
+ "zip1 z18.s, z1.s, z0.s\n"
+ ".inst 0x658ab211 // bfcvt z17.h, p4/M, z16.s\n"
+ "zip2 z16.s, z1.s, z0.s\n"
+ ".inst 0x648ab3df // bfcvtnt z31.h, p4/M, z30.s\n"
+ ".inst 0x648ab39d // bfcvtnt z29.h, p4/M, z28.s\n"
+ "st1h { z31.h }, p4, [x21]\n"
+ ".inst 0x648ab35b // bfcvtnt z27.h, p4/M, z26.s\n"
+ ".inst 0x648ab319 // bfcvtnt z25.h, p4/M, z24.s\n"
+ "st1h { z29.h }, p4, [x21, #1, MUL VL]\n"
+ ".inst 0x648ab2d7 // bfcvtnt z23.h, p4/M, z22.s\n"
+ ".inst 0x648ab295 // bfcvtnt z21.h, p4/M, z20.s\n"
+ "st1h { z27.h }, p4, [x21, #2, MUL VL]\n"
+ ".inst 0x648ab253 // bfcvtnt z19.h, p4/M, z18.s\n"
+ ".inst 0x648ab211 // bfcvtnt z17.h, p4/M, z16.s\n"
+ "st1h { z25.h }, p4, [x21, #3, MUL VL]\n"
+ "st1h { z23.h }, p4, [x21, #4, MUL VL]\n"
+ "st1h { z21.h }, p4, [x21, #5, MUL VL]\n"
+ "st1h { z19.h }, p4, [x21, #6, MUL VL]\n"
+ "st1h { z17.h }, p4, [x21, #7, MUL VL]\n"
+ "add x21, x21, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+template<>
+void Transform<8, 4, true, VLType::SVE>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
index a3216c494f..02367bd7e7 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,7 +58,7 @@ struct TransposeInterleaveCommon {
}
}
- static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) {
+ static void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) {
const auto ldin = stride;
TOut *outarray = out;
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index 6d483a3b9d..11b1bd3e05 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,11 @@
#pragma once
-#include "arm_gemm.hpp"
+#include "src/cpu/kernels/assembly/arm_gemm.hpp"
#include <cstddef>
+#include <limits>
+#include <tuple>
// Macro for unreachable code (e.g. impossible default cases on switch)
#define UNREACHABLE(why) __builtin_unreachable()
@@ -37,6 +39,29 @@
namespace arm_gemm {
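+// get_type_name<T>(): Best-effort human-readable name for type "T", intended
+// for debug/trace output. It scrapes __PRETTY_FUNCTION__ for the token that
+// follows "cls_" (in practice the assembly kernel wrapper structs are named
+// "cls_..."), returning "(unknown)" when no such token is present and
+// "(unsupported)" on non-GNU compilers.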
template<typename T>
+std::string get_type_name() {
+#ifdef __GNUC__
+ std::string s = __PRETTY_FUNCTION__;
+
+ auto start = s.find("cls_");
+
+ if (start==std::string::npos) {
+ return "(unknown)";
+ }
+
+ for(size_t x = start+4; x<s.size(); x++) {
+ if (s[x] == ';' || s[x] == ']') {
+ return s.substr(start+4, x-(start+4));
+ }
+ }
+
+ return "(unknown)";
+#else
+ return "(unsupported)";
+#endif
+}
+
+template<typename T>
inline T iceildiv(const T a, const T b) {
return (a + b - 1) / b;
}
@@ -55,6 +80,8 @@ inline T roundup(const T a, const T b) {
enum class VLType {
None,
SVE,
+ SME,
+ SME2
};
template<typename T>
@@ -141,40 +168,94 @@ struct IndirectInputArg {
};
namespace utils {
-namespace {
-#ifdef __ARM_FEATURE_SVE
-template<size_t sz>
-inline unsigned long get_vector_length_sz() {
- unsigned long v;
+// get_vector_length(): Returns SVE vector length for type "T".
+//
+// It is required that this can be compiled by a compiler in non-SVE mode, but it must be prevented from running (at
+// runtime) if SVE is not enabled. Typically it is used by switchyard/driver code which is built in normal mode and
+// which only calls the (appropriately compiled) SVE kernels if SVE is detected at runtime.
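+//
+// Illustrative guard (sketch; cpu_has_sve() stands for whatever runtime SVE
+// detection the caller has available):
+//
+//   unsigned long vl_elems;
+//   if (cpu_has_sve()) {
+//       vl_elems = utils::get_vector_length<float>();  // executes CNTB; SVE only
+//   } else {
+//       vl_elems = 16 / sizeof(float);                 // 128-bit NEON fallback
+//   }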
+template <typename T>
+inline unsigned long get_vector_length() {
+#if defined(__aarch64__)
+ uint64_t vl;
- __asm (
- "cntb %0"
- : "=r" (v)
+ __asm __volatile (
+ ".inst 0x0420e3e0\n" // CNTB X0, ALL, MUL #1
+ "mov %0, X0\n"
+ : "=r" (vl)
+ :
+ : "x0"
);
- return v / sz;
+ return vl / sizeof(T);
+#else // !defined(__aarch64__)
+ return 16 / sizeof(T);
+#endif // defined(__aarch64__)
}
-#define VEC_LEN_SPEC(sz, opcode) template <> inline unsigned long get_vector_length_sz<sz>() { unsigned long v; __asm ( opcode " %0" : "=r" (v)); return v; }
-
-VEC_LEN_SPEC(8, "cntd")
-VEC_LEN_SPEC(4, "cntw")
-VEC_LEN_SPEC(2, "cnth")
-VEC_LEN_SPEC(1, "cntb")
-#endif
+#ifdef ARM_COMPUTE_ENABLE_SME
+namespace sme {
-} // anonymous namespace
+// Function defined in misc-sve.cpp.
+extern unsigned int raw_vector_length();
template <typename T>
inline unsigned long get_vector_length() {
-#ifdef __ARM_FEATURE_SVE
- return get_vector_length_sz<sizeof(T)>();
-#else
- return 16 / sizeof(T);
-#endif
+ return raw_vector_length() / sizeof(T);
+}
+
+} // namespace sme
+#endif // ARM_COMPUTE_ENABLE_SME
+
+// get_vector_length(VLType): Returns vector length for type "T".
+//
+// This has the same requirements and constraints as the SVE-only form above, so we call into that code for SVE.
+
+template <typename T>
+inline unsigned long get_vector_length(VLType vl_type) {
+ switch (vl_type) {
+#ifdef ARM_COMPUTE_ENABLE_SME
+ case VLType::SME:
+ return sme::get_vector_length<T>();
+#endif // ARM_COMPUTE_ENABLE_SME
+ case VLType::SVE:
+ return get_vector_length<T>();
+ default:
+ return 16 / sizeof(T);
+ }
+}
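+
+// Example (sketch): the VLType-aware overload lets generic driver code ask for
+// the element count without knowing which extension a kernel was built for.
+//
+//   const auto sve_elems  = utils::get_vector_length<int32_t>(VLType::SVE);
+//   const auto neon_elems = utils::get_vector_length<int32_t>(VLType::None); // 16 / 4 = 4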
+
+// get_default_activation_values(): Returns the default values for activation min and max for integer activation.
+template <typename T>
+inline std::tuple<T, T> get_default_activation_values()
+{
+ const T min = static_cast<T>(std::numeric_limits<T>::min());
+ const T max = static_cast<T>(std::numeric_limits<T>::max());
+
+ return std::make_tuple(min, max);
+}
+
+// get_default_activation_values(): Returns the default values for activation min and max for float activation.
+template <>
+inline std::tuple<float, float> get_default_activation_values()
+{
+ const float min = static_cast<float>(-std::numeric_limits<float>::infinity());
+ const float max = static_cast<float>(std::numeric_limits<float>::infinity());
+
+ return std::make_tuple(min, max);
}
+#if defined(__ARM_FP16_ARGS)
+// get_default_activation_values(): Returns the default values for activation min and max for __fp16 activation.
+template <>
+inline std::tuple<__fp16, __fp16> get_default_activation_values()
+{
+ const __fp16 min = static_cast<__fp16>(-std::numeric_limits<float>::infinity());
+ const __fp16 max = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+
+ return std::make_tuple(min, max);
+}
+#endif // defined(__ARM_FP16_ARGS)
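+
+// Example (sketch): the defaults describe "no clamping" for each type.
+//
+//   int8_t qmin, qmax;
+//   std::tie(qmin, qmax) = get_default_activation_values<int8_t>(); // -128, 127
+//
+//   float fmin, fmax;
+//   std::tie(fmin, fmax) = get_default_activation_values<float>();  // -inf, +inf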
} // utils namespace
} // arm_gemm namespace
diff --git a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
deleted file mode 100644
index a956898403..0000000000
--- a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-#define SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-
-#include "src/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** This class is a wrapper for the depthwise convolution assembly kernels. */
-class NEDepthwiseConvolutionAssemblyKernelWrapper final : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDepthwiseConvolutionAssemblyKernelWrapper";
- }
-
- /** Default constructor */
- NEDepthwiseConvolutionAssemblyKernelWrapper()
- : _kernel(nullptr)
- {
- }
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionAssemblyKernelWrapper(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
- /** Default Move Constructor. */
- NEDepthwiseConvolutionAssemblyKernelWrapper(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
- /** Default move assignment operator */
- NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] kernel Pointer to an assembly kernel implementation.
- */
- void configure(depthwise::IDepthwiseConvolution *kernel)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
- _kernel = kernel;
- Window win;
- win.set(Window::DimX, Window::Dimension(0, _kernel->get_window(), 1));
- INEKernel::configure(win);
- }
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- auto first = window.x().start();
- auto last = window.x().end();
- _kernel->run(first, last, info.thread_id);
- }
-
-private:
- depthwise::IDepthwiseConvolution *_kernel;
-};
-} // namespace arm_compute
-#endif /* SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H */
diff --git a/src/core/NEON/kernels/assembly/common.hpp b/src/core/NEON/kernels/assembly/common.hpp
new file mode 100644
index 0000000000..d82d11cae0
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/common.hpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+namespace arm_conv
+{
+struct PaddingValues
+{
+ unsigned int left, top, right, bottom;
+};
+
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
new file mode 100644
index 0000000000..13c2d314e4
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "arm_gemm_local.hpp"
+#include "depthwise_common.hpp"
+#include "premultiply.hpp"
+
+namespace arm_conv
+{
+namespace depthwise
+{
+struct DepthwiseConfig
+{
+ DepthwiseMethod method = DepthwiseMethod::DEFAULT;
+ std::string filter = "";
+
+ DepthwiseConfig(DepthwiseMethod method) : method(method){};
+ DepthwiseConfig(){};
+};
+
+struct DepthwiseArgs
+{
+ const CPUInfo *cpu_info;
+
+ unsigned int kernel_rows, kernel_cols;
+ unsigned int stride_rows, stride_cols;
+ unsigned int dilation_rows, dilation_cols;
+
+ unsigned int n_batches, input_rows, input_cols, input_channels;
+ unsigned int output_rows, output_cols;
+ unsigned int channel_multiplier;
+
+ PaddingValues padding;
+
+ arm_gemm::Activation activation;
+
+ const DepthwiseConfig *config;
+
+ bool fast_mode = false;
+
+ DepthwiseArgs(const CPUInfo *cpu_info,
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ unsigned int stride_rows,
+ unsigned int stride_cols,
+ unsigned int dilation_rows,
+ unsigned int dilation_cols,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding,
+ arm_gemm::Activation activation,
+
+ const DepthwiseConfig *config)
+ : cpu_info(cpu_info),
+ kernel_rows(kernel_rows),
+ kernel_cols(kernel_cols),
+ stride_rows(stride_rows),
+ stride_cols(stride_cols),
+ dilation_rows(dilation_rows),
+ dilation_cols(dilation_cols),
+ n_batches(n_batches),
+ input_rows(input_rows),
+ input_cols(input_cols),
+ input_channels(input_channels),
+ output_rows(output_rows),
+ output_cols(output_cols),
+ channel_multiplier(channel_multiplier),
+ padding(padding),
+ activation(activation),
+ config(config)
+ {
+ }
+
+ DepthwiseArgs(const CPUInfo *cpu_info,
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ unsigned int stride_rows,
+ unsigned int stride_cols,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding,
+ arm_gemm::Activation activation,
+ const DepthwiseConfig *config)
+ : DepthwiseArgs(cpu_info,
+ kernel_rows,
+ kernel_cols,
+ stride_rows,
+ stride_cols,
+ 1,
+ 1,
+ n_batches,
+ input_rows,
+ input_cols,
+ input_channels,
+ output_rows,
+ output_cols,
+ channel_multiplier,
+ padding,
+ activation,
+ config)
+ {
+ }
+};
+
+template <typename TInput>
+struct Tile
+{
+ TInput *array;
+
+ unsigned int tile_rows = 0;
+ unsigned int tile_cols = 0;
+ unsigned int tile_channels = 0;
+
+ Tile(TInput *array, unsigned int tile_rows, unsigned int tile_cols, unsigned int tile_channels)
+ : array(array), tile_rows(tile_rows), tile_cols(tile_cols), tile_channels(tile_channels)
+ {
+ }
+
+ Tile() : Tile(nullptr, 0, 0, 0)
+ {
+ }
+
+ void load_from(const TInput *input,
+ const unsigned int ld_row,
+ const unsigned int ld_col,
+ const unsigned int n_rows,
+ const unsigned int n_cols,
+ const int input_i,
+ const int input_j,
+ const unsigned int channel_multiplier) const
+ {
+ const auto pad_top = input_i < 0 ? -input_i : 0;
+ const auto pad_left = input_j < 0 ? -input_j : 0;
+
+ const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top;
+ const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left;
+
+ if (padded_rows < tile_rows || padded_cols < tile_cols)
+ {
+ memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput));
+ }
+
+ do_premultiply<TInput>((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row,
+ ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
+ tile_cols * tile_channels, tile_channels, padded_rows, padded_cols,
+ tile_channels / channel_multiplier, channel_multiplier);
+ }
+};
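+
+// Note (sketch): load_from() zero-fills the tile whenever the requested window
+// (input_i, input_j) overhangs the tensor edge, then copies the valid interior
+// through do_premultiply(), so callers always receive a fully initialised
+// tile_rows x tile_cols x tile_channels block.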
+
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthwiseCommon : public IDepthwiseCommon
+{
+protected:
+ const DepthwiseArgs m_args; // Copy of arguments
+ std::string m_name{};
+
+public:
+ DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){};
+ DepthwiseCommon(DepthwiseCommon &) = delete;
+ DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
+
+ std::string name() const override
+ {
+ return m_name;
+ }
+
+ void set_name(std::string name)
+ {
+ // Only allow the name to be set once
+ if (m_name.empty())
+ {
+ m_name = name;
+ }
+ }
+
+ void execute(const void *const input,
+ const void *const parameters,
+ void *const output,
+ void *const working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads) const override final
+ {
+ const size_t ld_input_col = m_args.input_channels;
+ const size_t ld_input_row = ld_input_col * m_args.input_cols;
+ const size_t ld_input_batch = ld_input_row * m_args.input_rows;
+ const size_t ld_output_col = m_args.input_channels * m_args.channel_multiplier;
+ const size_t ld_output_row = ld_output_col * m_args.output_cols;
+ const size_t ld_output_batch = ld_output_row * m_args.output_rows;
+
+ execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row,
+ ld_output_batch, working_space, thread_id, n_threads);
+ }
+
+ void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *const parameters,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *const working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads) const override final
+ {
+ execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input,
+ ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output,
+ ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads);
+ }
+
+ void execute(unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int channels,
+ const PaddingValues &padding,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const override final
+ {
+ // Construct a new set of arguments to reflect that we might have been
+        // passed different input/output tensors. Dilation is handled at this
+        // level, so we reset the dilation in the arguments to one.
+ DepthwiseArgs args(this->m_args);
+ args.n_batches = batches;
+ args.input_rows = input_height;
+ args.input_cols = input_width;
+ args.input_channels = channels;
+ args.output_rows = output_height;
+ args.output_cols = output_width;
+ args.padding = padding;
+ args.dilation_rows = args.dilation_cols = 1;
+
+ auto ld_input_col_d = ld_input_col * m_args.dilation_cols;
+ auto ld_input_row_d = ld_input_row * m_args.dilation_rows;
+ auto ld_output_col_d = ld_output_col * m_args.dilation_cols;
+ auto ld_output_row_d = ld_output_row * m_args.dilation_rows;
+
+ for (size_t drow = 0; drow < m_args.dilation_rows; drow++)
+ {
+ size_t start_i;
+ std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) =
+ get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows,
+ m_args.kernel_rows, m_args.stride_rows, padding.top);
+
+ auto input_row = static_cast<const TInput *>(input) + start_i * ld_input_row;
+ auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row;
+
+ if (args.output_rows)
+ {
+ for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
+ {
+ size_t start_j;
+ std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) =
+ get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols,
+ m_args.kernel_cols, m_args.stride_cols, padding.left);
+
+ const TInput *input_col = input_row + start_j * ld_input_col;
+ TOutput *output_col = output_row + dcol * ld_output_col;
+
+ if (args.output_cols)
+ {
+ this->execute_internal(args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch,
+ parameters, output_col, ld_output_col_d, ld_output_row_d,
+ ld_output_batch, working_space, thread_id, n_threads);
+ }
+ }
+ }
+ }
+ }
+
+protected:
+ virtual void execute_internal(const DepthwiseArgs &instance_args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual bool uses_premultiply() const
+ {
+ return true;
+ }
+};
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput>
+using UniqueDepthwiseCommon = std::unique_ptr<DepthwiseCommon<TInput, TWeight, TOutput>>;
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+KernelDescription get_depthwise_method(const DepthwiseArgs &, const OutputStage & = {});
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &, const OutputStage & = {});
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+std::vector<KernelDescription> get_compatible_kernels(const DepthwiseArgs &, const OutputStage & = {});
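+
+// Example (sketch; tensor pointers and sizes are illustrative): create a
+// 3x3, stride-1 depthwise engine and run it single-threaded.
+//
+//   DepthwiseArgs args(cpu_info, 3, 3, 1, 1, n_batches, in_rows, in_cols,
+//                      in_channels, out_rows, out_cols, /*multiplier*/ 1,
+//                      padding, activation, nullptr);
+//   auto dwc = depthwise<float>(args);  // UniqueDepthwiseCommon<float>
+//   std::vector<uint8_t> params(dwc->get_storage_size());
+//   dwc->pack_parameters(params.data(), bias_ptr, weights_hwio);
+//   std::vector<uint8_t> ws(dwc->get_working_size(1));
+//   dwc->execute(input_ptr, params.data(), output_ptr, ws.data(), 0, 1);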
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp
new file mode 100644
index 0000000000..5ff848e281
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "common.hpp"
+#include <cstddef>
+#include <tuple>
+
+namespace arm_conv
+{
+namespace depthwise
+{
+using arm_gemm::Nothing;
+
+enum class DepthwiseMethod
+{
+ DEFAULT,
+ DEPTHFIRST,
+ PLANAR,
+};
+
+struct KernelDescription
+{
+ DepthwiseMethod method = DepthwiseMethod::DEFAULT;
+ std::string name = "";
+ bool is_default = false;
+ uint64_t cycle_estimate = 0;
+
+ KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate)
+ : method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate)
+ {
+ }
+
+ KernelDescription() noexcept {};
+};
+
+class IDepthwiseCommon
+{
+public:
+ virtual ~IDepthwiseCommon() = default;
+
+ // Get the name of the depthwise implementation
+ virtual std::string name() const = 0;
+
+ // Determine the amount of storage space required for the rearranged weights
+ // and bias.
+ virtual size_t get_storage_size(void) const = 0;
+
+ // Rearrange the weights and biases into a storage buffer.
+ // Accepts a pointer to a buffer into which to store the packed parameters, a
+  // pointer to the bias vector (which may be nullptr in the case of no bias) and
+ // a pointer to the array of weights (stored in HWIO order).
+ virtual void pack_parameters(
+ void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0;
+
+ // Determine the amount of working space required
+ virtual size_t get_working_size(unsigned int n_threads) const = 0;
+
+ // Execute the convolution over the specified area of memory.
+ virtual void execute(const void *input, // Pointer to input tensor
+ const void *parameters, // Packed parameters buffer
+ void *output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual void execute(const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual void execute(unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int channels,
+ const PaddingValues &,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+};
+
+// To handle a dilation factor of D, execute the kernel once for each d in
+// [0..D). Each `d` corresponds to a portion or "view" of the input and output
+// tensors. The output view corresponds to every Dth pixel starting from `d`;
+// this function computes how many pixels are covered. The input view consists
+// of an amount of before padding, every Dth pixel starting from an offset, and
+// some after padding. This function computes the start padding, input offset,
+// number of valid input pixels, and the after padding.
+//
+// Returns
+// - Number of valid output pixels corresponding to `d`
+// - Number of valid input pixels corresponding to `d`
+// - Offset of the first input pixel corresponding to `d`
+// - Amount of padding before the view for `d`
+// - Amount of padding after the view for `d`
+std::tuple<size_t, size_t, size_t, size_t, size_t> get_reduced_view_for_dilation(size_t out_size,
+ size_t in_size,
+ size_t d,
+ size_t dilation_factor,
+ size_t kernel_size,
+ size_t stride,
+ size_t pad_before);
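+
+// Worked example (illustrative): for dilation_factor D = 2 the output splits
+// into two interleaved views; view d = 0 covers output pixels {0, 2, 4, ...}
+// and view d = 1 covers {1, 3, 5, ...}, so an out_size of 7 yields views of
+// 4 and 3 pixels respectively. The input view and padding are reduced in the
+// same way, so each view can be executed as an ordinary (dilation-1) kernel.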
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp
index fdc18aef39..045f9f95d3 100644
--- a/src/core/NEON/kernels/assembly/pool_common.hpp
+++ b/src/core/NEON/kernels/assembly/pool_common.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,10 +23,9 @@
*/
#pragma once
-
#ifdef CYCLE_PROFILING
#include "profiler.hpp"
-#endif // CYCLE_PROFILING
+#endif
namespace arm_conv
{
@@ -69,54 +68,42 @@ public:
virtual size_t get_working_size(unsigned int num_threads) const = 0;
// Execute pooling over the specified area of memory.
- virtual void execute(
- const void *const input,
- void *const output,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
-
- virtual void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(const void *const input,
+ void *const output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
- virtual void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
-};
+ virtual void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
-struct Nothing
-{
-};
-
-template <typename TInput, typename TOutput, class OutputStage = Nothing>
-class PoolingCommon : public IPoolingCommon
-{
+ virtual void execute(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const PaddingValues &,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp
index 2325bd08ca..89d594298e 100644
--- a/src/core/NEON/kernels/assembly/pooling.hpp
+++ b/src/core/NEON/kernels/assembly/pooling.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,8 +27,6 @@
#include "arm_gemm_local.hpp"
#include "pool_common.hpp"
-#include <memory>
-
namespace arm_conv
{
namespace pooling
@@ -38,9 +36,8 @@ struct PoolingConfig
PoolingMethod method = PoolingMethod::DEFAULT;
std::string filter = "";
- PoolingConfig(PoolingMethod method)
- : method(method) {};
- PoolingConfig() {};
+ PoolingConfig(PoolingMethod method) : method(method){};
+ PoolingConfig(){};
};
struct PoolingArgs
@@ -59,36 +56,50 @@ struct PoolingArgs
const PoolingConfig *config;
- PoolingArgs(
- const CPUInfo *cpu_info,
- PoolingType pool_type,
- const PoolingWindow &window,
- const PoolingStride &stride,
- bool exclude_padding,
- unsigned int n_batches,
- unsigned int input_rows,
- unsigned int input_cols,
- unsigned int n_channels,
- unsigned int output_rows,
- unsigned int output_cols,
- const PaddingValues &padding,
- const PoolingConfig *cfg)
- : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
- n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg)
+ PoolingArgs(const CPUInfo *cpu_info,
+ PoolingType pool_type,
+ const PoolingWindow &window,
+ const PoolingStride &stride,
+ bool exclude_padding,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int n_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ const PaddingValues &padding,
+ const PoolingConfig *cfg)
+ : cpu_info(cpu_info),
+ pool_type(pool_type),
+ pool_window(window),
+ pool_stride(stride),
+ exclude_padding(exclude_padding),
+ n_batches(n_batches),
+ input_rows(input_rows),
+ input_cols(input_cols),
+ n_channels(n_channels),
+ output_rows(output_rows),
+ output_cols(output_cols),
+ padding(padding),
+ config(cfg)
{
// If either of the pooling window dimensions are set to zero, meaning
// "pool everything", then replace with the corresponding input dimension.
- if(pool_window.rows == 0)
+ if (pool_window.rows == 0)
{
pool_window.rows = input_rows;
}
- if(pool_window.cols == 0)
+ if (pool_window.cols == 0)
{
pool_window.cols = input_cols;
}
}
};
+struct Nothing
+{
+};
+
struct Requantize32
{
int32_t input_offset = 0;
@@ -98,20 +109,117 @@ struct Requantize32
int32_t per_layer_right_shift = 0;
int32_t per_layer_mul = 0;
- Requantize32(int32_t input_offset, int32_t output_offset,
- int32_t per_layer_left_shift, int32_t per_layer_right_shift,
+ Requantize32(int32_t input_offset,
+ int32_t output_offset,
+ int32_t per_layer_left_shift,
+ int32_t per_layer_right_shift,
int32_t per_layer_mul)
- : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul)
+ : input_offset(input_offset),
+ output_offset(output_offset),
+ per_layer_left_shift(per_layer_left_shift),
+ per_layer_right_shift(per_layer_right_shift),
+ per_layer_mul(per_layer_mul)
{
}
};
-template <typename TInput, typename TOutput, class OutputStage = Nothing>
-using UniquePoolingCommon = std::unique_ptr<PoolingCommon<TInput, TOutput, OutputStage>>;
+template <typename TInput, typename TOutput>
+class PoolingCommon : public IPoolingCommon
+{
+protected:
+ const PoolingArgs m_args;
+
+public:
+ PoolingCommon(const PoolingArgs &args) : m_args(args)
+ {
+ }
+ PoolingCommon(PoolingCommon &) = delete;
+ PoolingCommon &operator=(PoolingCommon &) = delete;
+
+ size_t get_working_size(unsigned int) const override = 0;
+
+ // Execute pooling over the specified area of memory.
+ void execute(const void *const input,
+ void *const output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
+ {
+ this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols,
+ m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels,
+ m_args.n_channels * m_args.output_cols,
+ m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id,
+ num_threads);
+ }
+
+ void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
+ {
+ this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col,
+ ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output,
+ ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads);
+ }
+
+ void execute(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const PaddingValues &padding,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
+ {
+ this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row,
+ ld_input_batch, output_height, output_width, output, ld_output_col, ld_output_row,
+ ld_output_batch, working_space, thread_id, num_threads);
+ }
+
+protected:
+ virtual void execute_internal(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const PaddingValues &,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
+};
+
+template <typename TInput, typename TOutput>
+using UniquePoolingCommon = std::unique_ptr<PoolingCommon<TInput, TOutput>>;
// Get a pooling engine
template <typename TInput, typename TOutput = TInput, class OutputStage = Nothing>
-UniquePoolingCommon<TInput, TOutput, OutputStage> pooling(const PoolingArgs &, const OutputStage & = {});
+UniquePoolingCommon<TInput, TOutput> pooling(const PoolingArgs &, const OutputStage & = {});
} // namespace pooling
} // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp
new file mode 100644
index 0000000000..fb97cf8baf
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/premultiply.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+void do_premultiply_float_6(const float *in_ptr,
+ const unsigned int ld_row,
+ const unsigned int ld_col,
+ float *out_ptr,
+ const unsigned int out_ld_row,
+ const unsigned int out_ld_col,
+ const unsigned int tile_rows,
+ const unsigned int tile_cols,
+ const unsigned input_channels);
+
+template <typename T>
+void do_premultiply(const T *in_ptr,
+ const unsigned int ld_row,
+ const unsigned int ld_col,
+ T *out_ptr,
+ const unsigned int out_ld_row,
+ const unsigned int out_ld_col,
+ const unsigned int tile_rows,
+ const unsigned int tile_cols,
+ const unsigned input_channels,
+ const unsigned int channel_multiplier)
+{
+ if (sizeof(T) == 4 && channel_multiplier == 6)
+ {
+ do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col,
+ tile_rows, tile_cols, input_channels);
+ }
+ else
+ {
+ for (unsigned int i = 0; i < tile_rows; i++)
+ {
+ const T *ip2 = in_ptr + i * ld_row;
+ T *op2 = out_ptr + i * out_ld_row;
+ for (unsigned int j = 0; j < tile_cols; j++)
+ {
+ const T *ip = ip2;
+ T *op = op2;
+ for (unsigned int c = 0; c < input_channels; c++)
+ {
+ T val = *ip;
+ ip++;
+
+ for (unsigned int r = 0; r < channel_multiplier; r++)
+ {
+ op[r] = val;
+ }
+ op += channel_multiplier;
+ }
+ ip2 += ld_col;
+ op2 += out_ld_col;
+ }
+ }
+ }
+}
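+
+// Illustrative effect (sketch): with input_channels = 2 and
+// channel_multiplier = 3, one pixel's channels [a, b] are expanded to
+// [a, a, a, b, b, b], giving input_channels * channel_multiplier output
+// channels per pixel. The float/multiplier-6 case dispatches to the
+// specialised do_premultiply_float_6 path; everything else takes the
+// generic scalar loop above.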
diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp
new file mode 100644
index 0000000000..dbf95d23cd
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/winograd.hpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+
+namespace arm_conv
+{
+struct Shape2D
+{
+ unsigned int rows, cols;
+};
+
+struct ConvolutionArgs
+{
+ unsigned int n_batches;
+ Shape2D input_shape;
+ unsigned int n_input_channels;
+ unsigned int pad_top, pad_left;
+ Shape2D output_shape;
+ unsigned int n_output_channels;
+ Shape2D kernel_shape;
+ arm_gemm::Activation activation;
+
+ ConvolutionArgs(unsigned int n_batches,
+ const Shape2D &input_shape,
+ unsigned int n_input_channels,
+ unsigned int pad_top,
+ unsigned int pad_left,
+ const Shape2D &output_shape,
+ unsigned int n_output_channels,
+ const Shape2D kernel_shape,
+ const arm_gemm::Activation &activation = {})
+ : n_batches(n_batches),
+ input_shape(input_shape),
+ n_input_channels(n_input_channels),
+ pad_top(pad_top),
+ pad_left(pad_left),
+ output_shape(output_shape),
+ n_output_channels(n_output_channels),
+ kernel_shape(kernel_shape),
+ activation(activation)
+ {
+ }
+};
+
+namespace winograd
+{
+/* Constrain the selected Winograd implementation.
+ */
+struct WinogradConfig
+{
+ unsigned int output_rows = 0, output_cols = 0;
+ std::string input_transform_filter = "";
+ std::string output_transform_filter = "";
+ std::string weight_transform_filter = "";
+};
+
+/* Struct describing (suggested) memory layout within the Winograd domain.
+ */
+struct WinogradDomainSpec
+{
+ size_t weight_matrix_size_bytes, input_matrix_size_bytes, output_matrix_size_bytes;
+
+ size_t weight_ld_matrix, weight_ld_row;
+ size_t input_ld_batch, input_ld_matrix, input_ld_row;
+ size_t output_ld_batch, output_ld_matrix, output_ld_row;
+};
+
+class ITransformCommon
+{
+public:
+ virtual ~ITransformCommon() = default;
+
+ // Get the name of the transform
+ virtual const std::string &get_name(void) const = 0;
+};
+
+namespace weight_transform
+{
+class ITransform : public ITransformCommon
+{
+public:
+ ~ITransform() = default;
+
+ virtual unsigned int get_kernel_rows(void) const = 0;
+ virtual unsigned int get_kernel_cols(void) const = 0;
+
+ virtual unsigned int get_transformed_tile_rows(void) const = 0;
+ virtual unsigned int get_transformed_tile_cols(void) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_input_channel,
+ void *outptr,
+ const WinogradDomainSpec &wds,
+ unsigned int thread_id,
+ unsigned int n_threads) const
+ {
+ this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix,
+ wds.weight_ld_row, thread_id, n_threads);
+ }
+
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_input_channel,
+ void *outptr,
+ size_t ld_out_matrix,
+ size_t ld_out_row,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+};
+
+} // namespace weight_transform
+
+namespace input_transform
+{
+class ITransform : public ITransformCommon
+{
+public:
+ ~ITransform() = default;
+
+ virtual unsigned int get_input_rows(void) const = 0;
+ virtual unsigned int get_input_cols(void) const = 0;
+
+ virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ void *outptr,
+ const WinogradDomainSpec &wds,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const
+ {
+ this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix,
+ wds.input_ld_row, working_space, thread_id, n_threads);
+ }
+
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_matrix,
+ size_t ld_out_row,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+};
+
+} // namespace input_transform
+
+namespace output_transform
+{
+class ITransform : public ITransformCommon
+{
+public:
+ ~ITransform() = default;
+
+ virtual unsigned int get_input_rows(void) const = 0;
+ virtual unsigned int get_input_cols(void) const = 0;
+
+ virtual unsigned int get_output_rows(void) const = 0;
+ virtual unsigned int get_output_cols(void) const = 0;
+
+ virtual unsigned int get_kernel_rows(void) const = 0;
+ virtual unsigned int get_kernel_cols(void) const = 0;
+
+ virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ const WinogradDomainSpec &wds,
+ const void *bias,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_row,
+ size_t ld_out_col,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const
+ {
+ this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr,
+ ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads);
+ }
+
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_matrix,
+ size_t ld_in_row,
+ const void *bias,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_row,
+ size_t ld_out_col,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+};
+
+} // namespace output_transform
+
+struct WinogradImpl
+{
+ const output_transform::ITransform *output_transform = nullptr;
+ const weight_transform::ITransform *weight_transform = nullptr;
+ const input_transform::ITransform *input_transform = nullptr;
+ std::unique_ptr<arm_gemm::GemmArgs> gemm_args;
+ WinogradDomainSpec winograd_spec;
+};
+
+/* Get pointers to Winograd transforms for the given convolution problem.
+ *
+ * Assigns to the pointers in the `dest` struct and returns true if the given
+ * problem can be executed, false otherwise.
+ */
+template <typename TIn,
+ typename TWeight = TIn,
+ typename TOut = TIn,
+ typename TWinogradIn = TIn,
+ typename TWinogradOut = TOut>
+bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation
+ const CPUInfo *,
+ const ConvolutionArgs &,
+ int max_threads,
+ bool fast_mode,
+ const WinogradConfig *,
+ const arm_gemm::GemmConfig *);
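+
+// Example (sketch; argument values are purely illustrative):
+//
+//   WinogradImpl impl;
+//   ConvolutionArgs conv_args(1, {56, 56}, 64, 1, 1, {56, 56}, 64, {3, 3});
+//   if (get_implementation<float>(impl, cpu_info, conv_args, n_threads,
+//                                 false, nullptr, nullptr))
+//   {
+//       // impl.{weight,input,output}_transform and impl.gemm_args are now set.
+//   }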
+
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
index ed5254a0a4..e3d9b670b3 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
+
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -37,12 +38,26 @@ namespace arm_compute
{
namespace
{
-using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window);
+using BatchNomalizationPtr = void (*)(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window);
template <typename T>
-void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
@@ -57,86 +72,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
T activation_functor(act_info);
const auto epsilon_vec = wrapper::vdup_n(static_cast<float16_t>(epsilon), ExactTagType{});
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- // Perform core calculations using vector operations
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Conctruct vectors
- const auto mean_vec = wrapper::vloadq(input_mean + x);
- const auto var_vec = wrapper::vloadq(input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
- const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
-
- // Calculate denominator
- const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- // Calculate x bar
- const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
- const auto x_bar = wrapper::vmul(numerator, denominator);
- auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if(act_info.enabled())
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- activation_functor(res);
+                // Construct vectors
+ const auto mean_vec = wrapper::vloadq(input_mean + x);
+ const auto var_vec = wrapper::vloadq(input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr)
+ ? wrapper::vloadq(input_gamma + x)
+ : wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
+ const auto beta_vec = (input_beta != nullptr)
+ ? wrapper::vloadq(input_beta + x)
+ : wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
+
+ // Calculate denominator
+ const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
}
- // Store results
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Conctruct vectors
- const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
- const float16_t beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
-
- const float16_t denominator = sqrt(input_var[x] + epsilon);
- const float16_t numerator = input_ptr[x] - input_mean[x];
- const float16_t x_bar = numerator / denominator;
- float16_t res = beta + x_bar * gamma;
-
- // Perform fused activation
- if(act_info.enabled())
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- activation_functor(res);
+ // Construct scalars
+ const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
+ const float16_t beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
+
+ const float16_t denominator = sqrt(input_var[x] + epsilon);
+ const float16_t numerator = input_ptr[x] - input_mean[x];
+ const float16_t x_bar = numerator / denominator;
+ float16_t res = beta + x_bar * gamma;
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ *reinterpret_cast<float16_t *>(output_ptr + x) = res;
}
-
- // Store results
- *reinterpret_cast<float16_t *>(output_ptr + x) = res;
- }
- },
- input, output);
+ },
+ input, output);
}
// Fused Batch Normalization with activation functions
-static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map =
-{
- { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float16_t, 8>> }
-};
-}
+static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = {
+ {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float16_t, 8>>}};
+} // namespace
namespace cpu
{
-void fp16_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp16_neon_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
- if(act_info.enabled())
+ if (act_info.enabled())
{
fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window);
}
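For reference, both the vector body and the scalar tail above implement inference-time batch normalization with an optionally fused activation: out = gamma * (x - mean) / sqrt(var + epsilon) + beta. A minimal scalar sketch of the same arithmetic (a standalone illustration, not part of the patch):

#include <cmath>

// One element of fused batch normalization, mirroring the scalar tail above.
inline float batch_norm_ref(float in, float mean, float var, float gamma, float beta, float epsilon)
{
    const float denominator = std::sqrt(var + epsilon);
    const float x_bar       = (in - mean) / denominator;
    float       res         = beta + x_bar * gamma;
    // A fused activation (e.g. ReLU) would be applied to res here.
    return res;
}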
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
index d6e22e1843..4e1654ee6b 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
+
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -36,12 +37,26 @@ namespace arm_compute
{
namespace
{
-using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window);
+using BatchNomalizationPtr = void (*)(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window);
template <typename T>
-void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
@@ -56,86 +71,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
T activation_functor(act_info);
const auto epsilon_vec = wrapper::vdup_n(static_cast<float>(epsilon), ExactTagType{});
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- // Perform core calculations using vector operations
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Conctruct vectors
- const auto mean_vec = wrapper::vloadq(input_mean + x);
- const auto var_vec = wrapper::vloadq(input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<float>(1.f), ExactTagType{});
- const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
-
- // Calculate denominator
- const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- // Calculate x bar
- const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
- const auto x_bar = wrapper::vmul(numerator, denominator);
- auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if(act_info.enabled())
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- activation_functor(res);
+ // Construct vectors
+ const auto mean_vec = wrapper::vloadq(input_mean + x);
+ const auto var_vec = wrapper::vloadq(input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr)
+ ? wrapper::vloadq(input_gamma + x)
+ : wrapper::vdup_n(static_cast<float>(1.f), ExactTagType{});
+ const auto beta_vec = (input_beta != nullptr)
+ ? wrapper::vloadq(input_beta + x)
+ : wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
+
+ // Calculate denominator
+ const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
}
- // Store results
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Conctruct vectors
- const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
- const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
-
- const float denominator = sqrt(input_var[x] + epsilon);
- const float numerator = input_ptr[x] - input_mean[x];
- const float x_bar = numerator / denominator;
- float res = beta + x_bar * gamma;
-
- // Perform fused activation
- if(act_info.enabled())
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- activation_functor(res);
+ // Construct scalars
+ const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
+ const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
+
+ const float denominator = sqrt(input_var[x] + epsilon);
+ const float numerator = input_ptr[x] - input_mean[x];
+ const float x_bar = numerator / denominator;
+ float res = beta + x_bar * gamma;
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ *reinterpret_cast<float *>(output_ptr + x) = res;
}
-
- // Store results
- *reinterpret_cast<float *>(output_ptr + x) = res;
- }
- },
- input, output);
+ },
+ input, output);
}
// Fused Batch Normalization with activation functions
-static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map =
-{
- { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float, 4>> }
-};
-}
+static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = {
+ {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float, 4>>}};
+} // namespace
namespace cpu
{
-void fp32_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp32_neon_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
- if(act_info.enabled())
+ if (act_info.enabled())
{
fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window);
}
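The reindented loops above keep the standard NEON structure: a vector body that advances window_step_x lanes at a time, followed by a scalar loop for the left-over elements. A stripped-down sketch of that idiom (hypothetical function, not library code):

#include <arm_neon.h>

// Scale a range [start, end) with a vector body plus a scalar tail.
void scale_f32(const float *in, float *out, float scale, int start, int end)
{
    const int         step   = 4; // floats per 128-bit NEON vector
    const float32x4_t vscale = vdupq_n_f32(scale);
    int x = start;
    for (; x <= end - step; x += step) // full vectors
    {
        vst1q_f32(out + x, vmulq_f32(vld1q_f32(in + x), vscale));
    }
    for (; x < end; ++x) // left-over elements
    {
        out[x] = in[x] * scale;
    }
}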
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
index a715b9d3ee..48caaa3e63 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2021,2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,20 +24,29 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/SVEMath.h"
#include <cmath>
#include <cstddef>
-#if defined(ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include <arm_sve.h>
namespace arm_compute
{
namespace cpu
{
-void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp16_sve_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -48,69 +57,74 @@ void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
const auto epsilon_vec = svdup_n_f16(epsilon);
const auto const_1 = svdup_n_f16(1.f);
const auto const_0 = svdup_n_f16(0.f);
const auto va = svdup_n_f16(act_info.a());
const auto vb = svdup_n_f16(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Conctruct vectors
- const auto mean_vec = svld1_f16(pg, input_mean + x);
- const auto var_vec = svld1_f16(pg, input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1;
- const auto beta_vec = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0;
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
- // Calculate denominator
- const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec);
- auto denominator = svrsqrte_f16(tmp);
- denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
- denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ // Construct vectors
+ const auto mean_vec = svld1_f16(pg, input_mean + x);
+ const auto var_vec = svld1_f16(pg, input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1;
+ const auto beta_vec = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0;
- // Calculate x bar
- const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec);
- const auto x_bar = svmul_f16_z(pg, numerator, denominator);
- auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec);
+ // Calculate denominator
+ const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec);
+ auto denominator = svrsqrte_f16(tmp);
+ denominator =
+ svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
+ denominator =
+ svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
- // Perform fused activation
- if(act_info.enabled())
- {
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
- {
- res = svmax_f16_z(pg, const_0, res);
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res));
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Calculate x bar
+ const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec);
+ const auto x_bar = svmul_f16_z(pg, numerator, denominator);
+ auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
{
- res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res));
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ res = svmax_f16_z(pg, const_0, res);
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res));
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res));
+ }
}
- }
- // Store results
- svst1_f16(pg, output_ptr + x, res);
+ // Store results
+ svst1_f16(pg, output_ptr + x, res);
- x += svcntw();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- input, output);
+ x += svcntw();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
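Unlike the NEON kernels, the SVE variant above needs no scalar tail: svwhilelt builds a predicate that is true only for in-range lanes, and the do/while repeats until svptest_any reports an empty predicate. A minimal sketch of that loop shape (hypothetical function; assumes an SVE-enabled toolchain):

#include <arm_sve.h>

// Predicated copy over [start, end); the predicate replaces a remainder loop.
void copy_f32(const float *in, float *out, int start, int end)
{
    int      x  = start;
    svbool_t pg = svwhilelt_b32(x, end); // active only for lanes still in range
    do
    {
        svst1_f32(pg, out + x, svld1_f32(pg, in + x));
        x += svcntw(); // advance by the hardware vector length in words
        pg = svwhilelt_b32(x, end);
    } while (svptest_any(svptrue_b32(), pg));
}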
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
index 7cc570d8aa..df4fbfe607 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2021,2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,20 +24,29 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/SVEMath.h"
#include <cmath>
#include <cstddef>
-#if defined(ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include <arm_sve.h>
namespace arm_compute
{
namespace cpu
{
-void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp32_sve_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -48,69 +57,74 @@ void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
const auto epsilon_vec = svdup_n_f32(epsilon);
const auto const_1 = svdup_n_f32(1.f);
const auto const_0 = svdup_n_f32(0.f);
const auto va = svdup_n_f32(act_info.a());
const auto vb = svdup_n_f32(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b32(x, window_end_x);
- do
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Conctruct vectors
- const auto mean_vec = svld1_f32(pg, input_mean + x);
- const auto var_vec = svld1_f32(pg, input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1;
- const auto beta_vec = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0;
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
- // Calculate denominator
- const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec);
- auto denominator = svrsqrte_f32(tmp);
- denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
- denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b32(x, window_end_x);
+ do
+ {
+ // Construct vectors
+ const auto mean_vec = svld1_f32(pg, input_mean + x);
+ const auto var_vec = svld1_f32(pg, input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1;
+ const auto beta_vec = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0;
- // Calculate x bar
- const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec);
- const auto x_bar = svmul_f32_z(pg, numerator, denominator);
- auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec);
+ // Calculate denominator
+ const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec);
+ auto denominator = svrsqrte_f32(tmp);
+ denominator =
+ svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
+ denominator =
+ svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
- // Perform fused activation
- if(act_info.enabled())
- {
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
- {
- res = svmax_f32_z(pg, const_0, res);
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res));
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Calculate x bar
+ const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec);
+ const auto x_bar = svmul_f32_z(pg, numerator, denominator);
+ auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
{
- res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res));
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ res = svmax_f32_z(pg, const_0, res);
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res));
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res));
+ }
}
- }
- // Store results
- svst1_f32(pg, output_ptr + x, res);
+ // Store results
+ svst1_f32(pg, output_ptr + x, res);
- x += svcntw();
- pg = svwhilelt_b32(x, window_end_x);
- }
- while(svptest_any(svptrue_b32(), pg));
- },
- input, output);
+ x += svcntw();
+ pg = svwhilelt_b32(x, window_end_x);
+ } while (svptest_any(svptrue_b32(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
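The reflowed denominator lines above are two Newton-Raphson refinements of the svrsqrte reciprocal-square-root estimate. FRSQRTS (svrsqrts) computes (3 - a*b) / 2, so each svmul(svrsqrts(svmul(tmp, y), y), y) applies the update y' = y * (3 - tmp*y*y) / 2. The same step in plain floats:

// One Newton-Raphson step towards 1/sqrt(t), as performed by
// svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, t, y), y), y).
inline float refine_rsqrt(float t, float y)
{
    return y * ((3.0f - (t * y) * y) * 0.5f);
}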
diff --git a/src/core/NEON/kernels/batchnormalization/impl/list.h b/src/core/NEON/kernels/batchnormalization/impl/list.h
index 8e0ea36f5a..c619788125 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/list.h
+++ b/src/core/NEON/kernels/batchnormalization/impl/list.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,24 +21,39 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_BATCH_NORMALIZATION_LIST_H
-#define SRC_CORE_NEON_KERNELS_BATCH_NORMALIZATION_LIST_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_BATCHNORMALIZATION_IMPL_LIST_H
+#define ACL_SRC_CORE_NEON_KERNELS_BATCHNORMALIZATION_IMPL_LIST_H
namespace arm_compute
{
namespace cpu
{
-#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \
- void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, \
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \
+ void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, \
+ const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_neon_batch_normalization);
DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_sve_batch_normalization);
DECLARE_BATCH_NORMALIZATION_KERNEL(fp32_neon_batch_normalization);
DECLARE_BATCH_NORMALIZATION_KERNEL(fp32_sve_batch_normalization);
-#undef DECLARE_ACTIVATION_KERNEL
+#define DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(func_name) \
+ void func_name(const Window &window, ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, \
+ const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo act_info)
+
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused_relu);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused_brelu);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused_lubrelu);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused_relu);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused_brelu);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused_lubrelu);
+
+#undef DECLARE_BATCH_NORMALIZATION_KERNEL
+#undef DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL
+
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CORE_NEON_KERNELS_BATCH_NORMALIZATION_LIST_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_BATCHNORMALIZATION_IMPL_LIST_H
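The DECLARE_* macros above only stamp out prototypes. For example, DECLARE_BATCH_NORMALIZATION_KERNEL(fp32_neon_batch_normalization) expands to:

void fp32_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var,
                                   const ITensor *beta, const ITensor *gamma, float epsilon,
                                   ActivationLayerInfo &act_info, const Window &window);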
diff --git a/src/core/NEON/kernels/convolution/common/padding.cpp b/src/core/NEON/kernels/convolution/common/padding.cpp
index f57706fef6..5960e66968 100644
--- a/src/core/NEON/kernels/convolution/common/padding.cpp
+++ b/src/core/NEON/kernels/convolution/common/padding.cpp
@@ -81,7 +81,7 @@ template void copy_and_pad_tile(
template void copy_and_pad_tile(
unsigned int, unsigned int, unsigned int,
- const float *, unsigned int, unsigned int,
+ float const *, unsigned int, unsigned int,
float *, unsigned int, unsigned int,
unsigned int, unsigned int, unsigned int, unsigned int, float
);
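This hunk changes spelling only: const float * and float const * name the same type (pointer to const float), so the explicit instantiation is unaffected. A compile-time check of that equivalence:

#include <type_traits>

static_assert(std::is_same<const float *, float const *>::value,
              "east and west const spellings name the same type");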
diff --git a/src/core/NEON/kernels/convolution/common/padding.hpp b/src/core/NEON/kernels/convolution/common/padding.hpp
index b6f95872c0..397d902e29 100644
--- a/src/core/NEON/kernels/convolution/common/padding.hpp
+++ b/src/core/NEON/kernels/convolution/common/padding.hpp
@@ -34,20 +34,20 @@ namespace padding
*/
template <typename T>
void copy_and_pad_tile(
- unsigned int tile_rows,
- unsigned int tile_cols,
- unsigned int n_channels,
- const T *inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- T* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride,
- unsigned int pad_top,
- unsigned int pad_left,
- unsigned int pad_bottom,
- unsigned int pad_right,
- T pad_value=static_cast<T>(0)
+ const unsigned int tile_rows,
+ const unsigned int tile_cols,
+ const unsigned int n_channels,
+ const T * const inptr,
+ const unsigned int in_row_stride,
+ const unsigned int in_col_stride,
+ T* const outptr,
+ const unsigned int out_row_stride,
+ const unsigned int out_col_stride,
+ const unsigned int pad_top,
+ const unsigned int pad_left,
+ const unsigned int pad_bottom,
+ const unsigned int pad_right,
+ const T pad_value=static_cast<T>(0)
);
/** Copy a tile and remove padding elements in the output.
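For orientation, a naive scalar version of copy_and_pad_tile (interior elements copied through the given strides, the pad border filled with pad_value) could look like the sketch below. This is inferred from the signature and parameter names above, not the library's implementation:

// Reference semantics sketch: tile_rows/tile_cols describe the padded output tile.
template <typename T>
void copy_and_pad_tile_ref(unsigned int tile_rows, unsigned int tile_cols, unsigned int n_channels,
                           const T *inptr, unsigned int in_row_stride, unsigned int in_col_stride,
                           T *outptr, unsigned int out_row_stride, unsigned int out_col_stride,
                           unsigned int pad_top, unsigned int pad_left,
                           unsigned int pad_bottom, unsigned int pad_right,
                           T pad_value = static_cast<T>(0))
{
    for (unsigned int i = 0; i < tile_rows; i++)
    {
        for (unsigned int j = 0; j < tile_cols; j++)
        {
            const bool padded = i < pad_top || i >= tile_rows - pad_bottom ||
                                j < pad_left || j >= tile_cols - pad_right;
            T *out = outptr + i * out_row_stride + j * out_col_stride;
            if (padded)
            {
                for (unsigned int c = 0; c < n_channels; c++)
                    out[c] = pad_value; // border element: write the pad value
            }
            else
            {
                const T *in = inptr + (i - pad_top) * in_row_stride + (j - pad_left) * in_col_stride;
                for (unsigned int c = 0; c < n_channels; c++)
                    out[c] = in[c]; // interior element: plain copy
            }
        }
    }
}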
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp
deleted file mode 100644
index 70d6689731..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <arm_neon.h>
-#include "activation.hpp"
-#include "padding.hpp"
-
-namespace depthwise
-{
-
-namespace nck = neon_convolution_kernels;
-
-class IDepthwiseConvolution
-{
- public:
- virtual ~IDepthwiseConvolution() = default;
-
- virtual int output_size(
- int dim_size,
- unsigned int padding_before,
- unsigned int padding_after
- ) const = 0;
-
- /* Set input tensor and stride. */
- virtual void set_input(const void *inptr) = 0;
- virtual void set_input(const void *inptr, int column_stride) = 0;
- virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0;
- virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0;
-
- /* Set output tensor and stride. */
- virtual void set_output(void *outptr) = 0;
- virtual void set_output(void *outptr, int column_stride) = 0;
- virtual void set_output(void *outptr, int row_stride, int column_stride) = 0;
- virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0;
-
- /* Weights and biases are re-ordered to improve memory access patterns. Use
- * these methods to determine the size of the re-pack buffer and to set the
- * address (and implicitly reorder the weights and biases into) the buffer.
- */
- virtual size_t get_packed_params_size(void) const = 0;
- virtual void set_packed_params_buffer(void *) = 0;
-
- virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0;
- virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0;
- virtual void pack_params(
- void *buffer,
- const void* weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const = 0;
-
- /* Working space is used to pad tensors on the fly. Before running any
- * inference check the amount of space required, allocate and provide a
- * pointer to the convolution engine.
- */
- virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
- virtual void set_working_space(void *) = 0;
-
- virtual unsigned int get_window(void) const = 0;
- virtual void run(
- unsigned int start,
- unsigned int stop,
- unsigned int threadid=0
- ) = 0;
-};
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename TIn, typename TBias, typename TOut,
- typename Derived
->
-class DepthwiseConvolutionBase : public IDepthwiseConvolution
-{
- public:
- // Information about the specific convolution instance
- using InputType = TIn;
- using BiasType = TBias;
- using OutputType = TOut;
- static constexpr int output_tile_rows = OutputTileRows;
- static constexpr int output_tile_cols = OutputTileCols;
- static constexpr int kernel_rows = KernelRows;
- static constexpr int kernel_cols = KernelCols;
- static constexpr int stride_rows = StrideRows;
- static constexpr int stride_cols = StrideCols;
- static constexpr int inner_tile_rows = stride_rows * (output_tile_rows - 1) + kernel_rows;
- static constexpr int inner_tile_cols = stride_cols * (output_tile_cols - 1) + kernel_cols;
-
- /** Create a new depthwise convolution engine.
- *
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- */
- DepthwiseConvolutionBase(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- /** Create a new depthwise convolution engine.
- *
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- */
- DepthwiseConvolutionBase(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- // Cannot copy or move a DepthwiseConvolution.
- DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete;
- DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete;
-
- /* Set input tensor and stride. */
- void set_input(const void *inptr) override;
- void set_input(const void *inptr, int column_stride) override;
- void set_input(const void *inptr, int row_stride, int column_stride) override;
- void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
-
- /* Set output tensor and stride. */
- void set_output(void *outptr) override;
- void set_output(void *outptr, int column_stride) override;
- void set_output(void *outptr, int row_stride, int column_stride) override;
- void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
-
- /** Get the number of output rows/columns.
- *
- * @param[in] dim_size Number of elements in the dimension (rows/columns)
- * @param[in] same_padding True if the padding is SAME, otherwise false.
- */
- static int get_output_size(
- int dim_size, unsigned int padding_before, unsigned int padding_after
- );
-
- int output_size(
- int dim_size, unsigned int padding_before, unsigned int padding_after
- ) const override;
-
- /* Determine how much memory is required to store the packed weights and
- * biases.
- */
- size_t get_packed_params_size(void) const override;
-
- /* Set the buffer for the packed weights and biases, and perform the
- * packing.
- */
- void set_packed_params_buffer(void *buffer) override;
-
- void pack_params(const void *weights, const void *biases=nullptr) const override;
-
- void pack_params(
- void *buffer,
- const void *weights,
- const void *biases=nullptr
- ) const override;
-
- void pack_params(
- void *buffer,
- const void *weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const override;
-
- /** Query the amount of working space required.
- * @param[in] The largest number of threads which will be used to execute
- * the kernel.
- */
- size_t get_working_space_size(unsigned int n_threads=1) const override;
-
- /** Set the working space buffer.
- */
- void set_working_space(void *buffer) override;
-
- /** Get the window of work to be performed by an instance of the operator.
- */
- unsigned int get_window(void) const override;
-
- /** Perform a portion of the work associated with the operator.
- *
- * Will perform the window of work described by $[start, stop)$.
- *
- * @param[in] start Start of the window of work to perform.
- * @param[in] stop End of the work to perform.
- * @param[in] ID of the thread performing the work.
- */
- void run(
- unsigned int start,
- unsigned int stop,
- unsigned int threadid=0
- ) override;
-
- protected:
- /** Get the value to use to pad the tensor.
- */
- TIn _input_padding_value(void) const;
-
- /** Implementation of the parameter packing.
- */
- void _pack_params(
- void *buffer,
- const void *weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const;
-
- /** Process a tile-row of the tensors.
- */
- void process_tile_row(
- unsigned int threadid,
- int n_channels,
- const void* packed_params,
- const InputType* inptr,
- OutputType* outptr,
- int row_pad_in_top,
- int row_pad_in_left,
- int row_pad_in_bottom,
- int row_pad_out_bottom,
- int n_tiles,
- int n_input_cols,
- int n_output_cols
- );
-
- /** Process a single tile of the tensor.
- *
- * This method will apply input/output padding (if required) and call the
- * depthwise tile implementation.
- */
- void process_tile(
- unsigned int threadid,
- int n_channels,
- const void* packed_params,
- const InputType* inptr,
- OutputType* outptr,
- int pad_in_top,
- int pad_in_left,
- int pad_in_bottom,
- int pad_in_right,
- int pad_out_bottom,
- int pad_out_right
- );
-
- /** Perform depthwise convolution on a single tile.
- */
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const InputType* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- OutputType* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const InputType* inptrs[inner_tile_rows][inner_tile_cols],
- OutputType* outptrs[output_tile_rows][output_tile_cols]
- );
-
- int n_channels(void) const;
-
- private:
- // Member variables of instances of a convolution engine.
- const InputType* _input;
- OutputType* _output;
- void* _packed_parameters;
- void* _working_space; // Per-thread working space
- const int _n_batches, _n_input_rows, _n_input_cols, _n_channels,
- _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols;
- const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right;
- const nck::ActivationFunction _activation;
-
- // Stride information for a convolution instance
- int _input_col_stride, _input_row_stride, _input_batch_stride;
- int _output_col_stride, _output_row_stride, _output_batch_stride;
-
- // Methods for getting access to working space
- size_t _get_input_working_space_size(void) const;
- size_t _get_output_working_space_size(void) const;
-
- void *_get_input_working_space(unsigned int threadid) const;
- void *_get_output_working_space(unsigned int threadid) const;
-};
-
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename TIn, typename TBias, typename TOut
->
-class DepthwiseConvolution : public DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut
- >
->
-{
- using Base = DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut
- > >;
- friend Base;
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- using Base::DepthwiseConvolutionBase;
-
- protected:
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const TIn* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- TOut* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const InputType* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- OutputType* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-};
-
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-class DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float
-> : public DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float
- >
->
-{
- using Base = DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float
- > >;
- friend Base;
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- protected:
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const float* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- float* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const float* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- float* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-class DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t
-> : public DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t
- >
->
-{
- using Base = DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t
- > >;
- friend Base;
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- protected:
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const float16_t* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- float16_t* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const float16_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- float16_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-};
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-} // namespace depthwise
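The deleted interface was consumed through a get_window()/run() pattern: a scheduler queries the total window of work, then hands [start, stop) sub-ranges to run(), one per worker. A hypothetical caller, for context only:

#include <algorithm>

// Drives the removed IDepthwiseConvolution interface over n_threads ranges
// (shown sequentially; a real scheduler would dispatch each range to a thread).
void run_depthwise(depthwise::IDepthwiseConvolution &conv, unsigned int n_threads)
{
    const unsigned int window = conv.get_window();
    const unsigned int chunk  = (window + n_threads - 1) / n_threads;
    for (unsigned int t = 0; t < n_threads; ++t)
    {
        const unsigned int start = t * chunk;
        const unsigned int stop  = std::min(window, start + chunk);
        if (start < stop)
        {
            conv.run(start, stop, t);
        }
    }
}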
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
deleted file mode 100644
index 864c6e24a0..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
+++ /dev/null
@@ -1,1168 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x26, %[inptr0], %[input_row_stride]\n"
- "add x21, %[input_col_stride1], %[input_col_stride1]\n"
- "add x23, %[outptr0], %[output_row_stride]\n"
- "add x27, x26, %[input_row_stride]\n"
- "add x22, x21, %[input_col_stride1]\n"
- "and x24, %[n_channels], #3\n"
- "add x28, x27, %[input_row_stride]\n"
- "lsr x25, %[n_channels], #2\n"
- "cbz x25, 4f\n"
- "1:\n"
- "ldr q15, [%[wbptr]]\n"
- "subs x25, x25, #1\n"
- "mov v3.16b, v15.16b\n"
- "ldr q14, [%[wbptr], #16]\n"
- "mov v1.16b, v15.16b\n"
- "ldr q13, [%[wbptr], #32]\n"
- "mov v2.16b, v15.16b\n"
- "ldr q12, [%[wbptr], #48]\n"
- "mov v0.16b, v15.16b\n"
- "ldr q11, [%[wbptr], #64]\n"
- "ldr q10, [%[wbptr], #80]\n"
- "ldr q9, [%[wbptr], #96]\n"
- "ldr q8, [%[wbptr], #112]\n"
- "ldr q7, [%[wbptr], #128]\n"
- "ldr q6, [%[wbptr], #144]\n"
- "ldr q24, [%[inptr0]]\n"
- "fmla v3.4s, v24.4s, v14.4s\n"
- "ldr q22, [x26]\n"
- "fmla v1.4s, v22.4s, v14.4s\n"
- "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v19.4s, v14.4s\n"
- "ldr q18, [x27]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "ldr q21, [x26, %[input_col_stride1]]\n"
- "fmla v1.4s, v18.4s, v11.4s\n"
- "ldr q17, [%[inptr0], x21]\n"
- "ldr q20, [x28]\n"
- "ldr q5, [x27, %[input_col_stride1]]\n"
- "fmla v3.4s, v19.4s, v13.4s\n"
- "fmla v3.4s, v18.4s, v8.4s\n"
- "beq 3f\n"
- "2:\n"
- "fmla v3.4s, v21.4s, v10.4s\n"
- "ldr q19, [x26, x21]\n"
- "fmla v1.4s, v21.4s, v13.4s\n"
- "ldr q23, [%[inptr0], x22]\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "ldr q22, [x28, %[input_col_stride1]]\n"
- "fmla v0.4s, v21.4s, v14.4s\n"
- "ldr q21, [x27, x21]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr q18, [x26, x22]\n"
- "fmla v2.4s, v17.4s, v13.4s\n"
- "ldr q16, [x28, x21]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "ldr q20, [x27, x22]\n"
- "fmla v3.4s, v5.4s, v7.4s\n"
- "ldr q4, [x28, x22]\n"
- "fmla v2.4s, v5.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v1.4s, v5.4s, v10.4s\n"
- "ldr q15, [%[wbptr]]\n"
- "fmla v0.4s, v5.4s, v11.4s\n"
- "ldr q14, [%[wbptr], #16]\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v1.4s, v19.4s, v12.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v2.4s, v19.4s, v10.4s\n"
- "ldr q11, [%[wbptr], #64]\n"
- "fmla v0.4s, v19.4s, v13.4s\n"
- "ldr q24, [%[inptr0]]\n"
- "fmla v1.4s, v22.4s, v7.4s\n"
- "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "ldr q17, [%[inptr0], x21]\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "ldr q13, [%[wbptr], #32]\n"
- "fmla v3.4s, v21.4s, v6.4s\n"
- "add x26, x26, #16\n"
- "fmla v1.4s, v21.4s, v9.4s\n"
- "ldr q22, [x26]\n"
- "fmla v2.4s, v21.4s, v7.4s\n"
- "ldr q8, [%[wbptr], #112]\n"
- "str q3, [%[outptr0]]\n"
- "fmla v0.4s, v21.4s, v10.4s\n"
- "fmla v1.4s, v16.4s, v6.4s\n"
- "ldr q21, [x26, %[input_col_stride1]]\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "add x27, x27, #16\n"
- "fmla v0.4s, v18.4s, v12.4s\n"
- "ldr q10, [%[wbptr], #80]\n"
- "str q1, [x23]\n"
- "mov v3.16b, v15.16b\n"
- "fmla v2.4s, v20.4s, v6.4s\n"
- "ldr q18, [x27]\n"
- "fmla v0.4s, v16.4s, v7.4s\n"
- "ldr q12, [%[wbptr], #48]\n"
- "mov v1.16b, v15.16b\n"
- "ldr q5, [x27, %[input_col_stride1]]\n"
- "str q2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v3.4s, v24.4s, v14.4s\n"
- "fmla v0.4s, v20.4s, v9.4s\n"
- "ldr q7, [%[wbptr], #128]\n"
- "mov v2.16b, v15.16b\n"
- "add x28, x28, #16\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "ldr q20, [x28]\n"
- "fmla v0.4s, v4.4s, v6.4s\n"
- "ldr q9, [%[wbptr], #96]\n"
- "fmla v1.4s, v22.4s, v14.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v3.4s, v19.4s, v13.4s\n"
- "subs x25, x25, #1\n"
- "str q0, [x23, %[output_col_stride1]]\n"
- "fmla v2.4s, v19.4s, v14.4s\n"
- "ldr q6, [%[wbptr], #144]\n"
- "add x23, x23, #16\n"
- "fmla v3.4s, v18.4s, v8.4s\n"
- "fmla v1.4s, v18.4s, v11.4s\n"
- "mov v0.16b, v15.16b\n"
- "bne 2b\n"
- "3:\n"
- "fmla v3.4s, v21.4s, v10.4s\n"
- "ldr q19, [x26, x21]\n"
- "fmla v1.4s, v21.4s, v13.4s\n"
- "ldr q23, [%[inptr0], x22]\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "ldr q22, [x28, %[input_col_stride1]]\n"
- "fmla v0.4s, v21.4s, v14.4s\n"
- "ldr q21, [x27, x21]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr q18, [x26, x22]\n"
- "fmla v2.4s, v17.4s, v13.4s\n"
- "ldr q16, [x28, x21]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "ldr q20, [x27, x22]\n"
- "fmla v3.4s, v5.4s, v7.4s\n"
- "ldr q4, [x28, x22]\n"
- "fmla v2.4s, v5.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v1.4s, v5.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v5.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "add x26, x26, #16\n"
- "fmla v1.4s, v19.4s, v12.4s\n"
- "add x27, x27, #16\n"
- "fmla v2.4s, v19.4s, v10.4s\n"
- "add x28, x28, #16\n"
- "fmla v0.4s, v19.4s, v13.4s\n"
- "fmla v3.4s, v21.4s, v6.4s\n"
- "fmla v1.4s, v22.4s, v7.4s\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "str q3, [%[outptr0]]\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "fmla v1.4s, v21.4s, v9.4s\n"
- "fmla v2.4s, v21.4s, v7.4s\n"
- "fmla v0.4s, v21.4s, v10.4s\n"
- "fmla v1.4s, v16.4s, v6.4s\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "fmla v0.4s, v18.4s, v12.4s\n"
- "str q1, [x23]\n"
- "fmla v2.4s, v20.4s, v6.4s\n"
- "fmla v0.4s, v16.4s, v7.4s\n"
- "str q2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v20.4s, v9.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v0.4s, v4.4s, v6.4s\n"
- "str q0, [x23, %[output_col_stride1]]\n"
- "add x23, x23, #16\n"
- "4:\n"
- "cbz x24, 7f\n"
- "ldr s15, [%[wbptr]]\n"
- "mov v3.16b, v15.16b\n"
- "ldr s14, [%[wbptr], #4]\n"
- "mov v1.16b, v15.16b\n"
- "ldr s13, [%[wbptr], #8]\n"
- "mov v2.16b, v15.16b\n"
- "ldr s12, [%[wbptr], #12]\n"
- "mov v0.16b, v15.16b\n"
- "ldr s11, [%[wbptr], #16]\n"
- "ldr s10, [%[wbptr], #20]\n"
- "subs x24, x24, #1\n"
- "ldr s9, [%[wbptr], #24]\n"
- "ldr s8, [%[wbptr], #28]\n"
- "ldr s7, [%[wbptr], #32]\n"
- "ldr s6, [%[wbptr], #36]\n"
- "ldr s24, [%[inptr0]]\n"
- "ldr s22, [x26]\n"
- "fmla v3.4s, v24.4s, v14.4s\n"
- "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v1.4s, v22.4s, v14.4s\n"
- "ldr s18, [x27]\n"
- "fmla v2.4s, v19.4s, v14.4s\n"
- "ldr s21, [x26, %[input_col_stride1]]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "ldr s17, [%[inptr0], x21]\n"
- "fmla v1.4s, v18.4s, v11.4s\n"
- "ldr s20, [x28]\n"
- "ldr s5, [x27, %[input_col_stride1]]\n"
- "fmla v3.4s, v19.4s, v13.4s\n"
- "fmla v3.4s, v18.4s, v8.4s\n"
- "beq 6f\n"
- "5:\n"
- "fmla v3.4s, v21.4s, v10.4s\n"
- "ldr s19, [x26, x21]\n"
- "fmla v1.4s, v21.4s, v13.4s\n"
- "ldr s23, [%[inptr0], x22]\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "ldr s22, [x28, %[input_col_stride1]]\n"
- "fmla v0.4s, v21.4s, v14.4s\n"
- "ldr s21, [x27, x21]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr s18, [x26, x22]\n"
- "fmla v2.4s, v17.4s, v13.4s\n"
- "ldr s16, [x28, x21]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "ldr s20, [x27, x22]\n"
- "fmla v3.4s, v5.4s, v7.4s\n"
- "ldr s4, [x28, x22]\n"
- "fmla v2.4s, v5.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v1.4s, v5.4s, v10.4s\n"
- "ldr s15, [%[wbptr]]\n"
- "fmla v0.4s, v5.4s, v11.4s\n"
- "ldr s14, [%[wbptr], #4]\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v1.4s, v19.4s, v12.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v2.4s, v19.4s, v10.4s\n"
- "ldr s11, [%[wbptr], #16]\n"
- "fmla v0.4s, v19.4s, v13.4s\n"
- "ldr s24, [%[inptr0]]\n"
- "fmla v1.4s, v22.4s, v7.4s\n"
- "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "ldr s17, [%[inptr0], x21]\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "ldr s13, [%[wbptr], #8]\n"
- "fmla v3.4s, v21.4s, v6.4s\n"
- "add x26, x26, #4\n"
- "fmla v1.4s, v21.4s, v9.4s\n"
- "ldr s22, [x26]\n"
- "fmla v2.4s, v21.4s, v7.4s\n"
- "ldr s8, [%[wbptr], #28]\n"
- "str s3, [%[outptr0]]\n"
- "fmla v0.4s, v21.4s, v10.4s\n"
- "fmla v1.4s, v16.4s, v6.4s\n"
- "ldr s21, [x26, %[input_col_stride1]]\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "add x27, x27, #4\n"
- "fmla v0.4s, v18.4s, v12.4s\n"
- "ldr s10, [%[wbptr], #20]\n"
- "str s1, [x23]\n"
- "mov v3.16b, v15.16b\n"
- "fmla v2.4s, v20.4s, v6.4s\n"
- "ldr s18, [x27]\n"
- "fmla v0.4s, v16.4s, v7.4s\n"
- "ldr s12, [%[wbptr], #12]\n"
- "mov v1.16b, v15.16b\n"
- "ldr s5, [x27, %[input_col_stride1]]\n"
- "str s2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v3.4s, v24.4s, v14.4s\n"
- "fmla v0.4s, v20.4s, v9.4s\n"
- "ldr s7, [%[wbptr], #32]\n"
- "mov v2.16b, v15.16b\n"
- "add x28, x28, #4\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "ldr s20, [x28]\n"
- "fmla v0.4s, v4.4s, v6.4s\n"
- "ldr s9, [%[wbptr], #24]\n"
- "fmla v1.4s, v22.4s, v14.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v3.4s, v19.4s, v13.4s\n"
- "subs x24, x24, #1\n"
- "str s0, [x23, %[output_col_stride1]]\n"
- "fmla v2.4s, v19.4s, v14.4s\n"
- "ldr s6, [%[wbptr], #36]\n"
- "add x23, x23, #4\n"
- "fmla v3.4s, v18.4s, v8.4s\n"
- "fmla v1.4s, v18.4s, v11.4s\n"
- "mov v0.16b, v15.16b\n"
- "bne 5b\n"
- "6:\n"
- "fmla v3.4s, v21.4s, v10.4s\n"
- "ldr s19, [x26, x21]\n"
- "fmla v1.4s, v21.4s, v13.4s\n"
- "ldr s23, [%[inptr0], x22]\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "ldr s22, [x28, %[input_col_stride1]]\n"
- "fmla v0.4s, v21.4s, v14.4s\n"
- "ldr s21, [x27, x21]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr s18, [x26, x22]\n"
- "fmla v2.4s, v17.4s, v13.4s\n"
- "ldr s16, [x28, x21]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "ldr s20, [x27, x22]\n"
- "fmla v3.4s, v5.4s, v7.4s\n"
- "ldr s4, [x28, x22]\n"
- "fmla v2.4s, v5.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v1.4s, v5.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v5.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "add x26, x26, #4\n"
- "fmla v1.4s, v19.4s, v12.4s\n"
- "add x27, x27, #4\n"
- "fmla v2.4s, v19.4s, v10.4s\n"
- "add x28, x28, #4\n"
- "fmla v0.4s, v19.4s, v13.4s\n"
- "fmla v3.4s, v21.4s, v6.4s\n"
- "fmla v1.4s, v22.4s, v7.4s\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "str s3, [%[outptr0]]\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "fmla v1.4s, v21.4s, v9.4s\n"
- "fmla v2.4s, v21.4s, v7.4s\n"
- "fmla v0.4s, v21.4s, v10.4s\n"
- "fmla v1.4s, v16.4s, v6.4s\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "fmla v0.4s, v18.4s, v12.4s\n"
- "str s1, [x23]\n"
- "fmla v2.4s, v20.4s, v6.4s\n"
- "fmla v0.4s, v16.4s, v7.4s\n"
- "str s2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v20.4s, v9.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v0.4s, v4.4s, v6.4s\n"
- "str s0, [x23, %[output_col_stride1]]\n"
- "add x23, x23, #4\n"
- "7:\n"
- : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
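[Editorial note — not part of the diff] The deleted tile kernels in this file all share one schedule: an `lsr ..., #2` of `n_channels` gives the trip count for a vector loop that handles four channels per iteration in 128-bit `q` registers, and an `and ..., #3` gives a scalar tail that replays the same instruction sequence one channel at a time in `s` registers (hence the paired `#16`/`#4` pointer increments). Per channel, each kernel computes a 2x2 output tile of a 3x3, stride-1 depthwise convolution, seeding each accumulator with the bias that leads the `wbptr` stream (bias, then nine weights, as the `#40`-byte scalar advance shows). A minimal C++ sketch of that per-channel arithmetic, assuming element-count strides; the name `tile_2x2_3x3_s1` is illustrative, not library API:

#include <cstddef>

// One 2x2 output tile of a 3x3 depthwise convolution, stride 1, for a
// single channel. weights[9] is the 3x3 kernel in row-major order; bias
// leads the weight stream, matching the wbptr layout above.
static void tile_2x2_3x3_s1(const float *input, std::size_t in_row, std::size_t in_col,
                            float *output, std::size_t out_row, std::size_t out_col,
                            const float *weights, float bias)
{
    for (std::size_t oi = 0; oi < 2; ++oi)
    {
        for (std::size_t oj = 0; oj < 2; ++oj)
        {
            float acc = bias; // "mov vAcc.16b, vBias.16b" in the assembly
            for (std::size_t ki = 0; ki < 3; ++ki)
            {
                for (std::size_t kj = 0; kj < 3; ++kj)
                {
                    // one scalar lane of "fmla vAcc.4s, vIn.4s, vW.4s"
                    acc += input[(oi + ki) * in_row + (oj + kj) * in_col] * weights[ki * 3 + kj];
                }
            }
            output[oi * out_row + oj * out_col] = acc;
        }
    }
}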
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x21, %[inptr0], %[input_row_stride]\n"
- "add x24, %[input_col_stride1], %[input_col_stride1]\n"
- "add x22, %[outptr0], %[output_row_stride]\n"
- "add x23, x21, %[input_row_stride]\n"
- "add x27, x24, %[input_col_stride1]\n"
- "and x25, %[n_channels], #3\n"
- "add x28, x23, %[input_row_stride]\n"
- "lsr x26, %[n_channels], #2\n"
- "cbz x26, 4f\n"
- "1:\n"
- "ldr q11, [%[wbptr]]\n"
- "subs x26, x26, #1\n"
- "mov v17.16b, v11.16b\n"
- "ldr q13, [%[wbptr], #16]\n"
- "mov v15.16b, v11.16b\n"
- "ldr q4, [%[wbptr], #32]\n"
- "mov v16.16b, v11.16b\n"
- "ldr q2, [%[wbptr], #48]\n"
- "mov v14.16b, v11.16b\n"
- "ldr q5, [%[wbptr], #64]\n"
- "ldr q10, [%[wbptr], #80]\n"
- "ldr q1, [%[wbptr], #96]\n"
- "ldr q12, [%[wbptr], #112]\n"
- "ldr q0, [%[wbptr], #128]\n"
- "ldr q3, [%[wbptr], #144]\n"
- "ldr q6, [%[inptr0]]\n"
- "fmla v17.4s, v6.4s, v13.4s\n"
- "ldr q27, [x21]\n"
- "fmla v15.4s, v27.4s, v13.4s\n"
- "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "ldr q24, [x23]\n"
- "fmla v17.4s, v27.4s, v5.4s\n"
- "ldr q22, [x21, %[input_col_stride1]]\n"
- "ldr q9, [%[inptr0], x24]\n"
- "ldr q8, [x28]\n"
- "ldr q20, [x23, %[input_col_stride1]]\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "beq 3f\n"
- "2:\n"
- "fmla v17.4s, v24.4s, v12.4s\n"
- "ldr q26, [x21, x24]\n"
- "fmla v15.4s, v24.4s, v5.4s\n"
- "ldr q27, [%[inptr0], x27]\n"
- "fmla v16.4s, v22.4s, v5.4s\n"
- "ldr q25, [x28, %[input_col_stride1]]\n"
- "fmla v17.4s, v22.4s, v10.4s\n"
- "ldr q24, [x23, x24]\n"
- "fmla v15.4s, v22.4s, v4.4s\n"
- "ldr q21, [x21, x27]\n"
- "fmla v14.4s, v22.4s, v13.4s\n"
- "ldr q7, [x28, x24]\n"
- "fmla v17.4s, v9.4s, v2.4s\n"
- "ldr q19, [x23, x27]\n"
- "fmla v16.4s, v9.4s, v4.4s\n"
- "ldr q18, [x28, x27]\n"
- "fmla v15.4s, v8.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v17.4s, v20.4s, v0.4s\n"
- "ldr q11, [%[wbptr]]\n"
- "fmla v16.4s, v20.4s, v12.4s\n"
- "ldr q13, [%[wbptr], #16]\n"
- "fmla v15.4s, v20.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v14.4s, v20.4s, v5.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v17.4s, v26.4s, v1.4s\n"
- "ldr q6, [%[inptr0]]\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "ldr q5, [%[wbptr], #64]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "ldr q9, [%[inptr0], x24]\n"
- "fmla v15.4s, v25.4s, v0.4s\n"
- "add x21, x21, #16\n"
- "fmla v16.4s, v27.4s, v2.4s\n"
- "ldr q27, [x21]\n"
- "fmla v14.4s, v25.4s, v12.4s\n"
- "ldr q4, [%[wbptr], #32]\n"
- "fmla v17.4s, v24.4s, v3.4s\n"
- "ldr q22, [x21, %[input_col_stride1]]\n"
- "fmla v15.4s, v24.4s, v1.4s\n"
- "add x23, x23, #16\n"
- "fmla v16.4s, v24.4s, v0.4s\n"
- "ldr q12, [%[wbptr], #112]\n"
- "fmla v14.4s, v24.4s, v10.4s\n"
- "ldr q24, [x23]\n"
- "fmla v15.4s, v7.4s, v3.4s\n"
- "ldr q20, [x23, %[input_col_stride1]]\n"
- "fmla v16.4s, v21.4s, v1.4s\n"
- "add x28, x28, #16\n"
- "fmla v14.4s, v21.4s, v2.4s\n"
- "ldr q10, [%[wbptr], #80]\n"
- "movi v26.16b, #0\n"
- "ldr q8, [x28]\n"
- "fmla v16.4s, v19.4s, v3.4s\n"
- "subs x26, x26, #1\n"
- "fmla v14.4s, v7.4s, v0.4s\n"
- "ldr q2, [%[wbptr], #48]\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "str q17, [%[outptr0]]\n"
- "str q16, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v14.4s, v19.4s, v1.4s\n"
- "str q15, [x22]\n"
- "mov v17.16b, v11.16b\n"
- "mov v15.16b, v11.16b\n"
- "ldr q0, [%[wbptr], #128]\n"
- "fmla v14.4s, v18.4s, v3.4s\n"
- "ldr q1, [%[wbptr], #96]\n"
- "mov v16.16b, v11.16b\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v17.4s, v6.4s, v13.4s\n"
- "fmla v15.4s, v27.4s, v13.4s\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "ldr q3, [%[wbptr], #144]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "str q14, [x22, %[output_col_stride1]]\n"
- "mov v14.16b, v11.16b\n"
- "add x22, x22, #16\n"
- "fmla v17.4s, v27.4s, v5.4s\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v17.4s, v24.4s, v12.4s\n"
- "ldr q26, [x21, x24]\n"
- "fmla v15.4s, v24.4s, v5.4s\n"
- "ldr q27, [%[inptr0], x27]\n"
- "fmla v16.4s, v22.4s, v5.4s\n"
- "ldr q25, [x28, %[input_col_stride1]]\n"
- "fmla v17.4s, v22.4s, v10.4s\n"
- "ldr q24, [x23, x24]\n"
- "fmla v15.4s, v22.4s, v4.4s\n"
- "ldr q21, [x21, x27]\n"
- "fmla v14.4s, v22.4s, v13.4s\n"
- "ldr q7, [x28, x24]\n"
- "fmla v17.4s, v9.4s, v2.4s\n"
- "ldr q19, [x23, x27]\n"
- "fmla v16.4s, v9.4s, v4.4s\n"
- "ldr q18, [x28, x27]\n"
- "fmla v15.4s, v8.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v17.4s, v20.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v20.4s, v12.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v15.4s, v20.4s, v10.4s\n"
- "add x21, x21, #16\n"
- "fmla v14.4s, v20.4s, v5.4s\n"
- "add x23, x23, #16\n"
- "fmla v17.4s, v26.4s, v1.4s\n"
- "add x28, x28, #16\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "movi v26.16b, #0\n"
- "fmla v17.4s, v24.4s, v3.4s\n"
- "fmla v16.4s, v27.4s, v2.4s\n"
- "fmla v15.4s, v25.4s, v0.4s\n"
- "fmla v14.4s, v25.4s, v12.4s\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "fmla v16.4s, v24.4s, v0.4s\n"
- "str q17, [%[outptr0]]\n"
- "fmla v15.4s, v24.4s, v1.4s\n"
- "fmla v14.4s, v24.4s, v10.4s\n"
- "fmla v16.4s, v21.4s, v1.4s\n"
- "fmla v15.4s, v7.4s, v3.4s\n"
- "fmla v14.4s, v21.4s, v2.4s\n"
- "fmla v16.4s, v19.4s, v3.4s\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "fmla v14.4s, v7.4s, v0.4s\n"
- "str q15, [x22]\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "fmla v14.4s, v19.4s, v1.4s\n"
- "str q16, [%[outptr0], %[output_col_stride1]]\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v14.4s, v18.4s, v3.4s\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "str q14, [x22, %[output_col_stride1]]\n"
- "add x22, x22, #16\n"
- "4:\n"
- "cbz x25, 7f\n"
- "ldr s11, [%[wbptr]]\n"
- "mov v17.16b, v11.16b\n"
- "ldr s13, [%[wbptr], #4]\n"
- "mov v15.16b, v11.16b\n"
- "ldr s4, [%[wbptr], #8]\n"
- "mov v16.16b, v11.16b\n"
- "ldr s2, [%[wbptr], #12]\n"
- "mov v14.16b, v11.16b\n"
- "ldr s5, [%[wbptr], #16]\n"
- "ldr s10, [%[wbptr], #20]\n"
- "subs x25, x25, #1\n"
- "ldr s1, [%[wbptr], #24]\n"
- "ldr s12, [%[wbptr], #28]\n"
- "ldr s0, [%[wbptr], #32]\n"
- "ldr s3, [%[wbptr], #36]\n"
- "ldr s6, [%[inptr0]]\n"
- "ldr s27, [x21]\n"
- "fmla v17.4s, v6.4s, v13.4s\n"
- "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v15.4s, v27.4s, v13.4s\n"
- "ldr s24, [x23]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "ldr s22, [x21, %[input_col_stride1]]\n"
- "fmla v17.4s, v27.4s, v5.4s\n"
- "ldr s9, [%[inptr0], x24]\n"
- "ldr s8, [x28]\n"
- "ldr s20, [x23, %[input_col_stride1]]\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "beq 6f\n"
- "5:\n"
- "fmla v17.4s, v24.4s, v12.4s\n"
- "ldr s26, [x21, x24]\n"
- "fmla v15.4s, v24.4s, v5.4s\n"
- "ldr s27, [%[inptr0], x27]\n"
- "fmla v16.4s, v22.4s, v5.4s\n"
- "ldr s25, [x28, %[input_col_stride1]]\n"
- "fmla v17.4s, v22.4s, v10.4s\n"
- "ldr s24, [x23, x24]\n"
- "fmla v15.4s, v22.4s, v4.4s\n"
- "ldr s21, [x21, x27]\n"
- "fmla v14.4s, v22.4s, v13.4s\n"
- "ldr s7, [x28, x24]\n"
- "fmla v17.4s, v9.4s, v2.4s\n"
- "ldr s19, [x23, x27]\n"
- "fmla v16.4s, v9.4s, v4.4s\n"
- "ldr s18, [x28, x27]\n"
- "fmla v15.4s, v8.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v17.4s, v20.4s, v0.4s\n"
- "ldr s11, [%[wbptr]]\n"
- "fmla v16.4s, v20.4s, v12.4s\n"
- "ldr s13, [%[wbptr], #4]\n"
- "fmla v15.4s, v20.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v14.4s, v20.4s, v5.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v17.4s, v26.4s, v1.4s\n"
- "ldr s6, [%[inptr0]]\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "ldr s5, [%[wbptr], #16]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "ldr s9, [%[inptr0], x24]\n"
- "fmla v15.4s, v25.4s, v0.4s\n"
- "add x21, x21, #4\n"
- "fmla v16.4s, v27.4s, v2.4s\n"
- "ldr s27, [x21]\n"
- "fmla v14.4s, v25.4s, v12.4s\n"
- "ldr s4, [%[wbptr], #8]\n"
- "fmla v17.4s, v24.4s, v3.4s\n"
- "ldr s22, [x21, %[input_col_stride1]]\n"
- "fmla v15.4s, v24.4s, v1.4s\n"
- "add x23, x23, #4\n"
- "fmla v16.4s, v24.4s, v0.4s\n"
- "ldr s12, [%[wbptr], #28]\n"
- "fmla v14.4s, v24.4s, v10.4s\n"
- "ldr s24, [x23]\n"
- "fmla v15.4s, v7.4s, v3.4s\n"
- "ldr s20, [x23, %[input_col_stride1]]\n"
- "fmla v16.4s, v21.4s, v1.4s\n"
- "add x28, x28, #4\n"
- "fmla v14.4s, v21.4s, v2.4s\n"
- "ldr s10, [%[wbptr], #20]\n"
- "movi v26.16b, #0\n"
- "ldr s8, [x28]\n"
- "fmla v16.4s, v19.4s, v3.4s\n"
- "subs x25, x25, #1\n"
- "fmla v14.4s, v7.4s, v0.4s\n"
- "ldr s2, [%[wbptr], #12]\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "str s17, [%[outptr0]]\n"
- "str s16, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v14.4s, v19.4s, v1.4s\n"
- "str s15, [x22]\n"
- "mov v17.16b, v11.16b\n"
- "mov v15.16b, v11.16b\n"
- "ldr s0, [%[wbptr], #32]\n"
- "fmla v14.4s, v18.4s, v3.4s\n"
- "ldr s1, [%[wbptr], #24]\n"
- "mov v16.16b, v11.16b\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v17.4s, v6.4s, v13.4s\n"
- "fmla v15.4s, v27.4s, v13.4s\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "ldr s3, [%[wbptr], #36]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "str s14, [x22, %[output_col_stride1]]\n"
- "mov v14.16b, v11.16b\n"
- "add x22, x22, #4\n"
- "fmla v17.4s, v27.4s, v5.4s\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v17.4s, v24.4s, v12.4s\n"
- "ldr s26, [x21, x24]\n"
- "fmla v15.4s, v24.4s, v5.4s\n"
- "ldr s27, [%[inptr0], x27]\n"
- "fmla v16.4s, v22.4s, v5.4s\n"
- "ldr s25, [x28, %[input_col_stride1]]\n"
- "fmla v17.4s, v22.4s, v10.4s\n"
- "ldr s24, [x23, x24]\n"
- "fmla v15.4s, v22.4s, v4.4s\n"
- "ldr s21, [x21, x27]\n"
- "fmla v14.4s, v22.4s, v13.4s\n"
- "ldr s7, [x28, x24]\n"
- "fmla v17.4s, v9.4s, v2.4s\n"
- "ldr s19, [x23, x27]\n"
- "fmla v16.4s, v9.4s, v4.4s\n"
- "ldr s18, [x28, x27]\n"
- "fmla v15.4s, v8.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v17.4s, v20.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v20.4s, v12.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v15.4s, v20.4s, v10.4s\n"
- "add x21, x21, #4\n"
- "fmla v14.4s, v20.4s, v5.4s\n"
- "add x23, x23, #4\n"
- "fmla v17.4s, v26.4s, v1.4s\n"
- "add x28, x28, #4\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "movi v26.16b, #0\n"
- "fmla v17.4s, v24.4s, v3.4s\n"
- "fmla v16.4s, v27.4s, v2.4s\n"
- "fmla v15.4s, v25.4s, v0.4s\n"
- "fmla v14.4s, v25.4s, v12.4s\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "fmla v16.4s, v24.4s, v0.4s\n"
- "str s17, [%[outptr0]]\n"
- "fmla v15.4s, v24.4s, v1.4s\n"
- "fmla v14.4s, v24.4s, v10.4s\n"
- "fmla v16.4s, v21.4s, v1.4s\n"
- "fmla v15.4s, v7.4s, v3.4s\n"
- "fmla v14.4s, v21.4s, v2.4s\n"
- "fmla v16.4s, v19.4s, v3.4s\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "fmla v14.4s, v7.4s, v0.4s\n"
- "str s15, [x22]\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "fmla v14.4s, v19.4s, v1.4s\n"
- "str s16, [%[outptr0], %[output_col_stride1]]\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v14.4s, v18.4s, v3.4s\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "str s14, [x22, %[output_col_stride1]]\n"
- "add x22, x22, #4\n"
- "7:\n"
- : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
- : [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
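[Editorial note — not part of the diff] The `ReLU` specialization above is the same multiply-accumulate schedule with a fused epilogue: `movi v26.16b, #0` materializes a zero vector once per iteration, and each accumulator passes through `fmax` just before its store, so the activation costs one instruction per output vector and no extra pass over memory. The scalar equivalent, as a hedged sketch:

#include <algorithm>

// Fused ReLU epilogue: clamp the accumulator at zero immediately before
// the store, mirroring "fmax vAcc.4s, vAcc.4s, vZero.4s".
static inline float fused_relu(float acc)
{
    return std::max(acc, 0.0f);
}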
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x21, %[inptr0], %[input_row_stride]\n"
- "add x23, %[input_col_stride1], %[input_col_stride1]\n"
- "add x24, %[outptr0], %[output_row_stride]\n"
- "add x27, x21, %[input_row_stride]\n"
- "add x22, x23, %[input_col_stride1]\n"
- "and x25, %[n_channels], #3\n"
- "add x28, x27, %[input_row_stride]\n"
- "lsr x26, %[n_channels], #2\n"
- "cbz x26, 4f\n"
- "1:\n"
- "ldr q19, [%[wbptr]]\n"
- "subs x26, x26, #1\n"
- "mov v3.16b, v19.16b\n"
- "ldr q12, [%[wbptr], #16]\n"
- "mov v1.16b, v19.16b\n"
- "ldr q11, [%[wbptr], #32]\n"
- "mov v2.16b, v19.16b\n"
- "ldr q10, [%[wbptr], #48]\n"
- "mov v0.16b, v19.16b\n"
- "ldr q13, [%[wbptr], #64]\n"
- "ldr q23, [%[wbptr], #80]\n"
- "ldr q15, [%[wbptr], #96]\n"
- "ldr q20, [%[wbptr], #112]\n"
- "ldr q21, [%[wbptr], #128]\n"
- "ldr q14, [%[wbptr], #144]\n"
- "ldr q16, [%[inptr0]]\n"
- "fmla v3.4s, v16.4s, v12.4s\n"
- "ldr q28, [x21]\n"
- "fmla v1.4s, v28.4s, v12.4s\n"
- "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "ldr q24, [x27]\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "ldr q8, [x21, %[input_col_stride1]]\n"
- "ldr q9, [%[inptr0], x23]\n"
- "ldr q18, [x28]\n"
- "ldr q6, [x27, %[input_col_stride1]]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "beq 3f\n"
- "2:\n"
- "fmla v3.4s, v24.4s, v20.4s\n"
- "ldr q25, [x21, x23]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "ldr q28, [%[inptr0], x22]\n"
- "fmla v2.4s, v8.4s, v13.4s\n"
- "ldr q24, [x28, %[input_col_stride1]]\n"
- "fmla v3.4s, v8.4s, v23.4s\n"
- "ldr q27, [x27, x23]\n"
- "fmla v1.4s, v8.4s, v11.4s\n"
- "ldr q7, [x21, x22]\n"
- "fmla v0.4s, v8.4s, v12.4s\n"
- "ldr q17, [x28, x23]\n"
- "fmla v3.4s, v9.4s, v10.4s\n"
- "ldr q5, [x27, x22]\n"
- "fmla v2.4s, v9.4s, v11.4s\n"
- "ldr q4, [x28, x22]\n"
- "fmla v1.4s, v18.4s, v20.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v3.4s, v6.4s, v21.4s\n"
- "ldr q19, [%[wbptr]]\n"
- "fmla v2.4s, v6.4s, v20.4s\n"
- "ldr q12, [%[wbptr], #16]\n"
- "fmla v1.4s, v6.4s, v23.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v6.4s, v13.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v3.4s, v25.4s, v15.4s\n"
- "ldr q16, [%[inptr0]]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v25.4s, v23.4s\n"
- "ldr q13, [%[wbptr], #64]\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "ldr q9, [%[inptr0], x23]\n"
- "fmla v1.4s, v24.4s, v21.4s\n"
- "add x21, x21, #16\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "ldr q28, [x21]\n"
- "fmla v0.4s, v24.4s, v20.4s\n"
- "ldr q11, [%[wbptr], #32]\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "ldr q8, [x21, %[input_col_stride1]]\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "add x27, x27, #16\n"
- "fmla v2.4s, v27.4s, v21.4s\n"
- "ldr q20, [%[wbptr], #112]\n"
- "fmla v0.4s, v27.4s, v23.4s\n"
- "ldr q24, [x27]\n"
- "fmla v1.4s, v17.4s, v14.4s\n"
- "ldr q6, [x27, %[input_col_stride1]]\n"
- "fmla v2.4s, v7.4s, v15.4s\n"
- "add x28, x28, #16\n"
- "fmla v0.4s, v7.4s, v10.4s\n"
- "ldr q23, [%[wbptr], #80]\n"
- "movi v25.16b, #0\n"
- "ldr q18, [x28]\n"
- "fmla v2.4s, v5.4s, v14.4s\n"
- "subs x26, x26, #1\n"
- "fmla v0.4s, v17.4s, v21.4s\n"
- "ldr q10, [%[wbptr], #48]\n"
- "fmov v26.4s, #6.0\n"
- "fmax v3.4s, v3.4s, v25.4s\n"
- "fmax v2.4s, v2.4s, v25.4s\n"
- "fmax v1.4s, v1.4s, v25.4s\n"
- "fmla v0.4s, v5.4s, v15.4s\n"
- "ldr q21, [%[wbptr], #128]\n"
- "fmin v3.4s, v3.4s, v26.4s\n"
- "fmin v2.4s, v2.4s, v26.4s\n"
- "fmin v1.4s, v1.4s, v26.4s\n"
- "str q3, [%[outptr0]]\n"
- "str q2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v4.4s, v14.4s\n"
- "str q1, [x24]\n"
- "mov v3.16b, v19.16b\n"
- "mov v1.16b, v19.16b\n"
- "ldr q15, [%[wbptr], #96]\n"
- "fmax v0.4s, v0.4s, v25.4s\n"
- "ldr q14, [%[wbptr], #144]\n"
- "mov v2.16b, v19.16b\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmin v0.4s, v0.4s, v26.4s\n"
- "fmla v3.4s, v16.4s, v12.4s\n"
- "fmla v1.4s, v28.4s, v12.4s\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "str q0, [x24, %[output_col_stride1]]\n"
- "mov v0.16b, v19.16b\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "add x24, x24, #16\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v3.4s, v24.4s, v20.4s\n"
- "ldr q25, [x21, x23]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "ldr q28, [%[inptr0], x22]\n"
- "fmla v2.4s, v8.4s, v13.4s\n"
- "ldr q24, [x28, %[input_col_stride1]]\n"
- "fmla v3.4s, v8.4s, v23.4s\n"
- "ldr q27, [x27, x23]\n"
- "fmla v1.4s, v8.4s, v11.4s\n"
- "ldr q7, [x21, x22]\n"
- "fmla v0.4s, v8.4s, v12.4s\n"
- "ldr q17, [x28, x23]\n"
- "fmla v3.4s, v9.4s, v10.4s\n"
- "ldr q5, [x27, x22]\n"
- "fmla v2.4s, v9.4s, v11.4s\n"
- "ldr q4, [x28, x22]\n"
- "fmla v1.4s, v18.4s, v20.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v3.4s, v6.4s, v21.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v2.4s, v6.4s, v20.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v1.4s, v6.4s, v23.4s\n"
- "add x21, x21, #16\n"
- "fmla v0.4s, v6.4s, v13.4s\n"
- "add x27, x27, #16\n"
- "fmla v3.4s, v25.4s, v15.4s\n"
- "add x28, x28, #16\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "fmla v2.4s, v25.4s, v23.4s\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "movi v25.16b, #0\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "fmov v26.4s, #6.0\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "fmla v1.4s, v24.4s, v21.4s\n"
- "fmla v0.4s, v24.4s, v20.4s\n"
- "fmax v3.4s, v3.4s, v25.4s\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "fmla v2.4s, v27.4s, v21.4s\n"
- "fmla v0.4s, v27.4s, v23.4s\n"
- "fmin v3.4s, v3.4s, v26.4s\n"
- "str q3, [%[outptr0]]\n"
- "fmla v2.4s, v7.4s, v15.4s\n"
- "fmla v0.4s, v7.4s, v10.4s\n"
- "fmla v1.4s, v17.4s, v14.4s\n"
- "fmla v2.4s, v5.4s, v14.4s\n"
- "fmla v0.4s, v17.4s, v21.4s\n"
- "fmax v1.4s, v1.4s, v25.4s\n"
- "fmax v2.4s, v2.4s, v25.4s\n"
- "fmla v0.4s, v5.4s, v15.4s\n"
- "fmin v1.4s, v1.4s, v26.4s\n"
- "fmin v2.4s, v2.4s, v26.4s\n"
- "str q1, [x24]\n"
- "str q2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v4.4s, v14.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmax v0.4s, v0.4s, v25.4s\n"
- "fmin v0.4s, v0.4s, v26.4s\n"
- "str q0, [x24, %[output_col_stride1]]\n"
- "add x24, x24, #16\n"
- "4:\n"
- "cbz x25, 7f\n"
- "ldr s19, [%[wbptr]]\n"
- "mov v3.16b, v19.16b\n"
- "ldr s12, [%[wbptr], #4]\n"
- "mov v1.16b, v19.16b\n"
- "ldr s11, [%[wbptr], #8]\n"
- "mov v2.16b, v19.16b\n"
- "ldr s10, [%[wbptr], #12]\n"
- "mov v0.16b, v19.16b\n"
- "ldr s13, [%[wbptr], #16]\n"
- "ldr s23, [%[wbptr], #20]\n"
- "subs x25, x25, #1\n"
- "ldr s15, [%[wbptr], #24]\n"
- "ldr s20, [%[wbptr], #28]\n"
- "ldr s21, [%[wbptr], #32]\n"
- "ldr s14, [%[wbptr], #36]\n"
- "ldr s16, [%[inptr0]]\n"
- "ldr s28, [x21]\n"
- "fmla v3.4s, v16.4s, v12.4s\n"
- "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v1.4s, v28.4s, v12.4s\n"
- "ldr s24, [x27]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "ldr s8, [x21, %[input_col_stride1]]\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "ldr s9, [%[inptr0], x23]\n"
- "ldr s18, [x28]\n"
- "ldr s6, [x27, %[input_col_stride1]]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "beq 6f\n"
- "5:\n"
- "fmla v3.4s, v24.4s, v20.4s\n"
- "ldr s25, [x21, x23]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "ldr s28, [%[inptr0], x22]\n"
- "fmla v2.4s, v8.4s, v13.4s\n"
- "ldr s24, [x28, %[input_col_stride1]]\n"
- "fmla v3.4s, v8.4s, v23.4s\n"
- "ldr s27, [x27, x23]\n"
- "fmla v1.4s, v8.4s, v11.4s\n"
- "ldr s7, [x21, x22]\n"
- "fmla v0.4s, v8.4s, v12.4s\n"
- "ldr s17, [x28, x23]\n"
- "fmla v3.4s, v9.4s, v10.4s\n"
- "ldr s5, [x27, x22]\n"
- "fmla v2.4s, v9.4s, v11.4s\n"
- "ldr s4, [x28, x22]\n"
- "fmla v1.4s, v18.4s, v20.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v3.4s, v6.4s, v21.4s\n"
- "ldr s19, [%[wbptr]]\n"
- "fmla v2.4s, v6.4s, v20.4s\n"
- "ldr s12, [%[wbptr], #4]\n"
- "fmla v1.4s, v6.4s, v23.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v6.4s, v13.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v3.4s, v25.4s, v15.4s\n"
- "ldr s16, [%[inptr0]]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v25.4s, v23.4s\n"
- "ldr s13, [%[wbptr], #16]\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "ldr s9, [%[inptr0], x23]\n"
- "fmla v1.4s, v24.4s, v21.4s\n"
- "add x21, x21, #4\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "ldr s28, [x21]\n"
- "fmla v0.4s, v24.4s, v20.4s\n"
- "ldr s11, [%[wbptr], #8]\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "ldr s8, [x21, %[input_col_stride1]]\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "add x27, x27, #4\n"
- "fmla v2.4s, v27.4s, v21.4s\n"
- "ldr s20, [%[wbptr], #28]\n"
- "fmla v0.4s, v27.4s, v23.4s\n"
- "ldr s24, [x27]\n"
- "fmla v1.4s, v17.4s, v14.4s\n"
- "ldr s6, [x27, %[input_col_stride1]]\n"
- "fmla v2.4s, v7.4s, v15.4s\n"
- "add x28, x28, #4\n"
- "fmla v0.4s, v7.4s, v10.4s\n"
- "ldr s23, [%[wbptr], #20]\n"
- "movi v25.16b, #0\n"
- "ldr s18, [x28]\n"
- "fmla v2.4s, v5.4s, v14.4s\n"
- "subs x25, x25, #1\n"
- "fmla v0.4s, v17.4s, v21.4s\n"
- "ldr s10, [%[wbptr], #12]\n"
- "fmov v26.4s, #6.0\n"
- "fmax v3.4s, v3.4s, v25.4s\n"
- "fmax v2.4s, v2.4s, v25.4s\n"
- "fmax v1.4s, v1.4s, v25.4s\n"
- "fmla v0.4s, v5.4s, v15.4s\n"
- "ldr s21, [%[wbptr], #32]\n"
- "fmin v3.4s, v3.4s, v26.4s\n"
- "fmin v2.4s, v2.4s, v26.4s\n"
- "fmin v1.4s, v1.4s, v26.4s\n"
- "str s3, [%[outptr0]]\n"
- "str s2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v4.4s, v14.4s\n"
- "str s1, [x24]\n"
- "mov v3.16b, v19.16b\n"
- "mov v1.16b, v19.16b\n"
- "ldr s15, [%[wbptr], #24]\n"
- "fmax v0.4s, v0.4s, v25.4s\n"
- "ldr s14, [%[wbptr], #36]\n"
- "mov v2.16b, v19.16b\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmin v0.4s, v0.4s, v26.4s\n"
- "fmla v3.4s, v16.4s, v12.4s\n"
- "fmla v1.4s, v28.4s, v12.4s\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "str s0, [x24, %[output_col_stride1]]\n"
- "mov v0.16b, v19.16b\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "add x24, x24, #4\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v3.4s, v24.4s, v20.4s\n"
- "ldr s25, [x21, x23]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "ldr s28, [%[inptr0], x22]\n"
- "fmla v2.4s, v8.4s, v13.4s\n"
- "ldr s24, [x28, %[input_col_stride1]]\n"
- "fmla v3.4s, v8.4s, v23.4s\n"
- "ldr s27, [x27, x23]\n"
- "fmla v1.4s, v8.4s, v11.4s\n"
- "ldr s7, [x21, x22]\n"
- "fmla v0.4s, v8.4s, v12.4s\n"
- "ldr s17, [x28, x23]\n"
- "fmla v3.4s, v9.4s, v10.4s\n"
- "ldr s5, [x27, x22]\n"
- "fmla v2.4s, v9.4s, v11.4s\n"
- "ldr s4, [x28, x22]\n"
- "fmla v1.4s, v18.4s, v20.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v3.4s, v6.4s, v21.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v2.4s, v6.4s, v20.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v1.4s, v6.4s, v23.4s\n"
- "add x21, x21, #4\n"
- "fmla v0.4s, v6.4s, v13.4s\n"
- "add x27, x27, #4\n"
- "fmla v3.4s, v25.4s, v15.4s\n"
- "add x28, x28, #4\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "fmla v2.4s, v25.4s, v23.4s\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "movi v25.16b, #0\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "fmov v26.4s, #6.0\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "fmla v1.4s, v24.4s, v21.4s\n"
- "fmla v0.4s, v24.4s, v20.4s\n"
- "fmax v3.4s, v3.4s, v25.4s\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "fmla v2.4s, v27.4s, v21.4s\n"
- "fmla v0.4s, v27.4s, v23.4s\n"
- "fmin v3.4s, v3.4s, v26.4s\n"
- "str s3, [%[outptr0]]\n"
- "fmla v2.4s, v7.4s, v15.4s\n"
- "fmla v0.4s, v7.4s, v10.4s\n"
- "fmla v1.4s, v17.4s, v14.4s\n"
- "fmla v2.4s, v5.4s, v14.4s\n"
- "fmla v0.4s, v17.4s, v21.4s\n"
- "fmax v1.4s, v1.4s, v25.4s\n"
- "fmax v2.4s, v2.4s, v25.4s\n"
- "fmla v0.4s, v5.4s, v15.4s\n"
- "fmin v1.4s, v1.4s, v26.4s\n"
- "fmin v2.4s, v2.4s, v26.4s\n"
- "str s1, [x24]\n"
- "str s2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v4.4s, v14.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmax v0.4s, v0.4s, v25.4s\n"
- "fmin v0.4s, v0.4s, v26.4s\n"
- "str s0, [x24, %[output_col_stride1]]\n"
- "add x24, x24, #4\n"
- "7:\n"
- : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
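[Editorial note — not part of the diff] `ReLU6` adds one more constant and one more instruction per output: `fmov v26.4s, #6.0` builds the upper bound (6.0 is representable as an `fmov` floating-point immediate), and each accumulator is clamped with `fmax` against zero and then `fmin` against six before the store. Scalar equivalent, again as a sketch:

#include <algorithm>

// Fused ReLU6 epilogue: y = min(max(x, 0), 6), mirroring the fmax/fmin pair.
static inline float fused_relu6(float acc)
{
    return std::min(std::max(acc, 0.0f), 6.0f);
}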
-#endif // __aarch64__
-
-template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
-
-} // namespace depthwise
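[Editorial note — not part of the diff] One recurring idiom worth flagging before the next file: every inner loop issues `prfm pldl1keep, [%[wbptr], #64]`, a software prefetch 64 bytes ahead of the weight/bias stream, hiding the latency of the reloads that refresh the weight registers each iteration. The closest portable spelling is the GCC/Clang builtin; the locality mapping below is an assumption, not a documented equivalence:

// Prefetch 64 bytes ahead of the weight stream for reading, with high
// temporal locality (roughly "pldl1keep"). __builtin_prefetch is a
// GCC/Clang builtin; rw = 0 means read, locality 3 means keep in cache.
static inline void prefetch_weights(const void *wbptr)
{
    __builtin_prefetch(static_cast<const char *>(wbptr) + 64, 0, 3);
}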
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
deleted file mode 100644
index 2554436172..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
+++ /dev/null
@@ -1,2809 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
-
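[Editorial note — not part of the diff] Reading the deleted `Conv` alias against the instantiation that closed the previous file, the template arguments appear to be <output tile rows, output tile cols, kernel rows, kernel cols, stride rows, stride cols, input type, bias type, output type>: this file covers the 2x2-output-tile, 3x3-kernel, stride-2 fp32 case. Per channel, only the input indexing changes relative to the stride-1 sketch, since each output step now advances the input window by two elements and the tile reads a 5x5 patch; a hedged illustration (hypothetical name `tile_2x2_3x3_s2`):

#include <cstddef>

// As tile_2x2_3x3_s1, but with stride 2: output (oi, oj) reads input rows
// 2*oi .. 2*oi+2 and columns 2*oj .. 2*oj+2, i.e. a 5x5 inner tile.
static void tile_2x2_3x3_s2(const float *input, std::size_t in_row, std::size_t in_col,
                            float *output, std::size_t out_row, std::size_t out_col,
                            const float *weights, float bias)
{
    for (std::size_t oi = 0; oi < 2; ++oi)
    {
        for (std::size_t oj = 0; oj < 2; ++oj)
        {
            float acc = bias;
            for (std::size_t ki = 0; ki < 3; ++ki)
            {
                for (std::size_t kj = 0; kj < 3; ++kj)
                {
                    acc += input[(2 * oi + ki) * in_row + (2 * oj + kj) * in_col] * weights[ki * 3 + kj];
                }
            }
            output[oi * out_row + oj * out_col] = acc;
        }
    }
}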
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x23, %[inptr0], %[input_row_stride]\n"
- "add x19, %[input_col_stride1], %[input_col_stride1]\n"
- "add x22, %[outptr0], %[output_row_stride]\n"
- "add x24, x23, %[input_row_stride]\n"
- "add x20, x19, %[input_col_stride1]\n"
- "and x27, %[n_channels], #3\n"
- "add x25, x24, %[input_row_stride]\n"
- "add x21, x20, %[input_col_stride1]\n"
- "lsr x28, %[n_channels], #2\n"
- "add x26, x25, %[input_row_stride]\n"
- "cbz x28, 4f\n"
- "1:\n"
- "ldr q14, [%[wbptr]]\n"
- "subs x28, x28, #1\n"
- "mov v12.16b, v14.16b\n"
- "ldr q8, [%[wbptr], #16]\n"
- "mov v10.16b, v14.16b\n"
- "ldr q7, [%[wbptr], #32]\n"
- "mov v11.16b, v14.16b\n"
- "ldr q6, [%[wbptr], #48]\n"
- "mov v9.16b, v14.16b\n"
- "ldr q5, [%[wbptr], #64]\n"
- "ldr q4, [%[wbptr], #80]\n"
- "ldr q3, [%[wbptr], #96]\n"
- "ldr q2, [%[wbptr], #112]\n"
- "ldr q1, [%[wbptr], #128]\n"
- "ldr q0, [%[wbptr], #144]\n"
- "ldr q15, [%[inptr0]]\n"
- "fmla v12.4s, v15.4s, v8.4s\n"
- "ldr q20, [x23]\n"
- "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q17, [x24]\n"
- "fmla v10.4s, v17.4s, v8.4s\n"
- "ldr q16, [x23, %[input_col_stride1]]\n"
- "fmla v12.4s, v20.4s, v5.4s\n"
- "ldr q18, [%[inptr0], x19]\n"
- "ldr q14, [x25]\n"
- "ldr q15, [x24, %[input_col_stride1]]\n"
- "fmla v12.4s, v13.4s, v7.4s\n"
- "fmla v12.4s, v17.4s, v2.4s\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "beq 3f\n"
- "2:\n"
- "fmla v11.4s, v18.4s, v8.4s\n"
- "ldr q19, [x23, x19]\n"
- "fmla v10.4s, v14.4s, v5.4s\n"
- "ldr q20, [%[inptr0], x20]\n"
- "fmla v12.4s, v15.4s, v1.4s\n"
- "ldr q14, [x26]\n"
- "fmla v11.4s, v19.4s, v5.4s\n"
- "ldr q13, [x25, %[input_col_stride1]]\n"
- "fmla v10.4s, v15.4s, v7.4s\n"
- "ldr q17, [x24, x19]\n"
- "fmla v12.4s, v19.4s, v3.4s\n"
- "ldr q19, [x23, x20]\n"
- "fmla v11.4s, v20.4s, v7.4s\n"
- "ldr q18, [%[inptr0], x21]\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "ldr q16, [x26, %[input_col_stride1]]\n"
- "fmla v12.4s, v17.4s, v0.4s\n"
- "ldr q14, [x25, x19]\n"
- "fmla v11.4s, v17.4s, v2.4s\n"
- "ldr q15, [x24, x20]\n"
- "fmla v10.4s, v13.4s, v4.4s\n"
- "ldr q13, [x23, x21]\n"
- "str q12, [%[outptr0]]\n"
- "fmla v9.4s, v17.4s, v8.4s\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr q12, [x26, x19]\n"
- "fmla v10.4s, v17.4s, v6.4s\n"
- "ldr q20, [x25, x20]\n"
- "fmla v9.4s, v14.4s, v5.4s\n"
- "ldr q17, [x24, x21]\n"
- "fmla v11.4s, v18.4s, v6.4s\n"
- "ldr q19, [x26, x20]\n"
- "fmla v10.4s, v16.4s, v1.4s\n"
- "ldr q18, [x25, x21]\n"
- "fmla v9.4s, v15.4s, v7.4s\n"
- "ldr q16, [x26, x21]\n"
- "fmla v11.4s, v15.4s, v1.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "ldr q14, [%[wbptr]]\n"
- "fmla v9.4s, v12.4s, v2.4s\n"
- "ldr q8, [%[wbptr], #16]\n"
- "fmla v11.4s, v13.4s, v3.4s\n"
- "ldr q7, [%[wbptr], #32]\n"
- "fmla v10.4s, v12.4s, v0.4s\n"
- "ldr q5, [%[wbptr], #64]\n"
- "fmla v9.4s, v20.4s, v4.4s\n"
- "ldr q2, [%[wbptr], #112]\n"
- "fmla v11.4s, v17.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "str q10, [x22]\n"
- "mov v12.16b, v14.16b\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "ldr q4, [%[wbptr], #80]\n"
- "str q11, [%[outptr0], %[output_col_stride1]]\n"
- "mov v10.16b, v14.16b\n"
- "mov v11.16b, v14.16b\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "ldr q6, [%[wbptr], #48]\n"
- "ldr q15, [%[inptr0]]\n"
- "add x23, x23, #16\n"
- "fmla v12.4s, v15.4s, v8.4s\n"
- "ldr q20, [x23]\n"
- "fmla v9.4s, v18.4s, v3.4s\n"
- "ldr q1, [%[wbptr], #128]\n"
- "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
- "add x24, x24, #16\n"
- "fmla v12.4s, v20.4s, v5.4s\n"
- "ldr q17, [x24]\n"
- "fmla v9.4s, v16.4s, v0.4s\n"
- "ldr q3, [%[wbptr], #96]\n"
- "fmla v10.4s, v17.4s, v8.4s\n"
- "ldr q16, [x23, %[input_col_stride1]]\n"
- "fmla v12.4s, v13.4s, v7.4s\n"
- "ldr q18, [%[inptr0], x19]\n"
- "str q9, [x22, %[output_col_stride1]]\n"
- "add x25, x25, #16\n"
- "mov v9.16b, v14.16b\n"
- "ldr q0, [%[wbptr], #144]\n"
- "fmla v12.4s, v17.4s, v2.4s\n"
- "ldr q14, [x25]\n"
- "ldr q15, [x24, %[input_col_stride1]]\n"
- "add x26, x26, #16\n"
- "add %[outptr0], %[outptr0], #16\n"
- "add x22, x22, #16\n"
- "subs x28, x28, #1\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v11.4s, v18.4s, v8.4s\n"
- "ldr q19, [x23, x19]\n"
- "fmla v10.4s, v14.4s, v5.4s\n"
- "ldr q20, [%[inptr0], x20]\n"
- "fmla v12.4s, v15.4s, v1.4s\n"
- "ldr q14, [x26]\n"
- "fmla v11.4s, v19.4s, v5.4s\n"
- "ldr q13, [x25, %[input_col_stride1]]\n"
- "fmla v10.4s, v15.4s, v7.4s\n"
- "ldr q17, [x24, x19]\n"
- "fmla v12.4s, v19.4s, v3.4s\n"
- "ldr q19, [x23, x20]\n"
- "fmla v11.4s, v20.4s, v7.4s\n"
- "ldr q18, [%[inptr0], x21]\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "ldr q16, [x26, %[input_col_stride1]]\n"
- "fmla v12.4s, v17.4s, v0.4s\n"
- "ldr q14, [x25, x19]\n"
- "fmla v11.4s, v17.4s, v2.4s\n"
- "ldr q15, [x24, x20]\n"
- "fmla v10.4s, v13.4s, v4.4s\n"
- "ldr q13, [x23, x21]\n"
- "str q12, [%[outptr0]]\n"
- "fmla v9.4s, v17.4s, v8.4s\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr q12, [x26, x19]\n"
- "fmla v10.4s, v17.4s, v6.4s\n"
- "ldr q20, [x25, x20]\n"
- "fmla v9.4s, v14.4s, v5.4s\n"
- "ldr q17, [x24, x21]\n"
- "fmla v11.4s, v18.4s, v6.4s\n"
- "ldr q19, [x26, x20]\n"
- "fmla v10.4s, v16.4s, v1.4s\n"
- "ldr q18, [x25, x21]\n"
- "fmla v9.4s, v15.4s, v7.4s\n"
- "ldr q16, [x26, x21]\n"
- "fmla v11.4s, v15.4s, v1.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v9.4s, v12.4s, v2.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v11.4s, v13.4s, v3.4s\n"
- "add x23, x23, #16\n"
- "fmla v10.4s, v12.4s, v0.4s\n"
- "add x24, x24, #16\n"
- "fmla v9.4s, v20.4s, v4.4s\n"
- "add x25, x25, #16\n"
- "fmla v11.4s, v17.4s, v0.4s\n"
- "add x26, x26, #16\n"
- "str q10, [x22]\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "str q11, [%[outptr0], %[output_col_stride1]]\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "fmla v9.4s, v18.4s, v3.4s\n"
- "fmla v9.4s, v16.4s, v0.4s\n"
- "str q9, [x22, %[output_col_stride1]]\n"
- "add x22, x22, #16\n"
- "4:\n"
- "cbz x27, 7f\n"
- "ldr s14, [%[wbptr]]\n"
- "mov v12.16b, v14.16b\n"
- "ldr s8, [%[wbptr], #4]\n"
- "mov v10.16b, v14.16b\n"
- "ldr s7, [%[wbptr], #8]\n"
- "mov v11.16b, v14.16b\n"
- "ldr s6, [%[wbptr], #12]\n"
- "mov v9.16b, v14.16b\n"
- "ldr s5, [%[wbptr], #16]\n"
- "ldr s4, [%[wbptr], #20]\n"
- "subs x27, x27, #1\n"
- "ldr s3, [%[wbptr], #24]\n"
- "ldr s2, [%[wbptr], #28]\n"
- "ldr s1, [%[wbptr], #32]\n"
- "ldr s0, [%[wbptr], #36]\n"
- "ldr s15, [%[inptr0]]\n"
- "ldr s20, [x23]\n"
- "fmla v12.4s, v15.4s, v8.4s\n"
- "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
- "ldr s17, [x24]\n"
- "ldr s16, [x23, %[input_col_stride1]]\n"
- "fmla v10.4s, v17.4s, v8.4s\n"
- "ldr s18, [%[inptr0], x19]\n"
- "fmla v12.4s, v20.4s, v5.4s\n"
- "ldr s14, [x25]\n"
- "ldr s15, [x24, %[input_col_stride1]]\n"
- "fmla v12.4s, v13.4s, v7.4s\n"
- "fmla v12.4s, v17.4s, v2.4s\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "beq 6f\n"
- "5:\n"
- "fmla v11.4s, v18.4s, v8.4s\n"
- "ldr s19, [x23, x19]\n"
- "fmla v10.4s, v14.4s, v5.4s\n"
- "ldr s20, [%[inptr0], x20]\n"
- "fmla v12.4s, v15.4s, v1.4s\n"
- "ldr s14, [x26]\n"
- "fmla v11.4s, v19.4s, v5.4s\n"
- "ldr s13, [x25, %[input_col_stride1]]\n"
- "fmla v10.4s, v15.4s, v7.4s\n"
- "ldr s17, [x24, x19]\n"
- "fmla v12.4s, v19.4s, v3.4s\n"
- "ldr s19, [x23, x20]\n"
- "fmla v11.4s, v20.4s, v7.4s\n"
- "ldr s18, [%[inptr0], x21]\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "ldr s16, [x26, %[input_col_stride1]]\n"
- "fmla v12.4s, v17.4s, v0.4s\n"
- "ldr s14, [x25, x19]\n"
- "fmla v11.4s, v17.4s, v2.4s\n"
- "ldr s15, [x24, x20]\n"
- "fmla v10.4s, v13.4s, v4.4s\n"
- "ldr s13, [x23, x21]\n"
- "str s12, [%[outptr0]]\n"
- "fmla v9.4s, v17.4s, v8.4s\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr s12, [x26, x19]\n"
- "fmla v10.4s, v17.4s, v6.4s\n"
- "ldr s20, [x25, x20]\n"
- "fmla v9.4s, v14.4s, v5.4s\n"
- "ldr s17, [x24, x21]\n"
- "fmla v11.4s, v18.4s, v6.4s\n"
- "ldr s19, [x26, x20]\n"
- "fmla v10.4s, v16.4s, v1.4s\n"
- "ldr s18, [x25, x21]\n"
- "fmla v9.4s, v15.4s, v7.4s\n"
- "ldr s16, [x26, x21]\n"
- "fmla v11.4s, v15.4s, v1.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "ldr s14, [%[wbptr]]\n"
- "fmla v9.4s, v12.4s, v2.4s\n"
- "ldr s8, [%[wbptr], #4]\n"
- "fmla v11.4s, v13.4s, v3.4s\n"
- "ldr s7, [%[wbptr], #8]\n"
- "fmla v10.4s, v12.4s, v0.4s\n"
- "ldr s5, [%[wbptr], #16]\n"
- "fmla v9.4s, v20.4s, v4.4s\n"
- "ldr s2, [%[wbptr], #28]\n"
- "fmla v11.4s, v17.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "str s10, [x22]\n"
- "mov v12.16b, v14.16b\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "ldr s4, [%[wbptr], #20]\n"
- "str s11, [%[outptr0], %[output_col_stride1]]\n"
- "mov v10.16b, v14.16b\n"
- "mov v11.16b, v14.16b\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "ldr s6, [%[wbptr], #12]\n"
- "ldr s15, [%[inptr0]]\n"
- "add x23, x23, #4\n"
- "fmla v12.4s, v15.4s, v8.4s\n"
- "ldr s20, [x23]\n"
- "fmla v9.4s, v18.4s, v3.4s\n"
- "ldr s1, [%[wbptr], #32]\n"
- "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
- "add x24, x24, #4\n"
- "fmla v12.4s, v20.4s, v5.4s\n"
- "ldr s17, [x24]\n"
- "fmla v9.4s, v16.4s, v0.4s\n"
- "ldr s3, [%[wbptr], #24]\n"
- "fmla v10.4s, v17.4s, v8.4s\n"
- "ldr s16, [x23, %[input_col_stride1]]\n"
- "fmla v12.4s, v13.4s, v7.4s\n"
- "ldr s18, [%[inptr0], x19]\n"
- "str s9, [x22, %[output_col_stride1]]\n"
- "add x25, x25, #4\n"
- "mov v9.16b, v14.16b\n"
- "ldr s0, [%[wbptr], #36]\n"
- "fmla v12.4s, v17.4s, v2.4s\n"
- "ldr s14, [x25]\n"
- "ldr s15, [x24, %[input_col_stride1]]\n"
- "add x26, x26, #4\n"
- "add %[outptr0], %[outptr0], #4\n"
- "add x22, x22, #4\n"
- "subs x27, x27, #1\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v11.4s, v18.4s, v8.4s\n"
- "ldr s19, [x23, x19]\n"
- "fmla v10.4s, v14.4s, v5.4s\n"
- "ldr s20, [%[inptr0], x20]\n"
- "fmla v12.4s, v15.4s, v1.4s\n"
- "ldr s14, [x26]\n"
- "fmla v11.4s, v19.4s, v5.4s\n"
- "ldr s13, [x25, %[input_col_stride1]]\n"
- "fmla v10.4s, v15.4s, v7.4s\n"
- "ldr s17, [x24, x19]\n"
- "fmla v12.4s, v19.4s, v3.4s\n"
- "ldr s19, [x23, x20]\n"
- "fmla v11.4s, v20.4s, v7.4s\n"
- "ldr s18, [%[inptr0], x21]\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "ldr s16, [x26, %[input_col_stride1]]\n"
- "fmla v12.4s, v17.4s, v0.4s\n"
- "ldr s14, [x25, x19]\n"
- "fmla v11.4s, v17.4s, v2.4s\n"
- "ldr s15, [x24, x20]\n"
- "fmla v10.4s, v13.4s, v4.4s\n"
- "ldr s13, [x23, x21]\n"
- "str s12, [%[outptr0]]\n"
- "fmla v9.4s, v17.4s, v8.4s\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr s12, [x26, x19]\n"
- "fmla v10.4s, v17.4s, v6.4s\n"
- "ldr s20, [x25, x20]\n"
- "fmla v9.4s, v14.4s, v5.4s\n"
- "ldr s17, [x24, x21]\n"
- "fmla v11.4s, v18.4s, v6.4s\n"
- "ldr s19, [x26, x20]\n"
- "fmla v10.4s, v16.4s, v1.4s\n"
- "ldr s18, [x25, x21]\n"
- "fmla v9.4s, v15.4s, v7.4s\n"
- "ldr s16, [x26, x21]\n"
- "fmla v11.4s, v15.4s, v1.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v9.4s, v12.4s, v2.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v11.4s, v13.4s, v3.4s\n"
- "add x23, x23, #4\n"
- "fmla v10.4s, v12.4s, v0.4s\n"
- "add x24, x24, #4\n"
- "fmla v9.4s, v20.4s, v4.4s\n"
- "add x25, x25, #4\n"
- "fmla v11.4s, v17.4s, v0.4s\n"
- "add x26, x26, #4\n"
- "str s10, [x22]\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "str s11, [%[outptr0], %[output_col_stride1]]\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "fmla v9.4s, v18.4s, v3.4s\n"
- "fmla v9.4s, v16.4s, v0.4s\n"
- "str s9, [x22, %[output_col_stride1]]\n"
- "add x22, x22, #4\n"
- "7:\n"
- : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
- : [n_channels] "r" ((long) n_channels), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
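[Editorial note — not part of the diff] Note how the strided variants pass their strides: the inline-asm input operands scale the element strides by `sizeof(float)` at the call boundary (e.g. `[input_row_stride] "r" (input_row_stride * sizeof(float))`), so inside the assembly every stride register already holds bytes and addressing is plain base-plus-register. A hedged sketch of the same byte-stride convention in C++:

#include <cstddef>

// Address element (row, col) of a tensor whose strides are given in bytes,
// as the assembly sees them after the sizeof(float) scaling at the boundary.
static inline const float *at_bytes(const float *base, std::size_t row, std::size_t col,
                                    std::size_t row_stride_bytes, std::size_t col_stride_bytes)
{
    const char *p = reinterpret_cast<const char *>(base)
                  + row * row_stride_bytes + col * col_stride_bytes;
    return reinterpret_cast<const float *>(p);
}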
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
-)
-{
- __asm __volatile(
- "mov x23, xzr\n"
- "mov x24, xzr\n"
- "and x25, %[n_channels], #3\n"
- "lsr x26, %[n_channels], #2\n"
- "cbz x26, 4f\n"
- "1:\n"
- "ldr q13, [%[wbptr]]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "mov v10.16b, v13.16b\n"
- "ldr q12, [%[wbptr], #16]\n"
- "mov v8.16b, v13.16b\n"
- "ldr q6, [%[wbptr], #32]\n"
- "mov v9.16b, v13.16b\n"
- "ldr q5, [%[wbptr], #48]\n"
- "mov v7.16b, v13.16b\n"
- "ldr q11, [%[wbptr], #64]\n"
- "ldr q4, [%[wbptr], #80]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr q3, [%[wbptr], #96]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr q2, [%[wbptr], #112]\n"
- "ldr x27, [%[inptrs], 120]\n"
- "ldr q1, [%[wbptr], #128]\n"
- "subs x26, x26, #1\n"
- "ldr q0, [%[wbptr], #144]\n"
- "ldr q14, [x19, x23]\n"
- "fmla v10.4s, v14.4s, v12.4s\n"
- "ldr q18, [x20, x23]\n"
- "ldr q14, [x21, x23]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "ldr q16, [x27, x23]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "ldr q19, [x19, x23]\n"
- "ldr x21, [%[inptrs], 88]\n"
- "fmla v10.4s, v18.4s, v11.4s\n"
- "ldr q15, [x20, x23]\n"
- "ldr q18, [x21, x23]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "ldr q13, [x19, x23]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "beq 3f\n"
- "2:\n"
- "fmla v8.4s, v14.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v10.4s, v15.4s, v4.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v9.4s, v13.4s, v12.4s\n"
- "ldr q14, [x20, x23]\n"
- "ldr q17, [x19, x23]\n"
- "ldr x22, [%[inptrs], 160]\n"
- "fmla v8.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 128]\n"
- "fmla v10.4s, v13.4s, v5.4s\n"
- "ldr q15, [x22, x23]\n"
- "fmla v9.4s, v14.4s, v11.4s\n"
- "ldr q19, [x27, x23]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "ldr x20, [%[inptrs], 64]\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v8.4s, v18.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 168]\n"
- "fmla v10.4s, v18.4s, v1.4s\n"
- "ldr q13, [x21, x23]\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "ldr q18, [x20, x23]\n"
- "fmla v7.4s, v13.4s, v12.4s\n"
- "ldr q17, [x19, x23]\n"
- "fmla v8.4s, v15.4s, v2.4s\n"
- "ldr q15, [x22, x23]\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "ldr x27, [%[inptrs], 136]\n"
- "fmla v9.4s, v13.4s, v2.4s\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr q16, [x27, x23]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v8.4s, v19.4s, v4.4s\n"
- "ldr q19, [x21, x23]\n"
- "fmla v10.4s, v13.4s, v0.4s\n"
- "ldr q12, [x20, x23]\n"
- "fmla v9.4s, v18.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 176]\n"
- "fmla v7.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 144]\n"
- "fmla v8.4s, v13.4s, v5.4s\n"
- "ldr q11, [x22, x23]\n"
- "ldr q13, [x27, x23]\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v9.4s, v17.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 184]\n"
- "fmla v7.4s, v19.4s, v6.4s\n"
- "ldr q14, [x21, x23]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr q17, [x22, x23]\n"
- "ldr x27, [%[inptrs], 152]\n"
- "ldr x22, [%[inptrs], 192]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str q10, [x21, x24]\n"
- "fmla v7.4s, v11.4s, v2.4s\n"
- "fmla v8.4s, v16.4s, v3.4s\n"
- "ldr q16, [x27, x23]\n"
- "ldr q15, [x22, x23]\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v9.4s, v12.4s, v3.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v7.4s, v13.4s, v4.4s\n"
- "ldr q13, [%[wbptr]]\n"
- "fmla v8.4s, v11.4s, v0.4s\n"
- "ldr q12, [%[wbptr], #16]\n"
- "mov v10.16b, v13.16b\n"
- "ldr q6, [%[wbptr], #32]\n"
- "fmla v9.4s, v14.4s, v0.4s\n"
- "ldr q11, [%[wbptr], #64]\n"
- "fmla v7.4s, v14.4s, v5.4s\n"
- "ldr q4, [%[wbptr], #80]\n"
- "str q8, [x28, x24]\n"
- "add x23, x23, #16\n"
- "mov v8.16b, v13.16b\n"
- "ldr q2, [%[wbptr], #112]\n"
- "str q9, [x21, x24]\n"
- "ldr x28, [%[outptrs], 24]\n"
- "fmla v7.4s, v17.4s, v1.4s\n"
- "ldr q5, [%[wbptr], #48]\n"
- "mov v9.16b, v13.16b\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr x27, [%[inptrs], 120]\n"
- "subs x26, x26, #1\n"
- "fmla v7.4s, v16.4s, v3.4s\n"
- "ldr q1, [%[wbptr], #128]\n"
- "ldr q14, [x19, x23]\n"
- "fmla v10.4s, v14.4s, v12.4s\n"
- "ldr q18, [x20, x23]\n"
- "ldr q14, [x21, x23]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "fmla v7.4s, v15.4s, v0.4s\n"
- "ldr q3, [%[wbptr], #96]\n"
- "ldr q19, [x19, x23]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "fmla v10.4s, v18.4s, v11.4s\n"
- "ldr q16, [x27, x23]\n"
- "ldr q15, [x20, x23]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "str q7, [x28, x24]\n"
- "ldr x21, [%[inptrs], 88]\n"
- "mov v7.16b, v13.16b\n"
- "ldr q0, [%[wbptr], #144]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "ldr q13, [x19, x23]\n"
- "ldr q18, [x21, x23]\n"
- "add x24, x24, #16\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v8.4s, v14.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v10.4s, v15.4s, v4.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v9.4s, v13.4s, v12.4s\n"
- "ldr q14, [x20, x23]\n"
- "ldr q17, [x19, x23]\n"
- "ldr x22, [%[inptrs], 160]\n"
- "fmla v8.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 128]\n"
- "fmla v10.4s, v13.4s, v5.4s\n"
- "ldr q15, [x22, x23]\n"
- "fmla v9.4s, v14.4s, v11.4s\n"
- "ldr q19, [x27, x23]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "ldr x20, [%[inptrs], 64]\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v8.4s, v18.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 168]\n"
- "fmla v10.4s, v18.4s, v1.4s\n"
- "ldr q13, [x21, x23]\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "ldr q18, [x20, x23]\n"
- "fmla v7.4s, v13.4s, v12.4s\n"
- "ldr q17, [x19, x23]\n"
- "fmla v8.4s, v15.4s, v2.4s\n"
- "ldr q15, [x22, x23]\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "ldr x27, [%[inptrs], 136]\n"
- "fmla v9.4s, v13.4s, v2.4s\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr q16, [x27, x23]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v8.4s, v19.4s, v4.4s\n"
- "ldr q19, [x21, x23]\n"
- "fmla v10.4s, v13.4s, v0.4s\n"
- "ldr q12, [x20, x23]\n"
- "fmla v9.4s, v18.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 176]\n"
- "fmla v7.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 144]\n"
- "fmla v8.4s, v13.4s, v5.4s\n"
- "ldr q11, [x22, x23]\n"
- "ldr q13, [x27, x23]\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v9.4s, v17.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 184]\n"
- "fmla v7.4s, v19.4s, v6.4s\n"
- "ldr q14, [x21, x23]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr q17, [x22, x23]\n"
- "ldr x27, [%[inptrs], 152]\n"
- "ldr x22, [%[inptrs], 192]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str q10, [x21, x24]\n"
- "fmla v7.4s, v11.4s, v2.4s\n"
- "fmla v8.4s, v16.4s, v3.4s\n"
- "ldr q16, [x27, x23]\n"
- "ldr q15, [x22, x23]\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v9.4s, v12.4s, v3.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v7.4s, v13.4s, v4.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v8.4s, v11.4s, v0.4s\n"
- "add x23, x23, #16\n"
- "fmla v9.4s, v14.4s, v0.4s\n"
- "fmla v7.4s, v14.4s, v5.4s\n"
- "str q8, [x28, x24]\n"
- "ldr x28, [%[outptrs], 24]\n"
- "str q9, [x21, x24]\n"
- "fmla v7.4s, v17.4s, v1.4s\n"
- "fmla v7.4s, v16.4s, v3.4s\n"
- "fmla v7.4s, v15.4s, v0.4s\n"
- "str q7, [x28, x24]\n"
- "add x24, x24, #16\n"
- "4:\n"
- "cbz x25, 7f\n"
- "ldr s13, [%[wbptr]]\n"
- "mov v10.16b, v13.16b\n"
- "ldr s12, [%[wbptr], #4]\n"
- "mov v8.16b, v13.16b\n"
- "ldr s6, [%[wbptr], #8]\n"
- "mov v9.16b, v13.16b\n"
- "ldr s5, [%[wbptr], #12]\n"
- "mov v7.16b, v13.16b\n"
- "ldr s11, [%[wbptr], #16]\n"
- "ldr s4, [%[wbptr], #20]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "ldr s3, [%[wbptr], #24]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr s2, [%[wbptr], #28]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr s1, [%[wbptr], #32]\n"
- "ldr x27, [%[inptrs], 120]\n"
- "ldr s0, [%[wbptr], #36]\n"
- "subs x25, x25, #1\n"
- "ldr s14, [x19, x23]\n"
- "ldr s18, [x20, x23]\n"
- "fmla v10.4s, v14.4s, v12.4s\n"
- "ldr s14, [x21, x23]\n"
- "ldr s16, [x27, x23]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "ldr x21, [%[inptrs], 88]\n"
- "ldr s19, [x19, x23]\n"
- "fmla v10.4s, v18.4s, v11.4s\n"
- "ldr s15, [x20, x23]\n"
- "ldr s18, [x21, x23]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "ldr s13, [x19, x23]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "beq 6f\n"
- "5:\n"
- "fmla v8.4s, v14.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v10.4s, v15.4s, v4.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v9.4s, v13.4s, v12.4s\n"
- "ldr s14, [x20, x23]\n"
- "ldr s17, [x19, x23]\n"
- "ldr x22, [%[inptrs], 160]\n"
- "fmla v8.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 128]\n"
- "fmla v10.4s, v13.4s, v5.4s\n"
- "ldr s15, [x22, x23]\n"
- "fmla v9.4s, v14.4s, v11.4s\n"
- "ldr s19, [x27, x23]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "ldr x20, [%[inptrs], 64]\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v8.4s, v18.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 168]\n"
- "fmla v10.4s, v18.4s, v1.4s\n"
- "ldr s13, [x21, x23]\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "ldr s18, [x20, x23]\n"
- "fmla v7.4s, v13.4s, v12.4s\n"
- "ldr s17, [x19, x23]\n"
- "fmla v8.4s, v15.4s, v2.4s\n"
- "ldr s15, [x22, x23]\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "ldr x27, [%[inptrs], 136]\n"
- "fmla v9.4s, v13.4s, v2.4s\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr s16, [x27, x23]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v8.4s, v19.4s, v4.4s\n"
- "ldr s19, [x21, x23]\n"
- "fmla v10.4s, v13.4s, v0.4s\n"
- "ldr s12, [x20, x23]\n"
- "fmla v9.4s, v18.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 176]\n"
- "fmla v7.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 144]\n"
- "fmla v8.4s, v13.4s, v5.4s\n"
- "ldr s11, [x22, x23]\n"
- "ldr s13, [x27, x23]\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v9.4s, v17.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 184]\n"
- "fmla v7.4s, v19.4s, v6.4s\n"
- "ldr s14, [x21, x23]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr s17, [x22, x23]\n"
- "ldr x27, [%[inptrs], 152]\n"
- "ldr x22, [%[inptrs], 192]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str s10, [x21, x24]\n"
- "fmla v7.4s, v11.4s, v2.4s\n"
- "fmla v8.4s, v16.4s, v3.4s\n"
- "ldr s16, [x27, x23]\n"
- "ldr s15, [x22, x23]\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v9.4s, v12.4s, v3.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v7.4s, v13.4s, v4.4s\n"
- "ldr s13, [%[wbptr]]\n"
- "fmla v8.4s, v11.4s, v0.4s\n"
- "ldr s12, [%[wbptr], #4]\n"
- "mov v10.16b, v13.16b\n"
- "ldr s6, [%[wbptr], #8]\n"
- "fmla v9.4s, v14.4s, v0.4s\n"
- "ldr s11, [%[wbptr], #16]\n"
- "fmla v7.4s, v14.4s, v5.4s\n"
- "ldr s4, [%[wbptr], #20]\n"
- "str s8, [x28, x24]\n"
- "add x23, x23, #4\n"
- "mov v8.16b, v13.16b\n"
- "ldr s2, [%[wbptr], #28]\n"
- "str s9, [x21, x24]\n"
- "ldr x28, [%[outptrs], 24]\n"
- "fmla v7.4s, v17.4s, v1.4s\n"
- "ldr s5, [%[wbptr], #12]\n"
- "mov v9.16b, v13.16b\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr x27, [%[inptrs], 120]\n"
- "subs x25, x25, #1\n"
- "fmla v7.4s, v16.4s, v3.4s\n"
- "ldr s1, [%[wbptr], #32]\n"
- "ldr s14, [x19, x23]\n"
- "fmla v10.4s, v14.4s, v12.4s\n"
- "ldr s18, [x20, x23]\n"
- "ldr s14, [x21, x23]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "fmla v7.4s, v15.4s, v0.4s\n"
- "ldr s3, [%[wbptr], #24]\n"
- "ldr s19, [x19, x23]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "fmla v10.4s, v18.4s, v11.4s\n"
- "ldr s16, [x27, x23]\n"
- "ldr s15, [x20, x23]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "str s7, [x28, x24]\n"
- "ldr x21, [%[inptrs], 88]\n"
- "mov v7.16b, v13.16b\n"
- "ldr s0, [%[wbptr], #36]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "ldr s13, [x19, x23]\n"
- "ldr s18, [x21, x23]\n"
- "add x24, x24, #4\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v8.4s, v14.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v10.4s, v15.4s, v4.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v9.4s, v13.4s, v12.4s\n"
- "ldr s14, [x20, x23]\n"
- "ldr s17, [x19, x23]\n"
- "ldr x22, [%[inptrs], 160]\n"
- "fmla v8.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 128]\n"
- "fmla v10.4s, v13.4s, v5.4s\n"
- "ldr s15, [x22, x23]\n"
- "fmla v9.4s, v14.4s, v11.4s\n"
- "ldr s19, [x27, x23]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "ldr x20, [%[inptrs], 64]\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v8.4s, v18.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 168]\n"
- "fmla v10.4s, v18.4s, v1.4s\n"
- "ldr s13, [x21, x23]\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "ldr s18, [x20, x23]\n"
- "fmla v7.4s, v13.4s, v12.4s\n"
- "ldr s17, [x19, x23]\n"
- "fmla v8.4s, v15.4s, v2.4s\n"
- "ldr s15, [x22, x23]\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "ldr x27, [%[inptrs], 136]\n"
- "fmla v9.4s, v13.4s, v2.4s\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr s16, [x27, x23]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v8.4s, v19.4s, v4.4s\n"
- "ldr s19, [x21, x23]\n"
- "fmla v10.4s, v13.4s, v0.4s\n"
- "ldr s12, [x20, x23]\n"
- "fmla v9.4s, v18.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 176]\n"
- "fmla v7.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 144]\n"
- "fmla v8.4s, v13.4s, v5.4s\n"
- "ldr s11, [x22, x23]\n"
- "ldr s13, [x27, x23]\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v9.4s, v17.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 184]\n"
- "fmla v7.4s, v19.4s, v6.4s\n"
- "ldr s14, [x21, x23]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr s17, [x22, x23]\n"
- "ldr x27, [%[inptrs], 152]\n"
- "ldr x22, [%[inptrs], 192]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str s10, [x21, x24]\n"
- "fmla v7.4s, v11.4s, v2.4s\n"
- "fmla v8.4s, v16.4s, v3.4s\n"
- "ldr s16, [x27, x23]\n"
- "ldr s15, [x22, x23]\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v9.4s, v12.4s, v3.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v7.4s, v13.4s, v4.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v8.4s, v11.4s, v0.4s\n"
- "add x23, x23, #4\n"
- "fmla v9.4s, v14.4s, v0.4s\n"
- "fmla v7.4s, v14.4s, v5.4s\n"
- "str s8, [x28, x24]\n"
- "ldr x28, [%[outptrs], 24]\n"
- "str s9, [x21, x24]\n"
- "fmla v7.4s, v17.4s, v1.4s\n"
- "fmla v7.4s, v16.4s, v3.4s\n"
- "fmla v7.4s, v15.4s, v0.4s\n"
- "str s7, [x28, x24]\n"
- "add x24, x24, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr)
- : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
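[Editorial note — not part of the diff] The second `None` specialization above takes no strides at all: the caller precomputes a 5x5 table of input pointers (the inner tile for a 2x2 output, 3x3 kernel, stride 2) and a 2x2 table of output pointers, and the assembly gathers them with `ldr x.., [%[inptrs], N]`, where the byte offsets 0 through 192 in steps of 8 cover the 25 input pointers. A running channel offset (`x23` for inputs, `x24` for outputs) is added to every pointer, advancing by 16 bytes per vector iteration and 4 per scalar one. A hedged sketch of that addressing:

#include <cstddef>

// Gather one channel's value through the pointer table: inptrs[r][c] points
// at channel 0 of input pixel (r, c); chan_off is the running byte offset
// the kernel carries across iterations.
static inline float load_via_table(const float *const inptrs[5][5], std::size_t r, std::size_t c,
                                   std::size_t chan_off)
{
    const char *p = reinterpret_cast<const char *>(inptrs[r][c]) + chan_off;
    return *reinterpret_cast<const float *>(p);
}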
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x24, %[inptr0], %[input_row_stride]\n"
- "add x27, %[input_col_stride1], %[input_col_stride1]\n"
- "add x19, %[outptr0], %[output_row_stride]\n"
- "add x25, x24, %[input_row_stride]\n"
- "add x23, x27, %[input_col_stride1]\n"
- "and x20, %[n_channels], #3\n"
- "add x28, x25, %[input_row_stride]\n"
- "add x22, x23, %[input_col_stride1]\n"
- "lsr x21, %[n_channels], #2\n"
- "add x26, x28, %[input_row_stride]\n"
- "cbz x21, 4f\n"
- "1:\n"
- "ldr q16, [%[wbptr]]\n"
- "subs x21, x21, #1\n"
- "mov v3.16b, v16.16b\n"
- "ldr q4, [%[wbptr], #16]\n"
- "mov v1.16b, v16.16b\n"
- "ldr q5, [%[wbptr], #32]\n"
- "mov v2.16b, v16.16b\n"
- "ldr q12, [%[wbptr], #48]\n"
- "mov v0.16b, v16.16b\n"
- "ldr q11, [%[wbptr], #64]\n"
- "ldr q10, [%[wbptr], #80]\n"
- "ldr q6, [%[wbptr], #96]\n"
- "ldr q9, [%[wbptr], #112]\n"
- "ldr q8, [%[wbptr], #128]\n"
- "ldr q7, [%[wbptr], #144]\n"
- "ldr q21, [%[inptr0]]\n"
- "fmla v3.4s, v21.4s, v4.4s\n"
- "ldr q23, [x24]\n"
- "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q14, [x25]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "ldr q13, [x24, %[input_col_stride1]]\n"
- "fmla v3.4s, v23.4s, v11.4s\n"
- "ldr q18, [%[inptr0], x27]\n"
- "ldr q15, [x28]\n"
- "ldr q22, [x25, %[input_col_stride1]]\n"
- "fmla v3.4s, v19.4s, v5.4s\n"
- "fmla v3.4s, v14.4s, v9.4s\n"
- "beq 3f\n"
- "2:\n"
- "fmla v3.4s, v13.4s, v10.4s\n"
- "ldr q17, [x24, x27]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr q20, [%[inptr0], x23]\n"
- "fmla v1.4s, v15.4s, v11.4s\n"
- "ldr q19, [x26]\n"
- "fmla v3.4s, v18.4s, v12.4s\n"
- "ldr q13, [x28, %[input_col_stride1]]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr q14, [x25, x27]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "ldr q15, [x24, x23]\n"
- "fmla v3.4s, v22.4s, v8.4s\n"
- "ldr q16, [%[inptr0], x22]\n"
- "fmla v2.4s, v20.4s, v5.4s\n"
- "ldr q20, [x26, %[input_col_stride1]]\n"
- "fmla v1.4s, v19.4s, v9.4s\n"
- "ldr q19, [x28, x27]\n"
- "fmla v3.4s, v17.4s, v6.4s\n"
- "ldr q21, [x25, x23]\n"
- "fmla v2.4s, v14.4s, v9.4s\n"
- "ldr q22, [x24, x22]\n"
- "fmla v1.4s, v13.4s, v10.4s\n"
- "ldr q23, [x26, x27]\n"
- "fmla v3.4s, v14.4s, v7.4s\n"
- "ldr q18, [x28, x23]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "ldr q13, [x25, x22]\n"
- "fmla v1.4s, v14.4s, v12.4s\n"
- "ldr q14, [x26, x23]\n"
- "fmla v2.4s, v15.4s, v10.4s\n"
- "ldr q17, [x28, x22]\n"
- "fmla v0.4s, v19.4s, v11.4s\n"
- "ldr q15, [x26, x22]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "ldr q16, [%[wbptr]]\n"
- "fmla v0.4s, v21.4s, v5.4s\n"
- "ldr q4, [%[wbptr], #16]\n"
- "fmla v1.4s, v19.4s, v6.4s\n"
- "ldr q11, [%[wbptr], #64]\n"
- "fmla v2.4s, v21.4s, v8.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "ldr q5, [%[wbptr], #32]\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v2.4s, v22.4s, v6.4s\n"
- "ldr q21, [%[inptr0]]\n"
- "fmla v0.4s, v18.4s, v10.4s\n"
- "ldr q9, [%[wbptr], #112]\n"
- "movi v20.16b, #0\n"
- "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v13.4s, v7.4s\n"
- "ldr q18, [%[inptr0], x27]\n"
- "fmla v0.4s, v13.4s, v12.4s\n"
- "ldr q10, [%[wbptr], #80]\n"
- "fmax v3.4s, v3.4s, v20.4s\n"
- "add x24, x24, #16\n"
- "fmax v2.4s, v2.4s, v20.4s\n"
- "ldr q23, [x24]\n"
- "str q3, [%[outptr0]]\n"
- "fmla v0.4s, v14.4s, v8.4s\n"
- "str q2, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v1.4s, v1.4s, v20.4s\n"
- "mov v3.16b, v16.16b\n"
- "ldr q12, [%[wbptr], #48]\n"
- "str q1, [x19]\n"
- "fmla v0.4s, v17.4s, v6.4s\n"
- "mov v1.16b, v16.16b\n"
- "ldr q8, [%[wbptr], #128]\n"
- "mov v2.16b, v16.16b\n"
- "ldr q13, [x24, %[input_col_stride1]]\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "ldr q6, [%[wbptr], #96]\n"
- "fmla v3.4s, v21.4s, v4.4s\n"
- "add x25, x25, #16\n"
- "ldr q14, [x25]\n"
- "add x28, x28, #16\n"
- "fmax v0.4s, v0.4s, v20.4s\n"
- "ldr q7, [%[wbptr], #144]\n"
- "fmla v3.4s, v23.4s, v11.4s\n"
- "ldr q15, [x28]\n"
- "str q0, [x19, %[output_col_stride1]]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "mov v0.16b, v16.16b\n"
- "ldr q22, [x25, %[input_col_stride1]]\n"
- "fmla v3.4s, v19.4s, v5.4s\n"
- "add x26, x26, #16\n"
- "add %[outptr0], %[outptr0], #16\n"
- "add x19, x19, #16\n"
- "subs x21, x21, #1\n"
- "fmla v3.4s, v14.4s, v9.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v3.4s, v13.4s, v10.4s\n"
- "ldr q17, [x24, x27]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr q20, [%[inptr0], x23]\n"
- "fmla v1.4s, v15.4s, v11.4s\n"
- "ldr q19, [x26]\n"
- "fmla v3.4s, v18.4s, v12.4s\n"
- "ldr q13, [x28, %[input_col_stride1]]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr q14, [x25, x27]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "ldr q15, [x24, x23]\n"
- "fmla v3.4s, v22.4s, v8.4s\n"
- "ldr q16, [%[inptr0], x22]\n"
- "fmla v2.4s, v20.4s, v5.4s\n"
- "ldr q20, [x26, %[input_col_stride1]]\n"
- "fmla v1.4s, v19.4s, v9.4s\n"
- "ldr q19, [x28, x27]\n"
- "fmla v3.4s, v17.4s, v6.4s\n"
- "ldr q21, [x25, x23]\n"
- "fmla v2.4s, v14.4s, v9.4s\n"
- "ldr q22, [x24, x22]\n"
- "fmla v1.4s, v13.4s, v10.4s\n"
- "ldr q23, [x26, x27]\n"
- "fmla v3.4s, v14.4s, v7.4s\n"
- "ldr q18, [x28, x23]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "ldr q13, [x25, x22]\n"
- "fmla v1.4s, v14.4s, v12.4s\n"
- "ldr q14, [x26, x23]\n"
- "fmla v2.4s, v15.4s, v10.4s\n"
- "ldr q17, [x28, x22]\n"
- "fmla v0.4s, v19.4s, v11.4s\n"
- "ldr q15, [x26, x22]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v21.4s, v5.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v1.4s, v19.4s, v6.4s\n"
- "add x24, x24, #16\n"
- "fmla v2.4s, v21.4s, v8.4s\n"
- "add x25, x25, #16\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "add x28, x28, #16\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "add x26, x26, #16\n"
- "fmla v2.4s, v22.4s, v6.4s\n"
- "movi v20.16b, #0\n"
- "fmla v0.4s, v18.4s, v10.4s\n"
- "fmax v3.4s, v3.4s, v20.4s\n"
- "fmla v2.4s, v13.4s, v7.4s\n"
- "fmax v1.4s, v1.4s, v20.4s\n"
- "str q3, [%[outptr0]]\n"
- "fmla v0.4s, v13.4s, v12.4s\n"
- "str q1, [x19]\n"
- "fmax v2.4s, v2.4s, v20.4s\n"
- "fmla v0.4s, v14.4s, v8.4s\n"
- "str q2, [%[outptr0], %[output_col_stride1]]\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v0.4s, v17.4s, v6.4s\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "fmax v0.4s, v0.4s, v20.4s\n"
- "str q0, [x19, %[output_col_stride1]]\n"
- "add x19, x19, #16\n"
- "4:\n"
- "cbz x20, 7f\n"
- "ldr s16, [%[wbptr]]\n"
- "mov v3.16b, v16.16b\n"
- "ldr s4, [%[wbptr], #4]\n"
- "mov v1.16b, v16.16b\n"
- "ldr s5, [%[wbptr], #8]\n"
- "mov v2.16b, v16.16b\n"
- "ldr s12, [%[wbptr], #12]\n"
- "mov v0.16b, v16.16b\n"
- "ldr s11, [%[wbptr], #16]\n"
- "ldr s10, [%[wbptr], #20]\n"
- "subs x20, x20, #1\n"
- "ldr s6, [%[wbptr], #24]\n"
- "ldr s9, [%[wbptr], #28]\n"
- "ldr s8, [%[wbptr], #32]\n"
- "ldr s7, [%[wbptr], #36]\n"
- "ldr s21, [%[inptr0]]\n"
- "ldr s23, [x24]\n"
- "fmla v3.4s, v21.4s, v4.4s\n"
- "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
- "ldr s14, [x25]\n"
- "ldr s13, [x24, %[input_col_stride1]]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "ldr s18, [%[inptr0], x27]\n"
- "fmla v3.4s, v23.4s, v11.4s\n"
- "ldr s15, [x28]\n"
- "ldr s22, [x25, %[input_col_stride1]]\n"
- "fmla v3.4s, v19.4s, v5.4s\n"
- "fmla v3.4s, v14.4s, v9.4s\n"
- "beq 6f\n"
- "5:\n"
- "fmla v3.4s, v13.4s, v10.4s\n"
- "ldr s17, [x24, x27]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr s20, [%[inptr0], x23]\n"
- "fmla v1.4s, v15.4s, v11.4s\n"
- "ldr s19, [x26]\n"
- "fmla v3.4s, v18.4s, v12.4s\n"
- "ldr s13, [x28, %[input_col_stride1]]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr s14, [x25, x27]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "ldr s15, [x24, x23]\n"
- "fmla v3.4s, v22.4s, v8.4s\n"
- "ldr s16, [%[inptr0], x22]\n"
- "fmla v2.4s, v20.4s, v5.4s\n"
- "ldr s20, [x26, %[input_col_stride1]]\n"
- "fmla v1.4s, v19.4s, v9.4s\n"
- "ldr s19, [x28, x27]\n"
- "fmla v3.4s, v17.4s, v6.4s\n"
- "ldr s21, [x25, x23]\n"
- "fmla v2.4s, v14.4s, v9.4s\n"
- "ldr s22, [x24, x22]\n"
- "fmla v1.4s, v13.4s, v10.4s\n"
- "ldr s23, [x26, x27]\n"
- "fmla v3.4s, v14.4s, v7.4s\n"
- "ldr s18, [x28, x23]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "ldr s13, [x25, x22]\n"
- "fmla v1.4s, v14.4s, v12.4s\n"
- "ldr s14, [x26, x23]\n"
- "fmla v2.4s, v15.4s, v10.4s\n"
- "ldr s17, [x28, x22]\n"
- "fmla v0.4s, v19.4s, v11.4s\n"
- "ldr s15, [x26, x22]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "ldr s16, [%[wbptr]]\n"
- "fmla v0.4s, v21.4s, v5.4s\n"
- "ldr s4, [%[wbptr], #4]\n"
- "fmla v1.4s, v19.4s, v6.4s\n"
- "ldr s11, [%[wbptr], #16]\n"
- "fmla v2.4s, v21.4s, v8.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "ldr s5, [%[wbptr], #8]\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v2.4s, v22.4s, v6.4s\n"
- "ldr s21, [%[inptr0]]\n"
- "fmla v0.4s, v18.4s, v10.4s\n"
- "ldr s9, [%[wbptr], #28]\n"
- "movi v20.16b, #0\n"
- "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v13.4s, v7.4s\n"
- "ldr s18, [%[inptr0], x27]\n"
- "fmla v0.4s, v13.4s, v12.4s\n"
- "ldr s10, [%[wbptr], #20]\n"
- "fmax v3.4s, v3.4s, v20.4s\n"
- "add x24, x24, #4\n"
- "fmax v2.4s, v2.4s, v20.4s\n"
- "ldr s23, [x24]\n"
- "str s3, [%[outptr0]]\n"
- "fmla v0.4s, v14.4s, v8.4s\n"
- "str s2, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v1.4s, v1.4s, v20.4s\n"
- "mov v3.16b, v16.16b\n"
- "ldr s12, [%[wbptr], #12]\n"
- "str s1, [x19]\n"
- "fmla v0.4s, v17.4s, v6.4s\n"
- "mov v1.16b, v16.16b\n"
- "ldr s8, [%[wbptr], #32]\n"
- "mov v2.16b, v16.16b\n"
- "ldr s13, [x24, %[input_col_stride1]]\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "ldr s6, [%[wbptr], #24]\n"
- "fmla v3.4s, v21.4s, v4.4s\n"
- "add x25, x25, #4\n"
- "ldr s14, [x25]\n"
- "add x28, x28, #4\n"
- "fmax v0.4s, v0.4s, v20.4s\n"
- "ldr s7, [%[wbptr], #36]\n"
- "fmla v3.4s, v23.4s, v11.4s\n"
- "ldr s15, [x28]\n"
- "str s0, [x19, %[output_col_stride1]]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "mov v0.16b, v16.16b\n"
- "ldr s22, [x25, %[input_col_stride1]]\n"
- "fmla v3.4s, v19.4s, v5.4s\n"
- "add x26, x26, #4\n"
- "add %[outptr0], %[outptr0], #4\n"
- "add x19, x19, #4\n"
- "subs x20, x20, #1\n"
- "fmla v3.4s, v14.4s, v9.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v3.4s, v13.4s, v10.4s\n"
- "ldr s17, [x24, x27]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr s20, [%[inptr0], x23]\n"
- "fmla v1.4s, v15.4s, v11.4s\n"
- "ldr s19, [x26]\n"
- "fmla v3.4s, v18.4s, v12.4s\n"
- "ldr s13, [x28, %[input_col_stride1]]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr s14, [x25, x27]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "ldr s15, [x24, x23]\n"
- "fmla v3.4s, v22.4s, v8.4s\n"
- "ldr s16, [%[inptr0], x22]\n"
- "fmla v2.4s, v20.4s, v5.4s\n"
- "ldr s20, [x26, %[input_col_stride1]]\n"
- "fmla v1.4s, v19.4s, v9.4s\n"
- "ldr s19, [x28, x27]\n"
- "fmla v3.4s, v17.4s, v6.4s\n"
- "ldr s21, [x25, x23]\n"
- "fmla v2.4s, v14.4s, v9.4s\n"
- "ldr s22, [x24, x22]\n"
- "fmla v1.4s, v13.4s, v10.4s\n"
- "ldr s23, [x26, x27]\n"
- "fmla v3.4s, v14.4s, v7.4s\n"
- "ldr s18, [x28, x23]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "ldr s13, [x25, x22]\n"
- "fmla v1.4s, v14.4s, v12.4s\n"
- "ldr s14, [x26, x23]\n"
- "fmla v2.4s, v15.4s, v10.4s\n"
- "ldr s17, [x28, x22]\n"
- "fmla v0.4s, v19.4s, v11.4s\n"
- "ldr s15, [x26, x22]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v21.4s, v5.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v1.4s, v19.4s, v6.4s\n"
- "add x24, x24, #4\n"
- "fmla v2.4s, v21.4s, v8.4s\n"
- "add x25, x25, #4\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "add x28, x28, #4\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "add x26, x26, #4\n"
- "fmla v2.4s, v22.4s, v6.4s\n"
- "movi v20.16b, #0\n"
- "fmla v0.4s, v18.4s, v10.4s\n"
- "fmax v3.4s, v3.4s, v20.4s\n"
- "fmla v2.4s, v13.4s, v7.4s\n"
- "fmax v1.4s, v1.4s, v20.4s\n"
- "str s3, [%[outptr0]]\n"
- "fmla v0.4s, v13.4s, v12.4s\n"
- "str s1, [x19]\n"
- "fmax v2.4s, v2.4s, v20.4s\n"
- "fmla v0.4s, v14.4s, v8.4s\n"
- "str s2, [%[outptr0], %[output_col_stride1]]\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v0.4s, v17.4s, v6.4s\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "fmax v0.4s, v0.4s, v20.4s\n"
- "str s0, [x19, %[output_col_stride1]]\n"
- "add x19, x19, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
- : [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
-)
-{
- __asm __volatile(
- "mov x22, xzr\n"
- "mov x26, xzr\n"
- "and x23, %[n_channels], #3\n"
- "lsr x24, %[n_channels], #2\n"
- "cbz x24, 4f\n"
- "1:\n"
- "ldr q14, [%[wbptr]]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "mov v3.16b, v14.16b\n"
- "ldr q13, [%[wbptr], #16]\n"
- "mov v1.16b, v14.16b\n"
- "ldr q11, [%[wbptr], #32]\n"
- "mov v2.16b, v14.16b\n"
- "ldr q4, [%[wbptr], #48]\n"
- "mov v0.16b, v14.16b\n"
- "ldr q12, [%[wbptr], #64]\n"
- "ldr q9, [%[wbptr], #80]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr q8, [%[wbptr], #96]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr q7, [%[wbptr], #112]\n"
- "ldr x25, [%[inptrs], 120]\n"
- "ldr q6, [%[wbptr], #128]\n"
- "subs x24, x24, #1\n"
- "ldr q5, [%[wbptr], #144]\n"
- "ldr q15, [x19, x22]\n"
- "fmla v3.4s, v15.4s, v13.4s\n"
- "ldr q17, [x20, x22]\n"
- "ldr q16, [x21, x22]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "ldr q15, [x25, x22]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "ldr q10, [x19, x22]\n"
- "ldr x21, [%[inptrs], 88]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr q17, [x20, x22]\n"
- "ldr q14, [x21, x22]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "ldr q18, [x19, x22]\n"
- "fmla v3.4s, v10.4s, v11.4s\n"
- "fmla v3.4s, v16.4s, v7.4s\n"
- "beq 3f\n"
- "2:\n"
- "fmla v1.4s, v16.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v3.4s, v17.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v2.4s, v18.4s, v13.4s\n"
- "ldr q16, [x20, x22]\n"
- "movi v10.16b, #0\n"
- "ldr q17, [x19, x22]\n"
- "fmla v1.4s, v15.4s, v12.4s\n"
- "ldr x27, [%[inptrs], 160]\n"
- "fmla v3.4s, v18.4s, v4.4s\n"
- "ldr x25, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "ldr q18, [x27, x22]\n"
- "ldr q15, [x25, x22]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "fmla v1.4s, v14.4s, v11.4s\n"
- "ldr x20, [%[inptrs], 64]\n"
- "fmla v3.4s, v14.4s, v6.4s\n"
- "ldr q14, [x21, x22]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr q17, [x20, x22]\n"
- "fmla v0.4s, v14.4s, v13.4s\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v1.4s, v18.4s, v7.4s\n"
- "ldr x27, [%[inptrs], 168]\n"
- "fmla v3.4s, v16.4s, v8.4s\n"
- "ldr q18, [x19, x22]\n"
- "fmla v2.4s, v14.4s, v7.4s\n"
- "ldr q13, [x27, x22]\n"
- "ldr x25, [%[inptrs], 136]\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v1.4s, v15.4s, v9.4s\n"
- "ldr x27, [%[inptrs], 176]\n"
- "fmla v3.4s, v14.4s, v5.4s\n"
- "ldr q16, [x25, x22]\n"
- "fmla v2.4s, v17.4s, v9.4s\n"
- "ldr q17, [x21, x22]\n"
- "fmla v0.4s, v16.4s, v12.4s\n"
- "ldr q12, [x20, x22]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "ldr q15, [x27, x22]\n"
- "fmax v3.4s, v3.4s, v10.4s\n"
- "ldr x25, [%[inptrs], 144]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v0.4s, v17.4s, v11.4s\n"
- "ldr q14, [x25, x22]\n"
- "fmla v1.4s, v13.4s, v6.4s\n"
- "ldr q11, [x21, x22]\n"
- "ldr x27, [%[inptrs], 184]\n"
- "ldr x25, [%[inptrs], 152]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str q3, [x21, x26]\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "fmla v1.4s, v16.4s, v8.4s\n"
- "ldr q18, [x27, x22]\n"
- "ldr q17, [x25, x22]\n"
- "ldr x27, [%[inptrs], 192]\n"
- "fmla v2.4s, v12.4s, v8.4s\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v0.4s, v14.4s, v9.4s\n"
- "ldr q16, [x27, x22]\n"
- "fmla v1.4s, v15.4s, v5.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "ldr q14, [%[wbptr]]\n"
- "add x22, x22, #16\n"
- "fmla v2.4s, v11.4s, v5.4s\n"
- "ldr q13, [%[wbptr], #16]\n"
- "fmla v0.4s, v11.4s, v4.4s\n"
- "ldr q11, [%[wbptr], #32]\n"
- "fmax v1.4s, v1.4s, v10.4s\n"
- "ldr q12, [%[wbptr], #64]\n"
- "mov v3.16b, v14.16b\n"
- "ldr q9, [%[wbptr], #80]\n"
- "fmax v2.4s, v2.4s, v10.4s\n"
- "ldr q7, [%[wbptr], #112]\n"
- "str q1, [x28, x26]\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "mov v1.16b, v14.16b\n"
- "ldr q4, [%[wbptr], #48]\n"
- "str q2, [x21, x26]\n"
- "ldr x28, [%[outptrs], 24]\n"
- "mov v2.16b, v14.16b\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v17.4s, v8.4s\n"
- "ldr q6, [%[wbptr], #128]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr x25, [%[inptrs], 120]\n"
- "subs x24, x24, #1\n"
- "ldr q15, [x19, x22]\n"
- "fmla v0.4s, v16.4s, v5.4s\n"
- "ldr q8, [%[wbptr], #96]\n"
- "fmla v3.4s, v15.4s, v13.4s\n"
- "ldr q17, [x20, x22]\n"
- "ldr q16, [x21, x22]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "ldr q15, [x25, x22]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "fmax v0.4s, v0.4s, v10.4s\n"
- "ldr q5, [%[wbptr], #144]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr q10, [x19, x22]\n"
- "ldr q17, [x20, x22]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "str q0, [x28, x26]\n"
- "ldr x21, [%[inptrs], 88]\n"
- "mov v0.16b, v14.16b\n"
- "ldr q18, [x19, x22]\n"
- "fmla v3.4s, v10.4s, v11.4s\n"
- "ldr q14, [x21, x22]\n"
- "add x26, x26, #16\n"
- "fmla v3.4s, v16.4s, v7.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v1.4s, v16.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v3.4s, v17.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v2.4s, v18.4s, v13.4s\n"
- "ldr q16, [x20, x22]\n"
- "movi v10.16b, #0\n"
- "ldr q17, [x19, x22]\n"
- "fmla v1.4s, v15.4s, v12.4s\n"
- "ldr x27, [%[inptrs], 160]\n"
- "fmla v3.4s, v18.4s, v4.4s\n"
- "ldr x25, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "ldr q18, [x27, x22]\n"
- "ldr q15, [x25, x22]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "fmla v1.4s, v14.4s, v11.4s\n"
- "ldr x20, [%[inptrs], 64]\n"
- "fmla v3.4s, v14.4s, v6.4s\n"
- "ldr q14, [x21, x22]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr q17, [x20, x22]\n"
- "fmla v0.4s, v14.4s, v13.4s\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v1.4s, v18.4s, v7.4s\n"
- "ldr x27, [%[inptrs], 168]\n"
- "fmla v3.4s, v16.4s, v8.4s\n"
- "ldr q18, [x19, x22]\n"
- "fmla v2.4s, v14.4s, v7.4s\n"
- "ldr q13, [x27, x22]\n"
- "ldr x25, [%[inptrs], 136]\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v1.4s, v15.4s, v9.4s\n"
- "ldr x27, [%[inptrs], 176]\n"
- "fmla v3.4s, v14.4s, v5.4s\n"
- "ldr q16, [x25, x22]\n"
- "fmla v2.4s, v17.4s, v9.4s\n"
- "ldr q17, [x21, x22]\n"
- "fmla v0.4s, v16.4s, v12.4s\n"
- "ldr q12, [x20, x22]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "ldr q15, [x27, x22]\n"
- "fmax v3.4s, v3.4s, v10.4s\n"
- "ldr x25, [%[inptrs], 144]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v0.4s, v17.4s, v11.4s\n"
- "ldr q14, [x25, x22]\n"
- "fmla v1.4s, v13.4s, v6.4s\n"
- "ldr q11, [x21, x22]\n"
- "ldr x27, [%[inptrs], 184]\n"
- "ldr x25, [%[inptrs], 152]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str q3, [x21, x26]\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "fmla v1.4s, v16.4s, v8.4s\n"
- "ldr q18, [x27, x22]\n"
- "ldr q17, [x25, x22]\n"
- "ldr x27, [%[inptrs], 192]\n"
- "fmla v2.4s, v12.4s, v8.4s\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v0.4s, v14.4s, v9.4s\n"
- "ldr q16, [x27, x22]\n"
- "fmla v1.4s, v15.4s, v5.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "add x22, x22, #16\n"
- "fmla v2.4s, v11.4s, v5.4s\n"
- "fmla v0.4s, v11.4s, v4.4s\n"
- "fmax v1.4s, v1.4s, v10.4s\n"
- "fmax v2.4s, v2.4s, v10.4s\n"
- "str q1, [x28, x26]\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "ldr x28, [%[outptrs], 24]\n"
- "str q2, [x21, x26]\n"
- "fmla v0.4s, v17.4s, v8.4s\n"
- "fmla v0.4s, v16.4s, v5.4s\n"
- "fmax v0.4s, v0.4s, v10.4s\n"
- "str q0, [x28, x26]\n"
- "add x26, x26, #16\n"
- "4:\n"
- "cbz x23, 7f\n"
- "ldr s14, [%[wbptr]]\n"
- "mov v3.16b, v14.16b\n"
- "ldr s13, [%[wbptr], #4]\n"
- "mov v1.16b, v14.16b\n"
- "ldr s11, [%[wbptr], #8]\n"
- "mov v2.16b, v14.16b\n"
- "ldr s4, [%[wbptr], #12]\n"
- "mov v0.16b, v14.16b\n"
- "ldr s12, [%[wbptr], #16]\n"
- "ldr s9, [%[wbptr], #20]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "ldr s8, [%[wbptr], #24]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr s7, [%[wbptr], #28]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr s6, [%[wbptr], #32]\n"
- "ldr x25, [%[inptrs], 120]\n"
- "ldr s5, [%[wbptr], #36]\n"
- "subs x23, x23, #1\n"
- "ldr s15, [x19, x22]\n"
- "ldr s17, [x20, x22]\n"
- "fmla v3.4s, v15.4s, v13.4s\n"
- "ldr s16, [x21, x22]\n"
- "ldr s15, [x25, x22]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "ldr x21, [%[inptrs], 88]\n"
- "ldr s10, [x19, x22]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr s17, [x20, x22]\n"
- "ldr s14, [x21, x22]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "ldr s18, [x19, x22]\n"
- "fmla v3.4s, v10.4s, v11.4s\n"
- "fmla v3.4s, v16.4s, v7.4s\n"
- "beq 6f\n"
- "5:\n"
- "fmla v1.4s, v16.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v3.4s, v17.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v2.4s, v18.4s, v13.4s\n"
- "ldr s16, [x20, x22]\n"
- "movi v10.16b, #0\n"
- "ldr s17, [x19, x22]\n"
- "fmla v1.4s, v15.4s, v12.4s\n"
- "ldr x27, [%[inptrs], 160]\n"
- "fmla v3.4s, v18.4s, v4.4s\n"
- "ldr x25, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "ldr s18, [x27, x22]\n"
- "ldr s15, [x25, x22]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "fmla v1.4s, v14.4s, v11.4s\n"
- "ldr x20, [%[inptrs], 64]\n"
- "fmla v3.4s, v14.4s, v6.4s\n"
- "ldr s14, [x21, x22]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr s17, [x20, x22]\n"
- "fmla v0.4s, v14.4s, v13.4s\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v1.4s, v18.4s, v7.4s\n"
- "ldr x27, [%[inptrs], 168]\n"
- "fmla v3.4s, v16.4s, v8.4s\n"
- "ldr s18, [x19, x22]\n"
- "fmla v2.4s, v14.4s, v7.4s\n"
- "ldr s13, [x27, x22]\n"
- "ldr x25, [%[inptrs], 136]\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v1.4s, v15.4s, v9.4s\n"
- "ldr x27, [%[inptrs], 176]\n"
- "fmla v3.4s, v14.4s, v5.4s\n"
- "ldr s16, [x25, x22]\n"
- "fmla v2.4s, v17.4s, v9.4s\n"
- "ldr s17, [x21, x22]\n"
- "fmla v0.4s, v16.4s, v12.4s\n"
- "ldr s12, [x20, x22]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "ldr s15, [x27, x22]\n"
- "fmax v3.4s, v3.4s, v10.4s\n"
- "ldr x25, [%[inptrs], 144]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v0.4s, v17.4s, v11.4s\n"
- "ldr s14, [x25, x22]\n"
- "fmla v1.4s, v13.4s, v6.4s\n"
- "ldr s11, [x21, x22]\n"
- "ldr x27, [%[inptrs], 184]\n"
- "ldr x25, [%[inptrs], 152]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str s3, [x21, x26]\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "fmla v1.4s, v16.4s, v8.4s\n"
- "ldr s18, [x27, x22]\n"
- "ldr s17, [x25, x22]\n"
- "ldr x27, [%[inptrs], 192]\n"
- "fmla v2.4s, v12.4s, v8.4s\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v0.4s, v14.4s, v9.4s\n"
- "ldr s16, [x27, x22]\n"
- "fmla v1.4s, v15.4s, v5.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "ldr s14, [%[wbptr]]\n"
- "add x22, x22, #4\n"
- "fmla v2.4s, v11.4s, v5.4s\n"
- "ldr s13, [%[wbptr], #4]\n"
- "fmla v0.4s, v11.4s, v4.4s\n"
- "ldr s11, [%[wbptr], #8]\n"
- "fmax v1.4s, v1.4s, v10.4s\n"
- "ldr s12, [%[wbptr], #16]\n"
- "mov v3.16b, v14.16b\n"
- "ldr s9, [%[wbptr], #20]\n"
- "fmax v2.4s, v2.4s, v10.4s\n"
- "ldr s7, [%[wbptr], #28]\n"
- "str s1, [x28, x26]\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "mov v1.16b, v14.16b\n"
- "ldr s4, [%[wbptr], #12]\n"
- "str s2, [x21, x26]\n"
- "ldr x28, [%[outptrs], 24]\n"
- "mov v2.16b, v14.16b\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v17.4s, v8.4s\n"
- "ldr s6, [%[wbptr], #32]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr x25, [%[inptrs], 120]\n"
- "subs x23, x23, #1\n"
- "ldr s15, [x19, x22]\n"
- "fmla v0.4s, v16.4s, v5.4s\n"
- "ldr s8, [%[wbptr], #24]\n"
- "fmla v3.4s, v15.4s, v13.4s\n"
- "ldr s17, [x20, x22]\n"
- "ldr s16, [x21, x22]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "ldr s15, [x25, x22]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "fmax v0.4s, v0.4s, v10.4s\n"
- "ldr s5, [%[wbptr], #36]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr s10, [x19, x22]\n"
- "ldr s17, [x20, x22]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "str s0, [x28, x26]\n"
- "ldr x21, [%[inptrs], 88]\n"
- "mov v0.16b, v14.16b\n"
- "ldr s18, [x19, x22]\n"
- "fmla v3.4s, v10.4s, v11.4s\n"
- "ldr s14, [x21, x22]\n"
- "add x26, x26, #4\n"
- "fmla v3.4s, v16.4s, v7.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v1.4s, v16.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v3.4s, v17.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v2.4s, v18.4s, v13.4s\n"
- "ldr s16, [x20, x22]\n"
- "movi v10.16b, #0\n"
- "ldr s17, [x19, x22]\n"
- "fmla v1.4s, v15.4s, v12.4s\n"
- "ldr x27, [%[inptrs], 160]\n"
- "fmla v3.4s, v18.4s, v4.4s\n"
- "ldr x25, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "ldr s18, [x27, x22]\n"
- "ldr s15, [x25, x22]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "fmla v1.4s, v14.4s, v11.4s\n"
- "ldr x20, [%[inptrs], 64]\n"
- "fmla v3.4s, v14.4s, v6.4s\n"
- "ldr s14, [x21, x22]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr s17, [x20, x22]\n"
- "fmla v0.4s, v14.4s, v13.4s\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v1.4s, v18.4s, v7.4s\n"
- "ldr x27, [%[inptrs], 168]\n"
- "fmla v3.4s, v16.4s, v8.4s\n"
- "ldr s18, [x19, x22]\n"
- "fmla v2.4s, v14.4s, v7.4s\n"
- "ldr s13, [x27, x22]\n"
- "ldr x25, [%[inptrs], 136]\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v1.4s, v15.4s, v9.4s\n"
- "ldr x27, [%[inptrs], 176]\n"
- "fmla v3.4s, v14.4s, v5.4s\n"
- "ldr s16, [x25, x22]\n"
- "fmla v2.4s, v17.4s, v9.4s\n"
- "ldr s17, [x21, x22]\n"
- "fmla v0.4s, v16.4s, v12.4s\n"
- "ldr s12, [x20, x22]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "ldr s15, [x27, x22]\n"
- "fmax v3.4s, v3.4s, v10.4s\n"
- "ldr x25, [%[inptrs], 144]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v0.4s, v17.4s, v11.4s\n"
- "ldr s14, [x25, x22]\n"
- "fmla v1.4s, v13.4s, v6.4s\n"
- "ldr s11, [x21, x22]\n"
- "ldr x27, [%[inptrs], 184]\n"
- "ldr x25, [%[inptrs], 152]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str s3, [x21, x26]\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "fmla v1.4s, v16.4s, v8.4s\n"
- "ldr s18, [x27, x22]\n"
- "ldr s17, [x25, x22]\n"
- "ldr x27, [%[inptrs], 192]\n"
- "fmla v2.4s, v12.4s, v8.4s\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v0.4s, v14.4s, v9.4s\n"
- "ldr s16, [x27, x22]\n"
- "fmla v1.4s, v15.4s, v5.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "add x22, x22, #4\n"
- "fmla v2.4s, v11.4s, v5.4s\n"
- "fmla v0.4s, v11.4s, v4.4s\n"
- "fmax v1.4s, v1.4s, v10.4s\n"
- "fmax v2.4s, v2.4s, v10.4s\n"
- "str s1, [x28, x26]\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "ldr x28, [%[outptrs], 24]\n"
- "str s2, [x21, x26]\n"
- "fmla v0.4s, v17.4s, v8.4s\n"
- "fmla v0.4s, v16.4s, v5.4s\n"
- "fmax v0.4s, v0.4s, v10.4s\n"
- "str s0, [x28, x26]\n"
- "add x26, x26, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr)
- : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x21, %[inptr0], %[input_row_stride]\n"
- "add x23, %[input_col_stride1], %[input_col_stride1]\n"
- "add x24, %[outptr0], %[output_row_stride]\n"
- "add x28, x21, %[input_row_stride]\n"
- "add x26, x23, %[input_col_stride1]\n"
- "and x19, %[n_channels], #3\n"
- "add x27, x28, %[input_row_stride]\n"
- "add x25, x26, %[input_col_stride1]\n"
- "lsr x20, %[n_channels], #2\n"
- "add x22, x27, %[input_row_stride]\n"
- "cbz x20, 4f\n"
- "1:\n"
- "ldr q14, [%[wbptr]]\n"
- "subs x20, x20, #1\n"
- "mov v5.16b, v14.16b\n"
- "ldr q0, [%[wbptr], #16]\n"
- "mov v11.16b, v14.16b\n"
- "ldr q1, [%[wbptr], #32]\n"
- "mov v12.16b, v14.16b\n"
- "ldr q2, [%[wbptr], #48]\n"
- "mov v10.16b, v14.16b\n"
- "ldr q6, [%[wbptr], #64]\n"
- "ldr q3, [%[wbptr], #80]\n"
- "ldr q7, [%[wbptr], #96]\n"
- "ldr q4, [%[wbptr], #112]\n"
- "ldr q8, [%[wbptr], #128]\n"
- "ldr q9, [%[wbptr], #144]\n"
- "ldr q19, [%[inptr0]]\n"
- "fmla v5.4s, v19.4s, v0.4s\n"
- "ldr q15, [x21]\n"
- "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q16, [x28]\n"
- "fmla v11.4s, v16.4s, v0.4s\n"
- "ldr q23, [x21, %[input_col_stride1]]\n"
- "fmla v5.4s, v15.4s, v6.4s\n"
- "ldr q18, [%[inptr0], x23]\n"
- "ldr q17, [x27]\n"
- "ldr q13, [x28, %[input_col_stride1]]\n"
- "fmla v5.4s, v21.4s, v1.4s\n"
- "fmla v5.4s, v16.4s, v4.4s\n"
- "beq 3f\n"
- "2:\n"
- "fmla v5.4s, v23.4s, v3.4s\n"
- "ldr q21, [x21, x23]\n"
- "fmla v12.4s, v18.4s, v0.4s\n"
- "ldr q20, [%[inptr0], x26]\n"
- "fmla v11.4s, v17.4s, v6.4s\n"
- "ldr q19, [x22]\n"
- "fmla v5.4s, v18.4s, v2.4s\n"
- "ldr q15, [x27, %[input_col_stride1]]\n"
- "fmla v12.4s, v21.4s, v6.4s\n"
- "ldr q16, [x28, x23]\n"
- "fmla v11.4s, v13.4s, v1.4s\n"
- "ldr q17, [x21, x26]\n"
- "fmla v5.4s, v13.4s, v8.4s\n"
- "ldr q14, [%[inptr0], x25]\n"
- "fmla v12.4s, v20.4s, v1.4s\n"
- "ldr q20, [x22, %[input_col_stride1]]\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr q19, [x27, x23]\n"
- "fmla v5.4s, v21.4s, v7.4s\n"
- "ldr q22, [x28, x26]\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "ldr q21, [x21, x25]\n"
- "fmla v11.4s, v15.4s, v3.4s\n"
- "ldr q23, [x22, x23]\n"
- "fmla v5.4s, v16.4s, v9.4s\n"
- "ldr q18, [x27, x26]\n"
- "fmla v10.4s, v16.4s, v0.4s\n"
- "ldr q15, [x28, x25]\n"
- "fmla v11.4s, v16.4s, v2.4s\n"
- "ldr q16, [x22, x26]\n"
- "fmla v12.4s, v17.4s, v3.4s\n"
- "ldr q17, [x27, x25]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "ldr q13, [x22, x25]\n"
- "fmla v11.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v12.4s, v14.4s, v2.4s\n"
- "ldr q14, [%[wbptr]]\n"
- "fmla v10.4s, v22.4s, v1.4s\n"
- "ldr q0, [%[wbptr], #16]\n"
- "fmla v11.4s, v19.4s, v7.4s\n"
- "ldr q6, [%[wbptr], #64]\n"
- "fmla v12.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v10.4s, v23.4s, v4.4s\n"
- "ldr q1, [%[wbptr], #32]\n"
- "fmla v11.4s, v23.4s, v9.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v12.4s, v21.4s, v7.4s\n"
- "ldr q19, [%[inptr0]]\n"
- "fmla v10.4s, v18.4s, v3.4s\n"
- "ldr q4, [%[wbptr], #112]\n"
- "movi v20.16b, #0\n"
- "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v12.4s, v15.4s, v9.4s\n"
- "ldr q18, [%[inptr0], x23]\n"
- "fmla v10.4s, v15.4s, v2.4s\n"
- "ldr q3, [%[wbptr], #80]\n"
- "fmov v22.4s, #6.0\n"
- "add x21, x21, #16\n"
- "fmax v5.4s, v5.4s, v20.4s\n"
- "ldr q15, [x21]\n"
- "fmla v10.4s, v16.4s, v8.4s\n"
- "ldr q2, [%[wbptr], #48]\n"
- "fmin v5.4s, v5.4s, v22.4s\n"
- "ldr q23, [x21, %[input_col_stride1]]\n"
- "fmax v12.4s, v12.4s, v20.4s\n"
- "add x28, x28, #16\n"
- "str q5, [%[outptr0]]\n"
- "fmla v10.4s, v17.4s, v7.4s\n"
- "fmin v12.4s, v12.4s, v22.4s\n"
- "ldr q8, [%[wbptr], #128]\n"
- "fmax v11.4s, v11.4s, v20.4s\n"
- "ldr q16, [x28]\n"
- "str q12, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v10.4s, v13.4s, v9.4s\n"
- "fmin v11.4s, v11.4s, v22.4s\n"
- "ldr q7, [%[wbptr], #96]\n"
- "mov v5.16b, v14.16b\n"
- "ldr q13, [x28, %[input_col_stride1]]\n"
- "str q11, [x24]\n"
- "fmax v10.4s, v10.4s, v20.4s\n"
- "mov v11.16b, v14.16b\n"
- "ldr q9, [%[wbptr], #144]\n"
- "fmin v10.4s, v10.4s, v22.4s\n"
- "add x27, x27, #16\n"
- "mov v12.16b, v14.16b\n"
- "ldr q17, [x27]\n"
- "str q10, [x24, %[output_col_stride1]]\n"
- "fmla v5.4s, v19.4s, v0.4s\n"
- "mov v10.16b, v14.16b\n"
- "add x22, x22, #16\n"
- "fmla v11.4s, v16.4s, v0.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v5.4s, v15.4s, v6.4s\n"
- "add x24, x24, #16\n"
- "subs x20, x20, #1\n"
- "fmla v5.4s, v21.4s, v1.4s\n"
- "fmla v5.4s, v16.4s, v4.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v5.4s, v23.4s, v3.4s\n"
- "ldr q21, [x21, x23]\n"
- "fmla v12.4s, v18.4s, v0.4s\n"
- "ldr q20, [%[inptr0], x26]\n"
- "fmla v11.4s, v17.4s, v6.4s\n"
- "ldr q19, [x22]\n"
- "fmla v5.4s, v18.4s, v2.4s\n"
- "ldr q15, [x27, %[input_col_stride1]]\n"
- "fmla v12.4s, v21.4s, v6.4s\n"
- "ldr q16, [x28, x23]\n"
- "fmla v11.4s, v13.4s, v1.4s\n"
- "ldr q17, [x21, x26]\n"
- "fmla v5.4s, v13.4s, v8.4s\n"
- "ldr q14, [%[inptr0], x25]\n"
- "fmla v12.4s, v20.4s, v1.4s\n"
- "ldr q20, [x22, %[input_col_stride1]]\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr q19, [x27, x23]\n"
- "fmla v5.4s, v21.4s, v7.4s\n"
- "ldr q22, [x28, x26]\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "ldr q21, [x21, x25]\n"
- "fmla v11.4s, v15.4s, v3.4s\n"
- "ldr q23, [x22, x23]\n"
- "fmla v5.4s, v16.4s, v9.4s\n"
- "ldr q18, [x27, x26]\n"
- "fmla v10.4s, v16.4s, v0.4s\n"
- "ldr q15, [x28, x25]\n"
- "fmla v11.4s, v16.4s, v2.4s\n"
- "ldr q16, [x22, x26]\n"
- "fmla v12.4s, v17.4s, v3.4s\n"
- "ldr q17, [x27, x25]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "ldr q13, [x22, x25]\n"
- "fmla v11.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v12.4s, v14.4s, v2.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v10.4s, v22.4s, v1.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v11.4s, v19.4s, v7.4s\n"
- "add x21, x21, #16\n"
- "fmla v12.4s, v22.4s, v8.4s\n"
- "add x28, x28, #16\n"
- "fmla v10.4s, v23.4s, v4.4s\n"
- "add x27, x27, #16\n"
- "fmla v11.4s, v23.4s, v9.4s\n"
- "add x22, x22, #16\n"
- "fmla v12.4s, v21.4s, v7.4s\n"
- "movi v20.16b, #0\n"
- "fmla v10.4s, v18.4s, v3.4s\n"
- "fmov v22.4s, #6.0\n"
- "fmax v5.4s, v5.4s, v20.4s\n"
- "fmax v11.4s, v11.4s, v20.4s\n"
- "fmla v12.4s, v15.4s, v9.4s\n"
- "fmla v10.4s, v15.4s, v2.4s\n"
- "fmin v5.4s, v5.4s, v22.4s\n"
- "fmin v11.4s, v11.4s, v22.4s\n"
- "fmax v12.4s, v12.4s, v20.4s\n"
- "str q5, [%[outptr0]]\n"
- "str q11, [x24]\n"
- "fmla v10.4s, v16.4s, v8.4s\n"
- "fmin v12.4s, v12.4s, v22.4s\n"
- "str q12, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v10.4s, v17.4s, v7.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v10.4s, v13.4s, v9.4s\n"
- "fmax v10.4s, v10.4s, v20.4s\n"
- "fmin v10.4s, v10.4s, v22.4s\n"
- "str q10, [x24, %[output_col_stride1]]\n"
- "add x24, x24, #16\n"
- "4:\n"
- "cbz x19, 7f\n"
- "ldr s14, [%[wbptr]]\n"
- "mov v5.16b, v14.16b\n"
- "ldr s0, [%[wbptr], #4]\n"
- "mov v11.16b, v14.16b\n"
- "ldr s1, [%[wbptr], #8]\n"
- "mov v12.16b, v14.16b\n"
- "ldr s2, [%[wbptr], #12]\n"
- "mov v10.16b, v14.16b\n"
- "ldr s6, [%[wbptr], #16]\n"
- "ldr s3, [%[wbptr], #20]\n"
- "subs x19, x19, #1\n"
- "ldr s7, [%[wbptr], #24]\n"
- "ldr s4, [%[wbptr], #28]\n"
- "ldr s8, [%[wbptr], #32]\n"
- "ldr s9, [%[wbptr], #36]\n"
- "ldr s19, [%[inptr0]]\n"
- "ldr s15, [x21]\n"
- "fmla v5.4s, v19.4s, v0.4s\n"
- "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
- "ldr s16, [x28]\n"
- "ldr s23, [x21, %[input_col_stride1]]\n"
- "fmla v11.4s, v16.4s, v0.4s\n"
- "ldr s18, [%[inptr0], x23]\n"
- "fmla v5.4s, v15.4s, v6.4s\n"
- "ldr s17, [x27]\n"
- "ldr s13, [x28, %[input_col_stride1]]\n"
- "fmla v5.4s, v21.4s, v1.4s\n"
- "fmla v5.4s, v16.4s, v4.4s\n"
- "beq 6f\n"
- "5:\n"
- "fmla v5.4s, v23.4s, v3.4s\n"
- "ldr s21, [x21, x23]\n"
- "fmla v12.4s, v18.4s, v0.4s\n"
- "ldr s20, [%[inptr0], x26]\n"
- "fmla v11.4s, v17.4s, v6.4s\n"
- "ldr s19, [x22]\n"
- "fmla v5.4s, v18.4s, v2.4s\n"
- "ldr s15, [x27, %[input_col_stride1]]\n"
- "fmla v12.4s, v21.4s, v6.4s\n"
- "ldr s16, [x28, x23]\n"
- "fmla v11.4s, v13.4s, v1.4s\n"
- "ldr s17, [x21, x26]\n"
- "fmla v5.4s, v13.4s, v8.4s\n"
- "ldr s14, [%[inptr0], x25]\n"
- "fmla v12.4s, v20.4s, v1.4s\n"
- "ldr s20, [x22, %[input_col_stride1]]\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr s19, [x27, x23]\n"
- "fmla v5.4s, v21.4s, v7.4s\n"
- "ldr s22, [x28, x26]\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "ldr s21, [x21, x25]\n"
- "fmla v11.4s, v15.4s, v3.4s\n"
- "ldr s23, [x22, x23]\n"
- "fmla v5.4s, v16.4s, v9.4s\n"
- "ldr s18, [x27, x26]\n"
- "fmla v10.4s, v16.4s, v0.4s\n"
- "ldr s15, [x28, x25]\n"
- "fmla v11.4s, v16.4s, v2.4s\n"
- "ldr s16, [x22, x26]\n"
- "fmla v12.4s, v17.4s, v3.4s\n"
- "ldr s17, [x27, x25]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "ldr s13, [x22, x25]\n"
- "fmla v11.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v12.4s, v14.4s, v2.4s\n"
- "ldr s14, [%[wbptr]]\n"
- "fmla v10.4s, v22.4s, v1.4s\n"
- "ldr s0, [%[wbptr], #4]\n"
- "fmla v11.4s, v19.4s, v7.4s\n"
- "ldr s6, [%[wbptr], #16]\n"
- "fmla v12.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v10.4s, v23.4s, v4.4s\n"
- "ldr s1, [%[wbptr], #8]\n"
- "fmla v11.4s, v23.4s, v9.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v12.4s, v21.4s, v7.4s\n"
- "ldr s19, [%[inptr0]]\n"
- "fmla v10.4s, v18.4s, v3.4s\n"
- "ldr s4, [%[wbptr], #28]\n"
- "movi v20.16b, #0\n"
- "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v12.4s, v15.4s, v9.4s\n"
- "ldr s18, [%[inptr0], x23]\n"
- "fmla v10.4s, v15.4s, v2.4s\n"
- "ldr s3, [%[wbptr], #20]\n"
- "fmov v22.4s, #6.0\n"
- "add x21, x21, #4\n"
- "fmax v5.4s, v5.4s, v20.4s\n"
- "ldr s15, [x21]\n"
- "fmla v10.4s, v16.4s, v8.4s\n"
- "ldr s2, [%[wbptr], #12]\n"
- "fmin v5.4s, v5.4s, v22.4s\n"
- "ldr s23, [x21, %[input_col_stride1]]\n"
- "fmax v12.4s, v12.4s, v20.4s\n"
- "add x28, x28, #4\n"
- "str s5, [%[outptr0]]\n"
- "fmla v10.4s, v17.4s, v7.4s\n"
- "fmin v12.4s, v12.4s, v22.4s\n"
- "ldr s8, [%[wbptr], #32]\n"
- "fmax v11.4s, v11.4s, v20.4s\n"
- "ldr s16, [x28]\n"
- "str s12, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v10.4s, v13.4s, v9.4s\n"
- "fmin v11.4s, v11.4s, v22.4s\n"
- "ldr s7, [%[wbptr], #24]\n"
- "mov v5.16b, v14.16b\n"
- "ldr s13, [x28, %[input_col_stride1]]\n"
- "str s11, [x24]\n"
- "fmax v10.4s, v10.4s, v20.4s\n"
- "mov v11.16b, v14.16b\n"
- "ldr s9, [%[wbptr], #36]\n"
- "fmin v10.4s, v10.4s, v22.4s\n"
- "add x27, x27, #4\n"
- "mov v12.16b, v14.16b\n"
- "ldr s17, [x27]\n"
- "str s10, [x24, %[output_col_stride1]]\n"
- "fmla v5.4s, v19.4s, v0.4s\n"
- "mov v10.16b, v14.16b\n"
- "add x22, x22, #4\n"
- "fmla v11.4s, v16.4s, v0.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v5.4s, v15.4s, v6.4s\n"
- "add x24, x24, #4\n"
- "subs x19, x19, #1\n"
- "fmla v5.4s, v21.4s, v1.4s\n"
- "fmla v5.4s, v16.4s, v4.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v5.4s, v23.4s, v3.4s\n"
- "ldr s21, [x21, x23]\n"
- "fmla v12.4s, v18.4s, v0.4s\n"
- "ldr s20, [%[inptr0], x26]\n"
- "fmla v11.4s, v17.4s, v6.4s\n"
- "ldr s19, [x22]\n"
- "fmla v5.4s, v18.4s, v2.4s\n"
- "ldr s15, [x27, %[input_col_stride1]]\n"
- "fmla v12.4s, v21.4s, v6.4s\n"
- "ldr s16, [x28, x23]\n"
- "fmla v11.4s, v13.4s, v1.4s\n"
- "ldr s17, [x21, x26]\n"
- "fmla v5.4s, v13.4s, v8.4s\n"
- "ldr s14, [%[inptr0], x25]\n"
- "fmla v12.4s, v20.4s, v1.4s\n"
- "ldr s20, [x22, %[input_col_stride1]]\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr s19, [x27, x23]\n"
- "fmla v5.4s, v21.4s, v7.4s\n"
- "ldr s22, [x28, x26]\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "ldr s21, [x21, x25]\n"
- "fmla v11.4s, v15.4s, v3.4s\n"
- "ldr s23, [x22, x23]\n"
- "fmla v5.4s, v16.4s, v9.4s\n"
- "ldr s18, [x27, x26]\n"
- "fmla v10.4s, v16.4s, v0.4s\n"
- "ldr s15, [x28, x25]\n"
- "fmla v11.4s, v16.4s, v2.4s\n"
- "ldr s16, [x22, x26]\n"
- "fmla v12.4s, v17.4s, v3.4s\n"
- "ldr s17, [x27, x25]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "ldr s13, [x22, x25]\n"
- "fmla v11.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v12.4s, v14.4s, v2.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v10.4s, v22.4s, v1.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v11.4s, v19.4s, v7.4s\n"
- "add x21, x21, #4\n"
- "fmla v12.4s, v22.4s, v8.4s\n"
- "add x28, x28, #4\n"
- "fmla v10.4s, v23.4s, v4.4s\n"
- "add x27, x27, #4\n"
- "fmla v11.4s, v23.4s, v9.4s\n"
- "add x22, x22, #4\n"
- "fmla v12.4s, v21.4s, v7.4s\n"
- "movi v20.16b, #0\n"
- "fmla v10.4s, v18.4s, v3.4s\n"
- "fmov v22.4s, #6.0\n"
- "fmax v5.4s, v5.4s, v20.4s\n"
- "fmax v11.4s, v11.4s, v20.4s\n"
- "fmla v12.4s, v15.4s, v9.4s\n"
- "fmla v10.4s, v15.4s, v2.4s\n"
- "fmin v5.4s, v5.4s, v22.4s\n"
- "fmin v11.4s, v11.4s, v22.4s\n"
- "fmax v12.4s, v12.4s, v20.4s\n"
- "str s5, [%[outptr0]]\n"
- "str s11, [x24]\n"
- "fmla v10.4s, v16.4s, v8.4s\n"
- "fmin v12.4s, v12.4s, v22.4s\n"
- "str s12, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v10.4s, v17.4s, v7.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v10.4s, v13.4s, v9.4s\n"
- "fmax v10.4s, v10.4s, v20.4s\n"
- "fmin v10.4s, v10.4s, v22.4s\n"
- "str s10, [x24, %[output_col_stride1]]\n"
- "add x24, x24, #4\n"
- "7:\n"
- : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
- : [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
-)
-{
- __asm __volatile(
- "mov x27, xzr\n"
- "mov x28, xzr\n"
- "and x26, %[n_channels], #3\n"
- "lsr x25, %[n_channels], #2\n"
- "cbz x25, 4f\n"
- "1:\n"
- "ldr q15, [%[wbptr]]\n"
- "ldr x21, [%[inptrs], 0]\n"
- "mov v8.16b, v15.16b\n"
- "ldr q14, [%[wbptr], #16]\n"
- "mov v3.16b, v15.16b\n"
- "ldr q10, [%[wbptr], #32]\n"
- "mov v2.16b, v15.16b\n"
- "ldr q7, [%[wbptr], #48]\n"
- "mov v4.16b, v15.16b\n"
- "ldr q13, [%[wbptr], #64]\n"
- "ldr q5, [%[wbptr], #80]\n"
- "ldr x22, [%[inptrs], 40]\n"
- "ldr q0, [%[wbptr], #96]\n"
- "ldr x20, [%[inptrs], 80]\n"
- "ldr q9, [%[wbptr], #112]\n"
- "ldr x23, [%[inptrs], 120]\n"
- "ldr q6, [%[wbptr], #128]\n"
- "subs x25, x25, #1\n"
- "ldr q1, [%[wbptr], #144]\n"
- "ldr q17, [x21, x27]\n"
- "fmla v8.4s, v17.4s, v14.4s\n"
- "ldr q18, [x22, x27]\n"
- "ldr q16, [x20, x27]\n"
- "ldr x21, [%[inptrs], 8]\n"
- "ldr q17, [x23, x27]\n"
- "ldr x22, [%[inptrs], 48]\n"
- "ldr q11, [x21, x27]\n"
- "ldr x20, [%[inptrs], 88]\n"
- "fmla v8.4s, v18.4s, v13.4s\n"
- "ldr q19, [x22, x27]\n"
- "ldr q15, [x20, x27]\n"
- "ldr x21, [%[inptrs], 16]\n"
- "ldr q12, [x21, x27]\n"
- "fmla v8.4s, v11.4s, v10.4s\n"
- "fmla v8.4s, v16.4s, v9.4s\n"
- "beq 3f\n"
- "2:\n"
- "fmla v3.4s, v16.4s, v14.4s\n"
- "ldr x22, [%[inptrs], 56]\n"
- "fmla v8.4s, v19.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 24]\n"
- "fmla v2.4s, v12.4s, v14.4s\n"
- "ldr q16, [x22, x27]\n"
- "movi v11.16b, #0\n"
- "ldr q18, [x21, x27]\n"
- "fmla v3.4s, v17.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v8.4s, v12.4s, v7.4s\n"
- "ldr x23, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v13.4s\n"
- "ldr q19, [x20, x27]\n"
- "fmov v12.4s, #6.0\n"
- "ldr q17, [x23, x27]\n"
- "fmla v3.4s, v15.4s, v10.4s\n"
- "ldr x20, [%[inptrs], 96]\n"
- "fmla v8.4s, v15.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 64]\n"
- "fmla v2.4s, v18.4s, v10.4s\n"
- "ldr q15, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v14.4s\n"
- "ldr q18, [x22, x27]\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "ldr x21, [%[inptrs], 32]\n"
- "fmla v8.4s, v16.4s, v0.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v2.4s, v15.4s, v9.4s\n"
- "ldr q19, [x21, x27]\n"
- "ldr q16, [x20, x27]\n"
- "ldr x23, [%[inptrs], 136]\n"
- "fmla v3.4s, v17.4s, v5.4s\n"
- "ldr x20, [%[inptrs], 104]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr q14, [x23, x27]\n"
- "fmla v2.4s, v18.4s, v5.4s\n"
- "ldr q17, [x20, x27]\n"
- "fmla v4.4s, v14.4s, v13.4s\n"
- "ldr x22, [%[inptrs], 72]\n"
- "fmla v3.4s, v15.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmax v8.4s, v8.4s, v11.4s\n"
- "ldr q18, [x22, x27]\n"
- "fmla v2.4s, v19.4s, v7.4s\n"
- "ldr q13, [x20, x27]\n"
- "fmla v4.4s, v17.4s, v10.4s\n"
- "ldr x23, [%[inptrs], 144]\n"
- "fmla v3.4s, v16.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 112]\n"
- "fmin v8.4s, v8.4s, v12.4s\n"
- "ldr q10, [x23, x27]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr q15, [x20, x27]\n"
- "fmla v4.4s, v13.4s, v9.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v3.4s, v14.4s, v0.4s\n"
- "ldr x23, [%[inptrs], 152]\n"
- "ldr q9, [x20, x27]\n"
- "ldr x22, [%[outptrs], 0]\n"
- "fmla v2.4s, v18.4s, v0.4s\n"
- "ldr q19, [x23, x27]\n"
- "str q8, [x22, x28]\n"
- "fmla v4.4s, v10.4s, v5.4s\n"
- "fmla v3.4s, v13.4s, v1.4s\n"
- "ldr x20, [%[inptrs], 192]\n"
- "ldr x22, [%[outptrs], 8]\n"
- "ldr x24, [%[outptrs], 16]\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v2.4s, v15.4s, v1.4s\n"
- "ldr q16, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v7.4s\n"
- "ldr q15, [%[wbptr]]\n"
- "fmax v3.4s, v3.4s, v11.4s\n"
- "ldr q14, [%[wbptr], #16]\n"
- "mov v8.16b, v15.16b\n"
- "ldr q10, [%[wbptr], #32]\n"
- "fmax v2.4s, v2.4s, v11.4s\n"
- "ldr q13, [%[wbptr], #64]\n"
- "fmla v4.4s, v9.4s, v6.4s\n"
- "ldr q7, [%[wbptr], #48]\n"
- "fmin v3.4s, v3.4s, v12.4s\n"
- "ldr q5, [%[wbptr], #80]\n"
- "fmin v2.4s, v2.4s, v12.4s\n"
- "ldr q9, [%[wbptr], #112]\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "add x27, x27, #16\n"
- "str q3, [x24, x28]\n"
- "fmla v4.4s, v19.4s, v0.4s\n"
- "str q2, [x22, x28]\n"
- "mov v3.16b, v15.16b\n"
- "mov v2.16b, v15.16b\n"
- "ldr q6, [%[wbptr], #128]\n"
- "ldr x24, [%[outptrs], 24]\n"
- "ldr x21, [%[inptrs], 0]\n"
- "ldr x22, [%[inptrs], 40]\n"
- "fmla v4.4s, v16.4s, v1.4s\n"
- "ldr q0, [%[wbptr], #96]\n"
- "ldr q17, [x21, x27]\n"
- "ldr x20, [%[inptrs], 80]\n"
- "fmla v8.4s, v17.4s, v14.4s\n"
- "ldr q18, [x22, x27]\n"
- "ldr q16, [x20, x27]\n"
- "ldr x21, [%[inptrs], 8]\n"
- "fmax v4.4s, v4.4s, v11.4s\n"
- "ldr q1, [%[wbptr], #144]\n"
- "ldr q11, [x21, x27]\n"
- "ldr x22, [%[inptrs], 48]\n"
- "fmla v8.4s, v18.4s, v13.4s\n"
- "ldr x21, [%[inptrs], 16]\n"
- "fmin v4.4s, v4.4s, v12.4s\n"
- "ldr q19, [x22, x27]\n"
- "ldr q12, [x21, x27]\n"
- "ldr x23, [%[inptrs], 120]\n"
- "ldr x20, [%[inptrs], 88]\n"
- "subs x25, x25, #1\n"
- "str q4, [x24, x28]\n"
- "mov v4.16b, v15.16b\n"
- "ldr q17, [x23, x27]\n"
- "fmla v8.4s, v11.4s, v10.4s\n"
- "ldr q15, [x20, x27]\n"
- "add x28, x28, #16\n"
- "fmla v8.4s, v16.4s, v9.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v3.4s, v16.4s, v14.4s\n"
- "ldr x22, [%[inptrs], 56]\n"
- "fmla v8.4s, v19.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 24]\n"
- "fmla v2.4s, v12.4s, v14.4s\n"
- "ldr q16, [x22, x27]\n"
- "movi v11.16b, #0\n"
- "ldr q18, [x21, x27]\n"
- "fmla v3.4s, v17.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v8.4s, v12.4s, v7.4s\n"
- "ldr x23, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v13.4s\n"
- "ldr q19, [x20, x27]\n"
- "fmov v12.4s, #6.0\n"
- "ldr q17, [x23, x27]\n"
- "fmla v3.4s, v15.4s, v10.4s\n"
- "ldr x20, [%[inptrs], 96]\n"
- "fmla v8.4s, v15.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 64]\n"
- "fmla v2.4s, v18.4s, v10.4s\n"
- "ldr q15, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v14.4s\n"
- "ldr q18, [x22, x27]\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "ldr x21, [%[inptrs], 32]\n"
- "fmla v8.4s, v16.4s, v0.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v2.4s, v15.4s, v9.4s\n"
- "ldr q19, [x21, x27]\n"
- "ldr q16, [x20, x27]\n"
- "ldr x23, [%[inptrs], 136]\n"
- "fmla v3.4s, v17.4s, v5.4s\n"
- "ldr x20, [%[inptrs], 104]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr q14, [x23, x27]\n"
- "fmla v2.4s, v18.4s, v5.4s\n"
- "ldr q17, [x20, x27]\n"
- "fmla v4.4s, v14.4s, v13.4s\n"
- "ldr x22, [%[inptrs], 72]\n"
- "fmla v3.4s, v15.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmax v8.4s, v8.4s, v11.4s\n"
- "ldr q18, [x22, x27]\n"
- "fmla v2.4s, v19.4s, v7.4s\n"
- "ldr q13, [x20, x27]\n"
- "fmla v4.4s, v17.4s, v10.4s\n"
- "ldr x23, [%[inptrs], 144]\n"
- "fmla v3.4s, v16.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 112]\n"
- "fmin v8.4s, v8.4s, v12.4s\n"
- "ldr q10, [x23, x27]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr q15, [x20, x27]\n"
- "fmla v4.4s, v13.4s, v9.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v3.4s, v14.4s, v0.4s\n"
- "ldr x23, [%[inptrs], 152]\n"
- "ldr q9, [x20, x27]\n"
- "ldr x22, [%[outptrs], 0]\n"
- "fmla v2.4s, v18.4s, v0.4s\n"
- "ldr q19, [x23, x27]\n"
- "str q8, [x22, x28]\n"
- "fmla v4.4s, v10.4s, v5.4s\n"
- "fmla v3.4s, v13.4s, v1.4s\n"
- "ldr x20, [%[inptrs], 192]\n"
- "ldr x22, [%[outptrs], 8]\n"
- "ldr x24, [%[outptrs], 16]\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v2.4s, v15.4s, v1.4s\n"
- "ldr q16, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v7.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmax v3.4s, v3.4s, v11.4s\n"
- "add x27, x27, #16\n"
- "fmax v2.4s, v2.4s, v11.4s\n"
- "fmla v4.4s, v9.4s, v6.4s\n"
- "fmin v3.4s, v3.4s, v12.4s\n"
- "fmin v2.4s, v2.4s, v12.4s\n"
- "str q3, [x24, x28]\n"
- "fmla v4.4s, v19.4s, v0.4s\n"
- "str q2, [x22, x28]\n"
- "ldr x24, [%[outptrs], 24]\n"
- "fmla v4.4s, v16.4s, v1.4s\n"
- "fmax v4.4s, v4.4s, v11.4s\n"
- "fmin v4.4s, v4.4s, v12.4s\n"
- "str q4, [x24, x28]\n"
- "add x28, x28, #16\n"
- "4:\n"
- "cbz x26, 7f\n"
- "ldr s15, [%[wbptr]]\n"
- "mov v8.16b, v15.16b\n"
- "ldr s14, [%[wbptr], #4]\n"
- "mov v3.16b, v15.16b\n"
- "ldr s10, [%[wbptr], #8]\n"
- "mov v2.16b, v15.16b\n"
- "ldr s7, [%[wbptr], #12]\n"
- "mov v4.16b, v15.16b\n"
- "ldr s13, [%[wbptr], #16]\n"
- "ldr s5, [%[wbptr], #20]\n"
- "ldr x21, [%[inptrs], 0]\n"
- "ldr s0, [%[wbptr], #24]\n"
- "ldr x22, [%[inptrs], 40]\n"
- "ldr s9, [%[wbptr], #28]\n"
- "ldr x20, [%[inptrs], 80]\n"
- "ldr s6, [%[wbptr], #32]\n"
- "ldr x23, [%[inptrs], 120]\n"
- "ldr s1, [%[wbptr], #36]\n"
- "subs x26, x26, #1\n"
- "ldr s17, [x21, x27]\n"
- "ldr s18, [x22, x27]\n"
- "fmla v8.4s, v17.4s, v14.4s\n"
- "ldr s16, [x20, x27]\n"
- "ldr s17, [x23, x27]\n"
- "ldr x21, [%[inptrs], 8]\n"
- "ldr x22, [%[inptrs], 48]\n"
- "ldr x20, [%[inptrs], 88]\n"
- "ldr s11, [x21, x27]\n"
- "fmla v8.4s, v18.4s, v13.4s\n"
- "ldr s19, [x22, x27]\n"
- "ldr s15, [x20, x27]\n"
- "ldr x21, [%[inptrs], 16]\n"
- "ldr s12, [x21, x27]\n"
- "fmla v8.4s, v11.4s, v10.4s\n"
- "fmla v8.4s, v16.4s, v9.4s\n"
- "beq 6f\n"
- "5:\n"
- "fmla v3.4s, v16.4s, v14.4s\n"
- "ldr x22, [%[inptrs], 56]\n"
- "fmla v8.4s, v19.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 24]\n"
- "fmla v2.4s, v12.4s, v14.4s\n"
- "ldr s16, [x22, x27]\n"
- "movi v11.16b, #0\n"
- "ldr s18, [x21, x27]\n"
- "fmla v3.4s, v17.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v8.4s, v12.4s, v7.4s\n"
- "ldr x23, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v13.4s\n"
- "ldr s19, [x20, x27]\n"
- "fmov v12.4s, #6.0\n"
- "ldr s17, [x23, x27]\n"
- "fmla v3.4s, v15.4s, v10.4s\n"
- "ldr x20, [%[inptrs], 96]\n"
- "fmla v8.4s, v15.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 64]\n"
- "fmla v2.4s, v18.4s, v10.4s\n"
- "ldr s15, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v14.4s\n"
- "ldr s18, [x22, x27]\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "ldr x21, [%[inptrs], 32]\n"
- "fmla v8.4s, v16.4s, v0.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v2.4s, v15.4s, v9.4s\n"
- "ldr s19, [x21, x27]\n"
- "ldr s16, [x20, x27]\n"
- "ldr x23, [%[inptrs], 136]\n"
- "fmla v3.4s, v17.4s, v5.4s\n"
- "ldr x20, [%[inptrs], 104]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr s14, [x23, x27]\n"
- "fmla v2.4s, v18.4s, v5.4s\n"
- "ldr s17, [x20, x27]\n"
- "fmla v4.4s, v14.4s, v13.4s\n"
- "ldr x22, [%[inptrs], 72]\n"
- "fmla v3.4s, v15.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmax v8.4s, v8.4s, v11.4s\n"
- "ldr s18, [x22, x27]\n"
- "fmla v2.4s, v19.4s, v7.4s\n"
- "ldr s13, [x20, x27]\n"
- "fmla v4.4s, v17.4s, v10.4s\n"
- "ldr x23, [%[inptrs], 144]\n"
- "fmla v3.4s, v16.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 112]\n"
- "fmin v8.4s, v8.4s, v12.4s\n"
- "ldr s10, [x23, x27]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr s15, [x20, x27]\n"
- "fmla v4.4s, v13.4s, v9.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v3.4s, v14.4s, v0.4s\n"
- "ldr x23, [%[inptrs], 152]\n"
- "ldr s9, [x20, x27]\n"
- "ldr x22, [%[outptrs], 0]\n"
- "fmla v2.4s, v18.4s, v0.4s\n"
- "ldr s19, [x23, x27]\n"
- "str s8, [x22, x28]\n"
- "fmla v4.4s, v10.4s, v5.4s\n"
- "fmla v3.4s, v13.4s, v1.4s\n"
- "ldr x20, [%[inptrs], 192]\n"
- "ldr x22, [%[outptrs], 8]\n"
- "ldr x24, [%[outptrs], 16]\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v2.4s, v15.4s, v1.4s\n"
- "ldr s16, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v7.4s\n"
- "ldr s15, [%[wbptr]]\n"
- "fmax v3.4s, v3.4s, v11.4s\n"
- "ldr s14, [%[wbptr], #4]\n"
- "mov v8.16b, v15.16b\n"
- "ldr s10, [%[wbptr], #8]\n"
- "fmax v2.4s, v2.4s, v11.4s\n"
- "ldr s13, [%[wbptr], #16]\n"
- "fmla v4.4s, v9.4s, v6.4s\n"
- "ldr s7, [%[wbptr], #12]\n"
- "fmin v3.4s, v3.4s, v12.4s\n"
- "ldr s5, [%[wbptr], #20]\n"
- "fmin v2.4s, v2.4s, v12.4s\n"
- "ldr s9, [%[wbptr], #28]\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "add x27, x27, #4\n"
- "str s3, [x24, x28]\n"
- "fmla v4.4s, v19.4s, v0.4s\n"
- "str s2, [x22, x28]\n"
- "mov v3.16b, v15.16b\n"
- "mov v2.16b, v15.16b\n"
- "ldr s6, [%[wbptr], #32]\n"
- "ldr x24, [%[outptrs], 24]\n"
- "ldr x21, [%[inptrs], 0]\n"
- "ldr x22, [%[inptrs], 40]\n"
- "fmla v4.4s, v16.4s, v1.4s\n"
- "ldr s0, [%[wbptr], #24]\n"
- "ldr s17, [x21, x27]\n"
- "ldr x20, [%[inptrs], 80]\n"
- "fmla v8.4s, v17.4s, v14.4s\n"
- "ldr s18, [x22, x27]\n"
- "ldr s16, [x20, x27]\n"
- "ldr x21, [%[inptrs], 8]\n"
- "fmax v4.4s, v4.4s, v11.4s\n"
- "ldr s1, [%[wbptr], #36]\n"
- "ldr s11, [x21, x27]\n"
- "ldr x22, [%[inptrs], 48]\n"
- "fmla v8.4s, v18.4s, v13.4s\n"
- "ldr x21, [%[inptrs], 16]\n"
- "fmin v4.4s, v4.4s, v12.4s\n"
- "ldr s19, [x22, x27]\n"
- "ldr s12, [x21, x27]\n"
- "ldr x23, [%[inptrs], 120]\n"
- "ldr x20, [%[inptrs], 88]\n"
- "subs x26, x26, #1\n"
- "str s4, [x24, x28]\n"
- "mov v4.16b, v15.16b\n"
- "ldr s17, [x23, x27]\n"
- "fmla v8.4s, v11.4s, v10.4s\n"
- "ldr s15, [x20, x27]\n"
- "add x28, x28, #4\n"
- "fmla v8.4s, v16.4s, v9.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v3.4s, v16.4s, v14.4s\n"
- "ldr x22, [%[inptrs], 56]\n"
- "fmla v8.4s, v19.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 24]\n"
- "fmla v2.4s, v12.4s, v14.4s\n"
- "ldr s16, [x22, x27]\n"
- "movi v11.16b, #0\n"
- "ldr s18, [x21, x27]\n"
- "fmla v3.4s, v17.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v8.4s, v12.4s, v7.4s\n"
- "ldr x23, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v13.4s\n"
- "ldr s19, [x20, x27]\n"
- "fmov v12.4s, #6.0\n"
- "ldr s17, [x23, x27]\n"
- "fmla v3.4s, v15.4s, v10.4s\n"
- "ldr x20, [%[inptrs], 96]\n"
- "fmla v8.4s, v15.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 64]\n"
- "fmla v2.4s, v18.4s, v10.4s\n"
- "ldr s15, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v14.4s\n"
- "ldr s18, [x22, x27]\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "ldr x21, [%[inptrs], 32]\n"
- "fmla v8.4s, v16.4s, v0.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v2.4s, v15.4s, v9.4s\n"
- "ldr s19, [x21, x27]\n"
- "ldr s16, [x20, x27]\n"
- "ldr x23, [%[inptrs], 136]\n"
- "fmla v3.4s, v17.4s, v5.4s\n"
- "ldr x20, [%[inptrs], 104]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr s14, [x23, x27]\n"
- "fmla v2.4s, v18.4s, v5.4s\n"
- "ldr s17, [x20, x27]\n"
- "fmla v4.4s, v14.4s, v13.4s\n"
- "ldr x22, [%[inptrs], 72]\n"
- "fmla v3.4s, v15.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmax v8.4s, v8.4s, v11.4s\n"
- "ldr s18, [x22, x27]\n"
- "fmla v2.4s, v19.4s, v7.4s\n"
- "ldr s13, [x20, x27]\n"
- "fmla v4.4s, v17.4s, v10.4s\n"
- "ldr x23, [%[inptrs], 144]\n"
- "fmla v3.4s, v16.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 112]\n"
- "fmin v8.4s, v8.4s, v12.4s\n"
- "ldr s10, [x23, x27]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr s15, [x20, x27]\n"
- "fmla v4.4s, v13.4s, v9.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v3.4s, v14.4s, v0.4s\n"
- "ldr x23, [%[inptrs], 152]\n"
- "ldr s9, [x20, x27]\n"
- "ldr x22, [%[outptrs], 0]\n"
- "fmla v2.4s, v18.4s, v0.4s\n"
- "ldr s19, [x23, x27]\n"
- "str s8, [x22, x28]\n"
- "fmla v4.4s, v10.4s, v5.4s\n"
- "fmla v3.4s, v13.4s, v1.4s\n"
- "ldr x20, [%[inptrs], 192]\n"
- "ldr x22, [%[outptrs], 8]\n"
- "ldr x24, [%[outptrs], 16]\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v2.4s, v15.4s, v1.4s\n"
- "ldr s16, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v7.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmax v3.4s, v3.4s, v11.4s\n"
- "add x27, x27, #4\n"
- "fmax v2.4s, v2.4s, v11.4s\n"
- "fmla v4.4s, v9.4s, v6.4s\n"
- "fmin v3.4s, v3.4s, v12.4s\n"
- "fmin v2.4s, v2.4s, v12.4s\n"
- "str s3, [x24, x28]\n"
- "fmla v4.4s, v19.4s, v0.4s\n"
- "str s2, [x22, x28]\n"
- "ldr x24, [%[outptrs], 24]\n"
- "fmla v4.4s, v16.4s, v1.4s\n"
- "fmax v4.4s, v4.4s, v11.4s\n"
- "fmin v4.4s, v4.4s, v12.4s\n"
- "str s4, [x24, x28]\n"
- "add x28, x28, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr)
- : [inptrs] "r" (inptrs), [outptrs] "r" (outptrs), [n_channels] "r" ((long) n_channels)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-#endif // __aarch64__
-
-template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
deleted file mode 100644
index 2142c431ac..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ /dev/null
@@ -1,2341 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x20, %[inptr0], %[input_row_stride]\n"
- "add x13, %[input_col_stride1], %[input_col_stride1]\n"
- "add x24, %[outptr0], %[output_row_stride]\n"
- "add x21, x20, %[input_row_stride]\n"
- "add x14, x13, #64\n"
- "add x15, x13, %[input_col_stride1]\n"
- "add x22, x21, %[input_row_stride]\n"
- "add x16, x15, #64\n"
- "add x17, x15, %[input_col_stride1]\n"
- "add x23, x22, %[input_row_stride]\n"
- "add x9, x17, #64\n"
- "add x25, x24, %[output_row_stride]\n"
- "add x26, %[output_col_stride1], %[output_col_stride1]\n"
- "and x27, %[n_channels], #3\n"
- "lsr x28, %[n_channels], #2\n"
- "cbz x28, 4f\n"
- "1:\n"
- "ldr q25, [%[wbptr]]\n"
- "subs x28, x28, #1\n"
- "mov v17.16b, v25.16b\n"
- "ldr q16, [%[wbptr], #16]\n"
- "mov v13.16b, v25.16b\n"
- "ldr q7, [%[wbptr], #32]\n"
- "mov v15.16b, v25.16b\n"
- "ldr q6, [%[wbptr], #48]\n"
- "mov v10.16b, v25.16b\n"
- "ldr q5, [%[wbptr], #64]\n"
- "mov v12.16b, v25.16b\n"
- "ldr q4, [%[wbptr], #80]\n"
- "mov v14.16b, v25.16b\n"
- "ldr q3, [%[wbptr], #96]\n"
- "mov v9.16b, v25.16b\n"
- "ldr q2, [%[wbptr], #112]\n"
- "mov v11.16b, v25.16b\n"
- "ldr q1, [%[wbptr], #128]\n"
- "mov v8.16b, v25.16b\n"
- "ldr q0, [%[wbptr], #144]\n"
- "ldr q26, [%[inptr0]]\n"
- "ldr q28, [x20]\n"
- "fmla v17.4s, v26.4s, v16.4s\n"
- "ldr q29, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v13.4s, v28.4s, v16.4s\n"
- "ldr q27, [x21]\n"
- "fmla v15.4s, v29.4s, v16.4s\n"
- "ldr q21, [x20, %[input_col_stride1]]\n"
- "fmla v17.4s, v28.4s, v5.4s\n"
- "ldr q20, [%[inptr0], x13]\n"
- "ldr q23, [x22]\n"
- "ldr q19, [x21, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x20, #64]\n"
- "fmla v17.4s, v29.4s, v7.4s\n"
- "prfm pldl1keep, [%[inptr0], x19]\n"
- "prfm pldl1keep, [x21, #64]\n"
- "prfm pldl1keep, [x20, x19]\n"
- "prfm pldl1keep, [%[inptr0], x14]\n"
- "prfm pldl1keep, [x22, #64]\n"
- "prfm pldl1keep, [x21, x19]\n"
- "beq 3f\n"
- "2:\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "ldr q30, [x20, x13]\n"
- "fmla v13.4s, v27.4s, v5.4s\n"
- "ldr q29, [%[inptr0], x15]\n"
- "fmla v10.4s, v27.4s, v16.4s\n"
- "ldr q28, [x23]\n"
- "fmla v17.4s, v21.4s, v4.4s\n"
- "ldr q24, [x22, %[input_col_stride1]]\n"
- "fmla v13.4s, v21.4s, v7.4s\n"
- "ldr q18, [x21, x13]\n"
- "fmla v15.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [x20, x14]\n"
- "fmla v12.4s, v21.4s, v16.4s\n"
- "ldr q22, [x20, x15]\n"
- "fmla v17.4s, v20.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v15.4s, v20.4s, v7.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v14.4s, v20.4s, v16.4s\n"
- "ldr q25, [%[inptr0], x17]\n"
- "fmla v13.4s, v23.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x19]\n"
- "fmla v10.4s, v23.4s, v5.4s\n"
- "ldr q26, [x23, %[input_col_stride1]]\n"
- "fmla v17.4s, v19.4s, v1.4s\n"
- "prfm pldl1keep, [x21, x14]\n"
- "fmla v13.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x9]\n"
- "fmla v10.4s, v19.4s, v7.4s\n"
- "prfm pldl1keep, [x23, x19]\n"
- "fmla v12.4s, v19.4s, v5.4s\n"
- "prfm pldl1keep, [x22, x14]\n"
- "fmla v9.4s, v19.4s, v16.4s\n"
- "ldr q27, [x22, x13]\n"
- "fmla v17.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x9]\n"
- "fmla v15.4s, v30.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x14]\n"
- "fmla v12.4s, v30.4s, v7.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x9]\n"
- "fmla v11.4s, v30.4s, v16.4s\n"
- "ldr q21, [x21, x15]\n"
- "fmla v15.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "ldr q20, [x20, x17]\n"
- "fmla v10.4s, v28.4s, v2.4s\n"
- "ldr q19, [x23, x13]\n"
- "fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x9]\n"
- "fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x9]\n"
- "fmla v10.4s, v24.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v9.4s, v24.4s, v5.4s\n"
- "ldr q23, [x22, x15]\n"
- "fmla v17.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v13.4s, v18.4s, v3.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v15.4s, v18.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "str q17, [%[outptr0]]\n"
- "fmla v10.4s, v18.4s, v6.4s\n"
- "fmla v12.4s, v18.4s, v4.4s\n"
- "ldr q17, [x21, x17]\n"
- "fmla v14.4s, v18.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x19]\n"
- "fmla v9.4s, v18.4s, v7.4s\n"
- "prfm pldl1keep, [%[inptr0], x14]\n"
- "fmla v11.4s, v18.4s, v5.4s\n"
- "add x20, x20, #16\n"
- "fmla v8.4s, v18.4s, v16.4s\n"
- "ldr q24, [x23, x15]\n"
- "fmla v15.4s, v22.4s, v3.4s\n"
- "ldr q18, [x22, x17]\n"
- "fmla v12.4s, v22.4s, v6.4s\n"
- "prfm pldl1keep, [x20, #64]\n"
- "fmla v14.4s, v22.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x19]\n"
- "fmla v11.4s, v22.4s, v7.4s\n"
- "ldr q22, [x23, x17]\n"
- "fmla v10.4s, v26.4s, v1.4s\n"
- "add x21, x21, #16\n"
- "fmla v14.4s, v25.4s, v6.4s\n"
- "ldr q25, [%[wbptr]]\n"
- "fmla v9.4s, v26.4s, v2.4s\n"
- "ldr q16, [%[wbptr], #16]\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "prfm pldl1keep, [x21, #64]\n"
- "fmla v10.4s, v27.4s, v3.4s\n"
- "prfm pldl1keep, [x21, x19]\n"
- "fmla v12.4s, v27.4s, v1.4s\n"
- "add x22, x22, #16\n"
- "str q13, [x24]\n"
- "fmla v9.4s, v27.4s, v4.4s\n"
- "fmla v11.4s, v27.4s, v2.4s\n"
- "ldr q26, [%[inptr0]]\n"
- "fmla v8.4s, v27.4s, v5.4s\n"
- "ldr q28, [x20]\n"
- "fmla v15.4s, v21.4s, v0.4s\n"
- "ldr q29, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v14.4s, v21.4s, v1.4s\n"
- "add x23, x23, #16\n"
- "str q15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "ldr q5, [%[wbptr], #64]\n"
- "fmla v8.4s, v21.4s, v7.4s\n"
- "ldr q27, [x21]\n"
- "fmla v14.4s, v20.4s, v3.4s\n"
- "ldr q21, [x20, %[input_col_stride1]]\n"
- "fmla v11.4s, v20.4s, v6.4s\n"
- "ldr q20, [%[inptr0], x13]\n"
- "fmla v10.4s, v19.4s, v0.4s\n"
- "subs x28, x28, #1\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "fmla v8.4s, v19.4s, v2.4s\n"
- "fmla v12.4s, v23.4s, v0.4s\n"
- "ldr q7, [%[wbptr], #32]\n"
- "str q10, [x25]\n"
- "fmla v11.4s, v23.4s, v1.4s\n"
- "fmla v9.4s, v23.4s, v3.4s\n"
- "ldr q2, [%[wbptr], #112]\n"
- "str q12, [x24, %[output_col_stride1]]\n"
- "fmla v8.4s, v23.4s, v4.4s\n"
- "fmla v14.4s, v17.4s, v0.4s\n"
- "ldr q23, [x22]\n"
- "fmla v11.4s, v17.4s, v3.4s\n"
- "ldr q19, [x21, %[input_col_stride1]]\n"
- "fmla v8.4s, v17.4s, v6.4s\n"
- "ldr q4, [%[wbptr], #80]\n"
- "str q14, [%[outptr0], x26]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "fmla v11.4s, v18.4s, v0.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v8.4s, v24.4s, v1.4s\n"
- "ldr q6, [%[wbptr], #48]\n"
- "str q9, [x25, %[output_col_stride1]]\n"
- "mov v17.16b, v25.16b\n"
- "str q11, [x24, x26]\n"
- "mov v13.16b, v25.16b\n"
- "fmla v8.4s, v18.4s, v3.4s\n"
- "ldr q1, [%[wbptr], #128]\n"
- "mov v15.16b, v25.16b\n"
- "add x24, x24, #16\n"
- "mov v10.16b, v25.16b\n"
- "mov v12.16b, v25.16b\n"
- "fmla v8.4s, v22.4s, v0.4s\n"
- "ldr q3, [%[wbptr], #96]\n"
- "mov v14.16b, v25.16b\n"
- "mov v9.16b, v25.16b\n"
- "mov v11.16b, v25.16b\n"
- "fmla v17.4s, v26.4s, v16.4s\n"
- "str q8, [x25, x26]\n"
- "fmla v13.4s, v28.4s, v16.4s\n"
- "mov v8.16b, v25.16b\n"
- "ldr q0, [%[wbptr], #144]\n"
- "fmla v17.4s, v28.4s, v5.4s\n"
- "fmla v15.4s, v29.4s, v16.4s\n"
- "add x25, x25, #16\n"
- "fmla v17.4s, v29.4s, v7.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "ldr q30, [x20, x13]\n"
- "fmla v13.4s, v27.4s, v5.4s\n"
- "ldr q29, [%[inptr0], x15]\n"
- "fmla v10.4s, v27.4s, v16.4s\n"
- "ldr q28, [x23]\n"
- "fmla v17.4s, v21.4s, v4.4s\n"
- "ldr q24, [x22, %[input_col_stride1]]\n"
- "fmla v13.4s, v21.4s, v7.4s\n"
- "ldr q18, [x21, x13]\n"
- "fmla v15.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [x20, x14]\n"
- "fmla v12.4s, v21.4s, v16.4s\n"
- "ldr q22, [x20, x15]\n"
- "fmla v17.4s, v20.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v15.4s, v20.4s, v7.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v14.4s, v20.4s, v16.4s\n"
- "ldr q25, [%[inptr0], x17]\n"
- "fmla v13.4s, v23.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x19]\n"
- "fmla v10.4s, v23.4s, v5.4s\n"
- "ldr q26, [x23, %[input_col_stride1]]\n"
- "fmla v17.4s, v19.4s, v1.4s\n"
- "prfm pldl1keep, [x21, x14]\n"
- "fmla v13.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x9]\n"
- "fmla v10.4s, v19.4s, v7.4s\n"
- "prfm pldl1keep, [x23, x19]\n"
- "fmla v12.4s, v19.4s, v5.4s\n"
- "prfm pldl1keep, [x22, x14]\n"
- "fmla v9.4s, v19.4s, v16.4s\n"
- "ldr q27, [x22, x13]\n"
- "fmla v17.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x9]\n"
- "fmla v15.4s, v30.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x14]\n"
- "fmla v12.4s, v30.4s, v7.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x9]\n"
- "fmla v11.4s, v30.4s, v16.4s\n"
- "ldr q21, [x21, x15]\n"
- "fmla v15.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "ldr q20, [x20, x17]\n"
- "fmla v10.4s, v28.4s, v2.4s\n"
- "ldr q19, [x23, x13]\n"
- "fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x9]\n"
- "fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x9]\n"
- "fmla v10.4s, v24.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v9.4s, v24.4s, v5.4s\n"
- "ldr q23, [x22, x15]\n"
- "fmla v17.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v13.4s, v18.4s, v3.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v15.4s, v18.4s, v1.4s\n"
- "add x20, x20, #16\n"
- "str q17, [%[outptr0]]\n"
- "fmla v10.4s, v18.4s, v6.4s\n"
- "fmla v12.4s, v18.4s, v4.4s\n"
- "ldr q17, [x21, x17]\n"
- "fmla v14.4s, v18.4s, v2.4s\n"
- "add x21, x21, #16\n"
- "fmla v9.4s, v18.4s, v7.4s\n"
- "fmla v11.4s, v18.4s, v5.4s\n"
- "fmla v8.4s, v18.4s, v16.4s\n"
- "ldr q24, [x23, x15]\n"
- "fmla v15.4s, v22.4s, v3.4s\n"
- "ldr q18, [x22, x17]\n"
- "fmla v12.4s, v22.4s, v6.4s\n"
- "add x22, x22, #16\n"
- "fmla v14.4s, v22.4s, v4.4s\n"
- "fmla v11.4s, v22.4s, v7.4s\n"
- "fmla v10.4s, v26.4s, v1.4s\n"
- "ldr q22, [x23, x17]\n"
- "fmla v9.4s, v26.4s, v2.4s\n"
- "add x23, x23, #16\n"
- "fmla v14.4s, v25.4s, v6.4s\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "fmla v10.4s, v27.4s, v3.4s\n"
- "fmla v12.4s, v27.4s, v1.4s\n"
- "fmla v9.4s, v27.4s, v4.4s\n"
- "fmla v11.4s, v27.4s, v2.4s\n"
- "str q13, [x24]\n"
- "fmla v8.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v21.4s, v0.4s\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "fmla v14.4s, v21.4s, v1.4s\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "fmla v8.4s, v21.4s, v7.4s\n"
- "str q15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v10.4s, v19.4s, v0.4s\n"
- "fmla v14.4s, v20.4s, v3.4s\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "fmla v11.4s, v20.4s, v6.4s\n"
- "fmla v8.4s, v19.4s, v2.4s\n"
- "str q10, [x25]\n"
- "fmla v12.4s, v23.4s, v0.4s\n"
- "fmla v9.4s, v23.4s, v3.4s\n"
- "fmla v14.4s, v17.4s, v0.4s\n"
- "fmla v11.4s, v23.4s, v1.4s\n"
- "fmla v8.4s, v23.4s, v4.4s\n"
- "str q12, [x24, %[output_col_stride1]]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "str q14, [%[outptr0], x26]\n"
- "fmla v11.4s, v17.4s, v3.4s\n"
- "fmla v8.4s, v17.4s, v6.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "str q9, [x25, %[output_col_stride1]]\n"
- "fmla v11.4s, v18.4s, v0.4s\n"
- "fmla v8.4s, v24.4s, v1.4s\n"
- "str q11, [x24, x26]\n"
- "fmla v8.4s, v18.4s, v3.4s\n"
- "add x24, x24, #16\n"
- "fmla v8.4s, v22.4s, v0.4s\n"
- "str q8, [x25, x26]\n"
- "add x25, x25, #16\n"
- "4:\n"
- "cbz x27, 7f\n"
- "ldr s25, [%[wbptr]]\n"
- "mov v17.16b, v25.16b\n"
- "ldr s16, [%[wbptr], #4]\n"
- "mov v13.16b, v25.16b\n"
- "ldr s7, [%[wbptr], #8]\n"
- "mov v15.16b, v25.16b\n"
- "ldr s6, [%[wbptr], #12]\n"
- "mov v10.16b, v25.16b\n"
- "ldr s5, [%[wbptr], #16]\n"
- "mov v12.16b, v25.16b\n"
- "ldr s4, [%[wbptr], #20]\n"
- "mov v14.16b, v25.16b\n"
- "ldr s3, [%[wbptr], #24]\n"
- "mov v9.16b, v25.16b\n"
- "ldr s2, [%[wbptr], #28]\n"
- "mov v11.16b, v25.16b\n"
- "ldr s1, [%[wbptr], #32]\n"
- "mov v8.16b, v25.16b\n"
- "ldr s0, [%[wbptr], #36]\n"
- "ldr s26, [%[inptr0]]\n"
- "subs x27, x27, #1\n"
- "fmla v17.4s, v26.4s, v16.4s\n"
- "ldr s28, [x20]\n"
- "fmla v13.4s, v28.4s, v16.4s\n"
- "ldr s29, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v15.4s, v29.4s, v16.4s\n"
- "ldr s27, [x21]\n"
- "fmla v17.4s, v28.4s, v5.4s\n"
- "ldr s21, [x20, %[input_col_stride1]]\n"
- "ldr s20, [%[inptr0], x13]\n"
- "ldr s23, [x22]\n"
- "ldr s19, [x21, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v17.4s, v29.4s, v7.4s\n"
- "prfm pldl1keep, [x20, #64]\n"
- "prfm pldl1keep, [%[inptr0], x19]\n"
- "prfm pldl1keep, [x21, #64]\n"
- "prfm pldl1keep, [x20, x19]\n"
- "prfm pldl1keep, [%[inptr0], x14]\n"
- "prfm pldl1keep, [x22, #64]\n"
- "prfm pldl1keep, [x21, x19]\n"
- "beq 6f\n"
- "5:\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "ldr s30, [x20, x13]\n"
- "fmla v13.4s, v27.4s, v5.4s\n"
- "ldr s29, [%[inptr0], x15]\n"
- "fmla v10.4s, v27.4s, v16.4s\n"
- "ldr s28, [x23]\n"
- "fmla v17.4s, v21.4s, v4.4s\n"
- "ldr s24, [x22, %[input_col_stride1]]\n"
- "fmla v13.4s, v21.4s, v7.4s\n"
- "ldr s18, [x21, x13]\n"
- "fmla v15.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [x20, x14]\n"
- "fmla v12.4s, v21.4s, v16.4s\n"
- "ldr s22, [x20, x15]\n"
- "fmla v17.4s, v20.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v15.4s, v20.4s, v7.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v14.4s, v20.4s, v16.4s\n"
- "ldr s25, [%[inptr0], x17]\n"
- "fmla v13.4s, v23.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x19]\n"
- "fmla v10.4s, v23.4s, v5.4s\n"
- "ldr s26, [x23, %[input_col_stride1]]\n"
- "fmla v17.4s, v19.4s, v1.4s\n"
- "prfm pldl1keep, [x21, x14]\n"
- "fmla v13.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x9]\n"
- "fmla v10.4s, v19.4s, v7.4s\n"
- "prfm pldl1keep, [x23, x19]\n"
- "fmla v12.4s, v19.4s, v5.4s\n"
- "prfm pldl1keep, [x22, x14]\n"
- "fmla v9.4s, v19.4s, v16.4s\n"
- "ldr s27, [x22, x13]\n"
- "fmla v17.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x9]\n"
- "fmla v15.4s, v30.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x14]\n"
- "fmla v12.4s, v30.4s, v7.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x9]\n"
- "fmla v11.4s, v30.4s, v16.4s\n"
- "ldr s21, [x21, x15]\n"
- "fmla v15.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "ldr s20, [x20, x17]\n"
- "fmla v10.4s, v28.4s, v2.4s\n"
- "ldr s19, [x23, x13]\n"
- "fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x9]\n"
- "fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x9]\n"
- "fmla v10.4s, v24.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v9.4s, v24.4s, v5.4s\n"
- "ldr s23, [x22, x15]\n"
- "fmla v17.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v13.4s, v18.4s, v3.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v15.4s, v18.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "str s17, [%[outptr0]]\n"
- "fmla v10.4s, v18.4s, v6.4s\n"
- "fmla v12.4s, v18.4s, v4.4s\n"
- "ldr s17, [x21, x17]\n"
- "fmla v14.4s, v18.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x19]\n"
- "fmla v9.4s, v18.4s, v7.4s\n"
- "prfm pldl1keep, [%[inptr0], x14]\n"
- "fmla v11.4s, v18.4s, v5.4s\n"
- "add x20, x20, #4\n"
- "fmla v8.4s, v18.4s, v16.4s\n"
- "ldr s24, [x23, x15]\n"
- "fmla v15.4s, v22.4s, v3.4s\n"
- "ldr s18, [x22, x17]\n"
- "fmla v12.4s, v22.4s, v6.4s\n"
- "prfm pldl1keep, [x20, #64]\n"
- "fmla v14.4s, v22.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x19]\n"
- "fmla v11.4s, v22.4s, v7.4s\n"
- "ldr s22, [x23, x17]\n"
- "fmla v10.4s, v26.4s, v1.4s\n"
- "add x21, x21, #4\n"
- "fmla v14.4s, v25.4s, v6.4s\n"
- "ldr s25, [%[wbptr]]\n"
- "fmla v9.4s, v26.4s, v2.4s\n"
- "ldr s16, [%[wbptr], #4]\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "prfm pldl1keep, [x21, #64]\n"
- "fmla v10.4s, v27.4s, v3.4s\n"
- "prfm pldl1keep, [x21, x19]\n"
- "fmla v12.4s, v27.4s, v1.4s\n"
- "add x22, x22, #4\n"
- "str s13, [x24]\n"
- "fmla v9.4s, v27.4s, v4.4s\n"
- "fmla v11.4s, v27.4s, v2.4s\n"
- "ldr s26, [%[inptr0]]\n"
- "fmla v8.4s, v27.4s, v5.4s\n"
- "ldr s28, [x20]\n"
- "fmla v15.4s, v21.4s, v0.4s\n"
- "ldr s29, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v14.4s, v21.4s, v1.4s\n"
- "add x23, x23, #4\n"
- "str s15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "ldr s5, [%[wbptr], #16]\n"
- "fmla v8.4s, v21.4s, v7.4s\n"
- "ldr s27, [x21]\n"
- "fmla v14.4s, v20.4s, v3.4s\n"
- "ldr s21, [x20, %[input_col_stride1]]\n"
- "fmla v11.4s, v20.4s, v6.4s\n"
- "ldr s20, [%[inptr0], x13]\n"
- "fmla v10.4s, v19.4s, v0.4s\n"
- "subs x27, x27, #1\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "fmla v8.4s, v19.4s, v2.4s\n"
- "fmla v12.4s, v23.4s, v0.4s\n"
- "ldr s7, [%[wbptr], #8]\n"
- "str s10, [x25]\n"
- "fmla v11.4s, v23.4s, v1.4s\n"
- "fmla v9.4s, v23.4s, v3.4s\n"
- "ldr s2, [%[wbptr], #28]\n"
- "str s12, [x24, %[output_col_stride1]]\n"
- "fmla v8.4s, v23.4s, v4.4s\n"
- "fmla v14.4s, v17.4s, v0.4s\n"
- "ldr s23, [x22]\n"
- "fmla v11.4s, v17.4s, v3.4s\n"
- "ldr s19, [x21, %[input_col_stride1]]\n"
- "fmla v8.4s, v17.4s, v6.4s\n"
- "ldr s4, [%[wbptr], #20]\n"
- "str s14, [%[outptr0], x26]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "fmla v11.4s, v18.4s, v0.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v8.4s, v24.4s, v1.4s\n"
- "ldr s6, [%[wbptr], #12]\n"
- "str s9, [x25, %[output_col_stride1]]\n"
- "mov v17.16b, v25.16b\n"
- "str s11, [x24, x26]\n"
- "mov v13.16b, v25.16b\n"
- "fmla v8.4s, v18.4s, v3.4s\n"
- "ldr s1, [%[wbptr], #32]\n"
- "mov v15.16b, v25.16b\n"
- "add x24, x24, #4\n"
- "mov v10.16b, v25.16b\n"
- "mov v12.16b, v25.16b\n"
- "fmla v8.4s, v22.4s, v0.4s\n"
- "ldr s3, [%[wbptr], #24]\n"
- "mov v14.16b, v25.16b\n"
- "mov v9.16b, v25.16b\n"
- "mov v11.16b, v25.16b\n"
- "fmla v17.4s, v26.4s, v16.4s\n"
- "str s8, [x25, x26]\n"
- "fmla v13.4s, v28.4s, v16.4s\n"
- "mov v8.16b, v25.16b\n"
- "ldr s0, [%[wbptr], #36]\n"
- "fmla v17.4s, v28.4s, v5.4s\n"
- "fmla v15.4s, v29.4s, v16.4s\n"
- "add x25, x25, #4\n"
- "fmla v17.4s, v29.4s, v7.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "ldr s30, [x20, x13]\n"
- "fmla v13.4s, v27.4s, v5.4s\n"
- "ldr s29, [%[inptr0], x15]\n"
- "fmla v10.4s, v27.4s, v16.4s\n"
- "ldr s28, [x23]\n"
- "fmla v17.4s, v21.4s, v4.4s\n"
- "ldr s24, [x22, %[input_col_stride1]]\n"
- "fmla v13.4s, v21.4s, v7.4s\n"
- "ldr s18, [x21, x13]\n"
- "fmla v15.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [x20, x14]\n"
- "fmla v12.4s, v21.4s, v16.4s\n"
- "ldr s22, [x20, x15]\n"
- "fmla v17.4s, v20.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v15.4s, v20.4s, v7.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v14.4s, v20.4s, v16.4s\n"
- "ldr s25, [%[inptr0], x17]\n"
- "fmla v13.4s, v23.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x19]\n"
- "fmla v10.4s, v23.4s, v5.4s\n"
- "ldr s26, [x23, %[input_col_stride1]]\n"
- "fmla v17.4s, v19.4s, v1.4s\n"
- "prfm pldl1keep, [x21, x14]\n"
- "fmla v13.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x9]\n"
- "fmla v10.4s, v19.4s, v7.4s\n"
- "prfm pldl1keep, [x23, x19]\n"
- "fmla v12.4s, v19.4s, v5.4s\n"
- "prfm pldl1keep, [x22, x14]\n"
- "fmla v9.4s, v19.4s, v16.4s\n"
- "ldr s27, [x22, x13]\n"
- "fmla v17.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x9]\n"
- "fmla v15.4s, v30.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x14]\n"
- "fmla v12.4s, v30.4s, v7.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x9]\n"
- "fmla v11.4s, v30.4s, v16.4s\n"
- "ldr s21, [x21, x15]\n"
- "fmla v15.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "ldr s20, [x20, x17]\n"
- "fmla v10.4s, v28.4s, v2.4s\n"
- "ldr s19, [x23, x13]\n"
- "fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x9]\n"
- "fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x9]\n"
- "fmla v10.4s, v24.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v9.4s, v24.4s, v5.4s\n"
- "ldr s23, [x22, x15]\n"
- "fmla v17.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v13.4s, v18.4s, v3.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v15.4s, v18.4s, v1.4s\n"
- "add x20, x20, #4\n"
- "str s17, [%[outptr0]]\n"
- "fmla v10.4s, v18.4s, v6.4s\n"
- "fmla v12.4s, v18.4s, v4.4s\n"
- "ldr s17, [x21, x17]\n"
- "fmla v14.4s, v18.4s, v2.4s\n"
- "add x21, x21, #4\n"
- "fmla v9.4s, v18.4s, v7.4s\n"
- "fmla v11.4s, v18.4s, v5.4s\n"
- "fmla v8.4s, v18.4s, v16.4s\n"
- "ldr s24, [x23, x15]\n"
- "fmla v15.4s, v22.4s, v3.4s\n"
- "ldr s18, [x22, x17]\n"
- "fmla v12.4s, v22.4s, v6.4s\n"
- "add x22, x22, #4\n"
- "fmla v14.4s, v22.4s, v4.4s\n"
- "fmla v11.4s, v22.4s, v7.4s\n"
- "fmla v10.4s, v26.4s, v1.4s\n"
- "ldr s22, [x23, x17]\n"
- "fmla v9.4s, v26.4s, v2.4s\n"
- "add x23, x23, #4\n"
- "fmla v14.4s, v25.4s, v6.4s\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "fmla v10.4s, v27.4s, v3.4s\n"
- "fmla v12.4s, v27.4s, v1.4s\n"
- "fmla v9.4s, v27.4s, v4.4s\n"
- "fmla v11.4s, v27.4s, v2.4s\n"
- "str s13, [x24]\n"
- "fmla v8.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v21.4s, v0.4s\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "fmla v14.4s, v21.4s, v1.4s\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "fmla v8.4s, v21.4s, v7.4s\n"
- "str s15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v10.4s, v19.4s, v0.4s\n"
- "fmla v14.4s, v20.4s, v3.4s\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "fmla v11.4s, v20.4s, v6.4s\n"
- "fmla v8.4s, v19.4s, v2.4s\n"
- "str s10, [x25]\n"
- "fmla v12.4s, v23.4s, v0.4s\n"
- "fmla v9.4s, v23.4s, v3.4s\n"
- "fmla v14.4s, v17.4s, v0.4s\n"
- "fmla v11.4s, v23.4s, v1.4s\n"
- "fmla v8.4s, v23.4s, v4.4s\n"
- "str s12, [x24, %[output_col_stride1]]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "str s14, [%[outptr0], x26]\n"
- "fmla v11.4s, v17.4s, v3.4s\n"
- "fmla v8.4s, v17.4s, v6.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "str s9, [x25, %[output_col_stride1]]\n"
- "fmla v11.4s, v18.4s, v0.4s\n"
- "fmla v8.4s, v24.4s, v1.4s\n"
- "str s11, [x24, x26]\n"
- "fmla v8.4s, v18.4s, v3.4s\n"
- "add x24, x24, #4\n"
- "fmla v8.4s, v22.4s, v0.4s\n"
- "str s8, [x25, x26]\n"
- "add x25, x25, #4\n"
- "7:\n"
- : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x25, %[inptr0], %[input_row_stride]\n"
- "add x16, %[input_col_stride1], %[input_col_stride1]\n"
- "add x21, %[outptr0], %[output_row_stride]\n"
- "add x22, x25, %[input_row_stride]\n"
- "add x23, x16, #64\n"
- "add x26, x16, %[input_col_stride1]\n"
- "add x13, x22, %[input_row_stride]\n"
- "add x20, x26, #64\n"
- "add x9, x26, %[input_col_stride1]\n"
- "add x24, x13, %[input_row_stride]\n"
- "add x15, x9, #64\n"
- "add x14, x21, %[output_row_stride]\n"
- "add x19, %[output_col_stride1], %[output_col_stride1]\n"
- "and x27, %[n_channels], #3\n"
- "lsr x28, %[n_channels], #2\n"
- "cbz x28, 4f\n"
- "1:\n"
- "ldr q20, [%[wbptr]]\n"
- "subs x28, x28, #1\n"
- "mov v4.16b, v20.16b\n"
- "ldr q15, [%[wbptr], #16]\n"
- "mov v1.16b, v20.16b\n"
- "ldr q0, [%[wbptr], #32]\n"
- "mov v3.16b, v20.16b\n"
- "ldr q13, [%[wbptr], #48]\n"
- "mov v7.16b, v20.16b\n"
- "ldr q16, [%[wbptr], #64]\n"
- "mov v9.16b, v20.16b\n"
- "ldr q12, [%[wbptr], #80]\n"
- "mov v2.16b, v20.16b\n"
- "ldr q17, [%[wbptr], #96]\n"
- "mov v6.16b, v20.16b\n"
- "ldr q11, [%[wbptr], #112]\n"
- "mov v8.16b, v20.16b\n"
- "ldr q10, [%[wbptr], #128]\n"
- "mov v5.16b, v20.16b\n"
- "ldr q14, [%[wbptr], #144]\n"
- "ldr q27, [%[inptr0]]\n"
- "ldr q24, [x25]\n"
- "fmla v4.4s, v27.4s, v15.4s\n"
- "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q21, [x22]\n"
- "ldr q19, [x25, %[input_col_stride1]]\n"
- "ldr q31, [%[inptr0], x16]\n"
- "ldr q28, [x13]\n"
- "fmla v4.4s, v24.4s, v16.4s\n"
- "ldr q18, [x22, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x25, #64]\n"
- "prfm pldl1keep, [%[inptr0], x17]\n"
- "prfm pldl1keep, [x22, #64]\n"
- "prfm pldl1keep, [x25, x17]\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "prfm pldl1keep, [x13, #64]\n"
- "prfm pldl1keep, [x22, x17]\n"
- "beq 3f\n"
- "2:\n"
- "fmla v1.4s, v24.4s, v15.4s\n"
- "ldr q24, [x25, x16]\n"
- "fmla v4.4s, v22.4s, v0.4s\n"
- "ldr q29, [%[inptr0], x26]\n"
- "fmla v3.4s, v22.4s, v15.4s\n"
- "ldr q30, [x24]\n"
- "fmla v1.4s, v21.4s, v16.4s\n"
- "ldr q25, [x13, %[input_col_stride1]]\n"
- "fmla v4.4s, v21.4s, v11.4s\n"
- "prfm pldl1keep, [x25, x23]\n"
- "fmla v7.4s, v21.4s, v15.4s\n"
- "ldr q26, [x22, x16]\n"
- "fmla v1.4s, v19.4s, v0.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v4.4s, v19.4s, v12.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v3.4s, v19.4s, v16.4s\n"
- "prfm pldl1keep, [x13, x17]\n"
- "fmla v9.4s, v19.4s, v15.4s\n"
- "ldr q23, [x25, x26]\n"
- "fmla v4.4s, v31.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x23]\n"
- "fmla v3.4s, v31.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x20]\n"
- "fmla v2.4s, v31.4s, v15.4s\n"
- "ldr q20, [%[inptr0], x9]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "ldr q28, [x24, %[input_col_stride1]]\n"
- "fmla v4.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x17]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "prfm pldl1keep, [x13, x23]\n"
- "fmla v3.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x22, x20]\n"
- "fmla v7.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x15]\n"
- "fmla v9.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x23]\n"
- "fmla v6.4s, v18.4s, v15.4s\n"
- "ldr q27, [x13, x16]\n"
- "fmla v4.4s, v24.4s, v17.4s\n"
- "prfm pldl1keep, [x13, x20]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x15]\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "prfm pldl1keep, [x24, x20]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "prfm pldl1keep, [x13, x15]\n"
- "fmla v2.4s, v24.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v8.4s, v24.4s, v15.4s\n"
- "ldr q24, [x22, x26]\n"
- "fmla v3.4s, v29.4s, v13.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v2.4s, v29.4s, v0.4s\n"
- "ldr q22, [x25, x9]\n"
- "fmla v7.4s, v30.4s, v11.4s\n"
- "ldr q21, [x24, x16]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v9.4s, v25.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v7.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v6.4s, v25.4s, v16.4s\n"
- "ldr q19, [x13, x26]\n"
- "fmla v4.4s, v26.4s, v14.4s\n"
- "prfm pldl1keep, [%[inptr0], x17]\n"
- "fmla v1.4s, v26.4s, v17.4s\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "fmla v3.4s, v26.4s, v10.4s\n"
- "add x25, x25, #16\n"
- "fmla v7.4s, v26.4s, v13.4s\n"
- "prfm pldl1keep, [x25, #64]\n"
- "fmla v9.4s, v26.4s, v12.4s\n"
- "prfm pldl1keep, [x25, x17]\n"
- "fmla v2.4s, v26.4s, v11.4s\n"
- "subs x28, x28, #1\n"
- "fmla v6.4s, v26.4s, v0.4s\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "fmla v5.4s, v26.4s, v15.4s\n"
- "ldr q26, [x22, x9]\n"
- "fmla v3.4s, v23.4s, v17.4s\n"
- "ldr q18, [x24, x26]\n"
- "fmla v9.4s, v23.4s, v13.4s\n"
- "add x22, x22, #16\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v8.4s, v23.4s, v0.4s\n"
- "ldr q23, [x13, x9]\n"
- "fmla v7.4s, v28.4s, v10.4s\n"
- "prfm pldl1keep, [x22, x17]\n"
- "fmla v2.4s, v20.4s, v13.4s\n"
- "ldr q25, [x24, x9]\n"
- "fmla v6.4s, v28.4s, v11.4s\n"
- "ldr q20, [%[wbptr]]\n"
- "fmla v1.4s, v27.4s, v14.4s\n"
- "add x13, x13, #16\n"
- "fmla v7.4s, v27.4s, v17.4s\n"
- "prfm pldl1keep, [x13, #64]\n"
- "fmla v9.4s, v27.4s, v10.4s\n"
- "add x24, x24, #16\n"
- "fmla v6.4s, v27.4s, v12.4s\n"
- "fmla v8.4s, v27.4s, v11.4s\n"
- "fmla v5.4s, v27.4s, v16.4s\n"
- "ldr q15, [%[wbptr], #16]\n"
- "fmla v3.4s, v24.4s, v14.4s\n"
- "ldr q27, [%[inptr0]]\n"
- "fmla v9.4s, v24.4s, v17.4s\n"
- "fmla v2.4s, v24.4s, v10.4s\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "fmla v8.4s, v24.4s, v12.4s\n"
- "fmla v5.4s, v24.4s, v0.4s\n"
- "ldr q16, [%[wbptr], #64]\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "ldr q24, [x25]\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v7.4s, v21.4s, v14.4s\n"
- "fmla v6.4s, v21.4s, v10.4s\n"
- "fmla v5.4s, v21.4s, v11.4s\n"
- "ldr q0, [%[wbptr], #32]\n"
- "fmla v9.4s, v19.4s, v14.4s\n"
- "ldr q21, [x22]\n"
- "fmla v6.4s, v19.4s, v17.4s\n"
- "fmla v8.4s, v19.4s, v10.4s\n"
- "fmla v5.4s, v19.4s, v12.4s\n"
- "ldr q11, [%[wbptr], #112]\n"
- "fmla v2.4s, v26.4s, v14.4s\n"
- "movi v29.16b, #0\n"
- "fmla v8.4s, v26.4s, v17.4s\n"
- "fmla v6.4s, v18.4s, v14.4s\n"
- "fmla v5.4s, v26.4s, v13.4s\n"
- "ldr q12, [%[wbptr], #80]\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "ldr q19, [x25, %[input_col_stride1]]\n"
- "fmla v8.4s, v23.4s, v14.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "str q4, [%[outptr0]]\n"
- "fmla v5.4s, v18.4s, v10.4s\n"
- "str q3, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "ldr q13, [%[wbptr], #48]\n"
- "str q2, [%[outptr0], x19]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "str q1, [x21]\n"
- "fmax v9.4s, v9.4s, v29.4s\n"
- "fmax v8.4s, v8.4s, v29.4s\n"
- "ldr q10, [%[wbptr], #128]\n"
- "str q9, [x21, %[output_col_stride1]]\n"
- "fmla v5.4s, v25.4s, v14.4s\n"
- "str q8, [x21, x19]\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "ldr q17, [%[wbptr], #96]\n"
- "str q7, [x14]\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "str q6, [x14, %[output_col_stride1]]\n"
- "mov v4.16b, v20.16b\n"
- "str q5, [x14, x19]\n"
- "mov v1.16b, v20.16b\n"
- "mov v3.16b, v20.16b\n"
- "ldr q14, [%[wbptr], #144]\n"
- "mov v7.16b, v20.16b\n"
- "ldr q31, [%[inptr0], x16]\n"
- "mov v9.16b, v20.16b\n"
- "ldr q28, [x13]\n"
- "mov v2.16b, v20.16b\n"
- "ldr q18, [x22, %[input_col_stride1]]\n"
- "mov v6.16b, v20.16b\n"
- "add %[outptr0], %[outptr0], #16\n"
- "mov v8.16b, v20.16b\n"
- "add x21, x21, #16\n"
- "mov v5.16b, v20.16b\n"
- "add x14, x14, #16\n"
- "fmla v4.4s, v27.4s, v15.4s\n"
- "fmla v4.4s, v24.4s, v16.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v1.4s, v24.4s, v15.4s\n"
- "ldr q24, [x25, x16]\n"
- "fmla v4.4s, v22.4s, v0.4s\n"
- "ldr q29, [%[inptr0], x26]\n"
- "fmla v3.4s, v22.4s, v15.4s\n"
- "ldr q30, [x24]\n"
- "fmla v1.4s, v21.4s, v16.4s\n"
- "ldr q25, [x13, %[input_col_stride1]]\n"
- "fmla v4.4s, v21.4s, v11.4s\n"
- "prfm pldl1keep, [x25, x23]\n"
- "fmla v7.4s, v21.4s, v15.4s\n"
- "ldr q26, [x22, x16]\n"
- "fmla v1.4s, v19.4s, v0.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v4.4s, v19.4s, v12.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v3.4s, v19.4s, v16.4s\n"
- "prfm pldl1keep, [x13, x17]\n"
- "fmla v9.4s, v19.4s, v15.4s\n"
- "ldr q23, [x25, x26]\n"
- "fmla v4.4s, v31.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x23]\n"
- "fmla v3.4s, v31.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x20]\n"
- "fmla v2.4s, v31.4s, v15.4s\n"
- "ldr q20, [%[inptr0], x9]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "ldr q28, [x24, %[input_col_stride1]]\n"
- "fmla v4.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x17]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "prfm pldl1keep, [x13, x23]\n"
- "fmla v3.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x22, x20]\n"
- "fmla v7.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x15]\n"
- "fmla v9.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x23]\n"
- "fmla v6.4s, v18.4s, v15.4s\n"
- "ldr q27, [x13, x16]\n"
- "fmla v4.4s, v24.4s, v17.4s\n"
- "prfm pldl1keep, [x13, x20]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x15]\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "prfm pldl1keep, [x24, x20]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "prfm pldl1keep, [x13, x15]\n"
- "fmla v2.4s, v24.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v8.4s, v24.4s, v15.4s\n"
- "ldr q24, [x22, x26]\n"
- "fmla v3.4s, v29.4s, v13.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v2.4s, v29.4s, v0.4s\n"
- "ldr q22, [x25, x9]\n"
- "fmla v7.4s, v30.4s, v11.4s\n"
- "ldr q21, [x24, x16]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v9.4s, v25.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v7.4s, v25.4s, v12.4s\n"
- "add x25, x25, #16\n"
- "fmla v6.4s, v25.4s, v16.4s\n"
- "ldr q19, [x13, x26]\n"
- "fmla v4.4s, v26.4s, v14.4s\n"
- "fmla v1.4s, v26.4s, v17.4s\n"
- "fmla v3.4s, v26.4s, v10.4s\n"
- "fmla v7.4s, v26.4s, v13.4s\n"
- "fmla v9.4s, v26.4s, v12.4s\n"
- "fmla v2.4s, v26.4s, v11.4s\n"
- "fmla v6.4s, v26.4s, v0.4s\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "fmla v5.4s, v26.4s, v15.4s\n"
- "ldr q26, [x22, x9]\n"
- "fmla v3.4s, v23.4s, v17.4s\n"
- "ldr q18, [x24, x26]\n"
- "fmla v9.4s, v23.4s, v13.4s\n"
- "add x22, x22, #16\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "fmla v8.4s, v23.4s, v0.4s\n"
- "fmla v7.4s, v28.4s, v10.4s\n"
- "ldr q23, [x13, x9]\n"
- "fmla v6.4s, v28.4s, v11.4s\n"
- "ldr q25, [x24, x9]\n"
- "fmla v2.4s, v20.4s, v13.4s\n"
- "add x13, x13, #16\n"
- "fmla v1.4s, v27.4s, v14.4s\n"
- "add x24, x24, #16\n"
- "fmla v7.4s, v27.4s, v17.4s\n"
- "fmla v9.4s, v27.4s, v10.4s\n"
- "fmla v6.4s, v27.4s, v12.4s\n"
- "fmla v8.4s, v27.4s, v11.4s\n"
- "fmla v5.4s, v27.4s, v16.4s\n"
- "fmla v3.4s, v24.4s, v14.4s\n"
- "fmla v9.4s, v24.4s, v17.4s\n"
- "fmla v2.4s, v24.4s, v10.4s\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "fmla v8.4s, v24.4s, v12.4s\n"
- "fmla v5.4s, v24.4s, v0.4s\n"
- "fmla v7.4s, v21.4s, v14.4s\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "fmla v9.4s, v19.4s, v14.4s\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "fmla v6.4s, v21.4s, v10.4s\n"
- "fmla v5.4s, v21.4s, v11.4s\n"
- "movi v29.16b, #0\n"
- "fmla v2.4s, v26.4s, v14.4s\n"
- "fmla v6.4s, v19.4s, v17.4s\n"
- "fmla v8.4s, v19.4s, v10.4s\n"
- "fmla v5.4s, v19.4s, v12.4s\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "fmla v6.4s, v18.4s, v14.4s\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "str q4, [%[outptr0]]\n"
- "fmla v8.4s, v26.4s, v17.4s\n"
- "str q3, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v5.4s, v26.4s, v13.4s\n"
- "str q2, [%[outptr0], x19]\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "fmla v8.4s, v23.4s, v14.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "str q1, [x21]\n"
- "fmla v5.4s, v18.4s, v10.4s\n"
- "fmax v9.4s, v9.4s, v29.4s\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmax v8.4s, v8.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "str q9, [x21, %[output_col_stride1]]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "str q8, [x21, x19]\n"
- "str q7, [x14]\n"
- "str q6, [x14, %[output_col_stride1]]\n"
- "add x21, x21, #16\n"
- "fmla v5.4s, v25.4s, v14.4s\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "str q5, [x14, x19]\n"
- "add x14, x14, #16\n"
- "4:\n"
- "cbz x27, 7f\n"
- "ldr s20, [%[wbptr]]\n"
- "mov v4.16b, v20.16b\n"
- "ldr s15, [%[wbptr], #4]\n"
- "mov v1.16b, v20.16b\n"
- "ldr s0, [%[wbptr], #8]\n"
- "mov v3.16b, v20.16b\n"
- "ldr s13, [%[wbptr], #12]\n"
- "mov v7.16b, v20.16b\n"
- "ldr s16, [%[wbptr], #16]\n"
- "mov v9.16b, v20.16b\n"
- "ldr s12, [%[wbptr], #20]\n"
- "mov v2.16b, v20.16b\n"
- "ldr s17, [%[wbptr], #24]\n"
- "mov v6.16b, v20.16b\n"
- "ldr s11, [%[wbptr], #28]\n"
- "mov v8.16b, v20.16b\n"
- "ldr s10, [%[wbptr], #32]\n"
- "mov v5.16b, v20.16b\n"
- "ldr s14, [%[wbptr], #36]\n"
- "ldr s27, [%[inptr0]]\n"
- "subs x27, x27, #1\n"
- "fmla v4.4s, v27.4s, v15.4s\n"
- "ldr s24, [x25]\n"
- "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
- "ldr s21, [x22]\n"
- "ldr s19, [x25, %[input_col_stride1]]\n"
- "ldr s31, [%[inptr0], x16]\n"
- "fmla v4.4s, v24.4s, v16.4s\n"
- "ldr s28, [x13]\n"
- "ldr s18, [x22, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x25, #64]\n"
- "prfm pldl1keep, [%[inptr0], x17]\n"
- "prfm pldl1keep, [x22, #64]\n"
- "prfm pldl1keep, [x25, x17]\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "prfm pldl1keep, [x13, #64]\n"
- "prfm pldl1keep, [x22, x17]\n"
- "beq 6f\n"
- "5:\n"
- "fmla v1.4s, v24.4s, v15.4s\n"
- "ldr s24, [x25, x16]\n"
- "fmla v4.4s, v22.4s, v0.4s\n"
- "ldr s29, [%[inptr0], x26]\n"
- "fmla v3.4s, v22.4s, v15.4s\n"
- "ldr s30, [x24]\n"
- "fmla v1.4s, v21.4s, v16.4s\n"
- "ldr s25, [x13, %[input_col_stride1]]\n"
- "fmla v4.4s, v21.4s, v11.4s\n"
- "prfm pldl1keep, [x25, x23]\n"
- "fmla v7.4s, v21.4s, v15.4s\n"
- "ldr s26, [x22, x16]\n"
- "fmla v1.4s, v19.4s, v0.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v4.4s, v19.4s, v12.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v3.4s, v19.4s, v16.4s\n"
- "prfm pldl1keep, [x13, x17]\n"
- "fmla v9.4s, v19.4s, v15.4s\n"
- "ldr s23, [x25, x26]\n"
- "fmla v4.4s, v31.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x23]\n"
- "fmla v3.4s, v31.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x20]\n"
- "fmla v2.4s, v31.4s, v15.4s\n"
- "ldr s20, [%[inptr0], x9]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "ldr s28, [x24, %[input_col_stride1]]\n"
- "fmla v4.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x17]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "prfm pldl1keep, [x13, x23]\n"
- "fmla v3.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x22, x20]\n"
- "fmla v7.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x15]\n"
- "fmla v9.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x23]\n"
- "fmla v6.4s, v18.4s, v15.4s\n"
- "ldr s27, [x13, x16]\n"
- "fmla v4.4s, v24.4s, v17.4s\n"
- "prfm pldl1keep, [x13, x20]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x15]\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "prfm pldl1keep, [x24, x20]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "prfm pldl1keep, [x13, x15]\n"
- "fmla v2.4s, v24.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v8.4s, v24.4s, v15.4s\n"
- "ldr s24, [x22, x26]\n"
- "fmla v3.4s, v29.4s, v13.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v2.4s, v29.4s, v0.4s\n"
- "ldr s22, [x25, x9]\n"
- "fmla v7.4s, v30.4s, v11.4s\n"
- "ldr s21, [x24, x16]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v9.4s, v25.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v7.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v6.4s, v25.4s, v16.4s\n"
- "ldr s19, [x13, x26]\n"
- "fmla v4.4s, v26.4s, v14.4s\n"
- "prfm pldl1keep, [%[inptr0], x17]\n"
- "fmla v1.4s, v26.4s, v17.4s\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "fmla v3.4s, v26.4s, v10.4s\n"
- "add x25, x25, #4\n"
- "fmla v7.4s, v26.4s, v13.4s\n"
- "prfm pldl1keep, [x25, #64]\n"
- "fmla v9.4s, v26.4s, v12.4s\n"
- "prfm pldl1keep, [x25, x17]\n"
- "fmla v2.4s, v26.4s, v11.4s\n"
- "subs x27, x27, #1\n"
- "fmla v6.4s, v26.4s, v0.4s\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "fmla v5.4s, v26.4s, v15.4s\n"
- "ldr s26, [x22, x9]\n"
- "fmla v3.4s, v23.4s, v17.4s\n"
- "ldr s18, [x24, x26]\n"
- "fmla v9.4s, v23.4s, v13.4s\n"
- "add x22, x22, #4\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v8.4s, v23.4s, v0.4s\n"
- "ldr s23, [x13, x9]\n"
- "fmla v7.4s, v28.4s, v10.4s\n"
- "prfm pldl1keep, [x22, x17]\n"
- "fmla v2.4s, v20.4s, v13.4s\n"
- "ldr s25, [x24, x9]\n"
- "fmla v6.4s, v28.4s, v11.4s\n"
- "ldr s20, [%[wbptr]]\n"
- "fmla v1.4s, v27.4s, v14.4s\n"
- "add x13, x13, #4\n"
- "fmla v7.4s, v27.4s, v17.4s\n"
- "prfm pldl1keep, [x13, #64]\n"
- "fmla v9.4s, v27.4s, v10.4s\n"
- "add x24, x24, #4\n"
- "fmla v6.4s, v27.4s, v12.4s\n"
- "fmla v8.4s, v27.4s, v11.4s\n"
- "fmla v5.4s, v27.4s, v16.4s\n"
- "ldr s15, [%[wbptr], #4]\n"
- "fmla v3.4s, v24.4s, v14.4s\n"
- "ldr s27, [%[inptr0]]\n"
- "fmla v9.4s, v24.4s, v17.4s\n"
- "fmla v2.4s, v24.4s, v10.4s\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "fmla v8.4s, v24.4s, v12.4s\n"
- "fmla v5.4s, v24.4s, v0.4s\n"
- "ldr s16, [%[wbptr], #16]\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "ldr s24, [x25]\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v7.4s, v21.4s, v14.4s\n"
- "fmla v6.4s, v21.4s, v10.4s\n"
- "fmla v5.4s, v21.4s, v11.4s\n"
- "ldr s0, [%[wbptr], #8]\n"
- "fmla v9.4s, v19.4s, v14.4s\n"
- "ldr s21, [x22]\n"
- "fmla v6.4s, v19.4s, v17.4s\n"
- "fmla v8.4s, v19.4s, v10.4s\n"
- "fmla v5.4s, v19.4s, v12.4s\n"
- "ldr s11, [%[wbptr], #28]\n"
- "fmla v2.4s, v26.4s, v14.4s\n"
- "movi v29.16b, #0\n"
- "fmla v8.4s, v26.4s, v17.4s\n"
- "fmla v6.4s, v18.4s, v14.4s\n"
- "fmla v5.4s, v26.4s, v13.4s\n"
- "ldr s12, [%[wbptr], #20]\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "ldr s19, [x25, %[input_col_stride1]]\n"
- "fmla v8.4s, v23.4s, v14.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "str s4, [%[outptr0]]\n"
- "fmla v5.4s, v18.4s, v10.4s\n"
- "str s3, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "ldr s13, [%[wbptr], #12]\n"
- "str s2, [%[outptr0], x19]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "str s1, [x21]\n"
- "fmax v9.4s, v9.4s, v29.4s\n"
- "fmax v8.4s, v8.4s, v29.4s\n"
- "ldr s10, [%[wbptr], #32]\n"
- "str s9, [x21, %[output_col_stride1]]\n"
- "fmla v5.4s, v25.4s, v14.4s\n"
- "str s8, [x21, x19]\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "ldr s17, [%[wbptr], #24]\n"
- "str s7, [x14]\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "str s6, [x14, %[output_col_stride1]]\n"
- "mov v4.16b, v20.16b\n"
- "str s5, [x14, x19]\n"
- "mov v1.16b, v20.16b\n"
- "mov v3.16b, v20.16b\n"
- "ldr s14, [%[wbptr], #36]\n"
- "mov v7.16b, v20.16b\n"
- "ldr s31, [%[inptr0], x16]\n"
- "mov v9.16b, v20.16b\n"
- "ldr s28, [x13]\n"
- "mov v2.16b, v20.16b\n"
- "ldr s18, [x22, %[input_col_stride1]]\n"
- "mov v6.16b, v20.16b\n"
- "add %[outptr0], %[outptr0], #4\n"
- "mov v8.16b, v20.16b\n"
- "add x21, x21, #4\n"
- "mov v5.16b, v20.16b\n"
- "add x14, x14, #4\n"
- "fmla v4.4s, v27.4s, v15.4s\n"
- "fmla v4.4s, v24.4s, v16.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v1.4s, v24.4s, v15.4s\n"
- "ldr s24, [x25, x16]\n"
- "fmla v4.4s, v22.4s, v0.4s\n"
- "ldr s29, [%[inptr0], x26]\n"
- "fmla v3.4s, v22.4s, v15.4s\n"
- "ldr s30, [x24]\n"
- "fmla v1.4s, v21.4s, v16.4s\n"
- "ldr s25, [x13, %[input_col_stride1]]\n"
- "fmla v4.4s, v21.4s, v11.4s\n"
- "prfm pldl1keep, [x25, x23]\n"
- "fmla v7.4s, v21.4s, v15.4s\n"
- "ldr s26, [x22, x16]\n"
- "fmla v1.4s, v19.4s, v0.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v4.4s, v19.4s, v12.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v3.4s, v19.4s, v16.4s\n"
- "prfm pldl1keep, [x13, x17]\n"
- "fmla v9.4s, v19.4s, v15.4s\n"
- "ldr s23, [x25, x26]\n"
- "fmla v4.4s, v31.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x23]\n"
- "fmla v3.4s, v31.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x20]\n"
- "fmla v2.4s, v31.4s, v15.4s\n"
- "ldr s20, [%[inptr0], x9]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "ldr s28, [x24, %[input_col_stride1]]\n"
- "fmla v4.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x17]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "prfm pldl1keep, [x13, x23]\n"
- "fmla v3.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x22, x20]\n"
- "fmla v7.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x15]\n"
- "fmla v9.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x23]\n"
- "fmla v6.4s, v18.4s, v15.4s\n"
- "ldr s27, [x13, x16]\n"
- "fmla v4.4s, v24.4s, v17.4s\n"
- "prfm pldl1keep, [x13, x20]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x15]\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "prfm pldl1keep, [x24, x20]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "prfm pldl1keep, [x13, x15]\n"
- "fmla v2.4s, v24.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v8.4s, v24.4s, v15.4s\n"
- "ldr s24, [x22, x26]\n"
- "fmla v3.4s, v29.4s, v13.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v2.4s, v29.4s, v0.4s\n"
- "ldr s22, [x25, x9]\n"
- "fmla v7.4s, v30.4s, v11.4s\n"
- "ldr s21, [x24, x16]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v9.4s, v25.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v7.4s, v25.4s, v12.4s\n"
- "add x25, x25, #4\n"
- "fmla v6.4s, v25.4s, v16.4s\n"
- "ldr s19, [x13, x26]\n"
- "fmla v4.4s, v26.4s, v14.4s\n"
- "fmla v1.4s, v26.4s, v17.4s\n"
- "fmla v3.4s, v26.4s, v10.4s\n"
- "fmla v7.4s, v26.4s, v13.4s\n"
- "fmla v9.4s, v26.4s, v12.4s\n"
- "fmla v2.4s, v26.4s, v11.4s\n"
- "fmla v6.4s, v26.4s, v0.4s\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "fmla v5.4s, v26.4s, v15.4s\n"
- "ldr s26, [x22, x9]\n"
- "fmla v3.4s, v23.4s, v17.4s\n"
- "ldr s18, [x24, x26]\n"
- "fmla v9.4s, v23.4s, v13.4s\n"
- "add x22, x22, #4\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "fmla v8.4s, v23.4s, v0.4s\n"
- "fmla v7.4s, v28.4s, v10.4s\n"
- "ldr s23, [x13, x9]\n"
- "fmla v6.4s, v28.4s, v11.4s\n"
- "ldr s25, [x24, x9]\n"
- "fmla v2.4s, v20.4s, v13.4s\n"
- "add x13, x13, #4\n"
- "fmla v1.4s, v27.4s, v14.4s\n"
- "add x24, x24, #4\n"
- "fmla v7.4s, v27.4s, v17.4s\n"
- "fmla v9.4s, v27.4s, v10.4s\n"
- "fmla v6.4s, v27.4s, v12.4s\n"
- "fmla v8.4s, v27.4s, v11.4s\n"
- "fmla v5.4s, v27.4s, v16.4s\n"
- "fmla v3.4s, v24.4s, v14.4s\n"
- "fmla v9.4s, v24.4s, v17.4s\n"
- "fmla v2.4s, v24.4s, v10.4s\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "fmla v8.4s, v24.4s, v12.4s\n"
- "fmla v5.4s, v24.4s, v0.4s\n"
- "fmla v7.4s, v21.4s, v14.4s\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "fmla v9.4s, v19.4s, v14.4s\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "fmla v6.4s, v21.4s, v10.4s\n"
- "fmla v5.4s, v21.4s, v11.4s\n"
- "movi v29.16b, #0\n"
- "fmla v2.4s, v26.4s, v14.4s\n"
- "fmla v6.4s, v19.4s, v17.4s\n"
- "fmla v8.4s, v19.4s, v10.4s\n"
- "fmla v5.4s, v19.4s, v12.4s\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "fmla v6.4s, v18.4s, v14.4s\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "str s4, [%[outptr0]]\n"
- "fmla v8.4s, v26.4s, v17.4s\n"
- "str s3, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v5.4s, v26.4s, v13.4s\n"
- "str s2, [%[outptr0], x19]\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "fmla v8.4s, v23.4s, v14.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "str s1, [x21]\n"
- "fmla v5.4s, v18.4s, v10.4s\n"
- "fmax v9.4s, v9.4s, v29.4s\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmax v8.4s, v8.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "str s9, [x21, %[output_col_stride1]]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "str s8, [x21, x19]\n"
- "str s7, [x14]\n"
- "str s6, [x14, %[output_col_stride1]]\n"
- "add x21, x21, #4\n"
- "fmla v5.4s, v25.4s, v14.4s\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "str s5, [x14, x19]\n"
- "add x14, x14, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x17, %[inptr0], %[input_row_stride]\n"
- "add x9, %[input_col_stride1], %[input_col_stride1]\n"
- "add x25, %[outptr0], %[output_row_stride]\n"
- "add x14, x17, %[input_row_stride]\n"
- "add x22, x9, #64\n"
- "add x15, x9, %[input_col_stride1]\n"
- "add x21, x14, %[input_row_stride]\n"
- "add x16, x15, #64\n"
- "add x24, x15, %[input_col_stride1]\n"
- "add x26, x21, %[input_row_stride]\n"
- "add x23, x24, #64\n"
- "add x13, x25, %[output_row_stride]\n"
- "add x27, %[output_col_stride1], %[output_col_stride1]\n"
- "and x19, %[n_channels], #3\n"
- "lsr x20, %[n_channels], #2\n"
- "cbz x20, 4f\n"
- "1:\n"
- "ldr q19, [%[wbptr]]\n"
- "subs x20, x20, #1\n"
- "mov v8.16b, v19.16b\n"
- "ldr q17, [%[wbptr], #16]\n"
- "mov v5.16b, v19.16b\n"
- "ldr q16, [%[wbptr], #32]\n"
- "mov v7.16b, v19.16b\n"
- "ldr q15, [%[wbptr], #48]\n"
- "mov v2.16b, v19.16b\n"
- "ldr q14, [%[wbptr], #64]\n"
- "mov v4.16b, v19.16b\n"
- "ldr q13, [%[wbptr], #80]\n"
- "mov v6.16b, v19.16b\n"
- "ldr q12, [%[wbptr], #96]\n"
- "mov v1.16b, v19.16b\n"
- "ldr q11, [%[wbptr], #112]\n"
- "mov v3.16b, v19.16b\n"
- "ldr q10, [%[wbptr], #128]\n"
- "mov v0.16b, v19.16b\n"
- "ldr q9, [%[wbptr], #144]\n"
- "ldr q25, [%[inptr0]]\n"
- "ldr q27, [x17]\n"
- "fmla v8.4s, v25.4s, v17.4s\n"
- "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q20, [x14]\n"
- "ldr q22, [x17, %[input_col_stride1]]\n"
- "ldr q28, [%[inptr0], x9]\n"
- "ldr q23, [x21]\n"
- "fmla v8.4s, v27.4s, v14.4s\n"
- "ldr q18, [x14, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x17, #64]\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "prfm pldl1keep, [x14, #64]\n"
- "prfm pldl1keep, [x17, x28]\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "prfm pldl1keep, [x21, #64]\n"
- "prfm pldl1keep, [x14, x28]\n"
- "beq 3f\n"
- "2:\n"
- "fmla v5.4s, v27.4s, v17.4s\n"
- "ldr q27, [x17, x9]\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "ldr q30, [%[inptr0], x15]\n"
- "fmla v7.4s, v26.4s, v17.4s\n"
- "ldr q31, [x26]\n"
- "fmla v5.4s, v20.4s, v14.4s\n"
- "ldr q24, [x21, %[input_col_stride1]]\n"
- "fmla v8.4s, v20.4s, v11.4s\n"
- "prfm pldl1keep, [x17, x22]\n"
- "fmla v2.4s, v20.4s, v17.4s\n"
- "ldr q29, [x14, x9]\n"
- "fmla v5.4s, v22.4s, v16.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x26, #64]\n"
- "fmla v7.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x21, x28]\n"
- "fmla v4.4s, v22.4s, v17.4s\n"
- "ldr q21, [x17, x15]\n"
- "fmla v8.4s, v28.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x22]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x16]\n"
- "fmla v6.4s, v28.4s, v17.4s\n"
- "ldr q19, [%[inptr0], x24]\n"
- "fmla v5.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "ldr q28, [x26, %[input_col_stride1]]\n"
- "fmla v8.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x26, x28]\n"
- "fmla v5.4s, v18.4s, v13.4s\n"
- "prfm pldl1keep, [x21, x22]\n"
- "fmla v7.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x14, x16]\n"
- "fmla v2.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x23]\n"
- "fmla v4.4s, v18.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x22]\n"
- "fmla v1.4s, v18.4s, v17.4s\n"
- "ldr q25, [x21, x9]\n"
- "fmla v8.4s, v27.4s, v12.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v5.4s, v27.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x23]\n"
- "fmla v7.4s, v27.4s, v13.4s\n"
- "prfm pldl1keep, [x26, x16]\n"
- "fmla v4.4s, v27.4s, v16.4s\n"
- "prfm pldl1keep, [x21, x23]\n"
- "fmla v6.4s, v27.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x23]\n"
- "fmla v3.4s, v27.4s, v17.4s\n"
- "ldr q27, [x14, x15]\n"
- "fmla v7.4s, v30.4s, v15.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v6.4s, v30.4s, v16.4s\n"
- "ldr q26, [x17, x24]\n"
- "fmla v2.4s, v31.4s, v11.4s\n"
- "ldr q20, [x26, x9]\n"
- "fmla v5.4s, v24.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v4.4s, v24.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v2.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v1.4s, v24.4s, v14.4s\n"
- "ldr q18, [x21, x15]\n"
- "fmla v8.4s, v29.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "fmla v5.4s, v29.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "fmla v7.4s, v29.4s, v10.4s\n"
- "add x17, x17, #16\n"
- "fmla v2.4s, v29.4s, v15.4s\n"
- "prfm pldl1keep, [x17, #64]\n"
- "fmla v4.4s, v29.4s, v13.4s\n"
- "prfm pldl1keep, [x17, x28]\n"
- "fmla v6.4s, v29.4s, v11.4s\n"
- "subs x20, x20, #1\n"
- "fmla v1.4s, v29.4s, v16.4s\n"
- "fmla v3.4s, v29.4s, v14.4s\n"
- "fmla v0.4s, v29.4s, v17.4s\n"
- "ldr q22, [x14, x24]\n"
- "fmla v7.4s, v21.4s, v12.4s\n"
- "ldr q23, [x26, x15]\n"
- "fmla v4.4s, v21.4s, v15.4s\n"
- "add x14, x14, #16\n"
- "fmla v6.4s, v21.4s, v13.4s\n"
- "prfm pldl1keep, [x14, #64]\n"
- "fmla v3.4s, v21.4s, v16.4s\n"
- "ldr q24, [x21, x24]\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "prfm pldl1keep, [x14, x28]\n"
- "fmla v6.4s, v19.4s, v15.4s\n"
- "ldr q21, [x26, x24]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "ldr q19, [%[wbptr]]\n"
- "fmla v5.4s, v25.4s, v9.4s\n"
- "add x21, x21, #16\n"
- "fmla v2.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x21, #64]\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "add x26, x26, #16\n"
- "fmla v1.4s, v25.4s, v13.4s\n"
- "fmla v3.4s, v25.4s, v11.4s\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "ldr q17, [%[wbptr], #16]\n"
- "fmla v7.4s, v27.4s, v9.4s\n"
- "ldr q25, [%[inptr0]]\n"
- "fmla v4.4s, v27.4s, v12.4s\n"
- "fmla v6.4s, v27.4s, v10.4s\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "fmla v3.4s, v27.4s, v13.4s\n"
- "fmla v0.4s, v27.4s, v16.4s\n"
- "ldr q14, [%[wbptr], #64]\n"
- "fmla v6.4s, v26.4s, v12.4s\n"
- "ldr q27, [x17]\n"
- "fmla v3.4s, v26.4s, v15.4s\n"
- "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v20.4s, v9.4s\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "fmla v0.4s, v20.4s, v11.4s\n"
- "ldr q16, [%[wbptr], #32]\n"
- "fmla v4.4s, v18.4s, v9.4s\n"
- "ldr q20, [x14]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "fmla v0.4s, v18.4s, v13.4s\n"
- "ldr q11, [%[wbptr], #112]\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "movi v30.16b, #0\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "fmla v0.4s, v22.4s, v15.4s\n"
- "ldr q13, [%[wbptr], #80]\n"
- "fmov v29.4s, #6.0\n"
- "fmax v8.4s, v8.4s, v30.4s\n"
- "fmla v3.4s, v24.4s, v9.4s\n"
- "fmax v7.4s, v7.4s, v30.4s\n"
- "fmla v0.4s, v23.4s, v10.4s\n"
- "ldr q15, [%[wbptr], #48]\n"
- "fmin v8.4s, v8.4s, v29.4s\n"
- "ldr q22, [x17, %[input_col_stride1]]\n"
- "fmin v7.4s, v7.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v30.4s\n"
- "str q8, [%[outptr0]]\n"
- "fmla v0.4s, v24.4s, v12.4s\n"
- "str q7, [%[outptr0], %[output_col_stride1]]\n"
- "fmin v6.4s, v6.4s, v29.4s\n"
- "fmax v5.4s, v5.4s, v30.4s\n"
- "ldr q10, [%[wbptr], #128]\n"
- "str q6, [%[outptr0], x27]\n"
- "fmla v0.4s, v21.4s, v9.4s\n"
- "fmin v5.4s, v5.4s, v29.4s\n"
- "ldr q12, [%[wbptr], #96]\n"
- "fmax v4.4s, v4.4s, v30.4s\n"
- "ldr q28, [%[inptr0], x9]\n"
- "str q5, [x25]\n"
- "fmax v3.4s, v3.4s, v30.4s\n"
- "fmin v4.4s, v4.4s, v29.4s\n"
- "ldr q9, [%[wbptr], #144]\n"
- "fmin v3.4s, v3.4s, v29.4s\n"
- "ldr q23, [x21]\n"
- "str q4, [x25, %[output_col_stride1]]\n"
- "fmax v2.4s, v2.4s, v30.4s\n"
- "str q3, [x25, x27]\n"
- "fmax v1.4s, v1.4s, v30.4s\n"
- "fmin v2.4s, v2.4s, v29.4s\n"
- "ldr q18, [x14, %[input_col_stride1]]\n"
- "fmin v1.4s, v1.4s, v29.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "str q2, [x13]\n"
- "fmax v0.4s, v0.4s, v30.4s\n"
- "str q1, [x13, %[output_col_stride1]]\n"
- "mov v8.16b, v19.16b\n"
- "fmin v0.4s, v0.4s, v29.4s\n"
- "add x25, x25, #16\n"
- "mov v5.16b, v19.16b\n"
- "mov v7.16b, v19.16b\n"
- "str q0, [x13, x27]\n"
- "mov v2.16b, v19.16b\n"
- "mov v4.16b, v19.16b\n"
- "add x13, x13, #16\n"
- "mov v6.16b, v19.16b\n"
- "mov v1.16b, v19.16b\n"
- "mov v3.16b, v19.16b\n"
- "mov v0.16b, v19.16b\n"
- "fmla v8.4s, v25.4s, v17.4s\n"
- "fmla v8.4s, v27.4s, v14.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v5.4s, v27.4s, v17.4s\n"
- "ldr q27, [x17, x9]\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "ldr q30, [%[inptr0], x15]\n"
- "fmla v7.4s, v26.4s, v17.4s\n"
- "ldr q31, [x26]\n"
- "fmla v5.4s, v20.4s, v14.4s\n"
- "ldr q24, [x21, %[input_col_stride1]]\n"
- "fmla v8.4s, v20.4s, v11.4s\n"
- "prfm pldl1keep, [x17, x22]\n"
- "fmla v2.4s, v20.4s, v17.4s\n"
- "ldr q29, [x14, x9]\n"
- "fmla v5.4s, v22.4s, v16.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x26, #64]\n"
- "fmla v7.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x21, x28]\n"
- "fmla v4.4s, v22.4s, v17.4s\n"
- "ldr q21, [x17, x15]\n"
- "fmla v8.4s, v28.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x22]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x16]\n"
- "fmla v6.4s, v28.4s, v17.4s\n"
- "ldr q19, [%[inptr0], x24]\n"
- "fmla v5.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "ldr q28, [x26, %[input_col_stride1]]\n"
- "fmla v8.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x26, x28]\n"
- "fmla v5.4s, v18.4s, v13.4s\n"
- "prfm pldl1keep, [x21, x22]\n"
- "fmla v7.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x14, x16]\n"
- "fmla v2.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x23]\n"
- "fmla v4.4s, v18.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x22]\n"
- "fmla v1.4s, v18.4s, v17.4s\n"
- "ldr q25, [x21, x9]\n"
- "fmla v8.4s, v27.4s, v12.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v5.4s, v27.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x23]\n"
- "fmla v7.4s, v27.4s, v13.4s\n"
- "prfm pldl1keep, [x26, x16]\n"
- "fmla v4.4s, v27.4s, v16.4s\n"
- "prfm pldl1keep, [x21, x23]\n"
- "fmla v6.4s, v27.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x23]\n"
- "fmla v3.4s, v27.4s, v17.4s\n"
- "ldr q27, [x14, x15]\n"
- "fmla v7.4s, v30.4s, v15.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v6.4s, v30.4s, v16.4s\n"
- "ldr q26, [x17, x24]\n"
- "fmla v2.4s, v31.4s, v11.4s\n"
- "ldr q20, [x26, x9]\n"
- "fmla v5.4s, v24.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v4.4s, v24.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v2.4s, v24.4s, v13.4s\n"
- "add x17, x17, #16\n"
- "fmla v1.4s, v24.4s, v14.4s\n"
- "ldr q18, [x21, x15]\n"
- "fmla v8.4s, v29.4s, v9.4s\n"
- "fmla v5.4s, v29.4s, v12.4s\n"
- "fmla v7.4s, v29.4s, v10.4s\n"
- "fmla v2.4s, v29.4s, v15.4s\n"
- "fmla v4.4s, v29.4s, v13.4s\n"
- "fmla v6.4s, v29.4s, v11.4s\n"
- "fmla v1.4s, v29.4s, v16.4s\n"
- "fmla v3.4s, v29.4s, v14.4s\n"
- "fmla v0.4s, v29.4s, v17.4s\n"
- "ldr q22, [x14, x24]\n"
- "fmla v7.4s, v21.4s, v12.4s\n"
- "ldr q23, [x26, x15]\n"
- "fmla v4.4s, v21.4s, v15.4s\n"
- "add x14, x14, #16\n"
- "fmla v6.4s, v21.4s, v13.4s\n"
- "fmla v3.4s, v21.4s, v16.4s\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "ldr q24, [x21, x24]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "ldr q21, [x26, x24]\n"
- "fmla v6.4s, v19.4s, v15.4s\n"
- "add x21, x21, #16\n"
- "fmla v5.4s, v25.4s, v9.4s\n"
- "add x26, x26, #16\n"
- "fmla v2.4s, v25.4s, v12.4s\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "fmla v1.4s, v25.4s, v13.4s\n"
- "fmla v3.4s, v25.4s, v11.4s\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "fmla v7.4s, v27.4s, v9.4s\n"
- "fmla v4.4s, v27.4s, v12.4s\n"
- "fmla v6.4s, v27.4s, v10.4s\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "fmla v3.4s, v27.4s, v13.4s\n"
- "fmla v0.4s, v27.4s, v16.4s\n"
- "fmla v2.4s, v20.4s, v9.4s\n"
- "fmla v6.4s, v26.4s, v12.4s\n"
- "fmla v4.4s, v18.4s, v9.4s\n"
- "fmla v3.4s, v26.4s, v15.4s\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "fmla v0.4s, v20.4s, v11.4s\n"
- "movi v30.16b, #0\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "fmov v29.4s, #6.0\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "fmla v0.4s, v18.4s, v13.4s\n"
- "fmax v8.4s, v8.4s, v30.4s\n"
- "fmax v7.4s, v7.4s, v30.4s\n"
- "fmax v6.4s, v6.4s, v30.4s\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "fmla v0.4s, v22.4s, v15.4s\n"
- "fmin v8.4s, v8.4s, v29.4s\n"
- "fmin v7.4s, v7.4s, v29.4s\n"
- "fmin v6.4s, v6.4s, v29.4s\n"
- "str q8, [%[outptr0]]\n"
- "fmla v3.4s, v24.4s, v9.4s\n"
- "str q7, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v23.4s, v10.4s\n"
- "str q6, [%[outptr0], x27]\n"
- "fmax v5.4s, v5.4s, v30.4s\n"
- "fmax v4.4s, v4.4s, v30.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v0.4s, v24.4s, v12.4s\n"
- "fmin v5.4s, v5.4s, v29.4s\n"
- "fmin v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v30.4s\n"
- "str q5, [x25]\n"
- "fmax v2.4s, v2.4s, v30.4s\n"
- "str q4, [x25, %[output_col_stride1]]\n"
- "fmla v0.4s, v21.4s, v9.4s\n"
- "fmin v3.4s, v3.4s, v29.4s\n"
- "fmin v2.4s, v2.4s, v29.4s\n"
- "fmax v1.4s, v1.4s, v30.4s\n"
- "str q3, [x25, x27]\n"
- "str q2, [x13]\n"
- "fmin v1.4s, v1.4s, v29.4s\n"
- "fmax v0.4s, v0.4s, v30.4s\n"
- "add x25, x25, #16\n"
- "str q1, [x13, %[output_col_stride1]]\n"
- "fmin v0.4s, v0.4s, v29.4s\n"
- "str q0, [x13, x27]\n"
- "add x13, x13, #16\n"
- "4:\n"
- "cbz x19, 7f\n"
- "ldr s19, [%[wbptr]]\n"
- "mov v8.16b, v19.16b\n"
- "ldr s17, [%[wbptr], #4]\n"
- "mov v5.16b, v19.16b\n"
- "ldr s16, [%[wbptr], #8]\n"
- "mov v7.16b, v19.16b\n"
- "ldr s15, [%[wbptr], #12]\n"
- "mov v2.16b, v19.16b\n"
- "ldr s14, [%[wbptr], #16]\n"
- "mov v4.16b, v19.16b\n"
- "ldr s13, [%[wbptr], #20]\n"
- "mov v6.16b, v19.16b\n"
- "ldr s12, [%[wbptr], #24]\n"
- "mov v1.16b, v19.16b\n"
- "ldr s11, [%[wbptr], #28]\n"
- "mov v3.16b, v19.16b\n"
- "ldr s10, [%[wbptr], #32]\n"
- "mov v0.16b, v19.16b\n"
- "ldr s9, [%[wbptr], #36]\n"
- "ldr s25, [%[inptr0]]\n"
- "subs x19, x19, #1\n"
- "fmla v8.4s, v25.4s, v17.4s\n"
- "ldr s27, [x17]\n"
- "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
- "ldr s20, [x14]\n"
- "ldr s22, [x17, %[input_col_stride1]]\n"
- "ldr s28, [%[inptr0], x9]\n"
- "fmla v8.4s, v27.4s, v14.4s\n"
- "ldr s23, [x21]\n"
- "ldr s18, [x14, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x17, #64]\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "prfm pldl1keep, [x14, #64]\n"
- "prfm pldl1keep, [x17, x28]\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "prfm pldl1keep, [x21, #64]\n"
- "prfm pldl1keep, [x14, x28]\n"
- "beq 6f\n"
- "5:\n"
- "fmla v5.4s, v27.4s, v17.4s\n"
- "ldr s27, [x17, x9]\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "ldr s30, [%[inptr0], x15]\n"
- "fmla v7.4s, v26.4s, v17.4s\n"
- "ldr s31, [x26]\n"
- "fmla v5.4s, v20.4s, v14.4s\n"
- "ldr s24, [x21, %[input_col_stride1]]\n"
- "fmla v8.4s, v20.4s, v11.4s\n"
- "prfm pldl1keep, [x17, x22]\n"
- "fmla v2.4s, v20.4s, v17.4s\n"
- "ldr s29, [x14, x9]\n"
- "fmla v5.4s, v22.4s, v16.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x26, #64]\n"
- "fmla v7.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x21, x28]\n"
- "fmla v4.4s, v22.4s, v17.4s\n"
- "ldr s21, [x17, x15]\n"
- "fmla v8.4s, v28.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x22]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x16]\n"
- "fmla v6.4s, v28.4s, v17.4s\n"
- "ldr s19, [%[inptr0], x24]\n"
- "fmla v5.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "ldr s28, [x26, %[input_col_stride1]]\n"
- "fmla v8.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x26, x28]\n"
- "fmla v5.4s, v18.4s, v13.4s\n"
- "prfm pldl1keep, [x21, x22]\n"
- "fmla v7.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x14, x16]\n"
- "fmla v2.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x23]\n"
- "fmla v4.4s, v18.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x22]\n"
- "fmla v1.4s, v18.4s, v17.4s\n"
- "ldr s25, [x21, x9]\n"
- "fmla v8.4s, v27.4s, v12.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v5.4s, v27.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x23]\n"
- "fmla v7.4s, v27.4s, v13.4s\n"
- "prfm pldl1keep, [x26, x16]\n"
- "fmla v4.4s, v27.4s, v16.4s\n"
- "prfm pldl1keep, [x21, x23]\n"
- "fmla v6.4s, v27.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x23]\n"
- "fmla v3.4s, v27.4s, v17.4s\n"
- "ldr s27, [x14, x15]\n"
- "fmla v7.4s, v30.4s, v15.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v6.4s, v30.4s, v16.4s\n"
- "ldr s26, [x17, x24]\n"
- "fmla v2.4s, v31.4s, v11.4s\n"
- "ldr s20, [x26, x9]\n"
- "fmla v5.4s, v24.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v4.4s, v24.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v2.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v1.4s, v24.4s, v14.4s\n"
- "ldr s18, [x21, x15]\n"
- "fmla v8.4s, v29.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "fmla v5.4s, v29.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "fmla v7.4s, v29.4s, v10.4s\n"
- "add x17, x17, #4\n"
- "fmla v2.4s, v29.4s, v15.4s\n"
- "prfm pldl1keep, [x17, #64]\n"
- "fmla v4.4s, v29.4s, v13.4s\n"
- "prfm pldl1keep, [x17, x28]\n"
- "fmla v6.4s, v29.4s, v11.4s\n"
- "subs x19, x19, #1\n"
- "fmla v1.4s, v29.4s, v16.4s\n"
- "fmla v3.4s, v29.4s, v14.4s\n"
- "fmla v0.4s, v29.4s, v17.4s\n"
- "ldr s22, [x14, x24]\n"
- "fmla v7.4s, v21.4s, v12.4s\n"
- "ldr s23, [x26, x15]\n"
- "fmla v4.4s, v21.4s, v15.4s\n"
- "add x14, x14, #4\n"
- "fmla v6.4s, v21.4s, v13.4s\n"
- "prfm pldl1keep, [x14, #64]\n"
- "fmla v3.4s, v21.4s, v16.4s\n"
- "ldr s24, [x21, x24]\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "prfm pldl1keep, [x14, x28]\n"
- "fmla v6.4s, v19.4s, v15.4s\n"
- "ldr s21, [x26, x24]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "ldr s19, [%[wbptr]]\n"
- "fmla v5.4s, v25.4s, v9.4s\n"
- "add x21, x21, #4\n"
- "fmla v2.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x21, #64]\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "add x26, x26, #4\n"
- "fmla v1.4s, v25.4s, v13.4s\n"
- "fmla v3.4s, v25.4s, v11.4s\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "ldr s17, [%[wbptr], #4]\n"
- "fmla v7.4s, v27.4s, v9.4s\n"
- "ldr s25, [%[inptr0]]\n"
- "fmla v4.4s, v27.4s, v12.4s\n"
- "fmla v6.4s, v27.4s, v10.4s\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "fmla v3.4s, v27.4s, v13.4s\n"
- "fmla v0.4s, v27.4s, v16.4s\n"
- "ldr s14, [%[wbptr], #16]\n"
- "fmla v6.4s, v26.4s, v12.4s\n"
- "ldr s27, [x17]\n"
- "fmla v3.4s, v26.4s, v15.4s\n"
- "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v20.4s, v9.4s\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "fmla v0.4s, v20.4s, v11.4s\n"
- "ldr s16, [%[wbptr], #8]\n"
- "fmla v4.4s, v18.4s, v9.4s\n"
- "ldr s20, [x14]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "fmla v0.4s, v18.4s, v13.4s\n"
- "ldr s11, [%[wbptr], #28]\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "movi v30.16b, #0\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "fmla v0.4s, v22.4s, v15.4s\n"
- "ldr s13, [%[wbptr], #20]\n"
- "fmov v29.4s, #6.0\n"
- "fmax v8.4s, v8.4s, v30.4s\n"
- "fmla v3.4s, v24.4s, v9.4s\n"
- "fmax v7.4s, v7.4s, v30.4s\n"
- "fmla v0.4s, v23.4s, v10.4s\n"
- "ldr s15, [%[wbptr], #12]\n"
- "fmin v8.4s, v8.4s, v29.4s\n"
- "ldr s22, [x17, %[input_col_stride1]]\n"
- "fmin v7.4s, v7.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v30.4s\n"
- "str s8, [%[outptr0]]\n"
- "fmla v0.4s, v24.4s, v12.4s\n"
- "str s7, [%[outptr0], %[output_col_stride1]]\n"
- "fmin v6.4s, v6.4s, v29.4s\n"
- "fmax v5.4s, v5.4s, v30.4s\n"
- "ldr s10, [%[wbptr], #32]\n"
- "str s6, [%[outptr0], x27]\n"
- "fmla v0.4s, v21.4s, v9.4s\n"
- "fmin v5.4s, v5.4s, v29.4s\n"
- "ldr s12, [%[wbptr], #24]\n"
- "fmax v4.4s, v4.4s, v30.4s\n"
- "ldr s28, [%[inptr0], x9]\n"
- "str s5, [x25]\n"
- "fmax v3.4s, v3.4s, v30.4s\n"
- "fmin v4.4s, v4.4s, v29.4s\n"
- "ldr s9, [%[wbptr], #36]\n"
- "fmin v3.4s, v3.4s, v29.4s\n"
- "ldr s23, [x21]\n"
- "str s4, [x25, %[output_col_stride1]]\n"
- "fmax v2.4s, v2.4s, v30.4s\n"
- "str s3, [x25, x27]\n"
- "fmax v1.4s, v1.4s, v30.4s\n"
- "fmin v2.4s, v2.4s, v29.4s\n"
- "ldr s18, [x14, %[input_col_stride1]]\n"
- "fmin v1.4s, v1.4s, v29.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "str s2, [x13]\n"
- "fmax v0.4s, v0.4s, v30.4s\n"
- "str s1, [x13, %[output_col_stride1]]\n"
- "mov v8.16b, v19.16b\n"
- "fmin v0.4s, v0.4s, v29.4s\n"
- "add x25, x25, #4\n"
- "mov v5.16b, v19.16b\n"
- "mov v7.16b, v19.16b\n"
- "str s0, [x13, x27]\n"
- "mov v2.16b, v19.16b\n"
- "mov v4.16b, v19.16b\n"
- "add x13, x13, #4\n"
- "mov v6.16b, v19.16b\n"
- "mov v1.16b, v19.16b\n"
- "mov v3.16b, v19.16b\n"
- "mov v0.16b, v19.16b\n"
- "fmla v8.4s, v25.4s, v17.4s\n"
- "fmla v8.4s, v27.4s, v14.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v5.4s, v27.4s, v17.4s\n"
- "ldr s27, [x17, x9]\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "ldr s30, [%[inptr0], x15]\n"
- "fmla v7.4s, v26.4s, v17.4s\n"
- "ldr s31, [x26]\n"
- "fmla v5.4s, v20.4s, v14.4s\n"
- "ldr s24, [x21, %[input_col_stride1]]\n"
- "fmla v8.4s, v20.4s, v11.4s\n"
- "prfm pldl1keep, [x17, x22]\n"
- "fmla v2.4s, v20.4s, v17.4s\n"
- "ldr s29, [x14, x9]\n"
- "fmla v5.4s, v22.4s, v16.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x26, #64]\n"
- "fmla v7.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x21, x28]\n"
- "fmla v4.4s, v22.4s, v17.4s\n"
- "ldr s21, [x17, x15]\n"
- "fmla v8.4s, v28.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x22]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x16]\n"
- "fmla v6.4s, v28.4s, v17.4s\n"
- "ldr s19, [%[inptr0], x24]\n"
- "fmla v5.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "ldr s28, [x26, %[input_col_stride1]]\n"
- "fmla v8.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x26, x28]\n"
- "fmla v5.4s, v18.4s, v13.4s\n"
- "prfm pldl1keep, [x21, x22]\n"
- "fmla v7.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x14, x16]\n"
- "fmla v2.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x23]\n"
- "fmla v4.4s, v18.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x22]\n"
- "fmla v1.4s, v18.4s, v17.4s\n"
- "ldr s25, [x21, x9]\n"
- "fmla v8.4s, v27.4s, v12.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v5.4s, v27.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x23]\n"
- "fmla v7.4s, v27.4s, v13.4s\n"
- "prfm pldl1keep, [x26, x16]\n"
- "fmla v4.4s, v27.4s, v16.4s\n"
- "prfm pldl1keep, [x21, x23]\n"
- "fmla v6.4s, v27.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x23]\n"
- "fmla v3.4s, v27.4s, v17.4s\n"
- "ldr s27, [x14, x15]\n"
- "fmla v7.4s, v30.4s, v15.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v6.4s, v30.4s, v16.4s\n"
- "ldr s26, [x17, x24]\n"
- "fmla v2.4s, v31.4s, v11.4s\n"
- "ldr s20, [x26, x9]\n"
- "fmla v5.4s, v24.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v4.4s, v24.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v2.4s, v24.4s, v13.4s\n"
- "add x17, x17, #4\n"
- "fmla v1.4s, v24.4s, v14.4s\n"
- "ldr s18, [x21, x15]\n"
- "fmla v8.4s, v29.4s, v9.4s\n"
- "fmla v5.4s, v29.4s, v12.4s\n"
- "fmla v7.4s, v29.4s, v10.4s\n"
- "fmla v2.4s, v29.4s, v15.4s\n"
- "fmla v4.4s, v29.4s, v13.4s\n"
- "fmla v6.4s, v29.4s, v11.4s\n"
- "fmla v1.4s, v29.4s, v16.4s\n"
- "fmla v3.4s, v29.4s, v14.4s\n"
- "fmla v0.4s, v29.4s, v17.4s\n"
- "ldr s22, [x14, x24]\n"
- "fmla v7.4s, v21.4s, v12.4s\n"
- "ldr s23, [x26, x15]\n"
- "fmla v4.4s, v21.4s, v15.4s\n"
- "add x14, x14, #4\n"
- "fmla v6.4s, v21.4s, v13.4s\n"
- "fmla v3.4s, v21.4s, v16.4s\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "ldr s24, [x21, x24]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "ldr s21, [x26, x24]\n"
- "fmla v6.4s, v19.4s, v15.4s\n"
- "add x21, x21, #4\n"
- "fmla v5.4s, v25.4s, v9.4s\n"
- "add x26, x26, #4\n"
- "fmla v2.4s, v25.4s, v12.4s\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "fmla v1.4s, v25.4s, v13.4s\n"
- "fmla v3.4s, v25.4s, v11.4s\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "fmla v7.4s, v27.4s, v9.4s\n"
- "fmla v4.4s, v27.4s, v12.4s\n"
- "fmla v6.4s, v27.4s, v10.4s\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "fmla v3.4s, v27.4s, v13.4s\n"
- "fmla v0.4s, v27.4s, v16.4s\n"
- "fmla v2.4s, v20.4s, v9.4s\n"
- "fmla v6.4s, v26.4s, v12.4s\n"
- "fmla v4.4s, v18.4s, v9.4s\n"
- "fmla v3.4s, v26.4s, v15.4s\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "fmla v0.4s, v20.4s, v11.4s\n"
- "movi v30.16b, #0\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "fmov v29.4s, #6.0\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "fmla v0.4s, v18.4s, v13.4s\n"
- "fmax v8.4s, v8.4s, v30.4s\n"
- "fmax v7.4s, v7.4s, v30.4s\n"
- "fmax v6.4s, v6.4s, v30.4s\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "fmla v0.4s, v22.4s, v15.4s\n"
- "fmin v8.4s, v8.4s, v29.4s\n"
- "fmin v7.4s, v7.4s, v29.4s\n"
- "fmin v6.4s, v6.4s, v29.4s\n"
- "str s8, [%[outptr0]]\n"
- "fmla v3.4s, v24.4s, v9.4s\n"
- "str s7, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v23.4s, v10.4s\n"
- "str s6, [%[outptr0], x27]\n"
- "fmax v5.4s, v5.4s, v30.4s\n"
- "fmax v4.4s, v4.4s, v30.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v0.4s, v24.4s, v12.4s\n"
- "fmin v5.4s, v5.4s, v29.4s\n"
- "fmin v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v30.4s\n"
- "str s5, [x25]\n"
- "fmax v2.4s, v2.4s, v30.4s\n"
- "str s4, [x25, %[output_col_stride1]]\n"
- "fmla v0.4s, v21.4s, v9.4s\n"
- "fmin v3.4s, v3.4s, v29.4s\n"
- "fmin v2.4s, v2.4s, v29.4s\n"
- "fmax v1.4s, v1.4s, v30.4s\n"
- "str s3, [x25, x27]\n"
- "str s2, [x13]\n"
- "fmin v1.4s, v1.4s, v29.4s\n"
- "fmax v0.4s, v0.4s, v30.4s\n"
- "add x25, x25, #4\n"
- "str s1, [x13, %[output_col_stride1]]\n"
- "fmin v0.4s, v0.4s, v29.4s\n"
- "str s0, [x13, x27]\n"
- "add x13, x13, #4\n"
- "7:\n"
- : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
- : [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-#endif // __aarch64__
-
-template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
-
-} // namespace depthwise
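
For orientation: the hunk above ends with what appears to be the ReLU6 specialization of the stride-1 tile, in which each of the nine accumulators (v0-v8) is clamped with fmax against zero (movi v30.16b, #0) and fmin against six (fmov v29.4s, #6.0) before the stores. Below is a minimal scalar sketch of one channel of that computation; the function name, argument order, and single-channel weight layout (bias first, then the nine 3x3 weights, matching the "ldr s19, [%[wbptr]]" / "ldr s17, [%[wbptr], #4]" reads) are illustrative assumptions, not the library's API.

    #include <algorithm>

    // Hypothetical scalar reference for one channel of the deleted kernel:
    // a 3x3 depthwise convolution at stride 1 followed by the ReLU6 clamp
    // that the assembly performs with fmax/fmin. Strides are in elements.
    static void depthwise_3x3_s1_relu6_tile(
        const float *wb,                        // [bias, w00, w01, ..., w22]
        const float *in,                        // top-left input of the tile
        int in_row_stride, int in_col_stride,
        float *out,
        int out_row_stride, int out_col_stride,
        int tile_rows, int tile_cols)           // 3x3 for this kernel
    {
        for (int r = 0; r < tile_rows; r++)
        {
            for (int c = 0; c < tile_cols; c++)
            {
                float acc = wb[0]; // bias
                for (int kr = 0; kr < 3; kr++)
                    for (int kc = 0; kc < 3; kc++)
                        acc += wb[1 + kr * 3 + kc] *
                               in[(r + kr) * in_row_stride + (c + kc) * in_col_stride];
                // ReLU6: the fmax/fmin pair against v30 (zero) and v29 (six).
                out[r * out_row_stride + c * out_col_stride] =
                    std::min(std::max(acc, 0.0f), 6.0f);
            }
        }
    }

The assembly evaluates all nine of these output points at once, four channels per iteration, which is why v0-v8 are kept live as accumulators across each loop body.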
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
deleted file mode 100644
index b798b8cdbe..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ /dev/null
@@ -1,769 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
- int n_channels,
- const void* weight_bias_ptr,
- const float* input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float* output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x15, %[inptr0], %[input_row_stride]\n"
- "add x26, %[input_col_stride1], %[input_col_stride1]\n"
- "add x21, %[outptr0], %[output_row_stride]\n"
- "add x16, x15, %[input_row_stride]\n"
- "add x27, x26, %[input_col_stride1]\n"
- "add x22, x21, %[output_row_stride]\n"
- "add x17, x16, %[input_row_stride]\n"
- "add x28, x27, %[input_col_stride1]\n"
- "add x23, %[output_col_stride1], %[output_col_stride1]\n"
- "add x9, x17, %[input_row_stride]\n"
- "add x13, x28, %[input_col_stride1]\n"
- "and x24, %[n_channels], #3\n"
- "add x19, x9, %[input_row_stride]\n"
- "add x14, x13, %[input_col_stride1]\n"
- "lsr x25, %[n_channels], #2\n"
- "add x20, x19, %[input_row_stride]\n"
- "cbz x25, 4f\n"
- "1:\n"
- "ldr q27, [%[wbptr]]\n"
- "subs x25, x25, #1\n"
- "mov v17.16b, v27.16b\n"
- "ldr q6, [%[wbptr], #16]\n"
- "mov v16.16b, v27.16b\n"
- "ldr q14, [%[wbptr], #32]\n"
- "mov v15.16b, v27.16b\n"
- "ldr q13, [%[wbptr], #48]\n"
- "mov v2.16b, v27.16b\n"
- "ldr q12, [%[wbptr], #64]\n"
- "mov v4.16b, v27.16b\n"
- "ldr q11, [%[wbptr], #80]\n"
- "mov v5.16b, v27.16b\n"
- "ldr q10, [%[wbptr], #96]\n"
- "mov v1.16b, v27.16b\n"
- "ldr q9, [%[wbptr], #112]\n"
- "mov v3.16b, v27.16b\n"
- "ldr q8, [%[wbptr], #128]\n"
- "mov v0.16b, v27.16b\n"
- "ldr q7, [%[wbptr], #144]\n"
- "ldr q29, [%[inptr0]]\n"
- "ldr q28, [x15]\n"
- "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q22, [x16]\n"
- "ldr q20, [x15, %[input_col_stride1]]\n"
- "ldr q19, [%[inptr0], x26]\n"
- "ldr q30, [x17]\n"
- "ldr q18, [x16, %[input_col_stride1]]\n"
- "beq 3f\n"
- "2:\n"
- "fmla v17.4s, v29.4s, v6.4s\n"
- "ldr q21, [x15, x26]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "ldr q27, [%[inptr0], x27]\n"
- "fmla v15.4s, v19.4s, v6.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v17.4s, v28.4s, v12.4s\n"
- "ldr q25, [x9]\n"
- "fmla v16.4s, v30.4s, v12.4s\n"
- "ldr q24, [x17, %[input_col_stride1]]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v17.4s, v26.4s, v14.4s\n"
- "ldr q23, [x16, x26]\n"
- "fmla v16.4s, v18.4s, v14.4s\n"
- "subs x25, x25, #1\n"
- "fmla v15.4s, v27.4s, v14.4s\n"
- "ldr q26, [x15, x27]\n"
- "fmla v17.4s, v22.4s, v9.4s\n"
- "ldr q22, [%[inptr0], x28]\n"
- "fmla v16.4s, v25.4s, v9.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v23.4s, v9.4s\n"
- "ldr q30, [x19]\n"
- "fmla v17.4s, v20.4s, v11.4s\n"
- "ldr q29, [x9, %[input_col_stride1]]\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "ldr q28, [x17, x26]\n"
- "fmla v4.4s, v23.4s, v6.4s\n"
- "fmla v15.4s, v26.4s, v11.4s\n"
- "fmla v17.4s, v19.4s, v13.4s\n"
- "ldr q24, [x16, x27]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "ldr q25, [x15, x28]\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "fmla v5.4s, v22.4s, v6.4s\n"
- "fmla v17.4s, v18.4s, v8.4s\n"
- "ldr q19, [%[inptr0], x13]\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "ldr q18, [x20]\n"
- "fmla v16.4s, v29.4s, v8.4s\n"
- "ldr q22, [x19, %[input_col_stride1]]\n"
- "fmla v17.4s, v21.4s, v10.4s\n"
- "ldr q26, [x9, x26]\n"
- "fmla v2.4s, v29.4s, v14.4s\n"
- "ldr q20, [x17, x27]\n"
- "fmla v16.4s, v28.4s, v10.4s\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "fmla v17.4s, v23.4s, v7.4s\n"
- "ldr q27, [x16, x28]\n"
- "fmla v15.4s, v24.4s, v8.4s\n"
- "ldr q30, [x15, x13]\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "ldr q24, [%[inptr0], x14]\n"
- "str q17, [%[outptr0]]\n"
- "fmla v5.4s, v25.4s, v12.4s\n"
- "fmla v15.4s, v25.4s, v10.4s\n"
- "ldr q28, [x20, %[input_col_stride1]]\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "ldr q17, [x19, x26]\n"
- "fmla v5.4s, v19.4s, v14.4s\n"
- "ldr q18, [x9, x27]\n"
- "fmla v16.4s, v26.4s, v7.4s\n"
- "ldr q25, [x17, x28]\n"
- "fmla v2.4s, v22.4s, v11.4s\n"
- "ldr q22, [x16, x13]\n"
- "fmla v4.4s, v26.4s, v9.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "str q16, [x21]\n"
- "fmla v1.4s, v26.4s, v6.4s\n"
- "fmla v2.4s, v26.4s, v13.4s\n"
- "ldr q21, [x15, x14]\n"
- "fmla v4.4s, v20.4s, v11.4s\n"
- "ldr q23, [x20, x26]\n"
- "fmla v15.4s, v27.4s, v7.4s\n"
- "ldr q19, [x19, x27]\n"
- "fmla v5.4s, v27.4s, v9.4s\n"
- "add x15, x15, #16\n"
- "fmla v4.4s, v27.4s, v13.4s\n"
- "fmla v3.4s, v27.4s, v6.4s\n"
- "str q15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v2.4s, v28.4s, v8.4s\n"
- "fmla v5.4s, v30.4s, v11.4s\n"
- "ldr q29, [x9, x28]\n"
- "fmla v1.4s, v17.4s, v12.4s\n"
- "ldr q27, [x17, x13]\n"
- "fmla v2.4s, v17.4s, v10.4s\n"
- "ldr q28, [x16, x14]\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "ldr q26, [x20, x27]\n"
- "fmla v4.4s, v18.4s, v8.4s\n"
- "ldr q20, [x19, x28]\n"
- "fmla v1.4s, v18.4s, v14.4s\n"
- "ldr q17, [x9, x13]\n"
- "fmla v3.4s, v25.4s, v12.4s\n"
- "ldr q18, [x17, x14]\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "ldr q16, [x20, x28]\n"
- "fmla v5.4s, v22.4s, v8.4s\n"
- "add x16, x16, #16\n"
- "fmla v3.4s, v22.4s, v14.4s\n"
- "ldr q15, [x19, x13]\n"
- "fmla v2.4s, v23.4s, v7.4s\n"
- "add x17, x17, #16\n"
- "fmla v5.4s, v21.4s, v10.4s\n"
- "ldr q21, [x9, x14]\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "ldr q23, [x20, x13]\n"
- "str q2, [x22]\n"
- "fmla v4.4s, v29.4s, v7.4s\n"
- "fmla v3.4s, v29.4s, v9.4s\n"
- "ldr q24, [x19, x14]\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "ldr q25, [x20, x14]\n"
- "str q4, [x21, %[output_col_stride1]]\n"
- "fmla v0.4s, v29.4s, v6.4s\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "ldr q27, [%[wbptr]]\n"
- "fmla v1.4s, v29.4s, v13.4s\n"
- "ldr q29, [%[inptr0]]\n"
- "fmla v5.4s, v28.4s, v7.4s\n"
- "ldr q6, [%[wbptr], #16]\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "ldr q28, [x15]\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
- "str q5, [%[outptr0], x23]\n"
- "fmla v0.4s, v20.4s, v12.4s\n"
- "fmla v3.4s, v17.4s, v8.4s\n"
- "ldr q22, [x16]\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "ldr q20, [x15, %[input_col_stride1]]\n"
- "fmla v0.4s, v17.4s, v14.4s\n"
- "ldr q12, [%[wbptr], #64]\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "ldr q19, [%[inptr0], x26]\n"
- "fmla v1.4s, v16.4s, v7.4s\n"
- "ldr q30, [x17]\n"
- "fmla v0.4s, v16.4s, v9.4s\n"
- "ldr q14, [%[wbptr], #32]\n"
- "fmla v3.4s, v21.4s, v7.4s\n"
- "ldr q18, [x16, %[input_col_stride1]]\n"
- "str q1, [x22, %[output_col_stride1]]\n"
- "mov v17.16b, v27.16b\n"
- "fmla v0.4s, v15.4s, v11.4s\n"
- "ldr q9, [%[wbptr], #112]\n"
- "str q3, [x21, x23]\n"
- "mov v16.16b, v27.16b\n"
- "mov v15.16b, v27.16b\n"
- "add x9, x9, #16\n"
- "fmla v0.4s, v21.4s, v13.4s\n"
- "ldr q11, [%[wbptr], #80]\n"
- "mov v2.16b, v27.16b\n"
- "add x19, x19, #16\n"
- "mov v4.16b, v27.16b\n"
- "add x20, x20, #16\n"
- "fmla v0.4s, v23.4s, v8.4s\n"
- "ldr q13, [%[wbptr], #48]\n"
- "mov v5.16b, v27.16b\n"
- "add %[outptr0], %[outptr0], #16\n"
- "mov v1.16b, v27.16b\n"
- "add x21, x21, #16\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "ldr q8, [%[wbptr], #128]\n"
- "mov v3.16b, v27.16b\n"
- "fmla v0.4s, v25.4s, v7.4s\n"
- "ldr q10, [%[wbptr], #96]\n"
- "str q0, [x22, x23]\n"
- "mov v0.16b, v27.16b\n"
- "ldr q7, [%[wbptr], #144]\n"
- "add x22, x22, #16\n"
- "bne 2b\n"
- "3:\n"
- "fmla v17.4s, v29.4s, v6.4s\n"
- "ldr q21, [x15, x26]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "ldr q27, [%[inptr0], x27]\n"
- "fmla v15.4s, v19.4s, v6.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v17.4s, v28.4s, v12.4s\n"
- "ldr q25, [x9]\n"
- "fmla v16.4s, v30.4s, v12.4s\n"
- "ldr q24, [x17, %[input_col_stride1]]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v17.4s, v26.4s, v14.4s\n"
- "ldr q23, [x16, x26]\n"
- "fmla v16.4s, v18.4s, v14.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v27.4s, v14.4s\n"
- "ldr q26, [x15, x27]\n"
- "fmla v17.4s, v22.4s, v9.4s\n"
- "ldr q22, [%[inptr0], x28]\n"
- "fmla v16.4s, v25.4s, v9.4s\n"
- "ldr q30, [x19]\n"
- "fmla v15.4s, v23.4s, v9.4s\n"
- "fmla v4.4s, v23.4s, v6.4s\n"
- "fmla v17.4s, v20.4s, v11.4s\n"
- "ldr q29, [x9, %[input_col_stride1]]\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "ldr q28, [x17, x26]\n"
- "fmla v15.4s, v26.4s, v11.4s\n"
- "ldr q24, [x16, x27]\n"
- "fmla v17.4s, v19.4s, v13.4s\n"
- "ldr q25, [x15, x28]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "fmla v5.4s, v22.4s, v6.4s\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "ldr q19, [%[inptr0], x13]\n"
- "fmla v17.4s, v18.4s, v8.4s\n"
- "ldr q18, [x20]\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "ldr q22, [x19, %[input_col_stride1]]\n"
- "fmla v16.4s, v29.4s, v8.4s\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "fmla v17.4s, v21.4s, v10.4s\n"
- "ldr q26, [x9, x26]\n"
- "fmla v2.4s, v29.4s, v14.4s\n"
- "ldr q20, [x17, x27]\n"
- "fmla v16.4s, v28.4s, v10.4s\n"
- "ldr q27, [x16, x28]\n"
- "fmla v17.4s, v23.4s, v7.4s\n"
- "ldr q30, [x15, x13]\n"
- "fmla v15.4s, v24.4s, v8.4s\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "fmla v5.4s, v25.4s, v12.4s\n"
- "ldr q24, [%[inptr0], x14]\n"
- "str q17, [%[outptr0]]\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "fmla v15.4s, v25.4s, v10.4s\n"
- "ldr q28, [x20, %[input_col_stride1]]\n"
- "fmla v5.4s, v19.4s, v14.4s\n"
- "ldr q17, [x19, x26]\n"
- "fmla v2.4s, v22.4s, v11.4s\n"
- "ldr q18, [x9, x27]\n"
- "fmla v16.4s, v26.4s, v7.4s\n"
- "ldr q25, [x17, x28]\n"
- "fmla v4.4s, v26.4s, v9.4s\n"
- "ldr q22, [x16, x13]\n"
- "fmla v2.4s, v26.4s, v13.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "str q16, [x21]\n"
- "fmla v1.4s, v26.4s, v6.4s\n"
- "fmla v4.4s, v20.4s, v11.4s\n"
- "ldr q21, [x15, x14]\n"
- "fmla v15.4s, v27.4s, v7.4s\n"
- "ldr q23, [x20, x26]\n"
- "fmla v5.4s, v27.4s, v9.4s\n"
- "ldr q19, [x19, x27]\n"
- "fmla v4.4s, v27.4s, v13.4s\n"
- "add x15, x15, #16\n"
- "str q15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v3.4s, v27.4s, v6.4s\n"
- "fmla v5.4s, v30.4s, v11.4s\n"
- "ldr q29, [x9, x28]\n"
- "fmla v2.4s, v28.4s, v8.4s\n"
- "ldr q27, [x17, x13]\n"
- "fmla v1.4s, v17.4s, v12.4s\n"
- "ldr q28, [x16, x14]\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "ldr q26, [x20, x27]\n"
- "fmla v2.4s, v17.4s, v10.4s\n"
- "ldr q20, [x19, x28]\n"
- "fmla v4.4s, v18.4s, v8.4s\n"
- "ldr q17, [x9, x13]\n"
- "fmla v1.4s, v18.4s, v14.4s\n"
- "ldr q18, [x17, x14]\n"
- "fmla v3.4s, v25.4s, v12.4s\n"
- "add x16, x16, #16\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "ldr q16, [x20, x28]\n"
- "fmla v5.4s, v22.4s, v8.4s\n"
- "add x17, x17, #16\n"
- "fmla v3.4s, v22.4s, v14.4s\n"
- "ldr q15, [x19, x13]\n"
- "fmla v2.4s, v23.4s, v7.4s\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "fmla v5.4s, v21.4s, v10.4s\n"
- "ldr q21, [x9, x14]\n"
- "fmla v4.4s, v29.4s, v7.4s\n"
- "ldr q23, [x20, x13]\n"
- "str q2, [x22]\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "fmla v3.4s, v29.4s, v9.4s\n"
- "ldr q24, [x19, x14]\n"
- "str q4, [x21, %[output_col_stride1]]\n"
- "fmla v0.4s, v29.4s, v6.4s\n"
- "fmla v1.4s, v29.4s, v13.4s\n"
- "ldr q25, [x20, x14]\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "add x9, x9, #16\n"
- "fmla v5.4s, v28.4s, v7.4s\n"
- "add x19, x19, #16\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "add x20, x20, #16\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "fmla v0.4s, v20.4s, v12.4s\n"
- "str q5, [%[outptr0], x23]\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "fmla v3.4s, v17.4s, v8.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v0.4s, v17.4s, v14.4s\n"
- "fmla v1.4s, v16.4s, v7.4s\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "fmla v0.4s, v16.4s, v9.4s\n"
- "str q1, [x22, %[output_col_stride1]]\n"
- "fmla v3.4s, v21.4s, v7.4s\n"
- "fmla v0.4s, v15.4s, v11.4s\n"
- "str q3, [x21, x23]\n"
- "fmla v0.4s, v21.4s, v13.4s\n"
- "add x21, x21, #16\n"
- "fmla v0.4s, v23.4s, v8.4s\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "fmla v0.4s, v25.4s, v7.4s\n"
- "str q0, [x22, x23]\n"
- "add x22, x22, #16\n"
- "4:\n"
- "cbz x24, 7f\n"
- "ldr s27, [%[wbptr]]\n"
- "mov v17.16b, v27.16b\n"
- "ldr s6, [%[wbptr], #4]\n"
- "mov v16.16b, v27.16b\n"
- "ldr s14, [%[wbptr], #8]\n"
- "mov v15.16b, v27.16b\n"
- "ldr s13, [%[wbptr], #12]\n"
- "mov v2.16b, v27.16b\n"
- "ldr s12, [%[wbptr], #16]\n"
- "mov v4.16b, v27.16b\n"
- "ldr s11, [%[wbptr], #20]\n"
- "mov v5.16b, v27.16b\n"
- "ldr s10, [%[wbptr], #24]\n"
- "mov v1.16b, v27.16b\n"
- "ldr s9, [%[wbptr], #28]\n"
- "mov v3.16b, v27.16b\n"
- "ldr s8, [%[wbptr], #32]\n"
- "mov v0.16b, v27.16b\n"
- "ldr s7, [%[wbptr], #36]\n"
- "ldr s29, [%[inptr0]]\n"
- "subs x24, x24, #1\n"
- "ldr s28, [x15]\n"
- "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
- "ldr s22, [x16]\n"
- "ldr s20, [x15, %[input_col_stride1]]\n"
- "ldr s19, [%[inptr0], x26]\n"
- "ldr s30, [x17]\n"
- "ldr s18, [x16, %[input_col_stride1]]\n"
- "beq 6f\n"
- "5:\n"
- "fmla v17.4s, v29.4s, v6.4s\n"
- "ldr s21, [x15, x26]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "ldr s27, [%[inptr0], x27]\n"
- "fmla v15.4s, v19.4s, v6.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v17.4s, v28.4s, v12.4s\n"
- "ldr s25, [x9]\n"
- "fmla v16.4s, v30.4s, v12.4s\n"
- "ldr s24, [x17, %[input_col_stride1]]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v17.4s, v26.4s, v14.4s\n"
- "ldr s23, [x16, x26]\n"
- "fmla v16.4s, v18.4s, v14.4s\n"
- "subs x24, x24, #1\n"
- "fmla v15.4s, v27.4s, v14.4s\n"
- "ldr s26, [x15, x27]\n"
- "fmla v17.4s, v22.4s, v9.4s\n"
- "ldr s22, [%[inptr0], x28]\n"
- "fmla v16.4s, v25.4s, v9.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v23.4s, v9.4s\n"
- "ldr s30, [x19]\n"
- "fmla v17.4s, v20.4s, v11.4s\n"
- "ldr s29, [x9, %[input_col_stride1]]\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "ldr s28, [x17, x26]\n"
- "fmla v4.4s, v23.4s, v6.4s\n"
- "fmla v15.4s, v26.4s, v11.4s\n"
- "fmla v17.4s, v19.4s, v13.4s\n"
- "ldr s24, [x16, x27]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "ldr s25, [x15, x28]\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "fmla v5.4s, v22.4s, v6.4s\n"
- "fmla v17.4s, v18.4s, v8.4s\n"
- "ldr s19, [%[inptr0], x13]\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "ldr s18, [x20]\n"
- "fmla v16.4s, v29.4s, v8.4s\n"
- "ldr s22, [x19, %[input_col_stride1]]\n"
- "fmla v17.4s, v21.4s, v10.4s\n"
- "ldr s26, [x9, x26]\n"
- "fmla v2.4s, v29.4s, v14.4s\n"
- "ldr s20, [x17, x27]\n"
- "fmla v16.4s, v28.4s, v10.4s\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "fmla v17.4s, v23.4s, v7.4s\n"
- "ldr s27, [x16, x28]\n"
- "fmla v15.4s, v24.4s, v8.4s\n"
- "ldr s30, [x15, x13]\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "ldr s24, [%[inptr0], x14]\n"
- "str s17, [%[outptr0]]\n"
- "fmla v5.4s, v25.4s, v12.4s\n"
- "fmla v15.4s, v25.4s, v10.4s\n"
- "ldr s28, [x20, %[input_col_stride1]]\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "ldr s17, [x19, x26]\n"
- "fmla v5.4s, v19.4s, v14.4s\n"
- "ldr s18, [x9, x27]\n"
- "fmla v16.4s, v26.4s, v7.4s\n"
- "ldr s25, [x17, x28]\n"
- "fmla v2.4s, v22.4s, v11.4s\n"
- "ldr s22, [x16, x13]\n"
- "fmla v4.4s, v26.4s, v9.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "str s16, [x21]\n"
- "fmla v1.4s, v26.4s, v6.4s\n"
- "fmla v2.4s, v26.4s, v13.4s\n"
- "ldr s21, [x15, x14]\n"
- "fmla v4.4s, v20.4s, v11.4s\n"
- "ldr s23, [x20, x26]\n"
- "fmla v15.4s, v27.4s, v7.4s\n"
- "ldr s19, [x19, x27]\n"
- "fmla v5.4s, v27.4s, v9.4s\n"
- "add x15, x15, #4\n"
- "fmla v4.4s, v27.4s, v13.4s\n"
- "fmla v3.4s, v27.4s, v6.4s\n"
- "str s15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v2.4s, v28.4s, v8.4s\n"
- "fmla v5.4s, v30.4s, v11.4s\n"
- "ldr s29, [x9, x28]\n"
- "fmla v1.4s, v17.4s, v12.4s\n"
- "ldr s27, [x17, x13]\n"
- "fmla v2.4s, v17.4s, v10.4s\n"
- "ldr s28, [x16, x14]\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "ldr s26, [x20, x27]\n"
- "fmla v4.4s, v18.4s, v8.4s\n"
- "ldr s20, [x19, x28]\n"
- "fmla v1.4s, v18.4s, v14.4s\n"
- "ldr s17, [x9, x13]\n"
- "fmla v3.4s, v25.4s, v12.4s\n"
- "ldr s18, [x17, x14]\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "ldr s16, [x20, x28]\n"
- "fmla v5.4s, v22.4s, v8.4s\n"
- "add x16, x16, #4\n"
- "fmla v3.4s, v22.4s, v14.4s\n"
- "ldr s15, [x19, x13]\n"
- "fmla v2.4s, v23.4s, v7.4s\n"
- "add x17, x17, #4\n"
- "fmla v5.4s, v21.4s, v10.4s\n"
- "ldr s21, [x9, x14]\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "ldr s23, [x20, x13]\n"
- "str s2, [x22]\n"
- "fmla v4.4s, v29.4s, v7.4s\n"
- "fmla v3.4s, v29.4s, v9.4s\n"
- "ldr s24, [x19, x14]\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "ldr s25, [x20, x14]\n"
- "str s4, [x21, %[output_col_stride1]]\n"
- "fmla v0.4s, v29.4s, v6.4s\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "ldr s27, [%[wbptr]]\n"
- "fmla v1.4s, v29.4s, v13.4s\n"
- "ldr s29, [%[inptr0]]\n"
- "fmla v5.4s, v28.4s, v7.4s\n"
- "ldr s6, [%[wbptr], #4]\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "ldr s28, [x15]\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
- "str s5, [%[outptr0], x23]\n"
- "fmla v0.4s, v20.4s, v12.4s\n"
- "fmla v3.4s, v17.4s, v8.4s\n"
- "ldr s22, [x16]\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "ldr s20, [x15, %[input_col_stride1]]\n"
- "fmla v0.4s, v17.4s, v14.4s\n"
- "ldr s12, [%[wbptr], #16]\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "ldr s19, [%[inptr0], x26]\n"
- "fmla v1.4s, v16.4s, v7.4s\n"
- "ldr s30, [x17]\n"
- "fmla v0.4s, v16.4s, v9.4s\n"
- "ldr s14, [%[wbptr], #8]\n"
- "fmla v3.4s, v21.4s, v7.4s\n"
- "ldr s18, [x16, %[input_col_stride1]]\n"
- "str s1, [x22, %[output_col_stride1]]\n"
- "mov v17.16b, v27.16b\n"
- "fmla v0.4s, v15.4s, v11.4s\n"
- "ldr s9, [%[wbptr], #28]\n"
- "str s3, [x21, x23]\n"
- "mov v16.16b, v27.16b\n"
- "mov v15.16b, v27.16b\n"
- "add x9, x9, #4\n"
- "fmla v0.4s, v21.4s, v13.4s\n"
- "ldr s11, [%[wbptr], #20]\n"
- "mov v2.16b, v27.16b\n"
- "add x19, x19, #4\n"
- "mov v4.16b, v27.16b\n"
- "add x20, x20, #4\n"
- "fmla v0.4s, v23.4s, v8.4s\n"
- "ldr s13, [%[wbptr], #12]\n"
- "mov v5.16b, v27.16b\n"
- "add %[outptr0], %[outptr0], #4\n"
- "mov v1.16b, v27.16b\n"
- "add x21, x21, #4\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "ldr s8, [%[wbptr], #32]\n"
- "mov v3.16b, v27.16b\n"
- "fmla v0.4s, v25.4s, v7.4s\n"
- "ldr s10, [%[wbptr], #24]\n"
- "str s0, [x22, x23]\n"
- "mov v0.16b, v27.16b\n"
- "ldr s7, [%[wbptr], #36]\n"
- "add x22, x22, #4\n"
- "bne 5b\n"
- "6:\n"
- "fmla v17.4s, v29.4s, v6.4s\n"
- "ldr s21, [x15, x26]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "ldr s27, [%[inptr0], x27]\n"
- "fmla v15.4s, v19.4s, v6.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v17.4s, v28.4s, v12.4s\n"
- "ldr s25, [x9]\n"
- "fmla v16.4s, v30.4s, v12.4s\n"
- "ldr s24, [x17, %[input_col_stride1]]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v17.4s, v26.4s, v14.4s\n"
- "ldr s23, [x16, x26]\n"
- "fmla v16.4s, v18.4s, v14.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v27.4s, v14.4s\n"
- "ldr s26, [x15, x27]\n"
- "fmla v17.4s, v22.4s, v9.4s\n"
- "ldr s22, [%[inptr0], x28]\n"
- "fmla v16.4s, v25.4s, v9.4s\n"
- "ldr s30, [x19]\n"
- "fmla v15.4s, v23.4s, v9.4s\n"
- "fmla v4.4s, v23.4s, v6.4s\n"
- "fmla v17.4s, v20.4s, v11.4s\n"
- "ldr s29, [x9, %[input_col_stride1]]\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "ldr s28, [x17, x26]\n"
- "fmla v15.4s, v26.4s, v11.4s\n"
- "ldr s24, [x16, x27]\n"
- "fmla v17.4s, v19.4s, v13.4s\n"
- "ldr s25, [x15, x28]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "fmla v5.4s, v22.4s, v6.4s\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "ldr s19, [%[inptr0], x13]\n"
- "fmla v17.4s, v18.4s, v8.4s\n"
- "ldr s18, [x20]\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "ldr s22, [x19, %[input_col_stride1]]\n"
- "fmla v16.4s, v29.4s, v8.4s\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "fmla v17.4s, v21.4s, v10.4s\n"
- "ldr s26, [x9, x26]\n"
- "fmla v2.4s, v29.4s, v14.4s\n"
- "ldr s20, [x17, x27]\n"
- "fmla v16.4s, v28.4s, v10.4s\n"
- "ldr s27, [x16, x28]\n"
- "fmla v17.4s, v23.4s, v7.4s\n"
- "ldr s30, [x15, x13]\n"
- "fmla v15.4s, v24.4s, v8.4s\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "fmla v5.4s, v25.4s, v12.4s\n"
- "ldr s24, [%[inptr0], x14]\n"
- "str s17, [%[outptr0]]\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "fmla v15.4s, v25.4s, v10.4s\n"
- "ldr s28, [x20, %[input_col_stride1]]\n"
- "fmla v5.4s, v19.4s, v14.4s\n"
- "ldr s17, [x19, x26]\n"
- "fmla v2.4s, v22.4s, v11.4s\n"
- "ldr s18, [x9, x27]\n"
- "fmla v16.4s, v26.4s, v7.4s\n"
- "ldr s25, [x17, x28]\n"
- "fmla v4.4s, v26.4s, v9.4s\n"
- "ldr s22, [x16, x13]\n"
- "fmla v2.4s, v26.4s, v13.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "str s16, [x21]\n"
- "fmla v1.4s, v26.4s, v6.4s\n"
- "fmla v4.4s, v20.4s, v11.4s\n"
- "ldr s21, [x15, x14]\n"
- "fmla v15.4s, v27.4s, v7.4s\n"
- "ldr s23, [x20, x26]\n"
- "fmla v5.4s, v27.4s, v9.4s\n"
- "ldr s19, [x19, x27]\n"
- "fmla v4.4s, v27.4s, v13.4s\n"
- "add x15, x15, #4\n"
- "str s15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v3.4s, v27.4s, v6.4s\n"
- "fmla v5.4s, v30.4s, v11.4s\n"
- "ldr s29, [x9, x28]\n"
- "fmla v2.4s, v28.4s, v8.4s\n"
- "ldr s27, [x17, x13]\n"
- "fmla v1.4s, v17.4s, v12.4s\n"
- "ldr s28, [x16, x14]\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "ldr s26, [x20, x27]\n"
- "fmla v2.4s, v17.4s, v10.4s\n"
- "ldr s20, [x19, x28]\n"
- "fmla v4.4s, v18.4s, v8.4s\n"
- "ldr s17, [x9, x13]\n"
- "fmla v1.4s, v18.4s, v14.4s\n"
- "ldr s18, [x17, x14]\n"
- "fmla v3.4s, v25.4s, v12.4s\n"
- "add x16, x16, #4\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "ldr s16, [x20, x28]\n"
- "fmla v5.4s, v22.4s, v8.4s\n"
- "add x17, x17, #4\n"
- "fmla v3.4s, v22.4s, v14.4s\n"
- "ldr s15, [x19, x13]\n"
- "fmla v2.4s, v23.4s, v7.4s\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "fmla v5.4s, v21.4s, v10.4s\n"
- "ldr s21, [x9, x14]\n"
- "fmla v4.4s, v29.4s, v7.4s\n"
- "ldr s23, [x20, x13]\n"
- "str s2, [x22]\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "fmla v3.4s, v29.4s, v9.4s\n"
- "ldr s24, [x19, x14]\n"
- "str s4, [x21, %[output_col_stride1]]\n"
- "fmla v0.4s, v29.4s, v6.4s\n"
- "fmla v1.4s, v29.4s, v13.4s\n"
- "ldr s25, [x20, x14]\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "add x9, x9, #4\n"
- "fmla v5.4s, v28.4s, v7.4s\n"
- "add x19, x19, #4\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "add x20, x20, #4\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "fmla v0.4s, v20.4s, v12.4s\n"
- "str s5, [%[outptr0], x23]\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "fmla v3.4s, v17.4s, v8.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v0.4s, v17.4s, v14.4s\n"
- "fmla v1.4s, v16.4s, v7.4s\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "fmla v0.4s, v16.4s, v9.4s\n"
- "str s1, [x22, %[output_col_stride1]]\n"
- "fmla v3.4s, v21.4s, v7.4s\n"
- "fmla v0.4s, v15.4s, v11.4s\n"
- "str s3, [x21, x23]\n"
- "fmla v0.4s, v21.4s, v13.4s\n"
- "add x21, x21, #4\n"
- "fmla v0.4s, v23.4s, v8.4s\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "fmla v0.4s, v25.4s, v7.4s\n"
- "str s0, [x22, x23]\n"
- "add x22, x22, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr), [inptr0] "+r" (input), [outptr0] "+r" (output)
- : [n_channels] "r" ((long long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x13", "x14", "memory"
- );
-}
-#endif // __aarch64__
-
-template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
-
-} // namespace depthwise
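
The stride-2 variant deleted above follows the same skeleton as its stride-1 sibling: the prologue splits n_channels into a four-wide vector loop ("lsr x25, %[n_channels], #2", labels 1:-3:, q-register loads) and a scalar tail ("and x24, %[n_channels], #3", labels 4:-6:, s-register loads), with "cbz ..., 4f" and "cbz ..., 7f" skipping either half when it is empty. A sketch of that partitioning follows, shown on a deliberately simpler per-channel multiply-accumulate; the routine and its arguments are illustrative, and only standard ACLE NEON intrinsics are assumed.

    #include <arm_neon.h>

    // Hypothetical sketch of the channel partitioning used by the deleted
    // kernels: a 4-lane fp32 vector body plus a scalar remainder, mirroring
    // the lsr/and prologue and the two numbered loop nests in the assembly.
    static void scale_accumulate_channels(const float *src, const float *weights,
                                          float *dst, int n_channels)
    {
        int c = 0;
        for (; c + 4 <= n_channels; c += 4) // vector body: n_channels >> 2 steps
        {
            float32x4_t acc = vld1q_f32(dst + c);
            acc = vmlaq_f32(acc, vld1q_f32(src + c), vld1q_f32(weights + c)); // fmla
            vst1q_f32(dst + c, acc);
        }
        for (; c < n_channels; c++) // scalar tail: n_channels & 3 steps
        {
            dst[c] += src[c] * weights[c]; // same arithmetic, one lane at a time
        }
    }

Keeping the tail inside the same routine is what lets the assembly fall straight through from label 4: to 7: when n_channels is a multiple of four, at the cost of duplicating the entire unrolled body for the s-register case.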
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
deleted file mode 100644
index 89d1f2238b..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ /dev/null
@@ -1,6018 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x8, %[inptr0], %[input_row_stride]\n"
- "add x15, %[input_col_stride1], %[input_col_stride1]\n"
- "add x23, %[outptr0], %[output_row_stride]\n"
- "add x9, x8, %[input_row_stride]\n"
- "add x16, x15, #64\n"
- "add x17, x15, %[input_col_stride1]\n"
- "add x10, x9, %[input_row_stride]\n"
- "add x7, x17, #64\n"
- "add x19, x17, %[input_col_stride1]\n"
- "add x11, x10, %[input_row_stride]\n"
- "add x20, x19, #64\n"
- "add x21, x19, %[input_col_stride1]\n"
- "add x12, x11, %[input_row_stride]\n"
- "add x22, x21, #64\n"
- "add x24, x23, %[output_row_stride]\n"
- "add x25, x24, %[output_row_stride]\n"
- "add x26, %[output_col_stride1], %[output_col_stride1]\n"
- "and x13, %[n_channels], #3\n"
- "add x27, x26, %[output_col_stride1]\n"
- "lsr x14, %[n_channels], #2\n"
- "cbz x14, 4f\n"
- "1:\n"
- "ldr q14, [%[wbptr]]\n"
- "subs x14, x14, #1\n"
- "mov v17.16b, v14.16b\n"
- "ldr q12, [%[wbptr], #16]\n"
- "mov v23.16b, v14.16b\n"
- "ldr q11, [%[wbptr], #32]\n"
- "mov v24.16b, v14.16b\n"
- "ldr q10, [%[wbptr], #48]\n"
- "mov v20.16b, v14.16b\n"
- "ldr q9, [%[wbptr], #64]\n"
- "mov v16.16b, v14.16b\n"
- "ldr q8, [%[wbptr], #80]\n"
- "mov v13.16b, v14.16b\n"
- "ldr q7, [%[wbptr], #96]\n"
- "mov v0.16b, v14.16b\n"
- "ldr q6, [%[wbptr], #112]\n"
- "mov v1.16b, v14.16b\n"
- "ldr q5, [%[wbptr], #128]\n"
- "mov v2.16b, v14.16b\n"
- "ldr q4, [%[wbptr], #144]\n"
- "mov v3.16b, v14.16b\n"
- "ldr q29, [%[inptr0]]\n"
- "fmla v17.4s, v29.4s, v12.4s\n"
- "ldr q28, [x8]\n"
- "ldr q30, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q25, [x9]\n"
- "ldr q26, [x8, %[input_col_stride1]]\n"
- "ldr q27, [%[inptr0], x15]\n"
- "ldr q15, [x10]\n"
- "ldr q18, [x9, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x8, #64]\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "prfm pldl1keep, [x9, #64]\n"
- "prfm pldl1keep, [x8, x28]\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "prfm pldl1keep, [x10, #64]\n"
- "prfm pldl1keep, [x9, x28]\n"
- "beq 3f\n"
- "2:\n"
- "fmla v17.4s, v28.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x16]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr q22, [x8, x15]\n"
- "fmla v24.4s, v30.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "ldr q29, [%[inptr0], x17]\n"
- "fmla v23.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x11, #64]\n"
- "fmla v20.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x10, x28]\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "ldr q25, [x11]\n"
- "fmla v23.4s, v26.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x7]\n"
- "fmla v17.4s, v26.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v16.4s, v26.4s, v12.4s\n"
- "ldr q28, [x10, %[input_col_stride1]]\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "prfm pldl1keep, [x12, #64]\n"
- "fmla v17.4s, v27.4s, v10.4s\n"
- "prfm pldl1keep, [x11, x28]\n"
- "fmla v13.4s, v27.4s, v12.4s\n"
- "ldr q19, [x9, x15]\n"
- "fmla v23.4s, v15.4s, v6.4s\n"
- "prfm pldl1keep, [x10, x16]\n"
- "fmla v20.4s, v15.4s, v9.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v0.4s, v15.4s, v12.4s\n"
- "ldr q21, [x8, x17]\n"
- "fmla v17.4s, v18.4s, v5.4s\n"
- "prfm pldl1keep, [x8, x20]\n"
- "fmla v23.4s, v18.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "fmla v24.4s, v18.4s, v6.4s\n"
- "prfm pldl1keep, [x12, x28]\n"
- "fmla v20.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x16]\n"
- "fmla v16.4s, v18.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x7]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "ldr q27, [%[inptr0], x19]\n"
- "fmla v17.4s, v22.4s, v7.4s\n"
- "prfm pldl1keep, [x9, x20]\n"
- "fmla v23.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x8, x22]\n"
- "fmla v24.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x16]\n"
- "fmla v16.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x7]\n"
- "fmla v13.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x20]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "ldr q18, [x12]\n"
- "fmla v24.4s, v29.4s, v10.4s\n"
- "prfm pldl1keep, [x9, x22]\n"
- "fmla v13.4s, v29.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x7]\n"
- "fmla v3.4s, v29.4s, v12.4s\n"
- "ldr q22, [x11, %[input_col_stride1]]\n"
- "fmla v20.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x20]\n"
- "fmla v0.4s, v25.4s, v9.4s\n"
- "ldr q25, [x10, x15]\n"
- "fmla v23.4s, v28.4s, v5.4s\n"
- "prfm pldl1keep, [x10, x22]\n"
- "fmla v20.4s, v28.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x20]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x22]\n"
- "fmla v0.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x22]\n"
- "fmla v1.4s, v28.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v17.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v23.4s, v19.4s, v7.4s\n"
- "subs x14, x14, #1\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v20.4s, v19.4s, v10.4s\n"
- "str q17, [%[outptr0]]\n"
- "mov v15.16b, v14.16b\n"
- "fmla v16.4s, v19.4s, v8.4s\n"
- "fmla v13.4s, v19.4s, v6.4s\n"
- "fmla v15.4s, v28.4s, v12.4s\n"
- "ldr q29, [x9, x17]\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "fmla v2.4s, v19.4s, v9.4s\n"
- "fmla v24.4s, v21.4s, v7.4s\n"
- "fmla v16.4s, v21.4s, v10.4s\n"
- "fmla v13.4s, v21.4s, v8.4s\n"
- "fmla v3.4s, v21.4s, v9.4s\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "mov v18.16b, v14.16b\n"
- "fmla v20.4s, v22.4s, v5.4s\n"
- "fmla v13.4s, v27.4s, v10.4s\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "mov v17.16b, v14.16b\n"
- "fmla v18.4s, v19.4s, v12.4s\n"
- "mov v19.16b, v14.16b\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "fmla v17.4s, v21.4s, v12.4s\n"
- "ldr q26, [x8, x19]\n"
- "fmla v1.4s, v22.4s, v6.4s\n"
- "fmla v15.4s, v22.4s, v9.4s\n"
- "mov v22.16b, v14.16b\n"
- "mov v21.16b, v14.16b\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "fmla v20.4s, v25.4s, v7.4s\n"
- "fmla v16.4s, v25.4s, v5.4s\n"
- "fmla v0.4s, v25.4s, v10.4s\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "str q23, [x23]\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "ldr q28, [%[inptr0], x21]\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "ldr q30, [x12, %[input_col_stride1]]\n"
- "fmla v24.4s, v29.4s, v4.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v16.4s, v29.4s, v7.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "str q24, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v1.4s, v29.4s, v10.4s\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "ldr q27, [x11, x15]\n"
- "fmla v3.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "fmla v22.4s, v29.4s, v12.4s\n"
- "ldr q23, [x10, x17]\n"
- "fmla v13.4s, v26.4s, v7.4s\n"
- "fmla v2.4s, v26.4s, v10.4s\n"
- "fmla v3.4s, v26.4s, v8.4s\n"
- "fmla v17.4s, v26.4s, v11.4s\n"
- "fmla v0.4s, v30.4s, v5.4s\n"
- "ldr q24, [x9, x19]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "ldr q29, [x8, x21]\n"
- "fmla v3.4s, v28.4s, v10.4s\n"
- "ldr q14, [x12, x15]\n"
- "fmla v20.4s, v27.4s, v4.4s\n"
- "add x8, x8, #16\n"
- "fmla v0.4s, v27.4s, v7.4s\n"
- "prfm pldl1keep, [x8, #64]\n"
- "fmla v1.4s, v27.4s, v5.4s\n"
- "prfm pldl1keep, [x8, x28]\n"
- "str q20, [x24]\n"
- "fmla v15.4s, v27.4s, v8.4s\n"
- "fmla v18.4s, v27.4s, v6.4s\n"
- "ldr q25, [x11, x17]\n"
- "fmla v19.4s, v27.4s, v9.4s\n"
- "ldr q30, [x10, x19]\n"
- "fmla v16.4s, v23.4s, v4.4s\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "fmla v2.4s, v23.4s, v5.4s\n"
- "fmla v15.4s, v23.4s, v10.4s\n"
- "fmla v18.4s, v23.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v6.4s\n"
- "str q16, [x23, %[output_col_stride1]]\n"
- "fmla v19.4s, v23.4s, v11.4s\n"
- "fmla v22.4s, v23.4s, v9.4s\n"
- "ldr q26, [x9, x21]\n"
- "fmla v21.4s, v23.4s, v12.4s\n"
- "ldr q27, [x12, x17]\n"
- "fmla v13.4s, v24.4s, v4.4s\n"
- "ldr q20, [x11, x19]\n"
- "fmla v2.4s, v24.4s, v7.4s\n"
- "add x9, x9, #16\n"
- "fmla v3.4s, v24.4s, v5.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "str q13, [%[outptr0], x26]\n"
- "fmla v18.4s, v24.4s, v10.4s\n"
- "fmla v17.4s, v24.4s, v8.4s\n"
- "ldr q23, [x10, x21]\n"
- "fmla v22.4s, v24.4s, v11.4s\n"
- "ldr q24, [x12, x19]\n"
- "fmla v3.4s, v29.4s, v7.4s\n"
- "prfm pldl1keep, [x9, x28]\n"
- "fmla v17.4s, v29.4s, v10.4s\n"
- "ldr q16, [x11, x21]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "add x10, x10, #16\n"
- "fmla v15.4s, v14.4s, v5.4s\n"
- "prfm pldl1keep, [x10, #64]\n"
- "fmla v19.4s, v14.4s, v6.4s\n"
- "ldr q13, [x12, x21]\n"
- "str q0, [x25]\n"
- "fmla v1.4s, v25.4s, v4.4s\n"
- "fmla v15.4s, v25.4s, v7.4s\n"
- "ldr q14, [%[wbptr]]\n"
- "fmla v18.4s, v25.4s, v5.4s\n"
- "add x11, x11, #16\n"
- "str q1, [x24, %[output_col_stride1]]\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "fmla v22.4s, v25.4s, v6.4s\n"
- "ldr q12, [%[wbptr], #16]\n"
- "fmla v21.4s, v25.4s, v9.4s\n"
- "ldr q29, [%[inptr0]]\n"
- "fmla v2.4s, v30.4s, v4.4s\n"
- "ldr q28, [x8]\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "add x12, x12, #16\n"
- "fmla v17.4s, v30.4s, v5.4s\n"
- "fmla v19.4s, v30.4s, v10.4s\n"
- "str q2, [x23, x26]\n"
- "fmla v22.4s, v30.4s, v8.4s\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "ldr q9, [%[wbptr], #64]\n"
- "fmla v3.4s, v26.4s, v4.4s\n"
- "ldr q30, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v17.4s, v26.4s, v7.4s\n"
- "ldr q25, [x9]\n"
- "fmla v22.4s, v26.4s, v10.4s\n"
- "ldr q11, [%[wbptr], #32]\n"
- "str q3, [%[outptr0], x27]\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v19.4s, v27.4s, v5.4s\n"
- "ldr q26, [x8, %[input_col_stride1]]\n"
- "fmla v21.4s, v27.4s, v6.4s\n"
- "ldr q27, [%[inptr0], x15]\n"
- "str q15, [x25, %[output_col_stride1]]\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "fmla v19.4s, v20.4s, v7.4s\n"
- "ldr q15, [x10]\n"
- "fmla v22.4s, v20.4s, v5.4s\n"
- "ldr q6, [%[wbptr], #112]\n"
- "str q18, [x24, x26]\n"
- "fmla v21.4s, v20.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "ldr q18, [x9, %[input_col_stride1]]\n"
- "fmla v22.4s, v23.4s, v7.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v21.4s, v23.4s, v10.4s\n"
- "ldr q8, [%[wbptr], #80]\n"
- "str q17, [x23, x27]\n"
- "fmla v19.4s, v24.4s, v4.4s\n"
- "fmla v22.4s, v16.4s, v4.4s\n"
- "add x23, x23, #16\n"
- "fmla v21.4s, v24.4s, v5.4s\n"
- "ldr q10, [%[wbptr], #48]\n"
- "str q19, [x25, x26]\n"
- "mov v17.16b, v14.16b\n"
- "str q22, [x24, x27]\n"
- "mov v23.16b, v14.16b\n"
- "fmla v21.4s, v16.4s, v7.4s\n"
- "ldr q5, [%[wbptr], #128]\n"
- "mov v24.16b, v14.16b\n"
- "add x24, x24, #16\n"
- "mov v20.16b, v14.16b\n"
- "mov v16.16b, v14.16b\n"
- "fmla v21.4s, v13.4s, v4.4s\n"
- "ldr q7, [%[wbptr], #96]\n"
- "mov v13.16b, v14.16b\n"
- "mov v0.16b, v14.16b\n"
- "mov v1.16b, v14.16b\n"
- "mov v2.16b, v14.16b\n"
- "str q21, [x25, x27]\n"
- "mov v3.16b, v14.16b\n"
- "ldr q4, [%[wbptr], #144]\n"
- "add x25, x25, #16\n"
- "fmla v17.4s, v29.4s, v12.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v17.4s, v28.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x16]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr q22, [x8, x15]\n"
- "fmla v24.4s, v30.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "ldr q29, [%[inptr0], x17]\n"
- "fmla v23.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x11, #64]\n"
- "fmla v20.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x10, x28]\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "ldr q25, [x11]\n"
- "fmla v23.4s, v26.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x7]\n"
- "fmla v17.4s, v26.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v16.4s, v26.4s, v12.4s\n"
- "ldr q28, [x10, %[input_col_stride1]]\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "prfm pldl1keep, [x12, #64]\n"
- "fmla v17.4s, v27.4s, v10.4s\n"
- "prfm pldl1keep, [x11, x28]\n"
- "fmla v13.4s, v27.4s, v12.4s\n"
- "ldr q19, [x9, x15]\n"
- "fmla v23.4s, v15.4s, v6.4s\n"
- "prfm pldl1keep, [x10, x16]\n"
- "fmla v20.4s, v15.4s, v9.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v0.4s, v15.4s, v12.4s\n"
- "ldr q21, [x8, x17]\n"
- "fmla v17.4s, v18.4s, v5.4s\n"
- "prfm pldl1keep, [x8, x20]\n"
- "fmla v23.4s, v18.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "fmla v24.4s, v18.4s, v6.4s\n"
- "prfm pldl1keep, [x12, x28]\n"
- "fmla v20.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x16]\n"
- "fmla v16.4s, v18.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x7]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "ldr q27, [%[inptr0], x19]\n"
- "fmla v17.4s, v22.4s, v7.4s\n"
- "prfm pldl1keep, [x9, x20]\n"
- "fmla v23.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x8, x22]\n"
- "fmla v24.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x16]\n"
- "fmla v16.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x7]\n"
- "fmla v13.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x20]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "ldr q18, [x12]\n"
- "fmla v24.4s, v29.4s, v10.4s\n"
- "prfm pldl1keep, [x9, x22]\n"
- "fmla v13.4s, v29.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x7]\n"
- "fmla v3.4s, v29.4s, v12.4s\n"
- "ldr q22, [x11, %[input_col_stride1]]\n"
- "fmla v20.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x20]\n"
- "fmla v0.4s, v25.4s, v9.4s\n"
- "ldr q25, [x10, x15]\n"
- "fmla v23.4s, v28.4s, v5.4s\n"
- "prfm pldl1keep, [x10, x22]\n"
- "fmla v20.4s, v28.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x20]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x22]\n"
- "fmla v0.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x22]\n"
- "fmla v1.4s, v28.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v17.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v23.4s, v19.4s, v7.4s\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v20.4s, v19.4s, v10.4s\n"
- "fmla v16.4s, v19.4s, v8.4s\n"
- "str q17, [%[outptr0]]\n"
- "mov v15.16b, v14.16b\n"
- "fmla v13.4s, v19.4s, v6.4s\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "fmla v15.4s, v28.4s, v12.4s\n"
- "ldr q29, [x9, x17]\n"
- "fmla v2.4s, v19.4s, v9.4s\n"
- "fmla v24.4s, v21.4s, v7.4s\n"
- "fmla v16.4s, v21.4s, v10.4s\n"
- "fmla v13.4s, v21.4s, v8.4s\n"
- "fmla v3.4s, v21.4s, v9.4s\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "mov v18.16b, v14.16b\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "fmla v13.4s, v27.4s, v10.4s\n"
- "fmla v20.4s, v22.4s, v5.4s\n"
- "fmla v18.4s, v19.4s, v12.4s\n"
- "ldr q26, [x8, x19]\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "ldr q28, [%[inptr0], x21]\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v1.4s, v22.4s, v6.4s\n"
- "fmla v15.4s, v22.4s, v9.4s\n"
- "mov v17.16b, v14.16b\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "fmla v20.4s, v25.4s, v7.4s\n"
- "fmla v16.4s, v25.4s, v5.4s\n"
- "fmla v17.4s, v21.4s, v12.4s\n"
- "ldr q30, [x12, %[input_col_stride1]]\n"
- "str q23, [x23]\n"
- "mov v19.16b, v14.16b\n"
- "fmla v0.4s, v25.4s, v10.4s\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "mov v22.16b, v14.16b\n"
- "mov v21.16b, v14.16b\n"
- "fmla v24.4s, v29.4s, v4.4s\n"
- "fmla v16.4s, v29.4s, v7.4s\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "fmla v1.4s, v29.4s, v10.4s\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "fmla v3.4s, v29.4s, v6.4s\n"
- "str q24, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "ldr q27, [x11, x15]\n"
- "fmla v22.4s, v29.4s, v12.4s\n"
- "ldr q23, [x10, x17]\n"
- "fmla v13.4s, v26.4s, v7.4s\n"
- "fmla v2.4s, v26.4s, v10.4s\n"
- "fmla v3.4s, v26.4s, v8.4s\n"
- "fmla v17.4s, v26.4s, v11.4s\n"
- "fmla v0.4s, v30.4s, v5.4s\n"
- "ldr q24, [x9, x19]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "ldr q29, [x8, x21]\n"
- "fmla v3.4s, v28.4s, v10.4s\n"
- "ldr q14, [x12, x15]\n"
- "fmla v20.4s, v27.4s, v4.4s\n"
- "add x8, x8, #16\n"
- "fmla v0.4s, v27.4s, v7.4s\n"
- "fmla v1.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v27.4s, v8.4s\n"
- "fmla v18.4s, v27.4s, v6.4s\n"
- "str q20, [x24]\n"
- "fmla v19.4s, v27.4s, v9.4s\n"
- "fmla v16.4s, v23.4s, v4.4s\n"
- "ldr q25, [x11, x17]\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "ldr q30, [x10, x19]\n"
- "fmla v2.4s, v23.4s, v5.4s\n"
- "fmla v15.4s, v23.4s, v10.4s\n"
- "str q16, [x23, %[output_col_stride1]]\n"
- "fmla v18.4s, v23.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v6.4s\n"
- "ldr q26, [x9, x21]\n"
- "fmla v19.4s, v23.4s, v11.4s\n"
- "add x9, x9, #16\n"
- "fmla v22.4s, v23.4s, v9.4s\n"
- "fmla v21.4s, v23.4s, v12.4s\n"
- "fmla v13.4s, v24.4s, v4.4s\n"
- "ldr q27, [x12, x17]\n"
- "fmla v2.4s, v24.4s, v7.4s\n"
- "ldr q20, [x11, x19]\n"
- "fmla v3.4s, v24.4s, v5.4s\n"
- "fmla v18.4s, v24.4s, v10.4s\n"
- "str q13, [%[outptr0], x26]\n"
- "fmla v17.4s, v24.4s, v8.4s\n"
- "fmla v22.4s, v24.4s, v11.4s\n"
- "ldr q23, [x10, x21]\n"
- "fmla v3.4s, v29.4s, v7.4s\n"
- "ldr q24, [x12, x19]\n"
- "fmla v17.4s, v29.4s, v10.4s\n"
- "ldr q16, [x11, x21]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "add x10, x10, #16\n"
- "fmla v15.4s, v14.4s, v5.4s\n"
- "add x11, x11, #16\n"
- "fmla v19.4s, v14.4s, v6.4s\n"
- "ldr q13, [x12, x21]\n"
- "str q0, [x25]\n"
- "fmla v1.4s, v25.4s, v4.4s\n"
- "fmla v15.4s, v25.4s, v7.4s\n"
- "add x12, x12, #16\n"
- "fmla v18.4s, v25.4s, v5.4s\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "str q1, [x24, %[output_col_stride1]]\n"
- "fmla v22.4s, v25.4s, v6.4s\n"
- "fmla v21.4s, v25.4s, v9.4s\n"
- "fmla v2.4s, v30.4s, v4.4s\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "fmla v17.4s, v30.4s, v5.4s\n"
- "fmla v19.4s, v30.4s, v10.4s\n"
- "fmla v22.4s, v30.4s, v8.4s\n"
- "str q2, [x23, x26]\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "fmla v3.4s, v26.4s, v4.4s\n"
- "fmla v17.4s, v26.4s, v7.4s\n"
- "fmla v22.4s, v26.4s, v10.4s\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v19.4s, v27.4s, v5.4s\n"
- "fmla v21.4s, v27.4s, v6.4s\n"
- "str q3, [%[outptr0], x27]\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "str q15, [x25, %[output_col_stride1]]\n"
- "fmla v22.4s, v20.4s, v5.4s\n"
- "fmla v19.4s, v20.4s, v7.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "str q18, [x24, x26]\n"
- "fmla v21.4s, v20.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "fmla v22.4s, v23.4s, v7.4s\n"
- "fmla v19.4s, v24.4s, v4.4s\n"
- "fmla v21.4s, v23.4s, v10.4s\n"
- "str q17, [x23, x27]\n"
- "fmla v22.4s, v16.4s, v4.4s\n"
- "str q19, [x25, x26]\n"
- "add x23, x23, #16\n"
- "fmla v21.4s, v24.4s, v5.4s\n"
- "str q22, [x24, x27]\n"
- "add x24, x24, #16\n"
- "fmla v21.4s, v16.4s, v7.4s\n"
- "fmla v21.4s, v13.4s, v4.4s\n"
- "str q21, [x25, x27]\n"
- "add x25, x25, #16\n"
- "4:\n"
- "cbz x13, 7f\n"
- "ldr s14, [%[wbptr]]\n"
- "mov v17.16b, v14.16b\n"
- "ldr s12, [%[wbptr], #4]\n"
- "mov v23.16b, v14.16b\n"
- "ldr s11, [%[wbptr], #8]\n"
- "mov v24.16b, v14.16b\n"
- "ldr s10, [%[wbptr], #12]\n"
- "mov v20.16b, v14.16b\n"
- "ldr s9, [%[wbptr], #16]\n"
- "mov v16.16b, v14.16b\n"
- "ldr s8, [%[wbptr], #20]\n"
- "mov v13.16b, v14.16b\n"
- "ldr s7, [%[wbptr], #24]\n"
- "mov v0.16b, v14.16b\n"
- "ldr s6, [%[wbptr], #28]\n"
- "mov v1.16b, v14.16b\n"
- "ldr s5, [%[wbptr], #32]\n"
- "mov v2.16b, v14.16b\n"
- "ldr s4, [%[wbptr], #36]\n"
- "mov v3.16b, v14.16b\n"
- "ldr s29, [%[inptr0]]\n"
- "fmla v17.4s, v29.4s, v12.4s\n"
- "ldr s28, [x8]\n"
- "ldr s30, [%[inptr0], %[input_col_stride1]]\n"
- "subs x13, x13, #1\n"
- "ldr s25, [x9]\n"
- "ldr s26, [x8, %[input_col_stride1]]\n"
- "ldr s27, [%[inptr0], x15]\n"
- "ldr s15, [x10]\n"
- "ldr s18, [x9, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x8, #64]\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "prfm pldl1keep, [x9, #64]\n"
- "prfm pldl1keep, [x8, x28]\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "prfm pldl1keep, [x10, #64]\n"
- "prfm pldl1keep, [x9, x28]\n"
- "beq 6f\n"
- "5:\n"
- "fmla v17.4s, v28.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x16]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr s22, [x8, x15]\n"
- "fmla v24.4s, v30.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "ldr s29, [%[inptr0], x17]\n"
- "fmla v23.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x11, #64]\n"
- "fmla v20.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x10, x28]\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "ldr s25, [x11]\n"
- "fmla v23.4s, v26.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x7]\n"
- "fmla v17.4s, v26.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v16.4s, v26.4s, v12.4s\n"
- "ldr s28, [x10, %[input_col_stride1]]\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "prfm pldl1keep, [x12, #64]\n"
- "fmla v17.4s, v27.4s, v10.4s\n"
- "prfm pldl1keep, [x11, x28]\n"
- "fmla v13.4s, v27.4s, v12.4s\n"
- "ldr s19, [x9, x15]\n"
- "fmla v23.4s, v15.4s, v6.4s\n"
- "prfm pldl1keep, [x10, x16]\n"
- "fmla v20.4s, v15.4s, v9.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v0.4s, v15.4s, v12.4s\n"
- "ldr s21, [x8, x17]\n"
- "fmla v17.4s, v18.4s, v5.4s\n"
- "prfm pldl1keep, [x8, x20]\n"
- "fmla v23.4s, v18.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "fmla v24.4s, v18.4s, v6.4s\n"
- "prfm pldl1keep, [x12, x28]\n"
- "fmla v20.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x16]\n"
- "fmla v16.4s, v18.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x7]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "ldr s27, [%[inptr0], x19]\n"
- "fmla v17.4s, v22.4s, v7.4s\n"
- "prfm pldl1keep, [x9, x20]\n"
- "fmla v23.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x8, x22]\n"
- "fmla v24.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x16]\n"
- "fmla v16.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x7]\n"
- "fmla v13.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x20]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "ldr s18, [x12]\n"
- "fmla v24.4s, v29.4s, v10.4s\n"
- "prfm pldl1keep, [x9, x22]\n"
- "fmla v13.4s, v29.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x7]\n"
- "fmla v3.4s, v29.4s, v12.4s\n"
- "ldr s22, [x11, %[input_col_stride1]]\n"
- "fmla v20.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x20]\n"
- "fmla v0.4s, v25.4s, v9.4s\n"
- "ldr s25, [x10, x15]\n"
- "fmla v23.4s, v28.4s, v5.4s\n"
- "prfm pldl1keep, [x10, x22]\n"
- "fmla v20.4s, v28.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x20]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x22]\n"
- "fmla v0.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x22]\n"
- "fmla v1.4s, v28.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v17.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v23.4s, v19.4s, v7.4s\n"
- "subs x13, x13, #1\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v20.4s, v19.4s, v10.4s\n"
- "str s17, [%[outptr0]]\n"
- "mov v15.16b, v14.16b\n"
- "fmla v16.4s, v19.4s, v8.4s\n"
- "fmla v13.4s, v19.4s, v6.4s\n"
- "fmla v15.4s, v28.4s, v12.4s\n"
- "ldr s29, [x9, x17]\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "fmla v2.4s, v19.4s, v9.4s\n"
- "fmla v24.4s, v21.4s, v7.4s\n"
- "fmla v16.4s, v21.4s, v10.4s\n"
- "fmla v13.4s, v21.4s, v8.4s\n"
- "fmla v3.4s, v21.4s, v9.4s\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "mov v18.16b, v14.16b\n"
- "fmla v20.4s, v22.4s, v5.4s\n"
- "fmla v13.4s, v27.4s, v10.4s\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "mov v17.16b, v14.16b\n"
- "fmla v18.4s, v19.4s, v12.4s\n"
- "mov v19.16b, v14.16b\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "fmla v17.4s, v21.4s, v12.4s\n"
- "ldr s26, [x8, x19]\n"
- "fmla v1.4s, v22.4s, v6.4s\n"
- "fmla v15.4s, v22.4s, v9.4s\n"
- "mov v22.16b, v14.16b\n"
- "mov v21.16b, v14.16b\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "fmla v20.4s, v25.4s, v7.4s\n"
- "fmla v16.4s, v25.4s, v5.4s\n"
- "fmla v0.4s, v25.4s, v10.4s\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "str s23, [x23]\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "ldr s28, [%[inptr0], x21]\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "ldr s30, [x12, %[input_col_stride1]]\n"
- "fmla v24.4s, v29.4s, v4.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v16.4s, v29.4s, v7.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "str s24, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v1.4s, v29.4s, v10.4s\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "ldr s27, [x11, x15]\n"
- "fmla v3.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "fmla v22.4s, v29.4s, v12.4s\n"
- "ldr s23, [x10, x17]\n"
- "fmla v13.4s, v26.4s, v7.4s\n"
- "fmla v2.4s, v26.4s, v10.4s\n"
- "fmla v3.4s, v26.4s, v8.4s\n"
- "fmla v17.4s, v26.4s, v11.4s\n"
- "fmla v0.4s, v30.4s, v5.4s\n"
- "ldr s24, [x9, x19]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "ldr s29, [x8, x21]\n"
- "fmla v3.4s, v28.4s, v10.4s\n"
- "ldr s14, [x12, x15]\n"
- "fmla v20.4s, v27.4s, v4.4s\n"
- "add x8, x8, #4\n"
- "fmla v0.4s, v27.4s, v7.4s\n"
- "prfm pldl1keep, [x8, #64]\n"
- "fmla v1.4s, v27.4s, v5.4s\n"
- "prfm pldl1keep, [x8, x28]\n"
- "str s20, [x24]\n"
- "fmla v15.4s, v27.4s, v8.4s\n"
- "fmla v18.4s, v27.4s, v6.4s\n"
- "ldr s25, [x11, x17]\n"
- "fmla v19.4s, v27.4s, v9.4s\n"
- "ldr s30, [x10, x19]\n"
- "fmla v16.4s, v23.4s, v4.4s\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "fmla v2.4s, v23.4s, v5.4s\n"
- "fmla v15.4s, v23.4s, v10.4s\n"
- "fmla v18.4s, v23.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v6.4s\n"
- "str s16, [x23, %[output_col_stride1]]\n"
- "fmla v19.4s, v23.4s, v11.4s\n"
- "fmla v22.4s, v23.4s, v9.4s\n"
- "ldr s26, [x9, x21]\n"
- "fmla v21.4s, v23.4s, v12.4s\n"
- "ldr s27, [x12, x17]\n"
- "fmla v13.4s, v24.4s, v4.4s\n"
- "ldr s20, [x11, x19]\n"
- "fmla v2.4s, v24.4s, v7.4s\n"
- "add x9, x9, #4\n"
- "fmla v3.4s, v24.4s, v5.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "str s13, [%[outptr0], x26]\n"
- "fmla v18.4s, v24.4s, v10.4s\n"
- "fmla v17.4s, v24.4s, v8.4s\n"
- "ldr s23, [x10, x21]\n"
- "fmla v22.4s, v24.4s, v11.4s\n"
- "ldr s24, [x12, x19]\n"
- "fmla v3.4s, v29.4s, v7.4s\n"
- "prfm pldl1keep, [x9, x28]\n"
- "fmla v17.4s, v29.4s, v10.4s\n"
- "ldr s16, [x11, x21]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "add x10, x10, #4\n"
- "fmla v15.4s, v14.4s, v5.4s\n"
- "prfm pldl1keep, [x10, #64]\n"
- "fmla v19.4s, v14.4s, v6.4s\n"
- "ldr s13, [x12, x21]\n"
- "str s0, [x25]\n"
- "fmla v1.4s, v25.4s, v4.4s\n"
- "fmla v15.4s, v25.4s, v7.4s\n"
- "ldr s14, [%[wbptr]]\n"
- "fmla v18.4s, v25.4s, v5.4s\n"
- "add x11, x11, #4\n"
- "str s1, [x24, %[output_col_stride1]]\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "fmla v22.4s, v25.4s, v6.4s\n"
- "ldr s12, [%[wbptr], #4]\n"
- "fmla v21.4s, v25.4s, v9.4s\n"
- "ldr s29, [%[inptr0]]\n"
- "fmla v2.4s, v30.4s, v4.4s\n"
- "ldr s28, [x8]\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "add x12, x12, #4\n"
- "fmla v17.4s, v30.4s, v5.4s\n"
- "fmla v19.4s, v30.4s, v10.4s\n"
- "str s2, [x23, x26]\n"
- "fmla v22.4s, v30.4s, v8.4s\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "ldr s9, [%[wbptr], #16]\n"
- "fmla v3.4s, v26.4s, v4.4s\n"
- "ldr s30, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v17.4s, v26.4s, v7.4s\n"
- "ldr s25, [x9]\n"
- "fmla v22.4s, v26.4s, v10.4s\n"
- "ldr s11, [%[wbptr], #8]\n"
- "str s3, [%[outptr0], x27]\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v19.4s, v27.4s, v5.4s\n"
- "ldr s26, [x8, %[input_col_stride1]]\n"
- "fmla v21.4s, v27.4s, v6.4s\n"
- "ldr s27, [%[inptr0], x15]\n"
- "str s15, [x25, %[output_col_stride1]]\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "fmla v19.4s, v20.4s, v7.4s\n"
- "ldr s15, [x10]\n"
- "fmla v22.4s, v20.4s, v5.4s\n"
- "ldr s6, [%[wbptr], #28]\n"
- "str s18, [x24, x26]\n"
- "fmla v21.4s, v20.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "ldr s18, [x9, %[input_col_stride1]]\n"
- "fmla v22.4s, v23.4s, v7.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v21.4s, v23.4s, v10.4s\n"
- "ldr s8, [%[wbptr], #20]\n"
- "str s17, [x23, x27]\n"
- "fmla v19.4s, v24.4s, v4.4s\n"
- "fmla v22.4s, v16.4s, v4.4s\n"
- "add x23, x23, #4\n"
- "fmla v21.4s, v24.4s, v5.4s\n"
- "ldr s10, [%[wbptr], #12]\n"
- "str s19, [x25, x26]\n"
- "mov v17.16b, v14.16b\n"
- "str s22, [x24, x27]\n"
- "mov v23.16b, v14.16b\n"
- "fmla v21.4s, v16.4s, v7.4s\n"
- "ldr s5, [%[wbptr], #32]\n"
- "mov v24.16b, v14.16b\n"
- "add x24, x24, #4\n"
- "mov v20.16b, v14.16b\n"
- "mov v16.16b, v14.16b\n"
- "fmla v21.4s, v13.4s, v4.4s\n"
- "ldr s7, [%[wbptr], #24]\n"
- "mov v13.16b, v14.16b\n"
- "mov v0.16b, v14.16b\n"
- "mov v1.16b, v14.16b\n"
- "mov v2.16b, v14.16b\n"
- "str s21, [x25, x27]\n"
- "mov v3.16b, v14.16b\n"
- "ldr s4, [%[wbptr], #36]\n"
- "add x25, x25, #4\n"
- "fmla v17.4s, v29.4s, v12.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v17.4s, v28.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x16]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr s22, [x8, x15]\n"
- "fmla v24.4s, v30.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "ldr s29, [%[inptr0], x17]\n"
- "fmla v23.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x11, #64]\n"
- "fmla v20.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x10, x28]\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "ldr s25, [x11]\n"
- "fmla v23.4s, v26.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x7]\n"
- "fmla v17.4s, v26.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v16.4s, v26.4s, v12.4s\n"
- "ldr s28, [x10, %[input_col_stride1]]\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "prfm pldl1keep, [x12, #64]\n"
- "fmla v17.4s, v27.4s, v10.4s\n"
- "prfm pldl1keep, [x11, x28]\n"
- "fmla v13.4s, v27.4s, v12.4s\n"
- "ldr s19, [x9, x15]\n"
- "fmla v23.4s, v15.4s, v6.4s\n"
- "prfm pldl1keep, [x10, x16]\n"
- "fmla v20.4s, v15.4s, v9.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v0.4s, v15.4s, v12.4s\n"
- "ldr s21, [x8, x17]\n"
- "fmla v17.4s, v18.4s, v5.4s\n"
- "prfm pldl1keep, [x8, x20]\n"
- "fmla v23.4s, v18.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "fmla v24.4s, v18.4s, v6.4s\n"
- "prfm pldl1keep, [x12, x28]\n"
- "fmla v20.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x16]\n"
- "fmla v16.4s, v18.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x7]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "ldr s27, [%[inptr0], x19]\n"
- "fmla v17.4s, v22.4s, v7.4s\n"
- "prfm pldl1keep, [x9, x20]\n"
- "fmla v23.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x8, x22]\n"
- "fmla v24.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x16]\n"
- "fmla v16.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x7]\n"
- "fmla v13.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x20]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "ldr s18, [x12]\n"
- "fmla v24.4s, v29.4s, v10.4s\n"
- "prfm pldl1keep, [x9, x22]\n"
- "fmla v13.4s, v29.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x7]\n"
- "fmla v3.4s, v29.4s, v12.4s\n"
- "ldr s22, [x11, %[input_col_stride1]]\n"
- "fmla v20.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x20]\n"
- "fmla v0.4s, v25.4s, v9.4s\n"
- "ldr s25, [x10, x15]\n"
- "fmla v23.4s, v28.4s, v5.4s\n"
- "prfm pldl1keep, [x10, x22]\n"
- "fmla v20.4s, v28.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x20]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x22]\n"
- "fmla v0.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x22]\n"
- "fmla v1.4s, v28.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v17.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v23.4s, v19.4s, v7.4s\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v20.4s, v19.4s, v10.4s\n"
- "fmla v16.4s, v19.4s, v8.4s\n"
- "str s17, [%[outptr0]]\n"
- "mov v15.16b, v14.16b\n"
- "fmla v13.4s, v19.4s, v6.4s\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "fmla v15.4s, v28.4s, v12.4s\n"
- "ldr s29, [x9, x17]\n"
- "fmla v2.4s, v19.4s, v9.4s\n"
- "fmla v24.4s, v21.4s, v7.4s\n"
- "fmla v16.4s, v21.4s, v10.4s\n"
- "fmla v13.4s, v21.4s, v8.4s\n"
- "fmla v3.4s, v21.4s, v9.4s\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "mov v18.16b, v14.16b\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "fmla v13.4s, v27.4s, v10.4s\n"
- "fmla v20.4s, v22.4s, v5.4s\n"
- "fmla v18.4s, v19.4s, v12.4s\n"
- "ldr s26, [x8, x19]\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "ldr s28, [%[inptr0], x21]\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v1.4s, v22.4s, v6.4s\n"
- "fmla v15.4s, v22.4s, v9.4s\n"
- "mov v17.16b, v14.16b\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "fmla v20.4s, v25.4s, v7.4s\n"
- "fmla v16.4s, v25.4s, v5.4s\n"
- "fmla v17.4s, v21.4s, v12.4s\n"
- "ldr s30, [x12, %[input_col_stride1]]\n"
- "str s23, [x23]\n"
- "mov v19.16b, v14.16b\n"
- "fmla v0.4s, v25.4s, v10.4s\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "mov v22.16b, v14.16b\n"
- "mov v21.16b, v14.16b\n"
- "fmla v24.4s, v29.4s, v4.4s\n"
- "fmla v16.4s, v29.4s, v7.4s\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "fmla v1.4s, v29.4s, v10.4s\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "fmla v3.4s, v29.4s, v6.4s\n"
- "str s24, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "ldr s27, [x11, x15]\n"
- "fmla v22.4s, v29.4s, v12.4s\n"
- "ldr s23, [x10, x17]\n"
- "fmla v13.4s, v26.4s, v7.4s\n"
- "fmla v2.4s, v26.4s, v10.4s\n"
- "fmla v3.4s, v26.4s, v8.4s\n"
- "fmla v17.4s, v26.4s, v11.4s\n"
- "fmla v0.4s, v30.4s, v5.4s\n"
- "ldr s24, [x9, x19]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "ldr s29, [x8, x21]\n"
- "fmla v3.4s, v28.4s, v10.4s\n"
- "ldr s14, [x12, x15]\n"
- "fmla v20.4s, v27.4s, v4.4s\n"
- "add x8, x8, #4\n"
- "fmla v0.4s, v27.4s, v7.4s\n"
- "fmla v1.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v27.4s, v8.4s\n"
- "fmla v18.4s, v27.4s, v6.4s\n"
- "str s20, [x24]\n"
- "fmla v19.4s, v27.4s, v9.4s\n"
- "fmla v16.4s, v23.4s, v4.4s\n"
- "ldr s25, [x11, x17]\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "ldr s30, [x10, x19]\n"
- "fmla v2.4s, v23.4s, v5.4s\n"
- "fmla v15.4s, v23.4s, v10.4s\n"
- "str s16, [x23, %[output_col_stride1]]\n"
- "fmla v18.4s, v23.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v6.4s\n"
- "ldr s26, [x9, x21]\n"
- "fmla v19.4s, v23.4s, v11.4s\n"
- "add x9, x9, #4\n"
- "fmla v22.4s, v23.4s, v9.4s\n"
- "fmla v21.4s, v23.4s, v12.4s\n"
- "fmla v13.4s, v24.4s, v4.4s\n"
- "ldr s27, [x12, x17]\n"
- "fmla v2.4s, v24.4s, v7.4s\n"
- "ldr s20, [x11, x19]\n"
- "fmla v3.4s, v24.4s, v5.4s\n"
- "fmla v18.4s, v24.4s, v10.4s\n"
- "str s13, [%[outptr0], x26]\n"
- "fmla v17.4s, v24.4s, v8.4s\n"
- "fmla v22.4s, v24.4s, v11.4s\n"
- "ldr s23, [x10, x21]\n"
- "fmla v3.4s, v29.4s, v7.4s\n"
- "ldr s24, [x12, x19]\n"
- "fmla v17.4s, v29.4s, v10.4s\n"
- "ldr s16, [x11, x21]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "add x10, x10, #4\n"
- "fmla v15.4s, v14.4s, v5.4s\n"
- "add x11, x11, #4\n"
- "fmla v19.4s, v14.4s, v6.4s\n"
- "ldr s13, [x12, x21]\n"
- "str s0, [x25]\n"
- "fmla v1.4s, v25.4s, v4.4s\n"
- "fmla v15.4s, v25.4s, v7.4s\n"
- "add x12, x12, #4\n"
- "fmla v18.4s, v25.4s, v5.4s\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "str s1, [x24, %[output_col_stride1]]\n"
- "fmla v22.4s, v25.4s, v6.4s\n"
- "fmla v21.4s, v25.4s, v9.4s\n"
- "fmla v2.4s, v30.4s, v4.4s\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "fmla v17.4s, v30.4s, v5.4s\n"
- "fmla v19.4s, v30.4s, v10.4s\n"
- "fmla v22.4s, v30.4s, v8.4s\n"
- "str s2, [x23, x26]\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "fmla v3.4s, v26.4s, v4.4s\n"
- "fmla v17.4s, v26.4s, v7.4s\n"
- "fmla v22.4s, v26.4s, v10.4s\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v19.4s, v27.4s, v5.4s\n"
- "fmla v21.4s, v27.4s, v6.4s\n"
- "str s3, [%[outptr0], x27]\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "str s15, [x25, %[output_col_stride1]]\n"
- "fmla v22.4s, v20.4s, v5.4s\n"
- "fmla v19.4s, v20.4s, v7.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "str s18, [x24, x26]\n"
- "fmla v21.4s, v20.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "fmla v22.4s, v23.4s, v7.4s\n"
- "fmla v19.4s, v24.4s, v4.4s\n"
- "fmla v21.4s, v23.4s, v10.4s\n"
- "str s17, [x23, x27]\n"
- "fmla v22.4s, v16.4s, v4.4s\n"
- "str s19, [x25, x26]\n"
- "add x23, x23, #4\n"
- "fmla v21.4s, v24.4s, v5.4s\n"
- "str s22, [x24, x27]\n"
- "add x24, x24, #4\n"
- "fmla v21.4s, v16.4s, v7.4s\n"
- "fmla v21.4s, v13.4s, v4.4s\n"
- "str s21, [x25, x27]\n"
- "add x25, x25, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
- );
-}
-
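// For orientation: each execute_tile specialization in this deleted block
// computes a 3x3, stride-1 depthwise convolution over a 6x6 input tile,
// writing a 4x4 output tile, with a per-channel bias folded into the weight
// block (10 floats per channel: the bias, then the nine weights row-major —
// visible in the scalar loops above, which step %[wbptr] by #40). A minimal
// scalar sketch of the variant that follows, under those assumptions — the
// helper name and the flat per-channel [bias, 9 weights] packing are
// illustrative only; the asm interleaves weights in groups of four channels
// for its vectorized loop:
//
//   static void execute_tile_ref(int n_channels, const float *wb,
//                                const float *inptrs[6][6], float *outptrs[4][4])
//   {
//       for (int c = 0; c < n_channels; c++, wb += 10)
//       {
//           for (int oi = 0; oi < 4; oi++)
//           {
//               for (int oj = 0; oj < 4; oj++)
//               {
//                   float acc = wb[0]; // per-channel bias
//                   for (int ki = 0; ki < 3; ki++)
//                       for (int kj = 0; kj < 3; kj++)
//                           acc += wb[1 + ki * 3 + kj] * inptrs[oi + ki][oj + kj][c];
//                   outptrs[oi][oj][c] = acc;
//               }
//           }
//       }
//   }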
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *inptrs[6][6],
- float *outptrs[4][4]
-)
-{
- __asm __volatile(
- "mov x27, xzr\n"
- "mov x28, xzr\n"
- "and x15, %[n_channels], #3\n"
- "lsr x16, %[n_channels], #2\n"
- "cbz x16, 4f\n"
- "1:\n"
- "ldr q13, [%[wbptr]]\n"
- "ldr x17, [%[inptrs], 0]\n"
- "mov v18.16b, v13.16b\n"
- "ldr q12, [%[wbptr], #16]\n"
- "mov v22.16b, v13.16b\n"
- "ldr q11, [%[wbptr], #32]\n"
- "mov v23.16b, v13.16b\n"
- "ldr q10, [%[wbptr], #48]\n"
- "mov v19.16b, v13.16b\n"
- "ldr q9, [%[wbptr], #64]\n"
- "mov v17.16b, v13.16b\n"
- "ldr q8, [%[wbptr], #80]\n"
- "mov v14.16b, v13.16b\n"
- "ldr q7, [%[wbptr], #96]\n"
- "mov v0.16b, v13.16b\n"
- "ldr q6, [%[wbptr], #112]\n"
- "mov v1.16b, v13.16b\n"
- "ldr q5, [%[wbptr], #128]\n"
- "mov v2.16b, v13.16b\n"
- "ldr q4, [%[wbptr], #144]\n"
- "ldr q29, [x17, x27]\n"
- "ldr x7, [%[inptrs], 48]\n"
- "fmla v18.4s, v29.4s, v12.4s\n"
- "ldr x17, [%[inptrs], 8]\n"
- "ldr q27, [x7, x27]\n"
- "ldr x19, [%[inptrs], 96]\n"
- "ldr q28, [x17, x27]\n"
- "ldr x7, [%[inptrs], 56]\n"
- "ldr q25, [x19, x27]\n"
- "ldr x17, [%[inptrs], 16]\n"
- "ldr q16, [x7, x27]\n"
- "ldr x20, [%[inptrs], 144]\n"
- "ldr q15, [x17, x27]\n"
- "ldr x19, [%[inptrs], 104]\n"
- "ldr q21, [x20, x27]\n"
- "subs x16, x16, #1\n"
- "ldr q29, [x19, x27]\n"
- "beq 3f\n"
- "2:\n"
- "mov v3.16b, v13.16b\n"
- "ldr x7, [%[inptrs], 64]\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "ldr x17, [%[inptrs], 24]\n"
- "fmla v22.4s, v27.4s, v12.4s\n"
- "ldr q30, [x7, x27]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr x21, [%[inptrs], 192]\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 152]\n"
- "fmla v18.4s, v28.4s, v11.4s\n"
- "ldr q24, [x17, x27]\n"
- "fmla v22.4s, v25.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 112]\n"
- "fmla v23.4s, v16.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 72]\n"
- "fmla v17.4s, v16.4s, v12.4s\n"
- "ldr x17, [%[inptrs], 32]\n"
- "fmla v18.4s, v25.4s, v6.4s\n"
- "ldr q31, [x21, x27]\n"
- "fmla v22.4s, v16.4s, v11.4s\n"
- "ldr x22, [%[inptrs], 240]\n"
- "fmla v23.4s, v15.4s, v11.4s\n"
- "ldr x21, [%[inptrs], 200]\n"
- "fmla v14.4s, v15.4s, v12.4s\n"
- "ldr x23, [%[outptrs], 0]\n"
- "fmla v18.4s, v16.4s, v8.4s\n"
- "ldr q25, [x20, x27]\n"
- "fmla v22.4s, v21.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v19.4s, v21.4s, v9.4s\n"
- "ldr x24, [%[outptrs], 32]\n"
- "fmla v0.4s, v21.4s, v12.4s\n"
- "ldr q21, [x19, x27]\n"
- "fmla v18.4s, v15.4s, v10.4s\n"
- "ldr q20, [x7, x27]\n"
- "fmla v22.4s, v29.4s, v8.4s\n"
- "ldr x19, [%[inptrs], 120]\n"
- "fmla v23.4s, v29.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 80]\n"
- "fmla v19.4s, v29.4s, v11.4s\n"
- "ldr x25, [%[outptrs], 64]\n"
- "fmla v18.4s, v29.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 96]\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "ldr q26, [x17, x27]\n"
- "fmla v22.4s, v30.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 40]\n"
- "fmla v23.4s, v30.4s, v8.4s\n"
- "subs x16, x16, #1\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "fmla v14.4s, v30.4s, v9.4s\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "ldr q27, [x22, x27]\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "ldr x22, [%[inptrs], 248]\n"
- "fmla v23.4s, v24.4s, v10.4s\n"
- "fmla v19.4s, v31.4s, v6.4s\n"
- "fmla v14.4s, v24.4s, v11.4s\n"
- "ldr q30, [x21, x27]\n"
- "fmla v0.4s, v31.4s, v9.4s\n"
- "ldr q24, [x20, x27]\n"
- "fmla v22.4s, v25.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 208]\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "fmla v1.4s, v25.4s, v9.4s\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v21.4s, v4.4s\n"
- "fmla v22.4s, v21.4s, v7.4s\n"
- "fmla v23.4s, v21.4s, v5.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v14.4s, v21.4s, v6.4s\n"
- "fmla v17.4s, v21.4s, v8.4s\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "str q18, [x23, x28]\n"
- "mov v16.16b, v13.16b\n"
- "fmla v2.4s, v21.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 8]\n"
- "fmla v23.4s, v20.4s, v7.4s\n"
- "fmla v14.4s, v20.4s, v8.4s\n"
- "fmla v16.4s, v25.4s, v12.4s\n"
- "ldr q25, [x19, x27]\n"
- "fmla v17.4s, v20.4s, v10.4s\n"
- "ldr x19, [%[inptrs], 128]\n"
- "fmla v2.4s, v20.4s, v11.4s\n"
- "fmla v3.4s, v20.4s, v9.4s\n"
- "fmla v14.4s, v26.4s, v10.4s\n"
- "fmla v0.4s, v27.4s, v6.4s\n"
- "mov v15.16b, v13.16b\n"
- "fmla v19.4s, v30.4s, v5.4s\n"
- "fmla v1.4s, v30.4s, v6.4s\n"
- "fmla v16.4s, v30.4s, v9.4s\n"
- "fmla v3.4s, v26.4s, v11.4s\n"
- "ldr q29, [x7, x27]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "ldr q27, [x17, x27]\n"
- "fmla v0.4s, v30.4s, v8.4s\n"
- "ldr q28, [x22, x27]\n"
- "fmla v22.4s, v24.4s, v4.4s\n"
- "ldr x7, [%[inptrs], 88]\n"
- "fmla v19.4s, v24.4s, v7.4s\n"
- "ldr x22, [%[inptrs], 256]\n"
- "fmla v17.4s, v24.4s, v5.4s\n"
- "ldr x17, [%[inptrs], 0]\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "fmla v1.4s, v24.4s, v8.4s\n"
- "str q22, [x24, x28]\n"
- "mov v18.16b, v13.16b\n"
- "fmla v2.4s, v24.4s, v6.4s\n"
- "ldr x24, [%[outptrs], 40]\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "fmla v18.4s, v20.4s, v12.4s\n"
- "ldr q22, [x21, x27]\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 216]\n"
- "fmla v17.4s, v25.4s, v7.4s\n"
- "fmla v14.4s, v25.4s, v5.4s\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "fmla v2.4s, v25.4s, v8.4s\n"
- "fmla v3.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "str q23, [x23, x28]\n"
- "mov v21.16b, v13.16b\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 16]\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "fmla v2.4s, v29.4s, v10.4s\n"
- "fmla v21.4s, v24.4s, v12.4s\n"
- "ldr q30, [x20, x27]\n"
- "fmla v3.4s, v29.4s, v8.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "ldr q31, [x19, x27]\n"
- "fmla v0.4s, v28.4s, v5.4s\n"
- "ldr x19, [%[inptrs], 136]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "ldr q26, [x7, x27]\n"
- "fmla v3.4s, v27.4s, v10.4s\n"
- "ldr q23, [x22, x27]\n"
- "fmla v19.4s, v22.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 264]\n"
- "fmla v0.4s, v22.4s, v7.4s\n"
- "ldr x7, [%[inptrs], 48]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "fmla v16.4s, v22.4s, v8.4s\n"
- "fmla v15.4s, v22.4s, v6.4s\n"
- "fmla v21.4s, v22.4s, v9.4s\n"
- "str q19, [x25, x28]\n"
- "mov v24.16b, v13.16b\n"
- "mov v20.16b, v13.16b\n"
- "ldr q27, [x21, x27]\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 224]\n"
- "fmla v24.4s, v25.4s, v12.4s\n"
- "ldr q28, [x20, x27]\n"
- "fmla v1.4s, v30.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v2.4s, v30.4s, v5.4s\n"
- "ldr x25, [%[outptrs], 72]\n"
- "str q17, [x24, x28]\n"
- "fmla v16.4s, v30.4s, v10.4s\n"
- "fmla v15.4s, v30.4s, v8.4s\n"
- "ldr q22, [x19, x27]\n"
- "fmla v18.4s, v30.4s, v6.4s\n"
- "ldr x24, [%[outptrs], 48]\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "ldr x19, [%[inptrs], 96]\n"
- "fmla v24.4s, v30.4s, v9.4s\n"
- "fmla v20.4s, v30.4s, v12.4s\n"
- "fmla v14.4s, v31.4s, v4.4s\n"
- "ldr q30, [x22, x27]\n"
- "fmla v2.4s, v31.4s, v7.4s\n"
- "ldr q19, [x21, x27]\n"
- "fmla v3.4s, v31.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 272]\n"
- "fmla v15.4s, v31.4s, v10.4s\n"
- "ldr x21, [%[inptrs], 232]\n"
- "str q14, [x23, x28]\n"
- "fmla v18.4s, v31.4s, v8.4s\n"
- "fmla v24.4s, v31.4s, v11.4s\n"
- "ldr q31, [x20, x27]\n"
- "fmla v3.4s, v26.4s, v7.4s\n"
- "ldr q17, [x22, x27]\n"
- "fmla v0.4s, v23.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 280]\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "ldr q14, [x21, x27]\n"
- "fmla v16.4s, v23.4s, v5.4s\n"
- "ldr x23, [%[outptrs], 24]\n"
- "fmla v21.4s, v23.4s, v6.4s\n"
- "ldr q26, [x22, x27]\n"
- "str q0, [x26, x28]\n"
- "fmla v1.4s, v27.4s, v4.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "ldr q13, [%[wbptr]]\n"
- "fmla v16.4s, v27.4s, v7.4s\n"
- "ldr x26, [%[outptrs], 104]\n"
- "fmla v21.4s, v27.4s, v8.4s\n"
- "add x27, x27, #16\n"
- "str q1, [x25, x28]\n"
- "fmla v24.4s, v27.4s, v6.4s\n"
- "fmla v20.4s, v27.4s, v9.4s\n"
- "ldr q12, [%[wbptr], #16]\n"
- "fmla v2.4s, v28.4s, v4.4s\n"
- "ldr q29, [x17, x27]\n"
- "fmla v15.4s, v28.4s, v7.4s\n"
- "ldr q27, [x7, x27]\n"
- "fmla v18.4s, v28.4s, v5.4s\n"
- "ldr x25, [%[outptrs], 80]\n"
- "fmla v21.4s, v28.4s, v10.4s\n"
- "ldr x17, [%[inptrs], 8]\n"
- "str q2, [x24, x28]\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "fmla v20.4s, v28.4s, v11.4s\n"
- "ldr q9, [%[wbptr], #64]\n"
- "fmla v3.4s, v22.4s, v4.4s\n"
- "ldr q28, [x17, x27]\n"
- "fmla v18.4s, v22.4s, v7.4s\n"
- "ldr q25, [x19, x27]\n"
- "fmla v24.4s, v22.4s, v10.4s\n"
- "ldr x24, [%[outptrs], 56]\n"
- "fmla v16.4s, v30.4s, v4.4s\n"
- "ldr q11, [%[wbptr], #32]\n"
- "str q3, [x23, x28]\n"
- "fmla v21.4s, v30.4s, v5.4s\n"
- "fmla v20.4s, v30.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 56]\n"
- "fmla v15.4s, v19.4s, v4.4s\n"
- "ldr x17, [%[inptrs], 16]\n"
- "str q16, [x26, x28]\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v21.4s, v19.4s, v7.4s\n"
- "ldr q16, [x7, x27]\n"
- "fmla v20.4s, v19.4s, v8.4s\n"
- "ldr q6, [%[wbptr], #112]\n"
- "str q15, [x25, x28]\n"
- "fmla v18.4s, v31.4s, v4.4s\n"
- "fmla v24.4s, v31.4s, v7.4s\n"
- "ldr q15, [x17, x27]\n"
- "fmla v21.4s, v17.4s, v4.4s\n"
- "ldr x25, [%[outptrs], 88]\n"
- "fmla v20.4s, v31.4s, v10.4s\n"
- "ldr q8, [%[wbptr], #80]\n"
- "str q18, [x24, x28]\n"
- "mov v18.16b, v13.16b\n"
- "fmla v24.4s, v14.4s, v4.4s\n"
- "ldr x26, [%[outptrs], 112]\n"
- "mov v22.16b, v13.16b\n"
- "ldr x20, [%[inptrs], 144]\n"
- "str q21, [x26, x28]\n"
- "fmla v20.4s, v17.4s, v5.4s\n"
- "mov v23.16b, v13.16b\n"
- "ldr q10, [%[wbptr], #48]\n"
- "str q24, [x25, x28]\n"
- "mov v19.16b, v13.16b\n"
- "mov v17.16b, v13.16b\n"
- "ldr q21, [x20, x27]\n"
- "fmla v20.4s, v14.4s, v7.4s\n"
- "ldr q5, [%[wbptr], #128]\n"
- "mov v14.16b, v13.16b\n"
- "ldr x26, [%[outptrs], 120]\n"
- "mov v0.16b, v13.16b\n"
- "ldr x19, [%[inptrs], 104]\n"
- "mov v1.16b, v13.16b\n"
- "mov v2.16b, v13.16b\n"
- "fmla v20.4s, v26.4s, v4.4s\n"
- "ldr q7, [%[wbptr], #96]\n"
- "fmla v18.4s, v29.4s, v12.4s\n"
- "ldr q29, [x19, x27]\n"
- "str q20, [x26, x28]\n"
- "ldr q4, [%[wbptr], #144]\n"
- "add x28, x28, #16\n"
- "bne 2b\n"
- "3:\n"
- "mov v3.16b, v13.16b\n"
- "ldr x7, [%[inptrs], 64]\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "ldr x17, [%[inptrs], 24]\n"
- "fmla v22.4s, v27.4s, v12.4s\n"
- "ldr q30, [x7, x27]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr x21, [%[inptrs], 192]\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 152]\n"
- "fmla v18.4s, v28.4s, v11.4s\n"
- "ldr q24, [x17, x27]\n"
- "fmla v22.4s, v25.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 112]\n"
- "fmla v23.4s, v16.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 72]\n"
- "fmla v17.4s, v16.4s, v12.4s\n"
- "ldr x17, [%[inptrs], 32]\n"
- "fmla v18.4s, v25.4s, v6.4s\n"
- "ldr q31, [x21, x27]\n"
- "fmla v22.4s, v16.4s, v11.4s\n"
- "ldr x22, [%[inptrs], 240]\n"
- "fmla v23.4s, v15.4s, v11.4s\n"
- "ldr x21, [%[inptrs], 200]\n"
- "fmla v14.4s, v15.4s, v12.4s\n"
- "ldr x23, [%[outptrs], 0]\n"
- "fmla v18.4s, v16.4s, v8.4s\n"
- "ldr q25, [x20, x27]\n"
- "fmla v22.4s, v21.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v19.4s, v21.4s, v9.4s\n"
- "ldr x24, [%[outptrs], 32]\n"
- "fmla v0.4s, v21.4s, v12.4s\n"
- "ldr q21, [x19, x27]\n"
- "fmla v18.4s, v15.4s, v10.4s\n"
- "ldr q20, [x7, x27]\n"
- "fmla v22.4s, v29.4s, v8.4s\n"
- "ldr x19, [%[inptrs], 120]\n"
- "fmla v23.4s, v29.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 80]\n"
- "fmla v19.4s, v29.4s, v11.4s\n"
- "ldr x25, [%[outptrs], 64]\n"
- "fmla v18.4s, v29.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 96]\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "ldr q26, [x17, x27]\n"
- "fmla v22.4s, v30.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 40]\n"
- "fmla v23.4s, v30.4s, v8.4s\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "fmla v14.4s, v30.4s, v9.4s\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "mov v16.16b, v13.16b\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "fmla v19.4s, v31.4s, v6.4s\n"
- "fmla v0.4s, v31.4s, v9.4s\n"
- "mov v15.16b, v13.16b\n"
- "fmla v23.4s, v24.4s, v10.4s\n"
- "fmla v14.4s, v24.4s, v11.4s\n"
- "ldr q27, [x22, x27]\n"
- "fmla v22.4s, v25.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 248]\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "fmla v1.4s, v25.4s, v9.4s\n"
- "fmla v16.4s, v25.4s, v12.4s\n"
- "ldr q30, [x21, x27]\n"
- "fmla v18.4s, v21.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 208]\n"
- "fmla v22.4s, v21.4s, v7.4s\n"
- "fmla v23.4s, v21.4s, v5.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v17.4s, v21.4s, v8.4s\n"
- "fmla v14.4s, v21.4s, v6.4s\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "str q18, [x23, x28]\n"
- "mov v18.16b, v13.16b\n"
- "fmla v2.4s, v21.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 8]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "ldr q24, [x20, x27]\n"
- "fmla v23.4s, v20.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v17.4s, v20.4s, v10.4s\n"
- "fmla v14.4s, v20.4s, v8.4s\n"
- "fmla v2.4s, v20.4s, v11.4s\n"
- "fmla v3.4s, v20.4s, v9.4s\n"
- "fmla v18.4s, v20.4s, v12.4s\n"
- "ldr q25, [x19, x27]\n"
- "fmla v0.4s, v27.4s, v6.4s\n"
- "ldr q29, [x7, x27]\n"
- "fmla v14.4s, v26.4s, v10.4s\n"
- "ldr x19, [%[inptrs], 128]\n"
- "fmla v3.4s, v26.4s, v11.4s\n"
- "ldr q27, [x17, x27]\n"
- "fmla v19.4s, v30.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 88]\n"
- "fmla v0.4s, v30.4s, v8.4s\n"
- "fmla v1.4s, v30.4s, v6.4s\n"
- "fmla v16.4s, v30.4s, v9.4s\n"
- "ldr q28, [x22, x27]\n"
- "fmla v22.4s, v24.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 256]\n"
- "fmla v19.4s, v24.4s, v7.4s\n"
- "fmla v17.4s, v24.4s, v5.4s\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "fmla v1.4s, v24.4s, v8.4s\n"
- "fmla v2.4s, v24.4s, v6.4s\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "str q22, [x24, x28]\n"
- "mov v21.16b, v13.16b\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "ldr x24, [%[outptrs], 40]\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "fmla v17.4s, v25.4s, v7.4s\n"
- "fmla v21.4s, v24.4s, v12.4s\n"
- "ldr q22, [x21, x27]\n"
- "fmla v14.4s, v25.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 216]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "fmla v2.4s, v25.4s, v8.4s\n"
- "str q23, [x23, x28]\n"
- "mov v24.16b, v13.16b\n"
- "mov v20.16b, v13.16b\n"
- "ldr x23, [%[outptrs], 16]\n"
- "fmla v3.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "fmla v24.4s, v25.4s, v12.4s\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "ldr q30, [x20, x27]\n"
- "fmla v2.4s, v29.4s, v10.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmla v3.4s, v29.4s, v8.4s\n"
- "fmla v0.4s, v28.4s, v5.4s\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "ldr q31, [x19, x27]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "ldr q26, [x7, x27]\n"
- "fmla v19.4s, v22.4s, v4.4s\n"
- "ldr x19, [%[inptrs], 136]\n"
- "fmla v3.4s, v27.4s, v10.4s\n"
- "ldr q23, [x22, x27]\n"
- "fmla v0.4s, v22.4s, v7.4s\n"
- "ldr x22, [%[inptrs], 264]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "fmla v16.4s, v22.4s, v8.4s\n"
- "str q19, [x25, x28]\n"
- "fmla v15.4s, v22.4s, v6.4s\n"
- "fmla v21.4s, v22.4s, v9.4s\n"
- "ldr q27, [x21, x27]\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "ldr q28, [x20, x27]\n"
- "fmla v1.4s, v30.4s, v7.4s\n"
- "ldr x21, [%[inptrs], 224]\n"
- "fmla v2.4s, v30.4s, v5.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v16.4s, v30.4s, v10.4s\n"
- "ldr x25, [%[outptrs], 72]\n"
- "str q17, [x24, x28]\n"
- "fmla v15.4s, v30.4s, v8.4s\n"
- "fmla v18.4s, v30.4s, v6.4s\n"
- "ldr q22, [x19, x27]\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "ldr x24, [%[outptrs], 48]\n"
- "fmla v24.4s, v30.4s, v9.4s\n"
- "fmla v20.4s, v30.4s, v12.4s\n"
- "fmla v14.4s, v31.4s, v4.4s\n"
- "ldr q30, [x22, x27]\n"
- "fmla v2.4s, v31.4s, v7.4s\n"
- "ldr q19, [x21, x27]\n"
- "fmla v3.4s, v31.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 272]\n"
- "fmla v15.4s, v31.4s, v10.4s\n"
- "ldr x21, [%[inptrs], 232]\n"
- "str q14, [x23, x28]\n"
- "fmla v18.4s, v31.4s, v8.4s\n"
- "fmla v24.4s, v31.4s, v11.4s\n"
- "ldr q31, [x20, x27]\n"
- "fmla v3.4s, v26.4s, v7.4s\n"
- "ldr q17, [x22, x27]\n"
- "fmla v0.4s, v23.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 280]\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "ldr q14, [x21, x27]\n"
- "fmla v16.4s, v23.4s, v5.4s\n"
- "ldr x23, [%[outptrs], 24]\n"
- "fmla v21.4s, v23.4s, v6.4s\n"
- "ldr q26, [x22, x27]\n"
- "str q0, [x26, x28]\n"
- "fmla v1.4s, v27.4s, v4.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 104]\n"
- "fmla v16.4s, v27.4s, v7.4s\n"
- "add x27, x27, #16\n"
- "fmla v21.4s, v27.4s, v8.4s\n"
- "fmla v24.4s, v27.4s, v6.4s\n"
- "str q1, [x25, x28]\n"
- "fmla v20.4s, v27.4s, v9.4s\n"
- "fmla v2.4s, v28.4s, v4.4s\n"
- "ldr x25, [%[outptrs], 80]\n"
- "fmla v15.4s, v28.4s, v7.4s\n"
- "fmla v18.4s, v28.4s, v5.4s\n"
- "fmla v21.4s, v28.4s, v10.4s\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "fmla v20.4s, v28.4s, v11.4s\n"
- "fmla v3.4s, v22.4s, v4.4s\n"
- "str q2, [x24, x28]\n"
- "fmla v16.4s, v30.4s, v4.4s\n"
- "fmla v18.4s, v22.4s, v7.4s\n"
- "ldr x24, [%[outptrs], 56]\n"
- "fmla v24.4s, v22.4s, v10.4s\n"
- "fmla v21.4s, v30.4s, v5.4s\n"
- "str q3, [x23, x28]\n"
- "fmla v20.4s, v30.4s, v6.4s\n"
- "str q16, [x26, x28]\n"
- "fmla v15.4s, v19.4s, v4.4s\n"
- "fmla v18.4s, v31.4s, v4.4s\n"
- "ldr x26, [%[outptrs], 112]\n"
- "fmla v21.4s, v19.4s, v7.4s\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v20.4s, v19.4s, v8.4s\n"
- "str q15, [x25, x28]\n"
- "str q18, [x24, x28]\n"
- "ldr x25, [%[outptrs], 88]\n"
- "fmla v24.4s, v31.4s, v7.4s\n"
- "fmla v21.4s, v17.4s, v4.4s\n"
- "fmla v20.4s, v31.4s, v10.4s\n"
- "str q21, [x26, x28]\n"
- "fmla v20.4s, v17.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 120]\n"
- "fmla v24.4s, v14.4s, v4.4s\n"
- "fmla v20.4s, v14.4s, v7.4s\n"
- "str q24, [x25, x28]\n"
- "fmla v20.4s, v26.4s, v4.4s\n"
- "str q20, [x26, x28]\n"
- "add x28, x28, #16\n"
- "4:\n"
- "cbz x15, 7f\n"
- "ldr s13, [%[wbptr]]\n"
- "mov v18.16b, v13.16b\n"
- "ldr s12, [%[wbptr], #4]\n"
- "mov v22.16b, v13.16b\n"
- "ldr s11, [%[wbptr], #8]\n"
- "mov v23.16b, v13.16b\n"
- "ldr s10, [%[wbptr], #12]\n"
- "mov v19.16b, v13.16b\n"
- "ldr s9, [%[wbptr], #16]\n"
- "mov v17.16b, v13.16b\n"
- "ldr s8, [%[wbptr], #20]\n"
- "mov v14.16b, v13.16b\n"
- "ldr s7, [%[wbptr], #24]\n"
- "mov v0.16b, v13.16b\n"
- "ldr s6, [%[wbptr], #28]\n"
- "mov v1.16b, v13.16b\n"
- "ldr s5, [%[wbptr], #32]\n"
- "mov v2.16b, v13.16b\n"
- "ldr s4, [%[wbptr], #36]\n"
- "ldr x17, [%[inptrs], 0]\n"
- "ldr x7, [%[inptrs], 48]\n"
- "ldr x19, [%[inptrs], 96]\n"
- "ldr x20, [%[inptrs], 144]\n"
- "subs x15, x15, #1\n"
- "ldr s29, [x17, x27]\n"
- "fmla v18.4s, v29.4s, v12.4s\n"
- "ldr s27, [x7, x27]\n"
- "ldr s25, [x19, x27]\n"
- "ldr x17, [%[inptrs], 8]\n"
- "ldr s21, [x20, x27]\n"
- "ldr x7, [%[inptrs], 56]\n"
- "ldr s28, [x17, x27]\n"
- "ldr x19, [%[inptrs], 104]\n"
- "ldr s16, [x7, x27]\n"
- "ldr x17, [%[inptrs], 16]\n"
- "ldr s29, [x19, x27]\n"
- "ldr s15, [x17, x27]\n"
- "beq 6f\n"
- "5:\n"
- "mov v3.16b, v13.16b\n"
- "ldr x7, [%[inptrs], 64]\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "ldr x17, [%[inptrs], 24]\n"
- "fmla v22.4s, v27.4s, v12.4s\n"
- "ldr s30, [x7, x27]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr x21, [%[inptrs], 192]\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 152]\n"
- "fmla v18.4s, v28.4s, v11.4s\n"
- "ldr s24, [x17, x27]\n"
- "fmla v22.4s, v25.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 112]\n"
- "fmla v23.4s, v16.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 72]\n"
- "fmla v17.4s, v16.4s, v12.4s\n"
- "ldr x17, [%[inptrs], 32]\n"
- "fmla v18.4s, v25.4s, v6.4s\n"
- "ldr s31, [x21, x27]\n"
- "fmla v22.4s, v16.4s, v11.4s\n"
- "ldr x22, [%[inptrs], 240]\n"
- "fmla v23.4s, v15.4s, v11.4s\n"
- "ldr x21, [%[inptrs], 200]\n"
- "fmla v14.4s, v15.4s, v12.4s\n"
- "ldr x23, [%[outptrs], 0]\n"
- "fmla v18.4s, v16.4s, v8.4s\n"
- "ldr s25, [x20, x27]\n"
- "fmla v22.4s, v21.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v19.4s, v21.4s, v9.4s\n"
- "ldr x24, [%[outptrs], 32]\n"
- "fmla v0.4s, v21.4s, v12.4s\n"
- "ldr s21, [x19, x27]\n"
- "fmla v18.4s, v15.4s, v10.4s\n"
- "ldr s20, [x7, x27]\n"
- "fmla v22.4s, v29.4s, v8.4s\n"
- "ldr x19, [%[inptrs], 120]\n"
- "fmla v23.4s, v29.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 80]\n"
- "fmla v19.4s, v29.4s, v11.4s\n"
- "ldr x25, [%[outptrs], 64]\n"
- "fmla v18.4s, v29.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 96]\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "ldr s26, [x17, x27]\n"
- "fmla v22.4s, v30.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 40]\n"
- "fmla v23.4s, v30.4s, v8.4s\n"
- "subs x15, x15, #1\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "fmla v14.4s, v30.4s, v9.4s\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "ldr s27, [x22, x27]\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "ldr x22, [%[inptrs], 248]\n"
- "fmla v23.4s, v24.4s, v10.4s\n"
- "fmla v19.4s, v31.4s, v6.4s\n"
- "fmla v14.4s, v24.4s, v11.4s\n"
- "ldr s30, [x21, x27]\n"
- "fmla v0.4s, v31.4s, v9.4s\n"
- "ldr s24, [x20, x27]\n"
- "fmla v22.4s, v25.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 208]\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "fmla v1.4s, v25.4s, v9.4s\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v21.4s, v4.4s\n"
- "fmla v22.4s, v21.4s, v7.4s\n"
- "fmla v23.4s, v21.4s, v5.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v14.4s, v21.4s, v6.4s\n"
- "fmla v17.4s, v21.4s, v8.4s\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "str s18, [x23, x28]\n"
- "mov v16.16b, v13.16b\n"
- "fmla v2.4s, v21.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 8]\n"
- "fmla v23.4s, v20.4s, v7.4s\n"
- "fmla v14.4s, v20.4s, v8.4s\n"
- "fmla v16.4s, v25.4s, v12.4s\n"
- "ldr s25, [x19, x27]\n"
- "fmla v17.4s, v20.4s, v10.4s\n"
- "ldr x19, [%[inptrs], 128]\n"
- "fmla v2.4s, v20.4s, v11.4s\n"
- "fmla v3.4s, v20.4s, v9.4s\n"
- "fmla v14.4s, v26.4s, v10.4s\n"
- "fmla v0.4s, v27.4s, v6.4s\n"
- "mov v15.16b, v13.16b\n"
- "fmla v19.4s, v30.4s, v5.4s\n"
- "fmla v1.4s, v30.4s, v6.4s\n"
- "fmla v16.4s, v30.4s, v9.4s\n"
- "fmla v3.4s, v26.4s, v11.4s\n"
- "ldr s29, [x7, x27]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "ldr s27, [x17, x27]\n"
- "fmla v0.4s, v30.4s, v8.4s\n"
- "ldr s28, [x22, x27]\n"
- "fmla v22.4s, v24.4s, v4.4s\n"
- "ldr x7, [%[inptrs], 88]\n"
- "fmla v19.4s, v24.4s, v7.4s\n"
- "ldr x22, [%[inptrs], 256]\n"
- "fmla v17.4s, v24.4s, v5.4s\n"
- "ldr x17, [%[inptrs], 0]\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "fmla v1.4s, v24.4s, v8.4s\n"
- "str s22, [x24, x28]\n"
- "mov v18.16b, v13.16b\n"
- "fmla v2.4s, v24.4s, v6.4s\n"
- "ldr x24, [%[outptrs], 40]\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "fmla v18.4s, v20.4s, v12.4s\n"
- "ldr s22, [x21, x27]\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 216]\n"
- "fmla v17.4s, v25.4s, v7.4s\n"
- "fmla v14.4s, v25.4s, v5.4s\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "fmla v2.4s, v25.4s, v8.4s\n"
- "fmla v3.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "str s23, [x23, x28]\n"
- "mov v21.16b, v13.16b\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 16]\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "fmla v2.4s, v29.4s, v10.4s\n"
- "fmla v21.4s, v24.4s, v12.4s\n"
- "ldr s30, [x20, x27]\n"
- "fmla v3.4s, v29.4s, v8.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "ldr s31, [x19, x27]\n"
- "fmla v0.4s, v28.4s, v5.4s\n"
- "ldr x19, [%[inptrs], 136]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "ldr s26, [x7, x27]\n"
- "fmla v3.4s, v27.4s, v10.4s\n"
- "ldr s23, [x22, x27]\n"
- "fmla v19.4s, v22.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 264]\n"
- "fmla v0.4s, v22.4s, v7.4s\n"
- "ldr x7, [%[inptrs], 48]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "fmla v16.4s, v22.4s, v8.4s\n"
- "fmla v15.4s, v22.4s, v6.4s\n"
- "fmla v21.4s, v22.4s, v9.4s\n"
- "str s19, [x25, x28]\n"
- "mov v24.16b, v13.16b\n"
- "mov v20.16b, v13.16b\n"
- "ldr s27, [x21, x27]\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 224]\n"
- "fmla v24.4s, v25.4s, v12.4s\n"
- "ldr s28, [x20, x27]\n"
- "fmla v1.4s, v30.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v2.4s, v30.4s, v5.4s\n"
- "ldr x25, [%[outptrs], 72]\n"
- "str s17, [x24, x28]\n"
- "fmla v16.4s, v30.4s, v10.4s\n"
- "fmla v15.4s, v30.4s, v8.4s\n"
- "ldr s22, [x19, x27]\n"
- "fmla v18.4s, v30.4s, v6.4s\n"
- "ldr x24, [%[outptrs], 48]\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "ldr x19, [%[inptrs], 96]\n"
- "fmla v24.4s, v30.4s, v9.4s\n"
- "fmla v20.4s, v30.4s, v12.4s\n"
- "fmla v14.4s, v31.4s, v4.4s\n"
- "ldr s30, [x22, x27]\n"
- "fmla v2.4s, v31.4s, v7.4s\n"
- "ldr s19, [x21, x27]\n"
- "fmla v3.4s, v31.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 272]\n"
- "fmla v15.4s, v31.4s, v10.4s\n"
- "ldr x21, [%[inptrs], 232]\n"
- "str s14, [x23, x28]\n"
- "fmla v18.4s, v31.4s, v8.4s\n"
- "fmla v24.4s, v31.4s, v11.4s\n"
- "ldr s31, [x20, x27]\n"
- "fmla v3.4s, v26.4s, v7.4s\n"
- "ldr s17, [x22, x27]\n"
- "fmla v0.4s, v23.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 280]\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "ldr s14, [x21, x27]\n"
- "fmla v16.4s, v23.4s, v5.4s\n"
- "ldr x23, [%[outptrs], 24]\n"
- "fmla v21.4s, v23.4s, v6.4s\n"
- "ldr s26, [x22, x27]\n"
- "str s0, [x26, x28]\n"
- "fmla v1.4s, v27.4s, v4.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "ldr s13, [%[wbptr]]\n"
- "fmla v16.4s, v27.4s, v7.4s\n"
- "ldr x26, [%[outptrs], 104]\n"
- "fmla v21.4s, v27.4s, v8.4s\n"
- "add x27, x27, #4\n"
- "str s1, [x25, x28]\n"
- "fmla v24.4s, v27.4s, v6.4s\n"
- "fmla v20.4s, v27.4s, v9.4s\n"
- "ldr s12, [%[wbptr], #4]\n"
- "fmla v2.4s, v28.4s, v4.4s\n"
- "ldr s29, [x17, x27]\n"
- "fmla v15.4s, v28.4s, v7.4s\n"
- "ldr s27, [x7, x27]\n"
- "fmla v18.4s, v28.4s, v5.4s\n"
- "ldr x25, [%[outptrs], 80]\n"
- "fmla v21.4s, v28.4s, v10.4s\n"
- "ldr x17, [%[inptrs], 8]\n"
- "str s2, [x24, x28]\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "fmla v20.4s, v28.4s, v11.4s\n"
- "ldr s9, [%[wbptr], #16]\n"
- "fmla v3.4s, v22.4s, v4.4s\n"
- "ldr s28, [x17, x27]\n"
- "fmla v18.4s, v22.4s, v7.4s\n"
- "ldr s25, [x19, x27]\n"
- "fmla v24.4s, v22.4s, v10.4s\n"
- "ldr x24, [%[outptrs], 56]\n"
- "fmla v16.4s, v30.4s, v4.4s\n"
- "ldr s11, [%[wbptr], #8]\n"
- "str s3, [x23, x28]\n"
- "fmla v21.4s, v30.4s, v5.4s\n"
- "fmla v20.4s, v30.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 56]\n"
- "fmla v15.4s, v19.4s, v4.4s\n"
- "ldr x17, [%[inptrs], 16]\n"
- "str s16, [x26, x28]\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v21.4s, v19.4s, v7.4s\n"
- "ldr s16, [x7, x27]\n"
- "fmla v20.4s, v19.4s, v8.4s\n"
- "ldr s6, [%[wbptr], #28]\n"
- "str s15, [x25, x28]\n"
- "fmla v18.4s, v31.4s, v4.4s\n"
- "fmla v24.4s, v31.4s, v7.4s\n"
- "ldr s15, [x17, x27]\n"
- "fmla v21.4s, v17.4s, v4.4s\n"
- "ldr x25, [%[outptrs], 88]\n"
- "fmla v20.4s, v31.4s, v10.4s\n"
- "ldr s8, [%[wbptr], #20]\n"
- "str s18, [x24, x28]\n"
- "mov v18.16b, v13.16b\n"
- "fmla v24.4s, v14.4s, v4.4s\n"
- "ldr x26, [%[outptrs], 112]\n"
- "mov v22.16b, v13.16b\n"
- "ldr x20, [%[inptrs], 144]\n"
- "str s21, [x26, x28]\n"
- "fmla v20.4s, v17.4s, v5.4s\n"
- "mov v23.16b, v13.16b\n"
- "ldr s10, [%[wbptr], #12]\n"
- "str s24, [x25, x28]\n"
- "mov v19.16b, v13.16b\n"
- "mov v17.16b, v13.16b\n"
- "ldr s21, [x20, x27]\n"
- "fmla v20.4s, v14.4s, v7.4s\n"
- "ldr s5, [%[wbptr], #32]\n"
- "mov v14.16b, v13.16b\n"
- "ldr x26, [%[outptrs], 120]\n"
- "mov v0.16b, v13.16b\n"
- "ldr x19, [%[inptrs], 104]\n"
- "mov v1.16b, v13.16b\n"
- "mov v2.16b, v13.16b\n"
- "fmla v20.4s, v26.4s, v4.4s\n"
- "ldr s7, [%[wbptr], #24]\n"
- "fmla v18.4s, v29.4s, v12.4s\n"
- "ldr s29, [x19, x27]\n"
- "str s20, [x26, x28]\n"
- "ldr s4, [%[wbptr], #36]\n"
- "add x28, x28, #4\n"
- "bne 5b\n"
- "6:\n"
- "mov v3.16b, v13.16b\n"
- "ldr x7, [%[inptrs], 64]\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "ldr x17, [%[inptrs], 24]\n"
- "fmla v22.4s, v27.4s, v12.4s\n"
- "ldr s30, [x7, x27]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr x21, [%[inptrs], 192]\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 152]\n"
- "fmla v18.4s, v28.4s, v11.4s\n"
- "ldr s24, [x17, x27]\n"
- "fmla v22.4s, v25.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 112]\n"
- "fmla v23.4s, v16.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 72]\n"
- "fmla v17.4s, v16.4s, v12.4s\n"
- "ldr x17, [%[inptrs], 32]\n"
- "fmla v18.4s, v25.4s, v6.4s\n"
- "ldr s31, [x21, x27]\n"
- "fmla v22.4s, v16.4s, v11.4s\n"
- "ldr x22, [%[inptrs], 240]\n"
- "fmla v23.4s, v15.4s, v11.4s\n"
- "ldr x21, [%[inptrs], 200]\n"
- "fmla v14.4s, v15.4s, v12.4s\n"
- "ldr x23, [%[outptrs], 0]\n"
- "fmla v18.4s, v16.4s, v8.4s\n"
- "ldr s25, [x20, x27]\n"
- "fmla v22.4s, v21.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v19.4s, v21.4s, v9.4s\n"
- "ldr x24, [%[outptrs], 32]\n"
- "fmla v0.4s, v21.4s, v12.4s\n"
- "ldr s21, [x19, x27]\n"
- "fmla v18.4s, v15.4s, v10.4s\n"
- "ldr s20, [x7, x27]\n"
- "fmla v22.4s, v29.4s, v8.4s\n"
- "ldr x19, [%[inptrs], 120]\n"
- "fmla v23.4s, v29.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 80]\n"
- "fmla v19.4s, v29.4s, v11.4s\n"
- "ldr x25, [%[outptrs], 64]\n"
- "fmla v18.4s, v29.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 96]\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "ldr s26, [x17, x27]\n"
- "fmla v22.4s, v30.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 40]\n"
- "fmla v23.4s, v30.4s, v8.4s\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "fmla v14.4s, v30.4s, v9.4s\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "mov v16.16b, v13.16b\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "fmla v19.4s, v31.4s, v6.4s\n"
- "fmla v0.4s, v31.4s, v9.4s\n"
- "mov v15.16b, v13.16b\n"
- "fmla v23.4s, v24.4s, v10.4s\n"
- "fmla v14.4s, v24.4s, v11.4s\n"
- "ldr s27, [x22, x27]\n"
- "fmla v22.4s, v25.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 248]\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "fmla v1.4s, v25.4s, v9.4s\n"
- "fmla v16.4s, v25.4s, v12.4s\n"
- "ldr s30, [x21, x27]\n"
- "fmla v18.4s, v21.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 208]\n"
- "fmla v22.4s, v21.4s, v7.4s\n"
- "fmla v23.4s, v21.4s, v5.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v17.4s, v21.4s, v8.4s\n"
- "fmla v14.4s, v21.4s, v6.4s\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "str s18, [x23, x28]\n"
- "mov v18.16b, v13.16b\n"
- "fmla v2.4s, v21.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 8]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "ldr s24, [x20, x27]\n"
- "fmla v23.4s, v20.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v17.4s, v20.4s, v10.4s\n"
- "fmla v14.4s, v20.4s, v8.4s\n"
- "fmla v2.4s, v20.4s, v11.4s\n"
- "fmla v3.4s, v20.4s, v9.4s\n"
- "fmla v18.4s, v20.4s, v12.4s\n"
- "ldr s25, [x19, x27]\n"
- "fmla v0.4s, v27.4s, v6.4s\n"
- "ldr s29, [x7, x27]\n"
- "fmla v14.4s, v26.4s, v10.4s\n"
- "ldr x19, [%[inptrs], 128]\n"
- "fmla v3.4s, v26.4s, v11.4s\n"
- "ldr s27, [x17, x27]\n"
- "fmla v19.4s, v30.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 88]\n"
- "fmla v0.4s, v30.4s, v8.4s\n"
- "fmla v1.4s, v30.4s, v6.4s\n"
- "fmla v16.4s, v30.4s, v9.4s\n"
- "ldr s28, [x22, x27]\n"
- "fmla v22.4s, v24.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 256]\n"
- "fmla v19.4s, v24.4s, v7.4s\n"
- "fmla v17.4s, v24.4s, v5.4s\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "fmla v1.4s, v24.4s, v8.4s\n"
- "fmla v2.4s, v24.4s, v6.4s\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "str s22, [x24, x28]\n"
- "mov v21.16b, v13.16b\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "ldr x24, [%[outptrs], 40]\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "fmla v17.4s, v25.4s, v7.4s\n"
- "fmla v21.4s, v24.4s, v12.4s\n"
- "ldr s22, [x21, x27]\n"
- "fmla v14.4s, v25.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 216]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "fmla v2.4s, v25.4s, v8.4s\n"
- "str s23, [x23, x28]\n"
- "mov v24.16b, v13.16b\n"
- "mov v20.16b, v13.16b\n"
- "ldr x23, [%[outptrs], 16]\n"
- "fmla v3.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "fmla v24.4s, v25.4s, v12.4s\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "ldr s30, [x20, x27]\n"
- "fmla v2.4s, v29.4s, v10.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmla v3.4s, v29.4s, v8.4s\n"
- "fmla v0.4s, v28.4s, v5.4s\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "ldr s31, [x19, x27]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "ldr s26, [x7, x27]\n"
- "fmla v19.4s, v22.4s, v4.4s\n"
- "ldr x19, [%[inptrs], 136]\n"
- "fmla v3.4s, v27.4s, v10.4s\n"
- "ldr s23, [x22, x27]\n"
- "fmla v0.4s, v22.4s, v7.4s\n"
- "ldr x22, [%[inptrs], 264]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "fmla v16.4s, v22.4s, v8.4s\n"
- "str s19, [x25, x28]\n"
- "fmla v15.4s, v22.4s, v6.4s\n"
- "fmla v21.4s, v22.4s, v9.4s\n"
- "ldr s27, [x21, x27]\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "ldr s28, [x20, x27]\n"
- "fmla v1.4s, v30.4s, v7.4s\n"
- "ldr x21, [%[inptrs], 224]\n"
- "fmla v2.4s, v30.4s, v5.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v16.4s, v30.4s, v10.4s\n"
- "ldr x25, [%[outptrs], 72]\n"
- "str s17, [x24, x28]\n"
- "fmla v15.4s, v30.4s, v8.4s\n"
- "fmla v18.4s, v30.4s, v6.4s\n"
- "ldr s22, [x19, x27]\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "ldr x24, [%[outptrs], 48]\n"
- "fmla v24.4s, v30.4s, v9.4s\n"
- "fmla v20.4s, v30.4s, v12.4s\n"
- "fmla v14.4s, v31.4s, v4.4s\n"
- "ldr s30, [x22, x27]\n"
- "fmla v2.4s, v31.4s, v7.4s\n"
- "ldr s19, [x21, x27]\n"
- "fmla v3.4s, v31.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 272]\n"
- "fmla v15.4s, v31.4s, v10.4s\n"
- "ldr x21, [%[inptrs], 232]\n"
- "str s14, [x23, x28]\n"
- "fmla v18.4s, v31.4s, v8.4s\n"
- "fmla v24.4s, v31.4s, v11.4s\n"
- "ldr s31, [x20, x27]\n"
- "fmla v3.4s, v26.4s, v7.4s\n"
- "ldr s17, [x22, x27]\n"
- "fmla v0.4s, v23.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 280]\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "ldr s14, [x21, x27]\n"
- "fmla v16.4s, v23.4s, v5.4s\n"
- "ldr x23, [%[outptrs], 24]\n"
- "fmla v21.4s, v23.4s, v6.4s\n"
- "ldr s26, [x22, x27]\n"
- "str s0, [x26, x28]\n"
- "fmla v1.4s, v27.4s, v4.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 104]\n"
- "fmla v16.4s, v27.4s, v7.4s\n"
- "add x27, x27, #4\n"
- "fmla v21.4s, v27.4s, v8.4s\n"
- "fmla v24.4s, v27.4s, v6.4s\n"
- "str s1, [x25, x28]\n"
- "fmla v20.4s, v27.4s, v9.4s\n"
- "fmla v2.4s, v28.4s, v4.4s\n"
- "ldr x25, [%[outptrs], 80]\n"
- "fmla v15.4s, v28.4s, v7.4s\n"
- "fmla v18.4s, v28.4s, v5.4s\n"
- "fmla v21.4s, v28.4s, v10.4s\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "fmla v20.4s, v28.4s, v11.4s\n"
- "fmla v3.4s, v22.4s, v4.4s\n"
- "str s2, [x24, x28]\n"
- "fmla v16.4s, v30.4s, v4.4s\n"
- "fmla v18.4s, v22.4s, v7.4s\n"
- "ldr x24, [%[outptrs], 56]\n"
- "fmla v24.4s, v22.4s, v10.4s\n"
- "fmla v21.4s, v30.4s, v5.4s\n"
- "str s3, [x23, x28]\n"
- "fmla v20.4s, v30.4s, v6.4s\n"
- "str s16, [x26, x28]\n"
- "fmla v15.4s, v19.4s, v4.4s\n"
- "fmla v18.4s, v31.4s, v4.4s\n"
- "ldr x26, [%[outptrs], 112]\n"
- "fmla v21.4s, v19.4s, v7.4s\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v20.4s, v19.4s, v8.4s\n"
- "str s15, [x25, x28]\n"
- "str s18, [x24, x28]\n"
- "ldr x25, [%[outptrs], 88]\n"
- "fmla v24.4s, v31.4s, v7.4s\n"
- "fmla v21.4s, v17.4s, v4.4s\n"
- "fmla v20.4s, v31.4s, v10.4s\n"
- "str s21, [x26, x28]\n"
- "fmla v20.4s, v17.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 120]\n"
- "fmla v24.4s, v14.4s, v4.4s\n"
- "fmla v20.4s, v14.4s, v7.4s\n"
- "str s24, [x25, x28]\n"
- "fmla v20.4s, v26.4s, v4.4s\n"
- "str s20, [x26, x28]\n"
- "add x28, x28, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr)
- : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x9, %[inptr0], %[input_row_stride]\n"
- "add x28, %[input_col_stride1], %[input_col_stride1]\n"
- "add x16, %[outptr0], %[output_row_stride]\n"
- "add x24, x9, %[input_row_stride]\n"
- "add x25, x28, #64\n"
- "add x23, x28, %[input_col_stride1]\n"
- "add x26, x24, %[input_row_stride]\n"
- "add x11, x23, #64\n"
- "add x12, x23, %[input_col_stride1]\n"
- "add x10, x26, %[input_row_stride]\n"
- "add x13, x12, #64\n"
- "add x14, x12, %[input_col_stride1]\n"
- "add x27, x10, %[input_row_stride]\n"
- "add x15, x14, #64\n"
- "add x17, x16, %[output_row_stride]\n"
- "add x7, x17, %[output_row_stride]\n"
- "add x19, %[output_col_stride1], %[output_col_stride1]\n"
- "and x21, %[n_channels], #3\n"
- "add x20, x19, %[output_col_stride1]\n"
- "lsr x22, %[n_channels], #2\n"
- "cbz x22, 4f\n"
- "1:\n"
- "ldr q21, [%[wbptr]]\n"
- "subs x22, x22, #1\n"
- "mov v7.16b, v21.16b\n"
- "ldr q20, [%[wbptr], #16]\n"
- "mov v3.16b, v21.16b\n"
- "ldr q14, [%[wbptr], #32]\n"
- "mov v6.16b, v21.16b\n"
- "ldr q13, [%[wbptr], #48]\n"
- "mov v15.16b, v21.16b\n"
- "ldr q17, [%[wbptr], #64]\n"
- "mov v2.16b, v21.16b\n"
- "ldr q12, [%[wbptr], #80]\n"
- "mov v5.16b, v21.16b\n"
- "ldr q11, [%[wbptr], #96]\n"
- "mov v0.16b, v21.16b\n"
- "ldr q10, [%[wbptr], #112]\n"
- "mov v16.16b, v21.16b\n"
- "ldr q9, [%[wbptr], #128]\n"
- "mov v1.16b, v21.16b\n"
- "ldr q8, [%[wbptr], #144]\n"
- "mov v4.16b, v21.16b\n"
- "ldr q22, [%[inptr0]]\n"
- "fmla v7.4s, v22.4s, v20.4s\n"
- "ldr q19, [x9]\n"
- "fmla v3.4s, v19.4s, v20.4s\n"
- "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v6.4s, v23.4s, v20.4s\n"
- "ldr q18, [x24]\n"
- "fmla v7.4s, v19.4s, v17.4s\n"
- "ldr q27, [x9, %[input_col_stride1]]\n"
- "fmla v3.4s, v18.4s, v17.4s\n"
- "ldr q28, [%[inptr0], x28]\n"
- "fmla v15.4s, v18.4s, v20.4s\n"
- "ldr q25, [x26]\n"
- "fmla v7.4s, v23.4s, v14.4s\n"
- "ldr q22, [x24, %[input_col_stride1]]\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x9, #64]\n"
- "prfm pldl1keep, [%[inptr0], x8]\n"
- "fmla v7.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "prfm pldl1keep, [x9, x8]\n"
- "prfm pldl1keep, [%[inptr0], x25]\n"
- "prfm pldl1keep, [x26, #64]\n"
- "prfm pldl1keep, [x24, x8]\n"
- "fmla v7.4s, v27.4s, v12.4s\n"
- "beq 3f\n"
- "2:\n"
- "mov v18.16b, v21.16b\n"
- "ldr q23, [x9, x28]\n"
- "mov v19.16b, v21.16b\n"
- "prfm pldl1keep, [x9, x25]\n"
- "fmla v6.4s, v27.4s, v17.4s\n"
- "prfm pldl1keep, [%[inptr0], x11]\n"
- "fmla v2.4s, v27.4s, v20.4s\n"
- "ldr q24, [%[inptr0], x23]\n"
- "fmla v7.4s, v28.4s, v13.4s\n"
- "prfm pldl1keep, [x10, #64]\n"
- "fmla v6.4s, v28.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x8]\n"
- "fmla v5.4s, v28.4s, v20.4s\n"
- "ldr q26, [x10]\n"
- "fmla v3.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x25]\n"
- "fmla v15.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x9, x11]\n"
- "fmla v0.4s, v25.4s, v20.4s\n"
- "ldr q25, [x26, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x13]\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "prfm pldl1keep, [x27, #64]\n"
- "fmla v6.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x10, x8]\n"
- "fmla v15.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x25]\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "prfm pldl1keep, [x24, x11]\n"
- "fmla v16.4s, v22.4s, v20.4s\n"
- "ldr q22, [x24, x28]\n"
- "fmla v7.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x13]\n"
- "fmla v3.4s, v23.4s, v13.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v6.4s, v23.4s, v12.4s\n"
- "prfm pldl1keep, [x27, x8]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "prfm pldl1keep, [x10, x25]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "prfm pldl1keep, [x26, x11]\n"
- "fmla v1.4s, v23.4s, v20.4s\n"
- "ldr q23, [x9, x23]\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x24, x13]\n"
- "fmla v5.4s, v24.4s, v14.4s\n"
- "prfm pldl1keep, [x9, x15]\n"
- "fmla v4.4s, v24.4s, v20.4s\n"
- "ldr q24, [%[inptr0], x12]\n"
- "fmla v15.4s, v26.4s, v10.4s\n"
- "prfm pldl1keep, [x27, x25]\n"
- "fmla v0.4s, v26.4s, v17.4s\n"
- "ldr q29, [x27]\n"
- "fmla v3.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x11]\n"
- "fmla v15.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x26, x13]\n"
- "fmla v2.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "prfm pldl1keep, [x27, x11]\n"
- "fmla v16.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x10, x13]\n"
- "fmla v18.4s, v25.4s, v20.4s\n"
- "ldr q26, [x10, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x26, x15]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x27, x13]\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x15]\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x27, x15]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v5.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v22.4s, v14.4s\n"
- "subs x22, x22, #1\n"
- "fmla v1.4s, v22.4s, v17.4s\n"
- "fmla v19.4s, v22.4s, v20.4s\n"
- "mov v22.16b, v21.16b\n"
- "fmla v6.4s, v23.4s, v11.4s\n"
- "fmla v2.4s, v23.4s, v13.4s\n"
- "fmla v5.4s, v23.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v14.4s\n"
- "fmla v4.4s, v23.4s, v17.4s\n"
- "fmla v22.4s, v23.4s, v20.4s\n"
- "ldr q27, [x26, x28]\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "fmla v0.4s, v29.4s, v10.4s\n"
- "mov v23.16b, v21.16b\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "mov v25.16b, v21.16b\n"
- "mov v24.16b, v21.16b\n"
- "fmla v15.4s, v26.4s, v9.4s\n"
- "fmla v0.4s, v26.4s, v12.4s\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "fmla v18.4s, v26.4s, v17.4s\n"
- "fmla v3.4s, v27.4s, v8.4s\n"
- "ldr q29, [x24, x23]\n"
- "fmla v15.4s, v27.4s, v11.4s\n"
- "fmla v2.4s, v27.4s, v9.4s\n"
- "fmla v0.4s, v27.4s, v13.4s\n"
- "fmla v16.4s, v27.4s, v12.4s\n"
- "fmla v1.4s, v27.4s, v10.4s\n"
- "fmla v18.4s, v27.4s, v14.4s\n"
- "fmla v19.4s, v27.4s, v17.4s\n"
- "fmla v23.4s, v27.4s, v20.4s\n"
- "fmla v6.4s, v29.4s, v8.4s\n"
- "ldr q28, [x9, x12]\n"
- "fmla v2.4s, v29.4s, v11.4s\n"
- "fmla v5.4s, v29.4s, v9.4s\n"
- "fmla v16.4s, v29.4s, v13.4s\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "fmla v4.4s, v29.4s, v10.4s\n"
- "fmla v19.4s, v29.4s, v14.4s\n"
- "fmla v22.4s, v29.4s, v17.4s\n"
- "fmla v25.4s, v29.4s, v20.4s\n"
- "fmla v5.4s, v28.4s, v11.4s\n"
- "ldr q21, [%[inptr0], x14]\n"
- "fmla v1.4s, v28.4s, v13.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v22.4s, v28.4s, v14.4s\n"
- "ldr q26, [x27, %[input_col_stride1]]\n"
- "fmla v0.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x8]\n"
- "fmla v4.4s, v21.4s, v13.4s\n"
- "ldr q21, [x10, x28]\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "ldr q29, [x26, x23]\n"
- "fmla v15.4s, v21.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x25]\n"
- "fmla v0.4s, v21.4s, v11.4s\n"
- "fmla v16.4s, v21.4s, v9.4s\n"
- "fmla v18.4s, v21.4s, v12.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v23.4s, v21.4s, v17.4s\n"
- "ldr q21, [x24, x12]\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "fmla v16.4s, v29.4s, v11.4s\n"
- "fmla v1.4s, v29.4s, v9.4s\n"
- "fmla v18.4s, v29.4s, v13.4s\n"
- "fmla v19.4s, v29.4s, v12.4s\n"
- "fmla v22.4s, v29.4s, v10.4s\n"
- "fmla v23.4s, v29.4s, v14.4s\n"
- "fmla v25.4s, v29.4s, v17.4s\n"
- "fmla v24.4s, v29.4s, v20.4s\n"
- "ldr q28, [x9, x14]\n"
- "fmla v5.4s, v21.4s, v8.4s\n"
- "ldr q27, [x27, x28]\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "add x9, x9, #16\n"
- "fmla v4.4s, v21.4s, v9.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "fmla v19.4s, v21.4s, v13.4s\n"
- "prfm pldl1keep, [x9, x8]\n"
- "fmla v22.4s, v21.4s, v12.4s\n"
- "fmla v25.4s, v21.4s, v14.4s\n"
- "fmla v4.4s, v28.4s, v11.4s\n"
- "ldr q20, [x10, x23]\n"
- "fmla v0.4s, v27.4s, v8.4s\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "fmla v22.4s, v28.4s, v13.4s\n"
- "ldr q26, [x26, x12]\n"
- "fmla v23.4s, v27.4s, v10.4s\n"
- "ldr q21, [x24, x14]\n"
- "fmla v16.4s, v20.4s, v8.4s\n"
- "add x24, x24, #16\n"
- "fmla v18.4s, v20.4s, v11.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v19.4s, v20.4s, v9.4s\n"
- "prfm pldl1keep, [x24, x8]\n"
- "fmla v23.4s, v20.4s, v12.4s\n"
- "fmla v25.4s, v20.4s, v10.4s\n"
- "fmla v24.4s, v20.4s, v17.4s\n"
- "ldr q28, [x27, x23]\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "ldr q20, [x10, x12]\n"
- "fmla v19.4s, v26.4s, v11.4s\n"
- "fmla v22.4s, v26.4s, v9.4s\n"
- "fmla v23.4s, v26.4s, v13.4s\n"
- "fmla v25.4s, v26.4s, v12.4s\n"
- "fmla v24.4s, v26.4s, v14.4s\n"
- "ldr q17, [x26, x14]\n"
- "fmla v4.4s, v21.4s, v8.4s\n"
- "ldr q26, [x27, x12]\n"
- "fmla v22.4s, v21.4s, v11.4s\n"
- "add x26, x26, #16\n"
- "fmla v25.4s, v21.4s, v13.4s\n"
- "ldr q27, [x10, x14]\n"
- "fmla v18.4s, v28.4s, v8.4s\n"
- "prfm pldl1keep, [x26, #64]\n"
- "fmla v23.4s, v28.4s, v9.4s\n"
- "add x10, x10, #16\n"
- "fmla v24.4s, v28.4s, v10.4s\n"
- "ldr q28, [x27, x14]\n"
- "fmla v19.4s, v20.4s, v8.4s\n"
- "ldr q21, [%[wbptr]]\n"
- "fmla v23.4s, v20.4s, v11.4s\n"
- "add x27, x27, #16\n"
- "fmla v25.4s, v20.4s, v9.4s\n"
- "fmla v24.4s, v20.4s, v12.4s\n"
- "fmla v22.4s, v17.4s, v8.4s\n"
- "ldr q20, [%[wbptr], #16]\n"
- "fmla v23.4s, v26.4s, v8.4s\n"
- "ldr q14, [%[wbptr], #32]\n"
- "fmla v24.4s, v17.4s, v13.4s\n"
- "movi v29.16b, #0\n"
- "fmla v25.4s, v17.4s, v11.4s\n"
- "ldr q17, [%[wbptr], #64]\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "ldr q13, [%[wbptr], #48]\n"
- "str q7, [%[outptr0]]\n"
- "fmla v25.4s, v27.4s, v8.4s\n"
- "str q6, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "ldr q12, [%[wbptr], #80]\n"
- "str q5, [%[outptr0], x19]\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "ldr q10, [%[wbptr], #112]\n"
- "str q4, [%[outptr0], x20]\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "str q3, [x16]\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "ldr q11, [%[wbptr], #96]\n"
- "str q2, [x16, %[output_col_stride1]]\n"
- "fmax v22.4s, v22.4s, v29.4s\n"
- "str q1, [x16, x19]\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "str q22, [x16, x20]\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "str q15, [x17]\n"
- "fmax v19.4s, v19.4s, v29.4s\n"
- "str q16, [x17, %[output_col_stride1]]\n"
- "fmax v25.4s, v25.4s, v29.4s\n"
- "str q19, [x17, x19]\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "str q25, [x17, x20]\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "str q0, [x7]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "str q18, [x7, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "str q23, [x7, x19]\n"
- "mov v7.16b, v21.16b\n"
- "str q24, [x7, x20]\n"
- "mov v3.16b, v21.16b\n"
- "mov v6.16b, v21.16b\n"
- "ldr q9, [%[wbptr], #128]\n"
- "mov v15.16b, v21.16b\n"
- "ldr q8, [%[wbptr], #144]\n"
- "mov v2.16b, v21.16b\n"
- "ldr q22, [%[inptr0]]\n"
- "mov v5.16b, v21.16b\n"
- "ldr q19, [x9]\n"
- "mov v0.16b, v21.16b\n"
- "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
- "mov v16.16b, v21.16b\n"
- "ldr q18, [x24]\n"
- "mov v1.16b, v21.16b\n"
- "ldr q27, [x9, %[input_col_stride1]]\n"
- "mov v4.16b, v21.16b\n"
- "ldr q28, [%[inptr0], x28]\n"
- "fmla v7.4s, v22.4s, v20.4s\n"
- "ldr q25, [x26]\n"
- "fmla v3.4s, v19.4s, v20.4s\n"
- "ldr q22, [x24, %[input_col_stride1]]\n"
- "fmla v6.4s, v23.4s, v20.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v7.4s, v19.4s, v17.4s\n"
- "add x16, x16, #16\n"
- "fmla v3.4s, v18.4s, v17.4s\n"
- "add x17, x17, #16\n"
- "fmla v15.4s, v18.4s, v20.4s\n"
- "add x7, x7, #16\n"
- "fmla v7.4s, v23.4s, v14.4s\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "fmla v7.4s, v18.4s, v10.4s\n"
- "fmla v7.4s, v27.4s, v12.4s\n"
- "bne 2b\n"
- "3:\n"
- "mov v18.16b, v21.16b\n"
- "ldr q23, [x9, x28]\n"
- "mov v19.16b, v21.16b\n"
- "prfm pldl1keep, [x9, x25]\n"
- "fmla v6.4s, v27.4s, v17.4s\n"
- "prfm pldl1keep, [%[inptr0], x11]\n"
- "fmla v2.4s, v27.4s, v20.4s\n"
- "ldr q24, [%[inptr0], x23]\n"
- "fmla v7.4s, v28.4s, v13.4s\n"
- "prfm pldl1keep, [x10, #64]\n"
- "fmla v6.4s, v28.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x8]\n"
- "fmla v5.4s, v28.4s, v20.4s\n"
- "ldr q26, [x10]\n"
- "fmla v3.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x25]\n"
- "fmla v15.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x9, x11]\n"
- "fmla v0.4s, v25.4s, v20.4s\n"
- "ldr q25, [x26, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x13]\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "prfm pldl1keep, [x27, #64]\n"
- "fmla v6.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x10, x8]\n"
- "fmla v15.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x25]\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "prfm pldl1keep, [x24, x11]\n"
- "fmla v16.4s, v22.4s, v20.4s\n"
- "ldr q22, [x24, x28]\n"
- "fmla v7.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x13]\n"
- "fmla v3.4s, v23.4s, v13.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v6.4s, v23.4s, v12.4s\n"
- "prfm pldl1keep, [x27, x8]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "prfm pldl1keep, [x10, x25]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "prfm pldl1keep, [x26, x11]\n"
- "fmla v1.4s, v23.4s, v20.4s\n"
- "ldr q23, [x9, x23]\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x24, x13]\n"
- "fmla v5.4s, v24.4s, v14.4s\n"
- "prfm pldl1keep, [x9, x15]\n"
- "fmla v4.4s, v24.4s, v20.4s\n"
- "ldr q24, [%[inptr0], x12]\n"
- "fmla v15.4s, v26.4s, v10.4s\n"
- "prfm pldl1keep, [x27, x25]\n"
- "fmla v0.4s, v26.4s, v17.4s\n"
- "ldr q29, [x27]\n"
- "fmla v3.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x11]\n"
- "fmla v15.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x26, x13]\n"
- "fmla v2.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "prfm pldl1keep, [x27, x11]\n"
- "fmla v16.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x10, x13]\n"
- "fmla v18.4s, v25.4s, v20.4s\n"
- "ldr q26, [x10, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x26, x15]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x27, x13]\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x15]\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x27, x15]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v5.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v22.4s, v14.4s\n"
- "fmla v1.4s, v22.4s, v17.4s\n"
- "fmla v19.4s, v22.4s, v20.4s\n"
- "ldr q27, [x26, x28]\n"
- "fmla v6.4s, v23.4s, v11.4s\n"
- "fmla v2.4s, v23.4s, v13.4s\n"
- "fmla v5.4s, v23.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v14.4s\n"
- "fmla v4.4s, v23.4s, v17.4s\n"
- "fmla v0.4s, v29.4s, v10.4s\n"
- "mov v22.16b, v21.16b\n"
- "fmla v15.4s, v26.4s, v9.4s\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "fmla v22.4s, v23.4s, v20.4s\n"
- "ldr q29, [x24, x23]\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "ldr q28, [x9, x12]\n"
- "fmla v0.4s, v26.4s, v12.4s\n"
- "fmla v18.4s, v26.4s, v17.4s\n"
- "mov v23.16b, v21.16b\n"
- "fmla v3.4s, v27.4s, v8.4s\n"
- "fmla v15.4s, v27.4s, v11.4s\n"
- "fmla v2.4s, v27.4s, v9.4s\n"
- "fmla v0.4s, v27.4s, v13.4s\n"
- "fmla v16.4s, v27.4s, v12.4s\n"
- "fmla v1.4s, v27.4s, v10.4s\n"
- "fmla v18.4s, v27.4s, v14.4s\n"
- "fmla v19.4s, v27.4s, v17.4s\n"
- "fmla v23.4s, v27.4s, v20.4s\n"
- "mov v25.16b, v21.16b\n"
- "mov v24.16b, v21.16b\n"
- "fmla v6.4s, v29.4s, v8.4s\n"
- "fmla v2.4s, v29.4s, v11.4s\n"
- "fmla v5.4s, v29.4s, v9.4s\n"
- "fmla v16.4s, v29.4s, v13.4s\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "fmla v4.4s, v29.4s, v10.4s\n"
- "fmla v19.4s, v29.4s, v14.4s\n"
- "fmla v22.4s, v29.4s, v17.4s\n"
- "fmla v25.4s, v29.4s, v20.4s\n"
- "ldr q21, [%[inptr0], x14]\n"
- "fmla v5.4s, v28.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v1.4s, v28.4s, v13.4s\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "fmla v22.4s, v28.4s, v14.4s\n"
- "ldr q26, [x27, %[input_col_stride1]]\n"
- "fmla v0.4s, v26.4s, v9.4s\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "fmla v4.4s, v21.4s, v13.4s\n"
- "ldr q21, [x10, x28]\n"
- "fmla v15.4s, v21.4s, v8.4s\n"
- "ldr q29, [x26, x23]\n"
- "fmla v0.4s, v21.4s, v11.4s\n"
- "fmla v16.4s, v21.4s, v9.4s\n"
- "fmla v18.4s, v21.4s, v12.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v23.4s, v21.4s, v17.4s\n"
- "ldr q21, [x24, x12]\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "fmla v16.4s, v29.4s, v11.4s\n"
- "fmla v1.4s, v29.4s, v9.4s\n"
- "fmla v18.4s, v29.4s, v13.4s\n"
- "fmla v19.4s, v29.4s, v12.4s\n"
- "fmla v22.4s, v29.4s, v10.4s\n"
- "fmla v23.4s, v29.4s, v14.4s\n"
- "fmla v25.4s, v29.4s, v17.4s\n"
- "fmla v24.4s, v29.4s, v20.4s\n"
- "ldr q28, [x9, x14]\n"
- "fmla v5.4s, v21.4s, v8.4s\n"
- "ldr q27, [x27, x28]\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "add x9, x9, #16\n"
- "fmla v4.4s, v21.4s, v9.4s\n"
- "fmla v19.4s, v21.4s, v13.4s\n"
- "fmla v22.4s, v21.4s, v12.4s\n"
- "fmla v25.4s, v21.4s, v14.4s\n"
- "fmla v0.4s, v27.4s, v8.4s\n"
- "ldr q20, [x10, x23]\n"
- "fmla v4.4s, v28.4s, v11.4s\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "fmla v22.4s, v28.4s, v13.4s\n"
- "ldr q26, [x26, x12]\n"
- "fmla v23.4s, v27.4s, v10.4s\n"
- "ldr q21, [x24, x14]\n"
- "fmla v16.4s, v20.4s, v8.4s\n"
- "add x24, x24, #16\n"
- "fmla v18.4s, v20.4s, v11.4s\n"
- "fmla v19.4s, v20.4s, v9.4s\n"
- "fmla v23.4s, v20.4s, v12.4s\n"
- "fmla v25.4s, v20.4s, v10.4s\n"
- "fmla v24.4s, v20.4s, v17.4s\n"
- "ldr q28, [x27, x23]\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "ldr q20, [x10, x12]\n"
- "fmla v19.4s, v26.4s, v11.4s\n"
- "fmla v22.4s, v26.4s, v9.4s\n"
- "fmla v23.4s, v26.4s, v13.4s\n"
- "fmla v25.4s, v26.4s, v12.4s\n"
- "fmla v24.4s, v26.4s, v14.4s\n"
- "ldr q17, [x26, x14]\n"
- "fmla v4.4s, v21.4s, v8.4s\n"
- "ldr q26, [x27, x12]\n"
- "fmla v22.4s, v21.4s, v11.4s\n"
- "add x26, x26, #16\n"
- "fmla v25.4s, v21.4s, v13.4s\n"
- "ldr q27, [x10, x14]\n"
- "fmla v18.4s, v28.4s, v8.4s\n"
- "add x10, x10, #16\n"
- "fmla v23.4s, v28.4s, v9.4s\n"
- "fmla v24.4s, v28.4s, v10.4s\n"
- "fmla v19.4s, v20.4s, v8.4s\n"
- "ldr q28, [x27, x14]\n"
- "fmla v25.4s, v20.4s, v9.4s\n"
- "add x27, x27, #16\n"
- "fmla v23.4s, v20.4s, v11.4s\n"
- "fmla v24.4s, v20.4s, v12.4s\n"
- "fmla v22.4s, v17.4s, v8.4s\n"
- "movi v29.16b, #0\n"
- "fmla v25.4s, v17.4s, v11.4s\n"
- "fmla v24.4s, v17.4s, v13.4s\n"
- "fmla v23.4s, v26.4s, v8.4s\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmla v25.4s, v27.4s, v8.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "str q7, [%[outptr0]]\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "str q6, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "str q5, [%[outptr0], x19]\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "str q4, [%[outptr0], x20]\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "str q3, [x16]\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "str q2, [x16, %[output_col_stride1]]\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "str q1, [x16, x19]\n"
- "fmax v22.4s, v22.4s, v29.4s\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "str q22, [x16, x20]\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "str q15, [x17]\n"
- "fmax v19.4s, v19.4s, v29.4s\n"
- "str q16, [x17, %[output_col_stride1]]\n"
- "fmax v25.4s, v25.4s, v29.4s\n"
- "str q19, [x17, x19]\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "str q25, [x17, x20]\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "str q0, [x7]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "str q18, [x7, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "str q23, [x7, x19]\n"
- "add x16, x16, #16\n"
- "str q24, [x7, x20]\n"
- "add x17, x17, #16\n"
- "add x7, x7, #16\n"
- "4:\n"
- "cbz x21, 7f\n"
- "ldr s21, [%[wbptr]]\n"
- "mov v7.16b, v21.16b\n"
- "ldr s20, [%[wbptr], #4]\n"
- "mov v3.16b, v21.16b\n"
- "ldr s14, [%[wbptr], #8]\n"
- "mov v6.16b, v21.16b\n"
- "ldr s13, [%[wbptr], #12]\n"
- "mov v15.16b, v21.16b\n"
- "ldr s17, [%[wbptr], #16]\n"
- "mov v2.16b, v21.16b\n"
- "ldr s12, [%[wbptr], #20]\n"
- "mov v5.16b, v21.16b\n"
- "ldr s11, [%[wbptr], #24]\n"
- "mov v0.16b, v21.16b\n"
- "ldr s10, [%[wbptr], #28]\n"
- "mov v16.16b, v21.16b\n"
- "ldr s9, [%[wbptr], #32]\n"
- "mov v1.16b, v21.16b\n"
- "ldr s8, [%[wbptr], #36]\n"
- "mov v4.16b, v21.16b\n"
- "ldr s22, [%[inptr0]]\n"
- "fmla v7.4s, v22.4s, v20.4s\n"
- "ldr s19, [x9]\n"
- "fmla v3.4s, v19.4s, v20.4s\n"
- "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v6.4s, v23.4s, v20.4s\n"
- "ldr s18, [x24]\n"
- "fmla v7.4s, v19.4s, v17.4s\n"
- "ldr s27, [x9, %[input_col_stride1]]\n"
- "fmla v3.4s, v18.4s, v17.4s\n"
- "ldr s28, [%[inptr0], x28]\n"
- "fmla v15.4s, v18.4s, v20.4s\n"
- "ldr s25, [x26]\n"
- "fmla v7.4s, v23.4s, v14.4s\n"
- "ldr s22, [x24, %[input_col_stride1]]\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x9, #64]\n"
- "subs x21, x21, #1\n"
- "prfm pldl1keep, [%[inptr0], x8]\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v7.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x9, x8]\n"
- "prfm pldl1keep, [%[inptr0], x25]\n"
- "prfm pldl1keep, [x26, #64]\n"
- "prfm pldl1keep, [x24, x8]\n"
- "fmla v7.4s, v27.4s, v12.4s\n"
- "beq 6f\n"
- "5:\n"
- "mov v18.16b, v21.16b\n"
- "ldr s23, [x9, x28]\n"
- "mov v19.16b, v21.16b\n"
- "prfm pldl1keep, [x9, x25]\n"
- "fmla v6.4s, v27.4s, v17.4s\n"
- "prfm pldl1keep, [%[inptr0], x11]\n"
- "fmla v2.4s, v27.4s, v20.4s\n"
- "ldr s24, [%[inptr0], x23]\n"
- "fmla v7.4s, v28.4s, v13.4s\n"
- "prfm pldl1keep, [x10, #64]\n"
- "fmla v6.4s, v28.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x8]\n"
- "fmla v5.4s, v28.4s, v20.4s\n"
- "ldr s26, [x10]\n"
- "fmla v3.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x25]\n"
- "fmla v15.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x9, x11]\n"
- "fmla v0.4s, v25.4s, v20.4s\n"
- "ldr s25, [x26, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x13]\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "prfm pldl1keep, [x27, #64]\n"
- "fmla v6.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x10, x8]\n"
- "fmla v15.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x25]\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "prfm pldl1keep, [x24, x11]\n"
- "fmla v16.4s, v22.4s, v20.4s\n"
- "ldr s22, [x24, x28]\n"
- "fmla v7.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x13]\n"
- "fmla v3.4s, v23.4s, v13.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v6.4s, v23.4s, v12.4s\n"
- "prfm pldl1keep, [x27, x8]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "prfm pldl1keep, [x10, x25]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "prfm pldl1keep, [x26, x11]\n"
- "fmla v1.4s, v23.4s, v20.4s\n"
- "ldr s23, [x9, x23]\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x24, x13]\n"
- "fmla v5.4s, v24.4s, v14.4s\n"
- "prfm pldl1keep, [x9, x15]\n"
- "fmla v4.4s, v24.4s, v20.4s\n"
- "ldr s24, [%[inptr0], x12]\n"
- "fmla v15.4s, v26.4s, v10.4s\n"
- "prfm pldl1keep, [x27, x25]\n"
- "fmla v0.4s, v26.4s, v17.4s\n"
- "ldr s29, [x27]\n"
- "fmla v3.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x11]\n"
- "fmla v15.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x26, x13]\n"
- "fmla v2.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "prfm pldl1keep, [x27, x11]\n"
- "fmla v16.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x10, x13]\n"
- "fmla v18.4s, v25.4s, v20.4s\n"
- "ldr s26, [x10, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x26, x15]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x27, x13]\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x15]\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x27, x15]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v5.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v22.4s, v14.4s\n"
- "subs x21, x21, #1\n"
- "fmla v1.4s, v22.4s, v17.4s\n"
- "fmla v19.4s, v22.4s, v20.4s\n"
- "mov v22.16b, v21.16b\n"
- "fmla v6.4s, v23.4s, v11.4s\n"
- "fmla v2.4s, v23.4s, v13.4s\n"
- "fmla v5.4s, v23.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v14.4s\n"
- "fmla v4.4s, v23.4s, v17.4s\n"
- "fmla v22.4s, v23.4s, v20.4s\n"
- "ldr s27, [x26, x28]\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "fmla v0.4s, v29.4s, v10.4s\n"
- "mov v23.16b, v21.16b\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "mov v25.16b, v21.16b\n"
- "mov v24.16b, v21.16b\n"
- "fmla v15.4s, v26.4s, v9.4s\n"
- "fmla v0.4s, v26.4s, v12.4s\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "fmla v18.4s, v26.4s, v17.4s\n"
- "fmla v3.4s, v27.4s, v8.4s\n"
- "ldr s29, [x24, x23]\n"
- "fmla v15.4s, v27.4s, v11.4s\n"
- "fmla v2.4s, v27.4s, v9.4s\n"
- "fmla v0.4s, v27.4s, v13.4s\n"
- "fmla v16.4s, v27.4s, v12.4s\n"
- "fmla v1.4s, v27.4s, v10.4s\n"
- "fmla v18.4s, v27.4s, v14.4s\n"
- "fmla v19.4s, v27.4s, v17.4s\n"
- "fmla v23.4s, v27.4s, v20.4s\n"
- "fmla v6.4s, v29.4s, v8.4s\n"
- "ldr s28, [x9, x12]\n"
- "fmla v2.4s, v29.4s, v11.4s\n"
- "fmla v5.4s, v29.4s, v9.4s\n"
- "fmla v16.4s, v29.4s, v13.4s\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "fmla v4.4s, v29.4s, v10.4s\n"
- "fmla v19.4s, v29.4s, v14.4s\n"
- "fmla v22.4s, v29.4s, v17.4s\n"
- "fmla v25.4s, v29.4s, v20.4s\n"
- "fmla v5.4s, v28.4s, v11.4s\n"
- "ldr s21, [%[inptr0], x14]\n"
- "fmla v1.4s, v28.4s, v13.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v22.4s, v28.4s, v14.4s\n"
- "ldr s26, [x27, %[input_col_stride1]]\n"
- "fmla v0.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x8]\n"
- "fmla v4.4s, v21.4s, v13.4s\n"
- "ldr s21, [x10, x28]\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "ldr s29, [x26, x23]\n"
- "fmla v15.4s, v21.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x25]\n"
- "fmla v0.4s, v21.4s, v11.4s\n"
- "fmla v16.4s, v21.4s, v9.4s\n"
- "fmla v18.4s, v21.4s, v12.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v23.4s, v21.4s, v17.4s\n"
- "ldr s21, [x24, x12]\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "fmla v16.4s, v29.4s, v11.4s\n"
- "fmla v1.4s, v29.4s, v9.4s\n"
- "fmla v18.4s, v29.4s, v13.4s\n"
- "fmla v19.4s, v29.4s, v12.4s\n"
- "fmla v22.4s, v29.4s, v10.4s\n"
- "fmla v23.4s, v29.4s, v14.4s\n"
- "fmla v25.4s, v29.4s, v17.4s\n"
- "fmla v24.4s, v29.4s, v20.4s\n"
- "ldr s28, [x9, x14]\n"
- "fmla v5.4s, v21.4s, v8.4s\n"
- "ldr s27, [x27, x28]\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "add x9, x9, #4\n"
- "fmla v4.4s, v21.4s, v9.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "fmla v19.4s, v21.4s, v13.4s\n"
- "prfm pldl1keep, [x9, x8]\n"
- "fmla v22.4s, v21.4s, v12.4s\n"
- "fmla v25.4s, v21.4s, v14.4s\n"
- "fmla v4.4s, v28.4s, v11.4s\n"
- "ldr s20, [x10, x23]\n"
- "fmla v0.4s, v27.4s, v8.4s\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "fmla v22.4s, v28.4s, v13.4s\n"
- "ldr s26, [x26, x12]\n"
- "fmla v23.4s, v27.4s, v10.4s\n"
- "ldr s21, [x24, x14]\n"
- "fmla v16.4s, v20.4s, v8.4s\n"
- "add x24, x24, #4\n"
- "fmla v18.4s, v20.4s, v11.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v19.4s, v20.4s, v9.4s\n"
- "prfm pldl1keep, [x24, x8]\n"
- "fmla v23.4s, v20.4s, v12.4s\n"
- "fmla v25.4s, v20.4s, v10.4s\n"
- "fmla v24.4s, v20.4s, v17.4s\n"
- "ldr s28, [x27, x23]\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "ldr s20, [x10, x12]\n"
- "fmla v19.4s, v26.4s, v11.4s\n"
- "fmla v22.4s, v26.4s, v9.4s\n"
- "fmla v23.4s, v26.4s, v13.4s\n"
- "fmla v25.4s, v26.4s, v12.4s\n"
- "fmla v24.4s, v26.4s, v14.4s\n"
- "ldr s17, [x26, x14]\n"
- "fmla v4.4s, v21.4s, v8.4s\n"
- "ldr s26, [x27, x12]\n"
- "fmla v22.4s, v21.4s, v11.4s\n"
- "add x26, x26, #4\n"
- "fmla v25.4s, v21.4s, v13.4s\n"
- "ldr s27, [x10, x14]\n"
- "fmla v18.4s, v28.4s, v8.4s\n"
- "prfm pldl1keep, [x26, #64]\n"
- "fmla v23.4s, v28.4s, v9.4s\n"
- "add x10, x10, #4\n"
- "fmla v24.4s, v28.4s, v10.4s\n"
- "ldr s28, [x27, x14]\n"
- "fmla v19.4s, v20.4s, v8.4s\n"
- "ldr s21, [%[wbptr]]\n"
- "fmla v23.4s, v20.4s, v11.4s\n"
- "add x27, x27, #4\n"
- "fmla v25.4s, v20.4s, v9.4s\n"
- "fmla v24.4s, v20.4s, v12.4s\n"
- "fmla v22.4s, v17.4s, v8.4s\n"
- "ldr s20, [%[wbptr], #4]\n"
- "fmla v23.4s, v26.4s, v8.4s\n"
- "ldr s14, [%[wbptr], #8]\n"
- "fmla v24.4s, v17.4s, v13.4s\n"
- "movi v29.16b, #0\n"
- "fmla v25.4s, v17.4s, v11.4s\n"
- "ldr s17, [%[wbptr], #16]\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "ldr s13, [%[wbptr], #12]\n"
- "str s7, [%[outptr0]]\n"
- "fmla v25.4s, v27.4s, v8.4s\n"
- "str s6, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "ldr s12, [%[wbptr], #20]\n"
- "str s5, [%[outptr0], x19]\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "ldr s10, [%[wbptr], #28]\n"
- "str s4, [%[outptr0], x20]\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "str s3, [x16]\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "ldr s11, [%[wbptr], #24]\n"
- "str s2, [x16, %[output_col_stride1]]\n"
- "fmax v22.4s, v22.4s, v29.4s\n"
- "str s1, [x16, x19]\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "str s22, [x16, x20]\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "str s15, [x17]\n"
- "fmax v19.4s, v19.4s, v29.4s\n"
- "str s16, [x17, %[output_col_stride1]]\n"
- "fmax v25.4s, v25.4s, v29.4s\n"
- "str s19, [x17, x19]\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "str s25, [x17, x20]\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "str s0, [x7]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "str s18, [x7, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "str s23, [x7, x19]\n"
- "mov v7.16b, v21.16b\n"
- "str s24, [x7, x20]\n"
- "mov v3.16b, v21.16b\n"
- "mov v6.16b, v21.16b\n"
- "ldr s9, [%[wbptr], #32]\n"
- "mov v15.16b, v21.16b\n"
- "ldr s8, [%[wbptr], #36]\n"
- "mov v2.16b, v21.16b\n"
- "ldr s22, [%[inptr0]]\n"
- "mov v5.16b, v21.16b\n"
- "ldr s19, [x9]\n"
- "mov v0.16b, v21.16b\n"
- "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
- "mov v16.16b, v21.16b\n"
- "ldr s18, [x24]\n"
- "mov v1.16b, v21.16b\n"
- "ldr s27, [x9, %[input_col_stride1]]\n"
- "mov v4.16b, v21.16b\n"
- "ldr s28, [%[inptr0], x28]\n"
- "fmla v7.4s, v22.4s, v20.4s\n"
- "ldr s25, [x26]\n"
- "fmla v3.4s, v19.4s, v20.4s\n"
- "ldr s22, [x24, %[input_col_stride1]]\n"
- "fmla v6.4s, v23.4s, v20.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v7.4s, v19.4s, v17.4s\n"
- "add x16, x16, #4\n"
- "fmla v3.4s, v18.4s, v17.4s\n"
- "add x17, x17, #4\n"
- "fmla v15.4s, v18.4s, v20.4s\n"
- "add x7, x7, #4\n"
- "fmla v7.4s, v23.4s, v14.4s\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "fmla v7.4s, v18.4s, v10.4s\n"
- "fmla v7.4s, v27.4s, v12.4s\n"
- "bne 5b\n"
- "6:\n"
- "mov v18.16b, v21.16b\n"
- "ldr s23, [x9, x28]\n"
- "mov v19.16b, v21.16b\n"
- "prfm pldl1keep, [x9, x25]\n"
- "fmla v6.4s, v27.4s, v17.4s\n"
- "prfm pldl1keep, [%[inptr0], x11]\n"
- "fmla v2.4s, v27.4s, v20.4s\n"
- "ldr s24, [%[inptr0], x23]\n"
- "fmla v7.4s, v28.4s, v13.4s\n"
- "prfm pldl1keep, [x10, #64]\n"
- "fmla v6.4s, v28.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x8]\n"
- "fmla v5.4s, v28.4s, v20.4s\n"
- "ldr s26, [x10]\n"
- "fmla v3.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x25]\n"
- "fmla v15.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x9, x11]\n"
- "fmla v0.4s, v25.4s, v20.4s\n"
- "ldr s25, [x26, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x13]\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "prfm pldl1keep, [x27, #64]\n"
- "fmla v6.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x10, x8]\n"
- "fmla v15.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x25]\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "prfm pldl1keep, [x24, x11]\n"
- "fmla v16.4s, v22.4s, v20.4s\n"
- "ldr s22, [x24, x28]\n"
- "fmla v7.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x13]\n"
- "fmla v3.4s, v23.4s, v13.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v6.4s, v23.4s, v12.4s\n"
- "prfm pldl1keep, [x27, x8]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "prfm pldl1keep, [x10, x25]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "prfm pldl1keep, [x26, x11]\n"
- "fmla v1.4s, v23.4s, v20.4s\n"
- "ldr s23, [x9, x23]\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x24, x13]\n"
- "fmla v5.4s, v24.4s, v14.4s\n"
- "prfm pldl1keep, [x9, x15]\n"
- "fmla v4.4s, v24.4s, v20.4s\n"
- "ldr s24, [%[inptr0], x12]\n"
- "fmla v15.4s, v26.4s, v10.4s\n"
- "prfm pldl1keep, [x27, x25]\n"
- "fmla v0.4s, v26.4s, v17.4s\n"
- "ldr s29, [x27]\n"
- "fmla v3.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x11]\n"
- "fmla v15.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x26, x13]\n"
- "fmla v2.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "prfm pldl1keep, [x27, x11]\n"
- "fmla v16.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x10, x13]\n"
- "fmla v18.4s, v25.4s, v20.4s\n"
- "ldr s26, [x10, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x26, x15]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x27, x13]\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x15]\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x27, x15]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v5.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v22.4s, v14.4s\n"
- "fmla v1.4s, v22.4s, v17.4s\n"
- "fmla v19.4s, v22.4s, v20.4s\n"
- "ldr s27, [x26, x28]\n"
- "fmla v6.4s, v23.4s, v11.4s\n"
- "fmla v2.4s, v23.4s, v13.4s\n"
- "fmla v5.4s, v23.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v14.4s\n"
- "fmla v4.4s, v23.4s, v17.4s\n"
- "fmla v0.4s, v29.4s, v10.4s\n"
- "mov v22.16b, v21.16b\n"
- "fmla v15.4s, v26.4s, v9.4s\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "fmla v22.4s, v23.4s, v20.4s\n"
- "ldr s29, [x24, x23]\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "ldr s28, [x9, x12]\n"
- "fmla v0.4s, v26.4s, v12.4s\n"
- "fmla v18.4s, v26.4s, v17.4s\n"
- "mov v23.16b, v21.16b\n"
- "fmla v3.4s, v27.4s, v8.4s\n"
- "fmla v15.4s, v27.4s, v11.4s\n"
- "fmla v2.4s, v27.4s, v9.4s\n"
- "fmla v0.4s, v27.4s, v13.4s\n"
- "fmla v16.4s, v27.4s, v12.4s\n"
- "fmla v1.4s, v27.4s, v10.4s\n"
- "fmla v18.4s, v27.4s, v14.4s\n"
- "fmla v19.4s, v27.4s, v17.4s\n"
- "fmla v23.4s, v27.4s, v20.4s\n"
- "mov v25.16b, v21.16b\n"
- "mov v24.16b, v21.16b\n"
- "fmla v6.4s, v29.4s, v8.4s\n"
- "fmla v2.4s, v29.4s, v11.4s\n"
- "fmla v5.4s, v29.4s, v9.4s\n"
- "fmla v16.4s, v29.4s, v13.4s\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "fmla v4.4s, v29.4s, v10.4s\n"
- "fmla v19.4s, v29.4s, v14.4s\n"
- "fmla v22.4s, v29.4s, v17.4s\n"
- "fmla v25.4s, v29.4s, v20.4s\n"
- "ldr s21, [%[inptr0], x14]\n"
- "fmla v5.4s, v28.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v1.4s, v28.4s, v13.4s\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "fmla v22.4s, v28.4s, v14.4s\n"
- "ldr s26, [x27, %[input_col_stride1]]\n"
- "fmla v0.4s, v26.4s, v9.4s\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "fmla v4.4s, v21.4s, v13.4s\n"
- "ldr s21, [x10, x28]\n"
- "fmla v15.4s, v21.4s, v8.4s\n"
- "ldr s29, [x26, x23]\n"
- "fmla v0.4s, v21.4s, v11.4s\n"
- "fmla v16.4s, v21.4s, v9.4s\n"
- "fmla v18.4s, v21.4s, v12.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v23.4s, v21.4s, v17.4s\n"
- "ldr s21, [x24, x12]\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "fmla v16.4s, v29.4s, v11.4s\n"
- "fmla v1.4s, v29.4s, v9.4s\n"
- "fmla v18.4s, v29.4s, v13.4s\n"
- "fmla v19.4s, v29.4s, v12.4s\n"
- "fmla v22.4s, v29.4s, v10.4s\n"
- "fmla v23.4s, v29.4s, v14.4s\n"
- "fmla v25.4s, v29.4s, v17.4s\n"
- "fmla v24.4s, v29.4s, v20.4s\n"
- "ldr s28, [x9, x14]\n"
- "fmla v5.4s, v21.4s, v8.4s\n"
- "ldr s27, [x27, x28]\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "add x9, x9, #4\n"
- "fmla v4.4s, v21.4s, v9.4s\n"
- "fmla v19.4s, v21.4s, v13.4s\n"
- "fmla v22.4s, v21.4s, v12.4s\n"
- "fmla v25.4s, v21.4s, v14.4s\n"
- "fmla v0.4s, v27.4s, v8.4s\n"
- "ldr s20, [x10, x23]\n"
- "fmla v4.4s, v28.4s, v11.4s\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "fmla v22.4s, v28.4s, v13.4s\n"
- "ldr s26, [x26, x12]\n"
- "fmla v23.4s, v27.4s, v10.4s\n"
- "ldr s21, [x24, x14]\n"
- "fmla v16.4s, v20.4s, v8.4s\n"
- "add x24, x24, #4\n"
- "fmla v18.4s, v20.4s, v11.4s\n"
- "fmla v19.4s, v20.4s, v9.4s\n"
- "fmla v23.4s, v20.4s, v12.4s\n"
- "fmla v25.4s, v20.4s, v10.4s\n"
- "fmla v24.4s, v20.4s, v17.4s\n"
- "ldr s28, [x27, x23]\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "ldr s20, [x10, x12]\n"
- "fmla v19.4s, v26.4s, v11.4s\n"
- "fmla v22.4s, v26.4s, v9.4s\n"
- "fmla v23.4s, v26.4s, v13.4s\n"
- "fmla v25.4s, v26.4s, v12.4s\n"
- "fmla v24.4s, v26.4s, v14.4s\n"
- "ldr s17, [x26, x14]\n"
- "fmla v4.4s, v21.4s, v8.4s\n"
- "ldr s26, [x27, x12]\n"
- "fmla v22.4s, v21.4s, v11.4s\n"
- "add x26, x26, #4\n"
- "fmla v25.4s, v21.4s, v13.4s\n"
- "ldr s27, [x10, x14]\n"
- "fmla v18.4s, v28.4s, v8.4s\n"
- "add x10, x10, #4\n"
- "fmla v23.4s, v28.4s, v9.4s\n"
- "fmla v24.4s, v28.4s, v10.4s\n"
- "fmla v19.4s, v20.4s, v8.4s\n"
- "ldr s28, [x27, x14]\n"
- "fmla v25.4s, v20.4s, v9.4s\n"
- "add x27, x27, #4\n"
- "fmla v23.4s, v20.4s, v11.4s\n"
- "fmla v24.4s, v20.4s, v12.4s\n"
- "fmla v22.4s, v17.4s, v8.4s\n"
- "movi v29.16b, #0\n"
- "fmla v25.4s, v17.4s, v11.4s\n"
- "fmla v24.4s, v17.4s, v13.4s\n"
- "fmla v23.4s, v26.4s, v8.4s\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmla v25.4s, v27.4s, v8.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "str s7, [%[outptr0]]\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "str s6, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "str s5, [%[outptr0], x19]\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "str s4, [%[outptr0], x20]\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "str s3, [x16]\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "str s2, [x16, %[output_col_stride1]]\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "str s1, [x16, x19]\n"
- "fmax v22.4s, v22.4s, v29.4s\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "str s22, [x16, x20]\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "str s15, [x17]\n"
- "fmax v19.4s, v19.4s, v29.4s\n"
- "str s16, [x17, %[output_col_stride1]]\n"
- "fmax v25.4s, v25.4s, v29.4s\n"
- "str s19, [x17, x19]\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "str s25, [x17, x20]\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "str s0, [x7]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "str s18, [x7, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "str s23, [x7, x19]\n"
- "add x16, x16, #4\n"
- "str s24, [x7, x20]\n"
- "add x17, x17, #4\n"
- "add x7, x7, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *inptrs[6][6],
- float *outptrs[4][4]
-)
-{
- __asm __volatile(
- "mov x27, xzr\n"
- "mov x28, xzr\n"
- "and x19, %[n_channels], #3\n"
- "lsr x26, %[n_channels], #2\n"
- "cbz x26, 4f\n"
- "1:\n"
- "ldr q25, [%[wbptr]]\n"
- "ldr x25, [%[inptrs], 0]\n"
- "mov v2.16b, v25.16b\n"
- "ldr q22, [%[wbptr], #16]\n"
- "mov v16.16b, v25.16b\n"
- "ldr q9, [%[wbptr], #32]\n"
- "mov v18.16b, v25.16b\n"
- "ldr q8, [%[wbptr], #48]\n"
- "mov v13.16b, v25.16b\n"
- "ldr q19, [%[wbptr], #64]\n"
- "mov v0.16b, v25.16b\n"
- "ldr q7, [%[wbptr], #80]\n"
- "mov v17.16b, v25.16b\n"
- "ldr q6, [%[wbptr], #96]\n"
- "mov v14.16b, v25.16b\n"
- "ldr q5, [%[wbptr], #112]\n"
- "mov v12.16b, v25.16b\n"
- "ldr q4, [%[wbptr], #128]\n"
- "mov v15.16b, v25.16b\n"
- "ldr q3, [%[wbptr], #144]\n"
- "ldr q27, [x25, x27]\n"
- "ldr x17, [%[inptrs], 48]\n"
- "fmla v2.4s, v27.4s, v22.4s\n"
- "ldr x25, [%[inptrs], 8]\n"
- "ldr q26, [x17, x27]\n"
- "ldr x24, [%[inptrs], 96]\n"
- "fmla v16.4s, v26.4s, v22.4s\n"
- "ldr q31, [x25, x27]\n"
- "ldr q28, [x24, x27]\n"
- "ldr x17, [%[inptrs], 56]\n"
- "fmla v2.4s, v26.4s, v19.4s\n"
- "ldr x25, [%[inptrs], 16]\n"
- "ldr q29, [x17, x27]\n"
- "ldr x7, [%[inptrs], 144]\n"
- "ldr x24, [%[inptrs], 104]\n"
- "subs x26, x26, #1\n"
- "ldr q30, [x25, x27]\n"
- "ldr q27, [x7, x27]\n"
- "ldr q21, [x24, x27]\n"
- "fmla v2.4s, v31.4s, v9.4s\n"
- "beq 3f\n"
- "2:\n"
- "mov v1.16b, v25.16b\n"
- "ldr x17, [%[inptrs], 64]\n"
- "mov v10.16b, v25.16b\n"
- "ldr x25, [%[inptrs], 24]\n"
- "fmla v18.4s, v31.4s, v22.4s\n"
- "ldr q23, [x17, x27]\n"
- "fmla v2.4s, v28.4s, v5.4s\n"
- "ldr x15, [%[inptrs], 192]\n"
- "fmla v16.4s, v28.4s, v19.4s\n"
- "ldr x7, [%[inptrs], 152]\n"
- "fmla v13.4s, v28.4s, v22.4s\n"
- "ldr q26, [x25, x27]\n"
- "fmla v18.4s, v29.4s, v19.4s\n"
- "ldr x24, [%[inptrs], 112]\n"
- "fmla v2.4s, v29.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 72]\n"
- "fmla v16.4s, v29.4s, v9.4s\n"
- "ldr x25, [%[inptrs], 32]\n"
- "fmla v0.4s, v29.4s, v22.4s\n"
- "ldr q28, [x15, x27]\n"
- "fmla v18.4s, v30.4s, v9.4s\n"
- "ldr x16, [%[inptrs], 240]\n"
- "fmla v2.4s, v30.4s, v8.4s\n"
- "ldr x15, [%[inptrs], 200]\n"
- "fmla v17.4s, v30.4s, v22.4s\n"
- "ldr q29, [x7, x27]\n"
- "fmla v16.4s, v27.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 160]\n"
- "fmla v13.4s, v27.4s, v19.4s\n"
- "ldr x20, [%[outptrs], 0]\n"
- "fmla v14.4s, v27.4s, v22.4s\n"
- "ldr q20, [x24, x27]\n"
- "fmla v2.4s, v21.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 120]\n"
- "fmla v16.4s, v21.4s, v7.4s\n"
- "ldr x21, [%[outptrs], 32]\n"
- "fmla v18.4s, v21.4s, v5.4s\n"
- "ldr x22, [%[outptrs], 64]\n"
- "fmla v13.4s, v21.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 96]\n"
- "fmla v0.4s, v21.4s, v19.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v12.4s, v21.4s, v22.4s\n"
- "ldr q24, [x17, x27]\n"
- "fmla v2.4s, v23.4s, v6.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v23.4s, v8.4s\n"
- "ldr x17, [%[inptrs], 80]\n"
- "fmla v18.4s, v23.4s, v7.4s\n"
- "subs x26, x26, #1\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "fmla v17.4s, v23.4s, v19.4s\n"
- "fmla v15.4s, v23.4s, v22.4s\n"
- "ldr q23, [x25, x27]\n"
- "fmla v1.4s, v26.4s, v22.4s\n"
- "ldr x25, [%[inptrs], 40]\n"
- "fmla v18.4s, v26.4s, v8.4s\n"
- "fmla v13.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v26.4s, v9.4s\n"
- "ldr q30, [x16, x27]\n"
- "fmla v14.4s, v28.4s, v19.4s\n"
- "ldr q26, [x15, x27]\n"
- "fmla v16.4s, v29.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 248]\n"
- "fmla v13.4s, v29.4s, v7.4s\n"
- "ldr x15, [%[inptrs], 208]\n"
- "fmla v0.4s, v29.4s, v5.4s\n"
- "fmla v12.4s, v29.4s, v19.4s\n"
- "fmla v14.4s, v29.4s, v9.4s\n"
- "fmla v10.4s, v29.4s, v22.4s\n"
- "mov v11.16b, v25.16b\n"
- "fmla v2.4s, v20.4s, v3.4s\n"
- "fmla v16.4s, v20.4s, v6.4s\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "fmla v13.4s, v20.4s, v8.4s\n"
- "fmla v0.4s, v20.4s, v7.4s\n"
- "fmla v17.4s, v20.4s, v5.4s\n"
- "fmla v12.4s, v20.4s, v9.4s\n"
- "fmla v15.4s, v20.4s, v19.4s\n"
- "fmla v11.4s, v20.4s, v22.4s\n"
- "mov v21.16b, v25.16b\n"
- "fmla v18.4s, v24.4s, v6.4s\n"
- "fmla v0.4s, v24.4s, v8.4s\n"
- "fmla v1.4s, v24.4s, v19.4s\n"
- "fmla v17.4s, v24.4s, v7.4s\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "mov v20.16b, v25.16b\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "fmla v21.4s, v24.4s, v22.4s\n"
- "ldr q27, [x7, x27]\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 168]\n"
- "fmla v17.4s, v23.4s, v8.4s\n"
- "ldr q30, [x24, x27]\n"
- "fmla v13.4s, v26.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 128]\n"
- "fmla v14.4s, v26.4s, v7.4s\n"
- "fmla v12.4s, v26.4s, v5.4s\n"
- "fmla v10.4s, v26.4s, v19.4s\n"
- "ldr q31, [x17, x27]\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "ldr x17, [%[inptrs], 88]\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v0.4s, v27.4s, v4.4s\n"
- "fmla v14.4s, v27.4s, v8.4s\n"
- "fmla v12.4s, v27.4s, v7.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "fmla v10.4s, v27.4s, v9.4s\n"
- "fmla v11.4s, v27.4s, v19.4s\n"
- "fmla v20.4s, v27.4s, v22.4s\n"
- "mov v24.16b, v25.16b\n"
- "mov v23.16b, v25.16b\n"
- "fmla v18.4s, v30.4s, v3.4s\n"
- "fmla v0.4s, v30.4s, v6.4s\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "fmla v12.4s, v30.4s, v8.4s\n"
- "fmla v15.4s, v30.4s, v7.4s\n"
- "fmla v1.4s, v30.4s, v5.4s\n"
- "fmla v11.4s, v30.4s, v9.4s\n"
- "fmla v21.4s, v30.4s, v19.4s\n"
- "fmla v24.4s, v30.4s, v22.4s\n"
- "ldr q25, [x25, x27]\n"
- "fmla v17.4s, v31.4s, v6.4s\n"
- "ldr x25, [%[inptrs], 0]\n"
- "fmla v15.4s, v31.4s, v8.4s\n"
- "fmla v1.4s, v31.4s, v7.4s\n"
- "fmla v21.4s, v31.4s, v9.4s\n"
- "ldr q26, [x16, x27]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 256]\n"
- "fmla v10.4s, v26.4s, v5.4s\n"
- "ldr q31, [x15, x27]\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "ldr q29, [x7, x27]\n"
- "fmla v13.4s, v31.4s, v3.4s\n"
- "ldr x15, [%[inptrs], 216]\n"
- "fmla v14.4s, v31.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 176]\n"
- "fmla v12.4s, v31.4s, v4.4s\n"
- "fmla v10.4s, v31.4s, v7.4s\n"
- "fmla v11.4s, v31.4s, v5.4s\n"
- "fmla v20.4s, v31.4s, v19.4s\n"
- "fmla v0.4s, v29.4s, v3.4s\n"
- "ldr q28, [x24, x27]\n"
- "fmla v15.4s, v29.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 136]\n"
- "fmla v12.4s, v29.4s, v6.4s\n"
- "fmla v10.4s, v29.4s, v8.4s\n"
- "fmla v11.4s, v29.4s, v7.4s\n"
- "fmla v21.4s, v29.4s, v5.4s\n"
- "fmla v20.4s, v29.4s, v9.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v23.4s, v29.4s, v22.4s\n"
- "ldr q25, [x17, x27]\n"
- "fmla v17.4s, v28.4s, v3.4s\n"
- "ldr q29, [x16, x27]\n"
- "fmla v15.4s, v28.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 264]\n"
- "fmla v1.4s, v28.4s, v4.4s\n"
- "ldr x17, [%[inptrs], 48]\n"
- "fmla v11.4s, v28.4s, v8.4s\n"
- "fmla v21.4s, v28.4s, v7.4s\n"
- "fmla v24.4s, v28.4s, v9.4s\n"
- "ldr q22, [x15, x27]\n"
- "fmla v14.4s, v29.4s, v3.4s\n"
- "ldr x15, [%[inptrs], 224]\n"
- "fmla v1.4s, v25.4s, v6.4s\n"
- "fmla v10.4s, v29.4s, v4.4s\n"
- "fmla v21.4s, v25.4s, v8.4s\n"
- "ldr q27, [x7, x27]\n"
- "fmla v20.4s, v29.4s, v5.4s\n"
- "ldr q26, [x24, x27]\n"
- "fmla v12.4s, v22.4s, v3.4s\n"
- "ldr x7, [%[inptrs], 184]\n"
- "fmla v10.4s, v22.4s, v6.4s\n"
- "ldr x24, [%[inptrs], 96]\n"
- "fmla v11.4s, v22.4s, v4.4s\n"
- "fmla v24.4s, v22.4s, v5.4s\n"
- "fmla v20.4s, v22.4s, v7.4s\n"
- "fmla v23.4s, v22.4s, v19.4s\n"
- "fmla v15.4s, v27.4s, v3.4s\n"
- "ldr q25, [x16, x27]\n"
- "fmla v21.4s, v27.4s, v4.4s\n"
- "ldr q31, [x15, x27]\n"
- "fmla v11.4s, v27.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 272]\n"
- "fmla v20.4s, v27.4s, v8.4s\n"
- "ldr x15, [%[inptrs], 232]\n"
- "fmla v24.4s, v27.4s, v7.4s\n"
- "fmla v23.4s, v27.4s, v9.4s\n"
- "fmla v1.4s, v26.4s, v3.4s\n"
- "ldr q22, [x7, x27]\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr q19, [x16, x27]\n"
- "fmla v10.4s, v25.4s, v3.4s\n"
- "ldr x16, [%[inptrs], 280]\n"
- "fmla v24.4s, v26.4s, v8.4s\n"
- "ldr q28, [x15, x27]\n"
- "fmla v20.4s, v25.4s, v4.4s\n"
- "ldr x7, [%[inptrs], 144]\n"
- "fmla v23.4s, v25.4s, v5.4s\n"
- "ldr q30, [x16, x27]\n"
- "fmla v11.4s, v31.4s, v3.4s\n"
- "add x27, x27, #16\n"
- "fmla v24.4s, v31.4s, v4.4s\n"
- "ldr q27, [x25, x27]\n"
- "fmla v20.4s, v31.4s, v6.4s\n"
- "ldr x25, [%[inptrs], 8]\n"
- "fmla v23.4s, v31.4s, v7.4s\n"
- "movi v29.16b, #0\n"
- "fmla v21.4s, v22.4s, v3.4s\n"
- "ldr q26, [x17, x27]\n"
- "fmla v24.4s, v22.4s, v6.4s\n"
- "ldr x17, [%[inptrs], 56]\n"
- "fmla v20.4s, v19.4s, v3.4s\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmla v23.4s, v22.4s, v8.4s\n"
- "ldr q25, [%[wbptr]]\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "ldr q22, [%[wbptr], #16]\n"
- "str q2, [x20, x28]\n"
- "fmla v24.4s, v28.4s, v3.4s\n"
- "fmax v17.4s, v17.4s, v29.4s\n"
- "ldr q9, [%[wbptr], #32]\n"
- "fmla v23.4s, v19.4s, v4.4s\n"
- "ldr q8, [%[wbptr], #48]\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "ldr q19, [%[wbptr], #64]\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 8]\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "str q18, [x20, x28]\n"
- "fmla v23.4s, v28.4s, v6.4s\n"
- "str q16, [x21, x28]\n"
- "fmax v21.4s, v21.4s, v29.4s\n"
- "fmax v13.4s, v13.4s, v29.4s\n"
- "ldr q7, [%[wbptr], #80]\n"
- "fmax v12.4s, v12.4s, v29.4s\n"
- "ldr q5, [%[wbptr], #112]\n"
- "fmla v23.4s, v30.4s, v3.4s\n"
- "ldr q6, [%[wbptr], #96]\n"
- "str q13, [x22, x28]\n"
- "fmax v11.4s, v11.4s, v29.4s\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "ldr q4, [%[wbptr], #128]\n"
- "fmax v14.4s, v14.4s, v29.4s\n"
- "ldr q31, [x25, x27]\n"
- "fmax v10.4s, v10.4s, v29.4s\n"
- "ldr q3, [%[wbptr], #144]\n"
- "fmax v20.4s, v20.4s, v29.4s\n"
- "ldr q28, [x24, x27]\n"
- "str q14, [x23, x28]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "mov v2.16b, v25.16b\n"
- "ldr q29, [x17, x27]\n"
- "ldr x20, [%[outptrs], 16]\n"
- "ldr x21, [%[outptrs], 40]\n"
- "ldr x22, [%[outptrs], 72]\n"
- "ldr x23, [%[outptrs], 104]\n"
- "ldr x25, [%[inptrs], 16]\n"
- "ldr x24, [%[inptrs], 104]\n"
- "str q17, [x20, x28]\n"
- "mov v16.16b, v25.16b\n"
- "str q0, [x21, x28]\n"
- "mov v18.16b, v25.16b\n"
- "str q12, [x22, x28]\n"
- "mov v13.16b, v25.16b\n"
- "str q10, [x23, x28]\n"
- "mov v0.16b, v25.16b\n"
- "fmla v2.4s, v27.4s, v22.4s\n"
- "ldr q30, [x25, x27]\n"
- "fmla v16.4s, v26.4s, v22.4s\n"
- "ldr x20, [%[outptrs], 24]\n"
- "mov v17.16b, v25.16b\n"
- "ldr x21, [%[outptrs], 48]\n"
- "str q1, [x20, x28]\n"
- "mov v14.16b, v25.16b\n"
- "str q15, [x21, x28]\n"
- "mov v12.16b, v25.16b\n"
- "mov v15.16b, v25.16b\n"
- "ldr x21, [%[outptrs], 56]\n"
- "fmla v2.4s, v26.4s, v19.4s\n"
- "ldr q27, [x7, x27]\n"
- "str q21, [x21, x28]\n"
- "ldr x22, [%[outptrs], 80]\n"
- "ldr q21, [x24, x27]\n"
- "ldr x23, [%[outptrs], 112]\n"
- "str q11, [x22, x28]\n"
- "fmla v2.4s, v31.4s, v9.4s\n"
- "str q20, [x23, x28]\n"
- "ldr x22, [%[outptrs], 88]\n"
- "ldr x23, [%[outptrs], 120]\n"
- "str q24, [x22, x28]\n"
- "str q23, [x23, x28]\n"
- "add x28, x28, #16\n"
- "bne 2b\n"
- "3:\n"
- "mov v1.16b, v25.16b\n"
- "ldr x17, [%[inptrs], 64]\n"
- "mov v10.16b, v25.16b\n"
- "ldr x25, [%[inptrs], 24]\n"
- "mov v11.16b, v25.16b\n"
- "ldr x15, [%[inptrs], 192]\n"
- "fmla v18.4s, v31.4s, v22.4s\n"
- "ldr q23, [x17, x27]\n"
- "fmla v2.4s, v28.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 152]\n"
- "fmla v16.4s, v28.4s, v19.4s\n"
- "ldr x24, [%[inptrs], 112]\n"
- "fmla v13.4s, v28.4s, v22.4s\n"
- "ldr q26, [x25, x27]\n"
- "fmla v18.4s, v29.4s, v19.4s\n"
- "ldr x17, [%[inptrs], 72]\n"
- "fmla v2.4s, v29.4s, v7.4s\n"
- "ldr x25, [%[inptrs], 32]\n"
- "fmla v16.4s, v29.4s, v9.4s\n"
- "ldr x16, [%[inptrs], 240]\n"
- "fmla v0.4s, v29.4s, v22.4s\n"
- "ldr q28, [x15, x27]\n"
- "fmla v18.4s, v30.4s, v9.4s\n"
- "ldr x15, [%[inptrs], 200]\n"
- "fmla v2.4s, v30.4s, v8.4s\n"
- "ldr x20, [%[outptrs], 0]\n"
- "fmla v17.4s, v30.4s, v22.4s\n"
- "ldr q29, [x7, x27]\n"
- "fmla v16.4s, v27.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 160]\n"
- "fmla v13.4s, v27.4s, v19.4s\n"
- "ldr x21, [%[outptrs], 32]\n"
- "fmla v14.4s, v27.4s, v22.4s\n"
- "ldr q20, [x24, x27]\n"
- "fmla v2.4s, v21.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 120]\n"
- "fmla v16.4s, v21.4s, v7.4s\n"
- "ldr x22, [%[outptrs], 64]\n"
- "fmla v18.4s, v21.4s, v5.4s\n"
- "ldr x23, [%[outptrs], 96]\n"
- "fmla v13.4s, v21.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v0.4s, v21.4s, v19.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v12.4s, v21.4s, v22.4s\n"
- "ldr q24, [x17, x27]\n"
- "fmla v2.4s, v23.4s, v6.4s\n"
- "ldr x17, [%[inptrs], 80]\n"
- "fmla v16.4s, v23.4s, v8.4s\n"
- "fmla v18.4s, v23.4s, v7.4s\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "fmla v17.4s, v23.4s, v19.4s\n"
- "fmla v15.4s, v23.4s, v22.4s\n"
- "ldr q23, [x25, x27]\n"
- "fmla v1.4s, v26.4s, v22.4s\n"
- "ldr x25, [%[inptrs], 40]\n"
- "fmla v18.4s, v26.4s, v8.4s\n"
- "fmla v13.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v26.4s, v9.4s\n"
- "ldr q30, [x16, x27]\n"
- "fmla v14.4s, v28.4s, v19.4s\n"
- "ldr q26, [x15, x27]\n"
- "fmla v16.4s, v29.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 248]\n"
- "fmla v13.4s, v29.4s, v7.4s\n"
- "ldr x15, [%[inptrs], 208]\n"
- "fmla v0.4s, v29.4s, v5.4s\n"
- "fmla v12.4s, v29.4s, v19.4s\n"
- "fmla v14.4s, v29.4s, v9.4s\n"
- "fmla v10.4s, v29.4s, v22.4s\n"
- "mov v21.16b, v25.16b\n"
- "fmla v2.4s, v20.4s, v3.4s\n"
- "fmla v16.4s, v20.4s, v6.4s\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "fmla v13.4s, v20.4s, v8.4s\n"
- "fmla v0.4s, v20.4s, v7.4s\n"
- "fmla v17.4s, v20.4s, v5.4s\n"
- "fmla v12.4s, v20.4s, v9.4s\n"
- "fmla v15.4s, v20.4s, v19.4s\n"
- "fmla v11.4s, v20.4s, v22.4s\n"
- "mov v20.16b, v25.16b\n"
- "fmla v18.4s, v24.4s, v6.4s\n"
- "fmla v0.4s, v24.4s, v8.4s\n"
- "fmla v1.4s, v24.4s, v19.4s\n"
- "fmla v17.4s, v24.4s, v7.4s\n"
- "fmla v21.4s, v24.4s, v22.4s\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "ldr q27, [x7, x27]\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "ldr q30, [x24, x27]\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 168]\n"
- "fmla v17.4s, v23.4s, v8.4s\n"
- "ldr q31, [x17, x27]\n"
- "fmla v13.4s, v26.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 128]\n"
- "fmla v14.4s, v26.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 88]\n"
- "fmla v12.4s, v26.4s, v5.4s\n"
- "fmla v10.4s, v26.4s, v19.4s\n"
- "mov v24.16b, v25.16b\n"
- "mov v23.16b, v25.16b\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v0.4s, v27.4s, v4.4s\n"
- "fmla v14.4s, v27.4s, v8.4s\n"
- "fmla v12.4s, v27.4s, v7.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "fmla v10.4s, v27.4s, v9.4s\n"
- "fmla v11.4s, v27.4s, v19.4s\n"
- "fmla v20.4s, v27.4s, v22.4s\n"
- "ldr q25, [x25, x27]\n"
- "fmla v18.4s, v30.4s, v3.4s\n"
- "fmla v0.4s, v30.4s, v6.4s\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "fmla v12.4s, v30.4s, v8.4s\n"
- "fmla v15.4s, v30.4s, v7.4s\n"
- "fmla v1.4s, v30.4s, v5.4s\n"
- "fmla v11.4s, v30.4s, v9.4s\n"
- "fmla v21.4s, v30.4s, v19.4s\n"
- "fmla v24.4s, v30.4s, v22.4s\n"
- "ldr q26, [x16, x27]\n"
- "fmla v17.4s, v31.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 256]\n"
- "fmla v15.4s, v31.4s, v8.4s\n"
- "fmla v1.4s, v31.4s, v7.4s\n"
- "fmla v21.4s, v31.4s, v9.4s\n"
- "ldr q31, [x15, x27]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "ldr x15, [%[inptrs], 216]\n"
- "fmla v10.4s, v26.4s, v5.4s\n"
- "ldr q29, [x7, x27]\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "ldr q28, [x24, x27]\n"
- "fmla v13.4s, v31.4s, v3.4s\n"
- "ldr x7, [%[inptrs], 176]\n"
- "fmla v14.4s, v31.4s, v6.4s\n"
- "ldr x24, [%[inptrs], 136]\n"
- "fmla v12.4s, v31.4s, v4.4s\n"
- "fmla v10.4s, v31.4s, v7.4s\n"
- "fmla v11.4s, v31.4s, v5.4s\n"
- "fmla v20.4s, v31.4s, v19.4s\n"
- "fmla v0.4s, v29.4s, v3.4s\n"
- "ldr q25, [x17, x27]\n"
- "fmla v15.4s, v29.4s, v4.4s\n"
- "fmla v21.4s, v29.4s, v5.4s\n"
- "fmla v12.4s, v29.4s, v6.4s\n"
- "fmla v10.4s, v29.4s, v8.4s\n"
- "fmla v11.4s, v29.4s, v7.4s\n"
- "fmla v20.4s, v29.4s, v9.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v23.4s, v29.4s, v22.4s\n"
- "fmla v17.4s, v28.4s, v3.4s\n"
- "ldr q29, [x16, x27]\n"
- "fmla v15.4s, v28.4s, v6.4s\n"
- "ldr q22, [x15, x27]\n"
- "fmla v1.4s, v28.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 264]\n"
- "fmla v11.4s, v28.4s, v8.4s\n"
- "ldr x15, [%[inptrs], 224]\n"
- "fmla v21.4s, v28.4s, v7.4s\n"
- "fmla v24.4s, v28.4s, v9.4s\n"
- "fmla v14.4s, v29.4s, v3.4s\n"
- "ldr q27, [x7, x27]\n"
- "fmla v1.4s, v25.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 184]\n"
- "fmla v10.4s, v29.4s, v4.4s\n"
- "fmla v20.4s, v29.4s, v5.4s\n"
- "fmla v21.4s, v25.4s, v8.4s\n"
- "ldr q26, [x24, x27]\n"
- "fmla v12.4s, v22.4s, v3.4s\n"
- "ldr q25, [x16, x27]\n"
- "fmla v11.4s, v22.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 272]\n"
- "fmla v10.4s, v22.4s, v6.4s\n"
- "fmla v20.4s, v22.4s, v7.4s\n"
- "fmla v24.4s, v22.4s, v5.4s\n"
- "fmla v23.4s, v22.4s, v19.4s\n"
- "fmla v15.4s, v27.4s, v3.4s\n"
- "ldr q31, [x15, x27]\n"
- "fmla v11.4s, v27.4s, v6.4s\n"
- "ldr q22, [x7, x27]\n"
- "fmla v21.4s, v27.4s, v4.4s\n"
- "ldr x15, [%[inptrs], 232]\n"
- "fmla v20.4s, v27.4s, v8.4s\n"
- "fmla v24.4s, v27.4s, v7.4s\n"
- "fmla v23.4s, v27.4s, v9.4s\n"
- "ldr q19, [x16, x27]\n"
- "fmla v1.4s, v26.4s, v3.4s\n"
- "ldr q28, [x15, x27]\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 280]\n"
- "fmla v24.4s, v26.4s, v8.4s\n"
- "fmla v10.4s, v25.4s, v3.4s\n"
- "fmla v20.4s, v25.4s, v4.4s\n"
- "ldr q30, [x16, x27]\n"
- "fmla v23.4s, v25.4s, v5.4s\n"
- "add x27, x27, #16\n"
- "fmla v11.4s, v31.4s, v3.4s\n"
- "fmla v21.4s, v22.4s, v3.4s\n"
- "fmla v24.4s, v31.4s, v4.4s\n"
- "movi v29.16b, #0\n"
- "fmla v20.4s, v31.4s, v6.4s\n"
- "fmla v23.4s, v31.4s, v7.4s\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "fmla v24.4s, v22.4s, v6.4s\n"
- "fmax v17.4s, v17.4s, v29.4s\n"
- "fmla v20.4s, v19.4s, v3.4s\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "str q2, [x20, x28]\n"
- "fmla v23.4s, v22.4s, v8.4s\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 8]\n"
- "fmla v24.4s, v28.4s, v3.4s\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "str q18, [x20, x28]\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "str q16, [x21, x28]\n"
- "fmla v23.4s, v19.4s, v4.4s\n"
- "fmax v21.4s, v21.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 16]\n"
- "fmax v13.4s, v13.4s, v29.4s\n"
- "ldr x21, [%[outptrs], 40]\n"
- "str q17, [x20, x28]\n"
- "fmax v12.4s, v12.4s, v29.4s\n"
- "str q0, [x21, x28]\n"
- "fmla v23.4s, v28.4s, v6.4s\n"
- "str q13, [x22, x28]\n"
- "fmax v11.4s, v11.4s, v29.4s\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 24]\n"
- "fmax v14.4s, v14.4s, v29.4s\n"
- "ldr x21, [%[outptrs], 48]\n"
- "str q1, [x20, x28]\n"
- "fmla v23.4s, v30.4s, v3.4s\n"
- "str q15, [x21, x28]\n"
- "fmax v10.4s, v10.4s, v29.4s\n"
- "str q14, [x23, x28]\n"
- "fmax v20.4s, v20.4s, v29.4s\n"
- "ldr x21, [%[outptrs], 56]\n"
- "ldr x22, [%[outptrs], 72]\n"
- "ldr x23, [%[outptrs], 104]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "str q21, [x21, x28]\n"
- "str q12, [x22, x28]\n"
- "str q10, [x23, x28]\n"
- "ldr x22, [%[outptrs], 80]\n"
- "ldr x23, [%[outptrs], 112]\n"
- "str q11, [x22, x28]\n"
- "str q20, [x23, x28]\n"
- "ldr x22, [%[outptrs], 88]\n"
- "ldr x23, [%[outptrs], 120]\n"
- "str q24, [x22, x28]\n"
- "str q23, [x23, x28]\n"
- "add x28, x28, #16\n"
- "4:\n"
- "cbz x19, 7f\n"
- "ldr s25, [%[wbptr]]\n"
- "mov v2.16b, v25.16b\n"
- "ldr s22, [%[wbptr], #4]\n"
- "mov v16.16b, v25.16b\n"
- "ldr s9, [%[wbptr], #8]\n"
- "mov v18.16b, v25.16b\n"
- "ldr s8, [%[wbptr], #12]\n"
- "mov v13.16b, v25.16b\n"
- "ldr s19, [%[wbptr], #16]\n"
- "mov v0.16b, v25.16b\n"
- "ldr s7, [%[wbptr], #20]\n"
- "mov v17.16b, v25.16b\n"
- "ldr s6, [%[wbptr], #24]\n"
- "mov v14.16b, v25.16b\n"
- "ldr s5, [%[wbptr], #28]\n"
- "mov v12.16b, v25.16b\n"
- "ldr s4, [%[wbptr], #32]\n"
- "mov v15.16b, v25.16b\n"
- "ldr s3, [%[wbptr], #36]\n"
- "ldr x25, [%[inptrs], 0]\n"
- "ldr x17, [%[inptrs], 48]\n"
- "ldr x24, [%[inptrs], 96]\n"
- "ldr x7, [%[inptrs], 144]\n"
- "subs x19, x19, #1\n"
- "ldr s27, [x25, x27]\n"
- "fmla v2.4s, v27.4s, v22.4s\n"
- "ldr s26, [x17, x27]\n"
- "fmla v16.4s, v26.4s, v22.4s\n"
- "ldr s28, [x24, x27]\n"
- "ldr s27, [x7, x27]\n"
- "ldr x25, [%[inptrs], 8]\n"
- "ldr x17, [%[inptrs], 56]\n"
- "ldr x24, [%[inptrs], 104]\n"
- "ldr s31, [x25, x27]\n"
- "fmla v2.4s, v26.4s, v19.4s\n"
- "ldr s29, [x17, x27]\n"
- "ldr s21, [x24, x27]\n"
- "ldr x25, [%[inptrs], 16]\n"
- "ldr s30, [x25, x27]\n"
- "fmla v2.4s, v31.4s, v9.4s\n"
- "beq 6f\n"
- "5:\n"
- "mov v1.16b, v25.16b\n"
- "ldr x17, [%[inptrs], 64]\n"
- "mov v10.16b, v25.16b\n"
- "ldr x25, [%[inptrs], 24]\n"
- "fmla v18.4s, v31.4s, v22.4s\n"
- "ldr s23, [x17, x27]\n"
- "fmla v2.4s, v28.4s, v5.4s\n"
- "ldr x15, [%[inptrs], 192]\n"
- "fmla v16.4s, v28.4s, v19.4s\n"
- "ldr x7, [%[inptrs], 152]\n"
- "fmla v13.4s, v28.4s, v22.4s\n"
- "ldr s26, [x25, x27]\n"
- "fmla v18.4s, v29.4s, v19.4s\n"
- "ldr x24, [%[inptrs], 112]\n"
- "fmla v2.4s, v29.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 72]\n"
- "fmla v16.4s, v29.4s, v9.4s\n"
- "ldr x25, [%[inptrs], 32]\n"
- "fmla v0.4s, v29.4s, v22.4s\n"
- "ldr s28, [x15, x27]\n"
- "fmla v18.4s, v30.4s, v9.4s\n"
- "ldr x16, [%[inptrs], 240]\n"
- "fmla v2.4s, v30.4s, v8.4s\n"
- "ldr x15, [%[inptrs], 200]\n"
- "fmla v17.4s, v30.4s, v22.4s\n"
- "ldr s29, [x7, x27]\n"
- "fmla v16.4s, v27.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 160]\n"
- "fmla v13.4s, v27.4s, v19.4s\n"
- "ldr x20, [%[outptrs], 0]\n"
- "fmla v14.4s, v27.4s, v22.4s\n"
- "ldr s20, [x24, x27]\n"
- "fmla v2.4s, v21.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 120]\n"
- "fmla v16.4s, v21.4s, v7.4s\n"
- "ldr x21, [%[outptrs], 32]\n"
- "fmla v18.4s, v21.4s, v5.4s\n"
- "ldr x22, [%[outptrs], 64]\n"
- "fmla v13.4s, v21.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 96]\n"
- "fmla v0.4s, v21.4s, v19.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v12.4s, v21.4s, v22.4s\n"
- "ldr s24, [x17, x27]\n"
- "fmla v2.4s, v23.4s, v6.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v23.4s, v8.4s\n"
- "ldr x17, [%[inptrs], 80]\n"
- "fmla v18.4s, v23.4s, v7.4s\n"
- "subs x19, x19, #1\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "fmla v17.4s, v23.4s, v19.4s\n"
- "fmla v15.4s, v23.4s, v22.4s\n"
- "ldr s23, [x25, x27]\n"
- "fmla v1.4s, v26.4s, v22.4s\n"
- "ldr x25, [%[inptrs], 40]\n"
- "fmla v18.4s, v26.4s, v8.4s\n"
- "fmla v13.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v26.4s, v9.4s\n"
- "ldr s30, [x16, x27]\n"
- "fmla v14.4s, v28.4s, v19.4s\n"
- "ldr s26, [x15, x27]\n"
- "fmla v16.4s, v29.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 248]\n"
- "fmla v13.4s, v29.4s, v7.4s\n"
- "ldr x15, [%[inptrs], 208]\n"
- "fmla v0.4s, v29.4s, v5.4s\n"
- "fmla v12.4s, v29.4s, v19.4s\n"
- "fmla v14.4s, v29.4s, v9.4s\n"
- "fmla v10.4s, v29.4s, v22.4s\n"
- "mov v11.16b, v25.16b\n"
- "fmla v2.4s, v20.4s, v3.4s\n"
- "fmla v16.4s, v20.4s, v6.4s\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "fmla v13.4s, v20.4s, v8.4s\n"
- "fmla v0.4s, v20.4s, v7.4s\n"
- "fmla v17.4s, v20.4s, v5.4s\n"
- "fmla v12.4s, v20.4s, v9.4s\n"
- "fmla v15.4s, v20.4s, v19.4s\n"
- "fmla v11.4s, v20.4s, v22.4s\n"
- "mov v21.16b, v25.16b\n"
- "fmla v18.4s, v24.4s, v6.4s\n"
- "fmla v0.4s, v24.4s, v8.4s\n"
- "fmla v1.4s, v24.4s, v19.4s\n"
- "fmla v17.4s, v24.4s, v7.4s\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "mov v20.16b, v25.16b\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "fmla v21.4s, v24.4s, v22.4s\n"
- "ldr s27, [x7, x27]\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 168]\n"
- "fmla v17.4s, v23.4s, v8.4s\n"
- "ldr s30, [x24, x27]\n"
- "fmla v13.4s, v26.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 128]\n"
- "fmla v14.4s, v26.4s, v7.4s\n"
- "fmla v12.4s, v26.4s, v5.4s\n"
- "fmla v10.4s, v26.4s, v19.4s\n"
- "ldr s31, [x17, x27]\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "ldr x17, [%[inptrs], 88]\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v0.4s, v27.4s, v4.4s\n"
- "fmla v14.4s, v27.4s, v8.4s\n"
- "fmla v12.4s, v27.4s, v7.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "fmla v10.4s, v27.4s, v9.4s\n"
- "fmla v11.4s, v27.4s, v19.4s\n"
- "fmla v20.4s, v27.4s, v22.4s\n"
- "mov v24.16b, v25.16b\n"
- "mov v23.16b, v25.16b\n"
- "fmla v18.4s, v30.4s, v3.4s\n"
- "fmla v0.4s, v30.4s, v6.4s\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "fmla v12.4s, v30.4s, v8.4s\n"
- "fmla v15.4s, v30.4s, v7.4s\n"
- "fmla v1.4s, v30.4s, v5.4s\n"
- "fmla v11.4s, v30.4s, v9.4s\n"
- "fmla v21.4s, v30.4s, v19.4s\n"
- "fmla v24.4s, v30.4s, v22.4s\n"
- "ldr s25, [x25, x27]\n"
- "fmla v17.4s, v31.4s, v6.4s\n"
- "ldr x25, [%[inptrs], 0]\n"
- "fmla v15.4s, v31.4s, v8.4s\n"
- "fmla v1.4s, v31.4s, v7.4s\n"
- "fmla v21.4s, v31.4s, v9.4s\n"
- "ldr s26, [x16, x27]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 256]\n"
- "fmla v10.4s, v26.4s, v5.4s\n"
- "ldr s31, [x15, x27]\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "ldr s29, [x7, x27]\n"
- "fmla v13.4s, v31.4s, v3.4s\n"
- "ldr x15, [%[inptrs], 216]\n"
- "fmla v14.4s, v31.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 176]\n"
- "fmla v12.4s, v31.4s, v4.4s\n"
- "fmla v10.4s, v31.4s, v7.4s\n"
- "fmla v11.4s, v31.4s, v5.4s\n"
- "fmla v20.4s, v31.4s, v19.4s\n"
- "fmla v0.4s, v29.4s, v3.4s\n"
- "ldr s28, [x24, x27]\n"
- "fmla v15.4s, v29.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 136]\n"
- "fmla v12.4s, v29.4s, v6.4s\n"
- "fmla v10.4s, v29.4s, v8.4s\n"
- "fmla v11.4s, v29.4s, v7.4s\n"
- "fmla v21.4s, v29.4s, v5.4s\n"
- "fmla v20.4s, v29.4s, v9.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v23.4s, v29.4s, v22.4s\n"
- "ldr s25, [x17, x27]\n"
- "fmla v17.4s, v28.4s, v3.4s\n"
- "ldr s29, [x16, x27]\n"
- "fmla v15.4s, v28.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 264]\n"
- "fmla v1.4s, v28.4s, v4.4s\n"
- "ldr x17, [%[inptrs], 48]\n"
- "fmla v11.4s, v28.4s, v8.4s\n"
- "fmla v21.4s, v28.4s, v7.4s\n"
- "fmla v24.4s, v28.4s, v9.4s\n"
- "ldr s22, [x15, x27]\n"
- "fmla v14.4s, v29.4s, v3.4s\n"
- "ldr x15, [%[inptrs], 224]\n"
- "fmla v1.4s, v25.4s, v6.4s\n"
- "fmla v10.4s, v29.4s, v4.4s\n"
- "fmla v21.4s, v25.4s, v8.4s\n"
- "ldr s27, [x7, x27]\n"
- "fmla v20.4s, v29.4s, v5.4s\n"
- "ldr s26, [x24, x27]\n"
- "fmla v12.4s, v22.4s, v3.4s\n"
- "ldr x7, [%[inptrs], 184]\n"
- "fmla v10.4s, v22.4s, v6.4s\n"
- "ldr x24, [%[inptrs], 96]\n"
- "fmla v11.4s, v22.4s, v4.4s\n"
- "fmla v24.4s, v22.4s, v5.4s\n"
- "fmla v20.4s, v22.4s, v7.4s\n"
- "fmla v23.4s, v22.4s, v19.4s\n"
- "fmla v15.4s, v27.4s, v3.4s\n"
- "ldr s25, [x16, x27]\n"
- "fmla v21.4s, v27.4s, v4.4s\n"
- "ldr s31, [x15, x27]\n"
- "fmla v11.4s, v27.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 272]\n"
- "fmla v20.4s, v27.4s, v8.4s\n"
- "ldr x15, [%[inptrs], 232]\n"
- "fmla v24.4s, v27.4s, v7.4s\n"
- "fmla v23.4s, v27.4s, v9.4s\n"
- "fmla v1.4s, v26.4s, v3.4s\n"
- "ldr s22, [x7, x27]\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr s19, [x16, x27]\n"
- "fmla v10.4s, v25.4s, v3.4s\n"
- "ldr x16, [%[inptrs], 280]\n"
- "fmla v24.4s, v26.4s, v8.4s\n"
- "ldr s28, [x15, x27]\n"
- "fmla v20.4s, v25.4s, v4.4s\n"
- "ldr x7, [%[inptrs], 144]\n"
- "fmla v23.4s, v25.4s, v5.4s\n"
- "ldr s30, [x16, x27]\n"
- "fmla v11.4s, v31.4s, v3.4s\n"
- "add x27, x27, #4\n"
- "fmla v24.4s, v31.4s, v4.4s\n"
- "ldr s27, [x25, x27]\n"
- "fmla v20.4s, v31.4s, v6.4s\n"
- "ldr x25, [%[inptrs], 8]\n"
- "fmla v23.4s, v31.4s, v7.4s\n"
- "movi v29.16b, #0\n"
- "fmla v21.4s, v22.4s, v3.4s\n"
- "ldr s26, [x17, x27]\n"
- "fmla v24.4s, v22.4s, v6.4s\n"
- "ldr x17, [%[inptrs], 56]\n"
- "fmla v20.4s, v19.4s, v3.4s\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmla v23.4s, v22.4s, v8.4s\n"
- "ldr s25, [%[wbptr]]\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "ldr s22, [%[wbptr], #4]\n"
- "str s2, [x20, x28]\n"
- "fmla v24.4s, v28.4s, v3.4s\n"
- "fmax v17.4s, v17.4s, v29.4s\n"
- "ldr s9, [%[wbptr], #8]\n"
- "fmla v23.4s, v19.4s, v4.4s\n"
- "ldr s8, [%[wbptr], #12]\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "ldr s19, [%[wbptr], #16]\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 8]\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "str s18, [x20, x28]\n"
- "fmla v23.4s, v28.4s, v6.4s\n"
- "str s16, [x21, x28]\n"
- "fmax v21.4s, v21.4s, v29.4s\n"
- "fmax v13.4s, v13.4s, v29.4s\n"
- "ldr s7, [%[wbptr], #20]\n"
- "fmax v12.4s, v12.4s, v29.4s\n"
- "ldr s5, [%[wbptr], #28]\n"
- "fmla v23.4s, v30.4s, v3.4s\n"
- "ldr s6, [%[wbptr], #24]\n"
- "str s13, [x22, x28]\n"
- "fmax v11.4s, v11.4s, v29.4s\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "ldr s4, [%[wbptr], #32]\n"
- "fmax v14.4s, v14.4s, v29.4s\n"
- "ldr s31, [x25, x27]\n"
- "fmax v10.4s, v10.4s, v29.4s\n"
- "ldr s3, [%[wbptr], #36]\n"
- "fmax v20.4s, v20.4s, v29.4s\n"
- "ldr s28, [x24, x27]\n"
- "str s14, [x23, x28]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "mov v2.16b, v25.16b\n"
- "ldr s29, [x17, x27]\n"
- "ldr x20, [%[outptrs], 16]\n"
- "ldr x21, [%[outptrs], 40]\n"
- "ldr x22, [%[outptrs], 72]\n"
- "ldr x23, [%[outptrs], 104]\n"
- "ldr x25, [%[inptrs], 16]\n"
- "ldr x24, [%[inptrs], 104]\n"
- "str s17, [x20, x28]\n"
- "mov v16.16b, v25.16b\n"
- "str s0, [x21, x28]\n"
- "mov v18.16b, v25.16b\n"
- "str s12, [x22, x28]\n"
- "mov v13.16b, v25.16b\n"
- "str s10, [x23, x28]\n"
- "mov v0.16b, v25.16b\n"
- "fmla v2.4s, v27.4s, v22.4s\n"
- "ldr s30, [x25, x27]\n"
- "fmla v16.4s, v26.4s, v22.4s\n"
- "ldr x20, [%[outptrs], 24]\n"
- "mov v17.16b, v25.16b\n"
- "ldr x21, [%[outptrs], 48]\n"
- "str s1, [x20, x28]\n"
- "mov v14.16b, v25.16b\n"
- "str s15, [x21, x28]\n"
- "mov v12.16b, v25.16b\n"
- "mov v15.16b, v25.16b\n"
- "ldr x21, [%[outptrs], 56]\n"
- "fmla v2.4s, v26.4s, v19.4s\n"
- "ldr s27, [x7, x27]\n"
- "str s21, [x21, x28]\n"
- "ldr x22, [%[outptrs], 80]\n"
- "ldr s21, [x24, x27]\n"
- "ldr x23, [%[outptrs], 112]\n"
- "str s11, [x22, x28]\n"
- "fmla v2.4s, v31.4s, v9.4s\n"
- "str s20, [x23, x28]\n"
- "ldr x22, [%[outptrs], 88]\n"
- "ldr x23, [%[outptrs], 120]\n"
- "str s24, [x22, x28]\n"
- "str s23, [x23, x28]\n"
- "add x28, x28, #4\n"
- "bne 5b\n"
- "6:\n"
- "mov v1.16b, v25.16b\n"
- "ldr x17, [%[inptrs], 64]\n"
- "mov v10.16b, v25.16b\n"
- "ldr x25, [%[inptrs], 24]\n"
- "mov v11.16b, v25.16b\n"
- "ldr x15, [%[inptrs], 192]\n"
- "fmla v18.4s, v31.4s, v22.4s\n"
- "ldr s23, [x17, x27]\n"
- "fmla v2.4s, v28.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 152]\n"
- "fmla v16.4s, v28.4s, v19.4s\n"
- "ldr x24, [%[inptrs], 112]\n"
- "fmla v13.4s, v28.4s, v22.4s\n"
- "ldr s26, [x25, x27]\n"
- "fmla v18.4s, v29.4s, v19.4s\n"
- "ldr x17, [%[inptrs], 72]\n"
- "fmla v2.4s, v29.4s, v7.4s\n"
- "ldr x25, [%[inptrs], 32]\n"
- "fmla v16.4s, v29.4s, v9.4s\n"
- "ldr x16, [%[inptrs], 240]\n"
- "fmla v0.4s, v29.4s, v22.4s\n"
- "ldr s28, [x15, x27]\n"
- "fmla v18.4s, v30.4s, v9.4s\n"
- "ldr x15, [%[inptrs], 200]\n"
- "fmla v2.4s, v30.4s, v8.4s\n"
- "ldr x20, [%[outptrs], 0]\n"
- "fmla v17.4s, v30.4s, v22.4s\n"
- "ldr s29, [x7, x27]\n"
- "fmla v16.4s, v27.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 160]\n"
- "fmla v13.4s, v27.4s, v19.4s\n"
- "ldr x21, [%[outptrs], 32]\n"
- "fmla v14.4s, v27.4s, v22.4s\n"
- "ldr s20, [x24, x27]\n"
- "fmla v2.4s, v21.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 120]\n"
- "fmla v16.4s, v21.4s, v7.4s\n"
- "ldr x22, [%[outptrs], 64]\n"
- "fmla v18.4s, v21.4s, v5.4s\n"
- "ldr x23, [%[outptrs], 96]\n"
- "fmla v13.4s, v21.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v0.4s, v21.4s, v19.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v12.4s, v21.4s, v22.4s\n"
- "ldr s24, [x17, x27]\n"
- "fmla v2.4s, v23.4s, v6.4s\n"
- "ldr x17, [%[inptrs], 80]\n"
- "fmla v16.4s, v23.4s, v8.4s\n"
- "fmla v18.4s, v23.4s, v7.4s\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "fmla v17.4s, v23.4s, v19.4s\n"
- "fmla v15.4s, v23.4s, v22.4s\n"
- "ldr s23, [x25, x27]\n"
- "fmla v1.4s, v26.4s, v22.4s\n"
- "ldr x25, [%[inptrs], 40]\n"
- "fmla v18.4s, v26.4s, v8.4s\n"
- "fmla v13.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v26.4s, v9.4s\n"
- "ldr s30, [x16, x27]\n"
- "fmla v14.4s, v28.4s, v19.4s\n"
- "ldr s26, [x15, x27]\n"
- "fmla v16.4s, v29.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 248]\n"
- "fmla v13.4s, v29.4s, v7.4s\n"
- "ldr x15, [%[inptrs], 208]\n"
- "fmla v0.4s, v29.4s, v5.4s\n"
- "fmla v12.4s, v29.4s, v19.4s\n"
- "fmla v14.4s, v29.4s, v9.4s\n"
- "fmla v10.4s, v29.4s, v22.4s\n"
- "mov v21.16b, v25.16b\n"
- "fmla v2.4s, v20.4s, v3.4s\n"
- "fmla v16.4s, v20.4s, v6.4s\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "fmla v13.4s, v20.4s, v8.4s\n"
- "fmla v0.4s, v20.4s, v7.4s\n"
- "fmla v17.4s, v20.4s, v5.4s\n"
- "fmla v12.4s, v20.4s, v9.4s\n"
- "fmla v15.4s, v20.4s, v19.4s\n"
- "fmla v11.4s, v20.4s, v22.4s\n"
- "mov v20.16b, v25.16b\n"
- "fmla v18.4s, v24.4s, v6.4s\n"
- "fmla v0.4s, v24.4s, v8.4s\n"
- "fmla v1.4s, v24.4s, v19.4s\n"
- "fmla v17.4s, v24.4s, v7.4s\n"
- "fmla v21.4s, v24.4s, v22.4s\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "ldr s27, [x7, x27]\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "ldr s30, [x24, x27]\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 168]\n"
- "fmla v17.4s, v23.4s, v8.4s\n"
- "ldr s31, [x17, x27]\n"
- "fmla v13.4s, v26.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 128]\n"
- "fmla v14.4s, v26.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 88]\n"
- "fmla v12.4s, v26.4s, v5.4s\n"
- "fmla v10.4s, v26.4s, v19.4s\n"
- "mov v24.16b, v25.16b\n"
- "mov v23.16b, v25.16b\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v0.4s, v27.4s, v4.4s\n"
- "fmla v14.4s, v27.4s, v8.4s\n"
- "fmla v12.4s, v27.4s, v7.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "fmla v10.4s, v27.4s, v9.4s\n"
- "fmla v11.4s, v27.4s, v19.4s\n"
- "fmla v20.4s, v27.4s, v22.4s\n"
- "ldr s25, [x25, x27]\n"
- "fmla v18.4s, v30.4s, v3.4s\n"
- "fmla v0.4s, v30.4s, v6.4s\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "fmla v12.4s, v30.4s, v8.4s\n"
- "fmla v15.4s, v30.4s, v7.4s\n"
- "fmla v1.4s, v30.4s, v5.4s\n"
- "fmla v11.4s, v30.4s, v9.4s\n"
- "fmla v21.4s, v30.4s, v19.4s\n"
- "fmla v24.4s, v30.4s, v22.4s\n"
- "ldr s26, [x16, x27]\n"
- "fmla v17.4s, v31.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 256]\n"
- "fmla v15.4s, v31.4s, v8.4s\n"
- "fmla v1.4s, v31.4s, v7.4s\n"
- "fmla v21.4s, v31.4s, v9.4s\n"
- "ldr s31, [x15, x27]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "ldr x15, [%[inptrs], 216]\n"
- "fmla v10.4s, v26.4s, v5.4s\n"
- "ldr s29, [x7, x27]\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "ldr s28, [x24, x27]\n"
- "fmla v13.4s, v31.4s, v3.4s\n"
- "ldr x7, [%[inptrs], 176]\n"
- "fmla v14.4s, v31.4s, v6.4s\n"
- "ldr x24, [%[inptrs], 136]\n"
- "fmla v12.4s, v31.4s, v4.4s\n"
- "fmla v10.4s, v31.4s, v7.4s\n"
- "fmla v11.4s, v31.4s, v5.4s\n"
- "fmla v20.4s, v31.4s, v19.4s\n"
- "fmla v0.4s, v29.4s, v3.4s\n"
- "ldr s25, [x17, x27]\n"
- "fmla v15.4s, v29.4s, v4.4s\n"
- "fmla v21.4s, v29.4s, v5.4s\n"
- "fmla v12.4s, v29.4s, v6.4s\n"
- "fmla v10.4s, v29.4s, v8.4s\n"
- "fmla v11.4s, v29.4s, v7.4s\n"
- "fmla v20.4s, v29.4s, v9.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v23.4s, v29.4s, v22.4s\n"
- "fmla v17.4s, v28.4s, v3.4s\n"
- "ldr s29, [x16, x27]\n"
- "fmla v15.4s, v28.4s, v6.4s\n"
- "ldr s22, [x15, x27]\n"
- "fmla v1.4s, v28.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 264]\n"
- "fmla v11.4s, v28.4s, v8.4s\n"
- "ldr x15, [%[inptrs], 224]\n"
- "fmla v21.4s, v28.4s, v7.4s\n"
- "fmla v24.4s, v28.4s, v9.4s\n"
- "fmla v14.4s, v29.4s, v3.4s\n"
- "ldr s27, [x7, x27]\n"
- "fmla v1.4s, v25.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 184]\n"
- "fmla v10.4s, v29.4s, v4.4s\n"
- "fmla v20.4s, v29.4s, v5.4s\n"
- "fmla v21.4s, v25.4s, v8.4s\n"
- "ldr s26, [x24, x27]\n"
- "fmla v12.4s, v22.4s, v3.4s\n"
- "ldr s25, [x16, x27]\n"
- "fmla v11.4s, v22.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 272]\n"
- "fmla v10.4s, v22.4s, v6.4s\n"
- "fmla v20.4s, v22.4s, v7.4s\n"
- "fmla v24.4s, v22.4s, v5.4s\n"
- "fmla v23.4s, v22.4s, v19.4s\n"
- "fmla v15.4s, v27.4s, v3.4s\n"
- "ldr s31, [x15, x27]\n"
- "fmla v11.4s, v27.4s, v6.4s\n"
- "ldr s22, [x7, x27]\n"
- "fmla v21.4s, v27.4s, v4.4s\n"
- "ldr x15, [%[inptrs], 232]\n"
- "fmla v20.4s, v27.4s, v8.4s\n"
- "fmla v24.4s, v27.4s, v7.4s\n"
- "fmla v23.4s, v27.4s, v9.4s\n"
- "ldr s19, [x16, x27]\n"
- "fmla v1.4s, v26.4s, v3.4s\n"
- "ldr s28, [x15, x27]\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 280]\n"
- "fmla v24.4s, v26.4s, v8.4s\n"
- "fmla v10.4s, v25.4s, v3.4s\n"
- "fmla v20.4s, v25.4s, v4.4s\n"
- "ldr s30, [x16, x27]\n"
- "fmla v23.4s, v25.4s, v5.4s\n"
- "add x27, x27, #4\n"
- "fmla v11.4s, v31.4s, v3.4s\n"
- "fmla v21.4s, v22.4s, v3.4s\n"
- "fmla v24.4s, v31.4s, v4.4s\n"
- "movi v29.16b, #0\n"
- "fmla v20.4s, v31.4s, v6.4s\n"
- "fmla v23.4s, v31.4s, v7.4s\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "fmla v24.4s, v22.4s, v6.4s\n"
- "fmax v17.4s, v17.4s, v29.4s\n"
- "fmla v20.4s, v19.4s, v3.4s\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "str s2, [x20, x28]\n"
- "fmla v23.4s, v22.4s, v8.4s\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 8]\n"
- "fmla v24.4s, v28.4s, v3.4s\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "str s18, [x20, x28]\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "str s16, [x21, x28]\n"
- "fmla v23.4s, v19.4s, v4.4s\n"
- "fmax v21.4s, v21.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 16]\n"
- "fmax v13.4s, v13.4s, v29.4s\n"
- "ldr x21, [%[outptrs], 40]\n"
- "str s17, [x20, x28]\n"
- "fmax v12.4s, v12.4s, v29.4s\n"
- "str s0, [x21, x28]\n"
- "fmla v23.4s, v28.4s, v6.4s\n"
- "str s13, [x22, x28]\n"
- "fmax v11.4s, v11.4s, v29.4s\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 24]\n"
- "fmax v14.4s, v14.4s, v29.4s\n"
- "ldr x21, [%[outptrs], 48]\n"
- "str s1, [x20, x28]\n"
- "fmla v23.4s, v30.4s, v3.4s\n"
- "str s15, [x21, x28]\n"
- "fmax v10.4s, v10.4s, v29.4s\n"
- "str s14, [x23, x28]\n"
- "fmax v20.4s, v20.4s, v29.4s\n"
- "ldr x21, [%[outptrs], 56]\n"
- "ldr x22, [%[outptrs], 72]\n"
- "ldr x23, [%[outptrs], 104]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "str s21, [x21, x28]\n"
- "str s12, [x22, x28]\n"
- "str s10, [x23, x28]\n"
- "ldr x22, [%[outptrs], 80]\n"
- "ldr x23, [%[outptrs], 112]\n"
- "str s11, [x22, x28]\n"
- "str s20, [x23, x28]\n"
- "ldr x22, [%[outptrs], 88]\n"
- "ldr x23, [%[outptrs], 120]\n"
- "str s24, [x22, x28]\n"
- "str s23, [x23, x28]\n"
- "add x28, x28, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr)
- : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
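(Editor's note, not part of the diff: the removed assembly above is dense, so here is a minimal scalar sketch of what the tile kernel appears to compute, to aid review. This is an inference, not the library's implementation: the inptrs table spans offsets 0..280 (a 6x6 patch of input pointers) and the outptrs table spans 0..120 (a 4x4 output tile), the wbptr block advances by 40 bytes per channel in the scalar tail (one bias plus nine 3x3 weights), and the fmax-against-zero before each store is the fused ReLU. The function name, the element-unit strides, and the channels-innermost layout below are all assumptions made for illustration.)

#include <algorithm>

// Hypothetical scalar reference for one tile: 3x3 depthwise convolution,
// unit stride, 6x6 input patch -> 4x4 output tile, activation fused into
// the store. Strides are assumed to be in float elements and channels are
// assumed innermost, mirroring the per-channel offset (x27/x28) in the asm.
static void execute_tile_reference(
    int          n_channels,
    const float *weight_bias_ptr,   // per channel: [bias, w0..w8] (inferred layout)
    const float *input,             // element [i][j] at i*row_stride + j*col_stride
    unsigned int input_row_stride,
    unsigned int input_col_stride,
    float       *output,
    unsigned int output_row_stride,
    unsigned int output_col_stride,
    float        clamp_hi)          // upper clamp: huge value for ReLU, 6.0f for ReLU6
{
    for (int c = 0; c < n_channels; c++)
    {
        const float  bias = weight_bias_ptr[0];
        const float *w    = weight_bias_ptr + 1;
        for (int oi = 0; oi < 4; oi++)
        {
            for (int oj = 0; oj < 4; oj++)
            {
                float acc = bias;
                // Accumulate the 3x3 window, as the fmla chains do per lane.
                for (int ki = 0; ki < 3; ki++)
                {
                    for (int kj = 0; kj < 3; kj++)
                    {
                        acc += w[ki * 3 + kj] *
                               input[(oi + ki) * input_row_stride +
                                     (oj + kj) * input_col_stride + c];
                    }
                }
                // Fused activation before the store: fmax against zero,
                // and (for ReLU6) fmin against 6.0.
                acc = std::min(std::max(acc, 0.0f), clamp_hi);
                output[oi * output_row_stride + oj * output_col_stride + c] = acc;
            }
        }
        weight_bias_ptr += 10; // past bias + 9 weights, cf. "add %[wbptr], %[wbptr], #40"
    }
}

(For the ReLU specialisation above, clamp_hi would be effectively unbounded, e.g. std::numeric_limits<float>::infinity(); the ReLU6 specialisation that follows corresponds to clamp_hi = 6.0f, matching its "fmov v27.4s, #6.0" / fmin pair. The vectorised loops in the assembly simply process four channels per iteration with q registers before this scalar-equivalent tail.)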
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x24, %[inptr0], %[input_row_stride]\n"
- "add x13, %[input_col_stride1], %[input_col_stride1]\n"
- "add x8, %[outptr0], %[output_row_stride]\n"
- "add x9, x24, %[input_row_stride]\n"
- "add x10, x13, #64\n"
- "add x19, x13, %[input_col_stride1]\n"
- "add x20, x9, %[input_row_stride]\n"
- "add x21, x19, #64\n"
- "add x17, x19, %[input_col_stride1]\n"
- "add x22, x20, %[input_row_stride]\n"
- "add x7, x17, #64\n"
- "add x11, x17, %[input_col_stride1]\n"
- "add x23, x22, %[input_row_stride]\n"
- "add x12, x11, #64\n"
- "add x25, x8, %[output_row_stride]\n"
- "add x26, x25, %[output_row_stride]\n"
- "add x27, %[output_col_stride1], %[output_col_stride1]\n"
- "and x14, %[n_channels], #3\n"
- "add x28, x27, %[output_col_stride1]\n"
- "lsr x15, %[n_channels], #2\n"
- "cbz x15, 4f\n"
- "1:\n"
- "ldr q23, [%[wbptr]]\n"
- "subs x15, x15, #1\n"
- "mov v12.16b, v23.16b\n"
- "ldr q20, [%[wbptr], #16]\n"
- "mov v8.16b, v23.16b\n"
- "ldr q6, [%[wbptr], #32]\n"
- "mov v11.16b, v23.16b\n"
- "ldr q5, [%[wbptr], #48]\n"
- "mov v16.16b, v23.16b\n"
- "ldr q19, [%[wbptr], #64]\n"
- "mov v7.16b, v23.16b\n"
- "ldr q4, [%[wbptr], #80]\n"
- "mov v10.16b, v23.16b\n"
- "ldr q3, [%[wbptr], #96]\n"
- "mov v14.16b, v23.16b\n"
- "ldr q2, [%[wbptr], #112]\n"
- "mov v15.16b, v23.16b\n"
- "ldr q1, [%[wbptr], #128]\n"
- "mov v17.16b, v23.16b\n"
- "ldr q0, [%[wbptr], #144]\n"
- "mov v9.16b, v23.16b\n"
- "ldr q28, [%[inptr0]]\n"
- "fmla v12.4s, v28.4s, v20.4s\n"
- "ldr q25, [x24]\n"
- "fmla v8.4s, v25.4s, v20.4s\n"
- "ldr q18, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v11.4s, v18.4s, v20.4s\n"
- "ldr q30, [x9]\n"
- "fmla v12.4s, v25.4s, v19.4s\n"
- "ldr q29, [x24, %[input_col_stride1]]\n"
- "fmla v8.4s, v30.4s, v19.4s\n"
- "ldr q24, [%[inptr0], x13]\n"
- "fmla v16.4s, v30.4s, v20.4s\n"
- "ldr q27, [x20]\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "ldr q22, [x9, %[input_col_stride1]]\n"
- "fmla v8.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x24, #64]\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v12.4s, v30.4s, v2.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "prfm pldl1keep, [x24, x16]\n"
- "prfm pldl1keep, [%[inptr0], x10]\n"
- "prfm pldl1keep, [x20, #64]\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v12.4s, v29.4s, v4.4s\n"
- "beq 3f\n"
- "2:\n"
- "mov v13.16b, v23.16b\n"
- "ldr q21, [x24, x13]\n"
- "mov v18.16b, v23.16b\n"
- "prfm pldl1keep, [x24, x10]\n"
- "fmla v11.4s, v29.4s, v19.4s\n"
- "prfm pldl1keep, [%[inptr0], x21]\n"
- "fmla v7.4s, v29.4s, v20.4s\n"
- "ldr q25, [%[inptr0], x19]\n"
- "fmla v12.4s, v24.4s, v5.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v11.4s, v24.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v10.4s, v24.4s, v20.4s\n"
- "ldr q24, [x22]\n"
- "fmla v8.4s, v27.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x10]\n"
- "fmla v16.4s, v27.4s, v19.4s\n"
- "prfm pldl1keep, [x24, x21]\n"
- "fmla v14.4s, v27.4s, v20.4s\n"
- "ldr q26, [x20, %[input_col_stride1]]\n"
- "fmla v12.4s, v22.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v8.4s, v22.4s, v4.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v11.4s, v22.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x10]\n"
- "fmla v7.4s, v22.4s, v19.4s\n"
- "prfm pldl1keep, [x9, x21]\n"
- "fmla v15.4s, v22.4s, v20.4s\n"
- "ldr q30, [x9, x13]\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x24, x7]\n"
- "fmla v8.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [%[inptr0], x12]\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v7.4s, v21.4s, v6.4s\n"
- "prfm pldl1keep, [x22, x10]\n"
- "fmla v10.4s, v21.4s, v19.4s\n"
- "prfm pldl1keep, [x20, x21]\n"
- "fmla v17.4s, v21.4s, v20.4s\n"
- "ldr q22, [x24, x19]\n"
- "fmla v11.4s, v25.4s, v5.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v10.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x24, x12]\n"
- "fmla v9.4s, v25.4s, v20.4s\n"
- "ldr q21, [%[inptr0], x17]\n"
- "fmla v16.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x10]\n"
- "fmla v14.4s, v24.4s, v19.4s\n"
- "ldr q24, [x23]\n"
- "fmla v8.4s, v26.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x21]\n"
- "fmla v16.4s, v26.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x7]\n"
- "fmla v7.4s, v26.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x12]\n"
- "fmla v14.4s, v26.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x21]\n"
- "fmla v15.4s, v26.4s, v19.4s\n"
- "prfm pldl1keep, [x22, x7]\n"
- "fmla v13.4s, v26.4s, v20.4s\n"
- "ldr q26, [x22, %[input_col_stride1]]\n"
- "fmla v12.4s, v30.4s, v0.4s\n"
- "prfm pldl1keep, [x20, x12]\n"
- "fmla v8.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x23, x7]\n"
- "fmla v11.4s, v30.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x12]\n"
- "fmla v16.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x23, x12]\n"
- "fmla v7.4s, v30.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v10.4s, v30.4s, v2.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "subs x15, x15, #1\n"
- "fmla v17.4s, v30.4s, v19.4s\n"
- "fmla v18.4s, v30.4s, v20.4s\n"
- "mov v25.16b, v23.16b\n"
- "fmla v11.4s, v22.4s, v3.4s\n"
- "fmla v7.4s, v22.4s, v5.4s\n"
- "fmla v10.4s, v22.4s, v4.4s\n"
- "fmla v17.4s, v22.4s, v6.4s\n"
- "fmla v9.4s, v22.4s, v19.4s\n"
- "fmla v25.4s, v22.4s, v20.4s\n"
- "ldr q27, [x20, x13]\n"
- "fmla v10.4s, v21.4s, v5.4s\n"
- "fmla v14.4s, v24.4s, v2.4s\n"
- "mov v22.16b, v23.16b\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "mov v24.16b, v23.16b\n"
- "mov v21.16b, v23.16b\n"
- "fmla v16.4s, v26.4s, v1.4s\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "fmla v13.4s, v26.4s, v19.4s\n"
- "fmla v8.4s, v27.4s, v0.4s\n"
- "ldr q28, [x9, x19]\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "fmla v7.4s, v27.4s, v1.4s\n"
- "fmla v14.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v18.4s, v27.4s, v19.4s\n"
- "fmla v22.4s, v27.4s, v20.4s\n"
- "fmla v11.4s, v28.4s, v0.4s\n"
- "ldr q29, [x24, x17]\n"
- "fmla v7.4s, v28.4s, v3.4s\n"
- "fmla v10.4s, v28.4s, v1.4s\n"
- "fmla v15.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v28.4s, v4.4s\n"
- "fmla v9.4s, v28.4s, v2.4s\n"
- "fmla v18.4s, v28.4s, v6.4s\n"
- "fmla v25.4s, v28.4s, v19.4s\n"
- "fmla v24.4s, v28.4s, v20.4s\n"
- "fmla v10.4s, v29.4s, v3.4s\n"
- "ldr q23, [%[inptr0], x11]\n"
- "fmla v17.4s, v29.4s, v5.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v9.4s, v29.4s, v4.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v25.4s, v29.4s, v6.4s\n"
- "ldr q30, [x23, %[input_col_stride1]]\n"
- "fmla v14.4s, v30.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v9.4s, v23.4s, v5.4s\n"
- "ldr q23, [x22, x13]\n"
- "fmla v13.4s, v30.4s, v2.4s\n"
- "ldr q29, [x20, x19]\n"
- "fmla v16.4s, v23.4s, v0.4s\n"
- "prfm pldl1keep, [%[inptr0], x10]\n"
- "fmla v14.4s, v23.4s, v3.4s\n"
- "fmla v15.4s, v23.4s, v1.4s\n"
- "fmla v13.4s, v23.4s, v4.4s\n"
- "fmla v18.4s, v23.4s, v2.4s\n"
- "fmla v22.4s, v23.4s, v19.4s\n"
- "ldr q23, [x9, x17]\n"
- "fmla v7.4s, v29.4s, v0.4s\n"
- "fmla v15.4s, v29.4s, v3.4s\n"
- "fmla v17.4s, v29.4s, v1.4s\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "fmla v18.4s, v29.4s, v4.4s\n"
- "fmla v25.4s, v29.4s, v2.4s\n"
- "fmla v22.4s, v29.4s, v6.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v21.4s, v29.4s, v20.4s\n"
- "ldr q26, [x24, x11]\n"
- "fmla v10.4s, v23.4s, v0.4s\n"
- "ldr q28, [x23, x13]\n"
- "fmla v17.4s, v23.4s, v3.4s\n"
- "add x24, x24, #16\n"
- "fmla v9.4s, v23.4s, v1.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v18.4s, v23.4s, v5.4s\n"
- "prfm pldl1keep, [x24, x16]\n"
- "fmla v25.4s, v23.4s, v4.4s\n"
- "fmla v24.4s, v23.4s, v6.4s\n"
- "fmla v9.4s, v26.4s, v3.4s\n"
- "ldr q20, [x22, x19]\n"
- "fmla v14.4s, v28.4s, v0.4s\n"
- "fmla v13.4s, v28.4s, v1.4s\n"
- "fmla v25.4s, v26.4s, v5.4s\n"
- "ldr q26, [x20, x17]\n"
- "fmla v22.4s, v28.4s, v2.4s\n"
- "ldr q23, [x9, x11]\n"
- "fmla v15.4s, v20.4s, v0.4s\n"
- "add x9, x9, #16\n"
- "fmla v13.4s, v20.4s, v3.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "fmla v18.4s, v20.4s, v1.4s\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v22.4s, v20.4s, v4.4s\n"
- "fmla v24.4s, v20.4s, v2.4s\n"
- "fmla v21.4s, v20.4s, v19.4s\n"
- "ldr q27, [x23, x19]\n"
- "fmla v17.4s, v26.4s, v0.4s\n"
- "ldr q20, [x22, x17]\n"
- "fmla v18.4s, v26.4s, v3.4s\n"
- "fmla v25.4s, v26.4s, v1.4s\n"
- "fmla v22.4s, v26.4s, v5.4s\n"
- "fmla v24.4s, v26.4s, v4.4s\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr q19, [x20, x11]\n"
- "fmla v9.4s, v23.4s, v0.4s\n"
- "ldr q28, [x23, x17]\n"
- "fmla v25.4s, v23.4s, v3.4s\n"
- "add x20, x20, #16\n"
- "fmla v24.4s, v23.4s, v5.4s\n"
- "ldr q29, [x22, x11]\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "prfm pldl1keep, [x20, #64]\n"
- "fmla v22.4s, v27.4s, v1.4s\n"
- "add x22, x22, #16\n"
- "fmla v21.4s, v27.4s, v2.4s\n"
- "ldr q30, [x23, x11]\n"
- "fmla v18.4s, v20.4s, v0.4s\n"
- "ldr q23, [%[wbptr]]\n"
- "fmla v22.4s, v20.4s, v3.4s\n"
- "add x23, x23, #16\n"
- "fmla v24.4s, v20.4s, v1.4s\n"
- "fmla v21.4s, v20.4s, v4.4s\n"
- "fmla v25.4s, v19.4s, v0.4s\n"
- "ldr q20, [%[wbptr], #16]\n"
- "fmla v22.4s, v28.4s, v0.4s\n"
- "ldr q6, [%[wbptr], #32]\n"
- "fmla v21.4s, v19.4s, v5.4s\n"
- "movi v26.16b, #0\n"
- "fmla v24.4s, v19.4s, v3.4s\n"
- "ldr q19, [%[wbptr], #64]\n"
- "fmax v12.4s, v12.4s, v26.4s\n"
- "fmax v11.4s, v11.4s, v26.4s\n"
- "fmla v21.4s, v28.4s, v1.4s\n"
- "ldr q5, [%[wbptr], #48]\n"
- "fmla v24.4s, v29.4s, v0.4s\n"
- "ldr q4, [%[wbptr], #80]\n"
- "fmax v10.4s, v10.4s, v26.4s\n"
- "fmax v9.4s, v9.4s, v26.4s\n"
- "fmla v21.4s, v29.4s, v3.4s\n"
- "ldr q2, [%[wbptr], #112]\n"
- "fmov v27.4s, #6.0\n"
- "fmax v8.4s, v8.4s, v26.4s\n"
- "fmax v7.4s, v7.4s, v26.4s\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "fmla v21.4s, v30.4s, v0.4s\n"
- "ldr q3, [%[wbptr], #96]\n"
- "fmin v12.4s, v12.4s, v27.4s\n"
- "ldr q1, [%[wbptr], #128]\n"
- "fmin v11.4s, v11.4s, v27.4s\n"
- "fmin v10.4s, v10.4s, v27.4s\n"
- "str q12, [%[outptr0]]\n"
- "fmin v9.4s, v9.4s, v27.4s\n"
- "str q11, [%[outptr0], %[output_col_stride1]]\n"
- "fmin v8.4s, v8.4s, v27.4s\n"
- "str q10, [%[outptr0], x27]\n"
- "fmin v7.4s, v7.4s, v27.4s\n"
- "str q9, [%[outptr0], x28]\n"
- "fmin v17.4s, v17.4s, v27.4s\n"
- "str q8, [x8]\n"
- "fmax v25.4s, v25.4s, v26.4s\n"
- "str q7, [x8, %[output_col_stride1]]\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "str q17, [x8, x27]\n"
- "fmin v25.4s, v25.4s, v27.4s\n"
- "fmin v16.4s, v16.4s, v27.4s\n"
- "ldr q0, [%[wbptr], #144]\n"
- "str q25, [x8, x28]\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "str q16, [x25]\n"
- "fmax v18.4s, v18.4s, v26.4s\n"
- "fmin v15.4s, v15.4s, v27.4s\n"
- "ldr q28, [%[inptr0]]\n"
- "fmin v18.4s, v18.4s, v27.4s\n"
- "ldr q25, [x24]\n"
- "str q15, [x25, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v26.4s\n"
- "str q18, [x25, x27]\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "fmin v24.4s, v24.4s, v27.4s\n"
- "ldr q18, [%[inptr0], %[input_col_stride1]]\n"
- "fmin v14.4s, v14.4s, v27.4s\n"
- "ldr q30, [x9]\n"
- "str q24, [x25, x28]\n"
- "fmax v13.4s, v13.4s, v26.4s\n"
- "str q14, [x26]\n"
- "fmax v22.4s, v22.4s, v26.4s\n"
- "fmin v13.4s, v13.4s, v27.4s\n"
- "ldr q29, [x24, %[input_col_stride1]]\n"
- "fmin v22.4s, v22.4s, v27.4s\n"
- "ldr q24, [%[inptr0], x13]\n"
- "str q13, [x26, %[output_col_stride1]]\n"
- "fmax v21.4s, v21.4s, v26.4s\n"
- "str q22, [x26, x27]\n"
- "mov v12.16b, v23.16b\n"
- "fmin v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [x20]\n"
- "mov v8.16b, v23.16b\n"
- "ldr q22, [x9, %[input_col_stride1]]\n"
- "str q21, [x26, x28]\n"
- "mov v11.16b, v23.16b\n"
- "mov v16.16b, v23.16b\n"
- "add %[outptr0], %[outptr0], #16\n"
- "mov v7.16b, v23.16b\n"
- "add x8, x8, #16\n"
- "mov v10.16b, v23.16b\n"
- "add x25, x25, #16\n"
- "mov v14.16b, v23.16b\n"
- "add x26, x26, #16\n"
- "mov v15.16b, v23.16b\n"
- "mov v17.16b, v23.16b\n"
- "mov v9.16b, v23.16b\n"
- "fmla v12.4s, v28.4s, v20.4s\n"
- "fmla v8.4s, v25.4s, v20.4s\n"
- "fmla v11.4s, v18.4s, v20.4s\n"
- "fmla v16.4s, v30.4s, v20.4s\n"
- "fmla v12.4s, v25.4s, v19.4s\n"
- "fmla v8.4s, v30.4s, v19.4s\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "fmla v8.4s, v29.4s, v6.4s\n"
- "fmla v12.4s, v30.4s, v2.4s\n"
- "fmla v12.4s, v29.4s, v4.4s\n"
- "bne 2b\n"
- "3:\n"
- "mov v13.16b, v23.16b\n"
- "ldr q21, [x24, x13]\n"
- "mov v18.16b, v23.16b\n"
- "prfm pldl1keep, [x24, x10]\n"
- "fmla v11.4s, v29.4s, v19.4s\n"
- "prfm pldl1keep, [%[inptr0], x21]\n"
- "fmla v7.4s, v29.4s, v20.4s\n"
- "ldr q25, [%[inptr0], x19]\n"
- "fmla v12.4s, v24.4s, v5.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v11.4s, v24.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v10.4s, v24.4s, v20.4s\n"
- "ldr q24, [x22]\n"
- "fmla v8.4s, v27.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x10]\n"
- "fmla v16.4s, v27.4s, v19.4s\n"
- "prfm pldl1keep, [x24, x21]\n"
- "fmla v14.4s, v27.4s, v20.4s\n"
- "ldr q26, [x20, %[input_col_stride1]]\n"
- "fmla v12.4s, v22.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v8.4s, v22.4s, v4.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v11.4s, v22.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x10]\n"
- "fmla v7.4s, v22.4s, v19.4s\n"
- "prfm pldl1keep, [x9, x21]\n"
- "fmla v15.4s, v22.4s, v20.4s\n"
- "ldr q30, [x9, x13]\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x24, x7]\n"
- "fmla v8.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [%[inptr0], x12]\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v7.4s, v21.4s, v6.4s\n"
- "prfm pldl1keep, [x22, x10]\n"
- "fmla v10.4s, v21.4s, v19.4s\n"
- "prfm pldl1keep, [x20, x21]\n"
- "fmla v17.4s, v21.4s, v20.4s\n"
- "ldr q22, [x24, x19]\n"
- "fmla v11.4s, v25.4s, v5.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v10.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x24, x12]\n"
- "fmla v9.4s, v25.4s, v20.4s\n"
- "ldr q21, [%[inptr0], x17]\n"
- "fmla v16.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x10]\n"
- "fmla v14.4s, v24.4s, v19.4s\n"
- "ldr q24, [x23]\n"
- "fmla v8.4s, v26.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x21]\n"
- "fmla v16.4s, v26.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x7]\n"
- "fmla v7.4s, v26.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x12]\n"
- "fmla v14.4s, v26.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x21]\n"
- "fmla v15.4s, v26.4s, v19.4s\n"
- "prfm pldl1keep, [x22, x7]\n"
- "fmla v13.4s, v26.4s, v20.4s\n"
- "ldr q26, [x22, %[input_col_stride1]]\n"
- "fmla v12.4s, v30.4s, v0.4s\n"
- "prfm pldl1keep, [x20, x12]\n"
- "fmla v8.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x23, x7]\n"
- "fmla v11.4s, v30.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x12]\n"
- "fmla v16.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x23, x12]\n"
- "fmla v7.4s, v30.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v10.4s, v30.4s, v2.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "fmla v17.4s, v30.4s, v19.4s\n"
- "fmla v18.4s, v30.4s, v20.4s\n"
- "ldr q27, [x20, x13]\n"
- "fmla v11.4s, v22.4s, v3.4s\n"
- "fmla v7.4s, v22.4s, v5.4s\n"
- "fmla v10.4s, v22.4s, v4.4s\n"
- "fmla v17.4s, v22.4s, v6.4s\n"
- "fmla v9.4s, v22.4s, v19.4s\n"
- "fmla v14.4s, v24.4s, v2.4s\n"
- "mov v25.16b, v23.16b\n"
- "fmla v16.4s, v26.4s, v1.4s\n"
- "fmla v10.4s, v21.4s, v5.4s\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "fmla v25.4s, v22.4s, v20.4s\n"
- "ldr q28, [x9, x19]\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "ldr q29, [x24, x17]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "fmla v13.4s, v26.4s, v19.4s\n"
- "mov v22.16b, v23.16b\n"
- "fmla v8.4s, v27.4s, v0.4s\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "fmla v7.4s, v27.4s, v1.4s\n"
- "fmla v14.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v18.4s, v27.4s, v19.4s\n"
- "fmla v22.4s, v27.4s, v20.4s\n"
- "mov v24.16b, v23.16b\n"
- "mov v21.16b, v23.16b\n"
- "fmla v11.4s, v28.4s, v0.4s\n"
- "fmla v7.4s, v28.4s, v3.4s\n"
- "fmla v10.4s, v28.4s, v1.4s\n"
- "fmla v15.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v28.4s, v4.4s\n"
- "fmla v9.4s, v28.4s, v2.4s\n"
- "fmla v18.4s, v28.4s, v6.4s\n"
- "fmla v25.4s, v28.4s, v19.4s\n"
- "fmla v24.4s, v28.4s, v20.4s\n"
- "ldr q23, [%[inptr0], x11]\n"
- "fmla v10.4s, v29.4s, v3.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v17.4s, v29.4s, v5.4s\n"
- "fmla v9.4s, v29.4s, v4.4s\n"
- "fmla v25.4s, v29.4s, v6.4s\n"
- "ldr q30, [x23, %[input_col_stride1]]\n"
- "fmla v14.4s, v30.4s, v1.4s\n"
- "fmla v13.4s, v30.4s, v2.4s\n"
- "fmla v9.4s, v23.4s, v5.4s\n"
- "ldr q23, [x22, x13]\n"
- "fmla v16.4s, v23.4s, v0.4s\n"
- "ldr q29, [x20, x19]\n"
- "fmla v14.4s, v23.4s, v3.4s\n"
- "fmla v15.4s, v23.4s, v1.4s\n"
- "fmla v13.4s, v23.4s, v4.4s\n"
- "fmla v18.4s, v23.4s, v2.4s\n"
- "fmla v22.4s, v23.4s, v19.4s\n"
- "ldr q23, [x9, x17]\n"
- "fmla v7.4s, v29.4s, v0.4s\n"
- "fmla v15.4s, v29.4s, v3.4s\n"
- "fmla v17.4s, v29.4s, v1.4s\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "fmla v18.4s, v29.4s, v4.4s\n"
- "fmla v25.4s, v29.4s, v2.4s\n"
- "fmla v22.4s, v29.4s, v6.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v21.4s, v29.4s, v20.4s\n"
- "ldr q26, [x24, x11]\n"
- "fmla v10.4s, v23.4s, v0.4s\n"
- "ldr q28, [x23, x13]\n"
- "fmla v17.4s, v23.4s, v3.4s\n"
- "add x24, x24, #16\n"
- "fmla v9.4s, v23.4s, v1.4s\n"
- "fmla v18.4s, v23.4s, v5.4s\n"
- "fmla v25.4s, v23.4s, v4.4s\n"
- "fmla v24.4s, v23.4s, v6.4s\n"
- "fmla v14.4s, v28.4s, v0.4s\n"
- "ldr q20, [x22, x19]\n"
- "fmla v9.4s, v26.4s, v3.4s\n"
- "fmla v13.4s, v28.4s, v1.4s\n"
- "fmla v25.4s, v26.4s, v5.4s\n"
- "ldr q26, [x20, x17]\n"
- "fmla v22.4s, v28.4s, v2.4s\n"
- "ldr q23, [x9, x11]\n"
- "fmla v15.4s, v20.4s, v0.4s\n"
- "add x9, x9, #16\n"
- "fmla v13.4s, v20.4s, v3.4s\n"
- "fmla v18.4s, v20.4s, v1.4s\n"
- "fmla v22.4s, v20.4s, v4.4s\n"
- "fmla v24.4s, v20.4s, v2.4s\n"
- "fmla v21.4s, v20.4s, v19.4s\n"
- "ldr q27, [x23, x19]\n"
- "fmla v17.4s, v26.4s, v0.4s\n"
- "ldr q20, [x22, x17]\n"
- "fmla v18.4s, v26.4s, v3.4s\n"
- "fmla v25.4s, v26.4s, v1.4s\n"
- "fmla v22.4s, v26.4s, v5.4s\n"
- "fmla v24.4s, v26.4s, v4.4s\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr q19, [x20, x11]\n"
- "fmla v9.4s, v23.4s, v0.4s\n"
- "ldr q28, [x23, x17]\n"
- "fmla v25.4s, v23.4s, v3.4s\n"
- "add x20, x20, #16\n"
- "fmla v24.4s, v23.4s, v5.4s\n"
- "ldr q29, [x22, x11]\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "add x22, x22, #16\n"
- "fmla v22.4s, v27.4s, v1.4s\n"
- "fmla v21.4s, v27.4s, v2.4s\n"
- "fmla v18.4s, v20.4s, v0.4s\n"
- "ldr q30, [x23, x11]\n"
- "fmla v24.4s, v20.4s, v1.4s\n"
- "add x23, x23, #16\n"
- "fmla v22.4s, v20.4s, v3.4s\n"
- "fmla v21.4s, v20.4s, v4.4s\n"
- "fmla v25.4s, v19.4s, v0.4s\n"
- "movi v26.16b, #0\n"
- "fmla v24.4s, v19.4s, v3.4s\n"
- "fmov v27.4s, #6.0\n"
- "fmla v21.4s, v19.4s, v5.4s\n"
- "fmla v22.4s, v28.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v26.4s\n"
- "fmax v11.4s, v11.4s, v26.4s\n"
- "fmla v24.4s, v29.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v26.4s\n"
- "fmla v21.4s, v28.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v27.4s\n"
- "fmin v11.4s, v11.4s, v27.4s\n"
- "fmin v10.4s, v10.4s, v27.4s\n"
- "str q12, [%[outptr0]]\n"
- "fmax v9.4s, v9.4s, v26.4s\n"
- "str q11, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v21.4s, v29.4s, v3.4s\n"
- "str q10, [%[outptr0], x27]\n"
- "fmin v9.4s, v9.4s, v27.4s\n"
- "fmax v8.4s, v8.4s, v26.4s\n"
- "fmax v7.4s, v7.4s, v26.4s\n"
- "str q9, [%[outptr0], x28]\n"
- "fmla v21.4s, v30.4s, v0.4s\n"
- "fmin v8.4s, v8.4s, v27.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmin v7.4s, v7.4s, v27.4s\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "str q8, [x8]\n"
- "fmax v25.4s, v25.4s, v26.4s\n"
- "str q7, [x8, %[output_col_stride1]]\n"
- "fmin v17.4s, v17.4s, v27.4s\n"
- "fmin v25.4s, v25.4s, v27.4s\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "str q17, [x8, x27]\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "str q25, [x8, x28]\n"
- "fmin v16.4s, v16.4s, v27.4s\n"
- "fmin v15.4s, v15.4s, v27.4s\n"
- "add x8, x8, #16\n"
- "str q16, [x25]\n"
- "fmax v18.4s, v18.4s, v26.4s\n"
- "str q15, [x25, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v26.4s\n"
- "fmin v18.4s, v18.4s, v27.4s\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "fmin v24.4s, v24.4s, v27.4s\n"
- "fmax v13.4s, v13.4s, v26.4s\n"
- "str q18, [x25, x27]\n"
- "fmin v14.4s, v14.4s, v27.4s\n"
- "str q24, [x25, x28]\n"
- "fmin v13.4s, v13.4s, v27.4s\n"
- "str q14, [x26]\n"
- "fmax v22.4s, v22.4s, v26.4s\n"
- "str q13, [x26, %[output_col_stride1]]\n"
- "fmax v21.4s, v21.4s, v26.4s\n"
- "fmin v22.4s, v22.4s, v27.4s\n"
- "add x25, x25, #16\n"
- "fmin v21.4s, v21.4s, v27.4s\n"
- "str q22, [x26, x27]\n"
- "str q21, [x26, x28]\n"
- "add x26, x26, #16\n"
- "4:\n"
- "cbz x14, 7f\n"
- "ldr s23, [%[wbptr]]\n"
- "mov v12.16b, v23.16b\n"
- "ldr s20, [%[wbptr], #4]\n"
- "mov v8.16b, v23.16b\n"
- "ldr s6, [%[wbptr], #8]\n"
- "mov v11.16b, v23.16b\n"
- "ldr s5, [%[wbptr], #12]\n"
- "mov v16.16b, v23.16b\n"
- "ldr s19, [%[wbptr], #16]\n"
- "mov v7.16b, v23.16b\n"
- "ldr s4, [%[wbptr], #20]\n"
- "mov v10.16b, v23.16b\n"
- "ldr s3, [%[wbptr], #24]\n"
- "mov v14.16b, v23.16b\n"
- "ldr s2, [%[wbptr], #28]\n"
- "mov v15.16b, v23.16b\n"
- "ldr s1, [%[wbptr], #32]\n"
- "mov v17.16b, v23.16b\n"
- "ldr s0, [%[wbptr], #36]\n"
- "mov v9.16b, v23.16b\n"
- "ldr s28, [%[inptr0]]\n"
- "fmla v12.4s, v28.4s, v20.4s\n"
- "ldr s25, [x24]\n"
- "fmla v8.4s, v25.4s, v20.4s\n"
- "ldr s18, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v11.4s, v18.4s, v20.4s\n"
- "ldr s30, [x9]\n"
- "fmla v12.4s, v25.4s, v19.4s\n"
- "ldr s29, [x24, %[input_col_stride1]]\n"
- "fmla v8.4s, v30.4s, v19.4s\n"
- "ldr s24, [%[inptr0], x13]\n"
- "fmla v16.4s, v30.4s, v20.4s\n"
- "ldr s27, [x20]\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "ldr s22, [x9, %[input_col_stride1]]\n"
- "fmla v8.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x24, #64]\n"
- "subs x14, x14, #1\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "prfm pldl1keep, [x9, #64]\n"
- "fmla v12.4s, v30.4s, v2.4s\n"
- "prfm pldl1keep, [x24, x16]\n"
- "prfm pldl1keep, [%[inptr0], x10]\n"
- "prfm pldl1keep, [x20, #64]\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v12.4s, v29.4s, v4.4s\n"
- "beq 6f\n"
- "5:\n"
- "mov v13.16b, v23.16b\n"
- "ldr s21, [x24, x13]\n"
- "mov v18.16b, v23.16b\n"
- "prfm pldl1keep, [x24, x10]\n"
- "fmla v11.4s, v29.4s, v19.4s\n"
- "prfm pldl1keep, [%[inptr0], x21]\n"
- "fmla v7.4s, v29.4s, v20.4s\n"
- "ldr s25, [%[inptr0], x19]\n"
- "fmla v12.4s, v24.4s, v5.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v11.4s, v24.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v10.4s, v24.4s, v20.4s\n"
- "ldr s24, [x22]\n"
- "fmla v8.4s, v27.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x10]\n"
- "fmla v16.4s, v27.4s, v19.4s\n"
- "prfm pldl1keep, [x24, x21]\n"
- "fmla v14.4s, v27.4s, v20.4s\n"
- "ldr s26, [x20, %[input_col_stride1]]\n"
- "fmla v12.4s, v22.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v8.4s, v22.4s, v4.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v11.4s, v22.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x10]\n"
- "fmla v7.4s, v22.4s, v19.4s\n"
- "prfm pldl1keep, [x9, x21]\n"
- "fmla v15.4s, v22.4s, v20.4s\n"
- "ldr s30, [x9, x13]\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x24, x7]\n"
- "fmla v8.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [%[inptr0], x12]\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v7.4s, v21.4s, v6.4s\n"
- "prfm pldl1keep, [x22, x10]\n"
- "fmla v10.4s, v21.4s, v19.4s\n"
- "prfm pldl1keep, [x20, x21]\n"
- "fmla v17.4s, v21.4s, v20.4s\n"
- "ldr s22, [x24, x19]\n"
- "fmla v11.4s, v25.4s, v5.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v10.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x24, x12]\n"
- "fmla v9.4s, v25.4s, v20.4s\n"
- "ldr s21, [%[inptr0], x17]\n"
- "fmla v16.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x10]\n"
- "fmla v14.4s, v24.4s, v19.4s\n"
- "ldr s24, [x23]\n"
- "fmla v8.4s, v26.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x21]\n"
- "fmla v16.4s, v26.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x7]\n"
- "fmla v7.4s, v26.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x12]\n"
- "fmla v14.4s, v26.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x21]\n"
- "fmla v15.4s, v26.4s, v19.4s\n"
- "prfm pldl1keep, [x22, x7]\n"
- "fmla v13.4s, v26.4s, v20.4s\n"
- "ldr s26, [x22, %[input_col_stride1]]\n"
- "fmla v12.4s, v30.4s, v0.4s\n"
- "prfm pldl1keep, [x20, x12]\n"
- "fmla v8.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x23, x7]\n"
- "fmla v11.4s, v30.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x12]\n"
- "fmla v16.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x23, x12]\n"
- "fmla v7.4s, v30.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v10.4s, v30.4s, v2.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "subs x14, x14, #1\n"
- "fmla v17.4s, v30.4s, v19.4s\n"
- "fmla v18.4s, v30.4s, v20.4s\n"
- "mov v25.16b, v23.16b\n"
- "fmla v11.4s, v22.4s, v3.4s\n"
- "fmla v7.4s, v22.4s, v5.4s\n"
- "fmla v10.4s, v22.4s, v4.4s\n"
- "fmla v17.4s, v22.4s, v6.4s\n"
- "fmla v9.4s, v22.4s, v19.4s\n"
- "fmla v25.4s, v22.4s, v20.4s\n"
- "ldr s27, [x20, x13]\n"
- "fmla v10.4s, v21.4s, v5.4s\n"
- "fmla v14.4s, v24.4s, v2.4s\n"
- "mov v22.16b, v23.16b\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "mov v24.16b, v23.16b\n"
- "mov v21.16b, v23.16b\n"
- "fmla v16.4s, v26.4s, v1.4s\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "fmla v13.4s, v26.4s, v19.4s\n"
- "fmla v8.4s, v27.4s, v0.4s\n"
- "ldr s28, [x9, x19]\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "fmla v7.4s, v27.4s, v1.4s\n"
- "fmla v14.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v18.4s, v27.4s, v19.4s\n"
- "fmla v22.4s, v27.4s, v20.4s\n"
- "fmla v11.4s, v28.4s, v0.4s\n"
- "ldr s29, [x24, x17]\n"
- "fmla v7.4s, v28.4s, v3.4s\n"
- "fmla v10.4s, v28.4s, v1.4s\n"
- "fmla v15.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v28.4s, v4.4s\n"
- "fmla v9.4s, v28.4s, v2.4s\n"
- "fmla v18.4s, v28.4s, v6.4s\n"
- "fmla v25.4s, v28.4s, v19.4s\n"
- "fmla v24.4s, v28.4s, v20.4s\n"
- "fmla v10.4s, v29.4s, v3.4s\n"
- "ldr s23, [%[inptr0], x11]\n"
- "fmla v17.4s, v29.4s, v5.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v9.4s, v29.4s, v4.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v25.4s, v29.4s, v6.4s\n"
- "ldr s30, [x23, %[input_col_stride1]]\n"
- "fmla v14.4s, v30.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v9.4s, v23.4s, v5.4s\n"
- "ldr s23, [x22, x13]\n"
- "fmla v13.4s, v30.4s, v2.4s\n"
- "ldr s29, [x20, x19]\n"
- "fmla v16.4s, v23.4s, v0.4s\n"
- "prfm pldl1keep, [%[inptr0], x10]\n"
- "fmla v14.4s, v23.4s, v3.4s\n"
- "fmla v15.4s, v23.4s, v1.4s\n"
- "fmla v13.4s, v23.4s, v4.4s\n"
- "fmla v18.4s, v23.4s, v2.4s\n"
- "fmla v22.4s, v23.4s, v19.4s\n"
- "ldr s23, [x9, x17]\n"
- "fmla v7.4s, v29.4s, v0.4s\n"
- "fmla v15.4s, v29.4s, v3.4s\n"
- "fmla v17.4s, v29.4s, v1.4s\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "fmla v18.4s, v29.4s, v4.4s\n"
- "fmla v25.4s, v29.4s, v2.4s\n"
- "fmla v22.4s, v29.4s, v6.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v21.4s, v29.4s, v20.4s\n"
- "ldr s26, [x24, x11]\n"
- "fmla v10.4s, v23.4s, v0.4s\n"
- "ldr s28, [x23, x13]\n"
- "fmla v17.4s, v23.4s, v3.4s\n"
- "add x24, x24, #4\n"
- "fmla v9.4s, v23.4s, v1.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v18.4s, v23.4s, v5.4s\n"
- "prfm pldl1keep, [x24, x16]\n"
- "fmla v25.4s, v23.4s, v4.4s\n"
- "fmla v24.4s, v23.4s, v6.4s\n"
- "fmla v9.4s, v26.4s, v3.4s\n"
- "ldr s20, [x22, x19]\n"
- "fmla v14.4s, v28.4s, v0.4s\n"
- "fmla v13.4s, v28.4s, v1.4s\n"
- "fmla v25.4s, v26.4s, v5.4s\n"
- "ldr s26, [x20, x17]\n"
- "fmla v22.4s, v28.4s, v2.4s\n"
- "ldr s23, [x9, x11]\n"
- "fmla v15.4s, v20.4s, v0.4s\n"
- "add x9, x9, #4\n"
- "fmla v13.4s, v20.4s, v3.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "fmla v18.4s, v20.4s, v1.4s\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v22.4s, v20.4s, v4.4s\n"
- "fmla v24.4s, v20.4s, v2.4s\n"
- "fmla v21.4s, v20.4s, v19.4s\n"
- "ldr s27, [x23, x19]\n"
- "fmla v17.4s, v26.4s, v0.4s\n"
- "ldr s20, [x22, x17]\n"
- "fmla v18.4s, v26.4s, v3.4s\n"
- "fmla v25.4s, v26.4s, v1.4s\n"
- "fmla v22.4s, v26.4s, v5.4s\n"
- "fmla v24.4s, v26.4s, v4.4s\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr s19, [x20, x11]\n"
- "fmla v9.4s, v23.4s, v0.4s\n"
- "ldr s28, [x23, x17]\n"
- "fmla v25.4s, v23.4s, v3.4s\n"
- "add x20, x20, #4\n"
- "fmla v24.4s, v23.4s, v5.4s\n"
- "ldr s29, [x22, x11]\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "prfm pldl1keep, [x20, #64]\n"
- "fmla v22.4s, v27.4s, v1.4s\n"
- "add x22, x22, #4\n"
- "fmla v21.4s, v27.4s, v2.4s\n"
- "ldr s30, [x23, x11]\n"
- "fmla v18.4s, v20.4s, v0.4s\n"
- "ldr s23, [%[wbptr]]\n"
- "fmla v22.4s, v20.4s, v3.4s\n"
- "add x23, x23, #4\n"
- "fmla v24.4s, v20.4s, v1.4s\n"
- "fmla v21.4s, v20.4s, v4.4s\n"
- "fmla v25.4s, v19.4s, v0.4s\n"
- "ldr s20, [%[wbptr], #4]\n"
- "fmla v22.4s, v28.4s, v0.4s\n"
- "ldr s6, [%[wbptr], #8]\n"
- "fmla v21.4s, v19.4s, v5.4s\n"
- "movi v26.16b, #0\n"
- "fmla v24.4s, v19.4s, v3.4s\n"
- "ldr s19, [%[wbptr], #16]\n"
- "fmax v12.4s, v12.4s, v26.4s\n"
- "fmax v11.4s, v11.4s, v26.4s\n"
- "fmla v21.4s, v28.4s, v1.4s\n"
- "ldr s5, [%[wbptr], #12]\n"
- "fmla v24.4s, v29.4s, v0.4s\n"
- "ldr s4, [%[wbptr], #20]\n"
- "fmax v10.4s, v10.4s, v26.4s\n"
- "fmax v9.4s, v9.4s, v26.4s\n"
- "fmla v21.4s, v29.4s, v3.4s\n"
- "ldr s2, [%[wbptr], #28]\n"
- "fmov v27.4s, #6.0\n"
- "fmax v8.4s, v8.4s, v26.4s\n"
- "fmax v7.4s, v7.4s, v26.4s\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "fmla v21.4s, v30.4s, v0.4s\n"
- "ldr s3, [%[wbptr], #24]\n"
- "fmin v12.4s, v12.4s, v27.4s\n"
- "ldr s1, [%[wbptr], #32]\n"
- "fmin v11.4s, v11.4s, v27.4s\n"
- "fmin v10.4s, v10.4s, v27.4s\n"
- "str s12, [%[outptr0]]\n"
- "fmin v9.4s, v9.4s, v27.4s\n"
- "str s11, [%[outptr0], %[output_col_stride1]]\n"
- "fmin v8.4s, v8.4s, v27.4s\n"
- "str s10, [%[outptr0], x27]\n"
- "fmin v7.4s, v7.4s, v27.4s\n"
- "str s9, [%[outptr0], x28]\n"
- "fmin v17.4s, v17.4s, v27.4s\n"
- "str s8, [x8]\n"
- "fmax v25.4s, v25.4s, v26.4s\n"
- "str s7, [x8, %[output_col_stride1]]\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "str s17, [x8, x27]\n"
- "fmin v25.4s, v25.4s, v27.4s\n"
- "fmin v16.4s, v16.4s, v27.4s\n"
- "ldr s0, [%[wbptr], #36]\n"
- "str s25, [x8, x28]\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "str s16, [x25]\n"
- "fmax v18.4s, v18.4s, v26.4s\n"
- "fmin v15.4s, v15.4s, v27.4s\n"
- "ldr s28, [%[inptr0]]\n"
- "fmin v18.4s, v18.4s, v27.4s\n"
- "ldr s25, [x24]\n"
- "str s15, [x25, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v26.4s\n"
- "str s18, [x25, x27]\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "fmin v24.4s, v24.4s, v27.4s\n"
- "ldr s18, [%[inptr0], %[input_col_stride1]]\n"
- "fmin v14.4s, v14.4s, v27.4s\n"
- "ldr s30, [x9]\n"
- "str s24, [x25, x28]\n"
- "fmax v13.4s, v13.4s, v26.4s\n"
- "str s14, [x26]\n"
- "fmax v22.4s, v22.4s, v26.4s\n"
- "fmin v13.4s, v13.4s, v27.4s\n"
- "ldr s29, [x24, %[input_col_stride1]]\n"
- "fmin v22.4s, v22.4s, v27.4s\n"
- "ldr s24, [%[inptr0], x13]\n"
- "str s13, [x26, %[output_col_stride1]]\n"
- "fmax v21.4s, v21.4s, v26.4s\n"
- "str s22, [x26, x27]\n"
- "mov v12.16b, v23.16b\n"
- "fmin v21.4s, v21.4s, v27.4s\n"
- "ldr s27, [x20]\n"
- "mov v8.16b, v23.16b\n"
- "ldr s22, [x9, %[input_col_stride1]]\n"
- "str s21, [x26, x28]\n"
- "mov v11.16b, v23.16b\n"
- "mov v16.16b, v23.16b\n"
- "add %[outptr0], %[outptr0], #4\n"
- "mov v7.16b, v23.16b\n"
- "add x8, x8, #4\n"
- "mov v10.16b, v23.16b\n"
- "add x25, x25, #4\n"
- "mov v14.16b, v23.16b\n"
- "add x26, x26, #4\n"
- "mov v15.16b, v23.16b\n"
- "mov v17.16b, v23.16b\n"
- "mov v9.16b, v23.16b\n"
- "fmla v12.4s, v28.4s, v20.4s\n"
- "fmla v8.4s, v25.4s, v20.4s\n"
- "fmla v11.4s, v18.4s, v20.4s\n"
- "fmla v16.4s, v30.4s, v20.4s\n"
- "fmla v12.4s, v25.4s, v19.4s\n"
- "fmla v8.4s, v30.4s, v19.4s\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "fmla v8.4s, v29.4s, v6.4s\n"
- "fmla v12.4s, v30.4s, v2.4s\n"
- "fmla v12.4s, v29.4s, v4.4s\n"
- "bne 5b\n"
- "6:\n"
- "mov v13.16b, v23.16b\n"
- "ldr s21, [x24, x13]\n"
- "mov v18.16b, v23.16b\n"
- "prfm pldl1keep, [x24, x10]\n"
- "fmla v11.4s, v29.4s, v19.4s\n"
- "prfm pldl1keep, [%[inptr0], x21]\n"
- "fmla v7.4s, v29.4s, v20.4s\n"
- "ldr s25, [%[inptr0], x19]\n"
- "fmla v12.4s, v24.4s, v5.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v11.4s, v24.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v10.4s, v24.4s, v20.4s\n"
- "ldr s24, [x22]\n"
- "fmla v8.4s, v27.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x10]\n"
- "fmla v16.4s, v27.4s, v19.4s\n"
- "prfm pldl1keep, [x24, x21]\n"
- "fmla v14.4s, v27.4s, v20.4s\n"
- "ldr s26, [x20, %[input_col_stride1]]\n"
- "fmla v12.4s, v22.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v8.4s, v22.4s, v4.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v11.4s, v22.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x10]\n"
- "fmla v7.4s, v22.4s, v19.4s\n"
- "prfm pldl1keep, [x9, x21]\n"
- "fmla v15.4s, v22.4s, v20.4s\n"
- "ldr s30, [x9, x13]\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x24, x7]\n"
- "fmla v8.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [%[inptr0], x12]\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v7.4s, v21.4s, v6.4s\n"
- "prfm pldl1keep, [x22, x10]\n"
- "fmla v10.4s, v21.4s, v19.4s\n"
- "prfm pldl1keep, [x20, x21]\n"
- "fmla v17.4s, v21.4s, v20.4s\n"
- "ldr s22, [x24, x19]\n"
- "fmla v11.4s, v25.4s, v5.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v10.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x24, x12]\n"
- "fmla v9.4s, v25.4s, v20.4s\n"
- "ldr s21, [%[inptr0], x17]\n"
- "fmla v16.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x10]\n"
- "fmla v14.4s, v24.4s, v19.4s\n"
- "ldr s24, [x23]\n"
- "fmla v8.4s, v26.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x21]\n"
- "fmla v16.4s, v26.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x7]\n"
- "fmla v7.4s, v26.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x12]\n"
- "fmla v14.4s, v26.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x21]\n"
- "fmla v15.4s, v26.4s, v19.4s\n"
- "prfm pldl1keep, [x22, x7]\n"
- "fmla v13.4s, v26.4s, v20.4s\n"
- "ldr s26, [x22, %[input_col_stride1]]\n"
- "fmla v12.4s, v30.4s, v0.4s\n"
- "prfm pldl1keep, [x20, x12]\n"
- "fmla v8.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x23, x7]\n"
- "fmla v11.4s, v30.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x12]\n"
- "fmla v16.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x23, x12]\n"
- "fmla v7.4s, v30.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v10.4s, v30.4s, v2.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "fmla v17.4s, v30.4s, v19.4s\n"
- "fmla v18.4s, v30.4s, v20.4s\n"
- "ldr s27, [x20, x13]\n"
- "fmla v11.4s, v22.4s, v3.4s\n"
- "fmla v7.4s, v22.4s, v5.4s\n"
- "fmla v10.4s, v22.4s, v4.4s\n"
- "fmla v17.4s, v22.4s, v6.4s\n"
- "fmla v9.4s, v22.4s, v19.4s\n"
- "fmla v14.4s, v24.4s, v2.4s\n"
- "mov v25.16b, v23.16b\n"
- "fmla v16.4s, v26.4s, v1.4s\n"
- "fmla v10.4s, v21.4s, v5.4s\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "fmla v25.4s, v22.4s, v20.4s\n"
- "ldr s28, [x9, x19]\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "ldr s29, [x24, x17]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "fmla v13.4s, v26.4s, v19.4s\n"
- "mov v22.16b, v23.16b\n"
- "fmla v8.4s, v27.4s, v0.4s\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "fmla v7.4s, v27.4s, v1.4s\n"
- "fmla v14.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v18.4s, v27.4s, v19.4s\n"
- "fmla v22.4s, v27.4s, v20.4s\n"
- "mov v24.16b, v23.16b\n"
- "mov v21.16b, v23.16b\n"
- "fmla v11.4s, v28.4s, v0.4s\n"
- "fmla v7.4s, v28.4s, v3.4s\n"
- "fmla v10.4s, v28.4s, v1.4s\n"
- "fmla v15.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v28.4s, v4.4s\n"
- "fmla v9.4s, v28.4s, v2.4s\n"
- "fmla v18.4s, v28.4s, v6.4s\n"
- "fmla v25.4s, v28.4s, v19.4s\n"
- "fmla v24.4s, v28.4s, v20.4s\n"
- "ldr s23, [%[inptr0], x11]\n"
- "fmla v10.4s, v29.4s, v3.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v17.4s, v29.4s, v5.4s\n"
- "fmla v9.4s, v29.4s, v4.4s\n"
- "fmla v25.4s, v29.4s, v6.4s\n"
- "ldr s30, [x23, %[input_col_stride1]]\n"
- "fmla v14.4s, v30.4s, v1.4s\n"
- "fmla v13.4s, v30.4s, v2.4s\n"
- "fmla v9.4s, v23.4s, v5.4s\n"
- "ldr s23, [x22, x13]\n"
- "fmla v16.4s, v23.4s, v0.4s\n"
- "ldr s29, [x20, x19]\n"
- "fmla v14.4s, v23.4s, v3.4s\n"
- "fmla v15.4s, v23.4s, v1.4s\n"
- "fmla v13.4s, v23.4s, v4.4s\n"
- "fmla v18.4s, v23.4s, v2.4s\n"
- "fmla v22.4s, v23.4s, v19.4s\n"
- "ldr s23, [x9, x17]\n"
- "fmla v7.4s, v29.4s, v0.4s\n"
- "fmla v15.4s, v29.4s, v3.4s\n"
- "fmla v17.4s, v29.4s, v1.4s\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "fmla v18.4s, v29.4s, v4.4s\n"
- "fmla v25.4s, v29.4s, v2.4s\n"
- "fmla v22.4s, v29.4s, v6.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v21.4s, v29.4s, v20.4s\n"
- "ldr s26, [x24, x11]\n"
- "fmla v10.4s, v23.4s, v0.4s\n"
- "ldr s28, [x23, x13]\n"
- "fmla v17.4s, v23.4s, v3.4s\n"
- "add x24, x24, #4\n"
- "fmla v9.4s, v23.4s, v1.4s\n"
- "fmla v18.4s, v23.4s, v5.4s\n"
- "fmla v25.4s, v23.4s, v4.4s\n"
- "fmla v24.4s, v23.4s, v6.4s\n"
- "fmla v14.4s, v28.4s, v0.4s\n"
- "ldr s20, [x22, x19]\n"
- "fmla v9.4s, v26.4s, v3.4s\n"
- "fmla v13.4s, v28.4s, v1.4s\n"
- "fmla v25.4s, v26.4s, v5.4s\n"
- "ldr s26, [x20, x17]\n"
- "fmla v22.4s, v28.4s, v2.4s\n"
- "ldr s23, [x9, x11]\n"
- "fmla v15.4s, v20.4s, v0.4s\n"
- "add x9, x9, #4\n"
- "fmla v13.4s, v20.4s, v3.4s\n"
- "fmla v18.4s, v20.4s, v1.4s\n"
- "fmla v22.4s, v20.4s, v4.4s\n"
- "fmla v24.4s, v20.4s, v2.4s\n"
- "fmla v21.4s, v20.4s, v19.4s\n"
- "ldr s27, [x23, x19]\n"
- "fmla v17.4s, v26.4s, v0.4s\n"
- "ldr s20, [x22, x17]\n"
- "fmla v18.4s, v26.4s, v3.4s\n"
- "fmla v25.4s, v26.4s, v1.4s\n"
- "fmla v22.4s, v26.4s, v5.4s\n"
- "fmla v24.4s, v26.4s, v4.4s\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr s19, [x20, x11]\n"
- "fmla v9.4s, v23.4s, v0.4s\n"
- "ldr s28, [x23, x17]\n"
- "fmla v25.4s, v23.4s, v3.4s\n"
- "add x20, x20, #4\n"
- "fmla v24.4s, v23.4s, v5.4s\n"
- "ldr s29, [x22, x11]\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "add x22, x22, #4\n"
- "fmla v22.4s, v27.4s, v1.4s\n"
- "fmla v21.4s, v27.4s, v2.4s\n"
- "fmla v18.4s, v20.4s, v0.4s\n"
- "ldr s30, [x23, x11]\n"
- "fmla v24.4s, v20.4s, v1.4s\n"
- "add x23, x23, #4\n"
- "fmla v22.4s, v20.4s, v3.4s\n"
- "fmla v21.4s, v20.4s, v4.4s\n"
- "fmla v25.4s, v19.4s, v0.4s\n"
- "movi v26.16b, #0\n"
- "fmla v24.4s, v19.4s, v3.4s\n"
- "fmov v27.4s, #6.0\n"
- "fmla v21.4s, v19.4s, v5.4s\n"
- "fmla v22.4s, v28.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v26.4s\n"
- "fmax v11.4s, v11.4s, v26.4s\n"
- "fmla v24.4s, v29.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v26.4s\n"
- "fmla v21.4s, v28.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v27.4s\n"
- "fmin v11.4s, v11.4s, v27.4s\n"
- "fmin v10.4s, v10.4s, v27.4s\n"
- "str s12, [%[outptr0]]\n"
- "fmax v9.4s, v9.4s, v26.4s\n"
- "str s11, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v21.4s, v29.4s, v3.4s\n"
- "str s10, [%[outptr0], x27]\n"
- "fmin v9.4s, v9.4s, v27.4s\n"
- "fmax v8.4s, v8.4s, v26.4s\n"
- "fmax v7.4s, v7.4s, v26.4s\n"
- "str s9, [%[outptr0], x28]\n"
- "fmla v21.4s, v30.4s, v0.4s\n"
- "fmin v8.4s, v8.4s, v27.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmin v7.4s, v7.4s, v27.4s\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "str s8, [x8]\n"
- "fmax v25.4s, v25.4s, v26.4s\n"
- "str s7, [x8, %[output_col_stride1]]\n"
- "fmin v17.4s, v17.4s, v27.4s\n"
- "fmin v25.4s, v25.4s, v27.4s\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "str s17, [x8, x27]\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "str s25, [x8, x28]\n"
- "fmin v16.4s, v16.4s, v27.4s\n"
- "fmin v15.4s, v15.4s, v27.4s\n"
- "add x8, x8, #4\n"
- "str s16, [x25]\n"
- "fmax v18.4s, v18.4s, v26.4s\n"
- "str s15, [x25, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v26.4s\n"
- "fmin v18.4s, v18.4s, v27.4s\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "fmin v24.4s, v24.4s, v27.4s\n"
- "fmax v13.4s, v13.4s, v26.4s\n"
- "str s18, [x25, x27]\n"
- "fmin v14.4s, v14.4s, v27.4s\n"
- "str s24, [x25, x28]\n"
- "fmin v13.4s, v13.4s, v27.4s\n"
- "str s14, [x26]\n"
- "fmax v22.4s, v22.4s, v26.4s\n"
- "str s13, [x26, %[output_col_stride1]]\n"
- "fmax v21.4s, v21.4s, v26.4s\n"
- "fmin v22.4s, v22.4s, v27.4s\n"
- "add x25, x25, #4\n"
- "fmin v21.4s, v21.4s, v27.4s\n"
- "str s22, [x26, x27]\n"
- "str s21, [x26, x28]\n"
- "add x26, x26, #4\n"
- "7:\n"
- : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
- );
-}
-
-#endif // __aarch64__
-
-template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
-
-} // namespace depthwise
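The `movi v26.16b, #0` and `fmov v27.4s, #6.0` constants feeding the `fmax`/`fmin` chains in the kernel above clamp every accumulator to the range [0, 6], i.e. a ReLU6 activation fused into the store path. A minimal intrinsics sketch of that clamp (a standalone illustration rather than part of the deleted kernel; it compiles only where `<arm_neon.h>` is available):

```cpp
#include <arm_neon.h>

// Clamp four packed floats to the ReLU6 range [0, 6], mirroring the
// fmax (with zero) / fmin (with 6.0) pairs in the assembly kernel above.
static inline float32x4_t relu6_f32x4(float32x4_t acc)
{
    const float32x4_t zero = vdupq_n_f32(0.0f); // movi v26.16b, #0
    const float32x4_t six  = vdupq_n_f32(6.0f); // fmov v27.4s, #6.0
    return vminq_f32(vmaxq_f32(acc, zero), six);
}
```

Keeping the clamp in the accumulator registers avoids a second pass over the output tensor.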
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
deleted file mode 100644
index 27bfb843f6..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "impl_dilated.hpp"
-
-template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>;
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
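This translation unit defines nothing of its own: the template bodies live in `impl_dilated.hpp`, and each `template class ...;` line explicitly instantiates one tile/kernel/stride/type combination so the heavy template code is compiled exactly once, in one object file. A generic sketch of the pattern, with hypothetical `Widget` names (the three parts would normally live in separate files, as noted in the comments):

```cpp
// widget.hpp -- declaration only; the definition lives in a private header.
template <int N, typename T>
struct Widget
{
    T apply(T x) const;
};

// widget_impl.hpp -- included by implementation files only.
template <int N, typename T>
T Widget<N, T>::apply(T x) const
{
    return x * static_cast<T>(N);
}

// widget_fp32.cpp -- force code generation for the shipped combinations.
template class Widget<3, float>;
template class Widget<5, float>;
```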
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
deleted file mode 100644
index 1bae815613..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <deque>
-#include <functional>
-#include <memory>
-
-#include "depthwise.hpp"
-
-namespace depthwise
-{
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename TIn, typename TBias, typename TOut
->
-class DilatedDepthwiseConvolution : public IDepthwiseConvolution
-{
- public:
- /** Create a new dilated depthwise convolution engine.
- */
- DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- /** Create a new dilated depthwise convolution engine.
- */
- DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- // Cannot copy or move a DilatedDepthwiseConvolution.
- DilatedDepthwiseConvolution(DilatedDepthwiseConvolution&) = delete;
- DilatedDepthwiseConvolution operator=(DilatedDepthwiseConvolution&) = delete;
-
- /* Set input tensor and stride. */
- void set_input(const void *inptr) override;
- void set_input(const void *inptr, int column_stride) override;
- void set_input(const void *inptr, int row_stride, int column_stride) override;
- void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
-
- /* Set output tensor and stride. */
- void set_output(void *outptr) override;
- void set_output(void *outptr, int column_stride) override;
- void set_output(void *outptr, int row_stride, int column_stride) override;
- void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
-
- static int get_output_size(
- int dim_size,
- unsigned int padding_before,
- unsigned int padding_after,
- int dilation_factor
- );
-
- int output_size(
- int dim_size, unsigned int padding_before, unsigned int padding_after
- ) const override;
-
- /* Weights and biases are re-ordered to improve memory access patterns. Use
-  * these methods to determine the size of the re-pack buffer, and to set the
-  * address of (and implicitly reorder the weights and biases into) that
-  * buffer.
-  */
- size_t get_packed_params_size(void) const override;
- void set_packed_params_buffer(void *) override;
-
- void pack_params(const void *weights, const void *biases=nullptr) const override;
- void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const override;
- void pack_params(
- void *buffer,
- const void* weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const override;
-
- /* Working space is used to pad tensors on the fly. Before running any
-  * inference, check the amount of space required, allocate it, and provide
-  * a pointer to the convolution engine.
-  */
- size_t get_working_space_size(unsigned int nthreads=1) const override;
- void set_working_space(void *) override;
-
- unsigned int get_window(void) const override;
- void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
- protected:
- /** Protected constructor which also accepts a function to construct a new
- * subconvolution
- */
- DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right,
- std::function<IDepthwiseConvolution *(int, int, int, int, int, int, nck::ActivationFunction, unsigned int, unsigned int, unsigned int, unsigned int)> subconvfn
- );
-
- const int _dilation_factor;
- const int _n_input_rows, _n_input_cols, _n_channels;
- const int _padding_top, _padding_left;
- const int _n_output_rows, _n_output_cols;
-
- /* Dilated depthwise convolution is performed through repeated calls to
-  * non-dilated convolutions. If the dilation factor is $n$, then we perform
-  * $n^2$ non-dilated depthwise convolutions, one for each (row, column)
-  * offset within the dilation grid.
-  */
- using BaseDepthwise = DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut
- >;
- std::deque<std::deque<std::unique_ptr<IDepthwiseConvolution>>> _convs;
-};
-
-} // namespace depthwise
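Taken together, the methods declared above define the engine's life cycle: size and attach the packed-parameter buffer, size and attach the working space, point the engine at its tensors, then execute the window. A sketch of that call sequence (the tensor shape, dilation factor and `ReLU6` enumerator are illustrative assumptions; error handling is omitted):

```cpp
#include <cstdint>
#include <vector>

#include "depthwise_dilated.hpp"

// Sketch of the call sequence implied by the interface above.
void run_dilated_conv(const float *weights, const float *biases,
                      const float *input, float *output)
{
    depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1,
                                           float, float, float> conv(
        /*n_batches=*/1, /*n_input_rows=*/56, /*n_input_cols=*/56,
        /*n_channels=*/64, /*dilation_factor=*/2,
        neon_convolution_kernels::ActivationFunction::ReLU6,
        /*padding_top=*/2, /*padding_left=*/2,
        /*padding_bottom=*/2, /*padding_right=*/2);

    // Re-pack the weights and biases into the engine's preferred layout.
    std::vector<uint8_t> params(conv.get_packed_params_size());
    conv.set_packed_params_buffer(params.data());
    conv.pack_params(weights, biases);

    // Scratch space used to pad tensors on the fly (one thread here).
    std::vector<uint8_t> scratch(conv.get_working_space_size(1));
    conv.set_working_space(scratch.data());

    conv.set_input(input);   // dense NHWC strides are derived automatically
    conv.set_output(output);

    // The window is a count of channel blocks; process all of them.
    conv.run(0, conv.get_window(), /*threadid=*/0);
}
```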
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
deleted file mode 100644
index e56583d6b3..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "depthwise_quantized_dilated.hpp"
-#include "impl_dilated.hpp"
-
-namespace depthwise {
-
-template <unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols>
-QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
- KernelCols, StrideRows, StrideCols>::
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right)
- : QAsymm8DilatedDepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
- QAsymm8DilatedDepthwiseConvolution::get_output_size(
- n_input_rows, padding_top, padding_bottom, dilation_factor),
- QAsymm8DilatedDepthwiseConvolution::get_output_size(
- n_input_cols, padding_left, padding_right, dilation_factor),
- activation, weight_quantisation, input_quantisation,
- output_quantisation, padding_top, padding_left, padding_bottom,
- padding_right) {}
-
-template <unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols>
-QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
- KernelCols, StrideRows, StrideCols>::
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right)
- : QAsymm8DilatedDepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
- n_output_rows, n_output_cols, activation, weight_quantisation,
- input_quantisation, output_quantisation,
- qasymm8::QAsymm8RescaleParams::make_rescale_params(
- weight_quantisation, input_quantisation, output_quantisation),
- padding_top, padding_left, padding_bottom, padding_right) {}
-
-template <unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols>
-QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
- KernelCols, StrideRows, StrideCols>::
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- const qasymm8::QAsymm8RescaleParams &rescale_parameters,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right)
- : QAsymm8DilatedDepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
- QAsymm8DilatedDepthwiseConvolution::get_output_size(
- n_input_rows, padding_top, padding_bottom, dilation_factor),
- QAsymm8DilatedDepthwiseConvolution::get_output_size(
- n_input_cols, padding_left, padding_right, dilation_factor),
- activation, weight_quantisation, input_quantisation,
- output_quantisation, rescale_parameters, padding_top, padding_left,
- padding_bottom, padding_right) {}
-
-template <unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols>
-QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
- KernelCols, StrideRows, StrideCols>::
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- const qasymm8::QAsymm8RescaleParams &rescale_parameters,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right)
- : DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
- KernelCols, StrideRows, StrideCols, uint8_t,
- int32_t, uint8_t>(
- n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
- n_output_rows, n_output_cols, activation, padding_top, padding_left,
- padding_bottom, padding_right,
- [weight_quantisation, input_quantisation, output_quantisation,
- rescale_parameters](
- const int n_batches, const int n_input_rows,
- const int n_input_cols, const int n_channels,
- const int n_output_rows, const int n_output_cols,
- const nck::ActivationFunction activation,
- const unsigned int padding_top, const unsigned int padding_left,
- const unsigned int padding_bottom,
- const unsigned int padding_right) -> IDepthwiseConvolution * {
- return new QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols,
- StrideRows, StrideCols>(
- n_batches, n_input_rows, n_input_cols, n_channels,
- n_output_rows, n_output_cols, activation, weight_quantisation,
- input_quantisation, output_quantisation, rescale_parameters,
- padding_top, padding_left, padding_bottom, padding_right);
- }) {}
-
-} // namespace depthwise
-
-template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>;
-template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>;
-template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 1, 1>;
-template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 2, 2>;
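Each constructor above delegates down to the plain `DilatedDepthwiseConvolution`, whose header comment describes the decomposition into $n^2$ non-dilated subconvolutions. A scalar sketch of why the decomposition holds: with dilation factor `d`, every tap read for output pixel `(r, c)` lies at `(r + i*d, c + j*d)`, so all taps share the residues `(r % d, c % d)` and the image splits into `d*d` interleaved sub-images, each handled by a non-dilated convolution. Single channel and 'valid' padding are assumed to keep the sketch short:

```cpp
#include <cstddef>
#include <vector>

// A dilation-d depthwise convolution touches, for output pixel (r, c),
// only the inputs (r + i*d, c + j*d). Every tap stays on the residue
// class (r % d, c % d), which is exactly the non-dilated sub-problem the
// dilated engines solve d*d times.
std::vector<float> dilated_conv_valid(const std::vector<float> &in,
                                      int rows, int cols,
                                      const float (&k)[3][3], int d)
{
    const int out_rows = rows - 2 * d; // 3x3 kernel, 'valid' padding
    const int out_cols = cols - 2 * d;
    std::vector<float> out(static_cast<std::size_t>(out_rows) * out_cols, 0.0f);
    for (int r = 0; r < out_rows; r++)
        for (int c = 0; c < out_cols; c++)
            for (int i = 0; i < 3; i++)
                for (int j = 0; j < 3; j++)
                    out[static_cast<std::size_t>(r) * out_cols + c] +=
                        k[i][j] * in[static_cast<std::size_t>(r + i * d) * cols
                                     + (c + j * d)];
    return out;
}
```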
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp
deleted file mode 100644
index 99f0f53792..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp16_fp16.hpp"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace depthwise
-{
-template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>;
-template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>;
-template class DepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>;
-template class DepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>;
-} // namespace depthwise
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp
deleted file mode 100644
index c13dd70a61..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>;
-template class DepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>;
-template class DepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>;
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
deleted file mode 100644
index bddae51135..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "impl_base.hpp"
-
-// TODO Move to common utilities somewhere
-template <size_t Size> struct DType { };
-template <> struct DType<1> { using scalar_type = uint8_t; };
-template <> struct DType<2> { using scalar_type = uint16_t; };
-template <> struct DType<4> { using scalar_type = uint32_t; };
-
-namespace depthwise
-{
-
-template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
-void PackParameters<KernelRows, KernelColumns, WeightSize, BiasSize>::execute(
- unsigned int n_channels,
- void *buffer,
- const void *weights,
- const unsigned int weight_row_stride,
- const unsigned int weight_col_stride,
- const void *biases
-)
-{
- using TWeight = typename DType<WeightSize>::scalar_type;
- using TBias = typename DType<BiasSize>::scalar_type;
-
- auto buffer_ptr = static_cast<uint8_t *>(buffer);
- auto weights_ptr = static_cast<const TWeight *>(weights);
- auto biases_ptr = static_cast<const TBias *>(biases);
-
- const unsigned int veclen = 16 / WeightSize;
- for (; n_channels >= veclen; n_channels -= veclen)
- {
- // Copy biases
- for (unsigned int i = 0; i < veclen; i++)
- {
- auto ptr = reinterpret_cast<TBias *>(buffer_ptr);
- *ptr = (biases_ptr == nullptr) ? 0x0 : *(biases_ptr++);
- buffer_ptr += BiasSize;
- }
-
- // Copy weights
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelColumns; j++)
- {
- for (unsigned int c = 0; c < veclen; c++)
- {
- *(reinterpret_cast<TWeight *>(buffer_ptr)) = weights_ptr[i*weight_row_stride + j*weight_col_stride + c];
- buffer_ptr += WeightSize;
- }
- }
- }
- weights_ptr += veclen;
- }
- for (; n_channels; n_channels--)
- {
- // Copy bias
- auto ptr = reinterpret_cast<TBias *>(buffer_ptr);
- *ptr = (biases_ptr == nullptr) ? 0x0 : *(biases_ptr++);
- buffer_ptr += BiasSize;
-
- // Copy weights
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelColumns; j++)
- {
- *(reinterpret_cast<TWeight *>(buffer_ptr)) = weights_ptr[i*weight_row_stride + j*weight_col_stride];
- buffer_ptr += WeightSize;
- }
- }
- weights_ptr++;
- }
-}
-
-template struct PackParameters<3, 3, 2ul, 2ul>;
-template struct PackParameters<3, 3, 4ul, 4ul>;
-template struct PackParameters<5, 5, 2ul, 2ul>;
-template struct PackParameters<5, 5, 4ul, 4ul>;
-} // namespace depthwise
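`PackParameters<...>::execute` interleaves, for each block of `veclen = 16 / WeightSize` channels, first the biases (zero-filled when `biases` is null) and then the `KernelRows * KernelColumns` weights of each channel. Whatever the blocking, the buffer holds exactly one bias plus one kernel's worth of weights per channel, so a caller could size it as follows (hypothetical helper, agreeing with `get_packed_params_size()` in `impl_base.hpp` further down):

```cpp
#include <cstddef>

// Size of the re-pack buffer written by PackParameters: one bias plus
// KernelRows * KernelColumns weights per channel, independent of the
// channel blocking used while copying.
constexpr std::size_t packed_params_size(std::size_t n_channels,
                                         std::size_t kernel_rows,
                                         std::size_t kernel_cols,
                                         std::size_t weight_size,
                                         std::size_t bias_size)
{
    return n_channels * (bias_size + kernel_rows * kernel_cols * weight_size);
}

// Example: 64 channels of a 3x3 fp32 kernel with fp32 biases.
static_assert(packed_params_size(64, 3, 3, 4, 4) == 64 * (4 + 9 * 4),
              "one bias word plus nine weight words per channel");
```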
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
deleted file mode 100644
index b09f620475..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_qa8_qa8.hpp"
-
-namespace depthwise
-{
-template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>;
-template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>;
-template class QAsymm8DepthwiseConvolution<2, 2, 5, 5, 1, 1>;
-template class QAsymm8DepthwiseConvolution<2, 2, 5, 5, 2, 2>;
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
deleted file mode 100644
index 1ae48b9417..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_qa8_qs8_per_channel.hpp"
-
-namespace depthwise {
-template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>;
-template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>;
-template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>;
-template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>;
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
deleted file mode 100644
index 4343f6ad45..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "depthwise.hpp"
-#include "qasymm8.hpp"
-#include "qsymm8.hpp"
-
-using namespace neon_convolution_kernels;
-using namespace qasymm8;
-
-inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32x4_t& b)
-{
- return vqrdmulhq_s32(a, b);
-}
-
-inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
-{
- return vqrdmulhq_n_s32(a, b);
-}
-
-inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
-{
- return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
-}
-
-inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift)
-{
- const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
- const int32x4_t fixed = vqaddq_s32(x, fixup);
- return vrshlq_s32(fixed, shift);
-}
-
-inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
-{
- const int32x4_t shift = vdupq_n_s32(-exponent);
- const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
- const int32x4_t fixed = vqaddq_s32(x, fixup);
- return vrshlq_s32(fixed, shift);
-}
-
-inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
-{
- const int32x2_t shift = vdup_n_s32(-exponent);
- const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
- const int32x2_t fixed = vqadd_s32(x, fixup);
- return vrshl_s32(fixed, shift);
-}
-
-inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
-{
- const int32x2_t xs = vdup_n_s32(x);
- return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
-}
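Chained together, the two helpers above perform the usual fixed-point requantisation of an `int32` accumulator: a saturating rounding doubling high multiply (`vqrdmulh`) followed by a rounding divide by a power of two, rounding to nearest with ties away from zero. A portable scalar restatement, in the gemmlowp style rather than the NEON code itself (an arithmetic right shift of negative values is assumed, as C++20 guarantees):

```cpp
#include <cstdint>
#include <limits>

// Scalar vqrdmulh: sat((2*a*b + 2^31) >> 32).
int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    // The doubling multiply overflows only for (INT32_MIN, INT32_MIN).
    if (a == std::numeric_limits<int32_t>::min() && a == b)
    {
        return std::numeric_limits<int32_t>::max();
    }
    const int64_t prod = 2 * static_cast<int64_t>(a) * static_cast<int64_t>(b)
                       + (int64_t{1} << 31); // round to nearest
    return static_cast<int32_t>(prod >> 32);
}

// Scalar counterpart of the fixup + vrshl sequence above: divide by
// 2^exponent, rounding to nearest with ties away from zero.
int32_t rounding_divide_by_exp2_scalar(int32_t x, int exponent)
{
    const int32_t mask      = (int32_t{1} << exponent) - 1;
    const int32_t remainder = x & mask; // non-negative in two's complement
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// Requantise an accumulator given a (multiplier, right_shift) pair.
int32_t requantize(int32_t acc, int32_t multiplier, int right_shift)
{
    return rounding_divide_by_exp2_scalar(
        sat_rounding_doubling_high_mul(acc, multiplier), right_shift);
}
```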
-
-namespace depthwise
-{
-
-namespace nck = neon_convolution_kernels;
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- uint8_t, int32_t, uint8_t,
- QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
->
-{
- using Base = DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- uint8_t, int32_t, uint8_t,
- QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
- >;
- friend Base;
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- const qasymm8::QAsymm8RescaleParams& rescale_parameters,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- const qasymm8::QAsymm8RescaleParams& rescale_parameters,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- protected:
- uint8_t _input_padding_value(void) const;
-
- void _pack_params(
- void *buffer,
- const void *weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const;
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- uint8_t* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-
- private:
- // Quantization parameters
- const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant;
- const qasymm8::QAsymm8RescaleParams rescale_parameters;
-};
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-class QSymm8HybridPerChannelDepthwiseConvolution : public DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- uint8_t, int32_t, uint8_t,
- QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
->
-{
- using Base = DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- uint8_t, int32_t, uint8_t,
- QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
- >;
- friend Base;
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- QSymm8HybridPerChannelDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- const qsymm8::QSymm8PerChannelParams& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- QSymm8HybridPerChannelDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- const qsymm8::QSymm8PerChannelParams& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- const qsymm8::QSymm8PerChannelRescaleParams& rescale_parameters,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- size_t get_packed_params_size(void) const override
- {
- return this->n_channels() * (sizeof(int8_t)*KernelRows*KernelCols + 3*sizeof(int32_t));
- }
-
- protected:
- uint8_t _input_padding_value(void) const;
-
- void _pack_params(
- void *buffer,
- const void *weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const;
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- uint8_t* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-
- private:
- // Quantization parameters
- const qsymm8::QSymm8PerChannelParams _weights_quant;
- const qasymm8::QAsymm8Params _input_quant, _output_quant;
- const qsymm8::QSymm8PerChannelRescaleParams _rescale_parameters;
-};
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
deleted file mode 100644
index a11b0981c9..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "depthwise_dilated.hpp"
-#include "depthwise_quantized.hpp"
-
-namespace depthwise {
-
-template <unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols>
-class QAsymm8DilatedDepthwiseConvolution
- : public DilatedDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
- StrideCols, uint8_t, int32_t, uint8_t> {
-public:
- /** Create a new dilated depthwise convolution engine.
- */
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right);
-
- /** Create a new dilated depthwise convolution engine.
- */
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right);
-
- /** Create a new dilated depthwise convolution engine.
- */
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- const qasymm8::QAsymm8RescaleParams &rescale_parameters,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right);
-
- /** Create a new dilated depthwise convolution engine.
- */
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- const qasymm8::QAsymm8RescaleParams& rescale_parameters,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right);
-};
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp
deleted file mode 100644
index 266d13d6fc..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp
+++ /dev/null
@@ -1,505 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- * NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include <algorithm>
-#include <cstdint>
-#include "depthwise.hpp"
-#include "padding.hpp"
-#include "utils.hpp"
-
-#pragma once
-
-#define MEMBERFN(TOUT) template <\
- unsigned int OutputTileRows, unsigned int OutputTileColumns,\
- unsigned int KernelRows, unsigned int KernelColumns,\
- unsigned int StrideRows, unsigned int StrideColumns,\
- typename TIn, typename TBias, typename TOut,\
- typename Derived\
-> TOUT DepthwiseConvolutionBase<\
- OutputTileRows, OutputTileColumns,\
- KernelRows, KernelColumns,\
- StrideRows, StrideColumns,\
- TIn, TBias, TOut, Derived\
->
-
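`MEMBERFN` is pure boilerplate compression: it supplies the ten-parameter template prolog and the qualified class name for every out-of-line member definition in this file. For example, `MEMBERFN(int)::get_output_size(...)` below expands (modulo whitespace) to:

```cpp
template <
    unsigned int OutputTileRows, unsigned int OutputTileColumns,
    unsigned int KernelRows, unsigned int KernelColumns,
    unsigned int StrideRows, unsigned int StrideColumns,
    typename TIn, typename TBias, typename TOut,
    typename Derived
> int DepthwiseConvolutionBase<
    OutputTileRows, OutputTileColumns,
    KernelRows, KernelColumns,
    StrideRows, StrideColumns,
    TIn, TBias, TOut, Derived
>::get_output_size(
    const int dim_size, const unsigned int padding_before, const unsigned int padding_after
)
{
    return iceildiv(dim_size + padding_before + padding_after - KernelRows + 1, StrideRows);
}
```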
-using namespace neon_convolution_kernels;
-
-namespace depthwise
-{
-
-template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
-struct PackParameters
-{
- static void execute(
- unsigned int n_channels,
- void *buffer,
- const void *weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases
- );
-};
-
-const unsigned int CHANNEL_BLOCK = 16;
-
-MEMBERFN(int)::get_output_size(
- const int dim_size, const unsigned int padding_before, const unsigned int padding_after
-)
-{
- return iceildiv(dim_size + padding_before + padding_after - KernelRows + 1, StrideRows);
-}
-
-MEMBERFN(int)::output_size(
- const int dim_size, const unsigned int padding_before, const unsigned int padding_after
-) const
-{
- return get_output_size(dim_size, padding_before, padding_after);
-}
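The expression is the usual output-size computation: the number of valid kernel placements across the padded input, divided by the stride, rounding up. A standalone restatement with two hand-checkable cases (`iceildiv(a, b)` is taken to be `(a + b - 1) / b`):

```cpp
// Standalone restatement of get_output_size() for checking by hand:
// ceil((dim + pad_before + pad_after - kernel + 1) / stride).
constexpr int output_size(int dim, int pad_before, int pad_after,
                          int kernel, int stride)
{
    return (dim + pad_before + pad_after - kernel + 1 + stride - 1) / stride;
}

// 56-row input, 3x3 kernel, padding 1 on both sides, stride 1 -> 56.
static_assert(output_size(56, 1, 1, 3, 1) == 56, "SAME-style padding");
// 56-row input, 3x3 kernel, no padding, stride 2 -> 27.
static_assert(output_size(56, 0, 0, 3, 2) == 27, "strided valid output");
```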
-
-MEMBERFN()::DepthwiseConvolutionBase(
- const int n_batches,
- const int n_input_rows,
- const int n_input_cols,
- const int n_channels,
- ActivationFunction activation,
- const unsigned int padding_top,
- const unsigned int padding_left,
- const unsigned int padding_bottom,
- const unsigned int padding_right
-) : DepthwiseConvolutionBase(
- n_batches, n_input_rows, n_input_cols, n_channels,
- get_output_size(n_input_rows, padding_top, padding_bottom),
- get_output_size(n_input_cols, padding_left, padding_right),
- activation,
- padding_top, padding_left, padding_bottom, padding_right
- )
-{
-}
-
-MEMBERFN()::DepthwiseConvolutionBase(
- const int n_batches,
- const int n_input_rows,
- const int n_input_cols,
- const int n_channels,
- const int n_output_rows,
- const int n_output_cols,
- ActivationFunction activation,
- const unsigned int padding_top,
- const unsigned int padding_left,
- const unsigned int padding_bottom,
- const unsigned int padding_right
-) : _input(nullptr), _output(nullptr),
- _packed_parameters(nullptr),
- _working_space(nullptr),
- _n_batches(n_batches),
- _n_input_rows(n_input_rows),
- _n_input_cols(n_input_cols),
- _n_channels(n_channels),
- _n_output_rows(n_output_rows),
- _n_output_cols(n_output_cols),
- _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)),
- _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)),
- _padding_top(padding_top),
- _padding_left(padding_left),
- _padding_bottom(padding_bottom),
- _padding_right(padding_right),
- _activation(activation),
- _input_col_stride(0), _input_row_stride(0), _input_batch_stride(0),
- _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0)
-{
-}
-
-MEMBERFN(void)::set_input(const void* const inptr)
-{
- set_input(inptr, _n_channels);
-}
-
-MEMBERFN(void)::set_input(const void* const inptr, const int ld_col)
-{
- set_input(inptr, _n_input_cols * ld_col, ld_col);
-}
-
-MEMBERFN(void)::set_input(const void* const inptr, const int ld_row, const int ld_col)
-{
- set_input(inptr, _n_input_rows * ld_row, ld_row, ld_col);
-}
-
-MEMBERFN(void)::set_input(const void* const inptr, const int ld_batch, const int ld_row, const int ld_col)
-{
- _input = static_cast<const TIn *>(inptr);
- _input_batch_stride = ld_batch;
- _input_row_stride = ld_row;
- _input_col_stride = ld_col;
-}
-
-MEMBERFN(void)::set_output(void* const outptr)
-{
- set_output(outptr, _n_channels);
-}
-
-MEMBERFN(void)::set_output(void* const outptr, const int ld_col)
-{
- set_output(outptr, _n_output_cols * ld_col, ld_col);
-}
-
-MEMBERFN(void)::set_output(void* const outptr, const int ld_row, const int ld_col)
-{
- set_output(outptr, _n_output_rows * ld_row, ld_row, ld_col);
-}
-
-MEMBERFN(void)::set_output(void* const outptr, const int ld_batch, const int ld_row, const int ld_col)
-{
- _output = static_cast<TOut *>(outptr);
- _output_batch_stride = ld_batch;
- _output_row_stride = ld_row;
- _output_col_stride = ld_col;
-}
-
-MEMBERFN(size_t)::get_packed_params_size(void) const
-{
- return _n_channels * (sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias));
-}
-
-MEMBERFN(void)::set_packed_params_buffer(void *buffer)
-{
- _packed_parameters = buffer;
-}
-
-MEMBERFN(void)::pack_params(const void *weights, const void *biases) const
-{
- static_cast<const Derived *>(this)->pack_params(_packed_parameters, weights, biases);
-}
-
-MEMBERFN(void)::pack_params(void *buffer, const void *weights, const void *biases) const
-{
- const unsigned int weight_col_stride = _n_channels;
- const unsigned int weight_row_stride = KernelColumns * weight_col_stride;
- static_cast<const Derived *>(this)->pack_params(
- buffer, weights, weight_row_stride, weight_col_stride, biases
- );
-}
-
-MEMBERFN(void)::pack_params(
- void * const buffer,
- const void * const weights,
- const unsigned int weight_row_stride,
- const unsigned int weight_col_stride,
- const void * const biases
-) const
-{
- static_cast<const Derived *>(this)->_pack_params(
- buffer, weights, weight_row_stride, weight_col_stride, biases
- );
-}
-
-MEMBERFN(void)::_pack_params(
- void * const buffer,
- const void * const weights,
- const unsigned int weight_row_stride,
- const unsigned int weight_col_stride,
- const void * const biases
-) const
-{
- // Default implementation
- PackParameters<KernelRows, KernelColumns, sizeof(TIn), sizeof(TOut)>::execute(
- _n_channels, buffer, weights, weight_row_stride, weight_col_stride, biases
- );
-}
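The `static_cast<const Derived *>(this)` calls above are CRTP dispatch: `DepthwiseConvolutionBase` is parameterised on its own subclass, so a kernel such as `QAsymm8DepthwiseConvolution` can supply its own `_pack_params` without any virtual call. A minimal sketch of the pattern, with hypothetical names:

```cpp
#include <cstdio>

// Minimal CRTP sketch of the dispatch used by DepthwiseConvolutionBase:
// the base calls into Derived statically, so overriding _pack_params in a
// subclass costs no virtual dispatch.
template <typename Derived>
struct ConvBase
{
    void pack_params() const
    {
        static_cast<const Derived *>(this)->_pack_params();
    }
    void _pack_params() const { std::puts("default packing"); }
};

struct QuantizedConv : ConvBase<QuantizedConv>
{
    void _pack_params() const { std::puts("quantized packing"); }
};

int main()
{
    QuantizedConv conv;
    conv.pack_params(); // prints "quantized packing"
}
```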
-
-MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
-{
- return nthreads * (
- _get_input_working_space_size() + _get_output_working_space_size()
- );
-}
-
-MEMBERFN(void)::set_working_space(void *buffer)
-{
- _working_space = buffer;
-}
-
-MEMBERFN(size_t)::_get_input_working_space_size(void) const
-{
- return sizeof(TIn) * _n_channels;
-}
-
-MEMBERFN(size_t)::_get_output_working_space_size(void) const
-{
- return sizeof(TOut) * _n_channels;
-}
-
-MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const
-{
- return static_cast<uint8_t*>(_working_space) + threadid * (
- _get_input_working_space_size() + _get_output_working_space_size()
- );
-}
-
-MEMBERFN(void *)::_get_output_working_space(const unsigned int threadid) const
-{
- return static_cast<uint8_t*>(_get_input_working_space(threadid)) + _get_input_working_space_size();
-}
-
-MEMBERFN(unsigned int)::get_window() const
-{
- // Parallelise over blocks of channels.
- return iceildiv(_n_channels, CHANNEL_BLOCK);
-}
-
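[Note on the scheduling contract above: get_window() advertises iceildiv(_n_channels, CHANNEL_BLOCK) units of work, and a caller hands each thread a half-open range [start, stop) of those units, which run() maps back to a channel range. A minimal standalone sketch of that arithmetic; the CHANNEL_BLOCK value of 32 is assumed purely for illustration, the real constant is defined in impl_base.hpp.]

    #include <algorithm>
    #include <cstdio>

    constexpr unsigned int kChannelBlock = 32;  // assumed value, illustration only

    unsigned int iceildiv(unsigned int a, unsigned int b)
    {
      return (a + b - 1) / b;  // ceiling division, as used by get_window()
    }

    int main()
    {
      const unsigned int n_channels = 70;
      const unsigned int n_units = iceildiv(n_channels, kChannelBlock);  // 3 units
      // A thread given [start, stop) = [1, 3) processes channels [32, 70).
      const unsigned int start = 1, stop = 3;
      const unsigned int first = kChannelBlock * start;
      const unsigned int last  = std::min(n_channels, kChannelBlock * stop);
      std::printf("%u units; this thread: channels [%u, %u)\n", n_units, first, last);
      return 0;
    }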
-MEMBERFN(void)::run(
- const unsigned int start,
- const unsigned int stop,
- const unsigned int threadid
-)
-{
- // Clear the input padding buffer
- TIn *buf = static_cast<TIn *>(_get_input_working_space(threadid));
- const TIn pad_value = static_cast<Derived *>(this)->_input_padding_value();
- for (int n = 0; n < _n_channels; n++)
- {
- buf[n] = pad_value;
- }
-
- // Parallelise over blocks of channels
- const auto start_channel = CHANNEL_BLOCK * start;
- const auto stop_channel = std::min<unsigned int>(_n_channels, CHANNEL_BLOCK * stop);
- const auto params_size_per_channel = this->get_packed_params_size()/_n_channels;
-
- // Compute top and bottom padding for input and output
- const int input_pad_top = _padding_top;
- const int input_pad_left = _padding_left;
- constexpr int tile_overlap = kernel_rows - stride_rows;
-
- // Perform the convolution by calling `process_tile_row` for each tile row in
- // each batch.
- for (int batch = 0; batch < _n_batches; batch++)
- {
- const TIn* const inptr_batch = _input + batch*_input_batch_stride;
- TOut* const outptr_batch = _output + batch*_output_batch_stride;
-
- // Loop over rows of tiles
- for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++)
- {
- // Pointer to the row
- const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top;
- const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*_input_row_stride);
- TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride;
-
- // Input padding (top + bottom) for the row
- const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top;
- const int input_row_bottom = input_row_top + inner_tile_rows;
- const int input_row_pad_top = (tile_i == 0) ? input_pad_top : 0;
- const int input_row_pad_bottom = std::max(0, input_row_bottom - _n_input_rows);
-
- // Output padding (bottom) for the row
- const int output_row_bottom = (tile_i + 1)*output_tile_rows;
- const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows);
-
- // Get the offset into the packed parameters
- const auto params_ptr = static_cast<const uint8_t*>(_packed_parameters) +
- start_channel*params_size_per_channel;
-
- // Process the row
- process_tile_row(
- threadid,
- stop_channel - start_channel,
- params_ptr,
- inptr_row + start_channel,
- outptr_row + start_channel,
- input_row_pad_top, input_pad_left, input_row_pad_bottom,
- output_row_pad_bottom,
- _n_tile_cols, _n_input_cols, _n_output_cols
- );
- }
- }
-}
-
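[The per-row padding book-keeping in run() is easier to see in isolation: each tile row advances by (inner_tile_rows - tile_overlap) input rows, only the first row absorbs the top padding, and bottom padding is whatever part of the inner tile falls past the end of the input. A small sketch of just that computation, with illustrative geometry (3x3 kernel, stride 1, 4-row inner tile):]

    #include <algorithm>
    #include <cstdio>

    int main()
    {
      const int inner_tile_rows = 4, kernel_rows = 3, stride_rows = 1;
      const int tile_overlap = kernel_rows - stride_rows;  // rows shared by tiles
      const int pad_top = 1, n_input_rows = 7;

      for (int tile_i = 0; tile_i < 4; tile_i++)
      {
        const int row_top    = tile_i * (inner_tile_rows - tile_overlap) - pad_top;
        const int row_bottom = row_top + inner_tile_rows;
        const int pad_row_top    = (tile_i == 0) ? pad_top : 0;
        const int pad_row_bottom = std::max(0, row_bottom - n_input_rows);
        std::printf("tile %d: rows [%d, %d), pad top %d, pad bottom %d\n",
                    tile_i, row_top, row_bottom, pad_row_top, pad_row_bottom);
      }
      return 0;
    }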
-MEMBERFN(void)::process_tile_row(
- const unsigned int threadid,
- const int n_channels,
- const void* const packed_params,
- const TIn* const inptr,
- TOut* const outptr,
- const int row_pad_in_top,
- const int row_pad_in_left,
- const int row_pad_in_bottom,
- const int row_pad_out_bottom,
- const int n_tiles,
- const int n_input_cols,
- const int n_output_cols
-)
-{
- constexpr int tile_overlap = kernel_cols - stride_cols;
-
- // Loop over columns of tiles
- for (int tile_j = 0; tile_j < n_tiles; tile_j++)
- {
- // Input padding (left + right) for the tile
- const int t_pad_in_left = (tile_j == 0) ? row_pad_in_left : 0;
- const int t_in_start = tile_j*(inner_tile_cols - tile_overlap) - row_pad_in_left;
- const int t_in_end = t_in_start + inner_tile_cols;
- const int t_pad_in_right = std::max(0, t_in_end - n_input_cols);
-
- // Output padding (right) for the tile
- const int t_out_end = (tile_j + 1) * output_tile_cols;
- const int t_pad_out_right = std::max(0, t_out_end - n_output_cols);
-
- // Get pointers into the inputs and outputs
- const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left;
- const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*_input_col_stride);
- TOut* const outptr_col = outptr + tile_j * output_tile_cols * _output_col_stride;
-
- // Process just this tile
- process_tile(
- threadid, n_channels, packed_params, inptr_col, outptr_col,
- row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right, // Input paddings
- row_pad_out_bottom, t_pad_out_right // Output paddings
- );
- }
-}
-
-MEMBERFN(TIn)::_input_padding_value(void) const
-{
- return static_cast<TIn>(0);
-}
-
-MEMBERFN(void)::process_tile(
- const unsigned int threadid,
- const int n_channels,
- const void* const packed_params,
- const TIn* const inptr,
- TOut* const outptr,
- const int pad_in_top,
- const int pad_in_left,
- const int pad_in_bottom,
- const int pad_in_right,
- const int pad_out_bottom,
- const int pad_out_right
-)
-{
- Derived * dthis = static_cast<Derived *>(this);
- const bool pad_input = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right;
- const bool pad_output = pad_out_bottom || pad_out_right;
-
- if (!pad_input && !pad_output)
- {
- switch(_activation)
- {
- case ActivationFunction::ReLU:
- dthis->template execute_tile<ActivationFunction::ReLU>(
- n_channels, packed_params,
- inptr, _input_row_stride, _input_col_stride,
- outptr, _output_row_stride, _output_col_stride
- );
- break;
- case ActivationFunction::ReLU6:
- dthis->template execute_tile<ActivationFunction::ReLU6>(
- n_channels, packed_params,
- inptr, _input_row_stride, _input_col_stride,
- outptr, _output_row_stride, _output_col_stride
- );
- break;
- default:
- dthis->template execute_tile<ActivationFunction::None>(
- n_channels, packed_params,
- inptr, _input_row_stride, _input_col_stride,
- outptr, _output_row_stride, _output_col_stride
- );
- break;
- }
- }
- else
- {
- // Create arrays of input and output pointers, pointing padded elements to
- // the working space padding buffers provided.
- const TIn *inptrs[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- if (i < pad_in_top || (inner_tile_rows - pad_in_bottom) <= i ||
- j < pad_in_left || (inner_tile_cols - pad_in_right) <= j)
- {
- // Padded input
- inptrs[i][j] = static_cast<const TIn *>(_get_input_working_space(threadid));
- }
- else
- {
- inptrs[i][j] = inptr + (i - pad_in_top)*_input_row_stride + (j - pad_in_left)*_input_col_stride;
- }
- }
- }
-
- TOut *outptrs[output_tile_rows][output_tile_cols];
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- if (i < (output_tile_rows - pad_out_bottom) &&
- j < (output_tile_cols - pad_out_right))
- {
- outptrs[i][j] = outptr + i*_output_row_stride + j*_output_col_stride;
- }
- else
- {
- outptrs[i][j] = static_cast<TOut *>(_get_output_working_space(threadid));
- }
- }
- }
-
- switch(_activation)
- {
- case ActivationFunction::ReLU:
- dthis->template execute_tile<ActivationFunction::ReLU>(
- n_channels, packed_params, inptrs, outptrs
- );
- break;
- case ActivationFunction::ReLU6:
- dthis->template execute_tile<ActivationFunction::ReLU6>(
- n_channels, packed_params, inptrs, outptrs
- );
- break;
- default:
- dthis->template execute_tile<ActivationFunction::None>(
- n_channels, packed_params, inptrs, outptrs
- );
- break;
- }
- }
-}
-
-MEMBERFN(int)::n_channels(void) const
-{
- return _n_channels;
-}
-
-} // namespace depthwise
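[To summarise the padding strategy of the file above: rather than materialising a padded copy of the input, process_tile builds a grid of element pointers and redirects every out-of-bounds entry at a shared, pre-filled padding buffer (and, symmetrically, discards out-of-bounds outputs into scratch space). A self-contained sketch of the input side, simplified to one channel and a 4x4 inner tile; in the real code each pointer addresses _n_channels contiguous elements:]

    #include <cstdio>

    int main()
    {
      constexpr int tile = 4;
      float image[6][6] = {};            // stand-in input tensor
      float pad_buffer[1] = {0.0f};      // shared, pre-filled padding element

      const int pad_top = 1, pad_left = 1, pad_bottom = 0, pad_right = 0;
      const float *ptrs[tile][tile];
      for (int i = 0; i < tile; i++)
      {
        for (int j = 0; j < tile; j++)
        {
          // Out-of-bounds positions point at the padding buffer instead.
          const bool padded = i < pad_top || tile - pad_bottom <= i ||
                              j < pad_left || tile - pad_right <= j;
          ptrs[i][j] = padded ? pad_buffer
                              : &image[i - pad_top][j - pad_left];
        }
      }
      std::printf("ptrs[0][0] is %s\n",
                  ptrs[0][0] == pad_buffer ? "the padding buffer" : "real data");
      return 0;
    }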
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
deleted file mode 100644
index 4130188187..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
+++ /dev/null
@@ -1,295 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "depthwise_dilated.hpp"
-#include "utils.hpp"
-
-#define MEMBERFN(TOUT) \
- template <unsigned int OutputTileRows, unsigned int OutputTileColumns, \
- unsigned int KernelRows, unsigned int KernelColumns, \
- unsigned int StrideRows, unsigned int StrideColumns, typename TIn, \
- typename TBias, typename TOut> \
- TOUT DilatedDepthwiseConvolution<OutputTileRows, OutputTileColumns, \
- KernelRows, KernelColumns, StrideRows, \
- StrideColumns, TIn, TBias, TOut>
-
-namespace depthwise {
-
-MEMBERFN()
-::DilatedDepthwiseConvolution(const int n_batches, const int n_input_rows,
- const int n_input_cols, const int n_channels,
- const int dilation_factor,
- nck::ActivationFunction activation,
- const unsigned int padding_top,
- const unsigned int padding_left,
- const unsigned int padding_bottom,
- const unsigned int padding_right)
- : DilatedDepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
- DilatedDepthwiseConvolution::get_output_size(
- n_input_rows, padding_top, padding_bottom, dilation_factor),
- DilatedDepthwiseConvolution::get_output_size(
- n_input_cols, padding_left, padding_right, dilation_factor),
- activation, padding_top, padding_left, padding_bottom,
- padding_right) {}
-
-MEMBERFN()
-::DilatedDepthwiseConvolution(const int n_batches, const int n_input_rows,
- const int n_input_cols, const int n_channels,
- const int dilation_factor,
- const int n_output_rows, const int n_output_cols,
- nck::ActivationFunction activation,
- const unsigned int padding_top,
- const unsigned int padding_left,
- const unsigned int, // padding_bottom
- const unsigned int // padding_right
- )
- : DilatedDepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
- n_output_rows, n_output_cols, activation, padding_top, padding_left,
- 0, 0,
- // Function which creates a new (standard) depthwise convolution
- [](const int n_batches, const int n_input_rows,
- const int n_input_cols, const int n_channels,
- const int n_output_rows, const int n_output_cols,
- const nck::ActivationFunction activation,
- const unsigned int padding_top, const unsigned int padding_left,
- const unsigned int padding_bottom,
- const unsigned int padding_right) -> IDepthwiseConvolution * {
- return new DepthwiseConvolution<
- OutputTileRows, OutputTileColumns, KernelRows, KernelColumns,
- StrideRows, StrideColumns, TIn, TBias, TOut>(
- n_batches, n_input_rows, n_input_cols, n_channels,
- n_output_rows, n_output_cols, activation, padding_top,
- padding_left, padding_bottom, padding_right);
- }) {}
-
-MEMBERFN()
-::DilatedDepthwiseConvolution(
- const int n_batches, const int n_input_rows, const int n_input_cols,
- const int n_channels, const int dilation_factor, const int n_output_rows,
- const int n_output_cols, nck::ActivationFunction activation,
- const unsigned int padding_top, const unsigned int padding_left,
- const unsigned int, // padding_bottom
- const unsigned int, // padding_right
- std::function<IDepthwiseConvolution *(
- int, int, int, int, int, int, nck::ActivationFunction, unsigned int,
- unsigned int, unsigned int, unsigned int)>
- subconvfn // Function to create a new convolution
- )
- : _dilation_factor(dilation_factor), _n_input_rows(n_input_rows),
- _n_input_cols(n_input_cols), _n_channels(n_channels),
- _padding_top(static_cast<int>(padding_top)),
- _padding_left(static_cast<int>(padding_left)),
- _n_output_rows(n_output_rows), _n_output_cols(n_output_cols),
- _convs(_dilation_factor) {
- // Instantiate the base convolutions
- for (uint32_t i = 0; i < static_cast<uint32_t>(_dilation_factor); i++) {
- // Compute properties of this row of base convolutions
- const int row_top =
- i * StrideRows - _padding_top; // -ve values are in the padding
- const int row_pad_top =
- row_top < 0 ? iceildiv(-row_top, dilation_factor) : 0;
-
- const int _n_input_rows = iceildiv(n_input_rows - i, dilation_factor);
- const int _n_output_rows = iceildiv(n_output_rows - i, dilation_factor);
-
- for (uint32_t j = 0; j < static_cast<uint32_t>(_dilation_factor); j++) {
- // Compute properties of the base convolution
- const int col_left =
- j * StrideColumns - padding_left; // -ve values are in the padding
- const int col_pad_left =
- col_left < 0 ? iceildiv(-col_left, dilation_factor) : 0;
-
- const int _n_input_cols = iceildiv(n_input_cols - j, dilation_factor);
- const int _n_output_cols = iceildiv(n_output_cols - j, dilation_factor);
-
- // Create new depthwise convolution engine and include it in the vector
- // of engines. The new depthwise convolution engine is created by calling
- // the delegate function we received as an argument.
- _convs[i].emplace_back(subconvfn(
- n_batches, _n_input_rows, _n_input_cols, n_channels, _n_output_rows,
- _n_output_cols, activation,
- // Note: since we have computed the output tensor size we don't need
- // to explicitly provide bottom and right padding values to the
- // depthwise convolution.
- row_pad_top, col_pad_left, 0, 0));
- }
- }
-}
-
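[The constructor above realises a standard decomposition: a depthwise convolution with dilation factor d is equivalent to d*d independent, undilated convolutions, each acting on one interleaved sub-tensor (every d-th row and column, at a distinct (i, j) phase). A small sketch of the sub-tensor sizing, mirroring the iceildiv arithmetic used above:]

    #include <cstdio>

    int iceildiv(int a, int b) { return (a + b - 1) / b; }

    int main()
    {
      const int d = 2, n_input_rows = 7, n_input_cols = 9;
      for (int i = 0; i < d; i++)
      {
        for (int j = 0; j < d; j++)
        {
          // Sub-convolution (i, j) sees rows i, i+d, i+2d, ... and
          // columns j, j+d, j+2d, ... of the original input.
          std::printf("phase (%d,%d): %d x %d input\n", i, j,
                      iceildiv(n_input_rows - i, d),
                      iceildiv(n_input_cols - j, d));
        }
      }
      return 0;
    }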
-MEMBERFN(void)::set_input(const void *const inptr) {
- set_input(inptr, _n_channels);
-}
-
-MEMBERFN(void)::set_input(const void *const inptr, const int ldcol) {
- set_input(inptr, _n_input_cols * ldcol, ldcol);
-}
-
-MEMBERFN(void)
-::set_input(const void *const inptr, const int ldrow, const int ldcol) {
- set_input(inptr, _n_input_rows * ldrow, ldrow, ldcol);
-}
-
-MEMBERFN(void)
-::set_input(const void *const inptr, const int ldbatch, const int ldrow,
- const int ldcol) {
- // Compute dilated strides
- const int ldrow_dilated = ldrow * _dilation_factor;
- const int ldcol_dilated = ldcol * _dilation_factor;
-
- // Pass input parameters on to base convolutions
- for (uint32_t i = 0; i < static_cast<uint32_t>(_dilation_factor); i++) {
- const int top_pos =
- i * StrideRows - _padding_top +
- ((static_cast<int>(i * StrideRows) < _padding_top)
- ? iceildiv(_padding_top - i * StrideRows, _dilation_factor) *
- _dilation_factor
- : 0);
- const TIn *const inptr_i =
- static_cast<const TIn *>(inptr) + top_pos * ldrow;
-
- for (uint32_t j = 0; j < static_cast<uint32_t>(_dilation_factor); j++) {
- int left_pos = j * StrideColumns - _padding_left;
- while (left_pos < 0)
- left_pos += _dilation_factor;
-
- // Modify the pointer to point to the first element of the dilated input
- // tensor, then set the input for this convolution engine.
- const void *const inptr_ij = inptr_i + left_pos * ldcol;
- _convs[i][j]->set_input(inptr_ij, ldbatch, ldrow_dilated, ldcol_dilated);
- }
- }
-}
-
-MEMBERFN(void)::set_output(void *const outptr) {
- set_output(outptr, _n_channels);
-}
-
-MEMBERFN(void)::set_output(void *const outptr, const int ldcol) {
- set_output(outptr, _n_output_cols * ldcol, ldcol);
-}
-
-MEMBERFN(void)
-::set_output(void *const outptr, const int ldrow, const int ldcol) {
- set_output(outptr, _n_output_rows * ldrow, ldrow, ldcol);
-}
-
-MEMBERFN(void)
-::set_output(void *const outptr, const int ldbatch, const int ldrow,
- const int ldcol) {
- // Compute dilated strides
- const int ldrow_dilated = ldrow * _dilation_factor;
- const int ldcol_dilated = ldcol * _dilation_factor;
-
- // Pass input parameters on to base convolutions
- for (uint32_t i = 0; i < static_cast<uint32_t>(_dilation_factor); i++) {
- for (uint32_t j = 0; j < static_cast<uint32_t>(_dilation_factor); j++) {
-      // Modify the pointer to point to the first element of the dilated output
-      // tensor, then set the output for this convolution engine.
- void *const outptr_ij =
- static_cast<TOut *>(outptr) + i * ldrow + j * ldcol;
- _convs[i][j]->set_output(outptr_ij, ldbatch, ldrow_dilated,
- ldcol_dilated);
- }
- }
-}
-
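[The overload above places sub-convolution (i, j)'s output at offset i*ldrow + j*ldcol and then strides by d*ldrow and d*ldcol, so the d*d engines write an interleaved result directly, with no gather step afterwards. A pointer-arithmetic sketch, assuming d = 2 and illustrative element strides:]

    #include <cstdio>

    int main()
    {
      const int d = 2, ldrow = 8, ldcol = 1;  // illustrative strides (elements)
      float out[64] = {};

      for (int i = 0; i < d; i++)
      {
        for (int j = 0; j < d; j++)
        {
          float *base = out + i * ldrow + j * ldcol;  // phase origin
          // Engine (i, j) then uses row stride d*ldrow and column stride
          // d*ldcol, writing out[(i + r*d)*8 + (j + c*d)] for output (r, c).
          base[0] = 1.0f + i * d + j;                 // mark the origin element
        }
      }
      std::printf("origins: %g %g %g %g\n", out[0], out[1], out[8], out[9]);
      return 0;
    }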
-MEMBERFN(int)
-::get_output_size(const int dim_size, const unsigned int padding_before,
- const unsigned int padding_after, const int dilation_factor) {
- const int input_size =
- dim_size + static_cast<int>(padding_before + padding_after);
- const int window_size = (KernelRows - 1) * dilation_factor + 1;
- return iceildiv(input_size - window_size + 1, StrideRows);
-}
-
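[get_output_size above computes output = iceildiv(input + pad_before + pad_after - window + 1, stride) with an effective window of (K - 1)*d + 1. Note it uses KernelRows and StrideRows for whichever dimension it is asked about, which appears safe only because these engines are instantiated with square kernels and equal row/column strides. A worked instance, assuming K = 3, d = 2, stride 1:]

    #include <cstdio>

    int iceildiv(int a, int b) { return (a + b - 1) / b; }

    int main()
    {
      const int K = 3, d = 2, stride = 1;
      const int dim = 14, pad_before = 2, pad_after = 2;
      const int window = (K - 1) * d + 1;                              // 5
      const int out = iceildiv(dim + pad_before + pad_after - window + 1, stride);
      std::printf("output size = %d\n", out);                          // 14
      return 0;
    }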
-MEMBERFN(int)
-::output_size(const int dim_size, const unsigned int padding_before,
- const unsigned int padding_after) const {
- return get_output_size(dim_size, padding_before, padding_after,
- _dilation_factor);
-}
-
-MEMBERFN(size_t)::get_packed_params_size(void) const {
- return _convs[0][0]->get_packed_params_size();
-}
-
-MEMBERFN(void)::set_packed_params_buffer(void *buffer) {
- // Set the buffer for all convolution engines
- for (auto &&row : _convs) {
- for (auto &&conv : row) {
- conv->set_packed_params_buffer(buffer);
- }
- }
-}
-
-MEMBERFN(void)
-::pack_params(const void *const weights, const void *const biases) const {
- _convs[0][0]->pack_params(weights, biases);
-}
-
-MEMBERFN(void)
-::pack_params(void *const buffer, const void *const weights,
- const void *const biases) const {
- _convs[0][0]->pack_params(buffer, weights, biases);
-}
-
-MEMBERFN(void)
-::pack_params(void *const buffer, const void *const weights,
- const unsigned int ldrow, const unsigned int ldcol,
- const void *const biases) const {
- _convs[0][0]->pack_params(buffer, weights, ldrow, ldcol, biases);
-}
-
-MEMBERFN(size_t)::get_working_space_size(unsigned int nthreads) const {
- return _convs[0][0]->get_working_space_size(nthreads);
-}
-
-MEMBERFN(void)::set_working_space(void *const ws) {
- // Use the same working space set for all contained depthwise engines.
- for (auto &&row : _convs) {
- for (auto &&conv : row) {
- conv->set_working_space(ws);
- }
- }
-}
-
-MEMBERFN(unsigned int)::get_window(void) const {
- return _convs[0][0]->get_window();
-}
-
-MEMBERFN(void)
-::run(const unsigned int start, const unsigned int stop,
- const unsigned int threadid) {
- // Run each contained convolution in turn
- for (auto &&row : _convs) {
- for (auto &&conv : row) {
- conv->run(start, stop, threadid);
- }
- }
-}
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
deleted file mode 100644
index a00a1ef04a..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
+++ /dev/null
@@ -1,439 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- * NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include "arm.hpp"
-#include "impl_base.hpp"
-
-#pragma once
-
-using namespace neon_convolution_kernels;
-
-namespace depthwise
-{
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float16_t, float16_t, float16_t
->::DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base(
- n_batches, n_input_rows, n_input_cols, n_channels, activation,
- padding_top, padding_left, padding_bottom, padding_right
- )
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float16_t, float16_t, float16_t
->::DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base(
- n_batches, n_input_rows, n_input_cols, n_channels,
- n_output_rows, n_output_cols, activation,
- padding_top, padding_left, padding_bottom, padding_right
- )
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float16_t, float16_t, float16_t
->::execute_tile(
- int n_channels,
- const void *weights_biases_ptr,
- const float16_t *input,
- const unsigned int in_row_stride,
- const unsigned int in_col_stride,
- float16_t *output,
- const unsigned int out_row_stride,
- const unsigned int out_col_stride
-)
-{
- // Instantiate pointers
- const float16_t* __restrict__ inptr_base = input;
- float16_t* __restrict__ outptr_base = output;
- const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
-
- // Perform the depthwise convolution
- int channels_remaining = n_channels;
- for (; channels_remaining >= 8; channels_remaining -= 8)
- {
- // Load input tile
- float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- const float16_t* const inptr_row = inptr_base + i*in_row_stride;
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = vld1q_f16(inptr_row + j*in_col_stride);
- }
- }
- inptr_base += 8;
-
- // Load weights tile
- float16x8_t vbias = vld1q_f16(params);
- params += 8;
-
- float16x8_t w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = vld1q_f16(params);
- params += 8;
- }
- }
-
- // Perform the convolution
- float16x8_t v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- v[out_i][out_j] = vbias;
-
- // Base co-ordinate
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Fill the accumulator
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const unsigned int j = base_j + in_j;
-
- // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
- }
- }
-
- // Apply the activation function
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
- }
- }
- }
-
- // Store the output tile
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- float16_t* const outptr_row = outptr_base + i*out_row_stride;
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- vst1q_f16(outptr_row + j*out_col_stride, v[i][j]);
- }
- }
- outptr_base += 8;
- }
- for (; channels_remaining; channels_remaining--)
- {
- // Load input tile
- float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- const float16_t* const inptr_row = inptr_base + i*in_row_stride;
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = *(inptr_row + j*in_col_stride);
- }
- }
- inptr_base++;
-
- // Load weights tile
- float16_t bias = *(params++);
- float16_t w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = *(params++);
- }
- }
-
- // Perform the convolution
- float16_t v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- // Clear the accumulator
- v[out_i][out_j] = bias;
-
- // Base co-ordinate
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Fill the accumulator
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const int j = base_j + in_j;
- v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- }
- }
-
- // Apply the activation function
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
- }
- }
- }
-
- // Store the output tile
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- float16_t* const outptr_row = outptr_base + i*out_row_stride;
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- *(outptr_row + j*out_col_stride) = v[i][j];
- }
- }
- outptr_base++;
- }
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float16_t, float16_t, float16_t
->::execute_tile(
- int n_channels,
- const void *weights_biases_ptr,
- const float16_t * inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- float16_t *outptrs[Base::output_tile_rows][Base::output_tile_cols]
-)
-{
- // Instantiate pointers
- const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
- int n = 0;
-
- // Perform the depthwise convolution
- int channels_remaining = n_channels;
- for (; channels_remaining >= 8; channels_remaining -= 8, n += 8)
- {
- // Load input tile
- float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = vld1q_f16(inptrs[i][j] + n);
- }
- }
-
- // Load weights tile
- float16x8_t vbias = vld1q_f16(params);
- params += 8;
-
- float16x8_t w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = vld1q_f16(params);
- params += 8;
- }
- }
-
- // Perform the convolution
- float16x8_t v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- v[out_i][out_j] = vbias;
-
- // Base co-ordinate
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Fill the accumulator
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const unsigned int j = base_j + in_j;
-
- // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
- }
- }
-
- // Apply the activation function
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
- }
- }
- }
-
- // Store the output tile
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- vst1q_f16(outptrs[i][j] + n, v[i][j]);
- }
- }
- }
- for (; channels_remaining; channels_remaining--, n++)
- {
- // Load input tile
- float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = *(inptrs[i][j] + n);
- }
- }
-
- // Load weights tile
- float16_t bias = *(params++);
- float16_t w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = *(params++);
- }
- }
-
- // Perform the convolution
- float16_t v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- // Clear the accumulator
- v[out_i][out_j] = bias;
-
- // Base co-ordinate
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Fill the accumulator
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const int j = base_j + in_j;
- v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- }
- }
-
- // Apply the activation function
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
- }
- }
- }
-
- // Store the output tile
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- *(outptrs[i][j] + n) = v[i][j];
- }
- }
- }
-}
-
-} // namespace depthwise
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
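[A pattern worth noting in both execute_tile overloads above: the activation is a template parameter, so the `if (Activation == ...)` tests are resolved at compile time and each instantiation carries only the clamps it needs, while process_tile's runtime switch selects the instantiation. A reduced sketch of the same dispatch:]

    #include <algorithm>
    #include <cstdio>

    enum class Act { None, ReLU, ReLU6 };

    template <Act A>
    float apply(float v)
    {
      // Dead branches are eliminated per instantiation; no runtime flag is read.
      if (A == Act::ReLU || A == Act::ReLU6) { v = std::max(0.0f, v); }
      if (A == Act::ReLU6)                   { v = std::min(6.0f, v); }
      return v;
    }

    float dispatch(Act a, float v)  // runtime switch, as in process_tile
    {
      switch (a)
      {
        case Act::ReLU:  return apply<Act::ReLU>(v);
        case Act::ReLU6: return apply<Act::ReLU6>(v);
        default:         return apply<Act::None>(v);
      }
    }

    int main()
    {
      std::printf("%g %g\n", dispatch(Act::ReLU, -3.0f), dispatch(Act::ReLU6, 9.0f));
      return 0;
    }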
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
deleted file mode 100644
index b0d8126a40..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ /dev/null
@@ -1,438 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- * NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include "arm.hpp"
-#include "impl_base.hpp"
-
-#pragma once
-
-using namespace neon_convolution_kernels;
-
-namespace depthwise
-{
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float, float, float
->::DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base(
- n_batches, n_input_rows, n_input_cols, n_channels, activation,
- padding_top, padding_left, padding_bottom, padding_right
- )
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float, float, float
->::DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base(
- n_batches, n_input_rows, n_input_cols, n_channels,
- n_output_rows, n_output_cols, activation,
- padding_top, padding_left, padding_bottom, padding_right
- )
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float, float, float
->::execute_tile(
- int n_channels,
- const void *weights_biases_ptr,
- const float *input,
- const unsigned int in_row_stride,
- const unsigned int in_col_stride,
- float *output,
- const unsigned int out_row_stride,
- const unsigned int out_col_stride
-)
-{
- // Instantiate pointers
- const float* __restrict__ inptr_base = input;
- float* __restrict__ outptr_base = output;
- const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);
-
- // Perform the depthwise convolution
- int channels_remaining = n_channels;
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Load input tile
- float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- const float* const inptr_row = inptr_base + i*in_row_stride;
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = vld1q_f32(inptr_row + j*in_col_stride);
- }
- }
- inptr_base += 4;
-
- // Load weights tile
- float32x4_t vbias = vld1q_f32(params);
- params += 4;
-
- float32x4_t w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = vld1q_f32(params);
- params += 4;
- }
- }
-
- // Perform the convolution
- float32x4_t v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- v[out_i][out_j] = vbias;
-
- // Base co-ordinate
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Fill the accumulator
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const unsigned int j = base_j + in_j;
-
- // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
- }
- }
-
- // Apply the activation function
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
- }
- }
- }
-
- // Store the output tile
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- float* const outptr_row = outptr_base + i*out_row_stride;
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- vst1q_f32(outptr_row + j*out_col_stride, v[i][j]);
- }
- }
- outptr_base += 4;
- }
- for (; channels_remaining; channels_remaining--)
- {
- // Load input tile
- float u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- const float* const inptr_row = inptr_base + i*in_row_stride;
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = *(inptr_row + j*in_col_stride);
- }
- }
- inptr_base++;
-
- // Load weights tile
- float bias = *(params++);
- float w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = *(params++);
- }
- }
-
- // Perform the convolution
- float v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- // Clear the accumulator
- v[out_i][out_j] = bias;
-
- // Base co-ordinate
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Fill the accumulator
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const int j = base_j + in_j;
- v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- }
- }
-
- // Apply the activation function
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
- }
- }
- }
-
- // Store the output tile
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- float* const outptr_row = outptr_base + i*out_row_stride;
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- *(outptr_row + j*out_col_stride) = v[i][j];
- }
- }
- outptr_base++;
- }
-}
-
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float, float, float
->::execute_tile(
- int n_channels,
- const void *weights_biases_ptr,
- const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
-)
-{
- const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);
-
- // Perform the depthwise convolution
- int channels_remaining = n_channels;
- int n = 0;
- for (; channels_remaining >= 4; channels_remaining -= 4, n += 4)
- {
- // Load input tile
- float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = vld1q_f32(inptrs[i][j] + n);
- }
- }
-
- // Load weights tile
- float32x4_t vbias = vld1q_f32(params);
- params += 4;
-
- float32x4_t w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = vld1q_f32(params);
- params += 4;
- }
- }
-
- // Perform the convolution
- float32x4_t v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- v[out_i][out_j] = vbias;
-
- // Base co-ordinate
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Fill the accumulator
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const unsigned int j = base_j + in_j;
-
- // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
- }
- }
-
- // Apply the activation function
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
- }
- }
- }
-
- // Store the output tile
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- vst1q_f32(outptrs[i][j] + n, v[i][j]);
- }
- }
- }
- for (; channels_remaining; channels_remaining--, n++)
- {
- // Load input tile
- float u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = *(inptrs[i][j] + n);
- }
- }
-
- // Load weights tile
- float bias = *(params++);
- float w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = *(params++);
- }
- }
-
- // Perform the convolution
- float v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- // Clear the accumulator
- v[out_i][out_j] = bias;
-
- // Base co-ordinate
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Fill the accumulator
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const int j = base_j + in_j;
- v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- }
- }
-
- // Apply the activation function
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
- }
- }
- }
-
- // Store the output tile
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- *(outptrs[i][j] + n) = v[i][j];
- }
- }
- }
-}
-
-} // namespace depthwise
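[The fp32 kernels above follow the canonical NEON loop structure: a main loop consuming four channels per iteration via vld1q_f32/vmlaq_f32, then a scalar tail for the remainder. A self-contained sketch of that loop shape for a single multiply-accumulate, not the full tile computation; compiles for targets with NEON available:]

    #include <arm_neon.h>
    #include <cstdio>

    // y[c] += w[c] * x[c] over n channels, vectorised four at a time.
    void madd_channels(float *y, const float *w, const float *x, int n)
    {
      int c = 0;
      for (; c + 4 <= n; c += 4)
      {
        float32x4_t acc = vld1q_f32(y + c);
        acc = vmlaq_f32(acc, vld1q_f32(w + c), vld1q_f32(x + c));
        vst1q_f32(y + c, acc);
      }
      for (; c < n; c++)  // scalar tail for the remaining channels
      {
        y[c] += w[c] * x[c];
      }
    }

    int main()
    {
      float y[6] = {}, w[6] = {1, 2, 3, 4, 5, 6}, x[6] = {1, 1, 1, 1, 1, 1};
      madd_channels(y, w, x, 6);
      std::printf("%g %g\n", y[3], y[5]);  // 4 6
      return 0;
    }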
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
deleted file mode 100644
index e8b4c7bc0f..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- * NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include <limits>
-
-#include "arm.hpp"
-#include "impl_base.hpp"
-#include "depthwise_quantized.hpp"
-
-namespace depthwise
-{
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- const ActivationFunction activation,
- const QAsymm8Params& weight_quantisation,
- const QAsymm8Params& input_quantisation,
- const QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : QAsymm8DepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels,
- activation, weight_quantisation, input_quantisation, output_quantisation,
- QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
- padding_top, padding_left, padding_bottom, padding_right
- )
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- const ActivationFunction activation,
- const QAsymm8Params& weight_quantisation,
- const QAsymm8Params& input_quantisation,
- const QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : QAsymm8DepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels,
- n_output_rows, n_output_cols,
- activation, weight_quantisation, input_quantisation, output_quantisation,
- QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
- padding_top, padding_left, padding_bottom, padding_right
- )
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- const ActivationFunction activation,
- const QAsymm8Params& weight_quantisation,
- const QAsymm8Params& input_quantisation,
- const QAsymm8Params& output_quantisation,
- const QAsymm8RescaleParams& rescale_params,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base(
- n_batches, n_input_rows, n_input_cols, n_channels, activation,
- padding_top, padding_left, padding_bottom, padding_right
- ),
- _weights_quant(weight_quantisation),
- _inputs_quant(input_quantisation),
- _output_quant(output_quantisation),
- rescale_parameters(rescale_params)
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- const ActivationFunction activation,
- const QAsymm8Params& weight_quantisation,
- const QAsymm8Params& input_quantisation,
- const QAsymm8Params& output_quantisation,
- const QAsymm8RescaleParams& rescale_params,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base(
- n_batches, n_input_rows, n_input_cols, n_channels,
- n_output_rows, n_output_cols, activation,
- padding_top, padding_left, padding_bottom, padding_right
- ),
- _weights_quant(weight_quantisation),
- _inputs_quant(input_quantisation),
- _output_quant(output_quantisation),
- rescale_parameters(rescale_params)
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-uint8_t QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_input_padding_value(void) const
-{
- return _inputs_quant.offset;
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-void QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_pack_params(
- void * const buffer,
- const void * const weights,
- const unsigned int weight_row_stride,
- const unsigned int weight_col_stride,
- const void * const biases
-) const
-{
- const uint8_t *wptr = static_cast<const uint8_t *>(weights);
- const int32_t *bptr = static_cast<const int32_t *>(biases);
- uint8_t *outptr = static_cast<uint8_t *>(buffer);
-
-  // We set the vector length to use 64-bit (doubleword) registers on both
-  // AArch64 and AArch32. NOTE: for SVE set this to half the vector length.
- unsigned int veclen = 8;
-
- // While there are channels left to process, pack a vector length of them at
- // a time and reduce the size of vector used as the size of the tensor
- // decreases.
- for (
- unsigned int n_channels = this->n_channels(); n_channels;
- n_channels -= veclen,
- outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
- )
- {
- // NOTE Ignore this section if using SVE, the vector length remains the
- // same and we just don't fill a full register for the tail.
- while (n_channels < veclen)
- {
- // Reduce the vector length to either 8 or 1 (scalar)
- // TODO Support more vector lengths in `execute_tile`.
- veclen = (veclen == 16) ? 8 : 1;
- }
-
- // Get pointers to bias and weight portions of the output structure.
- int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
- uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);
-
- // Copy a vector length of elements
- for (unsigned int n = 0; n < veclen && n < n_channels; n++)
- {
- const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
- out_bptr[n] = bias;
-
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
- row_outptr[j*veclen + n] = w;
- }
- }
- wptr++;
- }
- }
-}
-
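[The packed layout produced above groups channels by vector length: each group is veclen int32 biases followed by the kernel taps stored tap-major with veclen channel lanes interleaved ([bias x veclen][w(0,0) x veclen][w(0,1) x veclen]...), and the total size agrees with get_packed_params_size() in the base class. A sketch of the per-group offsets, assuming veclen = 8 and a 3x3 kernel:]

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
      const unsigned int veclen = 8, kernel_rows = 3, kernel_cols = 3;
      const size_t group_bytes =
          veclen * (sizeof(int32_t) + kernel_rows * kernel_cols);  // 104 bytes

      // Channel c lives in group c / veclen at lane c % veclen; weight (i, j)
      // for that lane sits after the biases at ((i*kernel_cols + j)*veclen + lane).
      const unsigned int c = 13, lane = c % veclen, group = c / veclen;
      const size_t w11 = group * group_bytes + veclen * sizeof(int32_t) +
                         ((1 * kernel_cols + 1) * veclen + lane);
      std::printf("weight (1,1) of channel %u at byte offset %zu\n", c, w11);
      return 0;
    }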
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename FInput, typename FOutput
->
-static inline void tilefn(
- int n_channels,
- const void* packed_params,
- FInput &get_input_ptr,
- FOutput &get_output_ptr,
- const int32_t clamp_max,
- const int32_t clamp_min,
- const uint8_t input_offset,
- const uint8_t weight_offset,
- const uint8_t output_offset,
- const int32_t requant_multiplier,
- const int32_t requant_shift
-)
-{
- constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
- constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
-
- // Offset into channels
- int channel = 0;
-
- // Byte type pointer to weights and biases
- const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
-
- for (; n_channels >= 8; n_channels -= 8, channel += 8)
- {
- const int32x4_t biases[2] = {
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
- };
- wbptr += 8*sizeof(int32_t);
-
- int16x8_t weights[KernelRows][KernelCols];
- const uint8x8_t woffset = vdup_n_u8(weight_offset);
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- const uint8x8_t w = vld1_u8(wbptr);
- weights[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(w, woffset));
- wbptr += 8;
- }
- }
-
- int16x8_t inputs[InnerTileRows][InnerTileCols];
- const uint8x8_t ioffset = vdup_n_u8(input_offset);
- for (unsigned int i = 0; i < InnerTileRows; i++)
- {
- for (unsigned int j = 0; j < InnerTileCols; j++)
- {
- const auto x = vld1_u8(get_input_ptr(i, j, channel));
- inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
- }
- }
-
- for (unsigned int oi = 0; oi < OutputTileRows; oi++)
- {
- for (unsigned int oj = 0; oj < OutputTileCols; oj++)
- {
- int32x4_t acc_a = biases[0], acc_b = biases[1];
-
- for (unsigned int wi = 0; wi < KernelRows; wi++)
- {
- for (unsigned int wj = 0; wj < KernelCols; wj++)
- {
- const auto w = weights[wi][wj];
- const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
-#ifndef __aarch64__
- acc_a = vmlal_s16(acc_a, vget_low_s16(w), vget_low_s16(x));
- acc_b = vmlal_s16(acc_b, vget_high_s16(w), vget_high_s16(x));
-#else
- asm("smlal %[acc_a].4s, %[w].4h, %[x].4h\n"
- "smlal2 %[acc_b].4s, %[w].8h, %[x].8h\n"
- : [acc_a] "+w"(acc_a), [acc_b] "+w"(acc_b)
- : [w] "w"(w), [x] "w"(x));
-#endif // __aarch64__
- }
- }
-
- int32x4_t final_accs[2];
- for (unsigned int i = 0; i < 2; i++)
- {
- const int32x4_t y = rounding_divide_by_exp2(
- saturating_doubling_high_mul((i == 0 ? acc_a : acc_b), requant_multiplier),
- requant_shift);
- const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
- final_accs[i] = vaddq_s32(y, offset);
- final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
- final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
- }
-
-#ifndef __aarch64__
- const int16x8x2_t zelems = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
- vreinterpretq_s16_s32(final_accs[1]));
- const int8x16_t elems = vreinterpretq_s8_s16(zelems.val[0]);
-
- const int8x16x2_t zoutput = vuzpq_s8(elems, elems);
- const uint8x8_t output =
- vget_low_u8(vreinterpretq_u8_s8(zoutput.val[0]));
- vst1_u8(get_output_ptr(oi, oj, channel), output);
-#else
- const int8x16_t elems = vreinterpretq_s8_s16(
- vuzp1q_s16(vreinterpretq_s16_s32(final_accs[0]),
- vreinterpretq_s16_s32(final_accs[1])));
- const uint8x8_t output =
- vget_low_u8(vreinterpretq_u8_s8(vuzp1q_s8(elems, elems)));
- vst1_u8(get_output_ptr(oi, oj, channel), output);
-#endif // __aarch64__
- }
- }
- }
- for (; n_channels; n_channels--, channel++)
- {
- // Load bias
- const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
- wbptr += sizeof(int32_t);
-
- // Load weights
- int16_t weights[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- weights[i][j] = *(wbptr++) - weight_offset;
- }
- }
-
- // Load the input activations
- int16_t inputs[InnerTileRows][InnerTileCols];
- for (unsigned int i = 0; i < InnerTileRows; i++)
- {
- for (unsigned int j = 0; j < InnerTileCols; j++)
- {
- inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
- }
- }
-
- // Perform the convolution
- for (unsigned int oi = 0; oi < OutputTileRows; oi++)
- {
- for (unsigned int oj = 0; oj < OutputTileCols; oj++)
- {
- int32_t acc = bias;
-
- for (unsigned int wi = 0; wi < KernelRows; wi++)
- {
- for (unsigned int wj = 0; wj < KernelCols; wj++)
- {
- const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
- acc += w * x;
- }
- }
-
- // Requantize
- acc = rounding_divide_by_exp2(
- saturating_doubling_high_mul(acc, requant_multiplier),
- requant_shift);
- acc += output_offset;
- acc = std::max(acc, clamp_min);
- acc = std::min(acc, clamp_max);
- uint8_t output = static_cast<uint8_t>(acc);
- *(get_output_ptr(oi, oj, channel)) = output;
- }
- }
- }
-}
-
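-// For reference, the requantisation used in both loops above follows the
-// gemmlowp-style fixed-point scheme. A minimal scalar sketch of the two
-// helpers, assuming SQRDMULH / rounding-right-shift semantics (illustrative
-// only, not the library's definitions):
-//
-//   int32_t saturating_doubling_high_mul(int32_t a, int32_t b)
-//   {
-//     if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // saturate
-//     const int64_t ab    = static_cast<int64_t>(a) * b;
-//     const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
-//     return static_cast<int32_t>((ab + nudge) >> 31);         // SQRDMULH
-//   }
-//
-//   int32_t rounding_divide_by_exp2(int32_t x, int s)
-//   {
-//     const int32_t mask      = (1 << s) - 1;
-//     const int32_t remainder = x & mask;
-//     const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
-//     return (x >> s) + (remainder > threshold ? 1 : 0);
-//   }
-//
-// Each accumulator then becomes
-//   clamp(rounding_divide_by_exp2(saturating_doubling_high_mul(acc, multiplier),
-//                                 shift) + output_offset, clamp_min, clamp_max).
-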
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename FInput, typename FOutput
->
-static inline void execute_tilefn(
- int n_channels,
- const void* packed_params,
- const nck::ActivationFunction actfn,
- FInput &get_input_ptr,
- FOutput &get_output_ptr,
- const QAsymm8Params &input_quant,
- const QAsymm8Params &weight_quant,
- const QAsymm8Params &output_quant,
- const QAsymm8RescaleParams &requant
-) {
- // Compute min/max clamp values
- int32_t clamp_min = std::numeric_limits<uint8_t>::min();
- int32_t clamp_max = std::numeric_limits<uint8_t>::max();
-
- if (actfn == nck::ActivationFunction::ReLU ||
- actfn == nck::ActivationFunction::ReLU6) {
- const int32_t bottom_rail = output_quant.offset;
- clamp_min = std::max(clamp_min, bottom_rail);
- }
-
- if (actfn == nck::ActivationFunction::ReLU6) {
- const int32_t top_rail = output_quant.quantize(6.0f);
- clamp_max = std::min(clamp_max, top_rail);
- }
-
- // Call the tile execution method
- tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
- StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr,
- clamp_max, clamp_min, input_quant.offset,
- weight_quant.offset, output_quant.offset,
- requant.multiplier, requant.shift);
-}
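-
-// A sketch of how the activation rails above land in the quantised domain,
-// assuming the affine scheme real = scale * (q - offset) behind
-// QAsymm8Params (quantize() here is shorthand, not the exact library call):
-//
-//   q = static_cast<int32_t>(std::lround(real / scale)) + offset;
-//
-//   ReLU  : real 0.0f maps to q = offset, hence clamp_min = output_quant.offset
-//   ReLU6 : real 6.0f maps to q = output_quant.quantize(6.0f) = clamp_max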
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <nck::ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- uint8_t* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
-) {
- // Construct methods to get pointers
- const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
- const int i, const int j, const int channel) {
- return inptr + i * in_row_stride + j * in_col_stride + channel;
- };
-
- const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
- const int i, const int j, const int channel) {
- return outptr + i * out_row_stride + j * out_col_stride + channel;
- };
-
- execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
- StrideRows, StrideCols>(
- n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
- _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <nck::ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-) {
- // Construct methods to get pointers
- const auto get_input_ptr = [inptrs](const int i, const int j,
- const int channel) {
- return inptrs[i][j] + channel;
- };
-
- const auto get_output_ptr = [outptrs](const int i, const int j,
- const int channel) {
- return outptrs[i][j] + channel;
- };
-
- // Call the tile execution method
- execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
- StrideRows, StrideCols>(
- n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
- _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
-}
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
deleted file mode 100644
index 68e20d98a9..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
+++ /dev/null
@@ -1,457 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- * NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include <limits>
-
-#include "arm.hpp"
-#include "impl_base.hpp"
-#include "depthwise_quantized.hpp"
-
-#pragma once
-
-namespace {
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename FInput, typename FOutput
->
-static inline void tilefn_hybrid(
- int n_channels,
- const void* packed_params,
- FInput &get_input_ptr,
- FOutput &get_output_ptr,
- int32_t clamp_min,
- int32_t clamp_max,
- uint8_t input_offset,
- uint8_t output_offset
-)
-{
- constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
- constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
-
- // Offset into channels
- int channel = 0;
-
- // Byte type pointer to weights and biases
- const int8_t *wbptr = static_cast<const int8_t *>(packed_params);
-
- for (; n_channels >= 8; n_channels -= 8, channel += 8)
- {
- const int32x4_t biases[2] = {
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
- };
- const int32x4_t multipliers[2] = {
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12),
- };
- const int32x4_t shifts[2] = {
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 16),
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 20),
- };
- wbptr += 24*sizeof(int32_t);
-
- int16x8_t weights[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- const auto w = vld1_s8(wbptr);
- weights[i][j] = reinterpret_cast<int16x8_t>(vmovl_s8(w));
- wbptr += 8;
- }
- }
-
- int16x8_t inputs[InnerTileRows][InnerTileCols];
- const uint8x8_t ioffset = vdup_n_u8(input_offset);
- for (unsigned int i = 0; i < InnerTileRows; i++)
- {
- for (unsigned int j = 0; j < InnerTileCols; j++)
- {
- const auto x = vld1_u8(get_input_ptr(i, j, channel));
- inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
- }
- }
-
- for (unsigned int oi = 0; oi < OutputTileRows; oi++)
- {
- for (unsigned int oj = 0; oj < OutputTileCols; oj++)
- {
- int32x4_t accs[2];
- for (unsigned int i = 0; i < 2; i++)
- {
- accs[i] = biases[i];
- }
-
- for (unsigned int wi = 0; wi < KernelRows; wi++)
- {
- for (unsigned int wj = 0; wj < KernelCols; wj++)
- {
- const auto w = weights[wi][wj];
- const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
- accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x));
- accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x));
- }
- }
-
- int32x4_t final_accs[2];
- for (unsigned int i = 0; i < 2; i++)
- {
- const int32x4_t y = rounding_divide_by_exp2(
- saturating_doubling_high_mul(accs[i], multipliers[i]),
- shifts[i]);
- const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
- final_accs[i] = vaddq_s32(y, offset);
- final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
- final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
- }
-
- const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
- vreinterpretq_s16_s32(final_accs[1]));
- const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]);
- const uint8x8_t output =
- vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0]));
-
- vst1_u8(get_output_ptr(oi, oj, channel), output);
- }
- }
- }
-
- for (; n_channels; n_channels--, channel++)
- {
- // Load bias
- const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
- const int32_t multiplier = *reinterpret_cast<const int32_t *>(wbptr + sizeof(int32_t));
- const int32_t shift = *reinterpret_cast<const int32_t *>(wbptr + 2*sizeof(int32_t));
-
- wbptr += 3*sizeof(int32_t);
-
- // Load weights
- int16_t weights[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- weights[i][j] = *(wbptr++);
- }
- }
-
- // Load the input activations
- int16_t inputs[InnerTileRows][InnerTileCols];
- for (unsigned int i = 0; i < InnerTileRows; i++)
- {
- for (unsigned int j = 0; j < InnerTileCols; j++)
- {
- inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
- }
- }
-
- // Perform the convolution
- for (unsigned int oi = 0; oi < OutputTileRows; oi++)
- {
- for (unsigned int oj = 0; oj < OutputTileCols; oj++)
- {
- int32_t acc = bias;
-
- for (unsigned int wi = 0; wi < KernelRows; wi++)
- {
- for (unsigned int wj = 0; wj < KernelCols; wj++)
- {
- const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
- acc += w * x;
- }
- }
-
- // Requantize
- acc = rounding_divide_by_exp2(
- saturating_doubling_high_mul(acc, multiplier),
- -shift);
- acc += output_offset;
- acc = std::max(acc, clamp_min);
- acc = std::min(acc, clamp_max);
- uint8_t output = static_cast<uint8_t>(acc);
- *(get_output_ptr(oi, oj, channel)) = output;
- }
- }
- }
-}
-
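-// Unlike the uniform QAsymm8 path, the hybrid kernel above draws a
-// per-channel (multiplier, shift) pair from the packed parameter stream, so
-// conceptually (a sketch, with c the channel index):
-//
-//   q_out[c] = clamp(rounding_divide_by_exp2(
-//                        saturating_doubling_high_mul(acc[c], multiplier[c]),
-//                        shift[c]) + output_offset,
-//                    clamp_min, clamp_max);
-//
-// The packing code below stores the shifts pre-negated so the vector path
-// can use them directly.
-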
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename FInput, typename FOutput
->
-static inline void execute_tilefn_hybrid(
- int n_channels,
- const void* packed_params,
- const ActivationFunction actfn,
- const qasymm8::QAsymm8Params &input_quant,
- const qasymm8::QAsymm8Params &output_quant,
- FInput &get_input_ptr,
- FOutput &get_output_ptr) {
-
- // Compute min/max clamp values
- int32_t clamp_min = std::numeric_limits<uint8_t>::min();
- int32_t clamp_max = std::numeric_limits<uint8_t>::max();
-
- if (actfn == ActivationFunction::ReLU) {
- clamp_min = output_quant.offset;
- }
-
- // For ReLU6, additionally clamp the top rail to the quantised value of 6.0f
- if (actfn == ActivationFunction::ReLU6) {
- const int32_t top_rail = output_quant.quantize(6.0f);
- clamp_max = std::min(clamp_max, top_rail);
- }
-
- // Call the tile execution method
- tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
- StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr, clamp_min, clamp_max, input_quant.offset, output_quant.offset);
-}
-} // unnamed namespace
-
-
-
-namespace depthwise {
-using namespace qsymm8;
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QSymm8HybridPerChannelDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- const ActivationFunction activation,
- const QSymm8PerChannelParams& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : QSymm8HybridPerChannelDepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels,
- activation, weight_quantisation, input_quantisation, output_quantisation,
- QSymm8PerChannelRescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
- padding_top, padding_left, padding_bottom, padding_right
- )
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QSymm8HybridPerChannelDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- const ActivationFunction activation,
- const QSymm8PerChannelParams& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- const QSymm8PerChannelRescaleParams& rescale_params,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base(
- n_batches, n_input_rows, n_input_cols, n_channels, activation,
- padding_top, padding_left, padding_bottom, padding_right
- ),
- _weights_quant(weight_quantisation),
- _input_quant(input_quantisation),
- _output_quant(output_quantisation),
- _rescale_parameters(rescale_params)
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-uint8_t QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_input_padding_value(void) const
-{
- return _input_quant.offset;
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-void QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_pack_params(
- void * const buffer,
- const void * const weights,
- const unsigned int weight_row_stride,
- const unsigned int weight_col_stride,
- const void * const biases
-) const
-{
- const int8_t *wptr = static_cast<const int8_t *>(weights);
- const int32_t *bptr = static_cast<const int32_t *>(biases);
- const int32_t *mptr = static_cast<const int32_t *>(_rescale_parameters.multipliers.data());
- const int32_t *sptr = static_cast<const int32_t *>(_rescale_parameters.shifts.data());
- int8_t *outptr = static_cast<int8_t *>(buffer);
-
- // We set the vector length to use 64-bit (doubleword) registers on both
- // AArch64 and AArch32. NOTE For SVE this should be half the vector length.
- unsigned int veclen = 8;
-
- // While there are channels left to process, pack a vector length of them
- // at a time, reducing the vector length used as the number of remaining
- // channels shrinks.
- for (
- unsigned int n_channels = this->n_channels(); n_channels;
- n_channels -= veclen,
- outptr += veclen*(3*sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
- )
- {
- // NOTE Ignore this section if using SVE, the vector length remains the
- // same and we just don't fill a full register for the tail.
- while (n_channels < veclen)
- {
- // Reduce the vector length (veclen starts at 8 here, so this falls
- // straight through to 1, i.e. scalar)
- // TODO Support more vector lengths in `execute_tile`.
- veclen = (veclen == 16) ? 8 : 1;
- }
-
- // Get pointers to bias and weight portions of the output structure.
- int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
- int32_t *out_mptr = reinterpret_cast<int32_t *>(outptr + veclen*sizeof(int32_t));
- int32_t *out_sptr = reinterpret_cast<int32_t *>(outptr + 2*veclen*sizeof(int32_t));
- int8_t *out_wptr = outptr + 3*veclen*sizeof(int32_t);
-
- // Copy a vector length of elements
- for (unsigned int n = 0; n < veclen && n < n_channels; n++)
- {
- const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
- const int32_t multiplier = (mptr != nullptr) ? *(mptr++) : 0;
- const int32_t shift = (sptr != nullptr) ? *(sptr++) : 0;
-
- out_bptr[n] = bias;
- out_mptr[n] = multiplier;
- out_sptr[n] = -shift;
-
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- int8_t *row_outptr = out_wptr + i*KernelCols*veclen;
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- int8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
- row_outptr[j*veclen + n] = w;
- }
- }
- wptr++;
- }
- }
-}
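-
-// Resulting packed layout per group of `veclen` channels, interleaved so the
-// kernel can issue whole-vector loads (offsets match those used in
-// tilefn_hybrid above):
-//
-//   int32_t bias[veclen];        // out_bptr
-//   int32_t multiplier[veclen];  // out_mptr
-//   int32_t shift[veclen];       // out_sptr, stored negated
-//   int8_t  weights[KernelRows][KernelCols][veclen];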
-
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- uint8_t* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
-) {
-
- // Construct methods to get pointers
- const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
- const int i, const int j, const int channel) {
- return inptr + i * in_row_stride + j * in_col_stride + channel;
- };
-
- const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
- const int i, const int j, const int channel) {
- return outptr + i * out_row_stride + j * out_col_stride + channel;
- };
-
- execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
- StrideRows, StrideCols>(
- n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr);
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-) {
- // Construct methods to get pointers
- const auto get_input_ptr = [inptrs](const int i, const int j,
- const int channel) {
- return inptrs[i][j] + channel;
- };
-
- const auto get_output_ptr = [outptrs](const int i, const int j,
- const int channel) {
- return outptrs[i][j] + channel;
- };
-
- // Call the tile execution method
- execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
- StrideRows, StrideCols>(
- n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr);
-}
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transform.hpp b/src/core/NEON/kernels/convolution/winograd/input_transform.hpp
new file mode 100644
index 0000000000..265551288d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/input_transform.hpp
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "winograd.hpp"
+
+#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
+#include <algorithm>
+#include <cstring>
+#include <functional>
+
+namespace arm_conv {
+namespace winograd {
+namespace input_transform {
+
+namespace {
+
+template <typename T>
+constexpr T iceildiv(const T a, const T b)
+{
+ return (a + b - 1) / b;
+}
+
+} // unnamed namespace
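+
+// Quick sanity example of the ceiling division used for tile counts below:
+// covering 57 outputs with tiles of 4 needs iceildiv(57u, 4u) == 15u tiles,
+// the last of which is partial.
+static_assert(iceildiv(57u, 4u) == 15u, "iceildiv rounds up");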
+
+/* Driver class for the Winograd input transforms.
+ *
+ * This provides a base implementation which handles iteration over the input
+ * tensor; subclasses are responsible for managing working space and executing
+ * the transform on individual tiles.
+ */
+template <typename TIn, typename TOut=TIn>
+class TransformBase : public ITransform
+{
+ const std::string m_name;
+ const unsigned int m_input_rows, m_input_cols;
+
+ protected:
+ virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const
+ {
+ return 0;
+ }
+
+ virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const
+ {
+ // Nothing to do
+ }
+
+ virtual void execute_tile(
+ unsigned int n_channels,
+ const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
+ TOut *outptr, size_t ld_out_matrix,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols,
+ void *working_space
+ ) const = 0;
+
+ void execute_internal(
+ const ConvolutionArgs &args,
+ const TIn *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
+ TOut *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
+ void *working_space, unsigned int thread_id, unsigned int n_threads
+ ) const
+ {
+ // Get the working space for this thread, and initialise it.
+ working_space = reinterpret_cast<char *>(working_space) +
+ this->get_working_space_per_thread(args) * thread_id;
+ this->initialise_thread_working_space(args, working_space);
+
+ // Get tile traversal parameters
+ const auto tile_stride_rows = std::max(1u, m_input_rows - args.kernel_shape.rows + 1);
+ const auto tile_stride_cols = std::max(1u, m_input_cols - args.kernel_shape.cols + 1);
+ const auto n_tile_rows = iceildiv(
+ args.output_shape.rows, m_input_rows - args.kernel_shape.rows + 1);
+ const auto n_tile_cols = iceildiv(
+ args.output_shape.cols, m_input_cols - args.kernel_shape.cols + 1);
+
+ // Execute over all batches
+ for (unsigned int batch = 0; batch < args.n_batches; batch++)
+ {
+ auto outptr_tile = outptr + thread_id * n_tile_cols * ld_out_row;
+
+ // For a single batch, stripe the rows over the threads.
+ for (auto tile_i = thread_id; tile_i < n_tile_rows; tile_i += n_threads)
+ {
+ // Compute pointers and padding for this row of tiles
+ const auto start_i = tile_i * tile_stride_rows;
+ const auto pad_top = start_i < args.pad_top ? args.pad_top - start_i : 0;
+ const auto inptr_row = inptr + (pad_top ? 0 : start_i - args.pad_top) * ld_in_row;
+ const auto valid_rows = args.input_shape.rows - (pad_top ? 0 : start_i - args.pad_top);
+
+ // Iterate over columns
+ for (auto tile_j = 0u; tile_j < n_tile_cols; tile_j++)
+ {
+ // Compute pointers and padding for this tile, then delegate to
+ // execute the kernel.
+ const auto start_j = tile_j * tile_stride_cols;
+ const auto pad_left = start_j < args.pad_left ? args.pad_left - start_j : 0;
+ const auto inptr_tile = inptr_row + (pad_left ? 0 : start_j - args.pad_left) * ld_in_col;
+ const auto valid_cols = args.input_shape.cols - (pad_left ? 0 : start_j - args.pad_left);
+
+ this->execute_tile(
+ args.n_input_channels,
+ inptr_tile, ld_in_row, ld_in_col,
+ outptr_tile, ld_out_matrix,
+ pad_top, valid_rows, pad_left, valid_cols,
+ working_space
+ );
+ outptr_tile += ld_out_row;
+ }
+
+ outptr_tile += (n_threads - 1) * n_tile_cols * ld_out_row;
+ }
+
+ inptr += ld_in_batch;
+ outptr += ld_out_batch;
+ }
+ }
+
+ public:
+ TransformBase(const std::string &name, unsigned int input_rows, unsigned int input_cols)
+ : m_name(name), m_input_rows(input_rows), m_input_cols(input_cols)
+ {
+ }
+
+ const std::string &get_name(void) const override { return m_name; }
+
+ unsigned int get_input_rows(void) const override final { return m_input_rows; }
+ unsigned int get_input_cols(void) const override final { return m_input_cols; }
+
+ size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override
+ {
+ return n_threads * this->get_working_space_per_thread(args);
+ }
+
+ void execute(
+ const ConvolutionArgs &args,
+ const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
+ void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
+ void *working_space, unsigned int thread_id, unsigned int n_threads
+ ) const override
+ {
+ execute_internal(
+ args,
+ reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_row, ld_in_col,
+ reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_matrix, ld_out_row,
+ working_space, thread_id, n_threads
+ );
+ }
+};
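+
+// Worked example of the traversal above (illustrative numbers): with 6x6
+// input tiles and a 3x3 kernel, each tile yields a 4x4 block of outputs, so
+// tile_stride_{rows,cols} = 6 - 3 + 1 = 4 and a 56x56 output needs
+// iceildiv(56, 4) = 14 tile rows and columns. Thread t then handles tile
+// rows t, t + n_threads, t + 2*n_threads, ... within each batch.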
+
+template <typename TIn, typename TOut=TIn>
+class TransformDirect : public TransformBase<TIn, TOut>
+{
+ using Kernel = std::function<void(
+ unsigned int, // Number of channels
+ const TIn *, size_t, size_t, // Pointer to first valid input element, row and column stride
+ unsigned int, unsigned int, unsigned int, unsigned int, // Top, left, bottom and right padding
+ TOut *, size_t // Base output pointer, stride between matrices
+ )>;
+ const Kernel m_kernel;
+
+ protected:
+ void execute_tile(
+ unsigned int n_channels,
+ const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
+ TOut *outptr, size_t ld_out_matrix,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols,
+ void *
+ ) const override
+ {
+ const auto end_i = this->get_input_rows() - pad_top;
+ const auto pad_bottom = end_i < valid_rows ? 0 : end_i - valid_rows;
+ const auto end_j = this->get_input_cols() - pad_left;
+ const auto pad_right = end_j < valid_cols ? 0 : end_j - valid_cols;
+
+ // Execute the kernel
+ m_kernel(
+ n_channels, inptr, ld_in_row, ld_in_col,
+ pad_top, pad_left, pad_bottom, pad_right,
+ outptr, ld_out_matrix
+ );
+ }
+
+ public:
+ TransformDirect(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
+ : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
+ {
+ }
+};
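+
+// Padding example for the computation above: with get_input_rows() == 6,
+// pad_top == 1 and valid_rows == 3, end_i == 5 and pad_bottom == 5 - 3 == 2,
+// i.e. the kernel sees one padded row on top, rows 1..3 of real data, and
+// two padded rows at the bottom.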
+
+template <typename TIn, typename TOut=TIn>
+class TransformIndirect : public TransformBase<TIn, TOut>
+{
+ using Kernel = std::function<void(
+ unsigned int, // Number of channels
+ const TIn *const *, // Input pointers (one per point)
+ TOut *, size_t // Base output pointer, stride between matrices
+ )>;
+ const Kernel m_kernel;
+
+ struct Workspace
+ {
+ const TIn **inptrs;
+ const TIn *input_buffer;
+ };
+
+ size_t sizeof_inptr_array(void) const
+ {
+    return sizeof(const TIn *) * this->get_input_rows() * this->get_input_cols();
+ }
+
+ protected:
+ size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
+ {
+ return sizeof(Workspace) + sizeof_inptr_array() + sizeof(TIn) * args.n_input_channels;
+ }
+
+ void initialise_thread_working_space(const ConvolutionArgs &args, void *buffer) const override
+ {
+ Workspace *ws = reinterpret_cast<Workspace *>(buffer);
+ buffer = ws + 1;
+
+ ws->inptrs = reinterpret_cast<const TIn **>(buffer);
+ buffer = reinterpret_cast<char *>(buffer) + sizeof_inptr_array();
+
+ ws->input_buffer = reinterpret_cast<const TIn *>(buffer);
+ memset(buffer, 0, sizeof(TIn) * args.n_input_channels);
+ }
+
+ void execute_tile(
+ unsigned int n_channels,
+ const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
+ TOut *outptr, size_t ld_out_matrix,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols,
+ void *working_space
+ ) const override
+ {
+ // Get the working space
+ auto ws = reinterpret_cast<Workspace *>(working_space);
+
+ // Construct the input pointer array based on the given arguments
+ fill_pointer_array<const TIn>(
+ ws->inptrs, this->get_input_rows(), this->get_input_cols(),
+ inptr, ld_in_row, ld_in_col,
+ ws->input_buffer,
+ pad_top, valid_rows,
+ pad_left, valid_cols
+ );
+
+ // Execute the kernel
+ m_kernel(n_channels, ws->inptrs, outptr, ld_out_matrix);
+ }
+
+ public:
+ TransformIndirect(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
+ : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
+ {
+ }
+};
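+
+// Behaviour assumed of fill_pointer_array above (the helper itself lives in
+// arm_conv/addressing.hpp): each tile element (i, j) gets a pointer into the
+// tensor where it is valid, and the shared zero-filled input_buffer where it
+// falls in the padding. Sketched:
+//
+//   inptrs[i * cols + j] =
+//       (i >= pad_top && i - pad_top < valid_rows &&
+//        j >= pad_left && j - pad_left < valid_cols)
+//       ? inptr + (i - pad_top) * ld_in_row + (j - pad_left) * ld_in_col
+//       : input_buffer;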
+
+template <typename TIn, typename TOut=TIn>
+class TransformUnpadded : public TransformBase<TIn, TOut>
+{
+ using Kernel = std::function<void(
+ unsigned int, // Number of channels
+ const TIn *, size_t, size_t, // Pointer to first input element, row and column stride
+ TOut *, size_t // Base output pointer, stride between matrices
+ )>;
+ const Kernel m_kernel;
+
+ protected:
+ size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
+ {
+ const auto input_points = this->get_input_rows() * this->get_input_cols();
+ return sizeof(TIn) * input_points * args.n_input_channels;
+ }
+
+ void execute_tile(
+ unsigned int n_channels,
+ const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
+ TOut *const outptr, const size_t ld_out_matrix,
+ const unsigned int pad_top, const unsigned int valid_rows,
+ const unsigned int pad_left, const unsigned int valid_cols,
+ void *const working_space
+ ) const override
+ {
+ // If there's any padding, then copy the valid portion of the tensor into
+ // the working space and reset the pointer, row and column strides to point
+ // at this copy of the data.
+ if (pad_top || valid_rows < this->get_input_rows() ||
+ pad_left || valid_cols < this->get_input_cols())
+ {
+ const auto patch_ld_col = n_channels;
+ const auto patch_ld_row = patch_ld_col * this->get_input_cols();
+ auto patch = reinterpret_cast<TIn *>(working_space) +
+ pad_top*patch_ld_row + pad_left*patch_ld_col;
+
+ // Fill the input patch with padding
+ memset(working_space, 0, sizeof(TIn) * this->get_input_rows() * patch_ld_row);
+
+ // Determine the bounds for which to copy
+ const auto last_i = std::min(valid_rows + pad_top, this->get_input_rows());
+ const auto last_j = std::min(valid_cols + pad_left, this->get_input_cols());
+
+ // Copy across the valid portion of the patch
+ for (auto i = pad_top; i < last_i; i++)
+ {
+ auto inptr_col = inptr;
+ inptr += ld_in_row;
+
+ auto patch_col = patch;
+ patch += patch_ld_row;
+
+ for (auto j = pad_left; j < last_j; j++)
+ {
+ // Perform the copy and progress both input and patch pointers
+ memcpy(patch_col, inptr_col, n_channels * sizeof(TIn));
+ inptr_col += ld_in_col;
+ patch_col += patch_ld_col;
+ }
+ }
+
+ // Override the input pointer and strides
+ inptr = reinterpret_cast<const TIn *>(working_space);
+ ld_in_col = patch_ld_col;
+ ld_in_row = patch_ld_row;
+ }
+
+ // Call the kernel
+ m_kernel(n_channels, inptr, ld_in_row, ld_in_col, outptr, ld_out_matrix);
+ }
+
+ public:
+ TransformUnpadded(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
+ : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
+ {
+ }
+
+  /* Utility method which can be used to get a transposed version of a
+   * kernel; it simply calls the given kernel with the input row and column
+   * strides swapped.
+   */
+ static constexpr Kernel get_transposed_kernel(const Kernel &kernel)
+ {
+ return [kernel] (
+ const unsigned int n_channels,
+ const TIn *const inptr, const size_t ld_in_row, const size_t ld_in_col,
+ TOut *const outptr, const size_t ld_out_matrix
+ ) {
+ kernel(n_channels, inptr, ld_in_col, ld_in_row, outptr, ld_out_matrix);
+ };
+ }
+};
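+
+// Usage note (illustrative): a kernel written for row-major tile traversal
+// can serve the transposed traversal via
+//   const auto t = TransformUnpadded<float>::get_transposed_kernel(kernel);
+// which simply swaps the row and column strides passed through to `kernel`.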
+
+} // namespace input_transform
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp
new file mode 100644
index 0000000000..ad759b225e
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+#include <arm_neon.h>
+#include <cstddef>
+
+namespace arm_conv {
+namespace winograd {
+namespace input_transform {
+
+void a64_fp16_6x6(
+ const unsigned int n_channels,
+ const __fp16* const input_base,
+ const size_t input_row_stride,
+ const size_t input_col_stride,
+ __fp16* outptr,
+ const size_t matrix_stride
+)
+{
+ constexpr int inner_tile_rows = 6;
+ constexpr int inner_tile_cols = 6;
+
+ // Get pointers into the input tile
+ const __fp16 *x_ptrs[inner_tile_rows][inner_tile_cols];
+  for (int i = 0; i < inner_tile_rows; i++)
+  {
+    // Get a pointer into the row
+    const __fp16* const row_ptr = input_base + i*input_row_stride;
+
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      x_ptrs[i][j] = row_ptr + j*input_col_stride;
+    }
+  }
+
+  // Matrices used/computed in this kernel (these scalar arrays serve the
+  // channel tail loop; the vector loops below shadow them).
+ __fp16 x[inner_tile_rows][inner_tile_cols];
+ __fp16 XTx[inner_tile_rows][inner_tile_cols];
+ __fp16 U[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = XTx[i][j] = 0.0f;
+ }
+ }
+
+ // Perform the Winograd input transformation for each channel in the input
+ // tensor.
+ int channels_remaining = n_channels;
+ for (; channels_remaining >= 8; channels_remaining -= 8)
+ {
+ // Matrices used/computed in this kernel
+ float16x8_t x[inner_tile_rows][inner_tile_cols];
+ float16x8_t XTx[inner_tile_rows][inner_tile_cols];
+ float16x8_t U[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vdupq_n_f16(0.0f);
+ XTx[i][j] = vdupq_n_f16(0.0f);
+ }
+ }
+
+    // Read a 6x6 tile of the input (spatial domain)
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vld1q_f16(x_ptrs[i][j]);
+ x_ptrs[i][j] += 8;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
+ XTx[0][j] = vsubq_f16(vaddq_f16(x[4][j], vmulq_f16(x[0][j], vdupq_n_f16(4.0f))), vmulq_f16(x[2][j], vdupq_n_f16(5.0f)));
+
+ // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
+ XTx[1][j] = vsubq_f16(vaddq_f16(x[3][j], x[4][j]), vmulq_f16(vaddq_f16(x[1][j], x[2][j]), vdupq_n_f16(4.0f)));
+
+ // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
+ XTx[2][j] = vaddq_f16(vsubq_f16(x[4][j], x[3][j]), vmulq_f16(vsubq_f16(x[1][j], x[2][j]), vdupq_n_f16(4.0f)));
+
+ // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
+ XTx[3][j] = vaddq_f16(vsubq_f16(x[4][j], x[2][j]), vmulq_f16(vsubq_f16(x[3][j], x[1][j]), vdupq_n_f16(2.0f)));
+
+ // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
+ XTx[4][j] = vaddq_f16(vsubq_f16(x[4][j], x[2][j]), vmulq_f16(vsubq_f16(x[1][j], x[3][j]), vdupq_n_f16(2.0f)));
+
+ // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
+ XTx[5][j] = vsubq_f16(vaddq_f16(x[5][j], vmulq_f16(x[1][j], vdupq_n_f16(4.0f))), vmulq_f16(x[3][j], vdupq_n_f16(5.0f)));
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
+ U[i][0] = vsubq_f16(vaddq_f16(XTx[i][4], vmulq_f16(XTx[i][0], vdupq_n_f16(4.0f))), vmulq_f16(XTx[i][2], vdupq_n_f16(5.0f)));
+
+ // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
+ U[i][1] = vsubq_f16(vaddq_f16(XTx[i][3], XTx[i][4]), vmulq_f16(vaddq_f16(XTx[i][1], XTx[i][2]), vdupq_n_f16(4.0f)));
+
+ // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
+ U[i][2] = vaddq_f16(vsubq_f16(XTx[i][4], XTx[i][3]), vmulq_f16(vsubq_f16(XTx[i][1], XTx[i][2]), vdupq_n_f16(4.0f)));
+
+ // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
+ U[i][3] = vaddq_f16(vsubq_f16(XTx[i][4], XTx[i][2]), vmulq_f16(vsubq_f16(XTx[i][3], XTx[i][1]), vdupq_n_f16(2.0f)));
+
+ // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
+ U[i][4] = vaddq_f16(vsubq_f16(XTx[i][4], XTx[i][2]), vmulq_f16(vsubq_f16(XTx[i][1], XTx[i][3]), vdupq_n_f16(2.0f)));
+
+ // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
+ U[i][5] = vsubq_f16(vaddq_f16(XTx[i][5], vmulq_f16(XTx[i][1], vdupq_n_f16(4.0f))), vmulq_f16(XTx[i][3], vdupq_n_f16(5.0f)));
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ vst1q_f16(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 8;
+ }
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used/computed in this kernel
+ float16x4_t x[inner_tile_rows][inner_tile_cols];
+ float16x4_t XTx[inner_tile_rows][inner_tile_cols];
+ float16x4_t U[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vdup_n_f16(0.0f);
+ XTx[i][j] = vdup_n_f16(0.0f);
+ }
+ }
+
+    // Read a 6x6 tile of the input (spatial domain)
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vld1_f16(x_ptrs[i][j]);
+ x_ptrs[i][j] += 4;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
+ XTx[0][j] = vsub_f16(vadd_f16(x[4][j], vmul_f16(x[0][j], vdup_n_f16(4.0f))), vmul_f16(x[2][j], vdup_n_f16(5.0f)));
+
+ // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
+ XTx[1][j] = vsub_f16(vadd_f16(x[3][j], x[4][j]), vmul_f16(vadd_f16(x[1][j], x[2][j]), vdup_n_f16(4.0f)));
+
+ // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
+ XTx[2][j] = vadd_f16(vsub_f16(x[4][j], x[3][j]), vmul_f16(vsub_f16(x[1][j], x[2][j]), vdup_n_f16(4.0f)));
+
+ // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
+ XTx[3][j] = vadd_f16(vsub_f16(x[4][j], x[2][j]), vmul_f16(vsub_f16(x[3][j], x[1][j]), vdup_n_f16(2.0f)));
+
+ // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
+ XTx[4][j] = vadd_f16(vsub_f16(x[4][j], x[2][j]), vmul_f16(vsub_f16(x[1][j], x[3][j]), vdup_n_f16(2.0f)));
+
+ // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
+ XTx[5][j] = vsub_f16(vadd_f16(x[5][j], vmul_f16(x[1][j], vdup_n_f16(4.0f))), vmul_f16(x[3][j], vdup_n_f16(5.0f)));
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
+ U[i][0] = vsub_f16(vadd_f16(XTx[i][4], vmul_f16(XTx[i][0], vdup_n_f16(4.0f))), vmul_f16(XTx[i][2], vdup_n_f16(5.0f)));
+
+ // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
+ U[i][1] = vsub_f16(vadd_f16(XTx[i][3], XTx[i][4]), vmul_f16(vadd_f16(XTx[i][1], XTx[i][2]), vdup_n_f16(4.0f)));
+
+ // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
+ U[i][2] = vadd_f16(vsub_f16(XTx[i][4], XTx[i][3]), vmul_f16(vsub_f16(XTx[i][1], XTx[i][2]), vdup_n_f16(4.0f)));
+
+ // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
+ U[i][3] = vadd_f16(vsub_f16(XTx[i][4], XTx[i][2]), vmul_f16(vsub_f16(XTx[i][3], XTx[i][1]), vdup_n_f16(2.0f)));
+
+ // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
+ U[i][4] = vadd_f16(vsub_f16(XTx[i][4], XTx[i][2]), vmul_f16(vsub_f16(XTx[i][1], XTx[i][3]), vdup_n_f16(2.0f)));
+
+ // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
+ U[i][5] = vsub_f16(vadd_f16(XTx[i][5], vmul_f16(XTx[i][1], vdup_n_f16(4.0f))), vmul_f16(XTx[i][3], vdup_n_f16(5.0f)));
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ vst1_f16(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 4;
+ }
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load x
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = *(x_ptrs[i][j]++);
+ }
+ }
+
+ // Compute XT . x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
+ XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
+ XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
+ XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
+ XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
+ XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
+ U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
+ U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
+ U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
+ U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
+ U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = U[i][j];
+ }
+ }
+ outptr++;
+ }
+}
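+
+// For reference, the coefficients above are the rows of the F(4x4, 3x3)
+// Winograd input-transform matrix B^T (interpolation points {0, +/-1, +/-2});
+// the kernel computes U = B^T . x . B one channel vector at a time:
+//
+//         [ 4   0  -5   0   1   0 ]
+//         [ 0  -4  -4   1   1   0 ]
+//   B^T = [ 0   4  -4  -1   1   0 ]
+//         [ 0  -2  -1   2   1   0 ]
+//         [ 0   2  -1  -2   1   0 ]
+//         [ 0   4   0  -5   0   1 ]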
+
+} // namespace input_transform
+} // namespace winograd
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp32_6x6.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp32_6x6.cpp
new file mode 100644
index 0000000000..a2c04e0d8d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp32_6x6.cpp
@@ -0,0 +1,1140 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+#include <cstddef>
+
+namespace arm_conv {
+namespace winograd {
+namespace input_transform {
+
+void a64_fp32_6x6(
+ unsigned int n_channels,
+ const float *input_base,
+ const size_t input_row_stride,
+ const size_t input_col_stride,
+ float *matrix_base,
+ const size_t matrix_stride
+)
+{
+ const float pcoeffs[4] = {1.0f, 2.0f, 4.0f, 5.0f};
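+  // q0/v0 is loaded with these four values, so the v0.s[n] operands in the
+  // assembly below are s[0] = 1.0f, s[1] = 2.0f, s[2] = 4.0f, s[3] = 5.0f:
+  // the F(4x4, 3x3) input-transform coefficients.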
+ __asm__ __volatile__(
+ "ldr q0, [%[pcoeffs]]\n"
+ "add x25, %[inptr0], %[input_row_stride]\n"
+ "add x10, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x16, x25, %[input_row_stride]\n"
+ "add x8, x10, %[input_col_stride1]\n"
+ "add x26, x16, %[input_row_stride]\n"
+ "add x20, x8, %[input_col_stride1]\n"
+ "add x17, x26, %[input_row_stride]\n"
+ "add x21, x20, %[input_col_stride1]\n"
+ "add x27, x17, %[input_row_stride]\n"
+ "add x28, %[outptr0], %[output_row_stride]\n"
+ "add x11, %[output_col_stride1], %[output_col_stride1]\n"
+ "add x22, x28, %[output_row_stride]\n"
+ "add x13, x11, %[output_col_stride1]\n"
+ "add x12, x22, %[output_row_stride]\n"
+ "add x23, x13, %[output_col_stride1]\n"
+ "add x14, x12, %[output_row_stride]\n"
+ "add x15, x23, %[output_col_stride1]\n"
+ "add x24, x14, %[output_row_stride]\n"
+ "cmp %w[n_channels], #4\n"
+ "blt 2f\n"
+ "1:\n"
+ "ldr q8, [%[inptr0], x20]\n"
+ "ldr q2, [%[inptr0], x10]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr q9, [%[inptr0]]\n"
+ "mov v10.16b, v8.16b\n"
+ "ldr q1, [%[inptr0], x21]\n"
+ "fmla v14.4s, v9.4s, v0.s[2]\n"
+ "ldr q4, [%[inptr0], x8]\n"
+ "mov v9.16b, v8.16b\n"
+ "ldr q12, [%[inptr0], %[input_col_stride1]]\n"
+ "fmls v10.4s, v12.4s, v0.s[2]\n"
+ "ldr q5, [x16, x20]\n"
+ "fmls v14.4s, v2.4s, v0.s[3]\n"
+ "ldr q20, [x16, x10]\n"
+ "fmla v9.4s, v12.4s, v0.s[2]\n"
+ "ldr q3, [x16]\n"
+ "fmls v10.4s, v2.4s, v0.s[2]\n"
+ "ldr q6, [x16, x21]\n"
+ "mov v7.16b, v8.16b\n"
+ "ldr q16, [x16, x8]\n"
+ "fmls v9.4s, v2.4s, v0.s[2]\n"
+ "ldr q22, [x16, %[input_col_stride1]]\n"
+ "fadd v10.4s, v10.4s, v4.4s\n"
+ "ldr q17, [x17, x20]\n"
+ "fmls v7.4s, v12.4s, v0.s[1]\n"
+ "ldr q15, [x17, x10]\n"
+ "fsub v9.4s, v9.4s, v4.4s\n"
+ "ldr q19, [x17]\n"
+ "mov v8.16b, v8.16b\n"
+ "ldr q18, [x17, x21]\n"
+ "fsub v7.4s, v7.4s, v2.4s\n"
+ "ldr q13, [x17, x8]\n"
+ "fmla v7.4s, v4.4s, v0.s[1]\n"
+ "ldr q21, [x17, %[input_col_stride1]]\n"
+ "fmla v8.4s, v12.4s, v0.s[1]\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "mov v11.16b, v1.16b\n"
+ "add x16, x16, #16\n"
+ "mov v1.16b, v5.16b\n"
+ "add x17, x17, #16\n"
+ "fsub v8.4s, v8.4s, v2.4s\n"
+ "fmla v11.4s, v12.4s, v0.s[2]\n"
+ "fmls v8.4s, v4.4s, v0.s[1]\n"
+ "fmla v1.4s, v3.4s, v0.s[2]\n"
+ "mov v2.16b, v5.16b\n"
+ "mov v3.16b, v5.16b\n"
+ "fmls v11.4s, v4.4s, v0.s[3]\n"
+ "mov v4.16b, v5.16b\n"
+ "fmls v1.4s, v20.4s, v0.s[3]\n"
+ "fmls v2.4s, v22.4s, v0.s[2]\n"
+ "fmla v3.4s, v22.4s, v0.s[2]\n"
+ "fmls v4.4s, v22.4s, v0.s[1]\n"
+ "mov v5.16b, v5.16b\n"
+ "mov v6.16b, v6.16b\n"
+ "fmls v2.4s, v20.4s, v0.s[2]\n"
+ "mov v12.16b, v17.16b\n"
+ "fmls v3.4s, v20.4s, v0.s[2]\n"
+ "fsub v4.4s, v4.4s, v20.4s\n"
+ "fmla v4.4s, v16.4s, v0.s[1]\n"
+ "fmla v5.4s, v22.4s, v0.s[1]\n"
+ "fadd v2.4s, v2.4s, v16.4s\n"
+ "fmla v6.4s, v22.4s, v0.s[2]\n"
+ "fsub v3.4s, v3.4s, v16.4s\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "fsub v5.4s, v5.4s, v20.4s\n"
+ "mov v19.16b, v17.16b\n"
+ "fmls v5.4s, v16.4s, v0.s[1]\n"
+ "fmls v6.4s, v16.4s, v0.s[3]\n"
+ "fmls v12.4s, v15.4s, v0.s[3]\n"
+ "fmls v19.4s, v21.4s, v0.s[2]\n"
+ "mov v20.16b, v17.16b\n"
+ "mov v16.16b, v17.16b\n"
+ "mov v17.16b, v17.16b\n"
+ "mov v18.16b, v18.16b\n"
+ "fmls v19.4s, v15.4s, v0.s[2]\n"
+ "fmla v20.4s, v21.4s, v0.s[2]\n"
+ "fmls v16.4s, v21.4s, v0.s[1]\n"
+ "fmla v17.4s, v21.4s, v0.s[1]\n"
+ "fmla v18.4s, v21.4s, v0.s[2]\n"
+ "mov v23.16b, v12.16b\n"
+ "fadd v19.4s, v19.4s, v13.4s\n"
+ "fmls v20.4s, v15.4s, v0.s[2]\n"
+ "fsub v16.4s, v16.4s, v15.4s\n"
+ "fsub v17.4s, v17.4s, v15.4s\n"
+ "fmla v16.4s, v13.4s, v0.s[1]\n"
+ "fmls v17.4s, v13.4s, v0.s[1]\n"
+ "fsub v20.4s, v20.4s, v13.4s\n"
+ "fmls v18.4s, v13.4s, v0.s[3]\n"
+ "fmla v23.4s, v14.4s, v0.s[2]\n"
+ "mov v15.16b, v19.16b\n"
+ "mov v14.16b, v20.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "fmla v15.4s, v10.4s, v0.s[2]\n"
+ "mov v10.16b, v17.16b\n"
+ "fmls v23.4s, v1.4s, v0.s[3]\n"
+ "fmla v14.4s, v9.4s, v0.s[2]\n"
+ "fmla v24.4s, v7.4s, v0.s[2]\n"
+ "fmla v10.4s, v8.4s, v0.s[2]\n"
+ "fmls v15.4s, v2.4s, v0.s[3]\n"
+ "mov v7.16b, v18.16b\n"
+ "str q23, [%[outptr0]]\n"
+ "fmls v14.4s, v3.4s, v0.s[3]\n"
+ "fmls v24.4s, v4.4s, v0.s[3]\n"
+ "fmls v10.4s, v5.4s, v0.s[3]\n"
+ "str q15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v7.4s, v11.4s, v0.s[2]\n"
+ "str q14, [%[outptr0], x11]\n"
+ "str q24, [%[outptr0], x13]\n"
+ "str q10, [%[outptr0], x23]\n"
+ "fmls v7.4s, v6.4s, v0.s[3]\n"
+ "str q7, [%[outptr0], x15]\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "mov v26.16b, v12.16b\n"
+ "mov v25.16b, v19.16b\n"
+ "ldr q11, [x25, x20]\n"
+ "mov v10.16b, v11.16b\n"
+ "ldr q23, [x25, x10]\n"
+ "mov v9.16b, v11.16b\n"
+ "ldr q7, [x25]\n"
+ "fmla v10.4s, v7.4s, v0.s[2]\n"
+ "ldr q13, [x25, x21]\n"
+ "mov v7.16b, v11.16b\n"
+ "ldr q31, [x25, x8]\n"
+ "mov v8.16b, v11.16b\n"
+ "ldr q21, [x25, %[input_col_stride1]]\n"
+ "fmls v10.4s, v23.4s, v0.s[3]\n"
+ "ldr q30, [x26, x20]\n"
+ "fmls v9.4s, v21.4s, v0.s[2]\n"
+ "ldr q29, [x26, x10]\n"
+ "fmla v7.4s, v21.4s, v0.s[2]\n"
+ "ldr q22, [x26]\n"
+ "fmls v8.4s, v21.4s, v0.s[1]\n"
+ "ldr q24, [x26, x21]\n"
+ "fmls v9.4s, v23.4s, v0.s[2]\n"
+ "ldr q27, [x26, x8]\n"
+ "fmls v7.4s, v23.4s, v0.s[2]\n"
+ "ldr q28, [x26, %[input_col_stride1]]\n"
+ "fsub v8.4s, v8.4s, v23.4s\n"
+ "add x25, x25, #16\n"
+ "fadd v9.4s, v9.4s, v31.4s\n"
+ "add x26, x26, #16\n"
+ "fsub v7.4s, v7.4s, v31.4s\n"
+ "fmla v8.4s, v31.4s, v0.s[1]\n"
+ "mov v11.16b, v11.16b\n"
+ "mov v15.16b, v13.16b\n"
+ "mov v14.16b, v30.16b\n"
+ "mov v13.16b, v30.16b\n"
+ "fmla v11.4s, v21.4s, v0.s[1]\n"
+ "fmla v15.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v22.4s, v0.s[2]\n"
+ "fmls v13.4s, v28.4s, v0.s[2]\n"
+ "mov v21.16b, v30.16b\n"
+ "mov v22.16b, v30.16b\n"
+ "fsub v11.4s, v11.4s, v23.4s\n"
+ "fmls v15.4s, v31.4s, v0.s[3]\n"
+ "fmls v11.4s, v31.4s, v0.s[1]\n"
+ "fmls v14.4s, v29.4s, v0.s[3]\n"
+ "fmls v13.4s, v29.4s, v0.s[2]\n"
+ "fmla v21.4s, v28.4s, v0.s[2]\n"
+ "fmls v22.4s, v28.4s, v0.s[1]\n"
+ "mov v23.16b, v30.16b\n"
+ "mov v24.16b, v24.16b\n"
+ "fmls v26.4s, v10.4s, v0.s[2]\n"
+ "fadd v13.4s, v13.4s, v27.4s\n"
+ "fmls v21.4s, v29.4s, v0.s[2]\n"
+ "fsub v22.4s, v22.4s, v29.4s\n"
+ "fmla v23.4s, v28.4s, v0.s[1]\n"
+ "fmla v22.4s, v27.4s, v0.s[1]\n"
+ "fmla v24.4s, v28.4s, v0.s[2]\n"
+ "fsub v21.4s, v21.4s, v27.4s\n"
+ "fmls v26.4s, v1.4s, v0.s[2]\n"
+ "fsub v23.4s, v23.4s, v29.4s\n"
+ "fmls v25.4s, v9.4s, v0.s[2]\n"
+ "fmls v23.4s, v27.4s, v0.s[1]\n"
+ "fmls v24.4s, v27.4s, v0.s[3]\n"
+ "fadd v26.4s, v26.4s, v14.4s\n"
+ "mov v27.16b, v20.16b\n"
+ "str q26, [x28]\n"
+ "fmls v25.4s, v2.4s, v0.s[2]\n"
+ "fmls v27.4s, v7.4s, v0.s[2]\n"
+ "mov v31.16b, v16.16b\n"
+ "mov v30.16b, v17.16b\n"
+ "mov v29.16b, v18.16b\n"
+ "fadd v25.4s, v25.4s, v13.4s\n"
+ "fmls v31.4s, v8.4s, v0.s[2]\n"
+ "str q25, [x28, %[output_col_stride1]]\n"
+ "fmls v27.4s, v3.4s, v0.s[2]\n"
+ "fmls v30.4s, v11.4s, v0.s[2]\n"
+ "fmls v29.4s, v15.4s, v0.s[2]\n"
+ "fmls v31.4s, v4.4s, v0.s[2]\n"
+ "mov v26.16b, v12.16b\n"
+ "fadd v27.4s, v27.4s, v21.4s\n"
+ "mov v25.16b, v19.16b\n"
+ "str q27, [x28, x11]\n"
+ "fmls v30.4s, v5.4s, v0.s[2]\n"
+ "fadd v31.4s, v31.4s, v22.4s\n"
+ "fmls v29.4s, v6.4s, v0.s[2]\n"
+ "str q31, [x28, x13]\n"
+ "fmla v26.4s, v10.4s, v0.s[2]\n"
+ "fadd v30.4s, v30.4s, v23.4s\n"
+ "fmla v25.4s, v9.4s, v0.s[2]\n"
+ "str q30, [x28, x23]\n"
+ "fadd v29.4s, v29.4s, v24.4s\n"
+ "str q29, [x28, x15]\n"
+ "fmls v26.4s, v1.4s, v0.s[2]\n"
+ "fmls v25.4s, v2.4s, v0.s[2]\n"
+ "add x28, x28, #16\n"
+ "mov v30.16b, v20.16b\n"
+ "mov v29.16b, v16.16b\n"
+ "fsub v26.4s, v26.4s, v14.4s\n"
+ "mov v28.16b, v17.16b\n"
+ "str q26, [x22]\n"
+ "fsub v25.4s, v25.4s, v13.4s\n"
+ "str q25, [x22, %[output_col_stride1]]\n"
+ "fmla v30.4s, v7.4s, v0.s[2]\n"
+ "fmla v29.4s, v8.4s, v0.s[2]\n"
+ "fmla v28.4s, v11.4s, v0.s[2]\n"
+ "mov v26.16b, v18.16b\n"
+ "mov v25.16b, v12.16b\n"
+ "fmls v30.4s, v3.4s, v0.s[2]\n"
+ "mov v31.16b, v19.16b\n"
+ "fmls v29.4s, v4.4s, v0.s[2]\n"
+ "fmls v28.4s, v5.4s, v0.s[2]\n"
+ "fmla v26.4s, v15.4s, v0.s[2]\n"
+ "fmls v25.4s, v10.4s, v0.s[1]\n"
+ "fsub v30.4s, v30.4s, v21.4s\n"
+ "fmls v31.4s, v9.4s, v0.s[1]\n"
+ "str q30, [x22, x11]\n"
+ "fsub v29.4s, v29.4s, v22.4s\n"
+ "str q29, [x22, x13]\n"
+ "fsub v28.4s, v28.4s, v23.4s\n"
+ "str q28, [x22, x23]\n"
+ "fmls v26.4s, v6.4s, v0.s[2]\n"
+ "fsub v25.4s, v25.4s, v1.4s\n"
+ "fsub v31.4s, v31.4s, v2.4s\n"
+ "fmla v25.4s, v14.4s, v0.s[1]\n"
+ "fmla v31.4s, v13.4s, v0.s[1]\n"
+ "fsub v26.4s, v26.4s, v24.4s\n"
+ "mov v27.16b, v20.16b\n"
+ "str q26, [x22, x15]\n"
+ "mov v26.16b, v16.16b\n"
+ "str q25, [x12]\n"
+ "fmls v27.4s, v7.4s, v0.s[1]\n"
+ "str q31, [x12, %[output_col_stride1]]\n"
+ "fmls v26.4s, v8.4s, v0.s[1]\n"
+ "mov v25.16b, v17.16b\n"
+ "add x22, x22, #16\n"
+ "fsub v27.4s, v27.4s, v3.4s\n"
+ "mov v28.16b, v18.16b\n"
+ "fmla v27.4s, v21.4s, v0.s[1]\n"
+ "fsub v26.4s, v26.4s, v4.4s\n"
+ "fmla v26.4s, v22.4s, v0.s[1]\n"
+ "fmls v25.4s, v11.4s, v0.s[1]\n"
+ "fmls v28.4s, v15.4s, v0.s[1]\n"
+ "mov v12.16b, v12.16b\n"
+ "str q27, [x12, x11]\n"
+ "mov v19.16b, v19.16b\n"
+ "str q26, [x12, x13]\n"
+ "fsub v25.4s, v25.4s, v5.4s\n"
+ "fmla v25.4s, v23.4s, v0.s[1]\n"
+ "fsub v28.4s, v28.4s, v6.4s\n"
+ "fmla v28.4s, v24.4s, v0.s[1]\n"
+ "fmla v12.4s, v10.4s, v0.s[1]\n"
+ "fmla v19.4s, v9.4s, v0.s[1]\n"
+ "mov v20.16b, v20.16b\n"
+ "str q25, [x12, x23]\n"
+ "mov v16.16b, v16.16b\n"
+ "str q28, [x12, x15]\n"
+ "fsub v12.4s, v12.4s, v1.4s\n"
+ "fmls v12.4s, v14.4s, v0.s[1]\n"
+ "add x12, x12, #16\n"
+ "fsub v19.4s, v19.4s, v2.4s\n"
+ "fmla v20.4s, v7.4s, v0.s[1]\n"
+ "fmls v19.4s, v13.4s, v0.s[1]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "str q12, [x14]\n"
+ "mov v1.16b, v17.16b\n"
+ "fsub v20.4s, v20.4s, v3.4s\n"
+ "mov v17.16b, v18.16b\n"
+ "str q19, [x14, %[output_col_stride1]]\n"
+ "fmls v20.4s, v21.4s, v0.s[1]\n"
+ "fsub v16.4s, v16.4s, v4.4s\n"
+ "fmla v1.4s, v11.4s, v0.s[1]\n"
+ "fmls v16.4s, v22.4s, v0.s[1]\n"
+ "fmla v17.4s, v15.4s, v0.s[1]\n"
+ "str q20, [x14, x11]\n"
+ "fsub v1.4s, v1.4s, v5.4s\n"
+ "str q16, [x14, x13]\n"
+ "fmls v1.4s, v23.4s, v0.s[1]\n"
+ "fsub v17.4s, v17.4s, v6.4s\n"
+ "fmls v17.4s, v24.4s, v0.s[1]\n"
+ "str q1, [x14, x23]\n"
+ "str q17, [x14, x15]\n"
+ "add x14, x14, #16\n"
+ "ldr q2, [x27, x20]\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr q17, [x27, x10]\n"
+ "mov v12.16b, v2.16b\n"
+ "ldr q18, [x27]\n"
+ "fmla v4.4s, v18.4s, v0.s[2]\n"
+ "ldr q3, [x27, x21]\n"
+ "mov v6.16b, v2.16b\n"
+ "ldr q5, [x27, x8]\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr q18, [x27, %[input_col_stride1]]\n"
+ "fmls v4.4s, v17.4s, v0.s[3]\n"
+ "add x27, x27, #16\n"
+ "fmls v12.4s, v18.4s, v0.s[2]\n"
+ "sub %w[n_channels], %w[n_channels], #4\n"
+ "fmla v6.4s, v18.4s, v0.s[2]\n"
+ "cmp %w[n_channels], #4\n"
+ "fmls v1.4s, v18.4s, v0.s[1]\n"
+ "mov v2.16b, v2.16b\n"
+ "fmls v12.4s, v17.4s, v0.s[2]\n"
+ "mov v3.16b, v3.16b\n"
+ "fmls v6.4s, v17.4s, v0.s[2]\n"
+ "fmla v2.4s, v18.4s, v0.s[1]\n"
+ "fsub v1.4s, v1.4s, v17.4s\n"
+ "fmla v3.4s, v18.4s, v0.s[2]\n"
+ "fadd v12.4s, v12.4s, v5.4s\n"
+ "fmla v1.4s, v5.4s, v0.s[1]\n"
+ "fsub v6.4s, v6.4s, v5.4s\n"
+ "fsub v2.4s, v2.4s, v17.4s\n"
+ "fmls v2.4s, v5.4s, v0.s[1]\n"
+ "fmls v3.4s, v5.4s, v0.s[3]\n"
+ "mov v4.16b, v4.16b\n"
+ "mov v16.16b, v12.16b\n"
+ "mov v5.16b, v6.16b\n"
+ "mov v6.16b, v1.16b\n"
+ "fmla v4.4s, v10.4s, v0.s[2]\n"
+ "fmla v16.4s, v9.4s, v0.s[2]\n"
+ "fmla v5.4s, v7.4s, v0.s[2]\n"
+ "fmla v6.4s, v8.4s, v0.s[2]\n"
+ "mov v9.16b, v2.16b\n"
+ "mov v10.16b, v3.16b\n"
+ "fmls v4.4s, v14.4s, v0.s[3]\n"
+ "fmls v16.4s, v13.4s, v0.s[3]\n"
+ "fmls v5.4s, v21.4s, v0.s[3]\n"
+ "fmls v6.4s, v22.4s, v0.s[3]\n"
+ "fmla v9.4s, v11.4s, v0.s[2]\n"
+ "fmla v10.4s, v15.4s, v0.s[2]\n"
+ "str q4, [x24]\n"
+ "str q16, [x24, %[output_col_stride1]]\n"
+ "str q5, [x24, x11]\n"
+ "str q6, [x24, x13]\n"
+ "fmls v9.4s, v23.4s, v0.s[3]\n"
+ "fmls v10.4s, v24.4s, v0.s[3]\n"
+ "str q9, [x24, x23]\n"
+ "str q10, [x24, x15]\n"
+ "add x24, x24, #16\n"
+ "bge 1b\n"
+ "2:\n"
+ "cmp %w[n_channels], #2\n"
+ "blt 3f\n"
+ "ldr d8, [%[inptr0], x20]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr d2, [%[inptr0], x10]\n"
+ "mov v10.16b, v8.16b\n"
+ "ldr d9, [%[inptr0]]\n"
+ "fmla v14.4s, v9.4s, v0.s[2]\n"
+ "ldr d1, [%[inptr0], x21]\n"
+ "mov v9.16b, v8.16b\n"
+ "ldr d4, [%[inptr0], x8]\n"
+ "mov v7.16b, v8.16b\n"
+ "ldr d12, [%[inptr0], %[input_col_stride1]]\n"
+ "fmls v14.4s, v2.4s, v0.s[3]\n"
+ "ldr d5, [x16, x20]\n"
+ "fmls v10.4s, v12.4s, v0.s[2]\n"
+ "ldr d20, [x16, x10]\n"
+ "fmla v9.4s, v12.4s, v0.s[2]\n"
+ "ldr d3, [x16]\n"
+ "fmls v7.4s, v12.4s, v0.s[1]\n"
+ "ldr d6, [x16, x21]\n"
+ "fmls v10.4s, v2.4s, v0.s[2]\n"
+ "ldr d16, [x16, x8]\n"
+ "fmls v9.4s, v2.4s, v0.s[2]\n"
+ "ldr d22, [x16, %[input_col_stride1]]\n"
+ "fsub v7.4s, v7.4s, v2.4s\n"
+ "ldr d17, [x17, x20]\n"
+ "fadd v10.4s, v10.4s, v4.4s\n"
+ "ldr d15, [x17, x10]\n"
+ "fsub v9.4s, v9.4s, v4.4s\n"
+ "ldr d19, [x17]\n"
+ "fmla v7.4s, v4.4s, v0.s[1]\n"
+ "ldr d18, [x17, x21]\n"
+ "mov v8.16b, v8.16b\n"
+ "ldr d13, [x17, x8]\n"
+ "mov v11.16b, v1.16b\n"
+ "ldr d21, [x17, %[input_col_stride1]]\n"
+ "fmla v8.4s, v12.4s, v0.s[1]\n"
+ "add %[inptr0], %[inptr0], #8\n"
+ "fmla v11.4s, v12.4s, v0.s[2]\n"
+ "add x16, x16, #8\n"
+ "mov v1.16b, v5.16b\n"
+ "add x17, x17, #8\n"
+ "fsub v8.4s, v8.4s, v2.4s\n"
+ "mov v2.16b, v5.16b\n"
+ "fmls v8.4s, v4.4s, v0.s[1]\n"
+ "fmls v11.4s, v4.4s, v0.s[3]\n"
+ "fmla v1.4s, v3.4s, v0.s[2]\n"
+ "fmls v2.4s, v22.4s, v0.s[2]\n"
+ "mov v3.16b, v5.16b\n"
+ "mov v4.16b, v5.16b\n"
+ "mov v5.16b, v5.16b\n"
+ "mov v6.16b, v6.16b\n"
+ "fmls v1.4s, v20.4s, v0.s[3]\n"
+ "fmls v2.4s, v20.4s, v0.s[2]\n"
+ "fmla v3.4s, v22.4s, v0.s[2]\n"
+ "fmls v4.4s, v22.4s, v0.s[1]\n"
+ "fmla v5.4s, v22.4s, v0.s[1]\n"
+ "fmla v6.4s, v22.4s, v0.s[2]\n"
+ "fadd v2.4s, v2.4s, v16.4s\n"
+ "mov v12.16b, v17.16b\n"
+ "fmls v3.4s, v20.4s, v0.s[2]\n"
+ "fsub v4.4s, v4.4s, v20.4s\n"
+ "fmla v4.4s, v16.4s, v0.s[1]\n"
+ "fsub v5.4s, v5.4s, v20.4s\n"
+ "fmls v5.4s, v16.4s, v0.s[1]\n"
+ "fmls v6.4s, v16.4s, v0.s[3]\n"
+ "fsub v3.4s, v3.4s, v16.4s\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "mov v19.16b, v17.16b\n"
+ "mov v20.16b, v17.16b\n"
+ "mov v16.16b, v17.16b\n"
+ "mov v17.16b, v17.16b\n"
+ "fmls v12.4s, v15.4s, v0.s[3]\n"
+ "fmls v19.4s, v21.4s, v0.s[2]\n"
+ "fmla v20.4s, v21.4s, v0.s[2]\n"
+ "fmls v16.4s, v21.4s, v0.s[1]\n"
+ "fmla v17.4s, v21.4s, v0.s[1]\n"
+ "mov v18.16b, v18.16b\n"
+ "fmls v19.4s, v15.4s, v0.s[2]\n"
+ "mov v23.16b, v12.16b\n"
+ "fmls v20.4s, v15.4s, v0.s[2]\n"
+ "fsub v16.4s, v16.4s, v15.4s\n"
+ "fmla v16.4s, v13.4s, v0.s[1]\n"
+ "fsub v17.4s, v17.4s, v15.4s\n"
+ "fadd v19.4s, v19.4s, v13.4s\n"
+ "fmls v17.4s, v13.4s, v0.s[1]\n"
+ "fsub v20.4s, v20.4s, v13.4s\n"
+ "fmla v18.4s, v21.4s, v0.s[2]\n"
+ "fmla v23.4s, v14.4s, v0.s[2]\n"
+ "mov v15.16b, v19.16b\n"
+ "mov v14.16b, v20.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "fmls v18.4s, v13.4s, v0.s[3]\n"
+ "fmla v15.4s, v10.4s, v0.s[2]\n"
+ "fmls v23.4s, v1.4s, v0.s[3]\n"
+ "fmla v14.4s, v9.4s, v0.s[2]\n"
+ "fmla v24.4s, v7.4s, v0.s[2]\n"
+ "mov v10.16b, v17.16b\n"
+ "fmls v15.4s, v2.4s, v0.s[3]\n"
+ "mov v7.16b, v18.16b\n"
+ "str d23, [%[outptr0]]\n"
+ "fmls v14.4s, v3.4s, v0.s[3]\n"
+ "fmls v24.4s, v4.4s, v0.s[3]\n"
+ "fmla v10.4s, v8.4s, v0.s[2]\n"
+ "str d15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v7.4s, v11.4s, v0.s[2]\n"
+ "str d14, [%[outptr0], x11]\n"
+ "fmls v10.4s, v5.4s, v0.s[3]\n"
+ "str d24, [%[outptr0], x13]\n"
+ "fmls v7.4s, v6.4s, v0.s[3]\n"
+ "str d10, [%[outptr0], x23]\n"
+ "str d7, [%[outptr0], x15]\n"
+ "add %[outptr0], %[outptr0], #8\n"
+ "mov v26.16b, v12.16b\n"
+ "mov v25.16b, v19.16b\n"
+ "ldr d11, [x25, x20]\n"
+ "mov v10.16b, v11.16b\n"
+ "ldr d23, [x25, x10]\n"
+ "mov v9.16b, v11.16b\n"
+ "ldr d7, [x25]\n"
+ "fmla v10.4s, v7.4s, v0.s[2]\n"
+ "ldr d13, [x25, x21]\n"
+ "mov v7.16b, v11.16b\n"
+ "ldr d31, [x25, x8]\n"
+ "mov v8.16b, v11.16b\n"
+ "ldr d21, [x25, %[input_col_stride1]]\n"
+ "fmls v10.4s, v23.4s, v0.s[3]\n"
+ "ldr d30, [x26, x20]\n"
+ "fmls v9.4s, v21.4s, v0.s[2]\n"
+ "ldr d29, [x26, x10]\n"
+ "fmla v7.4s, v21.4s, v0.s[2]\n"
+ "ldr d22, [x26]\n"
+ "fmls v8.4s, v21.4s, v0.s[1]\n"
+ "ldr d24, [x26, x21]\n"
+ "fmls v9.4s, v23.4s, v0.s[2]\n"
+ "ldr d27, [x26, x8]\n"
+ "fmls v7.4s, v23.4s, v0.s[2]\n"
+ "ldr d28, [x26, %[input_col_stride1]]\n"
+ "fsub v8.4s, v8.4s, v23.4s\n"
+ "add x25, x25, #8\n"
+ "fadd v9.4s, v9.4s, v31.4s\n"
+ "add x26, x26, #8\n"
+ "fsub v7.4s, v7.4s, v31.4s\n"
+ "fmla v8.4s, v31.4s, v0.s[1]\n"
+ "mov v11.16b, v11.16b\n"
+ "mov v15.16b, v13.16b\n"
+ "mov v14.16b, v30.16b\n"
+ "mov v13.16b, v30.16b\n"
+ "fmla v11.4s, v21.4s, v0.s[1]\n"
+ "fmla v15.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v22.4s, v0.s[2]\n"
+ "fmls v13.4s, v28.4s, v0.s[2]\n"
+ "mov v21.16b, v30.16b\n"
+ "mov v22.16b, v30.16b\n"
+ "fsub v11.4s, v11.4s, v23.4s\n"
+ "fmls v15.4s, v31.4s, v0.s[3]\n"
+ "fmls v11.4s, v31.4s, v0.s[1]\n"
+ "fmls v14.4s, v29.4s, v0.s[3]\n"
+ "fmls v13.4s, v29.4s, v0.s[2]\n"
+ "fmla v21.4s, v28.4s, v0.s[2]\n"
+ "fmls v22.4s, v28.4s, v0.s[1]\n"
+ "mov v23.16b, v30.16b\n"
+ "mov v24.16b, v24.16b\n"
+ "fmls v26.4s, v10.4s, v0.s[2]\n"
+ "fadd v13.4s, v13.4s, v27.4s\n"
+ "fmls v21.4s, v29.4s, v0.s[2]\n"
+ "fsub v22.4s, v22.4s, v29.4s\n"
+ "fmla v23.4s, v28.4s, v0.s[1]\n"
+ "fmla v22.4s, v27.4s, v0.s[1]\n"
+ "fmla v24.4s, v28.4s, v0.s[2]\n"
+ "fsub v21.4s, v21.4s, v27.4s\n"
+ "fmls v26.4s, v1.4s, v0.s[2]\n"
+ "fsub v23.4s, v23.4s, v29.4s\n"
+ "fmls v25.4s, v9.4s, v0.s[2]\n"
+ "fmls v23.4s, v27.4s, v0.s[1]\n"
+ "fmls v24.4s, v27.4s, v0.s[3]\n"
+ "fadd v26.4s, v26.4s, v14.4s\n"
+ "mov v27.16b, v20.16b\n"
+ "str d26, [x28]\n"
+ "fmls v25.4s, v2.4s, v0.s[2]\n"
+ "fmls v27.4s, v7.4s, v0.s[2]\n"
+ "mov v31.16b, v16.16b\n"
+ "mov v30.16b, v17.16b\n"
+ "mov v29.16b, v18.16b\n"
+ "fadd v25.4s, v25.4s, v13.4s\n"
+ "fmls v31.4s, v8.4s, v0.s[2]\n"
+ "str d25, [x28, %[output_col_stride1]]\n"
+ "fmls v27.4s, v3.4s, v0.s[2]\n"
+ "fmls v30.4s, v11.4s, v0.s[2]\n"
+ "fmls v29.4s, v15.4s, v0.s[2]\n"
+ "fmls v31.4s, v4.4s, v0.s[2]\n"
+ "mov v26.16b, v12.16b\n"
+ "fadd v27.4s, v27.4s, v21.4s\n"
+ "mov v25.16b, v19.16b\n"
+ "str d27, [x28, x11]\n"
+ "fmls v30.4s, v5.4s, v0.s[2]\n"
+ "fadd v31.4s, v31.4s, v22.4s\n"
+ "fmls v29.4s, v6.4s, v0.s[2]\n"
+ "str d31, [x28, x13]\n"
+ "fmla v26.4s, v10.4s, v0.s[2]\n"
+ "fadd v30.4s, v30.4s, v23.4s\n"
+ "fmla v25.4s, v9.4s, v0.s[2]\n"
+ "str d30, [x28, x23]\n"
+ "fadd v29.4s, v29.4s, v24.4s\n"
+ "str d29, [x28, x15]\n"
+ "fmls v26.4s, v1.4s, v0.s[2]\n"
+ "fmls v25.4s, v2.4s, v0.s[2]\n"
+ "add x28, x28, #8\n"
+ "mov v30.16b, v20.16b\n"
+ "mov v29.16b, v16.16b\n"
+ "fsub v26.4s, v26.4s, v14.4s\n"
+ "mov v28.16b, v17.16b\n"
+ "str d26, [x22]\n"
+ "fsub v25.4s, v25.4s, v13.4s\n"
+ "str d25, [x22, %[output_col_stride1]]\n"
+ "fmla v30.4s, v7.4s, v0.s[2]\n"
+ "fmla v29.4s, v8.4s, v0.s[2]\n"
+ "fmla v28.4s, v11.4s, v0.s[2]\n"
+ "mov v26.16b, v18.16b\n"
+ "mov v25.16b, v12.16b\n"
+ "fmls v30.4s, v3.4s, v0.s[2]\n"
+ "mov v31.16b, v19.16b\n"
+ "fmls v29.4s, v4.4s, v0.s[2]\n"
+ "fmls v28.4s, v5.4s, v0.s[2]\n"
+ "fmla v26.4s, v15.4s, v0.s[2]\n"
+ "fmls v25.4s, v10.4s, v0.s[1]\n"
+ "fsub v30.4s, v30.4s, v21.4s\n"
+ "fmls v31.4s, v9.4s, v0.s[1]\n"
+ "str d30, [x22, x11]\n"
+ "fsub v29.4s, v29.4s, v22.4s\n"
+ "str d29, [x22, x13]\n"
+ "fsub v28.4s, v28.4s, v23.4s\n"
+ "str d28, [x22, x23]\n"
+ "fmls v26.4s, v6.4s, v0.s[2]\n"
+ "fsub v25.4s, v25.4s, v1.4s\n"
+ "fsub v31.4s, v31.4s, v2.4s\n"
+ "fmla v25.4s, v14.4s, v0.s[1]\n"
+ "fmla v31.4s, v13.4s, v0.s[1]\n"
+ "fsub v26.4s, v26.4s, v24.4s\n"
+ "mov v27.16b, v20.16b\n"
+ "str d26, [x22, x15]\n"
+ "mov v26.16b, v16.16b\n"
+ "str d25, [x12]\n"
+ "fmls v27.4s, v7.4s, v0.s[1]\n"
+ "str d31, [x12, %[output_col_stride1]]\n"
+ "fmls v26.4s, v8.4s, v0.s[1]\n"
+ "mov v25.16b, v17.16b\n"
+ "add x22, x22, #8\n"
+ "fsub v27.4s, v27.4s, v3.4s\n"
+ "mov v28.16b, v18.16b\n"
+ "fmla v27.4s, v21.4s, v0.s[1]\n"
+ "fsub v26.4s, v26.4s, v4.4s\n"
+ "fmla v26.4s, v22.4s, v0.s[1]\n"
+ "fmls v25.4s, v11.4s, v0.s[1]\n"
+ "fmls v28.4s, v15.4s, v0.s[1]\n"
+ "mov v12.16b, v12.16b\n"
+ "str d27, [x12, x11]\n"
+ "mov v19.16b, v19.16b\n"
+ "str d26, [x12, x13]\n"
+ "fsub v25.4s, v25.4s, v5.4s\n"
+ "fmla v25.4s, v23.4s, v0.s[1]\n"
+ "fsub v28.4s, v28.4s, v6.4s\n"
+ "fmla v28.4s, v24.4s, v0.s[1]\n"
+ "fmla v12.4s, v10.4s, v0.s[1]\n"
+ "fmla v19.4s, v9.4s, v0.s[1]\n"
+ "mov v20.16b, v20.16b\n"
+ "str d25, [x12, x23]\n"
+ "mov v16.16b, v16.16b\n"
+ "str d28, [x12, x15]\n"
+ "fsub v12.4s, v12.4s, v1.4s\n"
+ "fmls v12.4s, v14.4s, v0.s[1]\n"
+ "add x12, x12, #8\n"
+ "fsub v19.4s, v19.4s, v2.4s\n"
+ "fmla v20.4s, v7.4s, v0.s[1]\n"
+ "fmls v19.4s, v13.4s, v0.s[1]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "str d12, [x14]\n"
+ "mov v1.16b, v17.16b\n"
+ "fsub v20.4s, v20.4s, v3.4s\n"
+ "mov v17.16b, v18.16b\n"
+ "str d19, [x14, %[output_col_stride1]]\n"
+ "fmls v20.4s, v21.4s, v0.s[1]\n"
+ "fsub v16.4s, v16.4s, v4.4s\n"
+ "fmla v1.4s, v11.4s, v0.s[1]\n"
+ "fmls v16.4s, v22.4s, v0.s[1]\n"
+ "fmla v17.4s, v15.4s, v0.s[1]\n"
+ "str d20, [x14, x11]\n"
+ "fsub v1.4s, v1.4s, v5.4s\n"
+ "str d16, [x14, x13]\n"
+ "fmls v1.4s, v23.4s, v0.s[1]\n"
+ "fsub v17.4s, v17.4s, v6.4s\n"
+ "fmls v17.4s, v24.4s, v0.s[1]\n"
+ "str d1, [x14, x23]\n"
+ "str d17, [x14, x15]\n"
+ "add x14, x14, #8\n"
+ "ldr d2, [x27, x20]\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr d17, [x27, x10]\n"
+ "mov v12.16b, v2.16b\n"
+ "ldr d18, [x27]\n"
+ "fmla v4.4s, v18.4s, v0.s[2]\n"
+ "ldr d3, [x27, x21]\n"
+ "mov v6.16b, v2.16b\n"
+ "ldr d5, [x27, x8]\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d18, [x27, %[input_col_stride1]]\n"
+ "fmls v4.4s, v17.4s, v0.s[3]\n"
+ "add x27, x27, #8\n"
+ "fmls v12.4s, v18.4s, v0.s[2]\n"
+ "sub %w[n_channels], %w[n_channels], #2\n"
+ "fmla v6.4s, v18.4s, v0.s[2]\n"
+ "fmls v1.4s, v18.4s, v0.s[1]\n"
+ "mov v2.16b, v2.16b\n"
+ "mov v3.16b, v3.16b\n"
+ "fmls v12.4s, v17.4s, v0.s[2]\n"
+ "mov v4.16b, v4.16b\n"
+ "fmls v6.4s, v17.4s, v0.s[2]\n"
+ "fsub v1.4s, v1.4s, v17.4s\n"
+ "fmla v1.4s, v5.4s, v0.s[1]\n"
+ "fmla v2.4s, v18.4s, v0.s[1]\n"
+ "fadd v12.4s, v12.4s, v5.4s\n"
+ "fmla v3.4s, v18.4s, v0.s[2]\n"
+ "fsub v6.4s, v6.4s, v5.4s\n"
+ "fmla v4.4s, v10.4s, v0.s[2]\n"
+ "fsub v2.4s, v2.4s, v17.4s\n"
+ "mov v16.16b, v12.16b\n"
+ "fmls v2.4s, v5.4s, v0.s[1]\n"
+ "fmls v3.4s, v5.4s, v0.s[3]\n"
+ "fmls v4.4s, v14.4s, v0.s[3]\n"
+ "fmla v16.4s, v9.4s, v0.s[2]\n"
+ "mov v5.16b, v6.16b\n"
+ "mov v6.16b, v1.16b\n"
+ "mov v9.16b, v2.16b\n"
+ "mov v10.16b, v3.16b\n"
+ "str d4, [x24]\n"
+ "fmls v16.4s, v13.4s, v0.s[3]\n"
+ "fmla v5.4s, v7.4s, v0.s[2]\n"
+ "fmla v6.4s, v8.4s, v0.s[2]\n"
+ "fmla v9.4s, v11.4s, v0.s[2]\n"
+ "fmla v10.4s, v15.4s, v0.s[2]\n"
+ "str d16, [x24, %[output_col_stride1]]\n"
+ "fmls v5.4s, v21.4s, v0.s[3]\n"
+ "fmls v6.4s, v22.4s, v0.s[3]\n"
+ "fmls v9.4s, v23.4s, v0.s[3]\n"
+ "fmls v10.4s, v24.4s, v0.s[3]\n"
+ "str d5, [x24, x11]\n"
+ "str d6, [x24, x13]\n"
+ "str d9, [x24, x23]\n"
+ "str d10, [x24, x15]\n"
+ "add x24, x24, #8\n"
+ "3:\n"
+ "cbz %w[n_channels], 4f\n"
+ "ldr s8, [%[inptr0], x20]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr s2, [%[inptr0], x10]\n"
+ "mov v10.16b, v8.16b\n"
+ "ldr s9, [%[inptr0]]\n"
+ "fmla v14.4s, v9.4s, v0.s[2]\n"
+ "ldr s1, [%[inptr0], x21]\n"
+ "mov v9.16b, v8.16b\n"
+ "ldr s4, [%[inptr0], x8]\n"
+ "mov v7.16b, v8.16b\n"
+ "ldr s12, [%[inptr0], %[input_col_stride1]]\n"
+ "fmls v14.4s, v2.4s, v0.s[3]\n"
+ "ldr s5, [x16, x20]\n"
+ "fmls v10.4s, v12.4s, v0.s[2]\n"
+ "ldr s20, [x16, x10]\n"
+ "fmla v9.4s, v12.4s, v0.s[2]\n"
+ "ldr s3, [x16]\n"
+ "fmls v7.4s, v12.4s, v0.s[1]\n"
+ "ldr s6, [x16, x21]\n"
+ "fmls v10.4s, v2.4s, v0.s[2]\n"
+ "ldr s16, [x16, x8]\n"
+ "fmls v9.4s, v2.4s, v0.s[2]\n"
+ "ldr s22, [x16, %[input_col_stride1]]\n"
+ "fsub v7.4s, v7.4s, v2.4s\n"
+ "ldr s17, [x17, x20]\n"
+ "fadd v10.4s, v10.4s, v4.4s\n"
+ "ldr s15, [x17, x10]\n"
+ "fsub v9.4s, v9.4s, v4.4s\n"
+ "ldr s19, [x17]\n"
+ "fmla v7.4s, v4.4s, v0.s[1]\n"
+ "ldr s18, [x17, x21]\n"
+ "mov v8.16b, v8.16b\n"
+ "ldr s13, [x17, x8]\n"
+ "mov v11.16b, v1.16b\n"
+ "ldr s21, [x17, %[input_col_stride1]]\n"
+ "fmla v8.4s, v12.4s, v0.s[1]\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v11.4s, v12.4s, v0.s[2]\n"
+ "add x16, x16, #4\n"
+ "mov v1.16b, v5.16b\n"
+ "add x17, x17, #4\n"
+ "fsub v8.4s, v8.4s, v2.4s\n"
+ "mov v2.16b, v5.16b\n"
+ "fmls v8.4s, v4.4s, v0.s[1]\n"
+ "fmls v11.4s, v4.4s, v0.s[3]\n"
+ "fmla v1.4s, v3.4s, v0.s[2]\n"
+ "fmls v2.4s, v22.4s, v0.s[2]\n"
+ "mov v3.16b, v5.16b\n"
+ "mov v4.16b, v5.16b\n"
+ "mov v5.16b, v5.16b\n"
+ "mov v6.16b, v6.16b\n"
+ "fmls v1.4s, v20.4s, v0.s[3]\n"
+ "fmls v2.4s, v20.4s, v0.s[2]\n"
+ "fmla v3.4s, v22.4s, v0.s[2]\n"
+ "fmls v4.4s, v22.4s, v0.s[1]\n"
+ "fmla v5.4s, v22.4s, v0.s[1]\n"
+ "fmla v6.4s, v22.4s, v0.s[2]\n"
+ "fadd v2.4s, v2.4s, v16.4s\n"
+ "mov v12.16b, v17.16b\n"
+ "fmls v3.4s, v20.4s, v0.s[2]\n"
+ "fsub v4.4s, v4.4s, v20.4s\n"
+ "fmla v4.4s, v16.4s, v0.s[1]\n"
+ "fsub v5.4s, v5.4s, v20.4s\n"
+ "fmls v5.4s, v16.4s, v0.s[1]\n"
+ "fmls v6.4s, v16.4s, v0.s[3]\n"
+ "fsub v3.4s, v3.4s, v16.4s\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "mov v19.16b, v17.16b\n"
+ "mov v20.16b, v17.16b\n"
+ "mov v16.16b, v17.16b\n"
+ "mov v17.16b, v17.16b\n"
+ "fmls v12.4s, v15.4s, v0.s[3]\n"
+ "fmls v19.4s, v21.4s, v0.s[2]\n"
+ "fmla v20.4s, v21.4s, v0.s[2]\n"
+ "fmls v16.4s, v21.4s, v0.s[1]\n"
+ "fmla v17.4s, v21.4s, v0.s[1]\n"
+ "mov v18.16b, v18.16b\n"
+ "fmls v19.4s, v15.4s, v0.s[2]\n"
+ "mov v23.16b, v12.16b\n"
+ "fmls v20.4s, v15.4s, v0.s[2]\n"
+ "fsub v16.4s, v16.4s, v15.4s\n"
+ "fmla v16.4s, v13.4s, v0.s[1]\n"
+ "fsub v17.4s, v17.4s, v15.4s\n"
+ "fadd v19.4s, v19.4s, v13.4s\n"
+ "fmls v17.4s, v13.4s, v0.s[1]\n"
+ "fsub v20.4s, v20.4s, v13.4s\n"
+ "fmla v18.4s, v21.4s, v0.s[2]\n"
+ "fmla v23.4s, v14.4s, v0.s[2]\n"
+ "mov v15.16b, v19.16b\n"
+ "mov v14.16b, v20.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "fmls v18.4s, v13.4s, v0.s[3]\n"
+ "fmla v15.4s, v10.4s, v0.s[2]\n"
+ "fmls v23.4s, v1.4s, v0.s[3]\n"
+ "fmla v14.4s, v9.4s, v0.s[2]\n"
+ "fmla v24.4s, v7.4s, v0.s[2]\n"
+ "mov v10.16b, v17.16b\n"
+ "fmls v15.4s, v2.4s, v0.s[3]\n"
+ "mov v7.16b, v18.16b\n"
+ "str s23, [%[outptr0]]\n"
+ "fmls v14.4s, v3.4s, v0.s[3]\n"
+ "fmls v24.4s, v4.4s, v0.s[3]\n"
+ "fmla v10.4s, v8.4s, v0.s[2]\n"
+ "str s15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v7.4s, v11.4s, v0.s[2]\n"
+ "str s14, [%[outptr0], x11]\n"
+ "fmls v10.4s, v5.4s, v0.s[3]\n"
+ "str s24, [%[outptr0], x13]\n"
+ "fmls v7.4s, v6.4s, v0.s[3]\n"
+ "str s10, [%[outptr0], x23]\n"
+ "str s7, [%[outptr0], x15]\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "mov v26.16b, v12.16b\n"
+ "mov v25.16b, v19.16b\n"
+ "ldr s11, [x25, x20]\n"
+ "mov v10.16b, v11.16b\n"
+ "ldr s23, [x25, x10]\n"
+ "mov v9.16b, v11.16b\n"
+ "ldr s7, [x25]\n"
+ "fmla v10.4s, v7.4s, v0.s[2]\n"
+ "ldr s13, [x25, x21]\n"
+ "mov v7.16b, v11.16b\n"
+ "ldr s31, [x25, x8]\n"
+ "mov v8.16b, v11.16b\n"
+ "ldr s21, [x25, %[input_col_stride1]]\n"
+ "fmls v10.4s, v23.4s, v0.s[3]\n"
+ "ldr s30, [x26, x20]\n"
+ "fmls v9.4s, v21.4s, v0.s[2]\n"
+ "ldr s29, [x26, x10]\n"
+ "fmla v7.4s, v21.4s, v0.s[2]\n"
+ "ldr s22, [x26]\n"
+ "fmls v8.4s, v21.4s, v0.s[1]\n"
+ "ldr s24, [x26, x21]\n"
+ "fmls v9.4s, v23.4s, v0.s[2]\n"
+ "ldr s27, [x26, x8]\n"
+ "fmls v7.4s, v23.4s, v0.s[2]\n"
+ "ldr s28, [x26, %[input_col_stride1]]\n"
+ "fsub v8.4s, v8.4s, v23.4s\n"
+ "add x25, x25, #4\n"
+ "fadd v9.4s, v9.4s, v31.4s\n"
+ "add x26, x26, #4\n"
+ "fsub v7.4s, v7.4s, v31.4s\n"
+ "fmla v8.4s, v31.4s, v0.s[1]\n"
+ "mov v11.16b, v11.16b\n"
+ "mov v15.16b, v13.16b\n"
+ "mov v14.16b, v30.16b\n"
+ "mov v13.16b, v30.16b\n"
+ "fmla v11.4s, v21.4s, v0.s[1]\n"
+ "fmla v15.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v22.4s, v0.s[2]\n"
+ "fmls v13.4s, v28.4s, v0.s[2]\n"
+ "mov v21.16b, v30.16b\n"
+ "mov v22.16b, v30.16b\n"
+ "fsub v11.4s, v11.4s, v23.4s\n"
+ "fmls v15.4s, v31.4s, v0.s[3]\n"
+ "fmls v11.4s, v31.4s, v0.s[1]\n"
+ "fmls v14.4s, v29.4s, v0.s[3]\n"
+ "fmls v13.4s, v29.4s, v0.s[2]\n"
+ "fmla v21.4s, v28.4s, v0.s[2]\n"
+ "fmls v22.4s, v28.4s, v0.s[1]\n"
+ "mov v23.16b, v30.16b\n"
+ "mov v24.16b, v24.16b\n"
+ "fmls v26.4s, v10.4s, v0.s[2]\n"
+ "fadd v13.4s, v13.4s, v27.4s\n"
+ "fmls v21.4s, v29.4s, v0.s[2]\n"
+ "fsub v22.4s, v22.4s, v29.4s\n"
+ "fmla v23.4s, v28.4s, v0.s[1]\n"
+ "fmla v22.4s, v27.4s, v0.s[1]\n"
+ "fmla v24.4s, v28.4s, v0.s[2]\n"
+ "fsub v21.4s, v21.4s, v27.4s\n"
+ "fmls v26.4s, v1.4s, v0.s[2]\n"
+ "fsub v23.4s, v23.4s, v29.4s\n"
+ "fmls v25.4s, v9.4s, v0.s[2]\n"
+ "fmls v23.4s, v27.4s, v0.s[1]\n"
+ "fmls v24.4s, v27.4s, v0.s[3]\n"
+ "fadd v26.4s, v26.4s, v14.4s\n"
+ "mov v27.16b, v20.16b\n"
+ "str s26, [x28]\n"
+ "fmls v25.4s, v2.4s, v0.s[2]\n"
+ "fmls v27.4s, v7.4s, v0.s[2]\n"
+ "mov v31.16b, v16.16b\n"
+ "mov v30.16b, v17.16b\n"
+ "mov v29.16b, v18.16b\n"
+ "fadd v25.4s, v25.4s, v13.4s\n"
+ "fmls v31.4s, v8.4s, v0.s[2]\n"
+ "str s25, [x28, %[output_col_stride1]]\n"
+ "fmls v27.4s, v3.4s, v0.s[2]\n"
+ "fmls v30.4s, v11.4s, v0.s[2]\n"
+ "fmls v29.4s, v15.4s, v0.s[2]\n"
+ "fmls v31.4s, v4.4s, v0.s[2]\n"
+ "mov v26.16b, v12.16b\n"
+ "fadd v27.4s, v27.4s, v21.4s\n"
+ "mov v25.16b, v19.16b\n"
+ "str s27, [x28, x11]\n"
+ "fmls v30.4s, v5.4s, v0.s[2]\n"
+ "fadd v31.4s, v31.4s, v22.4s\n"
+ "fmls v29.4s, v6.4s, v0.s[2]\n"
+ "str s31, [x28, x13]\n"
+ "fmla v26.4s, v10.4s, v0.s[2]\n"
+ "fadd v30.4s, v30.4s, v23.4s\n"
+ "fmla v25.4s, v9.4s, v0.s[2]\n"
+ "str s30, [x28, x23]\n"
+ "fadd v29.4s, v29.4s, v24.4s\n"
+ "str s29, [x28, x15]\n"
+ "fmls v26.4s, v1.4s, v0.s[2]\n"
+ "fmls v25.4s, v2.4s, v0.s[2]\n"
+ "add x28, x28, #4\n"
+ "mov v30.16b, v20.16b\n"
+ "mov v29.16b, v16.16b\n"
+ "fsub v26.4s, v26.4s, v14.4s\n"
+ "mov v28.16b, v17.16b\n"
+ "str s26, [x22]\n"
+ "fsub v25.4s, v25.4s, v13.4s\n"
+ "str s25, [x22, %[output_col_stride1]]\n"
+ "fmla v30.4s, v7.4s, v0.s[2]\n"
+ "fmla v29.4s, v8.4s, v0.s[2]\n"
+ "fmla v28.4s, v11.4s, v0.s[2]\n"
+ "mov v26.16b, v18.16b\n"
+ "mov v25.16b, v12.16b\n"
+ "fmls v30.4s, v3.4s, v0.s[2]\n"
+ "mov v31.16b, v19.16b\n"
+ "fmls v29.4s, v4.4s, v0.s[2]\n"
+ "fmls v28.4s, v5.4s, v0.s[2]\n"
+ "fmla v26.4s, v15.4s, v0.s[2]\n"
+ "fmls v25.4s, v10.4s, v0.s[1]\n"
+ "fsub v30.4s, v30.4s, v21.4s\n"
+ "fmls v31.4s, v9.4s, v0.s[1]\n"
+ "str s30, [x22, x11]\n"
+ "fsub v29.4s, v29.4s, v22.4s\n"
+ "str s29, [x22, x13]\n"
+ "fsub v28.4s, v28.4s, v23.4s\n"
+ "str s28, [x22, x23]\n"
+ "fmls v26.4s, v6.4s, v0.s[2]\n"
+ "fsub v25.4s, v25.4s, v1.4s\n"
+ "fsub v31.4s, v31.4s, v2.4s\n"
+ "fmla v25.4s, v14.4s, v0.s[1]\n"
+ "fmla v31.4s, v13.4s, v0.s[1]\n"
+ "fsub v26.4s, v26.4s, v24.4s\n"
+ "mov v27.16b, v20.16b\n"
+ "str s26, [x22, x15]\n"
+ "mov v26.16b, v16.16b\n"
+ "str s25, [x12]\n"
+ "fmls v27.4s, v7.4s, v0.s[1]\n"
+ "str s31, [x12, %[output_col_stride1]]\n"
+ "fmls v26.4s, v8.4s, v0.s[1]\n"
+ "mov v25.16b, v17.16b\n"
+ "add x22, x22, #4\n"
+ "fsub v27.4s, v27.4s, v3.4s\n"
+ "mov v28.16b, v18.16b\n"
+ "fmla v27.4s, v21.4s, v0.s[1]\n"
+ "fsub v26.4s, v26.4s, v4.4s\n"
+ "fmla v26.4s, v22.4s, v0.s[1]\n"
+ "fmls v25.4s, v11.4s, v0.s[1]\n"
+ "fmls v28.4s, v15.4s, v0.s[1]\n"
+ "mov v12.16b, v12.16b\n"
+ "str s27, [x12, x11]\n"
+ "mov v19.16b, v19.16b\n"
+ "str s26, [x12, x13]\n"
+ "fsub v25.4s, v25.4s, v5.4s\n"
+ "fmla v25.4s, v23.4s, v0.s[1]\n"
+ "fsub v28.4s, v28.4s, v6.4s\n"
+ "fmla v28.4s, v24.4s, v0.s[1]\n"
+ "fmla v12.4s, v10.4s, v0.s[1]\n"
+ "fmla v19.4s, v9.4s, v0.s[1]\n"
+ "mov v20.16b, v20.16b\n"
+ "str s25, [x12, x23]\n"
+ "mov v16.16b, v16.16b\n"
+ "str s28, [x12, x15]\n"
+ "fsub v12.4s, v12.4s, v1.4s\n"
+ "fmls v12.4s, v14.4s, v0.s[1]\n"
+ "add x12, x12, #4\n"
+ "fsub v19.4s, v19.4s, v2.4s\n"
+ "fmla v20.4s, v7.4s, v0.s[1]\n"
+ "fmls v19.4s, v13.4s, v0.s[1]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "str s12, [x14]\n"
+ "mov v1.16b, v17.16b\n"
+ "fsub v20.4s, v20.4s, v3.4s\n"
+ "mov v17.16b, v18.16b\n"
+ "str s19, [x14, %[output_col_stride1]]\n"
+ "fmls v20.4s, v21.4s, v0.s[1]\n"
+ "fsub v16.4s, v16.4s, v4.4s\n"
+ "fmla v1.4s, v11.4s, v0.s[1]\n"
+ "fmls v16.4s, v22.4s, v0.s[1]\n"
+ "fmla v17.4s, v15.4s, v0.s[1]\n"
+ "str s20, [x14, x11]\n"
+ "fsub v1.4s, v1.4s, v5.4s\n"
+ "str s16, [x14, x13]\n"
+ "fmls v1.4s, v23.4s, v0.s[1]\n"
+ "fsub v17.4s, v17.4s, v6.4s\n"
+ "fmls v17.4s, v24.4s, v0.s[1]\n"
+ "str s1, [x14, x23]\n"
+ "str s17, [x14, x15]\n"
+ "add x14, x14, #4\n"
+ "ldr s2, [x27, x20]\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr s17, [x27, x10]\n"
+ "mov v12.16b, v2.16b\n"
+ "ldr s18, [x27]\n"
+ "fmla v4.4s, v18.4s, v0.s[2]\n"
+ "ldr s3, [x27, x21]\n"
+ "mov v6.16b, v2.16b\n"
+ "ldr s5, [x27, x8]\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr s18, [x27, %[input_col_stride1]]\n"
+ "fmls v4.4s, v17.4s, v0.s[3]\n"
+ "add x27, x27, #4\n"
+ "fmls v12.4s, v18.4s, v0.s[2]\n"
+ "fmla v6.4s, v18.4s, v0.s[2]\n"
+ "fmls v1.4s, v18.4s, v0.s[1]\n"
+ "mov v2.16b, v2.16b\n"
+ "mov v3.16b, v3.16b\n"
+ "mov v4.16b, v4.16b\n"
+ "fmls v12.4s, v17.4s, v0.s[2]\n"
+ "fmls v6.4s, v17.4s, v0.s[2]\n"
+ "fsub v1.4s, v1.4s, v17.4s\n"
+ "fmla v2.4s, v18.4s, v0.s[1]\n"
+ "fmla v1.4s, v5.4s, v0.s[1]\n"
+ "fmla v3.4s, v18.4s, v0.s[2]\n"
+ "fadd v12.4s, v12.4s, v5.4s\n"
+ "fsub v6.4s, v6.4s, v5.4s\n"
+ "fsub v2.4s, v2.4s, v17.4s\n"
+ "fmla v4.4s, v10.4s, v0.s[2]\n"
+ "fmls v2.4s, v5.4s, v0.s[1]\n"
+ "fmls v3.4s, v5.4s, v0.s[3]\n"
+ "mov v16.16b, v12.16b\n"
+ "mov v5.16b, v6.16b\n"
+ "fmls v4.4s, v14.4s, v0.s[3]\n"
+ "mov v6.16b, v1.16b\n"
+ "fmla v16.4s, v9.4s, v0.s[2]\n"
+ "fmla v5.4s, v7.4s, v0.s[2]\n"
+ "fmla v6.4s, v8.4s, v0.s[2]\n"
+ "mov v9.16b, v2.16b\n"
+ "str s4, [x24]\n"
+ "mov v10.16b, v3.16b\n"
+ "fmls v16.4s, v13.4s, v0.s[3]\n"
+ "fmls v5.4s, v21.4s, v0.s[3]\n"
+ "fmls v6.4s, v22.4s, v0.s[3]\n"
+ "fmla v9.4s, v11.4s, v0.s[2]\n"
+ "fmla v10.4s, v15.4s, v0.s[2]\n"
+ "str s16, [x24, %[output_col_stride1]]\n"
+ "str s5, [x24, x11]\n"
+ "fmls v9.4s, v23.4s, v0.s[3]\n"
+ "str s6, [x24, x13]\n"
+ "fmls v10.4s, v24.4s, v0.s[3]\n"
+ "str s9, [x24, x23]\n"
+ "str s10, [x24, x15]\n"
+ "add x24, x24, #4\n"
+ "4:\n"
+ : [outptr0] "+r" (matrix_base),
+ [n_channels] "+r" (n_channels),
+ [inptr0] "+r" (input_base)
+ : [pcoeffs] "r" (pcoeffs),
+ [output_row_stride] "r" (6 * matrix_stride * sizeof(float)),
+ [output_col_stride1] "r" (matrix_stride * sizeof(float)),
+ [input_row_stride] "r" (input_row_stride * sizeof(float)),
+ [input_col_stride1] "r" (input_col_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
+ "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+ "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8",
+ "v9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x10", "x8",
+ "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
+
+} // namespace input_transform
+} // namespace winograd
+} // namespace arm_conv
+
+#endif // __aarch64__
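
The assembly above closes with the standard three-tier channel loop: label 1 transforms four channels per iteration on full 128-bit vectors (ldr q / str q), label 2 handles a two-channel tail on 64-bit halves (ldr d / str d), and label 3 handles the final channel with scalar loads (ldr s / str s), all three applying the same 6x6 transform. A minimal C++ sketch of that peeling structure, with illustrative names only (the vector bodies are elided):

    // Channel-tiling skeleton mirroring labels 1:, 2: and 3: in the
    // assembly above; each body would apply the same 6x6 transform.
    void transform_all_channels(unsigned int n_channels)
    {
        for (; n_channels >= 4; n_channels -= 4)
        {
            // 128-bit vector body: four channels per iteration
        }
        if (n_channels >= 2)
        {
            // 64-bit vector body: two channels
            n_channels -= 2;
        }
        if (n_channels != 0)
        {
            // scalar body: the last channel
        }
    }
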
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_1x8.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_1x8.cpp
new file mode 100644
index 0000000000..3e1fc491f1
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_1x8.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <arm_neon.h>
+
+namespace arm_conv {
+namespace winograd {
+namespace input_transform {
+
+void arm_fp32_1x8(
+ const unsigned int n_channels,
+ const float * input_base,
+ size_t, // We don't need to stride over rows
+ size_t input_col_stride,
+ float *outptr,
+ size_t matrix_stride
+)
+{
+ constexpr int inner_tile_cols = 8;
+
+ // Get pointers into the input tile
+ const float *x_ptrs[inner_tile_cols];
+ for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
+ {
+ x_ptrs[j] = input_base + xj*input_col_stride;
+ }
+
+ // Vectors used/computed in this kernel.
+ float x[inner_tile_cols];
+ float U[inner_tile_cols];
+
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = 0.0f;
+ }
+
+ // Perform the Winograd input transformation for each channel in the input
+ // tensor.
+ int channels_remaining = n_channels;
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ float32x4_t x[inner_tile_cols], U[inner_tile_cols];
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = vdupq_n_f32(0.0f);
+ }
+
+ // Load x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = vld1q_f32(x_ptrs[j]);
+ x_ptrs[j] += 4;
+ }
+
+ // Compute U = x . X
+ U[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
+ U[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
+ U[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
+ U[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
+ U[4] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
+ U[5] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
+ U[6] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
+ U[7] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
+
+ // Store the transformed vector
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ vst1q_f32(outptr + j*matrix_stride, U[j]);
+ }
+ outptr += 4;
+ }
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ float32x2_t x[inner_tile_cols], U[inner_tile_cols];
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = vdup_n_f32(0.0f);
+ }
+
+ // Load x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = vld1_f32(x_ptrs[j]);
+ x_ptrs[j] += 2;
+ }
+
+ // Compute U = x . X
+ U[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
+ U[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
+ U[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
+ U[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
+ U[4] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
+ U[5] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
+ U[6] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
+ U[7] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
+
+ // Store the transformed vector
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ vst1_f32(outptr + j*matrix_stride, U[j]);
+ }
+ outptr += 2;
+ }
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = *(x_ptrs[j]++);
+ }
+
+ // Compute U = x . X
+ U[0] = x[0]*-36 + x[4]*-14 + x[2]*49 + x[6]*1;
+ U[1] = x[5]*-1 + x[1]*-36 + x[4]*-13 + x[3]*13 + x[2]*36 + x[6]*1;
+ U[2] = x[3]*-13 + x[4]*-13 + x[1]*36 + x[2]*36 + x[5]*1 + x[6]*1;
+ U[3] = x[1]*-18 + x[4]*-10 + x[5]*-2 + x[2]*9 + x[3]*20 + x[6]*1;
+ U[4] = x[3]*-20 + x[4]*-10 + x[5]*2 + x[2]*9 + x[1]*18 + x[6]*1;
+ U[5] = x[1]*-12 + x[4]*-5 + x[5]*-3 + x[2]*4 + x[3]*15 + x[6]*1;
+ U[6] = x[3]*-15 + x[4]*-5 + x[5]*3 + x[2]*4 + x[1]*12 + x[6]*1;
+ U[7] = x[1]*-36 + x[5]*-14 + x[3]*49 + x[7]*1;
+
+ // Store the transformed vector
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ *(outptr + j*matrix_stride) = U[j];
+ }
+ outptr++;
+ }
+}
+
+} // namespace input_transform
+} // namespace winograd
+} // namespace arm_conv
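
The scalar tail of arm_fp32_1x8 writes the transform out coefficient by coefficient; collecting those coefficients shows that each U[j] is one row of U = B^T x for an 8-point tile. A reference routine built from that matrix (a hypothetical helper, not part of the patch, useful only for checking the vector paths against the scalar tail):

    // B^T read directly off the scalar expressions in arm_fp32_1x8:
    // entry (i, j) is the coefficient of x[j] in U[i].
    static constexpr float BT_1x8[8][8] = {
        {-36,   0, 49,   0, -14,   0, 1, 0},
        {  0, -36, 36,  13, -13,  -1, 1, 0},
        {  0,  36, 36, -13, -13,   1, 1, 0},
        {  0, -18,  9,  20, -10,  -2, 1, 0},
        {  0,  18,  9, -20, -10,   2, 1, 0},
        {  0, -12,  4,  15,  -5,  -3, 1, 0},
        {  0,  12,  4, -15,  -5,   3, 1, 0},
        {  0, -36,  0,  49,   0, -14, 0, 1},
    };

    // U = B^T . x for a single channel.
    static void reference_transform_1x8(const float x[8], float U[8])
    {
        for (int i = 0; i < 8; i++)
        {
            U[i] = 0.0f;
            for (int j = 0; j < 8; j++)
            {
                U[i] += BT_1x8[i][j] * x[j];
            }
        }
    }
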
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_4x4.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_4x4.cpp
new file mode 100644
index 0000000000..a4e6b433c7
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_4x4.cpp
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <arm_neon.h>
+
+namespace arm_conv {
+namespace winograd {
+namespace input_transform {
+
+void arm_fp32_4x4(
+ const unsigned int n_channels,
+ const float *input_base,
+ const size_t input_row_stride,
+ const size_t input_col_stride,
+ float *outptr,
+ const size_t matrix_stride
+)
+{
+ constexpr int inner_tile_rows = 4, inner_tile_cols = 4;
+
+ // Get pointers into the input tile
+ const float *x_ptrs[inner_tile_rows][inner_tile_cols];
+ for (int i = 0, xi = 0; i < inner_tile_rows; i++, xi++)
+ {
+ // Get a pointer into the row
+ const float* const row_ptr = input_base + xi*input_row_stride;
+
+ for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
+ {
+ x_ptrs[i][j] = row_ptr + xj*input_col_stride;
+ }
+ }
+
+ // Matrices used/computed in this kernel.
+ float x[inner_tile_rows][inner_tile_cols];
+ float XTx[inner_tile_rows][inner_tile_cols];
+ float U[inner_tile_rows][inner_tile_cols];
+
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = XTx[i][j] = 0.0f;
+ }
+ }
+
+ // Perform the Winograd input transformation for each channel in the input
+ // tensor.
+ int channels_remaining = n_channels;
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used/computed in this kernel.
+ float32x4_t x[inner_tile_rows][inner_tile_cols];
+ float32x4_t XTx[inner_tile_rows][inner_tile_cols];
+ float32x4_t U[inner_tile_rows][inner_tile_cols];
+
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vdupq_n_f32(0.0f);
+ XTx[i][j] = vdupq_n_f32(0.0f);
+ }
+ }
+
+ // Load x
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vld1q_f32(x_ptrs[i][j]);
+ x_ptrs[i][j] += 4;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ // XTx[0][j] = x[0][j] - x[2][j];
+ XTx[0][j] = vsubq_f32(x[0][j], x[2][j]);
+
+ // XTx[1][j] = x[1][j] + x[2][j];
+ XTx[1][j] = vaddq_f32(x[1][j], x[2][j]);
+
+ // XTx[2][j] = x[2][j] - x[1][j];
+ XTx[2][j] = vsubq_f32(x[2][j], x[1][j]);
+
+ // XTx[3][j] = x[1][j] - x[3][j];
+ XTx[3][j] = vsubq_f32(x[1][j], x[3][j]);
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ // U[i][0] = XTx[i][0] - XTx[i][2];
+ U[i][0] = vsubq_f32(XTx[i][0], XTx[i][2]);
+
+ // U[i][1] = XTx[i][1] + XTx[i][2];
+ U[i][1] = vaddq_f32(XTx[i][1], XTx[i][2]);
+
+ // U[i][2] = XTx[i][2] - XTx[i][1];
+ U[i][2] = vsubq_f32(XTx[i][2], XTx[i][1]);
+
+ // U[i][3] = XTx[i][1] - XTx[i][3];
+ U[i][3] = vsubq_f32(XTx[i][1], XTx[i][3]);
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ vst1q_f32(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 4;
+ }
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used/computed in this kernel.
+ float32x2_t x[inner_tile_rows][inner_tile_cols];
+ float32x2_t XTx[inner_tile_rows][inner_tile_cols];
+ float32x2_t U[inner_tile_rows][inner_tile_cols];
+
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vdup_n_f32(0.0f);
+ XTx[i][j] = vdup_n_f32(0.0f);
+ }
+ }
+
+ // Load x
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vld1_f32(x_ptrs[i][j]);
+ x_ptrs[i][j] += 2;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ // XTx[0][j] = x[0][j] - x[2][j];
+ XTx[0][j] = vsub_f32(x[0][j], x[2][j]);
+
+ // XTx[1][j] = x[1][j] + x[2][j];
+ XTx[1][j] = vadd_f32(x[1][j], x[2][j]);
+
+ // XTx[2][j] = x[2][j] - x[1][j];
+ XTx[2][j] = vsub_f32(x[2][j], x[1][j]);
+
+ // XTx[3][j] = x[1][j] - x[3][j];
+ XTx[3][j] = vsub_f32(x[1][j], x[3][j]);
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ // U[i][0] = XTx[i][0] - XTx[i][2];
+ U[i][0] = vsub_f32(XTx[i][0], XTx[i][2]);
+
+ // U[i][1] = XTx[i][1] + XTx[i][2];
+ U[i][1] = vadd_f32(XTx[i][1], XTx[i][2]);
+
+ // U[i][2] = XTx[i][2] - XTx[i][1];
+ U[i][2] = vsub_f32(XTx[i][2], XTx[i][1]);
+
+ // U[i][3] = XTx[i][1] - XTx[i][3];
+ U[i][3] = vsub_f32(XTx[i][1], XTx[i][3]);
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 2;
+ }
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load x
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = *(x_ptrs[i][j]++);
+ }
+ }
+
+ // Compute XT . x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ XTx[0][j] = x[0][j] - x[2][j];
+ XTx[1][j] = x[1][j] + x[2][j];
+ XTx[2][j] = x[2][j] - x[1][j];
+ XTx[3][j] = x[1][j] - x[3][j];
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ U[i][0] = XTx[i][0] - XTx[i][2];
+ U[i][1] = XTx[i][1] + XTx[i][2];
+ U[i][2] = XTx[i][2] - XTx[i][1];
+ U[i][3] = XTx[i][1] - XTx[i][3];
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = U[i][j];
+ }
+ }
+ outptr++;
+ }
+}
+
+} // namespace input_transform
+} // namespace winograd
+} // namespace arm_conv
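
The 4x4 kernel never materialises its transform matrix, but the scalar tail makes it easy to read off: both passes (XTx = B^T . x down the columns, then U = XTx . B along the rows) apply the same 4x4 matrix,

              [ 1   0  -1   0 ]
        B^T = [ 0   1   1   0 ]
              [ 0  -1   1   0 ]
              [ 0   1   0  -1 ]

which matches the standard F(2x2, 3x3) Winograd input transform. Every entry is -1, 0 or 1, so the whole transform reduces to additions and subtractions, and the vector paths above need no multiply instructions at all.
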
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_6x6.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_6x6.cpp
new file mode 100644
index 0000000000..4adc45768e
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_6x6.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __aarch64__
+
+#include <arm_neon.h>
+#include <cstddef>
+
+namespace arm_conv {
+namespace winograd {
+namespace input_transform {
+
+void arm_fp32_6x6(
+ unsigned int n_channels,
+ const float* const input_base,
+ const size_t input_row_stride,
+ const size_t input_col_stride,
+ float* outptr,
+ const size_t matrix_stride
+)
+{
+ constexpr int inner_tile_rows = 6;
+ constexpr int inner_tile_cols = 6;
+
+ // Get pointers into the input tile
+ const float *x_ptrs[inner_tile_rows][inner_tile_cols];
+ for (int i = 0, xi = 0; i < inner_tile_rows; i++, xi++)
+ {
+ // Get a pointer into the row
+ const float* const row_ptr = input_base + xi*input_row_stride;
+
+ for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
+ {
+ x_ptrs[i][j] = row_ptr + xj*input_col_stride;
+ }
+ }
+
+ // Matrices used/computed in this kernel.
+ float x[inner_tile_rows][inner_tile_cols];
+ float XTx[inner_tile_rows][inner_tile_cols];
+ float U[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = XTx[i][j] = 0.0f;
+ }
+ }
+
+ // Perform the Winograd input transformation for each channel in the input
+ // tensor.
+ int channels_remaining = n_channels;
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used/computed in this kernel
+ float32x2_t x[inner_tile_rows][inner_tile_cols];
+ float32x2_t XTx[inner_tile_rows][inner_tile_cols];
+ float32x2_t U[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vdup_n_f32(0.0f);
+ XTx[i][j] = vdup_n_f32(0.0f);
+ }
+ }
+
+ // Read a 6x6 tile of the input (to be mapped into the Winograd domain)
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vld1_f32(x_ptrs[i][j]);
+ x_ptrs[i][j] += 2;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
+ XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
+
+ // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
+ XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
+
+ // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
+ XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
+
+ // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
+ XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
+
+ // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
+ XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
+
+ // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
+ XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
+ U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
+
+ // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
+ U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+ // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
+ U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+ // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
+ U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
+
+ // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
+ U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
+
+ // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
+ U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 2;
+ }
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load x
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = *(x_ptrs[i][j]++);
+ }
+ }
+
+ // Compute XT . x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
+ XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
+ XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
+ XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
+ XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
+ XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
+ U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
+ U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
+ U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
+ U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
+ U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = U[i][j];
+ }
+ }
+ outptr++;
+ }
+}
+
+} // namespace input_transform
+} // namespace winograd
+} // namespace arm_conv
+
+#endif // ! __aarch64__
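
In the same way, the comments in the scalar tail above give the 6x6 matrix directly:

              [ 4   0  -5   0   1   0 ]
              [ 0  -4  -4   1   1   0 ]
        B^T = [ 0   4  -4  -1   1   0 ]
              [ 0  -2  -1   2   1   0 ]
              [ 0   2  -1  -2   1   0 ]
              [ 0   4   0  -5   0   1 ]

i.e. the standard F(4x4, 3x3) input transform U = B^T x B, the same transform implemented by the AArch64 assembly earlier and the SME/SVE kernels below. Each row has at most two coefficients outside {-1, 0, 1}, which is why every row above is computed with at most two fused multiplies (vmla_n_f32 / vmls_n_f32) around plain adds and subtracts.
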
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms/sme_fp32_mla_6x6.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms/sme_fp32_mla_6x6.cpp
new file mode 100644
index 0000000000..f446e7ea8b
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/input_transforms/sme_fp32_mla_6x6.cpp
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+#include <cstddef>
+
+namespace arm_conv {
+namespace winograd {
+namespace input_transform {
+
+void sme_fp32_mla_6x6(
+ const unsigned int num_channels,
+ const float *input,
+ const size_t input_row_stride,
+ const size_t input_col_stride,
+ float *output,
+ const size_t output_col_stride
+)
+{
+ const float B_values[4] = { 1.0f, 2.0f, 4.0f, 5.0f };
+ long long_channels = num_channels;
+
+ // Generated by armasmgen (February 4th, 2021)
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "fmov z16.s, #4.0\n"
+ "ptrue p1.b\n"
+ "ld1rqw { z2.s }, p1/Z, [%x[B_values]]\n"
+ "add x16, %x[input_row_0], %x[input_row_stride], LSL #2\n"
+ "add x15, %x[output_row_0], %x[output_row_stride], LSL #2\n"
+ "add x14, %x[input_row_0], %x[input_row_stride], LSL #3\n"
+ "add x13, %x[output_row_0], %x[output_row_stride], LSL #3\n"
+ "add x12, x14, %x[input_row_stride], LSL #2\n"
+ "add x11, x13, %x[output_row_stride], LSL #2\n"
+ "add x10, %x[input_row_0], %x[input_row_stride], LSL #4\n"
+ "add x9, %x[output_row_0], %x[output_row_stride], LSL #4\n"
+ "add x28, x10, %x[input_row_stride], LSL #2\n"
+ "add x27, x9, %x[output_row_stride], LSL #2\n"
+ "lsl x26, %x[input_col_1_stride], #0x1\n"
+ "lsl x25, %x[output_col_1_stride], #0x1\n"
+ "add x24, x26, %x[input_col_1_stride]\n"
+ "add x23, x25, %x[output_col_1_stride]\n"
+ "lsl x22, %x[input_col_1_stride], #0x2\n"
+ "lsl x21, %x[output_col_1_stride], #0x2\n"
+ "add x20, x22, %x[input_col_1_stride]\n"
+ "add x8, x21, %x[output_col_1_stride]\n"
+ "whilelt p0.s, XZR, %x[num_channels]\n"
+ "beq 2f\n"
+ "1:" // channel_loop
+ "ld1w { z31.s }, p0/Z, [%x[input_row_0]]\n"
+ "decw %x[num_channels]\n"
+ "ld1w { z28.s }, p0/Z, [%x[input_row_0], %x[input_col_1_stride], LSL #2]\n"
+ "fmul z13.s, z28.s, z2.s[1]\n"
+ "ld1w { z27.s }, p0/Z, [%x[input_row_0], x26, LSL #2]\n"
+ "ld1w { z11.s }, p0/Z, [%x[input_row_0], x24, LSL #2]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "ld1w { z7.s }, p0/Z, [%x[input_row_0], x22, LSL #2]\n"
+ "fsub z15.s, z7.s, z27.s\n"
+ "fmad z31.s, p1/M, z16.s, z7.s\n"
+ "ld1w { z3.s }, p0/Z, [%x[input_row_0], x20, LSL #2]\n"
+ "fmla z13.s, z11.s, z2.s[1]\n"
+ "ld1w { z12.s }, p0/Z, [x14]\n"
+ "incb %x[input_row_0]\n"
+ "fmls z31.s, z27.s, z2.s[3]\n"
+ "ld1w { z14.s }, p0/Z, [x14, %x[input_col_1_stride], LSL #2]\n"
+ "fsub z25.s, z15.s, z13.s\n"
+ "fadd z8.s, z13.s, z15.s\n"
+ "ld1w { z24.s }, p0/Z, [x14, x26, LSL #2]\n"
+ "fmsb z27.s, p1/M, z16.s, z7.s\n"
+ "ld1w { z22.s }, p0/Z, [x14, x24, LSL #2]\n"
+ "fmul z7.s, z28.s, z2.s[2]\n"
+ "ld1w { z1.s }, p0/Z, [x14, x22, LSL #2]\n"
+ "fsub z15.s, z1.s, z24.s\n"
+ "fneg z7.s, p1/M, z7.s\n"
+ "ld1w { z20.s }, p0/Z, [x14, x20, LSL #2]\n"
+ "fadd z7.s, z7.s, z11.s\n"
+ "ld1w { z29.s }, p0/Z, [x10]\n"
+ "incb x14\n"
+ "fmad z28.s, p1/M, z16.s, z3.s\n"
+ "ld1w { z10.s }, p0/Z, [x10, %x[input_col_1_stride], LSL #2]\n"
+ "fmad z12.s, p1/M, z16.s, z1.s\n"
+ "ld1w { z18.s }, p0/Z, [x10, x26, LSL #2]\n"
+ "fmul z13.s, z14.s, z2.s[1]\n"
+ "ld1w { z19.s }, p0/Z, [x10, x24, LSL #2]\n"
+ "fadd z17.s, z7.s, z27.s\n"
+ "ld1w { z9.s }, p0/Z, [x10, x22, LSL #2]\n"
+ "fsub z27.s, z27.s, z7.s\n"
+ "fmls z28.s, z11.s, z2.s[3]\n"
+ "ld1w { z21.s }, p0/Z, [x10, x20, LSL #2]\n"
+ "incb x10\n"
+ "fmls z12.s, z24.s, z2.s[3]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fmla z13.s, z22.s, z2.s[1]\n"
+ "fsub z30.s, z15.s, z13.s\n"
+ "fadd z4.s, z13.s, z15.s\n"
+ "fmsb z24.s, p1/M, z16.s, z1.s\n"
+ "fsub z15.s, z9.s, z18.s\n"
+ "fmul z1.s, z14.s, z2.s[2]\n"
+ "fmad z14.s, p1/M, z16.s, z20.s\n"
+ "fmad z29.s, p1/M, z16.s, z9.s\n"
+ "fmul z13.s, z10.s, z2.s[1]\n"
+ "fneg z1.s, p1/M, z1.s\n"
+ "fadd z1.s, z1.s, z22.s\n"
+ "fmls z14.s, z22.s, z2.s[3]\n"
+ "fmls z29.s, z18.s, z2.s[3]\n"
+ "fadd z5.s, z1.s, z24.s\n"
+ "fsub z24.s, z24.s, z1.s\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fmla z13.s, z19.s, z2.s[1]\n"
+ "fsub z23.s, z15.s, z13.s\n"
+ "fadd z11.s, z13.s, z15.s\n"
+ "fmsb z18.s, p1/M, z16.s, z9.s\n"
+ "fmul z9.s, z10.s, z2.s[2]\n"
+ "fmad z10.s, p1/M, z16.s, z21.s\n"
+ "fmad z31.s, p1/M, z16.s, z29.s\n"
+ "fmad z8.s, p1/M, z16.s, z11.s\n"
+ "fneg z9.s, p1/M, z9.s\n"
+ "fadd z9.s, z9.s, z19.s\n"
+ "fmls z10.s, z19.s, z2.s[3]\n"
+ "fmls z31.s, z12.s, z2.s[3]\n"
+ "st1w { z31.s }, p0, [%x[output_row_0]]\n"
+ "fadd z26.s, z9.s, z18.s\n"
+ "fsub z18.s, z18.s, z9.s\n"
+ "fmls z8.s, z4.s, z2.s[3]\n"
+ "fmad z25.s, p1/M, z16.s, z23.s\n"
+ "fmad z28.s, p1/M, z16.s, z10.s\n"
+ "fmad z17.s, p1/M, z16.s, z26.s\n"
+ "fmad z27.s, p1/M, z16.s, z18.s\n"
+ "fmls z25.s, z30.s, z2.s[3]\n"
+ "fmls z28.s, z14.s, z2.s[3]\n"
+ "fmls z17.s, z5.s, z2.s[3]\n"
+ "st1w { z17.s }, p0, [%x[output_row_0], %x[output_col_1_stride], LSL #2]\n"
+ "fmls z27.s, z24.s, z2.s[3]\n"
+ "st1w { z27.s }, p0, [%x[output_row_0], x25, LSL #2]\n"
+ "st1w { z8.s }, p0, [%x[output_row_0], x23, LSL #2]\n"
+ "st1w { z25.s }, p0, [%x[output_row_0], x21, LSL #2]\n"
+ "st1w { z28.s }, p0, [%x[output_row_0], x8, LSL #2]\n"
+ "incb %x[output_row_0]\n"
+ "ld1w { z19.s }, p0/Z, [x16]\n"
+ "ld1w { z7.s }, p0/Z, [x16, %x[input_col_1_stride], LSL #2]\n"
+ "fmul z13.s, z7.s, z2.s[1]\n"
+ "ld1w { z6.s }, p0/Z, [x16, x26, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x16, x24, LSL #2]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "ld1w { z25.s }, p0/Z, [x16, x22, LSL #2]\n"
+ "fsub z15.s, z25.s, z6.s\n"
+ "fmad z19.s, p1/M, z16.s, z25.s\n"
+ "ld1w { z20.s }, p0/Z, [x16, x20, LSL #2]\n"
+ "fmla z13.s, z27.s, z2.s[1]\n"
+ "ld1w { z0.s }, p0/Z, [x12]\n"
+ "incb x16\n"
+ "fmls z19.s, z6.s, z2.s[3]\n"
+ "ld1w { z31.s }, p0/Z, [x12, %x[input_col_1_stride], LSL #2]\n"
+ "fsub z8.s, z15.s, z13.s\n"
+ "fadd z28.s, z13.s, z15.s\n"
+ "ld1w { z1.s }, p0/Z, [x12, x26, LSL #2]\n"
+ "fmsb z6.s, p1/M, z16.s, z25.s\n"
+ "ld1w { z21.s }, p0/Z, [x12, x24, LSL #2]\n"
+ "fmul z25.s, z7.s, z2.s[2]\n"
+ "ld1w { z22.s }, p0/Z, [x12, x22, LSL #2]\n"
+ "fsub z15.s, z22.s, z1.s\n"
+ "fneg z25.s, p1/M, z25.s\n"
+ "ld1w { z17.s }, p0/Z, [x12, x20, LSL #2]\n"
+ "fadd z25.s, z25.s, z27.s\n"
+ "incb x12\n"
+ "fmad z7.s, p1/M, z16.s, z20.s\n"
+ "fmad z0.s, p1/M, z16.s, z22.s\n"
+ "fmul z13.s, z31.s, z2.s[1]\n"
+ "fadd z3.s, z25.s, z6.s\n"
+ "fsub z6.s, z6.s, z25.s\n"
+ "fmls z7.s, z27.s, z2.s[3]\n"
+ "fmls z0.s, z1.s, z2.s[3]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fmla z13.s, z21.s, z2.s[1]\n"
+ "fsub z9.s, z15.s, z13.s\n"
+ "fadd z27.s, z13.s, z15.s\n"
+ "fmsb z1.s, p1/M, z16.s, z22.s\n"
+ "fsub z15.s, z29.s, z12.s\n"
+ "fmul z22.s, z31.s, z2.s[2]\n"
+ "fmad z31.s, p1/M, z16.s, z17.s\n"
+ "fmul z13.s, z19.s, z2.s[1]\n"
+ "fmsb z12.s, p1/M, z16.s, z29.s\n"
+ "fneg z22.s, p1/M, z22.s\n"
+ "fadd z22.s, z22.s, z21.s\n"
+ "fmls z31.s, z21.s, z2.s[3]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fadd z25.s, z22.s, z1.s\n"
+ "fsub z1.s, z1.s, z22.s\n"
+ "fmla z13.s, z0.s, z2.s[1]\n"
+ "fmul z29.s, z19.s, z2.s[2]\n"
+ "fadd z22.s, z13.s, z15.s\n"
+ "st1w { z22.s }, p0, [x11]\n"
+ "fneg z29.s, p1/M, z29.s\n"
+ "fsub z22.s, z15.s, z13.s\n"
+ "fadd z29.s, z29.s, z0.s\n"
+ "st1w { z22.s }, p0, [x9]\n"
+ "fadd z22.s, z29.s, z12.s\n"
+ "fsub z15.s, z26.s, z5.s\n"
+ "fmul z13.s, z3.s, z2.s[1]\n"
+ "fsub z12.s, z12.s, z29.s\n"
+ "fmsb z5.s, p1/M, z16.s, z26.s\n"
+ "fmul z26.s, z3.s, z2.s[2]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fmla z13.s, z25.s, z2.s[1]\n"
+ "fneg z26.s, p1/M, z26.s\n"
+ "fadd z26.s, z26.s, z25.s\n"
+ "fadd z21.s, z13.s, z15.s\n"
+ "st1w { z21.s }, p0, [x11, %x[output_col_1_stride], LSL #2]\n"
+ "fsub z21.s, z15.s, z13.s\n"
+ "fmul z13.s, z6.s, z2.s[1]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "st1w { z21.s }, p0, [x9, %x[output_col_1_stride], LSL #2]\n"
+ "fadd z21.s, z26.s, z5.s\n"
+ "fsub z15.s, z18.s, z24.s\n"
+ "fmla z13.s, z1.s, z2.s[1]\n"
+ "fsub z5.s, z5.s, z26.s\n"
+ "fmsb z24.s, p1/M, z16.s, z18.s\n"
+ "fmul z18.s, z6.s, z2.s[2]\n"
+ "fadd z20.s, z13.s, z15.s\n"
+ "st1w { z20.s }, p0, [x11, x25, LSL #2]\n"
+ "fneg z18.s, p1/M, z18.s\n"
+ "fsub z20.s, z15.s, z13.s\n"
+ "fadd z18.s, z18.s, z1.s\n"
+ "st1w { z20.s }, p0, [x9, x25, LSL #2]\n"
+ "fadd z20.s, z18.s, z24.s\n"
+ "fsub z15.s, z11.s, z4.s\n"
+ "fmul z13.s, z28.s, z2.s[1]\n"
+ "fsub z24.s, z24.s, z18.s\n"
+ "fmsb z4.s, p1/M, z16.s, z11.s\n"
+ "fmul z11.s, z28.s, z2.s[2]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fmla z13.s, z27.s, z2.s[1]\n"
+ "fneg z11.s, p1/M, z11.s\n"
+ "fadd z11.s, z11.s, z27.s\n"
+ "fadd z26.s, z13.s, z15.s\n"
+ "st1w { z26.s }, p0, [x11, x23, LSL #2]\n"
+ "fsub z26.s, z15.s, z13.s\n"
+ "fmul z13.s, z8.s, z2.s[1]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "st1w { z26.s }, p0, [x9, x23, LSL #2]\n"
+ "fadd z26.s, z11.s, z4.s\n"
+ "fsub z15.s, z23.s, z30.s\n"
+ "fmla z13.s, z9.s, z2.s[1]\n"
+ "fsub z4.s, z4.s, z11.s\n"
+ "fmsb z30.s, p1/M, z16.s, z23.s\n"
+ "fmul z23.s, z8.s, z2.s[2]\n"
+ "fadd z18.s, z13.s, z15.s\n"
+ "st1w { z18.s }, p0, [x11, x21, LSL #2]\n"
+ "fneg z23.s, p1/M, z23.s\n"
+ "fsub z18.s, z15.s, z13.s\n"
+ "fadd z23.s, z23.s, z9.s\n"
+ "st1w { z18.s }, p0, [x9, x21, LSL #2]\n"
+ "fadd z18.s, z23.s, z30.s\n"
+ "fsub z15.s, z10.s, z14.s\n"
+ "fmul z13.s, z7.s, z2.s[1]\n"
+ "fsub z30.s, z30.s, z23.s\n"
+ "fmsb z14.s, p1/M, z16.s, z10.s\n"
+ "fmul z10.s, z7.s, z2.s[2]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fmla z13.s, z31.s, z2.s[1]\n"
+ "fneg z10.s, p1/M, z10.s\n"
+ "fadd z10.s, z10.s, z31.s\n"
+ "fadd z17.s, z13.s, z15.s\n"
+ "st1w { z17.s }, p0, [x11, x8, LSL #2]\n"
+ "fsub z17.s, z15.s, z13.s\n"
+ "incb x11\n"
+ "st1w { z17.s }, p0, [x9, x8, LSL #2]\n"
+ "fadd z17.s, z10.s, z14.s\n"
+ "fsub z14.s, z14.s, z10.s\n"
+ "st1w { z22.s }, p0, [x15]\n"
+ "incb x9\n"
+ "st1w { z12.s }, p0, [x13]\n"
+ "st1w { z21.s }, p0, [x15, %x[output_col_1_stride], LSL #2]\n"
+ "st1w { z5.s }, p0, [x13, %x[output_col_1_stride], LSL #2]\n"
+ "st1w { z20.s }, p0, [x15, x25, LSL #2]\n"
+ "st1w { z24.s }, p0, [x13, x25, LSL #2]\n"
+ "st1w { z26.s }, p0, [x15, x23, LSL #2]\n"
+ "st1w { z4.s }, p0, [x13, x23, LSL #2]\n"
+ "st1w { z18.s }, p0, [x15, x21, LSL #2]\n"
+ "st1w { z30.s }, p0, [x13, x21, LSL #2]\n"
+ "st1w { z17.s }, p0, [x15, x8, LSL #2]\n"
+ "incb x15\n"
+ "st1w { z14.s }, p0, [x13, x8, LSL #2]\n"
+ "incb x13\n"
+ "ld1w { z23.s }, p0/Z, [x28]\n"
+ "ld1w { z22.s }, p0/Z, [x28, %x[input_col_1_stride], LSL #2]\n"
+ "fmul z13.s, z22.s, z2.s[1]\n"
+ "ld1w { z21.s }, p0/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x28, x24, LSL #2]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "ld1w { z26.s }, p0/Z, [x28, x22, LSL #2]\n"
+ "fsub z15.s, z26.s, z21.s\n"
+ "fmad z23.s, p1/M, z16.s, z26.s\n"
+ "ld1w { z18.s }, p0/Z, [x28, x20, LSL #2]\n"
+ "fmla z13.s, z20.s, z2.s[1]\n"
+ "incb x28\n"
+ "fmls z23.s, z21.s, z2.s[3]\n"
+ "fsub z17.s, z15.s, z13.s\n"
+ "fadd z30.s, z13.s, z15.s\n"
+ "fmsb z21.s, p1/M, z16.s, z26.s\n"
+ "fmul z26.s, z22.s, z2.s[2]\n"
+ "fmad z22.s, p1/M, z16.s, z18.s\n"
+ "fmad z19.s, p1/M, z16.s, z23.s\n"
+ "fmad z28.s, p1/M, z16.s, z30.s\n"
+ "fneg z26.s, p1/M, z26.s\n"
+ "fadd z26.s, z26.s, z20.s\n"
+ "fmls z22.s, z20.s, z2.s[3]\n"
+ "fmls z19.s, z0.s, z2.s[3]\n"
+ "st1w { z19.s }, p0, [x27]\n"
+ "fadd z23.s, z26.s, z21.s\n"
+ "fsub z21.s, z21.s, z26.s\n"
+ "fmls z28.s, z27.s, z2.s[3]\n"
+ "fmad z8.s, p1/M, z16.s, z17.s\n"
+ "fmad z7.s, p1/M, z16.s, z22.s\n"
+ "fmad z3.s, p1/M, z16.s, z23.s\n"
+ "fmad z6.s, p1/M, z16.s, z21.s\n"
+ "fmls z8.s, z9.s, z2.s[3]\n"
+ "fmls z7.s, z31.s, z2.s[3]\n"
+ "fmls z3.s, z25.s, z2.s[3]\n"
+ "st1w { z3.s }, p0, [x27, %x[output_col_1_stride], LSL #2]\n"
+ "fmls z6.s, z1.s, z2.s[3]\n"
+ "st1w { z6.s }, p0, [x27, x25, LSL #2]\n"
+ "st1w { z28.s }, p0, [x27, x23, LSL #2]\n"
+ "st1w { z8.s }, p0, [x27, x21, LSL #2]\n"
+ "st1w { z7.s }, p0, [x27, x8, LSL #2]\n"
+ "incb x27\n"
+ "whilelt p0.s, XZR, %x[num_channels]\n"
+ "bne 1b\n"
+ "2:" // channel_loop_end
+ ".inst 0xd503467f // SMSTOP\n"
+ : [input_row_0] "+&r" (input), [num_channels] "+&r" (long_channels), [output_row_0] "+&r" (output)
+ : [B_values] "r" (B_values), [input_col_1_stride] "r" ((long) input_col_stride), [input_row_stride] "r" ((long) input_row_stride), [output_col_1_stride] "r" ((long) output_col_stride), [output_row_stride] "r" (6 * (long) output_col_stride)
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x8", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace input_transform
+} // namespace winograd
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms/sve_fp32_6x6.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms/sve_fp32_6x6.cpp
new file mode 100644
index 0000000000..7b387e1247
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/input_transforms/sve_fp32_6x6.cpp
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#include <cstddef>
+
+namespace arm_conv {
+namespace winograd {
+namespace input_transform {
+
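+// SVE implementation of the 6x6 Winograd input transform: each 6x6 input
+// tile d is mapped to V = B^T d B. The assembly below processes one vector
+// of channels per iteration of its loop, with WHILELT predication covering
+// the channel tail.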
+void sve_fp32_6x6(
+ const unsigned int num_channels,
+ const float *input,
+ const size_t input_row_stride,
+ const size_t input_col_stride,
+ float *output,
+ const size_t output_col_stride
+)
+{
+ const float B_values[4] = { 1.0f, 2.0f, 4.0f, 5.0f };
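+
+ // A sketch of the underlying maths, assuming the standard F(4x4, 3x3)
+ // transform matrix (Lavin & Gray):
+ //
+ //         [ 4  0 -5  0  1  0 ]
+ //         [ 0 -4 -4  1  1  0 ]
+ //   B^T = [ 0  4 -4 -1  1  0 ]
+ //         [ 0 -2 -1  2  1  0 ]
+ //         [ 0  2 -1 -2  1  0 ]
+ //         [ 0  4  0 -5  0  1 ]
+ //
+ // Only the coefficient magnitudes {1, 2, 4, 5} occur, so a single replicated
+ // vector of B_values (addressed by lane as z2.s[1..3], with 4.0 also kept in
+ // z16) suffices for the whole transform.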
+ long long_channels = num_channels;
+
+ // Generated by armasmgen (February 4th, 2021)
+ __asm__ __volatile__(
+ "fmov z16.s, #4.0\n"
+ "ptrue p1.b\n"
+ "ld1rqw { z2.s }, p1/Z, [%x[B_values]]\n"
+ "add x16, %x[input_row_0], %x[input_row_stride], LSL #2\n"
+ "add x15, %x[output_row_0], %x[output_row_stride], LSL #2\n"
+ "add x14, %x[input_row_0], %x[input_row_stride], LSL #3\n"
+ "add x13, %x[output_row_0], %x[output_row_stride], LSL #3\n"
+ "add x12, x14, %x[input_row_stride], LSL #2\n"
+ "add x11, x13, %x[output_row_stride], LSL #2\n"
+ "add x10, %x[input_row_0], %x[input_row_stride], LSL #4\n"
+ "add x9, %x[output_row_0], %x[output_row_stride], LSL #4\n"
+ "add x28, x10, %x[input_row_stride], LSL #2\n"
+ "add x27, x9, %x[output_row_stride], LSL #2\n"
+ "lsl x26, %x[input_col_1_stride], #0x1\n"
+ "lsl x25, %x[output_col_1_stride], #0x1\n"
+ "add x24, x26, %x[input_col_1_stride]\n"
+ "add x23, x25, %x[output_col_1_stride]\n"
+ "lsl x22, %x[input_col_1_stride], #0x2\n"
+ "lsl x21, %x[output_col_1_stride], #0x2\n"
+ "add x20, x22, %x[input_col_1_stride]\n"
+ "add x8, x21, %x[output_col_1_stride]\n"
+ "whilelt p0.s, XZR, %x[num_channels]\n"
+ "beq 2f\n"
+ "1:" // channel_loop
+ "ld1w { z31.s }, p0/Z, [%x[input_row_0]]\n"
+ "decw %x[num_channels]\n"
+ "ld1w { z28.s }, p0/Z, [%x[input_row_0], %x[input_col_1_stride], LSL #2]\n"
+ "fmul z13.s, z28.s, z2.s[1]\n"
+ "ld1w { z27.s }, p0/Z, [%x[input_row_0], x26, LSL #2]\n"
+ "ld1w { z11.s }, p0/Z, [%x[input_row_0], x24, LSL #2]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "ld1w { z7.s }, p0/Z, [%x[input_row_0], x22, LSL #2]\n"
+ "fsub z15.s, z7.s, z27.s\n"
+ "fmad z31.s, p1/M, z16.s, z7.s\n"
+ "ld1w { z3.s }, p0/Z, [%x[input_row_0], x20, LSL #2]\n"
+ "fmla z13.s, z11.s, z2.s[1]\n"
+ "ld1w { z12.s }, p0/Z, [x14]\n"
+ "incb %x[input_row_0]\n"
+ "fmls z31.s, z27.s, z2.s[3]\n"
+ "ld1w { z14.s }, p0/Z, [x14, %x[input_col_1_stride], LSL #2]\n"
+ "fsub z25.s, z15.s, z13.s\n"
+ "fadd z8.s, z13.s, z15.s\n"
+ "ld1w { z24.s }, p0/Z, [x14, x26, LSL #2]\n"
+ "fmsb z27.s, p1/M, z16.s, z7.s\n"
+ "ld1w { z22.s }, p0/Z, [x14, x24, LSL #2]\n"
+ "fmul z7.s, z28.s, z2.s[2]\n"
+ "ld1w { z1.s }, p0/Z, [x14, x22, LSL #2]\n"
+ "fsub z15.s, z1.s, z24.s\n"
+ "fneg z7.s, p1/M, z7.s\n"
+ "ld1w { z20.s }, p0/Z, [x14, x20, LSL #2]\n"
+ "fadd z7.s, z7.s, z11.s\n"
+ "ld1w { z29.s }, p0/Z, [x10]\n"
+ "incb x14\n"
+ "fmad z28.s, p1/M, z16.s, z3.s\n"
+ "ld1w { z10.s }, p0/Z, [x10, %x[input_col_1_stride], LSL #2]\n"
+ "fmad z12.s, p1/M, z16.s, z1.s\n"
+ "ld1w { z18.s }, p0/Z, [x10, x26, LSL #2]\n"
+ "fmul z13.s, z14.s, z2.s[1]\n"
+ "ld1w { z19.s }, p0/Z, [x10, x24, LSL #2]\n"
+ "fadd z17.s, z7.s, z27.s\n"
+ "ld1w { z9.s }, p0/Z, [x10, x22, LSL #2]\n"
+ "fsub z27.s, z27.s, z7.s\n"
+ "fmls z28.s, z11.s, z2.s[3]\n"
+ "ld1w { z21.s }, p0/Z, [x10, x20, LSL #2]\n"
+ "incb x10\n"
+ "fmls z12.s, z24.s, z2.s[3]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fmla z13.s, z22.s, z2.s[1]\n"
+ "fsub z30.s, z15.s, z13.s\n"
+ "fadd z4.s, z13.s, z15.s\n"
+ "fmsb z24.s, p1/M, z16.s, z1.s\n"
+ "fsub z15.s, z9.s, z18.s\n"
+ "fmul z1.s, z14.s, z2.s[2]\n"
+ "fmad z14.s, p1/M, z16.s, z20.s\n"
+ "fmad z29.s, p1/M, z16.s, z9.s\n"
+ "fmul z13.s, z10.s, z2.s[1]\n"
+ "fneg z1.s, p1/M, z1.s\n"
+ "fadd z1.s, z1.s, z22.s\n"
+ "fmls z14.s, z22.s, z2.s[3]\n"
+ "fmls z29.s, z18.s, z2.s[3]\n"
+ "fadd z5.s, z1.s, z24.s\n"
+ "fsub z24.s, z24.s, z1.s\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fmla z13.s, z19.s, z2.s[1]\n"
+ "fsub z23.s, z15.s, z13.s\n"
+ "fadd z11.s, z13.s, z15.s\n"
+ "fmsb z18.s, p1/M, z16.s, z9.s\n"
+ "fmul z9.s, z10.s, z2.s[2]\n"
+ "fmad z10.s, p1/M, z16.s, z21.s\n"
+ "fmad z31.s, p1/M, z16.s, z29.s\n"
+ "fmad z8.s, p1/M, z16.s, z11.s\n"
+ "fneg z9.s, p1/M, z9.s\n"
+ "fadd z9.s, z9.s, z19.s\n"
+ "fmls z10.s, z19.s, z2.s[3]\n"
+ "fmls z31.s, z12.s, z2.s[3]\n"
+ "st1w { z31.s }, p0, [%x[output_row_0]]\n"
+ "fadd z26.s, z9.s, z18.s\n"
+ "fsub z18.s, z18.s, z9.s\n"
+ "fmls z8.s, z4.s, z2.s[3]\n"
+ "fmad z25.s, p1/M, z16.s, z23.s\n"
+ "fmad z28.s, p1/M, z16.s, z10.s\n"
+ "fmad z17.s, p1/M, z16.s, z26.s\n"
+ "fmad z27.s, p1/M, z16.s, z18.s\n"
+ "fmls z25.s, z30.s, z2.s[3]\n"
+ "fmls z28.s, z14.s, z2.s[3]\n"
+ "fmls z17.s, z5.s, z2.s[3]\n"
+ "st1w { z17.s }, p0, [%x[output_row_0], %x[output_col_1_stride], LSL #2]\n"
+ "fmls z27.s, z24.s, z2.s[3]\n"
+ "st1w { z27.s }, p0, [%x[output_row_0], x25, LSL #2]\n"
+ "st1w { z8.s }, p0, [%x[output_row_0], x23, LSL #2]\n"
+ "st1w { z25.s }, p0, [%x[output_row_0], x21, LSL #2]\n"
+ "st1w { z28.s }, p0, [%x[output_row_0], x8, LSL #2]\n"
+ "incb %x[output_row_0]\n"
+ "ld1w { z19.s }, p0/Z, [x16]\n"
+ "ld1w { z7.s }, p0/Z, [x16, %x[input_col_1_stride], LSL #2]\n"
+ "fmul z13.s, z7.s, z2.s[1]\n"
+ "ld1w { z6.s }, p0/Z, [x16, x26, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x16, x24, LSL #2]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "ld1w { z25.s }, p0/Z, [x16, x22, LSL #2]\n"
+ "fsub z15.s, z25.s, z6.s\n"
+ "fmad z19.s, p1/M, z16.s, z25.s\n"
+ "ld1w { z20.s }, p0/Z, [x16, x20, LSL #2]\n"
+ "fmla z13.s, z27.s, z2.s[1]\n"
+ "ld1w { z0.s }, p0/Z, [x12]\n"
+ "incb x16\n"
+ "fmls z19.s, z6.s, z2.s[3]\n"
+ "ld1w { z31.s }, p0/Z, [x12, %x[input_col_1_stride], LSL #2]\n"
+ "fsub z8.s, z15.s, z13.s\n"
+ "fadd z28.s, z13.s, z15.s\n"
+ "ld1w { z1.s }, p0/Z, [x12, x26, LSL #2]\n"
+ "fmsb z6.s, p1/M, z16.s, z25.s\n"
+ "ld1w { z21.s }, p0/Z, [x12, x24, LSL #2]\n"
+ "fmul z25.s, z7.s, z2.s[2]\n"
+ "ld1w { z22.s }, p0/Z, [x12, x22, LSL #2]\n"
+ "fsub z15.s, z22.s, z1.s\n"
+ "fneg z25.s, p1/M, z25.s\n"
+ "ld1w { z17.s }, p0/Z, [x12, x20, LSL #2]\n"
+ "fadd z25.s, z25.s, z27.s\n"
+ "incb x12\n"
+ "fmad z7.s, p1/M, z16.s, z20.s\n"
+ "fmad z0.s, p1/M, z16.s, z22.s\n"
+ "fmul z13.s, z31.s, z2.s[1]\n"
+ "fadd z3.s, z25.s, z6.s\n"
+ "fsub z6.s, z6.s, z25.s\n"
+ "fmls z7.s, z27.s, z2.s[3]\n"
+ "fmls z0.s, z1.s, z2.s[3]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fmla z13.s, z21.s, z2.s[1]\n"
+ "fsub z9.s, z15.s, z13.s\n"
+ "fadd z27.s, z13.s, z15.s\n"
+ "fmsb z1.s, p1/M, z16.s, z22.s\n"
+ "fsub z15.s, z29.s, z12.s\n"
+ "fmul z22.s, z31.s, z2.s[2]\n"
+ "fmad z31.s, p1/M, z16.s, z17.s\n"
+ "fmul z13.s, z19.s, z2.s[1]\n"
+ "fmsb z12.s, p1/M, z16.s, z29.s\n"
+ "fneg z22.s, p1/M, z22.s\n"
+ "fadd z22.s, z22.s, z21.s\n"
+ "fmls z31.s, z21.s, z2.s[3]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fadd z25.s, z22.s, z1.s\n"
+ "fsub z1.s, z1.s, z22.s\n"
+ "fmla z13.s, z0.s, z2.s[1]\n"
+ "fmul z29.s, z19.s, z2.s[2]\n"
+ "fadd z22.s, z13.s, z15.s\n"
+ "st1w { z22.s }, p0, [x11]\n"
+ "fneg z29.s, p1/M, z29.s\n"
+ "fsub z22.s, z15.s, z13.s\n"
+ "fadd z29.s, z29.s, z0.s\n"
+ "st1w { z22.s }, p0, [x9]\n"
+ "fadd z22.s, z29.s, z12.s\n"
+ "fsub z15.s, z26.s, z5.s\n"
+ "fmul z13.s, z3.s, z2.s[1]\n"
+ "fsub z12.s, z12.s, z29.s\n"
+ "fmsb z5.s, p1/M, z16.s, z26.s\n"
+ "fmul z26.s, z3.s, z2.s[2]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fmla z13.s, z25.s, z2.s[1]\n"
+ "fneg z26.s, p1/M, z26.s\n"
+ "fadd z26.s, z26.s, z25.s\n"
+ "fadd z21.s, z13.s, z15.s\n"
+ "st1w { z21.s }, p0, [x11, %x[output_col_1_stride], LSL #2]\n"
+ "fsub z21.s, z15.s, z13.s\n"
+ "fmul z13.s, z6.s, z2.s[1]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "st1w { z21.s }, p0, [x9, %x[output_col_1_stride], LSL #2]\n"
+ "fadd z21.s, z26.s, z5.s\n"
+ "fsub z15.s, z18.s, z24.s\n"
+ "fmla z13.s, z1.s, z2.s[1]\n"
+ "fsub z5.s, z5.s, z26.s\n"
+ "fmsb z24.s, p1/M, z16.s, z18.s\n"
+ "fmul z18.s, z6.s, z2.s[2]\n"
+ "fadd z20.s, z13.s, z15.s\n"
+ "st1w { z20.s }, p0, [x11, x25, LSL #2]\n"
+ "fneg z18.s, p1/M, z18.s\n"
+ "fsub z20.s, z15.s, z13.s\n"
+ "fadd z18.s, z18.s, z1.s\n"
+ "st1w { z20.s }, p0, [x9, x25, LSL #2]\n"
+ "fadd z20.s, z18.s, z24.s\n"
+ "fsub z15.s, z11.s, z4.s\n"
+ "fmul z13.s, z28.s, z2.s[1]\n"
+ "fsub z24.s, z24.s, z18.s\n"
+ "fmsb z4.s, p1/M, z16.s, z11.s\n"
+ "fmul z11.s, z28.s, z2.s[2]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fmla z13.s, z27.s, z2.s[1]\n"
+ "fneg z11.s, p1/M, z11.s\n"
+ "fadd z11.s, z11.s, z27.s\n"
+ "fadd z26.s, z13.s, z15.s\n"
+ "st1w { z26.s }, p0, [x11, x23, LSL #2]\n"
+ "fsub z26.s, z15.s, z13.s\n"
+ "fmul z13.s, z8.s, z2.s[1]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "st1w { z26.s }, p0, [x9, x23, LSL #2]\n"
+ "fadd z26.s, z11.s, z4.s\n"
+ "fsub z15.s, z23.s, z30.s\n"
+ "fmla z13.s, z9.s, z2.s[1]\n"
+ "fsub z4.s, z4.s, z11.s\n"
+ "fmsb z30.s, p1/M, z16.s, z23.s\n"
+ "fmul z23.s, z8.s, z2.s[2]\n"
+ "fadd z18.s, z13.s, z15.s\n"
+ "st1w { z18.s }, p0, [x11, x21, LSL #2]\n"
+ "fneg z23.s, p1/M, z23.s\n"
+ "fsub z18.s, z15.s, z13.s\n"
+ "fadd z23.s, z23.s, z9.s\n"
+ "st1w { z18.s }, p0, [x9, x21, LSL #2]\n"
+ "fadd z18.s, z23.s, z30.s\n"
+ "fsub z15.s, z10.s, z14.s\n"
+ "fmul z13.s, z7.s, z2.s[1]\n"
+ "fsub z30.s, z30.s, z23.s\n"
+ "fmsb z14.s, p1/M, z16.s, z10.s\n"
+ "fmul z10.s, z7.s, z2.s[2]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "fmla z13.s, z31.s, z2.s[1]\n"
+ "fneg z10.s, p1/M, z10.s\n"
+ "fadd z10.s, z10.s, z31.s\n"
+ "fadd z17.s, z13.s, z15.s\n"
+ "st1w { z17.s }, p0, [x11, x8, LSL #2]\n"
+ "fsub z17.s, z15.s, z13.s\n"
+ "incb x11\n"
+ "st1w { z17.s }, p0, [x9, x8, LSL #2]\n"
+ "fadd z17.s, z10.s, z14.s\n"
+ "fsub z14.s, z14.s, z10.s\n"
+ "st1w { z22.s }, p0, [x15]\n"
+ "incb x9\n"
+ "st1w { z12.s }, p0, [x13]\n"
+ "st1w { z21.s }, p0, [x15, %x[output_col_1_stride], LSL #2]\n"
+ "st1w { z5.s }, p0, [x13, %x[output_col_1_stride], LSL #2]\n"
+ "st1w { z20.s }, p0, [x15, x25, LSL #2]\n"
+ "st1w { z24.s }, p0, [x13, x25, LSL #2]\n"
+ "st1w { z26.s }, p0, [x15, x23, LSL #2]\n"
+ "st1w { z4.s }, p0, [x13, x23, LSL #2]\n"
+ "st1w { z18.s }, p0, [x15, x21, LSL #2]\n"
+ "st1w { z30.s }, p0, [x13, x21, LSL #2]\n"
+ "st1w { z17.s }, p0, [x15, x8, LSL #2]\n"
+ "incb x15\n"
+ "st1w { z14.s }, p0, [x13, x8, LSL #2]\n"
+ "incb x13\n"
+ "ld1w { z23.s }, p0/Z, [x28]\n"
+ "ld1w { z22.s }, p0/Z, [x28, %x[input_col_1_stride], LSL #2]\n"
+ "fmul z13.s, z22.s, z2.s[1]\n"
+ "ld1w { z21.s }, p0/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x28, x24, LSL #2]\n"
+ "fneg z13.s, p1/M, z13.s\n"
+ "ld1w { z26.s }, p0/Z, [x28, x22, LSL #2]\n"
+ "fsub z15.s, z26.s, z21.s\n"
+ "fmad z23.s, p1/M, z16.s, z26.s\n"
+ "ld1w { z18.s }, p0/Z, [x28, x20, LSL #2]\n"
+ "fmla z13.s, z20.s, z2.s[1]\n"
+ "incb x28\n"
+ "fmls z23.s, z21.s, z2.s[3]\n"
+ "fsub z17.s, z15.s, z13.s\n"
+ "fadd z30.s, z13.s, z15.s\n"
+ "fmsb z21.s, p1/M, z16.s, z26.s\n"
+ "fmul z26.s, z22.s, z2.s[2]\n"
+ "fmad z22.s, p1/M, z16.s, z18.s\n"
+ "fmad z19.s, p1/M, z16.s, z23.s\n"
+ "fmad z28.s, p1/M, z16.s, z30.s\n"
+ "fneg z26.s, p1/M, z26.s\n"
+ "fadd z26.s, z26.s, z20.s\n"
+ "fmls z22.s, z20.s, z2.s[3]\n"
+ "fmls z19.s, z0.s, z2.s[3]\n"
+ "st1w { z19.s }, p0, [x27]\n"
+ "fadd z23.s, z26.s, z21.s\n"
+ "fsub z21.s, z21.s, z26.s\n"
+ "fmls z28.s, z27.s, z2.s[3]\n"
+ "fmad z8.s, p1/M, z16.s, z17.s\n"
+ "fmad z7.s, p1/M, z16.s, z22.s\n"
+ "fmad z3.s, p1/M, z16.s, z23.s\n"
+ "fmad z6.s, p1/M, z16.s, z21.s\n"
+ "fmls z8.s, z9.s, z2.s[3]\n"
+ "fmls z7.s, z31.s, z2.s[3]\n"
+ "fmls z3.s, z25.s, z2.s[3]\n"
+ "st1w { z3.s }, p0, [x27, %x[output_col_1_stride], LSL #2]\n"
+ "fmls z6.s, z1.s, z2.s[3]\n"
+ "st1w { z6.s }, p0, [x27, x25, LSL #2]\n"
+ "st1w { z28.s }, p0, [x27, x23, LSL #2]\n"
+ "st1w { z8.s }, p0, [x27, x21, LSL #2]\n"
+ "st1w { z7.s }, p0, [x27, x8, LSL #2]\n"
+ "incb x27\n"
+ "whilelt p0.s, XZR, %x[num_channels]\n"
+ "bne 1b\n"
+ "2:" // channel_loop_end
+
+ : [input_row_0] "+&r" (input), [num_channels] "+&r" (long_channels), [output_row_0] "+&r" (output)
+ : [B_values] "r" (B_values), [input_col_1_stride] "r" ((long) input_col_stride), [input_row_stride] "r" ((long) input_row_stride), [output_col_1_stride] "r" ((long) output_col_stride), [output_row_stride] "r" (6 * (long) output_col_stride)
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x8", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace input_transform
+} // namespace winograd
+} // namespace arm_conv
+
+#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp
new file mode 100644
index 0000000000..35d61fa94d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+#include "input_transform.hpp"
+#include "winograd_implementations.hpp"
+
+#include <memory>
+#include <string>
+
+namespace arm_conv {
+namespace winograd {
+namespace input_transform {
+
+void a64_fp16_6x6(unsigned int, const __fp16 *, size_t, size_t, __fp16 *, size_t);
+
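+// IMPL wraps a raw kernel function in the matching Transform driver class,
+// using the stringised function name as the implementation's reported name.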
+#define IMPL(HEIGHT, WIDTH, FUNC, DRIVER) new Transform ## DRIVER <__fp16, __fp16>(#FUNC, HEIGHT, WIDTH, FUNC)
+
+static const TransformImplementation<__fp16> transforms_fp16[] = {
+ { IMPL(6, 6, a64_fp16_6x6, Unpadded) },
+ { nullptr },
+};
+
+template <>
+const TransformImplementation<__fp16> *implementation_list(void)
+{
+ return transforms_fp16;
+}
+
+} // namespace input_transform
+} // namespace winograd
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms_fp32.cpp
new file mode 100644
index 0000000000..df633903ca
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/input_transforms_fp32.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "input_transform.hpp"
+#include "winograd_implementations.hpp"
+
+#include <memory>
+#include <string>
+
+namespace arm_conv {
+namespace winograd {
+namespace input_transform {
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+void sme_fp32_mla_6x6(unsigned int, const float *, size_t, size_t, float *, size_t);
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+void sve_fp32_6x6(unsigned int, const float *, size_t, size_t, float *, size_t);
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+void a64_fp32_6x6(unsigned int, const float *, size_t, size_t, float *, size_t);
+#else // defined(__aarch64__)
+void arm_fp32_6x6(unsigned int, const float *, size_t, size_t, float *, size_t);
+#endif // defined(__aarch64__)
+void arm_fp32_4x4(unsigned int, const float *, size_t, size_t, float *, size_t);
+void arm_fp32_1x8(unsigned int, const float *, size_t, size_t, float *, size_t);
+
+#define IMPL(HEIGHT, WIDTH, FUNC, DRIVER) new Transform ## DRIVER <float, float>(#FUNC, HEIGHT, WIDTH, FUNC)
+
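+// The most specialised kernels come first (SME, then SVE, then the generic
+// AArch64 or AArch32 implementation); the MethodConstraints flags mark the
+// entries that additionally require SME or SVE support, and the list is
+// terminated by an empty entry.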
+static const TransformImplementation<float> transforms_fp32[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ { IMPL(6, 6, sme_fp32_mla_6x6, Unpadded), MethodConstraints::RequiresSME },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+ { IMPL(6, 6, sve_fp32_6x6, Unpadded), MethodConstraints::RequiresSVE },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ { IMPL(6, 6, a64_fp32_6x6, Unpadded) },
+#else // defined(__aarch64__)
+ { IMPL(6, 6, arm_fp32_6x6, Unpadded) },
+#endif // defined(__aarch64__)
+ { IMPL(4, 4, arm_fp32_4x4, Unpadded) },
+ { IMPL(1, 8, arm_fp32_1x8, Unpadded) },
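+ // An 8x1 variant is derived from the 1x8 kernel by swapping its row and
+ // column strides via TransformUnpadded::get_transposed_kernel.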
+ { new TransformUnpadded<float, float>("arm_fp32_1x8", 8, 1, TransformUnpadded<float, float>::get_transposed_kernel(arm_fp32_1x8)) },
+ { nullptr },
+};
+
+template <>
+const TransformImplementation<float> *implementation_list(void)
+{
+ return transforms_fp32;
+}
+
+} // namespace input_transform
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transform.hpp b/src/core/NEON/kernels/convolution/winograd/output_transform.hpp
new file mode 100644
index 0000000000..971cc99cd2
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/output_transform.hpp
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "winograd.hpp"
+
+#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
+
+#include <algorithm>
+#include <cstring>
+#include <functional>
+#include <limits>
+
+namespace arm_conv {
+namespace winograd {
+namespace output_transform {
+
+/* Driver class for the Winograd output transforms.
+ *
+ * This provides a base implementation which handles iteration over the output
+ * tensor; subclasses are responsible for managing working space and executing
+ * the transform on individual tiles.
+ */
+template <typename TIn, typename TOut=TIn>
+class TransformBase : public ITransform
+{
+ const std::string m_name;
+ const unsigned int m_output_rows, m_output_cols;
+ const unsigned int m_kernel_rows, m_kernel_cols;
+
+ protected:
+ virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const
+ {
+ return 0;
+ }
+
+ virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const
+ {
+ // Nothing to do
+ }
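+ // (Transforms which need per-thread scratch memory override the two hooks above.)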
+
+ virtual void execute_tile(
+ unsigned int n_channels,
+ const TIn *inptr, size_t ld_in_matrix,
+ const TIn *bias,
+ TOut *outptr, size_t ld_out_row, size_t ld_out_col,
+ TOut activation_min, TOut activation_max,
+ unsigned int valid_rows, unsigned int valid_cols,
+ void *working_space
+ ) const = 0;
+
+ void execute_internal(
+ const ConvolutionArgs &args,
+ const TIn *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
+ const TIn *bias,
+ TOut *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
+ void *working_space, unsigned int thread_id, unsigned int n_threads
+ ) const
+ {
+ // Get the working space for this thread, and initialise it.
+ working_space = reinterpret_cast<char *>(working_space) +
+ this->get_working_space_per_thread(args) * thread_id;
+ this->initialise_thread_working_space(args, working_space);
+
+ // Get the activation values
+ auto activation_min = static_cast<TOut>(-std::numeric_limits<float>::infinity());
+ auto activation_max = static_cast<TOut>(+std::numeric_limits<float>::infinity());
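+ // ReLU clamps only from below; BoundedReLU additionally clamps from above
+ // at args.activation.param1, hence the fall-through below.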
+ switch (args.activation.type)
+ {
+ case arm_gemm::Activation::Type::BoundedReLU:
+ activation_max = static_cast<TOut>(args.activation.param1);
+ // Fall through
+ case arm_gemm::Activation::Type::ReLU:
+ activation_min = static_cast<TOut>(0);
+ break;
+ default:
+ break;
+ }
+
+ // Determine the number of tiles in a row; we use this to get the right
+ // offset into the input data.
+ const auto n_tile_cols = (args.output_shape.cols + this->get_output_cols() - 1) / this->get_output_cols();
+
+ // Execute over all batches
+ for (unsigned int batch = 0; batch < args.n_batches; batch++)
+ {
+ auto inptr_row = inptr + thread_id*n_tile_cols*ld_in_row;
+ auto outptr_row = outptr + thread_id*ld_out_row*this->get_output_rows();
+ inptr += ld_in_batch;
+ outptr += ld_out_batch;
+
+ // Stripe rows of tiles over threads.
+ for (auto out_i = thread_id * this->get_output_rows();
+ out_i < args.output_shape.rows;
+ out_i += n_threads * this->get_output_rows())
+ {
+ auto inptr_tile = inptr_row;
+ auto outptr_tile = outptr_row;
+ inptr_row += n_threads * n_tile_cols * ld_in_row;
+ outptr_row += n_threads * this->get_output_rows() * ld_out_row;
+
+ // Iterate over all columns
+ for (auto out_j = 0u; out_j < args.output_shape.cols;
+ out_j += this->get_output_cols())
+ {
+ // Execute the tile
+ this->execute_tile(
+ args.n_output_channels,
+ inptr_tile, ld_in_matrix,
+ bias,
+ outptr_tile, ld_out_row, ld_out_col,
+ activation_min, activation_max,
+ args.output_shape.rows - out_i, // Number of valid rows remaining
+ args.output_shape.cols - out_j, // Number of valid columns remaining
+ working_space
+ );
+
+ // Progress the pointers
+ inptr_tile += ld_in_row;
+ outptr_tile += this->get_output_cols() * ld_out_col;
+ }
+ }
+ }
+ }
+
+ public:
+ TransformBase(const std::string &name,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols)
+ : m_name(name),
+ m_output_rows(output_rows), m_output_cols(output_cols),
+ m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols)
+ {
+ }
+
+ const std::string &get_name(void) const override { return m_name; }
+
+ unsigned int get_input_rows(void) const override final { return m_kernel_rows + m_output_rows - 1; }
+ unsigned int get_input_cols(void) const override final { return m_kernel_cols + m_output_cols - 1; }
+
+ unsigned int get_output_rows(void) const override final { return m_output_rows; }
+ unsigned int get_output_cols(void) const override final { return m_output_cols; }
+
+ unsigned int get_kernel_rows(void) const override final { return m_kernel_rows; }
+ unsigned int get_kernel_cols(void) const override final { return m_kernel_cols; }
+
+ size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override
+ {
+ return n_threads * this->get_working_space_per_thread(args);
+ }
+
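+ // Type-erased entry point required by ITransform: cast the pointers to the
+ // concrete types and defer to execute_internal.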
+ void execute(
+ const ConvolutionArgs &args,
+ const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
+ const void *bias,
+ void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
+ void *working_space, unsigned int thread_id, unsigned int n_threads
+ ) const override
+ {
+ execute_internal(
+ args,
+ reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_matrix, ld_in_row,
+ reinterpret_cast<const TIn *>(bias),
+ reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_row, ld_out_col,
+ working_space, thread_id, n_threads
+ );
+ }
+};
+
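+/* Driver for output transform kernels which have no built-in support for
+ * partial tiles: full tiles are written directly to the output tensor, while
+ * edge tiles are written to working space and only the valid region is
+ * copied out.
+ */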
+template <typename TIn, typename TOut=TIn>
+class TransformUnpadded : public TransformBase<TIn, TOut>
+{
+ using Kernel = std::function<void(
+ unsigned int n_channels,
+ const TIn *inptr, size_t ld_in_matrix,
+ const TIn *bias,
+ TOut *outptr, size_t ld_out_row, size_t ld_out_col,
+ TOut activation_min, TOut activation_max
+ )>;
+ const Kernel m_kernel;
+
+ protected:
+ size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
+ {
+ // We create a buffer large enough for one output tile across all output channels
+ const auto n_output_points = this->get_output_rows() * this->get_output_cols();
+ return sizeof(TOut) * n_output_points * args.n_output_channels;
+ }
+
+ void execute_tile(
+ unsigned int n_channels,
+ const TIn *inptr, size_t ld_in_matrix,
+ const TIn *bias,
+ TOut *outptr, size_t ld_out_row, size_t ld_out_col,
+ TOut activation_min, TOut activation_max,
+ unsigned int valid_rows, unsigned int valid_cols,
+ void *working_space
+ ) const override final
+ {
+ // Get copies of the output tensor parameters
+ auto kernel_outptr = outptr;
+ auto kernel_ld_out_row = ld_out_row, kernel_ld_out_col = ld_out_col;
+
+ // If the tile is only partially valid (fewer valid rows or columns than a
+ // full output tile), execute the kernel into the working space and then
+ // copy only the valid region into the destination tensor.
+ if (valid_rows < this->get_output_rows() ||
+ valid_cols < this->get_output_cols())
+ {
+ // Override the kernel output parameters
+ kernel_outptr = reinterpret_cast<TOut *>(working_space);
+ kernel_ld_out_col = n_channels;
+ kernel_ld_out_row = kernel_ld_out_col * this->get_output_cols();
+ }
+
+ // Execute the kernel
+ m_kernel(
+ n_channels,
+ inptr, ld_in_matrix,
+ bias,
+ kernel_outptr, kernel_ld_out_row, kernel_ld_out_col,
+ activation_min, activation_max
+ );
+
+ // If necessary, copy from the working space into the destination tensor.
+ if (valid_rows < this->get_output_rows() ||
+ valid_cols < this->get_output_cols())
+ {
+ const auto last_row = std::min(valid_rows, this->get_output_rows());
+ const auto last_col = std::min(valid_cols, this->get_output_cols());
+
+ for (auto i = 0u; i < last_row; i++)
+ {
+ auto patch_tile = kernel_outptr;
+ auto out_tile = outptr;
+ kernel_outptr += kernel_ld_out_row;
+ outptr += ld_out_row;
+
+ for (auto j = 0u; j < last_col; j++)
+ {
+ memcpy(out_tile, patch_tile, sizeof(TOut) * n_channels);
+ patch_tile += kernel_ld_out_col;
+ out_tile += ld_out_col;
+ }
+ }
+ }
+ }
+
+ public:
+ TransformUnpadded(const std::string &name,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ const Kernel kernel)
+ : TransformBase<TIn, TOut>(name, output_rows, output_cols, kernel_rows, kernel_cols),
+ m_kernel(kernel)
+ {
+ }
+
+ /* Utility method to get a transposed variant of a kernel; the transposed
+ * version simply calls the original kernel with the output row and column
+ * strides swapped.
+ */
+ static constexpr Kernel get_transposed_kernel(const Kernel &kernel)
+ {
+ return [kernel] (
+ const unsigned int n_channels,
+ const TIn *const inptr, const size_t ld_in_matrix,
+ const TIn *const bias,
+ TOut *const outptr, const size_t ld_out_row, const size_t ld_out_col,
+ const TOut activation_min, const TOut activation_max
+ ) {
+ kernel(n_channels, inptr, ld_in_matrix, bias,
+ outptr, ld_out_col, ld_out_row,
+ activation_min, activation_max);
+ };
+ }
+};
+
+} // namespace output_transform
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp
new file mode 100644
index 0000000000..295005a2ee
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2022, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstddef>
+
+namespace arm_conv {
+namespace winograd {
+namespace output_transform {
+
+void a64_fp16_4x4_3x3(
+ unsigned int n_channels,
+ const __fp16* inptr,
+ size_t matrix_stride,
+ const __fp16* bptr,
+ __fp16* const output,
+ size_t output_row_stride,
+ size_t output_col_stride,
+ __fp16 output_min,
+ __fp16 output_max
+)
+{
+ constexpr int output_tile_rows = 4, output_tile_cols = 4;
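+ // The transform computes f = Z^T F Z: a 4x4 spatial output tile is recovered
+ // from a 6x6 Winograd-domain tile; the coefficients below (1, +/-1, +/-2, 4,
+ // +/-8) are successive powers of the evaluation points {0, 1, -1, 2, -2},
+ // with the final row also taking the F[i][5] term.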
+
+ // Construct a map to the output cells
+ __fp16 *outptrs[output_tile_rows][output_tile_cols];
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
+ }
+ }
+
+ // For each channel of the output
+ int channels_remaining = n_channels;
+
+#ifdef __aarch64__
+ for (; channels_remaining >= 8; channels_remaining -= 8)
+ {
+ // Matrices used and computed during this transform
+ float16x8_t F[6][6], FZ[6][4], f[4][4], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1q_f16(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 8;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vaddq_f16(vaddq_f16(vaddq_f16(F[i][0], F[i][1]), vaddq_f16(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vaddq_f16(vsubq_f16(F[i][1], F[i][2]), vmulq_f16(vsubq_f16(F[i][3], F[i][4]), vdupq_n_f16(2.0f)));
+
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vaddq_f16(vaddq_f16(F[i][1], F[i][2]), vmulq_f16(vaddq_f16(F[i][3], F[i][4]), vdupq_n_f16(4.0f)));
+
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vaddq_f16(vaddq_f16(vsubq_f16(F[i][1], F[i][2]), vmulq_f16(vsubq_f16(F[i][3], F[i][4]), vdupq_n_f16(8.0f))), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vaddq_f16(vaddq_f16(vaddq_f16(FZ[0][j], FZ[1][j]), vaddq_f16(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vaddq_f16(vsubq_f16(FZ[1][j], FZ[2][j]), vmulq_f16(vsubq_f16(FZ[3][j], FZ[4][j]), vdupq_n_f16(2.0f)));
+
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vaddq_f16(vaddq_f16(FZ[1][j], FZ[2][j]), vmulq_f16(vaddq_f16(FZ[3][j], FZ[4][j]), vdupq_n_f16(4.0f)));
+
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vaddq_f16(vaddq_f16(vsubq_f16(FZ[1][j], FZ[2][j]), vmulq_f16(vsubq_f16(FZ[3][j], FZ[4][j]), vdupq_n_f16(8.0f))), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1q_f16(bptr);
+ bptr += 8;
+ }
+ else
+ {
+ b = vdupq_n_f16(0.0f);
+ }
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ const auto y =
+ vmaxq_f16(vminq_f16(vaddq_f16(f[i][j], b), vdupq_n_f16(output_max)),
+ vdupq_n_f16(output_min));
+ vst1q_f16(outptrs[i][j], y);
+ outptrs[i][j] += 8;
+ }
+ }
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
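+ // Note: __arm_any__ does not appear to be a macro that GCC or Clang define
+ // by default, so this four-lane path is compiled only if the build system
+ // defines it explicitly.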
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used and computed during this transform
+ float16x4_t F[6][6], FZ[6][4], f[4][4], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f16(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f16(vadd_f16(vadd_f16(F[i][0], F[i][1]), vadd_f16(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vadd_f16(vsub_f16(F[i][1], F[i][2]), vmul_f16(vsub_f16(F[i][3], F[i][4]), vdup_n_f16(2.0f)));
+
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vadd_f16(vadd_f16(F[i][1], F[i][2]), vmul_f16(vadd_f16(F[i][3], F[i][4]), vdup_n_f16(4.0f)));
+
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vadd_f16(vadd_f16(vsub_f16(F[i][1], F[i][2]), vmul_f16(vsub_f16(F[i][3], F[i][4]), vdup_n_f16(8.0f))), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f16(vadd_f16(vadd_f16(FZ[0][j], FZ[1][j]), vadd_f16(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vadd_f16(vsub_f16(FZ[1][j], FZ[2][j]), vmul_f16(vsub_f16(FZ[3][j], FZ[4][j]), vdup_n_f16(2.0f)));
+
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vadd_f16(vadd_f16(FZ[1][j], FZ[2][j]), vmul_f16(vadd_f16(FZ[3][j], FZ[4][j]), vdup_n_f16(4.0f)));
+
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vadd_f16(vadd_f16(vsub_f16(FZ[1][j], FZ[2][j]), vmul_f16(vsub_f16(FZ[3][j], FZ[4][j]), vdup_n_f16(8.0f))), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1_f16(bptr);
+ bptr += 4;
+ }
+ else
+ {
+ b = vdup_n_f16(0.0f);
+ }
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ const auto y =
+ vmax_f16(vmin_f16(vadd_f16(f[i][j], b), vdup_n_f16(output_max)),
+ vdup_n_f16(output_min));
+ vst1_f16(outptrs[i][j], y);
+ outptrs[i][j] += 4;
+ }
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed during this transform
+ __fp16 F[6][6], FZ[6][4], f[4][4], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = *(bptr++);
+ }
+ else
+ {
+ b = 0.0f;
+ }
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ const auto y = std::max(std::min<__fp16>(f[i][j] + b, output_max), output_min);
+ *(outptrs[i][j]++) = y;
+ }
+ }
+ }
+}
+
+} // namespace output_transform
+} // namespace winograd
+} // namespace arm_conv
+
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x2_1x7.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x2_1x7.cpp
new file mode 100644
index 0000000000..8c6cf9725e
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x2_1x7.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <arm_neon.h>
+
+namespace arm_conv {
+namespace winograd {
+namespace output_transform {
+
+void arm_fp32_1x2_1x7(
+ unsigned int n_channels,
+ const float* inptr,
+ size_t matrix_stride,
+ const float* bptr,
+ float *outptr,
+ size_t, // No need to stride across rows
+ size_t output_col_stride,
+ float output_min,
+ float output_max
+)
+{
+ constexpr auto inner_tile_cols = 8u, output_tile_cols = 2u;
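+ // 1D variant: a 1x2 output tile is recovered from a 1x8 Winograd-domain
+ // tile (as used with 1x7 kernels); channels are processed four, then two,
+ // then one at a time.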
+
+ // For each channel of the output
+ for (; n_channels >= 4; n_channels -= 4)
+ {
+ // Matrices used and computed during this transform
+ float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);
+
+ // Read a 1x8 tile in the Winograd domain
+ for (auto j = 0u; j < inner_tile_cols; j++)
+ {
+ F[j] = vld1q_f32(inptr + j*matrix_stride);
+ }
+ inptr += 4;
+
+ f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+ f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ }
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y = vminq_f32(vmaxq_f32(vaddq_f32(f[j], b), vdupq_n_f32(output_min)),
+ vdupq_n_f32(output_max));
+ vst1q_f32(outptr + j*output_col_stride, y);
+ }
+ outptr += 4;
+ }
+ for (; n_channels >= 2; n_channels -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);
+
+ // Read a 1x8 tile in the Winograd domain
+ for (auto j = 0u; j < inner_tile_cols; j++)
+ {
+ F[j] = vld1_f32(inptr + j*matrix_stride);
+ }
+ inptr += 2;
+
+ f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+ f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1_f32(bptr);
+ bptr += 2;
+ }
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y = vmin_f32(vmax_f32(vadd_f32(f[j], b), vdup_n_f32(output_min)),
+ vdup_n_f32(output_max));
+ vst1_f32(outptr + j*output_col_stride, y);
+ }
+ outptr += 2;
+ }
+ if (n_channels)
+ {
+ // Matrices used and computed during this transform
+ float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;
+
+ // Read a 1x8 tile in the Winograd domain
+ for (auto j = 0u; j < inner_tile_cols; j++)
+ {
+ F[j] = *(inptr + j*matrix_stride);
+ }
+
+ f[0] = F[0]*1 + F[1]*1 + F[2]*1 + F[3]*1 + F[4]*1 + F[5]*1 + F[6]*1;
+ f[1] = F[1]*-1 + F[5]*-3 + F[3]*-2 + F[4]*2 + F[6]*3 + F[2]*1 + F[7]*1;
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = *(bptr++);
+ }
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ *(outptr + j*output_col_stride) = std::max(std::min(f[j] + b, output_max), output_min);
+ }
+ }
+}
+
+} // namespace output_transform
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x4_1x5.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x4_1x5.cpp
new file mode 100644
index 0000000000..ac05f23221
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x4_1x5.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <arm_neon.h>
+
+namespace arm_conv {
+namespace winograd {
+namespace output_transform {
+
+void arm_fp32_1x4_1x5(
+ unsigned int n_channels,
+ const float* inptr,
+ size_t matrix_stride,
+ const float* bptr,
+ float *outptr,
+ size_t, // No need to stride across rows
+ size_t output_col_stride,
+ float output_min,
+ float output_max
+)
+{
+ constexpr auto inner_tile_cols = 8u, output_tile_cols = 4u;
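+ // 1D variant: a 1x4 output tile is recovered from a 1x8 Winograd-domain
+ // tile (1x5 kernels); note that only the final output element picks up the
+ // F[7] term.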
+
+ // For each channel of the output
+ for (; n_channels >= 4; n_channels -= 4)
+ {
+ // Matrices used and computed during this transform
+ float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);
+
+ // Read a 1x8 tile in the Winograd domain
+ for (auto j = 0u; j < inner_tile_cols; j++)
+ {
+ F[j] = vld1q_f32(inptr + j*matrix_stride);
+ }
+ inptr += 4;
+
+ f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+ f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+ f[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
+ f[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ }
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y =
+ vmaxq_f32(vminq_f32(vaddq_f32(f[j], b), vdupq_n_f32(output_max)),
+ vdupq_n_f32(output_min));
+ vst1q_f32(outptr + j*output_col_stride, y);
+ }
+ outptr += 4;
+ }
+ for (; n_channels >= 2; n_channels -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);
+
+ // Read a 1x8 tile in the Winograd domain
+ for (auto j = 0u; j < inner_tile_cols; j++)
+ {
+ F[j] = vld1_f32(inptr + j*matrix_stride);
+ }
+ inptr += 2;
+
+ f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+ f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+ f[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
+ f[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1_f32(bptr);
+ bptr += 2;
+ }
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y =
+ vmax_f32(vmin_f32(vadd_f32(f[j], b), vdup_n_f32(output_max)),
+ vdup_n_f32(output_min));
+ vst1_f32(outptr + j*output_col_stride, y);
+ }
+ outptr += 2;
+ }
+ for (; n_channels; n_channels--)
+ {
+ // Matrices used and computed during this transform
+ float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;
+
+ // Read a 1x8 tile in the Winograd domain
+ for (auto j = 0u; j < inner_tile_cols; j++)
+ {
+ F[j] = *(inptr + j*matrix_stride);
+ }
+ inptr++;
+
+ f[0] = F[0]*1 + F[1]*1 + F[2]*1 + F[3]*1 + F[4]*1 + F[5]*1 + F[6]*1;
+ f[1] = F[1]*-1 + F[5]*-3 + F[3]*-2 + F[4]*2 + F[6]*3 + F[2]*1;
+ f[2] = F[3]*4 + F[4]*4 + F[5]*9 + F[6]*9 + F[1]*1 + F[2]*1;
+ f[3] = F[1]*-1 + F[5]*-27 + F[3]*-8 + F[4]*8 + F[6]*27 + F[2]*1 + F[7]*1;
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = *(bptr++);
+ }
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y = std::max(std::min(f[j] + b, output_max), output_min);
+ *(outptr + j*output_col_stride) = y;
+ }
+ outptr++;
+ }
+}
+
+} // namespace output_transform
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x6_1x3.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x6_1x3.cpp
new file mode 100644
index 0000000000..154dc6fe1a
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x6_1x3.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cstddef>
+
+#include <arm_neon.h>
+
+namespace arm_conv {
+namespace winograd {
+namespace output_transform {
+
+void arm_fp32_1x6_1x3(
+ unsigned int n_channels,
+ const float* inptr,
+ size_t matrix_stride,
+ const float* bptr,
+ float *outptr,
+ size_t, // No need to stride across rows
+ size_t output_col_stride,
+ float output_min,
+ float output_max
+)
+{
+ constexpr unsigned int inner_tile_cols = 8, output_tile_cols = 6;
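+ // 1D variant: a 1x6 output tile is recovered from a 1x8 Winograd-domain
+ // tile (1x3 kernels); row j of the transform applies the j-th powers of the
+ // evaluation points {0, +/-1, +/-2, +/-3}, hence the 243 = 3^5 coefficients
+ // in f[5].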
+
+ // For each channel of the output
+ for (; n_channels >= 4; n_channels -= 4)
+ {
+ // Matrices used and computed during this transform
+ float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);
+
+ // Read a 1x8 tile in the Winograd domain
+ for (auto j = 0u; j < inner_tile_cols; j++)
+ {
+ F[j] = vld1q_f32(inptr + j*matrix_stride);
+ }
+ inptr += 4;
+
+ f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+ f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+ f[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
+ f[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
+ f[4] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 81), F[5], 81), F[4], 16), F[3], 16);
+ f[5] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 243), F[4], 32), F[3], -32), F[5], -243), F[1], -1);
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ }
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y = vminq_f32(vmaxq_f32(vaddq_f32(f[j], b), vdupq_n_f32(output_min)),
+ vdupq_n_f32(output_max));
+ vst1q_f32(outptr + j*output_col_stride, y);
+ }
+ outptr += 4;
+ }
+ for (; n_channels >= 2; n_channels -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);
+
+ // Read a 1x8 tile in the Winograd domain
+ for (auto j = 0u; j < inner_tile_cols; j++)
+ {
+ F[j] = vld1_f32(inptr + j*matrix_stride);
+ }
+ inptr += 2;
+
+ f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+ f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+ f[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
+ f[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
+ f[4] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 81), F[5], 81), F[4], 16), F[3], 16);
+ f[5] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 243), F[4], 32), F[3], -32), F[5], -243), F[1], -1);
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1_f32(bptr);
+ bptr += 2;
+ }
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y = vmin_f32(vmax_f32(vadd_f32(f[j], b), vdup_n_f32(output_min)),
+ vdup_n_f32(output_max));
+ vst1_f32(outptr + j*output_col_stride, y);
+ }
+ outptr += 2;
+ }
+ for (; n_channels; n_channels--)
+ {
+ // Matrices used and computed during this transform
+ float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;
+
+ // Read a 1x8 tile in the Winograd domain
+ for (auto j = 0u; j < inner_tile_cols; j++)
+ {
+ F[j] = *(inptr + j*matrix_stride);
+ }
+ inptr++;
+
+ f[0] = F[0]*1 + F[1]*1 + F[2]*1 + F[3]*1 + F[4]*1 + F[5]*1 + F[6]*1;
+ f[1] = F[1]*-1 + F[5]*-3 + F[3]*-2 + F[4]*2 + F[6]*3 + F[2]*1;
+ f[2] = F[3]*4 + F[4]*4 + F[5]*9 + F[6]*9 + F[1]*1 + F[2]*1;
+ f[3] = F[1]*-1 + F[5]*-27 + F[3]*-8 + F[4]*8 + F[6]*27 + F[2]*1;
+ f[4] = F[3]*16 + F[4]*16 + F[5]*81 + F[6]*81 + F[1]*1 + F[2]*1;
+ f[5] = F[1]*-1 + F[5]*-243 + F[3]*-32 + F[4]*32 + F[6]*243 + F[2]*1 + F[7]*1;
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = *(bptr++);
+ }
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ *(outptr + j*output_col_stride) = std::max(std::min(f[j] + b, output_max), output_min);
+ }
+ outptr++;
+ }
+}
+
+} // namespace output_transform
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_3x3.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_3x3.cpp
new file mode 100644
index 0000000000..28f042bcbf
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_3x3.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2022, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <arm_neon.h>
+
+namespace arm_conv {
+namespace winograd {
+namespace output_transform {
+
+void arm_fp32_2x2_3x3(
+ unsigned int n_channels,
+ const float* inptr,
+ size_t matrix_stride,
+ const float* bptr,
+ float *outptr,
+ size_t output_row_stride,
+ size_t output_col_stride,
+ float output_min,
+ float output_max
+)
+{
+ constexpr auto output_tile_rows = 2u, output_tile_cols = 2u;
+
+ // For each channel of the output
+ for (; n_channels >= 4; n_channels -= 4)
+ {
+ // Matrices used and computed during this transform
+ float32x4_t F[4][4], FZ[4][2], f[2][2], b;
+
+ // Read a 4x4 tile in the Winograd domain
+ for (auto i = 0u, m = 0u; i < 4; i++)
+ {
+ for (auto j = 0u; j < 4; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (auto i = 0u; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
+
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (auto j = 0u; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
+
+ // Load the bias vector
+ if (bptr != nullptr)
+ {
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ }
+ else
+ {
+ b = vdupq_n_f32(0.0f);
+ }
+
+ // Write out the output tile
+ for (auto i = 0u; i < output_tile_rows; i++)
+ {
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y =
+ vmaxq_f32(vminq_f32(vaddq_f32(f[i][j], b), vdupq_n_f32(output_max)),
+ vdupq_n_f32(output_min));
+ vst1q_f32(outptr + i*output_row_stride + j*output_col_stride, y);
+ }
+ }
+ outptr += 4;
+ }
+ for (; n_channels >= 2; n_channels -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[4][4], FZ[4][2], f[2][2], b;
+
+ // Read a 4x4 tile in the Winograd domain
+ for (auto i = 0u, m = 0u; i < 4; i++)
+ {
+ for (auto j = 0u; j < 4; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (auto i = 0u; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (auto j = 0u; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
+
+ // Load the bias vector
+ if (bptr != nullptr)
+ {
+ b = vld1_f32(bptr);
+ bptr += 2;
+ }
+ else
+ {
+ b = vdup_n_f32(0.0f);
+ }
+
+ // Write out the output tile
+ for (auto i = 0u; i < output_tile_rows; i++)
+ {
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y =
+ vmax_f32(vmin_f32(vadd_f32(f[i][j], b), vdup_n_f32(output_max)),
+ vdup_n_f32(output_min));
+ vst1_f32(outptr + i*output_row_stride + j*output_col_stride, y);
+ }
+ }
+ outptr += 2;
+ }
+ for (; n_channels; n_channels--)
+ {
+ // Matrices used and computed during this transform
+ float F[4][4], FZ[4][2], f[2][2], b;
+
+ // Read a 4x4 tile in the Winograd domain
+ for (auto i = 0u, m = 0u; i < 4; i++)
+ {
+ for (auto j = 0u; j < 4; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (auto i = 0u; i < 4; i++)
+ {
+ FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (auto j = 0u; j < 2; j++)
+ {
+ f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ }
+
+ // Load the bias
+ if (bptr != nullptr)
+ {
+ b = *(bptr++);
+ }
+ else
+ {
+ b = 0.0f;
+ }
+
+ // Write out the output tile
+ for (auto i = 0u; i < output_tile_rows; i++)
+ {
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y = std::max(std::min(f[i][j] + b, output_max), output_min);
+ *(outptr + i*output_row_stride + j*output_col_stride) = y;
+ }
+ }
+ outptr++;
+ }
+}
+
+} // namespace output_transform
+} // namespace winograd
+} // namespace arm_conv
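For reference (not part of the patch), the 2x2/3x3 kernel above evaluates y = AT Y A with the F(2, 3) output-transform matrix

    AT = [ 1  1  1  0 ]
         [ 0  1 -1 -1 ]

whose rows are exactly the coefficient patterns in the FZ and f comments: the FZ step multiplies by Z = A on the right (collapsing four columns to two), and the f step multiplies by ZT = AT on the left (collapsing four rows to two).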
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_5x5.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_5x5.cpp
new file mode 100644
index 0000000000..8e5ba74ac3
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_5x5.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2022, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <arm_neon.h>
+
+namespace arm_conv {
+namespace winograd {
+namespace output_transform {
+
+void arm_fp32_2x2_5x5(
+ unsigned int n_channels,
+ const float* inptr,
+ size_t matrix_stride,
+ const float* bptr,
+ float *outptr,
+ size_t output_row_stride,
+ size_t output_col_stride,
+ float output_min,
+ float output_max
+)
+{
+ constexpr auto output_tile_rows = 2u, output_tile_cols = 2u;
+
+ // For each channel of the output
+ for (; n_channels >= 4; n_channels -= 4)
+ {
+ // Matrices used and computed during this transform
+ float32x4_t F[6][6], FZ[6][2], f[2][2], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (auto i = 0u, m = 0u; i < 6; i++)
+ {
+ for (auto j = 0u; j < 6; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (auto i = 0u; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (auto j = 0u; j < 2; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ }
+ else
+ {
+ b = vdupq_n_f32(0.0f);
+ }
+ for (auto i = 0u; i < output_tile_rows; i++)
+ {
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y =
+ vmaxq_f32(vminq_f32(vaddq_f32(f[i][j], b), vdupq_n_f32(output_max)),
+ vdupq_n_f32(output_min));
+ vst1q_f32(outptr + i*output_row_stride + j*output_col_stride, y);
+ }
+ }
+ outptr += 4;
+ }
+ for (; n_channels >= 2; n_channels -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[6][6], FZ[6][2], f[2][2], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (auto i = 0u, m = 0u; i < 6; i++)
+ {
+ for (auto j = 0u; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (auto i = 0u; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (auto j = 0u; j < 2; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1_f32(bptr);
+ bptr += 2;
+ }
+ else
+ {
+ b = vdup_n_f32(0.0f);
+ }
+ for (auto i = 0u; i < output_tile_rows; i++)
+ {
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y =
+ vmax_f32(vmin_f32(vadd_f32(f[i][j], b), vdup_n_f32(output_max)),
+ vdup_n_f32(output_min));
+ vst1_f32(outptr + i*output_row_stride + j*output_col_stride, y);
+ }
+ }
+ outptr += 2;
+ }
+ if (n_channels)
+ {
+ // Matrices used and computed during this transform
+ float F[6][6], FZ[6][2], f[2][2], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (auto i = 0u, m = 0u; i < 6; i++)
+ {
+ for (auto j = 0u; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+
+ // Compute the matrix F Z
+ for (auto i = 0u; i < 6; i++)
+ {
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (auto j = 0u; j < 2; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = *(bptr++);
+ }
+ else
+ {
+ b = 0.0f;
+ }
+ for (auto i = 0u; i < output_tile_rows; i++)
+ {
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y = std::max(std::min(f[i][j] + b, output_max), output_min);
+ *(outptr + i*output_row_stride + j*output_col_stride) = y;
+ }
+ }
+ }
+}
+
+} // namespace output_transform
+} // namespace winograd
+} // namespace arm_conv
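For reference (not part of the patch), the 2x2/5x5 kernel above uses the F(2, 5) output-transform matrix over the points {0, 1, -1, 2, -2, inf}:

    AT = [ 1  1  1  1  1  0 ]
         [ 0  1 -1  2 -2  1 ]

The vmla_n_f32(..., 2.0f) calls fold the +/-2 coefficient pair of the second row into a single multiply-accumulate on the difference F[i][3] - F[i][4].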
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_4x4_3x3.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_4x4_3x3.cpp
new file mode 100644
index 0000000000..72c43019fa
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_4x4_3x3.cpp
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2022, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <arm_neon.h>
+
+namespace arm_conv {
+namespace winograd {
+namespace output_transform {
+
+void arm_fp32_4x4_3x3(
+ unsigned int n_channels,
+ const float* inptr,
+ size_t matrix_stride,
+ const float* bptr,
+ float *outptr,
+ size_t output_row_stride,
+ size_t output_col_stride,
+ float output_min,
+ float output_max
+)
+{
+ constexpr auto output_tile_rows = 4u, output_tile_cols = 4u;
+
+ // For each channel of the output
+ for (; n_channels >= 4; n_channels -= 4)
+ {
+ // Matrices used and computed during this transform
+ float32x4_t F[6][6], FZ[6][4], f[4][4], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (auto i = 0u, m = 0u; i < 6; i++)
+ {
+ for (auto j = 0u; j < 6; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (auto i = 0u; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
+
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
+
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (auto j = 0u; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ }
+ else
+ {
+ b = vdupq_n_f32(0.0f);
+ }
+ for (auto i = 0u; i < output_tile_rows; i++)
+ {
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y =
+ vmaxq_f32(vminq_f32(vaddq_f32(f[i][j], b), vdupq_n_f32(output_max)),
+ vdupq_n_f32(output_min));
+ vst1q_f32(outptr + i*output_row_stride + j*output_col_stride, y);
+ }
+ }
+ outptr += 4;
+ }
+ for (; n_channels >= 2; n_channels -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[6][6], FZ[6][4], f[4][4], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (auto i = 0u, m = 0u; i < 6; i++)
+ {
+ for (auto j = 0u; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (auto i = 0u; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
+
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
+
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (auto j = 0u; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1_f32(bptr);
+ bptr += 2;
+ }
+ else
+ {
+ b = vdup_n_f32(0.0f);
+ }
+ for (auto i = 0u; i < output_tile_rows; i++)
+ {
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y =
+ vmax_f32(vmin_f32(vadd_f32(f[i][j], b), vdup_n_f32(output_max)),
+ vdup_n_f32(output_min));
+ vst1_f32(outptr + i*output_row_stride + j*output_col_stride, y);
+ }
+ }
+ outptr += 2;
+ }
+ for (; n_channels; n_channels--)
+ {
+ // Matrices used and computed during this transform
+ float F[6][6], FZ[6][4], f[4][4], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (auto i = 0u, m = 0u; i < 6; i++)
+ {
+ for (auto j = 0u; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (auto i = 0u; i < 6; i++)
+ {
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (auto j = 0u; j < 4; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = *(bptr++);
+ }
+ else
+ {
+ b = 0.0f;
+ }
+ for (auto i = 0u; i < output_tile_rows; i++)
+ {
+ for (auto j = 0u; j < output_tile_cols; j++)
+ {
+ const auto y = std::max(std::min(f[i][j] + b, output_max), output_min);
+ *(outptr + i*output_row_stride + j*output_col_stride) = y;
+ }
+ }
+ outptr++;
+ }
+}
+
+} // namespace output_transform
+} // namespace winograd
+} // namespace arm_conv
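For reference (not part of the patch), the 4x4/3x3 kernel above uses the F(4, 3) output-transform matrix over the points {0, 1, -1, 2, -2, inf}:

    AT = [ 1  1  1  1  1  0 ]
         [ 0  1 -1  2 -2  0 ]
         [ 0  1  1  4  4  0 ]
         [ 0  1 -1  8 -8  1 ]

The SME kernel that follows encodes this same matrix: its outer_terms quads hold each row's first four coefficients (with padded continuations for the fifth and sixth), and its inner_terms quads hold the columns of AT.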
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/sme_fp32_mopa_4x4_3x3.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/sme_fp32_mopa_4x4_3x3.cpp
new file mode 100644
index 0000000000..043914d590
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms/sme_fp32_mopa_4x4_3x3.cpp
@@ -0,0 +1,891 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+#include <cstddef>
+
+namespace arm_conv {
+namespace winograd {
+namespace output_transform {
+
+void sme_fp32_mopa_4x4_3x3(
+ unsigned int n_channels,
+ const float* inptr,
+ size_t matrix_stride,
+ const float* bptr,
+ float* const output,
+ size_t output_row_stride,
+ size_t output_col_stride,
+ float output_min,
+ float output_max
+)
+{
+  // The assembler below uses the Kronecker product and the "vec trick" to
+  // implement the Winograd output transform (y = AT Y A) using the SME
+  // array. This code REQUIRES that the vectors are 512 bits long (or
+  // longer, if we add some predication).
+ //
+ // The "vec trick" uses the identity $vec(AT Y A) = (AT (x) AT) vec(Y)$ to
+ // convert the chain of matrix multiplications into a matrix-vector
+ // product. We then stack multiple channels of vec(Y) together to allow us
+ // to perform multiple channels of the transformation simultaneously.
+ //
+ // Since the complete matrix (AT (x) AT) is quite big [16 x 36], we compute
+ // it on the fly. To do so, we store two representations of the matrix AT.
+ // The first representation (the outer terms) contains, within each quad,
+ // four coefficients of the matrix AT.
+ const float outer_terms[32] = {
+ 1, 1, 1, 1,
+ 0, 1, -1, 2,
+ 0, 1, 1, 4,
+ 0, 1, -1, 8,
+ // The following rows are continuations of the first four rows, and each
+ // contains two columns of padding values which aren't used in the
+ // computation but are there to ensure that the coefficients end up in
+ // the right quads of the vector into which they're read.
+ 1, 0, 0, 0,
+ -2, 0, 0, 0,
+ 4, 0, 0, 0,
+ -8, 1, 0, 0
+ };
+ // This should be loaded completely into two Z registers.
+ //
+  // We can then use by-element FMUL to construct columns of (AT (x) AT) by
+ // multiplying elements of the outer terms against the following inner
+ // terms (again split into quads, but expected to be loaded replicated such
+ // that each of the six required Z registers contains a repeated quad of
+ // the values).
+ const float inner_terms[24] = {
+ 1, 0, 0, 0,
+ 1, 1, 1, 1,
+ 1, -1, 1, -1,
+ 1, 2, 4, 8,
+ 1, -2, 4, -8,
+ 0, 0, 0, 1
+ };
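+  //
+  // Each column of (AT (x) AT) then needs only one indexed FMUL: the
+  // replicated inner-term quad supplies a column of AT in every quad, and
+  // the indexed outer-term element scales each quad by the matching AT
+  // coefficient, yielding the sixteen products consumed by one FMOPA
+  // against one row of vec(Y).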
+
+ struct Params
+ {
+ const float *outer_terms;
+ const float *inner_terms;
+ float act_min;
+ float act_max;
+
+ Params(const float *outer_terms,
+ const float *inner_terms,
+ float act_min,
+ float act_max)
+ : outer_terms(outer_terms), inner_terms(inner_terms),
+ act_min(act_min), act_max(act_max)
+ {
+ }
+ };
+
+ Params params(outer_terms, inner_terms, output_min, output_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params], %[offsetof_Params_outer_terms]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p5.b\n"
+ "ld1rw { z12.s }, p5/Z, [%x[params], %[offsetof_Params_act_min]]\n"
+ "ld1rw { z10.s }, p5/Z, [%x[params], %[offsetof_Params_act_max]]\n"
+ "pfalse p8.b\n"
+ "ldr x8, [%x[params], %[offsetof_Params_inner_terms]]\n"
+ "ld1w { z6.s }, p5/Z, [x20]\n"
+ "ld1w { z7.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "ld1rqw { z9.s }, p5/Z, [x8]\n"
+ "ld1rqw { z8.s }, p5/Z, [x8, #16]\n"
+ "ld1rqw { z15.s }, p5/Z, [x8, #32]\n"
+ "fmul z11.s, z9.s, z6.s[0]\n"
+ "fmul z5.s, z9.s, z6.s[1]\n"
+ "ld1rqw { z4.s }, p5/Z, [x8, #48]\n"
+ "ld1rqw { z3.s }, p5/Z, [x8, #64]\n"
+ "ld1rqw { z2.s }, p5/Z, [x8, #80]\n"
+ "cbz %x[bptr], 1f\n"
+ "ptrue p8.s\n"
+ "1:" // Set bias predicate: Done
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "fmov z1.s, #1.0\n"
+ "mov x25, #0x0\n"
+ "cntw x24\n"
+ "cntw x23, ALL, MUL #2\n"
+ "cntw x22, ALL, MUL #3\n"
+ "whilelt p4.s, x25, %x[n_channels]\n"
+ "whilelt p3.s, x24, %x[n_channels]\n"
+ "ld1w { z31.s }, p4/Z, [%x[inptr], x25, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [%x[inptr], x24, LSL #2]\n"
+ "whilelt p2.s, x23, %x[n_channels]\n"
+ "whilelt p1.s, x22, %x[n_channels]\n"
+ "ld1w { z29.s }, p2/Z, [%x[inptr], x23, LSL #2]\n"
+ "add x21, %x[inptr], %x[matrix_stride], LSL #2\n"
+ "and p0.b, p5/Z, p8.b, p4.b\n"
+ "ld1w { z28.s }, p1/Z, [%x[inptr], x22, LSL #2]\n"
+ "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
+ "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
+ "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "ld1w { z0.s }, p0/Z, [%x[bptr], x25, LSL #2]\n"
+ "and p0.b, p5/Z, p8.b, p3.b\n"
+ ".inst 0x8080b420 // fmopa za0.s, p5/M, p5/M, z1.s, z0.s\n"
+ "ld1w { z0.s }, p0/Z, [%x[bptr], x24, LSL #2]\n"
+ "and p0.b, p5/Z, p8.b, p2.b\n"
+ ".inst 0x8080b421 // fmopa za1.s, p5/M, p5/M, z1.s, z0.s\n"
+ "ld1w { z0.s }, p0/Z, [%x[bptr], x23, LSL #2]\n"
+ "and p0.b, p5/Z, p8.b, p1.b\n"
+ ".inst 0x8080b422 // fmopa za2.s, p5/M, p5/M, z1.s, z0.s\n"
+ "ld1w { z0.s }, p0/Z, [%x[bptr], x22, LSL #2]\n"
+ ".inst 0x8080b423 // fmopa za3.s, p5/M, p5/M, z1.s, z0.s\n"
+ "2:" // Loop
+ ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ "mov x14, #0xc\n"
+ ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
+ "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
+ "whilelt p0.s, x25, %x[n_channels]\n"
+ "add x20, %x[output], %x[output_col_stride], LSL #2\n"
+ ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
+ "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
+ "add x8, %x[output], %x[output_row_stride], LSL #2\n"
+ ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
+ "fmul z11.s, z9.s, z6.s[2]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
+ "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
+ "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
+ "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
+ "fmul z5.s, z9.s, z6.s[3]\n"
+ "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
+ "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
+ "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
+ "fmul z11.s, z9.s, z7.s[0]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
+ "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
+ "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
+ "fmul z5.s, z9.s, z7.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
+ "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
+ "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
+ "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
+ "fmul z11.s, z8.s, z6.s[0]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
+ "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
+ "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
+ "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
+ "fmul z5.s, z8.s, z6.s[1]\n"
+ "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
+ "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
+ "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
+ "fmul z11.s, z8.s, z6.s[2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
+ "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
+ "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
+ "fmul z5.s, z8.s, z6.s[3]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
+ "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
+ "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
+ "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
+ "fmul z11.s, z8.s, z7.s[0]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
+ "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
+ "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
+ "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
+ "fmul z5.s, z8.s, z7.s[1]\n"
+ "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
+ "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
+ "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
+ "fmul z11.s, z15.s, z6.s[0]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
+ "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
+ "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
+ "fmul z5.s, z15.s, z6.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
+ "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
+ "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
+ "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
+ "fmul z11.s, z15.s, z6.s[2]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
+ "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
+ "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
+ "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
+ "fmul z5.s, z15.s, z6.s[3]\n"
+ "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
+ "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
+ "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
+ "fmul z11.s, z15.s, z7.s[0]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
+ "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
+ "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
+ "fmul z5.s, z15.s, z7.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
+ "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
+ "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
+ "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
+ "fmul z11.s, z4.s, z6.s[0]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
+ "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
+ "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
+ "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
+ "fmul z5.s, z4.s, z6.s[1]\n"
+ "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
+ "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
+ "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
+ "fmul z11.s, z4.s, z6.s[2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
+ "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
+ "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
+ "fmul z5.s, z4.s, z6.s[3]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
+ "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
+ "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
+ "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
+ "fmul z11.s, z4.s, z7.s[0]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
+ "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
+ "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
+ "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
+ "fmul z5.s, z4.s, z7.s[1]\n"
+ "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
+ "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
+ "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
+ "fmul z11.s, z3.s, z6.s[0]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
+ "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
+ "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
+ "fmul z5.s, z3.s, z6.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
+ "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
+ "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
+ "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
+ "fmul z11.s, z3.s, z6.s[2]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
+ "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
+ "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
+ "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
+ "fmul z5.s, z3.s, z6.s[3]\n"
+ "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
+ "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
+ "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
+ "fmul z11.s, z3.s, z7.s[0]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
+ "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
+ "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
+ "fmul z5.s, z3.s, z7.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
+ "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
+ "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
+ "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
+ "fmul z11.s, z2.s, z6.s[0]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
+ "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
+ "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
+ "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
+ "fmul z5.s, z2.s, z6.s[1]\n"
+ "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
+ "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
+ "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
+ "fmul z11.s, z2.s, z6.s[2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
+ "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
+ "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
+ ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
+ ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
+ "fmul z5.s, z2.s, z6.s[3]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
+ ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
+ "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
+ ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
+ ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
+ ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
+ "fmul z11.s, z2.s, z7.s[0]\n"
+ ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
+ ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
+ ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
+ ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
+ "fmul z5.s, z2.s, z7.s[1]\n"
+ ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
+ ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
+ ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
+ ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
+ "fmul z11.s, z9.s, z6.s[0]\n"
+ ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
+ ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
+ ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
+ ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
+ "fmul z5.s, z9.s, z6.s[1]\n"
+ ".inst 0xc082741f // mova z31.s, p5/M, za0h.s[XZR]\n"
+ ".inst 0xc082541c // mova z28.s, p5/M, za0h.s[x14]\n"
+ "fmin z31.s, p5/M, z31.s, z10.s\n"
+ ".inst 0xc082743b // mova z27.s, p5/M, za0h.s[XZR, #1]\n"
+ "fmin z28.s, p5/M, z28.s, z10.s\n"
+ ".inst 0xc0825438 // mova z24.s, p5/M, za0h.s[x14, #1]\n"
+ "fmin z27.s, p5/M, z27.s, z10.s\n"
+ "mov x13, #0x4\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc082341e // mova z30.s, p5/M, za0h.s[x13]\n"
+ "fmin z24.s, p5/M, z24.s, z10.s\n"
+ ".inst 0xc082141d // mova z29.s, p5/M, za0h.s[x12]\n"
+ "fmax z31.s, p5/M, z31.s, z12.s\n"
+ "fmin z30.s, p5/M, z30.s, z10.s\n"
+ ".inst 0xc082343a // mova z26.s, p5/M, za0h.s[x13, #1]\n"
+ "fmin z29.s, p5/M, z29.s, z10.s\n"
+ "fmax z28.s, p5/M, z28.s, z12.s\n"
+ ".inst 0xc0821439 // mova z25.s, p5/M, za0h.s[x12, #1]\n"
+ "fmax z27.s, p5/M, z27.s, z12.s\n"
+ "fmin z26.s, p5/M, z26.s, z10.s\n"
+ ".inst 0xc0827457 // mova z23.s, p5/M, za0h.s[XZR, #2]\n"
+ "fmin z25.s, p5/M, z25.s, z10.s\n"
+ "fmax z24.s, p5/M, z24.s, z12.s\n"
+ ".inst 0xc0823456 // mova z22.s, p5/M, za0h.s[x13, #2]\n"
+ "fmax z30.s, p5/M, z30.s, z12.s\n"
+ "fmin z23.s, p5/M, z23.s, z10.s\n"
+ ".inst 0xc0821455 // mova z21.s, p5/M, za0h.s[x12, #2]\n"
+ "fmax z29.s, p5/M, z29.s, z12.s\n"
+ "fmin z22.s, p5/M, z22.s, z10.s\n"
+ ".inst 0xc0825454 // mova z20.s, p5/M, za0h.s[x14, #2]\n"
+ "fmax z26.s, p5/M, z26.s, z12.s\n"
+ "fmin z21.s, p5/M, z21.s, z10.s\n"
+ ".inst 0xc0827473 // mova z19.s, p5/M, za0h.s[XZR, #3]\n"
+ "fmax z25.s, p5/M, z25.s, z12.s\n"
+ "fmin z20.s, p5/M, z20.s, z10.s\n"
+ ".inst 0xc0823472 // mova z18.s, p5/M, za0h.s[x13, #3]\n"
+ "fmax z23.s, p5/M, z23.s, z12.s\n"
+ "fmin z19.s, p5/M, z19.s, z10.s\n"
+ ".inst 0xc0821471 // mova z17.s, p5/M, za0h.s[x12, #3]\n"
+ "fmax z22.s, p5/M, z22.s, z12.s\n"
+ "fmin z18.s, p5/M, z18.s, z10.s\n"
+ ".inst 0xc0825470 // mova z16.s, p5/M, za0h.s[x14, #3]\n"
+ "fmax z21.s, p5/M, z21.s, z12.s\n"
+ "fmin z17.s, p5/M, z17.s, z10.s\n"
+ "fmax z20.s, p5/M, z20.s, z12.s\n"
+ "fmin z16.s, p5/M, z16.s, z10.s\n"
+ "st1w { z31.s }, p0, [%x[output], x25, LSL #2]\n"
+ "fmax z19.s, p5/M, z19.s, z12.s\n"
+ "st1w { z30.s }, p0, [x20, x25, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "fmax z18.s, p5/M, z18.s, z12.s\n"
+ "st1w { z29.s }, p0, [x20, x25, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "fmax z17.s, p5/M, z17.s, z12.s\n"
+ "st1w { z28.s }, p0, [x20, x25, LSL #2]\n"
+ "add x20, x8, %x[output_col_stride], LSL #2\n"
+ "fmax z16.s, p5/M, z16.s, z12.s\n"
+ "st1w { z27.s }, p0, [x8, x25, LSL #2]\n"
+ "add x8, x8, %x[output_row_stride], LSL #2\n"
+ "st1w { z26.s }, p0, [x20, x25, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z25.s }, p0, [x20, x25, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z24.s }, p0, [x20, x25, LSL #2]\n"
+ "add x20, x8, %x[output_col_stride], LSL #2\n"
+ "st1w { z23.s }, p0, [x8, x25, LSL #2]\n"
+ "add x8, x8, %x[output_row_stride], LSL #2\n"
+ "st1w { z22.s }, p0, [x20, x25, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z21.s }, p0, [x20, x25, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z20.s }, p0, [x20, x25, LSL #2]\n"
+ "add x20, x8, %x[output_col_stride], LSL #2\n"
+ "st1w { z19.s }, p0, [x8, x25, LSL #2]\n"
+ "st1w { z18.s }, p0, [x20, x25, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z17.s }, p0, [x20, x25, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z16.s }, p0, [x20, x25, LSL #2]\n"
+ "whilelt p0.s, x24, %x[n_channels]\n"
+ "b.none 3f\n"
+ ".inst 0xc082749f // mova z31.s, p5/M, za1h.s[XZR]\n"
+ ".inst 0xc082349e // mova z30.s, p5/M, za1h.s[x13]\n"
+ "fmin z31.s, p5/M, z31.s, z10.s\n"
+ ".inst 0xc082149d // mova z29.s, p5/M, za1h.s[x12]\n"
+ "fmin z30.s, p5/M, z30.s, z10.s\n"
+ ".inst 0xc082549c // mova z28.s, p5/M, za1h.s[x14]\n"
+ "fmin z29.s, p5/M, z29.s, z10.s\n"
+ ".inst 0xc08274bb // mova z27.s, p5/M, za1h.s[XZR, #1]\n"
+ "fmin z28.s, p5/M, z28.s, z10.s\n"
+ ".inst 0xc08234ba // mova z26.s, p5/M, za1h.s[x13, #1]\n"
+ "fmax z31.s, p5/M, z31.s, z12.s\n"
+ "fmin z27.s, p5/M, z27.s, z10.s\n"
+ ".inst 0xc08214b9 // mova z25.s, p5/M, za1h.s[x12, #1]\n"
+ "fmax z30.s, p5/M, z30.s, z12.s\n"
+ "fmin z26.s, p5/M, z26.s, z10.s\n"
+ ".inst 0xc08254b8 // mova z24.s, p5/M, za1h.s[x14, #1]\n"
+ "fmax z29.s, p5/M, z29.s, z12.s\n"
+ "fmin z25.s, p5/M, z25.s, z10.s\n"
+ ".inst 0xc08274d7 // mova z23.s, p5/M, za1h.s[XZR, #2]\n"
+ "fmax z28.s, p5/M, z28.s, z12.s\n"
+ "fmin z24.s, p5/M, z24.s, z10.s\n"
+ ".inst 0xc08234d6 // mova z22.s, p5/M, za1h.s[x13, #2]\n"
+ "fmax z27.s, p5/M, z27.s, z12.s\n"
+ "fmin z23.s, p5/M, z23.s, z10.s\n"
+ ".inst 0xc08214d5 // mova z21.s, p5/M, za1h.s[x12, #2]\n"
+ "fmax z26.s, p5/M, z26.s, z12.s\n"
+ "fmin z22.s, p5/M, z22.s, z10.s\n"
+ "add x20, %x[output], %x[output_col_stride], LSL #2\n"
+ ".inst 0xc08254d4 // mova z20.s, p5/M, za1h.s[x14, #2]\n"
+ "fmax z25.s, p5/M, z25.s, z12.s\n"
+ "fmin z21.s, p5/M, z21.s, z10.s\n"
+ "add x8, %x[output], %x[output_row_stride], LSL #2\n"
+ ".inst 0xc08274f3 // mova z19.s, p5/M, za1h.s[XZR, #3]\n"
+ "fmax z24.s, p5/M, z24.s, z12.s\n"
+ "fmin z20.s, p5/M, z20.s, z10.s\n"
+ ".inst 0xc08234f2 // mova z18.s, p5/M, za1h.s[x13, #3]\n"
+ "fmax z23.s, p5/M, z23.s, z12.s\n"
+ "fmin z19.s, p5/M, z19.s, z10.s\n"
+ ".inst 0xc08214f1 // mova z17.s, p5/M, za1h.s[x12, #3]\n"
+ "fmax z22.s, p5/M, z22.s, z12.s\n"
+ "fmin z18.s, p5/M, z18.s, z10.s\n"
+ ".inst 0xc08254f0 // mova z16.s, p5/M, za1h.s[x14, #3]\n"
+ "fmax z21.s, p5/M, z21.s, z12.s\n"
+ "fmin z17.s, p5/M, z17.s, z10.s\n"
+ "fmax z20.s, p5/M, z20.s, z12.s\n"
+ "fmin z16.s, p5/M, z16.s, z10.s\n"
+ "st1w { z31.s }, p0, [%x[output], x24, LSL #2]\n"
+ "fmax z19.s, p5/M, z19.s, z12.s\n"
+ "st1w { z30.s }, p0, [x20, x24, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "fmax z18.s, p5/M, z18.s, z12.s\n"
+ "st1w { z29.s }, p0, [x20, x24, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "fmax z17.s, p5/M, z17.s, z12.s\n"
+ "st1w { z28.s }, p0, [x20, x24, LSL #2]\n"
+ "add x20, x8, %x[output_col_stride], LSL #2\n"
+ "fmax z16.s, p5/M, z16.s, z12.s\n"
+ "st1w { z27.s }, p0, [x8, x24, LSL #2]\n"
+ "add x8, x8, %x[output_row_stride], LSL #2\n"
+ "st1w { z26.s }, p0, [x20, x24, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z25.s }, p0, [x20, x24, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z24.s }, p0, [x20, x24, LSL #2]\n"
+ "add x20, x8, %x[output_col_stride], LSL #2\n"
+ "st1w { z23.s }, p0, [x8, x24, LSL #2]\n"
+ "add x8, x8, %x[output_row_stride], LSL #2\n"
+ "st1w { z22.s }, p0, [x20, x24, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z21.s }, p0, [x20, x24, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z20.s }, p0, [x20, x24, LSL #2]\n"
+ "add x20, x8, %x[output_col_stride], LSL #2\n"
+ "st1w { z19.s }, p0, [x8, x24, LSL #2]\n"
+ "st1w { z18.s }, p0, [x20, x24, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z17.s }, p0, [x20, x24, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z16.s }, p0, [x20, x24, LSL #2]\n"
+ "whilelt p0.s, x23, %x[n_channels]\n"
+ "b.none 3f\n"
+ ".inst 0xc082751f // mova z31.s, p5/M, za2h.s[XZR]\n"
+ ".inst 0xc082351e // mova z30.s, p5/M, za2h.s[x13]\n"
+ "fmin z31.s, p5/M, z31.s, z10.s\n"
+ ".inst 0xc082151d // mova z29.s, p5/M, za2h.s[x12]\n"
+ "fmin z30.s, p5/M, z30.s, z10.s\n"
+ ".inst 0xc082551c // mova z28.s, p5/M, za2h.s[x14]\n"
+ "fmin z29.s, p5/M, z29.s, z10.s\n"
+ ".inst 0xc082753b // mova z27.s, p5/M, za2h.s[XZR, #1]\n"
+ "fmin z28.s, p5/M, z28.s, z10.s\n"
+ ".inst 0xc082353a // mova z26.s, p5/M, za2h.s[x13, #1]\n"
+ "fmax z31.s, p5/M, z31.s, z12.s\n"
+ "fmin z27.s, p5/M, z27.s, z10.s\n"
+ ".inst 0xc0821539 // mova z25.s, p5/M, za2h.s[x12, #1]\n"
+ "fmax z30.s, p5/M, z30.s, z12.s\n"
+ "fmin z26.s, p5/M, z26.s, z10.s\n"
+ ".inst 0xc0825538 // mova z24.s, p5/M, za2h.s[x14, #1]\n"
+ "fmax z29.s, p5/M, z29.s, z12.s\n"
+ "fmin z25.s, p5/M, z25.s, z10.s\n"
+ ".inst 0xc0827557 // mova z23.s, p5/M, za2h.s[XZR, #2]\n"
+ "fmax z28.s, p5/M, z28.s, z12.s\n"
+ "fmin z24.s, p5/M, z24.s, z10.s\n"
+ ".inst 0xc0823556 // mova z22.s, p5/M, za2h.s[x13, #2]\n"
+ "fmax z27.s, p5/M, z27.s, z12.s\n"
+ "fmin z23.s, p5/M, z23.s, z10.s\n"
+ ".inst 0xc0821555 // mova z21.s, p5/M, za2h.s[x12, #2]\n"
+ "fmax z26.s, p5/M, z26.s, z12.s\n"
+ "fmin z22.s, p5/M, z22.s, z10.s\n"
+ "add x20, %x[output], %x[output_col_stride], LSL #2\n"
+ ".inst 0xc0825554 // mova z20.s, p5/M, za2h.s[x14, #2]\n"
+ "fmax z25.s, p5/M, z25.s, z12.s\n"
+ "fmin z21.s, p5/M, z21.s, z10.s\n"
+ "add x8, %x[output], %x[output_row_stride], LSL #2\n"
+ ".inst 0xc0827573 // mova z19.s, p5/M, za2h.s[XZR, #3]\n"
+ "fmax z24.s, p5/M, z24.s, z12.s\n"
+ "fmin z20.s, p5/M, z20.s, z10.s\n"
+ ".inst 0xc0823572 // mova z18.s, p5/M, za2h.s[x13, #3]\n"
+ "fmax z23.s, p5/M, z23.s, z12.s\n"
+ "fmin z19.s, p5/M, z19.s, z10.s\n"
+ ".inst 0xc0821571 // mova z17.s, p5/M, za2h.s[x12, #3]\n"
+ "fmax z22.s, p5/M, z22.s, z12.s\n"
+ "fmin z18.s, p5/M, z18.s, z10.s\n"
+ ".inst 0xc0825570 // mova z16.s, p5/M, za2h.s[x14, #3]\n"
+ "fmax z21.s, p5/M, z21.s, z12.s\n"
+ "fmin z17.s, p5/M, z17.s, z10.s\n"
+ "fmax z20.s, p5/M, z20.s, z12.s\n"
+ "fmin z16.s, p5/M, z16.s, z10.s\n"
+ "st1w { z31.s }, p0, [%x[output], x23, LSL #2]\n"
+ "fmax z19.s, p5/M, z19.s, z12.s\n"
+ "st1w { z30.s }, p0, [x20, x23, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "fmax z18.s, p5/M, z18.s, z12.s\n"
+ "st1w { z29.s }, p0, [x20, x23, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "fmax z17.s, p5/M, z17.s, z12.s\n"
+ "st1w { z28.s }, p0, [x20, x23, LSL #2]\n"
+ "add x20, x8, %x[output_col_stride], LSL #2\n"
+ "fmax z16.s, p5/M, z16.s, z12.s\n"
+ "st1w { z27.s }, p0, [x8, x23, LSL #2]\n"
+ "add x8, x8, %x[output_row_stride], LSL #2\n"
+ "st1w { z26.s }, p0, [x20, x23, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z25.s }, p0, [x20, x23, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z24.s }, p0, [x20, x23, LSL #2]\n"
+ "add x20, x8, %x[output_col_stride], LSL #2\n"
+ "st1w { z23.s }, p0, [x8, x23, LSL #2]\n"
+ "add x8, x8, %x[output_row_stride], LSL #2\n"
+ "st1w { z22.s }, p0, [x20, x23, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z21.s }, p0, [x20, x23, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z20.s }, p0, [x20, x23, LSL #2]\n"
+ "add x20, x8, %x[output_col_stride], LSL #2\n"
+ "st1w { z19.s }, p0, [x8, x23, LSL #2]\n"
+ "st1w { z18.s }, p0, [x20, x23, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z17.s }, p0, [x20, x23, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z16.s }, p0, [x20, x23, LSL #2]\n"
+ "whilelt p0.s, x22, %x[n_channels]\n"
+ "b.none 3f\n"
+ "fmov z1.s, #1.0\n"
+ ".inst 0xc082759f // mova z31.s, p5/M, za3h.s[XZR]\n"
+ ".inst 0xc082359e // mova z30.s, p5/M, za3h.s[x13]\n"
+ "fmin z31.s, p5/M, z31.s, z10.s\n"
+ ".inst 0xc082159d // mova z29.s, p5/M, za3h.s[x12]\n"
+ "fmin z30.s, p5/M, z30.s, z10.s\n"
+ ".inst 0xc082559c // mova z28.s, p5/M, za3h.s[x14]\n"
+ "fmin z29.s, p5/M, z29.s, z10.s\n"
+ ".inst 0xc08275bb // mova z27.s, p5/M, za3h.s[XZR, #1]\n"
+ "fmin z28.s, p5/M, z28.s, z10.s\n"
+ ".inst 0xc08235ba // mova z26.s, p5/M, za3h.s[x13, #1]\n"
+ "fmax z31.s, p5/M, z31.s, z12.s\n"
+ "fmin z27.s, p5/M, z27.s, z10.s\n"
+ ".inst 0xc08215b9 // mova z25.s, p5/M, za3h.s[x12, #1]\n"
+ "fmax z30.s, p5/M, z30.s, z12.s\n"
+ "fmin z26.s, p5/M, z26.s, z10.s\n"
+ ".inst 0xc08255b8 // mova z24.s, p5/M, za3h.s[x14, #1]\n"
+ "fmax z29.s, p5/M, z29.s, z12.s\n"
+ "fmin z25.s, p5/M, z25.s, z10.s\n"
+ ".inst 0xc08275d7 // mova z23.s, p5/M, za3h.s[XZR, #2]\n"
+ "fmax z28.s, p5/M, z28.s, z12.s\n"
+ "fmin z24.s, p5/M, z24.s, z10.s\n"
+ ".inst 0xc08235d6 // mova z22.s, p5/M, za3h.s[x13, #2]\n"
+ "fmax z27.s, p5/M, z27.s, z12.s\n"
+ "fmin z23.s, p5/M, z23.s, z10.s\n"
+ ".inst 0xc08215d5 // mova z21.s, p5/M, za3h.s[x12, #2]\n"
+ "fmax z26.s, p5/M, z26.s, z12.s\n"
+ "fmin z22.s, p5/M, z22.s, z10.s\n"
+ ".inst 0xc08255d4 // mova z20.s, p5/M, za3h.s[x14, #2]\n"
+ "fmax z25.s, p5/M, z25.s, z12.s\n"
+ "fmin z21.s, p5/M, z21.s, z10.s\n"
+ "add x20, %x[output], %x[output_col_stride], LSL #2\n"
+ ".inst 0xc08275f3 // mova z19.s, p5/M, za3h.s[XZR, #3]\n"
+ "fmax z24.s, p5/M, z24.s, z12.s\n"
+ "fmin z20.s, p5/M, z20.s, z10.s\n"
+ "add x8, %x[output], %x[output_row_stride], LSL #2\n"
+ ".inst 0xc08235f2 // mova z18.s, p5/M, za3h.s[x13, #3]\n"
+ "fmax z23.s, p5/M, z23.s, z12.s\n"
+ "fmin z19.s, p5/M, z19.s, z10.s\n"
+ "incw x25, ALL, MUL #4\n"
+ ".inst 0xc08215f1 // mova z17.s, p5/M, za3h.s[x12, #3]\n"
+ "fmax z22.s, p5/M, z22.s, z12.s\n"
+ "fmin z18.s, p5/M, z18.s, z10.s\n"
+ "incw x24, ALL, MUL #4\n"
+ ".inst 0xc08255f0 // mova z16.s, p5/M, za3h.s[x14, #3]\n"
+ "fmax z21.s, p5/M, z21.s, z12.s\n"
+ "fmin z17.s, p5/M, z17.s, z10.s\n"
+ "incw x23, ALL, MUL #4\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "fmax z20.s, p5/M, z20.s, z12.s\n"
+ "fmin z16.s, p5/M, z16.s, z10.s\n"
+ "add x21, %x[inptr], %x[matrix_stride], LSL #2\n"
+ "fmax z19.s, p5/M, z19.s, z12.s\n"
+ "st1w { z31.s }, p0, [%x[output], x22, LSL #2]\n"
+ "fmax z18.s, p5/M, z18.s, z12.s\n"
+ "st1w { z30.s }, p0, [x20, x22, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "fmax z17.s, p5/M, z17.s, z12.s\n"
+ "st1w { z29.s }, p0, [x20, x22, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "fmax z16.s, p5/M, z16.s, z12.s\n"
+ "st1w { z28.s }, p0, [x20, x22, LSL #2]\n"
+ "add x20, x8, %x[output_col_stride], LSL #2\n"
+ "st1w { z27.s }, p0, [x8, x22, LSL #2]\n"
+ "add x8, x8, %x[output_row_stride], LSL #2\n"
+ "st1w { z26.s }, p0, [x20, x22, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z25.s }, p0, [x20, x22, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z24.s }, p0, [x20, x22, LSL #2]\n"
+ "add x20, x8, %x[output_col_stride], LSL #2\n"
+ "st1w { z23.s }, p0, [x8, x22, LSL #2]\n"
+ "add x8, x8, %x[output_row_stride], LSL #2\n"
+ "st1w { z22.s }, p0, [x20, x22, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z21.s }, p0, [x20, x22, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z20.s }, p0, [x20, x22, LSL #2]\n"
+ "add x20, x8, %x[output_col_stride], LSL #2\n"
+ "st1w { z19.s }, p0, [x8, x22, LSL #2]\n"
+ "st1w { z18.s }, p0, [x20, x22, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z17.s }, p0, [x20, x22, LSL #2]\n"
+ "add x20, x20, %x[output_col_stride], LSL #2\n"
+ "st1w { z16.s }, p0, [x20, x22, LSL #2]\n"
+ "incw x22, ALL, MUL #4\n"
+ "whilelt p1.s, x22, %x[n_channels]\n"
+ "ld1w { z28.s }, p1/Z, [%x[inptr], x22, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "whilelt p2.s, x23, %x[n_channels]\n"
+ "whilelt p3.s, x24, %x[n_channels]\n"
+ "ld1w { z30.s }, p3/Z, [%x[inptr], x24, LSL #2]\n"
+ "whilelt p4.s, x25, %x[n_channels]\n"
+ "ld1w { z31.s }, p4/Z, [%x[inptr], x25, LSL #2]\n"
+ "and p0.b, p5/Z, p8.b, p4.b\n"
+ "ld1w { z29.s }, p2/Z, [%x[inptr], x23, LSL #2]\n"
+ "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
+ "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
+ "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "add x21, x21, %x[matrix_stride], LSL #2\n"
+ "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
+ "ld1w { z0.s }, p0/Z, [%x[bptr], x25, LSL #2]\n"
+ "and p0.b, p5/Z, p8.b, p3.b\n"
+ ".inst 0x8080b420 // fmopa za0.s, p5/M, p5/M, z1.s, z0.s\n"
+ "ld1w { z0.s }, p0/Z, [%x[bptr], x24, LSL #2]\n"
+ "and p0.b, p5/Z, p8.b, p2.b\n"
+ ".inst 0x8080b421 // fmopa za1.s, p5/M, p5/M, z1.s, z0.s\n"
+ "ld1w { z0.s }, p0/Z, [%x[bptr], x23, LSL #2]\n"
+ "and p0.b, p5/Z, p8.b, p1.b\n"
+ ".inst 0x8080b422 // fmopa za2.s, p5/M, p5/M, z1.s, z0.s\n"
+ "ld1w { z0.s }, p0/Z, [%x[bptr], x22, LSL #2]\n"
+ ".inst 0x8080b423 // fmopa za3.s, p5/M, p5/M, z1.s, z0.s\n"
+ "b.any 2b\n"
+ "3:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [bptr] "r" (bptr), [inptr] "r" (inptr), [matrix_stride] "r" (matrix_stride), [n_channels] "r" (n_channels), [offsetof_Params_act_max] "I" (offsetof(Params, act_max)), [offsetof_Params_act_min] "I" (offsetof(Params, act_min)), [offsetof_Params_inner_terms] "I" (offsetof(Params, inner_terms)), [offsetof_Params_outer_terms] "I" (offsetof(Params, outer_terms)), [output] "r" (output), [output_col_stride] "r" (output_col_stride), [output_row_stride] "r" (output_row_stride), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p8", "x12", "x13", "x14", "x8", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace output_transform
+} // namespace winograd
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp
new file mode 100644
index 0000000000..c39b1dc083
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+#include "output_transform.hpp"
+#include "winograd_implementations.hpp"
+
+namespace arm_conv {
+namespace winograd {
+namespace output_transform {
+
+void a64_fp16_4x4_3x3(unsigned int, const __fp16 *, size_t, const __fp16 *, __fp16 *, size_t, size_t, __fp16, __fp16);
+
+#define IMPL(OUT_HEIGHT, OUT_WIDTH, KERN_HEIGHT, KERN_WIDTH, FUNC, DRIVER) \
+ new Transform ## DRIVER <__fp16, __fp16>(#FUNC, OUT_HEIGHT, OUT_WIDTH, KERN_HEIGHT, KERN_WIDTH, FUNC)
+
+
+static const TransformImplementation<__fp16> transforms_fp16[] = {
+ { IMPL(4, 4, 3, 3, a64_fp16_4x4_3x3, Unpadded) },
+ { nullptr }
+};
+
+template <>
+const TransformImplementation<__fp16> *implementation_list(void)
+{
+ return transforms_fp16;
+}
+
+} // namespace output_transform
+} // namespace winograd
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
\ No newline at end of file
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms_fp32.cpp
new file mode 100644
index 0000000000..0a7030324e
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms_fp32.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "output_transform.hpp"
+#include "winograd_implementations.hpp"
+
+namespace arm_conv {
+namespace winograd {
+namespace output_transform {
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+void sme_fp32_mopa_4x4_3x3(unsigned int, const float *, size_t, const float *, float *, size_t, size_t, float, float);
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(__aarch64__)
+void arm_fp32_4x4_3x3(unsigned int, const float *, size_t, const float *, float *, size_t, size_t, float, float);
+void arm_fp32_2x2_3x3(unsigned int, const float *, size_t, const float *, float *, size_t, size_t, float, float);
+void arm_fp32_2x2_5x5(unsigned int, const float *, size_t, const float *, float *, size_t, size_t, float, float);
+void arm_fp32_1x6_1x3(unsigned int, const float *, size_t, const float *, float *, size_t, size_t, float, float);
+void arm_fp32_1x4_1x5(unsigned int, const float *, size_t, const float *, float *, size_t, size_t, float, float);
+void arm_fp32_1x2_1x7(unsigned int, const float *, size_t, const float *, float *, size_t, size_t, float, float);
+
+#define IMPL(OUT_HEIGHT, OUT_WIDTH, KERN_HEIGHT, KERN_WIDTH, FUNC, DRIVER) \
+ new Transform ## DRIVER <float, float>(#FUNC, OUT_HEIGHT, OUT_WIDTH, KERN_HEIGHT, KERN_WIDTH, FUNC)
+
+#define IMPL_T(OUT_HEIGHT, OUT_WIDTH, KERN_HEIGHT, KERN_WIDTH, FUNC, DRIVER) \
+ new Transform ## DRIVER <float, float>(#FUNC, OUT_HEIGHT, OUT_WIDTH, KERN_HEIGHT, KERN_WIDTH, Transform ## DRIVER <float, float>::get_transposed_kernel(FUNC))
+
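+// Null-terminated table of fp32 output transforms. The SME kernel appears
+// before the generic implementations so that it can be chosen first where
+// its MethodConstraints::RequiresSME constraint is satisfied.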
+static const TransformImplementation<float> transforms_fp32[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ { IMPL(4, 4, 3, 3, sme_fp32_mopa_4x4_3x3, Unpadded), MethodConstraints::RequiresSME },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#endif // defined(__aarch64__)
+ { IMPL(4, 4, 3, 3, arm_fp32_4x4_3x3, Unpadded), MethodConstraints::LargerShape },
+ { IMPL(2, 2, 3, 3, arm_fp32_2x2_3x3, Unpadded) },
+ { IMPL(2, 2, 5, 5, arm_fp32_2x2_5x5, Unpadded) },
+ { IMPL(1, 6, 1, 3, arm_fp32_1x6_1x3, Unpadded) },
+ { IMPL_T(6, 1, 3, 1, arm_fp32_1x6_1x3, Unpadded) },
+ { IMPL(1, 4, 1, 5, arm_fp32_1x4_1x5, Unpadded) },
+ { IMPL_T(4, 1, 5, 1, arm_fp32_1x4_1x5, Unpadded) },
+ { IMPL(1, 2, 1, 7, arm_fp32_1x2_1x7, Unpadded) },
+ { IMPL_T(2, 1, 7, 1, arm_fp32_1x2_1x7, Unpadded) },
+ { nullptr }
+};
+
+template <>
+const TransformImplementation<float> *implementation_list(void)
+{
+ return transforms_fp32;
+}
+
+} // namespace output_transform
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/padding.cpp b/src/core/NEON/kernels/convolution/winograd/padding.cpp
deleted file mode 100644
index 1d44c384d9..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/padding.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <cstring>
-#include <cstdint>
-
-#include "padding.hpp"
-
-namespace padding
-{
-
-template <typename T>
-void copy_and_pad_tile(
- const unsigned int tile_rows,
- const unsigned int tile_cols,
- const unsigned int n_channels,
- const T* const inptr,
- const unsigned int in_row_stride,
- const unsigned int in_col_stride,
- T* const outptr,
- const unsigned int out_row_stride,
- const unsigned int out_col_stride,
- const unsigned int pad_top,
- const unsigned int pad_left,
- const unsigned int pad_bottom,
- const unsigned int pad_right,
- const T pad_value
-)
-{
- for (unsigned int out_i = 0; out_i < tile_rows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < tile_cols; out_j++)
- {
- T* const output = outptr + out_i*out_row_stride + out_j*out_col_stride;
-
- if (out_i < pad_top || tile_rows - pad_bottom <= out_i ||
- out_j < pad_left || tile_cols - pad_right <= out_j)
- {
- for (unsigned int n = 0; n < n_channels; n++)
- {
- output[n] = pad_value;
- }
- }
- else
- {
- const auto in_i = out_i - pad_top, in_j = out_j - pad_left;
- const T* const input = inptr + in_i*in_row_stride + in_j*in_col_stride;
- std::memcpy(output, input, n_channels * sizeof(T));
- }
- }
- }
-}
-
-template void copy_and_pad_tile(
- unsigned int, unsigned int, unsigned int,
- const uint8_t *, unsigned int, unsigned int,
- uint8_t *, unsigned int, unsigned int,
- unsigned int, unsigned int, unsigned int, unsigned int, uint8_t
-);
-
-template void copy_and_pad_tile(
- unsigned int, unsigned int, unsigned int,
- const float *, unsigned int, unsigned int,
- float *, unsigned int, unsigned int,
- unsigned int, unsigned int, unsigned int, unsigned int, float
-);
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template void copy_and_pad_tile(
- unsigned int, unsigned int, unsigned int,
- const __fp16 *, unsigned int, unsigned int,
- __fp16 *, unsigned int, unsigned int,
- unsigned int, unsigned int, unsigned int, unsigned int, __fp16
-);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <unsigned int TileRows, unsigned int TileCols>
-void CopyCropped<TileRows, TileCols>::execute(
- const size_t size,
- const void * const inptr,
- const size_t in_row_stride,
- const size_t in_col_stride,
- void * const outptr,
- const size_t out_row_stride,
- const size_t out_col_stride,
- const unsigned int pad_top,
- const unsigned int pad_left,
- const unsigned int pad_bottom,
- const unsigned int pad_right
-)
-{
- for (unsigned int out_i = 0, in_i = pad_top; in_i < TileRows - pad_bottom; out_i++, in_i++)
- {
- for (unsigned int out_j = 0, in_j = pad_left; in_j < TileCols - pad_right; out_j++, in_j++)
- {
- std::memcpy(
- static_cast<uint8_t *>(outptr) + out_i*out_row_stride + out_j*out_col_stride,
- static_cast<const uint8_t *>(inptr) + in_i*in_row_stride + in_j*in_col_stride,
- size
- );
- }
- }
-}
-
-template class CopyCropped<2, 2>;
-template class CopyCropped<3, 3>;
-template class CopyCropped<4, 4>;
-
-template <typename T>
-void crop_and_copy_tile(
- unsigned int tile_rows,
- unsigned int tile_cols,
- unsigned int n_channels,
- const T *inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- T *outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride,
- unsigned int crop_top,
- unsigned int crop_left,
- unsigned int crop_bottom,
- unsigned int crop_right
-)
-{
- for (unsigned int out_i = 0, in_i = crop_top; in_i < tile_rows - crop_bottom; out_i++, in_i++)
- {
- for (unsigned int out_j = 0, in_j = crop_left; in_j < tile_cols - crop_right; out_j++, in_j++)
- {
- std::memcpy(
- outptr + out_i*out_row_stride + out_j*out_col_stride,
- inptr + in_i*in_row_stride + in_j*in_col_stride,
- sizeof(T) * n_channels
- );
- }
- }
-}
-
-template void crop_and_copy_tile(
- unsigned int tile_rows,
- unsigned int tile_cols,
- unsigned int n_channels,
- const float *inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- float *outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride,
- unsigned int crop_top,
- unsigned int crop_left,
- unsigned int crop_bottom,
- unsigned int crop_right
-);
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template void crop_and_copy_tile(
- unsigned int tile_rows,
- unsigned int tile_cols,
- unsigned int n_channels,
- const __fp16 *inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- __fp16 *outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride,
- unsigned int crop_top,
- unsigned int crop_left,
- unsigned int crop_bottom,
- unsigned int crop_right
-);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-} // namespace padding
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp b/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp
new file mode 100644
index 0000000000..5569bc1b89
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "winograd.hpp"
+#include <algorithm>
+#include <functional>
+
+namespace arm_conv {
+namespace winograd {
+namespace weight_transform {
+
+/* Driver class for the Winograd weight transforms.
+ */
+template <typename TIn, typename TOut=TIn>
+class Transform : public ITransform
+{
+ using Kernel = std::function<void(
+ unsigned int n_channels, // Number of channels to transform
+ const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
+ TOut *outptr, size_t ld_out_matrix
+ )>;
+
+ const std::string m_name;
+ const unsigned int m_kernel_rows, m_kernel_cols;
+ const unsigned int m_transformed_tile_rows, m_transformed_tile_cols;
+ const Kernel m_kernel;
+
+ void execute_internal(
+ const ConvolutionArgs &args,
+ const TIn *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
+ TOut *outptr, size_t ld_out_matrix, size_t ld_out_row,
+ unsigned int thread_id, unsigned int n_threads
+ ) const
+ {
+ // Stripe groups of input channels over the threads; this should reduce
+ // false sharing of the output matrix.
+ constexpr auto n_input_channels_per_thread = 16u;
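+ // For example, with n_threads = 2 and 48 input channels, thread 0 handles
+ // channels [0, 16) and [32, 48) while thread 1 handles [16, 32).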
+
+ // Get the initial offset for the input and output pointers
+ const auto offset = thread_id * n_input_channels_per_thread;
+ inptr += offset * ld_input_channel;
+ outptr += offset * ld_out_row;
+
+ for (auto start_ic = thread_id * n_input_channels_per_thread;
+ start_ic < args.n_input_channels;
+ start_ic += n_threads * n_input_channels_per_thread)
+ {
+ // Now iterate over the input channels assigned to this thread.
+ const auto end_ic = std::min(args.n_input_channels,
+ start_ic + n_input_channels_per_thread);
+ for (auto ic = start_ic; ic < end_ic; ic++)
+ {
+ m_kernel(args.n_output_channels, inptr, ld_in_row, ld_in_col,
+ outptr, ld_out_matrix);
+ inptr += ld_input_channel;
+ outptr += ld_out_row;
+ }
+
+ // Advance the pointers to account for the work not performed by
+ // this thread.
+ const auto skip = (n_threads - 1) * n_input_channels_per_thread;
+ inptr += skip * ld_input_channel;
+ outptr += skip * ld_out_row;
+ }
+ }
+
+ public:
+ Transform(
+ const std::string &name,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int transformed_tile_rows, unsigned int transformed_tile_cols,
+ const Kernel kernel
+ )
+ : m_name(name),
+ m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
+ m_transformed_tile_rows(transformed_tile_rows), m_transformed_tile_cols(transformed_tile_cols),
+ m_kernel(kernel)
+ {
+ }
+
+ const std::string &get_name(void) const override { return m_name; }
+
+ unsigned int get_kernel_rows(void) const override { return m_kernel_rows; }
+ unsigned int get_kernel_cols(void) const override { return m_kernel_cols; }
+
+ unsigned int get_transformed_tile_rows(void) const override { return m_transformed_tile_rows; }
+ unsigned int get_transformed_tile_cols(void) const override { return m_transformed_tile_cols; }
+
+ void execute(
+ const ConvolutionArgs &args,
+ const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
+ void *outptr, size_t ld_out_matrix, size_t ld_out_row,
+ unsigned int thread_id, unsigned int n_threads
+ ) const override
+ {
+ execute_internal(
+ args,
+ reinterpret_cast<const TIn *>(inptr), ld_in_row, ld_in_col, ld_input_channel,
+ reinterpret_cast<TOut *>(outptr), ld_out_matrix, ld_out_row,
+ thread_id, n_threads
+ );
+ }
+
+ /* Utility method to get a transposed variant of a kernel; the transposed
+  * version simply calls the original kernel with the input row and column
+  * strides swapped. This allows, for example, a transform written for a
+  * 1x3 kernel to be reused for the 3x1 case.
+  */
+ static constexpr Kernel get_transposed_kernel(const Kernel &kernel)
+ {
+ return [kernel] (
+ const unsigned int n_channels,
+ const TIn *const inptr, const size_t ld_in_row, const size_t ld_in_col,
+ TOut *const outptr, const size_t ld_out
+ ) {
+ kernel(n_channels, inptr, ld_in_col, ld_in_row, outptr, ld_out);
+ };
+ }
+};
+
+} // namespace weight_transform
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp
new file mode 100644
index 0000000000..0d9a65890e
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+#include <cstddef>
+#include <arm_neon.h>
+
+namespace arm_conv {
+namespace winograd {
+namespace weight_transform {
+
+void a64_fp16_4x4_3x3(
+ unsigned int n_channels,
+ const __fp16* inptr, // NOTE: Data in HWIO order
+ const size_t ld_weight_row,
+ const size_t ld_weight_col,
+ __fp16* outptr,
+ const size_t matrix_stride
+)
+{
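+ // The row formulas below implement V = (G w G^T) / 576 for the F(4x4, 3x3)
+ // Winograd transform, where
+ //   G = [  6   0   0 ]
+ //       [ -4  -4  -4 ]
+ //       [ -4   4  -4 ]
+ //       [  1   2   4 ]
+ //       [  1  -2   4 ]
+ //       [  0   0  24 ]
+ // and the 1/576 = (1/24)^2 factor restores the conventional scaling of the
+ // two scaled-integer G applications.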
+#ifdef __aarch64__
+ for (; n_channels >= 8; n_channels -= 8)
+ {
+ // Matrices used and computed in this kernel
+ float16x8_t w[3][3], Ww[6][3], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1q_f16(inptr + i*ld_weight_row + j*ld_weight_col);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ // Ww[0][j] = 6*w[0][j];
+ Ww[0][j] = vmulq_n_f16(w[0][j], 6.0);
+
+ // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+ Ww[1][j] = vmulq_n_f16(vaddq_f16(vaddq_f16(w[0][j], w[1][j]), w[2][j]), -4.0);
+
+ // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
+ Ww[2][j] = vmulq_n_f16(vsubq_f16(vsubq_f16(w[1][j], w[0][j]), w[2][j]), 4.0);
+
+ // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
+ Ww[3][j] = vaddq_f16(vaddq_f16(w[0][j], vmulq_f16(w[1][j], vdupq_n_f16(2.0f))), vmulq_f16(w[2][j], vdupq_n_f16(4.0f)));
+
+ // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
+ Ww[4][j] = vaddq_f16(vsubq_f16(w[0][j], vmulq_f16(w[1][j], vdupq_n_f16(2.0f))), vmulq_f16(w[2][j], vdupq_n_f16(4.0f)));
+
+ // Ww[5][j] = 24*w[2][j];
+ Ww[5][j] = vmulq_n_f16(w[2][j], 24.0f);
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ const float recip576 = 1.0f / 576.0f;
+
+ // V[i][0] = 6*Ww[i][0];
+ V[i][0] = vmulq_n_f16(vmulq_n_f16(Ww[i][0], 6.0), recip576);
+
+ // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
+ V[i][1] = vmulq_n_f16(vmulq_n_f16(vaddq_f16(vaddq_f16(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
+
+ // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
+ V[i][2] = vmulq_n_f16(vmulq_n_f16(vsubq_f16(vsubq_f16(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
+
+ // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
+ V[i][3] = vmulq_n_f16(vaddq_f16(vaddq_f16(Ww[i][0], vmulq_f16(Ww[i][1], vdupq_n_f16(2.0f))), vmulq_f16(Ww[i][2], vdupq_n_f16(4.0f))), recip576);
+
+ // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
+ V[i][4] = vmulq_n_f16(vaddq_f16(vsubq_f16(Ww[i][0], vmulq_f16(Ww[i][1], vdupq_n_f16(2.0f))), vmulq_f16(Ww[i][2], vdupq_n_f16(4.0f))), recip576);
+
+ // V[i][5] = 24*Ww[i][2];
+ V[i][5] = vmulq_n_f16(vmulq_n_f16(Ww[i][2], 24.0f), recip576);
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1q_f16(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+ inptr += 8;
+ outptr += 8;
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; n_channels >= 4; n_channels -= 4)
+ {
+ // Matrices used and computed in this kernel
+ float16x4_t w[3][3], Ww[6][3], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1_f16(inptr + i*ld_weight_row + j*ld_weight_col);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ // Ww[0][j] = 6*w[0][j];
+ Ww[0][j] = vmul_n_f16(w[0][j], 6.0);
+
+ // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+ Ww[1][j] = vmul_n_f16(vadd_f16(vadd_f16(w[0][j], w[1][j]), w[2][j]), -4.0);
+
+ // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
+ Ww[2][j] = vmul_n_f16(vsub_f16(vsub_f16(w[1][j], w[0][j]), w[2][j]), 4.0);
+
+ // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
+ Ww[3][j] = vadd_f16(vadd_f16(w[0][j], vmul_f16(w[1][j], vdup_n_f16(2.0f))), vmul_f16(w[2][j], vdup_n_f16(4.0f)));
+
+ // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
+ Ww[4][j] = vadd_f16(vsub_f16(w[0][j], vmul_f16(w[1][j], vdup_n_f16(2.0f))), vmul_f16(w[2][j], vdup_n_f16(4.0f)));
+
+ // Ww[5][j] = 24*w[2][j];
+ Ww[5][j] = vmul_n_f16(w[2][j], 24.0f);
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ const float recip576 = 1.0f / 576.0f;
+
+ // V[i][0] = 6*Ww[i][0];
+ V[i][0] = vmul_n_f16(vmul_n_f16(Ww[i][0], 6.0), recip576);
+
+ // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
+ V[i][1] = vmul_n_f16(vmul_n_f16(vadd_f16(vadd_f16(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
+
+ // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
+ V[i][2] = vmul_n_f16(vmul_n_f16(vsub_f16(vsub_f16(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
+
+ // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
+ V[i][3] = vmul_n_f16(vadd_f16(vadd_f16(Ww[i][0], vmul_f16(Ww[i][1], vdup_n_f16(2.0f))), vmul_f16(Ww[i][2], vdup_n_f16(4.0f))), recip576);
+
+ // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
+ V[i][4] = vmul_n_f16(vadd_f16(vsub_f16(Ww[i][0], vmul_f16(Ww[i][1], vdup_n_f16(2.0f))), vmul_f16(Ww[i][2], vdup_n_f16(4.0f))), recip576);
+
+ // V[i][5] = 24*Ww[i][2];
+ V[i][5] = vmul_n_f16(vmul_n_f16(Ww[i][2], 24.0f), recip576);
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1_f16(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+ inptr += 4;
+ outptr += 4;
+ }
+#endif // __arm_any__
+ for (; n_channels; n_channels--)
+ {
+ // Matrices used and computed in this kernel
+ __fp16 w[3][3], Ww[6][3], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = *(inptr + i*ld_weight_row + j*ld_weight_col);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ Ww[0][j] = 6*w[0][j];
+ Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+ Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
+ Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
+ Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
+ Ww[5][j] = 24*w[2][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ V[i][0] = ( 6*Ww[i][0]) / 576.0;
+ V[i][1] = (-4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
+ V[i][2] = (-4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
+ V[i][3] = ( 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
+ V[i][4] = ( 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
+ V[i][5] = (24*Ww[i][2]) / 576.0;
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = V[i][j];
+ }
+ }
+
+ inptr++;
+ outptr++;
+ }
+}
+
+} // namespace weight_transform
+} // namespace winograd
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_3x3.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_3x3.cpp
new file mode 100644
index 0000000000..ebfe03e6d9
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_3x3.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <arm_neon.h>
+
+namespace arm_conv {
+namespace winograd {
+namespace weight_transform {
+
+void arm_fp32_2x2_3x3(
+ unsigned int n_channels,
+ const float *inptr, size_t ld_weight_row, size_t ld_weight_col,
+ float *outptr, size_t matrix_stride
+)
+{
+ constexpr auto inner_tile_i = 4u;
+ constexpr auto inner_tile_j = 4u;
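+
+ // F(2x2, 3x3): the transformed tile is (2 + 3 - 1) x (2 + 3 - 1) = 4x4 and
+ // V = G w G^T, with the rows of G being (1, 0, 0), (1/2, 1/2, 1/2),
+ // (1/2, -1/2, 1/2) and (0, 0, 1), matching the per-row formulas below.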
+
+#ifdef __aarch64__
+ // For each output channel
+ for (; n_channels >= 4u; n_channels -= 4)
+ {
+ // Matrices used and computed in this kernel
+ float32x4_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1q_f32(inptr + i*ld_weight_row + j*ld_weight_col);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ Ww[0][j] = w[0][j];
+
+ // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
+ Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+ // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
+ Ww[2][j] = vmulq_n_f32(vaddq_f32(vsubq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+ Ww[3][j] = w[2][j];
+ }
+
+ // Compute V = W w WT
+ for (auto i = 0u; i < inner_tile_i; i++)
+ {
+ V[i][0] = Ww[i][0];
+
+ // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
+ V[i][1] = vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+ // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
+ V[i][2] = vmulq_n_f32(vaddq_f32(vsubq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+ V[i][3] = Ww[i][2];
+ }
+
+ // Store the transformed weights
+ for (auto i = 0u, m = 0u; i < inner_tile_i; i++)
+ {
+ for (auto j = 0u; j < inner_tile_j; j++, m++)
+ {
+ vst1q_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+
+ inptr += 4;
+ outptr += 4;
+ }
+#endif // __aarch64__
+ for (; n_channels >= 2u; n_channels -= 2)
+ {
+ // Matrices used and computed in this kernel
+ float32x2_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1_f32(inptr + i*ld_weight_row + j*ld_weight_col);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ Ww[0][j] = w[0][j];
+
+ // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
+ Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+ // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
+ Ww[2][j] = vmul_n_f32(vadd_f32(vsub_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+ Ww[3][j] = w[2][j];
+ }
+
+ // Compute V = W w WT
+ for (auto i = 0u; i < inner_tile_i; i++)
+ {
+ V[i][0] = Ww[i][0];
+
+ // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
+ V[i][1] = vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+ // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
+ V[i][2] = vmul_n_f32(vadd_f32(vsub_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+ V[i][3] = Ww[i][2];
+ }
+
+ // Store the transformed weights
+ for (auto i = 0u, m = 0u; i < inner_tile_i; i++)
+ {
+ for (auto j = 0u; j < inner_tile_j; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+
+ inptr += 2;
+ outptr += 2;
+ }
+ for (; n_channels; n_channels--)
+ {
+ // Matrices used and computed in this kernel
+ float w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = *(inptr + i*ld_weight_row + j*ld_weight_col);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ Ww[0][j] = w[0][j];
+ Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
+ Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
+ Ww[3][j] = w[2][j];
+ }
+
+ // Compute V = W w WT
+ for (auto i = 0u; i < inner_tile_i; i++)
+ {
+ V[i][0] = Ww[i][0];
+ V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
+ V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
+ V[i][3] = Ww[i][2];
+ }
+
+ // Store the transformed weights
+ for (auto i = 0u, m = 0u; i < inner_tile_i; i++)
+ {
+ for (auto j = 0u; j < inner_tile_j; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = V[i][j];
+ }
+ }
+
+ inptr++;
+ outptr++;
+ }
+}
+
+} // namespace weight_transform
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_5x5.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_5x5.cpp
new file mode 100644
index 0000000000..3b09218646
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_5x5.cpp
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <arm_neon.h>
+
+namespace arm_conv {
+namespace winograd {
+namespace weight_transform {
+
+void arm_fp32_2x2_5x5(
+ unsigned int n_channels,
+ const float *inptr, const size_t ld_weight_row, const size_t ld_weight_col,
+ float *outptr, const size_t matrix_stride
+)
+{
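+ // F(2x2, 5x5): the transformed tile is (2 + 5 - 1) x (2 + 5 - 1) = 6x6;
+ // V = G w G^T, with the rows of G being (1/4, 0, 0, 0, 0),
+ // -(1, 1, 1, 1, 1)/6, (-1, 1, -1, 1, -1)/6, (1/8, 1/4, 1/2, 1, 2)/3,
+ // (1/8, -1/4, 1/2, -1, 2)/3 and (0, 0, 0, 0, 1).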
+#ifdef __aarch64__
+ // For each output channel
+ for (; n_channels >= 4; n_channels -= 4)
+ {
+ // Matrices used and computed in this kernel
+ float32x4_t w[5][5], Ww[6][5], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 5; i++)
+ {
+ for (int j = 0; j < 5; j++)
+ {
+ w[i][j] = vld1q_f32(inptr + i*ld_weight_row + j*ld_weight_col);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 5; j++)
+ {
+ // Ww[0][j] = w[0][j]/4.0f;
+ Ww[0][j] = vmulq_n_f32(w[0][j], 1.0f/4.0f);
+
+ // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
+ Ww[1][j] = vmulq_n_f32(
+ vaddq_f32(
+ vaddq_f32(
+ vaddq_f32(w[1][j], w[0][j]),
+ vaddq_f32(w[3][j], w[2][j])
+ ),
+ w[4][j]
+ ),
+ -1.0f/6.0f
+ );
+
+ // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
+ // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
+ Ww[2][j] = vmulq_n_f32(
+ vsubq_f32(
+ vaddq_f32(
+ vsubq_f32(w[1][j], w[0][j]),
+ vsubq_f32(w[3][j], w[2][j])
+ ),
+ w[4][j]
+ ),
+ 1.0f/6.0f
+ );
+
+ // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
+ Ww[3][j] = vmulq_n_f32(
+ vmlaq_n_f32(
+ vaddq_f32(
+ vaddq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
+ vaddq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+ ),
+ w[4][j], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
+ Ww[4][j] = vmulq_n_f32(
+ vmlaq_n_f32(
+ vaddq_f32(
+ vsubq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
+ vsubq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+ ),
+ w[4][j], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // Ww[5][j] = w[4][j];
+ Ww[5][j] = w[4][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ // V[i][0] = Ww[i][0]/4.0f;
+ V[i][0] = vmulq_n_f32(Ww[i][0], 1.0f/4.0f);
+
+ // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
+ V[i][1] = vmulq_n_f32(
+ vaddq_f32(
+ vaddq_f32(
+ vaddq_f32(Ww[i][1], Ww[i][0]),
+ vaddq_f32(Ww[i][3], Ww[i][2])
+ ),
+ Ww[i][4]
+ ),
+ -1.0f/6.0f
+ );
+
+ // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
+ // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
+ V[i][2] = vmulq_n_f32(
+ vsubq_f32(
+ vaddq_f32(
+ vsubq_f32(Ww[i][1], Ww[i][0]),
+ vsubq_f32(Ww[i][3], Ww[i][2])
+ ),
+ Ww[i][4]
+ ),
+ 1.0f/6.0f
+ );
+
+ // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
+ V[i][3] = vmulq_n_f32(
+ vmlaq_n_f32(
+ vaddq_f32(
+ vaddq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
+ vaddq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+ ),
+ Ww[i][4], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
+ V[i][4] = vmulq_n_f32(
+ vmlaq_n_f32(
+ vaddq_f32(
+ vsubq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
+ vsubq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+ ),
+ Ww[i][4], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // V[i][5] = Ww[i][4];
+ V[i][5] = Ww[i][4];
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1q_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+
+ inptr += 4;
+ outptr += 4;
+ }
+#endif // __aarch64__
+ for (; n_channels >= 2; n_channels -= 2)
+ {
+ // Matrices used and computed in this kernel
+ float32x2_t w[5][5], Ww[6][5], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 5; i++)
+ {
+ for (int j = 0; j < 5; j++)
+ {
+ w[i][j] = vld1_f32(inptr + i*ld_weight_row + j*ld_weight_col);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 5; j++)
+ {
+ // Ww[0][j] = w[0][j]/4.0f;
+ Ww[0][j] = vmul_n_f32(w[0][j], 1.0f/4.0f);
+
+ // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
+ Ww[1][j] = vmul_n_f32(
+ vadd_f32(
+ vadd_f32(
+ vadd_f32(w[1][j], w[0][j]),
+ vadd_f32(w[3][j], w[2][j])
+ ),
+ w[4][j]
+ ),
+ -1.0f/6.0f
+ );
+
+ // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
+ // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
+ Ww[2][j] = vmul_n_f32(
+ vsub_f32(
+ vadd_f32(
+ vsub_f32(w[1][j], w[0][j]),
+ vsub_f32(w[3][j], w[2][j])
+ ),
+ w[4][j]
+ ),
+ 1.0f/6.0f
+ );
+
+ // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
+ Ww[3][j] = vmul_n_f32(
+ vmla_n_f32(
+ vadd_f32(
+ vadd_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
+ vadd_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+ ),
+ w[4][j], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
+ Ww[4][j] = vmul_n_f32(
+ vmla_n_f32(
+ vadd_f32(
+ vsub_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
+ vsub_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+ ),
+ w[4][j], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // Ww[5][j] = w[4][j];
+ Ww[5][j] = w[4][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ // V[i][0] = Ww[i][0]/4.0f;
+ V[i][0] = vmul_n_f32(Ww[i][0], 1.0f/4.0f);
+
+ // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
+ V[i][1] = vmul_n_f32(
+ vadd_f32(
+ vadd_f32(
+ vadd_f32(Ww[i][1], Ww[i][0]),
+ vadd_f32(Ww[i][3], Ww[i][2])
+ ),
+ Ww[i][4]
+ ),
+ -1.0f/6.0f
+ );
+
+ // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
+ // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
+ V[i][2] = vmul_n_f32(
+ vsub_f32(
+ vadd_f32(
+ vsub_f32(Ww[i][1], Ww[i][0]),
+ vsub_f32(Ww[i][3], Ww[i][2])
+ ),
+ Ww[i][4]
+ ),
+ 1.0f/6.0f
+ );
+
+ // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
+ V[i][3] = vmul_n_f32(
+ vmla_n_f32(
+ vadd_f32(
+ vadd_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
+ vadd_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+ ),
+ Ww[i][4], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
+ V[i][4] = vmul_n_f32(
+ vmla_n_f32(
+ vadd_f32(
+ vsub_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
+ vsub_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+ ),
+ Ww[i][4], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // V[i][5] = Ww[i][4];
+ V[i][5] = Ww[i][4];
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+
+ inptr += 2;
+ outptr += 2;
+ }
+ for (; n_channels; n_channels--)
+ {
+ // Matrices used and computed in this kernel
+ float w[5][5], Ww[6][5], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 5; i++)
+ {
+ for (int j = 0; j < 5; j++)
+ {
+ w[i][j] = *(inptr + i*ld_weight_row + j*ld_weight_col);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 5; j++)
+ {
+ Ww[0][j] = w[0][j]/4.0f;
+ Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
+ Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
+ Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
+ Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
+ Ww[5][j] = w[4][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ V[i][0] = Ww[i][0]/4.0f;
+ V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
+ V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
+ V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
+ V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
+ V[i][5] = Ww[i][4];
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = V[i][j];
+ }
+ }
+
+ inptr++;
+ outptr++;
+ }
+}
+
+} // namespace weight_transform
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_4x4_3x3.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_4x4_3x3.cpp
new file mode 100644
index 0000000000..aad88caff8
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_4x4_3x3.cpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <arm_neon.h>
+
+namespace arm_conv {
+namespace winograd {
+namespace weight_transform {
+
+void arm_fp32_4x4_3x3(
+ unsigned int n_channels,
+ const float *inptr, const size_t ld_weight_row, const size_t ld_weight_col,
+ float *outptr, const size_t matrix_stride
+)
+{
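+ // Implements V = (G w G^T) / 576 for F(4x4, 3x3); the rows of G are
+ // (6, 0, 0), (-4, -4, -4), (-4, 4, -4), (1, 2, 4), (1, -2, 4) and
+ // (0, 0, 24), and the 1*w0 + 2*w1 + 4*w2 rows below are expressed with
+ // fused multiply-accumulate intrinsics (vmlaq_n_f32 / vmlsq_n_f32).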
+#ifdef __aarch64__
+ for (; n_channels >= 4; n_channels -= 4)
+ {
+ // Matrices used and computed in this kernel
+ float32x4_t w[3][3], Ww[6][3], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1q_f32(inptr + i*ld_weight_row + j*ld_weight_col);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ // Ww[0][j] = 6*w[0][j];
+ Ww[0][j] = vmulq_n_f32(w[0][j], 6.0);
+
+ // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+ Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), -4.0);
+
+ // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
+ Ww[2][j] = vmulq_n_f32(vsubq_f32(vsubq_f32(w[1][j], w[0][j]), w[2][j]), 4.0);
+
+ // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
+ Ww[3][j] = vmlaq_n_f32(vmlaq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+ // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
+ Ww[4][j] = vmlaq_n_f32(vmlsq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+ // Ww[5][j] = 24*w[2][j];
+ Ww[5][j] = vmulq_n_f32(w[2][j], 24.0f);
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ const float recip576 = 1.0f / 576.0f;
+
+ // V[i][0] = 6*Ww[i][0];
+ V[i][0] = vmulq_n_f32(vmulq_n_f32(Ww[i][0], 6.0), recip576);
+
+ // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
+ V[i][1] = vmulq_n_f32(vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
+
+ // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
+ V[i][2] = vmulq_n_f32(vmulq_n_f32(vsubq_f32(vsubq_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
+
+ // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
+ V[i][3] = vmulq_n_f32(vmlaq_n_f32(vmlaq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+ // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
+ V[i][4] = vmulq_n_f32(vmlaq_n_f32(vmlsq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+ // V[i][5] = 24*Ww[i][2];
+ V[i][5] = vmulq_n_f32(vmulq_n_f32(Ww[i][2], 24.0f), recip576);
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1q_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+
+ inptr += 4;
+ outptr += 4;
+ }
+#endif // __aarch64__
+ for (; n_channels >= 2; n_channels -= 2)
+ {
+ // Matrices used and computed in this kernel
+ float32x2_t w[3][3], Ww[6][3], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1_f32(inptr + i*ld_weight_row + j*ld_weight_col);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ // Ww[0][j] = 6*w[0][j];
+ Ww[0][j] = vmul_n_f32(w[0][j], 6.0);
+
+ // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+ Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), -4.0);
+
+ // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
+ Ww[2][j] = vmul_n_f32(vsub_f32(vsub_f32(w[1][j], w[0][j]), w[2][j]), 4.0);
+
+ // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
+ Ww[3][j] = vmla_n_f32(vmla_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+ // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
+ Ww[4][j] = vmla_n_f32(vmls_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+ // Ww[5][j] = 24*w[2][j];
+ Ww[5][j] = vmul_n_f32(w[2][j], 24.0f);
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ const float recip576 = 1.0f / 576.0f;
+
+ // V[i][0] = 6*Ww[i][0];
+ V[i][0] = vmul_n_f32(vmul_n_f32(Ww[i][0], 6.0), recip576);
+
+ // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
+ V[i][1] = vmul_n_f32(vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
+
+ // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
+ V[i][2] = vmul_n_f32(vmul_n_f32(vsub_f32(vsub_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
+
+ // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
+ V[i][3] = vmul_n_f32(vmla_n_f32(vmla_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+ // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
+ V[i][4] = vmul_n_f32(vmla_n_f32(vmls_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+ // V[i][5] = 24*Ww[i][2];
+ V[i][5] = vmul_n_f32(vmul_n_f32(Ww[i][2], 24.0f), recip576);
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+
+ inptr += 2;
+ outptr += 2;
+ }
+ for (; n_channels; n_channels--)
+ {
+ // Matrices used and computed in this kernel
+ float w[3][3], Ww[6][3], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = *(inptr + i*ld_weight_row + j*ld_weight_col);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ Ww[0][j] = 6*w[0][j];
+ Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+ Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
+ Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
+ Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
+ Ww[5][j] = 24*w[2][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ V[i][0] = ( 6*Ww[i][0]) / 576.0;
+ V[i][1] = (-4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
+ V[i][2] = (-4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
+ V[i][3] = ( 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
+ V[i][4] = ( 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
+ V[i][5] = (24*Ww[i][2]) / 576.0;
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = V[i][j];
+ }
+ }
+
+ inptr++;
+ outptr++;
+ }
+}
+
+} // namespace weight_transform
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x2_1x7.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x2_1x7.cpp
new file mode 100644
index 0000000000..ee657b01cd
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x2_1x7.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+
+namespace arm_conv {
+namespace winograd {
+namespace weight_transform {
+
+void cpp_fp32_1x2_1x7(
+ unsigned int n_channels,
+ const float* inptr, size_t, size_t ld_weight_col,
+ float *outptr, size_t matrix_stride
+)
+{
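+ // 1D variant F(1x2, 1x7): the inner tile has 2 + 7 - 1 = 8 points. Up to
+ // sign and the per-point scale factors (1/36, 1/48, ..., 1), each V[k]
+ // below evaluates the degree-6 weight polynomial sum_j w[j] * x^j at one
+ // of the sample points x = 0, +/-1, +/-2, +/-3 and infinity.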
+ for (; n_channels; n_channels--)
+ {
+ // Matrices used and computed in this kernel
+ float w[7], V[8];
+
+ // Read weights
+ for (int j = 0; j < 7; j++)
+ {
+ w[j] = *(inptr + j*ld_weight_col);
+ }
+
+ // Compute V = w WT
+ V[0] = (w[0]*-1) / 36.0f;
+ V[1] = (w[1]*-1 + w[3]*-1 + w[5]*-1 + w[0]*1 + w[2]*1 + w[4]*1 + w[6]*1) / 48.0f;
+ V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1 + w[5]*1 + w[6]*1) / 48.0f;
+ V[3] = (w[0]*-1 + w[6]*-64 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8 + w[5]*32) / 120.0f;
+ V[4] = (w[0]*-1 + w[6]*-64 + w[5]*-32 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120.0f;
+ V[5] = (w[5]*-243 + w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[6]*729 + w[0]*1) / 720.0f;
+ V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[5]*243 + w[6]*729 + w[0]*1) / 720.0f;
+ V[7] = (w[6]*1) / 1.0f;
+
+ // Store the transformed weights
+ for (int j = 0; j < 8; j++)
+ {
+ *(outptr + j*matrix_stride) = V[j];
+ }
+
+ inptr++;
+ outptr++;
+ }
+}
+
+} // namespace weight_transform
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x4_1x5.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x4_1x5.cpp
new file mode 100644
index 0000000000..47a85e306d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x4_1x5.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+
+namespace arm_conv {
+namespace winograd {
+namespace weight_transform {
+
+void cpp_fp32_1x4_1x5(
+ unsigned int n_channels,
+ const float *inptr,
+ size_t, // ld_weight_row
+ size_t ld_weight_col,
+ float *outptr,
+ size_t matrix_stride
+)
+{
+ constexpr auto kernel_cols = 5u, inner_tile_cols = 8u;
+
+ // For each output channel
+ for (; n_channels; n_channels--)
+ {
+ // Matrices used and computed in this kernel
+ float w[kernel_cols], V[inner_tile_cols];
+
+ // Read weights
+ for (auto j = 0u; j < kernel_cols; j++)
+ {
+ w[j] = *(inptr + j * ld_weight_col);
+ }
+
+ // Compute V = w WT
+ V[0] = (w[0]*-1) / 36;
+ V[1] = (w[1]*-1 + w[3]*-1 + w[0]*1 + w[2]*1 + w[4]*1) / 48;
+ V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1) / 48;
+ V[3] = (w[0]*-1 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8) / 120;
+ V[4] = (w[0]*-1 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120;
+ V[5] = (w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[0]*1) / 720;
+ V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[0]*1) / 720;
+ V[7] = (w[4]*1) / 1;
+
+ // Store the transformed weights
+ for (auto j = 0u; j < inner_tile_cols; j++)
+ {
+ *(outptr + j*matrix_stride) = V[j];
+ }
+
+ inptr++;
+ outptr++;
+ }
+}
+
+} // namespace weight_transform
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x6_1x3.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x6_1x3.cpp
new file mode 100644
index 0000000000..22bb85e788
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transforms/cpp_fp32_1x6_1x3.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+
+namespace arm_conv {
+namespace winograd {
+namespace weight_transform {
+
+void cpp_fp32_1x6_1x3(
+ unsigned int n_channels,
+ const float *inptr, size_t, size_t ld_weight_col,
+ float *outptr, size_t matrix_stride
+)
+{
+ for (; n_channels; n_channels--)
+ {
+ // Matrices used and computed in this kernel
+ float w[3], V[8];
+
+ // Read weights
+ for (int j = 0; j < 3; j++)
+ {
+ w[j] = *(inptr + j * ld_weight_col);
+ }
+
+ // Compute V = w WT
+ V[0] = (w[0]*-1) / 36.0f;
+ V[1] = (w[1]*-1 + w[0]*1 + w[2]*1) / 48.0f;
+ V[2] = (w[0]*1 + w[1]*1 + w[2]*1) / 48.0f;
+ V[3] = (w[0]*-1 + w[2]*-4 + w[1]*2) / 120.0f;
+ V[4] = (w[0]*-1 + w[2]*-4 + w[1]*-2) / 120.0f;
+ V[5] = (w[1]*-3 + w[2]*9 + w[0]*1) / 720.0f;
+ V[6] = (w[1]*3 + w[2]*9 + w[0]*1) / 720.0f;
+ V[7] = (w[2]*1) / 1.0f;
+
+ // Store the transformed weights
+ for (int j = 0; j < 8; j++)
+ {
+ *(outptr + j*matrix_stride) = V[j];
+ }
+
+ inptr++;
+ outptr++;
+ }
+}
+
+} // namespace weight_transform
+} // namespace winograd
+} // namespace arm_conv
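The eight expressions above are exactly the rows of the F(6,3) weight-transform matrix G applied to the 3-tap filter w. A small cross-check (a sketch, assuming linkage against the file above; G is transcribed directly from the V[0..7] expressions):

#include <cassert>
#include <cmath>
#include <cstddef>

namespace arm_conv { namespace winograd { namespace weight_transform {
void cpp_fp32_1x6_1x3(unsigned int, const float *, size_t, size_t, float *, size_t);
} } } // namespace arm_conv::winograd::weight_transform

int main()
{
    // Rows of G, read off the V[0..7] expressions in cpp_fp32_1x6_1x3.
    const float G[8][3] = {
        {-1.f / 36, 0.f, 0.f},
        {1.f / 48, -1.f / 48, 1.f / 48},
        {1.f / 48, 1.f / 48, 1.f / 48},
        {-1.f / 120, 2.f / 120, -4.f / 120},
        {-1.f / 120, -2.f / 120, -4.f / 120},
        {1.f / 720, -3.f / 720, 9.f / 720},
        {1.f / 720, 3.f / 720, 9.f / 720},
        {0.f, 0.f, 1.f},
    };

    const float w[3] = {0.25f, -1.5f, 2.0f};
    float V[8];
    arm_conv::winograd::weight_transform::cpp_fp32_1x6_1x3(1, w, 0, 1, V, 1);

    // Each transformed weight must equal the dot product of a row of G with w.
    for (int j = 0; j < 8; j++)
    {
        const float ref = G[j][0]*w[0] + G[j][1]*w[1] + G[j][2]*w[2];
        assert(std::fabs(V[j] - ref) < 1e-6f);
    }
    return 0;
}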
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp
new file mode 100644
index 0000000000..6c8bbe07cf
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+#include "winograd_implementations.hpp"
+#include "weight_transform.hpp"
+
+namespace arm_conv {
+namespace winograd {
+namespace weight_transform {
+
+void a64_fp16_4x4_3x3(unsigned int, const __fp16 *, size_t, size_t, __fp16 *, size_t);
+
+#define IMPL(KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, KERN) \
+ new Transform<__fp16>(#KERN, KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, KERN)
+
+static const TransformImplementation<__fp16> transforms_fp16[] = {
+ { IMPL(3, 3, 6, 6, a64_fp16_4x4_3x3) },
+ { nullptr }
+};
+
+template <>
+const TransformImplementation<__fp16> *implementation_list(void)
+{
+ return transforms_fp16;
+}
+
+} // namespace weight_transform
+} // namespace winograd
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
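For clarity, IMPL only builds a named descriptor: #KERN stringises the kernel symbol, so the single fp16 entry expands to roughly the following (a sketch of the expansion, assuming the Transform<__fp16> constructor signature implied by the macro):

static const TransformImplementation<__fp16> transforms_fp16[] = {
    { new Transform<__fp16>("a64_fp16_4x4_3x3", 3, 3, 6, 6, a64_fp16_4x4_3x3) },
    { nullptr } // Sentinel terminating the table.
};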
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp32.cpp
new file mode 100644
index 0000000000..d12f3c60c0
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp32.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "winograd_implementations.hpp"
+#include "weight_transform.hpp"
+
+namespace arm_conv {
+namespace winograd {
+namespace weight_transform {
+
+void arm_fp32_4x4_3x3(unsigned int, const float *, size_t, size_t, float *, size_t);
+void arm_fp32_2x2_3x3(unsigned int, const float *, size_t, size_t, float *, size_t);
+void arm_fp32_2x2_5x5(unsigned int, const float *, size_t, size_t, float *, size_t);
+void cpp_fp32_1x6_1x3(unsigned int, const float *, size_t, size_t, float *, size_t);
+void cpp_fp32_1x4_1x5(unsigned int, const float *, size_t, size_t, float *, size_t);
+void cpp_fp32_1x2_1x7(unsigned int, const float *, size_t, size_t, float *, size_t);
+
+#define IMPL(KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, KERN) \
+ new Transform<float>(#KERN, KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, KERN)
+
+#define IMPL_T(KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, KERN) \
+ new Transform<float>(#KERN, KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, Transform<float>::get_transposed_kernel(KERN))
+
+static const TransformImplementation<float> transforms_fp32[] = {
+ { IMPL(3, 3, 6, 6, arm_fp32_4x4_3x3) },
+ { IMPL(3, 3, 4, 4, arm_fp32_2x2_3x3) },
+ { IMPL(5, 5, 6, 6, arm_fp32_2x2_5x5) },
+ { IMPL(1, 3, 1, 8, cpp_fp32_1x6_1x3) },
+ { IMPL_T(3, 1, 8, 1, cpp_fp32_1x6_1x3) },
+ { IMPL(1, 5, 1, 8, cpp_fp32_1x4_1x5) },
+ { IMPL_T(5, 1, 8, 1, cpp_fp32_1x4_1x5) },
+ { IMPL(1, 7, 1, 8, cpp_fp32_1x2_1x7) },
+ { IMPL_T(7, 1, 8, 1, cpp_fp32_1x2_1x7) },
+ { nullptr }
+};
+
+template <>
+const TransformImplementation<float> *implementation_list(void)
+{
+ return transforms_fp32;
+}
+
+} // namespace weight_transform
+} // namespace winograd
+} // namespace arm_conv
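Callers walk these tables up to the nullptr sentinel, mirroring the loop in get_weight_transforms later in this patch; a minimal sketch, assuming winograd_implementations.hpp (also in this patch) is included:

// Iterate the fp32 weight-transform table until the nullptr sentinel.
for (auto impl = arm_conv::winograd::weight_transform::implementation_list<float>();
     impl->transform.get() != nullptr; impl++)
{
    // Each entry pairs a Transform descriptor with its MethodConstraints;
    // selection then filters on kernel size, tile size and CPU features.
    const auto *candidate = impl->transform.get();
    (void) candidate;
}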
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd.cpp b/src/core/NEON/kernels/convolution/winograd/winograd.cpp
deleted file mode 100644
index d556112853..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <cstring>
-#include "utils.hpp"
-#include "winograd.hpp"
-
-using namespace winograd;
-using array2 = std::pair<unsigned int, unsigned int>;
-
-#define MEMBERFN(RTYPE) \
- template <int output_tile_rows, int output_tile_cols, int kernel_rows, \
- int kernel_cols, WinogradRoots roots> \
- template <typename TOut, typename TIn, typename TGEMMIn, typename TGEMMOut> \
- RTYPE WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, \
- kernel_cols, \
- roots>::Convolution<TOut, TIn, TGEMMIn, TGEMMOut>
-
-/** Get the output shape of a convolution. */
-MEMBERFN(array2)
-::get_output_shape(const std::pair<unsigned int, unsigned int> input_shape,
- const bool padding_same) {
- const unsigned int n_rows =
- padding_same ? input_shape.first : input_shape.first - (kernel_rows - 1);
- const unsigned int n_cols = padding_same
- ? input_shape.second
- : input_shape.second - (kernel_cols - 1);
- return {n_rows, n_cols};
-}
-
-/** Get the memory required to store the kernel transformed into the
- * Winograd domain.
- */
-MEMBERFN(size_t)
-::get_kernel_storage_size(const unsigned int n_input_channels,
- const unsigned int n_output_channels) {
- return N_GEMMS * get_kernel_matrix_size(n_input_channels, n_output_channels);
-}
-
-MEMBERFN(size_t)
-::get_input_storage_size(const unsigned int n_batches,
- const unsigned int n_rows, const unsigned int n_cols,
- const unsigned int n_channels,
- const bool same_padding) {
- return N_GEMMS * get_input_matrix_size(n_batches, n_rows, n_cols, n_channels,
- same_padding);
-}
-
-MEMBERFN(size_t)
-::get_output_storage_size(const unsigned int n_batches,
- const unsigned int n_rows, const unsigned int n_cols,
- const unsigned int n_channels) {
- return N_GEMMS *
- get_output_matrix_size(n_batches, n_rows, n_cols, n_channels);
-}
-
-/** Get the memory required to apply a Winograd operator to some input.
- */
-MEMBERFN(size_t)
-::get_working_space_size(const unsigned int n_batches,
- const unsigned int n_rows, const unsigned int n_cols,
- const unsigned int n_input_channels,
- const unsigned int n_output_channels,
- const bool padding_same) {
- const auto output_shape = get_output_shape({n_rows, n_cols}, padding_same);
-
- // Get the memory required to store the matrices
- const size_t matrix_sizes =
- N_GEMMS *
- (get_input_matrix_size(n_batches, n_rows, n_cols, n_input_channels,
- padding_same) +
- get_output_matrix_size(n_batches, output_shape.first,
- output_shape.second, n_output_channels));
- return matrix_sizes;
-}
-
-/* Get the memory required by a single "input" matrix.
- */
-MEMBERFN(size_t)
-::get_input_matrix_size(const unsigned int n_batches, const unsigned int n_rows,
- const unsigned int n_cols,
- const unsigned int n_channels,
- const bool same_padding) {
- return get_input_matrix_stride(n_batches, n_rows, n_cols, n_channels,
- same_padding) *
- sizeof(TGEMMIn);
-}
-
-MEMBERFN(int)
-::get_input_matrix_stride(const unsigned int n_batches, const unsigned int n_rows,
- const unsigned int n_cols,
- const unsigned int n_channels,
- const bool same_padding) {
- const auto output_shape = get_output_shape({n_rows, n_cols}, same_padding);
- const unsigned int tile_rows = iceildiv(output_shape.first, output_tile_rows);
- const unsigned int tile_cols =
- iceildiv(output_shape.second, output_tile_cols);
- const unsigned int M =
- roundup<unsigned int>(n_batches * tile_rows * tile_cols, M_BLOCK);
- const unsigned int K = n_channels;
-
- return M * K;
-}
-
-/* Get the memory required by a single "output" matrix.
- */
-MEMBERFN(size_t)
-::get_output_matrix_size(const unsigned int n_batches,
- const unsigned int n_rows, const unsigned int n_cols,
- const unsigned int n_channels) {
- return get_output_matrix_stride(n_batches, n_rows, n_cols, n_channels) *
- sizeof(TGEMMOut);
-}
-
-MEMBERFN(int)
-::get_output_matrix_stride(const unsigned int n_batches,
- const unsigned int n_rows, const unsigned int n_cols,
- const unsigned int n_channels) {
- // Compute shape for the GEMM
- const int tile_rows = iceildiv(n_rows, output_tile_rows);
- const int tile_cols = iceildiv(n_cols, output_tile_cols);
- const int M = roundup<int>(tile_rows * tile_cols, M_BLOCK);
- const int N = roundup<int>(n_channels, N_BLOCK);
-
- return n_batches * M * N;
-}
-
-
-/* Get the memory required by a single "kernel" matrix.
- */
-MEMBERFN(size_t)
-::get_kernel_matrix_size(const unsigned int n_input_channels,
- const unsigned int n_output_channels) {
- return sizeof(TGEMMIn) *
- get_kernel_matrix_stride(n_input_channels, n_output_channels);
-}
-
-MEMBERFN(int)
-::get_kernel_matrix_stride(const unsigned int n_input_channels,
- const unsigned int n_output_channels) {
- return n_input_channels * roundup<int>(n_output_channels, N_BLOCK);
-}
-
-// Instantiate required implementations
-template class WinogradGEMM<2, 2, 3, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>;
-template class WinogradGEMM<4, 4, 3, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>;
-
-template class WinogradGEMM<1, 6, 1, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>;
-template class WinogradGEMM<6, 1, 3, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>;
-
-template class WinogradGEMM<2, 2, 5, 5, WinogradRoots::Integers>::Convolution<float, float, float, float>;
-
-template class WinogradGEMM<1, 4, 1, 5, WinogradRoots::Integers>::Convolution<float, float, float, float>;
-template class WinogradGEMM<4, 1, 5, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>;
-
-template class WinogradGEMM<1, 2, 1, 7, WinogradRoots::Integers>::Convolution<float, float, float, float>;
-template class WinogradGEMM<2, 1, 7, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>;
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class WinogradGEMM<4, 4, 3, 3, WinogradRoots::Integers>::Convolution<__fp16, __fp16, __fp16, __fp16>;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd.hpp b/src/core/NEON/kernels/convolution/winograd/winograd.hpp
deleted file mode 100644
index ac82e7b7b9..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd.hpp
+++ /dev/null
@@ -1,621 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "arm_gemm.hpp"
-
-#include <cstddef>
-#include <utility>
-
-namespace winograd
-{
-
-class ITransform
-{
- public:
- virtual ~ITransform() = default;
-
- /**
- * Get the working space required to perform the transformation.
- *
- * Note, the working space is only required when performing the
- * transformation - hence it can be reused whenever the transformation is
- * not running.
- *
- * @param nthreads The greatest number of threads that will be used to execute the transform.
- * @return Size of working space required in bytes.
- */
- virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
-
- /**
- * Set the working space to be used by the transformation.
- *
- * Note, the working space is only required when performing the
- * transformation - hence it can be reused whenever the transformation is
- * not running.
- *
- * @param Pointer to the working space.
- */
- virtual void set_working_space(void *buffer) = 0;
-
- /**
- * Get the window of work a given operator can perform.
- */
- virtual unsigned int get_window() const = 0;
-
- /**
- * Perform work upon a window of the transform.
- */
- virtual void run(unsigned int start, unsigned int stop, unsigned int threadid=0) = 0;
-};
-
-class IInputTransform : public ITransform
-{
- public:
- virtual ~IInputTransform() = default;
-
- /**
- * Set the pointer to the (NHWC-ordered) tensor to be transformed.
- */
- virtual void set_input_tensor(const void *input) = 0;
-
- /**
- * Set the pointer to the (NHWC-ordered) tensor to be transformed.
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_input_tensor(const void *input, int col_stride) = 0;
-
- /**
- * Set the pointer to the (NHWC-ordered) tensor to be transformed.
- * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_input_tensor(const void *input, int row_stride, int col_stride) = 0;
-
- /**
- * Set the pointer to the (NHWC-ordered) tensor to be transformed.
- * @param batch_stride Stride between batches of the tensor, measured in elements (not bytes).
- * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) = 0;
-
- /**
- * Set pointers to the matrices written by the transform.
- * @param matrices Pointer to the start of the first matrix representing the transformed input.
- * @param inter_matrix_stride Stride (in elements) between matrices.
- * @param matrix_row_stride Stride (in elements) between the rows within a single matrix.
- */
- virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
-};
-
-class IOutputTransform : public ITransform
-{
- public:
- virtual ~IOutputTransform() = default;
-
- /**
- * Set pointers to the matrices written by the transform.
- * @param matrices Pointer to the start of the first matrix representing the input to the transform.
- * @param inter_matrix_stride Stride (in elements) between matrices.
- * @param matrix_row_stride Stride (in elements) between the rows within a single matrix.
- */
- virtual void set_input_matrices(const void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
-
- /**
- * Set pointer to the bias tensor (can be ignored or called with nullptr for no bias.
- */
- virtual void set_bias(const void *bias=nullptr) = 0;
-
- /**
- * Set pointer to the output tensor produced by the transform.
- */
- virtual void set_output_tensor(void *output) = 0;
-
- /**
- * Set pointer to the output tensor produced by the transform.
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_output_tensor(void *output, int col_stride) = 0;
-
- /**
- * Set pointer to the output tensor produced by the transform.
- * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_output_tensor(void *output, int row_stride, int col_stride) = 0;
-
- /**
- * Set pointer to the output tensor produced by the transform.
- * @param batch_stride Stride between batches of the tensor, measured in elements (not bytes).
- * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) = 0;
-};
-
-class IWeightTransform : public ITransform
-{
- public:
- virtual ~IWeightTransform() = default;
-
- /** Set pointer to the weight tensor read by the transform. */
- virtual void set_weight_tensor(const void *weights) = 0;
-
- /**
- * Set pointers to the matrices written by the transform.
- * @param matrices Pointer to the start of the first matrix representing the transformed input.
- * @param inter_matrix_stride Stride (in elements) between matrices.
- * @param matrix_row_stride Stride (in elements) between the rows within a single matrix.
- */
- virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
-};
-
-enum class WinogradRoots
-{
- Integers,
-};
-
-template <int InnerTileRows, int InnerTileCols, typename TIn, typename TOut, WinogradRoots Roots>
-class InputTransform : public IInputTransform
-{
- public:
- /** Create an InputTransform operator fixed on a given problem and set of
- * pointers.
- */
- InputTransform(
- int kernel_rows, /**< Number of rows in the kernel */
- int kernel_cols, /**< Number of columns in the kernel */
- int n_batches, /**< Number of batches in input tensor. */
- int n_rows, /**< Number of rows in input tensor. */
- int n_cols, /**< Number of columns in input tensor. */
- int n_channels, /**< Number of channels in input tensor. */
- int padding_top, /**< Padding to apply to the top of the image. */
- int padding_left, /**< Padding to apply to the left of the image. */
- int padding_bottom, /**< Padding to apply to the bottom of the image. */
- int padding_right /**< Padding to apply to the right of the image. */
- );
-
- InputTransform(InputTransform&) = delete;
- InputTransform operator=(InputTransform&) = delete;
-
- /** Set pointers to the input tensor read by the transform. */
- void set_input_tensor(const void *input) override;
- void set_input_tensor(const void *input, int col_stride) override;
- void set_input_tensor(const void *input, int row_stride, int col_stride) override;
- void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override;
-
- /** Set pointers to the matrices written by the transform. */
- void set_output_matrices(void *matrices, int iter_matrix_stride, int matrix_row_stride) override;
-
- /** Get the working space required to perform the transformation. */
- size_t get_working_space_size(unsigned int nthreads=1) const override;
- void set_working_space(void *buffer) override;
-
- /** Get the window of work a given operator can perform. */
- unsigned int get_window() const override;
- static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window
-
- /** Perform work upon a window of the input. */
- void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
- protected:
- const int _n_batches, _n_rows, _n_cols, _n_channels;
-
- private:
- void transform_unpadded_tile(
- unsigned int threadid,
- int n_channels,
- TOut *outptr,
- const TIn *inptr
- );
-
- void transform_padded_tile(
- unsigned int threadid,
- int n_channels,
- TOut *outptr,
- const TIn *inptr,
- int padding_top,
- int padding_left,
- int padding_bottom,
- int padding_right
- );
-
- /* Tile implementation */
- static void transform_tile(
- int n_channels, /** @param[in] Number of channels in the tensor. */
- const TIn* inptr_base, /** @param[in] Pointer to the base of the input tile. */
- int input_row_stride, /** @param[in] Stride between rows of the input tensor. */
- int input_col_stride, /** @param[in] Stride between columns of the input tensor. */
- TOut* mptr_base, /** @param[out] Base pointer to transformed input matrices. */
- int matrix_stride /** @param[in] Stride between matrices in the input space. */
- );
-
- /** Get the working space for a thread. */
- void * get_working_space(unsigned int threadid) const;
-
- const TIn* _inptr;
- TOut* _outptr;
-
- const int _overlap_rows, _overlap_cols;
- const int _padding_top, _padding_left, _padding_bottom, _padding_right;
- const int _tiles_M, _tiles_N;
- int _matrix_stride, _matrix_row_stride, _matrix_batch_stride;
- int _in_col_stride, _in_row_stride, _in_batch_stride;
-
- const int _working_space_col_stride, _working_space_row_stride;
- TIn *_working_space;
-};
-
-template <int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots>
-class InputTransform<InnerTileRows, 1, TIn, TOut, Roots> :
- public InputTransform<1, InnerTileRows, TIn, TOut, Roots>
-{
- using Base = InputTransform<1, InnerTileRows, TIn, TOut, Roots>;
-
- public:
- InputTransform(
- int kernel_rows, /**< Number of rows in the kernel. */
- int kernel_cols, /**< Number of columns in the kernel. */
- int n_batches, /**< Number of batches in input tensor. */
- int n_rows, /**< Number of rows in input tensor. */
- int n_cols, /**< Number of columns in input tensor. */
- int n_channels, /**< Number of channels in input tensor. */
- int padding_top, /**< Padding to apply to the top of the image. */
- int padding_left, /**< Padding to apply to the left of the image. */
- int padding_bottom, /**< Padding to apply to the bottom of the image. */
- int padding_right /**< Padding to apply to the right of the image. */
- );
-
- /** Set pointers to the input tensor read by the transform. */
- void set_input_tensor(const void *input) override;
- void set_input_tensor(const void *input, int col_stride) override;
- void set_input_tensor(const void *input, int row_stride, int col_stride) override;
- void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override;
-};
-
-template <
- int KernelRows, int KernelCols,
- int InnerTileRows, int InnerTileCols,
- typename TIn, typename TOut,
- WinogradRoots Roots
->
-class OutputTransform : public IOutputTransform
-{
- public:
- OutputTransform(
- int n_batches, /**< Number of batches in output tensor. */
- int n_rows, /**< Number of rows in output tensor. */
- int n_cols, /**< Number of columns in output tensor. */
- int n_channels, /**< Number of channels in output tensor. */
- const arm_gemm::Activation &activation
- );
-
- OutputTransform(OutputTransform&) = delete;
- OutputTransform operator=(OutputTransform&) = delete;
-
- /** Set pointers to the matrices read by the transform. */
- void set_input_matrices(const void *matrices, int iter_matrix_stride, int matrix_row_stride) override;
-
- /** Set pointer to the bias tensor (can be ignored or called with nullptr for no bias) */
- void set_bias(const void *bias=nullptr) override;
-
- /** Set pointers to the output tensor written by the transform. */
- void set_output_tensor(void *output) override;
- void set_output_tensor(void *output, int col_stride) override;
- void set_output_tensor(void *output, int row_stride, int col_stride) override;
- void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override;
-
- /** Get the working space required to perform the transformation. */
- size_t get_working_space_size(unsigned int nthreads=1) const override;
- void set_working_space(void *buffer) override;
-
- /** Get the window of work a given operator can perform. */
- unsigned int get_window() const override;
- static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window
-
- /** Perform work upon a window of the input. */
- void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
- protected:
- static constexpr int inner_tile_rows = InnerTileRows;
- static constexpr int inner_tile_cols = InnerTileCols;
- static constexpr int output_tile_rows = InnerTileRows - KernelRows + 1;
- static constexpr int output_tile_cols = InnerTileCols - KernelCols + 1;
-
- const int _n_batches, _n_rows, _n_cols, _n_channels;
- const TOut _output_min, _output_max;
-
- private:
- void transform_uncropped_tile(
- unsigned int threadid,
- int n_channels,
- TOut *outptr,
- const TIn *inptr,
- const TOut *biases
- );
-
- void transform_cropped_tile(
- unsigned int threadid,
- int n_channels,
- TOut *outptr,
- const TIn *inptr,
- const TOut *biases,
- int pad_bottom,
- int pad_right
- );
-
- /** Implementation of the tile transformation method. */
- static void transform_tile(
- int n_channels,
- const TIn* matrix_base,
- int matrix_stride,
- const TOut* biases,
- TOut* output,
- int output_row_stride,
- int output_col_stride,
- TOut output_min,
- TOut output_max
- );
-
- /** Get the working space for a thread. */
- void * get_working_space(unsigned int threadid) const;
-
- const TIn* _matrix_base;
- const TOut* _biases;
- int _matrix_stride, _matrix_row_stride, _matrix_batch_stride;
- TOut* _outptr;
- const int _tiles_M, _tiles_N;
- int _out_col_stride, _out_row_stride, _out_batch_stride;
-
- const int _working_space_col_stride, _working_space_row_stride;
- TOut *_working_space;
-};
-
-template <
- int KernelRows,
- int InnerTileRows,
- typename TIn, typename TOut,
- WinogradRoots Roots
->
-class OutputTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots> :
- public OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>
-{
- using Base = OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>;
-
- public:
- OutputTransform(
- int n_batches, /**< Number of batches in output tensor. */
- int n_rows, /**< Number of rows in output tensor. */
- int n_cols, /**< Number of columns in output tensor. */
- int n_channels, /**< Number of channels in output tensor. */
- const arm_gemm::Activation &activation
- );
-
- /** Set pointers to the output tensor written by the transform. */
- void set_output_tensor(void *output) override;
- void set_output_tensor(void *output, int col_stride) override;
- void set_output_tensor(void *output, int row_stride, int col_stride) override;
- void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override;
-};
-
-template <
- int KernelRows, int KernelCols,
- int InnerTileRows, int InnerTileCols,
- typename TIn, typename TOut,
- WinogradRoots Roots
->
-class WeightTransform : public IWeightTransform
-{
- public:
- WeightTransform(
- int n_output_channels, /**< Number of output channels in the kernel. */
- int n_input_channels /**< Number of input channels in the kernel. */
- );
-
- WeightTransform(WeightTransform&) = delete;
- WeightTransform operator=(WeightTransform&) = delete;
-
- /** Set pointer to the weight tensor read by the transform. */
- void set_weight_tensor(const void *weights) override;
-
- /** Set pointer to the matrices written by the transform. */
- void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) override;
-
- /** Get the working space required to perform the transformation. */
- size_t get_working_space_size(unsigned int nthreads=1) const override;
- void set_working_space(void *buffer) override;
-
- /** Get the window of work a given operator can perform. */
- unsigned int get_window() const override;
- static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window
-
- /** Perform work upon a window of the input. */
- void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
- protected:
- static const int kernel_rows = KernelRows;
- static const int kernel_cols = KernelCols;
- static const int inner_tile_rows = InnerTileRows;
- static const int inner_tile_cols = InnerTileCols;
-
- private:
- /** Apply the transform to a tensor. */
- static void execute(
- int n_output_channels,
- int n_input_channels,
- const TIn* input,
- TOut* output,
- int matrix_stride,
- int matrix_row_stride
- );
-
- const int _n_output_channels, _n_input_channels;
- TOut *_matrices;
- int _matrix_stride, _matrix_row_stride;
- const TIn *_weights;
-};
-
-template <int KernelRows, int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots>
-class WeightTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots> :
- public WeightTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>
-{
- public:
- using WeightTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>::WeightTransform;
-};
-
-template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols, WinogradRoots Roots>
-class WinogradGEMM
-{
- public:
- // Information about the specific Winograd instance
- static constexpr int output_tile_rows = OutputTileRows;
- static constexpr int output_tile_cols = OutputTileCols;
- static constexpr int kernel_rows = KernelRows;
- static constexpr int kernel_cols = KernelCols;
- static constexpr int inner_tile_rows = output_tile_rows + kernel_rows - 1;
- static constexpr int inner_tile_cols = output_tile_cols + kernel_cols - 1;
- static constexpr int N_GEMMS = inner_tile_rows * inner_tile_cols;
-
- /** Transform weights from the spatial to the Winograd domain. */
- template <typename TIn, typename TOut>
- using WeightsTransform = WeightTransform<
- KernelRows, KernelCols, inner_tile_rows, inner_tile_cols,
- TIn, TOut, Roots
- >;
-
- /** Transform input feature maps from the spatial to the Winograd domain.
- */
- template <typename TIn, typename TOut>
- using InputTransform = InputTransform<
- inner_tile_rows, inner_tile_cols, TIn, TOut, Roots
- >;
-
- /** Transform output feature maps from the Winograd to the spatial domain.
- */
- template <typename TIn, typename TOut>
- using OutputTransform = OutputTransform<
- KernelRows, KernelCols, inner_tile_rows, inner_tile_cols,
- TIn, TOut, Roots
- >;
-
- /** Perform a convolution.
- */
- template <typename TOut, typename TIn, typename TInGEMM=TIn, typename TOutGEMM=TOut>
- class Convolution
- {
- public:
- // Information about the typed Winograd instance
- typedef TOut OutputType;
- typedef TOutGEMM GemmOutputType;
- typedef TInGEMM GemmInputType;
- typedef TIn InputType;
-
- /** Get the output shape of a convolution. */
- static std::pair<unsigned int, unsigned int> get_output_shape(
- const std::pair<unsigned int, unsigned int> input_shape,
- bool padding_same);
-
- /** Get the memory required to store the kernel transformed into the
- * Winograd domain.
- */
- static size_t get_kernel_storage_size(unsigned int n_input_channels,
- unsigned int n_output_channels);
-
- /** Get the memory required to store the input tensor transformed into
- * the Winograd domain.
- */
- static size_t get_input_storage_size(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of input rows
- unsigned int n_cols, // Number of input columns
- unsigned int n_channels, // Number of input channels
- bool padding_same);
-
- /** Get the memory required to store the output tensor in the Winograd
- * domain.
- */
- static size_t get_output_storage_size(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of output rows
- unsigned int n_cols, // Number of output columns
- unsigned int n_channels // Number of output channels
- );
-
- /** Get the memory required to apply a Winograd operator to some input.
- */
- static size_t get_working_space_size(
- unsigned int n_batches,
- unsigned int n_rows, // Number of input rows
- unsigned int n_cols, // Number of input columns
- unsigned int n_input_channels, // Number of input channels
- unsigned int n_output_channels, // Number of output channels
- bool padding_same);
-
- /* Get the memory required by a single "input" matrix.
- */
- static size_t get_input_matrix_size(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of input rows
- unsigned int n_cols, // Number of input columns
- unsigned int n_channels, // Number of input channels
- bool padding_same);
-
- static int get_input_matrix_stride(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of input rows
- unsigned int n_cols, // Number of input columns
- unsigned int n_channels, // Number of input channels
- bool padding_same);
-
- /* Get the memory required by a single "output" matrix.
- */
- static size_t get_output_matrix_size(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of output rows
- unsigned int n_cols, // Number of output columns
- unsigned int n_channels // Number of output channels
- );
-
- static int get_output_matrix_stride(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of output rows
- unsigned int n_cols, // Number of output columns
- unsigned int n_channels // Number of output channels
- );
-
- /* Get the memory required by a single "kernel" matrix.
- */
- static size_t get_kernel_matrix_size(unsigned int n_input_channels,
- unsigned int n_output_channels);
- static int get_kernel_matrix_stride(unsigned int n_input_channels,
- unsigned int n_output_channels);
-
- static constexpr int M_BLOCK = 4; /** Size of block used by GEMM. */
- static constexpr int N_BLOCK = 16; /** Size of block used by GEMM. */
- };
-};
-
-} // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_fp16.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_fp16.cpp
new file mode 100644
index 0000000000..e1ad9e458d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_fp16.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+#include "winograd_implementations.hpp"
+
+namespace arm_conv {
+namespace winograd {
+
+template bool get_implementation<__fp16>(
+ WinogradImpl &,
+ const CPUInfo *,
+ const ConvolutionArgs &,
+ int max_threads,
+ bool fast_mode,
+ const WinogradConfig *,
+ const arm_gemm::GemmConfig *
+);
+
+} // namespace winograd
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_fp32.cpp
new file mode 100644
index 0000000000..b92de1dde7
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_fp32.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "winograd_implementations.hpp"
+
+namespace arm_conv {
+namespace winograd {
+
+template bool get_implementation<float>(
+ WinogradImpl &,
+ const CPUInfo *,
+ const ConvolutionArgs &,
+ int max_threads,
+ bool fast_mode,
+ const WinogradConfig *,
+ const arm_gemm::GemmConfig *
+);
+
+} // namespace winograd
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp
new file mode 100644
index 0000000000..af0dd04298
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "winograd.hpp"
+#include <cstring>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace arm_conv {
+namespace winograd {
+
+enum class MethodConstraints
+{
+ None,
+ RequiresSVE = 0x1,
+ RequiresSVE2 = 0x2,
+ RequiresSME = 0x4,
+ RequiresSME2 = 0x8,
+ LargerShape = 0x10, // Input tensor shape is larger than the output transform tile shape.
+};
+
+constexpr inline bool operator!(const MethodConstraints &c)
+{
+ return c == MethodConstraints::None;
+}
+
+constexpr inline MethodConstraints operator|(const MethodConstraints &a, const MethodConstraints &b)
+{
+ return static_cast<MethodConstraints>(static_cast<unsigned int>(a) | static_cast<unsigned int>(b));
+}
+
+constexpr inline MethodConstraints operator&(const MethodConstraints &a, const MethodConstraints &b)
+{
+ return static_cast<MethodConstraints>(static_cast<unsigned int>(a) & static_cast<unsigned int>(b));
+}
+
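+// These constexpr operators give MethodConstraints bit-set semantics; for
+// example:
+//
+//   constexpr auto c = MethodConstraints::RequiresSVE | MethodConstraints::LargerShape;
+//   static_assert(!!(c & MethodConstraints::RequiresSVE), "SVE bit is set");
+//   static_assert(!(c & MethodConstraints::RequiresSME), "SME bit is clear");
+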
+inline bool constraints_met(const MethodConstraints &c, const CPUInfo *ci, const ConvolutionArgs &, const WinogradConfig *)
+{
+ return (
+ (!(c & MethodConstraints::RequiresSVE) || (ci->has_sve())) &&
+ (!(c & MethodConstraints::RequiresSVE2) || (ci->has_sve2())) &&
+ (!(c & MethodConstraints::RequiresSME) || (ci->has_sme())) &&
+ (!(c & MethodConstraints::RequiresSME2) || (ci->has_sme2()))
+ // Add further constraints here
+ );
+}
+
+inline bool output_transform_constraints_met(const output_transform::ITransform *transform, const MethodConstraints &c, const CPUInfo *ci, const ConvolutionArgs &conv_args, const WinogradConfig *cfg)
+{
+ return (
+ constraints_met(c, ci, conv_args, cfg) &&
+ (!(c & MethodConstraints::LargerShape) || (conv_args.input_shape.rows > transform->get_output_rows() && conv_args.input_shape.cols > transform->get_output_cols()))
+ );
+}
+
+namespace weight_transform {
+
+template <typename TIn, typename TOut=TIn>
+struct TransformImplementation
+{
+ std::unique_ptr<const ITransform> transform;
+ MethodConstraints constraints;
+
+ TransformImplementation(const ITransform *transform, const MethodConstraints &constraints = MethodConstraints::None)
+ : transform(transform), constraints(constraints)
+ {
+ }
+};
+
+template <typename TIn, typename TOut=TIn>
+const TransformImplementation<TIn, TOut> *implementation_list(void);
+
+} // namespace weight_transform
+
+namespace input_transform
+{
+
+template <typename TIn, typename TOut=TIn>
+struct TransformImplementation
+{
+ std::unique_ptr<const ITransform> transform;
+ MethodConstraints constraints;
+
+ TransformImplementation(const ITransform *transform, const MethodConstraints &constraints = MethodConstraints::None)
+ : transform(transform), constraints(constraints)
+ {
+ }
+};
+
+template <typename TIn, typename TOut=TIn>
+const TransformImplementation<TIn, TOut> *implementation_list(void);
+
+} // namespace input_transform
+
+namespace output_transform
+{
+
+template <typename TIn, typename TOut=TIn>
+struct TransformImplementation
+{
+ std::unique_ptr<const ITransform> transform;
+ MethodConstraints constraints;
+
+ TransformImplementation(const ITransform *transform, const MethodConstraints &constraints = MethodConstraints::None)
+ : transform(transform), constraints(constraints)
+ {
+ }
+};
+
+template <typename TIn, typename TOut=TIn>
+const TransformImplementation<TIn, TOut> *implementation_list(void);
+
+} // namespace output_transform
+
+namespace{
+
+template <typename T>
+constexpr T iceildiv(T num, T den)
+{
+ return (num + den - 1) / den;
+}
+
+template <typename T>
+constexpr T iroundup(T num, T den)
+{
+ return den * iceildiv(num, den);
+}
+
+} // anonymous namespace
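+
+// For example, tiling a 57-row output with 4-row output tiles:
+//   iceildiv(57, 4) == 15  -> 15 row tiles, the last one partially filled
+//   iroundup(57, 4) == 60  -> row count padded up to a whole number of tiles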
+
+template <typename TWeight, typename TWinogradIn>
+inline std::vector<const weight_transform::ITransform *> get_weight_transforms(
+ const CPUInfo *ci, const ConvolutionArgs &conv_args, const WinogradConfig *cfg
+)
+{
+ // Get target inner tile size
+ const auto target_inner_tile_rows = cfg->output_rows == 0 ? 0 : (conv_args.kernel_shape.rows + cfg->output_rows - 1);
+ const auto target_inner_tile_cols = cfg->output_cols == 0 ? 0 : (conv_args.kernel_shape.cols + cfg->output_cols - 1);
+
+ std::vector<const weight_transform::ITransform *> weight_transforms;
+ for (auto impl = weight_transform::implementation_list<TWeight, TWinogradIn>();
+ impl->transform.get() != nullptr; impl++)
+ {
+ // If this transform supports the requested kernel size, then add it to the
+ // list of weight transforms.
+ if (
+ constraints_met(impl->constraints, ci, conv_args, cfg) &&
+ impl->transform->get_kernel_rows() == conv_args.kernel_shape.rows &&
+ impl->transform->get_kernel_cols() == conv_args.kernel_shape.cols &&
+ (target_inner_tile_rows == 0 || target_inner_tile_rows == impl->transform->get_transformed_tile_rows()) &&
+ (target_inner_tile_cols == 0 || target_inner_tile_cols == impl->transform->get_transformed_tile_cols()) &&
+ (cfg->weight_transform_filter == "" || std::strstr(impl->transform->get_name().c_str(), cfg->weight_transform_filter.c_str()))
+ )
+ {
+ weight_transforms.push_back(impl->transform.get());
+ }
+ }
+
+ return weight_transforms;
+}
+
+template <typename TIn, typename TWinogradIn>
+inline std::vector<const input_transform::ITransform *> get_input_transforms(
+ const CPUInfo *ci, const ConvolutionArgs &conv_args, const WinogradConfig *cfg
+)
+{
+ // Get target inner tile size
+ const auto target_inner_tile_rows = cfg->output_rows == 0 ? 0 : (conv_args.kernel_shape.rows + cfg->output_rows - 1);
+ const auto target_inner_tile_cols = cfg->output_cols == 0 ? 0 : (conv_args.kernel_shape.cols + cfg->output_cols - 1);
+
+ std::vector<const input_transform::ITransform *> input_transforms;
+ for (auto impl = input_transform::implementation_list<TIn, TWinogradIn>();
+ impl->transform.get() != nullptr; impl++)
+ {
+ if(
+ constraints_met(impl->constraints, ci, conv_args, cfg) &&
+ (target_inner_tile_rows == 0 || target_inner_tile_rows == impl->transform->get_input_rows()) &&
+ (target_inner_tile_cols == 0 || target_inner_tile_cols == impl->transform->get_input_cols()) &&
+ (cfg->input_transform_filter == "" || std::strstr(impl->transform->get_name().c_str(), cfg->input_transform_filter.c_str()))
+ )
+ {
+ input_transforms.push_back(impl->transform.get());
+ }
+ }
+
+ return input_transforms;
+}
+
+template <typename TWinogradOut, typename TOut>
+inline std::vector<const output_transform::ITransform *> get_output_transforms(
+ const CPUInfo *ci, const ConvolutionArgs &conv_args, const WinogradConfig *cfg
+)
+{
+ std::vector<const output_transform::ITransform *> output_transforms;
+ for (auto impl = output_transform::implementation_list<TWinogradOut, TOut>();
+ impl->transform.get() != nullptr; impl++)
+ {
+ if(
+ output_transform_constraints_met(impl->transform.get(), impl->constraints, ci, conv_args, cfg) &&
+ impl->transform->get_kernel_rows() == conv_args.kernel_shape.rows &&
+ impl->transform->get_kernel_cols() == conv_args.kernel_shape.cols &&
+ (cfg->output_rows == 0 || cfg->output_rows == impl->transform->get_output_rows()) &&
+ (cfg->output_cols == 0 || cfg->output_cols == impl->transform->get_output_cols()) &&
+ (cfg->output_transform_filter == "" || std::strstr(impl->transform->get_name().c_str(), cfg->output_transform_filter.c_str()))
+ )
+ {
+ output_transforms.push_back(impl->transform.get());
+ }
+ }
+
+ return output_transforms;
+}
+
+template <typename TIn, typename TWeight, typename TOut, typename TWinogradIn, typename TWinogradOut>
+bool get_implementation(
+ WinogradImpl &dest, // Destination for the selected implementation
+ const CPUInfo *ci,
+ const ConvolutionArgs &conv_args,
+ int max_threads,
+ bool fast_mode,
+ const WinogradConfig *cfg,
+ const arm_gemm::GemmConfig *gemm_cfg
+)
+{
+ // Get vectors of valid weight, input and output transforms; then select the
+ // combination which produces the biggest output tile.
+ const auto weight_transforms = get_weight_transforms<TWeight, TWinogradIn>(ci, conv_args, cfg);
+ const auto input_transforms = get_input_transforms<TIn, TWinogradIn>(ci, conv_args, cfg);
+ const auto output_transforms = get_output_transforms<TWinogradOut, TOut>(ci, conv_args, cfg);
+
+ // Now attempt to select a complete set of Winograd transformations which can
+ // solve the problem. Work backwards from the output transform to find
+ // matching input implementations.
+ bool success = false;
+ for (auto output_transform = output_transforms.cbegin();
+ !success && output_transform != output_transforms.cend();
+ output_transform++)
+ {
+ // Look for matching weight transforms, if we find one then we look for
+ // matching input transforms.
+ for (auto weight_transform = weight_transforms.cbegin();
+ !success && weight_transform != weight_transforms.cend();
+ weight_transform++)
+ {
+ // If this weight transform is compatible, then look for a matching input
+ // transform
+ if ((*output_transform)->get_input_rows() == (*weight_transform)->get_transformed_tile_rows() &&
+ (*output_transform)->get_input_cols() == (*weight_transform)->get_transformed_tile_cols())
+ {
+ for (auto input_transform = input_transforms.cbegin();
+ !success && input_transform != input_transforms.cend();
+ input_transform++)
+ {
+ // If the input transform is suitable, then set the configuration and
+ // indicate success.
+ if ((*input_transform)->get_input_rows() == (*output_transform)->get_input_rows() &&
+ (*input_transform)->get_input_cols() == (*output_transform)->get_input_cols())
+ {
+ dest.output_transform = *output_transform;
+ dest.input_transform = *input_transform;
+ dest.weight_transform = *weight_transform;
+ success = true;
+ }
+ }
+ }
+ }
+ }
+
+ if (!success)
+ {
+ return false;
+ }
+
+ // If we're able to construct the Winograd elements, then specify the GEMM
+ // arguments required to perform the multiply-accumulate step of the
+ // convolution.
+ const auto n_output_row_tiles = iceildiv(conv_args.output_shape.rows, dest.output_transform->get_output_rows());
+ const auto n_output_col_tiles = iceildiv(conv_args.output_shape.cols, dest.output_transform->get_output_cols());
+ const auto n_output_patches = n_output_row_tiles * n_output_col_tiles;
+
+ const int n_multis = dest.input_transform->get_input_rows() *
+ dest.input_transform->get_input_cols();
+
+ dest.gemm_args.reset(new arm_gemm::GemmArgs(
+ ci,
+ n_output_patches, // M
+ conv_args.n_output_channels, // N
+ conv_args.n_input_channels, // K
+ 1, // K-sections
+ conv_args.n_batches, // # Batches
+ n_multis,
+ false, // Indirect input
+ {}, // No activation
+ max_threads,
+ false, // Not fixed format
+ fast_mode,
+ gemm_cfg
+ ));
+
+ // Also provide hints for the Winograd memory layout
+ auto &ws = dest.winograd_spec;
+ ws.weight_ld_row = iroundup(conv_args.n_output_channels, 4u);
+ ws.weight_ld_matrix = conv_args.n_input_channels * ws.weight_ld_row;
+ ws.weight_matrix_size_bytes = n_multis * ws.weight_ld_matrix * sizeof(TWinogradIn);
+
+ ws.input_ld_row = iroundup(conv_args.n_input_channels, 4u);
+ ws.input_ld_matrix = iroundup(n_output_patches, 4u) * ws.input_ld_row;
+ ws.input_ld_batch = n_multis * ws.input_ld_matrix;
+ ws.input_matrix_size_bytes = conv_args.n_batches * ws.input_ld_batch * sizeof(TWinogradIn);
+
+ ws.output_ld_row = ws.weight_ld_row;
+ ws.output_ld_matrix = n_output_patches * ws.output_ld_row;
+ ws.output_ld_batch = n_multis * ws.output_ld_matrix;
+ ws.output_matrix_size_bytes = conv_args.n_batches * ws.output_ld_batch * sizeof(TWinogradOut);
+
+ return true;
+}
+
+} // namespace winograd
+} // namespace arm_conv
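For concreteness, a small sketch of the GEMM sizing computed in the hunk above, with every size an assumed illustration (3x3 kernel, an output transform that emits 4x4 tiles, 224x224 output). Each input tile is then (4+3-1)x(4+3-1) = 6x6, so there is one GEMM per Winograd matrix element, and the number of output patches becomes the GEMM M dimension:

    // Sketch only: reproduces the M / n_multis arithmetic from
    // get_implementation for assumed sizes.
    #include <cstdio>

    static int iceildiv(int a, int b) { return (a + b - 1) / b; }

    int main()
    {
        const int out_rows = 224, out_cols = 224; // assumed output shape
        const int tile_rows = 4, tile_cols = 4;   // output tile emitted by the transform
        const int in_tile   = tile_rows + 3 - 1;  // 6x6 input tile for a 3x3 kernel

        const int n_patches = iceildiv(out_rows, tile_rows) * iceildiv(out_cols, tile_cols);
        const int n_multis  = in_tile * in_tile;  // independent GEMMs, one per matrix

        std::printf("M = %d, n_multis = %d\n", n_patches, n_multis); // M = 3136, n_multis = 36
        return 0;
    }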
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp
deleted file mode 100644
index 52ff7b3798..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "arm_gemm_local.hpp"
-#include "arm_gemm.hpp"
-#include "winograd.hpp"
-
-namespace winograd
-{
-
-
-class IWinogradConvolutionLayer
-{
- public:
- virtual ~IWinogradConvolutionLayer() = default;
-
- virtual unsigned int weight_transform_get_window(void) const = 0;
- virtual void weight_transform_run(unsigned int start, unsigned int stop) = 0;
-
- virtual IInputTransform& input_transform(void) = 0; // Expose the input transform
- virtual IOutputTransform& output_transform(void) = 0; // Expose the output transform
- virtual arm_gemm::IGemmCommon *gemm(void) = 0; // Expose the underlying GEMM
-};
-
-/** Example of how to construct an ACL-like interface.
- *
- * Use `get_weight_storage_size`, `get_input_storage_size` and
- * `get_output_storage_size` to allocate memory for the convolution engine.
- * Then create a `WinogradConvolutionLayer`.
- *
- * Initialise the weights using `weights_transform.run(...)`.
- *
- * For each inference:
- * 1. Transform the inputs to the Winograd domain using `input_transform.run(...)`
- * 2. Perform a number of GEMMs using `gemms.run(...)`
- * 3. Transform the output to the spatial domain using `output_transform.run(...)`
- */
-template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols,
- typename TIn, typename TInGEMM, typename TOutGEMM, typename TOut,
- WinogradRoots Roots>
-class WinogradConvolutionLayer : public IWinogradConvolutionLayer
-{
- public:
- using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, Roots>;
- using WeightsTransform = typename WinogradBase::template WeightsTransform<TIn, TInGEMM>;
- using InputTransform = typename WinogradBase::template InputTransform<TIn, TInGEMM>;
- using WinogradConv = typename WinogradBase::template Convolution<TOut, TIn, TInGEMM, TOutGEMM>;
- using OutputTransform = typename WinogradBase::template OutputTransform<TOutGEMM, TOut>;
-
- private:
- static constexpr int InnerTileRows = OutputTileRows + KernelRows - 1;
- static constexpr int InnerTileCols = OutputTileCols + KernelCols - 1;
- static constexpr int N_GEMMS = InnerTileRows * InnerTileCols;
-
- const int _n_output_rows, _n_output_cols;
- const int _kernel_matrix_stride, _kernel_matrix_row_stride;
- const int _input_matrix_stride, _input_matrix_row_stride;
- const int _output_matrix_stride, _output_matrix_row_stride;
- const int _tile_rows, _tile_cols;
- const int _m, _k, _n;
-
- WeightsTransform weights_transform; /** Operator to transform weights to Winograd domain. */
- InputTransform _input_transform; /** Operator to transform input to Winograd domain. */
- const arm_gemm::GemmArgs gemm_args;
- arm_gemm::UniqueGemmCommon<TInGEMM, TOutGEMM> gemms; /** Operator to perform multiple GEMMs. */
- OutputTransform _output_transform; /** Operator to transform output from Winograd domain. */
-
- public:
-
- /** Determine how much memory (in units of TIn) to allocate for the
- * transformed weights.
- */
- static unsigned int get_weight_storage_size(
- const int n_output_channels, /** Number of output feature maps. */
- const int n_input_channels /** Number of input feature maps. */
- );
-
- static unsigned int get_weight_stride(
- const int n_output_channels, /** Number of output feature maps. */
- const int n_input_channels /** Number of input feature maps. */
- );
-
- static unsigned int get_weight_multi_stride(
- const int n_output_channels, /** Number of output feature maps. */
- const int n_input_channels /** Number of input feature maps. */
- );
-
- /** Determine how much memory (in units of TIn) to allocate for the
- * transformed input.
- */
- static unsigned int get_input_storage_size(
- const int n_batches, /** Number of batches in the input tensor. */
- const int n_channels, /** Number of feature maps in the input tensor. */
- const int n_rows, /** Number of rows in each feature map. */
- const int n_cols, /** Number of columns in each feature map. */
- const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
- );
-
- /** Get the row stride for the A matrix in the Winograd domain. */
- static unsigned int get_input_stride(
- const int n_batches, /** Number of batches in the input tensor. */
- const int n_channels, /** Number of feature maps in the input tensor. */
- const int n_rows, /** Number of rows in each feature map. */
- const int n_cols, /** Number of columns in each feature map. */
- const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
- );
-
- /** Get the stride between A matrices in the Winograd domain. */
- static unsigned int get_input_multi_stride(
- const int n_batches, /** Number of batches in the input tensor. */
- const int n_channels, /** Number of feature maps in the input tensor. */
- const int n_rows, /** Number of rows in each feature map. */
- const int n_cols, /** Number of columns in each feature map. */
- const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
- );
-
- /** Determine how much memory (in units of TOut) to allocate for the
- * (Winograd domain) output.
- */
- static unsigned int get_output_storage_size(
- const int n_batches, /** Number of batches in the output tensor. */
- const int n_rows, /** Number of rows in each feature map of the input tensor. */
- const int n_cols, /** Number of columns in each feature map of the input tensor. */
- const int n_output_channels, /** Number of feature maps in the output tensor. */
- const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
- );
-
- static unsigned int get_output_stride(
- const int n_batches, /** Number of batches in the output tensor. */
- const int n_rows, /** Number of rows in each feature map of the input tensor. */
- const int n_cols, /** Number of columns in each feature map of the input tensor. */
- const int n_output_channels, /** Number of feature maps in the output tensor. */
- const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
- );
-
- static unsigned int get_output_multi_stride(
- const int n_batches, /** Number of batches in the output tensor. */
- const int n_rows, /** Number of rows in each feature map of the input tensor. */
- const int n_cols, /** Number of columns in each feature map of the input tensor. */
- const int n_output_channels, /** Number of feature maps in the output tensor. */
- const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
- );
-
- /** Get the shape (rows, cols) of a feature map of the output tensor. */
- static std::pair<int, int> get_output_feature_map_shape(
- const int n_input_rows, /** Number of rows in the input feature map. */
- const int n_input_cols, /** Number of columns in the input feature map. */
- const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
- );
-
- /** Create a new Winograd convolution layer.
- */
- WinogradConvolutionLayer(
- const CPUInfo &cpuinfo, /** Describes CPU properties. */
- const int n_threads, /** Maximum number of threads used to execute the convolution. */
- const int n_batches, /** Number of batches in the input and output tensors. */
- const int n_input_channels, /** Number of feature maps in a batch of the input tensor. */
- const int n_input_rows, /** Number of rows in a feature map of the input tensor. */
- const int n_input_cols, /** Number of columns in a feature map of the input tensor. */
- const int n_output_channels, /** Number of feature maps in the output tensor. */
- const bool same_padding, /** Use "SAME" padding, otherwise use "VALID". */
- const arm_gemm::Activation &activation,
- const TIn* const weights, /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Width x Input Feature Maps x Output Feature Maps". */
- TInGEMM* const weights_storage, /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */
- const TIn* const input, /** Pointer to NHWC ordered input tensor, in the spatial domain. */
- TInGEMM* const winograd_input, /** Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */
- const TOut* const biases, /** Pointer to biases vector. Pass nullptr if no bias is provided. */
- TOut* const output, /** Pointer to NHWC ordered output tensor, in the spatial domain. */
- TOutGEMM* const winograd_output, /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */
- const bool pretranspose_B=true, /** Hint that the B matrix can be pretransposed. */
- arm_gemm::GemmConfig *gemm_cfg=nullptr /** Pointer to GEMM configuration. */
- );
-
- /* Utility methods for interacting with the layer. */
- unsigned int weight_transform_get_window(void) const;
- void weight_transform_run(const unsigned int start, const unsigned int stop);
-
- IInputTransform& input_transform(void);
- IOutputTransform& output_transform(void);
-
- /* Get a pointer to the GEMM underlying the Winograd transform. */
- arm_gemm::IGemmCommon *gemm(void);
-};
-
-}
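The class comment in the removed header above describes an intended call sequence. A minimal sketch of that sequence follows; the template arguments, the single-threaded windows, and the run signatures (taken from the declarations above and from the InputTransform in input.hpp below) are all assumptions, not a definitive usage of the removed API:

    // Sketch of the removed interface's usage, assuming an fp32 F(4x4, 3x3) layer.
    using Layer = winograd::WinogradConvolutionLayer<
        4, 4, 3, 3, float, float, float, float, winograd::WinogradRoots::Integers>;

    void run_once(Layer &layer)
    {
        // Prepare: transform the weights into the Winograd domain (done once).
        layer.weight_transform_run(0, layer.weight_transform_get_window());

        // Per inference: input transform -> batched GEMMs -> output transform.
        // run(start, stop, threadid) as declared for InputTransform in input.hpp.
        layer.input_transform().run(0, layer.input_transform().get_window(), 0);
        // ... execute the GEMMs via layer.gemm() (arm_gemm API elided) ...
        layer.output_transform().run(0, layer.output_transform().get_window(), 0);
    }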
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp
deleted file mode 100644
index c0f50beb2c..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <algorithm>
-
-#include "padding.hpp"
-#include "utils.hpp"
-#include "winograd.hpp"
-
-#define MEMBERFN(RTYPE) template <\
- int InnerTileRows, int InnerTileCols,\
- typename TIn, typename TOut, WinogradRoots Roots\
-> RTYPE InputTransform<InnerTileRows, InnerTileCols, TIn, TOut, Roots>
-
-
-#define Nx1MEMBERFN(RTYPE) template <\
- int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots\
-> RTYPE InputTransform<InnerTileRows, 1, TIn, TOut, Roots>
-
-namespace winograd
-{
-
-MEMBERFN()::InputTransform(
- const int kernel_rows,
- const int kernel_cols,
- const int n_batches,
- const int n_rows,
- const int n_cols,
- const int n_channels,
- const int padding_top,
- const int padding_left,
- const int padding_bottom,
- const int padding_right
-) : _n_batches(n_batches), _n_rows(n_rows), _n_cols(n_cols), _n_channels(n_channels),
- _inptr(nullptr), _outptr(nullptr),
- _overlap_rows(kernel_rows - 1), _overlap_cols(kernel_cols - 1),
- _padding_top(padding_top), _padding_left(padding_left), _padding_bottom(padding_bottom), _padding_right(padding_right),
- _tiles_M(iceildiv(padding_top + n_rows + padding_bottom - kernel_rows + 1, InnerTileRows - kernel_rows + 1)),
- _tiles_N(iceildiv(padding_left + n_cols + padding_right - kernel_cols + 1, InnerTileCols - kernel_cols + 1)),
- _matrix_stride(0), _matrix_row_stride(0), _matrix_batch_stride(0),
- _in_col_stride(0), _in_row_stride(0), _in_batch_stride(0),
- _working_space_col_stride(n_channels),
- _working_space_row_stride(InnerTileCols * _working_space_col_stride),
- _working_space(nullptr)
-{
-}
-
-MEMBERFN(void)::set_input_tensor(const void* const inptr)
-{
- set_input_tensor(inptr, _n_channels);
-}
-
-MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldcol)
-{
- set_input_tensor(inptr, _n_cols * ldcol, ldcol);
-}
-
-MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldrow, const int ldcol)
-{
- set_input_tensor(inptr, _n_rows * ldrow, ldrow, ldcol);
-}
-
-MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldbatch, const int ldrow, const int ldcol)
-{
- _inptr = static_cast<const TIn *>(inptr);
- _in_batch_stride = ldbatch;
- _in_row_stride = ldrow;
- _in_col_stride = ldcol;
-}
-
-MEMBERFN(void)::set_output_matrices(void * const mptr, const int ldmatrix, const int ldrow)
-{
- _outptr = static_cast<TOut *>(mptr);
- _matrix_stride = ldmatrix;
- _matrix_row_stride = ldrow;
- _matrix_batch_stride = _tiles_M * _tiles_N * ldrow;
-}
-
-Nx1MEMBERFN()::InputTransform(
- const int kernel_rows,
- const int kernel_cols,
- const int n_batches,
- const int n_rows,
- const int n_cols,
- const int n_channels,
- const int padding_top,
- const int padding_left,
- const int padding_bottom,
- const int padding_right
-) : InputTransform<1, InnerTileRows, TIn, TOut, Roots>::InputTransform(
- /* Transpose rows and columns */
- kernel_cols, kernel_rows, n_batches, n_cols, n_rows, n_channels,
- padding_left, padding_top, padding_right, padding_bottom
- )
-{
-}
-
-Nx1MEMBERFN(void)::set_input_tensor(const void* const inptr)
-{
- set_input_tensor(inptr, this->_n_channels);
-}
-
-Nx1MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldcol)
-{
- set_input_tensor(inptr, this->_n_cols * ldcol, ldcol);
-}
-
-Nx1MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldrow, const int ldcol)
-{
- set_input_tensor(inptr, this->_n_rows * ldrow, ldrow, ldcol);
-}
-
-Nx1MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldbatch, const int ldrow, const int ldcol)
-{
- // Transpose row and column strides
- Base::set_input_tensor(inptr, ldbatch, ldcol, ldrow);
-}
-
-MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
-{
- return sizeof(TIn) * InnerTileRows * _working_space_row_stride * nthreads;
-}
-
-MEMBERFN(void)::set_working_space(void * const buffer)
-{
- _working_space = static_cast<TIn *>(buffer);
-}
-
-MEMBERFN(unsigned int)::get_window(void) const
-{
- return iceildiv(_n_channels, WINDOW_BLOCK);
-}
-
-MEMBERFN(void)::run(
- const unsigned int start,
- const unsigned int stop,
- const unsigned int threadid
-)
-{
- // Determine the channels on which to work
- if (start >= get_window())
- {
- return; // No work to do beyond the end of the window
- }
- const unsigned int start_channel = start * WINDOW_BLOCK;
- const unsigned int stop_channel = std::min<unsigned int>(_n_channels, stop * WINDOW_BLOCK);
- const unsigned int n_channels = stop_channel - start_channel;
-
- // Loop over batches
- for (int batch = 0; batch < _n_batches; batch++)
- {
- const TIn* const inptr_batch = _inptr + start_channel + batch*_in_batch_stride;
- TOut* const outptr_batch = _outptr + start_channel + batch*_matrix_batch_stride;
-
- // Loop over rows of tiles
- for (int tile_i = 0; tile_i < _tiles_M; tile_i++)
- {
- // Compute the starting and ending row of pixels within the row of tiles,
- // hence compute the padding to apply to the top and bottom of each tile.
- const int row_top = tile_i * (InnerTileRows - _overlap_rows) - _padding_top;
- const int row_bottom = row_top + InnerTileRows;
- const int row_pad_top = std::max(0, _padding_top - tile_i * (InnerTileRows - _overlap_rows));
- const int row_pad_bottom = std::max(0, row_bottom - _n_rows);
-
- // Get a pointer to the start of the row.
- const int row_offset = std::min(0, row_pad_top - _padding_top);
- const TIn* const inptr_row = inptr_batch + _in_row_stride*(row_offset + tile_i*(InnerTileRows - _overlap_rows));
- TOut* const outptr_row = outptr_batch + tile_i*_tiles_N*_matrix_row_stride;
-
- // Loop over tiles within the row
- for (int tile_j = 0; tile_j < _tiles_N; tile_j++)
- {
- // Compute the starting and ending column of pixels within the tile,
- // hence compute the padding to apply to the left and right of the
- // tile.
- const int tile_left = tile_j * (InnerTileCols - _overlap_cols) - _padding_left;
- const int tile_right = tile_left + InnerTileCols;
- const int tile_pad_left = std::max(0, _padding_left - tile_j * (InnerTileCols - _overlap_cols));
- const int tile_pad_right = std::max(0, tile_right - _n_cols);
-
- // Get a pointer to the start of the tile.
- const int col_offset = std::min(0, tile_pad_left - _padding_left);
- const TIn* const inptr_tile = inptr_row + _in_col_stride*(col_offset + tile_j*(InnerTileCols - _overlap_cols));
- TOut* const outptr_tile = outptr_row + tile_j * _matrix_row_stride;
-
- // Transform the tile, applying padding if necessary.
- if (row_pad_top || tile_pad_left || row_pad_bottom || tile_pad_right)
- {
- transform_padded_tile(
- threadid, n_channels, outptr_tile, inptr_tile,
- row_pad_top, tile_pad_left, row_pad_bottom, tile_pad_right
- );
- }
- else
- {
- transform_unpadded_tile(threadid, n_channels, outptr_tile, inptr_tile);
- }
- }
- }
- }
-}
-
-MEMBERFN(void)::transform_unpadded_tile(
- const unsigned int /* threadid unused */,
- const int n_channels,
- TOut * const outptr,
- const TIn * const inptr
-)
-{
- transform_tile(
- n_channels, inptr, _in_row_stride, _in_col_stride, outptr, _matrix_stride
- );
-}
-
-MEMBERFN(void)::transform_padded_tile(
- const unsigned int threadid,
- const int n_channels,
- TOut * const outptr,
- const TIn * const inptr,
- const int padding_top,
- const int padding_left,
- const int padding_bottom,
- const int padding_right
-)
-{
- padding::copy_and_pad_tile(
- InnerTileRows, InnerTileCols, n_channels,
- inptr, _in_row_stride, _in_col_stride,
- static_cast<TIn *>(get_working_space(threadid)), _working_space_row_stride, _working_space_col_stride,
- padding_top, padding_left, padding_bottom, padding_right
- );
-
- transform_tile(
- n_channels, static_cast<const TIn *>(get_working_space(threadid)),
- _working_space_row_stride, _working_space_col_stride,
- outptr, _matrix_stride
- );
-}
-
-MEMBERFN(void *)::get_working_space(const unsigned int threadid) const
-{
- return _working_space + InnerTileRows * _working_space_row_stride * threadid;
-}
-
-} // namespace winograd
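The constructor in the removed file above fixes the tiling: consecutive tiles overlap by kernel - 1 pixels, so they advance by InnerTile - kernel + 1 rows or columns. A worked instance of the _tiles_M expression, with the input height and padding as assumed values:

    // Worked example of the tile-count arithmetic in the InputTransform
    // constructor (assumed sizes: 6x6 inner tile, 3x3 kernel, SAME padding).
    #include <cstdio>

    static int iceildiv(int a, int b) { return (a + b - 1) / b; }

    int main()
    {
        const int inner_tile_rows = 6, kernel_rows = 3;
        const int pad_top = 1, pad_bottom = 1, n_rows = 56;  // assumed input height

        const int advance = inner_tile_rows - kernel_rows + 1;  // tiles step by 4 rows
        const int tiles_M = iceildiv(pad_top + n_rows + pad_bottom - kernel_rows + 1, advance);
        std::printf("tiles_M = %d\n", tiles_M);  // iceildiv(56, 4) == 14 rows of tiles
        return 0;
    }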
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp
deleted file mode 100644
index 8f6e9e8b40..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm.hpp"
-#include "input.hpp"
-
-namespace winograd
-{
-
-template <>
-void InputTransform<1, 8, float, float, WinogradRoots::Integers>::transform_tile(
- const int n_channels,
- const float* const input_base,
- const int, // We don't need to stride over rows
- const int input_col_stride,
- float* outptr,
- const int matrix_stride
-)
-{
- constexpr int inner_tile_cols = 8;
-
- // Get pointers into the input tile
- const float *x_ptrs[inner_tile_cols];
- for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
- {
- x_ptrs[j] = input_base + xj*input_col_stride;
- }
-
- // Vectors used/computed in this kernel.
- float x[inner_tile_cols];
- float U[inner_tile_cols];
-
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[j] = 0.0f;
- }
-
- // Perform the Winograd input transformation for each channel in the input
- // tensor.
- int channels_remaining = n_channels;
-#ifdef __arm_any__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- float32x4_t x[inner_tile_cols], U[inner_tile_cols];
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[j] = vdupq_n_f32(0.0f);
- }
-
- // Load x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[j] = vld1q_f32(x_ptrs[j]);
- x_ptrs[j] += 4;
- }
-
- // Compute U = x . X
- U[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
- U[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
- U[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
- U[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
- U[4] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
- U[5] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
- U[6] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
- U[7] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
-
- // Store the transformed vector
- for (int j = 0; j < inner_tile_cols; j++)
- {
- vst1q_f32(outptr + j*matrix_stride, U[j]);
- }
- outptr += 4;
- }
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- float32x2_t x[inner_tile_cols], U[inner_tile_cols];
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[j] = vdup_n_f32(0.0f);
- }
-
- // Load x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[j] = vld1_f32(x_ptrs[j]);
- x_ptrs[j] += 2;
- }
-
- // Compute U = x . X
- U[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
- U[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
- U[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
- U[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
- U[4] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
- U[5] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
- U[6] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
- U[7] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
-
- // Store the transformed vector
- for (int j = 0; j < inner_tile_cols; j++)
- {
- vst1_f32(outptr + j*matrix_stride, U[j]);
- }
- outptr += 2;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Load x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[j] = *(x_ptrs[j]++);
- }
-
- // Compute U = x . X
- U[0] = x[0]*-36 + x[4]*-14 + x[2]*49 + x[6]*1;
- U[1] = x[5]*-1 + x[1]*-36 + x[4]*-13 + x[3]*13 + x[2]*36 + x[6]*1;
- U[2] = x[3]*-13 + x[4]*-13 + x[1]*36 + x[2]*36 + x[5]*1 + x[6]*1;
- U[3] = x[1]*-18 + x[4]*-10 + x[5]*-2 + x[2]*9 + x[3]*20 + x[6]*1;
- U[4] = x[3]*-20 + x[4]*-10 + x[5]*2 + x[2]*9 + x[1]*18 + x[6]*1;
- U[5] = x[1]*-12 + x[4]*-5 + x[5]*-3 + x[2]*4 + x[3]*15 + x[6]*1;
- U[6] = x[3]*-15 + x[4]*-5 + x[5]*3 + x[2]*4 + x[1]*12 + x[6]*1;
- U[7] = x[1]*-36 + x[5]*-14 + x[3]*49 + x[7]*1;
-
- // Store the transformed vector
- for (int j = 0; j < inner_tile_cols; j++)
- {
- *(outptr + j*matrix_stride) = U[j];
- }
- outptr++;
- }
-}
-
-template class InputTransform<1, 8, float, float, WinogradRoots::Integers>;
-template class InputTransform<8, 1, float, float, WinogradRoots::Integers>;
-
-} // namespace winograd
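Reading the coefficients off the scalar tail above, the 1x8 transform computes U = B^T x with

\[
B^\mathsf{T} =
\begin{pmatrix}
-36 &   0 & 49 &   0 & -14 &   0 & 1 & 0 \\
  0 & -36 & 36 &  13 & -13 &  -1 & 1 & 0 \\
  0 &  36 & 36 & -13 & -13 &   1 & 1 & 0 \\
  0 & -18 &  9 &  20 & -10 &  -2 & 1 & 0 \\
  0 &  18 &  9 & -20 & -10 &   2 & 1 & 0 \\
  0 & -12 &  4 &  15 &  -5 &  -3 & 1 & 0 \\
  0 &  12 &  4 & -15 &  -5 &   3 & 1 & 0 \\
  0 & -36 &  0 &  49 &   0 & -14 & 0 & 1
\end{pmatrix},
\]

and the 4- and 2-lane NEON paths evaluate exactly the same rows via chained vmla(q)_n_f32 multiply-accumulates.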
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp
deleted file mode 100644
index 5e6ac97121..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#include "input.hpp"
-#include "arm.hpp"
-
-namespace winograd
-{
-
-template <>
-void InputTransform<4, 4, __fp16, __fp16, WinogradRoots::Integers>::transform_tile(
- const int n_channels,
- const __fp16* const input_base,
- const int input_row_stride,
- const int input_col_stride,
- __fp16* outptr,
- const int matrix_stride
-)
-{
- constexpr int inner_tile_rows = 4, inner_tile_cols = 4;
-
- // Get pointers into the input tile
- const __fp16 *x_ptrs[inner_tile_rows][inner_tile_cols];
- for (int i = 0, xi = 0; i < inner_tile_rows; i++, xi++)
- {
- // Get a pointer into the row
- const __fp16* const row_ptr = input_base + xi*input_row_stride;
-
- for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
- {
- x_ptrs[i][j] = row_ptr + xj*input_col_stride;
- }
- }
-
- // Matrices used/computed in this kernel.
- __fp16 x[inner_tile_rows][inner_tile_cols];
- __fp16 XTx[inner_tile_rows][inner_tile_cols];
- __fp16 U[inner_tile_rows][inner_tile_cols];
-
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = XTx[i][j] = 0.0f;
- }
- }
-
- // Perform the Winograd input transformation for each channel in the input
- // tensor.
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 8; channels_remaining -= 8)
- {
- // Matrices used/computed in this kernel.
- float16x8_t x[inner_tile_rows][inner_tile_cols];
- float16x8_t XTx[inner_tile_rows][inner_tile_cols];
- float16x8_t U[inner_tile_rows][inner_tile_cols];
-
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vdupq_n_f16(0.0f);
- XTx[i][j] = vdupq_n_f16(0.0f);
- }
- }
-
- // Load x
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vld1q_f16(x_ptrs[i][j]);
- x_ptrs[i][j] += 8;
- }
- }
-
- // Compute XT . x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- // XTx[0][j] = x[0][j] - x[2][j];
- XTx[0][j] = vsubq_f16(x[0][j], x[2][j]);
-
- // XTx[1][j] = x[1][j] + x[2][j];
- XTx[1][j] = vaddq_f16(x[1][j], x[2][j]);
-
- // XTx[2][j] = x[2][j] - x[1][j];
- XTx[2][j] = vsubq_f16(x[2][j], x[1][j]);
-
- // XTx[3][j] = x[1][j] - x[3][j];
- XTx[3][j] = vsubq_f16(x[1][j], x[3][j]);
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- // U[i][0] = XTx[i][0] - XTx[i][2];
- U[i][0] = vsubq_f16(XTx[i][0], XTx[i][2]);
-
- // U[i][1] = XTx[i][1] + XTx[i][2];
- U[i][1] = vaddq_f16(XTx[i][1], XTx[i][2]);
-
- // U[i][2] = XTx[i][2] - XTx[i][1];
- U[i][2] = vsubq_f16(XTx[i][2], XTx[i][1]);
-
- // U[i][3] = XTx[i][1] - XTx[i][3];
- U[i][3] = vsubq_f16(XTx[i][1], XTx[i][3]);
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- vst1q_f16(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 8;
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used/computed in this kernel.
- float16x4_t x[inner_tile_rows][inner_tile_cols];
- float16x4_t XTx[inner_tile_rows][inner_tile_cols];
- float16x4_t U[inner_tile_rows][inner_tile_cols];
-
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vdup_n_f16(0.0f);
- XTx[i][j] = vdup_n_f16(0.0f);
- }
- }
-
- // Load x
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vld1_f16(x_ptrs[i][j]);
- x_ptrs[i][j] += 4;
- }
- }
-
- // Compute XT . x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- // XTx[0][j] = x[0][j] - x[2][j];
- XTx[0][j] = vsub_f16(x[0][j], x[2][j]);
-
- // XTx[1][j] = x[1][j] + x[2][j];
- XTx[1][j] = vadd_f16(x[1][j], x[2][j]);
-
- // XTx[2][j] = x[2][j] - x[1][j];
- XTx[2][j] = vsub_f16(x[2][j], x[1][j]);
-
- // XTx[3][j] = x[1][j] - x[3][j];
- XTx[3][j] = vsub_f16(x[1][j], x[3][j]);
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- // U[i][0] = XTx[i][0] - XTx[i][2];
- U[i][0] = vsub_f16(XTx[i][0], XTx[i][2]);
-
- // U[i][1] = XTx[i][1] + XTx[i][2];
- U[i][1] = vadd_f16(XTx[i][1], XTx[i][2]);
-
- // U[i][2] = XTx[i][2] - XTx[i][1];
- U[i][2] = vsub_f16(XTx[i][2], XTx[i][1]);
-
- // U[i][3] = XTx[i][1] - XTx[i][3];
- U[i][3] = vsub_f16(XTx[i][1], XTx[i][3]);
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- vst1_f16(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 4;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Load x
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = *(x_ptrs[i][j]++);
- }
- }
-
- // Compute XT . x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- XTx[0][j] = x[0][j] - x[2][j];
- XTx[1][j] = x[1][j] + x[2][j];
- XTx[2][j] = x[2][j] - x[1][j];
- XTx[3][j] = x[1][j] - x[3][j];
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- U[i][0] = XTx[i][0] - XTx[i][2];
- U[i][1] = XTx[i][1] + XTx[i][2];
- U[i][2] = XTx[i][2] - XTx[i][1];
- U[i][3] = XTx[i][1] - XTx[i][3];
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- *(outptr + m*matrix_stride) = U[i][j];
- }
- }
- outptr++;
- }
-}
-
-template class InputTransform<4, 4, __fp16, __fp16, WinogradRoots::Integers>;
-
-} // namespace winograd
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
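The scalar tail above makes the 4x4 transform explicit: it is the standard F(2x2, 3x3) input transform U = B^T X B with

\[
B^\mathsf{T} =
\begin{pmatrix}
1 &  0 & -1 &  0 \\
0 &  1 &  1 &  0 \\
0 & -1 &  1 &  0 \\
0 &  1 &  0 & -1
\end{pmatrix},
\]

applied first down the columns (XTx = B^T X) and then along the rows (U = XTx B). Every coefficient is 0 or +/-1, which is why the vector paths need only vadd/vsub.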
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp
deleted file mode 100644
index 69d3e8feb5..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "input.hpp"
-#include "arm.hpp"
-
-namespace winograd
-{
-
-template <>
-void InputTransform<4, 4, float, float, WinogradRoots::Integers>::transform_tile(
- const int n_channels,
- const float* const input_base,
- const int input_row_stride,
- const int input_col_stride,
- float* outptr,
- const int matrix_stride
-)
-{
- constexpr int inner_tile_rows = 4, inner_tile_cols = 4;
-
- // Get pointers into the input tile
- const float *x_ptrs[inner_tile_rows][inner_tile_cols];
- for (int i = 0, xi = 0; i < inner_tile_rows; i++, xi++)
- {
- // Get a pointer into the row
- const float* const row_ptr = input_base + xi*input_row_stride;
-
- for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
- {
- x_ptrs[i][j] = row_ptr + xj*input_col_stride;
- }
- }
-
- // Matrices used/computed in this kernel.
- float x[inner_tile_rows][inner_tile_cols];
- float XTx[inner_tile_rows][inner_tile_cols];
- float U[inner_tile_rows][inner_tile_cols];
-
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = XTx[i][j] = 0.0f;
- }
- }
-
- // Perform the Winograd input transformation for each channel in the input
- // tensor.
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used/computed in this kernel.
- float32x4_t x[inner_tile_rows][inner_tile_cols];
- float32x4_t XTx[inner_tile_rows][inner_tile_cols];
- float32x4_t U[inner_tile_rows][inner_tile_cols];
-
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vdupq_n_f32(0.0f);
- XTx[i][j] = vdupq_n_f32(0.0f);
- }
- }
-
- // Load x
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vld1q_f32(x_ptrs[i][j]);
- x_ptrs[i][j] += 4;
- }
- }
-
- // Compute XT . x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- // XTx[0][j] = x[0][j] - x[2][j];
- XTx[0][j] = vsubq_f32(x[0][j], x[2][j]);
-
- // XTx[1][j] = x[1][j] + x[2][j];
- XTx[1][j] = vaddq_f32(x[1][j], x[2][j]);
-
- // XTx[2][j] = x[2][j] - x[1][j];
- XTx[2][j] = vsubq_f32(x[2][j], x[1][j]);
-
- // XTx[3][j] = x[1][j] - x[3][j];
- XTx[3][j] = vsubq_f32(x[1][j], x[3][j]);
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- // U[i][0] = XTx[i][0] - XTx[i][2];
- U[i][0] = vsubq_f32(XTx[i][0], XTx[i][2]);
-
- // U[i][1] = XTx[i][1] + XTx[i][2];
- U[i][1] = vaddq_f32(XTx[i][1], XTx[i][2]);
-
- // U[i][2] = XTx[i][2] - XTx[i][1];
- U[i][2] = vsubq_f32(XTx[i][2], XTx[i][1]);
-
- // U[i][3] = XTx[i][1] - XTx[i][3];
- U[i][3] = vsubq_f32(XTx[i][1], XTx[i][3]);
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- vst1q_f32(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 4;
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used/computed in this kernel.
- float32x2_t x[inner_tile_rows][inner_tile_cols];
- float32x2_t XTx[inner_tile_rows][inner_tile_cols];
- float32x2_t U[inner_tile_rows][inner_tile_cols];
-
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vdup_n_f32(0.0f);
- XTx[i][j] = vdup_n_f32(0.0f);
- }
- }
-
- // Load x
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vld1_f32(x_ptrs[i][j]);
- x_ptrs[i][j] += 2;
- }
- }
-
- // Compute XT . x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- // XTx[0][j] = x[0][j] - x[2][j];
- XTx[0][j] = vsub_f32(x[0][j], x[2][j]);
-
- // XTx[1][j] = x[1][j] + x[2][j];
- XTx[1][j] = vadd_f32(x[1][j], x[2][j]);
-
- // XTx[2][j] = x[2][j] - x[1][j];
- XTx[2][j] = vsub_f32(x[2][j], x[1][j]);
-
- // XTx[3][j] = x[1][j] - x[3][j];
- XTx[3][j] = vsub_f32(x[1][j], x[3][j]);
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- // U[i][0] = XTx[i][0] - XTx[i][2];
- U[i][0] = vsub_f32(XTx[i][0], XTx[i][2]);
-
- // U[i][1] = XTx[i][1] + XTx[i][2];
- U[i][1] = vadd_f32(XTx[i][1], XTx[i][2]);
-
- // U[i][2] = XTx[i][2] - XTx[i][1];
- U[i][2] = vsub_f32(XTx[i][2], XTx[i][1]);
-
- // U[i][3] = XTx[i][1] - XTx[i][3];
- U[i][3] = vsub_f32(XTx[i][1], XTx[i][3]);
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- vst1_f32(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 2;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Load x
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = *(x_ptrs[i][j]++);
- }
- }
-
- // Compute XT . x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- XTx[0][j] = x[0][j] - x[2][j];
- XTx[1][j] = x[1][j] + x[2][j];
- XTx[2][j] = x[2][j] - x[1][j];
- XTx[3][j] = x[1][j] - x[3][j];
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- U[i][0] = XTx[i][0] - XTx[i][2];
- U[i][1] = XTx[i][1] + XTx[i][2];
- U[i][2] = XTx[i][2] - XTx[i][1];
- U[i][3] = XTx[i][1] - XTx[i][3];
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- *(outptr + m*matrix_stride) = U[i][j];
- }
- }
- outptr++;
- }
-}
-
-template class InputTransform<4, 4, float, float, WinogradRoots::Integers>;
-
-} // namespace winograd
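The fp16 and fp32 4x4 kernels above (and the 1x8 and 6x6 kernels around them) share one loop skeleton: consume channels with the widest vector type first, then a narrower one, then a scalar tail. A self-contained sketch of that skeleton, where process_block is a hypothetical stand-in for the per-channel transform arithmetic:

    // Same control flow as the kernels above: wide blocks first, scalar tail.
    template <int N>
    static void process_block(const float *in, float *out)
    {
        for (int c = 0; c < N; c++)
            out[c] = in[c]; // the real kernels compute B^T X B here
    }

    void transform_channels(const float *in, float *out, int n_channels)
    {
        int remaining = n_channels;
        for (; remaining >= 4; remaining -= 4, in += 4, out += 4)
            process_block<4>(in, out); // float32x4_t path in the real kernel
        for (; remaining >= 2; remaining -= 2, in += 2, out += 2)
            process_block<2>(in, out); // float32x2_t path
        for (; remaining; remaining--, in++, out++)
            process_block<1>(in, out); // scalar fallback
    }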
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp
deleted file mode 100644
index d0ce307988..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include "arm.hpp"
-#include "input.hpp"
-
-namespace winograd
-{
-template <>
-void InputTransform<6, 6, __fp16, __fp16, WinogradRoots::Integers>::transform_tile(
- const int n_channels,
- const __fp16* const input_base,
- const int input_row_stride,
- const int input_col_stride,
- __fp16* outptr,
- const int matrix_stride
-)
-{
- constexpr int inner_tile_rows = 6;
- constexpr int inner_tile_cols = 6;
-
- // Get pointers into the input tile
- const __fp16 *x_ptrs[inner_tile_rows][inner_tile_cols];
- for (int i = 0, xi = 0; i < inner_tile_rows; i++, xi++)
- {
- // Get a pointer into the row
- const __fp16* const row_ptr = input_base + xi*input_row_stride;
-
- for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
- {
- x_ptrs[i][j] = row_ptr + xj*input_col_stride;
- }
- }
-
- // Matrices used/computed in this kernel.
- __fp16 x[inner_tile_rows][inner_tile_cols];
- __fp16 XTx[inner_tile_rows][inner_tile_cols];
- __fp16 U[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = XTx[i][j] = 0.0f;
- }
- }
-
- // Perform the Winograd input transformation for each channel in the input
- // tensor.
- int channels_remaining = n_channels;
- for (; channels_remaining >= 8; channels_remaining -= 8)
- {
- // Matrices used/computed in this kernel
- float16x8_t x[inner_tile_rows][inner_tile_cols];
- float16x8_t XTx[inner_tile_rows][inner_tile_cols];
- float16x8_t U[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vdupq_n_f16(0.0f);
- XTx[i][j] = vdupq_n_f16(0.0f);
- }
- }
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vld1q_f16(x_ptrs[i][j]);
- x_ptrs[i][j] += 8;
- }
- }
-
- // Compute XT . x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[0][j] = vsubq_f16(vaddq_f16(x[4][j], vmulq_f16(x[0][j], vdupq_n_f16(4.0f))), vmulq_f16(x[2][j], vdupq_n_f16(5.0f)));
-
- // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[1][j] = vsubq_f16(vaddq_f16(x[3][j], x[4][j]), vmulq_f16(vaddq_f16(x[1][j], x[2][j]), vdupq_n_f16(4.0f)));
-
- // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[2][j] = vaddq_f16(vsubq_f16(x[4][j], x[3][j]), vmulq_f16(vsubq_f16(x[1][j], x[2][j]), vdupq_n_f16(4.0f)));
-
- // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[3][j] = vaddq_f16(vsubq_f16(x[4][j], x[2][j]), vmulq_f16(vsubq_f16(x[3][j], x[1][j]), vdupq_n_f16(2.0f)));
-
- // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[4][j] = vaddq_f16(vsubq_f16(x[4][j], x[2][j]), vmulq_f16(vsubq_f16(x[1][j], x[3][j]), vdupq_n_f16(2.0f)));
-
- // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- XTx[5][j] = vsubq_f16(vaddq_f16(x[5][j], vmulq_f16(x[1][j], vdupq_n_f16(4.0f))), vmulq_f16(x[3][j], vdupq_n_f16(5.0f)));
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][0] = vsubq_f16(vaddq_f16(XTx[i][4], vmulq_f16(XTx[i][0], vdupq_n_f16(4.0f))), vmulq_f16(XTx[i][2], vdupq_n_f16(5.0f)));
-
- // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][1] = vsubq_f16(vaddq_f16(XTx[i][3], XTx[i][4]), vmulq_f16(vaddq_f16(XTx[i][1], XTx[i][2]), vdupq_n_f16(4.0f)));
-
- // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = vaddq_f16(vsubq_f16(XTx[i][4], XTx[i][3]), vmulq_f16(vsubq_f16(XTx[i][1], XTx[i][2]), vdupq_n_f16(4.0f)));
-
- // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = vaddq_f16(vsubq_f16(XTx[i][4], XTx[i][2]), vmulq_f16(vsubq_f16(XTx[i][3], XTx[i][1]), vdupq_n_f16(2.0f)));
-
- // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = vaddq_f16(vsubq_f16(XTx[i][4], XTx[i][2]), vmulq_f16(vsubq_f16(XTx[i][1], XTx[i][3]), vdupq_n_f16(2.0f)));
-
- // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- U[i][5] = vsubq_f16(vaddq_f16(XTx[i][5], vmulq_f16(XTx[i][1], vdupq_n_f16(4.0f))), vmulq_f16(XTx[i][3], vdupq_n_f16(5.0f)));
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- vst1q_f16(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 8;
- }
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used/computed in this kernel
- float16x4_t x[inner_tile_rows][inner_tile_cols];
- float16x4_t XTx[inner_tile_rows][inner_tile_cols];
- float16x4_t U[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vdup_n_f16(0.0f);
- XTx[i][j] = vdup_n_f16(0.0f);
- }
- }
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vld1_f16(x_ptrs[i][j]);
- x_ptrs[i][j] += 4;
- }
- }
-
- // Compute XT . x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[0][j] = vsub_f16(vadd_f16(x[4][j], vmul_f16(x[0][j], vdup_n_f16(4.0f))), vmul_f16(x[2][j], vdup_n_f16(5.0f)));
-
- // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[1][j] = vsub_f16(vadd_f16(x[3][j], x[4][j]), vmul_f16(vadd_f16(x[1][j], x[2][j]), vdup_n_f16(4.0f)));
-
- // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[2][j] = vadd_f16(vsub_f16(x[4][j], x[3][j]), vmul_f16(vsub_f16(x[1][j], x[2][j]), vdup_n_f16(4.0f)));
-
- // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[3][j] = vadd_f16(vsub_f16(x[4][j], x[2][j]), vmul_f16(vsub_f16(x[3][j], x[1][j]), vdup_n_f16(2.0f)));
-
- // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[4][j] = vadd_f16(vsub_f16(x[4][j], x[2][j]), vmul_f16(vsub_f16(x[1][j], x[3][j]), vdup_n_f16(2.0f)));
-
- // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- XTx[5][j] = vsub_f16(vadd_f16(x[5][j], vmul_f16(x[1][j], vdup_n_f16(4.0f))), vmul_f16(x[3][j], vdup_n_f16(5.0f)));
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][0] = vsub_f16(vadd_f16(XTx[i][4], vmul_f16(XTx[i][0], vdup_n_f16(4.0f))), vmul_f16(XTx[i][2], vdup_n_f16(5.0f)));
-
- // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][1] = vsub_f16(vadd_f16(XTx[i][3], XTx[i][4]), vmul_f16(vadd_f16(XTx[i][1], XTx[i][2]), vdup_n_f16(4.0f)));
-
- // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = vadd_f16(vsub_f16(XTx[i][4], XTx[i][3]), vmul_f16(vsub_f16(XTx[i][1], XTx[i][2]), vdup_n_f16(4.0f)));
-
- // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = vadd_f16(vsub_f16(XTx[i][4], XTx[i][2]), vmul_f16(vsub_f16(XTx[i][3], XTx[i][1]), vdup_n_f16(2.0f)));
-
- // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = vadd_f16(vsub_f16(XTx[i][4], XTx[i][2]), vmul_f16(vsub_f16(XTx[i][1], XTx[i][3]), vdup_n_f16(2.0f)));
-
- // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- U[i][5] = vsub_f16(vadd_f16(XTx[i][5], vmul_f16(XTx[i][1], vdup_n_f16(4.0f))), vmul_f16(XTx[i][3], vdup_n_f16(5.0f)));
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- vst1_f16(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 4;
- }
- for (; channels_remaining; channels_remaining--)
- {
- // Load x
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = *(x_ptrs[i][j]++);
- }
- }
-
- // Compute XT . x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- *(outptr + m*matrix_stride) = U[i][j];
- }
- }
- outptr++;
- }
-}
-
-template class InputTransform<6, 6, __fp16, __fp16, WinogradRoots::Integers>;
-
-} // namespace winograd
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
\ No newline at end of file
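As with the 4x4 case, the scalar tail above pins down the 6x6 transform: it is the F(4x4, 3x3) input transform U = B^T X B with

\[
B^\mathsf{T} =
\begin{pmatrix}
4 &  0 & -5 &  0 & 1 & 0 \\
0 & -4 & -4 &  1 & 1 & 0 \\
0 &  4 & -4 & -1 & 1 & 0 \\
0 & -2 & -1 &  2 & 1 & 0 \\
0 &  2 & -1 & -2 & 1 & 0 \\
0 &  4 &  0 & -5 & 0 & 1
\end{pmatrix};
\]

the fp16 vector paths build each row from vdupq_n_f16 constants (2, 4, 5) combined with vadd/vsub/vmul rather than the fused multiply-by-scalar forms used in the fp32 1x8 kernel.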
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
deleted file mode 100644
index 0095e6c96b..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,1308 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm.hpp"
-#include "input.hpp"
-
-namespace winograd
-{
-
-#ifdef __aarch64__
-
-template <>
-void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile(
- int n_channels,
- const float* input_base,
- const int input_row_stride,
- const int input_col_stride,
- float* matrix_base,
- const int matrix_stride
-)
-{
- const float pcoeffs[4] = {1.0f, 2.0f, 4.0f, 5.0f};
- __asm__ __volatile__(
- "ldr q0, [%[pcoeffs]]\n"
- "add x25, %[inptr0], %[input_row_stride]\n"
- "add x9, %[input_col_stride1], %[input_col_stride1]\n"
- "add x16, x25, %[input_row_stride]\n"
- "add x19, x9, %[input_col_stride1]\n"
- "add x26, x16, %[input_row_stride]\n"
- "add x20, x19, %[input_col_stride1]\n"
- "add x17, x26, %[input_row_stride]\n"
- "add x21, x20, %[input_col_stride1]\n"
- "add x27, x17, %[input_row_stride]\n"
- "add x28, %[outptr0], %[output_row_stride]\n"
- "add x11, %[output_col_stride1], %[output_col_stride1]\n"
- "add x22, x28, %[output_row_stride]\n"
- "add x13, x11, %[output_col_stride1]\n"
- "add x12, x22, %[output_row_stride]\n"
- "add x23, x13, %[output_col_stride1]\n"
- "add x14, x12, %[output_row_stride]\n"
- "add x15, x23, %[output_col_stride1]\n"
- "add x24, x14, %[output_row_stride]\n"
- "cmp %w[n_channels], #4\n"
- "blt 2f\n"
- "1:\n"
- "ldr q8, [%[inptr0], x20]\n"
- "ldr q2, [%[inptr0], x9]\n"
- "mov v14.16b, v8.16b\n"
- "ldr q9, [%[inptr0]]\n"
- "mov v10.16b, v8.16b\n"
- "ldr q1, [%[inptr0], x21]\n"
- "fmla v14.4s, v9.4s, v0.s[2]\n"
- "ldr q4, [%[inptr0], x19]\n"
- "mov v9.16b, v8.16b\n"
- "ldr q12, [%[inptr0], %[input_col_stride1]]\n"
- "fmls v10.4s, v12.4s, v0.s[2]\n"
- "ldr q5, [x16, x20]\n"
- "fmls v14.4s, v2.4s, v0.s[3]\n"
- "ldr q20, [x16, x9]\n"
- "fmla v9.4s, v12.4s, v0.s[2]\n"
- "ldr q3, [x16]\n"
- "fmls v10.4s, v2.4s, v0.s[2]\n"
- "ldr q6, [x16, x21]\n"
- "mov v7.16b, v8.16b\n"
- "ldr q16, [x16, x19]\n"
- "fmls v9.4s, v2.4s, v0.s[2]\n"
- "ldr q22, [x16, %[input_col_stride1]]\n"
- "fadd v10.4s, v10.4s, v4.4s\n"
- "ldr q17, [x17, x20]\n"
- "fmls v7.4s, v12.4s, v0.s[1]\n"
- "ldr q15, [x17, x9]\n"
- "fsub v9.4s, v9.4s, v4.4s\n"
- "ldr q19, [x17]\n"
- "mov v8.16b, v8.16b\n"
- "ldr q18, [x17, x21]\n"
- "fsub v7.4s, v7.4s, v2.4s\n"
- "ldr q13, [x17, x19]\n"
- "fmla v7.4s, v4.4s, v0.s[1]\n"
- "ldr q21, [x17, %[input_col_stride1]]\n"
- "fmla v8.4s, v12.4s, v0.s[1]\n"
- "add %[inptr0], %[inptr0], #16\n"
- "mov v11.16b, v1.16b\n"
- "add x16, x16, #16\n"
- "mov v1.16b, v5.16b\n"
- "add x17, x17, #16\n"
- "fsub v8.4s, v8.4s, v2.4s\n"
- "fmla v11.4s, v12.4s, v0.s[2]\n"
- "fmls v8.4s, v4.4s, v0.s[1]\n"
- "fmla v1.4s, v3.4s, v0.s[2]\n"
- "mov v2.16b, v5.16b\n"
- "mov v3.16b, v5.16b\n"
- "fmls v11.4s, v4.4s, v0.s[3]\n"
- "mov v4.16b, v5.16b\n"
- "fmls v1.4s, v20.4s, v0.s[3]\n"
- "fmls v2.4s, v22.4s, v0.s[2]\n"
- "fmla v3.4s, v22.4s, v0.s[2]\n"
- "fmls v4.4s, v22.4s, v0.s[1]\n"
- "mov v5.16b, v5.16b\n"
- "mov v6.16b, v6.16b\n"
- "fmls v2.4s, v20.4s, v0.s[2]\n"
- "mov v12.16b, v17.16b\n"
- "fmls v3.4s, v20.4s, v0.s[2]\n"
- "fsub v4.4s, v4.4s, v20.4s\n"
- "fmla v4.4s, v16.4s, v0.s[1]\n"
- "fmla v5.4s, v22.4s, v0.s[1]\n"
- "fadd v2.4s, v2.4s, v16.4s\n"
- "fmla v6.4s, v22.4s, v0.s[2]\n"
- "fsub v3.4s, v3.4s, v16.4s\n"
- "fmla v12.4s, v19.4s, v0.s[2]\n"
- "fsub v5.4s, v5.4s, v20.4s\n"
- "mov v19.16b, v17.16b\n"
- "fmls v5.4s, v16.4s, v0.s[1]\n"
- "fmls v6.4s, v16.4s, v0.s[3]\n"
- "fmls v12.4s, v15.4s, v0.s[3]\n"
- "fmls v19.4s, v21.4s, v0.s[2]\n"
- "mov v20.16b, v17.16b\n"
- "mov v16.16b, v17.16b\n"
- "mov v17.16b, v17.16b\n"
- "mov v18.16b, v18.16b\n"
- "fmls v19.4s, v15.4s, v0.s[2]\n"
- "fmla v20.4s, v21.4s, v0.s[2]\n"
- "fmls v16.4s, v21.4s, v0.s[1]\n"
- "fmla v17.4s, v21.4s, v0.s[1]\n"
- "fmla v18.4s, v21.4s, v0.s[2]\n"
- "mov v23.16b, v12.16b\n"
- "fadd v19.4s, v19.4s, v13.4s\n"
- "fmls v20.4s, v15.4s, v0.s[2]\n"
- "fsub v16.4s, v16.4s, v15.4s\n"
- "fsub v17.4s, v17.4s, v15.4s\n"
- "fmla v16.4s, v13.4s, v0.s[1]\n"
- "fmls v17.4s, v13.4s, v0.s[1]\n"
- "fsub v20.4s, v20.4s, v13.4s\n"
- "fmls v18.4s, v13.4s, v0.s[3]\n"
- "fmla v23.4s, v14.4s, v0.s[2]\n"
- "mov v15.16b, v19.16b\n"
- "mov v14.16b, v20.16b\n"
- "mov v24.16b, v16.16b\n"
- "fmla v15.4s, v10.4s, v0.s[2]\n"
- "mov v10.16b, v17.16b\n"
- "fmls v23.4s, v1.4s, v0.s[3]\n"
- "fmla v14.4s, v9.4s, v0.s[2]\n"
- "fmla v24.4s, v7.4s, v0.s[2]\n"
- "fmla v10.4s, v8.4s, v0.s[2]\n"
- "fmls v15.4s, v2.4s, v0.s[3]\n"
- "mov v7.16b, v18.16b\n"
- "str q23, [%[outptr0]]\n"
- "fmls v14.4s, v3.4s, v0.s[3]\n"
- "fmls v24.4s, v4.4s, v0.s[3]\n"
- "fmls v10.4s, v5.4s, v0.s[3]\n"
- "str q15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v7.4s, v11.4s, v0.s[2]\n"
- "str q14, [%[outptr0], x11]\n"
- "str q24, [%[outptr0], x13]\n"
- "str q10, [%[outptr0], x23]\n"
- "fmls v7.4s, v6.4s, v0.s[3]\n"
- "str q7, [%[outptr0], x15]\n"
- "add %[outptr0], %[outptr0], #16\n"
- "mov v26.16b, v12.16b\n"
- "mov v25.16b, v19.16b\n"
- "ldr q11, [x25, x20]\n"
- "mov v10.16b, v11.16b\n"
- "ldr q23, [x25, x9]\n"
- "mov v9.16b, v11.16b\n"
- "ldr q7, [x25]\n"
- "fmla v10.4s, v7.4s, v0.s[2]\n"
- "ldr q13, [x25, x21]\n"
- "mov v7.16b, v11.16b\n"
- "ldr q31, [x25, x19]\n"
- "mov v8.16b, v11.16b\n"
- "ldr q21, [x25, %[input_col_stride1]]\n"
- "fmls v10.4s, v23.4s, v0.s[3]\n"
- "ldr q30, [x26, x20]\n"
- "fmls v9.4s, v21.4s, v0.s[2]\n"
- "ldr q29, [x26, x9]\n"
- "fmla v7.4s, v21.4s, v0.s[2]\n"
- "ldr q22, [x26]\n"
- "fmls v8.4s, v21.4s, v0.s[1]\n"
- "ldr q24, [x26, x21]\n"
- "fmls v9.4s, v23.4s, v0.s[2]\n"
- "ldr q27, [x26, x19]\n"
- "fmls v7.4s, v23.4s, v0.s[2]\n"
- "ldr q28, [x26, %[input_col_stride1]]\n"
- "fsub v8.4s, v8.4s, v23.4s\n"
- "add x25, x25, #16\n"
- "fadd v9.4s, v9.4s, v31.4s\n"
- "add x26, x26, #16\n"
- "fsub v7.4s, v7.4s, v31.4s\n"
- "fmla v8.4s, v31.4s, v0.s[1]\n"
- "mov v11.16b, v11.16b\n"
- "mov v15.16b, v13.16b\n"
- "mov v14.16b, v30.16b\n"
- "mov v13.16b, v30.16b\n"
- "fmla v11.4s, v21.4s, v0.s[1]\n"
- "fmla v15.4s, v21.4s, v0.s[2]\n"
- "fmla v14.4s, v22.4s, v0.s[2]\n"
- "fmls v13.4s, v28.4s, v0.s[2]\n"
- "mov v21.16b, v30.16b\n"
- "mov v22.16b, v30.16b\n"
- "fsub v11.4s, v11.4s, v23.4s\n"
- "fmls v15.4s, v31.4s, v0.s[3]\n"
- "fmls v11.4s, v31.4s, v0.s[1]\n"
- "fmls v14.4s, v29.4s, v0.s[3]\n"
- "fmls v13.4s, v29.4s, v0.s[2]\n"
- "fmla v21.4s, v28.4s, v0.s[2]\n"
- "fmls v22.4s, v28.4s, v0.s[1]\n"
- "mov v23.16b, v30.16b\n"
- "mov v24.16b, v24.16b\n"
- "fmls v26.4s, v10.4s, v0.s[2]\n"
- "fadd v13.4s, v13.4s, v27.4s\n"
- "fmls v21.4s, v29.4s, v0.s[2]\n"
- "fsub v22.4s, v22.4s, v29.4s\n"
- "fmla v23.4s, v28.4s, v0.s[1]\n"
- "fmla v22.4s, v27.4s, v0.s[1]\n"
- "fmla v24.4s, v28.4s, v0.s[2]\n"
- "fsub v21.4s, v21.4s, v27.4s\n"
- "fmls v26.4s, v1.4s, v0.s[2]\n"
- "fsub v23.4s, v23.4s, v29.4s\n"
- "fmls v25.4s, v9.4s, v0.s[2]\n"
- "fmls v23.4s, v27.4s, v0.s[1]\n"
- "fmls v24.4s, v27.4s, v0.s[3]\n"
- "fadd v26.4s, v26.4s, v14.4s\n"
- "mov v27.16b, v20.16b\n"
- "str q26, [x28]\n"
- "fmls v25.4s, v2.4s, v0.s[2]\n"
- "fmls v27.4s, v7.4s, v0.s[2]\n"
- "mov v31.16b, v16.16b\n"
- "mov v30.16b, v17.16b\n"
- "mov v29.16b, v18.16b\n"
- "fadd v25.4s, v25.4s, v13.4s\n"
- "fmls v31.4s, v8.4s, v0.s[2]\n"
- "str q25, [x28, %[output_col_stride1]]\n"
- "fmls v27.4s, v3.4s, v0.s[2]\n"
- "fmls v30.4s, v11.4s, v0.s[2]\n"
- "fmls v29.4s, v15.4s, v0.s[2]\n"
- "fmls v31.4s, v4.4s, v0.s[2]\n"
- "mov v26.16b, v12.16b\n"
- "fadd v27.4s, v27.4s, v21.4s\n"
- "mov v25.16b, v19.16b\n"
- "str q27, [x28, x11]\n"
- "fmls v30.4s, v5.4s, v0.s[2]\n"
- "fadd v31.4s, v31.4s, v22.4s\n"
- "fmls v29.4s, v6.4s, v0.s[2]\n"
- "str q31, [x28, x13]\n"
- "fmla v26.4s, v10.4s, v0.s[2]\n"
- "fadd v30.4s, v30.4s, v23.4s\n"
- "fmla v25.4s, v9.4s, v0.s[2]\n"
- "str q30, [x28, x23]\n"
- "fadd v29.4s, v29.4s, v24.4s\n"
- "str q29, [x28, x15]\n"
- "fmls v26.4s, v1.4s, v0.s[2]\n"
- "fmls v25.4s, v2.4s, v0.s[2]\n"
- "add x28, x28, #16\n"
- "mov v30.16b, v20.16b\n"
- "mov v29.16b, v16.16b\n"
- "fsub v26.4s, v26.4s, v14.4s\n"
- "mov v28.16b, v17.16b\n"
- "str q26, [x22]\n"
- "fsub v25.4s, v25.4s, v13.4s\n"
- "str q25, [x22, %[output_col_stride1]]\n"
- "fmla v30.4s, v7.4s, v0.s[2]\n"
- "fmla v29.4s, v8.4s, v0.s[2]\n"
- "fmla v28.4s, v11.4s, v0.s[2]\n"
- "mov v26.16b, v18.16b\n"
- "mov v25.16b, v12.16b\n"
- "fmls v30.4s, v3.4s, v0.s[2]\n"
- "mov v31.16b, v19.16b\n"
- "fmls v29.4s, v4.4s, v0.s[2]\n"
- "fmls v28.4s, v5.4s, v0.s[2]\n"
- "fmla v26.4s, v15.4s, v0.s[2]\n"
- "fmls v25.4s, v10.4s, v0.s[1]\n"
- "fsub v30.4s, v30.4s, v21.4s\n"
- "fmls v31.4s, v9.4s, v0.s[1]\n"
- "str q30, [x22, x11]\n"
- "fsub v29.4s, v29.4s, v22.4s\n"
- "str q29, [x22, x13]\n"
- "fsub v28.4s, v28.4s, v23.4s\n"
- "str q28, [x22, x23]\n"
- "fmls v26.4s, v6.4s, v0.s[2]\n"
- "fsub v25.4s, v25.4s, v1.4s\n"
- "fsub v31.4s, v31.4s, v2.4s\n"
- "fmla v25.4s, v14.4s, v0.s[1]\n"
- "fmla v31.4s, v13.4s, v0.s[1]\n"
- "fsub v26.4s, v26.4s, v24.4s\n"
- "mov v27.16b, v20.16b\n"
- "str q26, [x22, x15]\n"
- "mov v26.16b, v16.16b\n"
- "str q25, [x12]\n"
- "fmls v27.4s, v7.4s, v0.s[1]\n"
- "str q31, [x12, %[output_col_stride1]]\n"
- "fmls v26.4s, v8.4s, v0.s[1]\n"
- "mov v25.16b, v17.16b\n"
- "add x22, x22, #16\n"
- "fsub v27.4s, v27.4s, v3.4s\n"
- "mov v28.16b, v18.16b\n"
- "fmla v27.4s, v21.4s, v0.s[1]\n"
- "fsub v26.4s, v26.4s, v4.4s\n"
- "fmla v26.4s, v22.4s, v0.s[1]\n"
- "fmls v25.4s, v11.4s, v0.s[1]\n"
- "fmls v28.4s, v15.4s, v0.s[1]\n"
- "mov v12.16b, v12.16b\n"
- "str q27, [x12, x11]\n"
- "mov v19.16b, v19.16b\n"
- "str q26, [x12, x13]\n"
- "fsub v25.4s, v25.4s, v5.4s\n"
- "fmla v25.4s, v23.4s, v0.s[1]\n"
- "fsub v28.4s, v28.4s, v6.4s\n"
- "fmla v28.4s, v24.4s, v0.s[1]\n"
- "fmla v12.4s, v10.4s, v0.s[1]\n"
- "fmla v19.4s, v9.4s, v0.s[1]\n"
- "mov v20.16b, v20.16b\n"
- "str q25, [x12, x23]\n"
- "mov v16.16b, v16.16b\n"
- "str q28, [x12, x15]\n"
- "fsub v12.4s, v12.4s, v1.4s\n"
- "fmls v12.4s, v14.4s, v0.s[1]\n"
- "add x12, x12, #16\n"
- "fsub v19.4s, v19.4s, v2.4s\n"
- "fmla v20.4s, v7.4s, v0.s[1]\n"
- "fmls v19.4s, v13.4s, v0.s[1]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "str q12, [x14]\n"
- "mov v1.16b, v17.16b\n"
- "fsub v20.4s, v20.4s, v3.4s\n"
- "mov v17.16b, v18.16b\n"
- "str q19, [x14, %[output_col_stride1]]\n"
- "fmls v20.4s, v21.4s, v0.s[1]\n"
- "fsub v16.4s, v16.4s, v4.4s\n"
- "fmla v1.4s, v11.4s, v0.s[1]\n"
- "fmls v16.4s, v22.4s, v0.s[1]\n"
- "fmla v17.4s, v15.4s, v0.s[1]\n"
- "str q20, [x14, x11]\n"
- "fsub v1.4s, v1.4s, v5.4s\n"
- "str q16, [x14, x13]\n"
- "fmls v1.4s, v23.4s, v0.s[1]\n"
- "fsub v17.4s, v17.4s, v6.4s\n"
- "fmls v17.4s, v24.4s, v0.s[1]\n"
- "str q1, [x14, x23]\n"
- "str q17, [x14, x15]\n"
- "add x14, x14, #16\n"
- "ldr q2, [x27, x20]\n"
- "mov v4.16b, v2.16b\n"
- "ldr q17, [x27, x9]\n"
- "mov v12.16b, v2.16b\n"
- "ldr q18, [x27]\n"
- "fmla v4.4s, v18.4s, v0.s[2]\n"
- "ldr q3, [x27, x21]\n"
- "mov v6.16b, v2.16b\n"
- "ldr q5, [x27, x19]\n"
- "mov v1.16b, v2.16b\n"
- "ldr q18, [x27, %[input_col_stride1]]\n"
- "fmls v4.4s, v17.4s, v0.s[3]\n"
- "add x27, x27, #16\n"
- "fmls v12.4s, v18.4s, v0.s[2]\n"
- "sub %w[n_channels], %w[n_channels], #4\n"
- "fmla v6.4s, v18.4s, v0.s[2]\n"
- "cmp %w[n_channels], #4\n"
- "fmls v1.4s, v18.4s, v0.s[1]\n"
- "mov v2.16b, v2.16b\n"
- "fmls v12.4s, v17.4s, v0.s[2]\n"
- "mov v3.16b, v3.16b\n"
- "fmls v6.4s, v17.4s, v0.s[2]\n"
- "fmla v2.4s, v18.4s, v0.s[1]\n"
- "fsub v1.4s, v1.4s, v17.4s\n"
- "fmla v3.4s, v18.4s, v0.s[2]\n"
- "fadd v12.4s, v12.4s, v5.4s\n"
- "fmla v1.4s, v5.4s, v0.s[1]\n"
- "fsub v6.4s, v6.4s, v5.4s\n"
- "fsub v2.4s, v2.4s, v17.4s\n"
- "fmls v2.4s, v5.4s, v0.s[1]\n"
- "fmls v3.4s, v5.4s, v0.s[3]\n"
- "mov v4.16b, v4.16b\n"
- "mov v16.16b, v12.16b\n"
- "mov v5.16b, v6.16b\n"
- "mov v6.16b, v1.16b\n"
- "fmla v4.4s, v10.4s, v0.s[2]\n"
- "fmla v16.4s, v9.4s, v0.s[2]\n"
- "fmla v5.4s, v7.4s, v0.s[2]\n"
- "fmla v6.4s, v8.4s, v0.s[2]\n"
- "mov v9.16b, v2.16b\n"
- "mov v10.16b, v3.16b\n"
- "fmls v4.4s, v14.4s, v0.s[3]\n"
- "fmls v16.4s, v13.4s, v0.s[3]\n"
- "fmls v5.4s, v21.4s, v0.s[3]\n"
- "fmls v6.4s, v22.4s, v0.s[3]\n"
- "fmla v9.4s, v11.4s, v0.s[2]\n"
- "fmla v10.4s, v15.4s, v0.s[2]\n"
- "str q4, [x24]\n"
- "str q16, [x24, %[output_col_stride1]]\n"
- "str q5, [x24, x11]\n"
- "str q6, [x24, x13]\n"
- "fmls v9.4s, v23.4s, v0.s[3]\n"
- "fmls v10.4s, v24.4s, v0.s[3]\n"
- "str q9, [x24, x23]\n"
- "str q10, [x24, x15]\n"
- "add x24, x24, #16\n"
- "bge 1b\n"
- "2:\n"
- "cmp %w[n_channels], #2\n"
- "blt 3f\n"
- "ldr d8, [%[inptr0], x20]\n"
- "mov v14.16b, v8.16b\n"
- "ldr d2, [%[inptr0], x9]\n"
- "mov v10.16b, v8.16b\n"
- "ldr d9, [%[inptr0]]\n"
- "fmla v14.4s, v9.4s, v0.s[2]\n"
- "ldr d1, [%[inptr0], x21]\n"
- "mov v9.16b, v8.16b\n"
- "ldr d4, [%[inptr0], x19]\n"
- "mov v7.16b, v8.16b\n"
- "ldr d12, [%[inptr0], %[input_col_stride1]]\n"
- "fmls v14.4s, v2.4s, v0.s[3]\n"
- "ldr d5, [x16, x20]\n"
- "fmls v10.4s, v12.4s, v0.s[2]\n"
- "ldr d20, [x16, x9]\n"
- "fmla v9.4s, v12.4s, v0.s[2]\n"
- "ldr d3, [x16]\n"
- "fmls v7.4s, v12.4s, v0.s[1]\n"
- "ldr d6, [x16, x21]\n"
- "fmls v10.4s, v2.4s, v0.s[2]\n"
- "ldr d16, [x16, x19]\n"
- "fmls v9.4s, v2.4s, v0.s[2]\n"
- "ldr d22, [x16, %[input_col_stride1]]\n"
- "fsub v7.4s, v7.4s, v2.4s\n"
- "ldr d17, [x17, x20]\n"
- "fadd v10.4s, v10.4s, v4.4s\n"
- "ldr d15, [x17, x9]\n"
- "fsub v9.4s, v9.4s, v4.4s\n"
- "ldr d19, [x17]\n"
- "fmla v7.4s, v4.4s, v0.s[1]\n"
- "ldr d18, [x17, x21]\n"
- "mov v8.16b, v8.16b\n"
- "ldr d13, [x17, x19]\n"
- "mov v11.16b, v1.16b\n"
- "ldr d21, [x17, %[input_col_stride1]]\n"
- "fmla v8.4s, v12.4s, v0.s[1]\n"
- "add %[inptr0], %[inptr0], #8\n"
- "fmla v11.4s, v12.4s, v0.s[2]\n"
- "add x16, x16, #8\n"
- "mov v1.16b, v5.16b\n"
- "add x17, x17, #8\n"
- "fsub v8.4s, v8.4s, v2.4s\n"
- "mov v2.16b, v5.16b\n"
- "fmls v8.4s, v4.4s, v0.s[1]\n"
- "fmls v11.4s, v4.4s, v0.s[3]\n"
- "fmla v1.4s, v3.4s, v0.s[2]\n"
- "fmls v2.4s, v22.4s, v0.s[2]\n"
- "mov v3.16b, v5.16b\n"
- "mov v4.16b, v5.16b\n"
- "mov v5.16b, v5.16b\n"
- "mov v6.16b, v6.16b\n"
- "fmls v1.4s, v20.4s, v0.s[3]\n"
- "fmls v2.4s, v20.4s, v0.s[2]\n"
- "fmla v3.4s, v22.4s, v0.s[2]\n"
- "fmls v4.4s, v22.4s, v0.s[1]\n"
- "fmla v5.4s, v22.4s, v0.s[1]\n"
- "fmla v6.4s, v22.4s, v0.s[2]\n"
- "fadd v2.4s, v2.4s, v16.4s\n"
- "mov v12.16b, v17.16b\n"
- "fmls v3.4s, v20.4s, v0.s[2]\n"
- "fsub v4.4s, v4.4s, v20.4s\n"
- "fmla v4.4s, v16.4s, v0.s[1]\n"
- "fsub v5.4s, v5.4s, v20.4s\n"
- "fmls v5.4s, v16.4s, v0.s[1]\n"
- "fmls v6.4s, v16.4s, v0.s[3]\n"
- "fsub v3.4s, v3.4s, v16.4s\n"
- "fmla v12.4s, v19.4s, v0.s[2]\n"
- "mov v19.16b, v17.16b\n"
- "mov v20.16b, v17.16b\n"
- "mov v16.16b, v17.16b\n"
- "mov v17.16b, v17.16b\n"
- "fmls v12.4s, v15.4s, v0.s[3]\n"
- "fmls v19.4s, v21.4s, v0.s[2]\n"
- "fmla v20.4s, v21.4s, v0.s[2]\n"
- "fmls v16.4s, v21.4s, v0.s[1]\n"
- "fmla v17.4s, v21.4s, v0.s[1]\n"
- "mov v18.16b, v18.16b\n"
- "fmls v19.4s, v15.4s, v0.s[2]\n"
- "mov v23.16b, v12.16b\n"
- "fmls v20.4s, v15.4s, v0.s[2]\n"
- "fsub v16.4s, v16.4s, v15.4s\n"
- "fmla v16.4s, v13.4s, v0.s[1]\n"
- "fsub v17.4s, v17.4s, v15.4s\n"
- "fadd v19.4s, v19.4s, v13.4s\n"
- "fmls v17.4s, v13.4s, v0.s[1]\n"
- "fsub v20.4s, v20.4s, v13.4s\n"
- "fmla v18.4s, v21.4s, v0.s[2]\n"
- "fmla v23.4s, v14.4s, v0.s[2]\n"
- "mov v15.16b, v19.16b\n"
- "mov v14.16b, v20.16b\n"
- "mov v24.16b, v16.16b\n"
- "fmls v18.4s, v13.4s, v0.s[3]\n"
- "fmla v15.4s, v10.4s, v0.s[2]\n"
- "fmls v23.4s, v1.4s, v0.s[3]\n"
- "fmla v14.4s, v9.4s, v0.s[2]\n"
- "fmla v24.4s, v7.4s, v0.s[2]\n"
- "mov v10.16b, v17.16b\n"
- "fmls v15.4s, v2.4s, v0.s[3]\n"
- "mov v7.16b, v18.16b\n"
- "str d23, [%[outptr0]]\n"
- "fmls v14.4s, v3.4s, v0.s[3]\n"
- "fmls v24.4s, v4.4s, v0.s[3]\n"
- "fmla v10.4s, v8.4s, v0.s[2]\n"
- "str d15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v7.4s, v11.4s, v0.s[2]\n"
- "str d14, [%[outptr0], x11]\n"
- "fmls v10.4s, v5.4s, v0.s[3]\n"
- "str d24, [%[outptr0], x13]\n"
- "fmls v7.4s, v6.4s, v0.s[3]\n"
- "str d10, [%[outptr0], x23]\n"
- "str d7, [%[outptr0], x15]\n"
- "add %[outptr0], %[outptr0], #8\n"
- "mov v26.16b, v12.16b\n"
- "mov v25.16b, v19.16b\n"
- "ldr d11, [x25, x20]\n"
- "mov v10.16b, v11.16b\n"
- "ldr d23, [x25, x9]\n"
- "mov v9.16b, v11.16b\n"
- "ldr d7, [x25]\n"
- "fmla v10.4s, v7.4s, v0.s[2]\n"
- "ldr d13, [x25, x21]\n"
- "mov v7.16b, v11.16b\n"
- "ldr d31, [x25, x19]\n"
- "mov v8.16b, v11.16b\n"
- "ldr d21, [x25, %[input_col_stride1]]\n"
- "fmls v10.4s, v23.4s, v0.s[3]\n"
- "ldr d30, [x26, x20]\n"
- "fmls v9.4s, v21.4s, v0.s[2]\n"
- "ldr d29, [x26, x9]\n"
- "fmla v7.4s, v21.4s, v0.s[2]\n"
- "ldr d22, [x26]\n"
- "fmls v8.4s, v21.4s, v0.s[1]\n"
- "ldr d24, [x26, x21]\n"
- "fmls v9.4s, v23.4s, v0.s[2]\n"
- "ldr d27, [x26, x19]\n"
- "fmls v7.4s, v23.4s, v0.s[2]\n"
- "ldr d28, [x26, %[input_col_stride1]]\n"
- "fsub v8.4s, v8.4s, v23.4s\n"
- "add x25, x25, #8\n"
- "fadd v9.4s, v9.4s, v31.4s\n"
- "add x26, x26, #8\n"
- "fsub v7.4s, v7.4s, v31.4s\n"
- "fmla v8.4s, v31.4s, v0.s[1]\n"
- "mov v11.16b, v11.16b\n"
- "mov v15.16b, v13.16b\n"
- "mov v14.16b, v30.16b\n"
- "mov v13.16b, v30.16b\n"
- "fmla v11.4s, v21.4s, v0.s[1]\n"
- "fmla v15.4s, v21.4s, v0.s[2]\n"
- "fmla v14.4s, v22.4s, v0.s[2]\n"
- "fmls v13.4s, v28.4s, v0.s[2]\n"
- "mov v21.16b, v30.16b\n"
- "mov v22.16b, v30.16b\n"
- "fsub v11.4s, v11.4s, v23.4s\n"
- "fmls v15.4s, v31.4s, v0.s[3]\n"
- "fmls v11.4s, v31.4s, v0.s[1]\n"
- "fmls v14.4s, v29.4s, v0.s[3]\n"
- "fmls v13.4s, v29.4s, v0.s[2]\n"
- "fmla v21.4s, v28.4s, v0.s[2]\n"
- "fmls v22.4s, v28.4s, v0.s[1]\n"
- "mov v23.16b, v30.16b\n"
- "mov v24.16b, v24.16b\n"
- "fmls v26.4s, v10.4s, v0.s[2]\n"
- "fadd v13.4s, v13.4s, v27.4s\n"
- "fmls v21.4s, v29.4s, v0.s[2]\n"
- "fsub v22.4s, v22.4s, v29.4s\n"
- "fmla v23.4s, v28.4s, v0.s[1]\n"
- "fmla v22.4s, v27.4s, v0.s[1]\n"
- "fmla v24.4s, v28.4s, v0.s[2]\n"
- "fsub v21.4s, v21.4s, v27.4s\n"
- "fmls v26.4s, v1.4s, v0.s[2]\n"
- "fsub v23.4s, v23.4s, v29.4s\n"
- "fmls v25.4s, v9.4s, v0.s[2]\n"
- "fmls v23.4s, v27.4s, v0.s[1]\n"
- "fmls v24.4s, v27.4s, v0.s[3]\n"
- "fadd v26.4s, v26.4s, v14.4s\n"
- "mov v27.16b, v20.16b\n"
- "str d26, [x28]\n"
- "fmls v25.4s, v2.4s, v0.s[2]\n"
- "fmls v27.4s, v7.4s, v0.s[2]\n"
- "mov v31.16b, v16.16b\n"
- "mov v30.16b, v17.16b\n"
- "mov v29.16b, v18.16b\n"
- "fadd v25.4s, v25.4s, v13.4s\n"
- "fmls v31.4s, v8.4s, v0.s[2]\n"
- "str d25, [x28, %[output_col_stride1]]\n"
- "fmls v27.4s, v3.4s, v0.s[2]\n"
- "fmls v30.4s, v11.4s, v0.s[2]\n"
- "fmls v29.4s, v15.4s, v0.s[2]\n"
- "fmls v31.4s, v4.4s, v0.s[2]\n"
- "mov v26.16b, v12.16b\n"
- "fadd v27.4s, v27.4s, v21.4s\n"
- "mov v25.16b, v19.16b\n"
- "str d27, [x28, x11]\n"
- "fmls v30.4s, v5.4s, v0.s[2]\n"
- "fadd v31.4s, v31.4s, v22.4s\n"
- "fmls v29.4s, v6.4s, v0.s[2]\n"
- "str d31, [x28, x13]\n"
- "fmla v26.4s, v10.4s, v0.s[2]\n"
- "fadd v30.4s, v30.4s, v23.4s\n"
- "fmla v25.4s, v9.4s, v0.s[2]\n"
- "str d30, [x28, x23]\n"
- "fadd v29.4s, v29.4s, v24.4s\n"
- "str d29, [x28, x15]\n"
- "fmls v26.4s, v1.4s, v0.s[2]\n"
- "fmls v25.4s, v2.4s, v0.s[2]\n"
- "add x28, x28, #8\n"
- "mov v30.16b, v20.16b\n"
- "mov v29.16b, v16.16b\n"
- "fsub v26.4s, v26.4s, v14.4s\n"
- "mov v28.16b, v17.16b\n"
- "str d26, [x22]\n"
- "fsub v25.4s, v25.4s, v13.4s\n"
- "str d25, [x22, %[output_col_stride1]]\n"
- "fmla v30.4s, v7.4s, v0.s[2]\n"
- "fmla v29.4s, v8.4s, v0.s[2]\n"
- "fmla v28.4s, v11.4s, v0.s[2]\n"
- "mov v26.16b, v18.16b\n"
- "mov v25.16b, v12.16b\n"
- "fmls v30.4s, v3.4s, v0.s[2]\n"
- "mov v31.16b, v19.16b\n"
- "fmls v29.4s, v4.4s, v0.s[2]\n"
- "fmls v28.4s, v5.4s, v0.s[2]\n"
- "fmla v26.4s, v15.4s, v0.s[2]\n"
- "fmls v25.4s, v10.4s, v0.s[1]\n"
- "fsub v30.4s, v30.4s, v21.4s\n"
- "fmls v31.4s, v9.4s, v0.s[1]\n"
- "str d30, [x22, x11]\n"
- "fsub v29.4s, v29.4s, v22.4s\n"
- "str d29, [x22, x13]\n"
- "fsub v28.4s, v28.4s, v23.4s\n"
- "str d28, [x22, x23]\n"
- "fmls v26.4s, v6.4s, v0.s[2]\n"
- "fsub v25.4s, v25.4s, v1.4s\n"
- "fsub v31.4s, v31.4s, v2.4s\n"
- "fmla v25.4s, v14.4s, v0.s[1]\n"
- "fmla v31.4s, v13.4s, v0.s[1]\n"
- "fsub v26.4s, v26.4s, v24.4s\n"
- "mov v27.16b, v20.16b\n"
- "str d26, [x22, x15]\n"
- "mov v26.16b, v16.16b\n"
- "str d25, [x12]\n"
- "fmls v27.4s, v7.4s, v0.s[1]\n"
- "str d31, [x12, %[output_col_stride1]]\n"
- "fmls v26.4s, v8.4s, v0.s[1]\n"
- "mov v25.16b, v17.16b\n"
- "add x22, x22, #8\n"
- "fsub v27.4s, v27.4s, v3.4s\n"
- "mov v28.16b, v18.16b\n"
- "fmla v27.4s, v21.4s, v0.s[1]\n"
- "fsub v26.4s, v26.4s, v4.4s\n"
- "fmla v26.4s, v22.4s, v0.s[1]\n"
- "fmls v25.4s, v11.4s, v0.s[1]\n"
- "fmls v28.4s, v15.4s, v0.s[1]\n"
- "mov v12.16b, v12.16b\n"
- "str d27, [x12, x11]\n"
- "mov v19.16b, v19.16b\n"
- "str d26, [x12, x13]\n"
- "fsub v25.4s, v25.4s, v5.4s\n"
- "fmla v25.4s, v23.4s, v0.s[1]\n"
- "fsub v28.4s, v28.4s, v6.4s\n"
- "fmla v28.4s, v24.4s, v0.s[1]\n"
- "fmla v12.4s, v10.4s, v0.s[1]\n"
- "fmla v19.4s, v9.4s, v0.s[1]\n"
- "mov v20.16b, v20.16b\n"
- "str d25, [x12, x23]\n"
- "mov v16.16b, v16.16b\n"
- "str d28, [x12, x15]\n"
- "fsub v12.4s, v12.4s, v1.4s\n"
- "fmls v12.4s, v14.4s, v0.s[1]\n"
- "add x12, x12, #8\n"
- "fsub v19.4s, v19.4s, v2.4s\n"
- "fmla v20.4s, v7.4s, v0.s[1]\n"
- "fmls v19.4s, v13.4s, v0.s[1]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "str d12, [x14]\n"
- "mov v1.16b, v17.16b\n"
- "fsub v20.4s, v20.4s, v3.4s\n"
- "mov v17.16b, v18.16b\n"
- "str d19, [x14, %[output_col_stride1]]\n"
- "fmls v20.4s, v21.4s, v0.s[1]\n"
- "fsub v16.4s, v16.4s, v4.4s\n"
- "fmla v1.4s, v11.4s, v0.s[1]\n"
- "fmls v16.4s, v22.4s, v0.s[1]\n"
- "fmla v17.4s, v15.4s, v0.s[1]\n"
- "str d20, [x14, x11]\n"
- "fsub v1.4s, v1.4s, v5.4s\n"
- "str d16, [x14, x13]\n"
- "fmls v1.4s, v23.4s, v0.s[1]\n"
- "fsub v17.4s, v17.4s, v6.4s\n"
- "fmls v17.4s, v24.4s, v0.s[1]\n"
- "str d1, [x14, x23]\n"
- "str d17, [x14, x15]\n"
- "add x14, x14, #8\n"
- "ldr d2, [x27, x20]\n"
- "mov v4.16b, v2.16b\n"
- "ldr d17, [x27, x9]\n"
- "mov v12.16b, v2.16b\n"
- "ldr d18, [x27]\n"
- "fmla v4.4s, v18.4s, v0.s[2]\n"
- "ldr d3, [x27, x21]\n"
- "mov v6.16b, v2.16b\n"
- "ldr d5, [x27, x19]\n"
- "mov v1.16b, v2.16b\n"
- "ldr d18, [x27, %[input_col_stride1]]\n"
- "fmls v4.4s, v17.4s, v0.s[3]\n"
- "add x27, x27, #8\n"
- "fmls v12.4s, v18.4s, v0.s[2]\n"
- "sub %w[n_channels], %w[n_channels], #2\n"
- "fmla v6.4s, v18.4s, v0.s[2]\n"
- "fmls v1.4s, v18.4s, v0.s[1]\n"
- "mov v2.16b, v2.16b\n"
- "mov v3.16b, v3.16b\n"
- "fmls v12.4s, v17.4s, v0.s[2]\n"
- "mov v4.16b, v4.16b\n"
- "fmls v6.4s, v17.4s, v0.s[2]\n"
- "fsub v1.4s, v1.4s, v17.4s\n"
- "fmla v1.4s, v5.4s, v0.s[1]\n"
- "fmla v2.4s, v18.4s, v0.s[1]\n"
- "fadd v12.4s, v12.4s, v5.4s\n"
- "fmla v3.4s, v18.4s, v0.s[2]\n"
- "fsub v6.4s, v6.4s, v5.4s\n"
- "fmla v4.4s, v10.4s, v0.s[2]\n"
- "fsub v2.4s, v2.4s, v17.4s\n"
- "mov v16.16b, v12.16b\n"
- "fmls v2.4s, v5.4s, v0.s[1]\n"
- "fmls v3.4s, v5.4s, v0.s[3]\n"
- "fmls v4.4s, v14.4s, v0.s[3]\n"
- "fmla v16.4s, v9.4s, v0.s[2]\n"
- "mov v5.16b, v6.16b\n"
- "mov v6.16b, v1.16b\n"
- "mov v9.16b, v2.16b\n"
- "mov v10.16b, v3.16b\n"
- "str d4, [x24]\n"
- "fmls v16.4s, v13.4s, v0.s[3]\n"
- "fmla v5.4s, v7.4s, v0.s[2]\n"
- "fmla v6.4s, v8.4s, v0.s[2]\n"
- "fmla v9.4s, v11.4s, v0.s[2]\n"
- "fmla v10.4s, v15.4s, v0.s[2]\n"
- "str d16, [x24, %[output_col_stride1]]\n"
- "fmls v5.4s, v21.4s, v0.s[3]\n"
- "fmls v6.4s, v22.4s, v0.s[3]\n"
- "fmls v9.4s, v23.4s, v0.s[3]\n"
- "fmls v10.4s, v24.4s, v0.s[3]\n"
- "str d5, [x24, x11]\n"
- "str d6, [x24, x13]\n"
- "str d9, [x24, x23]\n"
- "str d10, [x24, x15]\n"
- "add x24, x24, #8\n"
- "3:\n"
- "cbz %w[n_channels], 4f\n"
- "ldr s8, [%[inptr0], x20]\n"
- "mov v14.16b, v8.16b\n"
- "ldr s2, [%[inptr0], x9]\n"
- "mov v10.16b, v8.16b\n"
- "ldr s9, [%[inptr0]]\n"
- "fmla v14.4s, v9.4s, v0.s[2]\n"
- "ldr s1, [%[inptr0], x21]\n"
- "mov v9.16b, v8.16b\n"
- "ldr s4, [%[inptr0], x19]\n"
- "mov v7.16b, v8.16b\n"
- "ldr s12, [%[inptr0], %[input_col_stride1]]\n"
- "fmls v14.4s, v2.4s, v0.s[3]\n"
- "ldr s5, [x16, x20]\n"
- "fmls v10.4s, v12.4s, v0.s[2]\n"
- "ldr s20, [x16, x9]\n"
- "fmla v9.4s, v12.4s, v0.s[2]\n"
- "ldr s3, [x16]\n"
- "fmls v7.4s, v12.4s, v0.s[1]\n"
- "ldr s6, [x16, x21]\n"
- "fmls v10.4s, v2.4s, v0.s[2]\n"
- "ldr s16, [x16, x19]\n"
- "fmls v9.4s, v2.4s, v0.s[2]\n"
- "ldr s22, [x16, %[input_col_stride1]]\n"
- "fsub v7.4s, v7.4s, v2.4s\n"
- "ldr s17, [x17, x20]\n"
- "fadd v10.4s, v10.4s, v4.4s\n"
- "ldr s15, [x17, x9]\n"
- "fsub v9.4s, v9.4s, v4.4s\n"
- "ldr s19, [x17]\n"
- "fmla v7.4s, v4.4s, v0.s[1]\n"
- "ldr s18, [x17, x21]\n"
- "mov v8.16b, v8.16b\n"
- "ldr s13, [x17, x19]\n"
- "mov v11.16b, v1.16b\n"
- "ldr s21, [x17, %[input_col_stride1]]\n"
- "fmla v8.4s, v12.4s, v0.s[1]\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v11.4s, v12.4s, v0.s[2]\n"
- "add x16, x16, #4\n"
- "mov v1.16b, v5.16b\n"
- "add x17, x17, #4\n"
- "fsub v8.4s, v8.4s, v2.4s\n"
- "mov v2.16b, v5.16b\n"
- "fmls v8.4s, v4.4s, v0.s[1]\n"
- "fmls v11.4s, v4.4s, v0.s[3]\n"
- "fmla v1.4s, v3.4s, v0.s[2]\n"
- "fmls v2.4s, v22.4s, v0.s[2]\n"
- "mov v3.16b, v5.16b\n"
- "mov v4.16b, v5.16b\n"
- "mov v5.16b, v5.16b\n"
- "mov v6.16b, v6.16b\n"
- "fmls v1.4s, v20.4s, v0.s[3]\n"
- "fmls v2.4s, v20.4s, v0.s[2]\n"
- "fmla v3.4s, v22.4s, v0.s[2]\n"
- "fmls v4.4s, v22.4s, v0.s[1]\n"
- "fmla v5.4s, v22.4s, v0.s[1]\n"
- "fmla v6.4s, v22.4s, v0.s[2]\n"
- "fadd v2.4s, v2.4s, v16.4s\n"
- "mov v12.16b, v17.16b\n"
- "fmls v3.4s, v20.4s, v0.s[2]\n"
- "fsub v4.4s, v4.4s, v20.4s\n"
- "fmla v4.4s, v16.4s, v0.s[1]\n"
- "fsub v5.4s, v5.4s, v20.4s\n"
- "fmls v5.4s, v16.4s, v0.s[1]\n"
- "fmls v6.4s, v16.4s, v0.s[3]\n"
- "fsub v3.4s, v3.4s, v16.4s\n"
- "fmla v12.4s, v19.4s, v0.s[2]\n"
- "mov v19.16b, v17.16b\n"
- "mov v20.16b, v17.16b\n"
- "mov v16.16b, v17.16b\n"
- "mov v17.16b, v17.16b\n"
- "fmls v12.4s, v15.4s, v0.s[3]\n"
- "fmls v19.4s, v21.4s, v0.s[2]\n"
- "fmla v20.4s, v21.4s, v0.s[2]\n"
- "fmls v16.4s, v21.4s, v0.s[1]\n"
- "fmla v17.4s, v21.4s, v0.s[1]\n"
- "mov v18.16b, v18.16b\n"
- "fmls v19.4s, v15.4s, v0.s[2]\n"
- "mov v23.16b, v12.16b\n"
- "fmls v20.4s, v15.4s, v0.s[2]\n"
- "fsub v16.4s, v16.4s, v15.4s\n"
- "fmla v16.4s, v13.4s, v0.s[1]\n"
- "fsub v17.4s, v17.4s, v15.4s\n"
- "fadd v19.4s, v19.4s, v13.4s\n"
- "fmls v17.4s, v13.4s, v0.s[1]\n"
- "fsub v20.4s, v20.4s, v13.4s\n"
- "fmla v18.4s, v21.4s, v0.s[2]\n"
- "fmla v23.4s, v14.4s, v0.s[2]\n"
- "mov v15.16b, v19.16b\n"
- "mov v14.16b, v20.16b\n"
- "mov v24.16b, v16.16b\n"
- "fmls v18.4s, v13.4s, v0.s[3]\n"
- "fmla v15.4s, v10.4s, v0.s[2]\n"
- "fmls v23.4s, v1.4s, v0.s[3]\n"
- "fmla v14.4s, v9.4s, v0.s[2]\n"
- "fmla v24.4s, v7.4s, v0.s[2]\n"
- "mov v10.16b, v17.16b\n"
- "fmls v15.4s, v2.4s, v0.s[3]\n"
- "mov v7.16b, v18.16b\n"
- "str s23, [%[outptr0]]\n"
- "fmls v14.4s, v3.4s, v0.s[3]\n"
- "fmls v24.4s, v4.4s, v0.s[3]\n"
- "fmla v10.4s, v8.4s, v0.s[2]\n"
- "str s15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v7.4s, v11.4s, v0.s[2]\n"
- "str s14, [%[outptr0], x11]\n"
- "fmls v10.4s, v5.4s, v0.s[3]\n"
- "str s24, [%[outptr0], x13]\n"
- "fmls v7.4s, v6.4s, v0.s[3]\n"
- "str s10, [%[outptr0], x23]\n"
- "str s7, [%[outptr0], x15]\n"
- "add %[outptr0], %[outptr0], #4\n"
- "mov v26.16b, v12.16b\n"
- "mov v25.16b, v19.16b\n"
- "ldr s11, [x25, x20]\n"
- "mov v10.16b, v11.16b\n"
- "ldr s23, [x25, x9]\n"
- "mov v9.16b, v11.16b\n"
- "ldr s7, [x25]\n"
- "fmla v10.4s, v7.4s, v0.s[2]\n"
- "ldr s13, [x25, x21]\n"
- "mov v7.16b, v11.16b\n"
- "ldr s31, [x25, x19]\n"
- "mov v8.16b, v11.16b\n"
- "ldr s21, [x25, %[input_col_stride1]]\n"
- "fmls v10.4s, v23.4s, v0.s[3]\n"
- "ldr s30, [x26, x20]\n"
- "fmls v9.4s, v21.4s, v0.s[2]\n"
- "ldr s29, [x26, x9]\n"
- "fmla v7.4s, v21.4s, v0.s[2]\n"
- "ldr s22, [x26]\n"
- "fmls v8.4s, v21.4s, v0.s[1]\n"
- "ldr s24, [x26, x21]\n"
- "fmls v9.4s, v23.4s, v0.s[2]\n"
- "ldr s27, [x26, x19]\n"
- "fmls v7.4s, v23.4s, v0.s[2]\n"
- "ldr s28, [x26, %[input_col_stride1]]\n"
- "fsub v8.4s, v8.4s, v23.4s\n"
- "add x25, x25, #4\n"
- "fadd v9.4s, v9.4s, v31.4s\n"
- "add x26, x26, #4\n"
- "fsub v7.4s, v7.4s, v31.4s\n"
- "fmla v8.4s, v31.4s, v0.s[1]\n"
- "mov v11.16b, v11.16b\n"
- "mov v15.16b, v13.16b\n"
- "mov v14.16b, v30.16b\n"
- "mov v13.16b, v30.16b\n"
- "fmla v11.4s, v21.4s, v0.s[1]\n"
- "fmla v15.4s, v21.4s, v0.s[2]\n"
- "fmla v14.4s, v22.4s, v0.s[2]\n"
- "fmls v13.4s, v28.4s, v0.s[2]\n"
- "mov v21.16b, v30.16b\n"
- "mov v22.16b, v30.16b\n"
- "fsub v11.4s, v11.4s, v23.4s\n"
- "fmls v15.4s, v31.4s, v0.s[3]\n"
- "fmls v11.4s, v31.4s, v0.s[1]\n"
- "fmls v14.4s, v29.4s, v0.s[3]\n"
- "fmls v13.4s, v29.4s, v0.s[2]\n"
- "fmla v21.4s, v28.4s, v0.s[2]\n"
- "fmls v22.4s, v28.4s, v0.s[1]\n"
- "mov v23.16b, v30.16b\n"
- "mov v24.16b, v24.16b\n"
- "fmls v26.4s, v10.4s, v0.s[2]\n"
- "fadd v13.4s, v13.4s, v27.4s\n"
- "fmls v21.4s, v29.4s, v0.s[2]\n"
- "fsub v22.4s, v22.4s, v29.4s\n"
- "fmla v23.4s, v28.4s, v0.s[1]\n"
- "fmla v22.4s, v27.4s, v0.s[1]\n"
- "fmla v24.4s, v28.4s, v0.s[2]\n"
- "fsub v21.4s, v21.4s, v27.4s\n"
- "fmls v26.4s, v1.4s, v0.s[2]\n"
- "fsub v23.4s, v23.4s, v29.4s\n"
- "fmls v25.4s, v9.4s, v0.s[2]\n"
- "fmls v23.4s, v27.4s, v0.s[1]\n"
- "fmls v24.4s, v27.4s, v0.s[3]\n"
- "fadd v26.4s, v26.4s, v14.4s\n"
- "mov v27.16b, v20.16b\n"
- "str s26, [x28]\n"
- "fmls v25.4s, v2.4s, v0.s[2]\n"
- "fmls v27.4s, v7.4s, v0.s[2]\n"
- "mov v31.16b, v16.16b\n"
- "mov v30.16b, v17.16b\n"
- "mov v29.16b, v18.16b\n"
- "fadd v25.4s, v25.4s, v13.4s\n"
- "fmls v31.4s, v8.4s, v0.s[2]\n"
- "str s25, [x28, %[output_col_stride1]]\n"
- "fmls v27.4s, v3.4s, v0.s[2]\n"
- "fmls v30.4s, v11.4s, v0.s[2]\n"
- "fmls v29.4s, v15.4s, v0.s[2]\n"
- "fmls v31.4s, v4.4s, v0.s[2]\n"
- "mov v26.16b, v12.16b\n"
- "fadd v27.4s, v27.4s, v21.4s\n"
- "mov v25.16b, v19.16b\n"
- "str s27, [x28, x11]\n"
- "fmls v30.4s, v5.4s, v0.s[2]\n"
- "fadd v31.4s, v31.4s, v22.4s\n"
- "fmls v29.4s, v6.4s, v0.s[2]\n"
- "str s31, [x28, x13]\n"
- "fmla v26.4s, v10.4s, v0.s[2]\n"
- "fadd v30.4s, v30.4s, v23.4s\n"
- "fmla v25.4s, v9.4s, v0.s[2]\n"
- "str s30, [x28, x23]\n"
- "fadd v29.4s, v29.4s, v24.4s\n"
- "str s29, [x28, x15]\n"
- "fmls v26.4s, v1.4s, v0.s[2]\n"
- "fmls v25.4s, v2.4s, v0.s[2]\n"
- "add x28, x28, #4\n"
- "mov v30.16b, v20.16b\n"
- "mov v29.16b, v16.16b\n"
- "fsub v26.4s, v26.4s, v14.4s\n"
- "mov v28.16b, v17.16b\n"
- "str s26, [x22]\n"
- "fsub v25.4s, v25.4s, v13.4s\n"
- "str s25, [x22, %[output_col_stride1]]\n"
- "fmla v30.4s, v7.4s, v0.s[2]\n"
- "fmla v29.4s, v8.4s, v0.s[2]\n"
- "fmla v28.4s, v11.4s, v0.s[2]\n"
- "mov v26.16b, v18.16b\n"
- "mov v25.16b, v12.16b\n"
- "fmls v30.4s, v3.4s, v0.s[2]\n"
- "mov v31.16b, v19.16b\n"
- "fmls v29.4s, v4.4s, v0.s[2]\n"
- "fmls v28.4s, v5.4s, v0.s[2]\n"
- "fmla v26.4s, v15.4s, v0.s[2]\n"
- "fmls v25.4s, v10.4s, v0.s[1]\n"
- "fsub v30.4s, v30.4s, v21.4s\n"
- "fmls v31.4s, v9.4s, v0.s[1]\n"
- "str s30, [x22, x11]\n"
- "fsub v29.4s, v29.4s, v22.4s\n"
- "str s29, [x22, x13]\n"
- "fsub v28.4s, v28.4s, v23.4s\n"
- "str s28, [x22, x23]\n"
- "fmls v26.4s, v6.4s, v0.s[2]\n"
- "fsub v25.4s, v25.4s, v1.4s\n"
- "fsub v31.4s, v31.4s, v2.4s\n"
- "fmla v25.4s, v14.4s, v0.s[1]\n"
- "fmla v31.4s, v13.4s, v0.s[1]\n"
- "fsub v26.4s, v26.4s, v24.4s\n"
- "mov v27.16b, v20.16b\n"
- "str s26, [x22, x15]\n"
- "mov v26.16b, v16.16b\n"
- "str s25, [x12]\n"
- "fmls v27.4s, v7.4s, v0.s[1]\n"
- "str s31, [x12, %[output_col_stride1]]\n"
- "fmls v26.4s, v8.4s, v0.s[1]\n"
- "mov v25.16b, v17.16b\n"
- "add x22, x22, #4\n"
- "fsub v27.4s, v27.4s, v3.4s\n"
- "mov v28.16b, v18.16b\n"
- "fmla v27.4s, v21.4s, v0.s[1]\n"
- "fsub v26.4s, v26.4s, v4.4s\n"
- "fmla v26.4s, v22.4s, v0.s[1]\n"
- "fmls v25.4s, v11.4s, v0.s[1]\n"
- "fmls v28.4s, v15.4s, v0.s[1]\n"
- "mov v12.16b, v12.16b\n"
- "str s27, [x12, x11]\n"
- "mov v19.16b, v19.16b\n"
- "str s26, [x12, x13]\n"
- "fsub v25.4s, v25.4s, v5.4s\n"
- "fmla v25.4s, v23.4s, v0.s[1]\n"
- "fsub v28.4s, v28.4s, v6.4s\n"
- "fmla v28.4s, v24.4s, v0.s[1]\n"
- "fmla v12.4s, v10.4s, v0.s[1]\n"
- "fmla v19.4s, v9.4s, v0.s[1]\n"
- "mov v20.16b, v20.16b\n"
- "str s25, [x12, x23]\n"
- "mov v16.16b, v16.16b\n"
- "str s28, [x12, x15]\n"
- "fsub v12.4s, v12.4s, v1.4s\n"
- "fmls v12.4s, v14.4s, v0.s[1]\n"
- "add x12, x12, #4\n"
- "fsub v19.4s, v19.4s, v2.4s\n"
- "fmla v20.4s, v7.4s, v0.s[1]\n"
- "fmls v19.4s, v13.4s, v0.s[1]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "str s12, [x14]\n"
- "mov v1.16b, v17.16b\n"
- "fsub v20.4s, v20.4s, v3.4s\n"
- "mov v17.16b, v18.16b\n"
- "str s19, [x14, %[output_col_stride1]]\n"
- "fmls v20.4s, v21.4s, v0.s[1]\n"
- "fsub v16.4s, v16.4s, v4.4s\n"
- "fmla v1.4s, v11.4s, v0.s[1]\n"
- "fmls v16.4s, v22.4s, v0.s[1]\n"
- "fmla v17.4s, v15.4s, v0.s[1]\n"
- "str s20, [x14, x11]\n"
- "fsub v1.4s, v1.4s, v5.4s\n"
- "str s16, [x14, x13]\n"
- "fmls v1.4s, v23.4s, v0.s[1]\n"
- "fsub v17.4s, v17.4s, v6.4s\n"
- "fmls v17.4s, v24.4s, v0.s[1]\n"
- "str s1, [x14, x23]\n"
- "str s17, [x14, x15]\n"
- "add x14, x14, #4\n"
- "ldr s2, [x27, x20]\n"
- "mov v4.16b, v2.16b\n"
- "ldr s17, [x27, x9]\n"
- "mov v12.16b, v2.16b\n"
- "ldr s18, [x27]\n"
- "fmla v4.4s, v18.4s, v0.s[2]\n"
- "ldr s3, [x27, x21]\n"
- "mov v6.16b, v2.16b\n"
- "ldr s5, [x27, x19]\n"
- "mov v1.16b, v2.16b\n"
- "ldr s18, [x27, %[input_col_stride1]]\n"
- "fmls v4.4s, v17.4s, v0.s[3]\n"
- "add x27, x27, #4\n"
- "fmls v12.4s, v18.4s, v0.s[2]\n"
- "fmla v6.4s, v18.4s, v0.s[2]\n"
- "fmls v1.4s, v18.4s, v0.s[1]\n"
- "mov v2.16b, v2.16b\n"
- "mov v3.16b, v3.16b\n"
- "mov v4.16b, v4.16b\n"
- "fmls v12.4s, v17.4s, v0.s[2]\n"
- "fmls v6.4s, v17.4s, v0.s[2]\n"
- "fsub v1.4s, v1.4s, v17.4s\n"
- "fmla v2.4s, v18.4s, v0.s[1]\n"
- "fmla v1.4s, v5.4s, v0.s[1]\n"
- "fmla v3.4s, v18.4s, v0.s[2]\n"
- "fadd v12.4s, v12.4s, v5.4s\n"
- "fsub v6.4s, v6.4s, v5.4s\n"
- "fsub v2.4s, v2.4s, v17.4s\n"
- "fmla v4.4s, v10.4s, v0.s[2]\n"
- "fmls v2.4s, v5.4s, v0.s[1]\n"
- "fmls v3.4s, v5.4s, v0.s[3]\n"
- "mov v16.16b, v12.16b\n"
- "mov v5.16b, v6.16b\n"
- "fmls v4.4s, v14.4s, v0.s[3]\n"
- "mov v6.16b, v1.16b\n"
- "fmla v16.4s, v9.4s, v0.s[2]\n"
- "fmla v5.4s, v7.4s, v0.s[2]\n"
- "fmla v6.4s, v8.4s, v0.s[2]\n"
- "mov v9.16b, v2.16b\n"
- "str s4, [x24]\n"
- "mov v10.16b, v3.16b\n"
- "fmls v16.4s, v13.4s, v0.s[3]\n"
- "fmls v5.4s, v21.4s, v0.s[3]\n"
- "fmls v6.4s, v22.4s, v0.s[3]\n"
- "fmla v9.4s, v11.4s, v0.s[2]\n"
- "fmla v10.4s, v15.4s, v0.s[2]\n"
- "str s16, [x24, %[output_col_stride1]]\n"
- "str s5, [x24, x11]\n"
- "fmls v9.4s, v23.4s, v0.s[3]\n"
- "str s6, [x24, x13]\n"
- "fmls v10.4s, v24.4s, v0.s[3]\n"
- "str s9, [x24, x23]\n"
- "str s10, [x24, x15]\n"
- "add x24, x24, #4\n"
- "4:\n"
- : [outptr0] "+r" (matrix_base),
- [n_channels] "+r" (n_channels),
- [inptr0] "+r" (input_base)
- : [pcoeffs] "r" (pcoeffs),
- [output_row_stride] "r" (6 * matrix_stride * sizeof(float)),
- [output_col_stride1] "r" (matrix_stride * sizeof(float)),
- [input_row_stride] "r" (input_row_stride * sizeof(float)),
- [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
- "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8",
- "v9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x9", "x19",
- "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
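For readers who do not want to trace the assembly block above: it is three unrollings of the same B^T x B tile transform, one per channel block width. A minimal editorial sketch of the control flow (hypothetical names; the real loop bodies are the quad-, double- and single-word load/store sequences above):

    // Outline of the channel blocking in the asm; labels 1..4 correspond
    // to the numeric branch targets in the listing.
    void transform_tile_outline(int n_channels)
    {
        for (; n_channels >= 4; n_channels -= 4)
        {
            // label 1: ldr/str q registers -- four channels per iteration
        }
        if (n_channels >= 2)
        {
            // label 2: ldr/str d registers -- one block of two channels
            n_channels -= 2;
        }
        if (n_channels != 0)
        {
            // label 3: ldr/str s registers -- the final odd channel
        }
        // label 4: done
    }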
-
-#else // __arm__, i.e. 32-bit Arm rather than __aarch64__
-
-template <>
-void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile(
- const int n_channels,
- const float* const input_base,
- const int input_row_stride,
- const int input_col_stride,
- float* outptr,
- const int matrix_stride
-)
-{
- constexpr int inner_tile_rows = 6;
- constexpr int inner_tile_cols = 6;
-
- // Get pointers into the input tile
- const float *x_ptrs[inner_tile_rows][inner_tile_cols];
- for (int i = 0, xi = 0; i < inner_tile_rows; i++, xi++)
- {
- // Get a pointer into the row
- const float* const row_ptr = input_base + xi*input_row_stride;
-
- for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
- {
- x_ptrs[i][j] = row_ptr + xj*input_col_stride;
- }
- }
-
- // Matrices used/computed in this kernel.
- float x[inner_tile_rows][inner_tile_cols];
- float XTx[inner_tile_rows][inner_tile_cols];
- float U[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = XTx[i][j] = 0.0f;
- }
- }
-
- // Perform the Winograd input transformation for each channel in the input
- // tensor.
- int channels_remaining = n_channels;
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used/computed in this kernel
- float32x2_t x[inner_tile_rows][inner_tile_cols];
- float32x2_t XTx[inner_tile_rows][inner_tile_cols];
- float32x2_t U[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vdup_n_f32(0.0f);
- XTx[i][j] = vdup_n_f32(0.0f);
- }
- }
-
- // Read a 6x6 tile of the input (the Winograd-domain result is computed below)
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vld1_f32(x_ptrs[i][j]);
- x_ptrs[i][j] += 2;
- }
- }
-
- // Compute XT . x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
-
- // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
-
- // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
-
- // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
-
- // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
-
- // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
-
- // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- vst1_f32(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 2;
- }
- for (; channels_remaining; channels_remaining--)
- {
- // Load x
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = *(x_ptrs[i][j]++);
- }
- }
-
- // Compute XT . x
- for (int j = 0; j < inner_tile_cols; j++)
- {
- XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- *(outptr + m*matrix_stride) = U[i][j];
- }
- }
- outptr++;
- }
-}
-
-#endif
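Both the assembly and the intrinsics/scalar paths compute the same F(4x4, 3x3) input transform, U = B^T x B. Reading the coefficients off the scalar expressions above, the 6x6 transform matrix is:

\[
U = B^{\mathsf T} x\, B, \qquad
B^{\mathsf T} =
\begin{pmatrix}
4 & 0 & -5 & 0 & 1 & 0 \\
0 & -4 & -4 & 1 & 1 & 0 \\
0 & 4 & -4 & -1 & 1 & 0 \\
0 & -2 & -1 & 2 & 1 & 0 \\
0 & 2 & -1 & -2 & 1 & 0 \\
0 & 4 & 0 & -5 & 0 & 1
\end{pmatrix}.
\]

The pcoeffs table {1, 2, 4, 5} in the aarch64 path holds the absolute values of these entries, which is why the assembly only ever multiplies by v0.s[1..3].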
-
-template class InputTransform<6, 6, float, float, WinogradRoots::Integers>;
-
-} // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp
deleted file mode 100644
index 27d20811d6..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "winograd.hpp"
-using namespace winograd;
-
-#define MEMBERFN(RTYPE) template <\
- int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename TIn, typename TOut, WinogradRoots Roots\
-> RTYPE WeightTransform<KernelRows, KernelCols, InnerTileRows, InnerTileCols, TIn, TOut, Roots>
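For reference, the MEMBERFN macro just stamps out the template preamble for each out-of-line member definition; e.g. MEMBERFN(void)::set_weight_tensor(...) below expands to:

    template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols,
              typename TIn, typename TOut, WinogradRoots Roots>
    void WeightTransform<KernelRows, KernelCols, InnerTileRows, InnerTileCols, TIn, TOut, Roots>
    ::set_weight_tensor(const void *const weights)
    {
        _weights = static_cast<const TIn *>(weights);
    }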
-
-MEMBERFN()::WeightTransform(
- const int n_output_channels,
- const int n_input_channels
-) : _n_output_channels(n_output_channels), _n_input_channels(n_input_channels),
- _matrices(nullptr), _matrix_stride(0), _matrix_row_stride(0), _weights(nullptr)
-{
-
-}
-
-MEMBERFN(void)::set_weight_tensor(const void * const weights)
-{
- _weights = static_cast<const TIn *>(weights);
-}
-
-MEMBERFN(void)::set_output_matrices(void * const mptr, const int ldmatrix, const int ldrow)
-{
- _matrices = static_cast<TOut *>(mptr);
- _matrix_stride = ldmatrix;
- _matrix_row_stride = ldrow;
-}
-
-MEMBERFN(size_t)::get_working_space_size(unsigned int) const
-{
- return 0;
-}
-
-MEMBERFN(void)::set_working_space(void *)
-{
-}
-
-MEMBERFN(unsigned int)::get_window(void) const
-{
- // TODO When the weights transform supports multithreading, return the number
- // of output channels. For now we return 1 to indicate that the weights must
- // be transformed as a single block.
- // return n_output_channels;
- return 1;
-}
-
-MEMBERFN(void)::run(const unsigned int, const unsigned int, unsigned int)
-{
- execute(
- _n_output_channels, _n_input_channels, _weights,
- _matrices, _matrix_stride, _matrix_row_stride
- );
-}
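Since get_window() above returns 1, a scheduler sees a single unit of work and the whole weight transform runs on one thread. A hypothetical caller (template arguments and names illustrative only) would drive it as:

    WeightTransform<3, 3, 4, 4, float, float, WinogradRoots::Integers> wt(n_out, n_in);
    wt.set_weight_tensor(weights);
    wt.set_output_matrices(matrices, ldmatrix, ldrow);
    wt.run(0, 1, 0);  // start, stop and threadid are currently ignored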
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp
deleted file mode 100644
index c1fb559b1d..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <algorithm>
-#include "winograd.hpp"
-#include "padding.hpp"
-#include "utils.hpp"
-
-#define MEMBERFN(RTYPE) template<\
- int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols,\
- typename TIn, typename TOut, WinogradRoots Roots\
-> RTYPE OutputTransform<KernelRows, KernelCols, InnerTileRows, InnerTileCols, TIn, TOut, Roots>
-
-#define Nx1MEMBERFN(RTYPE) template<\
- int KernelRows, int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots\
-> RTYPE OutputTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots>
-
-namespace winograd
-{
-
-MEMBERFN()
-::OutputTransform(const int n_batches, const int n_rows, const int n_cols,
- const int n_channels, const arm_gemm::Activation &activation)
- : _n_batches(n_batches), _n_rows(n_rows), _n_cols(n_cols),
- _n_channels(n_channels),
- _output_min((activation.type == arm_gemm::Activation::Type::ReLU ||
- activation.type == arm_gemm::Activation::Type::BoundedReLU)
- ? static_cast<TOut>(0.0f) : TypeBounds<TOut>::lower()),
- _output_max((activation.type == arm_gemm::Activation::Type::BoundedReLU)
- ? static_cast<TOut>(activation.param1) : TypeBounds<TOut>::upper()),
- _matrix_base(nullptr), _biases(nullptr), _matrix_stride(0),
- _matrix_row_stride(0), _matrix_batch_stride(0), _outptr(nullptr),
- _tiles_M(iceildiv(n_rows, output_tile_rows)),
- _tiles_N(iceildiv(n_cols, output_tile_cols)), _out_col_stride(0),
- _out_row_stride(0), _out_batch_stride(0),
- _working_space_col_stride(n_channels),
- _working_space_row_stride(output_tile_cols * _working_space_col_stride),
- _working_space(nullptr) {}
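The initialiser list above folds the requested activation into a [min, max] clamp that is applied as tiles are written out. Assuming TypeBounds<TOut> mirrors std::numeric_limits, the mapping is:

    // Activation::None        -> [TypeBounds::lower(), TypeBounds::upper()]  (no clamp)
    // Activation::ReLU        -> [0,                   TypeBounds::upper()]
    // Activation::BoundedReLU -> [0,                   activation.param1]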
-
-MEMBERFN(void)::set_input_matrices(const void * const mptr, const int ldmatrix, const int ldrow)
-{
- _matrix_base = static_cast<const TIn *>(mptr);
- _matrix_stride = ldmatrix;
- _matrix_row_stride = ldrow;
- _matrix_batch_stride = _tiles_M * _tiles_N * ldrow;
-}
-
-MEMBERFN(void)::set_bias(const void * const bias)
-{
- _biases = static_cast<const TOut *>(bias);
-}
-
-MEMBERFN(void)::set_output_tensor(void * const outptr)
-{
- set_output_tensor(outptr, _n_channels);
-}
-
-MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldcol)
-{
- set_output_tensor(outptr, _n_cols * ldcol, ldcol);
-}
-
-MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldrow, const int ldcol)
-{
- set_output_tensor(outptr, _n_rows * ldrow, ldrow, ldcol);
-}
-
-MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldbatch, const int ldrow, const int ldcol)
-{
- _outptr = static_cast<TOut *>(outptr);
- _out_batch_stride = ldbatch;
- _out_row_stride = ldrow;
- _out_col_stride = ldcol;
-}
-
-Nx1MEMBERFN()::OutputTransform(
- const int n_batches,
- const int n_rows,
- const int n_cols,
- const int n_channels,
- const arm_gemm::Activation &activation
-) : OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>::OutputTransform(
- n_batches, n_cols, n_rows, n_channels, activation /* Transpose rows and columns */
- )
-{
-}
-
-Nx1MEMBERFN(void)::set_output_tensor(void * const outptr)
-{
- set_output_tensor(outptr, this->_n_channels);
-}
-
-Nx1MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldcol)
-{
- set_output_tensor(outptr, this->_n_cols * ldcol, ldcol);
-}
-
-Nx1MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldrow, const int ldcol)
-{
- set_output_tensor(outptr, this->_n_rows * ldrow, ldrow, ldcol);
-}
-
-Nx1MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldbatch, const int ldrow, const int ldcol)
-{
- // Transpose rows and columns
- Base::set_output_tensor(outptr, ldbatch, ldcol, ldrow);
-}
-
-MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
-{
- return sizeof(TOut) * output_tile_rows * _working_space_row_stride * nthreads;
-}
-
-MEMBERFN(void)::set_working_space(void * const buffer)
-{
- _working_space = static_cast<TOut *>(buffer);
-}
-
-MEMBERFN(unsigned int)::get_window(void) const
-{
- return iceildiv(_n_channels, WINDOW_BLOCK);
-}
-
-MEMBERFN(void)::run(
- const unsigned int start,
- const unsigned int stop,
- const unsigned int threadid
-)
-{
- // Determine the channels on which to work
- if (start >= get_window())
- {
- return; // No work to do beyond the end of the window
- }
- const unsigned int start_channel = start * WINDOW_BLOCK;
- const unsigned int stop_channel = std::min<unsigned int>(_n_channels, stop * WINDOW_BLOCK);
- const unsigned int n_channels = stop_channel - start_channel;
-
- const auto matrix_tile_col_stride = _matrix_row_stride;
- const auto matrix_tile_row_stride = _tiles_N * matrix_tile_col_stride;
-
- const TOut* const bptr = (_biases == nullptr) ? nullptr : _biases + start_channel;
-
- // Loop over batches
- for (int batch = 0; batch < _n_batches; batch++)
- {
- const TIn* const matrix_batch = _matrix_base + start_channel + batch * _matrix_batch_stride;
- TOut* const outptr_batch = _outptr + start_channel + batch * _out_batch_stride;
-
- for (int tile_i = 0; tile_i < _tiles_M; tile_i++)
- {
- // Compute properties of the row of output tiles
- const int row_pad_bottom = std::max(0, (tile_i + 1)*output_tile_rows - _n_rows);
- const TIn* const matrix_tile_row = matrix_batch + tile_i * matrix_tile_row_stride;
- TOut* const outptr_row = outptr_batch + tile_i * output_tile_rows * _out_row_stride;
-
- for (int tile_j = 0; tile_j < _tiles_N; tile_j++)
- {
- // Compute the padding properties of this specific tile
- const int tile_pad_right = std::max(0, (tile_j + 1)*output_tile_cols - _n_cols);
- const TIn* const matrix_tile = matrix_tile_row + tile_j * matrix_tile_col_stride;
- TOut* const outptr_tile = outptr_row + tile_j * output_tile_cols * _out_col_stride;
-
- // Perform the transformation
- if (row_pad_bottom || tile_pad_right)
- {
- transform_cropped_tile(
- threadid, n_channels, outptr_tile, matrix_tile, bptr,
- row_pad_bottom, tile_pad_right
- );
- }
- else
- {
- transform_uncropped_tile(
- threadid, n_channels, outptr_tile, matrix_tile, bptr
- );
- }
- }
- }
- }
-}
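Unlike the weight transform, this run() is written to be sharded: the range [0, get_window()) can be split arbitrarily between threads, each window index covering an independent block of WINDOW_BLOCK channels. A sketch of a caller, under the assumption that worker_pool stands in for whatever scheduler is in use:

    // Allocate the per-thread scratch area first, then shard the window.
    void *buffer = allocate(xform.get_working_space_size(nthreads));  // hypothetical allocator
    xform.set_working_space(buffer);
    const unsigned int window = xform.get_window();
    for (unsigned int t = 0; t < nthreads; t++)
    {
        const unsigned int start = (t * window) / nthreads;
        const unsigned int stop  = ((t + 1) * window) / nthreads;
        worker_pool.schedule([&xform, start, stop, t] { xform.run(start, stop, t); });
    }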
-
-MEMBERFN(void)::transform_uncropped_tile(
- const unsigned int /* threadid unused */,
- const int n_channels,
- TOut * const outptr,
- const TIn * const inptr,
- const TOut * const biases
-)
-{
- transform_tile(
- n_channels, inptr, _matrix_stride, biases,
- outptr, _out_row_stride, _out_col_stride,
- _output_min, _output_max
- );
-}
-
-MEMBERFN(void)::transform_cropped_tile(
- const unsigned int threadid,
- const int n_channels,
- TOut * const outptr,
- const TIn * const inptr,
- const TOut * const biases,
- const int pad_bottom,
- const int pad_right
-)
-{
- // Transform into working space and then copy the relevant section out.
- TOut *wsptr = static_cast<TOut *>(get_working_space(threadid));
- transform_tile(
- n_channels, inptr, _matrix_stride, biases,
- wsptr, _working_space_row_stride, _working_space_col_stride,
- _output_min, _output_max
- );
-
- padding::crop_and_copy_tile(
- output_tile_rows, output_tile_cols, n_channels,
- wsptr, _working_space_row_stride, _working_space_col_stride,
- outptr, _out_row_stride, _out_col_stride,
- 0u, 0u, pad_bottom, pad_right
- );
-}
-
-MEMBERFN(void *)::get_working_space(const unsigned int threadid) const
-{
- return _working_space + output_tile_rows * _working_space_row_stride * threadid;
-}
-
-} // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp
deleted file mode 100644
index 8e257909a3..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm.hpp"
-#include "output.hpp"
-
-namespace winograd
-{
-
-template <>
-void OutputTransform<1, 7, 1, 8, float, float, WinogradRoots::Integers>::transform_tile(
- const int n_channels,
- const float* inptr,
- const int matrix_stride,
- const float* bptr,
- float* const output,
- const int, // No need to stride across rows
- const int output_col_stride,
- const float output_min,
- const float output_max
-)
-{
- // Construct a map to the output cells
- float *outptrs[output_tile_cols];
- for (int j = 0; j < output_tile_cols; j++)
- {
- outptrs[j] = output + j*output_col_stride;
- }
-
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __arm_any__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed during this transform
- float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);
-
- // Read a 1x8 tile in the Winograd domain
- for (int j = 0; j < inner_tile_cols; j++)
- {
- F[j] = vld1q_f32(inptr + j*matrix_stride);
- }
- inptr += 4;
-
- f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
- f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
-
- // Write out the output tile
- if (bptr != nullptr)
- {
- b = vld1q_f32(bptr);
- bptr += 4;
- }
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y = vminq_f32(vmaxq_f32(f[j] + b, vdupq_n_f32(output_min)),
- vdupq_n_f32(output_max));
- vst1q_f32(outptrs[j], y);
- outptrs[j] += 4;
- }
- }
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);
-
- // Read a 1x8 tile in the Winograd domain
- for (int j = 0; j < inner_tile_cols; j++)
- {
- F[j] = vld1_f32(inptr + j*matrix_stride);
- }
- inptr += 2;
-
- f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
- f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
-
- // Write out the output tile
- if (bptr != nullptr)
- {
- b = vld1_f32(bptr);
- bptr += 2;
- }
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y = vmin_f32(vmax_f32(f[j] + b, vdup_n_f32(output_min)),
- vdup_n_f32(output_max));
- vst1_f32(outptrs[j], y);
- outptrs[j] += 2;
- }
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed during this transform
- float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;
-
- // Read a 1x8 tile in the Winograd domain
- for (int j = 0; j < inner_tile_cols; j++)
- {
- F[j] = *(inptr + j*matrix_stride);
- }
- inptr++;
-
- f[0] = F[0]*1 + F[1]*1 + F[2]*1 + F[3]*1 + F[4]*1 + F[5]*1 + F[6]*1;
- f[1] = F[1]*-1 + F[5]*-3 + F[3]*-2 + F[4]*2 + F[6]*3 + F[2]*1 + F[7]*1;
-
- // Write out the output tile
- if (bptr != nullptr)
- {
- b = *(bptr++);
- }
- for (int j = 0; j < output_tile_cols; j++)
- {
- *(outptrs[j]++) = std::max(std::min(f[j] + b, output_max), output_min);
- }
- }
-}
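All three channel widths above evaluate the same F(2, 7) output transform. With F the 1x8 Winograd-domain vector for one channel, the scalar tail reads as:

\[
f = A^{\mathsf T} F, \qquad
A^{\mathsf T} =
\begin{pmatrix}
1 & 1 & 1 & 1 & 1 & 1 & 1 & 0 \\
0 & -1 & 1 & -2 & 2 & -3 & 3 & 1
\end{pmatrix},
\]

followed by the bias add and the [output_min, output_max] clamp.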
-
-template class OutputTransform<1, 7, 1, 8, float, float, WinogradRoots::Integers>;
-template class OutputTransform<7, 1, 8, 1, float, float, WinogradRoots::Integers>;
-
-} // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp
deleted file mode 100644
index 8b0b4707f9..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm.hpp"
-#include "output.hpp"
-
-namespace winograd
-{
-
-template <>
-void OutputTransform<3, 3, 4, 4, float, float, WinogradRoots::Integers>::transform_tile(
- const int n_channels,
- const float* inptr,
- const int matrix_stride,
- const float* bptr,
- float* const output,
- const int output_row_stride,
- const int output_col_stride,
- const float output_min,
- const float output_max
-)
-{
- // Construct a map to the output cells
- float *outptrs[output_tile_rows][output_tile_cols];
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
- }
- }
-
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed during this transform
- float32x4_t F[4][4], FZ[4][2], f[2][2], b;
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
- {
- for (int j = 0; j < 4; j++, m++)
- {
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 4;
-
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
- {
- // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
-
- // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
- // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
- }
-
- // Load the bias vector
- if (bptr != nullptr)
- {
- b = vld1q_f32(bptr);
- bptr += 4;
- }
- else
- {
- b = vdupq_n_f32(0.0f);
- }
-
- // Write out the output tile
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y =
- vmaxq_f32(vminq_f32(vaddq_f32(f[i][j], b), vdupq_n_f32(output_max)),
- vdupq_n_f32(output_min));
- vst1q_f32(outptrs[i][j], y);
- outptrs[i][j] += 4;
- }
- }
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[4][4], FZ[4][2], f[2][2], b;
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
- {
- for (int j = 0; j < 4; j++, m++)
- {
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 2;
-
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
- {
- // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
-
- // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
- // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
- }
-
- // Load the bias vector
- if (bptr != nullptr)
- {
- b = vld1_f32(bptr);
- bptr += 2;
- }
- else
- {
- b = vdup_n_f32(0.0f);
- }
-
- // Write out the output tile
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y =
- vmax_f32(vmin_f32(vadd_f32(f[i][j], b), vdup_n_f32(output_max)),
- vdup_n_f32(output_min));
- vst1_f32(outptrs[i][j], y);
- outptrs[i][j] += 2;
- }
- }
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed during this transform
- float F[4][4], FZ[4][2], f[2][2], b;
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
- {
- for (int j = 0; j < 4; j++, m++)
- {
- F[i][j] = *(inptr + m*matrix_stride);
- }
- }
- inptr++;
-
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
- {
- FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- }
-
- // Load the bias
- if (bptr != nullptr)
- {
- b = *(bptr++);
- }
- else
- {
- b = 0.0f;
- }
-
- // Write out the output tile
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y = std::max(std::min(f[i][j] + b, output_max), output_min);
- *(outptrs[i][j]++) = y;
- }
- }
- }
-}
-
-template class OutputTransform<3, 3, 4, 4, float, float, WinogradRoots::Integers>;
-
-} // namespace winograd
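For the 2x2-output / 3x3-kernel case above, the tile computation f = Zᵀ F Z reduces the 4x4 Winograd-domain tile once along each axis with the same 2x4 matrix, Zᵀ = [1 1 1 0; 0 1 -1 -1]. A scalar sketch of the two reduction passes, mirroring the tail loop of the deleted file (illustrative only):

    // One channel: 4x4 Winograd-domain tile F -> 2x2 spatial tile f.
    void output_tile_2x2_3x3(const float F[4][4], float f[2][2])
    {
        float FZ[4][2];
        for (int i = 0; i < 4; i++)           // columns: F Z
        {
            FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
            FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
        }
        for (int j = 0; j < 2; j++)           // rows: Z^T (F Z)
        {
            f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
            f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
        }
    }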
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp
deleted file mode 100644
index 3996be1c52..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "output.hpp"
-#include "arm.hpp"
-
-namespace winograd
-{
-
-template <>
-void OutputTransform<5, 5, 6, 6, float, float, WinogradRoots::Integers>::transform_tile(
- const int n_channels,
- const float* inptr,
- const int matrix_stride,
- const float* bptr,
- float* const output,
- const int output_row_stride,
- const int output_col_stride,
- const float output_min,
- const float output_max
-)
-{
- // Construct a map to the output cells
- float *outptrs[output_tile_rows][output_tile_cols];
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
- }
- }
-
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed during this transform
- float32x4_t F[6][6], FZ[6][2], f[2][2], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 4;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- if (bptr != nullptr)
- {
- b = vld1q_f32(bptr);
- bptr += 4;
- }
- else
- {
- b = vdupq_n_f32(0.0f);
- }
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y =
- vmaxq_f32(vminq_f32(vaddq_f32(f[i][j], b), vdupq_n_f32(output_max)),
- vdupq_n_f32(output_min));
- vst1q_f32(outptrs[i][j], y);
- outptrs[i][j] += 4;
- }
- }
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[6][6], FZ[6][2], f[2][2], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 2;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- if (bptr != nullptr)
- {
- b = vld1_f32(bptr);
- bptr += 2;
- }
- else
- {
- b = vdup_n_f32(0.0f);
- }
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y =
- vmax_f32(vmin_f32(vadd_f32(f[i][j], b), vdup_n_f32(output_max)),
- vdup_n_f32(output_min));
- vst1_f32(outptrs[i][j], y);
- outptrs[i][j] += 2;
- }
- }
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed during this transform
- float F[6][6], FZ[6][2], f[2][2], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = *(inptr + m*matrix_stride);
- }
- }
- inptr++;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- }
-
- // Write out the output tile
- if (bptr != nullptr)
- {
- b = *(bptr++);
- }
- else
- {
- b = 0.0f;
- }
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y = std::max(std::min(f[i][j] + b, output_max), output_min);
- *(outptrs[i][j]++) = y;
- }
- }
- }
-}
-
-template class OutputTransform<5, 5, 6, 6, float, float, WinogradRoots::Integers>;
-
-} // namespace winograd
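The 2x2-output / 5x5-kernel variant above follows the same f = Zᵀ F Z pattern on a 6x6 tile; the per-axis reduction read off the commented scalar lines is

    Zᵀ = [ 1  1  1  1  1  0 ]
         [ 0  1 -1  2 -2  1 ]

The vector paths fold the factor-2 terms into a single multiply-accumulate-by-scalar. Illustratively (a sketch, not library code):

    #include <arm_neon.h>

    // Second reduction row of the 2x2/5x5 transform: (F1 - F2) + 2*(F3 - F4) + F5.
    float32x4_t fz_row1(float32x4_t F1, float32x4_t F2, float32x4_t F3,
                        float32x4_t F4, float32x4_t F5)
    {
        return vaddq_f32(vmlaq_n_f32(vsubq_f32(F1, F2), vsubq_f32(F3, F4), 2.0f), F5);
    }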
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp
deleted file mode 100644
index c35037e143..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "output.hpp"
-#include "arm.hpp"
-
-namespace winograd
-{
-
-template <>
-void OutputTransform<1, 5, 1, 8, float, float, WinogradRoots::Integers>::transform_tile(
- const int n_channels,
- const float* inptr,
- const int matrix_stride,
- const float* bptr,
- float* const output,
- const int, // No need to stride across rows
- const int output_col_stride,
- const float output_min,
- const float output_max
-)
-{
- // Construct a map to the output cells
- float *outptrs[output_tile_cols];
- for (int j = 0; j < output_tile_cols; j++)
- {
- outptrs[j] = output + j*output_col_stride;
- }
-
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __arm_any__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed during this transform
- float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);
-
- // Read a 1x8 tile in the Winograd domain
- for (int j = 0; j < inner_tile_cols; j++)
- {
- F[j] = vld1q_f32(inptr + j*matrix_stride);
- }
- inptr += 4;
-
- f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
- f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
- f[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
- f[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
-
- // Write out the output tile
-    if (bptr != nullptr)
- {
- b = vld1q_f32(bptr);
- bptr += 4;
- }
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y =
- vmaxq_f32(vminq_f32(vaddq_f32(f[j], b), vdupq_n_f32(output_max)),
- vdupq_n_f32(output_min));
- vst1q_f32(outptrs[j], y);
- outptrs[j] += 4;
- }
- }
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);
-
- // Read a 1x8 tile in the Winograd domain
- for (int j = 0; j < inner_tile_cols; j++)
- {
- F[j] = vld1_f32(inptr + j*matrix_stride);
- }
- inptr += 2;
-
- f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
- f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
- f[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
- f[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
-
- // Write out the output tile
-    if (bptr != nullptr)
- {
- b = vld1_f32(bptr);
- bptr += 2;
- }
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y =
- vmax_f32(vmin_f32(vadd_f32(f[j], b), vdup_n_f32(output_max)),
- vdup_n_f32(output_min));
- vst1_f32(outptrs[j], y);
- outptrs[j] += 2;
- }
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed during this transform
- float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;
-
- // Read a 1x8 tile in the Winograd domain
- for (int j = 0; j < inner_tile_cols; j++)
- {
- F[j] = *(inptr + j*matrix_stride);
- }
- inptr++;
-
- f[0] = F[0]*1 + F[1]*1 + F[2]*1 + F[3]*1 + F[4]*1 + F[5]*1 + F[6]*1;
- f[1] = F[1]*-1 + F[5]*-3 + F[3]*-2 + F[4]*2 + F[6]*3 + F[2]*1;
- f[2] = F[3]*4 + F[4]*4 + F[5]*9 + F[6]*9 + F[1]*1 + F[2]*1;
- f[3] = F[1]*-1 + F[5]*-27 + F[3]*-8 + F[4]*8 + F[6]*27 + F[2]*1 + F[7]*1;
-
- // Write out the output tile
-    if (bptr != nullptr)
- {
- b = *(bptr++);
- }
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y = std::max(std::min(f[j] + b, output_max), output_min);
- *(outptrs[j]++) = y;
- }
- }
-}
-
-template class OutputTransform<1, 5, 1, 8, float, float, WinogradRoots::Integers>;
-template class OutputTransform<5, 1, 8, 1, float, float, WinogradRoots::Integers>;
-
-} // namespace winograd
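All of the 1-D output-transform tails in these files (2_7 earlier, 4_5 here, 6_3 below) are instances of one evaluation scheme: output n is the sum over k of F[k]·x_kⁿ for the sample points x = {0, -1, 1, -2, 2, -3, 3}, with F[7] added only to the last output. A generic scalar sketch that reproduces f[0]..f[3] above when called with n_outputs = 4 (function name illustrative):

    void output_transform_1d(const float *F, int n_outputs, float *f)
    {
        static const float x[7] = {0.f, -1.f, 1.f, -2.f, 2.f, -3.f, 3.f};
        for (int n = 0; n < n_outputs; n++) f[n] = 0.f;
        for (int k = 0; k < 7; k++)
        {
            float xn = 1.f;                    // x[k]^n, starting at n = 0
            for (int n = 0; n < n_outputs; n++)
            {
                f[n] += F[k] * xn;
                xn *= x[k];
            }
        }
        f[n_outputs - 1] += F[7];              // point at infinity
    }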
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp
deleted file mode 100644
index 3c071bdac6..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include "arm.hpp"
-#include "output.hpp"
-
-namespace winograd
-{
-
-template <>
-void winograd::OutputTransform<3, 3, 6, 6, __fp16, __fp16, winograd::WinogradRoots::Integers>::transform_tile(
- const int n_channels,
- const __fp16* inptr,
- const int matrix_stride,
- const __fp16* bptr,
- __fp16* const output,
- const int output_row_stride,
- const int output_col_stride,
- const __fp16 output_min,
- const __fp16 output_max
-)
-{
- // Construct a map to the output cells
- __fp16 *outptrs[output_tile_rows][output_tile_cols];
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
- }
- }
-
- // For each channel of the output
- int channels_remaining = n_channels;
-
-#ifdef __aarch64__
- for (; channels_remaining >= 8; channels_remaining -= 8)
- {
- // Matrices used and computed during this transform
- float16x8_t F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1q_f16(inptr + m*matrix_stride);
- }
- }
- inptr += 8;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vaddq_f16(vaddq_f16(vaddq_f16(F[i][0], F[i][1]), vaddq_f16(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][1] = vaddq_f16(vsubq_f16(F[i][1], F[i][2]), vmulq_f16(vsubq_f16(F[i][3], F[i][4]), vdupq_n_f16(2.0f)));
-
- // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][2] = vaddq_f16(vaddq_f16(F[i][1], F[i][2]), vmulq_f16(vaddq_f16(F[i][3], F[i][4]), vdupq_n_f16(4.0f)));
-
- // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- FZ[i][3] = vaddq_f16(vaddq_f16(vsubq_f16(F[i][1], F[i][2]), vmulq_f16(vsubq_f16(F[i][3], F[i][4]), vdupq_n_f16(8.0f))), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vaddq_f16(vaddq_f16(vaddq_f16(FZ[0][j], FZ[1][j]), vaddq_f16(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[1][j] = vaddq_f16(vsubq_f16(FZ[1][j], FZ[2][j]), vmulq_f16(vsubq_f16(FZ[3][j], FZ[4][j]), vdupq_n_f16(2.0f)));
-
- // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[2][j] = vaddq_f16(vaddq_f16(FZ[1][j], FZ[2][j]), vmulq_f16(vaddq_f16(FZ[3][j], FZ[4][j]), vdupq_n_f16(4.0f)));
-
- // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- f[3][j] = vaddq_f16(vaddq_f16(vsubq_f16(FZ[1][j], FZ[2][j]), vmulq_f16(vsubq_f16(FZ[3][j], FZ[4][j]), vdupq_n_f16(8.0f))), FZ[5][j]);
- }
-
- // Write out the output tile
- if (bptr != nullptr)
- {
- b = vld1q_f16(bptr);
- bptr += 8;
- }
- else
- {
- b = vdupq_n_f16(0.0f);
- }
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y =
- vmaxq_f16(vminq_f16(vaddq_f16(f[i][j], b), vdupq_n_f16(output_max)),
- vdupq_n_f16(output_min));
- vst1q_f16(outptrs[i][j], y);
- outptrs[i][j] += 8;
- }
- }
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed during this transform
- float16x4_t F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1_f16(inptr + m*matrix_stride);
- }
- }
- inptr += 4;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vadd_f16(vadd_f16(vadd_f16(F[i][0], F[i][1]), vadd_f16(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][1] = vadd_f16(vsub_f16(F[i][1], F[i][2]), vmul_f16(vsub_f16(F[i][3], F[i][4]), vdup_n_f16(2.0f)));
-
- // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][2] = vadd_f16(vadd_f16(F[i][1], F[i][2]), vmul_f16(vadd_f16(F[i][3], F[i][4]), vdup_n_f16(4.0f)));
-
- // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- FZ[i][3] = vadd_f16(vadd_f16(vsub_f16(F[i][1], F[i][2]), vmul_f16(vsub_f16(F[i][3], F[i][4]), vdup_n_f16(8.0f))), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vadd_f16(vadd_f16(vadd_f16(FZ[0][j], FZ[1][j]), vadd_f16(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[1][j] = vadd_f16(vsub_f16(FZ[1][j], FZ[2][j]), vmul_f16(vsub_f16(FZ[3][j], FZ[4][j]), vdup_n_f16(2.0f)));
-
- // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[2][j] = vadd_f16(vadd_f16(FZ[1][j], FZ[2][j]), vmul_f16(vadd_f16(FZ[3][j], FZ[4][j]), vdup_n_f16(4.0f)));
-
- // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- f[3][j] = vadd_f16(vadd_f16(vsub_f16(FZ[1][j], FZ[2][j]), vmul_f16(vsub_f16(FZ[3][j], FZ[4][j]), vdup_n_f16(8.0f))), FZ[5][j]);
- }
-
- // Write out the output tile
- if (bptr != nullptr)
- {
- b = vld1_f16(bptr);
- bptr += 4;
- }
- else
- {
- b = vdup_n_f16(0.0f);
- }
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y =
- vmax_f16(vmin_f16(vadd_f16(f[i][j], b), vdup_n_f16(output_max)),
- vdup_n_f16(output_min));
- vst1_f16(outptrs[i][j], y);
- outptrs[i][j] += 4;
- }
- }
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed during this transform
- __fp16 F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = *(inptr + m*matrix_stride);
- }
- }
- inptr++;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- }
-
- // Write out the output tile
- if (bptr != nullptr)
- {
- b = *(bptr++);
- }
- else
- {
- b = 0.0f;
- }
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y = std::max(std::min<__fp16>(f[i][j] + b, output_max), output_min);
- *(outptrs[i][j]++) = y;
- }
- }
- }
-}
-
-template class OutputTransform<3, 3, 6, 6, __fp16, __fp16, winograd::WinogradRoots::Integers>;
-
-} // namespace winograd
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
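The fp16 specialisation above computes the same F(4x4, 3x3) output transform as the fp32 file that follows; the per-axis reduction read off the commented scalar lines is

    Zᵀ = [ 1  1  1  1  1  0 ]
         [ 0  1 -1  2 -2  0 ]
         [ 0  1  1  4  4  0 ]
         [ 0  1 -1  8 -8  1 ]

Note that the fp16 vector paths spell each scaled term out with vmulq_f16/vdupq_n_f16 (and the 64-bit vmul_f16/vdup_n_f16 equivalents) rather than a multiply-accumulate-by-scalar form.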
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp
deleted file mode 100644
index 1eb9b537d2..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm.hpp"
-#include "output.hpp"
-
-namespace winograd
-{
-
-template <>
-void winograd::OutputTransform<3, 3, 6, 6, float, float, winograd::WinogradRoots::Integers>::transform_tile(
- const int n_channels,
- const float* inptr,
- const int matrix_stride,
- const float* bptr,
- float* const output,
- const int output_row_stride,
- const int output_col_stride,
- const float output_min,
- const float output_max
-)
-{
- // Construct a map to the output cells
- float *outptrs[output_tile_rows][output_tile_cols];
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
- }
- }
-
- // For each channel of the output
- int channels_remaining = n_channels;
-
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed during this transform
- float32x4_t F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 4;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
-
- // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
-
- // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
- // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
- // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- if (bptr != nullptr)
- {
- b = vld1q_f32(bptr);
- bptr += 4;
- }
- else
- {
- b = vdupq_n_f32(0.0f);
- }
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y =
- vmaxq_f32(vminq_f32(vaddq_f32(f[i][j], b), vdupq_n_f32(output_max)),
- vdupq_n_f32(output_min));
- vst1q_f32(outptrs[i][j], y);
- outptrs[i][j] += 4;
- }
- }
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 2;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
-
- // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
-
- // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
- // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
- // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- if (bptr != nullptr)
- {
- b = vld1_f32(bptr);
- bptr += 2;
- }
- else
- {
- b = vdup_n_f32(0.0f);
- }
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y =
- vmax_f32(vmin_f32(vadd_f32(f[i][j], b), vdup_n_f32(output_max)),
- vdup_n_f32(output_min));
- vst1_f32(outptrs[i][j], y);
- outptrs[i][j] += 2;
- }
- }
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed during this transform
- float F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = *(inptr + m*matrix_stride);
- }
- }
- inptr++;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- }
-
- // Write out the output tile
- if (bptr != nullptr)
- {
- b = *(bptr++);
- }
- else
- {
- b = 0.0f;
- }
- for (int i = 0; i < output_tile_rows; i++)
- {
- for (int j = 0; j < output_tile_cols; j++)
- {
- const auto y = std::max(std::min(f[i][j] + b, output_max), output_min);
- *(outptrs[i][j]++) = y;
- }
- }
- }
-}
-
-template class OutputTransform<3, 3, 6, 6, float, float, winograd::WinogradRoots::Integers>;
-
-} // namespace winograd
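In the fp32 version above, the same scaled terms are folded into NEON multiply-accumulate-by-scalar operations: vmlaq_n_f32(a, b, s) computes a + b·s lane-wise, so one intrinsic covers both the scale and the accumulate. For example (a sketch, not library code):

    #include <arm_neon.h>

    // Second reduction row of the 4x4/3x3 transform: (F1 - F2) + 2*(F3 - F4).
    float32x4_t fz_row1(float32x4_t F1, float32x4_t F2,
                        float32x4_t F3, float32x4_t F4)
    {
        return vmlaq_n_f32(vsubq_f32(F1, F2), vsubq_f32(F3, F4), 2.0f);
    }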
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp
deleted file mode 100644
index 528cd8c691..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "output.hpp"
-#include "arm.hpp"
-
-namespace winograd
-{
-
-template <>
-void OutputTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>::transform_tile(
- const int n_channels,
- const float* inptr,
- const int matrix_stride,
- const float* bptr,
- float* const output,
- const int, // No need to stride across rows
- const int output_col_stride,
- const float output_min,
- const float output_max
-)
-{
- // Construct a map to the output cells
- float *outptrs[output_tile_cols];
- for (int j = 0; j < output_tile_cols; j++)
- {
- outptrs[j] = output + j*output_col_stride;
- }
-
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __arm_any__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed during this transform
- float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);
-
- // Read a 1x8 tile in the Winograd domain
- for (int j = 0; j < inner_tile_cols; j++)
- {
- F[j] = vld1q_f32(inptr + j*matrix_stride);
- }
- inptr += 4;
-
- f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
- f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
- f[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
- f[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
- f[4] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 81), F[5], 81), F[4], 16), F[3], 16);
- f[5] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 243), F[4], 32), F[3], -32), F[5], -243), F[1], -1);
-
- // Write out the output tile
-    if (bptr != nullptr)
- {
- b = vld1q_f32(bptr);
- bptr += 4;
- }
- for (int j = 0; j < output_tile_cols; j++)
- {
-      const auto y = vminq_f32(vmaxq_f32(vaddq_f32(f[j], b), vdupq_n_f32(output_min)),
-                               vdupq_n_f32(output_max));
- vst1q_f32(outptrs[j], y);
- outptrs[j] += 4;
- }
- }
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);
-
- // Read a 1x8 tile in the Winograd domain
- for (int j = 0; j < inner_tile_cols; j++)
- {
- F[j] = vld1_f32(inptr + j*matrix_stride);
- }
- inptr += 2;
-
- f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
- f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
- f[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
- f[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
- f[4] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 81), F[5], 81), F[4], 16), F[3], 16);
- f[5] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 243), F[4], 32), F[3], -32), F[5], -243), F[1], -1);
-
- // Write out the output tile
-    if (bptr != nullptr)
- {
- b = vld1_f32(bptr);
- bptr += 2;
- }
- for (int j = 0; j < output_tile_cols; j++)
- {
-      const auto y = vmin_f32(vmax_f32(vadd_f32(f[j], b), vdup_n_f32(output_min)),
-                              vdup_n_f32(output_max));
- vst1_f32(outptrs[j], y);
- outptrs[j] += 2;
- }
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed during this transform
- float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;
-
- // Read a 1x8 tile in the Winograd domain
- for (int j = 0; j < inner_tile_cols; j++)
- {
- F[j] = *(inptr + j*matrix_stride);
- }
- inptr++;
-
- f[0] = F[0]*1 + F[1]*1 + F[2]*1 + F[3]*1 + F[4]*1 + F[5]*1 + F[6]*1;
- f[1] = F[1]*-1 + F[5]*-3 + F[3]*-2 + F[4]*2 + F[6]*3 + F[2]*1;
- f[2] = F[3]*4 + F[4]*4 + F[5]*9 + F[6]*9 + F[1]*1 + F[2]*1;
- f[3] = F[1]*-1 + F[5]*-27 + F[3]*-8 + F[4]*8 + F[6]*27 + F[2]*1;
- f[4] = F[3]*16 + F[4]*16 + F[5]*81 + F[6]*81 + F[1]*1 + F[2]*1;
- f[5] = F[1]*-1 + F[5]*-243 + F[3]*-32 + F[4]*32 + F[6]*243 + F[2]*1 + F[7]*1;
-
- // Write out the output tile
-    if (bptr != nullptr)
- {
- b = *(bptr++);
- }
- for (int j = 0; j < output_tile_cols; j++)
- {
- *(outptrs[j]++) = std::max(std::min(f[j] + b, output_max), output_min);
- }
- }
-}
-
-template class OutputTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>;
-template class OutputTransform<3, 1, 8, 1, float, float, WinogradRoots::Integers>;
-
-} // namespace winograd
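The 6-output tail above is the n_outputs = 6 case of the generic evaluation sketched earlier; its highest-order row works out to f[5] = -F[1] + F[2] - 32·F[3] + 32·F[4] - 243·F[5] + 243·F[6] + F[7], which is exactly the last scalar line of the deleted kernel.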
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp
deleted file mode 100644
index 2ee377ceca..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm.hpp"
-#include "kernel.hpp"
-
-namespace winograd
-{
-
-template <>
-void WeightTransform<1, 7, 1, 8, float, float, WinogradRoots::Integers>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input, // NOTE: Data in HWIO order
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
-)
-{
- // Get pointers to each cell of the weight tensor
- const auto weight_col_stride = n_input_channels * n_output_channels;
- const float *inptrs[kernel_cols];
- for (int j = 0; j < kernel_cols; j++)
- {
- inptrs[j] = input + j*weight_col_stride;
- }
-
- // For each input channel
- for (int ic = 0; ic < n_input_channels; ic++)
- {
- float *outptr = output + ic * matrix_row_stride;
-
- // For each output channel
- int channels_remaining = n_output_channels;
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed in this kernel
- float w[kernel_cols], V[inner_tile_cols];
-
- // Read weights
- for (int j = 0; j < kernel_cols; j++)
- {
- w[j] = *(inptrs[j]++);
- }
-
- // Compute V = w WT
- V[0] = (w[0]*-1) / 36.0f;
- V[1] = (w[1]*-1 + w[3]*-1 + w[5]*-1 + w[0]*1 + w[2]*1 + w[4]*1 + w[6]*1) / 48.0f;
- V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1 + w[5]*1 + w[6]*1) / 48.0f;
- V[3] = (w[0]*-1 + w[6]*-64 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8 + w[5]*32) / 120.0f;
- V[4] = (w[0]*-1 + w[6]*-64 + w[5]*-32 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120.0f;
- V[5] = (w[5]*-243 + w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[6]*729 + w[0]*1) / 720.0f;
- V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[5]*243 + w[6]*729 + w[0]*1) / 720.0f;
- V[7] = (w[6]*1) / 1.0f;
-
- // Store the transformed weights
- for (int j = 0; j < inner_tile_cols; j++)
- {
- *(outptr + j*matrix_stride) = V[j];
- }
- outptr++;
- }
- }
-}
-
-template class WeightTransform<1, 7, 1, 8, float, float, WinogradRoots::Integers>;
-template class WeightTransform<7, 1, 8, 1, float, float, WinogradRoots::Integers>;
-
-} // namespace winograd
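The weight transform above is the counterpart of the 1-D output transforms: each kernel tap is spread across the 8 Winograd sample points, with fixed per-row divisors (36, 48, 48, 120, 120, 720, 720, 1) scaling the rows. A scalar sketch with the coefficients regrouped by tap, copied from the code above (the helper name is illustrative, not part of the library):

    // 7 kernel taps w[0..6] -> 8 transformed values V[0..7].
    void weight_tail_2_7(const float *w, float *V)
    {
        V[0] = -w[0] / 36.0f;
        V[1] = ( w[0] - w[1] + w[2] - w[3] + w[4] - w[5] + w[6]) / 48.0f;
        V[2] = ( w[0] + w[1] + w[2] + w[3] + w[4] + w[5] + w[6]) / 48.0f;
        V[3] = (-w[0] + 2*w[1] - 4*w[2] + 8*w[3] - 16*w[4] + 32*w[5] - 64*w[6]) / 120.0f;
        V[4] = (-w[0] - 2*w[1] - 4*w[2] - 8*w[3] - 16*w[4] - 32*w[5] - 64*w[6]) / 120.0f;
        V[5] = ( w[0] - 3*w[1] + 9*w[2] - 27*w[3] + 81*w[4] - 243*w[5] + 729*w[6]) / 720.0f;
        V[6] = ( w[0] + 3*w[1] + 9*w[2] + 27*w[3] + 81*w[4] + 243*w[5] + 729*w[6]) / 720.0f;
        V[7] = w[6];
    }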
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp
deleted file mode 100644
index 3fde4a7a6b..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm.hpp"
-#include "kernel.hpp"
-
-namespace winograd
-{
-
-template <>
-void WeightTransform<3, 3, 4, 4, float, float, WinogradRoots::Integers>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input,
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
-)
-{
- constexpr int inner_tile_i = 4;
- constexpr int inner_tile_j = 4;
-
- // Get pointers to each cell of the weight tensor
- const auto weight_col_stride = n_input_channels * n_output_channels;
- const auto weight_row_stride = 3 * weight_col_stride;
- const float *inptrs[3][3];
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
- }
- }
-
- // For each input channel
- for (int ic = 0; ic < n_input_channels; ic++)
- {
- float *outptr = output + ic * matrix_row_stride;
-
- // For each output channel
- int channels_remaining = n_output_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed in this kernel
- float32x4_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = vld1q_f32(inptrs[i][j]);
- inptrs[i][j] += 4;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- Ww[0][j] = w[0][j];
-
- // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
- Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
-
- // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
- Ww[2][j] = vmulq_n_f32(vaddq_f32(vsubq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
-
- Ww[3][j] = w[2][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < inner_tile_i; i++)
- {
- V[i][0] = Ww[i][0];
-
- // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
- V[i][1] = vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
-
- // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
- V[i][2] = vmulq_n_f32(vaddq_f32(vsubq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
-
- V[i][3] = Ww[i][2];
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < inner_tile_i; i++)
- {
- for (int j = 0; j < inner_tile_j; j++, m++)
- {
- vst1q_f32(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 4;
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed in this kernel
- float32x2_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = vld1_f32(inptrs[i][j]);
- inptrs[i][j] += 2;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- Ww[0][j] = w[0][j];
-
- // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
- Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
-
- // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
- Ww[2][j] = vmul_n_f32(vadd_f32(vsub_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
-
- Ww[3][j] = w[2][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < inner_tile_i; i++)
- {
- V[i][0] = Ww[i][0];
-
- // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
- V[i][1] = vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
-
- // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
- V[i][2] = vmul_n_f32(vadd_f32(vsub_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
-
- V[i][3] = Ww[i][2];
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < inner_tile_i; i++)
- {
- for (int j = 0; j < inner_tile_j; j++, m++)
- {
- vst1_f32(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 2;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed in this kernel
- float w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = *(inptrs[i][j]++);
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- Ww[0][j] = w[0][j];
-        Ww[1][j] = 0.5f*(w[0][j] + w[1][j] + w[2][j]);
-        Ww[2][j] = 0.5f*(w[0][j] - w[1][j] + w[2][j]);
- Ww[3][j] = w[2][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < inner_tile_i; i++)
- {
- V[i][0] = Ww[i][0];
-        V[i][1] = 0.5f*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
-        V[i][2] = 0.5f*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
- V[i][3] = Ww[i][2];
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < inner_tile_i; i++)
- {
- for (int j = 0; j < inner_tile_j; j++, m++)
- {
- *(outptr + m*matrix_stride) = V[i][j];
- }
- }
- outptr++;
- }
- }
-}
-
-template class WeightTransform<3, 3, 4, 4, float, float, WinogradRoots::Integers>;
-
-} // namespace winograd
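The scalar tail above is the textbook F(2x2, 3x3) kernel transform V = G w Gᵀ with

    G = [  1    0    0  ]
        [ 1/2  1/2  1/2 ]
        [ 1/2 -1/2  1/2 ]
        [  0    0    1  ]

applied once per axis; the vector paths compute the same middle rows with vmulq_n_f32/vmul_n_f32 by 0.5f.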
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp
deleted file mode 100644
index 26ab56f24e..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,401 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm.hpp"
-#include "kernel.hpp"
-
-namespace winograd
-{
-
-template <>
-void WeightTransform<5, 5, 6, 6, float, float, WinogradRoots::Integers>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input,
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
-)
-{
- // Get pointers to each cell of the weight tensor
- const auto weight_col_stride = n_input_channels * n_output_channels;
- const auto weight_row_stride = 5 * weight_col_stride;
- const float *inptrs[5][5];
- for (int i = 0; i < 5; i++)
- {
- for (int j = 0; j < 5; j++)
- {
- inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
- }
- }
-
- // For each input channel
- for (int ic = 0; ic < n_input_channels; ic++)
- {
- float *outptr = output + ic * matrix_row_stride;
-
- // For each output channel
- int channels_remaining = n_output_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed in this kernel
- float32x4_t w[5][5], Ww[6][5], V[6][6];
-
- // Read weights
- for (int i = 0; i < 5; i++)
- {
- for (int j = 0; j < 5; j++)
- {
- w[i][j] = vld1q_f32(inptrs[i][j]);
- inptrs[i][j] += 4;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 5; j++)
- {
- // Ww[0][j] = w[0][j]/4.0f;
- Ww[0][j] = vmulq_n_f32(w[0][j], 1.0f/4.0f);
-
- // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
- Ww[1][j] = vmulq_n_f32(
- vaddq_f32(
- vaddq_f32(
- vaddq_f32(w[1][j], w[0][j]),
- vaddq_f32(w[3][j], w[2][j])
- ),
- w[4][j]
- ),
- -1.0f/6.0f
- );
-
- // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
- // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
- Ww[2][j] = vmulq_n_f32(
- vsubq_f32(
- vaddq_f32(
- vsubq_f32(w[1][j], w[0][j]),
- vsubq_f32(w[3][j], w[2][j])
- ),
- w[4][j]
- ),
- 1.0f/6.0f
- );
-
- // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
- Ww[3][j] = vmulq_n_f32(
- vmlaq_n_f32(
- vaddq_f32(
- vaddq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
- vaddq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
- ),
- w[4][j], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
- Ww[4][j] = vmulq_n_f32(
- vmlaq_n_f32(
- vaddq_f32(
- vsubq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
- vsubq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
- ),
- w[4][j], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // Ww[5][j] = w[4][j];
- Ww[5][j] = w[4][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- // V[i][0] = Ww[i][0]/4.0f;
- V[i][0] = vmulq_n_f32(Ww[i][0], 1.0f/4.0f);
-
- // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
- V[i][1] = vmulq_n_f32(
- vaddq_f32(
- vaddq_f32(
- vaddq_f32(Ww[i][1], Ww[i][0]),
- vaddq_f32(Ww[i][3], Ww[i][2])
- ),
- Ww[i][4]
- ),
- -1.0f/6.0f
- );
-
- // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
- // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
- V[i][2] = vmulq_n_f32(
- vsubq_f32(
- vaddq_f32(
- vsubq_f32(Ww[i][1], Ww[i][0]),
- vsubq_f32(Ww[i][3], Ww[i][2])
- ),
- Ww[i][4]
- ),
- 1.0f/6.0f
- );
-
- // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
- V[i][3] = vmulq_n_f32(
- vmlaq_n_f32(
- vaddq_f32(
- vaddq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
- vaddq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
- ),
- Ww[i][4], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
- V[i][4] = vmulq_n_f32(
- vmlaq_n_f32(
- vaddq_f32(
- vsubq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
- vsubq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
- ),
- Ww[i][4], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // V[i][5] = Ww[i][4];
- V[i][5] = Ww[i][4];
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1q_f32(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 4;
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed in this kernel
- float32x2_t w[5][5], Ww[6][5], V[6][6];
-
- // Read weights
- for (int i = 0; i < 5; i++)
- {
- for (int j = 0; j < 5; j++)
- {
- w[i][j] = vld1_f32(inptrs[i][j]);
- inptrs[i][j] += 2;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 5; j++)
- {
- // Ww[0][j] = w[0][j]/4.0f;
- Ww[0][j] = vmul_n_f32(w[0][j], 1.0f/4.0f);
-
- // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
- Ww[1][j] = vmul_n_f32(
- vadd_f32(
- vadd_f32(
- vadd_f32(w[1][j], w[0][j]),
- vadd_f32(w[3][j], w[2][j])
- ),
- w[4][j]
- ),
- -1.0f/6.0f
- );
-
- // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
- // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
- Ww[2][j] = vmul_n_f32(
- vsub_f32(
- vadd_f32(
- vsub_f32(w[1][j], w[0][j]),
- vsub_f32(w[3][j], w[2][j])
- ),
- w[4][j]
- ),
- 1.0f/6.0f
- );
-
- // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
- Ww[3][j] = vmul_n_f32(
- vmla_n_f32(
- vadd_f32(
- vadd_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
- vadd_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
- ),
- w[4][j], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
- Ww[4][j] = vmul_n_f32(
- vmla_n_f32(
- vadd_f32(
- vsub_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
- vsub_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
- ),
- w[4][j], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // Ww[5][j] = w[4][j];
- Ww[5][j] = w[4][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- // V[i][0] = Ww[i][0]/4.0f;
- V[i][0] = vmul_n_f32(Ww[i][0], 1.0f/4.0f);
-
- // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
- V[i][1] = vmul_n_f32(
- vadd_f32(
- vadd_f32(
- vadd_f32(Ww[i][1], Ww[i][0]),
- vadd_f32(Ww[i][3], Ww[i][2])
- ),
- Ww[i][4]
- ),
- -1.0f/6.0f
- );
-
- // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
- // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
- V[i][2] = vmul_n_f32(
- vsub_f32(
- vadd_f32(
- vsub_f32(Ww[i][1], Ww[i][0]),
- vsub_f32(Ww[i][3], Ww[i][2])
- ),
- Ww[i][4]
- ),
- 1.0f/6.0f
- );
-
- // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
- V[i][3] = vmul_n_f32(
- vmla_n_f32(
- vadd_f32(
- vadd_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
- vadd_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
- ),
- Ww[i][4], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
- V[i][4] = vmul_n_f32(
- vmla_n_f32(
- vadd_f32(
- vsub_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
- vsub_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
- ),
- Ww[i][4], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // V[i][5] = Ww[i][4];
- V[i][5] = Ww[i][4];
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1_f32(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 2;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed in this kernel
- float w[5][5], Ww[6][5], V[6][6];
-
- // Read weights
- for (int i = 0; i < 5; i++)
- {
- for (int j = 0; j < 5; j++)
- {
- w[i][j] = *(inptrs[i][j]++);
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 5; j++)
- {
- Ww[0][j] = w[0][j]/4.0f;
- Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
- Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
- Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
- Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
- Ww[5][j] = w[4][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- V[i][0] = Ww[i][0]/4.0f;
- V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
- V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
- V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
- V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
- V[i][5] = Ww[i][4];
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- *(outptr + m*matrix_stride) = V[i][j];
- }
- }
- outptr++;
- }
- }
-}
-
-template class WeightTransform<5, 5, 6, 6, float, float, WinogradRoots::Integers>;
-
-} // namespace winograd
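For reference while reading the deletion above: the two NEON loops and the scalar tail implement one and the same per-channel transform. Read back into matrix form from the scalar expressions (a reconstruction from the code, not notation that appears in the source), the 5x5-kernel weight transform is

\[
V = G\,w\,G^{\mathsf T},\qquad
G =
\begin{pmatrix}
\tfrac{1}{4} & 0 & 0 & 0 & 0\\
-\tfrac{1}{6} & -\tfrac{1}{6} & -\tfrac{1}{6} & -\tfrac{1}{6} & -\tfrac{1}{6}\\
-\tfrac{1}{6} & \tfrac{1}{6} & -\tfrac{1}{6} & \tfrac{1}{6} & -\tfrac{1}{6}\\
\tfrac{1}{24} & \tfrac{1}{12} & \tfrac{1}{6} & \tfrac{1}{3} & \tfrac{2}{3}\\
\tfrac{1}{24} & -\tfrac{1}{12} & \tfrac{1}{6} & -\tfrac{1}{3} & \tfrac{2}{3}\\
0 & 0 & 0 & 0 & 1
\end{pmatrix}
\]

Each of the six rows of G produces one row of the 6x6 inner tile; the vectorized paths batch exactly this arithmetic over four (or two) output channels at a time.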
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp
deleted file mode 100644
index eeda274453..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm.hpp"
-#include "kernel.hpp"
-
-namespace winograd
-{
-
-template <>
-void WeightTransform<1, 5, 1, 8, float, float, WinogradRoots::Integers>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input, // NOTE: Data in HWIO order
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
-)
-{
- // Get pointers to each cell of the weight tensor
- const auto weight_col_stride = n_input_channels * n_output_channels;
- const float *inptrs[kernel_cols];
- for (int j = 0; j < kernel_cols; j++)
- {
- inptrs[j] = input + j*weight_col_stride;
- }
-
- // For each input channel
- for (int ic = 0; ic < n_input_channels; ic++)
- {
- float *outptr = output + ic * matrix_row_stride;
-
- // For each output channel
- int channels_remaining = n_output_channels;
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed in this kernel
- float w[kernel_cols], V[inner_tile_cols];
-
- // Read weights
- for (int j = 0; j < kernel_cols; j++)
- {
- w[j] = *(inptrs[j]++);
- }
-
- // Compute V = w WT
- V[0] = (w[0]*-1) / 36;
- V[1] = (w[1]*-1 + w[3]*-1 + w[0]*1 + w[2]*1 + w[4]*1) / 48;
- V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1) / 48;
- V[3] = (w[0]*-1 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8) / 120;
- V[4] = (w[0]*-1 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120;
- V[5] = (w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[0]*1) / 720;
- V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[0]*1) / 720;
- V[7] = (w[4]*1) / 1;
-
- // Store the transformed weights
- for (int j = 0; j < inner_tile_cols; j++)
- {
- *(outptr + j*matrix_stride) = V[j];
- }
- outptr++;
- }
- }
-}
-
-template class WeightTransform<1, 5, 1, 8, float, float, WinogradRoots::Integers>;
-template class WeightTransform<5, 1, 8, 1, float, float, WinogradRoots::Integers>;
-
-} // namespace winograd
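The 1-D kernel removed above has a compact reading: writing the weight polynomial as w(x) = w_0 + w_1 x + w_2 x^2 + w_3 x^3 + w_4 x^4, each literal expression is a scaled evaluation at one of the integer points implied by WinogradRoots::Integers (again a reconstruction from the code, not notation in the source):

\[
V_0 = -\tfrac{1}{36}w(0),\quad
V_1 = \tfrac{1}{48}w(-1),\quad
V_2 = \tfrac{1}{48}w(1),\quad
V_3 = -\tfrac{1}{120}w(-2),\quad
V_4 = -\tfrac{1}{120}w(2),\quad
V_5 = \tfrac{1}{720}w(-3),\quad
V_6 = \tfrac{1}{720}w(3),\quad
V_7 = w_4
\]

V_7 simply picks out the leading coefficient, i.e. the evaluation point at infinity.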
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp
deleted file mode 100644
index 3101865027..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#include "arm.hpp"
-#include "kernel.hpp"
-
-namespace winograd
-{
-
-template <>
-void WeightTransform<3, 3, 6, 6, __fp16, __fp16, WinogradRoots::Integers>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const __fp16* const input, // NOTE: Data in HWIO order
- __fp16* const output,
- const int matrix_stride,
- const int matrix_row_stride
-)
-{
- // Get pointers to each cell of the weight tensor
- const auto weight_col_stride = n_input_channels * n_output_channels;
- const auto weight_row_stride = 3 * weight_col_stride;
- const __fp16 *inptrs[3][3];
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
- }
- }
-
- // For each input channel
- for (int ic = 0; ic < n_input_channels; ic++)
- {
- __fp16 *outptr = output + ic * matrix_row_stride;
-
- // For each output channel
- int channels_remaining = n_output_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 8; channels_remaining -= 8)
- {
- // Matrices used and computed in this kernel
- float16x8_t w[3][3], Ww[6][3], V[6][6];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = vld1q_f16(inptrs[i][j]);
- inptrs[i][j] += 8;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- // Ww[0][j] = 6*w[0][j];
- Ww[0][j] = vmulq_n_f16(w[0][j], 6.0);
-
- // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
- Ww[1][j] = vmulq_n_f16(vaddq_f16(vaddq_f16(w[0][j], w[1][j]), w[2][j]), -4.0);
-
- // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
- Ww[2][j] = vmulq_n_f16(vsubq_f16(vsubq_f16(w[1][j], w[0][j]), w[2][j]), 4.0);
-
- // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
- Ww[3][j] = vaddq_f16(vaddq_f16(w[0][j], vmulq_f16(w[1][j], vdupq_n_f16(2.0f))), vmulq_f16(w[2][j], vdupq_n_f16(4.0f)));
-
- // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
- Ww[4][j] = vaddq_f16(vsubq_f16(w[0][j], vmulq_f16(w[1][j], vdupq_n_f16(2.0f))), vmulq_f16(w[2][j], vdupq_n_f16(4.0f)));
-
- // Ww[5][j] = 24*w[2][j];
- Ww[5][j] = vmulq_n_f16(w[2][j], 24.0f);
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- const float recip576 = 1.0f / 576.0f;
-
- // V[i][0] = 6*Ww[i][0];
- V[i][0] = vmulq_n_f16(vmulq_n_f16(Ww[i][0], 6.0), recip576);
-
- // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
- V[i][1] = vmulq_n_f16(vmulq_n_f16(vaddq_f16(vaddq_f16(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
-
- // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
- V[i][2] = vmulq_n_f16(vmulq_n_f16(vsubq_f16(vsubq_f16(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
-
- // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
- V[i][3] = vmulq_n_f16(vaddq_f16(vaddq_f16(Ww[i][0], vmulq_f16(Ww[i][1], vdupq_n_f16(2.0f))), vmulq_f16(Ww[i][2], vdupq_n_f16(4.0f))), recip576);
-
- // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
- V[i][4] = vmulq_n_f16(vaddq_f16(vsubq_f16(Ww[i][0], vmulq_f16(Ww[i][1], vdupq_n_f16(2.0f))), vmulq_f16(Ww[i][2], vdupq_n_f16(4.0f))), recip576);
-
- // V[i][5] = 24*Ww[i][2];
- V[i][5] = vmulq_n_f16(vmulq_n_f16(Ww[i][2], 24.0f), recip576);
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1q_f16(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 8;
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed in this kernel
- float16x4_t w[3][3], Ww[6][3], V[6][6];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = vld1_f16(inptrs[i][j]);
- inptrs[i][j] += 4;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- // Ww[0][j] = 6*w[0][j];
- Ww[0][j] = vmul_n_f16(w[0][j], 6.0);
-
- // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
- Ww[1][j] = vmul_n_f16(vadd_f16(vadd_f16(w[0][j], w[1][j]), w[2][j]), -4.0);
-
- // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
- Ww[2][j] = vmul_n_f16(vsub_f16(vsub_f16(w[1][j], w[0][j]), w[2][j]), 4.0);
-
- // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
- Ww[3][j] = vadd_f16(vadd_f16(w[0][j], vmul_f16(w[1][j], vdup_n_f16(2.0f))), vmul_f16(w[2][j], vdup_n_f16(4.0f)));
-
- // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
- Ww[4][j] = vadd_f16(vsub_f16(w[0][j], vmul_f16(w[1][j], vdup_n_f16(2.0f))), vmul_f16(w[2][j], vdup_n_f16(4.0f)));
-
- // Ww[5][j] = 24*w[2][j];
- Ww[5][j] = vmul_n_f16(w[2][j], 24.0f);
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- const float recip576 = 1.0f / 576.0f;
-
- // V[i][0] = 6*Ww[i][0];
- V[i][0] = vmul_n_f16(vmul_n_f16(Ww[i][0], 6.0), recip576);
-
- // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
- V[i][1] = vmul_n_f16(vmul_n_f16(vadd_f16(vadd_f16(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
-
- // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
- V[i][2] = vmul_n_f16(vmul_n_f16(vsub_f16(vsub_f16(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
-
- // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
- V[i][3] = vmul_n_f16(vadd_f16(vadd_f16(Ww[i][0], vmul_f16(Ww[i][1], vdup_n_f16(2.0f))), vmul_f16(Ww[i][2], vdup_n_f16(4.0f))), recip576);
-
- // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
- V[i][4] = vmul_n_f16(vadd_f16(vsub_f16(Ww[i][0], vmul_f16(Ww[i][1], vdup_n_f16(2.0f))), vmul_f16(Ww[i][2], vdup_n_f16(4.0f))), recip576);
-
- // V[i][5] = 24*Ww[i][2];
- V[i][5] = vmul_n_f16(vmul_n_f16(Ww[i][2], 24.0f), recip576);
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1_f16(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 4;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed in this kernel
- __fp16 w[3][3], Ww[6][3], V[6][6];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = *(inptrs[i][j]++);
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- Ww[0][j] = 6*w[0][j];
- Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
- Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
- Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
- Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
- Ww[5][j] = 24*w[2][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- V[i][0] = ( 6*Ww[i][0]) / 576.0;
- V[i][1] = (-4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
- V[i][2] = (-4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
- V[i][3] = ( 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
- V[i][4] = ( 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
- V[i][5] = (24*Ww[i][2]) / 576.0;
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- *(outptr + m*matrix_stride) = V[i][j];
- }
- }
- outptr++;
- }
- }
-}
-
-template class WeightTransform<3, 3, 6, 6, __fp16, __fp16, WinogradRoots::Integers>;
-
-} // namespace
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
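The fp16 kernel removed above is the standard F(4x4, 3x3) weight transform. In matrix form, reconstructed from the row expressions (note 576 = 24^2):

\[
V = \tfrac{1}{576}\,\tilde{G}\,w\,\tilde{G}^{\mathsf T},\qquad
\tilde{G} =
\begin{pmatrix}
6 & 0 & 0\\
-4 & -4 & -4\\
-4 & 4 & -4\\
1 & 2 & 4\\
1 & -2 & 4\\
0 & 0 & 24
\end{pmatrix}
\]

The vector paths fold the 1/576 into the second pass as a multiply by recip576, whereas the scalar tail divides by 576.0 at the end.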
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp
deleted file mode 100644
index 7c2c718bd5..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm.hpp"
-#include "kernel.hpp"
-
-namespace winograd
-{
-
-template <>
-void WeightTransform<3, 3, 6, 6, float, float, WinogradRoots::Integers>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input, // NOTE: Data in HWIO order
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
-)
-{
- // Get pointers to each cell of the weight tensor
- const auto weight_col_stride = n_input_channels * n_output_channels;
- const auto weight_row_stride = 3 * weight_col_stride;
- const float *inptrs[3][3];
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
- }
- }
-
- // For each input channel
- for (int ic = 0; ic < n_input_channels; ic++)
- {
- float *outptr = output + ic * matrix_row_stride;
-
- // For each output channel
- int channels_remaining = n_output_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed in this kernel
- float32x4_t w[3][3], Ww[6][3], V[6][6];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = vld1q_f32(inptrs[i][j]);
- inptrs[i][j] += 4;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- // Ww[0][j] = 6*w[0][j];
- Ww[0][j] = vmulq_n_f32(w[0][j], 6.0);
-
- // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
- Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), -4.0);
-
- // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
- Ww[2][j] = vmulq_n_f32(vsubq_f32(vsubq_f32(w[1][j], w[0][j]), w[2][j]), 4.0);
-
- // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
- Ww[3][j] = vmlaq_n_f32(vmlaq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
-
- // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
- Ww[4][j] = vmlaq_n_f32(vmlsq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
-
- // Ww[5][j] = 24*w[2][j];
- Ww[5][j] = vmulq_n_f32(w[2][j], 24.0f);
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- const float recip576 = 1.0f / 576.0f;
-
- // V[i][0] = 6*Ww[i][0];
- V[i][0] = vmulq_n_f32(vmulq_n_f32(Ww[i][0], 6.0), recip576);
-
- // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
- V[i][1] = vmulq_n_f32(vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
-
- // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
- V[i][2] = vmulq_n_f32(vmulq_n_f32(vsubq_f32(vsubq_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
-
- // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
- V[i][3] = vmulq_n_f32(vmlaq_n_f32(vmlaq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
-
- // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
- V[i][4] = vmulq_n_f32(vmlaq_n_f32(vmlsq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
-
- // V[i][5] = 24*Ww[i][2];
- V[i][5] = vmulq_n_f32(vmulq_n_f32(Ww[i][2], 24.0f), recip576);
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1q_f32(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 4;
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed in this kernel
- float32x2_t w[3][3], Ww[6][3], V[6][6];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = vld1_f32(inptrs[i][j]);
- inptrs[i][j] += 2;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- // Ww[0][j] = 6*w[0][j];
- Ww[0][j] = vmul_n_f32(w[0][j], 6.0);
-
- // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
- Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), -4.0);
-
- // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
- Ww[2][j] = vmul_n_f32(vsub_f32(vsub_f32(w[1][j], w[0][j]), w[2][j]), 4.0);
-
- // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
- Ww[3][j] = vmla_n_f32(vmla_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
-
- // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
- Ww[4][j] = vmla_n_f32(vmls_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
-
- // Ww[5][j] = 24*w[2][j];
- Ww[5][j] = vmul_n_f32(w[2][j], 24.0f);
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- const float recip576 = 1.0f / 576.0f;
-
- // V[i][0] = 6*Ww[i][0];
- V[i][0] = vmul_n_f32(vmul_n_f32(Ww[i][0], 6.0), recip576);
-
- // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
- V[i][1] = vmul_n_f32(vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
-
- // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
- V[i][2] = vmul_n_f32(vmul_n_f32(vsub_f32(vsub_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
-
- // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
- V[i][3] = vmul_n_f32(vmla_n_f32(vmla_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
-
- // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
- V[i][4] = vmul_n_f32(vmla_n_f32(vmls_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
-
- // V[i][5] = 24*Ww[i][2];
- V[i][5] = vmul_n_f32(vmul_n_f32(Ww[i][2], 24.0f), recip576);
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1_f32(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 2;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed in this kernel
- float w[3][3], Ww[6][3], V[6][6];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = *(inptrs[i][j]++);
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- Ww[0][j] = 6*w[0][j];
- Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
- Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
- Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
- Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
- Ww[5][j] = 24*w[2][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- V[i][0] = ( 6*Ww[i][0]) / 576.0;
- V[i][1] = (-4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
- V[i][2] = (-4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
- V[i][3] = ( 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
- V[i][4] = ( 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
- V[i][5] = (24*Ww[i][2]) / 576.0;
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- *(outptr + m*matrix_stride) = V[i][j];
- }
- }
- outptr++;
- }
- }
-}
-
-template class WeightTransform<3, 3, 6, 6, float, float, WinogradRoots::Integers>;
-
-} // namespace
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp
deleted file mode 100644
index 9b42224eaf..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm.hpp"
-#include "kernel.hpp"
-
-namespace winograd
-{
-
-template <>
-void WeightTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input, // NOTE: Data in HWIO order
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
-)
-{
- // Get pointers to each cell of the weight tensor
- const auto weight_col_stride = n_input_channels * n_output_channels;
- const float *inptrs[3];
- for (int j = 0; j < 3; j++)
- {
- inptrs[j] = input + j*weight_col_stride;
- }
-
- // For each input channel
- for (int ic = 0; ic < n_input_channels; ic++)
- {
- float *outptr = output + ic * matrix_row_stride;
-
- // For each output channel
- int channels_remaining = n_output_channels;
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed in this kernel
- float w[3], V[inner_tile_cols];
-
- // Read weights
- for (int j = 0; j < 3; j++)
- {
- w[j] = *(inptrs[j]++);
- }
-
- // Compute V = w WT
- V[0] = (w[0]*-1) / 36.0f;
- V[1] = (w[1]*-1 + w[0]*1 + w[2]*1) / 48.0f;
- V[2] = (w[0]*1 + w[1]*1 + w[2]*1) / 48.0f;
- V[3] = (w[0]*-1 + w[2]*-4 + w[1]*2) / 120.0f;
- V[4] = (w[0]*-1 + w[2]*-4 + w[1]*-2) / 120.0f;
- V[5] = (w[1]*-3 + w[2]*9 + w[0]*1) / 720.0f;
- V[6] = (w[1]*3 + w[2]*9 + w[0]*1) / 720.0f;
- V[7] = (w[2]*1) / 1;
-
- // Store the transformed weights
- for (int j = 0; j < inner_tile_cols; j++)
- {
- *(outptr + j*matrix_stride) = V[j];
- }
- outptr++;
- }
- }
-}
-
-template class WeightTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>;
-template class WeightTransform<3, 1, 8, 1, float, float, WinogradRoots::Integers>;
-
-} // namespace
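As with the 1x5 case earlier, the deleted 1x3 kernel is a scaled polynomial evaluation at the roots {0, +-1, +-2, +-3, infinity}. A minimal standalone sketch that checks this reading against two of the literal expressions (the helper below is hypothetical and not part of the library):

    #include <array>
    #include <cassert>
    #include <cmath>

    // Evaluate w(x) = w0 + w1*x + w2*x^2 for a 1x3 weight kernel.
    static float eval_poly(const std::array<float, 3> &w, float x)
    {
        return w[0] + w[1] * x + w[2] * x * x;
    }

    int main()
    {
        const std::array<float, 3> w{0.3f, -1.2f, 0.7f};

        // Literal expressions from the deleted kernel above.
        const float V1 = (w[1] * -1 + w[0] * 1 + w[2] * 1) / 48.0f;
        const float V5 = (w[1] * -3 + w[2] * 9 + w[0] * 1) / 720.0f;

        // Polynomial-evaluation reading: V1 = w(-1)/48, V5 = w(-3)/720.
        assert(std::fabs(V1 - eval_poly(w, -1.0f) / 48.0f) < 1e-6f);
        assert(std::fabs(V5 - eval_poly(w, -3.0f) / 720.0f) < 1e-6f);
        return 0;
    }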
diff --git a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
index 25d682d8ae..95cdc8f2f9 100644
--- a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
+++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#ifndef ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H
#define ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -157,8 +159,7 @@ struct logistic
*
* @param[in] act_info Activation layer information.
*/
- explicit logistic(ActivationLayerInfo act_info)
- : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
+ explicit logistic(ActivationLayerInfo act_info) : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
{
ARM_COMPUTE_UNUSED(act_info);
}
@@ -197,8 +198,7 @@ struct relu
*
* @param[in] act_info Activation layer information.
*/
- explicit relu(ActivationLayerInfo act_info)
- : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
+ explicit relu(ActivationLayerInfo act_info) : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
{
ARM_COMPUTE_UNUSED(act_info);
}
diff --git a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
index ac196d9dbb..50fff04cad 100644
--- a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
+++ b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IMultiImage.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/NEON/NEMath.h"
#include <arm_neon.h>
@@ -50,8 +51,12 @@ constexpr float rgb2u8_red_coef = 0.2126f;
constexpr float rgb2u8_green_coef = 0.7152f;
constexpr float rgb2u8_blue_coef = 0.0722f;
-inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor,
- const float rcoef, const float gcoef, const float bcoef)
+inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor,
+ const float32x4_t &gcolor,
+ const float32x4_t &bcolor,
+ const float rcoef,
+ const float gcoef,
+ const float bcoef)
{
float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef);
greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef);
@@ -86,8 +91,12 @@ inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out)
arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out);
}
-inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
- float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
+inline void rgb_to_yuv_calculation(const float32x4_t &rvec,
+ const float32x4_t &gvec,
+ const float32x4_t &bvec,
+ float32x4_t &yvec,
+ float32x4_t &uvec,
+ float32x4_t &vvec)
{
/*
Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
@@ -110,8 +119,12 @@ inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &g
vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
}
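The reflowed rgb_to_yuv_calculation above vectorizes per-pixel BT.709 arithmetic; this hunk shows the Y' formula and the final fused multiply-add for vvec. A scalar sketch of the whole computation (the cu/cv constants below are the standard BT.709 values, stated here as an assumption, since only the use of rgb2yuv_bt709_cv is visible in this hunk):

    // Scalar BT.709 R'G'B' -> Y'UV for one pixel, mirroring the NEON version.
    // cu = 1/1.8556 ~= 0.5389, cv = 1/1.5748 ~= 0.6350; U/V are offset by 128.
    inline void rgb_to_yuv_scalar(float r, float g, float b, float &y, float &u, float &v)
    {
        y = 0.2126f * r + 0.7152f * g + 0.0722f * b; // Y' = 0.2126R' + 0.7152G' + 0.0722B'
        u = 128.0f + 0.5389f * (b - y);              // U  = 128 + (B' - Y') * cu
        v = 128.0f + 0.6350f * (r - y);              // V  = 128 + (R' - Y') * cv
    }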
-inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
- float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
+inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val,
+ float32x4_t uvec_val,
+ const float32x4_t &yyvec_val,
+ float32x4_t vvec_val,
+ unsigned char *output_ptr,
+ const bool alpha)
{
float32x4x3_t rgb1, rgb2;
@@ -126,8 +139,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve
// b = 1.8556f*f_u + 0.0000f*f_v;
const auto red = vmulq_n_f32(vvec_val, red_coef_bt709);
const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709);
- const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
- vmulq_n_f32(vvec_val, green_coef2_bt709));
+ const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), vmulq_n_f32(vvec_val, green_coef2_bt709));
// Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
// the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t
@@ -144,7 +156,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve
uint8x8x3_t u8_rgb;
arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
- if(!alpha)
+ if (!alpha)
{
vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
@@ -177,7 +189,7 @@ inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
{
uint8x16x3_t rgb;
- if(alpha)
+ if (alpha)
{
const auto tmp = vld4q_u8(ptr);
rgb.val[0] = tmp.val[0];
@@ -206,12 +218,12 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto
float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
- for(auto i = 0; i < 4; ++i)
+ for (auto i = 0; i < 4; ++i)
{
- rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
- fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
- rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
- fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
+ rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], fyvec_top.val[i], fuvec_top.val[i],
+ fvvec_top.val[i]);
+ rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], fyvec_bottom.val[i],
+ fuvec_bottom.val[i], fvvec_bottom.val[i]);
}
arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]);
@@ -222,9 +234,14 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto
arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]);
}
-inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
- const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
- unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+inline void store_rgb_to_nv12(const uint8x16_t &rvec_top,
+ const uint8x16_t &gvec_top,
+ const uint8x16_t &bvec_top,
+ const uint8x16_t &rvec_bottom,
+ const uint8x16_t &gvec_bottom,
+ const uint8x16_t &bvec_bottom,
+ unsigned char *const __restrict out_y_top,
+ unsigned char *const __restrict out_y_bottom,
unsigned char *const __restrict out_uv)
{
uint8x16x3_t vec_top, vec_bottom;
@@ -252,9 +269,14 @@ inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec
vst2_u8(out_uv, uvvec);
}
-inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
- const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
- unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top,
+ const uint8x16_t &gvec_top,
+ const uint8x16_t &bvec_top,
+ const uint8x16_t &rvec_bottom,
+ const uint8x16_t &gvec_bottom,
+ const uint8x16_t &bvec_bottom,
+ unsigned char *const __restrict out_y_top,
+ unsigned char *const __restrict out_y_bottom,
unsigned char *const __restrict out_u,
unsigned char *const __restrict out_v)
{
@@ -273,14 +295,16 @@ inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec
const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
- const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
- vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
+ const auto uvvec =
+ vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
vst1_u8(out_u, vget_low_u8(uvvec));
vst1_u8(out_v, vget_high_u8(uvvec));
}
-inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
+inline void store_rgb_to_yuv4(const uint8x16_t &rvec,
+ const uint8x16_t &gvec,
+ const uint8x16_t &bvec,
unsigned char *const __restrict out_y,
unsigned char *const __restrict out_u,
unsigned char *const __restrict out_v)
@@ -291,10 +315,9 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co
const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec);
float32x4x4_t fyvec, fuvec, fvvec;
- for(auto i = 0; i < 4; ++i)
+ for (auto i = 0; i < 4; ++i)
{
- rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
- fyvec.val[i], fuvec.val[i], fvvec.val[i]);
+ rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], fyvec.val[i], fuvec.val[i], fvvec.val[i]);
}
uint8x16_t yvec, uvec, vvec;
@@ -307,7 +330,7 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co
vst1q_u8(out_v, vvec);
}
#endif /* DOXYGEN_SKIP_THIS */
-}
+} // namespace
namespace arm_compute
{
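From this point on, the changes to this file are largely mechanical reformatting: every execute_window_loop call moves the window argument and the lambda onto their own lines. The call shape, as a minimal sketch with a placeholder body:

    execute_window_loop(
        win,
        [&](const Coordinates &)
        {
            // Per-iteration work: load via in.ptr(), convert, store via out.ptr().
        },
        in, out);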
@@ -329,17 +352,19 @@ void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict out
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld3q_u8(in.ptr());
- uint8x16x4_t ta2;
- ta2.val[0] = ta1.val[0];
- ta2.val[1] = ta1.val[1];
- ta2.val[2] = ta1.val[2];
- ta2.val[3] = vdupq_n_u8(255);
- vst4q_u8(out.ptr(), ta2);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta1 = vld3q_u8(in.ptr());
+ uint8x16x4_t ta2;
+ ta2.val[0] = ta1.val[0];
+ ta2.val[1] = ta1.val[1];
+ ta2.val[2] = ta1.val[2];
+ ta2.val[3] = vdupq_n_u8(255);
+ vst4q_u8(out.ptr(), ta2);
+ },
+ in, out);
}
/** Convert RGB to U8.
@@ -360,14 +385,16 @@ void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict outpu
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld3q_u8(in.ptr());
- uint8x16_t ta2;
- rgb_to_u8_conversion(ta1, ta2);
- vst1q_u8(out.ptr(), ta2);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta1 = vld3q_u8(in.ptr());
+ uint8x16_t ta2;
+ rgb_to_u8_conversion(ta1, ta2);
+ vst1q_u8(out.ptr(), ta2);
+ },
+ in, out);
}
/** Convert RGBX to RGB.
@@ -388,16 +415,18 @@ void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld4q_u8(in.ptr());
- uint8x16x3_t ta2;
- ta2.val[0] = ta1.val[0];
- ta2.val[1] = ta1.val[1];
- ta2.val[2] = ta1.val[2];
- vst3q_u8(out.ptr(), ta2);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta1 = vld4q_u8(in.ptr());
+ uint8x16x3_t ta2;
+ ta2.val[0] = ta1.val[0];
+ ta2.val[1] = ta1.val[1];
+ ta2.val[2] = ta1.val[2];
+ vst3q_u8(out.ptr(), ta2);
+ },
+ in, out);
}
/** Convert YUYV to RGB.
@@ -422,26 +451,32 @@ void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict out
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta = vld4q_u8(in.ptr());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
- //ta.val[3] = V0 V2 V4 V7 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
- const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
- const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
- const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
-
- yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta = vld4q_u8(in.ptr());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+            //ta.val[3] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
+ const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
+ const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
+ const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
+
+ yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size,
+ alpha);
+ yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size,
+ alpha);
+ yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size,
+ alpha);
+ yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size,
+ alpha);
+ },
+ in, out);
}
/** Convert NV12 to RGB.
@@ -475,35 +510,45 @@ void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict out
Iterator in_uv(input_ptr->plane(1), win_uv);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
-
- yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-
- yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
- },
- in_y, in_uv, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
+ float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
+ float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
+ float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
+ float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
+ float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
+
+ yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0],
+ out.ptr() + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1],
+ out.ptr() + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2],
+ out.ptr() + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3],
+ out.ptr() + 3 * element_size, alpha);
+
+ yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0],
+ out.ptr() + out_stride + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1],
+ out.ptr() + out_stride + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2],
+ out.ptr() + out_stride + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3],
+ out.ptr() + out_stride + 3 * element_size, alpha);
+ },
+ in_y, in_uv, out);
}
/** Convert IYUV to RGB.
@@ -537,59 +582,71 @@ void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict out
Iterator in_v(input_ptr->plane(2), win_uv);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto *y_top_ptr = in_y.ptr();
- const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
- const auto *u_ptr = in_u.ptr();
- const auto *v_ptr = in_v.ptr();
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto *y_top_ptr = in_y.ptr();
+ const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
+ const auto *u_ptr = in_u.ptr();
+ const auto *v_ptr = in_v.ptr();
// Work-around for an issue in gcc 9 and later where vld2q can cause register-allocation problems
#if defined(__aarch64__)
- const auto ta0_y_top = vld1q_u8(y_top_ptr);
- const auto ta1_y_top = vld1q_u8(y_top_ptr + 16);
- const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
- const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
- const auto ta_u = vld1q_u8(u_ptr);
- const auto ta_v = vld1q_u8(v_ptr);
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+ const auto ta0_y_top = vld1q_u8(y_top_ptr);
+ const auto ta1_y_top = vld1q_u8(y_top_ptr + 16);
+ const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
+ const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
+ const auto ta_u = vld1q_u8(u_ptr);
+ const auto ta_v = vld1q_u8(v_ptr);
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
+ float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
+ float32x4x4_t yvec_bottom =
+ arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
+ float32x4x4_t yyvec_bottom =
+ arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
+ float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+ float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
#else /* defined(__aarch64__) */
- const auto ta_y_top = vld2q_u8(y_top_ptr);
- const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
- const auto ta_u = vld1q_u8(u_ptr);
- const auto ta_v = vld1q_u8(v_ptr);
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_u.val[0] = U0 U2 U4 U6 ...
- //ta_v.val[0] = V0 V2 V4 V6 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+ const auto ta_y_top = vld2q_u8(y_top_ptr);
+ const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
+ const auto ta_u = vld1q_u8(u_ptr);
+ const auto ta_v = vld1q_u8(v_ptr);
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_u.val[0] = U0 U2 U4 U6 ...
+ //ta_v.val[0] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
+ float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
+ float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
+ float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
+ float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+ float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
#endif /* defined(__aarch64__) */
- yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-
- yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
- },
- in_y, in_u, in_v, out);
+ yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0],
+ out.ptr() + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1],
+ out.ptr() + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2],
+ out.ptr() + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3],
+ out.ptr() + 3 * element_size, alpha);
+
+ yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0],
+ out.ptr() + out_stride + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1],
+ out.ptr() + out_stride + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2],
+ out.ptr() + out_stride + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3],
+ out.ptr() + out_stride + 3 * element_size, alpha);
+ },
+ in_y, in_u, in_v, out);
}
/** Convert YUYV to NV12.
@@ -621,31 +678,33 @@ void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict ou
Iterator out_y(output_ptr->plane(0), win);
Iterator out_uv(output_ptr->plane(1), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_top = vld4q_u8(in.ptr());
- const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
- //ta.val[3] = V0 V2 V4 V7 ...
-
- uint8x16x2_t yvec;
- yvec.val[0] = ta_top.val[0 + shift];
- yvec.val[1] = ta_top.val[2 + shift];
- vst2q_u8(out_y.ptr(), yvec);
-
- uint8x16x2_t yyvec;
- yyvec.val[0] = ta_bottom.val[0 + shift];
- yyvec.val[1] = ta_bottom.val[2 + shift];
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
-
- uint8x16x2_t uvvec;
- uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
- uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
- vst2q_u8(out_uv.ptr(), uvvec);
- },
- in, out_y, out_uv);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_top = vld4q_u8(in.ptr());
+ const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+            //ta.val[3] = V0 V2 V4 V6 ...
+
+ uint8x16x2_t yvec;
+ yvec.val[0] = ta_top.val[0 + shift];
+ yvec.val[1] = ta_top.val[2 + shift];
+ vst2q_u8(out_y.ptr(), yvec);
+
+ uint8x16x2_t yyvec;
+ yyvec.val[0] = ta_bottom.val[0 + shift];
+ yyvec.val[1] = ta_bottom.val[2 + shift];
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+ uint8x16x2_t uvvec;
+ uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+ uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+ vst2q_u8(out_uv.ptr(), uvvec);
+ },
+ in, out_y, out_uv);
}
/** Convert IYUV to NV12.
@@ -676,23 +735,25 @@ void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict ou
Iterator out_y(output_ptr->plane(0), win);
Iterator out_uv(output_ptr->plane(1), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- uint8x16x2_t ta_uv;
- ta_uv.val[0] = vld1q_u8(in_u.ptr());
- ta_uv.val[1] = vld1q_u8(in_v.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
- vst2q_u8(out_uv.ptr(), ta_uv);
- },
- in_y, in_u, in_v, out_y, out_uv);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ uint8x16x2_t ta_uv;
+ ta_uv.val[0] = vld1q_u8(in_u.ptr());
+ ta_uv.val[1] = vld1q_u8(in_v.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+ vst2q_u8(out_uv.ptr(), ta_uv);
+ },
+ in_y, in_u, in_v, out_y, out_uv);
}
/** Convert NV12 to IYUV.
@@ -726,22 +787,24 @@ void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win_uv);
Iterator out_v(output_ptr->plane(2), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
- vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
- vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
- },
- in_y, in_uv, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+ vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
+ vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
+ },
+ in_y, in_uv, out_y, out_u, out_v);
}
/** Convert YUYV to IYUV.
@@ -774,34 +837,36 @@ void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win_uv);
Iterator out_v(output_ptr->plane(2), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_top = vld4q_u8(in.ptr());
- const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
- //ta.val[3] = V0 V2 V4 V7 ...
-
- uint8x16x2_t yvec;
- yvec.val[0] = ta_top.val[0 + shift];
- yvec.val[1] = ta_top.val[2 + shift];
- vst2q_u8(out_y.ptr(), yvec);
-
- uint8x16x2_t yyvec;
- yyvec.val[0] = ta_bottom.val[0 + shift];
- yyvec.val[1] = ta_bottom.val[2 + shift];
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
-
- uint8x16_t uvec;
- uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
- vst1q_u8(out_u.ptr(), uvec);
-
- uint8x16_t vvec;
- vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
- vst1q_u8(out_v.ptr(), vvec);
- },
- in, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_top = vld4q_u8(in.ptr());
+ const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+ //ta.val[3] = V0 V2 V4 V6 ...
+
+ uint8x16x2_t yvec;
+ yvec.val[0] = ta_top.val[0 + shift];
+ yvec.val[1] = ta_top.val[2 + shift];
+ vst2q_u8(out_y.ptr(), yvec);
+
+ uint8x16x2_t yyvec;
+ yyvec.val[0] = ta_bottom.val[0 + shift];
+ yyvec.val[1] = ta_bottom.val[2 + shift];
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+ uint8x16_t uvec;
+ uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+ vst1q_u8(out_u.ptr(), uvec);
+
+ uint8x16_t vvec;
+ vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+ vst1q_u8(out_v.ptr(), vvec);
+ },
+ in, out_y, out_u, out_v);
}
/** Convert NV12 to YUV4.
@@ -835,32 +900,34 @@ void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win);
Iterator out_v(output_ptr->plane(2), win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-
- uint8x16x2_t uvec;
- uvec.val[0] = ta_uv.val[0 + shift];
- uvec.val[1] = ta_uv.val[0 + shift];
- vst2q_u8(out_u.ptr(), uvec);
- vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
-
- uint8x16x2_t vvec;
- vvec.val[0] = ta_uv.val[1 - shift];
- vvec.val[1] = ta_uv.val[1 - shift];
- vst2q_u8(out_v.ptr(), vvec);
- vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
- },
- in_y, in_uv, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+ uint8x16x2_t uvec;
+ uvec.val[0] = ta_uv.val[0 + shift];
+ uvec.val[1] = ta_uv.val[0 + shift];
+ vst2q_u8(out_u.ptr(), uvec);
+ vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+ uint8x16x2_t vvec;
+ vvec.val[0] = ta_uv.val[1 - shift];
+ vvec.val[1] = ta_uv.val[1 - shift];
+ vst2q_u8(out_v.ptr(), vvec);
+ vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+ },
+ in_y, in_uv, out_y, out_u, out_v);
}
/** Convert IYUV to YUV4.
@@ -892,33 +959,35 @@ void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win);
Iterator out_v(output_ptr->plane(2), win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_u = vld1q_u8(in_u.ptr());
- const auto ta_v = vld1q_u8(in_v.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_u = U0 U2 U4 U6 ...
- //ta_v = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-
- uint8x16x2_t uvec;
- uvec.val[0] = ta_u;
- uvec.val[1] = ta_u;
- vst2q_u8(out_u.ptr(), uvec);
- vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
-
- uint8x16x2_t vvec;
- vvec.val[0] = ta_v;
- vvec.val[1] = ta_v;
- vst2q_u8(out_v.ptr(), vvec);
- vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
- },
- in_y, in_u, in_v, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_u = vld1q_u8(in_u.ptr());
+ const auto ta_v = vld1q_u8(in_v.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_u = U0 U2 U4 U6 ...
+ //ta_v = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+ uint8x16x2_t uvec;
+ uvec.val[0] = ta_u;
+ uvec.val[1] = ta_u;
+ vst2q_u8(out_u.ptr(), uvec);
+ vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+ uint8x16x2_t vvec;
+ vvec.val[0] = ta_v;
+ vvec.val[1] = ta_v;
+ vst2q_u8(out_v.ptr(), vvec);
+ vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+ },
+ in_y, in_u, in_v, out_y, out_u, out_v);
}
/** Convert RGB to NV12.
@@ -948,20 +1017,21 @@ void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict out
Iterator out_y(output_ptr->plane(0), win);
Iterator out_uv(output_ptr->plane(1), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
- const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
- ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
- out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
- out_uv.ptr());
- },
- in, out_y, out_uv);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
+ const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0],
+ ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(),
+ out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_uv.ptr());
+ },
+ in, out_y, out_uv);
}
/** Convert RGB to IYUV.
@@ -992,20 +1062,22 @@ void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict out
Iterator out_u(output_ptr->plane(1), win_uv);
Iterator out_v(output_ptr->plane(2), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
- const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
- ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
- out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
- out_u.ptr(), out_v.ptr());
- },
- in, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
+ const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0],
+ ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(),
+ out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_u.ptr(),
+ out_v.ptr());
+ },
+ in, out_y, out_u, out_v);
}
/** Convert RGB to YUV4.
@@ -1030,16 +1102,17 @@ void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict out
Iterator out_u(output_ptr->plane(1), win);
Iterator out_v(output_ptr->plane(2), win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb = load_rgb(in.ptr(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
- out_y.ptr(), out_u.ptr(), out_v.ptr());
- },
- in, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_rgb = load_rgb(in.ptr(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], out_y.ptr(), out_u.ptr(), out_v.ptr());
+ },
+ in, out_y, out_u, out_v);
}
} // namespace arm_compute
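All of the colour-conversion loops above share one pattern: de-interleave with vld2q_u8/vld4q_u8, permute or average the planes, then store with vst1q_u8/vst2q_u8. The 4:2:0 chroma step in the YUYV-to-NV12 path reduces two rows to one with vhaddq_u8, a halving add that averages lanes without overflowing 8 bits. A minimal standalone sketch of that step (the helper name is ours, not the library's):

    #include <arm_neon.h>
    #include <cstdint>

    // Average two rows of interleaved chroma, as the YUYV -> NV12 loop does:
    // vhaddq_u8 computes (top + bottom) >> 1 per 8-bit lane, overflow-free.
    inline void average_chroma_rows(const uint8_t *top, const uint8_t *bottom, uint8_t *out)
    {
        const uint8x16_t t = vld1q_u8(top);    // 16 chroma samples, even row
        const uint8x16_t b = vld1q_u8(bottom); // 16 chroma samples, odd row
        vst1q_u8(out, vhaddq_u8(t, b));
    }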
diff --git a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
index 96defbc9c9..4b1eb079b2 100644
--- a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
+++ b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
@@ -33,56 +33,32 @@ namespace detail
{
inline float32x4x3_t load_matrix_row(const float *ptr)
{
- const float32x4x3_t r =
- {
- {
- vld1q_dup_f32(ptr),
- vld1q_dup_f32(1 + ptr),
- vld1q_dup_f32(2 + ptr)
- }
- };
+ const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}};
return r;
}
template <unsigned int stridex>
-float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2);
+float32x4x2_t convolve_3x3(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2);
template <>
-inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<1>(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2)
{
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + 4),
- vld1q_f32(in_top + 8)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + 4),
- vld1q_f32(in_mid + 8)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + 4),
- vld1q_f32(in_low + 8)
- }
- };
- float32x4x2_t out =
- {
- {
- vmulq_f32(vtop.val[0], m0.val[0]),
- vmulq_f32(vtop.val[1], m0.val[0])
- }
- };
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+ const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}};
+ const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}};
+ const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}};
+ float32x4x2_t out = {{vmulq_f32(vtop.val[0], m0.val[0]), vmulq_f32(vtop.val[1], m0.val[0])}};
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
@@ -106,7 +82,12 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c
}
template <>
-inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<2>(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2)
{
float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
@@ -116,7 +97,12 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c
}
template <>
-inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<3>(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2)
{
float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
@@ -165,6 +151,6 @@ int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteratio
{
return num_elems_written_per_iteration * 3;
}
-}
+} // namespace detail
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */
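The stride-1 convolve_3x3 specialisation above loads twelve consecutive floats per row and derives the +1 and +2 shifted windows with vextq_f32, so each load feeds three multiply-accumulates. A hedged sketch of that sliding-window step for one input row (helper name and signature are illustrative):

    #include <arm_neon.h>

    // One row's contribution to a 3x3 convolution at stride 1:
    // acc[i] += row[i]*w0 + row[i+1]*w1 + row[i+2]*w2 for four outputs.
    inline float32x4_t
    row_dot3(const float *row, float32x4_t w0, float32x4_t w1, float32x4_t w2, float32x4_t acc)
    {
        const float32x4_t v0 = vld1q_f32(row);     // row[0..3]
        const float32x4_t v1 = vld1q_f32(row + 4); // row[4..7]
        acc = vmlaq_f32(acc, v0, w0);
        acc = vmlaq_f32(acc, vextq_f32(v0, v1, 1), w1); // row[1..4]
        acc = vmlaq_f32(acc, vextq_f32(v0, v1, 2), w2); // row[2..5]
        return acc;
    }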
diff --git a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
index 779db6030d..fd1ee54597 100644
--- a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
+++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,7 @@
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "support/Requires.h"
+#include "support/AclRequires.h"
#include <arm_neon.h>
@@ -45,14 +45,7 @@ namespace detail
inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0)
{
ARM_COMPUTE_UNUSED(weights_offset);
- const float32x4x3_t r =
- {
- {
- vld1q_dup_f32(ptr),
- vld1q_dup_f32(1 + ptr),
- vld1q_dup_f32(2 + ptr)
- }
- };
+ const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}};
return r;
}
@@ -63,21 +56,16 @@ inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0)
*
* @return The loaded matrix.
*/
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0)
{
const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
/* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
- int32x4x3_t r =
- {
- {
- vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
- vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
- vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))
- }
- };
+ int32x4x3_t r = {{vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
+ vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
+ vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))}};
return r;
}
@@ -245,36 +233,23 @@ inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values
* @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- const size_t dilation_x, int input_offset)
+inline float32x4_t single_convolve_3x3_dilation(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ const size_t dilation_x,
+ int input_offset)
{
ARM_COMPUTE_UNUSED(input_offset);
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + dilation_x),
- vld1q_f32(in_top + 2 * dilation_x)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + dilation_x),
- vld1q_f32(in_mid + 2 * dilation_x)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + dilation_x),
- vld1q_f32(in_low + 2 * dilation_x)
- }
- };
+ const float32x4x3_t vtop = {
+ {vld1q_f32(in_top), vld1q_f32(in_top + dilation_x), vld1q_f32(in_top + 2 * dilation_x)}};
+ const float32x4x3_t vmid = {
+ {vld1q_f32(in_mid), vld1q_f32(in_mid + dilation_x), vld1q_f32(in_mid + 2 * dilation_x)}};
+ const float32x4x3_t vlow = {
+ {vld1q_f32(in_low), vld1q_f32(in_low + dilation_x), vld1q_f32(in_low + 2 * dilation_x)}};
float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
out = vmlaq_f32(out, vtop.val[1], m0.val[1]);
out = vmlaq_f32(out, vtop.val[2], m0.val[2]);
@@ -303,26 +278,28 @@ inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float
* @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset = 0)
+inline float32x4x2_t convolve_3x3_dilation(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ const size_t dilation_x,
+ unsigned int stridex,
+ int input_offset = 0)
{
ARM_COMPUTE_ERROR_ON(stridex > 3);
- float32x4x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
- }
- };
+ float32x4x2_t out = {
+ {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+ single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}};
- if(stridex == 2)
+ if (stridex == 2)
{
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
}
@@ -344,26 +321,32 @@ inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_
*
*/
template <bool accumulate>
-void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- unsigned int stridex, int input_offset = 0);
+void convolve_3x3(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ float *out_ptr,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ unsigned int stridex,
+ int input_offset = 0);
template <bool accumulate>
-inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- unsigned int stridex, int input_offset)
+inline void convolve_3x3(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ float *out_ptr,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ unsigned int stridex,
+ int input_offset)
{
ARM_COMPUTE_UNUSED(input_offset);
ARM_COMPUTE_ERROR_ON(stridex > 3);
- float32x4x2_t out =
- {
- {
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f)
- }
- };
- if(stridex == 2)
+ float32x4x2_t out = {{vdupq_n_f32(0.f), vdupq_n_f32(0.f)}};
+ if (stridex == 2)
{
const float32x4x2_t vtop = vld2q_f32(in_top);
const float32x4x2_t vmid = vld2q_f32(in_mid);
@@ -389,32 +372,11 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float *
}
else
{
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + 4),
- vld1q_f32(in_top + 8)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + 4),
- vld1q_f32(in_mid + 8)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + 4),
- vld1q_f32(in_low + 8)
- }
- };
- out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
- out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
+ const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}};
+ const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}};
+ const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}};
+ out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
+ out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
@@ -438,7 +400,7 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float *
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
- if(stridex == 3)
+ if (stridex == 3)
{
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
@@ -462,65 +424,43 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float *
* @param[in] input_offset Input quantization offset.
*
*/
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low,
- const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- size_t dilation_x, int32_t input_offset)
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
+inline int32x4_t single_convolve_3x3_dilation(const T *in_top,
+ const T *in_mid,
+ const T *in_low,
+ const int32x4x3_t &m0,
+ const int32x4x3_t &m1,
+ const int32x4x3_t &m2,
+ size_t dilation_x,
+ int32_t input_offset)
{
using VectorType = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type;
using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
- const VectorType vtop =
- {
- {
- wrapper::vload(in_top),
- wrapper::vload(in_top + dilation_x),
- wrapper::vload(in_top + 2 * dilation_x)
- }
- };
- const VectorType vmid =
- {
- {
- wrapper::vload(in_mid),
- wrapper::vload(in_mid + dilation_x),
- wrapper::vload(in_mid + 2 * dilation_x)
- }
- };
- const VectorType vlow =
- {
- {
- wrapper::vload(in_low),
- wrapper::vload(in_low + dilation_x),
- wrapper::vload(in_low + 2 * dilation_x)
- }
- };
-
- const int32x4x3_t vtop_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
- }
- };
- const int32x4x3_t vmid_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
- }
- };
- const int32x4x3_t vlow_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
- }
- };
+ const VectorType vtop = {
+ {wrapper::vload(in_top), wrapper::vload(in_top + dilation_x), wrapper::vload(in_top + 2 * dilation_x)}};
+ const VectorType vmid = {
+ {wrapper::vload(in_mid), wrapper::vload(in_mid + dilation_x), wrapper::vload(in_mid + 2 * dilation_x)}};
+ const VectorType vlow = {
+ {wrapper::vload(in_low), wrapper::vload(in_low + dilation_x), wrapper::vload(in_low + 2 * dilation_x)}};
+
+ const int32x4x3_t vtop_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
+ }};
+ const int32x4x3_t vmid_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
+ }};
+ const int32x4x3_t vlow_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
+ }};
int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]);
out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]);
@@ -550,26 +490,29 @@ inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid,
* @param[in] input_offset Input quantization offset.
*
*/
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset)
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
+inline int32x4x2_t convolve_3x3_dilation(const T *in_top,
+ const T *in_mid,
+ const T *in_low,
+ const int32x4x3_t &m0,
+ const int32x4x3_t &m1,
+ const int32x4x3_t &m2,
+ const size_t dilation_x,
+ unsigned int stridex,
+ int input_offset)
{
ARM_COMPUTE_ERROR_ON(stridex > 3);
- int32x4x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
- }
- };
+ int32x4x2_t out = {
+ {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+ single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}};
- if(stridex == 2)
+ if (stridex == 2)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
}
@@ -589,10 +532,19 @@ inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const
* @param[in] input_offset Input quantization offset.
*
*/
-template < bool accumulate, typename T1, typename T2, ARM_COMPUTE_REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value) >
-void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr,
- const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- unsigned int stridex, int32_t input_offset)
+template <bool accumulate,
+ typename T1,
+ typename T2,
+ ARM_COMPUTE_REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value)>
+void convolve_3x3(const T1 *in_top,
+ const T1 *in_mid,
+ const T1 *in_low,
+ T2 *out_ptr,
+ const int32x4x3_t &m0,
+ const int32x4x3_t &m1,
+ const int32x4x3_t &m2,
+ unsigned int stridex,
+ int32_t input_offset)
{
ARM_COMPUTE_ERROR_ON(stridex > 3);
using VectorType = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
@@ -600,60 +552,30 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_
const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
- const VectorType vtop =
- {
- {
- wrapper::vload(in_top),
- wrapper::vload(in_top + 8)
- }
- };
- const VectorType vmid =
- {
- {
- wrapper::vload(in_mid),
- wrapper::vload(in_mid + 8)
- }
- };
- const VectorType vlow =
- {
- {
- wrapper::vload(in_low),
- wrapper::vload(in_low + 8)
- }
- };
-
- const int32x4x3_t vtop_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
- }
- };
- const int32x4x3_t vmid_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
- }
- };
- const int32x4x3_t vlow_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
- }
- };
-
- int32x4x2_t out
- {
- {
- wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
- wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
- }
- };
+ const VectorType vtop = {{wrapper::vload(in_top), wrapper::vload(in_top + 8)}};
+ const VectorType vmid = {{wrapper::vload(in_mid), wrapper::vload(in_mid + 8)}};
+ const VectorType vlow = {{wrapper::vload(in_low), wrapper::vload(in_low + 8)}};
+
+ const int32x4x3_t vtop_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
+ }};
+ const int32x4x3_t vmid_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
+ }};
+ const int32x4x3_t vlow_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
+ }};
+
+ int32x4x2_t out{{
+ wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
+ wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
+ }};
// 0
out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);
@@ -681,11 +603,11 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_
out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);
out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);
- if(stridex == 1)
+ if (stridex == 1)
{
accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
}
- else if(stridex == 2)
+ else if (stridex == 2)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
@@ -693,7 +615,7 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_
accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
@@ -712,14 +634,7 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset =
ARM_COMPUTE_UNUSED(weights_offset);
/* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
- const float16x8x3_t r =
- {
- {
- vld1q_dup_f16(ptr),
- vld1q_dup_f16(1 + ptr),
- vld1q_dup_f16(2 + ptr)
- }
- };
+ const float16x8x3_t r = {{vld1q_dup_f16(ptr), vld1q_dup_f16(1 + ptr), vld1q_dup_f16(2 + ptr)}};
return r;
}
@@ -735,35 +650,22 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset =
 * @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- const size_t dilation_x, int input_offset = 0)
+inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top,
+ const float16_t *in_mid,
+ const float16_t *in_low,
+ const float16x8x3_t &m0,
+ const float16x8x3_t &m1,
+ const float16x8x3_t &m2,
+ const size_t dilation_x,
+ int input_offset = 0)
{
ARM_COMPUTE_UNUSED(input_offset);
- const float16x8x3_t vtop =
- {
- {
- vld1q_f16(in_top),
- vld1q_f16(in_top + dilation_x),
- vld1q_f16(in_top + 2 * dilation_x)
- }
- };
- const float16x8x3_t vmid =
- {
- {
- vld1q_f16(in_mid),
- vld1q_f16(in_mid + dilation_x),
- vld1q_f16(in_mid + 2 * dilation_x)
- }
- };
- const float16x8x3_t vlow =
- {
- {
- vld1q_f16(in_low),
- vld1q_f16(in_low + dilation_x),
- vld1q_f16(in_low + 2 * dilation_x)
- }
- };
+ const float16x8x3_t vtop = {
+ {vld1q_f16(in_top), vld1q_f16(in_top + dilation_x), vld1q_f16(in_top + 2 * dilation_x)}};
+ const float16x8x3_t vmid = {
+ {vld1q_f16(in_mid), vld1q_f16(in_mid + dilation_x), vld1q_f16(in_mid + 2 * dilation_x)}};
+ const float16x8x3_t vlow = {
+ {vld1q_f16(in_low), vld1q_f16(in_low + dilation_x), vld1q_f16(in_low + 2 * dilation_x)}};
float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]);
out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1]));
out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2]));
@@ -792,19 +694,21 @@ inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const f
* @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset = 0)
-{
- float16x8x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)
- }
- };
-
- if(stridex == 2)
+inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top,
+ const float16_t *in_mid,
+ const float16_t *in_low,
+ const float16x8x3_t &m0,
+ const float16x8x3_t &m1,
+ const float16x8x3_t &m2,
+ const size_t dilation_x,
+ unsigned int stridex,
+ int input_offset = 0)
+{
+ float16x8x2_t out = {
+ {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+ single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)}};
+
+ if (stridex == 2)
{
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2);
@@ -814,7 +718,7 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
@@ -838,20 +742,20 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1
*
*/
template <bool accumulate>
-inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, float16_t *out_ptr,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- unsigned int stridex, int input_offset = 0)
+inline void convolve_3x3(const float16_t *in_top,
+ const float16_t *in_mid,
+ const float16_t *in_low,
+ float16_t *out_ptr,
+ const float16x8x3_t &m0,
+ const float16x8x3_t &m1,
+ const float16x8x3_t &m2,
+ unsigned int stridex,
+ int input_offset = 0)
{
ARM_COMPUTE_UNUSED(input_offset);
- float16x8x2_t out =
- {
- {
- vdupq_n_f16(0),
- vdupq_n_f16(0)
- }
- };
- if(stridex == 2)
+ float16x8x2_t out = {{vdupq_n_f16(0), vdupq_n_f16(0)}};
+ if (stridex == 2)
{
const float16x8x2_t vtop = vld2q_f16(in_top);
const float16x8x2_t vmid = vld2q_f16(in_mid);
@@ -877,32 +781,11 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const
}
else
{
- const float16x8x3_t vtop =
- {
- {
- vld1q_f16(in_top),
- vld1q_f16(in_top + 8),
- vld1q_f16(in_top + 16)
- }
- };
- const float16x8x3_t vmid =
- {
- {
- vld1q_f16(in_mid),
- vld1q_f16(in_mid + 8),
- vld1q_f16(in_mid + 16)
- }
- };
- const float16x8x3_t vlow =
- {
- {
- vld1q_f16(in_low),
- vld1q_f16(in_low + 8),
- vld1q_f16(in_low + 16)
- }
- };
- out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
- out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);
+ const float16x8x3_t vtop = {{vld1q_f16(in_top), vld1q_f16(in_top + 8), vld1q_f16(in_top + 16)}};
+ const float16x8x3_t vmid = {{vld1q_f16(in_mid), vld1q_f16(in_mid + 8), vld1q_f16(in_mid + 16)}};
+ const float16x8x3_t vlow = {{vld1q_f16(in_low), vld1q_f16(in_low + 8), vld1q_f16(in_low + 16)}};
+ out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
+ out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);
out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
@@ -921,7 +804,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const
out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
- if(stridex == 3)
+ if (stridex == 3)
{
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
@@ -946,7 +829,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const
*/
inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex)
{
- switch(stridex)
+ switch (stridex)
{
case 1:
return num_elems_written_per_iteration;
@@ -959,6 +842,6 @@ inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iter
return 0;
}
}
-}
+} // namespace detail
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */
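The quantized (uint8_t/int8_t) overloads in this header all widen their samples through the same chain before multiplying: vmovl to 16 bits, take one half, reinterpret as signed, then vaddw the input quantization offset into 32-bit lanes. Expressed in plain intrinsics, without the wrapper:: layer, that per-vector step is roughly (helper name is ours):

    #include <arm_neon.h>
    #include <cstdint>

    // Widen the low four uint8 lanes to int32 and fold in the quantization
    // offset, mirroring the wrapper::vaddw(v_input_offset, ...) chains above.
    inline int32x4_t widen_u8_with_offset(const uint8_t *ptr, int32_t input_offset)
    {
        const uint8x8_t  v8  = vld1_u8(ptr); // 8 quantized samples
        const uint16x8_t v16 = vmovl_u8(v8); // widen to u16
        const int16x4_t  lo  = vreinterpret_s16_u16(vget_low_u16(v16));
        return vaddw_s16(vdupq_n_s32(input_offset), lo);
    }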
diff --git a/src/core/NEON/wrapper/intrinsics/cvt.h b/src/core/NEON/wrapper/intrinsics/cvt.h
index 6e79a92bc2..381de2284a 100644
--- a/src/core/NEON/wrapper/intrinsics/cvt.h
+++ b/src/core/NEON/wrapper/intrinsics/cvt.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,12 +30,11 @@ namespace arm_compute
{
namespace wrapper
{
-#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type \
- vcvt(const vtype &a) \
- { \
- return prefix##_##postfix1##_##postfix2(a); \
+#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type vcvt(const vtype &a) \
+ { \
+ return prefix##_##postfix1##_##postfix2(a); \
}
VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32)
@@ -46,12 +45,11 @@ VCVT_TO_F32_IMPL(float32x4_t, float16x4_t, vcvt, f32, f16)
#undef VCVT_TO_F32_IMPL
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type \
- vcvt(const vtype &a) \
- { \
- return prefix##_##postfix1##_##postfix2(a); \
+#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type vcvt(const vtype &a) \
+ { \
+ return prefix##_##postfix1##_##postfix2(a); \
}
VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32)
@@ -59,20 +57,34 @@ VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint32x4_t>::type
+inline typename std::enable_if<std::is_same<T, uint8_t>::value || std::is_same<T, uint32_t>::value, uint32x4_t>::type
vcvt(const float32x4_t &a)
{
return vcvtq_u32_f32(a);
}
template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int32x4_t>::type
+inline typename std::enable_if<std::is_same<T, int8_t>::value || std::is_same<T, int32_t>::value, int32x4_t>::type
vcvt(const float32x4_t &a)
{
return vcvtq_s32_f32(a);
}
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
+#ifdef __aarch64__
+template <typename T>
+inline typename std::enable_if<std::is_same<T, uint32_t>::value, uint32x4_t>::type vcvta(const float32x4_t &a)
+{
+ return vcvtaq_u32_f32(a);
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type vcvta(const float32x4_t &a)
+{
+ return vcvtaq_s32_f32(a);
+}
+#endif //__aarch64__
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
/** Convert 2x128-bit floating point vectors into 1x128-bit bfloat16 vector
*
* @param[in] inptr Pointer to the input memory to load values from
@@ -80,16 +92,15 @@ vcvt(const float32x4_t &a)
*/
inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr)
{
- __asm __volatile(
- "ldp q0, q1, [%[inptr]]\n"
- ".inst 0xea16800\n" // BFCVTN v0, v0
- ".inst 0x4ea16820\n" // BFCVTN2 v0, v1
- "str q0, [%[outptr]]\n"
- : [inptr] "+r"(inptr)
- : [outptr] "r"(outptr)
- : "v0", "v1", "memory");
+ __asm __volatile("ldp q0, q1, [%[inptr]]\n"
+ ".inst 0xea16800\n" // BFCVTN v0, v0
+ ".inst 0x4ea16820\n" // BFCVTN2 v0, v1
+ "str q0, [%[outptr]]\n"
+ : [inptr] "+r"(inptr)
+ : [outptr] "r"(outptr)
+ : "v0", "v1", "memory");
}
-#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
+#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
} // namespace wrapper
} // namespace arm_compute
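Two things change in cvt.h: the float-to-integer vcvt overloads now also accept 32-bit destination types, and the new vcvta overloads expose AArch64's round-to-nearest, ties-away conversion, as opposed to the truncation toward zero performed by vcvtq_s32_f32. A small illustration of the difference (assumes an AArch64 target):

    #include <arm_neon.h>

    // vcvtq_s32_f32 truncates; vcvtaq_s32_f32 rounds to nearest with ties
    // away from zero. Expected lanes are given in the comments.
    void cvt_demo()
    {
        const float       in[4] = {2.7f, -2.5f, 0.4f, 1.5f};
        const float32x4_t v     = vld1q_f32(in);
        const int32x4_t   trunc = vcvtq_s32_f32(v);  // { 2, -2, 0, 1 }
        const int32x4_t   away  = vcvtaq_s32_f32(v); // { 3, -3, 0, 2 }
        (void)trunc;
        (void)away;
    }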
diff --git a/src/core/NEON/wrapper/intrinsics/div.h b/src/core/NEON/wrapper/intrinsics/div.h
index 265f30d33b..ece991a5b0 100644
--- a/src/core/NEON/wrapper/intrinsics/div.h
+++ b/src/core/NEON/wrapper/intrinsics/div.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_DIV_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/erf.h b/src/core/NEON/wrapper/intrinsics/erf.h
new file mode 100644
index 0000000000..0e34462b96
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/erf.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_WRAPPER_ERF_H
+#define ARM_COMPUTE_WRAPPER_ERF_H
+
+#include "src/core/NEON/NEMath.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VERF_IMPL(vtype, prefix, postfix) \
+ inline vtype verf(const vtype &a) \
+ { \
+ return prefix##_##postfix(a); \
+ }
+
+VERF_IMPL(float32x4_t, verfq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VERF_IMPL(float16x8_t, verfq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VERF_IMPL
+
+} // namespace wrapper
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_WRAPPER_ERF_H */
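The new erf.h follows the house pattern for wrapper intrinsics: a macro stamps out one verf overload per vector type, each forwarding to the verfq_* implementation declared in NEMath.h. Callers then spell the lane-wise error function identically for every type; a usage sketch (assuming the header is on the include path):

    #include "src/core/NEON/wrapper/intrinsics/erf.h"

    // Lane-wise erf; the same call compiles for float16x8_t when FP16
    // vector arithmetic is available.
    float32x4_t erf_lanes(float32x4_t x)
    {
        return arm_compute::wrapper::verf(x);
    }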
diff --git a/src/core/NEON/wrapper/intrinsics/exp.h b/src/core/NEON/wrapper/intrinsics/exp.h
index c2a6970967..f44577b926 100644
--- a/src/core/NEON/wrapper/intrinsics/exp.h
+++ b/src/core/NEON/wrapper/intrinsics/exp.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_EXP_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/getlane.h b/src/core/NEON/wrapper/intrinsics/getlane.h
index 2052751612..ae813bb2fa 100644
--- a/src/core/NEON/wrapper/intrinsics/getlane.h
+++ b/src/core/NEON/wrapper/intrinsics/getlane.h
@@ -33,7 +33,7 @@ namespace wrapper
#define VGETLANE_IMPL_8(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vget_lane_##postfix(vector, 0); \
@@ -59,7 +59,7 @@ namespace wrapper
#define VGETLANE_IMPL_4(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vget_lane_##postfix(vector, 0); \
@@ -77,7 +77,7 @@ namespace wrapper
#define VGETLANE_IMPL_2(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vget_lane_##postfix(vector, 0); \
@@ -102,7 +102,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
#define VGETQLANE_IMPL_16(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vgetq_lane_##postfix(vector, 0); \
@@ -144,7 +144,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
#define VGETQLANE_IMPL_8(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vgetq_lane_##postfix(vector, 0); \
@@ -170,7 +170,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
#define VGETQLANE_IMPL_4(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vgetq_lane_##postfix(vector, 0); \
@@ -188,7 +188,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
#define VGETQLANE_IMPL_2(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vgetq_lane_##postfix(vector, 0); \
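The switch statements being reformatted in this file exist because the underlying vget_lane_*/vgetq_lane_* intrinsics require the lane index to be a compile-time constant; the wrapper trades a small branch for a runtime-selectable lane. The macro-generated code for one small case expands to essentially this (the default branch here stands in for the library's error macro):

    #include <arm_neon.h>
    #include <cstdint>

    // Runtime lane read: each case instantiates the intrinsic with the
    // immediate operand NEON demands.
    inline uint32_t get_lane_u32(const uint32x2_t vector, const unsigned int lane)
    {
        switch (lane)
        {
            case 0:
                return vget_lane_u32(vector, 0);
            case 1:
                return vget_lane_u32(vector, 1);
            default:
                __builtin_trap(); // library code raises ARM_COMPUTE_ERROR here
        }
    }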
diff --git a/src/core/NEON/wrapper/intrinsics/intrinsics.h b/src/core/NEON/wrapper/intrinsics/intrinsics.h
index 871d9cc5ac..97975ebe7c 100644
--- a/src/core/NEON/wrapper/intrinsics/intrinsics.h
+++ b/src/core/NEON/wrapper/intrinsics/intrinsics.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,6 +39,7 @@
#include "src/core/NEON/wrapper/intrinsics/div.h"
#include "src/core/NEON/wrapper/intrinsics/dup_n.h"
#include "src/core/NEON/wrapper/intrinsics/eor.h"
+#include "src/core/NEON/wrapper/intrinsics/erf.h"
#include "src/core/NEON/wrapper/intrinsics/exp.h"
#include "src/core/NEON/wrapper/intrinsics/ext.h"
#include "src/core/NEON/wrapper/intrinsics/gethigh.h"
@@ -66,6 +67,7 @@
#include "src/core/NEON/wrapper/intrinsics/rev64.h"
#include "src/core/NEON/wrapper/intrinsics/round.h"
#include "src/core/NEON/wrapper/intrinsics/setlane.h"
+#include "src/core/NEON/wrapper/intrinsics/shr.h"
#include "src/core/NEON/wrapper/intrinsics/sin.h"
#include "src/core/NEON/wrapper/intrinsics/sqrt.h"
#include "src/core/NEON/wrapper/intrinsics/store.h"
diff --git a/src/core/NEON/wrapper/intrinsics/inv.h b/src/core/NEON/wrapper/intrinsics/inv.h
index de398b0403..e443be679b 100644
--- a/src/core/NEON/wrapper/intrinsics/inv.h
+++ b/src/core/NEON/wrapper/intrinsics/inv.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_INV_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/invsqrt.h b/src/core/NEON/wrapper/intrinsics/invsqrt.h
index 2343efa8f8..257b445cc7 100644
--- a/src/core/NEON/wrapper/intrinsics/invsqrt.h
+++ b/src/core/NEON/wrapper/intrinsics/invsqrt.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_INVSQRT_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/log.h b/src/core/NEON/wrapper/intrinsics/log.h
index 357a77ca78..d091407edb 100644
--- a/src/core/NEON/wrapper/intrinsics/log.h
+++ b/src/core/NEON/wrapper/intrinsics/log.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_LOG_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/max.h b/src/core/NEON/wrapper/intrinsics/max.h
index cec437d171..32d38a856c 100644
--- a/src/core/NEON/wrapper/intrinsics/max.h
+++ b/src/core/NEON/wrapper/intrinsics/max.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_WRAPPER_MAX_H
-#define ARM_COMPUTE_WRAPPER_MAX_H
+#ifndef ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H
+#define ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H
#include <arm_neon.h>
@@ -59,6 +59,39 @@ VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#undef VMAX_IMPL
+
+#if defined(__aarch64__)
+// VMAXV: Across vector max
+#define VMAXV_IMPL(stype, vtype, prefix, postfix) \
+ inline stype vmaxv(const vtype &a) \
+ { \
+ return prefix##_##postfix(a); \
+ }
+
+VMAXV_IMPL(uint8_t, uint8x8_t, vmaxv, u8)
+VMAXV_IMPL(int8_t, int8x8_t, vmaxv, s8)
+VMAXV_IMPL(uint16_t, uint16x4_t, vmaxv, u16)
+VMAXV_IMPL(int16_t, int16x4_t, vmaxv, s16)
+VMAXV_IMPL(uint32_t, uint32x2_t, vmaxv, u32)
+VMAXV_IMPL(int32_t, int32x2_t, vmaxv, s32)
+VMAXV_IMPL(float, float32x2_t, vmaxv, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAXV_IMPL(float16_t, float16x4_t, vmaxv, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VMAXV_IMPL(uint8_t, uint8x16_t, vmaxvq, u8)
+VMAXV_IMPL(int8_t, int8x16_t, vmaxvq, s8)
+VMAXV_IMPL(uint16_t, uint16x8_t, vmaxvq, u16)
+VMAXV_IMPL(int16_t, int16x8_t, vmaxvq, s16)
+VMAXV_IMPL(uint32_t, uint32x4_t, vmaxvq, u32)
+VMAXV_IMPL(int32_t, int32x4_t, vmaxvq, s32)
+VMAXV_IMPL(float, float32x4_t, vmaxvq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAXV_IMPL(float16_t, float16x8_t, vmaxvq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VMAXV_IMPL
+#endif // defined(__aarch64__)
} // namespace wrapper
} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MAX_H */
+#endif // ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H
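The new vmaxv/vmaxvq wrappers expose AArch64's across-vector maximum, which collapses a whole register to one scalar and is the natural final step of a vectorised reduction. A hedged usage sketch (n is assumed to be a multiple of 4):

    #include <arm_neon.h>
    #include <cmath>

    // Maximum over a float buffer: lane-wise vmaxq in the loop, one
    // across-vector vmaxvq reduction at the end (AArch64 only).
    float max_of(const float *data, int n)
    {
        float32x4_t acc = vdupq_n_f32(-INFINITY);
        for (int i = 0; i < n; i += 4)
        {
            acc = vmaxq_f32(acc, vld1q_f32(data + i));
        }
        return vmaxvq_f32(acc);
    }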
diff --git a/src/core/NEON/wrapper/intrinsics/pow.h b/src/core/NEON/wrapper/intrinsics/pow.h
index 61f834ed23..dfd6ccc358 100644
--- a/src/core/NEON/wrapper/intrinsics/pow.h
+++ b/src/core/NEON/wrapper/intrinsics/pow.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_POW_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/qmov.h b/src/core/NEON/wrapper/intrinsics/qmov.h
index 167f3cf43b..9a0a23a241 100644
--- a/src/core/NEON/wrapper/intrinsics/qmov.h
+++ b/src/core/NEON/wrapper/intrinsics/qmov.h
@@ -31,15 +31,13 @@ namespace arm_compute
namespace wrapper
{
template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type
-vqmov(const int16x8_t &a)
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type vqmov(const int16x8_t &a)
{
return vqmovun_s16(a);
}
template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type
-vqmov(const int16x8_t &a)
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type vqmov(const int16x8_t &a)
{
return vqmovn_s16(a);
}
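Both vqmov overloads narrow a signed 16-bit vector with saturation; the enable_if on the template parameter merely selects the destination element type, so wrapper::vqmov<uint8_t>(v) and wrapper::vqmov<int8_t>(v) dispatch to the two intrinsics shown. In raw NEON the pair behaves like this:

    #include <arm_neon.h>
    #include <cstdint>

    // Saturating narrows from int16 lanes: the unsigned form clamps to
    // [0, 255], the signed form to [-128, 127].
    void narrow_demo(const int16x8_t v, uint8_t *u_out, int8_t *s_out)
    {
        vst1_u8(u_out, vqmovun_s16(v));
        vst1_s8(s_out, vqmovn_s16(v));
    }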
diff --git a/src/core/NEON/wrapper/intrinsics/reinterpret.h b/src/core/NEON/wrapper/intrinsics/reinterpret.h
index cf00a4aceb..c2c4f720d2 100644
--- a/src/core/NEON/wrapper/intrinsics/reinterpret.h
+++ b/src/core/NEON/wrapper/intrinsics/reinterpret.h
@@ -35,7 +35,7 @@ namespace wrapper
{ \
return prefix##_##postfix1##_##postfix2(a); \
} \
- \
+ \
inline ptype vreinterpret(const ptype &a) \
{ \
return a; \
diff --git a/src/core/NEON/wrapper/intrinsics/round.h b/src/core/NEON/wrapper/intrinsics/round.h
index d23feb6b42..7789aab770 100644
--- a/src/core/NEON/wrapper/intrinsics/round.h
+++ b/src/core/NEON/wrapper/intrinsics/round.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_ROUND_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/setlane.h b/src/core/NEON/wrapper/intrinsics/setlane.h
index 197eedacb5..259b8eaf90 100644
--- a/src/core/NEON/wrapper/intrinsics/setlane.h
+++ b/src/core/NEON/wrapper/intrinsics/setlane.h
@@ -33,7 +33,7 @@ namespace wrapper
#define VSETLANE_IMPL_8(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vset_lane_##postfix(value, vector, 0); \
@@ -59,7 +59,7 @@ namespace wrapper
#define VSETLANE_IMPL_4(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vset_lane_##postfix(value, vector, 0); \
@@ -77,7 +77,7 @@ namespace wrapper
#define VSETLANE_IMPL_2(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vset_lane_##postfix(value, vector, 0); \
@@ -102,7 +102,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16)
#define VSETQLANE_IMPL_16(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vsetq_lane_##postfix(value, vector, 0); \
@@ -144,7 +144,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16)
#define VSETQLANE_IMPL_8(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vsetq_lane_##postfix(value, vector, 0); \
@@ -170,7 +170,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16)
#define VSETQLANE_IMPL_4(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vsetq_lane_##postfix(value, vector, 0); \
diff --git a/src/core/NEON/wrapper/intrinsics/shr.h b/src/core/NEON/wrapper/intrinsics/shr.h
new file mode 100644
index 0000000000..6ccb9cdf92
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/shr.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_WRAPPER_SHR_H
+#define ARM_COMPUTE_WRAPPER_SHR_H
+
+#include <arm_neon.h>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VQRSHRN_IMPL(half_vtype, vtype, prefix, postfix) \
+ template <int b> \
+ inline half_vtype vqrshrn(const vtype &a) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+VQRSHRN_IMPL(int8x8_t, int16x8_t, vqrshrn_n, s16)
+VQRSHRN_IMPL(uint8x8_t, uint16x8_t, vqrshrn_n, u16)
+VQRSHRN_IMPL(int16x4_t, int32x4_t, vqrshrn_n, s32)
+VQRSHRN_IMPL(uint16x4_t, uint32x4_t, vqrshrn_n, u32)
+VQRSHRN_IMPL(int32x2_t, int64x2_t, vqrshrn_n, s64)
+VQRSHRN_IMPL(uint32x2_t, uint64x2_t, vqrshrn_n, u64)
+
+#undef VQRSHRN_IMPL
+
+#ifdef __aarch64__
+#define VQRSHRN_SCALAR_IMPL(half_vtype, vtype, prefix, postfix) \
+ template <int b> \
+ inline half_vtype vqrshrn(const vtype &a) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+
+VQRSHRN_SCALAR_IMPL(int8_t, int16_t, vqrshrnh_n, s16)
+VQRSHRN_SCALAR_IMPL(uint8_t, uint16_t, vqrshrnh_n, u16)
+VQRSHRN_SCALAR_IMPL(int16_t, int32_t, vqrshrns_n, s32)
+VQRSHRN_SCALAR_IMPL(uint16_t, uint32_t, vqrshrns_n, u32)
+VQRSHRN_SCALAR_IMPL(int32_t, int64_t, vqrshrnd_n, s64)
+VQRSHRN_SCALAR_IMPL(uint32_t, uint64_t, vqrshrnd_n, u64)
+
+#undef VQRSHRN_SCALAR_IMPL
+#endif // __aarch64__
+
+// This function is the mixed version of VQRSHRN and VQRSHRUN.
+// The input vector is always a signed integer vector, while the returned vector
+// can be either signed or unsigned depending on the signedness of the scalar type T.
+#define VQRSHRN_EX_IMPL(half_vtype, vtype, prefix_signed, prefix_unsigned, postfix) \
+ template <int b, typename T> \
+ inline typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, half_vtype>::type \
+ vqrshrn_ex(const vtype &a) \
+ { \
+ return prefix_signed##_##postfix(a, b); \
+ } \
+ \
+ template <int b, typename T> \
+ inline typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, u##half_vtype>::type \
+ vqrshrn_ex(const vtype &a) \
+ { \
+ return prefix_unsigned##_##postfix(a, b); \
+ }
+VQRSHRN_EX_IMPL(int8x8_t, int16x8_t, vqrshrn_n, vqrshrun_n, s16)
+VQRSHRN_EX_IMPL(int16x4_t, int32x4_t, vqrshrn_n, vqrshrun_n, s32)
+VQRSHRN_EX_IMPL(int32x2_t, int64x2_t, vqrshrn_n, vqrshrun_n, s64)
+#undef VQRSHRN_EX_IMPL
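+// Illustrative dispatch (editorial example, not part of the API surface):
+// vqrshrn_ex<8, int8_t> on an int16x8_t resolves to vqrshrn_n_s16(a, 8) and
+// yields int8x8_t, while vqrshrn_ex<8, uint8_t> resolves to vqrshrun_n_s16(a, 8)
+// and yields uint8x8_t.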
+
+#define VSHR_IMPL(vtype, prefix, postfix) \
+ template <int b> \
+ inline vtype vshr_n(const vtype &a) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+VSHR_IMPL(uint8x8_t, vshr_n, u8)
+VSHR_IMPL(int8x8_t, vshr_n, s8)
+#undef VSHR_IMPL
+
+#define VSHRQ_IMPL(vtype, prefix, postfix) \
+ template <int b> \
+ inline vtype vshrq_n(const vtype &a) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+VSHRQ_IMPL(uint32x4_t, vshrq_n, u32)
+VSHRQ_IMPL(int32x4_t, vshrq_n, s32)
+#undef VSHRQ_IMPL
+
+#ifdef __aarch64__
+#define VSHRQ_SCALAR_IMPL(vtype, prefix, postfix) \
+ template <int b> \
+ inline vtype vshrq_n(const vtype &a) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+VSHRQ_SCALAR_IMPL(uint32_t, vshrd_n, u64)
+VSHRQ_SCALAR_IMPL(int32_t, vshrd_n, s64)
+
+#undef VSHRQ_SCALAR_IMPL
+#endif // __aarch64__
+
+#ifdef __aarch64__
+#define VQRSHRN_EX_SCALAR_IMPL(half_vtype, vtype, prefix_signed, prefix_unsigned, postfix) \
+ template <int b, typename T> \
+ inline typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, half_vtype>::type \
+ vqrshrn_ex(const vtype &a) \
+ { \
+ return prefix_signed##_##postfix(a, b); \
+ } \
+ \
+ template <int b, typename T> \
+ inline typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, u##half_vtype>::type \
+ vqrshrn_ex(const vtype &a) \
+ { \
+ return prefix_unsigned##_##postfix(a, b); \
+ }
+
+VQRSHRN_EX_SCALAR_IMPL(int8_t, int16_t, vqrshrnh_n, vqrshrunh_n, s16)
+VQRSHRN_EX_SCALAR_IMPL(int16_t, int32_t, vqrshrns_n, vqrshruns_n, s32)
+VQRSHRN_EX_SCALAR_IMPL(int32_t, int64_t, vqrshrnd_n, vqrshrund_n, s64)
+
+#undef VQRSHRN_EX_SCALAR_IMPL
+#endif // __aarch64__
+
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_SHR_H */
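Taken together, the new header lets requantization code narrow accumulators without naming the type-specific intrinsic. A hedged usage sketch (function and buffer names are illustrative, not from the patch):

    #include "src/core/NEON/wrapper/intrinsics/shr.h"

    #include <arm_neon.h>

    // Narrow eight int16 accumulators to uint8 with a rounding shift of 4;
    // the unsigned result type selects the vqrshrun_n_s16 path.
    inline void requantize_block(const int16_t *acc, uint8_t *dst)
    {
        const int16x8_t a = vld1q_s16(acc);
        const uint8x8_t q = arm_compute::wrapper::vqrshrn_ex<4, uint8_t>(a);
        vst1_u8(dst, q);
    }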
diff --git a/src/core/NEON/wrapper/intrinsics/sin.h b/src/core/NEON/wrapper/intrinsics/sin.h
index 03c2813a32..d24fdfa816 100644
--- a/src/core/NEON/wrapper/intrinsics/sin.h
+++ b/src/core/NEON/wrapper/intrinsics/sin.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_SIN_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -54,4 +55,4 @@ VSIN_IMPL_INT(int32x4_t, vsinq, s32)
#undef vsub_IMPL
} // namespace wrapper
} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SUB_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_WRAPPER_SIN_H */
diff --git a/src/core/NEON/wrapper/intrinsics/store.h b/src/core/NEON/wrapper/intrinsics/store.h
index 6dda432ea9..ce1b9a554e 100644
--- a/src/core/NEON/wrapper/intrinsics/store.h
+++ b/src/core/NEON/wrapper/intrinsics/store.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,8 +44,6 @@ VSTORE_IMPL(uint16_t, uint16x4_t, vst1, u16)
VSTORE_IMPL(int16_t, int16x4_t, vst1, s16)
VSTORE_IMPL(uint32_t, uint32x2_t, vst1, u32)
VSTORE_IMPL(int32_t, int32x2_t, vst1, s32)
-//VSTORE_IMPL(uint64_t, 1, vst1, u64)
-//VSTORE_IMPL(int64_t, 1, vst1, s64)
VSTORE_IMPL(float, float32x2_t, vst1, f32)
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
VSTORE_IMPL(float16_t, float16x4_t, vst1, f16)
@@ -57,8 +55,6 @@ VSTORE_IMPL(uint16_t, uint16x8_t, vst1q, u16)
VSTORE_IMPL(int16_t, int16x8_t, vst1q, s16)
VSTORE_IMPL(uint32_t, uint32x4_t, vst1q, u32)
VSTORE_IMPL(int32_t, int32x4_t, vst1q, s32)
-//VSTORE_IMPL(uint64_t, 2, vst1q, u64)
-//VSTORE_IMPL(int64_t, 2, vst1q, s64)
VSTORE_IMPL(float, float32x4_t, vst1q, f32)
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
VSTORE_IMPL(float16_t, float16x8_t, vst1q, f16)
diff --git a/src/core/NEON/wrapper/intrinsics/sub.h b/src/core/NEON/wrapper/intrinsics/sub.h
index 475986d0f6..20436714ef 100644
--- a/src/core/NEON/wrapper/intrinsics/sub.h
+++ b/src/core/NEON/wrapper/intrinsics/sub.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -98,6 +98,21 @@ VQSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#undef VQSUB_IMPL
+#define VSUBL_IMPL(rtype, vtype, prefix, postfix) \
+ inline rtype vsubl(const vtype &a, const vtype &b) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+
+VSUBL_IMPL(int16x8_t, int8x8_t, vsubl, s8)
+VSUBL_IMPL(int32x4_t, int16x4_t, vsubl, s16)
+VSUBL_IMPL(int64x2_t, int32x2_t, vsubl, s32)
+VSUBL_IMPL(uint16x8_t, uint8x8_t, vsubl, u8)
+VSUBL_IMPL(uint32x4_t, uint16x4_t, vsubl, u16)
+VSUBL_IMPL(uint64x2_t, uint32x2_t, vsubl, u32)
+
+#undef VSUBL_IMPL
+
} // namespace wrapper
} // namespace arm_compute
#endif /* ARM_COMPUTE_WRAPPER_SUB_H */
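The added vsubl overloads perform a widening subtraction, so differences of narrow vectors cannot wrap. A minimal sketch (assumed usage):

    #include "src/core/NEON/wrapper/intrinsics/sub.h"

    #include <arm_neon.h>

    // Subtract packed s8 values into an s16 result; maps to vsubl_s8.
    inline int16x8_t widened_diff(int8x8_t a, int8x8_t b)
    {
        return arm_compute::wrapper::vsubl(a, b);
    }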
diff --git a/src/core/NEON/wrapper/intrinsics/svcnt.h b/src/core/NEON/wrapper/intrinsics/svcnt.h
index e530e7c83f..c4652504b4 100644
--- a/src/core/NEON/wrapper/intrinsics/svcnt.h
+++ b/src/core/NEON/wrapper/intrinsics/svcnt.h
@@ -30,7 +30,7 @@ namespace arm_compute
namespace wrapper
{
template <size_t element_size>
-inline uint64_t svcnt_size();
+inline uint64_t svcnt_size();
template <>
inline uint64_t svcnt_size<64>()
@@ -65,4 +65,4 @@ inline uint64_t svcnt()
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svcvt.h b/src/core/NEON/wrapper/intrinsics/svcvt.h
index 746b004d7d..00ef7b7eb3 100644
--- a/src/core/NEON/wrapper/intrinsics/svcvt.h
+++ b/src/core/NEON/wrapper/intrinsics/svcvt.h
@@ -29,11 +29,12 @@ namespace arm_compute
{
namespace wrapper
{
-#define SVCVT_Z_TO_F32_IMPL(vtype) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, float>::value, svfloat32_t>::type svcvt_z(svbool_t pg, const vtype &a) \
- { \
- return svcvt_f32_z(pg, a); \
+#define SVCVT_Z_TO_F32_IMPL(vtype) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, float>::value, svfloat32_t>::type svcvt_z(svbool_t pg, \
+ const vtype &a) \
+ { \
+ return svcvt_f32_z(pg, a); \
}
SVCVT_Z_TO_F32_IMPL(svuint32_t)
@@ -42,11 +43,12 @@ SVCVT_Z_TO_F32_IMPL(svfloat16_t)
#undef SVCVT_Z_TO_F32_IMPL
-#define SVCVT_Z_TO_F16_IMPL(vtype) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, float16_t>::value, svfloat16_t>::type svcvt_z(svbool_t pg, const vtype &a) \
- { \
- return svcvt_f16_z(pg, a); \
+#define SVCVT_Z_TO_F16_IMPL(vtype) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, float16_t>::value, svfloat16_t>::type svcvt_z(svbool_t pg, \
+ const vtype &a) \
+ { \
+ return svcvt_f16_z(pg, a); \
}
SVCVT_Z_TO_F16_IMPL(svuint32_t)
@@ -55,11 +57,12 @@ SVCVT_Z_TO_F16_IMPL(svfloat32_t)
#undef SVCVT_Z_TO_F16_IMPL
-#define SVCVT_Z_TO_S32_IMPL(vtype) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, int32_t>::value, svint32_t>::type svcvt_z(svbool_t pg, const vtype &a) \
- { \
- return svcvt_s32_z(pg, a); \
+#define SVCVT_Z_TO_S32_IMPL(vtype) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, int32_t>::value, svint32_t>::type svcvt_z(svbool_t pg, \
+ const vtype &a) \
+ { \
+ return svcvt_s32_z(pg, a); \
}
SVCVT_Z_TO_S32_IMPL(svfloat16_t)
@@ -71,4 +74,4 @@ SVCVT_Z_TO_S32_IMPL(svfloat32_t)
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svdup_n.h b/src/core/NEON/wrapper/intrinsics/svdup_n.h
index b1aed97d9c..9c42c86db7 100644
--- a/src/core/NEON/wrapper/intrinsics/svdup_n.h
+++ b/src/core/NEON/wrapper/intrinsics/svdup_n.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,9 @@ SVDUP_N_IMPL(uint64_t, svuint64_t, u64)
SVDUP_N_IMPL(float16_t, svfloat16_t, f16)
SVDUP_N_IMPL(float, svfloat32_t, f32)
SVDUP_N_IMPL(float64_t, svfloat64_t, f64)
+#if __ARM_FEATURE_SVE_BF16
SVDUP_N_IMPL(bfloat16_t, svbfloat16_t, bf16)
+#endif // #if __ARM_FEATURE_SVE_BF16
#undef SVDUP_N_IMPL
@@ -54,4 +56,4 @@ SVDUP_N_IMPL(bfloat16_t, svbfloat16_t, bf16)
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVDUP_N_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVDUP_N_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svexp.h b/src/core/NEON/wrapper/intrinsics/svexp.h
index d6ce9a77d1..1e8bce3960 100644
--- a/src/core/NEON/wrapper/intrinsics/svexp.h
+++ b/src/core/NEON/wrapper/intrinsics/svexp.h
@@ -26,6 +26,7 @@
#if defined(__ARM_FEATURE_SVE)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -46,4 +47,4 @@ SVEXP_IMPL(svfloat16_t, f16)
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svlog.h b/src/core/NEON/wrapper/intrinsics/svlog.h
index 5b505ae1e3..b4630e20ed 100644
--- a/src/core/NEON/wrapper/intrinsics/svlog.h
+++ b/src/core/NEON/wrapper/intrinsics/svlog.h
@@ -25,6 +25,7 @@
#define SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H
#if defined(__ARM_FEATURE_SVE)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -44,4 +45,4 @@ SVLOG_IMPL(svfloat16_t, f16)
} // namespace wrapper
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svptrue.h b/src/core/NEON/wrapper/intrinsics/svptrue.h
index 53407e5301..6ed00bccbf 100644
--- a/src/core/NEON/wrapper/intrinsics/svptrue.h
+++ b/src/core/NEON/wrapper/intrinsics/svptrue.h
@@ -30,7 +30,7 @@ namespace arm_compute
namespace wrapper
{
template <size_t element_size>
-inline svbool_t svptrue_size();
+inline svbool_t svptrue_size();
template <>
inline svbool_t svptrue_size<64>()
@@ -65,4 +65,4 @@ svbool_t svptrue()
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svwhilelt.h b/src/core/NEON/wrapper/intrinsics/svwhilelt.h
index ef58217dc4..f0f84a9508 100644
--- a/src/core/NEON/wrapper/intrinsics/svwhilelt.h
+++ b/src/core/NEON/wrapper/intrinsics/svwhilelt.h
@@ -32,7 +32,7 @@ namespace wrapper
#define SVWHILELT_IMPL(type) \
template <size_t element_size> \
inline svbool_t svwhilelt_size(type a, type b); \
- \
+ \
template <> \
inline svbool_t svwhilelt_size<64>(type a, type b) \
{ \
@@ -70,4 +70,4 @@ inline svbool_t svwhilelt(IndexType a, IndexType b)
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/tanh.h b/src/core/NEON/wrapper/intrinsics/tanh.h
index daeaf19997..e74f0e86fe 100644
--- a/src/core/NEON/wrapper/intrinsics/tanh.h
+++ b/src/core/NEON/wrapper/intrinsics/tanh.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_TANH_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/scalar/add.h b/src/core/NEON/wrapper/scalar/add.h
index 642d9261f3..2ec88869e3 100644
--- a/src/core/NEON/wrapper/scalar/add.h
+++ b/src/core/NEON/wrapper/scalar/add.h
@@ -32,22 +32,22 @@ namespace wrapper
{
inline uint8_t add_sat(const uint8_t &a, const uint8_t &b)
{
- const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
- const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
+ const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0};
+ const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0};
return vget_lane_u8(vqadd_u8(va, vb), 0);
}
inline int16_t add_sat(const int16_t &a, const int16_t &b)
{
- const int16x4_t va = { a, 0, 0, 0 };
- const int16x4_t vb = { b, 0, 0, 0 };
+ const int16x4_t va = {a, 0, 0, 0};
+ const int16x4_t vb = {b, 0, 0, 0};
return vget_lane_s16(vqadd_s16(va, vb), 0);
}
inline int32_t add_sat(const int32_t &a, const int32_t &b)
{
- const int32x2_t va = { a, 0 };
- const int32x2_t vb = { b, 0 };
+ const int32x2_t va = {a, 0};
+ const int32x2_t vb = {b, 0};
return vget_lane_s32(vqadd_s32(va, vb), 0);
}
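These scalar helpers reuse the vector saturating instructions by parking each operand in lane 0, saturating there, and reading the lane back. For example (illustrative values):

    #include "src/core/NEON/wrapper/scalar/add.h"

    // 30000 + 10000 exceeds INT16_MAX, so the result clamps to 32767.
    const int16_t r = arm_compute::wrapper::add_sat(int16_t{30000}, int16_t{10000});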
diff --git a/src/core/NEON/wrapper/scalar/sub.h b/src/core/NEON/wrapper/scalar/sub.h
index 1fe51d75fc..00de7d867f 100644
--- a/src/core/NEON/wrapper/scalar/sub.h
+++ b/src/core/NEON/wrapper/scalar/sub.h
@@ -32,22 +32,22 @@ namespace wrapper
{
inline uint8_t sub_sat(const uint8_t &a, const uint8_t &b)
{
- const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
- const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
+ const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0};
+ const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0};
return vget_lane_u8(vqsub_u8(va, vb), 0);
}
inline int16_t sub_sat(const int16_t &a, const int16_t &b)
{
- const int16x4_t va = { a, 0, 0, 0 };
- const int16x4_t vb = { b, 0, 0, 0 };
+ const int16x4_t va = {a, 0, 0, 0};
+ const int16x4_t vb = {b, 0, 0, 0};
return vget_lane_s16(vqsub_s16(va, vb), 0);
}
inline int32_t sub_sat(const int32_t &a, const int32_t &b)
{
- const int32x2_t va = { a, 0 };
- const int32x2_t vb = { b, 0 };
+ const int32x2_t va = {a, 0};
+ const int32x2_t vb = {b, 0};
return vget_lane_s32(vqsub_s32(va, vb), 0);
}
diff --git a/src/core/NEON/wrapper/svtraits.h b/src/core/NEON/wrapper/svtraits.h
index 8d2d660659..330d272752 100644
--- a/src/core/NEON/wrapper/svtraits.h
+++ b/src/core/NEON/wrapper/svtraits.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#ifndef SRC_CORE_NEON_WRAPPER_SVTRAITS_H
#define SRC_CORE_NEON_WRAPPER_SVTRAITS_H
-#if defined(ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -59,12 +60,15 @@ DEFINE_TYPES(uint64_t)
DEFINE_TYPES(float16_t)
DEFINE_TYPES(float32_t)
DEFINE_TYPES(float64_t)
+
+#if __ARM_FEATURE_SVE_BF16
DEFINE_TYPES(bfloat16_t)
+#endif // #if __ARM_FEATURE_SVE_BF16
#undef DEFINE_TYPES
} // namespace wrapper
} // namespace arm_compute
-#endif /* defined(ENABLE_SVE) */
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#endif /* #ifndef SRC_CORE_NEON_WRAPPER_SVTRAITS_H */
diff --git a/src/core/NEON/wrapper/traits.h b/src/core/NEON/wrapper/traits.h
index 81685140f1..1dac61af74 100644
--- a/src/core/NEON/wrapper/traits.h
+++ b/src/core/NEON/wrapper/traits.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,14 +21,23 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_WRAPPER_TRAITS_H
-#define ARM_COMPUTE_WRAPPER_TRAITS_H
+#ifndef ACL_SRC_CORE_NEON_WRAPPER_TRAITS_H
+#define ACL_SRC_CORE_NEON_WRAPPER_TRAITS_H
+
+#include "arm_compute/core/CoreTypes.h"
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#include "src/cpu/CpuTypes.h" // required for float16_t
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#include <arm_neon.h>
-#if defined(ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
-#endif /* defined(ENABLE_SVE) */
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FEATURE_SVE) */
+
+#include <cmath>
+#include <cstdint>
namespace arm_compute
{
@@ -116,13 +125,13 @@ template <> struct neon_bitvector<float16_t, BitWidth::W128>{ using type = float
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#if defined(ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FEATURE_SVE)
/** Create the appropriate SVE vector given its type */
template <typename T> struct sve_vector;
template <> struct sve_vector<uint8_t>{ using scalar_type = uint8_t; using type = svuint8_t; };
template <> struct sve_vector<int8_t>{ using scalar_type = int8_t; using type = svint8_t; };
-#endif /* defined(ENABLE_SVE) */
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FEATURE_SVE) */
#endif /* DOXYGEN_SKIP_THIS */
@@ -151,4 +160,4 @@ using promote_t = typename promote<T>::type;
} // namespace traits
} // namespace wrapper
} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_TRAITS_H */
+#endif // ACL_SRC_CORE_NEON_WRAPPER_TRAITS_H
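The traits map a scalar type plus a register width onto the matching NEON vector type, so kernels can pick their vector type generically. A sketch using the names visible in the hunk above (assumed usage):

    #include "src/core/NEON/wrapper/traits.h"

    namespace wt = arm_compute::wrapper::traits;

    // Selects float32x4_t: a 128-bit NEON register holding floats.
    using f32x4 = wt::neon_bitvector<float, wt::BitWidth::W128>::type;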
diff --git a/src/core/NEON/wrapper/wrapper.h b/src/core/NEON/wrapper/wrapper.h
index e5467e98ff..f3f3c5d9e6 100644
--- a/src/core/NEON/wrapper/wrapper.h
+++ b/src/core/NEON/wrapper/wrapper.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_WRAPPER_H
-#define ARM_COMPUTE_WRAPPER_H
+#ifndef ACL_SRC_CORE_NEON_WRAPPER_WRAPPER_H
+#define ACL_SRC_CORE_NEON_WRAPPER_WRAPPER_H
+
+#include "arm_compute/core/Error.h"
// Traits
#include "src/core/NEON/wrapper/traits.h"
@@ -31,4 +33,4 @@
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/core/NEON/wrapper/scalar/scalar.h"
-#endif /* ARM_COMPUTE_WRAPPER_H */
+#endif // ACL_SRC_CORE_NEON_WRAPPER_WRAPPER_H
diff --git a/src/core/Rounding.cpp b/src/core/Rounding.cpp
index 99858e2a98..62ce335815 100644
--- a/src/core/Rounding.cpp
+++ b/src/core/Rounding.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Rounding.h"
#include "arm_compute/core/Error.h"
+
#include "support/ToolchainSupport.h"
#include <cmath>
@@ -36,7 +37,7 @@ int arm_compute::round(float x, RoundingPolicy rounding_policy)
{
using namespace std;
int rounded = 0;
- switch(rounding_policy)
+ switch (rounding_policy)
{
case RoundingPolicy::TO_ZERO:
{
@@ -51,9 +52,7 @@ int arm_compute::round(float x, RoundingPolicy rounding_policy)
case RoundingPolicy::TO_NEAREST_EVEN:
{
#ifdef __aarch64__
- asm("fcvtns %x[res], %s[value]"
- : [res] "=r"(rounded)
- : [value] "w"(x));
+ asm("fcvtns %x[res], %s[value]" : [res] "=r"(rounded) : [value] "w"(x));
#else // __aarch64__
ARM_COMPUTE_ERROR("TO_NEAREST_EVEN rounding policy is not supported.");
#endif // __aarch64__
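For reference, fcvtns rounds to nearest with ties to even, so halfway cases alternate instead of always rounding up (illustrative; AArch64 only, other targets raise an error):

    #include "arm_compute/core/Rounding.h"

    const int two  = arm_compute::round(2.5f, arm_compute::RoundingPolicy::TO_NEAREST_EVEN); // 2
    const int four = arm_compute::round(3.5f, arm_compute::RoundingPolicy::TO_NEAREST_EVEN); // 4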
diff --git a/src/core/Size2D.cpp b/src/core/Size2D.cpp
index 6eb46e56af..69b2651520 100644
--- a/src/core/Size2D.cpp
+++ b/src/core/Size2D.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Size2D.h"
+
#include "support/StringSupport.h"
namespace arm_compute
@@ -30,4 +31,4 @@ std::string Size2D::to_string() const
{
return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height);
}
-}
+} // namespace arm_compute
diff --git a/src/core/Size3D.cpp b/src/core/Size3D.cpp
new file mode 100644
index 0000000000..b56a99acd7
--- /dev/null
+++ b/src/core/Size3D.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Size3D.h"
+
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+std::string Size3D::to_string() const
+{
+ return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height) + std::string("x") +
+ support::cpp11::to_string(depth);
+}
+} // namespace arm_compute
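The new to_string() mirrors Size2D, joining the three extents with "x". Illustrative usage (the constructor signature is assumed):

    #include "arm_compute/core/Size3D.h"

    const arm_compute::Size3D size(4, 3, 2);
    const std::string s = size.to_string(); // "4x3x2"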
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
index 3d68331181..8012c3d721 100644
--- a/src/core/SubTensorInfo.cpp
+++ b/src/core/SubTensorInfo.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,10 +42,10 @@ namespace
TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coordinates coords)
{
// Extend shape
- for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+ for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
{
int dimension_extend = coords[i] + static_cast<int>(shape[i]);
- if((dimension_extend > static_cast<int>(parent_shape[i])) && (dimension_extend > 0))
+ if ((dimension_extend > static_cast<int>(parent_shape[i])) && (dimension_extend > 0))
{
parent_shape.set(i, static_cast<size_t>(dimension_extend));
}
@@ -56,23 +56,35 @@ TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coo
} // namespace
SubTensorInfo::SubTensorInfo()
- : _parent(nullptr), _tensor_shape(), _dims_state(), _coords(), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(false)
+ : _parent(nullptr),
+ _tensor_shape(),
+ _dims_state(),
+ _coords(),
+ _valid_region{Coordinates(), _tensor_shape},
+ _extend_parent(false),
+ _lock_paddings(false)
{
}
SubTensorInfo::SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent)
- : _parent(parent), _tensor_shape(tensor_shape), _dims_state(), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(extend_parent)
+ : _parent(parent),
+ _tensor_shape(tensor_shape),
+ _dims_state(),
+ _coords(coords),
+ _valid_region{Coordinates(), _tensor_shape},
+ _extend_parent(extend_parent),
+ _lock_paddings(false)
{
ARM_COMPUTE_ERROR_ON(parent == nullptr);
// Check if subtensor is valid if parent is configured
- if(parent->tensor_shape().total_size() != 0 && !_extend_parent)
+ if (parent->tensor_shape().total_size() != 0 && !_extend_parent)
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape);
}
// Initialize valid region
- _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+ _valid_region = ValidRegion{Coordinates(), _tensor_shape};
}
std::unique_ptr<ITensorInfo> SubTensorInfo::clone() const
@@ -91,17 +103,17 @@ ITensorInfo &SubTensorInfo::set_tensor_shape(const TensorShape &shape)
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
// Check if subtensor is valid if parent is configured
- if(_parent->tensor_shape().total_size() != 0 && !_extend_parent)
+ if (_parent->tensor_shape().total_size() != 0 && !_extend_parent)
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape);
- _valid_region = ValidRegion{ _coords, shape };
+ _valid_region = ValidRegion{_coords, shape};
}
- else if(_extend_parent) // Extend parent shape, configure if specified
+ else if (_extend_parent) // Extend parent shape, configure if specified
{
ARM_COMPUTE_ERROR_ON((_parent->data_type() == DataType::UNKNOWN) && (_parent->format() == Format::UNKNOWN));
TensorShape parent_extended_shape = extend_parent_shape(_parent->tensor_shape(), shape, _coords);
_parent->set_tensor_shape(parent_extended_shape);
- _parent->set_valid_region(ValidRegion{ Coordinates(), parent_extended_shape });
+ _parent->set_valid_region(ValidRegion{Coordinates(), parent_extended_shape});
}
_tensor_shape = shape;
return *this;
@@ -114,18 +126,30 @@ ITensorInfo &SubTensorInfo::set_tensor_dims_state(const TensorDimsState &state)
return *this;
}
+ITensorInfo &SubTensorInfo::set_lock_paddings(bool flag)
+{
+ _lock_paddings = flag;
+ return *this;
+}
+
+bool SubTensorInfo::lock_paddings() const
+{
+ return _lock_paddings;
+}
+
bool SubTensorInfo::extend_padding(const PaddingSize &padding)
{
+ ARM_COMPUTE_ERROR_ON(_lock_paddings);
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
ARM_COMPUTE_ERROR_ON(!_parent->is_resizable());
ARM_COMPUTE_ERROR_ON(_parent->total_size() == 0);
// Check that you do not extend padding on sub-tensors unless XY shape matches parent tensor
- if(!_extend_parent && (padding.left || padding.right))
+ if (!_extend_parent && (padding.left || padding.right))
{
ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().x() != tensor_shape().x());
}
- if(!_extend_parent && (padding.top || padding.bottom))
+ if (!_extend_parent && (padding.top || padding.bottom))
{
ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().y() != tensor_shape().y());
}
@@ -141,7 +165,7 @@ int32_t SubTensorInfo::offset_element_in_bytes(const Coordinates &pos) const
int32_t offset = offset_first_element_in_bytes();
const Strides &strides = strides_in_bytes();
- for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+ for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
{
offset += pos[i] * strides[i];
}
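With the new flag a sub-tensor can veto later padding changes: extend_padding() now asserts when the lock is set. A hedged sketch (the function name is illustrative):

    #include "arm_compute/core/SubTensorInfo.h"

    inline void freeze_padding(arm_compute::SubTensorInfo &info)
    {
        info.set_lock_paddings(true);
        // Any subsequent info.extend_padding(...) now trips
        // ARM_COMPUTE_ERROR_ON(_lock_paddings).
    }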
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index c471615ee8..31bddbde40 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,20 +27,34 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/Utils.h"
#include <memory>
-using namespace arm_compute;
-
+namespace arm_compute
+{
TensorInfo::TensorInfo()
- : _total_size(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _dims_state(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN), _is_resizable{ true },
- _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW)
+ : _total_size(0),
+ _offset_first_element_in_bytes(0),
+ _strides_in_bytes(),
+ _num_channels(0),
+ _tensor_shape(),
+ _dims_state(),
+ _data_type(DataType::UNKNOWN),
+ _format(Format::UNKNOWN),
+ _is_resizable{true},
+ _valid_region{Coordinates(), _tensor_shape},
+ _padding{0},
+ _quantization_info(),
+ _data_layout(DataLayout::NCHW),
+ _are_values_constant(true),
+ _id(invalid_tensor_id),
+ _lock_paddings(false)
{
}
-TensorInfo::TensorInfo(const ITensorInfo &info)
- : TensorInfo()
+TensorInfo::TensorInfo(const ITensorInfo &info) : TensorInfo()
{
_total_size = info.total_size();
_offset_first_element_in_bytes = info.offset_first_element_in_bytes();
@@ -55,10 +69,31 @@ TensorInfo::TensorInfo(const ITensorInfo &info)
_padding = info.padding();
_quantization_info = info.quantization_info();
_data_layout = info.data_layout();
+ _are_values_constant = info.are_values_constant();
+ _id = info.id();
+ _lock_paddings = info.lock_paddings();
}
-TensorInfo::TensorInfo(Format format)
- : TensorInfo(TensorShape(), format)
+TensorInfo::TensorInfo(const TensorInfo &info) : TensorInfo()
+{
+ _total_size = info.total_size();
+ _offset_first_element_in_bytes = info.offset_first_element_in_bytes();
+ _strides_in_bytes = info.strides_in_bytes();
+ _num_channels = info.num_channels();
+ _tensor_shape = info.tensor_shape();
+ _dims_state = info.tensor_dims_state();
+ _data_type = info.data_type();
+ _format = info.format();
+ _is_resizable = info.is_resizable();
+ _valid_region = info.valid_region();
+ _padding = info.padding();
+ _quantization_info = info.quantization_info();
+ _data_layout = info.data_layout();
+ _are_values_constant = info.are_values_constant();
+ _id = info.id();
+ _lock_paddings = false;
+}
+TensorInfo::TensorInfo(Format format) : TensorInfo(TensorShape(), format)
{
}
@@ -67,25 +102,25 @@ TensorInfo::TensorInfo(unsigned int width, unsigned int height, Format format)
{
}
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format)
- : TensorInfo()
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format) : TensorInfo()
{
init(tensor_shape, format);
}
-TensorInfo::TensorInfo(size_t num_channels, DataType data_type)
- : TensorInfo()
+TensorInfo::TensorInfo(size_t num_channels, DataType data_type) : TensorInfo()
{
init(TensorShape(), num_channels, data_type);
}
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type)
- : TensorInfo()
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type) : TensorInfo()
{
init(tensor_shape, num_channels, data_type);
}
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info)
+TensorInfo::TensorInfo(const TensorShape &tensor_shape,
+ size_t num_channels,
+ DataType data_type,
+ QuantizationInfo quantization_info)
: TensorInfo()
{
init(tensor_shape, num_channels, data_type);
@@ -114,9 +149,11 @@ void TensorInfo::init(const TensorShape &tensor_shape, Format format)
_format = format;
}
-void TensorInfo::init(const TensorShape &tensor_shape, Format format,
- const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
- size_t total_size_in_bytes)
+void TensorInfo::init(const TensorShape &tensor_shape,
+ Format format,
+ const Strides &strides_in_bytes,
+ size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes)
{
size_t num_channels = num_channels_from_format(format);
const DataType type = data_type_from_format(format);
@@ -142,9 +179,12 @@ void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, Data
set_tensor_shape(tensor_shape);
}
-void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type,
- const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
- size_t total_size_in_bytes)
+void TensorInfo::init(const TensorShape &tensor_shape,
+ size_t num_channels,
+ DataType data_type,
+ const Strides &strides_in_bytes,
+ size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes)
{
ARM_COMPUTE_ERROR_ON(num_channels == 0);
@@ -156,7 +196,7 @@ void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, Data
_strides_in_bytes = strides_in_bytes;
_total_size = total_size_in_bytes;
- _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+ _valid_region = ValidRegion{Coordinates(), _tensor_shape};
}
size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, Format format)
@@ -179,7 +219,7 @@ size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, size_t num
_format = Format::UNKNOWN;
_tensor_shape = tensor_shape;
- _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+ _valid_region = ValidRegion{Coordinates(), _tensor_shape};
auto_padding();
@@ -210,11 +250,11 @@ std::tuple<Strides, size_t, size_t> TensorInfo::calculate_padding_requirements(c
size_t required_total_size = 0;
const size_t required_offset_first_element = padding.left * stride_x + padding.top * stride_y;
- switch(_tensor_shape.num_dimensions())
+ switch (_tensor_shape.num_dimensions())
{
case 0:
{
- if(_tensor_shape.total_size() > 0)
+ if (_tensor_shape.total_size() > 0)
{
required_strides = Strides(stride_x, stride_x);
required_total_size = stride_z;
@@ -235,7 +275,8 @@ std::tuple<Strides, size_t, size_t> TensorInfo::calculate_padding_requirements(c
const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
- required_total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension];
+ required_total_size =
+ static_cast<size_t>(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension];
break;
}
}
@@ -243,31 +284,43 @@ std::tuple<Strides, size_t, size_t> TensorInfo::calculate_padding_requirements(c
return std::make_tuple(required_strides, required_offset_first_element, required_total_size);
}
+ITensorInfo &TensorInfo::set_lock_paddings(bool flag)
+{
+ _lock_paddings = flag;
+ return *this;
+}
+
+bool TensorInfo::lock_paddings() const
+{
+ return _lock_paddings;
+}
+
bool TensorInfo::extend_padding(const PaddingSize &padding)
{
+ ARM_COMPUTE_ERROR_ON(_lock_paddings);
ARM_COMPUTE_ERROR_ON(!_is_resizable);
bool updated = false;
- if(padding.top > _padding.top)
+ if (padding.top > _padding.top)
{
_padding.top = padding.top;
updated = true;
}
- if(padding.right > _padding.right)
+ if (padding.right > _padding.right)
{
_padding.right = padding.right;
updated = true;
}
- if(padding.bottom > _padding.bottom)
+ if (padding.bottom > _padding.bottom)
{
_padding.bottom = padding.bottom;
updated = true;
}
- if(padding.left > _padding.left)
+ if (padding.left > _padding.left)
{
_padding.left = padding.left;
updated = true;
@@ -301,7 +354,7 @@ ITensorInfo &TensorInfo::set_format(Format format)
{
_format = format;
- if(_data_type == DataType::UNKNOWN)
+ if (_data_type == DataType::UNKNOWN)
{
_num_channels = num_channels_from_format(format);
_data_type = data_type_from_format(format);
@@ -320,19 +373,19 @@ ITensorInfo &TensorInfo::set_tensor_shape(const TensorShape &shape)
_offset_first_element_in_bytes = 0;
_strides_in_bytes = compute_strides(*this);
- if(_tensor_shape.num_dimensions() == 0)
+ if (_tensor_shape.num_dimensions() == 0)
{
_total_size = _strides_in_bytes[0];
}
else
{
const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
- _total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension];
+ _total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension];
}
std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding);
- _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+ _valid_region = ValidRegion{Coordinates(), _tensor_shape};
return *this;
}
@@ -357,9 +410,10 @@ ITensorInfo &TensorInfo::set_data_layout(const DataLayout &data_layout)
ITensorInfo &TensorInfo::reset_padding()
{
_padding = PaddingSize();
- if(((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0)
+ if (((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0)
{
- std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding);
+ std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) =
+ calculate_padding_requirements(_padding);
}
return *this;
}
@@ -370,10 +424,11 @@ int32_t TensorInfo::offset_element_in_bytes(const Coordinates &pos) const
int32_t offset = _offset_first_element_in_bytes;
- for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+ for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
{
offset += pos[i] * _strides_in_bytes[i];
}
return offset;
}
+} // namespace arm_compute
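Note the design choice in the new copy constructor: every field is copied except the padding lock, which resets to false. A sketch of the observable behaviour (assumed usage):

    #include "arm_compute/core/TensorInfo.h"

    inline bool copy_is_unlocked()
    {
        arm_compute::TensorInfo a(arm_compute::TensorShape(16U, 16U), 1, arm_compute::DataType::F32);
        a.set_lock_paddings(true);
        const arm_compute::TensorInfo b(a); // copies shape, strides, quantization, id...
        return !b.lock_paddings();          // true: the lock is not inherited
    }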
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index b81b498ae5..532d08de92 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,10 +22,12 @@
* SOFTWARE.
*/
-#include "arm_compute/core/Helpers.h"
-
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include <algorithm>
#include <cmath>
#include <cstdint>
@@ -47,7 +49,7 @@ std::string read_file(const std::string &filename, bool binary)
fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
std::ios_base::openmode mode = std::ios::in;
- if(binary)
+ if (binary)
{
mode |= std::ios::binary;
}
@@ -64,7 +66,7 @@ std::string read_file(const std::string &filename, bool binary)
out.assign(std::istreambuf_iterator<char>(fs), std::istreambuf_iterator<char>());
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::ifstream::failure &e)
+ catch (const std::ifstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", filename.c_str(), e.what());
}
@@ -73,174 +75,89 @@ std::string read_file(const std::string &filename, bool binary)
return out;
}
-const std::string &string_from_format(Format format)
-{
- static std::map<Format, const std::string> formats_map =
- {
- { Format::UNKNOWN, "UNKNOWN" },
- { Format::U8, "U8" },
- { Format::S16, "S16" },
- { Format::U16, "U16" },
- { Format::S32, "S32" },
- { Format::U32, "U32" },
- { Format::F16, "F16" },
- { Format::F32, "F32" },
- { Format::UV88, "UV88" },
- { Format::RGB888, "RGB888" },
- { Format::RGBA8888, "RGBA8888" },
- { Format::YUV444, "YUV444" },
- { Format::YUYV422, "YUYV422" },
- { Format::NV12, "NV12" },
- { Format::NV21, "NV21" },
- { Format::IYUV, "IYUV" },
- { Format::UYVY422, "UYVY422" }
- };
-
- return formats_map[format];
-}
-
const std::string &string_from_channel(Channel channel)
{
- static std::map<Channel, const std::string> channels_map =
- {
- { Channel::UNKNOWN, "UNKNOWN" },
- { Channel::R, "R" },
- { Channel::G, "G" },
- { Channel::B, "B" },
- { Channel::A, "A" },
- { Channel::Y, "Y" },
- { Channel::U, "U" },
- { Channel::V, "V" },
- { Channel::C0, "C0" },
- { Channel::C1, "C1" },
- { Channel::C2, "C2" },
- { Channel::C3, "C3" }
- };
+ static std::map<Channel, const std::string> channels_map = {{Channel::UNKNOWN, "UNKNOWN"},
+ {Channel::R, "R"},
+ {Channel::G, "G"},
+ {Channel::B, "B"},
+ {Channel::A, "A"},
+ {Channel::Y, "Y"},
+ {Channel::U, "U"},
+ {Channel::V, "V"},
+ {Channel::C0, "C0"},
+ {Channel::C1, "C1"},
+ {Channel::C2, "C2"},
+ {Channel::C3, "C3"}};
return channels_map[channel];
}
-const std::string &string_from_data_layout(DataLayout dl)
-{
- static std::map<DataLayout, const std::string> dl_map =
- {
- { DataLayout::UNKNOWN, "UNKNOWN" },
- { DataLayout::NCHW, "NCHW" },
- { DataLayout::NHWC, "NHWC" },
- };
-
- return dl_map[dl];
-}
-
-const std::string &string_from_data_type(DataType dt)
-{
- static std::map<DataType, const std::string> dt_map =
- {
- { DataType::UNKNOWN, "UNKNOWN" },
- { DataType::S8, "S8" },
- { DataType::U8, "U8" },
- { DataType::S16, "S16" },
- { DataType::U16, "U16" },
- { DataType::S32, "S32" },
- { DataType::U32, "U32" },
- { DataType::S64, "S64" },
- { DataType::U64, "U64" },
- { DataType::F16, "F16" },
- { DataType::F32, "F32" },
- { DataType::F64, "F64" },
- { DataType::SIZET, "SIZET" },
- { DataType::QSYMM8, "QSYMM8" },
- { DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" },
- { DataType::QASYMM8, "QASYMM8" },
- { DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED" },
- { DataType::QSYMM16, "QSYMM16" },
- { DataType::QASYMM16, "QASYMM16" },
- };
-
- return dt_map[dt];
-}
-
-const std::string &string_from_activation_func(ActivationLayerInfo::ActivationFunction act)
+const std::string &string_from_border_mode(BorderMode border_mode)
{
- static std::map<ActivationLayerInfo::ActivationFunction, const std::string> act_map =
- {
- { ActivationLayerInfo::ActivationFunction::ABS, "ABS" },
- { ActivationLayerInfo::ActivationFunction::LINEAR, "LINEAR" },
- { ActivationLayerInfo::ActivationFunction::LOGISTIC, "LOGISTIC" },
- { ActivationLayerInfo::ActivationFunction::RELU, "RELU" },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, "BRELU" },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU" },
- { ActivationLayerInfo::ActivationFunction::LEAKY_RELU, "LRELU" },
- { ActivationLayerInfo::ActivationFunction::SOFT_RELU, "SRELU" },
- { ActivationLayerInfo::ActivationFunction::ELU, "ELU" },
- { ActivationLayerInfo::ActivationFunction::SQRT, "SQRT" },
- { ActivationLayerInfo::ActivationFunction::SQUARE, "SQUARE" },
- { ActivationLayerInfo::ActivationFunction::TANH, "TANH" },
- { ActivationLayerInfo::ActivationFunction::IDENTITY, "IDENTITY" },
- { ActivationLayerInfo::ActivationFunction::HARD_SWISH, "HARD_SWISH" }
-
+ static std::map<BorderMode, const std::string> border_mode_map = {
+ {BorderMode::UNDEFINED, "UNDEFINED"},
+ {BorderMode::CONSTANT, "CONSTANT"},
+ {BorderMode::REPLICATE, "REPLICATE"},
};
- return act_map[act];
+ return border_mode_map[border_mode];
}
-const std::string &string_from_interpolation_policy(InterpolationPolicy policy)
+const std::string &string_from_norm_type(NormType type)
{
- static std::map<InterpolationPolicy, const std::string> interpolation_policy_map =
- {
- { InterpolationPolicy::AREA, "AREA" },
- { InterpolationPolicy::BILINEAR, "BILINEAR" },
- { InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR" },
+ static std::map<NormType, const std::string> norm_type_map = {
+ {NormType::IN_MAP_1D, "IN_MAP_1D"},
+ {NormType::IN_MAP_2D, "IN_MAP_2D"},
+ {NormType::CROSS_MAP, "CROSS_MAP"},
};
- return interpolation_policy_map[policy];
+ return norm_type_map[type];
}
-const std::string &string_from_border_mode(BorderMode border_mode)
+const std::string &string_from_pooling_type(PoolingType type)
{
- static std::map<BorderMode, const std::string> border_mode_map =
- {
- { BorderMode::UNDEFINED, "UNDEFINED" },
- { BorderMode::CONSTANT, "CONSTANT" },
- { BorderMode::REPLICATE, "REPLICATE" },
+ static std::map<PoolingType, const std::string> pool_type_map = {
+ {PoolingType::MAX, "MAX"},
+ {PoolingType::AVG, "AVG"},
+ {PoolingType::L2, "L2"},
};
- return border_mode_map[border_mode];
+ return pool_type_map[type];
}
-const std::string &string_from_norm_type(NormType type)
+bool is_pool_region_entirely_outside_input(const PoolingLayerInfo &info)
{
- static std::map<NormType, const std::string> norm_type_map =
+ if (info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0)
{
- { NormType::IN_MAP_1D, "IN_MAP_1D" },
- { NormType::IN_MAP_2D, "IN_MAP_2D" },
- { NormType::CROSS_MAP, "CROSS_MAP" },
- };
-
- return norm_type_map[type];
+ return false;
+ }
+ const auto ps = info.pad_stride_info;
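+ // The window can only land entirely on padding when it is no larger than the
+ // padding on one side (e.g. pool_size.x() == 2 against pad_left() == 3).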
+ const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.pad_left(), ps.pad_right()});
+ const auto pool_le_padding_y = info.pool_size.y() <= std::max({ps.pad_top(), ps.pad_bottom()});
+ return pool_le_padding_x || pool_le_padding_y;
}
-const std::string &string_from_pooling_type(PoolingType type)
+bool is_pool_3d_region_entirely_outside_input(const Pooling3dLayerInfo &info)
{
- static std::map<PoolingType, const std::string> pool_type_map =
+ if (info.is_global_pooling || info.pool_size.x() == 0 || info.pool_size.y() == 0 || info.pool_size.z() == 0)
{
- { PoolingType::MAX, "MAX" },
- { PoolingType::AVG, "AVG" },
- { PoolingType::L2, "L2" },
- };
-
- return pool_type_map[type];
+ return false;
+ }
+ const auto ps = info.padding;
+ const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.left, ps.right});
+ const auto pool_le_padding_y = info.pool_size.y() <= std::max({ps.top, ps.bottom});
+ const auto pool_le_padding_z = info.pool_size.z() <= std::max({ps.front, ps.back});
+ return pool_le_padding_x || pool_le_padding_y || pool_le_padding_z;
}
const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage)
{
- static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map =
- {
- { GEMMLowpOutputStageType::NONE, "" },
- { GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down" },
- { GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint" },
- { GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float" }
- };
+ static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map = {
+ {GEMMLowpOutputStageType::NONE, ""},
+ {GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down"},
+ {GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint"},
+ {GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float"}};
return output_stage_map[output_stage];
}
@@ -250,7 +167,7 @@ std::string string_from_pixel_value(const PixelValue &value, const DataType data
std::stringstream ss;
std::string converted_string;
- switch(data_type)
+ switch (data_type)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -298,43 +215,16 @@ std::string string_from_pixel_value(const PixelValue &value, const DataType data
return converted_string;
}
-DataType data_type_from_name(const std::string &name)
-{
- static const std::map<std::string, DataType> data_types =
- {
- { "f16", DataType::F16 },
- { "f32", DataType::F32 },
- { "qasymm8", DataType::QASYMM8 },
- { "qasymm8_signed", DataType::QASYMM8_SIGNED },
- };
-
-#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
- try
- {
-#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
- return data_types.at(utility::tolower(name));
-
-#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
- }
- catch(const std::out_of_range &)
- {
- ARM_COMPUTE_ERROR_VAR("Invalid data type name: %s", name.c_str());
- }
-#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
-}
-
-std::string lower_string(const std::string &val)
-{
- std::string res = val;
- std::transform(res.begin(), res.end(), res.begin(), ::tolower);
- return res;
-}
-
-PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout, const Size2D &dilation,
+PadStrideInfo calculate_same_pad(TensorShape input_shape,
+ TensorShape weights_shape,
+ PadStrideInfo conv_info,
+ DataLayout data_layout,
+ const Size2D &dilation,
const DimensionRoundingType &rounding_type)
{
const auto &strides = conv_info.stride();
- ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1), "Stride values should be greater than or equal to 1.");
+ ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1),
+ "Stride values should be greater than or equal to 1.");
const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -353,8 +243,9 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh
const int real_weight_height = (kernel_height - 1) * dilation.y() + 1;
// Calculate total pad
- const int pad_width = std::max(0, static_cast<int>((out_width - 1) * strides.first + real_weight_width - in_width));
- const int pad_height = std::max(0, static_cast<int>((out_height - 1) * strides.second + real_weight_height - in_height));
+ const int pad_width = std::max(0, static_cast<int>((out_width - 1) * strides.first + real_weight_width - in_width));
+ const int pad_height =
+ std::max(0, static_cast<int>((out_height - 1) * strides.second + real_weight_height - in_height));
// Calculate individual paddings
const unsigned int pad_left = pad_width / 2;
@@ -372,8 +263,10 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh
return same_info;
}
-std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height,
- unsigned int kernel_width, unsigned int kernel_height,
+std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width,
+ unsigned int in_height,
+ unsigned int kernel_width,
+ unsigned int kernel_height,
const PadStrideInfo &pad_stride_info)
{
const unsigned int pad_left = pad_stride_info.pad_left();
@@ -392,8 +285,10 @@ std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned i
return std::make_pair<unsigned int, unsigned int>(w, h);
}
-std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
- int kernel_width, int kernel_height,
+std::pair<unsigned int, unsigned int> scaled_dimensions(int width,
+ int height,
+ int kernel_width,
+ int kernel_height,
const PadStrideInfo &pad_stride_info,
const Size2D &dilation)
{
@@ -407,15 +302,25 @@ std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
const int stride_y = pad_stride_info.stride().second;
int w = 0;
int h = 0;
- switch(pad_stride_info.round())
+ switch (pad_stride_info.round())
{
case DimensionRoundingType::FLOOR:
- w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1));
- h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1));
+ w = static_cast<int>(std::floor(
+ (static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) +
+ 1));
+ h = static_cast<int>(
+ std::floor((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) /
+ stride_y) +
+ 1));
break;
case DimensionRoundingType::CEIL:
- w = static_cast<int>(std::ceil((static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1));
- h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1));
+ w = static_cast<int>(std::ceil(
+ (static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) +
+ 1));
+ h = static_cast<int>(
+ std::ceil((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) /
+ stride_y) +
+ 1));
break;
default:
ARM_COMPUTE_ERROR("Unsupported rounding type");
@@ -426,9 +331,8 @@ std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
return std::make_pair(static_cast<unsigned int>(w), static_cast<unsigned int>(h));
}
-std::pair<int, int> scaled_dimensions_signed(int width, int height,
- int kernel_width, int kernel_height,
- const PadStrideInfo &pad_stride_info)
+std::pair<int, int> scaled_dimensions_signed(
+ int width, int height, int kernel_width, int kernel_height, const PadStrideInfo &pad_stride_info)
{
const int pad_left = pad_stride_info.pad_left();
const int pad_top = pad_stride_info.pad_top();
@@ -438,15 +342,19 @@ std::pair<int, int> scaled_dimensions_signed(int width, int height,
const int stride_y = pad_stride_info.stride().second;
int w = 0;
int h = 0;
- switch(pad_stride_info.round())
+ switch (pad_stride_info.round())
{
case DimensionRoundingType::FLOOR:
- w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
- h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ w = static_cast<int>(
+ std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+ h = static_cast<int>(
+ std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
break;
case DimensionRoundingType::CEIL:
- w = static_cast<int>(std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
- h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ w = static_cast<int>(
+ std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+ h = static_cast<int>(
+ std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
break;
default:
ARM_COMPUTE_ERROR("Unsupported rounding type");
@@ -455,13 +363,59 @@ std::pair<int, int> scaled_dimensions_signed(int width, int height,
return std::make_pair(static_cast<int>(w), static_cast<int>(h));
}
+std::tuple<int, int, int> scaled_3d_dimensions_signed(int width,
+ int height,
+ int depth,
+ int kernel_width,
+ int kernel_height,
+ int kernel_depth,
+ const Pooling3dLayerInfo &pool3d_info)
+{
+ const int pad_left = pool3d_info.padding.left;
+ const int pad_top = pool3d_info.padding.top;
+ const int pad_right = pool3d_info.padding.right;
+ const int pad_bottom = pool3d_info.padding.bottom;
+ const int pad_front = pool3d_info.padding.front;
+ const int pad_back = pool3d_info.padding.back;
+ const int stride_x = pool3d_info.stride.x();
+ const int stride_y = pool3d_info.stride.y();
+ const int stride_z = pool3d_info.stride.z();
+ int w = 0;
+ int h = 0;
+ int d = 0;
+
+ switch (pool3d_info.round_type)
+ {
+ case DimensionRoundingType::FLOOR:
+ w = static_cast<int>(
+ std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+ h = static_cast<int>(
+ std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ d = static_cast<int>(
+ std::floor((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
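+            // Worked example (illustrative values): depth = 16, pad_front = pad_back = 1,
+            // kernel_depth = 3, stride_z = 2:
+            // floor((16 + 1 + 1 - 3) / 2.f + 1) = floor(8.5f) = 8 output planes.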
+ break;
+ case DimensionRoundingType::CEIL:
+ w = static_cast<int>(
+ std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+ h = static_cast<int>(
+ std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ d = static_cast<int>(
+ std::ceil((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported rounding type");
+ }
+
+ return std::make_tuple(static_cast<int>(w), static_cast<int>(h), static_cast<int>(d));
+}
+
bool needs_serialized_reduction(ReductionOperation op, DataType dt, unsigned int axis)
{
const bool is_min_max = (op == ReductionOperation::MAX || op == ReductionOperation::MIN);
const bool is_quantized_type = is_data_type_quantized(dt);
const bool is_first_dim = (axis == 0);
- return !is_first_dim || is_min_max || is_quantized_type;
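+    // Parallel reduction is only used along the X axis, and then only for
+    // non-quantized types or for quantized MIN/MAX (which are order-independent);
+    // every other combination must run serialized.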
+ return !is_first_dim || (is_quantized_type && !is_min_max);
}
QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool is_log)
@@ -471,9 +425,9 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool
// * Softmax with QASYMM8_SIGNED: scale = 1/256, offset = -128
// * LogSoftmax with QASYMM8: scale = 1/256, offset = 0
// * LogSoftmax with QASYMM8_SIGNED: scale = 16/256, offset = 127
- if(is_data_type_quantized_asymmetric_signed(input_type))
+ if (is_data_type_quantized_asymmetric_signed(input_type))
{
- if(is_log)
+ if (is_log)
{
return QuantizationInfo(16.f / 256, 127);
}
@@ -485,17 +439,22 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool
return QuantizationInfo(1.f / 256, 0);
}
-std::pair<int32_t, int32_t> get_quantized_activation_min_max(ActivationLayerInfo act_info, DataType data_type, UniformQuantizationInfo oq_info)
+std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info,
+ DataType data_type,
+ UniformQuantizationInfo oq_info)
{
const bool is_qasymm8_signed = is_data_type_quantized_asymmetric_signed(data_type);
const auto a = act_info.a();
const auto b = act_info.b();
- const int a_int = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info);
- const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info);
- const auto type_max_value = std::get<1>(get_min_max(data_type)).get<int32_t>();
+ const int a_int = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info);
+ const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info);
+ const auto type_max_value = std::get<1>(get_min_max(data_type)).get<int32_t>();
- const int32_t min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int;
- const int32_t max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int;
+ const int32_t min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ ? std::min(oq_info.offset, type_max_value)
+ : b_int;
+ const int32_t max_activation =
+ act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int;
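+    // The std::min clamp guards against a quantization offset that lies above
+    // the type's representable maximum, which would otherwise yield
+    // min_activation > max_activation.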
return std::make_pair(min_activation, max_activation);
}
@@ -504,11 +463,11 @@ std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initi
{
std::unordered_map<const ITensorInfo *, PaddingSize> res;
- for(const ITensor *tensor : tensors)
+ for (const ITensor *tensor : tensors)
{
- if(tensor)
+ if (tensor)
{
- res.insert({ tensor->info(), tensor->info()->padding() });
+ res.insert({tensor->info(), tensor->info()->padding()});
}
}
@@ -519,11 +478,11 @@ std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initi
{
std::unordered_map<const ITensorInfo *, PaddingSize> res;
- for(const ITensorInfo *info : infos)
+ for (const ITensorInfo *info : infos)
{
- if(info)
+ if (info)
{
- res.insert({ info, info->padding() });
+ res.insert({info, info->padding()});
}
}
@@ -532,17 +491,20 @@ std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initi
bool has_padding_changed(const std::unordered_map<const ITensorInfo *, PaddingSize> &padding_map)
{
- return std::find_if(padding_map.begin(), padding_map.end(), [](const std::pair<const ITensorInfo *, PaddingSize> &padding_info)
- {
- return (padding_info.first->padding() != padding_info.second);
- })
- != padding_map.end();
+ return std::find_if(padding_map.begin(), padding_map.end(),
+ [](const std::pair<const ITensorInfo *, PaddingSize> &padding_info)
+ { return (padding_info.first->padding() != padding_info.second); }) != padding_map.end();
}
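+// Typical usage (illustrative): snapshot the padding before configuring a
+// kernel, then assert that configuration did not alter it:
+//   auto padding_map = get_padding_info({src, dst});
+//   kernel->configure(src, dst /*, ... */);
+//   ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_map));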
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
-void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim)
+void print_consecutive_elements(std::ostream &s,
+ DataType dt,
+ const uint8_t *ptr,
+ unsigned int n,
+ int stream_width,
+ const std::string &element_delim)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -552,30 +514,46 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr
case DataType::QSYMM8:
case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
- print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::U16:
case DataType::QASYMM16:
- print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::S16:
case DataType::QSYMM16:
- print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::U32:
- print_consecutive_elements_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::S32:
- print_consecutive_elements_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n, stream_width,
+ element_delim);
+ break;
+ case DataType::U64:
+ print_consecutive_elements_impl<uint64_t>(s, reinterpret_cast<const uint64_t *>(ptr), n, stream_width,
+ element_delim);
+ break;
+ case DataType::S64:
+ print_consecutive_elements_impl<int64_t>(s, reinterpret_cast<const int64_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::BFLOAT16:
- print_consecutive_elements_impl<bfloat16>(s, reinterpret_cast<const bfloat16 *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<bfloat16>(s, reinterpret_cast<const bfloat16 *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::F16:
- print_consecutive_elements_impl<half>(s, reinterpret_cast<const half *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<half>(s, reinterpret_cast<const half *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::F32:
- print_consecutive_elements_impl<float>(s, reinterpret_cast<const float *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<float>(s, reinterpret_cast<const float *>(ptr), n, stream_width,
+ element_delim);
break;
default:
ARM_COMPUTE_ERROR("Undefined element size for given data type");
@@ -584,7 +562,7 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr
int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -604,6 +582,10 @@ int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const u
return max_consecutive_elements_display_width_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n);
case DataType::S32:
return max_consecutive_elements_display_width_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n);
+ case DataType::U64:
+ return max_consecutive_elements_display_width_impl<uint64_t>(s, reinterpret_cast<const uint64_t *>(ptr), n);
+ case DataType::S64:
+ return max_consecutive_elements_display_width_impl<int64_t>(s, reinterpret_cast<const int64_t *>(ptr), n);
case DataType::BFLOAT16:
return max_consecutive_elements_display_width_impl<bfloat16>(s, reinterpret_cast<const bfloat16 *>(ptr), n);
case DataType::F16:
@@ -617,4 +599,4 @@ int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const u
}
#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index 5a6486e11e..d8f796193e 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp
@@ -23,13 +23,16 @@
*/
#include "arm_compute/core/Validate.h"
-arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function, const char *file, const int line,
- const arm_compute::Window &full, const arm_compute::Window &win)
+arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function,
+ const char *file,
+ const int line,
+ const arm_compute::Window &full,
+ const arm_compute::Window &win)
{
full.validate();
win.validate();
- for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() != win[i].start(), function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() != win[i].end(), function, file, line);
@@ -38,13 +41,16 @@ arm_compute::Status arm_compute::error_on_mismatching_windows(const char *functi
return arm_compute::Status{};
}
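+// These checkers are normally reached through the wrapper macros that inject
+// the call site, e.g. (illustrative):
+//   ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(full_window, win);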
-arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function, const char *file, const int line,
- const arm_compute::Window &full, const arm_compute::Window &sub)
+arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function,
+ const char *file,
+ const int line,
+ const arm_compute::Window &full,
+ const arm_compute::Window &sub)
{
full.validate();
sub.validate();
- for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() > sub[i].start(), function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() < sub[i].end(), function, file, line);
@@ -54,8 +60,12 @@ arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line,
- const arm_compute::Window &full, const arm_compute::Window &window, const int dim)
+arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function,
+ const char *file,
+ const int line,
+ const arm_compute::Window &full,
+ const arm_compute::Window &window,
+ const int dim)
{
full.validate();
window.validate();
@@ -67,65 +77,73 @@ arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(co
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
- const arm_compute::Coordinates &pos, unsigned int max_dim)
+arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte(
+ const char *function, const char *file, const int line, const arm_compute::Coordinates &pos, unsigned int max_dim)
{
- for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(pos[i] != 0, function, file, line);
}
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_window_dimensions_gte(const char *function, const char *file, const int line,
- const arm_compute::Window &win, unsigned int max_dim)
+arm_compute::Status arm_compute::error_on_window_dimensions_gte(
+ const char *function, const char *file, const int line, const arm_compute::Window &win, unsigned int max_dim)
{
- for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR((win[i].start() != 0) || (win[i].end() != win[i].step()),
- function, file, line,
- "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(
+ (win[i].start() != 0) || (win[i].end() != win[i].step()), function, file, line,
+ "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
}
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line,
+arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function,
+ const char *file,
+ const int line,
const arm_compute::ITensor *tensor)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor->info() == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2,
- function, file, line,
- "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->info()->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2, function, file, line,
+ "Only 2D Tensors are supported by this kernel (%zu passed)",
+ tensor->info()->num_dimensions());
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line,
+arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function,
+ const char *file,
+ const int line,
const arm_compute::ITensorInfo *tensor)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2,
- function, file, line,
- "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2, function, file, line,
+ "Only 2D Tensors are supported by this kernel (%zu passed)",
+ tensor->num_dimensions());
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
- arm_compute::Format fmt, arm_compute::Channel cn)
+arm_compute::Status arm_compute::error_on_channel_not_in_known_format(
+ const char *function, const char *file, const int line, arm_compute::Format fmt, arm_compute::Channel cn)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(fmt == arm_compute::Format::UNKNOWN, function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == arm_compute::Channel::UNKNOWN, function, file, line);
- switch(fmt)
+ switch (fmt)
{
case arm_compute::Format::RGB888:
- arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B);
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R,
+ arm_compute::Channel::G, arm_compute::Channel::B);
break;
case arm_compute::Format::RGBA8888:
- arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B, arm_compute::Channel::A);
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R,
+ arm_compute::Channel::G, arm_compute::Channel::B,
+ arm_compute::Channel::A);
break;
case arm_compute::Format::UV88:
- arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U, arm_compute::Channel::V);
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U,
+ arm_compute::Channel::V);
break;
case arm_compute::Format::IYUV:
case arm_compute::Format::UYVY422:
@@ -133,7 +151,8 @@ arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char
case arm_compute::Format::NV12:
case arm_compute::Format::NV21:
case arm_compute::Format::YUV444:
- arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y, arm_compute::Channel::U, arm_compute::Channel::V);
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y,
+ arm_compute::Channel::U, arm_compute::Channel::V);
break;
default:
ARM_COMPUTE_ERROR_LOC(function, file, line, "Not supported format.");
@@ -141,21 +160,26 @@ arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function, const char *file, const int line,
+arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function,
+ const char *file,
+ const int line,
const arm_compute::IKernel *kernel)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(kernel == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(),
- function, file, line,
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(), function, file, line,
"This kernel hasn't been configured.");
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line,
- const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape)
+arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function,
+ const char *file,
+ const int line,
+ const TensorShape &parent_shape,
+ const Coordinates &coords,
+ const TensorShape &shape)
{
// Check dimensions
- for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+ for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
{
const bool invalid_idx = coords[i] >= static_cast<int>(parent_shape[i]);
const bool out_of_bounds_size = coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]);
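+        // e.g. parent_shape = (10, 10), coords = (8, 0), shape = (4, 4):
+        //      8 + 4 > 10, so the sub-tensor spills past the parent along x.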
@@ -164,15 +188,20 @@ arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
- const ValidRegion &parent_valid_region, const ValidRegion &valid_region)
+arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function,
+ const char *file,
+ const int line,
+ const ValidRegion &parent_valid_region,
+ const ValidRegion &valid_region)
{
// Check valid regions
- for(unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d)
+ for (unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] > valid_region.anchor[d]), function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
- function, file, line);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(
+ (parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) <
+ (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
+ function, file, line);
}
return arm_compute::Status{};
diff --git a/src/core/common/Macros.h b/src/core/common/Macros.h
index d791154e5c..bc0ea29911 100644
--- a/src/core/common/Macros.h
+++ b/src/core/common/Macros.h
@@ -25,9 +25,9 @@
#define ARM_COMPUTE_COMMON_MACROS_H
#define ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(TypeName) \
- TypeName(const TypeName &) = delete; \
+ TypeName(const TypeName &) = delete; \
TypeName &operator=(const TypeName &) = delete; \
TypeName(TypeName &&) = default; \
- TypeName &operator=(TypeName &&) = default
+ TypeName &operator=(TypeName &&) = default
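+
+// Usage (illustrative):
+//   class CpuActivationKernel : public ICpuKernel
+//   {
+//   public:
+//       CpuActivationKernel() = default;
+//       ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuActivationKernel);
+//   };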
#endif /* ARM_COMPUTE_COMMON_MACROS_H */
diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h
index 44ddf9808d..cd849c3666 100644
--- a/src/core/common/Registrars.h
+++ b/src/core/common/Registrars.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,108 +21,194 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_COMMON_REGISTRARS_H
-#define SRC_CORE_COMMON_REGISTRARS_H
+#ifndef ACL_SRC_CORE_COMMON_REGISTRARS_H
+#define ACL_SRC_CORE_COMMON_REGISTRARS_H
#if defined(ENABLE_FP16_KERNELS)
-#if defined(ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#define REGISTER_FP16_SVE(func_name) &(func_name)
-#else /* !defined(ENABLE_SVE) */
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE) */
#define REGISTER_FP16_SVE(func_name) nullptr
-#endif /* defined(ENABLE_SVE) */
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
-#if defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#define REGISTER_FP16_SVE2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
+#define REGISTER_FP16_SVE2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#define REGISTER_FP16_SME2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SME2) */
+#define REGISTER_FP16_SME2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SME2) */
+
+#if defined(ARM_COMPUTE_ENABLE_NEON)
#define REGISTER_FP16_NEON(func_name) &(func_name)
-#else /* !defined(ENABLE_NEON) */
+#else /* !defined(ARM_COMPUTE_ENABLE_NEON) */
#define REGISTER_FP16_NEON(func_name) nullptr
-#endif /* defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
#else /* !defined(ENABLE_FP16_KERNELS) */
#define REGISTER_FP16_NEON(func_name) nullptr
-#define REGISTER_FP16_SVE(func_name) nullptr
+#define REGISTER_FP16_SVE(func_name) nullptr
+#define REGISTER_FP16_SVE2(func_name) nullptr
+#define REGISTER_FP16_SME2(func_name) nullptr
#endif /* defined(ENABLE_FP16_KERNELS) */
#if defined(ENABLE_FP32_KERNELS)
-#if defined(ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#define REGISTER_FP32_SVE(func_name) &(func_name)
-#else /* !defined(ENABLE_SVE) */
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE) */
#define REGISTER_FP32_SVE(func_name) nullptr
-#endif /* defined(ENABLE_SVE) */
-
-#if defined(ENABLE_NEON)
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#define REGISTER_FP32_SVE2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
+#define REGISTER_FP32_SVE2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#define REGISTER_FP32_SME2(func_name) &(func_name)
+#define REGISTER_QASYMM8_SME2(func_name) &(func_name)
+#define REGISTER_QASYMM8_SIGNED_SME2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SME2) */
+#define REGISTER_FP32_SME2(func_name) nullptr
+#define REGISTER_QASYMM8_SME2(func_name) nullptr
+#define REGISTER_QASYMM8_SIGNED_SME2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SME2) */
+
+#if defined(ARM_COMPUTE_ENABLE_NEON)
#define REGISTER_FP32_NEON(func_name) &(func_name)
-#else /* !defined(ENABLE_NEON) */
+#else /* !defined(ARM_COMPUTE_ENABLE_NEON) */
#define REGISTER_FP32_NEON(func_name) nullptr
-#endif /* defined(ENABLE_NEON) */
+#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
#else /* defined(ENABLE_FP32_KERNELS) */
#define REGISTER_FP32_NEON(func_name) nullptr
-#define REGISTER_FP32_SVE(func_name) nullptr
+#define REGISTER_FP32_SVE(func_name) nullptr
+#define REGISTER_FP32_SVE2(func_name) nullptr
+#define REGISTER_FP32_SME2(func_name) nullptr
#endif /* defined(ENABLE_FP32_KERNELS) */
#if defined(ENABLE_QASYMM8_SIGNED_KERNELS)
#define REGISTER_QASYMM8_SIGNED_NEON(func_name) &(func_name)
-#if defined(ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#define REGISTER_QASYMM8_SIGNED_SVE(func_name) &(func_name)
-#else /* !defined(ENABLE_SVE) */
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE) */
#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
-#endif /* defined(ENABLE_SVE) */
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#define REGISTER_QASYMM8_SIGNED_SVE2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
+#define REGISTER_QASYMM8_SIGNED_SVE2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#define REGISTER_QASYMM8_SIGNED_SME2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SME2) */
+#define REGISTER_QASYMM8_SIGNED_SME2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SME2) */
#else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
#define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr
-#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
+#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
+#define REGISTER_QASYMM8_SIGNED_SVE2(func_name) nullptr
+#define REGISTER_QASYMM8_SIGNED_SME2(func_name) nullptr
#endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
#if defined(ENABLE_QASYMM8_KERNELS)
#define REGISTER_QASYMM8_NEON(func_name) &(func_name)
-#if defined(ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#define REGISTER_QASYMM8_SVE(func_name) &(func_name)
-#else /* !defined(ENABLE_SVE) */
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE) */
#define REGISTER_QASYMM8_SVE(func_name) nullptr
-#endif /* defined(ENABLE_SVE) */
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#define REGISTER_QASYMM8_SVE2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
+#define REGISTER_QASYMM8_SVE2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#define REGISTER_QASYMM8_SME2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SME2) */
+#define REGISTER_QASYMM8_SME2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SME2) */
#else /* defined(ENABLE_QASYMM8_KERNELS) */
#define REGISTER_QASYMM8_NEON(func_name) nullptr
-#define REGISTER_QASYMM8_SVE(func_name) nullptr
+#define REGISTER_QASYMM8_SVE(func_name) nullptr
+#define REGISTER_QASYMM8_SVE2(func_name) nullptr
+#define REGISTER_QASYMM8_SME2(func_name) nullptr
#endif /* defined(ENABLE_QASYMM8_KERNELS) */
#if defined(ENABLE_QSYMM16_KERNELS)
#define REGISTER_QSYMM16_NEON(func_name) &(func_name)
-#if defined(ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#define REGISTER_QSYMM16_SVE(func_name) &(func_name)
-#else /* !defined(ENABLE_SVE) */
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE) */
#define REGISTER_QSYMM16_SVE(func_name) nullptr
-#endif /* defined(ENABLE_SVE) */
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#define REGISTER_QSYMM16_SVE2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
+#define REGISTER_QSYMM16_SVE2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
#else /* defined(ENABLE_QSYMM16_KERNELS) */
#define REGISTER_QSYMM16_NEON(func_name) nullptr
-#define REGISTER_QSYMM16_SVE(func_name) nullptr
+#define REGISTER_QSYMM16_SVE(func_name) nullptr
+#define REGISTER_QSYMM16_SVE2(func_name) nullptr
#endif /* defined(ENABLE_QSYMM16_KERNELS) */
+#if defined(ENABLE_QASYMM8_KERNELS) || defined(ENABLE_QASYMM8_SIGNED_KERNELS)
+#define REGISTER_Q8_NEON(func_name) &(func_name)
+#else /* !defined(ENABLE_QASYMM8_KERNELS) && !defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
+#define REGISTER_Q8_NEON(func_name) nullptr
+#endif /* defined(ENABLE_QASYMM8_KERNELS) || defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
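+// Each REGISTER_* macro expands to either a function pointer or nullptr at
+// compile time, so kernel tables can list every variant unconditionally and
+// skip nullptr entries at runtime, e.g. (names illustrative):
+//   static const ActivationKernel available_kernels[] = {
+//       {"neon_q8_activation", is_q8, REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation)},
+//   };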
+
#if defined(ENABLE_INTEGER_KERNELS)
-#if defined(ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#define REGISTER_INTEGER_SVE(func_name) &(func_name)
-#else /* !defined(ENABLE_SVE) */
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE) */
#define REGISTER_INTEGER_SVE(func_name) nullptr
-#endif /* defined(ENABLE_SVE) */
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#define REGISTER_INTEGER_SVE2(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
+#define REGISTER_INTEGER_SVE2(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
-#if defined(ENABLE_NEON)
+#if defined(ARM_COMPUTE_ENABLE_NEON)
#define REGISTER_INTEGER_NEON(func_name) &(func_name)
-#else /* !defined(ENABLE_NEON) */
+#else /* !defined(ARM_COMPUTE_ENABLE_NEON) */
#define REGISTER_INTEGER_NEON(func_name) nullptr
-#endif /* defined(ENABLE_NEON) */
+#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
#else /* defined(ENABLE_INTEGER_KERNELS) */
#define REGISTER_INTEGER_NEON(func_name) nullptr
-#define REGISTER_INTEGER_SVE(func_name) nullptr
+#define REGISTER_INTEGER_SVE(func_name) nullptr
+#define REGISTER_INTEGER_SVE2(func_name) nullptr
#endif /* defined(ENABLE_INTEGER_KERNELS) */
-#endif /* SRC_CORE_COMMON_REGISTRARS_H */
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+#define REGISTER_BF16_NEON(func_name) &(func_name)
+#else /* !defined(ARM_COMPUTE_ENABLE_BF16) */
+#define REGISTER_BF16_NEON(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
+
+#endif // ACL_SRC_CORE_COMMON_REGISTRARS_H
diff --git a/src/core/cpu/ICpuKernel.h b/src/core/cpu/ICpuKernel.h
deleted file mode 100644
index 650b3a7d0b..0000000000
--- a/src/core/cpu/ICpuKernel.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICPUKERNEL_H
-#define ARM_COMPUTE_ICPUKERNEL_H
-
-#include "arm_compute/core/CPP/ICPPKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-using ICpuKernel = arm_compute::ICPPKernel;
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ICPUKERNEL_H */
diff --git a/src/core/cpu/kernels/CpuActivationKernel.cpp b/src/core/cpu/kernels/CpuActivationKernel.cpp
deleted file mode 100644
index 8a57a3b529..0000000000
--- a/src/core/cpu/kernels/CpuActivationKernel.cpp
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuActivationKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/activation/list.h"
-
-#include <array>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-struct ActivationSelectorData
-{
- DataType dt;
-};
-
-using ActivationSelectorPtr = std::add_pointer<bool(const ActivationSelectorData &data)>::type;
-using ActivationKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
-
-struct ActivationKernel
-{
- const char *name;
- const ActivationSelectorPtr is_selected;
- ActivationKernelPtr ukernel;
-};
-
-static const ActivationKernel available_kernels[] =
-{
-#if defined(ENABLE_SVE)
- {
- "fp16_sve_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_activation)
- },
- {
- "fp32_sve_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation)
- },
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
- {
- "fp16_neon_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_activation)
- },
- {
- "fp32_neon_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation)
- },
-#endif /* defined(ENABLE_NEON) */
-#if defined(__ARM_FEATURE_SVE2)
- {
- "qasymm8_sve_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_activation)
- },
- {
- "qasymm8_signed_sve_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_activation)
- },
- {
- "qsymm16_sve_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
- REGISTER_QSYMM16_SVE(arm_compute::cpu::qsymm16_sve_activation)
- },
-#else /* !defined(__ARM_FEATURE_SVE2) */
- {
- "qasymm8_neon_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_activation)
- },
- {
- "qasymm8_signed_neon_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_activation)
- },
- {
- "qsymm16_neon_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation)
- },
-#endif /* defined(__ARM_FEATURE_SVE2) */
-};
-
-const ActivationKernel *get_implementation(const ActivationSelectorData &data)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected(data))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-/* Supported activation in the 8-bit integer domain */
-static const std::array<ActivationLayerInfo::ActivationFunction, 7> qasymm8_activations =
-{
- ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LOGISTIC,
- ActivationLayerInfo::ActivationFunction::TANH,
- ActivationLayerInfo::ActivationFunction::HARD_SWISH,
- ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
-};
-/* Supported activation in the 16-bit integer domain */
-static const std::array<ActivationLayerInfo::ActivationFunction, 3> qsymm16_activations =
-{
- ActivationLayerInfo::ActivationFunction::LOGISTIC,
- ActivationLayerInfo::ActivationFunction::TANH,
- ActivationLayerInfo::ActivationFunction::HARD_SWISH
-};
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &activation_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32);
-
- const auto *uk = get_implementation(ActivationSelectorData{ src->data_type() });
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- const DataType data_type = src->data_type();
- const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
- const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation();
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == std::end(qasymm8_activations)),
- "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), f_act) == std::end(qsymm16_activations)),
- "For QSYMM16 only tanh and logistic are supported");
- ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::TANH)
- && (oq_info != QuantizationInfo(1.f / 128.f, 128)));
- ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- && (oq_info != QuantizationInfo(1.f / 256.f, 0)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
-
- // Checks performed when dst is configured
- if((dst != nullptr) && (dst->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst)
-{
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
-
- if(dst != nullptr)
- {
- // dst auto initialization if not yet initialized
- auto_init_if_empty(*dst, *src->clone());
- }
-
- return std::make_pair(Status{}, win);
-}
-} // namespace
-
-void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src);
-
- _act_info = activation_info;
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICPPKernel::configure(win_config.second);
-}
-
-Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first);
-
- return Status{};
-}
-
-void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- // Early exit on disabled activation
- if(!_act_info.enabled())
- {
- return;
- }
-
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- ARM_COMPUTE_ERROR_ON(tensors.empty());
-
- const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
- ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
-
- const auto *uk = get_implementation(ActivationSelectorData{ src->info()->data_type() });
-
- uk->ukernel(src, dst, _act_info, window);
-}
-
-const char *CpuActivationKernel::name() const
-{
- return "CpuActivationKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuActivationKernel.h b/src/core/cpu/kernels/CpuActivationKernel.h
deleted file mode 100644
index de71014303..0000000000
--- a/src/core/cpu/kernels/CpuActivationKernel.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H
-#define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the activation kernel */
-class CpuActivationKernel : public ICpuKernel
-{
-public:
- CpuActivationKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuActivationKernel);
- /** Configure kernel for a given list of arguments
- *
- * @note If the output tensor is a nullptr, the activation function will be performed in-place
- *
- * @param[in, out] src Source tensor info. In case of @p dst tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] activation_info Activation layer information.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuActivationKernel
- *
- * @param[in] src Source tensor info. In case of @p dst tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[in] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] act_info Activation layer information.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- ActivationLayerInfo _act_info{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuAddKernel.cpp b/src/core/cpu/kernels/CpuAddKernel.cpp
deleted file mode 100644
index 7afdceae38..0000000000
--- a/src/core/cpu/kernels/CpuAddKernel.cpp
+++ /dev/null
@@ -1,342 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuAddKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/add/neon/list.h"
-#include "src/core/cpu/kernels/add/sve/list.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <array>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-struct AddSelectorData
-{
- DataType dt1;
- DataType dt2;
- DataType dt3;
-};
-
-using AddSelectorPtr = std::add_pointer<bool(const AddSelectorData &data)>::type;
-using AddKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
-struct AddKernel
-{
- const char *name;
- const AddSelectorPtr is_selected;
- AddKernelPtr ukernel;
-};
-
-static const AddKernel available_kernels[] =
-{
-#if defined(ENABLE_SVE)
- {
- "add_same_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
- REGISTER_FP32_SVE(arm_compute::cpu::add_same_sve<float>)
- },
- {
- "add_same_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)); },
- REGISTER_FP16_SVE(arm_compute::cpu::add_same_sve<float16_t>)
- },
- {
- "add_same_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<uint8_t>)
- },
- {
- "add_same_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int16_t>)
- },
- {
- "add_same_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int32_t>)
- },
- {
- "add_u8_s16_s16_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_s16_s16_sve)
- },
- {
- "add_s16_u8_s16_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_u8_s16_sve)
- },
- {
- "add_u8_u8_s16_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_u8_s16_sve)
- },
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
- {
- "add_same_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
- REGISTER_FP32_NEON(arm_compute::cpu::add_same_neon<float>)
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "add_same_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)); },
- REGISTER_FP16_NEON(arm_compute::cpu::add_same_neon<float16_t>)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "add_same_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<uint8_t>)
- },
- {
- "add_same_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int16_t>)
- },
- {
- "add_same_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int32_t>)
- },
- {
- "add_u8_s16_s16_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_s16_s16_neon)
- },
- {
- "add_s16_u8_s16_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_u8_s16_neon)
- },
- {
- "add_u8_u8_s16_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_u8_s16_neon)
- },
-#endif /* defined(ENABLE_NEON) */
-#if defined(__ARM_FEATURE_SVE2)
- {
- "add_qasymm8_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::add_qasymm8_sve)
- },
- {
- "add_qasymm8_signed_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::add_qasymm8_signed_sve)
- },
- {
- "add_qsymm16_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
- REGISTER_QSYMM16_SVE(arm_compute::cpu::add_qsymm16_sve)
- },
-#else /* !defined(__ARM_FEATURE_SVE2) */
- {
- "add_qasymm8_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)
- },
- {
- "add_qasymm8_signed_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)
- },
- {
- "add_qsymm16_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)
- },
-#endif /* defined(ENABLE_NEON) */
-
-};
-
-/** Micro-kernel selector
- *
- * @param[in] data Selection data passed to help pick the appropriate micro-kernel
- *
- * @return A matching micro-kernel else nullptr
- */
-const AddKernel *get_implementation(DataType dt1, DataType dt2, DataType dt3)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected({ dt1, dt2, dt3 }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
-
- const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src0.tensor_shape().x() != src1.tensor_shape().x()) && ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type())
- || (src1.data_type() != dst.data_type())),
- "Broadcasting across width is supported on configurations where all tensors have the same data type");
-
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::U8)
- && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32 && dst.data_type() == DataType::S32)
- && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32 && dst.data_type() == DataType::F32)
- && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16 && dst.data_type() == DataType::F16)
- && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && dst.data_type() == DataType::QASYMM8)
- && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && dst.data_type() == DataType::QASYMM8_SIGNED)
- && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && dst.data_type() == DataType::QSYMM16),
- "You called addition with the wrong image formats");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
- "Wrong shape for dst");
- }
-
- const auto *uk = get_implementation(src0.data_type(), src1.data_type(), dst.data_type());
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo &src0, const ITensorInfo &src1, ITensorInfo &dst)
-{
- const TensorShape &out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
-
- // Auto initialize dst if not initialized
- {
- set_shape_if_empty(dst, out_shape);
-
- if(src0.data_type() == DataType::S16 || src1.data_type() == DataType::S16)
- {
- set_format_if_unknown(dst, Format::S16);
- }
-        else if(src0.data_type() == DataType::S32 || src1.data_type() == DataType::S32)
- {
- set_format_if_unknown(dst, Format::S32);
- }
- else if(src0.data_type() == DataType::F16 || src1.data_type() == DataType::F16)
- {
- set_format_if_unknown(dst, Format::F16);
- }
- else if(src0.data_type() == DataType::F32 || src1.data_type() == DataType::F32)
- {
- set_format_if_unknown(dst, Format::F32);
- }
- else if(src0.data_type() == DataType::QASYMM8 || src1.data_type() == DataType::QASYMM8)
- {
- set_data_type_if_unknown(dst, DataType::QASYMM8);
- }
- else if(src0.data_type() == DataType::QASYMM8_SIGNED || src1.data_type() == DataType::QASYMM8_SIGNED)
- {
- set_data_type_if_unknown(dst, DataType::QASYMM8_SIGNED);
- }
- else if(src0.data_type() == DataType::QSYMM16 || src1.data_type() == DataType::QSYMM16)
- {
- set_data_type_if_unknown(dst, DataType::QSYMM16);
- }
- }
-
- Window win = calculate_max_window(out_shape, Steps());
-
- // CpuAddKernel doesn't need padding so update_window_and_padding() can be skipped
- return std::make_pair(Status{}, win);
-}
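For reference, TensorShape::broadcast_shape() applies the usual NumPy-style rule relied on by the window logic above: along each dimension the sizes must match, or one of them must be 1. A small sketch with illustrative shapes:

    const TensorShape a(27U, 13U, 2U);
    const TensorShape b(27U, 1U, 2U);                           // dimension 1 broadcasts
    const TensorShape out = TensorShape::broadcast_shape(a, b); // -> (27, 13, 2)
    // Incompatible sizes (e.g. 13 vs 3) yield an empty shape, which
    // validate_arguments() rejects via out_shape.total_size() == 0.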
-} // namespace
-
-void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
-
- _policy = policy;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(*src0, *src1, *dst);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICpuKernel::configure(win_config.second);
-}
-
-Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
-
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*src0->clone(), *src1->clone(), *dst->clone()).first);
-
- return Status{};
-}
-
-void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- ARM_COMPUTE_ERROR_ON(tensors.empty());
-
- const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
-
- const auto *uk = get_implementation(src0->info()->data_type(), src1->info()->data_type(), dst->info()->data_type());
- ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- uk->ukernel(src0, src1, dst, _policy, window);
-}
-
-const char *CpuAddKernel::name() const
-{
- return "CpuAddKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuAddKernel.h b/src/core/cpu/kernels/CpuAddKernel.h
deleted file mode 100644
index a36ec7ad65..0000000000
--- a/src/core/cpu/kernels/CpuAddKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPUADDKERNEL_H
-#define ARM_COMPUTE_CPUADDKERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the kernel to perform addition between two tensors */
-class CpuAddKernel : public ICpuKernel
-{
-public:
- CpuAddKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuAddKernel);
-    /** Initialise the kernel's inputs and dst.
- *
- * Valid configurations (src0,src1) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- *
- * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
- * @param[in] policy Overflow policy.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuAddKernel
- *
- * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[in] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
- * @param[in] policy Overflow policy.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- ConvertPolicy _policy{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPUADDKERNEL_H */
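A hedged usage sketch of the operator-style interface declared above; these kernels hold no tensor memory, so at run time the tensors travel in an ITensorPack. The allocated tensors a, b, d and the scheduler split hint are assumptions, only the CpuAddKernel calls come from this header:

    TensorInfo a_info(TensorShape(32U, 16U), 1, DataType::F32);
    TensorInfo b_info(TensorShape(32U, 16U), 1, DataType::F32);
    TensorInfo d_info(TensorShape(32U, 16U), 1, DataType::F32);

    cpu::kernels::CpuAddKernel add;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuAddKernel::validate(&a_info, &b_info, &d_info, ConvertPolicy::SATURATE));
    add.configure(&a_info, &b_info, &d_info, ConvertPolicy::SATURATE);

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &a);
    pack.add_const_tensor(TensorType::ACL_SRC_1, &b);
    pack.add_tensor(TensorType::ACL_DST, &d);
    NEScheduler::get().schedule_op(&add, IScheduler::Hints(Window::DimY), add.window(), pack);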
diff --git a/src/core/cpu/kernels/CpuCastKernel.cpp b/src/core/cpu/kernels/CpuCastKernel.cpp
deleted file mode 100644
index 46f3c330ef..0000000000
--- a/src/core/cpu/kernels/CpuCastKernel.cpp
+++ /dev/null
@@ -1,1367 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuCastKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/SaturateCast.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(dst);
- ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
- DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
- DataType::F32, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
- DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
- DataType::U32, DataType::S32, DataType::F32);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32
-                                                                                     && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
-                                    "Only data_types supported [in] QASYMM8_SIGNED -> [out] S16, S32, F16, F32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
- && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
- "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
- && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
- "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
- "Only data_types supported [in] U16 -> [out] U8, U32");
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
-                                    "Only data_types supported [in] S16 -> [out] QASYMM8_SIGNED, U8, S32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::BFLOAT16 && dst->data_type() != DataType::F32,
- "Only data_types supported [in] BFLOAT16 -> [out] F32");
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
-                                                                          && dst->data_type() != DataType::U8
-                                                                          && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
-                                    "Only data_types supported [in] F16 -> [out] QASYMM8_SIGNED, QASYMM8, U8, F32, S32");
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
-                                                                          && dst->data_type() != DataType::F16 && dst->data_type() != DataType::BFLOAT16
-                                                                          && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
-                                    "Only data_types supported [in] F32 -> [out] QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8");
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
-                                                                          && dst->data_type() != DataType::F16
-                                                                          && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8),
-                                    "Only data_types supported [in] S32 -> [out] QASYMM8_SIGNED, QASYMM8, F16, F32, U8");
-
- // Validate in case of configured dst
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
-
- return Status{};
-}
-} // namespace
-
-void CpuCastKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given)
- set_shape_if_empty(*dst, src->tensor_shape());
-
- _policy = policy;
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
-
- ICPPKernel::configure(win);
-}
-
-Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
- return Status{};
-}
-
-void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const int window_step_x = 16;
-
- const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC);
- ITensor *_dst = tensors.get_tensor(TensorType::ACL_DST);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- ARM_COMPUTE_ERROR_ON(_src == _dst);
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator src(_src, win);
- Iterator dst(_dst, win);
-
- switch(_src->info()->data_type())
- {
- case DataType::QASYMM8_SIGNED:
- {
- switch(_dst->info()->data_type())
- {
- case DataType::S16:
- {
- /* Up-conversion QASYMM8_SIGNED -> S16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
-
- vst1q_s16(dst_ptr + x, texels.val[0]);
- vst1q_s16(dst_ptr + x + 8, texels.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::S32:
- {
- /* Up-conversion QASYMM8_SIGNED -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
-
- vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
- vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::F32:
- {
- /* Up-conversion QASYMM8_SIGNED -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
-                            const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
- vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
- vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- /* Up-conversion QASYMM8_SIGNED -> F16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
- vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
- vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
- }
-
- case DataType::QASYMM8:
- case DataType::U8:
- {
- switch(_dst->info()->data_type())
- {
- case DataType::S16:
- {
- /* Up-conversion U8 -> S16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
-
- vst1q_s16(dst_ptr + x, texels.val[0]);
- vst1q_s16(dst_ptr + x + 8, texels.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
-                            *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::S32:
- {
- /* Up-conversion U8 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
-
- vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
- vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
-                            *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::F32:
- {
- /* Up-conversion U8 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
- vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
- vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
-                            *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- /* Up-conversion U8 -> F16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
- vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
- vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::U16:
- {
- /* Up-conversion U8 -> U16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
-
- const uint16x8x2_t texels =
- {
- {
- vmovl_u8(vget_low_u8(texels_u8)),
- vmovl_u8(vget_high_u8(texels_u8))
- }
- };
-
- vst1q_u16(dst_ptr + x, texels.val[0]);
- vst1q_u16(dst_ptr + x + 8, texels.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
- }
- case DataType::S16:
- {
- switch(_dst->info()->data_type())
- {
- case DataType::QASYMM8_SIGNED:
- {
- /* Down-conversion S16 -> QASYMM8_SIGNED */
- if(ConvertPolicy::SATURATE == _policy)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t texels =
- {
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
-
- vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t texels =
- {
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
-
- vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- break;
- }
- case DataType::U8:
- {
- /* Down-conversion S16 -> U8 */
- if(ConvertPolicy::SATURATE == _policy)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t texels =
- {
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
-
- vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t texels =
- {
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
-
- vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
- vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- break;
- }
- case DataType::S32:
- {
- /* Up-conversion S16 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t texels =
- {
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
-
- const int32x4x4_t texels_s32 =
- {
- {
- vmovl_s16(vget_low_s16(texels.val[0])),
- vmovl_s16(vget_high_s16(texels.val[0])),
- vmovl_s16(vget_low_s16(texels.val[1])),
- vmovl_s16(vget_high_s16(texels.val[1]))
- }
- };
-
- vst1q_s32(dst_ptr + x, texels_s32.val[0]);
- vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
- vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
- vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
- }
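The SATURATE/WRAP branches above are the whole meaning of ConvertPolicy for narrowing conversions; for S16 -> U8 the two differ exactly when a value leaves [0, 255]. A worked scalar example (values are illustrative):

    const int16_t v = -7;
    const uint8_t saturated = utils::cast::saturate_cast<uint8_t>(v); // clamps to 0
    const uint8_t wrapped   = static_cast<uint8_t>(v);                // keeps the low 8 bits -> 249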
- case DataType::U16:
- {
- switch(_dst->info()->data_type())
- {
- case DataType::U8:
- {
- /* Down-conversion U16 -> U8 */
- if(ConvertPolicy::SATURATE == _policy)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint16x8x2_t texels =
- {
- {
- vld1q_u16(src_ptr + x),
- vld1q_u16(src_ptr + x + 8)
- }
- };
-
- vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint16x8x2_t texels =
- {
- {
- vld1q_u16(src_ptr + x),
- vld1q_u16(src_ptr + x + 8)
- }
- };
-
- vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
- }
-
- },
- src, dst);
- }
- break;
- }
- case DataType::U32:
- {
- /* Up-conversion U16 -> U32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint16x8x2_t texels =
- {
- {
- vld1q_u16(src_ptr + x),
- vld1q_u16(src_ptr + x + 8)
- }
- };
-
- vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
- vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
- vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
- vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
- }
-
- },
- src, dst);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
- }
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
- case DataType::BFLOAT16:
- switch(_dst->info()->data_type())
- {
- case DataType::F32:
- {
- /* Up-conversion BFLOAT16 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const bfloat16 *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
-                        const uint16x8x2_t texels =
-                        {
-                            {
-                                vld1q_u16(reinterpret_cast<const uint16_t *>(src_ptr) + x),
-                                vld1q_u16(reinterpret_cast<const uint16_t *>(src_ptr) + x + 8)
-                            }
-                        };
-
-                        // bfloat16 is the high half of a binary32: widen to 32 bits and shift left by 16
-                        vst1q_f32(dst_ptr + x,
-                                  vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16)));
-                        vst1q_f32(dst_ptr + x + 4,
-                                  vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16)));
-                        vst1q_f32(dst_ptr + x + 8,
-                                  vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16)));
-                        vst1q_f32(dst_ptr + x + 12,
-                                  vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16)));
- }
-
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = float(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- default:
-                        ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
-#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
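Aside on the vshlq_n_u32(..., 16) trick above: a bfloat16 is exactly the upper 16 bits of an IEEE-754 binary32, so widening to 32 bits and shifting left by 16 reconstructs the float bit pattern. A scalar sketch of the same conversion (hypothetical helper; assumes <cstdint> and <cstring>):

    float bf16_to_f32(uint16_t b)
    {
        const uint32_t u = static_cast<uint32_t>(b) << 16; // low mantissa bits become zero
        float f;
        std::memcpy(&f, &u, sizeof(f)); // bit-exact reinterpretation
        return f;
    }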
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- switch(_dst->info()->data_type())
- {
- case DataType::QASYMM8_SIGNED:
- {
- /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t texels =
- {
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8),
- }
- };
-
- vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::QASYMM8:
- case DataType::U8:
- {
- /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t texels =
- {
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8),
- }
- };
-
- vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
-
- },
- src, dst);
- break;
- }
- case DataType::F32:
- {
- /* Up-conversion F16 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t texels =
- {
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8)
- }
- };
- vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
- vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
- vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
- vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::S32:
- {
- /* Up-conversion F16 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t texels =
- {
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8)
- }
- };
-
- vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
- vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
- vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
- vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- switch(_dst->info()->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- /* Down-conversion F32 -> F16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t texels =
- {
- {
- vld1q_f32(src_ptr + x),
- vld1q_f32(src_ptr + x + 4),
- vld1q_f32(src_ptr + x + 8),
- vld1q_f32(src_ptr + x + 12)
- }
- };
-
- vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
- vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
- case DataType::BFLOAT16:
- {
- /* Down-conversion F32 -> BFLOAT16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<bfloat16 *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
-                            wrapper::vcvt_bf16_f32(src_ptr + x,
-                                                   reinterpret_cast<uint16_t *>(dst_ptr) + x);
-                            wrapper::vcvt_bf16_f32(src_ptr + x + 8,
-                                                   reinterpret_cast<uint16_t *>(dst_ptr) + x + 8);
- }
-
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = *(src_ptr + x);
- }
- },
- src, dst);
- break;
- }
-#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
- case DataType::S32:
- {
- /* Conversion F32 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t texels =
- {
- {
- vld1q_f32(src_ptr + x),
- vld1q_f32(src_ptr + x + 4),
- vld1q_f32(src_ptr + x + 8),
- vld1q_f32(src_ptr + x + 12),
- }
- };
-
- vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
- vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
- vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
- vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::QASYMM8:
- case DataType::U8:
- {
- /* Down-conversion F32 -> U8 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t texels =
- {
- {
- vld1q_f32(src_ptr + x),
- vld1q_f32(src_ptr + x + 4),
- vld1q_f32(src_ptr + x + 8),
- vld1q_f32(src_ptr + x + 12),
- }
- };
-
- vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
- vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::QASYMM8_SIGNED:
- {
- /* Down-conversion F32 -> QASYMM8_SIGNED */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t texels =
- {
- {
- vld1q_f32(src_ptr + x),
- vld1q_f32(src_ptr + x + 4),
- vld1q_f32(src_ptr + x + 8),
- vld1q_f32(src_ptr + x + 12),
- }
- };
-
- vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
- vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
-
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
-
- case DataType::S32:
- switch(_dst->info()->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- /* Down-conversion S32 -> F16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t texels =
- {
- {
- vcvtq_f32_s32(vld1q_s32(src_ptr + x)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))
- }
- };
-
- vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
- vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- {
- /* Conversion S32 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12),
- }
- };
-
- vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
- vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
- vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
- vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::QASYMM8_SIGNED:
- {
- /* Down-conversion S32 -> QASYMM8_SIGNED */
- if(ConvertPolicy::SATURATE == _policy)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12),
- }
- };
- vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
- vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12)
- }
- };
-
- vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
- vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- break;
- }
- case DataType::QASYMM8:
- case DataType::U8:
- {
- /* Down-conversion S32 -> U8 */
- if(ConvertPolicy::SATURATE == _policy)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12)
- }
- };
- vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
- vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12)
- }
- };
-
- vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
- vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
-}
-
-const char *CpuCastKernel::name() const
-{
-    return "CpuCastKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuCastKernel.h b/src/core/cpu/kernels/CpuCastKernel.h
deleted file mode 100644
index 2a75c5850e..0000000000
--- a/src/core/cpu/kernels/CpuCastKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CAST_KERNEL_H
-#define ARM_COMPUTE_CPU_CAST_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Casts a given tensor to a new type
- *
- * @note When casting between quantized types the scale and zeroPoint are ignored
- */
-class CpuCastKernel : public ICpuKernel
-{
-public:
- CpuCastKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCastKernel);
- /** Set the src and dst of the kernel
- *
- * Valid conversions src -> dst :
- *
- * - QASYMM8_SIGNED -> S16, S32, F32, F16
- * - QASYMM8 -> U16, S16, S32, F32, F16
- * - U8 -> U16, S16, S32, F32, F16
- * - U16 -> U8, U32
- * - S16 -> QASYMM8_SIGNED, U8, S32
- * - BFLOAT16 -> F32
- * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8
- * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8
- * - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8
- *
- * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/BFLOAT16/F16/F32.
- * @param[out] dst The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32.
- * @param[in] policy Conversion policy.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuCastKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- ConvertPolicy _policy{ ConvertPolicy::SATURATE };
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CAST_KERNEL_H */
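A short sketch of the static validation path declared above, with illustrative tensor infos; the conversion table in the class comment is enforced before any work is scheduled, so an unsupported pair comes back as an error Status rather than an assertion:

    TensorInfo u16_info(TensorShape(8U, 8U), 1, DataType::U16);
    TensorInfo f32_info(TensorShape(8U, 8U), 1, DataType::F32);

    // U16 -> F32 is not in the supported list, so this returns an error status.
    const Status s = cpu::kernels::CpuCastKernel::validate(&u16_info, &f32_info, ConvertPolicy::SATURATE);
    if(!bool(s))
    {
        std::cout << s.error_description() << std::endl; // assumes <iostream>
    }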
diff --git a/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp
deleted file mode 100644
index 5df5ac3dd0..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuConcatenateBatchKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-template <typename T>
-void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, const Window &window)
-{
- // Offset src
- uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
-
- // Offset dst
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + batch_offset * dst->info()->strides_in_bytes()[3];
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const int window_step_x = 16 / dst->info()->element_size();
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- win.set(3, Window::Dimension(0, src->info()->tensor_shape()[3], 1));
-
- Iterator src_it(src, win);
- Iterator dst_it(dst, win);
-
- const DataType dt = src->info()->data_type();
- const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
-                wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
-                wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
- }
-}
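The mixed-quantization branches above apply the usual requantization identity, real = scale * (q - offset), in both directions. A scalar equivalent of the QASYMM8 path, written as a hypothetical standalone helper (assumes <cmath> and <algorithm>):

    uint8_t requantize_qasymm8(uint8_t q, const UniformQuantizationInfo &in, const UniformQuantizationInfo &out)
    {
        const float   real = (static_cast<int32_t>(q) - in.offset) * in.scale;                 // dequantize
        const int32_t rq   = static_cast<int32_t>(std::lround(real / out.scale)) + out.offset; // quantize
        return static_cast<uint8_t>(std::min(std::max(rq, 0), 255));                           // saturate
    }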
-
-Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
-
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY));
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ));
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst);
-
- return Status{};
-}
-} // namespace
-
-CpuConcatenateBatchKernel::CpuConcatenateBatchKernel()
- : _func(nullptr), _batch_offset(0)
-{
-}
-
-void CpuConcatenateBatchKernel::configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst));
-
- _func = nullptr;
- _batch_offset = batch_offset;
-
- switch(src->data_type())
- {
- case DataType::S8:
- case DataType::U8:
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- _func = &batch_concat<uint8_t>;
- break;
- case DataType::S16:
- case DataType::U16:
- case DataType::F16:
- _func = &batch_concat<uint16_t>;
- break;
- case DataType::S32:
- case DataType::U32:
- case DataType::F32:
- _func = &batch_concat<uint32_t>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*dst, Steps());
- ICpuKernel::configure(win);
-}
-
-Status CpuConcatenateBatchKernel::validate(const arm_compute::ITensorInfo *src,
- unsigned int batch_offset,
- const arm_compute::ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst));
- return Status{};
-}
-
-void CpuConcatenateBatchKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC),
- tensors.get_tensor(TensorType::ACL_DST),
- _batch_offset,
- window);
-}
-
-const char *CpuConcatenateBatchKernel::name() const
-{
- return "CpuConcatenateBatchKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuConcatenateBatchKernel.h b/src/core/cpu/kernels/CpuConcatenateBatchKernel.h
deleted file mode 100644
index 99e8d84d99..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateBatchKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CONCATENATEBATCH_KERNEL_H
-#define ARM_COMPUTE_CPU_CONCATENATEBATCH_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the batch concatenate kernel.
- * The source tensor will be concatenated into the destination tensor.
- */
-class CpuConcatenateBatchKernel : public ICpuKernel
-{
-public:
- CpuConcatenateBatchKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateBatchKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[in] batch_offset The offset on axis 3 (the batch axis).
- * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
- */
- void configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuConcatenateBatchKernel
- *
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[in] batch_offset The offset on axis 3 (the batch axis).
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- using BatchConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &);
-
-private:
- BatchConcatFunction *_func;
- unsigned int _batch_offset;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONCATENATEBATCH_KERNEL_H */
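A rough usage sketch for the stateless configure/validate/run_op flow defined above, assuming the runtime scaffolding of this tree (Tensor, ITensorPack, NEScheduler) behaves as in the library's other cpu operators; the shapes, batch offset, and split dimension are illustrative only:

    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/core/cpu/kernels/CpuConcatenateBatchKernel.h"

    using namespace arm_compute;

    void concat_batch_sketch()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 2U), 1, DataType::F32)); // 2 batches
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 4U), 1, DataType::F32)); // room for 4
        src.allocator()->allocate();
        dst.allocator()->allocate();

        cpu::kernels::CpuConcatenateBatchKernel kernel;
        kernel.configure(src.info(), 0 /* batch_offset */, dst.info()); // writes batches [0, 2)

        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        NEScheduler::get().schedule_op(&kernel, Window::DimY, kernel.window(), pack);
    }

Note that the kernel itself holds no tensor pointers; everything it operates on arrives through the ITensorPack, which is what makes these kernels reusable across workloads.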
diff --git a/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp
deleted file mode 100644
index a7e5cd8c60..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuConcatenateDepthKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-template <typename T>
-void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, const Window &window)
-{
- // Offset source
- uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
-
- // Offset destination
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + depth_offset * dst->info()->strides_in_bytes()[2];
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const int window_step_x = 16 / dst->info()->element_size();
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- win.set(Window::DimZ, Window::Dimension(0, src->info()->tensor_shape().z(), 1));
-
- Iterator src_it(src, win);
- Iterator dst_it(dst, win);
-
- const DataType dt = src->info()->data_type();
- const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
- }
-}
-
-Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
-
-    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
-    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY));
-    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(2) + depth_offset > dst->dimension(2));
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, src, dst);
-
-    return Status{};
-}
-} // namespace
-
-CpuConcatenateDepthKernel::CpuConcatenateDepthKernel()
- : _func(nullptr), _depth_offset(0)
-{
-}
-
-void CpuConcatenateDepthKernel::configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst));
-
- _func = nullptr;
- _depth_offset = depth_offset;
-
- switch(src->data_type())
- {
- case DataType::QASYMM8:
- _func = &depth_concat<uint8_t>;
- break;
- case DataType::QASYMM8_SIGNED:
- _func = &depth_concat<int8_t>;
- break;
- case DataType::F16:
- _func = &depth_concat<uint16_t>;
- break;
- case DataType::F32:
- _func = &depth_concat<uint32_t>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*dst, Steps());
- ICpuKernel::configure(win);
-}
-
-Status CpuConcatenateDepthKernel::validate(const arm_compute::ITensorInfo *src,
- unsigned int depth_offset,
- const arm_compute::ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, dst));
- return Status{};
-}
-
-void CpuConcatenateDepthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC),
- tensors.get_tensor(TensorType::ACL_DST),
- _depth_offset,
- window);
-}
-
-const char *CpuConcatenateDepthKernel::name() const
-{
- return "CpuConcatenateDepthKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuConcatenateDepthKernel.h b/src/core/cpu/kernels/CpuConcatenateDepthKernel.h
deleted file mode 100644
index af89c2464f..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateDepthKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CPU_CONCATENATEDEPTH_KERNEL_H
-#define ARM_COMPUTE_CPU_CONCATENATEDEPTH_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the depth concatenate kernel.
- * The source tensor will be concatenated into the destination tensor.
- */
-class CpuConcatenateDepthKernel : public ICpuKernel
-{
-public:
- CpuConcatenateDepthKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateDepthKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @note The destination tensor's two lowest dimensions can't be smaller than the source's.
- * @note The difference between source and destination in each of the two lowest dimensions must be divisible by 2.
- *
- */
- void configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuConcatenateDepthKernel
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- using DepthConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &);
-
-private:
- DepthConcatFunction *_func;
- unsigned int _depth_offset;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONCATENATEDEPTH_KERNEL_H */
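When source and destination quantization infos differ, the QASYMM8 paths in the kernel above requantize each element in flight: dequantize with the source scale and offset, then quantize with the destination's. A scalar sketch of that round trip (the rounding and saturation here only approximate the library's quantization helpers):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Per-element requantization: src (scale, offset) -> real value -> dst (scale, offset).
    uint8_t requantize_qasymm8(uint8_t v, float src_scale, int32_t src_offset,
                               float dst_scale, int32_t dst_offset)
    {
        const float   real = (static_cast<int32_t>(v) - src_offset) * src_scale;               // dequantize
        const int32_t q    = dst_offset + static_cast<int32_t>(std::lround(real / dst_scale)); // quantize
        return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));          // saturate to [0, 255]
    }

The vectorised loops do the same with vdequantize/vquantize over 16 lanes at a time, falling back to the scalar helpers for the leftover tail.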
diff --git a/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp
deleted file mode 100644
index 54b972662b..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuConcatenateHeightKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY));
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
- }
-
- return Status{};
-}
-} // namespace
-
-CpuConcatenateHeightKernel::CpuConcatenateHeightKernel()
- : _height_offset(0)
-{
-}
-
-void CpuConcatenateHeightKernel::configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst)
-{
- ARM_COMPUTE_UNUSED(src);
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst));
-
- _height_offset = height_offset;
-
- // Configure kernel window
- Window win = calculate_max_window(*dst, Steps());
- ICpuKernel::configure(win);
-}
-
-Status CpuConcatenateHeightKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst));
- return Status{};
-}
-
-void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- // Offset destination pointer to the correct position
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _height_offset * dst->info()->strides_in_bytes()[Window::DimY];
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
- const int window_step_x = 16;
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1));
-
- // Create iterators
- Iterator src_it(src, win);
- Iterator dst_it(dst, win);
-
- const DataType dt = src->info()->data_type();
- const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
-
- },
- src_it, dst_it);
- }
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
- vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr()) + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = src_it.ptr();
- const auto out_ptr = dst_ptr + dst_it.offset();
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
- }
-}
-
-const char *CpuConcatenateHeightKernel::name() const
-{
- return "CpuConcatenateHeightKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuConcatenateHeightKernel.h b/src/core/cpu/kernels/CpuConcatenateHeightKernel.h
deleted file mode 100644
index 609bb21da7..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateHeightKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CONCATENATEHEIGHT_KERNEL_H
-#define ARM_COMPUTE_CPU_CONCATENATEHEIGHT_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the height concatenate kernel.
- * The source tensor will be concatenated into the destination tensor.
- */
-class CpuConcatenateHeightKernel : public ICpuKernel
-{
-public:
- CpuConcatenateHeightKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateHeightKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor info. Data types supported: All
- * @param[in] height_offset The starting offset on the Y axis for the output tensor.
- * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
- *
- */
- void configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuConcatenateHeightKernel
- *
- * @param[in] src Source tensor info. Data types supported: All
- * @param[in] height_offset The starting offset on the Y axis for the output tensor.
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- unsigned int _height_offset;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONCATENATEHEIGHT_KERNEL_H */
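A concatenation operator typically configures one of these kernels per source, advancing the offset by each source's extent along the concatenated axis. A sketch of that pattern (make_height_concat_kernels and the srcs list are hypothetical, not names from this tree):

    #include <memory>
    #include <vector>

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/core/Window.h"
    #include "src/core/cpu/kernels/CpuConcatenateHeightKernel.h"

    using namespace arm_compute;

    std::vector<std::unique_ptr<cpu::kernels::CpuConcatenateHeightKernel>>
    make_height_concat_kernels(const std::vector<const ITensorInfo *> &srcs, ITensorInfo *dst)
    {
        std::vector<std::unique_ptr<cpu::kernels::CpuConcatenateHeightKernel>> kernels;
        unsigned int height_offset = 0;
        for(const ITensorInfo *src : srcs)
        {
            ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuConcatenateHeightKernel::validate(src, height_offset, dst));
            auto k = std::make_unique<cpu::kernels::CpuConcatenateHeightKernel>();
            k->configure(src, height_offset, dst);
            height_offset += src->dimension(Window::DimY); // next source starts below this one
            kernels.emplace_back(std::move(k));
        }
        return kernels;
    }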
diff --git a/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp
deleted file mode 100644
index effcbc336c..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuConcatenateWidthKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0));
-
- for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
- }
-
- return Status{};
-}
-} // namespace
-
-CpuConcatenateWidthKernel::CpuConcatenateWidthKernel()
- : _width_offset(0)
-{
-}
-
-void CpuConcatenateWidthKernel::configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst));
- ARM_COMPUTE_UNUSED(dst);
-
- _width_offset = width_offset;
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
-
- ICpuKernel::configure(win);
-}
-
-Status CpuConcatenateWidthKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst));
- return Status{};
-}
-
-void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- // Offset output pointer to the correct position
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _width_offset * dst->info()->strides_in_bytes()[0];
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
- constexpr int window_step_x = 16;
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator src_it(src, win);
- Iterator dst_it(dst, win);
- const DataType dt = src->info()->data_type();
- const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
- vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr() + x)), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = src_it.ptr();
- const auto out_ptr = dst_ptr + dst_it.offset();
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
- }
-}
-
-const char *CpuConcatenateWidthKernel::name() const
-{
- return "CpuConcatenateWidthKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuConcatenateWidthKernel.h b/src/core/cpu/kernels/CpuConcatenateWidthKernel.h
deleted file mode 100644
index afdc3ccddd..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateWidthKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CPU_CONCATENATEWIDTH_KERNEL_H
-#define ARM_COMPUTE_CPU_CONCATENATEWIDTH_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the width concatenate kernel.
- * The source tensor will be concatenated into the destination tensor.
- */
-class CpuConcatenateWidthKernel : public ICpuKernel
-{
-public:
- CpuConcatenateWidthKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateWidthKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor info. Data types supported: All
- * @param[in] width_offset The offset on the X axis.
- * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
- */
- void configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuConcatenateWidthKernel
- *
- * @param[in] src Source tensor info. Data types supported: All
- * @param[in] width_offset The offset on the X axis.
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- unsigned int _width_offset;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONCATENATEWIDTH_KERNEL_H */
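Note that run_op above measures the inner X loop in bytes: window_end_x is scaled by the element size and the vector step is a fixed 16 bytes, so one loop structure serves every data type. A plain-C++ sketch of those loop bounds (memcpy stands in for the 16-byte vector load/store):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // For a row of 10 F32 elements: row_bytes = 10 * 4 = 40; the vector loop
    // covers bytes [0, 32) and the scalar tail copies bytes [32, 40).
    void copy_row_bytes(const uint8_t *in, uint8_t *out, size_t row_bytes)
    {
        size_t x = 0;
        for(; x + 16 <= row_bytes; x += 16) // vectorised body, 16 bytes per step
        {
            std::memcpy(out + x, in + x, 16);
        }
        for(; x < row_bytes; ++x) // left-over bytes
        {
            out[x] = in[x];
        }
    }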
diff --git a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
deleted file mode 100644
index d91ee64ecf..0000000000
--- a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-CpuConvertFullyConnectedWeightsKernel::CpuConvertFullyConnectedWeightsKernel()
- : _factor1(0), _factor2(0)
-{
-}
-
-void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape,
- DataLayout data_layout)
-
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Output tensor auto initialisation if not yet initialized
- auto_init_if_empty(*dst, *src->clone());
-
- ARM_COMPUTE_ERROR_THROW_ON(CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout));
-
- const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
-
- const int width_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT);
- const int channel_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::CHANNEL);
-
- const unsigned int num_elems_per_input_plane = original_input_shape[width_idx] * original_input_shape[height_idx];
- const unsigned int num_channels = original_input_shape[channel_idx];
-
- _factor1 = (data_layout == DataLayout::NCHW) ? num_elems_per_input_plane : num_channels;
- _factor2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_input_plane;
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
- ICpuKernel::configure(win);
-}
-
-Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape,
- DataLayout data_layout)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() != 2);
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(1) != original_input_shape.total_size_lower(3));
- ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
-
- // Checks performed when dst is configured
- if((dst != nullptr) && (dst->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
-
- return Status{};
-}
-
-template <typename T>
-void CpuConvertFullyConnectedWeightsKernel::run_convert_fc_weights(const ITensor *in, ITensor *out, const Window &window)
-{
- const unsigned int dst_stride_x = out->info()->strides_in_bytes().x();
- const unsigned int dst_stride_y = out->info()->strides_in_bytes().y();
-
- Iterator input(in, window);
- Iterator output(out, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- *reinterpret_cast<T *>(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y) = *reinterpret_cast<T *>(input.ptr());
- },
- input);
-}
-
-void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- switch(src->info()->element_size())
- {
- case 1:
- run_convert_fc_weights<uint8_t>(src, dst, window);
- break;
- case 2:
- run_convert_fc_weights<uint16_t>(src, dst, window);
- break;
- case 4:
- run_convert_fc_weights<uint32_t>(src, dst, window);
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- break;
- }
-}
-
-const char *CpuConvertFullyConnectedWeightsKernel::name() const
-{
- return "CpuConvertFullyConnectedWeightsKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
deleted file mode 100644
index c867e3deeb..0000000000
--- a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CONVERTFULLYCONNECTEDWEIGHTS_KERNEL_H
-#define ARM_COMPUTE_CPU_CONVERTFULLYCONNECTEDWEIGHTS_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
- *
- * @note This function can be applied to the 2D weights used by a Fully Connected layer if:
- * - It follows a Convolution layer
- * - The data layout used by the network does not match the one the model has been trained in.
- *
- * @note This function assumes the weights are already reshaped (transposed)
- */
-class CpuConvertFullyConnectedWeightsKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuConvertFullyConnectedWeightsKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertFullyConnectedWeightsKernel);
- /** Set the src and dst tensor.
- *
- * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
- * @param[in] dst The converted weights tensor info. Shape and Data Type: Same as @p src.
- * @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuConvertFullyConnectedWeightsKernel
- *
- * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
- * @param[in] dst The converted weights tensor info. Shape and Data Type: Same as @p src.
- * @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
-    unsigned int _factor1; /* equal to the number of elements per original src plane if @p data_layout == NCHW; to its number of channels otherwise */
-    unsigned int _factor2; /* equal to the number of elements per original src plane if @p data_layout == NHWC; to its number of channels otherwise */
-
- /** Template function to run the permute
- *
- * @param[in] in Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
- * @param[in] out The converted weights tensor info. Shape and Data Type: Same as @p in.
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T>
- void run_convert_fc_weights(const ITensor *in, ITensor *out, const Window &window);
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_CONVERTFULLYCONNECTEDWEIGHTS_KERNEL_H */ \ No newline at end of file
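The destination row index computed in run_convert_fc_weights, id.y() % _factor1 * _factor2 + id.y() / _factor1, is the standard flat-index transpose between the (plane, channel) and (channel, plane) row orderings. A worked sketch with toy sizes (values illustrative):

    #include <cstdio>

    int main()
    {
        // E.g. weights trained in NCHW with a 2x2 plane and 3 channels:
        // _factor1 = elements per plane = 4, _factor2 = channels = 3.
        const unsigned factor1 = 4;
        const unsigned factor2 = 3;
        for(unsigned y = 0; y < factor1 * factor2; ++y)
        {
            std::printf("src row %2u -> dst row %2u\n", y, (y % factor1) * factor2 + y / factor1);
        }
        return 0;
    }

Rows that were contiguous per channel become interleaved per plane element, which is exactly the NCHW-to-NHWC reshuffle of the flattened weights.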
diff --git a/src/core/cpu/kernels/CpuCopyKernel.cpp b/src/core/cpu/kernels/CpuCopyKernel.cpp
deleted file mode 100644
index 8ec354b2aa..0000000000
--- a/src/core/cpu/kernels/CpuCopyKernel.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuCopyKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList())
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4);
-
- // Validate destination if initialized
- if(dst->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst)
-{
-    // Destination auto initialization if not yet initialized
- auto_init_if_empty(*dst, *src);
- return std::make_pair(Status{}, calculate_max_window(*dst));
-}
-
-std::pair<Status, Window> validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding)
-{
- const TensorShape src_shape = src->tensor_shape();
- const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(src_shape, padding);
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(padded_shape));
- // Configure window
- const Window win = calculate_max_window(*dst, dst->dimension(0));
- return std::make_pair(Status{}, win);
-}
-
-} // namespace
-
-void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, padding));
-
- _padding = padding;
-
- std::pair<Status, Window> win_config;
- if(padding.empty())
- {
- win_config = validate_and_configure_window(src, dst);
- }
- else
- {
- win_config = validate_and_configure_window_with_padding(src, dst, padding);
- }
-
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICpuKernel::configure(win_config.second);
-}
-
-Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, const PaddingList &padding)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, padding));
-
- if(padding.empty())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first);
- }
-
- return Status{};
-}
-
-void CpuCopyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- if(_padding.empty())
- {
- Window dst_window{ window };
- dst_window.set(Window::DimX, Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0)));
- Window out_slice = dst_window.first_slice_window_1D();
- do
- {
- Iterator src_it(src, out_slice);
- Iterator dst_it(dst, out_slice);
-
- execute_window_loop(out_slice, [&](const Coordinates &)
- {
- memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size());
- },
- src_it, dst_it);
- }
- while(dst_window.slide_window_slice_1D(out_slice));
- }
- else
- {
- Window src_window{ window };
- src_window.set(Window::DimX, Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0)));
-
- Iterator src_it(src, src_window);
- Iterator dst_it(dst, window);
- const size_t row_size_in_bytes = src->info()->dimension(0) * src->info()->element_size();
- execute_window_loop(window, [&](const Coordinates &)
- {
- auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size();
- std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes);
- },
- src_it, dst_it);
- }
-}
-
-const char *CpuCopyKernel::name() const
-{
- return "CpuCopyKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuCopyKernel.h b/src/core/cpu/kernels/CpuCopyKernel.h
deleted file mode 100644
index 98b79a964c..0000000000
--- a/src/core/cpu/kernels/CpuCopyKernel.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_COPY_KERNEL_H
-#define ARM_COMPUTE_CPU_COPY_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Kernel to perform a copy between two tensors */
-class CpuCopyKernel : public ICpuKernel
-{
-public:
- CpuCopyKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCopyKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor. Data types supported: All
- * @param[out] dst Destination tensor. Data types supported: same as @p src.
- * @param[in] padding (Optional) Padding to be applied to the input tensor
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding = PaddingList());
- /** Static function to check if given info will lead to a valid configuration of @ref CpuCopyKernel
- *
- * @param[in] src Source tensor. Data types supported: All
- * @param[in] dst Destination tensor. Data types supported: same as @p src.
- * @param[in] padding (Optional) Padding to be applied to the input tensor
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList());
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- PaddingList _padding{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_COPY_KERNEL_H */
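When a PaddingList is supplied, the kernel above auto-initialises the destination to the padded shape and writes each source row at an X offset of padding[0].first. A minimal configuration sketch (shapes illustrative; running it then follows the same ITensorPack/schedule_op pattern as the other kernels):

    #include "arm_compute/runtime/Tensor.h"
    #include "src/core/cpu/kernels/CpuCopyKernel.h"

    using namespace arm_compute;

    void padded_copy_sketch()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));

        cpu::kernels::CpuCopyKernel kernel;
        const PaddingList padding = { { 1, 1 } }; // 1 element before and after on dimension 0
        kernel.configure(src.info(), dst.info(), padding); // dst becomes 10x4

        src.allocator()->allocate();
        dst.allocator()->allocate(); // allocate after configure so dst has its padded shape
    }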
diff --git a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
deleted file mode 100644
index 4ddb35f2d5..0000000000
--- a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
+++ /dev/null
@@ -1,919 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
-#include "src/core/NEON/wrapper/traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-constexpr auto data_layout = DataLayout::NHWC;
-const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
-constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0);
-constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1);
-constexpr size_t vector_size = 8;
-
-struct DepthwiseConvolutionRunInfo
-{
- const size_t num_read_elements_per_iteration;
- const uint32_t x_start;
- const uint32_t x_end;
- const uint32_t x_step;
- const uint32_t x_leftover_start;
- const size_t input_stride_y;
- const size_t input_stride_z;
- const size_t input_max_offset;
- const size_t weights_width;
- const size_t weights_height;
- const size_t weights_stride_y;
- const size_t weights_stride_z;
- const size_t conv_stride_x;
- const size_t conv_stride_y;
- const size_t conv_pad_left;
- const size_t conv_pad_top;
- const size_t input_height;
- const size_t input_width;
- const size_t input_depth;
-
- DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT
- : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
- x_start(w.x().start()),
- x_end(w.x().end()),
- x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
- x_leftover_start(std::max(static_cast<int32_t>(w.x().end()) - static_cast<int32_t>(x_step) + 1, int32_t(0))),
- input_stride_y(input.strides_in_bytes().y()),
- input_stride_z(input.strides_in_bytes().z()),
- input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
- weights_width(weights.dimension(width_idx)),
- weights_height(weights.dimension(height_idx)),
- weights_stride_y(weights.strides_in_bytes().y()),
- weights_stride_z(weights.strides_in_bytes().z()),
- conv_stride_x(conv_info.stride().first),
- conv_stride_y(conv_info.stride().second),
- conv_pad_left(conv_info.pad_left()),
- conv_pad_top(conv_info.pad_top()),
- input_height(input.dimension(height_idx)),
- input_width(input.dimension(width_idx)),
- input_depth(input.dimension(channel_idx))
- {
- }
-};
-
-inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
-{
- const int32_t current_h = base_h + h * dilation.y();
- const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
-
- const int32_t current_w = base_w + w * dilation.x();
- const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);
-
- return is_valid_h && is_valid_w;
-}
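// Editorial sketch (not part of the original source): the check above in miniature.
// A kernel tap (w, h) anchored at a possibly negative base coordinate samples
// base + tap * dilation; taps that land in the padding region are treated as zero.
// The names below are illustrative, not taken from the library.
#include <cassert>
#include <cstdint>

static bool tap_in_bounds(int32_t base, uint32_t tap, uint32_t dilation, int32_t extent)
{
    const int32_t coord = base + static_cast<int32_t>(tap * dilation);
    return coord >= 0 && coord < extent;
}

int main()
{
    // 3-wide kernel, dilation 2, anchored at column -1 of a 5-column row:
    // the taps sample columns -1, 1 and 3, so only the first one is padded out.
    assert(!tap_in_bounds(-1, 0, 2, 5));
    assert(tap_in_bounds(-1, 1, 2, 5));
    assert(tap_in_bounds(-1, 2, 2, 5));
    return 0;
}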
-
-template <typename T>
-void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, const Window &window, bool has_biases)
-{
- constexpr auto element_per_vector = vector_size / sizeof(T);
- using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
- using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
-
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
-
- const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});
-
- Window execution_window = window;
- execution_window.set(Window::DimX, dim_single_unit_step);
-
- Window win_input = window;
- win_input.set(Window::DimX, dim_manual_loop);
- win_input.set(Window::DimY, dim_manual_loop);
- win_input.set(Window::DimZ, dim_manual_loop);
-
- Window win_weights = win_input;
- win_weights.set(Window::DimW, dim_manual_loop);
-
- Window win_output = window;
- win_output.set(Window::DimX, dim_manual_loop);
-
- Iterator input_it(src, win_input);
- Iterator weights_it(weights, win_weights);
- Iterator output_it(dst, win_output);
- Iterator biases_it{};
-
- if(has_biases)
- {
- biases_it = Iterator(biases, win_weights);
- }
-
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
- auto const base_weights_ptr = weights_it.ptr();
- uint32_t x = run_info.x_start;
-
- for(; x < run_info.x_leftover_start; x += run_info.x_step)
- {
- VectorType acc = zero_vector;
- auto weights_ptr = base_weights_ptr;
- int64_t input_offset = base_input_offset;
-
- for(uint32_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(uint32_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ?
- wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
- zero_vector;
- const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
- acc = wrapper::vmla(acc, weights_vals, input_vals);
-
- offs += dilation.x() * run_info.input_stride_y;
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- if(has_biases)
- {
- const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
- acc = wrapper::vadd(acc, biases_vals);
- }
-
- wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
- }
-
- for(; x < run_info.x_end; ++x)
- {
- auto acc_scalar = T{ 0 };
- auto weights_ptr = base_weights_ptr;
- int64_t input_offset = base_input_offset;
-
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
- const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
- acc_scalar += (input_vals * weights_vals);
-
- offs += dilation.x() * run_info.input_stride_y;
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- if(has_biases)
- {
- const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
- acc_scalar += biases_vals;
- }
- *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
- }
- },
- input_it, weights_it, biases_it, output_it);
-}
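// Editorial sketch (not part of the original source): the vector/leftover split used by
// the loop above, in isolation. x_leftover_start = max(x_end - x_step + 1, 0) is the
// first index from which a full vector of x_step elements would read past x_end; x
// carries over from the vector loop into the scalar tail, so every element is visited
// exactly once.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    const int32_t x_start = 0, x_end = 10, x_step = 4;
    const int32_t x_leftover_start = std::max<int32_t>(x_end - x_step + 1, 0); // 7 here

    int32_t x = x_start;
    for(; x < x_leftover_start; x += x_step)
    {
        std::printf("vector iteration over [%d, %d]\n", x, x + x_step - 1); // [0,3], [4,7]
    }
    for(; x < x_end; ++x)
    {
        std::printf("scalar iteration at %d\n", x); // 8, 9
    }
    return 0;
}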
-
-template <typename T>
-void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
-{
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
-
- Window execution_window = window;
- execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
-
- Window win_input = execution_window;
- win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
- win_input.set(Window::DimY, dim_manual_loop);
- win_input.set(Window::DimZ, dim_manual_loop);
-
- Window win_weights = window;
- win_weights.set_dimension_step(Window::DimX, run_info.x_step);
- win_weights.set(Window::DimY, dim_manual_loop);
- win_weights.set(Window::DimZ, dim_manual_loop);
- win_weights.set(Window::DimW, dim_manual_loop);
-
- Window win_output = window;
- win_output.set_dimension_step(Window::DimX, run_info.x_step);
-
- Iterator input_it(src, win_input);
- Iterator weights_it(weights, win_weights);
- Iterator output_it(dst, win_output);
- Iterator biases_it{};
-
- if(has_biases)
- {
- biases_it = Iterator(biases, win_weights);
- }
-
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::vector<T> acc(depth_multiplier, static_cast<T>(0));
-
-        const int32_t input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
-        const int32_t input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
-        int64_t       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
-            int64_t offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
-
- for(size_t m = 0; m < depth_multiplier; ++m)
- {
- const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
- acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
- }
-
- offs += dilation.x() * run_info.input_stride_y;
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- if(has_biases)
- {
- for(size_t m = 0; m < depth_multiplier; ++m)
- {
- const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
- }
- }
- else
- {
- for(size_t m = 0; m < depth_multiplier; ++m)
- {
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
- }
- }
- },
- input_it, weights_it, biases_it, output_it);
-}
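// Editorial sketch (not part of the original source): with a depth multiplier M, every
// input channel c produces M consecutive output channels, which is why the loop above
// keeps M accumulators and writes them out contiguously. Assuming NHWC with the channel
// dimension innermost:
#include <cstdio>

int main()
{
    const int input_channels = 3, depth_multiplier = 2;
    for(int c = 0; c < input_channels; ++c)
    {
        for(int m = 0; m < depth_multiplier; ++m)
        {
            std::printf("input channel %d, multiplier %d -> output channel %d\n",
                        c, m, c * depth_multiplier + m);
        }
    }
    return 0;
}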
-
-template <typename T, typename TW>
-void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
-{
- constexpr auto element_per_vector = vector_size / sizeof(T);
- using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
- using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
- using AccType = int32_t;
- using AccArrayType = std::array<AccType, element_per_vector>;
-
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
- const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
-
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
-
- const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
- const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
- const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
- const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
-
- Window execution_window = window;
- execution_window.set(Window::DimX, dim_single_unit_step);
-
- Window win_input = window;
- win_input.set(Window::DimX, dim_manual_loop);
- win_input.set(Window::DimY, dim_manual_loop);
- win_input.set(Window::DimZ, dim_manual_loop);
-
- Window win_weights = win_input;
- win_weights.set(Window::DimW, dim_manual_loop);
-
- Window win_output = window;
- win_output.set(Window::DimX, dim_manual_loop);
-
- Iterator input_it(src, win_input);
- Iterator weights_it(weights, win_weights);
- Iterator output_it(dst, win_output);
- Iterator biases_it{};
-
- if(has_biases)
- {
- biases_it = Iterator(biases, win_weights);
- }
-
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto const base_weights_ptr = weights_it.ptr();
- size_t x = run_info.x_start;
-
- for(; x < run_info.x_leftover_start; x += run_info.x_step)
- {
- AccArrayType acc{};
- AccArrayType in_sum{};
- AccArrayType we_sum{};
-
- auto weights_ptr = base_weights_ptr;
- auto input_offset = base_input_offset;
-
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ?
- wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
- out_of_bound_vector;
- const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
- for(size_t i = 0; i < element_per_vector; ++i)
- {
- acc.at(i) += input_vals[i] * weights_vals[i];
- in_sum.at(i) += input_vals[i];
- we_sum.at(i) += weights_vals[i];
- }
-
- offs += dilation.x() * run_info.input_stride_y;
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
- for(size_t i = 0; i < element_per_vector; ++i)
- {
- acc.at(i) -= in_sum.at(i) * weights_qoffset;
- acc.at(i) -= we_sum.at(i) * input_qoffset;
- acc.at(i) += k_offset;
-
- if(has_biases)
- {
- acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
- }
-
- const int32_t out_mul = output_multiplier.at(x + i);
- const int32_t out_shift = output_shift.at(x + i);
- if(out_shift < 0)
- {
- acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
- {
- acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
- }
- out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
- }
-
- wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
- }
-
- // left-over
- for(; x < run_info.x_end; ++x)
- {
- AccType acc = 0;
- AccType in_sum = 0;
- AccType we_sum = 0;
-
- auto weights_ptr = base_weights_ptr;
- auto input_offset = base_input_offset;
-
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ?
- *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
- out_of_bound_value;
- const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
- acc += input_val * weights_val;
- in_sum += input_val;
- we_sum += weights_val;
-
- offs += dilation.x() * run_info.input_stride_y;
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- T out_vals{ 0 };
-
- acc -= in_sum * weights_qoffset;
- acc -= we_sum * input_qoffset;
- acc += k_offset;
-
- if(has_biases)
- {
- acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
- }
-
- const int32_t out_mul = output_multiplier.at(x);
- const int32_t out_shift = output_shift.at(x);
-
- if(out_shift < 0)
- {
- acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
- {
- acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
- }
-
- out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
- *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
- }
- },
- input_it, weights_it, biases_it, output_it);
-}
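// Editorial sketch (not part of the original source): the accumulator bookkeeping above
// relies on the expansion
//   sum((x - zx) * (w - zw)) = sum(x*w) - zw*sum(x) - zx*sum(w) + N*zx*zw,
// which is why the loop tracks the raw products plus the two running sums and folds in
// k_offset = N*zx*zw afterwards. A quick numerical check of the identity:
#include <cassert>
#include <cstdint>

int main()
{
    const int32_t zx = 5, zw = -3; // input and weights zero points
    const int32_t x[4] = { 7, 0, 255, 12 };
    const int32_t w[4] = { -1, 4, 9, -8 };

    int32_t direct = 0, acc = 0, in_sum = 0, we_sum = 0;
    for(int i = 0; i < 4; ++i)
    {
        direct += (x[i] - zx) * (w[i] - zw); // offset-corrected product
        acc += x[i] * w[i];                  // raw product, as in the kernel
        in_sum += x[i];
        we_sum += w[i];
    }
    const int32_t k_offset = 4 * zx * zw;
    assert(direct == acc - zw * in_sum - zx * we_sum + k_offset);
    return 0;
}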
-
-template <typename T, typename TW>
-void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
-{
- using AccType = int32_t;
-
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
-
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
-
- const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
- const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
- const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
- const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
-
- Window execution_window = window;
- execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
-
- Window win_input = execution_window;
- win_input.set(Window::DimY, dim_manual_loop);
- win_input.set(Window::DimZ, dim_manual_loop);
-
- Window win_weights = window;
- win_weights.set_dimension_step(Window::DimX, run_info.x_step);
- win_weights.set(Window::DimY, dim_manual_loop);
- win_weights.set(Window::DimZ, dim_manual_loop);
- win_weights.set(Window::DimW, dim_manual_loop);
-
- Window win_output = window;
- win_output.set_dimension_step(Window::DimX, run_info.x_step);
-
- Iterator input_it(src, win_input);
- Iterator weights_it(weights, win_weights);
- Iterator output_it(dst, win_output);
- Iterator biases_it{};
-
- if(has_biases)
- {
- biases_it = Iterator(biases, win_weights);
- }
-
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::vector<AccType> acc(depth_multiplier, 0);
- std::vector<AccType> we_sum(depth_multiplier, 0);
- AccType in_sum = 0;
-
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
-            int64_t offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
-
- for(size_t m = 0; m < depth_multiplier; ++m)
- {
-                    const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(TW) + w * run_info.weights_stride_y));
- acc.at(m) += input_val * weights_val;
-
- we_sum.at(m) += weights_val;
- }
-
- offs += dilation.x() * run_info.input_stride_y;
- in_sum += input_val;
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- for(size_t m = 0; m < depth_multiplier; ++m)
- {
- acc.at(m) -= in_sum * weights_qoffset;
- acc.at(m) -= we_sum.at(m) * input_qoffset;
- acc.at(m) += k_offset;
-
- if(has_biases)
- {
- acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
- }
-
- const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
- const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
- if(out_shift < 0)
- {
- acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
- {
- acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
- }
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
- }
- },
- input_it, weights_it, biases_it, output_it);
-}
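// Editorial sketch (not part of the original source): scalar references for the two
// fixed-point helpers used in the requantization above, assuming gemmlowp-style
// semantics; the library's own implementations are the authoritative versions.
#include <cassert>
#include <cstdint>
#include <limits>

// Rounding doubling high multiply: round(a * b / 2^31), saturating on the single
// overflowing input pair a == b == INT32_MIN (matches the SQRDMULH instruction).
static int32_t sat_doubling_high_mul(int32_t a, int32_t b)
{
    if(a == std::numeric_limits<int32_t>::min() && b == a)
    {
        return std::numeric_limits<int32_t>::max();
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int32_t nudge = (ab >= 0) ? (1 << 30) : (1 - (1 << 30));
    return static_cast<int32_t>((ab + nudge) / (INT64_C(1) << 31));
}

// Divide by 2^exponent with round-to-nearest; assumes arithmetic right shift.
static int32_t rounding_divide_by_pot(int32_t x, int32_t exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

int main()
{
    // Requantize 1000 with multiplier 0.5 (as Q0.31) and right shift 1: 1000 * 0.5 / 2 = 250.
    const int32_t half_q31 = 1 << 30;
    assert(rounding_divide_by_pot(sat_doubling_high_mul(1000, half_q31), 1) == 250);
    return 0;
}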
-
-template <typename T, typename TW>
-void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
-{
- constexpr int half_vec = vector_size / 2;
-
- using AccType = int32_t;
- using AccVectorType = typename wrapper::traits::neon_vector<AccType, half_vec>::type;
- using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
- using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
-
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
-
- const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
- const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
- const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
-
- const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
- const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
- const auto zero = wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});
-
- const auto out_mul = output_multiplier.at(0);
- const auto out_shift = output_shift.at(0);
-
- Window execution_window = window;
- execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
-
- Window win_input = execution_window;
- win_input.set(Window::DimY, dim_manual_loop);
- win_input.set(Window::DimZ, dim_manual_loop);
-
- Window win_weights = window;
- win_weights.set_dimension_step(Window::DimX, run_info.x_step);
- win_weights.set(Window::DimY, dim_manual_loop);
- win_weights.set(Window::DimZ, dim_manual_loop);
- win_weights.set(Window::DimW, dim_manual_loop);
-
- Window win_output = window;
- win_output.set_dimension_step(Window::DimX, run_info.x_step);
-
- Iterator input_it(src, win_input);
- Iterator weights_it(weights, win_weights);
- Iterator output_it(dst, win_output);
- Iterator biases_it{};
-
- if(has_biases)
- {
- biases_it = Iterator(biases, win_weights);
- }
-
- std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
- std::vector<AccVectorType> acc1(depth_multiplier / vector_size);
-
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::fill(begin(acc0), end(acc0), zero);
- std::fill(begin(acc1), end(acc1), zero);
-
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- const int32_t current_h = input_z + h * dilation.y();
- if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
- {
-            int64_t offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const int32_t current_w = input_y + w * dilation.x();
- if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
- {
- const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
- const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
- const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
-
- for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
- {
-                        const auto weights_8x8     = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(TW) + w * run_info.weights_stride_y));
- const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
- const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
-
- acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));
- acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));
- }
- }
-
- offs += dilation.x() * run_info.input_stride_y;
- }
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
- {
- if(has_biases)
- {
- const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
- const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
-
- acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
- acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
- }
-
- if(out_shift < 0)
- {
- acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
- acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
- }
- else
- {
- acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
- acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
- }
-
- acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
- acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
-
- const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)),
- wrapper::vmovn(acc1.at(i)));
-
- if(std::is_same<T, uint8_t>::value)
- {
- wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
- }
- else
- {
- wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val));
- }
- }
- },
- input_it, weights_it, biases_it, output_it);
-}
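// Editorial sketch (not part of the original source): the low/high accumulator split
// used above, in isolation. Widening an 8x16-bit vector while multiply-accumulating
// needs two 4x32-bit accumulators, one per half. Assumes a NEON-capable target.
#include <arm_neon.h>

static void widening_mla(int32x4_t &acc_lo, int32x4_t &acc_hi, int16x8_t a, int16x8_t b)
{
    acc_lo = vmlal_s16(acc_lo, vget_low_s16(a), vget_low_s16(b));   // lanes 0..3
    acc_hi = vmlal_s16(acc_hi, vget_high_s16(a), vget_high_s16(b)); // lanes 4..7
}

int main()
{
    int32x4_t lo = vdupq_n_s32(0), hi = vdupq_n_s32(0);
    widening_mla(lo, hi, vdupq_n_s16(3), vdupq_n_s16(4));
    return vgetq_lane_s32(lo, 0) == 12 ? 0 : 1; // 3 * 4 accumulated per lane
}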
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
- ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1));
- ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1));
-
- if(is_data_type_quantized_per_channel(weights->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
- }
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
-
- if(is_data_type_quantized_asymmetric(src->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- }
- }
-
- if(dst->total_size() != 0)
- {
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- return Status{};
-}
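// Editorial sketch (not part of the original source): the two dilation checks above
// compare the effective (dilated) kernel extent against the padded input extent,
//   k_eff = k + (k - 1) * (d - 1),
// per spatial dimension. A quick check of the formula:
#include <cassert>

static int effective_kernel_extent(int k, int d)
{
    return k + (k - 1) * (d - 1);
}

int main()
{
    assert(effective_kernel_extent(3, 1) == 3); // no dilation: unchanged
    assert(effective_kernel_extent(3, 2) == 5); // 3 taps spaced 2 apart span 5 pixels
    // The configuration is valid when k_eff <= input extent + pad_before + pad_after.
    return 0;
}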
-} // namespace
-
-CpuDepthwiseConv2dNativeKernel::CpuDepthwiseConv2dNativeKernel()
- : _func(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
-{
-}
-
-void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, info));
-
- _conv_info = info.pad_stride_info;
- _depth_multiplier = info.depth_multiplier;
- _dilation = info.dilation;
- _has_biases = (biases != nullptr);
-
- if(is_data_type_quantized(src->data_type()))
- {
- const auto input_scale = src->quantization_info().uniform().scale;
- const auto output_scale = dst->quantization_info().uniform().scale;
-
- auto weights_scale = weights->quantization_info().scale();
- if(!is_data_type_quantized_per_channel(weights->data_type()))
- {
- for(size_t i = 1; i < weights->dimension(channel_idx); ++i)
- {
- weights_scale.push_back(weights_scale.front());
- }
- }
-
- for(const auto &s : weights_scale)
- {
- int32_t out_mult = 0;
- int32_t out_shift = 0;
- const float multiplier = input_scale * s / output_scale;
- arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);
-
- _output_multiplier.push_back(out_mult);
- _output_shift.push_back(out_shift);
- }
- }
-
- switch(weights->data_type())
- {
- case DataType::QASYMM8:
- _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, uint8_t>;
- break;
- case DataType::QASYMM8_SIGNED:
- _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
- break;
- case DataType::QSYMM8_PER_CHANNEL:
- if(src->data_type() == DataType::QASYMM8)
- {
- _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, int8_t>;
- }
- else
- {
- _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
- }
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float16_t, float16_t>;
- break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float, float>;
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
-
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
- auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info()));
-
- Window win = calculate_max_window(*dst, Steps());
- ICpuKernel::configure(win);
-}
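// Editorial sketch (not part of the original source): a plausible decomposition of the
// real-valued rescale factor into the (int32 multiplier, shift) pair consumed by the
// requantization loops; calculate_quantized_multiplier() in the library is the
// authoritative version, so treat this as an illustration of the idea only.
#include <cassert>
#include <cmath>
#include <cstdint>

static void decompose(float multiplier, int32_t *quant_mult, int32_t *right_shift)
{
    int exponent = 0;
    const double q = std::frexp(multiplier, &exponent); // q in [0.5, 1), multiplier = q * 2^exponent
    int64_t q_fixed = static_cast<int64_t>(std::llround(q * (INT64_C(1) << 31)));
    if(q_fixed == (INT64_C(1) << 31)) // rounding can spill over; renormalize
    {
        q_fixed /= 2;
        ++exponent;
    }
    *quant_mult  = static_cast<int32_t>(q_fixed); // Q0.31 fixed-point mantissa
    *right_shift = -exponent;                     // positive: divide; negative: multiply
}

int main()
{
    int32_t m = 0, s = 0;
    decompose(0.25f, &m, &s);
    assert(s == 1);         // 0.25 = 0.5 * 2^-1 -> shift right by 1
    assert(m == (1 << 30)); // 0.5 in Q0.31
    return 0;
}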
-
-Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info));
- return Status{};
-}
-
-template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::FloatEnabler<T>>
-void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- if(_depth_multiplier == 1)
- {
- depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, _conv_info, _dilation, window, has_biases);
- }
- else
- {
- depthwise_loop_generic_fp<T>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, window, has_biases);
- }
-}
-
-template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::Quantized8bitEnabler<T>>
-void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- if(_depth_multiplier == 1)
- {
- depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
- }
- else
- {
- const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0);
- const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));
-
- if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8)
- {
- depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
- }
- else
- {
- depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
- }
- }
-}
-
-void CpuDepthwiseConv2dNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- const auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- (this->*_func)(src, weights, biases, dst, window, _has_biases);
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
deleted file mode 100644
index 559c46dc93..0000000000
--- a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H
-#define ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H
-
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "support/Requires.h"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include <arm_neon.h>
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the kernel to run a native depthwise convolution on a tensor. */
-class CpuDepthwiseConv2dNativeKernel : public ICpuKernel
-{
-public:
- const char *name() const override
- {
- return "CpuDepthwiseConv2dNativeKernel";
- }
- /** Default constructor */
- CpuDepthwiseConv2dNativeKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dNativeKernel);
-
- /** Initialize the function's source, destination and parameters.
- *
- * @note Supported data layouts: NHWC
- *
- * @param[in]  src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in]  weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H].
- *                     Data type supported: same as @p src, or QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
- * @param[in]  biases  Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- *                     Data type supported: same as @p src, or S32 when @p src is QASYMM8/QASYMM8_SIGNED.
- * @param[out] dst     Destination tensor info. Data type supported: same as @p src.
- * @param[in] info Depthwise convolution meta-data.
- *
- */
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDepthwiseConv2dNativeKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-
-private:
- template <typename T>
-    using FloatEnabler = typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, int>::type;
-
-    template <typename T, typename TW, FloatEnabler<T> = 0>
- void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
-
- template <typename T>
-    using Quantized8bitEnabler = typename std::enable_if<std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int>::type;
-
-    template <typename T, typename TW, Quantized8bitEnabler<T> = 0>
- void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
-
- /** Common signature for all the specialised depthwise convolution native functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using DepthwiseFunctionPtr = void (CpuDepthwiseConv2dNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
-
- DepthwiseFunctionPtr _func;
- PadStrideInfo _conv_info;
- unsigned int _depth_multiplier;
- Size2D _dilation;
- std::vector<int> _output_multiplier;
- std::vector<int> _output_shift;
- bool _has_biases;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H */
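// Editorial sketch (not part of the original diff): the enable_if dispatch used by the
// two run_depthwise() overloads above, reduced to a free-standing example mirroring the
// FloatEnabler / Quantized8bitEnabler aliases. Exactly one overload is viable per
// instantiation, so the right path is chosen at compile time.
#include <cstdint>
#include <cstdio>
#include <type_traits>

template <typename T>
using FloatEnabler = typename std::enable_if<std::is_floating_point<T>::value, int>::type;

template <typename T>
using Quantized8bitEnabler =
    typename std::enable_if<std::is_same<T, std::uint8_t>::value || std::is_same<T, std::int8_t>::value, int>::type;

template <typename T, FloatEnabler<T> = 0>
void run(T) { std::puts("float path"); }

template <typename T, Quantized8bitEnabler<T> = 0>
void run(T) { std::puts("quantized 8-bit path"); }

int main()
{
    run(1.0f);              // selects the float overload
    run(std::uint8_t{ 3 }); // selects the quantized overload
    return 0;
}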
diff --git a/src/core/cpu/kernels/CpuDequantizeKernel.cpp b/src/core/cpu/kernels/CpuDequantizeKernel.cpp
deleted file mode 100644
index 42b5439697..0000000000
--- a/src/core/cpu/kernels/CpuDequantizeKernel.cpp
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuDequantizeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NESymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
-
- if(dst->tensor_shape().total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
-
- return Status{};
-}
-
-template <typename T>
-inline void store_result(T *ptr, const float32x4x4_t &v)
-{
- ARM_COMPUTE_UNUSED(ptr, v);
-}
-
-template <>
-inline void store_result<float>(float *ptr, const float32x4x4_t &v)
-{
- wrapper::vstore(ptr, v.val[0]);
- wrapper::vstore(ptr + 4, v.val[1]);
- wrapper::vstore(ptr + 8, v.val[2]);
- wrapper::vstore(ptr + 12, v.val[3]);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
-{
- wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
- wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3])));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <typename T>
-inline void store_result(T *ptr, const float32x4x2_t &v)
-{
- ARM_COMPUTE_UNUSED(ptr, v);
-}
-
-template <>
-inline void store_result<float>(float *ptr, const float32x4x2_t &v)
-{
- wrapper::vstore(ptr, v.val[0]);
- wrapper::vstore(ptr + 4, v.val[1]);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline void store_result<float16_t>(float16_t *ptr, const float32x4x2_t &v)
-{
- wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <typename TOut, typename TIn>
-void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window)
-{
- const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
- const float scale = qinfo.scale;
- const int32_t offset = qinfo.offset;
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win_collapsed);
- Iterator out(output, win_collapsed);
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr());
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, scale, offset);
-
- store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo));
- }
- },
- in, out);
-}
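// Editorial sketch (not part of the original source): the scalar tail above applies the
// standard asymmetric dequantization, real = scale * (q - offset), shown here in
// isolation with illustrative names.
#include <cassert>
#include <cstdint>

static float dequantize_asymm(uint8_t q, float scale, int32_t offset)
{
    return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
}

int main()
{
    // scale 0.5, zero point 10: the quantized value 14 maps back to 2.0f.
    assert(dequantize_asymm(14, 0.5f, 10) == 2.0f);
    return 0;
}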
-
-template <typename T>
-void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *output, const Window &window)
-{
- const auto scale = input->info()->quantization_info().scale();
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Reset first dimension to handle tail calculations manually
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win);
- Iterator out(output, win);
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, scale[id.z()]);
-
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int8_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()]));
- }
- },
- in, out);
-}
-
-template <typename T>
-void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *output, const Window &window)
-{
- const auto scale = input->info()->quantization_info().scale();
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Reset first dimension to handle tail calculations manually
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win);
- Iterator out(output, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t vscale =
- {
- {
- scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3],
- scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7],
- scale[x + 8], scale[x + 9], scale[x + 10], scale[x + 11],
- scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15]
- }
- };
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, vscale);
-
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int8_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x]));
- }
- },
- in, out);
-}
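// Editorial sketch (not part of the original source): the essential difference between
// the NHWC and NCHW per-channel paths above is which window coordinate indexes the
// per-channel scale vector — the innermost x in NHWC versus the z plane in NCHW.
#include <cstddef>

enum class Layout { NHWC, NCHW };

static std::size_t scale_index(Layout layout, std::size_t x, std::size_t z)
{
    return layout == Layout::NHWC ? x : z; // channel position depends on the layout
}

int main()
{
    return (scale_index(Layout::NHWC, 7, 2) == 7 && scale_index(Layout::NCHW, 7, 2) == 2) ? 0 : 1;
}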
-
-template <typename T>
-void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window)
-{
- const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
- const float scale = qinfo.scale;
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win_collapsed);
- Iterator out(output, win_collapsed);
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, scale);
-
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int8_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize(val, scale));
- }
- },
- in, out);
-}
-
-template <typename T>
-void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window)
-{
- const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
- const float scale = qinfo.scale;
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win_collapsed);
- Iterator out(output, win_collapsed);
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize_int16(vin, scale);
-
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int16_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale));
- }
- },
- in, out);
-}
-
-template <typename T>
-void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
-{
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- run_dequantization_qasymm8<T, uint8_t>(input, output, window);
- break;
- case DataType::QASYMM8_SIGNED:
- run_dequantization_qasymm8<T, int8_t>(input, output, window);
- break;
- case DataType::QSYMM8_PER_CHANNEL:
- input->info()->data_layout() == DataLayout::NHWC ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window);
- break;
- case DataType::QSYMM8:
- run_dequantization_qsymm8<T>(input, output, window);
- break;
- case DataType::QSYMM16:
- run_dequantization_qsymm16<T>(input, output, window);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-}
-} // namespace
-
-void CpuDequantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
-
- ICpuKernel::configure(win);
-}
-
-Status CpuDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
- return Status{};
-}
-
-void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- switch(dst->info()->data_type())
- {
- case DataType::F32:
- run_dequantization_core<float>(src, dst, window);
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- run_dequantization_core<float16_t>(src, dst, window);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-}
-const char *CpuDequantizeKernel::name() const
-{
- return "CpuDequantizeKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuDequantizeKernel.h b/src/core/cpu/kernels/CpuDequantizeKernel.h
deleted file mode 100644
index 798f32cec7..0000000000
--- a/src/core/cpu/kernels/CpuDequantizeKernel.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H
-#define ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the dequantization layer kernel. */
-class CpuDequantizeKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuDequantizeKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDequantizeKernel);
- /** Set input, output tensors.
- *
- * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuDequantizeKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp
deleted file mode 100644
index c0fc41525e..0000000000
--- a/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp
+++ /dev/null
@@ -1,1385 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h"
-
-#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-
-using namespace arm_compute::detail;
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <unsigned int stridex>
-float16x8_t internal_vld1q(const float16_t *in);
-
-template <>
-float16x8_t internal_vld1q<1>(const float16_t *in)
-{
- return vld1q_f16(in);
-}
-
-template <>
-float16x8_t internal_vld1q<2>(const float16_t *in)
-{
- const float16x8x2_t tmp = vld2q_f16(in);
- return tmp.val[0];
-}
-
-template <>
-float16x8_t internal_vld1q<3>(const float16_t *in)
-{
- const float16x8x3_t tmp = vld3q_f16(in);
- return tmp.val[0];
-}
-
-inline float16x8_t internal_vdupq_n(float16_t v)
-{
- return vdupq_n_f16(v);
-}
-
-inline void internal_vst1q(float16_t *p, const float16x8_t &v)
-{
- vst1q_f16(p, v);
-}
-
-float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y)
-{
- return vmulq_f16(x, y);
-}
-
-inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z)
-{
- return vaddq_f16(x, vmulq_f16(y, z));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
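-// The internal_* helpers below (and their FP16 counterparts above) overload load/store and
-// multiply-accumulate per element type so the convolver templates stay type-agnostic. The stridex
-// parameter selects vld1q/vld2q/vld3q: e.g. internal_vld1q<2> de-interleaves with vld2q and keeps
-// val[0], i.e. elements {0, 2, 4, 6}, which are exactly the inputs a stride-2 convolution needs.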
-template <unsigned int stridex>
-float32x4_t internal_vld1q(const float *in);
-
-template <>
-float32x4_t internal_vld1q<1>(const float *in)
-{
- return vld1q_f32(in);
-}
-
-template <>
-float32x4_t internal_vld1q<2>(const float *in)
-{
- const float32x4x2_t tmp = vld2q_f32(in);
- return tmp.val[0];
-}
-
-template <>
-float32x4_t internal_vld1q<3>(const float *in)
-{
- const float32x4x3_t tmp = vld3q_f32(in);
- return tmp.val[0];
-}
-
-inline float32x4_t internal_vdupq_n(float v)
-{
- return vdupq_n_f32(v);
-}
-
-inline void internal_vst1q(float *p, const float32x4_t &v)
-{
- vst1q_f32(p, v);
-}
-
-float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y)
-{
- return vmulq_f32(x, y);
-}
-
-inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z)
-{
- return vmlaq_f32(x, y, z);
-}
-
-constexpr int small_tensor_size_optim = 8;
-inline bool run_optim_small_tensor_info(const ITensorInfo *t)
-{
- return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim;
-}
-
-inline bool run_optim_small_tensor(const ITensor *t)
-{
- return run_optim_small_tensor_info(t->info());
-}
-
-// Optimized convolver for 1x1 kernels used only where input width and height are both <= 8
-// For big Z as in Input=7x7x832, this implementation is faster than the general code because it doesn't need to
-// store intermediate results in memory. Temporary results are stored directly in SIMD registers and then written to the output buffer.
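-// Each of the (at most 8) output rows keeps two float32x4 accumulators (8 floats, the maximum
-// width), so a full 8x8 output plane fits in 16 SIMD registers (accum0/accum1 below).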
-template <unsigned int stridex>
-class convolver_w1x1_i8x8_f32
-{
-public:
- static void convolve(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
- {
- ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimX) > small_tensor_size_optim);
- ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimY) > small_tensor_size_optim);
-
- const int input_stride_x = src->info()->strides_in_bytes().x();
- const int input_stride_y = src->info()->strides_in_bytes().y();
- const int input_stride_z = src->info()->strides_in_bytes().z();
- const int output_stride_y = dst->info()->strides_in_bytes().y();
- const int output_stride_z = dst->info()->strides_in_bytes().z();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_h = dst->info()->dimension(1);
- const int range_z = window.z().end() - window.z().start();
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
-
- // Set up the output window for the iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
- window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
-
- // Set up the input window for the iterator
- Window window_in = window;
- // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
- window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
- Iterator out(dst, window_out);
- Iterator in(src, window_in);
- Iterator k(weights, window_k);
-
- const uint8_t *k_ptr = k.ptr();
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
- uint8_t *out_ptr = out.ptr();
- int ih = 0;
- int oh = 0;
- std::array<float32x4_t, 8> accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
- std::array<float32x4_t, 8> accum1 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
- for(int oz = 0; oz < range_z; ++oz)
- {
- accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
- accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f);
- auto p_out_base = out_ptr + oz * output_stride_z;
- for(int p = 0; p < kernel_depth; ++p)
- {
- const auto k_val = reinterpret_cast<const float *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
- const auto vk0 = internal_vdupq_n(*k_val);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- const int offset_xy = ih * input_stride_y;
- auto in_val = reinterpret_cast<const float *>(input_ptr + p * input_stride_z + offset_xy);
- auto v_in0 = internal_vld1q<stridex>(in_val);
- auto v_in1 = internal_vld1q<stridex>(in_val + 4);
- accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0);
- accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1);
- }
- }
- for(oh = 0; oh < output_h; ++oh)
- {
- auto p_out = reinterpret_cast<float *>(p_out_base + oh * output_stride_y);
- vst1q_f32(p_out, accum0[oh]);
- vst1q_f32(p_out + 4, accum1[oh]);
- }
- }
- },
- in, out);
- }
-};
-
-template <typename T1, typename T2, unsigned int stridex>
-class convolver_1x1
-{
-public:
- static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
- {
- const int input_stride_x = src->info()->strides_in_bytes().x();
- const int input_stride_y = src->info()->strides_in_bytes().y();
- const int input_stride_z = src->info()->strides_in_bytes().z();
- const int output_stride_y = dst->info()->strides_in_bytes().y();
- const int output_stride_z = dst->info()->strides_in_bytes().z();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = dst->info()->dimension(0);
- const int output_h = dst->info()->dimension(1);
- const int range_z = window.z().end() - window.z().start();
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
-
- // Set up the output window for the iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
- window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
-
- // Set up the input window for the iterator
- Window window_in = window;
- // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
- window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
- Iterator out(dst, window_out);
- Iterator in(src, window_in);
- Iterator k(weights, window_k);
-
- const uint8_t *k_ptr = k.ptr();
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- /*
- For a detailed explanation of how the algorithm works, refer to the convolver_3x3 class template.
- */
- const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
- uint8_t *out_ptr = out.ptr();
- int ih = 0;
- int oh = 0;
- for(int oz = 0; oz < range_z; ++oz)
- {
- auto p_out_base = out_ptr + oz * output_stride_z;
- // Step 1
- {
- const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
- const auto vk = internal_vdupq_n(*k_val);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- const int offset_xy = ih * input_stride_y;
- auto in_val = reinterpret_cast<const T1 *>(input_ptr + (0 * input_stride_z + offset_xy));
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
- {
- internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val)));
- }
- }
- }
-
- // Step 2
- for(int p = 1; p < kernel_depth; ++p)
- {
- const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
- const auto vk = internal_vdupq_n(*k_val);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- const int offset_xy = ih * input_stride_y;
- auto in_val = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + offset_xy);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
- {
- internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val)));
- }
- }
- }
- }
- },
- in, out);
- }
-};
-
-template <unsigned int stridex>
-float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4);
-
-inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2)
-{
- const float32x4x3_t m00 =
- {
- {
- vld1q_dup_f32(m0),
- vld1q_dup_f32(m1),
- vld1q_dup_f32(m2)
- }
- };
- return m00;
-}
-
-inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4)
-{
- const float32x4x2_t m00 =
- {
- {
- vld1q_dup_f32(m3),
- vld1q_dup_f32(m4)
- }
- };
- return m00;
-}
-
-inline float32x4x3_t load_input(const float *const in)
-{
- const float32x4x3_t vin =
- {
- {
- vld1q_f32(in),
- vld1q_f32(in + 4),
- vld1q_f32(in + 8)
- }
- };
- return vin;
-}
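-// load_input reads 12 consecutive floats (3 x float32x4) so that convolve_5x5<1> below can form
-// the shifted input windows it needs with vextq_f32 instead of issuing extra unaligned loads.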
-
-template <>
-inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
-{
- const float32x4x3_t vin0 = load_input(in_0);
- const float32x4x3_t vin1 = load_input(in_1);
- const float32x4x3_t vin2 = load_input(in_2);
- const float32x4x3_t vin3 = load_input(in_3);
- const float32x4x3_t vin4 = load_input(in_4);
- const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0);
- const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0);
- const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1);
- const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1);
- const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2);
- const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2);
- const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3);
- const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3);
- const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4);
- const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4);
-
- float32x4x2_t out =
- {
- {
- vmulq_f32(vin0.val[0], m00.val[0]),
- vmulq_f32(vin0.val[1], m00.val[0])
- }
- };
-
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]);
-
- out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], m11.val[1]);
-
- out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]);
-
- out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]);
-
- out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]);
-
- return out;
-}
-
-template <>
-inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
-{
- float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
- return out;
-}
-
-template <>
-inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
-{
- float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
- return out;
-}
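-// For strides 2 and 3 the stride-1 results are computed first and then compacted in-register:
-// stride 2 keeps lanes {0, 2, 4, 6} (4 valid outputs) and stride 3 keeps lanes {0, 3} (2 valid
-// outputs), matching the num_elems_written_per_iteration = 16 >> stridex used for F32 kernels.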
-
-template <typename T1, typename T2, unsigned int stridex>
-class convolver_3x3
-{
-public:
- static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
- {
- ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
- const int input_stride_x = src->info()->strides_in_bytes().x();
- const int input_stride_y = src->info()->strides_in_bytes().y();
- const int input_stride_z = src->info()->strides_in_bytes().z();
- const int output_stride_y = dst->info()->strides_in_bytes().y();
- const int output_stride_z = dst->info()->strides_in_bytes().z();
- const int kernel_stride_x = weights->info()->strides_in_bytes().x();
- const int kernel_stride_y = weights->info()->strides_in_bytes().y();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = dst->info()->dimension(0);
- const int output_h = dst->info()->dimension(1);
- const int num_planes_z = window.z().end() - window.z().start();
- const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
-
- // Set up the output window for the iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
- window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
-
- // Set up the input window for the iterator
- Window window_in = window;
- // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
- window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
-
- Iterator out(dst, window_out);
- Iterator in(src, window_in);
- Iterator k(weights, window_k);
-
- const uint8_t *k_ptr = k.ptr();
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
- uint8_t *out_ptr = out.ptr();
- int ih = 0;
- int oh = 0;
- /*
- Each thread executing this kernel computes one or more planes of the output volume.
-
- Say the 3rd dimension of the output volume is 32: the first thread computes the output for Z = [0,7], the second thread for Z = [8,15],
- the third thread for Z = [16,23] and the fourth thread for Z = [24,31].
-
- The algorithm's outer loop iterates over Z, P, Y, X, where P is the depth/3rd dimension of each kernel. This order is not arbitrary: the main benefit
- is that we set up the Neon registers containing the kernel's values only once and then compute each XY position using the preloaded registers, as opposed to reloading them for every XY value.
-
- The algorithm does not require allocating any additional memory and computes the results directly in-place in two stages:
- 1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.
- 2) Convolve the remaining planes and accumulate the results into the output plane initialized in step 1.
- */
- for(int oz = 0; oz < num_planes_z; ++oz)
- {
- const int zoffset = id.z() + oz;
- uint8_t *p_out_base = out_ptr + oz * output_stride_z;
- // Step 1
- {
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
- const auto vk_r0 = load_matrix_row(ptr_k_r0);
- const auto vk_r1 = load_matrix_row(ptr_k_r1);
- const auto vk_r2 = load_matrix_row(ptr_k_r2);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- auto in_top = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
- auto in_mid = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
- auto in_low = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
- in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
- {
- convolve_3x3<false>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
- }
- }
- }
- // Step 2
- for(int p = 1; p < kernel_depth; ++p)
- {
- const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w;
- const uint8_t *input_base = input_ptr + p * input_stride_z;
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(ptr_k_base);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y * 2);
- const auto vk_r0 = load_matrix_row(ptr_k_r0);
- const auto vk_r1 = load_matrix_row(ptr_k_r1);
- const auto vk_r2 = load_matrix_row(ptr_k_r2);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- auto in_top = reinterpret_cast<const T1 *>(input_base + (ih + 0) * input_stride_y);
- auto in_mid = reinterpret_cast<const T1 *>(input_base + (ih + 1) * input_stride_y);
- auto in_low = reinterpret_cast<const T1 *>(input_base + (ih + 2) * input_stride_y);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
- in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
- {
- convolve_3x3<true>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
- }
- }
- }
- }
- },
- in, out);
- }
-};
-
-template <typename T1, typename T2, unsigned int stridex>
-class convolver_5x5
-{
-public:
- static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
- {
- ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
- const int input_stride_x = src->info()->strides_in_bytes().x();
- const int input_stride_y = src->info()->strides_in_bytes().y();
- const int input_stride_z = src->info()->strides_in_bytes().z();
- const int output_stride_y = dst->info()->strides_in_bytes().y();
- const int output_stride_z = dst->info()->strides_in_bytes().z();
- const int kernel_stride_x = weights->info()->strides_in_bytes().x();
- const int kernel_stride_y = weights->info()->strides_in_bytes().y();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = dst->info()->dimension(0);
- const int output_h = dst->info()->dimension(1);
- const int num_planes_z = window.z().end() - window.z().start();
- const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
-
- // Set up the output window for the iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
- window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
-
- // Set up the input window for the iterator
- Window window_in = window;
- // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
- window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
-
- Iterator out(dst, window_out);
- Iterator in(src, window_in);
- Iterator k(weights, window_k);
-
- const uint8_t *k_ptr = k.ptr();
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
- uint8_t *out_ptr = out.ptr();
- int ih = 0;
- int oh = 0;
- for(int oz = 0; oz < num_planes_z; ++oz)
- {
- const int zoffset = id.z() + oz;
- uint8_t *p_out_base = out_ptr + oz * output_stride_z;
- // Step 1
- {
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- auto in_0 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
- auto in_1 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
- auto in_2 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
- auto in_3 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y);
- auto in_4 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
- in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
- {
- auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
- store_results<stridex>(p_out, vres);
- }
- }
- }
- // Step 2
- for(int p = 1; p < kernel_depth; ++p)
- {
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
-
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- auto in_0 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
- auto in_1 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
- auto in_2 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
- auto in_3 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y);
- auto in_4 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
- in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
- {
- auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
- accumulate_results<stridex>(p_out, vres);
- }
- }
- }
- }
- },
- in, out);
- }
-};
-
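-// Horizontal reduction of a float32x4: add the high and low halves lane-wise, then sum the two
-// remaining lanes, i.e. return v[0] + v[1] + v[2] + v[3].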
-float vreduce(const float32x4_t &v)
-{
- auto v0 = wrapper::vgethigh(v);
- auto v1 = wrapper::vgetlow(v);
- auto v_out = wrapper::vadd(v0, v1);
-
- float a = wrapper::vgetlane(v_out, 0);
- float b = wrapper::vgetlane(v_out, 1);
- return a + b;
-}
-
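-// The convolve_* dispatchers below translate the runtime stride (1/2/3) into the compile-time
-// stridex template parameter, so each inner loop is fully specialized at build time; any other
-// stride is rejected with ARM_COMPUTE_ERROR("Not implemented").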
-template <typename T1, typename T2>
-inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
-{
- const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- switch(conv_stride_x)
- {
- case 1:
- convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-template <>
-inline void convolve_1x1<float, float>(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
-{
- const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- if(run_optim_small_tensor(src))
- {
- switch(conv_stride_x)
- {
- case 1:
- convolver_w1x1_i8x8_f32<1>::convolve(window, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_w1x1_i8x8_f32<2>::convolve(window, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_w1x1_i8x8_f32<3>::convolve(window, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
- }
- else
- {
- switch(conv_stride_x)
- {
- case 1:
- convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
- }
-}
-
-template <typename T1, typename T2>
-inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
-{
- const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- switch(conv_stride_x)
- {
- case 1:
- convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-template <typename T1, typename T2>
-inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
-{
- const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- switch(conv_stride_x)
- {
- case 1:
- convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
-
- const DataLayout data_layout = src->data_layout();
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != src->dimension(channel_idx));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (src->data_type() == DataType::F16));
-
- // Checks performed when output is configured
- if(dst->total_size() != 0)
- {
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
-
- DataType data_type = src->data_type();
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row,
- unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
-{
- ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
-
- const DataLayout data_layout = src->data_layout();
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-
- // Calculate right and bottom border
- unsigned int kernel_size = weights->dimension(width_idx);
- const int conv_stride_x = std::get<0>(conv_info.stride());
- const int conv_stride_y = std::get<1>(conv_info.stride());
- const int input_width = src->dimension(width_idx);
-
- Window win{};
- bool window_changed = false;
-
- if(data_layout == DataLayout::NCHW)
- {
- switch(kernel_size)
- {
- case 1:
- {
- switch(src->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- num_elems_written_per_iteration = 8;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- if(run_optim_small_tensor_info(src))
- {
- num_elems_written_per_iteration = 8;
- }
- else
- {
- num_elems_written_per_iteration = 4;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- break;
- }
- num_weight_elems_read_per_row = kernel_size;
- num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration;
- break;
- }
- case 3:
- switch(src->data_type())
- {
- case DataType::F32:
- num_weight_elems_read_per_row = 4 + kernel_size - 1;
- num_elems_read_per_iteration = 12;
- num_elems_written_per_iteration = 16 >> conv_stride_x;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- num_weight_elems_read_per_row = 8 + kernel_size - 1;
- num_elems_read_per_iteration = 24;
- num_elems_written_per_iteration = 32 >> conv_stride_x;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- break;
- }
- break;
- case 5:
- {
- switch(src->data_type())
- {
- case DataType::F32:
- num_weight_elems_read_per_row = 4 + kernel_size - 1;
- num_elems_read_per_iteration = 12;
- num_elems_written_per_iteration = 16 >> conv_stride_x;
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- break;
- }
- }
- break;
- default:
- {
- ARM_COMPUTE_ERROR("Not implemented");
- break;
- }
- }
-
- // Calculate right pad
- int start_x = kernel_size / 2 - static_cast<int>(conv_info.pad_left());
- int end_x = ceil_to_multiple(static_cast<int>(dst->dimension(0)), num_elems_written_per_iteration) * conv_stride_x;
- int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width;
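- // Worked example (illustrative numbers): 3x3 F32 kernel, stride 1, pad_left = 1, output width 28 (input width 28):
- // start_x = 3 / 2 - 1 = 0, end_x = ceil_to_multiple(28, 8) = 32,
- // upper_bound_w = ceil_to_multiple(32, 12) - 28 = 8 extra elements of right padding.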
-
- // Calculate border
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
- const unsigned int conv_pad_right = std::max(upper_bound_w, 0);
- const unsigned int conv_pad_bottom = conv_info.pad_bottom();
-
- border_size.left = conv_pad_left;
- border_size.top = conv_pad_top;
- border_size.right = conv_pad_right;
- border_size.bottom = conv_pad_bottom;
-
- // Configure window
- win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration));
-
- AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top,
- num_elems_read_per_iteration, kernel_size,
- conv_stride_x, conv_stride_y);
- AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
- AccessWindowHorizontal output_access(dst, 0, num_elems_written_per_iteration);
- window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
- }
- else
- {
- // Configure window NHWC without any padding
- win = calculate_max_window(*dst, Steps());
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
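-// The optimized NHWC path requires zero left/right (x, i.e. channel) padding on both input and
-// weights: only then does each W row of C elements abut the next, so the whole WC plane can be
-// walked as one flat run of elements.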
-bool have_zero_x_internal_padding(ITensorInfo *src, ITensorInfo *weights)
-{
- return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0);
-}
-
-} // namespace
-
-template <typename T>
-void CpuDirectConv2dKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
-{
- // This function assumes that input and weights have no padding in the channel dimension
-
- // Declare useful types
- using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
- using vector_type = typename vtype::type;
- using tag_type = typename vtype::tag_type;
-
- // Scalar quantities
- const int element_size = src->info()->element_size();
- const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
- const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
- const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
- const int input_dim_w = src->info()->dimension(1);
- const int input_dim_h = src->info()->dimension(2);
-
- const int output_stride_c = dst->info()->strides_in_bytes().x();
-
- const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
- const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
- const int kernel_dim_w = weights->info()->dimension(1);
- const int kernel_dim_h = weights->info()->dimension(2);
-
- const int conv_pad_top = _conv_info.pad_top();
- const int conv_pad_left = _conv_info.pad_left();
- const int conv_stride_w = std::get<0>(_conv_info.stride());
- const int conv_stride_h = std::get<1>(_conv_info.stride());
-
- // Set up the window for the output iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Set up the window for the weights iterator
- Window window_w = calculate_max_window(*weights->info(), Steps());
- window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- Iterator out(dst, window_out);
- Iterator wei(weights, window_w);
-
- constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
- /*
- * This implementation parallelizes over the full WC plane of the input and weights by
- * treating them as a flat series of elements. For example, with 3x3 weights, a single
- * input channel and floating-point vector operations of 4 elements at a time, the first
- * vector load takes the 3 channel elements of the first row plus the first element of
- * the second row. The 9 elements in each WC weight plane then require two 4-element
- * vector operations and a final single-element operation.
- *
- * This works because, when we create the input vector to multiply with the weights,
- * exactly the required elements are loaded in the same order. Therefore the
- * multiplication operates on the correct input/weight elements.
- */
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- /*
- * Here we create theoretical indexes which we then validate against both
- * the inputs and the weights.
- * As a reminder, this loop takes each output point in NHW; C is handled
- * in the weights loop.
- */
- // We compute the theoretical input starting points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
-
- // We use the input points to select the valid weight points to use
- const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
- const int index_h_start = in_h_start - in_h_start_t;
- const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
- const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
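- // e.g. with pad_left = 1 at the left border (id.y() == 0): in_w_start_t = -1 but in_w_start = 0,
- // so index_wc_start = 1 * kernel_stride_w and the first weight column is skipped, implementing
- // the zero padding implicitly without reading out of bounds.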
-
- execute_window_loop(window_w, [&](const Coordinates & id_w)
- {
- /*
- * This is the loop over the weights, and it goes along N (the batches).
- * As a reminder, the batches of the weights are translated into the
- * channels of the output.
- */
- const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes())
- + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
- const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
- uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
-
- T out_temp = static_cast<T>(0);
- for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
- {
- const T *in_ptr_mover = in_ptr_row;
- int index_wc = index_wc_start;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
- {
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc);
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
- {
- const auto src_val = *(in_ptr_mover);
- const auto w_val = *(weights_ptr_row + index_wc);
- out_temp += src_val * w_val;
- }
- }
- *(reinterpret_cast<T *>(out_ptr)) = out_temp;
- },
- wei);
- },
- out);
-}
-
-template <typename T>
-void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
-{
- // Declare useful types
- using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
- using vector_type = typename vtype::type;
- using tag_type = typename vtype::tag_type;
-
- // Scalar quantities
- const int element_size = src->info()->element_size();
- const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
- const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
- const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
- const int input_dim_w = src->info()->dimension(1);
- const int input_dim_h = src->info()->dimension(2);
-
- const int output_stride_c = dst->info()->strides_in_bytes().x();
-
- const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
- const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
- const int kernel_dim_w = weights->info()->dimension(1);
- const int kernel_dim_h = weights->info()->dimension(2);
-
- const int conv_pad_top = _conv_info.pad_top();
- const int conv_pad_left = _conv_info.pad_left();
- const int conv_stride_w = std::get<0>(_conv_info.stride());
- const int conv_stride_h = std::get<1>(_conv_info.stride());
-
- // Set up the window for the output iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Set up the window for the weights iterator
- Window window_w = calculate_max_window(*weights->info(), Steps());
- window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- Iterator out(dst, window_out);
- Iterator wei(weights, window_w);
-
- constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
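- // Note: 16 bytes is one 128-bit Neon register, i.e. 4 floats (or 8 half-floats) per vector load.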
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // We compute the theoretical input starting points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
-
- // We use the input points to select the valid weight points to use
- const int wei_w_start = in_w_start - in_w_start_t;
- const int wei_h_start = in_h_start - in_h_start_t;
- const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
- const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
-
- const int index_c_end = weights->info()->dimension(0);
- const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
-
- execute_window_loop(window_w, [&](const Coordinates & id_w)
- {
- const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
- uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
-
- T out_temp = static_cast<T>(0);
- for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
- {
- const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;
- const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
- for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
- {
- const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
- const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
- int index_c = 0;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
- {
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- const auto w_vec = wrapper::vloadq(weights_ptr_mover);
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
- {
- const auto src_val = *(in_ptr_mover);
- const auto w_val = *(weights_ptr_mover);
- out_temp += src_val * w_val;
- }
- }
- }
- *(reinterpret_cast<T *>(out_ptr)) = out_temp;
- },
- wei);
- },
- out);
-}
-
-BorderSize CpuDirectConv2dKernel::border_size() const
-{
- return _border_size;
-}
-
-void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-
- _conv_info = conv_info;
- _data_layout = src->data_layout();
- _kernel_size = weights->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
-
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
- const unsigned int conv_pad_right = conv_info.pad_right();
- const unsigned int conv_pad_bottom = conv_info.pad_bottom();
- if(_data_layout == DataLayout::NCHW)
- {
- _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
- }
- else
- {
- _border_size = BorderSize(0);
- }
-
- // Get convolved dimensions
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
-
- DataType data_type = src->data_type();
-
- // Output auto-initialization if not yet initialized
- auto_init_if_empty(*dst, output_shape, 1, data_type);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, weights, dst, conv_info, _num_weight_elems_read_per_row,
- _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICpuKernel::configure(win_config.second);
-}
-
-Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
-{
- unsigned int num_weight_elems_read_per_row = 0;
- unsigned int num_elems_read_per_iteration = 0;
- unsigned int num_elems_written_per_iteration = 0;
- BorderSize border_size = {};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(),
- weights->clone().get(),
- dst->clone().get(),
- conv_info,
- num_weight_elems_read_per_row,
- num_elems_read_per_iteration,
- num_elems_written_per_iteration,
- border_size)
- .first);
-
- return Status{};
-}
-
-void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- const int kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
-
- if(_data_layout == DataLayout::NCHW)
- {
- switch(kernel_size)
- {
- case 1:
- {
- switch(src->info()->data_type())
- {
- case DataType::F32:
- convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- break;
- }
- case 3:
- {
- switch(src->info()->data_type())
- {
- case DataType::F32:
- convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- break;
- }
- case 5:
- {
- switch(src->info()->data_type())
- {
- case DataType::F32:
- convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
- break;
- }
- }
- }
- else
- {
- switch(src->info()->data_type())
- {
- case DataType::F32:
- {
- if(have_zero_x_internal_padding(src->info(), weights->info()))
- {
- convolve_nhwc_optimized<float>(window, src, weights, dst);
- }
- else
- {
- convolve_nhwc<float>(window, src, weights, dst);
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- }
-}
-const char *CpuDirectConv2dKernel::name() const
-{
- return "CpuDirectConvolutionLayerKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuDirectConv2dKernel.h b/src/core/cpu/kernels/CpuDirectConv2dKernel.h
deleted file mode 100644
index 62ed96f255..0000000000
--- a/src/core/cpu/kernels/CpuDirectConv2dKernel.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H
-#define ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the kernel to perform Direct Convolution Layer. */
-class CpuDirectConv2dKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuDirectConv2dKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel);
- /** Set the src, weights, and dst tensors.
- *
- * @note: DirectConvolution only works in the following configurations:
- *    1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
- *    3x3 and 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 (5x5 supports F32 only)
- *
- * @param[in] src The input tensor to convolve. The 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: F16/F32.
- * @param[in] weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p src.
- * @param[out] dst Output tensor.
- * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: F16/F32.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDirectConv2dKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
- BorderSize border_size() const override;
-
-private:
- /* Template function for optimized convolution NHWC */
- template <typename T>
- void convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
-
- /* Template function for convolution NHWC */
- template <typename T>
- void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
-
- PadStrideInfo _conv_info{};
- BorderSize _border_size{};
- unsigned int _kernel_size{ 0 };
- unsigned int _num_weight_elems_read_per_row{ 0 };
- unsigned int _num_elems_read_per_iteration{ 0 };
- unsigned int _num_elems_written_per_iteration{ 0 };
- DataLayout _data_layout{ DataLayout::UNKNOWN };
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H */
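
The header above fixes the kernel's operator-style contract: validate() the descriptors, configure() once, then drive run_op() with an ITensorPack. A hedged usage sketch follows; the ITensor handles src, weights and dst are hypothetical (assumed allocated and imported elsewhere), the shapes are arbitrary examples, and dst_info is assumed to be auto-initialised by configure():

    using namespace arm_compute;

    TensorInfo    src_info(TensorShape(32U, 32U, 16U), 1, DataType::F32);   // [W, H, IFM]
    TensorInfo    wei_info(TensorShape(3U, 3U, 16U, 8U), 1, DataType::F32); // [kx, ky, IFM, OFM]
    TensorInfo    dst_info{};
    PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1

    cpu::kernels::CpuDirectConv2dKernel k;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuDirectConv2dKernel::validate(&src_info, &wei_info, &dst_info, conv_info));
    k.configure(&src_info, &wei_info, &dst_info, conv_info);

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, src);     // src: hypothetical ITensor*
    pack.add_const_tensor(TensorType::ACL_SRC_1, weights); // weights: hypothetical ITensor*
    pack.add_tensor(TensorType::ACL_DST, dst);             // dst: hypothetical ITensor*
    k.run_op(pack, k.window(), ThreadInfo{});
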
diff --git a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
deleted file mode 100644
index 662d052941..0000000000
--- a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
+++ /dev/null
@@ -1,513 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
- const DirectConvolutionLayerOutputStageKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32);
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL)));
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- }
-
- if(src->data_type() == DataType::S32)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output");
- }
-
- // Checks performed when output is configured
- if((dst != nullptr) && (dst->total_size() != 0))
- {
- if(is_data_type_float(src->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
- else if(src->data_type() == DataType::S32)
- {
- // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo
- ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED));
- }
-
- return Status{};
-}
-
-template <typename T>
-typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
-{
- const bool has_bias = bias != nullptr;
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
- ARM_COMPUTE_UNUSED(result_shift);
- ARM_COMPUTE_UNUSED(result_offset_after_shift);
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16 / src->info()->element_size();
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, win);
- Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
- auto v_in = wrapper::vloadq(in_ptr);
-
- // Accumulate bias
- if(has_bias)
- {
- const auto vb = wrapper::vdup_n(*reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
- v_in = wrapper::vadd(v_in, vb);
- }
-
- const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, v_in);
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
-
- // Accumulate bias
- if(has_bias)
- {
- const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
- s_in += b;
- }
-
- *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
- }
-
- },
- in, out);
-}
-
-template <typename T>
-typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
-{
- const bool has_bias = bias != nullptr;
- ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
- ARM_COMPUTE_UNUSED(result_shift);
- ARM_COMPUTE_UNUSED(result_offset_after_shift);
-
- Window window_bias = window;
- window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
- window_bias.set(3, Window::Dimension(0, 0, 0));
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16 / src->info()->element_size();
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, win);
- Iterator bi(bias, window_bias);
- Iterator out(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
- auto v_in = wrapper::vloadq(in_ptr + x);
-
- // Accumulate bias
- if(has_bias)
- {
- const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
- v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
- }
-
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- wrapper::vstore(out_ptr + x, v_in);
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
-
- // Accumulate bias
- if(has_bias)
- {
- const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
- s_in += *bias_ptr;
- }
-
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- *(out_ptr + x) = s_in;
- }
- },
- in, bi, out);
-}
-
-// Quantized case
-template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
-void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
-{
- const bool has_bias = bias != nullptr;
- using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
- using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
-
- const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
-
- const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
- const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16 / src->info()->element_size();
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, win);
- Iterator out(dst, win);
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32x4x4_t v_in =
- {
- {
- wrapper::vloadq(in_ptr),
- wrapper::vloadq(in_ptr + 4),
- wrapper::vloadq(in_ptr + 8),
- wrapper::vloadq(in_ptr + 12)
- }
- };
-
- // Accumulate bias
- if(has_bias)
- {
- const auto vb = wrapper::vdup_n(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
- v_in =
- {
- {
- wrapper::vadd(v_in.val[0], vb),
- wrapper::vadd(v_in.val[1], vb),
- wrapper::vadd(v_in.val[2], vb),
- wrapper::vadd(v_in.val[3], vb)
- }
- };
- }
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32,
- min, max, false));
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Accumulate bias
- if(has_bias)
- {
- const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
- s_in += b;
- }
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
- std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
- }
- },
- in, out);
-}
-template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
-void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
-{
- const bool has_bias = bias != nullptr;
- using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
- using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
-
- const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
-
- const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
- const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
-
- Window window_bias = window;
- window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
- window_bias.set(3, Window::Dimension(0, 0, 0));
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16 / src->info()->element_size();
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, win);
- Iterator bi(bias, window_bias);
- Iterator out(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32x4x4_t v_in =
- {
- {
- wrapper::vloadq(in_ptr),
- wrapper::vloadq(in_ptr + 4),
- wrapper::vloadq(in_ptr + 8),
- wrapper::vloadq(in_ptr + 12),
- }
- };
-
- // Accumulate bias
- if(has_bias)
- {
- const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
-
- v_in.val[0] = wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
- v_in.val[1] = wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
- v_in.val[2] = wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
- v_in.val[3] = wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
- }
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32_t s_in = *in_ptr;
-
- // Accumulate bias
- if(has_bias)
- {
- const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
- s_in += *bias_ptr;
- }
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
- std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
- }
- },
- in, bi, out);
-}
-} // namespace
-
-void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
- const DirectConvolutionLayerOutputStageKernelInfo &info)
-{
- ARM_COMPUTE_UNUSED(bias);
- // Perform validation step
- ARM_COMPUTE_ERROR_ON_NULLPTR(src);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
-
- _func = nullptr;
- _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier;
- _result_shift = info.result_shift;
- _result_offset_after_shift = info.result_offset_after_shift;
-
- // Auto-initialize the output if required
- if(dst != nullptr)
- {
- // Work out expected output data type
- const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32;
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt));
- }
-
- Window win = calculate_max_window(*src, Steps());
-
- ICpuKernel::configure(win);
-
- const bool is_qasymm8_signed = (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false;
-
- // Set appropriate function
- if(src->data_layout() == DataLayout::NCHW)
- {
- switch(src->data_type())
- {
- case DataType::S32:
- {
- if(is_qasymm8_signed)
- {
- _func = &output_stage_nchw<int8_t>;
- }
- else
- {
- _func = &output_stage_nchw<uint8_t>;
- }
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- _func = &output_stage_nchw<float16_t>;
- break;
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- {
- _func = &output_stage_nchw<float>;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
- }
- }
- }
- else
- {
- switch(src->data_type())
- {
- case DataType::S32:
- {
- if(is_qasymm8_signed)
- {
- _func = &output_stage_nhwc<int8_t>;
- }
- else
- {
- _func = &output_stage_nhwc<uint8_t>;
- }
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- _func = &output_stage_nhwc<float16_t>;
- break;
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- {
- _func = &output_stage_nhwc<float>;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
- }
- }
- }
-}
-
-Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
- const DirectConvolutionLayerOutputStageKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
- return Status{};
-}
-
-void CpuDirectConv2dOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
- auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
-}
-
-const char *CpuDirectConv2dOutputStageKernel::name() const
-{
- return "CpuDirectConv2dOutputStageKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
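
The quantized paths above funnel every S32 accumulator through finalize_quantization: a fixed-point multiply, a rounding right shift, a zero-point offset, and a clamp to the output type's range. A self-contained scalar model of that sequence (gemmlowp-style semantics assumed; the NEON helper may differ in edge-case rounding):

    #include <algorithm>
    #include <cstdint>
    #include <limits>

    // Q0.31 multiply with rounding-doubling-high semantics; saturates on the
    // single overflowing input pair INT32_MIN * INT32_MIN.
    int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if(a == std::numeric_limits<int32_t>::min() && b == std::numeric_limits<int32_t>::min())
            return std::numeric_limits<int32_t>::max();
        const int64_t prod  = 2 * static_cast<int64_t>(a) * static_cast<int64_t>(b);
        const int64_t nudge = (prod >= 0) ? (1LL << 30) : (1 - (1LL << 30));
        return static_cast<int32_t>((prod + nudge) >> 31);
    }

    // Arithmetic shift right with round-to-nearest (0 <= shift <= 31 assumed,
    // matching a non-negative result_shift).
    int32_t rounding_shift_right(int32_t x, int shift)
    {
        const int32_t mask      = static_cast<int32_t>((1u << shift) - 1);
        const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
        return (x >> shift) + (((x & mask) > threshold) ? 1 : 0);
    }

    uint8_t requantize_u8(int32_t acc, int32_t result_fixedpoint_multiplier, int result_shift,
                          int32_t result_offset_after_shift)
    {
        int32_t v = rounding_doubling_high_mul(acc, result_fixedpoint_multiplier);
        v         = rounding_shift_right(v, result_shift);
        v        += result_offset_after_shift;
        return static_cast<uint8_t>(std::min(255, std::max(0, v))); // clamp to U8 range
    }
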
diff --git a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
deleted file mode 100644
index 62bc5d41c9..0000000000
--- a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H
-#define ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-namespace cpu
-{
-namespace kernels
-{
-/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input.
- *
- * @note We assume bias to be shared
- * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part
- * of the @ref DirectConvolutionLayerOutputStageKernelInfo.
- */
-class CpuDirectConv2dOutputStageKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuDirectConv2dOutputStageKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dOutputStageKernel);
- /** Set the accumulate buffer and the biases of the kernel.
- *
- * @param[in, out] src Input to add the bias to. If @p dst is not specified then accumulation is done in-place.
- * Data type supported: F16/F32/S32
- * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src
- * @param[out] dst (Optional) If the dst tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
- * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
- * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
- * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
- */
- void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr,
- const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDirectConv2dOutputStageKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr,
- const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift);
-
- OutputStageKernel *_func{ nullptr };
- int _result_fixedpoint_multiplier{ 0 };
- int _result_shift{ 0 };
- int _result_offset_after_shift{ 0 };
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H */
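
As the header notes, the quantized path (S32 input) cannot run in place and must be told the output type through the descriptor. A hedged configuration sketch for that S32 -> QASYMM8 case; field names are those used in the deleted sources, the values are arbitrary examples, and dst_info is assumed to be auto-initialised by configure():

    using namespace arm_compute;

    TensorInfo acc_info(TensorShape(28U, 28U, 8U), 1, DataType::S32); // conv accumulators
    TensorInfo bias_info(TensorShape(8U), 1, DataType::S32);          // shared 1D bias
    TensorInfo dst_info{};

    DirectConvolutionLayerOutputStageKernelInfo info{};
    info.result_fixedpoint_multiplier = 1073741824; // example Q0.31 multiplier (0.5)
    info.result_shift                 = 1;
    info.result_offset_after_shift    = 128;        // example zero point
    info.output_data_type             = DataType::QASYMM8;

    cpu::kernels::CpuDirectConv2dOutputStageKernel stage;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuDirectConv2dOutputStageKernel::validate(&acc_info, &bias_info, &dst_info, info));
    stage.configure(&acc_info, &bias_info, &dst_info, info);
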
diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.cpp b/src/core/cpu/kernels/CpuElementwiseKernel.cpp
deleted file mode 100644
index 643a870540..0000000000
--- a/src/core/cpu/kernels/CpuElementwiseKernel.cpp
+++ /dev/null
@@ -1,354 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuElementwiseKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/elementwise/neon/elementwise_list.h"
-#include "src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h"
-#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h"
-#include "src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-using ElementwiseSelector = std::add_pointer<bool(DataType)>::type;
-using UKernelType = CpuElementwiseKernel::ElementwiseFunction;
-struct ElementwiseKernel
-{
- std::string name;
- const ElementwiseSelector is_selected;
- UKernelType *ukernel;
-};
-
-template <DataType dt>
-inline bool is_selected(DataType data_type)
-{
- return dt == data_type;
-}
-
-template <DataType input_data_type, DataType output_data_type = input_data_type>
-static ElementwiseKernel generate_kernel(UKernelType *ukernel)
-{
- std::string kernel_name("op_");
- kernel_name += string_from_data_type(input_data_type) + "_";
- kernel_name += string_from_data_type(input_data_type) + "_";
- kernel_name += string_from_data_type(output_data_type);
-
- return { kernel_name, is_selected<input_data_type>, ukernel };
-}
-
-template <ArithmeticOperation op>
-std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
-configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_UNUSED(src1, dst);
- static ElementwiseKernel kernels[] =
- {
-#if defined(ENABLE_SVE)
- generate_kernel<DataType::F32>(REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float32_t>))),
- generate_kernel<DataType::S32>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int32_t>))),
- generate_kernel<DataType::S16>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int16_t>))),
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
- generate_kernel<DataType::F32>(REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>))),
- generate_kernel<DataType::S32>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>))),
-#endif /* defined(ENABLE_NEON) */
-#if defined(__ARM_FEATURE_SVE2)
- generate_kernel<DataType::QASYMM8>(REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, uint8_t>))),
- generate_kernel<DataType::QASYMM8_SIGNED>(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, int8_t>))),
-#else /* !defined(__ARM_FEATURE_SVE2) */
- generate_kernel<DataType::QASYMM8>(REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized<op>))),
- generate_kernel<DataType::QASYMM8_SIGNED>(REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed<op>))),
-#endif /* defined(__ARM_FEATURE_SVE2) */
-#if defined(ENABLE_SVE)
- generate_kernel<DataType::F16>(REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float16_t>))),
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- generate_kernel<DataType::F16>(REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>))),
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- generate_kernel<DataType::S16>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>))),
-#endif /* defined(ENABLE_NEON) */
- };
-
- for(const auto &uk : kernels)
- {
- if(uk.is_selected(src0->data_type()))
- {
- return uk.ukernel;
- }
- }
-
- return nullptr;
-}
-
-template <ComparisonOperation op>
-std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
-configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_UNUSED(src1, dst);
- static ElementwiseKernel kernels[] =
- {
-#if defined(ENABLE_SVE)
- generate_kernel<DataType::U8, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, uint8_t>))),
- generate_kernel<DataType::F32, DataType::U8>(REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op<op, float>))),
- generate_kernel<DataType::S16, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int16_t>))),
- generate_kernel<DataType::S32, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int32_t>))),
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
- generate_kernel<DataType::U8, DataType::U8>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8<op, uint8_t, uint8x16_t>))),
- generate_kernel<DataType::F32, DataType::U8>(REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32<op, float, float32x4_t>))),
- generate_kernel<DataType::S16, DataType::U8>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16<op, int16_t, int16x8_t>))),
- generate_kernel<DataType::S32, DataType::U8>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32<op, int32_t, int32x4_t>))),
-#endif /* defined(ENABLE_NEON) */
-#if defined(__ARM_FEATURE_SVE2)
- generate_kernel<DataType::QASYMM8_SIGNED, DataType::U8>(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, int8_t>))),
- generate_kernel<DataType::QASYMM8, DataType::U8>(REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, uint8_t>))),
-#else /* !defined(__ARM_FEATURE_SVE2) */
- generate_kernel<DataType::QASYMM8_SIGNED, DataType::U8>(REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed<op>))),
- generate_kernel<DataType::QASYMM8, DataType::U8>(REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized<op>))),
-#endif /* defined(__ARM_FEATURE_SVE2) */
-#if defined(ENABLE_SVE)
- generate_kernel<DataType::F16, DataType::U8>(REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op<op, float16_t>))),
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- generate_kernel<DataType::F16, DataType::U8>(REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16<op, float16_t, float16x8_t>))),
-#endif /* defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- };
-
- for(const auto &uk : kernels)
- {
- if(uk.is_selected(src0->data_type()))
- {
- return uk.ukernel;
- }
- }
-
- return nullptr;
-}
-} // namespace
-
-Status CpuElementwiseKernel::validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
-
- const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
- "Wrong shape for output");
- }
-
- return Status{};
-}
-
-void CpuElementwiseKernel::configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
-
- // If any of the shapes is dynamic, expect a configured window and dst at run-time.
- if(src0->is_dynamic() || src1->is_dynamic())
- {
- return;
- }
-
- auto shape_and_window = compute_output_shape_and_window(src0->tensor_shape(), src1->tensor_shape());
- auto_init_if_empty(*dst, shape_and_window.first, 1, src0->data_type());
- ICpuKernel::configure(shape_and_window.second);
-}
-
-void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
-
- auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- auto function = get_implementation(src0->info(), src1->info(), dst->info());
- ARM_COMPUTE_ERROR_ON(function == nullptr);
- function(src0, src1, dst, window);
-}
-
- /** Arithmetic operators (min, max, squared_diff, prelu, div, power) */
-void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
- configure_common(src0, src1, dst);
- _op = op;
-}
-
-Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
- }
- return validate_arguments_common(src0, src1, dst);
-}
-
-Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- ARM_COMPUTE_UNUSED(op);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
- return Status{};
-}
-
-std::function<CpuElementwiseKernel::ElementwiseFunction>
-CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- switch(_op)
- {
- case ArithmeticOperation::MAX:
- return configure_arithm_func<ArithmeticOperation::MAX>(src0, src1, dst);
- case ArithmeticOperation::MIN:
- return configure_arithm_func<ArithmeticOperation::MIN>(src0, src1, dst);
- case ArithmeticOperation::SQUARED_DIFF:
- return configure_arithm_func<ArithmeticOperation::SQUARED_DIFF>(src0, src1, dst);
- case ArithmeticOperation::PRELU:
- return configure_arithm_func<ArithmeticOperation::PRELU>(src0, src1, dst);
- case ArithmeticOperation::DIV:
- return configure_arithm_func<ArithmeticOperation::DIV>(src0, src1, dst);
- case ArithmeticOperation::POWER:
- return configure_arithm_func<ArithmeticOperation::POWER>(src0, src1, dst);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return nullptr;
-}
-
-/** The division operator */
-
-void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
- configure_common(src0, src1, dst);
- _op = ArithmeticOperation::DIV;
-}
-
-Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::S32, DataType::F16, DataType::F32);
- return CpuArithmeticKernel::validate_arguments(src0, src1, dst);
-}
-
-Status CpuDivisionKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
- return Status{};
-}
-
-/** The power operator */
-void CpuPowerKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
- configure_common(src0, src1, dst);
- _op = ArithmeticOperation::POWER;
-}
-
-Status CpuPowerKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::F16, DataType::F32);
- return CpuArithmeticKernel::validate_arguments(src0, src1, dst);
-}
-
-Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
- return Status{};
-}
-
-/** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */
-void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
- configure_common(src0, src1, dst);
- _op = op;
-}
-
-Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8);
- }
- return validate_arguments_common(src0, src1, dst);
-}
-
-Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- ARM_COMPUTE_UNUSED(op);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
- return Status{};
-}
-
-std::function<CpuElementwiseKernel::ElementwiseFunction>
-CpuComparisonKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- switch(_op)
- {
- case ComparisonOperation::Equal:
- return configure_comp_func<ComparisonOperation::Equal>(src0, src1, dst);
- case ComparisonOperation::NotEqual:
- return configure_comp_func<ComparisonOperation::NotEqual>(src0, src1, dst);
- case ComparisonOperation::Greater:
- return configure_comp_func<ComparisonOperation::Greater>(src0, src1, dst);
- case ComparisonOperation::GreaterEqual:
- return configure_comp_func<ComparisonOperation::GreaterEqual>(src0, src1, dst);
- case ComparisonOperation::Less:
- return configure_comp_func<ComparisonOperation::Less>(src0, src1, dst);
- case ComparisonOperation::LessEqual:
- return configure_comp_func<ComparisonOperation::LessEqual>(src0, src1, dst);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return nullptr;
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
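
configure_arithm_func() and configure_comp_func() above both use a first-match dispatch table: candidates are listed in priority order (SVE before NEON, SVE2 quantized before NEON quantized) and the first entry whose predicate accepts the input data type wins. A stand-alone sketch of the pattern with illustrative names (not ACL's):

    enum class DT { F32, S32 };
    using UKernel  = void (*)(const float *, const float *, float *, int);
    using Selector = bool (*)(DT);

    struct Candidate
    {
        const char *name;        // for logging/inspection only
        Selector    is_selected; // predicate on the input data type
        UKernel     ukernel;     // micro-kernel to run when selected
    };

    void add_f32(const float *a, const float *b, float *d, int n)
    {
        for(int i = 0; i < n; ++i)
        {
            d[i] = a[i] + b[i];
        }
    }

    // Priority is encoded purely by ordering: in the real tables SVE entries
    // precede NEON ones, so the widest available implementation wins.
    static const Candidate table[] =
    {
        { "f32_add", [](DT dt) { return dt == DT::F32; }, add_f32 },
    };

    UKernel select(DT dt)
    {
        for(const auto &c : table)
        {
            if(c.is_selected(dt))
            {
                return c.ukernel;
            }
        }
        return nullptr; // caller must treat this as "unsupported type"
    }
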
diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.h b/src/core/cpu/kernels/CpuElementwiseKernel.h
deleted file mode 100644
index 952c6e3e25..0000000000
--- a/src/core/cpu/kernels/CpuElementwiseKernel.h
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H
-#define ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for an element-wise operation kernel
- *
- * Element-wise operation is computed by:
- * @f[ dst(x,y) = OP(src0(x,y), src1(x,y))@f]
- *
- */
-class CpuElementwiseKernel : public ICpuKernel
-{
-public:
- const char *name() const override
- {
- return "CpuElementwiseKernel";
- }
-
- CpuElementwiseKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseKernel);
-
- /** Common signature for all the specialised element-wise functions
- *
- * @param[in] src0 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: Dependent on subclass.
- * @param[in] window Region on which to execute the kernel.
- */
- using ElementwiseFunction = void(const ITensor *, const ITensor *, ITensor *, const Window &);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-
-protected:
- /** Validate the argument passed to the kernel
- *
- * @param[in] src0 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor. Data types supported: Dependent on subclass.
- */
- static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
-
- /** Common configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff)
- *
- */
- void configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-
- /** Function to get the micro kernel implementation
- *
- * @param[in] src0 First input tensor information
- * @param[in] src1 Second input tensor information
- * @param[in] dst Output tensor information
- *
- * @return the function instance for the micro kernel
- */
- virtual std::function<ElementwiseFunction> get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) = 0;
-};
-
-class CpuArithmeticKernel : public CpuElementwiseKernel
-{
-public:
- /** Default constructor */
- CpuArithmeticKernel() = default;
-
- /** Configure kernel
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
- */
- void configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: Same as @p src0.
- *
- * @return a Status
- */
- static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
-
- ArithmeticOperation _op{};
-
-private:
- /** Function to get the micro kernel implementation
- *
- * @param[in] src0 First input tensor information
- * @param[in] src1 Second input tensor information
- * @param[in] dst Output tensor information
- *
- * @return the function instance for the micro kernel
- */
- std::function<ElementwiseFunction> get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override;
-};
-
-class CpuDivisionKernel : public CpuArithmeticKernel
-{
-public:
- /** Default constructor */
- CpuDivisionKernel() = default;
-
- /** Configure kernel
- *
- * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CpuDivisionKernel
- *
- * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: Same as @p src0.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
-};
-
-class CpuPowerKernel : public CpuArithmeticKernel
-{
-public:
- /** Default constructor */
- CpuPowerKernel() = default;
-
- /** Configure kernel
- *
- * @param[in] src0 First tensor input info. Data types supported: F16/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CpuPowerKernel
- *
- * @param[in] src0 First tensor input info. Data types supported: F16/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: Same as @p src0.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
-};
-
-class CpuComparisonKernel : public CpuElementwiseKernel
-{
-public:
- /** Default constructor */
- CpuComparisonKernel() = default;
-
- /** Configure kernel
- *
- * @param[in] op Comparison operation to be executed.
- * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: U8.
- */
- void configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel
- *
- * @param[in] op Comparison operation to be executed.
- * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: U8.
- *
- * @return a Status
- */
- static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
-
-private:
- /** Function to get the micro kernel implementation
- *
- * @param[in] src0 First input tensor information
- * @param[in] src1 Second input tensor information
- * @param[in] dst Output tensor information
- *
- * @return the function instance for the micro kernel
- */
- std::function<ElementwiseFunction> get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override;
-
- ComparisonOperation _op{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */ \ No newline at end of file
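
The class comment above defines the operation as dst(x,y) = OP(src0(x,y), src1(x,y)), and validate_arguments_common() only requires the two input shapes to be broadcast compatible. A minimal stand-alone sketch of that contract along one axis (an operand of length 1 is repeated; illustrative, not ACL's window-based loop):

    #include <algorithm>
    #include <cstddef>

    template <typename T, typename Op>
    void elementwise(const T *src0, std::size_t len0, const T *src1, std::size_t len1, T *dst, Op op)
    {
        // Broadcast compatibility along one axis: the lengths match, or one is 1.
        const std::size_t out_len = std::max(len0, len1);
        for(std::size_t i = 0; i < out_len; ++i)
        {
            dst[i] = op(src0[len0 == 1 ? 0 : i], src1[len1 == 1 ? 0 : i]);
        }
    }

    // Usage: float a[4] = {1, 2, 3, 4}, b[1] = {10}, d[4];
    //        elementwise(a, 4, b, 1, d, [](float x, float y) { return x + y; });
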
diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp
deleted file mode 100644
index 2600a49b70..0000000000
--- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuElementwiseUnaryKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h"
-#include "src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-using ElementwiseUnarySelector = std::add_pointer<bool(DataType)>::type;
-
-struct ElementwiseUnaryKernel
-{
- const char *name;
- const ElementwiseUnarySelector is_selected;
- CpuElementwiseUnaryKernel::ElementwiseUnaryUkernelPtr ukernel;
-};
-
-static const ElementwiseUnaryKernel available_kernels[] =
-{
-#if defined(ENABLE_SVE)
- {
- "fp32_sve_elementwise_unary",
- [](DataType dt) { return dt == DataType::F32; },
- REGISTER_FP32_SVE(arm_compute::cpu::elementwise_sve_op<float>),
- },
- {
- "fp16_sve_elementwise_unary",
- [](DataType dt) { return dt == DataType::F16; },
- REGISTER_FP16_SVE(arm_compute::cpu::elementwise_sve_op<__fp16>),
- },
- {
- "s32_sve_elementwise_unary",
- [](DataType dt) { return dt == DataType::S32; },
- REGISTER_INTEGER_SVE(arm_compute::cpu::elementwise_sve_op<int32_t>),
- },
-#endif // defined(ENABLE_SVE)
-#if defined(ENABLE_NEON)
- {
- "fp32_neon_elementwise_unary",
- [](DataType dt) { return dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op<float>),
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "fp16_neon_elementwise_unary",
- [](DataType dt) { return dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::elementwise_op<__fp16>),
- },
-#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "s32_neon_elementwise_unary",
- [](DataType dt) { return dt == DataType::S32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::elementwise_op<int32_t>),
- },
-#endif // defined(ENABLE_NEON)
-};
-
-const ElementwiseUnaryKernel *get_implementation(DataType dt)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected(dt))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-} // namespace
-
-CpuElementwiseUnaryKernel::CpuElementwiseUnaryKernel()
- : _op()
-{
-}
-
-void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst));
-
- _op = op;
-
- // If input shape is dynamic, expect a configured window and dst at run-time.
- if(src.is_dynamic())
- {
- return;
- }
-
- auto shape_and_window = compute_output_shape_and_window(src.tensor_shape());
- auto_init_if_empty(dst, shape_and_window.first, 1, src.data_type());
- ICpuKernel::configure(shape_and_window.second);
-}
-
-Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
-
- const auto *uk = get_implementation(src.data_type());
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- switch(op)
- {
- case ElementWiseUnary::EXP:
- case ElementWiseUnary::RSQRT:
- case ElementWiseUnary::LOG:
- case ElementWiseUnary::ROUND:
- case ElementWiseUnary::SIN:
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32);
- break;
- case ElementWiseUnary::NEG:
- case ElementWiseUnary::ABS:
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32);
- break;
- default:
- ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported");
- }
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
- }
-
- return Status{};
-}
-
-void CpuElementwiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
-
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- auto func = get_implementation(src->info()->data_type())->ukernel;
- ARM_COMPUTE_ERROR_ON(func == nullptr);
- func(src, dst, window, _op);
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
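
validate() above gates each operation on the input type: EXP/RSQRT/LOG/ROUND/SIN are float-only, while NEG/ABS additionally accept S32. The same rule written as a stand-alone predicate (the enums are local stand-ins, not the ACL types):

    enum class UnaryOp { EXP, RSQRT, LOG, ROUND, SIN, NEG, ABS };
    enum class DT { F16, F32, S32 };

    bool is_supported(UnaryOp op, DT dt)
    {
        const bool is_float = (dt == DT::F16) || (dt == DT::F32);
        switch(op)
        {
            case UnaryOp::EXP:
            case UnaryOp::RSQRT:
            case UnaryOp::LOG:
            case UnaryOp::ROUND:
            case UnaryOp::SIN:
                return is_float;                    // float-only operations
            case UnaryOp::NEG:
            case UnaryOp::ABS:
                return is_float || (dt == DT::S32); // also integer-capable
        }
        return false; // unreachable with a valid enum value
    }
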
diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h
deleted file mode 100644
index ceb90dcf70..0000000000
--- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H
-#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for an element-wise unary operation kernel
- *
- * Element-wise operation is computed by:
- * @f[ dst(x) = OP(src(x))@f]
- *
- */
-class CpuElementwiseUnaryKernel : public ICpuKernel
-{
-public:
- const char *name() const override
- {
- return "CpuElementwiseUnaryKernel";
- }
- /** Default constructor */
- CpuElementwiseUnaryKernel();
- /** Default destructor */
- ~CpuElementwiseUnaryKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseUnaryKernel);
-
- /** Function to configure the @ref CpuElementwiseUnaryKernel
- *
- * @param[in] op Elementwise unary operation to be executed.
- * @param[in] src First tensor input. Data types supported: F16/F32 (F16/F32/S32 for NEG/ABS operations).
- * @param[out] dst Output tensor. Data types supported: Same as @p src.
- */
- void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CpuElementwiseUnaryKernel
- *
- * @param[in] op Elementwise unary operation to be executed.
- * @param[in] src First tensor input info. Data types supported: F16/F32 (F16/F32/S32 for NEG/ABS operations).
- * @param[in] dst Output tensor info. Data types supported: Same as @p src.
- *
- * @return a Status
- */
- static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-
- /** Common signature for all the specialised elementwise unary micro-kernels
- *
- * @param[in]  src    Source tensor to read from.
- * @param[out] dst    Destination tensor to write to.
- * @param[in]  window Region on which to execute the kernel.
- * @param[in]  op     Elementwise unary operation to be performed.
- */
- using ElementwiseUnaryUkernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary)>::type;
-
-private:
- ElementWiseUnary _op;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuFillKernel.cpp b/src/core/cpu/kernels/CpuFillKernel.cpp
deleted file mode 100644
index aab4d715ee..0000000000
--- a/src/core/cpu/kernels/CpuFillKernel.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuFillKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-void CpuFillKernel::configure(const ITensorInfo *tensor, const PixelValue &constant_value)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
- _constant_value = constant_value;
-
- // Configure kernel window
- Window win = calculate_max_window(*tensor, Steps());
- ICpuKernel::configure(win);
-}
-
-void CpuFillKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- auto inout = tensors.get_tensor(TensorType::ACL_SRC_DST);
-
- // Collapse all the batches on the third dimension
- bool has_collapsed = true;
- Window collapsed = window.collapse_if_possible(window, Window::DimZ, &has_collapsed);
- ARM_COMPUTE_ERROR_ON(!has_collapsed);
-
- uint8_t *const start_valid_region = inout->ptr_to_element(inout->info()->valid_region().anchor);
- const auto window_width = static_cast<int>(collapsed.x().end()) - static_cast<int>(collapsed.x().start());
- const size_t element_size = inout->info()->element_size();
-
- // Unroll X dimension
- collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator tensor_it(inout, collapsed);
- execute_window_loop(collapsed, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + tensor_it.offset();
- // Set memory: copy element_size bytes per element so the fill works for any data type
- for(int i = 0; i < window_width; ++i)
- {
- std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size);
- }
-
- },
- tensor_it);
-}
-
-const char *CpuFillKernel::name() const
-{
- return "CpuFillKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuFillKernel.h b/src/core/cpu/kernels/CpuFillKernel.h
deleted file mode 100644
index 9afdee4186..0000000000
--- a/src/core/cpu/kernels/CpuFillKernel.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_FILL_KERNEL_H
-#define ARM_COMPUTE_CPU_FILL_KERNEL_H
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Kernel for filling a tensor with a given constant value */
-class CpuFillKernel : public ICpuKernel
-{
-public:
- CpuFillKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFillKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in,out] tensor Tensor to fill. Supported data types: All
- * @param[in] constant_value The value used to fill the planes of the tensor
- */
- void configure(const ITensorInfo *tensor, const PixelValue &constant_value);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- PixelValue _constant_value{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FILL_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuFloorKernel.cpp b/src/core/cpu/kernels/CpuFloorKernel.cpp
deleted file mode 100644
index c2e9d48ce9..0000000000
--- a/src/core/cpu/kernels/CpuFloorKernel.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuFloorKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/floor/list.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-struct FloorSelectorData
-{
- DataType dt;
-};
-
-using FloorSelectorPtr = std::add_pointer<bool(const FloorSelectorData &data)>::type;
-using FloorUKernelPtr = std::add_pointer<void(const void *, void *, int)>::type;
-
-struct FloorUKernel
-{
- const char *name;
- const FloorSelectorPtr is_selected;
- FloorUKernelPtr func;
-};
-
-static const FloorUKernel available_kernels[] =
-{
- {
- "fp16_neon_floor",
- [](const FloorSelectorData &data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)
- },
- {
- "f32_neon_floor",
- [](const FloorSelectorData &data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)
- },
-};
-
-/** Micro-kernel selector
- *
- * @param[in] data Selection data passed to help pick the appropriate micro-kernel
- *
- * @return A matching micro-kernel else nullptr
- */
-const FloorUKernel *get_implementation(const FloorSelectorData &data)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected(data))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-
- const auto *uk = get_implementation(FloorSelectorData{ src->data_type() });
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->func == nullptr);
-
- // Validate in case of configured output
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
-
- return Status{};
-}
-} // namespace
-
-void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Auto initialize output
- auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type());
-
- // Validate
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-
- // Configure kernel window
- const Window win = calculate_max_window(*src, Steps());
-
- ICPPKernel::configure(win);
-}
-
-Window CpuFloorKernel::infer_window(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_UNUSED(dst);
- ARM_COMPUTE_ERROR_ON(!bool(validate_arguments(src, dst)));
-
- Window win;
- win.use_tensor_dimensions(src->tensor_shape());
- return win;
-}
-
-Status CpuFloorKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
- return Status{};
-}
-
-void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- ARM_COMPUTE_ERROR_ON(tensors.empty());
-
- const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
- ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
-
- const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());
- const auto *ukernel = get_implementation(FloorSelectorData{ src->info()->data_type() });
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator src_it(src, win);
- Iterator dst_it(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- ukernel->func(src_it.ptr(), dst_it.ptr(), len);
- },
- src_it, dst_it);
-}
-
-const char *CpuFloorKernel::name() const
-{
- return "CpuFloorKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuFloorKernel.h b/src/core/cpu/kernels/CpuFloorKernel.h
deleted file mode 100644
index 2680871b45..0000000000
--- a/src/core/cpu/kernels/CpuFloorKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_FLOOR_KERNEL_H
-#define ARM_COMPUTE_CPU_FLOOR_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Cpu accelerated kernel to perform a floor operation */
-class CpuFloorKernel : public ICpuKernel
-{
-public:
- CpuFloorKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFloorKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor. Data type supported: F16/F32.
- * @param[out] dst Destination tensor. Same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuFloorKernel
- *
- * @param[in] src Source tensor info. Data type supported: F16/F32.
- * @param[in] dst Destination tensor info. Same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
- /** Infer execution window
- *
- * @param[in] src Source tensor info. Data type supported: F16/F32.
- * @param[in] dst Destination tensor info. Same as @p src
- *
- * @return an execution Window
- */
- Window infer_window(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FLOOR_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuMulKernel.cpp b/src/core/cpu/kernels/CpuMulKernel.cpp
deleted file mode 100644
index 82ec322875..0000000000
--- a/src/core/cpu/kernels/CpuMulKernel.cpp
+++ /dev/null
@@ -1,1729 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuMulKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NESymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-const float scale255_constant = 1.f / 255.f;
-const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
-const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
-
-inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
-{
- ARM_COMPUTE_UNUSED(overflow_policy);
- ARM_COMPUTE_UNUSED(rounding_policy);
-
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16,
- DataType::S32, DataType::F16, DataType::F32);
- if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
- }
-
- if(dst->total_size() > 0)
- {
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- // clang-format off
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
- !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
- !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
- !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
- !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
- , "Invalid data type combination");
- // clang-format on
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
- }
-
- if(std::abs(scale - scale255_constant) < 0.00001f)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
- "Scale == 1/255 is not supported if input and dst are of data type S32");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
-
- int exponent = 0;
- const float normalized_mantissa = std::frexp(scale, &exponent);
-
- // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
- // frexp returns 0.5 as the mantissa, so for scale = 1/2^n the exponent is 1 - n,
- // which lies in the range -14 <= e <= 1
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (should be 1/(2^n) or 1/255)");
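- // Illustrative example: scale = 1/8 gives frexp(0.125) -> mantissa 0.5, exponent -2,
- // which passes the check above; scale = 0.3 gives mantissa 0.6 and is rejected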
- }
-
- return Status{};
-}
-
-/** Scales a given vector by 1/255.
- *
- * @note The add-0.5-then-truncate rounding does not work for all inputs, e.g. for
- *       0.49999999999999994 (where adding 0.5 already rounds up to 1.0) and for very large floats.
- *
- * @param in Input vector to scale.
- * @return Scaled dst rounded to nearest (round half up).
- */
-inline int32x4_t scale255_S32_S32(int32x4_t in)
-{
- // Scale
- const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
- // Round to nearest (round half up)
- // Add +0.5 for all values
- // Afterwards vcvt rounds toward zero
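- // e.g. in = 383: 383/255 ~ 1.502 -> +0.5 = 2.002 -> truncates to 2;
- //      in = 381: 381/255 ~ 1.494 -> +0.5 = 1.994 -> truncates to 1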
- return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
-}
-
-inline uint16x8_t scale255_U16_U16(uint16x8_t in)
-{
- const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
- const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
- return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
-}
-
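-// Compile-time dispatch: T = int8_t selects vquantize_signed, T = uint8_t selects the unsigned vquantize helper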
-template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
-vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
-{
- return vquantize_signed(val, info);
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
-vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
-{
- return vquantize(val, info);
-}
-
-template <typename T>
-void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
-
- const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
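- // Fold the multiplication scale into the requantization step: quantizing with
- // scale' = out_scale / scale is equivalent to multiplying the dequantized product by scale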
- const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
-
- if(is_broadcast_across_x)
- {
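- // broadcast_if_dimension_le_one() sets the step of a size-1 (broadcast) dimension to 0,
- // so a zero X step identifies which input is being broadcast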
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator dst(out, win);
-
- using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
-
- const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
-
- // Dequantize inputs
- const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
- const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);
-
- const float32x4x4_t out_f32x4x4 =
- {
- vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
- vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
- vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
- vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
- };
-
- // Quantize dst
- const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
- wrapper::vstore(output_ptr + x, result);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Dequantize inputs
- const T src1 = *(non_broadcast_input_ptr + x);
- const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
- const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
- const float tmp_f = tmp_in1 * tmp_in2;
-
- // Quantize dst
- const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
- *(output_ptr + x) = tmp_qua;
- }
- },
- broadcast_input, non_broadcast_input, dst);
- }
- else
- {
- const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
-
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto input1_q = wrapper::vloadq(input1_ptr + x);
- const auto input2_q = wrapper::vloadq(input2_ptr + x);
-
- // Dequantize inputs
- const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
- const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
-
- const float32x4x4_t out_f32x4x4 =
- {
- vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
- vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
- vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
- vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
- };
-
- // Quantize dst
- const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
- wrapper::vstore(output_ptr + x, result);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Dequantize inputs
- const T src1 = *(input1_ptr + x);
- const T src2 = *(input2_ptr + x);
- const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
- const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
- const float tmp_f = tmp_in1 * tmp_in2;
-
- // Quantize dst
- const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
- *(output_ptr + x) = tmp_qua;
- }
- },
- input1, input2, dst);
- }
-}
-
-void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
-{
- const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
- const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
-
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const qsymm16x8x2_t input1_q =
- {
- {
- vld1q_s16(input1_ptr + x),
- vld1q_s16(input1_ptr + x + 8),
- }
- };
- const qsymm16x8x2_t input2_q =
- {
- {
- vld1q_s16(input2_ptr + x),
- vld1q_s16(input2_ptr + x + 8),
- }
- };
-
- // Dequantize inputs
- const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
- const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
-
- const float32x4x4_t out_f32x4x4 =
- {
- vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
- vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
- vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
- vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
- };
-
- const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
- vst1q_s16(output_ptr + x, result.val[0]);
- vst1q_s16(output_ptr + x + 8, result.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Dequantize inputs
- float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
- float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
- float tmp_f = tmp_in1 * tmp_in2;
-
- // Quantize dst: lrintf() rounds to nearest, matching the rounding used by the vectorized path above
- int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
- qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
- *(output_ptr + x) = tmp_qua;
- }
- },
- input1, input2, dst);
-}
-
-void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
-{
- ARM_COMPUTE_UNUSED(scale);
-
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const qsymm16x8x2_t input1_q =
- {
- {
- vld1q_s16(input1_ptr + x),
- vld1q_s16(input1_ptr + x + 8),
- }
- };
- const qsymm16x8x2_t input2_q =
- {
- {
- vld1q_s16(input2_ptr + x),
- vld1q_s16(input2_ptr + x + 8),
- }
- };
-
- const int32x4x4_t in1_s32 =
- {
- {
- vmovl_s16(vget_low_s16(input1_q.val[0])),
- vmovl_s16(vget_high_s16(input1_q.val[0])),
- vmovl_s16(vget_low_s16(input1_q.val[1])),
- vmovl_s16(vget_high_s16(input1_q.val[1])),
- }
- };
- const int32x4x4_t in2_s32 =
- {
- {
- vmovl_s16(vget_low_s16(input2_q.val[0])),
- vmovl_s16(vget_high_s16(input2_q.val[0])),
- vmovl_s16(vget_low_s16(input2_q.val[1])),
- vmovl_s16(vget_high_s16(input2_q.val[1])),
- }
- };
-
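- // The operands were widened to 32 bit above, so the 16-bit x 16-bit products cannot overflow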
- const int32x4x4_t result =
- {
- {
- vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
- vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
- vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
- vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
- }
- };
-
- vst1q_s32(output_ptr + x, result.val[0]);
- vst1q_s32(output_ptr + x + 4, result.val[1]);
- vst1q_s32(output_ptr + x + 8, result.val[2]);
- vst1q_s32(output_ptr + x + 12, result.val[3]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
- *(output_ptr + x) = tmp;
- }
- },
- input1, input2, dst);
-}
-
-template <bool is_scale255, bool is_sat>
-void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- const int window_step_x = 16 / sizeof(uint8_t);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
- const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);
-
- uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
- const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
- uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
- const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
-
- tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
- tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
-
- if(is_scale255)
- {
- tmp1_high = scale255_U16_U16(tmp1_high);
- tmp1_low = scale255_U16_U16(tmp1_low);
- }
- else
- {
- const int16x8_t vn = vdupq_n_s16(-n);
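- // vn is negative, so the vector shifts below are right shifts by n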
-
- if(is_sat)
- {
- tmp1_high = vqshlq_u16(tmp1_high, vn);
- tmp1_low = vqshlq_u16(tmp1_low, vn);
- }
- else
- {
- tmp1_high = vshlq_u16(tmp1_high, vn);
- tmp1_low = vshlq_u16(tmp1_low, vn);
- }
- }
- if(is_sat)
- {
- vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
- }
- else
- {
- vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
- }
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));
-
- if(is_scale255)
- {
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
- tmp = static_cast<uint16_t>(tmp_f + 0.5f);
- }
- else
- {
- tmp >>= n;
- }
- if(is_sat && tmp > 255)
- {
- tmp = 255;
- }
- *(output_ptr + x) = static_cast<uint8_t>(tmp);
- }
- },
- input1, input2, dst);
-}
-
-template <bool is_scale255, bool is_sat>
-inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
-{
- int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1));
- const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
- int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1));
- const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2));
-
- tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
- tmp1_low = vmulq_s32(tmp1_low, tmp2_low);
-
- if(is_scale255)
- {
- tmp1_high = scale255_S32_S32(tmp1_high);
- tmp1_low = scale255_S32_S32(tmp1_low);
- }
- else
- {
- // Right shift amount
- const int32x4_t vn = vdupq_n_s32(-n);
- // Left shift amount
- const int32x4_t vnl = vdupq_n_s32(n);
- // Calculate conversion bit
- const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high);
- const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low);
- const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31);
- const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31);
- const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high);
- const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
- const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
- const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
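- // convert = sign * (2^n - 1): adding it before the right shift below makes negative
- // values round toward zero instead of toward -infinity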
- if(is_sat)
- {
- tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
- tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
- }
- else
- {
- tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
- tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
- }
- }
-
- if(is_sat)
- {
- return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
- }
- else
- {
- return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
- }
-}
-
-template <bool is_scale255, bool is_sat>
-inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
-{
- const int16x8x2_t result =
- {
- {
- // First 8 elements
- mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
- // Second 8 elements
- mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)
- }
- };
-
- return result;
-}
-
-template <bool is_scale255, bool is_sat>
-void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t ta1 =
- {
- {
- vld1q_s16(input1_ptr + x),
- vld1q_s16(input1_ptr + x + 8),
- }
- };
- const int16x8x2_t ta2 =
- {
- {
- vld1q_s16(input2_ptr + x),
- vld1q_s16(input2_ptr + x + 8),
- }
- };
- const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
-
- vst1q_s16(output_ptr + x, result.val[0]);
- vst1q_s16(output_ptr + x + 8, result.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
-
- if(is_scale255)
- {
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
-
- tmp = static_cast<int32_t>(tmp_f + 0.5f);
- }
- else
- {
- if(tmp >= 0)
- {
- tmp >>= n;
- }
- else
- {
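- // Round toward zero: add 2^n - 1 so the arithmetic right shift of a negative
- // value truncates instead of rounding toward -infinity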
- uint32_t mask = (1u << n) - 1;
- tmp = (tmp + static_cast<int32_t>(mask)) >> n;
- }
- }
- if(is_sat)
- {
- tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
- }
- *(output_ptr + x) = static_cast<int16_t>(tmp);
- }
- },
- input1, input2, dst);
-}
-
-template <bool is_sat>
-inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
-{
- const int32x2_t input1_1 = vget_low_s32(src1);
- const int32x2_t input2_1 = vget_low_s32(src2);
- const int32x2_t input1_2 = vget_high_s32(src1);
- const int32x2_t input2_2 = vget_high_s32(src2);
-
- int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
- int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);
-
- // Apply scaling, conversion and rounding (round to zero)
- // Right shift amount
- const int64x2_t vn = vdupq_n_s64(-n);
- // Left shift amount
- const int64x2_t vnl = vdupq_n_s64(n);
- // Calculate conversion bit
- const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1);
- const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63);
- const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1);
- const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);
-
- const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2);
- const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63);
- const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2);
- const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
- if(is_sat)
- {
- tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
- tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
- return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
- }
- else
- {
- tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
- tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
- return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
- }
-}
-
-template <bool is_sat>
-inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
-{
- const int32x4x2_t result =
- {
- {
- // First 4 elements
- mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
- // Second 4 elements
- mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)
- }
- };
-
- return result;
-}
-
-template <bool is_sat>
-void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x2_t broadcast_v =
- {
- {
- broadcast_value_vec,
- broadcast_value_vec,
- }
- };
- const int32x4x2_t non_broadcast_v =
- {
- {
- vld1q_s32(non_broadcast_input_ptr + x),
- vld1q_s32(non_broadcast_input_ptr + x + 4),
- }
- };
- const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
-
- vst1q_s32(output_ptr + x, result.val[0]);
- vst1q_s32(output_ptr + x + 4, result.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
-
- if(tmp >= 0)
- {
- tmp >>= n;
- }
- else
- {
- uint64_t mask = ((uint64_t)1u << n) - 1;
- tmp = (tmp + static_cast<int64_t>(mask)) >> n;
- }
- if(is_sat)
- {
- tmp = utility::clamp<int64_t, int32_t>(tmp);
- }
- *(output_ptr + x) = static_cast<int32_t>(tmp);
- }
- },
- broadcast_input, non_broadcast_input, dst);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x2_t ta1 =
- {
- {
- vld1q_s32(input1_ptr + x),
- vld1q_s32(input1_ptr + x + 4),
- }
- };
- const int32x4x2_t ta2 =
- {
- {
- vld1q_s32(input2_ptr + x),
- vld1q_s32(input2_ptr + x + 4),
- }
- };
- const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
-
- vst1q_s32(output_ptr + x, result.val[0]);
- vst1q_s32(output_ptr + x + 4, result.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
-
- if(tmp >= 0)
- {
- tmp >>= n;
- }
- else
- {
- uint64_t mask = ((uint64_t)1u << n) - 1;
- tmp = (tmp + static_cast<int64_t>(mask)) >> n;
- }
- if(is_sat)
- {
- tmp = utility::clamp<int64_t, int32_t>(tmp);
- }
- *(output_ptr + x) = static_cast<int32_t>(tmp);
- }
- },
- input1, input2, dst);
- }
-}
-
-void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- constexpr int window_step_x = 16 / sizeof(float);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
-
- using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
- const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
- const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
- auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
- }
- },
- broadcast_input, non_broadcast_input, dst);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto ta1 = wrapper::vloadq(input1_ptr + x);
- const auto ta2 = wrapper::vloadq(input2_ptr + x);
- const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
- const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto ta1 = *(input1_ptr + x);
- const auto ta2 = *(input2_ptr + x);
- *(output_ptr + x) = ta1 * ta2 * scale;
- }
- },
- input1, input2, dst);
- }
-}
-
-void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- constexpr int window_step_x = 8 / sizeof(float);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
-
- using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
- const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
- float32x4_t b = vdupq_n_f32(broadcast_value);
-
- const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
- const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
- const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
- const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
- const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
-
- const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
- const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
-
- float32x4_t res = wrapper::vmul(tmp0, b);
- b = wrapper::vmul(b, mask);
-
- res = wrapper::vmla(res, tmp1, b);
- wrapper::vstore(output_ptr + 2 * x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
- const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
- auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
- auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
- *(output_ptr + 2 * x) = res1;
- *(output_ptr + 2 * x + 1) = res2;
- }
- },
- broadcast_input, non_broadcast_input, dst);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
- float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x);
-
- const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
- const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
- const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
- const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
- const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
-
- const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
- const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
-
- float32x4_t res = wrapper::vmul(tmp0, b);
-
- b = wrapper::vrev64(b);
- b = wrapper::vmul(b, mask);
-
- res = wrapper::vmla(res, tmp1, b);
- wrapper::vstore(output_ptr + 2 * x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto a0 = *(input1_ptr + 2 * x);
- const auto a1 = *(input1_ptr + 2 * x + 1);
- const auto b0 = *(input2_ptr + 2 * x);
- const auto b1 = *(input2_ptr + 2 * x + 1);
- auto res1 = a0 * b0 - a1 * b1;
- auto res2 = a0 * b1 + a1 * b0;
- *(output_ptr + 2 * x) = res1;
- *(output_ptr + 2 * x + 1) = res2;
- }
- },
- input1, input2, dst);
- }
-}
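
The vector path above computes the complex product (a0 + a1*i)(b0 + b1*i) = (a0*b0 - a1*b1) + (a0*b1 + a1*b0)*i for two interleaved complex values per 128-bit register: tmp0 duplicates the real parts of a, tmp1 the imaginary parts, and (in the non-broadcast branch) the vrev64/mask step turns each (b0, b1) pair into (-b1, b0) so a single multiply-accumulate yields both output components. A scalar reference for checking the kernel, written as a minimal sketch (the name is illustrative, not library code):

// Reference complex multiply over interleaved (re, im) float pairs.
void c_mul_reference(const float *a, const float *b, float *out, int num_complex)
{
    for(int i = 0; i < num_complex; ++i)
    {
        const float a0 = a[2 * i], a1 = a[2 * i + 1];
        const float b0 = b[2 * i], b1 = b[2 * i + 1];
        out[2 * i]     = a0 * b0 - a1 * b1; // real part
        out[2 * i + 1] = a0 * b1 + a1 * b0; // imaginary part
    }
}
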
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- constexpr int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator dst(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
- const float16x8x2_t broadcast_value_vec =
- {
- {
- vdupq_n_f16(broadcast_value),
- vdupq_n_f16(broadcast_value),
- }
- };
- const auto scale_vec = vdupq_n_f16(scale);
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t non_broadcast_v =
- {
- {
- vld1q_f16(non_broadcast_input_ptr + x),
- vld1q_f16(non_broadcast_input_ptr + x + 8),
- }
- };
- const float16x8x2_t result =
- {
- {
- vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
- vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
- }
- };
- vst1q_f16(output_ptr + x, result.val[0]);
- vst1q_f16(output_ptr + x + 8, result.val[1]);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
- }
- },
- broadcast_input, non_broadcast_input, dst);
- }
- else
- {
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t ta1 =
- {
- {
- vld1q_f16(input1_ptr + x),
- vld1q_f16(input1_ptr + x + 8),
- }
- };
- const float16x8x2_t ta2 =
- {
- {
- vld1q_f16(input2_ptr + x),
- vld1q_f16(input2_ptr + x + 8),
- }
- };
- const float16x8_t scale_vec = vdupq_n_f16(scale);
- const float16x8x2_t result =
- {
- {
- vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
- vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
- }
- };
- vst1q_f16(output_ptr + x, result.val[0]);
- vst1q_f16(output_ptr + x + 8, result.val[1]);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto ta1 = *(input1_ptr + x);
- const auto ta2 = *(input2_ptr + x);
- *(output_ptr + x) = ta1 * ta2 * scale;
- }
- },
- input1, input2, dst);
- }
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
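
The FP16 path mirrors the F32 element-wise kernel but folds the runtime scale into the vector multiply. A scalar reference sketch for checking it (float arithmetic is used here for clarity; the vector path stays in float16 throughout, so results may differ in the last bit):

// Reference for the FP16 kernel: out = a * b * scale, element-wise.
// Assumes the same __ARM_FEATURE_FP16_VECTOR_ARITHMETIC guard as above.
void mul_f16_reference(const float16_t *a, const float16_t *b, float16_t *out, int n, float scale)
{
    for(int i = 0; i < n; ++i)
    {
        out[i] = static_cast<float16_t>(static_cast<float>(a[i]) * static_cast<float>(b[i]) * scale);
    }
}
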
-
-template <bool is_scale255, bool is_sat>
-void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- const int window_step_x = 16 / sizeof(uint8_t);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
- const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
-
- uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
- uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
- tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
- tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
-
- if(is_scale255)
- {
- tmp_low = scale255_U16_U16(tmp_low);
- tmp_high = scale255_U16_U16(tmp_high);
- }
- else
- {
- const int16x8_t vn = vdupq_n_s16(-n);
-
- if(is_sat)
- {
- tmp_low = vqshlq_u16(tmp_low, vn);
- tmp_high = vqshlq_u16(tmp_high, vn);
- }
- else
- {
- tmp_low = vshlq_u16(tmp_low, vn);
- tmp_high = vshlq_u16(tmp_high, vn);
- }
- }
-
- if(is_sat)
- {
- static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
-
- tmp_low = vminq_u16(tmp_low, max);
- tmp_high = vminq_u16(tmp_high, max);
- }
-
- vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
- vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
-
- if(is_scale255)
- {
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
- tmp = static_cast<int32_t>(tmp_f + 0.5f);
- }
- else
- {
- tmp >>= n;
- }
-
- if(is_sat)
- {
- tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
- }
-
- *(output_ptr + x) = static_cast<int16_t>(tmp);
- }
- },
- input1, input2, dst);
-}
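
The two template parameters fix the scaling mode at compile time: is_scale255 selects the 1/255 path with round-half-up, otherwise the product is shifted right by n, i.e. scale = 1/2^n. A scalar sketch matching the leftover loop above (an illustrative helper, assuming <climits> and <cstdint> are available):

// Scalar reference for the U8 * U8 -> S16 kernel's two scaling modes.
// e.g. a = 200, b = 3 -> tmp = 600:
//   scale255 path : 600 * (1/255) + 0.5 = 2.85... -> 2
//   n = 2 path    : 600 >> 2              -> 150
int16_t mul_u8_u8_s16_reference(uint8_t a, uint8_t b, bool is_scale255, int n, bool is_sat)
{
    int32_t tmp = static_cast<int32_t>(a) * static_cast<int32_t>(b);
    if(is_scale255)
    {
        tmp = static_cast<int32_t>(static_cast<float>(tmp) * (1.f / 255.f) + 0.5f); // round half up
    }
    else
    {
        tmp >>= n; // scale == 1 / 2^n; the product of two U8 values is never negative
    }
    if(is_sat && tmp > SHRT_MAX)
    {
        tmp = SHRT_MAX;
    }
    return static_cast<int16_t>(tmp);
}
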
-
-template <bool is_scale255, bool is_sat>
-void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t ta1 =
- {
- {
- vld1q_s16(input1_ptr + x),
- vld1q_s16(input1_ptr + x + 8),
- }
- };
- const uint8x8x2_t ta2u =
- {
- {
- vld1_u8(input2_ptr + x),
- vld1_u8(input2_ptr + x + 8),
- }
- };
- const int16x8x2_t ta2 =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
- vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
- }
- };
-
- const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
-
- vst1q_s16(output_ptr + x, result.val[0]);
- vst1q_s16(output_ptr + x + 8, result.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
-
- if(is_scale255)
- {
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
-
- tmp = static_cast<int32_t>(tmp_f + 0.5f);
- }
- else
- {
- if(tmp >= 0)
- {
- tmp >>= n;
- }
- else
- {
- uint32_t mask = (1u << n) - 1;
- tmp = (tmp + static_cast<int32_t>(mask)) >> n;
- }
- }
- if(is_sat)
- {
- tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
- }
- *(output_ptr + x) = static_cast<int16_t>(tmp);
- }
- },
- input1, input2, dst);
-}
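
One subtlety worth spelling out: for negative products the plain arithmetic shift would round toward minus infinity, so the leftover loop above first adds the discarded-bit mask to round toward zero instead.

// Example with n = 1, tmp = -5:
//   -5 >> 1       = -3   (arithmetic shift rounds toward minus infinity)
//   (-5 + 1) >> 1 = -2   (adding mask = (1 << n) - 1 rounds toward zero)
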
-
-template <bool is_scale255, bool is_sat>
-void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
-{
- // Simply swap the two input buffers
- mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
-}
-} // namespace
-
-void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
-{
- ARM_COMPUTE_UNUSED(rounding_policy);
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-
- // Auto initialize dst if not initialized
- set_shape_if_empty(*dst, out_shape);
-
- _scale = scale;
- _scale_exponent = 0;
- _func_quantized = nullptr;
- _func_int = nullptr;
- _func_float = nullptr;
-
- bool is_scale_255 = false;
- // Check and validate scaling factor
- if(std::abs(scale - scale255_constant) < 0.00001f)
- {
- is_scale_255 = true;
- }
- else
- {
- int exponent = 0;
-
- std::frexp(scale, &exponent);
-
-        // Store the positive exponent. The scale is known to be of the form 1/2^n,
-        // and frexp() returns the exponent for a mantissa in [0.5, 1), so we
-        // subtract 1 to compensate for that 0.5 mantissa.
- _scale_exponent = std::abs(exponent - 1);
- }
-
- const DataType dt_input1 = src1->data_type();
- const DataType dt_input2 = src2->data_type();
- const DataType dt_output = dst->data_type();
- const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
-
- switch(dt_input1)
- {
- case DataType::QASYMM8:
- if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
- {
- _func_quantized = &mul_saturate_quantized_8<uint8_t>;
- }
- break;
- case DataType::QASYMM8_SIGNED:
- if(dt_input2 == DataType::QASYMM8_SIGNED)
- {
- _func_quantized = &mul_saturate_quantized_8<int8_t>;
- }
- break;
- case DataType::QSYMM16:
- if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
- {
- _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
- }
- else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
- {
- _func_int = &mul_QSYMM16_QSYMM16_S32;
- }
- break;
- case DataType::S16:
- if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
- {
- if(is_scale_255)
- {
- _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
- }
- else
- {
- _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
- }
- }
- if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
- {
- if(is_scale_255)
- {
- _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
- }
- else
- {
- _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
- }
- }
- break;
- case DataType::S32:
- if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
- {
- _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
- }
- break;
- case DataType::U8:
- if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
- {
- if(is_scale_255)
- {
- _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
- }
- else
- {
- _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
- }
- }
- else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
- {
- if(is_scale_255)
- {
- _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
- }
- else
- {
- _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
- }
- }
- else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
- {
- if(is_scale_255)
- {
- _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
- }
- else
- {
- _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
- }
- }
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func_float = &mul_F16_F16_F16;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- _func_float = &mul_F32_F32_F32;
- break;
- default:
-            ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
- }
-
- // Configure kernel window
- Window win = calculate_max_window(out_shape);
-
- ICpuKernel::configure(win);
-}
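
The exponent bookkeeping above can be checked by hand; a minimal sketch of the same computation (an illustrative helper, not part of the kernel):

#include <cmath>
#include <cstdlib>

// Recover n from scale = 1 / 2^n the way configure() does.
int scale_exponent(float scale) // e.g. scale = 0.125f returns 3
{
    int exponent = 0;
    std::frexp(scale, &exponent);  // 0.125 = 0.5 * 2^-2  ->  exponent = -2
    return std::abs(exponent - 1); // |-2 - 1| = 3, i.e. scale = 1 / 2^3
}
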
-
-Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
- RoundingPolicy rounding_policy)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
-
- return Status{};
-}
-
-void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- if(_func_quantized != nullptr)
- {
- (*_func_quantized)(src1, src2, dst, window, _scale);
- }
- else if(_func_int != nullptr)
- {
- (*_func_int)(src1, src2, dst, window, _scale_exponent);
- }
- else
- {
- ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
- (*_func_float)(src1, src2, dst, window, _scale);
- }
-}
-const char *CpuMulKernel::name() const
-{
- return "CpuMulKernel";
-}
-namespace
-{
-Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32);
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured dst
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
- }
-
- return Status{};
-}
-} // namespace
-
-void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst));
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-
- // Auto initialize dst if not initialized
- const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
- auto_init_if_empty(*dst, out_info);
-
- // Configure kernel window
- Window win = calculate_max_window(out_shape);
-
- ICpuKernel::configure(win);
-}
-
-Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst));
-
- return Status{};
-}
-
-void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- c_mul_F32_F32_F32_n(src1, src2, dst, window);
-}
-
-const char *CpuComplexMulKernel::name() const
-{
- return "CpuComplexMulKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuMulKernel.h b/src/core/cpu/kernels/CpuMulKernel.h
deleted file mode 100644
index 3e667bc4be..0000000000
--- a/src/core/cpu/kernels/CpuMulKernel.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_MUL_KERNEL_H
-#define ARM_COMPUTE_CPU_MUL_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the kernel to perform multiplication between two tensors */
-class CpuMulKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuMulKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuMulKernel);
-    /** Initialise the kernel's inputs and dst.
- *
- * Valid configurations (Src1,Src2) -> Dst :
- *
- * Support: Broadcast? Scale=1/255?
- * - (U8,U8) -> U8, S16 N Y
- * - (U8,S16) -> S16 N Y
- * - (S16,U8) -> S16 N Y
- * - (S16,S16) -> S16 N Y
- * - (S32,S32) -> S32 Y N
- * - (F16,F16) -> F16 N Y
- * - (F32,F32) -> F32 Y Y
- * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y
- * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y
- *
- * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
- * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
- *
- * @param[in] src1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * @param[in] src2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * @param[out] dst Dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
- * @param[in] rounding_policy Rounding policy.
- */
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuMulKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
-
- // Inherited methods overridden
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- /** Common signature for all the specialised multiplication functions with integer scaling factor
- *
- * @param[in] src1 Src1 tensor object.
- * @param[in] src2 Src2 tensor object.
- * @param[out] dst Dst tensor object.
- * @param[in] window Region on which to execute the kernel
- * @param[in] scale Integer scale factor.
- */
- using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale);
- /** Common signature for all the specialised multiplication functions with float scaling factor
- *
- * @param[in] src1 Src1 tensor object.
- * @param[in] src2 Src2 tensor object.
- * @param[out] dst Dst tensor object.
- * @param[in] window Region on which to execute the kernel
- * @param[in] scale Float scale factor.
- */
- using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
- /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor
- *
- * @param[in] src1 Src1 tensor object.
- * @param[in] src2 Src2 tensor object.
- * @param[out] dst Dst tensor object.
- * @param[in] window Region on which to execute the kernel
- * @param[in] scale Float scale factor.
- *
- */
- using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
-
- MulFunctionFloat *_func_float{ nullptr };
- MulFunctionInt *_func_int{ nullptr };
- MulFunctionQuantized *_func_quantized{ nullptr };
- float _scale{ 0 };
- int _scale_exponent{ 0 };
-};
-
-/** Interface for the complex pixelwise multiplication kernel. */
-class CpuComplexMulKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuComplexMulKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuComplexMulKernel);
-    /** Initialise the kernel's sources and dst.
- *
-     * @param[in]  src1 A source tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
-     * @param[in]  src2 A source tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
-     * @param[out] dst  The destination tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
- */
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuComplexMulKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_MUL_KERNEL_H */
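
A hypothetical configure/validate sketch for the interface above (shapes, the scale value and the policies are illustrative; includes and run-time dispatch are elided):

// Element-wise F32 multiply with scale = 1/2; dst is auto-initialised by configure().
arm_compute::TensorInfo a(arm_compute::TensorShape(16U, 16U), 1, arm_compute::DataType::F32);
arm_compute::TensorInfo b(arm_compute::TensorShape(16U, 16U), 1, arm_compute::DataType::F32);
arm_compute::TensorInfo d;

arm_compute::cpu::kernels::CpuMulKernel mul;
const auto status = arm_compute::cpu::kernels::CpuMulKernel::validate(
    &a, &b, &d, 0.5f, arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
if(status.error_code() == arm_compute::ErrorCode::OK)
{
    mul.configure(&a, &b, &d, 0.5f, arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
}

At run time the configured kernel is dispatched through run_op() with an ITensorPack holding ACL_SRC_0, ACL_SRC_1 and ACL_DST, as shown in the .cpp file above.
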
diff --git a/src/core/cpu/kernels/CpuPermuteKernel.cpp b/src/core/cpu/kernels/CpuPermuteKernel.cpp
deleted file mode 100644
index 270d6e222e..0000000000
--- a/src/core/cpu/kernels/CpuPermuteKernel.cpp
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuPermuteKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace
-{
-#include "src/core/NEON/kernels/convolution/common/shims.hpp"
-} // namespace
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-inline bool is_permutation_supported(const PermutationVector &v)
-{
- static const std::array<PermutationVector, 2> permutations2 =
- {
- {
- PermutationVector(0U, 1U),
- PermutationVector(1U, 0U),
- }
- };
- static const std::array<PermutationVector, 6> permutations3 =
- {
- {
- PermutationVector(2U, 0U, 1U),
- PermutationVector(1U, 2U, 0U),
- PermutationVector(0U, 1U, 2U),
- PermutationVector(0U, 2U, 1U),
- PermutationVector(1U, 0U, 2U),
- PermutationVector(2U, 1U, 0U),
- }
- };
- static const std::array<PermutationVector, 24> permutations4 =
- {
- {
- PermutationVector(0U, 1U, 2U, 3U),
- PermutationVector(1U, 0U, 2U, 3U),
- PermutationVector(2U, 0U, 1U, 3U),
- PermutationVector(0U, 2U, 1U, 3U),
- PermutationVector(1U, 2U, 0U, 3U),
- PermutationVector(2U, 1U, 0U, 3U),
- PermutationVector(2U, 1U, 3U, 0U),
- PermutationVector(1U, 2U, 3U, 0U),
- PermutationVector(3U, 2U, 1U, 0U),
- PermutationVector(2U, 3U, 1U, 0U),
- PermutationVector(1U, 3U, 2U, 0U),
- PermutationVector(3U, 1U, 2U, 0U),
- PermutationVector(3U, 0U, 2U, 1U),
- PermutationVector(0U, 3U, 2U, 1U),
- PermutationVector(2U, 3U, 0U, 1U),
- PermutationVector(3U, 2U, 0U, 1U),
- PermutationVector(0U, 2U, 3U, 1U),
- PermutationVector(2U, 0U, 3U, 1U),
- PermutationVector(1U, 0U, 3U, 2U),
- PermutationVector(0U, 1U, 3U, 2U),
- PermutationVector(3U, 1U, 0U, 2U),
- PermutationVector(1U, 3U, 0U, 2U),
- PermutationVector(0U, 3U, 1U, 2U),
- PermutationVector(3U, 0U, 1U, 2U)
- }
- };
-
- return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v))
- || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v));
-}
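
Note that permutations2, permutations3 and permutations4 enumerate all 2! + 3! + 4! = 2 + 6 + 24 permutation vectors of rank 2, 3 and 4, so the check above effectively accepts every permutation of rank 2 to 4 and rejects only higher-rank vectors.
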
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_permutation_supported(perm), "PermutationVector not supported.");
-
- const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
-
- // Validate configured destination
- if(dst->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- return Status{};
-}
-
-template <typename T>
-void run_permute(const Window &window, const ITensor *src, const ITensor *dst, const PermutationVector &perm)
-{
- const DataLayout src_layout = src->info()->data_layout();
-
- // Source window
- Window window_src = window;
-
-    // Only the two layout conversions implemented in src/core/NEON/kernels/convolution/common/shims.hpp
-    // are handled by the fast paths below; every other permutation falls back to the generic C++ loop.
- if((src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) || (src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U }))
- {
- window_src.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
- window_src.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
- window_src.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
- window_src.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start()));
- }
-
- // Destination window
- Window window_dst(window);
- const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
- for(size_t d = 0; d <= dst->info()->num_dimensions(); ++d)
- {
- window_dst.set(d, zero_window);
- }
-
- // Create iterators
- Iterator src_it(src, window_src);
- Iterator dst_it(dst, window_dst);
-
- int in_row_stride = 0;
- int in_col_stride = 0;
- int in_channel_stride = 0;
- int in_batch_stride = 0;
- int n_cols = 0;
- int n_rows = 0;
- int n_channels = 0;
- int n_batches = 0;
-
- switch(src_layout)
- {
- case DataLayout::NCHW:
- {
- in_row_stride = src->info()->strides_in_bytes().y() / sizeof(T);
- in_channel_stride = src->info()->strides_in_bytes().z() / sizeof(T);
- in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T);
- n_cols = src->info()->tensor_shape().x();
- n_rows = window_src.y().step();
- n_channels = src->info()->tensor_shape().z();
- n_batches = src->info()->tensor_shape()[3];
- break;
- }
- case DataLayout::NHWC:
- {
- in_col_stride = src->info()->strides_in_bytes().y() / sizeof(T);
- in_row_stride = src->info()->strides_in_bytes().z() / sizeof(T);
- in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T);
- n_channels = src->info()->tensor_shape().x();
- n_cols = window_src.y().step();
- n_rows = src->info()->tensor_shape().z();
- n_batches = src->info()->tensor_shape()[3];
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Invalid source data layout.");
- break;
- }
- }
-
- // CHW -> HWC
- if(src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U })
- {
- const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T);
- const int out_col_stride = dst->info()->strides_in_bytes().y() / sizeof(T);
- const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T);
- const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T);
- execute_window_loop(window_src, [&](const Coordinates & id)
- {
- const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride;
- reorder::nchw_to_nhwc(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx,
- n_batches, n_channels, n_rows, n_cols,
- in_batch_stride, in_channel_stride, in_row_stride,
- out_batch_stride, out_row_stride, out_col_stride);
- },
- src_it, dst_it);
- }
- // HWC -> CHW
- else if(src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U })
- {
- const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T);
- const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T);
- const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T);
- const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T);
- execute_window_loop(window_src, [&](const Coordinates & id)
- {
- const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride;
- reorder::nhwc_to_nchw(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx,
- n_batches, n_rows, n_cols, n_channels,
- in_batch_stride, in_row_stride, in_col_stride,
- out_batch_stride, out_channel_stride, out_row_stride);
- },
- src_it, dst_it);
- }
- else
- {
- // All other cases fall back to C++
- // Permute strides
- Strides strides = dst->info()->strides_in_bytes();
- Strides perm_strides = strides;
- permute_strides(perm_strides, perm);
- const int perm_stride_3 = src->info()->num_dimensions() >= 4 ? perm_strides[3] : 0;
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3;
- *(reinterpret_cast<T *>(dst_it.ptr() + idx)) = *(reinterpret_cast<const T *>(src_it.ptr()));
- },
- src_it, dst_it);
- }
-}
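
The destination offset arithmetic of that generic fallback, spelled out:

// perm_strides[d] holds the destination byte stride of the dimension that
// source dimension d is mapped to, so a source element at coordinate id
// lands at
//   idx = id[0] * perm_strides[0] + id[1] * perm_strides[1]
//       + id[2] * perm_strides[2] + id[3] * perm_stride_3
// One element copy per window iteration therefore realises any rank <= 4
// permutation, at the cost of strided destination writes.
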
-} // namespace
-
-void CpuPermuteKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
-    // Destination auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, perm));
-
- _perm = perm;
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
-
- // This kernel doesn't need padding so update_window_and_padding() can be skipped
-
- ICpuKernel::configure(win);
-}
-
-Status CpuPermuteKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, perm));
- return Status{};
-}
-
-void CpuPermuteKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- switch(src->info()->element_size())
- {
- case 1:
- run_permute<uint8_t>(window, src, dst, _perm);
- break;
- case 2:
- run_permute<uint16_t>(window, src, dst, _perm);
- break;
- case 4:
- run_permute<uint32_t>(window, src, dst, _perm);
- break;
- default:
- ARM_COMPUTE_ERROR("Element size not supported");
- break;
- }
-}
-
-const char *CpuPermuteKernel::name() const
-{
- return "CpuPermuteKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuPermuteKernel.h b/src/core/cpu/kernels/CpuPermuteKernel.h
deleted file mode 100644
index 9c59d5b9d4..0000000000
--- a/src/core/cpu/kernels/CpuPermuteKernel.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_PERMUTE_KERNEL_H
-#define ARM_COMPUTE_CPU_PERMUTE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Kernel to perform tensor permutation given a permutation vector */
-class CpuPermuteKernel : public ICpuKernel
-{
-public:
- CpuPermuteKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPermuteKernel);
- /** Configure kernel for a given list of arguments
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
-     * @param[in]  src  Source tensor to permute. Data types supported: All
- * @param[out] dst Destination tensor. Data types supported: Same as @p src
- * @param[in] perm Permutation vector
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuPermuteKernel
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] src Source tensor to permute. Data types supported: All
- * @param[in] dst Destination tensor. Data types supported: Same as @p src
- * @param[in] perm Permutation vector
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- PermutationVector _perm{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_PERMUTE_KERNEL_H */
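
A hypothetical configure/validate sketch for the interface above (the shape is illustrative; PermutationVector(2U, 0U, 1U) is the NCHW -> NHWC mapping handled by the optimized path in the .cpp file above):

// Permute an 8x4x3 (W, H, C) tensor to 3x8x4 (C, W, H).
arm_compute::TensorInfo src(arm_compute::TensorShape(8U, 4U, 3U), 1, arm_compute::DataType::F32);
arm_compute::TensorInfo dst; // auto-initialised by configure()

arm_compute::cpu::kernels::CpuPermuteKernel permute;
const arm_compute::PermutationVector perm(2U, 0U, 1U);
if(arm_compute::cpu::kernels::CpuPermuteKernel::validate(&src, &dst, perm).error_code() == arm_compute::ErrorCode::OK)
{
    permute.configure(&src, &dst, perm);
}
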
diff --git a/src/core/cpu/kernels/CpuPool2dKernel.cpp b/src/core/cpu/kernels/CpuPool2dKernel.cpp
deleted file mode 100644
index e6f5890685..0000000000
--- a/src/core/cpu/kernels/CpuPool2dKernel.cpp
+++ /dev/null
@@ -1,514 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuPool2dKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/pooling/neon/list.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/ToolchainSupport.h"
-
-#include "src/core/NEON/wrapper/wrapper.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-using namespace misc::shape_calculator;
-
-struct PoolingSelectorData
-{
- DataType dt;
- DataLayout dl;
- int pool_stride_x;
- Size2D pool_size;
-};
-
-using PoolingSelectorPtr = std::add_pointer<bool(const PoolingSelectorData &data)>::type;
-using PoolingKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
-struct PoolingKernel
-{
- const char *name;
- const PoolingSelectorPtr is_selected;
- PoolingKernelPtr ukernel;
-};
-
-static const PoolingKernel available_kernels[] =
-{
- {
- "poolingMxN_qasymm8_neon_nhwc",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)
- },
- {
- "poolingMxN_qasymm8_signed_neon_nhwc",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "poolingMxN_fp16_neon_nhwc",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)); },
- REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "poolingMxN_fp32_neon_nhwc",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); },
- REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)
- },
-#if defined(ENABLE_NCHW_KERNELS)
- {
- "pooling2_qasymm8_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>)
- },
- {
- "pooling3_qasymm8_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>)
- },
- {
- "poolingMxN_qasymm8_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>)
- },
- {
- "pooling2_qasymm8_signed_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>)
- },
- {
- "pooling3_qasymm8_signed_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>)
- },
- {
- "poolingMxN_qasymm8_signed_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>)
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "pooling2_fp16_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
- REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)
- },
- {
- "pooling3_fp16_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
- REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)
- },
- {
- "poolingMxN_fp16_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16)); },
- REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "pooling2_fp32_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
- REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)
- },
- {
- "pooling3_fp32_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
- REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)
- },
- {
- "pooling7_fp32_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); },
- REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)
- },
- {
- "poolingMxN_fp32_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); },
- REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)
- },
-#endif /* defined(ENABLE_NCHW_KERNELS) */
-};
-
-/** Micro-kernel selector
- *
- * @param[in] dt            Data type of the pooling operation
- * @param[in] dl            Data layout of the pooling operation
- * @param[in] pool_stride_x Pooling stride in the x dimension
- * @param[in] pool_size     Pooling window size
- *
- * @return A matching micro-kernel else nullptr
- */
-const PoolingKernel *get_implementation(DataType dt, DataLayout dl, int pool_stride_x, Size2D pool_size)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected({ dt, dl, pool_stride_x, pool_size }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info,
- const ITensorInfo *indices, Size2D pool_size)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(pool_size.y() == 0);
-
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- int output_width = 0;
- int output_height = 0;
- PoolingType pool_type = pool_info.pool_type;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
- pool_size.x(), pool_size.y(), pool_info.pad_stride_info);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid");
-
-    TensorInfo out_info(compute_pool_shape(*src, pool_info), 1, dst->data_type());
- std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- if(indices)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
- }
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
- && (src->data_layout() == DataLayout::NHWC),
- "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
-
- if(dst->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
- if(indices)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info);
- }
- }
-
- const auto *uk = get_implementation(src->data_type(), src->data_layout(), pool_stride_x, pool_size);
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
- unsigned int &num_elems_processed_per_iteration,
- BorderSize &border_size,
- int pool_size_x, int pool_size_y)
-{
-    // dst auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info)));
- if(indices)
- {
-        // Indices auto initialization if not yet initialized
- auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src,
- pool_info)))
- .set_data_type(DataType::U32) /* we store the offset to the element */);
- }
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- unsigned int num_elems_read_per_iteration = 0;
- unsigned int num_elems_horizontal_window = 0;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int src_width = src->dimension(idx_width);
- const int src_height = src->dimension(idx_height);
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_pad_right = pad_stride_info.pad_right();
- const int pool_pad_top = pad_stride_info.pad_top();
- const int pool_pad_left = pad_stride_info.pad_left();
- const int pool_pad_bottom = pad_stride_info.pad_bottom();
- const bool is_square = pool_size_x == pool_size_y;
- const unsigned int pooled_w = dst->dimension(idx_width);
- const unsigned int pooled_h = dst->dimension(idx_height);
-
-    // If the pool size is not square, or no specialized path applies, the generic MxN kernel is executed
- num_elems_read_per_iteration = 1;
- num_elems_processed_per_iteration = 1;
- num_elems_horizontal_window = 1;
-
- if(is_square)
- {
- switch(src->data_type())
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- switch(pool_size_x)
- {
- case 2:
- num_elems_read_per_iteration = 16;
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
- num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
- break;
- case 3:
- num_elems_read_per_iteration = 16;
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
- num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
- break;
- default:
- break;
- }
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- switch(pool_size_x)
- {
- case 2:
- case 3:
- num_elems_read_per_iteration = 4;
- num_elems_processed_per_iteration = 1;
- num_elems_horizontal_window = 1;
- break;
- default:
- break;
- }
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- switch(pool_size_x)
- {
- case 2:
- num_elems_read_per_iteration = 2;
- break;
- case 3:
- num_elems_read_per_iteration = 4; // We use vload4 for pooling3
- break;
- case 7:
- num_elems_read_per_iteration = 8; // We use vload8 for pooling7
- break;
- default:
- break;
- }
- num_elems_processed_per_iteration = 1;
- num_elems_horizontal_window = 1;
- break;
- default:
-                ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- }
-
- bool window_changed = false;
- Window win{};
- if(data_layout == DataLayout::NCHW)
- {
- // Number of iterations in X dimension
- const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
- // Upper limit for the number of right/bottom border elements that are accessed
- const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height;
- border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
- border_size.right = std::max(upper_bound_w, pool_pad_right);
- border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
- TensorShape dst_shape{ src->tensor_shape() };
- dst_shape.set(0, pooled_w);
- dst_shape.set(1, pooled_h);
- TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape));
- win = calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic src_access(src, -pool_pad_left, -pool_pad_top, ceil_to_multiple(src_width + border_size.right, pool_size_x), src_height + border_size.bottom);
- AccessWindowHorizontal dst_access(dst, 0, num_elems_horizontal_window);
- if(indices)
- {
- AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window);
- window_changed = update_window_and_padding(win, src_access, dst_access, indices_access);
- }
- else
- {
- window_changed = update_window_and_padding(win, src_access, dst_access);
- }
- dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
-
- border_size = src->padding();
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
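
A worked instance of the NCHW border arithmetic above (F32 3x3 pooling, stride 2, pad 1 on each side, src_width = 7; the F32 pooling3 case uses vload4, so num_elems_read_per_iteration = 4 and one output element is produced per iteration; FLOOR rounding is assumed for the pooled size):

// pooled_w          = (7 + 1 + 1 - 3) / 2 + 1            = 4
// num_iterations_x  = 4
// upper_bound_w     = ((4 - 1) * 1 * 2 - 1 + 4) - 7      = 2
// border_size.right = max(upper_bound_w, pool_pad_right) = 2
// i.e. the vload4 for the last output column reads two elements past the
// source row, so two padded columns are required on the right.
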
-} // namespace
-
-BorderSize CpuPool2dKernel::border_size() const
-{
- return _border_size;
-}
-
-void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- const bool is_global_pooling = pool_info.is_global_pooling;
-
- // Get data layout
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- // Update pool size in case of global pooling
- const Size2D pool_size(
- is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
- is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size));
-
- // Set instance variables
- _pool_info = pool_info;
- _data_layout = src->data_layout();
- _pool_size = pool_size;
- _pool_stride_x = pad_stride_info.stride().first;
-
- if(_data_layout == DataLayout::NHWC)
- {
- // Configure kernel window
- Window win = calculate_max_window(*dst, Steps());
- ICpuKernel::configure(win);
- }
- else
- {
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration,
- _border_size, pool_size.x(), pool_size.y());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICpuKernel::configure(win_config.second);
- }
-}
-
-Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
-
- unsigned int num_elems_processed_per_iteration = 0;
- BorderSize border_size(0);
-
- const bool is_global_pooling = pool_info.is_global_pooling;
-
- // Get data layout
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
- unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
-
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y)));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(),
- (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size,
- pool_size_x, pool_size_y)
- .first);
-
- return Status{};
-}
-
-void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0);
- ITensor *indices = tensors.get_tensor(TensorType::ACL_DST_1);
-
- const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first;
- const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second;
- const unsigned int pool_size = _pool_info.pool_size.width;
-
- Window window_src(window);
- if(_data_layout == DataLayout::NCHW)
- {
-        // Set the step for the source tensor in the x and y directions
- unsigned int window_x_inc = 0;
- switch(src->info()->data_type())
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- {
- window_x_inc = pool_stride_x;
- if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
- {
- window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
- }
- break;
- }
-
- case DataType::F16:
- case DataType::F32:
- {
- window_x_inc = pool_stride_x;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
- window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
- window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
- }
- else
- {
- window_src.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_src.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x));
- window_src.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y));
- }
-
- const auto *uk = get_implementation(src->info()->data_type(), _data_layout, _pool_stride_x, _pool_size);
- ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- uk->ukernel(src, dst, indices, _pool_info, window_src, window);
-}
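Note how the NCHW branch of run_op() derives the source window from the output window purely by scaling: the execution window iterates over output coordinates, so start and end are multiplied by the stride, and the step becomes the data-type-specific increment chosen above. A minimal sketch of that mapping:

```cpp
#include <iostream>

struct Dim
{
    int start, end, step;
};

// Map an output-space window dimension to input space, as run_op() does for
// NCHW: the input range is the output range scaled by the pooling stride.
Dim map_to_input(const Dim &out, int stride, int input_step)
{
    return Dim{ out.start * stride, out.end * stride, input_step };
}

int main()
{
    const Dim out_x{ 0, 8, 1 }; // 8 output columns
    const Dim in_x = map_to_input(out_x, /*stride=*/2, /*input_step=*/2);
    std::cout << in_x.start << ' ' << in_x.end << ' ' << in_x.step << '\n'; // 0 16 2
    return 0;
}
```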
-
-const char *CpuPool2dKernel::name() const
-{
- return "CpuPool2dKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuPool2dKernel.h b/src/core/cpu/kernels/CpuPool2dKernel.h
deleted file mode 100644
index 95298004e9..0000000000
--- a/src/core/cpu/kernels/CpuPool2dKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_POOL2D_KERNEL_H
-#define ARM_COMPUTE_CPU_POOL2D_KERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the pooling layer kernel */
-class CpuPool2dKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuPool2dKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dKernel);
- /** Configure kernel for a given list of arguments
- *
-     * @note F16 is supported for pool sizes 2 and 3 only
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: Same as @p src.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
- */
- void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration
- *
-     * Similar to @ref CpuPool2dKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
- const char *name() const override;
-
-private:
- PoolingLayerInfo _pool_info{};
- DataLayout _data_layout{ DataLayout::UNKNOWN };
- unsigned int _num_elems_processed_per_iteration{ 0 };
- BorderSize _border_size{ 0 };
- Size2D _pool_size{};
- int _pool_stride_x{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_POOL2D_KERNEL_H */
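For reference, the two-step pattern this interface implies — static validate() first, then configure() on an instance — would look roughly like the sketch below. The tensor shapes and PoolingLayerInfo arguments are illustrative only:

```cpp
// Illustrative sketch only: validate the configuration statically, then
// configure an instance of the kernel declared above.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/cpu/kernels/CpuPool2dKernel.h"

using namespace arm_compute;

void example_pool2d_setup(cpu::kernels::CpuPool2dKernel &kernel)
{
    TensorInfo src(TensorShape(32U, 32U, 16U), 1, DataType::F32);
    TensorInfo dst(TensorShape(16U, 16U, 16U), 1, DataType::F32);

    // 2x2 max pooling with stride 2 and no padding, NCHW.
    const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NCHW,
                                     PadStrideInfo(2, 2, 0, 0));

    // Check the configuration before committing to it.
    if(bool(cpu::kernels::CpuPool2dKernel::validate(&src, &dst, pool_info)))
    {
        kernel.configure(&src, &dst, pool_info);
    }
}
```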
diff --git a/src/core/cpu/kernels/CpuQuantizeKernel.cpp b/src/core/cpu/kernels/CpuQuantizeKernel.cpp
deleted file mode 100644
index 8ca81e8b11..0000000000
--- a/src/core/cpu/kernels/CpuQuantizeKernel.cpp
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuQuantizeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/CPP/Validate.h"
-
-#include <arm_neon.h>
-#include <map>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-constexpr auto window_step = 16;
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
-
- return Status{};
-}
-
-template <typename T>
-inline float32x4x4_t load_value(const T *input_ptr)
-{
- using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type;
- return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr));
-}
-
-template <>
-inline float32x4x4_t load_value(const float *input_ptr)
-{
- return { wrapper::vloadq(input_ptr),
- wrapper::vloadq(input_ptr + 4),
- wrapper::vloadq(input_ptr + 8),
- wrapper::vloadq(input_ptr + 12) };
-}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float32x4x4_t load_value(const float16_t *input_ptr)
-{
- return { vcvt_f32_f16(wrapper::vload(input_ptr)),
- vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
- vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
- vcvt_f32_f16(wrapper::vload(input_ptr + 12)) };
-}
-
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <typename element_type>
-using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>;
-
-template <typename quantized_type>
-vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi);
-
-template <>
-vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
- return vquantize(qv, qi);
-}
-
-template <>
-vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
- return vquantize_signed(qv, qi);
-}
-
-} // namespace
-
-void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-
- static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map =
- {
- { "op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t> },
- { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t> },
- { "op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t> },
-
- { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t> },
- { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t> },
- { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t> },
-
- { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t> },
- { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t> },
- { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float> },
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- { "op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t> },
- { "op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t> },
- { "op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t> },
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- };
-
- std::string function_to_call("op_");
- function_to_call += string_from_data_type(src->data_type()) + "_";
- function_to_call += string_from_data_type(dst->data_type());
-
- auto it = quant_map.find(function_to_call);
-
- if(it == quant_map.end())
- {
- ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
- }
- _func = it->second;
-
- // Configure kernel window
- Window win_config = calculate_max_window(*src, Steps());
- ICpuKernel::configure(win_config);
-}
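configure() above picks the execution path once, at configuration time, by assembling an "op_<SRC>_<DST>" key and looking it up in a static map of member-function pointers; run_op() then simply invokes the stored pointer. The same pattern in a self-contained form (generic names, not the library's):

```cpp
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

class Dispatcher
{
public:
    // Resolve the worker once, keyed by source and destination type names.
    void configure(const std::string &src_type, const std::string &dst_type)
    {
        static const std::map<std::string, void (Dispatcher::*)()> table = {
            { "op_F32_QASYMM8", &Dispatcher::run_f32_to_q8 },
            { "op_F32_QASYMM16", &Dispatcher::run_f32_to_q16 },
        };
        const auto it = table.find("op_" + src_type + "_" + dst_type);
        if(it == table.end())
        {
            throw std::runtime_error("Unsupported type combination");
        }
        _func = it->second;
    }
    void run() { (this->*_func)(); } // no branching at run time

private:
    void run_f32_to_q8()  { std::cout << "F32 -> QASYMM8\n"; }
    void run_f32_to_q16() { std::cout << "F32 -> QASYMM16\n"; }
    void (Dispatcher::*_func)(){ nullptr };
};

int main()
{
    Dispatcher d;
    d.configure("F32", "QASYMM8");
    d.run();
    return 0;
}
```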
-
-Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
- return Status{};
-}
-
-template <typename TIn, typename TOut>
-void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
- UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
- if(is_data_type_quantized_asymmetric(src->info()->data_type()))
- {
- uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
- }
-#ifdef __aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else //__aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
- auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step); x += window_step)
- {
- wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
- }
- },
- input, output);
-}
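run_quantize_qasymm8() is the classic vector-body-plus-scalar-tail loop: the row is processed in chunks of window_step (16) that each map to one vector load/quantize/store, and whatever remains is finished element by element. The control flow in isolation, with plain scalar code standing in for the NEON intrinsics:

```cpp
#include <cstdint>
#include <vector>

constexpr int kStep = 16; // mirrors window_step above

// Quantize a row: a body that advances in chunks of kStep (vectorizable),
// followed by a scalar tail for the remainder.
void quantize_row(const float *in, uint8_t *out, int start, int end,
                  float scale, int32_t offset)
{
    int x = start;
    for(; x <= end - kStep; x += kStep)
    {
        // In the kernel this whole chunk is one vloadq + vquantize + vstore.
        for(int i = 0; i < kStep; ++i)
        {
            const int32_t q = static_cast<int32_t>(in[x + i] / scale) + offset;
            out[x + i] = static_cast<uint8_t>(q < 0 ? 0 : (q > 255 ? 255 : q));
        }
    }
    for(; x < end; ++x) // compute left-over elements
    {
        const int32_t q = static_cast<int32_t>(in[x] / scale) + offset;
        out[x] = static_cast<uint8_t>(q < 0 ? 0 : (q > 255 ? 255 : q));
    }
}

int main()
{
    std::vector<float> in(37, 0.5f); // 37 = 2 full chunks + 5-element tail
    std::vector<uint8_t> out(in.size(), 0);
    quantize_row(in.data(), out.data(), 0, static_cast<int>(in.size()), 0.1f, 10);
    return 0;
}
```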
-
-template <typename T>
-void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
- UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
- if(is_data_type_quantized_asymmetric(src->info()->data_type()))
- {
- uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
- }
-#ifdef __aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else //__aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step); x += window_step)
- {
- uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
- vst1q_u16(&output_ptr[x], tmp.val[0]);
- vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
- }
- },
- input, output);
-}
-
-void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- (this->*_func)(src, dst, window);
-}
-
-const char *CpuQuantizeKernel::name() const
-{
- return "CpuQuantizeKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
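The rounding policy selected per architecture above only affects the scalar tail: AArch64 builds round to nearest-even before adding the zero point, other builds truncate toward zero. The affine quantization step itself is the usual q = clamp(round(x / scale) + offset, 0, 255); a scalar sketch of the AArch64 variant:

```cpp
#include <algorithm>
#include <cfenv>
#include <cmath>
#include <cstdint>
#include <iostream>

// Affine quantization with round-to-nearest-even, as on AArch64 builds.
uint8_t quantize_qasymm8(float x, float scale, int32_t offset)
{
    std::fesetround(FE_TONEAREST); // nearest-even, the default FP rounding mode
    const int32_t q = static_cast<int32_t>(std::nearbyint(x / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

int main()
{
    // 0.625 / 0.25 = 2.5 exactly; nearest-even rounds to 2, plus offset 10 -> 12.
    std::cout << int(quantize_qasymm8(0.625f, 0.25f, 10)) << '\n';
    return 0;
}
```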
diff --git a/src/core/cpu/kernels/CpuQuantizeKernel.h b/src/core/cpu/kernels/CpuQuantizeKernel.h
deleted file mode 100644
index d3422d3fbd..0000000000
--- a/src/core/cpu/kernels/CpuQuantizeKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H
-#define ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the quantization layer kernel.
- *
- * @note The implementation supports only 3D input tensors
- */
-class CpuQuantizeKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuQuantizeKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuQuantizeKernel);
-    /** Set the input and output tensor info.
- *
- * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
- *
- * @note Output auto initialization is not supported by this kernel
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuQuantizeKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- /** Common signature for all the specialised @ref CpuQuantizeKernel functions
- *
-     * @param[in]  src    Source tensor.
-     * @param[out] dst    Destination tensor.
-     * @param[in]  window Region on which to execute the kernel.
- */
- using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
- /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor.
- *
-     * @param[in]  src    Source tensor.
-     * @param[out] dst    Destination tensor.
-     * @param[in]  window Region on which to execute the kernel.
- */
- template <typename TIn, typename TOut>
- void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window);
- /** Function to apply QASYMM16 quantization on a tensor.
- *
-     * @param[in]  src    Source tensor.
-     * @param[out] dst    Destination tensor.
-     * @param[in]  window Region on which to execute the kernel.
- */
- template <typename T>
- void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window);
-
- QuantizeFunctionExecutorPtr _func{ nullptr };
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuReshapeKernel.cpp b/src/core/cpu/kernels/CpuReshapeKernel.cpp
deleted file mode 100644
index 5b717b9bba..0000000000
--- a/src/core/cpu/kernels/CpuReshapeKernel.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuReshapeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cstdint>
-
-/** [NEReshapeLayerKernel Kernel] **/
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
-
- if(dst->tensor_shape().total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size());
- }
-
- return Status{};
-}
-
-template <typename T>
-inline void reshape_tensor(const Window &window, const ITensor *src, ITensor *dst)
-{
- const TensorShape &src_shape = src->info()->tensor_shape();
- const TensorShape &dst_shape = dst->info()->tensor_shape();
- Coordinates dst_coord{};
-
- Iterator src_it(src, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- dst_coord = index2coords(dst_shape, coords2index(src_shape, id));
- *reinterpret_cast<T *>(dst->ptr_to_element(dst_coord)) = *reinterpret_cast<T *>(src_it.ptr());
- },
- src_it);
-}
-} // namespace
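reshape_tensor() exploits the fact that a reshape preserves every element's linear index: coords2index() flattens the source coordinate in dimension order (dimension 0 fastest-moving), and index2coords() re-expands that index in the destination shape. A self-contained version of the round trip:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

using Shape = std::vector<size_t>; // dimension 0 is the fastest-moving one

// Flatten a coordinate into a linear index over 'shape'.
size_t coords2index(const Shape &shape, const Shape &coords)
{
    size_t index = 0, stride = 1;
    for(size_t d = 0; d < shape.size(); ++d)
    {
        index += coords[d] * stride;
        stride *= shape[d];
    }
    return index;
}

// Expand a linear index back into a coordinate over 'shape'.
Shape index2coords(const Shape &shape, size_t index)
{
    Shape coords(shape.size());
    for(size_t d = 0; d < shape.size(); ++d)
    {
        coords[d] = index % shape[d];
        index /= shape[d];
    }
    return coords;
}

int main()
{
    const Shape src{ 4, 6 }, dst{ 8, 3 }; // same total size: 24 elements
    const Shape c = index2coords(dst, coords2index(src, { 3, 2 }));
    std::cout << c[0] << ',' << c[1] << '\n'; // element (3,2) of src lands at (3,1) of dst
    return 0;
}
```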
-
-void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
- ARM_COMPUTE_UNUSED(dst);
-
- // Configure kernel window
- Window win = calculate_max_window(*src);
-
- ICpuKernel::configure(win);
-}
-
-Status CpuReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
-
- return Status{};
-}
-
-void CpuReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- switch(src->info()->data_type())
- {
- case DataType::U8:
- case DataType::S8:
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- reshape_tensor<uint8_t>(window, src, dst);
- break;
- case DataType::U16:
- case DataType::S16:
- case DataType::F16:
- reshape_tensor<uint16_t>(window, src, dst);
- break;
- case DataType::U32:
- case DataType::S32:
- case DataType::F32:
- reshape_tensor<uint32_t>(window, src, dst);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type!");
- }
-}
-
-const char *CpuReshapeKernel::name() const
-{
- return "CpuReshapeKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-/** [NEReshapeLayerKernel Kernel] **/
diff --git a/src/core/cpu/kernels/CpuReshapeKernel.h b/src/core/cpu/kernels/CpuReshapeKernel.h
deleted file mode 100644
index add6782b9e..0000000000
--- a/src/core/cpu/kernels/CpuReshapeKernel.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_RESHAPE_KERNEL_H
-#define ARM_COMPUTE_CPU_RESHAPE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the kernel to perform tensor reshaping */
-class CpuReshapeKernel : public ICpuKernel
-{
-public:
- CpuReshapeKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuReshapeKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor info. Data type supported: All
-     * @param[out] dst Destination tensor info. Data type supported: Same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CpuReshapeKernel
- *
- * @param[in] src Source tensor info. Data type supported: All
- * @param[in] dst Destination tensor info. Data type supported: Same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_RESHAPE_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuScaleKernel.cpp b/src/core/cpu/kernels/CpuScaleKernel.cpp
deleted file mode 100644
index 29475fa63f..0000000000
--- a/src/core/cpu/kernels/CpuScaleKernel.cpp
+++ /dev/null
@@ -1,621 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuScaleKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Utility.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/scale/neon/list.h"
-#include "src/core/cpu/kernels/scale/sve/list.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_neon.h>
-#include <map>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-struct ScaleSelectorData
-{
- DataType dt;
-};
-using ScaleSelectorPtr = std::add_pointer<bool(const ScaleSelectorData &data)>::type;
-using ScaleKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *,
- InterpolationPolicy, BorderMode, PixelValue, float, bool, const Window &)>::type;
-struct ScaleKernel
-{
- const char *name;
- const ScaleSelectorPtr is_selected;
- ScaleKernelPtr ukernel;
-};
-
-static const ScaleKernel available_kernels[] =
-{
-#if defined(ENABLE_SVE)
- {
- "fp16_sve_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)
- },
- {
- "f32_sve_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)
- },
- {
- "qasymm8_sve_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)
- },
- {
- "qasymm8_signed_sve_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)
- },
- {
- "u8_sve_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)
- },
- {
- "s16_sve_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)
- },
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "common_neon_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale<float16_t>)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "common_neon_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>)
- },
- {
- "qasymm8_neon_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)
- },
- {
- "qasymm8_signed_neon_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)
- },
- {
- "common_neon_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale<uint8_t>)
- },
- {
- "common_neon_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale<int16_t>)
- },
-#endif /* defined(ENABLE_NEON) */
-};
-
-/** Micro-kernel selector
- *
- * @param[in] data Selection data passed to help pick the appropriate micro-kernel
- *
- * @return A matching micro-kernel, otherwise nullptr
- */
-const ScaleKernel *get_implementation(const ScaleSelectorData &data)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected(data))
- {
- return &uk;
- }
- }
- return nullptr;
-}
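available_kernels plus get_implementation() form a first-match-wins registry: each entry pairs a predicate over the selector data with a kernel pointer, and table order encodes preference (SVE entries are listed before the Neon ones, so they win when both are compiled in). A condensed stand-alone version of the pattern:

```cpp
#include <cstdio>

enum class DT { F32, U8 };
struct SelectorData { DT dt; };

using SelectorPtr = bool (*)(const SelectorData &);
using KernelPtr   = void (*)();

struct Kernel
{
    const char *name;
    SelectorPtr is_selected;
    KernelPtr   ukernel;
};

void run_f32() { std::puts("f32 kernel"); }
void run_u8()  { std::puts("u8 kernel"); }

// Order encodes preference: earlier entries win.
static const Kernel available_kernels[] = {
    { "f32_scale", [](const SelectorData &d) { return d.dt == DT::F32; }, run_f32 },
    { "u8_scale",  [](const SelectorData &d) { return d.dt == DT::U8; },  run_u8  },
};

// Return the first kernel whose predicate accepts the data, else nullptr.
const Kernel *get_implementation(const SelectorData &data)
{
    for(const auto &uk : available_kernels)
    {
        if(uk.is_selected(data))
        {
            return &uk;
        }
    }
    return nullptr;
}

int main()
{
    if(const Kernel *uk = get_implementation(SelectorData{ DT::U8 }))
    {
        uk->ukernel();
    }
    return 0;
}
```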
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy,
- const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- const auto *uk = get_implementation(ScaleSelectorData{ src->data_type() });
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(dst == src);
- ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
- ARM_COMPUTE_UNUSED(info.constant_border_value);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported");
-
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
- const auto width_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const auto height_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const auto output_width = dst->dimension(width_index);
- const auto output_height = dst->dimension(height_index);
- ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0);
-
- if(info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
- }
-
- if(info.interpolation_policy == InterpolationPolicy::BILINEAR)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
- if(dx != nullptr && dy != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
- }
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
-
- if(info.interpolation_policy == InterpolationPolicy::AREA)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8);
- }
-
- return Status{};
-}
-} // namespace
-
-CpuScaleKernel::CpuScaleKernel()
- : _func(nullptr), _policy(), _border_mode(), _constant_border_value(PixelValue()), _sampling_offset(0), _align_corners(false), _data_layout(DataLayout::UNKNOWN)
-{
-}
-
-void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets,
- ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_UNUSED(dx, dy, offsets);
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
- dx,
- dy,
- offsets,
- dst,
- info));
-
- // Get data layout and width/height indices
- _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- _policy = info.interpolation_policy;
- _border_mode = info.border_mode;
- _constant_border_value = info.constant_border_value;
- _align_corners = info.align_corners;
-
- if(info.sampling_policy == SamplingPolicy::CENTER)
- {
- _sampling_offset = 0.5f;
- }
-
- // Compute the ratio between source width/height and destination width/height
- const auto wr = scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners);
- const auto hr = scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners);
-
- // Area interpolation behaves as Nearest Neighbour in case of up-sampling
- _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy;
-
- if(_border_mode == BorderMode::UNDEFINED)
- {
- _border_mode = BorderMode::CONSTANT;
- _constant_border_value = PixelValue();
- }
-
-#ifdef ENABLE_NCHW_KERNELS
- // Configure scale function to run
- if(_data_layout == DataLayout::NCHW)
- {
- std::string function_to_call("scale_");
- function_to_call += string_from_data_type(src->data_type()) + "_";
- function_to_call += string_from_data_layout(_data_layout) + "_";
- function_to_call += string_from_interpolation_policy(_policy);
-
- static std::map<std::string, ScaleFunctionPtr> map_function =
- {
- { "scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8 },
-
- { "scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<uint8_t> },
- { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> },
-
- { "scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<uint8_t> },
- { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> },
-
- { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<int8_t> },
- { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int8_t> },
-
- { "scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<int16_t> },
- { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int16_t> },
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- { "scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float16_t> },
- { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float16_t> },
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
- { "scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float> },
- { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float> },
- };
- auto it = map_function.find(function_to_call);
- if(it != map_function.end())
- {
- _func = it->second;
- }
- }
-#endif // ENABLE_NCHW_KERNELS
-
- // Configure window
- Window win = calculate_max_window(*dst, Steps());
- ICpuKernel::configure(win);
-}
-
-#ifdef ENABLE_NCHW_KERNELS
-template <typename T>
-void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dx, dy);
- const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
-
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- // Set offsets window
- Window win_off;
- win_off.set(Window::DimX, window[Window::DimX]);
- win_off.set(Window::DimY, window[Window::DimY]);
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- // Create iterators
- Iterator src_i(src, win_in);
- Iterator dst_i(dst, window);
- Iterator offsets_i(offsets, win_off);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets_i.ptr());
- const auto in_yi = static_cast<int32_t>(_align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((
- id.y() + _sampling_offset)
- * hr));
- const int32_t offset_row = in_yi * in_stride_x;
- *reinterpret_cast<T *>(dst_i.ptr()) = *(reinterpret_cast<const T *>(src_i.ptr()) + offsets_ptr[0] + offset_row);
- },
- src_i, offsets_i, dst_i);
-}
-
-template <typename T>
-void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
- Window win_off;
- win_off.set(Window::DimX, window.x());
- win_off.set(Window::DimY, window.y());
-
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- Iterator src_i(src, win_in);
- Iterator dst_i(dst, window);
- Iterator offsets_i(offsets, win_off);
- Iterator dx_i(dx, win_off);
- Iterator dy_i(dy, win_off);
-
- const int32_t in_dim_w = src->info()->dimension(0);
- const int32_t in_dim_h = src->info()->dimension(1);
- const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right;
-
- if(_border_mode == BorderMode::CONSTANT)
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- using ConstType = T;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
- const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
- const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
- const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) : const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) : const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h
- && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h
- && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) :
- const_border_value;
-
- *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- src_i, offsets_i, dx_i, dy_i, dst_i);
- }
- else if(_border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
- const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
- const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
- const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- auto clamped_x = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_y = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w);
- const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w);
- const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w);
- const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w);
-
- *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- src_i, offsets_i, dx_i, dy_i, dst_i);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
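Both border branches above differ only in how the four neighbours a00..a11 are fetched; they end in the same delta_bilinear() blend of those samples with the fractional offsets. The helper is assumed to be the standard bilinear interpolation, written out here:

```cpp
#include <iostream>

// Standard bilinear blend of four neighbours with fractional offsets dx, dy in [0, 1].
float delta_bilinear(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float top    = a00 + dx * (a01 - a00); // interpolate along x, upper row
    const float bottom = a10 + dx * (a11 - a10); // interpolate along x, lower row
    return top + dy * (bottom - top);            // interpolate along y
}

int main()
{
    // Sampling the centre of a 2x2 patch averages all four samples.
    std::cout << delta_bilinear(0.f, 1.f, 2.f, 3.f, 0.5f, 0.5f) << '\n'; // 1.5
    return 0;
}
```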
-
-void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dx, dy, offsets);
- using namespace scale_helpers;
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8);
-
- // Don't increment in width/height/channels for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Iterator src_i(src, win_in);
- Iterator dst_i(dst, window);
-
- const auto wr = scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners);
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
- const auto w = src->info()->dimension(0);
- const auto h = src->info()->dimension(1);
- const size_t in_stride = src->info()->strides_in_bytes()[1];
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const uint8_t *>(src_i.ptr());
-
- uint8x8_t tmp0 = vdup_n_u8(0);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7);
-
- uint8x8_t tmp1 = vdup_n_u8(0);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7);
-
- vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1));
- },
- src_i, dst_i);
-}
-
-template <typename T>
-void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
-{
- // Get data layout and width/height indices
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), _align_corners);
- Window win_off;
- win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(idx_width, Window::Dimension(0, 0, 0));
- win_in.set(idx_height, Window::Dimension(0, 0, 0));
-
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- Iterator src_i(src, win_in);
- Iterator dst_i(dst, window);
-
- const int32_t in_dim_w = src->info()->dimension(idx_width);
- const int32_t in_dim_h = src->info()->dimension(idx_height);
- const int32_t stride_w = src->info()->strides_in_bytes()[idx_width];
- const int32_t stride_h = src->info()->strides_in_bytes()[idx_height];
-
- const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- if(_border_mode == BorderMode::CONSTANT)
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- using ConstType = T;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
- *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- src_i, dst_i);
- }
- else if(_border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
- const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
- const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
- const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
-
- const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
- *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- src_i, dst_i);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-#endif // ENABLE_NCHW_KERNELS
-
-Status CpuScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
- const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info));
- return Status{};
-}
-
-void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr && _data_layout == DataLayout::NCHW);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- const auto dx = tensors.get_const_tensor(TensorType::ACL_INT_0);
- const auto dy = tensors.get_const_tensor(TensorType::ACL_INT_1);
- const auto offsets = tensors.get_const_tensor(TensorType::ACL_INT_2);
-
- if(_data_layout == DataLayout::NCHW)
- {
- (this->*_func)(src, dst, dx, dy, offsets, window);
- }
- else
- {
- const auto *uk = get_implementation(ScaleSelectorData{ src->info()->data_type() });
- uk->ukernel(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window);
- }
-}
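
The NCHW branch above dispatches through a pointer-to-member function (_func), while other layouts resolve a ukernel from a registry. A minimal sketch of the pointer-to-member dispatch idiom, with hypothetical names:

#include <cstdio>

class Kernel
{
public:
    using FuncPtr = void (Kernel::*)(int);

    // Pick the member function once at configure time...
    void select(bool nearest) { _func = nearest ? &Kernel::nearest : &Kernel::bilinear; }
    // ...then dispatch with the same syntax as (this->*_func)(...) above.
    void run(int x) { (this->*_func)(x); }

private:
    void nearest(int x)  { std::printf("nearest(%d)\n", x); }
    void bilinear(int x) { std::printf("bilinear(%d)\n", x); }

    FuncPtr _func{ nullptr };
};

int main()
{
    Kernel k;
    k.select(false);
    k.run(7); // prints "bilinear(7)"
    return 0;
}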
-
-const char *CpuScaleKernel::name() const
-{
- return "CpuScaleKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuScaleKernel.h b/src/core/cpu/kernels/CpuScaleKernel.h
deleted file mode 100644
index 24790d16d7..0000000000
--- a/src/core/cpu/kernels/CpuScaleKernel.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_SCALEKERNEL_H
-#define ARM_COMPUTE_CPU_SCALEKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Arm(R) Neon(TM) kernel to perform scaling on a tensor */
-class CpuScaleKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuScaleKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuScaleKernel);
- /** Initialise the kernel's inputs, output and interpolation policy
- *
-     * @note dx, dy and offsets have the same dimensions (width and height) as the output tensor
- * @note Using @p policy Area only supports data layout NCHW and input data type U8.
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
-     * @param[in]  dx      Distance x tensor info. Fractional distance in x between the real sample coordinate and the top-left integer coordinate used for interpolation. Data type supported: F32
-     * @param[in]  dy      Distance y tensor info. Fractional distance in y between the real sample coordinate and the top-left integer coordinate used for interpolation. Data type supported: F32
- * @param[in] offsets Offset tensor info. Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
- * @param[out] dst Destination tensor info. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo to use for configuration
- */
- void configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst,
- const ScaleKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuScaleKernel
- *
-     * @note dx, dy and offsets have the same dimensions (width and height) as the output tensor
- * @note Using @p policy Area only supports data layout NCHW and input data type U8.
- *
-     * @param[in]  src     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
-     * @param[in]  dx      Distance x tensor info. Fractional distance in x between the real sample coordinate and the top-left integer coordinate used for interpolation. Data type supported: F32
-     * @param[in]  dy      Distance y tensor info. Fractional distance in y between the real sample coordinate and the top-left integer coordinate used for interpolation. Data type supported: F32
- * @param[in] offsets Offset tensor info. Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
- * @param[in] dst Destination tensor info. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo to use for validation
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst,
- const ScaleKernelInfo &info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
-#ifdef ENABLE_NCHW_KERNELS
- /** function to perform scale using area interpolation on the given window
- *
-     * @note Used only in the case of down-sampling.
- */
- void scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
-
- /** function to perform scale using bilinear interpolation on the given window */
- template <typename T>
- void scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
- /** function to perform scale using bilinear interpolation on the given window */
- template <typename T>
- void scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
-
- /** function to perform scale using nearest neighbour on the given window */
- template <typename T>
- void scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
-#endif // ENABLE_NCHW_KERNELS
-
-    /** Scale function to run for the configured interpolation policy and data type */
- using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window);
-
- ScaleFunctionPtr _func;
- InterpolationPolicy _policy;
- BorderMode _border_mode;
- PixelValue _constant_border_value;
- float _sampling_offset;
- bool _align_corners;
- DataLayout _data_layout;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_SCALEKERNEL_H */
diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
deleted file mode 100644
index 8ea186b16a..0000000000
--- a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuSoftmaxKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/softmax/impl/neon/list.h"
-#include "src/core/cpu/kernels/softmax/impl/sve/list.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-struct SoftmaxSelectorData
-{
- DataType dt;
-};
-using SoftmaxSelectorPtr = std::add_pointer<bool(const SoftmaxSelectorData &data)>::type;
-using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type;
-using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
-
-struct SoftmaxLogits1DKernel
-{
- const char *name;
- const SoftmaxSelectorPtr is_selected;
- SoftmaxLogits1DKernelPtr ukernel;
-};
-
-struct SoftmaxLogits1DMaxKernel
-{
- const char *name;
- const SoftmaxSelectorPtr is_selected;
- SoftmaxLogits1DMaxKernelPtr ukernel;
-};
-
-static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
-{
-#if defined(ENABLE_SVE)
- {
- "sve_softmax_logits_1d_float",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float>)
- },
- {
- "sve_softmax_logits_1d_float",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
- REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float16_t>)
- },
-#endif /* defined(ENABLE_SVE) */
-
-#if defined(ENABLE_NEON)
- {
- "neon_softmax_logits_1d_float",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float>)
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "neon_softmax_logits_1d_float",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float16_t>)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
-#endif /* defined(ENABLE_NEON) */
-
-#if defined(__ARM_FEATURE_SVE2)
- {
- "sve_softmax_logits_1d_quantized",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_t>)
- },
- {
- "sve_softmax_logits_1d_quantized",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_signed_t>)
- },
-#else /* !defined(__ARM_FEATURE_SVE2) */
- {
- "neon_softmax_logits_1d_quantized",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_t>)
- },
- {
- "neon_softmax_logits_1d_quantized",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_signed_t>)
- },
-#endif /* defined(__ARM_FEATURE_SVE2) */
-
-};
-
-static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] =
-{
-#if defined(ENABLE_SVE)
- {
- "sve_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_SVE(arm_compute::cpu::sve_logits_1d_max<float>)
- },
- {
- "sve_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
- REGISTER_FP16_SVE(arm_compute::cpu::sve_logits_1d_max<float16_t>)
- },
- {
- "sve_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_t>)
- },
- {
- "sve_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_signed_t>)
- },
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
- {
- "neon_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_logits_1d_max<float>)
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "neon_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_logits_1d_max<float16_t>)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "neon_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_t>)
- },
- {
- "neon_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_signed_t>)
- },
-#endif /* defined(ENABLE_NEON) */
-};
-
-const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data)
-{
- for(const auto &uk : available_logits_1d_kernels)
- {
- if(uk.is_selected({ data.dt }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-const SoftmaxLogits1DMaxKernel *get_implementation_logits_max(const SoftmaxSelectorData &data)
-{
- for(const auto &uk : available_logits_1d_max_kernels)
- {
- if(uk.is_selected({ data.dt }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
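
Both lookup helpers follow the registry pattern used throughout these kernels: scan a static table of {name, predicate, function pointer} entries and return the first entry whose predicate accepts the selection data. A self-contained sketch of the pattern (types and names here are illustrative, not the library's):

#include <cstdio>

enum class DataType { F32, F16, QASYMM8 };

// Registry entry: is_selected picks the entry, ukernel is the dispatch target.
struct KernelEntry
{
    const char *name;
    bool (*is_selected)(DataType);
    void (*ukernel)();
};

static const KernelEntry registry[] =
{
    { "f32_impl", [](DataType dt) { return dt == DataType::F32; },     []() { std::puts("running f32"); } },
    { "q8_impl",  [](DataType dt) { return dt == DataType::QASYMM8; }, []() { std::puts("running q8"); } },
};

// First-match lookup, mirroring get_implementation_logits* above.
static const KernelEntry *get_impl(DataType dt)
{
    for(const auto &uk : registry)
    {
        if(uk.is_selected(dt))
        {
            return &uk;
        }
    }
    return nullptr; // caller must handle "no kernel available"
}

int main()
{
    if(const KernelEntry *uk = get_impl(DataType::QASYMM8))
    {
        uk->ukernel(); // prints "running q8"
    }
    return 0;
}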
-
-Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-
- // Validate in case of configured output
- if(output.total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
- }
-
- return Status{};
-}
-
-} // namespace
-
-CpuLogits1DMaxKernel::CpuLogits1DMaxKernel()
-{
-}
-
-void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst));
-
- // Softmax across the x dimension
- const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1);
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
-
- Window win = calculate_max_window(*src, Steps());
-
- ICpuKernel::configure(win);
-}
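
configure() derives the max tensor's shape by collapsing dimension 0 to 1, since the kernel reduces along the x axis. A small sketch of that shape arithmetic, using a plain vector in place of TensorShape:

#include <cstdio>
#include <vector>

int main()
{
    // Input shape (x, y, z) = (128, 32, 4); the row-wise max collapses x to 1,
    // mirroring TensorShape(src->tensor_shape()).set(0, 1) above.
    std::vector<size_t> src_shape = { 128, 32, 4 };
    std::vector<size_t> max_shape = src_shape;
    max_shape[0] = 1;

    std::printf("max shape: (%zu, %zu, %zu)\n", max_shape[0], max_shape[1], max_shape[2]);
    return 0;
}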
-
-Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst));
-
- return Status{};
-}
-
-void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->info()->data_type() });
- uk->ukernel(src, dst, window);
-}
-
-const char *CpuLogits1DMaxKernel::name() const
-{
- return "CpuLogits1DMaxKernel";
-}
-
-namespace
-{
-Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max,
- const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log)
-{
- ARM_COMPUTE_UNUSED(beta);
- // Check input
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
-
- // Check max
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max);
-
- // Check output if configured
- if(dst.total_size() != 0)
- {
- const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info();
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
- ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization);
- }
-
- // Check tmp if configured
- if(tmp.total_size() != 0)
- {
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type();
- ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
- // We could potentially reduce tmp memory if we could predict or make an assumption
- // on the maximum number of threads that will run in parallel.
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp);
- }
-
- return Status{};
-}
-} // namespace
-
-template <bool IS_LOG>
-CpuLogits1DSoftmaxKernel<IS_LOG>::CpuLogits1DSoftmaxKernel()
- : _beta(1.0f)
-{
-}
-
-template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
-
- _beta = beta;
-
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
-
- // Output auto initialization if not yet initialized
- const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info();
- auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
-
- // Tmp auto initialization if not yet initialized
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
- auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
-
- // Configure kernel window
- Window win = calculate_max_window(*max, Steps());
-
- ICpuKernel::configure(win);
-}
-
-template <bool IS_LOG>
-Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *max,
- const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
-
- return Status{};
-}
-
-template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto max = tensors.get_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
- auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
-
- const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
- const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
-
- ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
-
- void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
-
- const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->info()->data_type() });
- uk->ukernel(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
-}
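
run_op() hands every thread a private slice of the shared tmp tensor: each slice holds one row's worth of elements and thread i starts at i * tmp_size_for_thread, which is exactly what the ERROR_ON above guards. A compact sketch of the slicing, assuming a flat byte buffer:

#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    const unsigned int num_threads      = 4;
    const unsigned int elems_per_row    = 256;           // stands in for valid_region().shape.x()
    const unsigned int element_size     = sizeof(float); // tmp is F32 for quantized inputs
    const unsigned int bytes_per_thread = element_size * elems_per_row;

    // Shared scratch buffer, sized so every thread owns a disjoint slice.
    std::vector<uint8_t> tmp(num_threads * bytes_per_thread);
    assert(tmp.size() >= num_threads * bytes_per_thread); // mirrors the ERROR_ON check

    for(unsigned int thread_id = 0; thread_id < num_threads; ++thread_id)
    {
        void *tmp_for_thread = tmp.data() + thread_id * bytes_per_thread;
        (void)tmp_for_thread; // each ukernel invocation would receive its own slice
    }
    return 0;
}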
-
-template <bool IS_LOG>
-const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
-{
- if(IS_LOG)
- {
-        return "CpuLogits1DLogSoftmaxKernel";
- }
- else
- {
-        return "CpuLogits1DSoftmaxKernel";
- }
-}
-
-template class CpuLogits1DSoftmaxKernel<true>;
-template class CpuLogits1DSoftmaxKernel<false>;
-
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.h b/src/core/cpu/kernels/CpuSoftmaxKernel.h
deleted file mode 100644
index aa10467965..0000000000
--- a/src/core/cpu/kernels/CpuSoftmaxKernel.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_SOFTMAXKERNEL_H
-#define ARM_COMPUTE_CPU_SOFTMAXKERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for identifying the max value of 1D logits */
-class CpuLogits1DMaxKernel : public ICpuKernel
-{
-public:
- /** Constructor */
- CpuLogits1DMaxKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel);
- /** Set the input and output tensors.
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p input
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuLogits1DMaxKernel
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-};
-
-/** Interface for softmax computation with a pre-computed max. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. */
-template <bool IS_LOG = false>
-class CpuLogits1DSoftmaxKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuLogits1DSoftmaxKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel);
-
- /** Set the input and output tensors.
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1.
- * Data types supported: same as @p input.
- * @param[out] dst Destination tensor info. Data types supported: same as @p input.
- * @param[in] beta A scaling factor for the exponent.
-     * @param[out] tmp  Auxiliary tensor info. Must be type F32 and the same shape as the input.
- */
- void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuLogits1DSoftmaxKernel
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1.
- * Data types supported: same as @p input.
- * @param[in] dst Destination tensor info. Data types supported: same as @p input.
- * @param[in] beta A scaling factor for the exponent.
-     * @param[in] tmp  Auxiliary tensor info. Must be type F32 and the same shape as the input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *max,
- const ITensorInfo *dst, const float beta, const ITensorInfo *tmp);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- float _beta;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SOFTMAXKERNEL_H */
diff --git a/src/core/cpu/kernels/CpuSubKernel.cpp b/src/core/cpu/kernels/CpuSubKernel.cpp
deleted file mode 100644
index d7057bbe2b..0000000000
--- a/src/core/cpu/kernels/CpuSubKernel.cpp
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuSubKernel.h"
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/sub/neon/list.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-struct SubSelectorData
-{
- DataType dt1;
- DataType dt2;
- DataType dt3;
-};
-
-using SubSelectorPtr = std::add_pointer<bool(const SubSelectorData &data)>::type;
-using SubKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
-
-struct SubKernel
-{
- const char *name;
- const SubSelectorPtr is_selected;
- SubKernelPtr ukernel;
-};
-
-static const SubKernel available_kernels[] =
-{
- {
- "sub_same_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
- REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "sub_same_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)); },
- REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "sub_same_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)
- },
- {
- "sub_same_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)
- },
- {
- "sub_same_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)
- },
- {
- "sub_u8_s16_s16_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_s16_s16_neon)
- },
- {
- "sub_s16_u8_s16_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_s16_u8_s16_neon)
- },
- {
- "sub_u8_u8_s16_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_u8_s16_neon)
- },
- {
- "sub_qasymm8_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)
- },
- {
- "sub_qasymm8_signed_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)
- },
- {
- "sub_qsymm16_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)
- },
-};
-
-/** Micro-kernel selector
- *
- * @param[in] dt1 Data type of the first source tensor
- * @param[in] dt2 Data type of the second source tensor
- * @param[in] dt3 Data type of the destination tensor
- *
- * @return A matching micro-kernel else nullptr
- */
-const SubKernel *get_implementation(DataType dt1, DataType dt2, DataType dt3)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected({ dt1, dt2, dt3 }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
-
- const auto *uk = get_implementation(src0.data_type(), src1.data_type(), dst.data_type());
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8)
- && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8)
- && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED)
- && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16)
- && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8)
- && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32)
- && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32)
- && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16),
-        "You called subtract with the wrong data types");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- (src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP)
- || (src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP)
- || (src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && policy == ConvertPolicy::WRAP),
- "Convert policy cannot be WRAP if datatype is quantized");
-
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::U8)
- && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && dst.data_type() == DataType::QASYMM8)
- && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && dst.data_type() == DataType::QASYMM8_SIGNED)
- && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && dst.data_type() == DataType::QSYMM16)
- && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32 && dst.data_type() == DataType::S32)
- && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32 && dst.data_type() == DataType::F32)
- && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16 && dst.data_type() == DataType::F16),
-            "You called subtract with the wrong data types");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
- "Wrong shape for dst");
- }
- return Status{};
-}
-} // namespace
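
validate_arguments() above rejects operand pairs whose shapes cannot broadcast: TensorShape::broadcast_shape signals incompatibility with an empty shape (total size 0). A minimal sketch of the dimension-wise rule, under the usual convention that a size of 1 stretches to match (an approximation of the behaviour, not the library's code):

#include <algorithm>
#include <cstdio>
#include <vector>

// Dimension-wise broadcast: equal sizes pass through, a size of 1 stretches,
// anything else is incompatible and yields an empty result.
static std::vector<size_t> broadcast_shape(const std::vector<size_t> &a, const std::vector<size_t> &b)
{
    std::vector<size_t> out(std::max(a.size(), b.size()), 1);
    for(size_t i = 0; i < out.size(); ++i)
    {
        const size_t da = i < a.size() ? a[i] : 1;
        const size_t db = i < b.size() ? b[i] : 1;
        if(da != db && da != 1 && db != 1)
        {
            return {}; // not broadcast compatible
        }
        out[i] = std::max(da, db);
    }
    return out;
}

int main()
{
    const auto ok  = broadcast_shape({ 8, 1, 4 }, { 8, 16, 4 }); // -> (8, 16, 4)
    const auto bad = broadcast_shape({ 8, 3, 4 }, { 8, 16, 4 }); // -> empty
    std::printf("ok dims: %zu, bad dims: %zu\n", ok.size(), bad.size());
    return 0;
}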
-
-void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(src0->tensor_shape(), src1->tensor_shape());
-
- // Auto initialize dst if not initialized
- set_shape_if_empty(*dst, out_shape);
-
- _policy = policy;
-
- // CpuSubKernel doesn't need padding so update_window_and_padding() can be skipped
- Window win = calculate_max_window(out_shape, Steps());
-
- ICpuKernel::configure(win);
-}
-
-Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy));
-
- return Status{};
-}
-
-void CpuSubKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
-
- // Dispatch kernel
- const auto *uk = get_implementation(src0->info()->data_type(), src1->info()->data_type(), dst->info()->data_type());
- uk->ukernel(src0, src1, dst, _policy, window);
-}
-
-const char *CpuSubKernel::name() const
-{
- return "CpuSubKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuSubKernel.h b/src/core/cpu/kernels/CpuSubKernel.h
deleted file mode 100644
index da114b6e08..0000000000
--- a/src/core/cpu/kernels/CpuSubKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_SUB_KERNEL_H
-#define ARM_COMPUTE_CPU_SUB_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the kernel to perform subtraction between two tensors */
-class CpuSubKernel : public ICpuKernel
-{
-public:
- CpuSubKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSubKernel);
-
- /** Initialise the kernel's src and dst.
- *
- * Valid configurations (src0,src1) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (QASYMM8, QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- *
- * @param[in] src0 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[in] src1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[out] dst The dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32.
-     * @param[in]  policy Overflow policy. Convert policy cannot be WRAP if the data type is quantized.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuSubKernel
- *
- * Valid configurations (src0,src1) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (QASYMM8, QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- *
- * @param[in] src0 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[in] src1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[in] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32.
-     * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if the data type is quantized.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- ConvertPolicy _policy{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SUB_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuTransposeKernel.cpp b/src/core/cpu/kernels/CpuTransposeKernel.cpp
deleted file mode 100644
index c7cafe94a8..0000000000
--- a/src/core/cpu/kernels/CpuTransposeKernel.cpp
+++ /dev/null
@@ -1,510 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuTransposeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-unsigned int num_elems_processed(size_t element_size)
-{
- switch(element_size)
- {
- case 1:
- return 8;
- case 2:
- case 4:
- return 4;
- default:
- break;
- }
-
- ARM_COMPUTE_ERROR("Element size not supported");
-}
-
-void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &window)
-{
- const int window_step_x = 8;
- const int window_step_y = 8;
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_start_y = window.y().start();
- const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
- const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
- const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
- const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
-
- // Check if we need a left-over loop for the y dimension
- bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
-
- Window window_in(window);
- window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
- {
- // Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
- {
- window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
- }
- else
- {
- window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
- }
- }
-
- Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Iterator output(out, window_out);
-
- // Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
- {
- Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 8x8 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes));
- const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes));
- const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes));
- const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes));
- const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes));
- const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes));
- const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes));
- const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes));
-
- // Transpose 2x2
- const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
- const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
- const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
- const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
-
- // Transpose 4x4
- const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
- const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
- const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
- const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
-
- // Transpose 8x8
- const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
- const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
- const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
- const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
-
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
- }
-
- // Compute left-over elements along the x dimension (1x8)
- for(; x < window_end_x; ++x)
- {
- const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes);
- const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes);
- const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes);
- const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes);
- const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes);
- const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes);
- const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes);
- const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes);
-
- uint8x8_t result = vdup_n_u8(0);
- result = vset_lane_u8(val0, result, 0);
- result = vset_lane_u8(val1, result, 1);
- result = vset_lane_u8(val2, result, 2);
- result = vset_lane_u8(val3, result, 3);
- result = vset_lane_u8(val4, result, 4);
- result = vset_lane_u8(val5, result, 5);
- result = vset_lane_u8(val6, result, 6);
- result = vset_lane_u8(val7, result, 7);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
-
- vst1_u8(output.ptr() + dst_offset_in_bytes, result);
- }
- },
- input, output);
- }
-
- if(left_over_loop_y)
- {
- window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
- window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
-
- Iterator input(in, window_in);
- Iterator output(out, window_out);
-
- // Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint8_t val0 = *input.ptr();
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
-
- *(output.ptr() + dst_offset_in_bytes) = val0;
- },
- input, output);
- }
-}
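
Each transpose variant splits the y range into a step-multiple body, handled by the SIMD block path, and a scalar tail handled element by element; window_end_y_multiple_of marks the boundary. A tiny sketch of the split, assuming the window starts at 0:

#include <cstdio>

int main()
{
    const int end = 29, step = 8;
    const int body_end = (end / step) * step; // 24: last multiple of the step

    for(int y = 0; y < body_end; y += step)
        std::printf("SIMD block rows [%d, %d)\n", y, y + step); // [0,8) [8,16) [16,24)
    for(int y = body_end; y < end; ++y)
        std::printf("scalar row %d\n", y); // rows 24..28
    return 0;
}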
-
-void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window)
-{
- const int window_step_x = 4;
- const int window_step_y = 4;
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_start_y = window.y().start();
- const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
- const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
- const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
- const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
-
- // Check if we need a left-over loop for the y dimension
- bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
-
- Window window_in(window);
- window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
- {
- // Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
- {
- window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
- }
- else
- {
- window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
- }
- }
-
- Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Iterator output(out, window_out);
-
- // Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
- {
- Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 4x4 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- // Transpose 2x2
- const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
- const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
-
- // Transpose 4x4
- const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
- const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
-
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));
- }
-
- // Compute left-over elements (1x4)
- for(; x < window_end_x; ++x)
- {
- const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- uint16x4_t result = vdup_n_u16(0);
- result = vset_lane_u16(val0, result, 0);
- result = vset_lane_u16(val1, result, 1);
- result = vset_lane_u16(val2, result, 2);
- result = vset_lane_u16(val3, result, 3);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
-
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
- }
- },
- input, output);
- }
-
- if(left_over_loop_y)
- {
- window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
- window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
-
- Iterator input(in, window_in);
- Iterator output(out, window_out);
-
- // Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr()));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
-
- *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
- },
- input, output);
- }
-}
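
The in-register transpose is a ladder of vtrn stages: vtrn_u16 swaps elements across 2x2 sub-blocks, then vtrn_u32 swaps the off-diagonal 2x2 quadrants; the 8-bit path adds one more stage (vtrn_u8) for its 8x8 blocks. A self-contained 4x4 uint16_t version of the same ladder, assuming an AArch64 target with NEON available (arm_neon.h):

#include <arm_neon.h>
#include <cstdio>

// Transpose a contiguous 4x4 block of uint16_t entirely in registers.
static void transpose_4x4_u16(const uint16_t *in, uint16_t *out)
{
    const uint16x4_t row0 = vld1_u16(in + 0);
    const uint16x4_t row1 = vld1_u16(in + 4);
    const uint16x4_t row2 = vld1_u16(in + 8);
    const uint16x4_t row3 = vld1_u16(in + 12);

    // Stage 1: transpose 2x2 sub-blocks.
    const uint16x4x2_t k0 = vtrn_u16(row0, row1);
    const uint16x4x2_t k1 = vtrn_u16(row2, row3);

    // Stage 2: swap the off-diagonal 2x2 quadrants.
    const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(k0.val[0]), vreinterpret_u32_u16(k1.val[0]));
    const uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(k0.val[1]), vreinterpret_u32_u16(k1.val[1]));

    vst1_u16(out + 0,  vreinterpret_u16_u32(c0.val[0]));
    vst1_u16(out + 4,  vreinterpret_u16_u32(c1.val[0]));
    vst1_u16(out + 8,  vreinterpret_u16_u32(c0.val[1]));
    vst1_u16(out + 12, vreinterpret_u16_u32(c1.val[1]));
}

int main()
{
    uint16_t in[16], out[16];
    for(int i = 0; i < 16; ++i) in[i] = static_cast<uint16_t>(i);
    transpose_4x4_u16(in, out);
    // Row-major input row 1 starts at value 4; after the transpose it is column 1.
    std::printf("out[1] = %u (was in[4])\n", static_cast<unsigned>(out[1]));
    return 0;
}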
-
-void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)
-{
- const int window_step_x = 4;
- const int window_step_y = 4;
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_start_y = window.y().start();
- const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
- const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
- const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
- const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
-
- // Check if we need a left-over loop for the y dimension
- bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
-
- Window window_in(window);
- window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
- {
- // Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
- {
- window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
- }
- else
- {
- window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
- }
- }
-
- Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Iterator output(out, window_out);
-
- // Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
- {
- Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 4x4 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- // Transpose 2x2
- const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
- const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
- const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
- const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
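-                // A 4x4 transpose is built from four 2x2 transposes: each vtrn_u32 above transposes
-                // one 2x2 block in place, and the stores below swap the two off-diagonal blocks,
-                // e.g. output row 0 becomes { row0[0], row1[0], row2[0], row3[0] } = vcombine_u32(k0_u32.val[0], k3_u32.val[0]).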
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
-
- // Swap block 01 with block 10 and store
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
- }
-
- // Compute left-over elements (1x4)
- for(; x < window_end_x; ++x)
- {
- const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- uint32x4_t result = vdupq_n_u32(0);
- result = vsetq_lane_u32(val0, result, 0);
- result = vsetq_lane_u32(val1, result, 1);
- result = vsetq_lane_u32(val2, result, 2);
- result = vsetq_lane_u32(val3, result, 3);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
-
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
- }
- },
- input, output);
- }
-
- if(left_over_loop_y)
- {
- window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
- window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
-
- Iterator input(in, window_in);
- Iterator output(out, window_out);
-
- // Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
-
- *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
- },
- input, output);
- }
-}
-} // namespace
-
-void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
-    // Auto-initialize the destination if not yet initialized
- const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst));
-
-    // Note: This kernel processes 16 elements per iteration.
-    // However, since a left-over loop is used on both dimensions (X and Y), no read or write
-    // can go out of bounds. For this reason num_elems_processed_per_iteration_x is set to 1.
- const unsigned int num_elems_processed_per_iteration_x = 1;
- const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size());
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    // CpuTransposeKernel doesn't need padding, so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(dst->num_dimensions());
- dst->set_valid_region(ValidRegion(coord, dst->tensor_shape()));
-
- ICpuKernel::configure(win);
-}
-
-Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
-
-    // Error if input is not 8-bit, 16-bit or 32-bit
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->element_size() != 1 && src->element_size() != 2 && src->element_size() != 4,
- "Element size not supported");
-
- // Validate configured destination
- if(dst->total_size() != 0)
- {
- const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- return Status{};
-}
-
-void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- switch(src->info()->element_size())
- {
- case 1:
- transpose_8bit_elements(src, dst, window);
- break;
- case 2:
- transpose_16bit_elements(src, dst, window);
- break;
- case 4:
- transpose_32bit_elements(src, dst, window);
- break;
- default:
- ARM_COMPUTE_ERROR("Element size not supported");
- break;
- }
-}
-
-const char *CpuTransposeKernel::name() const
-{
- return "CpuTransposeKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuTransposeKernel.h b/src/core/cpu/kernels/CpuTransposeKernel.h
deleted file mode 100644
index f09f427be8..0000000000
--- a/src/core/cpu/kernels/CpuTransposeKernel.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H
-#define ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Kernel which transposes the elements of a matrix */
-class CpuTransposeKernel : public ICpuKernel
-{
-public:
- CpuTransposeKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuTransposeKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in]  src Source tensor to permute. Data types supported: All
- * @param[out] dst Destination tensor. Data types supported: Same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuTransposeKernel
- *
- * @param[in] src Source tensor to permute. Data types supported: All
- * @param[in] dst Destination tensor. Data types supported: Same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H */
diff --git a/src/core/cpu/kernels/activation/list.h b/src/core/cpu/kernels/activation/list.h
deleted file mode 100644
index 409d025db0..0000000000
--- a/src/core/cpu/kernels/activation/list.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H
-#define SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_ACTIVATION_KERNEL(func_name) \
- void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
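-
-// Each declaration below expands to, for example:
-//   void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window);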
-
-DECLARE_ACTIVATION_KERNEL(qasymm8_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_sve_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_signed_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_signed_sve_activation);
-DECLARE_ACTIVATION_KERNEL(qsymm16_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qsymm16_sve_activation);
-DECLARE_ACTIVATION_KERNEL(fp16_neon_activation);
-DECLARE_ACTIVATION_KERNEL(fp16_sve_activation);
-DECLARE_ACTIVATION_KERNEL(fp32_neon_activation);
-DECLARE_ACTIVATION_KERNEL(fp32_sve_activation);
-
-#undef DECLARE_ACTIVATION_KERNEL
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H */
diff --git a/src/core/cpu/kernels/activation/neon/fp16.cpp b/src/core/cpu/kernels/activation/neon/fp16.cpp
deleted file mode 100644
index 6f2d5d8533..0000000000
--- a/src/core/cpu/kernels/activation/neon/fp16.cpp
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/NEMath.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-#ifndef __aarch64__
-inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask)
-{
- auto int_in = vreinterpretq_u16_f16(in);
- return vreinterpretq_f16_u16(wrapper::vand(int_in, mask));
-}
-#endif /* __aarch64__ */
-} // namespace
-
-void fp16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- constexpr int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
-    // On non-aarch64, a small delta value is added to the input
-    // to prevent NaN values caused by zeros in inputs to SQRT.
-    // On aarch64, vsqrt is called directly, so delta is not needed.
-#ifndef __aarch64__
-    const auto delta = wrapper::vdup_n(static_cast<float16_t>(1e-7), ExactTagType {});
-#endif /* __aarch64__ */
-
- const auto const_1 = wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
- const auto const_0 = wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
- const auto const_6 = wrapper::vdup_n(static_cast<float16_t>(6.f), ExactTagType{});
- const auto const_3 = wrapper::vdup_n(static_cast<float16_t>(3.f), ExactTagType{});
- const auto const_inv_6 = wrapper::vdup_n(static_cast<float16_t>(0.166666667f), ExactTagType{});
-
- constexpr float soft_relu_thresh = 12.f;
- const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<float16_t>(soft_relu_thresh), ExactTagType{});
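-    // For x > soft_relu_thresh, log(1 + exp(x)) differs from x by less than 1e-5,
-    // so SOFT_RELU simply forwards the input; this also avoids overflow in exp() for large x.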
-
- const auto va = wrapper::vdup_n(static_cast<float16_t>(act_info.a()), ExactTagType{});
- const auto vb = wrapper::vdup_n(static_cast<float16_t>(act_info.b()), ExactTagType{});
- const auto a = static_cast<float16_t>(act_info.a());
- const auto b = static_cast<float16_t>(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<float16_t, wrapper::traits::BitWidth::W128> tmp;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = wrapper::vabs(vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = wrapper::vmla(vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = wrapper::vmax(const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
-#ifdef __aarch64__
- tmp = wrapper::vsqrt(vin);
-#else /* __aarch64__ */
- {
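-                    // sqrt(x) is computed as 1 / rsqrt(x). Since rsqrt(0) is infinite, a small
-                    // delta is added to zero input lanes first, and those lanes are masked back
-                    // to zero in the result.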
- const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0, ExactTagType{}));
- tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
- tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
- }
-#endif /* __aarch64__ */
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = wrapper::vmul(vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float16_t in = *(reinterpret_cast<const float16_t *>(input_ptr + x));
- float16_t tmp;
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = std::abs(in);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = a * in + b;
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = static_cast<float16_t>(1) / (static_cast<float16_t>(1) + std::exp(-in));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = std::max<float16_t>(static_cast<float16_t>(0), in);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = std::min<float16_t>(a, std::max(static_cast<float16_t>(0), in));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = std::min<float16_t>(a, std::max<float16_t>(b, in));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = (in > 0) ? in : a * in;
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float16_t>(1) + std::exp(in));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = std::sqrt(in);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = in * in;
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = a * std::tanh(b * in);
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = in;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/core/cpu/kernels/activation/neon/fp32.cpp b/src/core/cpu/kernels/activation/neon/fp32.cpp
deleted file mode 100644
index 54301d45ad..0000000000
--- a/src/core/cpu/kernels/activation/neon/fp32.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-#ifndef __aarch64__
-inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask)
-{
- auto int_in = vreinterpretq_u32_f32(in);
- return vreinterpretq_f32_u32(wrapper::vand(int_in, mask));
-}
-#endif /* __aarch64__ */
-} // namespace
-
-void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
-
- constexpr int window_step_x = 4;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
-    // On non-aarch64, a small delta value is added to the input
-    // to prevent NaN values caused by zeros in inputs to SQRT.
-    // On aarch64, vsqrt is called directly, so delta is not needed.
-#ifndef __aarch64__
- const auto delta = wrapper::vdup_n(static_cast<float>(1e-24), ExactTagType {});
-#endif /* __aarch64__ */
- const auto const_1 = wrapper::vdup_n(static_cast<float>(1.f), ExactTagType {});
- const auto const_0 = wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
- const auto const_6 = wrapper::vdup_n(static_cast<float>(6.f), ExactTagType{});
- const auto const_3 = wrapper::vdup_n(static_cast<float>(3.f), ExactTagType{});
- const auto const_inv_6 = wrapper::vdup_n(static_cast<float>(0.166666667f), ExactTagType{});
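-    // const_3, const_6 and const_inv_6 implement HARD_SWISH: x * min(6, max(0, x + 3)) / 6.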
-
- constexpr float soft_relu_thresh = 12.f;
- const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<float>(soft_relu_thresh), ExactTagType{});
-
- const auto va = wrapper::vdup_n(static_cast<float>(act_info.a()), ExactTagType{});
- const auto vb = wrapper::vdup_n(static_cast<float>(act_info.b()), ExactTagType{});
- const auto a = static_cast<float>(act_info.a());
- const auto b = static_cast<float>(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<float, wrapper::traits::BitWidth::W128> tmp;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = wrapper::vabs(vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = wrapper::vmla(vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = wrapper::vmax(const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
-#ifdef __aarch64__
- tmp = wrapper::vsqrt(vin);
-#else /* __aarch64__ */
- {
- const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{}));
- tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
- tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
- }
-#endif /* __aarch64__ */
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = wrapper::vmul(vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float in = *(reinterpret_cast<const float *>(input_ptr + x));
- float tmp;
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = std::abs(in);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = a * in + b;
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = static_cast<float>(1) / (static_cast<float>(1) + std::exp(-in));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = std::max<float>(static_cast<float>(0), in);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = std::min<float>(a, std::max(static_cast<float>(0), in));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = std::min<float>(a, std::max<float>(b, in));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = (in > 0) ? in : a * in;
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float>(1) + std::exp(in));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = std::sqrt(in);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = in * in;
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = a * std::tanh(b * in);
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = in;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/activation/neon/qasymm8.cpp b/src/core/cpu/kernels/activation/neon/qasymm8.cpp
deleted file mode 100644
index a1217435b6..0000000000
--- a/src/core/cpu/kernels/activation/neon/qasymm8.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- constexpr int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
- const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in));
- const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in));
- const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in);
- const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in);
- const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in);
- const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
- const auto vconst_1 = vdupq_n_f32(1.f);
-#ifndef __aarch64__
- const auto vconst_0_f32 = vdupq_n_f32(0);
-#endif // __aarch64__
- const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
- const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
- const float a_f32 = act_info.a();
- const float b_f32 = act_info.b();
- const auto const_6_f32 = vdupq_n_f32(6.f);
- const auto const_0_f32 = vdupq_n_f32(0.f);
- const auto const_3_f32 = vdupq_n_f32(3.f);
- const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
-
- // Initialise scale/offset for re-quantization
- float s = qi_in.scale / qi_out.scale;
- float o = -qi_in.offset * s + qi_out.offset;
- float32x4_t vs = vdupq_n_f32(s);
- float32x4_t vo = vdupq_n_f32(o);
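-    // Derivation: x = (q_in - offset_in) * scale_in and q_out = x / scale_out + offset_out,
-    // hence q_out = q_in * (scale_in / scale_out) + (offset_out - offset_in * scale_in / scale_out) = q_in * s + o.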
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = vmaxq_u8(vconst_0, vin);
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_u8(va, vmaxq_u8(vb, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
- wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
- wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
- wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- const auto vin_deq = vdequantize(vin, qi_in);
-
-#ifdef __aarch64__
- const uint32x4x4_t pos_mask =
- {
- {
- wrapper::vcgtz(vin_deq.val[0]),
- wrapper::vcgtz(vin_deq.val[1]),
- wrapper::vcgtz(vin_deq.val[2]),
- wrapper::vcgtz(vin_deq.val[3]),
- }
- };
-#else // __aarch64__
- const uint32x4x4_t pos_mask =
- {
- {
- wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
- wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
- wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
- wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
- }
- };
-#endif // __aarch64__
-
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
- wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
- wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
- wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
- }
- };
-
- tmp = vquantize(tmp_dep, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
- qasymm8_t tmp = 0;
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- tmp = std::max(const_0, in);
- tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(const_0, in));
- tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(b, in));
- tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp
deleted file mode 100644
index 8b40bf8e72..0000000000
--- a/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- constexpr int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
- const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
- const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
- const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in);
- const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in);
- const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in);
- const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0);
- const auto vconst_1 = vdupq_n_f32(1.f);
-#ifndef __aarch64__
-    const auto vconst_0_f32    = vdupq_n_f32(0.f);
-#endif // __aarch64__
- const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
- const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
- const float a_f32 = act_info.a();
- const float b_f32 = act_info.b();
- const auto const_6_f32 = vdupq_n_f32(6.f);
- const auto const_0_f32 = vdupq_n_f32(0.f);
- const auto const_3_f32 = vdupq_n_f32(3.f);
- const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
-
- // Initialise scale/offset for re-quantization
- float s = qi_in.scale / qi_out.scale;
- float o = -qi_in.offset * s + qi_out.offset;
- float32x4_t vs = vdupq_n_f32(s);
- float32x4_t vo = vdupq_n_f32(o);
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = vmaxq_s8(vconst_0, vin);
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_s8(va, vmaxq_s8(vb, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
- wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
- wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
- wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- const auto vin_deq = vdequantize(vin, qi_in);
-
-#ifdef __aarch64__
- const uint32x4x4_t pos_mask =
- {
- {
- wrapper::vcgtz(vin_deq.val[0]),
- wrapper::vcgtz(vin_deq.val[1]),
- wrapper::vcgtz(vin_deq.val[2]),
- wrapper::vcgtz(vin_deq.val[3]),
- }
- };
-#else // __aarch64__
- const uint32x4x4_t pos_mask =
- {
- {
- wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
- wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
- wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
- wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
- }
- };
-#endif // __aarch64__
-
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
- wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
- wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
- wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
- }
- };
-
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
- qasymm8_signed_t tmp = 0;
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- tmp = std::max(const_0, in);
- tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(const_0, in));
- tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(b, in));
- tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/activation/neon/qsymm16.cpp b/src/core/cpu/kernels/activation/neon/qsymm16.cpp
deleted file mode 100644
index 54b41820f2..0000000000
--- a/src/core/cpu/kernels/activation/neon/qsymm16.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/NESymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void qsymm16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- constexpr int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
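-    // QSYMM16 is a symmetric format (zero offset), so only the scale is needed to (de)quantize.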
- const auto vconst_1 = vdupq_n_f32(1.f);
- const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
- const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
- const float a_f32 = act_info.a();
- const float b_f32 = act_info.b();
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const qsymm16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp;
- ARM_COMPUTE_UNUSED(tmp);
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
- // Perform activation
- const float32x4x2_t tmp_dep =
- {
- {
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_int16(tmp_dep, qi_out.scale);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
- // Perform activation
- const float32x4x2_t tmp_dep =
- {
- {
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_int16(tmp_dep, qi_out.scale);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- qsymm16_t in = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x));
- qsymm16_t tmp = 0;
- if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qsymm16(in, qi_in.scale);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qsymm16(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- float tmp_f = dequantize_qsymm16(in, qi_in.scale);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qsymm16(tmp_f, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/activation/sve/fp16.cpp b/src/core/cpu/kernels/activation/sve/fp16.cpp
deleted file mode 100644
index 5e76e82c52..0000000000
--- a/src/core/cpu/kernels/activation/sve/fp16.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstddef>
-
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const auto const_1 = svdup_n_f16(1.f);
- const auto const_0 = svdup_n_f16(0.f);
- const auto const_6 = svdup_n_f16(6.f);
- const auto const_3 = svdup_n_f16(3.f);
- const auto const_inv_6 = svdup_n_f16(0.166666667f);
-
- const auto va = svdup_n_f16(act_info.a());
- const auto vb = svdup_n_f16(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- svfloat16_t tmp;
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
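- // svwhilelt_b16 builds a predicate that is active only for lanes with x < window_end_x, so the final partial vector is handled without a scalar tail loop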
- do
- {
- const auto vin = svld1_f16(pg, input_ptr + x);
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = svabs_f16_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = svmla_f16_z(pg, vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = svmax_f16_z(pg, const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = svsqrt_f16_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = svmul_f16_z(pg, vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3)))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- svst1_f16(pg, output_ptr + x, tmp);
-
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
-
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file
diff --git a/src/core/cpu/kernels/activation/sve/fp32.cpp b/src/core/cpu/kernels/activation/sve/fp32.cpp
deleted file mode 100644
index cb9f82eb39..0000000000
--- a/src/core/cpu/kernels/activation/sve/fp32.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/SVEMath.h"
-
-#include <cmath>
-#include <cstddef>
-
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const auto const_1 = svdup_n_f32(1.f);
- const auto const_0 = svdup_n_f32(0.f);
- const auto const_6 = svdup_n_f32(6.f);
- const auto const_3 = svdup_n_f32(3.f);
- const auto const_inv_6 = svdup_n_f32(0.166666667f);
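- // 0.166666667f approximates 1/6 for hard swish: x * relu6(x + 3) / 6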
-
- const auto va = svdup_n_f32(act_info.a());
- const auto vb = svdup_n_f32(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- svfloat32_t tmp;
-
- // Process one full SVE vector per iteration; the governing predicate masks the tail
- int x = window_start_x;
- svbool_t pg = svwhilelt_b32(x, window_end_x);
- do
- {
- const auto vin = svld1_f32(pg, input_ptr + x);
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = svabs_f32_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = svmla_f32_z(pg, vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = svmax_f32_z(pg, const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = svsqrt_f32_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = svmul_f32_z(pg, vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3)))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- svst1_f32(pg, output_ptr + x, tmp);
-
- x += svcntw();
- pg = svwhilelt_b32(x, window_end_x);
-
- }
- while(svptest_any(svptrue_b32(), pg));
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file
diff --git a/src/core/cpu/kernels/activation/sve/qasymm8.cpp b/src/core/cpu/kernels/activation/sve/qasymm8.cpp
deleted file mode 100644
index 228b4ae530..0000000000
--- a/src/core/cpu/kernels/activation/sve/qasymm8.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstddef>
-
-#if defined(__ARM_FEATURE_SVE2)
-#include "src/core/NEON/SVEAsymm.h"
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
- const auto va = svdup_n_u8(quantize_qasymm8(act_info.a(), qi_in));
- const auto vb = svdup_n_u8(quantize_qasymm8(act_info.b(), qi_in));
- const auto const_0 = quantize_qasymm8(0.f, qi_in);
- const auto vconst_0 = svdup_n_u8(const_0);
- const auto vconst_1 = svdup_n_f32(1.f);
- const auto va_f32 = svdup_n_f32(act_info.a());
- const auto vb_f32 = svdup_n_f32(act_info.b());
- const auto const_6_f32 = svdup_n_f32(6.f);
- const auto const_0_f32 = svdup_n_f32(0.f);
- const auto const_3_f32 = svdup_n_f32(3.f);
- const auto const_inv_6_f32 = svdup_n_f32(0.166666667f);
-
- // Initialise scale/offset for re-quantization
- bool requant = true;
- if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
- {
- requant = false;
- }
- float s = qi_in.scale / qi_out.scale;
- float o = -qi_in.offset * s + qi_out.offset;
- auto vs = svdup_n_f32(s);
- auto vo = svdup_n_f32(o);
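- // Requantization in one step: q_out = q_in * s + o, derived from (q_in - offset_in) * scale_in == (q_out - offset_out) * scale_out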
-
- // Initialise scale/offset for re-quantization in int32 fixed point
- const auto voffset_in = svdup_n_s32(qi_in.offset);
- int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- const auto vs_s32 = svdup_n_s32(s_s32);
- const auto vo_s32 = svdup_n_s32(o_s32);
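- // s and o are also held as Q.8 fixed-point integers (scaled by 1 << 8); the integer paths multiply-add in int32 and shift right by 8 afterwards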
-
- // Initialise scale/offset for the leaky relu re-quantization
- int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
- arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
- const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
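- // The leaky variants fold the negative slope a into the scale/offset, so a single multiply-add both applies the slope and requantizes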
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- svuint8_t tmp;
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto vin = svld1_u8(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = svmax_u8_z(pg, vconst_0, vin);
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
- // Re-quantize to new output space
- tmp = svmla_qasymm8_z(pg, tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep =
- {
- { {
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep =
- {
- { {
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep =
- {
- { {
- svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- svbool_t p0, p1, p2, p3;
- svint32x4_t tmp_dep;
-
- // Expand to int32
- const svint32x4_t vin_s32 =
- {
- { {
- svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
- svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
- svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
- svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))),
- }
- }
- };
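- // svmovlb/svmovlt widen the even/odd lanes, so these four int32 vectors hold interleaved rather than consecutive elements; the matching qxtnb/qxtnt narrowing below restores the original order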
-
- // Compare elements to input offset
- if(qi_in.scale >= 0)
- {
- p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
- }
- else
- {
- p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
- }
-
- // Multiply negative elements and requantize if necessary
- if(requant)
- {
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
- }
- else
- {
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
- }
-
- // Narrow the int32 vectors to uint16 vectors (with unsigned saturation)
- const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
- const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
-
- // Narrow the uint16 vectors to uint8 vectors (with saturation)
- tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
-
- svst1_u8(pg, output_ptr + x, tmp);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
-
- }
- while(svptest_any(svptrue_b8(), pg));
-
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE2) */ \ No newline at end of file
diff --git a/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp
deleted file mode 100644
index 989f825eb9..0000000000
--- a/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <cmath>
-#include <cstddef>
-
-#if defined(__ARM_FEATURE_SVE2)
-#include "src/core/NEON/SVEAsymm.h"
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
- const auto va = svdup_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
- const auto vb = svdup_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
- const auto const_0 = quantize_qasymm8_signed(0.f, qi_in);
- const auto vconst_0 = svdup_n_s8(const_0);
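- // Zero quantized into the input domain; RELU clamps against this value rather than a literal 0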
- const auto vconst_1 = svdup_n_f32(1.f);
- const auto va_f32 = svdup_n_f32(act_info.a());
- const auto vb_f32 = svdup_n_f32(act_info.b());
- const auto const_6_f32 = svdup_n_f32(6.f);
- const auto const_0_f32 = svdup_n_f32(0.f);
- const auto const_3_f32 = svdup_n_f32(3.f);
- const auto const_inv_6_f32 = svdup_n_f32(0.166666667f);
-
- // Initialise scale/offset for re-quantization
- bool requant = true;
- if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
- {
- requant = false;
- }
- float s = qi_in.scale / qi_out.scale;
- float o = -qi_in.offset * s + qi_out.offset;
- auto vs = svdup_n_f32(s);
- auto vo = svdup_n_f32(o);
-
- // Initialise scale/offset for re-quantization in int32 fixed point
- const auto voffset_in = svdup_n_s32(qi_in.offset);
- int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- const auto vs_s32 = svdup_n_s32(s_s32);
- const auto vo_s32 = svdup_n_s32(o_s32);
-
- // Initialise scale/offset for the leaky relu re-quantization
- int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
- arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
- const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- svint8_t tmp;
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto vin = svld1_s8(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = svmax_s8_z(pg, vconst_0, vin);
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep =
- {
- { {
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep =
- {
- { {
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep =
- {
- { {
- svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- svbool_t p0, p1, p2, p3;
- svint32x4_t tmp_dep;
-
- // Expand to int32
- const svint32x4_t vin_s32 =
- {
- { {
- svmovlb_s32(svmovlb_s16(vin)),
- svmovlt_s32(svmovlb_s16(vin)),
- svmovlb_s32(svmovlt_s16(vin)),
- svmovlt_s32(svmovlt_s16(vin)),
- }
- }
- };
-
- // Compare elements to input offset
- if(qi_in.scale >= 0)
- {
- p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
- }
- else
- {
- p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
- }
-
- // Multiply negative elements and requantize if necessary
- if(requant)
- {
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
- }
- else
- {
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
- }
-
- // Narrow the int32 vectors to int16 vectors (with saturation)
- const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
- const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
-
- // Narrow the int16 vectors to int8 vectors (with saturation)
- tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
-
- svst1_s8(pg, output_ptr + x, tmp);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
-
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE2) */
diff --git a/src/core/cpu/kernels/activation/sve/qsymm16.cpp b/src/core/cpu/kernels/activation/sve/qsymm16.cpp
deleted file mode 100644
index 66974875da..0000000000
--- a/src/core/cpu/kernels/activation/sve/qsymm16.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/experimental/Types.h"
-
-#include <cmath>
-#include <cstddef>
-
-#if defined(__ARM_FEATURE_SVE2)
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/SVESymm.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void qsymm16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
- const auto vconst_1 = svdup_n_f32(1.f);
- const auto va_f32 = svdup_n_f32(act_info.a());
- const auto vb_f32 = svdup_n_f32(act_info.b());
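- // Only LOGISTIC and TANH are supported for QSYMM16: each s16 vector is dequantized into two f32 vectors, activated, then requantized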
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- svint16_t tmp;
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
- {
- const auto vin = svld1_s16(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
- // Perform activation
- const svfloat32x2_t tmp_dep =
- {
- { {
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
- // Perform activation
- const svfloat32x2_t tmp_dep =
- {
- { {
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
-
- svst1_s16(pg, output_ptr + x, tmp);
-
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
-
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE2) */
diff --git a/src/core/cpu/kernels/add/neon/integer.cpp b/src/core/cpu/kernels/add/neon/integer.cpp
deleted file mode 100644
index 24a0ac3b7c..0000000000
--- a/src/core/cpu/kernels/add/neon/integer.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_u8_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle it manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const int window_step_x = 8;
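- // 8 lanes per iteration: the u8 inputs are widened to s16, so one 128-bit vector holds 8 results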
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- if(policy == ConvertPolicy::WRAP)
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) + static_cast<int16_t>(*(input2_ptr + x));
- }
- }
- else
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = wrapper::add_sat(static_cast<int16_t>(*(input1_ptr + x)),
- static_cast<int16_t>(*(input2_ptr + x)));
- }
- }
- },
- input1, input2, output);
-}
-
-void add_s16_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle it manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- if(policy == ConvertPolicy::WRAP)
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = wrapper::vloadq(input1_ptr + x);
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = *(input1_ptr + x) + static_cast<int16_t>(*(input2_ptr + x));
- }
- }
- else
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = wrapper::vloadq(input1_ptr + x);
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
- }
- }
- },
- input1, input2, output);
-}
-
-void add_u8_s16_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Simply swap the two input buffers:
- add_s16_u8_s16_neon(src1, src0, dst, policy, window);
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/cpu/kernels/add/neon/list.h b/src/core/cpu/kernels/add/neon/list.h
deleted file mode 100644
index 3ab03dd40e..0000000000
--- a/src/core/cpu/kernels/add/neon/list.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_ADD_LIST_H
-#define SRC_CORE_NEON_KERNELS_ADD_LIST_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_ADD_KERNEL(func_name) \
- void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-
-DECLARE_ADD_KERNEL(add_qasymm8_neon);
-DECLARE_ADD_KERNEL(add_qasymm8_signed_neon);
-DECLARE_ADD_KERNEL(add_qsymm16_neon);
-DECLARE_ADD_KERNEL(add_s16_u8_s16_neon);
-DECLARE_ADD_KERNEL(add_u8_s16_s16_neon);
-DECLARE_ADD_KERNEL(add_u8_u8_s16_neon);
-
-#undef DECLARE_ADD_KERNEL
-
-template <typename ScalarType>
-void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>;
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle it manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- constexpr int window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- if(is_broadcast_across_x)
- {
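- // An X step of 0 identifies the input whose X dimension was broadcast (size 1), so its single value is reused across the row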
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
-
- // Clear X Dimension on execution window as we handle it manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
- const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle it manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto val1 = wrapper::vloadq(input1_ptr + x);
- const auto val2 = wrapper::vloadq(input2_ptr + x);
- const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto val1 = *(input1_ptr + x);
- const auto val2 = *(input2_ptr + x);
- *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_ADD_LIST_H
diff --git a/src/core/cpu/kernels/add/neon/qasymm8.cpp b/src/core/cpu/kernels/add/neon/qasymm8.cpp
deleted file mode 100644
index e357a7ef7f..0000000000
--- a/src/core/cpu/kernels/add/neon/qasymm8.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
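- // The ConvertPolicy is not used: the result always saturates when narrowed back to uint8 below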
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle it manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
- const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
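- // Both inputs are dequantized to fp32, added, then requantized by multiplying with 1/scale_out and adding the output offset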
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
- const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
- const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
- const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
-
- // Clear X Dimension on execution window as we handle it manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
- const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
-
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);
- const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);
- const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);
-
- const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
- const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
- const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
-
-#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#endif //__aarch64__
-
- const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle it manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
- const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
- const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
- const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t a = vld1q_u8(input1_ptr + x);
- const uint8x16_t b = vld1q_u8(input2_ptr + x);
-
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
- const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
- const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
-
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);
- const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);
- const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
-
-#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#endif //__aarch64__
-
- const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
- *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
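
The deleted kernel above implements quantized addition as dequantize, add in float, requantize. As a point of reference, here is a minimal scalar sketch of the same arithmetic, mirroring the kernel's own left-over loop; the helper name is hypothetical and not part of the library:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Hypothetical scalar reference for the vectorized QASYMM8 path above:
    // dequantize both inputs with their scale/offset, add in float, then
    // requantize and saturate to the uint8_t range. Note std::lround rounds
    // ties away from zero, while the AArch64 path rounds ties to even.
    inline uint8_t add_qasymm8_scalar(uint8_t a, uint8_t b,
                                      float scale1, int32_t offset1,
                                      float scale2, int32_t offset2,
                                      float oscale, int32_t ooffset)
    {
        const float   af = (static_cast<int32_t>(a) - offset1) * scale1;
        const float   bf = (static_cast<int32_t>(b) - offset2) * scale2;
        const int32_t r  = static_cast<int32_t>(std::lround((af + bf) / oscale + ooffset));
        return static_cast<uint8_t>(std::min(std::max(r, 0), 255));
    }
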
diff --git a/src/core/cpu/kernels/add/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/add/neon/qasymm8_signed.cpp
deleted file mode 100644
index d62d0739f5..0000000000
--- a/src/core/cpu/kernels/add/neon/qasymm8_signed.cpp
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
- const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
- const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
- const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
- const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const int8x16_t broadcast_value_vec = vdupq_n_s8(broadcast_value);
-
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2);
- const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2);
- const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2);
- const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
-
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
- const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
- const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
-
-#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#endif //__aarch64__
-
- const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), oq_info);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
- const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
- const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
- const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int8x16_t a = vld1q_s8(input1_ptr + x);
- const int8x16_t b = vld1q_s8(input2_ptr + x);
-
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
- const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
- const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
-
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2);
- const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2);
- const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
-
-#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#endif //__aarch64__
-
- const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
- *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), dst->info()->quantization_info());
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
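
One behavioural detail in these kernels: on AArch64 the float-to-int conversion uses vcvtnq_s32_f32 (round to nearest, ties to even), while the AArch32 fallback vcvtq_s32_f32 truncates toward zero, so requantized outputs can differ by one LSB between the two builds. A small standalone illustration of the two rounding behaviours:

    #include <cmath>
    #include <cstdio>

    // Illustration only: std::nearbyint under the default rounding mode
    // matches vcvtnq_s32_f32 (ties to even); a plain cast matches the
    // truncating vcvtq_s32_f32 fallback.
    int main()
    {
        const float vals[] = {2.5f, 3.5f, -1.5f};
        for (float v : vals)
        {
            std::printf("%+.1f -> nearest-even: %d, truncate: %d\n",
                        v, static_cast<int>(std::nearbyint(v)), static_cast<int>(v));
        }
        return 0;
    }
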
diff --git a/src/core/cpu/kernels/add/neon/qsymm16.cpp b/src/core/cpu/kernels/add/neon/qsymm16.cpp
deleted file mode 100644
index e76e408d6e..0000000000
--- a/src/core/cpu/kernels/add/neon/qsymm16.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
- const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
- const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
-
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2);
- const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
-#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
-#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
-#endif //__aarch64__
-
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
- vst1q_s16(output_ptr + x, pa);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8_t a = vld1q_s16(input1_ptr + x);
- const int16x8_t b = vld1q_s16(input2_ptr + x);
-
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
-#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
-#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
-#endif //__aarch64__
-
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
- vst1q_s16(output_ptr + x, pa);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
- *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info());
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
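
QSYMM16 is a symmetric format, so there is no zero point to subtract: dequantization is a single multiply by the scale, which is why this kernel carries no voffset vectors. A scalar sketch of the same arithmetic (hypothetical helper, not a library API):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Hypothetical scalar reference for the QSYMM16 path: no zero point, so
    // dequantize is value * scale; requantize divides by the output scale,
    // rounds, and saturates to the int16_t range.
    inline int16_t add_qsymm16_scalar(int16_t a, int16_t b,
                                      float scale1, float scale2, float oscale)
    {
        const float   sum = a * scale1 + b * scale2;
        const int32_t r   = static_cast<int32_t>(std::lround(sum / oscale));
        return static_cast<int16_t>(std::min<int32_t>(std::max<int32_t>(r, INT16_MIN), INT16_MAX));
    }
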
diff --git a/src/core/cpu/kernels/add/sve/impl.cpp b/src/core/cpu/kernels/add/sve/impl.cpp
deleted file mode 100644
index cf9e301c29..0000000000
--- a/src/core/cpu/kernels/add/sve/impl.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/cpu/kernels/add/sve/impl.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType>
-void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- const auto all_true_pg = wrapper::svptrue<ScalarType>();
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
- const bool is_sat = (policy == ConvertPolicy::SATURATE);
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()));
- Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
- Iterator output(dst, window);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value);
-
- int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
- auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
- svst1(pg, output_ptr + x, res);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto val1 = svld1(pg, input1_ptr + x);
- const auto val2 = svld1(pg, input2_ptr + x);
- const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
- svst1(pg, output_ptr + x, res);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
- }
-}
-
-template void add_same_sve<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file
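
Unlike the fixed-width NEON kernels above, the SVE implementation needs no scalar left-over loop: svwhilelt builds a predicate covering only the lanes still in range, and the loop runs until svptest_any reports an all-false predicate. A minimal sketch of that idiom for plain float addition, assuming an SVE-enabled toolchain:

    #include <arm_sve.h>

    // Minimal sketch of the predicated-loop idiom used by add_same_sve: the
    // predicate pg masks off lanes past n, so the final partial vector is
    // handled by the same loop body with no scalar tail.
    void add_f32_sve_sketch(const float *a, const float *b, float *out, int n)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b32(x, n);
        do
        {
            const svfloat32_t va = svld1_f32(pg, a + x);
            const svfloat32_t vb = svld1_f32(pg, b + x);
            svst1_f32(pg, out + x, svadd_f32_z(pg, va, vb));

            x += static_cast<int>(svcntw()); // 32-bit lanes per vector
            pg = svwhilelt_b32(x, n);
        }
        while (svptest_any(svptrue_b32(), pg));
    }
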
diff --git a/src/core/cpu/kernels/add/sve/impl.h b/src/core/cpu/kernels/add/sve/impl.h
deleted file mode 100644
index c38b1d47e0..0000000000
--- a/src/core/cpu/kernels/add/sve/impl.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_ADD_IMPL_H
-#define SRC_CORE_SVE_KERNELS_ADD_IMPL_H
-
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType>
-void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-} // namespace cpu
-} // namespace arm_compute
-#endif // defined(ENABLE_SVE)
-#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/sve/integer.cpp b/src/core/cpu/kernels/add/sve/integer.cpp
deleted file mode 100644
index bd8179205b..0000000000
--- a/src/core/cpu/kernels/add/sve/integer.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_u8_u8_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const auto all_true_pg = svptrue_b8();
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- if(policy == ConvertPolicy::WRAP)
- {
- int x = window_start_x;
- svbool_t pg_u = svwhilelt_b8(x, window_end_x);
- svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
- svbool_t pg_1 = svwhilelt_b16(x, static_cast<int>(window_end_x + svcnth()));
- do
- {
- const auto vsrc0 = svld1(pg_u, input1_ptr + x);
- const auto vsrc1 = svld1(pg_u, input2_ptr + x);
-
- const auto vsrc0_lo = svreinterpret_s16_u16(svunpklo(vsrc0));
- const auto vsrc0_hi = svreinterpret_s16_u16(svunpkhi(vsrc0));
- const auto vsrc1_lo = svreinterpret_s16_u16(svunpklo(vsrc1));
- const auto vsrc1_hi = svreinterpret_s16_u16(svunpkhi(vsrc1));
- svst1(pg_0, output_ptr + x, svqadd(vsrc0_lo, vsrc1_lo));
- svst1(pg_1, output_ptr + x + svcnth(), svqadd(vsrc0_hi, vsrc1_hi));
-
- x += svcntb();
- pg_u = svwhilelt_b8(x, window_end_x);
- pg_0 = svwhilelt_b16(x, window_end_x);
- pg_1 = svwhilelt_b16(x, static_cast<int>(window_end_x + svcnth()));
- }
- while(svptest_any(all_true_pg, pg_u));
- }
- else
- {
- int x = window_start_x;
- svbool_t pg_u = svwhilelt_b8(x, window_end_x);
- svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
- svbool_t pg_1 = svwhilelt_b16(x, static_cast<int>(window_end_x + svcnth()));
- do
- {
- const auto vsrc0 = svld1(pg_u, input1_ptr + x);
- const auto vsrc1 = svld1(pg_u, input2_ptr + x);
-
- const auto vsrc0_lo = svreinterpret_s16_u16(svunpklo(vsrc0));
- const auto vsrc0_hi = svreinterpret_s16_u16(svunpkhi(vsrc0));
- const auto vsrc1_lo = svreinterpret_s16_u16(svunpklo(vsrc1));
- const auto vsrc1_hi = svreinterpret_s16_u16(svunpkhi(vsrc1));
- svst1(pg_0, output_ptr + x, svqadd(vsrc0_lo, vsrc1_lo));
- svst1(pg_1, output_ptr + x + svcnth(), svqadd(vsrc0_hi, vsrc1_hi));
-
- x += svcntb();
- pg_u = svwhilelt_b8(x, window_end_x);
- pg_0 = svwhilelt_b16(x, window_end_x);
- pg_1 = svwhilelt_b16(x, static_cast<int>(window_end_x + svcnth()));
- }
- while(svptest_any(all_true_pg, pg_u));
- }
- },
- input1, input2, output);
-}
-
-void add_s16_u8_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const auto all_true_pg = svptrue_b8();
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- if(policy == ConvertPolicy::WRAP)
- {
- int x = window_start_x;
- svbool_t pg_u = svwhilelt_b8(x, window_end_x);
- svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
- svbool_t pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
- do
- {
- const auto vsrc0_0 = svld1_s16(pg_0, input1_ptr + x);
- const auto vsrc0_1 = svld1_s16(pg_1, input1_ptr + x + svcnth());
- const auto vsrc1_u8 = svld1_u8(pg_u, input2_ptr + x);
- const auto vsrc1_0 = svreinterpret_s16_u16(svunpklo(vsrc1_u8));
- const auto vsrc1_1 = svreinterpret_s16_u16(svunpkhi(vsrc1_u8));
- svst1_s16(pg_0, output_ptr + x, svadd_s16_z(pg_0, vsrc0_0, vsrc1_0));
- svst1_s16(pg_1, output_ptr + x + svcnth(), svadd_s16_z(pg_1, vsrc0_1, vsrc1_1));
-
- x += svcntb();
- pg_u = svwhilelt_b8(x, window_end_x);
- pg_0 = svwhilelt_b16(x, window_end_x);
- pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
- }
- while(svptest_any(all_true_pg, pg_u));
- }
- else
- {
- int x = window_start_x;
- svbool_t pg_u = svwhilelt_b8(x, window_end_x);
- svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
- svbool_t pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
- do
- {
- const auto vsrc0_0 = svld1_s16(pg_0, input1_ptr + x);
- const auto vsrc0_1 = svld1_s16(pg_1, input1_ptr + x + svcnth());
- const auto vsrc1_u8 = svld1_u8(pg_u, input2_ptr + x);
- const auto vsrc1_0 = svreinterpret_s16_u16(svunpklo(vsrc1_u8));
- const auto vsrc1_1 = svreinterpret_s16_u16(svunpkhi(vsrc1_u8));
-
- svst1_s16(pg_0, output_ptr + x, svqadd(vsrc0_0, vsrc1_0));
- svst1_s16(pg_1, output_ptr + x + svcnth(), svqadd(vsrc0_1, vsrc1_1));
-
- x += svcntb();
- pg_u = svwhilelt_b8(x, window_end_x);
- pg_0 = svwhilelt_b16(x, window_end_x);
- pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
- }
- while(svptest_any(all_true_pg, pg_u));
- }
- },
- input1, input2, output);
-}
-
-void add_u8_s16_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Simply swap the two input buffers:
- add_s16_u8_s16_sve(src1, src0, dst, policy, window);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file
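
The mixed-type kernels in this file widen both operands to 16 bits before adding. Two uint8_t values sum to at most 510, which always fits in int16_t, so the WRAP and SATURATE branches of add_u8_u8_s16_sve produce identical results even though both use the saturating svqadd. The scalar equivalent:

    #include <cstdint>

    // Scalar view of the widening u8 + u8 -> s16 addition: the sum fits in
    // int16_t (max 255 + 255 = 510), so wrap and saturate coincide here.
    inline int16_t add_u8_u8_s16_scalar(uint8_t a, uint8_t b)
    {
        return static_cast<int16_t>(static_cast<int16_t>(a) + static_cast<int16_t>(b));
    }
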
diff --git a/src/core/cpu/kernels/add/sve/list.h b/src/core/cpu/kernels/add/sve/list.h
deleted file mode 100644
index aebb43bb60..0000000000
--- a/src/core/cpu/kernels/add/sve/list.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_ADD_LIST_H
-#define SRC_CORE_SVE_KERNELS_ADD_LIST_H
-
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/add/sve/impl.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_ADD_KERNEL(func_name) \
- void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-
-DECLARE_ADD_KERNEL(add_qasymm8_sve);
-DECLARE_ADD_KERNEL(add_qasymm8_signed_sve);
-DECLARE_ADD_KERNEL(add_qsymm16_sve);
-DECLARE_ADD_KERNEL(add_s16_u8_s16_sve);
-DECLARE_ADD_KERNEL(add_u8_s16_s16_sve);
-DECLARE_ADD_KERNEL(add_u8_u8_s16_sve);
-
-#undef DECLARE_ADD_KERNEL
-
-} // namespace cpu
-} // namespace arm_compute
-#endif // defined(ENABLE_SVE)
-#endif // SRC_CORE_SVE_KERNELS_ADD_LIST_H
\ No newline at end of file
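
DECLARE_ADD_KERNEL stamps out one declaration per kernel with the shared signature and is #undef'd immediately, so no macro leaks out of the header. Since every kernel shares that signature, they can sit behind a single function-pointer type; the sketch below is illustrative under that assumption, not the library's actual registration code:

    #include "arm_compute/core/Types.h"

    namespace arm_compute
    {
    class ITensor;
    class Window;

    namespace cpu
    {
    // Function-pointer type matching the DECLARE_ADD_KERNEL signature.
    using AddKernelFn = void (*)(const ITensor *, const ITensor *, ITensor *,
                                 const ConvertPolicy &, const Window &);

    // Illustrative entry pairing a kernel name with its implementation.
    struct AddKernelEntry
    {
        const char *name;
        AddKernelFn fn;
    };
    } // namespace cpu
    } // namespace arm_compute
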
diff --git a/src/core/cpu/kernels/add/sve/qasymm8.cpp b/src/core/cpu/kernels/add/sve/qasymm8.cpp
deleted file mode 100644
index f6d1485e61..0000000000
--- a/src/core/cpu/kernels/add/sve/qasymm8.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE2)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_qasymm8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
- const auto all_true_pg = svptrue_b8();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
- const auto voffseto = svdup_n_f32(oq_info.offset);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
-
- const svfloat32_t vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale);
- const svfloat32_t vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale);
- const svint32_t voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset);
- const svint32_t voffset2 = is_broadcast_input_2 ? svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
- const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value);
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2);
-
- do
- {
- const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x);
-
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1);
-
- const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
- const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
-
- const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
- svst1_u8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const auto vscale1 = svdup_n_f32(iq1_info.scale);
- const auto vscale2 = svdup_n_f32(iq2_info.scale);
- const auto voffset1 = svdup_n_s32(iq1_info.offset);
- const auto voffset2 = svdup_n_s32(iq2_info.offset);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto a = svld1_u8(pg, input1_ptr + x);
- const auto b = svld1_u8(pg, input2_ptr + x);
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), vscale2);
-
- const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
- const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
- const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
-
- svst1_u8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE2) */
\ No newline at end of file
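
A structural note on the SVE2 widening used above: svmovlb/svmovlt operate on the even- and odd-indexed (bottom/top) lanes respectively, unlike NEON's vget_low/vget_high which split contiguous halves. The matching svqxtnb/svqxtnt narrowing writes the same even/odd lane positions back, so the kernel preserves element order without any explicit interleave. A two-function sketch of the pairing, assuming an SVE2 toolchain:

    #if defined(__ARM_FEATURE_SVE2)
    #include <arm_sve.h>

    // svmovlb widens the even-indexed bytes, svmovlt the odd-indexed ones;
    // svqxtnb/svqxtnt later narrow back into the same lane positions.
    svuint16_t widen_even_lanes(svuint8_t v) { return svmovlb_u16(v); }
    svuint16_t widen_odd_lanes(svuint8_t v)  { return svmovlt_u16(v); }
    #endif // __ARM_FEATURE_SVE2
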
diff --git a/src/core/cpu/kernels/add/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/add/sve/qasymm8_signed.cpp
deleted file mode 100644
index 8102aa5c65..0000000000
--- a/src/core/cpu/kernels/add/sve/qasymm8_signed.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE2)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_qasymm8_signed_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
- const auto voffseto = svdup_n_f32(oq_info.offset);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
- const auto all_true_pg = svptrue_b8();
-
- const auto vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale);
- const auto vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale);
- const auto voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset);
- const auto voffset2 = is_broadcast_input_2 ? svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = svdup_n_s8(broadcast_value);
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
-
- do
- {
- const auto a = svld1_s8(pg, non_broadcast_input_ptr + x);
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
- const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
-
- svst1_s8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const auto vscale1 = svdup_n_f32(iq1_info.scale);
- const auto vscale2 = svdup_n_f32(iq2_info.scale);
- const auto voffset1 = svdup_n_s32(iq1_info.offset);
- const auto voffset2 = svdup_n_s32(iq2_info.offset);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto a = svld1_s8(pg, input1_ptr + x);
- const auto b = svld1_s8(pg, input2_ptr + x);
-
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
- const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
-
- svst1_s8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE2) */ \ No newline at end of file
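For reference, the arithmetic that this deleted SVE2 kernel vectorises reduces to the following scalar model. This is a sketch, not library code; it assumes voffseto and invvscaleo are the output offset and reciprocal output scale defined earlier in the file, and that svcvt_s32_f32 truncates toward zero (FCVTZS):

#include <algorithm>
#include <cstdint>

// Scalar model of the QASYMM8_SIGNED addition: dequantise both inputs with
// their (scale, offset) pairs, add, requantise with the output parameters,
// then saturate to int8 as the svqxtnb/svqxtnt narrowing chain does.
static int8_t add_qasymm8_signed_scalar(int8_t a, int8_t b,
                                        float scale1, int32_t offset1,
                                        float scale2, int32_t offset2,
                                        float oscale, int32_t ooffset)
{
    const float   af = (static_cast<int32_t>(a) - offset1) * scale1;
    const float   bf = (static_cast<int32_t>(b) - offset2) * scale2;
    const int32_t r  = static_cast<int32_t>(ooffset + (af + bf) * (1.f / oscale));
    return static_cast<int8_t>(std::min<int32_t>(std::max<int32_t>(r, INT8_MIN), INT8_MAX));
}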
diff --git a/src/core/cpu/kernels/add/sve/qsymm16.cpp b/src/core/cpu/kernels/add/sve/qsymm16.cpp
deleted file mode 100644
index fb62257b0a..0000000000
--- a/src/core/cpu/kernels/add/sve/qsymm16.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE2)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_qsymm16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const auto vscale1 = svdup_n_f32(iq1_info.scale);
- const auto vscale2 = svdup_n_f32(iq2_info.scale);
- const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
- const auto all_true_pg = svptrue_b16();
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = svdup_n_s16(broadcast_value);
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2);
-
- do
- {
- const auto a = svld1_s16(pg, non_broadcast_input_ptr + x);
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
-
- const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
-
- svst1_s16(pg, output_ptr + x, res);
-
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
- {
- auto a = svld1_s16(pg, input1_ptr + x);
- auto b = svld1_s16(pg, input2_ptr + x);
-
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
-
- const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- svst1_s16(pg, output_ptr + x, res);
-
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE2) */ \ No newline at end of file
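The QSYMM16 variant follows the same pattern minus the offsets, since symmetric quantisation has none. A scalar sketch of what the predicated loop above computes per element (invvscaleo corresponds to 1.f / oscale here):

#include <algorithm>
#include <cstdint>

// Scalar model of the QSYMM16 addition: only the scales appear.
static int16_t add_qsymm16_scalar(int16_t a, int16_t b,
                                  float scale1, float scale2, float oscale)
{
    const float   af = static_cast<float>(a) * scale1;
    const float   bf = static_cast<float>(b) * scale2;
    const int32_t r  = static_cast<int32_t>((af + bf) * (1.f / oscale)); // truncates, like svcvt_s32_f32
    return static_cast<int16_t>(std::min<int32_t>(std::max<int32_t>(r, INT16_MIN), INT16_MAX));
}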
diff --git a/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
deleted file mode 100644
index 4b7b092d01..0000000000
--- a/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
-#define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp"
-
-#include "gemm_common.hpp"
-
-namespace arm_compute
-{
-class ITensor;
-
-namespace cpu
-{
-namespace kernel
-{
-/** This class is a wrapper for the assembly kernels.
- *
- * Some kernels were written in assembly and highly optimised for specific CPUs like A53 or A55.
- * This class works as a wrapper for these assembly kernels. The Arm Compute Library creates an instance
- * of CpuGemmAssemblyWrapperKernel and other auxiliary data structures to execute a single assembly kernel
- * in the context of an NEFunction.
- *
- * The template types TypeInput and TypeOutput are the operand and return types of the actual
- * kernel implemented in assembly, which is of type GemmCommon<TypeInput, TypeOutput>.
- */
-template <typename TypeInput, typename TypeOutput>
-class CpuGemmAssemblyWrapperKernel final : public INEKernel
-{
-public:
- /** Constructor
- */
- CpuGemmAssemblyWrapperKernel()
- : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel")
- {
- }
-
- CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete;
- CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default;
- CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete;
-
- const char *name() const override
- {
- return _name.c_str();
- }
-
- void run(const Window &window, const ThreadInfo &info) override
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
- auto win = arm_gemm::to_ndcoord(window);
-
- arm_gemm::ndcoord_t thread_locator{};
-
- _kernel->execute(win, thread_locator, info.thread_id);
- }
-
- // Inherited methods overridden:
- void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator) override
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
- //convert between arm_compute and arm_gemm types
- auto ndc_win = arm_gemm::to_ndcoord(window);
- auto ndc_tlc = arm_gemm::to_ndcoord(thread_locator);
-
- _kernel->execute(ndc_win, ndc_tlc, info.thread_id);
- }
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] kernel Pointer to an assembly kernel implementation.
- * @param[in] kernel_name_tag Tag to be attached to the kernel's name.
- */
- void configure(arm_gemm::GemmCommon<TypeInput, TypeOutput> *kernel, std::string kernel_name_tag)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
- _kernel = kernel;
-
- Window win = to_window(kernel->get_window_size());
-
- INEKernel::configure(win);
-
- if(!kernel_name_tag.empty())
- {
- _name += "/" + kernel_name_tag;
- }
- }
-
-private:
- arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel;
- std::string _name;
-};
-} // namespace kernel
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H */
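Pieced together from the interface above, the intended wiring is: obtain a GemmCommon from the arm_gemm factory (declared in arm_gemm.hpp later in this diff), hand it to configure(), and let the scheduler drive run() over the window derived from get_window_size(). A hedged sketch, with all sizes assumed for illustration; in the library this wiring lives in the assembly dispatch layer, not in user code:

// Sketch only; CPUInfo is the alias from arm_gemm_local.hpp.
void wire_up_assembly_gemm(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K,
                           unsigned int nbatches, unsigned int nmulti, int max_threads)
{
    arm_gemm::GemmArgs args(&ci, M, N, K, /*Ksections=*/1, nbatches, nmulti,
                            /*indirect_input=*/false, arm_gemm::Activation{}, max_threads);
    auto gemm = arm_gemm::gemm<float, float>(args); // UniqueGemmCommon<float, float>

    arm_compute::cpu::kernel::CpuGemmAssemblyWrapperKernel<float, float> wrapper;
    wrapper.configure(gemm.get(), "fp32"); // execution window comes from get_window_size()
    // The scheduler subsequently calls wrapper.run(window, thread_info) on each thread.
}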
diff --git a/src/core/cpu/kernels/assembly/arm_gemm.hpp b/src/core/cpu/kernels/assembly/arm_gemm.hpp
deleted file mode 100644
index 81e355d6b3..0000000000
--- a/src/core/cpu/kernels/assembly/arm_gemm.hpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <cstring>
-#include <memory>
-#include <vector>
-
-#include "arm_gemm_local.hpp"
-#include "gemm_common.hpp"
-
-namespace arm_gemm
-{
-enum class GemmMethod
-{
- DEFAULT,
- GEMV_BATCHED,
- GEMV_PRETRANSPOSED,
- GEMV_NATIVE_TRANSPOSED,
- GEMM_NATIVE,
- GEMM_HYBRID,
- GEMM_INTERLEAVED,
- GEMM_INTERLEAVED_2D,
- QUANTIZE_WRAPPER,
- QUANTIZE_WRAPPER_2D,
- GEMM_HYBRID_QUANTIZED,
- INDIRECT_GEMM,
- CONVOLUTION_GEMM
-};
-
-struct KernelDescription
-{
- GemmMethod method = GemmMethod::DEFAULT;
- std::string name = "";
- bool is_default = false;
- uint64_t cycle_estimate = 0;
-
- KernelDescription(GemmMethod m, std::string n, bool d = false, uint64_t c = 0)
- : method(m), name(n), is_default(d), cycle_estimate(c)
- {
- }
- KernelDescription() noexcept
- {
- }
-};
-
-struct GemmConfig
-{
- GemmMethod method = GemmMethod::DEFAULT;
- std::string filter = "";
- unsigned int inner_block_size = 0;
- unsigned int outer_block_size = 0;
-
- GemmConfig(GemmMethod method)
- : method(method)
- {
- }
- GemmConfig()
- {
- }
-};
-
-struct Activation
-{
- enum class Type
- {
- None,
- ReLU,
- BoundedReLU
- };
-
- Type type;
- float param1;
- float param2;
-
- Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f)
- : type(type), param1(p1), param2(p2)
- {
- }
-};
-
-struct GemmArgs
-{
-public:
- const CPUInfo *_ci;
- unsigned int _Msize;
- unsigned int _Nsize;
- unsigned int _Ksize;
- unsigned int _Ksections;
- unsigned int _nbatches;
- unsigned int _nmulti;
- bool _indirect_input;
- Activation _act;
- int _maxthreads;
- const GemmConfig *_cfg;
-
- GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N,
- unsigned int K, unsigned int Ksections, unsigned int nbatches,
- unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads,
- const GemmConfig *cfg = nullptr)
- : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _cfg(cfg)
- {
- }
-};
-
-struct Requantize32
-{
-public:
- const int32_t *bias = nullptr;
- size_t bias_multi_stride = 0;
- int32_t a_offset = 0;
- int32_t b_offset = 0;
- int32_t c_offset = 0;
- bool per_channel_requant = false;
- int32_t per_layer_left_shift = 0;
- int32_t per_layer_right_shift = 0;
- int32_t per_layer_mul = 0;
- const int32_t *per_channel_left_shifts = nullptr;
- const int32_t *per_channel_right_shifts = nullptr;
- const int32_t *per_channel_muls = nullptr;
- int32_t minval = 0;
- int32_t maxval = 0;
-
- Requantize32() = default;
-
- // Constructor for per-tensor quantization
- Requantize32(const int32_t *bias, size_t bias_multi_stride,
- int32_t a_offset, int32_t b_offset, int32_t c_offset,
- int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv)
- : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max<int32_t>(requant_shift, 0)),
- per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv)
- {
- }
-
- // Constructor for per-channel quantization
- Requantize32(const int32_t *bias, size_t bias_multi_stride,
- int32_t a_offset, int32_t b_offset, int32_t c_offset,
- const int32_t *requant_left_shifts,
- const int32_t *requant_right_shifts,
- const int32_t *requant_muls,
- int32_t minv, int32_t maxv)
- : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_left_shifts(requant_left_shifts),
- per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv)
- {
- }
-};
-
-struct Nothing
-{
-};
-
-template <typename Top, typename Tret>
-using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret>>;
-
-/* Low level API calls.
- * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */
-
-/* get_gemm_method(): Given the templated types and provided parameters,
- * which is the preferred method to implement this GEMM? */
-template <typename Top, typename Tret, class OutputStage = Nothing>
-KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {});
-
-template <typename Top, typename Tret, class OutputStage = Nothing>
-UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & = {});
-
-template <typename Top, typename Tret, class OutputStage = Nothing>
-std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & = {});
-
-} // namespace arm_gemm
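The three declarations at the bottom form the whole front-end: ask which method would be picked, enumerate the candidates, or instantiate the chosen kernel. A minimal sketch, assuming this header is included and a CPUInfo (the alias from arm_gemm_local.hpp) is available:

#include <vector>

void query_arm_gemm(const CPUInfo &ci)
{
    arm_gemm::GemmArgs args(&ci, /*M=*/128, /*N=*/128, /*K=*/64, /*Ksections=*/1,
                            /*nbatches=*/1, /*nmulti=*/1, /*indirect_input=*/false,
                            arm_gemm::Activation{}, /*maxthreads=*/4);

    // Preferred implementation for these parameters.
    const arm_gemm::KernelDescription best = arm_gemm::get_gemm_method<float, float>(args);

    // Every compatible implementation, with cycle estimates.
    const std::vector<arm_gemm::KernelDescription> candidates = arm_gemm::get_compatible_kernels<float, float>(args);

    // The instantiated kernel itself, ready for set_arrays()/execute().
    arm_gemm::UniqueGemmCommon<float, float> gemm = arm_gemm::gemm<float, float>(args);

    (void)best;
    (void)candidates;
    (void)gemm;
}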
diff --git a/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
deleted file mode 100644
index 718fcd1fb4..0000000000
--- a/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "arm_compute/core/Dimensions.h"
-#include "arm_compute/core/Window.h"
-
-#include "ndrange.hpp"
-
-#include <cassert>
-
-/* This file contains the mapping between the integral types used in arm_compute and arm_gemm.
- * These two codebases both require a degree of separation for the sake of modularity,
- * so each maintains its own types to represent similar information.
- */
-
-namespace arm_gemm
-{
-//we want to unify the maximum number of dimensions used between arm_gemm and the Arm Compute Library
-constexpr std::size_t ndrange_max =
- arm_compute::Dimensions<unsigned int>::num_max_dimensions;
-
-using ndrange_t = NDRange<ndrange_max>;
-using ndcoord_t = NDCoordinate<ndrange_max>;
-
-/* Converts an `arm_gemm::ndrange_t` to an `arm_compute::Window`
- *
- * As `NDRange<T>` does not encode start positions, we specify
- * the start to be zero in the produced `arm_compute::Window`
- *
- * @param [ndr] the `arm_gemm::ndrange_t` we wish to convert into an `arm_compute::Window`
- * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndr`
- */
-inline arm_compute::Window to_window(const ndrange_t &ndr)
-{
- arm_compute::Window win;
-
- for(unsigned int i = 0; i != ndrange_max; ++i)
- {
- //populate the window with the dimensions of the NDRange
- win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
- }
-
- return win;
-}
-
-/*
- * Converts an `arm_gemm::ndcoord_t` to an `arm_compute::Window`
- *
- * @param [ndc] the `arm_gemm::ndcoord_t` we wish to convert into an `arm_compute::Window`
- * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndc`
- */
-inline arm_compute::Window to_window(const ndcoord_t &ndc)
-{
- arm_compute::Window win;
-
- for(unsigned int i = 0; i != ndrange_max; ++i)
- {
- const auto start = ndc.get_position(i);
- const auto size = ndc.get_size(i);
- const auto stop = start + size;
-
- //populate the window with the start and end of each NDCoordinate dimension
- win.set(i, arm_compute::Window::Dimension(start, stop));
- }
-
- return win;
-}
-
-/** Convert an `arm_compute::Window` to an `arm_gemm::NDRange` of the same max dimensions
- *
- * It should be noted that `arm_compute::Window` specifies a `start()` and an `end()`,
- * whereas `arm_gemm::ndrange_t` only has a size; as a result we store the extent (end - start) of each dimension
- *
- * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndrange_t`
- * @return the resultant ndrange_t
- */
-inline ndrange_t to_ndrange(const arm_compute::Window &win)
-{
- return
- {
- static_cast<unsigned int>(win[0].end() - win[0].start()),
- static_cast<unsigned int>(win[1].end() - win[1].start()),
- static_cast<unsigned int>(win[2].end() - win[2].start()),
- static_cast<unsigned int>(win[3].end() - win[3].start()),
- static_cast<unsigned int>(win[4].end() - win[4].start()),
- static_cast<unsigned int>(win[5].end() - win[5].start())
- };
-}
-
-/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoordinate` of the same max dimensions
- *
- * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndcoord_t`
- * @return the resultant ndcoord_t
- */
-inline ndcoord_t to_ndcoord(const arm_compute::Window &win)
-{
- return
- {
- { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) },
- { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) },
- { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) },
- { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) },
- { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) },
- { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) }
- };
-}
-
-} //namespace arm_gemm
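Note the asymmetry the comments call out: an ndcoord_t round-trips both start and size, while an ndrange_t keeps only the size. A small worked example, assuming this header:

#include <cassert>

void conversion_demo()
{
    arm_compute::Window win;
    win.set(0, arm_compute::Window::Dimension(8, 24)); // start 8, end 24 -> size 16
    win.set(1, arm_compute::Window::Dimension(0, 4));

    const arm_gemm::ndcoord_t ndc = arm_gemm::to_ndcoord(win);
    assert(ndc.get_position(0) == 8 && ndc.get_size(0) == 16); // start preserved

    const arm_gemm::ndrange_t ndr = arm_gemm::to_ndrange(win);
    assert(ndr.get_size(0) == 16); // size preserved, the start offset of 8 is gone
}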
diff --git a/src/core/cpu/kernels/assembly/arm_gemm_local.hpp b/src/core/cpu/kernels/assembly/arm_gemm_local.hpp
deleted file mode 100644
index 78e0adf31f..0000000000
--- a/src/core/cpu/kernels/assembly/arm_gemm_local.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-/* This file is used to configure integration-specific aspects of arm_gemm into ACL */
-
-#include "arm_compute/core/CPP/CPPTypes.h"
-
-using CPUModel = arm_compute::CPUModel;
-using CPUInfo = arm_compute::CPUInfo;
diff --git a/src/core/cpu/kernels/assembly/convolution_parameters.hpp b/src/core/cpu/kernels/assembly/convolution_parameters.hpp
deleted file mode 100644
index 0c1ae58902..0000000000
--- a/src/core/cpu/kernels/assembly/convolution_parameters.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <cstdint>
-
-namespace arm_gemm
-{
-/*
- * Parameter set for "convolution" type GEMM.
- *
- * For a "convolution" GEMM, the GEMM parameters (M, K) are specified as if
- * an im2row had been performed on the input tensor to generate the operand
- * matrix, but instead this structure describes the convolution parameters
- * such that this can be done on the fly.
- *
- * The parameters describe the convolution details - the notional shape of
- * the input and output tensors, whether padding is to be applied, the size
- * of the kernel and a constant value to be used for padding (needed for
- * quantized tensors).
- *
- * The second part describes the layout of the input tensor in memory, which
- * is assumed to be in NHWC format. This consists of a base pointer and
- * strides for columns, rows and batches. 'multis' are not supported for
- * convolution type GEMMs.
- */
-struct ConvolutionParameters
-{
- int64_t input_width;
- int64_t input_height;
- int64_t input_channels;
- int64_t kernel_width;
- int64_t kernel_height;
- int64_t output_width;
- int64_t output_height;
- int64_t output_stride_w;
- int64_t output_stride_h;
- // output_channels not included as they do not affect the input.
- int64_t padding_top;
- int64_t padding_left;
- float padding_value;
-};
-
-} // namespace arm_gemm
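Since the struct stands in for an im2row that is never materialised, the notional GEMM shape follows directly from it: one row per output spatial position and one column per (kernel position, input channel) pair. A sketch of that bookkeeping; GemmShape and implied_gemm_shape are illustrative names, not library API:

#include <cstdint>

struct GemmShape
{
    int64_t M; // rows: output_width * output_height, one per output position
    int64_t K; // columns: kernel_width * kernel_height * input_channels
};

GemmShape implied_gemm_shape(const arm_gemm::ConvolutionParameters &p)
{
    return { p.output_width * p.output_height,
             p.kernel_width * p.kernel_height * p.input_channels };
}
// N (the number of output channels) is deliberately absent from the struct,
// as it does not affect how the input operand is addressed.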
diff --git a/src/core/cpu/kernels/assembly/gemm_common.hpp b/src/core/cpu/kernels/assembly/gemm_common.hpp
deleted file mode 100644
index 4af85ed663..0000000000
--- a/src/core/cpu/kernels/assembly/gemm_common.hpp
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "convolution_parameters.hpp"
-#include "ndrange.hpp"
-
-#include <cstddef>
-
-namespace arm_gemm
-{
-// Abstract class for the GEMM/GEMV functions.
-//
-// GEMM implementations may be "native" (never require any input
-// permutation), "pretransposed" (require permutation up-front) or require
-// working space (permute as they go along). This interface should support
-// all of them.
-
-// The real GemmCommon class is templated based on the operand and return
-// type. This is an interface class which is independent of those types.
-class IGemmCommon
-{
-public:
- /* Pass in the pointers to the arrays to be operated on and their
- * strides. This "generic" version uses void *s, the preferred version
- * is the one provided by templated GemmCommon (below) which takes
- * appropriately typed pointers. If B is pretransposed (see below) then
- * the settings for B here are ignored.
- */
- virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
-
- /** @returns an ndrange containing ranges of the compute space which can be
- * broken up and parallelised over
- */
- virtual ndrange_t get_window_size() const = 0;
-
- /* The maximum thread count is specified when the GEMM is created. Some
- * implementations need to know how many threads will actually run in
- * order to work properly.
- *
- * In some cases, after creating the GEMM, the number of threads needs to
- * be reduced (e.g. not enough work to split across threads). This
- * method allows the actual number of threads to be set (must be equal
- * to or lower than the maximum).
- *
- * This has an empty default implementation, as GEMMs which don't care
- * about thread count can safely ignore this.
- */
- virtual void set_nthreads(int) {};
-
- /* Whether this GEMM can be dynamically scheduled or not. */
- virtual bool supports_dynamic_scheduling() const
- {
- return false;
- }
-
- /** Main execute member function
- * @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size()
- * @param [in] thread_locator where we are inside the thread space
- * @param [in] threadid a unique thread id
- */
- virtual void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) = 0;
-
- /*** Working space interface (optional) ***/
- /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */
- virtual size_t get_working_size() const
- {
- return 0;
- }
- /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
- virtual void set_working_space(void *) {};
-
- /*** "Pretransposed" interface (optional) ***/
- /* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */
- virtual bool B_is_pretransposed() const
- {
- return false;
- }
- /* Does pretranspose still need to be done? */
- virtual bool B_pretranspose_required() const
- {
- return false;
- }
- /* Total number of bytes of space needed for pretransposed arrays. */
- virtual size_t get_B_pretransposed_array_size() const
- {
- return 0;
- }
- /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
- /* The "real" version of this depends on the templated operand type (see below). */
- virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
- /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
- virtual void set_pretransposed_B_data(void *)
- {
- }
-
- /*** "Quantized bias" interface (optional) ***/
- /* Set the bias vector for quantized GEMMs */
- virtual void set_quantized_bias(const int32_t *, size_t)
- {
- }
-
- /*** Indirect interface (optional) ***/
- /* Set the indirect table. This comprises a number of values per kernel point, and a densely packed
- * array of pointers (multis * batches * kernel_points entries). */
- virtual void set_indirect_parameters_generic(size_t, const void *const *const *)
- {
- }
-
- /*** Convolution interface (optional) ***/
- /* Set the convolution parameters. */
- virtual void set_convolution_parameters(ConvolutionParameters)
- {
- }
-
- // Destructor
- virtual ~IGemmCommon()
- {
- }
-};
-
-/* "Real" GemmCommon class which is templated on the operand and return types.
- *
- * In addition to correctly typed versions of the functions that operate on
- * operand and return data, this class provides a default implementation of
- * 'set_arrays' to capture the provided arguments in protected class
- * members, as essentially any implementation will need these.
- */
-template <typename To, typename Tr>
-class GemmCommon : public IGemmCommon
-{
-protected:
- const To *_Aptr = nullptr;
- int _lda = 0;
- int _A_batch_stride = 0;
- int _A_multi_stride = 0;
- const To *_Bptr = nullptr;
- int _ldb = 0;
- int _B_multi_stride = 0;
- Tr *_Cptr = nullptr;
- int _ldc = 0;
- int _C_batch_stride = 0;
- int _C_multi_stride = 0;
- const Tr *_bias = nullptr;
- int _bias_multi_stride = 0;
-
-public:
- /* Pass in the pointers to the arrays to be operated on and their
- * strides (templated version with appropriate types). */
- virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
- Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride)
- {
- _Aptr = A;
- _lda = lda;
- _A_batch_stride = A_batch_stride;
- _A_multi_stride = A_multi_stride;
- _Bptr = B;
- _ldb = ldb;
- _B_multi_stride = B_multi_stride;
- _Cptr = C;
- _ldc = ldc;
- _C_batch_stride = C_batch_stride;
- _C_multi_stride = C_multi_stride;
- _bias = bias;
- _bias_multi_stride = bias_multi_stride;
- }
-
- /* Implementation of the void * overload which casts its arguments to the appropriate type. */
- void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override
- {
- set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
- static_cast<const To *>(B), ldb, B_multi_stride,
- static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
- static_cast<const Tr *>(bias), bias_multi_stride);
- }
-
- /*** "Pretransposed" interface ***/
-
- /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
- /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
- virtual void pretranspose_B_array(void *, const To *, const int, const int) {};
-
- /* Implementation of the void * overload which casts its arguments to the appropriate type. */
- void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override
- {
- pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
- }
-
- /*** Indirect interface ***/
- virtual void set_indirect_parameters(size_t, const To *const *const *)
- {
- }
-
- void set_indirect_parameters_generic(size_t sz, const void *const *const *ptr) override
- {
- set_indirect_parameters(sz, reinterpret_cast<const To *const *const *>(ptr));
- }
-};
-
-} // namespace arm_gemm
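The pretranspose protocol above is a three-step handshake: query whether B needs permuting, size and allocate the buffer, then hand it over before execute(). A hedged sketch of a caller, which owns the buffer for the lifetime of all execute() calls:

#include <cstdint>
#include <vector>

void pretranspose_if_needed(arm_gemm::GemmCommon<float, float> &gemm,
                            const float *B, int ldb, int b_multi_stride,
                            std::vector<uint8_t> &storage)
{
    if(gemm.B_is_pretransposed() && gemm.B_pretranspose_required())
    {
        storage.resize(gemm.get_B_pretransposed_array_size());
        // storage must stay allocated until the last execute() has returned.
        gemm.pretranspose_B_array(storage.data(), B, ldb, b_multi_stride);
    }
}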
diff --git a/src/core/cpu/kernels/assembly/ndrange.hpp b/src/core/cpu/kernels/assembly/ndrange.hpp
deleted file mode 100644
index 1c8261aef7..0000000000
--- a/src/core/cpu/kernels/assembly/ndrange.hpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <algorithm>
-#include <array>
-#include <cassert>
-#include <initializer_list>
-
-namespace arm_gemm
-{
-template <unsigned int D>
-class NDRange
-{
-private:
- std::array<unsigned int, D> m_sizes{};
- std::array<unsigned int, D> m_totalsizes{};
-
- class NDRangeIterator
- {
- private:
- const NDRange &m_parent;
- unsigned int m_pos = 0;
- unsigned int m_end = 0;
-
- public:
- NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e)
- : m_parent(p), m_pos(s), m_end(e)
- {
- }
-
- bool done() const
- {
- return (m_pos >= m_end);
- }
-
- unsigned int dim(unsigned int d) const
- {
- unsigned int r = m_pos;
-
- if(d < (D - 1))
- {
- r %= m_parent.m_totalsizes[d];
- }
-
- if(d > 0)
- {
- r /= m_parent.m_totalsizes[d - 1];
- }
-
- return r;
- }
-
- bool next_dim0()
- {
- m_pos++;
-
- return !done();
- }
-
- bool next_dim1()
- {
- m_pos += m_parent.m_sizes[0] - dim(0);
-
- return !done();
- }
-
- unsigned int dim0_max() const
- {
- unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0));
-
- return dim(0) + offset;
- }
- };
-
- void set_totalsizes()
- {
- unsigned int t = 1;
-
- for(unsigned int i = 0; i < D; i++)
- {
- if(m_sizes[i] == 0)
- {
- m_sizes[i] = 1;
- }
-
- t *= m_sizes[i];
-
- m_totalsizes[i] = t;
- }
- }
-
-public:
- NDRange &operator=(const NDRange &rhs) = default;
- NDRange(const NDRange &rhs) = default;
-
- template <typename... T>
- NDRange(T... ts)
- : m_sizes{ ts... }
- {
- set_totalsizes();
- }
-
- NDRange(const std::array<unsigned int, D> &n)
- : m_sizes(n)
- {
- set_totalsizes();
- }
-
- NDRangeIterator iterator(unsigned int start, unsigned int end) const
- {
- return NDRangeIterator(*this, start, end);
- }
-
- unsigned int total_size() const
- {
- return m_totalsizes[D - 1];
- }
-
- unsigned int get_size(unsigned int v) const
- {
- return m_sizes[v];
- }
-};
-
-/** NDCoordinate builds upon a range, but specifies a starting position
- * in addition to a size which it inherits from NDRange
- */
-template <unsigned int N>
-class NDCoordinate : public NDRange<N>
-{
- using int_t = unsigned int;
- using ndrange_t = NDRange<N>;
-
- std::array<int_t, N> m_positions{};
-
-public:
- NDCoordinate &operator=(const NDCoordinate &rhs) = default;
- NDCoordinate(const NDCoordinate &rhs) = default;
- NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>> &list)
- {
- std::array<int_t, N> sizes{};
-
- std::size_t i = 0;
- for(auto &p : list)
- {
- m_positions[i] = p.first;
- sizes[i++] = p.second;
- }
-
- //update the parent's sizes
- static_cast<ndrange_t &>(*this) = ndrange_t(sizes);
- }
-
- int_t get_position(int_t d) const
- {
- assert(d < N);
-
- return m_positions[d];
- }
-
- void set_position(int_t d, int_t v)
- {
- assert(d < N);
-
- m_positions[d] = v;
- }
-
- int_t get_position_end(int_t d) const
- {
- return get_position(d) + ndrange_t::get_size(d);
- }
-}; //class NDCoordinate
-
-using ndrange_t = NDRange<6>;
-using ndcoord_t = NDCoordinate<6>;
-
-} // namespace arm_gemm
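The iterator decodes a linear position with the cumulative products kept in m_totalsizes: dim(d) is a modulo by totalsizes[d] (except for the last dimension) followed by a division by totalsizes[d - 1] (except for the first). A worked example, assuming this header:

#include <cassert>

void ndrange_demo()
{
    const arm_gemm::NDRange<3> range(4u, 3u, 2u); // totalsizes = {4, 12, 24}
    assert(range.total_size() == 24);

    // Linear position 17 = 1 + 4 * 1 + 12 * 1 decodes back to (1, 1, 1).
    auto it = range.iterator(17, range.total_size());
    assert(it.dim(0) == 1); // 17 % 4
    assert(it.dim(1) == 1); // (17 % 12) / 4
    assert(it.dim(2) == 1); // 17 / 12
}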
diff --git a/src/core/cpu/kernels/elementwise/neon/elementwise_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_list.h
deleted file mode 100644
index 43e44be5e2..0000000000
--- a/src/core/cpu/kernels/elementwise/neon/elementwise_list.h
+++ /dev/null
@@ -1,486 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H
-#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H
-
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
- int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
- int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const auto a = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
- for(; x < window_end_x; ++x)
- {
- const auto a = *(input1_ptr + x);
- const auto b = *(input2_ptr + x);
- *(output_ptr + x) = (*scalar_func)(a, b);
- }
- },
- input1, input2, output);
- }
-}
-
-template <ArithmeticOperation op, typename ScalarType>
-inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b)
-{
- auto res = ScalarType(0);
-
- switch(op)
- {
- case ArithmeticOperation::MAX:
- res = std::max(a, b);
- break;
- case ArithmeticOperation::MIN:
- res = std::min(a, b);
- break;
- case ArithmeticOperation::SQUARED_DIFF:
- {
- res = (a - b) * (a - b);
- break;
- }
- case ArithmeticOperation::PRELU:
- {
- res = (a > 0 ? a : a * b);
- break;
- }
- case ArithmeticOperation::DIV:
- {
- if(std::is_integral<ScalarType>::value)
- {
- // Guard the b == 0 case before dividing; integer division rounds toward negative infinity
- res = (b == 0) ? ScalarType(0) : ScalarType(a / b);
- if(b != 0 && static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
- {
- --res;
- }
- }
- else
- {
- res = a / b;
- }
- break;
- }
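- // Worked example of the integral path above: C++ '/' truncates toward zero,
- // but this operation implements floor division. For a = -7, b = 2:
- //   a / b gives -3 (truncated); the remainder is non-zero and the signs
- //   differ, so the result is decremented to -4 == floor(-7 / 2.0).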
- case ArithmeticOperation::POWER:
- {
- res = std::pow(a, b);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res;
-}
-
-template <ArithmeticOperation op, typename VectorType>
-inline typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b)
-{
- using vec_type = typename VectorType::type;
- using scalar_type = typename VectorType::scalar_type;
- using tag_type = typename VectorType::tag_type;
-
- vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
-
- switch(op)
- {
- case ArithmeticOperation::MAX:
- res = wrapper::vmax(a, b);
- break;
- case ArithmeticOperation::MIN:
- res = wrapper::vmin(a, b);
- break;
- case ArithmeticOperation::SQUARED_DIFF:
- {
- const vec_type tmp = wrapper::vsub(a, b);
- res = wrapper::vmul(tmp, tmp);
- break;
- }
- case ArithmeticOperation::PRELU:
- {
- const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
- const vec_type tmp = wrapper::vmul(a, b);
- const auto gt = wrapper::vcgt(a, zero);
-
- res = wrapper::vbsl(gt, a, tmp);
- break;
- }
-
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-
- return res;
-}
-
-template <>
-inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, const int32x4_t &b)
-{
- return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));
-}
-
-template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
-{
- return wrapper::vdiv(a, b);
-}
-
-template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
-{
- return wrapper::vpow(a, b);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
-{
- return wrapper::vdiv(a, b);
-}
-
-template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
-{
- return wrapper::vpow(a, b);
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder)
-{
- using tag_type = typename VectorType::tag_type;
- using vec_type = typename VectorType::type;
-
- vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{});
- return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
-}
-
-template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b));
- }
- return x;
-}
-
-template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
- wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
- }
- return x;
-}
-
-template <ArithmeticOperation op, typename VectorType>
-void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- using scalar_type = typename VectorType::scalar_type;
-
- elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window,
- &elementwise_arithm_op_scalar<op, scalar_type>,
- &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
- &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType>
-inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b)
-{
- bool res = false;
-
- switch(op)
- {
- case ComparisonOperation::Equal:
- res = (a == b);
- break;
- case ComparisonOperation::NotEqual:
- res = (a != b);
- break;
- case ComparisonOperation::Greater:
- res = (a > b);
- break;
- case ComparisonOperation::GreaterEqual:
- res = (a >= b);
- break;
- case ComparisonOperation::Less:
- res = (a < b);
- break;
- case ComparisonOperation::LessEqual:
- res = (a <= b);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
-}
-
-template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>
-inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
-{
- OutputVectorType res = { 0, 0, 0, 0 };
-
- switch(op)
- {
- case ComparisonOperation::Equal:
- res = wrapper::vceq(a, b);
- break;
- case ComparisonOperation::NotEqual:
- res = wrapper::vnot(wrapper::vceq(a, b));
- break;
- case ComparisonOperation::Greater:
- res = wrapper::vcgt(a, b);
- break;
- case ComparisonOperation::GreaterEqual:
- res = wrapper::vcge(a, b);
- break;
- case ComparisonOperation::Less:
- res = wrapper::vcgt(b, a);
- break;
- case ComparisonOperation::LessEqual:
- res = wrapper::vcge(b, a);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-
- return res;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>
-inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
-{
- InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
- return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- wrapper::vstore(output_ptr + x, a);
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
- const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
- }
- if(x <= window_end_x - 4)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- for(int i = 0; i < 4; i++)
- {
- *(output_ptr + x + i) = wrapper::vgetlane(a, i);
- }
- x += 4;
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b);
- wrapper::vstore(output_ptr + x, res);
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(res));
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto a = wrapper::vloadq(input1_ptr + x);
- auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- a = wrapper::vloadq(input1_ptr + x + 4);
- b = wrapper::vloadq(input2_ptr + x + 4);
- const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
- }
- if(x <= window_end_x - 4)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- for(int i = 0; i < 4; i++)
- {
- *(output_ptr + x + i) = wrapper::vgetlane(res, i);
- }
-        x += 4;
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H */
\ No newline at end of file
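The 16- and 32-bit comparison loops deleted above produce one full-width mask per lane (for example uint32x4_t for 32-bit inputs) and narrow it to the uint8 output tensor through chains of wrapper::vmovn/wrapper::vcombine. A minimal standalone sketch of that narrowing, written with raw NEON intrinsics rather than the library's wrapper layer (illustrative only, not part of the diff):

#include <arm_neon.h>
#include <cstdint>

// Narrow two 4-lane 32-bit comparison masks (each lane all-ones or
// all-zeros) down to eight uint8 results, mirroring
// wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))).
static inline uint8x8_t narrow_comparison_masks(uint32x4_t a, uint32x4_t b)
{
    const uint16x4_t lo = vmovn_u32(a);     // 4 x u32 -> 4 x u16
    const uint16x4_t hi = vmovn_u32(b);
    return vmovn_u16(vcombine_u16(lo, hi)); // 8 x u16 -> 8 x u8
}

Because vmovn keeps the low bits, a lane of 0xFFFFFFFF narrows to 0xFF and a false lane stays 0x00, so the narrowed bytes are valid uint8 comparison results.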
diff --git a/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h
deleted file mode 100644
index 1ff4632f5c..0000000000
--- a/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h
+++ /dev/null
@@ -1,654 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
-#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
-
-#include "src/core/cpu/kernels/elementwise/neon/elementwise_list.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
-{
- qasymm8x16_t x = vld1q_u8(input1_ptr);
- const float32x4x4_t out =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
- }
- };
- return out;
-}
-
-float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
-{
- qasymm8x16_signed_t x = vld1q_s8(input1_ptr);
- const float32x4x4_t out =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
- }
- };
- return out;
-}
-
-void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out)
-{
- const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1])));
- const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3])));
- vst1q_u8(output_ptr, vcombine_u8(pa, pb));
-}
-
-void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)
-{
- const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
- const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
- vst1q_u8(output_ptr, vcombine_u8(pa, pb));
-}
-
-void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
-{
- int32x4x4_t out =
- {
- {
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
- }
- };
- store_quantized(output_ptr, out);
-}
-
-void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out)
-{
- const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
- const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
- vst1q_s8(output_ptr, vcombine_s8(pa, pb));
-}
-
-void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
-{
- int32x4x4_t out =
- {
- {
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
- }
- };
- store_quantized_signed(output_ptr, out);
-}
-
-template <ArithmeticOperation op>
-inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
-{
- return quantize_qasymm8(elementwise_arithm_op_scalar<op>(a, b), qinfo);
-}
-
-template <ArithmeticOperation op>
-inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
-{
- return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo);
-}
-
-template <ArithmeticOperation op>
-inline float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)
-{
- using neon_vector_float = wrapper::traits::neon_vector<float, 4>;
- float32x4x4_t out =
- {
- {
- elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]),
- elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]),
- elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]),
- elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]),
- }
- };
- return out;
-}
-
-template <ComparisonOperation op>
-inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
-{
- ARM_COMPUTE_UNUSED(qinfo);
- return elementwise_comp_op_scalar<op>(a, b);
-}
-
-template <ComparisonOperation op>
-inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b)
-{
- uint32x4x4_t out =
- {
- {
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])
- }
- };
- return out;
-}
-
-template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get inputs and compute output
- const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
- const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
- const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
- store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-
-template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get inputs and compute output
- const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
- const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
- const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
- store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-
-template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
- store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-
-template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
- store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-
-template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
-{
- ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
- const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
- const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);
- store_quantized(output_ptr + x, rf);
- }
- return x;
-}
-
-template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
-{
- ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
- const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
- const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);
- store_quantized(output_ptr + x, rf);
- }
- return x;
-}
-
-template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
-{
- ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
- store_quantized(output_ptr + x, rf);
- }
- return x;
-}
-
-template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
-{
- ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
- store_quantized(output_ptr + x, rf);
- }
- return x;
-}
-
-void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
-
-    // Output quantization info (add 0.5 so the float-to-int truncation rounds to the nearest integer; exact halves round away from zero)
- const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f);
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
-
- if(is_broadcast_across_x)
- {
- // Select the broadcast input on the X axis
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
- const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
- const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
- voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
- const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
- const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
-
- // Input1 quantization info
- const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
- const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
-
- // Input2 quantization info
- const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
- const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
- vscale1, vscale2, voffseto, invvscaleo);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
- const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
- *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
- }
- },
- input1, input2, output);
- }
-}
-
-void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
-
- const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
-
- if(is_broadcast_across_x)
- {
- // Select the broadcast input on the X axis
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
- const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
- voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
- const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
- const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
-
- // Input1 quantization info
- const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
- const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
-
- // Input2 quantization info
- const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
- const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
- vscale1, vscale2, voffseto, invvscaleo);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
- const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
- *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
- }
- },
- input1, input2, output);
- }
-}
-
-void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
-
- const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
-
- if(is_broadcast_across_x)
- {
- // Select the broadcast input on the X axis
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
- const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
- voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
- const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
- const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
-
- // Input1 quantization info
- const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
- const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
-
- // Input2 quantization info
- const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
- const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
- vscale1, vscale2, voffseto, invvscaleo);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
- const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
- *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
- }
- },
- input1, input2, output);
- }
-}
-
-template <ArithmeticOperation op>
-void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar<op>,
- &elementwise_arithm_op_quantized_broadcast_loop<op>,
- &elementwise_arithm_op_quantized_loop<op>);
-}
-
-template <ArithmeticOperation op>
-void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar<op>,
- &elementwise_arithm_op_quantized_signed_broadcast_loop<op>,
-                                    &elementwise_arithm_op_quantized_signed_loop<op>);
-}
-
-template <ComparisonOperation op>
-void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
- &elementwise_comp_op_quantized_broadcast_loop<op>,
- &elementwise_comp_op_quantized_loop<op>);
-}
-
-template <ComparisonOperation op>
-void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
- &elementwise_comp_op_quantized_signed_broadcast_loop<op>,
- &elementwise_comp_op_quantized_signed_loop<op>);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
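The quantized kernels in the file deleted above all follow the same dequantize-compute-requantize pattern: load 16 quantized values, widen them to four float32x4_t, apply the float operator, then map back with vmlaq_f32(voffseto, rf, invvscaleo) before narrowing and storing. A simplified scalar model of that round trip, using hypothetical helper names (dequant_u8/requant_u8 are not library functions):

#include <algorithm>
#include <cstdint>

// Dequantize: real_value = (quantized - offset) * scale
static inline float dequant_u8(uint8_t q, float scale, int32_t offset)
{
    return (static_cast<int32_t>(q) - offset) * scale;
}

// Requantize: quantized = real_value / scale + offset. The +0.5 is
// folded in so the truncating float->int conversion rounds to nearest,
// matching voffseto = offset + 0.5f in elementwise_op_quantized above.
static inline uint8_t requant_u8(float v, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(v * (1.f / scale) + offset + 0.5f);
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255));
}

Note that static_cast<int32_t> truncates toward zero, like vcvtq_s32_f32; the vector path additionally relies on the saturating vqmovn/vqmovun narrowing instead of the explicit clamp shown here.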
diff --git a/src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h
deleted file mode 100644
index 307e95fae9..0000000000
--- a/src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H
-#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType>
-inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a)
-{
- switch(op)
- {
- case ElementWiseUnary::RSQRT:
-            return 1 / std::sqrt(a);
- case ElementWiseUnary::EXP:
- return std::exp(a);
- case ElementWiseUnary::NEG:
- return -a;
- case ElementWiseUnary::LOG:
- return std::log(a);
- case ElementWiseUnary::ABS:
- return std::abs(a);
- case ElementWiseUnary::ROUND:
- return support::cpp11::nearbyint(a);
- case ElementWiseUnary::SIN:
- return std::sin(a);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-template <typename ScalarType, typename VectorType>
-inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a)
-{
- switch(op)
- {
- case ElementWiseUnary::RSQRT:
- return wrapper::vinvsqrt(a);
- case ElementWiseUnary::EXP:
- return wrapper::vexpq(a);
- case ElementWiseUnary::NEG:
- return wrapper::vneg(a);
- case ElementWiseUnary::LOG:
- return wrapper::vlog(a);
- case ElementWiseUnary::ABS:
- return wrapper::vabs(a);
- case ElementWiseUnary::ROUND:
- return wrapper::vround(a);
- case ElementWiseUnary::SIN:
- return wrapper::vsin(a);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-template <typename ScalarType>
-void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
-{
- const int window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(in, win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
-
- int x = window_start_x;
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
- }
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
- }
- },
- input, output);
-}
-
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H
\ No newline at end of file
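The unary list deleted above is the simplest instance of the iteration pattern shared by all of these kernels: a vector main loop stepping window_step_x elements at a time, followed by a scalar tail for the remainder. A plain-C++ model of that structure (no NEON, illustrative only):

#include <cstddef>

// Process n floats with a fixed-width "vector" body plus a scalar tail,
// mirroring the window_start_x / window_end_x / window_step_x loops above.
void negate(const float *in, float *out, int n)
{
    const int step = 4; // stands in for 16 / sizeof(float)
    int x = 0;
    for(; x <= n - step; x += step) // main loop: whole vectors only
    {
        for(int i = 0; i < step; ++i)
        {
            out[x + i] = -in[x + i];
        }
    }
    for(; x < n; ++x) // scalar tail: leftover elements
    {
        out[x] = -in[x];
    }
}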
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise.cpp
deleted file mode 100644
index 58ebb28fe5..0000000000
--- a/src/core/cpu/kernels/elementwise/sve/elementwise.cpp
+++ /dev/null
@@ -1,311 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-using namespace arm_compute::wrapper;
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-struct LoopArguments
-{
- OperatorType op;
- const InputScalarType *input1_ptr;
- const InputScalarType *input2_ptr;
- OutputScalarType *output_ptr;
-};
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-struct BroadcastLoopArguments
-{
- OperatorType op;
- const InputScalarType *input1_ptr;
- InputScalarType broadcast_value;
- OutputScalarType *output_ptr;
- bool reorder;
-};
-
-template <typename InputScalarType, typename OutputScalarType>
-void arithmetic_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
-{
- const auto in1 = svld1(pg, args.input1_ptr);
- const auto in2 = svld1(pg, args.input2_ptr);
- const auto res = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op);
- svst1(pg, args.output_ptr, res);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
-{
- const auto non_broadcast_vector = svld1(pg, args.input1_ptr);
- const auto broadcast_vector = svdup_n(args.broadcast_value);
- const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector;
- const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector;
- const auto res = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op);
- svst1(pg, args.output_ptr, res);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-void comparison_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
-{
- const auto in1 = svld1(pg, args.input1_ptr);
- const auto in2 = svld1(pg, args.input2_ptr);
- const auto res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op);
- const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
- svst1(output_pg, args.output_ptr, res);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
-{
- const auto non_broadcast_vector = svld1(pg, args.input1_ptr);
- const auto broadcast_vector = svdup_n(args.broadcast_value);
- const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector;
- const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector;
- const auto res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op);
- const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
- svst1(output_pg, args.output_ptr, res);
-}
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-using LoopFuncType = void (*)(svbool_t, const LoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
-
-template <typename InputVectorType, typename OutputVectorType, typename OperatorType,
- typename InputScalarType = typename sve_scalar<InputVectorType>::type,
- typename OutputScalarType = typename sve_scalar<OutputVectorType>::type>
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- OperatorType op,
- LoopFuncType<InputScalarType, OutputScalarType, OperatorType> func,
- BroadcastLoopFuncType<InputScalarType, OutputScalarType, OperatorType> broadcast_func)
-{
- const auto all_true_pg = svptrue<InputScalarType>();
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
- int x = window_start_x;
-
- svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- broadcast_func(pg,
- {
- op,
- non_broadcast_input_ptr + x,
- broadcast_value,
- output_ptr + x,
- !is_broadcast_input_2
- });
- x += svcnt<InputScalarType>();
- pg = svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
- int x = window_start_x;
-
- svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- func(pg,
- {
- op,
- input1_ptr + x,
- input2_ptr + x,
- output_ptr + x
- });
- x += svcnt<InputScalarType>();
- pg = svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
- }
-}
-
-template <ArithmeticOperation op, typename ScalarType>
-void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- using VectorType = typename sve_vector<ScalarType>::type;
-
- elementwise_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op,
- &arithmetic_op_loop<ScalarType, ScalarType>,
- &arithmetic_op_broadcast_loop<ScalarType, ScalarType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t>
-void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
- using InputVectorType = typename sve_vector<InputScalarType>::type;
- using OutputVectorType = typename sve_vector<OutputScalarType>::type;
-
- elementwise_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op,
- &comparison_op_loop<InputScalarType, OutputScalarType>,
- &comparison_op_broadcast_loop<InputScalarType, OutputScalarType>);
-}
-
-template <>
-svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
-{
- return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b)));
-}
-
-template <>
-svint32_t elementwise_div<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
-{
- return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b)));
-}
-
-template <>
-svint16_t elementwise_div<svint16_t>(svbool_t &pg, const svint16_t &a, const svint16_t &b)
-{
- ARM_COMPUTE_UNUSED(pg, a, b);
- ARM_COMPUTE_ERROR("Not supported");
-}
-
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::Equal, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::Greater, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::Less, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file
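The SVE implementation deleted above replaces the NEON main-loop/tail split with a single predicated do-while: svwhilelt builds a predicate covering the elements still to process, the loop advances by the hardware vector length, and svptest_any terminates when no lane is active, so no scalar tail is needed. A minimal standalone sketch of that loop shape under the same __ARM_FEATURE_SVE guard (an assumption for illustration, not part of the library):

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <cstdint>

// Predicated elementwise add over n floats; on the final iteration the
// predicate pg masks off the lanes past the end, so partial vectors are
// handled without a separate tail loop.
void sve_add_f32(const float *a, const float *b, float *out, int64_t n)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32(x, n);
    do
    {
        const svfloat32_t va = svld1_f32(pg, a + x);
        const svfloat32_t vb = svld1_f32(pg, b + x);
        svst1_f32(pg, out + x, svadd_f32_z(pg, va, vb));
        x += static_cast<int64_t>(svcntw()); // 32-bit lanes per vector
        pg = svwhilelt_b32(x, n);
    } while(svptest_any(svptrue_b32(), pg));
}
#endif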
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h
deleted file mode 100644
index a92a8648a8..0000000000
--- a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
-#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/NEON/wrapper/svtraits.h"
-#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-using namespace arm_compute::wrapper;
-
-template <typename VectorType>
-VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b)
-{
- return svpow_z(pg, a, b);
-}
-
-template <typename VectorType>
-VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b)
-{
- return svdiv_z(pg, a, b);
-}
-
-template <uint32_t bytewidth>
-svbool_t narrow_to_byte_predicate(svbool_t pg)
-{
- const auto all_false = svpfalse();
-
- switch(bytewidth)
- {
- case 8:
- pg = svuzp1_b32(pg, all_false);
- /* fall through */
- case 4:
- pg = svuzp1_b16(pg, all_false);
- /* fall through */
- case 2:
- pg = svuzp1_b8(pg, all_false);
- /* fall through */
- default:
- break;
- }
- return pg;
-}
-
-template <typename VectorType>
-VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op)
-{
- using ScalarType = typename wrapper::sve_scalar<VectorType>::type;
- VectorType res{};
-
- switch(op)
- {
- case ArithmeticOperation::MAX:
- res = svmax_z(pg, a, b);
- break;
- case ArithmeticOperation::MIN:
- res = svmin_z(pg, a, b);
- break;
- case ArithmeticOperation::SQUARED_DIFF:
- {
- const auto tmp = svsub_z(pg, a, b);
- res = svmul_z(pg, tmp, tmp);
- break;
- }
- case ArithmeticOperation::PRELU:
- {
- const auto zero = svdup_n(ScalarType(0));
- const auto tmp = svmul_z(pg, a, b);
- const auto gt = svcmpgt(pg, a, zero);
- res = svsel(gt, a, tmp);
- break;
- }
- case ArithmeticOperation::DIV:
- {
- res = elementwise_div(pg, a, b);
- break;
- }
- case ArithmeticOperation::POWER:
- {
- res = elementwise_pow(pg, a, b);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-
- return res;
-}
-
-template <typename InputVectorType, typename OutputVectorType>
-OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op)
-{
- svbool_t selection_vector{};
-
- switch(op)
- {
- case ComparisonOperation::Equal:
- selection_vector = svcmpeq(pg, a, b);
- break;
- case ComparisonOperation::NotEqual:
- selection_vector = svcmpne(pg, a, b);
- break;
- case ComparisonOperation::Greater:
- selection_vector = svcmpgt(pg, a, b);
- break;
- case ComparisonOperation::GreaterEqual:
- selection_vector = svcmpge(pg, a, b);
- break;
- case ComparisonOperation::Less:
- selection_vector = svcmplt(pg, a, b);
- break;
- case ComparisonOperation::LessEqual:
- selection_vector = svcmple(pg, a, b);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-
- using InputScalarType = typename wrapper::sve_scalar<InputVectorType>::type;
- selection_vector = narrow_to_byte_predicate<sizeof(InputScalarType)>(selection_vector);
-
- using OutputScalarType = typename wrapper::sve_scalar<OutputVectorType>::type;
- const auto false_vector = svdup_n(static_cast<OutputScalarType>((uint32_t)0));
- const auto true_vector = svdup_n(static_cast<OutputScalarType>(~(uint32_t)0));
- auto ret = svsel(selection_vector, true_vector, false_vector);
-
- return ret;
-}
-
-template <ArithmeticOperation op, typename ScalarType>
-void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template <ComparisonOperation op, typename ScalarType, typename OutputScalarType = uint8_t>
-void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-} // namespace cpu
-} // namespace arm_compute
-#endif // defined(ENABLE_SVE)
-#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */
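The comparison kernels in the header above return byte masks rather than booleans: the predicate produced by svcmp* is narrowed to byte granularity by narrow_to_byte_predicate() and then selected into all-ones/all-zeros lanes. A minimal scalar sketch of that output contract, with an illustrative helper name that is not part of the library:

#include <cstddef>
#include <cstdint>

// Each output byte is 0xFF where the comparison holds and 0x00 elsewhere,
// matching the svsel(true_vector, false_vector) step in the SVE version.
template <typename T>
void reference_greater(const T *a, const T *b, uint8_t *out, size_t len)
{
    for(size_t i = 0; i < len; ++i)
    {
        out[i] = (a[i] > b[i]) ? uint8_t{0xFF} : uint8_t{0x00};
    }
}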
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h
deleted file mode 100644
index 6c5524e284..0000000000
--- a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
-#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
-
-#if defined(__ARM_FEATURE_SVE2)
-
-#include "src/core/NEON/wrapper/svtraits.h"
-#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-using namespace arm_compute::wrapper;
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-struct QuantizedLoopArguments
-{
- OperatorType op;
- const InputScalarType *input1_ptr;
- const InputScalarType *input2_ptr;
- OutputScalarType *output_ptr;
-
- const svint32_t &in1_offset;
- const svint32_t &in2_offset;
- const svint32_t &out_offset;
- const svfloat32_t &in1_scale;
- const svfloat32_t &in2_scale;
- const svfloat32_t &out_scale;
-};
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-struct BroadcastQuantizedLoopArguments
-{
- OperatorType op;
- const InputScalarType *input1_ptr;
- float broadcast_value;
- OutputScalarType *output_ptr;
- bool reorder;
-
- const svint32_t &in1_offset;
- const svint32_t &out_offset;
- const svfloat32_t &in1_scale;
- const svfloat32_t &out_scale;
-};
-
-svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
-{
- auto x = svld1(pg, ptr);
-
- const auto widened = svcreate4(
- svmovlb(svmovlb(x)),
- svmovlt(svmovlb(x)),
- svmovlb(svmovlt(x)),
- svmovlt(svmovlt(x)));
-
- pg = svptrue_b8();
-
- return svcreate4(
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale));
-}
-
-svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
-{
- auto x = svld1(pg, ptr);
-
- const auto widened = svcreate4(
- svmovlb(svmovlb(x)),
- svmovlt(svmovlb(x)),
- svmovlb(svmovlt(x)),
- svmovlt(svmovlt(x)));
-
- pg = svptrue_b8();
-
- return svcreate4(
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale));
-}
-
-void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
-{
- const auto quantized = svcreate4(
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
-
- const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1));
- const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3));
- const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top);
- svst1(pg, ptr, narrowed);
-}
-
-void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
-{
- const auto quantized = svcreate4(
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
-
- const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1));
- const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3));
- const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top);
-
- svst1(pg, ptr, narrowed);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-inline void arithmetic_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
-{
- const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
- const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale);
-
- const auto result = svcreate4(
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), args.op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), args.op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), args.op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), args.op));
-
- store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-inline void arithmetic_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
-{
- const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
- const auto in2 = svcreate4(
- svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value));
-
- const auto &af = args.reorder ? in2 : in1;
- const auto &bf = args.reorder ? in1 : in2;
-
- const auto result = svcreate4(
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 0), svget4(bf, 0), args.op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 1), svget4(bf, 1), args.op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 2), svget4(bf, 2), args.op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 3), svget4(bf, 3), args.op));
-
- store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-inline void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
-{
- const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
- const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale);
-
- using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
-
- const auto result = svcreate4(
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), args.op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), args.op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), args.op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), args.op));
-
- const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
- const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
- const auto zipped = svzip1(zipped_bottom, zipped_top);
- svst1(pg, args.output_ptr, zipped);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-inline void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
-{
- const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
- const auto in2 = svcreate4(
- svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value));
-
- const auto &af = args.reorder ? in2 : in1;
- const auto &bf = args.reorder ? in1 : in2;
-
- using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
-
- const auto result = svcreate4(
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 0), svget4(bf, 0), args.op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 1), svget4(bf, 1), args.op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 2), svget4(bf, 2), args.op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 3), svget4(bf, 3), args.op));
-
- const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
- const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
- const auto zipped = svzip1(zipped_bottom, zipped_top);
- svst1(pg, args.output_ptr, zipped);
-}
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-using LoopQuantizedFuncType = void (*)(svbool_t, const QuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-using BroadcastQuantizedLoopFuncType = void (*)(svbool_t, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
-
-template <typename InputVectorType, typename OutputVectorType, typename OperatorType,
- typename InputScalarType = typename wrapper::sve_scalar<InputVectorType>::type,
- typename OutputScalarType = typename wrapper::sve_scalar<OutputVectorType>::type>
-void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- OperatorType op,
- LoopQuantizedFuncType<InputScalarType, OutputScalarType, OperatorType> func,
- BroadcastQuantizedLoopFuncType<InputScalarType, OutputScalarType, OperatorType> broadcast_func)
-{
- const auto all_true_pg = wrapper::svptrue<InputScalarType>();
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle it manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset);
- const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
- const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
-
- const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset);
- const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale);
-
-        // Clear X Dimension on execution window as we handle it manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
- int x = window_start_x;
-
- svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- const auto args = BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType>
- {
- op,
- non_broadcast_input_ptr + x,
- Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo),
- output_ptr + x,
- !is_broadcast_input_2,
- non_broadcast_voffset, output_voffset,
- non_broadcast_vscale, output_vscale
- };
- broadcast_func(pg, args);
- x += wrapper::svcnt<InputScalarType>();
- pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
-        // Clear X Dimension on execution window as we handle it manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset);
- const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale);
-
- const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset);
- const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
- int x = window_start_x;
-
- svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- const auto args = QuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType>
- {
- op,
- input1_ptr + x,
- input2_ptr + x,
- output_ptr + x,
- in1_voffset, in2_voffset, output_voffset,
- in1_vscale, in2_vscale, output_vscale
- };
- func(pg, args);
- x += wrapper::svcnt<InputScalarType>();
- pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
- }
-}
-
-template <ArithmeticOperation op, typename ScalarType>
-void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- using VectorType = typename wrapper::traits::sve_vector<ScalarType>::type;
- elementwise_quantized_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op,
- &arithmetic_op_quantized_loop<ScalarType, ScalarType>,
- &arithmetic_op_broadcast_quantized_loop<ScalarType, ScalarType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t>
-void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
- using InputVectorType = typename wrapper::traits::sve_vector<InputScalarType>::type;
- using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
- elementwise_quantized_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op,
- &comparison_op_quantized_loop<InputScalarType, OutputScalarType>,
- &comparison_op_broadcast_quantized_loop<InputScalarType, OutputScalarType>);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* defined(__ARM_FEATURE_SVE2) */
-#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
\ No newline at end of file
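The load_quantized/store_quantized helpers deleted above vectorise the usual QASYMM8 affine mapping: widen to int32, subtract the zero-point offset, scale to float, then on the way back round, rescale, re-offset and saturate-narrow. A scalar sketch of the same mapping, assuming uniform per-tensor quantization (helper names are illustrative, not library API):

#include <algorithm>
#include <cmath>
#include <cstdint>

// dequantize: r = scale * (q - offset)
// quantize:   q = clamp(round(r / scale) + offset, 0, 255)
inline float dequantize_qasymm8(uint8_t q, float scale, int32_t offset)
{
    return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
}

inline uint8_t quantize_qasymm8(float r, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(r / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

Note that svrinta in store_quantized rounds ties away from zero, which std::lround mirrors here.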
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp
deleted file mode 100644
index ddf1febd66..0000000000
--- a/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
-{
- switch(op)
- {
- case ElementWiseUnary::RSQRT:
- return svinvsqrt(pg, a);
- case ElementWiseUnary::EXP:
- return wrapper::svexp_z(pg, a);
- case ElementWiseUnary::NEG:
- return svneg_z(pg, a);
- case ElementWiseUnary::LOG:
- return wrapper::svlog_z(pg, a);
- case ElementWiseUnary::ABS:
- return svabs_z(pg, a);
- case ElementWiseUnary::ROUND:
- return svrintn_z(pg, a);
- case ElementWiseUnary::SIN:
- return wrapper::svsin_z(pg, a);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED");
- }
-}
-
-template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
-{
- switch(op)
- {
- case ElementWiseUnary::NEG:
- return svneg_z(pg, a);
- case ElementWiseUnary::ABS:
- return svabs_z(pg, a);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED");
- }
-}
-
-template <typename ScalarType>
-void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
-{
- const auto all_true_pg = wrapper::svptrue<ScalarType>();
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(in, win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
- int x = window_start_x;
-
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto vin = svld1(pg, input_ptr + x);
- svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin));
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input, output);
-}
-
-template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
-template void elementwise_sve_op<float32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
-template void elementwise_sve_op<int32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file
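elementwise_sve_op above shows the canonical SVE loop shape: svwhilelt builds a (possibly partial) predicate every iteration, so the tail needs no scalar epilogue. A standalone sketch of the same pattern, assuming SVE compiler and hardware support and len > 0 as guaranteed by the windowed kernel:

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <cstdint>

// Predicated SVE loop: inactive lanes in the final iteration are masked off.
void abs_f32_sve(const float *src, float *dst, int64_t len)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32(x, len);
    do
    {
        const svfloat32_t vin = svld1_f32(pg, src + x);
        svst1_f32(pg, dst + x, svabs_f32_z(pg, vin));
        x += static_cast<int64_t>(svcntw()); // 32-bit lanes per vector
        pg = svwhilelt_b32(x, len);
    }
    while(svptest_any(svptrue_b32(), pg));
}
#endif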
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h
deleted file mode 100644
index 63490421e9..0000000000
--- a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
-#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
-
-#include "arm_compute/core/Types.h"
-#if defined(ENABLE_SVE)
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType>
-void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
-} // namespace cpu
-} // namespace arm_compute
-#endif // defined(ENABLE_SVE)
-#endif // SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
\ No newline at end of file
diff --git a/src/core/cpu/kernels/floor/list.h b/src/core/cpu/kernels/floor/list.h
deleted file mode 100644
index 4367e0ffc9..0000000000
--- a/src/core/cpu/kernels/floor/list.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_FLOOR_LIST_H
-#define SRC_CORE_NEON_KERNELS_FLOOR_LIST_H
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_FLOOR_KERNEL(func_name) \
- void func_name(const void *src, void *dst, int len)
-
-DECLARE_FLOOR_KERNEL(fp16_neon_floor);
-DECLARE_FLOOR_KERNEL(fp32_neon_floor);
-
-#undef DECLARE_FLOOR_KERNEL
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_FLOOR_LIST_H */
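The macro is purely a declaration shorthand; for reference, DECLARE_FLOOR_KERNEL(fp32_neon_floor) expands to:

void fp32_neon_floor(const void *src, void *dst, int len);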
diff --git a/src/core/cpu/kernels/floor/neon/fp16.cpp b/src/core/cpu/kernels/floor/neon/fp16.cpp
deleted file mode 100644
index f362676a36..0000000000
--- a/src/core/cpu/kernels/floor/neon/fp16.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-
-#include "src/common/utils/Validate.h"
-#include "src/core/NEON/NEMath.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace cpu
-{
-constexpr int step = 8;
-
-void fp16_neon_floor(const void *src, void *dst, int len)
-{
- ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
- ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- ARM_COMPUTE_ASSERT(len >= 0);
-
- auto psrc = static_cast<const __fp16 *>(src);
- auto pdst = static_cast<__fp16 *>(dst);
-
- for(; len >= step; len -= step)
- {
- vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc)));
- psrc += step;
- pdst += step;
- }
-
- for(; len > 0; --len)
- {
- *pdst = std::floor(*psrc);
- ++psrc;
- ++pdst;
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/core/cpu/kernels/floor/neon/fp32.cpp b/src/core/cpu/kernels/floor/neon/fp32.cpp
deleted file mode 100644
index f5efb2e849..0000000000
--- a/src/core/cpu/kernels/floor/neon/fp32.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/common/utils/Validate.h"
-#include "src/core/NEON/NEMath.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace cpu
-{
-constexpr int step = 4;
-
-void fp32_neon_floor(const void *src, void *dst, int len)
-{
- ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
- ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- ARM_COMPUTE_ASSERT(len >= 0);
-
- auto psrc = static_cast<const float *>(src);
- auto pdst = static_cast<float *>(dst);
-
- for(; len >= step; len -= step)
- {
- vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc)));
- psrc += step;
- pdst += step;
- }
-
- for(; len > 0; --len)
- {
- *pdst = std::floor(*psrc);
- ++pdst;
- ++psrc;
- }
-}
-} // namespace cpu
-} // namespace arm_compute
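Both floor kernels share the same structure: a vectorised main loop in chunks of step elements followed by a scalar tail. A hypothetical direct call, assuming code inside the library tree (normally the function is reached through the library's kernel dispatch rather than invoked by hand):

#include <vector>

void floor_buffer_example()
{
    std::vector<float> src{ 1.7f, -0.2f, 3.0f, -2.5f };
    std::vector<float> dst(src.size());
    arm_compute::cpu::fp32_neon_floor(src.data(), dst.data(), static_cast<int>(src.size()));
    // dst now holds { 1.0f, -1.0f, 3.0f, -3.0f }
}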
diff --git a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
deleted file mode 100644
index c78ffb9848..0000000000
--- a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-using namespace arm_compute::misc::shape_calculator;
-
-void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
-    // Initialize dst if it is not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info)));
-
- const bool requantize = src->quantization_info() != dst->quantization_info();
-
- switch(src->data_type())
- {
- case DataType::QASYMM8:
- if(requantize)
- {
- create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info);
- }
- else
- {
- create_arm_pooling<uint8_t, uint8_t>(src, dst, info, cpu_info);
- }
- break;
- case DataType::QASYMM8_SIGNED:
- if(requantize)
- {
- create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info);
- }
- else
- {
- create_arm_pooling<int8_t, int8_t>(src, dst, info, cpu_info);
- }
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- create_arm_pooling<float16_t, float16_t>(src, dst, info, cpu_info);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- create_arm_pooling<float, float>(src, dst, info, cpu_info);
- break;
- default:
- break;
- }
-
- Window win = calculate_max_window(*dst, Steps());
- INEKernel::configure(win);
-}
-
-Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-
-#ifndef __aarch64__
- ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
-#endif /* __aarch64__ */
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
- "Only AVG and MAX pooling are supported by assembly kernels");
-
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
-
- const auto src_qinfo = src->quantization_info().uniform();
- const auto dst_qinfo = dst->quantization_info().uniform();
-
- if(src_qinfo != dst_qinfo)
- {
- const float multiplier = src_qinfo.scale / dst_qinfo.scale;
- int32_t dst_multiplier{};
- int32_t dst_shift{};
- ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
- }
- else
- {
- if(src->data_type() == DataType::QASYMM8)
- {
- const bool has_padding = info.pad_stride_info.has_padding();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
- }
- }
- }
- else
- {
- if(src->data_type() == DataType::QASYMM8)
- {
-            // If dst is not configured, the quantization info is the same
- const bool has_padding = info.pad_stride_info.has_padding();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
- }
- }
- return Status{};
-}
-
-void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_UNUSED(window);
- ARM_COMPUTE_UNUSED(info);
-
- ARM_COMPUTE_ERROR_ON(tensors.empty());
-
- const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
- ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
- ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
-
- const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
- auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
- auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
-
- const auto src_shape = src->info()->tensor_shape();
- const auto dst_shape = dst->info()->tensor_shape();
- const auto src_padding = src->info()->padding();
- const auto dst_padding = dst->info()->padding();
-
- const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right;
- const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
- const size_t ld_src_batch = ld_src_row * src_shape[2];
- const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right;
- const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
- const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
-
- _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch,
- out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
- working_space, info.thread_id, info.num_threads);
-}
-
-size_t CpuPool2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
-{
- return _kernel_asm->get_working_size(num_threads);
-}
-
-bool CpuPool2dAssemblyWrapperKernel::is_configured() const
-{
- return _kernel_asm != nullptr;
-}
-
-template <typename Typesrc, typename Typedst>
-void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
-{
- const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
-
- arm_conv::pooling::PoolingWindow window{};
- window.cols = static_cast<unsigned int>(info.pool_size.x());
- window.rows = static_cast<unsigned int>(info.pool_size.y());
-
- arm_conv::pooling::PoolingStride stride{};
- std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
-
- const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
-
- constexpr unsigned int idx_width = 1;
- constexpr unsigned int idx_height = 2;
- constexpr unsigned int idx_channels = 0;
- constexpr unsigned int idx_batches = 3;
-
- const unsigned int n_batches = src->dimension(idx_batches);
- const unsigned int src_rows = src->dimension(idx_height);
- const unsigned int src_cols = src->dimension(idx_width);
- const unsigned int n_channels = src->dimension(idx_channels);
- const unsigned int dst_rows = dst->dimension(idx_height);
- const unsigned int dst_cols = dst->dimension(idx_width);
-
- arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
-
- // Configure assembly pooling kernel
- auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args);
- if(pooling_kernel_asm == nullptr)
- {
-        // Configuration not supported: leave the kernel unconfigured
- return;
- }
-
- _kernel_asm = std::move(pooling_kernel_asm);
-}
-
-template <typename Typesrc, typename Typedst>
-void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
-{
- const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
-
- arm_conv::pooling::PoolingWindow window{};
- window.cols = static_cast<unsigned int>(info.pool_size.x());
- window.rows = static_cast<unsigned int>(info.pool_size.y());
-
- arm_conv::pooling::PoolingStride stride{};
- std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
-
- const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
-
- constexpr unsigned int idx_width = 1;
- constexpr unsigned int idx_height = 2;
- constexpr unsigned int idx_channels = 0;
- constexpr unsigned int idx_batches = 3;
-
- const unsigned int n_batches = src->dimension(idx_batches);
- const unsigned int src_rows = src->dimension(idx_height);
- const unsigned int src_cols = src->dimension(idx_width);
- const unsigned int n_channels = src->dimension(idx_channels);
- const unsigned int dst_rows = dst->dimension(idx_height);
- const unsigned int dst_cols = dst->dimension(idx_width);
-
- arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
-
- const auto src_qinfo = src->quantization_info().uniform();
- const auto dst_qinfo = dst->quantization_info().uniform();
-
- const float multiplier = src_qinfo.scale / dst_qinfo.scale;
- int32_t dst_multiplier{};
- int32_t dst_shift{};
- quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift);
-
- const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset,
- dst_qinfo.offset,
- dst_shift, // left shift
- 0, // right shift
- dst_multiplier);
-
- // Configure assembly pooling kernel with requantization
- auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
- if(pooling_kernel_asm == nullptr)
- {
-        // Configuration not supported: leave the kernel unconfigured
- return;
- }
-
- _kernel_asm = std::move(pooling_kernel_asm);
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
deleted file mode 100644
index 3afa4c16a4..0000000000
--- a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
-#define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-#include "pool_common.hpp"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** This class is a wrapper for the assembly kernels.
- *
- * Some kernels were written in assembly and highly optimised for specific
- * CPUs like A53 or A55. The Arm Compute Library creates an instance of
- * CpuPool2dAssemblyWrapperKernel and other auxiliary data structures to
- * execute a single assembly kernel in the context of an NEFunction.
- *
- */
-class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel
-{
-public:
- /** Constructor
- */
- CpuPool2dAssemblyWrapperKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dAssemblyWrapperKernel);
-
- const char *name() const override
- {
- return "CpuPool2dAssemblyWrapperKernel";
- }
-
- /** Initialise the kernel's src and dst.
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info to store the result of pooling. Data types supported: same as @p src.
- * @param[in] info Pooling meta-data.
- * @param[in] cpu_info CPU information needed to select the most appropriate kernel.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
-
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuPool2dAssemblyWrapperKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-
- /** Get size of the workspace needed by the assembly kernel.
- *
- * @param[in] num_threads Maximum number of threads that are going to be spawned.
- *
- * @return size of workspace
- */
- size_t get_working_size(unsigned int num_threads) const;
-
- /** Was the asm kernel successfully configured?
- *
- * @return True if the asm kernel is configured and ready to run
- */
- bool is_configured() const;
-
-private:
- /** Helper function to create the assembly kernel.
- *
- * @param[in] src Source tensor info.
- * @param[in] dst Destination tensor info.
- * @param[in] info Pooling layer meta-data.
- */
- template <typename Typesrc, typename Typedst>
- void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
-
- /** Helper function to create the assembly kernel with requantization support
- *
- * @param[in] src Source tensor info.
- * @param[in] dst Destination tensor info.
- * @param[in] info Pooling layer meta-data.
- */
- template <typename Typesrc, typename Typedst>
- void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
-
- std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{ nullptr };
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H */
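Taken together, the interface above implies a lifecycle of validate, configure, is_configured(), workspace sizing, then scheduler-driven run_op(). A minimal sketch under the assumption that tensor infos, pooling metadata and the tensor pack are prepared elsewhere (the function and parameter names here are illustrative):

bool try_assembly_pooling(const arm_compute::ITensorInfo      &src_info,
                          arm_compute::ITensorInfo            &dst_info,
                          const arm_compute::PoolingLayerInfo &pool_info,
                          const arm_compute::CPUInfo          &cpu_info,
                          unsigned int                         num_threads)
{
    using arm_compute::cpu::kernels::CpuPool2dAssemblyWrapperKernel;

    if(CpuPool2dAssemblyWrapperKernel::validate(&src_info, &dst_info, pool_info).error_code() != arm_compute::ErrorCode::OK)
    {
        return false; // configuration rejected up front
    }
    CpuPool2dAssemblyWrapperKernel kernel;
    kernel.configure(&src_info, &dst_info, pool_info, cpu_info);
    if(!kernel.is_configured())
    {
        return false; // no matching assembly implementation for this CPU/config
    }
    const size_t workspace_bytes = kernel.get_working_size(num_threads);
    (void)workspace_bytes; // the ACL_INT_0 workspace tensor must be at least this large
    return true;           // pack ACL_SRC/ACL_DST/ACL_INT_0 and let the scheduler call run_op()
}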
diff --git a/src/core/cpu/kernels/pooling/neon/fp16.cpp b/src/core/cpu/kernels/pooling/neon/fp16.cpp
deleted file mode 100644
index 0aae7b8a57..0000000000
--- a/src/core/cpu/kernels/pooling/neon/fp16.cpp
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pooling/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 8;
-
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, window_src);
- Iterator out(dst0, window_out);
- Iterator indices(dst1, window_out);
-
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
-
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
-
- const int pad_right = src->info()->padding().right;
- const int pad_left = src->info()->padding().left;
- const int pad_horizontal = pad_right + pad_left;
- const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
- const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z());
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
-        const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
-        const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
-        const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- const auto in_x0_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off;
- const auto in_x1_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off;
- const auto in_x2_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off;
- const auto in_x3_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off;
- const auto v_x0 = vld1q_f16(in_x0_ptr);
- const auto v_x1 = vld1q_f16(in_x1_ptr);
- const auto v_x2 = vld1q_f16(in_x2_ptr);
- const auto v_x3 = vld1q_f16(in_x3_ptr);
- float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
- // Store result
- vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
-
- const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
- const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
- const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
- const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1));
- const uint32x4_t voffset_x1_0 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
- const uint32x4_t voffset_x1_1 = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 };
- const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1));
- const uint32x4_t voffset_x2_0 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
- const uint32x4_t voffset_x2_1 = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
- const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1));
- const uint32x4_t voffset_x3_0 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
- const uint32x4_t voffset_x3_1 = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };
- const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1));
- const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1);
- const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3);
- const uint16x8_t tmp_indices2 = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1);
-            const uint32x4_t tmp_indices3_0 = vmovl_u16(vget_low_u16(tmp_indices2));
-            const uint32x4_t tmp_indices3_1 = vmovl_u16(vget_high_u16(tmp_indices2));
-            // Store indices
-            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices3_0);
-            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indices3_1);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- const auto x0 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off);
- const auto x1 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off);
- const auto x2 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off);
- const auto x3 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off);
- float16_t res = std::max(std::max(x2, x3), std::max(x0, x1));
-
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
-
- const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
- const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
- const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
- const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
-
- // Store indices
- *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
- }
- },
- in, out, indices);
-}
-} // namespace
-
-void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1)
- {
-        pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
-        return; // Fast path covers the whole window; without this the generic path below would also run (the fp32 variant guards it with an else)
-    }
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 8;
-
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, window_src);
- Iterator out(dst0, window_out);
-
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- float16x8_t vres;
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float16x8_t scale_v = vdupq_n_f16(scale);
-
- // Perform pooling
- vres = vdupq_n_f16(0.0f);
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- vres = vaddq_f16(vres, vmulq_f16(data, data));
- }
- else
- {
- vres = vaddq_f16(vres, data);
- }
- }
- }
- // Divide by scale
- vres = vmulq_f16(vres, scale_v);
- }
- else
- {
- vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
-
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = vmaxq_f16(vres, data);
- }
- }
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
- vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
- }
-
- // Store result
- vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- float16_t res = 0.0f;
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float16_t scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- res += data * data;
- }
- else
- {
- res += data;
- }
- }
- }
-
- // Divide by scale
- res *= scale;
- }
- else
- {
- res = std::numeric_limits<float>::lowest();
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res = std::max(res, data);
- }
- }
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
- }
-
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
- }
- },
- in, out);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file
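The deleted fp16 kernel above implements the L2-pooling square root with a reciprocal-square-root estimate refined by one Newton-Raphson step instead of a per-lane scalar sqrt. A minimal standalone sketch of the same idiom, written with the fp32 intrinsics so it builds without the FP16 vector extension (the function name is illustrative, not library API):

#include <arm_neon.h>

// vrsqrteq_f32 gives a coarse estimate of 1/sqrt(x); vrsqrtsq_f32(a, b)
// returns (3 - a*b) / 2, so multiplying the estimate by it performs one
// Newton-Raphson refinement. A final multiply by x turns 1/sqrt(x) into
// sqrt(x), matching the vres update in the kernel above.
static inline float32x4_t sqrt_via_rsqrt(float32x4_t x)
{
    float32x4_t r = vrsqrteq_f32(x);                    // initial 1/sqrt estimate
    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(x, r), r)); // one refinement step
    return vmulq_f32(x, r);                             // x * 1/sqrt(x) == sqrt(x)
}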
diff --git a/src/core/cpu/kernels/pooling/neon/fp32.cpp b/src/core/cpu/kernels/pooling/neon/fp32.cpp
deleted file mode 100644
index 4e41fdec7f..0000000000
--- a/src/core/cpu/kernels/pooling/neon/fp32.cpp
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pooling/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 4;
-
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, window_src);
- Iterator out(dst0, window_out);
- Iterator indices(dst1, window_out);
-
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
-
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
-
- float32x4_t vres;
- float res;
-
- const int pad_right = src->info()->padding().right;
- const int pad_left = src->info()->padding().left;
- const int pad_horizontal = pad_right + pad_left;
- const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
- const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z());
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
-
- const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
- const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- const auto in_x0_ptr = reinterpret_cast<const float *>(in.ptr() + in_x0_offset);
- const auto in_x1_ptr = reinterpret_cast<const float *>(in.ptr() + in_x1_offset);
- const auto in_x2_ptr = reinterpret_cast<const float *>(in.ptr() + in_x2_offset);
- const auto in_x3_ptr = reinterpret_cast<const float *>(in.ptr() + in_x3_offset);
- const auto v_x0 = vld1q_f32(in_x0_ptr + x_off);
- const auto v_x1 = vld1q_f32(in_x1_ptr + x_off);
- const auto v_x2 = vld1q_f32(in_x2_ptr + x_off);
- const auto v_x3 = vld1q_f32(in_x3_ptr + x_off);
- vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
- // Store result
- vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
-
- const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off;
- const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
- const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
- const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
- const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
- const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
- const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
- const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
-
- // Store indices
- vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- const auto x0 = *(reinterpret_cast<const float *>(in.ptr() + in_x0_offset) + x_off);
- const auto x1 = *(reinterpret_cast<const float *>(in.ptr() + in_x1_offset) + x_off);
- const auto x2 = *(reinterpret_cast<const float *>(in.ptr() + in_x2_offset) + x_off);
- const auto x3 = *(reinterpret_cast<const float *>(in.ptr() + in_x3_offset) + x_off);
- res = std::max(std::max(x2, x3), std::max(x0, x1));
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
-
- const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off;
- const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
- const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
- const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
-
- // Store indices
- *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
- }
- },
- in, out, indices);
-}
-} // namespace
-
-void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1)
- {
- pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
- }
- else
- {
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 4;
-
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, window_src);
- Iterator out(dst0, window_out);
-
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- float32x4_t vres;
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x4_t scale_v = vdupq_n_f32(scale);
-
- // Perform pooling
- vres = vdupq_n_f32(0.0f);
-
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- vres = vmlaq_f32(vres, data, data);
- }
- else
- {
- vres = vaddq_f32(vres, data);
- }
- }
- }
- // Divide by scale
- vres = vmulq_f32(vres, scale_v);
- }
- else
- {
- vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = vmaxq_f32(vres, data);
- }
- }
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
- static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
- static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
- static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
- };
- vres = l2_res;
- }
-
- // Store result
- vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- float res = 0.0f;
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- res += data * data;
- }
- else
- {
- res += data;
- }
- }
- }
-
- // Divide by scale
- res *= scale;
- }
- else
- {
- res = std::numeric_limits<float>::lowest();
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res = std::max(res, data);
- }
- }
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
- }
- },
- in, out);
- }
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
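Both maxpool-with-indices kernels (fp16 above, fp32 in this file) reduce the 2x2 window with the same lane-wise select idiom: a comparison produces an all-ones/all-zeros mask per lane, and a bitwise select picks the linear offset of the winning element, so each maximum stays paired with the offset it came from. A hedged standalone sketch (names are illustrative, not library API):

#include <arm_neon.h>

// Per-lane "argmax of two": vcgeq_f32 yields ~0 in lanes where a >= b and 0
// elsewhere; vbslq_u32 then selects off_a in the former lanes and off_b in
// the latter. Chaining this twice reduces a 2x2 pooling window exactly as
// tmp_indices0/1/2 do in the deleted kernels.
static inline uint32x4_t select_winner_offsets(float32x4_t a, float32x4_t b,
                                               uint32x4_t off_a, uint32x4_t off_b)
{
    return vbslq_u32(vcgeq_f32(a, b), off_a, off_b);
}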
diff --git a/src/core/cpu/kernels/pooling/neon/list.h b/src/core/cpu/kernels/pooling/neon/list.h
deleted file mode 100644
index bec1536f61..0000000000
--- a/src/core/cpu/kernels/pooling/neon/list.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_POOLING_LIST_H
-#define SRC_CORE_NEON_KERNELS_POOLING_LIST_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/cpu/kernels/pooling/neon/quantized.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_POOLING_KERNEL(func_name) \
- void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
-
-DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_neon_nhwc);
-DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_signed_neon_nhwc);
-DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nhwc);
-DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nhwc);
-
-#if defined(ENABLE_NCHW_KERNELS)
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-DECLARE_POOLING_KERNEL(pooling2_fp16_neon_nchw);
-DECLARE_POOLING_KERNEL(pooling3_fp16_neon_nchw);
-DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nchw);
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
-
-DECLARE_POOLING_KERNEL(pooling2_fp32_neon_nchw);
-DECLARE_POOLING_KERNEL(pooling3_fp32_neon_nchw);
-DECLARE_POOLING_KERNEL(pooling7_fp32_neon_nchw);
-DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nchw);
-#endif /* defined(ENABLE_NCHW_KERNELS) */
-
-#undef DECLARE_POOLING_KERNEL
-
-template <typename T>
-inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout)
-{
- const int pad_left = info.padding().left;
- const int pad_right = info.padding().right;
- const int pad_top = info.padding().top;
- const int pad_bottom = info.padding().bottom;
- const int in_stride_y = static_cast<int>(info.strides_in_bytes().y());
- const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]);
- const int pad_horiz = pad_left + pad_right;
- const int pad_vert = pad_top + pad_bottom;
-
- if(data_layout == DataLayout::NCHW)
- {
- const uint32_t offset_base = padded_offset
- - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
- - pad_top * sizeof(T) /* top padding */
- - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
- - in_stride_w * id[3];
-
- return offset_base;
- }
- else
- {
- const uint32_t offset_base = padded_offset
- - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
- - pad_top * sizeof(T) // top padding
- - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
- - in_stride_w * id[3];
-
- return offset_base;
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H \ No newline at end of file
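The DECLARE_POOLING_KERNEL macro in the deleted header exists so that every implementation shares one signature and can sit behind a single function-pointer type in the dispatching kernel. A sketch of what one declaration expands to, plus a hypothetical pointer alias such a dispatch table could use (the alias name is an assumption, not library API):

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"

using namespace arm_compute;

// Expansion of DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nhwc):
void poolingMxN_fp32_neon_nhwc(const ITensor *src0, ITensor *dst0, ITensor *dst1,
                               PoolingLayerInfo &, const Window &window_src, const Window &window);

// Hypothetical alias for a dispatch table entry:
using PoolingKernelPtr = void (*)(const ITensor *, ITensor *, ITensor *,
                                  PoolingLayerInfo &, const Window &, const Window &);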
diff --git a/src/core/cpu/kernels/pooling/neon/nchw/all.cpp b/src/core/cpu/kernels/pooling/neon/nchw/all.cpp
deleted file mode 100644
index 80eac684aa..0000000000
--- a/src/core/cpu/kernels/pooling/neon/nchw/all.cpp
+++ /dev/null
@@ -1,700 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pooling/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#ifdef ENABLE_NCHW_KERNELS
-namespace arm_compute
-{
-namespace cpu
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- ARM_COMPUTE_UNUSED(pool_info.pool_type);
- ARM_COMPUTE_UNUSED(pool_info.exclude_padding);
-
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- constexpr const int pool_size = 3;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()));
- float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()));
- float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()));
- float16x4_t res = {};
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- top_data = vmul_f16(top_data, top_data);
- middle_data = vmul_f16(middle_data, middle_data);
- bottom_data = vmul_f16(bottom_data, bottom_data);
- }
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float16x4_t scale_v = vdup_n_f16(scale);
- // Perform pooling
- const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
- res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
- res = vmul_f16(vpadd_f16(res, res), scale_v);
- }
- else
- {
- const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
- res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
- res = vpmax_f16(res, res);
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = vinv_f16(vinvsqrt_f16(res));
- }
-
- *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
- },
- in, out);
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
-f16_to_f32(float16x4_t in)
-{
- float32x2_t out = { static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1)) };
- return out;
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
-f16_to_f32(float32x2_t in)
-{
- return in;
-}
-
-template <typename T>
-void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- Iterator in(src, window_src);
- Iterator out(dst0, window);
- Iterator indices(dst1, window);
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const int pad_left = src->info()->padding().left;
- const int pad_right = src->info()->padding().right;
- const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto top_data = wrapper::vload(reinterpret_cast<const T *>(src_top_ptr + in.offset()));
- auto bottom_data = wrapper::vload(reinterpret_cast<const T *>(src_bottom_ptr + in.offset()));
- float32x2_t top_data_f32 = f16_to_f32<T>(top_data);
- float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
-
-        // Calculate max data, comparing top first and then bottom, to make sure the first max is recorded.
- const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32);
- const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
- const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom);
- *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
-
-        // Calculate the index of the max value, which will be used by max unpooling.
- const uint32_t offset_base = offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
- const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T));
- const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
- const uint32x2_t voffset_top = { offset_top, offset_top + 1u };
- const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u };
- const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
- const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
- *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
- },
- in, out, indices);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- if(pool_info.pool_type == PoolingType::MAX && dst1)
- {
- pooling2_nchw_maxpool_indices<float16_t>(src, dst0, dst1, pool_info, window_src, window);
- }
- else
- {
- Iterator in(src, window_src);
- Iterator out(dst0, window);
- constexpr int pool_size = 2;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-        int           pool_stride_x = 0;
-        int           pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()));
- float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()));
- float16x4_t res = {};
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- top_data = vmul_f16(top_data, top_data);
- bottom_data = vmul_f16(bottom_data, bottom_data);
- }
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float16x4_t scale_v = vdup_n_f16(scale);
-
- const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
- res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
- }
- else
- {
- const float16x4_t max_data = vmax_f16(top_data, bottom_data);
- res = vpmax_f16(max_data, max_data);
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = vinv_f16(vinvsqrt_f16(res));
- }
-
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
- },
- in, out);
- }
-}
-
-void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float16_t res = 0.0f;
- float16x8_t vres = vdupq_n_f16(0.0f);
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- // Perform pooling
-
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 8); x += 8)
- {
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- vres = vaddq_f16(vres, vmulq_f16(data, data));
- }
- else
- {
- vres = vaddq_f16(vres, data);
- }
- }
-
- // Leftover for loop
- for(; x < pool_size_x; ++x)
- {
- float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- data *= data;
- }
-
- res += data;
- }
- }
-
- // Reduction
- float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
- res += vget_lane_f16(tmp, 0);
- res += vget_lane_f16(tmp, 1);
- res += vget_lane_f16(tmp, 2);
- res += vget_lane_f16(tmp, 3);
-
- // Divide by scale
- res *= scale;
- }
- else
- {
-            vres = vdupq_n_f16(std::numeric_limits<float>::lowest()); // reuse the vres declared above; redeclaring it here shadowed that variable
- res = std::numeric_limits<float>::lowest();
-
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 8); x += 8)
- {
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
- vres = vmaxq_f16(vres, data);
- }
-
- // Leftover for loop
- for(; x < pool_size_x; ++x)
- {
- const float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));
- res = std::max(res, data);
- }
- }
-
- float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
- res = std::max(res, vget_lane_f16(tmp, 0));
- res = std::max(res, vget_lane_f16(tmp, 1));
- res = std::max(res, vget_lane_f16(tmp, 2));
- res = std::max(res, vget_lane_f16(tmp, 3));
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
- }
-
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr())) = res;
- },
- in, out);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float res = 0.0f;
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- // Perform pooling
- float32x4_t vres = vdupq_n_f32(0.0f);
-
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 4); x += 4)
- {
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- vres = vmlaq_f32(vres, data, data);
- }
- else
- {
- vres = vaddq_f32(vres, data);
- }
- }
-
- // Leftover for loop
- for(; x < pool_size_x; ++x)
- {
- float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- data *= data;
- }
-
- res += data;
- }
- }
-
-#if defined(__aarch64__)
- // Reduction operation available on 64 bit architectures only
- res += vaddvq_f32(vres);
-#else // __aarch64__
- // Reduction
- float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
- tmp = vpadd_f32(tmp, tmp);
-
- res += vget_lane_f32(tmp, 0);
-#endif // __aarch64__
- // Divide by scale
- res *= scale;
- }
- else
- {
- float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
- res = std::numeric_limits<float>::lowest();
-
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 4); x += 4)
- {
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
- vres = vmaxq_f32(vres, data);
- }
-
- // Leftover for loop
- for(; x < pool_size_x; ++x)
- {
- const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
- res = std::max(res, data);
- }
- }
-#if defined(__aarch64__)
- // Reduction operation available on 64 bit architectures only
- res = std::max(vmaxvq_f32(vres), res);
-#else // __aarch64__
- float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
- tmp = vpmax_f32(tmp, tmp);
-
- res = std::max(res, vget_lane_f32(tmp, 0));
-#endif // __aarch64__
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = res;
- },
- in, out);
-}
-
-void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- if(pool_info.pool_type == PoolingType::MAX && dst1)
- {
- pooling2_nchw_maxpool_indices<float>(src, dst0, dst1, pool_info, window_src, window);
- }
- else
- {
- Iterator in(src, window_src);
- Iterator out(dst0, window);
- constexpr int pool_size = 2;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset());
- const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
- float32x2_t top_data = vld1_f32(in_top_ptr);
- float32x2_t bottom_data = vld1_f32(in_bottom_ptr);
- float32x2_t res = {};
- float final_res = 0;
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- top_data = vmul_f32(top_data, top_data);
- bottom_data = vmul_f32(bottom_data, bottom_data);
- }
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x2_t scale_v = vdup_n_f32(scale);
-
- // Perform pooling
- const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
- res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
- }
- else
- {
- const float32x2_t max_data = vmax_f32(top_data, bottom_data);
- res = vpmax_f32(max_data, max_data);
- }
- final_res = vget_lane_f32(res, 0);
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- final_res = sqrt(final_res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = final_res;
- },
- in, out);
- }
-}
-
-void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- constexpr const int pool_size = 3;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(src_top_ptr + in.offset()));
- float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(src_middle_ptr + in.offset()));
- float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(src_bottom_ptr + in.offset()));
- float32x2_t res = {};
- float final_res = 0;
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- top_data = vmulq_f32(top_data, top_data);
- middle_data = vmulq_f32(middle_data, middle_data);
- bottom_data = vmulq_f32(bottom_data, bottom_data);
- }
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x2_t scale_v = vdup_n_f32(scale);
-
- // Perform pooling
- const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
- res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
- res = vmul_f32(vpadd_f32(res, res), scale_v);
- }
- else
- {
- const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
- res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
- res = vpmax_f32(res, res);
- }
- final_res = vget_lane_f32(res, 0);
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- final_res = sqrt(final_res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = final_res;
- },
- in, out);
-}
-
-void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- constexpr const int pool_size = 7;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- std::array<const uint8_t *, pool_size> src_ptrs{ {} };
- for(int i = 0; i < pool_size; ++i)
- {
- src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
- }
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float32x2_t res = {};
- float final_res = 0.f;
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x2_t scale_v = vdup_n_f32(scale);
-
- // Perform pooling
- float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[0] + in.offset()));
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- data.val[0] = vmulq_f32(data.val[0], data.val[0]);
- data.val[1] = vmulq_f32(data.val[1], data.val[1]);
- }
- float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
- for(int i = 1; i < pool_size; ++i)
- {
- data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[i] + in.offset()));
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- data.val[0] = vmulq_f32(data.val[0], data.val[0]);
- data.val[1] = vmulq_f32(data.val[1], data.val[1]);
- }
- sum_data = vaddq_f32(sum_data, data.val[0]);
- sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
- }
- res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
- res = vmul_f32(vpadd_f32(res, res), scale_v);
- }
- else
- {
- float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[0] + in.offset()));
- for(int i = 1; i < pool_size; ++i)
- {
- const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[i] + in.offset()));
- max_data = vmax2q_f32(max_data, data);
- }
- res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
- res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
- res = vpmax_f32(res, res);
- }
- final_res = vget_lane_f32(res, 0);
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- final_res = sqrt(final_res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = final_res;
- },
- in, out);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // ENABLE_NCHW_KERNELS \ No newline at end of file
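pooling7_fp32_neon_nchw above covers each 7-wide row with a single de-interleaving 8-float load rather than a 7-element loop: vld2q_f32 splits eight consecutive floats into even and odd lanes, and zeroing lane 3 of the odd half drops the eighth element before accumulation. A minimal sketch of that row load (the function name is illustrative):

#include <arm_neon.h>

// vld2q_f32(row) returns val[0] = {row[0], row[2], row[4], row[6]} and
// val[1] = {row[1], row[3], row[5], row[7]}. Zeroing lane 3 of val[1]
// discards row[7], so the returned vector holds partial sums of exactly the
// seven row elements; pairwise adds then collapse it to a scalar, as in the
// kernel above.
static inline float32x4_t row7_partial_sum(const float *row)
{
    const float32x4x2_t d = vld2q_f32(row);
    return vaddq_f32(d.val[0], vsetq_lane_f32(0.f, d.val[1], 3));
}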
diff --git a/src/core/cpu/kernels/pooling/neon/qasymm8.cpp b/src/core/cpu/kernels/pooling/neon/qasymm8.cpp
deleted file mode 100644
index af62ede13f..0000000000
--- a/src/core/cpu/kernels/pooling/neon/qasymm8.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pooling/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- poolingMxN_q8_neon_nhwc<uint8_t>(src, dst0, dst1, pool_info, window_src, window);
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/cpu/kernels/pooling/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/pooling/neon/qasymm8_signed.cpp
deleted file mode 100644
index 2c4b095225..0000000000
--- a/src/core/cpu/kernels/pooling/neon/qasymm8_signed.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pooling/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- poolingMxN_q8_neon_nhwc<int8_t>(src, dst0, dst1, pool_info, window_src, window);
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
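
The two deleted translation units above are identical one-line dispatchers into the shared poolingMxN_q8_neon_nhwc<T> template from quantized.h; only the element type differs. A sketch of the pattern, with pooling_impl and the two entry points below as hypothetical names: keeping one non-template symbol per data type lets each type be compiled and registered independently while the implementation lives in a single header.

#include <cstdint>

// Shared templated implementation (declared in a common header).
template <typename T>
void pooling_impl(const T *src, T *dst, int n);

// One thin entry point per element type, each in its own .cpp file.
void pooling_qasymm8(const uint8_t *src, uint8_t *dst, int n)
{
    pooling_impl<uint8_t>(src, dst, n); // QASYMM8
}

void pooling_qasymm8_signed(const int8_t *src, int8_t *dst, int n)
{
    pooling_impl<int8_t>(src, dst, n); // QASYMM8_SIGNED
}
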
diff --git a/src/core/cpu/kernels/pooling/neon/quantized.h b/src/core/cpu/kernels/pooling/neon/quantized.h
deleted file mode 100644
index a16960a205..0000000000
--- a/src/core/cpu/kernels/pooling/neon/quantized.h
+++ /dev/null
@@ -1,863 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H
-#define SRC_CORE_NEON_KERNELS_QUANTIZED_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
-quantize(float val, const UniformQuantizationInfo &info)
-{
- return quantize_qasymm8_signed(val, info);
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
-quantize(float val, const UniformQuantizationInfo &info)
-{
- return quantize_qasymm8(val, info);
-}
-
-template <typename T>
-inline T vcvtq_q32_f32(float32x4_t values);
-
-template <>
-inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
-{
- return vcvtq_u32_f32(values);
-}
-
-template <>
-inline int32x4_t vcvtq_q32_f32(float32x4_t values)
-{
- return vcvtq_s32_f32(values);
-}
-
-template <typename T>
-inline float32x4_t vcvtq_f32_q32(T values);
-
-template <>
-inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
-{
- return vcvtq_f32_u32(values);
-}
-
-template <>
-inline float32x4_t vcvtq_f32_q32(int32x4_t values)
-{
- return vcvtq_f32_s32(values);
-}
-
-template <typename Tout>
-inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
-
-template <>
-inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
-{
- const float new_scale = quant_rescale / scale_pooling;
- return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
-}
-
-template <>
-inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
-{
- const float new_scale = quant_rescale / scale_pooling;
- return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
-}
-
-template <typename Tin, typename Tout>
-inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);
-
-template <>
-inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
-{
- const float32x4x4_t acc =
- {
- {
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
- }
- };
- return vquantize(acc, requant_qinfo);
-}
-
-template <>
-inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
-{
- const float32x4x4_t acc =
- {
- {
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
- }
- };
- return vquantize_signed(acc, requant_qinfo);
-}
-
-template <typename T>
-inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);
-
-template <>
-inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
-{
- const float32x4x2_t acc =
- {
- {
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
- }
- };
- return vquantize(acc, requant_qinfo);
-}
-
-template <>
-inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
-{
- const float32x4x2_t acc =
- {
- {
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
- }
- };
- return vquantize_signed(acc, requant_qinfo);
-}
-
-inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- int start_x = id[idx_width] * stride_x - pad_x;
- int start_y = id[idx_height] * stride_y - pad_y;
-
- const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
- const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
- if(exclude_padding)
- {
- start_x = std::max(0, start_x);
- start_y = std::max(0, start_y);
- }
- return 1.f / ((end_y - start_y) * (end_x - start_x));
-}
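
For clarity, a worked instance of the scale computed above, assuming a 3x3 window, padding 1 and stride 1 at the top-left output position (the numbers follow directly from the code):

// id = (0, 0): start = 0 * 1 - 1 = -1, end = min(-1 + 3, upper_bound) = 2 per axis
// exclude_padding == true : start clamped to 0 -> area = 2 * 2, scale = 1 / 4 = 0.25
// exclude_padding == false: start stays at -1  -> area = 3 * 3, scale = 1 / 9 ~ 0.111
// i.e. the averaging denominator counts only the valid elements when padding is
// excluded, and the full window (padding included) otherwise.
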
-
-template <typename T>
-void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16;
- const int window_half_step_x = window_step_x / 2;
-
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, window_src);
- Iterator out(dst0, window_out);
-
- using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
- using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
- using q16_t = typename wrapper::traits::promote_t<T>;
- using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
- using q32_t = typename wrapper::traits::promote_t<q16_t>;
- using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
-
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
- const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
-
- const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
- // "new_offset" doesn't have to consider the "half_scale_v" in its computation
- // With a requantization performed in a single step there won't be uncertainties introduced
- const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
-
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- if(pool_info.pool_type != PoolingType::MAX)
- {
- q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
-
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- // Perform pooling
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
- const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
- vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
- vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
- vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
- vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
- }
- }
-
- if(src_qinfo != dst_qinfo)
- {
- const float32x4x4_t vres =
- {
- {
- vcvtq_f32_q32(vres1),
- vcvtq_f32_q32(vres2),
- vcvtq_f32_q32(vres3),
- vcvtq_f32_q32(vres4),
- }
- };
- const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
- }
- else
- {
- const float32x4_t scale_v = vdupq_n_f32(scale);
- // Multiply by the reciprocal-of-area scale (i.e. divide by the pooling window size) and add 0.5f to round to nearest instead of truncating towards zero
- vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
- vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
- vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
- vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
-
- const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
- const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
- }
- }
- else
- {
- q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
-
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = wrapper::vmax(vres, data);
- }
- }
-
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
- requant_qinfo) :
- vres);
- }
- }
-
- if(pool_info.pool_type == PoolingType::MAX)
- {
- for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
- {
- q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = wrapper::vmax(vres, data);
- }
- }
-
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
- (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
- }
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- if(pool_info.pool_type != PoolingType::MAX)
- {
- q32_t res = static_cast<q32_t>(0.f);
-
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- // Perform pooling
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res += data;
- }
- }
-
- if(src_qinfo != dst_qinfo)
- {
- const float res_f = static_cast<float>(res);
- const float new_scale = quant_rescale / scale;
- const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
-
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
- }
- else
- {
- // Multiply by the reciprocal-of-area scale (i.e. divide by the pooling window size) and add 0.5f to round to nearest instead of truncating towards zero
- res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
-
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
- }
- }
- else
- {
- T res = std::numeric_limits<T>::min();
-
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res = std::max(res, data);
- }
- }
-
- // Store result
- if(src_qinfo != dst_qinfo)
- {
- const float res_f = static_cast<float>(res);
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
- }
- else
- {
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
- }
- }
- }
-
- },
- in, out);
-}
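
The requantization constants set up at the top of the function collapse dequantize, average and quantize into one affine step. A sketch of the equivalent scalar math, assuming scale == 1 / count (a window with no excluded padding) and ignoring rounding differences:

// Per element, the real value is r = (q - src_off) * src_scale, so the average is
//   avg(r) = scale * sum(q - src_off) * src_scale
// Quantizing to the destination (q_dst = round(v / dst_scale) + dst_off) gives
//   q_dst = round(sum(q) * scale * src_scale / dst_scale)
//           + dst_off - src_off * src_scale / dst_scale
// which matches quantize(sum(q), {quant_rescale / scale, new_offset}) with
//   quant_rescale = dst_scale / src_scale
//   new_offset    = dst_off - src_off / quant_rescale
// as computed above, so a single vquantize call on the raw integer sums suffices.
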
-
-#if defined(ENABLE_NCHW_KERNELS)
-template <typename T, typename TVec>
-inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
- const int pool_size, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x = (id.x() + id_offset) * stride_x - pad_x;
- int start_y = id.y() * stride_y - pad_y;
- const int end_y = std::min(start_y + pool_size, upper_bound_h);
- if(exclude_padding)
- {
- start_y = std::max(0, start_y);
- }
-
- std::array<T, 8> elems =
- {
- {
- wrapper::vgetlane(v, 0),
- wrapper::vgetlane(v, 1),
- wrapper::vgetlane(v, 2),
- wrapper::vgetlane(v, 3),
- wrapper::vgetlane(v, 4),
- wrapper::vgetlane(v, 5),
- wrapper::vgetlane(v, 6),
- wrapper::vgetlane(v, 7),
- }
- };
-
- for(auto &el : elems)
- {
- int c_start_x = start_x;
- const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
- if(exclude_padding)
- {
- c_start_x = std::max(0, c_start_x);
- }
- float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
- el *= scale;
- start_x += step * stride_x;
- }
-
- v = wrapper::vsetlane(elems[0], v, 0);
- v = wrapper::vsetlane(elems[1], v, 1);
- v = wrapper::vsetlane(elems[2], v, 2);
- v = wrapper::vsetlane(elems[3], v, 3);
- v = wrapper::vsetlane(elems[4], v, 4);
- v = wrapper::vsetlane(elems[5], v, 5);
- v = wrapper::vsetlane(elems[6], v, 6);
- v = wrapper::vsetlane(elems[7], v, 7);
-}
-
-template <typename T>
-void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- /** SIMD vector types */
- using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
- using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
- using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
- using q16_t = typename wrapper::traits::promote_t<T>;
- using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type;
- using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
- using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
-
- constexpr int pool_size = 2;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
- const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
-
- const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
-
- const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
- const bool have_different_qinfo = src_qinfo != dst_qinfo;
-
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto top_data = wrapper::vloadq(src_top_ptr + in.offset());
- const auto bottom_data = wrapper::vloadq(src_bottom_ptr + in.offset());
- q8x8_t lower_res = {};
- q8x8_t upper_res = {};
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
- const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
-
- // Add rows
- const q16x8x2_t vrsum =
- {
- {
- wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
- wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
- }
- };
-
- // Pair-wise add row data
- const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
- const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
-
- q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
-
- // Scale lower result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- lower_res = wrapper::vmovn(res_lower);
-
- // Compute upper result for stride_x == 1
- if(pool_stride_x == 1)
- {
- // Shifted row sum
- const q16x8x2_t vrsum_shifted =
- {
- {
- wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
- wrapper::vext_1(vrsum.val[1], vrsum.val[1])
- }
- };
-
- // Pair-wise add shifted row
- q16x8_t res_upper = wrapper::vcombine(
- wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
- wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
-
- // Scale upper result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- upper_res = wrapper::vmovn(res_upper);
- }
- }
- else
- {
- const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
- lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
- if(pool_stride_x == 1)
- {
- const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
- upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
- }
- }
-
- if(have_different_qinfo)
- {
- const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
- lower_res = wrapper::vgetlow(requantized_dst);
- upper_res = wrapper::vgethigh(requantized_dst);
- }
-
- // Store result
- if(pool_stride_x == 1)
- {
- const q8x8x2_t res = { { lower_res, upper_res } };
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()), res);
- }
- else
- {
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()), lower_res);
- }
- },
- in, out);
-}
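
A note on the stride-1 path above, which produces two result vectors per iteration; this reading follows from the intrinsics used, offered as an aid rather than as documentation:

// vpadd on the row sums pairs lanes (0+1, 2+3, ...) -> window sums for even x starts;
// vpadd on the vext_1-shifted sums pairs (1+2, 3+4, ...) -> sums for odd x starts;
// the final vstore of the q8x8x2_t {lower_res, upper_res} is an interleaving store
// (vst2), which writes the even/odd results back in natural output order.
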
-
-template <typename T>
-void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- /** SIMD vector types */
- using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
- using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
- using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
- using q16_t = typename wrapper::traits::promote_t<T>;
- using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
- using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
-
- constexpr int pool_size = 3;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
-
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
-
- const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
- const T *const src_middle_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
- const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto top_data = wrapper::vloadq(src_top_ptr + in.offset());
- const auto middle_data = wrapper::vloadq(src_middle_ptr + in.offset());
- const auto bottom_data = wrapper::vloadq(src_bottom_ptr + in.offset());
- q8x8_t fres = {};
- q8x16_t fqres = {};
-
- if(pool_info.pool_type == PoolingType::AVG)
- {
- // Widen data to q16 (u16 for QASYMM8, s16 for QASYMM8_SIGNED)
- const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
- const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
- const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
-
- // Calculate row sums
- const q16x8x2_t vrsum =
- {
- {
- wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
- wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
- }
- };
- const q16x8x2_t vrsum_shifted_1 =
- {
- {
- wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
- wrapper::vext_1(vrsum.val[1], vrsum.val[1])
- }
- };
- const q16x8x2_t vrsum_shifted_2 =
- {
- {
- wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
- wrapper::vext_2(vrsum.val[1], vrsum.val[1])
- }
- };
- // Calculate final sum
- q16x8x2_t final_sum =
- {
- {
- wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
- wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
- }
- };
- if(pool_stride_x == 2)
- {
- q16x8_t res =
- {
- wrapper::vgetlane(final_sum.val[0], 0),
- wrapper::vgetlane(final_sum.val[0], 2),
- wrapper::vgetlane(final_sum.val[0], 4),
- wrapper::vgetlane(final_sum.val[0], 6),
- wrapper::vgetlane(final_sum.val[1], 0),
- wrapper::vgetlane(final_sum.val[1], 2),
- wrapper::vgetlane(final_sum.val[1], 4),
- wrapper::vgetlane(final_sum.val[1], 6),
- };
-
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- fres = wrapper::vmovn(res);
- }
- else
- {
- // Scale lower result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- // Scale upper result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
- }
- }
- else
- {
- const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
- const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
- const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
- const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
-
- if(pool_stride_x == 2)
- {
- const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
- static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
- fres = wrapper::vtbl(table, lookup_val);
- }
- else
- {
- fqres = final_max;
- }
- }
-
- // Store result
- if(pool_stride_x == 1)
- {
- if(src_qinfo != dst_qinfo)
- {
- fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
- }
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()), fqres);
- }
- else
- {
- if(src_qinfo != dst_qinfo)
- {
- fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
- }
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()), fres);
- }
- },
- in, out);
-}
-
-template <typename T>
-void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- /** SIMD vector types */
- using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
- using q16_t = typename wrapper::traits::promote_t<T>;
- using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
- using q32_t = typename wrapper::traits::promote_t<q16_t>;
- using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
-
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- T res = std::numeric_limits<T>::min();
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32_t sres = 0;
-
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- // Perform pooling
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 8); x += 8)
- {
- const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
-
- const q16x8_t data_q16 = wrapper::vmovl(data);
- vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
- }
-
- // Leftover for loop
- for(; x < pool_size_x; ++x)
- {
- T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
- sres += data;
- }
- }
-
- // Reduction
- const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres));
- sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1);
-
- // Average by multiplying with the reciprocal-of-area scale, then round to nearest
- res = static_cast<T>(support::cpp11::round(sres * scale));
- }
- else
- {
- q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
-
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 8); x += 8)
- {
- const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
- vres = wrapper::vmax(vres, data);
- }
- // Leftover for loop
- for(; x < pool_size_x; ++x)
- {
- const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
- res = std::max(res, data);
- }
- }
-
- // Reduce max
- vres = wrapper::vpmax(vres, vres);
- vres = wrapper::vpmax(vres, vres);
- vres = wrapper::vpmax(vres, vres);
-
- // Get max value
- res = std::max(res, wrapper::vgetlane(vres, 0));
- }
- // Store result
- res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) : res;
- *(reinterpret_cast<T *>(out.ptr())) = res;
- },
- in, out);
-}
-#endif /* defined(ENABLE_NCHW_KERNELS) */
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H
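
The quantize<T> helpers at the top of the deleted header wrap ACL's quantize_qasymm8(_signed). The underlying affine mapping is the usual one; a minimal self-contained sketch, assuming round-to-nearest and saturation to the type's range (ACL's exact rounding policy may differ):

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QInfo
{
    float   scale;  // step size of one quantization level
    int32_t offset; // zero point
};

// quantize: real -> quantized, saturating to [0, 255]
inline uint8_t quantize_u8(float v, const QInfo &q)
{
    const int32_t r = static_cast<int32_t>(std::lround(v / q.scale)) + q.offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, r)));
}

// dequantize: quantized -> real
inline float dequantize_u8(uint8_t v, const QInfo &q)
{
    return (static_cast<int32_t>(v) - q.offset) * q.scale;
}
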
diff --git a/src/core/cpu/kernels/scale/neon/fp16.cpp b/src/core/cpu/kernels/scale/neon/fp16.cpp
deleted file mode 100644
index 0ad66cab1c..0000000000
--- a/src/core/cpu/kernels/scale/neon/fp16.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-
-namespace arm_compute
-{
-namespace
-{
-void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int window_step_x = 8;
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
- {
- *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
-}
-
-void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
- // Note: std::is_same<float16_t, float16_t> is always true, a leftover from the
- // generic template this file was specialized from; ConstType is always half
- using ConstType = half;
-
- const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file
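
Both bilinear paths above delegate the four-neighbour blend to scale_helpers::delta_bilinear. Its weighting is assumed here to be standard bilinear interpolation; a sketch under that assumption, with delta_bilinear_sketch as a hypothetical stand-in:

// dx, dy in [0, 1] are the fractional distances from the top-left sample a00.
inline float delta_bilinear_sketch(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float w00 = (1.f - dx) * (1.f - dy); // top-left
    const float w01 = dx * (1.f - dy);         // top-right
    const float w10 = (1.f - dx) * dy;         // bottom-left
    const float w11 = dx * dy;                 // bottom-right
    return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
}
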
diff --git a/src/core/cpu/kernels/scale/neon/integer.cpp b/src/core/cpu/kernels/scale/neon/integer.cpp
deleted file mode 100644
index a2359aac94..0000000000
--- a/src/core/cpu/kernels/scale/neon/integer.cpp
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace
-{
-void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int window_step_x = 16;
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
- {
- *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
-}
-
-void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int window_step_x = 8;
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
- {
- *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
-}
-
-void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const int16_t const_border_value = static_cast<int16_t>(constant_border_value.get<int16_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-
-void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- s16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
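
The u8 and s16 NEON kernels above all funnel their four neighbour samples through scale_helpers::delta_bilinear. As a reading aid, here is a minimal standalone sketch of the standard bilinear blend that helper is assumed to perform; the name delta_bilinear_sketch and the float accumulator are illustrative, not the library implementation.

// Bilinear blend of the four neighbours (a00 top-left, a01 top-right,
// a10 bottom-left, a11 bottom-right) with fractional offsets dx, dy in [0, 1].
template <typename T>
inline float delta_bilinear_sketch(T a00, T a01, T a10, T a11, float dx, float dy)
{
    const float w_x0 = 1.f - dx; // weight of the left column
    const float w_x1 = dx;       // weight of the right column
    const float w_y0 = 1.f - dy; // weight of the top row
    const float w_y1 = dy;       // weight of the bottom row

    // Blend along x within each row, then blend the two rows along y.
    const float top    = static_cast<float>(a00) * w_x0 + static_cast<float>(a01) * w_x1;
    const float bottom = static_cast<float>(a10) * w_x0 + static_cast<float>(a11) * w_x1;
    return top * w_y0 + bottom * w_y1;
}

With dx == 0 and dy == 0 the blend reduces to a00, the top-left neighbour.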
diff --git a/src/core/cpu/kernels/scale/neon/list.h b/src/core/cpu/kernels/scale/neon/list.h
deleted file mode 100644
index c91242f5b2..0000000000
--- a/src/core/cpu/kernels/scale/neon/list.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_SCALE_LIST_H
-#define SRC_CORE_NEON_KERNELS_SCALE_LIST_H
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_SCALE_KERNEL(func_name) \
- void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \
- bool align_corners, const Window &window)
-
-DECLARE_SCALE_KERNEL(qasymm8_neon_scale);
-DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale);
-
-#undef DECLARE_SCALE_KERNEL
-
-template <typename T>
-void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset,
- bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int window_step_x = 16 / sizeof(T);
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const T *in_ptr = reinterpret_cast<const T *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
- {
- *(reinterpret_cast<T *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
-}
-
-template <typename T>
-void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- using ConstType = T;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const T *in_ptr = reinterpret_cast<const T *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-template <typename T>
-void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- nearest_neon_scale<T>(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_SCALE_LIST_H */
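
Every kernel declared in this header derives its vertical mapping from scale_utils::calculate_resize_ratio. A hedged sketch of the convention it is assumed to follow (with align_corners, the first and last samples of input and output coincide; the output_size > 1 guard is this sketch's own assumption):

// Ratio of input extent to output extent along one axis.
inline float calculate_resize_ratio_sketch(size_t input_size, size_t output_size, bool align_corners)
{
    if(align_corners && output_size > 1)
    {
        // Corner-aligned: N output samples span N - 1 input intervals.
        return static_cast<float>(input_size - 1) / static_cast<float>(output_size - 1);
    }
    return static_cast<float>(input_size) / static_cast<float>(output_size);
}

The kernels then map an output row z to an input row as (z + sampling_offset) * hr, flooring it on the bilinear paths and rounding half away from zero on the align-corners nearest-neighbour path.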
diff --git a/src/core/cpu/kernels/scale/neon/qasymm8.cpp b/src/core/cpu/kernels/scale/neon/qasymm8.cpp
deleted file mode 100644
index 90302ce889..0000000000
--- a/src/core/cpu/kernels/scale/neon/qasymm8.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/scale/neon/list.h"
-
-namespace arm_compute
-{
-namespace
-{
-void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Data layout is NHWC
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-    // Note: win_off is configured below but never used to construct an iterator;
-    // the offsets/dx/dy tables are read directly via ptr_to_element.
-    Window win_off;
- win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
-    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- Iterator in(src, win_in);
- Iterator out(dst, window);
-
- const int32_t in_dim_w = src->info()->dimension(1);
- const int32_t in_dim_h = src->info()->dimension(2);
- const int32_t stride_w = src->info()->strides_in_bytes()[1];
- const int32_t stride_h = src->info()->strides_in_bytes()[2];
-
- const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
-
- auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
- const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
- const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
- const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
-
- const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void qasymm8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- nearest_neon_scale<uint8_t>(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
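
Unlike the plain integer kernels, the QASYMM8 bilinear path dequantizes the four neighbours, blends in float, and re-quantizes against the output quantization info. A standalone sketch of that round trip, assuming the usual asymmetric mapping real = scale * (q - offset); the helper names and the explicit [0, 255] clamp are illustrative stand-ins for Qasymm8QuantizationHelper<uint8_t>:

#include <algorithm>
#include <cmath>
#include <cstdint>

// q -> real value under an asymmetric (scale, offset) quantization.
inline float dequantize_sketch(uint8_t q, float scale, int32_t offset)
{
    return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
}

// real value -> q, rounded to nearest and clamped to the uint8_t range.
inline uint8_t quantize_sketch(float v, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

Blending after dequantization keeps the interpolation linear in real values, which matters because the source and destination tensors may carry different (scale, offset) pairs (iq_info versus oq_info above).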
diff --git a/src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp
deleted file mode 100644
index 07d6c6ef03..0000000000
--- a/src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/scale/neon/list.h"
-
-namespace arm_compute
-{
-namespace
-{
-void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Data layout is NHWC
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-    // Note: win_off is configured below but never used to construct an iterator;
-    // the offsets/dx/dy tables are read directly via ptr_to_element.
-    Window win_off;
- win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
-    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-    win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- Iterator in(src, win_in);
- Iterator out(dst, window);
-
- const int32_t in_dim_w = src->info()->dimension(1);
- const int32_t in_dim_h = src->info()->dimension(2);
- const int32_t stride_w = src->info()->strides_in_bytes()[1];
- const int32_t stride_h = src->info()->strides_in_bytes()[2];
-
- const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const int8_t const_border_value = static_cast<int8_t>(constant_border_value.get<int8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
-
- auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
- const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
- const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
- const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
-
- const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void qasymm8_signed_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- nearest_neon_scale<int8_t>(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
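
The signed variant mirrors the unsigned one line for line, differing only in int8_t storage and the implied [-128, 127] clamp at the quantize step. Both REPLICATE branches, like every bilinear kernel in this patch, handle the border purely by clamping the neighbour coordinates; a small sketch, assuming utility::clamp matches std::clamp semantics:

#include <algorithm>

struct NeighboursSketch
{
    int w0, w1, h0, h1; // clamped coordinates of the four neighbours
};

// For a sample anchored at (w, h) in a width x height plane, clamp the
// 2x2 neighbourhood into range so edge pixels repeat outward.
inline NeighboursSketch clamp_neighbours_sketch(int w, int h, int width, int height)
{
    return NeighboursSketch{std::clamp(w, 0, width - 1),
                            std::clamp(w + 1, 0, width - 1),
                            std::clamp(h, 0, height - 1),
                            std::clamp(h + 1, 0, height - 1)};
}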
diff --git a/src/core/cpu/kernels/scale/sve/fp16.cpp b/src/core/cpu/kernels/scale/sve/fp16.cpp
deleted file mode 100644
index 5b9377c6e6..0000000000
--- a/src/core/cpu/kernels/scale/sve/fp16.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_sve.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace
-{
-void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
- {
- // Store results
- svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x));
-
- x += svcntw();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- out);
-}
-
-void fp16_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
-        // For T = float16_t the generic template's std::conditional always selects 'half', so name it directly.
-        using ConstType = half;
-
- const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- fp16_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // ENABLE_SVE
\ No newline at end of file
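
The SVE kernels replace NEON's vector-loop-plus-scalar-tail shape with a single predicated loop: svwhilelt yields a partial predicate on the ragged tail, so no epilogue is needed. Below is a minimal standalone sketch of that pattern for the f32 case (an AArch64 toolchain with SVE enabled is assumed). One observation about the file above: fp16_sve_scale_nearest builds b16 predicates but advances x by svcntw(), i.e. half a vector of 16-bit lanes per iteration, so successive stores overlap; the result is still correct (the overlapping lanes are rewritten with the same values), merely conservative. The sketch advances by the lane count matching its predicate width.

#include <arm_sve.h>
#include <cstdint>

// Predicated copy of n floats; the final iteration runs with a partial predicate.
void sve_copy_row_sketch(const float *src, float *dst, int32_t n)
{
    int32_t  x  = 0;
    svbool_t pg = svwhilelt_b32(x, n); // lane i active while x + i < n
    do
    {
        svst1_f32(pg, dst + x, svld1_f32(pg, src + x)); // predicated load/store
        x += svcntw();                                  // 32-bit lanes per vector
        pg = svwhilelt_b32(x, n);
    }
    while(svptest_any(svptrue_b32(), pg)); // stop once no lane remains active
}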
diff --git a/src/core/cpu/kernels/scale/sve/fp32.cpp b/src/core/cpu/kernels/scale/sve/fp32.cpp
deleted file mode 100644
index 05fbedf20d..0000000000
--- a/src/core/cpu/kernels/scale/sve/fp32.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <cmath>
-#include <cstddef>
-
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace
-{
-void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const float *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<float *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b32(x, window_end_x);
- do
- {
- // Store results
- svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x));
-
- x += svcntw();
- pg = svwhilelt_b32(x, window_end_x);
- }
- while(svptest_any(svptrue_b32(), pg));
- },
- out);
-}
-
-void fp32_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const float const_border_value = static_cast<float>(constant_border_value.get<float>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const float *in_ptr = reinterpret_cast<const float *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<float *>(out.ptr()) = static_cast<float>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const float *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const float *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const float *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const float *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<float *>(out.ptr()) = static_cast<float>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- fp32_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // ENABLE_SVE
\ No newline at end of file
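
All of these NHWC kernels index the padded input with element strides rather than byte strides: in_stride_c spans one padded channel run (one step along W) and in_stride_wc spans one padded row of runs (one step along H). A tiny sketch of that index arithmetic; the struct and names are illustrative only:

#include <cstddef>

struct PaddedNhwcStridesSketch
{
    size_t stride_c;  // channels + left/right padding (elements per W step)
    size_t stride_wc; // stride_c * (width + top/bottom padding) (elements per H step)
};

// Element index of channel c at spatial position (w, h) within one batch plane.
inline size_t element_index_sketch(const PaddedNhwcStridesSketch &s, size_t c, size_t w, size_t h)
{
    return c + w * s.stride_c + h * s.stride_wc;
}

This matches in_ptr[offset + offset_row + x] in the nearest-neighbour loops, where offset is the precomputed W index already scaled by in_stride_c and offset_row supplies in_hi * in_stride_wc.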
diff --git a/src/core/cpu/kernels/scale/sve/integer.cpp b/src/core/cpu/kernels/scale/sve/integer.cpp
deleted file mode 100644
index d7e270c661..0000000000
--- a/src/core/cpu/kernels/scale/sve/integer.cpp
+++ /dev/null
@@ -1,300 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_sve.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace
-{
-void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- // Store results
- svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
-
- x += svcntw();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- out);
-}
-
-void u8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<int16_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
- {
- // Store results
- svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x));
-
- x += svcntw();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- out);
-}
-
-void s16_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const int16_t const_border_value = static_cast<int16_t>(constant_border_value.get<int16_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- u8_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- u8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-
-void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- s16_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- s16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // ENABLE_SVE
\ No newline at end of file
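The nearest-neighbour kernels in the file above all share one SVE idiom: build a lane mask with svwhilelt, load and store through it, advance by the per-vector lane count, and stop once the mask goes all-false. A minimal standalone sketch of that loop for 16-bit data (assuming only an SVE-enabled toolchain; the flat row pointers stand in for the kernel's precomputed offsets):

    #include <arm_sve.h>
    #include <cstdint>

    // Predicated copy of n int16_t elements, mirroring the whilelt/ld1/st1
    // loop of the nearest-neighbour kernels above. svcnth() advances by the
    // number of 16-bit lanes, matching the b16 predicate width.
    void copy_row_s16(const int16_t *src, int16_t *dst, int n)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b16(x, n);
        do
        {
            svst1_s16(pg, dst + x, svld1_s16(pg, src + x));
            x += static_cast<int>(svcnth());
            pg = svwhilelt_b16(x, n);
        } while(svptest_any(svptrue_b16(), pg));
    }

No scalar tail loop is needed: the final whilelt simply yields a partial mask.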
diff --git a/src/core/cpu/kernels/scale/sve/list.h b/src/core/cpu/kernels/scale/sve/list.h
deleted file mode 100644
index b9c3a10a78..0000000000
--- a/src/core/cpu/kernels/scale/sve/list.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_SCALE_LIST_H
-#define SRC_CORE_SVE_KERNELS_SCALE_LIST_H
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_SCALE_KERNEL(func_name) \
- void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \
- bool align_corners, const Window &window)
-
-DECLARE_SCALE_KERNEL(fp16_sve_scale);
-DECLARE_SCALE_KERNEL(fp32_sve_scale);
-DECLARE_SCALE_KERNEL(s16_sve_scale);
-DECLARE_SCALE_KERNEL(u8_sve_scale);
-DECLARE_SCALE_KERNEL(qasymm8_sve_scale);
-DECLARE_SCALE_KERNEL(qasymm8_signed_sve_scale);
-
-#undef DECLARE_SCALE_KERNEL
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_SVE_KERNELS_SCALE_LIST_H */
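DECLARE_SCALE_KERNEL above stamps one shared signature onto each type-specialised entry point, so every kernel can sit behind a single function-pointer type. A sketch of how a caller might pick from such a list; the selector and the DataType mapping are assumptions for illustration, not code from this repository:

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/Window.h"

    using namespace arm_compute;

    // Hypothetical dispatch over the declared kernels: one pointer type for
    // the shared signature, one switch from DataType to implementation.
    using ScaleKernelPtr = void (*)(const ITensor *, ITensor *, const ITensor *, const ITensor *,
                                    const ITensor *, InterpolationPolicy, BorderMode, PixelValue,
                                    float, bool, const Window &);

    ScaleKernelPtr select_sve_scale_kernel(DataType dt)
    {
        switch(dt)
        {
            case DataType::F16:            return &cpu::fp16_sve_scale;
            case DataType::F32:            return &cpu::fp32_sve_scale;
            case DataType::S16:            return &cpu::s16_sve_scale;
            case DataType::U8:             return &cpu::u8_sve_scale;
            case DataType::QASYMM8:        return &cpu::qasymm8_sve_scale;
            case DataType::QASYMM8_SIGNED: return &cpu::qasymm8_signed_sve_scale;
            default:                       return nullptr;
        }
    }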
diff --git a/src/core/cpu/kernels/scale/sve/qasymm8.cpp b/src/core/cpu/kernels/scale/sve/qasymm8.cpp
deleted file mode 100644
index f747037938..0000000000
--- a/src/core/cpu/kernels/scale/sve/qasymm8.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_sve.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace
-{
-void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- // Store results
- svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
-
-            x += svcntb(); // advance by the byte lane count, matching svwhilelt_b8
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- out);
-}
-
-void qasymm8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Data layout is NHWC
- const int idx_width = 1;
- const int idx_height = 2;
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners);
- Window win_off;
- win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(idx_width, Window::Dimension(0, 0, 0));
- win_in.set(idx_height, Window::Dimension(0, 0, 0));
-
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- Iterator in(src, win_in);
- Iterator out(dst, window);
-
- const int32_t in_dim_w = src->info()->dimension(idx_width);
- const int32_t in_dim_h = src->info()->dimension(idx_height);
- const int32_t stride_w = src->info()->strides_in_bytes()[idx_width];
- const int32_t stride_h = src->info()->strides_in_bytes()[idx_height];
-
- const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
-
- auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
- const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
- const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
- const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
-
- const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- qasymm8_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // defined(ENABLE_SVE)
\ No newline at end of file
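The bilinear branch above is a dequantize-interpolate-requantize round trip: the four neighbouring taps are mapped to float, blended by delta_bilinear with the fractional offsets (dx, dy), and the result is quantized back. A scalar sketch of that arithmetic, assuming the usual affine scheme real = scale * (q - offset); the helper names are illustrative, not the library's API:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    float dequantize_u8(uint8_t q, float scale, int32_t offset)
    {
        return scale * (static_cast<int32_t>(q) - offset);
    }

    uint8_t quantize_u8(float v, float scale, int32_t offset)
    {
        const int32_t q = static_cast<int32_t>(std::lround(v / scale)) + offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, q))); // clamp to the u8 range
    }

    // Weighted average of the 2x2 neighbourhood; (dx, dy) are the fractional
    // distances from the top-left tap.
    float delta_bilinear(float a00, float a01, float a10, float a11, float dx, float dy)
    {
        const float w1 = (1.f - dx) * (1.f - dy); // top-left
        const float w2 = dx * (1.f - dy);         // top-right
        const float w3 = (1.f - dx) * dy;         // bottom-left
        const float w4 = dx * dy;                 // bottom-right
        return a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
    }

With dx = dy = 0 the weights collapse to (1, 0, 0, 0) and the top-left pixel passes through unchanged, which is a quick sanity check for the weight layout.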
diff --git a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp
deleted file mode 100644
index 584ec7a0da..0000000000
--- a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_sve.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace
-{
-void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const int8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<int8_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- // Store results
- svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x));
-
-            x += svcntb(); // advance by the byte lane count, matching svwhilelt_b8
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- out);
-}
-
-void qasymm8_signed_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Data layout is NHWC
- const int idx_width = 1;
- const int idx_height = 2;
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners);
- Window win_off;
- win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(idx_width, Window::Dimension(0, 0, 0));
- win_in.set(idx_height, Window::Dimension(0, 0, 0));
-
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- Iterator in(src, win_in);
- Iterator out(dst, window);
-
- const int32_t in_dim_w = src->info()->dimension(idx_width);
- const int32_t in_dim_h = src->info()->dimension(idx_height);
- const int32_t stride_w = src->info()->strides_in_bytes()[idx_width];
- const int32_t stride_h = src->info()->strides_in_bytes()[idx_height];
-
- const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const int8_t const_border_value = static_cast<int8_t>(constant_border_value.get<int8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
-
- auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
- const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
- const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
- const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
-
- const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- qasymm8_signed_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- qasymm8_signed_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // ENABLE_SVE
\ No newline at end of file
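The only real difference between the two border branches above is how out-of-range taps are produced: CONSTANT substitutes the user-supplied border value after a per-tap range test, while REPLICATE clamps the coordinates so edge pixels repeat outward. A scalar sketch of the clamped sampling, using byte strides as the kernel does (names are illustrative):

    #include <algorithm>
    #include <cstdint>

    // REPLICATE border handling: clamp each tap coordinate into the valid
    // range, so reads never leave the tensor and edge pixels are repeated.
    int8_t sample_replicate(const int8_t *base, int stride_w, int stride_h,
                            int w, int h, int dim_w, int dim_h)
    {
        const int cw = std::min(std::max(w, 0), dim_w - 1);
        const int ch = std::min(std::max(h, 0), dim_h - 1);
        return *(base + cw * stride_w + ch * stride_h);
    }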
diff --git a/src/core/cpu/kernels/softmax/impl/neon/list.h b/src/core/cpu/kernels/softmax/impl/neon/list.h
deleted file mode 100644
index 5ebee31272..0000000000
--- a/src/core/cpu/kernels/softmax/impl/neon/list.h
+++ /dev/null
@@ -1,388 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
-#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
-
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "support/SaturateCast.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename T>
-void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- constexpr int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(in, win);
- Iterator output(out, win);
-
- const int sum_stages = log2(window_step_x / 2);
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output.ptr());
-
- // Init max value
- auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto current_value = wrapper::vloadq(in_ptr + x);
- vec_max = wrapper::vmax(vec_max, current_value);
- }
- auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
-
- for(int i = 0; i < sum_stages; ++i)
- {
- carry_max = wrapper::vpmax(carry_max, carry_max);
- }
- T max_val = wrapper::vgetlane(carry_max, 0);
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
- }
-
- *out_ptr = max_val;
- },
- input, output);
-}
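The rolling vpmax above folds adjacent lanes pairwise, halving the surviving candidates per stage, which is why sum_stages is log2(window_step_x / 2) once the high and low halves have been combined. For a 4-lane float vector the whole reduction is (a minimal sketch):

    #include <arm_neon.h>

    // Horizontal max of a float32x4_t via pairwise folds: 4 -> 2 -> 1.
    float reduce_max_f32x4(float32x4_t v)
    {
        float32x2_t m = vpmax_f32(vget_high_f32(v), vget_low_f32(v));
        m             = vpmax_f32(m, m);
        return vget_lane_f32(m, 0);
    }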
-
-template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window)
-{
- static_assert(std::is_same<T, qasymm8_t>::value
- || std::is_same<T, qasymm8_signed_t>::value,
- "quantized type should be either qasymm8_t or qasymm8_signed_t.");
-
- const int start_x = in->info()->valid_region().anchor.x();
- const int input_width = in->info()->valid_region().shape.x();
-
- const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;
- const auto scale_beta_vec = vdupq_n_f32(scale_beta);
-
- Iterator in_it(in, window);
- Iterator max_it(max, window);
- Iterator out_it(out, window);
- constexpr int vec_size = 16;
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<float *>(tmp);
-
- float sum{};
- float sum_inversed{};
-
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
-
- /* Init sum to zero */
- float32x4x4_t vec_sum =
- {
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- };
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vqsub(vec_max, vec_elements);
- auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
-
- if(is_log)
- {
- vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
- vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
- vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
- vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
- vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
- vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
- vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
- vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
- }
- else
- {
- vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
- vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
- vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
- vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
- vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
- vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
- vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
- vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
- }
-
- vst4q_f32(tmp_ptr + x, vec_elements_flt);
- }
-
- /* Reduce sum */
- const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
- auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
- sum_res = vpadd_f32(sum_res, sum_res);
- sum = wrapper::vgetlane(sum_res, 0);
-
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- float element{};
- if(is_log)
- {
- element = (max_val - in_ptr[x]) * scale_beta;
- sum += std::exp(element);
- }
- else
- {
- element = std::exp((max_val - in_ptr[x]) * scale_beta);
- sum += element;
- }
-
- tmp_ptr[x] = element;
- }
-
- if(!is_log)
- {
- sum_inversed = 256.f / sum;
- }
- else
- {
- sum = std::log(sum);
- }
- }
-
- /* Normalize exponentials */
- {
- constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
- /* Loop over row and compute softmax */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
- float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
- int_vec_type normalized_value{};
- if(is_log)
- {
- const float32x4x4_t sub =
- {
- vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
- };
- normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
- }
- else
- {
- float32x4x4_t mul =
- {
- vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
- };
-
- if(is_qasymm8_signed)
- {
- const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
- mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
- mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
- mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
- mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
- }
-
- normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
- }
- wrapper::vstore(out_ptr + x, normalized_value);
- }
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- if(is_log)
- {
- out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
- }
- else
- {
- out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0));
- }
- }
- }
- },
- in_it, max_it, out_it);
-}
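A detail worth noting in the routine above: scale_beta folds the dequantization scale into the exponent. Under an affine scheme the zero-points cancel, so beta * (x - x_max) equals (q_max - q) * (-beta * scale), and only the non-negative difference q_max - q is ever formed (hence the saturating vqsub). A scalar sketch of the same term:

    #include <cmath>
    #include <cstdint>

    // One softmax exponent for a quantized input: the row max q_max is the
    // largest code in the row, so q_max - q >= 0 and scale_beta <= 0.
    float exp_term(uint8_t q, uint8_t q_max, float scale, float beta)
    {
        const float scale_beta = -beta * scale;
        return std::exp(static_cast<float>(q_max - q) * scale_beta);
    }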
-
-template <typename T>
-void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
-{
- const int start_x = in->info()->valid_region().anchor.x();
- const int input_width = in->info()->valid_region().shape.x();
-
- Iterator in_it(in, window);
- Iterator max_it(max, window);
- Iterator out_it(out, window);
-
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- constexpr int vec_size = 16 / sizeof(T);
- const int sum_stages = log2(vec_size / 2);
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<T *>(tmp);
-
- T sum{};
- T sum_inversed{};
-
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
-
- /* Init sum to zero */
- auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vsub(vec_elements, vec_max);
- if(is_log)
- {
- vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
- vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
- }
- else
- {
- vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
- vec_sum = wrapper::vadd(vec_sum, vec_elements);
- }
- wrapper::vstore(tmp_ptr + x, vec_elements);
- }
-
- /* Reduce sum */
- auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
- for(int i = 0; i < sum_stages; ++i)
- {
- sum_res = wrapper::vpadd(sum_res, sum_res);
- }
- sum = wrapper::vgetlane(sum_res, 0);
-
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- T element{};
-
- if(is_log)
- {
- element = (in_ptr[x] - max_val) * beta;
- sum += std::exp(element);
- }
- else
- {
- element = std::exp((in_ptr[x] - max_val) * beta);
- sum += element;
- }
- tmp_ptr[x] = element;
- }
-
- if(!is_log)
- {
- sum_inversed = T(1) / sum;
- }
- else
- {
- sum = static_cast<T>(std::log(sum));
- }
- }
-
- /* Normalize exponentials */
- {
- /* Loop over row and compute softmax */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_in = wrapper::vloadq(tmp_ptr + x);
- auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
- if(is_log)
- {
- normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
- }
- else
- {
- normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
- }
- wrapper::vstore(out_ptr + x, normalized_value);
- }
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- if(is_log)
- {
- out_ptr[x] = tmp_ptr[x] - sum;
- }
- else
- {
- out_ptr[x] = tmp_ptr[x] * sum_inversed;
- }
- }
- }
- },
- in_it, max_it, out_it);
-}
-
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */
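Both templates above share a three-phase shape: find the row max, exponentiate the shifted inputs while accumulating their sum, then normalize (or, for log-softmax, subtract log(sum)). A plain scalar reference of the same numerically stable computation, usable as a test oracle for either vector path (a sketch, not library code):

    #include <algorithm>
    #include <cmath>

    // Subtracting the row max keeps the largest exponent at exp(0) = 1,
    // avoiding overflow for any beta and input range.
    void softmax_1d_ref(const float *in, float *out, int n, float beta, bool is_log)
    {
        float max_val = in[0];
        for(int i = 1; i < n; ++i)
        {
            max_val = std::max(max_val, in[i]);
        }

        float sum = 0.f;
        for(int i = 0; i < n; ++i)
        {
            out[i] = (in[i] - max_val) * beta; // scaled, shifted logit
            sum += std::exp(out[i]);
        }

        for(int i = 0; i < n; ++i)
        {
            out[i] = is_log ? out[i] - std::log(sum) : std::exp(out[i]) / sum;
        }
    }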
diff --git a/src/core/cpu/kernels/softmax/impl/sve/impl.cpp b/src/core/cpu/kernels/softmax/impl/sve/impl.cpp
deleted file mode 100644
index 4ed5a4fbea..0000000000
--- a/src/core/cpu/kernels/softmax/impl/sve/impl.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType>
-void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
-{
- const auto all_true_pg = wrapper::svptrue<ScalarType>();
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(in, win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
- const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- // Init max value
- auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>());
-
- int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto current_value = svld1(pg, in_ptr + x);
- vec_max = svmax_m(pg, vec_max, current_value);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
-
- auto max_val = svmaxv(all_true_pg, vec_max);
-
- *out_ptr = max_val;
- },
- input, output);
-}
-
-template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
-{
- const int start_x = in->info()->valid_region().anchor.x();
- const int input_width = in->info()->valid_region().shape.x();
-
- Iterator in_it(in, window);
- Iterator max_it(max, window);
- Iterator out_it(out, window);
-
- const auto all_true_pg = wrapper::svptrue<ScalarType>();
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp);
-
- ScalarType sum{ 0 };
-
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
- const auto vec_max = wrapper::svdup_n(max_val);
-
- /* Init sum to zero */
- auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0));
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- do
- {
- auto vec_elements = svld1(pg, in_ptr + x);
- vec_elements = svsub_z(pg, vec_elements, vec_max);
- if(is_log)
- {
- vec_elements = svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta)));
- vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements));
- }
- else
- {
- vec_elements = wrapper::svexp_z(pg, svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta))));
- vec_sum = svadd_m(pg, vec_sum, vec_elements);
- }
- svst1(pg, tmp_ptr + x, vec_elements);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- }
- while(svptest_any(all_true_pg, pg));
-
- /* Reduce sum */
- sum = svaddv(all_true_pg, vec_sum);
-
- if(is_log)
- {
- sum = static_cast<ScalarType>(std::log(sum));
- }
- else
- {
- sum = ScalarType(1) / sum;
- }
- }
-
- /* Normalize exponentials */
- {
- /* Loop over row and compute softmax */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- do
- {
- auto vec_in = svld1(pg, tmp_ptr + x);
- auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0));
- if(is_log)
- {
- normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
- }
- else
- {
- normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
- }
- svst1(pg, out_ptr + x, normalized_value);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- }
- while(svptest_any(all_true_pg, pg));
- }
- },
- in_it, max_it, out_it);
-}
-
-template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
-
-template void sve_softmax_logits_1d_float<float>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
-template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(ENABLE_SVE) */
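The file above keeps its template definitions out of the header and closes the supported-type set with explicit instantiations, so an unlisted type fails at link time instead of silently compiling. The pattern in miniature (hypothetical names):

    #include <cstdint>

    // Header side: declaration only; the definition lives in one .cpp.
    template <typename T>
    void row_max(const T *in, T *out, int n);

    // Source side: definition plus explicit instantiations. The list below
    // is the single place that states which types are supported.
    template <typename T>
    void row_max(const T *in, T *out, int n)
    {
        T m = in[0];
        for(int i = 1; i < n; ++i)
        {
            m = (in[i] > m) ? in[i] : m;
        }
        *out = m;
    }

    template void row_max<float>(const float *, float *, int);
    template void row_max<int16_t>(const int16_t *, int16_t *, int);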
diff --git a/src/core/cpu/kernels/softmax/impl/sve/list.h b/src/core/cpu/kernels/softmax/impl/sve/list.h
deleted file mode 100644
index 7ddb358b8e..0000000000
--- a/src/core/cpu/kernels/softmax/impl/sve/list.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H
-#define SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H
-
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType>
-void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window);
-
-template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
-
-#if defined(__ARM_FEATURE_SVE2)
-template <typename ScalarType>
-void sve_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window)
-{
- const int start_x = in->info()->valid_region().anchor.x();
- const int input_width = in->info()->valid_region().shape.x();
-
- const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;
- const auto scale_beta_vec = svdup_n_f32(scale_beta);
-
- Iterator in_it(in, window);
- Iterator max_it(max, window);
- Iterator out_it(out, window);
- const auto all_true_pg = wrapper::svptrue<ScalarType>();
- using SVEType = typename wrapper::traits::sve_vector<ScalarType>::type;
-
- const int inc_1 = static_cast<int>(svcntw());
- const int inc_2 = static_cast<int>(2 * svcntw());
- const int inc_3 = static_cast<int>(3 * svcntw());
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<float *>(tmp);
-
- float sum{};
-
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
- const auto vec_max = wrapper::svdup_n(max_val);
-
- /* Init sum to zero */
- auto vec_sum_0 = svdup_n_f32(0.f);
- auto vec_sum_1 = svdup_n_f32(0.f);
- auto vec_sum_2 = svdup_n_f32(0.f);
- auto vec_sum_3 = svdup_n_f32(0.f);
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- svbool_t pg_0 = svunpklo(svunpklo(pg));
- svbool_t pg_1 = svunpkhi(svunpklo(pg));
- svbool_t pg_2 = svunpklo(svunpkhi(pg));
- svbool_t pg_3 = svunpkhi(svunpkhi(pg));
- do
- {
- auto vec_elements = svld1(pg, in_ptr + x);
- vec_elements = svsub_z(pg, vec_max, vec_elements);
-
- auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements)));
- auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements)));
- auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements)));
- auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements)));
-
- if(is_log)
- {
- vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec);
- vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec);
- vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec);
- vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec);
- vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0));
- vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1));
- vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2));
- vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3));
- }
- else
- {
- vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec));
- vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec));
- vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec));
- vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec));
- vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0);
- vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1);
- vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2);
- vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3);
- }
-
- svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0);
- svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1);
- svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2);
- svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- pg_0 = svunpklo(svunpklo(pg));
- pg_1 = svunpkhi(svunpklo(pg));
- pg_2 = svunpklo(svunpkhi(pg));
- pg_3 = svunpkhi(svunpkhi(pg));
- }
- while(svptest_any(all_true_pg, pg));
-
- /* Reduce sum */
- const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3));
- sum = svaddv_f32(all_true_pg, vec_sum);
-
-            /* Finalize the sum: log(sum) for log-softmax, 256/sum otherwise */
- if(is_log)
- {
- sum = std::log(sum);
- }
- else
- {
- sum = 256.f / sum;
- }
- }
-
- /* Normalize exponentials */
- {
- constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value;
- /* Loop over row and compute softmax */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- svbool_t pg_0 = svunpklo(svunpklo(pg));
- svbool_t pg_1 = svunpkhi(svunpklo(pg));
- svbool_t pg_2 = svunpklo(svunpkhi(pg));
- svbool_t pg_3 = svunpkhi(svunpkhi(pg));
- do
- {
- auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x);
- auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1);
- auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2);
- auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3);
-
- svfloat32_t res_0{};
- svfloat32_t res_1{};
- svfloat32_t res_2{};
- svfloat32_t res_3{};
-
- if(is_log)
- {
- res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
- res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
- res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
- res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
- }
- else
- {
- res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
- res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
- res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
- res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
-
- if(is_qasymm8_signed)
- {
- const auto offset_vec = svdup_n_f32(128.f);
-                        res_0 = svsub_z(pg_0, res_0, offset_vec);
-                        res_1 = svsub_z(pg_1, res_1, offset_vec);
-                        res_2 = svsub_z(pg_2, res_2, offset_vec);
-                        res_3 = svsub_z(pg_3, res_3, offset_vec);
- }
- }
-
- // Store value
- const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3);
- svst1(pg, out_ptr + x, out);
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- pg_0 = svunpklo(svunpklo(pg));
- pg_1 = svunpkhi(svunpklo(pg));
- pg_2 = svunpklo(svunpkhi(pg));
- pg_3 = svunpkhi(svunpkhi(pg));
- }
- while(svptest_any(all_true_pg, pg));
- }
- },
- in_it, max_it, out_it);
-}
-#endif /* defined(__ARM_FEATURE_SVE2) */
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(ENABLE_SVE) */
-
-#endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H */
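The svunpklo/svunpkhi chains above implement predicate widening: one byte-granularity whilelt mask is split into four word-granularity masks, so a single pass over 8-bit data can govern four f32 vectors. A standalone sketch of the same idiom (assuming SVE; the unsigned-to-float direction is illustrative):

    #include <arm_sve.h>
    #include <cstdint>

    // Widen n bytes to floats: pg covers byte lanes, pg0..pg3 each cover a
    // quarter of them at word granularity after two rounds of unpacking.
    void bytes_to_floats(const uint8_t *src, float *dst, int n)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b8(x, n);
        do
        {
            const svbool_t pg0 = svunpklo(svunpklo(pg));
            const svbool_t pg1 = svunpkhi(svunpklo(pg));
            const svbool_t pg2 = svunpklo(svunpkhi(pg));
            const svbool_t pg3 = svunpkhi(svunpkhi(pg));

            const svuint8_t  v  = svld1_u8(pg, src + x);
            const svuint16_t lo = svunpklo(v);
            const svuint16_t hi = svunpkhi(v);

            svst1_f32(pg0, dst + x,                svcvt_f32_u32_z(pg0, svunpklo(lo)));
            svst1_f32(pg1, dst + x + svcntw(),     svcvt_f32_u32_z(pg1, svunpkhi(lo)));
            svst1_f32(pg2, dst + x + 2 * svcntw(), svcvt_f32_u32_z(pg2, svunpklo(hi)));
            svst1_f32(pg3, dst + x + 3 * svcntw(), svcvt_f32_u32_z(pg3, svunpkhi(hi)));

            x += static_cast<int>(svcntb());
            pg = svwhilelt_b8(x, n);
        } while(svptest_any(svptrue_b8(), pg));
    }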
diff --git a/src/core/cpu/kernels/sub/neon/integer.cpp b/src/core/cpu/kernels/sub/neon/integer.cpp
deleted file mode 100644
index bba73df1e8..0000000000
--- a/src/core/cpu/kernels/sub/neon/integer.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-void sub_s16_u8_s16_impl(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_swapped)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- if(policy == ConvertPolicy::WRAP)
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = wrapper::vloadq(input1_ptr + x);
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- const auto res = is_swapped ? wrapper::vsub(vin2, vin1) : wrapper::vsub(vin1, vin2);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto res = is_swapped ? static_cast<int16_t>(*(input2_ptr + x)) - *(input1_ptr + x) : *(input1_ptr + x) - static_cast<int16_t>(*(input2_ptr + x));
- *(output_ptr + x) = res;
- }
- }
- else
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = wrapper::vloadq(input1_ptr + x);
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- const auto res = is_swapped ? wrapper::vqsub(vin2, vin1) : wrapper::vqsub(vin1, vin2);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto res = is_swapped ? wrapper::sub_sat(static_cast<int16_t>(*(input2_ptr + x)), *(input1_ptr + x)) : wrapper::sub_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
- *(output_ptr + x) = res;
- }
- }
- },
- input1, input2, output);
-}
-} // namespace
-
-void sub_s16_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    sub_s16_u8_s16_impl(src0, src1, dst, policy, window, false);
-}
-
-void sub_u8_s16_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Swap arguments
- sub_s16_u8_s16_impl(src1, src0, dst, policy, window, true);
-}
-
-void sub_u8_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- if(policy == ConvertPolicy::WRAP)
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- wrapper::vstore(output_ptr + x, wrapper::vsub(vin1, vin2));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) - static_cast<int16_t>(*(input2_ptr + x));
- }
- }
- else
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- wrapper::vstore(output_ptr + x, wrapper::vqsub(vin1, vin2));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = wrapper::sub_sat(static_cast<int16_t>(*(input1_ptr + x)),
- static_cast<int16_t>(*(input2_ptr + x)));
- }
- }
- },
- input1, input2, output);
-}
-
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
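The mixed-type kernels deleted above all rely on one widening trick: the u8 operand is widened with vmovl and reinterpreted as s16 (its values fit in [0, 255]), so a plain 16-bit subtract, wrapping or saturating per ConvertPolicy, applies. A self-contained sketch of one 8-lane step, assuming NEON headers are available (function name hypothetical):

    #include <arm_neon.h>
    #include <cstdint>

    // One 8-lane step of s16 - u8 -> s16, mirroring the WRAP/SATURATE split above.
    void sub_s16_u8_step(const int16_t *in1, const uint8_t *in2, int16_t *out, bool saturate)
    {
        const int16x8_t vin1 = vld1q_s16(in1);
        // Widen u8 -> u16, then reinterpret as s16.
        const int16x8_t vin2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(in2)));
        vst1q_s16(out, saturate ? vqsubq_s16(vin1, vin2) : vsubq_s16(vin1, vin2));
    }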
diff --git a/src/core/cpu/kernels/sub/neon/list.h b/src/core/cpu/kernels/sub/neon/list.h
deleted file mode 100644
index 1ab4e6367b..0000000000
--- a/src/core/cpu/kernels/sub/neon/list.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_SUB_LIST_H
-#define SRC_CORE_NEON_KERNELS_SUB_LIST_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_SUB_KERNEL(func_name) \
- void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-
-DECLARE_SUB_KERNEL(sub_qasymm8_neon);
-DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon);
-DECLARE_SUB_KERNEL(sub_qsymm16_neon);
-DECLARE_SUB_KERNEL(sub_s16_u8_s16_neon);
-DECLARE_SUB_KERNEL(sub_u8_s16_s16_neon);
-DECLARE_SUB_KERNEL(sub_u8_u8_s16_neon);
-
-#undef DECLARE_SUB_KERNEL
-
-template <typename T>
-void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- bool is_sat = policy == ConvertPolicy::SATURATE;
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- constexpr int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()));
- Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
- Iterator output(dst, window);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
- auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
- if(is_broadcast_input_2)
- {
- res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
- }
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- auto res = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
- if(is_broadcast_input_2)
- {
- res = static_cast<T>(-1) * res;
- }
-
- *(output_ptr + x) = res;
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto val1 = wrapper::vloadq(input1_ptr + x);
- const auto val2 = wrapper::vloadq(input2_ptr + x);
- const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto val1 = *(input1_ptr + x);
- const auto val2 = *(input2_ptr + x);
- *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_SUB_LIST_H
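The broadcast branch deleted above always evaluates broadcast - non_broadcast and then multiplies by -1 when src1 was the broadcast side, so dst = src0 - src1 holds either way. A scalar sketch of that identity (function name hypothetical):

    // Sketch of the sign fix used in the broadcast branch.
    template <typename T>
    T sub_with_broadcast(T broadcast_value, T non_broadcast_value, bool is_broadcast_input_2)
    {
        T res = broadcast_value - non_broadcast_value;
        if(is_broadcast_input_2)
        {
            res = static_cast<T>(-1) * res; // wanted non_broadcast - broadcast instead
        }
        return res;
    }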
diff --git a/src/core/cpu/kernels/sub/neon/qasymm8.cpp b/src/core/cpu/kernels/sub/neon/qasymm8.cpp
deleted file mode 100644
index 8f4cd8bdbb..0000000000
--- a/src/core/cpu/kernels/sub/neon/qasymm8.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
- const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
- const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
- const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
- const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
- const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- const auto broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(static_cast<uint8_t>(broadcast_value), wrapper::traits::vector_128_tag{});
-
- const float32x4x4_t bf =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
- }
- };
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(non_broadcast_input_ptr + x);
-
- const float32x4x4_t af =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- }
- };
-
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#endif //__aarch64__
- }
- };
-
- const auto pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
- const auto pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
- wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
- const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qasymm8(is_broadcast_input_2 ? afs - bfs : bfs - afs, dst->info()->quantization_info());
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
- const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
- const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
- const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
-
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
-
- const float32x4x4_t af =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- }
- };
-
- const float32x4x4_t bf =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
- }
- };
-
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#endif //__aarch64__
- }
- };
-
- const auto pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
- const auto pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
- wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
-
- *(output_ptr + x) = quantize_qasymm8((afs - bfs), dst->info()->quantization_info());
- }
- },
- input1, input2, output);
- }
-}
-
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
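The left-over loop above spells out the arithmetic the vector path performs sixteen lanes at a time: dequantize both operands, subtract in float, then requantize with the output scale and offset. A scalar sketch assuming uniform asymmetric quantization, real = scale * (q - offset) (names hypothetical):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar sketch of QASYMM8 subtraction: dequantize, subtract, requantize.
    uint8_t sub_qasymm8_scalar(uint8_t a, uint8_t b,
                               float scale_a, int offset_a,
                               float scale_b, int offset_b,
                               float scale_o, int offset_o)
    {
        const float af = (static_cast<int32_t>(a) - offset_a) * scale_a;
        const float bf = (static_cast<int32_t>(b) - offset_b) * scale_b;
        const int   q  = static_cast<int>(std::lround((af - bf) / scale_o)) + offset_o;
        return static_cast<uint8_t>(std::clamp(q, 0, 255)); // saturate to u8
    }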
diff --git a/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp
deleted file mode 100644
index 2c9e411743..0000000000
--- a/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
- const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
- const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
- const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
- const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
- const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(static_cast<int8_t>(broadcast_value), wrapper::traits::vector_128_tag{});
-
- const float32x4x4_t bf =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
- }
- };
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(non_broadcast_input_ptr + x);
-
- const float32x4x4_t af =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- }
- };
-
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#endif //__aarch64__
- }
- };
-
- const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
- const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
- wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
- const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qasymm8_signed(is_broadcast_input_2 ? afs - bfs : bfs - afs, dst->info()->quantization_info());
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
- const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
- const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
- const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
-
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
-
- const float32x4x4_t af =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- }
- };
-
- const float32x4x4_t bf =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
- }
- };
-
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#endif //__aarch64__
- }
- };
-
- const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
- const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
- wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
-
- *(output_ptr + x) = quantize_qasymm8_signed((afs - bfs), dst->info()->quantization_info());
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
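One detail worth noting in the #ifdef blocks above: vcvtnq_s32_f32 (the AArch64 path) rounds to nearest with ties to even, while the generic vcvtq_s32_f32 truncates toward zero, so requantized results can differ by one LSB between the two builds. A scalar analogue, assuming the default FE_TONEAREST rounding mode:

    #include <cmath>
    #include <cstdint>

    // Scalar analogues of the two conversion paths above.
    int32_t convert_like_vcvtnq(float x) { return static_cast<int32_t>(std::nearbyint(x)); } // nearest, ties to even
    int32_t convert_like_vcvtq(float x)  { return static_cast<int32_t>(x); }                 // truncation toward zero

    // e.g. x = 2.7f: nearest gives 3, truncation gives 2.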
diff --git a/src/core/cpu/kernels/sub/neon/qsymm16.cpp b/src/core/cpu/kernels/sub/neon/qsymm16.cpp
deleted file mode 100644
index 4dfdc0e78c..0000000000
--- a/src/core/cpu/kernels/sub/neon/qsymm16.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
- const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
- const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
-
- const float32x4x2_t bf =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
- }
- };
- const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
- const float32x4x2_t af =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
- }
- };
-
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
-                    vcvtnq_s32_f32(vmulq_f32(!is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
-                    vcvtnq_s32_f32(vmulq_f32(!is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
-#else //__aarch64__
-                    vcvtq_s32_f32(vmulq_f32(!is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
-                    vcvtq_s32_f32(vmulq_f32(!is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
-#endif //__aarch64__
- }
- };
-
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
- vst1q_s16(output_ptr + x, pa);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
-                *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (afs - bfs) : (bfs - afs), oq_info);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8_t a = vld1q_s16(input1_ptr + x);
- const int16x8_t b = vld1q_s16(input2_ptr + x);
-
- const float32x4x2_t af =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
- }
- };
-
- const float32x4x2_t bf =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
- }
- };
-
- const int32x4x2_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
-#else //__aarch64__
- vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
-#endif //__aarch64__
- }
- };
-
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
- vst1q_s16(output_ptr + x, pa);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
- *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info());
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
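QSYMM16 is a symmetric scheme, so the round trip above has no zero-point terms: real = q * scale in both directions. A scalar sketch mirroring the left-over loop (names hypothetical):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar sketch of QSYMM16 subtraction: symmetric quantization has no offset.
    int16_t sub_qsymm16_scalar(int16_t a, int16_t b,
                               float scale_a, float scale_b, float scale_o)
    {
        const float diff = a * scale_a - b * scale_b;
        const long  q    = std::lround(diff / scale_o);
        return static_cast<int16_t>(std::clamp<long>(q, -32768, 32767)); // saturate to s16
    }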
diff --git a/src/core/gpu/cl/ClCompileContext.h b/src/core/gpu/cl/ClCompileContext.h
deleted file mode 100644
index e69cc0200f..0000000000
--- a/src/core/gpu/cl/ClCompileContext.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_COMPILE_CONTEXT_H
-#define ARM_COMPUTE_CL_COMPILE_CONTEXT_H
-
-#include "arm_compute/core/CL/CLCompileContext.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-using ClCompileContext = arm_compute::CLCompileContext;
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_COMPILE_CONTEXT_H */
diff --git a/src/core/gpu/cl/ClKernelLibrary.cpp b/src/core/gpu/cl/ClKernelLibrary.cpp
deleted file mode 100644
index 9d516e54a7..0000000000
--- a/src/core/gpu/cl/ClKernelLibrary.cpp
+++ /dev/null
@@ -1,943 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/ClKernelLibrary.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Utils.h"
-
-#include <algorithm>
-#include <array>
-#include <fstream>
-#include <utility>
-
-#ifdef ARM_COMPUTE_COMPRESSED_KERNELS
-#include <zlib.h>
-
-namespace
-{
-/* Decoding table */
-constexpr std::array<uint8_t, 256> b64_invtab =
-{
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63,
- 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0,
- 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0,
- 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
- 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-/** Decode a base64 encoded string
- *
- * @param[in] str Base64 encoded string to decode
- *
- * @return The decoded string for a valid, non-empty input; otherwise an empty string
- */
-std::string decode_base64(const std::string &str)
-{
- constexpr const char pad_char = '=';
-
- // Handle empty string
- if(str.empty())
- {
- return {};
- }
-
- // Base64 encoded string has size multiple of 4
- if(str.length() % 4)
- {
- return {};
- }
-
-    // Check encoded string padding
- std::size_t padding = (str.rbegin()[0] == pad_char) + (str.rbegin()[1] == pad_char);
- const int str_len = str.size();
-
- // Reserve memory for the decoded string
-    // Note: each group of four 6-bit symbols decodes to three bytes
- std::string dec_b64;
- dec_b64.reserve(((str_len / 4) * 3));
-
- // Block decoding function (exclude padding)
- int c = 0;
- const int end = str_len - 4 - padding;
- for(; c <= end; c += 4)
- {
- const int byte0 = b64_invtab[str[c]];
- const int byte1 = b64_invtab[str[c + 1]];
- const int byte2 = b64_invtab[str[c + 2]];
- const int byte3 = b64_invtab[str[c + 3]];
-
- dec_b64.push_back((byte0 << 2) | (byte1 >> 4));
- dec_b64.push_back((byte1 << 4) | (byte2 >> 2));
- dec_b64.push_back((byte2 << 6) | (byte3));
- }
-
- // Last step that might contain padding symbols
- if(padding == 1)
- {
- const int byte0 = b64_invtab[str[c]];
- const int byte1 = b64_invtab[str[c + 1]];
- const int byte2 = b64_invtab[str[c + 2]];
-
- dec_b64.push_back((byte0 << 2) | (byte1 >> 4));
- dec_b64.push_back((byte1 << 4) | (byte2 >> 2));
- }
- else if(padding == 2)
- {
- const int byte0 = b64_invtab[str[c]];
- const int byte1 = b64_invtab[str[c + 1]];
-
- dec_b64.push_back((byte0 << 2) | (byte1 >> 4));
- }
-
- return dec_b64;
-}
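Each iteration of the block loop above packs four 6-bit symbols into three bytes; push_back truncates each shifted value to 8 bits. A worked example:

    // "TWFu" -> indices {19, 22, 5, 46} -> "Man"
    //   byte0 = (19 << 2) | (22 >> 4) = 0x4D = 'M'
    //   byte1 = (22 << 4) | ( 5 >> 2) = 0x61 = 'a'  (truncated to 8 bits)
    //   byte2 = ( 5 << 6) | 46        = 0x6E = 'n'  (truncated to 8 bits)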
-
-/** Decompress a zlib compressed string
- *
- * @param[in] str ZLib compressed string
- *
- * @return The decompressed string if successful, otherwise an empty string.
- */
-std::string decompress_zlib(const std::string &str)
-{
- // Create and initialize decompression stream
- z_stream ds{};
- if(inflateInit(&ds) != Z_OK)
- {
- return std::string();
- }
- ds.avail_in = str.size();
- ds.next_in = (Bytef *)str.data();
-
- // Roll-over the string using a buffer and decompress
- int status = Z_OK;
- char roll_buff[16384];
- std::string inflated_str;
- do
- {
- ds.avail_out = sizeof(roll_buff);
- ds.next_out = reinterpret_cast<Bytef *>(roll_buff);
-
- status = inflate(&ds, 0);
- if(inflated_str.size() < ds.total_out)
- {
- inflated_str.append(roll_buff, ds.total_out - inflated_str.size());
- }
- }
- while(status == Z_OK);
-
- // Finalize decompression stream
- inflateEnd(&ds);
- if(status != Z_STREAM_END)
- {
- return std::string();
- }
-
- return inflated_str;
-}
-} // namespace
-#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */
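Taken together, the two helpers above reverse how compressed kernels are embedded: the source is zlib-deflated, then base64-encoded into the binary. A hedged usage sketch, with decode_base64/decompress_zlib as defined above and the wrapper name hypothetical:

    // Sketch: recover an embedded, compressed OpenCL program.
    std::string load_embedded_program(const std::string &b64_payload)
    {
        const std::string compressed = decode_base64(b64_payload); // undo base64
        if(compressed.empty())
        {
            return {}; // invalid encoding
        }
        return decompress_zlib(compressed);                        // undo zlib deflate
    }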
-
-namespace arm_compute
-{
-namespace opencl
-{
-const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
-{
- { "activation_layer", "activation_layer.cl" },
- { "activation_layer_quant", "activation_layer_quant.cl" },
- { "activation_layer_quant_f32", "activation_layer_quant.cl" },
- { "arg_min_max_x", "arg_min_max.cl" },
- { "arg_min_max_y", "arg_min_max.cl" },
- { "arg_min_max_z", "arg_min_max.cl" },
- { "arg_min_max_w", "arg_min_max.cl" },
- { "batch_to_space_nchw", "batch_to_space.cl" },
- { "batch_to_space_static_nchw", "batch_to_space.cl" },
- { "batch_to_space_nhwc", "batch_to_space.cl" },
- { "batch_to_space_static_nhwc", "batch_to_space.cl" },
- { "batchnormalization_layer_nchw", "batchnormalization_layer.cl" },
- { "batchnormalization_layer_nhwc", "batchnormalization_layer.cl" },
- { "bitwise_or", "bitwise_op.cl" },
- { "bitwise_and", "bitwise_op.cl" },
- { "bitwise_xor", "bitwise_op.cl" },
- { "bitwise_not", "bitwise_op.cl" },
- { "bounding_box_transform", "bounding_box_transform.cl" },
- { "bounding_box_transform_quantized", "bounding_box_transform_quantized.cl" },
- { "channel_shuffle_nchw", "channel_shuffle.cl" },
- { "channel_shuffle_nhwc", "channel_shuffle.cl" },
- { "compare_equal", "comparisons.cl" },
- { "compare_equal_quantized", "comparisons.cl" },
- { "compare_notequal", "comparisons.cl" },
- { "compare_notequal_quantized", "comparisons.cl" },
- { "compare_greater", "comparisons.cl" },
- { "compare_greater_quantized", "comparisons.cl" },
- { "compare_greaterequal", "comparisons.cl" },
- { "compare_greaterequal_quantized", "comparisons.cl" },
- { "compare_less", "comparisons.cl" },
- { "compare_less_quantized", "comparisons.cl" },
- { "compare_lessequal", "comparisons.cl" },
- { "compare_lessequal_quantized", "comparisons.cl" },
- { "concatenate", "concatenate.cl" },
- { "concatenate_width", "concatenate.cl" },
- { "concatenate_height", "concatenate.cl" },
- { "concatenate_width_x2", "concatenate.cl" },
- { "concatenate_width_x4", "concatenate.cl" },
- { "col2im", "col2im.cl" },
- { "cast_down", "cast.cl" },
- { "cast_up", "cast.cl" },
- { "convert_fc_weights", "convert_fc_weights.cl" },
- { "copy_tensor", "copy_tensor.cl" },
- { "crop_tensor", "crop_tensor.cl" },
- { "deconvolution_reshape", "deconvolution_layer.cl" },
- { "deconvolution_upsample", "deconvolution_layer.cl" },
- { "depthwise_convolution_3x3", "depthwise_convolution.cl" },
- { "depthwise_convolution_3x3_f16", "depthwise_convolution.cl" },
- { "depthwise_convolution_3x3_nhwc", "depthwise_convolution.cl" },
- { "depthwise_convolution_3x3_nhwc_stride1", "depthwise_convolution.cl" },
- { "dwc_MxN_native_fp_nhwc", "depthwise_convolution.cl" },
- { "dwc_MxN_native_quantized8_nhwc", "depthwise_convolution_quantized.cl" },
- { "dwc_3x3_native_quantized8_nchw", "depthwise_convolution_quantized.cl" },
- { "dwc_3x3_native_quantized8_dot8_nchw", "depthwise_convolution_quantized.cl" },
- { "depth_to_space_nchw", "depth_to_space.cl" },
- { "depth_to_space_nhwc", "depth_to_space.cl" },
- { "depthwise_convolution_3x3_stridex1_stridey1_f16", "depthwise_convolution.cl" },
- { "depthwise_convolution_3x3_stridex2_stridey2_f16", "depthwise_convolution.cl" },
- { "depthwise_convolution_3x3_stridex1_stridey1_f32", "depthwise_convolution.cl" },
- { "depthwise_convolution_3x3_stridex2_stridey2_f32", "depthwise_convolution.cl" },
- { "dequantization_layer", "dequantization_layer.cl" },
- { "dequantization_layer_per_channel_nhwc", "dequantization_layer.cl" },
- { "dequantization_layer_per_channel_nchw", "dequantization_layer.cl" },
- { "direct_convolution_nhwc", "direct_convolution.cl" },
- { "direct_convolution1x1", "direct_convolution1x1.cl" },
- { "direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl" },
- { "direct_convolution3x3", "direct_convolution3x3.cl" },
- { "direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl" },
- { "direct_convolution5x5", "direct_convolution5x5.cl" },
- { "direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl" },
- { "direct_convolution_quantized", "direct_convolution_quantized.cl" },
- { "elementwise_operation_ADD", "elementwise_operation.cl" },
- { "elementwise_operation_SUB", "elementwise_operation.cl" },
- { "elementwise_operation_MAX", "elementwise_operation.cl" },
- { "elementwise_operation_MIN", "elementwise_operation.cl" },
- { "elementwise_operation_DIV", "elementwise_operation.cl" },
- { "elementwise_operation_SQUARED_DIFF", "elementwise_operation.cl" },
- { "elementwise_operation_POWER", "elementwise_operation.cl" },
- { "elementwise_operation_PRELU", "elementwise_operation.cl" },
- { "elementwise_operation_AND", "elementwise_operation.cl" },
- { "elementwise_operation_OR", "elementwise_operation.cl" },
- { "elementwise_operation_ADD_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_operation_SUB_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_operation_MAX_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_operation_MIN_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_operation_DIV_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_operation_SQUARED_DIFF_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_operation_PRELU_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_unary", "elementwise_unary.cl" },
- { "fft_digit_reverse_axis_0", "fft_digit_reverse.cl" },
- { "fft_digit_reverse_axis_1", "fft_digit_reverse.cl" },
- { "fft_radix_2_first_stage_axis_0", "fft.cl" },
- { "fft_radix_2_first_stage_axis_1", "fft.cl" },
- { "fft_radix_2_axis_0", "fft.cl" },
- { "fft_radix_2_axis_1", "fft.cl" },
- { "fft_radix_3_first_stage_axis_0", "fft.cl" },
- { "fft_radix_3_first_stage_axis_1", "fft.cl" },
- { "fft_radix_3_axis_0", "fft.cl" },
- { "fft_radix_3_axis_1", "fft.cl" },
- { "fft_radix_4_first_stage_axis_0", "fft.cl" },
- { "fft_radix_4_first_stage_axis_1", "fft.cl" },
- { "fft_radix_4_axis_0", "fft.cl" },
- { "fft_radix_4_axis_1", "fft.cl" },
- { "fft_radix_5_first_stage_axis_0", "fft.cl" },
- { "fft_radix_5_first_stage_axis_1", "fft.cl" },
- { "fft_radix_5_axis_0", "fft.cl" },
- { "fft_radix_5_axis_1", "fft.cl" },
- { "fft_radix_7_first_stage_axis_0", "fft.cl" },
- { "fft_radix_7_first_stage_axis_1", "fft.cl" },
- { "fft_radix_7_axis_0", "fft.cl" },
- { "fft_radix_7_axis_1", "fft.cl" },
- { "fft_radix_8_first_stage_axis_0", "fft.cl" },
- { "fft_radix_8_first_stage_axis_1", "fft.cl" },
- { "fft_radix_8_axis_0", "fft.cl" },
- { "fft_radix_8_axis_1", "fft.cl" },
- { "fft_scale_conj", "fft_scale.cl" },
- { "fill_image_borders_constant", "fill_border.cl" },
- { "fill_image_borders_replicate", "fill_border.cl" },
- { "floor_layer", "floor.cl" },
- { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
- { "gather", "gather.cl" },
- { "gemm_ma_f16", "gemm.cl" },
- { "gemm_ma_f32", "gemm.cl" },
- { "gemm_mv", "gemv.cl" },
- { "gemm_mv_quantized", "gemv.cl" },
- { "gemm_mm_interleaved_transposed_f16", "gemm_v1.cl" },
- { "gemm_mm_interleaved_transposed_f16_acc32", "gemm_v1.cl" },
- { "gemm_mm_interleaved_transposed_f16_bifrost", "gemm_v1.cl" },
- { "gemm_mm_interleaved_transposed_f32", "gemm_v1.cl" },
- { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm_v1.cl" },
- { "gemm_mm_floating_point", "gemm_v1.cl" },
- { "gemm_mm_floating_point_f16_bifrost", "gemm_v1.cl" },
- { "gemm_mm_floating_point_f16_bifrost_acc32", "gemm_v1.cl" },
- { "gemm_mm_floating_point_f32_bifrost", "gemm_v1.cl" },
- { "gemm_mm_floating_point_f32_bifrost_1000", "gemm_v1.cl" },
- { "gemm_mm_native", "gemm.cl" },
- { "gemm_mm_reshaped_lhs_nt_rhs_t", "gemm.cl" },
- { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "gemm.cl" },
- { "gemm_mm_reshaped_lhs_t_rhs_nt", "gemm.cl" },
- { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_nt", "gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_nt_texture", "gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_t", "gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_t_texture", "gemm.cl" },
- { "gemm_lc_vm_f32", "gemm.cl" },
- { "gemm_reshape_lhs_matrix_nt", "gemm.cl" },
- { "gemm_reshape_lhs_matrix_t", "gemm.cl" },
- { "gemm_reshape_rhs_matrix_nt", "gemm.cl" },
- { "gemm_reshape_rhs_matrix_t", "gemm.cl" },
- { "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
- { "gemmlowp_matrix_a_reduction_dot8", "gemmlowp.cl" },
- { "gemmlowp_matrix_b_reduction", "gemmlowp.cl" },
- { "gemmlowp_mm_native", "gemmlowp.cl" },
- { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "gemmlowp.cl" },
- { "gemmlowp_mm_reshaped_only_rhs_t", "gemmlowp.cl" },
- { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "gemmlowp.cl" },
- { "gemmlowp_offset_contribution", "gemmlowp.cl" },
- { "gemmlowp_offset_contribution_quantize_down", "gemmlowp.cl" },
- { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "gemmlowp.cl" },
- { "gemmlowp_output_stage_quantize_down", "gemmlowp.cl" },
- { "gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl" },
- { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "gemmlowp.cl" },
- { "gemmlowp_output_stage_quantize_down_float", "gemmlowp.cl" },
- { "generate_proposals_compute_all_anchors", "generate_proposals.cl" },
- { "generate_proposals_compute_all_anchors_quantized", "generate_proposals_quantized.cl" },
- { "im2col1x1_stridex1_nchw", "im2col.cl" },
- { "im2col3x3_nchw", "im2col.cl" },
- { "im2col5x5_nchw", "im2col.cl" },
- { "im2col11x11_padx0_pady0_nchw", "im2col.cl" },
- { "im2col_generic_nchw", "im2col.cl" },
- { "im2col_generic_padx0_pady0_nchw", "im2col.cl" },
- { "im2col3x3_nhwc", "im2col.cl" },
- { "im2col9x9_nhwc", "im2col.cl" },
- { "im2col_generic_nhwc", "im2col.cl" },
- { "instance_normalization", "instance_normalization.cl" },
- { "compute_mean_var", "instance_normalization.cl" },
- { "l2_normalize_x", "l2_normalize.cl" },
- { "l2_normalize_y", "l2_normalize.cl" },
- { "l2_normalize_z", "l2_normalize.cl" },
- { "max_unpooling_layer_2", "unpooling_layer.cl" },
- { "mean_stddev_normalization", "mean_stddev_normalization.cl" },
- { "memset", "memset.cl" },
- { "minmax_layer", "minmax_layer.cl" },
- { "non_max_suppression", "nonmax.cl" },
- { "normalization_layer_cross_map_nchw", "normalization_layer.cl" },
- { "normalization_layer_cross_map_nhwc", "normalization_layer.cl" },
- { "normalization_layer_in_map_nchw", "normalization_layer.cl" },
- { "normalization_layer_in_map_nhwc", "normalization_layer.cl" },
- { "normalize_planar_yuv_layer_nchw", "normalize_planar_yuv_layer.cl" },
- { "normalize_planar_yuv_layer_nhwc", "normalize_planar_yuv_layer.cl" },
- { "normalize_planar_yuv_layer_q8_nchw", "normalize_planar_yuv_layer_quantized.cl" },
- { "normalize_planar_yuv_layer_q8_nhwc", "normalize_planar_yuv_layer_quantized.cl" },
- { "pad_layer_constant", "pad_layer.cl" },
- { "pad_layer_symmetric_reflect", "pad_layer.cl" },
- { "permute", "permute.cl" },
- { "pixelwise_mul_complex", "pixelwise_mul_float.cl" },
- { "pixelwise_mul_float", "pixelwise_mul_float.cl" },
- { "pixelwise_mul_int", "pixelwise_mul_int.cl" },
- { "pixelwise_mul_quantized", "pixelwise_mul_int.cl" },
- { "pooling_layer_2", "pooling_layer.cl" },
- { "pooling_layer_3", "pooling_layer.cl" },
- { "pooling_layer_optimized_3", "pooling_layer.cl" },
- { "pooling_layer_7", "pooling_layer.cl" },
- { "pooling_layer_MxN_nchw", "pooling_layer.cl" },
- { "pooling_layer_MxN_nhwc", "pooling_layer.cl" },
- { "pooling_layer_2x2_nhwc", "pooling_layer.cl" },
- { "pooling_layer_2_nchw_indices_fp32", "pooling_layer.cl" },
- { "pooling_layer_2_nchw_indices_fp16", "pooling_layer.cl" },
- { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
- { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
- { "prior_box_layer_nchw", "prior_box_layer.cl" },
- { "qlstm_layer_normalization", "qlstm_layer_normalization.cl" },
- { "quantization_layer", "quantization_layer.cl" },
- { "range", "range.cl" },
- { "range_quantized", "range.cl" },
- { "reduction_operation_x", "reduction_operation.cl" },
- { "reduction_operation_non_parallel_x", "reduction_operation.cl" },
- { "reduction_operation_y", "reduction_operation.cl" },
- { "reduction_operation_z", "reduction_operation.cl" },
- { "reduction_operation_w", "reduction_operation.cl" },
- { "remap_nearest_neighbour_nchw", "remap.cl" },
- { "remap_bilinear_nchw", "remap.cl" },
- { "remap_nearest_neighbour_nhwc", "remap.cl" },
- { "remap_bilinear_nhwc", "remap.cl" },
- { "reorg_layer_nchw", "reorg_layer.cl" },
- { "reorg_layer_nhwc", "reorg_layer.cl" },
- { "reshape_layer", "reshape_layer.cl" },
- { "reshape_to_columns", "convolution_layer.cl" },
- { "reverse", "reverse.cl" },
- { "roi_align_layer", "roi_align_layer.cl" },
- { "roi_align_layer_quantized", "roi_align_layer_quantized.cl" },
- { "roi_pooling_layer", "roi_pooling_layer.cl" },
- { "scale_nearest_neighbour_nchw", "scale.cl" },
- { "scale_nearest_neighbour_nhwc", "scale.cl" },
- { "scale_bilinear_nchw", "scale.cl" },
- { "scale_bilinear_nhwc", "scale.cl" },
- { "scale_bilinear_quantized_nchw", "scale_quantized.cl" },
- { "scale_bilinear_quantized_nhwc", "scale_quantized.cl" },
- { "select_same_rank", "select.cl" },
- { "select_different_rank_2", "select.cl" },
- { "select_different_rank_n", "select.cl" },
- { "softmax_layer_norm", "softmax_layer.cl" },
- { "softmax_layer_norm_quantized", "softmax_layer_quantized.cl" },
- { "softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl" },
- { "softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl" },
- { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" },
- { "space_to_batch_nchw", "space_to_batch.cl" },
- { "space_to_batch_static_nchw", "space_to_batch.cl" },
- { "space_to_batch_nhwc", "space_to_batch.cl" },
- { "space_to_batch_static_nhwc", "space_to_batch.cl" },
- { "space_to_depth_nchw", "space_to_depth.cl" },
- { "space_to_depth_nhwc", "space_to_depth.cl" },
- { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
- { "stack_layer", "stack_layer.cl" },
- { "strided_slice", "slice_ops.cl" },
- { "tile", "tile.cl" },
- { "transpose", "transpose.cl" },
- { "upsample_layer_nchw", "upsample_layer.cl" },
- { "upsample_layer_nhwc", "upsample_layer.cl" },
- { "winograd_filter_transform_2x2_3x3_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_2x1_3x1_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x2_1x3_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x4_3x3_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x1_3x1_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x4_1x3_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x4_5x5_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x1_5x1_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x4_1x5_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x1_3x1_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x4_1x3_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x4_3x3_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x4_5x5_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x1_5x1_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x4_1x5_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_2x2_7x7_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_2x1_7x1_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x2_1x7_nhwc", "winograd_filter_transform.cl" },
- { "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_2x1_3x1_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_2x1_3x1_stepz2_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x2_1x3_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x2_1x3_stepz2_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x1_3x1_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x4_1x3_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x1_5x1_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x4_1x5_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_output_transform_2x2_3x3_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_2x1_3x1_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_1x2_1x3_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x4_3x3_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x1_3x1_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_1x4_1x3_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x4_5x5_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x1_5x1_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_1x4_1x5_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x1_3x1_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_1x4_1x3_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x4_3x3_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x4_5x5_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x1_5x1_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_1x4_1x5_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_2x2_7x7_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_2x1_7x1_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_1x2_1x7_nhwc", "winograd_output_transform.cl" },
-};
-
-const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
-{
-#ifdef EMBEDDED_KERNELS
- {
- "activation_layer.cl",
-#include "./cl_kernels/activation_layer.clembed"
- },
- {
- "activation_layer_quant.cl",
-#include "./cl_kernels/activation_layer_quant.clembed"
- },
- {
- "arg_min_max.cl",
-#include "./cl_kernels/arg_min_max.clembed"
- },
- {
- "batch_to_space.cl",
-#include "./cl_kernels/batch_to_space.clembed"
- },
- {
- "bitwise_op.cl",
-#include "./cl_kernels/bitwise_op.clembed"
- },
- {
- "bounding_box_transform.cl",
-#include "./cl_kernels/bounding_box_transform.clembed"
- },
- {
- "bounding_box_transform_quantized.cl",
-#include "./cl_kernels/bounding_box_transform_quantized.clembed"
- },
- {
- "channel_shuffle.cl",
-#include "./cl_kernels/channel_shuffle.clembed"
- },
- {
- "col2im.cl",
-#include "./cl_kernels/col2im.clembed"
- },
- {
- "comparisons.cl",
-#include "./cl_kernels/comparisons.clembed"
- },
- {
- "concatenate.cl",
-#include "./cl_kernels/concatenate.clembed"
- },
- {
- "convert_fc_weights.cl",
-#include "./cl_kernels/convert_fc_weights.clembed"
- },
- {
- "convolution_layer.cl",
-#include "./cl_kernels/convolution_layer.clembed"
- },
- {
- "copy_tensor.cl",
-#include "./cl_kernels/copy_tensor.clembed"
- },
- {
- "crop_tensor.cl",
-#include "./cl_kernels/crop_tensor.clembed"
- },
- {
- "upsample_layer.cl",
-#include "./cl_kernels/upsample_layer.clembed"
- },
- {
- "deconvolution_layer.cl",
-#include "./cl_kernels/deconvolution_layer.clembed"
- },
- {
- "cast.cl",
-#include "./cl_kernels/cast.clembed"
- },
- {
- "depth_to_space.cl",
-#include "./cl_kernels/depth_to_space.clembed"
- },
- {
- "depthwise_convolution.cl",
-#include "./cl_kernels/depthwise_convolution.clembed"
- },
- {
- "depthwise_convolution_quantized.cl",
-#include "./cl_kernels/depthwise_convolution_quantized.clembed"
- },
- {
- "dequantization_layer.cl",
-#include "./cl_kernels/dequantization_layer.clembed"
- },
- {
- "direct_convolution1x1.cl",
-#include "./cl_kernels/direct_convolution1x1.clembed"
- },
- {
- "direct_convolution3x3.cl",
-#include "./cl_kernels/direct_convolution3x3.clembed"
- },
- {
- "direct_convolution5x5.cl",
-#include "./cl_kernels/direct_convolution5x5.clembed"
- },
- {
- "direct_convolution_quantized.cl",
-#include "./cl_kernels/direct_convolution_quantized.clembed"
- },
- {
- "direct_convolution.cl",
-#include "./cl_kernels/direct_convolution.clembed"
- },
- {
- "elementwise_operation.cl",
-#include "./cl_kernels/elementwise_operation.clembed"
- },
- {
- "elementwise_operation_quantized.cl",
-#include "./cl_kernels/elementwise_operation_quantized.clembed"
- },
- {
- "elementwise_unary.cl",
-#include "./cl_kernels/elementwise_unary.clembed"
- },
- {
- "fft.cl",
-#include "./cl_kernels/fft.clembed"
- },
- {
- "fft_digit_reverse.cl",
-#include "./cl_kernels/fft_digit_reverse.clembed"
- },
- {
- "fft_scale.cl",
-#include "./cl_kernels/fft_scale.clembed"
- },
- {
- "fill_border.cl",
-#include "./cl_kernels/fill_border.clembed"
- },
- {
- "floor.cl",
-#include "./cl_kernels/floor.clembed"
- },
- {
- "gather.cl",
-#include "./cl_kernels/gather.clembed"
- },
- {
- "gemm.cl",
-#include "./cl_kernels/gemm.clembed"
- },
- {
- "gemm_v1.cl",
-#include "./cl_kernels/gemm_v1.clembed"
- },
- {
- "gemmlowp.cl",
-#include "./cl_kernels/gemmlowp.clembed"
- },
- {
- "gemv.cl",
-#include "./cl_kernels/gemv.clembed"
- },
- {
- "generate_proposals.cl",
-#include "./cl_kernels/generate_proposals.clembed"
- },
- {
- "generate_proposals_quantized.cl",
-#include "./cl_kernels/generate_proposals_quantized.clembed"
- },
- {
- "helpers.h",
-#include "./cl_kernels/helpers.hembed"
- },
- {
- "helpers_asymm.h",
-#include "./cl_kernels/helpers_asymm.hembed"
- },
- {
- "im2col.cl",
-#include "./cl_kernels/im2col.clembed"
- },
- {
- "instance_normalization.cl",
-#include "./cl_kernels/instance_normalization.clembed"
- },
- {
- "l2_normalize.cl",
-#include "./cl_kernels/l2_normalize.clembed"
- },
- {
- "mean_stddev_normalization.cl",
-#include "./cl_kernels/mean_stddev_normalization.clembed"
- },
- {
- "memset.cl",
-#include "./cl_kernels/memset.clembed"
- },
- {
- "minmax_layer.cl",
-#include "./cl_kernels/minmax_layer.clembed"
- },
- {
- "nonmax.cl",
-#include "./cl_kernels/nonmax.clembed"
- },
- {
- "normalization_layer.cl",
-#include "./cl_kernels/normalization_layer.clembed"
- },
- {
- "normalize_planar_yuv_layer.cl",
-#include "./cl_kernels/normalize_planar_yuv_layer.clembed"
- },
- {
- "normalize_planar_yuv_layer_quantized.cl",
-#include "./cl_kernels/normalize_planar_yuv_layer_quantized.clembed"
- },
- {
- "batchnormalization_layer.cl",
-#include "./cl_kernels/batchnormalization_layer.clembed"
- },
- {
- "pad_layer.cl",
-#include "./cl_kernels/pad_layer.clembed"
- },
- {
- "permute.cl",
-#include "./cl_kernels/permute.clembed"
- },
- {
- "pixelwise_mul_float.cl",
-#include "./cl_kernels/pixelwise_mul_float.clembed"
- },
- {
- "pixelwise_mul_int.cl",
-#include "./cl_kernels/pixelwise_mul_int.clembed"
- },
- {
- "pooling_layer.cl",
-#include "./cl_kernels/pooling_layer.clembed"
- },
- {
- "pooling_layer_quantized.cl",
-#include "./cl_kernels/pooling_layer_quantized.clembed"
- },
- {
- "prior_box_layer.cl",
-#include "./cl_kernels/prior_box_layer.clembed"
- },
- {
- "qlstm_layer_normalization.cl",
-#include "./cl_kernels/qlstm_layer_normalization.clembed"
- },
- {
- "quantization_layer.cl",
-#include "./cl_kernels/quantization_layer.clembed"
- },
- {
- "range.cl",
-#include "./cl_kernels/range.clembed"
- },
- {
- "reduction_operation.cl",
-#include "./cl_kernels/reduction_operation.clembed"
- },
- {
- "remap.cl",
-#include "./cl_kernels/remap.clembed"
- },
- {
- "reorg_layer.cl",
-#include "./cl_kernels/reorg_layer.clembed"
- },
- {
- "reshape_layer.cl",
-#include "./cl_kernels/reshape_layer.clembed"
- },
- {
- "reverse.cl",
-#include "./cl_kernels/reverse.clembed"
- },
- {
- "roi_align_layer.cl",
-#include "./cl_kernels/roi_align_layer.clembed"
- },
- {
- "roi_align_layer_quantized.cl",
-#include "./cl_kernels/roi_align_layer_quantized.clembed"
- },
- {
- "roi_pooling_layer.cl",
-#include "./cl_kernels/roi_pooling_layer.clembed"
- },
- {
- "scale.cl",
-#include "./cl_kernels/scale.clembed"
- },
- {
- "scale_quantized.cl",
-#include "./cl_kernels/scale_quantized.clembed"
- },
- {
- "select.cl",
-#include "./cl_kernels/select.clembed"
- },
- {
- "softmax_layer.cl",
-#include "./cl_kernels/softmax_layer.clembed"
- },
- {
- "softmax_layer_quantized.cl",
-#include "./cl_kernels/softmax_layer_quantized.clembed"
- },
- {
- "slice_ops.cl",
-#include "./cl_kernels/slice_ops.clembed"
- },
- {
- "space_to_batch.cl",
-#include "./cl_kernels/space_to_batch.clembed"
- },
- {
- "space_to_depth.cl",
-#include "./cl_kernels/space_to_depth.clembed"
- },
- {
- "stack_layer.cl",
-#include "./cl_kernels/stack_layer.clembed"
- },
- {
- "tile.cl",
-#include "./cl_kernels/tile.clembed"
- },
- {
- "transpose.cl",
-#include "./cl_kernels/transpose.clembed"
- },
- {
- "types.h",
-#include "./cl_kernels/types.hembed"
- },
- {
- "unpooling_layer.cl",
-#include "./cl_kernels/unpooling_layer.clembed"
- },
- {
- "winograd_filter_transform.cl",
-#include "./cl_kernels/winograd_filter_transform.clembed"
- },
- {
- "winograd_input_transform.cl",
-#include "./cl_kernels/winograd_input_transform.clembed"
- },
- {
- "winograd_output_transform.cl",
-#include "./cl_kernels/winograd_output_transform.clembed"
- },
-#endif /* EMBEDDED_KERNELS */
-};
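Each `#include` in the map above pulls in a `.clembed` file that the build generates from the matching `.cl` source when EMBEDDED_KERNELS is defined. As a hedged sketch of that mechanism (the real generated files may differ; names here are illustrative), each `.clembed` is assumed to contain a single string literal, so the include supplies the map entry's value directly:

```cpp
// Illustrative sketch of the EMBEDDED_KERNELS mechanism (not the actual generated file):
// a .clembed is assumed to hold one raw string literal with the OpenCL C source.
#include <iostream>
#include <map>
#include <string>

static const std::map<std::string, std::string> demo_source_map =
{
    {
        "activation_layer.cl",
        // In the real build this value would come from:
        //   #include "./cl_kernels/activation_layer.clembed"
        R"(__kernel void activation_layer() { /* kernel body */ })"
    },
};

int main()
{
    std::cout << demo_source_map.at("activation_layer.cl") << "\n";
}
```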
-
-ClKernelLibrary &ClKernelLibrary::get()
-{
- static ClKernelLibrary _kernel_library;
- return _kernel_library;
-}
-
-std::string ClKernelLibrary::program_name(const std::string &kernel_name) const
-{
- // Find which program contains the kernel
- auto kernel_program_it = _kernel_program_map.find(kernel_name);
-
- if(_kernel_program_map.end() == kernel_program_it)
- {
- ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
- }
-
- const std::string program_name = kernel_program_it->second;
-
- return program_name;
-}
-
-void ClKernelLibrary::set_kernel_path(std::string kernel_path)
-{
- _kernel_path = std::move(kernel_path);
- _kernel_path += "/";
-}
-
-const std::string &ClKernelLibrary::kernel_path() const
-{
- return _kernel_path;
-}
-
-ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &program_name) const
-{
-#ifdef EMBEDDED_KERNELS
-#ifdef ARM_COMPUTE_COMPRESSED_KERNELS
- const auto inflated_program_source_it = _decompressed_source_map.find(program_name);
- if(inflated_program_source_it != _decompressed_source_map.end())
- {
- return ClProgramInfo{ inflated_program_source_it->second, false };
- }
-#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */
-
- const auto program_source_it = _program_source_map.find(program_name);
- if(program_source_it == _program_source_map.end())
- {
- ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str());
- }
- std::string program_source = program_source_it->second;
-
-#ifdef ARM_COMPUTE_COMPRESSED_KERNELS
- std::string decompressed_program_source = decompress_zlib(decode_base64(program_source_it->second));
- ARM_COMPUTE_ERROR_ON_MSG(decompressed_program_source.empty(), "Cannot decompress requested program");
- _decompressed_source_map.insert(std::make_pair(program_name, decompressed_program_source));
- program_source = std::move(decompressed_program_source);
-#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */
-
- return ClProgramInfo{ program_source, false };
-#else /* EMBEDDED_KERNELS */
- // Check for binary
- std::string source_name = _kernel_path + program_name;
- std::string binary_name = source_name + "bin";
- std::string program_source{};
- bool is_binary = false;
-
- if(std::ifstream(binary_name).is_open())
- {
- program_source = read_file(binary_name, true);
- is_binary = true;
- }
- else if(std::ifstream(source_name).is_open())
- {
- program_source = read_file(source_name, false);
- }
- else
- {
- ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str());
- }
-
- return ClProgramInfo{ program_source, is_binary };
-#endif /* EMBEDDED_KERNELS */
-}
-} // namespace opencl
-} // namespace arm_compute
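For orientation, here is a minimal usage sketch of the API being deleted above (the kernel name and path are illustrative examples; this snippet is not part of the patch):

```cpp
// Hypothetical caller resolving a kernel to its program source via the
// removed ClKernelLibrary singleton.
#include "src/core/gpu/cl/ClKernelLibrary.h"

#include <iostream>

void dump_activation_program()
{
    using arm_compute::opencl::ClKernelLibrary;

    ClKernelLibrary &library = ClKernelLibrary::get();
    library.set_kernel_path("/usr/share/arm_compute"); // Only consulted when EMBEDDED_KERNELS is not defined

    // Map the kernel name to its containing program, then fetch the program itself.
    const std::string program_name = library.program_name("activation_layer"); // -> "activation_layer.cl"
    const ClKernelLibrary::ClProgramInfo info = library.program(program_name);

    std::cout << program_name << ": " << (info.is_binary ? "binary" : "source") << "\n";
}
```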
diff --git a/src/core/gpu/cl/ClKernelLibrary.h b/src/core/gpu/cl/ClKernelLibrary.h
deleted file mode 100644
index 42bec95032..0000000000
--- a/src/core/gpu/cl/ClKernelLibrary.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_KERNEL_LIBRARY_H
-#define ARM_COMPUTE_CL_KERNEL_LIBRARY_H
-
-#include <map>
-#include <string>
-#include <tuple>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** ClKernelLibrary contains all the OpenCL kernels that are used throughout the library
- *
- * @note Kernel library is a singleton to reduce memory requirements
- * @note Its sole responsibility is to provide access to the kernel source strings;
- *       it does not perform compilation or any related tasks
- */
-class ClKernelLibrary final
-{
-private:
- /** Default Constructor */
- ClKernelLibrary() = default;
- /** Prevent instances of this class from being copied */
- ClKernelLibrary(const ClKernelLibrary &) = delete;
- /** Prevent instances of this class from being copied */
- const ClKernelLibrary &operator=(const ClKernelLibrary &) = delete;
-
-public:
- /** Structure to encapsulate program-related information */
- struct ClProgramInfo
- {
- std::string program{}; /**< Program raw string */
- bool is_binary{ false }; /**< Flag that indicates if the program is in binary format */
- };
-
-public:
- /** Access the KernelLibrary singleton
- *
- * @return The KernelLibrary instance
- */
- static ClKernelLibrary &get();
- /** Sets the path that the kernels reside in
- *
- * @param[in] kernel_path Path of the folder in which the kernel sources reside
- */
- void set_kernel_path(std::string kernel_path);
- /** Gets the path that the kernels reside in
- */
- const std::string &kernel_path() const;
- /** Gets the source of the selected program
- *
- * @param[in] program_name Program name
- *
- * @return The selected program's information: its source or binary string, plus a flag that is true when the string is a binary
- */
- ClProgramInfo program(const std::string &program_name) const;
- /** Returns the program name given a kernel name
- *
- * @param[in] kernel_name Kernel name
- *
- * @return Program name
- */
- std::string program_name(const std::string &kernel_name) const;
-
-private:
- std::string _kernel_path{}; /**< Path to the kernels folder. */
- mutable std::map<std::string, std::string> _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */
- static const std::map<std::string, std::string> _kernel_program_map; /**< Map that associates kernel names with programs. */
- static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
- Used for compile-time kernel inclusion. */
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_KERNEL_LIBRARY_H */
diff --git a/src/core/gpu/cl/IClKernel.h b/src/core/gpu/cl/IClKernel.h
deleted file mode 100644
index 52ea3c9183..0000000000
--- a/src/core/gpu/cl/IClKernel.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICL_KERNEL_H
-#define ARM_COMPUTE_ICL_KERNEL_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-using IClKernel = arm_compute::ICLKernel;
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ICL_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClActivationKernel.cpp b/src/core/gpu/cl/kernels/ClActivationKernel.cpp
deleted file mode 100644
index 17a8c6498d..0000000000
--- a/src/core/gpu/cl/kernels/ClActivationKernel.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClActivationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-
-#include "support/StringSupport.h"
-
-#include <set>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32);
-
- static std::set<ActivationLayerInfo::ActivationFunction> quantized_supported_activations =
- {
- ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LOGISTIC,
- ActivationLayerInfo::ActivationFunction::TANH,
- ActivationLayerInfo::ActivationFunction::HARD_SWISH,
- ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
- };
- const DataType data_type = src->data_type();
- const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
- const ActivationLayerInfo::ActivationFunction f_act = act_info.activation();
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && (quantized_supported_activations.count(f_act) == 0),
- "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
-
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128)));
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128)));
-
- // Checks performed when destination is configured
- if((dst != nullptr) && (dst->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- return Status{};
-}
-} // namespace
-
-ClActivationKernel::ClActivationKernel()
- : _run_in_place(false)
-{
-}
-
-void ClActivationKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src);
-
- auto padding_info = get_padding_info({ src, dst });
-
- _run_in_place = (dst == nullptr) || (dst == src);
-
- if(dst != nullptr)
- {
- // Destination auto initialization if not yet initialized
- auto_init_if_empty(*dst, *src->clone());
- }
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, act_info));
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
-
- const DataType dt = src->data_type();
- float a_const = act_info.a();
- float b_const = act_info.b();
-
- const ActivationLayerInfo::ActivationFunction f_act = act_info.activation();
- const bool is_quantized = is_data_type_quantized(dt);
- const bool perform_activation_in_float =
- (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- || (f_act == ActivationLayerInfo::ActivationFunction::TANH)
- || (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- || (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU);
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(perform_activation_in_float, "-DFLOAT_DOMAIN");
- build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
- build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(f_act)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
-
- std::string kernel_name = std::string("activation_layer");
-
- // Set quantization info build options
- if(is_quantized)
- {
- const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
-
- if(!perform_activation_in_float)
- {
- int a_const_int = 0;
- int b_const_int = 0;
-
- // Create quantized version of constants a, b if needed
- switch(dt)
- {
- case DataType::QASYMM8:
- {
- a_const_int = quantize_qasymm8(a_const, iq_info);
- b_const_int = quantize_qasymm8(b_const, iq_info);
- }
- break;
- case DataType::QASYMM8_SIGNED:
- {
- a_const_int = quantize_qasymm8_signed(a_const, iq_info);
- b_const_int = quantize_qasymm8_signed(b_const, iq_info);
- }
- break;
- case DataType::QSYMM16:
- {
- a_const_int = quantize_qsymm16(a_const, iq_info);
- b_const_int = quantize_qsymm16(b_const, iq_info);
- }
- break;
- default:
- break;
- }
- build_opts.add_option(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
- build_opts.add_option(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
- }
- else
- {
- build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
- build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
- }
-
- // Quantized value of 0 corresponds to the offset o1
- build_opts.add_option(("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0")));
- build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale)));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset));
-
- // Set correct kernel name
- kernel_name += perform_activation_in_float ? std::string("_quant_f32") : std::string("_quant");
-
- // Set scale and offset of the source and destination if they have different quantization info
- if(dst != nullptr)
- {
- const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
-
- if(iq_info != oq_info)
- {
- build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(oq_info.scale)));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset));
- }
- }
- }
- else
- {
- // Set A, B constants in build options for float types
- build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
- build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = "activation_layer_";
- _config_id += lower_string(string_from_data_type(dt));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src->dimension(1));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info));
- return Status{};
-}
-
-void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- ARM_COMPUTE_ERROR_ON(_run_in_place && src != dst);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- if(!_run_in_place)
- {
- add_3D_tensor_argument(idx, dst, slice);
- }
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
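As a usage sketch of the stateless kernel pattern seen above (the shapes, the RELU choice, and the helper name are illustrative, not from the patch): callers validate first, then configure against ITensorInfo, and only bind actual tensors later through an ITensorPack in run_op():

```cpp
// Hypothetical configuration of the removed ClActivationKernel.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/gpu/cl/kernels/ClActivationKernel.h"

using namespace arm_compute;

void configure_relu(const CLCompileContext &compile_context, ITensorInfo &src, ITensorInfo &dst)
{
    const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);

    // Static validation returns a Status; configure() asserts on the same checks.
    ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClActivationKernel::validate(&src, &dst, act_info));

    opencl::kernels::ClActivationKernel kernel;
    kernel.configure(compile_context, &src, &dst, act_info); // dst == nullptr would select the in-place path
}
```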
diff --git a/src/core/gpu/cl/kernels/ClActivationKernel.h b/src/core/gpu/cl/kernels/ClActivationKernel.h
deleted file mode 100644
index 68c309e9e7..0000000000
--- a/src/core/gpu/cl/kernels/ClActivationKernel.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_ACTIVATION_KERNEL_H
-#define ARM_COMPUTE_CL_ACTIVATION_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the activation kernel. */
-class ClActivationKernel : public IClKernel
-{
-public:
- ClActivationKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClActivationKernel);
- /** Configure kernel for a given list of arguments
- *
- * @note If the output tensor is a nullptr, the activation function will be performed in-place
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] src Source tensor info. In case of @p dst tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] act_info Activation layer information.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info);
- /** Static function to check if given info will lead to a valid configuration of @ref ClActivationKernel
- *
- * @param[in] src Source tensor info. In case of @p dst tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[in] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] act_info Activation layer information.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
-
-private:
- bool _run_in_place;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_ACTIVATION_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
deleted file mode 100644
index 26f5113822..0000000000
--- a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
-
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY));
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ));
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst);
-
- return Status{};
-}
-} // namespace
-
-ClBatchConcatenateKernel::ClBatchConcatenateKernel()
- : _batch_offset(0)
-{
-}
-
-void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst));
-
- auto padding_info = get_padding_info({ src, dst });
-
- _batch_offset = batch_offset;
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
-
- // Add build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
- if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
- {
- const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "concatenate", build_opts.options());
-
- // Configure kernel window
- auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
- win.set(3, Window::Dimension(0, src->tensor_shape()[3], 1));
- ICLKernel::configure_internal(win);
-
- // Set config_id for enabling LWS tuning
- _config_id = "concatenate_";
- _config_id += support::cpp11::to_string(3);
- _config_id += "_";
- _config_id += support::cpp11::to_string(batch_offset);
- _config_id += "_";
- _config_id += support::cpp11::to_string(src->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src->dimension(3));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClBatchConcatenateKernel::validate(const arm_compute::ITensorInfo *src,
- unsigned int batch_offset,
- const arm_compute::ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst));
- return Status{};
-}
-
-void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- Window slice = window.first_slice_window_3D();
-
- const int offset_to_first_elements_in_bytes = _batch_offset * dst->info()->strides_in_bytes()[3];
-
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
- _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
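Beyond the two tensors, the only run-time argument is the byte offset of the first destination batch, computed above as batch_offset * strides_in_bytes()[3]. A worked example with assumed numbers: for an F32 destination of shape 8x8x3xN the batch stride is 8*8*3*4 = 768 bytes, so batch_offset = 2 starts writing at byte 1536. A hedged sketch of the static check:

```cpp
// Hypothetical pre-flight check before concatenating src into dst at batch offset 2.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h"

using namespace arm_compute;

Status can_concat_at_batch_two(const ITensorInfo &src, const ITensorInfo &dst)
{
    // The first three dimensions must match, and src batches + offset must fit in dst.
    constexpr unsigned int batch_offset = 2;
    return opencl::kernels::ClBatchConcatenateKernel::validate(&src, batch_offset, &dst);
}
```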
diff --git a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h
deleted file mode 100644
index d9fa905e8e..0000000000
--- a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H
-#define ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the batch concatenate kernel.
- * The src tensor will be concatenated into the destination tensor.
- */
-class ClBatchConcatenateKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClBatchConcatenateKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClBatchConcatenateKernel);
- /** Initialise the kernel's source and destination
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[in] batch_offset The offset on axis 3.
- * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @note The first two dimensions of the dst tensor must not be smaller than those of src.
- * @note The difference between each of the two lowest dimensions of src and dst must be divisible by 2.
- *
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClBatchConcatenateKernel
- *
- * @param[in] src Input tensor info. Data types supported: All.
- * @param[in] batch_offset The offset on axis 3.
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
-
-private:
- unsigned int _batch_offset;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClCastKernel.cpp b/src/core/gpu/cl/kernels/ClCastKernel.cpp
deleted file mode 100644
index 7a1d5c2824..0000000000
--- a/src/core/gpu/cl/kernels/ClCastKernel.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClCastKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src,
- 1,
- DataType::U8, DataType::S8, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
- DataType::U16, DataType::U32, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst,
- 1,
- DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16,
- DataType::U16, DataType::U32, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different");
-
- // Validate in case of configured dst
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
-
- return Status{};
-}
-} // namespace
-
-void ClCastKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Auto-initialize the dst shape if not yet initialized (only the shape can be auto-configured; the data type must be given)
- set_shape_if_empty(*dst, src->tensor_shape());
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
-
- auto padding_info = get_padding_info({ src, dst });
-
- // Get data sizes
- const size_t src_size = data_size_from_type(src->data_type());
- const size_t dst_size = data_size_from_type(dst->data_type());
-
- // Get the number of elements to process per iteration
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
- build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
- // Conversions from float always SATURATE, as out-of-bounds float->integer conversion is implementation-defined
- build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE");
- build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), "-DIS_DATA_TYPE_FLOAT");
- build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED");
-
- // Create kernel
- const std::string kernel_name = (src_size >= dst_size) ? "cast_down" : "cast_up";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel
- Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win);
-
- // Collapse window
- const Window &full_window = window();
- Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ);
- ICLKernel::configure_internal(collapsed_window);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(src->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(1));
-}
-
-Status ClCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
- return Status{};
-}
-
-void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClCastKernel.h b/src/core/gpu/cl/kernels/ClCastKernel.h
deleted file mode 100644
index 451aa9c1ab..0000000000
--- a/src/core/gpu/cl/kernels/ClCastKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_CAST_KERNEL_H
-#define ARM_COMPUTE_CL_CAST_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Casts a given tensor to a new type
- *
- * @note When casting between quantized types the scale and zeroPoint are ignored
- */
-class ClCastKernel : public IClKernel
-{
-public:
- ClCastKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCastKernel);
- /** Set the src and dst of the kernel.
- *
- * Valid conversions src -> dst :
- *
- * - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
- * - U8 -> S8, U16, S16, U32, S32, F16, F32
- * - U16 -> U8, S8, S16, U32, S32, F16, F32
- * - S16 -> U8, S8, U16, U32, S32, F16, F32
- * - U32 -> U8, S8, U16, S16, S32, F16, F32
- * - S32 -> U8, S8, U16, S16, U32, F16, F32
- * - F16 -> U8, S8, U16, S16, U32, F32
- * - F32 -> U8, S8, U16, S16, U32, F16
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src The source tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
- * @param[out] dst The destination tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in] policy Conversion policy
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClCastKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_CAST_KERNEL_H */
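
For reference, the removed ClCastKernel was driven directly through its operator-style API. Below is a minimal usage sketch, not part of this patch, written against the pre-patch tree: src_tensor and dst_tensor are hypothetical, already-allocated CLTensor objects, and an initialised CLScheduler is assumed. The later sketches below reuse these includes and the using-directive.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/gpu/cl/kernels/ClCastKernel.h"

using namespace arm_compute;

void cast_f32_to_u8(ICLTensor *src_tensor, ICLTensor *dst_tensor)
{
    TensorInfo src_info(TensorShape(32U, 16U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(32U, 16U), 1, DataType::U8);

    // Conversions from float saturate regardless of the requested policy (see configure()).
    ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClCastKernel::validate(&src_info, &dst_info, ConvertPolicy::SATURATE));

    opencl::kernels::ClCastKernel cast;
    cast.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, ConvertPolicy::SATURATE);

    // At run time the tensors travel in an ITensorPack keyed by TensorType.
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, src_tensor);
    pack.add_tensor(TensorType::ACL_DST, dst_tensor);
    cast.run_op(pack, cast.window(), CLScheduler::get().queue());
}
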
diff --git a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp b/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
deleted file mode 100644
index 49f2f68a76..0000000000
--- a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape,
- DataLayout data_layout)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Output tensor auto-initialization if not yet initialized
- auto_init_if_empty(*dst, *src->clone());
-
- auto padding_info = get_padding_info({ src, dst });
-
- ARM_COMPUTE_ERROR_THROW_ON(ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout));
-
- const DataLayout src_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
-
- const int width_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::HEIGHT);
- const int channel_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::CHANNEL);
-
- const unsigned int num_elems_per_src_plane = original_src_shape[width_idx] * original_src_shape[height_idx];
- const unsigned int num_channels = original_src_shape[channel_idx];
-
- const unsigned int factor_1 = (data_layout == DataLayout::NCHW) ? num_elems_per_src_plane : num_channels;
- const unsigned int factor_2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_src_plane;
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
- build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(factor_1));
- build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(factor_2));
-
- // Create kernel
- _kernel = create_kernel(compile_context, "convert_fc_weights", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape,
- DataLayout data_layout)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() != 2);
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(1) != original_src_shape.total_size_lower(3));
- ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
-
- // Checks performed when dst is configured
- if(dst->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
-
- return Status{};
-}
-
-void ClConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, src, window);
- add_2D_tensor_argument(idx, dst, window);
- enqueue(queue, *this, window, lws_hint());
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h b/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
deleted file mode 100644
index 11ab4d2a0d..0000000000
--- a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-#define ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
- *
- * @note This function can be applied to the 2D weights used by a Fully Connected layer if:
- * - It follows a Convolution layer
- * - The data layout used by the network does not match the one the model has been trained in.
- *
- * @note This function assumes the weights are already reshaped (transposed)
- */
-namespace opencl
-{
-namespace kernels
-{
-class ClConvertFullyConnectedWeightsKernel : public IClKernel
-{
-public:
- ClConvertFullyConnectedWeightsKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClConvertFullyConnectedWeightsKernel);
- /** Set the src and dst tensor.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
- * @param[out] dst The converted weights tensor info. Shape and Data Type: Same as @p src.
- * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
- /** Static function to check if given info will lead to a valid configuration of @ref ClConvertFullyConnectedWeightsKernel
- *
- * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
- * @param[in] dst The converted weights tensor info. Shape and Data Type: Same as @p src.
- * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H */
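
As with the cast kernel above, the removed weights-conversion kernel is a stateless operator configured from ITensorInfo. A hedged configuration sketch follows (the 576-row weight matrix and 3x3x64 original shape are illustrative values, not taken from this patch; setup shared with the cast sketch):

// The FC weights are 2D; their second dimension must equal the flattened
// size of the original src shape (here 3*3*64 = 576). The layout argument
// is the one the weights were *trained* in.
TensorInfo weights_info(TensorShape(128U, 576U), 1, DataType::F32);
TensorInfo converted_info(weights_info); // same shape and data type as src

opencl::kernels::ClConvertFullyConnectedWeightsKernel convert;
ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClConvertFullyConnectedWeightsKernel::validate(
    &weights_info, &converted_info, TensorShape(3U, 3U, 64U), DataLayout::NCHW));
convert.configure(CLKernelLibrary::get().get_compile_context(), &weights_info, &converted_info,
                  TensorShape(3U, 3U, 64U), DataLayout::NCHW);
// Execution then goes through run_op() with an ACL_SRC/ACL_DST ITensorPack,
// exactly as in the cast example above.
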
diff --git a/src/core/gpu/cl/kernels/ClCopyKernel.cpp b/src/core/gpu/cl/kernels/ClCopyKernel.cpp
deleted file mode 100644
index d6c87f8fad..0000000000
--- a/src/core/gpu/cl/kernels/ClCopyKernel.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClCopyKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-
- // Validate dst if initialized
- if(dst->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- if(dst_window == nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape());
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst_window->shape());
- }
- }
-
- return Status{};
-}
-
-} // namespace
-
-void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, dst_window));
-
- auto padding_info = get_padding_info({ src, dst });
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
-
- // Output auto-initialization if not yet initialized
- auto_init_if_empty(*dst, *src);
-
- // Configure window
- const unsigned int vec_size_x = adjust_vec_size(16 / src->element_size(), src->dimension(0));
-
- const Window win_config = calculate_max_window(*src, Steps(vec_size_x));
-
- if(dst_window != nullptr)
- {
- _has_dst_window = true;
- _dst_window = Window(*dst_window);
- const int width_x = dst_window->num_iterations(0);
- const int vec_size_x_leftover = width_x % vec_size_x;
- const bool multi_access_x = width_x >= static_cast<int32_t>(vec_size_x);
-
- if(multi_access_x)
- {
- _dst_window.set(Window::DimX, Window::Dimension(dst_window->x().start(), ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x));
- }
-
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
- }
- else
- {
- const int width_x = src->tensor_shape().x();
- const int vec_size_x_leftover = width_x % vec_size_x;
-
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
- }
-
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-
- // Build kernel
- _kernel = create_kernel(compile_context, "copy_tensor", build_opts.options());
-
- // Validate and set the window
- ICLKernel::configure_internal(win_config);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, dst_window));
-
- return Status{};
-}
-
-void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- Window slice;
-
- if(_has_dst_window)
- {
- slice = window.first_slice_window_3D();
- Window out_slice = _dst_window.first_slice_window_3D();
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, out_slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice));
- }
- else
- {
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- slice = collapsed.first_slice_window_3D();
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
- }
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClCopyKernel.h b/src/core/gpu/cl/kernels/ClCopyKernel.h
deleted file mode 100644
index b1b9672bcb..0000000000
--- a/src/core/gpu/cl/kernels/ClCopyKernel.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_COPY_KERNEL_H
-#define ARM_COMPUTE_CL_COPY_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** OpenCL kernel to perform a copy between two tensors */
-class ClCopyKernel : public IClKernel
-{
-public:
- ClCopyKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCopyKernel);
- /** Initialize the kernel's src and dst.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- * @param[in] dst_window (Optional) Window to be used when copying into only part of the destination tensor. Default is nullptr.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref ClCopyKernel
- *
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- * @param[in] dst_window (Optional) Window to be used when copying into only part of the destination tensor. Default is nullptr.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- Window _dst_window{};
- bool _has_dst_window{};
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_COPY_KERNEL_H */
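
A sketch of the two modes the removed copy kernel supported (illustrative shapes, not part of the patch; setup shared with the cast sketch above):

TensorInfo src_info(TensorShape(64U, 8U, 2U), 1, DataType::F16);
TensorInfo dst_info(src_info);

opencl::kernels::ClCopyKernel copy;
// With no dst_window the whole tensor is copied; passing a Window instead
// restricts the copy to a sub-region of dst.
ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClCopyKernel::validate(&src_info, &dst_info));
copy.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);
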
diff --git a/src/core/gpu/cl/kernels/ClCropKernel.cpp b/src/core/gpu/cl/kernels/ClCropKernel.cpp
deleted file mode 100644
index 1d322eefa1..0000000000
--- a/src/core/gpu/cl/kernels/ClCropKernel.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClCropKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-#include <map>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-void ClCropKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
-{
- configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, dst_window);
-}
-
-void ClCropKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index,
- float extrapolation_value, Window *dst_window)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, start, end, batch_index, extrapolation_value, dst_window));
-
- _start = start;
- _batch_index = batch_index;
- _extrapolation_value = extrapolation_value;
-
- const int vec_size_x = 4;
- // Create and update the window (if needed)
- Window win = calculate_max_window(*dst);
-
- if(dst_window != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *dst_window);
- win = *dst_window;
- }
-
- const int dst_width_x = win.num_iterations(0);
- const bool multi_access_x = dst_width_x >= vec_size_x;
- const bool remainder_x = dst_width_x % vec_size_x > 0;
-
- if(multi_access_x)
- {
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
- }
- ICLKernel::configure_internal(win);
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
- build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED=");
- build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED=");
- _kernel = create_kernel(compile_context, "crop_tensor", build_opts.options());
-}
-
-Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
-{
- ARM_COMPUTE_UNUSED(extrapolation_value, dst_window);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
- ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0);
- ARM_COMPUTE_RETURN_ERROR_ON(start.x >= static_cast<int32_t>(src->dimension(1)) || start.y >= static_cast<int32_t>(src->dimension(2))
- || end.x >= static_cast<int32_t>(src->dimension(1)) || end.y >= static_cast<int32_t>(src->dimension(2)));
- ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= src->dimension(3));
- if(dst_window != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(dst_window->x().step() != 1);
- }
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(dst->num_dimensions() > 3);
- }
- return Status{};
-}
-
-void ClCropKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- Window in_slice = Window();
- in_slice.use_tensor_dimensions(src->info()->tensor_shape());
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), window.x().step()));
- in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1));
-
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, in_slice);
- add_3D_tensor_argument(idx, dst, window);
- add_argument(idx, _start.x);
- add_argument(idx, _start.y);
- enqueue(queue, *this, window, lws_hint());
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClCropKernel.h b/src/core/gpu/cl/kernels/ClCropKernel.h
deleted file mode 100644
index ec0f8e58da..0000000000
--- a/src/core/gpu/cl/kernels/ClCropKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCROPKERNEL_H
-#define ARM_COMPUTE_CLCROPKERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** OpenCL kernel to crop a given region of a tensor */
-class ClCropKernel : public IClKernel
-{
-public:
- ClCropKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCropKernel);
- /** Configure kernel
- *
- * @note Supported tensor rank: up to 4
- *
- * @param[in] src Source tensor info. Data type supported: All. Data layouts supported: NHWC.
- * @param[out] dst Destination tensor info. Data type supported: F32
- * @param[in] start Coordinates of where to start cropping the image.
- * @param[in] end Coordinates of where to end cropping the image.
- * @param[in] batch_index Fourth dimension index of the 3D image to crop in @p src.
- * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
- * @param[in] dst_window Output window to be used when the cropped image is being copied into a tensor. Default is nullptr.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, Window *dst_window = nullptr);
- /** Configure kernel
- *
- * @note Supported tensor rank: up to 4
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data type supported: All. Data layouts supported: NHWC.
- * @param[out] dst Destination tensor info. Data type supported: F32
- * @param[in] start Coordinates of where to start cropping the image.
- * @param[in] end Coordinates of where to end cropping the image.
- * @param[in] batch_index Fourth dimension index of the 3D image to crop in @p src.
- * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
- * @param[in] dst_window Output window to be used when the cropped image is being copied into a tensor. Default is nullptr.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *dst_window = nullptr);
-
- /** Static function to check if given info will lead to a valid configuration of @ref ClCropKernel
- *
- * @note Supported tensor rank: up to 4
- *
- * @param[in] src Source tensor info. Data type supported: All. Data layouts supported: NHWC.
- * @param[in] dst Destination tensor info. Data type supported: F32
- * @param[in] start Coordinates of where to start cropping the image.
- * @param[in] end Coordinates of where to end cropping the image.
- * @param[in] batch_index Fourth dimension index of the 3D image to crop in @p src.
- * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
- * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *dst_window = nullptr);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- Coordinates2D _start{};
- uint32_t _batch_index{};
- float _extrapolation_value{};
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCROPKERNEL_H */
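
The removed crop kernel accepts only NHWC inputs and always produces F32. A configuration sketch with illustrative coordinates (not from this patch; setup shared with the cast sketch above):

// NHWC shape is (C, W, H, N); crop the 32x32 region [16,47]x[16,47] of batch 0.
TensorInfo src_info(TensorShape(3U, 224U, 224U, 4U), 1, DataType::U8);
src_info.set_data_layout(DataLayout::NHWC);
TensorInfo dst_info(TensorShape(3U, 32U, 32U), 1, DataType::F32);
dst_info.set_data_layout(DataLayout::NHWC); // validate() requires matching layouts

opencl::kernels::ClCropKernel crop;
ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClCropKernel::validate(
    &src_info, &dst_info, Coordinates2D{ 16, 16 }, Coordinates2D{ 47, 47 }, 0 /* batch_index */));
crop.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info,
               Coordinates2D{ 16, 16 }, Coordinates2D{ 47, 47 }, 0);
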
diff --git a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
deleted file mode 100644
index 4039570da4..0000000000
--- a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
-
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY));
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(2) + depth_offset > dst->dimension(2));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, src, dst);
-
- return Status{};
-}
-} // namespace
-
-ClDepthConcatenateKernel::ClDepthConcatenateKernel()
- : _depth_offset(0)
-{
-}
-
-void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst));
-
- auto padding_info = get_padding_info({ src, dst });
-
- _depth_offset = depth_offset;
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
-
- // Add build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
- if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
- {
- const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "concatenate", build_opts.options());
-
- // Configure kernel window
- auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
- win.set(Window::DimZ, Window::Dimension(0, src->tensor_shape().z(), 1));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClDepthConcatenateKernel::validate(const arm_compute::ITensorInfo *src,
- unsigned int depth_offset,
- const arm_compute::ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, dst));
- return Status{};
-}
-
-void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- Window slice = window.first_slice_window_3D();
-
- const int offset_to_first_elements_in_bytes = _depth_offset * dst->info()->strides_in_bytes()[2];
-
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
- _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h
deleted file mode 100644
index 103ef00695..0000000000
--- a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H
-#define ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the depth concatenate kernel.
- * The src tensor will be concatenated into the dst tensor.
- */
-class ClDepthConcatenateKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClDepthConcatenateKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDepthConcatenateKernel);
- /** Initialise the kernel's source and destination
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @note The dst tensor's lowest two dimensions can't be smaller than the src tensor's.
- * @note The gaps between the lowest two dimensions of src and dst need to be divisible by 2.
- *
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClDepthConcatenateKernel
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
-
-private:
- unsigned int _depth_offset;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H */
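
A sketch of stacking one input at a channel offset with the removed depth-concatenate kernel (illustrative shapes, one such call per input tensor; setup shared with the cast sketch above):

// X/Y sizes must match and src depth + offset must fit inside dst depth:
// here a 16-channel src lands in channels [16, 32) of a 48-channel dst.
TensorInfo src_info(TensorShape(32U, 32U, 16U), 1, DataType::F32);
TensorInfo dst_info(TensorShape(32U, 32U, 48U), 1, DataType::F32);

opencl::kernels::ClDepthConcatenateKernel concat;
ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClDepthConcatenateKernel::validate(&src_info, 16U, &dst_info));
concat.configure(CLKernelLibrary::get().get_compile_context(), &src_info, 16U, &dst_info);
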
diff --git a/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp
deleted file mode 100644
index f2758b759f..0000000000
--- a/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClDequantizeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
-
- if(dst->tensor_shape().total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
-
- return Status{};
-}
-} // namespace
-
-void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
-
- auto padding_info = get_padding_info({ src, dst });
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-
- const int vec_size_x = 16 / dst->element_size();
- const int output_width_x = dst->tensor_shape().x();
- const bool multi_access_x = (output_width_x / vec_size_x > 0);
-
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->data_type());
- std::string kernel_name = "dequantization_layer";
-
- // Create kernel
- CLBuildOptions build_opts;
- if(!is_quantized_per_channel)
- {
- const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
- const int qoffset = is_data_type_quantized_asymmetric(src->data_type()) ? qinfo.offset : 0;
- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset));
- }
- else
- {
- kernel_name += "_per_channel";
- kernel_name += src->data_layout() == DataLayout::NCHW ? "_nchw" : "_nhwc";
- }
-
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type()));
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
-
- // Create kernel name
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*dst);
- if(multi_access_x)
- {
- win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
- }
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
- return Status{};
-}
-
-void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type());
-
- // Collapse the window
- Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3);
- Window slice = new_window.first_slice_window_3D();
-
- if(is_quantized_per_channel)
- {
- unsigned int idx = num_arguments_per_3D_tensor() * 2; // Skip the input and output parameters
- _kernel.setArg(idx++, src->quantization().scale->cl_buffer());
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(new_window.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClDequantizeKernel.h b/src/core/gpu/cl/kernels/ClDequantizeKernel.h
deleted file mode 100644
index 33e0164cc9..0000000000
--- a/src/core/gpu/cl/kernels/ClDequantizeKernel.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H
-#define ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the dequantization layer kernel. */
-class ClDequantizeKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClDequantizeKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDequantizeKernel);
- /** Initialise the kernel's input and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[out] dst Destination tensor info. Data types supported: F16/F32.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClDequantizeKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H */
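
Finally, a sketch of the removed dequantize kernel for the common per-tensor QASYMM8 case (the scale/offset values are illustrative; setup shared with the cast sketch above):

// The scale and zero point come from the src TensorInfo's quantization info;
// dst is auto-initialised to F32 if left empty at configure() time.
TensorInfo src_info(TensorShape(256U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));
TensorInfo dst_info(TensorShape(256U), 1, DataType::F32);

opencl::kernels::ClDequantizeKernel dequantize;
ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClDequantizeKernel::validate(&src_info, &dst_info));
dequantize.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);
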
diff --git a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp
deleted file mode 100644
index 94c4044bff..0000000000
--- a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp
+++ /dev/null
@@ -1,667 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLUtils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
-
- const DataLayout data_layout = src->data_layout();
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have the same width and height");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx),
- "Weights feature map dimension should match the respective src dimension");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9)
- && std::get<0>(conv_info.stride()) > 2,
- "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(data_layout != DataLayout::NHWC && !is_data_type_float(src->data_type()) && act_info.enabled(),
- "Activation supported only for floating point and NHWC.");
-
- if(data_layout == DataLayout::NCHW)
- {
- if(is_data_type_quantized(src->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
- "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5,
- "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types");
- }
- }
-
- if(biases != nullptr)
- {
- if(is_data_type_quantized_asymmetric(src->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
- "Biases size and number of dst feature maps should match");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
- "Biases should be one dimensional");
- }
-
- // Checks performed when dst is configured
- if(dst->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
- misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- const auto data_type = src->data_type();
- if(is_data_type_quantized(data_type))
- {
- const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
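- // calculate_quantized_multiplier() decomposes 'multiplier' into a Q0.31 fixed-point
- // value and a right shift, roughly multiplier ~= output_multiplier * 2^-31 * 2^-output_shift;
- // e.g. multiplier = 0.2 gives output_shift = 2 and output_multiplier = round(0.8 * 2^31).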
- int output_multiplier = 0;
- int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
- }
- return Status{};
-}
-
-inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size,
- DataType data_type, DataLayout data_layout)
-{
- return gpu_target_is_in(gpu_target,
- GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
- GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
- GPUTarget::G52, GPUTarget::G52LIT)
- && (kernel_size <= 5)
- && (conv_stride_x == 1) && (conv_stride_y == 1)
- && (data_type == DataType::F32)
- && (data_layout == DataLayout::NCHW);
-}
-
-inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y,
- unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y,
- unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *src)
-{
- const DataType data_type = src->data_type();
- const DataLayout data_layout = src->data_layout();
- unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- unsigned int conv_stride_y = std::get<1>(conv_info.stride());
-
- const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout);
-
- if(run_optimized_bifrost)
- {
- // Configure kernel window
- switch(kernel_size)
- {
- case 1:
- {
- num_elems_read_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 4;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 4;
- break;
- }
- case 3:
- {
- num_elems_read_per_iteration_x = 6;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 3;
- break;
- }
- case 5:
- {
- num_elems_read_per_iteration_x = 8;
- num_elems_read_per_iteration_y = 6;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 2;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
- }
- }
- }
- else
- {
- num_elems_read_per_iteration_y = kernel_size;
- num_elems_written_per_iteration_x = 8;
- num_elems_written_per_iteration_y = 1;
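- // The horizontal read extents below follow
- // (num_elems_written_per_iteration_x - 1) * conv_stride_x + kernel_size,
- // rounded up in a few cases for alignment; e.g. a 3x3 kernel at stride 1
- // reads (8 - 1) * 1 + 3 = 10 elements per iteration.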
- switch(kernel_size)
- {
- case 1:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 8;
- break;
- case 2:
- num_elems_read_per_iteration_x = 16;
- break;
- case 3:
- switch(src->element_size())
- {
- case 1:
- num_elems_read_per_iteration_x = 28;
- break;
- case 2:
- num_elems_read_per_iteration_x = 24;
- break;
- case 4:
- num_elems_read_per_iteration_x = 22;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid data size");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 3:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 10;
- break;
- case 2:
- num_elems_read_per_iteration_x = 17;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 5:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 12;
- break;
- case 2:
- num_elems_read_per_iteration_x = 20;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- case 9:
- switch(conv_stride_x)
- {
- case 1:
- num_elems_read_per_iteration_x = 16;
- break;
- case 2:
- num_elems_read_per_iteration_x = 24;
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid convolution stride X");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid direct convolution size");
- }
- }
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target)
-{
- const DataLayout data_layout = src->data_layout();
-
- // Get dst shape
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
-
- // Output auto-initialization if not yet initialized
- auto_init_if_empty(*dst, output_shape,
- 1,
- src->data_type(),
- src->quantization_info());
-
- if(data_layout == DataLayout::NHWC)
- {
- const unsigned int vec_size = std::min(static_cast<unsigned int>(dst->tensor_shape()[0]), 4u);
- unsigned int num_rows = 1U;
- if(dst->tensor_shape()[0] > 16)
- {
- num_rows = src->data_type() == DataType::F32 ? 2U : 4U;
- }
-
- // Create window
- Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
- return std::make_pair(Status{}, win);
- }
- else if(data_layout == DataLayout::NCHW)
- {
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int kernel_size = weights->dimension(width_idx);
-
- unsigned int num_elems_read_per_iteration_x = 0;
- unsigned int num_elems_read_per_iteration_y = 0;
- unsigned int num_elems_written_per_iteration_x = 0;
- unsigned int num_elems_written_per_iteration_y = 0;
-
- unsigned int conv_pad_left = conv_info.pad_left();
- unsigned int conv_pad_top = conv_info.pad_top();
- unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- unsigned int conv_stride_y = std::get<1>(conv_info.stride());
-
- setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
- num_elems_written_per_iteration_x, num_elems_written_per_iteration_y,
- kernel_size, conv_info, target, src);
-
- // Create window and update padding
- bool window_changed = false;
- Window win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
- AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y);
- AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size);
- AccessWindowRectangle output_access(dst, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
- window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported");
- }
-}
-
-bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout)
-{
- if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC))
- {
- return false;
- }
-
- // cl_image export is only supported for floating-point data types
- if(!is_data_type_float(tensor->data_type()))
- {
- return false;
- }
-
- if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
- {
- return false;
- }
-
- // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
- if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
- {
- return false;
- }
-
- // Check cl image pitch alignment
- if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0)
- {
- return false;
- }
-
- const size_t image_w = tensor->tensor_shape()[0] / 4;
- const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
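- // Each RGBA texel packs four consecutive elements of dimension 0, hence the
- // divide-by-4 above and the earlier multiple-of-4 requirement; e.g. 64 F32
- // values along dimension 0 occupy an image row of 64 / 4 = 16 texels.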
- const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
- const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
-
- if(image_w > max_image_w || image_h > max_image_h)
- {
- return false;
- }
-
- return true;
-}
-
-} // namespace
-
-BorderSize ClDirectConv2dKernel::border_size() const
-{
- return _border_size;
-}
-
-void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-
- // Perform validation
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info));
-
- const int conv_stride_x = std::get<0>(conv_info.stride());
- const int conv_stride_y = std::get<1>(conv_info.stride());
-
- _data_layout = src->data_layout();
- _conv_info = conv_info;
-
- const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
- const unsigned int kernel_size = weights->dimension(width_idx);
- const DataType data_type = src->data_type();
-
- const GPUTarget gpu_target = get_target();
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, weights, dst, conv_info, gpu_target);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- std::stringstream kernel_name;
- CLBuildOptions build_options;
-
- if(_data_layout == DataLayout::NHWC)
- {
- _border_size = BorderSize();
-
- kernel_name << "direct_convolution_nhwc";
-
- const unsigned int n0 = win_config.second.x().step();
- const unsigned int m0 = win_config.second.y().step();
- const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src->dimension(channel_idx));
- const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0;
- const unsigned int pad_left = conv_info.pad_left();
- const unsigned int pad_top = conv_info.pad_top();
- const bool export_to_cl_image = export_to_cl_image_support(weights, gpu_target, _data_layout);
-
- // Update the padding for the weights tensor if we can export to cl_image
- if(export_to_cl_image)
- {
- gemm::update_padding_for_cl_image(weights);
- }
-
- if(biases != nullptr)
- {
- build_options.add_option(std::string("-DHAS_BIAS"));
- build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
- }
-
- build_options.add_option("-cl-fast-relaxed-math");
- build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER");
- build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx)));
- build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx)));
- build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx)));
- build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_options.add_option("-DDST_TENSOR_TYPE=BUFFER");
- build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx)));
- build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx)));
- build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(channel_idx)));
- build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
- build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
- build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
- build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx)));
- build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type()));
- build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x));
- build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y));
- build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left));
- build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top));
- build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
- build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
- build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
-
- if(is_data_type_quantized(data_type))
- {
- const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
-
- PixelValue zero_value = PixelValue(0, src->data_type(), src->quantization_info());
- int zero_value_s32;
- zero_value.get(zero_value_s32);
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
- build_options.add_option("-DIS_QUANTIZED");
- build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
- build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
- build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
- build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
- build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
- build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
- }
- else
- {
- build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0));
- build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0));
- build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0));
- build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0));
- build_options.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_options.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
- }
- }
- else
- {
- _border_size = BorderSize(src->padding());
-
- kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
-
- build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS"));
-
- const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, conv_stride_x, conv_stride_y, kernel_size, data_type, _data_layout);
-
- if(run_optimized_for_bifrost)
- {
- build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx))));
-
- kernel_name << "_f32_bifrost";
- }
- else
- {
- build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
- build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
- build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx))));
- build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)));
- build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
-
- if(is_data_type_quantized(data_type))
- {
- const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
- build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
- build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
- build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size));
- build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
- build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
- build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
-
- kernel_name.str("direct_convolution_quantized");
- }
- }
- }
-
- _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name.str();
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(data_type));
- _config_id += "_";
- _config_id += support::cpp11::to_string(kernel_size);
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_size().left);
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_size().top);
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_size().right);
- _config_id += "_";
- _config_id += support::cpp11::to_string(border_size().bottom);
- _config_id += "_";
- _config_id += support::cpp11::to_string(conv_stride_x);
- _config_id += "_";
- _config_id += support::cpp11::to_string(conv_stride_y);
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(width_idx));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(height_idx));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(_data_layout));
-}
-
-Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info));
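- // Run the window configuration on clones so the caller's ITensorInfo objects
- // are not mutated by the auto-initialization inside validate_and_configure_window().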
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), weights->clone().get(), dst->clone().get(), conv_info, target).first);
-
- return Status{};
-}
-
-void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- // Get initial windows
- Window slice = window.first_slice_window_3D();
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- if(_data_layout == DataLayout::NHWC)
- {
- cl::Image2D weights_cl_image;
-
- const size_t dim_y_collapsed = ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.y().step());
- const bool export_to_cl_image = export_to_cl_image_support(weights->info(), get_target(), _data_layout);
-
- slice.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, slice.y().step()));
- slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(3), 1));
-
- if(export_to_cl_image)
- {
- const size_t image_w = weights->info()->dimension(0) / 4;
- const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3);
- const TensorShape shape2d(image_w, image_h);
- const size_t image_row_pitch = weights->info()->strides_in_bytes()[1];
-
- // Export cl_buffer to cl_image
- weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch);
- }
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, src, slice);
- add_4D_tensor_argument(idx, dst, slice);
- if(export_to_cl_image)
- {
- _kernel.setArg(idx++, weights_cl_image);
- }
- add_4D_tensor_argument(idx, weights, slice);
- if(biases != nullptr)
- {
- add_1D_tensor_argument(idx, biases, slice);
- }
- enqueue(queue, *this, slice, lws_hint());
- }
- else
- {
- Window win_in = window;
-
- win_in.adjust(Window::DimX, -_conv_info.pad_left(), true);
- win_in.adjust(Window::DimY, -_conv_info.pad_top(), true);
-
- const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- const int conv_stride_x = std::get<0>(_conv_info.stride());
- const int conv_stride_y = std::get<1>(_conv_info.stride());
-
- win_in.set_dimension_step(width_idx, window[width_idx].step() * conv_stride_x);
- win_in.set_dimension_step(height_idx, window[height_idx].step() * conv_stride_y);
-
- Window slice_in = win_in.first_slice_window_3D();
- unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
- add_3D_tensor_argument(idx1, weights, slice);
-
- if(biases != nullptr)
- {
- Window slice_biases;
- slice_biases.use_tensor_dimensions(biases->info()->tensor_shape());
- add_1D_tensor_argument(idx1, biases, slice_biases);
- }
-
- _kernel.setArg(idx1++, static_cast<unsigned int>(weights->info()->strides_in_bytes()[3]));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice_in);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
- }
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
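
The dst shape that validate_arguments() checks above comes from
compute_deep_convolution_shape(). A sketch of the spatial part of that computation
(standard convolution arithmetic; the helper's exact edge-case behaviour is an
assumption here):

#include <cstddef>

// out = floor((in + pad_before + pad_after - kernel) / stride) + 1
inline size_t conv_output_dim(size_t in, size_t kernel, size_t stride, size_t pad_before, size_t pad_after)
{
    return (in + pad_before + pad_after - kernel) / stride + 1;
}

// e.g. a 224-wide input with a 3x3 kernel, stride 2 and 1/1 padding gives
// (224 + 2 - 3) / 2 + 1 = 112 output columns.
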
diff --git a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h
deleted file mode 100644
index e76666fd36..0000000000
--- a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
-#define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the direct convolution kernel. */
-class ClDirectConv2dKernel : public IClKernel
-{
-public:
- ClDirectConv2dKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConv2dKernel);
- /** Set the src, weights, biases and dst tensors info.
- *
- * @note: Due to set_valid_region(), src/weights/biases cannot be const. This needs to change once set_valid_region() is removed.
- *
- * @note: DirectConvolution only works in the following configurations:
- * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
- * 3x3 convolution with stride_x = 1/2, stride_y = 1/2
- * 5x5 convolution with stride_x = 1/2, stride_y = 1/2
- * 9x9 convolution with stride_x = 1/2, stride_y = 1/2
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the src's volume 3rd dimension.
- * Data type supported: Same as @p src.
- * @param[in] biases Biases tensor info. Biases are 1D tensor with dimension [OFM].
- * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
- * @param[out] dst Output tensor info.
- * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p src.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info Contains activation information described in @ref ActivationLayerInfo.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClDirectConv2dKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-public:
- DataLayout _data_layout{};
- BorderSize _border_size{};
- PadStrideInfo _conv_info{};
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H */
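
The @note on configure() above fixes the supported kernel-size/stride combinations.
The same table as a stand-alone predicate (a sketch written for this document, not
an API of the library):

// Per the configure() note: 1x1 supports stride 1/2/3; 3x3, 5x5 and 9x9 support stride 1/2.
inline bool direct_conv2d_supported(unsigned int kernel_size, unsigned int stride)
{
    if(kernel_size == 1)
    {
        return stride >= 1 && stride <= 3;
    }
    if(kernel_size == 3 || kernel_size == 5 || kernel_size == 9)
    {
        return stride == 1 || stride == 2;
    }
    return false;
}
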
diff --git a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp
deleted file mode 100644
index 335ee9c392..0000000000
--- a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp
+++ /dev/null
@@ -1,536 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/common/utils/Validate.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-#include <map>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-constexpr unsigned int vector_size_byte_opencl = 16;
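-// 16-byte accesses give 4 lanes for F32, 8 for F16 and 16 for 8-bit types;
-// adjust_vec_size() later clamps this to the tensor width, and the remainder
-// (dst.dimension(0) % vec_size) is covered by the VEC_SIZE_LEFTOVER build option.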
-
-std::map<ArithmeticOperation, std::string> supported_arithmetic_ops =
-{
- { ArithmeticOperation::ADD, "ADD" },
- { ArithmeticOperation::SUB, "SUB" },
- { ArithmeticOperation::DIV, "DIV" },
- { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" },
- { ArithmeticOperation::MIN, "MIN" },
- { ArithmeticOperation::MAX, "MAX" },
- { ArithmeticOperation::POWER, "POWER" },
- { ArithmeticOperation::PRELU, "PRELU" },
-};
-
-std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops =
-{
- { ArithmeticOperation::ADD, "ADD" },
- { ArithmeticOperation::SUB, "SUB" },
-};
-
-std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
-{
- std::string config_id;
- // Set config_id for enabling LWS tuning
- config_id = kernel_name;
- config_id += "_";
- config_id += lower_string(string_from_data_type(src1.data_type()));
- config_id += "_";
- config_id += support::cpp11::to_string(dst.dimension(0));
- config_id += "_";
- config_id += support::cpp11::to_string(dst.dimension(1));
- return config_id;
-}
-
-Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2);
-
- const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
- "Wrong shape for dst");
- }
-
- return Status{};
-}
-
-Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::F16, DataType::F32, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
-
- const TensorShape out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured dst
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
- "Wrong shape for dst");
- }
-
- return Status{};
-}
-
-Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src2);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
-
- const bool is_quantized = is_data_type_quantized(src1.data_type()) || is_data_type_quantized(src2.data_type());
- if(is_quantized)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2);
-
- if(is_data_type_quantized_symmetric(src1.data_type()))
- {
- const int32_t in1_offset = src1.quantization_info().uniform().offset;
- const int32_t in2_offset = src2.quantization_info().uniform().offset;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_offset != 0, "For quantized symmetric, offset must be zero");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in2_offset != 0, "For quantized symmetric, offset must be zero");
- }
- }
-
- const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((dst.data_type() == DataType::U8) && ((src1.data_type() != DataType::U8) || (src2.data_type() != DataType::U8)),
- "dst can only be U8 if both inputs are U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
- "Wrong shape for dst");
-
- if(is_quantized)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst);
-
- if(is_data_type_quantized_symmetric(dst.data_type()))
- {
- const int32_t offset = dst.quantization_info().uniform().offset;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(offset != 0, "For quantized symmetric, offset must be zero");
- }
- }
- }
- return Status{};
-}
-
-CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const std::string &operation_string)
-{
- CLBuildOptions build_opts;
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
-
- build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1.data_type()));
- build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2.data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst.data_type()));
- build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration));
- build_opts.add_option("-DOP=" + operation_string);
- if(is_data_type_quantized(src1.data_type()))
- {
- const UniformQuantizationInfo iq1info = src1.quantization_info().uniform();
- const UniformQuantizationInfo iq2info = src2.quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = dst.quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(iq1info.offset));
- build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(iq2info.offset));
- build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(oqinfo.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1info.scale));
- build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2info.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
- }
- build_opts.add_option_if(src1.data_type() == DataType::S32, "-DS32");
-
- return build_opts;
-}
-
-std::pair<Status, Window> configure_window_arithmetic_common(ITensorInfo &dst)
-{
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
- Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration));
- return std::make_pair(Status{}, win);
-}
-
-std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
-{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
- const TensorShape &out_shape = broadcast_pair.first;
-
- set_shape_if_empty(dst, out_shape);
-
- if(src1.data_type() == DataType::S16 || src2.data_type() == DataType::S16)
- {
- set_format_if_unknown(dst, Format::S16);
- }
- else if(src1.data_type() == DataType::F16 || src2.data_type() == DataType::F16)
- {
- set_format_if_unknown(dst, Format::F16);
- }
- else if(src1.data_type() == DataType::F32 || src2.data_type() == DataType::F32)
- {
- set_format_if_unknown(dst, Format::F32);
- }
- else if(src1.data_type() == DataType::QASYMM8 || src2.data_type() == DataType::QASYMM8)
- {
- set_data_type_if_unknown(dst, DataType::QASYMM8);
- }
- else if(src1.data_type() == DataType::QASYMM8_SIGNED || src2.data_type() == DataType::QASYMM8_SIGNED)
- {
- set_data_type_if_unknown(dst, DataType::QASYMM8_SIGNED);
- }
- else if(src1.data_type() == DataType::QSYMM16 || src2.data_type() == DataType::QSYMM16)
- {
- set_data_type_if_unknown(dst, DataType::QSYMM16);
- }
-
- return configure_window_arithmetic_common(dst);
-}
-
-std::pair<Status, Window> validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
-{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
- const TensorShape &out_shape = broadcast_pair.first;
-
- set_shape_if_empty(dst, out_shape);
- set_data_type_if_unknown(dst, DataType::U8);
-
- // The arithmetic utility functions can be shared
- return configure_window_arithmetic_common(dst);
-}
-
-std::pair<Status, Window> validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
-{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
- const TensorShape &out_shape = broadcast_pair.first;
- auto_init_if_empty(dst, out_shape, 1, src1.data_type());
- return configure_window_arithmetic_common(dst);
-}
-} // namespace
-
-void ClElementwiseKernel::configure_common(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
-{
- configure_common(CLKernelLibrary::get().get_compile_context(), src1, src2, dst);
-}
-
-void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
-{
- // Configure kernel window
- auto win_config = validate_and_configure_window(*src1, *src2, *dst);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- _src1 = src1;
- _src2 = src2;
- _dst = dst;
-
- std::string kernel_name = "elementwise_operation_" + name();
- if(is_data_type_quantized(src1->data_type()))
- {
- kernel_name += "_quantized";
- }
-
- // Set kernel build options
- CLBuildOptions build_opts = generate_build_options(*src1, *src2, *dst);
- if(_act_info.enabled())
- {
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation())));
- build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(_act_info.a()));
- build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(_act_info.b()));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- ICLKernel::configure_internal(win_config.second);
-
- _config_id = generate_id_for_tuning(kernel_name, *src1, *dst);
-}
-
-void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- const TensorShape &in_shape1 = src_0->info()->tensor_shape();
- const TensorShape &in_shape2 = src_1->info()->tensor_shape();
- const TensorShape &out_shape = dst->info()->tensor_shape();
-
- bool can_collapse = true;
- const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
- {
- can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
- const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_src1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_src2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
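- // broadcast_if_dimension_le_one() pins a source window to a single step on any
- // dimension where that input has extent 1, so e.g. for src1 of shape [8, 4] and
- // src2 of shape [8, 1] the single row of src2 is re-read for every y of dst.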
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src_0, slice_src1);
- add_3D_tensor_argument(idx, src_1, slice_src2);
- add_3D_tensor_argument(idx, dst, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1));
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src2));
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-
-/** Logical binary */
-
-void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_ERROR_THROW_ON(ClLogicalBinaryKernel::validate(op, src1, src2, dst));
- _op = op;
- configure_common(compile_context, src1, src2, dst);
-}
-
-Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
-{
- ARM_COMPUTE_UNUSED(op);
- ARM_COMPUTE_ASSERT(op != LogicalOperation::Unknown && op != LogicalOperation::Not);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
-
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()).first);
-
- return Status{};
-}
-
-std::string ClLogicalBinaryKernel::name()
-{
- switch(_op)
- {
- case LogicalOperation::And:
- return "AND";
- case LogicalOperation::Or:
- return "OR";
- case LogicalOperation::Not:
- /* fall through */
- default:
- ARM_COMPUTE_ASSERT(false); // Unsupported logical operation
- }
- return "";
-}
-
-std::pair<Status, Window> ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
-{
- return validate_and_configure_window_for_logical_binary_operators(src1, src2, dst);
-}
-
-CLBuildOptions ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
-{
- // The arithmetic utility functions can be shared
- return generate_build_options_with_arithmetic_rules(src1, src2, dst, name());
-}
-
-std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
-{
- return generate_id_for_tuning_common(kernel_name, src1, dst);
-}
-
-/** Arithmetic operations with saturation */
-void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
- const ConvertPolicy &policy,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(op, input1, input2, output, policy, act_info));
- auto padding_info = get_padding_info({ input1, input2, output });
-
- _policy = policy;
- _op = op;
- _act_info = act_info;
- configure_common(compile_context, input1, input2, output);
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(op, policy);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first);
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type()));
-
- return Status{};
-}
-
-std::pair<Status, Window> ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
-{
- return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
-}
-
-CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
-{
- const bool has_float_out = is_data_type_float(output.data_type());
- auto build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name());
- build_options.add_option((_policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
- return build_options;
-}
-
-std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
-{
- auto config_id = generate_id_for_tuning_common(kernel_name, input1, output);
- config_id += (_policy == ConvertPolicy::WRAP) ? "_wrap_" : "_saturate_";
- config_id += lower_string(string_from_data_layout(input1.data_layout()));
- return config_id;
-}
-
-std::string ClSaturatedArithmeticKernel::name()
-{
- return supported_sat_arithmetic_ops[_op];
-}
-
-/** Arithmetic operations */
-void ClArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_ERROR_THROW_ON(ClArithmeticKernel::validate(op, src1, src2, dst, act_info));
- auto padding_info = get_padding_info({ src1, src2, dst });
-
- _op = op;
- _act_info = act_info;
- configure_common(compile_context, src1, src2, dst);
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
- if(op == ArithmeticOperation::DIV)
- {
- // Partial integer support S32/F32/F16
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_divide_operation(src1, src2, dst));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
- }
- else if(op == ArithmeticOperation::POWER)
- {
- // The power operator doesn't support integer arithmetic
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_float_only_supported_rules(*src1, *src2, *dst));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()).first);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
-
- return Status{};
-}
-std::pair<Status, Window> ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
-{
- if(_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER)
- {
- // Division and Power operators don't support integer arithmetic
- return validate_and_configure_window_for_division(src1, src2, dst);
- }
- else
- {
- return validate_and_configure_window_for_arithmetic_operators(src1, src2, dst);
- }
-}
-
-CLBuildOptions ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
-{
- return generate_build_options_with_arithmetic_rules(src1, src2, dst, name());
-}
-std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
-{
- return generate_id_for_tuning_common(kernel_name, src1, dst);
-}
-
-std::string ClArithmeticKernel::name()
-{
- return supported_arithmetic_ops[_op];
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
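
The OFFSET_IN*/SCALE_IN*/OFFSET_OUT/SCALE_OUT build options emitted by
generate_build_options_with_arithmetic_rules() parameterize the usual
dequantize-operate-requantize scheme. A scalar sketch for a saturated QASYMM8 ADD
(illustrative; the OpenCL kernel performs the same arithmetic on vectors):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t qasymm8_add_saturated(uint8_t a, uint8_t b,
                                     float scale1, int32_t offset1,
                                     float scale2, int32_t offset2,
                                     float scale_out, int32_t offset_out)
{
    // Dequantize both operands, add in float, then requantize with saturation.
    const float fa = scale1 * (static_cast<int32_t>(a) - offset1);
    const float fb = scale2 * (static_cast<int32_t>(b) - offset2);
    const int   q  = static_cast<int>(std::lround((fa + fb) / scale_out)) + offset_out;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}
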
diff --git a/src/core/gpu/cl/kernels/ClElementwiseKernel.h b/src/core/gpu/cl/kernels/ClElementwiseKernel.h
deleted file mode 100644
index 4ed8ae73ab..0000000000
--- a/src/core/gpu/cl/kernels/ClElementwiseKernel.h
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H
-#define ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H
-
-#include "src/core/KernelTypes.h"
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for an element-wise operation kernel
- *
- * Element-wise operation is computed by:
- * @f[ dst(x,y) = OP(src1(x,y), src2(x,y))@f]
- *
- */
-class ClElementwiseKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClElementwiseKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementwiseKernel);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
-
-protected:
- /** The name of the operation */
- virtual std::string name() = 0;
-
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src1 First source tensor info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
- *
- * @return a pair of Status and Window
- */
- virtual std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0;
-
- /** Generate the build options for the specific kernel
- *
- * @return a CLBuildOptions object
- */
- virtual CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0;
-
- /** Generate the identifier for tuning
- *
- * @return a string
- */
- virtual std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0;
-
- /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff)
- *
- */
- void configure_common(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
- /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff), using the provided compile context
- *
- */
- void configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
-
- ActivationLayerInfo _act_info{};
-
-private:
- const ITensorInfo *_src1{ nullptr }; /**< Source tensor info 1 */
- const ITensorInfo *_src2{ nullptr }; /**< Source tensor info 2 */
- ITensorInfo *_dst{ nullptr }; /**< Destination tensor info */
-};
-
-class ClLogicalBinaryKernel : public ClElementwiseKernel
-{
-public:
- /** Default constructor */
- ClLogicalBinaryKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogicalBinaryKernel);
- /** Function to configure kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] op Logical binary operation to be executed.
- * @param[in] src1 First source tensor info. Data types supported: U8.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
- */
- void configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
- /** Static function to check if the given configuration is valid for this kernel
- *
- * @param[in] op Logical binary operation to be executed.
- * @param[in] src1 First source tensor info. Data types supported: U8.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
- */
- static Status validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
-
-private:
- // Inherited methods overridden:
- std::string name() override;
- std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
- CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
- std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
-
- LogicalOperation _op{ LogicalOperation::Unknown };
-};
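A hedged usage sketch for the class above: validating a logical AND on U8 tensor infos (shapes are illustrative, not from the patch):

    using namespace arm_compute;
    TensorInfo x(TensorShape(8U, 8U), 1, DataType::U8);
    TensorInfo y(TensorShape(8U, 8U), 1, DataType::U8);
    TensorInfo z(TensorShape(8U, 8U), 1, DataType::U8);
    const Status st = opencl::kernels::ClLogicalBinaryKernel::validate(
        LogicalOperation::And, &x, &y, &z);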
-
-/** Addition operation */
-class ClSaturatedArithmeticKernel : public ClElementwiseKernel
-{
-public:
- ClSaturatedArithmeticKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClSaturatedArithmeticKernel);
- /** Function to configure kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
- * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
- * @param[in] output Output tensor info. Data types supported: Same as @p input1.
- * @param[in] policy Policy to use to handle overflow.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- /** Static function to check if given info will lead to a valid configuration of @ref ClSaturatedArithmeticKernel
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
- * @param[in] input2   Second tensor input info. Data types supported: Same as @p input1.
- * @param[in] output   Output tensor info. Data types supported: Same as @p input1.
- * @param[in] policy Policy to use to handle overflow.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a Status
- */
- static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-protected:
- // Inherited methods overridden:
- std::string name() override;
- std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
- CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
- std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override;
-
-private:
- ConvertPolicy _policy{};
- ArithmeticOperation _op{};
-};
-
-class ClArithmeticKernel : public ClElementwiseKernel
-{
-public:
- ClArithmeticKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClArithmeticKernel);
-
- /** Function to configure kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- /** Static function to check if given info will lead to a valid configuration of @ref ClArithmeticKernel
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a Status
- */
- static Status validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-protected:
- // Inherited methods overridden:
- std::string name() override;
- std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
- CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
- std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
-
-private:
- ArithmeticOperation _op{};
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H */
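To make the template-method contract above concrete, here is a hypothetical minimal subclass; the class name is invented, and the two helper calls mirror the file-local helpers the deleted ClElementwiseKernel.cpp routed through (WindowHelpers include assumed):

    // Minimal sketch of a ClElementwiseKernel subclass (illustrative only).
    class ClMinimalElementwiseKernel : public ClElementwiseKernel
    {
    protected:
        std::string name() override
        {
            return "minimal_op"; // hypothetical operation name
        }
        std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override
        {
            ARM_COMPUTE_UNUSED(src1, src2);
            return std::make_pair(Status{}, calculate_max_window(dst));
        }
        CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override
        {
            // The deleted .cpp delegated to this shared helper.
            return generate_build_options_with_arithmetic_rules(src1, src2, dst, name());
        }
        std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override
        {
            return generate_id_for_tuning_common(kernel_name, src1, dst);
        }
    };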
diff --git a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
deleted file mode 100644
index 5cbb3f2e38..0000000000
--- a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
- if(op == ElementWiseUnary::LOGICAL_NOT)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8);
- }
- else if(op == ElementWiseUnary::NEG)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32);
- }
-
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
- }
-
- return Status{};
-}
-} // namespace
-
-void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- auto padding_info = get_padding_info({ src, dst });
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src, *dst, op));
-
- const std::string kernel_name = "elementwise_unary";
- const int vec_size_x = 16 / dst->element_size();
- const int dst_width_x = dst->tensor_shape().x();
- const bool multi_access_x = (dst_width_x / vec_size_x > 0);
-
- // Set kernel build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
- switch(op)
- {
- case ElementWiseUnary::RSQRT:
- build_opts.add_option("-DOPERATION=rsqrt_op");
- break;
- case ElementWiseUnary::EXP:
- build_opts.add_option("-DOPERATION=exp_op");
- break;
- case ElementWiseUnary::NEG:
- build_opts.add_option("-DOPERATION=neg_op");
- break;
- case ElementWiseUnary::SIN:
- build_opts.add_option("-DOPERATION=sin_op");
- break;
- case ElementWiseUnary::ABS:
- build_opts.add_option("-DOPERATION=fabs_op");
- break;
- case ElementWiseUnary::LOG:
- build_opts.add_option("-DOPERATION=natural_log_op");
- break;
- case ElementWiseUnary::ROUND:
- build_opts.add_option("-DOPERATION=round_op");
- break;
- case ElementWiseUnary::LOGICAL_NOT:
- build_opts.add_option("-DOPERATION=logical_not_op");
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*dst);
- if(multi_access_x)
- {
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
- }
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClElementWiseUnaryKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ElementWiseUnary &op)
-{
- ARM_COMPUTE_UNUSED(op);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src, *dst, op));
-
- return Status{};
-}
-
-void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
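The vectorization arithmetic in configure() is worth a worked example; the numbers below are illustrative:

    // For an F32 destination of width 20 (<algorithm> assumed for std::max):
    const int  element_size    = 4;                                     // sizeof(float)
    const int  vec_size_x      = 16 / element_size;                     // 4
    const int  dst_width_x     = 20;
    const bool multi_access_x  = (dst_width_x / vec_size_x > 0);        // true
    const int  last_accessed_x = std::max(dst_width_x - vec_size_x, 0); // 16
    // The final 16-byte access covers elements 16..19, so it stays inside
    // the tensor and no output padding is required.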
diff --git a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h b/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h
deleted file mode 100644
index 7e5edef3ee..0000000000
--- a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H
-#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the elementwise unary operator */
-class ClElementWiseUnaryKernel : public IClKernel
-{
-public:
- ClElementWiseUnaryKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementWiseUnaryKernel);
- /** Initialise the kernel's src and dst.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32 (NEG also supports S32; LOGICAL_NOT requires U8).
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- * @param[in] op Element wise unary operation to perform.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op);
- /** Static function to check if given info will lead to a valid configuration of @ref ClElementWiseUnaryKernel
- *
- * @param[in] src Source tensor info. Data types supported: F16/F32 (NEG also supports S32; LOGICAL_NOT requires U8).
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- * @param[in] op Element wise unary operation to perform.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ElementWiseUnary &op);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H */
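A hedged end-to-end sketch of the interface above; "src", "dst" and "queue" are assumed to be an allocated source/destination ICLTensor and a valid cl::CommandQueue:

    using namespace arm_compute;
    opencl::kernels::ClElementWiseUnaryKernel kernel;
    kernel.configure(CLKernelLibrary::get().get_compile_context(),
                     src.info(), dst.info(), ElementWiseUnary::RSQRT);
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    kernel.run_op(pack, kernel.window(), queue);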
diff --git a/src/core/gpu/cl/kernels/ClFillKernel.cpp b/src/core/gpu/cl/kernels/ClFillKernel.cpp
deleted file mode 100644
index b194ee549b..0000000000
--- a/src/core/gpu/cl/kernels/ClFillKernel.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClFillKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-void ClFillKernel::configure(ITensorInfo *tensor,
- const PixelValue &constant_value,
- Window *window)
-{
- configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, window);
-}
-
-void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor,
- const PixelValue &constant_value,
- Window *window)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
- ARM_COMPUTE_ERROR_THROW_ON(validate(tensor, constant_value, window));
-
- const DataType data_type = tensor->data_type();
- const int vec_size_x = 16 / tensor->element_size();
-
- // Create and update the window (if needed)
- _full_window = calculate_max_window(*tensor);
- Window win = _full_window;
- if(window != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window);
- win = *window;
- }
-
- const int output_width_x = win.num_iterations(0);
- const bool multi_access_x = output_width_x >= vec_size_x;
- const bool remainder_x = output_width_x % vec_size_x > 0;
-
- if(multi_access_x)
- {
- win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
- }
- ICLKernel::configure_internal(win);
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
- build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
- _kernel = create_kernel(compile_context, "memset", build_opts.options());
-}
-
-Status ClFillKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window)
-{
- ARM_COMPUTE_UNUSED(tensor);
- ARM_COMPUTE_UNUSED(constant_value);
- if(window != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1);
- }
- return Status{};
-}
-
-void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-
- // Collapse all the batches on the third dimension
- Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, tensor, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
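Illustrative numbers for the windowing logic in configure() above:

    // Filling an F16 row of width 10:
    const int  element_size   = 2;                                // sizeof(cl_half)
    const int  vec_size_x     = 16 / element_size;                // 8
    const int  output_width_x = 10;
    const bool multi_access_x = output_width_x >= vec_size_x;     // true
    const bool remainder_x    = output_width_x % vec_size_x > 0;  // true (10 % 8 == 2)
    // LAST_ACCESSED_X = max(10 - 8, 0) = 2: the last vector store is
    // shifted back so it ends exactly at the tensor boundary.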
diff --git a/src/core/gpu/cl/kernels/ClFillKernel.h b/src/core/gpu/cl/kernels/ClFillKernel.h
deleted file mode 100644
index b439eac0de..0000000000
--- a/src/core/gpu/cl/kernels/ClFillKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_FILL_KERNEL_H
-#define ARM_COMPUTE_CL_FILL_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for filling the planes of a tensor */
-class ClFillKernel : public IClKernel
-{
-public:
- ClFillKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFillKernel);
- /** Initialise the kernel's tensor and filling value
- *
- * @param[in,out] tensor Input tensor info. Supported data types: All.
- * @param[in] constant_value The value used to fill the planes of the tensor
- * @param[in] window Window to be used when setting only part of the tensor. Default is nullptr.
- */
- void configure(ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
- /** Initialise the kernel's tensor and filling value
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] tensor Input tensor info. Supported data types: All.
- * @param[in] constant_value The value used to fill the planes of the tensor
- * @param[in] window Window to be used when setting only part of the tensor. Default is nullptr.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref ClFillKernel
- *
- * @param[in] tensor Source tensor info. Data types supported: All.
- * @param[in] constant_value The value used to fill the planes of the tensor
- * @param[in] window Window to be used when setting only part of the tensor. Default is nullptr.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- Window _full_window{};
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_FILL_KERNEL_H */
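A hedged usage sketch of the removed interface; "tensor" is assumed to be an allocated CLTensor:

    using namespace arm_compute;
    opencl::kernels::ClFillKernel fill;
    // Fill the whole tensor with zeros; pass a Window* to fill a sub-region.
    fill.configure(tensor.info(), PixelValue(0.f));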
diff --git a/src/core/gpu/cl/kernels/ClFloorKernel.cpp b/src/core/gpu/cl/kernels/ClFloorKernel.cpp
deleted file mode 100644
index 7296d40eaf..0000000000
--- a/src/core/gpu/cl/kernels/ClFloorKernel.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClFloorKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
-
- // Validate in case of configured output
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
-
- return Status{};
-}
-} // namespace
-
-void ClFloorKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Auto initialize output
- auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type());
-
- // Validate
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
- auto padding_info = get_padding_info({ src, dst });
-
- const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0));
- const int vec_size_x_leftovers = src->dimension(0) % vec_size_x;
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers));
-
- // Create kernel
- _kernel = create_kernel(compile_context, "floor_layer", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps(vec_size_x));
- IClKernel::configure_internal(win);
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClFloorKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
- return Status{};
-}
-
-void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
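Worked numbers for the vector-size options in configure() above, assuming a max_cl_vector_width of 16 bytes:

    // For an F32 source with dimension(0) == 18:
    const unsigned int vec_size_x          = adjust_vec_size(16 / sizeof(float), 18); // 4
    const unsigned int vec_size_x_leftover = 18 % vec_size_x;                         // 2
    // Four elements are processed per work-item; the compiled-in
    // VEC_SIZE_LEFTOVER handles the trailing two without tensor padding.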
diff --git a/src/core/gpu/cl/kernels/ClFloorKernel.h b/src/core/gpu/cl/kernels/ClFloorKernel.h
deleted file mode 100644
index 646dfb30d8..0000000000
--- a/src/core/gpu/cl/kernels/ClFloorKernel.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_FLOOR_KERNEL_H
-#define ARM_COMPUTE_CL_FLOOR_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** OpenCL kernel to perform a floor operation */
-class ClFloorKernel : public IClKernel
-{
-public:
- ClFloorKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFloorKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data type supported: F16/F32.
- * @param[out] dst Destination tensor info. Same as @p src
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref ClFloorKernel
- *
- * @param[in] src Source tensor info. Data type supported: F16/F32.
- * @param[in] dst Destination tensor info. Same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_FLOOR_KERNEL_H */
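A hedged validate-then-configure sketch for the interface above; shapes are illustrative, and dst is left empty so configure() can auto-initialize it:

    using namespace arm_compute;
    TensorInfo src(TensorShape(18U, 3U), 1, DataType::F32);
    TensorInfo dst; // empty: auto-initialized during configure()
    const Status st = opencl::kernels::ClFloorKernel::validate(&src, &dst);
    opencl::kernels::ClFloorKernel floor_kernel;
    floor_kernel.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst);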
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp
deleted file mode 100644
index 817a105b14..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp
+++ /dev/null
@@ -1,533 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-inline Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float beta,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (src0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The src1 tensor cannot have more than 2 dimensions if src0 has to be reinterpreted as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((reshape_info.reinterpret_input_as_3d() || reshape_info.depth_output_gemm3d() != 0) && (src2 != nullptr)
- && (!reshape_info.broadcast_bias()),
- "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
-
- if(!is_interleaved_transposed)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != src1->dimension(1));
-
- if(src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int m = reshape_info.reinterpret_input_as_3d() ? src0->dimension(1) * src0->dimension(2) : src0->dimension(1);
- const unsigned int n = src1->dimension(0);
- const unsigned int src2_dim0 = src2->dimension(0);
- const unsigned int src2_dim1 = src2->dimension(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
- if(reshape_info.broadcast_bias())
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
- }
- else
- {
- GEMMRHSMatrixInfo rhs_info;
- GEMMLHSMatrixInfo lhs_info;
- const auto m = static_cast<unsigned int>(reshape_info.m());
- const auto n = static_cast<unsigned int>(reshape_info.n());
- const int k = reshape_info.k();
- const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
- const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
- rhs_info.n0 = max_cl_vector_width / src1->element_size();
- rhs_info.k0 = 1;
- rhs_info.h0 = mult_transpose1xW_width;
- rhs_info.interleave = false;
- rhs_info.transpose = false;
- lhs_info.m0 = 4;
- lhs_info.k0 = 4;
- lhs_info.v0 = mult_interleave4x4_height;
- lhs_info.interleave = true;
- lhs_info.transpose = true;
-
- TensorShape tensor_shape0{ src0->tensor_shape() };
- tensor_shape0.set(0, k);
- tensor_shape0.set(1, m);
-
- TensorShape tensor_shape1{ src1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
- const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
-
- const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info));
- const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
-
- if(src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int src2_dim0 = src2->dimension(0);
- const unsigned int src2_dim1 = src2->dimension(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
- if(reshape_info.broadcast_bias())
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
- }
-
- if(dst->total_size() != 0)
- {
- const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, is_interleaved_transposed, reshape_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
- }
-
- return Status{};
-}
-
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
- float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
- ElementsProcessed &num_elements_processed)
-{
- ARM_COMPUTE_UNUSED(beta);
- bool window_changed = false;
- Window win{};
- Window win_out{};
-
- const DataType data_type = src0->data_type();
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
- bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
-
- // In case both input and dst have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_input_as_3d = false;
- reinterpret_output_as_3d = false;
- }
-
- // dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, is_interleaved_transposed, reshape_info)));
-
- TensorInfo tmp_info(*dst);
-
- if(reinterpret_output_as_3d)
- {
- // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(dst->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- if(is_interleaved_transposed)
- {
- // reinterpret_input_as_3d is not supported if is_interleaved_transposed is set
- ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d());
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
- num_elems_processed_per_iteration_y = 4;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- if(src2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
- const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y;
-
- AccessWindowStatic src2_access(src2, 0, 0,
- ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
- ceil_to_multiple(src2->dimension(1), bias_processed_per_iteration_y));
-
- window_changed = update_window_and_padding(win, src2_access); // window used by the execute_window_loop
- }
- }
- else // The input tensors have not been reshaped
- {
- // Special case for 1xN, 2xN, 3xN and 4xN src0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
- num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(dst->dimension(1)), 4);
-
- // Create kernels according to the architecture, data type and input size.
- GPUTarget arch_target = get_arch_from_target(gpu_target);
- if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
- {
- num_elems_processed_per_iteration_x = (src1->dimension(0) <= 1000 && src0->num_dimensions() == 1) ? 2 : 4;
- }
-
- // Configure window
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowStatic src0_access(src0, 0, 0, src0->dimension(0), src0->dimension(1));
- AccessWindowStatic src1_access(src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1));
- AccessWindowStatic dst_access(dst, 0, 0,
- dst->dimension(0),
- dst->dimension(1));
-
- if(src2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
- AccessWindowStatic src2_access(src2, 0, 0,
- ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
- src2->dimension(1));
-
- window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor
- }
- else
- {
- window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor
- }
- }
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-void ClGemmMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha,
- float beta,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, beta,
- is_interleaved_transposed, reshape_info, fp_mixed_precision));
-
- auto padding_info = is_interleaved_transposed ? get_padding_info({ src0, src1, dst }) : get_padding_info({ src0, dst });
-
- _reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
- _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
- _add_bias = src2 != nullptr;
-
- // In case both input and dst have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_src0 = _reinterpret_input_as_3d ? src0->num_dimensions() - 1 : src0->num_dimensions();
-
- _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0);
-
- const DataType data_type = src0->data_type();
-
- // Get target architecture
- GPUTarget gpu_target = get_target();
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src0, src1, src2, dst, beta, is_interleaved_transposed, reshape_info,
- gpu_target, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // If both _reinterpret_input_as_3d and _reinterpret_output_as_3d are true, both are turned off (false),
- // in which case we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
- // This means that the actual m used by the kernel is given by dst->dimension(1)
- const unsigned int internal_m = _reinterpret_output_as_3d ? dst->dimension(1) * dst->dimension(2) : dst->dimension(1);
- const unsigned int n = dst->dimension(0);
-
- const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1);
- const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2);
-
- const unsigned int m0 = num_elements_processed.y();
- const unsigned int n0 = num_elements_processed.x();
-
- // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
- const unsigned int partial_store_m0 = internal_m % m0;
- const unsigned int partial_store_n0 = n % n0;
-
- // Create build options
- CLBuildOptions build_opts;
-
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
- build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
- build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
- build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation())));
- build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a()));
- build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b()));
- build_opts.add_option("-DIN1_DIM_X=" + support::cpp11::to_string(src1->dimension(0)));
-
- const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
-
- std::string kernel_name;
- if(is_interleaved_transposed)
- {
- const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
- const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
-
- build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(src1->dimension(0) / (n0 * mult_transpose1xW_width)));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(mult_transpose1xW_width));
- build_opts.add_option("-DV0=" + support::cpp11::to_string(mult_interleave4x4_height));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-
- if(is_data_type_float(data_type) && is_bifrost)
- {
- kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
- }
- else
- {
- kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
- if(fp_mixed_precision && data_type == DataType::F16)
- {
- // currently wider accumulator is only supported for fp16 kernels.
- kernel_name += "_acc32";
- }
- }
- }
- else // The input tensors have not been reshaped
- {
- build_opts.add_option("-DN=" + support::cpp11::to_string(n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(src0->dimension(0)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-
- // Create kernels according to the architecture, data type and input size.
- if(is_data_type_float(data_type) && is_bifrost)
- {
- kernel_name = "gemm_mm_floating_point";
-
- if(src0->num_dimensions() != 1)
- {
- kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
- if(fp_mixed_precision && data_type == DataType::F16)
- {
- // currently wider accumulator is only supported for fp16 kernels.
- kernel_name += "_acc32";
- }
- }
- else if(src1->dimension(0) <= 1000 && data_type == DataType::F32)
- {
- // The first kernel is optimized for the case of 1000 or less dst elements (e.g. FC8 of AlexNet and VGG-16, and
- // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 dst elements (e.g.
- // FC6 and FC7 of AlexNet and VGG-16).
- kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
- }
-
- // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
- // via exhaustive autotuning over a range of representative layer configurations.
- set_lws_hint(cl::NDRange(4));
- }
- else // (MIDGARD and F32) or (F16)
- {
- kernel_name = "gemm_mm_floating_point";
- }
- }
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = "gemm_";
- _config_id += (is_interleaved_transposed ? "reshaped_" : "");
- _config_id += (_add_bias ? "add_bias_" : "");
- _config_id += (reshape_info.broadcast_bias() ? "broadcast_bias_" : "");
- _config_id += (fp_mixed_precision ? "fp_mixed_" : "");
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += lower_string(string_from_data_type(src0->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(3));
- _config_id += "_";
- _config_id += (is_interleaved_transposed ? support::cpp11::to_string(src1->dimension(0)) : support::cpp11::to_string(src1->dimension(1)));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClGemmMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
-{
- // Note: num_elements_processed will be set in validate_and_configure_window()
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(activation_info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
- src1->clone().get(),
- (src2 != nullptr) ? src2->clone().get() : nullptr,
- dst->clone().get(),
- beta,
- is_interleaved_transposed,
- reshape_info,
- gpu_target,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void ClGemmMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
-
- if(src1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- const unsigned int num_arguments_bias = _add_bias ? num_arguments_per_2D_tensor() + 1 : 0;
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_bias;
- const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_bias;
- const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, src0, slice);
- add_2D_tensor_argument(idx, src1, slice_b);
- if(_add_bias)
- {
- add_2D_tensor_argument(idx, src2, slice);
- }
- add_2D_tensor_argument(idx, dst, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
- if(_add_bias)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
- }
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
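
[Editor's note] The idx0 arithmetic in run_op above follows from the argument layout: per slice, the kernel receives one argument block per 2D tensor, then one stride_z per tensor, and only after those the optional cross-plane paddings. A minimal sketch of that bookkeeping, assuming num_arguments_per_2D_tensor() evaluates to 6 (one buffer argument, a stride/step pair per dimension, plus the first-element offset), as ICLKernel defines it:

#include <cstdio>

// Mirror of ICLKernel::num_arguments_per_2D_tensor() under the stated assumption:
// 2 * 2 stride/step arguments + buffer + first-element offset = 6 for a 2D tensor.
constexpr unsigned int num_arguments_per_2d_tensor() { return 2 * 2 + 2; }

int main()
{
    const bool add_bias = true;

    // src0, src1 and dst always contribute one argument block and one stride_z each;
    // the bias adds another block plus its own stride_z when present.
    const unsigned int num_arguments_bias = add_bias ? num_arguments_per_2d_tensor() + 1 : 0;
    const unsigned int idx_input_pad      = 3 * num_arguments_per_2d_tensor() + 3 + num_arguments_bias;
    const unsigned int idx_output_pad     = idx_input_pad + 1; // when the input pad argument is also set

    std::printf("input cross-plane pad -> arg %u, output -> arg %u\n", idx_input_pad, idx_output_pad);
    return 0;
}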
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h
deleted file mode 100644
index c1601335ee..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** OpenCL kernel to multiply two input matrices "A" and "B" and add a matrix "C" if provided. All elements of the output matrix will be multiplied by alpha. In case matrix C is passed, it will be added to the previous result.
- * For the matrix C, broadcast addition is supported if the flag "broadcast_bias" is set in the GEMMReshapeInfo object
- *
- * @note If the input tensors @p src0 and @p src1 have been reshaped respectively with @ref ClGemmReshapeLhsMatrixKernel and @ref ClGemmReshapeRhsMatrixKernel,
- * the flag @p is_interleaved_transposed must be set to true
- *
- * @attention @p src1 tensor must have at least 2 dimensions (matrix)
- */
-class ClGemmMatrixMultiplyKernel : public IClKernel
-{
-public:
- ClGemmMatrixMultiplyKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyKernel);
- /** Initialise the kernel's input, output and alpha
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src0 Input tensor containing the Matrix A. Data types supported: F16/F32
- * @param[in] src1 Input tensor containing the Matrix B. Data type supported: same as @p src0
- * @param[in] src2 Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p src0
- * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta (Optional) Weight of matrix C. Default value is 0. Only beta = 1 is currently supported.
- * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref ClGemmReshapeLhsMatrixKernel and @ref ClGemmReshapeRhsMatrixKernel
- * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
- * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
- * @param[in] activation_info (Optional) Activation to apply after the matrix multiplication
- *
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta = 0.f,
- bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClGemmMatrixMultiplyKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-public:
- bool _slide_matrix_b{ true };
- bool _reinterpret_input_as_3d{ false };
- bool _reinterpret_output_as_3d{ false };
- bool _add_bias{ false };
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H */
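
[Editor's note] For reference, this is how the removed class was meant to be driven: the static validate() mirrors configure() and is checked first. A hedged usage sketch against the pre-move paths shown in this diff; the tensor infos, alpha and GPU target are assumed to come from the caller:

// Hedged usage sketch (not code from this patch): gate configure() on validate().
#include "arm_compute/core/GPUTarget.h"
#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h"

arm_compute::Status try_configure(arm_compute::opencl::kernels::ClGemmMatrixMultiplyKernel &kernel,
                                  const arm_compute::CLCompileContext &ctx,
                                  arm_compute::ITensorInfo *a, arm_compute::ITensorInfo *b,
                                  arm_compute::ITensorInfo *dst, float alpha, arm_compute::GPUTarget target)
{
    using Kernel = arm_compute::opencl::kernels::ClGemmMatrixMultiplyKernel;
    // No bias, beta = 0, inputs not reshaped: is_interleaved_transposed = false.
    const arm_compute::Status status = Kernel::validate(a, b, nullptr, dst, alpha, 0.f,
                                                        false, arm_compute::GEMMReshapeInfo(), target);
    if(status.error_code() == arm_compute::ErrorCode::OK)
    {
        kernel.configure(ctx, a, b, nullptr, dst, alpha, 0.f, false, arm_compute::GEMMReshapeInfo());
    }
    return status;
}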
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
deleted file mode 100644
index 97d64c433c..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
+++ /dev/null
@@ -1,411 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr)
- && (!gemm_info.broadcast_bias),
- "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native");
-
- const unsigned int m = gemm_info.m;
- const unsigned int n = gemm_info.n;
- const unsigned int k = gemm_info.k;
-
- ARM_COMPUTE_UNUSED(m);
- ARM_COMPUTE_UNUSED(n);
- ARM_COMPUTE_UNUSED(k);
-
- ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
- ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != n);
- ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != k);
- if(gemm_info.reinterpret_input_as_3d)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m);
- }
-
- if(src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int src2_dim0 = src2->dimension(0);
- const unsigned int src2_dim1 = src2->dimension(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
- if(gemm_info.broadcast_bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
-
- if(dst->total_size() != 0)
- {
- const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and dst have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_output_as_3d = false;
- }
-
- // dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
-
- TensorInfo tmp_info(*dst);
-
- if(reinterpret_output_as_3d)
- {
- // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(dst->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic src0_access(src0, 0, 0,
- src0->dimension(0),
- src0->dimension(1));
- AccessWindowStatic src1_access(src1, 0, 0,
- ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x),
- src1->dimension(1));
- AccessWindowStatic dst_access(dst, 0, 0,
- dst->dimension(0),
- dst->dimension(1));
-
- if(src2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
- AccessWindowStatic src2_access(src2, 0, 0,
- ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
- src2->dimension(1));
-
- window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor
- }
- else
- {
- window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor
- }
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha,
- float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
-
- auto padding_info = get_padding_info({ src0, dst });
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
- _add_bias = src2 != nullptr;
-
- // In case both input and dst have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_src0 = src0->num_dimensions();
- _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- IClKernel::configure_internal(win_config.second);
-
- // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
- // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
- // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);
-
- const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1);
- const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2);
-
- // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
- const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
- const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
-
- // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
- // NOTE: This might have implications on heuristics and performance
- const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
- build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
- build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
-
- std::string kernel_name("gemm_mm_native");
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += (_add_bias ? "add_bias_" : "");
- _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
- _config_id += lower_string(string_from_data_type(src0->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k);
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.k0);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
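
[Editor's note] The PARTIAL_STORE_M0/N0 values computed in configure() above are what let the kernel store the ragged edge blocks directly instead of padding dst up to a multiple of M0/N0. A self-contained illustration of the arithmetic with invented sizes:

#include <algorithm>
#include <cstdio>

int main()
{
    const unsigned int m  = 100; // rows actually produced (illustrative)
    const unsigned int n  = 30;  // columns actually produced (illustrative)
    const unsigned int m0 = 8;   // rows per work-item block
    const unsigned int n0 = 4;   // columns per work-item block

    // Shrink M0 so it never exceeds M, mirroring internal_m0 in configure().
    const unsigned int internal_m0 = std::min(m, m0);

    // Sizes of the leftover blocks at the bottom/right edge of the output.
    const unsigned int partial_store_m0 = m % m0;
    const unsigned int partial_store_n0 = n % n0;

    std::printf("M0=%u, edge block: %ux%u\n", internal_m0, partial_store_m0, partial_store_n0);
    return 0;
}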
-
-Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
- src1->clone().get(),
- src2 != nullptr ? src2->clone().get() : nullptr,
- dst->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
-
- if(src1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- unsigned int idx0;
- if(_add_bias)
- {
- idx0 = 4 * num_arguments_per_2D_tensor() + 4;
- }
- else
- {
- idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- }
- const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
- unsigned int idx0;
- if(_add_bias)
- {
- idx0 = 4 * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0);
- }
- else
- {
- idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- }
- const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, src0, slice);
- add_2D_tensor_argument(idx, src1, slice_b);
- if(_add_bias)
- {
- add_2D_tensor_argument(idx, src2, slice);
- }
- add_2D_tensor_argument(idx, dst, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
- if(_add_bias)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
- }
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
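
[Editor's note] Most of configure() above is spent turning host-side shape and tiling decisions into -D defines compiled into the OpenCL program. A reduced sketch of that pattern; the define names match the code above, while the helper function itself and the header location of CLBuildOptions are assumptions:

#include <string>

#include "arm_compute/core/CL/CLCompileContext.h" // assumed home of CLBuildOptions

arm_compute::CLBuildOptions make_gemm_defines(unsigned int m, unsigned int n, unsigned int k,
                                              unsigned int m0, unsigned int n0, unsigned int k0)
{
    arm_compute::CLBuildOptions opts;
    opts.add_option("-DM=" + std::to_string(m));
    opts.add_option("-DN=" + std::to_string(n));
    opts.add_option("-DK=" + std::to_string(k));
    opts.add_option("-DM0=" + std::to_string(m0));
    opts.add_option("-DN0=" + std::to_string(n0));
    opts.add_option("-DK0=" + std::to_string(k0));
    // Edge-block sizes, mirroring PARTIAL_STORE_M0/N0 above.
    opts.add_option("-DPARTIAL_STORE_M0=" + std::to_string(m % m0));
    opts.add_option("-DPARTIAL_STORE_N0=" + std::to_string(n % n0));
    return opts;
}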
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
deleted file mode 100644
index 4770b18b8e..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** OpenCL kernel to multiply matrices when neither of the input matrices have been reshaped */
-class ClGemmMatrixMultiplyNativeKernel : public IClKernel
-{
-public:
- ClGemmMatrixMultiplyNativeKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyNativeKernel);
- /** Initialise the kernel's input and dst.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src0 Input tensor for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] src1 Input tensor for the RHS matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0.
- * @param[out] dst Output tensor info. Data type supported: same as @p src0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClGemmMatrixMultiplyNativeKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_input_as_3d{ false };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H*/
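
[Editor's note] The m0/n0/k0 restrictions enforced in validate_arguments() ("only 2,3,4,8,16") reduce to "a power of two between 2 and 16, or exactly 3"; the bit trick (v & (v - 1)) tests the power-of-two part. An equivalent stand-alone check, written as an assumption-free rephrasing rather than library code:

#include <cassert>

// Mirrors ((k0 & (k0 - 1)) && k0 != 3) plus the range checks: accept only a
// power of two in [2, 16], or exactly 3.
bool is_valid_tile_size(unsigned int v)
{
    const bool power_of_two = (v & (v - 1)) == 0;
    return v >= 2 && v <= 16 && (power_of_two || v == 3);
}

int main()
{
    assert(is_valid_tile_size(2) && is_valid_tile_size(3) && is_valid_tile_size(16));
    assert(!is_valid_tile_size(5) && !is_valid_tile_size(32));
    return 0;
}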
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
deleted file mode 100644
index 27409b66ac..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
+++ /dev/null
@@ -1,416 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLUtils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr)
- && (!gemm_info.broadcast_bias),
- "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type");
- ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
-
- const unsigned int m = gemm_info.m;
- const unsigned int n = gemm_info.n;
- const unsigned int k = gemm_info.k;
-
- TensorShape tensor_shape0{ src0->tensor_shape() };
- tensor_shape0.set(0, k);
- tensor_shape0.set(1, m);
-
- TensorShape tensor_shape1{ src1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- if(src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int src2_dim0 = src2->dimension(0);
- const unsigned int src2_dim1 = src2->dimension(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
- if(gemm_info.broadcast_bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
-
- const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
- const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
-
- const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info));
- const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
-
- if(dst->total_size() != 0)
- {
- const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
-
- TensorInfo tmp_info(*dst);
-
- if(reinterpret_output_as_3d)
- {
- // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(dst->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- if(src2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
- const int bias_processed_per_iteration_y = gemm_info.broadcast_bias ? 1 : num_elems_processed_per_iteration_y;
-
- AccessWindowStatic src2_access(src2, 0, 0,
- ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
- ceil_to_multiple(src2->dimension(1), bias_processed_per_iteration_y));
-
- window_changed = update_window_and_padding(win, src2_access); // window used by the execute_window_loop
- }
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
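
[Editor's note] When the dst tensor is reinterpreted as 3D, the execute window above is computed on a 2D-collapsed view: tmp_shape.collapse(2U, 1U) folds dimensions 1 and 2 into one. A small illustration of that collapse, assuming TensorShape::collapse(n, first) multiplies n dimensions starting at index first:

#include <cstdio>

int main()
{
    // dst shape as (dim0, dim1, dim2): e.g. a GEMM output reinterpreted as 3D.
    unsigned int shape[3] = { 64, 24, 4 }; // illustrative values

    // collapse(2U, 1U): fold the two dimensions starting at index 1 into one,
    // giving the 2D view the execute window is computed on.
    unsigned int collapsed[2] = { shape[0], shape[1] * shape[2] };

    std::printf("window shape: %u x %u\n", collapsed[0], collapsed[1]);
    return 0;
}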
-
-void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context,
- ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
-
- auto padding_info = get_padding_info({ src0, dst });
- _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
- _add_bias = src2 != nullptr;
- _export_to_cl_image = rhs_info.export_to_cl_image;
- _k = gemm_info.k;
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_src0 = src0->num_dimensions();
- _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- const bool enable_mixed_precision = gemm_info.fp_mixed_precision;
- const DataType data_type = src0->data_type();
-
- // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);
-
- const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
- const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
- build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
- build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
- build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
- build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
- build_opts.add_option_if(lhs_info.transpose, "-DLHS_TRANSPOSE");
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
- build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION");
- build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
- build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type)));
- build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
- build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-
- std::string kernel_name("gemm_mm_reshaped_");
- kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
- kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
- kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += (_add_bias ? "add_bias_" : "");
- _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
- _config_id += lower_string(string_from_data_type(src0->data_type()));
- _config_id += "_";
- _config_id += (enable_mixed_precision ? "mixed_precision_" : "");
- _config_id += support::cpp11::to_string(dst->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k);
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.v0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.h0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.interleave);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.interleave);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
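
[Editor's note] The kernel-name assembly in configure() above selects the gemm_mm_reshaped_* variant from the transpose and image-export flags (validate_arguments() additionally requires the lhs and rhs transpose flags to differ). A stand-alone sketch of the selection:

#include <cstdio>
#include <string>

std::string reshaped_kernel_name(bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image)
{
    std::string name("gemm_mm_reshaped_");
    name += lhs_transpose ? "lhs_t_" : "lhs_nt_";
    name += rhs_transpose ? "rhs_t" : "rhs_nt";
    name += export_to_cl_image ? "_texture" : "";
    return name;
}

int main()
{
    // LHS not transposed, RHS transposed, RHS fetched through an image object.
    std::printf("%s\n", reshaped_kernel_name(false, true, true).c_str()); // gemm_mm_reshaped_lhs_nt_rhs_t_texture
    return 0;
}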
-
-Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
- src1->clone().get(),
- src2 != nullptr ? src2->clone().get() : nullptr,
- dst->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
-
- if(src1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
-
- cl::Image2D src1_image2d;
-
- if(_export_to_cl_image)
- {
- const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));
- const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
-
- src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch);
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
-
- // LHS buffer
- add_2D_tensor_argument(idx, src0, slice);
-
- // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
- if(_export_to_cl_image)
- {
- _kernel.setArg(idx++, src1_image2d);
- }
- else
- {
- add_2D_tensor_argument(idx, src1, slice_b);
- }
-
- // Bias buffer (_add_bias == true)
- add_2D_tensor_argument_if(_add_bias, idx, src2, slice);
-
- // dst buffer
- add_2D_tensor_argument(idx, dst, slice);
-
- // K dimension (not used if _export_to_cl_image == true)
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
-
- // LHS stride_z
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
-
- // RHS stride_z (not used if _export_to_cl_image == true)
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
-
- // Bias stride_z (if _add_bias == true)
- if(_add_bias)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
- }
-
- // dst stride_z
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
-
- // Cross-plane padding (if _reinterpret_output_as_3d = true)
- if(_reinterpret_output_as_3d)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- // Dispatch kernel
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
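
[Editor's note] run_op above builds a 2D image view over the RHS buffer when _export_to_cl_image is set: the width is dimension(0) / 4 because each image texel packs four F32 elements, the batches are folded into the image height, and the row pitch comes straight from stride_y. A stand-alone sketch of that shape arithmetic with invented values:

#include <cstddef>
#include <cstdio>

int main()
{
    // RHS buffer dimensions (illustrative): elements per row, rows, batches.
    const size_t w = 128, h = 64, d = 2;
    const size_t element_size = 4;                // F32; image import requires F32 here
    const size_t stride_y     = w * element_size; // row pitch in bytes, assuming no row padding

    // Each 4-channel float texel packs 4 elements, so the image is w/4 texels
    // wide, and the batches are stacked along the image height, as in run_op.
    const size_t image_w = w / 4;
    const size_t image_h = h * d;

    std::printf("image2d: %zux%zu texels, row pitch %zu bytes\n", image_w, image_h, stride_y);
    return 0;
}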
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
deleted file mode 100644
index ab648f15ae..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped
- *
- * @note The input matrices @p src0 and @p src1 must be reshaped through:
- * - @ref ClGemmReshapeLhsMatrixKernel
- * - @ref ClGemmReshapeRhsMatrixKernel
- */
-class ClGemmMatrixMultiplyReshapedKernel : public IClKernel
-{
-public:
- ClGemmMatrixMultiplyReshapedKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedKernel);
- /** Initialise the kernel's input and output.
- *
- * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
- * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
- * multiplications. i.e. float c = (half)a * (half)b
- *
- * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
- * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
- * the following conditions are required:
- * -# rhs_info.n0 can only be 4, 8 and 16
- * -# rhs_info.k0 can only be 4, 8 and 16
- * -# Data type can only be F32
- * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
- * -# The stride Y for the src1 should satisfy the OpenCL pitch alignment requirement
- * -# src1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
- * -# src1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less than or equal to 4
- * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less than or equal to 3
- * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0.
- * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used for reshaping the src0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
- * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- */
- void configure(const ClCompileContext &compile_context,
- ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClGemmMatrixMultiplyReshapedKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
- bool _export_to_cl_image{ false };
- unsigned int _k{ 1 };
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H */
\ No newline at end of file
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
deleted file mode 100644
index 4eea2c6f76..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
+++ /dev/null
@@ -1,438 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLUtils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0");
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr)
- && (!gemm_info.broadcast_bias),
- "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
- ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
-
- const unsigned int m = gemm_info.m;
- const unsigned int n = gemm_info.n;
- const unsigned int k = gemm_info.k;
-
- TensorShape tensor_shape1{ src1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- if(src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
- {
- const unsigned int src2_dim0 = src2->dimension(0);
- const unsigned int src2_dim1 = src2->dimension(1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src0);
- if(gemm_info.broadcast_bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix");
- }
- }
-
- const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
-
- const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
- if(gemm_info.reinterpret_input_as_3d)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
-
- if(dst->total_size() != 0)
- {
- const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
- }
-
- return Status{};
-}
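
The k0/n0 checks in validate_arguments() above rely on the classic power-of-two bit test: `x & (x - 1)` clears the lowest set bit, so it is zero only for powers of two, and the extra `!= 3` comparison whitelists 3. A standalone sketch of the combined rule (valid block sizes are {2,3,4,8,16}):

```cpp
#include <cassert>

static bool is_valid_block_size(unsigned int v)
{
    const bool power_of_two = (v & (v - 1)) == 0; // zero only for powers of two
    return (v >= 2) && (v <= 16) && (power_of_two || v == 3);
}

int main()
{
    assert(is_valid_block_size(4));
    assert(is_valid_block_size(3));   // 3 is explicitly whitelisted
    assert(!is_valid_block_size(5));  // not a power of two and not 3
    assert(!is_valid_block_size(32)); // above the 16 upper bound
}
```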
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and dst have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- // This approach should only be used when the input/dst tensors have padding in the y direction
- if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y)
- {
- reinterpret_output_as_3d = false;
- }
-
- // dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
-
- TensorInfo tmp_info(*dst);
-
- if(reinterpret_output_as_3d)
- {
- // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(dst->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // Configure kernel window
- num_elems_processed_per_iteration_x = rhs_info.n0;
- num_elems_processed_per_iteration_y = lhs_info.m0;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- if(src2 != nullptr)
- {
- const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
- AccessWindowStatic src2_access(src2, 0, 0,
- ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
- src2->dimension(1));
-
- window_changed = update_window_and_padding(win, src2_access);
- }
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
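
The bias access window above rounds the bias width up to a multiple of the per-iteration x-step via ceil_to_multiple(); when the tensor lacks that much padding, update_window_and_padding() reports a change and the window configuration fails. A sketch of the usual integer round-up (the real helper lives in the arm_compute utilities):

```cpp
// e.g. a bias of width 30 with n0 = 4 is accessed as if it were 32 wide
unsigned int ceil_to_multiple_sketch(unsigned int value, unsigned int multiple)
{
    return ((value + multiple - 1) / multiple) * multiple;
}
```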
-} // namespace
-
-void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context,
- ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
-
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
- _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
- _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
- _add_bias = src2 != nullptr;
- _export_to_cl_image = rhs_info.export_to_cl_image;
- _has_pad_y = gemm_info.has_pad_y;
-
- auto padding_info = get_padding_info({ src0, src1, dst });
-
- // In case both input and dst have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_src0 = src0->num_dimensions();
- _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // If _reinterpret_input_as_3d = reinterpret_output_as_3d = true,
- // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
- // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m
- const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);
-
- // These variables are used only if gemm_info.has_pad_y == true
- const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1);
- const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2);
-
- // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
- // NOTE: This might have implications on heuristics and performance
- const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
-
- // Calculate the partial-store M0 and N0 (store instead of load) for any partial blocks at the end of a row/column. This avoids the need for padding.
- const unsigned int partial_store_m0 = internal_m % internal_m0;
- const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
- build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
- build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
- build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
- build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
- build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
- build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
- if(_has_pad_y)
- {
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
- }
-
- std::string kernel_name("gemm_mm_reshaped_only_rhs_");
- kernel_name += rhs_info.transpose ? "t" : "nt";
- kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += (_has_pad_y ? "" : "no_pad_y_");
- _config_id += (_add_bias ? "add_bias_" : "");
- _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
- _config_id += lower_string(string_from_data_type(src0->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(gemm_info.k);
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.n0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.h0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(rhs_info.interleave);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
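
The tail-block arithmetic in configure() above is what the -DPARTIAL_STORE_M0/-DPARTIAL_STORE_N0 defines carry into the OpenCL kernel. A small worked example with illustrative values:

```cpp
#include <algorithm>
#include <cstdio>

int main()
{
    const unsigned int internal_m = 100, m0 = 8; // rows and lhs_info.m0 (illustrative)
    const unsigned int n = 30, n0 = 4;           // columns and rhs_info.n0 (illustrative)

    const unsigned int internal_m0      = std::min(internal_m, m0); // 8: shrink M0 to at most M
    const unsigned int partial_store_m0 = internal_m % internal_m0; // 4 leftover rows
    const unsigned int partial_store_n0 = n % n0;                   // 2 leftover columns

    // The kernel stores these partial blocks explicitly rather than requiring padded tensors
    std::printf("M0=%u PARTIAL_STORE_M0=%u PARTIAL_STORE_N0=%u\n",
                internal_m0, partial_store_m0, partial_store_n0);
}
```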
-
-Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
- src1->clone().get(),
- src2 != nullptr ? src2->clone().get() : nullptr,
- dst->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
-
- if(src1->info()->num_dimensions() < 3)
- {
- // The stride on the batch dimension (index 3) of matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
- }
-
- const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u;
- const size_t rhs_idx_batch_size = 2u;
- const size_t bia_idx_batch_size = 2u;
- const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u;
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- // Get cross plane pads
- const unsigned int total_cross_plane_pad_lhs = src0->info()->padding().top + src0->info()->padding().bottom;
- const unsigned int total_cross_plane_pad_out = dst->info()->padding().top + dst->info()->padding().bottom;
-
- // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor
- ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0)));
-
- cl::Image2D src1_image2d;
-
- if(_export_to_cl_image)
- {
- const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));
- const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
-
- src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch);
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
-
- // LHS buffer
- add_2D_tensor_argument(idx, src0, slice);
-
- // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
- if(_export_to_cl_image)
- {
- _kernel.setArg(idx++, src1_image2d);
- }
- else
- {
- add_2D_tensor_argument(idx, src1, slice_b);
- }
-
- // Bias buffer (_add_bias == true)
- add_2D_tensor_argument_if(_add_bias, idx, src2, slice);
-
- // dst buffer
- add_2D_tensor_argument(idx, dst, slice);
-
- // LHS stride_z
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[lhs_idx_batch_size]));
-
- // RHS stride_z (not used if _export_to_cl_image == true)
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[rhs_idx_batch_size]));
-
- // Bias stride_z (if _add_bias == true)
- if(_add_bias)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size]));
- }
-
- // dst stride_z
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size]));
-
- // Cross-plane padding (if _reinterpret_input_as_3d = true)
- if(_reinterpret_input_as_3d && _has_pad_y)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_lhs));
- }
-
- // Cross-plane padding (if _reinterpret_output_as_3d = true)
- if(_reinterpret_output_as_3d && _has_pad_y)
- {
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out));
- }
-
- enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
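
In run_op() above, the RHS buffer is re-exposed as a 2D OpenCL image whose width is the tensor width divided by 4: each RGBA texel packs four tensor elements, and the height and depth dimensions collapse into image rows. A sketch of that shape computation:

```cpp
#include <cstddef>

struct Shape2D
{
    std::size_t width;
    std::size_t height;
};

// dim0/dim1/dim2 are the RHS tensor dimensions; assumes four elements per texel
Shape2D rhs_image_shape(std::size_t dim0, std::size_t dim1, std::size_t dim2)
{
    return Shape2D{ dim0 / 4, dim1 * dim2 };
}
```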
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
deleted file mode 100644
index ff6c391e15..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** OpenCL kernel to multiply matrices when only the input matrix RHS (src1) has been reshaped
- *
- * @note The input matrix src1 must be reshaped through @ref ClGemmReshapeRhsMatrixKernel
- */
-class ClGemmMatrixMultiplyReshapedOnlyRhsKernel : public ICLKernel
-{
-public:
- ClGemmMatrixMultiplyReshapedOnlyRhsKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedOnlyRhsKernel);
- /** Initialise the kernel's input and output.
- *
- * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
- * Reading from the OpenCL image object can improve performance. However, since the OpenCL image object is created by importing the OpenCL buffer,
- * the following conditions are required:
- * -# rhs_info.n0 can only be 4, 8 and 16
- * -# rhs_info.k0 can only be 4, 8 and 16
- * -# Data type can only be F32
- * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
- * -# The Y stride of src1 should satisfy the OpenCL pitch alignment requirement
- * -# The src1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
- * -# The src1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src0 Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
- * The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0.
- * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported:
- * rhs_info.k0: 2,3,4,8,16
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.transpose: true,false
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const ClCompileContext &compile_context,
- ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_input_as_3d{ false };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
- bool _export_to_cl_image{ false };
- bool _has_pad_y{ false };
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
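
The run_op() implementation earlier reads its tensors back out of an ITensorPack under fixed tensor types. A hypothetical dispatch sketch matching that contract; `lhs`, `rhs_reshaped`, `bias`, `dst` (ICLTensor pointers) and `queue` are assumptions:

```cpp
ITensorPack pack;
pack.add_const_tensor(TensorType::ACL_SRC_0, lhs);          // LHS matrix
pack.add_const_tensor(TensorType::ACL_SRC_1, rhs_reshaped); // reshaped RHS matrix
pack.add_const_tensor(TensorType::ACL_SRC_2, bias);         // optional bias (nullptr if unused)
pack.add_tensor(TensorType::ACL_DST, dst);

// Dispatch over the window computed at configure() time
mm_kernel.run_op(pack, mm_kernel.window(), queue);
```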
diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
deleted file mode 100644
index 98161edfff..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
-
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
-
- if(dst->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
- misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0;
- const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0;
- bool window_changed = false;
-
- TensorInfo tmp_info(*src);
-
- if(reinterpret_input_as_3d)
- {
- // Since the src tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(src->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- // dst auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)));
-
- // Configure window
- Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- Window win_in = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic src_access(src, 0, 0,
- src->dimension(0),
- src->dimension(1));
- AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1));
-
- window_changed = update_window_and_padding(win_in, src_access) || // window used by the execute_window_loop
- update_window_and_padding(win, dst_access); // window used to update the padding requirements of dst tensor
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win.collapse(win, Window::DimZ);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d));
-
- auto padding_info = get_padding_info({ src });
-
- _reinterpret_input_as_3d = reinterpret_input_as_3d;
-
- const unsigned int src_w = src->dimension(0);
- const unsigned int src_h = _reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1);
- const unsigned int partial_load_m0 = src_h % lhs_info.m0;
- const unsigned int partial_load_k0 = src_w % lhs_info.k0;
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
- build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_w));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_h));
- build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(src->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(src->dimension(2)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
- build_opts.add_option("-DPARTIAL_LOAD_M0=" + support::cpp11::to_string(partial_load_m0));
- build_opts.add_option("-DPARTIAL_LOAD_K0=" + support::cpp11::to_string(partial_load_k0));
-
- std::string kernel_name("gemm_reshape_lhs_matrix_");
- kernel_name += lhs_info.transpose ? "t" : "nt";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, dst, lhs_info, reinterpret_input_as_3d);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = "gemm_reshape_lhs_matrix_";
- _config_id += (_reinterpret_input_as_3d ? "3d_" : "");
- _config_id += lower_string(string_from_data_type(src->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.m0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.k0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.v0);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.interleave);
- _config_id += "_";
- _config_id += support::cpp11::to_string(lhs_info.transpose);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), lhs_info, reinterpret_input_as_3d).first);
-
- return Status{};
-}
-
-void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- Window slice = window.first_slice_window_3D();
-
- if(_reinterpret_input_as_3d)
- {
- // Pass the bottom paddings to the kernel if the src has to be reinterpreted as a 3D tensor
- const unsigned int idx0 = 2 * num_arguments_per_3D_tensor();
- const unsigned int total_cross_plane_pad = src->info()->padding().top + src->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
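
The PARTIAL_LOAD_M0/PARTIAL_LOAD_K0 defines built in configure() above mirror the partial-store arithmetic shown earlier, but on the load side, with the height and depth dimensions collapsed when the source is reinterpreted as 3D. A worked example with illustrative values:

```cpp
#include <cstdio>

int main()
{
    const unsigned int w = 20, h = 8, d = 3;    // src tensor W x H x D (illustrative)
    const unsigned int m0 = 5, k0 = 8;          // lhs_info block sizes (illustrative)
    const bool reinterpret_input_as_3d = true;

    const unsigned int src_h = reinterpret_input_as_3d ? h * d : h; // 24: H and D collapsed

    // 24 % 5 = 4 leftover rows, 20 % 8 = 4 leftover columns per block row
    std::printf("PARTIAL_LOAD_M0=%u PARTIAL_LOAD_K0=%u\n", src_h % m0, w % k0);
}
```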
diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
deleted file mode 100644
index b830ba02b4..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication.
- * In particular, this function splits the src matrix into blocks of size M0xK0 (defined through GEMMLHSMatrixInfo) and
- * stores each one in the dst matrix, unrolling the values
- */
-class ClGemmReshapeLhsMatrixKernel : public ICLKernel
-{
-public:
- ClGemmReshapeLhsMatrixKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeLhsMatrixKernel);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Input tensor. Data types supported: All
- * @param[out] dst Output tensor. Data type supported: same as @p src
- * @param[in] lhs_info LHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the src tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.v0: greater than 0
- * lhs_info.transpose: true, false
- * lhs_info.interleave: true, false
- * @param[in] reinterpret_src_as_3d (Optional) True if the src has to be reinterpreted as 3D tensor
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d = false);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClGemmReshapeLhsMatrixKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- bool _reinterpret_input_as_3d{ false };
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */ \ No newline at end of file
diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
deleted file mode 100644
index e1ef7c61aa..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0");
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
- ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose));
-
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
-
- if(rhs_info.export_to_cl_image)
- {
- const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, src->data_type());
- ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info));
- }
-
- if(dst->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
-{
- const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0;
- const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0;
- bool window_changed = false;
-
- // dst auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)));
-
- // Configure window
- Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
- window_changed = update_window_and_padding(win, src_access);
-
- if(rhs_info.export_to_cl_image)
- {
- gemm::update_padding_for_cl_image(dst);
- }
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win.collapse(win, Window::DimZ);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, rhs_info));
-
- // Create build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
- build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
- build_opts.add_option_if(rhs_info.transpose, "-DTRANSPOSE");
- build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE");
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
-
- std::string kernel_name("gemm_reshape_rhs_matrix_");
- kernel_name += rhs_info.transpose ? "t" : "nt";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, dst, rhs_info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-}
-
-Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, rhs_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), rhs_info).first);
-
- return Status{};
-}
-
-void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- Window slice = window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
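
Since the RHS reshape is typically applied once to constant weights, a caller would pair this kernel with one of the matrix-multiply kernels above, passing the same rhs_info to both so the block layout agrees. A hypothetical pairing sketch; `rhs`, `rhs_reshaped` (ITensorInfo objects) and `compile_context` are assumptions, values illustrative:

```cpp
GEMMRHSMatrixInfo rhs_info{};
rhs_info.n0         = 4;
rhs_info.k0         = 4;
rhs_info.h0         = 8;    // horizontal blocks stored on the same output row
rhs_info.transpose  = true;
rhs_info.interleave = true;

ClGemmReshapeRhsMatrixKernel reshape_kernel;
reshape_kernel.configure(compile_context, &rhs, &rhs_reshaped, rhs_info);
// Run once; then feed rhs_reshaped as src1 of the matrix-multiply kernel
// together with the same rhs_info so both sides agree on the block layout.
```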
diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
deleted file mode 100644
index e877d87408..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication
- * In particular, this kernel splits the src matrix into blocks of size K0xN0 and stores each one in
- * the dst matrix, unrolling the values */
-class ClGemmReshapeRhsMatrixKernel : public ICLKernel
-{
-public:
- ClGemmReshapeRhsMatrixKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeRhsMatrixKernel);
- /** Initialise the kernel's input and output.
- *
- * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
- * required to create an OpenCL image object from a buffer in @ref ClGemmMatrixMultiplyReshapedKernel and in @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel.
- * Since the OpenCL image object is created by importing the OpenCL buffer, the following conditions are required:
- * -# rhs_info.n0 can only be 4, 8 and 16
- * -# rhs_info.k0 can only be 4, 8 and 16
- * -# Data type can only be F32, F16
- * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
- * -# The output width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
- * -# The output (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
- * -# The output tensor should be only consumed by @ref ClGemmMatrixMultiplyReshapedKernel or @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Input tensor. Data types supported: All
- * @param[out] dst Output tensor. Data type supported: same as @p src
- * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the src tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
- * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
- * rhs_info.h0: greater than 0
- * rhs_info.transpose: true, false
- * rhs_info.interleave: true, false
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClGemmReshapeRhsMatrixKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */ \ No newline at end of file
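
The pitch-alignment condition listed above can in principle be probed through the OpenCL device query CL_DEVICE_IMAGE_PITCH_ALIGNMENT, which reports the required row-pitch alignment in pixels. A hypothetical check, assuming an OpenCL 2.x device and F32 data (one RGBA texel = four elements = 16 bytes); this is a sketch of the precondition, not the library's own validation code:

```cpp
#define CL_HPP_TARGET_OPENCL_VERSION 200
#include <CL/cl2.hpp>
#include <cstddef>

bool rhs_pitch_is_aligned(const cl::Device &device, std::size_t rhs_stride_y_bytes)
{
    const cl_uint     align_pixels = device.getInfo<CL_DEVICE_IMAGE_PITCH_ALIGNMENT>(); // in pixels
    const std::size_t texel_bytes  = 4 * sizeof(float); // assumed RGBA F32 texel
    return (rhs_stride_y_bytes % (align_pixels * texel_bytes)) == 0;
}
```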
diff --git a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
deleted file mode 100644
index 4436e98fe3..0000000000
--- a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY));
-
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0));
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
- }
- ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4);
-
- return Status{};
-}
-} // namespace
-
-ClHeightConcatenateKernel::ClHeightConcatenateKernel()
- : _height_offset(0)
-{
-}
-
-Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst));
- return Status{};
-}
-
-void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst));
-
- auto padding_info = get_padding_info({ src, dst });
-
- _height_offset = height_offset;
-
- // Add build options
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset));
- build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src->dimension(2)));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
-
- if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
- {
- const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "concatenate_height", build_opts.options());
- // Configure kernel window
-
- // The window needs to be based on src, as we copy the full height of src
- Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void ClHeightConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, src, window);
- add_4D_tensor_argument(idx, dst, window);
- enqueue(queue, *this, window, lws_hint());
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
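
The -DVEC_SIZE / -DVEC_SIZE_LEFTOVER options built in configure() above follow a simple arithmetic pattern shared by the removed kernels. A minimal standalone sketch of that pattern, where adjust_vec_size is a simplified stand-in for the library helper (the real helper also honours alignment constraints):

#include <algorithm>
#include <cstdio>

// Simplified stand-in for arm_compute's adjust_vec_size: clamp the requested
// OpenCL vector width to the size of the innermost dimension.
unsigned int adjust_vec_size(unsigned int vec_size, unsigned int dim0)
{
    return std::min(vec_size, dim0);
}

int main()
{
    const unsigned int width    = 10;                        // src->dimension(0)
    const unsigned int vec_size = adjust_vec_size(4, width); // -> 4
    const unsigned int leftover = width % vec_size;          // -> 2 tail elements

    // These two values become the -DVEC_SIZE and -DVEC_SIZE_LEFTOVER build options.
    std::printf("VEC_SIZE=%u VEC_SIZE_LEFTOVER=%u\n", vec_size, leftover);
    return 0;
}
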
diff --git a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h
deleted file mode 100644
index 9a4380a5b7..0000000000
--- a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_HEIGHT_CONCATENATE_LAYER_KERNEL_H
-#define ARM_COMPUTE_CL_HEIGHT_CONCATENATE_LAYER_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the height concatenate kernel.
- * The source tensor will be concatenated into the destination tensor.
- */
-class ClHeightConcatenateKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClHeightConcatenateKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClHeightConcatenateKernel);
- /** Initialise the kernel's source and destination
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[in] height_offset The starting offset on the Y axis for the dst tensor.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- *
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClHeightConcatenateKernel
- *
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[in] height_offset The starting offset on the Y axis for the dst tensor.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
-
-private:
- unsigned int _height_offset;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_HEIGHT_CONCATENATE_LAYER_KERNEL_H */
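
For reference, a hedged sketch of how a kernel with this interface was typically driven before its removal, assuming an initialized OpenCL backend and CLKernelLibrary::get().get_compile_context() as the source of the compile context; the shapes are illustrative and error handling is elided:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h" // the header removed above

using namespace arm_compute;
using namespace arm_compute::opencl::kernels;

void height_concat_example()
{
    // Stack a 32x8 src at row offset 8 of a 32x24 dst of the same data type.
    TensorInfo src_info(TensorShape(32U, 8U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(32U, 24U), 1, DataType::F32);

    if(bool(ClHeightConcatenateKernel::validate(&src_info, 8U, &dst_info)))
    {
        ClHeightConcatenateKernel kernel;
        kernel.configure(CLKernelLibrary::get().get_compile_context(), &src_info, 8U, &dst_info);

        // At run time the owning operator supplies the actual tensors:
        //   ITensorPack pack;
        //   pack.add_const_tensor(TensorType::ACL_SRC, src);
        //   pack.add_tensor(TensorType::ACL_DST, dst);
        //   kernel.run_op(pack, kernel.window(), queue);
    }
}
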
diff --git a/src/core/gpu/cl/kernels/ClMulKernel.cpp b/src/core/gpu/cl/kernels/ClMulKernel.cpp
deleted file mode 100644
index b8081bbacf..0000000000
--- a/src/core/gpu/cl/kernels/ClMulKernel.cpp
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClMulKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(overflow_policy);
- ARM_COMPUTE_UNUSED(rounding_policy);
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1,
- 1,
- DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2,
- 1,
- DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured dst
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst,
- 1,
- DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8),
- "Dst can only be U8 if both src are U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8 && (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8),
- "Dst can only be QASYMM8 if both src are QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8_SIGNED && (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED),
- "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QSYMM16 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16),
- "Dst can only be QSYMM16 if both src are QSYMM16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32),
- "Dst must be S32 if source tensors are S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
- }
-
- return Status{};
-}
-} // namespace
-
-void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst,
- scale, overflow_policy, rounding_policy, act_info));
-
- auto padding_info = get_padding_info({ src1, src2, dst });
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
- auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
-
- int scale_int = -1;
- // Extract sign, exponent and mantissa
- int exponent = 0;
- float normalized_mantissa = std::frexp(scale, &exponent);
- // Use int scaling if the factor is equal to 1/2^n for 0 <= n <= 15
- // frexp returns 0.5 as the mantissa, which means that the exponent will be in the range -14 <= e <= 1
- // Moreover, since e = 1 - n, the exponent is non-positive for every n >= 1
- if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
- {
- // Store the positive exponent n. We know that we compute 1/2^n
- // Additionally we need to subtract 1 to compensate for the fact that frexp uses a mantissa of 0.5
- scale_int = std::abs(exponent - 1);
- }
-
- std::string acc_type;
- // Check if it has float src and dst
- if(is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type()))
- {
- scale_int = -1;
- acc_type = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half";
- }
- else
- {
- if(src1->element_size() == 4 || src2->element_size() == 4)
- {
- // Use 64-bit accumulator for 32-bit input
- acc_type = "long";
- }
- else if(src1->element_size() == 2 || src2->element_size() == 2)
- {
- // Use 32-bit accumulator for 16-bit input
- acc_type = "int";
- }
- else
- {
- // Use 16-bit accumulator for 8-bit input
- acc_type = "ushort";
- }
- }
-
- const bool is_quantized = is_data_type_quantized(src1->data_type());
- const unsigned int vec_size = adjust_vec_size(16 / dst->element_size(), dst->dimension(0));
- const unsigned int vec_size_leftover = dst->dimension(0) % vec_size;
-
- // Set kernel build options
- std::string kernel_name = "pixelwise_mul";
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type()));
- build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
- build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size)));
- build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size)));
- build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
- if(is_quantized && (dst->data_type() != DataType::S32))
- {
- const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
-
- build_opts.add_option_if(is_data_type_quantized_asymmetric(src1->data_type()),
- "-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(src2->data_type()),
- "-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(dst->data_type()),
- "-DOFFSET_OUT=" + support::cpp11::to_string(oq_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
- build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- kernel_name += "_quantized";
- }
- else
- {
- kernel_name += (scale_int >= 0) ? "_int" : "_float";
- build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), "-DWRAP", "-DSATURATE");
- build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
- build_opts.add_option("-DACC_DATA_TYPE=" + acc_type);
- if(act_info.enabled())
- {
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
- }
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set scale argument
- unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
-
- if(scale_int >= 0 && !is_quantized)
- {
- _kernel.setArg(idx++, scale_int);
- }
- else
- {
- _kernel.setArg(idx++, scale);
- }
-
- Window win = calculate_max_window(*dst, Steps(vec_size));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(dst->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src1->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src1->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src1->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src2->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src2->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src2->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(2));
-}
-
-Status ClMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info));
-
- return Status{};
-}
-
-void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- const TensorShape &in_shape1 = src_0->info()->tensor_shape();
- const TensorShape &in_shape2 = src_1->info()->tensor_shape();
- const TensorShape &out_shape = dst->info()->tensor_shape();
-
- bool can_collapse = true;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
- const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src_0, slice_input1);
- add_3D_tensor_argument(idx, src_1, slice_input2);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
-
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-
-namespace
-{
-constexpr unsigned int vec_size_complex = 1;
-
-Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
-
- // Validate in case of configured dst
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
- }
-
- return Status{};
-}
-} // namespace
-
-void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info));
-
- auto padding_info = get_padding_info({ src1, src2, dst });
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
- auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
- if(act_info.enabled())
- {
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options());
-
- Window win = calculate_max_window(*dst, Steps(vec_size_complex));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info));
-
- return Status{};
-}
-
-void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- const TensorShape &in_shape1 = src_0->info()->tensor_shape();
- const TensorShape &in_shape2 = src_1->info()->tensor_shape();
- const TensorShape &out_shape = dst->info()->tensor_shape();
-
- bool can_collapse = true;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
- const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src_0, slice_input1);
- add_3D_tensor_argument(idx, src_1, slice_input2);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
-
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
- ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
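
The power-of-two scale detection in configure() above is compact enough to miss; a self-contained sketch of the same frexp trick, pure C++ with no library dependency:

#include <cmath>
#include <cstdio>

// Returns n if scale == 1/2^n for some 0 <= n <= 15, otherwise -1.
int power_of_two_scale_exponent(float scale)
{
    int   exponent = 0;
    float mantissa = std::frexp(scale, &exponent); // scale == mantissa * 2^exponent
    // For scale == 1/2^n the mantissa is exactly 0.5 and exponent == 1 - n,
    // so n in [0, 15] maps to exponent in [-14, 1].
    if(mantissa == 0.5f && exponent >= -14 && exponent <= 1)
    {
        return 1 - exponent; // recover n, compensating for the 0.5 mantissa
    }
    return -1;
}

int main()
{
    std::printf("%d\n", power_of_two_scale_exponent(0.25f));    // 2  -> "_int" variant
    std::printf("%d\n", power_of_two_scale_exponent(1.0f));     // 0  -> "_int" variant
    std::printf("%d\n", power_of_two_scale_exponent(1.0f/255)); // -1 -> "_float" variant
    return 0;
}
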
diff --git a/src/core/gpu/cl/kernels/ClMulKernel.h b/src/core/gpu/cl/kernels/ClMulKernel.h
deleted file mode 100644
index 44162f3db3..0000000000
--- a/src/core/gpu/cl/kernels/ClMulKernel.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_MUL_KERNEL_H
-#define ARM_COMPUTE_CL_MUL_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the pixelwise multiplication kernel. */
-class ClMulKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClMulKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMulKernel);
- /** Initialise the kernel's src and dst.
- *
- * Valid configurations (Input1,Input2) -> Output:
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,U8) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- * - (QSYMM16,QSYMM16) -> S32
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
- * @param[in] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
- * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClMulKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** Interface for the complex pixelwise multiplication kernel. */
-class ClComplexMulKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClComplexMulKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClComplexMulKernel);
- /** Initialise the kernel's src and dst.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 An src tensor info. Data types supported: F16/F32. Number of channels supported: 2.
- * @param[in] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
- * @param[out] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClComplexMulKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_MUL_KERNEL_H */
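
A hedged usage sketch for the interface above; the shapes and the compile-context call are illustrative, not part of the patch:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/gpu/cl/kernels/ClMulKernel.h" // the header removed above

using namespace arm_compute;
using namespace arm_compute::opencl::kernels;

void mul_example()
{
    TensorInfo a(TensorShape(16U, 4U), 1, DataType::S16);
    TensorInfo b(TensorShape(16U, 4U), 1, DataType::S16);
    TensorInfo out(TensorShape(16U, 4U), 1, DataType::S16);

    // With integer inputs and scale == 0.5f == 1/2^1, configure() selects the
    // "_int" kernel variant; float inputs or a non-power-of-two scale select
    // the "_float" variant instead.
    const float scale = 0.5f;

    if(bool(ClMulKernel::validate(&a, &b, &out, scale, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)))
    {
        ClMulKernel mul;
        mul.configure(CLKernelLibrary::get().get_compile_context(), &a, &b, &out, scale,
                      ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
    }
}
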
diff --git a/src/core/gpu/cl/kernels/ClPermuteKernel.cpp b/src/core/gpu/cl/kernels/ClPermuteKernel.cpp
deleted file mode 100644
index ffc13060a8..0000000000
--- a/src/core/gpu/cl/kernels/ClPermuteKernel.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClPermuteKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-TensorShape get_dst_shape(const ITensorInfo *src, const PermutationVector &perm)
-{
- TensorShape dst_shape = src->tensor_shape();
- permute(dst_shape, perm);
- return dst_shape;
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() < 1 || src->num_dimensions() > 4,
- "Permutation up to 4-D src tensor is supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4,
- "Permutation vector size should be less than or equal to 4");
- for(const auto &p : perm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values");
- }
-
- // Validate configured dst
- if(dst->total_size() != 0)
- {
- const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
- return Status{};
-}
-} // namespace
-
-void ClPermuteKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- auto padding_info = get_padding_info({ src, dst });
- const TensorShape dst_shape = get_dst_shape(src, perm);
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, perm));
-
- _perm = perm;
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type())));
- build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(src->dimension(2)));
- // New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector
- build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0));
- build_opts.add_option("-DP2=" + support::cpp11::to_string((_perm.num_dimensions() >= 2) ? perm[1] : 1));
- build_opts.add_option("-DP3=" + support::cpp11::to_string((_perm.num_dimensions() >= 3) ? perm[2] : 2));
- build_opts.add_option("-DP4=" + support::cpp11::to_string((_perm.num_dimensions() >= 4) ? perm[3] : 3));
-
- _kernel = create_kernel(compile_context, "permute", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
-
- ICLKernel::configure_internal(win);
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClPermuteKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, perm));
-
- return Status{};
-}
-
-void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup dst slice
- Window slice_out(slice_in);
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_out.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, src, slice_in);
- add_4D_tensor_argument(idx, dst, slice_out);
- enqueue(queue, *this, slice_in, lws_hint());
- }
- while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/gpu/cl/kernels/ClPermuteKernel.h b/src/core/gpu/cl/kernels/ClPermuteKernel.h
deleted file mode 100644
index b844214595..0000000000
--- a/src/core/gpu/cl/kernels/ClPermuteKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_PERMUTE_KERNEL_H
-#define ARM_COMPUTE_CL_PERMUTE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** OpenCL kernel to perform tensor permutation.
- *
- * Permutes the src tensor according to a given permutation vector
- */
-class ClPermuteKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClPermuteKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPermuteKernel);
- /** Set the src and dst of the kernel.
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src The src tensor info. Data types supported: All.
- * @param[in] dst The dst tensor info. Data types supported: Same as @p src
- * @param[in] perm Permutation vector
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
- /** Static function to check if given info will lead to a valid configuration of @ref ClPermuteKernel
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] src The src tensor info. Data types supported: All.
- * @param[in] dst The dst tensor info. Data types supported: same as @p src.
- * @param[in] perm Permutation vector
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- PermutationVector _perm{};
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_PERMUTE_KERNEL_H */
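
A matching sketch for the permute kernel. The permutation chosen here is self-inverse (a plain swap of the first two dimensions), so it reads the same under either permutation convention; shapes and the compile-context call are illustrative:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/gpu/cl/kernels/ClPermuteKernel.h" // the header removed above

using namespace arm_compute;
using namespace arm_compute::opencl::kernels;

void permute_example()
{
    TensorInfo src(TensorShape(8U, 4U, 3U), 1, DataType::F32);
    TensorInfo dst; // left empty: configure() auto-initializes it to (4, 8, 3)

    // Swap the first two dimensions, keep the third in place.
    const PermutationVector perm(1U, 0U, 2U);

    if(bool(ClPermuteKernel::validate(&src, &dst, perm)))
    {
        ClPermuteKernel permute_kernel;
        permute_kernel.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, perm);
    }
}
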
diff --git a/src/core/gpu/cl/kernels/ClPool2dKernel.cpp b/src/core/gpu/cl/kernels/ClPool2dKernel.cpp
deleted file mode 100644
index 0e15bffd14..0000000000
--- a/src/core/gpu/cl/kernels/ClPool2dKernel.cpp
+++ /dev/null
@@ -1,509 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClPool2dKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-// Internal window config info
-using ClPoolingConfig = std::pair<unsigned int, BorderSize>; // num_elems_processed_per_iteration, border_size
-
-void auto_init(const ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, PoolingLayerInfo pool_info)
-{
- TensorShape out_shape = compute_pool_shape(*src, pool_info);
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape));
- if(indices)
- {
- auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32));
- }
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2),
- "Unsupported combination of parameters!");
-
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const bool is_global_pooling = pool_info.is_global_pooling;
- unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
- unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
- int output_width = 0;
- int output_height = 0;
- std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
- pool_size_x, pool_size_y, pool_info.pad_stride_info);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid");
-
- // Check indices
- if(indices)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
-
- if(indices->total_size() != 0)
- {
- TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info);
- }
- }
-
- // Checks performed when dst is configured
- if(dst->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
- TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window, ClPoolingConfig> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Get data layout
- const DataLayout data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
- int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
- int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_pad_right = pad_stride_info.pad_right();
- const int pool_pad_top = pad_stride_info.pad_top();
- const int pool_pad_left = pad_stride_info.pad_left();
- const int pool_pad_bottom = pad_stride_info.pad_bottom();
- BorderSize border_size = BorderSize();
-
- auto_init(src, dst, indices, pool_info);
- pooled_w = dst->tensor_shape()[idx_width];
- pooled_h = dst->tensor_shape()[idx_height];
-
- const DataType data_type = src->data_type();
-
- const int src_width = src->dimension(idx_width);
- const int src_height = src->dimension(idx_height);
-
- unsigned int num_elems_processed_per_iteration = 0;
- bool window_changed = false;
- Window win{};
- switch(data_layout)
- {
- case DataLayout::NCHW:
- {
- // Initialize border size
- border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
- // Change the number of elements processed per iteration
- // for 3x3 pooling with a stride less than or equal to 3
- const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
- num_elems_processed_per_iteration = can_optimize ? 4 : 1;
- const unsigned int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x;
-
- // Number of iterations in X dimension
- const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
-
- // Upper limit for the number of right/bottom border elements that are accessed
- const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height;
-
- border_size.right = std::max(upper_bound_w, pool_pad_right);
- border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
-
- win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
-
- AccessWindowRectangle src_access(src, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y,
- pool_stride_x, pool_stride_y);
- AccessWindowHorizontal dst_access(dst, 0, num_elems_processed_per_iteration);
-
- // Update indices window
- if(indices)
- {
- AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, src_access, dst_access, indices_access);
- indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape()));
- }
- else
- {
- window_changed = update_window_and_padding(win, src_access, dst_access);
- }
-
- dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
- break;
- }
- case DataLayout::NHWC:
- {
- const size_t vec_size = dst->data_type() == DataType::F32 ? 2 : 4;
-
- // Initialize border size
- border_size = BorderSize();
- num_elems_processed_per_iteration = adjust_vec_size(vec_size, dst->dimension(0));
- win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_tuple(err, win, ClPoolingConfig(num_elems_processed_per_iteration, border_size));
-}
-} // namespace
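
The NCHW branch of validate_and_configure_window() above packs the read-window arithmetic into a few dense lines; to make it concrete, a standalone sketch with illustrative numbers (pure C++, no library dependency):

#include <algorithm>
#include <cstdio>

int main()
{
    // Illustrative pooling parameters: 3x3 pool, stride 2, 1-pixel padding.
    const int pool_size_x   = 3, pool_stride_x  = 2;
    const int pool_pad_left = 1, pool_pad_right = 1;
    const int src_width = 64, pooled_w = 32;

    // The optimized 3x3 path computes 4 dst elements per work-item.
    const int elems_per_iter      = 4;
    const int elems_read_per_iter = (elems_per_iter - 1) * pool_stride_x + pool_size_x; // 9
    const int num_iterations_x    = (pooled_w + elems_per_iter - 1) / elems_per_iter;   // 8

    // How far the last iteration can read past the right edge of src.
    const int upper_bound_w = (num_iterations_x - 1) * elems_per_iter * pool_stride_x
                              - pool_pad_left + elems_read_per_iter - src_width;        // 0
    const int border_right  = std::max(upper_bound_w, pool_pad_right);                  // 1

    std::printf("reads %d src elements per iteration, right border %d\n",
                elems_read_per_iter, border_right);
    return 0;
}
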
-
-ClPool2dKernel::ClPool2dKernel()
- : _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1)
-{
-}
-
-BorderSize ClPool2dKernel::border_size() const
-{
- return _border_size;
-}
-
-void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- auto padding_info = get_padding_info({ src, dst, indices });
-
- // Set instance variables
- _pool_info = pool_info;
- _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- const PoolingType pool_type = pool_info.pool_type;
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
- const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
- const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- const bool exclude_padding = pool_info.exclude_padding;
- std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_pad_top = pad_stride_info.pad_top();
- const int pool_pad_left = pad_stride_info.pad_left();
-
- // Set build options
- CLBuildOptions build_opts;
- const DataType data_type = src->data_type();
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, dst, pool_info, indices);
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- ICLKernel::configure_internal(std::get<1>(win_config));
-
- ClPoolingConfig pooling_config = std::get<2>(win_config);
- _num_elems_processed_per_iteration = pooling_config.first;
- _border_size = pooling_config.second;
-
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
-
- // Tensor paddings are used to calculate the indices for MAX pooling
- if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type))
- {
- build_opts.add_option("-DPAD_TENSOR_LEFT=" + support::cpp11::to_string(src->padding().left));
- build_opts.add_option("-DPAD_TENSOR_RIGHT=" + support::cpp11::to_string(src->padding().right));
- build_opts.add_option("-DPAD_TENSOR_TOP=" + support::cpp11::to_string(src->padding().top));
- build_opts.add_option("-DPAD_TENSOR_BOTTOM=" + support::cpp11::to_string(src->padding().bottom));
- build_opts.add_option("-DTENSOR_CHANNEL=" + support::cpp11::to_string(src->dimension(idx_channel)));
- build_opts.add_option("-DTENSOR_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width)));
- build_opts.add_option("-DTENSOR_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
- }
-
- if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info())
- {
- const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- }
-
- // Check dst dimensions
- auto_init(src, dst, indices, pool_info);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices));
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
- build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
- build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
- build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left));
- build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
- build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
- build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
-
- // Set the initial value for the pooling operation accordingly with the data type
- if(pool_type == PoolingType::MAX)
- {
- if(is_data_type_quantized(data_type))
- {
- PixelValue type_min{};
- std::tie(type_min, std::ignore) = get_min_max(data_type);
- build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get<int32_t>()));
- }
- else
- {
- build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits<float>::lowest()));
- }
- }
- else
- {
- // Pool AVG and Pool L2 initial value
- build_opts.add_option("-DINITIAL_VALUE=0");
- }
-
- build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
- build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
-
- // Create kernel
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- {
- const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
- const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
- const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type);
- build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
- build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
-
- if(pool_type != PoolingType::MAX)
- {
- build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
- }
-
- if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type))
- {
- // Check if we have a 3x3 pool with stride_x less than or equal to 3. In these cases, run an optimized OpenCL kernel where
- // each thread computes 4 dst elements
- const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3);
-
- std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
- + support::cpp11::to_string(pool_size_x);
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- }
- else if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type))
- {
- // For max pooling with a 2x2 pool, store indices which will be used in max unpooling
- if(data_type == DataType::F32)
- {
- std::string kernel_name = "pooling_layer_2_nchw_indices_fp32";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- }
- else if(data_type == DataType::F16)
- {
- std::string kernel_name = "pooling_layer_2_nchw_indices_fp16";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- }
- }
- else // Run general case
- {
- std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- }
- break;
- }
- case DataLayout::NHWC:
- {
- // Floating point mixed precision is supported on F16 only
- const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
-
- // Wider accumulation is required to avoid accuracy loss
- // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation)
- // Case 2: Quantized (int8/uint8 src data and int32 accumulation)
- DataType acc_data_type = data_type;
-
- if(use_fp_mixed_precision)
- {
- acc_data_type = DataType::F32;
- }
- else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX)
- {
- acc_data_type = DataType::S32;
- }
-
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type));
- build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION");
- build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width)));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
- build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height)));
- build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel)));
- build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size)));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
- if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type))
- {
- build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX");
-
- std::string kernel_name = "pooling_layer_2x2_nhwc";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- }
- else
- {
- std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-
- // Set config_id for enabling LWS tuning
- _config_id = "pooling_layer_";
- _config_id += lower_string(string_from_data_type(data_type));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(_data_layout));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(idx_width));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(idx_height));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(idx_channel));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(src->data_layout()));
-
- ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info));
-}
-
-Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(src->clone().get(), dst->clone().get(), pool_info)));
-
- return Status{};
-}
-
-void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- unsigned int pool_stride_x = 0;
- unsigned int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
- auto indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1));
-
- // Collapse window
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- {
- Window slice = window_collapsed.first_slice_window_3D();
- do
- {
-                // Map the dst window back to src coordinates, accounting for padding and stride
- Window in_slice(slice);
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info.pad_left(),
- (in_slice.x().end() - _pool_info.pad_stride_info.pad_left()) * pool_stride_x,
- pool_stride_x * _num_elems_processed_per_iteration));
- in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info.pad_top(),
- (in_slice.y().end() - _pool_info.pad_stride_info.pad_top()) * pool_stride_y,
- pool_stride_y));
-
-                // Set tensor arguments
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, in_slice);
- add_3D_tensor_argument(idx, dst, slice);
- if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
- {
- add_3D_tensor_argument(idx, indices, slice);
- }
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
- break;
- }
- case DataLayout::NHWC:
- {
- const size_t batch_size = dst->info()->tensor_shape().total_size_upper(3);
-
- Window slice = window_collapsed.first_slice_window_4D();
- Window in_slice = window_collapsed.first_slice_window_4D();
- in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration));
- in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x));
- in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y));
- in_slice.set(3, Window::Dimension(0, batch_size, 1));
- do
- {
-                // Set tensor arguments
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, src, in_slice);
- add_4D_tensor_argument(idx, dst, slice);
- if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
- {
- add_4D_tensor_argument(idx, indices, slice);
- }
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
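
For reference, the NCHW kernel dispatch deleted above reduces to the following sketch (illustrative only; select_nchw_pool_kernel is a hypothetical helper, not library API):

#include <string>

// pool3x3 with stride_x <= 3 gets the optimized kernel (4 dst elements per
// thread); 2x2 float max pooling with requested indices gets a dedicated
// kernel; everything else falls back to the generic MxN variant.
std::string select_nchw_pool_kernel(unsigned int pool_size_x, unsigned int pool_size_y,
                                    unsigned int pool_stride_x, bool quantized,
                                    bool f16, bool max_pool2x2_with_indices)
{
    if(pool_size_x == 3 && pool_size_y == 3 && !quantized)
    {
        return (pool_stride_x <= 3 ? std::string("pooling_layer_optimized_")
                                   : std::string("pooling_layer_")) + std::to_string(pool_size_x);
    }
    if(max_pool2x2_with_indices)
    {
        return f16 ? "pooling_layer_2_nchw_indices_fp16" : "pooling_layer_2_nchw_indices_fp32";
    }
    return quantized ? "pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw";
}
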
diff --git a/src/core/gpu/cl/kernels/ClPool2dKernel.h b/src/core/gpu/cl/kernels/ClPool2dKernel.h
deleted file mode 100644
index 8ecb8eb7b7..0000000000
--- a/src/core/gpu/cl/kernels/ClPool2dKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_POOL2D_KERNEL_H
-#define ARM_COMPUTE_CL_POOL2D_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the pooling layer kernel */
-class ClPool2dKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClPool2dKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPool2dKernel);
-
- /** Configure kernel for a given list of arguments
-     *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClPool2dKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-public:
- PoolingLayerInfo _pool_info;
- DataLayout _data_layout;
- BorderSize _border_size;
- unsigned int _num_elems_processed_per_iteration;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_POOL2D_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp
deleted file mode 100644
index 48d351d536..0000000000
--- a/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClQuantizeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-
- // Output must always be initialized
- ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
-
- return Status{};
-}
-} // namespace
-
-void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- auto padding_info = get_padding_info({ src, dst });
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-
- const int vec_size_x = 16 / src->element_size();
- const int input_width_x = src->tensor_shape().x();
- const bool multi_access_x = (input_width_x / vec_size_x > 0);
-
- const UniformQuantizationInfo qinfo = dst->quantization_info().uniform();
- const DataType output_data_type = dst->data_type();
-
- float scale_to_apply = qinfo.scale;
- int32_t offset_to_apply = qinfo.offset;
- if(is_data_type_quantized_asymmetric(src->data_type()))
- {
- /*
-     * When requantizing a quantized input tensor to an output tensor with a different quantization,
-     * instead of applying a dequantization followed by a quantization function, we just compute the
-     * new scale and offset to apply.
- *
- * Assuming:
- * - q_i as input quantized value
- * - q_o as output quantized value
- * - z_i as input quantization offset value
- * - z_o as output quantization offset value
- * - s_i as input quantization scale value
- * - s_o as output quantization scale value
- * - z_n as new quantization offset value
- * - s_n as new quantization scale value
- *
- * q_o = ( q_i - z_i ) * s_i / s_o + z_o
- *
- * We can rewrite the formula as:
- *
- * q_o = ( q_i * s_i / s_o ) - z_i * s_i / s_o + z_o
- *
- * q_o = q_i / s_n + z_n
- *
- * Where:
- *
- * s_n = s_o / s_i
- *
- * z_n = - z_i * s_i / s_o + z_o
- *
- */
- const UniformQuantizationInfo qinfo_in = src->quantization_info().uniform();
- scale_to_apply /= qinfo_in.scale;
- // In order to minimize flooring we convert the offset to a float,
- // then compute the new offset in the float domain,
-    // finally we convert it back to int32_t
- offset_to_apply -= static_cast<int32_t>(static_cast<float>(qinfo_in.offset) * qinfo_in.scale / qinfo.scale);
- }
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOAT");
- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_to_apply));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_to_apply));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type));
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
- std::pair<int, int> min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type);
- build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first));
- build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second));
-
- _kernel = create_kernel(compile_context, "quantization_layer", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
- if(multi_access_x)
- {
- win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
- }
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
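
A worked instance of the scale/offset folding in configure() above (a minimal sketch; requantize is a hypothetical reference function, not library API):

#include <cstdint>

// With s_i = 0.5, z_i = 10 and s_o = 0.25, z_o = 3:
//   s_n = s_o / s_i = 0.5
//   z_n = -z_i * s_i / s_o + z_o = -20 + 3 = -17
// so q_o = q_i / 0.5 - 17, matching the SCALE and OFFSET build options.
int32_t requantize(int32_t q_i, float s_i, int32_t z_i, float s_o, int32_t z_o)
{
    const float   s_n = s_o / s_i;
    const int32_t z_n = z_o - static_cast<int32_t>(static_cast<float>(z_i) * s_i / s_o);
    return static_cast<int32_t>(static_cast<float>(q_i) / s_n) + z_n;
}
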
-Status ClQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
- return Status{};
-}
-
-void ClQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
- Window slice = window_collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
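
The LAST_ACCESSED_X boundary handling in configure() above can likewise be seen in isolation (hypothetical helper; 16 is the vector width in bytes used by the kernel):

#include <algorithm>

// With F32 data, vec_size_x = 16 / 4 = 4. For input_width_x = 10 the final
// vector access is clamped to start at max(10 - 4, 0) = 6, covering elements
// 6..9: it overlaps the previous access instead of reading out of bounds.
int last_accessed_x(int input_width_x, int vec_size_x)
{
    return std::max(input_width_x - vec_size_x, 0);
}
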
diff --git a/src/core/gpu/cl/kernels/ClQuantizeKernel.h b/src/core/gpu/cl/kernels/ClQuantizeKernel.h
deleted file mode 100644
index 8d37f33032..0000000000
--- a/src/core/gpu/cl/kernels/ClQuantizeKernel.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_QUANTIZE_KERNEL_H
-#define ARM_COMPUTE_CL_QUANTIZE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the quantization layer kernel.
- *
- * @note The implementation supports only 3D input tensors.
- */
-class ClQuantizeKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClQuantizeKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClQuantizeKernel);
- /** Set the input, output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
- *
- * @note Output auto initialization is not supported by this kernel
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClQuantizeKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_QUANTIZE_KERNEL_H */
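
These kernels share a validate/configure/run_op lifecycle; a hedged usage sketch (src_info, dst_info, src_tensor, dst_tensor, compile_context and queue are assumed to be set up elsewhere):

// Validate first, then configure; run_op consumes tensors via an ITensorPack.
ClQuantizeKernel kernel;
ARM_COMPUTE_ERROR_THROW_ON(ClQuantizeKernel::validate(src_info, dst_info));
kernel.configure(compile_context, src_info, dst_info);

ITensorPack pack;
pack.add_const_tensor(TensorType::ACL_SRC, src_tensor);
pack.add_tensor(TensorType::ACL_DST, dst_tensor);
kernel.run_op(pack, kernel.window(), queue);
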
diff --git a/src/core/gpu/cl/kernels/ClReshapeKernel.cpp b/src/core/gpu/cl/kernels/ClReshapeKernel.cpp
deleted file mode 100644
index 923b9cb264..0000000000
--- a/src/core/gpu/cl/kernels/ClReshapeKernel.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClReshapeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-
-#include <string>
-
-/** [ClReshapeKernel Kernel] **/
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
-
- if(dst->tensor_shape().total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size());
- }
-
- return Status{};
-}
-} // namespace
-
-void ClReshapeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-
- auto padding_info = get_padding_info({ src, dst });
-
- // Create kernel
- std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()) };
- _kernel = create_kernel(compile_context, "reshape_layer", build_opts);
-
- // Add static arguments
- const cl_int2 src_shape =
- {
- {
- static_cast<cl_int>(src->tensor_shape()[0]),
- static_cast<cl_int>(src->tensor_shape()[1])
- }
- };
- const cl_int2 dst_shape =
- {
- {
- static_cast<cl_int>(dst->tensor_shape()[0]),
- static_cast<cl_int>(dst->tensor_shape()[1])
- }
- };
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
- _kernel.setArg<cl_int2>(idx++, src_shape);
- _kernel.setArg<cl_int2>(idx++, dst_shape);
-
- // Configure kernel window
- Window win = calculate_max_window(*src);
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
-
- return Status{};
-}
-
-void ClReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = window_collapsed.first_slice_window_3D();
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
-    // Set tensor arguments
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, window_collapsed);
- add_3D_tensor_argument(idx, dst, window_collapsed);
- enqueue(queue, *this, slice, lws_hint());
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-/** [ClReshapeKernel Kernel] **/
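
The deleted kernel's contract is easy to state in host code: a reshape never reorders data, it only reinterprets coordinates, which is why only the leading 2D shapes are passed as static arguments. A minimal reference sketch (hypothetical, not library API):

#include <cstddef>

// Element i of the flattened src becomes element i of the flattened dst;
// validate() above guarantees the two total sizes match.
void reshape_reference(const float *src, float *dst, std::size_t total_elements)
{
    for(std::size_t i = 0; i < total_elements; ++i)
    {
        dst[i] = src[i];
    }
}
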
diff --git a/src/core/gpu/cl/kernels/ClReshapeKernel.h b/src/core/gpu/cl/kernels/ClReshapeKernel.h
deleted file mode 100644
index 0501b93f40..0000000000
--- a/src/core/gpu/cl/kernels/ClReshapeKernel.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_RESHAPE_KERNEL_H
-#define ARM_COMPUTE_CL_RESHAPE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the kernel to perform tensor reshaping */
-class ClReshapeKernel : public IClKernel
-{
-public:
- ClReshapeKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClReshapeKernel);
- /** Set the src and dst of the kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data type supported: All.
- * @param[out] dst Destination tensor info. Data type supported: Same as @p src
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref ClReshapeKernel
- *
- * @param[in] src Source tensor info. Data type supported: All
- * @param[in] dst Destination tensor info. Data type supported: Same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_RESHAPE_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.cpp b/src/core/gpu/cl/kernels/ClScaleKernel.cpp
deleted file mode 100644
index 7fb5d2a5d3..0000000000
--- a/src/core/gpu/cl/kernels/ClScaleKernel.cpp
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClScaleKernel.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Cast.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-inline std::pair<float, float> calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners)
-{
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the ratio between source width/height and destination width/height
- const unsigned int src_width = src->dimension(idx_width);
- const unsigned int src_height = src->dimension(idx_height);
- const unsigned int dst_width = dst->dimension(idx_width);
- const unsigned int dst_height = dst->dimension(idx_height);
-
- float wr = arm_compute::scale_utils::calculate_resize_ratio(src_width, dst_width, align_corners);
- float hr = arm_compute::scale_utils::calculate_resize_ratio(src_height, dst_height, align_corners);
-
- return std::make_pair(wr, hr);
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(dst == src);
- ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
-
- float wr = 0.f;
- float hr = 0.f;
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
- std::tie(wr, hr) = calculate_scale_factors(src, dst, data_layout, info.align_corners);
-
- ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (wr > 1.f || hr > 1.f));
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info, BorderSize &border)
-{
- Window win{};
- bool window_changed{};
- unsigned int num_elems_processed_per_iteration = 0;
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
-
- switch(data_layout)
- {
- case DataLayout::NCHW:
- {
- if(info.border_mode == BorderMode::UNDEFINED)
- {
- border = BorderSize(0);
- }
-
- num_elems_processed_per_iteration = 4;
- // Configure kernel window
- win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(src,
- -border.left, -border.top,
- src->dimension(0) + border.right,
- src->dimension(1) + border.bottom);
- AccessWindowHorizontal output_access(dst, 0, num_elems_processed_per_iteration);
-
- output_access.set_valid_region(win, calculate_valid_region_scale(*src,
- dst->tensor_shape(),
- info.interpolation_policy,
- info.sampling_policy,
- info.border_mode == BorderMode::UNDEFINED));
-
- window_changed = update_window_and_padding(win, input_access, output_access);
- }
- break;
- case DataLayout::NHWC:
- {
- // Configure kernel window
- win = calculate_max_window(*dst, Steps());
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-BorderSize ClScaleKernel::border_size() const
-{
- return BorderSize(static_cast<size_t>(_data_layout == DataLayout::NCHW));
-}
-
-Status ClScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, info));
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
- BorderSize border = BorderSize(static_cast<size_t>(data_layout == DataLayout::NCHW));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), info, border).first);
-
- return Status{};
-}
-
-void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, info));
- auto padding_info = get_padding_info({ src, dst });
-
- // Info required for the static tuning
- _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
-
- float wr = 0.f;
- float hr = 0.f;
- std::tie(wr, hr) = calculate_scale_factors(src, dst, _data_layout, info.align_corners);
- const bool call_quantized_kernel = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR;
-
- // Compute actual border size
- BorderSize border = border_size();
- const bool is_nhwc = _data_layout == DataLayout::NHWC;
-
- // Area interpolation behaves as Nearest Neighbour in case of up-sampling
- auto interpolation_policy_to_use = info.interpolation_policy;
- if(info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
- {
- interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR;
- }
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, dst, info, border);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
- build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
- build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
- build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(dst->dimension(2)));
- build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
- build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
- if(call_quantized_kernel)
- {
- const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
- build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
- }
- std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use);
- std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
- std::string kernel_name = "scale_" + interpolation_name;
- kernel_name += call_quantized_kernel ? "_quantized_" : "_";
- kernel_name += lower_string(string_from_data_layout(_data_layout));
-
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- if(is_nhwc)
- {
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
- }
-
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    const unsigned int src_width  = src->dimension(idx_width);
-    const unsigned int src_height = src->dimension(idx_height);
-
-    _kernel.setArg<float>(idx++, src_width);
-    _kernel.setArg<float>(idx++, src_height);
- _kernel.setArg<float>(idx++, wr);
- _kernel.setArg<float>(idx++, hr);
-
- // Set config_id for enabling LWS tuning
- _config_id = "scale_";
- _config_id += (info.border_mode == BorderMode::REPLICATE ? "Bord_rep" : "");
- _config_id += (info.sampling_policy == SamplingPolicy::CENTER ? "center" : "topleft");
- _config_id += (is_nhwc ? "nhwc" : "nchw");
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(3));
-}
-
-void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- {
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, src, slice);
- add_2D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
- break;
- }
- case DataLayout::NHWC:
- {
- Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_4D();
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, src, slice);
- add_4D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
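
The scale factors computed above follow the usual align-corners convention; a sketch of arm_compute::scale_utils::calculate_resize_ratio under that assumption:

// Ratios below 1 mean up-sampling, which is why AREA interpolation above is
// rejected when wr > 1 or hr > 1 and otherwise degenerates to NEAREST_NEIGHBOR.
float resize_ratio(unsigned int src_size, unsigned int dst_size, bool align_corners)
{
    return align_corners ? static_cast<float>(src_size - 1) / static_cast<float>(dst_size - 1)
                         : static_cast<float>(src_size) / static_cast<float>(dst_size);
}
// e.g. 10 -> 20 gives wr = 0.5 with align_corners = false (9/19 ≈ 0.47 with it).
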
diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.h b/src/core/gpu/cl/kernels/ClScaleKernel.h
deleted file mode 100644
index ad7632c713..0000000000
--- a/src/core/gpu/cl/kernels/ClScaleKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_SCALE_KERNEL_H
-#define ARM_COMPUTE_CL_SCALE_KERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the scale kernel */
-class ClScaleKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClScaleKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClScaleKernel);
-
- /** Initialise the kernel's inputs, output and interpolation policy
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
- * @param[out] dst Destination tensor info. Data types supported: Same as @p src
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to configure.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref ClScaleKernel
- *
- * @param[in] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to validate
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- DataLayout _data_layout = DataLayout::UNKNOWN;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_SCALE_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp
deleted file mode 100644
index 000c9ad04d..0000000000
--- a/src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClSoftmaxKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-/** Calculates softmax parameters from the quantized input scale and scaling factor for the exponent and places them as build options.
- *
- * Prepares these build options:
- * -INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT - quantized representation of beta multiplier.
- * -DIFF_MIN - threshold difference between maximum value of input data and current processed value,
- * it defines whether the value will be taken into account or not.
- *
- * @param[in] input_scale Input scaling factor
- * @param[in] beta        Exponent scaling factor beta
- *
- * @return The prepared build options
- */
-CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float beta)
-{
- // Number of integer bits in temporary fixed-point representation of current-to-max difference
- static const int scaled_diff_int_bits = 5;
- // Number of integer bits used in temporary fixed-point representation of exponent accumulator
- static const int exp_accumulation_in_bits = 12;
-
- const double beta_multiplier = std::min(
- 1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)),
- (1LL << 31) - 1.0);
- int input_beta_multiplier;
- int input_beta_left_shift;
- quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift);
-
- const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
- const int diff_min = -1.f * std::floor(max_input_rescaled);
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits));
- build_opts.add_option("-DEXP_ACCUMULATION_INT_BITS=" + support::cpp11::to_string(exp_accumulation_in_bits));
- build_opts.add_option("-DINPUT_BETA_MULTIPLIER=" + support::cpp11::to_string(input_beta_multiplier));
- build_opts.add_option("-DINPUT_BETA_LEFT_SHIFT=" + support::cpp11::to_string(input_beta_left_shift));
- build_opts.add_option("-DDIFF_MIN=" + support::cpp11::to_string(diff_min));
-
- return build_opts;
-}
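
As a worked instance of the above (assuming the gemmlowp-style Q31 convention of calculate_quantized_multiplier_greater_than_one): with input_scale = 1/256 and beta = 1, beta_multiplier = (1/256) * 2^(31-5) = 2^18 = 262144, encoded as INPUT_BETA_MULTIPLIER = 2^30 with INPUT_BETA_LEFT_SHIFT = 19 (since 2^30 * 2^19 / 2^31 = 2^18). Then max_input_rescaled = ((2^5)-1) * 2^(31-5) / 2^19 = 31 * 128 = 3968, so DIFF_MIN = -3968: any value further than that below the row maximum contributes nothing to the sum.
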
-
-Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max);
-
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
-
- // Checks performed when output is configured
- if(dst.total_size() != 0)
- {
- if(is_quantized_asymmetric)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
- }
-
- // Checks performed when sum is configured
- if(sum.total_size() != 0)
- {
- if(is_quantized_asymmetric)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&sum, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&max, &sum);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&max, &sum);
- }
-
- return Status{};
-}
-
-Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &sum);
- ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && !is_data_type_float(info.input_data_type));
-
- // Note: output should always have a scale of 1/256 and offset 0
- const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
-
- // Checks performed when output is configured
- if(dst.total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
- if(!is_quantized_asymmetric)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != allowed_quantization_info);
- }
- }
-
- return Status{};
-}
-} // namespace
-
-/**< Grid size (obtained through auto-tuning) */
-const unsigned int ClLogits1DMaxShiftExpSumKernel::_grid_size = 64;
-/**< Vector size in the serial case (obtained through auto-tuning) */
-const unsigned int ClLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8;
-/**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost) .*/
-const unsigned int ClLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4;
-
-void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info)
-{
- auto padding_info = get_padding_info({ &src, &max, &dst, &sum });
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(sum, src.clone()->set_tensor_shape(max.tensor_shape()));
- auto_init_if_empty(dst, *src.clone());
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum));
-
- const DataType dt = src.data_type();
- const UniformQuantizationInfo qinfo = src.quantization_info().uniform();
- const size_t reduction_dim_size = src.dimension(0);
- const float beta = info.beta;
- const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type);
- const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
-
- ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
- const unsigned int vector_size = adjust_vec_size(std::get<1>(parallel_reduction_info), reduction_dim_size);
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
- build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value));
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(reduction_dim_size));
- build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(reduction_dim_size % vector_size));
- build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
- build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");
- build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED");
- build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX");
- build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX")));
- build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
-
- cl::NDRange lws_hint(cl::NullRange);
- std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "");
-
- // Configure parallel kernel if needed
- if(std::get<0>(parallel_reduction_info))
- {
- kernel_name += "parallel";
- bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
- build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));
-
- // Handle boundary conditions.
- const unsigned int multiple_grid_size = (reduction_dim_size / vector_size) % _grid_size;
- build_opts.add_option_if((multiple_grid_size != 0) || ((reduction_dim_size % vector_size) != 0), "-DNON_MULTIPLE_OF_GRID_SIZE");
-        // Setting _lws_hint in this way also communicates grid_size to ClLogits1DMaxShiftExpSumKernel::run_op().
- // A single workgroup performs reduction in dimension 0 in the parallel case, hence lws[0]==gws[0].
- lws_hint = cl::NDRange(_grid_size);
- }
- else
- {
- kernel_name += "serial";
- }
-
- // Create kernel.
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure window
- Window win = calculate_max_window(src, Steps(reduction_dim_size));
- IClKernel::configure_internal(win, lws_hint);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum));
- return Status{};
-}
-
-ClLogits1DMaxShiftExpSumKernel::ParallelReductionInfo ClLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size)
-{
- bool is_parallel_reduction = (size >= (_grid_size * _serial_vector_size)) && (_grid_size > 1);
- unsigned int vector_size = is_parallel_reduction ? _parallel_vector_size : _serial_vector_size;
- return std::make_tuple(is_parallel_reduction, vector_size);
-}
-
-void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- auto max = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_0));
- auto sum = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_1));
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, max, sum);
-
- // Collapse window in Z dimension
- Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
-
- // Reconfigure window in case of parallel reduction
- ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(src->info()->dimension(0));
- if(std::get<0>(parallel_reduction_info))
- {
- // Launch grid_size parallel work items
- window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1));
- }
-
- // Get slices
- Window slice = window_collapsed.first_slice_window_3D();
- do
- {
- unsigned int idx = 0;
- // Set inputs
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, max, slice);
- add_3D_tensor_argument(idx, dst, slice);
- add_3D_tensor_argument(idx, sum, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
-
-void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info)
-{
- auto padding_info = get_padding_info({ &src, &dst, &sum });
-
- // Note: output should always have a scale of 1/256 and offset 0
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
- const DataType output_data_type = info.input_data_type;
- const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
- const UniformQuantizationInfo qinfo = src.quantization_info().uniform();
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(dst, src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(src, sum, dst, info));
-
- const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type);
- const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
- const unsigned int vector_size = adjust_vec_size(16, src.dimension(0));
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(info.input_data_type));
- build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value));
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(src.dimension(0) % vector_size));
- build_opts.add_option_if(is_data_type_quantized_asymmetric_signed(info.input_data_type), "-DQASYMM8_SIGNED");
- build_opts.add_options_if(is_quantized_asymmetric,
- prepare_quantized_softmax_build_options(qinfo.scale, info.beta).options());
- build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX");
-
- // Create kernel
- std::string kernel_name = std::string("softmax_layer_norm") + (is_quantized_asymmetric ? "_quantized" : "");
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure window
- auto win = calculate_max_window(src, Steps(vector_size));
- ICLKernel::configure_internal(win);
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClLogits1DNormKernel::validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(src, sum, dst, info));
-
- return Status{};
-}
-
-void ClLogits1DNormKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- auto sum = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_0));
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, sum);
-
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = window_collapsed.first_slice_window_3D();
-
- do
- {
- Window sum_slice = slice;
- sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- unsigned int idx = 0;
- // Set inputs
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, sum, sum_slice);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
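The normalisation step above divides each exponentiated, max-shifted logit by the row sum, or subtracts log(sum) when -DLOG_SOFTMAX is set. A minimal host-side sketch of that per-row math, assuming the preceding MaxShiftExpSum pass left exp(x - max) per element (or x - max on the log path) plus the row sum; this is an illustration, not the library's API:

    #include <cmath>
    #include <cstddef>

    // v[x] holds exp(x - max) for softmax, or (x - max) for log-softmax (assumption).
    void softmax_norm_row(const float *v, float sum, float *dst, std::size_t width, bool is_log)
    {
        for(std::size_t x = 0; x < width; ++x)
        {
            dst[x] = is_log ? v[x] - std::log(sum) // log-softmax: subtract the log of the sum
                            : v[x] / sum;          // softmax: divide by the sum
        }
    }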
diff --git a/src/core/gpu/cl/kernels/ClSoftmaxKernel.h b/src/core/gpu/cl/kernels/ClSoftmaxKernel.h
deleted file mode 100644
index af980eaa8e..0000000000
--- a/src/core/gpu/cl/kernels/ClSoftmaxKernel.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H
-#define ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for max, shifting, exponentiating and summing the logits */
-class ClLogits1DMaxShiftExpSumKernel : public IClKernel
-{
- /**< Grid size (obtained through auto-tuning) */
- static const unsigned int _grid_size;
- /**< Vector size in the serial case (obtained through auto-tuning) */
- static const unsigned int _serial_vector_size;
- /**< Vector size in the parallel case (obtained through auto-tuning; enables the best memory access pattern for Bifrost). */
- static const unsigned int _parallel_vector_size;
-
-public:
- /** Info for whether a parallel reduction will be run and the vector size of the execution. */
- using ParallelReductionInfo = std::tuple<bool, unsigned int>;
-
- /** Default constructor */
- ClLogits1DMaxShiftExpSumKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DMaxShiftExpSumKernel);
- /** Configure the kernel using the given information about tensors
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in,out] max Max values tensor. Data types supported: same as @p src
- * @param[out] dst Destination tensor. Data types supported: same as @p src
- * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p src
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref ClLogits1DMaxShiftExpSumKernel
- *
- * @param[in] src Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] max Max values tensor. Data types supported: same as @p src
- * @param[in] dst Destination tensor. Data types supported: same as @p src
- * @param[in] sum Sum of 1D logits tensor. Data types supported: same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum);
- /** Checks if the given size is eligible for parallel reduction
- *
- * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size).
- * @note Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4.
- *
- * @param[in] size Size to check
- *
- * @return A two-element tuple where the first element is a boolean specifying if a parallel reduction will be run,
- * while the second element is the vector size of the execution.
- */
- static ParallelReductionInfo is_parallel_reduction(size_t size);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
-};
-
-/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
-class ClLogits1DNormKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClLogits1DNormKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DNormKernel);
-
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F16/F32 are supported.
- * @param[in] sum Sum tensor. Dimensions should be dim(src)-1. Data types supported: same as @p src
- * @param[out] dst Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p src, or same as @p src
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref ClLogits1DNormKernel
- *
- * @param[in] src Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F16/F32 are supported.
- * @param[in] sum Sum tensor. Dimensions should be dim(src)-1. Data types supported: same as @p src
- * @param[in] dst Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p src, or same as @p src
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H */
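The notes on is_parallel_reduction() above pin down its behaviour: serial reduction for width < _grid_size * _serial_vector_size, otherwise parallel with the vector size forced to 4. A sketch of that decision under those documented rules; the concrete constants here are placeholders, since the real values are obtained through auto-tuning:

    #include <cstddef>
    #include <tuple>

    using ParallelReductionInfo = std::tuple<bool, unsigned int>;

    // Placeholder values (assumption); the library's constants are auto-tuned.
    constexpr unsigned int grid_size          = 64;
    constexpr unsigned int serial_vector_size = 8;

    ParallelReductionInfo is_parallel_reduction_sketch(std::size_t size)
    {
        const bool         is_parallel = size >= static_cast<std::size_t>(grid_size) * serial_vector_size;
        const unsigned int vector_size = is_parallel ? 4u : serial_vector_size; // parallel path forces 4
        return std::make_tuple(is_parallel, vector_size);
    }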
diff --git a/src/core/gpu/cl/kernels/ClTransposeKernel.cpp b/src/core/gpu/cl/kernels/ClTransposeKernel.cpp
deleted file mode 100644
index 704d0152cf..0000000000
--- a/src/core/gpu/cl/kernels/ClTransposeKernel.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClTransposeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-void ClTransposeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Output auto initialization if not yet initialized
- const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
-
- ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst));
- auto padding_info = get_padding_info({ src, dst });
-
- // Create kernel
- const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0));
- const int vec_size_x_leftovers = src->dimension(0) % vec_size_x;
- const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1));
- const int vec_size_y_leftovers = src->dimension(1) % vec_size_y;
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE_IN_BYTES=" + support::cpp11::to_string(src->element_size()));
- build_opts.add_option("-DVEC_SIZE_X=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER_X=" + support::cpp11::to_string(vec_size_x_leftovers));
- build_opts.add_option("-DVEC_SIZE_Y=" + support::cpp11::to_string(vec_size_y));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER_Y=" + support::cpp11::to_string(vec_size_y_leftovers));
-
- _kernel = create_kernel(compile_context, "transpose", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps(vec_size_x, vec_size_y));
- ICLKernel::configure_internal(win, cl::NDRange(2, 8));
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 2, "Only src tensors with up to 2 dimensions are supported");
-
- // Validate configured dst
- if(dst->total_size() != 0)
- {
- const TensorInfo dst_info = src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &dst_info);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- return Status{};
-}
-
-void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, src, slice);
- add_2D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
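configure() above derives VEC_SIZE_X/Y via adjust_vec_size() and passes the remainders as VEC_SIZE_LEFTOVER_X/Y so the kernel can guard its last, partial vector access. A sketch of the assumed semantics (halve the preferred width until it fits the dimension; the real helper may differ in detail):

    // Assumed behaviour of adjust_vec_size(): shrink the preferred vector width
    // until it no longer exceeds the dimension.
    unsigned int adjust_vec_size_sketch(unsigned int preferred, unsigned int dim)
    {
        unsigned int vec = preferred;
        while(vec > 1 && vec > dim)
        {
            vec /= 2;
        }
        return vec;
    }
    // e.g. vec_size_x          = adjust_vec_size_sketch(max_cl_vector_width / element_size, width);
    //      vec_size_x_leftover = width % vec_size_x; // becomes -DVEC_SIZE_LEFTOVER_X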
diff --git a/src/core/gpu/cl/kernels/ClTransposeKernel.h b/src/core/gpu/cl/kernels/ClTransposeKernel.h
deleted file mode 100644
index 21d4fd41f5..0000000000
--- a/src/core/gpu/cl/kernels/ClTransposeKernel.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H
-#define ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** OpenCL kernel to transpose a 2D tensor. */
-class ClTransposeKernel : public IClKernel
-{
-public:
- ClTransposeKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClTransposeKernel);
- /** Set the src and dst of the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src The src tensor info. Data types supported: All.
- * @param[in] dst The dst tensor info. Data types supported: Same as @p src
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClTransposeKernel
- *
- * @param[in] src The src tensor info. Data types supported: All.
- * @param[in] dst The dst tensor info. Data types supported: same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H */
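For reference, the element mapping the removed kernel implements is a plain index swap, dst(y, x) = src(x, y). A self-contained C++ sketch over a row-major buffer:

    #include <cstddef>
    #include <vector>

    // Transpose a height x width row-major matrix into a width x height one.
    std::vector<float> transpose_2d(const std::vector<float> &src, std::size_t width, std::size_t height)
    {
        std::vector<float> dst(width * height);
        for(std::size_t y = 0; y < height; ++y)
        {
            for(std::size_t x = 0; x < width; ++x)
            {
                dst[x * height + y] = src[y * width + x]; // swap row and column indices
            }
        }
        return dst;
    }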
diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
deleted file mode 100644
index 9f970719ed..0000000000
--- a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/tensor_info.h"
-#include "support/Cast.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
- ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0));
-
- for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i));
- }
- ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4);
-
- return Status{};
-}
-} // namespace
-
-Status ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst));
- return Status{};
-}
-
-void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst));
-
- auto padding_info = get_padding_info({ src1, src2, dst });
-
- const unsigned int min_dimension = std::min(src1->dimension(0), src2->dimension(0));
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension);
- const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration;
-
- // Add build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
- build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src1->dimension(2)));
- build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(src1->dimension(0)));
- build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(src2->dimension(0)));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size()));
- build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
-
- // If the inputs have different quantization info, set the quantization parameters needed for the re-quantization process
- const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2);
- if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
- {
- const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
- build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset));
- build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "concatenate_width_x2", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-
- // Set config_id for enabling LWS tuning
- _config_id = "concatenate_width_x2_";
- _config_id += lower_string(string_from_data_type(src1->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src1->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src1->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src2->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src2->dimension(1));
-}
-
-void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window slice = window.first_slice_window_4D();
-
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, src0, slice);
- add_4D_tensor_argument(idx, src1, slice);
- add_4D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, window, lws_hint());
- }
- while(window.slide_window_slice_4D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
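When the sources and destination carry different quantization parameters, the OFFSET_*/SCALE_* options above drive a per-element re-quantization: dequantize with the input scale/offset, then quantize with the output ones. A hedged scalar reference for the QASYMM8 case (the rounding mode here is an assumption):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    std::uint8_t requantize_qasymm8(std::uint8_t in, float scale_in, int offset_in,
                                    float scale_out, int offset_out)
    {
        const float real = (static_cast<int>(in) - offset_in) * scale_in;                // dequantize
        const int   q    = static_cast<int>(std::lround(real / scale_out)) + offset_out; // requantize
        return static_cast<std::uint8_t>(std::clamp(q, 0, 255));                         // saturate
    }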
diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
deleted file mode 100644
index ddade29113..0000000000
--- a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H
-#define ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the width concatenate kernel of 2 tensors.
- * The src1 and src2 tensors will be concatenated into the dst tensor.
- */
-class ClWidthConcatenate2TensorsKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClWidthConcatenate2TensorsKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate2TensorsKernel);
- /** Initialise the kernel's sources and destination
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 First source tensor info. Data types supported: All.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1
- * @param[out] dst Destination tensor info. Data types supported: Same as @p src1.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClWidthConcatenate2TensorsKernel
- *
- * @param[in] src1 First tensor info. Data types supported: All.
- * @param[in] src2 Second tensor info. Data types supported: same as @p src1
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src1.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
deleted file mode 100644
index 281d190381..0000000000
--- a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/helpers/tensor_info.h"
-#include "support/Cast.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
- ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > dst->dimension(0));
-
- for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(src3->dimension(i) != dst->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(src4->dimension(i) != dst->dimension(i));
- }
- ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4);
-
- return Status{};
-}
-} // namespace
-
-ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel()
-{
-}
-
-Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst));
- return Status{};
-}
-
-void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context,
- ITensorInfo *src1, ITensorInfo *src2,
- ITensorInfo *src3, ITensorInfo *src4,
- ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst));
-
- auto padding_info = get_padding_info({ src1, src2, src3, src4, dst });
- const unsigned int min_dimension = std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0)));
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension);
- const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration;
-
- // Add build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
- build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src1->dimension(2)));
- build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(src1->dimension(0)));
- build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(src2->dimension(0)));
- build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(src3->dimension(0)));
- build_opts.add_option("-DINPUT4_WIDTH=" + support::cpp11::to_string(src4->dimension(0)));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size()));
- build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
- build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
- build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + src3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
-
- // If the sources have different quantization info, set the quantization parameters needed for the re-quantization process
- const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4);
- if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
- {
- const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
- const UniformQuantizationInfo iq3_info = src3->quantization_info().uniform();
- const UniformQuantizationInfo iq4_info = src4->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
- build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset));
- build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
- build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(iq3_info.offset));
- build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(iq3_info.scale));
- build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(iq4_info.offset));
- build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(iq4_info.scale));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "concatenate_width_x4", build_opts.options());
-
- // Configure kernel window
- Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-
- // Set config_id for enabling LWS tuning
- _config_id = "concatenate_width_x4_";
- _config_id += lower_string(string_from_data_type(src1->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src1->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src1->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src2->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src2->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src3->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src3->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src4->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src4->dimension(1));
-}
-
-void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
- const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2));
- const auto src3 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- Window slice = window.first_slice_window_4D();
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, src0, slice);
- add_4D_tensor_argument(idx, src1, slice);
- add_4D_tensor_argument(idx, src2, slice);
- add_4D_tensor_argument(idx, src3, slice);
- add_4D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, window, lws_hint());
- }
- while(window.slide_window_slice_4D(slice));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
deleted file mode 100644
index 19bda65902..0000000000
--- a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H
-#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the width concatenate kernel of 4 tensors.
- * All source tensors will be concatenated into the destination tensor.
- */
-class ClWidthConcatenate4TensorsKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClWidthConcatenate4TensorsKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate4TensorsKernel);
- /** Initialise the kernel's sources and destination
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 First source tensor info. Data types supported: All.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1
- * @param[in] src3 Third source tensor info. Data types supported: same as @p src1
- * @param[in] src4 Fourth source tensor info. Data types supported: same as @p src1
- * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *src3, ITensorInfo *src4, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClWidthConcatenate4TensorsKernel
- *
- * @param[in] src1 First tensor info. Data types supported: All.
- * @param[in] src2 Second tensor info. Data types supported: same as @p src1
- * @param[in] src3 Third tensor info. Data types supported: same as @p src1
- * @param[in] src4 Fourth tensor info. Data types supported: same as @p src1
- * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
deleted file mode 100644
index d188a5226b..0000000000
--- a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0));
-
- for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
- }
- ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4);
-
- return Status{};
-}
-} // namespace
-
-ClWidthConcatenateKernel::ClWidthConcatenateKernel()
-{
-}
-
-Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst));
- return Status{};
-}
-
-void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst));
-
- auto padding_info = get_padding_info({ src, dst });
-
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, src->dimension(0));
-
- // Add build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
- build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset));
- build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src->dimension(2)));
-
- if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
- {
- const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
-
- build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iqinfo.offset));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oqinfo.offset));
- build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iqinfo.scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
- }
-
- // Create kernel
- _kernel = create_kernel(compile_context, "concatenate_width", build_opts.options());
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
- ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-void ClWidthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, src, window);
- add_4D_tensor_argument(idx, dst, window);
- enqueue(queue, *this, window, lws_hint());
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
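Logically the kernel copies its source into the destination shifted by WIDTH_OFFSET along X; validate() above guarantees that src->dimension(0) + width_offset fits in the destination. A one-row sketch:

    #include <cstddef>

    // Copy one source row into the destination starting at width_offset on the X axis.
    void concat_width_row(const float *src, float *dst, std::size_t src_width, std::size_t width_offset)
    {
        for(std::size_t x = 0; x < src_width; ++x)
        {
            dst[width_offset + x] = src[x];
        }
    }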
diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h
deleted file mode 100644
index 6bc8e57a08..0000000000
--- a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_LAYER_KERNEL_H
-#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_LAYER_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the width concatenate kernel.
- * The source tensor will be concatenated into the destination tensor.
- */
-class ClWidthConcatenateKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClWidthConcatenateKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenateKernel);
- /** Initialise the kernel's source and destination
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[in] width_offset The offset on the X axis.
- * @param[in,out] dst Destination tensor info. Data types supported: same as @p src.
- *
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClWidthConcatenateKernel
- *
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[in] width_offset The offset on the X axis.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_LAYER_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
deleted file mode 100644
index 381b4bcae9..0000000000
--- a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
- const Size2D kernel_size = winograd_info.kernel_size;
- const Size2D output_tile_size = winograd_info.output_tile_size;
-
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported");
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_UNUSED(output);
-
- const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1;
- const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
- const unsigned int num_elems_read_per_iteration_z = input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2);
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z));
- Window win_collapsed = win.collapse(win, Window::DimZ);
- return std::make_pair(Status{}, win_collapsed);
-}
-} // namespace
-
-void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info)));
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info));
- auto padding_info = get_padding_info({ src, dst });
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(src->dimension(2)));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL");
- build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_FILTER_TRANSFORM_VERTICAL");
- const Size2D kernel_size = winograd_info.kernel_size;
- const Size2D output_tile_size = winograd_info.output_tile_size;
-
- // Create kernel
- std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout()));
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- IClKernel::configure_internal(win_config.second);
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
-
- return Status{};
-}
-
-void ClWinogradFilterTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window);
-
- auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- // Setup output window
- Window window_out;
- window_out.use_tensor_dimensions(dst->info()->tensor_shape(), 0);
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, src, window);
- add_3D_tensor_argument(idx, dst, window_out);
- enqueue(queue, *this, window, lws_hint());
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
deleted file mode 100644
index 2bc2ceb36e..0000000000
--- a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H
-#define ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the Winograd filter transform kernel. */
-class ClWinogradFilterTransformKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClWinogradFilterTransformKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradFilterTransformKernel);
- /** Set the input and output tensor.
- *
- * @note Winograd filter transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd filter transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
- * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p src
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClWinogradFilterTransformKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H */
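A minimal configure/validate sketch for the filter transform kernel deleted above; the F(4x4, 3x3) NHWC configuration, the tensor shapes, and the use of the default compile context are illustrative assumptions, not taken from this diff:

    // Hypothetical usage sketch (Arm Compute Library headers assumed).
    using namespace arm_compute;

    const WinogradInfo winograd_info(Size2D(4U, 4U),            // output tile size
                                     Size2D(3U, 3U),            // kernel size
                                     Size2D(224U, 224U),        // input dimensions
                                     PadStrideInfo(1, 1, 1, 1), // unit strides only
                                     DataLayout::NHWC);

    // NHWC weights: [IFM, kernel_x, kernel_y, OFM]
    TensorInfo src(TensorShape(64U, 3U, 3U, 128U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    TensorInfo dst; // shape auto-initialized by configure() via compute_winograd_filter_transform_shape()

    if(bool(opencl::kernels::ClWinogradFilterTransformKernel::validate(&src, &dst, winograd_info)))
    {
        opencl::kernels::ClWinogradFilterTransformKernel kernel;
        kernel.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, winograd_info);
    }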
diff --git a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
deleted file mode 100644
index 17f0eb9e2c..0000000000
--- a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D kernel_size = winograd_info.kernel_size;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported");
-
- ARM_COMPUTE_UNUSED(conv_info);
- ARM_COMPUTE_UNUSED(output_tile_size);
- ARM_COMPUTE_UNUSED(kernel_size);
-
- // Validate configured output
- if(output->total_size() != 0)
- {
- const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_UNUSED(output);
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- bool window_changed = false;
- Window win = calculate_max_window(*input, Steps(1, 1));
-
- if(input->data_layout() == DataLayout::NCHW)
- {
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D kernel_size = winograd_info.kernel_size;
-
- unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1;
- unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1;
-
- AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
- window_changed = update_window_and_padding(win, input_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-ClWinogradInputTransformKernel::ClWinogradInputTransformKernel()
- : _border_size(0), _data_layout(DataLayout::UNKNOWN), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
-{
-}
-
-BorderSize ClWinogradInputTransformKernel::border_size() const
-{
- return _border_size;
-}
-
-void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info));
-
- auto padding_info = get_padding_info({ src, dst });
-
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D kernel_size = winograd_info.kernel_size;
-
- _data_layout = src->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the number of output tiles along the x and y direction of size "output_tile_size"
- const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(src->dimension(idx_w), src->dimension(idx_h)),
- kernel_size,
- output_tile_size,
- conv_info);
-
- _num_tiles_x = num_tiles.width;
- _num_tiles_y = num_tiles.height;
-
- const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape));
-
- ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast<int>(dst->dimension(1)));
- const size_t total_batches = src->tensor_shape().total_size_upper(3);
-
- CLBuildOptions build_opts;
- if(_data_layout == DataLayout::NHWC)
- {
- build_opts.add_option("-DNHWC");
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_w)));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_h)));
- build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
- build_opts.add_option("-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y));
- build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
- build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
- build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
- }
- else
- {
- build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
- build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
- build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
- build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
- build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2)));
- }
-
- // Create kernel
- std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
-
- // Get the maximum dimension from the tile size
- const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
-
- // Check optimized kernel if output_dims == 2x2
- if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
- {
- _step_z = (src->dimension(2) % 2) != 0 ? 1 : 2;
- }
-
- // Append stepz and data layout
- kernel_name += "_stepz";
- kernel_name += support::cpp11::to_string(_step_z);
- kernel_name += "_" + lower_string(string_from_data_layout(_data_layout));
-
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Create window and update padding
- auto win_config = validate_and_configure_window(src, dst, winograd_info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- IClKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8));
-
- _border_size = BorderSize(src->padding());
-
- ARM_COMPUTE_ERROR_ON((src->data_layout() == DataLayout::NHWC) && has_padding_changed(padding_info));
-
- _config_id = kernel_name;
- _config_id += support::cpp11::to_string(src->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src->dimension(2));
- _config_id += "_";
- _config_id += support::cpp11::to_string(conv_info.pad_left());
- _config_id += "_";
- _config_id += support::cpp11::to_string(conv_info.pad_top());
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(_data_layout));
-}
-
-Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first);
- return Status{};
-}
-
-void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
- const size_t total_batches = window.shape().total_size_upper(3);
-
- // Collapse window
- Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
-
- if(_data_layout == DataLayout::NHWC)
- {
- Window slice = window_collapsed.first_slice_window_3D();
- slice.set(1, Window::Dimension(0, _num_tiles_x * _num_tiles_y, 1));
- slice.set(2, Window::Dimension(0, total_batches, 1));
-
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, src, slice);
- add_4D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- else
- {
- Window slice = window_collapsed.first_slice_window_3D();
- slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1));
- slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1));
-
- ARM_COMPUTE_ERROR_ON(((slice[idx_c].end() - slice[idx_c].start()) % _step_z) != 0);
- slice.set(idx_c, Window::Dimension(slice[idx_c].start(), slice[idx_c].end(), _step_z));
-
- unsigned int idx = 2 * num_arguments_per_3D_tensor();
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src->info()->strides_in_bytes()[3]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[3]));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
-
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
- }
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
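The configure() above asserts that _num_tiles_x * _num_tiles_y equals dst dimension 1. A sketch of that tile arithmetic for unit strides, assuming the standard convolution output-size formula (the helper name and the 224x224 example are illustrative):

    // Tiles per spatial dimension for a unit-stride Winograd convolution.
    size_t winograd_num_tiles(size_t input, size_t kernel, size_t tile, size_t pad_begin, size_t pad_end)
    {
        const size_t conv_out = input + pad_begin + pad_end - kernel + 1; // stride == 1
        return (conv_out + tile - 1) / tile;                              // ceil(conv_out / tile)
    }

    // Example: 224x224 input, 3x3 kernel, F(4x4, 3x3), padding (1, 1):
    // conv_out = 224 + 1 + 1 - 3 + 1 = 224, so ceil(224 / 4) = 56 tiles per dimension,
    // and dst dimension 1 must hold 56 * 56 = 3136 tiles.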
diff --git a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h b/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h
deleted file mode 100644
index 76b45279a4..0000000000
--- a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H
-#define ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** OpenCL kernel to perform Winograd input transform.*/
-class ClWinogradInputTransformKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClWinogradInputTransformKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradInputTransformKernel);
- /** Set the input and output of the kernel.
- *
- * @note Winograd input transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd input transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src The input tensor info to transform. Data types supported: F16/F32
- * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p src
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClWinogradInputTransformKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
- BorderSize _border_size;
- DataLayout _data_layout;
- int _num_tiles_x;
- int _num_tiles_y;
- unsigned int _step_z;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
deleted file mode 100644
index a6c05420ed..0000000000
--- a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-
- ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != winograd_info.output_data_layout);
-
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D kernel_size = winograd_info.kernel_size;
- const Size2D input_dimensions = winograd_info.input_dimensions;
- const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels");
-
- // Compute number of elements to process in the X and Y direction
- // Compute the number of output tiles along the x and y direction of size "output_tile_size"
- const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions,
- kernel_size,
- output_tile_size,
- conv_info);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles.area())));
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
- }
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_UNUSED(bias);
-
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- bool window_changed = false;
-
- if(output->data_layout() == DataLayout::NCHW)
- {
- const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width);
- const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height);
-
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
- AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y);
- window_changed = update_window_and_padding(win, input_access, output_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info)));
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, winograd_info, act_info));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, bias, dst, winograd_info.output_tile_size);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- IClKernel::configure_internal(win_config.second);
-
- auto padding_info = get_padding_info({ src, bias, dst });
-
- _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC;
-
- // Compute num_tiles_x
- const Size2D input_dimensions = winograd_info.input_dimensions;
- const Size2D kernel_size = winograd_info.kernel_size;
- const Size2D output_tile_size = winograd_info.output_tile_size;
- const PadStrideInfo conv_info = winograd_info.convolution_info;
- const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the number of output tiles along the x and y direction of size "output_tile_size"
- const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions,
- kernel_size,
- output_tile_size,
- conv_info);
- const size_t total_batches = dst->tensor_shape().total_size_upper(3);
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
- build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
-
- if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2))
- {
- build_opts.add_option("-DVEC_SIZE=2");
- }
- else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4))
- {
- build_opts.add_option("-DVEC_SIZE=4");
- }
-
- build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS"));
- build_opts.add_option("-cl-fast-relaxed-math");
- build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
- build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width));
- build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
- build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1)));
- build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(idx_width)));
- build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height)));
- build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2)));
- build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
- build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
-
- // Create kernel
- std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout));
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += lower_string(string_from_data_type(src->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(src->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(dst->dimension(1));
- _config_id += "_";
- _config_id += lower_string(string_from_data_layout(winograd_info.output_data_layout));
-
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc);
-}
-
-Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), dst->clone().get(), winograd_info.output_tile_size).first);
- return Status{};
-}
-
-void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window);
-
- auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- // Collapse window
- Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
-
- // Get initial windows
- Window slice = window_collapsed.first_slice_window_4D();
- slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- // Setup output slice
- Window slice_out(slice);
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- if(bias != nullptr)
- {
- unsigned int idx1 = 2 * num_arguments_per_4D_tensor();
- Window slice_biases;
- slice_biases.use_tensor_dimensions(bias->info()->tensor_shape());
- add_1D_tensor_argument(idx1, bias, slice_biases);
- }
-
- if(_is_nhwc)
- {
- unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
- _kernel.setArg(idx2, static_cast<int>(dst->info()->total_size() - dst->info()->strides_in_bytes().y()));
- }
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, src, slice);
- add_4D_tensor_argument(idx, dst, slice_out);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
-}
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
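validate_arguments() above requires src dimension 2 to match the number of Winograd transform channels, (K_w + T_w - 1) * (K_h + T_h - 1). A small worked instance of that relationship (the F(4x4, 3x3) values are illustrative):

    // F(4x4, 3x3): each tile is transformed on (3 + 4 - 1) x (3 + 4 - 1) = 6x6 points,
    // so the output transform expects 36 input channels.
    constexpr unsigned int kernel_w = 3, kernel_h = 3;
    constexpr unsigned int tile_w   = 4, tile_h   = 4;
    constexpr unsigned int num_channels = (kernel_w + tile_w - 1) * (kernel_h + tile_h - 1);
    static_assert(num_channels == 36, "F(4x4, 3x3) uses a 6x6 transform per tile");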
diff --git a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h b/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
deleted file mode 100644
index 48b27e658c..0000000000
--- a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H
-#define ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/common/Macros.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-/** Interface for the Winograd output transform kernel. */
-class ClWinogradOutputTransformKernel : public IClKernel
-{
-public:
- /** Default constructor */
- ClWinogradOutputTransformKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradOutputTransformKernel);
- /** Set the input and output tensor.
- *
- * @note Winograd output transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd output transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info with shape [C, N, K, batches]. Data types supported: F16/F32.
- * @param[in] bias Biases tensor info. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: Same as @p src
- * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p src
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClWinogradOutputTransformKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-
-private:
- using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
- bool _is_nhwc{ false };
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
deleted file mode 100644
index 7866ccb679..0000000000
--- a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
- bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image)
-{
- ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0);
- v0 = std::max(std::min(static_cast<int>(m / m0), static_cast<int>(v0)), static_cast<int>(1));
- h0 = std::max(std::min(static_cast<int>(n / n0), static_cast<int>(h0)), static_cast<int>(1));
-
- const GEMMLHSMatrixInfo lhs_info(m0, k0, v0, lhs_transpose, lhs_interleave);
- const GEMMRHSMatrixInfo rhs_info(n0, k0, h0, rhs_transpose, rhs_interleave, export_to_cl_image);
-
- return std::make_pair(lhs_info, rhs_info);
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
- unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
- const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type);
- const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second);
- const TensorInfo tensor_reshaped_info(shape, 1, data_type);
-
- if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second)))
- {
- return info_img;
- }
- else
- {
- return info_buf;
- }
-}
-
-void update_padding_for_cl_image(ITensorInfo *tensor)
-{
- constexpr unsigned int num_floats_per_pixel = 4;
-
- const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size();
- const unsigned int pixel_alignment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device());
-
- ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment");
- if(pixel_alignment == 0)
- {
- return;
- }
-
- const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel;
- const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
- const unsigned int padding = round_up_width - stride_y_in_elements;
-
- tensor->extend_padding(PaddingSize(0, tensor->padding().right + padding, 0, 0));
-}
-
-Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info)
-{
- if(rhs_info.export_to_cl_image)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.n0 == 2) || (rhs_info.n0 == 3), "Export to cl_image only supported with n0 = 4, 8 or 16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 == 2) || (rhs_info.k0 == 3), "Export to cl_image only supported with k0 = 4, 8 or 16");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Cannot retrieve the cl_image pitch alignment");
-
- // Check the width and height of the output tensor.
- // Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension
- const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
- const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, "Width not supported for cl_image");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, "Height not supported for cl_image");
- }
-
- return Status{};
-}
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
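update_padding_for_cl_image() above rounds the Y stride up to the device pixel alignment times four floats per pixel. A worked instance of that arithmetic, assuming a pitch alignment of 64 pixels (illustrative; the real value comes from get_cl_image_pitch_alignment()):

    const unsigned int stride_y_in_elements = 1000;   // example tensor row stride, in elements
    const unsigned int row_pitch_alignment  = 64 * 4; // 64 pixels * 4 floats/pixel = 256 elements
    const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; // 1024
    const unsigned int padding        = round_up_width - stride_y_in_elements; // 24 elements of extra right padding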
diff --git a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h
deleted file mode 100644
index 3fce8c9173..0000000000
--- a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_HELPERS_H
-#define ARM_COMPUTE_CL_GEMM_HELPERS_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** Configure @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
- *
- * @param[in] m Number of rows (M) in the LHS matrix not reshaped
- * @param[in] n Number of columns (N) in the RHS matrix not reshaped
- * @param[in] m0 Number of rows processed by each thread/work-item
- * @param[in] n0 Number of columns processed by each thread/work-item
- * @param[in] k0 Number of inner accumulations performed by each thread/work-item
- * @param[in] v0 Number of vertical blocks of size (m0xk0) stored on the same output row
- * @param[in] h0 Number of horizontal blocks of size (k0xn0) stored on the same output row
- * @param[in] lhs_interleave True if the v0 (m0xk0) blocks have to be interleaved in the output row
- * @param[in] rhs_interleave True if the h0 (k0xn0) blocks have to be interleaved in the output row
- * @param[in] lhs_transpose True if the (m0xk0) block has to be transposed before being stored
- * @param[in] rhs_transpose True if the (k0xn0) block has to be transposed before being stored
- * @param[in] export_to_cl_image (Optional) True if the RHS reshaped matrix has to be exported to cl_image
- *
- * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
- */
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
- bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false);
-
-/** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
- *
- * This function accepts two GEMMLHSMatrixInfo/GEMMRHSMatrixInfo pairs, where only the first has cl_image2d support,
- * and selects the valid one by validating the GEMMRHSMatrixInfo. If the validation passes, the function returns
- * the first pair (the one with cl_image2d support); otherwise it falls back to the second.
- *
- * @param[in] info_img GEMMLHSMatrixInfo/GEMMRHSMatrixInfo with cl_image2d support
- * @param[in] info_buf GEMMLHSMatrixInfo/GEMMRHSMatrixInfo to fall-back if cl_image2d cannot be used
- * @param[in] n Number of columns (N) in the RHS matrix not reshaped
- * @param[in] k Number of rows (K) in the RHS matrix not reshaped
- * @param[in] b Batch size
- * @param[in] data_type Data type
- *
- * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
- */
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
- unsigned int n, unsigned int k, unsigned int b, DataType data_type);
-
-/** Update padding required to export the OpenCL buffer to OpenCL image2d
- *
- * @param[in,out] tensor ITensorInfo of the tensor required to be exported to OpenCL image2d
- */
-void update_padding_for_cl_image(ITensorInfo *tensor);
-
-/** Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix
- *
- * @param[in] tensor_reshaped_info TensorInfo for the RHS reshaped matrix
- * @param[in] rhs_info @ref GEMMRHSMatrixInfo
- *
- * @return Status reporting if we can use the image2d OpenCL object on the RHS reshaped matrix
- */
-Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info);
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_HELPERS_H */
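A sketch of how these helpers combine in practice; the GEMM sizes and block parameters are illustrative assumptions (n0 and k0 are kept at 4 so the cl_image path can validate):

    using namespace arm_compute;
    using namespace arm_compute::opencl::kernels::gemm;

    const unsigned int m = 128, n = 512, k = 256, b = 1;

    // Identical block sizes; only export_to_cl_image differs between the two candidates.
    const auto info_img = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, false, true, false, true, true);
    const auto info_buf = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, false, true, false, true, false);

    // Returns info_img if the reshaped RHS can be exported to a cl_image on this device,
    // otherwise falls back to the buffer variant.
    const auto lhs_rhs = select_lhs_rhs_info(info_img, info_buf, n, k, b, DataType::F32);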
diff --git a/src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h
deleted file mode 100644
index a49836cfda..0000000000
--- a/src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H
-#define ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H
-
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/common/Macros.h"
-
-#include <array>
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** Basic container for the OpenCL GEMM configuration functions */
-template <class T>
-class CLGEMMConfigArray
-{
-public:
- /** Alias for F32 index */
- static constexpr size_t DT_F32 = 0;
- /** Alias for F16 index */
- static constexpr size_t DT_F16 = 1;
- /** Alias for Int8 index */
- static constexpr size_t DT_INT8 = 2;
-
- /** Constructor
- *
- * @param[in] func_f32 Function to call for GEMM F32
- * @param[in] func_f16 Function to call for GEMM F16
- * @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
- *
- */
- CLGEMMConfigArray(T func_f32, T func_f16, T func_int8)
- : _configs{ func_f32, func_f16, func_int8 }
- {
- }
-
- /** Method to return the GEMM configuration function based on data type
- *
- * @param[in] data_type Input data type
- *
- * @return The configuration function for the given data type, or nullptr if the data type is not valid
- */
- T get_function(DataType data_type)
- {
- switch(data_type)
- {
- case DataType::F32:
- return _configs.at(DT_F32);
- case DataType::F16:
- return _configs.at(DT_F16);
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- case DataType::QSYMM8_PER_CHANNEL:
- return _configs.at(DT_INT8);
- default:
- return nullptr;
- }
- }
-
-private:
- std::array<T, 3> _configs;
-};
-
-/** Basic interface for the GEMM kernel configuration */
-class IClGemmKernelConfig
-{
-public:
- /** Constructor
- *
- * @param[in] arch GPU target
- */
- IClGemmKernelConfig(GPUTarget arch)
- : _target(arch)
- {
- }
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig);
- /** Virtual destructor */
- virtual ~IClGemmKernelConfig() = default;
- /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used
- *
- * @param[in] m Number of rows of the LHS matrix
- * @param[in] n Number of columns of the RHS matrix
- * @param[in] k Number of columns of the LHS matrix or number of rows of the RHS matrix
- * @param[in] b Batch size
- * @param[in] data_type Data type
- */
- virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0;
-
-protected:
- GPUTarget _target;
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H */
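A sketch of how a derived heuristic wires CLGEMMConfigArray to member-function pointers, mirroring the Bifrost class that follows (MyGemmConfig and its member functions are hypothetical, assumed to derive from IClGemmKernelConfig):

    // Dispatch on data type via pointers to member functions.
    using ConfigPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (MyGemmConfig::*)(unsigned int, unsigned int, unsigned int, unsigned int);

    CLGEMMConfigArray<ConfigPtr> configs(&MyGemmConfig::configure_f32,
                                         &MyGemmConfig::configure_f32, // F16 reuses the F32 heuristic
                                         &MyGemmConfig::configure_int8);

    ConfigPtr func = configs.get_function(DataType::F16); // resolves to configure_f32
    // (this->*func)(m, n, k, b) then yields the LHS/RHS matrix infos.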
diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
deleted file mode 100644
index 9d11006703..0000000000
--- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
-{
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k,
- unsigned int b);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClGemmDefaultConfigNativeBifrost::configure_G71_f32,
- &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic
- &ClGemmDefaultConfigNativeBifrost::configure_G71_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigNativeBifrost::configure_G76_f32,
- &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic
- &ClGemmDefaultConfigNativeBifrost::configure_G76_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigNativeBifrost::configure_default_f32,
- &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic
- &ClGemmDefaultConfigNativeBifrost::configure_default_u8);
-
- ConfigurationFunctionExecutorPtr func = nullptr;
-
- switch(_target)
- {
- case GPUTarget::G76:
- func = configs_G76.get_function(data_type);
- break;
- case GPUTarget::G71:
- func = configs_G71.get_function(data_type);
- break;
- default:
- func = configs_G7x.get_function(data_type);
- break;
- }
-
- ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
- return (this->*func)(m, n, k, b);
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- if(n < 2048)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
- }
- else if(n >= 2048 && n < 8192)
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 5, 4, 2, 1, 1, false, false, false, false);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(dot8_supported(CLKernelLibrary::get().get_device()))
- {
- if(m == 1)
- {
- if(n < 2048)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
- }
- else if(n >= 2048 && n < 16384)
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
- }
- }
- else
- {
- if(m < 64)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false);
- }
- }
- }
- else
- {
- if(m == 1)
- {
- if(n < 8192)
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false);
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- if(n > 4196)
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false);
- }
- else
- {
- if(k < 2048)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false);
- }
- else if(k >= 2048 && k < 16384)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 1, false, false, false, false);
- }
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 2, 1, 1, false, false, false, false);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- if(n < 2048)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
- }
- else if(n >= 2048 && n < 16384)
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
- }
- }
- else
- {
- if(m < 64)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false);
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false);
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false);
-}
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
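The configure() method in the file above shows the dispatch pattern shared by every heuristic class in this diff: one table of member-function pointers per GPU target, indexed by data type. Below is a minimal sketch of that pattern, assuming a three-slot F32/F16/quantized layout like the CLGEMMConfigArray used above; the class and member names here are illustrative, not the library's exact definitions.

#include "arm_compute/core/Types.h" // arm_compute::DataType

// Sketch of a per-data-type dispatch table: three slots (F32, F16,
// quantized), returning nullptr for anything else so the caller can
// raise "Data type not supported for GEMM".
template <typename FuncPtr>
class ConfigArraySketch
{
public:
    ConfigArraySketch(FuncPtr f32, FuncPtr f16, FuncPtr quantized)
        : _funcs{ f32, f16, quantized }
    {
    }

    FuncPtr get_function(arm_compute::DataType data_type) const
    {
        switch(data_type)
        {
            case arm_compute::DataType::F32:
                return _funcs[0];
            case arm_compute::DataType::F16:
                return _funcs[1];
            case arm_compute::DataType::QASYMM8: // quantized slot
                return _funcs[2];
            default:
                return nullptr;
        }
    }

private:
    FuncPtr _funcs[3];
};

The selected pointer is then invoked as (this->*func)(m, n, k, b), so each GPU family carries only the heuristics it actually implements.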
diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h
deleted file mode 100644
index 385b96e40e..0000000000
--- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H
-#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** Bifrost based OpenCL GEMMNative configuration */
-class ClGemmDefaultConfigNativeBifrost final : public IClGemmKernelConfig
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- ClGemmDefaultConfigNativeBifrost(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H */
diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp
deleted file mode 100644
index e3c129e3be..0000000000
--- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
-{
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k,
- unsigned int b);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr,
- nullptr,
- &ClGemmDefaultConfigNativeMidgard::default_q8);
-
- auto func = configs_default.get_function(data_type);
-    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
- return (this->*func)(m, n, k, b);
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- const unsigned int m0 = std::min(m, static_cast<unsigned int>(4));
- const unsigned int n0 = std::min(n, static_cast<unsigned int>(4));
-
- return configure_lhs_rhs_info(m, n, m0, n0, 2, 1, 1, false, false, false, false);
-}
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
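default_q8 in the file above clamps the preferred 4x4 blocking to the actual matrix extents, so a degenerate GEMM never gets a block larger than the matrix itself. A tiny worked illustration (the helper name is hypothetical):

#include <algorithm>

// Hypothetical helper mirroring the clamp in default_q8: the block size
// never exceeds the corresponding matrix dimension.
unsigned int clamp_block_size(unsigned int dim, unsigned int preferred)
{
    return std::min(dim, preferred);
}

// clamp_block_size(2, 4)   == 2  (m = 2 rows   -> m0 = 2)
// clamp_block_size(100, 4) == 4  (n = 100 cols -> n0 = 4)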
diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
deleted file mode 100644
index 0ff5471f7c..0000000000
--- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H
-#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** Midgard based OpenCL GEMMNative configuration */
-class ClGemmDefaultConfigNativeMidgard final : public IClGemmKernelConfig
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- ClGemmDefaultConfigNativeMidgard(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H */
diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp
deleted file mode 100644
index 92767aca52..0000000000
--- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
-{
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k,
- unsigned int b);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(&ClGemmDefaultConfigNativeValhall::configure_G77_f32,
- &ClGemmDefaultConfigNativeValhall::configure_G77_f16,
- &ClGemmDefaultConfigNativeValhall::configure_G77_u8);
-
- auto func = configs_default.get_function(data_type);
-    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
- return (this->*func)(m, n, k, b);
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- if(n < 2048)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
- }
- else if(n >= 2048 && n < 8192)
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 5, 4, 2, 1, 1, false, false, false, false);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- if(n < 2048)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
- }
- else if(n >= 2048 && n < 8192)
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 8, 2, 1, 1, false, false, false, false);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(dot8_supported(CLKernelLibrary::get().get_device()))
- {
- if(m == 1)
- {
- if(n < 2048)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
- }
- else if(n >= 2048 && n < 16384)
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
- }
- }
- else
- {
- if(m < 64)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false);
- }
- }
- }
- else
- {
- if(m == 1)
- {
- if(n < 8192)
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false);
- }
- }
-}
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
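configure_G77_u8 in the file above, like the Bifrost u8 heuristic earlier in this diff, forks on whether the device exposes the Arm 8-bit integer dot-product instructions and picks different blocking on each path. A hedged sketch of the capability check, reusing the dot8_supported helper that the file already pulls in through CLHelpers.h:

#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"

// Returns true when the current device supports 8-bit dot-product
// instructions; the u8 heuristics above choose their blocking from this.
// (Sketch only: the actual thresholds live in configure_G77_u8.)
bool prefer_dot8_blocking()
{
    return arm_compute::dot8_supported(arm_compute::CLKernelLibrary::get().get_device());
}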
diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
deleted file mode 100644
index 17e4c9d339..0000000000
--- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H
-#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** Valhall based OpenCL GEMMNative configuration */
-class ClGemmDefaultConfigNativeValhall final : public IClGemmKernelConfig
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- ClGemmDefaultConfigNativeValhall(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H */
diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
deleted file mode 100644
index ff6a0128af..0000000000
--- a/src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H
-#define ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h"
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h"
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** CLGEMMNative factory class */
-class ClGemmNativeKernelConfigurationFactory final
-{
-public:
-    /** Static method to construct the CLGEMMNative kernel configuration object according to the GPU target
- *
- * @param[in] gpu GPU target
- *
- * @return CLGEMMNative kernel configuration class
- */
- static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
- {
- switch(get_arch_from_target(gpu))
- {
- case GPUTarget::MIDGARD:
- return std::make_unique<ClGemmDefaultConfigNativeMidgard>(gpu);
- case GPUTarget::BIFROST:
- return std::make_unique<ClGemmDefaultConfigNativeBifrost>(gpu);
- case GPUTarget::VALHALL:
- return std::make_unique<ClGemmDefaultConfigNativeValhall>(gpu);
- default:
- ARM_COMPUTE_ERROR("Not supported GPU target");
- }
- }
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H */
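A call site would typically obtain a heuristic through this factory and then query the blocking for a concrete GEMM shape. A hedged usage sketch follows; the GEMM sizes are illustrative, and the target query assumes the CLScheduler singleton has been initialised.

#include "arm_compute/runtime/CL/CLScheduler.h"

#include <tuple>

// Illustrative call site: pick the heuristic class for the current GPU,
// then ask it for LHS/RHS blocking info for a 64x512x256 F32 GEMM.
void example_usage()
{
    using namespace arm_compute;
    using namespace arm_compute::opencl::kernels::gemm;

    const GPUTarget gpu = CLScheduler::get().target();
    auto config         = ClGemmNativeKernelConfigurationFactory::create(gpu);

    GEMMLHSMatrixInfo lhs_info;
    GEMMRHSMatrixInfo rhs_info;
    std::tie(lhs_info, rhs_info) = config->configure(64U, 512U, 256U, 1U, DataType::F32);
}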
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
deleted file mode 100644
index b030913a87..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
+++ /dev/null
@@ -1,356 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-using namespace arm_compute::misc::shape_calculator;
-
-ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
-{
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32,
- &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16,
- &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedBifrost::configure_G52_f32,
- &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16,
- &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedBifrost::configure_G76_f32,
- &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16,
- &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8);
-
- ConfigurationFunctionExecutorPtr func = nullptr;
-
- switch(_target)
- {
- case GPUTarget::G76:
- func = configs_G76.get_function(data_type);
- break;
- case GPUTarget::G52:
- func = configs_G52.get_function(data_type);
- break;
- default:
- func = configs_G7x.get_function(data_type);
- break;
- }
-
-    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
- return (this->*func)(m, n, k, b);
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(n <= 4)
- {
- return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, false, true, false, true);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(n <= 4)
- {
- return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(dot8_supported(CLKernelLibrary::get().get_device()))
- {
- if(n <= 4)
- {
- return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, true, false, false, true);
- }
- }
- else
- {
- if(n <= 4)
- {
- return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 6, 4, 4, 2, 2, true, true, false, true);
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
- const float r_nk = static_cast<float>(n) / static_cast<float>(k);
-
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- if(workload <= 274.4000f)
- {
- if(r_nk <= 0.7461f)
- {
- if(r_mn <= 21.1667f)
- {
- return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
- else
- {
- if(r_mk <= 17.3926f)
- {
- if(workload <= 542.4000f)
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
- else
- {
- if(r_nk <= 0.5463f)
- {
- if(workload <= 11767.6001f)
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
-
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
-
- if(workload <= 323.4000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 8, 4, 2, 2, true, true, true, false, false);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- // Get lhs_info/rhs_info in case of OpenCL buffer
- if(n <= 4)
- {
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
- }
- else
- {
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 2, 8, 16, false, false, false, true);
- }
-
- // Get lhs_info/rhs_info in case of OpenCL image
- // Condition on the GPU workload
- if((m / 4) * (n / 4) >= 2560)
- {
- // Big workload
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true);
- }
- else
- {
- // Small workload
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true);
- }
-
- const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
- const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
- const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32);
-
-    // For vector-by-matrix cases with few work-items, use the OpenCL buffer rather than the OpenCL image2d
-    const bool use_cl_image2d = (n > 4);
-
- if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
- {
- return std::make_pair(lhs_info_img, rhs_info_img);
- }
- else
- {
- return std::make_pair(lhs_info_buf, rhs_info_buf);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
-
- if(workload <= 1595.2000f)
- {
- if(r_mk <= 2.1044f)
- {
- if(workload <= 870.4000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false, false);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(n <= 4)
- {
- return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, false, true, false, true);
- }
-}
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
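The G52 and G76 branches in the file above are decision trees over a few derived features: a batched workload proxy and three shape ratios. The sketch below restates those features with the same /20.0f scaling used in the file; the struct and function names are illustrative.

// Features driving the G52/G76 decision trees above.
struct GemmFeatures
{
    float workload; // batched output-work proxy, (m * n * b) / 20
    float r_mn;     // output aspect ratio
    float r_mk;     // rows relative to reduction depth
    float r_nk;     // columns relative to reduction depth
};

GemmFeatures make_features(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
    return { (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f,
             static_cast<float>(m) / static_cast<float>(n),
             static_cast<float>(m) / static_cast<float>(k),
             static_cast<float>(n) / static_cast<float>(k) };
}

// e.g. m = 128, n = 128, k = 64, b = 1
//   -> workload = 819.2, r_mn = 1.0, r_mk = 2.0, r_nk = 2.0
// On G52 f32 this falls in the workload > 274.4 region, so the tree then
// compares buffer and image2d candidates via select_lhs_rhs_info.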
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
deleted file mode 100644
index 52e6ce3f48..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H
-#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** Bifrost based OpenCL GEMMReshaped configuration */
-class ClGemmDefaultConfigReshapedBifrost final : public IClGemmKernelConfig
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
deleted file mode 100644
index 57e42c92b3..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
+++ /dev/null
@@ -1,538 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
-{
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedValhall::configure_G77_f32,
- &ClGemmDefaultConfigReshapedValhall::configure_G77_f16,
- &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedValhall::configure_G78_f32,
- &ClGemmDefaultConfigReshapedValhall::configure_G78_f16,
- &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
-
- ConfigurationFunctionExecutorPtr func = nullptr;
-
- switch(_target)
- {
- case GPUTarget::G78:
- func = configs_G78.get_function(data_type);
- break;
- case GPUTarget::G77:
- default:
- func = configs_G77.get_function(data_type);
- break;
- }
-
-    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
- return (this->*func)(m, n, k, b);
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(n <= 4)
- {
- return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, 0, 1, 0, 1);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
- const float r_nk = static_cast<float>(n) / static_cast<float>(k);
-
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0);
-
- if(r_mk <= 0.11824845522642136)
- {
- if(workload <= 880.0)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0);
- }
- else
- {
- if(r_nk <= 0.42521367967128754)
- {
- if(workload <= 1726.4000244140625)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- }
- else
- {
- if(workload <= 1241.6000366210938)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0);
- }
- }
- }
- }
- else
- {
- if(workload <= 11404.7998046875)
- {
- if(r_mk <= 1.0126488208770752)
- {
- if(r_mn <= 2.545312523841858)
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0);
- }
- }
- else
- {
- if(workload <= 2881.199951171875)
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- }
- }
- else
- {
- if(r_nk <= 0.5765306055545807)
- {
- if(r_mn <= 6.010416746139526)
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
- const float r_nk = static_cast<float>(n) / static_cast<float>(k);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
-
- if(workload <= 1288.0000f)
- {
- if(workload <= 505.6000f)
- {
- if(r_mn <= 0.4466f)
- {
- if(r_nk <= 0.2384f)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0);
- }
- }
- else
- {
- if(r_mn <= 0.2250f)
- {
- if(r_mn <= 0.1599f)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- }
- else
- {
- if(r_mk <= 0.7609f)
- {
- if(r_mn <= 2.5453f)
- {
- if(workload <= 1089.6000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 2, 4, 0, 0, 1, 0, 1);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 16, 4, 4, 0, 0, 1, 0, 1);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
- }
- }
- }
- }
- else
- {
- if(workload <= 5434.4001f)
- {
- if(workload <= 1603.2000f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- else
- {
- if(r_nk <= 0.6192f)
- {
- if(r_mn <= 16.1016f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- else
- {
- if(workload <= 2750.0000f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- else
- {
- if(r_mk <= 6.3151f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- }
- }
- }
- else
- {
- if(r_mk <= 0.0387f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
- }
- else
- {
- if(r_mk <= 2.5859f)
- {
- if(r_mk <= 0.2734f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- }
- }
- }
- }
- else
- {
- if(r_mk <= 25.7500f)
- {
- if(r_mk <= 0.3615f)
- {
- if(r_mn <= 0.0913f)
- {
- if(r_mk <= 0.0683f)
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- }
- else
- {
- if(workload <= 11174.3999f)
- {
- if(r_mk <= 0.8047f)
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- else
- {
- if(workload <= 7185.5999f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1);
- }
- }
- }
- else
- {
- if(workload <= 17917.5000f)
- {
- if(r_mk <= 1.5078f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
- }
- }
- else
- {
- if(workload <= 34449.6016f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1);
- }
- }
- }
- }
- }
- else
- {
- if(r_mk <= 331.1111f)
- {
- if(workload <= 53397.5996f)
- {
- if(r_mn <= 57.8063f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
- }
- }
- else
- {
- if(r_nk <= 0.9211f)
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
- }
- }
- }
- else
- {
- if(workload <= 38070.4004f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- }
- }
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float r_nk = static_cast<float>(n) / static_cast<float>(k);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
-
- if(workload <= 801.6000f)
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
- }
- else
- {
- if(r_mn <= 0.1211f)
- {
- if(workload <= 3296.0000f)
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- else
- {
- if(r_nk <= 1.0625f)
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1);
- }
- }
- }
- else
- {
- if(workload <= 5068.8000f)
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
- }
- else
- {
- if(r_nk <= 0.2361f)
- {
- if(workload <= 12630.0000f)
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 1, 0, 0, 1, 0, 1);
- }
- }
- else
- {
- if(workload <= 178790.3984f)
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
- }
- }
- }
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(n <= 4)
- {
- return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, 0, 1, 0, 1);
- }
-}
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
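As a sanity check on how these trees are read, here is a worked trace through configure_G78_f16 above for an illustrative shape (the numbers are chosen for the example, not taken from the source):

// Re-derivation of the branch configure_G78_f16 takes for
// m = 64, n = 64, k = 64, b = 1:
float g78_f16_workload_example()
{
    const float workload = (64.0f * 64.0f * 1.0f) / 20.0f; // 204.8f
    // 204.8f <= 801.6f, so the outermost branch fires and the method
    // returns configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1)
    // without ever comparing buffer vs. image2d candidates.
    return workload;
}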
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h
deleted file mode 100644
index 588cd64e0e..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H
-#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** Valhall based OpenCL GEMMReshaped configuration */
-class ClGemmDefaultConfigReshapedValhall final : public IClGemmKernelConfig
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- ClGemmDefaultConfigReshapedValhall(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
deleted file mode 100644
index c990c89a91..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H
-#define ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** CLGEMMReshaped factory class */
-class ClGemmReshapedKernelConfigurationFactory final
-{
-public:
-    /** Static method to call the CLGEMMReshaped kernel configuration class according to the GPU target
- *
- * @param[in] gpu GPU target
- *
- * @return CLGEMMReshaped kernel configuration class
- */
- static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
- {
- switch(get_arch_from_target(gpu))
- {
- case GPUTarget::MIDGARD:
- case GPUTarget::BIFROST:
- return std::make_unique<ClGemmDefaultConfigReshapedBifrost>(gpu);
- case GPUTarget::VALHALL:
- return std::make_unique<ClGemmDefaultConfigReshapedValhall>(gpu);
- default:
-                ARM_COMPUTE_ERROR("Unsupported GPU target");
- }
- }
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H */
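The factory removed above was the entry point for obtaining a reshaped-GEMM heuristic: create() dispatches on get_arch_from_target(), with Midgard and Bifrost sharing the Bifrost heuristics, Valhall getting its own, and any other architecture raising an error. A minimal usage sketch, assuming the deleted headers were still on the include path; the function name, shape values and data type are illustrative only:

#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h"

#include <memory>
#include <tuple>

using namespace arm_compute;
using namespace arm_compute::opencl::kernels::gemm;

void reshaped_factory_example(GPUTarget gpu)
{
    // Midgard/Bifrost -> Bifrost heuristics, Valhall -> Valhall heuristics.
    std::unique_ptr<IClGemmKernelConfig> config = ClGemmReshapedKernelConfigurationFactory::create(gpu);

    GEMMLHSMatrixInfo lhs_info;
    GEMMRHSMatrixInfo rhs_info;
    std::tie(lhs_info, rhs_info) = config->configure(256, 256, 128, 1, DataType::F32);
}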
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
deleted file mode 100644
index 7ed6b39f3e..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
+++ /dev/null
@@ -1,518 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-using namespace arm_compute::misc::shape_calculator;
-
-ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
-{
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k,
- unsigned int b);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
-
- ConfigurationFunctionExecutorPtr func = nullptr;
-
- switch(_target)
- {
- case GPUTarget::G76:
- func = configs_G76.get_function(data_type);
- break;
- case GPUTarget::G51:
- func = configs_G51.get_function(data_type);
- break;
- case GPUTarget::G52:
- func = configs_G52.get_function(data_type);
- break;
- default:
- func = configs_G7x.get_function(data_type);
- break;
- }
-
-    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
- return (this->*func)(m, n, k, b);
-}
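// configure() above dispatches through pointers to member functions stored in
// CLGEMMConfigArray (defined in ClGemmHelpers.h, not shown in this diff).
// Below is a minimal, self-contained sketch of the same pointer-to-member
// pattern; the names Heuristic and pick and the size threshold are hypothetical.
#include <iostream>

class Heuristic
{
public:
    using Fn = int (Heuristic::*)(int m, int n);

    int configure_small(int m, int n) { return m + n; } // stand-in heuristic
    int configure_large(int m, int n) { return m * n; } // stand-in heuristic

    int pick(int m, int n)
    {
        // Select the member function first, then invoke it through 'this',
        // mirroring the (this->*func)(m, n, k, b) call above.
        Fn func = (m * n < 1024) ? &Heuristic::configure_small : &Heuristic::configure_large;
        return (this->*func)(m, n);
    }
};

int main()
{
    Heuristic h;
    std::cout << h.pick(16, 16) << '\n'; // 16 * 16 = 256 < 1024, so configure_small prints 32
    return 0;
}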
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- if(n <= 2548)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 8, false, true, false, true, false);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- const bool is_workload_big = ((m * n * b) / 16) >= 2048;
-
- if(m == 1)
- {
- if(n >= 8192)
- {
- const unsigned int h0 = std::max(n / 4, 1U);
- return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false);
- }
- else
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- if(n <= 204)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true, false);
- }
- }
- }
- else
- {
- const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
- if(is_workload_big)
- {
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true);
- }
- else
- {
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true);
- }
- }
-
-    // Get lhs_info/rhs_info for the OpenCL image case
- const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
- if(is_workload_big)
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true);
- }
-
- const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
- const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
- const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32);
-
-    // For vector-by-matrix (m == 1) or small workloads, use the OpenCL buffer rather than the OpenCL image2d
-    const bool use_cl_image2d = !((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128));
-
- if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
- {
- return std::make_pair(lhs_info_img, rhs_info_img);
- }
- else
- {
- return std::make_pair(lhs_info_buf, rhs_info_buf);
- }
-}
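// A worked check of the buffer-vs-image2d predicate in configure_G76_f32
// above; the shape values are illustrative and the additional
// validate_image2d_support_on_rhs() gate is not modelled here.
#include <cstdio>

int main()
{
    const unsigned int m = 64, n = 256, b = 1;
    const bool small_workload = ((m * n * b) / 16) < 2048;                  // 16384 / 16 = 1024, so true
    const bool use_cl_image2d = !((m == 1) || (small_workload && n < 128)); // n = 256 >= 128, so true
    std::printf("use_cl_image2d = %d\n", use_cl_image2d);                   // prints 1
    return 0;
}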
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- const float r_nk = static_cast<float>(n) / static_cast<float>(k);
-
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- if(m == 1)
- {
- if(r_nk <= 0.4664f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
- else
- {
- if(workload <= 274.4000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- const unsigned int n0 = n < 1280 ? 2 : 4;
- const unsigned int h0 = std::max(n / n0, 1U);
- return configure_lhs_rhs_info(m, n, 1, n0, 4, 1, h0, false, true, false, true);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- if(n > 2048)
- {
- const unsigned int h0 = std::max(n / 4, 1U);
- return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true);
- }
- else
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
- const float r_nk = static_cast<float>(n) / static_cast<float>(k);
-
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- if(m == 1)
- {
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false);
-
- if(r_mk <= 0.0026f)
- {
- if(r_nk <= 0.4664f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- }
- else
- {
- if(r_mk <= 0.0148f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- }
- }
- else
- {
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false);
-
- if(workload <= 362.6000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false);
- }
- else
- {
- if(r_mn <= 22.6067f)
- {
- if(workload <= 708.8000f)
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 16, false, false, false, false, false);
- }
- }
- else
- {
- if(r_nk <= 0.0917f)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- }
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- if(m == 1)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
- }
- else
- {
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
-
- if(workload <= 7449.60f)
- {
- if(workload <= 691.60f)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false);
- }
- else
- {
- if(workload <= 4155.20f)
- {
- return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 32, false, false, false, false, false);
- }
- }
- }
- else
- {
- if(workload <= 16300.80f)
- {
- if(r_mn <= 44.56f)
- {
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
- }
- }
- else
- {
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- const unsigned int n0 = n < 1280 ? 2 : 4;
- const unsigned int h0 = std::max(n / n0, 1U);
- return configure_lhs_rhs_info(m, n, 1, n0, 8, 1, h0, false, true, false, true);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(dot8_supported(CLKernelLibrary::get().get_device()))
- {
- if(m == 1)
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
- }
- else
- {
- const unsigned int h0 = std::max(n / 4, 1U);
- return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, false, true, false, true);
- }
- }
- else
- {
- const int h0 = std::max(std::min(static_cast<int>(n / 2), static_cast<int>(128)), static_cast<int>(1));
- if(m == 1)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true);
- }
- }
-}
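// The non-dot8 path in configure_G7x_u8 above clamps h0 = n / 2 to the
// inclusive range [1, 128]. A standalone check with illustrative values:
#include <algorithm>
#include <cstdio>

int main()
{
    for(int n : {2, 100, 1000})
    {
        const int h0 = std::max(std::min(n / 2, 128), 1);
        std::printf("n = %4d -> h0 = %3d\n", n, h0); // prints 1, 50, 128
    }
    return 0;
}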
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, 2, false, true, false, true);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true);
- }
- else
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true);
- }
-}
-
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
deleted file mode 100644
index 7b1a1fb04d..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H
-#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** Bifrost-based OpenCL GEMMReshapedOnlyRHS configuration */
-class ClGemmDefaultConfigReshapedRhsOnlyBifrost final : public IClGemmKernelConfig
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
deleted file mode 100644
index 4c6e633896..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
+++ /dev/null
@@ -1,570 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-using namespace arm_compute::misc::shape_calculator;
-
-ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
-{
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k,
- unsigned int b);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
-
- ConfigurationFunctionExecutorPtr func = nullptr;
-
- switch(_target)
- {
- case GPUTarget::G78:
- func = configs_G78.get_function(data_type);
- break;
- case GPUTarget::G77:
- default:
- func = configs_G77.get_function(data_type);
- break;
- }
-
-    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
- return (this->*func)(m, n, k, b);
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- if(m == 1)
- {
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
-
- if(r_mk <= 0.0064484127797186375)
- {
- if(r_mn <= 0.0028273810748942196)
- {
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- const unsigned int h0 = std::max(n / 4, 1U);
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 8, 0, 1, 0, 0, 0);
- }
- }
- else
- {
- if(r_mk <= 0.020312500186264515)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, 0, 1, 0, 1, 0);
- }
- }
- }
- else
- {
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
-
- if(workload <= 1999.2000122070312)
- {
- if(workload <= 747.1999816894531)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
- }
- else
- {
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
- else
- {
- if(r_mn <= 0.03348214365541935)
- {
- if(r_mk <= 0.028125000186264515)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
- }
- else
- {
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
- else
- {
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, 0, 1, 0, 0, 1);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
- }
-}
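// Tracing the m == 1 branch of configure_G77_f32 above for an illustrative
// shape n = k = 4096, b = 1: r_mk = r_mn = 1 / 4096 ~= 0.000244, which is
// below both thresholds, so the image2d and buffer pairs are both built and
// select_lhs_rhs_info() makes the final choice.
#include <cstdio>

int main()
{
    const float m = 1.0f, n = 4096.0f, k = 4096.0f;
    std::printf("r_mk = %f, r_mn = %f\n", m / k, m / n); // both ~= 0.000244
    return 0;
}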
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- const unsigned int h0 = std::max(n / 2, 1U);
-        if(n <= 836)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, 0, 1, 0, 1, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, 0, 1, 0, 1, 0);
- }
- }
- else if(m < 128)
- {
- const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
- if(k >= 512)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, 0, 1, 0, 0);
- }
- }
- else
- {
- const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
- if(n >= 64)
- {
- return configure_lhs_rhs_info(m, n, 4, 8, 4, 1, h0, 0, 1, 0, 0);
- }
- else
- {
- if(k >= 512)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, 0, 1, 0, 0);
- }
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1);
- }
- else
- {
- const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
- if(m >= 28)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 1);
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
- const float r_nk = static_cast<float>(n) / static_cast<float>(k);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
-
- if(m == 1)
- {
- if(workload <= 278.7000f)
- {
- if(workload <= 7.5000f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
- }
- else
- {
- if(r_mn <= 0.0031f)
- {
- if(workload <= 256.6000f)
- {
- if(workload <= 16.7500f)
- {
- if(r_nk <= 1.6671f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
- }
- }
- else
- {
- if(r_mk <= 0.0027f)
- {
- if(r_mk <= 0.0014f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
- }
- else
- {
- if(workload <= 8.9500f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
- }
- }
- }
- else
- {
- if(workload <= 14.1500f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
- }
- else
- {
- if(r_mk <= 0.0041f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
- }
- }
- }
- }
- }
- }
- else
- {
- if(workload <= 363.7000f)
- {
- if(r_mk <= 0.0031f)
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 32, 0, 1, 0, 1, 0);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0);
- }
- }
- }
- else
- {
- if(workload <= 1384.8000f)
- {
- if(workload <= 704.0000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1);
- }
- }
- else
- {
- if(workload <= 16761.6006f)
- {
- if(r_mn <= 187.1250f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1);
- }
- }
- else
- {
- if(r_mk <= 432.4630f)
- {
- return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 16, 0, 1, 0, 1, 1);
- }
- }
- }
- }
-}
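// The G78 F32 decision tree above keys on a scaled workload and on shape
// ratios. Tracing it for an illustrative shape m = n = k = 128, b = 1:
#include <cstdio>

int main()
{
    const float m = 128.0f, n = 128.0f, b = 1.0f;
    const float workload = (m * n * b) / 20.0f; // 819.2
    // m != 1, and 704.0 < workload <= 1384.8, so the tree above returns
    // configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1).
    std::printf("workload = %.1f\n", workload);
    return 0;
}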
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
- const float r_nk = static_cast<float>(n) / static_cast<float>(k);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
-
- if(m == 1)
- {
- if(r_mn <= 0.0038f)
- {
- if(workload <= 353.9000f)
- {
- if(workload <= 278.7000f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
- }
- else
- {
- if(r_mk <= 0.0004f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
- }
- else
- {
- if(r_mk <= 0.0030f)
- {
- return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 8, 0, 1, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
- }
- }
- }
- }
- else
- {
- if(r_nk <= 1.9384f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 8, 0, 1, 1, 0, 1);
- }
- }
- }
- else
- {
- if(r_nk <= 1.0368f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, 0, 0, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
- }
- }
- }
- else
- {
- if(workload <= 1422.4000f)
- {
- if(workload <= 704.0000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0);
- }
- else
- {
- if(workload <= 1197.6000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1);
- }
- else
- {
- if(workload <= 1241.6000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1);
- }
- }
- }
- }
- else
- {
- if(workload <= 2769.6000f)
- {
- if(workload <= 1846.4000f)
- {
- if(r_mn <= 2.4927f)
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
- }
- }
- else
- {
- if(r_mn <= 0.6261f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
- }
- else
- {
- if(r_mk <= 3.4453f)
- {
- if(r_mn <= 1.4135f)
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
- }
- }
- }
- }
- else
- {
- if(r_nk <= 0.0302f)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1);
- }
- else
- {
- if(r_mk <= 181.3750f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
- }
- else
- {
- if(workload <= 28035.2002f)
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
- }
- else
- {
- if(r_mk <= 808.6667f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
- }
- }
- }
- }
- }
- }
- }
-}
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
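Several heuristics above build both a buffer candidate and an image2d candidate and pass them to select_lhs_rhs_info(), whose definition lives in ClGemmHelpers.h and is not part of this diff. The sketch below shows only the presumed selection rule (prefer the image2d pair when the reshaped RHS tensor can legally be exported to a CL image, otherwise fall back to the buffer pair); the helper name and the boolean parameter are stand-ins, not the library's real API:

#include <utility>

// Stand-in selector: 'image2d_supported' abstracts the real validation
// (tensor shape, row pitch alignment, device capability) done by the library.
template <typename LhsInfo, typename RhsInfo>
std::pair<LhsInfo, RhsInfo> select_pair(const std::pair<LhsInfo, RhsInfo> &info_img,
                                        const std::pair<LhsInfo, RhsInfo> &info_buf,
                                        bool image2d_supported)
{
    return image2d_supported ? info_img : info_buf;
}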
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
deleted file mode 100644
index 6a11ddb748..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H
-#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** Valhall-based OpenCL GEMMReshapedOnlyRHS configuration */
-class ClGemmDefaultConfigReshapedRhsOnlyValhall final : public IClGemmKernelConfig
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultReshapedRhsOnlyBifrost.cpp b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultReshapedRhsOnlyBifrost.cpp
deleted file mode 100644
index 7ed6b39f3e..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultReshapedRhsOnlyBifrost.cpp
+++ /dev/null
@@ -1,518 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-using namespace arm_compute::misc::shape_calculator;
-
-ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
-{
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k,
- unsigned int b);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
-
- ConfigurationFunctionExecutorPtr func = nullptr;
-
- switch(_target)
- {
- case GPUTarget::G76:
- func = configs_G76.get_function(data_type);
- break;
- case GPUTarget::G51:
- func = configs_G51.get_function(data_type);
- break;
- case GPUTarget::G52:
- func = configs_G52.get_function(data_type);
- break;
- default:
- func = configs_G7x.get_function(data_type);
- break;
- }
-
-    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
- return (this->*func)(m, n, k, b);
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- if(n <= 2548)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 8, false, true, false, true, false);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- const bool is_workload_big = ((m * n * b) / 16) >= 2048;
-
- if(m == 1)
- {
- if(n >= 8192)
- {
- const unsigned int h0 = std::max(n / 4, 1U);
- return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false);
- }
- else
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- if(n <= 204)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true, false);
- }
- }
- }
- else
- {
- const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
- if(is_workload_big)
- {
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true);
- }
- else
- {
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true);
- }
- }
-
-    // Get lhs_info/rhs_info for the OpenCL image case
- const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
- if(is_workload_big)
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true);
- }
-
- const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
- const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
- const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32);
-
-    // For vector-by-matrix (m == 1) or small workloads, use the OpenCL buffer rather than the OpenCL image2d
-    const bool use_cl_image2d = !((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128));
-
- if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
- {
- return std::make_pair(lhs_info_img, rhs_info_img);
- }
- else
- {
- return std::make_pair(lhs_info_buf, rhs_info_buf);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- const float r_nk = static_cast<float>(n) / static_cast<float>(k);
-
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- if(m == 1)
- {
- if(r_nk <= 0.4664f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
- else
- {
- if(workload <= 274.4000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- const unsigned int n0 = n < 1280 ? 2 : 4;
- const unsigned int h0 = std::max(n / n0, 1U);
- return configure_lhs_rhs_info(m, n, 1, n0, 4, 1, h0, false, true, false, true);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- if(n > 2048)
- {
- const unsigned int h0 = std::max(n / 4, 1U);
- return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true);
- }
- else
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
- const float r_nk = static_cast<float>(n) / static_cast<float>(k);
-
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- if(m == 1)
- {
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false);
-
- if(r_mk <= 0.0026f)
- {
- if(r_nk <= 0.4664f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- }
- else
- {
- if(r_mk <= 0.0148f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- }
- }
- else
- {
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false);
-
- if(workload <= 362.6000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false);
- }
- else
- {
- if(r_mn <= 22.6067f)
- {
- if(workload <= 708.8000f)
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 16, false, false, false, false, false);
- }
- }
- else
- {
- if(r_nk <= 0.0917f)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false);
- }
- else
- {
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- }
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- if(m == 1)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
- }
- else
- {
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
-
- if(workload <= 7449.60f)
- {
- if(workload <= 691.60f)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false);
- }
- else
- {
- if(workload <= 4155.20f)
- {
- return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 32, false, false, false, false, false);
- }
- }
- }
- else
- {
- if(workload <= 16300.80f)
- {
- if(r_mn <= 44.56f)
- {
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
- }
- }
- else
- {
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
- }
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- const unsigned int n0 = n < 1280 ? 2 : 4;
- const unsigned int h0 = std::max(n / n0, 1U);
- return configure_lhs_rhs_info(m, n, 1, n0, 8, 1, h0, false, true, false, true);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(dot8_supported(CLKernelLibrary::get().get_device()))
- {
- if(m == 1)
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
- }
- else
- {
- const unsigned int h0 = std::max(n / 4, 1U);
- return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, false, true, false, true);
- }
- }
- else
- {
- const int h0 = std::max(std::min(static_cast<int>(n / 2), static_cast<int>(128)), static_cast<int>(1));
- if(m == 1)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true);
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, 2, false, true, false, true);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true);
- }
- else
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true);
- }
-}
-
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
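
Editor's note: the Bifrost heuristics deleted above are hand-trained decision trees. Each node thresholds a derived feature (the workload `m*n*b/20` or an aspect ratio such as `m/n`) and each leaf fixes the reshaped-RHS tile parameters. A minimal self-contained sketch of that pattern, with thresholds borrowed from `configure_G76_f16` above and a hypothetical `TileConfig` type (this is not the ACL API):

```cpp
struct TileConfig
{
    unsigned int m0; // rows per work-item
    unsigned int n0; // columns per work-item
    unsigned int k0; // accumulation depth per iteration
    unsigned int h0; // horizontal blocks interleaved in the reshaped RHS
};

TileConfig pick_tile_config(unsigned int m, unsigned int n, unsigned int b)
{
    if (m == 1)
    {
        return {1, 2, 16, 32}; // GEMV-like shape: widen along n
    }
    // Same derived feature as the deleted code: output elements scaled by 1/20.
    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
    if (workload <= 691.60f)
    {
        return {2, 2, 8, 8}; // small problems: small tiles limit wasted lanes
    }
    return (workload <= 4155.20f) ? TileConfig{5, 2, 8, 16} : TileConfig{5, 8, 2, 32};
}
```
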
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultReshapedRhsOnlyValhall.cpp b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultReshapedRhsOnlyValhall.cpp
deleted file mode 100644
index 4c6e633896..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultReshapedRhsOnlyValhall.cpp
+++ /dev/null
@@ -1,570 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-using namespace arm_compute::misc::shape_calculator;
-
-ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
-{
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k,
- unsigned int b);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
-
- ConfigurationFunctionExecutorPtr func = nullptr;
-
- switch(_target)
- {
- case GPUTarget::G78:
- func = configs_G78.get_function(data_type);
- break;
- case GPUTarget::G77:
- default:
- func = configs_G77.get_function(data_type);
- break;
- }
-
- ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
- return (this->*func)(m, n, k, b);
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- if(m == 1)
- {
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
-
- if(r_mk <= 0.0064484127797186375)
- {
- if(r_mn <= 0.0028273810748942196)
- {
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
-
- const unsigned int h0 = std::max(n / 4, 1U);
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 8, 0, 1, 0, 0, 0);
- }
- }
- else
- {
- if(r_mk <= 0.020312500186264515)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, 0, 1, 0, 1, 0);
- }
- }
- }
- else
- {
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
-
- if(workload <= 1999.2000122070312)
- {
- if(workload <= 747.1999816894531)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
- }
- else
- {
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
- else
- {
- if(r_mn <= 0.03348214365541935)
- {
- if(r_mk <= 0.028125000186264515)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
- }
- else
- {
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
- else
- {
- GEMMLHSMatrixInfo lhs_info_buf;
- GEMMRHSMatrixInfo rhs_info_buf;
- GEMMLHSMatrixInfo lhs_info_img;
- GEMMRHSMatrixInfo rhs_info_img;
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, 0, 1, 0, 0, 1);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0);
-
- return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
- }
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- if(n <= 836.0)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, 0, 1, 0, 1, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, 0, 1, 0, 1, 0);
- }
- }
- else if(m < 128)
- {
- const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
- if(k >= 512)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, 0, 1, 0, 0);
- }
- }
- else
- {
- const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
- if(n >= 64)
- {
- return configure_lhs_rhs_info(m, n, 4, 8, 4, 1, h0, 0, 1, 0, 0);
- }
- else
- {
- if(k >= 512)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, 0, 1, 0, 0);
- }
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(m == 1)
- {
- const unsigned int h0 = std::max(n / 2, 1U);
- return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1);
- }
- else
- {
- const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
- if(m >= 28)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 1);
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
- const float r_nk = static_cast<float>(n) / static_cast<float>(k);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
-
- if(m == 1)
- {
- if(workload <= 278.7000f)
- {
- if(workload <= 7.5000f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
- }
- else
- {
- if(r_mn <= 0.0031f)
- {
- if(workload <= 256.6000f)
- {
- if(workload <= 16.7500f)
- {
- if(r_nk <= 1.6671f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
- }
- }
- else
- {
- if(r_mk <= 0.0027f)
- {
- if(r_mk <= 0.0014f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
- }
- else
- {
- if(workload <= 8.9500f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
- }
- }
- }
- else
- {
- if(workload <= 14.1500f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
- }
- else
- {
- if(r_mk <= 0.0041f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
- }
- }
- }
- }
- }
- }
- else
- {
- if(workload <= 363.7000f)
- {
- if(r_mk <= 0.0031f)
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 32, 0, 1, 0, 1, 0);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0);
- }
- }
- }
- else
- {
- if(workload <= 1384.8000f)
- {
- if(workload <= 704.0000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1);
- }
- }
- else
- {
- if(workload <= 16761.6006f)
- {
- if(r_mn <= 187.1250f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1);
- }
- }
- else
- {
- if(r_mk <= 432.4630f)
- {
- return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 16, 0, 1, 0, 1, 1);
- }
- }
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
- const float r_nk = static_cast<float>(n) / static_cast<float>(k);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
-
- if(m == 1)
- {
- if(r_mn <= 0.0038f)
- {
- if(workload <= 353.9000f)
- {
- if(workload <= 278.7000f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
- }
- else
- {
- if(r_mk <= 0.0004f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
- }
- else
- {
- if(r_mk <= 0.0030f)
- {
- return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 8, 0, 1, 1, 0, 1);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
- }
- }
- }
- }
- else
- {
- if(r_nk <= 1.9384f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 8, 0, 1, 1, 0, 1);
- }
- }
- }
- else
- {
- if(r_nk <= 1.0368f)
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, 0, 0, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
- }
- }
- }
- else
- {
- if(workload <= 1422.4000f)
- {
- if(workload <= 704.0000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0);
- }
- else
- {
- if(workload <= 1197.6000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1);
- }
- else
- {
- if(workload <= 1241.6000f)
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1);
- }
- }
- }
- }
- else
- {
- if(workload <= 2769.6000f)
- {
- if(workload <= 1846.4000f)
- {
- if(r_mn <= 2.4927f)
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
- }
- }
- else
- {
- if(r_mn <= 0.6261f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
- }
- else
- {
- if(r_mk <= 3.4453f)
- {
- if(r_mn <= 1.4135f)
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
- }
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
- }
- }
- }
- }
- else
- {
- if(r_nk <= 0.0302f)
- {
- return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1);
- }
- else
- {
- if(r_mk <= 181.3750f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
- }
- else
- {
- if(workload <= 28035.2002f)
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
- }
- else
- {
- if(r_mk <= 808.6667f)
- {
- return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
- }
- else
- {
- return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
- }
- }
- }
- }
- }
- }
- }
-}
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
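
Editor's note: several branches above compute two candidate configurations and defer to `select_lhs_rhs_info` to pick between them. A plausible reading of that pattern (an assumption for illustration, not the verified ACL implementation) is a guarded preference for the OpenCL-image path:

```cpp
// Sketch: prefer the cl_image-backed RHS configuration when the device and
// tensor shape allow the RHS to be exported to a cl_image, otherwise keep the
// plain buffer configuration. `image2d_export_ok` stands in for whatever
// validation ACL performs (pitch alignment, extent limits, device support).
template <typename LhsRhsPair>
LhsRhsPair select_info(const LhsRhsPair &info_img, const LhsRhsPair &info_buf, bool image2d_export_ok)
{
    return image2d_export_ok ? info_img : info_buf;
}
```
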
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
deleted file mode 100644
index 8fd71276a0..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H
-#define ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** CLGEMMReshapedOnlyRHS factory class */
-class ClGemmReshapedOnlyRhsKernelConfigurationFactory final
-{
-public:
- /** Static method to call the CLGEMMReshapedOnlyRHS kernel configuration class accordingly with the GPU target
- *
- * @param[in] gpu GPU target
- *
- * @return CLGEMMReshapedOnlyRHS kernel configuration class
- */
- static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
- {
- switch(get_arch_from_target(gpu))
- {
- case GPUTarget::MIDGARD:
- case GPUTarget::BIFROST:
- return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyBifrost>(gpu);
- case GPUTarget::VALHALL:
- return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyValhall>(gpu);
- default:
- ARM_COMPUTE_ERROR("Not supported GPU target");
- }
- }
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H */
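
Editor's note: a hedged usage sketch of the factory deleted above. The `create()` and `configure()` signatures are taken from this file and the Valhall source earlier in the diff; the call site and includes are assumed:

```cpp
#include <utility>

using namespace arm_compute;
using namespace arm_compute::opencl::kernels::gemm;

std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
pick_gemm_config(GPUTarget gpu, unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
    // create() dispatches on architecture: Midgard/Bifrost share one heuristic,
    // Valhall has its own.
    auto config = ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(gpu);
    return config->configure(m, n, k, b, DataType::F32);
}
```
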
diff --git a/src/core/helpers/AutoConfiguration.h b/src/core/helpers/AutoConfiguration.h
index 6880a6cb66..9df2a76983 100644
--- a/src/core/helpers/AutoConfiguration.h
+++ b/src/core/helpers/AutoConfiguration.h
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020 Arm Limited.
+* Copyright (c) 2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
namespace arm_compute
{
@@ -41,10 +42,11 @@ namespace arm_compute
*/
inline bool auto_init_if_empty(ITensorInfo &info,
const TensorShape &shape,
- int num_channels, DataType data_type,
- QuantizationInfo quantization_info = QuantizationInfo())
+ int num_channels,
+ DataType data_type,
+ QuantizationInfo quantization_info = QuantizationInfo())
{
- if(info.tensor_shape().total_size() == 0)
+ if (info.tensor_shape().total_size() == 0)
{
info.set_data_type(data_type);
info.set_num_channels(num_channels);
@@ -57,21 +59,26 @@ inline bool auto_init_if_empty(ITensorInfo &info,
}
/** Auto initialize the tensor info using another tensor info.
-*
-* @param info_sink Tensor info used to check and assign
-* @param info_source Tensor info used to assign
-*
-* @return True if the tensor info has been initialized
-*/
+ *
+ * (COMPMID-6012) This method should remain in sync with the fields of ITensorInfo that have setters.
+ *
+ * @param info_sink Tensor info used to check and assign
+ * @param info_source Tensor info used to assign
+ *
+ * @return True if the tensor info has been initialized
+ */
inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source)
{
- if(info_sink.tensor_shape().total_size() == 0)
+ if (info_sink.tensor_shape().total_size() == 0)
{
info_sink.set_data_type(info_source.data_type());
info_sink.set_num_channels(info_source.num_channels());
info_sink.set_tensor_shape(info_source.tensor_shape());
info_sink.set_quantization_info(info_source.quantization_info());
info_sink.set_data_layout(info_source.data_layout());
+ info_sink.set_are_values_constant(info_source.are_values_constant());
return true;
}
@@ -87,7 +94,7 @@ inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_s
*/
inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
{
- if(info.tensor_shape().total_size() == 0)
+ if (info.tensor_shape().total_size() == 0)
{
info.set_tensor_shape(shape);
return true;
@@ -106,7 +113,7 @@ inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
*/
inline bool set_format_if_unknown(ITensorInfo &info, Format format)
{
- if(info.data_type() == DataType::UNKNOWN)
+ if (info.data_type() == DataType::UNKNOWN)
{
info.set_format(format);
return true;
@@ -125,7 +132,7 @@ inline bool set_format_if_unknown(ITensorInfo &info, Format format)
*/
inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
{
- if(info.data_type() == DataType::UNKNOWN)
+ if (info.data_type() == DataType::UNKNOWN)
{
info.set_data_type(data_type);
return true;
@@ -144,7 +151,7 @@ inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
*/
inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout)
{
- if(info.data_layout() == DataLayout::UNKNOWN)
+ if (info.data_layout() == DataLayout::UNKNOWN)
{
info.set_data_layout(data_layout);
return true;
@@ -163,7 +170,7 @@ inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout
*/
inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info)
{
- if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
+ if (info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
{
info.set_quantization_info(quantization_info);
return true;
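
Editor's note: a minimal call-site sketch of the two-tensor overload patched above (the surrounding configure step is assumed). Passing an empty destination lets the helper clone the source metadata, now including the constant-values flag:

```cpp
#include "arm_compute/core/TensorInfo.h"
#include "src/core/helpers/AutoConfiguration.h"

void configure_sketch()
{
    using namespace arm_compute;
    const TensorInfo src(TensorShape(16U, 8U), 1, DataType::F32);
    TensorInfo dst{}; // empty: tensor_shape().total_size() == 0
    const bool initialised = auto_init_if_empty(dst, src); // true
    // dst now mirrors src's data type, channels, shape, quantization info,
    // data layout and (after this patch) are_values_constant().
    (void)initialised;
}
```
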
diff --git a/src/core/helpers/LUTManager.cpp b/src/core/helpers/LUTManager.cpp
new file mode 100644
index 0000000000..06e35eed8c
--- /dev/null
+++ b/src/core/helpers/LUTManager.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/helpers/LUTManager.h"
+
+namespace arm_compute
+{
+#ifdef __aarch64__
+namespace
+{
+
+void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut)
+{
+ union Element
+ {
+ uint16_t i = 0;
+ float16_t fp;
+ } item;
+ // Fill lut by iterating over all 16 bit values using the union.
+ while (true)
+ {
+ (*lut)[item.i] = 1.f / (1.f + std::exp(-item.fp));
+ if (item.i == 65535)
+ break;
+ item.i++;
+ }
+}
+} // namespace
+
+std::shared_ptr<ActivationLayerInfo::LookupTable65536> LUTManager::get_lut_table(LUTInfo info)
+{
+ const auto itr = map_fp16.find(info);
+ auto s_ptr = (itr != map_fp16.end()) ? itr->second.lock() : nullptr; // nullptr if invalid or not found.
+ if (s_ptr != nullptr)
+ {
+ // Found and valid
+ return s_ptr; // Return the shared_ptr obtained by locking the cached weak_ptr
+ }
+ else
+ {
+ // Not found, or pointer not valid
+ // We do not use make_shared to prevent the weak_ptr keeping the control block alive
+ std::shared_ptr<ActivationLayerInfo::LookupTable65536> ptr(new ActivationLayerInfo::LookupTable65536);
+ init_lut_fp16(ptr.get());
+ map_fp16[info] = ptr;
+ return ptr;
+ }
+}
+#endif // __aarch64__
+
+// Static function to get LutManager instance
+LUTManager &LUTManager::get_instance()
+{
+ static auto inst_ = std::make_unique<LUTManager>(); // The one, single instance.
+ return *inst_;
+}
+
+} // namespace arm_compute
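
Editor's note: a hedged usage sketch of the manager added above (aarch64 builds only; the activation enum value is an assumption about how callers key the table). Repeated requests with an equal `LUTInfo` share one 65536-entry table for as long as any kernel holds the `shared_ptr`:

```cpp
#ifdef __aarch64__
using namespace arm_compute;

void shared_lut_sketch()
{
    const LUTInfo info{ActivationLayerInfo::ActivationFunction::LOGISTIC, DataType::F16, QuantizationInfo()};

    auto lut_a = LUTManager::get_instance().get_lut_table(info);
    auto lut_b = LUTManager::get_instance().get_lut_table(info);
    // Same key -> same table; the map stores only a weak_ptr, so the table is
    // rebuilt on the next request once every shared_ptr has been released.
    // assert(lut_a.get() == lut_b.get());
}
#endif // __aarch64__
```
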
diff --git a/src/core/helpers/LUTManager.h b/src/core/helpers/LUTManager.h
new file mode 100644
index 0000000000..4e13ead7e3
--- /dev/null
+++ b/src/core/helpers/LUTManager.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CORE_HELPERS_LUTMANAGER_H
+#define ACL_SRC_CORE_HELPERS_LUTMANAGER_H
+
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include <map>
+#include <memory>
+
+namespace arm_compute
+{
+
+struct LUTInfo
+{
+ ActivationLayerInfo::ActivationFunction act;
+ DataType dt;
+ QuantizationInfo qinfo;
+ // Operators enable use of a map with LUTInfo as the key
+ friend bool operator<(const LUTInfo &l, const LUTInfo &r)
+ {
+ return (l.act < r.act) || ((l.act == r.act) && (l.dt < r.dt)) ||
+ ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() < r.qinfo.scale())) ||
+ ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() == r.qinfo.scale()) &&
+ (l.qinfo.offset() < r.qinfo.offset()));
+ }
+ bool operator==(const LUTInfo &l) const
+ {
+ return this->act == l.act && this->dt == l.dt && this->qinfo == l.qinfo;
+ }
+};
+
+/* Class to handle getting the lookup table */
+class LUTManager
+{
+public:
+ LUTManager() = default;
+
+ static LUTManager &get_instance();
+#ifdef __aarch64__
+ std::shared_ptr<ActivationLayerInfo::LookupTable65536> get_lut_table(LUTInfo info);
+
+private:
+ std::map<LUTInfo, std::weak_ptr<ActivationLayerInfo::LookupTable65536>> map_fp16{};
+#endif // __aarch64__
+};
+
+} // namespace arm_compute
+#endif // ACL_SRC_CORE_HELPERS_LUTMANAGER_H
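
Editor's note: the strict-weak ordering above is easier to audit with `std::tie`. A sketch of an equivalent free function, assuming (as the code above already does) that the `scale()` and `offset()` results support `operator<` and are returned by reference:

```cpp
#include <tuple>

inline bool lut_info_less(const LUTInfo &l, const LUTInfo &r)
{
    // Lexicographic member-wise comparison; std::tie makes the tie-breaking
    // order explicit and avoids slips such as comparing a field with itself.
    return std::tie(l.act, l.dt, l.qinfo.scale(), l.qinfo.offset()) <
           std::tie(r.act, r.dt, r.qinfo.scale(), r.qinfo.offset());
}
```
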
diff --git a/src/core/helpers/MemoryHelpers.h b/src/core/helpers/MemoryHelpers.h
index e751e6025d..dd094b414c 100644
--- a/src/core/helpers/MemoryHelpers.h
+++ b/src/core/helpers/MemoryHelpers.h
@@ -24,9 +24,9 @@
#ifndef SRC_COMMON_MEMORY_HELPERS_H
#define SRC_COMMON_MEMORY_HELPERS_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include <memory>
@@ -41,12 +41,19 @@ inline int offset_int_vec(int offset)
}
template <typename TensorType>
-using WorkspaceData = std::vector<std::pair<int, std::unique_ptr<TensorType>>>;
+struct WorkspaceDataElement
+{
+ int slot{-1};
+ experimental::MemoryLifetime lifetime{experimental::MemoryLifetime::Temporary};
+ std::unique_ptr<TensorType> tensor{nullptr};
+};
template <typename TensorType>
-WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirements &mem_reqs,
- MemoryGroup &mgroup,
- ITensorPack &run_pack)
+using WorkspaceData = std::vector<WorkspaceDataElement<TensorType>>;
+
+template <typename TensorType>
+WorkspaceData<TensorType>
+manage_workspace(const experimental::MemoryRequirements &mem_reqs, MemoryGroup &mgroup, ITensorPack &run_pack)
{
ITensorPack dummy_pack = ITensorPack();
return manage_workspace<TensorType>(mem_reqs, mgroup, run_pack, dummy_pack);
@@ -55,24 +62,26 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement
template <typename TensorType>
WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirements &mem_reqs,
MemoryGroup &mgroup,
- ITensorPack &run_pack, ITensorPack &prep_pack)
+ ITensorPack &run_pack,
+ ITensorPack &prep_pack)
{
WorkspaceData<TensorType> workspace_memory;
- for(const auto &req : mem_reqs)
+ for (const auto &req : mem_reqs)
{
- if(req.size == 0)
+ if (req.size == 0)
{
continue;
}
- const auto aux_info = TensorInfo{ TensorShape(req.size), 1, DataType::U8 };
- workspace_memory.emplace_back(req.slot, std::make_unique<TensorType>());
+ const auto aux_info = TensorInfo{TensorShape(req.size), 1, DataType::U8};
+ workspace_memory.emplace_back(
+ WorkspaceDataElement<TensorType>{req.slot, req.lifetime, std::make_unique<TensorType>()});
- auto aux_tensor = workspace_memory.back().second.get();
+ auto aux_tensor = workspace_memory.back().tensor.get();
ARM_COMPUTE_ERROR_ON_NULLPTR(aux_tensor);
- aux_tensor->allocator()->init(aux_info);
+ aux_tensor->allocator()->init(aux_info, req.alignment);
- if(req.lifetime == experimental::MemoryLifetime::Temporary)
+ if (req.lifetime == experimental::MemoryLifetime::Temporary)
{
mgroup.manage(aux_tensor);
}
@@ -83,13 +92,48 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement
run_pack.add_tensor(req.slot, aux_tensor);
}
- for(auto &mem : workspace_memory)
+ for (auto &mem : workspace_memory)
{
- auto tensor = mem.second.get();
+ auto tensor = mem.tensor.get();
tensor->allocator()->allocate();
}
return workspace_memory;
}
+
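+/** Remove workspace tensors with Prepare lifetime, releasing them and dropping them from the prepare pack */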
+template <typename TensorType>
+void release_prepare_tensors(WorkspaceData<TensorType> &workspace, ITensorPack &prep_pack)
+{
+ workspace.erase(std::remove_if(workspace.begin(), workspace.end(),
+ [&prep_pack](auto &wk)
+ {
+ const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare;
+ if (to_erase)
+ {
+ prep_pack.remove_tensor(wk.slot);
+ }
+ return to_erase;
+ }),
+ workspace.end());
+}
+
+/** Utility function to free workspace tensors whose memory requirement lifetime is marked as Prepare */
+template <typename TensorType>
+void release_temporaries(const experimental::MemoryRequirements &mem_reqs, WorkspaceData<TensorType> &workspace)
+{
+ for (auto &ws : workspace)
+ {
+ const int slot = ws.slot;
+ for (auto &m : mem_reqs)
+ {
+ if (m.slot == slot && m.lifetime == experimental::MemoryLifetime::Prepare)
+ {
+ auto tensor = ws.tensor.get();
+ tensor->allocator()->free();
+ break;
+ }
+ }
+ }
+}
} // namespace arm_compute
#endif /* SRC_COMMON_MEMORY_HELPERS_H */
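
Editor's note: a hedged sketch of the intended lifecycle for these helpers. The operator interface (`op.workspace()`, `prepare()`, `run()`) and the pack contents are assumptions for illustration: Temporary buffers stay managed by the memory group across runs, while Prepare buffers can be freed once the one-off preparation stage has run.

```cpp
template <typename TensorType, typename OperatorType>
void run_once(OperatorType &op, ITensorPack &run_pack, ITensorPack &prep_pack, MemoryGroup &mgroup)
{
    // Allocate every auxiliary tensor requested by the operator.
    auto workspace = manage_workspace<TensorType>(op.workspace(), mgroup, run_pack, prep_pack);

    op.prepare(prep_pack); // e.g. one-off weight reshaping

    // Prepare-lifetime scratch is dead after prepare(): free it eagerly.
    release_temporaries<TensorType>(op.workspace(), workspace);

    op.run(run_pack);
}
```
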
diff --git a/src/core/helpers/PoolingHelpers.h b/src/core/helpers/PoolingHelpers.h
new file mode 100644
index 0000000000..9ef045f472
--- /dev/null
+++ b/src/core/helpers/PoolingHelpers.h
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_POOLINGHELPERS_H
+#define SRC_CORE_HELPERS_POOLINGHELPERS_H
+
+#include "src/core/NEON/NEAsymm.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+
+inline float calculate_avg_scale_pool3d(bool exclude_padding,
+ const Coordinates &id,
+ const int pool_size_x,
+ const int pool_size_y,
+ const int pool_size_z,
+ const int upper_bound_w,
+ const int upper_bound_h,
+ const int upper_bound_d,
+ const int pad_x,
+ const int pad_y,
+ const int pad_z,
+ const int stride_x,
+ const int stride_y,
+ const int stride_z)
+{
+ // Based on NDHWC
+ int start_x = id[1] * stride_x - pad_x;
+ int start_y = id[2] * stride_y - pad_y;
+ int start_z = id[3] * stride_z - pad_z;
+
+ const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
+ const int end_z = std::min(start_z + pool_size_z, upper_bound_d);
+ if (exclude_padding)
+ {
+ start_x = std::max(0, start_x);
+ start_y = std::max(0, start_y);
+ start_z = std::max(0, start_z);
+ }
+ return 1.f / ((end_y - start_y) * (end_x - start_x) * (end_z - start_z));
+}
+
+inline float calculate_avg_scale_pool2d(bool exclude_padding,
+ DataLayout data_layout,
+ const Coordinates &id,
+ const int pool_size_x,
+ const int pool_size_y,
+ const int upper_bound_w,
+ const int upper_bound_h,
+ const int pad_x,
+ const int pad_y,
+ const int stride_x,
+ const int stride_y)
+{
+ const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ int start_x = id[idx_width] * stride_x - pad_x;
+ int start_y = id[idx_height] * stride_y - pad_y;
+
+ const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
+ if (exclude_padding)
+ {
+ start_x = std::max(0, start_x);
+ start_y = std::max(0, start_y);
+ }
+ return 1.f / ((end_y - start_y) * (end_x - start_x));
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
+quantize(float val, const UniformQuantizationInfo &info)
+{
+ return quantize_qasymm8_signed(val, info);
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
+quantize(float val, const UniformQuantizationInfo &info)
+{
+ return quantize_qasymm8(val, info);
+}
+
+template <typename T>
+inline T vcvtq_q32_f32(float32x4_t values);
+
+template <>
+inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
+{
+ return vcvtq_u32_f32(values);
+}
+
+template <>
+inline int32x4_t vcvtq_q32_f32(float32x4_t values)
+{
+ return vcvtq_s32_f32(values);
+}
+
+template <typename T>
+inline float32x4_t vcvtq_f32_q32(T values);
+
+template <>
+inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
+{
+ return vcvtq_f32_u32(values);
+}
+
+template <>
+inline float32x4_t vcvtq_f32_q32(int32x4_t values)
+{
+ return vcvtq_f32_s32(values);
+}
+
+template <typename Tout>
+inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc,
+ const float quant_rescale,
+ const float scale_pooling,
+ const int32_t new_offset);
+
+template <>
+inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc,
+ const float quant_rescale,
+ const float scale_pooling,
+ const int32_t new_offset)
+{
+ const float new_scale = quant_rescale / scale_pooling;
+ return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
+}
+
+template <>
+inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc,
+ const float quant_rescale,
+ const float scale_pooling,
+ const int32_t new_offset)
+{
+ const float new_scale = quant_rescale / scale_pooling;
+ return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
+}
+
+template <typename Tin, typename Tout>
+inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);
+
+template <>
+inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
+{
+ const float32x4x4_t acc = {{
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
+ }};
+ return vquantize(acc, requant_qinfo);
+}
+
+template <>
+inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
+{
+ const float32x4x4_t acc = {{
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
+ }};
+ return vquantize_signed(acc, requant_qinfo);
+}
+
+template <typename T>
+inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);
+
+template <>
+inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
+{
+ const float32x4x2_t acc = {{
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
+ }};
+ return vquantize(acc, requant_qinfo);
+}
+
+template <>
+inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
+{
+ const float32x4x2_t acc = {{
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
+ }};
+ return vquantize_signed(acc, requant_qinfo);
+}
+
+} // namespace
+} // namespace cpu
+} // namespace arm_compute
+#endif /* SRC_CORE_HELPERS_POOLINGHELPERS_H */
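
Editor's note: the average-pool scale helpers above return the reciprocal of the effective window area, clamped to the tensor bounds. A self-contained worked example of the 2D case for one output element (this restates the arithmetic of `calculate_avg_scale_pool2d`, minus the layout lookup):

```cpp
#include <algorithm>

// One output element at (0, 0): 3x3 pool, stride 1, pad 1, 8x8 input, padding
// excluded from the average (so upper_bound_w/h are the raw input extents).
float corner_avg_scale()
{
    const int pool = 3, stride = 1, pad = 1;
    const int upper_bound_w = 8, upper_bound_h = 8;

    int start_x = 0 * stride - pad; // window starts one element off-tensor: -1
    int start_y = 0 * stride - pad;
    const int end_x = std::min(start_x + pool, upper_bound_w); // 2
    const int end_y = std::min(start_y + pool, upper_bound_h); // 2
    start_x = std::max(0, start_x); // exclude_padding clips the start to 0
    start_y = std::max(0, start_y);

    return 1.f / ((end_y - start_y) * (end_x - start_x)); // 1/(2*2) = 0.25
}
```
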
diff --git a/src/core/helpers/ScaleHelpers.h b/src/core/helpers/ScaleHelpers.h
index e769bba782..47605e7385 100644
--- a/src/core/helpers/ScaleHelpers.h
+++ b/src/core/helpers/ScaleHelpers.h
@@ -50,8 +50,12 @@ namespace scale_helpers
*
* @return The bilinear interpolated pixel value
*/
-inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stride, float dx, float dy,
- UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
+inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr,
+ size_t stride,
+ float dx,
+ float dy,
+ UniformQuantizationInfo iq_info,
+ UniformQuantizationInfo oq_info)
{
ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
@@ -85,8 +89,12 @@ inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stri
*
* @return The bilinear interpolated pixel value
*/
-inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride, float dx, float dy,
- UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
+inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr,
+ size_t stride,
+ float dx,
+ float dy,
+ UniformQuantizationInfo iq_info,
+ UniformQuantizationInfo oq_info)
{
ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
@@ -122,9 +130,8 @@ inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride
*
* @return The pixel at (x, y) using area interpolation.
*/
-inline uint8_t
-pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr,
- float hr, int x, int y)
+inline uint8_t pixel_area_c1u8_clamp(
+ const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y)
{
ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
@@ -159,7 +166,7 @@ pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t widt
// Sum pixels in area
int sum = 0;
- for(int j = yi + y_from, je = yi + y_to; j <= je; ++j)
+ for (int j = yi + y_from, je = yi + y_to; j <= je; ++j)
{
const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from;
sum = std::accumulate(ptr, ptr + x_elements, sum);
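
Editor's note: the quantized helpers above all reduce to the same float-domain blend (dequantize, interpolate, requantize). A self-contained restatement of that core, with the quantization steps omitted:

```cpp
#include <cstddef>

// p points at the top-left of the 2x2 neighbourhood; dx, dy in [0, 1) are the
// fractional offsets of the sample point inside that cell.
float bilinear_blend(const float *p, std::size_t stride, float dx, float dy)
{
    const float a00 = p[0], a01 = p[1];
    const float a10 = p[stride], a11 = p[stride + 1];
    return a00 * (1.f - dx) * (1.f - dy) + a01 * dx * (1.f - dy) +
           a10 * (1.f - dx) * dy + a11 * dx * dy;
}
```
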
diff --git a/src/core/helpers/SoftmaxHelpers.cpp b/src/core/helpers/SoftmaxHelpers.cpp
index 71b971af31..8184991ab5 100644
--- a/src/core/helpers/SoftmaxHelpers.cpp
+++ b/src/core/helpers/SoftmaxHelpers.cpp
@@ -29,7 +29,7 @@ namespace softmax_helpers
{
PermutationVector get_permutation_vector_from_softmax_axis(size_t axis)
{
- switch(axis)
+ switch (axis)
{
case 1:
return PermutationVector(1U, 0U, 2U, 3U);
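
Editor's note: the helper above maps a softmax axis to the permutation that brings it to the front. A short usage sketch for axis 1, the case visible in this hunk (includes assumed):

```cpp
using namespace arm_compute;

// Softmax along axis 1 is implemented as permute -> axis-0 softmax -> permute
// back; for axis 1 the returned vector swaps dims 0 and 1 and is self-inverse.
const PermutationVector perm = softmax_helpers::get_permutation_vector_from_softmax_axis(1); // {1, 0, 2, 3}
```
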
diff --git a/src/core/helpers/Utils.cpp b/src/core/helpers/Utils.cpp
new file mode 100644
index 0000000000..f8895d8a3c
--- /dev/null
+++ b/src/core/helpers/Utils.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/helpers/Utils.h"
+
+namespace arm_compute
+{
+bool has_holes(const ITensorInfo &info)
+{
+ return has_holes(info, info.num_dimensions() - 1);
+}
+
+bool has_holes(const ITensorInfo &info, size_t dimension)
+{
+ const auto &shape = info.tensor_shape();
+ const auto &strides = info.strides_in_bytes();
+ size_t squashed_bytes = info.element_size();
+
+ for (size_t dim = 0; dim <= dimension; ++dim)
+ {
+ if (strides[dim] != squashed_bytes)
+ {
+ return true;
+ }
+ squashed_bytes *= shape[dim];
+ }
+ return false;
+}
+} // namespace arm_compute
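
Editor's note: to make the stride check above concrete, a small sketch (assuming the `TensorInfo::extend_padding` API used elsewhere in ACL): padding inserts a gap between rows, which `has_holes()` reports.

```cpp
using namespace arm_compute;

bool padded_tensor_has_holes()
{
    TensorInfo info(TensorShape(16U, 8U), 1, DataType::F32);
    // Dense layout: stride_y == 16 elements * 4 bytes, so no holes.
    const bool before = has_holes(info); // false

    info.extend_padding(PaddingSize(0, 2, 0, 0)); // 2 elements of right padding
    // stride_y is now (16 + 2) * 4 bytes, leaving a gap after each row.
    const bool after = has_holes(info); // true
    return before != after;
}
```
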
diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h
index 326dc962c7..a17a78f7ee 100644
--- a/src/core/helpers/Utils.h
+++ b/src/core/helpers/Utils.h
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020-2021 Arm Limited.
+* Copyright (c) 2020-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_HELPERS_UTILS_H
-#define SRC_CORE_HELPERS_UTILS_H
+#ifndef ACL_SRC_CORE_HELPERS_UTILS_H
+#define ACL_SRC_CORE_HELPERS_UTILS_H
#include "arm_compute/core/ITensorInfo.h"
@@ -38,14 +38,14 @@ namespace arm_compute
* calculated based on the tensor shape and the strides of lower dimensions.
*/
template <typename T, typename... Ts>
-inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides)
+inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&...fixed_strides)
{
const TensorShape &shape = info.tensor_shape();
// Create strides object
Strides strides(stride_x, fixed_strides...);
- for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
+ for (size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
{
strides.set(i, shape[i - 1] * strides[i - 1]);
}
@@ -92,6 +92,29 @@ inline unsigned int get_next_power_two(unsigned int x)
return x;
}
+
+/** Check if the tensor has any holes.
+ *
+ * A hole is defined as any gap in the tensor between two consecutive values. This can be a result of extending
+ * the padding or manipulating the strides of the tensor.
+ *
+ * @param[in] info Tensor info object defining the shape of the input tensor.
+ *
+ * @note This function checks for holes in all dimensions.
+ *
+ */
+bool has_holes(const ITensorInfo &info);
+
+/** Check if the tensor has any holes.
+ *
+ * @param[in] info Tensor info object defining the shape of the input tensor.
+ * @param[in] dimension Highest dimension to check.
+ *
+ * @note This function checks for holes in all dimensions up to and including the highest dimension.
+ *
+ */
+bool has_holes(const ITensorInfo &info, size_t dimension);
+
} // namespace arm_compute
-#endif /* SRC_CORE_HELPERS_UTILS_H */
+#endif // ACL_SRC_CORE_HELPERS_UTILS_H
diff --git a/src/core/helpers/WindowHelpers.cpp b/src/core/helpers/WindowHelpers.cpp
index 75ffb71b4b..30a55fcbc6 100644
--- a/src/core/helpers/WindowHelpers.cpp
+++ b/src/core/helpers/WindowHelpers.cpp
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020-2021 Arm Limited.
+* Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,9 +25,10 @@
namespace arm_compute
{
-Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+Window
+calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
{
- if(!skip_border)
+ if (!skip_border)
{
border_size = BorderSize(0);
}
@@ -38,40 +39,47 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps,
Window window;
window.set(0, Window::Dimension(
- // Skip the border left of the image
- anchor[0] + border_size.left,
- // Skip the border right of the image
- // Make sure the window width is a multiple of the step size
- anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
- steps[0]));
+ // Skip the border left of the image
+ anchor[0] + border_size.left,
+ // Skip the border right of the image
+ // Make sure the window width is a multiple of the step size
+ anchor[0] + border_size.left +
+ ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) -
+ static_cast<int>(border_size.right)),
+ steps[0]),
+ steps[0]));
size_t n = 1;
- if(anchor.num_dimensions() > 1)
+ if (anchor.num_dimensions() > 1)
{
- window.set(1, Window::Dimension(
+ window.set(1,
+ Window::Dimension(
// Skip the border above the image
anchor[1] + border_size.top,
// Skip the border below the image
- anchor[1] + border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]),
+ anchor[1] + border_size.top +
+ ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) -
+ static_cast<int>(border_size.bottom)),
+ steps[1]),
steps[1]));
++n;
}
- if(anchor.num_dimensions() > 2)
+ if (anchor.num_dimensions() > 2)
{
window.set(2, Window::Dimension(anchor[2], std::max<size_t>(1, shape[2]), steps[2]));
++n;
}
- for(; n < anchor.num_dimensions(); ++n)
+ for (; n < anchor.num_dimensions(); ++n)
{
window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
}
- for(; n < Coordinates::num_max_dimensions; ++n)
+ for (; n < Coordinates::num_max_dimensions; ++n)
{
window.set(n, Window::Dimension(0, 1));
}
@@ -81,7 +89,7 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps,
Window calculate_max_window(const TensorShape &shape, const Steps &steps, bool skip_border, BorderSize border_size)
{
- if(!skip_border)
+ if (!skip_border)
{
border_size = BorderSize(0);
}
@@ -89,40 +97,46 @@ Window calculate_max_window(const TensorShape &shape, const Steps &steps, bool s
Window window;
window.set(0, Window::Dimension(
- // Skip the border left of the image
- border_size.left,
- // Skip the border right of the image
- // Make sure the window width is a multiple of the step size
- border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
- steps[0]));
+ // Skip the border left of the image
+ border_size.left,
+ // Skip the border right of the image
+ // Make sure the window width is a multiple of the step size
+ border_size.left +
+ ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) -
+ static_cast<int>(border_size.right)),
+ steps[0]),
+ steps[0]));
size_t n = 1;
- if(shape.num_dimensions() > 1)
+ if (shape.num_dimensions() > 1)
{
window.set(1, Window::Dimension(
- // Skip the border above the image
- border_size.top,
- // Skip the border below the image
- border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]),
- steps[1]));
+ // Skip the border above the image
+ border_size.top,
+ // Skip the border below the image
+ border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) -
+ static_cast<int>(border_size.top) -
+ static_cast<int>(border_size.bottom)),
+ steps[1]),
+ steps[1]));
++n;
}
- if(shape.num_dimensions() > 2)
+ if (shape.num_dimensions() > 2)
{
window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[2]), steps[2]));
++n;
}
- for(; n < shape.num_dimensions(); ++n)
+ for (; n < shape.num_dimensions(); ++n)
{
window.set(n, Window::Dimension(0, std::max<size_t>(1, shape[n])));
}
- for(; n < Coordinates::num_max_dimensions; ++n)
+ for (; n < Coordinates::num_max_dimensions; ++n)
{
window.set(n, Window::Dimension(0, 1));
}
@@ -138,40 +152,42 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step
Window window;
window.set(0, Window::Dimension(
- // move the anchor to the start from the border
- anchor[0] - border_size.left,
- // move the anchor to include the right end border
- // Make sure the window width is a multiple of the step size
- anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
- steps[0]));
+ // move the anchor to the start from the border
+ anchor[0] - border_size.left,
+ // move the anchor to include the right end border
+ // Make sure the window width is a multiple of the step size
+ anchor[0] - border_size.left +
+ ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
+ steps[0]));
size_t n = 1;
- if(anchor.num_dimensions() > 1)
+ if (anchor.num_dimensions() > 1)
{
window.set(1, Window::Dimension(
- // Include the border above the image
- anchor[1] - border_size.top,
- // Include the border below the image
- anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
- steps[1]));
+ // Include the border above the image
+ anchor[1] - border_size.top,
+ // Include the border below the image
+ anchor[1] - border_size.top +
+ ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
+ steps[1]));
++n;
}
- if(anchor.num_dimensions() > 2)
+ if (anchor.num_dimensions() > 2)
{
window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[n]), steps[2]));
++n;
}
- for(; n < anchor.num_dimensions(); ++n)
+ for (; n < anchor.num_dimensions(); ++n)
{
window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
}
- for(; n < Coordinates::num_max_dimensions; ++n)
+ for (; n < Coordinates::num_max_dimensions; ++n)
{
window.set(n, Window::Dimension(0, 1));
}
@@ -179,9 +195,12 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step
return window;
}
-Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+Window calculate_max_window_horizontal(const ValidRegion &valid_region,
+ const Steps &steps,
+ bool skip_border,
+ BorderSize border_size)
{
- if(skip_border)
+ if (skip_border)
{
border_size.top = 0;
border_size.bottom = 0;
@@ -198,37 +217,133 @@ Window calculate_max_window_horizontal(const ValidRegion &valid_region, const St
Window window;
window.set(0, Window::Dimension(
- // Skip the border left of the image
- anchor[0] + border_size.left,
- // Skip the border right of the image
- // Make sure the window width is a multiple of the step size
- anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
- steps[0]));
+ // Skip the border left of the image
+ anchor[0] + border_size.left,
+ // Skip the border right of the image
+ // Make sure the window width is a multiple of the step size
+ anchor[0] + border_size.left +
+ ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) -
+ static_cast<int>(border_size.right)),
+ steps[0]),
+ steps[0]));
size_t n = 1;
- if(anchor.num_dimensions() > 1)
+ if (anchor.num_dimensions() > 1)
{
window.set(1, Window::Dimension(
- // Skip the border above the image
- anchor[1] - border_size.top,
- // Skip the border below the image
- anchor[1] + shape[1] + border_size.bottom,
- 1));
+ // Skip the border above the image
+ anchor[1] - border_size.top,
+ // Skip the border below the image
+ anchor[1] + shape[1] + border_size.bottom, 1));
++n;
}
- for(; n < anchor.num_dimensions(); ++n)
+ for (; n < anchor.num_dimensions(); ++n)
{
window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
}
- for(; n < Coordinates::num_max_dimensions; ++n)
+ for (; n < Coordinates::num_max_dimensions; ++n)
{
window.set(n, Window::Dimension(0, 1));
}
return window;
}
+
+std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &src0, const ITensorInfo &src1)
+{
+ const auto &shape0 = src0.tensor_shape();
+ const auto &shape1 = src1.tensor_shape();
+ const auto &strides0 = src0.strides_in_bytes();
+ const auto &strides1 = src1.strides_in_bytes();
+ const auto num_dimensions = std::max(src0.num_dimensions(), src1.num_dimensions());
+
+ Window win;
+ size_t split_dimension = Window::DimY;
+ size_t dim = 0;
+
+ size_t squashed_bytes = src0.element_size();
+
+ // Try to squash the low dimensions together.
+ for (; dim < num_dimensions; ++dim)
+ {
+ if (shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes)
+ {
+ break;
+ }
+
+ squashed_bytes *= shape0[dim];
+ }
+
+ if (dim == num_dimensions)
+ {
+ auto squashed_elements = squashed_bytes / src0.element_size();
+
+ split_dimension = Window::DimX;
+
+ // The input tensors can be interpreted as 1D array.
+ win.set(0, Window::Dimension(0, squashed_elements, 1));
+
+ for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
+ {
+ win.set(dim, Window::Dimension(0, 1, 1));
+ }
+ }
+ else
+ {
+ // Generates the max window.
+ for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
+ {
+ win.set(dim, Window::Dimension(0, std::max(shape0[dim], shape1[dim]), 1));
+ }
+ }
+
+ return std::make_pair(win, split_dimension);
+}
+
+std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &src)
+{
+ const auto &shape = src.tensor_shape();
+ const auto &strides = src.strides_in_bytes();
+ const auto num_dimensions = src.num_dimensions();
+
+ Window win;
+ size_t split_dimension = Window::DimY;
+ size_t dim = 0;
+ size_t squashed_bytes = src.element_size();
+
+ // Try to squash the low dimensions together.
+ for (; dim < num_dimensions; ++dim)
+ {
+ if (strides[dim] != squashed_bytes)
+ {
+ break;
+ }
+ squashed_bytes *= shape[dim];
+ }
+ if (dim == num_dimensions)
+ {
+ const auto squashed_elements = squashed_bytes / src.element_size();
+ split_dimension = Window::DimX;
+        // The input tensor can be interpreted as a 1D array.
+ win.set(0, Window::Dimension(0, squashed_elements, 1));
+ for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
+ {
+ win.set(dim, Window::Dimension(0, 1, 1));
+ }
+ }
+ else
+ {
+ // Generate the max window.
+ for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
+ {
+ win.set(dim, Window::Dimension(0, shape[dim], 1));
+ }
+ }
+ return std::make_pair(win, split_dimension);
+}
+
} // namespace arm_compute
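
The contiguity test used by both overloads above is easiest to see on concrete strides. The
following stand-alone sketch (harness and names are illustrative, not part of this patch)
reproduces the squash check for a dense and a row-padded 4x4 float tensor:

    #include <cstddef>
    #include <iostream>

    // Dimensions can be squashed only while each stride equals the running byte count.
    bool can_squash(const size_t *shape, const size_t *strides, size_t num_dims, size_t element_size)
    {
        size_t squashed_bytes = element_size;
        for (size_t dim = 0; dim < num_dims; ++dim)
        {
            if (strides[dim] != squashed_bytes)
            {
                return false; // a gap (e.g. row padding) breaks contiguity
            }
            squashed_bytes *= shape[dim];
        }
        return true;
    }

    int main()
    {
        const size_t shape[]  = {4, 4};
        const size_t dense[]  = {4, 16}; // tightly packed float32: squashable
        const size_t padded[] = {4, 20}; // one padded element per row: not squashable
        std::cout << can_squash(shape, dense, 2, 4) << ' ' << can_squash(shape, padded, 2, 4) << '\n'; // 1 0
    }

When the test passes, the window collapses to a single x-dimension of squashed_bytes /
element_size steps and the preferred split dimension becomes Window::DimX; otherwise the
per-dimension max window is kept and Window::DimY remains the split hint.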
diff --git a/src/core/helpers/WindowHelpers.h b/src/core/helpers/WindowHelpers.h
index dad5da62d3..e404c18e8a 100644
--- a/src/core/helpers/WindowHelpers.h
+++ b/src/core/helpers/WindowHelpers.h
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020-2021 Arm Limited.
+* Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,23 +43,13 @@ namespace arm_compute
* influence the returned value.
*/
template <typename... Ts>
-bool update_window_and_padding(Window &win, Ts &&... patterns)
+bool update_window_and_padding(Window &win, Ts &&...patterns)
{
bool window_changed = false;
- utility::for_each([&](const IAccessWindow & w)
- {
- window_changed |= w.update_window_if_needed(win);
- },
- patterns...);
-
- bool padding_changed = false;
+ utility::for_each([&](const IAccessWindow &w) { window_changed |= w.update_window_if_needed(win); }, patterns...);
- utility::for_each([&](IAccessWindow & w)
- {
- padding_changed |= w.update_padding_if_needed(win);
- },
- patterns...);
+ utility::for_each([&](IAccessWindow &w) { w.update_padding_if_needed(win); }, patterns...);
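+
+    // Padding is still updated above; only window changes are reflected in the return value.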
return window_changed;
}
@@ -71,18 +61,18 @@ bool update_window_and_padding(Window &win, Ts &&... patterns)
* @return Intersection of all regions.
*/
template <typename... Ts>
-ValidRegion intersect_valid_regions(const Ts &... regions)
+ValidRegion intersect_valid_regions(const Ts &...regions)
{
- auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion
+ auto intersect = [](const ValidRegion &r1, const ValidRegion &r2) -> ValidRegion
{
ValidRegion region;
- for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d)
+ for (size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d)
{
region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d]));
}
- for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d)
+ for (size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d)
{
region.shape.set(d, std::min(r1.shape[d], r2.shape[d]));
}
@@ -103,7 +93,10 @@ ValidRegion intersect_valid_regions(const Ts &... regions)
*
* @return The maximum window the kernel can be executed on.
*/
-Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window(const ValidRegion &valid_region,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize());
/** Calculate the maximum window for a given tensor shape and border setting
*
@@ -114,7 +107,10 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps
*
* @return The maximum window the kernel can be executed on.
*/
-Window calculate_max_window(const TensorShape &shape, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window(const TensorShape &shape,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize());
/** Calculate the maximum window for a given tensor shape and border setting
*
@@ -125,7 +121,10 @@ Window calculate_max_window(const TensorShape &shape, const Steps &steps = Steps
*
* @return The maximum window the kernel can be executed on.
*/
-inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
+inline Window calculate_max_window(const ITensorInfo &info,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize())
{
return calculate_max_window(info.tensor_shape(), steps, skip_border, border_size);
}
@@ -139,7 +138,10 @@ inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps =
*
* @return The maximum window the kernel can be executed on.
*/
-Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window_horizontal(const ValidRegion &valid_region,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize());
/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
*
@@ -150,7 +152,10 @@ Window calculate_max_window_horizontal(const ValidRegion &valid_region, const St
*
* @return The maximum window the kernel can be executed on.
*/
-inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
+inline Window calculate_max_window_horizontal(const ITensorInfo &info,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize())
{
return calculate_max_window_horizontal(info.valid_region(), steps, skip_border, border_size);
}
@@ -163,7 +168,9 @@ inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Ste
*
* @return The maximum window the kernel can be executed on.
*/
-Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps = Steps(), BorderSize border_size = BorderSize());
+Window calculate_max_enlarged_window(const ValidRegion &valid_region,
+ const Steps &steps = Steps(),
+ BorderSize border_size = BorderSize());
/** Calculate the maximum window for a given tensor shape and border setting. The window will also include the border.
*
@@ -173,11 +180,38 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step
*
* @return The maximum window the kernel can be executed on.
*/
-inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize())
+inline Window calculate_max_enlarged_window(const ITensorInfo &info,
+ const Steps &steps = Steps(),
+ BorderSize border_size = BorderSize())
{
return calculate_max_enlarged_window(info.valid_region(), steps, border_size);
}
+/** Calculate the squashed or maximum window for the given tensor shape.
+ *
+ * If the tensor data resides contiguously in memory, the tensor can be interpreted
+ * as a 1D array and all the dimensions can be squashed together into the x-dimension.
+ * Otherwise, the max window for the given tensor shape is generated.
+ *
+ * @param[in] src Tensor info object defining the shape of the input tensor.
+ *
+ * @return The squashed or maximum window the kernel can be executed on and the preferred split dimension.
+ */
+std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &src);
+
+/** Calculate the squashed or maximum window for the given tensor shapes.
+ *
+ * If the tensor data resides contiguously in memory, the tensors can be interpreted
+ * as 1D arrays and all the dimensions can be squashed together into the x-dimension.
+ * Otherwise, the max window for the given tensor shapes is generated.
+ *
+ * @param[in] src0 Tensor info object defining the shape of the first input tensor.
+ * @param[in] src1 Tensor info object defining the shape of the second input tensor.
+ *
+ * @return The squashed or maximum window the kernel can be executed on and the preferred split dimension.
+ */
+std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &src0, const ITensorInfo &src1);
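+// Typical use (illustrative; kernel and scheduler plumbing are not part of this header):
+//   Window win;
+//   size_t split_dim;
+//   std::tie(win, split_dim) = calculate_squashed_or_max_window(src0_info, src1_info);
+//   // configure the kernel on `win`, then split work over `split_dim` when scheduling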
+
/** Function to compute the output shape and window for the given inputs
*
* @param[in] shapes Input tensor shapes
@@ -185,7 +219,7 @@ inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps
* @return A pair of the shape and window
*/
template <typename... Shapes>
-std::pair<TensorShape, Window> compute_output_shape_and_window(const Shapes &... shapes)
+std::pair<TensorShape, Window> compute_output_shape_and_window(const Shapes &...shapes)
{
const TensorShape out_shape = TensorShape::broadcast_shape(shapes...);
return std::make_pair(out_shape, calculate_max_window(out_shape));
diff --git a/src/core/utils/ActivationFunctionUtils.cpp b/src/core/utils/ActivationFunctionUtils.cpp
new file mode 100644
index 0000000000..017170a0c5
--- /dev/null
+++ b/src/core/utils/ActivationFunctionUtils.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+
+#include <map>
+
+namespace arm_compute
+{
+const std::string &string_from_activation_func(const ActivationFunction &act)
+{
+ static std::map<ActivationFunction, const std::string> act_map = {{ActivationFunction::ABS, "ABS"},
+ {ActivationFunction::LINEAR, "LINEAR"},
+ {ActivationFunction::LOGISTIC, "LOGISTIC"},
+ {ActivationFunction::RELU, "RELU"},
+ {ActivationFunction::BOUNDED_RELU, "BRELU"},
+ {ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU"},
+ {ActivationFunction::LEAKY_RELU, "LRELU"},
+ {ActivationFunction::SOFT_RELU, "SRELU"},
+ {ActivationFunction::ELU, "ELU"},
+ {ActivationFunction::SQRT, "SQRT"},
+ {ActivationFunction::SQUARE, "SQUARE"},
+ {ActivationFunction::TANH, "TANH"},
+ {ActivationFunction::IDENTITY, "IDENTITY"},
+ {ActivationFunction::HARD_SWISH, "HARD_SWISH"},
+ {ActivationFunction::SWISH, "SWISH"},
+                                                                     {ActivationFunction::GELU, "GELU"}};
+
+ return act_map[act];
+}
+
+} // namespace arm_compute
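
All of the string_from_* helpers introduced in this commit follow the same pattern as the
function above: a function-local static map queried through operator[]. A general std::map
property worth keeping in mind (not specific to this patch): operator[] default-inserts a
value for a missing key, so an enum value absent from the table yields a reference to a
freshly inserted empty string rather than an error. A minimal stand-alone demonstration:

    #include <iostream>
    #include <map>
    #include <string>

    enum class Fn { ABS, RELU, GELU };

    const std::string &name_of(Fn f)
    {
        static std::map<Fn, const std::string> table = {{Fn::ABS, "ABS"}, {Fn::RELU, "RELU"}};
        return table[f]; // default-inserts "" for keys missing from the table
    }

    int main()
    {
        std::cout << '"' << name_of(Fn::RELU) << "\" \"" << name_of(Fn::GELU) << "\"\n"; // "RELU" ""
    }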
diff --git a/src/core/utils/AssemblyUtils.cpp b/src/core/utils/AssemblyUtils.cpp
new file mode 100644
index 0000000000..d97ea42091
--- /dev/null
+++ b/src/core/utils/AssemblyUtils.cpp
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/utils/AssemblyUtils.h"
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+namespace arm_compute
+{
+namespace assembly_utils
+{
+arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
+{
+ arm_gemm::Activation gemm_act;
+
+ // Early exit in case lower bound is other than 0, as it's not yet supported
+ if (act.b() != 0.f)
+ {
+ return gemm_act;
+ }
+
+ switch (act.activation())
+ {
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ gemm_act.type = arm_gemm::Activation::Type::ReLU;
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ gemm_act.type = arm_gemm::Activation::Type::BoundedReLU;
+ gemm_act.param1 = act.a();
+ gemm_act.param2 = 0.f;
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ gemm_act.type = arm_gemm::Activation::Type::BoundedReLU;
+ gemm_act.param1 = act.a();
+ gemm_act.param2 = act.b();
+ break;
+ default:
+ gemm_act.type = arm_gemm::Activation::Type::None;
+ }
+
+ return gemm_act;
+}
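+
+// Example (illustrative): ActivationLayerInfo(ActivationFunction::BOUNDED_RELU, 6.f) has
+// b() == 0.f and maps to {Type::BoundedReLU, param1 = 6.f, param2 = 0.f}; any info with
+// b() != 0.f takes the early exit and returns the default-constructed (Type::None) activation.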
+
+arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info)
+{
+ return arm_conv::PaddingValues{pad_stride_info.pad_left(), pad_stride_info.pad_top(), pad_stride_info.pad_right(),
+ pad_stride_info.pad_bottom()};
+}
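+
+// Example (illustrative): PadStrideInfo(1 /* stride_x */, 1 /* stride_y */, 2 /* pad_x */,
+// 3 /* pad_y */) yields PaddingValues{2, 3, 2, 3}, i.e. {left, top, right, bottom}.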
+
+arm_gemm::WeightFormat map_to_arm_gemm_weight_format(const arm_compute::WeightFormat &weight_format)
+{
+    arm_gemm::WeightFormat gemm_weight_format;
+
+    switch (weight_format)
+    {
+        case arm_compute::WeightFormat::UNSPECIFIED:
+            gemm_weight_format = arm_gemm::WeightFormat::UNSPECIFIED;
+            break;
+        case arm_compute::WeightFormat::ANY:
+            gemm_weight_format = arm_gemm::WeightFormat::ANY;
+            break;
+        case arm_compute::WeightFormat::OHWI:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWI;
+            break;
+        case arm_compute::WeightFormat::OHWIo2:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo2;
+            break;
+        case arm_compute::WeightFormat::OHWIo4:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo4;
+            break;
+        case arm_compute::WeightFormat::OHWIo8:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo8;
+            break;
+        case arm_compute::WeightFormat::OHWIo16:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo16;
+            break;
+        case arm_compute::WeightFormat::OHWIo32:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo32;
+            break;
+        case arm_compute::WeightFormat::OHWIo64:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo64;
+            break;
+        case arm_compute::WeightFormat::OHWIo128:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo128;
+            break;
+        case arm_compute::WeightFormat::OHWIo4i2:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo4i2;
+            break;
+        case arm_compute::WeightFormat::OHWIo4i2_bf16:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo4i2_bf16;
+            break;
+        case arm_compute::WeightFormat::OHWIo8i2:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo8i2;
+            break;
+        case arm_compute::WeightFormat::OHWIo8i2_bf16:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo8i2_bf16;
+            break;
+        case arm_compute::WeightFormat::OHWIo16i2:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo16i2;
+            break;
+        case arm_compute::WeightFormat::OHWIo16i2_bf16:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo16i2_bf16;
+            break;
+        case arm_compute::WeightFormat::OHWIo32i2:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo32i2;
+            break;
+        case arm_compute::WeightFormat::OHWIo32i2_bf16:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo32i2_bf16;
+            break;
+        case arm_compute::WeightFormat::OHWIo64i2:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo64i2;
+            break;
+        case arm_compute::WeightFormat::OHWIo64i2_bf16:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo64i2_bf16;
+            break;
+        case arm_compute::WeightFormat::OHWIo4i4:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo4i4;
+            break;
+        case arm_compute::WeightFormat::OHWIo4i4_bf16:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo4i4_bf16;
+            break;
+        case arm_compute::WeightFormat::OHWIo8i4:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo8i4;
+            break;
+        case arm_compute::WeightFormat::OHWIo8i4_bf16:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo8i4_bf16;
+            break;
+        case arm_compute::WeightFormat::OHWIo16i4:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo16i4;
+            break;
+        case arm_compute::WeightFormat::OHWIo16i4_bf16:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo16i4_bf16;
+            break;
+        case arm_compute::WeightFormat::OHWIo32i4:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo32i4;
+            break;
+        case arm_compute::WeightFormat::OHWIo32i4_bf16:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo32i4_bf16;
+            break;
+        case arm_compute::WeightFormat::OHWIo64i4:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo64i4;
+            break;
+        case arm_compute::WeightFormat::OHWIo64i4_bf16:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo64i4_bf16;
+            break;
+        case arm_compute::WeightFormat::OHWIo2i8:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo2i8;
+            break;
+        case arm_compute::WeightFormat::OHWIo4i8:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo4i8;
+            break;
+        case arm_compute::WeightFormat::OHWIo8i8:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo8i8;
+            break;
+        case arm_compute::WeightFormat::OHWIo16i8:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo16i8;
+            break;
+        case arm_compute::WeightFormat::OHWIo32i8:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo32i8;
+            break;
+        case arm_compute::WeightFormat::OHWIo64i8:
+            gemm_weight_format = arm_gemm::WeightFormat::OHWIo64i8;
+            break;
+        default:
+            gemm_weight_format = arm_gemm::WeightFormat::UNSPECIFIED;
+    }
+    return gemm_weight_format;
+}
+
+arm_compute::WeightFormat map_to_arm_compute_weight_format(const arm_gemm::WeightFormat &weight_format)
+{
+    arm_compute::WeightFormat acl_weight_format;
+
+    switch (weight_format)
+    {
+        case arm_gemm::WeightFormat::UNSPECIFIED:
+            acl_weight_format = arm_compute::WeightFormat::UNSPECIFIED;
+            break;
+        case arm_gemm::WeightFormat::ANY:
+            acl_weight_format = arm_compute::WeightFormat::ANY;
+            break;
+        case arm_gemm::WeightFormat::OHWI:
+            acl_weight_format = arm_compute::WeightFormat::OHWI;
+            break;
+        case arm_gemm::WeightFormat::OHWIo2:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo2;
+            break;
+        case arm_gemm::WeightFormat::OHWIo4:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo4;
+            break;
+        case arm_gemm::WeightFormat::OHWIo8:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo8;
+            break;
+        case arm_gemm::WeightFormat::OHWIo16:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo16;
+            break;
+        case arm_gemm::WeightFormat::OHWIo32:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo32;
+            break;
+        case arm_gemm::WeightFormat::OHWIo64:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo64;
+            break;
+        case arm_gemm::WeightFormat::OHWIo128:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo128;
+            break;
+        case arm_gemm::WeightFormat::OHWIo4i2:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo4i2;
+            break;
+        case arm_gemm::WeightFormat::OHWIo4i2_bf16:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo4i2_bf16;
+            break;
+        case arm_gemm::WeightFormat::OHWIo8i2:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo8i2;
+            break;
+        case arm_gemm::WeightFormat::OHWIo8i2_bf16:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo8i2_bf16;
+            break;
+        case arm_gemm::WeightFormat::OHWIo16i2:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo16i2;
+            break;
+        case arm_gemm::WeightFormat::OHWIo16i2_bf16:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo16i2_bf16;
+            break;
+        case arm_gemm::WeightFormat::OHWIo32i2:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo32i2;
+            break;
+        case arm_gemm::WeightFormat::OHWIo32i2_bf16:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo32i2_bf16;
+            break;
+        case arm_gemm::WeightFormat::OHWIo64i2:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo64i2;
+            break;
+        case arm_gemm::WeightFormat::OHWIo64i2_bf16:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo64i2_bf16;
+            break;
+        case arm_gemm::WeightFormat::OHWIo4i4:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo4i4;
+            break;
+        case arm_gemm::WeightFormat::OHWIo4i4_bf16:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo4i4_bf16;
+            break;
+        case arm_gemm::WeightFormat::OHWIo8i4:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo8i4;
+            break;
+        case arm_gemm::WeightFormat::OHWIo8i4_bf16:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo8i4_bf16;
+            break;
+        case arm_gemm::WeightFormat::OHWIo16i4:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo16i4;
+            break;
+        case arm_gemm::WeightFormat::OHWIo16i4_bf16:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo16i4_bf16;
+            break;
+        case arm_gemm::WeightFormat::OHWIo32i4:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo32i4;
+            break;
+        case arm_gemm::WeightFormat::OHWIo32i4_bf16:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo32i4_bf16;
+            break;
+        case arm_gemm::WeightFormat::OHWIo64i4:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo64i4;
+            break;
+        case arm_gemm::WeightFormat::OHWIo64i4_bf16:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo64i4_bf16;
+            break;
+        case arm_gemm::WeightFormat::OHWIo2i8:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo2i8;
+            break;
+        case arm_gemm::WeightFormat::OHWIo4i8:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo4i8;
+            break;
+        case arm_gemm::WeightFormat::OHWIo8i8:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo8i8;
+            break;
+        case arm_gemm::WeightFormat::OHWIo16i8:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo16i8;
+            break;
+        case arm_gemm::WeightFormat::OHWIo32i8:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo32i8;
+            break;
+        case arm_gemm::WeightFormat::OHWIo64i8:
+            acl_weight_format = arm_compute::WeightFormat::OHWIo64i8;
+            break;
+        default:
+            acl_weight_format = arm_compute::WeightFormat::UNSPECIFIED;
+    }
+    return acl_weight_format;
+}
+} // namespace assembly_utils
+} // namespace arm_compute
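
Since the two switches above are exact mirrors, every format named in both enums should
survive a round trip, while unknown values collapse to UNSPECIFIED. A small self-check one
might write against this property (the harness and the sampled formats are illustrative):

    #include "src/core/utils/AssemblyUtils.h"

    #include <cassert>

    int main()
    {
        using arm_compute::WeightFormat;
        using namespace arm_compute::assembly_utils;

        // Formats present in both tables must map there and back unchanged.
        for (WeightFormat fmt : {WeightFormat::OHWI, WeightFormat::OHWIo4, WeightFormat::OHWIo8i4_bf16})
        {
            assert(map_to_arm_compute_weight_format(map_to_arm_gemm_weight_format(fmt)) == fmt);
        }
    }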
diff --git a/src/core/utils/AssemblyUtils.h b/src/core/utils/AssemblyUtils.h
new file mode 100644
index 0000000000..7d0d37c4ef
--- /dev/null
+++ b/src/core/utils/AssemblyUtils.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef UTILS_CORE_ASSEMBLY_UTILS_H
+#define UTILS_CORE_ASSEMBLY_UTILS_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/core/NEON/kernels/assembly/common.hpp"
+#include "src/cpu/kernels/assembly/arm_gemm.hpp"
+
+namespace arm_compute
+{
+class ActivationLayerInfo;
+
+namespace assembly_utils
+{
+/** Performs a mapping between Compute Library ActivationLayerInfo and the assembly Activation structure.
+ *
+ * @param[in] act Compute Library activation info.
+ *
+ * @return Assembly activation info.
+ */
+arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act);
+
+/** Performs a mapping between Compute Library PadStrideInfo and the assembly PaddingValues structure.
+ *
+ * @param[in] pad_stride_info Compute Library padding and strides info.
+ *
+ * @return Assembly padding values.
+ */
+arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info);
+
+/** Performs a mapping from Compute Library WeightFormat to the assembly WeightFormat enum
+ *
+ * @param[in] weight_format Compute Library WeightFormat enum value
+ *
+ * @return Assembly WeightFormat
+ */
+arm_gemm::WeightFormat map_to_arm_gemm_weight_format(const arm_compute::WeightFormat &weight_format);
+
+/** Performs a mapping from Assembly WeightFormat to the Compute Library WeightFormat enum
+ *
+ * @param[in] weight_format Assembly WeightFormat enum value
+ *
+ * @return Compute Library WeightFormat
+ */
+arm_compute::WeightFormat map_to_arm_compute_weight_format(const arm_gemm::WeightFormat &weight_format);
+} // namespace assembly_utils
+} // namespace arm_compute
+#endif /* UTILS_CORE_ASSEMBLY_UTILS_H */
diff --git a/src/core/utils/DataLayoutUtils.cpp b/src/core/utils/DataLayoutUtils.cpp
new file mode 100644
index 0000000000..234bed71cb
--- /dev/null
+++ b/src/core/utils/DataLayoutUtils.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/utils/DataLayoutUtils.h"
+
+#include <map>
+
+namespace arm_compute
+{
+
+const std::string &string_from_data_layout(DataLayout dl)
+{
+ static std::map<DataLayout, const std::string> dl_map = {
+ {DataLayout::UNKNOWN, "UNKNOWN"},
+ {DataLayout::NCHW, "NCHW"},
+ {DataLayout::NHWC, "NHWC"},
+ };
+
+ return dl_map[dl];
+}
+
+} // namespace arm_compute
diff --git a/src/core/utils/DataTypeUtils.cpp b/src/core/utils/DataTypeUtils.cpp
new file mode 100644
index 0000000000..1394339987
--- /dev/null
+++ b/src/core/utils/DataTypeUtils.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/utils/DataTypeUtils.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/Utility.h"
+
+#include <map>
+
+namespace arm_compute
+{
+const std::string &string_from_data_type(DataType dt)
+{
+ static std::map<DataType, const std::string> dt_map = {
+ {DataType::UNKNOWN, "UNKNOWN"},
+ {DataType::S8, "S8"},
+ {DataType::U8, "U8"},
+ {DataType::S16, "S16"},
+ {DataType::U16, "U16"},
+ {DataType::S32, "S32"},
+ {DataType::U32, "U32"},
+ {DataType::S64, "S64"},
+ {DataType::U64, "U64"},
+ {DataType::F16, "F16"},
+ {DataType::F32, "F32"},
+ {DataType::F64, "F64"},
+ {DataType::SIZET, "SIZET"},
+ {DataType::QSYMM8, "QSYMM8"},
+ {DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL"},
+ {DataType::QASYMM8, "QASYMM8"},
+ {DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED"},
+ {DataType::QSYMM16, "QSYMM16"},
+ {DataType::QASYMM16, "QASYMM16"},
+ };
+
+ return dt_map[dt];
+}
+
+DataType data_type_from_name(const std::string &name)
+{
+ static const std::map<std::string, DataType> data_types = {
+ {"f16", DataType::F16},
+ {"f32", DataType::F32},
+ {"qasymm8", DataType::QASYMM8},
+ {"qasymm8_signed", DataType::QASYMM8_SIGNED},
+ };
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
+ try
+ {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+ return data_types.at(utility::tolower(name));
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
+ }
+ catch (const std::out_of_range &)
+ {
+ ARM_COMPUTE_ERROR_VAR("Invalid data type name: %s", name.c_str());
+ }
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+}
+
+} // namespace arm_compute
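
Note that data_type_from_name lower-cases its argument via utility::tolower before the
lookup, so the match is case-insensitive, and unknown names trigger ARM_COMPUTE_ERROR_VAR.
A short usage sketch (illustrative):

    #include "arm_compute/core/utils/DataTypeUtils.h"

    #include <iostream>

    int main()
    {
        using namespace arm_compute;
        std::cout << string_from_data_type(data_type_from_name("F16")) << '\n';     // F16
        std::cout << string_from_data_type(data_type_from_name("qasymm8")) << '\n'; // QASYMM8
    }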
diff --git a/src/core/utils/FormatUtils.cpp b/src/core/utils/FormatUtils.cpp
new file mode 100644
index 0000000000..46f8455315
--- /dev/null
+++ b/src/core/utils/FormatUtils.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/utils/FormatUtils.h"
+
+#include <map>
+
+namespace arm_compute
+{
+const std::string &string_from_format(Format format)
+{
+ static std::map<Format, const std::string> formats_map = {
+ {Format::UNKNOWN, "UNKNOWN"}, {Format::U8, "U8"},
+ {Format::S16, "S16"}, {Format::U16, "U16"},
+ {Format::S32, "S32"}, {Format::U32, "U32"},
+ {Format::F16, "F16"}, {Format::F32, "F32"},
+ {Format::UV88, "UV88"}, {Format::RGB888, "RGB888"},
+ {Format::RGBA8888, "RGBA8888"}, {Format::YUV444, "YUV444"},
+ {Format::YUYV422, "YUYV422"}, {Format::NV12, "NV12"},
+ {Format::NV21, "NV21"}, {Format::IYUV, "IYUV"},
+ {Format::UYVY422, "UYVY422"}};
+
+ return formats_map[format];
+}
+} // namespace arm_compute
diff --git a/src/core/utils/InterpolationPolicyUtils.cpp b/src/core/utils/InterpolationPolicyUtils.cpp
new file mode 100644
index 0000000000..276e760544
--- /dev/null
+++ b/src/core/utils/InterpolationPolicyUtils.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/utils/InterpolationPolicyUtils.h"
+
+#include <map>
+
+namespace arm_compute
+{
+
+const std::string &string_from_interpolation_policy(InterpolationPolicy policy)
+{
+ static std::map<InterpolationPolicy, const std::string> interpolation_policy_map = {
+ {InterpolationPolicy::AREA, "AREA"},
+ {InterpolationPolicy::BILINEAR, "BILINEAR"},
+ {InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR"},
+ };
+
+ return interpolation_policy_map[policy];
+}
+
+} // namespace arm_compute
diff --git a/src/core/utils/Math.cpp b/src/core/utils/Math.cpp
new file mode 100644
index 0000000000..270e65c161
--- /dev/null
+++ b/src/core/utils/Math.cpp
@@ -0,0 +1,546 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/utils/Math.h"
+
+namespace arm_compute
+{
+
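+// The 513 entries below sample x = i / 128 for i = 0..512, i.e. [0, 4] in steps of
+// 0.0078125 (the trailing comment on each row is x). Each entry holds {erf(x), erf'(x)},
+// with erf'(x) = 2/sqrt(pi) * exp(-x^2); note the first entry's slope 1.1283791671 is
+// exactly 2/sqrt(pi). Keeping the slope next to the value allows a first-order
+// (value + slope * delta) evaluation of erf between grid points.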
+const std::array<ErfLutEntry<float>, 513> erf_f32_lut = {{
+ {0.0000000000f, 1.1283791671f}, // 0.0000000000
+ {0.0088152829f, 1.1283102984f}, // 0.0078125000
+ {0.0176294898f, 1.1281037175f}, // 0.0156250000
+ {0.0264415450f, 1.1277595001f}, // 0.0234375000
+ {0.0352503739f, 1.1272777722f}, // 0.0312500000
+ {0.0440549026f, 1.1266587101f}, // 0.0390625000
+ {0.0528540592f, 1.1259025402f}, // 0.0468750000
+ {0.0616467734f, 1.1250095393f}, // 0.0546875000
+ {0.0704319777f, 1.1239800336f}, // 0.0625000000
+ {0.0792086070f, 1.1228143994f}, // 0.0703125000
+ {0.0879755993f, 1.1215130622f}, // 0.0781250000
+ {0.0967318964f, 1.1200764968f}, // 0.0859375000
+ {0.1054764438f, 1.1185052270f}, // 0.0937500000
+ {0.1142081913f, 1.1167998249f}, // 0.1015625000
+ {0.1229260934f, 1.1149609111f}, // 0.1093750000
+ {0.1316291096f, 1.1129891541f}, // 0.1171875000
+ {0.1403162048f, 1.1108852696f}, // 0.1250000000
+ {0.1489863498f, 1.1086500206f}, // 0.1328125000
+ {0.1576385214f, 1.1062842165f}, // 0.1406250000
+ {0.1662717029f, 1.1037887128f}, // 0.1484375000
+ {0.1748848846f, 1.1011644107f}, // 0.1562500000
+ {0.1834770639f, 1.0984122563f}, // 0.1640625000
+ {0.1920472457f, 1.0955332401f}, // 0.1718750000
+ {0.2005944431f, 1.0925283966f}, // 0.1796875000
+ {0.2091176771f, 1.0893988035f}, // 0.1875000000
+ {0.2176159774f, 1.0861455810f}, // 0.1953125000
+ {0.2260883828f, 1.0827698913f}, // 0.2031250000
+ {0.2345339412f, 1.0792729378f}, // 0.2109375000
+ {0.2429517099f, 1.0756559646f}, // 0.2187500000
+ {0.2513407564f, 1.0719202554f}, // 0.2265625000
+ {0.2597001582f, 1.0680671328f}, // 0.2343750000
+ {0.2680290031f, 1.0640979580f}, // 0.2421875000
+ {0.2763263902f, 1.0600141294f}, // 0.2500000000
+ {0.2845914291f, 1.0558170820f}, // 0.2578125000
+ {0.2928232411f, 1.0515082867f}, // 0.2656250000
+ {0.3010209590f, 1.0470892494f}, // 0.2734375000
+ {0.3091837275f, 1.0425615098f}, // 0.2812500000
+ {0.3173107036f, 1.0379266409f}, // 0.2890625000
+ {0.3254010565f, 1.0331862480f}, // 0.2968750000
+ {0.3334539681f, 1.0283419676f}, // 0.3046875000
+ {0.3414686335f, 1.0233954666f}, // 0.3125000000
+ {0.3494442605f, 1.0183484415f}, // 0.3203125000
+ {0.3573800706f, 1.0132026170f}, // 0.3281250000
+ {0.3652752987f, 1.0079597454f}, // 0.3359375000
+ {0.3731291935f, 1.0026216055f}, // 0.3437500000
+ {0.3809410179f, 0.9971900017f}, // 0.3515625000
+ {0.3887100487f, 0.9916667625f}, // 0.3593750000
+ {0.3964355772f, 0.9860537403f}, // 0.3671875000
+ {0.4041169094f, 0.9803528095f}, // 0.3750000000
+ {0.4117533659f, 0.9745658663f}, // 0.3828125000
+ {0.4193442821f, 0.9686948267f}, // 0.3906250000
+ {0.4268890086f, 0.9627416265f}, // 0.3984375000
+ {0.4343869111f, 0.9567082195f}, // 0.4062500000
+ {0.4418373708f, 0.9505965764f}, // 0.4140625000
+ {0.4492397841f, 0.9444086845f}, // 0.4218750000
+ {0.4565935631f, 0.9381465458f}, // 0.4296875000
+ {0.4638981357f, 0.9318121761f}, // 0.4375000000
+ {0.4711529456f, 0.9254076045f}, // 0.4453125000
+ {0.4783574521f, 0.9189348715f}, // 0.4531250000
+ {0.4855111308f, 0.9123960286f}, // 0.4609375000
+ {0.4926134732f, 0.9057931368f}, // 0.4687500000
+ {0.4996639871f, 0.8991282656f}, // 0.4765625000
+ {0.5066621964f, 0.8924034924f}, // 0.4843750000
+ {0.5136076411f, 0.8856209005f}, // 0.4921875000
+ {0.5204998778f, 0.8787825789f}, // 0.5000000000
+ {0.5273384792f, 0.8718906210f}, // 0.5078125000
+ {0.5341230345f, 0.8649471234f}, // 0.5156250000
+ {0.5408531493f, 0.8579541846f}, // 0.5234375000
+ {0.5475284454f, 0.8509139049f}, // 0.5312500000
+ {0.5541485612f, 0.8438283842f}, // 0.5390625000
+ {0.5607131516f, 0.8366997220f}, // 0.5468750000
+ {0.5672218875f, 0.8295300154f}, // 0.5546875000
+ {0.5736744566f, 0.8223213592f}, // 0.5625000000
+ {0.5800705628f, 0.8150758439f}, // 0.5703125000
+ {0.5864099261f, 0.8077955554f}, // 0.5781250000
+ {0.5926922832f, 0.8004825735f}, // 0.5859375000
+ {0.5989173866f, 0.7931389715f}, // 0.5937500000
+ {0.6050850052f, 0.7857668149f}, // 0.6015625000
+ {0.6111949241f, 0.7783681603f}, // 0.6093750000
+ {0.6172469441f, 0.7709450550f}, // 0.6171875000
+ {0.6232408822f, 0.7634995358f}, // 0.6250000000
+ {0.6291765712f, 0.7560336278f}, // 0.6328125000
+ {0.6350538598f, 0.7485493443f}, // 0.6406250000
+ {0.6408726121f, 0.7410486852f}, // 0.6484375000
+ {0.6466327080f, 0.7335336365f}, // 0.6562500000
+ {0.6523340428f, 0.7260061695f}, // 0.6640625000
+ {0.6579765272f, 0.7184682397f}, // 0.6718750000
+ {0.6635600869f, 0.7109217866f}, // 0.6796875000
+ {0.6690846629f, 0.7033687322f}, // 0.6875000000
+ {0.6745502111f, 0.6958109807f}, // 0.6953125000
+ {0.6799567021f, 0.6882504177f}, // 0.7031250000
+ {0.6853041214f, 0.6806889096f}, // 0.7109375000
+ {0.6905924687f, 0.6731283025f}, // 0.7187500000
+ {0.6958217582f, 0.6655704219f}, // 0.7265625000
+ {0.7009920183f, 0.6580170718f}, // 0.7343750000
+ {0.7061032914f, 0.6504700344f}, // 0.7421875000
+ {0.7111556337f, 0.6429310692f}, // 0.7500000000
+ {0.7161491149f, 0.6354019123f}, // 0.7578125000
+ {0.7210838185f, 0.6278842762f}, // 0.7656250000
+ {0.7259598411f, 0.6203798491f}, // 0.7734375000
+ {0.7307772924f, 0.6128902940f}, // 0.7812500000
+ {0.7355362950f, 0.6054172488f}, // 0.7890625000
+ {0.7402369841f, 0.5979623254f}, // 0.7968750000
+ {0.7448795076f, 0.5905271095f}, // 0.8046875000
+ {0.7494640256f, 0.5831131598f}, // 0.8125000000
+ {0.7539907101f, 0.5757220079f}, // 0.8203125000
+ {0.7584597452f, 0.5683551577f}, // 0.8281250000
+ {0.7628713266f, 0.5610140853f}, // 0.8359375000
+ {0.7672256612f, 0.5537002383f}, // 0.8437500000
+ {0.7715229674f, 0.5464150355f}, // 0.8515625000
+ {0.7757634744f, 0.5391598669f}, // 0.8593750000
+ {0.7799474221f, 0.5319360931f}, // 0.8671875000
+ {0.7840750611f, 0.5247450453f}, // 0.8750000000
+ {0.7881466520f, 0.5175880246f}, // 0.8828125000
+ {0.7921624659f, 0.5104663022f}, // 0.8906250000
+ {0.7961227832f, 0.5033811191f}, // 0.8984375000
+ {0.8000278942f, 0.4963336858f}, // 0.9062500000
+ {0.8038780984f, 0.4893251822f}, // 0.9140625000
+ {0.8076737045f, 0.4823567575f}, // 0.9218750000
+ {0.8114150300f, 0.4754295299f}, // 0.9296875000
+ {0.8151024010f, 0.4685445869f}, // 0.9375000000
+ {0.8187361521f, 0.4617029846f}, // 0.9453125000
+ {0.8223166257f, 0.4549057483f}, // 0.9531250000
+ {0.8258441725f, 0.4481538720f}, // 0.9609375000
+ {0.8293191506f, 0.4414483184f}, // 0.9687500000
+ {0.8327419255f, 0.4347900193f}, // 0.9765625000
+ {0.8361128701f, 0.4281798750f}, // 0.9843750000
+ {0.8394323638f, 0.4216187550f}, // 0.9921875000
+ {0.8427007929f, 0.4151074974f}, // 1.0000000000
+ {0.8459185504f, 0.4086469096f}, // 1.0078125000
+ {0.8490860349f, 0.4022377678f}, // 1.0156250000
+ {0.8522036514f, 0.3958808176f}, // 1.0234375000
+ {0.8552718104f, 0.3895767737f}, // 1.0312500000
+ {0.8582909280f, 0.3833263203f}, // 1.0390625000
+ {0.8612614255f, 0.3771301114f}, // 1.0468750000
+ {0.8641837289f, 0.3709887705f}, // 1.0546875000
+ {0.8670582694f, 0.3649028912f}, // 1.0625000000
+ {0.8698854825f, 0.3588730371f}, // 1.0703125000
+ {0.8726658079f, 0.3528997425f}, // 1.0781250000
+ {0.8753996896f, 0.3469835119f}, // 1.0859375000
+ {0.8780875752f, 0.3411248209f}, // 1.0937500000
+ {0.8807299159f, 0.3353241162f}, // 1.1015625000
+ {0.8833271666f, 0.3295818158f}, // 1.1093750000
+ {0.8858797849f, 0.3238983093f}, // 1.1171875000
+ {0.8883882317f, 0.3182739585f}, // 1.1250000000
+ {0.8908529704f, 0.3127090972f}, // 1.1328125000
+ {0.8932744671f, 0.3072040319f}, // 1.1406250000
+ {0.8956531899f, 0.3017590421f}, // 1.1484375000
+ {0.8979896092f, 0.2963743805f}, // 1.1562500000
+ {0.9002841973f, 0.2910502733f}, // 1.1640625000
+ {0.9025374279f, 0.2857869208f}, // 1.1718750000
+ {0.9047497766f, 0.2805844976f}, // 1.1796875000
+ {0.9069217198f, 0.2754431531f}, // 1.1875000000
+ {0.9090537352f, 0.2703630118f}, // 1.1953125000
+ {0.9111463015f, 0.2653441734f}, // 1.2031250000
+ {0.9131998978f, 0.2603867140f}, // 1.2109375000
+ {0.9152150039f, 0.2554906858f}, // 1.2187500000
+ {0.9171920998f, 0.2506561176f}, // 1.2265625000
+ {0.9191316658f, 0.2458830155f}, // 1.2343750000
+ {0.9210341819f, 0.2411713632f}, // 1.2421875000
+ {0.9229001283f, 0.2365211224f}, // 1.2500000000
+ {0.9247299843f, 0.2319322334f}, // 1.2578125000
+ {0.9265242290f, 0.2274046151f}, // 1.2656250000
+ {0.9282833407f, 0.2229381659f}, // 1.2734375000
+ {0.9300077968f, 0.2185327643f}, // 1.2812500000
+ {0.9316980737f, 0.2141882685f}, // 1.2890625000
+ {0.9333546467f, 0.2099045180f}, // 1.2968750000
+ {0.9349779895f, 0.2056813330f}, // 1.3046875000
+ {0.9365685747f, 0.2015185157f}, // 1.3125000000
+ {0.9381268730f, 0.1974158503f}, // 1.3203125000
+ {0.9396533534f, 0.1933731034f}, // 1.3281250000
+ {0.9411484831f, 0.1893900249f}, // 1.3359375000
+ {0.9426127272f, 0.1854663482f}, // 1.3437500000
+ {0.9440465488f, 0.1816017904f}, // 1.3515625000
+ {0.9454504084f, 0.1777960534f}, // 1.3593750000
+ {0.9468247645f, 0.1740488238f}, // 1.3671875000
+ {0.9481700728f, 0.1703597737f}, // 1.3750000000
+ {0.9494867865f, 0.1667285609f}, // 1.3828125000
+ {0.9507753562f, 0.1631548298f}, // 1.3906250000
+ {0.9520362295f, 0.1596382112f}, // 1.3984375000
+ {0.9532698510f, 0.1561783236f}, // 1.4062500000
+ {0.9544766625f, 0.1527747727f}, // 1.4140625000
+ {0.9556571025f, 0.1494271527f}, // 1.4218750000
+ {0.9568116063f, 0.1461350463f}, // 1.4296875000
+ {0.9579406061f, 0.1428980254f}, // 1.4375000000
+ {0.9590445303f, 0.1397156511f}, // 1.4453125000
+ {0.9601238042f, 0.1365874749f}, // 1.4531250000
+ {0.9611788495f, 0.1335130382f}, // 1.4609375000
+ {0.9622100842f, 0.1304918737f}, // 1.4687500000
+ {0.9632179226f, 0.1275235050f}, // 1.4765625000
+ {0.9642027752f, 0.1246074475f}, // 1.4843750000
+ {0.9651650489f, 0.1217432089f}, // 1.4921875000
+ {0.9661051465f, 0.1189302892f}, // 1.5000000000
+ {0.9670234670f, 0.1161681815f}, // 1.5078125000
+ {0.9679204053f, 0.1134563721f}, // 1.5156250000
+ {0.9687963524f, 0.1107943411f}, // 1.5234375000
+ {0.9696516951f, 0.1081815630f}, // 1.5312500000
+ {0.9704868162f, 0.1056175064f}, // 1.5390625000
+ {0.9713020942f, 0.1031016352f}, // 1.5468750000
+ {0.9720979033f, 0.1006334084f}, // 1.5546875000
+ {0.9728746138f, 0.0982122808f}, // 1.5625000000
+ {0.9736325914f, 0.0958377032f}, // 1.5703125000
+ {0.9743721977f, 0.0935091227f}, // 1.5781250000
+ {0.9750937898f, 0.0912259834f}, // 1.5859375000
+ {0.9757977206f, 0.0889877264f}, // 1.5937500000
+ {0.9764843385f, 0.0867937900f}, // 1.6015625000
+ {0.9771539875f, 0.0846436106f}, // 1.6093750000
+ {0.9778070074f, 0.0825366227f}, // 1.6171875000
+ {0.9784437332f, 0.0804722590f}, // 1.6250000000
+ {0.9790644959f, 0.0784499511f}, // 1.6328125000
+ {0.9796696218f, 0.0764691297f}, // 1.6406250000
+ {0.9802594326f, 0.0745292246f}, // 1.6484375000
+ {0.9808342460f, 0.0726296655f}, // 1.6562500000
+ {0.9813943747f, 0.0707698819f}, // 1.6640625000
+ {0.9819401275f, 0.0689493034f}, // 1.6718750000
+ {0.9824718082f, 0.0671673602f}, // 1.6796875000
+ {0.9829897166f, 0.0654234833f}, // 1.6875000000
+ {0.9834941478f, 0.0637171046f}, // 1.6953125000
+ {0.9839853925f, 0.0620476570f}, // 1.7031250000
+ {0.9844637371f, 0.0604145752f}, // 1.7109375000
+ {0.9849294635f, 0.0588172956f}, // 1.7187500000
+ {0.9853828492f, 0.0572552562f}, // 1.7265625000
+ {0.9858241672f, 0.0557278976f}, // 1.7343750000
+ {0.9862536864f, 0.0542346624f}, // 1.7421875000
+ {0.9866716712f, 0.0527749959f}, // 1.7500000000
+ {0.9870783817f, 0.0513483463f}, // 1.7578125000
+ {0.9874740737f, 0.0499541645f}, // 1.7656250000
+ {0.9878589987f, 0.0485919049f}, // 1.7734375000
+ {0.9882334039f, 0.0472610247f}, // 1.7812500000
+ {0.9885975325f, 0.0459609852f}, // 1.7890625000
+ {0.9889516232f, 0.0446912508f}, // 1.7968750000
+ {0.9892959108f, 0.0434512901f}, // 1.8046875000
+ {0.9896306258f, 0.0422405756f}, // 1.8125000000
+ {0.9899559946f, 0.0410585838f}, // 1.8203125000
+ {0.9902722396f, 0.0399047954f}, // 1.8281250000
+ {0.9905795791f, 0.0387786956f}, // 1.8359375000
+ {0.9908782275f, 0.0376797741f}, // 1.8437500000
+ {0.9911683951f, 0.0366075252f}, // 1.8515625000
+ {0.9914502882f, 0.0355614479f}, // 1.8593750000
+ {0.9917241096f, 0.0345410460f}, // 1.8671875000
+ {0.9919900577f, 0.0335458284f}, // 1.8750000000
+ {0.9922483274f, 0.0325753089f}, // 1.8828125000
+ {0.9924991099f, 0.0316290065f}, // 1.8906250000
+ {0.9927425925f, 0.0307064452f}, // 1.8984375000
+ {0.9929789587f, 0.0298071547f}, // 1.9062500000
+ {0.9932083887f, 0.0289306696f}, // 1.9140625000
+ {0.9934310586f, 0.0280765301f}, // 1.9218750000
+ {0.9936471415f, 0.0272442821f}, // 1.9296875000
+ {0.9938568064f, 0.0264334768f}, // 1.9375000000
+ {0.9940602192f, 0.0256436709f}, // 1.9453125000
+ {0.9942575423f, 0.0248744271f}, // 1.9531250000
+ {0.9944489346f, 0.0241253134f}, // 1.9609375000
+ {0.9946345516f, 0.0233959038f}, // 1.9687500000
+ {0.9948145458f, 0.0226857778f}, // 1.9765625000
+ {0.9949890661f, 0.0219945210f}, // 1.9843750000
+ {0.9951582582f, 0.0213217245f}, // 1.9921875000
+ {0.9953222650f, 0.0206669854f}, // 2.0000000000
+ {0.9954812259f, 0.0200299065f}, // 2.0078125000
+ {0.9956352773f, 0.0194100966f}, // 2.0156250000
+ {0.9957845526f, 0.0188071704f}, // 2.0234375000
+ {0.9959291823f, 0.0182207482f}, // 2.0312500000
+ {0.9960692938f, 0.0176504563f}, // 2.0390625000
+ {0.9962050117f, 0.0170959271f}, // 2.0468750000
+ {0.9963364578f, 0.0165567984f}, // 2.0546875000
+ {0.9964637509f, 0.0160327141f}, // 2.0625000000
+ {0.9965870072f, 0.0155233240f}, // 2.0703125000
+ {0.9967063402f, 0.0150282836f}, // 2.0781250000
+ {0.9968218606f, 0.0145472542f}, // 2.0859375000
+ {0.9969336766f, 0.0140799029f}, // 2.0937500000
+ {0.9970418939f, 0.0136259025f}, // 2.1015625000
+ {0.9971466153f, 0.0131849315f}, // 2.1093750000
+ {0.9972479415f, 0.0127566743f}, // 2.1171875000
+ {0.9973459706f, 0.0123408206f}, // 2.1250000000
+ {0.9974407984f, 0.0119370661f}, // 2.1328125000
+ {0.9975325180f, 0.0115451118f}, // 2.1406250000
+ {0.9976212207f, 0.0111646644f}, // 2.1484375000
+ {0.9977069951f, 0.0107954360f}, // 2.1562500000
+ {0.9977899279f, 0.0104371443f}, // 2.1640625000
+ {0.9978701033f, 0.0100895123f}, // 2.1718750000
+ {0.9979476035f, 0.0097522684f}, // 2.1796875000
+ {0.9980225088f, 0.0094251464f}, // 2.1875000000
+ {0.9980948971f, 0.0091078852f}, // 2.1953125000
+ {0.9981648445f, 0.0088002291f}, // 2.2031250000
+ {0.9982324251f, 0.0085019274f}, // 2.2109375000
+ {0.9982977109f, 0.0082127346f}, // 2.2187500000
+ {0.9983607721f, 0.0079324104f}, // 2.2265625000
+ {0.9984216773f, 0.0076607192f}, // 2.2343750000
+ {0.9984804928f, 0.0073974307f}, // 2.2421875000
+ {0.9985372834f, 0.0071423190f}, // 2.2500000000
+ {0.9985921122f, 0.0068951636f}, // 2.2578125000
+ {0.9986450405f, 0.0066557482f}, // 2.2656250000
+ {0.9986961279f, 0.0064238617f}, // 2.2734375000
+ {0.9987454324f, 0.0061992973f}, // 2.2812500000
+ {0.9987930105f, 0.0059818530f}, // 2.2890625000
+ {0.9988389169f, 0.0057713311f}, // 2.2968750000
+ {0.9988832050f, 0.0055675385f}, // 2.3046875000
+ {0.9989259267f, 0.0053702865f}, // 2.3125000000
+ {0.9989671323f, 0.0051793907f}, // 2.3203125000
+ {0.9990068708f, 0.0049946708f}, // 2.3281250000
+ {0.9990451897f, 0.0048159509f}, // 2.3359375000
+ {0.9990821352f, 0.0046430592f}, // 2.3437500000
+ {0.9991177522f, 0.0044758278f}, // 2.3515625000
+ {0.9991520843f, 0.0043140931f}, // 2.3593750000
+ {0.9991851738f, 0.0041576951f}, // 2.3671875000
+ {0.9992170618f, 0.0040064779f}, // 2.3750000000
+ {0.9992477881f, 0.0038602892f}, // 2.3828125000
+ {0.9992773915f, 0.0037189807f}, // 2.3906250000
+ {0.9993059095f, 0.0035824076f}, // 2.3984375000
+ {0.9993333786f, 0.0034504286f}, // 2.4062500000
+ {0.9993598341f, 0.0033229062f}, // 2.4140625000
+ {0.9993853103f, 0.0031997062f}, // 2.4218750000
+ {0.9994098404f, 0.0030806979f}, // 2.4296875000
+ {0.9994334567f, 0.0029657539f}, // 2.4375000000
+ {0.9994561906f, 0.0028547501f}, // 2.4453125000
+ {0.9994780722f, 0.0027475655f}, // 2.4531250000
+ {0.9994991309f, 0.0026440825f}, // 2.4609375000
+ {0.9995193953f, 0.0025441865f}, // 2.4687500000
+ {0.9995388929f, 0.0024477658f}, // 2.4765625000
+ {0.9995576504f, 0.0023547119f}, // 2.4843750000
+ {0.9995756937f, 0.0022649190f}, // 2.4921875000
+ {0.9995930480f, 0.0021782842f}, // 2.5000000000
+ {0.9996097374f, 0.0020947076f}, // 2.5078125000
+ {0.9996257855f, 0.0020140918f}, // 2.5156250000
+ {0.9996412150f, 0.0019363421f}, // 2.5234375000
+ {0.9996560481f, 0.0018613666f}, // 2.5312500000
+ {0.9996703059f, 0.0017890757f}, // 2.5390625000
+ {0.9996840091f, 0.0017193826f}, // 2.5468750000
+ {0.9996971778f, 0.0016522026f}, // 2.5546875000
+ {0.9997098311f, 0.0015874537f}, // 2.5625000000
+ {0.9997219879f, 0.0015250561f}, // 2.5703125000
+ {0.9997336661f, 0.0014649323f}, // 2.5781250000
+ {0.9997448832f, 0.0014070070f}, // 2.5859375000
+ {0.9997556561f, 0.0013512073f}, // 2.5937500000
+ {0.9997660011f, 0.0012974620f}, // 2.6015625000
+ {0.9997759341f, 0.0012457025f}, // 2.6093750000
+ {0.9997854702f, 0.0011958618f}, // 2.6171875000
+ {0.9997946243f, 0.0011478751f}, // 2.6250000000
+ {0.9998034104f, 0.0011016795f}, // 2.6328125000
+ {0.9998118425f, 0.0010572140f}, // 2.6406250000
+ {0.9998199338f, 0.0010144193f}, // 2.6484375000
+ {0.9998276970f, 0.0009732381f}, // 2.6562500000
+ {0.9998351447f, 0.0009336147f}, // 2.6640625000
+ {0.9998422887f, 0.0008954951f}, // 2.6718750000
+ {0.9998491406f, 0.0008588272f}, // 2.6796875000
+ {0.9998557115f, 0.0008235601f}, // 2.6875000000
+ {0.9998620122f, 0.0007896449f}, // 2.6953125000
+ {0.9998680531f, 0.0007570339f}, // 2.7031250000
+ {0.9998738441f, 0.0007256811f}, // 2.7109375000
+ {0.9998793950f, 0.0006955419f}, // 2.7187500000
+ {0.9998847150f, 0.0006665730f}, // 2.7265625000
+ {0.9998898132f, 0.0006387328f}, // 2.7343750000
+ {0.9998946981f, 0.0006119806f}, // 2.7421875000
+ {0.9998993781f, 0.0005862772f}, // 2.7500000000
+ {0.9999038613f, 0.0005615849f}, // 2.7578125000
+ {0.9999081554f, 0.0005378669f}, // 2.7656250000
+ {0.9999122679f, 0.0005150877f}, // 2.7734375000
+ {0.9999162060f, 0.0004932131f}, // 2.7812500000
+ {0.9999199766f, 0.0004722097f}, // 2.7890625000
+ {0.9999235864f, 0.0004520456f}, // 2.7968750000
+ {0.9999270419f, 0.0004326897f}, // 2.8046875000
+ {0.9999303492f, 0.0004141120f}, // 2.8125000000
+ {0.9999335144f, 0.0003962836f}, // 2.8203125000
+ {0.9999365431f, 0.0003791765f}, // 2.8281250000
+ {0.9999394408f, 0.0003627636f}, // 2.8359375000
+ {0.9999422130f, 0.0003470187f}, // 2.8437500000
+ {0.9999448647f, 0.0003319167f}, // 2.8515625000
+ {0.9999474008f, 0.0003174332f}, // 2.8593750000
+ {0.9999498261f, 0.0003035446f}, // 2.8671875000
+ {0.9999521452f, 0.0002902283f}, // 2.8750000000
+ {0.9999543624f, 0.0002774622f}, // 2.8828125000
+ {0.9999564819f, 0.0002652254f}, // 2.8906250000
+ {0.9999585078f, 0.0002534972f}, // 2.8984375000
+ {0.9999604441f, 0.0002422581f}, // 2.9062500000
+ {0.9999622943f, 0.0002314890f}, // 2.9140625000
+ {0.9999640622f, 0.0002211717f}, // 2.9218750000
+ {0.9999657513f, 0.0002112884f}, // 2.9296875000
+ {0.9999673647f, 0.0002018221f}, // 2.9375000000
+ {0.9999689058f, 0.0001927564f}, // 2.9453125000
+ {0.9999703775f, 0.0001840754f}, // 2.9531250000
+ {0.9999717829f, 0.0001757640f}, // 2.9609375000
+ {0.9999731248f, 0.0001678073f}, // 2.9687500000
+ {0.9999744058f, 0.0001601913f}, // 2.9765625000
+ {0.9999756286f, 0.0001529022f}, // 2.9843750000
+ {0.9999767957f, 0.0001459270f}, // 2.9921875000
+ {0.9999779095f, 0.0001392531f}, // 3.0000000000
+ {0.9999789723f, 0.0001328681f}, // 3.0078125000
+ {0.9999799863f, 0.0001267604f}, // 3.0156250000
+ {0.9999809536f, 0.0001209187f}, // 3.0234375000
+ {0.9999818763f, 0.0001153322f}, // 3.0312500000
+ {0.9999827563f, 0.0001099903f}, // 3.0390625000
+ {0.9999835955f, 0.0001048830f}, // 3.0468750000
+ {0.9999843957f, 0.0001000007f}, // 3.0546875000
+ {0.9999851586f, 0.0000953340f}, // 3.0625000000
+ {0.9999858858f, 0.0000908740f}, // 3.0703125000
+ {0.9999865790f, 0.0000866121f}, // 3.0781250000
+ {0.9999872396f, 0.0000825400f}, // 3.0859375000
+ {0.9999878692f, 0.0000786497f}, // 3.0937500000
+ {0.9999884690f, 0.0000749336f}, // 3.1015625000
+ {0.9999890404f, 0.0000713844f}, // 3.1093750000
+ {0.9999895848f, 0.0000679950f}, // 3.1171875000
+ {0.9999901033f, 0.0000647587f}, // 3.1250000000
+ {0.9999905970f, 0.0000616688f}, // 3.1328125000
+ {0.9999910672f, 0.0000587192f}, // 3.1406250000
+ {0.9999915149f, 0.0000559039f}, // 3.1484375000
+ {0.9999919410f, 0.0000532170f}, // 3.1562500000
+ {0.9999923467f, 0.0000506531f}, // 3.1640625000
+ {0.9999927328f, 0.0000482069f}, // 3.1718750000
+ {0.9999931002f, 0.0000458732f}, // 3.1796875000
+ {0.9999934498f, 0.0000436471f}, // 3.1875000000
+ {0.9999937825f, 0.0000415240f}, // 3.1953125000
+ {0.9999940989f, 0.0000394993f}, // 3.2031250000
+ {0.9999943999f, 0.0000375688f}, // 3.2109375000
+ {0.9999946862f, 0.0000357282f}, // 3.2187500000
+ {0.9999949584f, 0.0000339737f}, // 3.2265625000
+ {0.9999952172f, 0.0000323014f}, // 3.2343750000
+ {0.9999954633f, 0.0000307077f}, // 3.2421875000
+ {0.9999956972f, 0.0000291890f}, // 3.2500000000
+ {0.9999959196f, 0.0000277421f}, // 3.2578125000
+ {0.9999961309f, 0.0000263636f}, // 3.2656250000
+ {0.9999963317f, 0.0000250506f}, // 3.2734375000
+ {0.9999965224f, 0.0000238001f}, // 3.2812500000
+ {0.9999967037f, 0.0000226093f}, // 3.2890625000
+ {0.9999968759f, 0.0000214754f}, // 3.2968750000
+ {0.9999970394f, 0.0000203959f}, // 3.3046875000
+ {0.9999971947f, 0.0000193683f}, // 3.3125000000
+ {0.9999973421f, 0.0000183902f}, // 3.3203125000
+ {0.9999974822f, 0.0000174594f}, // 3.3281250000
+ {0.9999976151f, 0.0000165736f}, // 3.3359375000
+ {0.9999977412f, 0.0000157309f}, // 3.3437500000
+ {0.9999978610f, 0.0000149292f}, // 3.3515625000
+ {0.9999979746f, 0.0000141667f}, // 3.3593750000
+ {0.9999980824f, 0.0000134414f}, // 3.3671875000
+ {0.9999981847f, 0.0000127517f}, // 3.3750000000
+ {0.9999982818f, 0.0000120960f}, // 3.3828125000
+ {0.9999983738f, 0.0000114725f}, // 3.3906250000
+ {0.9999984611f, 0.0000108799f}, // 3.3984375000
+ {0.9999985439f, 0.0000103166f}, // 3.4062500000
+ {0.9999986224f, 0.0000097813f}, // 3.4140625000
+ {0.9999986968f, 0.0000092726f}, // 3.4218750000
+ {0.9999987673f, 0.0000087893f}, // 3.4296875000
+ {0.9999988342f, 0.0000083302f}, // 3.4375000000
+ {0.9999988975f, 0.0000078941f}, // 3.4453125000
+ {0.9999989576f, 0.0000074799f}, // 3.4531250000
+ {0.9999990145f, 0.0000070866f}, // 3.4609375000
+ {0.9999990684f, 0.0000067131f}, // 3.4687500000
+ {0.9999991194f, 0.0000063586f}, // 3.4765625000
+ {0.9999991678f, 0.0000060220f}, // 3.4843750000
+ {0.9999992135f, 0.0000057026f}, // 3.4921875000
+ {0.9999992569f, 0.0000053994f}, // 3.5000000000
+ {0.9999992980f, 0.0000051118f}, // 3.5078125000
+ {0.9999993368f, 0.0000048388f}, // 3.5156250000
+ {0.9999993736f, 0.0000045799f}, // 3.5234375000
+ {0.9999994084f, 0.0000043343f}, // 3.5312500000
+ {0.9999994414f, 0.0000041014f}, // 3.5390625000
+ {0.9999994725f, 0.0000038805f}, // 3.5468750000
+ {0.9999995020f, 0.0000036711f}, // 3.5546875000
+ {0.9999995299f, 0.0000034725f}, // 3.5625000000
+ {0.9999995563f, 0.0000032843f}, // 3.5703125000
+ {0.9999995813f, 0.0000031059f}, // 3.5781250000
+ {0.9999996049f, 0.0000029369f}, // 3.5859375000
+ {0.9999996272f, 0.0000027767f}, // 3.5937500000
+ {0.9999996483f, 0.0000026249f}, // 3.6015625000
+ {0.9999996682f, 0.0000024811f}, // 3.6093750000
+ {0.9999996870f, 0.0000023449f}, // 3.6171875000
+ {0.9999997049f, 0.0000022159f}, // 3.6250000000
+ {0.9999997217f, 0.0000020938f}, // 3.6328125000
+ {0.9999997376f, 0.0000019781f}, // 3.6406250000
+ {0.9999997526f, 0.0000018686f}, // 3.6484375000
+ {0.9999997668f, 0.0000017650f}, // 3.6562500000
+ {0.9999997802f, 0.0000016669f}, // 3.6640625000
+ {0.9999997929f, 0.0000015740f}, // 3.6718750000
+ {0.9999998048f, 0.0000014862f}, // 3.6796875000
+ {0.9999998161f, 0.0000014030f}, // 3.6875000000
+ {0.9999998267f, 0.0000013244f}, // 3.6953125000
+ {0.9999998368f, 0.0000012500f}, // 3.7031250000
+ {0.9999998463f, 0.0000011797f}, // 3.7109375000
+ {0.9999998552f, 0.0000011131f}, // 3.7187500000
+ {0.9999998637f, 0.0000010502f}, // 3.7265625000
+ {0.9999998717f, 0.0000009908f}, // 3.7343750000
+ {0.9999998792f, 0.0000009346f}, // 3.7421875000
+ {0.9999998863f, 0.0000008814f}, // 3.7500000000
+ {0.9999998930f, 0.0000008312f}, // 3.7578125000
+ {0.9999998993f, 0.0000007838f}, // 3.7656250000
+ {0.9999999052f, 0.0000007389f}, // 3.7734375000
+ {0.9999999108f, 0.0000006966f}, // 3.7812500000
+ {0.9999999161f, 0.0000006566f}, // 3.7890625000
+ {0.9999999211f, 0.0000006188f}, // 3.7968750000
+ {0.9999999258f, 0.0000005831f}, // 3.8046875000
+ {0.9999999302f, 0.0000005494f}, // 3.8125000000
+ {0.9999999344f, 0.0000005176f}, // 3.8203125000
+ {0.9999999383f, 0.0000004876f}, // 3.8281250000
+ {0.9999999420f, 0.0000004593f}, // 3.8359375000
+ {0.9999999455f, 0.0000004325f}, // 3.8437500000
+ {0.9999999488f, 0.0000004073f}, // 3.8515625000
+ {0.9999999518f, 0.0000003835f}, // 3.8593750000
+ {0.9999999547f, 0.0000003610f}, // 3.8671875000
+ {0.9999999575f, 0.0000003398f}, // 3.8750000000
+ {0.9999999601f, 0.0000003198f}, // 3.8828125000
+ {0.9999999625f, 0.0000003010f}, // 3.8906250000
+ {0.9999999648f, 0.0000002832f}, // 3.8984375000
+ {0.9999999669f, 0.0000002665f}, // 3.9062500000
+ {0.9999999689f, 0.0000002507f}, // 3.9140625000
+ {0.9999999708f, 0.0000002358f}, // 3.9218750000
+ {0.9999999726f, 0.0000002218f}, // 3.9296875000
+ {0.9999999743f, 0.0000002085f}, // 3.9375000000
+ {0.9999999759f, 0.0000001961f}, // 3.9453125000
+ {0.9999999774f, 0.0000001844f}, // 3.9531250000
+ {0.9999999788f, 0.0000001733f}, // 3.9609375000
+ {0.9999999801f, 0.0000001629f}, // 3.9687500000
+ {0.9999999813f, 0.0000001531f}, // 3.9765625000
+ {0.9999999825f, 0.0000001439f}, // 3.9843750000
+ {0.9999999836f, 0.0000001352f}, // 3.9921875000
+ {0.9999999846f, 0.0000001270f}, // 4.0000000000
+}};
+
+} // namespace arm_compute
diff --git a/src/core/utils/Math.h b/src/core/utils/Math.h
new file mode 100644
index 0000000000..f006948f69
--- /dev/null
+++ b/src/core/utils/Math.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CORE_UTILS_MATH_H
+#define ACL_SRC_CORE_UTILS_MATH_H
+
+#include <array>
+
+namespace arm_compute
+{
+
+/** Lookup table for erf(x) calculation. */
+template <typename T>
+struct ErfLutEntry
+{
+ /** erf(x) */
+ T value;
+
+ /** The derivative of erf at x: 2 / sqrt(pi) * e^(-x^2) */
+ T scale;
+};
+
+/** The lookup table for FP32 erf(x) calculation. */
+extern const std::array<ErfLutEntry<float>, 513> erf_f32_lut;
+
+} // namespace arm_compute
+
+#endif // ACL_SRC_CORE_UTILS_MATH_H
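For context on how the table is meant to be consumed: `scale` holds the derivative of erf, so a lookup can be refined with a first-order Taylor step. The sketch below is illustrative only and not part of the patch; the index math assumes, from the generated data above, 513 entries covering x in [0, 4] at a step of 1/128, with odd symmetry used for negative inputs.

    #include <cmath>
    #include <cstddef>

    #include "src/core/utils/Math.h"

    float erf_lut_approx(float x)
    {
        const float ax = std::fabs(x);
        if (ax >= 4.0f)
        {
            return std::copysign(1.0f, x); // table ends at x = 4, where erf(x) is ~1
        }
        const float  pos = ax * 128.0f; // table step is 1/128
        const size_t i   = static_cast<size_t>(pos);
        const float  dx  = (pos - static_cast<float>(i)) / 128.0f;
        const arm_compute::ErfLutEntry<float> &e = arm_compute::erf_f32_lut[i];
        // First-order correction: erf(x0 + dx) ~= erf(x0) + erf'(x0) * dx
        return std::copysign(e.value + e.scale * dx, x); // erf(-x) = -erf(x)
    }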
diff --git a/src/core/utils/ScaleUtils.cpp b/src/core/utils/ScaleUtils.cpp
index d46ca0ea8e..a92da39b67 100644
--- a/src/core/utils/ScaleUtils.cpp
+++ b/src/core/utils/ScaleUtils.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+
#include "src/core/utils/ScaleUtils.h"
-#include "arm_compute/core/Helpers.h"
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/common/cpuinfo/CpuIsaInfo.h"
float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners)
{
@@ -34,4 +39,30 @@ float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t
ARM_COMPUTE_ERROR_ON(out == 0);
return static_cast<float>(in) / static_cast<float>(out);
-} \ No newline at end of file
+}
+
+bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout,
+ DataType data_type,
+ InterpolationPolicy policy,
+ BorderMode border_mode)
+{
+ // Do not calculate precomputed weights and indices if kernel code doesn't use them
+ if (data_layout == DataLayout::NHWC)
+ {
+ switch (data_type)
+ {
+ case DataType::F32:
+ case DataType::F16:
+ return (CPUInfo::get().get_isa().sve && policy == InterpolationPolicy::NEAREST_NEIGHBOR);
+ case DataType::U8:
+ case DataType::S8:
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ return (border_mode != BorderMode::REPLICATE) || (policy == InterpolationPolicy::NEAREST_NEIGHBOR);
+ default:
+ return true;
+ }
+ }
+
+ return true;
+}
diff --git a/src/core/utils/ScaleUtils.h b/src/core/utils/ScaleUtils.h
index 3cc986b1db..d8dddc8c70 100644
--- a/src/core/utils/ScaleUtils.h
+++ b/src/core/utils/ScaleUtils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,9 +26,6 @@
#include "arm_compute/core/Types.h"
-#include <cstdint>
-#include <cstdlib>
-
namespace arm_compute
{
namespace scale_utils
@@ -53,6 +50,21 @@ inline bool is_align_corners_allowed_sampling_policy(SamplingPolicy sampling_pol
{
return sampling_policy != SamplingPolicy::CENTER;
}
+
+/** Returns whether precomputation of indices and/or weights is required
+ *
+ * @param[in] data_layout Data layout
+ * @param[in] data_type Data type
+ * @param[in] policy Interpolation policy
+ * @param[in] border_mode Border mode
+ *
+ * @return True if precomputation is required
+ */
+bool is_precomputation_required(DataLayout data_layout,
+ DataType data_type,
+ InterpolationPolicy policy,
+ BorderMode border_mode);
+
} // namespace scale_utils
} // namespace arm_compute
-#endif /* UTILS_CORE_SCALEUTILS_H */ \ No newline at end of file
+#endif /* UTILS_CORE_SCALEUTILS_H */
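A hypothetical call site, only to illustrate the intent of the new helper (the surrounding operator code is invented): a scale operator can skip building its precomputed index/weight tensors when the selected kernel path recomputes them on the fly.

    using namespace arm_compute;

    if (scale_utils::is_precomputation_required(DataLayout::NHWC, DataType::F32,
                                                InterpolationPolicy::BILINEAR, BorderMode::REPLICATE))
    {
        // ... allocate and fill the precomputed offset/weight tensors ...
    }
    // Per the implementation above, NHWC F32 bilinear returns false,
    // so the allocation branch is skipped in that configuration.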
diff --git a/src/core/utils/StringUtils.cpp b/src/core/utils/StringUtils.cpp
new file mode 100644
index 0000000000..bcab0ce10c
--- /dev/null
+++ b/src/core/utils/StringUtils.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <fstream>
+#include <limits>
+#include <map>
+#include <numeric>
+#include <sstream>
+#include <string>
+
+namespace arm_compute
+{
+std::string lower_string(const std::string &val)
+{
+ std::string res = val;
+ std::transform(res.begin(), res.end(), res.begin(), ::tolower);
+ return res;
+}
+
+std::string upper_string(const std::string &val)
+{
+ std::string res = val;
+ std::transform(res.begin(), res.end(), res.begin(), ::toupper);
+ return res;
+}
+
+std::string float_to_string_with_full_precision(float val)
+{
+ std::stringstream ss;
+ ss.precision(std::numeric_limits<float>::max_digits10);
+ ss << val;
+
+ if (val != static_cast<int>(val))
+ {
+ ss << "f";
+ }
+
+ return ss.str();
+}
+
+std::string join(const std::vector<std::string> strings, const std::string &sep)
+{
+ if (strings.empty())
+ {
+ return "";
+ }
+ return std::accumulate(std::next(strings.begin()), strings.end(), strings.at(0),
+ [&sep](const std::string &a, const std::string &b) { return a + sep + b; });
+}
+} // namespace arm_compute
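Quick usage notes for the relocated helpers; the expected outputs are my reading of the code above rather than documented guarantees.

    #include "arm_compute/core/utils/StringUtils.h"

    #include <iostream>

    int main()
    {
        // join() folds the separator between elements via std::accumulate
        std::cout << arm_compute::join({"NHWC", "F32", "BILINEAR"}, "/") << "\n"; // NHWC/F32/BILINEAR
        // max_digits10 precision; the trailing 'f' is appended only for non-integral values
        std::cout << arm_compute::float_to_string_with_full_precision(0.1f) << "\n"; // 0.100000001f
        std::cout << arm_compute::float_to_string_with_full_precision(2.0f) << "\n"; // 2
        return 0;
    }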
diff --git a/src/core/utils/helpers/bit_ops.h b/src/core/utils/helpers/bit_ops.h
index 954fb56460..fbd0382509 100644
--- a/src/core/utils/helpers/bit_ops.h
+++ b/src/core/utils/helpers/bit_ops.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#ifndef ARM_COMPUTE_UTILS_HELPERS_BIT_OPS_H
#define ARM_COMPUTE_UTILS_HELPERS_BIT_OPS_H
-#include "support/Requires.h"
+#include "support/AclRequires.h"
#include <type_traits>
diff --git a/src/core/utils/helpers/fft.cpp b/src/core/utils/helpers/fft.cpp
index 64633c643d..edc8d0eacc 100644
--- a/src/core/utils/helpers/fft.cpp
+++ b/src/core/utils/helpers/fft.cpp
@@ -37,7 +37,7 @@ std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsign
unsigned int res = N;
// Early exit if no supported factors are provided
- if(supported_factors.empty())
+ if (supported_factors.empty())
{
return stages;
}
@@ -46,10 +46,10 @@ std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsign
auto rfactor_it = supported_factors.rbegin();
// Decomposition step
- while(res != 0)
+ while (res != 0)
{
const unsigned int factor = *rfactor_it;
- if(0 == (res % factor) && res >= factor)
+ if (0 == (res % factor) && res >= factor)
{
stages.push_back(factor);
res /= factor;
@@ -57,9 +57,9 @@ std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsign
else
{
++rfactor_it;
- if(rfactor_it == supported_factors.rend())
+ if (rfactor_it == supported_factors.rend())
{
- if(res > 1)
+ if (res > 1)
{
// Couldn't decompose with given factors
stages.clear();
@@ -81,8 +81,9 @@ std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vecto
std::vector<unsigned int> idx_digit_reverse;
// Early exit in case N and fft stages do not match
- const float stages_prod = std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies<unsigned int>());
- if(stages_prod != N)
+ const float stages_prod =
+ std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies<unsigned int>());
+ if (stages_prod != N)
{
return idx_digit_reverse;
}
@@ -94,13 +95,13 @@ std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vecto
unsigned int n_stages = fft_stages.size();
// Scan elements
- for(unsigned int n = 0; n < N; ++n)
+ for (unsigned int n = 0; n < N; ++n)
{
unsigned int k = n;
unsigned int Nx = fft_stages[0];
// Scan stages
- for(unsigned int s = 1; s < n_stages; ++s)
+ for (unsigned int s = 1; s < n_stages; ++s)
{
// radix of stage i-th
unsigned int Ny = fft_stages[s];
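The decomposition loop above is greedy, trying supported radices from largest to smallest with no backtracking. A worked trace (my reading of the loop, not taken from the patch):

    decompose_stages(20, {2, 5}):
      res = 20, try 5: 20 % 5 == 0 -> stages = {5},       res = 4
      res = 4,  try 5: no match; try 2: 4 % 2 == 0 -> stages = {5, 2},   res = 2
      res = 2,  try 2: 2 % 2 == 0 -> stages = {5, 2, 2},  res = 1
    If the factors cannot cover N (e.g. N = 7 with {2, 5}), the iterator
    reaches rend() with res > 1, stages is cleared and an empty vector is
    returned to signal failure.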
diff --git a/src/core/utils/helpers/float_ops.h b/src/core/utils/helpers/float_ops.h
index a475a23b59..487496915a 100644
--- a/src/core/utils/helpers/float_ops.h
+++ b/src/core/utils/helpers/float_ops.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H
-#define ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H
+#ifndef ACL_SRC_CORE_UTILS_HELPERS_FLOAT_OPS_H
+#define ACL_SRC_CORE_UTILS_HELPERS_FLOAT_OPS_H
+
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
namespace arm_compute
{
@@ -36,8 +40,7 @@ union RawFloat
*
* @param[in] val Floating-point value
*/
- explicit RawFloat(float val)
- : f32(val)
+ explicit RawFloat(float val) : f32(val)
{
}
/** Extract sign of floating point number
@@ -113,4 +116,4 @@ inline bool is_zero(float a, float epsilon = 0.00001f)
} // namespace float_ops
} // namespace helpers
} // namespace arm_compute
-#endif /* ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H */
+#endif // ACL_SRC_CORE_UTILS_HELPERS_FLOAT_OPS_H
diff --git a/src/core/utils/helpers/tensor_info.h b/src/core/utils/helpers/tensor_info.h
index 9279532e2a..fd4745a453 100644
--- a/src/core/utils/helpers/tensor_info.h
+++ b/src/core/utils/helpers/tensor_info.h
@@ -41,15 +41,17 @@ namespace tensor_info
* @return True if tensors have mismatching quantization info else false.
*/
template <typename... Ts>
-inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1,
+ const ITensorInfo *tensor_info_2,
+ Ts... tensor_infos)
{
const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info();
- const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
- return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
- {
- return tensor_info->quantization_info() != first_quantization_info;
- });
+ const std::array<const ITensorInfo *, 1 + sizeof...(Ts)> tensor_infos_array{
+ {tensor_info_2, std::forward<Ts>(tensor_infos)...}};
+ return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(),
+ [&](const ITensorInfo *tensor_info)
+ { return tensor_info->quantization_info() != first_quantization_info; });
}
} // namespace tensor_info
} // namespace helpers
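A hypothetical validate-stage use of the variadic helper (tensor names are invented): every trailing tensor's quantization info is compared against the first argument's.

    // src0, src1 and dst are const ITensorInfo* here
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
        arm_compute::helpers::tensor_info::tensors_have_different_quantization_info(src0, src1, dst),
        "All tensors must share the same quantization info");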
diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp
index f2216995a9..212cfdabaa 100644
--- a/src/core/utils/helpers/tensor_transform.cpp
+++ b/src/core/utils/helpers/tensor_transform.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,10 +36,11 @@ int calculate_stride_on_index(int index, Coordinates strides)
return index >= static_cast<int>(strides.num_dimensions()) ? 1 : strides[index];
}
-int calculate_start_on_index(TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask)
+int calculate_start_on_index(
+ TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask)
{
// Early exit
- if(index >= static_cast<int>(starts.num_dimensions()))
+ if (index >= static_cast<int>(starts.num_dimensions()))
{
return 0;
}
@@ -51,14 +52,14 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta
int start = starts[index];
// Reset in case of begin mask present
- if(arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index))
+ if (arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index))
{
start = stride > 0 ? std::numeric_limits<int>::lowest() : std::numeric_limits<int>::max();
}
// Account negative start points
const int dim_size = input_shape[index];
- if(start < 0)
+ if (start < 0)
{
start += dim_size;
}
@@ -69,12 +70,16 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta
return start;
}
-int calculate_end_on_index(TensorShape input_shape, int index, int start_on_index,
- Coordinates ends, Coordinates strides,
- int32_t end_mask, int32_t shrink_axis_mask)
+int calculate_end_on_index(TensorShape input_shape,
+ int index,
+ int start_on_index,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
// Early exit
- if(index >= static_cast<int>(ends.num_dimensions()))
+ if (index >= static_cast<int>(ends.num_dimensions()))
{
return input_shape[index];
}
@@ -86,9 +91,9 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde
int stop = ends[index];
// Shrink dimension
- if(shrink_axis)
+ if (shrink_axis)
{
- if(start_on_index == std::numeric_limits<int>::max())
+ if (start_on_index == std::numeric_limits<int>::max())
{
stop = start_on_index;
}
@@ -99,33 +104,40 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde
}
// Reset in case of begin mask present
- if(arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis)
+ if (arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis)
{
stop = (stride > 0) ? std::numeric_limits<int>::max() : std::numeric_limits<int>::lowest();
}
// Account negative end points
const int dim_size = input_shape[index];
- if(stop < 0)
+ if (stop < 0)
{
stop += dim_size;
}
// Final clamp
- stop = (stride > 0) ? utility::clamp(stop, 0, dim_size) : utility::clamp(stop, -1, dim_size - 1);
+ if (stride > 0)
+ stop = utility::clamp(stop, 0, dim_size);
+ else
+ stop = utility::clamp(stop, -1, dim_size - 1);
return stop;
}
std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords(TensorShape input_shape,
- Coordinates starts, Coordinates ends, Coordinates strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+ Coordinates starts,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
Coordinates starts_abs{};
Coordinates ends_abs{};
Coordinates final_strides{};
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
const int start_i = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
starts_abs.set(i, start_i);
@@ -136,13 +148,19 @@ std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords
return std::make_tuple(starts_abs, ends_abs, final_strides);
}
-TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends, Coordinates strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask, bool return_unshrinked)
+TensorShape compute_strided_slice_output_shape(TensorShape input_shape,
+ Coordinates starts,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask,
+ bool return_unshrinked)
{
unsigned int index = 0;
TensorShape output_shape;
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
const int stride = calculate_stride_on_index(index, strides);
const int start = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
@@ -150,11 +168,11 @@ TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordina
const int range = end - start;
const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
- if(return_unshrinked || !is_shrink)
+ if (return_unshrinked || !is_shrink)
{
- if((range == 0) || // Zero range
- (range < 0 && stride >= 0) || // Negative range with positive stride
- (range > 0 && stride <= 0)) // Positive range with negative stride
+ if ((range == 0) || // Zero range
+ (range < 0 && stride >= 0) || // Negative range with positive stride
+ (range > 0 && stride <= 0)) // Positive range with negative stride
{
output_shape.set(index, 0);
return output_shape;
@@ -173,9 +191,9 @@ int32_t construct_slice_end_mask(Coordinates ends)
{
// Create end mask
int32_t end_mask = 0;
- for(unsigned int i = 0; i < ends.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < ends.num_dimensions(); ++i)
{
- if(ends[i] < 0)
+ if (ends[i] < 0)
{
end_mask |= 1 << i;
}
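A worked example of the slice helpers, assuming the usual ceil(range / stride) rounding for the final extent (the division itself sits outside the hunks shown):

    using namespace arm_compute;

    TensorShape in_shape(10U);
    Coordinates starts(2), ends(8), strides(2); // plain x[2:8:2], all masks zero
    const TensorShape out = helpers::tensor_transform::compute_strided_slice_output_shape(
        in_shape, starts, ends, strides, 0, 0, 0);
    // Expected: out[0] == 3, the elements at indices 2, 4 and 6.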
diff --git a/src/core/utils/io/FileHandler.cpp b/src/core/utils/io/FileHandler.cpp
index 95fc2e3fa2..d106493238 100644
--- a/src/core/utils/io/FileHandler.cpp
+++ b/src/core/utils/io/FileHandler.cpp
@@ -21,16 +21,15 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include <string>
-
#include "arm_compute/core/utils/io/FileHandler.h"
#include "arm_compute/core/Error.h"
+#include <string>
+
using namespace arm_compute::io;
-FileHandler::FileHandler()
- : _filestream(), _filename(" "), _mode()
+FileHandler::FileHandler() : _filestream(), _filename(" "), _mode()
{
}
diff --git a/src/core/utils/logging/FilePrinter.cpp b/src/core/utils/logging/FilePrinter.cpp
index 55e78f9630..7b4eead38d 100644
--- a/src/core/utils/logging/FilePrinter.cpp
+++ b/src/core/utils/logging/FilePrinter.cpp
@@ -25,8 +25,7 @@
using namespace arm_compute::logging;
-FilePrinter::FilePrinter(const std::string &filename)
- : _handler()
+FilePrinter::FilePrinter(const std::string &filename) : _handler()
{
_handler.open(filename, std::fstream::out | std::fstream::trunc);
}
@@ -34,4 +33,4 @@ FilePrinter::FilePrinter(const std::string &filename)
void FilePrinter::print_internal(const std::string &msg)
{
_handler.stream() << msg << std::endl;
-} \ No newline at end of file
+}
diff --git a/src/core/utils/logging/Helpers.cpp b/src/core/utils/logging/Helpers.cpp
index c3df7f6207..14ad910562 100644
--- a/src/core/utils/logging/Helpers.cpp
+++ b/src/core/utils/logging/Helpers.cpp
@@ -30,13 +30,12 @@ using namespace arm_compute::logging;
const std::string &arm_compute::logging::string_from_log_level(LogLevel log_level)
{
- static std::map<LogLevel, const std::string> log_level_map =
- {
- { LogLevel::VERBOSE, "VERBOSE" },
- { LogLevel::INFO, "INFO" },
- { LogLevel::WARN, "WARN" },
- { LogLevel::OFF, "OFF" },
+ static std::map<LogLevel, const std::string> log_level_map = {
+ {LogLevel::VERBOSE, "VERBOSE"},
+ {LogLevel::INFO, "INFO"},
+ {LogLevel::WARN, "WARN"},
+ {LogLevel::OFF, "OFF"},
};
return log_level_map[log_level];
-} \ No newline at end of file
+}
diff --git a/src/core/utils/logging/Logger.cpp b/src/core/utils/logging/Logger.cpp
index 70b5868da8..d6681f8179 100644
--- a/src/core/utils/logging/Logger.cpp
+++ b/src/core/utils/logging/Logger.cpp
@@ -30,10 +30,7 @@
using namespace arm_compute::logging;
Logger::Logger(std::string name, LogLevel log_level, std::shared_ptr<Printer> printer)
- : _name(std::move(name)), _log_level(log_level), _printers(
-{
- std::move(printer)
-}), _decorators()
+ : _name(std::move(name)), _log_level(log_level), _printers({std::move(printer)}), _decorators()
{
// Check printer
ARM_COMPUTE_ERROR_ON(printer == nullptr);
@@ -46,7 +43,7 @@ Logger::Logger(std::string name, LogLevel log_level, std::vector<std::shared_ptr
: _name(std::move(name)), _log_level(log_level), _printers(std::move(printers)), _decorators()
{
// Check printers
- for(const auto &p : _printers)
+ for (const auto &p : _printers)
{
ARM_COMPUTE_UNUSED(p);
ARM_COMPUTE_ERROR_ON(p == nullptr);
@@ -62,13 +59,13 @@ Logger::Logger(std::string name,
: _name(std::move(name)), _log_level(log_level), _printers(std::move(printers)), _decorators(std::move(decorators))
{
// Check printers
- for(const auto &p : _printers)
+ for (const auto &p : _printers)
{
ARM_COMPUTE_UNUSED(p);
ARM_COMPUTE_ERROR_ON(p == nullptr);
}
// Check decorators
- for(const auto &d : _decorators)
+ for (const auto &d : _decorators)
{
ARM_COMPUTE_UNUSED(d);
ARM_COMPUTE_ERROR_ON(d == nullptr);
@@ -79,7 +76,7 @@ void Logger::log(LogLevel log_level, const std::string &msg)
{
// Return if message shouldn't be logged
// i.e. if log level does not match the logger's
- if(!is_loggable(log_level))
+ if (!is_loggable(log_level))
{
return;
}
@@ -129,7 +126,7 @@ bool Logger::is_loggable(LogLevel log_level)
void Logger::decorate_log_msg(LogMsg &msg)
{
- for(const auto &d : _decorators)
+ for (const auto &d : _decorators)
{
d->decorate(msg);
}
@@ -148,7 +145,7 @@ std::string Logger::create_log_msg(const std::string &str, LogLevel log_level)
void Logger::print_all(const std::string &msg)
{
- for(auto &p : _printers)
+ for (auto &p : _printers)
{
p->print(msg);
}
diff --git a/src/core/utils/logging/LoggerRegistry.cpp b/src/core/utils/logging/LoggerRegistry.cpp
index c281d8863c..17015d9ae9 100644
--- a/src/core/utils/logging/LoggerRegistry.cpp
+++ b/src/core/utils/logging/LoggerRegistry.cpp
@@ -24,15 +24,15 @@
#include "arm_compute/core/utils/logging/LoggerRegistry.h"
#include "arm_compute/core/Error.h"
+
#include "support/Mutex.h"
using namespace arm_compute::logging;
 /** Reserved loggers used by the library */
-std::set<std::string> LoggerRegistry::_reserved_loggers = { "CORE", "RUNTIME", "GRAPH" };
+std::set<std::string> LoggerRegistry::_reserved_loggers = {"CORE", "RUNTIME", "GRAPH"};
-LoggerRegistry::LoggerRegistry()
- : _mtx(), _loggers()
+LoggerRegistry::LoggerRegistry() : _mtx(), _loggers()
{
}
@@ -42,10 +42,12 @@ LoggerRegistry &LoggerRegistry::get()
return _instance;
}
-void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
+void LoggerRegistry::create_logger(const std::string &name,
+ LogLevel log_level,
+ const std::vector<std::shared_ptr<Printer>> &printers)
{
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
- if((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end()))
+ if ((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end()))
{
_loggers[name] = std::make_shared<Logger>(name, log_level, printers);
}
@@ -54,7 +56,7 @@ void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level,
void LoggerRegistry::remove_logger(const std::string &name)
{
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
- if(_loggers.find(name) != _loggers.end())
+ if (_loggers.find(name) != _loggers.end())
{
_loggers.erase(name);
}
@@ -69,9 +71,9 @@ std::shared_ptr<Logger> LoggerRegistry::logger(const std::string &name)
void LoggerRegistry::create_reserved_loggers(LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
{
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
- for(const auto &r : _reserved_loggers)
+ for (const auto &r : _reserved_loggers)
{
- if(_loggers.find(r) == _loggers.end())
+ if (_loggers.find(r) == _loggers.end())
{
_loggers[r] = std::make_shared<Logger>(r, log_level, printers);
}
diff --git a/src/core/utils/misc/MMappedFile.cpp b/src/core/utils/misc/MMappedFile.cpp
index 0b9414107e..a467cb3320 100644
--- a/src/core/utils/misc/MMappedFile.cpp
+++ b/src/core/utils/misc/MMappedFile.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,18 +21,17 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if !defined(BARE_METAL)
+#if !defined(_WIN64) && !defined(BARE_METAL)
#include "arm_compute/core/utils/misc/MMappedFile.h"
#include <cstdio>
#include <cstring>
-#include <tuple>
-
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
+#include <tuple>
#include <unistd.h>
namespace arm_compute
@@ -53,7 +52,7 @@ std::pair<size_t, bool> get_file_size(const std::string &filename)
{
struct stat st; // NOLINT
memset(&st, 0, sizeof(struct stat));
- if(stat(filename.c_str(), &st) == 0)
+ if (stat(filename.c_str(), &st) == 0)
{
return std::make_pair(st.st_size, true);
}
@@ -73,8 +72,7 @@ size_t get_page_size()
}
} // namespace
-MMappedFile::MMappedFile()
- : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr)
+MMappedFile::MMappedFile() : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr)
{
}
@@ -92,14 +90,14 @@ MMappedFile::~MMappedFile()
bool MMappedFile::map(const std::string &filename, size_t size, size_t offset)
{
// Check if file is mapped
- if(is_mapped())
+ if (is_mapped())
{
return false;
}
// Open file
_fp = fopen(filename.c_str(), "a+be");
- if(_fp == nullptr)
+ if (_fp == nullptr)
{
return false;
}
@@ -107,26 +105,26 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset)
// Extract file descriptor
int fd = fileno(_fp);
bool status = fd >= 0;
- if(status)
+ if (status)
{
// Get file size
std::tie(_file_size, status) = get_file_size(_filename);
- if(status)
+ if (status)
{
// Map all file from offset if map size is 0
_map_size = (size == 0) ? _file_size : size;
_map_offset = offset;
// Check offset mapping
- if((_map_offset > _file_size) || (_map_offset % get_page_size() != 0))
+ if ((_map_offset > _file_size) || (_map_offset % get_page_size() != 0))
{
status = false;
}
else
{
// Truncate to file size
- if(_map_offset + _map_size > _file_size)
+ if (_map_offset + _map_size > _file_size)
{
_map_size = _file_size - _map_offset;
}
@@ -137,7 +135,7 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset)
}
}
- if(!status)
+ if (!status)
{
fclose(_fp);
}
@@ -148,14 +146,14 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset)
void MMappedFile::release()
{
// Unmap file
- if(_data != nullptr)
+ if (_data != nullptr)
{
::munmap(_data, _file_size);
_data = nullptr;
}
// Close file
- if(_fp != nullptr)
+ if (_fp != nullptr)
{
fclose(_fp);
_fp = nullptr;
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 49e39f663f..f8b74a985d 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/utils/quantization/AsymmHelpers.h"
#include "support/ToolchainSupport.h"
#include <cmath>
@@ -38,7 +42,7 @@ constexpr float epsilon = 0.00001f;
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon)
{
- if(multiplier >= 1.f)
+ if (multiplier >= 1.f)
{
Status status = calculate_quantized_multiplier_greater_than_one(multiplier, quant_multiplier, shift);
*shift *= -1;
@@ -61,25 +65,19 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier,
ARM_COMPUTE_RETURN_ERROR_ON(right_shift == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(multiplier < -internal_epsilon);
ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f + internal_epsilon);
- if(std::fabs(0.0f - multiplier) < internal_epsilon)
- {
- *quant_multiplier = 0;
- *right_shift = 0;
- return Status{};
- }
int shift_exp = 0;
const double q = std::frexp(multiplier, &shift_exp);
*right_shift = -1 * shift_exp;
auto q_fixed = static_cast<int64_t>(support::cpp11::round(q * fixed_point_one_Q0));
ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0);
- if(q_fixed == fixed_point_one_Q0)
+ if (q_fixed == fixed_point_one_Q0)
{
q_fixed /= 2;
--*right_shift;
}
- if(ignore_epsilon && *right_shift > 31)
+ if (ignore_epsilon && *right_shift > 31)
{
*right_shift = 0;
q_fixed = 0;
@@ -92,9 +90,8 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier,
return Status{};
}
-Status calculate_quantized_multiplier_greater_than_one(float multiplier,
- int32_t *quantized_multiplier,
- int32_t *left_shift)
+Status
+calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift)
{
ARM_COMPUTE_RETURN_ERROR_ON(quantized_multiplier == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(left_shift == nullptr);
@@ -105,7 +102,7 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier,
*left_shift = shift_exp;
auto q_fixed = static_cast<int64_t>(support::cpp11::round(q * fixed_point_one_Q0));
ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0);
- if(q_fixed == fixed_point_one_Q0)
+ if (q_fixed == fixed_point_one_Q0)
{
q_fixed /= 2;
++*left_shift;
@@ -117,27 +114,27 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier,
return Status{};
}
-arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
- const QuantizationInfo &wq_info,
- const QuantizationInfo &oq_info,
+arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
+ const QuantizationInfo &wq_info,
+ const QuantizationInfo &oq_info,
GEMMLowpOutputStageInfo &stage_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty());
ARM_COMPUTE_RETURN_ERROR_ON(wq_info.scale().empty());
ARM_COMPUTE_RETURN_ERROR_ON(oq_info.scale().empty());
-
- const unsigned int size = wq_info.scale().size();
-
- auto &quant_multipliers = stage_info.gemmlowp_multipliers;
- auto &quant_shifts = stage_info.gemmlowp_shifts;
- quant_multipliers.resize(size);
- quant_shifts.resize(size);
+ constexpr unsigned int padding_elems = 32; // assembly kernels assume the shifts and multipliers buffers are padded
+ const unsigned int size = wq_info.scale().size();
+ const size_t padded_size = (size == 1) ? 1 : size + padding_elems;
+ auto &quant_multipliers = stage_info.gemmlowp_multipliers;
+ auto &quant_shifts = stage_info.gemmlowp_shifts;
+ quant_multipliers.resize(padded_size);
+ quant_shifts.resize(padded_size);
const auto &w_scales = wq_info.scale();
const float i_scale = iq_info.scale().at(0);
const float o_scale = oq_info.scale().at(0);
- for(unsigned int i = 0; i < size; ++i)
+ for (unsigned int i = 0; i < size; ++i)
{
const float multiplier = i_scale * w_scales[i] / o_scale;
int32_t quant_multiplier = 0;
@@ -158,7 +155,7 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty
{
int min_quant_val = 0;
int max_quant_val = 0;
- switch(data_type)
+ switch (data_type)
{
case DataType::QASYMM8:
min_quant_val = std::numeric_limits<uint8_t>::min();
@@ -182,20 +179,60 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty
}
return std::make_pair(min_quant_val, max_quant_val);
}
+
+std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info,
+ const ActivationLayerInfo &act_info,
+ DataType data_type)
+{
+ ARM_COMPUTE_ERROR_ON(data_type != DataType::QASYMM8 && data_type != DataType::QASYMM8_SIGNED);
+
+ const auto min_max = get_min_max(data_type);
+
+ int32_t type_min = std::get<0>(min_max).get<int32_t>();
+ int32_t type_max = std::get<1>(min_max).get<int32_t>();
+
+ const UniformQuantizationInfo q_unif = q_info.uniform();
+
+ if (act_info.enabled())
+ {
+ switch (act_info.activation())
+ {
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ type_min = q_unif.offset;
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ type_min = q_unif.offset;
+ type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info)
+ : quantize_qasymm8_signed(act_info.a(), q_info);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ type_min = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.b(), q_info)
+ : quantize_qasymm8_signed(act_info.b(), q_info);
+ type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info)
+ : quantize_qasymm8_signed(act_info.a(), q_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Activation function not supported.");
+ break;
+ }
+ }
+
+ return std::make_tuple(type_min, type_max);
+}
+
void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
const ITensorInfo *weights,
const ITensorInfo *output,
- unsigned int idx_ofms,
int32_t *output_multipliers_ptr,
int32_t *output_shifts_ptr)
{
- const unsigned int num_filters = is_data_type_quantized_per_channel(weights->data_type()) ? weights->dimension(idx_ofms) : 1;
-
const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
const QuantizationInfo wq_info = weights->quantization_info();
const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
- for(unsigned int i = 0; i < num_filters; ++i)
+ const unsigned int num_filters = wq_info.scale().size();
+
+ for (unsigned int i = 0; i < num_filters; ++i)
{
int32_t output_multiplier = 0;
int32_t output_shift = 0;
@@ -209,13 +246,14 @@ void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
int32_t saturating_rounding_doubling_highmul(int32_t a, int32_t b)
{
- bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
- int64_t a_64(a);
- int64_t b_64(b);
- int64_t ab_64 = a_64 * b_64;
- bool is_positive_or_zero = a == 0 || b == 0 || (std::signbit(a) == std::signbit(b));
- int32_t nudge = is_positive_or_zero ? (1 << 30) : (1 - (1 << 30));
- int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));
+ bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
+ int64_t a_64(a);
+ int64_t b_64(b);
+ int64_t ab_64 = a_64 * b_64;
+ const bool is_positive_or_zero =
+ a == 0 || b == 0 || (std::signbit(static_cast<double>(a)) == std::signbit(static_cast<double>(b)));
+ int32_t nudge = is_positive_or_zero ? (1 << 30) : (1 - (1 << 30));
+ int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));
return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
}
@@ -235,11 +273,11 @@ int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int32_t sh
int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v)
{
- if(exponent == 0)
+ if (exponent == 0)
{
return v;
}
- else if(exponent < 0)
+ else if (exponent < 0)
{
return rounding_divide_by_pow2(v, -exponent);
}
@@ -259,11 +297,14 @@ int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v)
}
}
-void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift)
+void get_invsqrt_quantized_multiplier_exp(int32_t input,
+ int32_t reverse_shift,
+ int32_t &output_inv_sqrt,
+ int32_t &output_shift)
{
ARM_COMPUTE_ERROR_ON(input < 0);
- if(input <= 1)
+ if (input <= 1)
{
// dealing the inputs (0 and 1) separately to avoid overflow
output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
@@ -273,7 +314,7 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift,
// prepare input for fixed point operation and compute shift value
output_shift = 11;
- while(input >= (1 << 29))
+ while (input >= (1 << 29))
{
input /= 4;
++output_shift;
@@ -302,9 +343,7 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift,
// multiplication of two fixed point numbers, defined for readability
auto fixed_point_mul = [](FixedPointRawType a, FixedPointRawType b) -> FixedPointRawType
- {
- return saturating_rounding_doubling_highmul(a, b);
- };
+ { return saturating_rounding_doubling_highmul(a, b); };
// rescaling of fixed point to have dst_bit integer bits, defined for readability
auto fixed_point_rescale = [](FixedPointRawType a, uint32_t src_bit, uint32_t dst_bit) -> FixedPointRawType
@@ -315,17 +354,18 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift,
 // 5 iterations of Newton-Raphson method for inverse square root: x_{n+1} = 1.5 * x_n - input/2 * (x_n)^3
constexpr int32_t num_iteration = 5;
- for(int32_t i = 0; i < num_iteration; ++i)
+ for (int32_t i = 0; i < num_iteration; ++i)
{
const auto x3 = fixed_point_rescale(fixed_point_mul(fixed_point_mul(x, x), x), 9, fixedpoint_position);
- x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3), 6, fixedpoint_position);
+ x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3),
+ 6, fixedpoint_position);
}
// fixed point representation of sqrt(1/2)
const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250;
x = fixed_point_mul(fixedpoint_half_sqrt_2, x);
output_inv_sqrt = x;
- if(output_shift < 0)
+ if (output_shift < 0)
{
output_inv_sqrt <<= -output_shift;
output_shift = 0;
@@ -333,5 +373,5 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift,
// convert right shift to left shift
output_shift *= reverse_shift;
}
-} // quantization
-} // arm_compute
+} // namespace quantization
+} // namespace arm_compute
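For reference, the decomposition the multiplier helpers compute, with a worked value (the numbers below are mine, not from the patch):

    m = q * 2^(-right_shift), with q in [0.5, 1) obtained from std::frexp
    quant_multiplier = round(q * 2^31)

    e.g. m = 0.2: frexp(0.2) = 0.8 * 2^(-2)
         right_shift      = 2
         quant_multiplier = round(0.8 * 2^31) = 1717986918
    so   x * 0.2 ~= (x * 1717986918) >> (31 + 2)

    saturating_rounding_doubling_highmul() supplies the "* q, then >> 31" step
    (it returns the high 32 bits of 2ab, i.e. round(a * b / 2^31)), and
    rounding_divide_by_pow2() applies the remaining right shift.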
diff --git a/src/core/utils/quantization/AsymmHelpers.h b/src/core/utils/quantization/AsymmHelpers.h
new file mode 100644
index 0000000000..5dc607ce58
--- /dev/null
+++ b/src/core/utils/quantization/AsymmHelpers.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CORE_UTILS_QUANTIZATION_ASYMMHELPERS_H
+#define ACL_SRC_CORE_UTILS_QUANTIZATION_ASYMMHELPERS_H
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace quantization
+{
+
+/** Get minimum and maximum output of the activation function after quantization.
+ *
+ * Only ReLU, upper bounded ReLU and lower+upper bounded ReLU are supported.
+ *
+ * @param[in] q_info Output quantization info.
+ * @param[in] act_info Activation function information.
+ * @param[in] data_type Output data type (either QASYMM8 or QASYMM8_SIGNED).
+ *
+ * @return The minimum and maximum output of the activation function after quantization.
+ */
+std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info,
+ const ActivationLayerInfo &act_info,
+ DataType data_type);
+
+} // namespace quantization
+} // namespace arm_compute
+
+#endif // ACL_SRC_CORE_UTILS_QUANTIZATION_ASYMMHELPERS_H
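A numeric illustration of the clamping helper declared above (the quantization values are hypothetical):

    For QASYMM8 with scale = 0.1, offset = 10 and BOUNDED_RELU(a = 6.0):
      type_min = offset                        = 10   (the real value 0.0)
      type_max = quantize_qasymm8(6.0, q_info) = 6.0 / 0.1 + 10 = 70
    With no activation enabled, the full type range [0, 255] is returned.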